# Tornado version: workers pull (request, callback) pairs from a shared
# spider queue, fetch with retries, then ack or fail the queue item.
import datetime
import random
import sys
import types
from traceback import format_exc

from tornado import gen, httpclient, ioloop

io_loop = ioloop.IOLoop.instance()
httpcli = httpclient.AsyncHTTPClient()


@gen.coroutine
def httptasker(spider_q, retry=5):
    while True:
        req, callback = yield from spider_q.get(interval=1.0)
        if req is None:
            # Queue is empty: back off for a tiny random interval. Note that
            # add_timeout treats a bare float as an absolute deadline, so a
            # relative timedelta is needed for the intended short sleep.
            tl = random.random() * 0.001
            yield gen.Task(io_loop.add_timeout, datetime.timedelta(seconds=tl))
            continue

        def get_response():
            for i in range(retry):
                try:
                    rq = httpclient.HTTPRequest(**req)
                    response = yield httpcli.fetch(rq)
                    response.ourl = req['url']  # keep the original URL around
                    return response
                except httpclient.HTTPError as e:
                    print('@HTTPError: %s with URL: %s Retry: %s'
                          % (e.code, req['url'], i + 1), file=sys.stderr)
                    if e.code in (403, 404):
                        break  # permanent errors, retrying will not help
                    continue
                except Exception as e:
                    print('@ErrorProcess: %s\nURL: %s' % (e, req['url']),
                          file=sys.stderr)
                    print('@error_trace_back', format_exc(), file=sys.stderr)
                    break
            return None

        response = yield from get_response()
        try:
            if response is None:
                yield from spider_q.fail(req['url'])
                print('@Failed: %s' % req['url'], file=sys.stderr)
                continue
            else:
                yield from spider_q.ack(req['url'])
            # get_charset() is defined elsewhere; see the sketch below.
            c = get_charset(response, default='gb18030')
            response.ubody = response.body.decode(c, 'ignore')
            response.charset = c
            g = callback(response)
            if isinstance(g, types.GeneratorType):
                # Callbacks may themselves be coroutines; drive them here.
                yield from g
        except Exception as e:
            print('@ErrorProcess: %s\nURL: %s' % (e, req['url']), file=sys.stderr)
            print('@error_trace_back', format_exc(), file=sys.stderr)
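# get_charset() is referenced above but not defined in this snippet. A
# minimal sketch of what it could look like, assuming it sniffs the charset
# from the Content-Type header and falls back to a default; the regex and
# names here are illustrative, not the original implementation:
import re

def get_charset(response, default='gb18030'):
    """Best-effort charset detection from the Content-Type header."""
    ctype = response.headers.get('Content-Type', '')
    m = re.search(r'charset=([\w-]+)', ctype, re.I)
    return m.group(1).lower() if m else default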
# asyncio/aiohttp version of the same worker, written in the old
# generator-based coroutine style (pre-async/await) against the aiohttp 0.x
# module-level request() API. reqs is an iterable of
# (request_kwargs, callback) pairs.
import asyncio
import http.cookies
import random
import sys
from traceback import format_exc

import aiohttp

# The original defines aiohttp_errors elsewhere; assumed to be a tuple of
# retryable network-level errors, e.g.:
aiohttp_errors = (OSError, asyncio.TimeoutError)


@asyncio.coroutine
def httptasker(reqs, retry=3):
    for req, callback in reqs:
        if req is None:
            tl = random.random() * 0.01
            yield from asyncio.sleep(tl)
            continue

        def get_response():
            response = None
            for i in range(retry):
                try:
                    try:
                        response = yield from asyncio.wait_for(
                            aiohttp.request(**req), 5.0)
                        response.body = yield from response.read()
                        response.ourl = req['url']
                    finally:
                        # Release the connection once the body is read.
                        if response is not None:
                            response.close()
                    return response
                except aiohttp_errors as e:
                    print('@FetchError: %s, %s, retry: %d'
                          % (type(e), req['url'], i), file=sys.stderr)
                except http.cookies.CookieError as e:
                    print('@CookieError: %s, retry: %d' % (req['url'], i),
                          file=sys.stderr)
                    # print('@error_trace_back', format_exc(), file=sys.stderr)
                except Exception as e:
                    print('@ErrorFetching: %s\nURL: %s' % (e, req['url']),
                          file=sys.stderr)
                    print('@error_trace_back', format_exc(), file=sys.stderr)
            return None

        response = yield from get_response()
        try:
            if response is None:
                print('@Failed: %s' % req['url'], file=sys.stderr)
                continue
            c = get_charset(response)
            response.body = response.body.decode(c, 'ignore')
            response.charset = c
            yield from callback(response)
        except Exception as e:
            print('@ErrorProcess: %s\nURL: %s' % (e, req['url']), file=sys.stderr)
            print('@error_trace_back', format_exc(), file=sys.stderr)
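# A minimal sketch of driving the asyncio version with an event loop. The
# request list and the handle() callback below are illustrative placeholders,
# not part of the original code:

@asyncio.coroutine
def handle(response):
    # response.body has already been decoded to str by httptasker.
    print(response.ourl, len(response.body))

reqs = [({'method': 'GET', 'url': 'http://example.com/'}, handle)]
loop = asyncio.get_event_loop()
loop.run_until_complete(httptasker(reqs))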