def do_request(s, url, params, proxies, method, data, retry):
    """Send one HTTP request via session `s`, looping until it succeeds.

    s -- a requests.Session-like object.
    url -- target URL; may be replaced when a bare 302 is followed manually.
    params, data, proxies, method -- forwarded to `s.request`.
    retry -- when falsy, any non-200 status raises immediately; when truthy,
             statuses listed in RETRYABLE_HTTP_CODES are retried with
             exponential backoff (5s, 10s, 20s, ...).

    Returns the successful response. Raises on incomplete bodies, on a 302
    lacking a Location header, or via `raise_for_status` for other errors.
    """
    sleep_time = 5
    while True:
        # NOTE(review): the request is serialized through a per-URL lock;
        # assumed to cover only the request itself — confirm lock extent
        # against the original (uncollapsed) source.
        with get_request_lock(url):
            r = s.request(
                method, url, timeout=20, params=params, data=data,
                proxies=proxies)
        grabber_log(url, r.url, r.request.headers, r.headers)
        if r.status_code == 200:
            # Guard against truncated bodies: compare the advertised
            # Content-Length with how many raw bytes were actually read.
            content_length = r.headers.get("Content-Length")
            if content_length and int(content_length) != r.raw.tell():
                raise Exception(
                    "incomplete response. Content-Length: {content_length}, got: {actual}"
                    .format(content_length=content_length, actual=r.raw.tell())
                )
            break
        if not retry or r.status_code not in RETRYABLE_HTTP_CODES:
            r.raise_for_status()
        # 302 error without location header
        if r.status_code == 302:
            # requests could not auto-follow this redirect; dig the target
            # out of the raw urllib3 response headers instead.
            # pylint: disable=protected-access
            match = re.search(
                r"^location:\s*(.+)", str(r.raw._original_response.msg),
                re.M + re.I
            )
            if not match:
                raise Exception("status 302 without location header")
            url = match.group(1)
            continue
        print("retry after {sleep_time} seconds".format(sleep_time=sleep_time))
        sleep(sleep_time)
        sleep_time *= 2
    return r
def download_error(er):
    """Handle a failed download: delegate ordinary errors, re-raise 429s."""
    if not is_http(er, code=429):
        crawler.handle_error(er)
        sleep(5)
        return
    # retry doesn't work with 429 error
    sleep(5)
    raise er
def do_request(s, url, params, proxies, method, data, retry):
    """Request `url` through session `s`, retrying with exponential backoff.

    Non-200 statuses raise immediately unless `retry` is truthy and the
    status is in RETRYABLE_HTTP_CODES; the wait doubles each attempt
    starting from 5 seconds. A 302 whose Location header requests could
    not consume is followed manually by rewriting `url`.

    Returns the 200 response; raises on truncated bodies or HTTP errors.
    """
    sleep_time = 5
    while True:
        # NOTE(review): per-URL lock assumed to wrap only the request call
        # itself — confirm against the original formatting.
        with get_request_lock(url):
            r = s.request(
                method, url, timeout=20, params=params, data=data,
                proxies=proxies)
        grabber_log(url, r.url, r.request.headers, r.headers)
        if r.status_code == 200:
            # Detect truncated downloads by comparing the advertised length
            # with the bytes actually consumed from the raw stream.
            content_length = r.headers.get("Content-Length")
            if content_length and int(content_length) != r.raw.tell():
                raise Exception(
                    "incomplete response. Content-Length: {content_length}, got: {actual}"
                    .format(content_length=content_length, actual=r.raw.tell()))
            break
        if not retry or r.status_code not in RETRYABLE_HTTP_CODES:
            r.raise_for_status()
        # 302 error without location header
        if r.status_code == 302:
            # pylint: disable=protected-access
            match = re.search(r"^location:\s*(.+)",
                              str(r.raw._original_response.msg), re.M + re.I)
            if not match:
                raise Exception("status 302 without location header")
            url = match.group(1)
            continue
        print("retry after {sleep_time} seconds".format(sleep_time=sleep_time))
        sleep(sleep_time)
        sleep_time *= 2
    return r
def grabber(url, header=None, *, referer=None, cookie=None, raise_429=True):
    """Request url, return text or bytes of the content.

    A requests.Session is cached per network location in the module-level
    `sessions` mapping, so headers and cookies persist across calls to the
    same host.

    url -- URL to fetch with GET.
    header -- optional dict merged into the cached session's headers
              (the update persists for later calls to the same host).
    referer -- optional referer value, percent-quoted before use.
    cookie -- optional cookie dict added to the session's cookie jar;
              NOTE: mutated in place by quote_unicode_dict.
    raise_429 -- when True an HTTP 429 raises like any other error status;
                 when False 429 responses are retried every 5 seconds.
    """
    # Only the netloc is needed for session caching; avoid unpacking the
    # full 5-tuple into unused locals.
    netloc = urlsplit(url).netloc
    if netloc not in sessions:
        s = requests.Session()
        s.headers.update(default_header)
        sessions[netloc] = s
    else:
        s = sessions[netloc]
    if header:
        s.headers.update(header)
    if referer:
        s.headers['referer'] = quote_unicode(referer)
    if cookie:
        quote_unicode_dict(cookie)
        requests.utils.add_dict_to_cookiejar(s.cookies, cookie)
    while True:
        r = s.get(url, timeout=20)
        grabber_log(url, r.request.headers, r.headers)
        if r.status_code == 200:
            break
        # Anything other than a tolerated 429 raises immediately.
        if r.status_code != 429 or raise_429:
            r.raise_for_status()
        sleep(5)
    return r
def do_request(s, url, params, proxies, method, data, raise_429):
    """Issue an HTTP request through session `s`, retrying until 200.

    raise_429 -- when True a 429 response raises via raise_for_status;
                 when False 429s are retried after a 5-second wait.

    Returns the 200 response. Raises on truncated bodies, on a 302 without
    a Location header, and for other HTTP error statuses.
    """
    while True:
        r = s.request(method, url, timeout=20, params=params, data=data,
                      proxies=proxies)
        grabber_log(url, r.url, r.request.headers, r.headers)
        if r.status_code == 200:
            # Compare advertised Content-Length with bytes actually read
            # to catch truncated downloads.
            content_length = r.headers.get("Content-Length")
            if content_length and int(content_length) != r.raw.tell():
                raise Exception(
                    "incomplete response. Content-Length: {content_length}, got: {actual}"
                    .format(content_length=content_length, actual=r.raw.tell()))
            break
        if r.status_code != 429 or raise_429:
            r.raise_for_status()
        # 302 error without location header
        if r.status_code == 302:
            # requests could not follow this redirect; pull the target from
            # the raw urllib3 response headers.
            # pylint: disable=protected-access
            match = re.search(r"^location:\s*(.+)",
                              str(r.raw._original_response.msg), re.M + re.I)
            if not match:
                raise Exception("status 302 without location header")
            url = match.group(1)
            continue
        sleep(5)
    return r
def do_request(s, url, params, proxies, method, data, raise_429):
    """Send a request and loop until it returns 200, handling 429 and
    redirect responses that lack a usable Location header."""
    while True:
        resp = s.request(method, url, timeout=20, params=params, data=data,
                         proxies=proxies)
        grabber_log(url, resp.url, resp.request.headers, resp.headers)
        if resp.status_code == 200:
            return resp
        if resp.status_code != 429 or raise_429:
            resp.raise_for_status()
        # 302 error without location header
        if resp.status_code == 302:
            # pylint: disable=protected-access
            raw_headers = str(resp.raw._original_response.msg)
            found = re.search(r"^location:\s*(.+)", raw_headers, re.M | re.I)
            if not found:
                raise Exception("status 302 without location header")
            url = found.group(1)
            continue
        sleep(5)
def download_error(er):
    """Handle a download error callback.

    HTTP 429 (rate limited) cannot be recovered by retrying, so wait and
    re-raise it; every other error is delegated to the crawler's handler.
    """
    if is_429(er):
        # retry doesn't work with 429 error
        sleep(5)
        # Re-raise the passed-in exception explicitly. A bare `raise` only
        # works while an exception is actively being handled and raises
        # RuntimeError("No active exception to re-raise") otherwise; this
        # also matches the sibling download_error implementations.
        raise er
    crawler.handle_error(er)
    sleep(5)
def download_error(er, count):
    """Handle a download error, backing off exponentially with the retry count."""
    t = 5 * 2 ** count
    print(f"wait {t} seconds...")
    if not is_http(er, code=429):
        crawler.handle_error(er)
        sleep(t)
        return
    # retry doesn't work with 429 error
    sleep(t)
    raise er
def do_request(s, url, params, raise_429):
    """GET `url` through session `s` until it returns 200.

    A 429 response is retried after 5 seconds unless raise_429 is set;
    any other error status raises via raise_for_status.
    """
    while True:
        response = s.get(url, timeout=20, params=params)
        grabber_log(url, response.url, response.request.headers,
                    response.headers)
        if response.status_code == 200:
            return response
        if response.status_code == 429 and not raise_429:
            sleep(5)
            continue
        response.raise_for_status()
        sleep(5)
def increaser():
    # NOTE(review): this is a closure fragment — `nonlocal a` requires an
    # enclosing function defining `a`, which is not visible in this chunk.
    nonlocal a

    # Overwrite the counter whenever a "set" event arrives.
    @listen("set")
    def _(event):
        nonlocal a
        a = event.data

    # Increment the shared counter once per second, forever.
    while True:
        sleep(1)
        a += 1
def analyze_pages(self):
    """Crawl for each pages.

    Walks the mission's pages via get_next_page, collecting episodes from
    each page until the module signals the last page, a page duplicates
    already-known episodes, or the mission is one-time. Newly found
    episodes are merged into the mission's episode list (oldest first) and
    last_update is bumped when anything new was added.
    """
    url = self.mission.url
    old_eps = EpisodeList(self.mission.episodes or ())
    new_eps = EpisodeList()
    while True:
        try:
            eps = self.mission.module.get_episodes(self.html, url)
        except SkipPageError:
            # Module asked to skip this page; fall through to pagination.
            pass
        except LastPageError:
            break
        else:
            if not eps:
                print("Warning: get_episodes returns an empty list")
            self.transform_title(eps)
            eps = EpisodeList(eps)
            # add result episodes into new_eps in new to old order.
            for ep in reversed(eps):
                new_eps.add(ep)
            # FIXME: do we really need this check?
            # one-time mission?
            if self.is_onetime(new_eps):
                break
            # duplicate with old_eps
            if any(e in old_eps for e in eps):
                break
        # get next page
        next_url = self.get_next_page(self.html, url)
        if not next_url:
            break
        url = next_url
        print('Analyzing {}...'.format(url))
        # Per-module politeness delay between page fetches (default 0).
        sleep(getattr(self.mission.module, "rest_analyze", 0))
        self.html = self.grabber.html(url, retry=True)
    # Merge the crawl result back into the stored list, oldest first, and
    # record whether anything genuinely new appeared.
    has_new_ep = False
    for ep in reversed(new_eps):
        if old_eps.add(ep):
            has_new_ep = True
    self.mission.episodes = list(old_eps)
    if has_new_ep:
        self.mission.last_update = time.time()
    if not self.mission.episodes:
        raise Exception("Episode list is empty")
def test_later_cancel(self):
    """A later() task stopped before its deadline must never fire."""
    from worker import later, sleep

    fired = False

    def mark():
        nonlocal fired
        fired = True

    pending = later(mark, timeout=1)
    sleep(0.5)
    pending.stop()
    # Wait well past the original deadline to prove cancellation worked.
    sleep(1)
    self.assertFalse(fired)
def analyze_pages(self):
    """Crawl for each pages.

    Iterates over the mission's pages, asking the module for episodes on
    each one, until pagination ends, a one-time mission is detected, or a
    page only contains already-known episodes. Results are merged into the
    mission (oldest first); last_update is refreshed when new episodes
    were added.
    """
    url = self.mission.url
    old_eps = EpisodeList(self.mission.episodes or ())
    new_eps = EpisodeList()
    while True:
        try:
            eps = self.mission.module.get_episodes(self.html, url)
        except SkipPageError:
            # Module asked to skip this page; continue with pagination.
            pass
        else:
            if not eps:
                print("Warning: get_episodes returns an empty list")
            self.transform_title(eps)
            eps = EpisodeList(eps)
            # add result episodes into new_eps in new to old order.
            for ep in reversed(eps):
                new_eps.add(ep)
            # FIXME: do we really need this check?
            # one-time mission?
            if self.is_onetime(new_eps):
                break
            # duplicate with old_eps
            if any(e in old_eps for e in eps):
                break
        # get next page
        next_url = self.get_next_page(self.html, url)
        if not next_url:
            break
        url = next_url
        print('Analyzing {}...'.format(url))
        # Per-module politeness delay between page fetches (default 0).
        sleep(getattr(self.mission.module, "rest_analyze", 0))
        self.html = self.grabber.html(url, retry=True)
    # Merge crawl results into the stored list, oldest first.
    has_new_ep = False
    for ep in reversed(new_eps):
        if old_eps.add(ep):
            has_new_ep = True
    self.mission.episodes = list(old_eps)
    if has_new_ep:
        self.mission.last_update = time.time()
    if not self.mission.episodes:
        raise Exception("Episode list is empty")
def test_later(self):
    """later() fires once after its timeout, on the scheduling thread."""
    from worker import current, later, sleep

    total = 0
    runner = None

    def accumulate(value):
        nonlocal total
        nonlocal runner
        runner = current()
        total += value

    # Schedule through the current worker's own later() first.
    current().later(accumulate, 10, timeout=2)
    with self.subTest("not yet"):
        sleep(1)
        self.assertEqual(total, 0)
        self.assertEqual(runner, None)
    with self.subTest("finished"):
        sleep(2)
        self.assertEqual(total, 10)
        self.assertEqual(runner, current())

    # The module-level later() should behave the same way.
    later(accumulate, 10, timeout=2)
    with self.subTest("not yet"):
        sleep(1)
        self.assertEqual(total, 10)
    with self.subTest("finished"):
        sleep(2)
        self.assertEqual(total, 20)
        self.assertEqual(runner, current())
def test_create_worker(self):
    """create_worker starts the decorated function on a background worker."""
    from worker import create_worker, sleep

    done = False

    @create_worker
    def background():
        nonlocal done
        sleep(1)
        done = True

    # Still running halfway through the worker's sleep.
    sleep(0.5)
    self.assertFalse(done)
    # After the worker finishes, the flag is set and it is no longer running.
    sleep(1)
    self.assertTrue(done)
    self.assertFalse(background.is_running())
    background.join()
def do_analyze(self):
    """Analyze each pending mission, honoring per-module cooldowns.

    Errors are collected per mission: WorkerExit always propagates; other
    exceptions either propagate (when stop_on_error says so, with the
    failing mission attached) or are reported via on_item_finished.
    """
    for mission in self.gen_missions:
        err = None
        try:
            # Respect the module's cooldown before hitting it again.
            sleep(self.get_cooldown(mission))
            with load_episodes(mission):
                Analyzer(mission).analyze()
        except WorkerExit:
            # Worker shutdown must never be swallowed.
            raise
        except BaseException as _err:
            err = _err
            if self.stop_on_error and (not callable(self.stop_on_error) or self.stop_on_error(err)):
                # Attach the failing mission so the caller can report it.
                err.mission = mission
                raise
        finally:
            # Notify and stamp the cooldown whether or not analysis failed.
            if self.on_item_finished:
                self.on_item_finished(err, mission)
            self.cooldown[mission.module.name] = time()
def do_analyze(self):
    """Analyze every generated mission with per-module cooldown handling.

    WorkerExit always propagates. Other exceptions are stored and either
    re-raised (when stop_on_error applies, with the mission attached) or
    passed to the on_item_finished callback.
    """
    for mission in self.gen_missions:
        err = None
        try:
            # Wait out the module's cooldown before analyzing.
            sleep(self.get_cooldown(mission))
            with load_episodes(mission):
                Analyzer(mission).analyze()
        except WorkerExit:
            # Never swallow worker shutdown.
            raise
        except BaseException as _err:  # pylint: disable=broad-except
            err = _err
            if self.stop_on_error and (not callable(self.stop_on_error) or self.stop_on_error(err)):
                # Tag the exception with the mission that failed.
                err.mission = mission
                raise
        finally:
            # Always notify and refresh the module's cooldown timestamp.
            if self.on_item_finished:
                self.on_item_finished(err, mission)
            self.cooldown[mission.module.name] = time()
def rest(self):
    """Sleep for the module-configured rest interval (defaults to 0)."""
    interval = getattr(self.mod, "rest", 0)
    sleep(interval)
def rest(self):
    """Sleep for the downloader-configured rest interval (defaults to 0)."""
    interval = getattr(self.downloader, "rest", 0)
    sleep(interval)
def thread():
    # NOTE(review): closure fragment — `nonlocal a` requires an enclosing
    # function defining `a`, which is outside this chunk.
    nonlocal a
    # Simulate work, then flip the shared flag.
    sleep(1)
    a = True
def blocking_task():
    """Stand-in for a long-running job: block for one second."""
    duration = 1
    sleep(duration)