Ejemplo n.º 1
0
def do_request(s, url, params, proxies, method, data, retry):
	"""Issue an HTTP request, retrying retryable failures with backoff.

	``method``, ``url``, ``params``, ``data`` and ``proxies`` are passed
	through to ``s.request``.  When ``retry`` is true, status codes in
	RETRYABLE_HTTP_CODES are retried with a delay that starts at 5
	seconds and doubles per attempt; otherwise non-200 statuses raise via
	``raise_for_status``.  302 responses are chased manually by reading
	the Location header out of the raw response.  Returns the successful
	(200) response; raises Exception on a truncated body.
	"""
	sleep_time = 5
	while True:
		# Serialize concurrent requests to the same URL.
		with get_request_lock(url):
			r = s.request(method, url, timeout=20, params=params,
				data=data, proxies=proxies)
		grabber_log(url, r.url, r.request.headers, r.headers)

		if r.status_code == 200:
			content_length = r.headers.get("Content-Length")
			# Reject truncated bodies: the raw stream position must equal
			# the advertised Content-Length after the body is consumed.
			if content_length and int(content_length) != r.raw.tell():
				raise Exception(
					"incomplete response. Content-Length: {content_length}, got: {actual}"
						.format(content_length=content_length, actual=r.raw.tell())
				)
			break
		if not retry or r.status_code not in RETRYABLE_HTTP_CODES:
			r.raise_for_status()
		# 302 error without location header
		if r.status_code == 302:
			# NOTE(review): pulls Location from the private underlying
			# response object — confirm against the installed urllib3.
			# pylint: disable=protected-access
			match = re.search(
				r"^location:\s*(.+)",
				str(r.raw._original_response.msg),
				re.M + re.I
			)
			if not match:
				raise Exception("status 302 without location header")
			url = match.group(1)
			continue
		print("retry after {sleep_time} seconds".format(sleep_time=sleep_time))
		sleep(sleep_time)
		sleep_time *= 2
	return r
Ejemplo n.º 2
0
	def download_error(er):
		"""Handle a download failure; pause and re-raise HTTP 429 errors."""
		if not is_http(er, code=429):
			crawler.handle_error(er)
			sleep(5)
			return
		# retry doesn't cover 429, so wait here and propagate the error
		sleep(5)
		raise er
Ejemplo n.º 3
0
def do_request(s, url, params, proxies, method, data, retry):
    """Request ``url`` with exponential-backoff retries.

    When ``retry`` is true, responses whose status is listed in
    RETRYABLE_HTTP_CODES are retried after a delay that starts at 5
    seconds and doubles each attempt; otherwise any non-200 status raises
    via ``raise_for_status``.  302 responses are chased manually by
    reading the Location header from the raw response.  Returns the
    successful (200) response; raises Exception on a truncated body.
    """
    sleep_time = 5
    while True:
        # Serialize concurrent requests to the same URL.
        with get_request_lock(url):
            r = s.request(method,
                          url,
                          timeout=20,
                          params=params,
                          data=data,
                          proxies=proxies)
        grabber_log(url, r.url, r.request.headers, r.headers)

        if r.status_code == 200:
            content_length = r.headers.get("Content-Length")
            # Reject truncated bodies: the raw stream position must equal
            # the advertised Content-Length after the body is consumed.
            if content_length and int(content_length) != r.raw.tell():
                raise Exception(
                    "incomplete response. Content-Length: {content_length}, got: {actual}"
                    .format(content_length=content_length,
                            actual=r.raw.tell()))
            break
        if not retry or r.status_code not in RETRYABLE_HTTP_CODES:
            r.raise_for_status()
        # 302 error without location header
        if r.status_code == 302:
            # NOTE(review): pulls Location from the private underlying
            # response object — confirm against the installed urllib3.
            # pylint: disable=protected-access
            match = re.search(r"^location:\s*(.+)",
                              str(r.raw._original_response.msg), re.M + re.I)
            if not match:
                raise Exception("status 302 without location header")
            url = match.group(1)
            continue
        print("retry after {sleep_time} seconds".format(sleep_time=sleep_time))
        sleep(sleep_time)
        sleep_time *= 2
    return r
Ejemplo n.º 4
0
def grabber(url, header=None, *, referer=None, cookie=None, raise_429=True):
    """Request url, return text or bytes of the content.

    Sessions are pooled per network location so cookies and headers
    persist across calls to the same host.  ``header``, ``referer`` and
    ``cookie`` update the pooled session before the request.  HTTP 429
    responses are retried every 5 seconds unless ``raise_429`` is true;
    any other non-200 status raises via ``raise_for_status``.
    """
    # Only the host part is needed for session pooling; the rest of the
    # split result was previously unpacked into unused variables.
    netloc = urlsplit(url).netloc

    # EAFP: fetch-or-create the per-host session with a single lookup.
    try:
        s = sessions[netloc]
    except KeyError:
        s = requests.Session()
        s.headers.update(default_header)
        sessions[netloc] = s

    if header:
        s.headers.update(header)

    if referer:
        s.headers['referer'] = quote_unicode(referer)

    if cookie:
        # quote_unicode_dict mutates the mapping in place.
        quote_unicode_dict(cookie)
        requests.utils.add_dict_to_cookiejar(s.cookies, cookie)

    while True:
        r = s.get(url, timeout=20)
        grabber_log(url, r.request.headers, r.headers)

        if r.status_code == 200:
            break
        if r.status_code != 429 or raise_429:
            r.raise_for_status()
        sleep(5)
    return r
Ejemplo n.º 5
0
	def download_error(er):
		"""Delegate non-429 failures to the crawler; re-raise 429 after a pause."""
		if is_http(er, code=429):
			# the retry mechanism can't recover from 429; back off, then
			# propagate the original error
			sleep(5)
			raise er
		else:
			crawler.handle_error(er)
			sleep(5)
Ejemplo n.º 6
0
def do_request(s, url, params, proxies, method, data, raise_429):
    """Send an HTTP request, retrying 429 responses unless ``raise_429``.

    Non-429 error statuses (4xx/5xx) raise via ``raise_for_status``; 3xx
    statuses pass through it, and bare 302 responses are chased manually
    by reading the Location header from the raw response.  Returns the
    successful (200) response; raises Exception on a truncated body.
    """
    while True:
        r = s.request(method,
                      url,
                      timeout=20,
                      params=params,
                      data=data,
                      proxies=proxies)
        grabber_log(url, r.url, r.request.headers, r.headers)

        if r.status_code == 200:
            content_length = r.headers.get("Content-Length")
            # Reject truncated bodies: the raw stream position must equal
            # the advertised Content-Length after the body is consumed.
            if content_length and int(content_length) != r.raw.tell():
                raise Exception(
                    "incomplete response. Content-Length: {content_length}, got: {actual}"
                    .format(content_length=content_length,
                            actual=r.raw.tell()))
            break
        if r.status_code != 429 or raise_429:
            r.raise_for_status()
        # 302 error without location header
        if r.status_code == 302:
            # NOTE(review): pulls Location from the private underlying
            # response object — confirm against the installed urllib3.
            # pylint: disable=protected-access
            match = re.search(r"^location:\s*(.+)",
                              str(r.raw._original_response.msg), re.M + re.I)
            if not match:
                raise Exception("status 302 without location header")
            url = match.group(1)
            continue
        sleep(5)
    return r
Ejemplo n.º 7
0
def do_request(s, url, params, proxies, method, data, raise_429):
    """Send a request, retrying throttled responses and chasing bare 302s.

    Returns the first 200 response.  Non-429 error statuses raise via
    ``raise_for_status``; 429 is retried every 5 seconds unless
    ``raise_429`` is true.
    """
    while True:
        response = s.request(
            method, url, timeout=20, params=params, data=data, proxies=proxies)
        grabber_log(url, response.url, response.request.headers,
                    response.headers)

        if response.status_code == 200:
            return response

        if response.status_code != 429 or raise_429:
            response.raise_for_status()

        # 302 error without location header
        if response.status_code == 302:
            # pylint: disable=protected-access
            raw_headers = str(response.raw._original_response.msg)
            location = re.search(r"^location:\s*(.+)", raw_headers,
                                 re.M + re.I)
            if not location:
                raise Exception("status 302 without location header")
            url = location.group(1)
            continue

        sleep(5)
Ejemplo n.º 8
0
 def download_error(er):
     """Hand non-429 failures to the crawler; re-raise 429 after a pause."""
     if not is_429(er):
         crawler.handle_error(er)
         sleep(5)
     else:
         # the retry mechanism can't recover from 429; back off, then
         # re-raise the exception currently being handled
         sleep(5)
         raise
Ejemplo n.º 9
0
 def download_error(er, count):
     """Handle a failure with exponential backoff (5 * 2**count seconds)."""
     delay = 5 * 2 ** count
     print(f"wait {delay} seconds...")
     if not is_http(er, code=429):
         crawler.handle_error(er)
         sleep(delay)
         return
     # the retry mechanism can't recover from 429; propagate the error
     sleep(delay)
     raise er
Ejemplo n.º 10
0
def do_request(s, url, params, raise_429):
    """GET ``url``, retrying HTTP 429 every 5 seconds unless ``raise_429``.

    Returns the first 200 response; other error statuses raise via
    ``raise_for_status``.
    """
    while True:
        response = s.get(url, timeout=20, params=params)
        grabber_log(url, response.url, response.request.headers,
                    response.headers)

        if response.status_code == 200:
            return response
        if response.status_code != 429 or raise_429:
            response.raise_for_status()
        sleep(5)
Ejemplo n.º 11
0
        def increaser():
            """Increment the closed-over counter once per second.

            A "set" event handler is registered first so incoming events
            can overwrite the counter at any time.
            """
            nonlocal a

            # Overwrite the counter whenever a "set" event arrives.
            @listen("set")
            def _(event):
                nonlocal a
                a = event.data

            # Runs forever; the hosting worker is expected to stop it.
            while True:
                sleep(1)
                a += 1
Ejemplo n.º 12
0
    def analyze_pages(self):
        """Crawl the mission's pages and merge found episodes.

        Walks from ``self.mission.url`` following ``get_next_page`` links,
        collecting episodes newest-first into ``self.mission.episodes``.
        Sets ``self.mission.last_update`` when anything new was found.
        Raises Exception if the resulting episode list is empty.
        """
        url = self.mission.url
        old_eps = EpisodeList(self.mission.episodes or ())
        new_eps = EpisodeList()

        while True:
            try:
                eps = self.mission.module.get_episodes(self.html, url)
            except SkipPageError:
                # This page has no episodes; just advance to the next one.
                pass
            except LastPageError:
                break
            else:
                if not eps:
                    print("Warning: get_episodes returns an empty list")
                self.transform_title(eps)

                eps = EpisodeList(eps)

                # add result episodes into new_eps in new to old order.
                for ep in reversed(eps):
                    new_eps.add(ep)

                # FIXME: do we really need this check?
                # one-time mission?
                if self.is_onetime(new_eps):
                    break

                # duplicate with old_eps
                if any(e in old_eps for e in eps):
                    break

            # get next page
            next_url = self.get_next_page(self.html, url)
            if not next_url:
                break
            url = next_url
            print('Analyzing {}...'.format(url))
            # Respect the module's optional per-page crawl delay.
            sleep(getattr(self.mission.module, "rest_analyze", 0))
            self.html = self.grabber.html(url, retry=True)

        has_new_ep = False
        # Merge the collected episodes (oldest first) into the stored list.
        for ep in reversed(new_eps):
            if old_eps.add(ep):
                has_new_ep = True
        self.mission.episodes = list(old_eps)

        if has_new_ep:
            self.mission.last_update = time.time()

        if not self.mission.episodes:
            raise Exception("Episode list is empty")
Ejemplo n.º 13
0
 def test_later_cancel(self):
     """A later() task stopped before its timeout must never run."""
     from worker import later, sleep

     fired = False

     def task():
         nonlocal fired
         fired = True

     pending = later(task, timeout=1)
     sleep(0.5)
     pending.stop()
     # Wait past the original deadline, then check the task never fired.
     sleep(1)
     self.assertFalse(fired)
Ejemplo n.º 14
0
	def analyze_pages(self):
		"""Crawl the mission's pages and merge found episodes.

		Walks from ``self.mission.url`` following ``get_next_page`` links,
		collecting episodes newest-first into ``self.mission.episodes``.
		Sets ``self.mission.last_update`` when anything new was found.
		Raises Exception if the resulting episode list is empty.
		"""
		url = self.mission.url
		old_eps = EpisodeList(self.mission.episodes or ())
		new_eps = EpisodeList()
		
		while True:
			try:
				eps = self.mission.module.get_episodes(self.html, url)
			except SkipPageError:
				# This page has no episodes; just advance to the next one.
				pass
			else:
				if not eps:
					print("Warning: get_episodes returns an empty list")
				self.transform_title(eps)
				
				eps = EpisodeList(eps)
				
				# add result episodes into new_eps in new to old order.
				for ep in reversed(eps):
					new_eps.add(ep)
					
				# FIXME: do we really need this check?
				# one-time mission?
				if self.is_onetime(new_eps):
					break
					
				# duplicate with old_eps
				if any(e in old_eps for e in eps):
					break
				
			# get next page
			next_url = self.get_next_page(self.html, url)
			if not next_url:
				break
			url = next_url
			print('Analyzing {}...'.format(url))
			# Respect the module's optional per-page crawl delay.
			sleep(getattr(self.mission.module, "rest_analyze", 0))
			self.html = self.grabber.html(url, retry=True)
		
		has_new_ep = False
		# Merge the collected episodes (oldest first) into the stored list.
		for ep in reversed(new_eps):
			if old_eps.add(ep):
				has_new_ep = True
		self.mission.episodes = list(old_eps)
		
		if has_new_ep:
			self.mission.last_update = time.time()
		
		if not self.mission.episodes:
			raise Exception("Episode list is empty")
Ejemplo n.º 15
0
 def test_later(self):
     """later() fires after its timeout, as a method and as a free function."""
     from worker import current, later, sleep

     total = 0
     runner = None

     def add(value):
         nonlocal total
         nonlocal runner
         runner = current()
         total += value

     # Schedule through the current worker's later() method.
     current().later(add, 10, timeout=2)

     with self.subTest("not yet"):
         sleep(1)
         self.assertEqual(total, 0)
         self.assertEqual(runner, None)

     with self.subTest("finished"):
         sleep(2)
         self.assertEqual(total, 10)
         self.assertEqual(runner, current())

     # Schedule through the module-level later() function.
     later(add, 10, timeout=2)

     with self.subTest("not yet"):
         sleep(1)
         self.assertEqual(total, 10)

     with self.subTest("finished"):
         sleep(2)
         self.assertEqual(total, 20)
         self.assertEqual(runner, current())
Ejemplo n.º 16
0
 def test_create_worker(self):
     """create_worker should start the decorated function immediately."""
     from worker import create_worker, sleep

     flag = False

     @create_worker
     def thread():
         nonlocal flag
         sleep(1)
         flag = True

     # Halfway through the worker's sleep the flag is still unset.
     sleep(0.5)
     self.assertFalse(flag)
     # After the worker finishes the flag is set and it is no longer running.
     sleep(1)
     self.assertTrue(flag)
     self.assertFalse(thread.is_running())
     thread.join()
Ejemplo n.º 17
0
	def do_analyze(self):
		"""Analyze each mission from ``self.gen_missions``, honoring cooldowns.

		WorkerExit always propagates.  Other errors are captured, optionally
		re-raised per ``stop_on_error`` (a flag or predicate), and always
		reported to ``on_item_finished`` together with the mission.
		"""
		for mission in self.gen_missions:
			err = None
			try:
				# Wait out the per-module cooldown before analyzing.
				sleep(self.get_cooldown(mission))
				with load_episodes(mission):
					Analyzer(mission).analyze()
			except WorkerExit:
				raise
			except BaseException as _err:
				err = _err
				# stop_on_error may be a plain truthy flag or a predicate
				# deciding whether this error aborts the whole run.
				if self.stop_on_error and (not callable(self.stop_on_error) or self.stop_on_error(err)):
					err.mission = mission
					raise
			finally:
				if self.on_item_finished:
					self.on_item_finished(err, mission)
				# Record when this module was last used, for cooldowns.
				self.cooldown[mission.module.name] = time()		
Ejemplo n.º 18
0
	def do_analyze(self):
		"""Analyze each mission from ``self.gen_missions``, honoring cooldowns.

		WorkerExit always propagates.  Other errors are captured, optionally
		re-raised per ``stop_on_error`` (a flag or predicate), and always
		reported to ``on_item_finished`` together with the mission.
		"""
		for mission in self.gen_missions:
			err = None
			try:
				# Wait out the per-module cooldown before analyzing.
				sleep(self.get_cooldown(mission))
				with load_episodes(mission):
					Analyzer(mission).analyze()
			except WorkerExit:
				raise
			except BaseException as _err: # pylint: disable=broad-except
				err = _err
				# stop_on_error may be a plain truthy flag or a predicate
				# deciding whether this error aborts the whole run.
				if self.stop_on_error and (not callable(self.stop_on_error) or self.stop_on_error(err)):
					err.mission = mission
					raise
			finally:
				if self.on_item_finished:
					self.on_item_finished(err, mission)
				# Record when this module was last used, for cooldowns.
				self.cooldown[mission.module.name] = time()		
Ejemplo n.º 19
0
 def rest(self):
     """Pause for the module's configured rest interval (defaults to 0)."""
     duration = getattr(self.mod, "rest", 0)
     sleep(duration)
Ejemplo n.º 20
0
 def rest(self):
     """Pause for the downloader's configured rest interval (defaults to 0)."""
     duration = getattr(self.downloader, "rest", 0)
     sleep(duration)
Ejemplo n.º 21
0
	def rest(self):
		"""Pause for the module's configured rest interval (defaults to 0)."""
		delay = getattr(self.mod, "rest", 0)
		sleep(delay)
Ejemplo n.º 22
0
 def thread():
     """Set the closed-over flag after a one-second delay."""
     nonlocal a
     sleep(1)
     a = True
Ejemplo n.º 23
0
 def blocking_task():
     """Simulate a long-running job by sleeping for one second."""
     sleep(1)