Example #1
 def check(self, proxy, good):
     """
         检查代理是否可用
     """
     with ExceptContext(errback=lambda *args: True):
         if self.is_anonymous(proxy):
             good.add(proxy)
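
The ExceptContext class itself is not shown in these examples. Reconstructing it from the call sites above and below (an optional positional exception type, an errback keyword, and the got_err / err_info attributes read after the with block), a minimal sketch with the same observable behaviour might look like the following; the parameter name func_name and the exact arguments passed to errback are assumptions, not the library's actual signature.

class ExceptContext(object):
    """Minimal sketch of the context manager used in these examples (inferred from usage)."""

    def __init__(self, exception=Exception, func_name=None, errback=None):
        self.exception = exception    # exception type(s) to intercept
        self.func_name = func_name    # label handed to errback (assumed)
        self.errback = errback or (lambda func_name, *exc_info: False)
        self.got_err = False          # set once an exception was caught
        self.err_info = None          # (exc_type, exc_value, traceback)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, tb):
        if exc_type is not None and issubclass(exc_type, self.exception):
            self.got_err = True
            self.err_info = (exc_type, exc_value, tb)
            # A truthy return value suppresses the exception, which is why
            # errback=lambda *args: True in the example above swallows all errors.
            return bool(self.errback(self.func_name, exc_type, exc_value, tb))
        return False

Under this sketch, check() above never lets a proxy-test failure escape, while the ec.got_err / ec.err_info checks in the spider callbacks below can still inspect what went wrong.
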
Example #2
 def reset_proxies(self):
     """
     分发有效代理和无效代理
     :return:
     """
     self.logger.debug("Start resets thread. ")
     while self.alive:
         with ExceptContext(errback=self.log_err):
             proxies = list(self.proxies_check_out_channel.pop_all())
             if proxies:
                 self.logger.debug(f"Got {len(proxies)} proxies to reset.")
                 bp = self.settings.get("BAD_PROXY_HASH", "bad_proxies")
                 gp = self.settings.get("GOOD_PROXY_SET", "good_proxies")
                 for proxy, good in proxies:
                     if good:
                         self.redis_conn.sadd(gp, proxy)
                         self.redis_conn.hdel(bp, proxy)
                     else:
                         count = self.redis_conn.hincrby(bp, proxy)
                         if count > self.settings.get_int("FAILED_TIMES", 5):
                             self.redis_conn.hdel(bp, proxy)
                             self.logger.debug(
                                 f"Abandoned {proxy} after failing {count} checks.")
                         self.redis_conn.srem(gp, proxy)
             else:
                 time.sleep(1)
         time.sleep(1)
     self.logger.debug("Stop resets thread. ")
Example #3
    def parse(self, response):
        with ExceptContext(errback=self.log_err) as ec:
            self.logger.debug("Start response in parse. ")
            item_urls = self.extract_item_urls(response)
            # This field records how many URLs remain after deduplication; if it is empty,
            # a site that paginates by query parameter has probably reached its last page.
            effective_urls = [i for i in item_urls if not (
                self.need_duplicate and self.duplicate_filter(response, i, self.need_duplicate))]
            self.crawler.stats.inc_total_pages(
                response.meta['crawlid'], len(effective_urls))
            yield from self.gen_requests(
                [dict(url=u,
                      errback="errback",
                      meta={
                          "priority": response.meta["priority"]-20,
                          "seed": response.url,
                          "proxy": None
                      })
                 for u in effective_urls], "parse_item", response)

            next_page_url = self.extract_page_url(response, effective_urls, item_urls)
            if next_page_url:
                yield from self.gen_requests([next_page_url], "parse", response)
        if ec.got_err:
            self.crawler.stats.set_failed_download(
                response.meta['crawlid'],
                response.request.url,
                "In parse: " + "".join(traceback.format_exception(*ec.err_info)))
Example #4
    def parse_item(self, response):
        with ExceptContext(errback=self.log_err) as ec:
            response.meta["request_count_per_item"] = 1
            base_loader = self.get_base_loader(response)
            meta = response.request.meta
            self.enrich_base_data(base_loader, response)
            meta["item_collector"] = \
                ItemCollector(Node(None, base_loader, None, self.enrich_data))
            yield self.yield_item_or_req(meta["item_collector"], response)

        if ec.got_err:
            self.crawler.stats.set_failed_download(
                response.meta['crawlid'],
                response.request.url,
                "In parse_item: " + "".join(traceback.format_exception(*ec.err_info)))
Example #5
    def parse_next(self, response):
        if response.status == 999:
            self.logger.error(
                "Partial request error: crawlid:%s, url: %s. " % (
                    response.meta["crawlid"], response.url))
        with ExceptContext(errback=self.log_err) as ec:
            response.meta["request_count_per_item"] = \
                response.meta.get("request_count_per_item", 1) + 1
            yield self.yield_item_or_req(
                response.request.meta["item_collector"], response)

        if ec.got_err:
            self.crawler.stats.set_failed_download(
                response.meta['crawlid'],
                response.request.url,
                "In parse_next: " + "".join(traceback.format_exception(*ec.err_info)))
Example #6
    def start(self):
        self.logger.debug("Start proxy factory. ")
        self.gen_thread(self.check_proxies)
        self.gen_thread(self.bad_source)
        self.gen_thread(self.good_source)
        self.gen_thread(self.reset_proxies)

        while self.alive or any(th.is_alive() for th in self.children):
            with ExceptContext(errback=self.log_err):
                if self.alive:
                    self.logger.debug("Start to fetch proxies. ")
                    proxies = self.fetch_all()
                    self.logger.debug("%s proxies found. " % len(proxies))
                    self.proxies_check_in_channel.update(proxies)

            Blocker(self.settings.get_int("FETCH_INTERVAL", 10 * 60)).\
                wait_timeout_or_notify(notify=lambda: not self.alive)
        self.logger.debug("Stop proxy factory. ")
Example #7
    def good_source(self):
        """
        每隔指定时间间隔将有效代理放到待检查队列进行检查
        :return:
        """
        self.logger.debug("Start good source thread. ")
        while self.alive:
            with ExceptContext(errback=self.log_err):
                proxies = self.redis_conn.smembers(
                    self.settings.get("GOOD_PROXY_SET", "good_proxies"))
                if proxies:
                    self.logger.debug(
                        f"Good proxy count is: {len(proxies)}, ready to check.")
                    self.proxies_check_in_channel.update(proxies)
            Blocker(self.settings.get_int("GOOD_CHECK_INTERVAL", 60 * 5)).\
                wait_timeout_or_notify(notify=lambda: not self.alive)

        self.logger.debug("Stop good source thread. ")
Example #8
    def bad_source(self):
        """
        每隔指定时间间隔将无效代理放到待检查队列进行检查
        :return:
        """
        self.logger.debug("Start bad source thread. ")
        while self.alive:
            if len(self.proxies_check_in_channel):
                # The check queue has not been drained yet; back off instead of busy-spinning.
                time.sleep(1)
                continue

            with ExceptContext(errback=self.log_err):
                proxies = self.redis_conn.hgetall(
                    self.settings.get("BAD_PROXY_HASH", "bad_proxies"))
                if proxies:
                    self.logger.debug(
                        f"Bad proxy count is: {len(proxies)}, ready to check.")
                    while proxies:
                        proxy, times = proxies.popitem()
                        self.proxies_check_in_channel.add(proxy)

            Blocker(self.settings.get_int("BAD_CHECK_INTERVAL", 60 * 5)).\
                wait_timeout_or_notify(notify=lambda: not self.alive)

        self.logger.debug("Stop bad source thread. ")
Example #9
def test_raise():
    with pytest.raises(Exception):
        with ExceptContext(Exception, errback=lambda name, *args: print(name)):
            raise Exception("test. ..")
Example #10
def test_no_raise():
    with ExceptContext(Exception,
                       errback=lambda name, *args: print(name) is None):
        raise Exception("test. ..")
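
Taken together, these two tests pin down the suppression rule: the exception escapes when the errback returns a falsy value (print(name) returns None) and is swallowed when it returns a truthy one (print(name) is None evaluates to True). The same behaviour shown outside pytest, assuming either the real class or the sketch near the top of this page:

# Propagates: the errback returns None, which is falsy.
try:
    with ExceptContext(ValueError, errback=lambda name, *exc_info: None):
        raise ValueError("boom")
except ValueError:
    print("propagated")

# Suppressed: the errback returns True, so execution continues after the block.
with ExceptContext(ValueError, errback=lambda name, *exc_info: True):
    raise ValueError("boom")
print("suppressed")
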
Example #11
 def inc_crawled_pages(self, crawlid):
     with ExceptContext():
         self.redis_conn.hincrby("crawlid:%s" % crawlid, "crawled_pages", 1)
         self.update(crawlid)
Example #12
 def set_total_pages(self, crawlid, num=1):
     with ExceptContext():
         self.redis_conn.hset("crawlid:%s" % crawlid, "total_pages", num)
         self.update(crawlid)
Example #13
 def set_failed(self, crawlid, url, reason, _type="pages"):
     with ExceptContext():
         self.redis_conn.hset("failed_download_%s:%s" % (_type, crawlid),
                              url, reason)
         self.redis_conn.expire("failed_download_%s:%s" % (_type, crawlid),
                                60 * 60 * 24 * 2)
Example #14
 def set_failed_download(self, crawlid, url, reason, _type="pages"):
     with ExceptContext():
         self.redis_conn.hincrby("crawlid:%s" % crawlid,
                                 "failed_download_%s" % _type, 1)
         self.update(crawlid)
         # Argument order follows set_failed(crawlid, url, reason, _type) as defined above.
         self.set_failed(crawlid, url, reason, _type)