def check(self, proxy, good):
    """ Check whether a proxy is usable. """
    with ExceptContext(errback=lambda *args: True):
        if self.is_anonymous(proxy):
            good.add(proxy)
def reset_proxies(self):
    """
    Dispatch checked proxies to the good set or the bad hash.
    :return:
    """
    self.logger.debug("Start resets thread. ")
    while self.alive:
        with ExceptContext(errback=self.log_err):
            proxies = list(self.proxies_check_out_channel.pop_all())
            if proxies:
                self.logger.debug(f"Got {len(proxies)} proxies to reset.")
                bp = self.settings.get("BAD_PROXY_HASH", "bad_proxies")
                gp = self.settings.get("GOOD_PROXY_SET", "good_proxies")
                for proxy, good in proxies:
                    if good:
                        self.redis_conn.sadd(gp, proxy)
                        self.redis_conn.hdel(bp, proxy)
                    else:
                        count = self.redis_conn.hincrby(bp, proxy)
                        if count > self.settings.get_int("FAILED_TIMES", 5):
                            self.redis_conn.hdel(bp, proxy)
                            self.logger.debug(
                                f"Abandon {proxy} after failing {count} times.")
                        self.redis_conn.srem(gp, proxy)
            else:
                time.sleep(1)
        time.sleep(1)
    self.logger.debug("Stop resets thread. ")
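# A dependency-free sketch of the bookkeeping reset_proxies performs, with a plain
# set/dict standing in for the Redis GOOD_PROXY_SET / BAD_PROXY_HASH; route_proxy
# and FAILED_TIMES are illustrative names, not part of the project.
FAILED_TIMES = 5

def route_proxy(good_set, bad_hash, proxy, good):
    """Promote working proxies, count failures, and abandon a proxy once it has
    failed more than FAILED_TIMES checks."""
    if good:
        good_set.add(proxy)
        bad_hash.pop(proxy, None)
    else:
        count = bad_hash.get(proxy, 0) + 1
        bad_hash[proxy] = count
        if count > FAILED_TIMES:
            bad_hash.pop(proxy, None)   # give up on this proxy entirely
        good_set.discard(proxy)         # a failing proxy never stays "good"

good_proxies, bad_proxies = set(), {}
route_proxy(good_proxies, bad_proxies, "1.2.3.4:8080", False)
assert "1.2.3.4:8080" not in good_proxies and bad_proxies["1.2.3.4:8080"] == 1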
def parse(self, response):
    with ExceptContext(errback=self.log_err) as ec:
        self.logger.debug("Start response in parse. ")
        item_urls = self.extract_item_urls(response)
        # Keep a separate count of the URLs left after de-duplication: if it is
        # empty, a site that paginates by query parameter has probably reached
        # its last page.
        effective_urls = [i for i in item_urls if not (
            self.need_duplicate and
            self.duplicate_filter(response, i, self.need_duplicate))]
        self.crawler.stats.inc_total_pages(
            response.meta['crawlid'], len(effective_urls))
        yield from self.gen_requests(
            [dict(url=u, errback="errback", meta={
                "priority": response.meta["priority"] - 20,
                "seed": response.url,
                "proxy": None
            }) for u in effective_urls],
            "parse_item", response)
        next_page_url = self.extract_page_url(response, effective_urls, item_urls)
        if next_page_url:
            yield from self.gen_requests([next_page_url], "parse", response)
    if ec.got_err:
        self.crawler.stats.set_failed_download(
            response.meta['crawlid'], response.request.url,
            "In parse: " + "".join(traceback.format_exception(*ec.err_info)))
def parse_item(self, response):
    with ExceptContext(errback=self.log_err) as ec:
        response.meta["request_count_per_item"] = 1
        base_loader = self.get_base_loader(response)
        meta = response.request.meta
        self.enrich_base_data(base_loader, response)
        meta["item_collector"] = \
            ItemCollector(Node(None, base_loader, None, self.enrich_data))
        yield self.yield_item_or_req(meta["item_collector"], response)
    if ec.got_err:
        self.crawler.stats.set_failed_download(
            response.meta['crawlid'], response.request.url,
            "In parse_item: " + "".join(traceback.format_exception(*ec.err_info)))
def parse_next(self, response):
    if response.status == 999:
        self.logger.error(
            "Partial request error: crawlid:%s, url: %s. " % (
                response.meta["crawlid"], response.url))
    with ExceptContext(errback=self.log_err) as ec:
        response.meta["request_count_per_item"] = \
            response.meta.get("request_count_per_item", 1) + 1
        yield self.yield_item_or_req(
            response.request.meta["item_collector"], response)
    if ec.got_err:
        self.crawler.stats.set_failed_download(
            response.meta['crawlid'], response.request.url,
            "In parse_next: " + "".join(traceback.format_exception(*ec.err_info)))
def start(self):
    self.logger.debug("Start proxy factory. ")
    self.gen_thread(self.check_proxies)
    self.gen_thread(self.bad_source)
    self.gen_thread(self.good_source)
    self.gen_thread(self.reset_proxies)
    while self.alive or any(th.is_alive() for th in self.children):
        with ExceptContext(errback=self.log_err):
            if self.alive:
                self.logger.debug("Start to fetch proxies. ")
                proxies = self.fetch_all()
                self.logger.debug("%s proxies found. " % len(proxies))
                self.proxies_check_in_channel.update(proxies)
        Blocker(self.settings.get_int("FETCH_INTERVAL", 10 * 60)).\
            wait_timeout_or_notify(notify=lambda: not self.alive)
    self.logger.debug("Stop proxy factory. ")
def good_source(self):
    """
    At a fixed interval, put the good proxies back onto the pending-check queue
    so they are re-verified.
    :return:
    """
    self.logger.debug("Start good source thread. ")
    while self.alive:
        with ExceptContext(errback=self.log_err):
            proxies = self.redis_conn.smembers(
                self.settings.get("GOOD_PROXY_SET", "good_proxies"))
            if proxies:
                self.logger.debug(
                    f"Good proxy count is: {len(proxies)}, ready to check.")
                self.proxies_check_in_channel.update(proxies)
        Blocker(self.settings.get_int("GOOD_CHECK_INTERVAL", 60 * 5)).\
            wait_timeout_or_notify(notify=lambda: not self.alive)
    self.logger.debug("Stop good source thread. ")
def bad_source(self):
    """
    At a fixed interval, put the bad proxies back onto the pending-check queue
    so they are re-verified.
    :return:
    """
    self.logger.debug("Start bad source thread. ")
    while self.alive:
        if len(self.proxies_check_in_channel):
            # The check-in queue is still busy; yield briefly instead of spinning.
            time.sleep(1)
            continue
        with ExceptContext(errback=self.log_err):
            proxies = self.redis_conn.hgetall(
                self.settings.get("BAD_PROXY_HASH", "bad_proxies"))
            if proxies:
                self.logger.debug(
                    f"Bad proxy count is: {len(proxies)}, ready to check.")
                while proxies:
                    proxy, times = proxies.popitem()
                    self.proxies_check_in_channel.add(proxy)
        Blocker(self.settings.get_int("BAD_CHECK_INTERVAL", 60 * 5)).\
            wait_timeout_or_notify(notify=lambda: not self.alive)
    self.logger.debug("Stop bad source thread. ")
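# Blocker comes from the project's toolkit; the sketch below is only an
# illustrative reconstruction of the contract the worker threads above rely on:
# block for the configured interval, but wake early as soon as the notify
# callable returns True (here, when self.alive flips to False).
import time

class BlockerSketch:
    def __init__(self, timeout):
        self.timeout = timeout

    def wait_timeout_or_notify(self, notify=lambda: False, step=1):
        waited = 0
        while waited < self.timeout and not notify():
            time.sleep(step)
            waited += step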
def test_raise():
    # The errback returns None (print returns None), so the exception propagates.
    with pytest.raises(Exception):
        with ExceptContext(Exception, errback=lambda name, *args: print(name)):
            raise Exception("test. ..")
def test_no_raise():
    # The errback returns True (`print(name) is None` is always True),
    # so ExceptContext suppresses the exception and nothing is raised.
    with ExceptContext(Exception, errback=lambda name, *args: print(name) is None):
        raise Exception("test. ..")
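# The two tests above pin down the behaviour the rest of this code assumes from
# ExceptContext. The real class lives in the project's toolkit; this sketch is
# only an illustrative reconstruction: the errback receives the exception info,
# a truthy return value suppresses the exception, and got_err / err_info record
# what happened for callers such as parse() and parse_item().
class ExceptContextSketch:
    def __init__(self, exception=Exception, errback=lambda *args: True):
        self.exception = exception
        self.errback = errback
        self.got_err = False
        self.err_info = None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        if exc_type is not None and issubclass(exc_type, self.exception):
            self.got_err = True
            self.err_info = (exc_type, exc_val, exc_tb)
            # truthy -> swallow (test_no_raise); falsy -> re-raise (test_raise)
            return bool(self.errback(exc_type, exc_val, exc_tb))
        return False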
def inc_crawled_pages(self, crawlid):
    with ExceptContext():
        self.redis_conn.hincrby("crawlid:%s" % crawlid, "crawled_pages", 1)
        self.update(crawlid)
def set_total_pages(self, crawlid, num=1):
    with ExceptContext():
        self.redis_conn.hset("crawlid:%s" % crawlid, "total_pages", num)
        self.update(crawlid)
def set_failed(self, crawlid, url, reason, _type="pages"):
    with ExceptContext():
        self.redis_conn.hset("failed_download_%s:%s" % (_type, crawlid), url, reason)
        self.redis_conn.expire("failed_download_%s:%s" % (_type, crawlid), 60 * 60 * 24 * 2)
def set_failed_download(self, crawlid, url, reason, _type="pages"):
    with ExceptContext():
        self.redis_conn.hincrby("crawlid:%s" % crawlid, "failed_download_%s" % _type, 1)
        self.update(crawlid)
        # Pass arguments in the order set_failed(crawlid, url, reason, _type) expects.
        self.set_failed(crawlid, url, reason, _type)
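# A hedged sketch of the Redis layout the four stats helpers above appear to
# maintain, written with plain redis-py calls; the crawlid, URL, and the
# fakeredis client (used only to keep the example runnable without a server)
# are illustrative, not taken from the project.
import fakeredis

r = fakeredis.FakeStrictRedis(decode_responses=True)
crawlid = "example"
r.hset("crawlid:%s" % crawlid, "total_pages", 10)                 # set_total_pages
r.hincrby("crawlid:%s" % crawlid, "crawled_pages", 1)             # inc_crawled_pages
r.hincrby("crawlid:%s" % crawlid, "failed_download_pages", 1)     # set_failed_download
r.hset("failed_download_pages:%s" % crawlid,                      # set_failed
       "http://example.com/item/1", "timeout")
r.expire("failed_download_pages:%s" % crawlid, 60 * 60 * 24 * 2)  # keep for two days
print(r.hgetall("crawlid:%s" % crawlid))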