def parse(self, response):
        with ExceptContext(errback=self.log_err) as ec:
            self.logger.debug("Start response in parse. ")
            item_urls = self.extract_item_urls(response)
            # Track how many URLs remain after de-duplication: if the list is
            # empty, a site paginated via URL parameters may have reached its
            # last page.
            effective_urls = [i for i in item_urls if not (
                self.need_duplicate and self.duplicate_filter(response, i, self.need_duplicate))]
            self.crawler.stats.inc_total_pages(
                response.meta['crawlid'], len(effective_urls))
            yield from self.gen_requests(
                [dict(url=u,
                      errback="errback",
                      meta={
                          "priority": response.meta["priority"]-20,
                          "seed": response.url,
                          "proxy": None
                      })
                 for u in effective_urls], "parse_item", response)

            next_page_url = self.extract_page_url(response, effective_urls, item_urls)
            if next_page_url:
                yield from self.gen_requests([next_page_url], "parse", response)
        if ec.got_err:
            self.crawler.stats.set_failed_download(
                response.meta['crawlid'],
                response.request.url,
                "In parse: " + "".join(traceback.format_exception(*ec.err_info)))
Example #2
    def check_proxies(self):
        """
        Check the proxies waiting in the check-in queue.
        :return:
        """
        self.logger.debug("Start check thread. ")
        while self.alive:
            with ExceptContext(errback=self.log_err):
                proxies = list(self.proxies_check_in_set.pop_all())
                if proxies:
                    self.logger.debug("Got %s proxies to check. " % len(proxies))
                    proxies = [proxy.decode() if isinstance(proxy, bytes) else proxy for proxy in proxies]
                    good = set()  # check() records working proxies via good.add()
                    for i in range(0, len(proxies), 150):
                        # Check in batches of 150.
                        thread_list = []
                        for proxy in proxies[i: i+150]:
                            th = Thread(target=self.check, args=(proxy, good))
                            th.daemon = True
                            th.start()
                            thread_list.append(th)
                        start_time = time.time()
                        while [thread for thread in thread_list if thread.is_alive()] and start_time + 60 > time.time():
                            time.sleep(1)

                    self.logger.debug("%s proxies is good. " % (len(good)))
                    self.proxies_check_out_set.update(dict((proxy, proxy in good) for proxy in proxies))
                else:
                    time.sleep(1)
            time.sleep(1)
        self.logger.debug("Stop check thread. ")
Example #3
 def reset_proxies(self):
     """
     Dispatch valid and invalid proxies.
     :return:
     """
     self.logger.debug("Start resets thread. ")
     while self.alive:
         with ExceptContext(errback=self.log_err):
             proxies = list(self.proxies_check_out_channel.pop_all())
             if proxies:
                 self.logger.debug("Got %s proxies to reset. " %
                                   len(proxies))
                 for proxy, good in proxies:
                     if good:
                         self.redis_conn.sadd("good_proxies", proxy)
                         self.redis_conn.hdel("bad_proxies", proxy)
                     else:
                         count = self.redis_conn.hincrby(
                             "bad_proxies", proxy)
                         if count > self.settings.get_int(
                                 "FAILED_TIMES", 5):
                             self.redis_conn.hdel("bad_proxies", proxy)
                             self.logger.debug(
                                 "Abandon %s of failed for %s times. " %
                                 (proxy, count))
                         self.redis_conn.srem("good_proxies", proxy)
             else:
                 time.sleep(1)
         time.sleep(1)
     self.logger.debug("Stop resets thread. ")
Example #4
 def check(self, proxy, good):
     """
      Check whether a proxy is usable.
     """
     with ExceptContext(errback=lambda *args: True):
         if self.check_method(proxy):
             good.add(proxy)
Example #5
 def receive(self):
     while self.alive:
         with ExceptContext():
             messages = self.hash_set.rid_all()
             # When `messages` is large, pushing everything into output_channel
             # in one call would take a long time; because output_channel is
             # thread-safe, all other threads would block on it meanwhile.
             for start in range(0, len(messages), 1000):
                 self.output_channel.push(*messages[start:start + 1000])
                 time.sleep(0.01)
         time.sleep(1)
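
The fixed-step slicing above is the usual "process in chunks" idiom; factored into a helper (hypothetical name) it reads:

def chunks(seq, size=1000):
    """Yield successive size-sized slices of seq."""
    for start in range(0, len(seq), size):
        yield seq[start:start + size]

# receive() above is then equivalent to:
#     for batch in chunks(messages):
#         self.output_channel.push(*batch)
#         time.sleep(0.01)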
Example #6
    def parse_item(self, response):
        with ExceptContext(errback=self.log_err) as ec:
            response.meta["request_count_per_item"] = 1
            base_loader = self.get_base_loader(response)
            meta = response.request.meta
            self.enrich_base_data(base_loader, response)
            meta["item_collector"] = \
                ItemCollector(Node(None, base_loader, None, self.enrich_data))
            yield self.yield_item_or_req(meta["item_collector"], response)

        if ec.got_err:
            self.crawler.stats.set_failed_download(
                response.meta['crawlid'], response.request.url,
                "In parse_item: " + traceback.format_exc())
Example #7
    def parse_next(self, response):
        if response.status == 999:
            self.logger.error("Partial request error: crawlid:%s, url: %s. " %
                              (response.meta["crawlid"], response.url))
        with ExceptContext(errback=self.log_err) as ec:
            response.meta["request_count_per_item"] = \
                response.meta.get("request_count_per_item", 1) + 1
            yield self.yield_item_or_req(
                response.request.meta["item_collector"], response)

        if ec.got_err:
            self.crawler.stats.set_failed_download(
                response.meta['crawlid'], response.request.url,
                "In parse_next: " + traceback.format_exc())
Example #8
 def good_source(self):
     """
      Periodically move the valid proxies into the check-in queue to be re-checked.
     :return:
     """
     self.logger.debug("Start good source thread. ")
     while self.alive:
         with Blocker(self.settings.get_int("GOOD_CHECK_INTERVAL", 60 * 5),
                      self, notify=lambda instance: not instance.alive) as blocker:
             if blocker.is_notified:
                 continue
             with ExceptContext(errback=self.log_err):
                 proxies = self.redis_conn.smembers("good_proxies")
                 if proxies:
                     self.logger.debug("Good proxy count is : %s, ready to check. " % len(proxies))
                     self.proxies_check_in_set.update(proxies)
     self.logger.debug("Stop good source thread. ")
Example #9
 def start(self):
     self.logger.debug("Start proxy factory. ")
     self.gen_thread(self.check_proxies)
     self.gen_thread(self.bad_source)
     self.gen_thread(self.good_source)
     self.gen_thread(self.reset_proxies)
     is_started = False
     while self.alive or [thread for thread in self.children if thread.is_alive()]:
         with Blocker(self.settings.get_int("FETCH_INTERVAL", 10 * 60),
                      self, notify=lambda instance: not instance.alive, immediately=not is_started) as blocker:
             if blocker.is_notified:
                 continue
             with ExceptContext(errback=self.log_err):
                 if self.alive:
                     self.logger.debug("Start to fetch proxies. ")
                     proxies = self.fetch_all()
                     self.logger.debug("%s proxies found. " % len(proxies))
                     self.proxies_check_in_set.update(proxies)
         is_started = True
     self.logger.debug("Stop proxy factory. ")
Example #10
    def bad_source(self):
        """
        Periodically move the invalid proxies into the check-in queue to be re-checked.
        :return:
        """
        self.logger.debug("Start bad source thread. ")
        while self.alive:
            with Blocker(self.settings.get_int("BAD_CHECK_INTERVAL", 60 * 5),
                         self, notify=lambda instance: not instance.alive) as blocker:
                if blocker.is_notified or len(self.proxies_check_in_channel):
                    continue
                with ExceptContext(errback=self.log_err):
                    proxies = self.redis_conn.hgetall("bad_proxies")
                    if proxies:
                        self.logger.debug("Bad proxy count is : %s, ready to check. " % len(proxies))
                        while proxies:
                            proxy, times = proxies.popitem()
                            self.proxies_check_in_channel.add(proxy)

        self.logger.debug("Stop bad source thread. ")
Example #11
 def bad_source(self):
     """
      Periodically move the invalid proxies into the check-in queue to be re-checked.
     :return:
     """
     self.logger.debug("Start bad source thread. ")
     while self.alive:
         with Blocker(self.settings.get_int("BAD_CHECK_INTERVAL", 60 * 5),
                      self, notify=lambda instance: not instance.alive) as blocker:
             if blocker.is_notified:
                 continue
             with ExceptContext(errback=self.log_err):
                 proxies = self.redis_conn.hgetall("bad_proxies")
                 if proxies:
                     self.logger.debug("Bad proxy count is : %s, ready to check. " % len(proxies))
                     for proxy, times in proxies.items():
                         if int(times) > self.settings.get_int("FAILED_TIMES", 5):
                             self.redis_conn.hdel("bad_proxies", proxy)
                             self.logger.debug("Abandon %s of failed for %s times. " % (proxy, times))
                     self.proxies_check_in_set.update(proxies.keys())
     self.logger.debug("Stop bad source thread. ")
Example #12
 def reset_proxies(self):
     """
      Dispatch valid and invalid proxies.
     :return:
     """
     self.logger.debug("Start resets thread. ")
     while self.alive:
         with ExceptContext(errback=self.log_err):
             proxies = list(self.proxies_check_out_set.pop_all())
             if proxies:
                 self.logger.debug("Got %s proxies to reset. " % len(proxies))
                 for proxy, good in proxies:
                     if good:
                         self.redis_conn.sadd("good_proxies", proxy)
                         self.redis_conn.hdel("bad_proxies", proxy)
                     else:
                         self.redis_conn.hincrby("bad_proxies", proxy)
                         self.redis_conn.srem("good_proxies", proxy)
             else:
                 time.sleep(1)
         time.sleep(1)
     self.logger.debug("Stop resets thread. ")
Example #13
 def start(self):
     with ExceptContext(errback=lambda *args: self.log_err(*args) and self.stop() is None, finalback=self.close):
         retriever = Thread(target=self.retrieve_messages)
         self.children.append(retriever)
         retriever.start()
         while self.alive or [thread for thread in self.children if thread.is_alive()]:
             with Blocker(self.settings.get_int("INTERVAL", 10 * 60), self, notify=lambda instance: not instance.alive) as blocker:
                 if blocker.is_notified:
                     continue
                 print("Heath or not: %s" % self.task_scheduler.is_health(), blocker.is_notified, [thread for thread in self.children if thread.is_alive()], flush=True)
                 need_crawl_messages = self.consume_messages(self.settings.get_int("CONSUME_MAX_COUNT", 100000))
                 if need_crawl_messages and self.alive:
                     self.logger.debug("Got %s messages to build crawl task. "%len(need_crawl_messages))
                     for site, messages in groupby(
                             need_crawl_messages, lambda message: message["source_site_code"]).items():
                         if not site:
                             continue
                         if self.can_do(site):
                             # Send to jay-monitor in separate batches (size from TERM_COUNT).
                             while messages and self.alive:
                                 count = 0
                                 sub_messages = []
                                 while messages and count < self.settings.get_int("TERM_COUNT", 5000):
                                     sub_messages.append(messages.pop())
                                     count += 1
                                 self.send(site, sub_messages)
                                 time.sleep(1)
                         # messages is non-empty here only when self.alive has
                         # turned False; to avoid losing data, push the messages
                         # back to the task scheduler (same below).
                         if messages:
                             #print("deep 2 len: %s" % len(messages))
                             self.task_scheduler.push(*map(
                                 lambda message: pickle.dumps(message), messages), delay=False)
                 else:
                     #print("deep 1 len: %s" % len(need_crawl_messages))
                     if need_crawl_messages:
                         self.task_scheduler.push(*map(
                             lambda message: pickle.dumps(message), need_crawl_messages), delay=False)
                     self.logger.debug("Haven't got expired message. ")
             print("Heath or not: %s" % self.task_scheduler.is_health(), flush=True)
Example #14
 def send(self):
     while self.alive:
         data = self.input_channel.pop()
         if not data:
             time.sleep(1)
             continue
         if not self.alive:
             self.push(data)
             continue
         message = pickle.loads(data)
         if message.get("event") == "delete":
             del self.hash_set[message["data"]["roc_id"]]
             continue
         delay_date = message.get("delay_date")
         if delay_date:
             del message["delay_date"]
             expect = delay_date.timestamp() * 1000 if isinstance(
                 delay_date, datetime) else delay_date
         else:
             delay = self.default_delay
             expect = time.time() * 1000 + delay
         with ExceptContext(errback=lambda *args: self.push(data) is None):
             self.hash_set.push(pickle.dumps(message),
                                self.identity(message), expect)
Example #15
 def stop(self, *args):
     self.task_scheduler.alive = False
     self.alive = False
     with ExceptContext(errback=lambda *args: True):
         if self.consumer:
             self.consumer.close()
Example #16
 def set_failed(self, crawlid, url, reason, _type="pages"):
     with ExceptContext():
         self.redis_conn.hset("failed_download_%s:%s" % (_type, crawlid), url, reason)
         self.redis_conn.expire("failed_download_%s:%s" % (_type, crawlid), 60 * 60 * 24 * 2)
Example #17
 def test_raise(self):
     with self.assertRaises(Exception):
         with ExceptContext(Exception, errback=lambda name, *args: print(name)):
             raise Exception("test. ..")
Example #18
 def fun():
     with ExceptContext(Exception, errback=lambda name, *args: print(name) is None):
         raise Exception("test. ..")
Example #19
 def set_total_pages(self, crawlid, num=1):
     with ExceptContext():
         self.redis_conn.hset("crawlid:%s" % crawlid, "total_pages", num)
         self.update(crawlid)
Example #20
 def inc_crawled_pages(self, crawlid):
     with ExceptContext():
         self.redis_conn.hincrby("crawlid:%s" % crawlid, "crawled_pages", 1)
         self.update(crawlid)
Example #21
def test_raise():
    with pytest.raises(Exception):
        with ExceptContext(Exception, errback=lambda name, *args: print(name)):
            raise Exception("test. ..")
Example #22
 def set_failed_download(self, crawlid, url, reason, _type="pages"):
     with ExceptContext():
         self.redis_conn.hincrby("crawlid:%s" % crawlid, "failed_download_%s" % _type, 1)
         self.update(crawlid)
          self.set_failed(crawlid, url, reason, _type)