Beispiel #1
0
 def good_source(self):
     """
     Re-queue the known-good proxies for checking at a fixed interval.

     While ``self.alive`` is true, each pass is gated by a ``Blocker`` built
     from the ``GOOD_CHECK_INTERVAL`` setting (default ``60 * 5``). When the
     pass was not cut short by a notification, every member of the
     ``good_proxies`` Redis set is added to ``self.proxies_check_in_set``.
     The Redis work runs inside ``ExceptContext`` with ``self.log_err`` as
     its errback.

     :return: None
     """
     def _stopped(instance):
         # Predicate handed to Blocker as ``notify``.
         return not instance.alive

     self.logger.debug("Start good source thread. ")
     while self.alive:
         interval = self.settings.get_int("GOOD_CHECK_INTERVAL", 60 * 5)
         # NOTE(review): presumably Blocker waits up to ``interval`` and sets
         # ``is_notified`` once ``notify(self)`` turns true -- confirm.
         with Blocker(interval, self, notify=_stopped) as blocker:
             if not blocker.is_notified:
                 with ExceptContext(errback=self.log_err):
                     good = self.redis_conn.smembers("good_proxies")
                     if good:
                         message = "Good proxy count is : %s, ready to check. " % len(good)
                         self.logger.debug(message)
                         self.proxies_check_in_set.update(good)
     self.logger.debug("Stop good source thread. ")
Beispiel #2
0
 def start(self):
     """
     Launch the worker threads and run the proxy-fetching loop.

     Four workers are handed to ``self.gen_thread`` in a fixed order
     (``check_proxies``, ``bad_source``, ``good_source``, ``reset_proxies``).
     The loop then keeps running while ``self.alive`` is true or any thread
     in ``self.children`` is still alive. Each pass is gated by a ``Blocker``
     built from the ``FETCH_INTERVAL`` setting (default ``10 * 60``); the
     result of ``self.fetch_all()`` is added to ``self.proxies_check_in_set``.

     :return: None
     """
     self.logger.debug("Start proxy factory. ")
     # Resolve and spawn one worker at a time, keeping the original order.
     for worker_name in ("check_proxies", "bad_source", "good_source", "reset_proxies"):
         self.gen_thread(getattr(self, worker_name))

     first_pass = True
     while self.alive or any(child.is_alive() for child in self.children):
         interval = self.settings.get_int("FETCH_INTERVAL", 10 * 60)
         # ``immediately`` is only true until one pass has completed without
         # being notified. NOTE(review): presumably it makes Blocker skip the
         # initial wait -- confirm against Blocker.
         with Blocker(interval, self, notify=lambda instance: not instance.alive,
                      immediately=first_pass) as blocker:
             if blocker.is_notified:
                 # Deliberately skips the flag flip below.
                 continue
             with ExceptContext(errback=self.log_err):
                 if self.alive:
                     self.logger.debug("Start to fetch proxies. ")
                     fetched = self.fetch_all()
                     self.logger.debug("%s proxies found. " % len(fetched))
                     self.proxies_check_in_set.update(fetched)
         first_pass = False
     self.logger.debug("Stop proxy factory. ")
Beispiel #3
0
    def bad_source(self):
        """
        Re-queue the failed proxies for checking at a fixed interval.

        While ``self.alive`` is true, each pass is gated by a ``Blocker``
        built from the ``BAD_CHECK_INTERVAL`` setting (default ``60 * 5``).
        A pass is skipped when the blocker was notified or when
        ``self.proxies_check_in_channel`` still holds entries. Otherwise
        every field name of the ``bad_proxies`` Redis hash is added to that
        channel.

        :return: None
        """
        self.logger.debug("Start bad source thread. ")
        while self.alive:
            interval = self.settings.get_int("BAD_CHECK_INTERVAL", 60 * 5)
            with Blocker(interval, self, notify=lambda instance: not instance.alive) as blocker:
                if blocker.is_notified:
                    continue
                # Do not pile more work on top of a channel that is not drained.
                if len(self.proxies_check_in_channel) > 0:
                    continue
                with ExceptContext(errback=self.log_err):
                    failed = self.redis_conn.hgetall("bad_proxies")
                    if not failed:
                        continue
                    self.logger.debug("Bad proxy count is : %s, ready to check. " % len(failed))
                    # Drain the local mapping; the stored failure count is
                    # not needed here.
                    while failed:
                        proxy, _ = failed.popitem()
                        self.proxies_check_in_channel.add(proxy)

        self.logger.debug("Stop bad source thread. ")
Beispiel #4
0
 def bad_source(self):
     """
     Re-queue failed proxies for checking and drop the hopeless ones.

     While ``self.alive`` is true, each pass is gated by a ``Blocker`` built
     from the ``BAD_CHECK_INTERVAL`` setting (default ``60 * 5``). When the
     pass was not cut short by a notification, the ``bad_proxies`` Redis
     hash is read. Entries whose stored count exceeds the ``FAILED_TIMES``
     setting (default ``5``) are deleted from the hash and logged as
     abandoned; all remaining entries are added to
     ``self.proxies_check_in_set``.

     :return: None
     """
     self.logger.debug("Start bad source thread. ")
     while self.alive:
         with Blocker(self.settings.get_int("BAD_CHECK_INTERVAL", 60 * 5),
                      self, notify=lambda instance: not instance.alive) as blocker:
             if blocker.is_notified:
                 continue
             with ExceptContext(errback=self.log_err):
                 proxies = self.redis_conn.hgetall("bad_proxies")
                 if proxies:
                     self.logger.debug("Bad proxy count is : %s, ready to check. " % len(proxies))
                     # The threshold does not change within one pass, so read
                     # it once instead of once per proxy.
                     max_failures = self.settings.get_int("FAILED_TIMES", 5)
                     to_check = []
                     for proxy, times in proxies.items():
                         if int(times) > max_failures:
                             self.redis_conn.hdel("bad_proxies", proxy)
                             self.logger.debug("Abandon %s of failed for %s times. " % (proxy, times))
                         else:
                             to_check.append(proxy)
                     # Only proxies that were NOT abandoned are queued again;
                     # previously the abandoned ones were queued as well.
                     if to_check:
                         self.proxies_check_in_set.update(to_check)
     self.logger.debug("Stop bad source thread. ")
Beispiel #5
0
 def start(self):
     """
     Start the retriever thread and dispatch consumed messages per site.

     The whole body runs inside ``ExceptContext`` with ``self.close`` as
     ``finalback``. A thread targeting ``self.retrieve_messages`` is started
     and recorded in ``self.children``. The loop then runs while
     ``self.alive`` is true or any child thread is still alive. Each pass is
     gated by a ``Blocker`` built from the ``INTERVAL`` setting (default
     ``10 * 60``). Up to ``CONSUME_MAX_COUNT`` (default ``100000``) messages
     are consumed, grouped by their ``"source_site_code"`` key and sent via
     ``self.send`` in batches of at most ``TERM_COUNT`` (default ``5000``).
     Messages that were not sent are pushed back to ``self.task_scheduler``.

     :return: None
     """
     def _on_error(*args):
         # ``self.stop()`` only runs when ``self.log_err`` returns a truthy
         # value; the combined result is what the errback returns.
         return self.log_err(*args) and self.stop() is None

     with ExceptContext(errback=_on_error, finalback=self.close):
         worker = Thread(target=self.retrieve_messages)
         self.children.append(worker)
         worker.start()
         while self.alive or any(child.is_alive() for child in self.children):
             interval = self.settings.get_int("INTERVAL", 10 * 60)
             with Blocker(interval, self, notify=lambda instance: not instance.alive) as blocker:
                 if blocker.is_notified:
                     # Also skips the status print after the ``with`` block.
                     continue
                 health_line = "Heath or not: %s" % self.task_scheduler.is_health()
                 notified = blocker.is_notified
                 living = [thread for thread in self.children if thread.is_alive()]
                 print(health_line, notified, living, flush=True)

                 max_count = self.settings.get_int("CONSUME_MAX_COUNT", 100000)
                 consumed = self.consume_messages(max_count)
                 if consumed and self.alive:
                     self.logger.debug("Got %s messages to build crawl task. "%len(consumed))
                     # ``groupby`` here returns a mapping (``.items()`` is
                     # called on its result), i.e. it is not
                     # ``itertools.groupby``.
                     by_site = groupby(consumed, lambda message: message["source_site_code"])
                     for site, pending in by_site.items():
                         if not site:
                             continue
                         if self.can_do(site):
                             # Send in batches, one ``send`` call per batch
                             # with a one second pause after each.
                             while pending and self.alive:
                                 batch = []
                                 while pending and len(batch) < self.settings.get_int("TERM_COUNT", 5000):
                                     batch.append(pending.pop())
                                 self.send(site, batch)
                                 time.sleep(1)
                         # Whatever is left (``can_do`` refused the site, or
                         # ``self.alive`` went false mid-way) is pushed back
                         # so that the messages are not lost.
                         if pending:
                             self.task_scheduler.push(
                                 *[pickle.dumps(item) for item in pending], delay=False)
                 else:
                     # Either nothing was consumed or we are shutting down;
                     # in the latter case return the messages to the scheduler.
                     if consumed:
                         self.task_scheduler.push(
                             *[pickle.dumps(item) for item in consumed], delay=False)
                     self.logger.debug("Haven't got expired message. ")
             print("Heath or not: %s" % self.task_scheduler.is_health(), flush=True)