def good_source(self):
    """
    Periodically push every currently-good proxy back into the check-in
    set so it gets re-validated.

    Wakes up every GOOD_CHECK_INTERVAL seconds (default 5 minutes) until
    the instance is no longer alive.
    """
    self.logger.debug("Start good source thread. ")
    while self.alive:
        interval = self.settings.get_int("GOOD_CHECK_INTERVAL", 60 * 5)
        with Blocker(interval, self, notify=lambda instance: not instance.alive) as waiter:
            # A notified wake-up means shutdown was requested mid-wait.
            if waiter.is_notified:
                continue
            with ExceptContext(errback=self.log_err):
                members = self.redis_conn.smembers("good_proxies")
                if not members:
                    continue
                self.logger.debug("Good proxy count is : %s, ready to check. " % len(members))
                self.proxies_check_in_set.update(members)
    self.logger.debug("Stop good source thread. ")
def start(self):
    """
    Launch the worker threads and run the proxy-fetch loop.

    Spawns the checker/source/reset threads, then fetches fresh proxies
    every FETCH_INTERVAL seconds (first pass runs immediately) until the
    instance stops and all child threads have exited.
    """
    self.logger.debug("Start proxy factory. ")
    for worker in (self.check_proxies, self.bad_source, self.good_source, self.reset_proxies):
        self.gen_thread(worker)
    first_pass_done = False
    # Keep looping while alive, or while any child thread is still running.
    while self.alive or [t for t in self.children if t.is_alive()]:
        with Blocker(self.settings.get_int("FETCH_INTERVAL", 10 * 60), self,
                     notify=lambda instance: not instance.alive,
                     immediately=not first_pass_done) as waiter:
            if waiter.is_notified:
                continue
            with ExceptContext(errback=self.log_err):
                if self.alive:
                    self.logger.debug("Start to fetch proxies. ")
                    fetched = self.fetch_all()
                    self.logger.debug("%s proxies found. " % len(fetched))
                    self.proxies_check_in_set.update(fetched)
            first_pass_done = True
    self.logger.debug("Stop proxy factory. ")
def bad_source(self):
    """
    Periodically feed failed proxies back into the check channel so they
    get re-validated.

    Wakes up every BAD_CHECK_INTERVAL seconds (default 5 minutes); skips
    a round while the check channel still has pending entries.
    """
    self.logger.debug("Start bad source thread. ")
    while self.alive:
        with Blocker(self.settings.get_int("BAD_CHECK_INTERVAL", 60 * 5), self,
                     notify=lambda instance: not instance.alive) as waiter:
            # Skip on shutdown notification, or while previous work is
            # still queued in the channel.
            if waiter.is_notified or len(self.proxies_check_in_channel):
                continue
            with ExceptContext(errback=self.log_err):
                bad = self.redis_conn.hgetall("bad_proxies")
                if bad:
                    self.logger.debug("Bad proxy count is : %s, ready to check. " % len(bad))
                    # Failure counts are irrelevant here; queue every key.
                    for proxy in bad:
                        self.proxies_check_in_channel.add(proxy)
    self.logger.debug("Stop bad source thread. ")
def bad_source(self):
    """
    Periodically re-queue failed proxies for re-validation.

    Every BAD_CHECK_INTERVAL seconds (default 5 minutes) reads the
    "bad_proxies" hash (proxy -> failure count). Proxies that failed more
    than FAILED_TIMES are deleted from the hash and abandoned; the
    remaining ones are pushed into ``proxies_check_in_set`` to be checked
    again.
    """
    self.logger.debug("Start bad source thread. ")
    while self.alive:
        with Blocker(self.settings.get_int("BAD_CHECK_INTERVAL", 60 * 5), self,
                     notify=lambda instance: not instance.alive) as blocker:
            # A notified wake-up means shutdown was requested mid-wait.
            if blocker.is_notified:
                continue
            with ExceptContext(errback=self.log_err):
                proxies = self.redis_conn.hgetall("bad_proxies")
                if proxies:
                    self.logger.debug("Bad proxy count is : %s, ready to check. " % len(proxies))
                    # Hoisted out of the loop: the limit is invariant per round.
                    failed_limit = self.settings.get_int("FAILED_TIMES", 5)
                    recheck = []
                    for proxy, times in proxies.items():
                        if int(times) > failed_limit:
                            # Permanently discard proxies that failed too often.
                            self.redis_conn.hdel("bad_proxies", proxy)
                            self.logger.debug("Abandon %s of failed for %s times. " % (proxy, times))
                        else:
                            recheck.append(proxy)
                    # BUG FIX: previously update() re-queued ALL keys, including
                    # the proxies just abandoned above; only survivors go back.
                    self.proxies_check_in_set.update(recheck)
    self.logger.debug("Stop bad source thread. ")
def start(self):
    """
    Main scheduler loop: retrieve messages, group them by source site, and
    dispatch them in batches to the per-site senders.

    Runs until the instance stops and all child threads have exited. On
    shutdown, messages that could not be sent are pickled and pushed back
    onto the task scheduler so nothing is lost.
    """
    # errback: log the error, then stop; `and ... is None` keeps the lambda's
    # return value as log_err's result (stop() presumably returns None).
    # NOTE(review): the suppression semantics depend on ExceptContext — confirm.
    with ExceptContext(errback=lambda *args: self.log_err(*args) and self.stop() is None, finalback=self.close):
        retriever = Thread(target=self.retrieve_messages)
        self.children.append(retriever)
        retriever.start()
        # Keep looping while alive, or while any child thread still runs.
        while self.alive or [thread for thread in self.children if thread.is_alive()]:
            with Blocker(self.settings.get_int("INTERVAL", 10 * 60), self, notify=lambda instance: not instance.alive) as blocker:
                # A notified wake-up means shutdown was requested mid-wait.
                if blocker.is_notified:
                    continue
                print("Heath or not: %s" % self.task_scheduler.is_health(), blocker.is_notified, [thread for thread in self.children if thread.is_alive()], flush=True)
                # Pull up to CONSUME_MAX_COUNT pending messages in one round.
                need_crawl_messages = self.consume_messages(self.settings.get_int("CONSUME_MAX_COUNT", 100000))
                if need_crawl_messages and self.alive:
                    self.logger.debug("Got %s messages to build crawl task. " % len(need_crawl_messages))
                    # NOTE(review): this groupby returns a dict (has .items()),
                    # so it is a project helper, not itertools.groupby — confirm.
                    for site, messages in groupby(
                            need_crawl_messages, lambda message: message["source_site_code"]).items():
                        if not site:
                            continue
                        if self.can_do(site):
                            # Send to jay-monitor in chunks of TERM_COUNT
                            # (default 5000) messages per batch.
                            while messages and self.alive:
                                count = 0
                                sub_messages = []
                                while messages and count < self.settings.get_int("TERM_COUNT", 5000):
                                    sub_messages.append(messages.pop())
                                    count += 1
                                self.send(site, sub_messages)
                                time.sleep(1)
                            # messages is non-empty here iff self.alive became
                            # False mid-send; requeue the remainder so no
                            # message is lost (same below).
                            if messages:
                                #print("deep 2 len: %s" % len(messages))
                                self.task_scheduler.push(*map(
                                    lambda message: pickle.dumps(message), messages), delay=False)
                        else:
                            #print("deep 1 len: %s" % len(need_crawl_messages))
                            # Site not eligible right now: requeue the whole
                            # round's messages for a later attempt.
                            if need_crawl_messages:
                                self.task_scheduler.push(*map(
                                    lambda message: pickle.dumps(message), need_crawl_messages), delay=False)
                else:
                    self.logger.debug("Haven't got expired message. ")
                print("Heath or not: %s" % self.task_scheduler.is_health(), flush=True)