def on_message(self, headers, body):
    """Receive one message from the remote queue and buffer it locally.

    Keeps retrying (in 1-second waits) until the payload fits into the
    local buffer queue; only after the enqueue attempt finishes is the
    ACK sent, so the broker never considers a message consumed before we
    actually stored it.

    :param headers: broker message headers; must contain "message_id"
    :param body: raw message payload, forwarded unchanged to the buffer
    """
    # If the owning fetcher thread has stopped, drop the message without
    # ACK so the broker can redeliver it later.
    if self.fetcher_thread_ctx.status != self.fetcher_thread_ctx.EngineStatus.STATUS_RUNNING:
        return

    # Resolve the local buffer queue from the application context.
    redstone_app = self.fetcher_thread_ctx.app_context
    buffer_queue = redstone_app.BufferQueues.TASK_BUFFER_QUEUE

    # Read the id once, up front: the original re-read headers["message_id"]
    # inside `finally`, which could raise a second KeyError during cleanup.
    message_id = headers["message_id"]
    try:
        logger.info("Receive message, id: {}, body: {}".format(
            message_id, body))

        # Put the message into the local buffer queue, retrying while full.
        # TODO: consider wrapping the message in a dataclass for convenience.
        while True:
            try:
                buffer_queue.put_nowait(body)
                break
            except queue.Full:
                self.ev.wait(1)
    finally:
        # Always send the ACK so the broker hands us the next message.
        self.fetcher_thread_ctx.remote_task_queue.make_ack(message_id)
def _worker(self):
    """Watch the spiders directory and (re)load spider files that changed.

    Wakes every SPIDER_HOT_LOAD_IDLE seconds (or earlier, when the event
    is set), walks the directory, and asks _load_and_update_cache() to
    re-hash and reload each candidate file.
    """
    logger.info("{} start!".format(self.name))

    while self.status == self.EngineStatus.STATUS_RUNNING:
        # Idle between scans; setting self.ev wakes this loop early.
        self.ev.wait(settings.SPIDER_HOT_LOAD_IDLE)
        logger.debug("self._cache: {}".format(self._cache))

        for entry in os.listdir(self.spider_dir):
            # Only plain .py files count; dunder-prefixed files are skipped.
            is_candidate = entry.endswith(".py") and not entry.startswith("__")
            if not is_candidate:
                continue

            # Load the file and refresh its cache entry.
            outcome = self._load_and_update_cache(entry)
            if not outcome["success"]:
                logger.error(outcome["message"])

        # Debugging aid — to exercise the reload machinery, uncomment:
        #   inst = self._cache["rss"][1].get_class()
        #   inst = inst()
        #   inst.run()

    logger.info("{} end!".format(self.name))
def run(self):
    """Real entry point of the redstone application."""
    logger.info("redstone application start!")

    # TODO: initialize the keeper.

    # Wire up the RefreshEngine together with its local buffer queue.
    refresh_buffer = queue.Queue()
    self.BufferedQueues.REFRESH_TASK_BUFFER_QUEUE = refresh_buffer
    self.AppEngines.REFRESH_ENGINE = RefreshEngine(app_ctx=self)
def _worker(self):
    """Connect to the remote task queue, subscribe, and keep the thread alive.

    Actual message handling happens in FetcherListener.on_message(); this
    loop only exists so the engine can be shut down via its status flag.
    """
    logger.info("{} start!".format(self.name))

    # Subscribe this thread to the remote queue.
    # NOTE(review): the original built
    #     headers = {'activemq.prefetchSize': 1}
    # but never passed it anywhere, so the prefetch limit was silently
    # ineffective. The dead assignment is removed here; if a prefetch of 1
    # is actually required, it must be forwarded to subscribe() — confirm
    # against the remote_task_queue / STOMP client API.
    self.remote_task_queue.set_listener("", FetcherListener(self))
    self.remote_task_queue.connect()
    self.remote_task_queue.subscribe()

    # Keep-alive loop: wait in 1-second slices until told to stop.
    while self.status == self.EngineStatus.STATUS_RUNNING:
        self.ev.wait(1)

    logger.info("{} end!".format(self.name))
def _worker(self):
    """Periodically scan the feeds table and queue refresh tasks for stale feeds.

    Every 5 seconds, each non-deleted feed whose fetch_time + interval
    (minutes) has passed is serialized into a JSON task and handed to
    push_task(). The feed's fetch_time is advanced whether or not a task
    was pushed, so a broken feed is not retried on every scan.
    """
    logger.info("RefreshEngine start!")
    while self._status == EngineStatus.RUNNING:
        # Wait a little before each pass over the database.
        self._ev.wait(5)
        current_time = datetime.datetime.now()
        rows = RedstoneFeedsModel.objects.filter(is_deleted=0).all()
        for _feed in rows:
            # A feed is stale once its refresh interval has elapsed.
            if _feed.fetch_time + datetime.timedelta(minutes=_feed.interval) <= current_time:
                logger.debug("Detected out-date rss. (ID:%s, Name:%s)", _feed.id, _feed.name)
                try:
                    # Look up the spider this feed is configured to use.
                    sp = RedstoneSpiderModel.objects.filter(is_deleted=0, pk=_feed.spider_type).first()
                    if not sp:
                        logger.error("Can't get (name: {}, id: {}) spider info!".format(_feed.name, _feed.id))
                        # TODO: consider marking the feed as invalid.
                        continue
                    # Package the refresh task in the agreed wire format.
                    task = {
                        "feed_url": _feed.url,
                        "feed_id": _feed.id,
                        "feed_name": _feed.name,
                        "feed_config": {
                            "use_proxy": _feed.use_proxy
                        },
                        "spider_name": sp.name
                    }
                    task = json.dumps(task)
                    self.push_task(task)
                finally:
                    # Always update fetch_time — this runs even on the
                    # `continue` above, by design.
                    _feed.fetch_time = current_time
                    _feed.save()
    logger.info("RefreshEngine end!")
def _worker(self):
    """Drain the local result buffer and forward each result to the remote queue.

    Polls the local buffer queue without blocking; when it is empty the
    loop naps in 1-second slices so the engine's stop flag is noticed
    promptly.
    """
    logger.info("{} start!".format(self.name))

    app_ctx = self.app_context
    local_results: queue.Queue = app_ctx.BufferQueues.RESULT_BUFFER_QUEUE

    while self.status == self.EngineStatus.STATUS_RUNNING:
        try:
            pending = local_results.get_nowait()
        except queue.Empty:
            # Nothing buffered yet — nap briefly, then re-check the flag.
            self.ev.wait(1)
        else:
            self.remote_result_queue.put(pending)

    logger.info("{} end!".format(self.name))
def start(self) -> bool:
    """Start method responsible for bringing up the whole spider module.

    :return: True on successful start, False on failure
    :rtype: bool
    """
    logger.info("Starting RedstoneSpiderApplication!")

    # Local buffer queues: the task queue is bounded by the spider pool
    # size; the result queue is unbounded.
    self.BufferQueues.TASK_BUFFER_QUEUE = queue.Queue(maxsize=settings.SPIDER_POOL_SIZE)
    self.BufferQueues.RESULT_BUFFER_QUEUE = queue.Queue()
    logger.info("Initialize local buffer queue success!")

    # Spider loader: watches the spider directory and hot-reloads spiders.
    spider_loader = SpiderLoader()
    self.AppEngines.SPIDER_LOADER = spider_loader
    spider_loader.start()
    logger.info("Initialize spider_loader success!")

    # Spider thread pool initialization: not implemented yet.
    return True
def run(self): logger.info("RSS Spider running, target: {}".format(self._url)) # 获取RSS页面的内容 resp_result = self._get_page_content() if not resp_result["success"]: error_message = "Can't fetch target URL's content, error msg: {}".format( resp_result["message"]) logger.error(error_message) self.spider_result.success = False self.spider_result.error_message = error_message return False # 解析RSS raw_rss_content = resp_result["page_content"] parsed_rss = feedparser.parse(raw_rss_content) # 提取item信息,最后封装成一个dict """ item = { "title": "", "link": "", "summary": "" "content": "" if not empty else title+url, "publish_time": "", } """ items = parsed_rss["entries"] # rss_info = parsed_rss["feed"] for item in items: title = item["title"] link = item["link"] summary = item["summary"] if item["summary"] else "该文章暂无简介" content = item["content"] if not content: content = "{title}<br><a href=\"{link}\">{title}</a>".format( title=title, link=link) # 匹配时间字符串 raw_published_time = item["published"] fmt1 = "%a, %d %b %Y %H:%M:%S %z" fmt2 = "%a, %d %b %Y %H:%M:%S %Z" try: st = time.strptime(raw_published_time, fmt1) except ValueError: try: st = time.strptime(raw_published_time, fmt2) except ValueError: # 没救了,转不出来 logger.warning( "Can't convert rss time to struct_time: '{}', use current time instead." .format(raw_published_time)) st = time.localtime() # 把struct_time转成timestamp,处理时区问题 published_time = time.mktime(st) published_time = \ published_time + 8 * 3600 if not st.tm_gmtoff else published_time + 8 * 3600 + abs(st.tm_gmtoff) # 拼装result result = { "title": title, "link": link, "summary": summary, "content": content, "publish_time": published_time } self._push_result(result) self.spider_result.results = True logger.info("Rss spider done.") return True
def reload_now(self):
    """Trigger an immediate spider reload."""
    logger.info("Receive reload signal!")
    # Setting the event interrupts the watcher thread's idle wait at once.
    self.ev.set()