Example #1
    def _worker(self):

        logger.info(f"{self.name} start!")

        while self.is_running():
            try:
                task = self.save_queue.get_nowait()
            except queue.Empty:
                self.thread_event.wait(1)
                continue

            feed_id = task.get("feed_task").get("id")
            title: str = task.get("title")
            content: str = task.get("content")
            link = task.get("link")
            unique_hash = "{}|{}".format(
                hashlib.md5(title.encode()).hexdigest(),
                hashlib.md5(content.encode()).hexdigest()
            )
            tags = task.get("tags")
            publish_time = task.get("publish_time")

            # Use unique_hash to decide whether this paper already exists
            # TODO: this does not yet handle the case where an article was updated
            if PaperModel.instance.get_paper_by_unique_hash(unique_hash):
                continue

            paper = PaperModel(
                feed_id=feed_id, title=title, content=content, link=link,
                unique_hash=unique_hash, tags=tags, pushed_status=0,
                publish_time=publish_time,
            )
            paper.save()
            logger.debug(f"{self.name} paper saved, title: {title}, unique_hash: {unique_hash}")

        logger.info(f"{self.name} stop!")
Example #2
    def _worker(self):

        logger.info(f"{self.name} start!")
        task_queue = g.queue_context.feed_task_queue

        while self.is_running():

            # Find all feed sources in the db that are due for a refresh
            row: FeedModel
            for row in FeedModel.instance.all():
                logger.debug(
                    f"name: {row.name}, last_refresh_time: {row.last_refresh_time}"
                )
                if row.last_refresh_time + datetime.timedelta(
                        minutes=row.interval) > datetime.datetime.now():
                    continue

                # add to queue
                task = {
                    "id": row.id,
                    "name": row.name,
                    "feed_url": row.feed_url,
                }
                self.put_to_queue(task_queue, task)
                logger.debug(f"put task {task} to queue.")

                # update last_refresh_time
                row.last_refresh_time = datetime.datetime.now()
                row.status = FeedStatus.UPDATING
                row.save()

            # wait for next wakeup
            self.thread_event.wait(self.refresh_interval)

        logger.info(f"{self.name} stop!")
Example #3
    async def _worker(self, idx):
        self._init_event()
        cur_name = f"{self.name}-{idx}"
        logger.info(f"{cur_name} start.")

        while self.is_running():
            try:
                task = self.feed_task_queue.get_nowait()
                logger.debug(f"get task: {task}")

                # Get the feed_url from the task
                feed_url = task.get("feed_url")
                if not feed_url:
                    logger.warning(f"{cur_name} empty feed_url!")
                    await self._wait(1)
                    continue

                # Issue the HTTP request
                async with aiohttp.request(
                        "GET", feed_url, timeout=self.client_timeout) as resp:
                    if resp.status != 200:
                        logger.warning(
                            f"{cur_name} Error while making request to {feed_url}, status code: {resp.status}"
                        )
                        # TODO: update the feed status to dead
                        await self._wait(1)
                        continue

                    # Status code is 200: read the body and hand it to the parser
                    content = await resp.text()
                    parser_task = {
                        "feed_task": task,
                        "content": content,
                    }
                    while self.is_running():
                        try:
                            self.parser_task_queue.put_nowait(parser_task)
                        except queue.Full:
                            logger.warning(
                                f"{cur_name} parser task queue full, retry..")
                            await self._wait(1)
                        else:
                            break

                    logger.debug(f"{cur_name} content size: {len(content)}")

            except queue.Empty:
                await self._wait(1)
                continue

        logger.info(f"{cur_name} stop.")
Example #4
    def start(self):
        signal.signal(signal.SIGINT, self.__sigint)
        logger.info("exit process register done.")

        self.init_queues()
        logger.info("Init queues done.")

        self.init_engines()
        logger.info("Init engines done.")
Example #5
    def __sigint(self, sig, frame):
        logger.info("Received exit signal.")
        self.engines.stop_engines()
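
Examples #4 and #5 together form a register-and-handle pair: start() installs __sigint for SIGINT, and the handler stops the engines. A minimal sketch of the same pattern, assuming a plain module-level flag in place of the engines object:

import signal
import time

running = True

def handle_sigint(sig, frame):
    # Counterpart of __sigint: record the shutdown request instead of stopping engines.
    global running
    print("Received exit signal.")
    running = False

signal.signal(signal.SIGINT, handle_sigint)
print("Exit signal handler registered.")
while running:
    time.sleep(0.1)  # main loop; Ctrl-C flips the flag and the loop exits
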