def on_message(self, headers, body):
        """
        Move one message from the remote queue into the local task buffer queue.

        The message is acknowledged after it has been placed in the buffer
        queue, which lets the remote side hand over the next message. Nothing
        is stored or acknowledged when the fetcher thread is not running.

        :param headers: message headers; must contain the key "message_id"
        :param body: message payload, stored in the buffer queue unchanged
        :raises KeyError: if headers has no "message_id" entry
        """

        fetcher_ctx = self.fetcher_thread_ctx
        running_status = fetcher_ctx.EngineStatus.STATUS_RUNNING

        # Once the fetcher thread has finished, no message is processed anymore.
        if fetcher_ctx.status != running_status:
            return

        # Local buffer queue that receives the message body.
        buffer_queue = fetcher_ctx.app_context.BufferQueues.TASK_BUFFER_QUEUE

        # Looked up once, before the try block: without an id there is nothing
        # to acknowledge, so a missing key surfaces as a single KeyError.
        message_id = headers["message_id"]

        should_ack = True
        try:
            logger.info("Receive message, id: %s, body: %s", message_id, body)

            # Put the message straight into the local buffer queue.
            # TODO: consider wrapping the message in a dataclass to simplify writes.
            while True:
                try:
                    buffer_queue.put_nowait(body)
                    break
                except queue.Full:
                    if fetcher_ctx.status != running_status:
                        # The engine stopped while the queue was still full.
                        # Retrying would spin forever, so give up; the message
                        # was never buffered and is therefore not acknowledged,
                        # the same as in the not-running guard above.
                        should_ack = False
                        logger.warning(
                            "Fetcher stopped before message %s could be buffered.",
                            message_id)
                        break
                    self.ev.wait(1)
        finally:
            # The ACK is sent even if buffering raised an unexpected error.
            if should_ack:
                fetcher_ctx.remote_task_queue.make_ack(message_id)
Beispiel #2
0
    def _worker(self):
        """
        Watch the spiders directory and (re)load spider files into the cache.

        Each round waits on self.ev with a timeout of
        settings.SPIDER_HOT_LOAD_IDLE, then hands every eligible ``.py`` file in
        self.spider_dir to self._load_and_update_cache. The loop ends once
        self.status is no longer STATUS_RUNNING.
        """

        logger.info("{} start!".format(self.name))

        while self.status == self.EngineStatus.STATUS_RUNNING:

            self.ev.wait(settings.SPIDER_HOT_LOAD_IDLE)
            # Re-arm the event after every wake-up. If something sets it
            # (presumably reload_now -- confirm it lives on this class) and it
            # is never cleared, every later wait() returns immediately and this
            # loop rescans the directory without pausing.
            self.ev.clear()
            logger.debug("self._cache: %s", self._cache)

            for _filename in os.listdir(self.spider_dir):
                # Skip files starting with a double underscore and non-.py files.
                if _filename.startswith("__") or not _filename.endswith(".py"):
                    continue
                # Load the file and store it in the cache.
                result = self._load_and_update_cache(_filename)
                if not result["success"]:
                    logger.error(result["message"])

            # for test: to debug module reloading, uncomment the lines below
            # inst = self._cache["rss"][1].get_class()
            # inst = inst()
            # inst.run()

        logger.info("{} end!".format(self.name))
Beispiel #3
0
    def run(self):
        """
        Real entry point of the program.

        Creates the local buffer queue used by the RefreshEngine, then the
        RefreshEngine itself, and stores both on this application object.
        """
        logger.info("redstone application start!")

        # TODO: initialize the keeper

        # The buffer queue is registered before the engine is constructed,
        # because the engine receives this application as its context.
        refresh_buffer = queue.Queue()
        self.BufferedQueues.REFRESH_TASK_BUFFER_QUEUE = refresh_buffer
        self.AppEngines.REFRESH_ENGINE = RefreshEngine(app_ctx=self)
Beispiel #4
0
    def _worker(self):
        """
        Connect to the remote task queue and keep this thread alive.

        A FetcherListener bound to this thread is registered on the remote
        queue, then the queue is connected and subscribed. Afterwards the
        method idles in one-tick waits on self.ev until self.status is no
        longer STATUS_RUNNING.
        """

        logger.info("{} start!".format(self.name))

        # Subscribe this thread to its queue.
        # Adding headers={'activemq.prefetchSize': 1} ensures the prefetch count.
        task_queue = self.remote_task_queue
        task_queue.set_listener("", FetcherListener(self))
        task_queue.connect()
        task_queue.subscribe()

        # Keep the fetcher thread alive.
        running_status = self.EngineStatus.STATUS_RUNNING
        while True:
            if self.status != running_status:
                break
            self.ev.wait(1)

        logger.info("{} end!".format(self.name))
Beispiel #5
0
    def _worker(self):
        """
        Poll the feed table and push a refresh task for every feed that is due.

        A feed is due when fetch_time plus its interval (in minutes) is not
        later than the current time. For a due feed the spider record is looked
        up, a JSON task is pushed through self.push_task, and the feed's
        fetch_time is updated whether or not the task could be built. The loop
        ends once self._status is no longer EngineStatus.RUNNING.
        """
        logger.info("RefreshEngine start!")

        while self._status == EngineStatus.RUNNING:

            # Pause before every database read.
            self._ev.wait(5)

            now = datetime.datetime.now()
            for feed in RedstoneFeedsModel.objects.filter(is_deleted=0).all():
                next_due = feed.fetch_time + datetime.timedelta(minutes=feed.interval)
                if next_due > now:
                    # Not due yet.
                    continue

                logger.debug("Detected out-date rss. (ID:%s, Name:%s)", feed.id, feed.name)

                try:
                    # Look up the spider this feed uses.
                    spider = RedstoneSpiderModel.objects.filter(
                        is_deleted=0, pk=feed.spider_type).first()
                    if spider:
                        # Wrap the refresh job in the agreed task format.
                        payload = {
                            "feed_url": feed.url,
                            "feed_id": feed.id,
                            "feed_name": feed.name,
                            "feed_config": {
                                "use_proxy": feed.use_proxy
                            },
                            "spider_name": spider.name
                        }
                        self.push_task(json.dumps(payload))
                    else:
                        logger.error("Can't get (name: {}, id: {}) spider info!".format(feed.name, feed.id))
                        # TODO: consider marking the feed as invalid
                finally:
                    # fetch_time is always updated, even when no task was pushed.
                    feed.fetch_time = now
                    feed.save()

        logger.info("RefreshEngine end!")
Beispiel #6
0
    def _worker(self):
        """
        Forward results from the local result buffer queue to the remote queue.

        While self.status is STATUS_RUNNING, one result at a time is taken from
        the buffer without blocking and passed to self.remote_result_queue.put.
        When the buffer is empty the thread waits one tick on self.ev and then
        retries.
        """

        logger.info("{} start!".format(self.name))

        pending_results: queue.Queue = self.app_context.BufferQueues.RESULT_BUFFER_QUEUE

        while self.status == self.EngineStatus.STATUS_RUNNING:
            try:
                item = pending_results.get_nowait()
            except queue.Empty:
                # Nothing to send yet; idle briefly before polling again.
                self.ev.wait(1)
            else:
                self.remote_result_queue.put(item)

        logger.info("{} end!".format(self.name))
    def start(self) -> bool:
        """
        Start the spider module.

        Creates the local task and result buffer queues, then creates and
        starts the spider loader.

        :return: True - started successfully, False - failed to start
        :rtype: bool
        """

        logger.info("Starting RedstoneSpiderApplication!")

        # Local buffer queues: the task queue is bounded by the spider pool
        # size, the result queue is unbounded.
        buffers = self.BufferQueues
        buffers.TASK_BUFFER_QUEUE = queue.Queue(maxsize=settings.SPIDER_POOL_SIZE)
        buffers.RESULT_BUFFER_QUEUE = queue.Queue()
        logger.info("Initialize local buffer queue success!")

        # Spider loader.
        loader = SpiderLoader()
        self.AppEngines.SPIDER_LOADER = loader
        loader.start()
        logger.info("Initialize spider_loader success!")

        # Initialize the spider thread pool (nothing is done here yet).

        return True
Beispiel #8
0
    def run(self):
        """
        Fetch the RSS page, parse it and push one result dict per entry.

        Each pushed result has the keys "title", "link", "summary", "content"
        and "publish_time". Entry fields that the feed omits are replaced by
        defaults instead of aborting the run.

        :return: True when the feed was fetched and its entries were pushed,
                 False when the page content could not be fetched (the error
                 message is then stored on self.spider_result)
        :rtype: bool
        """
        logger.info("RSS Spider running, target: {}".format(self._url))

        # Fetch the content of the RSS page.
        resp_result = self._get_page_content()
        if not resp_result["success"]:
            error_message = "Can't fetch target URL's content, error msg: {}".format(
                resp_result["message"])
            logger.error(error_message)
            self.spider_result.success = False
            self.spider_result.error_message = error_message
            return False

        # Parse the RSS document.
        parsed_rss = feedparser.parse(resp_result["page_content"])

        # Accepted date layouts: numeric UTC offset (%z) first, then zone name (%Z).
        time_formats = (
            "%a, %d %b %Y %H:%M:%S %z",
            "%a, %d %b %Y %H:%M:%S %Z",
        )

        # Every entry is turned into a dict of this shape:
        #   {
        #       "title": "",
        #       "link": "",
        #       "summary": "",
        #       "content": "" if not empty else title + url,
        #       "publish_time": "",
        #   }
        for item in parsed_rss["entries"]:
            # Fields are read with .get() because a feed may omit any of them;
            # plain indexing would abort the whole run with a KeyError.
            title = item.get("title", "")
            link = item.get("link", "")
            summary = item.get("summary") or "该文章暂无简介"
            # NOTE(review): the value is passed through unchanged; feedparser
            # documents entry content as a list of dicts -- confirm that the
            # consumer of these results expects that shape.
            content = item.get("content")
            if not content:
                content = "{title}<br><a href=\"{link}\">{title}</a>".format(
                    title=title, link=link)

            # Parse the publication time string.
            raw_published_time = item.get("published")
            for time_format in time_formats:
                try:
                    st = time.strptime(raw_published_time, time_format)
                    break
                except (TypeError, ValueError):
                    # TypeError: the entry has no "published" value at all.
                    # ValueError: the string does not match this layout.
                    continue
            else:
                # No layout matched; fall back to the current local time.
                logger.warning(
                    "Can't convert rss time to struct_time: '{}', use current time instead."
                    .format(raw_published_time))
                st = time.localtime()

            # Convert the struct_time to a timestamp and adjust for time zones.
            # NOTE(review): this arithmetic is kept exactly as it was. mktime()
            # reads st as local time, a fixed 8 hours is added, and a non-zero
            # UTC offset is added by absolute value regardless of its sign.
            # That does not look like a correct conversion -- confirm the
            # intended result before relying on publish_time.
            published_time = time.mktime(st) + 8 * 3600
            if st.tm_gmtoff:
                published_time += abs(st.tm_gmtoff)

            # Assemble the result.
            result = {
                "title": title,
                "link": link,
                "summary": summary,
                "content": content,
                "publish_time": published_time
            }
            self._push_result(result)

        # NOTE(review): the failure path above sets spider_result.success, but
        # this line assigns True to spider_result.results -- confirm whether
        # `success` was meant.
        self.spider_result.results = True
        logger.info("Rss spider done.")
        return True
Beispiel #9
0
 def reload_now(self):
     """
     Request an immediate spider reload.

     Logs the request and sets self.ev, which wakes up any wait on that event.
     """
     reload_event = self.ev
     logger.info("Receive reload signal!")
     reload_event.set()