def __connect_remote_queue(self):
    """
    Connect to the remote ActiveMQ queue.

    :return: True - connected successfully, False - connection failed
    :rtype: bool
    """
    queue_info = self.queue_info
    check_key = ("host", "port", "username", "password", "queue_name")
    for k in check_key:
        if not queue_info[k]:
            logger.error("Connect information error! {} can't be empty!".format(k))
            return False
    final_queue_name = "/queue/{}".format(queue_info["queue_name"])
    host_tuple = (queue_info["host"], int(queue_info["port"]))

    # Generate the consumer_id for the current machine
    consumer_id = make_consumer_id()

    self.remote_result_queue = ActiveMQQueue(
        host_tuple,
        queue_info["username"],
        queue_info["password"],
        final_queue_name,
        consumer_id
    )
    return True
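# make_consumer_id() is referenced above but not defined in this section. Based on the
# inline version in the second __connect_remote_queue below (current machine's IP + "|" +
# second-precision timestamp), a minimal sketch might look like this; the socket-based IP
# lookup is an assumption standing in for the project's IPAddress.current_ip() helper.
import socket
import time


def make_consumer_id() -> str:
    """Build a consumer id of the form '<local ip>|<unix timestamp>'."""
    # Connecting a UDP socket sends no packets; it only resolves the local address.
    s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    try:
        s.connect(("8.8.8.8", 80))
        ip = s.getsockname()[0]
    finally:
        s.close()
    return "{ip}|{timestamp}".format(ip=ip, timestamp=int(time.time()))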
def __connect_remote_queue(self):
    """
    Connect to the remote ActiveMQ queue.

    :return: True - connected successfully, False - connection failed
    :rtype: bool
    """
    queue_info = self.queue_info
    check_key = ("host", "port", "username", "password", "queue_name")
    for k in check_key:
        if not queue_info[k]:
            logger.error(
                "Connect information error! {} can't be empty!".format(k))
            return False
    final_queue_name = "/queue/{}".format(queue_info["queue_name"])
    host_tuple = (queue_info["host"], int(queue_info["port"]))

    # Generate the consumer id for the current machine:
    # current machine's IP + "|" + current timestamp (second precision)
    consumer_id = "{ip}|{timestamp}".format(ip=IPAddress.current_ip(),
                                            timestamp=int(time.time()))

    self.remote_task_queue = ActiveMQQueue(host_tuple,
                                           queue_info["username"],
                                           queue_info["password"],
                                           final_queue_name,
                                           consumer_id)
    return True
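# ActiveMQQueue is a project wrapper that is not shown in this section. For reference,
# a minimal sketch of the same connect-and-send flow using the third-party stomp.py
# client could look like the class below; the class name and its behavior are
# assumptions, not the project's actual implementation.
import stomp


class SimpleActiveMQQueue:
    def __init__(self, host_tuple, username, password, queue_name, consumer_id):
        self.queue_name = queue_name
        self.consumer_id = consumer_id
        self.conn = stomp.Connection(host_and_ports=[host_tuple])
        self.conn.connect(username, password, wait=True)

    def put(self, message: str):
        # STOMP destinations use the "/queue/<name>" form built above
        self.conn.send(self.queue_name, message)

    def close(self):
        self.conn.disconnect()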
def _worker(self):
    """
    Watch the spiders directory; when a file changes, recompute its md5 and reload it.
    """
    logger.info("{} start!".format(self.name))

    while self.status == self.EngineStatus.STATUS_RUNNING:
        self.ev.wait(settings.SPIDER_HOT_LOAD_IDLE)
        logger.debug("self._cache: {}".format(self._cache))
        for _filename in os.listdir(self.spider_dir):
            # Skip files starting with a double underscore and files not ending in .py
            if _filename.startswith("__") or not _filename.endswith(".py"):
                continue
            # Load the file and store it in the cache
            result = self._load_and_update_cache(_filename)
            if not result["success"]:
                logger.error(result["message"])

        # For testing: to debug the reload module, just uncomment the lines below
        # inst = self._cache["rss"][1].get_class()
        # inst = inst()
        # inst.run()

    logger.info("{} end!".format(self.name))
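# _load_and_update_cache() is not shown in this section. Based on the docstring above
# (recompute md5, reload on change) and the cache layout used elsewhere in this file
# (self._cache[pkg_name] == (md5, module)), a minimal sketch might look like this:
import hashlib
import importlib.util
import os


def _load_and_update_cache_sketch(cache: dict, spider_dir: str, filename: str) -> dict:
    pkg_name = filename[:-3]  # strip the ".py" suffix
    path = os.path.join(spider_dir, filename)
    with open(path, "rb") as f:
        digest = hashlib.md5(f.read()).hexdigest()

    # Unchanged file: keep the cached module
    if pkg_name in cache and cache[pkg_name][0] == digest:
        return {"success": True, "message": ""}

    try:
        spec = importlib.util.spec_from_file_location(pkg_name, path)
        module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(module)
    except Exception as e:
        return {"success": False,
                "message": "Can't load spider '{}': {}".format(filename, e)}

    cache[pkg_name] = (digest, module)
    return {"success": True, "message": ""}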
def start(self):
    """
    Connect to the remote queue first, then call super to start the engine.
    """
    if not self.__connect_remote_queue():
        logger.error("Can't connect to remote task queue!")
        return False
    super(SenderEngine, self).start()
    return True
def _worker(self):
    logger.info("RefreshEngine start!")

    while self._status == EngineStatus.RUNNING:
        # Wait a moment before each database read
        self._ev.wait(5)

        current_time = datetime.datetime.now()
        rows = RedstoneFeedsModel.objects.filter(is_deleted=0).all()
        for _feed in rows:
            if _feed.fetch_time + datetime.timedelta(minutes=_feed.interval) <= current_time:
                logger.debug("Detected out-of-date rss. (ID:%s, Name:%s)", _feed.id, _feed.name)
                try:
                    # Get the spider used by this feed
                    sp = RedstoneSpiderModel.objects.filter(is_deleted=0, pk=_feed.spider_type).first()
                    if not sp:
                        logger.error("Can't get (name: {}, id: {}) spider info!".format(_feed.name, _feed.id))
                        # TODO: consider marking the feed as invalid
                        continue

                    # Wrap the refresh task in the agreed format
                    task = {
                        "feed_url": _feed.url,
                        "feed_id": _feed.id,
                        "feed_name": _feed.name,
                        "feed_config": {
                            "use_proxy": _feed.use_proxy
                        },
                        "spider_name": sp.name
                    }
                    task = json.dumps(task)
                    self.push_task(task)
                finally:
                    # Make sure the fetch_time field is always updated
                    _feed.fetch_time = current_time
                    _feed.save()

    logger.info("RefreshEngine end!")
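# For reference, a task message as serialized above might look like the following;
# all field values here are illustrative, not taken from a real feed:
example_task = {
    "feed_url": "https://example.com/rss.xml",
    "feed_id": 42,
    "feed_name": "Example Blog",
    "feed_config": {
        "use_proxy": 0
    },
    "spider_name": "RssSpider"
}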
def load_class_by_name(self, spider_name) -> Optional[Type[SpiderBase]]:
    logger.debug("Try to load spider: {}".format(spider_name))

    # Convert the spider name to a filename and extract the pkg name
    # Spider name: ExampleSpider => Pkg name: example_spider => Filename: example_spider.py
    pkg_name = [ch if ch.islower() else " " + ch for ch in spider_name]
    pkg_name = "".join(pkg_name).strip().lower()
    pkg_name = pkg_name.replace(" ", "_")
    filename = pkg_name + ".py"

    # Not in the cache yet, load it
    if pkg_name not in self._cache:
        result = self._load_and_update_cache(filename)
        if not result["success"]:
            logger.error(result["message"])
            return None

    # Call the spider module's get_class() method directly to get the spider class
    try:
        return self._cache[pkg_name][1].get_class()
    except AttributeError:
        logger.error("Spider doesn't have 'get_class()' method!")
        return None
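# A quick standalone check of the CamelCase -> snake_case conversion above
# (helper name is illustrative, not part of the original module):
def spider_name_to_pkg_name(spider_name: str) -> str:
    pkg_name = [ch if ch.islower() else " " + ch for ch in spider_name]
    return "".join(pkg_name).strip().lower().replace(" ", "_")


assert spider_name_to_pkg_name("ExampleSpider") == "example_spider"
assert spider_name_to_pkg_name("RssSpider") == "rss_spider"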
def run(self):
    logger.info("RSS Spider running, target: {}".format(self._url))

    # Fetch the content of the RSS page
    resp_result = self._get_page_content()
    if not resp_result["success"]:
        error_message = "Can't fetch target URL's content, error msg: {}".format(
            resp_result["message"])
        logger.error(error_message)
        self.spider_result.success = False
        self.spider_result.error_message = error_message
        return False

    # Parse the RSS
    raw_rss_content = resp_result["page_content"]
    parsed_rss = feedparser.parse(raw_rss_content)

    # Extract the item information and finally wrap each one into a dict
    """
    item = {
        "title": "",
        "link": "",
        "summary": "",
        "content": "" if not empty else title+url,
        "publish_time": "",
    }
    """
    items = parsed_rss["entries"]
    # rss_info = parsed_rss["feed"]
    for item in items:
        title = item["title"]
        link = item["link"]
        summary = item["summary"] if item.get("summary") else "No summary available for this article."
        content = item.get("content")
        if not content:
            content = "{title}<br><a href=\"{link}\">{title}</a>".format(
                title=title, link=link)

        # Parse the time string
        raw_published_time = item.get("published", "")
        fmt1 = "%a, %d %b %Y %H:%M:%S %z"
        fmt2 = "%a, %d %b %Y %H:%M:%S %Z"
        try:
            st = time.strptime(raw_published_time, fmt1)
        except ValueError:
            try:
                st = time.strptime(raw_published_time, fmt2)
            except ValueError:
                # Hopeless, it can't be converted
                logger.warning(
                    "Can't convert rss time to struct_time: '{}', use current time instead."
                    .format(raw_published_time))
                st = time.localtime()

        # Convert the struct_time to a timestamp and deal with the timezone
        published_time = time.mktime(st)
        published_time = published_time + 8 * 3600 if not st.tm_gmtoff \
            else published_time + 8 * 3600 + abs(st.tm_gmtoff)

        # Assemble the result
        result = {
            "title": title,
            "link": link,
            "summary": summary,
            "content": content,
            "publish_time": published_time
        }
        self._push_result(result)

    self.spider_result.success = True
    logger.info("Rss spider done.")
    return True
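# A standalone check of the published-time parsing above. time.strptime only fills
# tm_gmtoff when the "%z" directive matches; with "%Z" (or the local-time fallback)
# it stays None, which is what the tm_gmtoff branch above keys off. The sample date
# strings are illustrative only.
import time

for raw in ("Tue, 01 Jan 2019 10:00:00 +0800",  # matches %z, tm_gmtoff == 28800
            "Tue, 01 Jan 2019 10:00:00 GMT"):   # matches %Z, tm_gmtoff is None
    try:
        st = time.strptime(raw, "%a, %d %b %Y %H:%M:%S %z")
    except ValueError:
        st = time.strptime(raw, "%a, %d %b %Y %H:%M:%S %Z")
    print(raw, "->", st.tm_gmtoff)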