def get_task(self):
    """
    Periodically fetch pending tasks from the database and send them to the download queue.
    :return:
    """
    task_cell = self.crawler_setting.get("task_cell") or 10
    mq_queue = get_queue(self.crawler_setting, 'download')
    mq_conn = connect(mq_queue, self.mq_params[0], self.mq_params[1], self.mq_params[2], self.mq_params[3])
    while True:
        if RedisUtil.get_lock():
            tasks = SqlUtil.get_task()
            if tasks:
                for task in tasks:
                    task_id = task.get("task_id")
                    RedisUtil.monitor_task(task_id)
                    task["main_task_flag"] = 1
                    message = repr(task)
                    # Check whether the queue has exceeded its size limit; if so, hold off dispatching.
                    is_send(self.mq_params, self.crawler_setting, mq_queue)
                    send_data(mq_conn, '', message, mq_queue)
                    SqlUtil.update_task(1, "'{}'".format(task_id),
                                        "'{}'".format(task.get("exec_time")),
                                        "'{}'".format(task.get("pre_exec_time")))
                Logger.logger.info("Tasks dispatched, sleeping for {}s".format(task_cell))
            else:
                Logger.logger.info("No tasks available to fetch, sleeping for {}s".format(task_cell))
            RedisUtil.release_lock()
        else:
            Logger.logger.info("Failed to acquire the lock, sleeping for {}s".format(task_cell))
        time.sleep(task_cell)
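# The dispatcher above serializes each task dict with repr() before publishing it, and the
# consumer callbacks below rebuild it from the message body. A minimal sketch of that
# round-trip, assuming the payload contains only Python literals (strings, numbers, lists,
# dicts); ast.literal_eval is used here as a safer stand-in for the eval() in the consumers
# and is an assumption of this sketch, not part of the original code.
import ast

task = {"task_id": "1001", "task_url": "http://example.com", "main_task_flag": 1}
payload = repr(task).encode()                   # what send_data() would publish
restored = ast.literal_eval(payload.decode())   # what a consumer callback would rebuild
assert restored == task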
def call_back(ch, method, properties, body):
    """
    RabbitMQ consumer callback: if the message carries next_pages, new tasks are generated
    from it; otherwise the message is treated as a task to recycle.
    :param ch:
    :param method:
    :param properties:
    :param body:
    :return:
    """
    ch.basic_ack(delivery_tag=method.delivery_tag)
    message: dict = eval(body.decode())
    if message.get("next_pages"):
        next_pages = copy.deepcopy(message.get("next_pages"))
        del message["next_pages"]
        for result in next_pages:
            url = result.get("url")
            header = result.get("header")
            message["task_url"] = url
            message["main_task_flag"] = 0
            message["is_detail"] = result.get("is_detail")
            if header:
                message["header"] = header
            Logger.logger.info("New task: {}".format(message))
            mq_queue = get_queue(Dispatch.crawler_setting, 'download')
            mq_params = get_login_info(Dispatch.crawler_setting)
            is_send(mq_params, Dispatch.crawler_setting, mq_queue)
            send_data(ch, '', repr(message), mq_queue)
    else:
        send_data(ch, '', repr(message), 'download')
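# The dispatch callback above expands "next_pages" into one new download task per entry.
# A sketch of the shape it expects, inferred from the keys it reads (url, header, is_detail);
# the concrete values are made up for illustration.
next_pages = [
    {"url": "http://example.com/detail/1", "is_detail": 1, "header": {"User-Agent": "crawler4py"}},
    {"url": "http://example.com/list?page=2", "is_detail": 0, "header": None},
]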
def call_back(ch, method, properties, body):
    ch.basic_ack(delivery_tag=method.delivery_tag)
    message: dict = eval(body.decode())
    path = get_plugin_path(Extractor.crawler_setting, 'extract')
    result = process(message, path)
    mq_queue = get_queue(Extractor.crawler_setting, "storage_dup")
    send_data(ch, '', repr(result), mq_queue)
    Logger.logger.info("Task sent to dedup and storage")
def call_back(ch, method, properties, body):
    ch.basic_ack(delivery_tag=method.delivery_tag)
    message: dict = eval(body.decode())
    path = get_plugin_path(Downloader.crawler_setting, 'download')
    result = process(message, path)
    if result.get("recovery_flag"):
        if result.get("recovery_flag") < 3:
            mq_queue = get_queue(Downloader.crawler_setting, "recovery")
            send_data(ch, '', repr(result), mq_queue)
            Logger.logger.info("Recycled --{}-- successfully".format(result.get("task_url")))
        else:
            # Whenever the download fails, remove the url from the temporary task set in redis.
            RedisUtil.del_exist(message.get("task_id"),
                                hashlib.md5(message.get("task_url").encode("utf-8")).hexdigest())
            # For a recycled main task, update the task status and clear the temporary task set created in redis.
            if message.get("main_task_flag"):
                while True:
                    if RedisUtil.get_lock():
                        pre_exec_time = message.get("exec_time")
                        exec_time = message.get("exec_time") + datetime.timedelta(seconds=message.get("task_cell"))
                        SqlUtil.update_task(0, "'{}'".format(message.get("task_id")),
                                            "'{}'".format(str(exec_time)),
                                            "'{}'".format(str(pre_exec_time)))
                        RedisUtil.release_lock()
                        RedisUtil.release_monitor(message.get("task_id"))
                        break
                    time.sleep(0.3)
            # If no entry in the redis temporary task set has a score of 10, close the task
            # (note: a score of 10 marks a detail page, a score of 100 marks a list page).
            if not RedisUtil.monitor_score(message.get("task_id")):
                RedisUtil.release_monitor(message.get("task_id"))
                while True:
                    if RedisUtil.get_lock():
                        pre_exec_time = message.get("exec_time")
                        exec_time = datetime.datetime.now() + datetime.timedelta(seconds=message.get("task_cell"))
                        SqlUtil.update_task(0, "'{}'".format(message.get("task_id")),
                                            "'{}'".format(str(exec_time)),
                                            "'{}'".format(str(pre_exec_time)))
                        RedisUtil.release_lock()
                        break
                    time.sleep(0.3)
            Logger.logger.info("{} -- exceeded the recovery retry limit, not recycling".format(result.get("task_url")))
    else:
        mq_queue = get_queue(Downloader.crawler_setting, "extract")
        send_data(ch, '', repr(result), mq_queue)
        Logger.logger.info(result)
        Logger.logger.info("Task sent to the extraction center")
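# The recovery branch above removes the failed url from the redis temporary task set using an
# md5 digest of task_url as the member key. A minimal sketch of that key derivation; the
# sample url is hypothetical.
import hashlib

task_url = "http://example.com/list?page=1"
member = hashlib.md5(task_url.encode("utf-8")).hexdigest()
# RedisUtil.del_exist(task_id, member) would then drop this member from the task's
# temporary set, whose entries are scored 10 (detail page) or 100 (list page).
print(member)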
def back_task(self):
    """
    Recycle tasks.
    :return:
    """
    mq_queue = get_queue(self.crawler_setting, "recovery")
    mq_conn_recovery = connect(mq_queue, self.mq_params[0], self.mq_params[1],
                               self.mq_params[2], self.mq_params[3])
    self.call_back(**{
        "no_ack": None,
        "channel": mq_conn_recovery,
        "routing_key": mq_queue
    })
def generate_task(self):
    """
    Generate tasks.
    :return:
    """
    mq_queue = get_queue(self.crawler_setting, "dispatch")
    mq_conn_download = connect(mq_queue, self.mq_params[0], self.mq_params[1],
                               self.mq_params[2], self.mq_params[3])
    self.call_back(**{
        "no_ack": None,
        "channel": mq_conn_download,
        "routing_key": mq_queue
    })
def process(self):
    crawler_mode = self.crawler_setting.get("crawler_mode")
    if not crawler_mode:
        self.simple()
    else:
        try:
            user = self.crawler_setting.get("mq").get("user")
            pwd = self.crawler_setting.get("mq").get("pwd")
            host = self.crawler_setting.get("mq").get("host")
            port = self.crawler_setting.get("mq").get("port")
            mq_queue = get_queue(self.crawler_setting, "extract")
        except AttributeError:
            user = "******"
            pwd = "crawler4py"
            host = "127.0.0.1"
            port = 5672
            mq_queue = "extract"
        mq_conn = connect(mq_queue, user, pwd, host, port)
        self.call_back(**{"no_ack": None, "channel": mq_conn, "routing_key": mq_queue})
def call_back(ch, method, properties, body):
    ch.basic_ack(delivery_tag=method.delivery_tag)
    message: dict = eval(body.decode())
    Logger.logger.info(message)
    path = get_plugin_path(BaseStorageDup.crawler_setting, 'storage_dup')
    del message["view_source"]
    if not message.get("next_pages"):
        process(message, path)
    else:
        # For a non-detail page, first check that the temporary task set still exists;
        # only then process the message.
        if RedisUtil.monitor_is_exist(message.get("task_id")) and RedisUtil.monitor_ttl(message.get("task_id")) > 10:
            result = process(message, path)
            if len(message.get("next_pages")):
                mq_queue = get_queue(BaseStorageDup.crawler_setting, 'dispatch')
                send_data(ch, '', repr(result), mq_queue)
                Logger.logger.info("Data sent to dispatch to build tasks")
            else:
                Logger.logger.info("All data was filtered out by dedup, nothing to add")
        else:
            Logger.logger.info("The monitor set has expired or its monitoring window has passed; no more tasks will be sent")
    # After each message, check the state of the temporary task to decide whether to close it.
    if not RedisUtil.monitor_score(message.get("task_id")):
        RedisUtil.release_monitor(message.get("task_id"))
        while True:
            if RedisUtil.get_lock():
                pre_exec_time = message.get("exec_time")
                exec_time = datetime.datetime.now() + datetime.timedelta(seconds=message.get("task_cell"))
                SqlUtil.update_task(0, "'{}'".format(message.get("task_id")),
                                    "'{}'".format(str(exec_time)),
                                    "'{}'".format(str(pre_exec_time)))
                RedisUtil.release_lock()
                break
            time.sleep(0.3)
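# RedisUtil.monitor_score() above decides whether a task can be closed. A minimal sketch of
# one way such a check could look, assuming (this is an assumption, not the project's
# implementation) that the temporary task set is a redis sorted set whose members are url
# digests scored 10 for detail pages and 100 for list pages; the key name "monitor:{task_id}"
# is hypothetical.
import redis

def has_pending_detail_pages(r: redis.Redis, task_id: str) -> bool:
    # True while at least one detail-page member (score 10) remains in the set.
    return bool(r.zrangebyscore("monitor:{}".format(task_id), 10, 10, start=0, num=1))

# r = redis.Redis(host="127.0.0.1", port=6379)
# if not has_pending_detail_pages(r, "1001"):
#     ...close the task, mirroring the reschedule branch above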
def process(self):
    crawler_mode = self.crawler_setting.get("crawler_mode")
    if not crawler_mode:
        self.simple()
    else:
        try:
            user = self.crawler_setting.get("mq").get("user")
            pwd = self.crawler_setting.get("mq").get("pwd")
            host = self.crawler_setting.get("mq").get("host")
            port = self.crawler_setting.get("mq").get("port")
            mq_queue = get_queue(self.crawler_setting, "download")
        except AttributeError:
            user = "******"
            pwd = "crawler4py"
            host = "127.0.0.1"
            port = 5672
            mq_queue = "download"
        mq_conn = connect(mq_queue, user, pwd, host, port)
        try:
            plugin_path = self.crawler_setting.get("plugins").get("download")
        except AttributeError:
            plugin_path = None
        self.call_back(**{"no_ack": None, "channel": mq_conn, "routing_key": mq_queue,
                          "plugin_path": plugin_path})
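# Both process() methods above read the same settings shape. A sketch of a crawler_setting
# dict that would exercise the distributed (MQ) branch; every value here is a placeholder,
# not a shipped default (the code itself falls back to 127.0.0.1:5672 and the built-in queue
# names when the "mq" section is absent).
crawler_setting = {
    "crawler_mode": 1,       # falsy -> self.simple(), truthy -> MQ pipeline
    "task_cell": 10,         # dispatcher sleep / task reschedule interval, in seconds
    "mq": {"user": "guest", "pwd": "guest", "host": "127.0.0.1", "port": 5672},
    "plugins": {"download": "/path/to/download/plugins"},  # hypothetical plugin path
}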