def get_pending_task(batch_num):
    """Claim the next pending task of a batch and return it with its target.

    The agent's job (``os.environ['job']``) selects the pool: a "bc" agent
    only takes tasks whose type is 'bc'; any other job takes the tasks whose
    type is not 'bc'. The first pending task found is switched to
    "processing".

    Args:
        batch_num: value compared against ``TaskItem.batch_num``.

    Returns:
        A ``(target, task)`` tuple. ``target`` is the matching ``Weburl`` row
        for a 'weburl' task and the ``Website`` row for any other task type.
        ``(None, None)`` is returned when nothing is pending (the global
        STATUS flag is then set to False), and also when a 'weburl' task has
        no matching ``Weburl`` row (that task is marked "done").

    Raises:
        KeyError: if the ``agent_name`` or ``job`` environment variable is
            not set.
        Whatever ``Query.one()`` raises when the website lookup does not
        yield exactly one row.
    """
    agent_name = os.environ['agent_name']
    job = os.environ['job']

    task_pools = session.query(TaskItem).filter(TaskItem.batch_num == batch_num)
    if job == "bc":
        task_pools = task_pools.filter(
            TaskItem.status == 'pending', TaskItem.type == 'bc')
    else:
        task_pools = task_pools.filter(
            TaskItem.status == 'pending', TaskItem.type != 'bc')

    # Count once (it is only needed for the log line), then fetch the row.
    # first() may still come back empty if the pool was drained between the
    # two queries, so that case is handled like "nothing pending" instead of
    # failing on ``task_pool.id``.
    pending_count = task_pools.count()
    task_pool = task_pools.first() if pending_count else None
    if task_pool is None:
        logger.info("本Agent没有待巡检任务,Agent切换为waiting状态: %s", agent_name)
        # No task in 'pending' state: flag this agent as idle.
        gl.set_value('STATUS', False)
        return None, None

    logger.info("%s 准备执行可以处理的任务,倒数第:%s 个...", agent_name, str(pending_count))
    session.query(TaskItem).filter(TaskItem.id == task_pool.id).update(
        {"status": "processing"})

    if task_pool.type != "weburl":
        website = session.query(Website).filter(
            Website.id == task_pool.website_id).one()
        return website, task_pool

    logger.info("task_pool.website_id:%s", task_pool.website_id)
    # Only the first match is used, so do not load every matching row.
    weburl = session.query(Weburl).filter(Weburl.url == task_pool.url).filter(
        Weburl.website_id == task_pool.website_id).first()
    if weburl is not None:
        return weburl, task_pool

    # The task points at a URL that has no Weburl row: close the task.
    logger.info("task_pool.website_id:%s", task_pool.website_id)
    logger.info("task_pool.id:%s", task_pool.id)
    session.query(TaskItem).filter(TaskItem.id == task_pool.id).update(
        {"status": "done"})
    return None, None
def stop():
    """Handle a stop order for this agent.

    An agent whose job (``os.environ['job']``) is "gather" only logs that the
    order is ignored. Any other agent sets the global STATUS flag to False and
    sends a heartbeat through ``ims_api``.

    Returns:
        The string 'SUCCESS' in every case.
    """
    if os.environ['job'] == "gather":
        logger.info("My Job is gather,ignore the order!")
        return 'SUCCESS'

    gl.set_value('STATUS', False)
    ims_api.heartbeat()
    return 'SUCCESS'
def stop_tracking():
    """Handle a stop order for the tracking job.

    When this agent's job (``os.environ['job']``) is "tracking", both the
    STATUS and TRACKING_STATUS global flags are set to False and a heartbeat
    is sent through ``ims_api``. Any other agent only logs that the order is
    ignored.

    Returns:
        The string 'SUCCESS' in every case.
    """
    job = os.environ['job']
    if job == "tracking":
        gl.set_value('STATUS', False)
        gl.set_value('TRACKING_STATUS', False)
        ims_api.heartbeat()
    else:
        # This branch runs when the job is NOT tracking; the previous message
        # claimed the opposite.
        logger.info("My Job is not tracking,ignore the order!")
    return 'SUCCESS'
def tracking_execute():
    """Start a tracking inspection in a background thread.

    Only acts when this agent's job (``os.environ['job']``) is "tracking".
    It then calls ``stop_selenium()`` and ``stop_chrome()``, sets the STATUS
    and TRACKING_STATUS global flags to True, sends a heartbeat, reads
    ``taskId`` and ``status`` from ``request.form`` and runs
    ``inspect_tracking(task_id, status)`` on a daemon thread.

    Returns:
        The string 'OK' in every case, including when the job does not match
        or when starting the worker failed (the failure is logged).
    """
    job = os.environ['job']
    if job != "tracking":
        logger.info("Tracking is not my job!")
        return 'OK'

    # Original note: "restart selenium". Only the stop helpers are called
    # here; presumably the worker brings up a fresh browser itself.
    stop_selenium()
    stop_chrome()
    gl.set_value('STATUS', True)
    gl.set_value('TRACKING_STATUS', True)
    ims_api.heartbeat()
    try:
        task_id = request.form['taskId']
        status = request.form['status']
        logger.info("tracking begin task_id: %s,status: %s", str(task_id), str(status))
        t = threading.Thread(target=inspect_tracking, args=(task_id, status))
        # Attribute form replaces the deprecated Thread.setDaemon().
        t.daemon = True
        t.start()
    except Exception as e:
        logger.error(e)
        # No worker is running, so do not leave the agent flagged as busy.
        gl.set_value('STATUS', False)
        gl.set_value('TRACKING_STATUS', False)
    return 'OK'
def gather_urls():
    """Run URL gathering for one website, or for all of them.

    Only acts when this agent's job (``os.environ['job']``) is "gather". The
    STATUS global flag is True while gathering runs and is always set back to
    False afterwards, whether gathering succeeded or failed. The form field
    ``websiteId`` selects the website; the literal value 'all' is passed on
    as ``None``.

    Returns:
        'SUCCESS' when gathering finished or when the job does not match;
        'ERROR' when a ``KeyError`` was raised (for example a missing
        ``websiteId`` form field).

    Raises:
        Any non-``KeyError`` exception from the gathering call propagates
        after STATUS has been reset.
    """
    logger.info("receive gather_urls req !")
    job = os.environ['job']
    if job != "gather":
        logger.info("Gather is not my job!")
        return 'SUCCESS'

    try:
        gl.set_value('STATUS', True)
        weburl_service = WeburlService()
        website_id = request.form['websiteId']
        if website_id == 'all':
            website_id = None
        weburl_service.gather_urls_by_website(website_id)
        return 'SUCCESS'
    except KeyError as e:
        # Log through the shared logger like the other handlers in this file.
        logger.error(e)
        return 'ERROR'
    finally:
        # Previously STATUS stayed True on every failure path.
        gl.set_value('STATUS', False)
def execute():
    """Start an inspection batch in a background thread.

    Only acts when this agent's job (``os.environ['job']``) is "bc" or
    "other". It then restarts selenium (``stop_selenium()`` followed by
    ``start_selenium()``), sets the STATUS global flag to True, sends a
    heartbeat, reads ``batchNum`` from ``request.form`` and runs
    ``inspect(batch_num)`` on a daemon thread.

    Returns:
        The string 'OK' in every case, including when the job does not match
        or when starting the worker failed (the failure is logged).
    """
    logger.info("receive execute req !")
    job = os.environ['job']
    if job not in ("bc", "other"):
        logger.info("spider is not my job!")
        return 'OK'

    # Restart selenium before the new batch.
    logger.info("restart selenium...")
    stop_selenium()
    start_selenium()
    logger.info("update status...")
    gl.set_value('STATUS', True)
    logger.info("heartbeat...")
    ims_api.heartbeat()
    try:
        batch_num = request.form['batchNum']
        logger.info("spider begin batchNum: %s", str(batch_num))
        t = threading.Thread(target=inspect, args=(batch_num,))
        # Attribute form replaces the deprecated Thread.setDaemon().
        t.daemon = True
        t.start()
    except Exception as e:
        logger.error(e)
        # No worker is running, so do not leave the agent flagged as busy.
        gl.set_value('STATUS', False)
    return 'OK'
def monitor(task_id, status):
    """Check every tracking number of a task through a logged-in Chrome session.

    The details returned by ``TrackingDetailDao.get_by_task(task_id, status)``
    are processed one by one: the number is looked up on my.51tracking.com, a
    snapshot is taken, the JSON endpoint is read for ``track_status`` and the
    detail row is updated with the outcome. The loop stops early when the
    TRACKING_STATUS global flag becomes falsy.

    Args:
        task_id: identifier passed to the DAO and to ``done_tracking``.
        status: detail status passed to the DAO to select rows.

    Returns:
        None. On exit the task is reported via ``ims_api.done_tracking`` and
        the STATUS / TRACKING_STATUS flags are set to False.
    """
    ims_api = ImsApi()
    tracking_dao = TrackingDetailDao()
    # Every status code the site may report, mapped to its description.
    status_dict = {'0': '查询中', '1': '查询不到', '2': '运输途中', '3': '到达待取',
                   '4': '成功签收', '5': '运输过久', '6': '投递失败', '7': '可能异常'}
    # Status codes whose result is recorded as "true"; '6' and '7' are not.
    normal_status_dict = {'0': '查询中', '1': '查询不到', '2': '运输途中',
                          '3': '到达待取', '4': '成功签收', '5': '运输过久'}

    def _mark_failed(detail):
        # Record a detail as "needs manual verification" and persist it.
        detail.result = "false"
        detail.des = "检测疑似异常,建议手动验证!"
        detail.end_time = datetime.datetime.now()
        detail.url = ""
        detail.snapshot = ""
        tracking_dao.update(detail)

    tracking_details = tracking_dao.get_by_task(task_id, status)
    if len(tracking_details) > 0:
        # Pre-bind both names: the handlers below used to hit NameError when
        # the browser could not be created or the login failed before the
        # loop, which also skipped the final done_tracking / flag reset.
        driver = None
        tracking_detail = None
        try:
            driver = WebDriver.get_chrome()
            driver.get("https://www.trackingmore.com/login-cn.html")
            # NOTE(review): credentials are hard-coded in source; they should
            # come from configuration or the environment.
            driver.find_element_by_id("email").send_keys("*****@*****.**")
            driver.find_element_by_id("password").send_keys("0418YXYwlx")
            driver.find_element_by_id("login_test").click()
            time.sleep(5)
            for tracking_detail in tracking_details:
                if not gl.get_value('TRACKING_STATUS'):
                    # A stop order arrived: report the task and bail out.
                    logger.info("快递单任务已停止,任务id:%s", task_id)
                    gl.set_value('STATUS', False)
                    gl.set_value('TRACKING_STATUS', False)
                    ims_api.done_tracking(task_id)
                    return
                tracking_detail.start_time = datetime.datetime.now()
                tracking_detail.status = "done"
                logger.info("准备检查单号:%s ", tracking_detail.tracking_num)
                try:
                    driver.get(
                        "https://my.51tracking.com/numbers.php?lang=cn&keywordType=trackNumber&p=1&searchnumber="
                        + tracking_detail.tracking_num)
                    driver.maximize_window()
                    time.sleep(3)
                    driver.find_element_by_id('trackItem_0').click()
                    time.sleep(1)
                    snapshot = SnapshotService.snapshot_tracking(driver, tracking_detail)
                    url = "https://my.51tracking.com/data/data-numbers.php?lang=cn&action=get_my_number" \
                          "&source=2&where=lang%3Dcn%26p%3D1%26keywordType%3DtrackNumber%26searchnumber%3D" \
                          + tracking_detail.tracking_num + "&page=1"
                    driver.get(url)
                    json_data = driver.find_element_by_tag_name("body").text
                    json_obj = json.loads(str(json_data))
                    # Separate name so the ``status`` argument is not shadowed.
                    track_status = json_obj['data'][0]['track_status']
                    tracking_detail.des = status_dict[track_status]
                    tracking_detail.end_time = datetime.datetime.now()
                    tracking_detail.url = ""
                    tracking_detail.snapshot = snapshot
                    if track_status in normal_status_dict:
                        logger.info("单号巡检状态:%s", track_status)
                        tracking_detail.result = "true"
                    else:
                        tracking_detail.result = "false"
                    tracking_dao.update(tracking_detail)
                except Exception as e:
                    logger.error(e)
                    _mark_failed(tracking_detail)
                    # Back off for ten minutes after a failed lookup.
                    time.sleep(600)
        except Exception as e:
            logger.error(e)
            if tracking_detail is not None:
                _mark_failed(tracking_detail)
        finally:
            if driver is not None:
                driver.quit()
    else:
        logger.info("单号任务没有需要检索的单号,任务id:%s,单号状态: %s", task_id, status)
        gl.set_value('STATUS', False)
        gl.set_value('TRACKING_STATUS', False)
    ims_api.done_tracking(task_id)
    gl.set_value('STATUS', False)
    gl.set_value('TRACKING_STATUS', False)
def monitor(task_id, status):
    """Check every tracking number of a task on trackingmore.com via PhantomJS.

    The details returned by ``TrackingDetailDao.get_by_task(task_id, status)``
    are processed one by one. Before each lookup the function sleeps for the
    strategy's ``frequency`` (when set and non-zero) plus a random 10-15
    seconds. A fresh PhantomJS driver is created per detail and quit
    afterwards. The loop stops early when the TRACKING_STATUS global flag
    becomes falsy.

    Args:
        task_id: identifier passed to the DAO and to ``done_tracking``.
        status: detail status passed to the DAO to select rows.

    Returns:
        None. On exit the task is reported via ``ims_api.done_tracking`` and
        the STATUS / TRACKING_STATUS flags are set to False.
    """
    ims_api = ImsApi()
    tracking_dao = TrackingDetailDao()
    strategy_service = StrategyService()
    strategy = strategy_service.get_strategy()

    def _record(detail, result, des, page_url, snapshot):
        # Fill in the outcome fields of one detail row (not persisted here).
        detail.result = result
        detail.des = des
        detail.end_time = datetime.datetime.now()
        detail.url = page_url
        detail.snapshot = snapshot

    tracking_details = tracking_dao.get_by_task(task_id, status)
    if len(tracking_details) > 0:
        for tracking_detail in tracking_details:
            if not gl.get_value('TRACKING_STATUS'):
                # A stop order arrived: report the task and bail out.
                logger.info("快递单任务已停止,任务id:%s", task_id)
                gl.set_value('STATUS', False)
                gl.set_value('TRACKING_STATUS', False)
                ims_api.done_tracking(task_id)
                return

            if strategy.frequency == 0 or strategy.frequency is None:
                logger.info("未设置爬取频率限制,继续执行任务..")
            else:
                logger.info("爬取频率限制为:%s 秒", strategy.frequency)
                time.sleep(strategy.frequency)
            random_seconds = random.randint(10, 15)
            logger.info("快递单检测随机等待 %s 秒...", str(random_seconds))
            time.sleep(random_seconds)

            tracking_detail.start_time = datetime.datetime.now()
            tracking_detail.status = "done"
            logger.info("准备检查单号:%s ", tracking_detail.tracking_num)
            url = "https://www.trackingmore.com/cn/" + tracking_detail.tracking_num
            logger.info("url:%s ", url)
            driver = WebDriver.get_phantomjs()
            try:
                driver.get(url)
            except Exception as e:
                logger.error(e)
                # A failed page load is recorded with result "true" and a
                # note asking for manual verification, then skipped.
                _record(tracking_detail, "true", "检测超时,建议手动验证:" + url, url, "")
                tracking_dao.update(tracking_detail)
                logger.info("单号巡检发生异常,跳过")
                driver.quit()
                continue

            try:
                soup = BeautifulSoup(driver.page_source, 'html.parser')
                snapshot = SnapshotService.snapshot_tracking(driver, tracking_detail)
                a_tags = soup.find_all("a", attrs={'class': 'ulliselect'})
                if len(a_tags) > 0:
                    # Follow the link whose text equals the detail's
                    # tracking_name; only the first match is used.
                    has_tracking = False
                    for a_tag in a_tags:
                        if a_tag.get_text().strip() != tracking_detail.tracking_name:
                            continue
                        has_tracking = True
                        url = "http:" + a_tag.get("href")
                        driver.get(url)
                        snapshot = SnapshotService.snapshot_tracking(driver, tracking_detail)
                        try:
                            # Parse the page once and reuse it for both checks.
                            soup = BeautifulSoup(driver.page_source, 'html.parser')
                            blocked = soup.find_all(attrs={'class': 'line-gutter-backdrop'})
                            if len(blocked) != 0:
                                # Any element with this class is treated as an
                                # intercepted request.
                                _record(tracking_detail, "false", "爬虫请求疑似被拦截,建议手动验证!",
                                        url, snapshot)
                            elif len(soup.find_all("li", attrs={'class': 's-packStatst'})) > 0:
                                _record(tracking_detail, "true", "物流正常", url, snapshot)
                            else:
                                _record(tracking_detail, "false", "没有查询到物流信息", url, snapshot)
                        except Exception as e:
                            # Use the shared logger like the other handlers.
                            logger.error(e)
                            _record(tracking_detail, "false", "检测疑似异常,建议手动验证!",
                                    url, snapshot)
                        break
                    if not has_tracking:
                        _record(tracking_detail, "false", "提供的单号-快递公司关系疑似不匹配",
                                url, snapshot)
                else:
                    # No link list on the page: look for result entries directly.
                    messages = soup.find_all("dd", attrs={'class': 'post_message'})
                    if len(messages) > 0:
                        _record(tracking_detail, "true", "巡检正常", url, snapshot)
                    else:
                        _record(tracking_detail, "false", "没有查询物流信息", url, snapshot)
                tracking_dao.update(tracking_detail)
            except Exception as e:
                logger.error(e)
                _record(tracking_detail, "false", "检测疑似异常,建议手动验证!", url, "")
                tracking_dao.update(tracking_detail)
            finally:
                driver.quit()
    else:
        logger.info("单号任务没有需要检索的单号,任务id:%s,单号状态: %s", task_id, status)
        gl.set_value('STATUS', False)
        gl.set_value('TRACKING_STATUS', False)
    ims_api.done_tracking(task_id)
    gl.set_value('STATUS', False)
    gl.set_value('TRACKING_STATUS', False)
from flask import request

import config.global_val as gl
from config.mylog import logger
from dao.tracking_task_dao import TrackingTaskDao
from manager.gather_center import GatherCenter
from manager.ims_api import ImsApi
from service.monitor_bc_service import MonitorBcService
from service.monitor_tracking_service import MonitorTrackingService
from service.weburl_service import WeburlService

app = Flask(__name__)

# Set up the cross-module global store; both flags start as False.
gl._init()
gl.set_value('STATUS', False)
gl.set_value('TRACKING_STATUS', False)

ims_api = ImsApi()


@app.route('/verify_cookie', methods=['POST'])
def verify_cookie():
    """POST /verify_cookie: build a MonitorBcService and answer "SUCCESS".

    NOTE: the actual check (``check_cookie()`` on the service) is disabled in
    this version, so the response is always the fixed string.
    """
    bc_service = MonitorBcService()
    return "SUCCESS"


# Section marker string for what follows (Chinese for "crawl data").
''' 爬取数据 '''