class TestWangdaitianyan(object):
    # NOTE(review): a class body wrapping an `if __name__` guard is unusual —
    # this reads as a pasted-together ad-hoc test script. Structure kept as-is.
    if __name__ == "__main__":
        """ chrome_options = webdriver.ChromeOptions() chrome_options.add_argument('--headless') driver = webdriver.Chrome(chrome_options=chrome_options, executable_path="C:/chromedriver_2.38/chromedriver.exe") driver.get("http://www.chinaft.com.cn/news/search/_1.shtml?key=" + urllib.parse.quote("京东")) """
        # Scrape ChinaFT news-search results for the keyword "京东" and print
        # each result's link and title.
        driver = WebDriver.get_chrome()
        try:
            driver.get("http://www.chinaft.com.cn/news/search/_1.shtml?key=" +
                       urllib.parse.quote("京东"))
        except Exception as e:
            # Bug fix: the original printed the literals "error" and "e",
            # discarding the actual exception; log the exception itself.
            print(e)
        SnapshotService.create_snapshot(driver)
        time.sleep(5)  # give the page time to finish rendering before parsing
        source = driver.page_source
        soup = BeautifulSoup(source, 'html.parser')
        news = soup.find_all("div", attrs={'class': 'xixi_ChinaFT_left_news_box'})
        # Idiomatic truthiness check replaces `news.__len__() > 0`; an empty
        # result list simply skips the loop.
        for new in news:
            # Second <a> of each result box carries the article link/title
            # (per the original's [1] index) — TODO confirm against site markup.
            anchor = new.find_all('a')[1]
            href = anchor.get("href")
            print("http://www.chinaft.com.cn" + href)
            print(anchor.get_text())
class TestWangdaitianyan(object):
    # Ad-hoc scraper script: search wangdaibus.com for a keyword and log each
    # result's link and title. Runs only when executed directly.
    if __name__ == "__main__":
        """ chrome_options = webdriver.ChromeOptions() chrome_options.add_argument('--headless') driver = webdriver.Chrome(chrome_options=chrome_options, executable_path="C:/chromedriver_2.38/chromedriver.exe") driver.get("http://www.wangdaibus.com/search.php?mod=forum") driver.find_element_by_id("scform_srchtxt").send_keys(u"京东") driver.find_element_by_id("scform_submit").click() """
        driver = WebDriver.get_chrome()
        try:
            # Search performed via pre-encoded URL (%E4%BA%AC%E4%B8%9C == "京东")
            driver.get(
                "http://www.wangdaibus.com/search/list?subject=%E4%BA%AC%E4%B8%9C"
            )
            aaa = "京东"  # NOTE(review): unused — the send_keys lines below are disabled
            #lement_by_xpath('//input[@name="subject"]').send_keys(aaa)
            #driver.find_element_by_xpath('//input[@name="subject"]').send_keys(Keys.ENTER)
            time.sleep(10)  # wait for search results to load
        except Exception as e:
            # Error handling: log and fall through to snapshot the page anyway
            logger.error(e)
            pass
        SnapshotService.create_snapshot(driver)
        source = driver.page_source
        soup = BeautifulSoup(source, 'html.parser')
        # Result titles live in <h3 class="xs3"> elements
        news = soup.find_all("h3", attrs={'class': 'xs3'})
        if news.__len__() > 0:
            for new in news:
                href = new.find_all('a')[0].get("href")
                logger.info("http://www.wangdaibus.com/" + href)
                logger.info(new.get_text())
        # NOTE(review): the stray triple-quote below is unterminated in this
        # chunk — it looks like residue from concatenated files; left untouched.
        '''
class TestWangdaitianyan(object):
    # Ad-hoc scraper script: run a Baidu search for "京东" and print each
    # result's title and link. Runs only when executed directly.
    if __name__ == "__main__":
        """ chrome_options = webdriver.ChromeOptions() chrome_options.add_argument('--headless') driver = webdriver.Chrome(chrome_options=chrome_options, executable_path="C:/chromedriver_2.38/chromedriver.exe") driver.get("https://www.baidu.com/") driver.find_element_by_id("kw").send_keys(u"京东") driver.find_element_by_id("su").click() """
        driver = WebDriver.get_chrome()
        try:
            driver.get("https://www.baidu.com/")
            # Type the query into the search box; NOTE(review): no submit/Enter
            # is sent here — presumably relies on Baidu's live suggestions or
            # a later navigation; confirm intended.
            driver.find_element_by_xpath('//input[@name="wd"]').send_keys(u"京东")
        except Exception as e:
            # Error handling: log and continue to snapshot regardless
            print(e)
            pass
        SnapshotService.create_snapshot(driver)
        time.sleep(5)  # allow results to render
        source = driver.page_source
        soup = BeautifulSoup(source, 'html.parser')
        # Baidu organic results are <h3 class="t"> entries
        for result_table in soup.find_all('h3', class_='t'):
            a_click = result_table.find("a");
            print(a_click.get_text())  # title
            print(str(a_click.get("href")))  # link
        # NOTE(review): stray unterminated triple-quote — paste residue; untouched.
        '''
class TestWangdaitianyan(object):
    # Ad-hoc scraper script: open a fixed wdzj.com news article and print the
    # hrefs found in its 'so-tzbox' list. Runs only when executed directly.
    if __name__ == "__main__":
        """ chrome_options = webdriver.ChromeOptions() chrome_options.add_argument('--headless') driver = webdriver.Chrome(chrome_options=chrome_options, executable_path="C:/chromedriver_2.38/chromedriver.exe") driver.get("https://www.wdzj.com/front/search/index?key=" + urllib.parse.quote("猫小贷")) """
        try:
            driver = WebDriver.get_chrome()
            driver.get("http://www.wdzj.com/news/yc/2934681.html")
        except Exception as e:
            # Error handling: log and continue — NOTE(review): if get_chrome()
            # itself failed, `driver` below would be unbound; confirm acceptable.
            print(e)
            pass
        SnapshotService.create_snapshot(driver)
        time.sleep(5)  # let the page settle before parsing
        source = driver.page_source
        soup = BeautifulSoup(source, 'html.parser')
        print(soup.find_all("ul", attrs={'class': 'so-tzbox'}))
        # NOTE(review): [0] raises IndexError when no 'so-tzbox' list exists —
        # confirm the element is always present on article pages.
        news = soup.find_all("ul", attrs={'class': 'so-tzbox'})[0].find_all("li")
        if news.__len__() > 0:
            for new in news:
                href = new.find_all('a')[0].get("href")
                # href[2:] presumably strips a protocol-relative "//" prefix —
                # TODO confirm against the site's markup.
                print(href[2:])
def senti_process_text(platform, text, href, batch_num, website):
    """Scan `text` for configured sensitive keywords and persist one
    MonitorThird record per hit (plus one "normal" record for 百度百科 when
    nothing matched). Takes a snapshot of `href` for evidence.

    platform  -- source platform label stored in the record's `type` field
    text      -- text body to scan for keyword occurrences
    href      -- page URL; loaded in the driver and snapshotted
    batch_num -- inspection batch identifier
    website   -- merchant/site descriptor whose fields are copied onto records
    """
    driver = WebDriver.get_chrome()
    keyword_dao = KeywordDao()
    monitor_third_dao = MonitorThirdDao()
    # Snapshot the target page, then scan the supplied text
    try:
        driver.get(href)
        snapshot = SnapshotService.create_snapshot(driver, batch_num, website, '舆情')
        is_normal = "正常"
        keywords = keyword_dao.get_all()
        for keyword in keywords:
            # substring search; -1 means the keyword is absent
            index = text.find(keyword.name)
            monitor_third = MonitorThird()
            monitor_third.website_name = website.website_name
            monitor_third.merchant_num = website.merchant_num
            monitor_third.merchant_name = website.merchant_name
            monitor_third.domain_name = website.domain_name
            monitor_third.saler = website.saler
            monitor_third.batch_num = batch_num
            monitor_third.url = href
            monitor_third.type = platform
            if index != -1:
                # Keyword hit: record an abnormal high-level finding
                is_normal = "异常"
                monitor_third.is_normal = is_normal
                monitor_third.level = '高'
                monitor_third.outline = '检测到敏感词:' + str(keyword.name)
                monitor_third.snapshot = snapshot
                monitor_third_dao.add(monitor_third)
            else:
                pass
        if is_normal == "正常":
            # Only 百度百科 gets an explicit "all clear" record.
            # NOTE(review): reuses `monitor_third` from the last loop iteration;
            # raises NameError if the keyword list is empty — confirm intended.
            if platform == "百度百科":
                monitor_third.level = '-'
                monitor_third.outline = '-'
                monitor_third.is_normal = is_normal
                monitor_third.snapshot = snapshot
                monitor_third_dao.add(monitor_third)
            pass
    except ConnectionError as conn_error:
        logger.error(conn_error)
    except Exception as e:
        logger.error(e)
        return
    finally:
        # Always release the browser, even on failure
        driver.quit()
def inspect(batch_num, url, website):
    """Check a merchant's business-registration (工商) status via the IMS API
    and persist a MonitorBc record, with a qichacha page screenshot.

    batch_num -- inspection batch identifier
    url       -- unused on entry; immediately overwritten with the API endpoint
    website   -- merchant/site descriptor whose fields are copied onto the record
    """
    monitor_bc_dao = MonitorBcDao()
    monitor_bc = MonitorBc()
    monitor_bc.batch_num = batch_num
    monitor_bc.domain_name = website.domain_name
    monitor_bc.merchant_num = website.merchant_num
    monitor_bc.website_name = website.website_name
    monitor_bc.merchant_name = website.merchant_name
    monitor_bc.saler = website.saler
    # Optimistic defaults; overwritten below when the API reports a problem
    monitor_bc.is_normal = '正常'
    monitor_bc.kinds = '工商巡检'
    monitor_bc.outline = '企业工商信息检查正常'
    monitor_bc.level = '-'
    # POST merchant identity to the IMS business-check endpoint
    url = ims_rest_base + "open/api/v1/agent/monitor_bc"
    data_json = {
        "merchantNum": website.merchant_num,
        "merchantName": website.merchant_name
    }
    data = bytes(parse.urlencode(data_json), encoding="utf8")
    new_url = request.Request(url, data)
    res = request.urlopen(new_url).read().decode('utf-8')
    bc_response = json.loads(res)
    if bc_response['status'] is True:
        logger.info("企业工商信息检测正常:%s", website.merchant_name)
    else:
        logger.info("企业工商信息检测异常:%s", website.merchant_name)
        monitor_bc.is_normal = '异常'
        monitor_bc.kinds = '企业工商信息'
        monitor_bc.outline = bc_response['msg']
        monitor_bc.level = '高'
    # Screenshot the internal qichacha view for this merchant as evidence
    url = ims_rest_base + "views/system/qichacha.jsp?merchantNum=" + website.merchant_num
    driver = WebDriver.get_phantomjs()
    try:
        logger.info("企业工商信息截图:%s", website.merchant_name)
        driver.get(url)
        snapshot = SnapshotService.create_snapshot(driver, batch_num, website,
                                                   "工商巡检")
        monitor_bc.snapshot = snapshot
        monitor_bc_dao.add(monitor_bc)
    except Exception as e:
        # Fix: use the module logger (as sibling functions do) instead of print,
        # so the failure reaches the log aggregation; still persist the record.
        logger.error(e)
        monitor_bc_dao.add(monitor_bc)
    finally:
        driver.quit()
def snapshot_home(platform, href, batch_num, website, driver):
    """Capture a homepage snapshot with the supplied driver and persist a
    MonitorThird record describing it.

    platform  -- source platform label stored in the record's `type` field
    href      -- homepage URL recorded on the monitoring row
    batch_num -- inspection batch identifier
    website   -- merchant/site descriptor whose fields are copied onto the record
    driver    -- already-initialized webdriver pointing at the page to shoot
    """
    dao = MonitorThirdDao()
    try:
        # Screenshot whatever page the driver currently shows
        shot = SnapshotService.create_snapshot(driver, batch_num, website, '舆情')
        record = MonitorThird()
        # Copy merchant identity straight off the website descriptor
        for attr in ('merchant_num', 'merchant_name', 'website_name',
                     'domain_name', 'saler'):
            setattr(record, attr, getattr(website, attr))
        record.batch_num = batch_num
        record.url = href
        record.type = platform
        record.level = '-'
        record.outline = '首页截图'
        record.is_normal = "正常"
        record.snapshot = shot
        dao.add(record)
    except Exception as e:
        # Snapshot/persist failures are logged and swallowed on purpose:
        # a missing screenshot must not abort the caller's inspection run.
        logger.error(e)
        return
def editUserAgent():
    """Launch PhantomJS with a spoofed Chrome user agent, then probe each
    comma-separated domain of a hard-coded test URL: try an HTTPS GET, load
    the page, snapshot it, and skip domains that look blank or redirected.

    NOTE(review): reconstructed from whitespace-mangled source — the nesting
    of the domain-parsing section is a best-effort reading; verify against
    the original file before relying on exact control flow.
    """
    # Spoof a desktop Chrome UA so sites serve the normal page to PhantomJS
    dcap = dict(DesiredCapabilities.PHANTOMJS)
    dcap['phantomjs.page.settings.userAgent'] = (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'
    )
    driver = webdriver.PhantomJS(
        executable_path="D:/software/phantomjs-2.1.1-windows/bin/phantomjs.exe",
        desired_capabilities=dcap,
        service_args=[
            '--ignore-ssl-errors=true', '--ssl-protocol=any',
            '--load-images=false'
        ])
    try:
        url = "www.gragreati.com"  # hard-coded probe target
        driver.set_page_load_timeout(60)
        driver.set_script_timeout(60)
        source = driver.page_source
        soup = BeautifulSoup(source, 'html.parser')
        website = Website()
        website.domain_name = url
        if website.domain_name is None or len(website.domain_name) == 0:
            return
        else:
            pass
        # Homepage monitoring: probe each comma-separated domain
        domain_names = str(website.domain_name)
        domain_name_list = domain_names.split(",")
        for domain_name in domain_name_list:
            domain_name_rich = domain_name
            dns = domain_name
            if str(domain_name).startswith("http"):
                # Strip scheme, then keep only the host part before any path
                temp = domain_name[domain_name.find("/") + 2:]
                if str(temp).find("/") == -1:
                    dns = temp
                else:
                    start = temp.find("/")
                    dns = temp[0:start]
            else:
                if str(domain_name).find("/") == -1:
                    pass
                else:
                    start = domain_name.find("/")
                    dns = domain_name[0:start]
                    pass
                # No scheme given: default to plain http
                domain_name_rich = "http://" + domain_name
            try:
                # Cheap reachability probe before spinning up a page load
                conn = http.client.HTTPSConnection(dns, timeout=10)
                conn.request('GET', domain_name_rich)
                resp = conn.getresponse()
                code = resp.code
            except Exception as e:
                # Only a timeout skips the domain; other errors fall through
                if str(e).find("timed out") == -1:
                    pass
                else:
                    continue
            try:
                driver.get(domain_name_rich)
                current_url = driver.current_url
                title = driver.title
                source = driver.page_source
                snapshot = SnapshotService.create_snapshot(
                    driver, "", website, '网站')
                # Blank page signature => treat as unreachable and move on
                if str(current_url) == "about:blank" and str(
                        source
                ) == "<html><head></head><body></body></html>" and str(
                        title) == "":
                    driver.quit()
                    continue
                else:
                    pass
                # Landing URL no longer contains our host => redirected away
                if str(current_url).find(dns) == -1:
                    driver.quit()
                    continue
                else:
                    pass
            except Exception as e:
                print(e)
            finally:
                # NOTE(review): quitting inside the loop kills the driver for
                # subsequent iterations and for save_screenshot below — confirm.
                driver.quit()
        driver.save_screenshot("D:/333.png")
        print()
    except Exception as e:
        print(e)
        driver.quit()
def inspect(batch_num, url, website, task_id):
    """Check a merchant's business-registration status through the IMS API
    (task-aware variant) and persist a MonitorBc record plus a screenshot of
    the internal enterprise view.

    batch_num -- inspection batch identifier
    url       -- unused on entry; immediately overwritten with the API endpoint
    website   -- merchant/site descriptor copied onto the record
    task_id   -- inspection task identifier forwarded to the API
    """
    monitor_bc_dao = MonitorBcDao()
    monitor_bc = MonitorBc()
    monitor_bc.batch_num = batch_num
    monitor_bc.domain_name = website.domain_name
    monitor_bc.merchant_num = website.merchant_num
    monitor_bc.website_name = website.website_name
    monitor_bc.merchant_name = website.merchant_name
    monitor_bc.saler = website.saler
    # Optimistic defaults; adjusted by the branches below
    monitor_bc.is_normal = '正常'
    monitor_bc.kinds = '工商巡检'
    monitor_bc.outline = '企业工商信息检查正常'
    monitor_bc.level = '-'
    # POST merchant identity + task context to the business-check endpoint
    url = ims_rest_base + "open/api/v1/agent/monitor_bc"
    data_json = {
        "merchantNum": website.merchant_num,
        "merchantName": website.merchant_name,
        "taskId": task_id,
        "batchNum": batch_num
    }
    data = bytes(parse.urlencode(data_json), encoding="utf8")
    new_url = request.Request(url, data)
    res = request.urlopen(new_url).read().decode('utf-8')
    logger.info("res:%s", res)
    bc_response = json.loads(res)
    logger.info("bc_response:%s", bc_response)
    if bc_response['isNormal'] == "正常":
        monitor_bc.status = bc_response['businessInfo']['enterpriseBase'][
            'entStatus']
        logger.info("企业工商信息检测正常:%s", website.merchant_name)
    elif bc_response['isNormal'] == "无法获取":
        logger.info("企业工商信息检测无法获取:%s", website.merchant_name)
        monitor_bc.is_normal = '无法获取'
        monitor_bc.status = bc_response['status']
        # NOTE(review): outline is assigned twice — the API msg is immediately
        # overwritten by the fixed text; confirm which value is intended.
        monitor_bc.outline = bc_response['msg']
        monitor_bc.outline = '企业工商信息无法获取,跳过巡检。'
    else:
        logger.info("企业工商信息检测异常:%s", website.merchant_name)
        monitor_bc.is_normal = '异常'
        monitor_bc.kinds = '企业工商信息'
        monitor_bc.outline = bc_response['msg']
        # entStatus only exists when enterpriseBase was returned
        enterpriseBase = bc_response['businessInfo']['enterpriseBase']
        if enterpriseBase is None:
            monitor_bc.status = bc_response['status']
        else:
            monitor_bc.status = bc_response['businessInfo'][
                'enterpriseBase']['entStatus']
        monitor_bc.level = '高'
    # Screenshot the internal enterprise view for this merchant as evidence
    url = ims_rest_base + "views/system/enterprise.jsp?merchantNum=" + website.merchant_num
    driver = WebDriver.get_phantomjs()
    try:
        logger.info("企业工商信息截图:%s", website.merchant_name)
        driver.get(url)
        snapshot = SnapshotService.create_snapshot(driver, batch_num, website,
                                                   "工商巡检")
        monitor_bc.snapshot = snapshot
        monitor_bc_dao.add(monitor_bc)
    except Exception as e:
        # Record is persisted even when the screenshot fails
        print(e)
        monitor_bc_dao.add(monitor_bc)
    finally:
        driver.quit()
def monitor(task_id, status):
    """Poll 51tracking/trackingmore for each pending tracking number of a
    task, snapshot the result page, map the returned track_status code to a
    human-readable description, and update the TrackingDetail rows.

    task_id -- inspection task identifier
    status  -- detail status filter passed to the DAO (also reused below as
               the per-number track_status code — NOTE(review): shadowing the
               parameter; confirm intended)
    """
    ims_api = ImsApi()
    tracking_dao = TrackingDetailDao()
    # track_status code -> human-readable description
    status_dict = {'0': '查询中', '1': '查询不到', '2': '运输途中', '3': '到达待取', '4': '成功签收', '5': '运输过久', '6': '投递失败', '7': '可能异常'}
    # Subset of codes considered a "normal" outcome
    normal_status_dict = {'0': '查询中', '1': '查询不到', '2': '运输途中', '3': '到达待取', '4': '成功签收', '5': '运输过久'}
    tracking_details = tracking_dao.get_by_task(task_id, status)
    if tracking_details.__len__() > 0:
        try:
            # Log into the tracking site once, then loop over the numbers.
            # SECURITY NOTE(review): hard-coded credentials — move to config.
            driver = WebDriver.get_chrome()
            driver.get("https://www.trackingmore.com/login-cn.html")
            driver.find_element_by_id("email").send_keys("*****@*****.**")
            driver.find_element_by_id("password").send_keys("0418YXYwlx")
            driver.find_element_by_id("login_test").click()
            time.sleep(5)  # wait for the login to complete
            for tracking_detail in tracking_details:
                # Global kill-switch: stop cleanly when the task is cancelled
                if gl.get_value('TRACKING_STATUS'):
                    pass
                else:
                    logger.info("快递单任务已停止,任务id:%s", task_id)
                    gl.set_value('STATUS', False)
                    gl.set_value('TRACKING_STATUS', False)
                    ims_api.done_tracking(task_id)
                    return
                tracking_detail.start_time = datetime.datetime.now()
                tracking_detail.status = "done"
                logger.info("准备检查单号:%s ", tracking_detail.tracking_num)
                try:
                    # Open the per-number search page and expand the first hit
                    driver.get(
                        "https://my.51tracking.com/numbers.php?lang=cn&keywordType=trackNumber&p=1&searchnumber="
                        + tracking_detail.tracking_num)
                    driver.maximize_window()
                    time.sleep(3)
                    # driver.find_element_by_class_name("show_lastEvent").click()
                    driver.find_element_by_id('trackItem_0').click()
                    time.sleep(1)
                    snapshot = SnapshotService.snapshot_tracking(driver, tracking_detail)
                    # Fetch the JSON status via the site's data endpoint
                    url = "https://my.51tracking.com/data/data-numbers.php?lang=cn&action=get_my_number" \
                          "&source=2&where=lang%3Dcn%26p%3D1%26keywordType%3DtrackNumber%26searchnumber%3D" \
                          + tracking_detail.tracking_num + "&page=1"
                    driver.get(url)
                    json_data = driver.find_element_by_tag_name("body").text
                    json_obj = json.loads(str(json_data))
                    status = json_obj['data'][0]['track_status']
                    tracking_detail.des = status_dict[status]
                    tracking_detail.end_time = datetime.datetime.now()
                    tracking_detail.url = ""
                    tracking_detail.snapshot = snapshot
                    # Membership test against the normal-code keys
                    if status in normal_status_dict:
                        logger.info("单号巡检状态:%s", status)
                        tracking_detail.result = "true"
                    else:
                        tracking_detail.result = "false"
                    tracking_dao.update(tracking_detail)
                except Exception as e:
                    # Per-number failure: mark for manual verification and
                    # back off before the next number
                    logger.error(e)
                    tracking_detail.result = "false"
                    tracking_detail.des = "检测疑似异常,建议手动验证!"
                    tracking_detail.end_time = datetime.datetime.now()
                    tracking_detail.url = ""
                    tracking_detail.snapshot = ""
                    tracking_dao.update(tracking_detail)
                    time.sleep(600)  # long cool-down, presumably anti-blocking
        except Exception as e:
            # Login/loop-level failure: flag the detail in flight (if any)
            logger.error(e)
            tracking_detail.result = "false"
            tracking_detail.des = "检测疑似异常,建议手动验证!"
            tracking_detail.end_time = datetime.datetime.now()
            tracking_detail.url = ""
            tracking_detail.snapshot = ""
            tracking_dao.update(tracking_detail)
        finally:
            driver.quit()
    else:
        logger.info("单号任务没有需要检索的单号,任务id:%s,单号状态: %s", task_id, status)
        gl.set_value('STATUS', False)
        gl.set_value('TRACKING_STATUS', False)
        ims_api.done_tracking(task_id)
    # Always clear the global run flags on exit
    gl.set_value('STATUS', False)
    gl.set_value('TRACKING_STATUS', False)
def monitor_website(website, batch_num):
    """Check whether each comma-separated homepage domain of `website` opens,
    snapshot the result, fetch a traffic estimate, and persist one
    MonitorWebsite row per domain.

    website   -- merchant/site descriptor (domain_name may hold several
                 comma-separated domains)
    batch_num -- inspection batch identifier
    """
    # NOTE(review): missing parentheses — this binds the class itself, not an
    # instance; works only if .add() is usable on the class. Confirm.
    monitor_website_dao = MonitorWebsiteDao
    if len(website.domain_name) == 0:
        # No domain configured: record an "unobtainable" row and stop
        logger.info("website_domain is None! merchant_name: %s ",
                    website.merchant_name)
        monitor_website = MonitorWebsite()
        monitor_website.website_name = website.website_name
        monitor_website.merchant_name = website.merchant_name
        monitor_website.merchant_num = website.merchant_num
        monitor_website.domain_name = website.domain_name
        monitor_website.saler = website.saler
        monitor_website.batch_num = batch_num
        monitor_website.kinds = "首页是否可打开"
        monitor_website.level = '-'
        monitor_website.access = '异常'
        monitor_website.is_normal = '无法获取'
        monitor_website.outline = '商户域名为空。'
        monitor_website.level = '-'
        monitor_website.pageview = '-'
        monitor_website_dao.add(monitor_website)
        return
    else:
        logger.info("website_domain is not None! merchant_name: %s ",
                    website.domain_name)
        # Homepage monitoring over every configured domain
        driver = WebDriver.get_phantomjs()
        service = TrafficService()
        access = AccessibleService()
        domain_names = str(website.domain_name)
        domain_name_list = domain_names.split(",")
        for domain_name in domain_name_list:
            try:
                logger.info("-------------------")
                logger.info("check whether website available,domain_name : %s",
                            website.domain_name)
                # Fresh record per domain
                monitor_website = MonitorWebsite()
                monitor_website.website_name = website.website_name
                monitor_website.merchant_name = website.merchant_name
                monitor_website.merchant_num = website.merchant_num
                monitor_website.saler = website.saler
                monitor_website.domain_name = domain_name
                monitor_website.batch_num = batch_num
                monitor_website.kinds = "首页是否可打开"
                monitor_website.level = '-'
                monitor_website.snapshot = ""
                logger.info("预留使用代理入口...")
                # Disabled proxy-first probing, kept for future use:
                # domain_name_rich, current_url = access.get_proxy_access_res(domain_name)
                # if domain_name_rich is None:
                #     logger.info("不使用代理重试访问: %s", domain_name)
                #     domain_name_rich, current_url = access.get_access_res(domain_name)
                # else:
                #     logger.info("使用代理可以访问: %s", domain_name_rich)
                domain_name_rich, current_url = access.get_access_res(
                    domain_name)
                logger.info("domain_name: %s", domain_name)
                logger.info("domain_name_rich: %s", domain_name_rich)
                logger.info("current_url: %s", current_url)
                if domain_name_rich is not None:
                    # Reachable: record as normal, attach traffic rank + shot
                    logger.info("domain : %s", str(domain_name_rich))
                    monitor_website.access = '正常'
                    monitor_website.is_normal = '正常'
                    monitor_website.outline = '正常'
                    monitor_website.level = '-'
                    monitor_website.pageview = '-'
                    monitor_website.batch_num = batch_num
                    pageview = service.get_traffic(
                        domain_name=domain_name_rich)
                    monitor_website.pageview = pageview.reach_rank[0]
                    try:
                        driver.get(domain_name_rich)
                        title = driver.title
                        snapshot = SnapshotService.create_snapshot(
                            driver, batch_num, website, '网站')
                        monitor_website.snapshot = snapshot
                        # Hosting-provider placeholder titles mean "down"
                        if title == '没有找到站点' or title == '未备案提示':
                            monitor_website.access = '异常'
                            monitor_website.is_normal = '异常'
                            monitor_website.outline = title
                            monitor_website.level = '高'
                            monitor_website_dao.add(monitor_website)
                        else:
                            monitor_website_dao.add(monitor_website)
                    except Exception as e:
                        # Page load failed although the probe succeeded
                        logger.info(e)
                        monitor_website.access = '异常'
                        monitor_website.is_normal = '异常'
                        monitor_website.outline = '首页访问检测到异常'
                        monitor_website.level = '高'
                        monitor_website.pageview = '-'
                        monitor_website.snapshot = SnapshotService.simulation_404(
                            domain_name)
                        monitor_website.batch_num = batch_num
                        monitor_website_dao.add(monitor_website)
                else:
                    # Probe failed: record abnormal; screenshot whatever URL
                    # the prober last saw, else a simulated 404 image
                    monitor_website.access = '异常'
                    monitor_website.is_normal = '异常'
                    monitor_website.outline = '首页访问检测到异常'
                    monitor_website.level = '高'
                    monitor_website.pageview = '-'
                    monitor_website.batch_num = batch_num
                    if current_url is None:
                        logger.info("snapshot 404")
                        monitor_website.snapshot = SnapshotService.simulation_404(
                            domain_name)
                    else:
                        chrome_driver = WebDriver.get_chrome()
                        try:
                            chrome_driver.get(current_url)
                            snapshot = SnapshotService.create_snapshot(
                                chrome_driver, batch_num, website, '网站')
                            monitor_website.snapshot = snapshot
                        except Exception as e:
                            logger.error(e)
                            index = str(e).find("timeout")
                            if index != -1:
                                logger.info("访问超时")
                                monitor_website.outline = '访问超时'
                                monitor_website.snapshot = SnapshotService.simulation_404(
                                    current_url)
                            else:
                                monitor_website.outline = str(e)
                                monitor_website.snapshot = SnapshotService.simulation_404(
                                    current_url)
                            monitor_website_dao.add(monitor_website)
                            return None, None
                        finally:
                            chrome_driver.quit()
                    logger.info("website is not available : %s return!",
                                domain_name)
                    monitor_website_dao.add(monitor_website)
                    # First unreachable domain aborts the whole loop
                    return
            except Exception as e:
                logger.info(e)
                monitor_website.access = '异常'
                monitor_website.is_normal = '异常'
                monitor_website.outline = '巡检系统异常,建议手动重试!'
                monitor_website.level = '高'
                monitor_website_dao.add(monitor_website)
            finally:
                # NOTE(review): quits the shared phantomjs driver inside the
                # per-domain loop — later iterations reuse a dead driver; confirm.
                driver.quit()
def monitor_website(weburl, batch_num):
    """Content-monitor a single URL: open it, snapshot it, and persist
    MonitorUrl rows for openability, sensitive keywords, forbidden features
    (充值/提现/钱包), and misleading claims; on load failure record a dead link.

    weburl    -- URL descriptor (url/title plus merchant identity fields)
    batch_num -- inspection batch identifier
    """
    # Content monitoring
    keyword_dao = KeywordDao()
    keywords = keyword_dao.get_all()
    monitor_weburl_dao = MonitorWeburlDao()
    monitor_weburl = MonitorUrl()
    monitor_weburl.website_name = weburl.website_name
    monitor_weburl.domain_name = weburl.domain_name
    monitor_weburl.merchant_name = weburl.merchant_name
    monitor_weburl.merchant_num = weburl.merchant_num
    monitor_weburl.saler = weburl.saler
    monitor_weburl.url = weburl.url
    monitor_weburl.batch_num = batch_num
    monitor_weburl.title = weburl.title
    driver = WebDriver.get_phantomjs()
    try:
        logger.info("monitor_url: %s", weburl.url)
        # Ensure the URL has a scheme before handing it to the driver
        if str(weburl.url).startswith("http"):
            print()
        else:
            weburl.url = "http://" + weburl.url
        logger.info("weburl.url: %s", weburl)
        logger.info("weburl.url: %s", weburl.url)
        driver.get(weburl.url)
        snapshot = SnapshotService.snapshot_weburl(driver, batch_num, weburl,
                                                   '网站内容')
        logger.info("snapshot: %s", snapshot)
        # Openability record (adjusted below if the page body is empty)
        monitor_weburl.outline = ''
        monitor_weburl.is_normal = '正常'
        monitor_weburl.level = '-'
        monitor_weburl.snapshot = snapshot
        monitor_weburl.kinds = '是否能打开'
        logger.info("monitor_url: add %s", weburl.url)
        source = driver.page_source
        # Empty-document signature => page did not really open
        if source.__eq__('<html><head></head><body></body></html>'):
            monitor_weburl.outline = '网页打开异常'
            monitor_weburl.is_normal = '异常'
        else:
            print()
        monitor_weburl_dao.add(monitor_weburl)
        soup = BeautifulSoup(source, 'html.parser')
        # Sensitive-keyword scan: one abnormal row per keyword found.
        # NOTE(review): soup.find(keyword.name) searches by TAG name, not page
        # text — confirm this matches the intended detection semantics.
        for keyword in keywords:
            index = soup.find(keyword.name)
            if index is not None:
                logger.info(
                    "senti url alert,there is [ %s] in the url page!",
                    str(keyword.name))
                monitor_weburl.outline = '检测到敏感词:' + str(keyword.name)
                monitor_weburl.is_normal = '异常'
                monitor_weburl.level = '低'
                monitor_weburl.snapshot = snapshot
                monitor_weburl.kinds = '命中敏感词'
                monitor_weburl_dao.add(monitor_weburl)
        # Forbidden features on non-financial platforms: 充值/提现/钱包
        illegal_fun = soup.find("充值")
        if illegal_fun is not None:
            logger.info("senti url alert,there is [ %s] in the url page!",
                        str("充值"))
            monitor_weburl.outline = '检测到包含充值、提现、钱包功能'
            monitor_weburl.is_normal = '异常'
            monitor_weburl.level = '低'
            monitor_weburl.snapshot = snapshot
            monitor_weburl.kinds = '非法功能'
            monitor_weburl_dao.add(monitor_weburl)
        # Misleading marketing claims
        mislead1 = soup.find("融宝资金担保")
        mislead2 = soup.find("融宝托管")
        if mislead1 is not None or mislead2 is not None:
            monitor_weburl.outline = '检测到误导宣传'
            monitor_weburl.is_normal = '异常'
            monitor_weburl.level = '中'
            monitor_weburl.snapshot = snapshot
            monitor_weburl.kinds = '误导宣传'
            monitor_weburl_dao.add(monitor_weburl)
    except Exception as e:
        # ERROR No transaction is begun.
        logger.error(e)
        conn = DB_Session()
        try:
            # Treat any load/parse failure as a dead link and record it
            logger.info("检测到误404 : %s", weburl.url)
            monitor_weburl.outline = '检测到页面404'
            monitor_weburl.is_normal = '异常'
            monitor_weburl.level = '高'
            snapshot = SnapshotService.simulation_404(weburl.url)
            monitor_weburl.snapshot = snapshot
            monitor_weburl.kinds = '死链接'
            logger.info("monitor_url:Exception %s", weburl.url)
            monitor_weburl_dao.add(monitor_weburl)
        except Exception as e:
            logger.info(e)
            conn.rollback()
            raise
        finally:
            conn.close()
    finally:
        driver.quit()
def monitor(task_id, status):
    """Inspect each pending tracking number of a task on trackingmore.com
    (per-number page variant): rate-limit per strategy, load the tracking
    page, match the carrier link, parse the result markup, and update the
    TrackingDetail rows accordingly.

    task_id -- inspection task identifier
    status  -- detail status filter passed to the DAO
    """
    ims_api = ImsApi()
    tracking_dao = TrackingDetailDao()
    strategy_service = StrategyService()
    strategy = strategy_service.get_strategy()
    tracking_details = tracking_dao.get_by_task(task_id, status)
    if tracking_details.__len__() > 0:
        for tracking_detail in tracking_details:
            # Global kill-switch: stop cleanly when the task is cancelled
            if gl.get_value('TRACKING_STATUS'):
                pass
            else:
                logger.info("快递单任务已停止,任务id:%s", task_id)
                gl.set_value('STATUS', False)
                gl.set_value('TRACKING_STATUS', False)
                ims_api.done_tracking(task_id)
                return
            # Strategy-configured crawl-frequency throttle
            if strategy.frequency == 0 or strategy.frequency is None:
                logger.info("未设置爬取频率限制,继续执行任务..")
            else:
                logger.info("爬取频率限制为:%s 秒", strategy.frequency)
                time.sleep(strategy.frequency)
            # Extra jitter, presumably to avoid anti-bot detection
            random_seconds = random.randint(10, 15)
            logger.info("快递单检测随机等待 %s 秒...", str(random_seconds))
            time.sleep(random_seconds)
            tracking_detail.start_time = datetime.datetime.now()
            tracking_detail.status = "done"
            logger.info("准备检查单号:%s ", tracking_detail.tracking_num)
            url = "https://www.trackingmore.com/cn/" + tracking_detail.tracking_num
            logger.info("url:%s ", url)
            driver = WebDriver.get_phantomjs()
            try:
                driver.get(url)
            except Exception as e:
                # Page-load failure: flag for manual check and skip the number
                logger.error(e)
                tracking_detail.result = "true"
                tracking_detail.des = "检测超时,建议手动验证:" + url
                tracking_detail.end_time = datetime.datetime.now()
                tracking_detail.url = url
                tracking_detail.snapshot = ""
                tracking_dao.update(tracking_detail)
                logger.info("单号巡检发生异常,跳过")
                driver.quit()
                continue
            try:
                source = driver.page_source
                soup = BeautifulSoup(source, 'html.parser')
                snapshot = SnapshotService.snapshot_tracking(
                    driver, tracking_detail)
                # Carrier-selection links; match against the expected carrier
                a_tags = soup.find_all("a", attrs={'class': 'ulliselect'})
                has_tracking = False
                if a_tags.__len__() > 0:
                    for a_tag in a_tags:
                        if a_tag.get_text().strip(
                        ) == tracking_detail.tracking_name:
                            has_tracking = True
                            url = "http:" + a_tag.get("href")
                            driver.get(url)
                            snapshot = SnapshotService.snapshot_tracking(
                                driver, tracking_detail)
                            try:
                                source = driver.page_source
                                soup = BeautifulSoup(source, 'html.parser')
                                items = soup.find_all(
                                    attrs={
                                        'class': 'line-gutter-backdrop'
                                    })
                                # Marker element present => request was likely
                                # intercepted (zero occurrences when normal)
                                if items.__len__() != 0:
                                    tracking_detail.result = "false"
                                    tracking_detail.des = "爬虫请求疑似被拦截,建议手动验证!"
                                    tracking_detail.end_time = datetime.datetime.now(
                                    )
                                    tracking_detail.url = url
                                    tracking_detail.snapshot = snapshot
                                else:
                                    soup = BeautifulSoup(
                                        source, 'html.parser')
                                    # Presence of status entries => tracked OK
                                    item_length = soup.find_all(
                                        "li", attrs={
                                            'class': 's-packStatst'
                                        }).__len__()
                                    if item_length > 0:
                                        tracking_detail.result = "true"
                                        tracking_detail.des = "物流正常"
                                        tracking_detail.end_time = datetime.datetime.now(
                                        )
                                        tracking_detail.url = url
                                        tracking_detail.snapshot = snapshot
                                    else:
                                        tracking_detail.result = "false"
                                        tracking_detail.des = "没有查询到物流信息"
                                        tracking_detail.end_time = datetime.datetime.now(
                                        )
                                        tracking_detail.url = url
                                        tracking_detail.snapshot = snapshot
                            except Exception as e:
                                print(e)
                                # Parse failure: flag for manual verification
                                tracking_detail.result = "false"
                                tracking_detail.des = "检测疑似异常,建议手动验证!"
                                tracking_detail.end_time = datetime.datetime.now(
                                )
                                tracking_detail.url = url
                                tracking_detail.snapshot = snapshot
                            break
                        else:
                            continue
                    if not has_tracking:
                        # No carrier link matched the expected carrier name
                        tracking_detail.result = "false"
                        tracking_detail.des = "提供的单号-快递公司关系疑似不匹配"
                        tracking_detail.end_time = datetime.datetime.now()
                        tracking_detail.url = url
                        tracking_detail.snapshot = snapshot
                else:
                    # No carrier links: look for tracking entries directly
                    item_length = soup.find_all(
                        "dd", attrs={'class': 'post_message'})
                    if item_length.__len__() > 0:
                        tracking_detail.result = "true"
                        tracking_detail.des = "巡检正常"
                        tracking_detail.end_time = datetime.datetime.now()
                        tracking_detail.url = url
                        tracking_detail.snapshot = snapshot
                    else:
                        tracking_detail.result = "false"
                        tracking_detail.des = "没有查询物流信息"
                        tracking_detail.end_time = datetime.datetime.now()
                        tracking_detail.url = url
                        tracking_detail.snapshot = snapshot
                tracking_dao.update(tracking_detail)
            except Exception as e:
                logger.error(e)
                tracking_detail.result = "false"
                tracking_detail.des = "检测疑似异常,建议手动验证!"
                tracking_detail.end_time = datetime.datetime.now()
                tracking_detail.url = url
                tracking_detail.snapshot = ""
                tracking_dao.update(tracking_detail)
            finally:
                driver.quit()
    else:
        logger.info("单号任务没有需要检索的单号,任务id:%s,单号状态: %s", task_id, status)
        gl.set_value('STATUS', False)
        gl.set_value('TRACKING_STATUS', False)
        ims_api.done_tracking(task_id)
    # Always clear the global run flags on exit
    gl.set_value('STATUS', False)
    gl.set_value('TRACKING_STATUS', False)
def monitor_website(weburl, batch_num):
    """Content-monitor a single URL with reachability pre-check and optional
    proxy fallback: verify the URL answers (directly, then via proxy), open
    it in Chrome, snapshot, then persist MonitorUrl rows for openability,
    sensitive keywords, forbidden features, and misleading claims.

    weburl    -- URL descriptor (url/title plus merchant identity fields)
    batch_num -- inspection batch identifier
    """
    # Content monitoring
    keyword_dao = KeywordDao()
    keywords = keyword_dao.get_all()
    access = AccessibleService()
    monitor_weburl_dao = MonitorWeburlDao()
    monitor_weburl = MonitorUrl()
    monitor_weburl.website_name = weburl.website_name
    monitor_weburl.domain_name = weburl.domain_name
    monitor_weburl.merchant_name = weburl.merchant_name
    monitor_weburl.merchant_num = weburl.merchant_num
    monitor_weburl.saler = weburl.saler
    monitor_weburl.url = weburl.url
    monitor_weburl.batch_num = batch_num
    monitor_weburl.title = weburl.title
    # Dead-link check: direct access first, proxy as fallback
    reachable, current_url = access.get_access_res(weburl.url)
    use_proxy = False
    if reachable is None:
        logger.info("使用代理重试访问: %s", weburl.url)
        reachable, current_url = access.get_proxy_access_res(weburl.url)
        use_proxy = True
    else:
        logger.info("不使用代理可以访问: %s", weburl.url)
    if reachable is None:
        # Unreachable even through the proxy: record a dead link and stop
        logger.info("检测到误404 : %s", weburl.url)
        monitor_weburl.outline = '检测到误404'
        monitor_weburl.is_normal = '异常'
        monitor_weburl.level = '高'
        snapshot = SnapshotService.simulation_404(weburl.url)
        monitor_weburl.snapshot = snapshot
        monitor_weburl.kinds = '死链接'
        monitor_weburl_dao.add(monitor_weburl)
        return
    else:
        logger.info("url可以访问: %s", weburl.url)
    # Snapshot with whichever driver flavor matches the successful route
    if use_proxy:
        driver = WebDriver.get_proxy_chrome()
    else:
        driver = WebDriver.get_chrome()
    try:
        driver.get(weburl.url)
        snapshot = SnapshotService.snapshot_weburl(driver, batch_num, weburl,
                                                   '网站内容')
        print(snapshot)
        print(monitor_weburl)
        # Openability record
        monitor_weburl.outline = '网页打开正常'
        monitor_weburl.is_normal = '正常'
        monitor_weburl.level = '-'
        monitor_weburl.snapshot = snapshot
        monitor_weburl.kinds = '是否能打开'
        monitor_weburl_dao.add(monitor_weburl)
        source = driver.page_source
        soup = BeautifulSoup(source, 'html.parser')
        # Sensitive-keyword scan: one abnormal row per keyword found.
        # NOTE(review): soup.find(keyword.name) searches by TAG name, not page
        # text — confirm this matches the intended detection semantics.
        for keyword in keywords:
            index = soup.find(keyword.name)
            if index is not None:
                logger.info("senti url alert,there is : %s",
                            str(keyword.name))
                monitor_weburl.outline = '检测到敏感词:' + str(keyword.name)
                monitor_weburl.is_normal = '异常'
                monitor_weburl.level = '低'
                monitor_weburl.snapshot = snapshot
                monitor_weburl.kinds = '命中敏感词'
                monitor_weburl_dao.add(monitor_weburl)
        # Forbidden features on non-financial platforms: 充值/提现/钱包
        illegal_fun = soup.find("充值")
        if illegal_fun is not None:
            logger.info("senti url alert,there is : %s", str("充值"))
            monitor_weburl.outline = '检测到包含充值、提现、钱包功能'
            monitor_weburl.is_normal = '异常'
            monitor_weburl.level = '低'
            monitor_weburl.snapshot = snapshot
            monitor_weburl.kinds = '非法功能'
            monitor_weburl_dao.add(monitor_weburl)
        # Misleading marketing claims
        mislead1 = soup.find("融宝资金担保")
        mislead2 = soup.find("融宝托管")
        if mislead1 is not None or mislead2 is not None:
            monitor_weburl.outline = '检测到误导宣传'
            monitor_weburl.is_normal = '异常'
            monitor_weburl.level = '中'
            monitor_weburl.snapshot = snapshot
            monitor_weburl.kinds = '误导宣传'
            monitor_weburl_dao.add(monitor_weburl)
    except Exception as e:
        logger.error(e)
        return
    finally:
        driver.quit()
def monitor_website(website, batch_num):
    """Check whether each comma-separated homepage domain of `website` opens
    (HTTPSConnection probe + PhantomJS load), classify the page by title and
    well-known error markers, snapshot it, and persist MonitorWebsite rows.

    website   -- merchant/site descriptor (domain_name may hold several
                 comma-separated domains)
    batch_num -- inspection batch identifier
    """
    # NOTE(review): missing parentheses — binds the DAO class, not an
    # instance; kept as-is since the sibling variant does the same.
    monitor_website_dao = MonitorWebsiteDao
    monitor_website = MonitorWebsite()
    monitor_website.website_name = website.website_name
    monitor_website.merchant_name = website.merchant_name
    monitor_website.merchant_num = website.merchant_num
    monitor_website.domain_name = website.domain_name
    monitor_website.saler = website.saler
    monitor_website.batch_num = batch_num
    monitor_website.kinds = "首页是否可打开"
    # Pessimistic defaults; flipped to normal only when checks pass
    monitor_website.level = '高'
    monitor_website.access = '异常'
    monitor_website.is_normal = '异常'
    monitor_website.pageview = '-'
    if len(website.domain_name) == 0:
        # No domain configured: record and stop
        logger.info("website_domain is None! merchant_name: %s ",
                    website.merchant_name)
        monitor_website.outline = '商户网址为空。'
        monitor_website_dao.add(monitor_website)
        return
    else:
        logger.info("domain_name is %s! Go to inspect... ",
                    website.domain_name)
        # Homepage monitoring over every configured domain
        domain_names = str(website.domain_name)
        domain_name_list = domain_names.split(",")
        for domain_name in domain_name_list:
            logger.info("-------------------")
            domain_name_rich = domain_name
            dns = domain_name
            if str(domain_name).startswith("http"):
                # Strip the scheme, keep only the host part before any path
                temp = domain_name[domain_name.find("/") + 2:]
                logger.info("domain with out http:: %s", temp)
                if str(temp).find("/") == -1:
                    dns = temp
                else:
                    start = temp.find("/")
                    dns = temp[0:start]
            else:
                if str(domain_name).find("/") == -1:
                    pass
                else:
                    start = domain_name.find("/")
                    dns = domain_name[0:start]
                    pass
                # No scheme given: default to plain http
                domain_name_rich = "http://" + domain_name
            try:
                logger.info("dns: %s", dns)
                # Cheap reachability probe before a full page load
                conn = http.client.HTTPSConnection(dns, timeout=10)
                conn.request('GET', domain_name_rich)
                resp = conn.getresponse()
                code = resp.code
                logger.info("code: %s", code)
                if code == 200:
                    logger.info("使用webdriver进行截图: %s ... ", domain_name_rich)
                    try:
                        driver = WebDriver.get_phantomjs()
                        driver.get(domain_name_rich)
                        current_url = driver.current_url
                        title = driver.title
                        source = driver.page_source
                        snapshot = SnapshotService.create_snapshot(
                            driver, batch_num, website, '网站')
                        logger.info("title: %s", title)
                        logger.info("current_url: %s", current_url)
                        # Blank-document signature => effectively unreachable
                        if str(current_url) == "about:blank" and str(
                                source
                        ) == "<html><head></head><body></body></html>" and str(
                                title) == "":
                            logger.info("检测到about:blank : %s", current_url)
                            monitor_website.outline = "网站疑似无法访问"
                            monitor_website.snapshot = SnapshotService.simulation_404(
                                domain_name_rich)
                            monitor_website_dao.add(monitor_website)
                            driver.quit()
                            continue
                        else:
                            pass
                        # Bug fix: original used str.index(...) == -1, but
                        # index() raises ValueError on a miss and never
                        # returns -1, making the redirect branch unreachable
                        # (misses fell through to the generic except).
                        # str.find() returns -1 as the comparison expects.
                        if str(current_url).find(domain_name_rich[7:]) == -1:
                            logger.info("疑似跳转...: %s", current_url)
                            monitor_website.outline = "疑似跳转,检测到首页地址为:" + current_url
                            monitor_website.snapshot = snapshot
                            monitor_website_dao.add(monitor_website)
                            driver.quit()
                            continue
                        else:
                            pass
                        monitor_website.snapshot = snapshot
                        logger.info("check title和source...")
                        # Classify by title and well-known error markers
                        if title.__contains__('404'):
                            monitor_website.outline = "疑似异常,检测到404"
                        elif source.__contains__('ERR_NAME_NOT_RESOLVED'):
                            monitor_website.outline = "疑似异常,Title信息:" + title
                        elif source.__contains__('ERR_CONNECTION_REFUSED'):
                            monitor_website.outline = "疑似异常,检测到 ERR_CONNECTION_REFUSED"
                        elif source.__contains__('ERR_CONNECTION_TIMED_OUT'):
                            monitor_website.outline = "疑似异常,检测到 ERR_CONNECTION_TIMED_OUT"
                        # NOTE(review): duplicate ERR_NAME_NOT_RESOLVED check —
                        # unreachable (shadowed by the earlier branch); kept to
                        # preserve behavior exactly.
                        elif source.__contains__('ERR_NAME_NOT_RESOLVED'):
                            monitor_website.outline = "疑似异常,检测到 ERR_NAME_NOT_RESOLVED"
                        elif source.__contains__('ERR_NAME_RESOLUTION_FAILED'):
                            monitor_website.outline = "疑似异常,检测到 ERR_NAME_RESOLUTION_FAILED"
                        elif source.__contains__(
                                'DNS_PROBE_FINISHED_NXDOMAIN'):
                            monitor_website.outline = "疑似异常,检测到 DNS_PROBE_FINISHED_NXDOMAIN"
                        elif source.__contains__('ERR_EMPTY_RESPONSE'):
                            monitor_website.outline = "疑似异常,检测到 ERR_EMPTY_RESPONSE"
                        elif source.__contains__('主机开设成功'):
                            monitor_website.outline = "疑似异常,检测到类似网站在建信息"
                        elif source.__contains__('非法阻断'):
                            monitor_website.outline = "疑似异常,检测到非法阻断"
                        elif source.__contains__('Bad Request'):
                            monitor_website.outline = "疑似异常,检测到 Bad Request"
                        elif source.__contains__('404 page not found'):
                            monitor_website.outline = "疑似异常,检测到 404 page not found"
                        elif source.__contains__(
                                'https://wanwang.aliyun.com/domain/parking'):
                            monitor_website.outline = "疑似异常,检测到阻断拦截"
                        elif source.__contains__('没有找到站点'):
                            monitor_website.outline = "疑似异常,没有找到站点"
                        elif source.__contains__('未备案提示'):
                            monitor_website.outline = "疑似异常,未备案提示"
                        elif str(
                                source
                        ) == "<html><head></head><body></body></html>" and str(
                                title) == "":
                            monitor_website.snapshot = SnapshotService.simulation_404(
                                domain_name_rich)
                            monitor_website.outline = "疑似无法访问"
                        else:
                            # No error marker matched: mark as normal
                            monitor_website.outline = '检测正常'
                            monitor_website.access = '正常'
                            monitor_website.is_normal = '正常'
                            monitor_website.level = '-'
                        logger.info("outline: %s", monitor_website.outline)
                        monitor_website_dao.add(monitor_website)
                    except Exception as e:
                        logger.error(e)
                        monitor_website.snapshot = SnapshotService.simulation_404(
                            domain_name_rich)
                        monitor_website.outline = '访问超时,可能被目标网站屏蔽,建议手动验证!'
                        monitor_website_dao.add(monitor_website)
                    finally:
                        # NOTE(review): if get_phantomjs() itself raised,
                        # `driver` is unbound here — confirm acceptable.
                        driver.quit()
                else:
                    logger.info("确定无法访问!")
                    monitor_website.outline = '检测到网站异常'
                    monitor_website.snapshot = SnapshotService.simulation_404(
                        domain_name_rich)
                    monitor_website_dao.add(monitor_website)
            except Exception as e:
                logger.error(e)
                logger.info("urlopen 无法打开页面..")
                monitor_website.outline = 'urlopen无法打开网站。'
                monitor_website.snapshot = SnapshotService.simulation_404(
                    domain_name_rich)
                monitor_website_dao.add(monitor_website)
def inspect(batch_num, url, website): monitor_bc_dao = MonitorBcDao() monitor_bc = MonitorBc() monitor_bc.batch_num = batch_num monitor_bc.domain_name = website.domain_name monitor_bc.merchant_num = website.merchant_num monitor_bc.website_name = website.website_name monitor_bc.merchant_name = website.merchant_name monitor_bc.saler = website.saler random_seconds = random.randint(20, 30) logger.info("企查查随机等待 %s 秒...", str(random_seconds)) time.sleep(random_seconds) driver, snapshot = SnapshotService.snapshot_qichacha( batch_num, url, website) logger.info("driver: %s ,snapshot:%s", driver, snapshot) try: if driver is None: logger.info("由于企查查反扒策略无法继续!") monitor_bc.snapshot = '中' monitor_bc.is_normal = '异常' monitor_bc.kinds = '检测失败' monitor_bc.outline = '由于企查查反扒策略无法访问企业详情页。请尝试手动访问:' + url, monitor_bc.level = '高' monitor_bc_dao.add(monitor_bc) return else: logger.info("企查查完成截图 : %s", website.merchant_name) logger.info("企查查检测股东成员 : %s", website.merchant_name) # 2.股东成员 random_seconds = random.randint(10, 20) logger.info("企查查随机等待 %s 秒...", str(random_seconds)) time.sleep(random_seconds) driver.get(url) source = driver.page_source soup = BeautifulSoup(source, 'html.parser') chengyuans = soup.find_all(name='section', id=re.compile('Mainmember')) if chengyuans.__len__() > 0: chengyuan = chengyuans[0] tbodys = chengyuan.find_all('tbody') trs = tbodys[0].find_all('tr') line_num = 0 bc_person_dao = BcPersonDao() for tr in trs: line_num += 1 if line_num != 1: tds = tr.find_all('td') fullname = tds[1].find_all('a')[0].get_text() job = tds[2].get_text() bc_person = BcPerson() bc_person.batch_num = batch_num bc_person.merchant_name = website.merchant_name.strip() bc_person.fullname = fullname.strip() bc_person.job = job.strip() bc_person_dao.add(bc_person) try: # 3.法人变更 driver.find_element_by_link_text("工商信息").send_keys(Keys.RETURN) logger.info("企查查检测法人变更 : %s", website.merchant_name) legalmans = soup.find_all(class_='seo font-20') if str(website.legal_person).strip() is "": 
monitor_bc.outline = '商户未维护法人信息,不作对比' monitor_bc.snapshot = str(snapshot) monitor_bc.is_normal = '正常' monitor_bc.kinds = '法人变更' if legalmans.__len__() > 0: monitor_bc.kinds = '法人变更:' + legalmans[0].get_text() monitor_bc.level = '-' monitor_bc_dao.add(monitor_bc) else: if legalmans.__len__() > 0: if str(website.legal_person).strip( ) == legalmans[0].get_text(): monitor_bc.snapshot = str(snapshot) monitor_bc.is_normal = '正常' monitor_bc.kinds = '法人变更:' + legalmans[0].get_text( ) monitor_bc.outline = '未检测到法人变更' monitor_bc.level = '-' else: monitor_bc.snapshot = str(snapshot) monitor_bc.is_normal = '异常' monitor_bc.kinds = '法人变更:' + legalmans[0].get_text( ) monitor_bc.outline = '检测到法人变更,变更为:' + legalmans[ 0].get_text() monitor_bc.level = '低' monitor_bc_dao.add(monitor_bc) except Exception as e: logger.info(e) logger.info("html没有解析到[工商信息]..") monitor_bc.snapshot = str(snapshot) monitor_bc.is_normal = '正常' monitor_bc.kinds = '法人变更' monitor_bc.outline = '页面没有解析到工商信息。' monitor_bc.level = '-' monitor_bc_dao.add(monitor_bc) try: # 4.经营状态:注销 迁出 logger.info("准备检测经营状态:注销 迁出 : %s ...", website.merchant_name) cminfo = soup.find_all(name='section', id=re.compile('Cominfo')) tables = cminfo[0].find_all(name='table', class_='ntable') trs = tables[0].find_all(name='tr') tds = trs[2].find_all(name='td') manage_state = tds[1].get_text().strip() # 5.经营状态-注销 logger.info("企查查检测经营状态-注销 : %s", website.merchant_name) if str(manage_state).find("注销") >= 0: monitor_bc.snapshot = str(snapshot) monitor_bc.is_normal = '异常' monitor_bc.kinds = '经营状态' monitor_bc.outline = '检测到经营状态异常:注销', monitor_bc.level = '高' else: monitor_bc.snapshot = str(snapshot) monitor_bc.is_normal = '正常' monitor_bc.kinds = '经营状态' monitor_bc.outline = '经营状态正常,无注销', monitor_bc.level = '-' monitor_bc_dao.add(monitor_bc) # 6.经营状态-迁出 logger.info("企查查检测经营状态-迁出 : %s", website.merchant_name) if str(manage_state).find("迁出") >= 0: monitor_bc.snapshot = str(snapshot) monitor_bc.is_normal = '异常' monitor_bc.kinds = '经营状态' monitor_bc.outline 
= '检测到经营状态异常:迁出', monitor_bc.level = '高' else: monitor_bc.snapshot = str(snapshot) monitor_bc.is_normal = '正常' monitor_bc.kinds = '经营状态' monitor_bc.outline = '经营状态正常,无 迁出', monitor_bc.level = '-' monitor_bc_dao.add(monitor_bc) except Exception as e: logger.info(e) logger.info("html没有解析到[经营状态]..") monitor_bc.snapshot = str(snapshot) monitor_bc.is_normal = '正常' monitor_bc.kinds = '经营状态' monitor_bc.outline = '页面没有解析到经营状态信息。' monitor_bc.level = '-' monitor_bc_dao.add(monitor_bc) try: # 7.严重违法 logger.info("企查查检测严重违法 : %s", website.merchant_name) driver.find_element_by_partial_link_text("经营风险").send_keys( Keys.RETURN) source = driver.page_source soup = BeautifulSoup(source, 'html.parser') companys = soup.find_all(name='div', class_='company-nav-tab') risks = companys[3].find_all(name='span') manage_abn = risks[2].get_text() serious_illegal = risks[4].get_text() # 经营状态异常数: 严重违法 0 # 严重风险数: 股权出质 0 logger.info("经营状态异常数: %s", str(manage_abn)) logger.info("严重风险数: %s", str(serious_illegal)) if (len(str(manage_abn).split()) == 1 and int(manage_abn) > 0 ) or (len(str(manage_abn).split()) == 2 and int(str(manage_abn).split()[1]) > 0): monitor_bc.snapshot = str(snapshot) monitor_bc.is_normal = '异常' monitor_bc.kinds = '经营状态' monitor_bc.outline = '检测到经营异常风险', monitor_bc.level = '高' else: monitor_bc.snapshot = str(snapshot) monitor_bc.is_normal = '正常' monitor_bc.kinds = '经营状态' monitor_bc.outline = '未检测到经营异常风险', monitor_bc.level = '-' monitor_bc_dao.add(monitor_bc) # 8.严重违法 if (len(str(serious_illegal).split()) == 1 and int(serious_illegal) > 0) or ( len(str(serious_illegal).split()) == 2 and int(str(serious_illegal).split()[1]) > 0): monitor_bc.snapshot = str(snapshot) monitor_bc.is_normal = '异常' monitor_bc.kinds = '严重违法' monitor_bc.outline = '检测到严重违法风险', monitor_bc.level = '高' else: monitor_bc.snapshot = str(snapshot) monitor_bc.is_normal = '正常' monitor_bc.kinds = '严重违法' monitor_bc.outline = '未检测到严重违法风险', monitor_bc.level = '-' monitor_bc_dao.add(monitor_bc) except Exception as e: 
logger.info(e) logger.info("html没有解析到[经营风险]..") monitor_bc.snapshot = str(snapshot) monitor_bc.is_normal = '正常' monitor_bc.kinds = '严重违法' monitor_bc.outline = '页面没有解析到经营风险信息。' monitor_bc.level = '-' monitor_bc_dao.add(monitor_bc) # 1.受益人 logger.info("企查查检测受益人 : %s", website.merchant_name) rest_url = url + "#base" driver.get(rest_url) random_seconds = random.randint(20, 30) logger.info("企查查随机等待 %s 秒...", str(random_seconds)) time.sleep(random_seconds) source = driver.page_source soup = BeautifulSoup(source, 'html.parser') shou_yi_rens = soup.find_all(name='section', id=re.compile('partnerslist')) if shou_yi_rens.__len__() > 0: shouyiren = shou_yi_rens[0] tables = shouyiren.find_all('table') trs = tables[0].find_all('tr') num = 0 bc_benefit_dao = BcBenefitDao() for tr in trs: num += 1 if num != 1: tds = tr.find_all('td') if tds.__len__() >= 3: fullname = tds[1].find_all( name='a')[0].get_text() shouyirens = tds[1].find_all( name='span', class_=re.compile( 'ntag sm text-primary click')) if shouyirens.__len__() >= 1: is_shouyiren = shouyirens[0].get_text( ).find('受益人') > 0 if is_shouyiren: proportion = tds[2].get_text().strip() invest_train = '-' bc_benefit = BcBenefit() bc_benefit.batch_num = batch_num bc_benefit.merchant_name = website.merchant_name.strip( ) bc_benefit.fullname = fullname.strip() bc_benefit.proportion = proportion.strip( ) bc_benefit.invest_train = invest_train.strip( ) bc_benefit_dao.add(bc_benefit) except Exception as e: logger.info(e) finally: if driver is not None: driver.quit() else: pass