def monitor(keyword, batch_num, website):
    driver = WebDriver.get_chrome()
    senti_util = SentiUtil()
    url = "http://www.paycircle.cn/company/search.php?kw=" + urllib.parse.quote(
        keyword) + "&c=SearchList&"
    if driver is None:
        senti_util.log_error("支付圈", url, batch_num, website)
        return
    try:
        driver.get(url)
        source = driver.page_source
        senti_util.snapshot_home("支付圈", url, batch_num, website, driver)
        soup = BeautifulSoup(source, 'html.parser')
        div_list = soup.find_all(attrs={'class': 'list'})
        if len(div_list) > 0:
            news = div_list[0].find_all('tr')
            for new in news:
                href = new.find_all('td')[2].find_all('a')[0].get("href")
                content = new.find_all('td')[2].find_all('li')[1].get_text()
                if content.find(keyword) != -1:
                    senti_util.senti_process_text("支付圈", content, href,
                                                  batch_num, website)
        else:
            logger.info("支付圈没有搜索到数据: %s", keyword)
    except Exception as e:
        logger.error(e)
        return
    finally:
        driver.quit()
def get_traffic(domain_name):
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36'
    }
    try:
        req = urllib.request.Request(
            'http://data.alexa.com/data?cli=10&dat=snbamz&url=%s' % domain_name,
            headers=header)
        res = urllib.request.urlopen(req, timeout=10).read()
        res = res.decode('UTF-8')
        # 访客排名
        reach_rank = re.findall(r"REACH[^\d]*(\d+)", res)
        if not reach_rank:
            reach_rank = "-"
        # 全球排名
        popularity_rank = re.findall(r"POPULARITY[^\d]*(\d+)", res)
        if not popularity_rank:
            popularity_rank = "-"
        return Traffic(reach_rank, popularity_rank)
    except Exception as e:
        logger.info(e)
        return Traffic([0, 0], 0)
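# Usage sketch (hypothetical, not part of the original project): how a caller
# might consume get_traffic(). The attribute names reach_rank/popularity_rank
# on Traffic are assumptions based on the constructor arguments above.
def log_alexa_rank_demo(domain_name="example.com"):
    traffic = get_traffic(domain_name)
    # Values may be "-", a list of matched digits, or the fallback from the
    # except branch, so they are only logged here, not parsed further.
    logger.info("reach_rank=%s popularity_rank=%s",
                traffic.reach_rank, traffic.popularity_rank)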
def monitor(website_name, merchant_name, batch_num):
    try:
        driver = WebDriver.get_chrome()
        senti_util = SentiUtil()
        url = "https://www.p2peye.com/search.php?mod=zonghe&srchtxt=" + urllib.parse.quote(
            website_name)
        driver.get(url)
        source = driver.page_source
        senti_util.snapshot_home("网贷天眼", merchant_name, url, batch_num, driver)
        soup = BeautifulSoup(source, 'html.parser')
        news = soup.find_all(attrs={'class': 'result-t'})
        if len(news) > 0:
            for new in news:
                href = new.find_all('a')[0].get("href")
                content = new.get_text()
                if content.find(website_name) != -1:
                    senti_util.senti_process_text("网贷天眼", merchant_name,
                                                  content, "http://" + href[2:],
                                                  batch_num)
        else:
            logger.info("网贷天眼没有搜索到数据: %s", merchant_name)
    except Exception as e:
        logger.error(e)
        return
    finally:
        driver.quit()
class TestWangdaitianyan(object):
    if __name__ == "__main__":
        """
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument('--headless')
        driver = webdriver.Chrome(chrome_options=chrome_options,
                                  executable_path="C:/chromedriver_2.38/chromedriver.exe")
        driver.get("http://www.wangdaibus.com/search.php?mod=forum")
        driver.find_element_by_id("scform_srchtxt").send_keys(u"京东")
        driver.find_element_by_id("scform_submit").click()
        """
        driver = WebDriver.get_chrome()
        try:
            driver.get(
                "http://www.wangdaibus.com/search/list?subject=%E4%BA%AC%E4%B8%9C"
            )
            aaa = "京东"
            # driver.find_element_by_xpath('//input[@name="subject"]').send_keys(aaa)
            # driver.find_element_by_xpath('//input[@name="subject"]').send_keys(Keys.ENTER)
            time.sleep(10)
        except Exception as e:
            # 异常处理
            logger.error(e)
        SnapshotService.create_snapshot(driver)
        source = driver.page_source
        soup = BeautifulSoup(source, 'html.parser')
        news = soup.find_all("h3", attrs={'class': 'xs3'})
        if len(news) > 0:
            for new in news:
                href = new.find_all('a')[0].get("href")
                logger.info("http://www.wangdaibus.com/" + href)
                logger.info(new.get_text())
def add(weburl):
    logger.info("add weburl to db: %s", weburl.url)
    engine = create_engine(
        'mysql://%s:%s@%s/%s?charset=utf8&autocommit=true' %
        (username, password, host, database),
        encoding='utf-8',
        echo=False,
        pool_size=100,
        pool_recycle=10)
    Session = sessionmaker(bind=engine)
    session = Session()
    try:
        exist_weburl = session.query(Weburl).filter(
            Weburl.url == weburl.url).filter(
                Weburl.website_id == weburl.website_id).all()
        if not exist_weburl:
            weburl.create_time = datetime.datetime.now()
            weburl.last_update = datetime.datetime.now()
            session.add(weburl)
            session.commit()
    except Exception as e:
        logger.error(e)
        session.rollback()
        raise
    finally:
        session.close()
def monitor(keyword, batch_num, website): driver = WebDriver.get_chrome() senti_util = SentiUtil() url = "https://tousu.sina.com.cn/index/search/?keywords=" + urllib.parse.quote(keyword) + "&t=0" if driver is None: senti_util.log_error("黑猫投诉", url, batch_num, website) return else: pass try: driver.get(url) source = driver.page_source senti_util.snapshot_home("黑猫投诉", url, batch_num, website, driver) soup = BeautifulSoup(source, 'html.parser') items = soup.find_all(attrs={'class': 'blackcat-con'}) if items.__len__() > 0: for item in items: href = item.find_all('a')[0].get("href") content = item.find_all('h1')[0].get_text() if content.find(keyword) != -1: senti_util.senti_process_text("黑猫投诉", content, href, batch_num, website) else: logger.info("黑猫投诉没有搜索到数据: %s", keyword) except Exception as e: logger.error(e) return finally: driver.quit()
def get_access_res(url):
    driver = WebDriver.get_chrome_for_access()
    try:
        if str(url).startswith("http"):
            http_url = str(url)
        else:
            http_url = "http://" + str(url)
        logger.info("http_url: %s", http_url)
        driver.get(http_url)
        title = driver.title
        source = driver.page_source
        # 命中以下任一特征即视为无法正常访问
        error_flags = (
            'ERR_NAME_NOT_RESOLVED',
            'ERR_CONNECTION_REFUSED',
            'ERR_CONNECTION_TIMED_OUT',
            'ERR_NAME_RESOLUTION_FAILED',
            'DNS_PROBE_FINISHED_NXDOMAIN',
            'ERR_EMPTY_RESPONSE',
            '主机开设成功',
            '非法阻断',
            'Bad Request',
            '404 page not found',
            'https://wanwang.aliyun.com/domain/parking',
        )
        if '404' in title or any(flag in source for flag in error_flags):
            return None, http_url
        else:
            return http_url, driver.current_url
    except Exception as e:
        logger.error(e)
        return None, None
    finally:
        driver.quit()
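# Usage sketch (hypothetical): get_access_res() returns a pair; the first
# element is None when the page matches one of the error markers above. The
# sample domain below is made up.
def check_accessibility_demo():
    accessible_url, final_url = get_access_res("example-merchant.cn")
    if accessible_url is None:
        logger.info("站点不可访问: %s", final_url)
    else:
        logger.info("站点可访问,落地地址: %s", final_url)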
def monitor(keyword, batch_num, website): driver = WebDriver.get_chrome() senti_util = SentiUtil() url = "http://paynews.net/search.php?mod=forum" if driver is None: senti_util.log_error("支付产业网", url, batch_num, website) return else: pass try: driver.get(url) search_text_blank = driver.find_element_by_id("scform_srchtxt") search_text_blank.send_keys(keyword) search_text_blank.send_keys(Keys.RETURN) senti_util.snapshot_home("支付产业网", url, batch_num, website, driver) source = driver.page_source soup = BeautifulSoup(source, 'html.parser') div_list = soup.find(attrs={'class': 'slst mtw'}) if div_list is not None and div_list.__len__() > 0: news = div_list.find_all('li') for new in news: href = new.find_all('a')[0].get("href") content = new.find_all('a')[0].get_text() if content.find(keyword) != -1: senti_util.senti_process_text( "支付产业网", content, "http://paynews.net/" + href, batch_num, website) else: logger.info("支付产业网没有搜索到数据: %s", keyword) except Exception as e: logger.error(e) return finally: driver.quit()
def monitor(keyword, batch_num, website): driver = WebDriver.get_chrome() senti_util = SentiUtil() url = "http://ts.21cn.com/home/search?keyword=" + urllib.parse.quote( keyword) if driver is None: senti_util.log_error("聚投诉", url, batch_num, website) return else: pass try: driver.get(url) driver.implicitly_wait(3) source = driver.page_source senti_util.snapshot_home("聚投诉", url, batch_num, website, driver) soup = BeautifulSoup(source, 'html.parser') items = soup.find_all(attrs={'class': 'complain-item'}) if items.__len__() > 0: for item in items: href = item.find_all('a')[1].get("href") content = item.find_all('a')[1].get_text() if content.find(keyword) != -1: senti_util.senti_process_text( "聚投诉", content, "http://www.paycircle.cn" + href[1:], batch_num, website) else: logger.info("聚投诉没有搜索到数据: %s", keyword) except Exception as e: logger.error(e) return finally: driver.quit()
def monitor(keyword, website_name, batch_num, merchant_name, merchant_num):
    """
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--headless')
    driver = webdriver.Chrome(chrome_options=chrome_options,
                              executable_path=chromedriver_path)
    """
    driver = WebDriver.get_chrome()
    try:
        senti_util = SentiUtil()
        url = "http://tieba.baidu.com/f?fr=wwwt&kw=" + urllib.parse.quote(
            keyword)
        driver.get(url)
        senti_util.snapshot_home("百度贴吧", website_name, url, batch_num,
                                 merchant_name, merchant_num, driver)
        source = driver.page_source
        soup = BeautifulSoup(source, 'html.parser')
        news = soup.find_all(
            "div", attrs={'class': 'threadlist_title pull_left j_th_tit '})
        if len(news) > 0:
            for new in news:
                href = new.find_all('a')[0].get("href")
                content = new.find_all('a')[0].get_text()
                if content.find(keyword) != -1:
                    senti_util.senti_process_text(
                        "百度贴吧", website_name, content,
                        "http://tieba.baidu.com" + href, batch_num,
                        merchant_name, merchant_num)
        else:
            logger.info("百度贴吧没有搜索到数据: %s", keyword)
    except Exception as e:
        logger.error(e)
        return
    finally:
        driver.quit()
def get_pending_task(batch_num):
    agent_name = os.environ['agent_name']
    job = os.environ['job']
    if job == "bc":
        task_pools = session.query(TaskItem).filter(
            TaskItem.batch_num == batch_num).filter(
                TaskItem.status == 'pending', TaskItem.type == 'bc')
    else:
        task_pools = session.query(TaskItem).filter(
            TaskItem.batch_num == batch_num).filter(
                TaskItem.status == 'pending', TaskItem.type != 'bc')
    if task_pools.count() == 0:
        # 没有pending状态的任务
        logger.info("本Agent没有待巡检任务,Agent切换为waiting状态: %s", agent_name)
        gl.set_value('STATUS', False)
        return None, None
    else:
        logger.info("%s 准备执行可以处理的任务,倒数第:%s 个...", agent_name,
                    str(task_pools.count()))
        task_pool = task_pools.first()
        session.query(TaskItem).filter(TaskItem.id == task_pool.id).update(
            {"status": "processing"})
        if task_pool.type == "weburl":
            logger.info("task_pool.website_id:%s", task_pool.website_id)
            weburl = session.query(Weburl).filter(
                Weburl.url == task_pool.url).filter(
                    Weburl.website_id == task_pool.website_id).all()
            if len(weburl):
                return weburl[0], task_pool
            else:
                logger.info("task_pool.website_id:%s", task_pool.website_id)
                logger.info("task_pool.id:%s", task_pool.id)
                session.query(TaskItem).filter(
                    TaskItem.id == task_pool.id).update({"status": "done"})
                return None, None
        else:
            website = session.query(Website).filter(
                Website.id == task_pool.website_id).one()
            return website, task_pool
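# Polling sketch (hypothetical, not original code): drain pending tasks of a
# batch until STATUS is cleared or no task is returned. The dispatch step is a
# placeholder; the real project hands the task to a concrete spider.
def drain_batch_demo(batch_num):
    while gl.get_value('STATUS'):
        target, task_pool = get_pending_task(batch_num)
        if task_pool is None:
            # Either nothing is pending or the weburl lookup failed.
            break
        logger.info("执行任务 id=%s type=%s", task_pool.id, task_pool.type)
        # dispatch(target, task_pool)  # placeholder for the real handler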
def monitor(website_name, merchant_name, batch_num):
    """
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--headless')
    driver = webdriver.Chrome(chrome_options=chrome_options,
                              executable_path=chromedriver_path)
    """
    try:
        driver = WebDriver.get_chrome()
        senti_util = SentiUtil()
        url = "http://www.chinaft.com.cn/news/search/_1.shtml?key=" + urllib.parse.quote(
            website_name)
        driver.get(url)
        source = driver.page_source
        senti_util.snapshot_home("交易中国", merchant_name, url, batch_num, driver)
        soup = BeautifulSoup(source, 'html.parser')
        news = soup.find_all("div",
                             attrs={'class': 'xixi_ChinaFT_left_news_box'})
        if len(news) > 0:
            for new in news:
                if not gl.check_by_batch_num(batch_num):
                    break
                href = new.find_all('a')[1].get("href")
                content = new.find_all('a')[1].get_text()
                if content.find(website_name) != -1:
                    senti_util.senti_process_text(
                        "交易中国", merchant_name, content,
                        "http://www.chinaft.com.cn" + href, batch_num)
        else:
            logger.info("交易中国没有搜索到数据: %s", merchant_name)
    except Exception as e:
        logger.error(e)
    finally:
        driver.quit()
def monitor(keyword, batch_num, website):
    driver = WebDriver.get_chrome()
    senti_util = SentiUtil()
    url = 'https://baike.baidu.com/item/%s' % urllib.parse.quote(keyword)
    if driver is None:
        senti_util.log_error("百度百科", url, batch_num, website)
        return
    try:
        driver.get(url)
        source = driver.page_source
        soup = BeautifulSoup(source, 'html.parser')
        check_exist = soup.find_all(name='p',
                                    attrs={'class': re.compile('sorryCont')})
        if len(check_exist) == 0:
            description = soup.find(attrs={"name": "description"})['content']
            senti_util.senti_process_text("百度百科", description, url, batch_num,
                                          website)
        else:
            senti_util.snapshot_home("百度百科", url, batch_num, website, driver)
            logger.info("百度百科没有搜索到数据: %s", keyword)
    except Exception as e:
        logger.error(e)
        return
    finally:
        driver.quit()
def stop():
    job = os.environ['job']
    if job == "gather":
        logger.info("My Job is gather, ignore the order!")
    else:
        gl.set_value('STATUS', False)
        ims_api.heartbeat()
    return 'SUCCESS'
def stop_tracking():
    job = os.environ['job']
    if job == "tracking":
        gl.set_value('STATUS', False)
        gl.set_value('TRACKING_STATUS', False)
        ims_api.heartbeat()
    else:
        logger.info("Tracking is not my job, ignore the order!")
    return 'SUCCESS'
def download(jpg_link):
    timestamp = int(time.time())
    path = base_filepath + "/" + str(timestamp) + ".png"
    try:
        request.urlretrieve(jpg_link, path)
    except Exception as e:
        logger.info(e)
        return None
    return path
def gather_urls_by_website(self, website_id):
    ims_api = ImsApi()
    website_dao = WebsiteDao()
    if website_id is not None:
        website = website_dao.get_by_id(website_id)
        logger.info("gather url for domain_name: %s ", website.domain_name)
        logger.info("gather url for websiteId: %s ", website.id)
        if website.domain_name is None or len(website.domain_name) == 0:
            logger.info(
                "gather url for %s,but website.domain_name is None,ignored! ",
                website.merchant_name)
        else:
            if str(website.domain_name).startswith('http'):
                uri = website.domain_name
            else:
                uri = 'http://' + website.domain_name
            self.gather_urls(website.id, uri, website.website_name,
                             website.domain_name, website.merchant_name,
                             website.merchant_num, website.saler, 0)
            ims_api.done_url_gather(website)
    else:
        websites = website_dao.get_overtime()
        logger.info("需要采集url的商户网站共 %s 个 ", len(websites))
        for website in websites:
            if website.domain_name is None or len(website.domain_name) == 0:
                logger.info(
                    "gather url for %s,but website.domain_name is None,ignored! ",
                    website.merchant_name)
            else:
                if str(website.domain_name).startswith('http'):
                    uri = website.domain_name
                else:
                    uri = 'http://' + website.domain_name
                self.gather_urls(website.id, uri, website.website_name,
                                 website.domain_name, website.merchant_name,
                                 website.merchant_num, website.saler, 0)
                ims_api.done_url_gather(website)
def snapshot_tracking(driver, tracking_detail):
    timestamp = int(time.time())
    path = base_filepath + "/" + tracking_detail.tracking_num + "_" + str(
        timestamp)
    snapshot = tracking_detail.tracking_num + "_" + str(timestamp) + ".png"
    try:
        driver.save_screenshot(path + ".png")
        im = Image.open(path + ".png")
        im_resize = im.resize((50, 50), Image.ANTIALIAS)
        im_resize.save(path + "_thumb.bmp")
        return snapshot
    except Exception as e:
        logger.info(e)
        return None
def heartbeat():
    try:
        agent_name = os.environ['agent_name']
        hostname = socket.gethostname()
        ip = socket.gethostbyname(hostname)
        url = ims_rest_base + "open/api/v1/agent/heartbeat"
        status = gl.get_value('STATUS')
        data_json = {"ip": ip, "status": status, "job": agent_name}
        data = bytes(parse.urlencode(data_json), encoding="utf8")
        new_url = request.Request(url, data)
        request.urlopen(new_url)
    except Exception as e:
        logger.info(e)
        logger.info("heartbeat fail")
def snapshot_weburl(driver, batch_num, weburl, senti_type):
    timestamp = int(time.time())
    snapshot = batch_num + "_" + weburl.merchant_name + "_" + weburl.merchant_num \
        + "_" + senti_type + "_" + str(timestamp) + ".png"
    path = base_filepath + "/" + batch_num + "_" + weburl.merchant_name + "_" \
        + weburl.merchant_num + "_" + senti_type + "_" + str(timestamp)
    try:
        driver.save_screenshot(path + ".png")
        im = Image.open(path + ".png")
        im_resize = im.resize((50, 50), Image.ANTIALIAS)
        im_resize.save(path + "_thumb.bmp")
    except Exception as e:
        logger.info(e)
    return snapshot
class TestMysql(object): if __name__ == "__main__": url = "http://ts.21cn.com/home/search?keyword=%E4%BA%AC%E4%B8%9C" driver = webdriver.Remote( command_executor='http://172.17.161.230:8912/wd/hub', desired_capabilities=DesiredCapabilities.CHROME) driver.set_page_load_timeout(10) driver.set_script_timeout(10) driver.maximize_window() try: driver.get(url) time.sleep(5) source = driver.page_source soup = BeautifulSoup(source, 'html.parser') items = soup.find_all(attrs={'class': 'blackcat-con'}) if items.__len__() > 0: for item in items: href = item.find_all('a')[0].get("href") content = item.find_all('h1')[0].get_text() else: logger.info("黑猫投诉没有搜索到数据:") driver.quit() except Exception as e: logger.error(e) driver.quit()
class TestMysql(object): if __name__ == "__main__": url = "http://www.paycircle.cn/company/search.php?kw=" + urllib.parse.quote( '京东') + "&c=SearchList&" driver = webdriver.Remote( command_executor='http://172.17.161.230:8911/wd/hub', desired_capabilities=DesiredCapabilities.CHROME) driver.set_page_load_timeout(10) driver.set_script_timeout(10) driver.maximize_window() try: driver.get(url) source = driver.page_source soup = BeautifulSoup(source, 'html.parser') div_list = soup.find_all(attrs={'class': 'list'}) if div_list.__len__() > 0: news = div_list[0].find_all('tr') for new in news: href = new.find_all('td')[2].find_all('a')[0].get("href") content = new.find_all('td')[2].find_all('li')[1].get_text() else: logger.info("支付圈没有搜索到数据") driver.quit() except Exception as e: logger.error(e)
class TestMysql(object): if __name__ == "__main__": url = "http://paynews.net/search.php?mod=forum" driver = webdriver.Remote( command_executor='http://172.17.161.230:8911/wd/hub', desired_capabilities=DesiredCapabilities.CHROME) driver.set_page_load_timeout(10) driver.set_script_timeout(10) driver.maximize_window() try: driver.get(url) search_text_blank = driver.find_element_by_id("scform_srchtxt") search_text_blank.send_keys('京东') search_text_blank.send_keys(Keys.RETURN) source = driver.page_source soup = BeautifulSoup(source, 'html.parser') driver.save_screenshot("D:/a.png") div_list = soup.find(attrs={'class': 'slst mtw'}) if div_list.__len__() > 0: news = div_list.find_all('li') for new in news: href = new.find_all('a')[0].get("href") content = new.find_all('a')[0].get_text() print(content) else: logger.info("支付产业网没有搜索到数据: %s") driver.quit() except Exception as e: logger.error(e) driver.quit()
class TestMysql(object): if __name__ == "__main__": url = "http://www.zfzj.cn/search.php" driver = webdriver.Remote( command_executor='http://172.17.161.230:8911/wd/hub', desired_capabilities=DesiredCapabilities.CHROME) driver.set_page_load_timeout(10) driver.set_script_timeout(10) driver.maximize_window() try: driver.get(url) search_text_blank = driver.find_element_by_id("scform_srchtxt") search_text_blank.send_keys('京东') search_text_blank.send_keys(Keys.RETURN) time.sleep(5) source = driver.page_source soup = BeautifulSoup(source, 'html.parser') items = soup.find_all(attrs={'class': 'blackcat-con'}) if items.__len__() > 0: for item in items: href = item.find_all('a')[0].get("href") content = item.find_all('h1')[0].get_text() else: logger.info("黑猫投诉没有搜索到数据:") driver.quit() except Exception as e: logger.error(e) driver.quit()
def simulation_404(url):
    timestamp = str(time.time())
    snapshot = timestamp + ".png"
    path = ims_rest_base + "/views/system/404.jsp?url=" + str(url)
    img_404 = base_filepath + "/" + timestamp
    driver = None
    try:
        driver = WebDriver.get_chrome()
        driver.get(path)
        driver.save_screenshot(img_404 + ".png")
        im = Image.open(img_404 + ".png")
        im_resize = im.resize((50, 50), Image.ANTIALIAS)
        im_resize.save(img_404 + "_thumb.bmp")
    except Exception as e:
        logger.info(e)
        return snapshot
    finally:
        if driver is not None:
            driver.quit()
    return snapshot
def tran2text(url):
    # TODO: OCR 识别暂未启用,先直接返回
    logger.error("TODO %s" % url)
    return None
    try:
        if str(url).endswith(".jpg") or str(url).endswith(".png") or str(
                url).endswith(".bmp") or str(url).endswith(".jpeg"):
            logger.info("src path: %s" % url)
            text = pytesseract.image_to_string(Image.open(url), lang='chi_sim')
            logger.info("text: %s", str(text))
            if text == "":
                return None
            return text
        else:
            logger.error("url is not pic! url:%s" % url)
            return None
    except Exception as e:
        logger.error(e)
        return None
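# Pipeline sketch (hypothetical): fetch a remote image with download() and run
# it through tran2text(). Note tran2text() currently returns None immediately
# because of the TODO early return above, so this is illustrative only.
def ocr_remote_image_demo(jpg_link):
    local_path = download(jpg_link)
    if local_path is None:
        return None
    return tran2text(local_path)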
def get_proxy_chrome():
    chrome_options = Options()
    strategy_service = StrategyService()
    strategy = strategy_service.get_strategy()
    if strategy.proxy_server is None or strategy.proxy_server == '':
        logger.info("proxy_server is none!")
        return None
    else:
        proxy_servers = strategy.proxy_server.split(",")
        chrome_options.add_argument("--proxy-server=" +
                                    random.choice(proxy_servers))
        # 禁止图片和css加载
        prefs = {
            "profile.managed_default_content_settings.images": 2,
            'permissions.default.stylesheet': 2
        }
        chrome_options.add_experimental_option("prefs", prefs)
        driver = webdriver.Remote(
            command_executor='http://localhost:4444/wd/hub',
            desired_capabilities=DesiredCapabilities.CHROME,
            options=chrome_options)
        driver.set_page_load_timeout(30)
        driver.set_script_timeout(10)
        driver.maximize_window()
        return driver
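# Usage sketch (hypothetical): fetch a page through the proxy-enabled remote
# Chrome from get_proxy_chrome(), which may be None when no proxy_server is
# configured in the strategy.
def fetch_via_proxy_demo(url):
    driver = get_proxy_chrome()
    if driver is None:
        return None
    try:
        driver.get(url)
        return driver.page_source
    finally:
        driver.quit()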
def inspect(batch_num):
    spider_manager = GatherCenter()
    while gl.get_value('STATUS'):
        logger.info("inspect task start! batch_num:%s" % str(batch_num))
        spider_manager.gather(batch_num)
        logger.info("inspect task end! batch_num:%s" % str(batch_num))
    logger.info("batchNum inspect task end! batch_num:%s" % str(batch_num))
def monitor(website_name, merchant_name, batch_num):
    """
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--headless')
    driver = webdriver.Chrome(chrome_options=chrome_options,
                              executable_path=chromedriver_path)
    """
    try:
        driver = WebDriver.get_chrome()
        senti_util = SentiUtil()
        url = "https://www.wdzj.com/front/search/index?key=" + urllib.parse.quote(
            website_name)
        driver.get(url)
        source = driver.page_source
        senti_util.snapshot_home("网贷之家", merchant_name, url, batch_num, driver)
        soup = BeautifulSoup(source, 'html.parser')
        tzbox = soup.find_all("ul", attrs={'class': 'so-tzbox'})
        if len(tzbox) == 0:
            return
        news = tzbox[0].find_all("li")
        if len(news) > 0:
            for new in news:
                if not gl.check_by_batch_num(batch_num):
                    break
                href = new.find_all('a')[0].get("href")
                content = new.get_text()
                if content.find(website_name) != -1:
                    senti_util.senti_process_text("网贷之家", merchant_name,
                                                  content,
                                                  "http://" + href[2:],
                                                  batch_num)
        else:
            logger.info("网贷之家没有搜索到数据: %s", merchant_name)
    except Exception as e:
        logger.error(e)
        return
    finally:
        driver.quit()
def tracking_execute():
    job = os.environ['job']
    if job == "tracking":
        # 重启selenium
        stop_selenium()
        stop_chrome()
        gl.set_value('STATUS', True)
        gl.set_value('TRACKING_STATUS', True)
        ims_api.heartbeat()
        try:
            task_id = request.form['taskId']
            status = request.form['status']
            logger.info("tracking begin task_id: %s,status: %s" %
                        (str(task_id), str(status)))
            t = threading.Thread(target=inspect_tracking,
                                 args=(task_id, status))
            t.setDaemon(True)
            t.start()
            return 'OK'
        except Exception as e:
            logger.error(e)
    else:
        logger.info("Tracking is not my job!")
    return 'OK'