Code Example #1
    def monitor(keyword, batch_num, website):
        driver = WebDriver.get_chrome()
        senti_util = SentiUtil()
        url = "http://www.paycircle.cn/company/search.php?kw=" + urllib.parse.quote(
            keyword) + "&c=SearchList&"
        if driver is None:
            senti_util.log_error("支付圈", url, batch_num, website)
            return
        try:
            driver.get(url)
            source = driver.page_source
            senti_util.snapshot_home("支付圈", url, batch_num, website, driver)
            soup = BeautifulSoup(source, 'html.parser')
            div_list = soup.find_all(attrs={'class': 'list'})
            if len(div_list) > 0:
                news = div_list[0].find_all('tr')
                for new in news:
                    href = new.find_all('td')[2].find_all('a')[0].get("href")
                    content = new.find_all('td')[2].find_all(
                        'li')[1].get_text()
                    if content.find(keyword) != -1:
                        senti_util.senti_process_text("支付圈", content, href,
                                                      batch_num, website)
            else:
                logger.info("支付圈没有搜索到数据: %s", keyword)

        except Exception as e:
            logger.error(e)
            return
        finally:
            driver.quit()
Code Example #2
File: traffic_service.py  Project: mengshanxi/spider
    def get_traffic(domain_name):
        header = {
            'User-Agent':
            ' Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36'
        }
        try:
            req = urllib.request.Request(
                'http://data.alexa.com/data?cli=10&dat=snbamz&url=%s' %
                domain_name,
                headers=header)
            res = urllib.request.urlopen(req, timeout=10).read()
            res = res.decode('UTF-8')
            # visitor reach rank
            reach_rank = re.findall(r"REACH[^\d]*(\d+)", res)
            if not reach_rank:
                reach_rank = "-"
            # global popularity rank
            popularity_rank = re.findall(r"POPULARITY[^\d]*(\d+)", res)
            if not popularity_rank:
                popularity_rank = "-"
            traffic = Traffic(reach_rank, popularity_rank)
            # print(res)
            return traffic
        except Exception as e:
            logger.info(e)
            traffic = Traffic([0, 0], 0)
            # print(res)
            return traffic
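Usage note: a minimal sketch of calling get_traffic above, assuming the Traffic value object exposes the two ranks as reach_rank and popularity_rank attributes (the attribute names are an assumption, not confirmed by the example):

    # hypothetical usage; Traffic attribute names are assumed
    traffic = get_traffic("example.com")
    # either rank is "-" when Alexa returns no match for the domain
    print("reach rank:", traffic.reach_rank)
    print("popularity rank:", traffic.popularity_rank)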
Code Example #3
 def monitor(website_name, merchant_name, batch_num):
     driver = WebDriver.get_chrome()
     senti_util = SentiUtil()
     url = "https://www.p2peye.com/search.php?mod=zonghe&srchtxt=" + urllib.parse.quote(website_name)
     try:
         driver.get(url)
         source = driver.page_source
         senti_util.snapshot_home("网贷天眼", merchant_name, url,
                                  batch_num, driver)
         soup = BeautifulSoup(source, 'html.parser')
         news = soup.find_all(attrs={'class': 'result-t'})
         if len(news) > 0:
             for new in news:
                 href = new.find_all('a')[0].get("href")
                 content = new.get_text()
                 if content.find(website_name) != -1:
                     senti_util.senti_process_text("网贷天眼", merchant_name,content, "http://" + href[2:],
                                                   batch_num)
         else:
             logger.info("网贷天眼没有搜索到数据: %s", merchant_name)
     except Exception as e:
         logger.error(e)
         return
     finally:
         driver.quit()
Code Example #4
class TestWangdaitianyan(object):
    if __name__ == "__main__":
        """
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument('--headless')
        driver = webdriver.Chrome(chrome_options=chrome_options,
                                  executable_path="C:/chromedriver_2.38/chromedriver.exe")  
        driver.get("http://www.wangdaibus.com/search.php?mod=forum")
        driver.find_element_by_id("scform_srchtxt").send_keys(u"京东")
        driver.find_element_by_id("scform_submit").click()     
        """
        driver = WebDriver.get_chrome()
        try:
            driver.get(
                "http://www.wangdaibus.com/search/list?subject=%E4%BA%AC%E4%B8%9C"
            )
            aaa = "京东"
            #driver.find_element_by_xpath('//input[@name="subject"]').send_keys(aaa)
            #driver.find_element_by_xpath('//input[@name="subject"]').send_keys(Keys.ENTER)
            time.sleep(10)

        except Exception as e:  # exception handling
            logger.error(e)
        SnapshotService.create_snapshot(driver)
        source = driver.page_source
        soup = BeautifulSoup(source, 'html.parser')
        news = soup.find_all("h3", attrs={'class': 'xs3'})
        if len(news) > 0:
            for new in news:
                href = new.find_all('a')[0].get("href")
                logger.info("http://www.wangdaibus.com/" + href)
                logger.info(new.get_text())
Code Example #5
File: weburl_dao.py  Project: mengshanxi/spider
 def add(weburl):
     logger.info("add weburl to db: %s", weburl.url)
     engine = create_engine(
         'mysql://%s:%s@%s/%s?charset=utf8&autocommit=true' %
         (username, password, host, database),
         encoding='utf-8',
         echo=False,
         pool_size=100,
         pool_recycle=10)
     Session = sessionmaker(bind=engine)
     session = Session()
     try:
         exist_weburl = session.query(Weburl).filter(
             Weburl.url == weburl.url).filter(
                 Weburl.website_id == weburl.website_id).all()
         if not exist_weburl:
             weburl.create_time = datetime.datetime.now()
             weburl.last_update = datetime.datetime.now()
             session.add(weburl)
             session.commit()
     except Exception as e:
         logger.error(e)
         session.rollback()
         raise
     finally:
         session.close()
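Design note: add builds a new SQLAlchemy engine and session factory on every call, which discards the connection pool each time. A sketch of the usual fix, hoisting both to module level (same connection parameters assumed):

    # created once at import time and reused by every add() call
    engine = create_engine(
        'mysql://%s:%s@%s/%s?charset=utf8&autocommit=true' %
        (username, password, host, database),
        pool_size=100,
        pool_recycle=10)
    Session = sessionmaker(bind=engine)

    def add(weburl):
        session = Session()  # borrows a pooled connection, cheap per call
        try:
            exists = session.query(Weburl).filter(
                Weburl.url == weburl.url,
                Weburl.website_id == weburl.website_id).first()
            if exists is None:
                weburl.create_time = datetime.datetime.now()
                weburl.last_update = datetime.datetime.now()
                session.add(weburl)
                session.commit()
        except Exception:
            session.rollback()
            raise
        finally:
            session.close()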
Code Example #6
 def monitor(keyword, batch_num, website):
     driver = WebDriver.get_chrome()
     senti_util = SentiUtil()
     url = "https://tousu.sina.com.cn/index/search/?keywords=" + urllib.parse.quote(keyword) + "&t=0"
     if driver is None:
         senti_util.log_error("黑猫投诉", url, batch_num, website)
         return
     try:
         driver.get(url)
         source = driver.page_source
         senti_util.snapshot_home("黑猫投诉", url, batch_num, website, driver)
         soup = BeautifulSoup(source, 'html.parser')
         items = soup.find_all(attrs={'class': 'blackcat-con'})
         if len(items) > 0:
             for item in items:
                 href = item.find_all('a')[0].get("href")
                 content = item.find_all('h1')[0].get_text()
                 if content.find(keyword) != -1:
                     senti_util.senti_process_text("黑猫投诉", content, href,
                                                   batch_num, website)
         else:
             logger.info("黑猫投诉没有搜索到数据: %s", keyword)
     except Exception as e:
         logger.error(e)
         return
     finally:
         driver.quit()
Code Example #7
 def get_access_res(url):
     driver = WebDriver.get_chrome_for_access()
     try:
         if str(url).startswith("http"):
             http_url = str(url)
         else:
             http_url = "http://" + str(url)
         logger.info("http_url: %s", http_url)
         driver.get(http_url)
         title = driver.title
         source = driver.page_source
         # markers indicating the page failed to load or the domain is parked
         error_markers = [
             'ERR_NAME_NOT_RESOLVED', 'ERR_CONNECTION_REFUSED',
             'ERR_CONNECTION_TIMED_OUT', 'ERR_NAME_RESOLUTION_FAILED',
             'DNS_PROBE_FINISHED_NXDOMAIN', 'ERR_EMPTY_RESPONSE',
             '主机开设成功', '非法阻断', 'Bad Request', '404 page not found',
             'https://wanwang.aliyun.com/domain/parking'
         ]
         if '404' in title or any(marker in source for marker in error_markers):
             return None, http_url
         else:
             return http_url, driver.current_url
     except Exception as e:
         logger.error(e)
         return None, None
     finally:
         driver.quit()
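Usage note: a short sketch of how a caller might consume get_access_res, treating a None first element as "unreachable or parked" (the calling convention is inferred from the return statements above):

    http_url, landed_url = get_access_res("example.com")
    if http_url is None:
        logger.info("site unreachable or parked: %s", landed_url)
    else:
        logger.info("site reachable, final url: %s", landed_url)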
Code Example #8
 def monitor(keyword, batch_num, website):
     driver = WebDriver.get_chrome()
     senti_util = SentiUtil()
     url = "http://paynews.net/search.php?mod=forum"
     if driver is None:
         senti_util.log_error("支付产业网", url, batch_num, website)
         return
     try:
         driver.get(url)
         search_text_blank = driver.find_element_by_id("scform_srchtxt")
         search_text_blank.send_keys(keyword)
         search_text_blank.send_keys(Keys.RETURN)
         senti_util.snapshot_home("支付产业网", url, batch_num, website, driver)
         source = driver.page_source
         soup = BeautifulSoup(source, 'html.parser')
         div_list = soup.find(attrs={'class': 'slst mtw'})
         if div_list is not None and len(div_list) > 0:
             news = div_list.find_all('li')
             for new in news:
                 href = new.find_all('a')[0].get("href")
                 content = new.find_all('a')[0].get_text()
                 if content.find(keyword) != -1:
                     senti_util.senti_process_text(
                         "支付产业网", content, "http://paynews.net/" + href,
                         batch_num, website)
         else:
             logger.info("支付产业网没有搜索到数据: %s", keyword)
     except Exception as e:
         logger.error(e)
         return
     finally:
         driver.quit()
Code Example #9
 def monitor(keyword, batch_num, website):
     driver = WebDriver.get_chrome()
     senti_util = SentiUtil()
     url = "http://ts.21cn.com/home/search?keyword=" + urllib.parse.quote(
         keyword)
     if driver is None:
         senti_util.log_error("聚投诉", url, batch_num, website)
         return
     try:
         driver.get(url)
         driver.implicitly_wait(3)
         source = driver.page_source
         senti_util.snapshot_home("聚投诉", url, batch_num, website, driver)
         soup = BeautifulSoup(source, 'html.parser')
         items = soup.find_all(attrs={'class': 'complain-item'})
         if len(items) > 0:
             for item in items:
                 href = item.find_all('a')[1].get("href")
                 content = item.find_all('a')[1].get_text()
                 if content.find(keyword) != -1:
                     senti_util.senti_process_text(
                         "聚投诉", content,
                         "http://ts.21cn.com" + href[1:], batch_num,
                         website)
         else:
             logger.info("聚投诉没有搜索到数据: %s", keyword)
     except Exception as e:
         logger.error(e)
         return
     finally:
         driver.quit()
Code Example #10
 def monitor(keyword, website_name, batch_num, merchant_name, merchant_num):
     """
     chrome_options = webdriver.ChromeOptions()
     chrome_options.add_argument('--headless')
     driver = webdriver.Chrome(chrome_options=chrome_options,
                               executable_path=chromedriver_path)
     """
     driver = WebDriver.get_chrome()
     try:
         senti_util = SentiUtil()
         url = "http://tieba.baidu.com/f?fr=wwwt&kw=" + urllib.parse.quote(
             keyword)
         driver.get(url)
         senti_util.snapshot_home("百度贴吧", website_name, url, batch_num,
                                  merchant_name, merchant_num, driver)
         source = driver.page_source
         soup = BeautifulSoup(source, 'html.parser')
         news = soup.find_all(
             "div", attrs={'class': 'threadlist_title pull_left j_th_tit '})
          if len(news) > 0:
             for new in news:
                 href = new.find_all('a')[0].get("href")
                 content = new.find_all('a')[0].get_text()
                 if content.find(keyword) != -1:
                     senti_util.senti_process_text(
                         "百度贴吧", website_name, content,
                         "http://tieba.baidu.com" + href, batch_num,
                         merchant_name, merchant_num)
         else:
             logger.info("百度贴吧没有搜索到数据: %s", keyword)
     except Exception as e:
         logger.error(e)
         return
     finally:
         driver.quit()
Code Example #11
 def get_pending_task(batch_num):
     agent_name = os.environ['agent_name']
     job = os.environ['job']
     if job == "bc":
         task_pools = session.query(TaskItem).filter(TaskItem.batch_num == batch_num).filter(
             TaskItem.status == 'pending', TaskItem.type == 'bc')
     else:
         task_pools = session.query(TaskItem).filter(TaskItem.batch_num == batch_num).filter(
             TaskItem.status == 'pending', TaskItem.type != 'bc')
     if task_pools.count() == 0:
         logger.info("本Agent没有待巡检任务,Agent切换为waiting状态: %s", agent_name)
         # no tasks in pending status
         gl.set_value('STATUS', False)
         return None, None
     else:
         logger.info("%s 准备执行可以处理的任务,倒数第:%s 个...", agent_name, str(task_pools.count()))
     task_pool = task_pools.first()
     session.query(TaskItem).filter(TaskItem.id == task_pool.id).update({"status": "processing"})
     if task_pool.type == "weburl":
         logger.info("task_pool.website_id:%s", task_pool.website_id)
         weburl = session.query(Weburl).filter(Weburl.url == task_pool.url).filter(
             Weburl.website_id == task_pool.website_id).all()
         if len(weburl):
             return weburl[0], task_pool
         else:
             logger.info("task_pool.website_id:%s", task_pool.website_id)
             logger.info("task_pool.id:%s", task_pool.id)
             session.query(TaskItem).filter(TaskItem.id == task_pool.id).update({"status": "done"})
             return None, None
     else:
         website = session.query(Website).filter(Website.id == task_pool.website_id).one()
         return website, task_pool
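Usage note: a hedged sketch of the polling loop an agent might run around get_pending_task; dispatch() is a hypothetical stand-in for the per-task handler, and the gl status flag follows the convention used in the other examples:

    def run_agent(batch_num):
        while gl.get_value('STATUS'):
            target, task = get_pending_task(batch_num)
            if task is None:
                break  # nothing pending: get_pending_task already flipped STATUS off
            dispatch(target, task)  # hypothetical handler for one task item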
Code Example #12
 def monitor(website_name, merchant_name, batch_num):
     """
     chrome_options = webdriver.ChromeOptions()
     chrome_options.add_argument('--headless')
     driver = webdriver.Chrome(chrome_options=chrome_options,
                               executable_path=chromedriver_path)
     """
     driver = WebDriver.get_chrome()
     senti_util = SentiUtil()
     url = "http://www.chinaft.com.cn/news/search/_1.shtml?key=" + urllib.parse.quote(
         website_name)
     try:
         driver.get(url)
         source = driver.page_source
         senti_util.snapshot_home("交易中国", merchant_name, url, batch_num,
                                  driver)
         soup = BeautifulSoup(source, 'html.parser')
         news = soup.find_all("div",
                              attrs={'class': 'xixi_ChinaFT_left_news_box'})
         if len(news) > 0:
             for new in news:
                 if not gl.check_by_batch_num(batch_num):
                     break
                 href = new.find_all('a')[1].get("href")
                 content = new.find_all('a')[1].get_text()
                 if content.find(website_name) != -1:
                     senti_util.senti_process_text(
                         "交易中国", merchant_name, content,
                         "http://www.chinaft.com.cn" + href, batch_num)
         else:
             logger.info("交易中国没有搜索到数据: %s", merchant_name)
     except Exception as e:
         logger.error(e)
     finally:
         driver.quit()
Code Example #13
    def monitor(keyword, batch_num, website):
        driver = WebDriver.get_chrome()
        senti_util = SentiUtil()
        url = 'https://baike.baidu.com/item/%s' % urllib.parse.quote(keyword)
        if driver is None:
            senti_util.log_error("百度百科", url, batch_num, website)
            return
        try:
            driver.get(url)
            source = driver.page_source
            soup = BeautifulSoup(source, 'html.parser')
            check_exist = soup.find_all(
                name='p', attrs={'class': re.compile('sorryCont')})
            if len(check_exist) == 0:
                description = soup.find(
                    attrs={"name": "description"})['content']
                senti_util.senti_process_text("百度百科", description, url,
                                              batch_num, website)
            else:
                senti_util.snapshot_home("百度百科", url, batch_num, website,
                                         driver)
                logger.info("百度百科没有搜索到数据: %s", keyword)

        except Exception as e:
            logger.error(e)
            return
        finally:
            driver.quit()
Code Example #14
def stop():
    job = os.environ['job']
    if job == "gather":
        logger.info("My job is gather, ignoring the order!")
    else:
        gl.set_value('STATUS', False)
        ims_api.heartbeat()
    return 'SUCCESS'
Code Example #15
def stop_tracking():
    job = os.environ['job']
    if job == "tracking":
        gl.set_value('STATUS', False)
        gl.set_value('TRACKING_STATUS', False)
        ims_api.heartbeat()
    else:
        logger.info("Tracking is not my job, ignoring the order!")
    return 'SUCCESS'
Code Example #16
File: snapshot_service.py  Project: mengshanxi/spider
 def download(jpg_link):
     timestamp = int(time.time())
     path = base_filepath + "/" + str(timestamp) + ".png"
     try:
         request.urlretrieve(jpg_link, path)
     except Exception as e:
         logger.info(e)
         return None
     return path
Code Example #17
    def gather_urls_by_website(self, website_id):
        ims_api = ImsApi()
        website_dao = WebsiteDao()
        if website_id is not None:
            website = website_dao.get_by_id(website_id)
            logger.info("gather url for domain_name: %s ", website.domain_name)
            logger.info("gather url for websiteId: %s  ", website.id)
            if website.domain_name is None or len(website.domain_name) == 0:
                logger.info(
                    "gather url for %s,but website.domain_name is None,ignored! ",
                    website.merchant_name)
            else:
                # prepend a scheme when the stored domain name lacks one
                if str(website.domain_name).startswith('http'):
                    uri = website.domain_name
                else:
                    uri = 'http://' + website.domain_name
                self.gather_urls(website.id, uri, website.website_name,
                                 website.domain_name, website.merchant_name,
                                 website.merchant_num, website.saler, 0)
                ims_api.done_url_gather(website)

        else:
            websites = website_dao.get_overtime()
            logger.info("需要采集url的商户网站供 %s 个 ", websites.__len__())
            for website in websites:
                if website.domain_name is None or len(website.domain_name) == 0:
                    logger.info(
                        "gather url for %s,but website.domain_name is None,ignored! ",
                        website.merchant_name)
                else:
                    # prepend a scheme when the stored domain name lacks one
                    if str(website.domain_name).startswith('http'):
                        uri = website.domain_name
                    else:
                        uri = 'http://' + website.domain_name
                    self.gather_urls(website.id, uri, website.website_name,
                                     website.domain_name, website.merchant_name,
                                     website.merchant_num, website.saler, 0)
                    ims_api.done_url_gather(website)
Code Example #18
File: snapshot_service.py  Project: mengshanxi/spider
 def snapshot_tracking(driver, tracking_detail):
     timestamp = int(time.time())
     path = base_filepath + "/" + tracking_detail.tracking_num + "_" + str(
         timestamp)
     snapshot = tracking_detail.tracking_num + "_" + str(timestamp) + ".png"
     try:
         driver.save_screenshot(path + ".png")
         im = Image.open(path + ".png")
         im_resize = im.resize((50, 50), Image.ANTIALIAS)
         im_resize.save(path + "_thumb.bmp")
         return snapshot
     except Exception as e:
         logger.info(e)
         return None
Code Example #19
 def heartbeat():
     try:
         agent_name = os.environ['agent_name']
         hostname = socket.gethostname()
         ip = socket.gethostbyname(hostname)
         url = ims_rest_base + "open/api/v1/agent/heartbeat"
         status = gl.get_value('STATUS')
         data_json = {"ip": ip, "status": status, "job": agent_name}
         data = bytes(parse.urlencode(data_json), encoding="utf8")
         new_url = request.Request(url, data)
         request.urlopen(new_url)
     except Exception as e:
         logger.info(e)
         logger.info("heartbeat failed")
Code Example #20
File: snapshot_service.py  Project: mengshanxi/spider
 def snapshot_weburl(driver, batch_num, weburl, senti_type):
     timestamp = int(time.time())
     base_name = "_".join([
         batch_num, weburl.merchant_name, weburl.merchant_num, senti_type,
         str(timestamp)
     ])
     snapshot = base_name + ".png"
     path = base_filepath + "/" + base_name
     try:
         driver.save_screenshot(path + ".png")
         im = Image.open(path + ".png")
         im_resize = im.resize((50, 50), Image.ANTIALIAS)
         im_resize.save(path + "_thumb.bmp")
     except Exception as e:
         logger.info(e)
     return snapshot
Code Example #21
class TestMysql(object):
    if __name__ == "__main__":
        url = "http://ts.21cn.com/home/search?keyword=%E4%BA%AC%E4%B8%9C"
        driver = webdriver.Remote(
            command_executor='http://172.17.161.230:8912/wd/hub',
            desired_capabilities=DesiredCapabilities.CHROME)

        driver.set_page_load_timeout(10)
        driver.set_script_timeout(10)
        driver.maximize_window()
        try:
            driver.get(url)
            time.sleep(5)
            source = driver.page_source
            soup = BeautifulSoup(source, 'html.parser')
            items = soup.find_all(attrs={'class': 'blackcat-con'})
            if len(items) > 0:
                for item in items:
                    href = item.find_all('a')[0].get("href")
                    content = item.find_all('h1')[0].get_text()
            else:
                logger.info("黑猫投诉没有搜索到数据:")
        except Exception as e:
            logger.error(e)
        finally:
            driver.quit()
Code Example #22
File: test_paycircle.py  Project: mengshanxi/spider
class TestMysql(object):
    if __name__ == "__main__":
        url = "http://www.paycircle.cn/company/search.php?kw=" + urllib.parse.quote(
            '京东') + "&c=SearchList&"
        driver = webdriver.Remote(
            command_executor='http://172.17.161.230:8911/wd/hub',
            desired_capabilities=DesiredCapabilities.CHROME)

        driver.set_page_load_timeout(10)
        driver.set_script_timeout(10)
        driver.maximize_window()
        try:
            driver.get(url)
            source = driver.page_source
            soup = BeautifulSoup(source, 'html.parser')
            div_list = soup.find_all(attrs={'class': 'list'})
            if len(div_list) > 0:
                news = div_list[0].find_all('tr')
                for new in news:
                    href = new.find_all('td')[2].find_all('a')[0].get("href")
                    content = new.find_all('td')[2].find_all('li')[1].get_text()
            else:
                logger.info("支付圈没有搜索到数据")
        except Exception as e:
            logger.error(e)
        finally:
            driver.quit()
Code Example #23
class TestMysql(object):
    if __name__ == "__main__":
        url = "http://paynews.net/search.php?mod=forum"
        driver = webdriver.Remote(
            command_executor='http://172.17.161.230:8911/wd/hub',
            desired_capabilities=DesiredCapabilities.CHROME)

        driver.set_page_load_timeout(10)
        driver.set_script_timeout(10)
        driver.maximize_window()
        try:
            driver.get(url)
            search_text_blank = driver.find_element_by_id("scform_srchtxt")
            search_text_blank.send_keys('京东')
            search_text_blank.send_keys(Keys.RETURN)
            source = driver.page_source
            soup = BeautifulSoup(source, 'html.parser')
            driver.save_screenshot("D:/a.png")
            div_list = soup.find(attrs={'class': 'slst mtw'})
            if div_list is not None and len(div_list) > 0:
                news = div_list.find_all('li')
                for new in news:
                    href = new.find_all('a')[0].get("href")
                    content = new.find_all('a')[0].get_text()
                    print(content)
            else:
                logger.info("支付产业网没有搜索到数据")
        except Exception as e:
            logger.error(e)
        finally:
            driver.quit()
Code Example #24
File: test_zfzj.py  Project: mengshanxi/spider
class TestMysql(object):
    if __name__ == "__main__":
        url = "http://www.zfzj.cn/search.php"
        driver = webdriver.Remote(
            command_executor='http://172.17.161.230:8911/wd/hub',
            desired_capabilities=DesiredCapabilities.CHROME)

        driver.set_page_load_timeout(10)
        driver.set_script_timeout(10)
        driver.maximize_window()
        try:
            driver.get(url)
            search_text_blank = driver.find_element_by_id("scform_srchtxt")
            search_text_blank.send_keys('京东')
            search_text_blank.send_keys(Keys.RETURN)
            time.sleep(5)
            source = driver.page_source
            soup = BeautifulSoup(source, 'html.parser')
            items = soup.find_all(attrs={'class': 'blackcat-con'})
            if len(items) > 0:
                for item in items:
                    href = item.find_all('a')[0].get("href")
                    content = item.find_all('h1')[0].get_text()
            else:
                logger.info("黑猫投诉没有搜索到数据:")
        except Exception as e:
            logger.error(e)
        finally:
            driver.quit()
Code Example #25
File: snapshot_service.py  Project: mengshanxi/spider
 def simulation_404(url):
     timestamp = str(time.time())
     snapshot = timestamp + ".png"
     path = ims_rest_base + "/views/system/404.jsp?url=" + str(url)
     img_404 = base_filepath + "/" + timestamp
     driver = WebDriver.get_chrome()
     try:
         driver.get(path)
         driver.save_screenshot(img_404 + ".png")
         im = Image.open(img_404 + ".png")
         im_resize = im.resize((50, 50), Image.ANTIALIAS)
         im_resize.save(img_404 + "_thumb.bmp")
     except Exception as e:
         logger.info(e)
         return snapshot
     finally:
         driver.quit()
     return snapshot
Code Example #26
File: pic_recg_service.py  Project: mengshanxi/spider
 def tran2text(url):
     # TODO: OCR is disabled for now; the early return below makes the rest unreachable
     logger.error("TODO %s" % url)
     return None
     try:
         if str(url).endswith(".jpg") or str(url).endswith(".png") or str(url).endswith(".bmp") or str(url).endswith(
                 ".jpeg"):
             logger.info("src path: %s" % url)
             text = pytesseract.image_to_string(Image.open(url), lang='chi_sim')
             logger.info("text: %s", str(text))
             if text == "":
                 return None
             return text
         else:
             logger.error("url is not pic! url:%s" % url)
             return None
     except Exception as e:
         logger.error(e)
         return None
Code Example #27
File: webdriver_util.py  Project: mengshanxi/spider
 def get_proxy_chrome():
     chrome_options = Options()
     strategy_service = StrategyService()
     strategy = strategy_service.get_strategy()
     if strategy.proxy_server is None or strategy.proxy_server == '':
         logger.info("proxy_server is none!")
         return None
     else:
         proxy_servers = strategy.proxy_server.split(",")
         chrome_options.add_argument("--proxy-server=" + random.choice(proxy_servers))
         # disable loading of images and CSS
         prefs = {"profile.managed_default_content_settings.images": 2, 'permissions.default.stylesheet': 2}
         chrome_options.add_experimental_option("prefs", prefs)
         driver = webdriver.Remote(command_executor='http://localhost:4444/wd/hub',
                                   desired_capabilities=DesiredCapabilities.CHROME,
                                   options=chrome_options)
         driver.set_page_load_timeout(30)
         driver.set_script_timeout(10)
         driver.maximize_window()
         return driver
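Usage note: get_proxy_chrome returns None when no proxy strategy is configured, so callers likely need a fallback; a sketch, assuming WebDriver.get_chrome (seen in the other examples) provides the direct, non-proxy session:

    driver = get_proxy_chrome()
    if driver is None:
        driver = WebDriver.get_chrome()  # fall back to a direct session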
Code Example #28
def inspect(batch_num):
    spider_manager = GatherCenter()
    while gl.get_value('STATUS'):
        logger.info("inspect task start!  batch_num:%s" % str(batch_num))
        spider_manager.gather(batch_num)
        logger.info("inspect task end!  batch_num:%s" % str(batch_num))
    logger.info("batchNum inspect task end!  batch_num:%s" % str(batch_num))
Code Example #29
 def monitor(website_name, merchant_name, batch_num):
     """
     chrome_options = webdriver.ChromeOptions()
     chrome_options.add_argument('--headless')
     driver = webdriver.Chrome(chrome_options=chrome_options,
                               executable_path=chromedriver_path)
     """
     try:
         driver = WebDriver.get_chrome()
         senti_util = SentiUtil()
         url = "https://www.wdzj.com/front/search/index?key=" + urllib.parse.quote(
             website_name)
         driver.get(url)
         source = driver.page_source
         senti_util.snapshot_home("网贷之家", merchant_name, url, batch_num,
                                  driver)
         soup = BeautifulSoup(source, 'html.parser')
         tzbox = soup.find_all("ul", attrs={'class': 'so-tzbox'})
         if len(tzbox) == 0:
             return
         news = tzbox[0].find_all("li")
         if len(news) > 0:
             for new in news:
                 if not gl.check_by_batch_num(batch_num):
                     break
                 href = new.find_all('a')[0].get("href")
                 content = new.get_text()
                 if content.find(website_name) != -1:
                     senti_util.senti_process_text("网贷之家", merchant_name,
                                                   content,
                                                   "http://" + href[2:],
                                                   batch_num)
         else:
             logger.info("网贷之家没有搜索到数据: %s", merchant_name)
     except Exception as e:
         logger.error(e)
         return
     finally:
         driver.quit()
Code Example #30
def tracking_execute():
    job = os.environ['job']
    if job == "tracking":
        # restart selenium
        stop_selenium()
        stop_chrome()
        gl.set_value('STATUS', True)
        gl.set_value('TRACKING_STATUS', True)
        ims_api.heartbeat()
        try:
            task_id = request.form['taskId']
            status = request.form['status']
            logger.info("tracking begin task_id: %s,status: %s" % (str(task_id), str(status)))
            t = threading.Thread(target=inspect_tracking, args=(task_id, status))
            t.daemon = True
            t.start()
            return 'OK'
        except Exception as e:
            logger.error(e)
    else:
        logger.info("Tracking is not my job!")
        return 'OK'