Example #1
    def _requests_getPagesource(page_source_q,
                                url,
                                method,
                                data,
                                use_proxy=False):
        while not Cache.getDict(cache.globalArgs_dict,
                                "global_EXIT") and page_source_q.empty():
            proxies, ip, port = None, None, None
            try:
                headers = {"User-agent": user_agent()}
                if use_proxy:
                    proxies, ip, port = Spider._getproxy()

                if method == "POST":
                    res = requests.post(url,
                                        data=data,
                                        proxies=proxies,
                                        headers=headers)
                elif method == "GET":
                    res = requests.get(url,
                                       data=data,
                                       proxies=proxies,
                                       headers=headers)
                else:
                    continue  # skip unsupported HTTP methods

                if res.status_code == 200 and Spider._pagesourceLegal(
                        res.text):
                    page_source_q.put(res.text)
            except Exception as e:
                print(e)
                if ip: redis_client.delete(ip)  # drop the failing proxy from the pool
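The fetcher above loops until either the global exit flag is set or the result queue holds a page source, so it is normally run in a worker thread while the caller blocks on the queue. A minimal sketch of that pattern using only the standard library (assuming _requests_getPagesource is exposed as a static method of Spider, as Spider._getproxy suggests; the URL is just a placeholder):

import queue
import threading

page_source_q = queue.Queue()
worker = threading.Thread(
    target=Spider._requests_getPagesource,       # assumed staticmethod of Spider
    args=(page_source_q, "http://example.com", "GET", None),
    kwargs={"use_proxy": False},
    daemon=True)
worker.start()

try:
    page_source = page_source_q.get(timeout=60)  # first legal page source wins
except queue.Empty:
    page_source = None                           # no usable response within 60 s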
Example #2
def saveWebsiteDelaytime():
    """
    @summary: Save each website's crawl delay to the database.
    """
    try:
        for website_id in Cache.keys(cache.websiteDelay_dict):
            delaytime = Cache.getDict(cache.websiteDelay_dict, website_id)
            db.saveDelay_time(website_id, delaytime)
    except Exception as e:
        log.logMsg(LogType.error, "[saveWebsiteDelaytime] %s" % (repr(e)))
Example #3
def show_delay_time():
    """
    @summary: Show the crawl delay for each website.
    """
    records = []
    keys = Cache.keys(cache.websiteDelay_dict) or []
    for website_id in keys:
        record = mysql.Mysql.queryWebsiteUrl(website_id)    # id,url,xpath,detail,delay_time
        records.append((record[0][0], record[0][3] or record[0][1], Cache.getDict(cache.websiteDelay_dict, website_id)))
    headers = ["id", "url", "delay-time(s)"]
    print(tabulate(records, headers=headers))
Example #4
 def putRecord(self, record):
     """
     @summary: Add the record to the queue of websites waiting to be crawled.
     """
     website_id, website_url, xpath = record[:3]
     if not Cache.listItemExist(cache.workingWebsite_list, website_id) and \
             not Cache.keyExist(cache.unrecognized_websiteUrl_dict, website_id):
         Cache.appendList(cache.workingWebsite_list, website_id)
         Cache.putQueue(cache.websiteUrl_queue, (website_id, website_url, xpath))
         sleep_time = Cache.getDict(cache.websiteDelay_dict, website_id)
         for i in range(int(sleep_time)):
             if global_EXIT: return
             time.sleep(1)
         Cache.removeList(cache.workingWebsite_list, website_id)
Example #5
def resetDelay_time():
    """
    @summary: Reset each website's crawl delay to zero.
    """
    db = None
    try:
        db = mysql.Mysql()
        for website_id in Cache.keys(cache.websiteDelay_dict):
            Cache.setDict(cache.websiteDelay_dict, website_id, 0)
            db.saveDelay_time(website_id, 0)
    except Exception as e:
        log.logMsg(LogType.error, "[resetDelay_time] %s" % (repr(e)))
    finally:
        if db: db.dispose()
Example #6
 def run(self):
     while not global_EXIT:
         url = ""
         try:
             url = Cache.randomKey(cache.unrecognized_contentUrl_dict)
             if url:
                 website_id = Cache.getDict(cache.unrecognized_contentUrl_dict, url)
                 res = filterContentInfoFunc(website_id, url)
                 if res == SpiderResType.success or res == SpiderResType.alreadyExist:
                     Cache.removeDict(cache.unrecognized_contentUrl_dict, url)
                     Cache.appendList(cache.oldContent_list, url)
             for i in range(300):
                 if global_EXIT: break
                 time.sleep(1)
         except Exception as e:
             log.logMsg(LogType.error, "[FilterContentInfoThread.freshHandler] %s %s" % (url, traceback.format_exc()))
Example #7
 def run(self):
     while not Cache.getDict(cache.globalArgs_dict, "LogThread_EXIT"):
         try:
             info = Cache.getQueue(cache.log_queue, False)
             if os.path.exists(self.getFilename()):
                 log_size = os.path.getsize(
                     self.getFilename()) / 1024 / 1024  # roll over to a new log file once the current one exceeds 1 MB
                 if log_size > 1:
                     self.index += 1
             with open(self.getFilename(), 'a') as f:
                 info += '<%s>\n' % (
                     datetime.datetime.now().strftime("%H:%M:%S"))
                 f.write(info)
         except Exception as e:
             if type(e) is not queue.Empty:
                 print("Log Error: %s" % e)
Example #8
    def run(self):
        while not global_EXIT:
            website_url = ""
            if not Cache.dempty(cache.unrecognized_websiteUrl_dict):
                try:
                    website_id = Cache.randomKey(cache.unrecognized_websiteUrl_dict)
                    if not website_id:
                        # nothing to process yet: wait up to 30 s, then retry the outer loop
                        for i in range(30):
                            if global_EXIT: break
                            time.sleep(1)
                        continue

                    website_url, xpath = Cache.getDict(cache.unrecognized_websiteUrl_dict, website_id)
                    if website_id and website_url and xpath:
                        Cache.removeDict(cache.unrecognized_websiteUrl_dict, website_id)

                except Exception as e:
                    log.logMsg(LogType.error, "[FilterContentUrlThread.unrecognizedHandler] %s %s" % (website_url, traceback.format_exc()))
Example #9
    def _chrome_getPagesource(page_source_q, url, timeout):
        driver, ip, port = None, None, None
        while not Cache.getDict(cache.globalArgs_dict,
                                "global_EXIT") and page_source_q.empty():
            try:
                if system == "Linux":
                    chrome_options = Options()
                    chrome_options.add_argument('--headless')
                    chrome_options.add_argument('--disable-gpu')
                else:
                    os.environ["webdriver.chrome.driver"] = chromedriver
                    chrome_options = webdriver.ChromeOptions()

                if Spider._useProxy(url):
                    proxies, ip, port = Spider._getproxy()

                if ip and port:
                    chrome_options.add_argument("--proxy-server=http://%s:%s" %
                                                (ip, port))

                if system == "Linux":
                    driver = webdriver.Chrome(chrome_options=chrome_options)
                else:
                    driver = webdriver.Chrome(chromedriver,
                                              chrome_options=chrome_options)

                driver.get(url)
                time.sleep(timeout)
                js = "document.body.scrollTop=1000"
                driver.execute_script(js)
                time.sleep(3)
                page_source = driver.page_source
                driver.close()
                driver.quit()
                if page_source and Spider._pagesourceLegal(page_source):
                    page_source_q.put(page_source)
            except Exception as e:
                if ip: redis_client.delete(ip)
                if driver:
                    driver.close()
                    driver.quit()
Example #10
    def _urllib_getPagesource(q, url):
        while not Cache.getDict(cache.globalArgs_dict,
                                "global_EXIT") and q.empty():
            proxies, ip, port = None, None, None
            try:
                if Spider._useProxy(url):
                    proxies, ip, port = Spider._getproxy()
                if proxies:
                    proxy_handler = urllib.request.ProxyHandler(proxies)
                    opener = urllib.request.build_opener(proxy_handler)
                    opener.addheaders = [('User-agent', user_agent())]
                    res = opener.open(url, timeout=5)
                    page_source = res.read().decode("utf8")
                else:
                    req = urllib.request.Request(
                        url, headers={"User-agent": user_agent()})
                    resp = urllib.request.urlopen(req)
                    page_source = resp.read().decode("utf8")

                if page_source and Spider._pagesourceLegal(page_source):
                    q.put(page_source)
            except Exception as e:
                if ip: redis_client.delete(ip)
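Spider._getproxy() is not shown in these examples; for urllib.request.ProxyHandler here, and for the proxies= argument in the requests-based fetcher in Example #1, it would need to return a scheme-keyed mapping alongside the raw ip and port. The expected shape, as an assumption rather than code from the source:

proxies = {
    "http": "http://%s:%s" % (ip, port),
    "https": "http://%s:%s" % (ip, port),
}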
Example #11
def incrDelay_time(website_id, timeout):
    """
    @summary: Increase the website's crawl delay by timeout seconds.
    """
    record = Cache.getDict(cache.websiteDelay_dict, website_id)
    Cache.setDict(cache.websiteDelay_dict, website_id, int(record or 0) + timeout)  # treat a missing delay as 0
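All of the delay helpers (incrDelay_time, saveWebsiteDelaytime, show_delay_time, resetDelay_time) share the same small Cache contract: keys, getDict, and setDict over a named dict. A minimal in-memory stand-in for that contract, useful only as an illustration (the real project appears to back Cache with Redis, given redis_client in the fetchers):

class Cache:
    """Illustrative dict-of-dicts stand-in for the cache API used by the delay helpers."""
    _store = {}

    @classmethod
    def keys(cls, name):
        return list(cls._store.get(name, {}).keys())

    @classmethod
    def getDict(cls, name, key):
        return cls._store.get(name, {}).get(key)

    @classmethod
    def setDict(cls, name, key, value):
        cls._store.setdefault(name, {})[key] = value

With this stand-in, incrDelay_time("site-1", 5) reads a missing delay as 0 and stores 5; a second call stores 10, which saveWebsiteDelaytime would then persist.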