def _requests_getPagesource(page_source_q, url, method, data, use_proxy=False):
    """Fetch the page source with requests and put it on page_source_q."""
    while not Cache.getDict(cache.globalArgs_dict, "global_EXIT") and page_source_q.empty():
        proxies, ip, port = None, None, None
        try:
            headers = {"User-agent": user_agent()}
            if use_proxy:
                proxies, ip, port = Spider._getproxy()
            if method == "POST":
                res = requests.post(url, data=data, proxies=proxies, headers=headers)
            elif method == "GET":
                # For GET requests the payload goes into the query string.
                res = requests.get(url, params=data, proxies=proxies, headers=headers)
            else:
                continue
            if res.status_code == 200 and Spider._pagesourceLegal(res.text):
                page_source_q.put(res.text)
        except Exception as e:
            print(e)
            if ip:
                # Drop the proxy that caused the failure from the pool.
                redis_client.delete(ip)
def saveWebsiteDelaytime():
    """
    @summary: Save each website's crawl delay to the database.
    """
    db = None
    try:
        db = mysql.Mysql()
        for website_id in Cache.keys(cache.websiteDelay_dict):
            delaytime = Cache.getDict(cache.websiteDelay_dict, website_id)
            db.saveDelay_time(website_id, delaytime)
    except Exception as e:
        log.logMsg(LogType.error, "[saveWebsiteDelaytime] %s" % repr(e))
    finally:
        if db:
            db.dispose()
def show_delay_time():
    """
    @summary: Display the crawl delay of each website.
    """
    records = []
    keys = Cache.keys(cache.websiteDelay_dict) or []
    for website_id in keys:
        # record layout: id, url, xpath, detail, delay_time
        record = mysql.Mysql.queryWebsiteUrl(website_id)
        records.append((record[0][0],
                        record[0][3] or record[0][1],
                        Cache.getDict(cache.websiteDelay_dict, website_id)))
    headers = ["id", "url", "delay-time(s)"]
    print(tabulate(records, headers=headers))
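# Example of the resulting output (tabulate's default "simple" style; the rows
# below are illustrative values, not taken from the original project):
#
#   id  url                    delay-time(s)
# ----  -------------------  ---------------
#    3  https://example.com               30
#    7  https://example.org                0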
def putRecord(self, record):
    """
    @summary: Add the record to the queue of websites waiting to be crawled.
    """
    website_id, website_url, xpath = record[:3]
    if not Cache.listItemExist(cache.workingWebsite_list, website_id) and \
            not Cache.keyExist(cache.unrecognized_websiteUrl_dict, website_id):
        Cache.appendList(cache.workingWebsite_list, website_id)
        Cache.putQueue(cache.websiteUrl_queue, (website_id, website_url, xpath))
        sleep_time = Cache.getDict(cache.websiteDelay_dict, website_id) or 0
        # Honour the per-website delay, but abort promptly on global exit.
        for i in range(int(sleep_time)):
            if global_EXIT:
                return
            time.sleep(1)
        Cache.removeList(cache.workingWebsite_list, website_id)
def resetDelay_time():
    """
    @summary: Reset the crawl delay of each website.
    """
    db = None
    try:
        db = mysql.Mysql()
        for website_id in Cache.keys(cache.websiteDelay_dict):
            Cache.setDict(cache.websiteDelay_dict, website_id, 0)
            db.saveDelay_time(website_id, 0)
    except Exception as e:
        log.logMsg(LogType.error, "[resetDelay_time] %s" % repr(e))
    finally:
        if db:
            db.dispose()
def run(self):
    while not global_EXIT:
        url = ""
        try:
            url = Cache.randomKey(cache.unrecognized_contentUrl_dict)
            if url:
                website_id = Cache.getDict(cache.unrecognized_contentUrl_dict, url)
                res = filterContentInfoFunc(website_id, url)
                if res == SpiderResType.success or res == SpiderResType.alreadyExist:
                    Cache.removeDict(cache.unrecognized_contentUrl_dict, url)
                    Cache.appendList(cache.oldContent_list, url)
            # Wait up to 300 seconds before the next pass, checking the exit flag every second.
            for i in range(300):
                if global_EXIT:
                    break
                time.sleep(1)
        except Exception as e:
            log.logMsg(LogType.error,
                       "[FilterContentInfoThread.freshHandler] %s %s" % (url, traceback.format_exc()))
def run(self):
    while not Cache.getDict(cache.globalArgs_dict, "LogThread_EXIT"):
        try:
            info = Cache.getQueue(cache.log_queue, False)
            if os.path.exists(self.getFilename()):
                log_size = os.path.getsize(self.getFilename()) / 1024 / 1024
                # Roll over to a new log file once the current one exceeds 1 MB.
                if log_size > 1:
                    self.index += 1
            with open(self.getFilename(), 'a') as f:
                info += '<%s>\n' % datetime.datetime.now().strftime("%H:%M:%S")
                f.write(info)
        except Exception as e:
            if type(e) is not queue.Empty:
                print("Log Error: %s" % e)
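# getFilename is referenced above but not shown in this section. A minimal sketch
# of what the rollover logic relies on; the directory and base name are
# illustrative assumptions, not from the original source:
def getFilename(self):
    # self.index is bumped whenever the current file grows past 1 MB, so each
    # rollover starts a fresh file such as log/spider.3.log.
    return "log/spider.%d.log" % self.index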
def run(self):
    while not global_EXIT:
        website_url = ""
        if not Cache.dempty(cache.unrecognized_websiteUrl_dict):
            try:
                website_id = Cache.randomKey(cache.unrecognized_websiteUrl_dict)
                if not website_id:
                    # Nothing to process yet: back off for 30 seconds, checking the exit flag.
                    for i in range(30):
                        if global_EXIT:
                            break
                        time.sleep(1)
                    continue
                website_url, xpath = Cache.getDict(cache.unrecognized_websiteUrl_dict, website_id)
                if website_id and website_url and xpath:
                    Cache.removeDict(cache.unrecognized_websiteUrl_dict, website_id)
            except Exception as e:
                log.logMsg(LogType.error,
                           "[FilterContentUrlThread.unrecognizedHandler] %s %s" % (website_url, traceback.format_exc()))
def _chrome_getPagesource(page_source_q, url, timeout):
    driver, ip, port = None, None, None
    while not Cache.getDict(cache.globalArgs_dict, "global_EXIT") and page_source_q.empty():
        try:
            if system == "Linux":
                chrome_options = Options()
                chrome_options.add_argument('--headless')
                chrome_options.add_argument('--disable-gpu')
            else:
                os.environ["webdriver.chrome.driver"] = chromedriver
                chrome_options = webdriver.ChromeOptions()
            if Spider._useProxy(url):
                proxies, ip, port = Spider._getproxy()
                if ip and port:
                    chrome_options.add_argument("--proxy-server=http://%s:%s" % (ip, port))
            if system == "Linux":
                driver = webdriver.Chrome(chrome_options=chrome_options)
            else:
                driver = webdriver.Chrome(chromedriver, chrome_options=chrome_options)
            driver.get(url)
            time.sleep(timeout)
            # Scroll down so lazily loaded content is rendered before grabbing the source.
            js = "document.body.scrollTop=1000"
            driver.execute_script(js)
            time.sleep(3)
            page_source = driver.page_source
            driver.close()
            driver.quit()
            if page_source and Spider._pagesourceLegal(page_source):
                page_source_q.put(page_source)
        except Exception as e:
            if ip:
                redis_client.delete(ip)
            if driver:
                driver.close()
                driver.quit()
def _urllib_getPagesource(q, url):
    while not Cache.getDict(cache.globalArgs_dict, "global_EXIT") and q.empty():
        proxies, ip, port = None, None, None
        try:
            if Spider._useProxy(url):
                proxies, ip, port = Spider._getproxy()
            if proxies:
                proxy_handler = urllib.request.ProxyHandler(proxies)
                opener = urllib.request.build_opener(proxy_handler)
                opener.addheaders = [('User-agent', user_agent())]
                res = opener.open(url, timeout=5)
                page_source = res.read().decode("utf8")
            else:
                req = urllib.request.Request(url, headers={"User-agent": user_agent()})
                resp = urllib.request.urlopen(req)
                page_source = resp.read().decode("utf8")
            if page_source and Spider._pagesourceLegal(page_source):
                q.put(page_source)
        except Exception as e:
            if ip:
                redis_client.delete(ip)
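# Design note and usage sketch (assumed, not from the original source): every
# _*_getPagesource worker loops on "queue still empty and not global_EXIT", so
# several of them can be raced against one shared queue; the first legal page
# source wins and the other workers drop out on their next loop check. The
# helper name race_fetchers, the thread-based setup, and the timeouts below are
# illustrative assumptions; Spider is the surrounding class from this module.
import queue
import threading

def race_fetchers(url, timeout=30):
    page_source_q = queue.Queue()
    workers = [
        threading.Thread(target=Spider._urllib_getPagesource,
                         args=(page_source_q, url), daemon=True),
        threading.Thread(target=Spider._chrome_getPagesource,
                         args=(page_source_q, url, 5), daemon=True),
        threading.Thread(target=Spider._requests_getPagesource,
                         args=(page_source_q, url, "GET", None), daemon=True),
    ]
    for w in workers:
        w.start()
    try:
        # Return whichever fetcher succeeds first, or None if all time out.
        return page_source_q.get(timeout=timeout)
    except queue.Empty:
        return None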
def incrDelay_time(website_id, timeout):
    """
    @summary: Increase the crawl delay of the given website by timeout seconds.
    """
    record = Cache.getDict(cache.websiteDelay_dict, website_id) or 0
    Cache.setDict(cache.websiteDelay_dict, website_id, int(record) + timeout)
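# Putting the delay bookkeeping together (a usage sketch of assumed flow, not
# from the original source): a troublesome website can be penalised with
# incrDelay_time, the accumulated delays persisted with saveWebsiteDelaytime,
# and show_delay_time / resetDelay_time used to inspect or clear them. The
# helper name and the 60-second penalty are illustrative assumptions.
def penalise_and_persist(website_id):
    incrDelay_time(website_id, 60)   # back this website off by another minute
    saveWebsiteDelaytime()           # write all current delays back to MySQL
    show_delay_time()                # print the current per-website delays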