def filterContentUrlFunc(website_id, website_url, xpath):
    """
    @summary: Filter out the content URLs from a website's listing page.
    """
    try:
        spiderRes = Spider().chromedriver(website_url)
        html_selector = spiderRes.selector
        if html_selector is None:
            log.logMsg(LogType.htmlSelectorNone,
                       "[FilterContentUrlThread] %s %s" % (website_url, "html_selector is None."))
            return False
        hrefs = filterHrefs(website_url, xpath, html_selector)
        if len(hrefs) == 0:
            return False
        flag = False
        for href in hrefs:
            if not Cache.listItemExist(cache.oldContent_list, href) and \
                    not Cache.keyExist(cache.unrecognized_contentUrl_dict, href):
                Cache.putQueue(cache.freshContentUrl_queue, (website_id, href))
                flag = True
        if not flag:
            # No new data: push the next crawl of this site back by 15 minutes
            incrDelay_time(website_id, 900)
        return True
    except Exception as e:
        log.logMsg(LogType.error,
                   "[FilterContentUrlThread] %s %s" % (website_url, traceback.format_exc()))
        return False
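# filterHrefs() is imported from Spider.common and not shown in this excerpt.
# A minimal sketch of what it might do, assuming html_selector exposes an
# lxml-style .xpath() that yields href strings for the configured XPath
# (names and behavior here are assumptions, not the project's implementation):
from urllib.parse import urljoin

def filterHrefs_sketch(website_url, xpath, html_selector):
    hrefs = []
    for href in html_selector.xpath(xpath):
        href = (href or "").strip()
        if not href or href.startswith("javascript"):
            continue
        hrefs.append(urljoin(website_url, href))  # resolve relative links against the site URL
    return hrefs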
def initContentUrl_dict():
    """
    @summary: Initialize the deduplication list of content URLs that were already crawled.
    """
    items = mysql.Mysql.queryContentUrl()
    for item in items:
        Cache.appendList(cache.oldContent_list, item[0])
def initWebsite_delay_dict(self, record):
    """
    @summary: Initialize the website's update-delay entry.
    :param record: website record (id, url, xpath, detail, delay_time)
    :return:
    """
    if not Cache.keyExist(cache.websiteDelay_dict, record[0]):
        Cache.setDict(cache.websiteDelay_dict, record[0], record[-1])
def saveWebsiteDelaytime():
    """
    @summary: Save each website's crawl delay to the database.
    """
    try:
        for website_id in Cache.keys(cache.websiteDelay_dict):
            delaytime = Cache.getDict(cache.websiteDelay_dict, website_id)
            db.saveDelay_time(website_id, delaytime)
    except Exception as e:
        log.logMsg(LogType.error, "[saveWebsiteDelaytime] %s" % (repr(e)))
def show_delay_time():
    """
    @summary: Show each website's crawl delay.
    """
    records = []
    keys = Cache.keys(cache.websiteDelay_dict) or []
    for website_id in keys:
        record = mysql.Mysql.queryWebsiteUrl(website_id)  # id, url, xpath, detail, delay_time
        records.append((record[0][0],
                        record[0][3] or record[0][1],
                        Cache.getDict(cache.websiteDelay_dict, website_id)))
    headers = ["id", "url", "delay-time(s)"]
    print(tabulate(records, headers=headers))
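# With two sites cached, show_delay_time() prints a tabulate table roughly like
# this (ids, URLs, and delay values are illustrative):
#
#   id  url                      delay-time(s)
# ----  -----------------------  ---------------
#    1  http://example.com/news                0
#    2  http://example.org/blog              900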
def run(self):
    while not global_EXIT:
        url = ""
        try:
            website_id, url = Cache.getQueue(cache.freshContentUrl_queue, False)
            res = filterContentInfoFunc(website_id, url)
            if res == SpiderResType.success or res == SpiderResType.alreadyExist:
                Cache.appendList(cache.oldContent_list, url)
            else:
                Cache.setDict(cache.unrecognized_contentUrl_dict, url, website_id)
        except Exception as e:
            if type(e) is not queue.Empty:
                log.logMsg(LogType.error,
                           "[FilterContentInfoThread] %s %s" % (url, traceback.format_exc()))
def run(self):
    while not global_EXIT:
        website_url = ""
        try:
            website_id, website_url, xpath = Cache.getQueue(cache.websiteUrl_queue, False)
            if not filterContentUrlFunc(website_id, website_url, xpath):
                Cache.setDict(cache.unrecognized_websiteUrl_dict, website_id, (website_url, xpath))
        except Exception as e:
            if type(e) is not queue.Empty:
                log.logMsg(LogType.error,
                           "[FilterContentUrlThread.freshHandler] %s %s" % (website_url, traceback.format_exc()))
            else:
                # Queue is empty: wait up to 10 seconds, checking the exit flag every second
                for i in range(10):
                    if global_EXIT:
                        break
                    time.sleep(1)
def resetDelay_time():
    """
    @summary: Reset each website's crawl delay to zero.
    """
    db = None
    try:
        db = mysql.Mysql()
        for website_id in Cache.keys(cache.websiteDelay_dict):
            # The dict stores the plain delay value in seconds, so reset it to 0
            Cache.setDict(cache.websiteDelay_dict, website_id, 0)
            db.saveDelay_time(website_id, 0)
    except Exception as e:
        log.logMsg(LogType.error, "[resetDelay_time] %s" % (repr(e)))
    finally:
        if db:
            db.dispose()
def main():
    thread_count = 3
    pre_threads = []
    initdb()               # clear the Redis database
    initGlobalArgs()
    initContentUrl_dict()  # initialize the deduplication list

    log_thread = log.LogThread()  # start the logging thread
    log_thread.start()

    QueryWebsiteUrl_thread = QueryWebsiteUrlThread()  # start the thread that loads website URLs
    QueryWebsiteUrl_thread.start()
    pre_threads.append(QueryWebsiteUrl_thread)

    filterContentUrl_thread = FilterContentUrlThread()  # start the thread that extracts content URLs
    filterContentUrl_thread.start()
    pre_threads.append(filterContentUrl_thread)

    for i in range(thread_count):
        thread = FilterContentInfoThread()
        thread.start()
        pre_threads.append(thread)

    unrecognizedWebsiteUrl_thread = UnrecognizedWebsiteUrl_Thread()
    unrecognizedWebsiteUrl_thread.start()
    pre_threads.append(unrecognizedWebsiteUrl_thread)

    unrecognizedContentUrl_thread = UnrecognizedContentUrl_Thread()
    unrecognizedContentUrl_thread.start()
    pre_threads.append(unrecognizedContentUrl_thread)

    while not global_EXIT:
        time.sleep(1)      # avoid busy-waiting while the crawler runs
    time.sleep(5)          # give worker threads a moment to notice the exit flag

    saveWebsiteDelaytime()  # persist each website's crawl delay
    for t in pre_threads:
        t.join()
    log.logMsg(LogType.success, "--------------------bye---------------------\n")

    while not Cache.qempty(cache.log_queue):
        time.sleep(1)      # wait until all pending logs are written to file
    Cache.setDict(cache.globalArgs_dict, "LogThread_EXIT", True)
    log_thread.join()
    if db:
        db.dispose()
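# Assuming this module is executed directly as a script, the usual entry-point
# guard would be:
if __name__ == "__main__":
    main()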
def _requests_getPagesource(page_source_q, url, method, data, use_proxy=False):
    while not Cache.getDict(cache.globalArgs_dict, "global_EXIT") and page_source_q.empty():
        proxies, ip, port, res = None, None, None, None
        try:
            headers = {"User-agent": user_agent()}
            if use_proxy:
                proxies, ip, port = Spider._getproxy()
            if method == "POST":
                res = requests.post(url, data=data, proxies=proxies, headers=headers)
            elif method == "GET":
                res = requests.get(url, data=data, proxies=proxies, headers=headers)
            if res is not None and res.status_code == 200 and Spider._pagesourceLegal(res.text):
                page_source_q.put(res.text)
        except Exception as e:
            print(e)
            if ip:
                redis_client.delete(ip)  # drop the failing proxy IP from the pool
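# user_agent() is referenced above but not defined in this excerpt. A minimal
# sketch, assuming it simply rotates through a small pool of desktop
# User-Agent strings (the pool below is illustrative, not the project's list):
import random

_USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36",
]

def user_agent():
    """Return a randomly chosen User-Agent header value."""
    return random.choice(_USER_AGENTS)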
def run(self):
    while not Cache.getDict(cache.globalArgs_dict, "LogThread_EXIT"):
        try:
            info = Cache.getQueue(cache.log_queue, False)
            if os.path.exists(self.getFilename()):
                log_size = os.path.getsize(self.getFilename()) / 1024 / 1024
                if log_size > 1:  # roll over to a new log file once the current one exceeds 1 MB
                    self.index += 1
            with open(self.getFilename(), 'a') as f:
                info += '<%s>\n' % (datetime.datetime.now().strftime("%H:%M:%S"))
                f.write(info)
        except Exception as e:
            if type(e) is not queue.Empty:
                print("Log Error: %s" % e)
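# getFilename() is referenced above but not shown in this excerpt. A minimal
# sketch of what it might look like, assuming one file per day under LOG_DIR,
# rolled over via self.index when run() detects the 1 MB threshold (the naming
# scheme is an assumption, not the project's actual convention):
def getFilename(self):
    day = datetime.datetime.now().strftime("%Y-%m-%d")
    # e.g. logs/2018-01-01_0.log, logs/2018-01-01_1.log, ...
    return os.path.join(LOG_DIR, "%s_%d.log" % (day, self.index))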
def putRecord(self, record):
    """
    @summary: Put the record onto the queue of websites waiting to be crawled.
    """
    website_id, website_url, xpath = record[:3]
    if not Cache.listItemExist(cache.workingWebsite_list, website_id) and \
            not Cache.keyExist(cache.unrecognized_websiteUrl_dict, website_id):
        Cache.appendList(cache.workingWebsite_list, website_id)
        Cache.putQueue(cache.websiteUrl_queue, (website_id, website_url, xpath))
        # Keep the site in the working list for its delay period before releasing it
        sleep_time = Cache.getDict(cache.websiteDelay_dict, website_id)
        for i in range(int(sleep_time)):
            if global_EXIT:
                return
            time.sleep(1)
        Cache.removeList(cache.workingWebsite_list, website_id)
def run(self):
    while not global_EXIT:
        website_url = ""
        if not Cache.dempty(cache.unrecognized_websiteUrl_dict):
            try:
                website_id = Cache.randomKey(cache.unrecognized_websiteUrl_dict)
                if not website_id:
                    for i in range(30):
                        if global_EXIT:
                            break
                        time.sleep(1)
                    continue
                website_url, xpath = Cache.getDict(cache.unrecognized_websiteUrl_dict, website_id)
                # Retry the website; drop it from the unrecognized dict once it succeeds
                if filterContentUrlFunc(website_id, website_url, xpath):
                    Cache.removeDict(cache.unrecognized_websiteUrl_dict, website_id)
            except Exception as e:
                log.logMsg(LogType.error,
                           "[FilterContentUrlThread.unrecognizedHandler] %s %s" % (website_url, traceback.format_exc()))
def logMsg(logType, msg, website_id="", content_url=""): """ @summary: 把日志放到redis中(partialNone要放到数据库中) :param logType: 日志类型 :param msg: 日志内容 :param website_id: 网站id :param content_url: 内容url :return: """ if logType == LogType.error and msg: msg = "》Error《:%s" % msg elif logType == LogType.htmlSelectorNone or logType == LogType.partialNone: msg = "?Warning?:%s" % msg elif logType == LogType.success: msg = "【Success】:%s" % msg else: msg = "--Other--:%s" % msg if logType == LogType.partialNone: Mysql.writeWebsiteMsg(website_id, content_url) Cache.putQueue(cache.log_queue, msg)
def run(self):
    while not global_EXIT:
        try:
            if Cache.qempty(cache.websiteUrl_queue):
                records = mysql.Mysql.queryWebsiteUrl()
                for record in records:  # record: id, url, xpath, detail, delay_time
                    record = [str(item) for item in record]
                    self.initWebsite_delay_dict(record)
                    t = threading.Thread(target=self.putRecord, args=(record,))
                    t.daemon = True
                    t.start()
        except Exception as e:
            log.logMsg(LogType.error, "[QueryWebsiteUrlThread] %s" % (traceback.format_exc()))
        for i in range(60):
            if global_EXIT:
                break
            time.sleep(1)
def _chrome_getPagesource(page_source_q, url, timeout):
    driver, ip, port = None, None, None
    while not Cache.getDict(cache.globalArgs_dict, "global_EXIT") and page_source_q.empty():
        try:
            if system == "Linux":
                chrome_options = Options()
                chrome_options.add_argument('--headless')
                chrome_options.add_argument('--disable-gpu')
            else:
                os.environ["webdriver.chrome.driver"] = chromedriver
                chrome_options = webdriver.ChromeOptions()
            if Spider._useProxy(url):
                proxies, ip, port = Spider._getproxy()
                if ip and port:
                    chrome_options.add_argument("--proxy-server=http://%s:%s" % (ip, port))
            if system == "Linux":
                driver = webdriver.Chrome(chrome_options=chrome_options)
            else:
                driver = webdriver.Chrome(chromedriver, chrome_options=chrome_options)
            driver.get(url)
            time.sleep(timeout)
            js = "document.body.scrollTop=1000"  # scroll down to trigger lazy-loaded content
            driver.execute_script(js)
            time.sleep(3)
            page_source = driver.page_source
            driver.close()
            driver.quit()
            if page_source and Spider._pagesourceLegal(page_source):
                page_source_q.put(page_source)
        except Exception as e:
            if ip:
                redis_client.delete(ip)  # drop the failing proxy IP from the pool
            if driver:
                driver.close()
                driver.quit()
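# The helper above pushes at most one usable page source onto the queue it is
# given. A caller might drive it like this, assuming it is exposed as a static
# method of Spider (the URL and timeout values are illustrative):
import queue
import threading

page_source_q = queue.Queue(maxsize=1)
t = threading.Thread(target=Spider._chrome_getPagesource,
                     args=(page_source_q, "http://example.com", 5))
t.daemon = True
t.start()
try:
    page_source = page_source_q.get(timeout=60)  # wait up to a minute for a page
except queue.Empty:
    page_source = None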
def run(self):
    while not global_EXIT:
        url = ""
        try:
            url = Cache.randomKey(cache.unrecognized_contentUrl_dict)
            if url:
                website_id = Cache.getDict(cache.unrecognized_contentUrl_dict, url)
                res = filterContentInfoFunc(website_id, url)
                if res == SpiderResType.success or res == SpiderResType.alreadyExist:
                    Cache.removeDict(cache.unrecognized_contentUrl_dict, url)
                    Cache.appendList(cache.oldContent_list, url)
            for i in range(300):
                if global_EXIT:
                    break
                time.sleep(1)
        except Exception as e:
            log.logMsg(LogType.error,
                       "[FilterContentInfoThread.freshHandler] %s %s" % (url, traceback.format_exc()))
def _urllib_getPagesource(q, url):
    while not Cache.getDict(cache.globalArgs_dict, "global_EXIT") and q.empty():
        proxies, ip, port = None, None, None
        try:
            if Spider._useProxy(url):
                proxies, ip, port = Spider._getproxy()
            if proxies:
                proxy_handler = urllib.request.ProxyHandler(proxies)
                opener = urllib.request.build_opener(proxy_handler)
                opener.addheaders = [('User-agent', user_agent())]
                res = opener.open(url, timeout=5)
                page_source = res.read().decode("utf8")
            else:
                req = urllib.request.Request(url, headers={"User-agent": user_agent()})
                resp = urllib.request.urlopen(req)
                page_source = resp.read().decode("utf8")
            if page_source and Spider._pagesourceLegal(page_source):
                q.put(page_source)
        except Exception as e:
            if ip:
                redis_client.delete(ip)
def incrDelay_time(website_id, timeout):
    """
    @summary: Add timeout seconds to the website's crawl delay.
    """
    record = Cache.getDict(cache.websiteDelay_dict, website_id)
    Cache.setDict(cache.websiteDelay_dict, website_id, int(record) + timeout)
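# Example: a site currently delayed by 60 seconds ends up at 960 seconds after
# one crawl that finds no new links, as done in filterContentUrlFunc() above
# (the site id "42" is hypothetical):
Cache.setDict(cache.websiteDelay_dict, "42", 60)
incrDelay_time("42", 900)
print(Cache.getDict(cache.websiteDelay_dict, "42"))  # the stored delay is now 960 seconds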
def content_count():
    """
    @summary: Show the number of content URLs already crawled.
    """
    print("content count: %s" % (Cache.listLength(cache.oldContent_list)))
import threading
import os
import datetime
import queue

from Spider.cache import Cache
from Spider.config import LOG_DIR
from Spider.mysql import Mysql

cache = Cache()


class LogType(object):
    error = 0
    htmlSelectorNone = 1
    partialNone = 2
    success = 3
    other = 4


def logMsg(logType, msg, website_id="", content_url=""):
    """
    @summary: Push the log message to Redis (partialNone entries are also written to the database).
    :param logType: log type
    :param msg: log message
    :param website_id: website id
    :param content_url: content url
    :return:
    """
    if logType == LogType.error and msg:
        msg = "》Error《:%s" % msg
def initGlobalArgs():
    """
    @summary: Initialize the global flags.
    """
    Cache.setDict(cache.globalArgs_dict, "LogThread_EXIT", False)
    Cache.setDict(cache.globalArgs_dict, "global_EXIT", False)
def initdb():
    """
    @summary: Clear the data stored in Redis.
    """
    Cache.flushdb(cache.websiteDelay_dict)
    Cache.flushdb(cache.workingWebsite_list)
    Cache.flushdb(cache.websiteUrl_queue)
    Cache.flushdb(cache.oldContent_list)
    Cache.flushdb(cache.freshContentUrl_queue)
    Cache.flushdb(cache.log_queue)
    Cache.flushdb(cache.unrecognized_websiteUrl_dict)
    Cache.flushdb(cache.unrecognized_contentUrl_dict)
    Cache.flushdb(cache.globalArgs_dict)
import queue
import threading
import time
import traceback

from tabulate import tabulate

from Spider import mysql
from Spider import log
from Spider import thumbnail
from Spider.log import LogType
from Spider.spider import Spider
from Spider.cache import Cache
from Spider.common import (imgSrcHandler, hrefHandler, filterPureTag, incrDelay_time,
                           brief, randomImg, spaceHandler, filterHrefs)
from Spider.config import global_Chrome
from Spider.models import SpiderResType

cache = Cache()
db = mysql.Mysql()
global_EXIT = False


def filterContentUrlFunc(website_id, website_url, xpath):
    """
    @summary: Filter out the content URLs from a website's listing page.
    """
    try:
        spiderRes = Spider().chromedriver(website_url)
        html_selector = spiderRes.selector
        if html_selector is None:
            log.logMsg(LogType.htmlSelectorNone,
                       "[FilterContentUrlThread] %s %s" % (website_url, "html_selector is None."))
            return False