def __init__(self):
    super(WxSpider, self).__init__()
    # self.log must be set
    self.log = UtilLogger(
        'WxSpider',
        os.path.join(os.path.dirname(os.path.abspath(__file__)),
                     'log_WxSpider.log'))
    self.log_record = UtilLogger(
        'SourceSpider',
        os.path.join(os.path.dirname(os.path.abspath(__file__)),
                     'log_SourceSpider.log'))
    self.ext = Wx_extractor()
    # self.new_store = SourceStore(config.TEST_DB)
    self.step = 500  # 800 per batch
    self.queue_maxsize = 1500
    self.sended_queue_maxsize = 3000
    self.table_count = 1000000
    self.table_index = 8
    self.md5_table = "news_md5"
    self.s_table = "news_{}"
    self.create_table_sql = """
        create table news_{} like news_copy;
    """
    # """
    # CREATE TABLE `news_{}` (
    #   `id` int(10) unsigned NOT NULL AUTO_INCREMENT,
    #   `title` varchar(512) NOT NULL COMMENT 'title',
    #   `summary` text NOT NULL COMMENT 'summary',
    #   `content` longtext CHARACTER SET utf8mb4 NOT NULL COMMENT 'article html content',
    #   `wechat_name` varchar(255) NOT NULL COMMENT 'WeChat name',
    #   `wechat_num` varchar(255) DEFAULT '' COMMENT 'WeChat official account id',
    #   `keyword` varchar(100) NOT NULL COMMENT 'keyword',
    #   `create_time` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP,
    #   PRIMARY KEY (`id`)
    # ) ENGINE=InnoDB AUTO_INCREMENT=1043500 DEFAULT CHARSET=utf8
    # """
    self.spider_count = 0
    self.repeat_count = 0
    self.no_china_count = 0
class WxSpider(BaseSpiderSign):
    """
    Crawl WeChat articles.
    Flow:
    1. Fetch the distinct keywords from the database.
    2. Entry point: search each keyword on Sogou WeChat search and parse the result list page.
       Entry url: http://weixin.sogou.com/weixin?type=2&s_from=input&query=%E4%BD%A0%E5%93%88&ie=utf8&_sug_=y&_sug_type_=&w=01019900&sut=2889&sst0=1511337321983&lkt=0%2C0%2C0
    3. Parse the pager of the list page to get the number of result pages for the keyword, then build the urls of the remaining pages.
    4. Parse the article links and summaries on the list pages and pass them to the detail-page parser.
    5. Parse the WeChat article pages and store them in the database.
    """

    def __init__(self):
        super(WxSpider, self).__init__()
        # self.log must be set
        self.log = UtilLogger(
            'WxSpider',
            os.path.join(os.path.dirname(os.path.abspath(__file__)),
                         'log_WxSpider.log'))
        self.log_record = UtilLogger(
            'SourceSpider',
            os.path.join(os.path.dirname(os.path.abspath(__file__)),
                         'log_SourceSpider.log'))
        self.ext = Wx_extractor()
        # self.new_store = SourceStore(config.TEST_DB)
        self.step = 100  # 800 per batch
        self.queue_maxsize = 800  # sending queue limit
        self.sended_queue_maxsize = 1000  # sent (in-flight) limit
        self.table_count = 1000000
        self.table_index = 0
        self.md5_table = "news_md5"
        self.s_table = "news_{}"
        self.create_table_sql = """
            create table news_{} like news_copy;
        """
        self.spider_count = 0
        self.repeat_count = 0
        self.no_china_count = 0

    def get_user_password(self):
        # return 'zhouhao', 'zhspider'
        # return 'xuliang', 'xlspider'
        return 'sunxiang', 'sxspider'

    def send_get_spider(self, urls):
        """
        Build a GET SpiderRequest and push it onto the download queue.
        """
        basic_request = SpiderRequest(
            headers={'User-Agent': random.choice(self.pc_user_agents)},
            urls=urls,
            config={"redirect": 1})
        self.sending_queue.put_nowait(basic_request)

    def is_get_tasks(self):
        if self.sended_queue.qsize() < self.sended_queue_maxsize and self.sending_queue.qsize() < self.queue_maxsize \
                and self.response_queue.qsize() < self.queue_maxsize and self.store_queue.qsize() < self.queue_maxsize:
            return True
        else:
            return False

    def start_requests(self):
        try:
            while 1:
                if self.is_get_tasks():
                    db = StoreMysql(**config.local_content)
                    update_time = str(datetime.now()).split(".")[0]
                    sql = "select id, keyword from keywords where status = 1 order by update_time asc, priority desc limit 0, {}".format(
                        self.step)
                    rows = db.query(sql)
                    self.log_record.info(
                        "datetime:{},task_results length:{}".format(
                            datetime.now(), len(rows)))
                    ids = list()
                    if rows:
                        for word in rows:
                            task_id = word[0]
                            ids.append({
                                "id": task_id,
                                "update_time": update_time
                            })
                            keyword = word[1]
                            for i in range(1, 11):
                                send_url = "http://weixin.sogou.com/weixin?query={}&_sug_type_=&s_from=input&_sug_=n&type=2&page={}&ie=utf8".format(
                                    keyword, i)
                                urls = [{
                                    "url": send_url,
                                    "type": 1,
                                    "ext_type": 1,
                                    'keyword': keyword,
                                    'task_id': task_id,
                                    'unique_key': self.get_unique_key()
                                }]
                                self.send_get_spider(urls)
                        self.stores[0].store_table(ids, "keywords", type=2, field="id")
                    else:
                        time.sleep(60 * 10)
                    db.close()
                time.sleep(60 * 1)
        except Exception:
            print(traceback.format_exc())

    def get_stores(self):
        """
        Multiple data sources can be defined.
        :return:
        """
        stores = list()
        stores.append(SourceStore(config.local_content))
        self.stores = stores
        return stores

    def deal_response_results_status(self, task_status, url, result, request):
        """
        Handle tasks whose task_status is 2 or 3.
        Retries are returned as a list; if a retry needs different headers, define that yourself.
        :param task_status:
        :param url:
        :param result:
        :param request:
        :return:
        """
        if task_status == '2':
            ext_type = url["ext_type"]
            if ext_type == 1:
                self.deal_response_list(url, result['result'])
            elif ext_type == 2:
                self.deal_response_detail(url, result['result'])
        else:
            self.log.info("status is 3 url:{}; headers:{}; config:{}".format(
                url["url"], request.headers, request.config))

    # @fn_timer
    def deal_response_list(self, url, html):
        try:
            keyword = url['keyword']
            task_id = url['task_id']
            info_list = self.ext.list_extractor(html, keyword, task_id)
            if info_list == -1:
                self.log.info("deal_response_list exception url:{}".format(
                    url["url"]))
            else:
                self.store_queue.put({"result": info_list, "type": 1})
        except:
            print(traceback.format_exc())

    # @fn_timer
    def deal_response_detail(self, url, html):
        try:
            info = url['info']
            info.pop("we_name")
            res = self.ext.detail_extractor(html, info)
            if res != -1:
                self.store_queue.put({"result": res, "type": 2})
            else:
                self.log.info("deal_response_detail exception url:{}".format(
                    url["url"]))
        except:
            print(traceback.format_exc())

    def to_store_results(self, results, stores):
        """
        type 1: list page, de-duplicated by title + account name
             2: detail page data
        :param results:
        :param stores:
        :return:
        """
        try:
            result = results["result"]
            type = results["type"]
            if type == 1:
                # log_start = time.time()
                for info in result:
                    log_md5 = UtilMD5.md5(info["title"] + info["we_name"])
                    sql = "insert ignore into {}(md5) values('{}')".format(
                        self.md5_table, str(log_md5))
                    s_id = stores[0].insert_row(sql)
                    if s_id > 0:
                        # self.spider_count += 1
                        urls = [{
                            "url": info['url'],
                            "type": 1,
                            "ext_type": 2,
                            'info': info,
                            'unique_key': self.get_unique_key()
                        }]
                        self.send_get_spider(urls)
                    else:
                        self.repeat_count += 1
                    if self.repeat_count > 1000:
                        self.log_record.info("repeat_count:{}".format(
                            self.repeat_count))
                        self.repeat_count = 0
            elif type == 2:
                data = result
                if not self.judge_china(data["content"]):
                    # no Chinese text
                    self.no_china_count += 1
                    if self.no_china_count > 1000:
                        self.log_record.info("no_china_count:{}".format(
                            self.no_china_count))
                        self.no_china_count = 0
                    return
                weixin_content = {
                    "summary": data.get("summary", ""),
                    "content": data.get("content", ""),
                    "keyword": data.get("keyword", ""),
                    "title": data.get("title", ""),
                    "wechat_name": data.get("wechat_name", ""),
                    "wechat_num": data.get("wechat_num", "")
                }
                s_id = stores[0].store_table_one(
                    weixin_content, "news_{}".format(self.table_index))
                if s_id > 0:
                    if s_id % self.table_count == 0:
                        db = StoreMysql(**config.weixin_content)
                        update_sql = "update spider_table set status = 1 where table_name = 'news_{}'".format(
                            self.table_index)
                        db.do(update_sql)
                        self.table_index += 1
                        db.do(self.create_table_sql.format(self.table_index))
                        insert_sql = "insert into spider_table(table_name) values('news_{}')".format(
                            self.table_index)
                        db.do(insert_sql)
                        time.sleep(1)
                        db.close()
                else:
                    time.sleep(0.1)
                time.sleep(2)
        except:
            print(traceback.format_exc())

    def judge_china(self, c_text):
        zhPattern = re.compile(u'[\u4e00-\u9fa5]+')
        match = zhPattern.search(u"" + str(c_text))
        if match:
            return True
        else:
            return False

    def send_wait(self):
        """
        Throttle the rate of requests sent to the download center.
        """
        time.sleep(1)
class WxSpider(BaseSpiderSign):
    """
    Crawl WeChat articles.
    Flow:
    1. Fetch the distinct keywords from the database.
    2. Entry point: search each keyword on Sogou WeChat search and parse the result list page.
       Entry url: http://weixin.sogou.com/weixin?type=2&s_from=input&query=%E4%BD%A0%E5%93%88&ie=utf8&_sug_=y&_sug_type_=&w=01019900&sut=2889&sst0=1511337321983&lkt=0%2C0%2C0
    3. Parse the pager of the list page to get the number of result pages for the keyword, then build the urls of the remaining pages.
    4. Parse the article links and summaries on the list pages and pass them to the detail-page parser.
    5. Parse the WeChat article pages and store them in the database.
    """

    def __init__(self):
        super(WxSpider, self).__init__()
        # self.log must be set
        self.log = UtilLogger(
            'WxSpider',
            os.path.join(os.path.dirname(os.path.abspath(__file__)),
                         'log_WxSpider.log'))
        self.log_record = UtilLogger(
            'SourceSpider',
            os.path.join(os.path.dirname(os.path.abspath(__file__)),
                         'log_SourceSpider.log'))
        self.ext = Wx_extractor()
        # self.new_store = SourceStore(config.TEST_DB)
        self.step = 1000
        self.sended_queue_maxsize = 2000
        self.spider_count = 0
        self.repeat_count = 0

    def get_user_password(self):
        return 'zhouhao', 'zhspider'
        # return 'xuliang', 'xlspider'

    def send_get_spider(self, urls):
        """
        Build a GET SpiderRequest and push it onto the download queue.
        """
        basic_request = SpiderRequest(
            headers={'User-Agent': random.choice(self.pc_user_agents)},
            urls=urls,
            config={"redirect": 1})
        self.sending_queue.put_nowait(basic_request)

    def start_requests(self):
        try:
            while 1:
                if self.sended_queue.qsize() < self.sended_queue_maxsize and self.sending_queue.qsize() < self.sended_queue_maxsize \
                        and self.response_queue.qsize() < self.sended_queue_maxsize and self.store_queue.qsize() < self.sended_queue_maxsize:
                    db = StoreMysql(**config.weixin_content)
                    source = SourceStore(config.weixin_content)
                    update_time = str(datetime.now()).split(".")[0]
                    sql = "select id, keyword from content_center.keywords order by update_time asc, priority desc limit 0, {};".format(
                        self.step)
                    rows = db.query(sql)
                    self.log.info("datetime:{},task_results length:{}".format(
                        datetime.now(), len(rows)))
                    ids = list()
                    if rows:
                        for word in rows:
                            task_id = word[0]
                            ids.append({
                                "id": task_id,
                                "update_time": update_time
                            })
                            keyword = word[1]
                            send_url = 'http://weixin.sogou.com/weixin?type=2&s_from=input&query={}&ie=utf8&_sug_=y&_sug_type_='.format(
                                keyword)
                            urls = [{
                                "url": send_url,
                                "type": 1,
                                "ext_type": 3,
                                'keyword': keyword,
                                'task_id': task_id,
                                'unique_key': self.get_unique_key()
                            }]
                            self.send_get_spider(urls)
                        source.store_table(ids, "keywords", type=2, field="id")
                    db.close()
                time.sleep(60 * 2)
        except Exception:
            print(traceback.format_exc())

    def get_stores(self):
        """
        Multiple data sources can be defined.
        :return:
        """
        stores = list()
        stores.append(SourceStore(config.weixin_spider))
        self.stores = stores
        return stores

    def deal_response_results_status(self, task_status, url, result, request):
        """
        Handle tasks whose task_status is 2 or 3.
        Retries are returned as a list; if a retry needs different headers, define that yourself.
        :param task_status:
        :param url:
        :param result:
        :param request:
        :return:
        """
        if task_status == '2':
            ext_type = url["ext_type"]
            if ext_type == 3:
                self.deal_response_page(url, result['result'])
            elif ext_type == 1:
                self.deal_response_list(url, result['result'])
            elif ext_type == 2:
                self.deal_response_detail(url, result['result'])
        else:
            self.log.info("status is 3 url:{}; headers:{}; config:{}".format(
                url["url"], request.headers, request.config))

    def deal_response_page(self, url, html):
        try:
            keyword = url['keyword']
            task_id = url['task_id']
            page = self.ext.page_extractor(html)
            if page == -1:
                self.log.info("deal_response_page exception url:{}".format(
                    url["url"]))
            else:
                # at most 10 pages
                page_c = 10
                if page < 10:
                    page_c = page
                for i in range(1, page_c + 1):
                    send_url = "http://weixin.sogou.com/weixin?query={}&_sug_type_=&s_from=input&_sug_=n&type=2&page={}&ie=utf8".format(
                        keyword, i)
                    urls = [{
                        "url": send_url,
                        "type": 1,
                        "ext_type": 1,
                        'keyword': keyword,
                        'task_id': task_id,
                        'unique_key': self.get_unique_key()
                    }]
                    self.send_get_spider(urls)
        except:
            print('ext page count error!{}'.format(url))

    # @fn_timer
    def deal_response_list(self, url, html):
        try:
            keyword = url['keyword']
            task_id = url['task_id']
            # list-page parsing logic:
            info_list = self.ext.list_extractor(html, keyword, task_id)
            if info_list == -1:
                self.log.info("deal_response_list exception url:{}".format(
                    url["url"]))
            else:
                self.store_queue.put({"result": info_list, "type": 1})
        except:
            print(traceback.format_exc())

    # @fn_timer
    def deal_response_detail(self, url, html):
        try:
            info = url['info']
            info.pop("we_name")
            res = self.ext.detail_extractor(html, info)
            if res != -1:
                self.store_queue.put({"result": res, "type": 2})
            else:
                self.log.info("deal_response_detail = -1 url:{}".format(
                    url["url"]))
        except:
            print(traceback.format_exc())

    def to_store_results(self, results, stores):
        """
        type 1: list page, de-duplicated by title + account name
             2: detail page data
        :param results:
        :param stores:
        :return:
        """
        try:
            result = results["result"]
            type = results["type"]
            if type == 1:
                log_start = time.time()
                for info in result:
                    log_md5 = UtilMD5.md5(info["title"] + info["we_name"])
                    sql = "insert ignore into spider_log(md5, type) values('{}', '1')".format(
                        str(log_md5))
                    s_id = stores[0].insert_row(sql)
                    if s_id > 0:
                        self.spider_count += 1
                        urls = [{
                            "url": info['url'],
                            "type": 1,
                            "ext_type": 2,
                            'info': info,
                            'unique_key': self.get_unique_key(),
                        }]
                        self.send_get_spider(urls)
                    else:
                        self.repeat_count += 1
                        # self.log_record.info("spider_log title:{}".format(info["title"]))
                    if self.spider_count > 1000:
                        self.log_record.info("spider_count:{}".format(
                            self.spider_count))
                        self.spider_count = 0
                    if self.repeat_count > 1000:
                        self.log_record.info("repeat_count:{}".format(
                            self.repeat_count))
                        self.repeat_count = 0
                t_inter = int(time.time() - log_start)
                if t_inter > 5:
                    self.log_record.info("spider_log time:{}".format(t_inter))
            elif type == 2:
                # data_start = time.time()
                data = result
                ke_id = str(data["keyword_id"])[-1:]
                spider_weixin = 'spider_weixin_{}'.format(ke_id)
                spider_weixin_content = 'spider_weixin_content_{}'.format(
                    ke_id)
                if not self.judge_china(data["content"]):
                    # self.log_record.info("spider_weixin_lang add")
                    return
                    # spider_weixin = "spider_weixin_lang"
                    # spider_weixin_content = "spider_weixin_content_lang"
                weixin_content = {
                    "summary": data.pop("summary", ""),
                    "content": data.pop("content", ""),
                    "keyword_id": data.get("keyword_id", 0),
                    "keyword": data.get("keyword", "")
                }
                s_id = stores[0].store_table_one(data, spider_weixin)
                if s_id > 0:
                    weixin_content["id"] = s_id
                    stores[0].store_table_one(weixin_content,
                                              spider_weixin_content)
                # self.log_record.info("data_weixin time:{}".format(time.time() - data_start))
        except:
            print(traceback.format_exc())

    def judge_china(self, c_text):
        zhPattern = re.compile(u'[\u4e00-\u9fa5]+')
        match = zhPattern.search(u"" + str(c_text))
        if match:
            return True
            # contains Chinese: match.group(0)
        else:
            return False

    def send_wait(self):
        """
        Throttle the rate of requests sent to the download center.
        """
        if self.sended_queue.qsize() > 4000:
            time.sleep(0.4)
        elif self.sending_queue.qsize() < 10000:
            time.sleep(0.4)
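# --- Example: Chinese-text check used by judge_china ---
# A minimal sketch of the same CJK-range check. The demo passes unicode text
# directly; the u"" + str(c_text) conversion above depends on Python 2's
# default encoding when the content contains non-ASCII characters.
# -*- coding: utf-8 -*-
import re

# Same pattern as judge_china: the common range of Chinese characters
# (CJK Unified Ideographs).
ZH_PATTERN = re.compile(u'[\u4e00-\u9fa5]+')


def contains_chinese(text):
    """Return True if the text contains at least one Chinese character."""
    return ZH_PATTERN.search(text) is not None


if __name__ == "__main__":
    print(contains_chinese(u"WeChat article about 微信"))  # True
    print(contains_chinese(u"pure ASCII content"))         # False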
class BaseRankSpider(BaseSpiderSign):
    """
    Hierarchy expansion; exact-match changes.
    1. The top-most rank is kept in memory and sent once the urls have been checked.
    2. The real url is resolved locally and returned.
    """
    search_device = None
    extractor = None

    def __init__(self):
        super(BaseRankSpider, self).__init__()
        # periodic sleep time, minutes
        self.difsecond = 180
        log_path = os.path.dirname(
            os.path.dirname(os.path.dirname(
                os.path.abspath(__file__)))) + "/logs/"
        self.log = UtilLogger('PageSpider', log_path + 'log_page_spider')
        self.log_record = UtilLogger('RecordPageSpider',
                                     log_path + 'log_record_page_spider')
        self.rank_store = RankStore()
        self.history_store = RankHistoryStore()
        self.sleep_time = 60 * 15  # sleep time when there are no tasks
        self.sended_queue_maxsize = 3000  # send limit
        self.send_one_tasks = 800  # tasks fetched per batch
        self.reset_task_time = 60 * 60  # 1 hour
        # self.saveport = 3  # port
        self.task_table = "task"
        self.conf_finish_state = False
        self.re_send_count = 4
        self.db_pool = StoreMysqlPool(**config.baidu_spider_move)

    def get_user_password(self):
        return 'fxt', 'fxt_spider'

    def removeCharacters(self, previou_url):
        if previou_url.startswith("https://"):
            previou_url = previou_url.replace("https://", "")
        if previou_url.startswith("http://"):
            previou_url = previou_url.replace("http://", "")
        if previou_url.endswith("/"):
            previou_url = previou_url[0:len(previou_url) - 1]
        return previou_url

    def start_requests(self):
        try:
            while True:
                print(self.sended_queue.qsize())
                if self.sended_queue.qsize() < self.sended_queue_maxsize and self.sending_queue.qsize() < self.sended_queue_maxsize \
                        and self.response_queue.qsize() < self.sended_queue_maxsize and self.store_queue.qsize() < self.sended_queue_maxsize:
                    device = self.search_device.name if self.search_device != DeviceEnums.pc_360 else '360_pc'
                    task_results = self.rank_store.find_task_lists(
                        device, self.send_one_tasks)
                    if len(task_results) > 0:
                        print("datetime:{},task_results length:{}".format(
                            datetime.now(), len(task_results)))
                        for result in task_results:
                            # id, keyword, urlAddress, device, page, searchType, keyword_id, saveport
                            task_id = result[0]
                            keyword = result[1]
                            target_url = result[2]
                            page = result[3]
                            spidertype = result[4]  # partial (non-exact) match
                            keyword_id = result[5]
                            site_name = result[6]
                            req = self.get_request_param(
                                task_id, keyword, target_url, page, spidertype,
                                keyword_id, site_name, 1)
                            basic_request = SpiderRequest(
                                headers=req['headers'],
                                urls=req['urls'],
                                config=req['configs'])
                            self.sending_queue.put(basic_request)
                        time.sleep(20)
                    else:
                        time.sleep(self.sleep_time)
                else:
                    time.sleep(self.sleep_time)
        except Exception:
            print(traceback.format_exc())

    def deal_rank_spider_response(self, url, html, r_capture, request, ip):
        page = url["page"]  # total number of pages
        pnum = url["pnum"]  # current page number
        pcount = (pnum - 1) * 10
        result = self.extractor.extractor(html,
                                          ck=url['ckurl'],
                                          site_name=url['site_name'],
                                          pcount=pcount)
        if result == 0:
            self.log_record.info("extractor failure result 0")
            self.store_rank(url, -2, html, ip)
        elif type(result) == int:
            self.store_rank(url, -1, html, ip)
            self.log_record.info(
                "extractor failure deal_baidu_response_pc url:{} request:{}".format(
                    url["url"], request.headers['User-Agent']))
            return True
        else:
            if "rank" in result:
                # for rank_result in result["rank"]:
                self.store_rank(url,
                                result["rank"],
                                html,
                                ip,
                                realaddress=result["realaddress"],
                                r_capture=r_capture)
            elif pnum <= page:
                req = self.get_request_param(task_id=url["id"],
                                             keyword=url["keyword"],
                                             target_url=url["ckurl"],
                                             page=url["page"],
                                             spidertype=url["spidertype"],
                                             keyword_id=url["keyword_id"],
                                             site_name=url['site_name'],
                                             pnum=pnum + 1)
                basic_request = SpiderRequest(headers=req['headers'],
                                              urls=req['urls'],
                                              config=req['configs'])
                self.sending_queue.put(basic_request)
            else:
                self.store_rank(url, -2, html, ip)

    @abc.abstractmethod
    def get_request_param(self, task_id, keyword, target_url, page, spidertype,
                          keyword_id, site_name, pnum):
        """{'headers':{}, 'configs':{}, 'url':''}"""
        return

    def store_rank(self, url, rank, response_body, ip, realaddress="", r_capture=""):
        item = dict()
        item["keyword"] = url["keyword"]
        item["rank"] = rank
        item["taskId"] = int(url["id"])
        item["device"] = url["search_device"]
        item["response_body"] = response_body
        item['ip'] = ip
        if realaddress != "":
            item["urlAddress"] = realaddress
        else:
            item["urlAddress"] = ""
        self.store_queue.put({
            "result": item,
            "task_id": url["id"],
            "type": StoreTypeEnums.mysql.value,
            "rank": rank,
            "keyword_id": url["keyword_id"],
            "r_capture": r_capture
        })

    def get_stores(self):
        stores = list()
        stores.append(SourceStore(config.baidu_spider_move))
        self.stores = stores
        return stores

    def query_status(self, id):
        """
        Query the task status.
        """
        db = StoreMysql(**config.baidu_spider_move)
        query_status_sql = 'select `status` from {} where id = {}'.format(
            self.rank_store.table, id)
        try:
            result = db.query(query_status_sql)
            db.close()
            return result[0][0]
        except:
            print("query_status error")
            traceback.print_exc()

    def deal_response_results_status(self, task_status, url, result, request):
        try:
            status = self.query_status(url['id'])
            if status is not None and status <= 2:
                if task_status == '2':
                    r_html = ''
                    r_capture = ''
                    r_l = result["result"].split("||||")
                    if len(r_l) == 1:
                        # no screenshot
                        r_html = r_l[0]
                        r_capture = ""
                    elif len(r_l) == 2:
                        # with screenshot
                        r_capture = r_l[0]
                        r_html = r_l[1]
                    ip = result['inter_pro']
                    self.deal_rank_spider_response(url, r_html, r_capture,
                                                   request, ip)
                else:
                    # handle failures case by case
                    ip = result['inter_pro']
                    self.store_rank(url, -1, result["result"], ip)
                    self.log.info('spider failure:%s' % url)
                    self.re_send(url, request)
        except:
            print("deal_response_results_status error")
            traceback.print_exc()

    def re_send(self, url, request):
        self.log_record.info("re_send url:{}, User-Agent:{}".format(
            url["url"], request.headers["User-Agent"]))
        retry_urls = list()
        if "conf_search_count" in url:
            if int(url["conf_search_count"]) < self.re_send_count:
                url["conf_search_count"] = int(url["conf_search_count"]) + 1
                retry_urls.append(url)
            else:
                self.log_record.info(
                    "datetime:{}; state_url:{}; heasers:{}; config:{}".format(
                        datetime.now(), url["url"], request.headers,
                        request.config))
                return
        else:
            url["conf_search_count"] = 1
            retry_urls.append(url)
        new_request = SpiderRequest(headers=request.headers,
                                    config=request.config)
        new_request.urls = retry_urls
        new_request.config["priority"] = 3
        new_request.headers["User-Agent"] = UserAgentUtil().random_one(
            self.search_device)
        self.sending_queue.put(new_request)

    def send_response_body_cos(self, response_body, keyword_id, device, ip):
        """
        Store the response_body in Tencent Cloud COS.
        """
        try:
            region = config.qcloud_cos.get('region')
            app_id = config.qcloud_cos.get('app_id')
            secret_id = config.qcloud_cos.get('secret_id')
            secret_key = config.qcloud_cos.get('secret_key')
            token = config.qcloud_cos.get('token')
            scheme = config.qcloud_cos.get('scheme')
            bucket = config.qcloud_cos.get('bucket')
            prefix = config.qcloud_cos.get('prefix')
            db_name = config.baidu_spider_move.get("db")
            filename = "{prefix}/html/{date}/{device}/{db_name}/{keyword_id}_{ip}.txt".format(
                prefix=prefix,
                date=date.today().isoformat(),
                device=device,
                db_name=db_name,
                keyword_id=keyword_id,
                ip=ip)
            cos_config = CosConfig(Region=region,
                                   Appid=app_id,
                                   SecretId=secret_id,
                                   SecretKey=secret_key,
                                   Token=token,
                                   Scheme=scheme)
            client = CosS3Client(cos_config)
            response = client.put_object(Bucket=bucket,
                                         Body=response_body,
                                         Key=filename,
                                         StorageClass='STANDARD',
                                         EnableMD5=False)
            print(response['ETag'])
        except Exception as e:
            print("save_response_body_cos error: {}".format(e))

    # @timeout(10)
    def to_store_results(self, results, stores):
        """
        results
        type 1: delete the task normally and insert the rank; 2, 3: needs a check, 3: an exact match was found
        task_id: id in the task table
        """
        try:
            # start_time = time.time()
            task_id = results["task_id"]
            keyword_id = results["keyword_id"]
            result = results["result"]
            rank = result["rank"]
            response_body = result["response_body"]
            screenshot_url = ''
            device = result["device"]
            ip = result['ip']
            if config.is_send_html_to_cos:
                self.send_response_body_cos(response_body, keyword_id, device,
                                            ip)
            if results.get("r_capture", "") != "":
                r_capture = results.get("r_capture")
                r_capture_bin = base64.b64decode(r_capture)
                m = hashlib.md5()
                m.update(r_capture_bin)
                md5 = m.hexdigest()
                region = config.qcloud_cos.get('region')
                app_id = config.qcloud_cos.get('app_id')
                secret_id = config.qcloud_cos.get('secret_id')
                secret_key = config.qcloud_cos.get('secret_key')
                token = config.qcloud_cos.get('token')
                scheme = config.qcloud_cos.get('scheme')
                bucket = config.qcloud_cos.get('bucket')
                prefix = config.qcloud_cos.get('prefix')
                db_name = config.baidu_spider_move.get("db")
                filename = "{prefix}/rank_imgs/{date}/{device}/{db_name}/{keyword_id}_{ip}.png".format(
                    prefix=prefix,
                    date=date.today().isoformat(),
                    device=device,
                    db_name=db_name,
                    keyword_id=keyword_id,
                    ip=ip)
                cos_config = CosConfig(Region=region,
                                       Appid=app_id,
                                       SecretId=secret_id,
                                       SecretKey=secret_key,
                                       Token=token,
                                       Scheme=scheme)
                client = CosS3Client(cos_config)
                response = client.put_object(Bucket=bucket,
                                             Body=r_capture_bin,
                                             Key=filename,
                                             StorageClass='STANDARD',
                                             EnableMD5=False)
                print(response['ETag'])
                screenshot_url = "https://{}.cos.{}.myqcloud.com/{}".format(
                    bucket, region, filename)
            device = self.search_device.name if self.search_device != DeviceEnums.pc_360 else '360_pc'
            rank_data = [{
                "keywordid": keyword_id,
                "url": result["urlAddress"],
                "rank": rank,
                "device": device,
                "keyword": result["keyword"],
                "screenshot": '',
                "screenshot_url": screenshot_url
            }]
            send_data = {"rankLists": json.dumps(rank_data)}
            flag = self.send_rank_data(config.callback_url, send_data)
            if flag:
                if int(result["rank"]) > 0 and results.get("r_capture", "") == "":
                    self.log.info(
                        "r_capture kong keyword_id:{}, send_url:{}".format(
                            keyword_id, config.callback_url))
                self.log_record.info(
                    "one finish keyword_id:{}, rank:{}; urlAddress: {}".format(
                        keyword_id, result["rank"],
                        result.get("urlAddress", "")))
            else:
                self.log.info(
                    "send exception keyword_id:{}, send_url:{}".format(
                        keyword_id, config.callback_url))
                # self.log.info("send exception data:{}".format(send_data))
            self.rank_store.update_status_id(task_id, result["rank"])
            self.history_store.save(self.search_device.name, keyword_id, rank)
        except:
            print(traceback.format_exc())

    def send_rank_data(self, send_url, send_data):
        for i in xrange(0, 2):
            try:
                request = urllib2.Request(send_url,
                                          data=urllib.urlencode(send_data))
                response = urllib2.urlopen(request, timeout=10)
                res_content = response.read()
                if "success" not in str(res_content):
                    self.log_record.info(
                        "res_content no success send_url: {}".format(send_url))
                    return False
                return True
            except:
                self.log.info(traceback.format_exc())
                self.log.info(send_url + ",send_rank_data: " +
                              urllib.urlencode(send_data))
                time.sleep(2)
        return False

    def reset_task(self):
        """
        Reset the status of the task table.
        """
        while True:
            time.sleep(10)
            # self.log_record.info("reset:%s" % str(datetime.today()))
            self.rank_store.reset_task(self.reset_task_time)
            time.sleep(self.reset_task_time)
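# --- Example: what a concrete get_request_param() might return ---
# get_request_param() is abstract. BaseRankSpider.start_requests() expects a
# dict with 'headers', 'configs' and 'urls', and deal_rank_spider_response() /
# store_rank() later read url["page"], url["pnum"], url["ckurl"],
# url["site_name"], url["id"], url["keyword"], url["spidertype"],
# url["keyword_id"] and url["search_device"] back out of each url dict.
# The sketch below is illustrative only: the search URL template, User-Agent
# pool and config values are assumptions, not the project's real ones.
# -*- coding: utf-8 -*-
import random

# Hypothetical pool; the project draws User-Agents from UserAgentUtil instead.
USER_AGENTS = ["Mozilla/5.0 (Windows NT 10.0; Win64; x64)"]


def get_request_param(task_id, keyword, target_url, page, spidertype,
                      keyword_id, site_name, pnum):
    """Sketch of a concrete get_request_param() for a PC rank spider."""
    search_url = "https://www.baidu.com/s?wd={}&pn={}".format(
        keyword, (pnum - 1) * 10)
    urls = [{
        "url": search_url,
        "id": task_id,
        "keyword": keyword,
        "ckurl": target_url,      # target url checked against the results
        "page": page,             # total pages allowed for this task
        "pnum": pnum,             # current page number
        "spidertype": spidertype,
        "keyword_id": keyword_id,
        "site_name": site_name,
        "search_device": "pc",
    }]
    return {
        "headers": {"User-Agent": random.choice(USER_AGENTS)},
        "configs": {"redirect": 1},
        "urls": urls,
    }


if __name__ == "__main__":
    req = get_request_param(1, u"关键词", "example.com", 3, 1, 42, "example", 1)
    print(req["urls"][0]["url"])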
class BaseSugRankSpider(BaseSpiderSign):
    """
    Hierarchy expansion; exact-match changes.
    1. The top-most rank is kept in memory and sent once the urls have been checked.
    2. The real url is resolved locally and returned.
    """
    search_device = None
    extractor = None

    def __init__(self):
        super(BaseSugRankSpider, self).__init__()
        # periodic sleep time, minutes
        self.difsecond = 180
        log_path = os.path.dirname(
            os.path.dirname(os.path.dirname(
                os.path.abspath(__file__)))) + "/logs/"
        self.log = UtilLogger('SugSpider', log_path + 'log_sug_spider')
        self.log_record = UtilLogger('RecordSugSpider',
                                     log_path + 'log_record_sug_spider')
        self.rank_store = RankStore()
        self.history_store = RankHistoryStore()
        self.sleep_time = 60 * 15  # sleep time when there are no tasks
        self.sended_queue_maxsize = 1500  # send limit
        self.send_one_tasks = 800  # tasks fetched per batch
        self.reset_task_time = 60 * 60  # 1 hour
        # self.saveport = 3  # port
        self.task_table = "task"
        self.conf_finish_state = False
        self.re_send_count = 4
        self.db_pool = StoreMysqlPool(**config.baidu_spider_move)

    def get_user_password(self):
        return 'fxt', 'fxt_spider'

    def start_requests(self):
        try:
            while True:
                print(self.sended_queue.qsize())
                if self.sended_queue.qsize() < self.sended_queue_maxsize and self.sending_queue.qsize() < self.sended_queue_maxsize \
                        and self.response_queue.qsize() < self.sended_queue_maxsize and self.store_queue.qsize() < self.sended_queue_maxsize:
                    device = self.search_device.name if self.search_device != DeviceEnums.pc_360 else '360_pc'
                    task_results = self.rank_store.find_task_lists(
                        device, self.send_one_tasks)
                    if len(task_results) > 0:
                        print("datetime:{},task_results length:{}".format(
                            datetime.now(), len(task_results)))
                        for result in task_results:
                            # id, keyword, urlAddress, device, page, searchType, keyword_id, saveport
                            task_id = result[0]
                            keyword = result[1]
                            target_url = result[2]
                            page = result[3]
                            spidertype = result[4]
                            keyword_id = result[5]
                            req = self.get_request_param(
                                task_id, keyword, target_url, keyword_id)
                            basic_request = SpiderRequest(
                                urls=req['urls'], config=req['configs'])
                            self.sending_queue.put(basic_request)
                        time.sleep(20)
                    else:
                        time.sleep(self.sleep_time)
                else:
                    time.sleep(self.sleep_time)
        except Exception:
            print(traceback.format_exc())

    def deal_rank_spider_response(self, url, html, ip):
        result = self.extractor.extractor(html, url['ckurl'])
        self.store_rank(url, result, html, ip)

    @abc.abstractmethod
    def get_request_param(self, task_id, keyword, target_url, keyword_id):
        """{'headers':{}, 'configs':{}, 'url':''}"""
        return

    def store_rank(self, url, rank, response_body, ip):
        item = dict()
        item["keyword"] = url["keyword"]
        item["rank"] = rank
        item["taskId"] = int(url["id"])
        item['target'] = url['ckurl']
        item["response_body"] = response_body
        item['device'] = url['search_device']
        item['ip'] = ip
        self.store_queue.put({
            "result": item,
            "task_id": url["id"],
            "type": StoreTypeEnums.mysql.value,
            "rank": rank,
            "keyword_id": url["keyword_id"]
        })

    def get_stores(self):
        stores = list()
        stores.append(SourceStore(config.baidu_spider_move))
        self.stores = stores
        return stores

    def deal_response_results_status(self, task_status, url, result, request):
        if task_status == '2':
            self.deal_rank_spider_response(url, result["result"],
                                           result['inter_pro'])
        else:
            # handle failures case by case
            self.store_rank(url, -1, result["result"], result['inter_pro'])
            self.log.info('spider failure:%s' % url)
            self.re_send(url, request)

    def re_send(self, url, request):
        self.log_record.info("re_send url:{}, User-Agent:{}".format(
            url["url"], request.headers["User-Agent"]))
        retry_urls = list()
        if "conf_search_count" in url:
            if int(url["conf_search_count"]) < self.re_send_count:
                url["conf_search_count"] = int(url["conf_search_count"]) + 1
                retry_urls.append(url)
            else:
                self.log_record.info(
                    "datetime:{}; state_url:{}; heasers:{}; config:{}".format(
                        datetime.now(), url["url"], request.headers,
                        request.config))
                return
        else:
            url["conf_search_count"] = 1
            retry_urls.append(url)
        new_request = SpiderRequest(headers=request.headers,
                                    config=request.config)
        new_request.urls = retry_urls
        new_request.config["priority"] = 3
        new_request.headers["User-Agent"] = UserAgentUtil().random_one(
            self.search_device)
        self.sending_queue.put(new_request)

    def send_response_body_cos(self, response_body, keyword_id, device, ip):
        """
        Store the response_body in Tencent Cloud COS.
        :return:
        """
        try:
            region = config.qcloud_cos.get('region')
            app_id = config.qcloud_cos.get('app_id')
            secret_id = config.qcloud_cos.get('secret_id')
            secret_key = config.qcloud_cos.get('secret_key')
            token = config.qcloud_cos.get('token')
            scheme = config.qcloud_cos.get('scheme')
            bucket = config.qcloud_cos.get('bucket')
            prefix = config.qcloud_cos.get('prefix')
            db_name = config.baidu_spider_move.get("db")
            filename = "{prefix}/html/{date}/{device}/{db_name}/{keyword_id}_{ip}.txt".format(
                prefix=prefix,
                date=date.today().isoformat(),
                device=device,
                db_name=db_name,
                keyword_id=keyword_id,
                ip=ip)
            cos_config = CosConfig(Region=region,
                                   Appid=app_id,
                                   SecretId=secret_id,
                                   SecretKey=secret_key,
                                   Token=token,
                                   Scheme=scheme)
            client = CosS3Client(cos_config)
            response = client.put_object(Bucket=bucket,
                                         Body=response_body,
                                         Key=filename,
                                         StorageClass='STANDARD',
                                         EnableMD5=False)
            print(response['ETag'])
        except Exception as e:
            print("save_response_body_cos error: {}".format(e))

    # @timeout(10)
    def to_store_results(self, results, stores):
        """
        results
        type 1: delete the task normally and insert the rank; 2, 3: needs a check, 3: an exact match was found
        task_id: id in the task table
        """
        try:
            # start_time = time.time()
            task_id = results["task_id"]
            keyword_id = results["keyword_id"]
            result = results["result"]
            rank = result["rank"]
            response_body = result["response_body"]
            ip = result['ip']
            device = result['device']
            if config.is_send_html_to_cos:
                self.send_response_body_cos(response_body, keyword_id, device,
                                            ip)
            device = self.search_device.name if self.search_device != DeviceEnums.pc_360 else '360_pc'
            rank_data = [{
                "keywordid": keyword_id,
                "url": result["target"],
                "rank": rank,
                "device": device,
                "keyword": result["keyword"]
            }]
            send_data = {"rankLists": json.dumps(rank_data)}
            flag = self.send_rank_data(config.callback_url, send_data)
            if flag:
                self.log_record.info(
                    "one finish keyword_id:{}, rank:{}; target: {}".format(
                        keyword_id, result["rank"], result.get("target", "")))
            else:
                self.log.info(
                    "send exception keyword_id:{}, send_url:{}".format(
                        keyword_id, config.callback_url))
                # self.log.info("send exception data:{}".format(send_data))
            self.rank_store.update_status_id(task_id, result["rank"])
            self.history_store.save(self.search_device.name, keyword_id, rank)
        except:
            print(traceback.format_exc())

    def send_rank_data(self, send_url, send_data):
        for i in xrange(0, 2):
            try:
                request = urllib2.Request(send_url,
                                          data=urllib.urlencode(send_data))
                response = urllib2.urlopen(request, timeout=10)
                res_content = response.read()
                if "success" not in str(res_content):
                    self.log_record.info(
                        "res_content no success send_url: {}".format(send_url))
                    return False
                return True
            except:
                self.log.info(traceback.format_exc())
                self.log.info(send_url + ",send_rank_data: " +
                              urllib.urlencode(send_data))
                time.sleep(2)
        return False

    def reset_task(self):
        """
        Reset the status of the task table.
        :return:
        """
        while True:
            time.sleep(10)
            # self.log_record.info("reset:%s" % str(datetime.today()))
            self.rank_store.reset_task(self.reset_task_time)
            time.sleep(self.reset_task_time)
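# --- Example: COS object key layout used by send_response_body_cos ---
# Raw html bodies are stored under
# {prefix}/html/{date}/{device}/{db_name}/{keyword_id}_{ip}.txt. A minimal
# sketch of the key construction; PREFIX and DB_NAME are placeholders for
# config.qcloud_cos['prefix'] and config.baidu_spider_move['db'].
# -*- coding: utf-8 -*-
from datetime import date

PREFIX = "spider"              # placeholder
DB_NAME = "baidu_spider_move"  # placeholder


def cos_html_key(keyword_id, device, ip):
    """Build the COS key for a raw html body, mirroring the format string above."""
    return "{prefix}/html/{date}/{device}/{db_name}/{keyword_id}_{ip}.txt".format(
        prefix=PREFIX,
        date=date.today().isoformat(),
        device=device,
        db_name=DB_NAME,
        keyword_id=keyword_id,
        ip=ip)


if __name__ == "__main__":
    # e.g. spider/html/<today>/pc/baidu_spider_move/42_1.2.3.4.txt
    print(cos_html_key(42, "pc", "1.2.3.4"))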
class ToutiaoSpider(BaseSpiderSign):
    """
    Crawl Toutiao articles.
    Flow:
    1. Fetch the distinct keywords from the database.
    2. Entry point: search each keyword via the Toutiao search API and parse the result list.
       Entry url: https://www.toutiao.com/search_content/?offset=40&format=json&keyword=%E8%8B%B9%E6%9E%9CWWDC%E6%97%B6%E9%97%B4&autoload=true&count=20&cur_tab=1&from=search_tab
    3. Parse the list response to work out how many result pages the keyword has, then build the urls of the remaining pages.
    4. Parse the article links and abstracts from the list pages and pass them to the detail-page parser.
    5. Parse the article pages and store them in the database.
    """

    def __init__(self):
        super(ToutiaoSpider, self).__init__()
        # self.log must be set
        self.log = UtilLogger(
            'ToutiaoSpider',
            os.path.join(os.path.dirname(os.path.abspath(__file__)),
                         'log_ToutiaoSpider.log'))
        self.log_record = UtilLogger(
            'SourceSpider',
            os.path.join(os.path.dirname(os.path.abspath(__file__)),
                         'log_SourceSpider.log'))
        self.ext = ToutiaoExtractor()
        # self.new_store = SourceStore(config.TEST_DB)
        self.step = 100  # 800 per batch
        self.queue_maxsize = 500  # sending queue limit
        self.sended_queue_maxsize = 800  # sent (in-flight) limit
        self.table_count = 1000000
        self.table_index = 0
        self.md5_table = "news_md5"
        self.s_table = "news_{}"
        self.create_table_sql = """
            create table news_{} like news_copy;
        """
        self.spider_count = 0
        self.repeat_count = 0
        self.no_china_count = 0
        self.send_url = 'https://www.toutiao.com/search_content/?offset={}&format=json&keyword={}&autoload=true&count=20&cur_tab=1&from=search_tab'

    def get_user_password(self):
        # return 'zhouhao', 'zhspider'
        # return 'xuliang', 'xlspider'
        return 'sunxiang', 'sxspider'

    def send_get_spider(self, urls):
        """
        Build a GET SpiderRequest and push it onto the download queue.
        """
        basic_request = SpiderRequest(
            headers={'User-Agent': random.choice(self.pc_user_agents)},
            urls=urls,
            config={"redirect": 1})
        self.sending_queue.put_nowait(basic_request)

    def is_get_tasks(self):
        if self.sended_queue.qsize() < self.sended_queue_maxsize and self.sending_queue.qsize() < self.queue_maxsize \
                and self.response_queue.qsize() < self.queue_maxsize and self.store_queue.qsize() < self.queue_maxsize:
            return True
        else:
            return False

    def start_requests(self):
        try:
            while 1:
                if self.is_get_tasks():
                    db = StoreMysql(**config.local_content)
                    update_time = str(datetime.now()).split(".")[0]
                    sql = "select id, keyword from keywords where status = 1 order by update_time asc, priority desc limit 0, {}".format(
                        self.step)
                    rows = db.query(sql)
                    self.log_record.info(
                        "datetime:{},task_results length:{}".format(
                            datetime.now(), len(rows)))
                    ids = list()
                    if rows:
                        for word in rows:
                            task_id = word[0]
                            ids.append({
                                "id": task_id,
                                "update_time": update_time
                            })
                            keyword = word[1]
                            for i in range(0, 6):
                                send_url = self.send_url.format(i * 20, keyword)
                                urls = [{
                                    "url": send_url,
                                    "type": 1,
                                    "ext_type": 1,
                                    'keyword': keyword,
                                    'unique_key': self.get_unique_key()
                                }]
                                self.send_get_spider(urls)
                        self.stores[0].store_table(ids, "keywords", type=2, field="id")
                    else:
                        time.sleep(60 * 10)
                    db.close()
                time.sleep(10)
        except Exception:
            print(traceback.format_exc())

    def get_stores(self):
        """
        Multiple data sources can be defined.
        :return:
        """
        stores = list()
        stores.append(SourceStore(config.local_content))
        self.stores = stores
        return stores

    def deal_response_results_status(self, task_status, url, result, request):
        """
        Handle tasks whose task_status is 2 or 3.
        Retries are returned as a list; if a retry needs different headers, define that yourself.
        :param task_status:
        :param url:
        :param result:
        :param request:
        :return:
        """
        if task_status == '2':
            ext_type = url["ext_type"]
            if ext_type == 1:
                self.deal_response_list(url, result['result'])
            elif ext_type == 2:
                self.deal_response_detail(url, result['result'])
        else:
            self.log.info("status is 3 url:{}; headers:{}; config:{}".format(
                url["url"], request.headers, request.config))

    # @fn_timer
    def deal_response_list(self, url, html):
        try:
            keyword = url['keyword']
            # task_id = url['task_id']
            info_list = self.ext.list_extractor(html, keyword)
            if info_list == -1:
                self.log.info("deal_response_list exception url:{}".format(
                    url["url"]))
            else:
                self.store_queue.put({"result": info_list, "type": 1})
        except:
            print(traceback.format_exc())

    # @fn_timer
    def deal_response_detail(self, url, html):
        try:
            list_info = url['info']
            # info.pop("we_name")
            res = self.ext.detail_extractor(html, list_info)
            if res != -1:
                self.store_queue.put({"result": res, "type": 2})
            else:
                self.log.info("deal_response_detail exception url:{}".format(
                    url["url"]))
        except:
            print(traceback.format_exc())

    def to_store_results(self, results, stores):
        """
        type 1: list page, de-duplicated by title
             2: detail page data
        :param results: a wrapped dict read from the store queue, e.g.
                        {'result': {'title': '...', 'abstract': '...'}, 'type': 2}
        :param stores:
        :return:
        """
        try:
            result = results["result"]  # the actual result, a dict
            type = results["type"]  # type 1: handle a list page, 2: handle a detail page
            if type == 1:
                # log_start = time.time()
                # keyword = results["keyword"]
                for info in result:
                    log_md5 = UtilMD5.md5(info["title"])
                    sql = "insert ignore into {}(md5) values('{}')".format(
                        self.md5_table, str(log_md5))
                    s_id = stores[0].insert_row(sql)  # returns a rowcount (number of affected rows?)
                    if s_id > 0:
                        # self.spider_count += 1
                        urls = [{
                            "url": info['url'],
                            "type": 1,
                            "ext_type": 2,
                            'info': info,
                            'unique_key': self.get_unique_key()
                        }]
                        # wrap the url taken from the list page and request it
                        self.send_get_spider(urls)
                    else:
                        self.repeat_count += 1
                    if self.repeat_count > 1000:
                        self.log_record.info("repeat_count:{}".format(
                            self.repeat_count))
                        self.repeat_count = 0
            elif type == 2:
                data = result
                if not self.judge_china(data["content"]):
                    # no Chinese text
                    self.no_china_count += 1
                    if self.no_china_count > 1000:
                        self.log_record.info("no_china_count:{}".format(
                            self.no_china_count))
                        self.no_china_count = 0
                    return
                toutiao_content = {
                    "category": data.get("category", ""),
                    "content": data.get("content", ""),
                    "publish_time": data.get("publish_time", ""),
                    "title": data.get("title", ""),
                    "abstract": data.get("abstract", ""),
                    "tags": data.get("tags", ""),
                    'url': data.get('url', ''),
                    'keyword': data.get('keyword', '')
                }
                s_id = stores[0].store_table_one(
                    toutiao_content, "news_{}".format(self.table_index))
                if s_id > 0:
                    if s_id % self.table_count == 0:
                        db = StoreMysql(**config.toutiao_content)
                        update_sql = "update spider_table set status = 1 where table_name = 'news_{}'".format(
                            self.table_index)
                        db.do(update_sql)
                        self.table_index += 1
                        db.do(self.create_table_sql.format(self.table_index))
                        insert_sql = "insert into spider_table(table_name) values('news_{}')".format(
                            self.table_index)
                        db.do(insert_sql)
                        time.sleep(1)
                        db.close()
                else:
                    time.sleep(0.1)
                time.sleep(4)
        except:
            print(traceback.format_exc())

    def judge_china(self, c_text):
        zhPattern = re.compile(u'[\u4e00-\u9fa5]+')
        match = zhPattern.search(u"" + str(c_text))
        if match:
            return True
        else:
            return False

    def send_wait(self):
        """
        Throttle the rate of requests sent to the download center.
        """
        time.sleep(1)
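# --- Example: news_{n} table rotation used by WxSpider / ToutiaoSpider ---
# Articles are sharded into news_0, news_1, ... with table_count rows each.
# When the auto-increment id returned by store_table_one() hits a multiple of
# table_count, the current shard is marked finished in spider_table and the
# next one is cloned from the news_copy template. A minimal sketch of that
# rotation step; `db` is assumed to expose do(sql) like the project's
# StoreMysql wrapper, but any MySQL client that runs raw SQL would do.
# -*- coding: utf-8 -*-

CREATE_TABLE_SQL = "create table news_{} like news_copy;"


def rotate_news_table(db, table_index):
    """Close out news_<table_index> and create news_<table_index + 1>."""
    # mark the filled table as finished
    db.do("update spider_table set status = 1 where table_name = 'news_{}'".format(
        table_index))
    next_index = table_index + 1
    # clone the schema of the template table news_copy
    db.do(CREATE_TABLE_SQL.format(next_index))
    # register the new shard so downstream readers know it exists
    db.do("insert into spider_table(table_name) values('news_{}')".format(
        next_index))
    return next_index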