def deal_response_results(self, request, results, stores):
    """Route per-URL crawl results back onto the work queues.

    For each URL in ``request.urls`` looks up its result by
    ``unique_md5`` in ``results``:

    * status '2'/'3' -> delegated to ``deal_response_results_status``,
      which may hand back URLs that must be re-sent;
    * any other status -> kept in ``urls`` to be re-fetched via
      ``sended_queue``;
    * no result at all -> re-sent only when ``self.url_repeat`` is set.

    :param request: the SpiderRequest whose URLs were crawled; mutated
        in place (``request.urls``) before being re-queued.
    :param results: mapping unique_md5 -> result dict, or 0 on failure.
    :param stores: unused here — presumably consumed by callers or an
        overriding implementation; TODO confirm.
    :returns: False when ``results == 0``, otherwise None.
    """
    if results == 0:
        return False
    else:
        urls = list()  # URLs to fetch again
        failed_urls = list()  # failed URLs to be re-sent
        for u in request.urls:
            url = u['unique_md5']
            if url in results:
                result = results[url]
                task_status = str(result['status'])
                if task_status in ['2', '3']:
                    ret_failed_urls = self.deal_response_results_status(
                        task_status, u, result, request)
                    if ret_failed_urls:
                        failed_urls = failed_urls + ret_failed_urls
                else:
                    urls.append(u)
            else:
                # No result came back for this URL at all.
                # self.log.error('url send failed unique_md5:{}; existing url:{}'.format(url, u["url"]))
                if self.url_repeat:
                    failed_urls.append(u)
        if len(urls) > 0:
            # Reuse the original request object for the re-fetch batch.
            request.urls = urls
            self.sended_queue.put(request)
        if len(failed_urls) > 0:
            # NOTE(review): the new request shares the headers/config
            # dicts with the original request — confirm that is intended.
            new_request = SpiderRequest(headers=request.headers,
                                        config=request.config)
            # new_request = copy(request)
            new_request.urls = failed_urls
            self.sending_queue.put(new_request)
            new_request = None
def start_requests(self):
    """Producer loop: pull rank tasks from storage and queue them.

    Runs forever.  Only fetches a new batch when every internal queue
    (sended / sending / response / store) is below
    ``sended_queue_maxsize`` — simple back-pressure.  Each task row is
    unpacked, turned into request parameters and put on
    ``sending_queue``.  Sleeps ``self.sleep_time`` whenever there is
    nothing to do.
    """
    try:
        while True:
            print(self.sended_queue.qsize())
            if self.sended_queue.qsize() < self.sended_queue_maxsize and self.sending_queue.qsize() < self.sended_queue_maxsize \
                    and self.response_queue.qsize() < self.sended_queue_maxsize and self.store_queue.qsize() < self.sended_queue_maxsize:
                # pc_360 uses a special device label in storage.
                device = self.search_device.name if self.search_device != DeviceEnums.pc_360 else '360_pc'
                task_results = self.rank_store.find_task_lists(
                    device, self.send_one_tasks)
                if len(task_results) > 0:
                    print "datetime:{},task_results length:{}".format(
                        datetime.now(), len(task_results))
                    for result in task_results:
                        # Row layout per the original author:
                        # id, keyword, urlAddress, device, page, searchType, keyword_id, saveport
                        # NOTE(review): the indices used below do not
                        # match that comment (result[3] is read as page,
                        # result[4] as spidertype) — verify against the
                        # actual query in rank_store.find_task_lists.
                        task_id = result[0]
                        keyword = result[1]
                        target_url = result[2]
                        page = result[3]
                        spidertype = result[4]
                        keyword_id = result[5]
                        req = self.get_request_param(
                            task_id, keyword, target_url, keyword_id)
                        basic_request = SpiderRequest(
                            urls=req['urls'], config=req['configs'])
                        self.sending_queue.put(basic_request)
                    # Fixed pause between task batches — TODO confirm
                    # this belongs after the loop, not per task.
                    time.sleep(20)
                else:
                    time.sleep(self.sleep_time)
            else:
                time.sleep(self.sleep_time)
    except Exception:
        print traceback.format_exc()
def send_get_spider(self, urls):
    """Build a GET SpiderRequest for *urls* and push it onto the
    download (sending) queue without blocking.

    A random desktop User-Agent is attached and redirects are enabled.
    """
    ua = random.choice(self.pc_user_agents)
    get_request = SpiderRequest(headers={'User-Agent': ua},
                                urls=urls,
                                config={"redirect": 1})
    self.sending_queue.put_nowait(get_request)
def re_send(self, url, request):
    """Re-queue a failed URL for another crawl attempt.

    Increments the per-URL retry counter ``conf_search_count``
    (creating it at 1 on the first failure).  While the counter is
    below ``self.re_send_count`` the URL is put back on
    ``sending_queue`` with priority 3 and a freshly randomised
    User-Agent; once the budget is exhausted the final state is only
    logged and the URL is dropped.

    :param url: mutable dict describing the URL task (keys ``url`` and,
        after the first retry, ``conf_search_count``); mutated in place.
    :param request: original SpiderRequest whose headers/config serve
        as the template for the retry request.
    """
    self.log_record.info("re_send url:{}, User-Agent:{}".format(
        url["url"], request.headers["User-Agent"]))
    retry_urls = list()
    if "conf_search_count" in url:
        if int(url["conf_search_count"]) < self.re_send_count:
            url["conf_search_count"] = int(url["conf_search_count"]) + 1
            retry_urls.append(url)
        else:
            # Retry budget exhausted: record the final state and give up.
            # (Fixed typo "heasers" -> "headers" in the log message.)
            self.log_record.info(
                "datetime:{}; state_url:{}; headers:{}; config:{}".format(
                    datetime.now(), url["url"], request.headers,
                    request.config))
            return
    else:
        url["conf_search_count"] = 1
        retry_urls.append(url)
    # Bug fix: copy the dicts instead of sharing them.  Previously the
    # retry request aliased request.headers/request.config, so setting
    # "priority" and "User-Agent" below silently mutated the original
    # request (and anything else sharing those dicts).
    new_request = SpiderRequest(headers=dict(request.headers),
                                config=dict(request.config))
    new_request.urls = retry_urls
    new_request.config["priority"] = 3
    new_request.headers["User-Agent"] = UserAgentUtil().random_one(
        self.search_device)
    self.sending_queue.put(new_request)
def start_requests(self):
    """Producer loop: replay pending requests stored in MongoDB.

    Repeatedly reads one record whose ``status`` is 0 (new) or -1
    (previously failed once) and pushes it onto ``sending_queue``.

    Status transitions on this path:
      * sent OK            -> 1
      * failed, was 0      -> -1 (eligible for one more attempt)
      * failed, was -1     -> -2 (given up)
    """
    try:
        while True:
            if self.control_tasks():
                condition = {
                    "$or": [{
                        'status': 0
                    }, {
                        'status': -1
                    }]
                }  # select records whose status is 0 or -1
                request_datas = self.store.read_request_datas(condition)
                if request_datas is not None:
                    try:
                        print(request_datas['unique_key'])
                        headers, configs, urls = self.parameter_format(
                            request_datas)
                        spider_request = SpiderRequest(headers=headers,
                                                       urls=urls,
                                                       config=configs)
                        self.sending_queue.put(spider_request)
                        # Sent successfully: mark the record as 1.
                        self.store.update_status(
                            request_datas['unique_key'], 1)
                    except:
                        if request_datas['status'] == 0:
                            # Send failed on a fresh record: mark -1.
                            self.store.update_status(
                                request_datas['unique_key'], -1)
                        else:
                            # Send failed again: mark -2 (give up).
                            self.store.update_status(
                                request_datas['unique_key'], -2)
                        print("start_requests error, 1.")
                        traceback.print_exc()
                    time.sleep(1)
                else:
                    time.sleep(self.sleep_time)
            else:
                time.sleep(self.sleep_time)
    except:
        print("start_requests error, 2.")
        traceback.print_exc()
def deal_rank_spider_response(self, url, html, r_capture, request, ip): page = url["page"] # 总页数 pnum = url["pnum"] # 当前页数 pcount = (pnum - 1) * 10 result = self.extractor.extractor(html, ck=url['ckurl'], site_name=url['site_name'], pcount=pcount) if result == 0: self.log_record.info("extractor failure result 0") self.store_rank(url, -2, html, ip) elif type(result) == int: self.store_rank(url, -1, html, ip) self.log_record.info( "extractor failure deal_baidu_response_pc url:{} request:{}". format(url["url"], request.headers['User-Agent'])) return True else: if "rank" in result: # for rank_result in result["rank"]: self.store_rank(url, result["rank"], html, ip, realaddress=result["realaddress"], r_capture=r_capture) elif pnum <= page: req = self.get_request_param(task_id=url["id"], keyword=url["keyword"], target_url=url["ckurl"], page=url["page"], spidertype=url["spidertype"], keyword_id=url["keyword_id"], site_name=url['site_name'], pnum=pnum + 1) basic_request = SpiderRequest(headers=req['headers'], urls=req['urls'], config=req['configs']) self.sending_queue.put(basic_request) else: self.store_rank(url, -2, html, ip)
def send_url_to_sended_queue(self):
    """On startup, re-send every stored URL whose status is 1.

    To avoid an endless loop over the same rows, each record is flipped
    to a temporary status (5) right after being queued; once no rows
    with status 1 remain, all processed records are restored to 1.
    A record that fails to send is marked -1 instead.
    """
    TEMP_STATUS = 5  # transient marker so re-sent rows are not re-read
    processed_keys = []
    try:
        while True:
            record = self.store.read_request_datas({'status': 1})
            if record is None:
                break  # nothing left with status 1
            processed_keys.append(record['unique_key'])
            try:
                headers, configs, urls = self.parameter_format(record)
                self.sending_queue.put(
                    SpiderRequest(headers=headers,
                                  urls=urls,
                                  config=configs))
                self.store.update_status(record['unique_key'],
                                         TEMP_STATUS)
            except:
                # Send failed: mark the record as -1 in Mongo.
                self.store.update_status(record['unique_key'], -1)
                traceback.print_exc()
        # Restore every temporarily-marked record back to status 1.
        for key in processed_keys:
            self.store.update_status(key, 1)
        del processed_keys  # drop the key list from memory
    except:
        print("send_uncomplished_url_to_dc error")
        traceback.print_exc()