コード例 #1
0
 def deal_response_results(self, request, results, stores):
     """Route the per-URL results of a completed batch request.

     For each URL in ``request.urls``:
       * result present with status '2' or '3' -> delegate to
         ``deal_response_results_status``; any URLs it hands back are
         collected for re-sending.
       * result present with any other status -> the URL must be
         fetched again, so it is re-queued on ``sended_queue``.
       * no result at all -> treated as a send failure and retried,
         but only when ``self.url_repeat`` is enabled.

     :param request: the SpiderRequest whose URLs were dispatched
     :param results: mapping keyed by unique_md5, or 0 on total failure
     :param stores: unused here; kept for interface compatibility
     :return: False when ``results`` is 0, otherwise None
     """
     if results == 0:
         return False
     urls = list()         # URLs whose results must be fetched again
     failed_urls = list()  # failed URLs to be re-sent
     for u in request.urls:
         key = u['unique_md5']
         if key not in results:
             # URL was never answered; retry only when repeats are allowed.
             # self.log.error('url发送失败unique_md5:{}; 已有url:{}'.format(key, u["url"]))
             if self.url_repeat:
                 failed_urls.append(u)
             continue
         result = results[key]
         task_status = str(result['status'])
         if task_status in ['2', '3']:
             ret_failed_urls = self.deal_response_results_status(
                 task_status, u, result, request)
             if ret_failed_urls:
                 # extend() mutates in place instead of rebuilding the
                 # list on every iteration (the original was quadratic).
                 failed_urls.extend(ret_failed_urls)
         else:
             urls.append(u)
     if urls:
         request.urls = urls
         self.sended_queue.put(request)
     if failed_urls:
         # Fresh request so the retry batch carries its own URL list.
         # NOTE(review): headers/config are shared with the original
         # request object here -- confirm no downstream mutation.
         new_request = SpiderRequest(headers=request.headers,
                                     config=request.config)
         new_request.urls = failed_urls
         self.sending_queue.put(new_request)
コード例 #2
0
    def start_requests(self):
        """Poll the rank store for pending tasks and queue them for sending.

        Runs forever: while every internal queue is below
        ``sended_queue_maxsize`` it reads up to ``send_one_tasks`` rows,
        wraps each row into a SpiderRequest and puts it on
        ``sending_queue``; otherwise it sleeps and re-checks.
        """
        try:
            while True:
                print(self.sended_queue.qsize())
                # Throttle: only fetch new tasks while every queue has room.
                if self.sended_queue.qsize() < self.sended_queue_maxsize and self.sending_queue.qsize() < self.sended_queue_maxsize \
                        and self.response_queue.qsize() < self.sended_queue_maxsize and self.store_queue.qsize() < self.sended_queue_maxsize:
                    device = self.search_device.name if self.search_device != DeviceEnums.pc_360 else '360_pc'
                    task_results = self.rank_store.find_task_lists(
                        device, self.send_one_tasks)
                    if task_results:
                        # print() call form is valid on both Python 2 and 3;
                        # the original used a Python-2-only print statement.
                        print("datetime:{},task_results length:{}".format(
                            datetime.now(), len(task_results)))
                        for result in task_results:
                            # NOTE(review): the original comment listed columns
                            # as id, keyword, urlAddress, device, page,
                            # searchType, keyword_id, saveport -- which does not
                            # match the indices below; verify against
                            # find_task_lists.
                            task_id = result[0]
                            keyword = result[1]
                            target_url = result[2]
                            page = result[3]        # currently unused here
                            spidertype = result[4]  # currently unused here
                            keyword_id = result[5]

                            req = self.get_request_param(
                                task_id, keyword, target_url, keyword_id)
                            basic_request = SpiderRequest(
                                urls=req['urls'], config=req['configs'])
                            self.sending_queue.put(basic_request)
                        time.sleep(20)
                    else:
                        time.sleep(self.sleep_time)
                else:
                    time.sleep(self.sleep_time)
        except Exception:
            print(traceback.format_exc())
コード例 #3
0
 def send_get_spider(self, urls):
     """Wrap *urls* in a GET SpiderRequest and push it onto the
     sending (download) queue without blocking.

     A random PC user agent is picked per request and redirects are
     enabled via the ``redirect`` config flag.
     """
     user_agent = random.choice(self.pc_user_agents)
     request = SpiderRequest(headers={'User-Agent': user_agent},
                             urls=urls,
                             config={"redirect": 1})
     self.sending_queue.put_nowait(request)
コード例 #4
0
 def re_send(self, url, request):
     """Re-queue *url* for another fetch attempt, with a retry budget.

     Increments ``conf_search_count`` on the url dict (initialising it
     to 1 on first retry). When the count reaches ``self.re_send_count``
     the url is logged and dropped. Otherwise a new high-priority
     request with a fresh User-Agent is put on ``sending_queue``.

     :param url: url dict (mutated: ``conf_search_count`` is bumped)
     :param request: the original SpiderRequest that failed
     """
     self.log_record.info("re_send url:{}, User-Agent:{}".format(
         url["url"], request.headers["User-Agent"]))
     retry_urls = list()
     if "conf_search_count" in url:
         if int(url["conf_search_count"]) < self.re_send_count:
             url["conf_search_count"] = int(url["conf_search_count"]) + 1
             retry_urls.append(url)
         else:
             # Retry budget exhausted: log the final state and drop.
             # (Fixed "heasers" typo in the log message.)
             self.log_record.info(
                 "datetime:{}; state_url:{}; headers:{}; config:{}".format(
                     datetime.now(), url["url"], request.headers,
                     request.config))
             return
     else:
         url["conf_search_count"] = 1
         retry_urls.append(url)
     # Copy headers/config: the original shared the dicts with the old
     # request, so mutating priority / User-Agent below leaked into a
     # request object possibly still referenced elsewhere.
     new_request = SpiderRequest(headers=dict(request.headers),
                                 config=dict(request.config))
     new_request.urls = retry_urls
     new_request.config["priority"] = 3
     new_request.headers["User-Agent"] = UserAgentUtil().random_one(
         self.search_device)
     self.sending_queue.put(new_request)
コード例 #5
0
ファイル: dc_wrapper.py プロジェクト: muyun001/dc_drapper
 def start_requests(self):
     """Continuously read pending request documents from Mongo and send them.

     Documents with status 0 or -1 are converted into SpiderRequests and
     put on ``sending_queue``. On success the document status becomes 1;
     on failure it becomes -1 (first failure) or -2 (repeated failure).
     Sleeps ``self.sleep_time`` when there is no work or the task
     controller says to back off.
     """
     try:
         while True:
             if self.control_tasks():
                 condition = {
                     "$or": [{
                         'status': 0
                     }, {
                         'status': -1
                     }]
                 }  # select documents whose status is 0 or -1
                 request_datas = self.store.read_request_datas(condition)
                 if request_datas is not None:
                     try:
                         print(request_datas['unique_key'])
                         headers, configs, urls = self.parameter_format(
                             request_datas)
                         spider_request = SpiderRequest(headers=headers,
                                                        urls=urls,
                                                        config=configs)
                         self.sending_queue.put(spider_request)
                         self.store.update_status(
                             request_datas['unique_key'],
                             1)  # sent OK: mark the Mongo document as 1
                     # except Exception (not bare except) so that
                     # KeyboardInterrupt / SystemExit still propagate.
                     except Exception:
                         if request_datas['status'] == 0:
                             # send failed: mark the document as -1
                             self.store.update_status(
                                 request_datas['unique_key'], -1)
                         else:
                             # send failed again: mark the document as -2
                             self.store.update_status(
                                 request_datas['unique_key'], -2)
                         print("start_requests error, 1.")
                         traceback.print_exc()
                         time.sleep(1)
                 else:
                     time.sleep(self.sleep_time)
             else:
                 time.sleep(self.sleep_time)
     except Exception:
         print("start_requests error, 2.")
         traceback.print_exc()
コード例 #6
0
 def deal_rank_spider_response(self, url, html, r_capture, request, ip):
     """Extract rank data from a downloaded result page and act on it.

     * extractor returns 0            -> store rank -2 (hard failure)
     * extractor returns another int  -> store rank -1, return True
     * result contains "rank"         -> store the found rank
     * no rank yet, more pages left   -> queue a request for the next page
     * no rank, pages exhausted       -> store rank -2

     :param url: url dict carrying page/pnum/ckurl/site_name/task fields
     :param html: downloaded page body
     :param r_capture: capture payload forwarded to store_rank
     :param request: originating SpiderRequest (for logging)
     :param ip: ip the page was fetched through
     :return: True on int-failure branch, otherwise None
     """
     page = url["page"]  # total number of pages to search
     pnum = url["pnum"]  # current page number (1-based)
     pcount = (pnum - 1) * 10  # result offset of this page
     result = self.extractor.extractor(html,
                                       ck=url['ckurl'],
                                       site_name=url['site_name'],
                                       pcount=pcount)
     if result == 0:
         self.log_record.info("extractor failure result 0")
         self.store_rank(url, -2, html, ip)
     # isinstance() instead of type(...) == int; the result == 0 case
     # was already handled above, so this catches other int failures.
     elif isinstance(result, int):
         self.store_rank(url, -1, html, ip)
         self.log_record.info(
             "extractor failure deal_baidu_response_pc url:{}   request:{}".
             format(url["url"], request.headers['User-Agent']))
         return True
     else:
         if "rank" in result:
             self.store_rank(url,
                             result["rank"],
                             html,
                             ip,
                             realaddress=result["realaddress"],
                             r_capture=r_capture)
         elif pnum <= page:
             # Target not found on this page: request the next one.
             req = self.get_request_param(task_id=url["id"],
                                          keyword=url["keyword"],
                                          target_url=url["ckurl"],
                                          page=url["page"],
                                          spidertype=url["spidertype"],
                                          keyword_id=url["keyword_id"],
                                          site_name=url['site_name'],
                                          pnum=pnum + 1)
             basic_request = SpiderRequest(headers=req['headers'],
                                           urls=req['urls'],
                                           config=req['configs'])
             self.sending_queue.put(basic_request)
         else:
             # All pages searched without a hit.
             self.store_rank(url, -2, html, ip)
コード例 #7
0
ファイル: dc_wrapper.py プロジェクト: muyun001/dc_drapper
    def send_url_to_sended_queue(self):
        """On startup, re-send every url document whose status is 1.

        To avoid an infinite loop, each document is flipped to the
        temporary status 5 right after sending; once every document has
        been sent, all of them are flipped back to status 1.
        """
        temp_status = 5  # temporary marker so re-reads don't loop forever
        unique_keys = list()
        try:
            while True:
                condition = {'status': 1}  # select documents with status 1
                request_datas = self.store.read_request_datas(condition)
                if request_datas is not None:
                    unique_keys.append(request_datas['unique_key'])
                    try:
                        headers, configs, urls = self.parameter_format(
                            request_datas)
                        spider_request = SpiderRequest(headers=headers,
                                                       urls=urls,
                                                       config=configs)
                        self.sending_queue.put(spider_request)
                        self.store.update_status(request_datas['unique_key'],
                                                 temp_status)  # park at 5
                    # except Exception (not bare except) so that
                    # KeyboardInterrupt / SystemExit still propagate.
                    except Exception:
                        self.store.update_status(
                            request_datas['unique_key'],
                            -1)  # send failed: mark the document as -1
                        traceback.print_exc()
                else:
                    break

            # Flip every temporarily-parked document back to status 1.
            for unique_key in unique_keys:
                self.store.update_status(unique_key, 1)
            del unique_keys  # release the key list eagerly
        except Exception:
            print("send_uncomplished_url_to_dc error")
            traceback.print_exc()