async def add(self, proxy, score=INIT_SCORE):
    """Add ``proxy`` to the sorted set ``self.name`` unless it is already stored.

    Args:
        proxy: Member to store in the sorted set.
        score: Score given to a newly added proxy; defaults to ``INIT_SCORE``.
    """
    existing = await self.redis.execute('zscore', self.name, proxy)
    # Test against None explicitly: a stored score of zero is falsy but the
    # member still exists and must not be re-added.
    if existing is not None:
        logger.info(f"<{proxy}>已存在")
        return
    # Honour the caller-supplied score instead of always using INIT_SCORE.
    # NOTE(review): the mapping argument form is kept from the original call;
    # confirm that self.redis.execute accepts a dict for 'zadd'.
    if await self.redis.execute('zadd', self.name, {proxy: score}):
        logger.info(f"添加<{proxy}>成功")
 async def decr(self, proxy):
     """Lower the score of ``proxy`` by one, or remove it from the set.

     The proxy is removed when its current score is falsy (missing or zero)
     or is not greater than ``MIN_SCORE + 1``; otherwise the score is
     decremented and the new value is logged.
     """
     score = await self.redis.zscore(self.name, proxy)
     worth_keeping = bool(score) and score > MIN_SCORE + 1
     if not worth_keeping:
         logger.info(f'{proxy}分数为0,移除')
         await self.redis.zrem(self.name, proxy)
         return
     score = await self.redis.zincrby(self.name, -1, proxy)
     logger.info(f'<{proxy}>分数下调至{score}分')
Exemple #3
0
 def add_request(self, request):
     """Count ``request`` and enqueue it unless it is a filtered duplicate.

     Every call increments the total-request counter. The request is put on
     ``self.queue`` when ``self.is_repeat`` reports it as new, or when the
     request opts out of filtering (``request.filter`` is falsy). Otherwise
     the duplicate is logged and the repeat counter is incremented.
     """
     self.count.incr_total_request()
     # is_repeat is always consulted first; request.filter is only read for
     # requests that were reported as duplicates.
     if self.is_repeat(request) and request.filter:
         logger.info(f'发现重复请求{request.url}')
         self.count.incr_total_repeat_request()
         return
     self.queue.put(request)
 def first_get(self):
     """Pick a random proxy from the sorted set ``self.name``.

     Members scored between ``MIN_SCORE`` and ``MAX_SCORE`` are queried
     first. If that comes back empty, the first 20 members of the reverse
     range query are used instead.

     Returns:
         A member selected by ``choice``, or ``None`` (after logging) when
         both queries return nothing.
     """
     # NOTE(review): the first query already spans MIN_SCORE..MAX_SCORE, so
     # the fallback looks unable to find anything the first one missed --
     # confirm the intended score range.
     candidates = self.redis.zrangebyscore(self.name, MIN_SCORE, MAX_SCORE)
     if not candidates:
         candidates = self.redis.zrevrangebyscore(self.name, MAX_SCORE,
                                                  MIN_SCORE)[:20]
     if not candidates:
         logger.info('代理池空空如也')
         return None
     return choice(candidates)
 async def get(self):
     """Return a random proxy, preferring those scored exactly ``MAX_SCORE``.

     If no member has the maximum score, one of the first 20 members of the
     reverse range query (``MAX_SCORE`` down to ``MIN_SCORE``) is chosen.

     Returns:
         A member selected by ``choice``, or ``None`` (after logging) when
         the pool has no usable member.
     """
     best = await self.redis.execute('zrangebyscore', self.name,
                                     MAX_SCORE, MAX_SCORE)
     if best:
         return choice(best)

     # Await first, slice afterwards: ``await f(...)[:20]`` applies the
     # subscript to the un-awaited coroutine and raises TypeError.
     ranked = await self.redis.execute('zrevrangebyscore', self.name,
                                       MAX_SCORE, MIN_SCORE)
     candidates = (ranked or [])[:20]
     if candidates:
         return choice(candidates)

     logger.info('代理池空空如也')
     return None
Exemple #6
0
 def get_resp(self, request):
     """Send ``request`` with ``requests`` and wrap a successful reply.

     Up to ``RETRY_NUM`` attempts are made. Each attempt that returns a
     status other than 200/301/302 is logged and increments
     ``request.retry_num``.

     Args:
         request: Object providing ``method``, ``url``, ``params``,
             ``headers``, ``retry_num`` and, for POST, ``data``.

     Returns:
         A ``Response`` built from the first reply with status 200, 301 or
         302; ``None`` when the method is neither GET nor POST or when every
         attempt returned another status.
     """
     for _ in range(RETRY_NUM):
         if request.method == 'GET':
             resp = requests.get(request.url,
                                 params=request.params,
                                 headers=request.headers)
         elif request.method == 'POST':
             resp = requests.post(url=request.url,
                                  data=request.data,
                                  params=request.params,
                                  headers=request.headers)
         else:
             # Nothing was sent, so there is no response to inspect and a
             # retry cannot help; previously this fell through and raised
             # because ``resp`` was never assigned.
             logger.info('不支持的请求方法')
             return None

         if resp.status_code in (200, 301, 302):
             logger.info("请求 {}<{}>成功".format(request.url,
                                              resp.status_code))
             return Response(url=resp.url,
                             status=resp.status_code,
                             headers=resp.headers,
                             text=resp.text,
                             content=resp.content,
                             request=request)

         logger.info("请求 {}<{}>失败,已重试次数{}".format(
             request.url, resp.status_code, request.retry_num))
         request.retry_num += 1
     return None
Exemple #7
0
    def start(self):
        """Run the crawl loop until the request counters balance, then report.

        Start-up: when the backup queue is empty the engine is started,
        either through ``self.pool`` together with ``PROCESSING_NUM`` worker
        tasks (``OPEN_PROCESSING`` truthy) or directly in this thread. When
        the backup queue is not empty, ``rpoplpush`` is called on it until
        ``llen`` reports zero.

        Main loop: each pass either sleeps briefly (pool mode) or runs
        ``request_downloder_parse`` inline. The loop ends once
        ``count.start_spider_num`` equals the number of spiders and
        ``total_request == total_resp + total_repeat_request``; at that
        point ``stop_flag`` is set. Finally the totals are logged and the
        counters and ``schedule.ins_list`` are cleared.
        """
        logger.info('爬虫开启')
        start_time = time.perf_counter()

        if self.backupqueue.llen() != 0:
            while self.backupqueue.llen():
                self.backupqueue.rpoplpush()
        elif OPEN_PROCESSING:
            self.pool.apply_async(self.start_engine)
            for _ in range(PROCESSING_NUM):
                self.pool.apply_async(self.request_downloder_parse,
                                      callback=self._backfun)
        else:
            self.start_engine()

        while True:
            if not OPEN_PROCESSING:
                self.request_downloder_parse()
            else:
                time.sleep(0.001)

            counters = self.count
            if (counters.start_spider_num == len(self.spiders)
                    and counters.total_request
                    == counters.total_resp + counters.total_repeat_request):
                self.stop_flag = True
                break

        elapsed = time.perf_counter() - start_time
        logger.info('爬虫结束,共耗时{}秒'.format(elapsed))
        logger.info(f'共发送请求数量:{self.count.total_request}')
        logger.info(f'请求成功数:{self.count.total_resp}')
        logger.info(f'重复请求{self.count.total_repeat_request}')
        self.count.clear()
        self.schedule.ins_list.clear()
 async def set_max(self, proxy):
     """Store ``proxy`` in the sorted set ``self.name`` with score ``MAX_SCORE``."""
     top_score = {proxy: MAX_SCORE}
     await self.redis.zadd(self.name, top_score)
     # The message hard-codes 100 rather than reading MAX_SCORE.
     logger.info(f'已将<{proxy}>分数调至100')
Exemple #9
0
    def get_resp(self, request):
        """Send ``request`` with ``requests`` and wrap a successful reply.

        Up to ``RETRY_NUM`` attempts are made. A reply with a status other
        than 200/301/302 is logged and increments ``request.retry_num``; once
        that counter reaches ``RETRY_NUM`` the failure counter is incremented
        and ``self.backupqueue.pop()`` is called.

        When the request carries proxies, the proxy (the ``http`` entry with
        its first seven characters stripped) is scored down if the attempt
        raised and reset to the maximum score if any reply was received.

        Args:
            request: Object providing ``method``, ``url``, ``params``,
                ``headers``, ``proxies``, ``retry_num`` and, for POST,
                ``data``.

        Returns:
            A ``Response`` for the first reply with status 200, 301 or 302;
            ``None`` when the method is unsupported, the attempt raised, or
            every attempt returned another status.
        """
        for _ in range(RETRY_NUM):
            resp = None
            if request.method == 'GET':
                try:
                    resp = requests.get(request.url, params=request.params,
                                        headers=request.headers, timeout=5)
                except ConnectionRefusedError:
                    logger.info('拒绝连接')
                except exceptions.ConnectTimeoutError:
                    logger.info('连接超时')
                except exceptions.MaxRetryError:
                    logger.info('超过最大尝试次数')
                except requests.exceptions.ProxyError:
                    logger.info('代理错误,目标计算机积极拒绝')
                except requests.exceptions.ConnectTimeout:
                    # Must precede ConnectionError: ConnectTimeout is one of
                    # its subclasses and would otherwise never be reached.
                    logger.info(f'{request.url} 连接超时')
                except requests.exceptions.ConnectionError:
                    logger.info(f'{request.url} 服务器拒绝访问')
                except Exception as e:
                    logger.info(f'{request.url} 未知错误: {e}')
            elif request.method == 'POST':
                try:
                    resp = requests.post(url=request.url, data=request.data,
                                         params=request.params,
                                         headers=request.headers)
                except Exception as e:
                    logger.info(f'{request.url} {e}')
            else:
                # Nothing was sent, so the proxy must not be scored.
                logger.info('不支持的请求方法')
                return None

            if request.proxies:
                # NOTE(review): decr/set_max are called synchronously, as in
                # the original; confirm proxies_db does not expose coroutines.
                proxy = request.proxies['http'][7:]
                if resp is None:
                    self.proxies_db.decr(proxy)
                else:
                    self.proxies_db.set_max(proxy)

            # Compare with None: a requests response is falsy for 4xx/5xx
            # statuses, which would skip the retry accounting below.
            if resp is None:
                return None

            if resp.status_code in (200, 301, 302):
                logger.info("请求 {}<{}>成功".format(request.url, resp.status_code))
                return Response(url=resp.url, status=resp.status_code,
                                headers=resp.headers, text=resp.text,
                                content=resp.content, request=request)

            logger.info("请求 {}<{}>失败,已重试次数{}".format(
                request.url, resp.status_code, request.retry_num))
            request.retry_num += 1
            if request.retry_num >= RETRY_NUM:
                self.count.incr_fail_request_num()
                self.backupqueue.pop()
        return None
Exemple #10
0
 def add(self, proxy, score=INIT_SCORE):
     """Add ``proxy`` to the sorted set ``self.name`` unless it is already stored.

     Args:
         proxy: Member to store in the sorted set.
         score: Score given to a newly added proxy; defaults to ``INIT_SCORE``.
     """
     # Test against None explicitly: a stored score of zero is falsy but the
     # member still exists, and re-adding it would overwrite its score.
     if self.redis.zscore(self.name, proxy) is not None:
         logger.info(f"<{proxy}>已存在")
         return
     # Honour the caller-supplied score instead of always using INIT_SCORE.
     if self.redis.zadd(self.name, {proxy: score}):
         logger.info(f"添加<{proxy}>成功")