async def add(self, proxy, score=INIT_SCORE):
    """Add *proxy* to the scored set if it is not already present.

    Args:
        proxy: proxy address used as the sorted-set member.
        score: initial score to store (defaults to INIT_SCORE).

    BUG FIX: the original always inserted with INIT_SCORE, silently
    ignoring the caller-supplied *score*; the parameter is now honoured
    (the default keeps existing callers' behaviour unchanged).
    """
    # zscore returns None (falsy) when the member is absent.
    if not await self.redis.execute('zscore', self.name, proxy):
        if await self.redis.execute('zadd', self.name, {proxy: score}):
            logger.info(f"添加<{proxy}>成功")
    else:
        logger.info(f"<{proxy}>已存在")
async def decr(self, proxy):
    """Lower *proxy*'s score by one; remove it once it reaches the floor.

    A proxy that is missing, or whose score would drop to MIN_SCORE + 1
    or below, is deleted from the set instead of being decremented.
    """
    current = await self.redis.zscore(self.name, proxy)
    # Guard clause: missing proxy or score at the floor -> evict.
    if not current or current <= MIN_SCORE + 1:
        logger.info(f'{proxy}分数为0,移除')
        await self.redis.zrem(self.name, proxy)
        return
    updated = await self.redis.zincrby(self.name, -1, proxy)
    logger.info(f'<{proxy}>分数下调至{updated}分')
def add_request(self, request):
    """Enqueue *request* unless it is a duplicate and filtering is on."""
    # Every incoming request is counted, accepted or not.
    self.count.incr_total_request()
    # is_repeat() is always evaluated (same as the original's left operand),
    # so any fingerprint side effect it has still happens.
    duplicate = self.is_repeat(request)
    if not duplicate or not request.filter:
        self.queue.put(request)
        return
    logger.info(f'发现重复请求{request.url}')
    self.count.incr_total_repeat_request()
def first_get(self):
    """Return a random usable proxy, preferring max-score entries.

    Returns:
        A proxy string chosen at random, or None when the pool is empty.

    BUG FIX: the first query previously spanned MIN_SCORE..MAX_SCORE
    (the whole set), which made the zrevrangebyscore fallback branch
    unreachable dead code. It now queries only MAX_SCORE entries first,
    matching the async `get` method's behaviour, so the top-20 fallback
    actually fires when no proxy has the maximum score.
    """
    result = self.redis.zrangebyscore(self.name, MAX_SCORE, MAX_SCORE)
    if result:
        return choice(result)
    # No max-score proxy: fall back to the 20 highest-scored ones.
    result = self.redis.zrevrangebyscore(self.name, MAX_SCORE, MIN_SCORE)[:20]
    if result:
        return choice(result)
    logger.info('代理池空空如也')
    return
async def get(self):
    """Return a random usable proxy, preferring max-score entries.

    Returns:
        A proxy string chosen at random, or None when the pool is empty.

    BUG FIX: `await self.redis.execute(...)[:20]` parsed as
    `await (coroutine[:20])` — slicing the coroutine object raises
    TypeError before the await. The coroutine is now awaited first and
    the resulting list is sliced.
    """
    result = await self.redis.execute('zrangebyscore', self.name, MAX_SCORE, MAX_SCORE)
    if result:
        return choice(result)
    # No max-score proxy: fall back to the 20 highest-scored ones.
    result = (await self.redis.execute('zrevrangebyscore', self.name, MAX_SCORE, MIN_SCORE))[:20]
    if result:
        return choice(result)
    logger.info('代理池空空如也')
    return
def get_resp(self, request):
    """Send *request* with up to RETRY_NUM attempts.

    Args:
        request: request object carrying method, url, params/data, headers
            and a mutable retry_num counter.

    Returns:
        A Response wrapper on HTTP 200/301/302, otherwise None (after the
        retries are exhausted or for an unsupported method).

    BUG FIX: an unsupported HTTP method previously left `resp` unbound and
    crashed with UnboundLocalError at the status check, after pointlessly
    looping RETRY_NUM times. It now logs once and returns immediately.
    """
    for _ in range(RETRY_NUM):
        if request.method == 'GET':
            resp = requests.get(request.url, params=request.params, headers=request.headers)
        elif request.method == 'POST':
            resp = requests.post(url=request.url, data=request.data,
                                 params=request.params, headers=request.headers)
        else:
            # Retrying cannot help an unsupported verb: bail out now.
            logger.info('不支持的请求方法')
            return
        if resp.status_code in [200, 301, 302]:
            logger.info("请求 {}<{}>成功".format(request.url, resp.status_code))
            return Response(url=resp.url, status=resp.status_code, headers=resp.headers,
                            text=resp.text, content=resp.content, request=request)
        logger.info("请求 {}<{}>失败,已重试次数{}".format(
            request.url, resp.status_code, request.retry_num))
        request.retry_num += 1
def start(self):
    """Run the crawl to completion, then log and reset the counters.

    Fresh start (empty backup queue): either fan the engine and
    PROCESSING_NUM download/parse workers out onto the process pool
    (OPEN_PROCESSING) or run the engine inline. Recovery start: requeue
    every backed-up request first. Either way, the loop below drives /
    waits for work until all requests are accounted for.
    """
    logger.info('爬虫开启')
    start_time = time.perf_counter()
    if self.backupqueue.llen() == 0:
        if OPEN_PROCESSING:
            self.pool.apply_async(self.start_engine)
            for i in range(PROCESSING_NUM):
                self.pool.apply_async(self.request_downloder_parse, callback=self._backfun)
        else:
            self.start_engine()
    else:
        # Recovery: move every backed-up request back onto the work queue.
        # NOTE(review): the engine is not (re)started on this path — confirm
        # the requeued items are consumed by the loop below as intended.
        while self.backupqueue.llen():
            self.backupqueue.rpoplpush()
    while True:
        if OPEN_PROCESSING:
            # Pool workers do the work; just poll the exit condition.
            time.sleep(0.001)
        else:
            self.request_downloder_parse()
        # Done when every spider has started and every request is either
        # answered or was filtered as a duplicate.
        if self.count.start_spider_num == len(self.spiders):
            if self.count.total_request == self.count.total_resp + self.count.total_repeat_request:
                self.stop_flag = True
                break
    logger.info('爬虫结束,共耗时{}秒'.format(time.perf_counter() - start_time))
    logger.info(f'共发送请求数量:{self.count.total_request}')
    logger.info(f'请求成功数:{self.count.total_resp}')
    logger.info(f'重复请求{self.count.total_repeat_request}')
    # Reset shared state so the spider object can be started again.
    self.count.clear()
    self.schedule.ins_list.clear()
async def set_max(self, proxy):
    """Set *proxy*'s score in the sorted set to MAX_SCORE.

    NOTE(review): the log message hard-codes "100" — verify it stays in
    sync with the MAX_SCORE constant.
    """
    await self.redis.zadd(self.name, {proxy: MAX_SCORE})
    logger.info(f'已将<{proxy}>分数调至100')
def get_resp(self, request):
    """Fetch *request* over HTTP, with up to RETRY_NUM attempts.

    Returns a Response wrapper on HTTP 200/301/302, otherwise None
    (on any request exception, or after the retries are exhausted).

    Proxy bookkeeping: after a GET the proxy's score is decremented in
    the ``finally`` (so on success too), then unconditionally reset to
    MAX below — NOTE(review): confirm this decr/set_max ordering and the
    set_max-on-failure path are intended.
    """
    for i in range(RETRY_NUM):
        resp = None
        if request.method == 'GET':
            try:
                resp = requests.get(request.url, params=request.params,
                                    headers=request.headers, timeout=5)
            except ConnectionRefusedError:
                logger.info('拒绝连接')
            except exceptions.ConnectTimeoutError:
                logger.info('连接超时')
            except exceptions.MaxRetryError:
                logger.info('超过最大尝试次数')
            except requests.exceptions.ProxyError:
                # Must precede ConnectionError: ProxyError is its subclass.
                logger.info('代理错误,目标计算机积极拒绝')
            except requests.exceptions.ConnectionError:
                logger.info(request.url, "服务器拒绝访问")
            except requests.exceptions.ConnectTimeout:
                # NOTE(review): unreachable — ConnectTimeout subclasses
                # ConnectionError, which is caught above.
                logger.info(request.url, "连接超时")
            except Exception as e:
                logger.info(request.url, "未知错误")
            finally:
                if request.proxies:
                    # Penalise the proxy used for this attempt.
                    self.proxies_db.decr(request.proxies['http'][7:])
        elif request.method == 'POST':
            try:
                resp = requests.post(url=request.url, data=request.data,
                                     params=request.params, headers=request.headers)
            except Exception as e:
                logger.info(request.url, e)
        else:
            logger.info('不支持的请求方法')
        if request.proxies:
            # Restore the proxy's score to maximum after the attempt.
            # NOTE(review): runs even when the attempt failed (resp is None).
            self.proxies_db.set_max(request.proxies['http'][7:])
        if not resp:
            # Exception path: give up immediately rather than retrying.
            return
        if resp.status_code in [200, 301, 302]:
            logger.info("请求 {}<{}>成功".format(request.url, resp.status_code))
            return Response(url=resp.url, status=resp.status_code, headers=resp.headers,
                            text=resp.text, content=resp.content, request=request)
        else:
            logger.info("请求 {}<{}>失败,已重试次数{}".format(
                request.url, resp.status_code, request.retry_num))
            request.retry_num += 1
            if request.retry_num >= RETRY_NUM:
                # Too many failures: count it and drop it from the backup queue.
                self.count.incr_fail_request_num()
                self.backupqueue.pop()
def add(self, proxy, score=INIT_SCORE):
    """Add *proxy* to the scored set if it is not already present.

    Args:
        proxy: proxy address used as the sorted-set member.
        score: initial score to store (defaults to INIT_SCORE).

    BUG FIX: the original always inserted with INIT_SCORE, silently
    ignoring the caller-supplied *score*; the parameter is now honoured
    (the default keeps existing callers' behaviour unchanged).
    """
    # zscore returns None (falsy) when the member is absent.
    if not self.redis.zscore(self.name, proxy):
        if self.redis.zadd(self.name, {proxy: score}):
            logger.info(f"添加<{proxy}>成功")
    else:
        logger.info(f"<{proxy}>已存在")