def get_response(self, request):
    """Send *request* over HTTP and wrap the reply in a framework Response.

    Only GET and POST are supported; any other method raises Exception.
    """
    method = request.method.upper()
    if method == "GET":
        res = requests.get(
            request.url,
            headers=request.headers,
            params=request.params,
        )
    elif method == "POST":
        res = requests.post(
            request.url,
            headers=request.headers,
            params=request.params,
            data=request.data,
        )
    else:
        raise Exception("ERROR : 不支持该请求方法")
    logger.info(u"[{}] <{}>".format(res.status_code, res.url))
    return Response(res.url, res.status_code, res.headers, res.content)
def start(self):
    """Engine entry point: run the engine and log the total elapsed seconds."""
    started_at = datetime.now()
    self._start_engine()
    finished_at = datetime.now()
    elapsed = (finished_at - started_at).total_seconds()
    logger.info('总时间{}'.format(elapsed))
def get_request(self):
    """Pop one request from the queue without blocking.

    Returns None implicitly when the non-blocking get fails (e.g. the queue
    is empty); the exception is only logged, never propagated.
    """
    try:
        return self.queue.get(False)  # non-blocking get
    except Exception as exc:  # NOTE(review): broad on purpose — an empty queue is expected here
        logger.info(exc)
def get_response(self, request):
    """Download *request* and build the framework Response object.

    Dispatches on the (case-insensitive) HTTP method; only GET and POST are
    supported, anything else raises Exception.

    :param request: request object carrying url/method/headers/cookies/params/data/meta
    :return: Response wrapping the HTTP reply, carrying the originating request
    :raises Exception: for unsupported HTTP methods
    """
    if request.method.upper() == "GET":
        response = requests.get(
            url=request.url,
            headers=request.headers,
            cookies=request.cookies,
            params=request.params,
        )
    elif request.method.upper() == "POST":
        # BUG FIX: the original called `request.post(...)` — the request
        # *object*, which has no .post method — instead of the requests library.
        response = requests.post(
            url=request.url,
            headers=request.headers,
            cookies=request.cookies,
            params=request.params,
            data=request.data,
        )
    else:
        raise Exception('框架不支持的请求类型 {}'.format(request.method))
    logger.info("下载器成功获取<{}>对应的响应".format(request.url))
    # build the framework-level response object
    res = Response(
        url=response.url,
        body=response.content,
        headers=response.headers,
        code=response.status_code,
        request=request,
        meta=request.meta,
    )
    return res
def _filter_request(self, request):
    """Deduplicate: if the request's fingerprint is new, record it and return True."""
    fingerprint = self._gen_fp(request)
    if fingerprint in self.fp_set:
        # repeat: count it and report, request must not be enqueued
        self.total_repeat_nums += 1
        logger.info("发现重复的请求:<{} {}>".format(request.method, request.url))
        return False
    self.fp_set.add(fingerprint)
    return True
def _filter_request(self, request):
    """Return True (and remember the fingerprint) for a first-seen request."""
    fp = self._gen_fp(request)
    seen = fp in self.fp_set
    if seen:
        # duplicate: bump the repeat counter and report it
        self.total_repeat_nums += 1
        logger.info("发现重复的请求:<{} {}>".format(request.method, request.url))
        return False
    self.fp_set.add(fp)
    return True
def _filter_request(self, request):
    """Fingerprint-based dedup: True means the request is new and may be enqueued."""
    fp = self._gen_fp(request)
    is_duplicate = fp in self.fp_set
    if is_duplicate:
        logger.info('发现重复的请求:<{}>'.format(request.url))
        self.repeat_request_nums += 1
        return False
    # first sighting: remember the fingerprint so later repeats are caught
    self.fp_set.add(fp)
    return True
def add_request(self, request):
    """Enqueue *request*; requests flagged filter=False bypass deduplication."""
    if request.filter:
        # normal path: only first-seen requests make it into the queue
        if self._filter_request(request):
            self.queue.put(request)
        return
    # dont-filter path: stamp a fingerprint and enqueue unconditionally
    request.fp = self._gen_fp(request)
    self.queue.put(request)
    logger.info("添加不去重的请求<{} {}>".format(request.method, request.url))
def _filter_request(self, fp, request):
    """Return True when *fp* is already in the filter set (the request is a repeat)."""
    if not self._filter_set.is_filter(fp):
        return False
    logger.info(u"发现重复请求 : [{}] <{}>".format(request.method, request.url))
    return True
def start(self):
    """Entry point: run the engine, then log timing and request/response totals."""
    begin = datetime.now()
    logger.info("开始运行时间:%s" % begin)
    self._start_engine()
    end = datetime.now()
    logger.info("结束运行时间:%s" % end)
    logger.info("耗时:%.2f" % (end - begin).total_seconds())
    logger.info("总的请求数量:{}".format(self.total_request_nums))
    logger.info("总的响应数量:{}".format(self.total_response_nums))
def add_request(self, request):
    """Fingerprint *request* and enqueue it, deduplicating unless filter is off."""
    request.fp = self._gen_fp(request)
    if request.filter:
        # dedup path: only enqueue when the fingerprint is new
        if self._filter_request(request):
            self.q.put(request)
        return
    # filter disabled: the fingerprint still goes into the container,
    # then the request is enqueued unconditionally
    self.fp_container.add_fp(request.fp)
    self.q.put(request)
    logger.info("添加不去重的请求<{} {}>".format(request.method, request.url))
def _filter_request(self, request):
    """Return True for a first-seen request, False (and count it) for a repeat.

    The fingerprint is stamped onto request.fp as a side effect.
    """
    request.fp = self._gen_fp(request)
    if request.fp in self._filter_container:
        self.request_repeat_nums += 1
        logger.info("发现重复的请求:<{} {}>".format(request.method, request.url))
        return False
    self._filter_container.add(request.fp)
    return True
def start(self):
    """Run the engine, logging start/stop timestamps, elapsed time and totals."""
    start_time = datetime.now()
    logger.info("开始运行时间:%s" % start_time)
    self._start_engine()
    # FIX: the original captured datetime.now() twice (into both `stop` and
    # `end_time`) and mixed them; a single end timestamp is used for both the
    # log line and the elapsed-time computation.
    end_time = datetime.now()
    logger.info("结束运行时间:%s" % end_time)
    logger.info("耗时:%.2f" % (end_time - start_time).total_seconds())
    logger.info("总的请求数量:{}".format(self.scheduler.total_request_number))
    logger.info("总的响应数量:{}".format(self.total_response_nums))
def start_engine(self):
    """Public entry: log runtime configuration, run the engine, report duration."""
    launched = datetime.now()
    logger.info("框架启动的时间为:[{}]".format(launched))
    logger.info("并发类型为{}".format(settings.ASYNC_TYPE))
    logger.info("并发数量为{}".format(settings.MAX_ASYNC_THREAD_NUMBER))
    self._start_engine()
    halted = datetime.now()
    logger.info("框架停止的时间为:[{}]".format(halted))
    logger.info("框架运行的时间为:[{}]".format((halted - launched).total_seconds()))
    # reset the redis-backed counters; the fingerprint set is deliberately kept
    self.collector.clear()
def _filter_request(self, request):
    """Dedup via the shared fingerprint container; True means first-seen.

    NOTE(review): reads request.fp without generating it — assumes the caller
    has already computed the fingerprint. TODO confirm against callers.
    """
    if self.fp_container.exists(request.fp):
        # repeat: bump the shared repeat counter and report it
        self.collector.incr(self.collector.repeat_request_nums_key)
        logger.info("发现重复的请求:<{} {}>".format(request.method, request.url))
        return False
    self.fp_container.add_fp(request.fp)
    return True
def _filter_request(self, request):
    """Return True when *request* is new; otherwise count and log the repeat."""
    request.fp = self._gen_fp(request)
    already_seen = self._filter_container.exists(request.fp)
    if already_seen:
        self.collector.incr(self.collector.repeat_request_nums_key)
        logger.info("发现重复的请求:<{} {}>".format(request.method, request.url))
        return False
    self._filter_container.add_fp(request.fp)
    return True
def start(self):
    """Program entry point: start the whole engine and report the run time."""
    start_time = datetime.now()
    logger.info('爬虫启动:{}'.format(start_time))
    self._start_engine()
    end_time = datetime.now()
    # FIX: the original reported the end time and the duration with print()
    # while the start time went through the logger — use the logger for all
    # three so the run record ends up in one place.
    logger.info("爬虫结束:{}".format(end_time))
    logger.info('爬虫共运行:{}秒'.format(
        (end_time - start_time).total_seconds()))
def add_request(self, request):
    """Add *request* to the queue, dropping duplicates unless dont_filter is set."""
    needs_filtering = not request.dont_filter
    if needs_filtering and self.__filter_request(request):
        # duplicate that the caller asked us to filter: log, count, drop
        logger.info('过滤掉了重复的请求:%s' % request.url)
        self.stats_collector.incr(
            self.stats_collector.repeat_request_nums_key)
        return
    self.queue.put(request)
    # every successful enqueue bumps the total-request counter
    self.stats_collector.incr(self.stats_collector.request_nums_key)
def _filter_request(self, request):
    """
    Deduplicate *request* by fingerprint.

    :param request: the request object to check; request.fp is set as a side effect
    :return: True if the request is new, False if it is a repeat
    """
    # stamp the fingerprint onto the request for later use
    request.fp = self._gen_fp(request)
    if not self._filter_container.exists(request.fp):
        self._filter_container.add_fp(request.fp)  # remember first sighting
        return True
    logger.info("发现重复的请求:<{} {}>".format(request.method, request.url))
    self.collector.incr(self.collector.repeat_request_nums_key)
    # FIX: the original fell off the end here, returning None implicitly;
    # return an explicit False (backward compatible — both are falsy).
    return False
def add_request(self, request):
    """
    Put *request* on the queue; filter=False requests skip deduplication.

    :param request: request object to enqueue
    :return: None
    """
    if request.filter:
        # dedup path: enqueue only when the request is first-seen
        if self._filter_request(request):
            self.queue.put(request)
        return
    # no-dedup path: stamp a fingerprint and enqueue straight away
    request.fp = self._gen_fp(request)
    self.queue.put(request)
    logger.info("添加不去重的请求<{} {}>".format(request.method, request.url))
def add_request(self, request):
    """Deduplicate and enqueue *request*, keeping the total-request counter."""
    if not request.filter:
        # caller opted out of dedup: enqueue and count unconditionally
        logger.info(u"添加请求(dont filter) 成功: [{}] <{}>".format(request.method, request.url))
        self.queue.put(request)
        self.total_request += 1
        return
    fingerprint = self._gen_fingerprint(request)
    is_repeat = self._filter_request(fingerprint, request)
    if is_repeat:
        return
    # new request: enqueue, count, and record its fingerprint
    logger.info(u"添加请求成功: [{}] <{}>".format(request.method, request.url))
    self.queue.put(request)
    self.total_request += 1
    self._filter_set.add_fp(fingerprint)
def put_request(self, request):
    """
    将请求放入待爬取队列 — enqueue *request*, deduplicating by fingerprint.

    :param request: request object to enqueue
    :return: None
    """
    # FIX: `request.filter == False` replaced with the idiomatic truthiness
    # test, matching the `not request.filter` convention used by the other
    # add_request implementations in this module.
    if not request.filter:
        self.queue.put(request)
        logger.info("重复的请求<{}>被设置为不过滤".format(request.url))
        return
    fp = self._gen_fp(request)
    if not self._filter_request(fp):
        # first sighting: enqueue and remember the fingerprint
        self.queue.put(request)
        self._filter_container.add_fp(fp)
    else:
        self.collector.incr(self.collector.repeat_request_nums_key)
        logger.info("重复的请求<{}>已经被过滤掉了,hash值为<{}>".format(request.url, fp))
def start(self):
    """Run the engine and log start/stop timestamps plus the total duration."""
    launched_at = datetime.now()
    logger.info("引擎启动时间{}".format(launched_at))
    self._start_engine()
    stopped_at = datetime.now()
    logger.info("引擎停止时间{}".format(stopped_at))
    logger.info("爬虫运行时间{}".format((stopped_at - launched_at).total_seconds()))
def get_response(self, request):
    """
    Send *request* over HTTP and wrap the reply in a framework Response.

    :param request: request carrying url/method/headers/params/data
    :return: Response built from the HTTP reply
    :raises Exception: for methods other than GET/POST
    """
    method = request.method.upper()
    if method == "GET":
        resp = requests.get(request.url,
                            headers=request.headers,
                            params=request.params)
    elif method == "POST":
        resp = requests.post(request.url,
                             headers=request.headers,
                             params=request.params,
                             data=request.data)
    else:
        raise Exception("不支持的请求方法:<{}>".format(request.method))
    logger.info("<{} {}> ".format(resp.status_code, resp.url))
    return Response(url=resp.url,
                    body=resp.content,
                    headers=resp.headers,
                    status_code=resp.status_code)
def start(self):
    """Run the engine and log start time, stop time and elapsed seconds."""
    start = datetime.now()
    logger.info("start time: {}".format(start))
    self._start_engine()
    stop = datetime.now()
    logger.info("stop time: {}".format(stop))
    elapsed = (stop - start).total_seconds()
    # FIX: corrected the "useing" typo in the log message.
    logger.info("using time: {}".format(elapsed))
def add_request(self, request):
    """Fingerprint *request* and enqueue it, deduplicating when filter is on."""
    # the fingerprint is generated before enqueueing so dedup compares
    # fingerprints, never raw urls
    fp = self._create_fp(request)
    if not request.filter:
        # dedup disabled: enqueue unconditionally
        self.queue.put(request)
        logger.info('a repetitive request is added in queue:{}'.format(
            request.url))
        return
    if self.filter_request(fp):
        # duplicate: count it instead of enqueueing
        logger.info('this is a repetitive request:{}'.format(
            request.url))
        self.collector.incr(self.collector.repeat_request_nums_key)
        return
    self.queue.put(request)
    # record the fingerprint (not the url) for future dedup checks
    self.filter_container.add_fp(fp)
def start(self):
    """Entry point for the engine: run it and log wall-clock timing."""
    t0 = datetime.now()
    logger.info("开始运行时间:%s" % t0)
    self._start_engine()
    t1 = datetime.now()
    logger.info("结束运行时间:%s" % t1)
    logger.info("耗时:%.2f" % (t1 - t0).total_seconds())
def start(self):
    """Start the whole engine; the main logic lives in _start_engine."""
    begin = datetime.now()
    logger.info('开始运行时间:%s' % begin)
    self._start_engine()
    finish = datetime.now()
    logger.info('运行结束时间:%s' % finish)
    logger.info('耗时: %.2f' % (finish - begin).total_seconds())
def _start_engine(self):
    """Spin up producer/worker tasks on the pool according to ROLE, then wait.

    master (or standalone, ROLE None): submit _start_requests once to feed
    the start_urls into the scheduler.
    slave (or standalone): submit ASYNC_MAX_COUNT worker tasks, then busy-wait
    until every scheduled request has a processed response.

    NOTE(review): nesting of the busy-wait loop under the slave branch was
    reconstructed from a whitespace-mangled source — confirm against the
    original file.
    """
    if ROLE == "master" or ROLE is None:
        # handle the requests coming from start_urls
        if ASYNC_TYPE == "coroutine":
            logger.info(u"协程正在执行...")
        else:
            logger.info(u"子线程正在执行...")
        self.pool.apply_async(self._start_requests)
        #self._start_requests()
    if ROLE == "slave" or ROLE is None:
        # concurrency level is driven by settings
        for i in range(ASYNC_MAX_COUNT):
            if ASYNC_TYPE == "coroutine":
                logger.info(u"协程正在执行...")
            else:
                logger.info(u"子线程正在执行...")
            # each worker processes scheduler requests; _callback re-submits the task
            self.pool.apply_async(self._execute_request_response_item, callback = self._callback)
        # process the scheduler's requests
        while True:
            # sleep briefly so the wait loop does not hot-spin and waste CPU
            time.sleep(0.01)
            #self._execute_request_response_item()
            # when the response counter equals the request counter all work is
            # done; the != 0 guard stops the loop from exiting before any
            # response has been processed (i.e. right at startup)
            if self.total_response == self.scheduler.total_request and self.total_response != 0:
                self.is_running = False
                break
    self.pool.close()  # stop accepting new tasks (a no-op pass for the coroutine pool)
    self.pool.join()  # main thread waits for all workers to finish
    logger.info(u"主线程执行结束")
def start(self):
    """Framework entry point: run the engine, log timing and counters, reset stats."""
    begin = datetime.now()
    logger.info("开始运行时间:%s" % begin)
    self._start_engine()
    end = datetime.now()
    logger.info("结束运行时间:%s" % end)
    logger.info("耗时:%.2f" % (end - begin).total_seconds())
    logger.info("总的请求数量:{}".format(self.collector.request_nums))
    logger.info("总的响应数量:{}".format(self.collector.response_nums))
    logger.info("重复请求数量:{}".format(self.collector.repeat_request_nums))
    # wipe the statistics counters once the run has been reported
    self.collector.clear()