def build_check_request(self, item):
    """Create the request used to verify a single proxy.

    The target URL template and the response parser are obtained from
    ``self.get_check_approach`` for the proxy's scheme. The proxy URL, retry
    and timeout settings, the original item, a start timestamp and the
    parser are all stored in the request's ``meta``.

    :param item: proxy record providing ``get('scheme')`` and ``get('url')``.
    :return: a ``Request`` with ``self.check_ip`` as callback; when this
        spider is named ``'checker'`` it also gets ``self.check_ip_failed``
        as errback.
    """
    proxy_scheme = item.get('scheme')
    proxy_address = item.get('url')

    debug_message = 'Checking proxy: %s' % proxy_address
    self.logger.debug(debug_message)

    url_template, parser = self.get_check_approach(proxy_scheme)

    request = Request(
        url_template.format(scheme=proxy_scheme),
        callback=self.check_ip,
        meta={
            'proxy': proxy_address,
            'max_retry_times': 3,
            'download_timeout': 20,
            '_item_obj': item,
            # Timestamp taken at build time so the check duration can be
            # derived later from the request's meta.
            '_start_time': time.time(),
            '_response_parser': parser,
        },
        dont_filter=True,
    )

    # Only the dedicated checker spider reports failed checks.
    if self.name == 'checker':
        request.errback = self.check_ip_failed
    return request
def build_check_request(self, item: Proxy):
    """Create the request used to verify a single proxy.

    The target URL template and the response parser are obtained from
    ``self.get_check_approach`` for the proxy's scheme. The download timeout
    is read from ``config.CHECK_TIMEOUT`` and falls back to 20 when that
    attribute is not defined.

    :param item: proxy record providing ``get('scheme')`` and ``get('url')``.
    :return: a ``Request`` with ``self.check_ip`` as callback; when this
        spider is named ``'checker'`` it also gets ``self.check_ip_failed``
        as errback.
    """
    proxy_scheme = item.get('scheme')
    proxy_address = item.get('url')

    debug_message = 'Checking %s' % proxy_address
    self.logger.debug(debug_message)

    url_template, parser = self.get_check_approach(proxy_scheme)
    check_url = url_template.format(scheme=proxy_scheme)

    request_meta = {
        'proxy': proxy_address,
        'max_retry_times': 5,
        'download_timeout': getattr(config, 'CHECK_TIMEOUT', 20),
        '_item_obj': item,
        '_response_parser': parser,
    }
    request = Request(
        check_url,
        callback=self.check_ip,
        meta=request_meta,
        dont_filter=True,
    )

    # Only the dedicated checker spider reports failed checks.
    if self.name == 'checker':
        request.errback = self.check_ip_failed
    return request
def next_request(self):
    """Pop the next queued item and convert it into a ``Request``.

    ``self.queue.pop`` is called with ``self.idle_before_close`` as its
    timeout argument. The popped item is expected to be a mapping with at
    least ``'url'`` and ``'meta'`` keys; ``'callback'``, ``'errback'`` and
    ``'useragent'`` are optional.

    :return: the built ``Request``, or ``None`` when the pop yielded nothing.
    :raises ValueError: if the URL is still rejected after an ``http://``
        scheme has been prepended.
    :raises KeyError: if the item lacks ``'url'`` or ``'meta'``.
    """
    import logging

    item = self.queue.pop(self.idle_before_close)
    if not item:
        return None

    try:
        req = Request(item['url'])
    except ValueError:
        # Request needs an absolute URL. Retrying the identical call could
        # never succeed, so assume plain http for scheme-less URLs.
        # TODO: better url validation here.
        req = Request('http://' + item['url'])

    log = logging.getLogger(__name__)
    for hook in ('callback', 'errback'):
        if hook not in item or item[hook] is None:
            continue
        try:
            # The item carries the handler as a method name on the spider.
            setattr(req, hook, getattr(self.spider, item[hook]))
        except AttributeError:
            # Best effort: keep the request with its default handler, but
            # say which method could not be resolved.
            log.warning(
                'Unable to find %s method %r on spider', hook, item[hook]
            )

    # Defaults for fields that are not part of the schema.
    if 'curdepth' not in item:
        item['curdepth'] = 0
    if 'retry_times' not in item:
        item['retry_times'] = 0

    item_meta = item['meta']
    # NOTE(review): 'field_css' first receives the whole meta mapping and is
    # only narrowed when meta itself has a 'field_css' entry. Kept as-is;
    # confirm this fallback is intentional.
    req.meta['field_css'] = item_meta
    if 'item' in item_meta:
        req.meta['item'] = item_meta['item']
    if 'field_css' in item_meta:
        req.meta['field_css'] = item_meta['field_css']

    # Extra check to add a per-item user agent to the request.
    if 'useragent' in item and item['useragent'] is not None:
        req.headers['User-Agent'] = item['useragent']

    return req