def process_response(self, request, response, spider): """Only allow HTTP response types that match the given regular expressions whitelist. Each spider must define a whitelist iterable containing regular expressions whose content types the spider wishes to download. """ whitelist = getattr(spider, "whitelist", None) if not whitelist: return response content_type = response.headers.get('content-type', None) if not content_type: logging.info( "spider {}: ignored: {} does not contain a content-type header" .format(spider.name, response.url)) raise IgnoreRequest() if self.is_content_type_okay(whitelist, content_type): return response logging.info( "spider {}: ignored: {} has type {}, which was not whitelisted". format(spider.name, response.url, content_type)) raise IgnoreRequest()
def process_request(self, request, spider):
    if hasattr(request, 'meta') and 'webdriver' in request.meta \
            and request.meta['webdriver'].get('name', '') == 'selenium_grid':
        driver = self.get_driver()
        if driver:
            try:
                meta = request.meta['webdriver']
                action = meta.get('module', None)
                if not action:
                    raise IgnoreRequest(
                        'selenium grid request must have "module" item in meta')
                # Import the action module and look up its "act" callable.
                m = importlib.import_module('zeus_actions.' + action)
                f = getattr(m, 'act', None)
                if f is None or not callable(f):
                    raise IgnoreRequest(
                        'module %s must implement "act" method' % action)
                driver.get(request.url)
                f(driver)
                body = driver.page_source
                return HtmlResponse(url=request.url, body=body,
                                    request=request, encoding='utf-8')
            finally:
                # driver.close()
                pass

def process_request(self, request, spider):
    # Called for each request that goes through the downloader middleware.

    # Must either:
    # - return None: continue processing this request
    # - or return a Response object
    # - or return a Request object
    # - or raise IgnoreRequest: process_exception() methods of
    #   installed downloader middleware will be called

    # if request.url.rstrip() in self.bing_archive:
    #     raise IgnoreRequest()
    for word in self.word_to_ignore:
        if word.lower() in request.url.lower():
            raise IgnoreRequest()
    for ext in self.extensions_to_ignore:
        if request.url.lower().endswith(ext):
            raise IgnoreRequest()
    # with open(self.bing_archive_path, 'a') as f:
    #     f.write(request.url + "\n")
    self.visited_urls.append(request.url)
    return None

def process_response(self, request, response, spider): """ 要考虑两种情况,一是被封,二是ip 失效 :param request: :param response: :param spider: :return: """ proxy_str = request.meta['proxy'] proxy = ProxyItem.parse(proxy_str) # 持有的是方法,只有一个实例,所以并发时 self.proxy 应该是不准确的,需从 request 获取 code, _ = douyin.parse_result(response.body.decode()) if code == 1: proxy_manager.success(proxy) elif code == 2: proxy_manager.banned(proxy) if douyin_spider.ANONYMOUS: # 匿名则忽略并继续 ,不匿名返回处理 raise IgnoreRequest() else: return response else: proxy_manager.fail(proxy) raise IgnoreRequest() return response
def process_request(self, request, spider):  # todo pylint:disable=unused-argument
    """Process incoming request."""
    parsed_uri = urlparse(request.url)
    domain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
    if '.onion' in domain:
        if domain[-7:-1] != '.onion':
            msg = 'Ignoring request %s, not .onion domain.' % domain
            logging.info(msg)
            raise IgnoreRequest()  # Not an .onion domain
        # Drop connections to the old onion v2 addresses and other invalid domains
        if len(domain.split('.')[-2].replace('http://', '').replace('https://', '')) != 56:
            msg = 'Ignoring request %s, not v3 onion domain.' % domain
            logging.info(msg)
            raise IgnoreRequest()  # Not a valid onion v3 address
        # List of proxies available
        if parsed_uri.scheme == "https":
            # For those few HTTPS onion websites
            tor_proxy_list = settings.get('HTTPS_PROXY_TOR_PROXIES')
        else:
            # Plain-text HTTP without TLS
            tor_proxy_list = settings.get('HTTP_PROXY_TOR_PROXIES')
        # Always select the same proxy for the same onion domain.
        # This keeps only one underlying Tor circuit to the onion service.
        # Onion addresses form a uniform distribution, so the address itself
        # can be used as a seed for the random choice.
        onion_hash = '{uri.netloc}'.format(uri=parsed_uri).replace(".onion", "")
        random.seed(onion_hash)  # The onion domain seeds the randomness
        # Always select the same proxy for the same onion address
        request.meta['proxy'] = random.choice(tor_proxy_list)
    elif ".i2p" in domain and ".i2p." not in domain:
        if parsed_uri.scheme == "https":
            request.meta['proxy'] = settings.get('HTTPS_PROXY_I2P')
        else:
            request.meta['proxy'] = settings.get('HTTP_PROXY_I2P')

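# Seeding `random` with the onion hostname above makes the proxy choice
# deterministic per domain. A tiny standalone demonstration of that property
# (the proxy endpoints are placeholders, not real addresses):
import random

tor_proxy_list = ['socks5://tor-a:9050', 'socks5://tor-b:9050', 'socks5://tor-c:9050']

def pick_proxy(netloc):
    # The same seed always yields the same choice from the same list.
    random.seed(netloc.replace('.onion', ''))
    return random.choice(tor_proxy_list)

# The same onion address always maps to the same proxy, so one circuit is reused.
assert pick_proxy('exampleonionaddress.onion') == pick_proxy('exampleonionaddress.onion')
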
def process_request(self, request, spider):
    # Get meta prices for comparison
    price = request.meta.get('prop_price', False)
    discount_price = request.meta.get('discount_price', False)

    if price and discount_price:
        # It's a discounted item
        # Format prices
        price = self.Utils.format_price(price)
        discount_price = self.Utils.format_price(discount_price)

        # Check prices
        if price == 0 or discount_price == 0:
            raise IgnoreRequest('isRequestItem_price_null')
        if price == discount_price:
            raise IgnoreRequest('isRequestItem_prices_egal')

        # Check if it already exists
        if (request.url, str(discount_price), str(price)) in self.items:
            self.DB.update(request.url)
            raise IgnoreRequest('already exist')
        else:
            # Remove it just in case
            self.DB.delete_one(request.url)
    elif price or discount_price:
        # It's not a discounted item
        raise IgnoreRequest('not_discount_item')

def process_request(self, request, spider):
    path = urlparse(request.url).path or '/'
    for allow_path in spider.allow_path:
        if not match(escape(allow_path), path):
            raise IgnoreRequest("outside scope %s" % path)
    for deny_path in spider.deny_path:
        if match(escape(deny_path), path):
            raise IgnoreRequest("outside scope %s" % path)

def process_exception(self, request, exception, spider):
    # Use a proxy when an exception (e.g. a timeout) occurs
    print("\nException occurred, retrying with a proxy...\n")
    if isinstance(exception, pymysql.DatabaseError):
        raise IgnoreRequest("Database error, not handled")
    if isinstance(exception, (HttpError, DNSLookupError, TimeoutError)):
        lines = self.get_proxies()
        current_proxy = lines[random.randint(0, len(lines) - 1)].strip()
        # Attach the proxy to the current request
        print("Switching proxy to {}".format(current_proxy))
        raise IgnoreRequest("Maximum requests exceeded, {}\t skipped".format(request.url))

def process_exception(self, request, exception, spider):
    # Log when there are no new homes
    province, city = request.meta['data']
    if isinstance(exception, DNSLookupError) and isinstance(spider, XinFangSpider) and '/loupan' in request.url:
        spider.logger.error(f'{province}-{city} has no new homes...')
        raise IgnoreRequest()
        # return TextResponse(url=request.url, body='no new homes'.encode())
    # Log when there are no second-hand homes
    elif isinstance(exception, DNSLookupError) and spider.name == 'erShouFang' and '/ershoufang' in request.url:
        spider.logger.error(f'{province}-{city} has no second-hand homes...')
        raise IgnoreRequest()
    elif isinstance(exception, DNSLookupError) and spider.name == 'zuFang' and '/zufang' in request.url:
        spider.logger.error(f'{province}-{city} has no rental listings...')
        raise IgnoreRequest()

def process_response(self, request, response, spider): """Process response hook.""" allowed_domains = getattr(spider, 'allowed_domains', None) blacklist_urls = getattr(spider, 'blacklist_urls', []) url = response.url # Offsite check if not is_url_in_domains(url, allowed_domains): raise IgnoreRequest(request) # Blacklist check for bad_url in blacklist_urls: if (isinstance(bad_url, str) and url == bad_url) \ or bad_url.search(url): raise IgnoreRequest(request) return response
def process_request(self, request, spider):
    request.headers['User-Agent'] = random.choice(self.user_agents)

    # Drop requests for "other-make"
    if 'other-make' in request.url:
        raise IgnoreRequest('Other make ignored.')

    # Drop duplicate requests
    if request.url in self.requests:
        raise IgnoreRequest('Dropping duplicate {0}'.format(request.url))
    # Otherwise, add the URL to the seen set
    else:
        self.requests.add(request.url)

def process_response(self, request, response, spider):
    text = response.text
    if text.startswith("<script>") and "__jsl_clearance" in text:
        if '__jsl_clearance' in request.cookies:
            spider.log("Calculated __jsl_clearance value is wrong, ignoring this request",
                       level=WARNING)
            raise IgnoreRequest()
        try:
            key, value = get_anti_spider_clearance(text.strip()).split("=", 1)
            clearance = {key: value}
        except Exception:
            spider.log("Error calculating __jsl_clearance, ignoring this request",
                       level=WARNING)
            raise IgnoreRequest()
        else:
            return request.replace(dont_filter=True, cookies=clearance)
    return response

def process_request(self, request, spider):
    if request.meta.get('dont_cache', False):
        return

    # Skip uncacheable requests
    if not self.policy.should_cache_request(request):
        request.meta['_dont_cache'] = True  # flag as uncacheable
        return

    # Look for a cached response and check if it has expired
    cachedresponse = self.storage.retrieve_response(spider, request)
    if cachedresponse is None:
        self.stats.inc_value('httpcache/miss', spider=spider)
        if self.ignore_missing:
            self.stats.inc_value('httpcache/ignore', spider=spider)
            raise IgnoreRequest("Ignored request not in cache: %s" % request)
        return  # first-time request

    # Return the cached response only if it is not expired
    cachedresponse.flags.append('cached')
    if self.policy.is_cached_response_fresh(cachedresponse, request):
        self.stats.inc_value('httpcache/hit', spider=spider)
        return cachedresponse

    # Keep a reference to the cached response to avoid a second cache lookup
    # in the process_response hook
    request.meta['cached_response'] = cachedresponse

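# The cache middleware above is driven by project settings. A minimal
# settings.py sketch that enables Scrapy's HTTP cache and the
# "ignore missing" behaviour exercised by the IgnoreRequest branch:
HTTPCACHE_ENABLED = True
HTTPCACHE_DIR = 'httpcache'          # on-disk cache location, relative to the project
HTTPCACHE_EXPIRATION_SECS = 0        # 0 means cached responses never expire
HTTPCACHE_IGNORE_MISSING = True      # raise IgnoreRequest for requests not in the cache
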
def process_response(self, request, response, spider):
    if 'x-ignore-response' in request.url:
        raise IgnoreRequest()
    elif 'x-error-response' in request.url:
        _ = 1 / 0
    else:
        return response

def process_request(self, request, spider):
    digest = hash(request.url)
    if digest in self.visited:
        raise IgnoreRequest("Duplicated url %s" % request.url)
    else:
        self.visited.append(digest)
        return None

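# Membership tests against a list grow linearly with the number of URLs seen.
# A hypothetical variant of the same idea backed by a set, which keeps the
# duplicate check O(1) (a sketch, not the original middleware):
from scrapy.exceptions import IgnoreRequest

class SetDedupMiddleware:
    def __init__(self):
        self.visited = set()

    def process_request(self, request, spider):
        digest = hash(request.url)
        if digest in self.visited:
            raise IgnoreRequest("Duplicated url %s" % request.url)
        self.visited.add(digest)
        return None
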
def _redirect(self, redirected, request, spider, reason):
    ttl = request.meta.setdefault('redirect_ttl', self.max_redirect_times)
    redirects = request.meta.get('redirect_times', 0) + 1

    if ttl and redirects <= self.max_redirect_times:
        redirected.meta['redirect_times'] = redirects
        redirected.meta['redirect_ttl'] = ttl - 1
        redirected.meta['redirect_urls'] = request.meta.get('redirect_urls', []) + \
            [request.url]
        redirected.dont_filter = request.dont_filter
        redirected.priority = request.priority + self.priority_adjust
        logger.debug("Redirecting (%(reason)s) to %(redirected)s from %(request)s",
                     {'reason': reason, 'redirected': redirected, 'request': request},
                     extra={'spider': spider})
        return redirected
    else:
        logger.debug("Discarding %(request)s: max redirections reached",
                     {'request': request}, extra={'spider': spider})
        raise IgnoreRequest("max redirections reached")

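# In Scrapy's stock RedirectMiddleware, the two knobs used above are filled
# from project settings. A minimal settings.py sketch showing how they are
# tuned (the values here are chosen only for illustration):
REDIRECT_MAX_TIMES = 5        # cap redirect chains; exceeding it raises IgnoreRequest
REDIRECT_PRIORITY_ADJUST = 2  # bump the priority of redirected requests
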
def parse_category(self, response: HtmlResponse) -> HtmlResponse:
    """List a category and traverse its product pages."""
    products_query = response.css(
        "section#bc-sf-filter-products > div.product-grid-item")
    if not products_query:
        raise IgnoreRequest('Product items not found')
    self.logger.info(f'parse product_categories len: {len(products_query)}')

    for pdp in products_query.css('div.product-grid-item'):
        item_loader = ProductLoader(item=UrgeItem(), selector=pdp)
        item_loader.add_css('product_name', 'div.product-text > p.title::text')
        item_loader.add_css('product_brand', 'div.product-text > h2.vendor.h5::text')
        # Get the regular product price through an OR (,) of selectors.
        item_loader.add_css(
            'product_price',
            'div.product-text p.price s::text , span[itemprop="price"]::text')
        item_loader.add_css(
            'product_sale_price',
            'div.product-text p.sale span[itemprop="price"]::text')

        if 'href' in pdp.css('a').attrib:
            product_url = pdp.css('a').attrib['href']
            yield response.follow(product_url, callback=self.product_page,
                                  meta={'item': item_loader.load_item()})

def process_request(self, request, spider):
    # Called for each request that goes through the downloader middleware.

    # Must either:
    # - return None: continue processing this request
    # - or return a Response object
    # - or return a Request object
    # - or raise IgnoreRequest: process_exception() methods of
    #   installed downloader middleware will be called
    headers = {
        'Host': 'twitter.com',
        # 'User-Agent': "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        'Accept-Language': 'zh-CN,zh;q=0.9',
    }
    try:
        content = requests.get(url=request.url, headers=headers).text
    except Exception as e:
        raise IgnoreRequest(e)
    response = HtmlResponse(url=request.url, body=content,
                            request=request, encoding='utf-8')
    return response

def test_process_spider_exception(self):
    assert self.instance.counters == {'all': 0, 'error': 0}
    self.instance.save_response = mock.Mock()

    # All conditions are true
    self.instance.on_error_enabled = True
    self.instance.process_spider_exception('err-response', Exception(), self.spider)
    assert self.instance.counters == {'all': 0, 'error': 1}

    # on_error flag is disabled, skipping
    self.instance.on_error_enabled = False
    self.instance.process_spider_exception('err-response', Exception(), self.spider)
    assert self.instance.counters == {'all': 0, 'error': 1}

    # Exceeded error limit
    self.instance.on_error_enabled = True
    self.instance.counters['error'] = 11
    self.instance.process_spider_exception('err-response', Exception(), self.spider)
    assert self.instance.counters == {'all': 0, 'error': 11}

    # Skip IgnoreRequest
    self.instance.limits['error'] = 12
    self.instance.process_spider_exception('err-response', IgnoreRequest(), self.spider)
    assert self.instance.counters == {'all': 0, 'error': 11}

    # All conditions are true again
    self.instance.limits['all'] = 12
    self.instance.process_spider_exception('err-response', Exception(), self.spider)
    assert self.instance.counters == {'all': 0, 'error': 12}

def _redirect(self, redirected, request, spider, reason):
    reason = response_status_message(reason)
    redirects = request.meta.get('redirect_times', 0) + 1

    if redirects <= self.max_redirect_times:
        redirected.meta['redirect_times'] = redirects
        redirected.meta['redirect_urls'] = request.meta.get('redirect_urls', []) + \
            [request.url]
        redirected.meta['priority'] = redirected.meta['priority'] + self.priority_adjust
        self.logger.debug("Redirecting %s to %s from %s for %s times" %
                          (reason, redirected.url, request.url,
                           redirected.meta.get("redirect_times")))
        return redirected
    else:
        self.logger.info("Discarding %s: max redirections reached" % request.url)
        # Record the URL that actually failed, not the original URL
        # request.meta["url"] = request.url

        if request.meta.get("callback") == "parse":
            # For a failed category page, add one to the total page count
            self.crawler.stats.inc_total_pages(crawlid=request.meta['crawlid'])
        self.logger.error(
            "in redirect request error to failed pages url:%s, exception:%s, meta:%s" %
            (request.url, reason, request.meta))
        raise IgnoreRequest("max redirections reached:%s" % reason)

def process_request_2(self, rp, request, spider):
    if rp is not None and not rp.can_fetch(to_native_str(self._useragent), request.url):
        logger.debug("Forbidden by robots.txt: %(request)s",
                     {'request': request}, extra={'spider': spider})
        raise IgnoreRequest()

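# The method above follows the shape of Scrapy's built-in RobotsTxtMiddleware,
# which is switched on from the project settings. A minimal settings.py sketch
# enabling robots.txt enforcement (the USER_AGENT value is only an example):
ROBOTSTXT_OBEY = True
USER_AGENT = 'mybot (+https://example.com)'  # identity presented when fetching robots.txt
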
def _redirect(self, redirected, request, spider, reason): ttl = request.meta.setdefault("redirect_ttl", self.max_redirect_times) redirects = request.meta.get("redirect_times", 0) + 1 if ttl and redirects <= self.max_redirect_times: redirected.meta["redirect_times"] = redirects redirected.meta["redirect_ttl"] = ttl - 1 redirected.meta["redirect_urls"] = request.meta.get( "redirect_urls", []) + [request.url] redirected.meta["redirect_reasons"] = request.meta.get( "redirect_reasons", []) + [reason] redirected.dont_filter = request.dont_filter redirected.priority = request.priority + self.priority_adjust logger.debug( "Redirecting (%(reason)s) to %(redirected)s from %(request)s", { "reason": reason, "redirected": redirected, "request": request }, extra={"spider": spider}, ) return redirected else: logger.debug( "Discarding %(request)s: max redirections reached", {"request": request}, extra={"spider": spider}, ) raise IgnoreRequest("max redirections reached")
def process_request(self, request, spider):
    if spider.use_selenium():
        driver = SeleniumDriver(
            request.meta.get('proxy', None),
            request.meta['site_settings'].headless
        ).driver
        cookies = request.meta['site_settings'].cookies
        if len(cookies) >= 1:
            # Selenium can only add cookies to
            # the domain that it is already on
            driver.get(request.url)
            driver.delete_all_cookies()
            for cookie in cookies:
                driver.add_cookie(cookie)
        try:
            driver.get(request.url)
        except TimeoutException:
            raise IgnoreRequest()
        self._wait_for_page(driver, spider, request)
        return HtmlResponse(
            driver.current_url,
            body=driver.page_source,
            encoding='UTF-8',
            request=request
        )

def process_response(self, request, response, spider):
    # If we hit an ignorable error (eg. "payment required") then ignore the request
    if response.status in self.ignore_http_codes:
        raise IgnoreRequest(
            "Skipping page which returned a status code that we ignore.")

    # If we hit a 'service unavailable' error then increase the delay
    if response.status in self.delay_http_codes:
        self.delay_interval += self.delay_increment
        spider.logger.info(
            "Too many requests - server returned an error (code {}). "
            "Adding {:.2f}s delay to future requests. "
            "Current delay interval is {:.2f}s".format(
                response.status, self.delay_increment, self.delay_interval))
        self.num_responses = 0
    # If we manage to hit 'num_responses_threshold' responses in a row
    # without problems then reduce the delay
    else:
        self.num_responses += 1
        if self.delay_interval and self.num_responses >= self.num_responses_threshold:
            self.delay_interval = max(self.delay_interval - self.delay_increment, 0)
            spider.logger.info(
                "Made {} requests without a server error. "
                "Reducing delay for future requests by {:.2f}s. "
                "Current delay interval is {:.2f}s".format(
                    self.num_responses, self.delay_increment, self.delay_interval))
            self.num_responses = 0

    # Wait if the delay is non-zero
    if self.delay_interval:
        time.sleep(self.delay_interval)

    return super().process_response(request, response, spider)

def process_request(self, request, spider):
    # print(request.url)
    if len(self.crawled_urls) % 100 == 0:
        print("Crawled sets: ", len(self.crawled_urls))

    if request.url in self.crawled_urls:
        print("Duplicate request", request.url)
        raise IgnoreRequest()
    elif any(x in request.url for x in self.deny_url_contains):
        raise IgnoreRequest()
    else:
        self.crawled_urls.add(request.url)
        return None

def process_request(self, request, spider):
    if request.meta.get('captcha_request', False):
        return
    if self.paused:
        self.queue.append((request, spider))
        raise IgnoreRequest('Crawling paused, because CAPTCHA is '
                            'being solved')

def process_response(self, request, response, spider):
    # Handle the downloaded response.
    # Filter out every 3xx response except 304.
    http_code = response.status
    if http_code // 100 == 2:
        global count
        count = 0
        return response
    if http_code // 100 == 3 and http_code != 304:
        # Get the redirected URL
        # url = response.headers['location']
        # domain = urlparse.urlparse(url).netloc
        # Check whether the redirected URL's domain is in allowed_domains
        # if domain in spider.allowed_domains:
        #     return Request(url=url, meta=request.meta)
        # else:
        if count == 1:
            sendMessage_warning()
        # print(count)
        count += 1
        # Send the request back to the downloader
        return request.replace(dont_filter=True)
    if http_code // 100 == 4:
        # Note: 403 is not a response error, it means access is forbidden
        raise IgnoreRequest(u'404')
    if http_code // 100 == 5:
        return request.replace(dont_filter=True)

def process_request(self, request, spider):
    url = request.url
    # md5 requires bytes, so encode the URL before hashing
    key = hashlib.md5(url.encode('utf-8')).hexdigest()
    info = self.mongo.find_one({'key': key})
    if info:
        logger.warning("ignore repeat url: %s" % request.url)
        raise IgnoreRequest("ignore repeat url: %s" % request.url)

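# The snippet above only checks for an existing key; something has to record
# the key after a successful crawl. A hypothetical companion method for the
# same middleware, using a unique index so concurrent inserts of the same URL
# cannot race (self.mongo is carried over from the snippet, not a known codebase):
import hashlib
from pymongo.errors import DuplicateKeyError
from scrapy.exceptions import IgnoreRequest

def process_response(self, request, response, spider):
    key = hashlib.md5(request.url.encode('utf-8')).hexdigest()
    # Assumes the collection was prepared once with:
    #   self.mongo.create_index('key', unique=True)
    try:
        self.mongo.insert_one({'key': key, 'url': request.url})
    except DuplicateKeyError:
        # Another worker recorded this URL first; treat it as already seen.
        raise IgnoreRequest("ignore repeat url: %s" % request.url)
    return response
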
def process_request(self, request, spider: CollectAnnotableUrlsSpider):
    parsed_url = urlparse(request.url)
    domain = parsed_url.netloc
    domain_without_www = domain.replace('www.', '')

    if domain_without_www in self.limit_per_domain:
        max_visits_domain = self.limit_per_domain[domain_without_www] * MAX_VISITED_REQUIRED_MATCHED_RATIO
        limit_by_visits_reached = self.counter_visited.get(
            domain_without_www, 0) >= max_visits_domain
        limit_by_matches_reached = self.counter_matched.get(
            domain_without_www, 0) >= self.limit_per_domain[domain_without_www]

        if limit_by_visits_reached or limit_by_matches_reached:
            spider.logger.info(
                f'process_request() filtering request for domain {domain_without_www}: '
                f'visited: {self.counter_visited[domain_without_www]}, '
                f'matched: {self.counter_matched[domain_without_www]}')
            raise IgnoreRequest()

        spider.logger.info(
            f'process_request() accepting request for domain {domain_without_www}: '
            f'visited: {self.counter_visited[domain_without_www]}, '
            f'matched: {self.counter_matched[domain_without_www]}')
    return None

def process_request(self, request, spider):
    if not request.url:
        return None
    channel_id = request.meta.get('channel_id', 0)
    if is_dup_detail(request.url, spider.name, channel_id):
        raise IgnoreRequest("Spider: %s, DeDuplicationRequest: %s" %
                            (spider.name, request.url))