def should_follow(self, response, request):
    """Only follow offsite links to JS and CSS files, not new pages."""
    res_url_data = urlparse_cached(response)
    req_url_data = urlparse_cached(request)
    if has_extension(request, "js") or has_extension(request, "css"):
        return True
    # Otherwise, ensure that the domains share the same root origin
    return req_url_data.netloc == res_url_data.netloc
def test_urlparse_cached(self):
    url = "http://www.example.com/index.html"
    request1 = Request(url)
    request2 = Request(url)
    req1a = urlparse_cached(request1)
    req1b = urlparse_cached(request1)
    req2 = urlparse_cached(request2)
    urlp = urlparse.urlparse(url)

    assert req1a == req2
    assert req1a == urlp
    assert req1a is req1b
    assert req1a is not req2
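# A minimal, hedged usage sketch of what the test above exercises: urlparse_cached
# memoizes the parse result on the Request object, so repeated calls for the same
# request return the identical object, while the value still equals a plain
# urlparse() of the URL. Assumes a stock Scrapy installation.
from urllib.parse import urlparse

from scrapy.http import Request
from scrapy.utils.httpobj import urlparse_cached

req = Request("http://www.example.com/index.html")
assert urlparse_cached(req) is urlparse_cached(req)  # cached per request object
assert urlparse_cached(req) == urlparse("http://www.example.com/index.html")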
def process_request(self, request, spider):
    hostname = urlparse_cached(request).hostname
    if spider.domain_name == 's3.amazonaws.com' \
            or (hostname and hostname.endswith('s3.amazonaws.com')):
        request.headers['Date'] = time.strftime("%a, %d %b %Y %H:%M:%S GMT",
                                                time.gmtime())
        sign_request(request, self.access_key, self.secret_key)
def robot_parser(self, request, spider):
    url = urlparse_cached(request)
    netloc = url.netloc

    if netloc not in self._parsers:
        self._parsers[netloc] = Deferred()
        robotsurl = "%s://%s/robots.txt" % (url.scheme, url.netloc)
        robotsreq = Request(
            robotsurl,
            priority=self.DOWNLOAD_PRIORITY,
            meta={'dont_obey_robotstxt': True}
        )
        dfd = self.crawler.engine.download(robotsreq, spider)
        dfd.addCallback(self._parse_robots, netloc)
        dfd.addErrback(self._logerror, robotsreq, spider)
        dfd.addErrback(self._robots_error, netloc)

    if isinstance(self._parsers[netloc], Deferred):
        d = Deferred()

        def cb(result):
            d.callback(result)
            return result

        self._parsers[netloc].addCallback(cb)
        return d
    else:
        return self._parsers[netloc]
def download_request(self, request, spider): scheme = urlparse_cached(request).scheme try: handler = self._handlers[scheme] except KeyError: raise NotSupported("Unsupported URL scheme '%s' in: <%s>" % (scheme, request.url)) return handler(request, spider)
def warcrec_from_scrapy_request(self, request):
    headers = request.headers
    body = request.body

    parsed = urlparse_cached(request)
    scheme, netloc, host, port, path = _parsed_url_args(parsed)

    headers.setdefault('Host', netloc)
    if body is not None and len(body) > 0:
        headers['Content-Length'] = len(body)
    headers.setdefault("Connection", "close")

    # Compile the request using buf
    buf = StringIO()
    buf.write('%s %s HTTP/1.0\r\n' % (request.method, path))
    for name, values in headers.items():
        for value in values:
            buf.write('%s: %s\r\n' % (name, value))
    buf.write('\r\n')
    if body is not None:
        buf.write(body)
    request_str = buf.getvalue()

    return warcrecords.WarcRequestRecord(url=request.url, block=request_str)
def _set_connection_attributes(self, request):
    parsed = urlparse_cached(request)
    self.scheme, self.netloc, self.host, self.port, self.path = _parsed_url_args(parsed)
    proxy = request.meta.get('proxy')
    if proxy:
        self.scheme, _, self.host, self.port, _ = _parse(proxy)
        self.path = self.url
def download_request(self, request, spider):
    scheme = urlparse_cached(request).scheme
    handler = self._get_handler(scheme)
    if not handler:
        raise NotSupported("Unsupported URL scheme '%s': %s" %
                           (scheme, self._notconfigured[scheme]))
    return handler.download_request(request, spider)
def download_request(self, request, spider):
    p = urlparse_cached(request)
    scheme = 'https' if request.meta.get('is_secure') else 'http'
    bucket = p.hostname
    path = p.path + '?' + p.query if p.query else p.path
    url = '{0!s}://{1!s}.s3.amazonaws.com{2!s}'.format(scheme, bucket, path)
    if self.anon:
        request = request.replace(url=url)
    elif self._signer is not None:
        import botocore.awsrequest
        awsrequest = botocore.awsrequest.AWSRequest(
            method=request.method,
            url='{0!s}://s3.amazonaws.com/{1!s}{2!s}'.format(scheme, bucket, path),
            headers=request.headers.to_unicode_dict(),
            data=request.body)
        self._signer.add_auth(awsrequest)
        request = request.replace(
            url=url, headers=awsrequest.headers.items())
    else:
        signed_headers = self.conn.make_request(
            method=request.method,
            bucket=bucket,
            key=unquote(p.path),
            query_args=unquote(p.query),
            headers=request.headers,
            data=request.body)
        request = request.replace(url=url, headers=signed_headers)
    return self._download_http(request, spider)
def download_request(self, request, spider): p = urlparse_cached(request) scheme = "https" if request.meta.get("is_secure") else "http" url = "%s://%s.s3.amazonaws.com%s" % (scheme, p.hostname, p.path) httpreq = request.replace(url=url) self.conn.add_aws_auth_header(httpreq.headers, httpreq.method, "%s/%s" % (p.hostname, p.path)) return self._download_http(httpreq, spider)
def add_cookie_header(self, request):
    wreq = WrappedRequest(request)
    self.policy._now = self.jar._now = int(time.time())

    # the cookiejar implementation iterates through all domains
    # instead we restrict to potential matches on the domain
    req_host = urlparse_cached(request).hostname
    if not IPV4_RE.search(req_host):
        hosts = potential_domain_matches(req_host)
        if req_host.find(".") == -1:
            hosts += [req_host + ".local"]
    else:
        hosts = [req_host]

    cookies = []
    for host in hosts:
        if host in self.jar._cookies:
            cookies += self.jar._cookies_for_domain(host, wreq)

    attrs = self.jar._cookie_attrs(cookies)
    if attrs:
        if not wreq.has_header("Cookie"):
            wreq.add_unredirected_header("Cookie", "; ".join(attrs))

    self.processed += 1
    if self.processed % self.check_expired_frequency == 0:
        # This is still quite inefficient for a large number of cookies
        self.jar.clear_expired_cookies()
def process_request(self, request, spider):
    for p in self.no_proxy_patterns:
        if p.search(request.url):
            return
    retries = request.meta.get('retry_times', None)
    # do not touch requests whose proxy has already been set manually
    if 'proxy' in request.meta:
        if retries is None:
            return
        # once the maximum retry count is reached, fall back to a direct
        # (non-proxied) connection so a failing request always gets one
        # attempt from the local machine.
        if retries == self.max_retry_times:
            now = time.time()
            should_sleep = self.local_interval - (now - self.local_last_use_time)
            if should_sleep > 0:
                log.msg('ProxyMiddleware:use proxy fail,local sleep %s' % should_sleep, log.DEBUG)
                time.sleep(should_sleep)
            return

    parsed = urlparse_cached(request)
    scheme = parsed.scheme
    # 'no_proxy' is only supported by http schemes
    if scheme in ('http', 'https') and proxy_bypass(parsed.hostname):
        return
    self._set_proxy(request, scheme)
def process_spider_output(self, response, result, spider):
    for res in result:
        if isinstance(res, Request):
            parsed = urlparse_cached(res)
            if parsed.query:
                parsed = parsed._replace(query=_filter_query(parsed.query, self.remove, self.keep))
                res = res.replace(url=parsed.geturl())
        yield res
def download_request(self, request, spider):
    scheme = urlparse_cached(request).scheme
    try:
        handler = self._handlers[scheme]
    except KeyError:
        msg = self._notconfigured.get(scheme, "no handler available for that scheme")
        raise NotSupported("Unsupported URL scheme '%s': %s" % (scheme, msg))
    return handler(request, spider)
def should_follow(self, request, spider):
    # Custom code to update regex
    self.update_regex(spider)
    regex = self.host_regex
    # hostname can be None for wrong urls (like javascript links)
    host = urlparse_cached(request).hostname or ''
    return bool(regex.search(host))
def parse(self, response):
    ld = self._load_webpage_item(response, is_seed=response.meta['is_seed'])
    if self.use_splash:
        self._process_splash_response(response, ld)
    yield ld.load_item()

    if self.finishing:
        return

    now = datetime.utcnow()
    if self.operating_time > 0 and (now - self.start_time).total_seconds() > self.operating_time:
        log.msg("Reached operating time constraint. Waiting for Scrapy queue to exhaust.")
        self.finishing = True
        self.crawler.stop()
        return

    if not isinstance(response, TextResponse):
        return

    body = response.body_as_unicode().strip().encode('utf8') or '<html/>'
    score = self.ranker.score_html(body)
    log.msg("TC: %s has score=%f" % (response.url, score), _level=log.DEBUG)
    if score > 0.5:
        #! for some reason this is returning the raw splash response JSON
        #! and not the rendered HTML from splash
        #log.msg(u"\n\n\n****---Response body:\n %s----***\n\n\n" % response.body_as_unicode(), _level=log.DEBUG)
        #for link in self.linkextractor.extract_links(response):
        # can something like the line below fix it? Seems like a hack...
        for link in self.linkextractor.extract_links(response):
            log.msg("****---LINK EXTRACTED: %s----***" % str(link.url), _level=log.DEBUG)
            if self.use_splash:
                r = self._splash_request(url=link.url)
            else:
                r = Request(url=link.url)
            external = is_external_url(response.url, link.url)
            depth = response.meta.get('link_depth', 0)
            r.meta.update({
                'link': {
                    'url': link.url,
                    'text': link.text,
                    'fragment': link.fragment,
                    'nofollow': link.nofollow},
                'link_depth': 0 if external else depth + 1,
                'referrer_depth': depth,
                'referrer_url': response.url,
            })
            url_parts = urlparse_cached(r)
            path_parts = url_parts.path.split('/')
            r.meta['score'] = 1.0 / len(path_parts)
            r.meta['is_seed'] = False
            yield r
def should_cache_request(self, request):
    if urlparse_cached(request).scheme in self.ignore_schemes:
        return False
    cc = self._parse_cachecontrol(request)
    # obey user-agent directive "Cache-Control: no-store"
    if b'no-store' in cc:
        return False
    # Any other is eligible for caching
    return True
def _get_slot_key(self, request, spider): if "download_slot" in request.meta: return request.meta["download_slot"] key = urlparse_cached(request).hostname or "" if self.ip_concurrency: key = dnscache.get(key, key) return key
def _get_slot_key(self, request, spider):
    if self.DOWNLOAD_SLOT in request.meta:
        return request.meta[self.DOWNLOAD_SLOT]

    key = urlparse_cached(request).hostname or ''
    if self.ip_concurrency:
        key = dnscache.get(key, key)

    return key
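# Hedged sketch of the behaviour the two _get_slot_key variants above rely on:
# a request can be pinned to a named download slot through request.meta, and the
# hostname is only used as a fallback. The slot name "slow-pool" is made up here.
from scrapy.http import Request

req = Request("http://www.example.com/page", meta={"download_slot": "slow-pool"})
# _get_slot_key(req, spider) would return "slow-pool" instead of "www.example.com".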
def process_request(self, request, spider): """Process a spider request.""" if request.dont_filter or self.should_follow(request, spider): return None else: domain = urlparse_cached(request).hostname logging.debug("Filtered offsite request to %(domain)r: %(request)s" % {"domain": domain, "request": request}) raise IgnoreRequest
def should_follow(self, request, spider):
    referer = request.headers.get('Referer', '')
    refhost = urlparse(referer).hostname or ''
    if refhost:
        referer_ok = bool(self.follow_regex.search(refhost))
        if not referer_ok:
            return False
    # hostname can be None for wrong urls (like javascript links)
    host = urlparse_cached(request).hostname or ''
    return bool(self.host_regex.search(host))
def process_request(self, request):
    request.meta['download_timeout'] = 60

    parsed = urlparse_cached(request)
    scheme = parsed.scheme
    # 'no_proxy' is only supported by http schemes
    if scheme in ('http', 'https') and proxy_bypass(parsed.hostname):
        return
    if scheme in self.proxies:
        self._set_proxy(request, scheme)
def download_request(self, request, spider): parsed_url = urlparse_cached(request) user = request.meta.get("ftp_user", self.default_user) password = request.meta.get("ftp_password", self.default_password) passive_mode = 1 if bool(request.meta.get("ftp_passive", self.passive_mode)) else 0 creator = ClientCreator(reactor, FTPClient, user, password, passive=passive_mode) return creator.connectTCP(parsed_url.hostname, parsed_url.port or 21).addCallback(self.gotClient, request, unquote(parsed_url.path))
def should_follow(self, request, spider):
    allowed_regex = self.host_allowed_regex
    disallowed_regex = self.host_disallowed_regex
    # hostname can be None for wrong urls (like javascript links)
    host = urlparse_cached(request).hostname or ''
    allowed_res = bool(allowed_regex.search(host))
    disallowed_res = not bool(disallowed_regex.search(host))
    # log.msg('{0}:{1} allow?{2} disallow?{3}'.format(request.url, host, allowed_res, disallowed_res))
    return allowed_res and disallowed_res
def robot_parser(self, request, spider):
    url = urlparse_cached(request)
    netloc = url.netloc
    if netloc not in self._parsers:
        self._parsers[netloc] = None
        robotsurl = "%s://%s/robots.txt" % (url.scheme, url.netloc)
        robotsreq = Request(robotsurl, priority=self.DOWNLOAD_PRIORITY)
        dfd = self.crawler.engine.download(robotsreq, spider)
        dfd.addCallback(self._parse_robots)
        self._spider_netlocs[spider].add(netloc)
    return self._parsers[netloc]
def _parse_robots(self, response, spider):
    if response.status != 200:
        return
    rules = RobotRules(
        url=response.url,
        status=response.status,
        content=response.body_as_unicode(),
        expires=None
    )
    self._robot_rules[urlparse_cached(response).netloc] = rules
    self._adjust_delay(rules, response, spider)
def _load_keys(self, requests, item):
    # Preload file paths into the requests because we use the item data to
    # generate the path.
    for req in requests:
        pr = urlparse_cached(req)
        # filename is last part of the URL path.
        image = pr.path.rpartition('/')[-1]
        req.meta['file_path'] = '/{slide_id}/{image}'.format(
            spider=item['spider'],
            slide_id=item['id'],
            image=image,
        )
def _get_slot(self, request, spider):
    key = urlparse_cached(request).hostname or ''
    if self.ip_concurrency:
        key = dnscache.get(key, key)

    if key not in self.slots:
        if self.ip_concurrency:
            concurrency = self.ip_concurrency
        else:
            concurrency = self.domain_concurrency
        concurrency, delay = _get_concurrency_delay(concurrency, spider, self.settings)
        self.slots[key] = Slot(concurrency, delay, self.settings)

    return key, self.slots[key]
def robotstxt(self, request, spider):
    url = urlparse_cached(request)
    if url.netloc not in self._robot_rules:
        self._robot_rules[url.netloc] = None
        req = Request(
            get_robotstxt_url(url),
            priority=self.DOWNLOAD_PRIORITY,
            meta={'dont_process_robots': True}
        )
        dfd = self.crawler.engine.download(req, spider)
        dfd.addCallback(self._parse_robots, spider=spider)
    return self._robot_rules[url.netloc]
def process_spider_output(self, response, result, spider):
    for req in result:
        if isinstance(req, Request):
            if req.dont_filter or self.should_follow(response, req):
                yield req
            else:
                domain = urlparse_cached(req).hostname
                if domain and domain not in self.domains_seen[spider]:
                    log.msg("Filtered offsite request to %r: %s" % (domain, req),
                            level=log.DEBUG, spider=spider)
                    self.domains_seen[spider].add(domain)
        else:
            yield req
def request_httprepr(request):
    """Return the raw HTTP representation (as bytes) of the given request.
    This is provided only for reference since it's not the actual stream of
    bytes that will be sent when performing the request (that's controlled
    by Twisted).
    """
    parsed = urlparse_cached(request)
    path = urlunparse(('', '', parsed.path or '/', parsed.params, parsed.query, ''))
    s = to_bytes(request.method) + b" " + to_bytes(path) + b" HTTP/1.1\r\n"
    s += b"Host: " + to_bytes(parsed.hostname or b'') + b"\r\n"
    if request.headers:
        s += request.headers.to_string() + b"\r\n"
    s += b"\r\n"
    s += request.body
    return s
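# Hedged usage sketch for request_httprepr above (in stock Scrapy it lives in
# scrapy.utils.request; the snippet above may be a local copy). The byte string
# in the comment is what the code above produces for a header-less GET request
# with an empty body.
from scrapy.http import Request
from scrapy.utils.request import request_httprepr

print(request_httprepr(Request("http://www.example.com/some/page.html?arg=1")))
# b'GET /some/page.html?arg=1 HTTP/1.1\r\nHost: www.example.com\r\n\r\n'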
def _parse_robots(self, response):
    rp = robotparser.RobotFileParser(response.url)
    body = ''
    if hasattr(response, 'body_as_unicode'):
        body = response.body_as_unicode()
    else:  # last effort try
        try:
            body = response.body.decode('utf-8')
        except UnicodeDecodeError:
            # If we found garbage, disregard it,
            # but keep the lookup cached (in self._parsers).
            # Running rp.parse() below will move the rp state from
            # 'disallow all' to 'allow any'.
            pass
    rp.parse(body.splitlines())
    self._parsers[urlparse_cached(response).netloc] = rp
def _get_slot(self, request, spider):
    key = urlparse_cached(request).hostname or ''
    if self.ip_concurrency:
        key = dnscache.get(key, key)

    if key not in self.slots:
        if key in self.inactive_slots:
            self.slots[key] = self.inactive_slots.pop(key)
        else:
            if self.ip_concurrency:
                concurrency = self.ip_concurrency
            else:
                concurrency = self.domain_concurrency
            concurrency, delay = _get_concurrency_delay(
                concurrency, spider, self.settings)
            self.slots[key] = Slot(concurrency, delay, self.settings)

    return key, self.slots[key]
def process_request(self, request, spider):
    creds, proxy_url = None, None
    if 'proxy' in request.meta:
        if request.meta['proxy'] is not None:
            creds, proxy_url = self._get_proxy(request.meta['proxy'], '')
    elif self.proxies:
        parsed = urlparse_cached(request)
        scheme = parsed.scheme
        if (
            (
                # 'no_proxy' is only supported by http schemes
                scheme not in ('http', 'https')
                or not proxy_bypass(parsed.hostname)
            )
            and scheme in self.proxies
        ):
            creds, proxy_url = self.proxies[scheme]

    self._set_proxy_and_creds(request, proxy_url, creds)
def process_spider_output(self, response, result, spider):
    for x in result:
        if isinstance(x, Request):
            if x.dont_filter or self.should_follow(x, spider):
                yield x
            else:
                domain = urlparse_cached(x).hostname
                if domain and domain not in self.domains_seen:
                    self.domains_seen.add(domain)
                    logger.debug(
                        "Filtered offsite request to %(domain)r: %(request)s",
                        {'domain': domain, 'request': x},
                        extra={'spider': spider})
                    self.stats.inc_value('offsite/domains', spider=spider)
                self.stats.inc_value('offsite/filtered', spider=spider)
        else:
            yield x
def download_request(self, request, spider):
    from twisted.internet import reactor
    parsed_url = urlparse_cached(request)
    user = request.meta.get("ftp_user", self.default_user)
    password = request.meta.get("ftp_password", self.default_password)
    passive_mode = (1 if bool(
        request.meta.get("ftp_passive", self.passive_mode)) else 0)
    creator = ClientCreator(reactor, FTPClient, user,
                            password, passive=passive_mode)
    dfd = creator.connectTCP(parsed_url.hostname, parsed_url.port or 21)
    return dfd.addCallback(self.gotClient, request, unquote(parsed_url.path))
def process_request(self, request, spider):
    # check whether proxying is enabled for this request
    if not self._is_enabled_for_request(request):
        return

    # get the URL scheme of the request
    parsed = urlparse_cached(request)
    scheme = parsed.scheme

    # number of usable proxies for this scheme
    if self.len_valid_proxy(scheme) > 0:
        self.set_proxy(request, scheme)
        # if 'download_timeout' not in request.meta:
        request.meta['download_timeout'] = self.download_timeout
    else:
        # no usable proxy, connect directly
        if 'proxy' in request.meta:
            del request.meta['proxy']
def process_spider_output(self, response, result, spider):
    # We could have images and PDFs here - they don't generate any links
    if isinstance(response, TextResponse):
        for x in result:
            if isinstance(x, Request):
                if x.dont_filter or self.should_follow(x, spider):
                    yield x
                else:
                    domain = urlparse_cached(x).hostname
                    if domain and domain not in self.domains_seen:
                        self.domains_seen.add(domain)
                        log.msg(format="Filtered offsite request to %(domain)r: %(request)s",
                                level=log.DEBUG, spider=spider, domain=domain, request=x)
                        self.stats.inc_value('offsite/domains', spider=spider)
                    self.stats.inc_value('offsite/filtered', spider=spider)
            else:
                yield x
def process_spider_output(self, response, result, spider):
    for x in result:
        if isinstance(x, Request):
            if x.dont_filter or self.should_follow(x, spider):
                yield x
            else:
                domain = urlparse_cached(x).hostname
                if domain and domain not in self.domains_seen[spider]:
                    self.domains_seen[spider].add(domain)
                    log.msg(
                        format="Filtered offsite request to %(domain)r: %(request)s",
                        level=log.DEBUG, spider=spider, domain=domain, request=x)
        else:
            yield x
def process_request(self, request, spider):
    # update proxies
    global count
    count += 1
    if count % 100 == 0:
        count = 1
        self._update_proxies()

    if 'direct_connect' in request.meta:
        value = request.meta['direct_connect']
        del request.meta['direct_connect']
        if value:
            if 'proxy' in request.meta:
                del request.meta['proxy']
            logger.debug('HTTP_PROXY-->Direct')
            return

    # change proxy
    if 'proxy' in request.meta:
        proxy_url = None
        creds = None
        if request.meta['proxy'] is None:
            self._set_proxy(request)
            return
        else:
            creds, proxy_url = self._get_proxy(request.meta['proxy'], '')
        request.meta['proxy'] = proxy_url
        if creds and not request.headers.get('Proxy-Authorization'):
            request.headers['Proxy-Authorization'] = b'Basic ' + creds
        return
    elif not self.proxies:
        # local ip
        return

    parsed = urlparse_cached(request)
    scheme = parsed.scheme
    # 'no_proxy' is only supported by http schemes
    if scheme in ('http', 'https') and proxy_bypass(parsed.hostname):
        return

    # add proxy
    self._set_proxy(request)
def download_request(self, request, spider):
    p = urlparse_cached(request)
    scheme = 'https' if request.meta.get('is_secure') else 'http'
    bucket = p.hostname
    path = p.path + '?' + p.query if p.query else p.path
    url = f'{scheme}://{bucket}.s3.amazonaws.com{path}'
    if self.anon:
        request = request.replace(url=url)
    else:
        import botocore.awsrequest
        awsrequest = botocore.awsrequest.AWSRequest(
            method=request.method,
            url=f'{scheme}://s3.amazonaws.com/{bucket}{path}',
            headers=request.headers.to_unicode_dict(),
            data=request.body)
        self._signer.add_auth(awsrequest)
        request = request.replace(
            url=url, headers=awsrequest.headers.items())
    return self._download_http(request, spider)
def process_request(self, request: Request, spider):
    if 'proxy' in request.meta:
        if request.meta['proxy'] is None:
            return
        creds, proxy_url = self._get_proxy(request.meta['proxy'], '')
        request.meta['proxy'] = proxy_url
        if creds and not request.headers.get('Proxy-Authorization'):
            request.headers['Proxy-Authorization'] = b'Basic ' + creds
        return
    elif not self.proxies:
        return

    parsed = urlparse_cached(request)
    scheme = parsed.scheme
    if scheme in ('http', 'https') and proxy_bypass(parsed.hostname):
        return
    if scheme in self.proxies:
        self._set_proxy(request, scheme)
def __call__(self, request):
    url = urlparse_cached(request)
    ret = True
    for name, regex in self._url_regexes.items():
        attribute = str(getattr(url, name, ''))
        if regex is not None and not regex.match(attribute):
            ret = False
            break
    if ret:  # still True after above loop
        if (self._source_anchor_regex is not None and
                not self._source_anchor_regex.match(request.meta['source_anchor'].lower())):
            ret = False
        elif self.max_depth is not None and request.meta['depth'] > self.max_depth:
            ret = False
        elif (self.max_hops_from_seed is not None and
                request.meta['hops_from_seed'] > self.max_hops_from_seed):
            ret = False
        else:
            # test all regexes in query_regexes, ignoring unknown query args from URL
            query = urllib.parse.parse_qs(url.query)
            for k, regex in self._query_regexes.items():
                if not ret:
                    break  # also break outer loop if we break out of inner one below
                try:
                    l = query[k]
                except KeyError:
                    # all keys must be present
                    ret = False
                    break
                else:
                    # key present, test all args
                    for a in l:
                        if not regex.match(a):
                            ret = False
                            break  # inner loop
    # return, inverting if necessary
    return ret if not self.invert else not ret
def is_excluded(self, request):
    """Return whether the request is excluded (due to exclusion rules)."""
    # Build a string to match against, containing the netloc and path, and
    # if present, the query and fragment as well.
    url = urlparse_cached(request)
    match_against = url.netloc + url.path
    if url.query != '':
        match_against += "?" + url.query
    if url.fragment != '':
        match_against += "#" + url.fragment

    for rule in self.exclusion_rules:
        if isinstance(rule, str):
            # Do case-insensitive substring search
            if match_against.lower().find(rule.lower()) != -1:
                return True
        else:
            # Do regex search against the URL
            if rule.search(match_against) is not None:
                return True
    return False
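# Hedged sketch of the two rule kinds is_excluded() above accepts: plain strings
# are matched as case-insensitive substrings of netloc+path(+query/fragment),
# anything else is treated as a compiled regex and matched with .search().
# The rules and URLs below are made-up examples.
import re

exclusion_rules = ["/logout", re.compile(r"sessionid=\w+")]
# With these rules, a request for http://example.com/logout?next=/ is excluded
# by the substring rule, and any URL carrying a sessionid query parameter is
# excluded by the regex rule.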
def process_request(self, request, spider):
    # ignore if proxy is already set
    if 'proxy' in request.meta:
        if request.meta['proxy'] is None:
            return
        # extract credentials if present
        creds, proxy_url = self._get_proxy(request.meta['proxy'], '')
        request.meta['proxy'] = proxy_url
        if creds and not request.headers.get('Proxy-Authorization'):
            request.headers['Proxy-Authorization'] = b'Basic ' + creds
        return

    parsed = urlparse_cached(request)
    scheme = parsed.scheme
    if scheme in ('http', 'https') and proxy_bypass(parsed.hostname):
        return
    self._set_proxy(request, scheme)
def process_request(self, request, spider): # ignore if proxy is already set if "proxy" in request.meta: if request.meta["proxy"] is None: return # extract credentials if present creds, proxy_url = self._get_proxy(request.meta["proxy"], "") request.meta["proxy"] = proxy_url if creds and not request.headers.get("Proxy-Authorization"): request.headers["Proxy-Authorization"] = b"Basic " + creds return elif not self.proxies: return parsed = urlparse_cached(request) scheme = parsed.scheme # 'no_proxy' is only supported by http schemes if scheme in ("http", "https") and proxy_bypass(parsed.hostname): return if scheme in self.proxies: self._set_proxy(request, scheme)
def process_spider_output(self, response, result, spider): """ Process the output from a spider and filter out any requests which may lead to hosts we have explicitly disallowed in the `disallowed_hosts` property of the spider Args: response: The response from the crawl result: A list of requests that are prepped from the response spider: The spider instance doing the crawl """ disallowed_hosts = getattr(spider, 'disallowed_hosts', []) for x in result: if isinstance(x, Request): domain = urlparse_cached(x).hostname if domain and domain in disallowed_hosts: # The domain is a disallowed one if domain not in self.domains_seen: # We only fire this once for every time we come # across a domain that we're filtering self.domains_seen.add(domain) logger.debug( " Filtered request to %(domain)s: %(request)s", { 'domain': domain, 'request': x }, extra={'spider': spider}) self.stats.inc_value('disallowed/domains', spider=spider) self.stats.inc_value('disallowed/filtered', spider=spider) else: # We're not filtering this domain yield x else: # Not a request, yield it yield x
def _cache_if(self, spider, request, response=None):
    """
    A request is cacheable if the URI scheme is not in
    HTTPCACHE_IGNORE_SCHEMES. By default:
        file:// - not cacheable
        http:// - cacheable

    A response is cacheable if the http response code is not in
    HTTPCACHE_IGNORE_HTTP_CODES. For example, we may choose to ignore 404.
    """
    cacheable_request = (urlparse_cached(request).scheme
                         not in self.ignore_schemes)
    if (not response) or (not cacheable_request):
        # == if not (response and cacheable_request)
        return cacheable_request

    cacheable_response = (
        'cached' not in response.flags and     # from HttpCacheMiddleware
        'historic' not in response.flags and   # from HistoryMiddleware
        response.status not in self.ignore_http_codes)
    return cacheable_request and cacheable_response
def process_request(self, request, spider):
    # ignore if proxy is already set
    try:
        if 'proxy' in request.meta:
            if request.meta['proxy'] is None:
                return
            # extract credentials if present
            creds, proxy_url = self._get_proxy(request.meta['proxy'], '')
            request.meta['proxy'] = proxy_url
            if creds and not request.headers.get('Proxy-Authorization'):
                request.headers['Proxy-Authorization'] = b'Basic ' + creds
            return
        elif not self.proxies:
            return
        parsed = urlparse_cached(request)
        scheme = parsed.scheme
    except Exception as e:
        print(e)
        # bail out: parsed/scheme are undefined if anything above failed
        return

    # 'no_proxy' is only supported by http schemes
    if scheme in ('http', 'https') and proxy_bypass(parsed.hostname):
        return
    if scheme in self.proxies:
        self._set_proxy(request, scheme)
def is_cacheable(self, request):
    return urlparse_cached(request).scheme not in self.ignore_schemes
def should_follow(self, request, rule):
    host = urlparse_cached(request).hostname or ''
    return bool(rule.search(host))
def get_origin_req_host(self):
    return urlparse_cached(self.request).hostname
def _get_slot_key(self, request, spider):
    if 'download_slot' in request.meta:
        return request.meta['download_slot']

    host_name = urlparse_cached(request).hostname or ''
    key = my_dnscache.get(host_name)
    return key
def get_host(self):
    return urlparse_cached(self.request).netloc
def _parse_robots(self, response):
    rp = robotparser.RobotFileParser(response.url)
    rp.parse(response.body.splitlines())
    self._parsers[urlparse_cached(response).netloc] = rp
def _get_slot(self, request):
    downloader = self.crawler.engine.downloader
    key = urlparse_cached(request).hostname or ''
    if downloader.ip_concurrency:
        key = dnscache.get(key, key)
    return downloader.slots.get(key)
def should_cache_request(self, request):
    return urlparse_cached(request).scheme not in self.ignore_schemes
def get_type(self):
    return urlparse_cached(self.request).scheme
def should_follow(self, request, spider):
    regex = self.host_regex
    # hostname can be None for wrong urls (like javascript links)
    host = urlparse_cached(request).hostname or ''
    return bool(regex.search(host))
def _get_slot_key(self, request, spider):
    if Downloader.DOWNLOAD_SLOT in request.meta:
        return request.meta[Downloader.DOWNLOAD_SLOT]

    return urlparse_cached(request).hostname or ''