def _set_connection_attributes(self, request):
    parsed = urlparse_cached(request)
    self.scheme, self.netloc, self.host, self.port, self.path = _parsed_url_args(parsed)
    proxy = request.meta.get('proxy')
    if proxy:
        self.scheme, _, self.host, self.port, _ = _parse(proxy)
        self.path = self.url
def add_cookie_header(self, request):
    wreq = WrappedRequest(request)
    self.policy._now = self.jar._now = int(time.time())

    # the cookiejar implementation iterates through all domains
    # instead we restrict to potential matches on the domain
    req_host = urlparse_cached(request).hostname
    if not req_host:
        return

    if not IPV4_RE.search(req_host):
        hosts = potential_domain_matches(req_host)
        if '.' not in req_host:
            hosts += [req_host + ".local"]
    else:
        hosts = [req_host]

    cookies = []
    for host in hosts:
        if host in self.jar._cookies:
            cookies += self.jar._cookies_for_domain(host, wreq)

    attrs = self.jar._cookie_attrs(cookies)
    if attrs:
        if not wreq.has_header("Cookie"):
            wreq.add_unredirected_header("Cookie", "; ".join(attrs))

    self.processed += 1
    if self.processed % self.check_expired_frequency == 0:
        # This is still quite inefficient for large number of cookies
        self.jar.clear_expired_cookies()
def _get_slot_key(self, request, spider):
    if 'key' in request.meta:
        return request.meta['key']
    key = urlparse_cached(request).hostname or ''
    return key
def _get_slot_key(self, request, spider):
    if 'download_slot' in request.meta:
        return request.meta['download_slot']

    key = urlparse_cached(request).hostname or ''
    if self.ip_concurrency:
        key = dnscache.get(key, key)

    return key
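# Usage sketch (not part of the original listing): the lookup above honours an
# explicit 'download_slot' meta key before falling back to the hostname, so a
# request can be pinned to a named slot. The slot name 'example-pool' is a
# hypothetical value chosen for illustration.
request = Request('http://www.example.com/page',
                  meta={'download_slot': 'example-pool'})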
def should_cache_request(self, request):
    if urlparse_cached(request).scheme in self.ignore_schemes:
        return False
    cc = self._parse_cachecontrol(request)
    # obey user-agent directive "Cache-Control: no-store"
    if 'no-store' in cc:
        return False
    # Any other is eligible for caching
    return True
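# Usage sketch (an assumption, not taken from the snippet above): if the parsed
# cache-control directives come from the request's own Cache-Control header, a
# request marked "no-store" is excluded from caching by the policy above.
request = Request('http://www.example.com/',
                  headers={'Cache-Control': 'no-store'})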
def download_request(self, request, spider):
    scheme = urlparse_cached(request).scheme
    try:
        handler = self._handlers[scheme].download_request
    except KeyError:
        msg = self._notconfigured.get(scheme,
            'no handler available for that scheme')
        raise NotSupported("Unsupported URL scheme '%s': %s" % (scheme, msg))
    return handler(request, spider)
def robot_parser(self, request, spider):
    url = urlparse_cached(request)
    netloc = url.netloc
    if netloc not in self._parsers:
        self._parsers[netloc] = None
        robotsurl = "%s://%s/robots.txt" % (url.scheme, url.netloc)
        robotsreq = Request(robotsurl, priority=self.DOWNLOAD_PRIORITY)
        dfd = self.crawler.engine.download(robotsreq, spider)
        dfd.addCallback(self._parse_robots)
        self._spider_netlocs.add(netloc)
    return self._parsers[netloc]
def process_request(self, request, spider):
    # ignore if proxy is already set
    if "proxy" in request.meta:
        return

    parsed = urlparse_cached(request)
    scheme = parsed.scheme

    # 'no_proxy' is only supported by http schemes
    if scheme in ("http", "https") and proxy_bypass(parsed.hostname):
        return

    if scheme in self.proxies:
        self._set_proxy(request, scheme)
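# Usage sketch (not part of the original listing): because the middleware above
# returns early when 'proxy' is already present in meta, a per-request proxy can
# be forced explicitly. The proxy URL is a hypothetical placeholder.
request = Request('http://www.example.com',
                  meta={'proxy': 'http://127.0.0.1:8080'})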
def download_request(self, request, spider):
    p = urlparse_cached(request)
    scheme = 'https' if request.meta.get('is_secure') else 'http'
    bucket = p.hostname
    path = p.path + '?' + p.query if p.query else p.path
    url = '%s://%s.s3.amazonaws.com%s' % (scheme, bucket, path)
    signed_headers = self.conn.make_request(
        method=request.method, bucket=bucket, key=p.path,
        query_args=p.query, headers=request.headers, data=request.body)
    httpreq = request.replace(url=url, headers=signed_headers)
    return self._download_http(httpreq, spider)
def _parse_robots(self, response):
    rp = robotparser.RobotFileParser(response.url)
    rp.parse(response.body.splitlines())
    self._parsers[urlparse_cached(response).netloc] = rp
def should_cache_request(self, request):
    return urlparse_cached(request).scheme not in self.ignore_schemes
def get_origin_req_host(self):
    return urlparse_cached(self.request).hostname
def get_type(self):
    return urlparse_cached(self.request).scheme
def get_host(self):
    return urlparse_cached(self.request).netloc
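# Sketch of the helper every snippet above relies on (an assumption about its
# implementation, not code taken from this listing): urlparse_cached memoizes
# urlparse() results per Request/Response object in a WeakKeyDictionary, so
# repeated calls on the same object do not re-parse the URL, and cache entries
# disappear once the object is garbage collected.
from urlparse import urlparse
from weakref import WeakKeyDictionary

_urlparse_cache = WeakKeyDictionary()

def urlparse_cached(request_or_response):
    """Return urlparse(request_or_response.url), caching the result per object."""
    if request_or_response not in _urlparse_cache:
        _urlparse_cache[request_or_response] = urlparse(request_or_response.url)
    return _urlparse_cache[request_or_response]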