Example #1
 def _set_connection_attributes(self, request):
     parsed = urlparse_cached(request)
     self.scheme, self.netloc, self.host, self.port, self.path = _parsed_url_args(parsed)
     proxy = request.meta.get('proxy')
     if proxy:
         self.scheme, _, self.host, self.port, _ = _parse(proxy)
         self.path = self.url
Example #2
    def add_cookie_header(self, request):
        wreq = WrappedRequest(request)
        self.policy._now = self.jar._now = int(time.time())

        # the cookiejar implementation iterates through all domains
        # instead we restrict to potential matches on the domain
        req_host = urlparse_cached(request).hostname
        if not req_host:
            return

        if not IPV4_RE.search(req_host):
            hosts = potential_domain_matches(req_host)
            if req_host.find(".") == -1:
                hosts += [req_host + ".local"]
        else:
            hosts = [req_host]

        cookies = []
        for host in hosts:
            if host in self.jar._cookies:
                cookies += self.jar._cookies_for_domain(host, wreq)

        attrs = self.jar._cookie_attrs(cookies)
        if attrs:
            if not wreq.has_header("Cookie"):
                wreq.add_unredirected_header("Cookie", "; ".join(attrs))

        self.processed += 1
        if self.processed % self.check_expired_frequency == 0:
            # This is still quite inefficient for large number of cookies
            self.jar.clear_expired_cookies()
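The comment in Example #2 narrows the cookie lookup from "all domains in the jar" to hosts that could plausibly domain-match the request. As a rough, hypothetical sketch of what a helper like potential_domain_matches could return for that purpose (the real helper ships alongside Scrapy's cookies middleware; this version is only an assumption built from cookie domain-matching rules):

    def potential_domain_matches(domain):
        # Sketch: the host itself, each parent domain, and the
        # dot-prefixed variants that cookie domain-matching allows, e.g.
        # 'www.example.com' -> ['www.example.com', 'example.com',
        #                       '.www.example.com', '.example.com']
        matches = [domain]
        try:
            start = domain.index('.') + 1
            end = domain.rindex('.')
            while start < end:
                matches.append(domain[start:])
                start = domain.index('.', start) + 1
        except ValueError:
            pass  # no dots: single-label host such as 'localhost'
        return matches + ['.' + d for d in matches]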
Example #3
    def _get_slot_key(self, request, spider):
        if 'key' in request.meta:
            return request.meta['key']

        key = urlparse_cached(request).hostname or ''

        return key
Example #4
    def _get_slot_key(self, request, spider):
        if 'download_slot' in request.meta:
            return request.meta['download_slot']

        key = urlparse_cached(request).hostname or ''
        if self.ip_concurrency:
            key = dnscache.get(key, key)

        return key
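When ip_concurrency is set (per-IP concurrency), Example #4 keys the download slot by resolved IP rather than hostname. dnscache behaves like a mapping from hostname to IP that Scrapy's resolver fills in, so dnscache.get(key, key) falls back to the hostname until a resolution has been recorded. A hypothetical illustration of that lookup:

    # dnscache acts like {hostname: ip}; these values are made up.
    dnscache = {'example.com': '93.184.216.34'}
    assert dnscache.get('example.com', 'example.com') == '93.184.216.34'
    assert dnscache.get('unseen.org', 'unseen.org') == 'unseen.org'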
Example #5
 def should_cache_request(self, request):
     if urlparse_cached(request).scheme in self.ignore_schemes:
         return False
     cc = self._parse_cachecontrol(request)
     # obey user-agent directive "Cache-Control: no-store"
     if 'no-store' in cc:
         return False
     # Any other is eligible for caching
     return True
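Example #5 treats the Cache-Control request header as a collection of directives and checks membership. A minimal sketch of how a _parse_cachecontrol helper might produce that collection (an assumption for illustration; the real policy also parses response headers and valued directives such as max-age=300):

    def _parse_cachecontrol(self, r):
        # Hypothetical: split "no-cache, no-store, max-age=300" into
        # {'no-cache', 'no-store', 'max-age=300'}.
        cch = r.headers.get('Cache-Control', b'')
        if isinstance(cch, bytes):
            cch = cch.decode('ascii')
        return {d.strip() for d in cch.split(',') if d.strip()}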
Example #6
 def download_request(self, request, spider):
     scheme = urlparse_cached(request).scheme
     try:
         handler = self._handlers[scheme].download_request
     except KeyError:
         msg = self._notconfigured.get(scheme,
                 'no handler available for that scheme')
         raise NotSupported("Unsupported URL scheme '%s': %s" % (scheme, msg))
     return handler(request, spider)
Example #7
 def robot_parser(self, request, spider):
     url = urlparse_cached(request)
     netloc = url.netloc
     if netloc not in self._parsers:
         self._parsers[netloc] = None
         robotsurl = "%s://%s/robots.txt" % (url.scheme, url.netloc)
         robotsreq = Request(robotsurl, priority=self.DOWNLOAD_PRIORITY)
         dfd = self.crawler.engine.download(robotsreq, spider)
         dfd.addCallback(self._parse_robots)
         self._spider_netlocs.add(netloc)
     return self._parsers[netloc]
Example #8
    def process_request(self, request, spider):
        # ignore if proxy is already set
        if "proxy" in request.meta:
            return

        parsed = urlparse_cached(request)
        scheme = parsed.scheme

        # 'no_proxy' is only supported by http schemes
        if scheme in ("http", "https") and proxy_bypass(parsed.hostname):
            return

        if scheme in self.proxies:
            self._set_proxy(request, scheme)
Example #9
 def download_request(self, request, spider):
     p = urlparse_cached(request)
     scheme = 'https' if request.meta.get('is_secure') else 'http'
     bucket = p.hostname
     path = p.path + '?' + p.query if p.query else p.path
     url = '%s://%s.s3.amazonaws.com%s' % (scheme, bucket, path)
     signed_headers = self.conn.make_request(
             method=request.method,
             bucket=bucket,
             key=p.path,
             query_args=p.query,
             headers=request.headers,
             data=request.body)
     httpreq = request.replace(url=url, headers=signed_headers)
     return self._download_http(httpreq, spider)
Example #10
 def _parse_robots(self, response):
     rp = robotparser.RobotFileParser(response.url)
     rp.parse(response.body.splitlines())
     self._parsers[urlparse_cached(response).netloc] = rp
Example #11
 def should_cache_request(self, request):
     return urlparse_cached(request).scheme not in self.ignore_schemes
Example #12
 def get_origin_req_host(self):
     return urlparse_cached(self.request).hostname
Example #13
 def get_type(self):
     return urlparse_cached(self.request).scheme
Example #14
 def get_host(self):
     return urlparse_cached(self.request).netloc
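Every example above goes through urlparse_cached (from scrapy.utils.httpobj) instead of calling urllib.parse.urlparse directly: the parse result is memoized per Request/Response object, so the many middlewares that inspect the same request never re-parse its URL. A minimal sketch of that idea, assuming a weak-keyed cache so entries die with their request:

    from urllib.parse import urlparse
    from weakref import WeakKeyDictionary

    _urlparse_cache = WeakKeyDictionary()

    def urlparse_cached(request_or_response):
        # Parse each Request/Response URL at most once; weak keys let
        # cache entries vanish when the object is garbage-collected.
        if request_or_response not in _urlparse_cache:
            _urlparse_cache[request_or_response] = urlparse(request_or_response.url)
        return _urlparse_cache[request_or_response]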