Example #1
    def should_follow(self, response, request):
        """Only follow offsite links to JS and CSS files, not new pages."""
        res_url_data = urlparse_cached(response)
        req_url_data = urlparse_cached(request)

        if has_extension(request, "js") or has_extension(request, "css"):
            return True

        # Otherwise, only follow when the request and response share the same netloc
        return req_url_data.netloc == res_url_data.netloc
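The `has_extension` helper is not shown in this snippet; a minimal sketch of what it might look like (the name and behaviour are assumptions based on how it is called above):

    import posixpath
    from urllib.parse import urlparse

    def has_extension(request, extension):
        # Hypothetical helper: True if the URL path ends with the given extension.
        path = urlparse(request.url).path
        return posixpath.splitext(path)[1].lstrip('.').lower() == extension.lower()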
Example #2
    def test_urlparse_cached(self):
        url = "http://www.example.com/index.html"
        request1 = Request(url)
        request2 = Request(url)
        req1a = urlparse_cached(request1)
        req1b = urlparse_cached(request1)
        req2 = urlparse_cached(request2)
        urlp = urlparse.urlparse(url)

        assert req1a == req2
        assert req1a == urlp
        assert req1a is req1b
        assert req1a is not req2
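The test above pins down the contract: repeated calls with the same request return the very same `ParseResult` object, while two distinct requests for the same URL get equal but distinct results. A minimal sketch of how such a cache could be implemented (illustrative only; keyed weakly on the request/response object so entries disappear with it):

    from urllib.parse import urlparse
    from weakref import WeakKeyDictionary

    _parse_cache = WeakKeyDictionary()

    def urlparse_cached(request_or_response):
        # Parse the object's .url once and reuse the result afterwards.
        if request_or_response not in _parse_cache:
            _parse_cache[request_or_response] = urlparse(request_or_response.url)
        return _parse_cache[request_or_response]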
Example #3
 def process_request(self, request, spider):
     hostname = urlparse_cached(request).hostname
     if spider.domain_name == 's3.amazonaws.com' \
             or (hostname and hostname.endswith('s3.amazonaws.com')):
         request.headers['Date'] = time.strftime("%a, %d %b %Y %H:%M:%S GMT", \
             time.gmtime())
         sign_request(request, self.access_key, self.secret_key)
Example #4
    def robot_parser(self, request, spider):
        url = urlparse_cached(request)
        netloc = url.netloc

        if netloc not in self._parsers:
            self._parsers[netloc] = Deferred()
            robotsurl = "%s://%s/robots.txt" % (url.scheme, url.netloc)
            robotsreq = Request(
                robotsurl,
                priority=self.DOWNLOAD_PRIORITY,
                meta={'dont_obey_robotstxt': True}
            )
            dfd = self.crawler.engine.download(robotsreq, spider)
            dfd.addCallback(self._parse_robots, netloc)
            dfd.addErrback(self._logerror, robotsreq, spider)
            dfd.addErrback(self._robots_error, netloc)

        if isinstance(self._parsers[netloc], Deferred):
            d = Deferred()
            def cb(result):
                d.callback(result)
                return result
            self._parsers[netloc].addCallback(cb)
            return d
        else:
            return self._parsers[netloc]
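While robots.txt is still downloading, `self._parsers[netloc]` holds a pending Deferred; each caller chains a callback onto it and gets back its own Deferred, so every waiter is fired with the parsed result without disturbing the shared callback chain. A self-contained sketch of that fan-out idiom (not Scrapy code, just the pattern used above):

    from twisted.internet.defer import Deferred

    def wait_on(shared):
        # Give the caller a private Deferred that fires when `shared` does,
        # passing the shared result through unchanged.
        d = Deferred()
        def cb(result):
            d.callback(result)
            return result
        shared.addCallback(cb)
        return d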
Example #5
 def download_request(self, request, spider):
     scheme = urlparse_cached(request).scheme
     try:
         handler = self._handlers[scheme]
     except KeyError:
         raise NotSupported("Unsupported URL scheme '%s' in: <%s>" % (scheme, request.url))
     return handler(request, spider)
Example #6
    def warcrec_from_scrapy_request(self, request):
        headers = request.headers
        body = request.body

        parsed = urlparse_cached(request)
        scheme, netloc, host, port, path = _parsed_url_args(parsed)

        headers.setdefault('Host', netloc)

        if body is not None and len(body) > 0:
            headers['Content-Length'] = len(body)
            headers.setdefault("Connection", "close")

        # Compile the request using buf
        buf = StringIO()
        buf.write('%s %s HTTP/1.0\r\n' % (request.method, path))
        for name, values in headers.items():
            for value in values:
                buf.write('%s: %s\r\n' % (name, value))
        buf.write('\r\n')
        if body is not None:
            buf.write(body)
        request_str = buf.getvalue()
        
        return warcrecords.WarcRequestRecord(url=request.url, block=request_str)
Example #7
 def _set_connection_attributes(self, request):
     parsed = urlparse_cached(request)
     self.scheme, self.netloc, self.host, self.port, self.path = _parsed_url_args(parsed)
     proxy = request.meta.get('proxy')
     if proxy:
         self.scheme, _, self.host, self.port, _ = _parse(proxy)
         self.path = self.url
Example #8
 def download_request(self, request, spider):
     scheme = urlparse_cached(request).scheme
     handler = self._get_handler(scheme)
     if not handler:
         raise NotSupported("Unsupported URL scheme '%s': %s" %
                            (scheme, self._notconfigured[scheme]))
     return handler.download_request(request, spider)
Example #9
 def download_request(self, request, spider):
     p = urlparse_cached(request)
     scheme = 'https' if request.meta.get('is_secure') else 'http'
     bucket = p.hostname
     path = p.path + '?' + p.query if p.query else p.path
     url = '{0!s}://{1!s}.s3.amazonaws.com{2!s}'.format(scheme, bucket, path)
     if self.anon:
         request = request.replace(url=url)
     elif self._signer is not None:
         import botocore.awsrequest
         awsrequest = botocore.awsrequest.AWSRequest(
             method=request.method,
             url='{0!s}://s3.amazonaws.com/{1!s}{2!s}'.format(scheme, bucket, path),
             headers=request.headers.to_unicode_dict(),
             data=request.body)
         self._signer.add_auth(awsrequest)
         request = request.replace(
             url=url, headers=awsrequest.headers.items())
     else:
         signed_headers = self.conn.make_request(
                 method=request.method,
                 bucket=bucket,
                 key=unquote(p.path),
                 query_args=unquote(p.query),
                 headers=request.headers,
                 data=request.body)
         request = request.replace(url=url, headers=signed_headers)
     return self._download_http(request, spider)
Example #10
 def download_request(self, request, spider):
     p = urlparse_cached(request)
     scheme = "https" if request.meta.get("is_secure") else "http"
     url = "%s://%s.s3.amazonaws.com%s" % (scheme, p.hostname, p.path)
     httpreq = request.replace(url=url)
     self.conn.add_aws_auth_header(httpreq.headers, httpreq.method, "%s/%s" % (p.hostname, p.path))
     return self._download_http(httpreq, spider)
Example #11
    def add_cookie_header(self, request):
        wreq = WrappedRequest(request)
        self.policy._now = self.jar._now = int(time.time())

        # the cookiejar implementation iterates through all domains
        # instead we restrict to potential matches on the domain
        req_host = urlparse_cached(request).hostname

        if not IPV4_RE.search(req_host):
            hosts = potential_domain_matches(req_host)
            if req_host.find(".") == -1:
                hosts += [req_host + ".local"]
        else:
            hosts = [req_host]

        cookies = []
        for host in hosts:
            if host in self.jar._cookies:
                cookies += self.jar._cookies_for_domain(host, wreq)

        attrs = self.jar._cookie_attrs(cookies)
        if attrs:
            if not wreq.has_header("Cookie"):
                wreq.add_unredirected_header("Cookie", "; ".join(attrs))

        self.processed += 1
        if self.processed % self.check_expired_frequency == 0:
            # This is still quite inefficient for large number of cookies
            self.jar.clear_expired_cookies()
Example #12
    def process_request(self, request, spider):
        for p in self.no_proxy_patterns:
            if p.search(request.url):
                return
        retries = request.meta.get('retry_times', None)
        # If a proxy was specified manually, don't set one here
        if 'proxy' in request.meta:
            if retries is None:
                return

        # When the maximum number of retries is reached, connect directly from this machine, so that after proxy failures there is always one direct local attempt.
        if retries == self.max_retry_times:
            now = time.time()
            should_sleep = self.local_interval - (now - self.local_last_use_time)
            if should_sleep > 0:
                log.msg('ProxyMiddleware:use proxy fail,local sleep %s' % should_sleep, log.DEBUG)
                time.sleep(should_sleep)
            return

        parsed = urlparse_cached(request)
        scheme = parsed.scheme
        # 'no_proxy' is only supported by http schemes
        if scheme in ('http', 'https') and proxy_bypass(parsed.hostname):
            return
        self._set_proxy(request, scheme)
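Several of the proxy middlewares in this listing gate proxying on `proxy_bypass()`, which consults the platform proxy configuration (on most systems the `no_proxy`/`NO_PROXY` environment variable). A rough usage sketch; the actual return values depend entirely on the platform and environment:

    import os
    from urllib.request import proxy_bypass

    os.environ['no_proxy'] = 'internal.example.com'
    print(proxy_bypass('internal.example.com'))  # usually truthy: skip the proxy
    print(proxy_bypass('www.python.org'))        # usually falsy: the proxy applies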
Example #13
 def process_spider_output(self, response, result, spider):
     for res in result:
         if isinstance(res, Request):
             parsed = urlparse_cached(res)
             if parsed.query:
                 parsed = parsed._replace(query=_filter_query(parsed.query, self.remove, self.keep))
                 res = res.replace(url=parsed.geturl())
         yield res
Example #14
 def download_request(self, request, spider):
     scheme = urlparse_cached(request).scheme
     try:
         handler = self._handlers[scheme]
     except KeyError:
         msg = self._notconfigured.get(scheme, "no handler available for that scheme")
         raise NotSupported("Unsupported URL scheme '%s': %s" % (scheme, msg))
     return handler(request, spider)
Example #15
    def should_follow(self, request, spider):
        #Custom code to update regex
        self.update_regex(spider)

        regex = self.host_regex
        # hostname can be None for wrong urls (like javascript links)
        host = urlparse_cached(request).hostname or ''
        return bool(regex.search(host))
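Most of the `should_follow` variants in this listing match the parsed hostname against a precompiled `host_regex`. One plausible way to build such a regex from a list of allowed domains, modeled on Scrapy's offsite middleware (a sketch, not code from any example above):

    import re

    def build_host_regex(allowed_domains):
        # Match the domain itself and any of its subdomains.
        domains = [re.escape(d) for d in allowed_domains if d]
        return re.compile(r'^(.*\.)?(%s)$' % '|'.join(domains))

    host_regex = build_host_regex(['example.com'])
    assert host_regex.search('www.example.com')
    assert not host_regex.search('example.com.evil.org')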
Example #16
    def parse(self, response):
        ld = self._load_webpage_item(response, is_seed=response.meta['is_seed'])
        if self.use_splash:
            self._process_splash_response(response, ld)
        yield ld.load_item()

        if self.finishing:
            return

        now = datetime.utcnow()
        if self.operating_time > 0 and (now - self.start_time).total_seconds() > self.operating_time:
            log.msg("Reached operating time constraint. Waiting for Scrapy queue to exhaust.")
            self.finishing = True
            self.crawler.stop()
            return

        if not isinstance(response, TextResponse):
            return

        body = response.body_as_unicode().strip().encode('utf8') or '<html/>'
        score = self.ranker.score_html(body)
        log.msg("TC: %s has score=%f" % (response.url, score), _level=log.DEBUG)

        if score > 0.5:

            #!for some reason this is returning the raw splash response JSON
            #!and not the rendered HTML from splash
            #log.msg(u"\n\n\n****---Response body:\n %s----***\n\n\n" % response.body_as_unicode(), _level=log.DEBUG)

            #for link in self.linkextractor.extract_links(response):
            #can something like the line below fix it? Seems like a hack...
            for link in self.linkextractor.extract_links(response):

                log.msg("****---LINK EXTRACED: %s----***" % str(link.url), _level=log.DEBUG)

                if self.use_splash:
                    r = self._splash_request(url=link.url)
                else:
                    r = Request(url=link.url)

                external = is_external_url(response.url, link.url)
                depth = response.meta.get('link_depth', 0)
                r.meta.update({
                    'link': {
                        'url': link.url,
                        'text': link.text,
                        'fragment': link.fragment,
                        'nofollow': link.nofollow},
                    'link_depth': 0 if external else depth + 1,
                    'referrer_depth': depth,
                    'referrer_url': response.url,
                })

                url_parts = urlparse_cached(r)
                path_parts = url_parts.path.split('/')
                r.meta['score'] = 1.0 / len(path_parts)
                r.meta['is_seed'] = False
                yield r
Example #17
 def should_cache_request(self, request):
     if urlparse_cached(request).scheme in self.ignore_schemes:
         return False
     cc = self._parse_cachecontrol(request)
     # obey user-agent directive "Cache-Control: no-store"
     if b'no-store' in cc:
         return False
     # Any other is eligible for caching
     return True
Example #18
    def _get_slot_key(self, request, spider):
        if "download_slot" in request.meta:
            return request.meta["download_slot"]

        key = urlparse_cached(request).hostname or ""
        if self.ip_concurrency:
            key = dnscache.get(key, key)

        return key
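As the first branch shows, a request can be pinned to an explicit downloader slot through the `download_slot` meta key; otherwise the hostname (or its cached IP when per-IP concurrency is enabled) becomes the key. A small usage sketch with hypothetical URLs:

    from scrapy import Request

    # Force an asset request onto the same slot as the main site so it shares
    # that slot's concurrency and delay settings.
    req = Request(
        'http://images.example.com/logo.png',
        meta={'download_slot': 'www.example.com'},
    )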
Example #19
    def _get_slot_key(self, request, spider):
        if self.DOWNLOAD_SLOT in request.meta:
            return request.meta[self.DOWNLOAD_SLOT]

        key = urlparse_cached(request).hostname or ''
        if self.ip_concurrency:
            key = dnscache.get(key, key)

        return key
Example #20
 def process_request(self, request, spider):
     """Process a spider request."""
     if request.dont_filter or self.should_follow(request, spider):
         return None
     else:
         domain = urlparse_cached(request).hostname
         logging.debug("Filtered offsite request to %(domain)r: %(request)s" %
                       {"domain": domain, "request": request})
         raise IgnoreRequest
Example #21
 def should_follow(self, request, spider):
     referer = request.headers.get('Referer', '')
     refhost = urlparse(referer).hostname or ''
     if refhost:
         referer_ok = bool(self.follow_regex.search(refhost))
         if not referer_ok:
             return False
     # hostname can be None for wrong urls (like javascript links)
     host = urlparse_cached(request).hostname or ''
     return bool(self.host_regex.search(host))
Example #22
    def process_request(self, request):
        request.meta['download_timeout'] = 60

        parsed = urlparse_cached(request)
        scheme = parsed.scheme
        # 'no_proxy' is only supported by http schemes
        if scheme in ('http', 'https') and proxy_bypass(parsed.hostname):
            return
        if scheme in self.proxies:
            self._set_proxy(request, scheme)
Example #23
 def download_request(self, request, spider):
     parsed_url = urlparse_cached(request)
     user = request.meta.get("ftp_user", self.default_user)
     password = request.meta.get("ftp_password", self.default_password)
     passive_mode = 1 if bool(request.meta.get("ftp_passive",
                                               self.passive_mode)) else 0
     creator = ClientCreator(reactor, FTPClient, user, password,
         passive=passive_mode)
      dfd = creator.connectTCP(parsed_url.hostname, parsed_url.port or 21)
      return dfd.addCallback(self.gotClient, request, unquote(parsed_url.path))
Example #24
    def should_follow(self, request, spider):
        allowed_regex = self.host_allowed_regex
        disallowed_regex = self.host_disallowed_regex
        # hostname can be None for wrong urls (like javascript links)
        host = urlparse_cached(request).hostname or ''

        allowed_res = bool(allowed_regex.search(host))
        disallowed_res = not bool(disallowed_regex.search(host))
        # log.msg('{0}:{1} allow?{2} disallow?{3}'.format(request.url, host, allowed_res, disallowed_res))
        return allowed_res and disallowed_res
Example #25
 def robot_parser(self, request, spider):
     url = urlparse_cached(request)
     netloc = url.netloc
     if netloc not in self._parsers:
         self._parsers[netloc] = None
         robotsurl = "%s://%s/robots.txt" % (url.scheme, url.netloc)
         robotsreq = Request(robotsurl, priority=self.DOWNLOAD_PRIORITY)
         dfd = self.crawler.engine.download(robotsreq, spider)
         dfd.addCallback(self._parse_robots)
         self._spider_netlocs[spider].add(netloc)
     return self._parsers[netloc]
Example #26
    def _parse_robots(self, response, spider):
        if response.status != 200:
            return

        rules = RobotRules(
            url=response.url,
            status=response.status,
            content=response.body_as_unicode(),
            expires=None
        )
        self._robot_rules[urlparse_cached(response).netloc] = rules
        self._adjust_delay(rules, response, spider)
Example #27
 def _load_keys(self, requests, item):
     # Preload file paths into the requests because we use the item data to
     # generate the path.
     for req in requests:
         pr = urlparse_cached(req)
         # filename is last part of the URL path.
         image = pr.path.rpartition('/')[-1]
         req.meta['file_path'] = '/{slide_id}/{image}'.format(
             spider=item['spider'],
             slide_id=item['id'],
             image=image,
         )
Example #28
 def _get_slot(self, request, spider):
     key = urlparse_cached(request).hostname or ''
     if self.ip_concurrency:
         key = dnscache.get(key, key)
     if key not in self.slots:
         if self.ip_concurrency:
             concurrency = self.ip_concurrency
         else:
             concurrency = self.domain_concurrency
         concurrency, delay = _get_concurrency_delay(concurrency, spider, self.settings)
         self.slots[key] = Slot(concurrency, delay, self.settings)
     return key, self.slots[key]
Example #29
 def robotstxt(self, request, spider):
     url = urlparse_cached(request)
     if url.netloc not in self._robot_rules:
         self._robot_rules[url.netloc] = None
         req = Request(
             get_robotstxt_url(url),
             priority=self.DOWNLOAD_PRIORITY,
             meta={'dont_process_robots': True}
         )
         dfd = self.crawler.engine.download(req, spider)
         dfd.addCallback(self._parse_robots, spider=spider)
     return self._robot_rules[url.netloc]
Example #30
 def process_spider_output(self, response, result, spider):
     for req in result:
         if isinstance(req, Request):
             if req.dont_filter or self.should_follow(response, req):
                 yield req
             else:
                 domain = urlparse_cached(req).hostname
                 if domain and domain not in self.domains_seen[spider]:
                     log.msg("Filtered offsite request to %r: %s" % (domain, req), level=log.DEBUG, spider=spider)
                     self.domains_seen[spider].add(domain)
         else:
             yield req
Example #31
def request_httprepr(request):
    """Return the raw HTTP representation (as bytes) of the given request.
    This is provided only for reference since it's not the actual stream of
    bytes that will be sent when performing the request (that's controlled
    by Twisted).
    """
    parsed = urlparse_cached(request)
    path = urlunparse(('', '', parsed.path or '/', parsed.params, parsed.query, ''))
    s = to_bytes(request.method) + b" " + to_bytes(path) + b" HTTP/1.1\r\n"
    s += b"Host: " + to_bytes(parsed.hostname or b'') + b"\r\n"
    if request.headers:
        s += request.headers.to_string() + b"\r\n"
    s += b"\r\n"
    s += request.body
    return s
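For illustration, roughly what this yields for a bare request (assuming the helper is importable as `scrapy.utils.request.request_httprepr`, as in older Scrapy versions; a plain `Request` carries no default headers and an empty body here):

    from scrapy import Request
    from scrapy.utils.request import request_httprepr

    req = Request('http://www.example.com/index.html')
    request_httprepr(req)
    # b'GET /index.html HTTP/1.1\r\nHost: www.example.com\r\n\r\n'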
Example #32
 def _parse_robots(self, response):
     rp = robotparser.RobotFileParser(response.url)
     body = ''
     if hasattr(response, 'body_as_unicode'):
         body = response.body_as_unicode()
     else: # last effort try
         try:
             body = response.body.decode('utf-8')
         except UnicodeDecodeError:
              # If we found garbage, disregard it,
             # but keep the lookup cached (in self._parsers)
             # Running rp.parse() will set rp state from
             # 'disallow all' to 'allow any'.
             pass
     rp.parse(body.splitlines())
     self._parsers[urlparse_cached(response).netloc] = rp
Example #33
 def _get_slot(self, request, spider):
     key = urlparse_cached(request).hostname or ''
     if self.ip_concurrency:
         key = dnscache.get(key, key)
     if key not in self.slots:
         if key in self.inactive_slots:
             self.slots[key] = self.inactive_slots.pop(key)
         else:
             if self.ip_concurrency:
                 concurrency = self.ip_concurrency
             else:
                 concurrency = self.domain_concurrency
             concurrency, delay = _get_concurrency_delay(
                 concurrency, spider, self.settings)
             self.slots[key] = Slot(concurrency, delay, self.settings)
     return key, self.slots[key]
Example #34
    def process_request(self, request, spider):
        creds, proxy_url = None, None
        if 'proxy' in request.meta:
            if request.meta['proxy'] is not None:
                creds, proxy_url = self._get_proxy(request.meta['proxy'], '')
        elif self.proxies:
            parsed = urlparse_cached(request)
            scheme = parsed.scheme
            # 'no_proxy' is only supported by http schemes
            if ((scheme not in ('http', 'https')
                 or not proxy_bypass(parsed.hostname))
                    and scheme in self.proxies):
                creds, proxy_url = self.proxies[scheme]

        self._set_proxy_and_creds(request, proxy_url, creds)
Example #35
 def process_spider_output(self, response, result, spider):
     for x in result:
         if isinstance(x, Request):
             if x.dont_filter or self.should_follow(x, spider):
                 yield x
             else:
                 domain = urlparse_cached(x).hostname
                 if domain and domain not in self.domains_seen:
                     self.domains_seen.add(domain)
                     logger.debug(
                         "Filtered offsite request to %(domain)r: %(request)s",
                         {'domain': domain, 'request': x}, extra={'spider': spider})
                     self.stats.inc_value('offsite/domains', spider=spider)
                 self.stats.inc_value('offsite/filtered', spider=spider)
         else:
             yield x
Example #36
    def download_request(self, request, spider):
        from twisted.internet import reactor

        parsed_url = urlparse_cached(request)
        user = request.meta.get("ftp_user", self.default_user)
        password = request.meta.get("ftp_password", self.default_password)
        passive_mode = (1 if bool(
            request.meta.get("ftp_passive", self.passive_mode)) else 0)
        creator = ClientCreator(reactor,
                                FTPClient,
                                user,
                                password,
                                passive=passive_mode)
        dfd = creator.connectTCP(parsed_url.hostname, parsed_url.port or 21)
        return dfd.addCallback(self.gotClient, request,
                               unquote(parsed_url.path))
Example #37
    def process_request(self, request, spider):
        # Check whether the proxy is enabled for this request
        if not self._is_enabled_for_request(request):
            return

        # Get the request's URL scheme
        parsed = urlparse_cached(request)
        scheme = parsed.scheme
        # Number of available proxies
        if self.len_valid_proxy(scheme) > 0:
            self.set_proxy(request, scheme)
            # if 'download_timeout' not in request.meta:
            request.meta['download_timeout'] = self.download_timeout
        else:
            # No proxies available; connect directly
            if 'proxy' in request.meta:
                del request.meta['proxy']
Example #38
 def process_spider_output(self, response, result, spider):
     # We could have images and PDFs here - they don't generate any links
     if isinstance(response, TextResponse):
         for x in result:
             if isinstance(x, Request):
                 if x.dont_filter or self.should_follow(x, spider):
                     yield x
                 else:
                     domain = urlparse_cached(x).hostname
                     if domain and domain not in self.domains_seen:
                         self.domains_seen.add(domain)
                         log.msg(format="Filtered offsite request to %(domain)r: %(request)s",
                                 level=log.DEBUG, spider=spider, domain=domain, request=x)
                         self.stats.inc_value('offsite/domains', spider=spider)
                     self.stats.inc_value('offsite/filtered', spider=spider)
             else:
                 yield x
Example #39
 def process_spider_output(self, response, result, spider):
     for x in result:
         if isinstance(x, Request):
             if x.dont_filter or self.should_follow(x, spider):
                 yield x
             else:
                 domain = urlparse_cached(x).hostname
                 if domain and domain not in self.domains_seen[spider]:
                     self.domains_seen[spider].add(domain)
                     log.msg(format="Filtered offsite request to %(domain)r: %(request)s",
                             level=log.DEBUG, spider=spider, domain=domain, request=x)
         else:
             yield x
Example #40
    def process_request(self, request, spider):
        # update proxies
        global count
        count+=1
        if count % 100 == 0:
            count = 1
            self._update_proxies()

        if 'direct_connect' in request.meta:                
            value = request.meta['direct_connect']
            del request.meta['direct_connect']
            if value:
                if 'proxy' in request.meta:
                    del request.meta['proxy']
                logger.debug('HTTP_PROXY-->Direct')
                return

        # change proxy
        if 'proxy' in request.meta:
            proxy_url = None
            creds = None
            if request.meta['proxy'] is None:
                self._set_proxy(request)
                return
            else:
                creds, proxy_url = self._get_proxy(request.meta['proxy'], '')
            request.meta['proxy'] = proxy_url
            if creds and not request.headers.get('Proxy-Authorization'):
                request.headers['Proxy-Authorization'] = b'Basic ' + creds
            return
        elif not self.proxies:
            # local ip
            return

        parsed = urlparse_cached(request)
        scheme = parsed.scheme

        # 'no_proxy' is only supported by http schemes
        if scheme in ('http', 'https') and proxy_bypass(parsed.hostname):
            return

        # add proxy
        self._set_proxy(request)
Example #41
 def download_request(self, request, spider):
     p = urlparse_cached(request)
     scheme = 'https' if request.meta.get('is_secure') else 'http'
     bucket = p.hostname
     path = p.path + '?' + p.query if p.query else p.path
     url = f'{scheme}://{bucket}.s3.amazonaws.com{path}'
     if self.anon:
         request = request.replace(url=url)
     else:
         import botocore.awsrequest
         awsrequest = botocore.awsrequest.AWSRequest(
             method=request.method,
             url=f'{scheme}://s3.amazonaws.com/{bucket}{path}',
             headers=request.headers.to_unicode_dict(),
             data=request.body)
         self._signer.add_auth(awsrequest)
         request = request.replace(
             url=url, headers=awsrequest.headers.items())
     return self._download_http(request, spider)
Example #42
    def process_request(self, request: Request, spider):
        if 'proxy' in request.meta:
            if request.meta['proxy'] is None:
                return
            creds, proxy_url = self._get_proxy(request.meta['proxy'], '')
            request.meta['proxy'] = proxy_url
            if creds and not request.headers.get('Proxy-Authorization'):
                request.headers['Proxy-Authorization'] = b'Basic ' + creds
            return
        elif not self.proxies:
            return

        parsed = urlparse_cached(request)
        scheme = parsed.scheme

        if scheme in ('http', 'https') and proxy_bypass(parsed.hostname):
            return

        if scheme in self.proxies:
            self._set_proxy(request, scheme)
Example #43
    def __call__(self, request):
        url = urlparse_cached(request)

        ret = True

        for name, regex in self._url_regexes.items():
            attribute = str(getattr(url, name, ''))
            if regex is not None and not regex.match(attribute):
                ret = False
                break

        if ret: # still True after above loop
            if (self._source_anchor_regex is not None and
                not self._source_anchor_regex.match(request.meta['source_anchor'].lower())):
                ret = False
            elif self.max_depth is not None and request.meta['depth'] > self.max_depth:
                ret = False
            elif (self.max_hops_from_seed is not None and
                  request.meta['hops_from_seed'] > self.max_hops_from_seed):
                ret = False
            else:
                # test all regexes in query_regexes, ignoring unknown query args from URL
                query = urllib.parse.parse_qs(url.query)
                for k, regex in self._query_regexes.items():
                    if not ret:
                        break # also break outer loop if we break out of inner one below
                    try:
                        l = query[k]
                    except KeyError:
                        # all keys must be present
                        ret = False
                        break
                    else:
                        # key present, test all args
                        for a in l:
                            if not regex.match(a):
                                ret = False
                                break # inner loop

        # return, inverting if necessary
        return ret if not self.invert else not ret
Example #44
    def is_excluded(self, request):
        """Return whether the request is excluded (due to exclusion rules)."""
        # Build a string to match against, containing the path, and if
        # present, the query and fragment as well.
        url = urlparse_cached(request)
        match_against = url.netloc + url.path
        if url.query != '':
            match_against += "?" + url.query
        if url.fragment != '':
            match_against += "#" + url.fragment

        for rule in self.exclusion_rules:
            if isinstance(rule, str):
                # Do case-insensitive substring search
                if match_against.lower().find(rule.lower()) != -1:
                    return True
            else:
                # Do regex search against the URL
                if rule.search(match_against) is not None:
                    return True
        return False
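The exclusion rules above may be plain strings (case-insensitive substring match) or precompiled regexes (searched as-is against netloc + path, plus any query and fragment). A small sketch with hypothetical rules, mirroring the matching logic above:

    import re

    exclusion_rules = [
        'logout',                    # plain string: case-insensitive substring match
        re.compile(r'\.pdf($|\?)'),  # compiled regex: searched as-is
    ]

    match_against = 'www.example.com/files/report.pdf'
    excluded = any(
        match_against.lower().find(rule.lower()) != -1 if isinstance(rule, str)
        else rule.search(match_against) is not None
        for rule in exclusion_rules
    )
    print(excluded)  # True: the regex rule matches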
Example #45
    def process_request(self, request, spider):
        # ignore if proxy is already set

        if 'proxy' in request.meta:
            if request.meta['proxy'] is None:
                return
            # extract credentials if present
            creds, proxy_url = self._get_proxy(request.meta['proxy'], '')
            request.meta['proxy'] = proxy_url

            if creds and not request.headers.get('Proxy-Authorization'):
                request.headers['Proxy-Authorization'] = b'Basic ' + creds

            return

        parsed = urlparse_cached(request)
        scheme = parsed.scheme

        if scheme in ('http', 'https') and proxy_bypass(parsed.hostname):
            return

        self._set_proxy(request, scheme)
Example #46
    def process_request(self, request, spider):
        # ignore if proxy is already set
        if "proxy" in request.meta:
            if request.meta["proxy"] is None:
                return
            # extract credentials if present
            creds, proxy_url = self._get_proxy(request.meta["proxy"], "")
            request.meta["proxy"] = proxy_url
            if creds and not request.headers.get("Proxy-Authorization"):
                request.headers["Proxy-Authorization"] = b"Basic " + creds
            return
        elif not self.proxies:
            return

        parsed = urlparse_cached(request)
        scheme = parsed.scheme

        # 'no_proxy' is only supported by http schemes
        if scheme in ("http", "https") and proxy_bypass(parsed.hostname):
            return

        if scheme in self.proxies:
            self._set_proxy(request, scheme)
Example #47
    def process_spider_output(self, response, result, spider):
        """ Process the output from a spider and filter out any requests which
        may lead to hosts we have explicitly disallowed in the
        `disallowed_hosts` property of the spider

        Args:
            response: The response from the crawl
            result: A list of requests that are prepped from the response
            spider: The spider instance doing the crawl
        """
        disallowed_hosts = getattr(spider, 'disallowed_hosts', [])

        for x in result:
            if isinstance(x, Request):
                domain = urlparse_cached(x).hostname
                if domain and domain in disallowed_hosts:
                    # The domain is a disallowed one
                    if domain not in self.domains_seen:
                        # We only fire this once for every time we come
                        # across a domain that we're filtering
                        self.domains_seen.add(domain)
                        logger.debug(
                            " Filtered request to %(domain)s: %(request)s", {
                                'domain': domain,
                                'request': x
                            },
                            extra={'spider': spider})
                        self.stats.inc_value('disallowed/domains',
                                             spider=spider)

                    self.stats.inc_value('disallowed/filtered', spider=spider)
                else:
                    # We're not filtering this domain
                    yield x
            else:
                # Not a request, yield it
                yield x
Example #48
    def _cache_if(self, spider, request, response=None):
        """
        A request is cacheable if the URI scheme is not in
        HTTPCACHE_IGNORE_SCHEMES. By default:
            file:// - not cacheable
            http:// - cacheable

        A response is cacheable if the http response code is not in
        HTTPCACHE_IGNORE_HTTP_CODES. For example, we may choose to
        ignore 404.
        """
        cacheable_request = (urlparse_cached(request).scheme
                             not in self.ignore_schemes)

        if (not response) or (not cacheable_request):
            # == if not (response and cacheable_request)
            return cacheable_request

        cacheable_response = (
            'cached' not in response.flags and  # from HttpCacheMiddleware
            'historic' not in response.flags and  # from HistoryMiddleware
            response.status not in self.ignore_http_codes)

        return cacheable_request and cacheable_response
Example #49
    def process_request(self, request, spider):
        # ignore if proxy is already set
        try:
            if 'proxy' in request.meta:
                if request.meta['proxy'] is None:
                    return
                # extract credentials if present
                creds, proxy_url = self._get_proxy(request.meta['proxy'], '')
                request.meta['proxy'] = proxy_url
                if creds and not request.headers.get('Proxy-Authorization'):
                    request.headers['Proxy-Authorization'] = b'Basic ' + creds
                return
            elif not self.proxies:
                return
            parsed = urlparse_cached(request)
            scheme = parsed.scheme
        except (Exception) as e:
            print(e)
        # 'no_proxy' is only supported by http schemes
        if scheme in ('http', 'https') and proxy_bypass(parsed.hostname):
            return

        if scheme in self.proxies:
            self._set_proxy(request, scheme)
Example #50
 def is_cacheable(self, request):
     return urlparse_cached(request).scheme not in self.ignore_schemes
Example #51
 def should_follow(self, request, rule):
     host = urlparse_cached(request).hostname or ''
     return bool(rule.search(host))
Example #52
 def get_origin_req_host(self):
     return urlparse_cached(self.request).hostname
Example #53
 def _get_slot_key(self, request, spider):
     if 'download_slot' in request.meta:
         return request.meta['download_slot']
     host_name = urlparse_cached(request).hostname or ''
     key = my_dnscache.get(host_name)
     return key
Example #54
 def get_host(self):
     return urlparse_cached(self.request).netloc
Example #55
 def _parse_robots(self, response):
     rp = robotparser.RobotFileParser(response.url)
     rp.parse(response.body.splitlines())
     self._parsers[urlparse_cached(response).netloc] = rp
Example #56
 def _get_slot(self, request):
     downloader = self.crawler.engine.downloader
     key = urlparse_cached(request).hostname or ''
     if downloader.ip_concurrency:
         key = dnscache.get(key, key)
     return downloader.slots.get(key)
Example #57
 def should_cache_request(self, request):
     return urlparse_cached(request).scheme not in self.ignore_schemes
Example #58
 def get_type(self):
     return urlparse_cached(self.request).scheme
Example #59
 def should_follow(self, request, spider):
     regex = self.host_regex
     # hostname can be None for wrong urls (like javascript links)
     host = urlparse_cached(request).hostname or ''
     return bool(regex.search(host))
Example #60
    def _get_slot_key(self, request, spider):
        if Downloader.DOWNLOAD_SLOT in request.meta:
            return request.meta[Downloader.DOWNLOAD_SLOT]

        return urlparse_cached(request).hostname or ''