Exemple #1
0
    def process_request(self, request, spider):
        """Choose a proxy for ``request`` and store it in ``request.meta['proxy']``.

        For http/https URLs the host is first checked with ``proxy_bypass``;
        a bypassed host leaves the request untouched. Otherwise
        ``use_vpn_only(hostname, spider.vpn_only)`` decides whether the
        spider's VPN proxy is used, or the next entry of the spider's
        ``http_proxy`` / ``https_proxy`` list (round-robin through
        ``self.http_index`` / ``self.https_index``).

        Always returns ``None``.
        """
        url_parts = urlparse_cached(request)
        via_vpn = False

        # 'no_proxy' is only supported by http schemes
        if url_parts.scheme in ('http', 'https'):
            if proxy_bypass(url_parts.hostname):
                return
            via_vpn = use_vpn_only(url_parts.hostname, spider.vpn_only)

        if request.url[:7] == 'http://':
            if via_vpn:
                request.meta['proxy'] = 'http://' + spider.vpn_proxy
            else:
                # Advance the round-robin cursor, wrapping at the end of the
                # list (an empty list would raise ZeroDivisionError here).
                self.http_index = (self.http_index + 1) % len(spider.http_proxy)
                request.meta['proxy'] = spider.http_proxy[self.http_index]

        # NOTE(review): this check sits between the http and https branches,
        # so with crawlera enabled an http request has already been given a
        # proxy above while an https request gets none -- confirm intended.
        if spider.crawlera_enabled:
            return

        if request.url[:8] == 'https://':
            if via_vpn:
                request.meta['proxy'] = 'https://' + spider.vpn_proxy
            else:
                self.https_index = (self.https_index + 1) % len(spider.https_proxy)
                request.meta['proxy'] = spider.https_proxy[self.https_index]
Exemple #2
0
    def process_request(self, request, spider):
        """Attach proxy settings to ``request``; always returns ``None``.

        If ``request.meta`` already has a ``'proxy'`` key, that value is
        honoured: ``None`` leaves the request untouched, anything else is
        split by ``self._get_proxy`` into credentials and a proxy URL.
        Otherwise a proxy from ``self.proxies`` is applied for the request's
        URL scheme, unless ``proxy_bypass`` excludes the host.
        """
        meta = request.meta

        if 'proxy' in meta:
            preset = meta['proxy']
            if preset is not None:
                # Extract credentials, if present, from the preset proxy.
                creds, proxy_url = self._get_proxy(preset, '')
                meta['proxy'] = proxy_url
                # Set the authentication header on the request, without
                # overwriting one that is already there.
                if creds and not request.headers.get('Proxy-Authorization'):
                    request.headers['Proxy-Authorization'] = b'Basic ' + creds
            return

        if not self.proxies:
            return

        url_parts = urlparse_cached(request)
        scheme = url_parts.scheme

        # 'no_proxy' is only supported by http schemes
        bypassed = scheme in ('http', 'https') and proxy_bypass(url_parts.hostname)
        if not bypassed and scheme in self.proxies:
            self._set_proxy(request, scheme)
Exemple #3
0
    def process_request(self, request, spider):
        """Attach a proxy to ``request`` for a share of requests.

        A ``'proxy'`` key already present in ``request.meta`` is honoured
        (``None`` leaves the request untouched; any other value is split by
        ``self._get_proxy`` into credentials and a proxy URL). Otherwise,
        unless ``proxy_bypass`` excludes the host, ``self._set_proxy`` is
        called -- always when ``self.use_proxy_rate`` is not below 1, and
        with probability ``self.use_proxy_rate`` when it is.

        Always returns ``None``.
        """
        meta = request.meta

        if 'proxy' in meta:
            preset = meta['proxy']
            if preset is not None:
                # Extract credentials, if present, from the preset proxy.
                creds, proxy_url = self._get_proxy(preset, '')
                meta['proxy'] = proxy_url
                # Keep an existing Proxy-Authorization header untouched.
                if creds and not request.headers.get('Proxy-Authorization'):
                    request.headers['Proxy-Authorization'] = b'Basic ' + creds
            return

        url_parts = urlparse_cached(request)
        scheme = url_parts.scheme

        if scheme in ('http', 'https') and proxy_bypass(url_parts.hostname):
            return

        # With a rate below 1, only a random share of requests is proxied.
        if self.use_proxy_rate < 1 and random.random() >= self.use_proxy_rate:
            return
        self._set_proxy(request, scheme)
Exemple #4
0
 def _set_proxy(self, request, proxies):
     if not proxies:
         return
     parsed = urlparse_cached(request)
     scheme = parsed.scheme
     if scheme in ('http', 'https') and proxy_bypass(parsed.hostname):
         return
     if scheme not in proxies:
         return
     creds, proxy = proxies[scheme]
     request.meta['proxy'] = proxy
     if creds:
         request.headers['Proxy-Authorization'] = b'Basic ' + creds
Exemple #5
0
    def process_request(self, request, spider):
        """Assign a proxy from ``self.proxies`` to ``request`` when applicable.

        Requests whose ``meta`` already contains a ``'proxy'`` key are left
        alone, as are http/https requests whose host ``proxy_bypass``
        excludes. Otherwise ``self._set_proxy`` is called when the URL scheme
        has an entry in ``self.proxies``. Always returns ``None``.
        """
        # Ignore the request if a proxy is already set.
        if 'proxy' in request.meta:
            return

        url_parts = urlparse_cached(request)
        scheme = url_parts.scheme

        # 'no_proxy' is only supported by http schemes
        bypassed = scheme in ('http', 'https') and proxy_bypass(url_parts.hostname)
        if not bypassed and scheme in self.proxies:
            self._set_proxy(request, scheme)
Exemple #6
0
    def process_request(self, request, spider):
        """Pick a proxy for ``request`` based on its URL scheme.

        Nothing is done when ``request.meta`` already has a ``'proxy'`` key
        or when ``proxy_bypass`` excludes the host of an http/https request.
        Otherwise, if ``self.proxies`` has an entry for the scheme,
        ``self._set_proxy`` applies it. Always returns ``None``.
        """
        if 'proxy' not in request.meta:
            target = urlparse_cached(request)
            url_scheme = target.scheme

            # 'no_proxy' is only supported by http schemes
            if url_scheme in ('http', 'https') and proxy_bypass(target.hostname):
                return

            if url_scheme in self.proxies:
                self._set_proxy(request, url_scheme)
    def process_request(self, request, spider):
        """Refresh the proxy pool periodically and assign a proxy to ``request``.

        The module-level ``count`` is incremented on every call; when it
        reaches a multiple of 100 it is reset to 1 and
        ``self._update_proxies()`` runs.

        ``request.meta`` keys consulted:

        * ``'direct_connect'`` -- always removed; when truthy, any ``'proxy'``
          key is dropped as well and the method returns.
        * ``'proxy'`` -- ``None`` delegates to ``self._set_proxy(request)``;
          any other value is split by ``self._get_proxy`` into credentials
          and a proxy URL.

        Without a ``'proxy'`` key, ``self._set_proxy(request)`` is called
        unless ``self.proxies`` is empty or ``proxy_bypass`` excludes the
        host. Always returns ``None``.
        """
        global count

        # Update the proxies on every 100th tick of the shared counter.
        count += 1
        if count % 100 == 0:
            count = 1
            self._update_proxies()

        meta = request.meta

        if 'direct_connect' in meta:
            # The flag is consumed whether or not it is truthy.
            if meta.pop('direct_connect'):
                meta.pop('proxy', None)
                logger.debug('HTTP_PROXY-->Direct')
                return

        # A proxy key is already present: replace or normalise it.
        if 'proxy' in meta:
            if meta['proxy'] is None:
                self._set_proxy(request)
                return
            creds, proxy_url = self._get_proxy(meta['proxy'], '')
            meta['proxy'] = proxy_url
            # Keep an existing Proxy-Authorization header untouched.
            if creds and not request.headers.get('Proxy-Authorization'):
                request.headers['Proxy-Authorization'] = b'Basic ' + creds
            return

        if not self.proxies:
            # No proxies configured: the request is left without one.
            return

        url_parts = urlparse_cached(request)

        # 'no_proxy' is only supported by http schemes
        if url_parts.scheme in ('http', 'https') and proxy_bypass(url_parts.hostname):
            return

        # Add a proxy.
        self._set_proxy(request)
    def process_request(self, request, spider):
        """Attach proxy settings to ``request``; always returns ``None``.

        If ``request.meta`` already has a ``'proxy'`` key, that value is
        honoured: ``None`` leaves the request untouched, anything else is
        split by ``self._get_proxy`` into credentials and a proxy URL, and a
        ``Proxy-Authorization`` header is added when credentials exist and
        no such header is set yet. Otherwise a proxy from ``self.proxies``
        is applied for the request's URL scheme, unless ``self.proxies`` is
        empty or ``proxy_bypass`` excludes the host.

        This is a plain method, not a generator: a ``yield`` anywhere in the
        body would defer all of the work above until the result is iterated
        and hand the caller a generator object instead of ``None``.
        """
        # NOTE(review): the previous comment here mentioned resetting the
        # proxy on retries (dont_filter=True); nothing in this method does
        # that -- confirm whether it is handled elsewhere.
        if 'proxy' in request.meta:
            if request.meta['proxy'] is None:
                return
            # Extract credentials, if present, from the preset proxy.
            creds, proxy_url = self._get_proxy(request.meta['proxy'], '')
            request.meta['proxy'] = proxy_url
            # Keep an existing Proxy-Authorization header untouched.
            if creds and not request.headers.get('Proxy-Authorization'):
                request.headers['Proxy-Authorization'] = b'Basic ' + creds
            return
        elif not self.proxies:
            return

        parsed = urlparse_cached(request)
        scheme = parsed.scheme

        # 'no_proxy' is only supported by http schemes
        if scheme in ('http', 'https') and proxy_bypass(parsed.hostname):
            return

        if scheme in self.proxies:
            self._set_proxy(request, scheme)
Exemple #9
0
    def process_request(self, request, spider):
        """Apply proxy settings to ``request``; always returns ``None``.

        A ``'proxy'`` key already present in ``request.meta`` wins: ``None``
        leaves the request untouched, any other value is passed to
        ``self._get_proxy`` to separate credentials from the proxy URL.
        Without that key, the proxy registered in ``self.proxies`` for the
        URL scheme is applied through ``self._set_proxy``, unless
        ``self.proxies`` is empty or ``proxy_bypass`` excludes the host.
        """
        if 'proxy' not in request.meta:
            if not self.proxies:
                return

            target = urlparse_cached(request)
            url_scheme = target.scheme

            # 'no_proxy' is only supported by http schemes
            if url_scheme in ('http', 'https') and proxy_bypass(target.hostname):
                return

            if url_scheme in self.proxies:
                self._set_proxy(request, url_scheme)
            return

        configured = request.meta['proxy']
        if configured is None:
            return

        # Extract credentials, if present, from the preset proxy.
        credentials, proxy_url = self._get_proxy(configured, '')
        request.meta['proxy'] = proxy_url
        # Keep an existing Proxy-Authorization header untouched.
        if credentials and not request.headers.get('Proxy-Authorization'):
            request.headers['Proxy-Authorization'] = b'Basic ' + credentials