Example #1
    def process_response(self, request, response, spider):
        if not self._is_enabled_for_request(request):
            return response

        if response.status in self.ban_code:
            self.invaild_proxy(request.meta['proxy'])
            logger.debug("Proxy[%s] ban because return httpstatuscode:[%s]. ",
                         request.meta['proxy'], str(response.status))
            new_request = request.copy()
            new_request.dont_filter = True
            return new_request

        if self.ban_re:
            try:
                pattern = re.compile(self.ban_re)
            except TypeError:
                logger.error('Wrong "ban_re", please check settings')
                return response
            # Note: response.body is bytes, so ban_re should be a bytes
            # pattern (or the body decoded) for the search to match.
            match = re.search(pattern, response.body)
            if match:
                self.invaild_proxy(request.meta['proxy'])
                logger.debug("Proxy[%s] ban because pattern match:[%s]. ",
                             request.meta['proxy'], str(match))
                new_request = request.copy()
                new_request.dont_filter = True
                return new_request

        p = request.meta['proxy']
        self.counter_proxy[p] = self.counter_proxy.setdefault(p, 1) + 1
        return response
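
For the middleware above to take effect, it has to be registered in the project's Scrapy settings. A minimal sketch follows; the module path, the class name HttpProxyBanMiddleware, and the BAN_CODE / BAN_RE setting names are assumptions, not taken from the example.

# settings.py (sketch; names are assumptions)
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.HttpProxyBanMiddleware': 543,
}
BAN_CODE = [403, 429, 503]          # HTTP statuses treated as a proxy ban
BAN_RE = rb'Access Denied|captcha'  # body pattern treated as a proxy ban
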
Example #2
    def _retry(self, request, reason, spider):
        retries = request.meta.get('retry_times', 0) + 1

        retry_times = self.max_retry_times

        if 'max_retry_times' in request.meta:
            retry_times = request.meta['max_retry_times']

        stats = spider.crawler.stats
        if retries <= retry_times:
            logger.debug("Retrying %(request)s (failed %(retries)d times): %(reason)s",
                         {'request': request, 'retries': retries, 'reason': reason},
                         extra={'spider': spider})
            retryreq = request.copy()
            retryreq.meta['retry_times'] = retries
            retryreq.dont_filter = True
            retryreq.priority = request.priority + self.priority_adjust
            # Fetch a fresh proxy (getProxy() is an external helper that
            # returns "host:port") and attach it to the retried request.
            proxy_ip = "http://" + getProxy()
            retryreq.meta['proxy'] = proxy_ip

            if isinstance(reason, Exception):
                reason = global_object_name(reason.__class__)

            stats.inc_value('retry/count')
            stats.inc_value('retry/reason_count/%s' % reason)
            # print(retryreq)
            # print("*"*100)
            return retryreq
        else:
            stats.inc_value('retry/max_reached')
            logger.debug("Gave up retrying %(request)s (failed %(retries)d times): %(reason)s",
                         {'request': request, 'retries': retries, 'reason': reason},
                         extra={'spider': spider})
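
The attributes used in _retry (max_retry_times, priority_adjust) mirror Scrapy's built-in RetryMiddleware. A minimal sketch of how they might be initialised from settings; the class name ProxyRetryMiddleware is an assumption, and RETRY_TIMES / RETRY_PRIORITY_ADJUST are the standard Scrapy settings the stock middleware reads.

class ProxyRetryMiddleware(object):
    def __init__(self, settings):
        # Standard Scrapy retry settings.
        self.max_retry_times = settings.getint('RETRY_TIMES')
        self.priority_adjust = settings.getint('RETRY_PRIORITY_ADJUST')

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings)
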
Example #3

def baseQuery(request, lang):

    queryResults = []

    request['action'] = 'query'
    # Request JSON output so the response can be parsed into a dict below.
    request['format'] = 'json'

    lastContinue = {'continue': ''}
    while True:
        # Clone original request
        req = request.copy()
        # Modify it with the values returned in the 'continue' section of the last result.
        req.update(lastContinue)

        # Call the API and parse the JSON response into a dict.
        result = requests.get('http://' + lang + '.wikipedia.org/w/api.php',
                              params=req).json()
        if 'error' in result:
            # 'Error' is assumed to be a custom exception defined elsewhere.
            raise Error(result['error'])
        if 'warnings' in result:
            print(result['warnings'])
        if 'query' in result:
            #yield result['query']
            queryResults.append(result['query'])
        if 'continue' not in result:
            break
        lastContinue = result['continue']

    return queryResults
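
A hypothetical call to baseQuery, using a standard MediaWiki list query; the parameter values are only illustrative.

# Collect every member of a category, following 'continue' automatically.
pages = baseQuery({'list': 'categorymembers',
                   'cmtitle': 'Category:Physics',
                   'cmlimit': 'max'}, 'en')
for block in pages:
    for member in block['categorymembers']:
        print(member['title'])
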
Example #4
    def process_exception(self, request, exception, spider):
        if isinstance(exception, self.EXCEPTIONS_TO_CHANGE) \
                and request.meta.get('proxy', False):
            self.invaild_proxy(request.meta['proxy'])
            logger.debug("Proxy[%s] connect exception[%s].",
                         request.meta['proxy'], exception)
            new_request = request.copy()
            new_request.dont_filter = True
            return new_request
Example #5
    def add_proxy(self, request):
        new_request = request.copy()
        new_request.meta["proxy"] = self.proxy_server
        new_request.headers["Proxy-Authorization"] = self.proxy_auth
        new_request.dont_filter = False
        logger.debug("Use proxy to request %s", new_request.url)
        new_request.priority = new_request.priority + RETRY_PRIORITY_ADJUST
        time.sleep(HTTPPROXY_DELAY)
        return new_request
Example #6
    def process_exception(self, request, exception, spider):

        from twisted.internet.error import TimeoutError
        if isinstance(exception, self.EXCEPTIONS_TO_CHANGE) \
                and request.meta.get('proxy', False):
            self.invalid_proxy(request.meta['proxy'])
            logger.debug("Proxy[%s] connect exception[%s].",
                         request.meta['proxy'], exception)
            new_request = request.copy()
            new_request.dont_filter = True
            return new_request

        elif isinstance(exception, TimeoutError):
            print('TimeoutError; the request would be returned here')
            # return request
        elif isinstance(exception, TypeError):
            print('TypeError; the request would be returned here')
Example #7
    def __call__(self, request):
        requests_request = request.copy()

        destination_urls = self._create_forwarded_urls(requests_request.url)

        # Use gzip even if the original requestor didn't support it
        requests_request.headers['Accept-Encoding'] = 'gzip,identity'
        # Host header is automatically added for each request by grequests
        del requests_request.headers['Host']

        requests = (grequests.request(requests_request.method,
                                      destination_url,
                                      data=requests_request.body,
                                      headers=requests_request.headers,
                                      allow_redirects=False,
                                      verify=True)
                    for destination_url in destination_urls)

        exhandler = partial(helpers.log_failures, environ=request.environ)
        requests_responses = grequests.map(requests,
                                           stream=True,
                                           exception_handler=exhandler)

        self._log_responses(request, requests_responses)
        requests_responses = self._filter_responses(requests_responses)

        response = None
        if None in requests_responses:
            response = MetadataResponse(request, requests_responses)
            response.response.status = 504
        elif ('Proxy-Aggregator-Body' in request.headers
              and request.headers['Proxy-Aggregator-Body'].lower()
              == 'response-metadata'):
            response = MetadataResponse(request, requests_responses)
        elif len(requests_responses) == 1:
            response = SingleResponse(request, requests_responses[0])
        elif any(r.status_code >= 400 for r in requests_responses):
            response = ErrorResponse(request, requests_responses,
                                     self._priority_errors)
        else:
            response = MultipleResponse(request, requests_responses)

        return response.response
Example #8
    def download_image(self, request):
        """Attempt to download an image from the grabber's URL.

        By default, this will attempt to use the built in
        ``get_image_from_url(url)`` method, however a different callable
        can be chosen by setting the grabber's ``download_callable`` attribute.

        The returned "result" dict contains the downloaded image (if download
        was successful), information about any errors, the requested URL, etc.,
        which should be enough for result handlers to do their thing.

        The callable used as the downloader must have the following
        signature::

            def some_downloader(url, grabber):

        Args:
            request (dict): The "request" dictionary.

        Returns:
            dict: A "result" dictionary.
        """
        url = request['url']
        download_callable = self.get_download_callable()

        result = request.copy()
        result['error'] = None

        im = None
        try:
            im = download_callable(url, self)
        except Exception as e:
            if not self.ignore_download_exception(e):
                raise e
            result['error'] = e

        result['image'] = im

        return result
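
To illustrate the contract described in the docstring, here is a hypothetical downloader callable; only the (url, grabber) signature and the download_callable attribute come from the example, while the use of requests and Pillow is an assumption.

import io

import requests
from PIL import Image

def some_downloader(url, grabber):
    # Fetch the bytes and return a PIL image; the grabber argument is
    # available for per-grabber options such as timeouts or auth.
    resp = requests.get(url, timeout=10)
    resp.raise_for_status()
    return Image.open(io.BytesIO(resp.content))

# grabber.download_callable = some_downloader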