Ejemplo n.º 1
0
    def build_check_request(self, item):
        """Build the request used to check one proxy item.

        Reads ``scheme`` and ``url`` from ``item``, asks
        ``self.get_check_approach(scheme)`` for a URL template and a response
        parser, and returns a ``Request`` for the formatted URL whose callback
        is ``self.check_ip``.

        The request meta carries the proxy URL, a retry limit of 3, a download
        timeout of 20, the original item, the time the request was built and
        the response parser.

        When ``self.name`` equals ``'checker'`` the request's errback is set
        to ``self.check_ip_failed``.
        """
        protocol = item.get('scheme')
        proxy_address = item.get('url')
        self.logger.debug('Checking proxy: %s' % proxy_address)

        url_template, parser = self.get_check_approach(protocol)
        target_url = url_template.format(scheme=protocol)

        request_meta = dict(
            proxy=proxy_address,
            max_retry_times=3,
            download_timeout=20,
            _item_obj=item,
            # Timestamp taken when the request is built.
            _start_time=time.time(),
            _response_parser=parser,
        )
        request = Request(
            target_url,
            callback=self.check_ip,
            meta=request_meta,
            dont_filter=True,
        )

        # Only the spider named 'checker' gets a failure handler attached.
        if self.name != 'checker':
            return request

        request.errback = self.check_ip_failed
        return request
Ejemplo n.º 2
0
    def build_check_request(self, item: Proxy):
        """Build the request used to check one proxy item.

        Reads ``scheme`` and ``url`` from ``item``, asks
        ``self.get_check_approach(scheme)`` for a URL template and a response
        parser, and returns a ``Request`` for the formatted URL whose callback
        is ``self.check_ip``.

        The request meta carries the proxy URL, a retry limit of 5, the
        download timeout (``config.CHECK_TIMEOUT`` if that attribute exists,
        otherwise 20), the original item and the response parser.

        When ``self.name`` equals ``'checker'`` the request's errback is set
        to ``self.check_ip_failed``.
        """
        protocol = item.get('scheme')
        proxy_address = item.get('url')
        self.logger.debug('Checking %s' % proxy_address)

        url_template, parser = self.get_check_approach(protocol)
        target_url = url_template.format(scheme=protocol)

        request_meta = dict(
            proxy=proxy_address,
            max_retry_times=5,
            # Falls back to 20 when config defines no CHECK_TIMEOUT.
            download_timeout=getattr(config, 'CHECK_TIMEOUT', 20),
            _item_obj=item,
            _response_parser=parser,
        )
        request = Request(
            target_url,
            callback=self.check_ip,
            meta=request_meta,
            dont_filter=True,
        )

        # Only the spider named 'checker' gets a failure handler attached.
        if self.name != 'checker':
            return request

        request.errback = self.check_ip_failed
        return request
Ejemplo n.º 3
0
    def next_request(self):
        """Pop the next queued item and convert it into a Request.

        ``self.idle_before_close`` is passed to ``self.queue.pop`` as its
        timeout argument.

        The popped item is read as a mapping with these keys:
          * ``url`` (required) - request URL; ``http://`` is prepended when
            the URL is rejected with ``ValueError``.
          * ``meta`` (required) - mapping; its ``item`` and ``field_css``
            entries are copied into the request meta. When it has no
            ``field_css`` entry, the whole ``meta`` mapping is stored under
            ``field_css`` instead.
          * ``callback`` / ``errback`` (optional) - names of methods on
            ``self.spider`` to attach to the request.
          * ``useragent`` (optional) - value for the ``User-Agent`` header.

        Returns:
            The constructed Request, or None when the queue yields nothing.

        Raises:
            KeyError: if the item has no ``url`` or ``meta`` key.
            ValueError: if the URL is still rejected after prefixing
                ``http://``.
        """
        item = self.queue.pop(self.idle_before_close)
        if not item:
            return None

        try:
            req = Request(item['url'])
        except ValueError:
            # need absolute url
            # need better url validation here
            # Retrying with the very same string would only raise again, so
            # assume plain http for URLs that were given without a scheme.
            req = Request('http://' + item['url'])

        # Resolve optional handler names to bound methods on the spider.
        for hook in ('callback', 'errback'):
            try:
                if hook in item and item[hook] is not None:
                    setattr(req, hook, getattr(self.spider, item[hook]))
            except AttributeError:
                # Best effort: keep the request and leave the handler unset.
                print('Unable to find %s method %r on spider'
                      % (hook, item[hook]))

        # defaults not in schema
        # NOTE(review): these defaults are stored on the item only; nothing in
        # this method copies them onto the request - confirm whether they are
        # meant to end up in req.meta.
        if 'curdepth' not in item:
            item['curdepth'] = 0
        if 'retry_times' not in item:
            item['retry_times'] = 0

        item_meta = item['meta']
        # field_css falls back to the whole meta mapping when the item does
        # not provide a dedicated 'field_css' entry.
        req.meta['field_css'] = (
            item_meta['field_css'] if 'field_css' in item_meta else item_meta
        )
        if 'item' in item_meta:
            req.meta['item'] = item_meta['item']

        # extra check to add items to request
        if 'useragent' in item and item['useragent'] is not None:
            req.headers['User-Agent'] = item['useragent']

        return req