Example #1
def normalize_post_data(data, encoding='utf-8'):
    if isinstance(data, six.text_type):
        return make_str(data, encoding=encoding)
    elif isinstance(data, six.binary_type):
        return data
    else:
        # it calls `normalize_http_values()`
        return make_str(smart_urlencode(data, encoding))
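For illustration, here is a hypothetical stand-in for the helper used above, plus the result of each branch (the real make_str and smart_urlencode live in Grab's utility modules; this only sketches their assumed behaviour):

def make_str(value, encoding='utf-8', errors='strict'):
    # Assumed behaviour: encode text to bytes, pass bytes through unchanged.
    if isinstance(value, str):
        return value.encode(encoding, errors)
    return value

# normalize_post_data(u'a=1')      -> b'a=1'   (text branch: encoded)
# normalize_post_data(b'a=1')      -> b'a=1'   (bytes branch: returned as-is)
# normalize_post_data({'a': '1'})  -> b'a=1'   (dict branch: url-encoded)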
Example #2
def save_list(lst, path):
    """
    Save items from the list to a file, one item per line.
    """

    with open(path, 'wb') as out:
        lines = []
        for item in lst:
            if isinstance(item, (six.text_type, six.binary_type)):
                lines.append(make_str(item))
            else:
                lines.append(make_str(json.dumps(item)))
        out.write(b'\n'.join(lines) + b'\n')
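A possible usage, assuming make_str encodes text to UTF-8 bytes as sketched above: plain strings are written verbatim while other values are JSON-encoded (the path is only an example):

save_list([u'hello', {'n': 1}], '/tmp/items.txt')
# /tmp/items.txt now contains:
# hello
# {"n": 1}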
Example #3
 def api_info(self):
     #if result and self.cache_reader_service:
     #    result = result and (
     #        not self.cache_reader_service.input_queue.qsize()
     #        and not self.cache_writer_service.input_queue.qsize()
     info = {
         'counters': self.spider.stat.counters,
         'collections': dict((x, len(y)) for (x, y)
                             in self.spider.stat.collections.items()),
         'thread_number': self.spider.thread_number,
         'parser_pool_size': self.spider.parser_pool_size,
         'task_queue': self.spider.task_queue.size(),
         'task_dispatcher_input_queue':
             self.spider.task_dispatcher.input_queue.qsize(),
         'parser_service_input_queue':
             self.spider.parser_service.input_queue.qsize(),
         'network_service_active_threads':
             self.spider.network_service.get_active_threads_number(),
         'cache_reader_input_queue': (
             self.spider.cache_reader_service.input_queue.size()
             if self.spider.cache_reader_service else '--'),
         'cache_writer_input_queue': (
             self.spider.cache_writer_service.input_queue.qsize()
             if self.spider.cache_writer_service else '--'),
     }
     content = make_str(json.dumps(info))
     self.response(content=content)
Example #4
 def api_info(self):
     #if result and self.cache_reader_service:
     #    result = result and (
     #        not self.cache_reader_service.input_queue.qsize()
     #        and not self.cache_writer_service.input_queue.qsize()
     info = {
         'counters': self.spider.stat.counters,
         'collections': dict((x, len(y)) for (x, y)
                             in self.spider.stat.collections.items()),
         'thread_number': self.spider.thread_number,
         'parser_pool_size': self.spider.parser_pool_size,
         'task_queue': self.spider.task_queue.size(),
         'task_dispatcher_input_queue': (
             self.spider.task_dispatcher.input_queue.qsize()
         ),
         'parser_service_input_queue': (
             self.spider.parser_service.input_queue.qsize()
         ),
         'network_service_active_threads': (
             self.spider.network_service.get_active_threads_number()
         ),
         'cache_reader_input_queue': (
             self.spider.cache_reader_service.input_queue.size()
             if self.spider.cache_reader_service else '--'
         ),
         'cache_writer_input_queue': (
             self.spider.cache_writer_service.input_queue.qsize()
             if self.spider.cache_writer_service else '--'
         ),
     }
     content = make_str(json.dumps(info))
     self.response(content=content)
Example #5
 def test_normalize_bytes(self):
     fh, path = mkstemp()
     dumper = MysqlCSVDumper(path)
     dumper.add_row((make_str('фуу'),))
     dumper.close()
     self.assertTrue(u'фуу' in open(path, encoding='utf-8').read())
     os.unlink(path)
Example #6
    def prepare_response(self, grab):
        #if self.body_file:
        #    self.body_file.close()
        response = Response()

        head = ''
        for key, val in self._response.getheaders().items():
            head += '%s: %s\r\n' % (key, val)
        head += '\r\n'
        response.head = make_str(head, encoding='latin', errors='ignore')

        #if self.body_path:
        #    response.body_path = self.body_path
        #else:
        #    response.body = b''.join(self.response_body_chunks)
        if self._request._response_path:
            response.body_path = self._request._response_path
            # Quick dirty hack: actually, the response is fully read into memory
            self._request._response_file.write(self._response.read())
            self._request._response_file.close()
        else:
            if self._request.body_maxsize is not None:
                #if self.response_body_bytes_read > self.config_body_maxsize:
                #    logger.debug('Response body max size limit reached: %s' %
                #                 self.config_body_maxsize)
                response.body = self._response.read(self._request.body_maxsize)
            else:
                response.body = self._response.read()

        # Clear memory
        #self.response_header_chunks = []

        response.code = self._response.status
        #response.total_time = self.curl.getinfo(pycurl.TOTAL_TIME)
        #response.connect_time = self.curl.getinfo(pycurl.CONNECT_TIME)
        #response.name_lookup_time = self.curl.getinfo(pycurl.NAMELOOKUP_TIME)
        #response.download_size = self.curl.getinfo(pycurl.SIZE_DOWNLOAD)
        #response.upload_size = self.curl.getinfo(pycurl.SIZE_UPLOAD)
        #response.download_speed = self.curl.getinfo(pycurl.SPEED_DOWNLOAD)
        #response.remote_ip = self.curl.getinfo(pycurl.PRIMARY_IP)

        response.url = (self._response.get_redirect_location()
                        or self._request.url)

        import email.message
        hdr = email.message.Message()
        for key, val in self._response.getheaders().items():
            hdr[key] = val
        response.parse(charset=grab.config['document_charset'], headers=hdr)

        jar = self.extract_cookiejar(self._response, self._request)
        response.cookies = CookieManager(jar)

        # We do not need the cookies stored in the
        # curl instance anymore, so drop them
        #self.curl.setopt(pycurl.COOKIELIST, 'ALL')
        return response
Example #7
 def api_info(self):
     info = {
         "counters": self.spider.stat.counters,
         "collections": dict((x, len(y)) for (x, y) in self.spider.stat.collections.items()),
         "thread_number": self.spider.thread_number,
         "parser_pool_size": self.spider.parser_pool_size,
     }
     content = make_str(json.dumps(info))
     self.response(content=content)
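Stripped of the spider object, the handler reduces to JSON-encoding a plain dict and sending the resulting bytes; a self-contained sketch (the counter names are made up):

import json

info = {'counters': {'pages-fetched': 10}, 'thread_number': 2}
content = json.dumps(info).encode('utf-8')  # what make_str(json.dumps(info)) yields on py3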
Example #8
def normalize_url(url):
    # https://tools.ietf.org/html/rfc3986
    url = make_unicode(url)
    if RE_NOT_SAFE_CHAR.search(url):
        parts = list(urlsplit(url))
        # Scheme
        pass
        # Network location (user:pass@hostname)
        if RE_NON_ALPHA_DIGIT_NETLOC.search(parts[1]):
            parts[1] = parts[1].encode('idna')
        # Path
        parts[2] = quote(make_str(parts[2]), safe=RESERVED_CHARS)
        # Query
        parts[3] = quote(make_str(parts[3]), safe=RESERVED_CHARS)
        # Fragment
        parts[4] = quote(make_str(parts[4]), safe=RESERVED_CHARS)
        return urlunsplit(map(make_unicode, parts))
    return url
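Illustrative behaviour, assuming RESERVED_CHARS holds the RFC 3986 reserved set and RE_NOT_SAFE_CHAR matches characters outside it (both are defined elsewhere in the module):

# Path characters outside the safe set are percent-encoded:
# normalize_url(u'http://example.com/путь')
#   -> u'http://example.com/%D0%BF%D1%83%D1%82%D1%8C'
# A non-ASCII hostname is IDNA-encoded into its xn-- form.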
Example #9
    def prepare_response(self, grab):
        #if self.body_file:
        #    self.body_file.close()
        response = Response()

        head = ''
        for key, val in self._response.getheaders().items():
            head += '%s: %s\r\n' % (key, val)
        head += '\r\n'
        response.head = make_str(head, encoding='latin', errors='ignore')

        #if self.body_path:
        #    response.body_path = self.body_path
        #else:
        #    response.body = b''.join(self.response_body_chunks)
        if self._request._response_path:
            response.body_path = self._request._response_path
            # Quick dirty hack: actually, the response is fully read into memory
            self._request._response_file.write(self._response.read())
            self._request._response_file.close()
        else:
            if self._request.body_maxsize is not None:
                #if self.response_body_bytes_read > self.config_body_maxsize:
                #    logger.debug('Response body max size limit reached: %s' %
                #                 self.config_body_maxsize)
                response.body = self._response.read(self._request.body_maxsize)
            else:
                response.body = self._response.read()

        # Clear memory
        #self.response_header_chunks = []

        response.code = self._response.status
        #response.total_time = self.curl.getinfo(pycurl.TOTAL_TIME)
        #response.connect_time = self.curl.getinfo(pycurl.CONNECT_TIME)
        #response.name_lookup_time = self.curl.getinfo(pycurl.NAMELOOKUP_TIME)
        #response.download_size = self.curl.getinfo(pycurl.SIZE_DOWNLOAD)
        #response.upload_size = self.curl.getinfo(pycurl.SIZE_UPLOAD)
        #response.download_speed = self.curl.getinfo(pycurl.SPEED_DOWNLOAD)
        #response.remote_ip = self.curl.getinfo(pycurl.PRIMARY_IP)

        response.url = self._response.get_redirect_location() or self._request.url

        import email.message
        hdr = email.message.Message()
        for key, val in self._response.getheaders().items():
            hdr[key] = val
        response.parse(charset=grab.config['document_charset'],
                       headers=hdr)

        jar = self.extract_cookiejar(self._response, self._request)
        response.cookies = CookieManager(jar)

        # We do not need the cookies stored in the
        # curl instance anymore, so drop them
        #self.curl.setopt(pycurl.COOKIELIST, 'ALL')
        return response
Example #10
 def api_info(self):
     info = {
         'counters': self.spider.stat.counters,
         'collections': dict((x, len(y)) for (x, y)
                             in self.spider.stat.collections.items()),
         'thread_number': self.spider.thread_number,
         'parser_pool_size': self.spider.parser_pool_size,
     }
     content = make_str(json.dumps(info))
     self.response(content=content)
Example #11
    def request(self):
        req = self._request

        if req.proxy:
            if req.proxy_userpwd:
                headers = make_headers(proxy_basic_auth=req.proxy_userpwd)
            else:
                headers = None
            proxy_url = '%s://%s' % (req.proxy_type, req.proxy)
            if req.proxy_type == 'socks5':
                pool = SOCKSProxyManager(proxy_url)  # , proxy_headers=headers)
            else:
                pool = ProxyManager(proxy_url, proxy_headers=headers)
        else:
            pool = self.pool
        try:
            retry = Retry(redirect=False, connect=False, read=False)
            # The read timeout is not total response time timeout
            # It is the timeout on read of next data chunk from the server
            # Total response timeout is handled by Grab
            timeout = Timeout(connect=req.connect_timeout, read=req.timeout)
            #req_headers = dict((make_unicode(x), make_unicode(y))
            #                   for (x, y) in req.headers.items())
            if six.PY3:
                req_url = make_unicode(req.url)
                req_method = make_unicode(req.method)
            else:
                req_url = make_str(req.url)
                req_method = req.method
            req.op_started = time.time()
            res = pool.urlopen(req_method,
                               req_url,
                               body=req.data,
                               timeout=timeout,
                               retries=retry,
                               headers=req.headers,
                               preload_content=False)
        except exceptions.ReadTimeoutError as ex:
            raise error.GrabTimeoutError('ReadTimeoutError', ex)
        except exceptions.ConnectTimeoutError as ex:
            raise error.GrabConnectionError('ConnectTimeoutError', ex)
        except exceptions.ProtocolError as ex:
            # TODO:
            # the code
            # raise error.GrabConnectionError(ex.args[1][0], ex.args[1][1])
            # fails
            # with error TypeError: 'OSError' object is not subscriptable
            raise error.GrabConnectionError('ProtocolError', ex)

        # WTF?
        self.request_head = b''
        self.request_body = b''
        self.request_log = b''

        self._response = res
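For reference, a self-contained sketch of the urllib3 calls this transport wraps; PoolManager, Retry, Timeout and preload_content are standard urllib3 APIs, while the translation into Grab exceptions is what the method above adds (URL and timeouts are placeholder values):

import urllib3
from urllib3.util.retry import Retry
from urllib3.util.timeout import Timeout

pool = urllib3.PoolManager()
res = pool.urlopen(
    'GET', 'http://example.com/',
    retries=Retry(redirect=False, connect=False, read=False),
    timeout=Timeout(connect=3, read=10),
    preload_content=False,  # stream the body; read() it later
)
body = res.read()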
Example #12
 def process(item):
     key, value = item
     # Process key
     if isinstance(key, six.text_type):
         key = make_str(key, encoding=charset)
     # Process value
     if ignore_classes and isinstance(value, ignore_classes):
         pass
     elif isinstance(value, six.text_type):
         value = make_str(value, encoding=charset)
     elif value is None:
         value = b''
     elif isinstance(value, (list, tuple)):
         for subval in value:
             for res in process((key, subval)):
                 yield res
         return
     else:
         value = make_str(value)
     yield key, value
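A usage sketch: in the real code process() is a closure inside normalize_http_values(), with charset and ignore_classes taken from the enclosing scope; list values are flattened into repeated (key, value) pairs:

# With charset='utf-8' and ignore_classes=():
# list(process(('tag', ['a', 'b'])))  ->  [(b'tag', b'a'), (b'tag', b'b')]
# list(process(('tag', None)))        ->  [(b'tag', b'')]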
Example #13
    def request(self):
        req = self._request

        if req.proxy:
            if req.proxy_userpwd:
                headers = make_headers(proxy_basic_auth=req.proxy_userpwd)
            else:
                headers = None
            proxy_url = '%s://%s' % (req.proxy_type, req.proxy)
            try:
                pool = ProxyManager(proxy_url, proxy_headers=headers)
            except ProxySchemeUnknown:
                raise GrabMisuseError('Urllib3 transport does '
                                      'not support %s proxies' %
                                      req.proxy_type)
        else:
            pool = self.pool
        try:
            retry = Retry(redirect=False, connect=False, read=False)
            # The read timeout is not total response time timeout
            # It is the timeout on read of next data chunk from the server
            # Total response timeout is handled by Grab
            timeout = Timeout(connect=req.connect_timeout, read=req.timeout)
            #req_headers = dict((make_unicode(x), make_unicode(y))
            #                   for (x, y) in req.headers.items())
            if six.PY3:
                req_url = make_unicode(req.url)
                req_method = make_unicode(req.method)
            else:
                req_url = make_str(req.url)
                req_method = req.method
            req.op_started = time.time()
            res = pool.urlopen(req_method,
                               req_url,
                               body=req.data,
                               timeout=timeout,
                               retries=retry,
                               headers=req.headers,
                               preload_content=False)
        except exceptions.ReadTimeoutError as ex:
            raise error.GrabTimeoutError('Read timeout')
        except exceptions.ConnectTimeoutError as ex:
            raise error.GrabConnectionError('Could not create connection')
        except exceptions.ProtocolError as ex:
            raise error.GrabConnectionError(ex.args[1][0], ex.args[1][1])

        # WTF?
        self.request_head = b''
        self.request_body = b''
        self.request_log = b''

        self._response = res
Example #14
    def execute_task_handler(self, res, handler):
        """
        Apply `handler` function to the network result.

        If the network result is a failure, the task is submitted
        again to the network task queue.
        """

        try:
            handler_name = handler.__name__
        except AttributeError:
            handler_name = 'NONE'

        if (res['task'].get('raw') or (
            res['ok'] and self.valid_response_code(res['grab'].response.code,
                                                   res['task']))):
            try:
                with self.stat.log_time('response_handler'):
                    with self.stat.log_time('response_handler.%s' % handler_name):
                        result = handler(res['grab'], res['task'])
                        if result is None:
                            pass
                        else:
                            for item in result:
                                self.process_handler_result(item, res['task'])
            except NoDataHandler as ex:
                raise
            except Exception as ex:
                self.process_handler_error(handler_name, ex, res['task'])
            else:
                self.stat.inc('spider:task-%s-ok' % res['task'].name)
        else:
            # Log the error
            if res['ok']:
                msg = res['emsg'] = 'HTTP %s' % res['grab'].response.code
            else:
                msg = res['emsg']

            self.stat.inc('spider:network-error-%s' %
                          make_str(res['emsg'][:20], errors='ignore'))
            logger.error(u'Network error: %s' %
                         make_unicode(msg, errors='ignore'))

            # Try to repeat the same network query
            if self.network_try_limit > 0:
                task = res['task']
                task.refresh_cache = True
                # Should use task.grab_config or backup of grab_config
                task.setup_grab_config(res['grab_config_backup'])
                self.add_task(task)
Example #15
    def execute_task_handler(self, res, handler):
        """
        Apply `handler` function to the network result.

        If the network result is a failure, the task is submitted
        again to the network task queue.
        """

        try:
            handler_name = handler.__name__
        except AttributeError:
            handler_name = 'NONE'

        if (res['task'].get('raw') or (res['ok'] and self.valid_response_code(
                res['grab'].response.code, res['task']))):
            try:
                with self.save_timer('response_handler'):
                    with self.save_timer('response_handler.%s' % handler_name):
                        result = handler(res['grab'], res['task'])
                        if result is None:
                            pass
                        else:
                            for item in result:
                                self.process_handler_result(item, res['task'])
            except NoDataHandler as ex:
                raise
            except Exception as ex:
                self.process_handler_error(handler_name, ex, res['task'])
            else:
                self.inc_count('task-%s-ok' % res['task'].name)
        else:
            # Log the error
            if res['ok']:
                msg = res['emsg'] = 'HTTP %s' % res['grab'].response.code
            else:
                msg = res['emsg']

            self.inc_count('network-error-%s' %
                           make_str(res['emsg'][:20], errors='ignore'))
            logger.error(u'Network error: %s' %
                         make_unicode(msg, errors='ignore'))

            # Try to repeat the same network query
            if self.network_try_limit > 0:
                task = res['task']
                task.refresh_cache = True
                # Should use task.grab_config or backup of grab_config
                task.setup_grab_config(res['grab_config_backup'])
                self.add_task(task)
Example #16
    def request(self):
        req = self._request

        if req.proxy:
            if req.proxy_userpwd:
                headers = make_headers(proxy_basic_auth=req.proxy_userpwd)
            else:
                headers = None
            proxy_url = '%s://%s' % (req.proxy_type, req.proxy)
            try:
                pool = ProxyManager(proxy_url, proxy_headers=headers)
            except ProxySchemeUnknown:
                raise GrabMisuseError('Urllib3 transport does '
                                      'not support %s proxies' % req.proxy_type)
        else:
            pool = self.pool
        try:
            retry = Retry(redirect=False, connect=False, read=False)
            timeout = Timeout(connect=req.connect_timeout,
                              read=req.timeout)
            #req_headers = dict((make_unicode(x), make_unicode(y))
            #                   for (x, y) in req.headers.items())
            if six.PY3:
                req_url = make_unicode(req.url)
                req_method = make_unicode(req.method)
            else:
                req_url = make_str(req.url)
                req_method = req.method
            res = pool.urlopen(req_method,
                               req_url,
                               body=req.data, timeout=timeout,
                               retries=retry, headers=req.headers,
                               preload_content=False)
        except exceptions.ConnectTimeoutError as ex:
            raise error.GrabConnectionError('Could not create connection')
        except exceptions.ProtocolError as ex:
            raise error.GrabConnectionError(ex.args[1][0], ex.args[1][1])

        # WTF?
        self.request_head = ''
        self.request_body = ''
        self.request_log = ''

        self._response = res
Example #17
    def request(self):
        req = self._request

        if req.proxy:
            if req.proxy_userpwd:
                auth = "%s@" % req.proxy_userpwd
            else:
                auth = ""
            proxy_url = "%s://%s%s" % (req.proxy_type, auth, req.proxy)
            pool = ProxyManager(proxy_url)
        else:
            pool = self.pool
        try:
            retry = Retry(redirect=False, connect=False, read=False)
            timeout = Timeout(connect=req.connect_timeout, read=req.timeout)
            # req_headers = dict((make_unicode(x), make_unicode(y))
            #                   for (x, y) in req.headers.items())
            if six.PY3:
                req_url = make_unicode(req.url)
                req_method = make_unicode(req.method)
            else:
                req_url = make_str(req.url)
                req_method = req.method
            res = pool.urlopen(
                req_method,
                req_url,
                body=req.data,
                timeout=timeout,
                retries=retry,
                headers=req.headers,
                preload_content=False,
            )
        except exceptions.ConnectTimeoutError as ex:
            raise error.GrabConnectionError("Could not create connection")
        except exceptions.ProtocolError as ex:
            raise error.GrabConnectionError(ex.args[1][0], ex.args[1][1])

        # WTF?
        self.request_head = ""
        self.request_body = ""
        self.request_log = ""

        self._response = res
Example #18
    def request(self):
        req = self._request

        if req.proxy:
            if req.proxy_userpwd:
                auth = '%s@' % req.proxy_userpwd
            else:
                auth = ''
            proxy_url = '%s://%s%s' % (req.proxy_type, auth, req.proxy)
            pool = ProxyManager(proxy_url)
        else:
            pool = self.pool
        try:
            retry = Retry(redirect=False, connect=False, read=False)
            timeout = Timeout(connect=req.connect_timeout, read=req.timeout)
            #req_headers = dict((make_unicode(x), make_unicode(y))
            #                   for (x, y) in req.headers.items())
            if six.PY3:
                req_url = make_unicode(req.url)
                req_method = make_unicode(req.method)
            else:
                req_url = make_str(req.url)
                req_method = req.method
            res = pool.urlopen(req_method,
                               req_url,
                               body=req.data,
                               timeout=timeout,
                               retries=retry,
                               headers=req.headers,
                               preload_content=False)
        except exceptions.ConnectTimeoutError as ex:
            raise error.GrabConnectionError('Could not create connection')
        except exceptions.ProtocolError as ex:
            raise error.GrabConnectionError(ex.args[1][0], ex.args[1][1])

        # WTF?
        self.request_head = ''
        self.request_body = ''
        self.request_log = ''

        self._response = res
Example #19
    def process_config(self, grab):
        """
        Setup curl instance with values from ``self.config``.
        """

        # Copy some config for future usage
        self.config_nobody = grab.config['nobody']
        self.config_body_maxsize = grab.config['body_maxsize']

        try:
            request_url = normalize_url(grab.config['url'])
        except Exception as ex:
            raise error.GrabInvalidUrl(
                u'%s: %s' % (six.text_type(ex), grab.config['url']))

        # py3 hack
        if not six.PY3:
            request_url = make_str(request_url)

        self.curl.setopt(pycurl.URL, request_url)

        # Actually, FOLLOWLOCATION should always be 0
        # because redirect logic takes place in Grab.request method
        # BUT in Grab.Spider this method is not invoked
        # So, in Grab.Spider we still rely on Grab internal ability
        # to follow 30X Locations
        self.curl.setopt(pycurl.FOLLOWLOCATION,
                         1 if grab.config['follow_location'] else 0)
        self.curl.setopt(pycurl.MAXREDIRS, grab.config['redirect_limit'])
        self.curl.setopt(pycurl.CONNECTTIMEOUT, grab.config['connect_timeout'])
        self.curl.setopt(pycurl.TIMEOUT, grab.config['timeout'])
        #self.curl.setopt(pycurl.IPRESOLVE, pycurl.IPRESOLVE_V4)
        # self.curl.setopt(pycurl.DNS_CACHE_TIMEOUT, 0)
        if not grab.config['connection_reuse']:
            self.curl.setopt(pycurl.FRESH_CONNECT, 1)
            self.curl.setopt(pycurl.FORBID_REUSE, 1)

        self.curl.setopt(pycurl.NOSIGNAL, 1)
        self.curl.setopt(pycurl.HEADERFUNCTION, self.header_processor)

        if grab.config['body_inmemory']:
            self.curl.setopt(pycurl.WRITEFUNCTION, self.body_processor)
        else:
            if not grab.config['body_storage_dir']:
                raise error.GrabMisuseError(
                    'Option body_storage_dir is not defined')
            self.setup_body_file(
                grab.config['body_storage_dir'],
                grab.config['body_storage_filename'],
                create_dir=grab.config['body_storage_create_dir'])
            self.curl.setopt(pycurl.WRITEFUNCTION, self.body_processor)

        if grab.config['verbose_logging']:
            self.verbose_logging = True

        # User-Agent
        if grab.config['user_agent'] is None:
            if grab.config['user_agent_file'] is not None:
                with open(grab.config['user_agent_file']) as inf:
                    lines = inf.read().splitlines()
                grab.config['user_agent'] = random.choice(lines)
            else:
                grab.config['user_agent'] = generate_user_agent()

        # If value is None then set empty string
        # None is not acceptable because in such case
        # pycurl will set its default user agent "PycURL/x.xx.x"
        if not grab.config['user_agent']:
            grab.config['user_agent'] = ''

        self.curl.setopt(pycurl.USERAGENT, grab.config['user_agent'])

        if grab.config['debug']:
            self.curl.setopt(pycurl.VERBOSE, 1)
            self.curl.setopt(pycurl.DEBUGFUNCTION, self.debug_processor)

        # Ignore SSL errors
        self.curl.setopt(pycurl.SSL_VERIFYPEER, 0)
        self.curl.setopt(pycurl.SSL_VERIFYHOST, 0)

        # Disabled to avoid SSL3_READ_BYTES:sslv3 alert handshake failure error
        # self.curl.setopt(pycurl.SSLVERSION, pycurl.SSLVERSION_SSLv3)

        if grab.request_method in ('POST', 'PUT'):
            if (grab.config['post'] is None and
                    grab.config['multipart_post'] is None):
                raise GrabMisuseError('Neither `post` nor `multipart_post`'
                                      ' option was specified for the %s'
                                      ' request' % grab.request_method)

        if grab.request_method == 'POST':
            self.curl.setopt(pycurl.POST, 1)
            if grab.config['multipart_post']:
                if isinstance(grab.config['multipart_post'], six.string_types):
                    raise error.GrabMisuseError(
                        'multipart_post option could not be a string')
                post_items = normalize_http_values(
                    grab.config['multipart_post'],
                    charset=grab.config['charset'],
                    ignore_classes=(UploadFile, UploadContent),
                )
                # py3 hack
                if six.PY3:
                    post_items = decode_pairs(post_items,
                                              grab.config['charset'])
                self.curl.setopt(pycurl.HTTPPOST,
                                 process_upload_items(post_items))
            elif grab.config['post']:
                post_data = normalize_post_data(grab.config['post'],
                                                grab.config['charset'])
                # py3 hack
                # if six.PY3:
                #    post_data = smart_unicode(post_data,
                #                              grab.config['charset'])
                self.curl.setopt(pycurl.POSTFIELDS, post_data)
            else:
                self.curl.setopt(pycurl.POSTFIELDS, '')
        elif grab.request_method == 'PUT':
            data = grab.config['post']
            if isinstance(data, six.text_type):
                # py3 hack
                # if six.PY3:
                #    data = data.encode('utf-8')
                # else:
                raise error.GrabMisuseError(
                    'Value of post option could be only '
                    'byte string if PUT method is used')
            self.curl.setopt(pycurl.UPLOAD, 1)
            self.curl.setopt(pycurl.CUSTOMREQUEST, 'PUT')
            self.curl.setopt(pycurl.READFUNCTION, StringIO(data).read)
            self.curl.setopt(pycurl.INFILESIZE, len(data))
        elif grab.request_method == 'PATCH':
            data = grab.config['post']
            if isinstance(data, six.text_type):
                raise error.GrabMisuseError(
                    'Value of post option could be only byte '
                    'string if PATCH method is used')
            self.curl.setopt(pycurl.UPLOAD, 1)
            self.curl.setopt(pycurl.CUSTOMREQUEST, 'PATCH')
            self.curl.setopt(pycurl.READFUNCTION, StringIO(data).read)
            self.curl.setopt(pycurl.INFILESIZE, len(data))
        elif grab.request_method == 'DELETE':
            self.curl.setopt(pycurl.CUSTOMREQUEST, 'DELETE')
        elif grab.request_method == 'HEAD':
            self.curl.setopt(pycurl.NOBODY, 1)
        elif grab.request_method == 'UPLOAD':
            self.curl.setopt(pycurl.UPLOAD, 1)
        elif grab.request_method == 'GET':
            self.curl.setopt(pycurl.HTTPGET, 1)
        elif grab.request_method == 'OPTIONS':
            data = grab.config['post']
            if data is not None:
                if isinstance(data, six.text_type):
                    raise error.GrabMisuseError(
                        'Value of post option could be only byte '
                        'string if OPTIONS method is used')
                self.curl.setopt(pycurl.UPLOAD, 1)
                self.curl.setopt(pycurl.READFUNCTION, StringIO(data).read)
                self.curl.setopt(pycurl.INFILESIZE, len(data))
            self.curl.setopt(pycurl.CUSTOMREQUEST, 'OPTIONS')
        else:
            raise error.GrabMisuseError('Invalid method: %s' %
                                        grab.request_method)

        headers = grab.config['common_headers']
        if grab.config['headers']:
            headers.update(grab.config['headers'])
        # This is required to avoid some problems
        headers.update({'Expect': ''})
        header_tuples = [str('%s: %s' % x) for x in headers.items()]
        self.curl.setopt(pycurl.HTTPHEADER, header_tuples)

        self.process_cookie_options(grab, request_url)

        if grab.config['referer']:
            self.curl.setopt(pycurl.REFERER, str(grab.config['referer']))

        if grab.config['proxy']:
            self.curl.setopt(pycurl.PROXY, str(grab.config['proxy']))
        else:
            self.curl.setopt(pycurl.PROXY, '')

        if grab.config['proxy_userpwd']:
            self.curl.setopt(pycurl.PROXYUSERPWD,
                             str(grab.config['proxy_userpwd']))

        if grab.config['proxy_type']:
            key = 'PROXYTYPE_%s' % grab.config['proxy_type'].upper()
            self.curl.setopt(pycurl.PROXYTYPE, getattr(pycurl, key))

        if grab.config['encoding']:
            if ('gzip' in grab.config['encoding'] and
                    'zlib' not in pycurl.version):
                raise error.GrabMisuseError(
                    'You can not use gzip encoding because '
                    'pycurl was built without zlib support')
            self.curl.setopt(pycurl.ENCODING, grab.config['encoding'])

        if grab.config['userpwd']:
            self.curl.setopt(pycurl.USERPWD, str(grab.config['userpwd']))

        if grab.config.get('interface') is not None:
            self.curl.setopt(pycurl.INTERFACE, grab.config['interface'])

        if grab.config.get('reject_file_size') is not None:
            self.curl.setopt(pycurl.MAXFILESIZE,
                             grab.config['reject_file_size'])
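As a reminder of the underlying API, here is the smallest pycurl request using the same setopt style as process_config above (Curl, setopt, perform and RESPONSE_CODE are standard pycurl; the URL is a placeholder):

import pycurl
from io import BytesIO

buf = BytesIO()
curl = pycurl.Curl()
curl.setopt(pycurl.URL, 'http://example.com/')
curl.setopt(pycurl.WRITEFUNCTION, buf.write)  # body chunks go into the buffer
curl.setopt(pycurl.FOLLOWLOCATION, 0)         # redirects handled by the caller
curl.perform()
status = curl.getinfo(pycurl.RESPONSE_CODE)
curl.close()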
Example #20
    def process_config(self, grab):
        req = Request(data=None)

        try:
            request_url = normalize_url(grab.config['url'])
        except Exception as ex:
            raise error.GrabInvalidUrl(
                u'%s: %s' %
                (six.text_type(ex),
                 make_unicode(grab.config['url'], errors='ignore')))
        req.url = request_url

        method = grab.detect_request_method()
        req.method = make_str(method)

        req.config_body_maxsize = grab.config['body_maxsize']
        req.config_nobody = grab.config['nobody']

        req.timeout = grab.config['timeout']
        req.connect_timeout = grab.config['connect_timeout']

        extra_headers = {}

        # Body processing
        if grab.config['body_inmemory']:
            pass
        else:
            if not grab.config['body_storage_dir']:
                raise GrabMisuseError('Option body_storage_dir is not defined')
            file_, path_ = self.setup_body_file(
                grab.config['body_storage_dir'],
                grab.config['body_storage_filename'],
                create_dir=grab.config['body_storage_create_dir'])
            req.response_file = file_
            req.response_path = path_

        if grab.config['multipart_post'] is not None:
            post_data = grab.config['multipart_post']
            if isinstance(post_data, six.binary_type):
                pass
            elif isinstance(post_data, six.text_type):
                raise GrabMisuseError('Option multipart_post data'
                                      ' does not accept unicode.')
            else:
                post_items = normalize_http_values(
                    grab.config['multipart_post'],
                    charset=grab.config['charset'],
                    ignore_classes=(UploadFile, UploadContent),
                )
                post_items = decode_pairs(post_items, grab.config['charset'])
                post_items = process_upload_items(post_items)
                post_data, content_type = encode_multipart_formdata(post_items)
                extra_headers['Content-Type'] = content_type
            extra_headers['Content-Length'] = len(post_data)
            req.data = post_data
        elif grab.config['post'] is not None:
            post_data = normalize_post_data(grab.config['post'],
                                            grab.config['charset'])
            # py3 hack
            # if six.PY3:
            #    post_data = smart_unicode(post_data,
            #                              grab.config['charset'])
            extra_headers['Content-Length'] = len(post_data)
            req.data = post_data

        if method in ('POST', 'PUT'):
            if (grab.config['post'] is None
                    and grab.config['multipart_post'] is None):
                raise GrabMisuseError('Neither `post` nor `multipart_post`'
                                      ' option was specified for the %s'
                                      ' request' % method)
        # Proxy
        if grab.config['proxy']:
            req.proxy = grab.config['proxy']

        if grab.config['proxy_userpwd']:
            req.proxy_userpwd = grab.config['proxy_userpwd']

        if grab.config['proxy_type']:
            req.proxy_type = grab.config['proxy_type']
        else:
            req.proxy_type = 'http'

        # User-Agent
        if grab.config['user_agent'] is None:
            if grab.config['user_agent_file'] is not None:
                with open(grab.config['user_agent_file']) as inf:
                    lines = inf.read().splitlines()
                grab.config['user_agent'] = random.choice(lines)
            else:
                grab.config['user_agent'] = generate_user_agent()

        extra_headers['User-Agent'] = grab.config['user_agent']

        # Headers
        headers = extra_headers
        headers.update(grab.config['common_headers'])

        if grab.config['headers']:
            headers.update(grab.config['headers'])
        req.headers = headers

        # Cookies
        self.process_cookie_options(grab, req)

        self._request = req
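A sketch of what the multipart branch produces: urllib3's encode_multipart_formdata (a real urllib3 helper) returns the encoded body together with the Content-Type header value, which the code above copies into extra_headers (field name and value here are placeholders):

from urllib3.filepost import encode_multipart_formdata

body, content_type = encode_multipart_formdata([('name', 'value')])
headers = {'Content-Type': content_type,
           'Content-Length': str(len(body))}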
Example #21
    def request(self):
        req = self._request

        if req.proxy:
            if req.proxy_userpwd:
                headers = make_headers(proxy_basic_auth=req.proxy_userpwd)
            else:
                headers = None
            proxy_url = '%s://%s' % (req.proxy_type, req.proxy)
            if req.proxy_type == 'socks5':
                pool = SOCKSProxyManager(
                    proxy_url,
                    cert_reqs='CERT_REQUIRED',
                    ca_certs=certifi.where()
                ) # , proxy_headers=headers)
            else:
                pool = ProxyManager(
                    proxy_url, proxy_headers=headers,
                    cert_reqs='CERT_REQUIRED',
                    ca_certs=certifi.where()
                )
        else:
            pool = self.pool
        try:
            # Retries can be disabled by passing False:
            # http://urllib3.readthedocs.io/en/latest/reference/urllib3.util.html#module-urllib3.util.retry
            # Do not use False because of warning:
            # Converted retries value: False -> Retry(total=False,
            # connect=None, read=None, redirect=0, status=None)
            retry = Retry(
                total=False,
                connect=False,
                read=False,
                redirect=0,
                status=None,
            )
            # The read timeout is not total response time timeout
            # It is the timeout on read of next data chunk from the server
            # Total response timeout is handled by Grab
            timeout = Timeout(connect=req.connect_timeout,
                              read=req.timeout)
            #req_headers = dict((make_unicode(x), make_unicode(y))
            #                   for (x, y) in req.headers.items())
            if six.PY3:
                req_url = make_unicode(req.url)
                req_method = make_unicode(req.method)
            else:
                req_url = make_str(req.url)
                req_method = req.method
            req.op_started = time.time()
            try:
                res = pool.urlopen(req_method,
                                   req_url,
                                   body=req.data, timeout=timeout,
                                   retries=retry,
                                   headers=req.headers,
                                   preload_content=False)
            except UnicodeError as ex:
                raise error.GrabConnectionError('GrabInvalidUrl', ex)
        except exceptions.ReadTimeoutError as ex:
            raise error.GrabTimeoutError('ReadTimeoutError', ex)
        except exceptions.ConnectTimeoutError as ex:
            raise error.GrabConnectionError('ConnectTimeoutError', ex)
        except exceptions.ProtocolError as ex:
            # TODO:
            # the code
            # raise error.GrabConnectionError(ex.args[1][0], ex.args[1][1])
            # fails
            # with error TypeError: 'OSError' object is not subscriptable
            raise error.GrabConnectionError('ProtocolError', ex)
        except exceptions.SSLError as ex:
            raise error.GrabConnectionError('SSLError', ex)

        # WTF?
        self.request_head = b''
        self.request_body = b''
        self.request_log = b''

        self._response = res
Example #22
    def prepare_response(self, grab):
        try:
            #if self.body_file:
            #    self.body_file.close()
            response = Response()

            head = ''
            for key, val in self._response.getheaders().items():
                head += '%s: %s\r\n' % (key, val)
            head += '\r\n'
            response.head = make_str(head, encoding='latin', errors='ignore')

            #if self.body_path:
            #    response.body_path = self.body_path
            #else:
            #    response.body = b''.join(self.response_body_chunks)
            def read_with_timeout():
                if self._request.config_nobody:
                    return b''
                maxsize = self._request.config_body_maxsize
                chunks = []
                default_chunk_size = 10000
                if maxsize:
                    chunk_size = min(default_chunk_size, maxsize + 1)
                else:
                    chunk_size = default_chunk_size
                total_size = 0
                while True:
                    chunk = self._response.read(chunk_size)
                    if chunk:
                        total_size += len(chunk)
                        chunks.append(chunk)
                        if maxsize and total_size > maxsize:
                            logger.debug(
                                'Response body max size limit reached: %s' %
                                maxsize)
                    else:
                        break
                    if self._request.timeout:
                        if (time.time() - self._request.op_started
                                > self._request.timeout):
                            raise GrabTimeoutError
                data = b''.join(chunks)
                if maxsize:
                    data = data[:maxsize]
                return data

            if self._request._response_path:
                response.body_path = self._request._response_path
                # FIXME: Quick dirty hack: actually, the response is
                # fully read into memory
                self._request._response_file.write(read_with_timeout())
                self._request._response_file.close()
            else:
                response.body = read_with_timeout()

            # Clear memory
            #self.response_header_chunks = []

            response.code = self._response.status
            #response.total_time = self.curl.getinfo(pycurl.TOTAL_TIME)
            #response.connect_time = self.curl.getinfo(pycurl.CONNECT_TIME)
            #response.name_lookup_time = self.curl.getinfo(pycurl.NAMELOOKUP_TIME)
            #response.download_size = self.curl.getinfo(pycurl.SIZE_DOWNLOAD)
            #response.upload_size = self.curl.getinfo(pycurl.SIZE_UPLOAD)
            #response.download_speed = self.curl.getinfo(pycurl.SPEED_DOWNLOAD)
            #response.remote_ip = self.curl.getinfo(pycurl.PRIMARY_IP)

            response.url = (self._response.get_redirect_location()
                            or self._request.url)

            import email.message
            hdr = email.message.Message()
            for key, val in self._response.getheaders().items():
                hdr[key] = val
            response.parse(charset=grab.config['document_charset'],
                           headers=hdr)

            jar = self.extract_cookiejar()  #self._response, self._request)
            response.cookies = CookieManager(jar)

            # We do not need the cookies stored in the
            # curl instance anymore, so drop them
            #self.curl.setopt(pycurl.COOKIELIST, 'ALL')
            return response
        finally:
            self._response.release_conn()
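The read_with_timeout() helper reduced to its core idea, as a standalone sketch: read in fixed chunks, enforce a size cap, and honour a total deadline. Note the original logs and keeps reading past maxsize before truncating; this version simply stops early:

import time

def read_limited(fileobj, maxsize=None, deadline=None, chunk_size=10000):
    chunks, total = [], 0
    while True:
        chunk = fileobj.read(chunk_size)
        if not chunk:
            break
        chunks.append(chunk)
        total += len(chunk)
        if maxsize and total > maxsize:
            break  # size cap reached
        if deadline and time.time() > deadline:
            raise TimeoutError('total response deadline exceeded')
    data = b''.join(chunks)
    return data[:maxsize] if maxsize else data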
Example #23
    def process_config(self, grab):
        """
        Setup curl instance with values from ``self.config``.
        """

        # Copy some config for future usage
        self.config_nobody = grab.config['nobody']
        self.config_body_maxsize = grab.config['body_maxsize']

        try:
            request_url = normalize_url(grab.config['url'])
        except Exception as ex:
            raise error.GrabInvalidUrl(u'%s: %s' %
                                       (six.text_type(ex), grab.config['url']))

        # py3 hack
        if not six.PY3:
            request_url = make_str(request_url)

        self.curl.setopt(pycurl.URL, request_url)

        # 30* redirects are handled by Grab
        self.curl.setopt(pycurl.FOLLOWLOCATION, 0)
        self.curl.setopt(pycurl.MAXREDIRS, grab.config['redirect_limit'])
        self.curl.setopt(pycurl.CONNECTTIMEOUT, grab.config['connect_timeout'])
        self.curl.setopt(pycurl.TIMEOUT, grab.config['timeout'])
        #self.curl.setopt(pycurl.IPRESOLVE, pycurl.IPRESOLVE_V4)
        # self.curl.setopt(pycurl.DNS_CACHE_TIMEOUT, 0)
        if not grab.config['connection_reuse']:
            self.curl.setopt(pycurl.FRESH_CONNECT, 1)
            self.curl.setopt(pycurl.FORBID_REUSE, 1)

        self.curl.setopt(pycurl.NOSIGNAL, 1)
        self.curl.setopt(pycurl.HEADERFUNCTION, self.header_processor)

        if grab.config['body_inmemory']:
            self.curl.setopt(pycurl.WRITEFUNCTION, self.body_processor)
        else:
            if not grab.config['body_storage_dir']:
                raise error.GrabMisuseError(
                    'Option body_storage_dir is not defined')
            self.setup_body_file(
                grab.config['body_storage_dir'],
                grab.config['body_storage_filename'],
                create_dir=grab.config['body_storage_create_dir'])
            self.curl.setopt(pycurl.WRITEFUNCTION, self.body_processor)

        if grab.config['verbose_logging']:
            self.verbose_logging = True

        # User-Agent
        if grab.config['user_agent'] is None:
            if grab.config['user_agent_file'] is not None:
                with open(grab.config['user_agent_file']) as inf:
                    lines = inf.read().splitlines()
                grab.config['user_agent'] = random.choice(lines)
            else:
                grab.config['user_agent'] = generate_user_agent()

        # If value is None then set empty string
        # None is not acceptable because in such case
        # pycurl will set its default user agent "PycURL/x.xx.x"
        if not grab.config['user_agent']:
            grab.config['user_agent'] = ''

        self.curl.setopt(pycurl.USERAGENT, grab.config['user_agent'])

        if grab.config['debug']:
            self.curl.setopt(pycurl.VERBOSE, 1)
            self.curl.setopt(pycurl.DEBUGFUNCTION, self.debug_processor)

        # Ignore SSL errors
        self.curl.setopt(pycurl.SSL_VERIFYPEER, 0)
        self.curl.setopt(pycurl.SSL_VERIFYHOST, 0)

        # Disabled to avoid SSL3_READ_BYTES:sslv3 alert handshake failure error
        # self.curl.setopt(pycurl.SSLVERSION, pycurl.SSLVERSION_SSLv3)

        if grab.request_method in ('POST', 'PUT'):
            if (grab.config['post'] is None
                    and grab.config['multipart_post'] is None):
                raise GrabMisuseError('Neither `post` nor `multipart_post`'
                                      ' option was specified for the %s'
                                      ' request' % grab.request_method)

        if grab.request_method == 'POST':
            self.curl.setopt(pycurl.POST, 1)
            if grab.config['multipart_post']:
                if isinstance(grab.config['multipart_post'], six.string_types):
                    raise error.GrabMisuseError(
                        'multipart_post option could not be a string')
                post_items = normalize_http_values(
                    grab.config['multipart_post'],
                    charset=grab.config['charset'],
                    ignore_classes=(UploadFile, UploadContent),
                )
                # py3 hack
                #if six.PY3:
                #    post_items = decode_pairs(post_items,
                #                              grab.config['charset'])
                self.curl.setopt(pycurl.HTTPPOST,
                                 process_upload_items(post_items))
            elif grab.config['post']:
                post_data = normalize_post_data(grab.config['post'],
                                                grab.config['charset'])
                # py3 hack
                # if six.PY3:
                #    post_data = smart_unicode(post_data,
                #                              grab.config['charset'])
                self.curl.setopt(pycurl.POSTFIELDS, post_data)
            else:
                self.curl.setopt(pycurl.POSTFIELDS, '')
        elif grab.request_method == 'PUT':
            data = grab.config['post']
            if isinstance(data, six.text_type):
                # py3 hack
                # if six.PY3:
                #    data = data.encode('utf-8')
                # else:
                raise error.GrabMisuseError(
                    'Value of post option could be only '
                    'byte string if PUT method is used')
            self.curl.setopt(pycurl.UPLOAD, 1)
            self.curl.setopt(pycurl.CUSTOMREQUEST, 'PUT')
            self.curl.setopt(pycurl.READFUNCTION, StringIO(data).read)
            self.curl.setopt(pycurl.INFILESIZE, len(data))
        elif grab.request_method == 'PATCH':
            data = grab.config['post']
            if isinstance(data, six.text_type):
                raise error.GrabMisuseError(
                    'Value of post option could be only byte '
                    'string if PATCH method is used')
            self.curl.setopt(pycurl.UPLOAD, 1)
            self.curl.setopt(pycurl.CUSTOMREQUEST, 'PATCH')
            self.curl.setopt(pycurl.READFUNCTION, StringIO(data).read)
            self.curl.setopt(pycurl.INFILESIZE, len(data))
        elif grab.request_method == 'DELETE':
            self.curl.setopt(pycurl.CUSTOMREQUEST, 'DELETE')
        elif grab.request_method == 'HEAD':
            self.curl.setopt(pycurl.NOBODY, 1)
        elif grab.request_method == 'UPLOAD':
            self.curl.setopt(pycurl.UPLOAD, 1)
        elif grab.request_method == 'GET':
            self.curl.setopt(pycurl.HTTPGET, 1)
        elif grab.request_method == 'OPTIONS':
            data = grab.config['post']
            if data is not None:
                if isinstance(data, six.text_type):
                    raise error.GrabMisuseError(
                        'Value of post option could be only byte '
                        'string if OPTIONS method is used')
                self.curl.setopt(pycurl.UPLOAD, 1)
                self.curl.setopt(pycurl.READFUNCTION, StringIO(data).read)
                self.curl.setopt(pycurl.INFILESIZE, len(data))
            self.curl.setopt(pycurl.CUSTOMREQUEST, 'OPTIONS')
        else:
            raise error.GrabMisuseError('Invalid method: %s' %
                                        grab.request_method)

        headers = grab.config['common_headers']
        if grab.config['headers']:
            headers.update(grab.config['headers'])
        # This is required to avoid some problems
        headers.update({'Expect': ''})
        header_tuples = [str('%s: %s' % x) for x in headers.items()]
        self.curl.setopt(pycurl.HTTPHEADER, header_tuples)

        self.process_cookie_options(grab, request_url)

        if grab.config['referer']:
            self.curl.setopt(pycurl.REFERER, str(grab.config['referer']))

        if grab.config['proxy']:
            self.curl.setopt(pycurl.PROXY, str(grab.config['proxy']))
        else:
            self.curl.setopt(pycurl.PROXY, '')

        if grab.config['proxy_userpwd']:
            self.curl.setopt(pycurl.PROXYUSERPWD,
                             str(grab.config['proxy_userpwd']))

        if grab.config['proxy_type']:
            key = 'PROXYTYPE_%s' % grab.config['proxy_type'].upper()
            self.curl.setopt(pycurl.PROXYTYPE, getattr(pycurl, key))

        if grab.config['encoding']:
            if ('gzip' in grab.config['encoding']
                    and 'zlib' not in pycurl.version):
                raise error.GrabMisuseError(
                    'You can not use gzip encoding because '
                    'pycurl was built without zlib support')
            self.curl.setopt(pycurl.ENCODING, grab.config['encoding'])

        if grab.config['userpwd']:
            self.curl.setopt(pycurl.USERPWD, str(grab.config['userpwd']))

        if grab.config.get('interface') is not None:
            self.curl.setopt(pycurl.INTERFACE, grab.config['interface'])

        if grab.config.get('reject_file_size') is not None:
            self.curl.setopt(pycurl.MAXFILESIZE,
                             grab.config['reject_file_size'])
Example #24
 def test_make_str_from_str(self):
     self.assertEqual(
         make_str(u'фыва'.encode('utf-8')),
         u'фыва'.encode('utf-8'),
     )
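The test pins down the bytes branch of make_str: bytes pass through untouched. Together with the text branch, the assumed contract looks like this (a stand-in, not the library source):

def make_str(value, encoding='utf-8', errors='strict'):
    # hypothetical stand-in: encode text, pass bytes through unchanged
    return value.encode(encoding, errors) if isinstance(value, str) else value

assert make_str(u'фыва') == u'фыва'.encode('utf-8')
assert make_str(u'фыва'.encode('utf-8')) == u'фыва'.encode('utf-8')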
Example #25
 def build_hash(self, url):
     with self.spider.stat.log_time('cache.read.build_hash'):
         utf_url = make_str(url)
         return sha1(utf_url).hexdigest()
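The equivalent standalone expression: hashlib's sha1 requires bytes, which is why the URL goes through make_str first (the URL is a placeholder):

from hashlib import sha1

sha1(u'http://example.com/'.encode('utf-8')).hexdigest()  # 40-char hex digest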
Example #26
def normalize_unicode(value, charset='utf-8'):
    return make_str(value, encoding=charset, errors='ignore')
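Note that despite its name this returns bytes: make_str encodes text to the given charset, and errors='ignore' silently drops characters the charset cannot represent:

# normalize_unicode(u'фуу', charset='ascii')  ->  b''   (all chars dropped)
# normalize_unicode(u'фуу')                   ->  b'\xd1\x84\xd1\x83\xd1\x83'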
Example #27
 def get_random_filename(self):
     return md5(make_str(str(time.time()))).hexdigest()[:10]
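The same thing without the helper: md5 also needs bytes, hence the make_str call around str(time.time()):

import time
from hashlib import md5

md5(str(time.time()).encode('utf-8')).hexdigest()[:10]  # 10-char hex name

Since the name is time-based, two calls in the same clock tick can collide; uuid.uuid4().hex[:10] would be a safer choice.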
Example #28
def main(spider_name, thread_number=None, slave=False,
         settings_module='settings', network_logs=False,
         disable_proxy=False, ignore_lock=False,
         disable_report=False,
         disable_default_logs=False,
         *args, **kwargs):
    if disable_default_logs:
        default_logging(propagate_network_logger=network_logs,
                        grab_log=None, network_log=None)
    else:
        default_logging(propagate_network_logger=network_logs)

    root_config = build_root_config(settings_module)
    spider_class = load_spider_class(root_config, spider_name)
    spider_config = build_spider_config(spider_class, root_config)

    spider_args = None
    if hasattr(spider_class, 'setup_arg_parser'):
        parser = ArgumentParser()
        spider_class.setup_arg_parser(parser)
        opts, trash = parser.parse_known_args()
        spider_args = vars(opts)

    bot = spider_class(
        thread_number=thread_number,
        slave=slave,
        config=spider_config,
        network_try_limit=None,
        task_try_limit=None,
        args=spider_args,
    )
    opt_queue = spider_config.get('queue')
    if opt_queue:
        bot.setup_queue(**opt_queue)

    opt_cache = spider_config.get('cache')
    if opt_cache:
        bot.setup_cache(**opt_cache)

    opt_proxy_list = spider_config.get('proxy_list')
    if opt_proxy_list:
        if disable_proxy:
            logger.debug('Proxy servers disabled via command line')
        else:
            bot.load_proxylist(**opt_proxy_list)

    opt_ifaces = spider_config.get('command_interfaces')
    if opt_ifaces:
        for iface_config in opt_ifaces:
            bot.controller.add_interface(**iface_config)

    try:
        bot.run()
    except KeyboardInterrupt:
        pass

    stats = bot.render_stats(timing=spider_config.get('display_timing'))
    stats_with_time = bot.render_stats(timing=True)

    if spider_config.get('display_stats'):
        logger.debug(stats)

    pid = os.getpid()
    logger.debug('Spider pid is %d', pid)

    if not disable_report:
        if spider_config.get('save_report'):
            for subdir in (str(pid), 'last'):
                dir_ = 'var/%s' % subdir
                if not os.path.exists(dir_):
                    os.makedirs(dir_)
                else:
                    clear_directory(dir_)
                for key, lst in bot.items.items():
                    fname_key = key.replace('-', '_')
                    bot.save_list(key, '%s/%s.txt' % (dir_, fname_key))
                with open('%s/report.txt' % dir_, 'wb') as out:
                    out.write(make_str(stats_with_time))

    return {
        'spider_stats': bot.render_stats(timing=False),
        'spider_timing': bot.render_timing(),
    }
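
The spider-specific argument hook above works because parse_known_args() tolerates options meant for other layers instead of erroring out. A sketch with a hypothetical spider class:

from argparse import ArgumentParser

class ExampleSpider(object):
    # Hypothetical spider exposing one custom CLI option.
    @classmethod
    def setup_arg_parser(cls, parser):
        parser.add_argument('--depth', type=int, default=2)

parser = ArgumentParser()
ExampleSpider.setup_arg_parser(parser)
opts, _ = parser.parse_known_args(['--depth', '5', '--thread-number', '10'])
print(vars(opts))  # {'depth': 5}; unrecognized options are left alone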
Exemple #31
0
 def build_hash(self, url):
     utf_url = make_str(url)
     return sha1(utf_url).hexdigest()
Exemple #32
0
    def process_config(self, grab):
        req = Request(data=None)

        try:
            request_url = normalize_url(grab.config["url"])
        except Exception as ex:
            raise error.GrabInvalidUrl(u"%s: %s" % (six.text_type(ex), grab.config["url"]))
        req.url = request_url

        method = grab.detect_request_method()
        req.method = make_str(method)

        req.body_maxsize = grab.config["body_maxsize"]
        if grab.config["nobody"]:
            req.body_maxsize = 0

        req.timeout = grab.config["timeout"]
        req.connect_timeout = grab.config["connect_timeout"]

        extra_headers = {}

        # Body processing
        if grab.config["body_inmemory"]:
            pass
        else:
            if not grab.config["body_storage_dir"]:
                raise GrabMisuseError("Option body_storage_dir is not defined")
            file_, path_ = self.setup_body_file(
                grab.config["body_storage_dir"],
                grab.config["body_storage_filename"],
                create_dir=grab.config["body_storage_create_dir"],
            )
            req._response_file = file_
            req._response_path = path_

        if grab.config["multipart_post"] is not None:
            post_data = grab.config["multipart_post"]
            if isinstance(post_data, six.binary_type):
                pass
            elif isinstance(post_data, six.text_type):
                raise GrabMisuseError("Option multipart_post does not accept unicode data.")
            else:
                post_items = normalize_http_values(
                    grab.config["multipart_post"],
                    charset=grab.config["charset"],
                    ignore_classes=(UploadFile, UploadContent),
                )
                # if six.PY3:
                post_items = decode_pairs(post_items, grab.config["charset"])
                post_items = process_upload_items(post_items)
                post_data, content_type = encode_multipart_formdata(post_items)
                extra_headers["Content-Type"] = content_type
            extra_headers["Content-Length"] = len(post_data)
            req.data = post_data
        elif grab.config["post"] is not None:
            post_data = normalize_post_data(grab.config["post"], grab.config["charset"])
            # py3 hack
            # if six.PY3:
            #    post_data = smart_unicode(post_data,
            #                              grab.config['charset'])
            extra_headers["Content-Length"] = len(post_data)
            req.data = post_data

        if method in ("POST", "PUT"):
            if grab.config["post"] is None and grab.config["multipart_post"] is None:
                raise GrabMisuseError(
                    "Neither `post` nor `multipart_post` option"
                    " was specified for the %s request" % method
                )
        # Proxy
        if grab.config["proxy"]:
            req.proxy = grab.config["proxy"]

        if grab.config["proxy_userpwd"]:
            req.proxy_userpwd = grab.config["proxy_userpwd"]

        if grab.config["proxy_type"]:
            req.proxy_type = grab.config["proxy_type"]
        else:
            req.proxy_type = "http"

        # User-Agent
        if grab.config["user_agent"] is None:
            if grab.config["user_agent_file"] is not None:
                with open(grab.config["user_agent_file"]) as inf:
                    lines = inf.read().splitlines()
                grab.config["user_agent"] = random.choice(lines)
            else:
                grab.config["user_agent"] = generate_user_agent()

        extra_headers["User-Agent"] = grab.config["user_agent"]

        # Headers
        headers = extra_headers
        headers.update(grab.config["common_headers"])

        if grab.config["headers"]:
            headers.update(grab.config["headers"])
        req.headers = headers

        # Cookies
        self.process_cookie_options(grab, req)

        self._request = req
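
Both body branches above finish by deriving Content-Length from the encoded bytes. Assuming encode_multipart_formdata here is urllib3's, the multipart branch reduces to:

from urllib3.filepost import encode_multipart_formdata

body, content_type = encode_multipart_formdata({'name': 'value'})
headers = {
    'Content-Type': content_type,      # includes the generated boundary
    'Content-Length': str(len(body)),  # length of the encoded body
}
print(headers['Content-Type'])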
Exemple #33
0
    def request(self):
        req = self._request

        if req.proxy:
            if req.proxy_userpwd:
                headers = make_headers(proxy_basic_auth=req.proxy_userpwd)
            else:
                headers = None
            proxy_url = '%s://%s' % (req.proxy_type, req.proxy)
            if req.proxy_type == 'socks5':
                pool = SOCKSProxyManager(
                    proxy_url,
                    cert_reqs='CERT_REQUIRED',
                    ca_certs=certifi.where())  # , proxy_headers=headers)
            else:
                pool = ProxyManager(proxy_url,
                                    proxy_headers=headers,
                                    cert_reqs='CERT_REQUIRED',
                                    ca_certs=certifi.where())
        else:
            pool = self.pool
        with self.wrap_transport_error():
            # Retries can be disabled by passing False:
            # http://urllib3.readthedocs.io/en/latest/reference/urllib3.util.html#module-urllib3.util.retry
            # Do not use False because of warning:
            # Converted retries value: False -> Retry(total=False,
            # connect=None, read=None, redirect=0, status=None)
            retry = Retry(
                total=False,
                connect=False,
                read=False,
                redirect=0,
                status=None,
            )
            # The read timeout is not total response time timeout
            # It is the timeout on read of next data chunk from the server
            # Total response timeout is handled by Grab
            timeout = Timeout(connect=req.connect_timeout, read=req.timeout)
            #req_headers = dict((make_unicode(x), make_unicode(y))
            #                   for (x, y) in req.headers.items())
            if six.PY3:
                req_url = make_unicode(req.url)
                req_method = make_unicode(req.method)
            else:
                req_url = make_str(req.url)
                req_method = req.method
            req.op_started = time.time()
            try:
                res = pool.urlopen(req_method,
                                   req_url,
                                   body=req.data,
                                   timeout=timeout,
                                   retries=retry,
                                   headers=req.headers,
                                   preload_content=False)
            except UnicodeError as ex:
                raise error.GrabConnectionError('GrabInvalidUrl', ex)
        #except exceptions.ReadTimeoutError as ex:
        #    raise error.GrabTimeoutError('ReadTimeoutError', ex)
        #except exceptions.ConnectTimeoutError as ex:
        #    raise error.GrabConnectionError('ConnectTimeoutError', ex)
        #except exceptions.ProtocolError as ex:
        #    # TODO:
        #    # the code
        #    # raise error.GrabConnectionError(ex.args[1][0], ex.args[1][1])
        #    # fails
        #    # with error TypeError: 'OSError' object is not subscriptable
        #    raise error.GrabConnectionError('ProtocolError', ex)
        #except exceptions.SSLError as ex:
        #    raise error.GrabConnectionError('SSLError', ex)

        # The urllib3 transport has no access to the raw outgoing request,
        # so these debug attributes stay empty.
        self.request_head = b''
        self.request_body = b''
        self.request_log = b''

        self._response = res
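
A minimal sketch of the same no-retry, per-chunk-timeout setup against urllib3's public API (the URL is a placeholder):

import urllib3

pool = urllib3.PoolManager()
retry = urllib3.Retry(total=False, connect=False, read=False,
                      redirect=0, status=None)  # disable all retries
timeout = urllib3.Timeout(connect=3, read=10)   # read is per chunk, not total
res = pool.urlopen('GET', 'http://example.com/', retries=retry,
                   timeout=timeout, preload_content=False)
print(res.status)
res.release_conn()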
Exemple #34
0
    def process_config(self, grab):
        """
        Setup curl instance with values from ``self.config``.
        """

        # Copy some config for future usage
        self.config_nobody = grab.config["nobody"]
        self.config_body_maxsize = grab.config["body_maxsize"]

        try:
            request_url = normalize_url(grab.config["url"])
        except Exception as ex:
            raise error.GrabInvalidUrl(u"%s: %s" % (six.text_type(ex), grab.config["url"]))

        # py3 hack
        if not six.PY3:
            request_url = make_str(request_url)

        self.curl.setopt(pycurl.URL, request_url)

        # Actually, FOLLOWLOCATION should always be 0
        # because redirect logic takes place in Grab.request method
        # BUT in Grab.Spider this method is not invoked
        # So, in Grab.Spider we still rely on Grab internal ability
        # to follow 30X Locations
        self.curl.setopt(pycurl.FOLLOWLOCATION, 1 if grab.config["follow_location"] else 0)
        self.curl.setopt(pycurl.MAXREDIRS, grab.config["redirect_limit"])
        self.curl.setopt(pycurl.CONNECTTIMEOUT, grab.config["connect_timeout"])
        self.curl.setopt(pycurl.TIMEOUT, grab.config["timeout"])
        self.curl.setopt(pycurl.IPRESOLVE, pycurl.IPRESOLVE_V4)
        # self.curl.setopt(pycurl.DNS_CACHE_TIMEOUT, 0)
        if not grab.config["connection_reuse"]:
            self.curl.setopt(pycurl.FRESH_CONNECT, 1)
            self.curl.setopt(pycurl.FORBID_REUSE, 1)

        self.curl.setopt(pycurl.NOSIGNAL, 1)
        self.curl.setopt(pycurl.HEADERFUNCTION, self.header_processor)

        if grab.config["body_inmemory"]:
            self.curl.setopt(pycurl.WRITEFUNCTION, self.body_processor)
        else:
            if not grab.config["body_storage_dir"]:
                raise error.GrabMisuseError("Option body_storage_dir is not defined")
            self.setup_body_file(
                grab.config["body_storage_dir"],
                grab.config["body_storage_filename"],
                create_dir=grab.config["body_storage_create_dir"],
            )
            self.curl.setopt(pycurl.WRITEFUNCTION, self.body_processor)

        if grab.config["verbose_logging"]:
            self.verbose_logging = True

        # User-Agent
        if grab.config["user_agent"] is None:
            if grab.config["user_agent_file"] is not None:
                with open(grab.config["user_agent_file"]) as inf:
                    lines = inf.read().splitlines()
                grab.config["user_agent"] = random.choice(lines)
            else:
                grab.config["user_agent"] = generate_user_agent()

        # If value is None then set empty string
        # None is not acceptable because in such case
        # pycurl will set its default user agent "PycURL/x.xx.x"
        if not grab.config["user_agent"]:
            grab.config["user_agent"] = ""

        self.curl.setopt(pycurl.USERAGENT, grab.config["user_agent"])

        if grab.config["debug"]:
            self.curl.setopt(pycurl.VERBOSE, 1)
            self.curl.setopt(pycurl.DEBUGFUNCTION, self.debug_processor)

        # Ignore SSL errors
        self.curl.setopt(pycurl.SSL_VERIFYPEER, 0)
        self.curl.setopt(pycurl.SSL_VERIFYHOST, 0)

        # Disabled to avoid SSL3_READ_BYTES:sslv3 alert handshake failure error
        # self.curl.setopt(pycurl.SSLVERSION, pycurl.SSLVERSION_SSLv3)

        if grab.request_method in ("POST", "PUT"):
            if grab.config["post"] is None and grab.config["multipart_post"] is None:
                raise GrabMisuseError(
                    "Neither `post` nor `multipart_post`"
                    " option was specified for the %s"
                    " request" % grab.request_method
                )

        if grab.request_method == "POST":
            self.curl.setopt(pycurl.POST, 1)
            if grab.config["multipart_post"]:
                if isinstance(grab.config["multipart_post"], six.string_types):
                    raise error.GrabMisuseError("multipart_post option could not be a string")
                post_items = normalize_http_values(
                    grab.config["multipart_post"],
                    charset=grab.config["charset"],
                    ignore_classes=(UploadFile, UploadContent),
                )
                # py3 hack
                if six.PY3:
                    post_items = decode_pairs(post_items, grab.config["charset"])
                # import pdb; pdb.set_trace()
                self.curl.setopt(pycurl.HTTPPOST, process_upload_items(post_items))
            elif grab.config["post"]:
                post_data = normalize_post_data(grab.config["post"], grab.config["charset"])
                # py3 hack
                # if six.PY3:
                #    post_data = smart_unicode(post_data,
                #                              grab.config['charset'])
                self.curl.setopt(pycurl.POSTFIELDS, post_data)
            else:
                self.curl.setopt(pycurl.POSTFIELDS, "")
        elif grab.request_method == "PUT":
            data = grab.config["post"]
            if isinstance(data, six.text_type):
                # py3 hack
                # if six.PY3:
                #    data = data.encode('utf-8')
                # else:
                raise error.GrabMisuseError(
                    "Value of the post option must be a byte string"
                    " when the PUT method is used")
            self.curl.setopt(pycurl.UPLOAD, 1)
            self.curl.setopt(pycurl.CUSTOMREQUEST, "PUT")
            self.curl.setopt(pycurl.READFUNCTION, StringIO(data).read)
            self.curl.setopt(pycurl.INFILESIZE, len(data))
        elif grab.request_method == "PATCH":
            data = grab.config["post"]
            if isinstance(data, six.text_type):
                raise error.GrabMisuseError(
                    "Value of the post option must be a byte string"
                    " when the PATCH method is used")
            self.curl.setopt(pycurl.UPLOAD, 1)
            self.curl.setopt(pycurl.CUSTOMREQUEST, "PATCH")
            self.curl.setopt(pycurl.READFUNCTION, StringIO(data).read)
            self.curl.setopt(pycurl.INFILESIZE, len(data))
        elif grab.request_method == "DELETE":
            self.curl.setopt(pycurl.CUSTOMREQUEST, "DELETE")
        elif grab.request_method == "HEAD":
            self.curl.setopt(pycurl.NOBODY, 1)
        elif grab.request_method == "UPLOAD":
            self.curl.setopt(pycurl.UPLOAD, 1)
        elif grab.request_method == "GET":
            self.curl.setopt(pycurl.HTTPGET, 1)
        elif grab.request_method == "OPTIONS":
            data = grab.config["post"]
            if data is not None:
                if isinstance(data, six.text_type):
                    raise error.GrabMisuseError(
                        "Value of the post option must be a byte string"
                        " when the OPTIONS method is used"
                    )
                self.curl.setopt(pycurl.UPLOAD, 1)
                self.curl.setopt(pycurl.READFUNCTION, StringIO(data).read)
                self.curl.setopt(pycurl.INFILESIZE, len(data))
            self.curl.setopt(pycurl.CUSTOMREQUEST, "OPTIONS")
        else:
            raise error.GrabMisuseError("Invalid method: %s" % grab.request_method)

        headers = grab.config["common_headers"]
        if grab.config["headers"]:
            headers.update(grab.config["headers"])
        # This is required to avoid some problems
        headers.update({"Expect": ""})
        header_tuples = [str("%s: %s" % x) for x in headers.items()]
        self.curl.setopt(pycurl.HTTPHEADER, header_tuples)

        self.process_cookie_options(grab, request_url)

        if grab.config["referer"]:
            self.curl.setopt(pycurl.REFERER, str(grab.config["referer"]))

        if grab.config["proxy"]:
            self.curl.setopt(pycurl.PROXY, str(grab.config["proxy"]))
        else:
            self.curl.setopt(pycurl.PROXY, "")

        if grab.config["proxy_userpwd"]:
            self.curl.setopt(pycurl.PROXYUSERPWD, str(grab.config["proxy_userpwd"]))

        if grab.config["proxy_type"]:
            key = "PROXYTYPE_%s" % grab.config["proxy_type"].upper()
            self.curl.setopt(pycurl.PROXYTYPE, getattr(pycurl, key))

        if grab.config["encoding"]:
            if "gzip" in grab.config["encoding"] and "zlib" not in pycurl.version:
                raise error.GrabMisuseError(
                    "You can not use gzip encoding because " "pycurl was built without zlib support"
                )
            self.curl.setopt(pycurl.ENCODING, grab.config["encoding"])

        if grab.config["userpwd"]:
            self.curl.setopt(pycurl.USERPWD, str(grab.config["userpwd"]))

        if grab.config.get("interface") is not None:
            self.curl.setopt(pycurl.INTERFACE, grab.config["interface"])

        if grab.config.get("reject_file_size") is not None:
            self.curl.setopt(pycurl.MAXFILESIZE, grab.config["reject_file_size"])
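
The two SSL_VERIFY* options above switch certificate checking off entirely. For contrast, the verifying setup looks like this (a sketch, not part of this transport):

from io import BytesIO

import certifi
import pycurl

buf = BytesIO()
curl = pycurl.Curl()
curl.setopt(pycurl.URL, 'https://example.com/')
curl.setopt(pycurl.CAINFO, certifi.where())  # CA bundle shipped by certifi
curl.setopt(pycurl.SSL_VERIFYPEER, 1)        # verify the certificate chain
curl.setopt(pycurl.SSL_VERIFYHOST, 2)        # 2 is the only strict host check
curl.setopt(pycurl.WRITEDATA, buf)
curl.perform()
curl.close()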
Exemple #35
0
    def prepare_response(self, grab):
        # Information about urllib3
        # On python2 urllib3 headers contains original binary data
        # On python3 urllib3 headers are converted to unicode
        # using latin encoding
        try:
            #if self.body_file:
            #    self.body_file.close()
            response = Document()

            head = ''
            for key, val in self._response.getheaders().items():
                if six.PY2:
                    key = key.decode('utf-8', errors='ignore')
                    val = val.decode('utf-8', errors='ignore')
                if six.PY3:
                    key = key.encode('latin').decode('utf-8', errors='ignore')
                    val = val.encode('latin').decode('utf-8', errors='ignore')
                head += '%s: %s\r\n' % (key, val)
            head += '\r\n'
            response.head = make_str(head, encoding='utf-8')

            #if self.body_path:
            #    response.body_path = self.body_path
            #else:
            #    response.body = b''.join(self.response_body_chunks)
            def read_with_timeout():
                if self._request.config_nobody:
                    return b''
                maxsize = self._request.config_body_maxsize
                chunks = []
                default_chunk_size = 10000
                if maxsize:
                    chunk_size = min(default_chunk_size, maxsize + 1)
                else:
                    chunk_size = default_chunk_size
                bytes_read = 0
                while True:
                    chunk = self._response.read(chunk_size)
                    if chunk:
                        bytes_read += len(chunk)
                        chunks.append(chunk)
                        if maxsize and bytes_read > maxsize:
                            # reached limit on bytes to read
                            break
                    else:
                        break
                    if self._request.timeout:
                        if (time.time() - self._request.op_started >
                                self._request.timeout):
                            raise GrabTimeoutError
                data = b''.join(chunks)
                if maxsize:
                    data = data[:maxsize]
                return data

            if self._request.response_path:
                response.body_path = self._request.response_path
                # FIXME: Quick dirty hack; actually, the response is fully
                # read into memory
                self._request.response_file.write(read_with_timeout())
                self._request.response_file.close()
            else:
                response.body = read_with_timeout()

            # Clear memory
            #self.response_header_chunks = []

            response.code = self._response.status
            #response.total_time = self.curl.getinfo(pycurl.TOTAL_TIME)
            #response.connect_time = self.curl.getinfo(pycurl.CONNECT_TIME)
            #response.name_lookup_time = (self.curl
            #                             .getinfo(pycurl.NAMELOOKUP_TIME))
            #response.download_size = self.curl.getinfo(pycurl.SIZE_DOWNLOAD)
            #response.upload_size = self.curl.getinfo(pycurl.SIZE_UPLOAD)
            #response.download_speed = self.curl.getinfo(pycurl.SPEED_DOWNLOAD)
            #response.remote_ip = self.curl.getinfo(pycurl.PRIMARY_IP)

            response.url = (self._response.get_redirect_location()
                            or self._request.url)

            import email.message
            hdr = email.message.Message()
            for key, val in self._response.getheaders().items():
                if six.PY2:
                    key = key.decode('utf-8', errors='ignore')
                    val = val.decode('utf-8', errors='ignore')
                if six.PY3:
                    key = key.encode('latin').decode('utf-8', errors='ignore')
                    val = val.encode('latin').decode('utf-8', errors='ignore')
                #if key == 'Location':
                #    import pdb; pdb.set_trace()
                hdr[key] = val
            response.parse(charset=grab.config['document_charset'],
                           headers=hdr)

            jar = self.extract_cookiejar()  # self._response, self._request)
            response.cookies = CookieManager(jar)

            # We do not need anymore cookies stored in the
            # curl instance so drop them
            #self.curl.setopt(pycurl.COOKIELIST, 'ALL')
            return response
        finally:
            self._response.release_conn()
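
The read_with_timeout closure above combines a size cap with a wall-clock cutoff. The same logic as a standalone helper over any file-like object, with TimeoutError standing in for GrabTimeoutError:

import time
from io import BytesIO

def read_bounded(stream, maxsize=None, total_timeout=None, chunk_size=10000):
    # Read in chunks; stop once maxsize bytes are exceeded or
    # total_timeout seconds have elapsed since the first read.
    started = time.time()
    chunks = []
    bytes_read = 0
    while True:
        chunk = stream.read(chunk_size)
        if not chunk:
            break
        chunks.append(chunk)
        bytes_read += len(chunk)
        if maxsize is not None and bytes_read > maxsize:
            break  # size limit reached
        if total_timeout is not None and time.time() - started > total_timeout:
            raise TimeoutError('body read exceeded total timeout')
    data = b''.join(chunks)
    return data[:maxsize] if maxsize is not None else data

print(read_bounded(BytesIO(b'x' * 25000), maxsize=10))  # b'xxxxxxxxxx'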
Exemple #36
0
    def process_config(self, grab):
        req = Request(data=None)

        try:
            request_url = normalize_url(grab.config['url'])
        except Exception as ex:
            raise error.GrabInvalidUrl(
                u'%s: %s' % (six.text_type(ex), grab.config['url']))
        req.url = request_url

        method = grab.detect_request_method()
        req.method = make_str(method)

        req.body_maxsize = grab.config['body_maxsize']
        if grab.config['nobody']:
            req.body_maxsize = 0

        req.timeout = grab.config['timeout']
        req.connect_timeout = grab.config['connect_timeout']

        extra_headers = {}

        # Body processing
        if grab.config['body_inmemory']:
            pass
        else:
            if not grab.config['body_storage_dir']:
                raise GrabMisuseError(
                    'Option body_storage_dir is not defined')
            file_, path_ = self.setup_body_file(
                grab.config['body_storage_dir'],
                grab.config['body_storage_filename'],
                create_dir=grab.config['body_storage_create_dir'])
            req._response_file = file_
            req._response_path = path_

        if grab.config['multipart_post'] is not None:
            post_data = grab.config['multipart_post']
            if isinstance(post_data, six.binary_type):
                pass
            elif isinstance(post_data, six.text_type):
                raise GrabMisuseError('Option multipart_post does not'
                                      ' accept unicode data.')
            else:
                post_items = normalize_http_values(
                    grab.config['multipart_post'],
                    charset=grab.config['charset'],
                    ignore_classes=(UploadFile, UploadContent),
                )
                #if six.PY3:
                post_items = decode_pairs(post_items,
                                          grab.config['charset'])
                post_items = process_upload_items(post_items)
                post_data, content_type = encode_multipart_formdata(post_items)
                extra_headers['Content-Type'] = content_type
            extra_headers['Content-Length'] = len(post_data)
            req.data = post_data
        elif grab.config['post'] is not None:
            post_data = normalize_post_data(grab.config['post'],
                                            grab.config['charset'])
            # py3 hack
            # if six.PY3:
            #    post_data = smart_unicode(post_data,
            #                              grab.config['charset'])
            extra_headers['Content-Length'] = len(post_data)
            req.data = post_data

        if method in ('POST', 'PUT'):
            if (grab.config['post'] is None
                    and grab.config['multipart_post'] is None):
                raise GrabMisuseError('Neither `post` nor `multipart_post`'
                                      ' option was specified for the %s'
                                      ' request' % method)
        # Proxy
        if grab.config['proxy']:
            req.proxy = grab.config['proxy']

        if grab.config['proxy_userpwd']:
            req.proxy_userpwd = grab.config['proxy_userpwd']

        if grab.config['proxy_type']:
            req.proxy_type = grab.config['proxy_type']
        else:
            req.proxy_type = 'http'

        # User-Agent
        if grab.config['user_agent'] is None:
            if grab.config['user_agent_file'] is not None:
                with open(grab.config['user_agent_file']) as inf:
                    lines = inf.read().splitlines()
                grab.config['user_agent'] = random.choice(lines)
            else:
                grab.config['user_agent'] = generate_user_agent()

        extra_headers['User-Agent'] = grab.config['user_agent']

        # Headers
        headers = extra_headers
        headers.update(grab.config['common_headers'])

        if grab.config['headers']:
            headers.update(grab.config['headers'])
        req.headers = headers

        # Cookies
        self.process_cookie_options(grab, req)

        self._request = req
Exemple #37
0
def main(spider_name,
         thread_number=None,
         settings_module='settings',
         network_logs=False,
         disable_proxy=False,
         ignore_lock=False,
         disable_report=False,
         api_port=None,
         parser_pool_size=2,
         grab_log_file=None,
         network_log_file=None,
         network_service=None,
         grab_transport=None,
         **kwargs):  # pylint: disable=unused-argument
    default_logging(
        grab_log=grab_log_file,
        network_log=network_log_file,
        propagate_network_logger=network_logs,
    )

    root_config = build_root_config(settings_module)
    spider_class = load_spider_class(root_config, spider_name)
    spider_config = build_spider_config(spider_class, root_config)

    spider_args = None
    if hasattr(spider_class, 'setup_arg_parser'):
        parser = ArgumentParser()
        spider_class.setup_arg_parser(parser)
        opts, _ = parser.parse_known_args()
        spider_args = vars(opts)

    bot = spider_class(
        thread_number=thread_number,
        config=spider_config,
        network_try_limit=None,
        task_try_limit=None,
        args=spider_args,
        http_api_port=api_port,
        parser_pool_size=parser_pool_size,
        network_service=network_service,
        grab_transport=grab_transport,
    )
    opt_queue = spider_config.get('queue')
    if opt_queue:
        bot.setup_queue(**opt_queue)

    opt_cache = spider_config.get('cache')
    if opt_cache:
        bot.setup_cache(**opt_cache)

    opt_proxy_list = spider_config.get('proxy_list')
    if opt_proxy_list:
        if disable_proxy:
            logger.debug('Proxy servers disabled via command line')
        else:
            bot.load_proxylist(**opt_proxy_list)

    opt_ifaces = spider_config.get('command_interfaces')
    if opt_ifaces:
        for iface_config in opt_ifaces:
            bot.controller.add_interface(**iface_config)

    try:
        bot.run()
    except KeyboardInterrupt:
        pass

    stats = bot.render_stats()

    if spider_config.get('display_stats'):
        logger.debug(stats)

    pid = os.getpid()
    logger.debug('Spider pid is %d', pid)

    if not disable_report:
        if spider_config.get('save_report'):
            for subdir in (str(pid), 'last'):
                dir_ = 'var/%s' % subdir
                if not os.path.exists(dir_):
                    os.makedirs(dir_)
                else:
                    clear_directory(dir_)
                for key, lst in bot.stat.collections.items():
                    fname_key = key.replace('-', '_')
                    save_list(lst, '%s/%s.txt' % (dir_, fname_key))
                with open('%s/report.txt' % dir_, 'wb') as out:
                    out.write(make_str(stats))

    return {
        'spider_stats': bot.render_stats(),
    }
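
The exists/clear_directory dance before each report write can be expressed with the stdlib alone (a sketch; clear_directory itself belongs to the library):

import os
import shutil

def prepare_report_dir(path):
    # Wipe the directory if it exists, then recreate it and any parents.
    shutil.rmtree(path, ignore_errors=True)
    os.makedirs(path)
    return path

for subdir in ('12345', 'last'):  # '12345' stands in for the spider pid
    prepare_report_dir('var/%s' % subdir)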
Exemple #38
0
 def test_make_str_from_none(self):
     self.assertEqual(
         make_str(None),
         b'None',
     )
Exemple #39
0
    def prepare_response(self, grab):
        # Information about urllib3
        # On python2 urllib3 headers contains original binary data
        # On python3 urllib3 headers are converted to unicode
        # using latin encoding
        try:
            #if self.body_file:
            #    self.body_file.close()
            response = Document()

            head = ''
            for key, val in self._response.getheaders().items():
                if six.PY2:
                    key = key.decode('utf-8', errors='ignore')
                    val = val.decode('utf-8', errors='ignore')
                if six.PY3:
                    key = key.encode('latin').decode('utf-8', errors='ignore')
                    val = val.encode('latin').decode('utf-8', errors='ignore')
                head += '%s: %s\r\n' % (key, val)
            head += '\r\n'
            response.head = make_str(head, encoding='utf-8')

            #if self.body_path:
            #    response.body_path = self.body_path
            #else:
            #    response.body = b''.join(self.response_body_chunks)
            def read_with_timeout():
                if self._request.config_nobody:
                    return b''
                maxsize = self._request.config_body_maxsize
                chunks = []
                default_chunk_size = 10000
                if maxsize:
                    chunk_size = min(default_chunk_size, maxsize + 1)
                else:
                    chunk_size = default_chunk_size
                bytes_read = 0
                while True:
                    chunk = self._response.read(chunk_size)
                    if chunk:
                        bytes_read += len(chunk)
                        chunks.append(chunk)
                        if maxsize and bytes_read > maxsize:
                            # reached limit on bytes to read
                            break
                    else:
                        break
                    if self._request.timeout:
                        if (time.time() - self._request.op_started
                                > self._request.timeout):
                            raise GrabTimeoutError
                data = b''.join(chunks)
                if maxsize:
                    data = data[:maxsize]
                return data

            if self._request.response_path:
                response.body_path = self._request.response_path
                # FIXME: Quick dirty hack; actually, the response is fully
                # read into memory
                self._request.response_file.write(read_with_timeout())
                self._request.response_file.close()
            else:
                response.body = read_with_timeout()

            # Clear memory
            #self.response_header_chunks = []

            response.code = self._response.status
            #response.total_time = self.curl.getinfo(pycurl.TOTAL_TIME)
            #response.connect_time = self.curl.getinfo(pycurl.CONNECT_TIME)
            #response.name_lookup_time = (self.curl
            #                             .getinfo(pycurl.NAMELOOKUP_TIME))
            #response.download_size = self.curl.getinfo(pycurl.SIZE_DOWNLOAD)
            #response.upload_size = self.curl.getinfo(pycurl.SIZE_UPLOAD)
            #response.download_speed = self.curl.getinfo(pycurl.SPEED_DOWNLOAD)
            #response.remote_ip = self.curl.getinfo(pycurl.PRIMARY_IP)

            response.url = (self._response.get_redirect_location()
                            or self._request.url)

            import email.message
            hdr = email.message.Message()
            for key, val in self._response.getheaders().items():
                if six.PY2:
                    key = key.decode('utf-8', errors='ignore')
                    val = val.decode('utf-8', errors='ignore')
                if six.PY3:
                    key = key.encode('latin').decode('utf-8', errors='ignore')
                    val = val.encode('latin').decode('utf-8', errors='ignore')
                #if key == 'Location':
                #    import pdb; pdb.set_trace()
                hdr[key] = val
            response.parse(charset=grab.config['document_charset'],
                           headers=hdr)

            jar = self.extract_cookiejar() # self._response, self._request)
            response.cookies = CookieManager(jar)

            # We do not need anymore cookies stored in the
            # curl instance so drop them
            #self.curl.setopt(pycurl.COOKIELIST, 'ALL')
            return response
        finally:
            self._response.release_conn()
Exemple #40
0
 def test_make_str_cp1251_from_unicode(self):
     self.assertEqual(
         make_str(u'фыва', encoding='cp1251'),
         u'фыва'.encode('cp1251'),
     )
Exemple #41
0
 def build_hash(self, url):
     with self.spider.timer.log_time('cache.read.build_hash'):
         utf_url = make_str(url)
         return sha1(utf_url).hexdigest()
Exemple #42
0
 def test_make_str_from_int(self):
     self.assertEqual(
         make_str(1),
         b'1',
     )
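
Taken together, the tests in this listing pin down make_str's contract: bytes pass through untouched, text is encoded with the requested charset, and anything else is stringified first. A Python 3 sketch reconstructed from those tests (an approximation, not the library's source):

def to_bytes(value, encoding='utf-8', errors='strict'):
    if isinstance(value, bytes):
        return value                            # Exemple #25
    if isinstance(value, str):
        return value.encode(encoding, errors)   # Exemple #40
    return str(value).encode(encoding, errors)  # Exemples #38 and #42

assert to_bytes(u'фыва'.encode('utf-8')) == u'фыва'.encode('utf-8')
assert to_bytes(u'фыва', encoding='cp1251') == u'фыва'.encode('cp1251')
assert to_bytes(None) == b'None'
assert to_bytes(1) == b'1'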