Code example #1
    def __init__(self, document_body=None, transport=None, **kwargs):
        """
        Create Grab instance
        """

        self.meta = {}
        self._doc = None
        self.config = default_config()
        self.config['common_headers'] = self.common_headers()
        self.cookies = CookieManager()
        self.proxylist = ProxyList()

        # makes pylint happy
        self.request_counter = None
        self.request_head = None
        self.request_body = None
        self.request_method = None
        self.transport_param = transport
        self.transport = None

        self.reset()
        if kwargs:
            self.setup(**kwargs)
        if document_body is not None:
            self.setup_document(document_body)
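A note on the constructor above: keyword arguments are forwarded to setup(), and a non-None document_body is handed to setup_document(), which builds the response document without any network request. A minimal, hedged usage sketch (the import path and the 'timeout' option name are assumptions, not verified against this exact grab version):

    from grab import Grab  # assumed import path

    g = Grab(timeout=10)  # kwargs are forwarded to self.setup(**kwargs)

    # document_body skips the network entirely; setup_document() parses
    # the supplied bytes as the response document
    g2 = Grab(document_body=b'<html><body>Hello</body></html>')
    print(g2.doc.body)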
Code example #2
File: grab_cookies.py Project: fangjintang1989/grab
    def test_from_cookie_list(self):
        cookie = create_cookie("foo", "bar", self.server.address)
        mgr = CookieManager.from_cookie_list([cookie])
        test_cookie = [x for x in mgr.cookiejar if x.name == "foo"][0]
        self.assertEqual(cookie.name, test_cookie.name)

        mgr = CookieManager.from_cookie_list([])
        self.assertEqual(0, len(list(mgr.cookiejar)))
Code example #3
File: grab_cookies.py Project: lyicy/grab
    def test_from_cookie_list(self):
        cookie = create_cookie('foo', 'bar', self.server.address)
        mgr = CookieManager.from_cookie_list([cookie])
        test_cookie = [x for x in mgr.cookiejar if x.name == 'foo'][0]
        self.assertEqual(cookie.name, test_cookie.name)

        mgr = CookieManager.from_cookie_list([])
        self.assertEqual(0, len(list(mgr.cookiejar)))
Code example #4
File: grab_cookies.py Project: julia-bikova/grab
    def test_from_cookie_list(self):
        cookie = create_cookie('foo', 'bar')
        mgr = CookieManager.from_cookie_list([cookie])
        test_cookie = [x for x in mgr.cookiejar if x.name == 'foo'][0]
        self.assertEqual(cookie.name, test_cookie.name)

        mgr = CookieManager.from_cookie_list([])
        self.assertEqual(0, len(list(mgr.cookiejar)))
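Code examples #2-#4 differ only in quoting style and in whether a domain is passed to create_cookie. Distilled into a standalone sketch using only the calls the tests exercise (the import path and cookie values are assumptions):

    from grab.cookie import CookieManager, create_cookie  # assumed import path

    cookie = create_cookie('session', 'abc123', 'example.com')  # placeholder values
    mgr = CookieManager.from_cookie_list([cookie])

    # the wrapped cookiejar is iterable, as the tests above rely on
    assert [c.name for c in mgr.cookiejar] == ['session']

    # an empty list yields an empty manager
    assert len(list(CookieManager.from_cookie_list([]).cookiejar)) == 0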
Code example #5
File: grab_cookies.py Project: julia-bikova/grab
 def test_pickle_serialization(self):
     cookie = create_cookie('foo', 'bar')
     mgr = CookieManager.from_cookie_list([cookie])
     dump = pickle.dumps(mgr)
     mgr2 = pickle.loads(dump)
     self.assertEqual(list(mgr.cookiejar)[0].value,
                      list(mgr2.cookiejar)[0].value)
Code example #6
File: mock.py Project: subeax/grab
    def prepare_response(self, grab):
        response = Response()

        try:
            response.body = MOCK_REGISTRY[self.request_url]['body']
        except KeyError:
            raise GrabMockNotFoundError(
                'Mock registry does not have information about '
                'the following URL: %s' % self.request_url)

        # HTTP-date uses the abbreviated month name and a GMT (UTC) timestamp
        now_str = datetime.utcnow().strftime('%a, %d %b %Y %H:%M:%S')
        response.head = '\r\n'.join((
            'Accept-Ranges:bytes',
            'Content-Length:%d' % len(response.body),
            'Content-Type:text/plain',
            'Date:%s GMT' % now_str,
            'Last-Modified:%s GMT' % now_str,
            'Vary:Accept-Encoding',
        ))

        response.code = 200
        response.total_time = 0
        response.name_lookup_time = 0
        response.connect_time = 0
        response.url = self.request_url
        response.parse()
        response.cookies = CookieManager(self.extract_cookiejar())

        return response
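A side note on the Date header built above: the standard library can emit a correctly formatted HTTP-date directly, which avoids hand-rolling the strftime pattern. A sketch (not part of the original mock):

    from email.utils import formatdate

    # usegmt=True yields e.g. 'Sun, 06 Nov 1994 08:49:37 GMT'
    now_str = formatdate(usegmt=True)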
Code example #7
File: grab_cookies.py Project: wyrover/grab
 def test_pickle_serialization(self):
     cookie = create_cookie('foo', 'bar', self.server.address)
     mgr = CookieManager.from_cookie_list([cookie])
     dump = pickle.dumps(mgr)
     mgr2 = pickle.loads(dump)
     self.assertEqual(list(mgr.cookiejar)[0].value,
                      list(mgr2.cookiejar)[0].value)
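Code examples #5 and #7 show that a CookieManager round-trips through pickle unchanged. A hedged sketch of persisting session cookies between runs, built only on that behavior (the import path, cookie values, and file name are illustrative assumptions):

    import pickle
    from grab.cookie import CookieManager, create_cookie  # assumed import path

    mgr = CookieManager.from_cookie_list(
        [create_cookie('session', 'abc123', 'example.com')])  # placeholder values

    # save the cookie state
    with open('cookies.pickle', 'wb') as fh:  # illustrative path
        pickle.dump(mgr, fh)

    # ...and restore it in a later run
    with open('cookies.pickle', 'rb') as fh:
        mgr = pickle.load(fh)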
Code example #8
File: base.py Project: lorien/grab
    def __init__(self, document_body=None,
                 transport=None, **kwargs):
        """
        Create Grab instance
        """

        self.meta = {}
        self._doc = None
        self.config = default_config()
        self.config['common_headers'] = self.common_headers()
        self.cookies = CookieManager()
        self.proxylist = ProxyList()
        self.exception = None

        # makes pylint happy
        self.request_counter = None
        self.request_head = None
        self.request_body = None
        self.request_method = None
        self.transport_param = transport
        self.transport = None

        self.reset()
        if kwargs:
            self.setup(**kwargs)
        if document_body is not None:
            self.setup_document(document_body)
Code example #9
File: curl.py Project: lyicy/grab
    def prepare_response(self, grab):
        if self.body_file:
            self.body_file.close()
        response = Document()

        response.head = b''.join(self.response_header_chunks)

        if self.body_path:
            response.body_path = self.body_path
        else:
            response.body = b''.join(self.response_body_chunks)

        # Clear memory
        self.response_header_chunks = []
        self.response_body_chunks = []

        response.code = self.curl.getinfo(pycurl.HTTP_CODE)
        response.total_time = self.curl.getinfo(pycurl.TOTAL_TIME)
        response.connect_time = self.curl.getinfo(pycurl.CONNECT_TIME)
        response.name_lookup_time = self.curl.getinfo(pycurl.NAMELOOKUP_TIME)
        response.download_size = self.curl.getinfo(pycurl.SIZE_DOWNLOAD)
        response.upload_size = self.curl.getinfo(pycurl.SIZE_UPLOAD)
        response.download_speed = self.curl.getinfo(pycurl.SPEED_DOWNLOAD)
        response.remote_ip = self.curl.getinfo(pycurl.PRIMARY_IP)

        response.url = self.curl.getinfo(pycurl.EFFECTIVE_URL)

        response.parse(charset=grab.config['document_charset'])

        response.cookies = CookieManager(self.extract_cookiejar())

        # We no longer need the cookies stored in the curl
        # instance, so drop them
        self.curl.setopt(pycurl.COOKIELIST, 'ALL')
        return response
Code example #10
File: base.py Project: FeodorFitsner/grab
    def __init__(self, document_body=None, transport='pycurl', **kwargs):
        """
        Create Grab instance
        """

        self.meta = {}
        self._doc = None
        self.config = default_config()
        self.config['common_headers'] = self.common_headers()
        self.cookies = CookieManager()
        self.proxylist = ProxyList()
        self.setup_transport(transport)
        self.reset()
        if kwargs:
            self.setup(**kwargs)
        if document_body is not None:
            self.setup_document(document_body)
Code example #11
File: base.py Project: Kuznitsin/grab
    def load_config(self, config):
        """
        Configure grab instance with external config object.
        """

        self.config = copy_config(config, self.mutable_config_keys)
        if 'cookiejar_cookies' in config['state']:
            self.cookies = CookieManager.from_cookie_list(config['state']['cookiejar_cookies'])
Code example #12
    def prepare_response(self, grab):
        #if self.body_file:
        #    self.body_file.close()
        response = Response()

        head = ''
        for key, val in self._response.getheaders().items():
            head += '%s: %s\r\n' % (key, val)
        head += '\r\n'
        response.head = make_str(head, encoding='latin', errors='ignore')

        #if self.body_path:
        #    response.body_path = self.body_path
        #else:
        #    response.body = b''.join(self.response_body_chunks)
        if self._request._response_path:
            response.body_path = self._request._response_path
            # Quick dirty hack; actually, the response is fully read
            # into memory
            self._request._response_file.write(self._response.read())
            self._request._response_file.close()
        else:
            if self._request.body_maxsize is not None:
                #if self.response_body_bytes_read > self.config_body_maxsize:
                #    logger.debug('Response body max size limit reached: %s' %
                #                 self.config_body_maxsize)
                response.body = self._response.read(self._request.body_maxsize)
            else:
                response.body = self._response.read()

        # Clear memory
        #self.response_header_chunks = []

        response.code = self._response.status
        #response.total_time = self.curl.getinfo(pycurl.TOTAL_TIME)
        #response.connect_time = self.curl.getinfo(pycurl.CONNECT_TIME)
        #response.name_lookup_time = self.curl.getinfo(pycurl.NAMELOOKUP_TIME)
        #response.download_size = self.curl.getinfo(pycurl.SIZE_DOWNLOAD)
        #response.upload_size = self.curl.getinfo(pycurl.SIZE_UPLOAD)
        #response.download_speed = self.curl.getinfo(pycurl.SPEED_DOWNLOAD)
        #response.remote_ip = self.curl.getinfo(pycurl.PRIMARY_IP)

        response.url = (self._response.get_redirect_location()
                        or self._request.url)

        import email.message
        hdr = email.message.Message()
        for key, val in self._response.getheaders().items():
            hdr[key] = val
        response.parse(charset=grab.config['document_charset'], headers=hdr)

        jar = self.extract_cookiejar(self._response, self._request)
        response.cookies = CookieManager(jar)

        # We no longer need the cookies stored in the curl
        # instance, so drop them
        #self.curl.setopt(pycurl.COOKIELIST, 'ALL')
        return response
Code example #13
    def load_config(self, config):
        """
        Configure grab instance with external config object.
        """

        self.config = copy_config(config, self.mutable_config_keys)
        if 'cookiejar_cookies' in config['state']:
            self.cookies = CookieManager.from_cookie_list(
                config['state']['cookiejar_cookies'])
Code example #14
 def custom_prepare_response_func(transport, g):
     response = Response()
     response.head = cache_item['head']
     response.body = body
     response.code = cache_item['response_code']
     response.download_size = len(body)
     response.upload_size = 0
     response.download_speed = 0
     response.url = cache_item['response_url']
     response.parse()
     response.cookies = CookieManager(transport.extract_cookiejar())
     return response
Code example #15
 def custom_prepare_response_func(transport, grab):
     response = Response()
     response.head = cache_item['head']
     response.body = body
     response.code = cache_item['response_code']
     response.download_size = len(body)
     response.upload_size = 0
     response.download_speed = 0
     response.url = cache_item['response_url']
     response.parse(charset=grab.config['document_charset'])
     response.cookies = CookieManager(transport.extract_cookiejar())
     response.from_cache = True
     return response
Code example #16
File: mysql.py Project: degustaf/pylint-corpus
 def custom_prepare_response_func(transport, grab):
     doc = Document()
     doc.head = cache_item['head']
     doc.body = body
     doc.code = cache_item['response_code']
     doc.download_size = len(body)
     doc.upload_size = 0
     doc.download_speed = 0
     doc.url = cache_item['response_url']
     doc.parse(charset=grab.config['document_charset'])
     doc.cookies = CookieManager(transport.extract_cookiejar())
     doc.from_cache = True
     return doc
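Code examples #14-#16 all rebuild a response from a cache record instead of the network. Judging from process_request_result() in code example #24 below, such a helper is passed as its prepare_response_func argument and invoked as prepare_response_func(self.transport, self). A hedged sketch of the wiring (exactly how the cache layer calls it is an assumption):

    # `grab` is a prepared Grab instance; custom_prepare_response_func is
    # one of the closures defined above, with cache_item and body in scope
    doc = grab.process_request_result(
        prepare_response_func=custom_prepare_response_func)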
Code example #17
File: base.py Project: subeax/grab
    def __init__(self,
                 document_body=None,
                 transport='grab.transport.curl.CurlTransport',
                 **kwargs):
        """
        Create Grab instance
        """

        self._doc = None
        self.config = default_config()
        self.config['common_headers'] = self.common_headers()
        self._request_prepared = False
        self.cookies = CookieManager()
        self.proxylist = ProxyList()

        self.setup_transport(transport)

        self.reset()

        if kwargs:
            self.setup(**kwargs)
        self.clone_counter = 0
        if document_body is not None:
            self.setup_document(document_body)
Code example #18
File: base.py Project: ixtel/grab
    def __init__(self, document_body=None,
                 transport='pycurl', **kwargs):
        """
        Create Grab instance
        """

        self.meta = {}
        self._doc = None
        self.config = default_config()
        self.config['common_headers'] = self.common_headers()
        self.cookies = CookieManager()
        self.proxylist = ProxyList()
        self.setup_transport(transport)
        self.reset()
        if kwargs:
            self.setup(**kwargs)
        if document_body is not None:
            self.setup_document(document_body)
Code example #19
File: document.py Project: abaelhe/grab
    def __init__(self, grab=None):
        if grab is None:
            self.grab = None
        else:
            if isinstance(grab, weakref.ProxyType):
                self.grab = grab
            else:
                self.grab = weakref.proxy(grab)

        self.status = None
        self.code = None
        self.head = None
        self.headers = None
        self.url = None
        self.cookies = CookieManager()
        self.charset = 'utf-8'
        self.bom = None
        self.timestamp = datetime.utcnow()
        self.name_lookup_time = 0
        self.connect_time = 0
        self.total_time = 0
        self.download_size = 0
        self.upload_size = 0
        self.download_speed = 0
        self.error_code = None
        self.error_msg = None

        # Body
        self.body_path = None
        self._cached_body = None
        self._unicode_body = None
        self._runtime_body = None
        self._unicode_runtime_body = None

        # DOM Tree
        self._lxml_tree = None
        self._strict_lxml_tree = None

        # Pyquery
        self._pyquery = None

        # Form
        self._lxml_form = None
        self._file_fields = {}
Code example #20
File: postgresql.py Project: subeax/grab
        def custom_prepare_response_func(transport, g):
            response = Response()
            response.head = cache_item['head']
            response.body = body
            response.code = cache_item['response_code']
            response.download_size = len(body)
            response.upload_size = 0
            response.download_speed = 0

            # Hack for deprecated behaviour
            if 'response_url' in cache_item:
                response.url = cache_item['response_url']
            else:
                logger.debug('Your cache contains items without the '
                             '`response_url` key. That is a deprecated data '
                             'format. Please re-download your cache or '
                             'build the `response_url` keys manually.')
                response.url = cache_item['url']

            response.parse()
            response.cookies = CookieManager(transport.extract_cookiejar())
            return response
Code example #21
    def __init__(self, grab=None):
        self._grab_config = {}
        self.grab = None
        if grab:
            self.process_grab(grab)
        self.status = None
        self.code = None
        self.head = None
        self.headers = None
        self.url = None
        self.cookies = CookieManager()
        self.charset = 'utf-8'
        self.bom = None
        self.timestamp = datetime.utcnow()
        self.name_lookup_time = 0
        self.connect_time = 0
        self.total_time = 0
        self.download_size = 0
        self.upload_size = 0
        self.download_speed = 0
        self.error_code = None
        self.error_msg = None
        self.from_cache = False

        # Body
        self.body_path = None
        self._bytes_body = None
        self._unicode_body = None

        # DOM Tree
        self._lxml_tree = None
        self._strict_lxml_tree = None

        # Pyquery
        self._pyquery = None

        # Form
        self._lxml_form = None
        self._file_fields = {}
Code example #22
File: urllib3.py Project: sn-donbenjamin/grab
    def prepare_response(self, grab):
        try:
            #if self.body_file:
            #    self.body_file.close()
            response = Response()

            head = ''
            for key, val in self._response.getheaders().items():
                head += '%s: %s\r\n' % (key, val)
            head += '\r\n'
            response.head = make_str(head, encoding='latin', errors='ignore')

            #if self.body_path:
            #    response.body_path = self.body_path
            #else:
            #    response.body = b''.join(self.response_body_chunks)
            def read_with_timeout():
                if self._request.config_nobody:
                    return b''
                maxsize = self._request.config_body_maxsize
                chunks = []
                default_chunk_size = 10000
                if maxsize:
                    chunk_size = min(default_chunk_size, maxsize + 1)
                else:
                    chunk_size = default_chunk_size
                total_size = 0
                while True:
                    chunk = self._response.read(chunk_size)
                    if chunk:
                        total_size += len(chunk)
                        chunks.append(chunk)
                        if maxsize and total_size > maxsize:
                            logger.debug(
                                'Response body max size limit reached: %s'
                                % maxsize)
                            break
                    else:
                        break
                    if self._request.timeout:
                        if (time.time() - self._request.op_started
                                > self._request.timeout):
                            raise GrabTimeoutError
                data = b''.join(chunks)
                if maxsize:
                    data = data[:maxsize]
                return data

            if self._request._response_path:
                response.body_path = self._request._response_path
                # FIXME: Quick dirty hack; actually, the response is
                # fully read into memory
                self._request._response_file.write(read_with_timeout())
                self._request._response_file.close()
            else:
                response.body = read_with_timeout()

            # Clear memory
            #self.response_header_chunks = []

            response.code = self._response.status
            #response.total_time = self.curl.getinfo(pycurl.TOTAL_TIME)
            #response.connect_time = self.curl.getinfo(pycurl.CONNECT_TIME)
            #response.name_lookup_time = self.curl.getinfo(pycurl.NAMELOOKUP_TIME)
            #response.download_size = self.curl.getinfo(pycurl.SIZE_DOWNLOAD)
            #response.upload_size = self.curl.getinfo(pycurl.SIZE_UPLOAD)
            #response.download_speed = self.curl.getinfo(pycurl.SPEED_DOWNLOAD)
            #response.remote_ip = self.curl.getinfo(pycurl.PRIMARY_IP)

            response.url = (self._response.get_redirect_location()
                            or self._request.url)

            import email.message
            hdr = email.message.Message()
            for key, val in self._response.getheaders().items():
                hdr[key] = val
            response.parse(charset=grab.config['document_charset'],
                           headers=hdr)

            jar = self.extract_cookiejar()
            response.cookies = CookieManager(jar)

            # We no longer need the cookies stored in the curl
            # instance, so drop them
            #self.curl.setopt(pycurl.COOKIELIST, 'ALL')
            return response
        finally:
            self._response.release_conn()
Code example #23
File: grab_cookies.py Project: fangjintang1989/grab
 def test_get_item(self):
     cookie = create_cookie("foo", "bar", self.server.address)
     mgr = CookieManager.from_cookie_list([cookie])
     self.assertEqual("bar", mgr["foo"])
     self.assertRaises(KeyError, lambda: mgr["zzz"])
Code example #24
class Grab(FormExtension, DeprecatedThings):

    __slots__ = (
        'request_head',
        'request_log',
        'request_body',
        'proxylist',
        'config',
        'transport',
        'transport_param',
        'request_method',
        'request_counter',
        '__weakref__',
        'cookies',

        # Dirty hack to make it possible to inherit Grab from
        # multiple base classes with __slots__
        '_lxml_form',
        '_file_fields',
        '_pyquery',
        '_doc',
        '_kit',
    )

    # Attributes which should be processed when a clone
    # of a Grab instance is being created
    clonable_attributes = ('request_head', 'request_log', 'request_body',
                           'proxylist')

    # Complex config items which point to mutable objects
    mutable_config_keys = copy(MUTABLE_CONFIG_KEYS)
    """
    Public methods
    """
    def __init__(self,
                 document_body=None,
                 transport='grab.transport.curl.CurlTransport',
                 **kwargs):
        """
        Create Grab instance
        """

        self._doc = None
        self.config = default_config()
        self.config['common_headers'] = self.common_headers()
        self.cookies = CookieManager()
        self.proxylist = ProxyList()

        self.setup_transport(transport)

        self.reset()
        if kwargs:
            self.setup(**kwargs)

        if document_body is not None:
            self.setup_document(document_body)

    def _get_doc(self):
        if self._doc is None:
            self._doc = Document(self)
        return self._doc

    def _set_doc(self, obj):
        self._doc = obj

    doc = property(_get_doc, _set_doc)

    def setup_transport(self, transport_param):
        self.transport_param = transport_param
        if isinstance(transport_param, basestring):
            mod_path, cls_name = transport_param.rsplit('.', 1)
            try:
                cls = TRANSPORT_CACHE[(mod_path, cls_name)]
            except KeyError:
                mod = __import__(mod_path, globals(), locals(), ['foo'])
                cls = getattr(mod, cls_name)
                TRANSPORT_CACHE[(mod_path, cls_name)] = cls
            self.transport = cls()
        elif isinstance(transport_param, collections.Callable):
            self.transport = transport_param()
        else:
            raise error.GrabMisuseError('Option `transport` should be string '
                                        'or callable. Got %s' %
                                        type(transport_param))

    def reset(self):
        """
        Reset all attributes which could be modified during the previous
        request or which are not initialized yet if this is a new Grab
        instance.

        This method is automatically called before each network request.
        """

        self.request_head = None
        self.request_log = None
        self.request_body = None

        self.request_method = None
        self.transport.reset()

        # KIT
        self._kit = None
        # Form extension
        self._lxml_form = None
        self._file_fields = {}

    def clone(self, **kwargs):
        """
        Create clone of Grab instance.

        The cloned instance will have the same state: cookies, referrer,
        response document data.

        :param **kwargs: overrides settings of cloned grab instance
        """

        g = Grab(transport=self.transport_param)
        g.config = self.dump_config()

        g.doc = self.doc.copy()
        g.doc.grab = weakref.proxy(g)

        for key in self.clonable_attributes:
            setattr(g, key, getattr(self, key))
        g.cookies = deepcopy(self.cookies)

        if kwargs:
            g.setup(**kwargs)

        return g

    def adopt(self, g):
        """
        Copy the state of another `Grab` instance.

        Use case: create backup of current state to the cloned instance and
        then restore the state from it.
        """

        self.load_config(g.config)

        self.doc = g.doc.copy(new_grab=self)

        for key in self.clonable_attributes:
            setattr(self, key, getattr(g, key))
        self.cookies = deepcopy(g.cookies)

    def dump_config(self):
        """
        Make clone of current config.
        """

        conf = copy_config(self.config, self.mutable_config_keys)
        conf['state'] = {
            'cookiejar_cookies': list(self.cookies.cookiejar),
        }
        return conf

    def load_config(self, config):
        """
        Configure grab instance with external config object.
        """

        self.config = copy_config(config, self.mutable_config_keys)
        if 'cookiejar_cookies' in config['state']:
            self.cookies = CookieManager.from_cookie_list(
                config['state']['cookiejar_cookies'])

    def setup(self, **kwargs):
        """
        Setting up Grab instance configuration.
        """

        if 'hammer_mode' in kwargs:
            logging.error('Option hammer_mode is deprecated. Grab does not '
                          'support hammer mode anymore.')
            del kwargs['hammer_mode']

        if 'hammer_timeouts' in kwargs:
            logging.error('Option hammer_timeouts is deprecated. Grab does not'
                          ' support hammer mode anymore.')
            del kwargs['hammer_timeouts']

        for key in kwargs:
            if key not in self.config.keys():
                raise error.GrabMisuseError('Unknown option: %s' % key)

        if 'url' in kwargs:
            if self.config.get('url'):
                kwargs['url'] = self.make_url_absolute(kwargs['url'])
        self.config.update(kwargs)

    def go(self, url, **kwargs):
        """
        Go to ``url``

        Args:
            :url: could be absolute or relative. If relative, it will be
            resolved against the absolute URL of the previous request.
        """

        return self.request(url=url, **kwargs)

    def download(self, url, location, **kwargs):
        """
        Fetch the document located at ``url`` and save it to ``location``.
        """

        doc = self.go(url, **kwargs)
        with open(location, 'wb') as out:
            out.write(doc.body)
        return len(doc.body)

    def prepare_request(self, **kwargs):
        """
        Configure all things to make real network request.
        This method is called before doing real request via
        transport extension.
        """

        self.reset()
        self.request_counter = next(REQUEST_COUNTER)
        if kwargs:
            self.setup(**kwargs)
        if not self.proxylist.is_empty() and self.config['proxy_auto_change']:
            self.change_proxy()
        self.request_method = self.detect_request_method()
        self.transport.process_config(self)

    def log_request(self, extra=''):
        """
        Send request details to logging system.
        """

        thread_name = threading.currentThread().getName().lower()
        if thread_name == 'mainthread':
            thread_name = ''
        else:
            thread_name = '-%s' % thread_name

        if self.config['proxy']:
            if self.config['proxy_userpwd']:
                auth = ' with authorization'
            else:
                auth = ''
            proxy_info = ' via %s proxy of type %s%s' % (
                self.config['proxy'], self.config['proxy_type'], auth)
        else:
            proxy_info = ''
        if extra:
            extra = '[%s] ' % extra
        logger_network.debug('[%02d%s] %s%s %s%s', self.request_counter,
                             thread_name, extra, self.request_method or 'GET',
                             self.config['url'], proxy_info)

    def request(self, **kwargs):
        """
        Perform network request.

        You can specify grab settings in ``**kwargs``.
        Any keyword argument will be passed to ``self.config``.

        Returns: ``Document`` object.
        """

        self.prepare_request(**kwargs)
        self.log_request()

        try:
            self.transport.request()
        except error.GrabError:
            self.save_failed_dump()
            raise
        else:
            # That builds `self.doc`
            self.process_request_result()
            return self.doc

    def process_request_result(self, prepare_response_func=None):
        """
        Process result of real request performed via transport extension.
        """

        now = datetime.now()
        # TODO: move into separate method
        if self.config['debug_post']:
            post = self.config['post'] or self.config['multipart_post']
            if isinstance(post, dict):
                post = list(post.items())
            if post:
                if isinstance(post, basestring):
                    post = post[:self.config['debug_post_limit']] + '...'
                else:
                    items = normalize_http_values(post, charset='utf-8')
                    new_items = []
                    for key, value in items:
                        if len(value) > self.config['debug_post_limit']:
                            value = value[:self.
                                          config['debug_post_limit']] + '...'
                        new_items.append((key, value))
                    post = '\n'.join('%-25s: %s' % x for x in new_items)
            if post:
                logger_network.debug('[%02d] POST request:\n%s\n' %
                                     (self.request_counter, post))

        # It's important to delete old POST data after the request is
        # performed. If the POST data is not cleared then the next
        # request will try to use it again!
        old_refresh_count = self.config['refresh_redirect_count']
        self.reset_temporary_options()

        if prepare_response_func:
            self.doc = prepare_response_func(self.transport, self)
        else:
            self.doc = self.transport.prepare_response(self)

        # Workaround
        if self.doc.grab is None:
            self.doc.grab = weakref.proxy(self)

        if self.config['reuse_cookies']:
            self.cookies.update(self.doc.cookies)

        self.doc.timestamp = now

        self.config['charset'] = self.doc.charset

        if self.config['log_file']:
            with open(self.config['log_file'], 'wb') as out:
                out.write(self.doc.body)

        if self.config['cookiefile']:
            self.cookies.save_to_file(self.config['cookiefile'])

        if self.config['reuse_referer']:
            self.config['referer'] = self.doc.url

        self.copy_request_data()

        # Should be called after `copy_request_data`
        self.save_dumps()

        # TODO: check max redirect count
        if self.config['follow_refresh']:
            url = find_refresh_url(self.doc.unicode_body())
            if url is not None:
                inc_count = old_refresh_count + 1
                if inc_count > self.config['redirect_limit']:
                    raise error.GrabTooManyRedirectsError()
                else:
                    return self.request(url=url,
                                        refresh_redirect_count=inc_count)

        return None

    def reset_temporary_options(self):
        self.config['post'] = None
        self.config['multipart_post'] = None
        self.config['method'] = None
        self.config['body_storage_filename'] = None
        self.config['refresh_redirect_count'] = 0

    def save_failed_dump(self):
        """
        Save dump of failed request for debugging.

        This method is called when a fatal network exception is raised.
        The saved dump could be used for debugging the reason of the failure.
        """

        # This is a largely untested feature, so it is wrapped
        # in try/except to avoid breaking live spiders
        try:
            self.doc = self.transport.prepare_response(self)
            self.copy_request_data()
            self.save_dumps()
        except Exception as ex:
            logging.error(unicode(ex))

    def copy_request_data(self):
        # TODO: Maybe request object?
        self.request_head = self.transport.request_head
        self.request_body = self.transport.request_body
        self.request_log = self.transport.request_log

    def setup_document(self, content, **kwargs):
        """
        Set up the `response` object without real network requests.

        Useful for testing and debugging.

        All ``**kwargs`` will be passed to `Document` constructor.
        """

        self.reset()

        # Configure Document instance
        doc = Document(grab=self)
        doc.body = content
        doc.status = ''
        doc.head = ''
        doc.parse(charset=kwargs.get('document_charset'))
        doc.code = 200
        doc.total_time = 0
        doc.connect_time = 0
        doc.name_lookup_time = 0
        doc.url = ''

        for key, value in kwargs.items():
            setattr(doc, key, value)

        self.doc = doc

    def change_proxy(self):
        """
        Set a random proxy from the proxylist.
        """

        if not self.proxylist.is_empty():
            proxy = self.proxylist.get_random_proxy()
            self.setup(proxy=proxy.address,
                       proxy_userpwd=proxy.userpwd,
                       proxy_type=proxy.proxy_type)
        else:
            logging.debug('Proxy list is empty')

    """
    Private methods
    """

    def common_headers(self):
        """
        Build headers which a typical browser sends.
        """

        return {
            'Accept':
            'text/xml,application/xml,application/xhtml+xml'
            ',text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.%d' %
            randint(2, 5),
            'Accept-Language':
            'en-us,en;q=0.%d' % (randint(5, 9)),
            'Accept-Charset':
            'utf-8,windows-1251;q=0.7,*;q=0.%d' % randint(5, 7),
            'Keep-Alive':
            '300',
            'Expect':
            '',
        }

    def save_dumps(self):
        if self.config['log_dir']:
            thread_name = threading.currentThread().getName().lower()
            if thread_name == 'mainthread':
                thread_name = ''
            else:
                thread_name = '-%s' % thread_name
            file_name = os.path.join(
                self.config['log_dir'],
                '%02d%s.log' % (self.request_counter, thread_name))
            with open(file_name, 'w') as out:
                out.write('Request headers:\n')
                out.write(self.request_head)
                out.write('\n')
                out.write('Request body:\n')
                out.write(self.request_body)
                out.write('\n\n')
                out.write('Response headers:\n')
                out.write(self.doc.head)

            file_extension = 'html'
            file_name = os.path.join(
                self.config['log_dir'], '%02d%s.%s' %
                (self.request_counter, thread_name, file_extension))
            self.doc.save(file_name)

    def make_url_absolute(self, url, resolve_base=False):
        """
        Make url absolute using previous request url as base url.
        """

        if self.config['url']:
            if resolve_base:
                ubody = self.doc.unicode_body()
                base_url = find_base_url(ubody)
                if base_url:
                    return urljoin(base_url, url)
            return urljoin(self.config['url'], url)
        else:
            return url

    def detect_request_method(self):
        """
        Analyze request config and find which
        request method will be used.

        Returns request method in upper case

        This method is needed sometimes when the `process_config`
        method has not been called yet.
        """

        method = self.config['method']
        if method:
            method = method.upper()
        else:
            if self.config['post'] or self.config['multipart_post']:
                method = 'POST'
            else:
                method = 'GET'
        return method

    def clear_cookies(self):
        """
        Clear all remembered cookies.
        """

        self.config['cookies'] = {}
        self.cookies.clear()

    def setup_with_proxyline(self, line, proxy_type='http'):
        # TODO: remove from base class
        # maybe to proxylist?
        host, port, user, pwd = parse_proxy_line(line)
        server_port = '%s:%s' % (host, port)
        self.setup(proxy=server_port, proxy_type=proxy_type)
        if user:
            userpwd = '%s:%s' % (user, pwd)
            self.setup(proxy_userpwd=userpwd)

    def __getstate__(self):
        """
        Reset cached lxml objects which could not be pickled.
        """
        state = {}
        for cls in type(self).mro():
            cls_slots = getattr(cls, '__slots__', ())
            for slot in cls_slots:
                if slot != '__weakref__':
                    if hasattr(self, slot):
                        state[slot] = getattr(self, slot)

        state['_lxml_form'] = None

        if state['_doc']:
            state['_doc'].grab = weakref.proxy(self)

        return state

    def __setstate__(self, state):
        for slot, value in state.items():
            setattr(self, slot, value)

    @property
    def request_headers(self):
        """
        Temporary hack until I understand
        where to store request details.
        """

        try:
            first_head = self.request_head.split('\r\n\r\n')[0]
            lines = first_head.split('\r\n')
            lines = [x for x in lines if ':' in x]
            headers = email.message_from_string('\n'.join(lines))
            return headers
        except Exception as ex:
            logging.error('Could not parse request headers', exc_info=ex)
            return {}

    @property
    def kit(self):
        """
        Return KitInterface object that provides some
        methods to communicate with Kit transport.
        """

        if not self._kit:
            self._kit = GrabKitInterface(self)
        return self._kit
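The class above pairs clone() with adopt() for snapshotting and restoring scraper state. A hedged usage sketch using only methods defined in this example (URLs are placeholders):

    g = Grab()
    g.go('http://example.com/login')    # placeholder URL
    backup = g.clone()                  # snapshot config, cookies, request data
    g.go('http://example.com/fragile')  # placeholder URL
    g.adopt(backup)                     # roll the instance back to the snapshot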
Code example #25
File: grab_cookies.py Project: julia-bikova/grab
 def test_get_item(self):
     cookie = create_cookie('foo', 'bar')
     mgr = CookieManager.from_cookie_list([cookie])
     self.assertEqual('bar', mgr['foo'])
     self.assertRaises(KeyError, lambda: mgr['zzz'])
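Code examples #23 and #25 show that CookieManager supports dict-style lookup by cookie name and raises KeyError for unknown names. A defensive lookup sketch built on that behavior (the import path and cookie values are assumptions):

    from grab.cookie import CookieManager, create_cookie  # assumed import path

    mgr = CookieManager.from_cookie_list(
        [create_cookie('session', 'abc123', 'example.com')])  # placeholder values
    try:
        token = mgr['session']
    except KeyError:
        token = None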
Code example #26
    def prepare_response(self, grab):
        # A note about urllib3:
        # on Python 2, urllib3 headers contain the original binary data;
        # on Python 3, urllib3 headers are converted to unicode
        # using the latin-1 encoding
        try:
            #if self.body_file:
            #    self.body_file.close()
            response = Document()

            head = ''
            for key, val in self._response.getheaders().items():
                if six.PY2:
                    key = key.decode('utf-8', errors='ignore')
                    val = val.decode('utf-8', errors='ignore')
                if six.PY3:
                    key = key.encode('latin').decode('utf-8', errors='ignore')
                    val = val.encode('latin').decode('utf-8', errors='ignore')
                head += '%s: %s\r\n' % (key, val)
            head += '\r\n'
            response.head = make_str(head, encoding='utf-8')

            #if self.body_path:
            #    response.body_path = self.body_path
            #else:
            #    response.body = b''.join(self.response_body_chunks)
            def read_with_timeout():
                if self._request.config_nobody:
                    return b''
                maxsize = self._request.config_body_maxsize
                chunks = []
                default_chunk_size = 10000
                if maxsize:
                    chunk_size = min(default_chunk_size, maxsize + 1)
                else:
                    chunk_size = default_chunk_size
                bytes_read = 0
                while True:
                    chunk = self._response.read(chunk_size)
                    if chunk:
                        bytes_read += len(chunk)
                        chunks.append(chunk)
                        if maxsize and bytes_read > maxsize:
                            # reached limit on bytes to read
                            break
                    else:
                        break
                    if self._request.timeout:
                        if (time.time() - self._request.op_started >
                                self._request.timeout):
                            raise GrabTimeoutError
                data = b''.join(chunks)
                if maxsize:
                    data = data[:maxsize]
                return data

            if self._request.response_path:
                response.body_path = self._request.response_path
                # FIXME: Quick dirty hack; actually, the response is fully
                # read into memory
                self._request.response_file.write(read_with_timeout())
                self._request.response_file.close()
            else:
                response.body = read_with_timeout()

            # Clear memory
            #self.response_header_chunks = []

            response.code = self._response.status
            #response.total_time = self.curl.getinfo(pycurl.TOTAL_TIME)
            #response.connect_time = self.curl.getinfo(pycurl.CONNECT_TIME)
            #response.name_lookup_time = (self.curl
            #                             .getinfo(pycurl.NAMELOOKUP_TIME))
            #response.download_size = self.curl.getinfo(pycurl.SIZE_DOWNLOAD)
            #response.upload_size = self.curl.getinfo(pycurl.SIZE_UPLOAD)
            #response.download_speed = self.curl.getinfo(pycurl.SPEED_DOWNLOAD)
            #response.remote_ip = self.curl.getinfo(pycurl.PRIMARY_IP)

            response.url = (self._response.get_redirect_location()
                            or self._request.url)

            import email.message
            hdr = email.message.Message()
            for key, val in self._response.getheaders().items():
                if six.PY2:
                    key = key.decode('utf-8', errors='ignore')
                    val = val.decode('utf-8', errors='ignore')
                if six.PY3:
                    key = key.encode('latin').decode('utf-8', errors='ignore')
                    val = val.encode('latin').decode('utf-8', errors='ignore')
                hdr[key] = val
            response.parse(charset=grab.config['document_charset'],
                           headers=hdr)

            jar = self.extract_cookiejar()
            response.cookies = CookieManager(jar)

            # We no longer need the cookies stored in the curl
            # instance, so drop them
            #self.curl.setopt(pycurl.COOKIELIST, 'ALL')
            return response
        finally:
            self._response.release_conn()
Code example #27
File: base.py Project: lorien/grab
class Grab(DeprecatedThings):

    __slots__ = (
        'request_head', 'request_body',
        #'request_log',
        'proxylist', 'config',
        'transport',
        'transport_param', 'request_method', 'request_counter',
        '__weakref__', 'cookies',
        'meta', 'exception',

        # Dirty hack to make it possible to inherit Grab from
        # multiple base classes with __slots__
        '_doc',
    )

    # Attributes which should be processed when a clone
    # of a Grab instance is being created
    clonable_attributes = ('request_head', 'request_body',
                           #'request_log',
                           'proxylist')

    # Complex config items which point to mutable objects
    mutable_config_keys = copy(MUTABLE_CONFIG_KEYS)

    #
    # Public methods
    #

    def __init__(self, document_body=None,
                 transport=None, **kwargs):
        """
        Create Grab instance
        """

        self.meta = {}
        self._doc = None
        self.config = default_config()
        self.config['common_headers'] = self.common_headers()
        self.cookies = CookieManager()
        self.proxylist = ProxyList()
        self.exception = None

        # makes pylint happy
        self.request_counter = None
        self.request_head = None
        self.request_body = None
        self.request_method = None
        self.transport_param = transport
        self.transport = None

        self.reset()
        if kwargs:
            self.setup(**kwargs)
        if document_body is not None:
            self.setup_document(document_body)

    def _get_doc(self):
        if self._doc is None:
            self._doc = Document(self)
        return self._doc

    def _set_doc(self, obj):
        self._doc = obj

    doc = property(_get_doc, _set_doc)

    def setup_transport(self, transport_param, reset=False):
        if self.transport is not None and not reset:
            raise error.GrabMisuseError(
                'Transport is already set up. Use'
                ' setup_transport(..., reset=True) to explicitly setup'
                ' new transport')
        if transport_param is None:
            transport_param = DEFAULT_TRANSPORT
        if isinstance(transport_param, six.string_types):
            if transport_param in TRANSPORT_ALIAS:
                transport_param = TRANSPORT_ALIAS[transport_param]
            if '.' not in transport_param:
                raise error.GrabMisuseError('Unknown transport: %s'
                                            % transport_param)
            else:
                mod_path, cls_name = transport_param.rsplit('.', 1)
                try:
                    cls = TRANSPORT_CACHE[(mod_path, cls_name)]
                except KeyError:
                    mod = __import__(mod_path, globals(), locals(), ['foo'])
                    cls = getattr(mod, cls_name)
                    TRANSPORT_CACHE[(mod_path, cls_name)] = cls
                self.transport = cls()
        elif isinstance(transport_param, collections.Callable):
            self.transport = transport_param()
        else:
            raise error.GrabMisuseError('Option `transport` should be string '
                                        'or class or callable. Got %s'
                                        % type(transport_param))

    def reset(self):
        """
        Reset all attributes which could be modified during the previous
        request or which are not initialized yet if this is a new Grab
        instance.

        This method is automatically called before each network request.
        """

        self.request_head = None
        #self.request_log = None
        self.request_body = None
        self.request_method = None
        self.request_counter = None
        self.exception = None
        if self.transport:
            self.transport.reset()

    def clone(self, **kwargs):
        """
        Create clone of Grab instance.

        The cloned instance will have the same state: cookies, referrer,
        response document data.

        :param **kwargs: overrides settings of cloned grab instance
        """

        grab = Grab(transport=self.transport_param)
        grab.config = self.dump_config()

        grab.doc = self.doc.copy()
        #grab.doc.grab = weakref.proxy(grab)

        for key in self.clonable_attributes:
            setattr(grab, key, getattr(self, key))
        grab.cookies = deepcopy(self.cookies)

        if kwargs:
            grab.setup(**kwargs)

        return grab

    def adopt(self, grab):
        """
        Copy the state of another `Grab` instance.

        Use case: create backup of current state to the cloned instance and
        then restore the state from it.
        """

        self.load_config(grab.config)

        self.doc = grab.doc.copy(new_grab=self)

        for key in self.clonable_attributes:
            setattr(self, key, getattr(grab, key))
        self.cookies = deepcopy(grab.cookies)

    def dump_config(self):
        """
        Make clone of current config.
        """

        conf = copy_config(self.config, self.mutable_config_keys)
        conf['state'] = {
            'cookiejar_cookies': list(self.cookies.cookiejar),
        }
        return conf

    def load_config(self, config):
        """
        Configure grab instance with external config object.
        """

        self.config = copy_config(config, self.mutable_config_keys)
        if 'cookiejar_cookies' in config['state']:
            self.cookies = CookieManager.from_cookie_list(
                config['state']['cookiejar_cookies'])

    def setup(self, **kwargs):
        """
        Setting up Grab instance configuration.
        """

        for key in kwargs:
            if key not in self.config.keys():
                raise error.GrabMisuseError('Unknown option: %s' % key)

        if 'url' in kwargs:
            if self.config.get('url'):
                kwargs['url'] = self.make_url_absolute(kwargs['url'])
        self.config.update(kwargs)

    def go(self, url, **kwargs): # pylint: disable=invalid-name
        """
        Go to ``url``

        Args:
            :url: could be absolute or relative. If relative, it will be
            resolved against the absolute URL of the previous request.
        """

        return self.request(url=url, **kwargs)

    def download(self, url, location, **kwargs):
        """
        Fetch the document located at ``url`` and save it to ``location``.
        """

        doc = self.go(url, **kwargs)
        with open(location, 'wb') as out:
            out.write(doc.body)
        return len(doc.body)

    def prepare_request(self, **kwargs):
        """
        Configure all things to make real network request.
        This method is called before doing real request via
        transport extension.
        """

        if self.transport is None:
            self.setup_transport(self.transport_param)
        self.reset()
        self.request_counter = next(REQUEST_COUNTER)
        if kwargs:
            self.setup(**kwargs)
        if self.proxylist.size() and self.config['proxy_auto_change']:
            self.change_proxy()
        self.request_method = self.detect_request_method()
        self.transport.process_config(self)

    def log_request(self, extra=''):
        """
        Send request details to logging system.
        """

        # pylint: disable=no-member
        thread_name = threading.currentThread().getName().lower()
        # pylint: enable=no-member
        if thread_name == 'mainthread':
            thread_name = ''
        else:
            thread_name = '-%s' % thread_name

        if self.config['proxy']:
            if self.config['proxy_userpwd']:
                auth = ' with authorization'
            else:
                auth = ''
            proxy_info = ' via %s proxy of type %s%s' % (
                self.config['proxy'], self.config['proxy_type'], auth)
        else:
            proxy_info = ''
        if extra:
            extra = '[%s] ' % extra
        logger_network.debug(
            '[%s%s] %s%s %s%s',
            ('%02d' % self.request_counter
             if self.request_counter is not None else 'NA'),
            thread_name,
            extra, self.request_method or 'GET',
            self.config['url'], proxy_info)

    def request(self, **kwargs):
        """
        Perform network request.

        You can specify grab settings in ``**kwargs``.
        Any keyword argument will be passed to ``self.config``.

        Returns: ``Document`` object.
        """

        self.prepare_request(**kwargs)
        refresh_count = 0

        while True:
            self.log_request()

            try:
                self.transport.request()
            except error.GrabError as ex:
                self.exception = ex
                self.reset_temporary_options()
                if self.config['log_dir']:
                    self.save_failed_dump()
                raise
            else:
                doc = self.process_request_result()

                if self.config['follow_location']:
                    if doc.code in (301, 302, 303, 307, 308):
                        if doc.headers.get('Location'):
                            refresh_count += 1
                            if refresh_count > self.config['redirect_limit']:
                                raise error.GrabTooManyRedirectsError()
                            else:
                                url = doc.headers.get('Location')
                                self.prepare_request(
                                    url=self.make_url_absolute(url),
                                    referer=None)
                                continue

                if self.config['follow_refresh']:
                    refresh_url = self.doc.get_meta_refresh_url()
                    if refresh_url is not None:
                        refresh_count += 1
                        if refresh_count > self.config['redirect_limit']:
                            raise error.GrabTooManyRedirectsError()
                        else:
                            self.prepare_request(
                                url=self.make_url_absolute(refresh_url),
                                referer=None)
                            continue
                return doc

    def submit(self, make_request=True, **kwargs):
        """
        Submit current form.

        :param make_request: if `False`, the grab instance will be
            configured with the form post data but the request will not
            be performed

        For details see `Document.submit()` method

        Example::

            # Assume that we are going to some page with some form
            g.go('some url')
            # Fill some fields
            g.doc.set_input('username', 'bob')
            g.doc.set_input('pwd', '123')
            # Submit the form
            g.submit()

            # or we can just fill the form
            # and do manual submission
            g.doc.set_input('foo', 'bar')
            g.submit(make_request=False)
            g.request()

            # for multipart forms we can specify files
            from grab import UploadFile
            g.doc.set_input('img', UploadFile('/path/to/image.png'))
            g.submit()
        """
        result = self.doc.get_form_request(**kwargs)
        if result['multipart_post']:
            self.setup(multipart_post=result['multipart_post'])
        if result['post']:
            self.setup(post=result['post'])
        if result['url']:
            self.setup(url=result['url'])
        if make_request:
            return self.request()
        else:
            return None

    def process_request_result(self, prepare_response_func=None):
        """
        Process result of real request performed via transport extension.
        """

        now = datetime.utcnow()
        # TODO: move into separate method
        if self.config['debug_post']:
            post = self.config['post'] or self.config['multipart_post']
            if isinstance(post, dict):
                post = list(post.items())
            if post:
                if isinstance(post, six.string_types):
                    post = make_str(post[:self.config['debug_post_limit']],
                                    errors='ignore') + b'...'
                else:
                    items = normalize_http_values(
                        post, charset=self.config['charset'])
                    new_items = []
                    for key, value in items:
                        if len(value) > self.config['debug_post_limit']:
                            value = value[
                                :self.config['debug_post_limit']] + b'...'
                        new_items.append((key, value))
                    post = '\n'.join('%-25s: %s' % x for x in new_items)
            if post:
                logger_network.debug('[%02d] POST request:\n%s\n',
                                     self.request_counter, post)

        # It's important to delete old POST data after the request is
        # performed. If the POST data is not cleared then the next
        # request will try to use it again!
        self.reset_temporary_options()

        if prepare_response_func:
            self.doc = prepare_response_func(self.transport, self)
        else:
            self.doc = self.transport.prepare_response(self)

        self.doc.process_grab(self)

        if self.config['reuse_cookies']:
            self.cookies.update(self.doc.cookies)

        self.doc.timestamp = now

        self.config['charset'] = self.doc.charset

        if self.config['log_file']:
            with open(self.config['log_file'], 'wb') as out:
                out.write(self.doc.body)

        if self.config['cookiefile']:
            self.cookies.save_to_file(self.config['cookiefile'])

        if self.config['reuse_referer']:
            self.config['referer'] = self.doc.url

        self.copy_request_data()

        # Should be called after `copy_request_data`
        if self.config['log_dir']:
            self.save_dumps()

        return self.doc

    def reset_temporary_options(self):
        self.config['post'] = None
        self.config['multipart_post'] = None
        self.config['method'] = None
        self.config['body_storage_filename'] = None

    def save_failed_dump(self):
        """
        Save dump of failed request for debugging.

        This method is called when a fatal network exception is raised.
        The saved dump can be used for debugging the cause of the failure.
        """

        # try/except for safety, to not break live spiders
        try:
            # FIXME
            if (self.transport.__class__.__name__ == 'Urllib3Transport'
                    and not getattr(self.transport, '_response', None)):
                self.doc = None
            else:
                self.doc = self.transport.prepare_response(self)
            self.copy_request_data()
            self.save_dumps()
        except Exception as ex: # pylint: disable=broad-except
            logger.error('', exc_info=ex)

    def copy_request_data(self):
        # TODO: Maybe request object?
        self.request_head = self.transport.request_head
        self.request_body = self.transport.request_body
        #self.request_log = self.transport.request_log

    def setup_document(self, content, **kwargs):
        """
        Set up the `response` object without real network requests.

        Useful for testing and debugging.

        All ``**kwargs`` will be passed to `Document` constructor.
        """

        self.reset()
        if isinstance(content, six.text_type):
            raise error.GrabMisuseError('Method `setup_document` accepts only '
                                        'byte string in `content` argument.')

        # Configure Document instance
        doc = Document(grab=self)
        doc.body = content
        doc.status = ''
        doc.head = b'HTTP/1.1 200 OK\r\n\r\n'
        doc.parse(charset=kwargs.get('document_charset'))
        doc.code = 200
        doc.total_time = 0
        doc.connect_time = 0
        doc.name_lookup_time = 0
        doc.url = ''

        for key, value in kwargs.items():
            setattr(doc, key, value)

        self.doc = doc

    def change_proxy(self, random=True):
        """
        Set a random (or the next) proxy from the proxylist.
        """

        if self.proxylist.size():
            if random:
                proxy = self.proxylist.get_random_proxy()
            else:
                proxy = self.proxylist.get_next_proxy()
            self.setup(proxy=proxy.get_address(),
                       proxy_userpwd=proxy.get_userpwd(),
                       proxy_type=proxy.proxy_type)
        else:
            logger.debug('Proxy list is empty')

    #
    # Private methods
    #

    @classmethod
    def common_headers(cls):
        """
        Build headers which a typical browser sends.
        """

        return {
            'Accept': 'text/xml,application/xml,application/xhtml+xml'
                      ',text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.%d'
                      % randint(2, 5),
            'Accept-Language': 'en-us,en;q=0.%d' % (randint(5, 9)),
            'Accept-Charset': 'utf-8,windows-1251;q=0.7,*;q=0.%d'
                              % randint(5, 7),
            'Keep-Alive': '300',
        }

    def save_dumps(self):
        # pylint: disable=no-member
        thread_name = threading.currentThread().getName().lower()
        # pylint: enable=no-member
        if thread_name == 'mainthread':
            thread_name = ''
        else:
            thread_name = '-%s' % thread_name
        file_name = os.path.join(self.config['log_dir'], '%02d%s.log' % (
            self.request_counter, thread_name))
        with open(file_name, 'wb') as out:
            out.write(b'Request headers:\n')
            out.write(self.request_head)
            out.write(b'\n')
            out.write(b'Request body:\n')
            out.write(self.request_body)
            out.write(b'\n\n')
            out.write(b'Response headers:\n')
            out.write(self.doc.head if (self.doc and self.doc.head)
                      else b'')

        file_extension = 'html'
        file_name = os.path.join(self.config['log_dir'], '%02d%s.%s' % (
            self.request_counter, thread_name, file_extension))
        self.doc.save(file_name)

    def make_url_absolute(self, url, resolve_base=False):
        """
        Make url absolute using previous request url as base url.
        """

        if self.config['url']:
            if resolve_base:
                ubody = self.doc.unicode_body()
                base_url = find_base_url(ubody)
                if base_url:
                    return urljoin(base_url, url)
            return urljoin(self.config['url'], url)
        else:
            return url

    def detect_request_method(self):
        """
        Analyze request config and find which
        request method will be used.

        Returns the request method in upper case.

        This method is needed sometimes when the `process_config`
        method has not been called yet.
        """

        method = self.config['method']
        if method:
            method = method.upper()
        else:
            if self.config['post'] or self.config['multipart_post']:
                method = 'POST'
            else:
                method = 'GET'
        return method

    def clear_cookies(self):
        """
        Clear all remembered cookies.
        """

        self.config['cookies'] = {}
        self.cookies.clear()

    def setup_with_proxyline(self, line, proxy_type='http'):
        # TODO: remove from base class
        # maybe to proxylist?
        host, port, user, pwd = parse_proxy_line(line)
        server_port = '%s:%s' % (host, port)
        self.setup(proxy=server_port, proxy_type=proxy_type)
        if user:
            userpwd = '%s:%s' % (user, pwd)
            self.setup(proxy_userpwd=userpwd)

    def __getstate__(self):
        """
        Reset cached lxml objects which cannot be pickled.
        """
        state = {}
        for cls in type(self).mro():
            cls_slots = getattr(cls, '__slots__', ())
            for slot in cls_slots:
                if slot != '__weakref__':
                    if hasattr(self, slot):
                        state[slot] = getattr(self, slot)

        if state['_doc']:
            state['_doc'].grab = weakref.proxy(self)

        return state

    def __setstate__(self, state):
        for slot, value in state.items():
            setattr(self, slot, value)

    @property
    def request_headers(self):
        first_head = self.request_head.decode('utf-8').split('\r\n\r\n')[0]
        lines = first_head.split('\r\n')
        lines = [x for x in lines if ':' in x]
        return email.message_from_string('\n'.join(lines))
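
A minimal usage sketch of the redirect/refresh loop implemented above. This is hedged: it assumes the import paths `from grab import Grab` and `from grab.error import GrabTooManyRedirectsError`, and the URL is a placeholder.

from grab import Grab
from grab.error import GrabTooManyRedirectsError

# follow_location / follow_refresh / redirect_limit map onto the config
# keys checked inside Grab.request() above
g = Grab(follow_location=True, follow_refresh=True, redirect_limit=5)
try:
    doc = g.go('http://example.com/start')  # placeholder URL
    print(doc.code, doc.url)
except GrabTooManyRedirectsError:
    print('gave up: redirect_limit exceeded')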
Code example #28
class Grab(DeprecatedThings):

    __slots__ = (
        'request_head', 'request_body',
        #'request_log',
        'proxylist', 'config',
        'transport',
        'transport_param', 'request_method', 'request_counter',
        '__weakref__', 'cookies',
        'meta', 'exception',

        # Dirty hack to make it possible to inherit Grab from
        # multiple base classes with __slots__
        '_doc',
    )

    # Attributes which should be processed when clone
    # of Grab instance is creating
    clonable_attributes = ('request_head', 'request_body',
                           #'request_log',
                           'proxylist')

    # Complex config items which points to mutable objects
    mutable_config_keys = copy(MUTABLE_CONFIG_KEYS)

    #
    # Public methods
    #

    def __init__(self, document_body=None,
                 transport=None, **kwargs):
        """
        Create Grab instance
        """

        self.meta = {}
        self._doc = None
        self.config = default_config()
        self.config['common_headers'] = self.common_headers()
        self.cookies = CookieManager()
        self.proxylist = ProxyList()
        self.exception = None

        # makes pylint happy
        self.request_counter = None
        self.request_head = None
        self.request_body = None
        self.request_method = None
        self.transport_param = transport
        self.transport = None

        self.reset()
        if kwargs:
            self.setup(**kwargs)
        if document_body is not None:
            self.setup_document(document_body)

    def _get_doc(self):
        if self._doc is None:
            self._doc = Document(self)
        return self._doc

    def _set_doc(self, obj):
        self._doc = obj

    doc = property(_get_doc, _set_doc)

    def setup_transport(self, transport_param, reset=False):
        if self.transport is not None and not reset:
            raise error.GrabMisuseError(
                'Transport is already set up. Use'
                ' setup_transport(..., reset=True) to explicitly setup'
                ' new transport')
        if transport_param is None:
            transport_param = DEFAULT_TRANSPORT
        if isinstance(transport_param, six.string_types):
            if transport_param in TRANSPORT_ALIAS:
                transport_param = TRANSPORT_ALIAS[transport_param]
            if '.' not in transport_param:
                raise error.GrabMisuseError('Unknown transport: %s'
                                            % transport_param)
            else:
                mod_path, cls_name = transport_param.rsplit('.', 1)
                try:
                    cls = TRANSPORT_CACHE[(mod_path, cls_name)]
                except KeyError:
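                    # a non-empty "fromlist" makes __import__ return the
                    # leaf module itself rather than the top-level package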
                    mod = __import__(mod_path, globals(), locals(), ['foo'])
                    cls = getattr(mod, cls_name)
                    TRANSPORT_CACHE[(mod_path, cls_name)] = cls
                self.transport = cls()
        elif isinstance(transport_param, collections.Callable):
            self.transport = transport_param()
        else:
            raise error.GrabMisuseError('Option `transport` should be string '
                                        'or class or callable. Got %s'
                                        % type(transport_param))

    def reset(self):
        """
        Reset all attributes which could be modified during the previous
        request or which are not initialized yet if this is a new Grab
        instance.

        This method is automatically called before each network request.
        """

        self.request_head = None
        #self.request_log = None
        self.request_body = None
        self.request_method = None
        self.request_counter = None
        self.exception = None
        if self.transport:
            self.transport.reset()

    def clone(self, **kwargs):
        """
        Create clone of Grab instance.

        The cloned instance will have the same state: cookies, referrer,
        and response document data.

        :param **kwargs: overrides settings of the cloned Grab instance
        """

        grab = Grab(transport=self.transport_param)
        grab.config = self.dump_config()

        grab.doc = self.doc.copy()
        #grab.doc.grab = weakref.proxy(grab)

        for key in self.clonable_attributes:
            setattr(grab, key, getattr(self, key))
        grab.cookies = deepcopy(self.cookies)

        if kwargs:
            grab.setup(**kwargs)

        return grab

    def adopt(self, grab):
        """
        Copy the state of another `Grab` instance.

        Use case: create a backup of the current state in a cloned
        instance and then restore the state from it.
        """

        self.load_config(grab.config)

        self.doc = grab.doc.copy(new_grab=self)

        for key in self.clonable_attributes:
            setattr(self, key, getattr(grab, key))
        self.cookies = deepcopy(grab.cookies)

    def dump_config(self):
        """
        Make clone of current config.
        """

        conf = copy_config(self.config, self.mutable_config_keys)
        conf['state'] = {
            'cookiejar_cookies': list(self.cookies.cookiejar),
        }
        return conf

    def load_config(self, config):
        """
        Configure grab instance with external config object.
        """

        self.config = copy_config(config, self.mutable_config_keys)
        if 'cookiejar_cookies' in config['state']:
            self.cookies = CookieManager.from_cookie_list(
                config['state']['cookiejar_cookies'])

    def setup(self, **kwargs):
        """
        Setting up Grab instance configuration.
        """

        for key in kwargs:
            if key not in self.config.keys():
                raise error.GrabMisuseError('Unknown option: %s' % key)

        if 'url' in kwargs:
            if self.config.get('url'):
                kwargs['url'] = self.make_url_absolute(kwargs['url'])
        self.config.update(kwargs)

    def go(self, url, **kwargs): # pylint: disable=invalid-name
        """
        Go to ``url``

        Args:
            :url: could be absolute or relative. If relative then it will
            be resolved against the absolute URL of the previous request.
        """

        return self.request(url=url, **kwargs)

    def download(self, url, location, **kwargs):
        """
        Fetch the document located at ``url`` and save it to ``location``.
        """

        doc = self.go(url, **kwargs)
        with open(location, 'wb') as out:
            out.write(doc.body)
        return len(doc.body)

    def prepare_request(self, **kwargs):
        """
        Configure everything needed to make a real network request.
        This method is called before performing the real request via
        the transport extension.
        """

        if self.transport is None:
            self.setup_transport(self.transport_param)
        self.reset()
        self.request_counter = next(REQUEST_COUNTER)
        if kwargs:
            self.setup(**kwargs)
        if self.proxylist.size() and self.config['proxy_auto_change']:
            self.change_proxy()
        self.request_method = self.detect_request_method()
        self.transport.process_config(self)

    def log_request(self, extra=''):
        """
        Send request details to logging system.
        """

        # pylint: disable=no-member
        thread_name = threading.currentThread().getName().lower()
        # pylint: enable=no-member
        if thread_name == 'mainthread':
            thread_name = ''
        else:
            thread_name = '-%s' % thread_name

        if self.config['proxy']:
            if self.config['proxy_userpwd']:
                auth = ' with authorization'
            else:
                auth = ''
            proxy_info = ' via %s proxy of type %s%s' % (
                self.config['proxy'], self.config['proxy_type'], auth)
        else:
            proxy_info = ''
        if extra:
            extra = '[%s] ' % extra
        logger_network.debug(
            '[%s%s] %s%s %s%s',
            ('%02d' % self.request_counter
             if self.request_counter is not None else 'NA'),
            thread_name,
            extra, self.request_method or 'GET',
            self.config['url'], proxy_info)

    def request(self, **kwargs):
        """
        Perform network request.

        You can specify grab settings in ``**kwargs``.
        Any keyword argument will be passed to ``self.config``.

        Returns: a ``Document`` object.
        """

        self.prepare_request(**kwargs)
        refresh_count = 0

        while True:
            self.log_request()

            try:
                self.transport.request()
            except error.GrabError as ex:
                self.exception = ex
                self.reset_temporary_options()
                if self.config['log_dir']:
                    self.save_failed_dump()
                raise
            else:
                doc = self.process_request_result()

                if self.config['follow_location']:
                    if doc.code in (301, 302, 303, 307, 308):
                        if doc.headers.get('Location'):
                            refresh_count += 1
                            if refresh_count > self.config['redirect_limit']:
                                raise error.GrabTooManyRedirectsError()
                            else:
                                url = doc.headers.get('Location')
                                self.prepare_request(
                                    url=self.make_url_absolute(url),
                                    referer=None)
                                continue

                if self.config['follow_refresh']:
                    refresh_url = self.doc.get_meta_refresh_url()
                    if refresh_url is not None:
                        refresh_count += 1
                        if refresh_count > self.config['redirect_limit']:
                            raise error.GrabTooManyRedirectsError()
                        else:
                            self.prepare_request(
                                url=self.make_url_absolute(refresh_url),
                                referer=None)
                            continue
                return doc

    def submit(self, make_request=True, **kwargs):
        """
        Submit current form.

        :param make_request: if `False` then the Grab instance will be
            configured with the form's POST data but the request will not
            be performed

        For details, see the `Document.submit()` method.

        Example::

            # Assume that we are going to some page with some form
            g.go('some url')
            # Fill some fields
            g.doc.set_input('username', 'bob')
            g.doc.set_input('pwd', '123')
            # Submit the form
            g.submit()

            # or we can just fill the form
            # and do manual submission
            g.doc.set_input('foo', 'bar')
            g.submit(make_request=False)
            g.request()

            # for multipart forms we can specify files
            from grab import UploadFile
            g.doc.set_input('img', UploadFile('/path/to/image.png'))
            g.submit()
        """
        result = self.doc.get_form_request(**kwargs)
        if result['multipart_post']:
            self.setup(multipart_post=result['multipart_post'])
        if result['post']:
            self.setup(post=result['post'])
        if result['url']:
            self.setup(url=result['url'])
        if make_request:
            return self.request()
        else:
            return None

    def process_request_result(self, prepare_response_func=None):
        """
        Process the result of the real request performed via the
        transport extension.
        """

        now = datetime.utcnow()
        # TODO: move into separate method
        if self.config['debug_post']:
            post = self.config['post'] or self.config['multipart_post']
            if isinstance(post, dict):
                post = list(post.items())
            if post:
                if isinstance(post, six.string_types):
                    post = make_str(post[:self.config['debug_post_limit']],
                                    errors='ignore') + b'...'
                else:
                    items = normalize_http_values(
                        post, charset=self.config['charset'])
                    new_items = []
                    for key, value in items:
                        if len(value) > self.config['debug_post_limit']:
                            value = value[
                                :self.config['debug_post_limit']] + b'...'
                        new_items.append((key, value))
                    post = '\n'.join('%-25s: %s' % x for x in new_items)
            if post:
                logger_network.debug('[%02d] POST request:\n%s\n',
                                     self.request_counter, post)

        # It's important to delete old POST data after the request is
        # performed. If POST data is not cleared then the next request
        # will try to use it again!
        self.reset_temporary_options()

        if prepare_response_func:
            self.doc = prepare_response_func(self.transport, self)
        else:
            self.doc = self.transport.prepare_response(self)

        self.doc.process_grab(self)

        if self.config['reuse_cookies']:
            self.cookies.update(self.doc.cookies)

        self.doc.timestamp = now

        self.config['charset'] = self.doc.charset

        if self.config['log_file']:
            with open(self.config['log_file'], 'wb') as out:
                out.write(self.doc.body)

        if self.config['cookiefile']:
            self.cookies.save_to_file(self.config['cookiefile'])

        if self.config['reuse_referer']:
            self.config['referer'] = self.doc.url

        self.copy_request_data()

        # Should be called after `copy_request_data`
        if self.config['log_dir']:
            self.save_dumps()

        return self.doc

    def reset_temporary_options(self):
        self.config['post'] = None
        self.config['multipart_post'] = None
        self.config['method'] = None
        self.config['body_storage_filename'] = None

    def save_failed_dump(self):
        """
        Save dump of failed request for debugging.

        This method is called when a fatal network exception is raised.
        The saved dump can be used for debugging the cause of the failure.
        """

        # try/except for safety, to not break live spiders
        try:
            # FIXME
            if (self.transport.__class__.__name__ == 'Urllib3Transport'
                    and not getattr(self.transport, '_response', None)):
                self.doc = None
            else:
                self.doc = self.transport.prepare_response(self)
            self.copy_request_data()
            self.save_dumps()
        except Exception as ex: # pylint: disable=broad-except
            logger.error('', exc_info=ex)

    def copy_request_data(self):
        # TODO: Maybe request object?
        self.request_head = self.transport.request_head
        self.request_body = self.transport.request_body
        #self.request_log = self.transport.request_log

    def setup_document(self, content, **kwargs):
        """
        Set up the `response` object without real network requests.

        Useful for testing and debugging.

        All ``**kwargs`` will be passed to `Document` constructor.
        """

        self.reset()
        if isinstance(content, six.text_type):
            raise error.GrabMisuseError('Method `setup_document` accepts only '
                                        'byte string in `content` argument.')

        # Configure Document instance
        doc = Document(grab=self)
        doc.body = content
        doc.status = ''
        doc.head = b'HTTP/1.1 200 OK\r\n\r\n'
        doc.parse(charset=kwargs.get('document_charset'))
        doc.code = 200
        doc.total_time = 0
        doc.connect_time = 0
        doc.name_lookup_time = 0
        doc.url = ''

        for key, value in kwargs.items():
            setattr(doc, key, value)

        self.doc = doc

    def change_proxy(self, random=True):
        """
        Set a random (or the next) proxy from the proxylist.
        """

        if self.proxylist.size():
            if random:
                proxy = self.proxylist.get_random_proxy()
            else:
                proxy = self.proxylist.get_next_proxy()
            self.setup(proxy=proxy.get_address(),
                       proxy_userpwd=proxy.get_userpwd(),
                       proxy_type=proxy.proxy_type)
        else:
            logger.debug('Proxy list is empty')

    #
    # Private methods
    #

    @classmethod
    def common_headers(cls):
        """
        Build headers which a typical browser sends.
        """

        return {
            'Accept': 'text/xml,application/xml,application/xhtml+xml'
                      ',text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.%d'
                      % randint(2, 5),
            'Accept-Language': 'en-us,en;q=0.%d' % (randint(5, 9)),
            'Accept-Charset': 'utf-8,windows-1251;q=0.7,*;q=0.%d'
                              % randint(5, 7),
            'Keep-Alive': '300',
        }

    def save_dumps(self):
        # pylint: disable=no-member
        thread_name = threading.currentThread().getName().lower()
        # pylint: enable=no-member
        if thread_name == 'mainthread':
            thread_name = ''
        else:
            thread_name = '-%s' % thread_name
        file_name = os.path.join(self.config['log_dir'], '%02d%s.log' % (
            self.request_counter, thread_name))
        with open(file_name, 'wb') as out:
            out.write(b'Request headers:\n')
            out.write(self.request_head)
            out.write(b'\n')
            out.write(b'Request body:\n')
            out.write(self.request_body)
            out.write(b'\n\n')
            out.write(b'Response headers:\n')
            out.write(self.doc.head if (self.doc and self.doc.head)
                      else b'')

        file_extension = 'html'
        file_name = os.path.join(self.config['log_dir'], '%02d%s.%s' % (
            self.request_counter, thread_name, file_extension))
        self.doc.save(file_name)

    def make_url_absolute(self, url, resolve_base=False):
        """
        Make url absolute using previous request url as base url.
        """

        if self.config['url']:
            if resolve_base:
                ubody = self.doc.unicode_body()
                base_url = find_base_url(ubody)
                if base_url:
                    return urljoin(base_url, url)
            return urljoin(self.config['url'], url)
        else:
            return url

    def detect_request_method(self):
        """
        Analyze request config and find which
        request method will be used.

        Returns the request method in upper case.

        This method is needed sometimes when the `process_config`
        method has not been called yet.
        """

        method = self.config['method']
        if method:
            method = method.upper()
        else:
            if self.config['post'] or self.config['multipart_post']:
                method = 'POST'
            else:
                method = 'GET'
        return method

    def clear_cookies(self):
        """
        Clear all remembered cookies.
        """

        self.config['cookies'] = {}
        self.cookies.clear()

    def setup_with_proxyline(self, line, proxy_type='http'):
        # TODO: remove from base class
        # maybe to proxylist?
        host, port, user, pwd = parse_proxy_line(line)
        server_port = '%s:%s' % (host, port)
        self.setup(proxy=server_port, proxy_type=proxy_type)
        if user:
            userpwd = '%s:%s' % (user, pwd)
            self.setup(proxy_userpwd=userpwd)

    def __getstate__(self):
        """
        Reset cached lxml objects which cannot be pickled.
        """
        state = {}
        for cls in type(self).mro():
            cls_slots = getattr(cls, '__slots__', ())
            for slot in cls_slots:
                if slot != '__weakref__':
                    if hasattr(self, slot):
                        state[slot] = getattr(self, slot)

        if state['_doc']:
            state['_doc'].grab = weakref.proxy(self)

        return state

    def __setstate__(self, state):
        for slot, value in state.items():
            setattr(self, slot, value)

    @property
    def request_headers(self):
        first_head = self.request_head.decode('utf-8').split('\r\n\r\n')[0]
        lines = first_head.split('\r\n')
        lines = [x for x in lines if ':' in x]
        return email.message_from_string('\n'.join(lines))
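
A hedged sketch of the clone/serialization machinery defined above (`clone`, `adopt`, `__getstate__`). It assumes `from grab import Grab`; the URL and options are placeholders, and pickling is shown only because `__getstate__`/`__setstate__` exist for exactly that purpose.

import pickle

from grab import Grab

g = Grab()
g.setup(url='http://example.com/', reuse_cookies=True)

backup = g.clone()   # copies config, cookies and request data
g.adopt(backup)      # restores state from the clone

data = pickle.dumps(backup)    # __getstate__ collects slot values
restored = pickle.loads(data)  # __setstate__ writes them back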
Code example #29
File: grab_cookies.py Project: lyicy/grab
    def test_get_item(self):
        cookie = create_cookie('foo', 'bar', self.server.address)
        mgr = CookieManager.from_cookie_list([cookie])
        self.assertEqual('bar', mgr['foo'])
        self.assertRaises(KeyError, lambda: mgr['zzz'])
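
For context, a hedged sketch of the dict-style lookup this test exercises; the import path `from grab.cookie import CookieManager, create_cookie` is an assumption.

from grab.cookie import CookieManager, create_cookie

mgr = CookieManager.from_cookie_list(
    [create_cookie('session', 'abc123', 'example.com')])
print(mgr['session'])  # -> 'abc123'
# an unknown name raises KeyError, which is what the test asserts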
Code example #30
File: base.py Project: abael/grab
class Grab(DeprecatedThings):

    __slots__ = ('request_head', 'request_log', 'request_body',
                 'proxylist', 'config',
                 'transport',
                 'transport_param', 'request_method', 'request_counter',
                 '__weakref__', 'cookies',

                 # Dirty hack to make it possible to inherit Grab from
                 # multiple base classes with __slots__
                 '_doc',
                 )

    # Attributes which should be processed when clone
    # of Grab instance is creating
    clonable_attributes = ('request_head', 'request_log', 'request_body',
                           'proxylist')

    # Complex config items which points to mutable objects
    mutable_config_keys = copy(MUTABLE_CONFIG_KEYS)

    """
    Public methods
    """

    def __init__(self, document_body=None,
                 transport='grab.transport.curl.CurlTransport', **kwargs):
        """
        Create Grab instance
        """

        self._doc = None
        self.config = default_config()
        self.config['common_headers'] = self.common_headers()
        self.cookies = CookieManager()
        self.proxylist = ProxyList()

        self.setup_transport(transport)

        self.reset()
        if kwargs:
            self.setup(**kwargs)

        if document_body is not None:
            self.setup_document(document_body)

    def _get_doc(self):
        if self._doc is None:
            self._doc = Document(self)
        return self._doc

    def _set_doc(self, obj):
        self._doc = obj

    doc = property(_get_doc, _set_doc)

    def setup_transport(self, transport_param):
        self.transport_param = transport_param
        if isinstance(transport_param, six.string_types):
            mod_path, cls_name = transport_param.rsplit('.', 1)
            try:
                cls = TRANSPORT_CACHE[(mod_path, cls_name)]
            except KeyError:
                mod = __import__(mod_path, globals(), locals(), ['foo'])
                cls = getattr(mod, cls_name)
                TRANSPORT_CACHE[(mod_path, cls_name)] = cls
            self.transport = cls()
        elif isinstance(transport_param, collections.Callable):
            self.transport = transport_param()
        else:
            raise error.GrabMisuseError('Option `transport` should be string '
                                        'or callable. Got %s'
                                        % type(transport_param))

    def reset(self):
        """
        Reset all attributes which could be modified during the previous
        request or which are not initialized yet if this is a new Grab
        instance.

        This method is automatically called before each network request.
        """

        self.request_head = None
        self.request_log = None
        self.request_body = None

        self.request_method = None
        self.transport.reset()

    def clone(self, **kwargs):
        """
        Create clone of Grab instance.

        The cloned instance will have the same state: cookies, referrer,
        and response document data.

        :param **kwargs: overrides settings of the cloned Grab instance
        """

        g = Grab(transport=self.transport_param)
        g.config = self.dump_config()

        g.doc = self.doc.copy()
        g.doc.grab = weakref.proxy(g)

        for key in self.clonable_attributes:
            setattr(g, key, getattr(self, key))
        g.cookies = deepcopy(self.cookies)

        if kwargs:
            g.setup(**kwargs)

        return g

    def adopt(self, g):
        """
        Copy the state of another `Grab` instance.

        Use case: create a backup of the current state in a cloned
        instance and then restore the state from it.
        """

        self.load_config(g.config)

        self.doc = g.doc.copy(new_grab=self)

        for key in self.clonable_attributes:
            setattr(self, key, getattr(g, key))
        self.cookies = deepcopy(g.cookies)

    def dump_config(self):
        """
        Make clone of current config.
        """

        conf = copy_config(self.config, self.mutable_config_keys)
        conf['state'] = {
            'cookiejar_cookies': list(self.cookies.cookiejar),
        }
        return conf

    def load_config(self, config):
        """
        Configure grab instance with external config object.
        """

        self.config = copy_config(config, self.mutable_config_keys)
        if 'cookiejar_cookies' in config['state']:
            self.cookies = CookieManager.from_cookie_list(
                config['state']['cookiejar_cookies'])

    def setup(self, **kwargs):
        """
        Setting up Grab instance configuration.
        """

        if 'hammer_mode' in kwargs:
            logging.error('Option hammer_mode is deprecated. Grab does not '
                          'support hammer mode anymore.')
            del kwargs['hammer_mode']

        if 'hammer_timeouts' in kwargs:
            logging.error('Option hammer_timeouts is deprecated. Grab does not'
                          ' support hammer mode anymore.')
            del kwargs['hammer_timeouts']

        for key in kwargs:
            if key not in self.config.keys():
                raise error.GrabMisuseError('Unknown option: %s' % key)

        if 'url' in kwargs:
            if self.config.get('url'):
                kwargs['url'] = self.make_url_absolute(kwargs['url'])
        self.config.update(kwargs)

    def go(self, url, **kwargs):
        """
        Go to ``url``

        Args:
            :url: could be absolute or relative. If relative then it will
            be resolved against the absolute URL of the previous request.
        """

        return self.request(url=url, **kwargs)

    def download(self, url, location, **kwargs):
        """
        Fetch the document located at ``url`` and save it to ``location``.
        """

        doc = self.go(url, **kwargs)
        with open(location, 'wb') as out:
            out.write(doc.body)
        return len(doc.body)

    def prepare_request(self, **kwargs):
        """
        Configure everything needed to make a real network request.
        This method is called before performing the real request via
        the transport extension.
        """

        self.reset()
        self.request_counter = next(REQUEST_COUNTER)
        if kwargs:
            self.setup(**kwargs)
        if not self.proxylist.is_empty() and self.config['proxy_auto_change']:
            self.change_proxy()
        self.request_method = self.detect_request_method()
        self.transport.process_config(self)

    def log_request(self, extra=''):
        """
        Send request details to logging system.
        """

        thread_name = threading.currentThread().getName().lower()
        if thread_name == 'mainthread':
            thread_name = ''
        else:
            thread_name = '-%s' % thread_name

        if self.config['proxy']:
            if self.config['proxy_userpwd']:
                auth = ' with authorization'
            else:
                auth = ''
            proxy_info = ' via %s proxy of type %s%s' % (
                self.config['proxy'], self.config['proxy_type'], auth)
        else:
            proxy_info = ''
        if extra:
            extra = '[%s] ' % extra
        logger_network.debug('[%02d%s] %s%s %s%s',
                             self.request_counter, thread_name,
                             extra, self.request_method or 'GET',
                             self.config['url'], proxy_info)

    def request(self, **kwargs):
        """
        Perform network request.

        You can specify grab settings in ``**kwargs``.
        Any keyword argument will be passed to ``self.config``.

        Returns: a ``Document`` object.
        """

        self.prepare_request(**kwargs)
        self.log_request()

        try:
            self.transport.request()
        except error.GrabError:
            self.reset_temporary_options()
            self.save_failed_dump()
            raise
        else:
            # That builds `self.doc`
            self.process_request_result()
            return self.doc

    def process_request_result(self, prepare_response_func=None):
        """
        Process the result of the real request performed via the
        transport extension.
        """

        now = datetime.utcnow()
        # TODO: move into separate method
        if self.config['debug_post']:
            post = self.config['post'] or self.config['multipart_post']
            if isinstance(post, dict):
                post = list(post.items())
            if post:
                if isinstance(post, six.string_types):
                    post = post[:self.config['debug_post_limit']] + '...'
                else:
                    items = normalize_http_values(post, charset='utf-8')
                    new_items = []
                    for key, value in items:
                        if len(value) > self.config['debug_post_limit']:
                            value = value[
                                :self.config['debug_post_limit']] + '...'
                        new_items.append((key, value))
                    post = '\n'.join('%-25s: %s' % x for x in new_items)
            if post:
                logger_network.debug('[%02d] POST request:\n%s\n'
                                     % (self.request_counter, post))

        # It's important to delete old POST data after the request is
        # performed. If POST data is not cleared then the next request
        # will try to use it again!
        old_refresh_count = self.config['refresh_redirect_count']
        self.reset_temporary_options()

        if prepare_response_func:
            self.doc = prepare_response_func(self.transport, self)
        else:
            self.doc = self.transport.prepare_response(self)

        # Workaround
        if self.doc.grab is None:
            self.doc.grab = weakref.proxy(self)

        if self.config['reuse_cookies']:
            self.cookies.update(self.doc.cookies)

        self.doc.timestamp = now

        self.config['charset'] = self.doc.charset

        if self.config['log_file']:
            with open(self.config['log_file'], 'wb') as out:
                out.write(self.doc.body)

        if self.config['cookiefile']:
            self.cookies.save_to_file(self.config['cookiefile'])

        if self.config['reuse_referer']:
            self.config['referer'] = self.doc.url

        self.copy_request_data()

        # Should be called after `copy_request_data`
        self.save_dumps()

        # TODO: check max redirect count
        if self.config['follow_refresh']:
            url = find_refresh_url(self.doc.unicode_body())
            if url is not None:
                inc_count = old_refresh_count + 1
                if inc_count > self.config['redirect_limit']:
                    raise error.GrabTooManyRedirectsError()
                else:
                    return self.request(url=url,
                                        refresh_redirect_count=inc_count)

        return None

    def reset_temporary_options(self):
        self.config['post'] = None
        self.config['multipart_post'] = None
        self.config['method'] = None
        self.config['body_storage_filename'] = None
        self.config['refresh_redirect_count'] = 0

    def save_failed_dump(self):
        """
        Save dump of failed request for debugging.

        This method is called when a fatal network exception is raised.
        The saved dump can be used for debugging the cause of the failure.
        """

        # This is a very untested feature, so it is wrapped in
        # try/except to not break live spiders
        try:
            self.doc = self.transport.prepare_response(self)
            self.copy_request_data()
            self.save_dumps()
        except Exception as ex:
            logging.error(six.text_type(ex))

    def copy_request_data(self):
        # TODO: Maybe request object?
        self.request_head = self.transport.request_head
        self.request_body = self.transport.request_body
        self.request_log = self.transport.request_log

    def setup_document(self, content, **kwargs):
        """
        Set up the `response` object without real network requests.

        Useful for testing and debugging.

        All ``**kwargs`` will be passed to `Document` constructor.
        """

        self.reset()

        if isinstance(content, six.text_type):
            raise error.GrabMisuseError('Method `setup_document` accepts only '
                                        'byte string in `content` argument.')

        # Configure Document instance
        doc = Document(grab=self)
        doc.body = content
        doc.status = ''
        doc.head = ''
        doc.parse(charset=kwargs.get('document_charset'))
        doc.code = 200
        doc.total_time = 0
        doc.connect_time = 0
        doc.name_lookup_time = 0
        doc.url = ''

        for key, value in kwargs.items():
            setattr(doc, key, value)

        self.doc = doc

    def change_proxy(self):
        """
        Set a random proxy from the proxylist.
        """

        if not self.proxylist.is_empty():
            proxy = self.proxylist.get_random_proxy()
            self.setup(proxy=proxy.address, proxy_userpwd=proxy.userpwd,
                       proxy_type=proxy.proxy_type)
        else:
            logging.debug('Proxy list is empty')

    """
    Private methods
    """

    def common_headers(self):
        """
        Build headers which a typical browser sends.
        """

        return {
            'Accept': 'text/xml,application/xml,application/xhtml+xml'
                      ',text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.%d'
                      % randint(2, 5),
            'Accept-Language': 'en-us,en;q=0.%d' % (randint(5, 9)),
            'Accept-Charset': 'utf-8,windows-1251;q=0.7,*;q=0.%d'
                              % randint(5, 7),
            'Keep-Alive': '300',
            'Expect': '',
        }

    def save_dumps(self):
        if self.config['log_dir']:
            thread_name = threading.currentThread().getName().lower()
            if thread_name == 'mainthread':
                thread_name = ''
            else:
                thread_name = '-%s' % thread_name
            file_name = os.path.join(self.config['log_dir'], '%02d%s.log' % (
                self.request_counter, thread_name))
            with open(file_name, 'w') as out:
                out.write('Request headers:\n')
                out.write(self.request_head)
                out.write('\n')
                out.write('Request body:\n')
                out.write(self.request_body)
                out.write('\n\n')
                out.write('Response headers:\n')
                out.write(self.doc.head)

            file_extension = 'html'
            file_name = os.path.join(self.config['log_dir'], '%02d%s.%s' % (
                self.request_counter, thread_name, file_extension))
            self.doc.save(file_name)

    def make_url_absolute(self, url, resolve_base=False):
        """
        Make url absolute using previous request url as base url.
        """

        if self.config['url']:
            if resolve_base:
                ubody = self.doc.unicode_body()
                base_url = find_base_url(ubody)
                if base_url:
                    return urljoin(base_url, url)
            return urljoin(self.config['url'], url)
        else:
            return url

    def detect_request_method(self):
        """
        Analyze request config and find which
        request method will be used.

        Returns the request method in upper case.

        This method is needed sometimes when the `process_config`
        method has not been called yet.
        """

        method = self.config['method']
        if method:
            method = method.upper()
        else:
            if self.config['post'] or self.config['multipart_post']:
                method = 'POST'
            else:
                method = 'GET'
        return method

    def clear_cookies(self):
        """
        Clear all remembered cookies.
        """

        self.config['cookies'] = {}
        self.cookies.clear()

    def setup_with_proxyline(self, line, proxy_type='http'):
        # TODO: remove from base class
        # maybe to proxylist?
        host, port, user, pwd = parse_proxy_line(line)
        server_port = '%s:%s' % (host, port)
        self.setup(proxy=server_port, proxy_type=proxy_type)
        if user:
            userpwd = '%s:%s' % (user, pwd)
            self.setup(proxy_userpwd=userpwd)

    def __getstate__(self):
        """
        Reset cached lxml objects which cannot be pickled.
        """
        state = {}
        for cls in type(self).mro():
            cls_slots = getattr(cls, '__slots__', ())
            for slot in cls_slots:
                if slot != '__weakref__':
                    if hasattr(self, slot):
                        state[slot] = getattr(self, slot)

        if state['_doc']:
            state['_doc'].grab = weakref.proxy(self)

        return state

    def __setstate__(self, state):
        for slot, value in state.items():
            setattr(self, slot, value)

    @property
    def request_headers(self):
        """
        Temporary hack until I understand
        where to store request details.
        """

        try:
            first_head = self.request_head.split('\r\n\r\n')[0]
            lines = first_head.split('\r\n')
            lines = [x for x in lines if ':' in x]
            headers = email.message_from_string('\n'.join(lines))
            return headers
        except Exception as ex:
            logging.error('Could not parse request headers', exc_info=ex)
            return {}

    def dump(self):
        """
        Shortcut for real-time debugging.
        """
        self.doc.save('/tmp/x.html')
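
Finally, a hedged sketch of offline document handling with `setup_document` and `dump` as defined in this example; no network request is made. It assumes `from grab import Grab` and the lxml-backed `doc.select` API.

from grab import Grab

g = Grab()
g.setup_document(b'<html><body><h1>Hello</h1></body></html>')
print(g.doc.select('//h1').text())  # -> 'Hello'
g.dump()  # saves the body to /tmp/x.html for quick inspection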