コード例 #1
0
ファイル: curl.py プロジェクト: lyicy/grab
    def prepare_response(self, grab):
        """Build a Document from the data accumulated by the curl handle."""
        # Flush any on-disk body buffer before handing the path over.
        if self.body_file:
            self.body_file.close()

        doc = Document()
        doc.head = b''.join(self.response_header_chunks)

        # Body lives either on disk (body_path) or in the in-memory chunks.
        if self.body_path:
            doc.body_path = self.body_path
        else:
            doc.body = b''.join(self.response_body_chunks)

        # Release the accumulated chunk buffers.
        self.response_header_chunks = []
        self.response_body_chunks = []

        # Copy transfer statistics reported by libcurl onto the document.
        info = self.curl.getinfo
        doc.code = info(pycurl.HTTP_CODE)
        doc.total_time = info(pycurl.TOTAL_TIME)
        doc.connect_time = info(pycurl.CONNECT_TIME)
        doc.name_lookup_time = info(pycurl.NAMELOOKUP_TIME)
        doc.download_size = info(pycurl.SIZE_DOWNLOAD)
        doc.upload_size = info(pycurl.SIZE_UPLOAD)
        doc.download_speed = info(pycurl.SPEED_DOWNLOAD)
        doc.remote_ip = info(pycurl.PRIMARY_IP)
        doc.url = info(pycurl.EFFECTIVE_URL)

        doc.parse(charset=grab.config['document_charset'])
        doc.cookies = CookieManager(self.extract_cookiejar())

        # Cookies now live in the CookieManager; wipe curl's internal copy.
        self.curl.setopt(pycurl.COOKIELIST, 'ALL')
        return doc
コード例 #2
0
    def __init__(self, document_body=None, transport=None, **kwargs):
        """
        Create Grab instance

        :param document_body: optional raw body; when given it is fed to
            `setup_document()` so the instance starts with a loaded document.
        :param transport: transport spec; only stored in `transport_param`
            here — `self.transport` stays None until resolved later.
        :param kwargs: extra config options forwarded to `self.setup()`.
        """

        self.meta = {}
        self._doc = None
        self.config = default_config()
        self.config['common_headers'] = self.common_headers()
        self.cookies = CookieManager()
        self.proxylist = ProxyList()

        # Pre-declare request-state attributes so pylint (and readers) can
        # see them; real values are assigned in reset() / during a request.
        self.request_counter = None
        self.request_head = None
        self.request_body = None
        self.request_method = None
        self.transport_param = transport
        self.transport = None

        # reset() must run before setup() so user kwargs are not clobbered.
        self.reset()
        if kwargs:
            self.setup(**kwargs)
        if document_body is not None:
            self.setup_document(document_body)
コード例 #3
0
ファイル: mock.py プロジェクト: subeax/grab
    def prepare_response(self, grab):
        """Build a fake Response for the URL of the current mock request.

        :param grab: owning Grab instance (unused here, part of the
            transport interface).
        :raises GrabMockNotFoundError: if the requested URL is not
            registered in MOCK_REGISTRY.
        """
        response = Response()

        try:
            response.body = MOCK_REGISTRY[self.request_url]['body']
        except KeyError:
            raise GrabMockNotFoundError(
                'Mock registry does not have information about '
                'following URL: %s' % self.request_url)

        # Synthesize a plausible header block. BUG FIX: the headers label
        # the timestamp "GMT", so it must be generated with utcnow();
        # the old code used local time via datetime.now().
        now_str = datetime.utcnow().strftime('%a, %d %B %Y %H:%M:%S')
        response.head = '\r\n'.join((
            'Accept-Ranges:bytes',
            'Content-Length:%d' % len(response.body),
            'Content-Type:text/plain',
            'Date:%s GMT' % now_str,
            'Last-Modified:%s GMT' % now_str,
            'Vary:Accept-Encoding',
        ))

        # Mocked transfers always succeed instantly.
        response.code = 200
        response.total_time = 0
        response.name_lookup_time = 0
        response.connect_time = 0
        response.url = self.request_url
        response.parse()
        response.cookies = CookieManager(self.extract_cookiejar())

        return response
コード例 #4
0
    def prepare_response(self, grab):
        """Convert the low-level HTTP response into a grab Response object.

        NOTE(review): the commented-out fragments below are remnants of the
        pycurl transport this code was ported from; kept as-is.
        """
        #if self.body_file:
        #    self.body_file.close()
        response = Response()

        # Rebuild a raw "Key: Value" header block from the parsed headers.
        head = ''
        for key, val in self._response.getheaders().items():
            head += '%s: %s\r\n' % (key, val)
        head += '\r\n'
        response.head = make_str(head, encoding='latin', errors='ignore')

        #if self.body_path:
        #    response.body_path = self.body_path
        #else:
        #    response.body = b''.join(self.response_body_chunks)
        if self._request._response_path:
            response.body_path = self._request._response_path
            # Quick dirty hack, actually, response is fully read into memory
            # before being written to the target file.
            self._request._response_file.write(self._response.read())  #data)
            self._request._response_file.close()
        else:
            if self._request.body_maxsize is not None:
                #if self.response_body_bytes_read > self.config_body_maxsize:
                #    logger.debug('Response body max size limit reached: %s' %
                #                 self.config_body_maxsize)
                # Read at most body_maxsize bytes when a limit is configured.
                response.body = self._response.read(self._request.body_maxsize)
            else:
                response.body = self._response.read()  #data

        # Clear memory
        #self.response_header_chunks = []

        response.code = self._response.status
        #response.total_time = self.curl.getinfo(pycurl.TOTAL_TIME)
        #response.connect_time = self.curl.getinfo(pycurl.CONNECT_TIME)
        #response.name_lookup_time = self.curl.getinfo(pycurl.NAMELOOKUP_TIME)
        #response.download_size = self.curl.getinfo(pycurl.SIZE_DOWNLOAD)
        #response.upload_size = self.curl.getinfo(pycurl.SIZE_UPLOAD)
        #response.download_speed = self.curl.getinfo(pycurl.SPEED_DOWNLOAD)
        #response.remote_ip = self.curl.getinfo(pycurl.PRIMARY_IP)

        # Prefer the redirect target (if any) as the effective URL.
        response.url = self._response.get_redirect_location(
        ) or self._request.url

        # Wrap headers in email.message.Message so response.parse() receives
        # a standard mapping-like headers object.
        import email.message
        hdr = email.message.Message()
        for key, val in self._response.getheaders().items():
            hdr[key] = val
        response.parse(charset=grab.config['document_charset'], headers=hdr)

        jar = self.extract_cookiejar(self._response, self._request)
        response.cookies = CookieManager(jar)

        # We do not need anymore cookies stored in the
        # curl instance so drop them
        #self.curl.setopt(pycurl.COOKIELIST, 'ALL')
        return response
コード例 #5
0
 def custom_prepare_response_func(transport, g):
     """Rebuild a Response object from a cached item instead of the net."""
     resp = Response()
     # No real transfer happened, so stats are synthesized from the body.
     for attr, value in (
             ('head', cache_item['head']),
             ('body', body),
             ('code', cache_item['response_code']),
             ('download_size', len(body)),
             ('upload_size', 0),
             ('download_speed', 0),
             ('url', cache_item['response_url']),
     ):
         setattr(resp, attr, value)
     resp.parse()
     resp.cookies = CookieManager(transport.extract_cookiejar())
     return resp
コード例 #6
0
 def custom_prepare_response_func(transport, grab):
     """Reconstruct a Response from a cache record, flagged as cached."""
     cached = Response()
     cached.head = cache_item['head']
     cached.body = body
     cached.code = cache_item['response_code']
     # Synthesize transfer statistics: nothing was actually downloaded.
     cached.download_speed = 0
     cached.upload_size = 0
     cached.download_size = len(body)
     cached.url = cache_item['response_url']
     cached.parse(charset=grab.config['document_charset'])
     cached.cookies = CookieManager(transport.extract_cookiejar())
     # Let callers distinguish cached responses from live ones.
     cached.from_cache = True
     return cached
コード例 #7
0
ファイル: mysql.py プロジェクト: degustaf/pylint-corpus
 def custom_prepare_response_func(transport, grab):
     """Materialize a Document from the cached row instead of the network."""
     cached = Document()
     # No transfer happened; fill stats from the cached body itself.
     for attr, value in (
             ('head', cache_item['head']),
             ('body', body),
             ('code', cache_item['response_code']),
             ('download_size', len(body)),
             ('upload_size', 0),
             ('download_speed', 0),
             ('url', cache_item['response_url']),
     ):
         setattr(cached, attr, value)
     cached.parse(charset=grab.config['document_charset'])
     cached.cookies = CookieManager(transport.extract_cookiejar())
     # Mark the document so callers can tell it came from cache.
     cached.from_cache = True
     return cached
コード例 #8
0
ファイル: base.py プロジェクト: FeodorFitsner/grab
    def __init__(self, document_body=None, transport='pycurl', **kwargs):
        """
        Create Grab instance

        :param document_body: optional raw body fed to `setup_document()`
            so the instance starts with a loaded document.
        :param transport: transport backend name (default 'pycurl'),
            resolved eagerly via `setup_transport()`.
        :param kwargs: extra config options forwarded to `self.setup()`.
        """

        self.meta = {}
        self._doc = None
        self.config = default_config()
        self.config['common_headers'] = self.common_headers()
        self.cookies = CookieManager()
        self.proxylist = ProxyList()
        self.setup_transport(transport)
        # reset() must run before setup() so user kwargs are not clobbered.
        self.reset()
        if kwargs:
            self.setup(**kwargs)
        if document_body is not None:
            self.setup_document(document_body)
コード例 #9
0
ファイル: document.py プロジェクト: abaelhe/grab
    def __init__(self, grab=None):
        """Initialize an empty document, weakly bound to a Grab instance."""
        # Keep only a weak proxy to the owning Grab to avoid a reference
        # cycle; an already-proxied (or absent) grab is stored as-is.
        if grab is None or isinstance(grab, weakref.ProxyType):
            self.grab = grab
        else:
            self.grab = weakref.proxy(grab)

        # Response metadata.
        self.status = None
        self.code = None
        self.head = None
        self.headers = None
        self.url = None
        self.cookies = CookieManager()
        self.charset = 'utf-8'
        self.bom = None
        self.timestamp = datetime.utcnow()

        # Transfer statistics (filled in by the transport).
        self.name_lookup_time = 0
        self.connect_time = 0
        self.total_time = 0
        self.download_size = 0
        self.upload_size = 0
        self.download_speed = 0

        # Error info.
        self.error_code = None
        self.error_msg = None

        # Body storage and caches.
        self.body_path = None
        self._cached_body = None
        self._unicode_body = None
        self._runtime_body = None
        self._unicode_runtime_body = None

        # Lazily built DOM trees.
        self._lxml_tree = None
        self._strict_lxml_tree = None

        # Lazily built pyquery wrapper.
        self._pyquery = None

        # Form handling state.
        self._lxml_form = None
        self._file_fields = {}
コード例 #10
0
ファイル: postgresql.py プロジェクト: subeax/grab
        def custom_prepare_response_func(transport, g):
            """Rebuild a Response object from a cached item."""
            response = Response()
            response.head = cache_item['head']
            response.body = body
            response.code = cache_item['response_code']
            # No network transfer happened; synthesize the statistics.
            response.download_size = len(body)
            response.upload_size = 0
            response.download_speed = 0

            # Hack for deprecated behaviour
            if 'response_url' in cache_item:
                response.url = cache_item['response_url']
            else:
                # Fixed typos in the warning ("You cache", "depricated",
                # "re-download you cache") and wrapped the over-long line.
                logger.debug(
                    'Your cache contains items without `response_url` key. '
                    'It is a deprecated data format. Please re-download '
                    'your cache or build `response_url` keys manually.')
                response.url = cache_item['url']

            response.parse()
            response.cookies = CookieManager(transport.extract_cookiejar())
            return response
コード例 #11
0
    def __init__(self, grab=None):
        """Initialize an empty Response.

        :param grab: optional Grab instance; when given it is handed to
            `process_grab()` (which presumably also fills `_grab_config`
            — confirm against that method).
        """
        self._grab_config = {}
        self.grab = None
        if grab:
            self.process_grab(grab)
        # Response metadata.
        self.status = None
        self.code = None
        self.head = None
        self.headers = None
        self.url = None
        self.cookies = CookieManager()
        self.charset = 'utf-8'
        self.bom = None
        self.timestamp = datetime.utcnow()
        # Transfer statistics, zeroed until a transport fills them in.
        self.name_lookup_time = 0
        self.connect_time = 0
        self.total_time = 0
        self.download_size = 0
        self.upload_size = 0
        self.download_speed = 0
        self.error_code = None
        self.error_msg = None
        # True when the response was served from the cache layer.
        self.from_cache = False

        # Body
        self.body_path = None
        self._bytes_body = None
        self._unicode_body = None

        # DOM Tree
        self._lxml_tree = None
        self._strict_lxml_tree = None

        # Pyquery
        self._pyquery = None

        # Form
        self._lxml_form = None
        self._file_fields = {}
コード例 #12
0
ファイル: base.py プロジェクト: subeax/grab
    def __init__(self,
                 document_body=None,
                 transport='grab.transport.curl.CurlTransport',
                 **kwargs):
        """
        Create Grab instance

        :param document_body: optional raw body fed to `setup_document()`
            so the instance starts with a loaded document.
        :param transport: dotted path of the transport class to use.
        :param kwargs: extra config options forwarded to `self.setup()`.
        """

        self._doc = None
        self.config = default_config()
        self.config['common_headers'] = self.common_headers()
        # Flag tracking whether the transport has prepared the pending
        # request — TODO confirm against the request machinery.
        self._request_prepared = False
        self.cookies = CookieManager()
        self.proxylist = ProxyList()

        self.setup_transport(transport)

        # reset() must run before setup() so user kwargs are not clobbered.
        self.reset()

        if kwargs:
            self.setup(**kwargs)
        # NOTE(review): presumably incremented by clone(); verify.
        self.clone_counter = 0
        if document_body is not None:
            self.setup_document(document_body)
コード例 #13
0
    def prepare_response(self, grab):
        """Convert the urllib3 response into a grab Document.

        Reads the body (honouring nobody/body_maxsize/timeout options),
        normalizes headers across py2/py3, and always releases the
        underlying urllib3 connection back to the pool.

        NOTE(review): commented-out fragments are remnants of the pycurl
        transport this code was ported from; kept as-is.
        """
        # Information about urllib3
        # On python2 urllib3 headers contains original binary data
        # On python3 urllib3 headers are converted to unicode
        # using latin encoding
        try:
            #if self.body_file:
            #    self.body_file.close()
            response = Document()

            # Rebuild a raw header block, re-decoding header text as utf-8.
            head = ''
            for key, val in self._response.getheaders().items():
                if six.PY2:
                    key = key.decode('utf-8', errors='ignore')
                    val = val.decode('utf-8', errors='ignore')
                if six.PY3:
                    key = key.encode('latin').decode('utf-8', errors='ignore')
                    val = val.encode('latin').decode('utf-8', errors='ignore')
                head += '%s: %s\r\n' % (key, val)
            head += '\r\n'
            response.head = make_str(head, encoding='utf-8')

            #if self.body_path:
            #    response.body_path = self.body_path
            #else:
            #    response.body = b''.join(self.response_body_chunks)
            def read_with_timeout():
                # Drain the body in chunks, enforcing config_nobody,
                # config_body_maxsize and the overall request timeout.
                if self._request.config_nobody:
                    return b''
                maxsize = self._request.config_body_maxsize
                chunks = []
                default_chunk_size = 10000
                if maxsize:
                    chunk_size = min(default_chunk_size, maxsize + 1)
                else:
                    chunk_size = default_chunk_size
                bytes_read = 0
                while True:
                    chunk = self._response.read(chunk_size)
                    if chunk:
                        bytes_read += len(chunk)
                        chunks.append(chunk)
                        if maxsize and bytes_read > maxsize:
                            # reached limit on bytes to read
                            break
                    else:
                        break
                    if self._request.timeout:
                        if (time.time() - self._request.op_started >
                                self._request.timeout):
                            raise GrabTimeoutError
                data = b''.join(chunks)
                if maxsize:
                    # Trim the last over-read chunk down to the exact limit.
                    data = data[:maxsize]
                return data

            if self._request.response_path:
                response.body_path = self._request.response_path
                # FIXME: Quick dirty hack, actually, response is fully
                # read into memory
                self._request.response_file.write(read_with_timeout())
                self._request.response_file.close()
            else:
                response.body = read_with_timeout()

            # Clear memory
            #self.response_header_chunks = []

            response.code = self._response.status
            #response.total_time = self.curl.getinfo(pycurl.TOTAL_TIME)
            #response.connect_time = self.curl.getinfo(pycurl.CONNECT_TIME)
            #response.name_lookup_time = (self.curl
            #                             .getinfo(pycurl.NAMELOOKUP_TIME))
            #response.download_size = self.curl.getinfo(pycurl.SIZE_DOWNLOAD)
            #response.upload_size = self.curl.getinfo(pycurl.SIZE_UPLOAD)
            #response.download_speed = self.curl.getinfo(pycurl.SPEED_DOWNLOAD)
            #response.remote_ip = self.curl.getinfo(pycurl.PRIMARY_IP)

            # Prefer the redirect target (if any) as the effective URL.
            response.url = (self._response.get_redirect_location()
                            or self._request.url)

            # Wrap headers in email.message.Message so response.parse()
            # receives a standard mapping-like headers object.
            import email.message
            hdr = email.message.Message()
            for key, val in self._response.getheaders().items():
                if six.PY2:
                    key = key.decode('utf-8', errors='ignore')
                    val = val.decode('utf-8', errors='ignore')
                if six.PY3:
                    key = key.encode('latin').decode('utf-8', errors='ignore')
                    val = val.encode('latin').decode('utf-8', errors='ignore')
                #if key == 'Location':
                #    import pdb; pdb.set_trace()
                hdr[key] = val
            response.parse(charset=grab.config['document_charset'],
                           headers=hdr)

            jar = self.extract_cookiejar()  # self._response, self._request)
            response.cookies = CookieManager(jar)

            # We do not need anymore cookies stored in the
            # curl instance so drop them
            #self.curl.setopt(pycurl.COOKIELIST, 'ALL')
            return response
        finally:
            # Always return the connection to the urllib3 pool.
            self._response.release_conn()
コード例 #14
0
ファイル: urllib3.py プロジェクト: sn-donbenjamin/grab
    def prepare_response(self, grab):
        """Read the urllib3 response and convert it into a grab Response.

        Reads the body (honouring nobody/body_maxsize/timeout options) and
        always releases the underlying urllib3 connection back to the pool.

        :param grab: owning Grab instance; supplies document_charset config.
        :raises GrabTimeoutError: when reading the body exceeds the
            configured request timeout.
        """
        try:
            response = Response()

            # Rebuild a raw "Key: Value" header block from parsed headers.
            head = ''
            for key, val in self._response.getheaders().items():
                head += '%s: %s\r\n' % (key, val)
            head += '\r\n'
            response.head = make_str(head, encoding='latin', errors='ignore')

            def read_with_timeout():
                # Drain the body in chunks, enforcing config_nobody,
                # config_body_maxsize and the overall request timeout.
                if self._request.config_nobody:
                    return b''
                maxsize = self._request.config_body_maxsize
                chunks = []
                default_chunk_size = 10000
                if maxsize:
                    chunk_size = min(default_chunk_size, maxsize + 1)
                else:
                    chunk_size = default_chunk_size
                total_size = 0
                while True:
                    chunk = self._response.read(chunk_size)
                    if chunk:
                        total_size += len(chunk)
                        chunks.append(chunk)
                        if maxsize and total_size > maxsize:
                            logger.debug(
                                'Response body max size limit reached: %s' %
                                maxsize)
                            # BUG FIX: stop reading once the limit is hit.
                            # Previously the loop only logged and went on to
                            # download the entire body, defeating maxsize.
                            break
                    else:
                        break
                    if self._request.timeout:
                        if time.time(
                        ) - self._request.op_started > self._request.timeout:
                            raise GrabTimeoutError
                data = b''.join(chunks)
                if maxsize:
                    # Trim the last over-read chunk to the exact limit.
                    data = data[:maxsize]
                return data

            if self._request._response_path:
                response.body_path = self._request._response_path
                # FIXME: the body is still fully read into memory before
                # being written to the target file.
                self._request._response_file.write(read_with_timeout())
                self._request._response_file.close()
            else:
                response.body = read_with_timeout()

            response.code = self._response.status

            # Prefer the redirect target (if any) as the effective URL.
            response.url = self._response.get_redirect_location(
            ) or self._request.url

            # Wrap headers in email.message.Message so response.parse()
            # receives a standard mapping-like headers object.
            import email.message
            hdr = email.message.Message()
            for key, val in self._response.getheaders().items():
                hdr[key] = val
            response.parse(charset=grab.config['document_charset'],
                           headers=hdr)

            jar = self.extract_cookiejar()
            response.cookies = CookieManager(jar)

            return response
        finally:
            # Always return the connection to the urllib3 pool.
            self._response.release_conn()