def prepare_response(self, grab):
    if self.body_file:
        self.body_file.close()
    response = Document()
    response.head = b''.join(self.response_header_chunks)
    if self.body_path:
        response.body_path = self.body_path
    else:
        response.body = b''.join(self.response_body_chunks)
    # Release memory held by the accumulated chunks
    self.response_header_chunks = []
    self.response_body_chunks = []
    response.code = self.curl.getinfo(pycurl.HTTP_CODE)
    response.total_time = self.curl.getinfo(pycurl.TOTAL_TIME)
    response.connect_time = self.curl.getinfo(pycurl.CONNECT_TIME)
    response.name_lookup_time = self.curl.getinfo(pycurl.NAMELOOKUP_TIME)
    response.download_size = self.curl.getinfo(pycurl.SIZE_DOWNLOAD)
    response.upload_size = self.curl.getinfo(pycurl.SIZE_UPLOAD)
    response.download_speed = self.curl.getinfo(pycurl.SPEED_DOWNLOAD)
    response.remote_ip = self.curl.getinfo(pycurl.PRIMARY_IP)
    response.url = self.curl.getinfo(pycurl.EFFECTIVE_URL)
    response.parse(charset=grab.config['document_charset'])
    response.cookies = CookieManager(self.extract_cookiejar())
    # The cookies stored in the curl instance are no longer
    # needed, so drop them
    self.curl.setopt(pycurl.COOKIELIST, 'ALL')
    return response
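# Example (not part of the library): a minimal standalone sketch of the
# chunk-collection pattern the transport above relies on, assuming pycurl
# is installed and the URL is reachable. The callback options and
# getinfo() constants are real pycurl API; the fetch() helper and the
# URL are illustrative.
import pycurl

def fetch(url):
    # Collect header and body data in chunks, as the transport does;
    # list.append works as a write callback because returning None
    # tells pycurl all data was consumed
    header_chunks, body_chunks = [], []
    curl = pycurl.Curl()
    curl.setopt(pycurl.URL, url)
    curl.setopt(pycurl.HEADERFUNCTION, header_chunks.append)
    curl.setopt(pycurl.WRITEFUNCTION, body_chunks.append)
    curl.perform()
    # getinfo() exposes the same metrics prepare_response() copies
    # onto the Document
    info = {
        'code': curl.getinfo(pycurl.HTTP_CODE),
        'total_time': curl.getinfo(pycurl.TOTAL_TIME),
        'url': curl.getinfo(pycurl.EFFECTIVE_URL),
    }
    curl.close()
    return b''.join(header_chunks), b''.join(body_chunks), info

head, body, info = fetch('http://example.com/')
print(info['code'], len(body))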
def setup_document(self, content, **kwargs):
    """
    Set up the `response` object without making a real network request.

    Useful for testing and debugging.

    All ``**kwargs`` are set as attributes on the created
    `Document` instance.
    """
    self.reset()

    # Configure Document instance
    doc = Document(grab=self)
    doc.body = content
    doc.status = ''
    doc.head = ''
    doc.parse(charset=kwargs.get('document_charset'))
    doc.code = 200
    doc.total_time = 0
    doc.connect_time = 0
    doc.name_lookup_time = 0
    doc.url = ''
    for key, value in kwargs.items():
        setattr(doc, key, value)
    self.doc = doc
def setup_document(self, content, **kwargs):
    """
    Set up the `response` object without making a real network request.

    Useful for testing and debugging.

    All ``**kwargs`` are set as attributes on the created
    `Document` instance.
    """
    self.reset()

    if isinstance(content, six.text_type):
        raise error.GrabMisuseError('Method `setup_document` accepts only '
                                    'byte string in `content` argument.')

    # Configure Document instance
    doc = Document(grab=self)
    doc.body = content
    doc.status = ''
    doc.head = b'HTTP/1.1 200 OK\r\n\r\n'
    doc.parse(charset=kwargs.get('document_charset'))
    doc.code = 200
    doc.total_time = 0
    doc.connect_time = 0
    doc.name_lookup_time = 0
    doc.url = ''
    for key, value in kwargs.items():
        setattr(doc, key, value)
    self.doc = doc
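# Example (not part of the library): a hedged usage sketch of
# setup_document() in a test, assuming the surrounding Grab API
# (the Grab class and doc.select()) behaves as in the library;
# the markup is illustrative.
from grab import Grab

g = Grab()
# content must be bytes, otherwise GrabMisuseError is raised
g.setup_document(b'<html><body><h1>Hello</h1></body></html>')
assert g.doc.code == 200
assert g.doc.select('//h1').text() == 'Hello'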
def custom_prepare_response_func(transport, grab):
    # `cache_item` and `body` are taken from the enclosing scope
    doc = Document()
    doc.head = cache_item['head']
    doc.body = body
    doc.code = cache_item['response_code']
    doc.download_size = len(body)
    doc.upload_size = 0
    doc.download_speed = 0
    doc.url = cache_item['response_url']
    doc.parse(charset=grab.config['document_charset'])
    doc.cookies = CookieManager(transport.extract_cookiejar())
    doc.from_cache = True
    return doc
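# Example (not part of the library): a hedged sketch of the minimal shape
# the enclosing scope must provide, inferred only from the keys the
# closure reads above; the real cache backend schema may contain more
# fields and the values here are illustrative.
body = b'<html><body>cached page</body></html>'
cache_item = {
    'head': b'HTTP/1.1 200 OK\r\nContent-Type: text/html\r\n\r\n',
    'response_code': 200,
    'response_url': 'http://example.com/page',
}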
def prepare_response(self, grab):
    # Notes on urllib3 header handling:
    # * on Python 2 urllib3 returns headers as the original binary data
    # * on Python 3 urllib3 decodes headers to unicode
    #   using the latin-1 encoding
    try:
        response = Document()

        def decode_header(key, val):
            # Recover utf-8 header data from what urllib3 returns
            if six.PY2:
                return (key.decode('utf-8', errors='ignore'),
                        val.decode('utf-8', errors='ignore'))
            else:
                return (key.encode('latin').decode('utf-8', errors='ignore'),
                        val.encode('latin').decode('utf-8', errors='ignore'))

        head = ''
        for key, val in self._response.getheaders().items():
            key, val = decode_header(key, val)
            head += '%s: %s\r\n' % (key, val)
        head += '\r\n'
        response.head = make_str(head, encoding='utf-8')

        def read_with_timeout():
            if self._request.config_nobody:
                return b''
            maxsize = self._request.config_body_maxsize
            chunks = []
            default_chunk_size = 10000
            if maxsize:
                chunk_size = min(default_chunk_size, maxsize + 1)
            else:
                chunk_size = default_chunk_size
            bytes_read = 0
            while True:
                chunk = self._response.read(chunk_size)
                if chunk:
                    bytes_read += len(chunk)
                    chunks.append(chunk)
                    if maxsize and bytes_read > maxsize:
                        # Reached the limit on bytes to read
                        break
                else:
                    break
                if self._request.timeout:
                    if (time.time() - self._request.op_started
                            > self._request.timeout):
                        raise GrabTimeoutError
            data = b''.join(chunks)
            if maxsize:
                data = data[:maxsize]
            return data

        if self._request.response_path:
            response.body_path = self._request.response_path
            # FIXME: quick dirty hack, actually the response is fully
            # read into memory before being written to disk
            self._request.response_file.write(read_with_timeout())
            self._request.response_file.close()
        else:
            response.body = read_with_timeout()

        response.code = self._response.status
        # Timing and size details reported by the curl transport
        # (total_time, connect_time, download_size and so on)
        # are not available here
        response.url = (self._response.get_redirect_location()
                        or self._request.url)

        import email.message
        hdr = email.message.Message()
        for key, val in self._response.getheaders().items():
            key, val = decode_header(key, val)
            hdr[key] = val
        response.parse(charset=grab.config['document_charset'],
                       headers=hdr)

        response.cookies = CookieManager(self.extract_cookiejar())
        return response
    finally:
        self._response.release_conn()
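# Example (not part of the library): a minimal standalone sketch of the
# same capped, wall-clock-timed read loop against plain urllib3, assuming
# urllib3 is installed; read_capped(), the size cap, the timeout values
# and the URL are illustrative.
import time
import urllib3

def read_capped(url, maxsize=100000, timeout=10):
    # preload_content=False keeps the body on the socket so it can be
    # read in chunks, as read_with_timeout() does above
    http = urllib3.PoolManager()
    resp = http.request('GET', url, preload_content=False)
    started = time.time()
    chunks, bytes_read = [], 0
    try:
        while True:
            chunk = resp.read(min(10000, maxsize + 1))
            if not chunk:
                break
            bytes_read += len(chunk)
            chunks.append(chunk)
            if bytes_read > maxsize:
                break  # size cap reached; truncate below
            if time.time() - started > timeout:
                raise RuntimeError('read timed out')
    finally:
        resp.release_conn()
    return b''.join(chunks)[:maxsize]

body = read_capped('http://example.com/')
print(len(body))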