def buildScrapyResponse(self, response, body, url):
    status = response.code
    headers = Headers({k: ','.join(v) for k, v in response.headers.getAllRawHeaders()})
    respcls = responsetypes.from_args(headers=headers, url=url)
    return respcls(url=url, status=status, headers=headers, body=body)
def _download_request(self, request, spider):
    proxies = {}
    proxy = request.meta.get('proxy', '')
    if proxy:
        # pick the configured proxy whose URL contains the requested one
        for p in self.proxies:
            if p.find(proxy) != -1:
                scheme = urlparse(p).scheme
                proxies[scheme] = p
                break
    timeout = request.meta.get('download_timeout', self.timeout)
    url = request.url
    method = request.method
    headers = headers_scrapy2dict(request.headers)
    data = request.body
    session = self._session or requests.sessions.Session()
    st = time.time()
    requests_response = session.request(method, url, headers=headers, data=data,
                                        timeout=timeout, proxies=proxies)
    et = time.time()
    request.meta['download_latency'] = et - st
    headers = Headers(dict(requests_response.headers))
    respcls = responsetypes.from_args(headers=headers,
                                      url=requests_response.url,
                                      body=requests_response.content)
    # Scrapy expects a text URL; encoding it with requests' guessed response
    # encoding (which may be None) would break Response construction.
    response = respcls(url=requests_response.url,
                       status=requests_response.status_code,
                       headers=headers,
                       body=requests_response.content)
    return response
def process_response(self, request, response, spider):
    if request.method == 'HEAD':
        return response
    if isinstance(response, Response):
        content_encoding = response.headers.getlist('Content-Encoding')
        if content_encoding:
            encoding = content_encoding.pop()
            decoded_body = self._decode(response.body, encoding.lower())
            if self._max_size and len(decoded_body) > self._max_size:
                error_msg = ("Cancelling processing of %(url)s: "
                             "Uncompressed response size %(size)s larger "
                             "than max allowed size (%(maxsize)s).")
                error_args = {
                    'url': response.url,
                    'size': len(decoded_body),
                    'maxsize': self._max_size,
                }
                logger.error(error_msg, error_args)
                raise IgnoreRequest(error_msg % error_args)
            respcls = responsetypes.from_args(headers=response.headers,
                                              url=response.url, body=decoded_body)
            kwargs = dict(cls=respcls, body=decoded_body)
            if issubclass(respcls, TextResponse):
                # force recalculating the encoding until we make sure the
                # responsetypes guessing is reliable
                kwargs['encoding'] = None
            response = response.replace(**kwargs)
            if not content_encoding:
                del response.headers['Content-Encoding']
    return response
def test_from_args(self):
    # TODO: add more tests that check precedence between the different arguments
    mappings = [
        ({'url': 'http://www.example.com/data.csv'}, TextResponse),
        # headers take precedence over the url
        ({'headers': Headers({'Content-Type': ['text/html; charset=utf-8']}),
          'url': 'http://www.example.com/item/'}, HtmlResponse),
        ({'headers': Headers({'Content-Disposition': ['attachment; filename="data.xml.gz"']}),
          'url': 'http://www.example.com/page/'}, Response),
    ]
    for source, cls in mappings:
        retcls = responsetypes.from_args(**source)
        assert retcls is cls, "%s ==> %s != %s" % (source, retcls, cls)
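# A quick illustration of the precedence the test above encodes. This is a
# sketch, not part of any snippet here, assuming only Scrapy's public
# scrapy.responsetypes module: from_args maps headers, URL extension and body
# sniffing onto a Response subclass, with headers winning over the URL.
from scrapy.http import Headers
from scrapy.responsetypes import responsetypes

assert responsetypes.from_args(url='http://www.example.com/data.csv').__name__ == 'TextResponse'
# the text/html Content-Type beats the .csv extension
cls = responsetypes.from_args(
    headers=Headers({'Content-Type': ['text/html; charset=utf-8']}),
    url='http://www.example.com/data.csv')
assert cls.__name__ == 'HtmlResponse'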
def retrieve_response(self, spider, request):
    response = super(S3CacheStorage, self).retrieve_response(spider, request)
    if response is None:  # not in local filesystem cache, so try copying from s3
        local_path = self._get_request_path(spider, request)
        remote_path = os.path.relpath(local_path, self.tmpcachedir).lower()
        bucket = self.conn.get_bucket(self.bucket_name, validate=False)

        def _get_key(filename):
            key_name = os.path.join(remote_path, filename)
            return bucket.get_key(key_name)

        # check if the key exists
        metadata_key = _get_key('pickled_meta')
        if metadata_key is None:
            return None  # key not found
        # check if the cache entry has expired
        mtime = parse_ts(metadata_key.last_modified)
        if 0 < self.expiration_secs < (datetime.datetime.utcnow() - mtime).total_seconds():
            return None  # expired
        # deserialise the cached response
        metadata = pickle.loads(metadata_key.get_contents_as_string())
        body = _get_key('response_body').get_contents_as_string()
        rawheaders = _get_key('response_headers').get_contents_as_string()
        url = metadata.get('response_url')
        status = metadata['status']
        headers = Headers(headers_raw_to_dict(rawheaders))
        respcls = responsetypes.from_args(headers=headers, url=url)
        response = respcls(url=url, headers=headers, status=status, body=body)
    return response
def _build_response(self, result, request, protocol):
    self.result = result
    respcls = responsetypes.from_args(url=request.url)
    protocol.close()
    body = protocol.filename or protocol.body.read()
    headers = {"local filename": protocol.filename or '', "size": protocol.size}
    return respcls(url=request.url, status=200, body=to_bytes(body), headers=headers)
async def _download_request_page(self, request: Request, spider: Spider, page: Page) -> Response:
    self.stats.inc_value("pyppeteer/page_count")
    if self.navigation_timeout is not None:
        page.setDefaultNavigationTimeout(self.navigation_timeout)
    await page.setRequestInterception(True)
    page.on("request", partial(_request_handler, scrapy_request=request, stats=self.stats))
    page.on("response", partial(_response_handler, stats=self.stats))

    start_time = time()
    response = await page.goto(request.url)

    page_coroutines = request.meta.get("pyppeteer_page_coroutines") or ()
    if isinstance(page_coroutines, dict):
        page_coroutines = page_coroutines.values()
    for pc in page_coroutines:
        if isinstance(pc, PageCoroutine):
            method = getattr(page, pc.method)
            # set PageCoroutine timeout
            if self.page_coroutine_timeout is not None and not pc.kwargs.get("timeout", None):
                pc.kwargs["timeout"] = self.page_coroutine_timeout
            if isinstance(pc, NavigationPageCoroutine):
                await asyncio.gather(page.waitForNavigation(), method(*pc.args, **pc.kwargs))
            else:
                pc.result = await method(*pc.args, **pc.kwargs)

    body = (await page.content()).encode("utf8")
    request.meta["download_latency"] = time() - start_time

    callback = request.callback or spider.parse
    annotations = getattr(callback, "__annotations__", {})
    for key, value in annotations.items():
        if value is pyppeteer.page.Page:
            request.cb_kwargs[key] = page
            self.stats.inc_value("pyppeteer/page_count/injected_callback")
            break
    else:
        await page.close()
        self.stats.inc_value("pyppeteer/page_count/closed")

    headers = Headers(response.headers)
    headers.pop("Content-Encoding", None)
    respcls = responsetypes.from_args(headers=headers, url=page.url, body=body)
    return respcls(
        url=page.url,
        status=response.status,
        headers=headers,
        body=body,
        request=request,
        flags=["pyppeteer"],
    )
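# Usage sketch for the pyppeteer handler above: page actions travel in
# request.meta under the "pyppeteer_page_coroutines" key that the handler
# reads. PageCoroutine wraps an ordinary pyppeteer Page method name plus its
# arguments; NavigationPageCoroutine additionally waits for a navigation.
# The spider method and URL below are illustrative placeholders.
def start_requests(self):
    yield Request(
        "https://example.org",
        meta={
            "pyppeteer_page_coroutines": [
                PageCoroutine("waitForSelector", "div.content"),
                NavigationPageCoroutine("click", "a.next-page"),
            ]
        },
    )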
def process_response(self, request, response, spider):
    if request.method == 'HEAD':
        return response
    if isinstance(response, Response):
        if response.url.endswith('.xml.gz'):
            response.headers.setlist('Content-Encoding', [b'gzip'])
            response.headers.setlist('Content-Type', [b'application/xml'])
        content_encoding = response.headers.getlist('Content-Encoding')
        if content_encoding and not is_gzipped(response):
            encoding = content_encoding.pop()
            decoded_body = self._decode(response.body, encoding.lower())
            respcls = responsetypes.from_args(headers=response.headers, url=response.url)
            kwargs = dict(cls=respcls, body=decoded_body)
            if issubclass(respcls, TextResponse):
                # force recalculating the encoding until we make sure the
                # responsetypes guessing is reliable
                kwargs['encoding'] = None
            response = response.replace(**kwargs)
            if not content_encoding:
                del response.headers['Content-Encoding']
    return response
def retrieve_response(self, spider, request): """Return response if present in cache, or None otherwise.""" key = self._get_request_key(spider, request) expiration_time = self._get_expiration_time(spider) metadata = self._read_meta(key, expiration_time) if metadata is None: return # not cached spider.log("%s: found cache for %s" % (self.__class__.__name__, request.url)) res = self._get_cached_data(key, spider) if res is None: return None spider.log("%s: got response from cache for %s" % (self.__class__.__name__, request.url)) response_url = res['response_url'] response_body = str(res['response_body']) response_rawheaders = res['response_meta']['headers'] status = res['response_meta']['status'] response_headers = Headers(headers_raw_to_dict(response_rawheaders)) response_cls = responsetypes.from_args(headers=response_headers, url=response_url) response = response_cls(url=response_url, headers=response_headers, status=status, body=response_body) return response
def _cb_bodydone(self, result, request, url):
    txresponse, body, flags = result
    status = int(txresponse.code)
    headers = Headers(txresponse.headers.getAllRawHeaders())
    # respcls is e.g. scrapy.http.response.html.HtmlResponse for an HTML body
    respcls = responsetypes.from_args(headers=headers, url=url, body=body)
    return respcls(url=url, status=status, headers=headers, body=body, flags=flags)
def retrieve_response(self, spider, request):
    key = self._request_key(spider, request)
    try:
        gf = self.fs[spider].fetchDocument(key, rawResults=True)
        print("found\t" + str(request.url), request.priority)
    except pyArango.theExceptions.DocumentNotFoundError:
        print("not found\t" + str(request.url), request.priority)
        return
    url = str(gf["url"])
    status = str(gf["status"])
    bod = gf["body"].encode("utf-8")
    body = zlib.decompress(base64.urlsafe_b64decode(bod))
    try:
        # reconcile the stored content-encoding header with what the body
        # actually looks like after decompression
        if gf["headers"].get("content-encoding") == "gzip" and not gzip_magic_number(body):
            del gf["headers"]["content-encoding"]
        elif not gf["headers"].get("content-encoding") and gzip_magic_number(body):
            gf["headers"]["content-encoding"] = "gzip"
        headers = Headers([(x, str(y)) for x, y in gf["headers"].items()])
    except AttributeError:
        # headers were not stored as a mapping; synthesise the one header we
        # can infer from the body
        if gzip_magic_number(body):
            headers = Headers((("Content-Encoding", "gzip"),))
        else:
            headers = None
    respcls = responsetypes.from_args(headers=headers, url=url, body=body)
    response = respcls(url=url, headers=headers, status=status, body=body)
    return response
def process_response(self, request, response, spider):
    if request.method == 'HEAD':
        return response
    if isinstance(response, Response):
        content_encoding = response.headers.getlist('Content-Encoding')
        if content_encoding:
            encoding = content_encoding.pop()
            decoded_body = self._decode(response.body, encoding.lower())
            if self.stats:
                self.stats.inc_value('httpcompression/response_bytes',
                                     len(decoded_body), spider=spider)
                self.stats.inc_value('httpcompression/response_count', spider=spider)
            respcls = responsetypes.from_args(headers=response.headers,
                                              url=response.url, body=decoded_body)
            kwargs = dict(cls=respcls, body=decoded_body)
            if issubclass(respcls, TextResponse):
                # force recalculating the encoding until we make sure the
                # responsetypes guessing is reliable
                kwargs['encoding'] = None
            response = response.replace(**kwargs)
            if not content_encoding:
                del response.headers['Content-Encoding']
    return response
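# Wiring sketch for a process_response middleware like the one above: Scrapy
# enables downloader middlewares through the DOWNLOADER_MIDDLEWARES setting,
# keyed by import path with an ordering number. The custom path below is a
# hypothetical placeholder; 590 is the order the stock compression middleware
# uses, so a stats-aware variant slots into the same position.
DOWNLOADER_MIDDLEWARES = {
    "scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware": None,
    "myproject.middlewares.StatsHttpCompressionMiddleware": 590,  # hypothetical
}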
def _build_response(self, result, request, protocol):
    self.result = result
    respcls = responsetypes.from_args(url=request.url)
    protocol.close()
    body = protocol.filename or protocol.body.read()
    headers = {"local filename": protocol.filename or '', "size": protocol.size}
    return respcls(url=request.url, status=200, body=body, headers=headers)
def retrieve_response(self, spider, request): """ Return response if present in cache, or None otherwise. """ key = self._get_key(spider, request) epoch = request.meta.get('epoch') # guaranteed to be True or datetime s3_key = self._get_s3_key(key, epoch) if not s3_key: return log.msg('S3Storage (epoch => %s): retrieving response for %s.' % (epoch, request.url)) try: data_string = s3_key.get_contents_as_string() except boto.exception.S3ResponseError as e: # See store_response for error descriptions raise e finally: s3_key.close() data = pickle.loads(data_string) metadata = data['metadata'] request_headers = data['request_headers'] request_body = data['request_body'] response_headers = data['response_headers'] response_body = data['response_body'] url = metadata['response_url'] status = metadata.get('status') Response = responsetypes.from_args(headers=response_headers, url=url) return Response(url=url, headers=response_headers, status=status, body=response_body)
def _is_bzip2(self, response):
    try:
        body = bz2.decompress(response.body)
    except IOError:
        return
    respcls = responsetypes.from_args(body=body)
    return response.replace(body=body, cls=respcls)
def _build_response(self, body, request):
    if self.invalid_headers:
        raise BadHttpHeaderError('Invalid headers received: %s' % self.invalid_headers)
    request.meta['download_latency'] = self.headers_time - self.start_time
    status = int(self.status)
    headers = Headers(self.response_headers)
    respcls = responsetypes.from_args(headers=headers, url=self.url)
    return respcls(url=self.url, status=status, headers=headers, body=body)
def _cb_bodydone(self, result):
    url, status, headers, body, flags = result
    respcls = responsetypes.from_args(headers=headers, url=url, body=body)
    return respcls(url=url, status=status, headers=headers, body=body, flags=flags)
def _is_gzip(self, response):
    archive = StringIO(response.body)
    try:
        body = gzip.GzipFile(fileobj=archive).read()
    except IOError:
        return
    respcls = responsetypes.from_args(body=body)
    return response.replace(body=body, cls=respcls)
def _build_response(self, body, request):
    request.meta['download_latency'] = self.headers_time - self.start_time
    status = int(self.status)
    headers = Headers(self.response_headers)
    respcls = responsetypes.from_args(headers=headers, url=self._url)
    return respcls(url=self._url, status=status, headers=headers, body=body)
def _is_tar(self, response):
    archive = StringIO(response.body)
    try:
        tar_file = tarfile.open(name=mktemp(), fileobj=archive)
    except tarfile.ReadError:
        return
    body = tar_file.extractfile(tar_file.members[0]).read()
    respcls = responsetypes.from_args(filename=tar_file.members[0].name, body=body)
    return response.replace(body=body, cls=respcls)
def _cb_bodydone(self, result, request, url):
    txresponse, body, flags = result
    status = int(txresponse.code)
    headers = Headers(txresponse.headers.getAllRawHeaders())
    respcls = responsetypes.from_args(headers=headers, url=url, body=body)
    return respcls(url=url, status=status, headers=headers, body=body, flags=flags)
def retrieve_response(self, spider, request):
    data = self._read_data(spider, request)
    if data is None:
        return  # not cached
    url = data['url']
    status = data['status']
    headers = Headers(data['headers'])
    body = data['body']
    respcls = responsetypes.from_args(headers=headers, url=url)
    response = respcls(url=url, headers=headers, status=status, body=body)
    return response
def retrieve_response(self, spider, request):
    doc = self.col.find_one({'url': request.url})
    if doc is None:
        return  # not cached
    status = 200
    url = doc['url']
    headers = Headers(doc['headers'])
    body = doc['body'].encode('utf-8')
    respcls = responsetypes.from_args(headers=headers, url=url)
    response = respcls(url=url, headers=headers, status=status, body=body)
    return response
def response_from_dict(response, spider=None, **kwargs): """Returns a dict based on a response from a spider""" url = response.get("url") status = response.get("status") headers = Headers([(x, map(str, y)) for x, y in response.get("headers").iteritems()]) body = response.get("body") respcls = responsetypes.from_args(headers=headers, url=url) response = respcls(url=url, headers=headers, status=status, body=body) return response
def response_from_dict(response, spider=None, **kwargs): """Returns a dict based on a response from a spider""" url = response.get("url") status = "200" headers = Headers([(x, list(map(str, y))) for x, y in response.get("headers").items()]) body = response.get("body") respcls = responsetypes.from_args(headers=headers, url=url) response = respcls(url=url, headers=headers, status=status, body=body) return response
def retrieve_response(self, spider, request):
    gf = self._get_file(spider, request)
    if gf is None:
        return  # not cached
    url = str(gf.url)
    status = str(gf.status)
    headers = [(x, map(str, y)) for x, y in gf.headers.iteritems()]
    body = gf.read()
    respcls = responsetypes.from_args(headers=headers, url=url)
    response = respcls(url=url, headers=headers, status=status, body=body)
    return response
def _is_zip(self, response):
    archive = StringIO(response.body)
    try:
        zip_file = zipfile.ZipFile(archive)
    except zipfile.BadZipfile:
        return
    namelist = zip_file.namelist()
    body = zip_file.read(namelist[0])
    respcls = responsetypes.from_args(filename=namelist[0], body=body)
    return response.replace(body=body, cls=respcls)
def retrieve_response(self, spider, request):
    key = self._request_key(spider, request)
    gf = self._get_file(spider, key)
    if gf is None:
        return  # not cached
    url = str(gf.url)
    status = str(gf.status)
    headers = Headers([(x, map(str, y)) for x, y in gf.headers.iteritems()])
    body = gf.read()
    respcls = responsetypes.from_args(headers=headers, url=url)
    response = respcls(url=url, headers=headers, status=status, body=body)
    return response
def retrieve_response(self, spider: TSpider, request: TRequest) -> Optional[TResponse]:
    data = self._read_data(spider, request)
    if data is None:
        return  # not cached
    url = data["url"]
    status = data["status"]
    headers = Headers(data["headers"])
    body = data["body"]
    respcls = responsetypes.from_args(headers=headers, url=url)
    response = respcls(url=url, headers=headers, status=status, body=body)
    return response
def _response(self, _, driver, spider): """ handle response of webpage Args: driver: phantomjs driver from selenium's webdriver """ print("driver hash {},finish time:{},url:{}".format( hash(driver), str(datetime.datetime.now()), driver.current_url)) body = driver.execute_script( "return document.documentElement.innerHTML" ) # comapre with outerHTML which has html tag if body.startswith( "<head></head>"): # cannot access response header in Selenium body = driver.execute_script( "return document.documentElement.textContent" ) # only text part, ignore tags url = driver.current_url respcls = responsetypes.from_args(url=url, body=body[:100]) # according to different returning code,construct diffenent response object to other's component if "403 Forbidden" in body or \ "为确认本次访问为正常用户行为,请您协助验证" in body or \ 'You have attempted to use an application which is in violation of your internet usage policy' in body: from scrapy.shell import inspect_response inspect_response(response, self) resp = respcls(url=url, status=403, body=body, encoding="utf-8") elif "<head></head><body></body>" in body or \ "<head><title>500 Internal Server Error</title></head>" in body or \ '<meta name="keywords" content="企业注册信息查询,企业工商信息查询,企业信用查询,企业信息查询">' in body or \ '<title>504 Gateway Time-out</title>' in body or \ '502 Bad Gateway' in body or \ 'Maximum number of open connections reached.' in body or \ 'Gateway Timeout' in body or \ 'Sorry, the page you are looking for is currently unavailable' in body: resp = respcls(url=url, status=503, body=body, encoding="utf-8") else: resp = respcls(url=url, status=200, body=body, encoding="utf-8") #driver.save_screenshot("_response_"+url.split("?")[-1].split("/")[-1][:35]+'.png') response_failed = getattr(spider, "response_failed", None) if response_failed and callable(response_failed) and response_failed( resp, driver): driver.quit() return defer.fail(Failure()) else: if datetime.datetime.now( ).minute % 6 == 0: # handle memory leak every 6 mins driver.quit() self.queue.put(driver) return defer.succeed(resp)
def test_process_response_no_content_type_header(self):
    headers = {
        'Content-Encoding': 'identity',
    }
    plainbody = b"""<html><head><title>Some page</title><meta http-equiv="Content-Type" content="text/html; charset=gb2312">"""
    respcls = responsetypes.from_args(url="http://www.example.com/index",
                                      headers=headers, body=plainbody)
    response = respcls("http://www.example.com/index", headers=headers, body=plainbody)
    request = Request("http://www.example.com/index")
    newresponse = self.mw.process_response(request, response, self.spider)
    assert isinstance(newresponse, respcls)
    self.assertEqual(newresponse.body, plainbody)
    self.assertEqual(newresponse.encoding, resolve_encoding('gb2312'))
def download_response(url, method='GET', tries=5):
    r = requests.request(method, url)
    current_try = 1
    while r.status_code != 200 and current_try < tries:
        r = requests.request(method, url)
        current_try += 1
    # keep the final URL as text (Scrapy responses take str URLs) and wrap the
    # headers so responsetypes can look them up
    headers = Headers(dict(r.headers))
    respcls = responsetypes.from_args(headers=headers, url=r.url, body=r.content)
    response = respcls(url=r.url, headers=headers, status=r.status_code, body=r.content)
    return response
def _cb_bodydone(self, result, request, url): headers = Headers(result["txresponse"].headers.getAllRawHeaders()) respcls = responsetypes.from_args(headers=headers, url=url, body=result["body"]) return respcls( url=url, status=int(result["txresponse"].code), headers=headers, body=result["body"], flags=result["flags"], certificate=result["certificate"], ip_address=result["ip_address"], )
def retrieve_response(self, spider, request): """ Return response if present in cache, or None otherwise. """ key = self._get_key(spider, request) epoch = request.meta.get('epoch') # guaranteed to be True or datetime s3_key = self._get_s3_key(key, epoch) logger.debug('S3Storage retrieving response for key %s.' % (s3_key)) if not s3_key: return logger.info('S3Storage (epoch => %s): retrieving response for %s.' % (epoch, request.url)) try: data_string = s3_key.get_contents_as_string() except boto.exception.S3ResponseError as e: # See store_response for error descriptions raise e finally: s3_key.close() data = json.loads(data_string) metadata = data['metadata'] request_headers = Headers(data['request_headers']) request_body = data['request_body'] response_headers = Headers(data['response_headers']) response_body = data['response_body'] if 'binary' in data and data['binary'] == True: logger.debug('S3Storage: retrieved binary body') response_body = base64.decode(response_body) url = metadata['response_url'] status = metadata.get('status') logger.debug( 'S3Storage: response headers {} '.format(response_headers)) Response = responsetypes.from_args(headers=response_headers, url=url, body=response_body) logger.debug('S3Storage: response type {} '.format(Response)) return Response(url=url, headers=response_headers, status=status, body=response_body)
def _response(self, _, driver, spider):
    body = driver.execute_script("return document.documentElement.innerHTML")
    if body.startswith("<head></head>"):  # cannot access response headers in Selenium
        body = driver.execute_script("return document.documentElement.textContent")
    url = driver.current_url
    respcls = responsetypes.from_args(url=url, body=body[:100].encode('utf8'))
    resp = respcls(url=url, body=body, encoding="utf-8")
    response_failed = getattr(spider, "response_failed", None)
    if response_failed and callable(response_failed) and response_failed(resp, driver):
        driver.close()
        return defer.fail(Failure())
    else:
        self.queue.put(driver)
        return defer.succeed(resp)
def retrieve_response(self, spider, request): """Return response if present in cache, or None otherwise.""" try: doc = json.loads(self.bucket.get(self._inverse_url(request.url))[2]) except couchbase.exception.MemcachedError: return # @TODO expiration body = doc['response_body'] url = doc['response_url'] status = doc['status'] headers = Headers(headers_raw_to_dict(doc['response_headers'])) encoding = doc['encoding'] respcls = responsetypes.from_args(headers=headers, url=url) response = respcls(url=url, headers=headers, status=status, body=body, encoding=encoding) return response
def retrieve_response(self, spider, request): """Return response if present in cache, or None otherwise.""" metadata = self._read_meta(spider, request) if metadata is None: return # not cached rpath = self._get_request_path(spider, request) with open(join(rpath, "response_body"), "rb") as f: body = f.read() with open(join(rpath, "response_headers"), "rb") as f: rawheaders = f.read() url = metadata.get("response_url") status = metadata["status"] headers = Headers(headers_raw_to_dict(rawheaders)) respcls = responsetypes.from_args(headers=headers, url=url) response = respcls(url=url, headers=headers, status=status, body=body) return response
def retrieve_response(self, spider, request):
    key = request_fingerprint(request)
    value = self.conn.hget(self.name, key)
    if not value:
        return
    value_arr = value.split(self.separator)
    stored_data = json.loads(value_arr[0])
    metadata = stored_data['metadata']
    body = str(value_arr[2])
    rawheaders = stored_data['response_headers']
    url = str(metadata['response_url'])
    status = str(metadata['status'])
    headers = Headers(rawheaders)
    respcls = responsetypes.from_args(headers=headers, url=url)
    response = respcls(url=url, headers=headers, status=status, body=body)
    return response
def retrieve_response(self, spider, request): """Return response if present in cache, or None otherwise.""" metadata = self._read_meta(spider, request) if metadata is None: return # not cached rpath = self._get_request_path(spider, request) with self._open(os.path.join(rpath, 'response_body'), 'rb') as f: body = f.read() with self._open(os.path.join(rpath, 'response_headers'), 'rb') as f: rawheaders = f.read() url = metadata.get('response_url') status = metadata['status'] headers = Headers(headers_raw_to_dict(rawheaders)) respcls = responsetypes.from_args(headers=headers, url=url) response = respcls(url=url, headers=headers, status=status, body=body) return response
def process_response(self, request, response, spider):
    if isinstance(response, Response):
        content_encoding = response.headers.getlist('Content-Encoding')
        if content_encoding:
            encoding = content_encoding.pop()
            decoded_body = self._decode(response.body, encoding.lower())
            respcls = responsetypes.from_args(headers=response.headers, url=response.url)
            kwargs = dict(cls=respcls, body=decoded_body)
            if issubclass(respcls, TextResponse):
                # force recalculating the encoding until we make sure the
                # responsetypes guessing is reliable
                kwargs['encoding'] = None
            response = response.replace(**kwargs)
            if not content_encoding:
                del response.headers['Content-Encoding']
    return response
def retrieve_response(self, spider, request): """ Return response if present in cache, or None otherwise. """ key = self._get_request_storage_key(spider, request) epoch = request.meta.get('epoch') # guaranteed to be True or datetime s3_key = self._get_s3_key(key, epoch) logger.debug('Retrieving response for key {}.'.format(s3_key)) if not s3_key: return try: data_string = s3_key.get_contents_as_string() except boto.exception.S3ResponseError as e: # See store_response for error descriptions raise e finally: s3_key.close() data = json.loads(data_string) metadata = data['metadata'] response_headers = Headers(data['response_headers']) response_body = data['response_body'] if data.get('binary', False): logger.debug('retrieved binary body') response_body = base64.b64decode(response_body.decode('utf8')) encoding = {} else: encoding = {'encoding': 'utf8'} url = str(metadata['response_url']) status = metadata.get('status') Response = responsetypes.from_args(headers=response_headers, url=url) return Response(url=url, headers=response_headers, status=status, body=response_body, **encoding)
def response_requests2scrapy(requests_response, encoding=None):
    # Resolve the target encoding: the string sentinels pick one of the
    # requests attributes explicitly; otherwise prefer the declared encoding
    # and fall back to the detected (apparent) one. The original branch order
    # made the sentinel branches unreachable and inverted the fallback.
    if encoding == 'apparent_encoding':
        encoding = requests_response.apparent_encoding
    elif encoding == 'encoding':
        encoding = requests_response.encoding
    elif encoding is None:
        if requests_response.encoding is not None:
            encoding = requests_response.encoding
        else:
            encoding = requests_response.apparent_encoding
    # Scrapy expects a text URL, so apply the resolved encoding to the
    # response object instead of encoding the URL to bytes.
    url = requests_response.url
    status = requests_response.status_code
    headers = dict(requests_response.headers)
    body = requests_response.content
    request = request_requests2scrapy(requests_response.request)
    respcls = responsetypes.from_args(headers=headers, url=url, body=body)
    kwargs = {}
    if issubclass(respcls, TextResponse):
        kwargs['encoding'] = encoding
    scrapy_response = respcls(url, status=status, headers=headers, body=body,
                              request=request, **kwargs)
    return scrapy_response
def retrieve_response(self, spider, request): """Return response if present in cache, or None otherwise.""" key = self._request_key(request) data = self.col.find_one({'key': key}) if not data: # not cache return # expiration? mtime = data['meta']['timestamp'] if 0 < self.expiration_secs < time() - float(mtime): return # expired # retrieve body = data['response_body'] url = str(data.get('url')) status = data['meta']['status'] headers = Headers(data['response_headers']) respcls = responsetypes.from_args(headers=headers, url=url) response = respcls(url=url, headers=headers, status=status, body=body) return response
def download_request(self, request, spider):
    filepath = file_uri_to_path(request.url)
    with open(filepath, 'rb') as f:
        body = f.read()
    respcls = responsetypes.from_args(filename=filepath, body=body)
    return respcls(url=request.url, body=body)
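# Registration sketch: Scrapy picks a download handler by URL scheme through
# the DOWNLOAD_HANDLERS setting, so a file handler like the one above is
# mapped to the "file" scheme. The import path is a hypothetical placeholder.
DOWNLOAD_HANDLERS = {
    "file": "myproject.handlers.FileDownloadHandler",  # hypothetical
}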
def _build_response(self, body, request):
    request.meta['download_latency'] = self.headers_time - self.start_time
    status = int(self.status)
    headers = Headers(self.response_headers)
    respcls = responsetypes.from_args(headers=headers, url=self.url)
    return respcls(url=self.url, status=status, headers=headers, body=body)
def _cb_bodydone(self, result, request, url):
    txresponse, body, flags = result
    status = int(txresponse.code)
    headers = Headers(txresponse.headers.getAllRawHeaders())
    respcls = responsetypes.from_args(headers=headers, url=url)
    return respcls(url=url, status=status, headers=headers, body=body, flags=flags)