Example 1
File: server.py Project: zasfe/pol
 def buildScrapyResponse(self, response, body, url):
     status = response.code
     headers = Headers(
         {k: ','.join(v)
          for k, v in response.headers.getAllRawHeaders()})
     respcls = responsetypes.from_args(headers=headers, url=url)
     return respcls(url=url, status=status, headers=headers, body=body)
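All of these examples funnel into `responsetypes.from_args`. As a quick reference, a minimal sketch of how it resolves a response class, using only the public Scrapy API (the URL and header values are illustrative):

from scrapy.http import Headers
from scrapy.responsetypes import responsetypes

# the Content-Type header takes precedence over the URL extension
cls = responsetypes.from_args(
    headers=Headers({'Content-Type': ['text/html; charset=utf-8']}),
    url='http://www.example.com/data.csv')
print(cls)  # <class 'scrapy.http.response.html.HtmlResponse'>

# with no headers, the URL extension is consulted instead
print(responsetypes.from_args(url='http://www.example.com/data.csv'))  # TextResponse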
Example 2
 def _download_request(self, request, spider):
     proxies = {}
     proxy = request.meta.get('proxy', '')
     if proxy:
         for p in self.proxies:
             if p.find(proxy) != -1:
                 scheme = urlparse(p).scheme
                 proxies[scheme] = p
                 break
     timeout = request.meta.get('download_timeout', self.timeout)
     url = request.url
     method = request.method
     headers = headers_scrapy2dict(request.headers)
     data = request.body
     session = self._session or requests.sessions.Session()
     st = time.time()
     requests_response = session.request(method, url, headers=headers, data=data, timeout=timeout, proxies=proxies)
     et = time.time()
     cost = et - st
     request.meta['download_latency'] = cost
     headers = Headers(dict(requests_response.headers))
     respcls = responsetypes.from_args(headers=headers,
                                       url=requests_response.url,
                                       body=requests_response.content)
     # requests may report encoding=None (e.g. for binary responses); fall back to utf-8
     response_url = requests_response.url.encode(requests_response.encoding or 'utf-8')
     response = respcls(url=response_url,
                        status=requests_response.status_code,
                        headers=headers,
                        body=requests_response.content, )
     return response
Example 3
    def process_response(self, request, response, spider):

        if request.method == 'HEAD':
            return response
        if isinstance(response, Response):
            content_encoding = response.headers.getlist('Content-Encoding')
            if content_encoding:
                encoding = content_encoding.pop()
                decoded_body = self._decode(response.body, encoding.lower())
                if self._max_size and len(decoded_body) > self._max_size:
                    error_msg = ("Cancelling processing of %(url)s: "
                                 "Uncompressed response size %(size)s larger "
                                 "than max allowed size (%(maxsize)s).")
                    error_args = {
                        'url': response.url,
                        'size': len(decoded_body),
                        'maxsize': self._max_size
                    }
                    logger.error(error_msg, error_args)
                    raise IgnoreRequest(error_msg % error_args)

                respcls = responsetypes.from_args(headers=response.headers,
                                                  url=response.url,
                                                  body=decoded_body)
                kwargs = dict(cls=respcls, body=decoded_body)
                if issubclass(respcls, TextResponse):
                    # force recalculating the encoding until we make sure the
                    # responsetypes guessing is reliable
                    kwargs['encoding'] = None
                response = response.replace(**kwargs)
                if not content_encoding:
                    del response.headers['Content-Encoding']

        return response
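The decompression middlewares in this and later examples rely on a `_decode` helper that is not shown. A minimal sketch covering only gzip and deflate (the real Scrapy middleware also handles brotli where available):

import gzip
import zlib

def _decode(body, encoding):
    # body: raw response bytes; encoding: lower-cased Content-Encoding value as bytes
    if encoding in (b'gzip', b'x-gzip'):
        return gzip.decompress(body)
    if encoding == b'deflate':
        try:
            return zlib.decompress(body)
        except zlib.error:
            # some servers send raw deflate data without the zlib header
            return zlib.decompress(body, -15)
    return body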
Example 4
 def test_from_args(self):
     # TODO: add more tests that check precedence between the different arguments
     mappings = [
         ({'url': 'http://www.example.com/data.csv'}, TextResponse),
         # headers takes precedence over url
         ({'headers': Headers({'Content-Type': ['text/html; charset=utf-8']}),
           'url': 'http://www.example.com/item/'}, HtmlResponse),
         ({'headers': Headers({'Content-Disposition': ['attachment; filename="data.xml.gz"']}),
           'url': 'http://www.example.com/page/'}, Response),
     ]
     for source, cls in mappings:
         retcls = responsetypes.from_args(**source)
         assert retcls is cls, "%s ==> %s != %s" % (source, retcls, cls)
Example 5
    def retrieve_response(self, spider, request):
        response = super(S3CacheStorage, self).retrieve_response(spider, request)
        if response is None:  # not in local filesystem cache, so try copying from s3
            local_path = self._get_request_path(spider, request)
            remote_path = os.path.relpath(local_path, self.tmpcachedir).lower()
            bucket = self.conn.get_bucket(self.bucket_name, validate=False)

            def _get_key(filename):
                key_name = os.path.join(remote_path, filename)
                return bucket.get_key(key_name)

            # check if the key exists
            metadata_key = _get_key('pickled_meta')
            if metadata_key is None:
                return None  # key not found

            # check if the cache entry has expired
            mtime = parse_ts(metadata_key.last_modified)
            if 0 < self.expiration_secs < (datetime.datetime.utcnow() - mtime).total_seconds():
                return None  # expired

            # deserialise the cached response
            metadata = pickle.loads(metadata_key.get_contents_as_string())
            body = _get_key('response_body').get_contents_as_string()
            rawheaders = _get_key('response_headers').get_contents_as_string()
            url = metadata.get('response_url')
            status = metadata['status']
            headers = Headers(headers_raw_to_dict(rawheaders))
            respcls = responsetypes.from_args(headers=headers, url=url)
            response = respcls(url=url, headers=headers, status=status, body=body)

        return response
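Several cache storages here rebuild Headers from raw header bytes via `headers_raw_to_dict`. A small round trip using the w3lib implementation (the header values are illustrative):

from scrapy.http import Headers
from w3lib.http import headers_raw_to_dict

raw = b'Content-Type: text/html; charset=utf-8\r\nContent-Length: 1024'
headers = Headers(headers_raw_to_dict(raw))
print(headers.get('Content-Type'))  # b'text/html; charset=utf-8'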
Example 6
 def _build_response(self, result, request, protocol):
     self.result = result
     respcls = responsetypes.from_args(url=request.url)
     protocol.close()
     body = protocol.filename or protocol.body.read()
     headers = {"local filename": protocol.filename or '', "size": protocol.size}
     return respcls(url=request.url, status=200, body=to_bytes(body), headers=headers)
Example 7
    async def _download_request_page(self, request: Request, spider: Spider,
                                     page: Page) -> Response:
        self.stats.inc_value("pyppeteer/page_count")
        if self.navigation_timeout is not None:
            page.setDefaultNavigationTimeout(self.navigation_timeout)
        await page.setRequestInterception(True)
        page.on(
            "request",
            partial(_request_handler, scrapy_request=request,
                    stats=self.stats))
        page.on("response", partial(_response_handler, stats=self.stats))

        start_time = time()
        response = await page.goto(request.url)

        page_coroutines = request.meta.get("pyppeteer_page_coroutines") or ()
        if isinstance(page_coroutines, dict):
            page_coroutines = page_coroutines.values()
        for pc in page_coroutines:
            if isinstance(pc, PageCoroutine):
                method = getattr(page, pc.method)

                # set PageCoroutine timeout
                if self.page_coroutine_timeout is not None and not pc.kwargs.get(
                        "timeout", None):
                    pc.kwargs["timeout"] = self.page_coroutine_timeout

                if isinstance(pc, NavigationPageCoroutine):
                    await asyncio.gather(page.waitForNavigation(),
                                         method(*pc.args, **pc.kwargs))
                else:
                    pc.result = await method(*pc.args, **pc.kwargs)

        body = (await page.content()).encode("utf8")
        request.meta["download_latency"] = time() - start_time

        callback = request.callback or spider.parse
        annotations = getattr(callback, "__annotations__", {})
        for key, value in annotations.items():
            if value is pyppeteer.page.Page:
                request.cb_kwargs[key] = page
                self.stats.inc_value("pyppeteer/page_count/injected_callback")
                break
        else:
            await page.close()
            self.stats.inc_value("pyppeteer/page_count/closed")

        headers = Headers(response.headers)
        headers.pop("Content-Encoding", None)
        respcls = responsetypes.from_args(headers=headers,
                                          url=page.url,
                                          body=body)
        return respcls(
            url=page.url,
            status=response.status,
            headers=headers,
            body=body,
            request=request,
            flags=["pyppeteer"],
        )
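A hedged usage sketch for the handler above: page actions are queued via the `pyppeteer_page_coroutines` request meta key. The `PageCoroutine` import path below is an assumption based on the scrapy-pyppeteer package and may differ between versions:

from scrapy import Request
# assumed import path; adjust to wherever PageCoroutine lives in your project
from scrapy_pyppeteer.page import PageCoroutine, NavigationPageCoroutine

request = Request(
    "https://example.org",
    meta={"pyppeteer_page_coroutines": [
        # wait for an element, then click a link that triggers navigation
        PageCoroutine("waitForSelector", "div.content"),
        NavigationPageCoroutine("click", "a.next"),
    ]},
)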
Example 8
    def process_response(self, request, response, spider):
        if request.method == 'HEAD':
            return response
        if isinstance(response, Response):
            if response.url.endswith('.xml.gz'):
                response.headers.setlist('Content-Encoding', [
                    b'gzip',
                ])
                response.headers.setlist('Content-Type', [
                    b'application/xml',
                ])
            content_encoding = response.headers.getlist('Content-Encoding')
            if content_encoding and not is_gzipped(response):
                encoding = content_encoding.pop()
                decoded_body = self._decode(response.body, encoding.lower())
                respcls = responsetypes.from_args(headers=response.headers,
                                                  url=response.url)
                kwargs = dict(cls=respcls, body=decoded_body)
                if issubclass(respcls, TextResponse):
                    # force recalculating the encoding until we make sure the
                    # responsetypes guessing is reliable
                    kwargs['encoding'] = None
                response = response.replace(**kwargs)
                if not content_encoding:
                    del response.headers['Content-Encoding']

        return response
Example 9
    def retrieve_response(self, spider, request):
        """Return response if present in cache, or None otherwise."""
        key = self._get_request_key(spider, request)

        expiration_time = self._get_expiration_time(spider)
        metadata = self._read_meta(key, expiration_time)
        if metadata is None:
            return  # not cached
        spider.log("%s: found cache for %s" %
                   (self.__class__.__name__, request.url))

        res = self._get_cached_data(key, spider)

        if res is None:
            return None

        spider.log("%s: got response from cache for %s" %
                   (self.__class__.__name__, request.url))

        response_url = res['response_url']
        response_body = str(res['response_body'])
        response_rawheaders = res['response_meta']['headers']
        status = res['response_meta']['status']
        response_headers = Headers(headers_raw_to_dict(response_rawheaders))
        response_cls = responsetypes.from_args(headers=response_headers,
                                               url=response_url)
        response = response_cls(url=response_url,
                                headers=response_headers,
                                status=status,
                                body=response_body)
        return response
Example 10
 def _cb_bodydone(self, result, request, url):
     txresponse, body, flags = result
     status = int(txresponse.code)
     headers = Headers(txresponse.headers.getAllRawHeaders())
     respcls = responsetypes.from_args(headers=headers, url=url, body=body)
     # print(respcls)  # <class 'scrapy.http.response.html.HtmlResponse'>
     return respcls(url=url, status=status, headers=headers, body=body, flags=flags)
Example 11
    def retrieve_response(self, spider, request):
        key = self._request_key(spider, request)
        try:
            gf = self.fs[spider].fetchDocument(key, rawResults=True)
            print("found\t" + str(request.url), request.priority)
        except pyArango.theExceptions.DocumentNotFoundError:
            print("not found\t" + str(request.url), request.priority)
            return
        url = str(gf["url"])
        status = str(gf["status"])

        bod = gf["body"].encode("utf-8")
        body = zlib.decompress(base64.urlsafe_b64decode(bod))
        try:
            if gf["headers"].get("content-encoding"
                                 ) == "gzip" and not gzip_magic_number(body):
                del gf["headers"]["content-encoding"]
            elif not gf["headers"].get(
                    "content-encoding") and gzip_magic_number(body):
                gf["headers"]["content-encoding"] = "gzip"
            headers = Headers([(x, str(y)) for x, y in gf["headers"].items()])
        except AttributeError:
            if gzip_magic_number(body):
                #print("added headers")
                headers = Headers((("Content-Encoding", "gzip"), ))
            else:
                #print("headers=None")
                headers = None

        respcls = responsetypes.from_args(headers=headers, url=url, body=body)

        response = respcls(url=url, headers=headers, status=status, body=body)
        return response
Example 12
    def process_response(self, request, response, spider):

        if request.method == 'HEAD':
            return response
        if isinstance(response, Response):
            content_encoding = response.headers.getlist('Content-Encoding')
            if content_encoding:
                encoding = content_encoding.pop()
                decoded_body = self._decode(response.body, encoding.lower())
                if self.stats:
                    self.stats.inc_value('httpcompression/response_bytes',
                                         len(decoded_body),
                                         spider=spider)
                    self.stats.inc_value('httpcompression/response_count',
                                         spider=spider)
                respcls = responsetypes.from_args(headers=response.headers,
                                                  url=response.url,
                                                  body=decoded_body)
                kwargs = dict(cls=respcls, body=decoded_body)
                if issubclass(respcls, TextResponse):
                    # force recalculating the encoding until we make sure the
                    # responsetypes guessing is reliable
                    kwargs['encoding'] = None
                response = response.replace(**kwargs)
                if not content_encoding:
                    del response.headers['Content-Encoding']

        return response
Example 13
File: ftp.py Project: 611953/scrapy
 def _build_response(self, result, request, protocol):
     self.result = result
     respcls = responsetypes.from_args(url=request.url)
     protocol.close()
     body = protocol.filename or protocol.body.read()
     headers = {"local filename": protocol.filename or '', "size": protocol.size}
     return respcls(url=request.url, status=200, body=body, headers=headers)
Example 14
    def retrieve_response(self, spider, request):
        """
        Return response if present in cache, or None otherwise.
        """
        key = self._get_key(spider, request)

        epoch = request.meta.get('epoch') # guaranteed to be True or datetime
        s3_key = self._get_s3_key(key, epoch)

        if not s3_key:
            return

        log.msg('S3Storage (epoch => %s): retrieving response for %s.' % (epoch, request.url))
        try:
            data_string = s3_key.get_contents_as_string()
        except boto.exception.S3ResponseError as e:
            # See store_response for error descriptions
            raise e
        finally:
            s3_key.close()

        data = pickle.loads(data_string)

        metadata         = data['metadata']
        request_headers  = data['request_headers']
        request_body     = data['request_body']
        response_headers = data['response_headers']
        response_body    = data['response_body']

        url      = metadata['response_url']
        status   = metadata.get('status')
        Response = responsetypes.from_args(headers=response_headers, url=url)
        return Response(url=url, headers=response_headers, status=status, body=response_body)
Example 15
    def _is_bzip2(self, response):
        try:
            body = bz2.decompress(response.body)
        except IOError:
            return

        respcls = responsetypes.from_args(body=body)
        return response.replace(body=body, cls=respcls)
Example 16
 def _build_response(self, body, request):
     if self.invalid_headers:
         raise BadHttpHeaderError('Invalid headers received: %s' % self.invalid_headers)
     request.meta['download_latency'] = self.headers_time-self.start_time
     status = int(self.status)
     headers = Headers(self.response_headers)
     respcls = responsetypes.from_args(headers=headers, url=self.url)
     return respcls(url=self.url, status=status, headers=headers, body=body)
Example 17
 def _cb_bodydone(self, result):
     url, status, headers, body, flags = result
     respcls = responsetypes.from_args(headers=headers, url=url, body=body)
     return respcls(url=url,
                    status=status,
                    headers=headers,
                    body=body,
                    flags=flags)
Example 18
    def _is_gzip(self, response):
        archive = StringIO(response.body)
        try:
            body = gzip.GzipFile(fileobj=archive).read()
        except IOError:
            return

        respcls = responsetypes.from_args(body=body)
        return response.replace(body=body, cls=respcls)
Example 19
 def _build_response(self, body, request):
     request.meta['download_latency'] = self.headers_time - self.start_time
     status = int(self.status)
     headers = Headers(self.response_headers)
     respcls = responsetypes.from_args(headers=headers, url=self._url)
     return respcls(url=self._url,
                    status=status,
                    headers=headers,
                    body=body)
Example 20
    def _is_tar(self, response):
        archive = StringIO(response.body)
        try:
            tar_file = tarfile.open(name=mktemp(), fileobj=archive)
        except tarfile.ReadError:
            return

        body = tar_file.extractfile(tar_file.members[0]).read()
        respcls = responsetypes.from_args(filename=tar_file.members[0].name, body=body)
        return response.replace(body=body, cls=respcls)
Example 21
 def _cb_bodydone(self, result, request, url):
     txresponse, body, flags = result
     status = int(txresponse.code)
     headers = Headers(txresponse.headers.getAllRawHeaders())
     respcls = responsetypes.from_args(headers=headers, url=url, body=body)
     return respcls(url=url,
                    status=status,
                    headers=headers,
                    body=body,
                    flags=flags)
Example 22
 def retrieve_response(self, spider, request):
     data = self._read_data(spider, request)
     if data is None:
         return  # not cached
     url = data['url']
     status = data['status']
     headers = Headers(data['headers'])
     body = data['body']
     respcls = responsetypes.from_args(headers=headers, url=url)
     response = respcls(url=url, headers=headers, status=status, body=body)
     return response
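The `_read_data` helper in this and the later storage examples is not shown. A hypothetical, JSON-file-backed equivalent illustrating the dict shape the callers expect (the function name and on-disk layout are assumptions for illustration):

import json
import os

def read_cached_data(cache_dir, fingerprint):
    """Hypothetical _read_data equivalent: one JSON file per request fingerprint."""
    path = os.path.join(cache_dir, fingerprint, 'data.json')
    if not os.path.exists(path):
        return None  # not cached
    with open(path, 'r') as f:
        # expected keys: 'url', 'status', 'headers', 'body'
        return json.load(f)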
Example 23
 def retrieve_response(self, spider, request):
     doc = self.col.find_one({'url': request.url})
     if doc is None:
         return  # not cached
     status = 200
     url = doc['url']
     headers = Headers(doc['headers'])
     body = doc['body'].encode('utf-8')
     respcls = responsetypes.from_args(headers=headers, url=url)
     response = respcls(url=url, headers=headers, status=status, body=body)
     return response
Example 24
def response_from_dict(response, spider=None, **kwargs):
    """Returns a dict based on a response from a spider"""
    url = response.get("url")
    status = response.get("status")
    headers = Headers([(x, map(str, y)) for x, y in 
                    response.get("headers").iteritems()])
    body = response.get("body")

    respcls = responsetypes.from_args(headers=headers, url=url)
    response = respcls(url=url, headers=headers, status=status, body=body)
    return response
Example 25
def response_from_dict(response, spider=None, **kwargs):
    """Returns a dict based on a response from a spider"""
    url = response.get("url")
    status = "200"
    headers = Headers([(x, list(map(str, y))) for x, y in
                       response.get("headers").items()])
    body = response.get("body")

    respcls = responsetypes.from_args(headers=headers, url=url)
    response = respcls(url=url, headers=headers, status=status, body=body)
    return response
Example 26
 def retrieve_response(self, spider, request):
     gf = self._get_file(spider, request)
     if gf is None:
         return # not cached
     url = str(gf.url)
     status = str(gf.status)
     headers = [(x, map(str, y)) for x, y in gf.headers.iteritems()]
     body = gf.read()
     respcls = responsetypes.from_args(headers=headers, url=url)
     response = respcls(url=url, headers=headers, status=status, body=body)
     return response
Example 27
 def retrieve_response(self, spider, request):
     data = self._read_data(spider, request)
     if data is None:
         return # not cached
     url = data['url']
     status = data['status']
     headers = Headers(data['headers'])
     body = data['body']
     respcls = responsetypes.from_args(headers=headers, url=url)
     response = respcls(url=url, headers=headers, status=status, body=body)
     return response
Example 28
    def _is_zip(self, response):
        archive = StringIO(response.body)
        try:
            zip_file = zipfile.ZipFile(archive)
        except zipfile.BadZipfile:
            return

        namelist = zip_file.namelist()
        body = zip_file.read(namelist[0])
        respcls = responsetypes.from_args(filename=namelist[0], body=body)
        return response.replace(body=body, cls=respcls)
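The archive probes in these examples (`_is_bzip2`, `_is_gzip`, `_is_tar`, `_is_zip`) are Python 2 era code (`StringIO`, `IOError`). For comparison, a Python 3 sketch of the zip variant:

import zipfile
from io import BytesIO

from scrapy.responsetypes import responsetypes

def _is_zip(self, response):
    try:
        zip_file = zipfile.ZipFile(BytesIO(response.body))
    except zipfile.BadZipFile:
        return None
    namelist = zip_file.namelist()
    body = zip_file.read(namelist[0])
    # guess the response class from the inner file's name and content
    respcls = responsetypes.from_args(filename=namelist[0], body=body)
    return response.replace(body=body, cls=respcls)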
Example 29
 def retrieve_response(self, spider, request):
     doc = self.col.find_one({'url': request.url})
     if doc is None:
         return  # not cached
     status = 200
     url = doc['url']
     headers = Headers(doc['headers'])
     body = doc['body'].encode('utf-8')
     respcls = responsetypes.from_args(headers=headers, url=url)
     response = respcls(url=url, headers=headers, status=status, body=body)
     return response
Example 30
 def retrieve_response(self, spider, request):
     key = self._request_key(spider, request)
     gf = self._get_file(spider, key)
     if gf is None:
         return # not cached
     url = str(gf.url)
     status = str(gf.status)
     headers = Headers([(x, map(str, y)) for x, y in gf.headers.iteritems()])
     body = gf.read()
     respcls = responsetypes.from_args(headers=headers, url=url)
     response = respcls(url=url, headers=headers, status=status, body=body)
     return response
Example 31
 def retrieve_response(self, spider: TSpider,
                       request: TRequest) -> Optional[TResponse]:
     data = self._read_data(spider, request)
     if data is None:
         return  # not cached
     url = data["url"]
     status = data["status"]
     headers = Headers(data["headers"])
     body = data["body"]
     respcls = responsetypes.from_args(headers=headers, url=url)
     response = respcls(url=url, headers=headers, status=status, body=body)
     return response
Example 32
    def _response(self, _, driver, spider):
        """ handle response of webpage

            Args:
                driver: phantomjs driver from selenium's webdriver

        """
        print("driver hash {},finish time:{},url:{}".format(
            hash(driver), str(datetime.datetime.now()), driver.current_url))
        body = driver.execute_script(
            "return document.documentElement.innerHTML"
        )  # compare with outerHTML, which includes the <html> tag
        if body.startswith(
                "<head></head>"):  # cannot access response headers in Selenium
            body = driver.execute_script(
                "return document.documentElement.textContent"
            )  # only the text content, ignoring tags
        url = driver.current_url
        respcls = responsetypes.from_args(url=url, body=body[:100])

        # depending on the returned content, construct a different response object for downstream components
        if  "403 Forbidden" in body or \
            "为确认本次访问为正常用户行为,请您协助验证" in body or \
            'You have attempted to use an application which is in violation of your internet usage policy' in body:
            # debugging hook disabled: `response` is not defined at this point
            # from scrapy.shell import inspect_response
            # inspect_response(resp, self)
            resp = respcls(url=url, status=403, body=body, encoding="utf-8")
        elif  "<head></head><body></body>" in body or \
                    "<head><title>500 Internal Server Error</title></head>" in body or \
                    '<meta name="keywords" content="企业注册信息查询,企业工商信息查询,企业信用查询,企业信息查询">' in body or \
                    '<title>504 Gateway Time-out</title>' in body or \
                    '502 Bad Gateway' in body or \
                    'Maximum number of open connections reached.' in body or \
                    'Gateway Timeout' in body or \
                    'Sorry, the page you are looking for is currently unavailable' in body:
            resp = respcls(url=url, status=503, body=body, encoding="utf-8")
        else:
            resp = respcls(url=url, status=200, body=body, encoding="utf-8")

        #driver.save_screenshot("_response_"+url.split("?")[-1].split("/")[-1][:35]+'.png')

        response_failed = getattr(spider, "response_failed", None)
        if response_failed and callable(response_failed) and response_failed(
                resp, driver):
            driver.quit()
            return defer.fail(Failure())
        else:
            if datetime.datetime.now().minute % 6 == 0:  # handle memory leak every 6 mins
                driver.quit()
            self.queue.put(driver)
            return defer.succeed(resp)
Example 33
    def test_from_args(self):
        # TODO: add more tests that check precedence between the different arguments
        mappings = [
            ({'url': 'http://www.example.com/data.csv'}, TextResponse),
            # headers takes precedence over url
            ({'headers': Headers({'Content-Type': ['text/html; charset=utf-8']}), 'url': 'http://www.example.com/item/'}, HtmlResponse),
            ({'headers': Headers({'Content-Disposition': ['attachment; filename="data.xml.gz"']}), 'url': 'http://www.example.com/page/'}, Response),
        ]
        for source, cls in mappings:
            retcls = responsetypes.from_args(**source)
            assert retcls is cls, "%s ==> %s != %s" % (source, retcls, cls)
Example 34
    def test_process_response_no_content_type_header(self):
        headers = {
            'Content-Encoding': 'identity',
        }
        plainbody = b"""<html><head><title>Some page</title><meta http-equiv="Content-Type" content="text/html; charset=gb2312">"""
        respcls = responsetypes.from_args(url="http://www.example.com/index", headers=headers, body=plainbody)
        response = respcls("http://www.example.com/index", headers=headers, body=plainbody)
        request = Request("http://www.example.com/index")

        newresponse = self.mw.process_response(request, response, self.spider)
        assert isinstance(newresponse, respcls)
        self.assertEqual(newresponse.body, plainbody)
        self.assertEqual(newresponse.encoding, resolve_encoding('gb2312'))
Example 35
    def test_process_response_no_content_type_header(self):
        headers = {
            'Content-Encoding': 'identity',
        }
        plainbody = b"""<html><head><title>Some page</title><meta http-equiv="Content-Type" content="text/html; charset=gb2312">"""
        respcls = responsetypes.from_args(url="http://www.example.com/index", headers=headers, body=plainbody)
        response = respcls("http://www.example.com/index", headers=headers, body=plainbody)
        request = Request("http://www.example.com/index")

        newresponse = self.mw.process_response(request, response, self.spider)
        assert isinstance(newresponse, respcls)
        self.assertEqual(newresponse.body, plainbody)
        self.assertEqual(newresponse.encoding, resolve_encoding('gb2312'))
Example 36
def download_response(url, method='GET', tries=5):
    r = requests.request(method, url)
    current_try = 1
    while r.status_code != 200 and current_try < tries:
        r = requests.request(method, url)
        current_try += 1
    url = r.url.encode('utf-8')
    headers = r.headers.items()
    respcls = responsetypes.from_args(headers=headers, url=url, body=r.content)
    response = respcls(url=url,
                       headers=headers,
                       status=r.status_code,
                       body=r.content)
    return response
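One caveat with the example above: `from_args` probes `headers` with a `Content-Type` key lookup, so passing the plain list from `r.headers.items()` can silently defeat header-based detection. Wrapping the headers in Scrapy's Headers mapping is safer; a self-contained sketch (the URL is illustrative):

import requests
from scrapy.http import Headers
from scrapy.responsetypes import responsetypes

r = requests.get('http://www.example.com/')
headers = Headers(dict(r.headers))  # a mapping, so the Content-Type lookup works
respcls = responsetypes.from_args(headers=headers, url=r.url, body=r.content)
response = respcls(url=r.url, status=r.status_code, headers=headers, body=r.content)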
Example 37
 def _cb_bodydone(self, result, request, url):
     headers = Headers(result["txresponse"].headers.getAllRawHeaders())
     respcls = responsetypes.from_args(headers=headers,
                                       url=url,
                                       body=result["body"])
     return respcls(
         url=url,
         status=int(result["txresponse"].code),
         headers=headers,
         body=result["body"],
         flags=result["flags"],
         certificate=result["certificate"],
         ip_address=result["ip_address"],
     )
Example 38
    def retrieve_response(self, spider, request):
        """
        Return response if present in cache, or None otherwise.
        """
        key = self._get_key(spider, request)

        epoch = request.meta.get('epoch')  # guaranteed to be True or datetime
        s3_key = self._get_s3_key(key, epoch)
        logger.debug('S3Storage retrieving response for key %s.' % (s3_key))

        if not s3_key:
            return

        logger.info('S3Storage (epoch => %s): retrieving response for %s.' %
                    (epoch, request.url))
        try:
            data_string = s3_key.get_contents_as_string()
        except boto.exception.S3ResponseError as e:
            # See store_response for error descriptions
            raise e
        finally:
            s3_key.close()

        data = json.loads(data_string)

        metadata = data['metadata']
        request_headers = Headers(data['request_headers'])
        request_body = data['request_body']
        response_headers = Headers(data['response_headers'])
        response_body = data['response_body']

        if data.get('binary'):
            logger.debug('S3Storage: retrieved binary body')
            response_body = base64.b64decode(response_body)

        url = metadata['response_url']
        status = metadata.get('status')

        logger.debug(
            'S3Storage: response headers {} '.format(response_headers))
        Response = responsetypes.from_args(headers=response_headers,
                                           url=url,
                                           body=response_body)
        logger.debug('S3Storage: response type {} '.format(Response))

        return Response(url=url,
                        headers=response_headers,
                        status=status,
                        body=response_body)
Example 39
    def _response(self, _, driver, spider):
        body = driver.execute_script("return document.documentElement.innerHTML")
        if body.startswith("<head></head>"):  # cannot access response header in Selenium
            body = driver.execute_script("return document.documentElement.textContent")
        url = driver.current_url
        respcls = responsetypes.from_args(url=url, body=body[:100].encode('utf8'))
        resp = respcls(url=url, body=body, encoding="utf-8")

        response_failed = getattr(spider, "response_failed", None)
        if response_failed and callable(response_failed) and response_failed(resp, driver):
            driver.close()
            return defer.fail(Failure())
        else:
            self.queue.put(driver)
            return defer.succeed(resp)
Example 40
    def _response(self, _, driver, spider):
        body = driver.execute_script("return document.documentElement.innerHTML")
        if body.startswith("<head></head>"):  # cannot access response header in Selenium
            body = driver.execute_script("return document.documentElement.textContent")
        url = driver.current_url
        respcls = responsetypes.from_args(url=url, body=body[:100].encode('utf8'))
        resp = respcls(url=url, body=body, encoding="utf-8")

        response_failed = getattr(spider, "response_failed", None)
        if response_failed and callable(response_failed) and response_failed(resp, driver):
            driver.close()
            return defer.fail(Failure())
        else:
            self.queue.put(driver)
            return defer.succeed(resp)
Example 41
 def retrieve_response(self, spider, request):
     """Return response if present in cache, or None otherwise."""
     try:
         doc = json.loads(self.bucket.get(self._inverse_url(request.url))[2])
     except couchbase.exception.MemcachedError:
         # TODO: expiration
         return
     body = doc['response_body']
     url = doc['response_url']
     status = doc['status']
     headers = Headers(headers_raw_to_dict(doc['response_headers']))
     encoding = doc['encoding']
     respcls = responsetypes.from_args(headers=headers, url=url)
     response = respcls(url=url, headers=headers, status=status, body=body,
         encoding=encoding)
     return response
Example 42
 def retrieve_response(self, spider, request):
     """Return response if present in cache, or None otherwise."""
     metadata = self._read_meta(spider, request)
     if metadata is None:
         return  # not cached
     rpath = self._get_request_path(spider, request)
     with open(join(rpath, "response_body"), "rb") as f:
         body = f.read()
     with open(join(rpath, "response_headers"), "rb") as f:
         rawheaders = f.read()
     url = metadata.get("response_url")
     status = metadata["status"]
     headers = Headers(headers_raw_to_dict(rawheaders))
     respcls = responsetypes.from_args(headers=headers, url=url)
     response = respcls(url=url, headers=headers, status=status, body=body)
     return response
Example 43
 def retrieve_response(self, spider, request):
     key = request_fingerprint(request)
     value = self.conn.hget(self.name, key)
     if not value:
         return
     value_arr = value.split(self.separator)
     stored_data = json.loads(value_arr[0])
     metadata = stored_data['metadata']
     body = str(value_arr[2])
     rawheaders = stored_data['response_headers']
     url = str(metadata['response_url'])
     status = str(metadata['status'])
     headers = Headers(rawheaders)
     respcls = responsetypes.from_args(headers=headers, url=url)
     response = respcls(url=url, headers=headers, status=status, body=body)
     return response
Example 44
 def retrieve_response(self, spider, request):
     """Return response if present in cache, or None otherwise."""
     metadata = self._read_meta(spider, request)
     if metadata is None:
         return  # not cached
     rpath = self._get_request_path(spider, request)
     with self._open(os.path.join(rpath, 'response_body'), 'rb') as f:
         body = f.read()
     with self._open(os.path.join(rpath, 'response_headers'), 'rb') as f:
         rawheaders = f.read()
     url = metadata.get('response_url')
     status = metadata['status']
     headers = Headers(headers_raw_to_dict(rawheaders))
     respcls = responsetypes.from_args(headers=headers, url=url)
     response = respcls(url=url, headers=headers, status=status, body=body)
     return response
Example 45
    def process_response(self, request, response, spider):
        if isinstance(response, Response):
            content_encoding = response.headers.getlist('Content-Encoding')
            if content_encoding:
                encoding = content_encoding.pop()
                decoded_body = self._decode(response.body, encoding.lower())
                respcls = responsetypes.from_args(headers=response.headers,
                                                  url=response.url)
                kwargs = dict(cls=respcls, body=decoded_body)
                if issubclass(respcls, TextResponse):
                    # force recalculating the encoding until we make sure the
                    # responsetypes guessing is reliable
                    kwargs['encoding'] = None
                response = response.replace(**kwargs)
                if not content_encoding:
                    del response.headers['Content-Encoding']

        return response
Example 46
    def retrieve_response(self, spider, request):
        """
        Return response if present in cache, or None otherwise.
        """
        key = self._get_request_storage_key(spider, request)

        epoch = request.meta.get('epoch')  # guaranteed to be True or datetime
        s3_key = self._get_s3_key(key, epoch)
        logger.debug('Retrieving response for key {}.'.format(s3_key))

        if not s3_key:
            return

        try:
            data_string = s3_key.get_contents_as_string()
        except boto.exception.S3ResponseError as e:
            # See store_response for error descriptions
            raise e
        finally:
            s3_key.close()

        data = json.loads(data_string)

        metadata = data['metadata']
        response_headers = Headers(data['response_headers'])
        response_body = data['response_body']

        if data.get('binary', False):
            logger.debug('retrieved binary body')
            response_body = base64.b64decode(response_body.decode('utf8'))
            encoding = {}
        else:
            encoding = {'encoding': 'utf8'}
        url = str(metadata['response_url'])
        status = metadata.get('status')
        Response = responsetypes.from_args(headers=response_headers, url=url)

        return Response(url=url,
                        headers=response_headers,
                        status=status,
                        body=response_body,
                        **encoding)
Example 47
def response_requests2scrapy(requests_response, encoding=None):
    if encoding == 'apparent_encoding':
        encoding = requests_response.apparent_encoding
    elif encoding == 'encoding':
        encoding = requests_response.encoding
    elif encoding is None:
        # default: prefer the declared encoding, falling back to the detected one
        encoding = requests_response.encoding or requests_response.apparent_encoding
    # any other value is used as an explicit codec name
    url = requests_response.url.encode(encoding)
    status = requests_response.status_code
    headers = dict(requests_response.headers)
    body = requests_response.content
    request = request_requests2scrapy(requests_response.request)
    respcls = responsetypes.from_args(headers=headers, url=url, body=body)
    scrapy_response = respcls(url, status=status, headers=headers, body=body, request=request)
    return scrapy_response
Example 48
    def retrieve_response(self, spider, request):
        """Return response if present in cache, or None otherwise."""
        key = self._request_key(request)

        data = self.col.find_one({'key': key})
        if not data:  # not cached
            return

        # expiration?
        mtime = data['meta']['timestamp']
        if 0 < self.expiration_secs < time() - float(mtime):
            return  # expired
            
        # retrieve
        body = data['response_body']
        url = str(data.get('url'))
        status = data['meta']['status']
        headers = Headers(data['response_headers'])
        respcls = responsetypes.from_args(headers=headers, url=url)
        response = respcls(url=url, headers=headers, status=status, body=body)
        return response
Example 49
 def download_request(self, request, spider):
     filepath = file_uri_to_path(request.url)
     with open(filepath, 'rb') as f:
         body = f.read()
     respcls = responsetypes.from_args(filename=filepath, body=body)
     return respcls(url=request.url, body=body)
Example 50
 def buildScrapyResponse(self, response, body, url):
     status = response.code
     headers = Headers({k:','.join(v) for k,v in response.headers.getAllRawHeaders()})
     respcls = responsetypes.from_args(headers=headers, url=url)
     return respcls(url=url, status=status, headers=headers, body=body)
Example 51
 def _build_response(self, body, request):
     request.meta['download_latency'] = self.headers_time-self.start_time
     status = int(self.status)
     headers = Headers(self.response_headers)
     respcls = responsetypes.from_args(headers=headers, url=self.url)
     return respcls(url=self.url, status=status, headers=headers, body=body)
Example 52
 def _cb_bodydone(self, result, request, url):
     txresponse, body, flags = result
     status = int(txresponse.code)
     headers = Headers(txresponse.headers.getAllRawHeaders())
     respcls = responsetypes.from_args(headers=headers, url=url)
     return respcls(url=url, status=status, headers=headers, body=body, flags=flags)