Example #1
0
 def test_from_content_type(self):
     """Check the response class chosen for each Content-Type header value."""
     expectations = (
         ("text/html; charset=UTF-8", HtmlResponse),
         ("text/xml; charset=UTF-8", XmlResponse),
         ("application/xhtml+xml; charset=UTF-8", HtmlResponse),
         ("application/xml; charset=UTF-8", XmlResponse),
         ("application/octet-stream", Response),
     )
     for header_value, expected in expectations:
         resolved = responsetypes.from_content_type(header_value)
         # Identity check: the lookup must return the exact class object.
         assert resolved is expected, "%s ==> %s != %s" % (
             header_value,
             resolved,
             expected,
         )
Example #2
0
 def test_from_content_type(self):
     """Check the response class chosen for each Content-Type header value."""
     # Keys are unique header values; dict order keeps the original sequence.
     expected_by_header = {
         "text/html; charset=UTF-8": HtmlResponse,
         "text/xml; charset=UTF-8": XmlResponse,
         "application/xhtml+xml; charset=UTF-8": HtmlResponse,
         "application/vnd.wap.xhtml+xml; charset=utf-8": HtmlResponse,
         "application/xml; charset=UTF-8": XmlResponse,
         "application/octet-stream": Response,
         "application/x-json; encoding=UTF8;charset=UTF-8": TextResponse,
     }
     for content_type, wanted in expected_by_header.items():
         got = responsetypes.from_content_type(content_type)
         assert got is wanted, "%s ==> %s != %s" % (content_type, got, wanted)
Example #3
0
 def test_from_content_type(self):
     """Check the response class chosen for each Content-Type header value."""
     cases = (
         ("text/html; charset=UTF-8", HtmlResponse),
         ("text/xml; charset=UTF-8", XmlResponse),
         ("application/xhtml+xml; charset=UTF-8", HtmlResponse),
         ("application/vnd.wap.xhtml+xml; charset=utf-8", HtmlResponse),
         ("application/xml; charset=UTF-8", XmlResponse),
         ("application/octet-stream", Response),
         ("application/x-json; encoding=UTF8;charset=UTF-8", TextResponse),
     )
     for case in cases:
         header, expected_cls = case
         actual_cls = responsetypes.from_content_type(header)
         # The failure message names the header, the class returned and the class expected.
         assert actual_cls is expected_cls, "{0!s} ==> {1!s} != {2!s}".format(
             header, actual_cls, expected_cls
         )
Example #4
0
 def test_from_content_type(self):
     """Check the response class chosen for each Content-Type header value."""
     # Keys are unique header values; dict order keeps the original sequence.
     expected_classes = {
         "text/html; charset=UTF-8": HtmlResponse,
         "text/xml; charset=UTF-8": XmlResponse,
         "application/xhtml+xml; charset=UTF-8": HtmlResponse,
         "application/vnd.wap.xhtml+xml; charset=utf-8": HtmlResponse,
         "application/xml; charset=UTF-8": XmlResponse,
         "application/octet-stream": Response,
         "application/x-json; encoding=UTF8;charset=UTF-8": TextResponse,
         "application/json-amazonui-streaming;charset=UTF-8": TextResponse,
     }
     for source in expected_classes:
         cls = expected_classes[source]
         retcls = responsetypes.from_content_type(source)
         assert retcls is cls, f"{source} ==> {retcls} != {cls}"
Example #5
0
 def test_from_content_type(self):
     """Check the response class chosen for each Content-Type header value."""
     header_to_class = (
         ("text/html; charset=UTF-8", HtmlResponse),
         ("text/xml; charset=UTF-8", XmlResponse),
         ("application/xhtml+xml; charset=UTF-8", HtmlResponse),
         ("application/vnd.wap.xhtml+xml; charset=utf-8", HtmlResponse),
         ("application/xml; charset=UTF-8", XmlResponse),
         ("application/octet-stream", Response),
         ("application/x-json; encoding=UTF8;charset=UTF-8", TextResponse),
         ("application/json-amazonui-streaming;charset=UTF-8", TextResponse),
     )
     for raw_header, target_cls in header_to_class:
         found_cls = responsetypes.from_content_type(raw_header)
         assert found_cls is target_cls, "%s ==> %s != %s" % (
             raw_header,
             found_cls,
             target_cls,
         )
Example #6
0
    def on_headers_received(self, response, request, spider):
        """Decide whether to cancel a download once its headers have arrived.

        Size limits come from the spider attributes ``download_maxsize`` and
        ``download_warnsize`` and can be overridden per request through
        ``request.meta``. The expected body size is read from
        ``response.meta['expected_size']``; when that key is absent the size
        checks are skipped instead of comparing ``None`` with an integer.

        Args:
            response: object exposing ``meta``, ``status`` and ``headers``.
            request: object exposing ``meta`` and ``url``.
            spider: object optionally carrying the size-limit attributes.

        Returns:
            ``True`` to cancel the download, ``False`` to let it continue.
        """
        maxsize = getattr(spider, 'download_maxsize', self._default_maxsize)
        maxsize = request.meta.get('download_maxsize', maxsize)

        # NOTE(review): the warn threshold falls back to _default_maxsize, so
        # by default the warning below can never fire before the cancel does.
        # Confirm whether a dedicated warn-size default was intended.
        warnsize = getattr(spider, 'download_warnsize', self._default_maxsize)
        warnsize = request.meta.get('download_warnsize', warnsize)

        expected_size = response.meta.get('expected_size')
        # Ordering comparisons against None raise TypeError on Python 3, so
        # only apply the size limits when a size is actually known.
        size_known = expected_size is not None

        # cancel if expected_size is above maxsize
        if size_known and maxsize and expected_size > maxsize:
            logger.info("Cancelling download of %(url)s: expected response "
                        "size (%(size)s) larger than "
                        "download max size (%(maxsize)s).",
                        {'url': request.url, 'size': expected_size,
                         'maxsize': maxsize})
            return True

        if size_known and warnsize and expected_size > warnsize:
            logger.info("Expected response size (%(size)s) larger than "
                        "download warn size (%(warnsize)s).",
                        {'size': expected_size, 'warnsize': warnsize})

        # don't cancel if non-200 request
        if not (200 <= response.status < 300):
            logger.info('response code not between 200 and 300 %s',
                        response.status)
            return False

        # don't cancel if robots.txt request
        if 'robots.txt' in request.url:
            logger.info('robots.txt request')
            return False

        # cancel if response is not HTML
        if b'Content-Type' in response.headers:
            cls = responsetypes.from_content_type(response.headers[b'Content-Type'])
            return not issubclass(cls, HtmlResponse)

        # else don't cancel
        return False