Example #1
0
    def test_is_gzipped_case_insensitive(self):
        # The gzip content type is expected to be recognised whatever its
        # letter case, with or without a trailing charset parameter.
        mixed_case = Headers({"Content-Type": "Application/X-Gzip"})
        self.assertTrue(
            is_gzipped(Response("http://www.example.com", headers=mixed_case))
        )

        upper_with_charset = Headers(
            {"Content-Type": "application/X-GZIP ; charset=utf-8"}
        )
        self.assertTrue(
            is_gzipped(
                Response("http://www.example.com", headers=upper_with_charset)
            )
        )
    def test_is_gzipped_case_insensitive(self):
        # Each value uses non-lowercase letters in the gzip content type; the
        # second one additionally carries a charset parameter.
        content_types = (
            "Application/X-Gzip",
            "application/X-GZIP ; charset=utf-8",
        )
        for content_type in content_types:
            headers = Headers({"Content-Type": content_type})
            response = Response("http://www.example.com", headers=headers)
            self.assertTrue(is_gzipped(response))
Example #3
0
    def process_response(self, request, response, spider):
        """Decode a content-encoded response body and return the result.

        The response is returned without decoding when the request method is
        HEAD, when ``response`` is not a ``Response`` instance, when it has no
        ``Content-Encoding`` header, or when ``is_gzipped(response)`` is true.
        """
        if request.method == 'HEAD':
            return response
        if not isinstance(response, Response):
            return response

        # URLs ending in ".xml.gz" are forced to look like gzip-encoded XML,
        # overriding whatever headers arrived with the response.
        if response.url.endswith('.xml.gz'):
            response.headers.setlist('Content-Encoding', [b'gzip'])
            response.headers.setlist('Content-Type', [b'application/xml'])

        content_encoding = response.headers.getlist('Content-Encoding')
        if not content_encoding or is_gzipped(response):
            return response

        # Only the last listed encoding is removed and decoded here.
        last_encoding = content_encoding.pop()
        body = self._decode(response.body, last_encoding.lower())
        respcls = responsetypes.from_args(headers=response.headers,
                                          url=response.url)
        replace_args = {'cls': respcls, 'body': body}
        if issubclass(respcls, TextResponse):
            # force recalculating the encoding until we make sure the
            # responsetypes guessing is reliable
            replace_args['encoding'] = None
        response = response.replace(**replace_args)

        # Drop the header once the popped list has no encodings left.
        if not content_encoding:
            del response.headers['Content-Encoding']
        return response
Example #4
0
	def _get_sitemap_body(self, response):
		"""Extract the sitemap payload from ``response``.

		Returns the raw body for ``XmlResponse`` objects and ``.xml`` URLs, the
		gunzipped body for gzipped responses and ``.xml.gz`` URLs, and None
		when none of those cases applies.
		"""
		if isinstance(response, XmlResponse):
			return response.body
		if is_gzipped(response):
			return gunzip(response.body)

		# Neither the response type nor is_gzipped matched: go by the URL suffix.
		sitemap_url = response.url
		if sitemap_url.endswith('.xml'):
			return response.body
		if sitemap_url.endswith('.xml.gz'):
			return gunzip(response.body)
		return None
Example #5
0
File: sitemap.py Project: DT021/wau
 def _get_sitemap_body(self, response):
     """Return the body of a sitemap response, gunzipping it when needed.

     None is returned when the response is not recognised as a sitemap.
     """
     # The response type and is_gzipped take precedence over the URL.
     if isinstance(response, XmlResponse):
         return response.body
     if is_gzipped(response):
         return gunzip(response.body)

     # Last resort: decide from the URL suffix.
     if response.url.endswith('.xml'):
         return response.body
     looks_gzipped = response.url.endswith('.xml.gz')
     return gunzip(response.body) if looks_gzipped else None
Example #6
0
 def _get_sitemap_body(self, response):
     """Return the sitemap body contained in the given response, or None if the
     response is not a sitemap.

     Any exception raised while reading or gunzipping the body is reported
     through ``self.log`` and None is returned instead of propagating it.
     """
     try:
         if isinstance(response, XmlResponse):
             return response.body
         elif is_gzipped(response):
             return gunzip(response.body)
         elif response.url.endswith('.xml'):
             return response.body
         elif response.url.endswith('.xml.gz'):
             return gunzip(response.body)
     except Exception as e:
         # "except Exception, e" is Python-2-only syntax and a SyntaxError on
         # Python 3; the "as" form works on Python 2.6+ and 3.x alike.
         # The broad catch is kept as a deliberate best-effort: a sitemap that
         # cannot be read is logged rather than raised.
         self.log("Error %s ungzip %s" % (response.url, e))
     # Not a sitemap, or reading the body failed.
     return None
Example #7
0
    def process_response(self, request, response, spider):
        """Return ``response`` with its last content encoding decoded.

        Objects that are not ``Response`` instances, responses without a
        ``Content-Encoding`` header and responses for which ``is_gzipped`` is
        true are returned unchanged.
        """
        if not isinstance(response, Response):
            return response

        encodings = response.headers.getlist('Content-Encoding')
        if not encodings or is_gzipped(response):
            return response

        # Take the last listed encoding off the list and decode the body with it.
        current = encodings.pop()
        new_body = self._decode(response.body, current.lower())
        respcls = responsetypes.from_args(
            headers=response.headers,
            url=response.url,
        )
        replacement = {'cls': respcls, 'body': new_body}
        if issubclass(respcls, TextResponse):
            # force recalculating the encoding until we make sure the
            # responsetypes guessing is reliable
            replacement['encoding'] = None
        response = response.replace(**replacement)

        # No encodings left in the popped list: remove the header entirely.
        if not encodings:
            del response.headers['Content-Encoding']
        return response
 def test_is_x_gzipped_right(self):
     # A plain "application/x-gzip" content type is expected to be gzipped.
     response = Response(
         "http://www.example.com",
         headers=Headers({"Content-Type": "application/x-gzip"}),
     )
     self.assertTrue(is_gzipped(response))
 def test_is_gzipped_empty(self):
     # A response built without any headers is expected not to be gzipped.
     bare_response = Response("http://www.example.com")
     gzipped = is_gzipped(bare_response)
     self.assertFalse(gzipped)
Example #10
0
 def test_is_gzipped_with_charset(self):
     # A charset parameter after the content type is expected not to
     # prevent detection.
     content_type = "application/x-gzip;charset=utf-8"
     response = Response(
         "http://www.example.com",
         headers=Headers({"Content-Type": content_type}),
     )
     self.assertTrue(is_gzipped(response))
Example #11
0
 def test_is_gzipped_wrong(self):
     # An unrelated content type is expected not to count as gzipped.
     javascript_headers = Headers({"Content-Type": "application/javascript"})
     response = Response("http://www.example.com", headers=javascript_headers)
     gzipped = is_gzipped(response)
     self.assertFalse(gzipped)
Example #12
0
 def test_is_gzipped_empty(self):
     # No headers are passed, so there is no Content-Type to inspect.
     self.assertFalse(is_gzipped(Response("http://www.example.com")))
 def test_is_gzipped_wrong(self):
     # "application/javascript" is not a gzip content type.
     response = Response(
         "http://www.example.com",
         headers=Headers({"Content-Type": "application/javascript"}),
     )
     self.assertFalse(is_gzipped(response))
Example #14
0
 def test_is_gzipped_not_quite(self):
     # A content type that merely starts with "application/gzip" is expected
     # not to match.
     near_miss = Headers({"Content-Type": "application/gzippppp"})
     response = Response("http://www.example.com", headers=near_miss)
     gzipped = is_gzipped(response)
     self.assertFalse(gzipped)
Example #15
0
 def test_is_x_gzipped_right(self):
     # The "x-" prefixed gzip content type is expected to be detected.
     x_gzip_headers = Headers({"Content-Type": "application/x-gzip"})
     gzipped = is_gzipped(
         Response("http://www.example.com", headers=x_gzip_headers)
     )
     self.assertTrue(gzipped)
 def test_is_gzipped_not_quite(self):
     # Extra trailing characters after "gzip" make this a different type.
     response = Response(
         "http://www.example.com",
         headers=Headers({"Content-Type": "application/gzippppp"}),
     )
     self.assertFalse(is_gzipped(response))
 def test_is_gzipped_with_charset(self):
     # The content type is immediately followed by ";charset=utf-8".
     charset_headers = Headers(
         {"Content-Type": "application/x-gzip;charset=utf-8"}
     )
     gzipped = is_gzipped(
         Response("http://www.example.com", headers=charset_headers)
     )
     self.assertTrue(gzipped)