Exemple #1
0
    def test_gunzip_truncated_short(self):
        """Check gunzip() on the 'truncated-crc-error-short.gz' sample.

        The raw sample must be detected as gzip, and the gunzipped body
        must end with the closing ``</html>`` tag and no longer be
        detected as gzip. The fixture name suggests a short, truncated
        stream with a CRC error (assumption based on the file name only).
        """
        # Only the read needs the file handle; release it before asserting.
        with open(join(SAMPLEDIR, 'truncated-crc-error-short.gz'), 'rb') as f:
            r1 = Response("http://www.example.com", body=f.read())
        self.assertTrue(gzip_magic_number(r1))

        r2 = Response("http://www.example.com", body=gunzip(r1.body))
        # Use the unittest assertion rather than a bare ``assert`` so the
        # check is not stripped under ``python -O`` and matches the
        # self.assert* calls used by the rest of this test.
        self.assertTrue(r2.body.endswith(b'</html>'))
        self.assertFalse(gzip_magic_number(r2))
Exemple #2
0
    def test_gunzip_basic(self):
        """Gunzip the 'feed-sample1.xml.gz' sample and check the result.

        The compressed response must be detected as gzip; the decompressed
        one must not be, and its body must be exactly 9950 bytes long.
        """
        sample_path = join(SAMPLEDIR, 'feed-sample1.xml.gz')
        with open(sample_path, 'rb') as sample_file:
            compressed = Response("http://www.example.com", body=sample_file.read())
        self.assertTrue(gzip_magic_number(compressed))

        plain_body = gunzip(compressed.body)
        decompressed = Response("http://www.example.com", body=plain_body)
        self.assertFalse(gzip_magic_number(decompressed))
        self.assertEqual(len(decompressed.body), 9950)
Exemple #3
0
    def _get_sitemap_body(self, response):
        """Return the sitemap body contained in the given response,
        or None if the response is not a sitemap.

        Handled cases, checked in order:

        * an ``XmlResponse``: its body is returned unchanged;
        * a body detected as gzip: the gunzipped body is returned;
        * a URL ending in ``.xml`` or ``.xml.gz``: the body is returned
          unchanged;
        * a URL ending in ``sitemap.txt``: the body is decoded as UTF-8,
          treated as one URL per line and wrapped into a ``<urlset>``
          XML document, which is returned as UTF-8 encoded bytes.

        Raises UnicodeDecodeError if a ``sitemap.txt`` body is not valid
        UTF-8.
        """
        # Imported locally so this method does not depend on a module-level
        # import that may not exist in the rest of the file.
        from xml.sax.saxutils import escape

        if isinstance(response, XmlResponse):
            return response.body
        if gzip_magic_number(response):
            return gunzip(response.body)
        # Actual gzipped sitemap files are decompressed above.
        # If we get here (the body is not gzipped) with a response for
        # .xml.gz, it usually means the body was already gunzipped by the
        # HttpCompression middleware: the HTTP response was sent with
        # "Content-Encoding: gzip" without being a .xml.gz file in the
        # first place, merely XML gzip-compressed on the fly. In other
        # words, here we have plain XML.
        if response.url.endswith(('.xml', '.xml.gz')):
            return response.body
        if response.url.endswith('sitemap.txt'):
            text = response.body.decode("utf-8")
            # splitlines() accepts LF, CRLF and CR line endings; splitting
            # on '\r\n' only would turn an LF-terminated file into a
            # single bogus entry.
            urls = (line.strip() for line in text.splitlines())
            # Skip blank lines (e.g. a trailing newline) and escape XML
            # special characters such as '&' in query strings so the
            # generated document stays well-formed.
            entries = ''.join(
                '<url><loc>' + escape(url) + '</loc></url>'
                for url in urls
                if url
            )
            body = (
                '<?xml version="1.0" encoding="UTF-8"?>'
                '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">'
                + entries
                + '</urlset>'
            )
            return body.encode("utf-8")
        return None
Exemple #4
0
    def _get_sitemap_body(self, response):
        """Return the sitemap body contained in the given response,
        or None if the response is not a sitemap.

        Checked in order: an ``XmlResponse`` (body returned unchanged), a
        body detected as gzip (gunzipped body returned), a URL ending in
        ``.xml`` or ``.xml.gz`` (body returned unchanged). As a last
        resort the body is test-parsed with ``ET.fromstring`` and returned
        unchanged if parsing succeeds.
        """
        if isinstance(response, XmlResponse):
            return response.body
        elif gzip_magic_number(response):
            return gunzip(response.body)
        elif response.url.endswith(('.xml', '.xml.gz')):
            return response.body

        # Best-effort probe: the parsed tree itself is not needed, only
        # whether parsing succeeds. Catch Exception rather than using a
        # bare ``except`` so KeyboardInterrupt / SystemExit still
        # propagate, while any parser failure keeps meaning "not a sitemap".
        # NOTE(review): ET is imported outside this block; if it is
        # xml.etree.ElementTree this could be narrowed to ET.ParseError.
        try:
            ET.fromstring(response.body)
        except Exception:
            return None
        return response.body
Exemple #5
0
 def _get_sitemap_body(self, response):
     """Extract the sitemap payload from ``response``.

     Returns the body as-is for an ``XmlResponse``, the gunzipped body
     when the response is detected as gzip, the body as-is when the URL
     ends in ``.xml`` or ``.xml.gz``, and None in every other case.
     """
     if isinstance(response, XmlResponse):
         return response.body

     # Real gzipped sitemap files are decompressed here.
     if gzip_magic_number(response):
         return gunzip(response.body)

     # Reaching this point with a .xml.gz URL means the body is not
     # gzipped. That usually happens when the HttpCompression middleware
     # already gunzipped it: the server sent "Content-Encoding: gzip" for
     # XML compressed on the fly rather than serving a real .xml.gz file.
     # Either way, what we hold here is plain XML.
     if response.url.endswith(('.xml', '.xml.gz')):
         return response.body

     return None
Exemple #6
0
 def _get_sitemap_body(self, response):
     """Give back the sitemap content carried by ``response``.

     The result is the response body (gunzipped when the response is
     detected as gzip), or None when the response is neither an
     ``XmlResponse``, nor gzip, nor served from a URL ending in ``.xml``
     or ``.xml.gz``.
     """
     if not isinstance(response, XmlResponse):
         # Genuine gzipped sitemap files get decompressed right here.
         if gzip_magic_number(response):
             return gunzip(response.body)

         # A non-gzipped body behind a .xml.gz URL usually means the
         # HttpCompression middleware has already decompressed it: the
         # response came with "Content-Encoding: gzip" for XML that was
         # compressed on the fly, not for an actual .xml.gz file. So the
         # body is plain XML in both the .xml and .xml.gz cases.
         url = response.url
         has_xml_suffix = url.endswith('.xml') or url.endswith('.xml.gz')
         if not has_xml_suffix:
             return None

     return response.body
Exemple #7
0
 def test_is_gzipped_empty(self):
     """A response created without a body must not be detected as gzip."""
     bodiless = Response("http://www.example.com")
     detected = gzip_magic_number(bodiless)
     self.assertFalse(detected)