Ejemplo n.º 1
0
 def _auto_detect_fun(self, text):
     """Return the resolved name of the first encoding able to decode *text*.

     The candidates are tried in order: ``self._DEFAULT_ENCODING``, then
     ``'utf-8'``, then ``'cp1252'``.  The first one for which
     ``text.decode`` does not raise ``UnicodeError`` is passed through
     ``resolve_encoding`` and returned.  If every candidate fails the
     method returns ``None``.
     """
     candidates = (self._DEFAULT_ENCODING, 'utf-8', 'cp1252')
     for candidate in candidates:
         try:
             text.decode(candidate)
         except UnicodeError:
             # This candidate cannot decode the payload; move on to the next.
             pass
         else:
             return resolve_encoding(candidate)
     return None
Ejemplo n.º 2
0
 def _auto_detect_fun(self, text):
     """Pick an encoding for *text* by trial decoding.

     Tries ``self._DEFAULT_ENCODING``, ``'utf-8'`` and ``'cp1252'`` in that
     order and returns ``resolve_encoding(name)`` for the first name whose
     ``text.decode(name)`` call succeeds.  Returns ``None`` when all three
     raise ``UnicodeError``.
     """
     trial_order = (self._DEFAULT_ENCODING, 'utf-8', 'cp1252')
     for name in trial_order:
         try:
             text.decode(name)
         except UnicodeError:
             # Decoding failed under this name; fall through to the next one.
             pass
         else:
             return resolve_encoding(name)
     return None
Ejemplo n.º 3
0
 def _auto_detect_fun(self, text):
     """Return the resolved name of the first encoding able to decode *text*.

     Candidates, in order: ``self._DEFAULT_ENCODING``, ``'utf-8'``,
     ``'cp1252'``.  A candidate is accepted when ``text.decode`` does not
     raise ``UnicodeError``; the accepted name is returned after being
     passed through ``resolve_encoding``.  Returns ``None`` if no candidate
     is accepted.
     """
     candidates = (self._DEFAULT_ENCODING, 'utf-8', 'cp1252')
     for candidate in candidates:
         try:
             text.decode(candidate)
         except UnicodeError:
             # Not decodable with this candidate; try the next one.
             pass
         else:
             # NOTE(review): an earlier TODO here questioned why the returned
             # name can differ from the one that decoded successfully --
             # presumably resolve_encoding maps aliases to another codec
             # name; confirm against its documentation.
             return resolve_encoding(candidate)
     return None
Ejemplo n.º 4
0
def _detect_encoding(bytestring, default_encoding='utf-8'):
    """Guess the encoding of *bytestring* with chardet.

    Args:
        bytestring: Raw bytes to inspect.
        default_encoding: Name returned when no usable encoding is found.

    Returns:
        The value ``resolve_encoding`` produces for chardet's guess, or
        *default_encoding* when chardet reports no encoding or when
        ``resolve_encoding`` cannot resolve the guessed name.
    """
    # NOTE: alternatively `UnicodeDammit(x).originalEncoding`
    # NOTE: alternatively use scrapy.http.TextResponse().text
    guessed = chardet.detect(bytestring).get('encoding')
    if not guessed:
        return default_encoding
    # resolve_encoding returns None for names it does not recognise; fall
    # back to the default rather than handing None to callers that expect
    # an encoding name.
    return resolve_encoding(guessed) or default_encoding
Ejemplo n.º 5
0
    def _auto_detect_fun(self, text):
        """Detect an encoding for *text*, falling back to chardet.

        First tries ``self._DEFAULT_ENCODING``, ``'utf-8'``, ``'ascii'`` and
        ``'GB18030'`` in that order and returns ``resolve_encoding(name)``
        for the first one that decodes *text* without ``UnicodeError``.

        If none of them works, asks ``chardet.detect`` for a guess and
        returns ``resolve_encoding(guess)`` only when the guess actually
        decodes *text*.

        Returns:
            The resolved encoding name, or ``None`` when no candidate and no
            chardet guess can decode *text*.
        """
        for enc in (self._DEFAULT_ENCODING, 'utf-8', 'ascii', 'GB18030'):
            try:
                text.decode(enc)
            except UnicodeError:
                continue
            return resolve_encoding(enc)

        # Fallback: let chardet guess the encoding.
        detected = chardet.detect(text)
        if detected is None:
            return None
        enc = detected.get('encoding')
        if enc is None:
            return None
        try:
            text.decode(enc)
        except (UnicodeError, LookupError):
            # UnicodeError: the guessed encoding cannot decode the payload.
            # LookupError: Python has no codec registered under that name.
            # Anything else (e.g. KeyboardInterrupt) is allowed to propagate
            # instead of being silently swallowed by a bare ``except``.
            return None
        return resolve_encoding(enc)
    def test_process_response_no_content_type_header(self):
        """Process a response that has no Content-Type header.

        The response only carries ``Content-Encoding: identity``; its body
        contains a ``<meta>`` tag declaring ``charset=gb2312``.  The test
        checks that the processed response is an instance of the class chosen
        by ``responsetypes.from_args``, that the body is unchanged, and that
        the encoding equals ``resolve_encoding('gb2312')``.
        """
        page_url = "http://www.example.com/index"
        raw_html = b"""<html><head><title>Some page</title><meta http-equiv="Content-Type" content="text/html; charset=gb2312">"""
        hdrs = {'Content-Encoding': 'identity'}

        # Let responsetypes pick the response class for this url/headers/body.
        expected_cls = responsetypes.from_args(url=page_url, headers=hdrs, body=raw_html)
        original = expected_cls(page_url, headers=hdrs, body=raw_html)

        processed = self.mw.process_response(Request(page_url), original, self.spider)

        assert isinstance(processed, expected_cls)
        self.assertEqual(processed.body, raw_html)
        self.assertEqual(processed.encoding, resolve_encoding('gb2312'))
Ejemplo n.º 7
0
    def test_process_response_no_content_type_header(self):
        """Process a response that has no Content-Type header.

        The only header is ``Content-Encoding: identity`` and the body holds
        a ``<meta>`` tag declaring ``charset=gb2312``.  Asserts that the
        processed response keeps the class selected by
        ``responsetypes.from_args``, keeps the same body, and reports
        ``resolve_encoding('gb2312')`` as its encoding.
        """
        target_url = "http://www.example.com/index"
        html_bytes = b"""<html><head><title>Some page</title><meta http-equiv="Content-Type" content="text/html; charset=gb2312">"""
        header_map = {'Content-Encoding': 'identity'}

        # The response class is derived from the url, headers and body.
        response_cls = responsetypes.from_args(url=target_url, headers=header_map, body=html_bytes)
        incoming = response_cls(target_url, headers=header_map, body=html_bytes)

        outgoing = self.mw.process_response(Request(target_url), incoming, self.spider)

        assert isinstance(outgoing, response_cls)
        self.assertEqual(outgoing.body, html_bytes)
        self.assertEqual(outgoing.encoding, resolve_encoding('gb2312'))
    def test_process_response_force_recalculate_encoding(self):
        """Process a gzip-compressed HtmlResponse with a charset in its body.

        The HTML body declares ``charset=gb2312`` in a ``<meta>`` tag and is
        gzip-compressed before being wrapped in an ``HtmlResponse``.  The
        test checks that the processed response is an ``HtmlResponse`` whose
        body is the uncompressed HTML and whose encoding equals
        ``resolve_encoding('gb2312')``.
        """
        html = b"""<html><head><title>Some page</title><meta http-equiv="Content-Type" content="text/html; charset=gb2312">"""

        # Gzip the plain HTML into an in-memory buffer.
        buf = BytesIO()
        gz = GzipFile(fileobj=buf, mode='wb')
        gz.write(html)
        gz.close()

        hdrs = {'Content-Type': 'text/html', 'Content-Encoding': 'gzip'}
        # NOTE(review): the URL below contains ';' where ':' would be
        # expected -- possibly a typo; kept unchanged here.
        gzipped = HtmlResponse("http;//www.example.com/page.html", headers=hdrs, body=buf.getvalue())

        processed = self.mw.process_response(Request("http://www.example.com/"), gzipped, self.spider)

        assert isinstance(processed, HtmlResponse)
        self.assertEqual(processed.body, html)
        self.assertEqual(processed.encoding, resolve_encoding('gb2312'))
Ejemplo n.º 9
0
    def test_process_response_force_recalculate_encoding(self):
        """Process a gzip-compressed HtmlResponse with a charset in its body.

        A gzip-compressed HTML page whose ``<meta>`` tag declares
        ``charset=gb2312`` is sent through ``self.mw.process_response``.  The
        result must be an ``HtmlResponse`` carrying the uncompressed body and
        the encoding ``resolve_encoding('gb2312')``.
        """
        page_bytes = b"""<html><head><title>Some page</title><meta http-equiv="Content-Type" content="text/html; charset=gb2312">"""

        # Build the compressed payload in memory.
        sink = BytesIO()
        compressor = GzipFile(fileobj=sink, mode='wb')
        compressor.write(page_bytes)
        compressor.close()

        header_map = {'Content-Type': 'text/html', 'Content-Encoding': 'gzip'}
        # NOTE(review): the URL below contains ';' where ':' would be
        # expected -- possibly a typo; kept unchanged here.
        incoming = HtmlResponse("http;//www.example.com/page.html", headers=header_map, body=sink.getvalue())

        outgoing = self.mw.process_response(Request("http://www.example.com/"), incoming, self.spider)

        assert isinstance(outgoing, HtmlResponse)
        self.assertEqual(outgoing.body, page_bytes)
        self.assertEqual(outgoing.encoding, resolve_encoding('gb2312'))
Ejemplo n.º 10
0
    def test_process_response_encoding_inside_body(self):
        """Process a gzip-compressed plain Response with a charset in its body.

        A gzip-compressed HTML page whose ``<meta>`` tag declares
        ``charset=gb2312`` is wrapped in a plain ``Response`` and sent through
        ``self.mw.process_response``.  The test checks that the result is an
        ``HtmlResponse`` with the uncompressed body and the encoding
        ``resolve_encoding('gb2312')``, and that the
        ``httpcompression/response_count`` and
        ``httpcompression/response_bytes`` stats are 1 and 104.
        """
        html = (
            b'<html><head><title>Some page</title>'
            b'<meta http-equiv="Content-Type" content="text/html; charset=gb2312">'
        )

        # Gzip the plain HTML into an in-memory buffer.
        buf = BytesIO()
        gz = GzipFile(fileobj=buf, mode='wb')
        gz.write(html)
        gz.close()

        hdrs = {'Content-Type': 'text/html', 'Content-Encoding': 'gzip'}
        # NOTE(review): the URL below contains ';' where ':' would be
        # expected -- possibly a typo; kept unchanged here.
        gzipped = Response("http;//www.example.com/", headers=hdrs, body=buf.getvalue())

        processed = self.mw.process_response(Request("http://www.example.com/"), gzipped, self.spider)

        assert isinstance(processed, HtmlResponse)
        self.assertEqual(processed.body, html)
        self.assertEqual(processed.encoding, resolve_encoding('gb2312'))
        self.assertStatsEqual('httpcompression/response_count', 1)
        self.assertStatsEqual('httpcompression/response_bytes', 104)
Ejemplo n.º 11
0
 def _assert_response_encoding(self, response, encoding):
     """Assert that ``response.encoding`` equals ``resolve_encoding(encoding)``."""
     actual = response.encoding
     expected = resolve_encoding(encoding)
     self.assertEqual(actual, expected)
Ejemplo n.º 12
0
 def _assert_response_encoding(self, response, encoding):
     """Check the response's encoding against the resolved form of *encoding*.

     *encoding* is passed through ``resolve_encoding`` before the comparison,
     so callers may supply any name that function accepts.
     """
     reported = response.encoding
     wanted = resolve_encoding(encoding)
     self.assertEqual(reported, wanted)
Ejemplo n.º 13
0
 def test_resolve_encoding(self):
     """Check ``resolve_encoding`` against a table of (label, expected) pairs.

     Covers an alias, an alias with leading whitespace and mixed case, a
     GB2312 label, and a label that is expected to resolve to ``None``.
     """
     cases = (
         ('latin1', 'cp1252'),
         (' Latin-1', 'cp1252'),
         ('gb_2312-80', 'gb18030'),
         ('unknown encoding', None),
     )
     for label, expected in cases:
         self.assertEqual(resolve_encoding(label), expected)
Ejemplo n.º 14
0
 def test_resolve_encoding(self):
     """Verify the names ``resolve_encoding`` returns for known inputs.

     Each pair below is (input label, expected result); the last entry
     expects ``None`` for an unrecognised label.
     """
     expectations = (
         ('latin1', 'cp1252'),
         (' Latin-1', 'cp1252'),
         ('gb_2312-80', 'gb18030'),
         ('unknown encoding', None),
     )
     for given, wanted in expectations:
         self.assertEqual(resolve_encoding(given), wanted)
Ejemplo n.º 15
0
 def test_resolve_encoding(self):
     """Check ``resolve_encoding`` against a table of (label, expected) pairs.

     Covers an alias, an alias with leading whitespace and mixed case, a
     GB2312 label, and a label that is expected to resolve to ``None``.
     """
     cases = (
         ("latin1", "cp1252"),
         (" Latin-1", "cp1252"),
         ("gb_2312-80", "gb18030"),
         ("unknown encoding", None),
     )
     for label, expected in cases:
         self.assertEqual(resolve_encoding(label), expected)