コード例 #1
0
    def test_main(self):
        """The decoding recorded most often for a domain is the one returned."""
        domain_name = "abc.com"
        # Number of times each decoding is recorded for the domain;
        # "gbk" has the highest count.
        hit_counts = {"utf-8" : 5, "gbk" : 6, "xxx" : 1}

        for decoding_name, hits in hit_counts.items():
            recorded = 0
            while recorded < hits:
                DomainDecodingCache.inc_domain_decoding(domain_name, decoding_name)
                recorded += 1

        winner = DomainDecodingCache.get_domain_decoding(domain_name)
        self.assertEqual(winner, "gbk")
コード例 #2
0
    def _decode_doc(self, url, message):
        """Decode ``message["doc"]`` and record the encoding that was used.

        The encoding hint passed to the decoder is chosen as follows:

        1. ``"utf-8"`` when ``message["crawl_type"]`` is ``"dynamic"``;
        2. otherwise ``message["encoding"]`` when it is set and its
           ``encoding_created_time`` is younger than the
           ``encoding_expiry_duration`` setting (in seconds);
        3. otherwise whatever ``DomainDecodingCache`` returns for
           ``message["full_domain"]``.

        Args:
            url: URL of the document; forwarded to ``decoder.decode``.
            message: Crawl message dict. Reads ``crawl_type``, ``encoding``,
                ``encoding_created_time``, ``full_domain``, ``headers`` and
                ``doc``. When ``encoding`` is ``None`` it is updated in place
                with the encoding used and the current UTC time.

        Returns:
            The ``(decoded_doc, used_encoding)`` pair returned by
            ``decoder.decode``.
        """
        encoding = None
        if message["crawl_type"] == "dynamic":
            encoding = "utf-8"
        elif message["encoding"] is not None and \
                message["encoding_created_time"] is not None:
            # Only trust the stored encoding while it has not expired.
            age = datetime.datetime.utcnow() - \
                timestamp2datetime(message["encoding_created_time"])
            expiry = datetime.timedelta(
                seconds=self._settings["encoding_expiry_duration"])
            if age < expiry:
                encoding = message["encoding"]

        if encoding is None:
            # No usable per-document hint: fall back to the per-domain cache.
            encoding = DomainDecodingCache.get_domain_decoding(message["full_domain"])

        content_type = message["headers"].get('Content-Type', None)
        decoded_doc, used_encoding = decoder.decode(
            url, {'Content-Type' : content_type}, message["doc"], encoding)

        if message['encoding'] is None:
            message['encoding'] = used_encoding
            # Write the same key the freshness check above reads; the previous
            # spelling 'encoding_create_time' was never read back.
            # NOTE(review): the read path passes this field through
            # timestamp2datetime(), while a datetime object is stored here --
            # confirm which representation downstream consumers expect.
            message['encoding_created_time'] = datetime.datetime.utcnow()

        return decoded_doc, used_encoding