def test_main(self):
    """Check which decoding the cache reports after a series of increments.

    Registers 5 hits for "utf-8", 6 for "gbk" and 1 for "xxx" against a
    single domain through ``DomainDecodingCache.inc_domain_decoding`` and
    asserts that ``DomainDecodingCache.get_domain_decoding`` then returns
    "gbk", the decoding with the most increments in this fixture.
    """
    site = "abc.com"
    hits_per_decoding = {"utf-8" : 5, "gbk" : 6, "xxx" : 1}

    # Expand the per-decoding counts into one flat stream of increments so
    # the cache is driven by a single loop.
    increments = (
        name
        for name, hits in hits_per_decoding.items()
        for _ in range(hits)
    )
    for name in increments:
        DomainDecodingCache.inc_domain_decoding(site, name)

    self.assertEqual(DomainDecodingCache.get_domain_decoding(site), "gbk")
def _decode_doc(self, url, message):
    """Decode ``message["doc"]`` and record the encoding that was used.

    The encoding hint passed to ``decoder.decode`` is chosen as follows:

    1. "utf-8" when ``message["crawl_type"]`` is "dynamic";
    2. otherwise ``message["encoding"]``, if it is set and its
       ``message["encoding_created_time"]`` is younger than
       ``self._settings["encoding_expiry_duration"]`` seconds;
    3. otherwise whatever ``DomainDecodingCache.get_domain_decoding``
       returns for ``message["full_domain"]``.

    Side effect: when ``message["encoding"]`` is None on entry, the
    message is updated in place with the encoding reported by the decoder
    and the current UTC time under ``"encoding_created_time"``.

    Args:
        url: URL of the document, forwarded to ``decoder.decode``.
        message: crawl message dict; reads "crawl_type", "encoding",
            "encoding_created_time", "full_domain", "headers" and "doc".

    Returns:
        The ``(decoded_doc, used_encoding)`` pair returned by
        ``decoder.decode``.
    """
    encoding = None
    if message["crawl_type"] == "dynamic":
        encoding = "utf-8"
    elif message["encoding"] is not None and \
            message["encoding_created_time"] is not None:
        # Reuse the previously stored encoding only while it is fresh.
        age = datetime.datetime.utcnow() - \
            timestamp2datetime(message["encoding_created_time"])
        max_age = datetime.timedelta(
            seconds=self._settings["encoding_expiry_duration"])
        if age < max_age:
            encoding = message["encoding"]

    if encoding is None:
        # No usable per-document hint: fall back to the per-domain cache.
        encoding = DomainDecodingCache.get_domain_decoding(
            message["full_domain"])

    content_type = message["headers"].get('Content-Type', None)
    decoded_doc, used_encoding = decoder.decode(
        url, {'Content-Type' : content_type}, message["doc"], encoding)

    if message['encoding'] is None:
        message['encoding'] = used_encoding
        # Store under the same key the freshness check above reads; the
        # previous 'encoding_create_time' spelling was never read back.
        # NOTE(review): the read side passes this field through
        # timestamp2datetime while a datetime is stored here -- confirm
        # the persistence layer converts it before the next read.
        message['encoding_created_time'] = datetime.datetime.utcnow()

    return decoded_doc, used_encoding