def check_url_exists(cls, ori_url):
    """Register *ori_url* in the dedup set if unseen.

    Returns a (existed, alive) pair:
      - (False, True) when the url was newly registered (nx set succeeded);
      - (True, alive) otherwise, where *alive* is True iff the stored
        dedup flag equals "1".
    """
    url = copy.deepcopy(ori_url)
    # nx=True: only writes when the key is absent, so the result tells
    # us whether this url has been seen before.
    newly_registered = common_settings.cache_client().set(
        "url_dedup", url, data = "1", nx = True)
    if newly_registered:
        return False, True
    alive_flag = common_settings.cache_client().get("url_dedup", url)
    return True, alive_flag == "1"
def get_domain_decoding(cls, domain):
    """Return the most frequently observed decoding for *domain*.

    Returns None when no decoding counters are stored for the domain.
    """
    decoding_counts = common_settings.cache_client().get(
        DomainDecodingCache._data_key, domain)
    if not decoding_counts:
        # Either never stored (None) or an empty counter map.
        return None
    # Counters are stored as strings; compare them numerically and
    # keep the decoding with the highest count.
    best_decoding, _ = max(decoding_counts.items(),
                           key = lambda item : int(item[1]))
    return best_decoding
def _update_url_info(cls, url, update_map, inc_map, cond=None, with_get=False, fields=None):
    """Apply *update_map*/*inc_map* to the cached url entry.

    When the update marks the url "failed" or "notAlive" the cache entry
    is evicted and the dedup flag is reset to "0" so the url reads as
    not-alive afterwards.

    Fixes over the original:
      - `update_map.has_key(...)` replaced by the `in`-based idiom
        (`has_key` was removed in Python 3; `.get()` works in both).
      - mutable default argument `fields=[]` replaced by the None
        sentinel (backward-compatible: None maps to an empty list).
    """
    if fields is None:
        fields = []
    # None is never a valid crawl_status, so .get() safely doubles as
    # the presence check the original did with has_key().
    deleting = update_map.get("crawl_status") in ("failed", "notAlive")
    if deleting:
        # Nothing to write: the entry is about to be removed.
        cache_update_map = {}
        cache_inc_map = {}
    else:
        # Only persist the fields the url cache actually tracks.
        cache_update_map = misc.clone_dict(update_map, UrlCacheClient._fields, soft=True)
        cache_inc_map = misc.clone_dict(inc_map, UrlCacheClient._fields, soft=True)
    ret_value = common_settings.cache_client().set("url", url,
        update_map = cache_update_map, inc_map = cache_inc_map,
        cond = cond, with_get = with_get, fields = fields)
    if deleting:
        # Evict the entry and flip the dedup flag to "not alive".
        common_settings.cache_client().delete("url", url)
        common_settings.cache_client().set("url_dedup", url, data = "0")
    return ret_value
def test_hash(self):
    """Exercise the redis/hash data type end to end: typed field
    converters, updates, increments, with_get, strict/non-strict reads,
    single-field deletion and conditional sets.

    NOTE(review): the assertion chain is strictly order-dependent —
    each step builds on the redis state left by the previous one — so
    the code is left untouched and only commented.
    """
    common_settings.redis_cache_config["validation_enabled"] = False
    # Field converters: "a" -> int, "e" -> long, "f" -> bool encoded as
    # the strings "True"/"False" (decode/encode lambda pair); "b", "c"
    # and "d" remain plain strings.
    common_settings.redis_cache_config["data_types"]["hash_type"] = {"content_type" : "redis/hash", "fields" : [("a", int), "b", "c", ("e", long), "d", ("f", lambda v : v == "True", lambda v : "True" if v else "False")]}
    common_settings.load_cache_client(True)
    client = common_settings.cache_client()
    client.delete("hash_type", "abc")
    data = {"a" : 1, "b" : "xyz"}
    self.assertTrue(client.set("hash_type", "abc", update_map=data))
    # Reads honour the requested field list and apply converter types.
    self.assertEqual({"b" : "xyz", "a" : 1}, client.get("hash_type", "abc", fields = ["b", "a"]))
    self.assertEqual({"b" : "xyz"}, client.get("hash_type", "abc", fields = ["b"]))
    # Backslash-escaped string values must round-trip unchanged.
    self.assertTrue(client.set("hash_type", "abc", update_map={"c" : "True", "b" : "\\None", "d" : "\\\\None"}))
    self.assertEqual({"a" : 1, "c" : "True", "b": "\\None", "d" : "\\\\None"}, client.get("hash_type", "abc", fields = ["a", "c", "b", "d"]))
    # update_map and inc_map can be combined in a single set() call.
    self.assertTrue(client.set("hash_type", "abc", update_map={"b" : None, "f" : 1}, inc_map={"a" : 1, "e" : 2}))
    self.assertEqual({"f" : True, "b" : None, "a" : 2, "e" : 2}, client.get("hash_type", "abc", fields = ["b", "a", "e", "f"]))
    # with_get=True returns the field values *before* the increment.
    self.assertEqual({"a" : 2}, client.set("hash_type", "abc", inc_map={"a" : 3}, with_get=True, fields=["a"]))
    self.assertEqual({"a" : 5}, client.set("hash_type", "abc", inc_map={"a" : 3}, with_get=True, fields=["a"]))
    self.assertEqual({"a" : 8}, client.get("hash_type", "abc", fields=["a"]))
    # Deleting one field: non-strict reads return None for it, strict
    # reads fail for any missing field.
    client.delete("hash_type", "abc", "b")
    self.assertEqual({"a" : 8, "b" : None}, client.get("hash_type", "abc", fields = ["a", "b"], strict=False))
    self.assertEqual(None, client.get("hash_type", "abc", fields=["a", "not_existed"], strict=True))
    # Incrementing a non-numeric field must raise.
    self._test_fail(lambda _ : client.set("hash_type", "abc", inc_map = {"c" : 1}))
    # Conditional sets: the write only applies when cond["func"]
    # evaluates truthy on the fetched cond["fields"] values.
    self.assertEqual({"a" : 8}, client.set("hash_type", "abc", inc_map = {"a" : 1}, with_get=True, fields=["a"], cond={"fields" : ["f"], "func" : lambda r : r["f"]}))
    self.assertEqual({"a" : 9}, client.get("hash_type", 
    "abc", fields=["a"]))
    self.assertEqual(False, client.set("hash_type", "abc", inc_map = {"a" : 1}, with_get=True, fields=["a"], cond={"fields" : ["f"], "func" : lambda r : not r["f"]}))
    self.assertEqual({"a" : 9}, client.get("hash_type", "abc", fields=["a"]))
    # Same condition semantics with with_get=False: returns a boolean.
    self.assertEqual(False, client.set("hash_type", "abc", inc_map = {"a" : 1}, with_get=False, fields=["a"], cond={"fields" : ["f"], "func" : lambda r : not r["f"]}))
    self.assertEqual(True, client.set("hash_type", "abc", inc_map = {"a" : 1}, with_get=False, fields=["a"], cond={"fields" : ["f"], "func" : lambda r : r["f"]}))
    self.assertEqual({"a" : 10}, client.get("hash_type", "abc", fields=["a"]))
def test_not_existed_type(self):
    """Both set() and get() must fail for an unregistered data type."""
    common_settings.redis_cache_config["validation_enabled"] = False
    common_settings.load_cache_client(True)
    client = common_settings.cache_client()
    data = {"a" : 1, "b" : "xyz"}
    # "binaryxx" was never declared in redis_cache_config["data_types"].
    self._test_fail(lambda _ : client.set("binaryxx", "abc", data=data))
    self._test_fail(lambda _ : client.get("binaryxx", "abc"))
def test_validation(self):
    """A json payload round-trips with validation_enabled switched on."""
    common_settings.redis_cache_config["validation_enabled"] = True
    common_settings.redis_cache_config["data_types"]["json_type"] = {"content_type" : "text/json"}
    common_settings.load_cache_client(True)
    client = common_settings.cache_client()
    payload = {"a" : 1, "b" : "xyz"}
    client.set("json_type", "abc", data=payload)
    self.assertEqual(payload, client.get("json_type", "abc"))
def test_main(self):
    """End-to-end walk through the UrlCacheClient API: dedup check,
    info updates, status-conditional reads/writes, find-and-modify
    variants, and eviction on "notAlive".

    NOTE(review): strictly order-dependent — every assertion relies on
    the cache state produced by the previous ones — so the code is left
    untouched and only commented.
    """
    url = ["http://www.xyz.com/xyz", "http://www.xyz.com/xyz"]
    # Start from a clean slate for both the dedup set and the url hash.
    for u in url:
        common_settings.cache_client().delete("url_dedup", u)
        common_settings.cache_client().delete("url", u)
    self.assertEqual(False, common_settings.cache_client().exists("url_dedup", url[0]))
    # First sighting registers the url; the second reports it existing.
    self.assertEqual((False, True), UrlCacheClient.check_url_exists(url[0]))
    self.assertEqual((True, True), UrlCacheClient.check_url_exists(url[0]))
    # "not_included" is not a tracked field and must be dropped silently.
    update_map = {"crawl_priority" : 1, "crawl_depth" : 2, "crawl_status" : "crawling", "url_class" : None, "last_crawled" : 123, "not_included" : "xyz", "md5" : None}
    self.assertEqual(True, UrlCacheClient.update_url_info(url[0], update_map))
    self.assertEqual({"crawl_priority" : 1, "url_class" : None}, UrlCacheClient.get_url_info(url[0], fields=["crawl_priority", "url_class"]))
    # *_by_status variants only act when the stored status matches.
    self.assertEqual({"crawl_priority" : 1, "url_class" : None}, UrlCacheClient.get_url_info_by_status(url[0], "crawling", fields=["crawl_priority", "url_class"]))
    self.assertEqual(False, UrlCacheClient.get_url_info_by_status(url[0], "notAlive", fields=["crawl_priority", "url_class"]))
    self.assertEqual(True, UrlCacheClient.update_url_info_by_status(url[0], "crawling", {"crawl_priority" : 2}))
    self.assertEqual(False, UrlCacheClient.update_url_info_by_status(url[0], "alive", {"crawl_priority" : 3}))
    # find_and_modify returns the pre-update values of the fields.
    self.assertEqual({"crawl_priority" : 2, "url_class" : None}, UrlCacheClient.find_and_modify_url_info(url[0], {"url_class" : "list"}, {}, ["crawl_priority", "url_class"]))
    self.assertEqual({"crawl_priority" : 2, "url_class" : "list"}, UrlCacheClient.find_and_modify_url_info_by_status(url[0], "crawling", {"crawl_depth" : 3}, {}, ["crawl_priority", "url_class"]))
    self.assertEqual(False, UrlCacheClient.find_and_modify_url_info_by_status(url[0], "notAlive", {"crawl_status" : "alive"}, {}, ["crawl_priority", "url_class"]))
    # by_not_md5 succeeds only while the stored md5 differs from "xyz";
    # after the first call stores it, the second call must fail.
    self.assertEqual({"md5" : None}, UrlCacheClient.find_and_modify_url_info_by_not_md5(url[0], "xyz", {"md5" : "xyz"}, {}, ["md5"]))
    self.assertEqual(False, 
    UrlCacheClient.find_and_modify_url_info_by_not_md5(url[0], "xyz", {"md5" : "xyz"}, {}, ["md5"]))
    self.assertEqual({"crawl_priority" : 2, "crawl_depth" : 3, "crawl_status" : "crawling", "url_class" : "list", "last_crawled" : 123, "md5" : "xyz"}, UrlCacheClient.get_url_info(url[0]))
    # Marking notAlive evicts the url entry and flips the dedup flag
    # to "0", so the url then reads as existing-but-dead.
    self.assertEqual(True, UrlCacheClient.update_url_info(url[0], {"crawl_status" : "notAlive"}))
    self.assertEqual(False, common_settings.cache_client().exists("url", url[0]))
    self.assertEqual("0", common_settings.cache_client().get("url_dedup", url[0]))
    self.assertEqual((True, False), UrlCacheClient.check_url_exists(url[0]))
def launch(hosted_handlers):
    """Register every configured hosted handler and start the manager.

    Wires SIGTERM/SIGINT to a graceful stop and propagates the shared
    stop condition to both the MQ and cache clients before starting.
    """
    # Stop gracefully on kill and on ctrl-c.
    signal.signal(signal.SIGTERM, HostedHandlerManager.stop)
    signal.signal(signal.SIGINT, HostedHandlerManager.stop)
    common_settings.mqclient().set_stop_condition(stop_condition)
    common_settings.cache_client().set_stop_condition(stop_condition)
    all_handler_configs = common_settings.mq_settings["handler_configs"]
    for handler_name, handler_config in hosted_handlers.items():
        handler_type = all_handler_configs[handler_name]["type"]
        # TODO get settings module from config
        settings_name = all_handler_configs[handler_name].get("settings", None)
        if settings_name is not None:
            # Import the handler-specific settings module and let it
            # override the common settings.
            settings_module = __import__(settings_name, {}, {}, [''])
            common_settings.override_settings(settings_module)
        concurrency = int(handler_config.get("concurrency", "0"))
        HostedHandlerManager.register_handler(handler_name, handler_type, concurrency)
    HostedHandlerManager.start()
def _get_robot_parser(scheme, host):
    """Return a RobotFileParser for *host*, using two cache layers.

    Lookup order: in-process object_cache, then the shared robots_txt
    cache, then an HTTP fetch (which also populates the shared cache).
    Returns None when robots.txt cannot be fetched at all.
    """
    cached_parser = object_cache.get("robot_parser", host)
    if cached_parser is not None:
        return cached_parser
    robots_url = "%s://%s/robots.txt" % (scheme, host)
    robots_text = common_settings.cache_client().get("robots_txt", host)
    if robots_text is not None:
        # Shared cache stores raw utf-8 bytes.
        robots_text = robots_text.decode("utf-8", "ignore")
    else:
        robots_text = misc.load_body(robots_url, encoding="utf-8")  # TODO: change this to asynchronous
        if robots_text is None:
            return None
        common_settings.cache_client().set("robots_txt", host,
            data = robots_text.encode("utf-8", "ignore"))
    parser = robotparser.RobotFileParser(robots_url)
    parser.parse(robots_text.splitlines())
    object_cache.set("robot_parser", host, parser)
    return parser
def test_raw_mode(self):
    """In raw mode, hash values come back as unconverted strings."""
    common_settings.redis_cache_config["validation_enabled"] = False
    data_type = "data_type"
    data_key = "data_key"
    common_settings.redis_cache_config["data_types"][data_type] = {"content_type" : "redis/hash", "raw" : True}
    client = common_settings.cache_client()
    client.delete(data_type, data_key)
    client.set(data_type, data_key, update_map = {"a" : "x"}, inc_map = {"b" : 1})
    # Incremented counters are returned as strings, not ints.
    self.assertEqual({"a" : "x", "b" : "1"}, client.get(data_type, data_key))
    client.set(data_type, data_key, inc_map = {"c" : 2})
    self.assertEqual({"a" : "x", "b" : "1", "c" : "2"}, client.get(data_type, data_key))
def get_url_info(cls, ori_url, fields=None):
    """Fetch cached info for *ori_url*.

    found: returns object
    not_in_cache: returns None

    When *fields* is None all tracked fields are returned; a request
    containing any untracked field yields None.

    Fix over the original: `fields == None` replaced with the PEP 8
    identity comparison `fields is None`.
    """
    url = copy.deepcopy(ori_url)
    if fields is None:
        fields = UrlCacheClient._fields
    elif not misc.subset(fields, UrlCacheClient._fields):
        # Reject requests mentioning fields the cache does not track.
        return None
    return common_settings.cache_client().get("url", url, fields=fields)
def test_set(self):
    """redis/set semantics: set() adds a member, get() tests membership."""
    common_settings.redis_cache_config["validation_enabled"] = False
    common_settings.redis_cache_config["data_types"]["data_type"] = {"content_type" : "redis/set"}
    common_settings.load_cache_client(True)
    client = common_settings.cache_client()
    # Passing None as the key clears the whole set.
    client.delete("data_type", None)
    # with_get=True reports prior membership: False on first insert.
    self.assertFalse(client.set("data_type", "first", with_get=True))
    self.assertTrue(client.set("data_type", "first", with_get=True))
    self.assertTrue(client.set("data_type", "first", with_get=False))
    self.assertEqual(True, client.get("data_type", "first"))
    self.assertEqual(False, client.get("data_type", "second"))
    client.delete("data_type", "first")
    self.assertEqual(False, client.get("data_type", "first"))
def test_url_type(self):
    """url-typed hash entries round-trip typed fields; partial updates
    leave untouched fields intact."""
    common_settings.redis_cache_config["validation_enabled"] = False
    common_settings.redis_cache_config["data_types"]["url_test"] = common_settings.redis_cache_config["data_types"]["url"]
    client = common_settings.cache_client()
    url = "http://www.baidu.com"
    queried = ["crawl_priority", "crawl_status", "last_crawled", "crawl_depth", "url_class"]
    info = {"crawl_status" : "crawling", "url_class" : None, "crawl_priority" : 1,
            "crawl_depth" : 0, "last_crawled" : datetime2timestamp(datetime.datetime.utcnow())}
    client.set("url_test", url, update_map = info)
    self.assertEqual(info, client.get("url_test", url, fields = queried))
    info = {"crawl_status" : "alive", "url_class" : "details", "crawl_priority" : 3,
            "crawl_depth" : -1, "last_crawled" : None}
    client.set("url_test", url, update_map = info)
    self.assertEqual(info, client.get("url_test", url, fields = queried))
    # Partial update: crawl_depth keeps its previous value (-1).
    client.set("url_test", url, update_map = {"crawl_priority" : 5, "crawl_status" : "notAlive", "last_crawled" : 123})
    self.assertEqual({"crawl_priority" : 5, "crawl_status" : "notAlive", "last_crawled" : 123, "crawl_depth" : -1},
                     client.get("url_test", url, fields = queried[:4]))
def test_plain(self):
    """text/plain entries: round-trip, with_get, delete, and nx."""
    common_settings.redis_cache_config["validation_enabled"] = False
    common_settings.redis_cache_config["data_types"]["binary_type"] = {"content_type" : "text/plain"}
    common_settings.load_cache_client(True)
    client = common_settings.cache_client()
    first = "xyz"
    self.assertTrue(client.set("binary_type", "abc", data=first))
    self.assertEqual(first, client.get("binary_type", "abc"))
    second = "xyz1"
    # with_get returns the previous value while storing the new one.
    self.assertEqual(first, client.set("binary_type", "abc", data=second, with_get=True))
    self.assertEqual(second, client.get("binary_type", "abc"))
    client.delete("binary_type", "abc")
    self.assertEqual(None, client.get("binary_type", "abc"))
    # nx=True only succeeds when the key is absent.
    self.assertEqual(True, client.set("binary_type", "abc", data=second, nx=True))
    self.assertEqual(False, client.set("binary_type", "abc", data=second, nx=True))
def test_url_dedup_type(self):
    """Url dedup via a redis/set keyed on md5 digests of the urls."""
    common_settings.redis_cache_config["validation_enabled"] = False
    common_settings.redis_cache_config["data_types"]["url_dedup_test"] = {"content_type" : "redis/set"}
    client = common_settings.cache_client()
    client.delete("url_dedup_test", None)
    url_list = ["http://www.baidu.com", "http://www.google.com", "http://www.sina.com.cn"]
    for url in url_list:
        client.set("url_dedup_test", misc.md5(url))
    for url in url_list:
        self.assertEqual(True, client.get("url_dedup_test", misc.md5(url)))
    # The trailing-slash variant hashes differently, so it is absent
    # until explicitly added.
    slash_md5 = misc.md5("http://www.google.com/")
    self.assertEqual(False, client.get("url_dedup_test", slash_md5))
    self.assertFalse(client.set("url_dedup_test", slash_md5, with_get=True))
    self.assertEqual(True, client.get("url_dedup_test", slash_md5))
def test_json(self):
    """text/json entries: round-trip, with_get, delete, and nx."""
    common_settings.redis_cache_config["validation_enabled"] = False
    common_settings.redis_cache_config["data_types"]["json_type"] = {"content_type" : "text/json"}
    common_settings.load_cache_client(True)
    client = common_settings.cache_client()
    original = {"a" : 1, "b" : "xyz"}
    self.assertTrue(client.set("json_type", "abc", data=original))
    self.assertEqual(original, client.get("json_type", "abc"))
    self.assertEqual(None, client.get("json_type", "xyz"))
    replacement = {"a" : 2, "b" : "xyz"}
    # with_get returns the previous value while storing the new one.
    self.assertEqual(original, client.set("json_type", "abc", with_get=True, data=replacement))
    self.assertEqual(replacement, client.get("json_type", "abc"))
    client.delete("json_type", "abc")
    self.assertEqual(None, client.get("json_type", "abc"))
    # nx=True only succeeds when the key is absent.
    self.assertEqual(True, client.set("json_type", "abc", data=original, nx=True))
    self.assertEqual(False, client.set("json_type", "abc", data=original, nx=True))
def has_dns_cache(host):
    """Return True iff an ip is cached for *host* in the dns cache."""
    cached_ip = common_settings.cache_client().get("dns", host)
    return cached_ip is not None
def set_dns_cache(host, ip):
    """Store *ip* for *host* in the dns cache (utf-8 encoded)."""
    encoded_ip = ip.encode("utf-8", "ignore")
    common_settings.cache_client().set("dns", host, data = encoded_ip)
def DNSCacheResolver(host):
    """Resolve *host* through the dns cache, falling back to the
    hostname itself on a cache miss."""
    cached = common_settings.cache_client().get("dns", host)
    return cached if cached is not None else host
def delete_domain(cls, domain):
    """Drop every stored decoding counter for *domain*."""
    cache = common_settings.cache_client()
    cache.delete(DomainDecodingCache._data_key, domain)
def inc_domain_decoding(cls, domain, decoding):
    """Bump the observation counter of *decoding* for *domain* by one."""
    cache = common_settings.cache_client()
    cache.set(DomainDecodingCache._data_key, domain, inc_map = {decoding : 1})