Beispiel #1
0
 def check_url_exists(cls, ori_url):
     """Atomically record a URL in the dedup set and report its state.

     Returns (existed, alive): existed is True when the URL was already
     registered; alive is True unless the dedup entry was set to "0"
     (done when a url is deleted elsewhere).
     """
     url = copy.deepcopy(ori_url)
     cache = common_settings.cache_client()
     # nx=True: the write only succeeds when no entry exists yet.
     newly_registered = cache.set("url_dedup", url, data = "1", nx = True)
     if newly_registered:
         return False, True
     # Already registered: report whether it is still flagged alive.
     return True, cache.get("url_dedup", url) == "1"
 def get_domain_decoding(cls, domain):
     """Return the decoding with the highest recorded count for *domain*.

     The cached value maps decoding name -> count (string); returns None
     when nothing has been recorded for the domain.
     """
     counts = common_settings.cache_client().get(DomainDecodingCache._data_key, domain)
     if not counts:
         return None
     best_decoding = None
     best_count = None
     # Scan for the maximum count; ties keep the first entry seen, which
     # matches max() over items().
     for decoding, count in counts.items():
         numeric = int(count)
         if best_count is None or numeric > best_count:
             best_count = numeric
             best_decoding = decoding
     return best_decoding
Beispiel #3
0
    def _update_url_info(cls, url, update_map, inc_map, cond=None, with_get=False, fields=None):
        """Apply *update_map*/*inc_map* to the cached record for *url*.

        When the update sets crawl_status to "failed" or "notAlive" the
        cached record is removed after the set call and the dedup flag
        is reset to "0".

        Returns whatever cache_client().set() returns (previous field
        values when with_get is True, otherwise a success flag).
        """
        # Avoid the shared-mutable-default pitfall; None means "no fields".
        if fields is None:
            fields = []
        # dict.has_key() was removed in Python 3; `in` is equivalent on 2 and 3.
        deleting = "crawl_status" in update_map and update_map["crawl_status"] in ["failed", "notAlive"]
        if not deleting:
            # Keep only fields the url cache schema declares.
            cache_update_map = misc.clone_dict(update_map, UrlCacheClient._fields, soft=True)
            cache_inc_map = misc.clone_dict(inc_map, UrlCacheClient._fields, soft=True)
        else:
            cache_update_map = {}
            cache_inc_map = {}

        ret_value = common_settings.cache_client().set("url", url, update_map = cache_update_map, inc_map = cache_inc_map, cond = cond, with_get = with_get, fields = fields)

        if deleting:
            # Drop the record and mark the dedup entry as not alive.
            common_settings.cache_client().delete("url", url)
            common_settings.cache_client().set("url_dedup", url, data = "0")

        return ret_value
    def test_hash(self):
        """Exercise the redis/hash cache type end to end.

        Covers typed field conversion (int/long and a custom bool codec),
        partial updates, inc_map increments, with_get read-back, single
        field deletion, strict vs non-strict gets, and conditional sets.
        """
        common_settings.redis_cache_config["validation_enabled"] = False
        # Field spec: bare names are plain strings; ("a", int) converts on
        # read; ("f", decode, encode) installs a two-way string<->bool codec.
        common_settings.redis_cache_config["data_types"]["hash_type"] = {"content_type" : "redis/hash", "fields" : [("a", int), "b", "c", ("e", long), "d", ("f", lambda v : v == "True", lambda v : "True" if v else "False")]}
        common_settings.load_cache_client(True)
        client = common_settings.cache_client()
        client.delete("hash_type", "abc")
        data = {"a" : 1, "b" : "xyz"}
        self.assertTrue(client.set("hash_type", "abc", update_map=data))
        self.assertEqual({"b" : "xyz", "a" : 1}, client.get("hash_type", "abc", fields = ["b", "a"]))
        self.assertEqual({"b" : "xyz"}, client.get("hash_type", "abc", fields = ["b"]))
        # Backslash-escaped string values must round-trip unchanged.
        self.assertTrue(client.set("hash_type", "abc", update_map={"c" : "True", "b" : "\\None", "d" : "\\\\None"}))
        self.assertEqual({"a" : 1, "c" : "True", "b": "\\None", "d" : "\\\\None"}, client.get("hash_type", "abc", fields = ["a", "c", "b", "d"]))
        # None is storable; inc_map increments "a" 1->2 and creates "e" as 2.
        self.assertTrue(client.set("hash_type", "abc", update_map={"b" : None, "f" : 1}, inc_map={"a" : 1, "e" : 2}))
        self.assertEqual({"f" : True, "b" : None, "a" : 2, "e" : 2}, client.get("hash_type", "abc", fields = ["b", "a", "e", "f"]))
        # with_get=True on set returns the value from BEFORE the increment.
        self.assertEqual({"a" : 2}, client.set("hash_type", "abc", inc_map={"a" : 3}, with_get=True, fields=["a"]))
        self.assertEqual({"a" : 5}, client.set("hash_type", "abc", inc_map={"a" : 3}, with_get=True, fields=["a"]))
        self.assertEqual({"a" : 8}, client.get("hash_type", "abc", fields=["a"]))
        # After deleting field "b": non-strict get yields None for it,
        # strict get returns None for the whole request on a missing field.
        client.delete("hash_type", "abc", "b")
        self.assertEqual({"a" : 8, "b" : None}, client.get("hash_type", "abc", fields = ["a", "b"], strict=False))
        self.assertEqual(None, client.get("hash_type", "abc", fields=["a", "not_existed"], strict=True))

        # Incrementing a non-numeric field must fail.
        self._test_fail(lambda _ : client.set("hash_type", "abc", inc_map = {"c" : 1}))

        # cond: the set only applies when func(fetched fields) is truthy.
        self.assertEqual({"a" : 8}, client.set("hash_type", "abc", inc_map = {"a" : 1}, with_get=True, fields=["a"], cond={"fields" : ["f"], "func" : lambda r : r["f"]}))
        self.assertEqual({"a" : 9}, client.get("hash_type", "abc", fields=["a"]))

        # Failed condition: set returns False and the value is untouched.
        self.assertEqual(False, client.set("hash_type", "abc", inc_map = {"a" : 1}, with_get=True, fields=["a"], cond={"fields" : ["f"], "func" : lambda r : not r["f"]}))
        self.assertEqual({"a" : 9}, client.get("hash_type", "abc", fields=["a"]))

        # Same conditions with with_get=False: plain True/False result.
        self.assertEqual(False, client.set("hash_type", "abc", inc_map = {"a" : 1}, with_get=False, fields=["a"], cond={"fields" : ["f"], "func" : lambda r : not r["f"]}))
        self.assertEqual(True, client.set("hash_type", "abc", inc_map = {"a" : 1}, with_get=False, fields=["a"], cond={"fields" : ["f"], "func" : lambda r : r["f"]}))
        self.assertEqual({"a" : 10}, client.get("hash_type", "abc", fields=["a"]))
 def test_not_existed_type(self):
     """Set and get against an unregistered data type must both fail."""
     common_settings.redis_cache_config["validation_enabled"] = False
     common_settings.load_cache_client(True)
     client = common_settings.cache_client()
     payload = {"a" : 1, "b" : "xyz"}
     # "binaryxx" is not declared in redis_cache_config["data_types"].
     self._test_fail(lambda _ : client.set("binaryxx", "abc", data=payload))
     self._test_fail(lambda _ : client.get("binaryxx", "abc"))
 def test_validation(self):
     """With validation enabled a text/json value still round-trips."""
     common_settings.redis_cache_config["validation_enabled"] = True
     common_settings.redis_cache_config["data_types"]["json_type"] = {"content_type" : "text/json"}
     common_settings.load_cache_client(True)
     client = common_settings.cache_client()
     expected = {"a" : 1, "b" : "xyz"}
     client.set("json_type", "abc", data=expected)
     self.assertEqual(expected, client.get("json_type", "abc"))
    def test_main(self):
        """End-to-end exercise of the UrlCacheClient wrapper.

        Walks one url through dedup registration, info updates, status-
        and md5-conditional updates, find-and-modify, and removal via a
        "notAlive" status (which also flips the dedup flag to "0").
        """
        url = ["http://www.xyz.com/xyz", "http://www.xyz.com/xyz"]
        # Start from a clean slate in both the dedup and info namespaces.
        for u in url:
            common_settings.cache_client().delete("url_dedup", u)
            common_settings.cache_client().delete("url", u)

        # First sighting registers the url -> (existed=False, alive=True);
        # the second call finds it -> (existed=True, alive=True).
        self.assertEqual(False, common_settings.cache_client().exists("url_dedup", url[0]))
        self.assertEqual((False, True), UrlCacheClient.check_url_exists(url[0]))
        self.assertEqual((True, True), UrlCacheClient.check_url_exists(url[0]))
        # "not_included" is outside the schema and is silently dropped.
        update_map = {"crawl_priority" : 1, "crawl_depth" : 2, "crawl_status" : "crawling", "url_class" : None, "last_crawled" : 123, "not_included" : "xyz", "md5" : None}
        self.assertEqual(True, UrlCacheClient.update_url_info(url[0], update_map))
        self.assertEqual({"crawl_priority" : 1, "url_class" : None}, UrlCacheClient.get_url_info(url[0], fields=["crawl_priority", "url_class"]))
        # *_by_status variants only act when the stored status matches.
        self.assertEqual({"crawl_priority" : 1, "url_class" : None}, UrlCacheClient.get_url_info_by_status(url[0], "crawling", fields=["crawl_priority", "url_class"]))
        self.assertEqual(False, UrlCacheClient.get_url_info_by_status(url[0], "notAlive", fields=["crawl_priority", "url_class"]))
        self.assertEqual(True, UrlCacheClient.update_url_info_by_status(url[0], "crawling", {"crawl_priority" : 2}))
        self.assertEqual(False, UrlCacheClient.update_url_info_by_status(url[0], "alive", {"crawl_priority" : 3}))
        # find_and_modify returns the PRE-update values of the named fields.
        self.assertEqual({"crawl_priority" : 2, "url_class" : None}, UrlCacheClient.find_and_modify_url_info(url[0], {"url_class" : "list"}, {}, ["crawl_priority", "url_class"]))
        self.assertEqual({"crawl_priority" : 2, "url_class" : "list"}, UrlCacheClient.find_and_modify_url_info_by_status(url[0], "crawling", {"crawl_depth" : 3}, {}, ["crawl_priority", "url_class"]))
        self.assertEqual(False, UrlCacheClient.find_and_modify_url_info_by_status(url[0], "notAlive", {"crawl_status" : "alive"}, {}, ["crawl_priority", "url_class"]))
        # by_not_md5 applies only while the stored md5 differs from "xyz";
        # the first call sets it, so the second one is rejected.
        self.assertEqual({"md5" : None}, UrlCacheClient.find_and_modify_url_info_by_not_md5(url[0], "xyz", {"md5" : "xyz"}, {}, ["md5"]))
        self.assertEqual(False, UrlCacheClient.find_and_modify_url_info_by_not_md5(url[0], "xyz", {"md5" : "xyz"}, {}, ["md5"]))
        self.assertEqual({"crawl_priority" : 2, "crawl_depth" : 3, "crawl_status" : "crawling", "url_class" : "list", "last_crawled" : 123, "md5" : "xyz"}, UrlCacheClient.get_url_info(url[0]))
        # notAlive removes the record and marks the dedup flag "0", so the
        # url now reads back as existing but no longer alive.
        self.assertEqual(True, UrlCacheClient.update_url_info(url[0], {"crawl_status" : "notAlive"}))
        self.assertEqual(False, common_settings.cache_client().exists("url", url[0]))
        self.assertEqual("0", common_settings.cache_client().get("url_dedup", url[0]))
        self.assertEqual((True, False), UrlCacheClient.check_url_exists(url[0]))
Beispiel #8
0
def launch(hosted_handlers):
    """Register the configured handlers and run the handler manager.

    hosted_handlers maps handler name -> its local config dict (e.g. the
    "concurrency" override).  Handler type and optional settings module
    come from common_settings.mq_settings["handler_configs"].
    """
    # Shut down cleanly on SIGTERM and on ctrl-c.
    signal.signal(signal.SIGTERM, HostedHandlerManager.stop)
    signal.signal(signal.SIGINT, HostedHandlerManager.stop)

    common_settings.mqclient().set_stop_condition(stop_condition)
    common_settings.cache_client().set_stop_condition(stop_condition)


    for name, local_config in hosted_handlers.items():
        registered = common_settings.mq_settings["handler_configs"][name]
        handler_type = registered["type"]
        # TODO get settings module from config
        settings_module = registered.get("settings", None)
        if settings_module is not None:
            # Import the per-handler settings module and overlay it.
            settings_module = __import__(settings_module, {}, {}, [''])
            common_settings.override_settings(settings_module)
        concurrency = int(local_config.get("concurrency", "0"))
        HostedHandlerManager.register_handler(name, handler_type, concurrency)

    HostedHandlerManager.start()
Beispiel #9
0
def _get_robot_parser(scheme, host):
    """Return a RobotFileParser for *host*, building and caching on demand.

    The raw robots.txt body is kept in the shared cache (utf-8 bytes);
    the parsed object is memoized per-process in object_cache.  Returns
    None when robots.txt cannot be fetched.
    """
    parser = object_cache.get("robot_parser", host)
    if parser is not None:
        return parser

    robots_url = "%s://%s/robots.txt" % (scheme, host)
    body = common_settings.cache_client().get("robots_txt", host)
    if body is not None:
        # Cache hit: stored as utf-8 bytes, decode back to text.
        body = body.decode("utf-8", "ignore")
    else:
        body = misc.load_body(robots_url, encoding="utf-8")#TODO: change this to asynchronous
        if body is None:
            return None
        # Store the freshly fetched body for other workers.
        common_settings.cache_client().set("robots_txt", host, data = body.encode("utf-8", "ignore"))

    parser = robotparser.RobotFileParser(robots_url)
    parser.parse(body.splitlines())
    object_cache.set("robot_parser", host, parser)
    return parser
Beispiel #10
0
    def test_raw_mode(self):
        """In raw mode hash values come back as unconverted strings."""
        common_settings.redis_cache_config["validation_enabled"] = False
        type_name = "data_type"
        key = "data_key"
        common_settings.redis_cache_config["data_types"][type_name] = {"content_type" : "redis/hash", "raw" : True}
        client = common_settings.cache_client()
        client.delete(type_name, key)

        client.set(type_name, key, update_map = {"a" : "x"}, inc_map = {"b" : 1})
        # The incremented counter reads back as the string '1', not int 1.
        self.assertEqual({"a" : "x", "b" : '1'}, client.get(type_name, key))
        client.set(type_name, key, inc_map = {"c" : 2})
        self.assertEqual({"a" : "x", "b" : '1', "c" : '2'}, client.get(type_name, key))
Beispiel #11
0
    def get_url_info(cls, ori_url, fields=None):
        """Fetch the cached record for *ori_url*.

        fields: optional subset of UrlCacheClient._fields to retrieve;
        None means all known fields.

        Returns the field dict when found; None when the url is not in
        the cache or an unknown field was requested.
        """

        url = copy.deepcopy(ori_url)

        # `is None`, not `== None`: identity test for the sentinel default.
        if fields is None:
            fields = UrlCacheClient._fields
        elif not misc.subset(fields, UrlCacheClient._fields):
            # Reject requests for fields outside the url schema.
            return None

        return common_settings.cache_client().get("url", url, fields=fields)
Beispiel #12
0
    def test_set(self):
        """Membership semantics of a redis/set cache type."""
        common_settings.redis_cache_config["validation_enabled"] = False
        common_settings.redis_cache_config["data_types"]["data_type"] = {"content_type" : "redis/set"}
        common_settings.load_cache_client(True)
        client = common_settings.cache_client()
        # key=None wipes the whole set.
        client.delete("data_type", None)

        # with_get=True reports whether the member already existed.
        self.assertFalse(client.set("data_type", "first", with_get=True))
        self.assertTrue(client.set("data_type", "first", with_get=True))
        self.assertTrue(client.set("data_type", "first", with_get=False))
        self.assertEqual(True, client.get("data_type", "first"))
        self.assertEqual(False, client.get("data_type", "second"))
        # Removing the member makes the lookup report absence again.
        client.delete("data_type", "first")
        self.assertEqual(False, client.get("data_type", "first"))
Beispiel #13
0
    def test_url_type(self):
        """Round-trip url records through a clone of the 'url' hash schema."""
        common_settings.redis_cache_config["validation_enabled"] = False
        # Reuse the production url schema under a scratch type name.
        common_settings.redis_cache_config["data_types"]["url_test"] = common_settings.redis_cache_config["data_types"]["url"]
        client = common_settings.cache_client()
        all_fields = ["crawl_priority", "crawl_status", "last_crawled", "crawl_depth", "url_class"]
        url = "http://www.baidu.com"

        record = {"crawl_status" : "crawling", "url_class" : None, "crawl_priority" : 1, "crawl_depth" : 0, "last_crawled" : datetime2timestamp(datetime.datetime.utcnow())}
        client.set("url_test", url, update_map = record)
        self.assertEqual(record, client.get("url_test", url, fields = all_fields))

        record = {"crawl_status" : "alive", "url_class" : "details", "crawl_priority" : 3, "crawl_depth" : -1, "last_crawled" : None}
        client.set("url_test", url, update_map = record)
        self.assertEqual(record, client.get("url_test", url, fields = all_fields))

        # A partial update leaves the untouched fields (crawl_depth) intact.
        client.set("url_test", url, update_map = {"crawl_priority" : 5, "crawl_status" : "notAlive", "last_crawled" : 123})
        self.assertEqual({"crawl_priority" : 5, "crawl_status" : "notAlive", "last_crawled" : 123, "crawl_depth" : -1}, client.get("url_test", url, fields = ["crawl_priority", "crawl_status", "last_crawled", "crawl_depth"]))
Beispiel #14
0
 def test_plain(self):
     """Set/get/delete round-trips for a text/plain cache type."""
     common_settings.redis_cache_config["validation_enabled"] = False
     common_settings.redis_cache_config["data_types"]["binary_type"] = {"content_type" : "text/plain"}
     common_settings.load_cache_client(True)
     client = common_settings.cache_client()
     original = "xyz"
     self.assertTrue(client.set("binary_type", "abc", data=original))
     self.assertEqual(original, client.get("binary_type", "abc"))
     replacement = "xyz1"
     # with_get=True hands back the value that was overwritten.
     self.assertEqual(original, client.set("binary_type", "abc", data=replacement, with_get=True))
     self.assertEqual(replacement, client.get("binary_type", "abc"))
     client.delete("binary_type", "abc")
     self.assertEqual(None, client.get("binary_type", "abc"))
     # nx=True only succeeds while the key is absent.
     self.assertEqual(True, client.set("binary_type", "abc", data=replacement, nx=True))
     self.assertEqual(False, client.set("binary_type", "abc", data=replacement, nx=True))
Beispiel #15
0
    def test_url_dedup_type(self):
        """md5-keyed membership checks against a redis/set type."""
        common_settings.redis_cache_config["validation_enabled"] = False
        common_settings.redis_cache_config["data_types"]["url_dedup_test"] = {"content_type" : "redis/set"}
        client = common_settings.cache_client()
        client.delete("url_dedup_test", None)
        seen_urls = ["http://www.baidu.com", "http://www.google.com", "http://www.sina.com.cn"]
        for seen in seen_urls:
            client.set("url_dedup_test", misc.md5(seen))

        for seen in seen_urls:
            self.assertEqual(True, client.get("url_dedup_test", misc.md5(seen)))

        # The trailing slash hashes differently, so it is a new member.
        unseen = misc.md5("http://www.google.com/")
        self.assertEqual(False, client.get("url_dedup_test", unseen))
        self.assertFalse(client.set("url_dedup_test", unseen, with_get=True))
        self.assertEqual(True, client.get("url_dedup_test", unseen))
Beispiel #16
0
 def test_json(self):
     """text/json round-trips dicts and supports with_get and nx."""
     common_settings.redis_cache_config["validation_enabled"] = False
     common_settings.redis_cache_config["data_types"]["json_type"] = {"content_type" : "text/json"}
     common_settings.load_cache_client(True)
     client = common_settings.cache_client()
     first = {"a" : 1, "b" : "xyz"}
     self.assertTrue(client.set("json_type", "abc", data=first))
     self.assertEqual(first, client.get("json_type", "abc"))
     self.assertEqual(None, client.get("json_type", "xyz"))
     second = {"a" : 2, "b" : "xyz"}
     # with_get=True hands back the previous stored value.
     self.assertEqual(first, client.set("json_type", "abc", with_get=True, data=second))
     self.assertEqual(second, client.get("json_type", "abc"))
     client.delete("json_type", "abc")
     self.assertEqual(None, client.get("json_type", "abc"))
     # nx=True only succeeds while the key is absent.
     self.assertEqual(True, client.set("json_type", "abc", data=first, nx=True))
     self.assertEqual(False, client.set("json_type", "abc", data=first, nx=True))
Beispiel #17
0
def has_dns_cache(host):
    """Return True when a DNS entry for *host* is present in the cache."""
    cached_ip = common_settings.cache_client().get("dns", host)
    return cached_ip is not None
Beispiel #18
0
def set_dns_cache(host, ip):
    """Store the resolved *ip* for *host* in the shared cache as utf-8 bytes."""
    encoded_ip = ip.encode("utf-8", "ignore")
    common_settings.cache_client().set("dns", host, data = encoded_ip)
Beispiel #19
0
def DNSCacheResolver(host):
    """Resolve *host* via the DNS cache; fall back to the host name itself."""
    cached_ip = common_settings.cache_client().get("dns", host)
    return cached_ip if cached_ip is not None else host
 def delete_domain(cls, domain):
     """Drop all cached decoding counts recorded for *domain*."""
     cache = common_settings.cache_client()
     cache.delete(DomainDecodingCache._data_key, domain)
 def inc_domain_decoding(cls, domain, decoding):
     """Increment the observed count for *decoding* under *domain* by one."""
     cache = common_settings.cache_client()
     cache.set(DomainDecodingCache._data_key, domain, inc_map = {decoding : 1})