Beispiel #1
0
def update_url_info(url, update_map, inc_map = None):
    """
    Enabled cache
    """

    UrlCacheClient.update_url_info(url, update_map, inc_map)
    cond = default_cond(url)
    _async_update_url_info(cond, update_map, inc_map)
Beispiel #2
0
def _insert_url_info(url, url_info):
    UrlCacheClient.update_url_info(url, url_info)

    first_update_map, second_update_map = misc.separate_dict(url_info, common_settings.database_table_fields["urlRepositoryMeta"])
    misc.copy_dict(first_update_map, second_update_map, common_settings.common_url_info_fields + ["url", "_id"])

    db.urlRepository.insert(first_update_map)
    crawlerMetadb.insert_url_info_meta(second_update_map)
Beispiel #3
0
def find_and_modify_url_info_md5(url, md5_hash):
    """
    Enabled cache
    update md5 fields
    """

    update_map = {"md5" : md5_hash}
    inc_map = None
    fields = ["md5"]

    url_info = UrlCacheClient.find_and_modify_url_info_by_not_md5(url, md5_hash, update_map, inc_map, fields)

    cond = default_cond(url)
    cond["md5"] = {"$ne" : md5_hash}

    if url_info is None:
        url_info =  _cond_update_url_info(cond, update_map, inc_map, fields)
    elif url_info == False:
        url_info = None
    else:
        _async_update_url_info(cond, update_map, inc_map)

    if url_info is None:
        return 0 #duplicate md5
    elif url_info["md5"] is not None:
        return 1 #md5 changed
    else:
        return 2 #first md5
Beispiel #4
0
def add_url_info(url, url_info, merge = False):
    """
    Enabled cache
    """

    assign_url_info_defaults(url, url_info)

    existed, alive = UrlCacheClient.check_url_exists(url)
    if not existed:
        _insert_url_info(url, url_info)
        return True, False
    elif alive and merge:
        now = datetime2timestamp(datetime.datetime.utcnow())
        update_map = {"last_discovered" : now}

        # duplicate crawl request merge, will promote crawl_priority/crawl_depth if any
        fields = ["crawl_priority", "crawl_depth", "crawl_status", "url_class", "last_crawled"]
        existing_url_info = get_url_info(url, fields = fields)
        if existing_url_info is None:
            return False, False
        priority_promoted = _merge_url_info(url, existing_url_info, url_info, update_map)

        promoted = False
        misc.copy_dict(existing_url_info, url_info, fields = ["crawl_status", "url_class", "last_crawled"])
        if common_settings.core_settings["general_crawl_policies"]["preemptive_priority_promotion"] and url_info["last_crawled"] is None and priority_promoted:
            if url_info["crawl_status"] == "crawling":
                update_map["expires"] = now
                promoted = True

        update_url_info(url, update_map, {"discovered_count" : 1})
        return False, promoted
    else:
        return False, False
Beispiel #5
0
def get_url_info(url, fields):
    """
    Enabled cache
    """
    url_info = UrlCacheClient.get_url_info(url, fields)
    if url_info is not None:
        return url_info
    else:
        return _cond_get_url_info(default_cond(url), fields)
Beispiel #6
0
def update_url_info_by_status(url, crawl_status, update_map, inc_map = None):
    """
    Enabled cache
    """

    success = UrlCacheClient.update_url_info_by_status(url, crawl_status, update_map, inc_map)
    cond = default_cond(url)
    cond["crawl_status"] = crawl_status
    _async_update_url_info(cond, update_map, inc_map)
Beispiel #7
0
def find_and_modify_url_info(url, update_map, inc_map, fields):
    """
    Enabled cache
    """

    cond = default_cond(url)
    url_info = UrlCacheClient.find_and_modify_url_info(url, update_map, inc_map, fields)
    if url_info is None:
        return _cond_update_url_info(cond, update_map, inc_map, fields)
    else:
        _async_update_url_info(cond, update_map, inc_map)
        return url_info
Beispiel #8
0
def get_url_info_by_status(url, crawl_status, fields):
    """
    Enabled cache
    """
    url_info = UrlCacheClient.get_url_info_by_status(url, crawl_status, fields)
    if url_info is None:
        cond = default_cond(url)
        cond["crawl_status"] = crawl_status
        return _cond_get_url_info(cond, fields)
    elif url_info == False:
        return None
    else:
        return url_info
Beispiel #9
0
def find_and_modify_url_info_by_status(url, crawl_status, update_map, inc_map, fields):
    """
    Enabled cache
    """

    cond = default_cond(url)
    cond["crawl_status"] = crawl_status
    url_info = UrlCacheClient.find_and_modify_url_info_by_status(url, crawl_status, update_map, inc_map, fields)
    if url_info is None:
        return _cond_update_url_info(cond, update_map, inc_map, fields)
    elif url_info == False:
        return None
    else:
        _async_update_url_info(cond, update_map, inc_map)
        return url_info
    def test_main(self):
        url = ["http://www.xyz.com/xyz", "http://www.xyz.com/xyz"]
        for u in url:
            common_settings.cache_client().delete("url_dedup", u)
            common_settings.cache_client().delete("url", u)

        self.assertEqual(False, common_settings.cache_client().exists("url_dedup", url[0]))
        self.assertEqual((False, True), UrlCacheClient.check_url_exists(url[0]))
        self.assertEqual((True, True), UrlCacheClient.check_url_exists(url[0]))
        update_map = {"crawl_priority" : 1, "crawl_depth" : 2, "crawl_status" : "crawling", "url_class" : None, "last_crawled" : 123, "not_included" : "xyz", "md5" : None}
        self.assertEqual(True, UrlCacheClient.update_url_info(url[0], update_map))
        self.assertEqual({"crawl_priority" : 1, "url_class" : None}, UrlCacheClient.get_url_info(url[0], fields=["crawl_priority", "url_class"]))
        self.assertEqual({"crawl_priority" : 1, "url_class" : None}, UrlCacheClient.get_url_info_by_status(url[0], "crawling", fields=["crawl_priority", "url_class"]))
        self.assertEqual(False, UrlCacheClient.get_url_info_by_status(url[0], "notAlive", fields=["crawl_priority", "url_class"]))
        self.assertEqual(True, UrlCacheClient.update_url_info_by_status(url[0], "crawling", {"crawl_priority" : 2}))
        self.assertEqual(False, UrlCacheClient.update_url_info_by_status(url[0], "alive", {"crawl_priority" : 3}))
        self.assertEqual({"crawl_priority" : 2, "url_class" : None}, UrlCacheClient.find_and_modify_url_info(url[0], {"url_class" : "list"}, {}, ["crawl_priority", "url_class"]))
        self.assertEqual({"crawl_priority" : 2, "url_class" : "list"}, UrlCacheClient.find_and_modify_url_info_by_status(url[0], "crawling", {"crawl_depth" : 3}, {}, ["crawl_priority", "url_class"]))
        self.assertEqual(False, UrlCacheClient.find_and_modify_url_info_by_status(url[0], "notAlive", {"crawl_status" : "alive"}, {}, ["crawl_priority", "url_class"]))
        self.assertEqual({"md5" : None}, UrlCacheClient.find_and_modify_url_info_by_not_md5(url[0], "xyz", {"md5" : "xyz"}, {}, ["md5"]))
        self.assertEqual(False, UrlCacheClient.find_and_modify_url_info_by_not_md5(url[0], "xyz", {"md5" : "xyz"}, {}, ["md5"]))
        self.assertEqual({"crawl_priority" : 2, "crawl_depth" : 3, "crawl_status" : "crawling", "url_class" : "list", "last_crawled" : 123, "md5" : "xyz"}, UrlCacheClient.get_url_info(url[0]))
        self.assertEqual(True, UrlCacheClient.update_url_info(url[0], {"crawl_status" : "notAlive"}))
        self.assertEqual(False, common_settings.cache_client().exists("url", url[0]))
        self.assertEqual("0", common_settings.cache_client().get("url_dedup", url[0]))
        self.assertEqual((True, False), UrlCacheClient.check_url_exists(url[0]))