def _recrawling_url(self, url, url_class):
     mode = self._settings["recrawl_policies"]["url_class_policies"][url_class]["mode"]
     if mode == "none":
         return False
     elif mode == "all":
         return True
     elif mode == "whitelist":
         domain_info = url_analyser.get_crawl_domain_info(url)
         if url_class == "details":
             return domain_info is not None and domain_info["recrawl_details"]
         elif url_class == "list":
             return domain_info is not None and domain_info["recrawl_list"]
         elif url_class == "undefined":
             return domain_info is not None and domain_info["recrawl_undefined"]
         else:
             raise Exception("not supported url class %s" % url_class)
     else:
         raise Exception("not supported recrawling mode %s" % mode)
    def _determine(self, url, source):
        """Resolve the crawl priority and depth for *url*.

        When domain-based overrides are enabled, values come from the
        URL's domain record first; anything still unresolved (-1) falls
        back to the per-source defaults in the crawl policies, with the
        default depth further keyed by the URL's type (domain,
        subdomain, others).

        Returns a (crawl_priority, crawl_depth) tuple.
        """
        # -1 marks "not yet determined" for both values.
        crawl_priority = crawl_depth = -1

        # Domain-specific overrides take precedence when enabled.
        if self._settings["general_crawl_policies"]["domain_based_crawl_priority_and_depth"]:
            domain_info = url_analyser.get_crawl_domain_info(url)
            if domain_info is not None:
                crawl_priority = domain_info["crawl_priority"]
                crawl_depth = domain_info["crawl_depth"]

        # Fall back to the defaults configured for this crawl source.
        source_policy = self._settings["crawl_policies"][source]
        if crawl_priority == -1:
            crawl_priority = source_policy["crawl_priority"]
        if crawl_depth == -1:
            # Default depth varies with the URL type (domain, subdomain, others).
            url_type = url_analyser.get_url_type(url)
            crawl_depth = source_policy["crawl_depth"][url_type]

        return crawl_priority, crawl_depth