def _recrawling_url(self, url, url_class): mode = self._settings["recrawl_policies"]["url_class_policies"][url_class]["mode"] if mode == "none": return False elif mode == "all": return True elif mode == "whitelist": domain_info = url_analyser.get_crawl_domain_info(url) if url_class == "details": return domain_info is not None and domain_info["recrawl_details"] elif url_class == "list": return domain_info is not None and domain_info["recrawl_list"] elif url_class == "undefined": return domain_info is not None and domain_info["recrawl_undefined"] else: raise Exception("not supported url class %s" % url_class) else: raise Exception("not supported recrawling mode %s" % mode)
def _determine(self, url, source):
    """Work out the crawl priority and depth for *url*.

    When domain-based overrides are enabled, the url's crawl-domain
    info (if any) supplies priority/depth first; any value still
    unresolved (sentinel -1) falls back to the per-*source* defaults,
    with the depth default further keyed by url type (domain /
    subdomain / others).
    """
    priority = -1
    depth = -1

    # Domain-specific overrides, if the feature is switched on.
    if self._settings["general_crawl_policies"]["domain_based_crawl_priority_and_depth"]:
        domain_info = url_analyser.get_crawl_domain_info(url)
        if domain_info is not None:
            priority = domain_info["crawl_priority"]
            depth = domain_info["crawl_depth"]

    # Fall back to source-level defaults for anything not overridden.
    # NOTE: the settings lookups stay inside the branches so a missing
    # source entry only matters when the fallback is actually needed.
    if priority == -1:
        priority = self._settings["crawl_policies"][source]["crawl_priority"]
    if depth == -1:
        url_type = url_analyser.get_url_type(url)
        depth = self._settings["crawl_policies"][source]["crawl_depth"][url_type]

    return priority, depth