def validate(self, url, parent_url, extras = None):
    ''' url is expected to be already normalized '''
    source_info = misc.get_url_domain_info(url)

    # check whether it's a mobile url
    if url_analyser.is_mobile_url(url):
        return False

    # check whether it matches a negative url pattern
    for pattern in self._settings["negative_url_patterns"]:
        if pattern.match(url):
            return False

    # check whether it has a negative url extension
    parse_result = urlparse.urlparse(url)
    ext = os.path.splitext(parse_result.path)[1].lower()
    if len(ext) > 0 and ext in self._settings["negative_url_extensions"]:
        return False

    # check whether it belongs to a negative url domain
    negative_domains = self._settings["negative_url_domains"]
    if source_info[1].lower() in negative_domains:
        return False

    # TODO: why read from db? just add dependents. this should not be a
    # default action. If needed, the user should implement their own url
    # validator and replace the default one in settings, to do whatever
    # extra checks they want.
    negative_domains = crawlerdb.get_negative_domains()
    if source_info[1].lower() in negative_domains:
        return False

    # check filtering policy
    # what does this setting mean? See options in common/configuration.py
    match_target = self._settings["general_crawl_policies"]["url_match_target"]
    if match_target == "none":
        return False
    elif match_target == "whitelist":
        domain = url_analyser.get_url_domain(source_info)
        # what does this setting mean? See options in common/configuration.py
        domain_type = self._settings["general_crawl_policies"]["url_match_domain_type"]
        # TODO: read from db again... still think this should be done by the user.
        whitelist = crawlerdb.get_crawl_domain_infos(domain_type)
        # the url is valid only if its domain is in the whitelist
        return len(filter(lambda domain_row : domain_row["domain"] == domain, whitelist)) > 0
    elif match_target == "parent_url":
        if parent_url is None:
            return True
        target_info = misc.get_url_domain_info(parent_url)
        return url_analyser.match_url_domain_info(source_info, target_info)
    elif match_target == "all":
        return True
    else:
        raise Exception("not supported match_target %s" % match_target)
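# Rough sketch of the settings shape the validator above reads. The concrete
# keys are taken from the lookups in validate(); the example values are
# illustrative assumptions only -- the real options live in
# common/configuration.py.
#
#   example_settings = {
#       "negative_url_patterns": [re.compile(r".*/ads/.*")],
#       "negative_url_extensions": [".jpg", ".css", ".js"],
#       "negative_url_domains": ["ads.example.com"],
#       "general_crawl_policies": {
#           "url_match_target": "parent_url",   # "none" / "whitelist" / "parent_url" / "all"
#           "url_match_domain_type": "full_domain",
#       },
#   }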
def assign_url_info_defaults(url, url_info):
    url_info["_id"] = misc.md5(url)
    now = datetime2timestamp(datetime.datetime.utcnow())
    url_info["created_time"] = now
    url_info["crawled_count"] = 0
    url_info["url_class"] = None
    url_info["error_messages"] = []
    #url_info["processed_count"] = 0
    #url_info["last_processed"] = None
    url_info["first_modified"] = None
    url_info["last_modified"] = None
    url_info["modified_count"] = 0
    url_info["valid_link_count"] = None
    url_info["retry_count"] = 0
    url_info["status_last_modified"] = now
    url_info["encoding"] = None
    url_info["encoding_created_time"] = None
    url_info["redirect_url"] = None
    #url_info["last_finished"] = None
    #url_info["expires"] = now
    url_info["doc"] = None
    url_info["headers"] = None
    url_info["md5"] = None
    #url_info["process_status"] = True
    url_info["last_discovered"] = now
    url_info["discovered_count"] = 1
    url_info["comments"] = ""
    url_info["redirect_count"] = 0
    url_info["recrawl_time"] = now
    url_info["recrawl_duration"] = 0
    url_info["recrawl_priority"] = url_info["crawl_priority"]
    _, full_domain, _ = misc.get_url_domain_info(url)
    url_info["full_domain"] = full_domain
def save_crawl_domain_info(url, domain_type = "full_domain", crawl_priority = -1, crawl_depth = -1, \
    recrawl_details = False, recrawl_list = False, recrawl_undefined = False):
    # -1 for crawl_priority/crawl_depth means auto config is needed
    domain_info = misc.get_url_domain_info(url)
    domain_types = common_settings.domain_types
    domain = domain_info[domain_types.index(domain_type)]
    update_map = {"domain" : domain,
        "domain_type" : domain_type,
        "url" : url,
        "crawl_priority" : crawl_priority,
        "crawl_depth" : crawl_depth,
        "recrawl_details" : recrawl_details,
        "recrawl_list" : recrawl_list,
        "recrawl_undefined" : recrawl_undefined,
        "_id" : misc.md5(''.join([domain, domain_type])),
    }
    # Note: will overwrite an existing entry for the same domain
    db.crawlDomainWhitelist.save(update_map)
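# Usage sketch (assumed calling convention, based on the defaults above):
#
#   save_crawl_domain_info("http://blog.example.com/post/1",
#                          domain_type = "full_domain",
#                          crawl_priority = 2, crawl_depth = 1)
#
# This saves (or overwrites) a crawlDomainWhitelist row keyed by
# md5(domain + domain_type); leaving crawl_priority/crawl_depth at -1
# defers them to auto configuration.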
def _assign_url_info_defaults(self, url_info):
    now = datetime2timestamp(datetime.datetime.utcnow())
    url_info["created_time"] = now
    url_info["crawled_count"] = 0
    url_info["error_messages"] = []
    url_info["retry_count"] = 0
    url_info["encoding"] = None
    url_info["encoding_created_time"] = None
    url_info["redirect_url"] = None
    #url_info["last_finished"] = None
    #url_info["expires"] = now
    url_info["doc"] = None
    url_info["headers"] = None
    # TODO not used?
    url_info["md5"] = None
    #url_info["process_status"] = True
    url_info["comments"] = ""
    url_info["redirect_count"] = 0
    _, full_domain, _ = misc.get_url_domain_info(url_info['url'])
    url_info["full_domain"] = full_domain
def is_external_url(self, url, parent_url):
    source_info = misc.get_url_domain_info(url)
    target_info = misc.get_url_domain_info(parent_url)
    return not self.match_url_domain_info(source_info, target_info)
def get_crawl_domain_info(self, url):
    domain_type = self._settings["general_crawl_policies"]["url_match_domain_type"]
    domain_info = misc.get_url_domain_info(url)
    domain = self.get_url_domain(domain_info)
    domain_info = crawlerdb.get_crawl_domain_info(domain, domain_type)
    return domain_info