def validate(self, url, parent_url, extras=None):
        '''
        Validate a url (expected to be normalized already); return True if the
        url should be crawled, False otherwise.
        '''

        source_info = misc.get_url_domain_info(url)

        #check whether it's a mobile url
        if url_analyser.is_mobile_url(url):
            return False

        #check whether it's a negative url pattern
        for pattern in self._settings["negative_url_patterns"]:
            if pattern.match(url):
                return False

        #check whether it's a negative url extension
        parse_result = urlparse.urlparse(url)
        ext = os.path.splitext(parse_result.path)[1].lower()
        if len(ext) > 0 and ext in self._settings["negative_url_extensions"]:
            return False

        #check whether it's a negative url domain
        negative_domains = self._settings["negative_url_domains"]
        if source_info[1].lower() in negative_domains:
            return False

        # TODO why read from db? just add dependents. This should not be a
        # default action. If users want it, they should implement their own url
        # validator and replace the default one in settings, so they can do
        # whatever custom filtering they need.
        negative_domains = crawlerdb.get_negative_domains()
        if source_info[1].lower() in negative_domains:
            return False

        #check filtering policy
        # what does this setting mean? See options in common/configuration.py
        match_target = self._settings["general_crawl_policies"]["url_match_target"]
        if match_target == "none":
            return False
        elif match_target == "whitelist":
            domain = url_analyser.get_url_domain(source_info)
            # what does this setting mean? See options in common/configuration.py
            domain_type = self._settings["general_crawl_policies"]["url_match_domain_type"]
            # TODO reads from db again... still think this should be done by
            # the user.
            whitelist = crawlerdb.get_crawl_domain_infos(domain_type)
            # the url is valid only if its domain appears in the whitelist
            return any(domain_row["domain"] == domain for domain_row in whitelist)
        elif match_target == "parent_url":
            if parent_url is None:
                return True
            target_info = misc.get_url_domain_info(parent_url)
            return url_analyser.match_url_domain_info(source_info, target_info)
        elif match_target == "all":
            return True
        else:
            raise Exception("unsupported match_target %s" % match_target)
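
The TODO comments above argue that the database lookups (negative domains and the whitelist) should not be part of the default validator, and that users should be able to plug in their own. Below is a minimal sketch of such a replacement, assuming a hypothetical UrlValidator base class and a hypothetical settings plug-in key (neither is shown in the snippet above); it only reuses the helpers that already appear there (misc, url_analyser, crawlerdb):

class CachedWhitelistValidator(UrlValidator):
    '''Hypothetical validator: load the whitelist once instead of querying per url.'''

    def __init__(self, settings):
        self._settings = settings
        domain_type = settings["general_crawl_policies"]["url_match_domain_type"]
        # one db read at construction time instead of one per validate() call
        whitelist = crawlerdb.get_crawl_domain_infos(domain_type)
        self._whitelist = set(row["domain"] for row in whitelist)

    def validate(self, url, parent_url, extras=None):
        source_info = misc.get_url_domain_info(url)
        domain = url_analyser.get_url_domain(source_info)
        return domain in self._whitelist

# settings["url_validator_class"] = CachedWhitelistValidator  # assumed plug-in key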
Example 2
def assign_url_info_defaults(url, url_info):
    url_info["_id"] = misc.md5(url)
    now = datetime2timestamp(datetime.datetime.utcnow())
    url_info["created_time"] = now
    url_info["crawled_count"] = 0
    url_info["url_class"] = None
    url_info["error_messages"] = []
    #url_info["processed_count"] = 0
    #url_info["last_processed"] = None
    url_info["first_modified"] = None
    url_info["last_modified"] = None
    url_info["modified_count"] = 0
    url_info["valid_link_count"] = None
    url_info["retry_count"] = 0
    url_info["status_last_modified"] = now
    url_info["encoding"] = None
    url_info["encoding_created_time"] = None
    url_info["redirect_url"] = None
    #url_info["last_finished"] = None
    #url_info["expires"] = now
    url_info["doc"] = None
    url_info["headers"] = None
    url_info["md5"] = None
    #url_info["process_status"] = True
    url_info["last_discovered"] = now
    url_info["discovered_count"] = 1
    url_info["comments"] = ""
    url_info["redirect_count"] = 0
    url_info["recrawl_time"] = now
    url_info["recrawl_duration"] = 0
    url_info["recrawl_priority"] = url_info["crawl_priority"]

    _, full_domain, _ = misc.get_url_domain_info(url)
    url_info["full_domain"] = full_domain
Example 3
def save_crawl_domain_info(url, domain_type="full_domain", crawl_priority=-1, crawl_depth=-1,
        recrawl_details=False, recrawl_list=False, recrawl_undefined=False):
    # -1 for crawl_priority / crawl_depth means auto config is needed

    domain_info = misc.get_url_domain_info(url)
    domain_types = common_settings.domain_types
    domain = domain_info[domain_types.index(domain_type)]
    update_map = {"domain" : domain, "domain_type" : domain_type, "url" : url,
        "crawl_priority" : crawl_priority, "crawl_depth" : crawl_depth,
        "recrawl_details" : recrawl_details, "recrawl_list" : recrawl_list, "recrawl_undefined" : recrawl_undefined,
        "_id" :  misc.md5(''.join([domain, domain_type]))
    }

    db.crawlDomainWhitelist.save(update_map)  # Note: will override a duplicate domain (same _id)
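
A short usage sketch for save_crawl_domain_info; the url and argument values are illustrative, and the db connection is assumed to be configured elsewhere:

# whitelist the full domain of this url; -1 leaves crawl_priority/crawl_depth to auto config
save_crawl_domain_info("http://www.example.com/category/books",
                       domain_type="full_domain",
                       crawl_priority=-1, crawl_depth=-1,
                       recrawl_list=True)
# _id is md5(domain + domain_type), so saving the same domain and type again
# overrides the previous row in db.crawlDomainWhitelist.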
Example 4
    def _assign_url_info_defaults(self, url_info):
        now = datetime2timestamp(datetime.datetime.utcnow())
        url_info["created_time"] = now
        url_info["crawled_count"] = 0
        url_info["error_messages"] = []
        url_info["retry_count"] = 0
        url_info["encoding"] = None
        url_info["encoding_created_time"] = None
        url_info["redirect_url"] = None
        #url_info["last_finished"] = None
        #url_info["expires"] = now
        url_info["doc"] = None
        url_info["headers"] = None
        # TODO not used?
        url_info["md5"] = None
        #url_info["process_status"] = True
        url_info["comments"] = ""
        url_info["redirect_count"] = 0

        _, full_domain, _ = misc.get_url_domain_info(url_info['url'])
        url_info["full_domain"] = full_domain
Example 5
    def is_external_url(self, url, parent_url):
        source_info = misc.get_url_domain_info(url)
        target_info = misc.get_url_domain_info(parent_url)
        return not self.match_url_domain_info(source_info, target_info)
Example 6
    def get_crawl_domain_info(self, url):
        domain_type = self._settings["general_crawl_policies"]["url_match_domain_type"]
        domain_info = misc.get_url_domain_info(url)
        domain = self.get_url_domain(domain_info)
        domain_info = crawlerdb.get_crawl_domain_info(domain, domain_type)
        return domain_info