Example #1
    def crawl_link(self, first_url, rule_id, history_stack, sampler):
        # rule_id = self.get_rule_id(first_url)
        print(first_url, "in town!")
        # Advance to the next rule in the URL pattern unless we are already
        # at the leaf level, where there is nothing deeper to crawl.
        if rule_id != len(self.pattern) - 1:
            rule_id += 1

        # Pages are cached on disk with "/" in the URL mapped to "_".
        file_path = self.full_folder + "/" + first_url.replace("/", "_") + ".html"
        page, cluster_id = self.crawler.classify(file_path)
        print(file_path, cluster_id, "file_path for cluster_id")
        with open(file_path, "r") as f:
            contents = f.read().replace("\n", "")
        link_dict = sampler.getAnchor(contents, first_url, sample_flag=False)

        # Keep every outgoing link that matches the current rule and has not
        # been visited or queued yet. A separate set does the duplicate
        # check: available_url_list holds tuples, so the original test
        # "url not in available_url_list" could never match a bare URL.
        available_url_list = []
        seen = set()
        for xpath, link_list in link_dict.items():
            for url in link_list:
                if self.judge_match(url, rule_id):
                    if url not in history_stack and url not in seen:
                        seen.add(url)
                        available_url_list.append((url, first_url, rule_id))
                        print(url, rule_id, self.pattern[rule_id])
        print(available_url_list)
        self.url_stack += available_url_list
        return available_url_list, rule_id
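
Here crawl_link both returns the matched links and pushes them onto self.url_stack, so a driver only has to keep popping the frontier. A minimal sketch of such a loop, assuming a crawler object exposing url_stack and the crawl_link above; sampler and the seed URL are placeholders, not part of the example:

# Hypothetical breadth-first driver; crawler, sampler, and the seed URL
# are assumptions for illustration only.
history_stack = []
crawler.url_stack = [("http://example.com/seed", None, 0)]  # (url, parent, rule_id)
while crawler.url_stack:
    url, parent, rule_id = crawler.url_stack.pop(0)  # FIFO pop -> breadth-first
    if url in history_stack:
        continue
    history_stack.append(url)
    # crawl_link appends the new matches to crawler.url_stack itself.
    crawler.crawl_link(url, rule_id, history_stack, sampler)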
Example #2
    def sample_link(self, first_url, sampler, method="uniform"):
        file_path = self.full_folder + "/" + first_url.replace("/", "_") + ".html"
        page, cluster_id = self.classify(file_path)
        with open(file_path, "r") as f:
            contents = f.read().replace("\n", "")
        link_dict = sampler.getAnchor(contents, first_url, sample_flag=False)
        available_url_list = []
        if method in ("uniform", "pagerank", "indegree"):
            # These baselines need no predicted scores: return bare links.
            print("no predicting scores")
            for xpath, link_list in link_dict.items():
                for link in link_list:
                    if link not in self.history_set:
                        available_url_list.append(link)
        else:
            # Estimate one score per xpath from the count distribution
            # observed for this (cluster, xpath) pair.
            print("predicting scores based on distribution")
            for xpath, link_list in link_dict.items():
                distribution = self.xpath_counts_dict[(cluster_id, xpath)]
                if method == "est_pagerank":
                    score = self.calculate_url_pr(distribution)
                elif method == "est_degree":
                    score = self.calculate_url_indegree(distribution)
                elif method == "est_prob":
                    score = self.calculate_url_prob(distribution)
                else:
                    # Fail fast instead of reusing a stale score from the
                    # previous xpath (or hitting a NameError on the first).
                    raise ValueError("unknown method: %s" % method)
                for link in link_list:
                    if link not in self.history_set:
                        available_url_list.append((link, score))
        return available_url_list, cluster_id
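
In the est_* branches the method returns (link, score) pairs, so a natural next step is to pick the page to visit with probability proportional to its score. A minimal, hypothetical helper for that step (pick_next is not part of the example; it assumes non-negative scores):

import random

def pick_next(scored_links):
    # scored_links: list of (link, score) pairs as returned by sample_link.
    links = [link for link, _ in scored_links]
    weights = [score for _, score in scored_links]
    if sum(weights) <= 0:
        return random.choice(links)  # degenerate case: fall back to uniform
    return random.choices(links, weights=weights, k=1)[0]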
Example #3
    def crawl_link(self, first_url, history_stack, sampler):
        file_path = self.full_folder + "/" + first_url.replace("/", "_") + ".html"
        page, cluster_id = self.classify(file_path)
        self.crawled_cluster_count[cluster_id] += 1
        print(file_path, cluster_id, "file_path for cluster_id")
        with open(file_path, "r") as f:
            contents = f.read().replace("\n", "")
        link_dict = sampler.getAnchor(contents, first_url, sample_flag=False)
        print(link_dict, "link dict")
        # self.transition_dict[url] = link_dict

        # Every link under the same xpath shares one importance score,
        # derived from that xpath's count distribution within the cluster
        # (alternative source: self.cluster_xpath_trans).
        available_url_list = []
        seen = set()
        for xpath, link_list in link_dict.items():
            # distribution = self.cluster_xpath_trans[(cluster_id, xpath)]
            distribution = self.xpath_counts_dict[(cluster_id, xpath)]
            score = self.calculate_url_importance(distribution)
            for url in link_list:
                # seen does the duplicate check: available_url_list holds
                # tuples, so "url not in available_url_list" never matched.
                if url not in history_stack and url not in seen:
                    seen.add(url)
                    available_url_list.append(
                        (url, first_url, (cluster_id, xpath), score))
        return available_url_list, cluster_id
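
Since every tuple returned here carries a score, a caller can run a best-first crawl by keeping the frontier in a heap. A minimal sketch (heapq is a min-heap, so scores are negated, and a counter breaks ties so the heap never compares the payload tuples); the crawler, sampler, and seed URL are assumptions:

import heapq
import itertools

counter = itertools.count()   # tie-breaker for equal scores
frontier = []
history_stack = ["http://example.com/seed"]  # placeholder seed

def push_all(links):
    for url, parent, ctx, score in links:
        heapq.heappush(frontier, (-score, next(counter), url, parent, ctx))

push_all(crawler.crawl_link(history_stack[0], history_stack, sampler)[0])
while frontier:
    neg_score, _, url, parent, ctx = heapq.heappop(frontier)
    if url in history_stack:
        continue
    history_stack.append(url)
    push_all(crawler.crawl_link(url, history_stack, sampler)[0])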