def crawl_link(self, first_url, rule_id, history_stack, sampler):
    """Collect the new links on a saved page that match the next pattern rule.

    The page for ``first_url`` is read from ``self.full_folder`` (slashes in
    the URL are replaced by underscores and ``.html`` is appended), its
    anchors are obtained from ``sampler.getAnchor`` grouped by xpath, and
    every URL accepted by ``self.judge_match(url, rule_id)`` that is neither
    in ``history_stack`` nor already queued by this call is kept.

    Args:
        first_url: URL of the page whose saved HTML is inspected.
        rule_id: Index into ``self.pattern`` of the rule ``first_url``
            belongs to; it is advanced by one unless it is the last index.
        history_stack: Container of URLs that must not be queued again.
        sampler: Object exposing ``getAnchor(contents, url, sample_flag=...)``
            and returning a mapping of xpath -> list of URLs.

    Returns:
        A tuple ``(available_url_list, rule_id)`` where the list holds
        ``(url, first_url, rule_id)`` triples and ``rule_id`` is the rule
        index that was used for matching.

    Side effects:
        Extends ``self.url_stack`` with the returned list and prints
        progress information to stdout.
    """
    print("%s %s" % (first_url, "in town!"))

    # Already at the leaf rule: there is no deeper rule to advance to.
    # NOTE(review): the original indentation was lost; this assumes only the
    # increment is guarded by the leaf check -- confirm against the caller.
    if rule_id != len(self.pattern) - 1:
        rule_id += 1

    file_path = self.full_folder + "/" + first_url.replace("/", "_") + ".html"
    _, cluster_id = self.crawler.classify(file_path)
    print("%s %s %s" % (file_path, cluster_id, "file_path for cluster_id"))

    # Use a context manager so the file handle is released deterministically.
    with open(file_path, "r") as page_file:
        contents = page_file.read().replace("\n", "")
    link_dict = sampler.getAnchor(contents, first_url, sample_flag=False)

    available_url_list = []
    # The result list holds tuples, so testing a bare URL against it can
    # never match; track the queued URLs separately to de-duplicate.
    queued_urls = set()
    for xpath in link_dict:
        for url in link_dict[xpath]:
            if not self.judge_match(url, rule_id):
                continue
            if url in history_stack or url in queued_urls:
                continue
            queued_urls.add(url)
            available_url_list.append((url, first_url, rule_id))
            print("%s %s %s" % (url, rule_id, self.pattern[rule_id]))

    print("%s" % (available_url_list,))
    self.url_stack += available_url_list
    return available_url_list, rule_id
def crawl_link(self, first_url, rule_id, history_stack, sampler):
    """Gather unseen links from a stored page that satisfy the next rule.

    Reads the HTML saved for ``first_url`` under ``self.full_folder``
    (``/`` in the URL becomes ``_`` and ``.html`` is appended), retrieves the
    page's anchors per xpath via ``sampler.getAnchor`` and keeps each URL
    that ``self.judge_match(url, rule_id)`` accepts, provided it is not in
    ``history_stack`` and has not already been queued during this call.

    Args:
        first_url: URL of the stored page to inspect.
        rule_id: Index into ``self.pattern`` for ``first_url``; incremented
            by one unless it already is the last index.
        history_stack: Container of URLs to exclude from the result.
        sampler: Object whose ``getAnchor(contents, url, sample_flag=...)``
            returns a mapping of xpath -> list of URLs.

    Returns:
        ``(available_url_list, rule_id)``: a list of
        ``(url, first_url, rule_id)`` triples together with the rule index
        used for matching.

    Side effects:
        Appends the returned entries to ``self.url_stack`` and prints
        progress information to stdout.
    """
    print("%s %s" % (first_url, "in town!"))

    # Already at the leaf rule: there is no deeper rule to advance to.
    # NOTE(review): the original indentation was lost; this assumes only the
    # increment is guarded by the leaf check -- confirm against the caller.
    last_rule = len(self.pattern) - 1
    if rule_id != last_rule:
        rule_id += 1

    file_path = self.full_folder + "/" + first_url.replace("/", "_") + ".html"
    _, cluster_id = self.crawler.classify(file_path)
    print("%s %s %s" % (file_path, cluster_id, "file_path for cluster_id"))

    # Close the file promptly instead of leaving it to the garbage collector.
    with open(file_path, "r") as handle:
        contents = handle.read().replace("\n", "")
    link_dict = sampler.getAnchor(contents, first_url, sample_flag=False)

    available_url_list = []
    # Entries in the result are tuples, so a membership test with the bare
    # URL would never hit; remember queued URLs in a set instead.
    already_queued = set()
    for xpath in link_dict:
        for url in link_dict[xpath]:
            if not self.judge_match(url, rule_id):
                continue
            if url in history_stack or url in already_queued:
                continue
            already_queued.add(url)
            available_url_list.append((url, first_url, rule_id))
            print("%s %s %s" % (url, rule_id, self.pattern[rule_id]))

    print("%s" % (available_url_list,))
    self.url_stack += available_url_list
    return available_url_list, rule_id
def sample_link(self, first_url, sampler, method="uniform"):
    """List the unvisited links on a saved page, optionally with a score.

    The HTML stored for ``first_url`` under ``self.full_folder`` is read,
    classified with ``self.classify`` and passed to ``sampler.getAnchor`` to
    obtain its anchors grouped by xpath. Links already present in
    ``self.history_set`` are skipped.

    Args:
        first_url: URL of the stored page to inspect.
        sampler: Object whose ``getAnchor(contents, url, sample_flag=...)``
            returns a mapping of xpath -> list of URLs.
        method: ``"uniform"``, ``"pagerank"`` or ``"indegree"`` return bare
            links; ``"est_pagerank"``, ``"est_degree"`` and ``"est_prob"``
            return ``(link, score)`` pairs where the score is computed from
            ``self.xpath_counts_dict[(cluster_id, xpath)]``.

    Returns:
        ``(available_url_list, cluster_id)`` where ``cluster_id`` is the
        second element returned by ``self.classify``.

    Raises:
        ValueError: If ``method`` is not one of the names listed above
            (previously this surfaced as an unbound ``score`` variable).
    """
    unscored_methods = ("uniform", "pagerank", "indegree")
    # Map each scoring method to the name of the method on self that
    # implements it; looked up lazily so only the one needed must exist.
    scorer_names = {
        "est_pagerank": "calculate_url_pr",
        "est_degree": "calculate_url_indegree",
        "est_prob": "calculate_url_prob",
    }
    if method not in unscored_methods and method not in scorer_names:
        raise ValueError("unknown sampling method: %r" % (method,))

    file_path = self.full_folder + "/" + first_url.replace("/", "_") + ".html"
    _, cluster_id = self.classify(file_path)
    # Use a context manager so the file handle is released deterministically.
    with open(file_path, "r") as page_file:
        contents = page_file.read().replace("\n", "")
    link_dict = sampler.getAnchor(contents, first_url, sample_flag=False)

    available_url_list = []
    if method in unscored_methods:
        print("no predicting scores")
        for xpath in link_dict:
            for link in link_dict[xpath]:
                if link not in self.history_set:
                    available_url_list.append(link)
        return available_url_list, cluster_id

    print("predicting scores based on distribution")
    scorer = getattr(self, scorer_names[method])
    for xpath in link_dict:
        distribution = self.xpath_counts_dict[(cluster_id, xpath)]
        # Every link under the same xpath shares one score.
        score = scorer(distribution)
        for link in link_dict[xpath]:
            if link not in self.history_set:
                available_url_list.append((link, score))
    return available_url_list, cluster_id
def crawl_link(self, first_url, history_stack, sampler):
    """Collect the new links on a saved page together with an xpath score.

    The HTML stored for ``first_url`` under ``self.full_folder`` is
    classified with ``self.classify`` and its anchors are obtained from
    ``sampler.getAnchor`` grouped by xpath. For every xpath a score is
    computed with ``self.calculate_url_importance`` from
    ``self.xpath_counts_dict[(cluster_id, xpath)]`` and attached to each URL
    under that xpath that is neither in ``history_stack`` nor already queued
    by this call.

    Args:
        first_url: URL of the stored page to inspect.
        history_stack: Container of URLs that must not be queued again.
        sampler: Object whose ``getAnchor(contents, url, sample_flag=...)``
            returns a mapping of xpath -> list of URLs.

    Returns:
        ``(available_url_list, cluster_id)`` where each list entry is
        ``(url, first_url, (cluster_id, xpath), score)``.

    Side effects:
        Increments ``self.crawled_cluster_count[cluster_id]`` and prints
        progress information to stdout.
    """
    file_path = self.full_folder + "/" + first_url.replace("/", "_") + ".html"
    _, cluster_id = self.classify(file_path)
    self.crawled_cluster_count[cluster_id] += 1
    print("%s %s %s" % (file_path, cluster_id, "file_path for cluster_id"))

    # Use a context manager so the file handle is released deterministically.
    with open(file_path, "r") as page_file:
        contents = page_file.read().replace("\n", "")
    link_dict = sampler.getAnchor(contents, first_url, sample_flag=False)
    print("%s %s" % (link_dict, "link dict"))

    available_url_list = []
    # The result list holds tuples, so testing a bare URL against it can
    # never match; track the queued URLs separately to de-duplicate.
    queued_urls = set()
    for xpath in link_dict:
        distribution = self.xpath_counts_dict[(cluster_id, xpath)]
        # Every link under the same xpath shares one score.
        score = self.calculate_url_importance(distribution)
        for url in link_dict[xpath]:
            if url in history_stack or url in queued_urls:
                continue
            queued_urls.add(url)
            available_url_list.append(
                (url, first_url, (cluster_id, xpath), score))
    return available_url_list, cluster_id
def sample_link(self, first_url, sampler, method="uniform"):
    """Return the unvisited links of a stored page, scored when requested.

    Reads the HTML saved for ``first_url`` under ``self.full_folder``,
    classifies it with ``self.classify`` and asks ``sampler.getAnchor`` for
    the anchors grouped by xpath. Links found in ``self.history_set`` are
    left out.

    Args:
        first_url: URL of the stored page to inspect.
        sampler: Object whose ``getAnchor(contents, url, sample_flag=...)``
            returns a mapping of xpath -> list of URLs.
        method: With ``"uniform"``, ``"pagerank"`` or ``"indegree"`` the
            result contains bare links. With ``"est_pagerank"``,
            ``"est_degree"`` or ``"est_prob"`` it contains ``(link, score)``
            pairs, the score being derived from
            ``self.xpath_counts_dict[(cluster_id, xpath)]``.

    Returns:
        ``(available_url_list, cluster_id)`` where ``cluster_id`` is the
        second element returned by ``self.classify``.

    Raises:
        ValueError: If ``method`` is none of the names above (this used to
            fail later with an unbound ``score`` variable).
    """
    plain_methods = ("uniform", "pagerank", "indegree")
    # Scoring method -> name of the implementing method on self; resolved
    # lazily so that only the scorer actually requested has to exist.
    scoring_methods = {
        "est_pagerank": "calculate_url_pr",
        "est_degree": "calculate_url_indegree",
        "est_prob": "calculate_url_prob",
    }
    if method not in plain_methods and method not in scoring_methods:
        raise ValueError("unknown sampling method: %r" % (method,))

    file_path = self.full_folder + "/" + first_url.replace("/", "_") + ".html"
    _, cluster_id = self.classify(file_path)
    # Close the file promptly instead of leaving it to the garbage collector.
    with open(file_path, "r") as handle:
        contents = handle.read().replace("\n", "")
    link_dict = sampler.getAnchor(contents, first_url, sample_flag=False)

    available_url_list = []
    if method in plain_methods:
        print("no predicting scores")
        for xpath in link_dict:
            for link in link_dict[xpath]:
                if link not in self.history_set:
                    available_url_list.append(link)
        return available_url_list, cluster_id

    print("predicting scores based on distribution")
    compute_score = getattr(self, scoring_methods[method])
    for xpath in link_dict:
        distribution = self.xpath_counts_dict[(cluster_id, xpath)]
        # One score per xpath, shared by all links found under it.
        score = compute_score(distribution)
        for link in link_dict[xpath]:
            if link not in self.history_set:
                available_url_list.append((link, score))
    return available_url_list, cluster_id
def crawl_link(self, first_url, history_stack, sampler):
    """Gather unseen links from a stored page, each tagged with a score.

    Reads the HTML saved for ``first_url`` under ``self.full_folder``,
    classifies it with ``self.classify`` and retrieves its anchors per xpath
    from ``sampler.getAnchor``. Each xpath gets a score from
    ``self.calculate_url_importance`` applied to
    ``self.xpath_counts_dict[(cluster_id, xpath)]``; URLs that are in
    ``history_stack`` or were already queued during this call are skipped.

    Args:
        first_url: URL of the stored page to inspect.
        history_stack: Container of URLs to exclude from the result.
        sampler: Object whose ``getAnchor(contents, url, sample_flag=...)``
            returns a mapping of xpath -> list of URLs.

    Returns:
        ``(available_url_list, cluster_id)``; list entries have the form
        ``(url, first_url, (cluster_id, xpath), score)``.

    Side effects:
        Increments ``self.crawled_cluster_count[cluster_id]`` and prints
        progress information to stdout.
    """
    file_path = self.full_folder + "/" + first_url.replace("/", "_") + ".html"
    _, cluster_id = self.classify(file_path)
    self.crawled_cluster_count[cluster_id] += 1
    print("%s %s %s" % (file_path, cluster_id, "file_path for cluster_id"))

    # Close the file promptly instead of leaving it to the garbage collector.
    with open(file_path, "r") as handle:
        contents = handle.read().replace("\n", "")
    link_dict = sampler.getAnchor(contents, first_url, sample_flag=False)
    print("%s %s" % (link_dict, "link dict"))

    available_url_list = []
    # Entries in the result are tuples, so a membership test with the bare
    # URL would never hit; remember queued URLs in a set instead.
    already_queued = set()
    for xpath in link_dict:
        distribution = self.xpath_counts_dict[(cluster_id, xpath)]
        # One score per xpath, shared by all links found under it.
        score = self.calculate_url_importance(distribution)
        for url in link_dict[xpath]:
            if url in history_stack or url in already_queued:
                continue
            already_queued.add(url)
            available_url_list.append(
                (url, first_url, (cluster_id, xpath), score))
    return available_url_list, cluster_id