def __init__(self, dataset, date, entry,prefix, eps, cluster_rank,crawl_size, rank_algo="bfs"):
    """Build the crawler state: load rules, cluster the sampled pages into a
    sitemap, learn xpath->cluster transition statistics, and pick targets.

    dataset      -- site/dataset name; "new_" prefix is stripped for paths
    date         -- sample-date string used to locate the sample folders
    entry        -- entry-point URL for the crawl
    prefix       -- URL prefix shared by pages of this site
    eps          -- DBSCAN epsilon (currently unused; DBSCAN() called with defaults)
    cluster_rank -- cluster-ranking strategy passed through to ranking code
    crawl_size   -- number of pages to crawl
    rank_algo    -- frontier ordering strategy (default breadth-first)
    """
    self.dataset = dataset
    self.date = date
    self.eps = eps
    self.cluster_rank = cluster_rank
    self.rank_algo = rank_algo
    self.crawl_size = crawl_size
    self.rules = self.get_rules()
    self.entry, self.prefix = entry,prefix
    # URLs already visited during this crawl.
    self.history_set = set()
    self.path_prefix = "../../Crawler/{}_samples/{}/".format(date,dataset.replace("new_",""))
    self.folder_path = ["../../Crawler/{}_samples/{}/".format(date,dataset.replace("new_",""))]
    self.sitemap = pageCluster(dataset,date,self.folder_path,0)
    self.full_folder = "../../Crawler/full_data/" + dataset
    self.trans = {}
    self.queue = {}
    # cluster_id -> number of pages crawled from that cluster so far.
    self.crawled_cluster_count = defaultdict(int)
    # [page][xpath] -> list of destination URLs, read from disk.
    self.trans_dict = read_trans_dict(dataset,date)
    #self.cluster_dict = get_cluster_dict(dataset,date)
    #self.cluster_num = int(self.sitemap.DBSCAN(eps_val=self.eps))
    # NOTE(review): eps is stored but DBSCAN() is called without it — confirm intended.
    self.cluster_num = int(self.sitemap.DBSCAN())
    self.build_gold_cluster_dict()
    # Must run after build_gold_cluster_dict(): it reads self.gold_dict.
    self.cluster_xpath_trans = self.get_xpath_transition()
    self.trans_prob_mat = self.calculate_trans_prob_mat()
    self.max_score = 500
    self.target_cluster = self.get_sample_cluster()
def get_xpath_transition(self):
    """Estimate, from the sampled ("gold") pages, where each (cluster, xpath)
    link pattern leads: returns (cluster_id, xpath) -> {destination_cluster:
    average weight}, and caches it on self.xpath_counts_dict.

    Uncrawled outlinks are compensated for by scaling each observed
    destination count by length/count (total outlinks / crawled outlinks).
    """
    sampled_urls = self.gold_dict.keys()
    counts_dict = defaultdict(int) # (cluster_id, xpath) -> int
    xpath_counts_dict = defaultdict(lambda : defaultdict(float)) # (cluster_id, xpath) - > dict[cluster_id] -> int
    trans_dict = read_trans_dict(self.dataset,self.date) # [page][xpath] = [url list] ->[cluster][xpath] = {probability list}
    print "sample_url", sampled_urls
    for page_path, trans in trans_dict.iteritems():
        # File names on disk encode "/" as "_" and end in ".html".
        page_url = page_path.replace(".html","").replace("_","/")
        if page_url not in self.gold_dict:
            #print "?" + page_url, " is missing"
            continue
        else:
            cluster_id = self.cluster_dict[page_url]
            for xpath,url_list in trans.iteritems():
                length = len(url_list)
                # Count how many of this xpath's outlinks were actually sampled.
                count = 0
                for path in url_list:
                    url = path.replace(".html","").replace("_","/")
                    if url in sampled_urls:
                        count += 1
                #print "for xpath: {0} --- {1} out of {2} have been crawled and have cluster id".format(xpath,count, length)
                if count == 0:
                    continue
                else:
                    #if cluster_id == 1:
                    #    print page_path, xpath, url_list, "xpath_url_list in train"
                    key = (cluster_id,xpath)
                    #if key == (1,"/html/body/div/div/div/div/div/div/div/div/div/div/ul/li/div/div/div/div/div/div/div/ul/li/div/div/div/h3/a[yt-uix-sessionlink yt-uix-tile-link spf-link yt-ui-ellipsis yt-ui-ellipsis-2]"):
                    #    print page_path,url_list, "why 9 not 7???"
                    counts_dict[key] += 1
                    # Scale so the crawled subset stands in for all outlinks.
                    ratio = float(length)/float(count)
                    for path in url_list:
                        url = path.replace(".html","").replace("_","/")
                        if url in sampled_urls:
                            destination_id = self.cluster_dict[url]
                            #print url, destination_id
                            xpath_counts_dict[key][destination_id] += 1 * ratio
                    #if cluster_id == 1:
                    #    print ""
    # average: divide accumulated weights by the number of source pages seen.
    for key,count in counts_dict.iteritems():
        for destination_id in xpath_counts_dict[key]:
            xpath_counts_dict[key][destination_id] /= count
    print "Micro average entropy is " + str(self.entropy(xpath_counts_dict))
    '''
    output
    for key in xpath_counts_dict:
        if key[0] == 1:
            print key, xpath_counts_dict[key]
    '''
    print "=========== end of training ============"
    self.xpath_counts_dict = xpath_counts_dict
    return xpath_counts_dict
def __init__(self, dataset, date, entry, prefix, eps, cluster_rank,
             crawl_size, rank_algo="bfs", crawler_root="../../Crawler/"):
    """Build the crawler state: load rules, cluster the sampled pages into a
    sitemap, learn xpath->cluster transition statistics, and pick targets.

    dataset      -- site/dataset name; "new_" prefix is stripped for paths
    date         -- sample-date string used to locate the sample folders
    entry        -- entry-point URL for the crawl
    prefix       -- URL prefix shared by pages of this site
    eps          -- DBSCAN epsilon (currently unused; DBSCAN() called with defaults)
    cluster_rank -- cluster-ranking strategy passed through to ranking code
    crawl_size   -- number of pages to crawl
    rank_algo    -- frontier ordering strategy (default breadth-first)
    crawler_root -- base directory of the Crawler data tree; the default
                    reproduces the historical hard-coded "../../Crawler/"
    """
    self.dataset = dataset
    self.date = date
    self.eps = eps
    self.cluster_rank = cluster_rank
    self.rank_algo = rank_algo
    self.crawl_size = crawl_size
    self.rules = self.get_rules()
    self.entry, self.prefix = entry, prefix
    # URLs already visited during this crawl.
    self.history_set = set()
    # Single source of truth for the samples directory (was duplicated).
    self.path_prefix = "{}{}_samples/{}/".format(crawler_root, date,
                                                 dataset.replace("new_", ""))
    self.folder_path = [self.path_prefix]
    self.sitemap = pageCluster(dataset, date, self.folder_path, 0)
    self.full_folder = crawler_root + "full_data/" + dataset
    self.trans = {}
    self.queue = {}
    # cluster_id -> number of pages crawled from that cluster so far.
    self.crawled_cluster_count = defaultdict(int)
    # [page][xpath] -> list of destination URLs, read from disk.
    self.trans_dict = read_trans_dict(dataset, date)
    #self.cluster_dict = get_cluster_dict(dataset,date)
    #self.cluster_num = int(self.sitemap.DBSCAN(eps_val=self.eps))
    # NOTE(review): eps is stored but DBSCAN() is called without it — confirm intended.
    self.cluster_num = int(self.sitemap.DBSCAN())
    self.build_gold_cluster_dict()
    # Must run after build_gold_cluster_dict(): it reads self.gold_dict.
    self.cluster_xpath_trans = self.get_xpath_transition()
    self.trans_prob_mat = self.calculate_trans_prob_mat()
    self.max_score = 500
    self.target_cluster = self.get_sample_cluster()
def get_xpath_transition(self): sampled_urls = self.gold_dict.keys() counts_dict = defaultdict(int) # (cluster_id, xpath) -> int xpath_counts_dict = defaultdict(lambda: defaultdict( float)) # (cluster_id, xpath) - > dict[cluster_id] -> int trans_dict = read_trans_dict( self.dataset, self.date ) # [page][xpath] = [url list] ->[cluster][xpath] = {probability list} print "sample_url", sampled_urls for page_path, trans in trans_dict.iteritems(): page_url = page_path.replace(".html", "").replace("_", "/") if page_url not in self.gold_dict: #print "?" + page_url, " is missing" continue else: cluster_id = self.cluster_dict[page_url] for xpath, url_list in trans.iteritems(): length = len(url_list) count = 0 for path in url_list: url = path.replace(".html", "").replace("_", "/") if url in sampled_urls: count += 1 #print "for xpath: {0} --- {1} out of {2} have been crawled and have cluster id".format(xpath,count, length) if count == 0: continue else: #if cluster_id == 1: # print page_path, xpath, url_list, "xpath_url_list in train" key = (cluster_id, xpath) #if key == (1,"/html/body/div/div/div/div/div/div/div/div/div/div/ul/li/div/div/div/div/div/div/div/ul/li/div/div/div/h3/a[yt-uix-sessionlink yt-uix-tile-link spf-link yt-ui-ellipsis yt-ui-ellipsis-2]"): # print page_path,url_list, "why 9 not 7???" 
counts_dict[key] += 1 ratio = float(length) / float(count) for path in url_list: url = path.replace(".html", "").replace("_", "/") if url in sampled_urls: destination_id = self.cluster_dict[url] #print url, destination_id xpath_counts_dict[key][destination_id] += 1 * ratio #if cluster_id == 1: # print "" # average for key, count in counts_dict.iteritems(): for destination_id in xpath_counts_dict[key]: xpath_counts_dict[key][destination_id] /= count print "Micro average entropy is " + str( self.entropy(xpath_counts_dict)) ''' output for key in xpath_counts_dict: if key[0] == 1: print key, xpath_counts_dict[key] ''' print "=========== end of training ============" self.xpath_counts_dict = xpath_counts_dict return xpath_counts_dict