def __init__(self, folder_path, dataset, date="Mar15", num_samples=None, mode="write", debug=True):
    """Set up the page collection for ``dataset``, computing or loading features.

    Args:
        folder_path: Location of the raw pages. Passed to ``addPages`` in
            the default pipeline and to ``add_page_anchor`` in
            ``"c_baseline"`` mode.
        dataset: Dataset name. It is the last component of the feature
            folder and is passed to ``get_ground_truth``.
        date: Name of the top-level directory that holds ``feature/``.
        num_samples: When not ``None``, an extra directory level named
            after this value is inserted between ``feature/`` and
            ``dataset``. In ``"read"`` mode the number of pages loaded is
            always the number of lines in ``pages.txt``.
        mode: ``"read"`` reloads cached features from the feature folder;
            ``"c_baseline"`` and ``"irobot"`` prepare the respective
            baselines; any other value runs the full feature pipeline, and
            ``"write"`` additionally stores the result in the feature
            folder.
        debug: When true, print diagnostics and resolve the feature folder
            relative to ``./``; otherwise resolve it relative to ``../``.
    """
    self.folder_path = folder_path
    self.dataset = dataset
    self.date = date
    # Single-argument parenthesised prints produce the same text as the
    # former comma-separated print statements.
    print("%s %s %s  creating allPages " % (folder_path, dataset, date))
    self.threshold = 4.0
    self.pages = []
    self.path_list = []
    self.category = []  # prediction
    self.xpaths_set = Set()
    self.ground_truth = []  # ground truth list for all pages
    self.idf = {}
    self.selected_df = {}
    self.df = {}
    self.features = []
    self.mode = mode
    self.num_samples = num_samples

    if debug:
        print("debug for pageCluster")
        print("num_samples %s %s" % (num_samples, type(num_samples)))

    # The feature folder lives under "./" in debug runs and "../" otherwise.
    root = "." if debug else ".."
    if num_samples is None:
        feat_folder = "{0}/{1}/feature/".format(root, date) + dataset
    else:
        feat_folder = "{0}/{1}/feature/{2}/".format(root, date, num_samples) + dataset
    print("%s feat folder" % feat_folder)
    # makedirs creates every missing level at once. The former per-level
    # os.mkdir calls failed when "<date>/feature" was missing and, in the
    # non-debug branch, targeted "./" while checking "../".
    if not os.path.exists(feat_folder):
        os.makedirs(feat_folder)

    if mode == "read":
        with open(feat_folder + "/pages.txt", "r") as handle:
            page_list = handle.readlines()
        with open(feat_folder + "/tf_idf.txt", "r") as handle:
            tf_idf_lines = handle.readlines()
        with open(feat_folder + "/log_tf_idf.txt", "r") as handle:
            log_tf_idf_lines = handle.readlines()
        with open(feat_folder + "/xpaths.txt", "r") as handle:
            features = handle.readlines()
        # NOTE: pickle must only be used on cache files this code wrote
        # itself; loading an untrusted pickle can execute arbitrary code.
        with open(feat_folder + "/idf.txt", "r") as idf_file:
            self.idf = pickle.load(idf_file)

        # In read mode the cache on disk defines the number of pages.
        num_samples = len(page_list)
        for i, page_line in enumerate(page_list):
            # "<id>:<path>"; the path itself may contain ":" (e.g. URLs).
            file_path = page_line.strip().partition(":")[2]
            file_page = Page(file_path, mode="read")
            self.path_list.append(file_path)
            # "<id>:<space separated values>"
            file_page.read_tf_idf(tf_idf_lines[i].strip().rpartition(":")[2])
            file_page.read_log_tf_idf(log_tf_idf_lines[i].strip().rpartition(":")[2])
            self.pages.append(file_page)

        for feature_line in features:
            # Keep everything after the first ":" so an xpath that contains
            # ":" is not truncated (mirrors the pages.txt parsing above).
            self.features.append(feature_line.strip().partition(":")[2])

        self.category = [0] * num_samples
        self.get_ground_truth(dataset)
        self.num = len(page_list)
    elif mode == "c_baseline":
        print("it is the baseline of v.crescenzi")
        self.add_page_anchor(folder_path)
        self.get_ground_truth(dataset)
    elif mode == "irobot":
        print("it is for the baseline irobot with partial tree alignment ")
        self.get_ground_truth(dataset)
    else:
        # Full pipeline: load the pages, then derive idf / tf-idf features.
        # Steps that are switched off in this variant:
        #   self.top_local_stop_structure_gt(0.9)
        #   self.filter_df(0.01, 1.0)
        #   self.filter_dfs_xpaths_list()
        #   self.Leung_baseline()  # binary feature
        self.addPages(folder_path)
        self.expandXpaths()
        self.updateidf()
        self.num = len(self.pages)
        self.updatetfidf()
        self.selected_tfidf()
        self.get_ground_truth(dataset)
        self.file_size_list = self.get_file_size_list()

        if mode == "write":
            print("write mode !")
            # xpaths.txt: "<id>:<xpath>", one line per selected xpath.
            with open(feat_folder + "/xpaths.txt", "w") as xpath_file:
                print(len(self.pages))
                for page in self.pages:
                    for xpath_id, xpath in enumerate(page.selected_tfidf):
                        xpath_file.write(str(xpath_id) + ":" + xpath + "\n")
                        self.features.append(xpath)
                    # Only the first page's keys are used as the feature list.
                    break

            # pages.txt: "<id>:<path>"; tf_idf.txt / log_tf_idf.txt:
            # "<id>:<space separated values>".
            with open(feat_folder + "/pages.txt", "w") as page_file, \
                    open(feat_folder + "/tf_idf.txt", "w") as tf_idf_file, \
                    open(feat_folder + "/log_tf_idf.txt", "w") as log_tf_idf_file:
                for page_id, page in enumerate(self.pages):
                    # Drop whatever precedes the first "http" in the path.
                    # Paths without "http" are written unchanged (the old
                    # split/join form collapsed them to the bare "http").
                    start = page.path.find("http")
                    path = page.path[start:] if start >= 0 else page.path
                    prefix = str(page_id) + ":"
                    page_file.write(prefix + path + "\n")
                    tf_idf_file.write(
                        prefix
                        + " ".join(str(page.selected_tfidf[key])
                                   for key in page.selected_tfidf)
                        + "\n")
                    log_tf_idf_file.write(
                        prefix
                        + " ".join(str(page.selected_logtfidf[key])
                                   for key in page.selected_logtfidf)
                        + "\n")

            with open(feat_folder + "/idf.txt", "w") as idf_file:
                pickle.dump(self.idf, idf_file)
            with open(feat_folder + "/size.txt", "w") as file_size_file:
                pickle.dump(self.file_size_list, file_size_file)
def __init__(self, folder_path, dataset, date="Mar15", mode="read", num_samples=None):
    """Set up the page collection for ``dataset``, computing or loading features.

    Args:
        folder_path: Location of the raw pages. Passed to ``addPages`` in
            the default pipeline and to ``add_page_anchor`` in
            ``"c_baseline"`` mode.
        dataset: Dataset name. Features are cached under
            ``./<date>/feature/<dataset>`` and the name is passed to
            ``get_ground_truth``.
        date: Name of the top-level directory that holds ``feature/``.
        mode: ``"read"`` reloads cached features from the feature folder;
            ``"c_baseline"`` and ``"irobot"`` prepare the respective
            baselines; any other value runs the full feature pipeline, and
            ``"write"`` additionally stores the result in the feature
            folder.
        num_samples: In ``"read"`` mode, how many leading entries of
            ``pages.txt`` to load; ``None`` loads all of them.

    Raises:
        IndexError: In ``"read"`` mode, if ``num_samples`` exceeds the
            number of cached pages.
    """
    self.folder_path = folder_path
    self.dataset = dataset
    self.date = date
    # Single-argument parenthesised prints produce the same text as the
    # former comma-separated print statements.
    print("%s %s %s  creating allPages " % (folder_path, dataset, date))
    self.threshold = 0.004
    self.pages = []
    self.path_list = []
    self.category = []  # prediction
    self.xpaths_set = Set()
    self.ground_truth = []  # ground truth list for all pages
    self.idf = {}
    self.selected_df = {}
    self.df = {}
    self.features = []
    self.mode = mode

    # Every cache file lives in this one directory; build the path once.
    feat_folder = "./{}/feature/".format(date) + dataset
    if not os.path.exists(feat_folder):
        os.makedirs(feat_folder)

    if mode == "read":
        with open(feat_folder + "/pages.txt", "r") as handle:
            page_list = handle.readlines()
        with open(feat_folder + "/tf_idf.txt", "r") as handle:
            tf_idf_lines = handle.readlines()
        with open(feat_folder + "/log_tf_idf.txt", "r") as handle:
            log_tf_idf_lines = handle.readlines()
        with open(feat_folder + "/xpaths.txt", "r") as handle:
            features = handle.readlines()
        # NOTE: pickle must only be used on cache files this code wrote
        # itself; loading an untrusted pickle can execute arbitrary code.
        with open(feat_folder + "/idf.txt", "r") as idf_file:
            self.idf = pickle.load(idf_file)

        if num_samples is None:
            num_samples = len(page_list)
        for i in range(num_samples):
            # "<id>:<path>"; the path itself may contain ":" (e.g. URLs).
            file_path = page_list[i].strip().partition(":")[2]
            file_page = Page(file_path, mode="read")
            self.path_list.append(file_path)
            # "<id>:<space separated values>"
            file_page.read_tf_idf(tf_idf_lines[i].strip().rpartition(":")[2])
            file_page.read_log_tf_idf(log_tf_idf_lines[i].strip().rpartition(":")[2])
            self.pages.append(file_page)

        for feature_line in features:
            # Keep everything after the first ":" so an xpath that contains
            # ":" is not truncated (mirrors the pages.txt parsing above).
            self.features.append(feature_line.strip().partition(":")[2])

        self.category = [0] * num_samples
        self.get_ground_truth(dataset)
        # NOTE(review): this counts every cached page, even when
        # num_samples limited how many were loaded -- confirm intended.
        self.num = len(page_list)
    elif mode == "c_baseline":
        print("it is the baseline of v.crescenzi")
        self.add_page_anchor(folder_path)
        self.get_ground_truth(dataset)
    elif mode == "irobot":
        print("it is for the baseline irobot with partial tree alignment ")
        self.get_ground_truth(dataset)
    else:
        # Full pipeline: load the pages, then derive idf / tf-idf features.
        # Steps that are switched off in this variant:
        #   self.top_local_stop_structure_gt(0.9)
        #   self.filter_dfs_xpaths_list()
        #   self.Leung_baseline()  # binary feature
        self.addPages(folder_path)
        self.expandXpaths()
        self.updateidf()
        self.num = len(self.pages)
        self.updatetfidf()
        self.filter_df(0.01, 1.0)
        self.selected_tfidf()
        self.get_ground_truth(dataset)
        self.file_size_list = self.get_file_size_list()

        if mode == "write":
            print("write mode !")
            # xpaths.txt: "<id>:<xpath>", one line per selected xpath.
            with open(feat_folder + "/xpaths.txt", "w") as xpath_file:
                print(len(self.pages))
                for page in self.pages:
                    for xpath_id, xpath in enumerate(page.selected_tfidf):
                        xpath_file.write(str(xpath_id) + ":" + xpath + "\n")
                        self.features.append(xpath)
                    # Only the first page's keys are used as the feature list.
                    break

            # pages.txt: "<id>:<path>"; tf_idf.txt / log_tf_idf.txt:
            # "<id>:<space separated values>".
            with open(feat_folder + "/pages.txt", "w") as page_file, \
                    open(feat_folder + "/tf_idf.txt", "w") as tf_idf_file, \
                    open(feat_folder + "/log_tf_idf.txt", "w") as log_tf_idf_file:
                for page_id, page in enumerate(self.pages):
                    prefix = str(page_id) + ":"
                    page_file.write(prefix + page.path + "\n")
                    tf_idf_file.write(
                        prefix
                        + " ".join(str(page.selected_tfidf[key])
                                   for key in page.selected_tfidf)
                        + "\n")
                    log_tf_idf_file.write(
                        prefix
                        + " ".join(str(page.selected_logtfidf[key])
                                   for key in page.selected_logtfidf)
                        + "\n")

            with open(feat_folder + "/idf.txt", "w") as idf_file:
                pickle.dump(self.idf, idf_file)
            with open(feat_folder + "/size.txt", "w") as file_size_file:
                pickle.dump(self.file_size_list, file_size_file)