Example #1
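# NOTE: Python 2 snippet; assumes module-level imports not shown here --
# os, pickle, sets.Set (as Set), and a Page class. Names inferred from usage.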
    def __init__(self, folder_path, dataset, date="Mar15", num_samples=None, mode="write", debug=True):  # mode: {raw, read, write, c_baseline, irobot}
        self.folder_path = folder_path
        self.dataset = dataset
        self.date = date
        print folder_path, dataset, date, " creating allPages "
        #print folder_path
        self.threshold = 4.0
        self.pages = []
        self.path_list = []
        self.category = [] # prediction
        self.xpaths_set = Set()
        self.ground_truth = []  # ground truth list for all pages
        self.idf = {}
        self.selected_df = {}
        self.df = {}
        self.features = []
        self.mode = mode
        self.num_samples = num_samples

        if debug:
            print "debug for pageCluster"
            print "num_samples", num_samples, type(num_samples)
            if num_samples is None:
                feat_folder = "./{}/feature/".format(date) + dataset
            else:
                feat_folder = "./{0}/feature/{1}/".format(date,num_samples) + dataset
                if not os.path.exists(feat_folder):
                    if not os.path.exists("./{0}/feature/{1}/".format(date,num_samples)):
                        os.mkdir("./{0}/feature/{1}/".format(date,num_samples))
                    os.mkdir(feat_folder)
        else:
            if num_samples is None:
                feat_folder = "../{}/feature/".format(date) + dataset
            else:
                feat_folder = "../{0}/feature/{1}/".format(date, num_samples) + dataset
                if not os.path.exists(feat_folder):
                    # create the parent "../<date>/feature/<num_samples>/" directory first
                    if not os.path.exists("../{0}/feature/{1}/".format(date, num_samples)):
                        os.mkdir("../{0}/feature/{1}/".format(date, num_samples))
                    os.mkdir(feat_folder)
        print feat_folder, "feat folder"

        if not os.path.exists(feat_folder):
            os.makedirs(feat_folder)
        if mode == "read":
            page_list = open(feat_folder + "/pages.txt", "r").readlines()
            tf_idf_lines = open(feat_folder + "/tf_idf.txt", "r").readlines()
            log_tf_idf_lines = open(feat_folder + "/log_tf_idf.txt", "r").readlines()
            features = open(feat_folder + "/xpaths.txt", "r").readlines()
            idf_file = open(feat_folder + "/idf.txt", "r")
            #file_size_file = open("./{}/feature/".format(date)+ dataset +"/size.txt","r")

            num_samples = len(page_list)
            for i in range(num_samples):
                pid = page_list[i].strip().split(":")[0]
                # re-join on ":" since the stored path itself contains colons (e.g. "http://")
                file_path = ":".join(page_list[i].strip().split(":")[1:])
                file_page = Page(file_path, mode="read")
                self.path_list.append(file_path)

                tf_idf_features = tf_idf_lines[i].strip().split(":")[-1]
                file_page.read_tf_idf(tf_idf_features)

                log_tf_idf_features = log_tf_idf_lines[i].strip().split(":")[-1]
                file_page.read_log_tf_idf(log_tf_idf_features)

                self.pages.append(file_page)

            for i in range(len(features)):
                fid = features[i].strip().split(":")[0]
                xpath = features[i].strip().split(":")[1]
                self.features.append(xpath)

            self.idf = pickle.load(idf_file)
            #self.file_size_list = pickle.load(file_size_file)
            self.category = [0 for i in range(num_samples)]
            self.get_ground_truth(dataset)
            self.num = len(page_list)
        elif mode == "c_baseline":
            print "it is the baseline of v.crescenzi"
            self.add_page_anchor(folder_path)
            self.get_ground_truth(dataset)
        elif mode == "irobot":
            print "it is for the baseline irobot with partial tree alignment "
            self.get_ground_truth(dataset)

        else:
            # raw mode: initialize data structures, then update attributes
            self.addPages(folder_path)
            self.expandXpaths()
            self.updateidf()
            #self.get_ground_truth(dataset)
            self.num = len(self.pages)
            #self.top_local_stop_structure_gt(0.9)
            self.updatetfidf()
            #self.filter_df(0.01,1.0)
            #self.filter_dfs_xpaths_list()
            #self.Leung_baseline()  # binary feature
            self.selected_tfidf()
            self.get_ground_truth(dataset)
            self.file_size_list = self.get_file_size_list()
            
            if mode=="write":
                print "write mode !"
                xpath_file =  open(feat_folder+"/xpaths.txt","w")
                print len(self.pages)
                # filtered xpath :  id xpath
                for page in self.pages:
                    xpath_id = 0
                    for xpath in page.selected_tfidf:
                        xpath_file.write(str(xpath_id)+":"+xpath+"\n")
                        self.features.append(xpath)
                        xpath_id+=1
                    break


                page_file = open(feat_folder + "/pages.txt", "w")  # id:file_path
                tf_idf_file = open(feat_folder + "/tf_idf.txt", "w")  # pid:features...
                log_tf_idf_file = open(feat_folder + "/log_tf_idf.txt", "w")
                page_id = 0
                for page in self.pages:
                    #print page.path, page.path.split("http")[1:]
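                    # keep everything from the first "http" onward, dropping the local directory prefix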
                    path = "http" + "http".join(page.path.split("http")[1:])
                    page_file.write(str(page_id)+":"+path+"\n")
                    vector = []
                    for key in page.selected_tfidf:
                        vector.append(page.selected_tfidf[key])
                    tf_idf_file.write(str(page_id)+":" +" ".join(str(feat) for feat in vector) + "\n")
                    vector = []
                    for key in page.selected_logtfidf:
                        vector.append(page.selected_logtfidf[key])
                    log_tf_idf_file.write(str(page_id)+":" + " ".join(str(feat) for feat in vector)+"\n")

                    page_id += 1
                idf_file = open(feat_folder + "/idf.txt", "w")
                pickle.dump(self.idf, idf_file)
                file_size_file = open(feat_folder + "/size.txt", "w")
                pickle.dump(self.file_size_list, file_size_file)
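
A minimal write-mode usage sketch (Python 2). The enclosing class name allPages and the placeholder folder path are assumptions inferred from the constructor's own debug print, not confirmed by the source:

    # hypothetical driver: build features from raw pages and persist them
    pages = allPages("path/to/raw/pages/", "somedataset", date="Mar15", mode="write")
    print pages.num, "pages,", len(pages.features), "xpath features"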
Example #2
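# NOTE: another variant of the same Python 2 constructor; same assumed
# module-level imports as Example #1 (os, pickle, sets.Set, Page).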
    def __init__(self,
                 folder_path,
                 dataset,
                 date="Mar15",
                 mode="read",
                 num_samples=None):  # mode: {raw, read, write, c_baseline, irobot}
        self.folder_path = folder_path
        self.dataset = dataset
        self.date = date
        print folder_path, dataset, date, " creating allPages "
        #print folder_path
        self.threshold = 0.004
        self.pages = []
        self.path_list = []
        self.category = []  # prediction
        self.xpaths_set = Set()
        self.ground_truth = []  # ground truth list for all pages
        self.idf = {}
        self.selected_df = {}
        self.df = {}
        self.features = []
        self.mode = mode
        if not os.path.exists("./{}/feature/".format(date) + dataset):
            os.makedirs("./{}/feature/".format(date) + dataset)
        if mode == "read":
            page_list = open(
                "./{}/feature/".format(date) + dataset + "/pages.txt",
                "r").readlines()
            tf_idf_lines = open(
                "./{}/feature/".format(date) + dataset + "/tf_idf.txt",
                "r").readlines()
            log_tf_idf_lines = open(
                "./{}/feature/".format(date) + dataset + "/log_tf_idf.txt",
                "r").readlines()
            features = open(
                "./{}/feature/".format(date) + dataset + "/xpaths.txt",
                "r").readlines()
            idf_file = open(
                "./{}/feature/".format(date) + dataset + "/idf.txt", "r")
            #file_size_file = open("./{}/feature/".format(date)+ dataset +"/size.txt","r")
            if num_samples is None:
                num_samples = len(page_list)
            for i in range(num_samples):

                pid = page_list[i].strip().split(":")[0]
                file_path = ":".join(page_list[i].strip().split(":")[1:])
                file_page = Page(file_path, mode="read")
                self.path_list.append(file_path)

                tf_idf_features = tf_idf_lines[i].strip().split(":")[-1]
                file_page.read_tf_idf(tf_idf_features)

                log_tf_idf_features = log_tf_idf_lines[i].strip().split(
                    ":")[-1]
                file_page.read_log_tf_idf(log_tf_idf_features)

                self.pages.append(file_page)

            for i in range(len(features)):
                fid = features[i].strip().split(":")[0]
                xpath = features[i].strip().split(":")[1]
                self.features.append(xpath)

            self.idf = pickle.load(idf_file)
            #self.file_size_list = pickle.load(file_size_file)
            self.category = [0 for i in range(num_samples)]
            self.get_ground_truth(dataset)
            self.num = len(page_list)
        elif mode == "c_baseline":
            print "it is the baseline of v.crescenzi"
            self.add_page_anchor(folder_path)
            self.get_ground_truth(dataset)
        elif mode == "irobot":
            print "it is for the baseline irobot with partial tree alignment "
            self.get_ground_truth(dataset)

        else:
            # initialize data structure
            #  update attributes
            self.addPages(folder_path)
            self.expandXpaths()
            self.updateidf()
            #self.get_ground_truth(dataset)
            self.num = len(self.pages)
            #self.top_local_stop_structure_gt(0.9)
            self.updatetfidf()
            self.filter_df(0.01, 1.0)
            #self.filter_dfs_xpaths_list()
            #self.Leung_baseline()  # binary feature
            self.selected_tfidf()
            self.get_ground_truth(dataset)
            self.file_size_list = self.get_file_size_list()

            if mode == "write":
                print "write mode !"
                xpath_file = open(
                    "./{}/feature/".format(date) + dataset + "/xpaths.txt",
                    "w")
                print len(self.pages)
                # write the filtered xpaths as "id:xpath"; the loop breaks after
                # the first page, so only that page's feature keys are recorded
                for page in self.pages:
                    xpath_id = 0
                    for xpath in page.selected_tfidf:
                        xpath_file.write(str(xpath_id) + ":" + xpath + "\n")
                        self.features.append(xpath)
                        xpath_id += 1
                    break

                page_file = open("./{}/feature/".format(date) + dataset +
                                 "/pages.txt", "w")  # id file_path
                tf_idf_file = open("./{}/feature/".format(date) + dataset +
                                   "/tf_idf.txt", "w")  # pid features..
                log_tf_idf_file = open(
                    "./{}/feature/".format(date) + dataset + "/log_tf_idf.txt",
                    "w")
                page_id = 0
                for page in self.pages:
                    page_file.write(str(page_id) + ":" + page.path + "\n")
                    vector = []
                    for key in page.selected_tfidf:
                        vector.append(page.selected_tfidf[key])
                    tf_idf_file.write(
                        str(page_id) + ":" +
                        " ".join(str(feat) for feat in vector) + "\n")
                    vector = []
                    for key in page.selected_logtfidf:
                        vector.append(page.selected_logtfidf[key])
                    log_tf_idf_file.write(
                        str(page_id) + ":" +
                        " ".join(str(feat) for feat in vector) + "\n")

                    page_id += 1
                idf_file = open(
                    "./{}/feature/".format(date) + dataset + "/idf.txt", "w")
                pickle.dump(self.idf, idf_file)
                file_size_file = open(
                    "./{}/feature/".format(date) + dataset + "/size.txt", "w")
                pickle.dump(self.file_size_list, file_size_file)
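
A matching read-mode sketch (Python 2) for this variant, where num_samples caps how many cached pages are reloaded; the class name allPages and the feature files under ./<date>/feature/<dataset>/ written by a prior run are assumptions:

    # hypothetical driver: reload previously written features for 100 pages
    pages = allPages("path/to/raw/pages/", "somedataset", mode="read", num_samples=100)
    print len(pages.pages), "pages reloaded"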