Example #1
    def __init__(self, dataset, date, entry, prefix, cluster_rank, crawl_size):
        self.dataset = dataset
        self.date = date
        self.cluster_rank = cluster_rank
        self.crawl_size = crawl_size
        self.entry, self.prefix = entry, prefix
        self.history_set = set()  # pages already seen
        self.group_list = []
        self.group_dict = {}
        # Sample folders for the "May1" crawl live one directory deeper.
        if self.date == "May1":
            self.path_prefix = "../../Crawler/{}_samples/{}/".format(date, dataset.replace("new_", ""))
        else:
            self.path_prefix = "../Crawler/{}_samples/{}/".format(date, dataset)
        self.folder_path = ["../../Crawler/{}_samples/{}/".format(date, dataset.replace("new_", ""))]
        # Cluster the sampled pages into a sitemap; DBSCAN() returns the cluster count.
        self.sitemap = pageCluster(dataset, date, self.folder_path, 0)
        self.cluster_num = int(self.sitemap.DBSCAN())
        self.full_folder = "../../Crawler/full_data/" + dataset
        # Delegate target-cluster selection to a crawler instance.
        c = crawler(
            self.dataset,
            self.date,
            None,
            None,
            eps=None,
            cluster_rank=self.cluster_rank,
            crawl_size=None,
            rank_algo=None,
        )
        self.target_cluster = c.target_cluster
        self.crawler = c
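The constructor above relies on pageCluster.DBSCAN() returning the number of page clusters found. That method's body is not shown here; for orientation, a minimal sketch of how a DBSCAN cluster count is conventionally derived with scikit-learn (the feature matrix X, the eps and min_samples values, and the helper name are all assumptions, not names from the original code):

    from sklearn.cluster import DBSCAN

    def dbscan_cluster_count(X, eps=0.5, min_samples=5):
        # Hypothetical stand-in for pageCluster.DBSCAN(); returns a cluster count.
        labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(X)
        # DBSCAN marks noise points with the label -1; exclude it from the count.
        unique = set(labels)
        return len(unique) - (1 if -1 in unique else 0)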
Example #2
    def __init__(self, dataset, date, entry, prefix, eps, cluster_rank, crawl_size, rank_algo="bfs"):
        self.dataset = dataset
        self.date = date
        self.eps = eps
        self.cluster_rank = cluster_rank
        self.rank_algo = rank_algo
        self.crawl_size = crawl_size
        self.rules = self.get_rules()
        self.entry, self.prefix = entry, prefix
        self.history_set = set()
        self.path_prefix = "../../Crawler/{}_samples/{}/".format(date, dataset.replace("new_", ""))
        self.folder_path = ["../../Crawler/{}_samples/{}/".format(date, dataset.replace("new_", ""))]
        self.sitemap = pageCluster(dataset, date, self.folder_path, 0)
        self.full_folder = "../../Crawler/full_data/" + dataset
        self.trans = {}
        self.queue = {}
        # Requires "from collections import defaultdict" at module level.
        self.crawled_cluster_count = defaultdict(int)
        self.trans_dict = read_trans_dict(dataset, date)

        #self.cluster_dict = get_cluster_dict(dataset,date)
        #self.cluster_num = int(self.sitemap.DBSCAN(eps_val=self.eps))
        self.cluster_num = int(self.sitemap.DBSCAN())
        self.build_gold_cluster_dict()
        # Estimate cluster-to-cluster transition probabilities from observed XPath transitions.
        self.cluster_xpath_trans = self.get_xpath_transition()
        self.trans_prob_mat = self.calculate_trans_prob_mat()
        self.max_score = 500
        self.target_cluster = self.get_sample_cluster()
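calculate_trans_prob_mat() is likewise not shown, but a transition-probability matrix is conventionally built by row-normalizing a matrix of observed transition counts, so that entry (i, j) becomes the probability of moving from cluster i to cluster j. A generic sketch of that normalization (an illustration of the technique, not the original method):

    import numpy as np

    def row_normalize(counts):
        # counts[i][j]: number of observed transitions from cluster i to cluster j.
        counts = np.asarray(counts, dtype=float)
        row_sums = counts.sum(axis=1, keepdims=True)
        row_sums[row_sums == 0.0] = 1.0  # leave all-zero rows as zero probabilities
        return counts / row_sums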
Example #3
    def cv(self):
        # 4-fold stratified cross-validation over the labelled pages.
        # Python 2 code; StratifiedKFold here is the legacy
        # sklearn.cross_validation API (n_folds argument, iterable of index splits).
        labels_true = np.array(self.pages.ground_truth)
        skf = StratifiedKFold(labels_true, n_folds=4)
        results = []
        count = 0
        p = pageCluster(self.dataset, self.date)
        for train, test in skf:
            #print train, test
            count += 1
            print "CV fold {}".format(count)
            train_gold, test_gold = labels_true[train], labels_true[test]
            self.run(train)
            self.MDL()
            path_list = self.pages.path_list

            self.classify(test)
            self.clustering()

            print train, "train index list", type(train), len(train)
            train_y = np.array(self.pages.category)[train]
            test_y = np.array(self.pages.category)[test]

            results.append(
                p.Evaluation_CV(test_gold,
                                test_y,
                                train_gold,
                                train_y,
                                path_list=path_list))
            '''
            t = KMeans()
            train_y, final_centroids, final_ite, final_dist = t.k_means(km_train_x, num_clusters, replicates=20)
            test_y = t.k_means_classify(test_x)
            path_list = [self.UP_pages.path_list[idx] for idx in test]
            results.append(self.Evaluation_CV(test_gold,test_y,km_train_gold,train_y, path_list=path_list))
            '''
        # Average the per-fold metrics and append them to the results file.
        result = np.mean(results, axis=0)
        cv_batch_file = open("./results/c_cv_baseline.results", "a")
        prefix = str(self.dataset) + " classifying \t"
        metrics = [
            'cv_micro_precision', 'cv_macro_precision', "non outlier ratio"
        ]
        for index, metric in enumerate(metrics):
            line = prefix + "\t" + metric + "\t" + str(result[index])
            print line
            cv_batch_file.write(line + "\n")
        cv_batch_file.close()
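The metrics written out distinguish micro- from macro-averaged precision: micro-averaging pools all predictions into a single precision value, while macro-averaging computes precision per class and takes the unweighted mean, so rare page types weigh as much as common ones. A small sketch of the distinction using scikit-learn (the label arrays are placeholders, and Evaluation_CV itself may compute its scores differently):

    from sklearn.metrics import precision_score

    y_true = [0, 0, 1, 1, 2, 2]
    y_pred = [0, 0, 1, 2, 2, 2]
    micro = precision_score(y_true, y_pred, average="micro")  # pooled over all predictions
    macro = precision_score(y_true, y_pred, average="macro")  # unweighted mean over classes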