Exemple #1
0
    def test_import_data(self):
        """Check ``import_data`` against the two fixture files.

        The first fixture is read with default arguments, the second with
        ``sequence=False``. In both cases the returned data and labels must
        equal the expected nested lists of strings.
        """
        # (path, keyword options, expected data, expected labels)
        cases = [
            (
                "test/data.txt",
                {},
                [['1'], ['2', '3'], ['1', '6'], ['4', '1'], ['5', '2']],
                [['3'], ['4', '8'], ['5', '6'], ['1', '2'], ['7']],
            ),
            (
                "test/test_data_bag_of_word.txt",
                {"sequence": False},
                [['8', '18', '18'], ['2', '33', '33']],
                [['32', '545'], ['11']],
            ),
        ]
        for path, options, expected_data, expected_labels in cases:
            loaded_data, loaded_labels = preparation.import_data(
                path, **options)
            self.assertEqual(loaded_data, expected_data)
            self.assertEqual(loaded_labels, expected_labels)
Exemple #2
0
 def test_load_save_pickle(self):
     """Round-trip imported data through the pickle save/load helpers.

     Data and labels imported from the bag-of-word fixture are saved to a
     pickle file and loaded back; the loaded values must equal the saved
     ones.
     """
     source_path = "test/test_data_bag_of_word.txt"
     pickle_path = "test/pickle/test.pickle"

     datas, labels = preparation.import_data(source_path, sequence=False)
     preparation.save_data_in_pickle(pickle_path, datas, labels)
     restored_data, restored_label = preparation.load_data_in_pickle(
         pickle_path)

     self.assertEqual(datas, restored_data)
     self.assertEqual(labels, restored_label)
Exemple #3
0
    def load_datas(self):
        """Populate ``self.datas`` and ``self.labels`` for this dataset.

        The pickle store path is derived from ``data_name`` and ``mode``,
        under ``store/`` when ``fold_number`` is 0 and under ``fold/`` (with
        the fold number in the file name) otherwise.

        * In the ``'embedding'`` state, labels are read from the pickle store
          and converted to a sparse CSC matrix of ones with one row per
          sample and ``len(self.all_name)`` columns. Features are read from
          ``export/<data_name>/doc2vec/<mode>.txt``, one space-separated
          float vector per line. ``create_label_stat`` is then called and
          the method returns.
        * Otherwise, when ``data/<store path>`` does not exist yet, the raw
          data file is imported, ``create_temp_hierarchy`` is called with the
          distinct labels, the labels are mapped through
          ``prep.map_index_of_label`` and the result is split or stored via
          the ``prep`` helpers. The pickle store is then loaded.
        """
        if self.fold_number == 0:
            store_name = "%s/store/data.pickle.%s" % (self.data_name,
                                                      self.mode)
        else:
            store_name = "%s/fold/data_%d.pickle.%s" % (
                self.data_name, self.fold_number, self.mode)

        if self.state == 'embedding':
            self.datas, self.labels = prep.load_data_in_pickle(store_name)

            # CSR components: the column indices of every sample laid end to
            # end, plus the running offsets marking where each sample starts.
            indice = [j for i in self.labels for j in i]
            indptr = np.cumsum([0] + [len(i) for i in self.labels])
            data_one = np.ones(len(indice))
            self.labels = csr_matrix(
                (data_one, indice, indptr),
                shape=(len(self.labels), len(self.all_name))).tocsc()

            embedding_path = 'export/%s/doc2vec/%s.txt' % (self.data_name,
                                                           self.mode)
            with open(embedding_path, mode='r') as f:
                # Strip only the newline: slicing off the last character
                # would eat a digit when the final line has no trailing
                # newline.
                data = [
                    list(map(float, line.rstrip('\n').split(' ')))
                    for line in f
                ]
            self.datas = np.array(data)
            self.create_label_stat()
            return

        if not os.path.isfile("data/" + store_name):
            # The same raw file is read whether or not test_split is set.
            file_name = "%s/%s" % (self.data_name, self.data_file_name)
            datas, labels = prep.import_data(file_name, sequence=self.sequence)
            self.create_temp_hierarchy(list({j for i in labels for j in i}))
            hierarchy_file_name = "%s/hierarchy.pickle" % self.data_name
            new_labels = prep.map_index_of_label(hierarchy_file_name, labels)

            if self.test_split:
                if self.fold_number == 0:
                    prep.split_data(datas, new_labels, self.data_name)
                else:
                    prep.split_fold_data(datas, new_labels, self.data_name)
            elif self.mode == "train":
                prep.split_validate_data(datas, new_labels, self.data_name)
            else:
                directory = "data/%s/store" % self.data_name
                if not os.path.exists(directory):
                    os.makedirs(directory)
                # The context manager closes the file; no explicit close().
                with open('data/' + store_name, 'wb') as f:
                    pickle.dump([datas, new_labels], f)

        self.datas, self.labels = prep.load_data_in_pickle(store_name)
Exemple #4
0
 def test_map_index(self):
     """Compare ``map_index_of_label`` output with the expected index sets.

     Labels are imported from the ``test/data.txt`` fixture and mapped using
     ``test/hierarchy.pickle``; the result must be exactly the expected list
     of sets, in order.
     """
     _, raw_labels = preparation.import_data("test/data.txt")
     mapped = preparation.map_index_of_label("test/hierarchy.pickle",
                                             raw_labels)
     expected = [
         {0, 2},
         set(range(8)),
         set(range(6)),
         {0, 1},
         set(range(7)),
     ]
     self.assertListEqual(expected, mapped)
Exemple #5
0
    def test_load_data(self):
        """Check that the three dataset splits together cover the fixture.

        The data and labels of ``dataset_train``, ``dataset_validate`` and
        ``dataset_test`` are concatenated and compared, order-insensitively,
        with the data imported from ``test/data.txt`` and its labels mapped
        through ``test/hierarchy.pickle``.
        """
        raw_datas, raw_labels = prep.import_data("test/data.txt")
        expected_labels = prep.map_index_of_label("test/hierarchy.pickle",
                                                  raw_labels)

        splits = (self.dataset_train, self.dataset_validate,
                  self.dataset_test)
        merged_datas = np.concatenate([part.datas for part in splits])
        merged_labels = np.concatenate([part.labels for part in splits])

        self.assertListEqual(sorted(merged_datas.tolist()),
                             sorted(raw_datas))
        # Labels are converted to lists so they can be sorted and compared.
        actual_sorted = sorted(map(list, merged_labels.tolist()))
        expected_sorted = sorted(map(list, expected_labels))
        self.assertListEqual(actual_sorted, expected_sorted)
Exemple #6
0
 def test_split_data(self):
     """Check that each of the five folds covers the whole fixture.

     After ``split_fold_data`` runs on the ``test/data.txt`` fixture, the
     train, validate and test pickles of folds 1 to 5 are loaded. For every
     fold their concatenation must match, order-insensitively, the imported
     data and the mapped labels.
     """
     datas, labels = preparation.import_data("test/data.txt")
     new_labels = preparation.map_index_of_label("test/hierarchy.pickle",
                                                 labels)
     preparation.split_fold_data(datas, new_labels, "test")

     for fold in range(1, 6):
         part_datas = []
         part_labels = []
         for mode in ("train", "validate", "test"):
             pickle_name = "test/fold/data_%d.pickle.%s" % (fold, mode)
             part_data, part_label = preparation.load_data_in_pickle(
                 pickle_name)
             part_datas.append(part_data)
             part_labels.append(part_label)

         fold_datas = np.concatenate(part_datas)
         fold_labels = np.concatenate(part_labels)

         self.assertListEqual(sorted(fold_datas.tolist()), sorted(datas))
         # Labels are converted to lists so they can be sorted and compared.
         actual_sorted = sorted(map(list, fold_labels.tolist()))
         expected_sorted = sorted(map(list, new_labels))
         self.assertListEqual(actual_sorted, expected_sorted)