def test_import_data(self):
    """import_data should parse both sequence and bag-of-word files."""
    # Sequence-format input (default parsing mode).
    expected_data = [['1'], ['2', '3'], ['1', '6'], ['4', '1'], ['5', '2']]
    expected_label = [['3'], ['4', '8'], ['5', '6'], ['1', '2'], ['7']]
    parsed_data, parsed_label = preparation.import_data("test/data.txt")
    self.assertEqual(parsed_data, expected_data)
    self.assertEqual(parsed_label, expected_label)

    # Bag-of-word input (sequence=False).
    expected_data = [['8', '18', '18'], ['2', '33', '33']]
    expected_label = [['32', '545'], ['11']]
    parsed_data, parsed_label = preparation.import_data(
        "test/test_data_bag_of_word.txt", sequence=False)
    self.assertEqual(parsed_data, expected_data)
    self.assertEqual(parsed_label, expected_label)
def test_load_save_pickle(self):
    """Data and labels survive a save/load round trip through the pickle helpers."""
    source_file = "test/test_data_bag_of_word.txt"
    pickle_path = "test/pickle/test.pickle"
    original_data, original_label = preparation.import_data(
        source_file, sequence=False)
    preparation.save_data_in_pickle(pickle_path, original_data, original_label)
    loaded_data, loaded_label = preparation.load_data_in_pickle(pickle_path)
    self.assertEqual(original_data, loaded_data)
    self.assertEqual(original_label, loaded_label)
def load_datas(self):
    """Load this dataset's data/label arrays, building the cached pickle first if needed.

    Behavior by configuration:
      * ``fold_number == 0`` reads from ``<data_name>/store/``, otherwise from
        ``<data_name>/fold/data_<fold>.pickle.<mode>``.
      * ``state == 'embedding'``: loads the cached labels, converts them to a
        sparse multi-hot matrix, and replaces the raw data with doc2vec
        vectors read from ``export/<data_name>/doc2vec/<mode>.txt``.
      * Otherwise, if the cache file is missing, imports the raw data, maps
        labels through the hierarchy file, and delegates to the appropriate
        split helper before loading the cache.

    Side effects: sets ``self.datas`` and ``self.labels``; may create
    directories and pickle files under ``data/``.
    """
    if self.fold_number == 0:
        store_name = "%s/store/data.pickle.%s" % (self.data_name, self.mode)
    else:
        store_name = "%s/fold/data_%d.pickle.%s" % (
            self.data_name, self.fold_number, self.mode)

    if self.state == 'embedding':
        self.datas, self.labels = prep.load_data_in_pickle(store_name)
        # Build a sparse multi-hot label matrix from the per-example label
        # index lists: row i has a 1 in every column listed in labels[i].
        indice = [j for i in self.labels for j in i]
        indptr = np.cumsum([0] + [len(i) for i in self.labels])
        data_one = np.ones(len(indice))
        self.labels = csr_matrix(
            (data_one, indice, indptr),
            shape=(len(self.labels), len(self.all_name))).tocsc()
        # Replace raw data with precomputed doc2vec embeddings, one
        # space-separated float vector per line (trailing newline stripped).
        data = []
        with open('export/%s/doc2vec/%s.txt' % (self.data_name, self.mode),
                  mode='r') as f:
            for line in f:
                data.append(list(map(float, line[:-1].split(' '))))
        self.datas = np.array(data)
        self.create_label_stat()
        return

    if not os.path.isfile("data/" + store_name):
        # NOTE(review): the original had an if/else on self.test_split here
        # whose two branches built the identical path — collapsed.
        file_name = "%s/%s" % (self.data_name, self.data_file_name)
        datas, labels = prep.import_data(file_name, sequence=self.sequence)
        # Register every distinct label seen in the raw file.
        self.create_temp_hierarchy(
            list(set([j for i in labels for j in i])))
        hierarchy_file_name = "%s/hierarchy.pickle" % self.data_name
        new_labels = prep.map_index_of_label(hierarchy_file_name, labels)
        if not self.test_split:
            if self.mode == "train":
                prep.split_validate_data(datas, new_labels, self.data_name)
            else:
                # No split requested for this mode: persist the mapped data
                # directly under data/<data_name>/store/.
                directory = "data/%s/store" % self.data_name
                if not os.path.exists(directory):
                    os.makedirs(directory)
                with open('data/' + store_name, 'wb') as f:
                    pickle.dump([datas, new_labels], f)
        else:
            if self.fold_number == 0:
                prep.split_data(datas, new_labels, self.data_name)
            else:
                prep.split_fold_data(datas, new_labels, self.data_name)

    self.datas, self.labels = prep.load_data_in_pickle(store_name)
def test_map_index(self):
    """map_index_of_label turns raw labels into index sets via the hierarchy file."""
    _, raw_labels = preparation.import_data("test/data.txt")
    mapped = preparation.map_index_of_label(
        "test/hierarchy.pickle", raw_labels)
    expected = [
        {0, 2},
        set(range(8)),
        set(range(6)),
        {0, 1},
        set(range(7)),
    ]
    self.assertListEqual(expected, mapped)
def test_load_data(self):
    """Train/validate/test partitions together reconstruct the full dataset."""
    all_datas, raw_labels = prep.import_data("test/data.txt")
    all_labels = prep.map_index_of_label("test/hierarchy.pickle", raw_labels)
    combined_datas = np.concatenate([
        self.dataset_train.datas,
        self.dataset_validate.datas,
        self.dataset_test.datas,
    ])
    combined_labels = np.concatenate([
        self.dataset_train.labels,
        self.dataset_validate.labels,
        self.dataset_test.labels,
    ])
    # Order is not guaranteed across splits, so compare sorted contents.
    self.assertListEqual(sorted(combined_datas.tolist()), sorted(all_datas))
    self.assertListEqual(
        sorted(map(list, combined_labels.tolist())),
        sorted(map(list, all_labels)))
def test_split_data(self):
    """Every one of the five folds partitions the whole dataset exactly."""
    datas, raw_labels = preparation.import_data("test/data.txt")
    new_labels = preparation.map_index_of_label(
        "test/hierarchy.pickle", raw_labels)
    preparation.split_fold_data(datas, new_labels, "test")
    for fold in range(1, 6):
        parts = []
        part_labels = []
        for split in ("train", "validate", "test"):
            name = "test/fold/data_%d.pickle.%s" % (fold, split)
            part, label = preparation.load_data_in_pickle(name)
            parts.append(part)
            part_labels.append(label)
        fold_datas = np.concatenate(parts)
        fold_labels = np.concatenate(part_labels)
        # Order is not guaranteed within a fold, so compare sorted contents.
        self.assertListEqual(sorted(fold_datas.tolist()), sorted(datas))
        self.assertListEqual(
            sorted(map(list, fold_labels.tolist())),
            sorted(map(list, new_labels)))