def test_process_1():
    vectors_cache_dir = ".cache"
    if os.path.exists(vectors_cache_dir):
        shutil.rmtree(vectors_cache_dir)

    data_dir = os.path.join(test_dir_path, "test_datasets")
    train_path = "sample_table_large.csv"
    valid_path = "sample_table_large.csv"
    test_path = "sample_table_large.csv"
    cache_file = "cache.pth"
    cache_path = os.path.join(data_dir, cache_file)
    if os.path.exists(cache_path):
        os.remove(cache_path)

    process(
        data_dir,
        train=train_path,
        validation=valid_path,
        test=test_path,
        id_attr="_id",
        left_prefix="ltable_",
        right_prefix="rtable_",
        cache=cache_file,
        embeddings=embeddings,
        embeddings_cache_path="",
    )

    if os.path.exists(vectors_cache_dir):
        shutil.rmtree(vectors_cache_dir)
    if os.path.exists(cache_path):
        os.remove(cache_path)
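# --- Assumed module-level setup (not shown in this excerpt) ---
# The pytest-style tests in this file reference `test_dir_path`, a shared
# `embeddings` object, and a `Datasets` container that must be defined
# elsewhere in the module. This is a minimal sketch of that setup, inferred
# from the inline FastText construction in the unittest-style variants below;
# the import paths and the `Datasets` definition are assumptions, not
# confirmed deepmatcher API.
import os
import shutil
from collections import namedtuple
from urllib.parse import urljoin
from urllib.request import pathname2url

import pandas as pd
import pytest

from deepmatcher import MatchingModel  # assumed import path
from deepmatcher.data import MatchingIterator, process, process_unlabeled  # assumed
from deepmatcher.data.field import FastText  # assumed import path

# Directory containing this test module (assumed definition).
test_dir_path = os.path.dirname(os.path.realpath(__file__))

# Shared embeddings loaded from a local fasttext sample, mirroring the inline
# construction used by the unittest-style tests below.
_vec_dir = os.path.abspath(os.path.join(test_dir_path, "test_datasets"))
embeddings = FastText(
    "fasttext_sample.vec.zip",
    url_base=urljoin("file:", pathname2url(_vec_dir)) + os.path.sep,
    cache=".cache",
)

# Hypothetical container yielded by the `datasets` fixture below.
Datasets = namedtuple("Datasets", ["train", "valid", "test"])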
# unittest-style variant of test_process_1 (a TestCase method in the original
# suite, hence `self`); it builds its FastText embeddings inline rather than
# using the shared module-level object.
def test_process_1(self):
    vectors_cache_dir = '.cache'
    if os.path.exists(vectors_cache_dir):
        shutil.rmtree(vectors_cache_dir)

    data_dir = os.path.join(test_dir_path, 'test_datasets')
    train_path = 'sample_table_large.csv'
    valid_path = 'sample_table_large.csv'
    test_path = 'sample_table_large.csv'
    cache_file = 'cache.pth'
    cache_path = os.path.join(data_dir, cache_file)
    if os.path.exists(cache_path):
        os.remove(cache_path)

    pathdir = os.path.abspath(os.path.join(test_dir_path, 'test_datasets'))
    filename = 'fasttext_sample.vec.zip'
    url_base = urljoin('file:', pathname2url(pathdir)) + os.path.sep
    ft = FastText(filename, url_base=url_base, cache=vectors_cache_dir)

    process(
        data_dir,
        train=train_path,
        validation=valid_path,
        test=test_path,
        id_attr='_id',
        left_prefix='ltable_',
        right_prefix='rtable_',
        cache=cache_file,
        embeddings=ft,
        embeddings_cache_path='')

    if os.path.exists(vectors_cache_dir):
        shutil.rmtree(vectors_cache_dir)
    if os.path.exists(cache_path):
        os.remove(cache_path)
def setUp(self):
    # TestCase setup (unittest style): clear stale caches, then process the
    # datasets once so every test starts from the same train/valid/test splits.
    self.vectors_cache_dir = '.cache'
    if os.path.exists(self.vectors_cache_dir):
        shutil.rmtree(self.vectors_cache_dir)

    self.data_cache_path = os.path.join(test_dir_path, 'test_datasets',
                                        'train_cache.pth')
    if os.path.exists(self.data_cache_path):
        os.remove(self.data_cache_path)

    vec_dir = os.path.abspath(os.path.join(test_dir_path, 'test_datasets'))
    filename = 'fasttext_sample.vec.zip'
    url_base = urljoin('file:', pathname2url(vec_dir)) + os.path.sep
    ft = FastText(filename, url_base=url_base, cache=self.vectors_cache_dir)

    self.train, self.valid, self.test = process(
        path=os.path.join(test_dir_path, 'test_datasets'),
        cache='train_cache.pth',
        train='test_train.csv',
        validation='test_valid.csv',
        test='test_test.csv',
        embeddings=ft,
        embeddings_cache_path='',
        ignore_columns=('left_id', 'right_id'))
def test_get_raw_table(self):
    vectors_cache_dir = '.cache'
    if os.path.exists(vectors_cache_dir):
        shutil.rmtree(vectors_cache_dir)

    data_cache_path = os.path.join(test_dir_path, 'test_datasets',
                                   'cacheddata.pth')
    if os.path.exists(data_cache_path):
        os.remove(data_cache_path)

    vec_dir = os.path.abspath(os.path.join(test_dir_path, 'test_datasets'))
    filename = 'fasttext_sample.vec.zip'
    url_base = urljoin('file:', pathname2url(vec_dir)) + os.path.sep
    ft = FastText(filename, url_base=url_base, cache=vectors_cache_dir)

    train = process(
        path=os.path.join(test_dir_path, 'test_datasets'),
        train='sample_table_small.csv',
        id_attr='id',
        embeddings=ft,
        embeddings_cache_path='',
        pca=False)

    # Round-trip check: the raw table recovered from the processed dataset
    # should keep the same columns as the original CSV.
    train_raw = train.get_raw_table()
    ori_train = pd.read_csv(
        os.path.join(test_dir_path, 'test_datasets', 'sample_table_small.csv'))
    self.assertEqual(set(train_raw.columns), set(ori_train.columns))

    if os.path.exists(data_cache_path):
        os.remove(data_cache_path)
    if os.path.exists(vectors_cache_dir):
        shutil.rmtree(vectors_cache_dir)
# Yield-style fixture: tests receive the processed splits, and the cleanup
# after `yield` runs on teardown. The @pytest.fixture decorator is assumed
# here; the yield pattern only works as a fixture once registered with pytest.
@pytest.fixture
def datasets():
    vectors_cache_dir = ".cache"
    if os.path.exists(vectors_cache_dir):
        shutil.rmtree(vectors_cache_dir)

    data_cache_path = os.path.join(test_dir_path, "test_datasets",
                                   "train_cache.pth")
    if os.path.exists(data_cache_path):
        os.remove(data_cache_path)

    train, valid, test = process(
        path=os.path.join(test_dir_path, "test_datasets"),
        cache="train_cache.pth",
        train="test_train.csv",
        validation="test_valid.csv",
        test="test_test.csv",
        embeddings=embeddings,
        embeddings_cache_path="",
        ignore_columns=("left_id", "right_id"),
    )

    yield Datasets(train, valid, test)

    if os.path.exists(data_cache_path):
        os.remove(data_cache_path)
    if os.path.exists(vectors_cache_dir):
        shutil.rmtree(vectors_cache_dir)
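# A hypothetical consumer of the fixture above (name and assertions are
# illustrative only): pytest injects the yielded Datasets tuple by argument
# name, and the cleanup after `yield` runs once the test finishes.
def test_datasets_fixture_smoke(datasets):
    assert datasets.train is not None
    assert datasets.valid is not None
    assert datasets.test is not None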
def test_get_raw_table():
    vectors_cache_dir = ".cache"
    if os.path.exists(vectors_cache_dir):
        shutil.rmtree(vectors_cache_dir)

    data_cache_path = os.path.join(test_dir_path, "test_datasets",
                                   "cacheddata.pth")
    if os.path.exists(data_cache_path):
        os.remove(data_cache_path)

    train = process(
        path=os.path.join(test_dir_path, "test_datasets"),
        train="sample_table_small.csv",
        id_attr="id",
        embeddings=embeddings,
        embeddings_cache_path="",
        pca=False,
    )

    train_raw = train.get_raw_table()
    ori_train = pd.read_csv(
        os.path.join(test_dir_path, "test_datasets", "sample_table_small.csv"))
    assert set(train_raw.columns) == set(ori_train.columns)

    if os.path.exists(data_cache_path):
        os.remove(data_cache_path)
    if os.path.exists(vectors_cache_dir):
        shutil.rmtree(vectors_cache_dir)
def test_process_unlabeled_1(self):
    vectors_cache_dir = '.cache'
    if os.path.exists(vectors_cache_dir):
        shutil.rmtree(vectors_cache_dir)

    data_cache_path = os.path.join(test_dir_path, 'test_datasets',
                                   'cacheddata.pth')
    if os.path.exists(data_cache_path):
        os.remove(data_cache_path)

    vec_dir = os.path.abspath(os.path.join(test_dir_path, 'test_datasets'))
    filename = 'fasttext_sample.vec.zip'
    url_base = urljoin('file:', pathname2url(vec_dir)) + os.path.sep
    ft = FastText(filename, url_base=url_base, cache=vectors_cache_dir)

    train, valid, test = process(
        path=os.path.join(test_dir_path, 'test_datasets'),
        train='test_train.csv',
        validation='test_valid.csv',
        test='test_test.csv',
        id_attr='id',
        ignore_columns=('left_id', 'right_id'),
        embeddings=ft,
        embeddings_cache_path='',
        pca=True)

    model_save_path = 'sif_model.pth'
    model = MatchingModel(attr_summarizer='sif')
    model.run_train(
        train,
        valid,
        epochs=1,
        batch_size=8,
        best_save_path=model_save_path,
        pos_neg_ratio=3)

    # Unlabeled processing must reproduce the text fields of the labeled data
    # so the trained model can consume it.
    test_unlabeled = process_unlabeled(
        path=os.path.join(test_dir_path, 'test_datasets', 'test_test.csv'),
        trained_model=model,
        ignore_columns=('left_id', 'right_id'))
    self.assertEqual(test_unlabeled.all_text_fields, test.all_text_fields)

    if os.path.exists(model_save_path):
        os.remove(model_save_path)
    if os.path.exists(data_cache_path):
        os.remove(data_cache_path)
    if os.path.exists(vectors_cache_dir):
        shutil.rmtree(vectors_cache_dir)
def test_process_unlabeled_1():
    vectors_cache_dir = ".cache"
    if os.path.exists(vectors_cache_dir):
        shutil.rmtree(vectors_cache_dir)

    data_cache_path = os.path.join(test_dir_path, "test_datasets",
                                   "cacheddata.pth")
    if os.path.exists(data_cache_path):
        os.remove(data_cache_path)

    train, valid, test = process(
        path=os.path.join(test_dir_path, "test_datasets"),
        train="test_train.csv",
        validation="test_valid.csv",
        test="test_test.csv",
        id_attr="id",
        ignore_columns=("left_id", "right_id"),
        embeddings=embeddings,
        embeddings_cache_path="",
        pca=True,
    )

    model_save_path = "sif_model.pth"
    model = MatchingModel(attr_summarizer="sif")
    model.run_train(
        train,
        valid,
        epochs=1,
        batch_size=8,
        best_save_path=model_save_path,
        pos_neg_ratio=3,
    )

    test_unlabeled = process_unlabeled(
        path=os.path.join(test_dir_path, "test_datasets", "test_test.csv"),
        trained_model=model,
        ignore_columns=("left_id", "right_id"),
    )
    assert test_unlabeled.all_text_fields == test.all_text_fields

    if os.path.exists(model_save_path):
        os.remove(model_save_path)
    if os.path.exists(data_cache_path):
        os.remove(data_cache_path)
    if os.path.exists(vectors_cache_dir):
        shutil.rmtree(vectors_cache_dir)
def test_splits_1(self):
    vectors_cache_dir = '.cache'
    if os.path.exists(vectors_cache_dir):
        shutil.rmtree(vectors_cache_dir)

    data_dir = os.path.join(test_dir_path, 'test_datasets')
    train_path = 'sample_table_large.csv'
    valid_path = 'sample_table_large.csv'
    test_path = 'sample_table_large.csv'
    cache_file = 'cache.pth'
    cache_path = os.path.join(data_dir, cache_file)
    if os.path.exists(cache_path):
        os.remove(cache_path)

    pathdir = os.path.abspath(os.path.join(test_dir_path, 'test_datasets'))
    filename = 'fasttext_sample.vec.zip'
    url_base = urljoin('file:', pathname2url(pathdir)) + os.path.sep
    ft = FastText(filename, url_base=url_base, cache=vectors_cache_dir)

    datasets = process(
        data_dir,
        train=train_path,
        validation=valid_path,
        test=test_path,
        cache=cache_file,
        embeddings=ft,
        id_attr='_id',
        left_prefix='ltable_',
        right_prefix='rtable_',
        embeddings_cache_path='',
        pca=False)

    # A single batch_size applies to every split; batch_sizes sets one per split.
    splits = MatchingIterator.splits(datasets, batch_size=16)
    self.assertEqual(splits[0].batch_size, 16)
    self.assertEqual(splits[1].batch_size, 16)
    self.assertEqual(splits[2].batch_size, 16)

    splits_sorted = MatchingIterator.splits(
        datasets, batch_sizes=[16, 32, 64], sort_in_buckets=False)
    self.assertEqual(splits_sorted[0].batch_size, 16)
    self.assertEqual(splits_sorted[1].batch_size, 32)
    self.assertEqual(splits_sorted[2].batch_size, 64)

    if os.path.exists(vectors_cache_dir):
        shutil.rmtree(vectors_cache_dir)
    if os.path.exists(cache_path):
        os.remove(cache_path)
def test_create_batches_1():
    vectors_cache_dir = ".cache"
    if os.path.exists(vectors_cache_dir):
        shutil.rmtree(vectors_cache_dir)

    data_dir = os.path.join(test_dir_path, "test_datasets")
    train_path = "sample_table_large.csv"
    valid_path = "sample_table_large.csv"
    test_path = "sample_table_large.csv"
    cache_file = "cache.pth"
    cache_path = os.path.join(data_dir, cache_file)
    if os.path.exists(cache_path):
        os.remove(cache_path)

    datasets = process(
        data_dir,
        train=train_path,
        validation=valid_path,
        test=test_path,
        cache=cache_file,
        embeddings=embeddings,
        id_attr="_id",
        left_prefix="ltable_",
        right_prefix="rtable_",
        embeddings_cache_path="",
        pca=False,
    )

    splits = MatchingIterator.splits(datasets, batch_size=16)
    batch_splits = [split.create_batches() for split in splits]
    assert batch_splits

    sorted_splits = MatchingIterator.splits(
        datasets, batch_sizes=[16, 32, 64], sort_in_buckets=False)
    batch_sorted_splits = [
        sorted_split.create_batches() for sorted_split in sorted_splits
    ]
    assert batch_sorted_splits

    if os.path.exists(vectors_cache_dir):
        shutil.rmtree(vectors_cache_dir)
    if os.path.exists(cache_path):
        os.remove(cache_path)
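# --- End-to-end usage sketch (illustrative, not part of the test suite) ---
# How the pieces exercised above fit together: process labeled CSVs, train a
# matcher, evaluate it, then score an unlabeled table. `run_eval` and
# `run_prediction` are MatchingModel methods not exercised in this section,
# so treat this as a sketch under that assumption.
def example_end_to_end():
    data_dir = os.path.join(test_dir_path, "test_datasets")
    train, valid, test = process(
        path=data_dir,
        train="test_train.csv",
        validation="test_valid.csv",
        test="test_test.csv",
        id_attr="id",
        ignore_columns=("left_id", "right_id"),
        embeddings=embeddings,
        embeddings_cache_path="",
    )
    model = MatchingModel(attr_summarizer="sif")
    model.run_train(train, valid, epochs=1, batch_size=8,
                    best_save_path="sif_model.pth", pos_neg_ratio=3)
    f1 = model.run_eval(test)  # F1 score on the held-out split
    unlabeled = process_unlabeled(
        path=os.path.join(data_dir, "test_test.csv"),
        trained_model=model,
        ignore_columns=("left_id", "right_id"),
    )
    predictions = model.run_prediction(unlabeled)  # DataFrame of match scores
    return f1, predictions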