Beispiel #1
0
def test_process_1():
    """Smoke-test ``process`` using the large sample table for all three splits.

    The test makes no assertion on the returned value; it passes as long as
    ``process`` does not raise. The vectors cache directory and the data cache
    file are removed before the run and again afterwards, even when the run
    fails, so a broken run cannot leave stale artifacts on disk.
    """
    vectors_cache_dir = ".cache"
    data_dir = os.path.join(test_dir_path, "test_datasets")
    train_path = "sample_table_large.csv"
    valid_path = "sample_table_large.csv"
    test_path = "sample_table_large.csv"
    cache_file = "cache.pth"
    cache_path = os.path.join(data_dir, cache_file)

    def _remove_artifacts():
        # Delete the vectors cache directory and the data cache file if present.
        if os.path.exists(vectors_cache_dir):
            shutil.rmtree(vectors_cache_dir)
        if os.path.exists(cache_path):
            os.remove(cache_path)

    # Start from a clean slate so earlier runs cannot influence this one.
    _remove_artifacts()
    try:
        process(
            data_dir,
            train=train_path,
            validation=valid_path,
            test=test_path,
            id_attr="_id",
            left_prefix="ltable_",
            right_prefix="rtable_",
            cache=cache_file,
            embeddings=embeddings,
            embeddings_cache_path="",
        )
    finally:
        # Runs on success and on failure alike.
        _remove_artifacts()
Beispiel #2
0
    def test_process_1(self):
        """Smoke-test ``process`` with FastText vectors loaded from a local file URL.

        The large sample table is used for the train, validation and test
        splits. Nothing is asserted on the result; the test passes when no
        exception is raised. Cache artifacts are removed before the run and in
        a ``finally`` clause afterwards so that a failure cannot leave them
        behind.
        """
        vectors_cache_dir = '.cache'
        data_dir = os.path.join(test_dir_path, 'test_datasets')
        train_path = 'sample_table_large.csv'
        valid_path = 'sample_table_large.csv'
        test_path = 'sample_table_large.csv'
        cache_file = 'cache.pth'
        cache_path = os.path.join(data_dir, cache_file)

        def _remove_artifacts():
            # Delete the vectors cache directory and the data cache file if present.
            if os.path.exists(vectors_cache_dir):
                shutil.rmtree(vectors_cache_dir)
            if os.path.exists(cache_path):
                os.remove(cache_path)

        _remove_artifacts()
        try:
            # Build a file: URL for the local test_datasets directory so the
            # sample vectors archive is fetched from disk.
            pathdir = os.path.abspath(data_dir)
            filename = 'fasttext_sample.vec.zip'
            url_base = urljoin('file:', pathname2url(pathdir)) + os.path.sep
            # Constructed inside the try block because it is handed the
            # vectors cache directory that the cleanup removes.
            ft = FastText(filename, url_base=url_base, cache=vectors_cache_dir)

            process(data_dir,
                    train=train_path,
                    validation=valid_path,
                    test=test_path,
                    id_attr='_id',
                    left_prefix='ltable_',
                    right_prefix='rtable_',
                    cache=cache_file,
                    embeddings=ft,
                    embeddings_cache_path='')
        finally:
            # Runs on success and on failure alike.
            _remove_artifacts()
    def setUp(self):
        """Clear stale caches and load the train/validation/test datasets.

        Stores the vectors cache directory and the data cache path on the
        instance, then stores the three values returned by ``process`` as
        ``self.train``, ``self.valid`` and ``self.test``.
        """
        # Vectors cache: wipe any directory left over from an earlier run.
        self.vectors_cache_dir = '.cache'
        if os.path.exists(self.vectors_cache_dir):
            shutil.rmtree(self.vectors_cache_dir)

        datasets_dir = os.path.join(test_dir_path, 'test_datasets')

        # Data cache: remove a stale cache file before processing.
        self.data_cache_path = os.path.join(datasets_dir, 'train_cache.pth')
        if os.path.exists(self.data_cache_path):
            os.remove(self.data_cache_path)

        # The sample vectors archive is addressed through a file: URL that
        # points at the local test_datasets directory.
        local_url = urljoin('file:', pathname2url(os.path.abspath(datasets_dir)))
        fasttext_vectors = FastText('fasttext_sample.vec.zip',
                                    url_base=local_url + os.path.sep,
                                    cache=self.vectors_cache_dir)

        splits = process(path=datasets_dir,
                         cache='train_cache.pth',
                         train='test_train.csv',
                         validation='test_valid.csv',
                         test='test_test.csv',
                         embeddings=fasttext_vectors,
                         embeddings_cache_path='',
                         ignore_columns=('left_id', 'right_id'))
        self.train, self.valid, self.test = splits
Beispiel #4
0
    def test_get_raw_table(self):
        """Check that ``get_raw_table`` exposes the same columns as the source CSV.

        The small sample table is processed with ``pca=False`` and the column
        set of the resulting raw table is compared with the column set pandas
        reads from the same file. Cache artifacts are removed before the run
        and in a ``finally`` clause afterwards, so a failing assertion does not
        leave them on disk.
        """
        vectors_cache_dir = '.cache'
        datasets_dir = os.path.join(test_dir_path, 'test_datasets')
        # NOTE(review): no ``cache`` argument is passed to process() below;
        # presumably 'cacheddata.pth' is its default cache file name - confirm.
        data_cache_path = os.path.join(datasets_dir, 'cacheddata.pth')

        def _remove_artifacts():
            # Delete the data cache file and the vectors cache directory if present.
            if os.path.exists(data_cache_path):
                os.remove(data_cache_path)
            if os.path.exists(vectors_cache_dir):
                shutil.rmtree(vectors_cache_dir)

        _remove_artifacts()
        try:
            # Build a file: URL for the local test_datasets directory.
            vec_dir = os.path.abspath(datasets_dir)
            filename = 'fasttext_sample.vec.zip'
            url_base = urljoin('file:', pathname2url(vec_dir)) + os.path.sep
            ft = FastText(filename, url_base=url_base, cache=vectors_cache_dir)

            train = process(path=datasets_dir,
                            train='sample_table_small.csv',
                            id_attr='id',
                            embeddings=ft,
                            embeddings_cache_path='',
                            pca=False)

            train_raw = train.get_raw_table()
            ori_train = pd.read_csv(
                os.path.join(datasets_dir, 'sample_table_small.csv'))
            # Only the column names are compared, irrespective of order.
            self.assertEqual(set(train_raw.columns), set(ori_train.columns))
        finally:
            # Runs even when processing or the assertion above fails.
            _remove_artifacts()
Beispiel #5
0
def datasets():
    """Yield a ``Datasets`` bundle built from the train/validation/test CSV files.

    This is a generator: everything before the ``yield`` is setup and the
    ``finally`` clause is teardown. The teardown removes the data cache file
    and the vectors cache directory, and it also runs when ``process`` raises
    during setup, so a failed setup does not leave artifacts on disk.

    Yields:
        Datasets: wraps the three values returned by ``process``.
    """
    vectors_cache_dir = ".cache"
    datasets_dir = os.path.join(test_dir_path, "test_datasets")
    data_cache_path = os.path.join(datasets_dir, "train_cache.pth")

    def _remove_artifacts():
        # Delete the data cache file and the vectors cache directory if present.
        if os.path.exists(data_cache_path):
            os.remove(data_cache_path)
        if os.path.exists(vectors_cache_dir):
            shutil.rmtree(vectors_cache_dir)

    # Start from a clean slate so earlier runs cannot influence this one.
    _remove_artifacts()
    try:
        train, valid, test = process(
            path=datasets_dir,
            cache="train_cache.pth",
            train="test_train.csv",
            validation="test_valid.csv",
            test="test_test.csv",
            embeddings=embeddings,
            embeddings_cache_path="",
            ignore_columns=("left_id", "right_id"),
        )
        yield Datasets(train, valid, test)
    finally:
        # Teardown: runs after the consumer is done and also on setup failure.
        _remove_artifacts()
Beispiel #6
0
def test_get_raw_table():
    """Check that ``get_raw_table`` exposes the same columns as the source CSV.

    The small sample table is processed with ``pca=False`` and the column set
    of the resulting raw table is compared with the column set pandas reads
    from the same file. Cache artifacts are removed before the run and in a
    ``finally`` clause afterwards, so a failing assertion does not leave them
    on disk.
    """
    vectors_cache_dir = ".cache"
    datasets_dir = os.path.join(test_dir_path, "test_datasets")
    # NOTE(review): no ``cache`` argument is passed to process() below;
    # presumably "cacheddata.pth" is its default cache file name - confirm.
    data_cache_path = os.path.join(datasets_dir, "cacheddata.pth")

    def _remove_artifacts():
        # Delete the data cache file and the vectors cache directory if present.
        if os.path.exists(data_cache_path):
            os.remove(data_cache_path)
        if os.path.exists(vectors_cache_dir):
            shutil.rmtree(vectors_cache_dir)

    _remove_artifacts()
    try:
        train = process(
            path=datasets_dir,
            train="sample_table_small.csv",
            id_attr="id",
            embeddings=embeddings,
            embeddings_cache_path="",
            pca=False,
        )

        train_raw = train.get_raw_table()
        ori_train = pd.read_csv(
            os.path.join(datasets_dir, "sample_table_small.csv"))
        # Only the column names are compared, irrespective of order.
        assert set(train_raw.columns) == set(ori_train.columns)
    finally:
        # Runs even when processing or the assertion above fails.
        _remove_artifacts()
Beispiel #7
0
    def test_process_unlabeled_1(self):
        """Check that ``process_unlabeled`` yields the same text fields as ``process``.

        A ``MatchingModel`` with the 'sif' attribute summarizer is trained for
        one epoch on the processed train/validation data. The test CSV is then
        run through ``process_unlabeled`` with that model, and its
        ``all_text_fields`` is compared with that of the processed test split.

        The saved model file, the data cache file and the vectors cache
        directory are removed in a ``finally`` clause so that a failure in
        training or in the assertion does not leave them on disk.
        """
        vectors_cache_dir = '.cache'
        datasets_dir = os.path.join(test_dir_path, 'test_datasets')
        # NOTE(review): no ``cache`` argument is passed to process() below;
        # presumably 'cacheddata.pth' is its default cache file name - confirm.
        data_cache_path = os.path.join(datasets_dir, 'cacheddata.pth')
        model_save_path = 'sif_model.pth'

        def _remove_caches():
            # Delete the data cache file and the vectors cache directory if present.
            if os.path.exists(data_cache_path):
                os.remove(data_cache_path)
            if os.path.exists(vectors_cache_dir):
                shutil.rmtree(vectors_cache_dir)

        _remove_caches()
        try:
            # Build a file: URL for the local test_datasets directory.
            vec_dir = os.path.abspath(datasets_dir)
            filename = 'fasttext_sample.vec.zip'
            url_base = urljoin('file:', pathname2url(vec_dir)) + os.path.sep
            ft = FastText(filename, url_base=url_base, cache=vectors_cache_dir)

            train, valid, test = process(path=datasets_dir,
                                         train='test_train.csv',
                                         validation='test_valid.csv',
                                         test='test_test.csv',
                                         id_attr='id',
                                         ignore_columns=('left_id', 'right_id'),
                                         embeddings=ft,
                                         embeddings_cache_path='',
                                         pca=True)

            model = MatchingModel(attr_summarizer='sif')
            model.run_train(train,
                            valid,
                            epochs=1,
                            batch_size=8,
                            best_save_path=model_save_path,
                            pos_neg_ratio=3)

            test_unlabeled = process_unlabeled(
                path=os.path.join(datasets_dir, 'test_test.csv'),
                trained_model=model,
                ignore_columns=('left_id', 'right_id'))

            self.assertEqual(test_unlabeled.all_text_fields,
                             test.all_text_fields)
        finally:
            # Runs on success and on failure alike.
            if os.path.exists(model_save_path):
                os.remove(model_save_path)
            _remove_caches()
Beispiel #8
0
def test_process_unlabeled_1():
    """Check that ``process_unlabeled`` yields the same text fields as ``process``.

    A ``MatchingModel`` with the "sif" attribute summarizer is trained for one
    epoch on the processed train/validation data. The test CSV is then run
    through ``process_unlabeled`` with that model, and its ``all_text_fields``
    is compared with that of the processed test split.

    The saved model file, the data cache file and the vectors cache directory
    are removed in a ``finally`` clause so that a failure in training or in the
    assertion does not leave them on disk.
    """
    vectors_cache_dir = ".cache"
    datasets_dir = os.path.join(test_dir_path, "test_datasets")
    # NOTE(review): no ``cache`` argument is passed to process() below;
    # presumably "cacheddata.pth" is its default cache file name - confirm.
    data_cache_path = os.path.join(datasets_dir, "cacheddata.pth")
    model_save_path = "sif_model.pth"

    def _remove_caches():
        # Delete the data cache file and the vectors cache directory if present.
        if os.path.exists(data_cache_path):
            os.remove(data_cache_path)
        if os.path.exists(vectors_cache_dir):
            shutil.rmtree(vectors_cache_dir)

    _remove_caches()
    try:
        train, valid, test = process(
            path=datasets_dir,
            train="test_train.csv",
            validation="test_valid.csv",
            test="test_test.csv",
            id_attr="id",
            ignore_columns=("left_id", "right_id"),
            embeddings=embeddings,
            embeddings_cache_path="",
            pca=True,
        )

        model = MatchingModel(attr_summarizer="sif")
        model.run_train(
            train,
            valid,
            epochs=1,
            batch_size=8,
            best_save_path=model_save_path,
            pos_neg_ratio=3,
        )

        test_unlabeled = process_unlabeled(
            path=os.path.join(datasets_dir, "test_test.csv"),
            trained_model=model,
            ignore_columns=("left_id", "right_id"),
        )

        assert test_unlabeled.all_text_fields == test.all_text_fields
    finally:
        # Runs on success and on failure alike.
        if os.path.exists(model_save_path):
            os.remove(model_save_path)
        _remove_caches()
    def test_splits_1(self):
        """Check the batch sizes of the iterators built by ``MatchingIterator.splits``.

        With a single ``batch_size`` all three iterators must report that
        size; with ``batch_sizes=[16, 32, 64]`` each iterator must report the
        size at its position. Cache artifacts are removed before the run and
        in a ``finally`` clause afterwards, so a failing assertion does not
        leave them on disk.
        """
        vectors_cache_dir = '.cache'
        data_dir = os.path.join(test_dir_path, 'test_datasets')
        train_path = 'sample_table_large.csv'
        valid_path = 'sample_table_large.csv'
        test_path = 'sample_table_large.csv'
        cache_file = 'cache.pth'
        cache_path = os.path.join(data_dir, cache_file)

        def _remove_artifacts():
            # Delete the vectors cache directory and the data cache file if present.
            if os.path.exists(vectors_cache_dir):
                shutil.rmtree(vectors_cache_dir)
            if os.path.exists(cache_path):
                os.remove(cache_path)

        _remove_artifacts()
        try:
            # Build a file: URL for the local test_datasets directory.
            pathdir = os.path.abspath(data_dir)
            filename = 'fasttext_sample.vec.zip'
            url_base = urljoin('file:', pathname2url(pathdir)) + os.path.sep
            ft = FastText(filename, url_base=url_base, cache=vectors_cache_dir)

            datasets = process(data_dir,
                               train=train_path,
                               validation=valid_path,
                               test=test_path,
                               cache=cache_file,
                               embeddings=ft,
                               id_attr='_id',
                               left_prefix='ltable_',
                               right_prefix='rtable_',
                               embeddings_cache_path='',
                               pca=False)

            # One shared batch size for every split.
            splits = MatchingIterator.splits(datasets, batch_size=16)
            self.assertEqual(splits[0].batch_size, 16)
            self.assertEqual(splits[1].batch_size, 16)
            self.assertEqual(splits[2].batch_size, 16)

            # A separate batch size per split.
            splits_sorted = MatchingIterator.splits(datasets,
                                                    batch_sizes=[16, 32, 64],
                                                    sort_in_buckets=False)
            self.assertEqual(splits_sorted[0].batch_size, 16)
            self.assertEqual(splits_sorted[1].batch_size, 32)
            self.assertEqual(splits_sorted[2].batch_size, 64)
        finally:
            # Runs even when processing or an assertion above fails.
            _remove_artifacts()
Beispiel #10
0
def test_create_batches_1():
    """Call ``create_batches`` on every iterator from ``MatchingIterator.splits``.

    This is exercised once with a single ``batch_size`` and once with
    per-split ``batch_sizes`` and ``sort_in_buckets=False``. Cache artifacts
    are removed before the run and in a ``finally`` clause afterwards, so a
    failure does not leave them on disk.
    """
    vectors_cache_dir = ".cache"
    data_dir = os.path.join(test_dir_path, "test_datasets")
    train_path = "sample_table_large.csv"
    valid_path = "sample_table_large.csv"
    test_path = "sample_table_large.csv"
    cache_file = "cache.pth"
    cache_path = os.path.join(data_dir, cache_file)

    def _remove_artifacts():
        # Delete the vectors cache directory and the data cache file if present.
        if os.path.exists(vectors_cache_dir):
            shutil.rmtree(vectors_cache_dir)
        if os.path.exists(cache_path):
            os.remove(cache_path)

    _remove_artifacts()
    try:
        datasets = process(
            data_dir,
            train=train_path,
            validation=valid_path,
            test=test_path,
            cache=cache_file,
            embeddings=embeddings,
            id_attr="_id",
            left_prefix="ltable_",
            right_prefix="rtable_",
            embeddings_cache_path="",
            pca=False,
        )

        splits = MatchingIterator.splits(datasets, batch_size=16)
        batch_splits = [split.create_batches() for split in splits]
        # NOTE(review): this only checks that the list is non-empty (one entry
        # per split); it says nothing about what create_batches() produced.
        assert batch_splits

        sorted_splits = MatchingIterator.splits(datasets,
                                                batch_sizes=[16, 32, 64],
                                                sort_in_buckets=False)
        batch_sorted_splits = [
            sorted_split.create_batches() for sorted_split in sorted_splits
        ]
        assert batch_sorted_splits
    finally:
        # Runs even when processing or an assertion above fails.
        _remove_artifacts()