Example 1
    def test_extend_vocab_1(self):
        # Start from a clean vector cache so stale files do not interfere.
        vectors_cache_dir = '.cache'
        if os.path.exists(vectors_cache_dir):
            shutil.rmtree(vectors_cache_dir)

        mf = MatchingField()
        lf = MatchingField(id=True, sequential=False)
        fields = [('id', lf), ('left_a', mf), ('right_a', mf), ('label', lf)]
        col_naming = {
            'id': 'id',
            'label': 'label',
            'left': 'left_',
            'right': 'right_'
        }

        pathdir = os.path.abspath(os.path.join(test_dir_path, 'test_datasets'))
        filename = 'fasttext_sample.vec'
        file = os.path.join(pathdir, filename)
        # Serve the sample vectors from the local file via a file:// URL.
        url_base = urljoin('file:', pathname2url(file))
        vecs = Vectors(name=filename, cache=vectors_cache_dir, url=url_base)

        data_path = os.path.join(test_dir_path, 'test_datasets',
                                 'sample_table_small.csv')
        md = MatchingDataset(fields, col_naming, path=data_path)

        # Build an initial vocab, attach placeholder 300-dimensional vectors, then
        # extend the vocab with the dataset's tokens using the sample vectors.
        mf.build_vocab()
        mf.vocab.vectors = torch.Tensor(len(mf.vocab.itos), 300)
        mf.extend_vocab(md, vectors=vecs)
        self.assertEqual(len(mf.vocab.itos), 6)
        self.assertEqual(mf.vocab.vectors.size(), torch.Size([6, 300]))
Example 2
def test_splits_3(
    data_dir,
    train_filename,
    valid_filename,
    test_filename,
    fields,
    column_naming,
    cache_name,
    remove_cache,
):
    # First call builds the splits (embeddings and embeddings cache path are
    # passed positionally as None) and populates the cache at cache_name.
    datasets = MatchingDataset.splits(
        data_dir,
        train_filename,
        valid_filename,
        test_filename,
        fields,
        None,
        None,
        column_naming,
        cache_name,
        train_pca=False,
    )
    assert datasets

    # Call again with check_cached_data and auto_rebuild_cache passed
    # positionally as False.
    datasets_2 = MatchingDataset.splits(
        data_dir,
        train_filename,
        valid_filename,
        test_filename,
        fields,
        None,
        None,
        column_naming,
        cache_name,
        False,
        False,
        train_pca=False,
    )
    assert datasets_2
Example 3
def test_splits_2(
    data_dir,
    train_filename,
    valid_filename,
    test_filename,
    fields,
    column_naming,
    cache_name,
    remove_cache,
):
    # Build the splits once so the cache at cache_name reflects train_filename.
    datasets = MatchingDataset.splits(
        data_dir,
        train_filename,
        valid_filename,
        test_filename,
        fields,
        None,
        None,
        column_naming,
        cache_name,
        train_pca=False,
    )
    assert datasets

    # Re-running with a different train file while check_cached_data=True and
    # auto_rebuild_cache=False (passed positionally) should flag the cache as stale.
    with pytest.raises(MatchingDataset.CacheStaleException):
        MatchingDataset.splits(
            data_dir,
            "sample_table_small.csv",
            valid_filename,
            test_filename,
            fields,
            None,
            None,
            column_naming,
            cache_name,
            True,
            False,
            train_pca=False,
        )
Example 4
def test_class_matching_dataset():
    fields = [("left_a", MatchingField()), ("right_a", MatchingField())]
    col_naming = {
        "id": "id",
        "label": "label",
        "left": "left",
        "right": "right"
    }
    path = os.path.join(test_dir_path, "test_datasets",
                        "sample_table_small.csv")
    md = MatchingDataset(fields, col_naming, path=path)
    assert md.id_field == "id"
    assert md.label_field == "label"
    assert md.all_left_fields == ["left_a"]
    assert md.all_right_fields == ["right_a"]
    assert md.all_text_fields == ["left_a", "right_a"]
    # Canonical names drop the left/right prefixes, leaving the shared suffix.
    assert md.canonical_text_fields == ["_a"]
Example 5
def process(
    path,
    train=None,
    validation=None,
    test=None,
    cache="cacheddata.pth",
    check_cached_data=True,
    auto_rebuild_cache=True,
    tokenize="nltk",
    lowercase=True,
    embeddings="fasttext.en.bin",
    embeddings_cache_path="~/.vector_cache",
    ignore_columns=(),
    include_lengths=True,
    id_attr="id",
    label_attr="label",
    left_prefix="left_",
    right_prefix="right_",
    use_magellan_convention=False,
    pca=True,
):
    r"""Creates dataset objects for multiple splits of a dataset.

    This involves the following steps (if data cannot be retrieved from the cache):
    #. Read CSV header of a data file and verify header is sane.
    #. Create fields, i.e., column processing specifications (e.g. tokenization, label
        conversion to integers etc.)
    #. Load each data file:
        #. Read each example (tuple pair) in specified CSV file.
        #. Preprocess example. Involves lowercasing and tokenization (unless disabled).
        #. Compute metadata if this is the training data file. \
            See :meth:`MatchingDataset.compute_metadata` for details.
    #. Create vocabulary consisting of all tokens in all attributes in all datasets.
    #. Download word embedding data if necessary.
    #. Create mapping from each word in vocabulary to its word embedding.
    #. Compute metadata.
    #. Write to cache.

    Arguments:
        path (str): Common prefix of the splits' file paths.
        train (str): Suffix to add to path for the train set.
        validation (str): Suffix to add to path for the validation set, or None
            for no validation set. Default is None.
        test (str): Suffix to add to path for the test set, or None for no test
            set. Default is None.
        cache (str): Suffix to add to path for the cache file. If `None`, caching is
            disabled.
        check_cached_data (bool): Verify that data files haven't changed since the
            cache was constructed and that relevant field options haven't changed.
        auto_rebuild_cache (bool): Automatically rebuild the cache if the data files
            are modified or if the field options change. Defaults to True.
        tokenize (str): Which tokenizer to use. Defaults to 'nltk'.
        lowercase (bool): Whether to lowercase all words in all attributes.
        embeddings (str or list): One or more of the following strings:
            * `fasttext.{lang}.bin`:
                This uses sub-word level word embeddings based on binary models from "wiki
                word vectors" released by FastText. {lang} is 'en' or any other 2 letter
                ISO 639-1 Language Code, or 3 letter ISO 639-2 Code, if the language does
                not have a 2 letter code. 300d vectors.
                ``fasttext.en.bin`` is the default.
            * `fasttext.wiki.vec`:
                Uses wiki news word vectors released as part of "Advances in Pre-Training
                Distributed Word Representations" by Mikolov et al. (2018). 300d vectors.
            * `fasttext.crawl.vec`:
                Uses Common Crawl word vectors released as part of "Advances in
                Pre-Training Distributed Word Representations" by Mikolov et al. (2018).
                300d vectors.
            * `glove.6B.{dims}`:
                Uses uncased GloVe trained on Wiki + Gigaword. {dims} is one of (50d,
                100d, 200d, or 300d).
            * `glove.42B.300d`:
                Uses uncased GloVe trained on Common Crawl. 300d vectors.
            * `glove.840B.300d`:
                Uses cased GloVe trained on Common Crawl. 300d vectors.
            * `glove.twitter.27B.{dims}`:
                Uses cased GloVe trained on Twitter. {dims} is one of (25d, 50d, 100d, or
                200d).
        embeddings_cache_path (str): Directory to store downloaded word vector data.
        ignore_columns (list): A list of columns to ignore in the CSV files.
        include_lengths (bool): Whether to provide the model with the lengths of
            each attribute sequence in each batch. If True, length information can be
            used by the neural network, e.g. when picking the last RNN output of each
            attribute sequence.
        id_attr (str): The name of the tuple pair ID column in the CSV file.
        label_attr (str): The name of the tuple pair match label column in the CSV file.
        left_prefix (str): The prefix for attribute names belonging to the left table.
        right_prefix (str): The prefix for attribute names belonging to the right table.
        use_magellan_convention (bool): Set `id_attr`, `left_prefix`, and `right_prefix`
            according to Magellan (py_entitymatching Python package) naming conventions.
            Specifically, set them to be '_id', 'ltable_', and 'rtable_' respectively.
        pca (bool): Whether to compute PCA for each attribute (needed for the SIF
            model). Defaults to True.

    Returns:
        Tuple[MatchingDataset]: Datasets for the (train, validation, test) splits, in
            that order, for each split that was provided.

    """
    if use_magellan_convention:
        id_attr = "_id"
        left_prefix = "ltable_"
        right_prefix = "rtable_"

    # TODO(Sid): check for all datasets to make sure the files exist and have the same schema
    a_dataset = train or validation or test
    with io.open(os.path.expanduser(os.path.join(path, a_dataset)),
                 encoding="utf8") as f:
        header = next(unicode_csv_reader(f))

    _maybe_download_nltk_data()
    _check_header(header, id_attr, left_prefix, right_prefix, label_attr,
                  ignore_columns)
    fields = _make_fields(
        header,
        id_attr,
        label_attr,
        ignore_columns,
        lowercase,
        tokenize,
        include_lengths,
    )

    column_naming = {
        "id": id_attr,
        "left": left_prefix,
        "right": right_prefix,
        "label": label_attr,
    }

    datasets = MatchingDataset.splits(
        path,
        train,
        validation,
        test,
        fields,
        embeddings,
        embeddings_cache_path,
        column_naming,
        cache,
        check_cached_data,
        auto_rebuild_cache,
        train_pca=pca,
    )

    # Save additional information to train dataset.
    datasets[0].ignore_columns = ignore_columns
    datasets[0].tokenize = tokenize
    datasets[0].lowercase = lowercase
    datasets[0].include_lengths = include_lengths

    return datasets
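
For reference, a minimal usage sketch of this API (assuming the function is exposed as deepmatcher.data.process; the directory and CSV file names below are hypothetical, and the files are expected to follow the default id / label / left_ / right_ column naming):

import deepmatcher as dm

# Hypothetical directory and file names; adjust to the actual dataset layout.
train, validation, test = dm.data.process(
    path='sample_data',
    train='train.csv',
    validation='validation.csv',
    test='test.csv',
    embeddings='fasttext.en.bin',
    pca=True,
)

Each returned object is a MatchingDataset for the corresponding split.
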
Example 6
def process_unlabeled(path, trained_model, ignore_columns=None):
    """Creates a dataset object for an unlabeled dataset.

    Args:
        path (str): The full path to the unlabeled data file (not just the directory).
        trained_model (:class:`~deepmatcher.MatchingModel`): The trained model.
            The model is aware of the configuration of the training
            data on which it was trained, and so this method reuses the same
            configuration for the unlabeled data.
        ignore_columns (list): A list of columns to ignore in the unlabeled CSV file.
            If None, defaults to the columns ignored during training.

    Returns:
        MatchingDataset: Dataset containing the unlabeled examples.
    """
    with io.open(path, encoding="utf8") as f:
        header = next(unicode_csv_reader(f))

    train_info = trained_model.meta
    if ignore_columns is None:
        ignore_columns = train_info.ignore_columns
    column_naming = dict(train_info.column_naming)
    column_naming["label"] = None

    fields = _make_fields(
        header,
        column_naming["id"],
        column_naming["label"],
        ignore_columns,
        train_info.lowercase,
        train_info.tokenize,
        train_info.include_lengths,
    )

    begin = timer()
    dataset_args = {"fields": fields, "column_naming": column_naming}
    dataset = MatchingDataset(path=path, **dataset_args)

    # Make sure we have the same attributes.
    assert set(dataset.all_text_fields) == set(train_info.all_text_fields)

    after_load = timer()
    logger.info("Data load time: {}s".format(after_load - begin))

    # Map each Field object back to its attribute name so vocabs can be copied over.
    reverse_fields_dict = {pair[1]: pair[0] for pair in fields}
    for field, name in reverse_fields_dict.items():
        if field is not None and field.use_vocab:
            # Copy over vocab from original train data.
            field.vocab = copy.deepcopy(train_info.vocabs[name])
            # Then extend the vocab.
            field.extend_vocab(
                dataset,
                vectors=train_info.embeddings,
                cache=train_info.embeddings_cache,
            )

    dataset.vocabs = {
        name: dataset.fields[name].vocab
        for name in train_info.all_text_fields
    }

    after_vocab = timer()
    logger.info("Vocab update time: {}s".format(after_vocab - after_load))

    return dataset
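
A hedged usage sketch for process_unlabeled, assuming the deepmatcher package layout and that load_state / run_prediction behave as in the deepmatcher documentation (file names below are placeholders):

import deepmatcher as dm

# Restore a trained model; its metadata drives how the unlabeled CSV is processed.
model = dm.MatchingModel()
model.load_state('best_model.pth')  # hypothetical saved state file

unlabeled = dm.data.process_unlabeled(
    path='sample_data/unlabeled.csv',  # full path to the unlabeled CSV file
    trained_model=model,
)
predictions = model.run_prediction(unlabeled)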