Example #1
def test_multiple_output_for_input_list(expected_values):
    lower_case_name_field = Field("Lowercase_name", keep_raw=True)
    lower_case_name_field.add_pretokenize_hook(str.lower)

    upper_case_name_field = Field("Uppercase_name", keep_raw=True)
    upper_case_name_field.add_pretokenize_hook(str.upper)

    test_field_list = list(field_list)

    test_field_list[0] = (
        test_field_list[0],
        lower_case_name_field,
        upper_case_name_field,
    )

    example_factory = ExampleFactory(test_field_list)
    example = example_factory.from_list(expected_values)

    raw, tokenized = example["Name"]
    assert raw == expected_values[0]
    assert tokenized == expected_values[0].split()

    raw, tokenized = example["Lowercase_name"]
    assert raw == expected_values[0].lower()
    assert tokenized == expected_values[0].lower().split()

    raw, tokenized = example["Uppercase_name"]
    assert raw == expected_values[0].upper()
    assert tokenized == expected_values[0].upper().split()

    raw, tokenized = example["Score"]
    assert raw == expected_values[1]

    raw, tokenized = example["Favorite_food"]
    assert raw == expected_values[2]
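
The tuple assigned to test_field_list[0] above is how one input column is mapped to several output Fields at once. A minimal standalone sketch of the same pattern (field names and the input value are hypothetical):

name_fields = (
    Field("name", keep_raw=True),
    Field("name_upper", keep_raw=True, pretokenize_hooks=(str.upper,)),
)
factory = ExampleFactory([name_fields])
example = factory.from_list(["Mark Dark"])
# Both Fields receive the same input column; assuming the default "split"
# tokenizer:
#   example["name"]       -> ("Mark Dark", ["Mark", "Dark"])
#   example["name_upper"] -> ("MARK DARK", ["MARK", "DARK"])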
Example #2
def get_dataset():
    data = [
        {
            "Name": "Mark Dark",
            "Score": 5
        },
        {
            "Name": "Stephen Smith",
            "Score": 10
        },
        {
            "Name": "Ann Mann",
            "Score": 15
        },
    ]

    name_field = Field("Name",
                       numericalizer=Vocab(),
                       keep_raw=True,
                       tokenizer="split")

    score_field = Field("Score",
                        numericalizer=int,
                        keep_raw=True,
                        tokenizer=None,
                        is_target=True)

    fields = {"Name": name_field, "Score": score_field}

    example_factory = ExampleFactory(fields)
    examples = [example_factory.from_dict(data_) for data_ in data]

    ds = Dataset(examples, fields)
    ds.finalize_fields()
    return ds
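
Once finalize_fields() has been called, the dataset returned above is ready for batching. A minimal usage sketch, assuming Podium's Iterator (used the same way in Example #22 below) and the field names defined in get_dataset():

ds = get_dataset()
for batch in Iterator(ds, batch_size=2, shuffle=False):
    # Each batch exposes one attribute per field, holding the numericalized
    # values for the examples in that batch.
    print(batch.Name, batch.Score)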
Example #3
def test_cache_data_field_from_dict(expected_values):
    example_factory = ExampleFactory(field_dict)
    example = example_factory.from_dict(expected_values)

    for field in field_dict.values():
        field_name = field.name

        assert field_name in example
        assert hasattr(example, field_name)
Example #4
def test_ignore_values_dict(expected_values):
    fields = {"Name": name_field}
    example_factory = ExampleFactory(fields)
    example = example_factory.from_dict(expected_values)

    assert "Name" in example
    assert hasattr(example, "Name")

    raw, _ = example["Name"]
    assert raw == expected_values["Name"]
Example #5
def test_ignore_values_list(expected_values):
    fields = [None, None, favorite_food_field]
    example_factory = ExampleFactory(fields)
    example = example_factory.from_list(expected_values)

    assert "Favorite_food" in example
    assert hasattr(example, "Favorite_food")

    raw, _ = example["Favorite_food"]
    assert raw == expected_values[2]
Example #6
def dataset_with_upper_field(fields):
    upper_name_field = Field("upper_name",
                             pretokenize_hooks=(str.upper, ),
                             numericalizer=Vocab())
    fields = [fields[0], upper_name_field]
    example_factory = ExampleFactory(fields)
    examples = [example_factory.from_list(e) for e in TEST_DATA]
    ds = Dataset(examples, fields)
    ds.finalize_fields()
    return ds
Example #7
def test_from_dataset(data, fields):
    example_factory = ExampleFactory(fields)
    examples = [example_factory.from_list(raw_example) for raw_example in data]
    dataset = Dataset(examples, fields)
    pyarrow_dataset = DiskBackedDataset.from_dataset(dataset)

    for ds_ex, arrow_ex in zip(dataset, pyarrow_dataset):
        assert ds_ex.number == arrow_ex.number
        assert ds_ex.tokens == arrow_ex.tokens

    pyarrow_dataset.delete_cache()
Example #8
def test_concat_view_fail_no_field_intersection(dataset):
    upper_name_field = Field("upper_name",
                             pretokenize_hooks=(str.upper, ),
                             numericalizer=Vocab())
    fields = [None, upper_name_field]
    example_factory = ExampleFactory(fields)
    examples = [example_factory.from_list(e) for e in TEST_DATA]
    other_dataset = Dataset(examples, fields)
    other_dataset.finalize_fields()

    with pytest.raises(ValueError):
        DatasetConcatView([dataset, other_dataset])
Example #9
def test_create_from_list(expected_values):
    example_factory = ExampleFactory(field_list)
    example = example_factory.from_list(expected_values)

    raw, tokenized = example["Name"]
    assert tokenized == expected_values[0].split()

    raw, tokenized = example["Score"]
    assert raw == expected_values[1]

    raw, tokenized = example["Favorite_food"]
    assert raw == expected_values[2]
Example #10
def test_create_from_tsv(expected_values, example_tsv_string):
    example_factory = ExampleFactory(field_list)
    example = example_factory.from_csv(example_tsv_string, delimiter="\t")

    raw, tokenized = example["Name"]
    assert raw == expected_values[0]
    assert tokenized == expected_values[0].split()

    raw, tokenized = example["Score"]
    assert int(raw) == expected_values[1]

    raw, tokenized = example["Favorite_food"]
    assert raw == expected_values[2]
Example #11
def test_create_from_json_string(expected_values, example_json_string):
    example_factory = ExampleFactory(field_dict)
    example = example_factory.from_json(example_json_string)

    raw, tokenized = example["Name"]
    assert raw == expected_values["Name"]
    assert tokenized == expected_values["Name"].split()

    raw, tokenized = example["Score"]
    assert raw == expected_values["Score"]

    raw, tokenized = example["Favorite_food"]
    assert raw == expected_values["Favorite_food"]
Example #12
def test_from_examples(data, fields):
    example_factory = ExampleFactory(fields)
    examples = [example_factory.from_list(ex) for ex in data]
    ad = DiskBackedDataset.from_examples(fields, examples)

    for (raw, tokenized), (num, _) in zip(ad.number, data):
        assert raw == num
        assert tokenized == num

    for (raw, tokenized), (_, tok) in zip(ad.tokens, data):
        assert raw == tok
        assert tokenized == tok.split(" ")

    ad.delete_cache()
Example #13
    def create_dataset():
        fields = (
            Field("text", numericalizer=Vocab()),
            Field("source", numericalizer=Vocab(), tokenizer=list),
        )
        example_factory = ExampleFactory(fields)

        examples = [
            example_factory.from_list(data)
            for data in zip(TABULAR_TEXT, TABULAR_SOURCES)
        ]

        dataset = Dataset(examples, fields)
        return dataset
Example #14
def test_text_clean_up(kwargs, data, expected_output):
    pytest.importorskip("cleantext")

    field = Field(name="data", tokenizer=None, keep_raw=True)
    field.add_pretokenize_hook(TextCleanUp(**kwargs))
    example = ExampleFactory([field]).from_list([data])

    assert expected_output == example["data"][1]
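
The test above is parametrized, and the parameter values are not part of this listing. One hypothetical parameter triple, assuming the replace_url option of TextCleanUp used in Example #25:

kwargs = {"replace_url": "<URL>"}
data = "docs live at https://github.com"
expected_output = "docs live at <URL>"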
Example #15
def test_remove_stopwords():
    data = "I'll tell you a joke"
    field = Field(name="data")
    field.add_posttokenize_hook(remove_stopwords("en"))
    example = ExampleFactory([field]).from_list([data])

    assert "you" not in example["data"][1]
    assert "a" not in example["data"][1]
Example #16
def test_regex_replace():
    data = "This item costs 100$."
    field = Field(name="data", tokenizer=None, keep_raw=True)
    regex_replace = RegexReplace([(r"\d+", "<NUMBER>"), (r"\s+", "<WHITESPACE>")])
    field.add_pretokenize_hook(regex_replace)
    example = ExampleFactory([field]).from_list([data])

    expected_raw = "This<WHITESPACE>item<WHITESPACE>costs<WHITESPACE><NUMBER>$."
    assert expected_raw == example["data"][1]
Example #17
def test_truecase():
    pytest.importorskip("truecase")

    data = "hey how are you"
    field = Field(name="data", tokenizer=None, keep_raw=True)
    field.add_pretokenize_hook(truecase())
    example = ExampleFactory([field]).from_list([data])

    assert "Hey how are you" == example["data"][0]
Example #18
def test_keyword_extractor(alg, alg_pkg_name):
    pytest.importorskip(alg_pkg_name)

    field = Field(name="data", tokenizer=None, keep_raw=True)
    field.add_posttokenize_hook(KeywordExtractor(alg))
    example = ExampleFactory([field]).from_list([TEXT])

    # make sure all the keywords originate from the raw data
    text_ = TEXT.lower()
    assert all(kw in text_ for kws in example["data"][1] for kw in kws.lower().split())
Example #19
def test_delete_cache(data, fields):
    cache_dir = tempfile.mkdtemp()

    example_factory = ExampleFactory(fields)
    examples = map(example_factory.from_list, data)
    ad = DiskBackedDataset.from_examples(fields, examples, cache_path=cache_dir)

    assert os.path.exists(cache_dir)
    ad.delete_cache()
    assert not os.path.exists(cache_dir)
Example #20
def test_moses_normalizer():
    pytest.importorskip("sacremoses")

    data = "What's    up!"
    field = Field(name="data", tokenizer=None, keep_raw=True)
    normalizer = MosesNormalizer()
    field.add_pretokenize_hook(normalizer)
    example = ExampleFactory([field]).from_list([data])

    assert "What's up!" == example["data"][1]
Example #21
def test_concat_view_override_fields_eager(dataset, fields):
    upper_name_field = Field("name",
                             pretokenize_hooks=(str.upper, ),
                             numericalizer=Vocab())
    other_fields = [fields[0], upper_name_field]
    example_factory = ExampleFactory(other_fields)
    examples = [example_factory.from_list(e) for e in TEST_DATA]
    other_dataset = Dataset(examples, other_fields)
    other_dataset.finalize_fields()

    new_field = Field("override_name_field", numericalizer=Vocab(eager=True))
    dataset_concat = DatasetConcatView([dataset, other_dataset],
                                       field_overrides={"name": new_field})

    assert dataset_concat.field_dict["override_name_field"].is_finalized

    concat_vocab = dataset_concat.field_dict["override_name_field"].vocab
    dataset_vocab = dataset.field_dict["name"].vocab
    other_vocab = other_dataset.field_dict["name"].vocab
    assert set(concat_vocab.itos) == set(dataset_vocab.itos) | set(other_vocab.itos)
Example #22
def test_iterator_batch_as_list():
    raw_dataset = [("1 2 3 4",), ("2 3 4",), ("3 4",)]
    field = Field(
        "test_field", numericalizer=int, tokenizer="split", disable_batch_matrix=True
    )
    fields = (field,)
    ef = ExampleFactory(fields)
    examples = [ef.from_list(raw_example) for raw_example in raw_dataset]
    ds = Dataset(examples, fields)

    for i, batch in enumerate(Iterator(ds, batch_size=2, shuffle=False)):
        assert isinstance(batch.test_field, list)
        field_batch = batch.test_field
        if i == 0:
            assert len(field_batch) == 2
            assert np.all(field_batch[0] == [1, 2, 3, 4])
            assert np.all(field_batch[1] == [2, 3, 4])

        if i == 1:  # second, final batch holds the single remaining example
            assert len(field_batch) == 1
            assert np.all(field_batch[0] == [3, 4])
Example #23
def test_missing_datatype_exception(data, fields, tmpdir):
    data_null = [(*d, None) for d in data]
    null_field = Field(
        "null_field", keep_raw=True, allow_missing_data=True, numericalizer=Vocab()
    )
    fields_null = [*fields, null_field]

    exf = ExampleFactory(fields_null)
    examples = map(exf.from_list, data_null)

    with pytest.raises(RuntimeError):
        DiskBackedDataset.from_examples(fields_null, examples, cache_path=tmpdir)
Example #24
def test_lemmatization_and_stemming(hook):
    # we need this to postpone initialization
    # in pytest.mark.parametrize
    if inspect.isfunction(hook):
        hook = hook()

    data = "stemming playing books"
    field = Field(name="data")
    field.add_posttokenize_hook(hook)
    example = ExampleFactory([field]).from_list([data])

    # we don't check the exact results,
    # instead we expect some modifications
    assert data != example["data"][1]
Example #25
def test_hook_conversion():
    field = Field(name="data", tokenizer="split", keep_raw=True)
    text_clean_up_hook = TextCleanUp(replace_url="<URL>")

    assert text_clean_up_hook.__hook_type__ == HookType.PRETOKENIZE
    with pytest.raises(ValueError):
        field.add_posttokenize_hook(text_clean_up_hook)

    text_clean_up_hook = as_posttokenize_hook(text_clean_up_hook)
    assert text_clean_up_hook.__hook_type__ == HookType.POSTTOKENIZE

    field.add_posttokenize_hook(text_clean_up_hook)

    data = "url to github is https://github.com"
    example = ExampleFactory([field]).from_list([data])

    assert example["data"][1] == ["url", "to", "github", "is", "<URL>"]
Example #26
def test_datatype_definition(data, fields):
    data_null = [(*d, None) for d in data]
    null_field = Field(
        "null_field", keep_raw=True, allow_missing_data=True, numericalizer=Vocab()
    )
    fields_null = [*fields, null_field]

    exf = ExampleFactory(fields_null)
    examples = map(exf.from_list, data_null)

    datatypes = {"null_field": (pa.string(), pa.list_(pa.string()))}
    dataset = DiskBackedDataset.from_examples(fields_null, examples, data_types=datatypes)

    for ex, d in zip(dataset, data_null):
        assert int(ex["number"][0]) == d[0]
        assert ex["tokens"][0] == d[1]

    dataset.delete_cache()
Example #27
def test_from_format():
    list_example_factory = ExampleFactory(field_list)

    list_data = ["Mark Dark", 5, "Hawaiian pizza"]
    example = list_example_factory.from_format(list_data, ExampleFormat.LIST)

    assert example["Name"][0] == list_data[0]
    assert example["Score"][0] == list_data[1]
    assert example["Favorite_food"][0] == list_data[2]

    dict_example_factory = ExampleFactory(field_dict)
    dict_data = {
        "Name": "Mark Dark",
        "Score": 5,
        "Favorite_food": "Hawaiian pizza"
    }

    example = dict_example_factory.from_format(dict_data, ExampleFormat.DICT)
    assert example["Name"][0] == dict_data["Name"]
    assert example["Score"][0] == dict_data["Score"]
    assert example["Favorite_food"][0] == dict_data["Favorite_food"]
Example #28
def dataset(fields) -> DatasetBase:
    example_factory = ExampleFactory(fields)
    examples = [example_factory.from_list(e) for e in TEST_DATA]
    ds = Dataset(examples, fields)
    ds.finalize_fields()
    return ds
Example #29
def pyarrow_dataset(data, fields):
    example_factory = ExampleFactory(fields)
    examples = map(example_factory.from_list, data)
    return DiskBackedDataset.from_examples(fields, examples)
Example #30
    def __init__(
        self,
        fields: Union[Dict, List],
        example_format: Union[ExampleFormat, str],
        model: Union[AbstractSupervisedModel, Type[AbstractSupervisedModel]],
        trainer: AbstractTrainer = None,
        feature_transformer: Union[FeatureTransformer,
                                   Callable[[NamedTuple], np.ndarray]] = None,
        label_transform_fn: Callable[[NamedTuple], np.ndarray] = None,
        output_transform_fn: Callable[[np.ndarray], Any] = None,
    ):
        """
        Creates a new pipeline instance.

        Parameters
        ----------
        fields : dict or list of fields
            Fields used to process raw data. Can be either a dict mapping column names
            to Fields (or tuples of Fields), or a list of Fields (or tuples of Fields).
            A Field value of None means the corresponding column will
            be ignored.

        example_format: ExampleFormat
            Format of expected raw examples.

        model : class or model instance
            Class of the model to be fitted, or a pre-trained model instance.
            If a pre-trained model is passed and `fit` is called, a new model
            instance will be created. To fine-tune the passed model instance,
            call `partial_fit`. Must be a subclass of Podium's
            `AbstractSupervisedModel`.

        trainer: AbstractTrainer, Optional
            Trainer used to fit the model. If provided, this trainer instance will be
            stored in the pipeline and used as the default trainer if no trainer is
            provided in the `fit` and `partial_fit` methods.

        feature_transformer : Union[FeatureTransformer, Callable[[NamedTuple], np.ndarray]]
            FeatureTransformer that transforms the input part of the batch returned by the
            iterator into features that can be fed into the model. Will also be fitted
            during Experiment fitting.
            A callable taking an input batch and returning a numpy array of features can
            also be passed.
            If None, a default feature transformer that returns a single feature from
            the batch will be used. In this case the Dataset used in training must contain
            a single input field.

        label_transform_fn : Callable[[NamedTuple], np.ndarray]
            Callable that transforms the target part of the batch returned by the
            iterator into the same format as the model's predictions. For a
            hypothetical perfect model, its prediction for some examples would be
            identical to the result of this callable for those same examples.
            If None, a default label transformer that returns a single target from
            the batch will be used. In this case the Dataset used in training must
            contain a single target field.

        output_transform_fn: Callable[[np.ndarray], Any]
            Callable used to transform the prediction result of the model.

        Raises
        ------
        TypeError
            If `example_format` is LIST, CSV or NLTK and `fields` is not a
            list or a tuple.
            If `example_format` is DICT, XML or JSON and `fields` is not a dict.
        """
        if isinstance(example_format, ExampleFormat):
            example_format = example_format.value

        if example_format in (
                ExampleFormat.LIST.value,
                ExampleFormat.CSV.value,
                ExampleFormat.NLTK.value,
        ):
            if not isinstance(fields, (list, tuple)):
                raise TypeError(
                    "If `example format` is LIST, CSV or NLTK, `fields` "
                    "must be either a list or tuple. "
                    f"Type of `fields`: {type(fields)}")
        elif not isinstance(fields, dict):
            raise TypeError(
                "If `example format` is DICT, XML or JSON, `fields` "
                "must be a dict. "
                f"Type of `fields`: {type(fields)}")

        if isinstance(fields, (list, tuple)):
            feature_field_dict = _filter_feature_fields(
                {k: v for k, v in enumerate(fields)}
            )
            self.feature_fields = list(feature_field_dict.values())
        else:
            self.feature_fields = _filter_feature_fields(fields)

        self.all_fields = fields

        self.example_format = example_format

        self.training_example_factory = ExampleFactory(self.all_fields)
        self.prediction_example_factory = ExampleFactory(self.feature_fields)

        self.output_transform_fn = output_transform_fn

        super().__init__(
            model,
            feature_transformer=feature_transformer,
            trainer=trainer,
            label_transform_fn=label_transform_fn,
        )
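
For context, a hedged construction sketch for the constructor documented above, assuming it belongs to Podium's Pipeline class; MyModel is a hypothetical AbstractSupervisedModel subclass and the field definitions are illustrative:

fields = {
    "Name": Field("Name", numericalizer=Vocab()),
    "Score": Field("Score", numericalizer=int, tokenizer=None, is_target=True),
}
pipeline = Pipeline(
    fields=fields,
    example_format=ExampleFormat.DICT,  # raw examples arrive as dicts
    model=MyModel,
)
# Fit on a Dataset first, then fine-tune the stored model with partial_fit,
# as described in the docstring above.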