Example #1
# Assumed imports for the snippets on this page; module paths may differ
# across podium versions (view/arrow classes live in podium.datasets submodules).
import numpy as np
import pytest

from podium import Field, Iterator, Vocab
from podium.datasets import Dataset, ExampleFactory


def get_dataset():
    data = [
        {
            "Name": "Mark Dark",
            "Score": 5
        },
        {
            "Name": "Stephen Smith",
            "Score": 10
        },
        {
            "Name": "Ann Mann",
            "Score": 15
        },
    ]

    name_field = Field("Name",
                       numericalizer=Vocab(),
                       keep_raw=True,
                       tokenizer="split")

    score_field = Field("Score",
                        numericalizer=int,
                        keep_raw=True,
                        tokenizer=None,
                        is_target=True)

    fields = {"Name": name_field, "Score": score_field}

    example_factory = ExampleFactory(fields)
    examples = [example_factory.from_dict(data_) for data_ in data]

    ds = Dataset(examples, fields)
    ds.finalize_fields()
    return ds
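
The fixture above builds a small in-memory Dataset: "Name" is split on whitespace and numericalized through a Vocab, while "Score" is numericalized directly with `int` and marked as the target. A quick usage sketch (examples are indexed by field name, just as the view tests below do):

ds = get_dataset()
print(len(ds))        # 3
print(ds[0]["Name"])  # stored value of the "Name" field for the first example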
Example #2
def dataset_with_upper_field(fields):
    upper_name_field = Field("upper_name",
                             pretokenize_hooks=(str.upper,),
                             numericalizer=Vocab())
    fields = [fields[0], upper_name_field]
    example_factory = ExampleFactory(fields)
    examples = [example_factory.from_list(e) for e in TEST_DATA]
    ds = Dataset(examples, fields)
    ds.finalize_fields()
    return ds
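
Here `pretokenize_hooks=(str.upper,)` runs each raw value through `str.upper` before tokenization, so the "upper_name" field and its Vocab see upper-cased text (e.g. "Mark Dark" would become "MARK DARK"), while the field reused from `fields[0]` is left untouched.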
Example #3
def test_concat_view_fail_no_field_intersection(dataset):
    upper_name_field = Field("upper_name",
                             pretokenize_hooks=(str.upper,),
                             numericalizer=Vocab())
    # `None` skips the first data column, so the only field here is
    # "upper_name", which `dataset` does not share, so concatenation raises.
    fields = [None, upper_name_field]
    example_factory = ExampleFactory(fields)
    examples = [example_factory.from_list(e) for e in TEST_DATA]
    other_dataset = Dataset(examples, fields)
    other_dataset.finalize_fields()

    with pytest.raises(ValueError):
        DatasetConcatView([dataset, other_dataset])
Example #4
    def predict_raw(self, raw_example: Any, **kwargs) -> np.ndarray:
        """
        Computes the model's prediction for a single example. The example
        must be in the format provided to the constructor as the
        `example_format` parameter.

        Parameters
        ----------
        raw_example: Any
            Example to compute the prediction for.

        kwargs
            Keyword arguments passed to the model's `predict` method.

        Returns
        -------
        ndarray
            Tensor containing the prediction for the example.
        """
        processed_example = self.prediction_example_factory.from_format(
            raw_example, self.example_format)
        ds = Dataset([processed_example], self.feature_fields)
        prediction = self.predict(ds, **kwargs)
        # Indexed with 0 to extract the single prediction from the prediction batch
        prediction = prediction[0]
        if self.output_transform_fn is not None:
            return self.output_transform_fn(prediction)
        return prediction
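
`predict_raw` wraps one raw example into a single-element Dataset, delegates to `predict`, and unpacks the one-row batch, so calling it is a one-liner. A hedged sketch (the `experiment` instance and the dict-formatted example are illustrative assumptions):

# Hypothetical usage; assumes a fitted instance whose example_format is "dict".
prediction = experiment.predict_raw({"Name": "Ann Mann"})
print(prediction)  # ndarray holding the prediction for this single example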
Example #5
    def __getitem__(self, i):
        raw_examples = self.dataset[i]

        # Index or slice
        if isinstance(i, int):
            return self._example_factory.from_dict(raw_examples)
        else:
            # A slice of a HuggingFace datasets.Dataset is one dictionary
            # mapping each column name to a list of values. To convert it
            # to podium Examples, unpack it into one dict per example.
            raw_examples = [
                dict(zip(raw_examples, values))
                for values in zip(*raw_examples.values())
            ]

            # Map each raw example to a Podium example
            examples = [
                self._example_factory.from_dict(raw_example)
                for raw_example in raw_examples
            ]

            # Cast to a dataset
            return Dataset(examples, self.fields, sort_key=None)
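
The two branches give `__getitem__` a NumPy-like contract: an integer yields a single podium Example, while a slice yields a new in-memory Dataset. A minimal sketch (`wrapper` stands in for an instance of this class):

single = wrapper[0]    # one podium Example
subset = wrapper[0:2]  # a podium Dataset with two examples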
Example #6
    def fit_raw(
        self,
        examples: Iterable[Union[Dict, List]],
        model_kwargs: Dict = None,
        trainer_kwargs: Dict = None,
        feature_transformer: FeatureTransformer = None,
        trainer: AbstractTrainer = None,
    ):
        """
        Fits the model to the provided examples. During fitting, the provided
        Iterator and Trainer are used. Each example must be in the format
        provided to the constructor as the `example_format` parameter.

        Parameters
        ----------
        examples : Iterable[Union[Dict, List]]
            Examples that will be used in fitting.

        model_kwargs : dict
            Dict containing model arguments. Arguments passed to the model are the default
            arguments defined with `set_default_model_args` updated/overridden by
            model_kwargs.

        trainer_kwargs : dict
            Dict containing trainer arguments. Arguments passed to the trainer are the
            default arguments defined with `set_default_trainer_args` updated/overridden
            by `trainer_kwargs`.

        feature_transformer : FeatureTransformer, Optional
            FeatureTransformer that transforms the input part of the batch returned by the
            iterator into features that can be fed into the model. Will also be fitted
            during Experiment fitting.
            If None, the default FeatureTransformer provided in the constructor will be
            used. Otherwise, this will overwrite the default feature transformer.

        trainer : AbstractTrainer, Optional
            Trainer used to fit the model. If None, the trainer provided in the
            constructor will be used.
        """
        processed_examples = [
            self.training_example_factory.from_format(ex, self.example_format)
            for ex in examples
        ]
        ds = Dataset(processed_examples, self.all_fields)
        self.fit(
            ds,
            model_kwargs=model_kwargs,
            trainer_kwargs=trainer_kwargs,
            feature_transformer=feature_transformer,
            trainer=trainer,
        )
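
The raw examples only need to match the constructor's `example_format`; fields, trainer and iterator all come from the Experiment itself. A hedged sketch with list-formatted examples (the `experiment` object and the kwarg values are illustrative assumptions):

raw_examples = [["Mark Dark", 5], ["Ann Mann", 15]]
experiment.fit_raw(raw_examples, trainer_kwargs={"max_epoch": 5})  # hypothetical trainer kwarg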
Example #7
    def as_dataset(self) -> Dataset:
        """
        Convert the original HuggingFace dataset to a podium.Dataset.

        Returns
        -------
        podium.Dataset
            podium.Dataset instance.
        """
        return Dataset(list(self), self.fields)
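
Note that `list(self)` iterates the converter eagerly, so the entire HuggingFace split is materialized in memory at once; for large splits, converting the result with `DiskBackedDataset.from_dataset` (see Example #8) keeps the data on disk instead.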
Example #8
def test_from_dataset(data, fields):
    example_factory = ExampleFactory(fields)
    examples = [example_factory.from_list(raw_example) for raw_example in data]
    dataset = Dataset(examples, fields)
    pyarrow_dataset = DiskBackedDataset.from_dataset(dataset)

    for ds_ex, arrow_ex in zip(dataset, pyarrow_dataset):
        assert ds_ex.number == arrow_ex.number
        assert ds_ex.tokens == arrow_ex.tokens

    pyarrow_dataset.delete_cache()
Example #9
def test_concat_view_override_fields_eager(dataset, fields):
    upper_name_field = Field("name",
                             pretokenize_hooks=(str.upper,),
                             numericalizer=Vocab())
    other_fields = [fields[0], upper_name_field]
    example_factory = ExampleFactory(other_fields)
    examples = [example_factory.from_list(e) for e in TEST_DATA]
    other_dataset = Dataset(examples, other_fields)
    other_dataset.finalize_fields()

    new_field = Field("override_name_field", numericalizer=Vocab(eager=True))
    dataset_concat = DatasetConcatView([dataset, other_dataset],
                                       field_overrides={"name": new_field})

    assert dataset_concat.field_dict["override_name_field"].is_finalized

    concat_vocab = dataset_concat.field_dict["override_name_field"].vocab
    dataset_vocab = dataset.field_dict["name"].vocab
    other_vocab = other_dataset.field_dict["name"].vocab
    expected_itos = set(dataset_vocab.itos) | set(other_vocab.itos)
    assert set(concat_vocab.itos) == expected_itos
Example #10
def test_slice_view_to_dataset(dataset, tmp_path):
    start, stop, step = 3, 8, 2
    slc = slice(start, stop, step)
    dataset_view = DatasetSlicedView(dataset, s=slc)

    # cast to Dataset
    ds = Dataset.from_dataset(dataset_view)
    assert isinstance(ds, Dataset)
    assert len(ds) == len(dataset_view)
    for ex_view, ex_dataset in zip(dataset_view, ds):
        for f in dataset.fields:
            assert ex_view[f.name] == ex_dataset[f.name]

    # cast to DiskBackedDataset
    ds = DiskBackedDataset.from_dataset(dataset_view, cache_path=tmp_path)
    assert isinstance(ds, DiskBackedDataset)
    assert len(ds) == len(dataset_view)
    for ex_view, ex_dataset in zip(dataset_view, ds):
        for f in dataset.fields:
            assert ex_view[f.name] == ex_dataset[f.name]
Example #11
def test_iterator_batch_as_list():
    raw_dataset = [("1 2 3 4",), ("2 3 4",), ("3 4",)]
    field = Field(
        "test_field", numericalizer=int, tokenizer="split", disable_batch_matrix=True
    )
    fields = (field,)
    ef = ExampleFactory(fields)
    examples = [ef.from_list(raw_example) for raw_example in raw_dataset]
    ds = Dataset(examples, fields)

    for i, batch in enumerate(Iterator(ds, batch_size=2, shuffle=False)):
        assert isinstance(batch.test_field, list)
        field_batch = batch.test_field
        if i == 0:
            assert len(field_batch) == 2
            assert np.all(field_batch[0] == [1, 2, 3, 4])
            assert np.all(field_batch[1] == [2, 3, 4])

        # With 3 examples and batch_size=2 there are only two batches,
        # so the remainder batch is at index 1.
        if i == 1:
            assert len(field_batch) == 1
            assert np.all(field_batch[0] == [3, 4])
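
The list assertions hold because `disable_batch_matrix=True` makes the iterator return each field batch as a plain Python list of per-example arrays rather than one padded matrix, which is how rows of different lengths ([1, 2, 3, 4] next to [2, 3, 4]) can share a batch.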
Example #12
    def partial_fit_raw(
        self,
        examples: Iterable[Union[Dict, List]],
        trainer_kwargs: Dict = None,
        trainer: AbstractTrainer = None,
    ):
        """
        Fits the model to the data without resetting the model. Each example
        must be in the format provided to the constructor as the
        `example_format` parameter.

        Parameters
        ----------
        examples: Iterable[Union[Dict, List]]
            Iterable of examples in raw state.

        trainer_kwargs : dict
            Dict containing trainer arguments. Arguments passed to the trainer are the
            default arguments defined with `set_default_trainer_args` updated/overridden
            by `trainer_kwargs`.

        trainer: AbstractTrainer, Optional
            Trainer used to fit the model. If None, the trainer provided in the
            constructor will be used.
        """
        processed_examples = [
            self.training_example_factory.from_format(ex, self.example_format)
            for ex in examples
        ]
        ds = Dataset(processed_examples, self.all_fields)
        self.partial_fit(dataset=ds,
                         trainer_kwargs=trainer_kwargs,
                         trainer=trainer)
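
A sketch of incremental training on chunks of raw examples (the `experiment` object and `raw_example_chunks` are hypothetical; unlike `fit_raw`, the model state is kept between calls):

for chunk in raw_example_chunks:  # hypothetical iterable of raw-example chunks
    experiment.partial_fit_raw(chunk)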
Example #13
def dataset(fields) -> DatasetBase:
    example_factory = ExampleFactory(fields)
    examples = [example_factory.from_list(e) for e in TEST_DATA]
    ds = Dataset(examples, fields)
    ds.finalize_fields()
    return ds