Example #1
def test_shuffle_random_state(tabular_dataset):
    random.seed(5)  # seed the global RNG; the iterator's internal state is independent of it
    # run first iterator for 3 epochs
    iterator = Iterator(dataset=tabular_dataset, batch_size=2, shuffle=True)
    run_n_epochs(iterator, 3)

    # get first iterator's internal state
    state = iterator.get_internal_random_state()

    random.seed(6)  # reseed the global RNG; the iterator's internal state is independent of it

    # initialize second iterator with the state
    iterator_2 = Iterator(
        dataset=tabular_dataset, batch_size=2, shuffle=True, internal_random_state=state
    )

    # run both iterators for 2 epochs
    run_n_epochs(iterator, 2)
    random.seed(8)  # reseed the global RNG; the iterator's internal state is independent of it
    run_n_epochs(iterator_2, 2)

    # the iterators should behave identically
    assert iterators_behave_identically(iterator, iterator_2)

    iterator_3 = Iterator(dataset=tabular_dataset, batch_size=2, shuffle=True)
    iterator_3.set_internal_random_state(iterator_2.get_internal_random_state())

    # the iterators should behave identically
    assert iterators_behave_identically(iterator_2, iterator_3)
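
The helpers run_n_epochs and iterators_behave_identically are defined elsewhere in the test module. A minimal sketch of what they are assumed to do (the names match the calls above; the bodies are assumptions):

def run_n_epochs(iterator, num_epochs):
    # exhaust the iterator num_epochs times, advancing its internal random state
    for _ in range(num_epochs):
        for _ in iterator:
            pass


def iterators_behave_identically(iterator_1, iterator_2):
    # compare the batches yielded by both iterators, field batch by field batch
    for batch_1, batch_2 in zip(iterator_1, iterator_2):
        for field_batch_1, field_batch_2 in zip(batch_1, batch_2):
            if not np.all(field_batch_1 == field_batch_2):
                return False
    return True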
Example #2
def test_shuffle_deterministic_sequence(
    seed_1,
    seed_2,
    num_epochs_1,
    num_epochs_2,
    expect_identical_behaviour,
    tabular_dataset,
):
    random.seed(42)  # seed the global RNG; the iterator's internal state is independent of it

    iterator = Iterator(dataset=tabular_dataset, batch_size=2, shuffle=True, seed=seed_1)
    run_n_epochs(iterator, num_epochs_1)  # iterate for num_epochs_1 epochs

    random.seed(43)  # reseed the global RNG; the iterator's internal state is independent of it

    iterator_2 = Iterator(
        dataset=tabular_dataset, batch_size=2, shuffle=True, seed=seed_2
    )
    run_n_epochs(iterator_2, num_epochs_2)  # iterate for num_epochs_2 epochs

    random.seed(44)  # reseed the global RNG; the iterator's internal state is independent of it

    if expect_identical_behaviour:
        assert iterators_behave_identically(iterator, iterator_2)
    else:
        # Beware: for some combinations of seeds and epoch counts the
        # iterators might still behave identically. For the chosen
        # combination they don't.
        assert not iterators_behave_identically(iterator, iterator_2)
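
The seed_1, seed_2, num_epochs_1, num_epochs_2 and expect_identical_behaviour arguments are injected by a pytest.mark.parametrize decorator that this listing omits. A plausible sketch (the concrete values are assumptions):

@pytest.mark.parametrize(
    "seed_1, seed_2, num_epochs_1, num_epochs_2, expect_identical_behaviour",
    [
        (1, 1, 2, 2, True),   # same seed, same number of epochs -> identical
        (1, 2, 2, 2, False),  # different seeds -> different shuffles
        (1, 1, 2, 3, False),  # same seed, diverged epoch counts -> diverged state
    ],
)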
Example #3
def test_shuffle_random_state_exception(tabular_dataset):
    iterator = Iterator(dataset=tabular_dataset, batch_size=2, shuffle=False)

    with pytest.raises(RuntimeError):
        iterator.get_internal_random_state()

    iterator_2 = Iterator(dataset=tabular_dataset, batch_size=2, shuffle=True)
    state = iterator_2.get_internal_random_state()

    with pytest.raises(RuntimeError):
        iterator.set_internal_random_state(state)
Example #4
def test_simple_trainer_batch_transform_call(tabular_dataset, mocker, model):  # noqa
    iterator = Iterator(tabular_dataset, batch_size=len(tabular_dataset))
    batch = next(iter(iterator))

    mocker.patch(
        "tests.experimental.models.test_simple_trainers.mock_feature_transform_fun",
        return_value=batch["text"],
    )
    mocker.patch(
        "tests.experimental.models.test_simple_trainers.mock_label_transform_fun",
        return_value=batch["rating"],
    )

    feature_transformer = FeatureTransformer(mock_feature_transform_fun)
    trainer = SimpleTrainer()
    trainer.train(
        model=model,
        dataset=tabular_dataset,
        iterator=iterator,
        feature_transformer=feature_transformer,
        label_transform_fun=mock_label_transform_fun,
        **{trainer.MAX_EPOCH_KEY: 10},
    )
    assert mock_feature_transform_fun.call_count == 10  # pylint: disable=E1101
    assert mock_label_transform_fun.call_count == 10  # pylint: disable=E1101
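
mock_feature_transform_fun and mock_label_transform_fun are module-level functions in tests.experimental.models.test_simple_trainers; mocker.patch swaps those module attributes for MagicMocks, so both the FeatureTransformer and the call_count assertions above operate on the mocks rather than the originals. A sketch of the patched targets (the bodies are assumptions):

def mock_feature_transform_fun(x_batch):
    return x_batch


def mock_label_transform_fun(y_batch):
    return y_batch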
Example #5
def test_not_numericalizable_field(json_file_path):
    class MockCustomDataClass:
        def __init__(self, data):
            self.data = data

    def custom_datatype_tokenizer(data):
        return MockCustomDataClass(data)

    fields = tabular_dataset_fields()
    text_field = fields["text_with_missing_data"]
    non_numericalizable_field = Field(
        "non_numericalizable_field",
        tokenizer=custom_datatype_tokenizer,
        numericalizer=None,
        allow_missing_data=True,
        keep_raw=True,
    )

    fields["text_with_missing_data"] = (text_field, non_numericalizable_field)

    dataset = create_tabular_dataset_from_json(fields, json_file_path)

    with pytest.warns(UserWarning):
        for batch in Iterator(dataset, batch_size=len(dataset), shuffle=False):
            assert isinstance(batch.non_numericalizable_field, (list, tuple))
            for i, batch_data, real_data in zip(
                range(len(dataset)), batch.non_numericalizable_field, TABULAR_TEXT
            ):
                if i == 3:
                    assert batch_data is None
                else:
                    assert isinstance(batch_data, MockCustomDataClass)
                    assert batch_data.data == real_data
Example #6
def test_eager_tokenization():
    def create_dataset():
        fields = (
            Field("text", numericalizer=Vocab()),
            Field("source", numericalizer=Vocab(), tokenizer=list),
        )
        example_factory = ExampleFactory(fields)

        examples = [
            example_factory.from_list(data)
            for data in zip(TABULAR_TEXT, TABULAR_SOURCES)
        ]

        dataset = Dataset(examples, fields)
        return dataset

    dataset_lazy = create_dataset()
    dataset_eager = create_dataset()

    dataset_eager.finalize_fields()
    # Numericalize eagerly
    dataset_eager.numericalize_examples()

    dataset_lazy.finalize_fields()
    # Numericalize Lazily
    for _ in Iterator(dataset_lazy, 100):
        pass

    for example_eager, example_lazy in zip(dataset_eager, dataset_lazy):
        assert example_eager["text_"] is not None
        assert all(example_eager["text_"] == example_lazy.text_)

        assert example_eager["source_"] is not None
        assert all(example_eager["source_"] == example_lazy.source_)
Example #7
def test_shuffle_no_seed_or_state_exception(tabular_dataset):
    with pytest.raises(ValueError):
        Iterator(
            dataset=tabular_dataset,
            batch_size=2,
            shuffle=True,
            seed=None,
            internal_random_state=None,
        )
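
Supplying either a seed or a saved internal_random_state (as in Examples #1 and #2) avoids the ValueError:

# either of these constructions is valid when shuffle=True
# (state obtained via get_internal_random_state, see Example #1)
Iterator(dataset=tabular_dataset, batch_size=2, shuffle=True, seed=42)
Iterator(
    dataset=tabular_dataset, batch_size=2, shuffle=True, internal_random_state=state
)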
Example #8
def test_lazy_numericalization_caching(tabular_dataset):
    # Run one epoch to cause lazy numericalization
    for _ in Iterator(dataset=tabular_dataset, batch_size=10):
        pass

    # Test if cached data is equal to numericalized data
    for example in tabular_dataset:
        for field in tabular_dataset.fields:
            example_data = example[field.name]
            numericalized_data = field.numericalize(example_data)

            cached_data = example[f"{field.name}_"]
            assert np.all(numericalized_data == cached_data)
Example #9
def test_simple_trainer_num_epoch(tabular_dataset, model):  # noqa
    iterator = Iterator(batch_size=len(tabular_dataset))
    trainer = SimpleTrainer()
    feature_transformer = FeatureTransformer(lambda x: x)
    trainer.train(
        model=model,
        dataset=tabular_dataset,
        iterator=iterator,
        feature_transformer=feature_transformer,
        label_transform_fun=lambda y: y,
        **{trainer.MAX_EPOCH_KEY: 10},
    )
    assert model.fit.call_count == 10
Example #10
    def predict(self,
                dataset: Dataset,
                batch_size: int = 128,
                **kwargs) -> np.ndarray:
        """
        Computes the prediction of the model for every example in the provided
        dataset.

        Parameters
        ----------
        dataset : Dataset
            Dataset to compute predictions for.

        batch_size : int
            If None, predictions for the whole dataset are computed in a single
            batch. Otherwise, predictions are computed in batches of size
            batch_size. This is useful when the whole dataset can't be
            processed in a single batch.

        kwargs
            Keyword arguments passed to the model's `predict` method

        Returns
        -------
        ndarray
            Tensor containing predictions for examples in the passed Dataset.
        """
        # TODO: a new method of providing examples must be defined.
        # Examples are taken in dataset form as a proof of concept.
        self._check_if_model_exists()

        y = []
        prediction_key = AbstractSupervisedModel.PREDICTION_KEY

        if batch_size is None:
            x_batch_tensor = self.feature_transformer.transform(
                dataset.batch())
            batch_prediction = self.model.predict(x_batch_tensor, **kwargs)
            prediction_tensor = batch_prediction[prediction_key]
            return prediction_tensor
        else:
            prediction_iterator = Iterator(batch_size=batch_size,
                                           shuffle=False)

            for batch in prediction_iterator(dataset):
                x_batch_tensor = self.feature_transformer.transform(batch)
                batch_prediction = self.model.predict(x_batch_tensor, **kwargs)
                prediction_tensor = batch_prediction[prediction_key]
                y.append(prediction_tensor)

            return np.concatenate(y)
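
A minimal usage sketch of predict; experiment stands for a fitted instance of the surrounding class and test_dataset for a held-out Dataset (both hypothetical):

# predictions computed in batches of 32 examples
predictions = experiment.predict(test_dataset, batch_size=32)
assert isinstance(predictions, np.ndarray)

# batch_size=None pushes the whole dataset through the model at once
predictions = experiment.predict(test_dataset, batch_size=None)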
Example #11
def test_caching_disabled(cache_disabled_tabular_dataset):
    # Run one epoch to cause lazy numericalization
    for _ in Iterator(dataset=cache_disabled_tabular_dataset, batch_size=10):
        pass

    cache_disabled_fields = [
        f for f in cache_disabled_tabular_dataset.fields if f.disable_numericalize_caching
    ]
    # Test if cached data is equal to numericalized data
    for example in cache_disabled_tabular_dataset:
        for field in cache_disabled_fields:
            cache_field_name = f"{field.name}_"
            numericalization = example.get(cache_field_name)
            assert numericalization is None
Example #12
def test_iterate_new_epoch(tabular_dataset):
    iterator = Iterator(dataset=tabular_dataset, batch_size=2)

    it = iter(iterator)
    assert iterator._iterations == 0

    for i in range(1, 5):
        next(it)
        assert iterator._epoch == 0
        assert iterator._iterations == i

    with pytest.raises(StopIteration):
        next(it)

    assert iterator._epoch == 1
    assert iterator._iterations == 0
Example #13
def test_include_lengths(length_included_tabular_dataset):
    iterator = Iterator(
        dataset=length_included_tabular_dataset, batch_size=2, shuffle=False
    )

    # Since we're not shuffling, this shouldn't change
    expected_batch_lengths = [[3, 1], [4, 1], [2, 3], [6]]

    for batch, expected_batch_length in zip(iterator, expected_batch_lengths):
        text, lengths = batch.text
        # Should contain same number of instances
        assert lengths.shape[0] == text.shape[0]
        # Number of columns should be equal to max length
        assert max(lengths) == text.shape[-1]
        # Check that expected lengths match
        assert np.array_equal(lengths, expected_batch_length)
Example #14
def test_iterator_missing_data_in_batch(json_file_path):
    missing_data_default_value = -99
    fields = tabular_dataset_fields()
    missing_value_field = Field(
        "missing_value_field",
        tokenizer="split",
        numericalizer=Vocab(),
        allow_missing_data=True,
        keep_raw=True,
        missing_data_token=missing_data_default_value,
    )
    fields["text_with_missing_data"] = missing_value_field
    ds = create_tabular_dataset_from_json(fields, json_file_path)

    for batch in Iterator(ds, batch_size=len(ds), shuffle=False):
        # test if the value we know is missing is correctly filled out
        missing_value_row = batch.missing_value_field[3]
        assert np.all(missing_value_row == missing_data_default_value)
Example #15
def test_create_batch(tabular_dataset):
    expected_row_lengths = [3, 4, 3, 6]

    batch_size = 2
    iterator = Iterator(dataset=tabular_dataset, batch_size=batch_size, shuffle=False)

    iter_len = len(iterator)
    assert iter_len == 4
    for i, (batch, expected_row_length) in enumerate(zip(iterator, expected_row_lengths)):
        assert hasattr(batch, "text") and hasattr(batch, "rating")

        assert batch.text.shape[1] == expected_row_length
        assert batch.rating.shape[1] == 1

        if (i + 1) == iter_len:
            assert batch.text.shape[0] == 1
            assert batch.rating.shape[0] == 1
        else:
            assert batch.text.shape[0] == batch_size
            assert batch.rating.shape[0] == batch_size
Example #16
def test_iterator_batch_as_list():
    raw_dataset = [("1 2 3 4",), ("2 3 4",), ("3 4",)]
    field = Field(
        "test_field", numericalizer=int, tokenizer="split", disable_batch_matrix=True
    )
    fields = (field,)
    ef = ExampleFactory(fields)
    examples = [ef.from_list(raw_example) for raw_example in raw_dataset]
    ds = Dataset(examples, fields)

    for i, batch in enumerate(Iterator(ds, batch_size=2, shuffle=False)):
        assert isinstance(batch.test_field, list)
        field_batch = batch.test_field
        if i == 0:
            assert len(field_batch) == 2
            assert np.all(field_batch[0] == [1, 2, 3, 4])
            assert np.all(field_batch[1] == [2, 3, 4])

        if i == 1:  # the final batch holds the single remaining example
            assert len(field_batch) == 1
            assert np.all(field_batch[0] == [3, 4])
Example #17
def test_sort_key(length_included_tabular_dataset):
    def text_len_sort_key(example):
        tokens = example["text"][1]
        if tokens is None:
            return 0
        else:
            return -len(tokens)

    iterator = Iterator(
        dataset=length_included_tabular_dataset,
        batch_size=2,
        sort_key=text_len_sort_key,
        shuffle=False,
    )

    # Since we're not shuffling, this shouldn't change
    expected_batch_lengths = [[3, 1], [4, 1], [3, 2], [6]]

    for batch, expected_batch_length in zip(iterator, expected_batch_lengths):
        text, lengths = batch.text
        assert np.array_equal(lengths, expected_batch_length)
Example #18
def test_padding(fixed_length, expected_shape, json_file_path):
    fields = tabular_dataset_fields(fixed_length=fixed_length)
    ds = create_tabular_dataset_from_json(fields=fields, json_file_path=json_file_path)

    batch_size = 7

    iterator = Iterator(dataset=ds, batch_size=batch_size, shuffle=False)

    batch = next(iter(iterator))

    assert batch.text.shape == expected_shape

    pad_symbol = fields["text"].vocab.get_padding_index()

    for i, row in enumerate(batch.text):
        if TABULAR_TEXT[i] is None:
            # if missing data
            continue

        n_el = len(TABULAR_TEXT[i].split())

        assert (row[:n_el].astype(np.int32) != pad_symbol).all()
        assert (row[n_el:].astype(np.int32) == pad_symbol).all()
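
fixed_length and expected_shape are injected by a pytest.mark.parametrize decorator omitted from this listing. A sketch with hypothetical values, assuming the seven-example dataset used throughout (longest text: 6 tokens, cf. Example #15):

@pytest.mark.parametrize(
    "fixed_length, expected_shape",
    [
        (None, (7, 6)),  # pad to the longest example in the batch
        (5, (7, 5)),     # pad/truncate every example to exactly 5 tokens
    ],
)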
Example #19
def train_multilabel_svm(
    dataset_path,
    param_grid,
    cutoff,
    n_outer_splits=5,
    n_inner_splits=3,
    n_jobs=1,
    is_verbose=True,
    include_classes_with_no_train_examples=False,
    include_classes_with_no_test_examples=False,
):
    """
    Trains the multilabel SVM model on a given dataset instance.

    Parameters
    ----------
    dataset_path : str
        Path to the instance of EuroVoc dataset stored as a dill file.
    param_grid : dict or list(dict)
        Dictionary with parameter names (string) as keys and lists of parameter
        settings to try as values, or a list of such dictionaries, in which case
        the grids spanned by each dictionary in the list are explored. This
        enables searching over any sequence of parameter settings. For more
        information, refer to
        https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
    cutoff : int
        If the number of positive training examples for a class is less than the
        cutoff, no model is trained for that class and the index of the label is
        added to the missing model indexes.
    n_outer_splits : int
        Number of splits in the outer loop of the nested cross-validation.
    n_inner_splits : int
        Number of splits in the inner loop of the nested cross-validation.
    n_jobs : int
        Number of threads to be used.
    is_verbose : boolean
        If set to True, scores on the test set are printed for each fold of the
        outer loop in the nested cross-validation.
    include_classes_with_no_train_examples : boolean
        If True, scores of the classes with an insufficient number of training
        examples (less than the specified cutoff) are included when calculating
        the general scores. Note that this only makes sense if the cutoff is 1,
        because that means classes with no training examples will be taken into
        consideration.
    include_classes_with_no_test_examples : boolean
        If True, scores for classes with no positive instances in the test set are
        included in the general score.
    """
    with open(dataset_path, "rb") as input_file:
        dataset = dill.load(input_file)

    vectorizer = TfIdfVectorizer()
    vectorizer.fit(dataset, dataset.field_dict["text"])

    outer_cv = KFold(n_splits=n_outer_splits, shuffle=True, random_state=0)

    micro_P = []
    micro_R = []
    micro_F1 = []
    macro_P = []
    macro_R = []
    macro_F1 = []

    for train_indices, test_indices in outer_cv.split(dataset):
        # KFold yields index arrays; index the dataset to obtain the actual splits
        train = dataset[train_indices]
        train_iter = Iterator(dataset=train, batch_size=len(train))
        clf = MultilabelSVM()
        for X, Y in train_iter:
            X = vectorizer.transform(X.text)
            Y = get_label_matrix(Y)

            clf.fit(X, Y, parameter_grid=param_grid, cutoff=cutoff, n_jobs=n_jobs)

        test = dataset[test_indices]
        test_iter = Iterator(dataset=test, batch_size=len(test))
        for X, Y in test_iter:
            X = vectorizer.transform(X.text)
            Y = get_label_matrix(Y)
            prediction_dict = clf.predict(X)
            Y_pred = prediction_dict[AbstractSupervisedModel.PREDICTION_KEY]

            if not include_classes_with_no_train_examples:
                Y_pred = np.delete(
                    Y_pred, list(clf.get_indexes_of_missing_models()), axis=1
                )
                Y = np.delete(Y, list(clf.get_indexes_of_missing_models()), axis=1)

            # delete all-zero columns (labels without any positive examples
            # in the current test set)
            if not include_classes_with_no_test_examples:
                cols = ~(Y == 0).all(axis=0)
                Y = Y[:, cols]
                Y_pred = Y_pred[:, cols]

            micro_P.append(precision_score(Y, Y_pred, average="micro"))
            micro_R.append(recall_score(Y, Y_pred, average="micro"))
            micro_F1.append(f1_score(Y, Y_pred, average="micro"))

            macro_P.append(precision_score(Y, Y_pred, average="macro"))
            macro_R.append(recall_score(Y, Y_pred, average="macro"))
            macro_F1.append(f1_score(Y, Y_pred, average="macro"))

            if is_verbose:
                print("Scores on test set:")
                print("micro P", micro_P[-1])
                print("micro R", micro_R[-1])
                print("micro F1", micro_F1[-1])
                print("macro P", macro_P[-1])
                print("macro R", macro_R[-1])
                print("macro F1", macro_F1[-1])

    print("Average scores on test sets:")

    print("average micro P", np.average(micro_P))
    print("average micro R", np.average(micro_R))
    print("average micro F1", np.average(micro_F1))

    print("average macro P", np.average(macro_P))
    print("average macro R", np.average(macro_R))
    print("average macro F1", np.average(macro_F1))
Example #20
def test_len(batch_size, expected_len, tabular_dataset):
    iterator = Iterator(dataset=tabular_dataset, batch_size=batch_size)

    assert len(iterator) == expected_len
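
batch_size and expected_len come from an omitted pytest.mark.parametrize decorator. Hypothetical values consistent with a seven-example dataset, since len(iterator) == ceil(len(dataset) / batch_size):

@pytest.mark.parametrize(
    "batch_size, expected_len",
    [(1, 7), (2, 4), (3, 3), (7, 1)],
)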