def test_shuffle_random_state(tabular_dataset):
    random.seed(5)  # internal random state independent from global seed

    # run the first iterator for 3 epochs
    iterator = Iterator(dataset=tabular_dataset, batch_size=2, shuffle=True)
    run_n_epochs(iterator, 3)

    # get the first iterator's internal state
    state = iterator.get_internal_random_state()

    random.seed(6)  # internal random state independent from global seed

    # initialize the second iterator with the state
    iterator_2 = Iterator(
        dataset=tabular_dataset,
        batch_size=2,
        shuffle=True,
        internal_random_state=state,
    )

    # run both iterators for 2 epochs
    run_n_epochs(iterator, 2)

    random.seed(8)  # internal random state independent from global seed

    run_n_epochs(iterator_2, 2)

    # the iterators should behave identically
    assert iterators_behave_identically(iterator, iterator_2)

    iterator_3 = Iterator(dataset=tabular_dataset, batch_size=2, shuffle=True)
    iterator_3.set_internal_random_state(iterator_2.get_internal_random_state())

    # the iterators should behave identically
    assert iterators_behave_identically(iterator_2, iterator_3)

def test_shuffle_deterministic_sequence(
    seed_1,
    seed_2,
    num_epochs_1,
    num_epochs_2,
    expect_identical_behaviour,
    tabular_dataset,
):
    random.seed(42)  # internal random state independent from global seed

    iterator = Iterator(dataset=tabular_dataset, batch_size=2, shuffle=True, seed=seed_1)
    run_n_epochs(iterator, num_epochs_1)  # iterate for num_epochs_1 epochs

    random.seed(43)  # internal random state independent from global seed

    iterator_2 = Iterator(
        dataset=tabular_dataset, batch_size=2, shuffle=True, seed=seed_2
    )
    run_n_epochs(iterator_2, num_epochs_2)  # iterate for num_epochs_2 epochs

    random.seed(44)  # internal random state independent from global seed

    if expect_identical_behaviour:
        assert iterators_behave_identically(iterator, iterator_2)
    else:
        # Beware: for some combinations of different seeds and numbers of
        # epochs the iterators might actually behave identically.
        # For the chosen combinations they don't.
        assert not iterators_behave_identically(iterator, iterator_2)

def test_shuffle_random_state_exception(tabular_dataset):
    iterator = Iterator(dataset=tabular_dataset, batch_size=2, shuffle=False)

    with pytest.raises(RuntimeError):
        iterator.get_internal_random_state()

    iterator_2 = Iterator(dataset=tabular_dataset, batch_size=2, shuffle=True)
    state = iterator_2.get_internal_random_state()

    with pytest.raises(RuntimeError):
        iterator.set_internal_random_state(state)

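# Note: the shuffle tests above rely on two helpers, run_n_epochs and
# iterators_behave_identically, whose definitions are not part of this
# section. A minimal sketch of what they are assumed to do, based purely on
# how they are used here (the dict-style batch access is an assumption):


def run_n_epochs(iterator, num_epochs):
    # Exhausting the iterator advances its internal random state once
    # per epoch.
    for _ in range(num_epochs):
        for _ in iterator:
            pass


def iterators_behave_identically(iterator_1, iterator_2):
    # Two iterators behave identically if they yield equal batches in the
    # same order for a full epoch.
    for batch_1, batch_2 in zip(iterator_1, iterator_2):
        for value_1, value_2 in zip(batch_1.values(), batch_2.values()):
            if not np.array_equal(value_1, value_2):
                return False
    return True
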
def test_simple_trainer_batch_transform_call(tabular_dataset, mocker, model):  # noqa
    iterator = Iterator(tabular_dataset, batch_size=len(tabular_dataset))
    batch = next(iter(iterator))

    mocker.patch(
        "tests.experimental.models.test_simple_trainers.mock_feature_transform_fun",
        return_value=batch["text"],
    )
    mocker.patch(
        "tests.experimental.models.test_simple_trainers.mock_label_transform_fun",
        return_value=batch["rating"],
    )

    feature_transformer = FeatureTransformer(mock_feature_transform_fun)
    trainer = SimpleTrainer()
    trainer.train(
        model=model,
        dataset=tabular_dataset,
        iterator=iterator,
        feature_transformer=feature_transformer,
        label_transform_fun=mock_label_transform_fun,
        **{trainer.MAX_EPOCH_KEY: 10},
    )

    assert mock_feature_transform_fun.call_count == 10  # pylint: disable=E1101
    assert mock_label_transform_fun.call_count == 10  # pylint: disable=E1101

def test_not_numericalizable_field(json_file_path):
    class MockCustomDataClass:
        def __init__(self, data):
            self.data = data

    def custom_datatype_tokenizer(data):
        return MockCustomDataClass(data)

    fields = tabular_dataset_fields()
    text_field = fields["text_with_missing_data"]
    non_numericalizable_field = Field(
        "non_numericalizable_field",
        tokenizer=custom_datatype_tokenizer,
        numericalizer=None,
        allow_missing_data=True,
        keep_raw=True,
    )

    fields["text_with_missing_data"] = (text_field, non_numericalizable_field)

    dataset = create_tabular_dataset_from_json(fields, json_file_path)

    with pytest.warns(UserWarning):
        for batch in Iterator(dataset, batch_size=len(dataset), shuffle=False):
            assert isinstance(batch.non_numericalizable_field, (list, tuple))
            for i, batch_data, real_data in zip(
                range(len(dataset)), batch.non_numericalizable_field, TABULAR_TEXT
            ):
                if i == 3:
                    assert batch_data is None
                else:
                    assert isinstance(batch_data, MockCustomDataClass)
                    assert batch_data.data == real_data

def test_eager_tokenization():
    def create_dataset():
        fields = (
            Field("text", numericalizer=Vocab()),
            Field("source", numericalizer=Vocab(), tokenizer=list),
        )
        example_factory = ExampleFactory(fields)

        examples = [
            example_factory.from_list(data)
            for data in zip(TABULAR_TEXT, TABULAR_SOURCES)
        ]
        dataset = Dataset(examples, fields)
        return dataset

    dataset_lazy = create_dataset()
    dataset_eager = create_dataset()

    dataset_eager.finalize_fields()
    # Numericalize eagerly
    dataset_eager.numericalize_examples()

    dataset_lazy.finalize_fields()
    # Numericalize lazily
    for _ in Iterator(dataset_lazy, 100):
        pass

    for example_eager, example_lazy in zip(dataset_eager, dataset_lazy):
        assert example_eager["text_"] is not None
        assert all(example_eager["text_"] == example_lazy.text_)
        assert example_eager["source_"] is not None
        assert all(example_eager["source_"] == example_lazy.source_)

def test_shuffle_no_seed_or_state_exception(tabular_dataset):
    with pytest.raises(ValueError):
        Iterator(
            dataset=tabular_dataset,
            batch_size=2,
            shuffle=True,
            seed=None,
            internal_random_state=None,
        )

def test_lazy_numericalization_caching(tabular_dataset):
    # Run one epoch to cause lazy numericalization
    for _ in Iterator(dataset=tabular_dataset, batch_size=10):
        pass

    # Test if cached data is equal to numericalized data
    for example in tabular_dataset:
        for field in tabular_dataset.fields:
            example_data = example[field.name]
            numericalized_data = field.numericalize(example_data)

            cached_data = example[f"{field.name}_"]
            assert np.all(numericalized_data == cached_data)

def test_simple_trainer_num_epoch(tabular_dataset, model):  # noqa
    iterator = Iterator(batch_size=len(tabular_dataset))
    trainer = SimpleTrainer()
    feature_transformer = FeatureTransformer(lambda x: x)

    trainer.train(
        model=model,
        dataset=tabular_dataset,
        iterator=iterator,
        feature_transformer=feature_transformer,
        label_transform_fun=lambda y: y,
        **{trainer.MAX_EPOCH_KEY: 10},
    )

    assert model.fit.call_count == 10

def predict(self, dataset: Dataset, batch_size: int = 128, **kwargs) -> np.ndarray:
    """
    Computes the prediction of the model for every example in the provided
    dataset.

    Parameters
    ----------
    dataset : Dataset
        Dataset to compute predictions for.
    batch_size : int
        If None, predictions for the whole dataset will be computed in a
        single batch. Otherwise, predictions will be computed in batches of
        size `batch_size`. This argument is useful when the whole dataset
        can't be processed in a single batch.
    kwargs
        Keyword arguments passed to the model's `predict` method.

    Returns
    -------
    ndarray
        Tensor containing predictions for the examples in the passed dataset.
    """
    # TODO: a new method of providing examples must be defined.
    # Examples are taken in dataset form as a proof of concept.
    self._check_if_model_exists()

    y = []
    prediction_key = AbstractSupervisedModel.PREDICTION_KEY

    if batch_size is None:
        # Predict over the whole dataset in a single batch
        x_batch_tensor = self.feature_transformer.transform(dataset.batch())
        batch_prediction = self.model.predict(x_batch_tensor, **kwargs)
        prediction_tensor = batch_prediction[prediction_key]
        return prediction_tensor
    else:
        # Predict in batches and concatenate the results
        prediction_iterator = Iterator(batch_size=batch_size, shuffle=False)

        for batch in prediction_iterator(dataset):
            x_batch_tensor = self.feature_transformer.transform(batch)
            batch_prediction = self.model.predict(x_batch_tensor, **kwargs)
            prediction_tensor = batch_prediction[prediction_key]
            y.append(prediction_tensor)

        return np.concatenate(y)

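# A hypothetical usage sketch of predict (the `experiment` instance and
# `test_dataset` below are assumed, not defined in this section):
# batch_size=None runs a single prediction over the whole dataset, while an
# integer batch_size trades peak memory for multiple smaller predict calls.
#
#     predictions = experiment.predict(test_dataset, batch_size=64)
#     predictions_single_batch = experiment.predict(test_dataset, batch_size=None)
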
def test_caching_disabled(cache_disabled_tabular_dataset):
    # Run one epoch to cause lazy numericalization
    for _ in Iterator(dataset=cache_disabled_tabular_dataset, batch_size=10):
        pass

    cache_disabled_fields = [
        f
        for f in cache_disabled_tabular_dataset.fields
        if f.disable_numericalize_caching
    ]

    # Test that no numericalized data was cached for cache-disabled fields
    for example in cache_disabled_tabular_dataset:
        for field in cache_disabled_fields:
            cache_field_name = f"{field.name}_"
            numericalization = example.get(cache_field_name)
            assert numericalization is None

def test_iterate_new_epoch(tabular_dataset):
    iterator = Iterator(dataset=tabular_dataset, batch_size=2)

    it = iter(iterator)
    assert iterator._iterations == 0

    for i in range(1, 5):
        next(it)
        assert iterator._epoch == 0
        assert iterator._iterations == i

    with pytest.raises(StopIteration):
        next(it)

    assert iterator._epoch == 1
    assert iterator._iterations == 0

def test_include_lengths(length_included_tabular_dataset):
    iterator = Iterator(
        dataset=length_included_tabular_dataset, batch_size=2, shuffle=False
    )

    # Since we're not shuffling, this shouldn't change
    expected_batch_lengths = [[3, 1], [4, 1], [2, 3], [6]]

    for batch, expected_batch_length in zip(iterator, expected_batch_lengths):
        text, lengths = batch.text
        # Should contain the same number of instances
        assert lengths.shape[0] == text.shape[0]
        # The number of columns should equal the max length
        assert max(lengths) == text.shape[-1]
        # Check that the expected lengths match
        assert np.array_equal(lengths, expected_batch_length)

def test_iterator_missing_data_in_batch(json_file_path):
    missing_data_default_value = -99
    fields = tabular_dataset_fields()
    missing_value_field = Field(
        "missing_value_field",
        tokenizer="split",
        numericalizer=Vocab(),
        allow_missing_data=True,
        keep_raw=True,
        missing_data_token=missing_data_default_value,
    )
    fields["text_with_missing_data"] = missing_value_field

    ds = create_tabular_dataset_from_json(fields, json_file_path)

    for batch in Iterator(ds, batch_size=len(ds), shuffle=False):
        # test if the value we know is missing is correctly filled out
        missing_value_row = batch.missing_value_field[3]
        assert np.all(missing_value_row == missing_data_default_value)

def test_create_batch(tabular_dataset):
    expected_row_lengths = [3, 4, 3, 6]

    batch_size = 2
    iterator = Iterator(dataset=tabular_dataset, batch_size=batch_size, shuffle=False)

    iter_len = len(iterator)
    assert iter_len == 4
    for i, (batch, expected_row_length) in enumerate(
        zip(iterator, expected_row_lengths)
    ):
        assert hasattr(batch, "text") and hasattr(batch, "rating")

        assert batch.text.shape[1] == expected_row_length
        assert batch.rating.shape[1] == 1

        if (i + 1) == iter_len:
            assert batch.text.shape[0] == 1
            assert batch.rating.shape[0] == 1
        else:
            assert batch.text.shape[0] == batch_size
            assert batch.rating.shape[0] == batch_size

def test_iterator_batch_as_list():
    raw_dataset = [("1 2 3 4",), ("2 3 4",), ("3 4",)]
    field = Field(
        "test_field", numericalizer=int, tokenizer="split", disable_batch_matrix=True
    )
    fields = (field,)
    ef = ExampleFactory(fields)
    examples = [ef.from_list(raw_example) for raw_example in raw_dataset]
    ds = Dataset(examples, fields)

    for i, batch in enumerate(Iterator(ds, batch_size=2, shuffle=False)):
        assert isinstance(batch.test_field, list)
        field_batch = batch.test_field
        if i == 0:
            assert len(field_batch) == 2
            assert np.all(field_batch[0] == [1, 2, 3, 4])
            assert np.all(field_batch[1] == [2, 3, 4])
        if i == 1:
            # three examples with batch_size 2 yield two batches; the second
            # (and last) batch holds the single leftover example
            assert len(field_batch) == 1
            assert np.all(field_batch[0] == [3, 4])

def test_sort_key(length_included_tabular_dataset):
    def text_len_sort_key(example):
        tokens = example["text"][1]
        if tokens is None:
            return 0
        else:
            return -len(tokens)

    iterator = Iterator(
        dataset=length_included_tabular_dataset,
        batch_size=2,
        sort_key=text_len_sort_key,
        shuffle=False,
    )

    # Since we're not shuffling, this shouldn't change
    expected_batch_lengths = [[3, 1], [4, 1], [3, 2], [6]]

    for batch, expected_batch_length in zip(iterator, expected_batch_lengths):
        text, lengths = batch.text
        assert np.array_equal(lengths, expected_batch_length)

def test_padding(fixed_length, expected_shape, json_file_path):
    fields = tabular_dataset_fields(fixed_length=fixed_length)
    ds = create_tabular_dataset_from_json(fields=fields, json_file_path=json_file_path)

    batch_size = 7

    iterator = Iterator(dataset=ds, batch_size=batch_size, shuffle=False)
    batch = next(iter(iterator))

    assert batch.text.shape == expected_shape

    pad_symbol = fields["text"].vocab.get_padding_index()

    for i, row in enumerate(batch.text):
        if TABULAR_TEXT[i] is None:
            # skip missing data
            continue

        n_el = len(TABULAR_TEXT[i].split())

        assert (row[:n_el].astype(np.int32) != pad_symbol).all()
        assert (row[n_el:].astype(np.int32) == pad_symbol).all()

def train_multilabel_svm(
    dataset_path,
    param_grid,
    cutoff,
    n_outer_splits=5,
    n_inner_splits=3,
    n_jobs=1,
    is_verbose=True,
    include_classes_with_no_train_examples=False,
    include_classes_with_no_test_examples=False,
):
    """
    Trains the multilabel SVM model on a given instance of the dataset.

    Parameters
    ----------
    dataset_path : str
        Path to the instance of the EuroVoc dataset stored as a dill file.
    param_grid : dict or list(dict)
        Dictionary with parameter names (string) as keys and lists of
        parameter settings to try as values, or a list of such dictionaries,
        in which case the grids spanned by each dictionary in the list are
        explored. This enables searching over any sequence of parameter
        settings. For more information, refer to
        https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
    cutoff : int
        If the number of positive training examples for a class is less than
        the cutoff, no model is trained for that class and the index of the
        label is added to the missing model indexes.
    n_outer_splits : int
        Number of splits in the outer loop of the nested cross-validation.
    n_inner_splits : int
        Number of splits in the inner loop of the nested cross-validation.
    n_jobs : int
        Number of threads to be used.
    is_verbose : boolean
        If True, scores on the test set are printed for each fold of the
        outer loop of the nested cross-validation.
    include_classes_with_no_train_examples : boolean
        If True, scores of the classes with an insufficient number of
        training examples (less than the specified cutoff) are included when
        calculating the general scores. Note that this only makes sense with
        cutoff=1, because then classes with no training examples are taken
        into consideration.
    include_classes_with_no_test_examples : boolean
        If True, scores for classes with no positive instances in the test
        set are included in the general score.
    """
    dataset = None
    with open(dataset_path, "rb") as input_file:
        dataset = dill.load(input_file)

    vectorizer = TfIdfVectorizer()
    vectorizer.fit(dataset, dataset.field_dict["text"])

    outer_cv = KFold(n_splits=n_outer_splits, shuffle=True, random_state=0)

    micro_P = []
    micro_R = []
    micro_F1 = []
    macro_P = []
    macro_R = []
    macro_F1 = []

    for train, test in outer_cv.split(dataset):
        train_iter = Iterator(dataset=train, batch_size=len(train))
        clf = MultilabelSVM()
        for X, Y in train_iter:
            X = vectorizer.transform(X.text)
            Y = get_label_matrix(Y)
            clf.fit(X, Y, parameter_grid=param_grid, cutoff=cutoff, n_jobs=n_jobs)

        test_iter = Iterator(dataset=test, batch_size=len(test))
        for X, Y in test_iter:
            X = vectorizer.transform(X.text)
            Y = get_label_matrix(Y)
            prediction_dict = clf.predict(X)
            Y_pred = prediction_dict[AbstractSupervisedModel.PREDICTION_KEY]

            if not include_classes_with_no_train_examples:
                Y_pred = np.delete(
                    Y_pred, list(clf.get_indexes_of_missing_models()), axis=1
                )
                Y = np.delete(Y, list(clf.get_indexes_of_missing_models()), axis=1)

            # delete all-zero columns (labels with no positive examples in
            # the current test set)
            if not include_classes_with_no_test_examples:
                cols = ~(Y == 0).all(axis=0)
                Y = Y[:, cols]
                Y_pred = Y_pred[:, cols]

            micro_P.append(precision_score(Y, Y_pred, average="micro"))
            micro_R.append(recall_score(Y, Y_pred, average="micro"))
            micro_F1.append(f1_score(Y, Y_pred, average="micro"))
            macro_P.append(precision_score(Y, Y_pred, average="macro"))
            macro_R.append(recall_score(Y, Y_pred, average="macro"))
            macro_F1.append(f1_score(Y, Y_pred, average="macro"))

            if is_verbose:
                print("Scores on the test set:")
                print("micro P", micro_P[-1])
                print("micro R", micro_R[-1])
                print("micro F1", micro_F1[-1])
                print("macro P", macro_P[-1])
                print("macro R", macro_R[-1])
                print("macro F1", macro_F1[-1])

    print("Average scores on the test sets:")
    print("average micro P", np.average(micro_P))
    print("average micro R", np.average(micro_R))
    print("average micro F1", np.average(micro_F1))
    print("average macro P", np.average(macro_P))
    print("average macro R", np.average(macro_R))
    print("average macro F1", np.average(macro_F1))

def test_len(batch_size, expected_len, tabular_dataset):
    iterator = Iterator(dataset=tabular_dataset, batch_size=batch_size)

    assert len(iterator) == expected_len