Example #1
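All listings below omit their imports. A minimal import header covering Examples #1-#5, assuming the Rasa 2.x module layout (Example #6 targets the older Rasa 1.x API and additionally needs SPARSE_FEATURE_NAMES and TEXT_ATTRIBUTE from rasa.nlu.constants):

from pathlib import Path
from typing import Optional, Text

import numpy as np
import pytest

from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
from rasa.nlu.featurizers.sparse_featurizer.count_vectors_featurizer import (
    CountVectorsFeaturizer,
)
from rasa.shared.nlu.constants import TEXT
from rasa.shared.nlu.training_data.message import Message
from rasa.shared.nlu.training_data.training_data import TrainingData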
def test_use_shared_vocab_exception(
    initial_train_text: Text,
    additional_train_text: Text,
    use_shared_vocab: bool,
    tmp_path: Path,
):
    """Tests if an exception is raised when `use_shared_vocab` is set to True
    during incremental training."""
    tk = WhitespaceTokenizer()
    initial_cvf = CountVectorsFeaturizer(
        component_config={"use_shared_vocab": use_shared_vocab}
    )
    train_message = Message(data={"text": initial_train_text})
    data = TrainingData([train_message])
    tk.train(data)
    initial_cvf.train(data)

    file_dict = initial_cvf.persist("ftr", tmp_path)
    meta = initial_cvf.component_config.copy()
    meta.update(file_dict)
    new_cvf = CountVectorsFeaturizer.load(meta, tmp_path, should_finetune=True)

    additional_train_message = Message(data={"text": additional_train_text})
    data = TrainingData([train_message, additional_train_message])
    tk.train(data)
    if use_shared_vocab:
        with pytest.raises(Exception) as exec_info:
            new_cvf.train(data)
        assert (
            "Using a shared vocabulary in `CountVectorsFeaturizer` is not supported"
            in str(exec_info.value)
        )
    else:
        new_cvf.train(data)
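The listing drops the pytest.mark.parametrize decorator that supplies initial_train_text, additional_train_text, and use_shared_vocab. A minimal sketch that would sit directly above the def, with hypothetical texts (any two training sentences work; the True case expects the exception, the False case trains normally):

@pytest.mark.parametrize(
    "initial_train_text, additional_train_text, use_shared_vocab",
    [
        ("rasa is great", "everything is awesome", True),
        ("rasa is great", "everything is awesome", False),
    ],
)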
Example #2
def test_cvf_incremental_train_vocabulary_overflow(tmp_path: Path):
    additional_size = 3
    original_train_text = "hello my name is John."
    additional_train_text = "I am also new."
    tokenizer = WhitespaceTokenizer()
    original_featurizer = CountVectorsFeaturizer(
        {"additional_vocabulary_size": {"text": additional_size}},
        finetune_mode=False,
    )
    train_message = Message(data={"text": original_train_text})
    data = TrainingData([train_message])

    tokenizer.train(data)
    original_featurizer.train(data)

    file_dict = original_featurizer.persist("ftr", str(tmp_path))

    # load original_featurizer
    meta = original_featurizer.component_config.copy()
    meta.update(file_dict)
    new_featurizer = CountVectorsFeaturizer.load(
        meta, str(tmp_path), should_finetune=True
    )

    additional_train_message = Message(data={"text": additional_train_text})
    data = TrainingData([train_message, additional_train_message])
    tokenizer.train(data)

    with pytest.warns(UserWarning) as warning:
        new_featurizer.train(data)
    assert "New data contains vocabulary of size" in warning[0].message.args[0]
Example #3
def test_cvf_incremental_train_vocabulary(
    additional_size: Optional[int],
    original_train_text: Text,
    additional_train_text: Text,
    total_vocabulary_size: int,
    remaining_buffer_size: int,
    tmp_path: Path,
):

    tokenizer = WhitespaceTokenizer()
    original_featurizer = CountVectorsFeaturizer(
        {"additional_vocabulary_size": {"text": additional_size}},
        finetune_mode=False,
    )
    train_message = Message(data={"text": original_train_text})
    data = TrainingData([train_message])

    tokenizer.train(data)
    original_featurizer.train(data)

    # Check total vocabulary size with buffer slots before finetuning
    original_vocabulary = original_featurizer.vectorizers["text"].vocabulary_
    assert len(original_vocabulary) == total_vocabulary_size

    file_dict = original_featurizer.persist("ftr", str(tmp_path))

    # load original_featurizer
    meta = original_featurizer.component_config.copy()
    meta.update(file_dict)
    new_featurizer = CountVectorsFeaturizer.load(
        meta, str(tmp_path), should_finetune=True
    )

    # Check total vocabulary size with buffer slots before finetuning
    assert len(new_featurizer.vectorizers["text"].vocabulary_) == total_vocabulary_size

    additional_train_message = Message(data={"text": additional_train_text})
    data = TrainingData([train_message, additional_train_message])
    tokenizer.train(data)
    new_featurizer.train(data)

    new_vocabulary = new_featurizer.vectorizers["text"].vocabulary_

    # Check total vocabulary size with buffer slots after finetuning
    assert len(new_vocabulary) == total_vocabulary_size

    # Check remaining buffer slots after finetuning
    assert (
        len(new_vocabulary) - new_featurizer._get_starting_empty_index(new_vocabulary)
        == remaining_buffer_size
    )

    # Check indices of original vocabulary haven't changed in the new vocabulary
    for vocab_token, vocab_index in original_vocabulary.items():
        if not vocab_token.startswith("buf_"):
            assert vocab_token in new_vocabulary
            assert new_vocabulary.get(vocab_token) == vocab_index
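Here too the parametrize decorator was stripped from the listing. A minimal sketch with hypothetical values, chosen so the arithmetic is consistent: total_vocabulary_size is the unique-token count of original_train_text plus the additional_size buffer, and remaining_buffer_size is whatever buffer is left after the new tokens are slotted in:

@pytest.mark.parametrize(
    "additional_size, original_train_text, additional_train_text, "
    "total_vocabulary_size, remaining_buffer_size",
    # 5 unique tokens + 3 buffer slots = 8; "I am new." adds 3 tokens -> 0 left
    [(3, "hello my name is John.", "I am new.", 8, 0)],
)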
Example #4
def test_cvf_incremental_training(
    initial_train_text: Text,
    additional_train_text: Text,
    initial_vocabulary_size: int,
    final_vocabulary_size: int,
    tmp_path: Path,
):
    tk = WhitespaceTokenizer()
    initial_cvf = CountVectorsFeaturizer()
    train_message = Message(data={"text": initial_train_text})
    data = TrainingData([train_message])

    tk.train(data)
    initial_cvf.train(data)

    # Check initial vocabulary size
    initial_vocab = initial_cvf.vectorizers["text"].vocabulary_
    assert len(initial_vocab) == initial_vocabulary_size

    # persist and load initial cvf
    file_dict = initial_cvf.persist("ftr", tmp_path)
    meta = initial_cvf.component_config.copy()
    meta.update(file_dict)
    new_cvf = CountVectorsFeaturizer.load(meta, tmp_path, should_finetune=True)

    # Check vocabulary size again
    assert len(new_cvf.vectorizers["text"].vocabulary_) == initial_vocabulary_size

    additional_train_message = Message(data={"text": additional_train_text})
    data = TrainingData([train_message, additional_train_message])
    tk.train(data)
    new_cvf.train(data)

    new_vocab = new_cvf.vectorizers["text"].vocabulary_

    # Check vocabulary size after finetuning
    assert len(new_vocab) == final_vocabulary_size

    # Check indices of initial vocabulary haven't changed in the new vocabulary
    for vocab_token, vocab_index in initial_vocab.items():
        assert vocab_token in new_vocab
        assert new_vocab.get(vocab_token) == vocab_index
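As above, the decorator feeding this test's parameters is missing from the listing. A hypothetical single case; the two size values must match the unique-token counts of the chosen texts:

@pytest.mark.parametrize(
    "initial_train_text, additional_train_text, "
    "initial_vocabulary_size, final_vocabulary_size",
    # "am I the coolest person?" -> 5 unique tokens; "no I am not" adds 2 more
    [("am I the coolest person?", "no I am not", 5, 7)],
)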
Example #5
def test_count_vector_featurizer_persist_load(tmp_path):

    # set non default values to config
    config = {
        "analyzer": "char",
        "strip_accents": "ascii",
        "stop_words": "stop",
        "min_df": 2,
        "max_df": 3,
        "min_ngram": 2,
        "max_ngram": 3,
        "max_features": 10,
        "lowercase": False,
    }
    train_ftr = CountVectorsFeaturizer(config)

    sentence1 = "ababab 123 13xc лаомтгцу sfjv oö aà"
    sentence2 = "abababalidcn 123123 13xcdc лаомтгцу sfjv oö aà"
    train_message1 = Message(data={"text": sentence1})
    train_message2 = Message(data={"text": sentence2})

    data = TrainingData([train_message1, train_message2])
    train_ftr.train(data)

    # persist featurizer
    file_dict = train_ftr.persist("ftr", str(tmp_path))
    train_vect_params = {
        attribute: vectorizer.get_params()
        for attribute, vectorizer in train_ftr.vectorizers.items()
    }

    # add trained vocabulary to vectorizer params
    for attribute, attribute_vect_params in train_vect_params.items():
        if hasattr(train_ftr.vectorizers[attribute], "vocabulary_"):
            train_vect_params[attribute].update(
                {"vocabulary": train_ftr.vectorizers[attribute].vocabulary_})

    # load featurizer
    meta = train_ftr.component_config.copy()
    meta.update(file_dict)
    test_ftr = CountVectorsFeaturizer.load(meta, str(tmp_path))
    test_vect_params = {
        attribute: vectorizer.get_params()
        for attribute, vectorizer in test_ftr.vectorizers.items()
    }

    assert train_vect_params == test_vect_params

    # check if vocabulary was loaded correctly
    assert hasattr(test_ftr.vectorizers[TEXT], "vocabulary_")

    test_message1 = Message(data={"text": sentence1})
    test_ftr.process(test_message1)
    test_message2 = Message(data={"text": sentence2})
    test_ftr.process(test_message2)

    test_seq_vec_1, test_sen_vec_1 = test_message1.get_sparse_features(TEXT, [])
    train_seq_vec_1, train_sen_vec_1 = train_message1.get_sparse_features(TEXT, [])
    test_seq_vec_2, test_sen_vec_2 = test_message2.get_sparse_features(TEXT, [])
    train_seq_vec_2, train_sen_vec_2 = train_message2.get_sparse_features(TEXT, [])

    # check that train features and test features after loading are the same
    assert np.all(test_seq_vec_1.features.toarray() == train_seq_vec_1.features.toarray())
    assert np.all(test_sen_vec_1.features.toarray() == train_sen_vec_1.features.toarray())
    assert np.all(test_seq_vec_2.features.toarray() == train_seq_vec_2.features.toarray())
    assert np.all(test_sen_vec_2.features.toarray() == train_sen_vec_2.features.toarray())
Example #6
def test_count_vector_featurizer_persist_load(tmpdir):

    # set non default values to config
    config = {
        "analyzer": "char",
        "token_pattern": r"(?u)\b\w+\b",
        "strip_accents": "ascii",
        "stop_words": "stop",
        "min_df": 2,
        "max_df": 3,
        "min_ngram": 2,
        "max_ngram": 3,
        "max_features": 10,
        "lowercase": False,
    }
    train_ftr = CountVectorsFeaturizer(config)

    sentence1 = "ababab 123 13xc лаомтгцу sfjv oö aà"
    sentence2 = "abababalidcn 123123 13xcdc лаомтгцу sfjv oö aà"
    train_message1 = Message(sentence1)
    train_message2 = Message(sentence2)

    data = TrainingData([train_message1, train_message2])
    train_ftr.train(data)

    # persist featurizer
    file_dict = train_ftr.persist("ftr", tmpdir.strpath)
    train_vect_params = {
        attribute: vectorizer.get_params()
        for attribute, vectorizer in train_ftr.vectorizers.items()
    }

    # add trained vocabulary to vectorizer params
    for attribute, attribute_vect_params in train_vect_params.items():
        if hasattr(train_ftr.vectorizers[attribute], "vocabulary_"):
            train_vect_params[attribute].update(
                {"vocabulary": train_ftr.vectorizers[attribute].vocabulary_})

    # load featurizer
    meta = train_ftr.component_config.copy()
    meta.update(file_dict)
    test_ftr = CountVectorsFeaturizer.load(meta, tmpdir.strpath)
    test_vect_params = {
        attribute: vectorizer.get_params()
        for attribute, vectorizer in test_ftr.vectorizers.items()
    }

    assert train_vect_params == test_vect_params

    test_message1 = Message(sentence1)
    test_ftr.process(test_message1)
    test_message2 = Message(sentence2)
    test_ftr.process(test_message2)

    # check that train features and test features after loading are the same
    assert np.all([
        train_message1.get(SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE]).toarray() ==
        test_message1.get(SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE]).toarray(),
        train_message2.get(SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE]).toarray() ==
        test_message2.get(SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE]).toarray(),
    ])