Python CountVectorsFeaturizer.CountVectorsFeaturizerの例、rasa.nlu.featurizers.sparse_featurizer.count_vectors_featurizer.CountVectorsFeaturizer.CountVectorsFeaturizer Pythonの例

コード例 #1

0

ファイルを表示

ファイル: test_count_vectors_featurizer.py プロジェクト: hr004/rasa-1

def test_count_vector_featurizer_oov_words(sentence, expected):
    """Featurize with an OOV token and explicit OOV words configured.

    The featurizer is trained on a whitespace-tokenized copy of ``sentence``
    and then applied to a fresh message with the same text. The first row of
    the resulting sparse text features must equal ``expected``.
    """
    config = {
        "token_pattern": r"(?u)\b\w+\b",
        "OOV_token": "__oov__",
        "OOV_words": ["oov_word0", "OOV_word1"],
    }
    featurizer = CountVectorsFeaturizer(config)

    training_example = Message(sentence)
    WhitespaceTokenizer().process(training_example)
    featurizer.train(TrainingData([training_example]))

    # Unlike the training message, this one is handed to the featurizer
    # without being run through the tokenizer in this test.
    inference_message = Message(sentence)
    featurizer.process(inference_message)

    sparse_matrix = inference_message.get(SPARSE_FEATURE_NAMES[TEXT])
    first_row = sparse_matrix.toarray()[0]
    assert np.all(first_row == expected)

コード例 #2

0

ファイルを表示

def test_count_vector_featurizer_oov_token(sentence, expected):
    """Featurize with only an OOV token configured.

    The features checked are the ones stored on the *training* message: the
    first row of its sequence features must equal ``expected`` and sentence
    features must be present.
    """
    featurizer = CountVectorsFeaturizer({"OOV_token": "__oov__"})

    training_example = Message(data={TEXT: sentence})
    WhitespaceTokenizer().process(training_example)
    featurizer.train(TrainingData([training_example]))

    # A second message is processed as well; the assertions below do not
    # read anything back from it.
    inference_message = Message(data={TEXT: sentence})
    featurizer.process(inference_message)

    sequence, sentence_level = training_example.get_sparse_features(TEXT, [])
    # Unwrap the feature containers to their underlying matrices, leaving
    # falsy values (e.g. None) untouched.
    sequence = sequence.features if sequence else sequence
    sentence_level = sentence_level.features if sentence_level else sentence_level

    assert np.all(sequence.toarray()[0] == expected)
    assert sentence_level is not None

コード例 #3

0

ファイルを表示

def test_count_vector_featurizer_attribute_featurization(
        sentence, intent, response, intent_features, response_features):
    """Check sparse features produced for the intent and response attributes.

    A single message carrying text, intent and response is tokenized and
    featurized during training; the sparse features stored on that message
    for ``INTENT`` and ``RESPONSE`` are then compared with the expectations.

    Args:
        sentence: Text of the training message.
        intent: Value set for the ``INTENT`` attribute.
        response: Value set for the ``RESPONSE`` attribute.
        intent_features: Expected first row of the intent sequence features;
            a falsy value means no intent features are expected at all.
        response_features: Expected first row of the response sequence
            features; a falsy value means no response features are expected.
    """
    # Imported locally so this block does not rely on a module-level alias.
    import numpy as np

    ftr = CountVectorsFeaturizer(
        {"additional_vocabulary_size": {
            "text": 0,
            "response": 0
        }})
    tk = WhitespaceTokenizer()

    train_message = Message(data={TEXT: sentence})
    # this is needed for a valid training example
    train_message.set(INTENT, intent)
    train_message.set(RESPONSE, response)

    data = TrainingData([train_message])

    tk.train(data)
    ftr.train(data)

    intent_seq_vecs, intent_sen_vecs = train_message.get_sparse_features(
        INTENT, [])
    # Unwrap feature containers to their matrices; falsy values stay as-is.
    if intent_seq_vecs:
        intent_seq_vecs = intent_seq_vecs.features
    if intent_sen_vecs:
        intent_sen_vecs = intent_sen_vecs.features
    response_seq_vecs, response_sen_vecs = train_message.get_sparse_features(
        RESPONSE, [])
    if response_seq_vecs:
        response_seq_vecs = response_seq_vecs.features
    if response_sen_vecs:
        response_sen_vecs = response_sen_vecs.features

    if intent_features:
        # np.all reduces the element-wise comparison to one boolean; a bare
        # ``assert array == list`` raises ValueError as soon as the
        # comparison holds more than one element.
        assert np.all(intent_seq_vecs.toarray()[0] == intent_features)
        assert intent_sen_vecs is None
    else:
        assert intent_seq_vecs is None
        assert intent_sen_vecs is None

    if response_features:
        assert np.all(response_seq_vecs.toarray()[0] == response_features)
        assert response_sen_vecs is not None
    else:
        assert response_seq_vecs is None
        assert response_sen_vecs is None

コード例 #4

0

ファイルを表示

def test_count_vector_featurizer_char(sentence, expected):
    """Featurize with the ``char`` analyzer using 1- to 2-grams.

    Both the training and the inference message are whitespace-tokenized.
    The first row of the inference message's sparse text features must
    equal ``expected``.
    """
    char_config = {"min_ngram": 1, "max_ngram": 2, "analyzer": "char"}
    featurizer = CountVectorsFeaturizer(char_config)

    training_example = Message(sentence)
    WhitespaceTokenizer().process(training_example)
    featurizer.train(TrainingData([training_example]))

    inference_message = Message(sentence)
    WhitespaceTokenizer().process(inference_message)
    featurizer.process(inference_message)

    sparse_matrix = inference_message.get(SPARSE_FEATURE_NAMES[TEXT])
    assert np.all(sparse_matrix.toarray()[0] == expected)

コード例 #5

0

ファイルを表示

def test_count_vector_featurizer_oov_words(sentence, expected):
    """Featurize with an OOV token and explicit OOV words configured.

    The assertions read the sequence/sentence features stored on the
    *training* message: the first sequence row must equal ``expected`` and
    sentence features must be present.
    """
    config = {
        "token_pattern": r"(?u)\b\w+\b",
        "OOV_token": "__oov__",
        "OOV_words": ["oov_word0", "OOV_word1"],
    }
    featurizer = CountVectorsFeaturizer(config)

    training_example = Message(sentence)
    WhitespaceTokenizer().process(training_example)
    featurizer.train(TrainingData([training_example]))

    # A second message is processed too, but nothing is asserted on it.
    inference_message = Message(sentence)
    featurizer.process(inference_message)

    sequence_matrix, sentence_matrix = training_example.get_sparse_features(
        TEXT, [])
    first_row = sequence_matrix.toarray()[0]
    assert np.all(first_row == expected)
    assert sentence_matrix is not None

コード例 #6

0

ファイルを表示

def test_count_vector_featurizer_char(sentence, expected):
    """Featurize with the ``char`` analyzer using 1- to 2-grams.

    The first row of the sparse features stored on the *training* message
    must equal ``expected``.
    """
    char_config = {"min_ngram": 1, "max_ngram": 2, "analyzer": "char"}
    featurizer = CountVectorsFeaturizer(char_config)

    training_example = Message(sentence)
    WhitespaceTokenizer().process(training_example)
    featurizer.train(TrainingData([training_example]))

    # The inference message is tokenized and processed, but the assertion
    # below reads from the training message.
    inference_message = Message(sentence)
    WhitespaceTokenizer().process(inference_message)
    featurizer.process(inference_message)

    sparse_matrix = training_example.get_sparse_features(TEXT, [])
    first_row = sparse_matrix.toarray()[0]
    assert np.all(first_row == expected)

コード例 #7

0

ファイルを表示

def test_count_vector_featurizer_shared_vocab(sentence, intent, response,
                                              text_features, intent_features,
                                              response_features):
    """Featurize text, intent and response with ``use_shared_vocab`` enabled.

    For each attribute the first row of the sequence features on the
    training message must equal the matching expectation. Sentence features
    must exist for text and response and must be ``None`` for intent.
    """
    featurizer = CountVectorsFeaturizer({
        "use_shared_vocab": True,
        "additional_vocabulary_size": {
            "text": 0,
            "response": 0
        },
    })
    tokenizer = WhitespaceTokenizer()

    training_example = Message(data={TEXT: sentence})
    # this is needed for a valid training example
    training_example.set(INTENT, intent)
    training_example.set(RESPONSE, response)

    training_data = TrainingData([training_example])
    tokenizer.train(training_data)
    featurizer.train(training_data)

    # (attribute, expected first sequence row, sentence features expected?)
    checks = (
        (TEXT, text_features, True),
        (INTENT, intent_features, False),
        (RESPONSE, response_features, True),
    )
    for attribute, expected_row, has_sentence_features in checks:
        sequence, sentence_level = training_example.get_sparse_features(
            attribute, [])
        # Unwrap feature containers; falsy values (e.g. None) stay as-is.
        sequence = sequence.features if sequence else sequence
        sentence_level = (sentence_level.features
                          if sentence_level else sentence_level)

        assert np.all(sequence.toarray()[0] == expected_row)
        if has_sentence_features:
            assert sentence_level is not None
        else:
            assert sentence_level is None

コード例 #8

0

ファイルを表示

def test_cvf_incremental_training(
    initial_train_text: Text,
    additional_train_text: Text,
    initial_vocabulary_size: int,
    final_vocabulary_size: int,
    tmp_path: Path,
):
    """Train, persist, reload with ``should_finetune=True`` and train again.

    Verifies the text vocabulary size after the first training, after
    loading, and after the second training on additional data, and that
    every token from the first vocabulary keeps its index afterwards.
    """
    tokenizer = WhitespaceTokenizer()
    first_featurizer = CountVectorsFeaturizer()
    first_example = Message(data={"text": initial_train_text})
    first_data = TrainingData([first_example])

    tokenizer.train(first_data)
    first_featurizer.train(first_data)

    # Vocabulary size right after the first training run.
    vocab_before = first_featurizer.vectorizers["text"].vocabulary_
    assert len(vocab_before) == initial_vocabulary_size

    # Round-trip the featurizer through persist/load in finetune mode.
    persisted_files = first_featurizer.persist("ftr", tmp_path)
    load_meta = first_featurizer.component_config.copy()
    load_meta.update(persisted_files)
    reloaded = CountVectorsFeaturizer.load(
        load_meta, tmp_path, should_finetune=True
    )

    # Loading alone must not change the vocabulary size.
    reloaded_vocab_size = len(reloaded.vectorizers["text"].vocabulary_)
    assert reloaded_vocab_size == initial_vocabulary_size

    second_example = Message(data={"text": additional_train_text})
    combined_data = TrainingData([first_example, second_example])
    tokenizer.train(combined_data)
    reloaded.train(combined_data)

    vocab_after = reloaded.vectorizers["text"].vocabulary_

    # Vocabulary size after the second (finetuning) training run.
    assert len(vocab_after) == final_vocabulary_size

    # Tokens known before finetuning must keep their original indices.
    for token, index in vocab_before.items():
        assert token in vocab_after
        assert vocab_after.get(token) == index

コード例 #9

0

ファイルを表示

def test_count_vector_featurizer_char(sentence, expected):
    """Featurize with the ``char`` analyzer using 1- to 2-grams.

    The assertions read the features stored on the *training* message: the
    first sequence row must equal ``expected`` and sentence features must
    be present.
    """
    char_config = {"min_ngram": 1, "max_ngram": 2, "analyzer": "char"}
    featurizer = CountVectorsFeaturizer(char_config)

    training_example = Message(data={TEXT: sentence})
    WhitespaceTokenizer().process(training_example)
    featurizer.train(TrainingData([training_example]))

    # The inference message is tokenized and processed, but nothing is
    # asserted on it below.
    inference_message = Message(data={TEXT: sentence})
    WhitespaceTokenizer().process(inference_message)
    featurizer.process(inference_message)

    sequence, sentence_level = training_example.get_sparse_features(TEXT, [])
    # Unwrap feature containers; falsy values (e.g. None) stay as-is.
    sequence = sequence.features if sequence else sequence
    sentence_level = sentence_level.features if sentence_level else sentence_level

    assert np.all(sequence.toarray()[0] == expected)
    assert sentence_level is not None

コード例 #10

0

ファイルを表示

def test_count_vector_featurizer(sentence, expected, expected_cls):
    """Featurize a tokenized sentence and inspect its first and last rows.

    The sparse text features on the processed message must be a
    ``scipy.sparse.coo_matrix`` whose first row equals ``expected`` and
    whose last row equals ``expected_cls``.
    """
    featurizer = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"})

    training_example = Message(sentence)
    inference_message = Message(sentence)

    # Tokenize both messages, each with its own tokenizer instance.
    for message in (training_example, inference_message):
        WhitespaceTokenizer().process(message)

    featurizer.train(TrainingData([training_example]))
    featurizer.process(inference_message)

    feature_key = SPARSE_FEATURE_NAMES[TEXT]
    assert isinstance(inference_message.get(feature_key),
                      scipy.sparse.coo_matrix)

    dense = inference_message.get(feature_key).toarray()

    assert np.all(dense[0] == expected)
    assert np.all(dense[-1] == expected_cls)

コード例 #11

0

ファイルを表示

ファイル: test_count_vectors_featurizer.py プロジェクト: VasitumInc/rasa

def test_count_vector_featurizer_oov_token(sentence, expected):
    """Featurize with an OOV token and ``return_sequence`` enabled.

    The first row of the ``text_sparse_features`` stored on the processed
    message must equal ``expected``.
    """
    from rasa.nlu.featurizers.sparse_featurizer.count_vectors_featurizer import (
        CountVectorsFeaturizer,
    )

    config = {
        "token_pattern": r"(?u)\b\w+\b",
        "OOV_token": "__oov__",
        "return_sequence": True,
    }
    featurizer = CountVectorsFeaturizer(config)

    training_example = Message(sentence)
    # this is needed for a valid training example
    training_example.set("intent", "bla")
    featurizer.train(TrainingData([training_example]))

    inference_message = Message(sentence)
    featurizer.process(inference_message)

    sparse_matrix = inference_message.get("text_sparse_features")
    assert np.all(sparse_matrix.toarray()[0] == expected)

コード例 #12

0

ファイルを表示

def test_cvf_independent_train_vocabulary_expand(
    additional_size: Optional[int],
    text: Text,
    real_vocabulary_size: int,
    total_vocabulary_size: int,
):
    """Check per-attribute vocabulary sizes after training.

    With the same ``additional_vocabulary_size`` configured for text,
    response and action text, each of those vocabularies must hold
    ``total_vocabulary_size`` entries, with the starting empty index equal
    to ``real_vocabulary_size``. The intent and action-name vocabularies
    must hold exactly one entry each.
    """
    whitespace_tokenizer = WhitespaceTokenizer()
    extra_slots = {
        TEXT: additional_size,
        RESPONSE: additional_size,
        ACTION_TEXT: additional_size,
    }
    cvf = CountVectorsFeaturizer(
        {"additional_vocabulary_size": extra_slots}, finetune_mode=False
    )

    example = Message(
        data={
            TEXT: text,
            INTENT: "intent_1",
            RESPONSE: text,
            ACTION_TEXT: text,
            ACTION_NAME: "action_1",
        }
    )
    training_data = TrainingData([example])

    whitespace_tokenizer.train(training_data)
    cvf.train(training_data)

    for attribute in (TEXT, RESPONSE, ACTION_TEXT):
        vocabulary = cvf.vectorizers[attribute].vocabulary_
        assert len(vocabulary) == total_vocabulary_size
        first_empty_index = cvf._get_starting_empty_index(vocabulary)
        assert first_empty_index == real_vocabulary_size

    for attribute in (INTENT, ACTION_NAME):
        vocabulary = cvf.vectorizers[attribute].vocabulary_
        assert len(vocabulary) == 1

コード例 #13

0

ファイルを表示

def test_count_vector_featurizer(sentence, expected, expected_cls):
    """Featurize a tokenized sentence and inspect its first and last rows.

    The value returned by ``get_sparse_features`` for the processed message
    must be a ``scipy.sparse.coo_matrix`` whose first row equals
    ``expected`` and whose last row equals ``expected_cls``.
    """
    featurizer = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"})

    training_example = Message(sentence)
    inference_message = Message(sentence)

    # Tokenize both messages, each with its own tokenizer instance.
    for message in (training_example, inference_message):
        WhitespaceTokenizer().process(message)

    featurizer.train(TrainingData([training_example]))
    featurizer.process(inference_message)

    sparse_matrix = inference_message.get_sparse_features(TEXT, [])
    assert isinstance(sparse_matrix, scipy.sparse.coo_matrix)

    dense = sparse_matrix.toarray()
    assert np.all(dense[0] == expected)
    assert np.all(dense[-1] == expected_cls)

コード例 #14

0

ファイルを表示

ファイル: test_count_vectors_featurizer.py プロジェクト: VasitumInc/rasa

def test_count_vector_featurizer_char(sentence, expected):
    """Featurize with the ``char`` analyzer and ``return_sequence`` enabled.

    The first row of the ``text_sparse_features`` stored on the processed
    message must equal ``expected``.
    """
    from rasa.nlu.featurizers.sparse_featurizer.count_vectors_featurizer import (
        CountVectorsFeaturizer,
    )

    char_config = {
        "min_ngram": 1,
        "max_ngram": 2,
        "analyzer": "char",
        "return_sequence": True,
    }
    featurizer = CountVectorsFeaturizer(char_config)

    training_example = Message(sentence)
    # this is needed for a valid training example
    training_example.set("intent", "bla")
    featurizer.train(TrainingData([training_example]))

    inference_message = Message(sentence)
    featurizer.process(inference_message)

    sparse_matrix = inference_message.get("text_sparse_features")
    assert np.all(sparse_matrix.toarray()[0] == expected)

コード例 #15

0

ファイルを表示

def test_count_vector_featurizer_process_by_attribute(
    sentence: Text,
    action_name: Text,
    action_text: Text,
    action_name_features: np.ndarray,
    response_features: np.ndarray,
):
    """Check action-name features on a message processed after training.

    The featurizer is trained on two messages, one carrying an action name
    and one carrying an action text. A fresh message with all three
    attributes set is then tokenized and featurized, and its ``ACTION_NAME``
    sparse features are compared with the expectation.

    Args:
        sentence: Text of the message that is processed.
        action_name: Value set for ``ACTION_NAME`` on the processed message.
        action_text: Value set for ``ACTION_TEXT`` on the processed message.
        action_name_features: Expected first row of the action-name
            sequence features.
        response_features: Not used by this test body.
    """
    ftr = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b",})
    tk = WhitespaceTokenizer()

    # Train on one example with an action name and a second example with an
    # action text, so that both attributes appear in the training data.
    train_message = Message(data={TEXT: "hello"})
    train_message.set(ACTION_NAME, "greet")

    train_message1 = Message(data={TEXT: "hello"})
    train_message1.set(ACTION_TEXT, "hi")

    data = TrainingData([train_message, train_message1])

    tk.train(data)
    ftr.train(data)

    test_message = Message(data={TEXT: sentence})
    test_message.set(ACTION_NAME, action_name)
    test_message.set(ACTION_TEXT, action_text)

    # Run the tokenizer first, then the featurizer, on the test message.
    for module in [tk, ftr]:
        module.process(test_message)

    action_name_seq_vecs, action_name_sen_vecs = test_message.get_sparse_features(
        ACTION_NAME, []
    )
    # Unwrap feature containers to their matrices; falsy values stay as-is.
    if action_name_seq_vecs:
        action_name_seq_vecs = action_name_seq_vecs.features
    if action_name_sen_vecs:
        action_name_sen_vecs = action_name_sen_vecs.features

    # np.all reduces the element-wise comparison to one boolean; a bare
    # ``assert array == expected`` raises ValueError as soon as the
    # comparison holds more than one element.
    assert np.all(action_name_seq_vecs.toarray()[0] == action_name_features)
    assert action_name_sen_vecs is None

コード例 #16

0

ファイルを表示

ファイル: test_count_vectors_featurizer.py プロジェクト: sanaayakurup/rasa-1

def test_count_vector_featurizer_response_attribute_featurization(
        sentence, intent, response, intent_features, response_features):
    """Check intent and response features when a second example has a response.

    Two messages are used for training: the message under test and a fixed
    "hello"/"hi"/"greet" example. The sparse features stored on the message
    under test for ``INTENT`` and ``RESPONSE`` are then compared with the
    expectations.

    Args:
        sentence: Text of the message under test.
        intent: Value set for its ``INTENT`` attribute.
        response: Value set for its ``RESPONSE`` attribute.
        intent_features: Expected first row of the intent sequence features;
            a falsy value means no intent features are expected at all.
        response_features: Expected first row of the response sequence
            features; a falsy value means no response features are expected.
    """
    # Imported locally so this block does not rely on a module-level alias.
    import numpy as np

    ftr = CountVectorsFeaturizer()
    tk = WhitespaceTokenizer()

    train_message = Message(sentence)
    # this is needed for a valid training example
    train_message.set(INTENT, intent)
    train_message.set(RESPONSE, response)

    # add a second example that has some response, so that the vocabulary for
    # response exists
    second_message = Message("hello")
    second_message.set(RESPONSE, "hi")
    second_message.set(INTENT, "greet")

    data = TrainingData([train_message, second_message])

    tk.train(data)
    ftr.train(data)

    intent_seq_vecs, intent_sen_vecs = train_message.get_sparse_features(
        INTENT, [])
    response_seq_vecs, response_sen_vecs = train_message.get_sparse_features(
        RESPONSE, [])

    if intent_features:
        # np.all reduces the element-wise comparison to one boolean; a bare
        # ``assert array == list`` raises ValueError as soon as the
        # comparison holds more than one element.
        assert np.all(intent_seq_vecs.toarray()[0] == intent_features)
        assert intent_sen_vecs is None
    else:
        assert intent_seq_vecs is None
        assert intent_sen_vecs is None

    if response_features:
        assert np.all(response_seq_vecs.toarray()[0] == response_features)
        assert response_sen_vecs is not None
    else:
        assert response_seq_vecs is None
        assert response_sen_vecs is None

コード例 #17

0

ファイルを表示

ファイル: test_count_vectors_featurizer.py プロジェクト: VasitumInc/rasa

def test_count_vector_featurizer_no_sequence(sentence, expected):
    """Featurize with ``return_sequence`` disabled.

    The ``text_sparse_features`` on the processed message must be a
    ``scipy.sparse.coo_matrix`` whose dense form equals ``expected``.
    """
    from rasa.nlu.featurizers.sparse_featurizer.count_vectors_featurizer import (
        CountVectorsFeaturizer,
    )

    config = {"token_pattern": r"(?u)\b\w+\b", "return_sequence": False}
    featurizer = CountVectorsFeaturizer(config)

    training_example = Message(sentence)
    # this is needed for a valid training example
    training_example.set("intent", "bla")
    featurizer.train(TrainingData([training_example]))

    inference_message = Message(sentence)
    featurizer.process(inference_message)

    feature_key = "text_sparse_features"
    assert isinstance(inference_message.get(feature_key),
                      scipy.sparse.coo_matrix)

    dense = inference_message.get(feature_key).toarray()
    assert np.all(dense == expected)

コード例 #18

0

ファイルを表示

def test_cvf_shared_train_vocabulary_expand(
    additional_size: Optional[int],
    text: Text,
    real_vocabulary_size: int,
    total_vocabulary_size: int,
):
    """Check the vocabulary size after training with ``use_shared_vocab``.

    The vocabulary read from the ``"text"`` vectorizer must hold
    ``total_vocabulary_size`` entries, with the starting empty index equal
    to ``real_vocabulary_size``.
    """
    whitespace_tokenizer = WhitespaceTokenizer()
    extra_slots = {
        "text": additional_size,
        "response": additional_size,
        "action_text": additional_size,
    }
    cvf = CountVectorsFeaturizer(
        {"additional_vocabulary_size": extra_slots, "use_shared_vocab": True},
        finetune_mode=False,
    )

    example = Message(
        data={
            TEXT: text,
            INTENT: "intent_1",
            RESPONSE: text,
            ACTION_TEXT: text,
            ACTION_NAME: "action_1",
        }
    )
    training_data = TrainingData([example])

    whitespace_tokenizer.train(training_data)
    cvf.train(training_data)

    vocabulary = cvf.vectorizers["text"].vocabulary_
    assert len(vocabulary) == total_vocabulary_size
    first_empty_index = cvf._get_starting_empty_index(vocabulary)
    assert first_empty_index == real_vocabulary_size

コード例 #19

0

ファイルを表示

def test_count_vector_featurizer_using_tokens(tokens, expected):
    """Featurize messages whose text is empty but whose tokens are preset.

    The first row of the sparse text features on the processed message must
    equal ``expected``.
    """
    featurizer = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"})

    # Both messages get an empty text so that the counts can only come from
    # the explicitly set tokens; relying on the message text would not give
    # the expected result.
    token_objects = [Token(word, 0) for word in tokens]
    tokens_key = TOKENS_NAMES[TEXT]

    training_example = Message("")
    training_example.set(tokens_key, token_objects)
    featurizer.train(TrainingData([training_example]))

    inference_message = Message("")
    inference_message.set(tokens_key, token_objects)
    featurizer.process(inference_message)

    sparse_matrix = inference_message.get(SPARSE_FEATURE_NAMES[TEXT])
    assert np.all(sparse_matrix.toarray()[0] == expected)

コード例 #20

0

ファイルを表示

ファイル: test_count_vectors_featurizer.py プロジェクト: yeter12345/ayca

def test_count_vector_featurizer_response_attribute_featurization(
        sentence, intent, response, intent_features, response_features):
    """Check intent and response features when a second example has a response.

    Two messages are used for training: the message under test and a fixed
    "hello"/"hi"/"greet" example. The sparse features stored on the message
    under test for the intent and response attributes are then compared
    with the expectations.

    Args:
        sentence: Text of the message under test.
        intent: Value set for its ``INTENT_ATTRIBUTE``.
        response: Value set for its ``RESPONSE_ATTRIBUTE``.
        intent_features: Expected first row of the intent features; a falsy
            value means the intent features must be ``None``.
        response_features: Expected first row of the response features; a
            falsy value means the response features must be ``None``.
    """
    # Imported locally so this block does not rely on a module-level alias.
    import numpy as np

    ftr = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"})
    tk = WhitespaceTokenizer()

    train_message = Message(sentence)
    # this is needed for a valid training example
    train_message.set(INTENT_ATTRIBUTE, intent)
    train_message.set(RESPONSE_ATTRIBUTE, response)

    # add a second example that has some response, so that the vocabulary for
    # response exists
    second_message = Message("hello")
    second_message.set(RESPONSE_ATTRIBUTE, "hi")
    second_message.set(INTENT_ATTRIBUTE, "greet")

    data = TrainingData([train_message, second_message])

    tk.train(data)
    ftr.train(data)

    if intent_features:
        # np.all reduces the element-wise comparison to one boolean; a bare
        # ``assert array == list`` raises ValueError as soon as the
        # comparison holds more than one element.
        assert np.all(
            train_message.get(
                SPARSE_FEATURE_NAMES[INTENT_ATTRIBUTE]).toarray()[0] ==
            intent_features)
    else:
        assert train_message.get(
            SPARSE_FEATURE_NAMES[INTENT_ATTRIBUTE]) is None

    if response_features:
        assert np.all(
            train_message.get(
                SPARSE_FEATURE_NAMES[RESPONSE_ATTRIBUTE]).toarray()[0] ==
            response_features)
    else:
        assert train_message.get(
            SPARSE_FEATURE_NAMES[RESPONSE_ATTRIBUTE]) is None

コード例 #21

0

ファイルを表示

def test_count_vector_featurizer_use_lemma(
    spacy_nlp: Any,
    sentence: Text,
    sequence_features: List[List[int]],
    sentence_features: List[List[int]],
    use_lemma: bool,
):
    """Featurize spaCy-tokenized messages with ``use_lemma`` on or off.

    Both sequence and sentence features on the processed message must be
    ``scipy.sparse.coo_matrix`` instances. The first sequence row must equal
    ``sequence_features`` and the last sentence row must equal
    ``sentence_features``.
    """
    config = {
        "use_lemma": use_lemma,
        "additional_vocabulary_size": {"text": 0},
    }
    featurizer = CountVectorsFeaturizer(config)

    doc_key = SPACY_DOCS[TEXT]
    training_example = Message(data={TEXT: sentence})
    training_example.set(doc_key, spacy_nlp(sentence))
    inference_message = Message(data={TEXT: sentence})
    inference_message.set(doc_key, spacy_nlp(sentence))

    # Tokenize both messages, each with its own tokenizer instance.
    for message in (training_example, inference_message):
        SpacyTokenizer().process(message)

    featurizer.train(TrainingData([training_example]))
    featurizer.process(inference_message)

    sequence, sentence_level = inference_message.get_sparse_features(TEXT, [])

    assert isinstance(sequence.features, scipy.sparse.coo_matrix)
    assert isinstance(sentence_level.features, scipy.sparse.coo_matrix)

    dense_sequence = sequence.features.toarray()
    dense_sentence = sentence_level.features.toarray()

    assert np.all(dense_sequence[0] == sequence_features)
    assert np.all(dense_sentence[-1] == sentence_features)

コード例 #22

0

ファイルを表示

def test_count_vector_featurizer_shared_vocab(sentence, intent, response,
                                              text_features, intent_features,
                                              response_features):
    """Featurize text, intent and response with ``use_shared_vocab`` enabled.

    For each attribute the first row of the sparse features stored on the
    training message must equal the matching expectation.
    """
    config = {"token_pattern": r"(?u)\b\w+\b", "use_shared_vocab": True}
    featurizer = CountVectorsFeaturizer(config)
    tokenizer = WhitespaceTokenizer()

    training_example = Message(sentence)
    # this is needed for a valid training example
    training_example.set(INTENT, intent)
    training_example.set(RESPONSE, response)

    training_data = TrainingData([training_example])
    tokenizer.train(training_data)
    featurizer.train(training_data)

    expectations = (
        (TEXT, text_features),
        (INTENT, intent_features),
        (RESPONSE, response_features),
    )
    for attribute, expected_row in expectations:
        sparse_matrix = training_example.get_sparse_features(attribute, [])
        assert np.all(sparse_matrix.toarray()[0] == expected_row)

コード例 #23

0

ファイルを表示

ファイル: test_count_vectors_featurizer.py プロジェクト: sanaayakurup/rasa-1

def test_count_vector_featurizer_persist_load(tmp_path):
    """Persist a trained featurizer, load it, and compare it with the original.

    The vectorizer parameters (with the trained vocabulary added) must match
    between the trained and the loaded featurizer, the loaded text
    vectorizer must have a ``vocabulary_``, and the loaded featurizer must
    produce the same sequence and sentence features as training did.
    """
    # Every setting here is a non-default value.
    non_default_config = {
        "analyzer": "char",
        "strip_accents": "ascii",
        "stop_words": "stop",
        "min_df": 2,
        "max_df": 3,
        "min_ngram": 2,
        "max_ngram": 3,
        "max_features": 10,
        "lowercase": False,
    }
    trained = CountVectorsFeaturizer(non_default_config)

    first_text = "ababab 123 13xc лаомтгцу sfjv oö aà"
    second_text = "abababalidcn 123123 13xcdc лаомтгцу sfjv oö aà"
    trained_msg_1 = Message(first_text)
    trained_msg_2 = Message(second_text)

    trained.train(TrainingData([trained_msg_1, trained_msg_2]))

    # Write the featurizer to disk.
    persisted_files = trained.persist("ftr", str(tmp_path))

    expected_params = {}
    for attribute, vectorizer in trained.vectorizers.items():
        expected_params[attribute] = vectorizer.get_params()

    # Add the trained vocabulary, where one exists, to the expected params.
    for attribute, params in expected_params.items():
        vectorizer = trained.vectorizers[attribute]
        if hasattr(vectorizer, "vocabulary_"):
            params.update({"vocabulary": vectorizer.vocabulary_})

    # Read the featurizer back using its config plus the persisted file info.
    load_meta = trained.component_config.copy()
    load_meta.update(persisted_files)
    restored = CountVectorsFeaturizer.load(load_meta, str(tmp_path))

    restored_params = {}
    for attribute, vectorizer in restored.vectorizers.items():
        restored_params[attribute] = vectorizer.get_params()

    assert expected_params == restored_params

    # The vocabulary must have been restored for the text attribute.
    assert hasattr(restored.vectorizers[TEXT], "vocabulary_")

    restored_msg_1 = Message(first_text)
    restored.process(restored_msg_1)
    restored_msg_2 = Message(second_text)
    restored.process(restored_msg_2)

    restored_seq_1, restored_sen_1 = restored_msg_1.get_sparse_features(
        TEXT, [])
    trained_seq_1, trained_sen_1 = trained_msg_1.get_sparse_features(TEXT, [])
    restored_seq_2, restored_sen_2 = restored_msg_2.get_sparse_features(
        TEXT, [])
    trained_seq_2, trained_sen_2 = trained_msg_2.get_sparse_features(TEXT, [])

    # Features from the restored featurizer must equal the training features.
    assert np.all(restored_seq_1.toarray() == trained_seq_1.toarray())
    assert np.all(restored_sen_1.toarray() == trained_sen_1.toarray())
    assert np.all(restored_seq_2.toarray() == trained_seq_2.toarray())
    assert np.all(restored_sen_2.toarray() == trained_sen_2.toarray())

コード例 #24

0

ファイルを表示

def test_count_vector_featurizer_persist_load(tmpdir):
    """Persist a trained featurizer, load it, and compare it with the original.

    The vectorizer parameters (with the trained vocabulary added) must match
    between the trained and the loaded featurizer, and the loaded featurizer
    must produce the same sparse text features as training did.
    """
    # Every setting here is a non-default value.
    non_default_config = {
        "analyzer": "char",
        "token_pattern": r"(?u)\b\w+\b",
        "strip_accents": "ascii",
        "stop_words": "stop",
        "min_df": 2,
        "max_df": 3,
        "min_ngram": 2,
        "max_ngram": 3,
        "max_features": 10,
        "lowercase": False,
    }
    trained = CountVectorsFeaturizer(non_default_config)

    first_text = "ababab 123 13xc лаомтгцу sfjv oö aà"
    second_text = "abababalidcn 123123 13xcdc лаомтгцу sfjv oö aà"
    trained_msg_1 = Message(first_text)
    trained_msg_2 = Message(second_text)

    trained.train(TrainingData([trained_msg_1, trained_msg_2]))

    # Write the featurizer to disk.
    persisted_files = trained.persist("ftr", tmpdir.strpath)

    expected_params = {}
    for attribute, vectorizer in trained.vectorizers.items():
        expected_params[attribute] = vectorizer.get_params()

    # Add the trained vocabulary, where one exists, to the expected params.
    for attribute, params in expected_params.items():
        vectorizer = trained.vectorizers[attribute]
        if hasattr(vectorizer, "vocabulary_"):
            params.update({"vocabulary": vectorizer.vocabulary_})

    # Read the featurizer back using its config plus the persisted file info.
    load_meta = trained.component_config.copy()
    load_meta.update(persisted_files)
    restored = CountVectorsFeaturizer.load(load_meta, tmpdir.strpath)

    restored_params = {}
    for attribute, vectorizer in restored.vectorizers.items():
        restored_params[attribute] = vectorizer.get_params()

    assert expected_params == restored_params

    restored_msg_1 = Message(first_text)
    restored.process(restored_msg_1)
    restored_msg_2 = Message(second_text)
    restored.process(restored_msg_2)

    # Features from the restored featurizer must equal the training features.
    feature_key = SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE]
    first_match = (trained_msg_1.get(feature_key).toarray() ==
                   restored_msg_1.get(feature_key).toarray())
    second_match = (trained_msg_2.get(feature_key).toarray() ==
                    restored_msg_2.get(feature_key).toarray())
    assert np.all([first_match, second_match])