def test_count_vectors_featurizer_train():

    featurizer = CountVectorsFeaturizer.create({}, RasaNLUModelConfig())

    sentence = "Hey how are you today ?"
    message = Message(sentence)
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")
    WhitespaceTokenizer().train(TrainingData([message]))

    featurizer.train(TrainingData([message]), RasaNLUModelConfig())

    expected = np.array([0, 1, 0, 0, 0])
    expected_cls = np.array([1, 1, 1, 1, 1])

    seq_vec, sen_vec = message.get_sparse_features(TEXT, [])

    assert (5, 5) == seq_vec.shape
    assert (1, 5) == sen_vec.shape
    assert np.all(seq_vec.toarray()[0] == expected)
    assert np.all(sen_vec.toarray()[-1] == expected_cls)

    seq_vec, sen_vec = message.get_sparse_features(RESPONSE, [])

    assert (5, 5) == seq_vec.shape
    assert (1, 5) == sen_vec.shape
    assert np.all(seq_vec.toarray()[0] == expected)
    assert np.all(sen_vec.toarray()[-1] == expected_cls)

    seq_vec, sen_vec = message.get_sparse_features(INTENT, [])

    assert sen_vec is None
    assert (1, 1) == seq_vec.shape
    assert np.all(seq_vec.toarray()[0] == np.array([1]))
Beispiel #2
0
def test_additional_vocab_size_deprecation():
    with pytest.warns(FutureWarning) as warning:
        _ = CountVectorsFeaturizer.create(
            {"additional_vocabulary_size": {TEXT: 5, RESPONSE: 10}},
            RasaNLUModelConfig(),
        )
    assert "The parameter has been deprecated" in warning[0].message.args[0]
Beispiel #3
0
def test_count_vectors_featurizer_train():

    featurizer = CountVectorsFeaturizer.create({}, RasaNLUModelConfig())

    sentence = "Hey how are you today ?"
    message = Message(sentence)
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")
    WhitespaceTokenizer().train(TrainingData([message]))

    featurizer.train(TrainingData([message]), RasaNLUModelConfig())

    expected = np.array([0, 1, 0, 0, 0])
    expected_cls = np.array([1, 1, 1, 1, 1])

    vecs = message.get(SPARSE_FEATURE_NAMES[TEXT])

    assert (6, 5) == vecs.shape
    assert np.all(vecs.toarray()[0] == expected)
    assert np.all(vecs.toarray()[-1] == expected_cls)

    vecs = message.get(SPARSE_FEATURE_NAMES[RESPONSE])

    assert (6, 5) == vecs.shape
    assert np.all(vecs.toarray()[0] == expected)
    assert np.all(vecs.toarray()[-1] == expected_cls)

    vecs = message.get(SPARSE_FEATURE_NAMES[INTENT])

    assert (1, 1) == vecs.shape
    assert np.all(vecs.toarray()[0] == np.array([1]))
Beispiel #4
0
 def inner(config: Optional[Dict[Text, Any]] = None) -> CountVectorsFeaturizer:
     config = config or {}
     return CountVectorsFeaturizer.create(
         {**CountVectorsFeaturizer.get_default_config(), **config},
         default_model_storage,
         Resource("count_vectors_featurizer"),
         default_execution_context,
     )
def test_convert_training_examples(
    spacy_nlp: Language,
    text: Text,
    intent: Optional[Text],
    entities: Optional[List[Dict[Text, Any]]],
    attributes: List[Text],
    real_sparse_feature_sizes: Dict[Text, Dict[Text, List[int]]],
    default_model_storage: ModelStorage,
    default_execution_context: ExecutionContext,
):
    message = Message(data={TEXT: text, INTENT: intent, ENTITIES: entities})

    tokenizer = SpacyTokenizer.create(
        SpacyTokenizer.get_default_config(),
        default_model_storage,
        Resource("tokenizer"),
        default_execution_context,
    )
    count_vectors_featurizer = CountVectorsFeaturizer.create(
        CountVectorsFeaturizer.get_default_config(),
        default_model_storage,
        Resource("count_featurizer"),
        default_execution_context,
    )
    spacy_featurizer = SpacyFeaturizer.create(
        SpacyFeaturizer.get_default_config(),
        default_model_storage,
        Resource("spacy_featurizer"),
        default_execution_context,
    )

    message.set(SPACY_DOCS[TEXT], spacy_nlp(text))

    training_data = TrainingData([message])
    tokenizer.process_training_data(training_data)
    count_vectors_featurizer.train(training_data)
    count_vectors_featurizer.process_training_data(training_data)
    spacy_featurizer.process_training_data(training_data)

    entity_tag_spec = [
        EntityTagSpec(
            "entity",
            {
                0: "O",
                1: "name",
                2: "location"
            },
            {
                "O": 0,
                "name": 1,
                "location": 2
            },
            3,
        )
    ]
    output, sparse_feature_sizes = model_data_utils.featurize_training_examples(
        [message], attributes=attributes, entity_tag_specs=entity_tag_spec)

    assert len(output) == 1
    for attribute in attributes:
        assert attribute in output[0]
    for attribute in {INTENT, TEXT, ENTITIES} - set(attributes):
        assert attribute not in output[0]
    # we have sparse sentence, sparse sequence, dense sentence, and dense sequence
    # features in the list
    assert len(output[0][TEXT]) == 4
    if INTENT in attributes:
        # we will just have space sentence features
        assert len(output[0][INTENT]) == 1
    if ENTITIES in attributes:
        # we will just have space sentence features
        assert len(output[0][ENTITIES]) == len(entity_tag_spec)
    # check that it calculates sparse_feature_sizes correctly
    assert sparse_feature_sizes == real_sparse_feature_sizes