Beispiel #1
0
def test_count_vector_featurizer_response_attribute_featurization(
        sentence, intent, response, intent_features, response_features):
    """Check sparse INTENT and RESPONSE features of a trained message.

    A second message carrying a response is added to the training data so
    that a response vocabulary exists even when ``response`` is empty.

    Args:
        sentence: Text of the message under test.
        intent: Intent label set on the message under test.
        response: Response set on the message under test.
        intent_features: Expected first row of the intent sequence features;
            a falsy value means no intent features are expected.
        response_features: Expected first row of the response sequence
            features; a falsy value means no response features are expected.

    NOTE(review): the arguments are presumably supplied by a pytest
    parametrize decorator that is outside this view.
    """
    ftr = CountVectorsFeaturizer(
        {"additional_vocabulary_size": {
            "text": 0,
            "response": 0
        }})
    tk = WhitespaceTokenizer()

    train_message = Message(data={TEXT: sentence})
    # this is needed for a valid training example
    train_message.set(INTENT, intent)
    train_message.set(RESPONSE, response)

    # add a second example that has some response, so that the vocabulary for
    # response exists
    second_message = Message(data={TEXT: "hello"})
    second_message.set(RESPONSE, "hi")
    second_message.set(INTENT, "greet")

    data = TrainingData([train_message, second_message])

    tk.train(data)
    ftr.train(data)

    intent_seq_vecs, intent_sen_vecs = train_message.get_sparse_features(
        INTENT, [])
    if intent_seq_vecs:
        intent_seq_vecs = intent_seq_vecs.features
    if intent_sen_vecs:
        intent_sen_vecs = intent_sen_vecs.features
    response_seq_vecs, response_sen_vecs = train_message.get_sparse_features(
        RESPONSE, [])
    if response_seq_vecs:
        response_seq_vecs = response_seq_vecs.features
    if response_sen_vecs:
        response_sen_vecs = response_sen_vecs.features

    if intent_features:
        # np.all reduces the element-wise comparison to one boolean; a bare
        # array in `assert` raises ValueError for multi-element vectors.
        assert np.all(intent_seq_vecs.toarray()[0] == intent_features)
        assert intent_sen_vecs is None
    else:
        assert intent_seq_vecs is None
        assert intent_sen_vecs is None

    if response_features:
        assert np.all(response_seq_vecs.toarray()[0] == response_features)
        assert response_sen_vecs is not None
    else:
        assert response_seq_vecs is None
        assert response_sen_vecs is None
def test_count_vectors_featurizer_train(
    create_featurizer: Callable[..., CountVectorsFeaturizerGraphComponent],
    whitespace_tokenizer: WhitespaceTokenizerGraphComponent,
):
    """Train on a single message and verify TEXT, RESPONSE and INTENT features.

    The same sentence is used as text and as response, and both attributes
    are checked against the same expected rows.
    """
    featurizer = create_featurizer()

    sentence = "Hey how are you today ?"
    message = Message(data={TEXT: sentence})
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")
    whitespace_tokenizer.process_training_data(TrainingData([message]))

    training_data = TrainingData([message])
    featurizer.train(training_data)
    featurizer.process_training_data(training_data)

    first_token_row = np.array([0, 1, 0, 0, 0])
    sentence_row = np.array([1, 1, 1, 1, 1])

    # Text and response carry the same sentence, so the checks are shared.
    for attribute in (TEXT, RESPONSE):
        sequence, pooled = message.get_sparse_features(attribute, [])
        sequence = sequence.features if sequence else sequence
        pooled = pooled.features if pooled else pooled

        assert sequence.shape == (5, 5)
        assert pooled.shape == (1, 5)
        assert np.all(sequence.toarray()[0] == first_token_row)
        assert np.all(pooled.toarray()[-1] == sentence_row)

    # The intent only gets a sequence-level feature, no sentence-level one.
    sequence, pooled = message.get_sparse_features(INTENT, [])
    sequence = sequence.features if sequence else sequence
    pooled = pooled.features if pooled else pooled

    assert pooled is None
    assert sequence.shape == (1, 1)
    assert np.all(sequence.toarray()[0] == np.array([1]))
Beispiel #3
0
def test_count_vectors_featurizer_train():
    """Train a CountVectorsFeaturizer on one message and verify its features.

    The same sentence is used as text and as response, and both attributes
    are checked against the same expected rows.
    """
    component_config = {
        "additional_vocabulary_size": {"text": 0, "response": 0}
    }
    featurizer = CountVectorsFeaturizer.create(
        component_config, RasaNLUModelConfig()
    )

    sentence = "Hey how are you today ?"
    message = Message(data={TEXT: sentence})
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")
    WhitespaceTokenizer().train(TrainingData([message]))

    featurizer.train(TrainingData([message]), RasaNLUModelConfig())

    first_token_row = np.array([0, 1, 0, 0, 0])
    sentence_row = np.array([1, 1, 1, 1, 1])

    # Text and response carry the same sentence, so the checks are shared.
    for attribute in (TEXT, RESPONSE):
        sequence, pooled = message.get_sparse_features(attribute, [])
        sequence = sequence.features if sequence else sequence
        pooled = pooled.features if pooled else pooled

        assert sequence.shape == (5, 5)
        assert pooled.shape == (1, 5)
        assert np.all(sequence.toarray()[0] == first_token_row)
        assert np.all(pooled.toarray()[-1] == sentence_row)

    # The intent only gets a sequence-level feature, no sentence-level one.
    sequence, pooled = message.get_sparse_features(INTENT, [])
    sequence = sequence.features if sequence else sequence
    pooled = pooled.features if pooled else pooled

    assert pooled is None
    assert sequence.shape == (1, 1)
    assert np.all(sequence.toarray()[0] == np.array([1]))
Beispiel #4
0
def test_count_vector_featurizer_char(
    sentence: Text,
    expected: List[List[int]],
    create_featurizer: Callable[..., CountVectorsFeaturizer],
    whitespace_tokenizer: WhitespaceTokenizer,
):
    """Verify character n-gram (1 to 2) features of a training message."""
    char_ngram_config = {
        "min_ngram": 1,
        "max_ngram": 2,
        "analyzer": "char",
    }
    featurizer = create_featurizer(char_ngram_config)

    message = Message(data={TEXT: sentence})
    whitespace_tokenizer.process([message])

    training_data = TrainingData([message])
    featurizer.train(training_data)
    featurizer.process_training_data(training_data)

    seq_feats, sen_feats = message.get_sparse_features(TEXT, [])
    seq_feats = seq_feats.features if seq_feats else seq_feats
    sen_feats = sen_feats.features if sen_feats else sen_feats

    assert np.all(seq_feats.toarray()[0] == expected)
    assert sen_feats is not None
def test_text_featurizer_using_pos(sentence, expected, spacy_nlp):
    """Verify the "pos"/"pos2" features of LexicalSyntacticFeaturizer."""
    featurizer = LexicalSyntacticFeaturizer({"features": [["pos", "pos2"]]})

    train_message = Message(data={TEXT: sentence})
    test_message = Message(data={TEXT: sentence})
    messages = (train_message, test_message)

    # Each message gets its own spaCy doc before being tokenized.
    for msg in messages:
        msg.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))
    for msg in messages:
        SpacyTokenizer().process(msg)

    featurizer.train(TrainingData([train_message]))
    featurizer.process(test_message)

    seq_feats, sen_feats = test_message.get_sparse_features(TEXT, [])
    seq_feats = seq_feats.features if seq_feats else seq_feats
    sen_feats = sen_feats.features if sen_feats else sen_feats

    assert isinstance(seq_feats, scipy.sparse.coo_matrix)
    assert sen_feats is None

    assert np.all(seq_feats.toarray() == expected)
Beispiel #6
0
def print_message(message: Message) -> None:
    """Print the message's NLU dict together with its TEXT features.

    Dense and sparse features are added under the "dense" and "sparse" keys,
    tokens are reduced to their text, and "id" entries are dropped from the
    intent and intent ranking before printing.
    """
    features = {**message.as_dict_nlu()}

    dense_seq, dense_sen = message.get_dense_features(TEXT)
    features["dense"] = {
        "sequence": dense_message(dense_seq.features) if dense_seq else None,
        "sentence": dense_message(dense_sen.features) if dense_sen else None,
    }

    sparse_seq, sparse_sen = message.get_sparse_features(TEXT)
    features["sparse"] = {
        "sequence": sparse_message(sparse_seq.features) if sparse_seq else None,
        "sentence": sparse_message(sparse_sen.features) if sparse_sen else None,
    }

    if "text_tokens" in features:
        features["text_tokens"] = [
            token.text for token in features["text_tokens"]
        ]

    if "intent" in features:
        features["intent"] = {
            key: value
            for key, value in features["intent"].items()
            if key != "id"
        }

    if "intent_ranking" in features:
        features["intent_ranking"] = [
            {key: value for key, value in ranked.items() if key != "id"}
            for ranked in features["intent_ranking"]
        ]

    if "diagnostic_data" in features:
        features["diagnostic_data"] = {
            component: {
                key: dense_message(value) for key, value in entries.items()
            }
            for component, entries in features["diagnostic_data"].items()
        }

    print(features)
Beispiel #7
0
def test_get_sparse_features(
    features: Optional[List[Features]],
    attribute: Text,
    featurizers: List[Text],
    expected_seq_features: Optional[List[Features]],
    expected_sen_features: Optional[List[Features]],
):
    """Verify `Message.get_sparse_features` for an attribute and featurizers."""
    message = Message(
        data={TEXT: "This is a test sentence."}, features=features
    )

    seq_result, sen_result = message.get_sparse_features(attribute, featurizers)
    seq_result = seq_result.features if seq_result else seq_result
    sen_result = sen_result.features if sen_result else sen_result

    # Sequence features are checked first, then sentence features.
    comparisons = (
        (seq_result, expected_seq_features),
        (sen_result, expected_sen_features),
    )
    for actual, wanted in comparisons:
        if wanted is None:
            assert actual is None
            continue
        assert actual is not None
        assert np.all(actual.toarray() == wanted)
def test_count_vector_featurizer(
    sentence: Text,
    expected: List[List[int]],
    expected_cls: List[List[int]],
    create_featurizer: Callable[..., CountVectorsFeaturizerGraphComponent],
    whitespace_tokenizer: WhitespaceTokenizerGraphComponent,
):
    """Train on one message and verify the features of an identical test message."""
    featurizer = create_featurizer()

    train_message = Message(data={TEXT: sentence})
    test_message = Message(data={TEXT: sentence})

    for msg in (train_message, test_message):
        whitespace_tokenizer.process([msg])

    featurizer.train(TrainingData([train_message]))
    featurizer.process([test_message])

    seq_feats, sen_feats = test_message.get_sparse_features(TEXT, [])
    seq_feats = seq_feats.features if seq_feats else seq_feats
    sen_feats = sen_feats.features if sen_feats else sen_feats

    assert isinstance(seq_feats, scipy.sparse.coo_matrix)
    assert isinstance(sen_feats, scipy.sparse.coo_matrix)

    seq_dense = seq_feats.toarray()
    sen_dense = sen_feats.toarray()

    assert np.all(seq_dense[0] == expected)
    assert np.all(sen_dense[-1] == expected_cls)
Beispiel #9
0
def test_count_vector_featurizer(sentence, expected, expected_cls):
    """Train on one message and verify the features of an identical test message."""
    featurizer = CountVectorsFeaturizer()

    train_message = Message(data={TEXT: sentence})
    test_message = Message(data={TEXT: sentence})

    # A fresh tokenizer instance is used for each message.
    for msg in (train_message, test_message):
        WhitespaceTokenizer().process(msg)

    featurizer.train(TrainingData([train_message]))
    featurizer.process(test_message)

    seq_feats, sen_feats = test_message.get_sparse_features(TEXT, [])
    seq_feats = seq_feats.features if seq_feats else seq_feats
    sen_feats = sen_feats.features if sen_feats else sen_feats

    assert isinstance(seq_feats, scipy.sparse.coo_matrix)
    assert isinstance(sen_feats, scipy.sparse.coo_matrix)

    seq_dense = seq_feats.toarray()
    sen_dense = sen_feats.toarray()

    assert np.all(seq_dense[0] == expected)
    assert np.all(sen_dense[-1] == expected_cls)
def test_text_featurizer(sentence, expected_features):
    """Verify LexicalSyntacticFeaturizer output for a three-position feature config.

    The sequence features are compared with all but the last row of
    ``expected_features``; no sentence-level features are expected.
    """
    feature_config = {
        "features": [
            ["BOS", "upper"],
            ["BOS", "EOS", "prefix2", "digit"],
            ["EOS", "low"],
        ]
    }
    featurizer = LexicalSyntacticFeaturizer(feature_config)

    train_message = Message(data={TEXT: sentence})
    test_message = Message(data={TEXT: sentence})

    # A fresh tokenizer instance is used for each message.
    for msg in (train_message, test_message):
        WhitespaceTokenizer().process(msg)

    featurizer.train(TrainingData([train_message]))
    featurizer.process(test_message)

    seq_feats, sen_feats = test_message.get_sparse_features(TEXT, [])
    seq_feats = seq_feats.features if seq_feats else seq_feats
    sen_feats = sen_feats.features if sen_feats else sen_feats

    assert isinstance(seq_feats, scipy.sparse.coo_matrix)
    assert sen_feats is None

    assert np.all(seq_feats.toarray() == expected_features[:-1])
Beispiel #11
0
def test_count_vector_featurizer_using_tokens(tokens, expected):
    """Verify that count vectors are built from the tokens set on a message.

    The message text is left empty so the features can only come from the
    pre-set tokens, not from the text itself.
    """
    featurizer = CountVectorsFeaturizer()

    preset_tokens = [Token(text, 0) for text in tokens]

    train_message = Message(data={TEXT: ""})
    train_message.set(TOKENS_NAMES[TEXT], preset_tokens)

    featurizer.train(TrainingData([train_message]))

    test_message = Message(data={TEXT: ""})
    test_message.set(TOKENS_NAMES[TEXT], preset_tokens)

    featurizer.process(test_message)

    # The assertions inspect the training message.
    seq_feats, sen_feats = train_message.get_sparse_features(TEXT, [])
    seq_feats = seq_feats.features if seq_feats else seq_feats
    sen_feats = sen_feats.features if sen_feats else sen_feats

    assert np.all(seq_feats.toarray()[0] == expected)
    assert sen_feats is not None
Beispiel #12
0
def test_count_vector_featurizer_use_lemma(
    spacy_nlp: Any,
    sentence: Text,
    sequence_features: List[List[int]],
    sentence_features: List[List[int]],
    use_lemma: bool,
):
    """Verify CountVectorsFeaturizer output for the given `use_lemma` setting."""
    featurizer = CountVectorsFeaturizer({"use_lemma": use_lemma})

    train_message = Message(data={TEXT: sentence})
    train_message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))
    test_message = Message(data={TEXT: sentence})
    test_message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))

    # A fresh tokenizer instance is used for each message.
    for msg in (train_message, test_message):
        SpacyTokenizer().process(msg)

    featurizer.train(TrainingData([train_message]))
    featurizer.process(test_message)

    seq_feats, sen_feats = test_message.get_sparse_features(TEXT, [])

    assert isinstance(seq_feats.features, scipy.sparse.coo_matrix)
    assert isinstance(sen_feats.features, scipy.sparse.coo_matrix)

    seq_dense = seq_feats.features.toarray()
    sen_dense = sen_feats.features.toarray()

    assert np.all(seq_dense[0] == sequence_features)
    assert np.all(sen_dense[-1] == sentence_features)
Beispiel #13
0
def test_count_vector_featurizer_oov_words(sentence, expected):
    """Verify features when both an OOV token and OOV words are configured."""
    oov_config = {
        "OOV_token": "__oov__",
        "OOV_words": ["oov_word0", "OOV_word1"],
        "additional_vocabulary_size": {"text": 0},
    }
    featurizer = CountVectorsFeaturizer(oov_config)

    train_message = Message(data={TEXT: sentence})
    WhitespaceTokenizer().process(train_message)

    featurizer.train(TrainingData([train_message]))

    test_message = Message(data={TEXT: sentence})
    featurizer.process(test_message)

    # The assertions inspect the training message.
    seq_feats, sen_feats = train_message.get_sparse_features(TEXT, [])
    seq_feats = seq_feats.features if seq_feats else seq_feats
    sen_feats = sen_feats.features if sen_feats else sen_feats

    assert np.all(seq_feats.toarray()[0] == expected)
    assert sen_feats is not None
Beispiel #14
0
def test_count_vector_featurizer_char(sentence, expected):
    """Verify character n-gram (1 to 2) features of a training message."""
    char_ngram_config = {
        "min_ngram": 1,
        "max_ngram": 2,
        "analyzer": "char",
        "additional_vocabulary_size": {"text": 0},
    }
    featurizer = CountVectorsFeaturizer(char_ngram_config)

    train_message = Message(data={TEXT: sentence})
    WhitespaceTokenizer().process(train_message)

    featurizer.train(TrainingData([train_message]))

    test_message = Message(data={TEXT: sentence})
    WhitespaceTokenizer().process(test_message)
    featurizer.process(test_message)

    # The assertions inspect the training message.
    seq_feats, sen_feats = train_message.get_sparse_features(TEXT, [])
    seq_feats = seq_feats.features if seq_feats else seq_feats
    sen_feats = sen_feats.features if sen_feats else sen_feats

    assert np.all(seq_feats.toarray()[0] == expected)
    assert sen_feats is not None
def test_count_vector_featurizer_attribute_featurization(
    sentence: Text,
    intent: Text,
    response: Optional[Text],
    intent_features: List[List[int]],
    response_features: Optional[List[List[int]]],
    create_featurizer: Callable[..., CountVectorsFeaturizerGraphComponent],
    whitespace_tokenizer: WhitespaceTokenizerGraphComponent,
):
    """Check sparse INTENT and RESPONSE features of a single trained message.

    Args:
        sentence: Text of the training message.
        intent: Intent label set on the message.
        response: Response set on the message (may be ``None``).
        intent_features: Expected first row of the intent sequence features;
            a falsy value means no intent features are expected.
        response_features: Expected first row of the response sequence
            features; a falsy value means no response features are expected.
        create_featurizer: Factory returning the featurizer under test.
        whitespace_tokenizer: Tokenizer applied to the training data.
    """
    ftr = create_featurizer()

    train_message = Message(data={TEXT: sentence})
    # this is needed for a valid training example
    train_message.set(INTENT, intent)
    train_message.set(RESPONSE, response)

    data = TrainingData([train_message])

    whitespace_tokenizer.process_training_data(data)
    ftr.train(data)
    ftr.process_training_data(data)

    intent_seq_vecs, intent_sen_vecs = train_message.get_sparse_features(INTENT, [])
    if intent_seq_vecs:
        intent_seq_vecs = intent_seq_vecs.features
    if intent_sen_vecs:
        intent_sen_vecs = intent_sen_vecs.features
    response_seq_vecs, response_sen_vecs = train_message.get_sparse_features(
        RESPONSE, []
    )
    if response_seq_vecs:
        response_seq_vecs = response_seq_vecs.features
    if response_sen_vecs:
        response_sen_vecs = response_sen_vecs.features

    if intent_features:
        # np.all reduces the element-wise comparison to one boolean; a bare
        # array in `assert` raises ValueError for multi-element vectors.
        assert np.all(intent_seq_vecs.toarray()[0] == intent_features)
        assert intent_sen_vecs is None
    else:
        assert intent_seq_vecs is None
        assert intent_sen_vecs is None

    if response_features:
        assert np.all(response_seq_vecs.toarray()[0] == response_features)
        assert response_sen_vecs is not None
    else:
        assert response_seq_vecs is None
        assert response_sen_vecs is None
Beispiel #16
0
def fetch_sparse_features(txt, tokenizer, featurizer):
    """Tokenize and featurize ``txt`` and return its sequence features densely.

    Args:
        txt: Text to build the message from.
        tokenizer: Object whose ``process(message)`` is called first.
        featurizer: Object that is trained on the single message and then
            asked to process it.

    Returns:
        The result of ``toarray()`` on the sequence-level sparse TEXT
        features of the message.
    """
    # Build the message from the caller-supplied text so that the `txt`
    # argument is honoured instead of a fixed sentence.
    message = Message({TEXT: txt})
    tokenizer.process(message)
    featurizer.train(TrainingData([message]))
    featurizer.process(message)
    # Only the sequence-level features are returned.
    seq_vecs, _ = message.get_sparse_features(TEXT, [])
    return seq_vecs.features.toarray()
Beispiel #17
0
def test_count_vector_featurizer_shared_vocab(
    sentence: Text,
    intent: Text,
    response: Text,
    text_features: List[List[int]],
    intent_features: List[List[int]],
    response_features: List[List[int]],
    create_featurizer: Callable[..., CountVectorsFeaturizer],
    whitespace_tokenizer: WhitespaceTokenizer,
):
    """Verify TEXT, INTENT and RESPONSE features with `use_shared_vocab` enabled."""
    featurizer = create_featurizer({"use_shared_vocab": True})

    message = Message(data={TEXT: sentence})
    # Intent and response are required for a valid training example.
    message.set(INTENT, intent)
    message.set(RESPONSE, response)

    training_data = TrainingData([message])
    whitespace_tokenizer.process_training_data(training_data)
    featurizer.train(training_data)
    featurizer.process_training_data(training_data)

    # (attribute, expected first sequence row, sentence features expected?)
    checks = (
        (TEXT, text_features, True),
        (INTENT, intent_features, False),
        (RESPONSE, response_features, True),
    )
    for attribute, expected_row, has_sentence_features in checks:
        seq_feats, sen_feats = message.get_sparse_features(attribute, [])
        seq_feats = seq_feats.features if seq_feats else seq_feats
        sen_feats = sen_feats.features if sen_feats else sen_feats

        assert np.all(seq_feats.toarray()[0] == expected_row)
        if has_sentence_features:
            assert sen_feats is not None
        else:
            assert sen_feats is None
Beispiel #18
0
def test_count_vector_featurizer_shared_vocab(sentence, intent, response,
                                              text_features, intent_features,
                                              response_features):
    """Verify TEXT, INTENT and RESPONSE features with `use_shared_vocab` enabled."""
    shared_vocab_config = {
        "use_shared_vocab": True,
        "additional_vocabulary_size": {"text": 0, "response": 0},
    }
    featurizer = CountVectorsFeaturizer(shared_vocab_config)
    tokenizer = WhitespaceTokenizer()

    message = Message(data={TEXT: sentence})
    # Intent and response are required for a valid training example.
    message.set(INTENT, intent)
    message.set(RESPONSE, response)

    training_data = TrainingData([message])
    tokenizer.train(training_data)
    featurizer.train(training_data)

    # (attribute, expected first sequence row, sentence features expected?)
    checks = (
        (TEXT, text_features, True),
        (INTENT, intent_features, False),
        (RESPONSE, response_features, True),
    )
    for attribute, expected_row, has_sentence_features in checks:
        seq_feats, sen_feats = message.get_sparse_features(attribute, [])
        seq_feats = seq_feats.features if seq_feats else seq_feats
        sen_feats = sen_feats.features if sen_feats else sen_feats

        assert np.all(seq_feats.toarray()[0] == expected_row)
        if has_sentence_features:
            assert sen_feats is not None
        else:
            assert sen_feats is None
Beispiel #19
0
def test_count_vector_featurizer_attribute_featurization(
    sentence, intent, response, intent_features, response_features
):
    """Check sparse INTENT and RESPONSE features of a single trained message.

    Args:
        sentence: Text of the training message.
        intent: Intent label set on the message.
        response: Response set on the message.
        intent_features: Expected first row of the intent sequence features;
            a falsy value means no intent features are expected.
        response_features: Expected first row of the response sequence
            features; a falsy value means no response features are expected.
    """
    ftr = CountVectorsFeaturizer()
    tk = WhitespaceTokenizer()

    train_message = Message(data={TEXT: sentence})
    # this is needed for a valid training example
    train_message.set(INTENT, intent)
    train_message.set(RESPONSE, response)

    data = TrainingData([train_message])

    tk.train(data)
    ftr.train(data)

    intent_seq_vecs, intent_sen_vecs = train_message.get_sparse_features(INTENT, [])
    if intent_seq_vecs:
        intent_seq_vecs = intent_seq_vecs.features
    if intent_sen_vecs:
        intent_sen_vecs = intent_sen_vecs.features
    response_seq_vecs, response_sen_vecs = train_message.get_sparse_features(
        RESPONSE, []
    )
    if response_seq_vecs:
        response_seq_vecs = response_seq_vecs.features
    if response_sen_vecs:
        response_sen_vecs = response_sen_vecs.features

    if intent_features:
        # np.all reduces the element-wise comparison to one boolean; a bare
        # array in `assert` raises ValueError for multi-element vectors.
        assert np.all(intent_seq_vecs.toarray()[0] == intent_features)
        assert intent_sen_vecs is None
    else:
        assert intent_seq_vecs is None
        assert intent_sen_vecs is None

    if response_features:
        assert np.all(response_seq_vecs.toarray()[0] == response_features)
        assert response_sen_vecs is not None
    else:
        assert response_seq_vecs is None
        assert response_sen_vecs is None
    def _get_sentence_features(message: Message) -> scipy.sparse.spmatrix:
        """Return the sparse sentence-level TEXT features of ``message``.

        A warning is raised when dense sentence features are present, because
        they are not used here.

        Raises:
            ValueError: If the message has no sparse sentence features.
        """
        _, dense_sentence = message.get_dense_features(TEXT)
        if dense_sentence is not None:
            rasa.shared.utils.io.raise_warning(
                "Dense features are being computed but not used in "
                "the SparseNaiveBayesIntentClassifier.")

        _, sparse_sentence = message.get_sparse_features(TEXT)
        if sparse_sentence is None:
            raise ValueError("No sparse sentence features present. "
                             "Not able to train sklearn intent classifier.")

        return sparse_sentence.features
def test_text_featurizer_using_pos_with_action_text(sentence: Text,
                                                    expected: np.ndarray,
                                                    spacy_nlp):
    """Verify that TEXT gets "pos" features while ACTION_TEXT gets none."""
    featurizer = LexicalSyntacticFeaturizer({"features": [["pos", "pos2"]]})

    train_message = Message(data={TEXT: sentence, ACTION_TEXT: sentence})
    test_message = Message(data={TEXT: sentence, ACTION_TEXT: sentence})
    messages = (train_message, test_message)

    # Attach a spaCy doc for both attributes, then tokenize each message.
    for msg in messages:
        msg.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))
        msg.set(SPACY_DOCS[ACTION_TEXT], spacy_nlp(sentence))
    for msg in messages:
        SpacyTokenizer().process(msg)

    featurizer.train(TrainingData([train_message]))

    # The text attribute must be featurized as expected.
    featurizer.process(test_message)

    seq_feats, sen_feats = test_message.get_sparse_features(TEXT, [])
    seq_feats = seq_feats.features if seq_feats else seq_feats
    sen_feats = sen_feats.features if sen_feats else sen_feats

    assert isinstance(seq_feats, scipy.sparse.coo_matrix)
    assert sen_feats is None

    assert np.all(seq_feats.toarray() == expected)

    # The action text must stay unfeaturized after processing again.
    featurizer.process(test_message)

    action_seq, action_sen = test_message.get_sparse_features(ACTION_TEXT, [])

    assert action_seq is None
    assert action_sen is None
Beispiel #22
0
def fetch_sparse_features(txt, tokenizer, featurizer):
    """Tokenize and featurize ``txt`` and return its sequence features densely.

    The featurizer is trained on the single message built from ``txt`` before
    processing it; the result of ``toarray()`` on the sequence-level sparse
    TEXT features is returned.
    """
    message = Message({TEXT: txt})
    tokenizer.process(message)
    featurizer.train(TrainingData([message]))
    featurizer.process(message)

    sequence, pooled = message.get_sparse_features(TEXT, [])
    sequence = sequence.features if sequence else sequence
    pooled = pooled.features if pooled else pooled

    return sequence.toarray()
Beispiel #23
0
def test_count_vector_featurizer_process_by_attribute(
    sentence: Text,
    action_name: Text,
    action_text: Text,
    action_name_features: np.ndarray,
    response_features: np.ndarray,
):
    """Check the sparse ACTION_NAME features of a processed test message.

    The featurizer is trained on two messages, one carrying an action name
    and one carrying an action text, and then applied to a test message that
    has both attributes set.

    Args:
        sentence: Text of the test message.
        action_name: Action name set on the test message.
        action_text: Action text set on the test message.
        action_name_features: Expected first row of the action-name sequence
            features.
        response_features: Not used by this test body.
    """
    ftr = CountVectorsFeaturizer({
        "token_pattern": r"(?u)\b\w+\b",
        "additional_vocabulary_size": {
            "text": 0,
            "response": 0,
            "action_text": 0
        },
    })
    tk = WhitespaceTokenizer()

    # train on one example with an action name and one with an action text
    train_message = Message(data={TEXT: "hello"})
    train_message.set(ACTION_NAME, "greet")

    train_message1 = Message(data={TEXT: "hello"})
    train_message1.set(ACTION_TEXT, "hi")

    data = TrainingData([train_message, train_message1])

    tk.train(data)
    ftr.train(data)

    test_message = Message(data={TEXT: sentence})
    test_message.set(ACTION_NAME, action_name)
    test_message.set(ACTION_TEXT, action_text)

    for module in [tk, ftr]:
        module.process(test_message)

    action_name_seq_vecs, action_name_sen_vecs = test_message.get_sparse_features(
        ACTION_NAME, [])
    if action_name_seq_vecs:
        action_name_seq_vecs = action_name_seq_vecs.features
    if action_name_sen_vecs:
        action_name_sen_vecs = action_name_sen_vecs.features

    # np.all reduces the element-wise comparison to one boolean; a bare
    # array in `assert` raises ValueError for multi-element vectors.
    assert np.all(action_name_seq_vecs.toarray()[0] == action_name_features)
    assert action_name_sen_vecs is None
Beispiel #24
0
def test_count_vector_featurizer_oov_token(sentence, expected):
    """Verify features when an OOV token is configured."""
    featurizer = CountVectorsFeaturizer({"OOV_token": "__oov__"})

    train_message = Message(data={TEXT: sentence})
    WhitespaceTokenizer().process(train_message)

    featurizer.train(TrainingData([train_message]))

    test_message = Message(data={TEXT: sentence})
    featurizer.process(test_message)

    # The assertions inspect the training message.
    seq_feats, sen_feats = train_message.get_sparse_features(TEXT, [])
    seq_feats = seq_feats.features if seq_feats else seq_feats
    sen_feats = sen_feats.features if sen_feats else sen_feats

    assert np.all(seq_feats.toarray()[0] == expected)
    assert sen_feats is not None
Beispiel #25
0
def test_count_vector_featurizer_process_by_attribute(
    sentence: Text,
    action_name: Text,
    action_text: Text,
    action_name_features: np.ndarray,
    response_features: np.ndarray,
    create_featurizer: Callable[..., CountVectorsFeaturizer],
    whitespace_tokenizer: WhitespaceTokenizer,
):
    """Check the sparse ACTION_NAME features of a processed test message.

    The featurizer is trained on two messages, one carrying an action name
    and one carrying an action text, and then applied to a test message that
    has both attributes set.

    Args:
        sentence: Text of the test message.
        action_name: Action name set on the test message.
        action_text: Action text set on the test message.
        action_name_features: Expected first row of the action-name sequence
            features.
        response_features: Not used by this test body.
        create_featurizer: Factory returning the featurizer under test.
        whitespace_tokenizer: Tokenizer applied to training and test messages.
    """
    ftr = create_featurizer({
        "token_pattern": r"(?u)\b\w+\b",
    })

    # train on one example with an action name and one with an action text
    train_message = Message(data={TEXT: "hello"})
    train_message.set(ACTION_NAME, "greet")

    train_message1 = Message(data={TEXT: "hello"})
    train_message1.set(ACTION_TEXT, "hi")

    data = TrainingData([train_message, train_message1])

    whitespace_tokenizer.process_training_data(data)
    ftr.train(data)

    test_message = Message(data={TEXT: sentence})
    test_message.set(ACTION_NAME, action_name)
    test_message.set(ACTION_TEXT, action_text)

    whitespace_tokenizer.process([test_message])
    ftr.process([test_message])

    action_name_seq_vecs, action_name_sen_vecs = test_message.get_sparse_features(
        ACTION_NAME, [])
    if action_name_seq_vecs:
        action_name_seq_vecs = action_name_seq_vecs.features
    if action_name_sen_vecs:
        action_name_sen_vecs = action_name_sen_vecs.features

    # np.all reduces the element-wise comparison to one boolean; a bare
    # array in `assert` raises ValueError for multi-element vectors.
    assert np.all(action_name_seq_vecs.toarray()[0] == action_name_features)
    assert action_name_sen_vecs is None
Beispiel #26
0
def test_count_vector_featurizer_use_lemma(
    spacy_nlp: Any,
    sentence: Text,
    sequence_features: List[List[int]],
    sentence_features: List[List[int]],
    use_lemma: bool,
    create_featurizer: Callable[..., CountVectorsFeaturizer],
    load_featurizer: Callable[..., CountVectorsFeaturizer],
    spacy_tokenizer: SpacyTokenizer,
):
    """Verify features for the given `use_lemma` setting and OOV words after loading."""
    config = {
        "use_lemma": use_lemma,
        "OOV_words": ["drinks"],
        "OOV_token": "OOV",
    }
    featurizer = create_featurizer(config)

    train_message = Message(data={TEXT: sentence})
    train_message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))
    test_message = Message(data={TEXT: sentence})
    test_message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))

    for msg in (train_message, test_message):
        spacy_tokenizer.process([msg])

    featurizer.train(
        TrainingData([train_message]), model=SpacyModel(spacy_nlp, "en")
    )
    featurizer.process([test_message])

    seq_feats, sen_feats = test_message.get_sparse_features(TEXT, [])

    assert isinstance(seq_feats.features, scipy.sparse.coo_matrix)
    assert isinstance(sen_feats.features, scipy.sparse.coo_matrix)

    seq_dense = seq_feats.features.toarray()
    sen_dense = sen_feats.features.toarray()

    assert np.all(seq_dense[0] == sequence_features)
    assert np.all(sen_dense[-1] == sentence_features)

    # A featurizer loaded with the same config must expose the same OOV words.
    reloaded = load_featurizer(config)
    assert reloaded.OOV_words == featurizer.OOV_words
def test_count_vector_featurizer_oov_token(
    sentence: Text,
    expected: List[List[int]],
    create_featurizer: Callable[..., CountVectorsFeaturizerGraphComponent],
    whitespace_tokenizer: WhitespaceTokenizerGraphComponent,
):
    """Verify features of a training message when an OOV token is configured."""
    featurizer = create_featurizer({"OOV_token": "__oov__"})

    message = Message(data={TEXT: sentence})
    whitespace_tokenizer.process([message])

    training_data = TrainingData([message])
    featurizer.train(training_data)
    featurizer.process_training_data(training_data)

    seq_feats, sen_feats = message.get_sparse_features(TEXT, [])
    seq_feats = seq_feats.features if seq_feats else seq_feats
    sen_feats = sen_feats.features if sen_feats else sen_feats

    assert np.all(seq_feats.toarray()[0] == expected)
    assert sen_feats is not None
Beispiel #28
0
def test_incremental_train_featurization(tmp_path: Path):
    """Verify RegexFeaturizer features before and after fine-tuning with extra patterns.

    The featurizer is trained with three patterns, persisted, reloaded with
    ``should_finetune=True`` and trained again with the original patterns
    plus two more, one of which reuses the name "hello".
    """
    base_patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]

    featurizer = RegexFeaturizer.create(
        {"number_additional_patterns": 5}, RasaNLUModelConfig()
    )

    sentence = "hey how are you today 19.12.2019 ?"
    message = Message(data={TEXT: sentence})
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")
    WhitespaceTokenizer().train(TrainingData([message]))

    featurizer.train(
        TrainingData([message], regex_features=base_patterns),
        RasaNLUModelConfig(),
    )

    # First round: check the features produced by the initial training.
    first_token_row = np.array([0, 1, 0, 0, 0, 0, 0, 0])
    sentence_row = np.array([1, 1, 1, 0, 0, 0, 0, 0])

    seq_feats, sen_feats = message.get_sparse_features(TEXT, [])
    seq_feats = seq_feats.features if seq_feats else seq_feats
    sen_feats = sen_feats.features if sen_feats else sen_feats

    assert seq_feats.shape == (6, 8)
    assert sen_feats.shape == (1, 8)
    assert np.all(seq_feats.toarray()[0] == first_token_row)
    assert np.all(sen_feats.toarray()[-1] == sentence_row)

    persisted = featurizer.persist("ftr", str(tmp_path))
    load_meta = {
        "number_additional_patterns": 5,
        "file": persisted["file"],
    }
    finetuned_featurizer = RegexFeaturizer.load(
        meta=load_meta,
        should_finetune=True,
        model_dir=str(tmp_path),
    )

    extra_patterns = [
        {"pattern": "\\btoday*", "name": "day", "usage": "intent"},
        {"pattern": "\\bhey+", "name": "hello", "usage": "intent"},
    ]

    message = Message(data={TEXT: sentence})
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")
    WhitespaceTokenizer().train(TrainingData([message]))

    finetuned_featurizer.train(
        TrainingData([message], regex_features=base_patterns + extra_patterns),
        RasaNLUModelConfig(),
    )

    # Second round: the additional pattern must show up in the features too.
    first_token_row = np.array([0, 1, 0, 0, 0, 0, 0, 0])
    second_to_last_row = np.array([0, 0, 0, 1, 0, 0, 0, 0])
    sentence_row = np.array([1, 1, 1, 1, 0, 0, 0, 0])

    seq_feats, sen_feats = message.get_sparse_features(TEXT, [])
    seq_feats = seq_feats.features if seq_feats else seq_feats
    sen_feats = sen_feats.features if sen_feats else sen_feats

    assert seq_feats.shape == (6, 8)
    assert sen_feats.shape == (1, 8)
    assert np.all(seq_feats.toarray()[0] == first_token_row)
    assert np.all(seq_feats.toarray()[-2] == second_to_last_row)
    assert np.all(sen_feats.toarray()[-1] == sentence_row)

    # The "hello" pattern was redefined; only the new definition may remain.
    hello_patterns = [
        known
        for known in finetuned_featurizer.known_patterns
        if known["name"] == "hello"
    ]
    assert hello_patterns == [extra_patterns[1]]
Beispiel #29
0
def test_regex_featurizer_train():
    """Train a RegexFeaturizer on one message and check its sparse features.

    The featurizer is created with ``number_additional_patterns`` set to 0
    and trained with three intent patterns. TEXT and RESPONSE carry the same
    sentence and are expected to yield identical features; INTENT is
    expected to yield no sparse features at all.
    """
    regex_patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]

    def sparse_matrices(attribute):
        # Fetch the (sequence, sentence) sparse features of `attribute` and
        # unwrap `.features` from every entry that is truthy; falsy entries
        # are handed back untouched.
        sequence, sentence_level = message.get_sparse_features(attribute, [])
        return (
            sequence.features if sequence else sequence,
            sentence_level.features if sentence_level else sentence_level,
        )

    featurizer = RegexFeaturizer.create(
        {"number_additional_patterns": 0}, RasaNLUModelConfig()
    )

    sentence = "hey how are you today 19.12.2019 ?"
    message = Message(data={TEXT: sentence})
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")
    WhitespaceTokenizer().train(TrainingData([message]))

    training_data = TrainingData([message], regex_features=regex_patterns)
    featurizer.train(training_data, RasaNLUModelConfig())

    # One column per pattern. Row 0 is presumably the token "hey", which
    # only the "hello" pattern matches.
    expected_first_row = np.array([0, 1, 0])
    expected_sentence_row = np.array([1, 1, 1])

    # TEXT and RESPONSE hold the same sentence, so the same expectations
    # apply to both attributes.
    for attribute in (TEXT, RESPONSE):
        token_matrix, sentence_matrix = sparse_matrices(attribute)

        assert token_matrix.shape == (6, 3)
        assert sentence_matrix.shape == (1, 3)

        first_row = token_matrix.toarray()[0]
        last_sentence_row = sentence_matrix.toarray()[-1]
        assert np.all(first_row == expected_first_row)
        assert np.all(last_sentence_row == expected_sentence_row)

    # No sparse features are expected for the intent attribute.
    intent_token_matrix, intent_sentence_matrix = sparse_matrices(INTENT)
    assert intent_token_matrix is None
    assert intent_sentence_matrix is None
Beispiel #30
0
def test_count_vector_featurizer_persist_load(tmp_path: Path):
    """Check that a persisted CountVectorsFeaturizer loads back unchanged.

    A featurizer with non-default configuration is trained on two messages,
    persisted into ``tmp_path`` and loaded again. The loaded vectorizer
    parameters (including the trained vocabulary) and the sparse TEXT
    features produced for the same sentences must equal those of the
    trained instance.
    """

    def tokenized_message(text):
        # Build a message for `text` and tokenize it with a fresh tokenizer.
        msg = Message(data={TEXT: text})
        WhitespaceTokenizer().process(msg)
        return msg

    def vectorizer_params(ftr):
        # Collect the parameters of every vectorizer of `ftr`, by attribute.
        return {
            attribute: vectorizer.get_params()
            for attribute, vectorizer in ftr.vectorizers.items()
        }

    def text_matrices(msg):
        # Fetch the (sequence, sentence) sparse TEXT features of `msg` and
        # unwrap `.features` from every entry that is truthy.
        sequence, sentence_level = msg.get_sparse_features(TEXT, [])
        return (
            sequence.features if sequence else sequence,
            sentence_level.features if sentence_level else sentence_level,
        )

    # deliberately use non-default values for the configuration
    config = {
        "analyzer": "char",
        "strip_accents": "ascii",
        "stop_words": "stop",
        "min_df": 2,
        "max_df": 3,
        "min_ngram": 2,
        "max_ngram": 3,
        "max_features": 10,
        "lowercase": False,
    }
    train_ftr = CountVectorsFeaturizer(config)

    sentence1 = "ababab 123 13xc лаомтгцу sfjv oö aà"
    sentence2 = "abababalidcn 123123 13xcdc лаомтгцу sfjv oö aà"

    train_message1 = tokenized_message(sentence1)
    train_message2 = tokenized_message(sentence2)
    train_ftr.train(TrainingData([train_message1, train_message2]))

    # persist the trained featurizer
    file_dict = train_ftr.persist("ftr", str(tmp_path))

    # parameters of the trained vectorizers, extended with the trained
    # vocabulary wherever one exists
    expected_params = vectorizer_params(train_ftr)
    for attribute, params in expected_params.items():
        trained_vectorizer = train_ftr.vectorizers[attribute]
        if hasattr(trained_vectorizer, "vocabulary_"):
            params["vocabulary"] = trained_vectorizer.vocabulary_

    # load the featurizer back from disk
    meta = train_ftr.component_config.copy()
    meta.update(file_dict)
    test_ftr = CountVectorsFeaturizer.load(meta, str(tmp_path), finetune_mode=False)

    assert expected_params == vectorizer_params(test_ftr)

    # check that the vocabulary was loaded correctly
    assert hasattr(test_ftr.vectorizers[TEXT], "vocabulary_")

    # featurize fresh copies of both sentences with the loaded featurizer
    loaded_messages = []
    for text in (sentence1, sentence2):
        loaded_message = tokenized_message(text)
        test_ftr.process(loaded_message)
        loaded_messages.append(loaded_message)

    # check that train features and test features after loading are the same
    trained_messages = (train_message1, train_message2)
    for loaded_message, trained_message in zip(loaded_messages, trained_messages):
        loaded_seq, loaded_sen = text_matrices(loaded_message)
        trained_seq, trained_sen = text_matrices(trained_message)
        assert np.all(loaded_seq.toarray() == trained_seq.toarray())
        assert np.all(loaded_sen.toarray() == trained_sen.toarray())