Example #1
def test_count_vector_featurizer_shared_vocab(sentence, intent, response,
                                              text_features, intent_features,
                                              response_features):
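    # With `use_shared_vocab` enabled, TEXT, INTENT and RESPONSE are encoded
    # against one common vocabulary, so all three feature vectors are checked.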
    ftr = CountVectorsFeaturizer({
        "token_pattern": r"(?u)\b\w+\b",
        "use_shared_vocab": True
    })
    tk = WhitespaceTokenizer()

    train_message = Message(sentence)
    # this is needed for a valid training example
    train_message.set(INTENT, intent)
    train_message.set(RESPONSE, response)

    data = TrainingData([train_message])
    tk.train(data)
    ftr.train(data)

    assert np.all(
        train_message.get(SPARSE_FEATURE_NAMES[TEXT]).toarray()[0] ==
        text_features)
    assert np.all(
        train_message.get(SPARSE_FEATURE_NAMES[INTENT]).toarray()[0] ==
        intent_features)
    assert np.all(
        train_message.get(SPARSE_FEATURE_NAMES[RESPONSE]).toarray()[0] ==
        response_features)
Example #2
def test_convert_featurizer_process(monkeypatch: MonkeyPatch):
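    # The validated model URL is patched to RESTRICTED_ACCESS_URL before the
    # featurizer is built; the resulting dense features are compared against
    # hard-coded reference values.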
    tokenizer = WhitespaceTokenizer()

    monkeypatch.setattr(ConveRTFeaturizer, "_get_validated_model_url",
                        lambda x: RESTRICTED_ACCESS_URL)
    component_config = {
        "name": "ConveRTFeaturizer",
        "model_url": RESTRICTED_ACCESS_URL
    }
    featurizer = ConveRTFeaturizer(component_config)
    sentence = "Hey how are you today ?"
    message = Message.build(text=sentence)

    td = TrainingData([message])
    tokenizer.train(td)
    tokens = featurizer.tokenize(message, attribute=TEXT)

    featurizer.process(message, tf_hub_module=featurizer.module)

    expected = np.array(
        [2.2636216, -0.26475656, -1.1358104, -0.49751878, -1.3946456])
    expected_cls = np.array(
        [1.0251294, -0.04053932, -0.7018805, -0.82054937, -0.75054353])

    seq_vecs, sent_vecs = message.get_dense_features(TEXT, [])

    seq_vecs = seq_vecs.features
    sent_vecs = sent_vecs.features

    assert len(tokens) == len(seq_vecs)
    assert np.allclose(seq_vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(sent_vecs[-1][:5], expected_cls, atol=1e-5)
Example #3
def test_cvf_incremental_train_vocabulary_overflow(tmp_path: Path):
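    # Only 3 extra vocabulary slots are reserved, so incremental training on
    # data that needs more slots should warn about the vocabulary overflow.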
    additional_size = 3
    original_train_text = "hello my name is John."
    additional_train_text = "I am also new."
    tokenizer = WhitespaceTokenizer()
    original_featurizer = CountVectorsFeaturizer(
        {"additional_vocabulary_size": {
            "text": additional_size
        }},
        finetune_mode=False,
    )
    train_message = Message(data={"text": original_train_text})
    data = TrainingData([train_message])

    tokenizer.train(data)
    original_featurizer.train(data)

    file_dict = original_featurizer.persist("ftr", str(tmp_path))

    # load original_featurizer
    meta = original_featurizer.component_config.copy()
    meta.update(file_dict)
    new_featurizer = CountVectorsFeaturizer.load(meta,
                                                 str(tmp_path),
                                                 should_finetune=True)

    additional_train_message = Message(data={"text": additional_train_text})
    data = TrainingData([train_message, additional_train_message])
    tokenizer.train(data)

    with pytest.warns(UserWarning) as warning:
        new_featurizer.train(data)
    assert "New data contains vocabulary of size" in warning[0].message.args[0]
Example #4
def test_count_vector_featurizer_response_attribute_featurization(
        sentence, intent, response, intent_features, response_features):
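    # A response-bearing second example guarantees the RESPONSE vocabulary
    # exists; attributes without features must come back as None.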
    ftr = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"})
    tk = WhitespaceTokenizer()

    train_message = Message(sentence)
    # this is needed for a valid training example
    train_message.set(INTENT, intent)
    train_message.set(RESPONSE, response)

    # add a second example that has some response, so that the vocabulary for
    # response exists
    second_message = Message("hello")
    second_message.set(RESPONSE, "hi")
    second_message.set(INTENT, "greet")

    data = TrainingData([train_message, second_message])

    tk.train(data)
    ftr.train(data)

    if intent_features:
        assert (train_message.get(
            SPARSE_FEATURE_NAMES[INTENT]).toarray()[0] == intent_features)
    else:
        assert train_message.get(SPARSE_FEATURE_NAMES[INTENT]) is None

    if response_features:
        assert (train_message.get(
            SPARSE_FEATURE_NAMES[RESPONSE]).toarray()[0] == response_features)
    else:
        assert train_message.get(SPARSE_FEATURE_NAMES[RESPONSE]) is None
Example #5
def test_count_vector_featurizer_attribute_featurization(
        sentence, intent, response, intent_features, response_features):
    ftr = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"})
    tk = WhitespaceTokenizer()

    train_message = Message(sentence)
    # this is needed for a valid training example
    train_message.set(INTENT, intent)
    train_message.set(RESPONSE, response)

    data = TrainingData([train_message])

    tk.train(data)
    ftr.train(data)

    if intent_features:
        assert (train_message.get(
            SPARSE_FEATURE_NAMES[INTENT]).toarray()[0] == intent_features)
    else:
        assert train_message.get(SPARSE_FEATURE_NAMES[INTENT]) is None

    if response_features:
        assert (train_message.get(
            SPARSE_FEATURE_NAMES[RESPONSE]).toarray()[0] == response_features)
    else:
        assert train_message.get(SPARSE_FEATURE_NAMES[RESPONSE]) is None
Example #6
def test_train_tokenizer(text: Text, expected_tokens: List[Text],
                         expected_indices: List[Tuple[int]]):
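    # TEXT and RESPONSE are split into individual tokens with offsets, while
    # the INTENT attribute stays a single token by default.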
    tk = WhitespaceTokenizer()

    message = Message.build(text=text)
    message.set(RESPONSE, text)
    message.set(INTENT, text)

    training_data = TrainingData()
    training_data.training_examples = [message]

    tk.train(training_data)

    for attribute in [RESPONSE, TEXT]:
        tokens = training_data.training_examples[0].get(
            TOKENS_NAMES[attribute])

        assert [t.text for t in tokens] == expected_tokens
        assert [t.start for t in tokens] == [i[0] for i in expected_indices]
        assert [t.end for t in tokens] == [i[1] for i in expected_indices]

    # check intent attribute
    tokens = training_data.training_examples[0].get(TOKENS_NAMES[INTENT])

    assert [t.text for t in tokens] == [text]
Example #7
def test_whitespace_training(supervised_embeddings_config):
    examples = [
        Message(
            "Any Mexican restaurant will do",
            {
                "intent": "restaurant_search",
                "entities": [
                    {"start": 4, "end": 11, "value": "Mexican", "entity": "cuisine"}
                ],
            },
        ),
        Message(
            "I want Tacos!",
            {
                "intent": "restaurant_search",
                "entities": [
                    {"start": 7, "end": 12, "value": "Mexican", "entity": "cuisine"}
                ],
            },
        ),
    ]

    tk = WhitespaceTokenizer()

    tk.train(TrainingData(training_examples=examples), supervised_embeddings_config)

    assert examples[0].data.get(TOKENS_NAMES[TEXT])[0].text == "Any"
    assert examples[0].data.get(TOKENS_NAMES[TEXT])[1].text == "Mexican"
    assert examples[0].data.get(TOKENS_NAMES[TEXT])[2].text == "restaurant"
    assert examples[0].data.get(TOKENS_NAMES[TEXT])[3].text == "will"
    assert examples[0].data.get(TOKENS_NAMES[TEXT])[4].text == "do"
    assert examples[1].data.get(TOKENS_NAMES[TEXT])[0].text == "I"
    assert examples[1].data.get(TOKENS_NAMES[TEXT])[1].text == "want"
    assert examples[1].data.get(TOKENS_NAMES[TEXT])[2].text == "Tacos"
Example #8
def test_count_vector_featurizer_attribute_featurization(
        sentence, intent, response, intent_features, response_features):
    ftr = CountVectorsFeaturizer()
    tk = WhitespaceTokenizer()

    train_message = Message(sentence)
    # this is needed for a valid training example
    train_message.set(INTENT, intent)
    train_message.set(RESPONSE, response)

    data = TrainingData([train_message])

    tk.train(data)
    ftr.train(data)

    intent_seq_vecs, intent_sen_vecs = train_message.get_sparse_features(
        INTENT, [])
    response_seq_vecs, response_sen_vecs = train_message.get_sparse_features(
        RESPONSE, [])
    if intent_features:
        assert intent_seq_vecs.toarray()[0] == intent_features
        assert intent_sen_vecs is None
    else:
        assert intent_seq_vecs is None
        assert intent_sen_vecs is None

    if response_features:
        assert response_seq_vecs.toarray()[0] == response_features
        assert response_sen_vecs is not None
    else:
        assert response_seq_vecs is None
        assert response_sen_vecs is None
Example #9
def test_count_vector_featurizer_shared_vocab(sentence, intent, response,
                                              text_features, intent_features,
                                              response_features):
    ftr = CountVectorsFeaturizer({
        "token_pattern": r"(?u)\b\w+\b",
        "use_shared_vocab": True
    })
    tk = WhitespaceTokenizer()

    train_message = Message(sentence)
    # this is needed for a valid training example
    train_message.set(INTENT, intent)
    train_message.set(RESPONSE, response)

    data = TrainingData([train_message])
    tk.train(data)
    ftr.train(data)

    seq_vec, sen_vec = train_message.get_sparse_features(TEXT, [])
    assert np.all(seq_vec.toarray()[0] == text_features)
    assert sen_vec is not None
    seq_vec, sen_vec = train_message.get_sparse_features(INTENT, [])
    assert np.all(seq_vec.toarray()[0] == intent_features)
    assert sen_vec is None
    seq_vec, sen_vec = train_message.get_sparse_features(RESPONSE, [])
    assert np.all(seq_vec.toarray()[0] == response_features)
    assert sen_vec is not None
Example #10
async def test_train_persist_load_with_composite_entities(
    crf_entity_extractor: Callable[[Dict[Text, Any]],
                                   CRFEntityExtractorGraphComponent],
    default_model_storage: ModelStorage,
    default_execution_context: ExecutionContext,
):
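    # A freshly trained extractor and one loaded from storage must produce
    # messages with identical fingerprints for the same input.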
    importer = RasaFileImporter(
        training_data_paths=["data/test/demo-rasa-composite-entities.yml"])
    training_data = importer.get_nlu_data()

    tokenizer = WhitespaceTokenizer()
    tokenizer.train(training_data)

    crf_extractor = crf_entity_extractor({})
    crf_extractor.train(training_data)

    message = Message(data={TEXT: "I am looking for an italian restaurant"})

    tokenizer.process(message)
    message2 = copy.deepcopy(message)

    processed_message = crf_extractor.process([message])[0]

    loaded_extractor = CRFEntityExtractorGraphComponent.load(
        CRFEntityExtractorGraphComponent.get_default_config(),
        default_model_storage,
        Resource("CRFEntityExtractor"),
        default_execution_context,
    )

    processed_message2 = loaded_extractor.process([message2])[0]

    assert processed_message2.fingerprint() == processed_message.fingerprint()
Example #11
def test_use_shared_vocab_exception(
    initial_train_text: Text,
    additional_train_text: Text,
    use_shared_vocab: bool,
    tmp_path: Path,
):
    """Tests if an exception is raised when `use_shared_vocab` is set to True
    during incremental training."""
    tk = WhitespaceTokenizer()
    initial_cvf = CountVectorsFeaturizer(
        component_config={"use_shared_vocab": use_shared_vocab}
    )
    train_message = Message(data={"text": initial_train_text})
    data = TrainingData([train_message])
    tk.train(data)
    initial_cvf.train(data)

    file_dict = initial_cvf.persist("ftr", tmp_path)
    meta = initial_cvf.component_config.copy()
    meta.update(file_dict)
    new_cvf = CountVectorsFeaturizer.load(meta, tmp_path, should_finetune=True)

    additional_train_message = Message(data={"text": additional_train_text})
    data = TrainingData([train_message, additional_train_message])
    tk.train(data)
    if use_shared_vocab:
        with pytest.raises(Exception) as exec_info:
            new_cvf.train(data)
        assert (
            "Using a shared vocabulary in `CountVectorsFeaturizer` is not supported"
            in str(exec_info.value)
        )
    else:
        new_cvf.train(data)
Example #12
def test_flexible_nlu_pipeline():
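    # Three featurizers annotate the same messages; DIETClassifier then only
    # uses the features whose origin is listed under FEATURIZERS, which
    # determines the expected model data shapes below.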
    message = Message("This is a test message.", data={"intent": "test"})
    training_data = TrainingData([message, message, message, message, message])

    tokenizer = WhitespaceTokenizer()
    tokenizer.train(training_data)

    featurizer = CountVectorsFeaturizer(
        component_config={FEATURIZER_CLASS_ALIAS: "cvf_word"})
    featurizer.train(training_data)

    featurizer = CountVectorsFeaturizer(
        component_config={
            FEATURIZER_CLASS_ALIAS: "cvf_char",
            "min_ngram": 1,
            "max_ngram": 3,
            "analyzer": "char_wb",
        })
    featurizer.train(training_data)

    featurizer = LexicalSyntacticFeaturizer({})
    featurizer.train(training_data)

    assert len(message.features) == 6
    assert message.features[0].origin == "cvf_word"
    assert message.features[0].type == FEATURE_TYPE_SEQUENCE
    assert message.features[1].origin == "cvf_word"
    assert message.features[1].type == FEATURE_TYPE_SENTENCE
    # cvf word is also extracted for the intent
    assert message.features[2].origin == "cvf_word"
    assert message.features[2].type == FEATURE_TYPE_SEQUENCE
    assert message.features[3].origin == "cvf_char"
    assert message.features[3].type == FEATURE_TYPE_SEQUENCE
    assert message.features[4].origin == "cvf_char"
    assert message.features[4].type == FEATURE_TYPE_SENTENCE
    assert message.features[5].origin == "LexicalSyntacticFeaturizer"
    assert message.features[5].type == FEATURE_TYPE_SEQUENCE

    sequence_feature_dim = (message.features[0].features.shape[1] +
                            message.features[5].features.shape[1])
    sentence_feature_dim = message.features[0].features.shape[1]

    classifier = DIETClassifier(component_config={
        FEATURIZERS: ["cvf_word", "LexicalSyntacticFeaturizer"]
    })
    model_data = classifier.preprocess_train_data(training_data)

    assert len(model_data.get(TEXT_SENTENCE_FEATURES)) == 1
    assert len(model_data.get(TEXT_SEQUENCE_FEATURES)) == 1
    assert len(model_data.get(LABEL_SEQUENCE_FEATURES)) == 1
    assert len(model_data.get(LABEL_SENTENCE_FEATURES)) == 0
    assert model_data.get(TEXT_SEQUENCE_FEATURES)[0][0].shape == (
        5,
        sequence_feature_dim,
    )
    assert model_data.get(TEXT_SENTENCE_FEATURES)[0][0].shape == (
        1,
        sentence_feature_dim,
    )
    assert model_data.get(LABEL_SEQUENCE_FEATURES)[0][0].shape == (1, 1)
Example #13
def test_cvf_incremental_train_vocabulary(
    additional_size: Optional[int],
    original_train_text: Text,
    additional_train_text: Text,
    total_vocabulary_size: int,
    remaining_buffer_size: int,
    tmp_path: Path,
):
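    # Buffer slots reserved via `additional_vocabulary_size` must survive
    # persist/load and be consumed during finetuning without shifting the
    # indices of the original vocabulary.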

    tokenizer = WhitespaceTokenizer()
    original_featurizer = CountVectorsFeaturizer(
        {"additional_vocabulary_size": {
            "text": additional_size
        }},
        finetune_mode=False,
    )
    train_message = Message(data={"text": original_train_text})
    data = TrainingData([train_message])

    tokenizer.train(data)
    original_featurizer.train(data)

    # Check total vocabulary size with buffer slots before finetuning
    original_vocabulary = original_featurizer.vectorizers["text"].vocabulary_
    assert len(original_vocabulary) == total_vocabulary_size

    file_dict = original_featurizer.persist("ftr", str(tmp_path))

    # load original_featurizer
    meta = original_featurizer.component_config.copy()
    meta.update(file_dict)
    new_featurizer = CountVectorsFeaturizer.load(meta,
                                                 str(tmp_path),
                                                 should_finetune=True)

    # Check total vocabulary size with buffer slots before finetuning
    assert (
        len(new_featurizer.vectorizers["text"].vocabulary_) == total_vocabulary_size
    )

    additional_train_message = Message(data={"text": additional_train_text})
    data = TrainingData([train_message, additional_train_message])
    tokenizer.train(data)
    new_featurizer.train(data)

    new_vocabulary = new_featurizer.vectorizers["text"].vocabulary_

    # Check total vocabulary size with buffer slots after finetuning
    assert len(new_vocabulary) == total_vocabulary_size

    # Check remaining buffer slots after finetuning
    assert (len(new_vocabulary) -
            new_featurizer._get_starting_empty_index(new_vocabulary) ==
            remaining_buffer_size)

    # Check indices of original vocabulary haven't changed in the new vocabulary
    for vocab_token, vocab_index in original_vocabulary.items():
        if not vocab_token.startswith("buf_"):
            assert vocab_token in new_vocabulary
            assert new_vocabulary.get(vocab_token) == vocab_index
Example #14
def test_count_vector_featurizer_action_attribute_featurization(
    sentence: Text,
    action_name: Text,
    action_text: Text,
    action_name_features: np.ndarray,
    response_features: np.ndarray,
):
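    # ACTION_NAME and ACTION_TEXT are featurized as separate attributes; the
    # second message ensures a vocabulary exists for the action text.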
    ftr = CountVectorsFeaturizer({
        "token_pattern": r"(?u)\b\w+\b",
        "additional_vocabulary_size": {
            "text": 0,
            "response": 0,
            "action_text": 0
        },
    })
    tk = WhitespaceTokenizer()

    train_message = Message(data={TEXT: sentence})
    # this is needed for a valid training example
    train_message.set(ACTION_NAME, action_name)
    train_message.set(ACTION_TEXT, action_text)

    # add a second example that has some response, so that the vocabulary for
    # response exists
    second_message = Message(data={TEXT: "hello"})
    second_message.set(ACTION_TEXT, "hi")
    second_message.set(ACTION_NAME, "greet")

    data = TrainingData([train_message, second_message])

    tk.train(data)
    ftr.train(data)

    action_name_seq_vecs, action_name_sen_vecs = train_message.get_sparse_features(
        ACTION_NAME, [])
    if action_name_seq_vecs:
        action_name_seq_vecs = action_name_seq_vecs.features
    if action_name_sen_vecs:
        action_name_sen_vecs = action_name_sen_vecs.features
    response_seq_vecs, response_sen_vecs = train_message.get_sparse_features(
        ACTION_TEXT, [])
    if response_seq_vecs:
        response_seq_vecs = response_seq_vecs.features
    if response_sen_vecs:
        response_sen_vecs = response_sen_vecs.features

    if action_name_features:
        assert action_name_seq_vecs.toarray()[0] == action_name_features
        assert action_name_sen_vecs is None
    else:
        assert action_name_seq_vecs is None
        assert action_name_sen_vecs is None

    if response_features:
        assert response_seq_vecs.toarray()[0] == response_features
        assert response_sen_vecs is not None
    else:
        assert response_seq_vecs is None
        assert response_sen_vecs is None
Example #15
def test_custom_intent_symbol(text, expected_tokens):
    component_config = {"intent_tokenization_flag": True, "intent_split_symbol": "+"}

    tk = WhitespaceTokenizer(component_config)

    message = Message(text)
    message.set(INTENT, text)

    tk.train(TrainingData([message]))

    assert [t.text for t in message.get(TOKENS_NAMES[INTENT])] == expected_tokens
Example #16
def train_texts(texts: List[Text], model_name: Text,
                model_weights: Text) -> List[Message]:
    config = create_pretrained_transformers_config(model_name, model_weights)
    whitespace_tokenizer = WhitespaceTokenizer()
    transformer = HFTransformersNLP(config)

    messages = [Message.build(text=text) for text in texts]
    td = TrainingData(messages)

    whitespace_tokenizer.train(td)
    transformer.train(td)
    return messages
Example #17
def test_model_data_signature_with_entities(messages: List[Message],
                                            entity_expected: bool):
    classifier = DIETClassifier({"BILOU_flag": False})
    training_data = TrainingData(messages)

    # create tokens for entity parsing inside DIET
    tokenizer = WhitespaceTokenizer()
    tokenizer.train(training_data)

    model_data = classifier.preprocess_train_data(training_data)
    entity_exists = "entities" in model_data.get_signature().keys()
    assert entity_exists == entity_expected
Example #18
def test_check_check_correct_entity_annotations(text: Text, warnings: int):
    reader = MarkdownReader()
    tokenizer = WhitespaceTokenizer()

    training_data = reader.reads(text)
    tokenizer.train(training_data)

    with pytest.warns(UserWarning) as record:
        EntityExtractor.check_correct_entity_annotations(training_data)

    assert len(record) == warnings
    assert all(
        excerpt in record[0].message.args[0]
        for excerpt in ["Misaligned entity annotation in sentence"]
    )
Example #19
def test_apply_bilou_schema():
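    # Entity annotations are converted to BILOU tags per token: U- for
    # single-token entities, B-/L- for the multi-token "European Union".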
    tokenizer = WhitespaceTokenizer()

    message_1 = Message("Germany is part of the European Union")
    message_1.set(
        ENTITIES,
        [
            {"start": 0, "end": 7, "value": "Germany", "entity": "location"},
            {
                "start": 23,
                "end": 37,
                "value": "European Union",
                "entity": "organisation",
            },
        ],
    )

    message_2 = Message("Berlin is the capital of Germany")
    message_2.set(
        ENTITIES,
        [
            {"start": 0, "end": 6, "value": "Berlin", "entity": "location"},
            {"start": 25, "end": 32, "value": "Germany", "entity": "location"},
        ],
    )

    training_data = TrainingData([message_1, message_2])

    tokenizer.train(training_data)

    bilou_utils.apply_bilou_schema(training_data)

    assert message_1.get(BILOU_ENTITIES) == [
        "U-location",
        "O",
        "O",
        "O",
        "O",
        "B-organisation",
        "L-organisation",
        "O",
    ]
    assert message_2.get(BILOU_ENTITIES) == [
        "U-location",
        "O",
        "O",
        "O",
        "O",
        "U-location",
        "O",
    ]
Example #20
def test_count_vector_featurizer_response_attribute_featurization(
        sentence, intent, response, intent_features, response_features):
    ftr = CountVectorsFeaturizer(
        {"additional_vocabulary_size": {
            "text": 0,
            "response": 0
        }})
    tk = WhitespaceTokenizer()

    train_message = Message(data={TEXT: sentence})
    # this is needed for a valid training example
    train_message.set(INTENT, intent)
    train_message.set(RESPONSE, response)

    # add a second example that has some response, so that the vocabulary for
    # response exists
    second_message = Message(data={TEXT: "hello"})
    second_message.set(RESPONSE, "hi")
    second_message.set(INTENT, "greet")

    data = TrainingData([train_message, second_message])

    tk.train(data)
    ftr.train(data)

    intent_seq_vecs, intent_sen_vecs = train_message.get_sparse_features(
        INTENT, [])
    if intent_seq_vecs:
        intent_seq_vecs = intent_seq_vecs.features
    if intent_sen_vecs:
        intent_sen_vecs = intent_sen_vecs.features
    response_seq_vecs, response_sen_vecs = train_message.get_sparse_features(
        RESPONSE, [])
    if response_seq_vecs:
        response_seq_vecs = response_seq_vecs.features
    if response_sen_vecs:
        response_sen_vecs = response_sen_vecs.features

    if intent_features:
        assert intent_seq_vecs.toarray()[0] == intent_features
        assert intent_sen_vecs is None
    else:
        assert intent_seq_vecs is None
        assert intent_sen_vecs is None

    if response_features:
        assert response_seq_vecs.toarray()[0] == response_features
        assert response_sen_vecs is not None
    else:
        assert response_seq_vecs is None
        assert response_sen_vecs is None
Example #21
def test_whitespace_training(supervised_embeddings_config: RasaNLUModelConfig):
    examples = [
        Message(
            data={
                TEXT: "Any Mexican restaurant will do",
                "intent": "restaurant_search",
                "entities": [
                    {"start": 4, "end": 11, "value": "Mexican", "entity": "cuisine"}
                ],
            }
        ),
        Message(
            data={
                TEXT: "I want Tacos!",
                "intent": "restaurant_search",
                "entities": [
                    {"start": 7, "end": 12, "value": "Mexican", "entity": "cuisine"}
                ],
            }
        ),
        Message(data={TEXT: "action_restart", "action_name": "action_restart"}),
        Message(
            data={
                TEXT: "Where are you going?",
                ACTION_NAME: "Where are you going?",
                ACTION_TEXT: "Where are you going?",
            }
        ),
    ]

    component_config = {"case_sensitive": False, "intent_tokenization_flag": True}
    tk = WhitespaceTokenizer(component_config)

    tk.train(TrainingData(training_examples=examples), supervised_embeddings_config)

    assert examples[0].data.get(TOKENS_NAMES[TEXT])[0].text == "Any"
    assert examples[0].data.get(TOKENS_NAMES[TEXT])[1].text == "Mexican"
    assert examples[0].data.get(TOKENS_NAMES[TEXT])[2].text == "restaurant"
    assert examples[0].data.get(TOKENS_NAMES[TEXT])[3].text == "will"
    assert examples[0].data.get(TOKENS_NAMES[TEXT])[4].text == "do"
    assert examples[1].data.get(TOKENS_NAMES[TEXT])[0].text == "I"
    assert examples[1].data.get(TOKENS_NAMES[TEXT])[1].text == "want"
    assert examples[1].data.get(TOKENS_NAMES[TEXT])[2].text == "Tacos"
    assert examples[2].data.get(TOKENS_NAMES[ACTION_NAME])[0].text == "action"
    assert examples[2].data.get(TOKENS_NAMES[ACTION_NAME])[1].text == "restart"
    assert examples[2].data.get(TOKENS_NAMES[TEXT])[0].text == "action_restart"
    assert examples[2].data.get(TOKENS_NAMES[ACTION_TEXT]) is None
    assert examples[3].data.get(TOKENS_NAMES[ACTION_TEXT])[0].text == "Where"
    assert examples[3].data.get(TOKENS_NAMES[ACTION_TEXT])[1].text == "are"
    assert examples[3].data.get(TOKENS_NAMES[ACTION_TEXT])[2].text == "you"
    assert examples[3].data.get(TOKENS_NAMES[ACTION_TEXT])[3].text == "going"
Example #22
def test_train_tokenizer_action_name(text: Text, expected_tokens: List[Text],
                                     expected_indices: List[Tuple[int]]):
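    # The ACTION_NAME attribute should be kept as a single token.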
    tk = WhitespaceTokenizer()

    message = Message.build(text=text)
    message.set(ACTION_NAME, text)

    training_data = TrainingData()
    training_data.training_examples = [message]

    tk.train(training_data)

    # check action_name attribute
    tokens = training_data.training_examples[0].get(TOKENS_NAMES[ACTION_NAME])

    assert [t.text for t in tokens] == [text]
Example #23
def test_count_vector_featurizer_process_by_attribute(
    sentence: Text,
    action_name: Text,
    action_text: Text,
    action_name_features: np.ndarray,
    response_features: np.ndarray,
):
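    # Training builds vocabularies for ACTION_NAME and ACTION_TEXT; a fresh
    # test message is then featurized via process() and only its ACTION_NAME
    # sequence features are checked.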
    ftr = CountVectorsFeaturizer({
        "token_pattern": r"(?u)\b\w+\b",
        "additional_vocabulary_size": {
            "text": 0,
            "response": 0,
            "action_text": 0
        },
    })
    tk = WhitespaceTokenizer()

    # add a second example that has some response, so that the vocabulary for
    # response exists
    train_message = Message(data={TEXT: "hello"})
    train_message.set(ACTION_NAME, "greet")

    train_message1 = Message(data={TEXT: "hello"})
    train_message1.set(ACTION_TEXT, "hi")

    data = TrainingData([train_message, train_message1])

    tk.train(data)
    ftr.train(data)

    test_message = Message(data={TEXT: sentence})
    test_message.set(ACTION_NAME, action_name)
    test_message.set(ACTION_TEXT, action_text)

    for module in [tk, ftr]:
        module.process(test_message)

    action_name_seq_vecs, action_name_sen_vecs = test_message.get_sparse_features(
        ACTION_NAME, [])
    if action_name_seq_vecs:
        action_name_seq_vecs = action_name_seq_vecs.features
    if action_name_sen_vecs:
        action_name_sen_vecs = action_name_sen_vecs.features

    assert action_name_seq_vecs.toarray()[0] == action_name_features
    assert action_name_sen_vecs is None
Example #24
def test_flexible_nlu_pipeline():
    message = Message("This is a test message.", data={"intent": "test"})
    training_data = TrainingData([message, message, message, message, message])

    tokenizer = WhitespaceTokenizer()
    tokenizer.train(training_data)

    featurizer = CountVectorsFeaturizer(
        component_config={FEATURIZER_CLASS_ALIAS: "cvf_word"}
    )
    featurizer.train(training_data)

    featurizer = CountVectorsFeaturizer(
        component_config={
            FEATURIZER_CLASS_ALIAS: "cvf_char",
            "min_ngram": 1,
            "max_ngram": 3,
            "analyzer": "char_wb",
        }
    )
    featurizer.train(training_data)

    featurizer = LexicalSyntacticFeaturizer({})
    featurizer.train(training_data)

    assert len(message.features) == 4
    assert message.features[0].origin == "cvf_word"
    # cvf word is also extracted for the intent
    assert message.features[1].origin == "cvf_word"
    assert message.features[2].origin == "cvf_char"
    assert message.features[3].origin == "LexicalSyntacticFeaturizer"

    feature_dim = (
        message.features[0].features.shape[1] + message.features[3].features.shape[1]
    )

    classifier = DIETClassifier(
        component_config={FEATURIZERS: ["cvf_word", "LexicalSyntacticFeaturizer"]}
    )
    model_data = classifier.preprocess_train_data(training_data)

    assert len(model_data.get("text_features")) == 1
    assert len(model_data.get("label_features")) == 1
    assert model_data.get("text_features")[0][0].shape == (6, feature_dim)
    assert model_data.get("label_features")[0][0].shape == (1, 1)
Example #25
def test_whitespace_training(supervised_embeddings_config):
    examples = [
        Message(
            "Any Mexican restaurant will do",
            {
                "intent":
                "restaurant_search",
                "entities": [{
                    "start": 4,
                    "end": 11,
                    "value": "Mexican",
                    "entity": "cuisine"
                }],
            },
        ),
        Message(
            "I want Tacos!",
            {
                "intent":
                "restaurant_search",
                "entities": [{
                    "start": 7,
                    "end": 12,
                    "value": "Mexican",
                    "entity": "cuisine"
                }],
            },
        ),
    ]

    component_config = {"case_sensitive": False}
    tk = WhitespaceTokenizer(component_config)

    tk.train(TrainingData(training_examples=examples),
             supervised_embeddings_config)

    assert examples[0].data.get("tokens")[0].text == "any"
    assert examples[0].data.get("tokens")[1].text == "mexican"
    assert examples[0].data.get("tokens")[2].text == "restaurant"
    assert examples[0].data.get("tokens")[3].text == "will"
    assert examples[0].data.get("tokens")[4].text == "do"
    assert examples[1].data.get("tokens")[0].text == "i"
    assert examples[1].data.get("tokens")[1].text == "want"
    assert examples[1].data.get("tokens")[2].text == "tacos"
Example #26
def test_lm_featurizer_number_of_sub_tokens(text,
                                            expected_number_of_sub_tokens):
    config = {
        "model_name": "bert",
        "model_weights": "bert-base-uncased",
    }  # Test for one should be enough

    lm_featurizer = LanguageModelFeaturizer(config)
    whitespace_tokenizer = WhitespaceTokenizer()

    message = Message.build(text=text)

    td = TrainingData([message])
    whitespace_tokenizer.train(td)
    lm_featurizer.train(td)

    assert [
        t.get(NUMBER_OF_SUB_TOKENS) for t in message.get(TOKENS_NAMES[TEXT])
    ] == expected_number_of_sub_tokens
Example #27
def test_convert_featurizer_tokens_to_text(sentence: Text, expected_text: Text,
                                           monkeypatch: MonkeyPatch):
    tokenizer = WhitespaceTokenizer()

    monkeypatch.setattr(ConveRTFeaturizer, "_get_validated_model_url",
                        lambda x: RESTRICTED_ACCESS_URL)
    component_config = {
        "name": "ConveRTFeaturizer",
        "model_url": RESTRICTED_ACCESS_URL
    }
    featurizer = ConveRTFeaturizer(component_config)
    message = Message.build(text=sentence)
    td = TrainingData([message])
    tokenizer.train(td)
    tokens = featurizer.tokenize(message, attribute=TEXT)

    actual_text = ConveRTFeaturizer._tokens_to_text([tokens])[0]

    assert expected_text == actual_text
Example #28
def test_count_vector_featurizer_shared_vocab(sentence, intent, response,
                                              text_features, intent_features,
                                              response_features):
    ftr = CountVectorsFeaturizer({
        "use_shared_vocab": True,
        "additional_vocabulary_size": {
            "text": 0,
            "response": 0
        },
    })
    tk = WhitespaceTokenizer()

    train_message = Message(data={TEXT: sentence})
    # this is needed for a valid training example
    train_message.set(INTENT, intent)
    train_message.set(RESPONSE, response)

    data = TrainingData([train_message])
    tk.train(data)
    ftr.train(data)

    seq_vec, sen_vec = train_message.get_sparse_features(TEXT, [])
    if seq_vec:
        seq_vec = seq_vec.features
    if sen_vec:
        sen_vec = sen_vec.features
    assert np.all(seq_vec.toarray()[0] == text_features)
    assert sen_vec is not None
    seq_vec, sen_vec = train_message.get_sparse_features(INTENT, [])
    if seq_vec:
        seq_vec = seq_vec.features
    if sen_vec:
        sen_vec = sen_vec.features
    assert np.all(seq_vec.toarray()[0] == intent_features)
    assert sen_vec is None
    seq_vec, sen_vec = train_message.get_sparse_features(RESPONSE, [])
    if seq_vec:
        seq_vec = seq_vec.features
    if sen_vec:
        sen_vec = sen_vec.features
    assert np.all(seq_vec.toarray()[0] == response_features)
    assert sen_vec is not None
Example #29
def test_cvf_incremental_training(
    initial_train_text: Text,
    additional_train_text: Text,
    initial_vocabulary_size: int,
    final_vocabulary_size: int,
    tmp_path: Path,
):
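    # Finetuning should extend the vocabulary with new tokens while leaving
    # the indices of the originally learned tokens unchanged.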
    tk = WhitespaceTokenizer()
    initial_cvf = CountVectorsFeaturizer()
    train_message = Message(data={"text": initial_train_text})
    data = TrainingData([train_message])

    tk.train(data)
    initial_cvf.train(data)

    # Check initial vocabulary size
    initial_vocab = initial_cvf.vectorizers["text"].vocabulary_
    assert len(initial_vocab) == initial_vocabulary_size

    # persist and load initial cvf
    file_dict = initial_cvf.persist("ftr", tmp_path)
    meta = initial_cvf.component_config.copy()
    meta.update(file_dict)
    new_cvf = CountVectorsFeaturizer.load(meta, tmp_path, should_finetune=True)

    # Check vocabulary size again
    assert len(new_cvf.vectorizers["text"].vocabulary_) == initial_vocabulary_size

    additional_train_message = Message(data={"text": additional_train_text})
    data = TrainingData([train_message, additional_train_message])
    tk.train(data)
    new_cvf.train(data)

    new_vocab = new_cvf.vectorizers["text"].vocabulary_

    # Check vocabulary size after finetuning
    assert len(new_vocab) == final_vocabulary_size

    # Check indices of initial vocabulary haven't changed in the new vocabulary
    for vocab_token, vocab_index in initial_vocab.items():
        assert vocab_token in new_vocab
        assert new_vocab.get(vocab_token) == vocab_index
Example #30
def test_train_tokenizer_e2e_actions(text: Text, expected_tokens: List[Text],
                                     expected_indices: List[Tuple[int]]):
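    # ACTION_TEXT and TEXT should receive identical whitespace tokens with
    # matching start/end offsets.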
    tk = WhitespaceTokenizer()

    message = Message.build(text=text)
    message.set(ACTION_TEXT, text)
    message.set(ACTION_NAME, text)

    training_data = TrainingData()
    training_data.training_examples = [message]

    tk.train(training_data)

    for attribute in [ACTION_TEXT, TEXT]:
        tokens = training_data.training_examples[0].get(
            TOKENS_NAMES[attribute])

        assert [t.text for t in tokens] == expected_tokens
        assert [t.start for t in tokens] == [i[0] for i in expected_indices]
        assert [t.end for t in tokens] == [i[1] for i in expected_indices]