def test_model_data_signature_with_entities(
    messages: List[Message],
    entity_expected: bool,
    create_diet: Callable[..., DIETClassifier],
    whitespace_tokenizer: WhitespaceTokenizer,
):
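    # DIET's model data signature should contain an "entities" key only if the
    # training messages carry entity annotations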
    classifier = create_diet({"BILOU_flag": False})
    training_data = TrainingData(messages)

    # create tokens for entity parsing inside DIET
    whitespace_tokenizer.process_training_data(training_data)

    model_data = classifier.preprocess_train_data(training_data)
    entity_exists = "entities" in model_data.get_signature().keys()
    assert entity_exists == entity_expected

def test_custom_intent_symbol(text, expected_tokens):
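    # intent labels should be split into tokens on the configured "+" symbol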
    component_config = {
        "intent_tokenization_flag": True,
        "intent_split_symbol": "+"
    }

    tk = WhitespaceTokenizer(component_config)

    message = Message.build(text=text)
    message.set(INTENT, text)

    tk.train(TrainingData([message]))

    assert [t.text
            for t in message.get(TOKENS_NAMES[INTENT])] == expected_tokens

Example #3
def test_split_intent_response_key(text, expected_tokens):
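    # intent splitting should also apply to the INTENT_RESPONSE_KEY attribute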
    component_config = {
        "intent_tokenization_flag": True,
        "intent_split_symbol": "+"
    }

    tk = WhitespaceTokenizer(component_config)

    message = Message(text)
    message.set(INTENT_RESPONSE_KEY, text)

    assert [
        t.text
        for t in tk._split_intent(message, attribute=INTENT_RESPONSE_KEY)
    ] == expected_tokens

def test_whitespace_cls_token():
    from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer

    component_config = {"use_cls_token": True}

    tk = WhitespaceTokenizer(component_config)

    assert [t.text for t in tk.tokenize("Forecast for lunch")] == [
        "Forecast",
        "for",
        "lunch",
        CLS_TOKEN,
    ]
    assert [t.offset
            for t in tk.tokenize("Forecast for lunch")] == [0, 9, 13, 19]

Example #5
def test_check_correct_entity_annotations(text: Text, warnings: int):
    reader = MarkdownReader()
    tokenizer = WhitespaceTokenizer()

    training_data = reader.reads(text)
    tokenizer.train(training_data)

    with pytest.warns(UserWarning) as record:
        EntityExtractor.check_correct_entity_annotations(training_data)

    assert len(record) == warnings
    assert all(
        excerpt in record[0].message.args[0]
        for excerpt in ["Misaligned entity annotation in sentence"]
    )

Example #6
def test_convert_tags_to_entities(
    text: Text,
    tags: Dict[Text, List[Text]],
    confidences: Dict[Text, List[float]],
    expected_entities: List[Dict[Text, Any]],
):
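    # predicted tags and confidences are mapped back onto the tokens to build entity dictionaries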
    extractor = EntityExtractor()
    tokenizer = WhitespaceTokenizer()

    message = Message(text)
    tokens = tokenizer.tokenize(message, TEXT)

    actual_entities = extractor.convert_predictions_into_entities(
        text, tokens, tags, confidences)
    assert actual_entities == expected_entities

Example #7
def test_count_vector_featurizer_response_attribute_featurization(
    sentence, intent, response, intent_features, response_features
):
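    # intents should only receive sequence features, responses both sequence and sentence features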
    ftr = CountVectorsFeaturizer()
    tk = WhitespaceTokenizer()

    train_message = Message(data={TEXT: sentence})
    # this is needed for a valid training example
    train_message.set(INTENT, intent)
    train_message.set(RESPONSE, response)

    # add a second example that has some response, so that the vocabulary for
    # response exists
    second_message = Message(data={TEXT: "hello"})
    second_message.set(RESPONSE, "hi")
    second_message.set(INTENT, "greet")

    data = TrainingData([train_message, second_message])

    tk.train(data)
    ftr.train(data)

    intent_seq_vecs, intent_sen_vecs = train_message.get_sparse_features(INTENT, [])
    if intent_seq_vecs:
        intent_seq_vecs = intent_seq_vecs.features
    if intent_sen_vecs:
        intent_sen_vecs = intent_sen_vecs.features
    response_seq_vecs, response_sen_vecs = train_message.get_sparse_features(
        RESPONSE, []
    )
    if response_seq_vecs:
        response_seq_vecs = response_seq_vecs.features
    if response_sen_vecs:
        response_sen_vecs = response_sen_vecs.features

    if intent_features:
        assert intent_seq_vecs.toarray()[0] == intent_features
        assert intent_sen_vecs is None
    else:
        assert intent_seq_vecs is None
        assert intent_sen_vecs is None

    if response_features:
        assert response_seq_vecs.toarray()[0] == response_features
        assert response_sen_vecs is not None
    else:
        assert response_seq_vecs is None
        assert response_sen_vecs is None

Example #8
def test_check_correct_entity_annotations(
    text: Text, warnings: int, whitespace_tokenizer: WhitespaceTokenizer
):
    reader = RasaYAMLReader()

    training_data = reader.reads(text)
    whitespace_tokenizer.process_training_data(training_data)

    with pytest.warns(UserWarning) as record:
        EntityExtractorMixin.check_correct_entity_annotations(training_data)

    assert len(record) == warnings
    assert all(
        excerpt in record[0].message.args[0]
        for excerpt in ["Misaligned entity annotation in sentence"]
    )

def test_elmo_featurizer_tokens_to_text(sentence, expected_text):
    tokens = WhitespaceTokenizer().tokenize(Message(sentence),
                                            attribute=TEXT_ATTRIBUTE)

    actual_text = ElmoFeaturizer._tokens_to_text([tokens])[0]

    assert expected_text == actual_text

def test_elmo_featurizer_train():
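    # dense ELMo features should be set for TEXT and RESPONSE, but not for INTENT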
    featurizer = ElmoFeaturizer.create({}, RasaNLUModelConfig())

    sentence = "Hey how are you today ?"
    message = Message(sentence)
    message.set(RESPONSE_ATTRIBUTE, sentence)
    tokens = WhitespaceTokenizer().tokenize(message, attribute=TEXT_ATTRIBUTE)
    tokens = Tokenizer.add_cls_token(tokens, attribute=TEXT_ATTRIBUTE)
    message.set(TOKENS_NAMES[TEXT_ATTRIBUTE], tokens)
    message.set(TOKENS_NAMES[RESPONSE_ATTRIBUTE], tokens)

    featurizer.train(TrainingData([message]), RasaNLUModelConfig())

    expected = np.array(
        [2.2636216, -0.26475656, -1.1358104, -0.49751878, -1.3946456])
    expected_cls = np.array(
        [1.0251294, -0.04053932, -0.7018805, -0.82054937, -0.75054353])

    vecs = message.get(DENSE_FEATURE_NAMES[TEXT_ATTRIBUTE])

    assert len(tokens) == len(vecs)
    assert np.allclose(vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5)

    vecs = message.get(DENSE_FEATURE_NAMES[RESPONSE_ATTRIBUTE])

    assert len(tokens) == len(vecs)
    assert np.allclose(vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5)

    vecs = message.get(DENSE_FEATURE_NAMES[INTENT_ATTRIBUTE])

    assert vecs is None

def test_count_vectors_featurizer_train():

    featurizer = CountVectorsFeaturizer.create({}, RasaNLUModelConfig())

    sentence = "Hey how are you today ?"
    message = Message(sentence)
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")
    WhitespaceTokenizer().train(TrainingData([message]))

    featurizer.train(TrainingData([message]), RasaNLUModelConfig())

    expected = np.array([0, 1, 0, 0, 0])
    expected_cls = np.array([1, 1, 1, 1, 1])
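    # sequence features have one row per token, sentence features a single row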

    seq_vec, sen_vec = message.get_sparse_features(TEXT, [])

    assert (5, 5) == seq_vec.shape
    assert (1, 5) == sen_vec.shape
    assert np.all(seq_vec.toarray()[0] == expected)
    assert np.all(sen_vec.toarray()[-1] == expected_cls)

    seq_vec, sen_vec = message.get_sparse_features(RESPONSE, [])

    assert (5, 5) == seq_vec.shape
    assert (1, 5) == sen_vec.shape
    assert np.all(seq_vec.toarray()[0] == expected)
    assert np.all(sen_vec.toarray()[-1] == expected_cls)

    seq_vec, sen_vec = message.get_sparse_features(INTENT, [])

    assert sen_vec is None
    assert (1, 1) == seq_vec.shape
    assert np.all(seq_vec.toarray()[0] == np.array([1]))

Example #12
def test_convert_tags_to_entities(
    text: Text,
    tags: Dict[Text, List[Text]],
    confidences: Dict[Text, List[float]],
    expected_entities: List[Dict[Text, Any]],
):
    extractor = EntityExtractor()
    tokenizer = WhitespaceTokenizer()

    message = Message(data={TEXT: text})
    tokens = tokenizer.tokenize(message, TEXT)

    split_entities_config = {SPLIT_ENTITIES_BY_COMMA: True}
    actual_entities = extractor.convert_predictions_into_entities(
        text, tokens, tags, split_entities_config, confidences)
    assert actual_entities == expected_entities

def test_multiword_entities():
    data = """
{
  "rasa_nlu_data": {
    "common_examples" : [
      {
        "text": "show me flights to New York City",
        "intent": "unk",
        "entities": [
          {
            "entity": "destination",
            "start": 19,
            "end": 32,
            "value": "New York City"
          }
        ]
      }
    ]
  }
}"""
    with tempfile.NamedTemporaryFile(suffix="_tmp_training_data.json") as f:
        f.write(data.encode("utf-8"))
        f.flush()
        td = training_data.load_data(f.name)
        assert len(td.entity_examples) == 1
        example = td.entity_examples[0]
        entities = example.get("entities")
        assert len(entities) == 1
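        # "New York City" spans tokens 4-6, so find_entity should return the token range (4, 7)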
        tokens = WhitespaceTokenizer().tokenize(example.text)
        start, end = MitieEntityExtractor.find_entity(entities[0],
                                                      example.text, tokens)
        assert start == 4
        assert end == 7

Example #14
def test_count_vectors_featurizer_train():

    featurizer = CountVectorsFeaturizer.create({}, RasaNLUModelConfig())

    sentence = "Hey how are you today ?"
    message = Message(sentence)
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")
    WhitespaceTokenizer().train(TrainingData([message]))

    featurizer.train(TrainingData([message]), RasaNLUModelConfig())

    expected = np.array([0, 1, 0, 0, 0])
    expected_cls = np.array([1, 1, 1, 1, 1])
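    # the final row of the sparse matrix corresponds to the appended CLS token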

    vecs = message.get(SPARSE_FEATURE_NAMES[TEXT])

    assert (6, 5) == vecs.shape
    assert np.all(vecs.toarray()[0] == expected)
    assert np.all(vecs.toarray()[-1] == expected_cls)

    vecs = message.get(SPARSE_FEATURE_NAMES[RESPONSE])

    assert (6, 5) == vecs.shape
    assert np.all(vecs.toarray()[0] == expected)
    assert np.all(vecs.toarray()[-1] == expected_cls)

    vecs = message.get(SPARSE_FEATURE_NAMES[INTENT])

    assert (1, 1) == vecs.shape
    assert np.all(vecs.toarray()[0] == np.array([1]))

Example #15
def test_repeated_entities(tmp_path):
    data = """
{
  "rasa_nlu_data": {
    "common_examples" : [
      {
        "text": "book a table today from 3 to 6 for 3 people",
        "intent": "unk",
        "entities": [
          {
            "entity": "description",
            "start": 35,
            "end": 36,
            "value": "3"
          }
        ]
      }
    ]
  }
}"""
    f = tmp_path / "tmp_training_data.json"
    f.write_text(data, rasa.shared.utils.io.DEFAULT_ENCODING)
    td = load_data(str(f))
    assert len(td.entity_examples) == 1
    example = td.entity_examples[0]
    entities = example.get("entities")
    assert len(entities) == 1
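    # the value "3" occurs twice; the character offsets (35-36) must resolve to the
    # second occurrence at token index 9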
    tokens = WhitespaceTokenizer().tokenize(example, attribute=TEXT)
    start, end = MitieEntityExtractor.find_entity(entities[0],
                                                  example.get(TEXT), tokens)
    assert start == 9
    assert end == 10

Example #16
def test_train_tokenizer_action_name(text: Text, expected_tokens: List[Text],
                                     expected_indices: List[Tuple[int]]):
    tk = WhitespaceTokenizer()

    message = Message.build(text=text)
    message.set(ACTION_NAME, text)

    training_data = TrainingData()
    training_data.training_examples = [message]

    tk.train(training_data)

    # check action_name attribute
    tokens = training_data.training_examples[0].get(TOKENS_NAMES[ACTION_NAME])

    assert [t.text for t in tokens] == [text]

Example #17
def test_count_vector_featurizer_oov_words(sentence, expected):

    ftr = CountVectorsFeaturizer({
        "OOV_token": "__oov__",
        "OOV_words": ["oov_word0", "OOV_word1"],
        "additional_vocabulary_size": {
            "text": 0
        },
    })
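    # the configured OOV_words should be mapped onto the "__oov__" token in the vocabulary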
    train_message = Message(data={TEXT: sentence})
    WhitespaceTokenizer().process(train_message)

    data = TrainingData([train_message])
    ftr.train(data)

    test_message = Message(data={TEXT: sentence})
    ftr.process(test_message)

    seq_vec, sen_vec = train_message.get_sparse_features(TEXT, [])
    if seq_vec:
        seq_vec = seq_vec.features
    if sen_vec:
        sen_vec = sen_vec.features
    assert np.all(seq_vec.toarray()[0] == expected)
    assert sen_vec is not None

Example #18
def test_multiword_entities(tmp_path):
    data = """
{
  "rasa_nlu_data": {
    "common_examples" : [
      {
        "text": "show me flights to New York City",
        "intent": "unk",
        "entities": [
          {
            "entity": "destination",
            "start": 19,
            "end": 32,
            "value": "New York City"
          }
        ]
      }
    ]
  }
}"""
    f = tmp_path / "tmp_training_data.json"
    f.write_text(data, rasa.shared.utils.io.DEFAULT_ENCODING)
    td = load_data(str(f))
    assert len(td.entity_examples) == 1
    example = td.entity_examples[0]
    entities = example.get("entities")
    assert len(entities) == 1
    tokens = WhitespaceTokenizer().tokenize(example, attribute=TEXT)
    start, end = MitieEntityExtractor.find_entity(entities[0],
                                                  example.get(TEXT), tokens)
    assert start == 4
    assert end == 7

def test_repeated_entities():
    data = """
{
  "rasa_nlu_data": {
    "common_examples" : [
      {
        "text": "book a table today from 3 to 6 for 3 people",
        "intent": "unk",
        "entities": [
          {
            "entity": "description",
            "start": 35,
            "end": 36,
            "value": "3"
          }
        ]
      }
    ]
  }
}"""
    with tempfile.NamedTemporaryFile(suffix="_tmp_training_data.json") as f:
        f.write(data.encode("utf-8"))
        f.flush()
        td = training_data.load_data(f.name)
        assert len(td.entity_examples) == 1
        example = td.entity_examples[0]
        entities = example.get("entities")
        assert len(entities) == 1
        tokens = WhitespaceTokenizer().tokenize(example.text)
        start, end = MitieEntityExtractor.find_entity(entities[0],
                                                      example.text, tokens)
        assert start == 9
        assert end == 10

Example #20
def test_count_vector_featurizer_attribute_featurization(
    sentence: Text,
    intent: Text,
    response: Optional[Text],
    intent_features: List[List[int]],
    response_features: Optional[List[List[int]]],
    create_featurizer: Callable[..., CountVectorsFeaturizer],
    whitespace_tokenizer: WhitespaceTokenizer,
):
    ftr = create_featurizer()

    train_message = Message(data={TEXT: sentence})
    # this is needed for a valid training example
    train_message.set(INTENT, intent)
    train_message.set(RESPONSE, response)

    data = TrainingData([train_message])

    whitespace_tokenizer.process_training_data(data)
    ftr.train(data)
    ftr.process_training_data(data)

    intent_seq_vecs, intent_sen_vecs = train_message.get_sparse_features(INTENT, [])
    if intent_seq_vecs:
        intent_seq_vecs = intent_seq_vecs.features
    if intent_sen_vecs:
        intent_sen_vecs = intent_sen_vecs.features
    response_seq_vecs, response_sen_vecs = train_message.get_sparse_features(
        RESPONSE, []
    )
    if response_seq_vecs:
        response_seq_vecs = response_seq_vecs.features
    if response_sen_vecs:
        response_sen_vecs = response_sen_vecs.features
    if intent_features:
        assert intent_seq_vecs.toarray()[0] == intent_features
        assert intent_sen_vecs is None
    else:
        assert intent_seq_vecs is None
        assert intent_sen_vecs is None

    if response_features:
        assert response_seq_vecs.toarray()[0] == response_features
        assert response_sen_vecs is not None
    else:
        assert response_seq_vecs is None
        assert response_sen_vecs is None

Example #21
def test_whitespace():
    from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
    tk = WhitespaceTokenizer()

    assert ([t.text for t in tk.tokenize("Forecast for lunch")
             ] == ['Forecast', 'for', 'lunch'])

    assert ([t.offset
             for t in tk.tokenize("Forecast for lunch")] == [0, 9, 13])

    # we ignore .,!?
    assert ([t.text for t in tk.tokenize("hey ńöñàśçií how're you?")
             ] == ['hey', 'ńöñàśçií', 'how', 're', 'you'])

    assert ([t.offset for t in tk.tokenize("hey ńöñàśçií how're you?")
             ] == [0, 4, 13, 17, 20])

    assert ([
        t.text for t in tk.tokenize("привет! 10.000, ńöñàśçií. "
                                    "(how're you?)")
    ] == ['привет', '10.000', 'ńöñàśçií', 'how', 're', 'you'])

    assert ([
        t.offset for t in tk.tokenize("привет! 10.000, ńöñàśçií. "
                                      "(how're you?)")
    ] == [0, 8, 16, 27, 31, 34])

    # urls are single token
    assert ([
        t.text for t in tk.tokenize("https://www.google.com/search?client="
                                    "safari&rls=en&q="
                                    "i+like+rasa&ie=UTF-8&oe=UTF-8 "
                                    "https://rasa.com/docs/nlu/"
                                    "components/#tokenizer-whitespace")
    ] == [
        "https://www.google.com/search?"
        "client=safari&rls=en&q=i+like+rasa&ie=UTF-8&oe=UTF-8",
        "https://rasa.com/docs/nlu/components/#tokenizer-whitespace"
    ])

    assert ([
        t.offset for t in tk.tokenize("https://www.google.com/search?client="
                                      "safari&rls=en&q="
                                      "i+like+rasa&ie=UTF-8&oe=UTF-8 "
                                      "https://rasa.com/docs/nlu/"
                                      "components/#tokenizer-whitespace")
    ] == [0, 83])

Example #22
def test_count_vector_featurizer_process_by_attribute(
    sentence: Text,
    action_name: Text,
    action_text: Text,
    action_name_features: np.ndarray,
    response_features: np.ndarray,
):
    ftr = CountVectorsFeaturizer({
        "token_pattern": r"(?u)\b\w+\b",
        "additional_vocabulary_size": {
            "text": 0,
            "response": 0,
            "action_text": 0
        },
    })
    tk = WhitespaceTokenizer()

    # add one example with an action name and one with action text, so that
    # vocabularies for both attributes exist
    train_message = Message(data={TEXT: "hello"})
    train_message.set(ACTION_NAME, "greet")

    train_message1 = Message(data={TEXT: "hello"})
    train_message1.set(ACTION_TEXT, "hi")

    data = TrainingData([train_message, train_message1])

    tk.train(data)
    ftr.train(data)

    test_message = Message(data={TEXT: sentence})
    test_message.set(ACTION_NAME, action_name)
    test_message.set(ACTION_TEXT, action_text)

    for module in [tk, ftr]:
        module.process(test_message)

    action_name_seq_vecs, action_name_sen_vecs = test_message.get_sparse_features(
        ACTION_NAME, [])
    if action_name_seq_vecs:
        action_name_seq_vecs = action_name_seq_vecs.features
    if action_name_sen_vecs:
        action_name_sen_vecs = action_name_sen_vecs.features

    assert action_name_seq_vecs.toarray()[0] == action_name_features
    assert action_name_sen_vecs is None

Example #23
def test_count_vector_featurizer_shared_vocab(
    sentence: Text,
    intent: Text,
    response: Text,
    text_features: List[List[int]],
    intent_features: List[List[int]],
    response_features: List[List[int]],
    create_featurizer: Callable[..., CountVectorsFeaturizer],
    whitespace_tokenizer: WhitespaceTokenizer,
):
    ftr = create_featurizer({
        "use_shared_vocab": True,
    })
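    # with a shared vocabulary, TEXT, INTENT and RESPONSE use the same token-to-index mapping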

    train_message = Message(data={TEXT: sentence})
    # this is needed for a valid training example
    train_message.set(INTENT, intent)
    train_message.set(RESPONSE, response)

    data = TrainingData([train_message])
    whitespace_tokenizer.process_training_data(data)
    ftr.train(data)
    ftr.process_training_data(data)

    seq_vec, sen_vec = train_message.get_sparse_features(TEXT, [])
    if seq_vec:
        seq_vec = seq_vec.features
    if sen_vec:
        sen_vec = sen_vec.features
    assert np.all(seq_vec.toarray()[0] == text_features)
    assert sen_vec is not None
    seq_vec, sen_vec = train_message.get_sparse_features(INTENT, [])
    if seq_vec:
        seq_vec = seq_vec.features
    if sen_vec:
        sen_vec = sen_vec.features
    assert np.all(seq_vec.toarray()[0] == intent_features)
    assert sen_vec is None
    seq_vec, sen_vec = train_message.get_sparse_features(RESPONSE, [])
    if seq_vec:
        seq_vec = seq_vec.features
    if sen_vec:
        sen_vec = sen_vec.features
    assert np.all(seq_vec.toarray()[0] == response_features)
    assert sen_vec is not None

Example #24
def test_flexible_nlu_pipeline():
    message = Message("This is a test message.", data={"intent": "test"})
    training_data = TrainingData([message, message, message, message, message])

    tokenizer = WhitespaceTokenizer()
    tokenizer.train(training_data)

    featurizer = CountVectorsFeaturizer(
        component_config={FEATURIZER_CLASS_ALIAS: "cvf_word"}
    )
    featurizer.train(training_data)

    featurizer = CountVectorsFeaturizer(
        component_config={
            FEATURIZER_CLASS_ALIAS: "cvf_char",
            "min_ngram": 1,
            "max_ngram": 3,
            "analyzer": "char_wb",
        }
    )
    featurizer.train(training_data)

    featurizer = LexicalSyntacticFeaturizer({})
    featurizer.train(training_data)

    assert len(message.features) == 4
    assert message.features[0].origin == "cvf_word"
    # cvf word is also extracted for the intent
    assert message.features[1].origin == "cvf_word"
    assert message.features[2].origin == "cvf_char"
    assert message.features[3].origin == "LexicalSyntacticFeaturizer"

    feature_dim = (
        message.features[0].features.shape[1] + message.features[3].features.shape[1]
    )

    classifier = DIETClassifier(
        component_config={FEATURIZERS: ["cvf_word", "LexicalSyntacticFeaturizer"]}
    )
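    # DIET should only use features coming from the featurizers listed under FEATURIZERS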
    model_data = classifier.preprocess_train_data(training_data)

    assert len(model_data.get("text_features")) == 1
    assert len(model_data.get("label_features")) == 1
    assert model_data.get("text_features")[0][0].shape == (6, feature_dim)
    assert model_data.get("label_features")[0][0].shape == (1, 1)

Example #25
def process_training_text(
    texts: List[Text],
    model_name: Text,
    model_weights: Text,
    create_language_model_featurizer: Callable[[Dict[Text, Any]],
                                               LanguageModelFeaturizer],
    whitespace_tokenizer: WhitespaceTokenizer,
) -> List[Message]:
    """ Creates a featurizer and process training data """
    config = create_pretrained_transformers_config(model_name, model_weights)
    lm_featurizer = create_language_model_featurizer(config)

    messages = [Message.build(text=text) for text in texts]
    td = TrainingData(messages)

    whitespace_tokenizer.process_training_data(td)
    lm_featurizer.process_training_data(td)
    return messages

Example #26
    def __init__(
        self,
        component_config: Optional[Dict[Text, Any]] = None,
        skip_model_load: bool = False,
    ) -> None:
        """Initializes HFTransformsNLP with the models specified."""
        super(HFTransformersNLP, self).__init__(component_config)

        self._load_model_metadata()
        self._load_model_instance(skip_model_load)
        self.whitespace_tokenizer = WhitespaceTokenizer()
        rasa.shared.utils.io.raise_warning(
            f"'{self.__class__.__name__}' is deprecated and "
            f"will be removed in the future. "
            f"It is recommended to use the '{LanguageModelFeaturizer.__name__}' "
            f"instead.",
            category=DeprecationWarning,
        )

Example #27
def test_persist_load_for_finetuning(
    create_featurizer: Callable[..., RegexFeaturizer],
    default_model_storage: ModelStorage,
    default_execution_context: ExecutionContext,
    resource: Resource,
    whitespace_tokenizer: WhitespaceTokenizer,
):
    patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]

    featurizer = create_featurizer()

    sentence = "hey how are you today 19.12.2019 ?"
    message = Message(data={TEXT: sentence})
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")
    training_data = TrainingData([message], regex_features=patterns)
    whitespace_tokenizer.process_training_data(training_data)

    featurizer.train(training_data)

    loaded_featurizer = RegexFeaturizer.load(
        RegexFeaturizer.get_default_config(),
        default_model_storage,
        resource,
        dataclasses.replace(default_execution_context, is_finetuning=True),
    )

    # Test that the component is loaded in finetune mode with the
    # same patterns as before
    assert loaded_featurizer.known_patterns == featurizer.known_patterns
    assert loaded_featurizer.finetune_mode

    new_lookups = [{"name": "plates", "elements": "data/test/lookup_tables/plates.txt"}]

    training_data = TrainingData()
    training_data.lookup_tables = new_lookups
    loaded_featurizer.train(training_data)

    # Test merging of a new pattern to an already trained component.
    assert len(loaded_featurizer.known_patterns) == 4

Example #28
def test_count_vector_featurizer_attribute_featurization(
        sentence, intent, response, intent_features, response_features):
    ftr = CountVectorsFeaturizer(
        {"additional_vocabulary_size": {
            "text": 0,
            "response": 0
        }})
    tk = WhitespaceTokenizer()

    train_message = Message(data={TEXT: sentence})
    # this is needed for a valid training example
    train_message.set(INTENT, intent)
    train_message.set(RESPONSE, response)

    data = TrainingData([train_message])

    tk.train(data)
    ftr.train(data)

    intent_seq_vecs, intent_sen_vecs = train_message.get_sparse_features(
        INTENT, [])
    if intent_seq_vecs:
        intent_seq_vecs = intent_seq_vecs.features
    if intent_sen_vecs:
        intent_sen_vecs = intent_sen_vecs.features
    response_seq_vecs, response_sen_vecs = train_message.get_sparse_features(
        RESPONSE, [])
    if response_seq_vecs:
        response_seq_vecs = response_seq_vecs.features
    if response_sen_vecs:
        response_sen_vecs = response_sen_vecs.features
    if intent_features:
        assert intent_seq_vecs.toarray()[0] == intent_features
        assert intent_sen_vecs is None
    else:
        assert intent_seq_vecs is None
        assert intent_sen_vecs is None

    if response_features:
        assert response_seq_vecs.toarray()[0] == response_features
        assert response_sen_vecs is not None
    else:
        assert response_seq_vecs is None
        assert response_sen_vecs is None

Example #29
def test_whitespace_training(supervised_embeddings_config):
    examples = [
        Message(
            "Any Mexican restaurant will do",
            {
                "intent":
                "restaurant_search",
                "entities": [{
                    "start": 4,
                    "end": 11,
                    "value": "Mexican",
                    "entity": "cuisine"
                }],
            },
        ),
        Message(
            "I want Tacos!",
            {
                "intent":
                "restaurant_search",
                "entities": [{
                    "start": 7,
                    "end": 12,
                    "value": "Mexican",
                    "entity": "cuisine"
                }],
            },
        ),
    ]

    component_config = {"case_sensitive": False}
    tk = WhitespaceTokenizer(component_config)

    tk.train(TrainingData(training_examples=examples),
             supervised_embeddings_config)

    assert examples[0].data.get("tokens")[0].text == "any"
    assert examples[0].data.get("tokens")[1].text == "mexican"
    assert examples[0].data.get("tokens")[2].text == "restaurant"
    assert examples[0].data.get("tokens")[3].text == "will"
    assert examples[0].data.get("tokens")[4].text == "do"
    assert examples[1].data.get("tokens")[0].text == "i"
    assert examples[1].data.get("tokens")[1].text == "want"
    assert examples[1].data.get("tokens")[2].text == "tacos"

def test_persist_load_for_finetuning(tmp_path: Path):
    patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]

    featurizer = RegexFeaturizer.create(
        {"number_additional_patterns": 5}, RasaNLUModelConfig()
    )

    sentence = "hey how are you today 19.12.2019 ?"
    message = Message(data={TEXT: sentence})
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")
    WhitespaceTokenizer().train(TrainingData([message]))

    featurizer.train(
        TrainingData([message], regex_features=patterns), RasaNLUModelConfig()
    )

    persist_value = featurizer.persist("ftr", str(tmp_path))

    # Test all artifacts stored as part of persist
    assert persist_value["file"] == "ftr"
    assert (tmp_path / "ftr.patterns.pkl").exists()
    assert (tmp_path / "ftr.vocabulary_stats.pkl").exists()
    assert featurizer.vocabulary_stats == {
        "max_number_patterns": 8,
        "pattern_slots_filled": 3,
    }

    loaded_featurizer = RegexFeaturizer.load(
        meta={"number_additional_patterns": 5, "file": persist_value["file"],},
        should_finetune=True,
        model_dir=str(tmp_path),
    )

    # Test component loaded in finetune mode and also with
    # same patterns as before and vocabulary statistics
    assert loaded_featurizer.known_patterns == featurizer.known_patterns
    assert loaded_featurizer.finetune_mode
    assert loaded_featurizer.pattern_vocabulary_stats == featurizer.vocabulary_stats

    new_lookups = [{"name": "plates", "elements": "data/test/lookup_tables/plates.txt"}]

    training_data = TrainingData()
    training_data.lookup_tables = new_lookups
    loaded_featurizer.train(training_data)

    # Test merging of a new pattern to an already trained component.
    assert len(loaded_featurizer.known_patterns) == 4
    assert loaded_featurizer.vocabulary_stats == {
        "max_number_patterns": 8,
        "pattern_slots_filled": 4,
    }