Example #1
def test_convert_featurizer_number_of_sub_tokens(
    create_or_load_convert_featurizer: Callable[[Dict[Text, Any]],
                                                ConveRTFeaturizer],
    text: Text,
    expected_number_of_sub_tokens: List[int],
    monkeypatch: MonkeyPatch,
    whitespace_tokenizer: WhitespaceTokenizer,
):

    monkeypatch.setattr(
        ConveRTFeaturizer,
        "_validate_model_url",
        lambda _: None,
    )
    component_config = {
        FEATURIZER_CLASS_ALIAS: "alias",
        "model_url": RESTRICTED_ACCESS_URL,
    }
    featurizer = create_or_load_convert_featurizer(component_config)

    message = Message.build(text=text)
    td = TrainingData([message])
    whitespace_tokenizer.process_training_data(td)

    tokens = featurizer.tokenize(message, attribute=TEXT)

    assert [t.get(NUMBER_OF_SUB_TOKENS)
            for t in tokens] == expected_number_of_sub_tokens
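The ConveRT featurizer tests above receive `text` and `expected_number_of_sub_tokens` from pytest parametrization that is not shown in these excerpts. A minimal, hypothetical sketch of that pattern follows; the concrete texts and counts are illustrative placeholders, not values from the original suite.

import pytest

# Hypothetical parametrization sketch: each case pairs an input text with the
# per-token sub-token counts the ConveRT tokenizer would be expected to produce.
@pytest.mark.parametrize(
    "text, expected_number_of_sub_tokens",
    [
        ("hello", [1]),                 # placeholder expectation
        ("forecast tomorrow", [1, 3]),  # placeholder expectation
    ],
)
def test_sub_token_counts_sketch(text, expected_number_of_sub_tokens):
    # The real test tokenizes `text` with ConveRTFeaturizer and compares the
    # NUMBER_OF_SUB_TOKENS value stored on each token against the expectation;
    # here we only check that one expectation is supplied per whitespace token.
    assert len(text.split()) == len(expected_number_of_sub_tokens)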
Example #2
def test_convert_featurizer_token_edge_cases(
    create_or_load_convert_featurizer: Callable[[Dict[Text, Any]],
                                                ConveRTFeaturizer],
    text: Text,
    expected_tokens: List[Text],
    expected_indices: List[Tuple[int]],
    monkeypatch: MonkeyPatch,
    whitespace_tokenizer: WhitespaceTokenizer,
):

    monkeypatch.setattr(
        ConveRTFeaturizer,
        "_validate_model_url",
        lambda _: None,
    )
    component_config = {
        FEATURIZER_CLASS_ALIAS: "alias",
        "model_url": RESTRICTED_ACCESS_URL,
    }
    featurizer = create_or_load_convert_featurizer(component_config)
    message = Message.build(text=text)
    td = TrainingData([message])
    whitespace_tokenizer.process_training_data(td)
    tokens = featurizer.tokenize(message, attribute=TEXT)

    assert [t.text for t in tokens] == expected_tokens
    assert [t.start for t in tokens] == [i[0] for i in expected_indices]
    assert [t.end for t in tokens] == [i[1] for i in expected_indices]
Example #3
def test_convert_featurizer_train(
    create_or_load_convert_featurizer: Callable[[Dict[Text, Any]],
                                                ConveRTFeaturizer],
    monkeypatch: MonkeyPatch,
    load: bool,
    whitespace_tokenizer: WhitespaceTokenizer,
):

    monkeypatch.setattr(
        ConveRTFeaturizer,
        "_validate_model_url",
        lambda _: None,
    )
    component_config = {
        FEATURIZER_CLASS_ALIAS: "alias",
        "model_url": RESTRICTED_ACCESS_URL,
    }
    featurizer = create_or_load_convert_featurizer(component_config, load=load)

    sentence = "Hey how are you today ?"
    message = Message(data={TEXT: sentence})
    message.set(RESPONSE, sentence)

    td = TrainingData([message])
    whitespace_tokenizer.process_training_data(td)

    tokens = featurizer.tokenize(message, attribute=TEXT)

    message.set(TOKENS_NAMES[TEXT], tokens)
    message.set(TOKENS_NAMES[RESPONSE], tokens)

    featurizer.process_training_data(TrainingData([message]))

    expected = np.array(
        [2.2636216, -0.26475656, -1.1358104, -0.49751878, -1.3946456])
    expected_cls = np.array(
        [1.0251294, -0.04053932, -0.7018805, -0.82054937, -0.75054353])

    seq_vecs, sent_vecs = message.get_dense_features(TEXT, [])

    seq_vecs = seq_vecs.features
    sent_vecs = sent_vecs.features

    assert len(tokens) == len(seq_vecs)
    assert np.allclose(seq_vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(sent_vecs[-1][:5], expected_cls, atol=1e-5)

    seq_vecs, sent_vecs = message.get_dense_features(RESPONSE, [])

    seq_vecs = seq_vecs.features
    sent_vecs = sent_vecs.features

    assert len(tokens) == len(seq_vecs)
    assert np.allclose(seq_vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(sent_vecs[-1][:5], expected_cls, atol=1e-5)

    seq_vecs, sent_vecs = message.get_dense_features(INTENT, [])

    assert seq_vecs is None
    assert sent_vecs is None
Example #4
async def test_train_persist_load_with_composite_entities(
    crf_entity_extractor: Callable[[Dict[Text, Any]], CRFEntityExtractor],
    default_model_storage: ModelStorage,
    default_execution_context: ExecutionContext,
    whitespace_tokenizer: WhitespaceTokenizer,
):
    importer = RasaFileImporter(
        training_data_paths=["data/test/demo-rasa-composite-entities.yml"])
    training_data = importer.get_nlu_data()

    whitespace_tokenizer.process_training_data(training_data)

    crf_extractor = crf_entity_extractor({})
    crf_extractor.train(training_data)

    message = Message(data={TEXT: "I am looking for an italian restaurant"})

    whitespace_tokenizer.process([message])
    message2 = copy.deepcopy(message)

    processed_message = crf_extractor.process([message])[0]

    loaded_extractor = CRFEntityExtractor.load(
        CRFEntityExtractor.get_default_config(),
        default_model_storage,
        Resource("CRFEntityExtractor"),
        default_execution_context,
    )

    processed_message2 = loaded_extractor.process([message2])[0]

    assert processed_message2.fingerprint() == processed_message.fingerprint()
    assert list(loaded_extractor.entity_taggers.keys()) == list(
        crf_extractor.entity_taggers.keys())
Example #5
def test_convert_featurizer_tokens_to_text(
    create_or_load_convert_featurizer: Callable[[Dict[Text, Any]],
                                                ConveRTFeaturizer],
    sentence: Text,
    expected_text: Text,
    monkeypatch: MonkeyPatch,
    whitespace_tokenizer: WhitespaceTokenizer,
):

    monkeypatch.setattr(
        ConveRTFeaturizer,
        "_validate_model_url",
        lambda _: None,
    )
    component_config = {
        FEATURIZER_CLASS_ALIAS: "alias",
        "model_url": RESTRICTED_ACCESS_URL,
    }
    featurizer = create_or_load_convert_featurizer(component_config)
    message = Message.build(text=sentence)
    td = TrainingData([message])
    whitespace_tokenizer.process_training_data(td)
    tokens = featurizer.tokenize(message, attribute=TEXT)

    actual_text = ConveRTFeaturizer._tokens_to_text([tokens])[0]

    assert expected_text == actual_text
Example #6
def test_use_shared_vocab_exception(
    initial_train_text: Text,
    additional_train_text: Text,
    use_shared_vocab: bool,
    create_featurizer: Callable[..., CountVectorsFeaturizer],
    load_featurizer: Callable[..., CountVectorsFeaturizer],
    whitespace_tokenizer: WhitespaceTokenizer,
):
    """Tests if an exception is raised when `use_shared_vocab` is set to True
    during incremental training."""
    config = {"use_shared_vocab": use_shared_vocab}
    initial_cvf = create_featurizer(config)
    train_message = Message(data={"text": initial_train_text})
    data = TrainingData([train_message])
    whitespace_tokenizer.process_training_data(data)
    initial_cvf.train(data)

    new_cvf = load_featurizer(config, is_finetuning=True)

    additional_train_message = Message(data={"text": additional_train_text})
    data = TrainingData([train_message, additional_train_message])
    whitespace_tokenizer.process_training_data(data)
    if use_shared_vocab:
        with pytest.raises(Exception) as exec_info:
            new_cvf.train(data)
        assert (
            "Using a shared vocabulary in `CountVectorsFeaturizer` is not supported"
            in str(exec_info.value)
        )
    else:
        new_cvf.train(data)
Example #7
def test_count_vector_featurizer_action_attribute_featurization(
    sentence: Text,
    action_name: Text,
    action_text: Text,
    action_name_features: np.ndarray,
    response_features: np.ndarray,
    create_featurizer: Callable[..., CountVectorsFeaturizer],
    whitespace_tokenizer: WhitespaceTokenizer,
):
    ftr = create_featurizer({"token_pattern": r"(?u)\b\w+\b"})

    train_message = Message(data={TEXT: sentence})
    # this is needed for a valid training example
    train_message.set(ACTION_NAME, action_name)
    train_message.set(ACTION_TEXT, action_text)

    # add a second example that has some response, so that the vocabulary for
    # response exists
    second_message = Message(data={TEXT: "hello"})
    second_message.set(ACTION_TEXT, "hi")
    second_message.set(ACTION_NAME, "greet")

    data = TrainingData([train_message, second_message])

    whitespace_tokenizer.process_training_data(data)
    ftr.train(data)
    ftr.process_training_data(data)

    action_name_seq_vecs, action_name_sen_vecs = train_message.get_sparse_features(
        ACTION_NAME, []
    )
    if action_name_seq_vecs:
        action_name_seq_vecs = action_name_seq_vecs.features
    if action_name_sen_vecs:
        action_name_sen_vecs = action_name_sen_vecs.features
    response_seq_vecs, response_sen_vecs = train_message.get_sparse_features(
        ACTION_TEXT, []
    )
    if response_seq_vecs:
        response_seq_vecs = response_seq_vecs.features
    if response_sen_vecs:
        response_sen_vecs = response_sen_vecs.features

    if action_name_features:
        assert action_name_seq_vecs.toarray()[0] == action_name_features
        assert action_name_sen_vecs is None
    else:
        assert action_name_seq_vecs is None
        assert action_name_sen_vecs is None

    if response_features:
        assert response_seq_vecs.toarray()[0] == response_features
        assert response_sen_vecs is not None
    else:
        assert response_seq_vecs is None
        assert response_sen_vecs is None
Example #8
def test_regex_featurizer_train(
    create_featurizer: Callable[..., RegexFeaturizer],
    whitespace_tokenizer: WhitespaceTokenizer,
):
    patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]

    featurizer = create_featurizer()
    sentence = "hey how are you today 19.12.2019 ?"
    message = Message(data={TEXT: sentence})
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")

    whitespace_tokenizer.process_training_data(TrainingData([message]))
    training_data = TrainingData([message], regex_features=patterns)

    featurizer.train(training_data)
    featurizer.process_training_data(training_data)

    expected = np.array([0, 1, 0])
    expected_cls = np.array([1, 1, 1])

    seq_vecs, sen_vec = message.get_sparse_features(TEXT, [])
    if seq_vecs:
        seq_vecs = seq_vecs.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert (6, 3) == seq_vecs.shape
    assert (1, 3) == sen_vec.shape
    assert np.all(seq_vecs.toarray()[0] == expected)
    assert np.all(sen_vec.toarray()[-1] == expected_cls)

    seq_vecs, sen_vec = message.get_sparse_features(RESPONSE, [])
    if seq_vecs:
        seq_vecs = seq_vecs.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert (6, 3) == seq_vecs.shape
    assert (1, 3) == sen_vec.shape
    assert np.all(seq_vecs.toarray()[0] == expected)
    assert np.all(sen_vec.toarray()[-1] == expected_cls)

    seq_vecs, sen_vec = message.get_sparse_features(INTENT, [])
    if seq_vecs:
        seq_vecs = seq_vecs.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert seq_vecs is None
    assert sen_vec is None
Example #9
def test_count_vector_featurizer_response_attribute_featurization(
    sentence: Text,
    intent: Text,
    response: Optional[Text],
    intent_features: List[List[int]],
    response_features: Optional[List[List[int]]],
    create_featurizer: Callable[..., CountVectorsFeaturizer],
    whitespace_tokenizer: WhitespaceTokenizer,
):
    ftr = create_featurizer()

    train_message = Message(data={TEXT: sentence})
    # this is needed for a valid training example
    train_message.set(INTENT, intent)
    train_message.set(RESPONSE, response)

    # add a second example that has some response, so that the vocabulary for
    # response exists
    second_message = Message(data={TEXT: "hello"})
    second_message.set(RESPONSE, "hi")
    second_message.set(INTENT, "greet")

    data = TrainingData([train_message, second_message])

    whitespace_tokenizer.process_training_data(data)
    ftr.train(data)
    ftr.process_training_data(data)

    intent_seq_vecs, intent_sen_vecs = train_message.get_sparse_features(INTENT, [])
    if intent_seq_vecs:
        intent_seq_vecs = intent_seq_vecs.features
    if intent_sen_vecs:
        intent_sen_vecs = intent_sen_vecs.features
    response_seq_vecs, response_sen_vecs = train_message.get_sparse_features(
        RESPONSE, []
    )
    if response_seq_vecs:
        response_seq_vecs = response_seq_vecs.features
    if response_sen_vecs:
        response_sen_vecs = response_sen_vecs.features

    if intent_features:
        assert intent_seq_vecs.toarray()[0] == intent_features
        assert intent_sen_vecs is None
    else:
        assert intent_seq_vecs is None
        assert intent_sen_vecs is None

    if response_features:
        assert response_seq_vecs.toarray()[0] == response_features
        assert response_sen_vecs is not None
    else:
        assert response_seq_vecs is None
        assert response_sen_vecs is None
Example #10
def test_training_data_fingerprint_incorporates_tokens(
    whitespace_tokenizer: WhitespaceTokenizer,
):
    from rasa.shared.importers.utils import training_data_from_paths

    files = [
        "data/examples/rasa/demo-rasa.yml",
        "data/examples/rasa/demo-rasa-responses.yml",
    ]
    training_data = training_data_from_paths(files, language="en")
    fp1 = training_data.fingerprint()
    whitespace_tokenizer.process_training_data(training_data)
    # training data fingerprint has changed
    assert fp1 != training_data.fingerprint()
Example #11
def test_check_correct_entity_annotations(
        text: Text, warnings: int, whitespace_tokenizer: WhitespaceTokenizer):
    reader = RasaYAMLReader()

    training_data = reader.reads(text)
    whitespace_tokenizer.process_training_data(training_data)

    with pytest.warns(UserWarning) as record:
        EntityExtractorMixin.check_correct_entity_annotations(training_data)

    assert len(record) == warnings
    assert all(
        excerpt in record[0].message.args[0]
        for excerpt in ["Misaligned entity annotation in sentence"]
    )
Example #12
def test_count_vectors_featurizer_train(
    create_featurizer: Callable[..., CountVectorsFeaturizer],
    whitespace_tokenizer: WhitespaceTokenizer,
):
    featurizer = create_featurizer()

    sentence = "Hey how are you today ?"
    message = Message(data={TEXT: sentence})
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")
    whitespace_tokenizer.process_training_data(TrainingData([message]))

    data = TrainingData([message])
    featurizer.train(data)
    featurizer.process_training_data(data)

    expected = np.array([0, 1, 0, 0, 0])
    expected_cls = np.array([1, 1, 1, 1, 1])

    seq_vec, sen_vec = message.get_sparse_features(TEXT, [])
    if seq_vec:
        seq_vec = seq_vec.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert (5, 5) == seq_vec.shape
    assert (1, 5) == sen_vec.shape
    assert np.all(seq_vec.toarray()[0] == expected)
    assert np.all(sen_vec.toarray()[-1] == expected_cls)

    seq_vec, sen_vec = message.get_sparse_features(RESPONSE, [])
    if seq_vec:
        seq_vec = seq_vec.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert (5, 5) == seq_vec.shape
    assert (1, 5) == sen_vec.shape
    assert np.all(seq_vec.toarray()[0] == expected)
    assert np.all(sen_vec.toarray()[-1] == expected_cls)

    seq_vec, sen_vec = message.get_sparse_features(INTENT, [])
    if seq_vec:
        seq_vec = seq_vec.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert sen_vec is None
    assert (1, 1) == seq_vec.shape
    assert np.all(seq_vec.toarray()[0] == np.array([1]))
Example #13
def test_model_data_signature_with_entities(
    messages: List[Message],
    entity_expected: bool,
    create_diet: Callable[..., DIETClassifier],
    whitespace_tokenizer: WhitespaceTokenizer,
):
    classifier = create_diet({"BILOU_flag": False})
    training_data = TrainingData(messages)

    # create tokens for entity parsing inside DIET
    whitespace_tokenizer.process_training_data(training_data)

    model_data = classifier.preprocess_train_data(training_data)
    entity_exists = "entities" in model_data.get_signature().keys()
    assert entity_exists == entity_expected
Example #14
def test_count_vector_featurizer_shared_vocab(
    sentence: Text,
    intent: Text,
    response: Text,
    text_features: List[List[int]],
    intent_features: List[List[int]],
    response_features: List[List[int]],
    create_featurizer: Callable[..., CountVectorsFeaturizer],
    whitespace_tokenizer: WhitespaceTokenizer,
):
    ftr = create_featurizer({
        "use_shared_vocab": True,
    })

    train_message = Message(data={TEXT: sentence})
    # this is needed for a valid training example
    train_message.set(INTENT, intent)
    train_message.set(RESPONSE, response)

    data = TrainingData([train_message])
    whitespace_tokenizer.process_training_data(data)
    ftr.train(data)
    ftr.process_training_data(data)

    seq_vec, sen_vec = train_message.get_sparse_features(TEXT, [])
    if seq_vec:
        seq_vec = seq_vec.features
    if sen_vec:
        sen_vec = sen_vec.features
    assert np.all(seq_vec.toarray()[0] == text_features)
    assert sen_vec is not None
    seq_vec, sen_vec = train_message.get_sparse_features(INTENT, [])
    if seq_vec:
        seq_vec = seq_vec.features
    if sen_vec:
        sen_vec = sen_vec.features
    assert np.all(seq_vec.toarray()[0] == intent_features)
    assert sen_vec is None
    seq_vec, sen_vec = train_message.get_sparse_features(RESPONSE, [])
    if seq_vec:
        seq_vec = seq_vec.features
    if sen_vec:
        sen_vec = sen_vec.features
    assert np.all(seq_vec.toarray()[0] == response_features)
    assert sen_vec is not None
Example #15
def process_training_text(
    texts: List[Text],
    model_name: Text,
    model_weights: Text,
    create_language_model_featurizer: Callable[[Dict[Text, Any]],
                                               LanguageModelFeaturizer],
    whitespace_tokenizer: WhitespaceTokenizer,
) -> List[Message]:
    """ Creates a featurizer and process training data """
    config = create_pretrained_transformers_config(model_name, model_weights)
    lm_featurizer = create_language_model_featurizer(config)

    messages = [Message.build(text=text) for text in texts]
    td = TrainingData(messages)

    whitespace_tokenizer.process_training_data(td)
    lm_featurizer.process_training_data(td)
    return messages
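A hedged usage sketch for the helper above: a test in the same module could call `process_training_text` and then check that the LanguageModelFeaturizer attached dense features to each message. The test name and the `model_name`/`model_weights` values below are assumptions for illustration, not taken from the original suite.

def test_lm_featurizer_adds_features_sketch(
    create_language_model_featurizer, whitespace_tokenizer
):
    # Assumed model configuration; any supported transformer weights would do.
    messages = process_training_text(
        texts=["hello there", "how are you today"],
        model_name="bert",
        model_weights="bert-base-uncased",
        create_language_model_featurizer=create_language_model_featurizer,
        whitespace_tokenizer=whitespace_tokenizer,
    )
    for message in messages:
        # As in Example #18, at least sequence- and sentence-level features
        # should be present on each processed message.
        assert len(message.features) >= 2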
Example #16
def test_persist_load_for_finetuning(
    create_featurizer: Callable[..., RegexFeaturizer],
    default_model_storage: ModelStorage,
    default_execution_context: ExecutionContext,
    resource: Resource,
    whitespace_tokenizer: WhitespaceTokenizer,
):
    patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]

    featurizer = create_featurizer()

    sentence = "hey how are you today 19.12.2019 ?"
    message = Message(data={TEXT: sentence})
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")
    training_data = TrainingData([message], regex_features=patterns)
    whitespace_tokenizer.process_training_data(training_data)

    featurizer.train(training_data)

    loaded_featurizer = RegexFeaturizer.load(
        RegexFeaturizer.get_default_config(),
        default_model_storage,
        resource,
        dataclasses.replace(default_execution_context, is_finetuning=True),
    )

    # Test component loaded in finetune mode and also with
    # same patterns as before and vocabulary statistics
    assert loaded_featurizer.known_patterns == featurizer.known_patterns
    assert loaded_featurizer.finetune_mode

    new_lookups = [{"name": "plates", "elements": "data/test/lookup_tables/plates.txt"}]

    training_data = TrainingData()
    training_data.lookup_tables = new_lookups
    loaded_featurizer.train(training_data)

    # Test merging of a new pattern to an already trained component.
    assert len(loaded_featurizer.known_patterns) == 4
Example #17
def test_cvf_incremental_training(
    initial_train_text: Text,
    additional_train_text: Text,
    initial_vocabulary_size: int,
    final_vocabulary_size: int,
    create_featurizer: Callable[..., CountVectorsFeaturizer],
    load_featurizer: Callable[..., CountVectorsFeaturizer],
    whitespace_tokenizer: WhitespaceTokenizer,
):
    initial_cvf = create_featurizer()
    train_message = Message(data={"text": initial_train_text})
    data = TrainingData([train_message])

    whitespace_tokenizer.process_training_data(data)
    initial_cvf.train(data)

    # Check initial vocabulary size
    initial_vocab = initial_cvf.vectorizers["text"].vocabulary_
    assert len(initial_vocab) == initial_vocabulary_size

    # persist and load initial cvf
    new_cvf = load_featurizer(is_finetuning=True)

    # Check vocabulary size again
    assert len(
        new_cvf.vectorizers["text"].vocabulary_) == initial_vocabulary_size

    additional_train_message = Message(data={"text": additional_train_text})
    data = TrainingData([train_message, additional_train_message])
    whitespace_tokenizer.process_training_data(data)
    new_cvf.train(data)

    new_vocab = new_cvf.vectorizers["text"].vocabulary_

    # Check vocabulary size after finetuning
    assert len(new_vocab) == final_vocabulary_size

    # Check indices of initial vocabulary haven't changed in the new vocabulary
    for vocab_token, vocab_index in initial_vocab.items():
        assert vocab_token in new_vocab
        assert new_vocab.get(vocab_token) == vocab_index
Example #18
def test_log_longer_sequence(
    sequence_length: int,
    model_name: Text,
    model_weights: Text,
    should_overflow: bool,
    caplog: LogCaptureFixture,
    create_language_model_featurizer: Callable[[Dict[Text, Any]],
                                               LanguageModelFeaturizer],
    whitespace_tokenizer: WhitespaceTokenizer,
):
    config = {"model_name": model_name, "model_weights": model_weights}

    featurizer = create_language_model_featurizer(config)

    text = " ".join(["hi"] * sequence_length)
    message = Message.build(text=text)
    td = TrainingData([message])
    whitespace_tokenizer.process_training_data(td)
    caplog.set_level(logging.DEBUG)
    featurizer.process([message])
    if should_overflow:
        assert "hi hi hi" in caplog.text
    assert len(message.features) >= 2
Example #19
def test_vocabulary_expand_for_finetuning(
    create_featurizer: Callable[..., RegexFeaturizer],
    default_model_storage: ModelStorage,
    resource: Resource,
    default_execution_context: ExecutionContext,
    whitespace_tokenizer: WhitespaceTokenizer,
):
    patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
    ]

    featurizer = create_featurizer()

    sentence = "hey hey 2020"
    message = Message(data={TEXT: sentence})
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")
    training_data = TrainingData([message], regex_features=patterns)

    whitespace_tokenizer.process_training_data(training_data)

    featurizer.train(training_data)
    featurizer.process_training_data(training_data)

    # Test featurization of message
    expected = np.array([1, 0])
    expected_cls = np.array([1, 1])
    seq_vecs, sen_vec = message.get_sparse_features(TEXT, [])
    if seq_vecs:
        seq_vecs = seq_vecs.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert (3, 2) == seq_vecs.shape
    assert (1, 2) == sen_vec.shape
    assert np.all(seq_vecs.toarray()[0] == expected)
    assert np.all(sen_vec.toarray()[-1] == expected_cls)

    loaded_featurizer = RegexFeaturizer.load(
        RegexFeaturizer.get_default_config(),
        default_model_storage,
        resource,
        dataclasses.replace(default_execution_context, is_finetuning=True),
    )

    new_patterns = [
        {"pattern": "\\btoday*", "name": "day", "usage": "intent"},
        {"pattern": "\\bhey+", "name": "hello", "usage": "intent"},
    ]
    new_sentence = "hey today"
    message = Message(data={TEXT: new_sentence})
    message.set(RESPONSE, new_sentence)
    message.set(INTENT, "intent")
    new_training_data = TrainingData([message], regex_features=patterns + new_patterns)

    whitespace_tokenizer.process_training_data(new_training_data)

    loaded_featurizer.train(new_training_data)
    loaded_featurizer.process_training_data(new_training_data)

    # Test featurization of message, this time for the extra pattern as well.
    expected_token_1 = np.array([1, 0, 0])
    expected_token_2 = np.array([0, 0, 1])
    expected_cls = np.array([1, 0, 1])

    seq_vecs, sen_vec = message.get_sparse_features(TEXT, [])
    if seq_vecs:
        seq_vecs = seq_vecs.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert (2, 3) == seq_vecs.shape
    assert (1, 3) == sen_vec.shape
    assert np.all(seq_vecs.toarray()[0] == expected_token_1)
    assert np.all(seq_vecs.toarray()[1] == expected_token_2)
    assert np.all(sen_vec.toarray()[-1] == expected_cls)

    # let's check if the order of patterns is preserved
    for old_index, pattern in enumerate(featurizer.known_patterns):
        assert pattern["name"] == loaded_featurizer.known_patterns[old_index]["name"]

    # we also modified a pattern, check if that is correctly modified
    pattern_to_check = [
        pattern
        for pattern in loaded_featurizer.known_patterns
        if pattern["name"] == "hello"
    ]
    assert pattern_to_check == [new_patterns[1]]
Example #20
def test_apply_bilou_schema(whitespace_tokenizer: WhitespaceTokenizer):

    message_1 = Message.build(text="Germany is part of the European Union",
                              intent="inform")
    message_1.set(
        ENTITIES,
        [
            {
                "start": 0,
                "end": 7,
                "value": "Germany",
                "entity": "location"
            },
            {
                "start": 23,
                "end": 37,
                "value": "European Union",
                "entity": "organisation",
            },
        ],
    )

    message_2 = Message.build(text="Berlin is the capital of Germany",
                              intent="inform")
    message_2.set(
        ENTITIES,
        [
            {
                "start": 0,
                "end": 6,
                "value": "Berlin",
                "entity": "location"
            },
            {
                "start": 25,
                "end": 32,
                "value": "Germany",
                "entity": "location"
            },
        ],
    )

    training_data = TrainingData([message_1, message_2])

    whitespace_tokenizer.process_training_data(training_data)

    bilou_utils.apply_bilou_schema(training_data)

    assert message_1.get(BILOU_ENTITIES) == [
        "U-location",
        "O",
        "O",
        "O",
        "O",
        "B-organisation",
        "L-organisation",
    ]
    assert message_2.get(BILOU_ENTITIES) == [
        "U-location",
        "O",
        "O",
        "O",
        "O",
        "U-location",
    ]