def test_process_does_not_overwrite_any_entities(
    create_or_load_extractor: Callable[..., RegexEntityExtractorGraphComponent],
):

    pre_existing_entity = {
        ENTITY_ATTRIBUTE_TYPE: "person",
        ENTITY_ATTRIBUTE_VALUE: "Max",
        ENTITY_ATTRIBUTE_START: 0,
        ENTITY_ATTRIBUTE_END: 3,
        EXTRACTOR: "other extractor",
    }
    message = Message(data={TEXT: "Max lives in Berlin.", INTENT: "infrom"})
    message.set(ENTITIES, [copy.deepcopy(pre_existing_entity)])

    training_data = TrainingData()
    training_data.training_examples = [
        Message(
            data={
                TEXT: "Hi Max!",
                INTENT: "greet",
                ENTITIES: [
                    {ENTITY_ATTRIBUTE_TYPE: "person", ENTITY_ATTRIBUTE_VALUE: "Max"}
                ],
            }
        ),
        Message(
            data={
                TEXT: "I live in Berlin",
                INTENT: "inform",
                ENTITIES: [
                    {ENTITY_ATTRIBUTE_TYPE: "city", ENTITY_ATTRIBUTE_VALUE: "Berlin"}
                ],
            }
        ),
    ]
    training_data.lookup_tables = [
        {"name": "city", "elements": ["London", "Berlin", "Amsterdam"]}
    ]

    entity_extractor = create_or_load_extractor(config={})
    entity_extractor.train(training_data)
    entity_extractor.process([message])

    entities = message.get(ENTITIES)
    assert entities == [
        pre_existing_entity,
        {
            ENTITY_ATTRIBUTE_TYPE: "city",
            ENTITY_ATTRIBUTE_VALUE: "Berlin",
            ENTITY_ATTRIBUTE_START: 13,
            ENTITY_ATTRIBUTE_END: 19,
            EXTRACTOR: RegexEntityExtractorGraphComponent.__name__,
        },
    ]
def test_process(
    text: Text,
    lookup: List[Dict[Text, List[Text]]],
    expected_entities: List[Dict[Text, Any]],
):
    message = Message(data={TEXT: text})

    training_data = TrainingData()
    training_data.lookup_tables = lookup
    training_data.training_examples = [
        Message(data={
            TEXT: "Hi Max!",
            "entities": [{
                "entity": "person",
                "value": "Max"
            }]
        }),
        Message(
            data={
                TEXT: "I live in Berlin",
                "entities": [{
                    "entity": "city",
                    "value": "Berlin"
                }],
            }),
    ]

    entity_extractor = RegexEntityExtractor()
    entity_extractor.train(training_data)
    entity_extractor.process(message)

    entities = message.get(ENTITIES)
    assert entities == expected_entities
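# Note: the pytest parametrization for test_process is not included in this
# snippet. Purely as an illustration (hypothetical values, not taken from the
# original file), one case supplying text, lookup and expected_entities could
# look like this:
#
# @pytest.mark.parametrize(
#     "text, lookup, expected_entities",
#     [
#         (
#             "Berlin and London are cities.",
#             [{"name": "city", "elements": ["Berlin", "London"]}],
#             [
#                 {"entity": "city", "value": "Berlin", "start": 0, "end": 6,
#                  "extractor": "RegexEntityExtractor"},
#                 {"entity": "city", "value": "London", "start": 11, "end": 17,
#                  "extractor": "RegexEntityExtractor"},
#             ],
#         ),
#     ],
# )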
def test_lookup_tables_without_use_word_boundaries(sentence, tokens, expected,
                                                   labeled_tokens):
    from rasa.nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer
    from rasa.nlu.tokenizers.tokenizer import Token

    lookups = [
        {
            "name": "cites",
            "elements": ["北京", "上海", "广州", "深圳", "杭州"],
        },
        {
            "name": "dates",
            "elements": ["昨天", "今天", "明天", "后天"],
        },
    ]
    ftr = RegexFeaturizer({"use_word_boundaries": False})
    training_data = TrainingData()
    training_data.lookup_tables = lookups
    ftr.train(training_data)

    # adds tokens to the message
    message = Message(data={TEXT: sentence})
    message.set(TOKENS_NAMES[TEXT],
                [Token(word, start) for (word, start) in tokens])

    sequence_features, sentence_features = ftr._features_for_patterns(
        message, TEXT)
    assert np.allclose(sequence_features.toarray(), expected[:-1], atol=1e-10)
    assert np.allclose(sentence_features.toarray(), expected[-1], atol=1e-10)

    # the number of regex matches on each token should match
    for i, token in enumerate(message.get(TOKENS_NAMES[TEXT])):
        token_matches = token.get("pattern").values()
        num_matches = sum(token_matches)
        assert num_matches == labeled_tokens.count(i)
def test_lowercase(
    text: Text,
    case_sensitive: bool,
    lookup: List[Dict[Text, List[Text]]],
    expected_entities: List[Dict[Text, Any]],
):
    message = Message(data={TEXT: text})
    training_data = TrainingData()
    training_data.lookup_tables = lookup
    training_data.training_examples = [
        Message(
            data={
                TEXT: "Hi Max!",
                INTENT: "greet",
                ENTITIES: [{"entity": "person", "value": "Max"}],
            }
        ),
        Message(
            data={
                TEXT: "I live in Berlin",
                INTENT: "inform",
                ENTITIES: [{"entity": "city", "value": "Berlin"}],
            }
        ),
    ]

    entity_extractor = RegexEntityExtractor({"case_sensitive": case_sensitive})
    entity_extractor.train(training_data)
    entity_extractor.process(message)

    entities = message.get(ENTITIES)
    assert entities == expected_entities
def test_persist_load_for_finetuning(
    create_featurizer: Callable[..., RegexFeaturizerGraphComponent],
    default_model_storage: ModelStorage,
    default_execution_context: ExecutionContext,
    resource: Resource,
    whitespace_tokenizer: WhitespaceTokenizerGraphComponent,
):
    patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]

    featurizer = create_featurizer()

    sentence = "hey how are you today 19.12.2019 ?"
    message = Message(data={TEXT: sentence})
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")
    training_data = TrainingData([message], regex_features=patterns)
    whitespace_tokenizer.process_training_data(training_data)

    featurizer.train(training_data)

    loaded_featurizer = RegexFeaturizerGraphComponent.load(
        RegexFeaturizerGraphComponent.get_default_config(),
        default_model_storage,
        resource,
        dataclasses.replace(default_execution_context, is_finetuning=True),
    )

    # Test component loaded in finetune mode and also with
    # same patterns as before and vocabulary statistics
    assert loaded_featurizer.known_patterns == featurizer.known_patterns
    assert loaded_featurizer.finetune_mode

    new_lookups = [{
        "name": "plates",
        "elements": "data/test/lookup_tables/plates.txt"
    }]
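    # Note: when "elements" is a string rather than a list (as with
    # "data/test/lookup_tables/plates.txt" above), it refers to a lookup file
    # on disk, typically containing one phrase per line.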

    training_data = TrainingData()
    training_data.lookup_tables = new_lookups
    loaded_featurizer.train(training_data)

    # Test merging of a new pattern to an already trained component.
    assert len(loaded_featurizer.known_patterns) == 4
def test_lookup_with_and_without_boundaries(
    sentence: Text,
    expected_sequence_features: List[List[float]],
    expected_sentence_features: List[float],
    labeled_tokens: List[float],
    use_word_boundaries: bool,
    spacy_nlp: Any,
):
    ftr = RegexFeaturizer({
        "use_word_boundaries": use_word_boundaries,
        "number_additional_patterns": 0
    })
    training_data = TrainingData()

    # we use lookups because the "use_word_boundaries" flag is only used when
    # producing patterns from lookup tables
    lookups = [{"name": "how", "elements": ["how"]}]
    training_data.lookup_tables = lookups
    ftr.train(training_data)

    # adds tokens to the message
    tokenizer = SpacyTokenizer()
    message = Message(data={TEXT: sentence})
    message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))
    tokenizer.process(message)

    (sequence_features,
     sentence_features) = ftr._features_for_patterns(message, TEXT)

    sequence_features = sequence_features.toarray()
    sentence_features = sentence_features.toarray()
    num_of_patterns = sum([len(lookup["elements"]) for lookup in lookups])
    assert sequence_features.shape == (
        len(message.get(TOKENS_NAMES[TEXT])),
        num_of_patterns,
    )
    num_of_lookup_tables = len(lookups)
    assert sentence_features.shape == (num_of_lookup_tables, num_of_patterns)

    # sequence_features should be {0,1} for each token: 1 if match, 0 if not
    assert np.allclose(sequence_features,
                       expected_sequence_features,
                       atol=1e-10)
    # sentence_features should be {0,1} for each lookup table: 1 if sentence
    # contains match from that table, 0 if not
    assert np.allclose(sentence_features,
                       expected_sentence_features,
                       atol=1e-10)

    # the tokenizer should have added tokens
    assert len(message.get(TOKENS_NAMES[TEXT], [])) > 0

    # the number of regex matches on each token should match
    for i, token in enumerate(message.get(TOKENS_NAMES[TEXT])):
        token_matches = token.get("pattern").values()
        num_matches = sum(token_matches)
        # labeled_tokens should list the token(s) which match a pattern
        assert num_matches == labeled_tokens.count(i)
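# Illustration (not part of the original tests, added for clarity): the
# "use_word_boundaries" flag roughly corresponds to wrapping each lookup
# element in \b anchors before compiling it, so without boundaries "how"
# also matches inside words such as "somehow".
def test_word_boundary_illustration():
    import re

    assert re.search(r"how", "somehow are you")              # matches inside "somehow"
    assert re.search(r"\bhow\b", "somehow are you") is None  # boundaries: no standalone "how"
    assert re.search(r"\bhow\b", "how are you")              # boundaries: standalone "how" matches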
def test_do_not_overwrite_any_entities():
    message = Message(data={TEXT: "Max lives in Berlin.", INTENT: "infrom"})
    message.set(ENTITIES, [{
        "entity": "person",
        "value": "Max",
        "start": 0,
        "end": 3
    }])

    training_data = TrainingData()
    training_data.training_examples = [
        Message(
            data={
                TEXT: "Hi Max!",
                INTENT: "greet",
                ENTITIES: [{
                    "entity": "person",
                    "value": "Max"
                }],
            }),
        Message(
            data={
                TEXT: "I live in Berlin",
                INTENT: "inform",
                ENTITIES: [{
                    "entity": "city",
                    "value": "Berlin"
                }],
            }),
    ]
    training_data.lookup_tables = [
        {"name": "city", "elements": ["London", "Berlin", "Amsterdam"]}
    ]

    entity_extractor = RegexEntityExtractor()
    entity_extractor.train(training_data)
    entity_extractor.process(message)

    entities = message.get(ENTITIES)
    assert entities == [
        {
            "entity": "person",
            "value": "Max",
            "start": 0,
            "end": 3
        },
        {
            "entity": "city",
            "value": "Berlin",
            "start": 13,
            "end": 19,
            "extractor": "RegexEntityExtractor",
        },
    ]
def test_persist_load_for_finetuning(tmp_path: Path):
    patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]

    featurizer = RegexFeaturizer.create(
        {"number_additional_patterns": 5}, RasaNLUModelConfig()
    )

    sentence = "hey how are you today 19.12.2019 ?"
    message = Message(data={TEXT: sentence})
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")
    WhitespaceTokenizer().train(TrainingData([message]))

    featurizer.train(
        TrainingData([message], regex_features=patterns), RasaNLUModelConfig()
    )

    persist_value = featurizer.persist("ftr", str(tmp_path))

    # Test all artifacts stored as part of persist
    assert persist_value["file"] == "ftr"
    assert (tmp_path / "ftr.patterns.pkl").exists()
    assert (tmp_path / "ftr.vocabulary_stats.pkl").exists()
    assert featurizer.vocabulary_stats == {
        "max_number_patterns": 8,
        "pattern_slots_filled": 3,
    }

    loaded_featurizer = RegexFeaturizer.load(
        meta={"number_additional_patterns": 5, "file": persist_value["file"],},
        should_finetune=True,
        model_dir=str(tmp_path),
    )

    # Test component loaded in finetune mode and also with
    # same patterns as before and vocabulary statistics
    assert loaded_featurizer.known_patterns == featurizer.known_patterns
    assert loaded_featurizer.finetune_mode
    assert loaded_featurizer.pattern_vocabulary_stats == featurizer.vocabulary_stats

    new_lookups = [{"name": "plates", "elements": "data/test/lookup_tables/plates.txt"}]

    training_data = TrainingData()
    training_data.lookup_tables = new_lookups
    loaded_featurizer.train(training_data)

    # Test merging of a new pattern to an already trained component.
    assert len(loaded_featurizer.known_patterns) == 4
    assert loaded_featurizer.vocabulary_stats == {
        "max_number_patterns": 8,
        "pattern_slots_filled": 4,
    }
def test_non_word_boundaries(
    text: Text,
    lookup: Dict[Text, List[Text]],
    non_word_boundary: List[Text],
    expected_entities: List[Dict[Text, Any]],
):
    message = Message(data={TEXT: text})
    training_data = TrainingData()
    training_data.lookup_tables = [lookup]
    training_data.training_examples = [
        Message(
            data={
                TEXT: "I love New York",
                INTENT: "inform",
                ENTITIES: [{
                    "entity": "city",
                    "value": "New York"
                }],
            }),
        Message(
            data={
                TEXT: "I live in Berlin",
                INTENT: "inform",
                ENTITIES: [{
                    "entity": "city",
                    "value": "Berlin"
                }],
            }),
        Message(
            data={
                TEXT: "I like apples",
                INTENT: "inform",
                ENTITIES: [{
                    "entity": "fruit",
                    "value": "apples"
                }],
            }),
        Message(
            data={
                TEXT: "oranges are my fave",
                INTENT: "inform",
                ENTITIES: [{
                    "entity": "fruit",
                    "value": "oranges"
                }],
            }),
    ]

    entity_extractor = FlashTextEntityExtractor(
        {"non_word_boundaries": non_word_boundary})
    entity_extractor.train(training_data)
    entity_extractor.process(message)

    entities = [e["value"] for e in message.get(ENTITIES)]
    assert entities == expected_entities
def test_extract_patterns(
    lookup_tables: Dict[Text, List[Text]],
    regex_features: Dict[Text, Text],
    expected_patterns: Dict[Text, Text],
):
    training_data = TrainingData()
    if lookup_tables:
        training_data.lookup_tables = [lookup_tables]
    if regex_features:
        training_data.regex_features = [regex_features]

    actual_patterns = pattern_utils.extract_patterns(training_data)

    assert actual_patterns == expected_patterns
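# For context: pattern_utils.extract_patterns returns a list of
# {"name": ..., "pattern": ...} dicts. A hypothetical case (not from the
# original parametrization): a declared regex feature passes through largely
# unchanged, e.g.
#
#     regex_features = {"name": "zipcode", "pattern": "[0-9]{5}"}
#     expected_patterns = [{"name": "zipcode", "pattern": "[0-9]{5}"}]
#
# whereas a lookup table is compiled into a single regex alternation over its
# elements (the exact pattern string depends on the Rasa version).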
def test_lookup_tables(
    sentence: Text,
    expected_sequence_features: List[float],
    expected_sentence_features: List[float],
    labeled_tokens: List[float],
    spacy_nlp: Any,
):
    from rasa.nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer

    lookups = [
        {
            "name": "drinks",
            "elements": [
                "mojito", "lemonade", "sweet berry wine", "tea", "club?mate"
            ],
        },
        {"name": "plates", "elements": "data/test/lookup_tables/plates.txt"},
    ]
    ftr = RegexFeaturizer({"number_additional_patterns": 0})
    training_data = TrainingData()
    training_data.lookup_tables = lookups
    ftr.train(training_data)

    # adds tokens to the message
    component_config = {"name": "SpacyTokenizer"}
    tokenizer = SpacyTokenizer(component_config)
    message = Message(data={TEXT: sentence})
    message.set("text_spacy_doc", spacy_nlp(sentence))
    tokenizer.process(message)

    sequence_features, sentence_features = ftr._features_for_patterns(
        message, TEXT)
    assert np.allclose(sequence_features.toarray(),
                       expected_sequence_features,
                       atol=1e-10)
    assert np.allclose(sentence_features.toarray(),
                       expected_sentence_features,
                       atol=1e-10)

    # the tokenizer should have added tokens
    assert len(message.get(TOKENS_NAMES[TEXT], [])) > 0
    # the number of regex matches on each token should match
    for i, token in enumerate(message.get(TOKENS_NAMES[TEXT])):
        token_matches = token.get("pattern").values()
        num_matches = sum(token_matches)
        assert num_matches == labeled_tokens.count(i)
def test_train_and_process(
    create_or_load_extractor: Callable[..., RegexEntityExtractorGraphComponent],
    config: Dict[Text, Any],
    text: Text,
    lookup: List[Dict[Text, List[Text]]],
    expected_entities: List[Dict[Text, Any]],
    test_loading: bool,
):
    message = Message(data={TEXT: text})
    if test_loading:
        message_copy = copy.deepcopy(message)

    training_data = TrainingData()
    training_data.lookup_tables = lookup
    training_data.training_examples = [
        Message(
            data={
                TEXT: "Hi Max!",
                INTENT: "greet",
                ENTITIES: [{
                    "entity": "person",
                    "value": "Max"
                }],
            }),
        Message(
            data={
                TEXT: "I live in Berlin",
                INTENT: "inform",
                ENTITIES: [{
                    "entity": "city",
                    "value": "Berlin"
                }],
            }),
    ]

    entity_extractor = create_or_load_extractor(config)
    entity_extractor.train(training_data)
    entity_extractor.process([message])
    entities = message.get(ENTITIES)
    assert entities == expected_entities

    if test_loading:
        loaded_entity_extractor = create_or_load_extractor(config, load=True)
        loaded_entity_extractor.process([message_copy])
        assert loaded_entity_extractor.patterns == entity_extractor.patterns
def test_extract_patterns_use_only_lookup_tables_or_regex_features(
    lookup_tables: Dict[Text, List[Text]],
    regex_features: Dict[Text, Text],
    use_lookup_tables: bool,
    use_regex_features: bool,
    expected_patterns: Dict[Text, Text],
):
    training_data = TrainingData()
    if lookup_tables:
        training_data.lookup_tables = [lookup_tables]
    if regex_features:
        training_data.regex_features = [regex_features]

    actual_patterns = pattern_utils.extract_patterns(
        training_data,
        use_lookup_tables=use_lookup_tables,
        use_regexes=use_regex_features,
    )

    assert actual_patterns == expected_patterns
def test_extract_patterns_use_only_entities_lookup_tables(
    entity: Text,
    lookup_tables: Dict[Text, Text],
    expected_patterns: Dict[Text, Text],
):
    training_data = TrainingData()
    if entity:
        training_data.training_examples = [
            Message(data={
                "text": "text",
                "entities": [{
                    "entity": entity,
                    "value": "text"
                }]
            })
        ]
    if lookup_tables:
        training_data.lookup_tables = [lookup_tables]

    actual_patterns = pattern_utils.extract_patterns(training_data,
                                                     use_only_entities=True)

    assert actual_patterns == expected_patterns
def test_regex_validation(
    lookup_tables: Dict[Text, List[Text]],
    regex_features: Dict[Text, Text],
    use_lookup_tables: bool,
    use_regex_features: bool,
):
    """Tests if exception is raised when regex patterns are invalid."""

    training_data = TrainingData()
    if lookup_tables:
        training_data.lookup_tables = [lookup_tables]
    if regex_features:
        training_data.regex_features = [regex_features]

    with pytest.raises(Exception) as e:
        pattern_utils.extract_patterns(
            training_data,
            use_lookup_tables=use_lookup_tables,
            use_regexes=use_regex_features,
        )

    assert "Model training failed." in str(e.value)
    assert "not a valid regex." in str(e.value)
    assert "Please update your nlu training data configuration" in str(e.value)