# NOTE: Import paths below assume the Rasa 1.10.x module layout; adjust them
# for other versions (e.g. Rasa 2.x moved several of these under rasa.shared.nlu).
from typing import Any, Dict, List, Text

import numpy as np

import rasa.nlu.utils.pattern_utils as pattern_utils
from rasa.nlu.constants import ENTITIES, TEXT, TOKENS_NAMES
from rasa.nlu.extractors.regex_entity_extractor import RegexEntityExtractor
from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer
from rasa.nlu.training_data import Message, TrainingData


def test_process(
    text: Text,
    lookup: List[Dict[Text, List[Text]]],
    expected_entities: List[Dict[Text, Any]],
):
    message = Message(text)

    training_data = TrainingData()
    training_data.lookup_tables = lookup
    training_data.training_examples = [
        Message("Hi Max!", data={"entities": [{"entity": "person", "value": "Max"}]}),
        Message(
            "I live in Berlin",
            data={"entities": [{"entity": "city", "value": "Berlin"}]},
        ),
    ]

    entity_extractor = RegexEntityExtractor()
    entity_extractor.train(training_data)
    entity_extractor.process(message)

    entities = message.get(ENTITIES)

    assert entities == expected_entities
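# One hypothetical parameter set that could drive test_process above via
# @pytest.mark.parametrize("text, lookup, expected_entities", [...]). The
# values mirror the Berlin example asserted in
# test_do_not_overwrite_any_entities below; treat them as illustrative.
SAMPLE_PROCESS_CASE = (
    "Max lives in Berlin.",
    [{"name": "city", "elements": ["London", "Berlin", "Amsterdam"]}],
    [
        {
            "entity": "city",
            "value": "Berlin",
            "start": 13,
            "end": 19,
            "extractor": "RegexEntityExtractor",
        }
    ],
)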
def test_extract_patterns(
    lookup_tables: Dict[Text, List[Text]],
    regex_features: Dict[Text, Text],
    expected_patterns: List[Dict[Text, Text]],
):
    training_data = TrainingData()
    if lookup_tables:
        training_data.lookup_tables = [lookup_tables]
    if regex_features:
        training_data.regex_features = [regex_features]

    actual_patterns = pattern_utils.extract_patterns(training_data)

    assert actual_patterns == expected_patterns
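# A hypothetical case for test_extract_patterns above. It assumes that
# pattern_utils.extract_patterns passes a plain regex feature through
# unchanged when no lookup tables are present; the "zipcode" name and
# pattern are made up for illustration.
SAMPLE_EXTRACT_PATTERNS_CASE = (
    {},  # lookup_tables: none
    {"name": "zipcode", "pattern": "[0-9]{5}"},  # regex_features
    [{"name": "zipcode", "pattern": "[0-9]{5}"}],  # expected_patterns
)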
def test_do_not_overwrite_any_entities():
    # the "person" entity is set before processing and must survive unchanged
    message = Message("Max lives in Berlin.")
    message.set(ENTITIES, [{"entity": "person", "value": "Max", "start": 0, "end": 3}])

    training_data = TrainingData()
    training_data.training_examples = [
        Message("Hi Max!", data={"entities": [{"entity": "person", "value": "Max"}]}),
        Message(
            "I live in Berlin",
            data={"entities": [{"entity": "city", "value": "Berlin"}]},
        ),
    ]
    training_data.lookup_tables = [
        {"name": "city", "elements": ["London", "Berlin", "Amsterdam"]}
    ]

    entity_extractor = RegexEntityExtractor()
    entity_extractor.train(training_data)
    entity_extractor.process(message)

    entities = message.get(ENTITIES)

    # the extractor appends its "city" match without touching the existing entity
    assert entities == [
        {"entity": "person", "value": "Max", "start": 0, "end": 3},
        {
            "entity": "city",
            "value": "Berlin",
            "start": 13,
            "end": 19,
            "extractor": "RegexEntityExtractor",
        },
    ]
def test_extract_patterns_use_only_entities_lookup_tables(
    entity: Text,
    lookup_tables: Dict[Text, List[Text]],
    expected_patterns: List[Dict[Text, Text]],
):
    training_data = TrainingData()
    if entity:
        training_data.training_examples = [
            Message("text", data={"entities": [{"entity": entity, "value": "text"}]})
        ]
    if lookup_tables:
        training_data.lookup_tables = [lookup_tables]

    actual_patterns = pattern_utils.extract_patterns(
        training_data, use_only_entities=True
    )

    assert actual_patterns == expected_patterns
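# A hypothetical negative case for the test above: with use_only_entities=True,
# a lookup table whose name matches no entity annotated in the training
# examples should be filtered out entirely. The names are illustrative.
SAMPLE_ONLY_ENTITIES_CASE = (
    "person",  # the only entity annotated in the training examples
    {"name": "city", "elements": ["Berlin", "Amsterdam"]},  # unrelated table
    [],  # expected_patterns: the "city" table is dropped
)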
def test_extract_patterns_use_only_lookup_tables_or_regex_features(
    lookup_tables: Dict[Text, List[Text]],
    regex_features: Dict[Text, Text],
    use_lookup_tables: bool,
    use_regex_features: bool,
    expected_patterns: List[Dict[Text, Text]],
):
    training_data = TrainingData()
    if lookup_tables:
        training_data.lookup_tables = [lookup_tables]
    if regex_features:
        training_data.regex_features = [regex_features]

    actual_patterns = pattern_utils.extract_patterns(
        training_data,
        use_lookup_tables=use_lookup_tables,
        use_regexes=use_regex_features,
    )

    assert actual_patterns == expected_patterns
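# A hypothetical case for the toggle flags above: with use_lookup_tables=False
# the lookup table is ignored, and with use_regexes=True the regex feature is
# assumed to pass through unchanged. All names and patterns are illustrative.
SAMPLE_TOGGLE_CASE = (
    {"name": "city", "elements": ["Berlin", "Amsterdam"]},  # lookup_tables
    {"name": "zipcode", "pattern": "[0-9]{5}"},  # regex_features
    False,  # use_lookup_tables
    True,  # use_regex_features
    [{"name": "zipcode", "pattern": "[0-9]{5}"}],  # expected_patterns
)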
def test_lookup_tables(sentence, expected, labeled_tokens, spacy_nlp):
    from rasa.nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer

    lookups = [
        {
            "name": "drinks",
            "elements": ["mojito", "lemonade", "sweet berry wine", "tea", "club?mate"],
        },
        {"name": "plates", "elements": "data/test/lookup_tables/plates.txt"},
    ]
    ftr = RegexFeaturizer()
    training_data = TrainingData()
    training_data.lookup_tables = lookups
    ftr.train(training_data)

    # adds tokens to the message
    component_config = {"name": "SpacyTokenizer"}
    tokenizer = SpacyTokenizer(component_config)
    message = Message(sentence)
    message.set("text_spacy_doc", spacy_nlp(sentence))
    tokenizer.process(message)

    sequence_features, sentence_features = ftr._features_for_patterns(message, TEXT)
    assert np.allclose(sequence_features.toarray(), expected[:-1], atol=1e-10)
    assert np.allclose(sentence_features.toarray(), expected[-1], atol=1e-10)

    # the tokenizer should have added tokens
    assert len(message.get(TOKENS_NAMES[TEXT], [])) > 0

    # the number of regex matches on each token should match
    for i, token in enumerate(message.get(TOKENS_NAMES[TEXT])):
        token_matches = token.get("pattern").values()
        num_matches = sum(token_matches)
        assert num_matches == labeled_tokens.count(i)
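# A hypothetical parameter set for test_lookup_tables above. It assumes two
# patterns ordered as ("drinks", "plates") and a sentence-level row equal to
# the element-wise maximum over the token rows; both are assumptions about
# RegexFeaturizer's output, so treat every number here as illustrative.
SAMPLE_LOOKUP_CASE = (
    "I want a mojito",  # sentence; "mojito" appears in the "drinks" table
    [
        [0.0, 0.0],  # "I"
        [0.0, 0.0],  # "want"
        [0.0, 0.0],  # "a"
        [1.0, 0.0],  # "mojito" matches the "drinks" pattern
        [1.0, 0.0],  # sentence-level features (expected[-1])
    ],
    [3],  # labeled_tokens: token index 3 has exactly one pattern match
)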