def test_lookup_tables_without_use_word_boundaries(sentence, tokens, expected, labeled_tokens):
    from rasa.nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer
    from rasa.nlu.tokenizers.tokenizer import Token

    lookups = [
        {"name": "cites", "elements": ["北京", "上海", "广州", "深圳", "杭州"]},
        {"name": "dates", "elements": ["昨天", "今天", "明天", "后天"]},
    ]
    ftr = RegexFeaturizer({"use_word_boundaries": False})
    training_data = TrainingData()
    training_data.lookup_tables = lookups
    ftr.train(training_data)

    # adds tokens to the message
    message = Message(data={TEXT: sentence})
    message.set(TOKENS_NAMES[TEXT], [Token(word, start) for (word, start) in tokens])

    sequence_features, sentence_features = ftr._features_for_patterns(message, TEXT)
    assert np.allclose(sequence_features.toarray(), expected[:-1], atol=1e-10)
    assert np.allclose(sentence_features.toarray(), expected[-1], atol=1e-10)

    # the number of regex matches on each token should match
    for i, token in enumerate(message.get(TOKENS_NAMES[TEXT])):
        token_matches = token.get("pattern").values()
        num_matches = sum(token_matches)
        assert num_matches == labeled_tokens.count(i)
def process(self, message: Message, **kwargs: Any) -> None:
    from seq2annotation.server.paddle_inference import Inference

    real_result_dir = os.path.join(self.model_dir, self.result_dir)
    print(real_result_dir)

    # for cache
    if not self.predict_fn:
        self.predict_fn = Inference(real_result_dir)

    input_text = message.text
    seq = self.predict_fn.infer(input_text)
    seq.span_set.fill_text(input_text)

    entity_set = []
    for span in seq.span_set:
        ent = {
            "entity": span.entity,
            "value": span.value,
            "start": span.start,
            "confidence": None,
            "end": span.end,
        }
        entity_set.append(ent)

    extracted = self.add_extractor_name(entity_set)
    message.set(
        "entities", message.get("entities", []) + extracted, add_to_output=True
    )
def process(self, message: Message, **kwargs: Any) -> None:
    from seq2label.input import to_fixed_len

    input_text = message.text

    input_feature = {
        'words': [to_fixed_len([i for i in input_text], 20, '<pad>')],
    }
    print(input_feature)

    predictions = self.predict_fn(input_feature)
    label = predictions['label'][0].decode()

    intent = {"name": label, "confidence": 1}

    ranking = zip(
        [i.decode() for i in predictions['label_mapping']],
        [float(i) for i in predictions['label_prob'][0]],
    )
    intent_ranking = [
        {"name": name, "confidence": score} for name, score in ranking
    ]

    message.set("intent", intent, add_to_output=True)
    message.set("intent_ranking", intent_ranking, add_to_output=True)
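# A minimal sketch (not the seq2label implementation) of what a fixed-length
# padding helper such as `to_fixed_len` is assumed to do above: truncate or
# right-pad a character list to a target length with a pad symbol. The name
# `pad_to_fixed_len` below is a hypothetical stand-in for illustration only.
def pad_to_fixed_len(tokens, length, pad_token):
    return (list(tokens) + [pad_token] * length)[:length]


# e.g. pad_to_fixed_len("hi", 5, "<pad>") -> ["h", "i", "<pad>", "<pad>", "<pad>"]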
def process(self, message: Message, **kwargs: Any) -> None:
    extracted_entities = self._extract_entities(message)
    extracted_entities = self.add_extractor_name(extracted_entities)
    message.set(
        ENTITIES, message.get(ENTITIES, []) + extracted_entities, add_to_output=True
    )
def test_regex_featurizer_case_sensitive(
    sentence: Text,
    expected_sequence_features: List[float],
    expected_sentence_features: List[float],
    case_sensitive: bool,
    spacy_nlp: Any,
):
    patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]
    ftr = RegexFeaturizer(
        {"case_sensitive": case_sensitive, "number_additional_patterns": 0},
        known_patterns=patterns,
    )

    # adds tokens to the message
    tokenizer = SpacyTokenizer()
    message = Message(data={TEXT: sentence})
    message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))
    tokenizer.process(message)

    sequence_features, sentence_features = ftr._features_for_patterns(message, TEXT)
    assert np.allclose(
        sequence_features.toarray()[0], expected_sequence_features, atol=1e-10
    )
    assert np.allclose(
        sentence_features.toarray()[-1], expected_sentence_features, atol=1e-10
    )
def process(self, message: Message, **kwargs: Any) -> None:
    urls: Set[Any] = set()
    last_pos = 0
    for url in self.extractor.gen_urls(message.data.get("text")):
        start = message.data.get("text").find(url, last_pos)
        end = start + len(url)
        last_pos = end
        urls.add(
            tuple(
                {
                    "start": start,
                    "end": end,
                    "value": url,
                    "entity": "URL",
                    "extractor": self.name,
                    "confidence": 1.0,
                }.items()
            )
        )

    entities = message.get("entities", []) + list(
        sorted(map(dict, urls), key=lambda x: x.get("start"))  # type: ignore
    )
    message.set(
        "entities",
        sorted(entities, key=lambda x: x.get("confidence", 0), reverse=True),
        add_to_output=True,
    )
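# A minimal standalone sketch of the URL detection used above, assuming the
# `urlextract` package (whose `URLExtract.gen_urls` generator is what
# `self.extractor` is expected to provide). The sample text is illustrative.
from urlextract import URLExtract

extractor = URLExtract()
text = "docs are at https://rasa.com and the demo at https://example.org"
last_pos = 0
for url in extractor.gen_urls(text):
    start = text.find(url, last_pos)  # same offset strategy as the component
    last_pos = start + len(url)
    print(url, start, last_pos)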
def process(self, message: Message, **kwargs: Any) -> None:
    if self._url() is not None:
        # mod >
        params = kwargs
        timezone = self._timezone_from_config_or_request(
            self.component_config, params.get("timezone", None)
        )
        reference_time = self._reference_time_from_message_or_request(
            message, params.get("reference_time", None)
        )
        matches = self._duckling_parse(message.text, reference_time, timezone)
        # </ mod
        all_extracted = convert_duckling_format_to_rasa(matches)
        dimensions = self.component_config["dimensions"]
        extracted = DucklingEntityExtractor.filter_irrelevant_entities(
            all_extracted, dimensions
        )
    else:
        extracted = []
        raise_warning(
            "Duckling HTTP component in pipeline, but no "
            "`url` configuration in the config "
            "file nor is `RASA_DUCKLING_HTTP_URL` "
            "set as an environment variable. No entities will be extracted!",
            docs=DOCS_URL_COMPONENTS + "#ducklinghttpextractor",
        )

    extracted = self.add_extractor_name(extracted)
    message.set(
        ENTITIES, message.get(ENTITIES, []) + extracted, add_to_output=True,
    )
def test_count_vector_featurizer_using_tokens(tokens, expected):
    ftr = CountVectorsFeaturizer()

    # use an empty string instead of real text to make sure the count vector
    # can only come from the `tokens` feature; using `message.text` would not
    # give the correct result
    tokens_feature = [Token(i, 0) for i in tokens]

    train_message = Message(data={TEXT: ""})
    train_message.set(TOKENS_NAMES[TEXT], tokens_feature)

    data = TrainingData([train_message])
    ftr.train(data)

    test_message = Message(data={TEXT: ""})
    test_message.set(TOKENS_NAMES[TEXT], tokens_feature)
    ftr.process(test_message)

    seq_vec, sen_vec = train_message.get_sparse_features(TEXT, [])
    if seq_vec:
        seq_vec = seq_vec.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert np.all(seq_vec.toarray()[0] == expected)
    assert sen_vec is not None
def process(self, message: Message, **kwargs: Any) -> None:
    entities = message.get("entities", [])
    new_entities = []

    for entity in entities:
        config = self._find_entity(entity, self.entities)
        if config is None or not isinstance(entity["value"], str):
            new_entities.append(entity)
            continue

        matches = process.extract(
            entity["value"],
            self.gazette.get(entity["entity"], []),
            limit=self.limit,
            scorer=config["mode"],
        )
        primary, score = matches[0] if len(matches) else (None, None)

        if primary is not None and score > config["min_score"]:
            entity["value"] = primary
            entity["gazette_matches"] = [
                {"value": value, "score": num} for value, num in matches
            ]

        new_entities.append(entity)

    message.set("entities", new_entities)
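# A minimal sketch of the fuzzy lookup used above, assuming the `fuzzywuzzy`
# package. `process.extract` expects a scoring callable such as `fuzz.ratio`;
# the component's configured "mode" is assumed to resolve to such a callable.
from fuzzywuzzy import fuzz, process as fuzzy_process

matches = fuzzy_process.extract(
    "Berln", ["Berlin", "London", "Amsterdam"], limit=3, scorer=fuzz.ratio
)
primary, score = matches[0]  # e.g. ("Berlin", 91); the exact score may vary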
def test_regex_featurizer_no_sequence(
    sentence: Text,
    expected_sequence_features: List[float],
    expected_sentence_features: List[float],
    spacy_nlp: Any,
    create_featurizer: Callable[..., RegexFeaturizer],
    spacy_tokenizer: SpacyTokenizer,
):
    patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]
    ftr = create_featurizer(known_patterns=patterns)

    # adds tokens to the message
    message = Message(data={TEXT: sentence})
    message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))
    spacy_tokenizer.process([message])

    sequence_features, sentence_features = ftr._features_for_patterns(message, TEXT)
    assert np.allclose(
        sequence_features.toarray()[0], expected_sequence_features, atol=1e-10
    )
    assert np.allclose(
        sentence_features.toarray()[-1], expected_sentence_features, atol=1e-10
    )
def test_text_featurizer_using_pos(sentence, expected, spacy_nlp):
    featurizer = LexicalSyntacticFeaturizer({"features": [["pos", "pos2"]]})

    train_message = Message(data={TEXT: sentence})
    test_message = Message(data={TEXT: sentence})

    train_message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))
    test_message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))

    SpacyTokenizer().process(train_message)
    SpacyTokenizer().process(test_message)

    featurizer.train(TrainingData([train_message]))
    featurizer.process(test_message)

    seq_vec, sen_vec = test_message.get_sparse_features(TEXT, [])
    if seq_vec:
        seq_vec = seq_vec.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert isinstance(seq_vec, scipy.sparse.coo_matrix)
    assert sen_vec is None

    assert np.all(seq_vec.toarray() == expected)
def process(self, message: Message, **kwargs: Any) -> None: """Return the most likely intent and its probability for a message.""" if not self.clf: # component is either not trained or didn't # receive enough training data intent = None intent_ranking = [] else: X = self._get_sentence_features(message).reshape(1, -1) intent_ids, probabilities = self.predict(X) intents = self.transform_labels_num2str(np.ravel(intent_ids)) # `predict` returns a matrix as it is supposed # to work for multiple examples as well, hence we need to flatten probabilities = probabilities.flatten() if intents.size > 0 and probabilities.size > 0: ranking = list(zip(list(intents), list(probabilities)))[:LABEL_RANKING_LENGTH] intent = {"name": intents[0], "confidence": probabilities[0]} intent_ranking = [{ "name": intent_name, "confidence": score } for intent_name, score in ranking] else: intent = {"name": None, "confidence": 0.0} intent_ranking = [] message.set("intent", intent, add_to_output=True) message.set("intent_ranking", intent_ranking, add_to_output=True)
def test_process_does_not_overwrite_any_entities(
    create_or_load_extractor: Callable[..., RegexEntityExtractorGraphComponent],
):
    pre_existing_entity = {
        ENTITY_ATTRIBUTE_TYPE: "person",
        ENTITY_ATTRIBUTE_VALUE: "Max",
        ENTITY_ATTRIBUTE_START: 0,
        ENTITY_ATTRIBUTE_END: 3,
        EXTRACTOR: "other extractor",
    }
    message = Message(data={TEXT: "Max lives in Berlin.", INTENT: "infrom"})
    message.set(ENTITIES, [copy.deepcopy(pre_existing_entity)])

    training_data = TrainingData()
    training_data.training_examples = [
        Message(
            data={
                TEXT: "Hi Max!",
                INTENT: "greet",
                ENTITIES: [
                    {ENTITY_ATTRIBUTE_TYPE: "person", ENTITY_ATTRIBUTE_VALUE: "Max"}
                ],
            }
        ),
        Message(
            data={
                TEXT: "I live in Berlin",
                INTENT: "inform",
                ENTITIES: [
                    {ENTITY_ATTRIBUTE_TYPE: "city", ENTITY_ATTRIBUTE_VALUE: "Berlin"}
                ],
            }
        ),
    ]
    training_data.lookup_tables = [
        {"name": "city", "elements": ["London", "Berlin", "Amsterdam"]}
    ]

    entity_extractor = create_or_load_extractor(config={})
    entity_extractor.train(training_data)
    entity_extractor.process([message])

    entities = message.get(ENTITIES)
    assert entities == [
        pre_existing_entity,
        {
            ENTITY_ATTRIBUTE_TYPE: "city",
            ENTITY_ATTRIBUTE_VALUE: "Berlin",
            ENTITY_ATTRIBUTE_START: 13,
            ENTITY_ATTRIBUTE_END: 19,
            EXTRACTOR: RegexEntityExtractorGraphComponent.__name__,
        },
    ]
def test_convert_featurizer_process(component_builder, monkeypatch: MonkeyPatch):
    monkeypatch.setattr(
        ConveRTTokenizer, "_get_validated_model_url", lambda x: RESTRICTED_ACCESS_URL
    )
    component_config = {"name": "ConveRTTokenizer", "model_url": RESTRICTED_ACCESS_URL}
    tokenizer = ConveRTTokenizer(component_config)
    featurizer = component_builder.create_component_from_class(ConveRTFeaturizer)
    sentence = "Hey how are you today ?"
    message = Message(data={TEXT: sentence})
    tokens = tokenizer.tokenize(message, attribute=TEXT)
    message.set(TOKENS_NAMES[TEXT], tokens)

    featurizer.process(message, tf_hub_module=tokenizer.module)

    expected = np.array([2.2636216, -0.26475656, -1.1358104, -0.49751878, -1.3946456])
    expected_cls = np.array(
        [1.0251294, -0.04053932, -0.7018805, -0.82054937, -0.75054353]
    )

    seq_vecs, sent_vecs = message.get_dense_features(TEXT, [])

    seq_vecs = seq_vecs.features
    sent_vecs = sent_vecs.features

    assert len(tokens) == len(seq_vecs)
    assert np.allclose(seq_vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(sent_vecs[-1][:5], expected_cls, atol=1e-5)
def process(self, message: Message, **kwargs: Any) -> None: """Process an incoming message. This is the components chance to process an incoming message. The component can rely on any context attribute to be present, that gets created by a call to :meth:`components.Component.pipeline_init` of ANY component and on any context attributes created by a call to :meth:`components.Component.process` of components previous to this one.""" res = self.client.concepts.extract(message.get(TEXT), lang=self.lang, properties=self.properties, split=self.split, precision=self.precision) concepts = [] for concept in res.concepts: for label in concept.labels: for mention in label.mentions: concepts.append({ "value": label.text, "start": mention.start, "end": mention.end, "entity": concept.id, "properties": concept.properties, "confidence": concept.weight }) all_extracted = self.add_extractor_name(concepts) dimensions = self.component_config.get("dimensions") extracted = self.filter_irrelevant_entities(all_extracted, dimensions) extracted = self.add_extractor_name(extracted) message.set("concepts", message.get("concepts", []) + extracted, add_to_output=True)
def process(self, message: Message, **kwargs: Any) -> None: """Process an incoming message. This is the components chance to process an incoming message. The component can rely on any context attribute to be present, that gets created by a call to :meth:`components.Component.pipeline_init` of ANY component and on any context attributes created by a call to :meth:`components.Component.process` of components previous to this one.""" res = self.client.entities.extract(message.get(TEXT)) entities = [{ "entity": ent.ent_type, "value": ent.text, "start": ent.start, "confidence": None, "end": ent.end, } for ent in res.entities] all_extracted = self.add_extractor_name(entities) dimensions = self.component_config.get("dimensions") extracted = self.filter_irrelevant_entities(all_extracted, dimensions) extracted = self.add_extractor_name(extracted) message.set(ENTITIES, message.get(ENTITIES, []) + extracted, add_to_output=True)
def test_regex_featurizer_no_sequence(sentence, expected, expected_cls, spacy_nlp):
    patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]
    ftr = RegexFeaturizer({}, known_patterns=patterns)

    # adds tokens to the message
    tokenizer = SpacyTokenizer()
    message = Message(data={TEXT: sentence})
    message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))
    tokenizer.process(message)

    sequence_features, sentence_features = ftr._features_for_patterns(message, TEXT)
    assert np.allclose(sequence_features.toarray()[0], expected, atol=1e-10)
    assert np.allclose(sentence_features.toarray()[-1], expected_cls, atol=1e-10)
def test_convert_featurizer_process(component_builder):
    tokenizer = component_builder.create_component_from_class(ConveRTTokenizer)
    featurizer = component_builder.create_component_from_class(ConveRTFeaturizer)
    sentence = "Hey how are you today ?"
    message = Message(data={TEXT: sentence})
    tokens = tokenizer.tokenize(message, attribute=TEXT)
    message.set(TOKENS_NAMES[TEXT], tokens)

    featurizer.process(message, tf_hub_module=tokenizer.module)

    expected = np.array([2.2636216, -0.26475656, -1.1358104, -0.49751878, -1.3946456])
    expected_cls = np.array(
        [1.0251294, -0.04053932, -0.7018805, -0.82054937, -0.75054353]
    )

    seq_vecs, sent_vecs = message.get_dense_features(TEXT, [])

    seq_vecs = seq_vecs.features
    sent_vecs = sent_vecs.features

    assert len(tokens) == len(seq_vecs)
    assert np.allclose(seq_vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(sent_vecs[-1][:5], expected_cls, atol=1e-5)
def tokenize(self, message: Message, attribute: Text) -> List[Token]:
    """Tokenize the text using the ConveRT model.

    ConveRT adds a special char in front of (some) words and splits words into
    sub-words. To ensure the entity start and end values match the token
    values, reuse the tokens that are already assigned to the message. If
    individual tokens are split up into multiple tokens, add this information
    to the respective tokens.
    """
    tokens_in = message.get(TOKENS_NAMES[attribute])
    tokens_out = []

    for token in tokens_in:
        # use ConveRT model to tokenize the text
        split_token_strings = self._tokenize(token.text)[0]

        # clean tokens (remove special chars and empty tokens)
        split_token_strings = self._clean_tokens(split_token_strings)

        token.set(NUMBER_OF_SUB_TOKENS, len(split_token_strings))

        tokens_out.append(token)

    message.set(TOKENS_NAMES[attribute], tokens_out)
    return tokens_out
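# A minimal sketch of the sub-token bookkeeping done above: the original token
# (and hence the entity offsets) is kept, and only the number of sub-words is
# recorded on it. The split below stands in for the ConveRT sub-word tokenizer,
# and the constant is assumed to live in `rasa.nlu.constants` as in Rasa 2.x.
from rasa.nlu.constants import NUMBER_OF_SUB_TOKENS
from rasa.nlu.tokenizers.tokenizer import Token

token = Token("tokenization", 0)
split_token_strings = ["token", "ization"]  # assumed sub-word splitter output
token.set(NUMBER_OF_SUB_TOKENS, len(split_token_strings))
assert token.get(NUMBER_OF_SUB_TOKENS) == 2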
def test_count_vector_featurizer_use_lemma(
    spacy_nlp: Any,
    sentence: Text,
    sequence_features: List[List[int]],
    sentence_features: List[List[int]],
    use_lemma: bool,
):
    ftr = CountVectorsFeaturizer({"use_lemma": use_lemma})

    train_message = Message(data={TEXT: sentence})
    train_message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))
    test_message = Message(data={TEXT: sentence})
    test_message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))

    SpacyTokenizer().process(train_message)
    SpacyTokenizer().process(test_message)

    ftr.train(TrainingData([train_message]))
    ftr.process(test_message)

    seq_vecs, sen_vecs = test_message.get_sparse_features(TEXT, [])

    assert isinstance(seq_vecs.features, scipy.sparse.coo_matrix)
    assert isinstance(sen_vecs.features, scipy.sparse.coo_matrix)

    actual_seq_vecs = seq_vecs.features.toarray()
    actual_sen_vecs = sen_vecs.features.toarray()

    assert np.all(actual_seq_vecs[0] == sequence_features)
    assert np.all(actual_sen_vecs[-1] == sentence_features)
def test_spacy_featurizer_sequence(sentence, expected, spacy_nlp):
    from rasa.nlu.featurizers.dense_featurizer.spacy_featurizer import SpacyFeaturizer

    doc = spacy_nlp(sentence)
    token_vectors = [t.vector for t in doc]

    ftr = SpacyFeaturizer.create({}, RasaNLUModelConfig())

    greet = {TEXT: sentence, "intent": "greet", "text_features": [0.5]}
    message = Message(data=greet)
    message.set(SPACY_DOCS[TEXT], doc)

    ftr._set_spacy_features(message)

    seq_vecs, sen_vecs = message.get_dense_features(TEXT, [])
    if seq_vecs:
        seq_vecs = seq_vecs.features
    if sen_vecs:
        sen_vecs = sen_vecs.features

    vecs = seq_vecs[0][:5]

    assert np.allclose(token_vectors[0][:5], vecs, atol=1e-4)
    assert np.allclose(vecs, expected, atol=1e-4)
    assert sen_vecs is not None
def test_convert_training_examples(
    spacy_nlp: Any,
    text: Text,
    intent: Optional[Text],
    entities: Optional[List[Dict[Text, Any]]],
    attributes: List[Text],
    real_sparse_feature_sizes: Dict[Text, Dict[Text, List[int]]],
):
    message = Message(data={TEXT: text, INTENT: intent, ENTITIES: entities})

    tokenizer = SpacyTokenizer()
    count_vectors_featurizer = CountVectorsFeaturizer()
    spacy_featurizer = SpacyFeaturizer()

    message.set(SPACY_DOCS[TEXT], spacy_nlp(text))

    training_data = TrainingData([message])
    tokenizer.train(training_data)
    count_vectors_featurizer.train(training_data)
    spacy_featurizer.train(training_data)

    entity_tag_spec = [
        EntityTagSpec(
            "entity",
            {0: "O", 1: "name", 2: "location"},
            {"O": 0, "name": 1, "location": 2},
            3,
        )
    ]
    output, sparse_feature_sizes = model_data_utils.featurize_training_examples(
        [message], attributes=attributes, entity_tag_specs=entity_tag_spec,
    )

    assert len(output) == 1
    for attribute in attributes:
        assert attribute in output[0]
    for attribute in {INTENT, TEXT, ENTITIES} - set(attributes):
        assert attribute not in output[0]
    # we have sparse sentence, sparse sequence, dense sentence, and dense sequence
    # features in the list
    assert len(output[0][TEXT]) == 4
    if INTENT in attributes:
        # we will just have sparse sentence features
        assert len(output[0][INTENT]) == 1
    if ENTITIES in attributes:
        # we will just have sparse sentence features
        assert len(output[0][ENTITIES]) == len(entity_tag_spec)
    # check that it calculates sparse_feature_sizes correctly
    assert sparse_feature_sizes == real_sparse_feature_sizes
def test_persist_load_for_finetuning(
    create_featurizer: Callable[..., RegexFeaturizerGraphComponent],
    default_model_storage: ModelStorage,
    default_execution_context: ExecutionContext,
    resource: Resource,
    whitespace_tokenizer: WhitespaceTokenizerGraphComponent,
):
    patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]
    featurizer = create_featurizer()

    sentence = "hey how are you today 19.12.2019 ?"
    message = Message(data={TEXT: sentence})
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")
    training_data = TrainingData([message], regex_features=patterns)
    whitespace_tokenizer.process_training_data(training_data)

    featurizer.train(training_data)

    loaded_featurizer = RegexFeaturizerGraphComponent.load(
        RegexFeaturizerGraphComponent.get_default_config(),
        default_model_storage,
        resource,
        dataclasses.replace(default_execution_context, is_finetuning=True),
    )

    # Test component loaded in finetune mode and also with
    # same patterns as before and vocabulary statistics
    assert loaded_featurizer.known_patterns == featurizer.known_patterns
    assert loaded_featurizer.finetune_mode

    new_lookups = [
        {"name": "plates", "elements": "data/test/lookup_tables/plates.txt"}
    ]

    training_data = TrainingData()
    training_data.lookup_tables = new_lookups
    loaded_featurizer.train(training_data)

    # Test merging of a new pattern to an already trained component.
    assert len(loaded_featurizer.known_patterns) == 4
def process(self, message: Message, **kwargs: Any) -> None:
    intent_name = self._map_keyword_to_intent(message.get(TEXT))

    confidence = 0.0 if intent_name is None else 1.0
    intent = {"name": intent_name, "confidence": confidence}

    if message.get(INTENT) is None or intent is not None:
        message.set(INTENT, intent, add_to_output=True)
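# A hypothetical sketch (not the actual `_map_keyword_to_intent`) of a
# keyword-to-intent lookup like the one used above: return the intent whose
# keyword occurs in the text, else None. The keyword map shape is assumed.
import re

intent_keyword_map = {"hello": "greet", "bye": "goodbye"}


def map_keyword_to_intent(text):
    for keyword, intent in intent_keyword_map.items():
        if re.search(re.escape(keyword), text, flags=re.IGNORECASE):
            return intent
    return None


assert map_keyword_to_intent("Why hello there!") == "greet"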
def test_lookup_with_and_without_boundaries(
    sentence: Text,
    expected_sequence_features: List[List[float]],
    expected_sentence_features: List[float],
    labeled_tokens: List[float],
    use_word_boundaries: bool,
    spacy_nlp: Any,
):
    ftr = RegexFeaturizer(
        {"use_word_boundaries": use_word_boundaries, "number_additional_patterns": 0}
    )
    training_data = TrainingData()

    # we use lookups because the "use_word_boundaries" flag is only used when
    # producing patterns from lookup tables
    lookups = [{"name": "how", "elements": ["how"]}]
    training_data.lookup_tables = lookups
    ftr.train(training_data)

    # adds tokens to the message
    tokenizer = SpacyTokenizer()
    message = Message(data={TEXT: sentence})
    message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))
    tokenizer.process(message)

    (sequence_features, sentence_features) = ftr._features_for_patterns(message, TEXT)

    sequence_features = sequence_features.toarray()
    sentence_features = sentence_features.toarray()
    num_of_patterns = sum([len(lookup["elements"]) for lookup in lookups])
    assert sequence_features.shape == (
        len(message.get(TOKENS_NAMES[TEXT])),
        num_of_patterns,
    )
    num_of_lookup_tables = len(lookups)
    assert sentence_features.shape == (num_of_lookup_tables, num_of_patterns)

    # sequence_features should be {0,1} for each token: 1 if match, 0 if not
    assert np.allclose(sequence_features, expected_sequence_features, atol=1e-10)
    # sentence_features should be {0,1} for each lookup table: 1 if sentence
    # contains match from that table, 0 if not
    assert np.allclose(sentence_features, expected_sentence_features, atol=1e-10)

    # the tokenizer should have added tokens
    assert len(message.get(TOKENS_NAMES[TEXT], [])) > 0

    # the number of regex matches on each token should match
    for i, token in enumerate(message.get(TOKENS_NAMES[TEXT])):
        token_matches = token.get("pattern").values()
        num_matches = sum(token_matches)
        # labeled_tokens should list the token(s) which match a pattern
        assert num_matches == labeled_tokens.count(i)
def test_persist_load_for_finetuning(tmp_path: Path):
    patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]
    featurizer = RegexFeaturizer.create(
        {"number_additional_patterns": 5}, RasaNLUModelConfig()
    )

    sentence = "hey how are you today 19.12.2019 ?"
    message = Message(data={TEXT: sentence})
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")
    WhitespaceTokenizer().train(TrainingData([message]))

    featurizer.train(
        TrainingData([message], regex_features=patterns), RasaNLUModelConfig()
    )

    persist_value = featurizer.persist("ftr", str(tmp_path))

    # Test all artifacts stored as part of persist
    assert persist_value["file"] == "ftr"
    assert (tmp_path / "ftr.patterns.pkl").exists()
    assert (tmp_path / "ftr.vocabulary_stats.pkl").exists()
    assert featurizer.vocabulary_stats == {
        "max_number_patterns": 8,
        "pattern_slots_filled": 3,
    }

    loaded_featurizer = RegexFeaturizer.load(
        meta={"number_additional_patterns": 5, "file": persist_value["file"]},
        should_finetune=True,
        model_dir=str(tmp_path),
    )

    # Test component loaded in finetune mode and also with
    # same patterns as before and vocabulary statistics
    assert loaded_featurizer.known_patterns == featurizer.known_patterns
    assert loaded_featurizer.finetune_mode
    assert loaded_featurizer.pattern_vocabulary_stats == featurizer.vocabulary_stats

    new_lookups = [
        {"name": "plates", "elements": "data/test/lookup_tables/plates.txt"}
    ]

    training_data = TrainingData()
    training_data.lookup_tables = new_lookups
    loaded_featurizer.train(training_data)

    # Test merging of a new pattern to an already trained component.
    assert len(loaded_featurizer.known_patterns) == 4
    assert loaded_featurizer.vocabulary_stats == {
        "max_number_patterns": 8,
        "pattern_slots_filled": 4,
    }
def test_do_not_overwrite_any_entities():
    message = Message(data={TEXT: "Max lives in Berlin.", INTENT: "infrom"})
    message.set(ENTITIES, [{"entity": "person", "value": "Max", "start": 0, "end": 3}])

    training_data = TrainingData()
    training_data.training_examples = [
        Message(
            data={
                TEXT: "Hi Max!",
                INTENT: "greet",
                ENTITIES: [{"entity": "person", "value": "Max"}],
            }
        ),
        Message(
            data={
                TEXT: "I live in Berlin",
                INTENT: "inform",
                ENTITIES: [{"entity": "city", "value": "Berlin"}],
            }
        ),
    ]
    training_data.lookup_tables = [
        {"name": "city", "elements": ["London", "Berlin", "Amsterdam"]}
    ]

    entity_extractor = RegexEntityExtractor()
    entity_extractor.train(training_data)
    entity_extractor.process(message)

    entities = message.get(ENTITIES)
    assert entities == [
        {"entity": "person", "value": "Max", "start": 0, "end": 3},
        {
            "entity": "city",
            "value": "Berlin",
            "start": 13,
            "end": 19,
            "extractor": "RegexEntityExtractor",
        },
    ]
def test_regex_featurizer_train(
    create_featurizer: Callable[..., RegexFeaturizer],
    whitespace_tokenizer: WhitespaceTokenizer,
):
    patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]
    featurizer = create_featurizer()

    sentence = "hey how are you today 19.12.2019 ?"
    message = Message(data={TEXT: sentence})
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")
    whitespace_tokenizer.process_training_data(TrainingData([message]))

    training_data = TrainingData([message], regex_features=patterns)
    featurizer.train(training_data)
    featurizer.process_training_data(training_data)

    expected = np.array([0, 1, 0])
    expected_cls = np.array([1, 1, 1])

    seq_vecs, sen_vec = message.get_sparse_features(TEXT, [])
    if seq_vecs:
        seq_vecs = seq_vecs.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert (6, 3) == seq_vecs.shape
    assert (1, 3) == sen_vec.shape
    assert np.all(seq_vecs.toarray()[0] == expected)
    assert np.all(sen_vec.toarray()[-1] == expected_cls)

    seq_vecs, sen_vec = message.get_sparse_features(RESPONSE, [])
    if seq_vecs:
        seq_vecs = seq_vecs.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert (6, 3) == seq_vecs.shape
    assert (1, 3) == sen_vec.shape
    assert np.all(seq_vecs.toarray()[0] == expected)
    assert np.all(sen_vec.toarray()[-1] == expected_cls)

    seq_vecs, sen_vec = message.get_sparse_features(INTENT, [])
    if seq_vecs:
        seq_vecs = seq_vecs.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert seq_vecs is None
    assert sen_vec is None
def test_regex_featurizer_train():
    patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]
    featurizer = RegexFeaturizer.create(
        {"number_additional_patterns": 0}, RasaNLUModelConfig()
    )

    sentence = "hey how are you today 19.12.2019 ?"
    message = Message(data={TEXT: sentence})
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")

    WhitespaceTokenizer().train(TrainingData([message]))

    featurizer.train(
        TrainingData([message], regex_features=patterns), RasaNLUModelConfig()
    )

    expected = np.array([0, 1, 0])
    expected_cls = np.array([1, 1, 1])

    seq_vecs, sen_vec = message.get_sparse_features(TEXT, [])
    if seq_vecs:
        seq_vecs = seq_vecs.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert (6, 3) == seq_vecs.shape
    assert (1, 3) == sen_vec.shape
    assert np.all(seq_vecs.toarray()[0] == expected)
    assert np.all(sen_vec.toarray()[-1] == expected_cls)

    seq_vecs, sen_vec = message.get_sparse_features(RESPONSE, [])
    if seq_vecs:
        seq_vecs = seq_vecs.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert (6, 3) == seq_vecs.shape
    assert (1, 3) == sen_vec.shape
    assert np.all(seq_vecs.toarray()[0] == expected)
    assert np.all(sen_vec.toarray()[-1] == expected_cls)

    seq_vecs, sen_vec = message.get_sparse_features(INTENT, [])
    if seq_vecs:
        seq_vecs = seq_vecs.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert seq_vecs is None
    assert sen_vec is None
def test_regex_featurizer(
    sentence: Text,
    expected_sequence_features: List[float],
    expected_sentence_features: List[float],
    labeled_tokens: List[int],
    additional_vocabulary_size: int,
    spacy_nlp: Any,
):
    from rasa.nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer

    patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]
    ftr = RegexFeaturizer(
        {"number_additional_patterns": additional_vocabulary_size},
        known_patterns=patterns,
    )

    # adds tokens to the message
    tokenizer = SpacyTokenizer({})
    message = Message(data={TEXT: sentence, RESPONSE: sentence})
    message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))
    tokenizer.process(message)

    sequence_features, sentence_features = ftr._features_for_patterns(message, TEXT)
    assert np.allclose(
        sequence_features.toarray(), expected_sequence_features, atol=1e-10
    )
    assert np.allclose(
        sentence_features.toarray(), expected_sentence_features, atol=1e-10
    )

    # the tokenizer should have added tokens
    assert len(message.get(TOKENS_NAMES[TEXT], [])) > 0

    # the number of regex matches on each token should match
    for i, token in enumerate(message.get(TOKENS_NAMES[TEXT])):
        token_matches = token.get("pattern").values()
        num_matches = sum(token_matches)
        assert num_matches == labeled_tokens.count(i)