def test_spacy_training_sample_alignment(
    spacy_nlp_component: SpacyNLP, spacy_model: SpacyModel
):
    from spacy.tokens import Doc

    m1 = Message.build(text="I have a feeling", intent="feeling")
    m2 = Message.build(text="", intent="feeling")
    m3 = Message.build(text="I am the last message", intent="feeling")
    td = TrainingData(training_examples=[m1, m2, m3])

    attribute_docs = spacy_nlp_component._docs_for_training_data(spacy_model.model, td)

    assert isinstance(attribute_docs["text"][0], Doc)
    assert isinstance(attribute_docs["text"][1], Doc)
    assert isinstance(attribute_docs["text"][2], Doc)

    assert [t.text for t in attribute_docs["text"][0]] == ["i", "have", "a", "feeling"]
    assert [t.text for t in attribute_docs["text"][1]] == []
    assert [t.text for t in attribute_docs["text"][2]] == [
        "i",
        "am",
        "the",
        "last",
        "message",
    ]

def test_replacing_fallback_intent():
    expected_intent = "greet"
    expected_confidence = 0.345
    fallback_prediction = {
        INTENT: {
            INTENT_NAME_KEY: DEFAULT_NLU_FALLBACK_INTENT_NAME,
            PREDICTED_CONFIDENCE_KEY: 1,
        },
        INTENT_RANKING_KEY: [
            {
                INTENT_NAME_KEY: DEFAULT_NLU_FALLBACK_INTENT_NAME,
                PREDICTED_CONFIDENCE_KEY: 1,
            },
            {
                INTENT_NAME_KEY: expected_intent,
                PREDICTED_CONFIDENCE_KEY: expected_confidence,
            },
            {INTENT_NAME_KEY: "some", PREDICTED_CONFIDENCE_KEY: 0.1},
        ],
    }

    interpreter = ConstantInterpreter(fallback_prediction)
    training_data = TrainingData(
        [Message.build("hi", "greet"), Message.build("bye", "bye")]
    )

    intent_evaluations, _, _ = get_eval_data(interpreter, training_data)

    assert all(
        prediction.intent_prediction == expected_intent
        and prediction.confidence == expected_confidence
        for prediction in intent_evaluations
    )

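# A minimal sketch of the kind of interpreter the test above relies on
# (illustrative only; the real ConstantInterpreter is defined elsewhere in
# this module): it ignores its input and always returns the same parse data,
# which is what makes the assertions above deterministic.
class _ConstantInterpreterSketch:
    def __init__(self, parse_data: Dict[Text, Any]) -> None:
        self.parse_data = parse_data

    def parse(self, text: Text, **kwargs: Any) -> Dict[Text, Any]:
        # The input text is deliberately ignored.
        return self.parse_data
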
def test_build_tag_id_dict():
    message_1 = Message.build(
        text="Germany is part of the European Union", intent="inform"
    )
    message_1.set(
        BILOU_ENTITIES,
        ["U-location", "O", "O", "O", "O", "B-organisation", "L-organisation"],
    )

    message_2 = Message.build(text="Berlin is the capital of Germany", intent="inform")
    message_2.set(BILOU_ENTITIES, ["U-location", "O", "O", "O", "O", "U-location"])

    training_data = TrainingData([message_1, message_2])

    tag_id_dict = bilou_utils.build_tag_id_dict(training_data)

    assert tag_id_dict == {
        "O": 0,
        "B-location": 1,
        "I-location": 2,
        "L-location": 3,
        "U-location": 4,
        "B-organisation": 5,
        "I-organisation": 6,
        "L-organisation": 7,
        "U-organisation": 8,
    }

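# For reference, a sketch of how the BILOU scheme assigns the tags asserted
# above: B- begins a multi-token entity, I- continues it, L- ends it, U- marks
# a single-token ("unit") entity, and O marks tokens outside any entity. The
# helper below is illustrative only and is not part of the bilou_utils API.
def _bilou_tags_for_entity(entity: Text, number_of_tokens: int) -> List[Text]:
    """Returns the BILOU tag sequence for an entity spanning the given tokens."""
    if number_of_tokens == 1:
        return [f"U-{entity}"]
    return (
        [f"B-{entity}"]
        + [f"I-{entity}"] * (number_of_tokens - 2)
        + [f"L-{entity}"]
    )
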
def test_preprocess_selector_multiple_retrieval_intents():
    # use some available data
    training_data = rasa.shared.nlu.training_data.loading.load_data(
        "data/examples/rasa/demo-rasa.yml"
    )
    training_data_responses = rasa.shared.nlu.training_data.loading.load_data(
        "data/examples/rasa/demo-rasa-responses.yml"
    )
    training_data_extra_intent = TrainingData(
        [
            Message.build(
                text="Is it possible to detect the version?", intent="faq/q1"
            ),
            Message.build(text="How can I get a new virtual env", intent="faq/q2"),
        ]
    )
    training_data = training_data.merge(training_data_responses).merge(
        training_data_extra_intent
    )

    response_selector = ResponseSelector()
    response_selector.preprocess_train_data(training_data)

    assert sorted(response_selector.all_retrieval_intents) == ["chitchat", "faq"]

def test_apply_bilou_schema(whitespace_tokenizer: WhitespaceTokenizerGraphComponent):
    message_1 = Message.build(
        text="Germany is part of the European Union", intent="inform"
    )
    message_1.set(
        ENTITIES,
        [
            {"start": 0, "end": 7, "value": "Germany", "entity": "location"},
            {
                "start": 23,
                "end": 37,
                "value": "European Union",
                "entity": "organisation",
            },
        ],
    )

    message_2 = Message.build(text="Berlin is the capital of Germany", intent="inform")
    message_2.set(
        ENTITIES,
        [
            {"start": 0, "end": 6, "value": "Berlin", "entity": "location"},
            {"start": 25, "end": 32, "value": "Germany", "entity": "location"},
        ],
    )

    training_data = TrainingData([message_1, message_2])

    whitespace_tokenizer.process_training_data(training_data)

    bilou_utils.apply_bilou_schema(training_data)

    assert message_1.get(BILOU_ENTITIES) == [
        "U-location",
        "O",
        "O",
        "O",
        "O",
        "B-organisation",
        "L-organisation",
    ]
    assert message_2.get(BILOU_ENTITIES) == [
        "U-location",
        "O",
        "O",
        "O",
        "O",
        "U-location",
    ]

def read_collection_from_csv(file_path: Text) -> List[Message]:
    """Reads a CSV file and builds a list of Messages from its rows."""
    collection = read_from_csv(file_path)

    all_sentences = []
    for line in collection:
        if len(line) == 2:
            sentence, label = line[0], line[1]
            all_sentences.append(Message.build(text=sentence, intent=label))
        elif len(line) == 1:
            sentence = line[0]
            all_sentences.append(Message.build(text=sentence))
        else:
            raise RuntimeError("Input CSV file does not adhere to the correct format.")

    return all_sentences

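# A minimal usage sketch (the file name and contents are hypothetical): given
# a CSV in which each row is either "<sentence>,<label>" or just "<sentence>",
#
#     hello there,greet
#     goodbye,bye
#     an unlabeled sentence
#
# read_collection_from_csv("data.csv") returns three Messages: two with the
# intents "greet" and "bye", and one with no intent set.
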
def test_convert_featurizer_number_of_sub_tokens(
    create_or_load_convert_featurizer: Callable[[Dict[Text, Any]], ConveRTFeaturizer],
    text: Text,
    expected_number_of_sub_tokens: List[int],
    monkeypatch: MonkeyPatch,
    whitespace_tokenizer: WhitespaceTokenizer,
):
    monkeypatch.setattr(ConveRTFeaturizer, "_validate_model_url", lambda _: None)

    component_config = {
        FEATURIZER_CLASS_ALIAS: "alias",
        "model_url": RESTRICTED_ACCESS_URL,
    }
    featurizer = create_or_load_convert_featurizer(component_config)

    message = Message.build(text=text)
    td = TrainingData([message])
    whitespace_tokenizer.process_training_data(td)

    tokens = featurizer.tokenize(message, attribute=TEXT)

    assert [
        t.get(NUMBER_OF_SUB_TOKENS) for t in tokens
    ] == expected_number_of_sub_tokens

def test_convert_featurizer_token_edge_cases(
    create_or_load_convert_featurizer: Callable[[Dict[Text, Any]], ConveRTFeaturizer],
    text: Text,
    expected_tokens: List[Text],
    expected_indices: List[Tuple[int]],
    monkeypatch: MonkeyPatch,
    whitespace_tokenizer: WhitespaceTokenizer,
):
    monkeypatch.setattr(ConveRTFeaturizer, "_validate_model_url", lambda _: None)

    component_config = {
        FEATURIZER_CLASS_ALIAS: "alias",
        "model_url": RESTRICTED_ACCESS_URL,
    }
    featurizer = create_or_load_convert_featurizer(component_config)

    message = Message.build(text=text)
    td = TrainingData([message])
    whitespace_tokenizer.process_training_data(td)

    tokens = featurizer.tokenize(message, attribute=TEXT)

    assert [t.text for t in tokens] == expected_tokens
    assert [t.start for t in tokens] == [i[0] for i in expected_indices]
    assert [t.end for t in tokens] == [i[1] for i in expected_indices]

def test_convert_featurizer_tokens_to_text(
    create_or_load_convert_featurizer: Callable[[Dict[Text, Any]], ConveRTFeaturizer],
    sentence: Text,
    expected_text: Text,
    monkeypatch: MonkeyPatch,
    whitespace_tokenizer: WhitespaceTokenizer,
):
    monkeypatch.setattr(ConveRTFeaturizer, "_validate_model_url", lambda _: None)

    component_config = {
        FEATURIZER_CLASS_ALIAS: "alias",
        "model_url": RESTRICTED_ACCESS_URL,
    }
    featurizer = create_or_load_convert_featurizer(component_config)

    message = Message.build(text=sentence)
    td = TrainingData([message])
    whitespace_tokenizer.process_training_data(td)

    tokens = featurizer.tokenize(message, attribute=TEXT)

    actual_text = ConveRTFeaturizer._tokens_to_text([tokens])[0]

    assert expected_text == actual_text

def test_generate_message_raises_on_overlapping_but_not_identical_spans(
    message_text: Text, entities: List[Dict[Text, Any]]
):
    message = Message.build(message_text, "dummy_intent", entities=entities)

    with pytest.raises(ValueError):
        TrainingDataWriter.generate_message(message)

def read_from_json(self, js: Dict[Text, Any], **_: Any) -> "TrainingData":
    """Loads training data stored in the rasa NLU data format."""
    import rasa.shared.nlu.training_data.schemas.data_schema as schema
    import rasa.shared.utils.validation as validation_utils

    validation_utils.validate_training_data(js, schema.rasa_nlu_data_schema())

    data = js["rasa_nlu_data"]
    common_examples = data.get("common_examples", [])
    entity_synonyms = data.get("entity_synonyms", [])
    regex_features = data.get("regex_features", [])
    lookup_tables = data.get("lookup_tables", [])

    entity_synonyms = transform_entity_synonyms(entity_synonyms)

    training_examples = []
    for ex in common_examples:
        # taking care of custom entries
        msg = Message.build(
            text=ex.pop(TEXT, ""),
            intent=ex.pop(INTENT, None),
            entities=ex.pop(ENTITIES, None),
            **ex,
        )
        training_examples.append(msg)

    return TrainingData(
        training_examples, entity_synonyms, regex_features, lookup_tables
    )

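# For orientation, a minimal payload in the legacy rasa_nlu_data format that
# read_from_json accepts; any keys on an example beyond text/intent/entities
# are preserved as custom message data via **ex.
_EXAMPLE_RASA_NLU_DATA = {
    "rasa_nlu_data": {
        "common_examples": [
            {"text": "hello there", "intent": "greet", "entities": []}
        ],
        "entity_synonyms": [],
        "regex_features": [],
        "lookup_tables": [],
    }
}
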
def test_preserve_sentence_and_sequence_features_old_config():
    attribute = "text"
    message = Message.build("hi there")

    transformers_nlp = HFTransformersNLP(
        {"model_name": "bert", "model_weights": "bert-base-uncased"}
    )
    transformers_nlp.process(message)
    lm_tokenizer = LanguageModelTokenizer()
    lm_tokenizer.process(message)

    lm_featurizer = LanguageModelFeaturizer({"model_name": "gpt2"})
    lm_featurizer.process(message)

    message.set(LANGUAGE_MODEL_DOCS[attribute], None)
    lm_docs = lm_featurizer._get_docs_for_batch(
        [message], attribute=attribute, inference_mode=True
    )[0]
    hf_docs = transformers_nlp._get_docs_for_batch(
        [message], attribute=attribute, inference_mode=True
    )[0]

    assert not (message.features[0].features == lm_docs[SEQUENCE_FEATURES]).any()
    assert not (message.features[1].features == lm_docs[SENTENCE_FEATURES]).any()
    assert (message.features[0].features == hf_docs[SEQUENCE_FEATURES]).all()
    assert (message.features[1].features == hf_docs[SENTENCE_FEATURES]).all()

def test_lm_featurizer_edge_cases(
    model_name, model_weights, texts, expected_tokens, expected_indices
):
    if model_weights is None:
        model_weights_config = {}
    else:
        model_weights_config = {"model_weights": model_weights}
    transformers_config = {"model_name": model_name, **model_weights_config}

    lm_featurizer = LanguageModelFeaturizer(transformers_config)
    whitespace_tokenizer = WhitespaceTokenizer()

    for text, gt_tokens, gt_indices in zip(texts, expected_tokens, expected_indices):
        message = Message.build(text=text)
        tokens = whitespace_tokenizer.tokenize(message, TEXT)
        message.set(TOKENS_NAMES[TEXT], tokens)
        lm_featurizer.process(message)

        assert [t.text for t in tokens] == gt_tokens
        assert [t.start for t in tokens] == [i[0] for i in gt_indices]
        assert [t.end for t in tokens] == [i[1] for i in gt_indices]

def test_lm_tokenizer_edge_cases(
    model_name,
    model_weights,
    texts,
    expected_tokens,
    expected_indices,
    expected_num_token_ids,
):
    if model_weights is None:
        model_weights_config = {}
    else:
        model_weights_config = {"model_weights": model_weights}
    transformers_config = {"model_name": model_name, **model_weights_config}

    transformers_nlp = HFTransformersNLP(transformers_config)
    lm_tokenizer = LanguageModelTokenizer()

    for text, gt_tokens, gt_indices, gt_num_indices in zip(
        texts, expected_tokens, expected_indices, expected_num_token_ids
    ):
        message = Message.build(text=text)
        transformers_nlp.process(message)
        tokens = lm_tokenizer.tokenize(message, TEXT)
        token_ids = message.get(LANGUAGE_MODEL_DOCS[TEXT])[TOKEN_IDS]

        assert [t.text for t in tokens] == gt_tokens
        assert [t.start for t in tokens] == [i[0] for i in gt_indices]
        assert [t.end for t in tokens] == [i[1] for i in gt_indices]
        assert len(token_ids) == gt_num_indices

def test_train_tokenizer(
    text: Text, expected_tokens: List[Text], expected_indices: List[Tuple[int]]
):
    tk = WhitespaceTokenizer()

    message = Message.build(text=text)
    message.set(RESPONSE, text)
    message.set(INTENT, text)

    training_data = TrainingData()
    training_data.training_examples = [message]

    tk.train(training_data)

    for attribute in [RESPONSE, TEXT]:
        tokens = training_data.training_examples[0].get(TOKENS_NAMES[attribute])

        assert [t.text for t in tokens] == expected_tokens
        assert [t.start for t in tokens] == [i[0] for i in expected_indices]
        assert [t.end for t in tokens] == [i[1] for i in expected_indices]

    # check intent attribute
    tokens = training_data.training_examples[0].get(TOKENS_NAMES[INTENT])

    assert [t.text for t in tokens] == [text]

def test_convert_featurizer_process(monkeypatch: MonkeyPatch):
    tokenizer = WhitespaceTokenizer()

    monkeypatch.setattr(
        ConveRTFeaturizer,
        "_get_validated_model_url",
        lambda x: RESTRICTED_ACCESS_URL,
    )
    component_config = {"name": "ConveRTFeaturizer", "model_url": RESTRICTED_ACCESS_URL}
    featurizer = ConveRTFeaturizer(component_config)

    sentence = "Hey how are you today ?"
    message = Message.build(text=sentence)
    td = TrainingData([message])
    tokenizer.train(td)
    tokens = featurizer.tokenize(message, attribute=TEXT)

    featurizer.process(message, tf_hub_module=featurizer.module)

    expected = np.array([2.2636216, -0.26475656, -1.1358104, -0.49751878, -1.3946456])
    expected_cls = np.array(
        [1.0251294, -0.04053932, -0.7018805, -0.82054937, -0.75054353]
    )

    seq_vecs, sent_vecs = message.get_dense_features(TEXT, [])

    seq_vecs = seq_vecs.features
    sent_vecs = sent_vecs.features

    assert len(tokens) == len(seq_vecs)
    assert np.allclose(seq_vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(sent_vecs[-1][:5], expected_cls, atol=1e-5)

def test_sequence_length_overflow_train(
    input_sequence_length: int,
    model_name: Text,
    should_overflow: bool,
    create_language_model_featurizer: Callable[
        [Dict[Text, Any]], LanguageModelFeaturizer
    ],
    monkeypatch: MonkeyPatch,
):
    monkeypatch.setattr(LanguageModelFeaturizer, "_load_model_instance", lambda _: None)
    component = create_language_model_featurizer({"model_name": model_name})

    message = Message.build(text=" ".join(["hi"] * input_sequence_length))
    if should_overflow:
        with pytest.raises(RuntimeError):
            component._validate_sequence_lengths(
                [input_sequence_length], [message], "text", inference_mode=False
            )
    else:
        component._validate_sequence_lengths(
            [input_sequence_length], [message], "text", inference_mode=False
        )

def _parse_intent(self, intent_data: Dict[Text, Any]) -> None:
    """Parses an intent section of the NLU training data."""
    import rasa.shared.nlu.training_data.entities_parser as entities_parser
    import rasa.shared.nlu.training_data.synonyms_parser as synonyms_parser

    intent = intent_data.get(KEY_INTENT, "")
    if not intent:
        rasa.shared.utils.io.raise_warning(
            f"Issue found while processing '{self.filename}': "
            f"The intent has an empty name. "
            f"Intents should have a name defined under the {KEY_INTENT} key. "
            f"It will be skipped.",
            docs=DOCS_URL_TRAINING_DATA,
        )
        return

    examples = intent_data.get(KEY_INTENT_EXAMPLES, "")
    intent_metadata = intent_data.get(KEY_METADATA)
    for example, entities, metadata in self._parse_training_examples(
        examples, intent
    ):
        plain_text = entities_parser.replace_entities(example)
        synonyms_parser.add_synonyms_from_entities(
            plain_text, entities, self.entity_synonyms
        )
        self.training_examples.append(
            Message.build(plain_text, intent, entities, intent_metadata, metadata)
        )

def parse_training_example(example: Text, intent: Optional[Text] = None) -> "Message":
    """Extract entities and synonyms, and convert to plain text."""
    entities = find_entities_in_training_example(example)
    plain_text = replace_entities(example)

    return Message.build(plain_text, intent, entities)

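# A usage sketch: in Rasa's training-data markup, entities are annotated
# inline as [value](entity_name), so
#
#     parse_training_example("I live in [Berlin](city)", intent="inform")
#
# yields a Message whose text is "I live in Berlin" with a single "city"
# entity covering "Berlin".
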
def test_whitespace_does_not_throw_error():
    texts = rasa.shared.utils.io.read_json_file(
        "data/test_tokenizers/naughty_strings.json"
    )

    tk = WhitespaceTokenizer()

    for text in texts:
        tk.tokenize(Message.build(text=text), attribute=TEXT)

def test_mitie(text, expected_tokens, expected_indices):
    tk = MitieTokenizer()

    tokens = tk.tokenize(Message.build(text=text), attribute=TEXT)

    assert [t.text for t in tokens] == expected_tokens
    assert [t.start for t in tokens] == [i[0] for i in expected_indices]
    assert [t.end for t in tokens] == [i[1] for i in expected_indices]

def test_split_action_name(text: Text, expected_tokens: List[Text]):
    component_config = {"intent_tokenization_flag": True, "intent_split_symbol": "+"}
    tk = create_whitespace_tokenizer(component_config)

    message = Message.build(text=text)
    message.set(ACTION_NAME, text)

    assert [t.text for t in tk._split_name(message, ACTION_NAME)] == expected_tokens

def test_spacy_pos_tags(text, expected_pos_tags, spacy_nlp):
    tk = SpacyTokenizer(SpacyTokenizer.get_default_config())
    message = Message.build(text=text)
    message.set(SPACY_DOCS[TEXT], spacy_nlp(text))

    tokens = tk.tokenize(message, attribute=TEXT)

    assert [t.data.get("pos") for t in tokens] == expected_pos_tags

def test_generate_message(
    message_text: Text, expected_text: Text, entities: List[Dict[Text, Any]]
):
    message = Message.build(message_text, "dummy_intent", entities=entities)

    message_text = TrainingDataWriter.generate_message(message)

    assert message_text == expected_text

async def replace_placeholders(
    self,
    example: Message,
    faker_: Faker,
    matches: List[Tuple[Any, ...]],
    count: int,
) -> AsyncIterator[Message]:
    original_text = await self.rebuild_original_text(example)
    for _ in range(count):
        text = await self.replace_placeholders_in_text(
            example.data.get("text"), faker_, matches
        )
        original_text = await self.replace_placeholders_in_text(
            original_text, faker_, matches
        )
        entities = find_entities_in_training_example(original_text)
        new_message = Message.build(text, example.get("intent"), entities)
        yield new_message

def test_preprocess_selector_multiple_retrieval_intents(
    response_selector_training_data: TrainingData,
    create_response_selector: Callable[[Dict[Text, Any]], ResponseSelector],
):
    training_data_extra_intent = TrainingData(
        [
            Message.build(
                text="Is it possible to detect the version?", intent="faq/q1"
            ),
            Message.build(text="How can I get a new virtual env", intent="faq/q2"),
        ]
    )
    training_data = response_selector_training_data.merge(training_data_extra_intent)

    response_selector = create_response_selector({})
    response_selector.preprocess_train_data(training_data)

    assert sorted(response_selector.all_retrieval_intents) == ["chitchat", "faq"]

def _read_intent(
    self, intent: Dict[Text, Any], examples: List[Dict[Text, Any]]
) -> "TrainingData":
    """Reads the intent and examples from respective jsons."""
    intent = intent.get("name")

    training_examples = []
    for ex in examples:
        text, entities = self._join_text_chunks(ex["data"])
        training_examples.append(Message.build(text, intent, entities))

    return TrainingData(training_examples)

def test_split_intent_response_key(text, expected_tokens):
    component_config = {"intent_tokenization_flag": True, "intent_split_symbol": "+"}
    tk = create_whitespace_tokenizer(component_config)

    message = Message.build(text=text)
    message.set(INTENT_RESPONSE_KEY, text)

    assert [
        t.text for t in tk._split_name(message, attribute=INTENT_RESPONSE_KEY)
    ] == expected_tokens

def test_features_are_sparse(
    whitespace_tokenizer: WhitespaceTokenizer,
    semantic_map_featurizer: SemanticMapFeaturizer,
):
    message = Message.build("word1 word3")
    whitespace_tokenizer.process(message)
    semantic_map_featurizer.process(message)

    for feature in message.features:
        assert scipy.sparse.issparse(feature.features)

def test_no_features_on_no_tokens(semantic_map_featurizer: SemanticMapFeaturizer):
    """The component does not set any sparse features if tokens are not available."""
    message = Message.build("word1 word3")
    # We skip: whitespace_tokenizer.process(message)
    semantic_map_featurizer.process(message)

    seq_vecs, sen_vecs = message.get_sparse_features(TEXT, [])

    assert not seq_vecs
    assert not sen_vecs