import copy
from typing import Any, Callable, Dict, Text

import pytest

# Imports assume the Rasa 3.x package layout.
from rasa.engine.graph import ExecutionContext
from rasa.engine.storage.resource import Resource
from rasa.engine.storage.storage import ModelStorage
from rasa.nlu.constants import SPACY_DOCS, TOKENS_NAMES
from rasa.nlu.extractors.crf_entity_extractor import CRFEntityExtractor
from rasa.nlu.featurizers.dense_featurizer.spacy_featurizer import SpacyFeaturizer
from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer
from rasa.nlu.utils.spacy_utils import SpacyModel, SpacyNLP
from rasa.shared.importers.rasa import RasaFileImporter
from rasa.shared.nlu.constants import ENTITIES, INTENT, RESPONSE, TEXT
from rasa.shared.nlu.training_data.message import Message
from rasa.shared.nlu.training_data.training_data import TrainingData


def test_custom_intent_symbol(text, expected_tokens, spacy_nlp):
    # Split intent labels on "+" instead of the default "_".
    component_config = {
        "intent_tokenization_flag": True,
        "intent_split_symbol": "+",
    }

    tk = SpacyTokenizer(component_config)

    message = Message.build(text=text)
    message.set(SPACY_DOCS[TEXT], spacy_nlp(text))
    message.set(INTENT, text)

    tk.process_training_data(TrainingData([message]))

    assert [t.text for t in message.get(TOKENS_NAMES[INTENT])] == expected_tokens
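
# A hedged parametrization sketch for test_custom_intent_symbol: the original
# suite supplies `text` and `expected_tokens` via pytest parametrization, so
# the sample values below are illustrative assumptions, not the original
# fixture data. With intent_split_symbol="+", underscores are left intact and
# only "+" triggers a split.
@pytest.mark.parametrize(
    "text, expected_tokens",
    [
        ("Forecast_for_LUNCH", ["Forecast_for_LUNCH"]),
        ("Forecast+for+LUNCH", ["Forecast", "for", "LUNCH"]),
    ],
)
def test_custom_intent_symbol_examples(text, expected_tokens, spacy_nlp):
    # Delegates to the test above with the illustrative inputs.
    test_custom_intent_symbol(text, expected_tokens, spacy_nlp)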
def test_train_tokenizer(text, expected_tokens, expected_indices, spacy_nlp):
    tk = SpacyTokenizer(SpacyTokenizer.get_default_config())

    # Attach spaCy docs for both the text and the response attribute so the
    # tokenizer can process each of them.
    message = Message.build(text=text)
    message.set(SPACY_DOCS[TEXT], spacy_nlp(text))
    message.set(RESPONSE, text)
    message.set(SPACY_DOCS[RESPONSE], spacy_nlp(text))

    training_data = TrainingData()
    training_data.training_examples = [message]

    tk.process_training_data(training_data)

    # Both attributes should yield the same tokens and character offsets.
    for attribute in [RESPONSE, TEXT]:
        tokens = training_data.training_examples[0].get(TOKENS_NAMES[attribute])

        assert [t.text for t in tokens] == expected_tokens
        assert [t.start for t in tokens] == [i[0] for i in expected_indices]
        assert [t.end for t in tokens] == [i[1] for i in expected_indices]
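
# A hedged parametrization sketch for test_train_tokenizer; the expected
# tokens and character offsets below are illustrative assumptions for spaCy's
# tokenization of whitespace-separated words, not the original fixture data.
@pytest.mark.parametrize(
    "text, expected_tokens, expected_indices",
    [
        (
            "Forecast for lunch",
            ["Forecast", "for", "lunch"],
            [(0, 8), (9, 12), (13, 18)],
        ),
    ],
)
def test_train_tokenizer_examples(text, expected_tokens, expected_indices, spacy_nlp):
    # Delegates to the test above with the illustrative inputs.
    test_train_tokenizer(text, expected_tokens, expected_indices, spacy_nlp)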
async def test_train_persist_with_different_configurations(
    crf_entity_extractor: Callable[[Dict[Text, Any]], CRFEntityExtractor],
    config_params: Dict[Text, Any],
    default_model_storage: ModelStorage,
    default_execution_context: ExecutionContext,
    spacy_tokenizer: SpacyTokenizer,
    spacy_featurizer: SpacyFeaturizer,
    spacy_nlp_component: SpacyNLP,
    spacy_model: SpacyModel,
):
    crf_extractor = crf_entity_extractor(config_params)

    importer = RasaFileImporter(training_data_paths=["data/examples/rasa"])
    training_data = importer.get_nlu_data()

    # Run the spaCy pipeline components first so the extractor sees tokens
    # and dense features during training.
    training_data = spacy_nlp_component.process_training_data(
        training_data, spacy_model
    )
    training_data = spacy_tokenizer.process_training_data(training_data)
    training_data = spacy_featurizer.process_training_data(training_data)

    crf_extractor.train(training_data)

    message = Message(data={TEXT: "I am looking for an italian restaurant"})

    messages = spacy_nlp_component.process([message], spacy_model)
    messages = spacy_tokenizer.process(messages)
    message = spacy_featurizer.process(messages)[0]

    # Keep an identical copy to run through the re-loaded extractor.
    message2 = copy.deepcopy(message)

    processed_message = crf_extractor.process([message])[0]

    # Load the persisted extractor and verify it behaves identically.
    loaded_extractor = CRFEntityExtractor.load(
        {**CRFEntityExtractor.get_default_config(), **config_params},
        default_model_storage,
        Resource("CRFEntityExtractor"),
        default_execution_context,
    )

    processed_message2 = loaded_extractor.process([message2])[0]

    assert processed_message2.fingerprint() == processed_message.fingerprint()

    detected_entities = processed_message2.get(ENTITIES)

    assert len(detected_entities) == 1
    assert detected_entities[0]["entity"] == "cuisine"
    assert detected_entities[0]["value"] == "italian"
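
# A hedged sketch of the `crf_entity_extractor` factory fixture assumed by the
# test above: it merges the per-test config overrides into the component's
# default config and instantiates it with a fresh Resource. The names mirror
# the test signature, but the body is an assumption, not the original fixture.
@pytest.fixture
def crf_entity_extractor(
    default_model_storage: ModelStorage,
    default_execution_context: ExecutionContext,
) -> Callable[[Dict[Text, Any]], CRFEntityExtractor]:
    def inner(config_params: Dict[Text, Any]) -> CRFEntityExtractor:
        return CRFEntityExtractor.create(
            {**CRFEntityExtractor.get_default_config(), **config_params},
            default_model_storage,
            Resource("CRFEntityExtractor"),
            default_execution_context,
        )

    return inner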