def read_from_json(self, js, **kwargs):
    """Loads training data stored in the rasa NLU data format."""
    from rasa.nlu.training_data import Message, TrainingData

    validate_rasa_nlu_data(js)

    data = js["rasa_nlu_data"]
    common_examples = data.get("common_examples", [])
    intent_examples = data.get("intent_examples", [])
    entity_examples = data.get("entity_examples", [])
    entity_synonyms = data.get("entity_synonyms", [])
    regex_features = data.get("regex_features", [])
    lookup_tables = data.get("lookup_tables", [])

    entity_synonyms = transform_entity_synonyms(entity_synonyms)

    if intent_examples or entity_examples:
        logger.warning(
            "DEPRECATION warning: your rasa data "
            "contains 'intent_examples' "
            "or 'entity_examples' which will be "
            "removed in the future. Consider "
            "putting all your examples "
            "into the 'common_examples' section."
        )

    all_examples = common_examples + intent_examples + entity_examples
    training_examples = []
    for ex in all_examples:
        msg = Message.build(ex["text"], ex.get("intent"), ex.get("entities"))
        training_examples.append(msg)

    return TrainingData(
        training_examples, entity_synonyms, regex_features, lookup_tables
    )
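# A minimal sketch of the payload read_from_json() consumes, following the
# rasa_nlu_data keys read above; the intent, text, and entity values are
# illustrative only.
example_js = {
    "rasa_nlu_data": {
        "common_examples": [
            {
                "text": "book a flight to Berlin",
                "intent": "book_flight",
                "entities": [
                    {"start": 17, "end": 23, "value": "Berlin", "entity": "city"}
                ],
            }
        ],
        "entity_synonyms": [],
        "regex_features": [],
        "lookup_tables": [],
    }
}
# training_data = reader.read_from_json(example_js)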
def test_example(self):
    example = {
        "text": "The new coronavirus doesn\u2019t affect young people.",
        "intent": "myth",
        "entities": [
            {"start": 8, "end": 19, "value": "coronavirus", "entity": "coronavirus"},
            {"start": 35, "end": 40, "value": "young", "entity": "young"},
        ],
    }
    message = Message.build(
        text=example['text'],
        intent=example['intent'],
        entities=example['entities'],
    )
    result = PreprocessingFactory(
        'en', remove_accent=False
    ).factory().preprocess(message)
    result2 = PreprocessingFactory(
        'en', remove_accent=False
    ).factory().preprocess(Message(text=example['text']))
    self.assertEqual(result.text, result2.text)
def _parse_intent(self, data: Dict[Text, Any]) -> None:
    from rasa.nlu.training_data import Message
    import rasa.nlu.training_data.entities_parser as entities_parser
    import rasa.nlu.training_data.synonyms_parser as synonyms_parser
    import rasa.nlu.constants as nlu_constants

    intent = data.get(KEY_INTENT, "")
    if not intent:
        raise_warning(
            f"Issue found while processing '{self.filename}': "
            f"The intent has an empty name. "
            f"Intents should have a name defined under the {KEY_INTENT} key. "
            f"It will be skipped.",
            docs=DOCS_URL_TRAINING_DATA_NLU,
        )
        return

    examples = data.get(KEY_INTENT_EXAMPLES, "")
    for example, entities in self._parse_training_examples(examples, intent):
        synonyms_parser.add_synonyms_from_entities(
            example, entities, self.entity_synonyms
        )
        plain_text = entities_parser.replace_entities(example)
        message = Message.build(plain_text, intent)
        if entities:
            message.set(nlu_constants.ENTITIES, entities)
        self.training_examples.append(message)
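# A minimal sketch of the parsed YAML item _parse_intent() receives, assuming
# KEY_INTENT == "intent" and KEY_INTENT_EXAMPLES == "examples" as in the Rasa
# 2.0 NLU format; the intent name and example text are made up.
parsed_intent_item = {
    "intent": "greet",
    "examples": "- hey there\n- hello [Berlin](city)\n",
}
# reader._parse_intent(parsed_intent_item)  # appends one Message per example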
def train_update(update, examples_data, label_examples_data, algorithm,
                 ner_spacy, similarity_type, language, connection):
    with PokeLogging() as pl:
        try:
            examples = []
            label_examples = []

            for example in examples_data:
                examples.append(
                    Message.build(
                        text=example.get("text"),
                        intent=example.get("intent"),
                        entities=example.get("entities"),
                    )
                )

            for label_example in label_examples_data:
                label_examples.append(
                    Message.build(
                        text=label_example.get("text"),
                        entities=label_example.get("entities"),
                    )
                )

            rasa_nlu_config = get_rasa_nlu_config_from_update(
                algorithm, ner_spacy, similarity_type, language
            )
            trainer = Trainer(rasa_nlu_config, ComponentBuilder(use_cache=False))
            training_data = BothubTrainingData(
                label_training_examples=label_examples,
                training_examples=examples,
            )

            trainer.train(training_data)

            persistor = BothubPersistor(update, connection)
            trainer.persist(
                mkdtemp(),
                persistor=persistor,
                fixed_model_name=str(update),
            )
        except Exception as e:
            logger.exception(e)
            raise e
def train_update(
    repository_version_language_id, by_user, repository_authorization, from_queue="celery"
):  # pragma: no cover
    update_request = backend().request_backend_start_training_nlu(
        repository_version_language_id, by_user, repository_authorization, from_queue
    )

    examples_list = get_examples_request(
        repository_version_language_id, repository_authorization
    )

    with PokeLogging() as pl:
        try:
            examples = []

            for example in examples_list:
                examples.append(
                    Message.build(
                        text=example.get("text"),
                        intent=example.get("intent"),
                        entities=example.get("entities"),
                    )
                )

            update_request["dataset_size"] = len(examples)

            pipeline_builder = PipelineBuilder(update_request)
            pipeline_builder.print_pipeline()
            rasa_nlu_config = pipeline_builder.get_nlu_model()

            trainer = Trainer(rasa_nlu_config, ComponentBuilder(use_cache=False))
            training_data = TrainingData(
                training_examples=examples, lookup_tables=None
            )

            trainer.train(training_data)

            persistor = BothubPersistor(
                repository_version_language_id, repository_authorization, rasa_version
            )
            trainer.persist(
                mkdtemp(),
                persistor=persistor,
                fixed_model_name=f"{update_request.get('repository_version')}_"
                f"{update_request.get('total_training_end') + 1}_"
                f"{update_request.get('language')}",
            )
        except Exception as e:
            logger.exception(e)
            backend().request_backend_trainfail_nlu(
                repository_version_language_id, repository_authorization
            )
            raise e
        finally:
            backend().request_backend_traininglog_nlu(
                repository_version_language_id, pl.getvalue(), repository_authorization
            )
def _read_intent(self, intent_js, examples_js):
    """Reads the intent and examples from respective jsons."""
    from rasa.nlu.training_data import Message, TrainingData

    intent = intent_js.get("name")
    training_examples = []
    for ex in examples_js:
        text, entities = self._join_text_chunks(ex["data"])
        training_examples.append(Message.build(text, intent, entities))

    return TrainingData(training_examples)
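# A hedged sketch of the inputs _read_intent() expects, assuming a
# Dialogflow-style export where each example's "data" is a list of text
# chunks joined by _join_text_chunks(); field names other than "name" and
# "data" are illustrative, not confirmed.
intent_js = {"name": "order_food"}
examples_js = [
    {
        "data": [
            {"text": "i want "},
            {"text": "pizza", "alias": "food", "meta": "@food"},
        ]
    }
]
# training_data = reader._read_intent(intent_js, examples_js)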
async def test_get_nlu_data(
    Faker: asynctest.MagicMock, load_data: asynctest.MagicMock
) -> None:
    faker_ = Faker()
    faker_.name.return_value = "Nikola Tesla"
    training_data = TrainingData(
        training_examples=[
            Message.build("hello", "intent_test"),
            Message.build("hello @name", "intent_test"),
            Message.build("hello"),
        ]
    )
    load_data.return_value = training_data
    importer = PlaceholderImporter()
    importer.config = {"importers": [{"name": "rasam.PlaceholderImporter"}]}
    importer._nlu_files = ["test"]
    new_training_data = await importer.get_nlu_data()
    faker_.seed_instance.assert_called_once_with(importer.DEFAULT_FAKE_DATA_COUNT)
    load_data.assert_called_once_with("test", "en")
    message: Message
    expected_messages = [
        Message.build("hello", "intent_test"),
        Message.build("hello Nikola Tesla", "intent_test"),
        Message.build("hello"),
    ]
    for message, expected in zip(new_training_data.training_examples, expected_messages):
        assert message.get("intent") == expected.get("intent")
        assert message.get("text") == expected.get("text")
async def replace_placeholders(
    self, example: Message, faker_: Faker, matches: List[Tuple[Any, ...]], count: int
) -> AsyncIterator[Message]:
    original_text = await self.rebuild_original_text(example)
    for _ in range(count):
        text = await self.replace_placeholders_in_text(example.text, faker_, matches)
        original_text = await self.replace_placeholders_in_text(
            original_text, faker_, matches
        )
        entities = MarkdownReader._find_entities_in_training_example(original_text)
        new_message = Message.build(text, example.get("intent"), entities)
        yield new_message
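# Usage sketch for the async generator above, mirroring the tests in this
# section; `importer`, `faker_`, and `matches` are assumed to be set up as
# there (matches holding the placeholder hits, e.g. for "@name").
#
#   message = Message.build("hello @name")
#   async for new_message in importer.replace_placeholders(message, faker_, matches, count=2):
#       print(new_message.text)  # e.g. "hello Nikola Tesla"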
def parse_training_example(self, example: Text) -> "Message":
    """Extract entities and synonyms, and convert to plain text."""
    from rasa.nlu.training_data import Message

    entities = self._find_entities_in_training_example(example)
    plain_text = re.sub(
        ent_regex, lambda m: m.groupdict()["entity_text"], example
    )
    self._add_synonyms(plain_text, entities)

    message = Message.build(plain_text, self.current_title)

    if len(entities) > 0:
        message.set("entities", entities)
    return message
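# Worked example of the markdown entity syntax this parser strips, using the
# standard Rasa `[entity text](entity_name)` form; the sentence is made up.
# With self.current_title == "book_flight",
#
#   parse_training_example("fly me to [Berlin](city)")
#
# returns a Message with text "fly me to Berlin" and one entity roughly like
# {"start": 10, "end": 16, "value": "Berlin", "entity": "city"}.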
def test_spacy_training_sample_alignment(spacy_nlp_component):
    from spacy.tokens import Doc

    m1 = Message.build(text="I have a feeling", intent="feeling")
    m2 = Message.build(text="", intent="feeling")
    m3 = Message.build(text="I am the last message", intent="feeling")
    td = TrainingData(training_examples=[m1, m2, m3])

    attribute_docs = spacy_nlp_component.docs_for_training_data(td)

    assert isinstance(attribute_docs["text"][0], Doc)
    assert isinstance(attribute_docs["text"][1], Doc)
    assert isinstance(attribute_docs["text"][2], Doc)

    assert [t.text for t in attribute_docs["text"][0]] == ["i", "have", "a", "feeling"]
    assert [t.text for t in attribute_docs["text"][1]] == []
    assert [t.text for t in attribute_docs["text"][2]] == [
        "i",
        "am",
        "the",
        "last",
        "message",
    ]
def parse_training_example(self, example: Text) -> "Message":
    """Extract entities and synonyms, and convert to plain text."""
    from rasa.nlu.training_data import Message
    import rasa.nlu.training_data.entities_parser as entities_parser
    import rasa.nlu.training_data.synonyms_parser as synonyms_parser

    entities = entities_parser.find_entities_in_training_example(example)
    plain_text = entities_parser.replace_entities(example)
    synonyms_parser.add_synonyms_from_entities(
        plain_text, entities, self.entity_synonyms
    )

    message = Message.build(plain_text, self.current_title)

    if len(entities) > 0:
        message.set("entities", entities)
    return message
def test_sequence_length_overflow_train(
    input_sequence_length: int, model_name: Text, should_overflow: bool
):
    component = HFTransformersNLP({"model_name": model_name}, skip_model_load=True)
    message = Message.build(text=" ".join(["hi"] * input_sequence_length))
    if should_overflow:
        with pytest.raises(RuntimeError):
            component._validate_sequence_lengths(
                [input_sequence_length], [message], "text", inference_mode=False
            )
    else:
        component._validate_sequence_lengths(
            [input_sequence_length], [message], "text", inference_mode=False
        )
async def test_replace_placeholders(
    faker_: asynctest.MagicMock,
    test: str,
    text: str,
    fake_data: List[str],
    matches: List[Tuple[str, str, int]],
    count: int,
    expected: List[str],
) -> None:
    faker_.name.side_effect = fake_data
    importer = PlaceholderImporter()
    message = Message.build(text)
    index = 0
    async for new_message in importer.replace_placeholders(message, faker_, matches, count):
        print(new_message.as_dict())
        assert new_message.text == expected[index]
        index += 1
    assert index == count
def test_lm_tokenizer_edge_cases(model_name, texts, expected_tokens, expected_indices):
    transformers_config = {"model_name": model_name}

    transformers_nlp = HFTransformersNLP(transformers_config)
    lm_tokenizer = LanguageModelTokenizer()

    for text, gt_tokens, gt_indices in zip(texts, expected_tokens, expected_indices):
        message = Message.build(text=text)
        transformers_nlp.process(message)
        tokens = lm_tokenizer.tokenize(message, TEXT)

        assert [t.text for t in tokens] == gt_tokens
        assert [t.start for t in tokens] == [i[0] for i in gt_indices]
        assert [t.end for t in tokens] == [i[1] for i in gt_indices]
def test_lm_featurizer_shape_values():
    model_name, texts, expected_shape, expected_sequence_vec, expected_cls_vec = samples[0]
    transformers_config = {"model_name": model_name}

    transformers_nlp_bert = HFTransformersNLP({"model_name": "bert"})
    transformers_nlp_gpt = HFTransformersNLP({"model_name": "gpt"})
    lm_featurizer = LanguageModelFeaturizer()

    messages = []
    for text in texts:
        messages.append(Message.build(text=text))
    td = TrainingData(messages)

    show_training_data(td)
    transformers_nlp_bert.train(td)
    show_training_data(td)
    transformers_nlp_gpt.train(td)
    show_training_data(td)
    lm_featurizer.train(td)
    show_training_data(td)
def test_lm_featurizer_shape_values():
    model_name, texts, expected_shape, expected_sequence_vec, expected_cls_vec = samples[3]
    transformers_config = {"model_name": model_name}

    transformers_nlp = HFTransformersNLP(transformers_config)
    lm_featurizer = LanguageModelFeaturizer()

    messages = []
    for text in texts:
        messages.append(Message.build(text=text))
    td = TrainingData(messages)

    show_training_data(td)
    transformers_nlp.train(td)
    show_training_data(td)
    lm_featurizer.train(td)
    show_training_data(td)

    for index in range(len(texts)):
        computed_feature_vec = messages[index].get(DENSE_FEATURE_NAMES[TEXT])
        computed_sequence_vec, computed_sentence_vec = (
            computed_feature_vec[:-1],
            computed_feature_vec[-1],
        )

        assert computed_feature_vec.shape == expected_shape[index]

        # Look at the value of first dimension for a few starting timesteps
        assert np.allclose(
            computed_sequence_vec[: len(expected_sequence_vec[index]), 0],
            expected_sequence_vec[index],
            atol=1e-5,
        )

        # Look at the first value of first five dimensions
        assert np.allclose(
            computed_sentence_vec[:5], expected_cls_vec[index], atol=1e-5
        )

        intent_vec = messages[index].get(DENSE_FEATURE_NAMES[INTENT])

        assert intent_vec is None
def test_lm_featurizer_shape_values(
    model_name, texts, expected_shape, expected_sequence_vec, expected_cls_vec
):
    transformers_config = {"model_name": model_name}

    transformers_nlp = HFTransformersNLP(transformers_config)
    lm_featurizer = LanguageModelFeaturizer()

    messages = []
    for text in texts:
        messages.append(Message.build(text=text))
    td = TrainingData(messages)

    transformers_nlp.train(td)
    lm_featurizer.train(td)

    for index in range(len(texts)):
        computed_sequence_vec, computed_sentence_vec = messages[index].get_dense_features(
            TEXT, []
        )

        assert computed_sequence_vec.shape[0] == expected_shape[index][0] - 1
        assert computed_sequence_vec.shape[1] == expected_shape[index][1]
        assert computed_sentence_vec.shape[0] == 1
        assert computed_sentence_vec.shape[1] == expected_shape[index][1]

        # Look at the value of first dimension for a few starting timesteps
        assert np.allclose(
            computed_sequence_vec[: len(expected_sequence_vec[index]), 0],
            expected_sequence_vec[index],
            atol=1e-5,
        )

        # Look at the first value of first five dimensions
        assert np.allclose(
            computed_sentence_vec[0][:5], expected_cls_vec[index], atol=1e-5
        )

        intent_sequence_vec, intent_sentence_vec = messages[index].get_dense_features(
            INTENT, []
        )

        assert intent_sequence_vec is None
        assert intent_sentence_vec is None
def test_ckip_featurizer(mock_POS_class):
    expected_pos_list = [
        ["Nd", "Nd", "VC", "Di", "Na", "Na", "VC", "Di", "Neu", "Nf"]
    ]
    mock_POS_inst = mock_POS_class.return_value
    mock_POS_inst.return_value = expected_pos_list

    msg = Message.build(text="昨天晚上吃了牛肉燴飯花了120元", intent="eat_dinner")
    msg.set(
        "tokens",
        [
            Token("昨天", 0),
            Token("晚上", 2),
            Token("吃", 4),
            Token("了", 5),
            Token("牛肉", 6),
            Token("燴飯", 8),
            Token("花", 10),
            Token("了", 11),
            Token("120", 12),
            Token("元", 15),
        ],
    )

    from rukip.featurizer import CKIPFeaturizer

    component_config = {"model_path": "./data"}
    ckip_featurizer = CKIPFeaturizer(component_config)
    ner_features = ckip_featurizer.gen_ner_features(msg)
    assert ner_features == [
        ["昨天", "Nd"], ["晚上", "Nd"], ["吃", "VC"], ["了", "Di"], ["牛肉", "Na"],
        ["燴飯", "Na"], ["花", "VC"], ["了", "Di"], ["120", "Neu"], ["元", "Nf"],
    ]

    component_config = {"model_path": "./data", "token_features": ["pos"]}
    ckip_featurizer = CKIPFeaturizer(component_config)
    ner_features = ckip_featurizer.gen_ner_features(msg)
    assert ner_features == [
        ["Nd"], ["Nd"], ["VC"], ["Di"], ["Na"],
        ["Na"], ["VC"], ["Di"], ["Neu"], ["Nf"],
    ]

    component_config = {"model_path": "./data", "token_features": ["word"]}
    ckip_featurizer = CKIPFeaturizer(component_config)
    ner_features = ckip_featurizer.gen_ner_features(msg)
    assert ner_features == [
        ["昨天"], ["晚上"], ["吃"], ["了"], ["牛肉"],
        ["燴飯"], ["花"], ["了"], ["120"], ["元"],
    ]
def test_lm_tokenizer_edge_cases(
    model_name, texts, expected_tokens, expected_indices, expected_num_token_ids
):
    transformers_config = {"model_name": model_name}

    transformers_nlp = HFTransformersNLP(transformers_config)
    lm_tokenizer = LanguageModelTokenizer()

    for text, gt_tokens, gt_indices, gt_num_indices in zip(
        texts, expected_tokens, expected_indices, expected_num_token_ids
    ):
        message = Message.build(text=text)
        transformers_nlp.process(message)
        tokens = lm_tokenizer.tokenize(message, TEXT)
        token_ids = message.get(LANGUAGE_MODEL_DOCS[TEXT])[TOKEN_IDS]

        assert [t.text for t in tokens] == gt_tokens
        assert [t.start for t in tokens] == [i[0] for i in gt_indices]
        assert [t.end for t in tokens] == [i[1] for i in gt_indices]
        assert len(token_ids) == gt_num_indices
def read_from_json(self, js: Dict[Text, Any], **_) -> "TrainingData":
    """Loads training data stored in the rasa NLU data format."""
    from rasa.nlu.training_data import Message, TrainingData
    import rasa.nlu.schemas.data_schema as schema
    import rasa.utils.validation as validation_utils

    validation_utils.validate_training_data(js, schema.rasa_nlu_data_schema())

    data = js["rasa_nlu_data"]
    common_examples = data.get("common_examples", [])
    intent_examples = data.get("intent_examples", [])
    entity_examples = data.get("entity_examples", [])
    entity_synonyms = data.get("entity_synonyms", [])
    regex_features = data.get("regex_features", [])
    lookup_tables = data.get("lookup_tables", [])
    gazette = data.get("gazette", [])

    entity_synonyms = transform_entity_synonyms(entity_synonyms)

    if intent_examples or entity_examples:
        raise_warning(
            "Your rasa data "
            "contains 'intent_examples' "
            "or 'entity_examples' which will be "
            "removed in the future. Consider "
            "putting all your examples "
            "into the 'common_examples' section.",
            FutureWarning,
            docs=DOCS_URL_TRAINING_DATA_NLU,
        )

    all_examples = common_examples + intent_examples + entity_examples
    training_examples = []
    for ex in all_examples:
        msg = Message.build(ex["text"], ex.get("intent"), ex.get("entities"))
        training_examples.append(msg)

    return TrainingData(
        training_examples, entity_synonyms, regex_features, lookup_tables, gazette
    )
def read_from_json(self, js: Dict[Text, Any], **_) -> "TrainingData":
    """Loads training data stored in the rasa NLU data format."""
    from rasa.nlu.training_data import Message, TrainingData
    import rasa.nlu.schemas.data_schema as schema
    import rasa.utils.validation as validation_utils

    validation_utils.validate_training_data(js, schema.rasa_nlu_data_schema())

    data = js["rasa_nlu_data"]
    common_examples = data.get("common_examples", [])
    entity_synonyms = data.get("entity_synonyms", [])
    regex_features = data.get("regex_features", [])
    lookup_tables = data.get("lookup_tables", [])

    entity_synonyms = transform_entity_synonyms(entity_synonyms)

    training_examples = []
    for ex in common_examples:
        msg = Message.build(**ex)
        training_examples.append(msg)

    return TrainingData(
        training_examples, entity_synonyms, regex_features, lookup_tables
    )
async def run(skill, language, asm):
    data = []
    stories = []
    intent_stories = []
    domain_data = {
        "intents": [],
        "actions": [],
        "templates": {},
        "config": {},
        "entities": [],
        "slots": {},
        "forms": [],
    }

    intents = await asm.memory.get_keys(skill + "_intents")
    for intent_id in intents:
        intent = await asm.memory.get(skill + "_intents", intent_id)
        if len(intent['slot']) == 0:
            i = {intent_id: {"use_entities": False, "triggers": 'utter_' + intent_id}}
        else:
            i = {intent_id: {"use_entities": False}}
        domain_data['intents'].append(i)
        domain_data['actions'].append('utter_' + intent_id)
        stories.append('## ' + intent_id)
        domain_data['templates']['utter_' + intent_id] = []
        if intent['responses'] is not None and 'default' in intent['responses']:
            for response in intent['responses']['default']:
                domain_data['templates']['utter_' + intent_id].append({"text": response})
        for example in intent['examples']:
            text = GenerateStories.preprocessor(example, language)
            msg = Message.build(text=text, intent=intent_id)
            data.append(msg)
        if len(intent['slot']) > 0:
            intent_story = []
            domain_data['forms'].append(intent_id + '_form')
            domain_data['slots']['requested_slot'] = {"type": "unfeaturized"}
            slot_def = []
            for slot_item in intent['slot']:
                domain_data['slots'][slot_item['name']] = {
                    "type": "unfeaturized",
                    "auto_fill": False,
                }
                domain_data['templates']['utter_ask_' + slot_item['name']] = [
                    {"text": slot_item['question']}
                ]
                domain_data['templates']['utter_error_' + slot_item['name']] = [
                    {"text": slot_item['response_error']}
                ]
                domain_data['entities'].append(slot_item['name'])
                slot_def.append({
                    "name": slot_item['name'],
                    "required": slot_item['required'],
                    "type": slot_item['type'],
                    "validation_function": slot_item['validation_function'],
                })
            for y in range(5):
                intent_story.append('* ' + intent_id)
                intent_story.append(' - ' + intent_id + '_form')
                intent_story.append(' - form{"name": "' + intent_id + '_form"}')
                intent_story.append(' - form{"name": null}')
                intent_story.append(' - utter_' + intent_id)
            intent_stories.append(intent_story)
            for item in intent_story:
                stories.append(item)

    # Assumed fix: intents_number was initialized to 0 and never updated,
    # which left this padding loop empty; derive the count from the
    # collected form stories instead.
    intents_number = len(intent_stories)
    for x in range(intents_number * 20):
        stories.append('## random_' + str(x))
        for y in range(5):
            story = random.choice(intent_stories)
            for item in story:
                stories.append(item)

    return data, domain_data, stories
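# Sketch of the story block the loop above emits for a slot-filling intent
# named "book_flight", derived from the string templates in run(); the
# five-line starred block is appended five times per intent.
#
#   ## book_flight
#   * book_flight
#    - book_flight_form
#    - form{"name": "book_flight_form"}
#    - form{"name": null}
#    - utter_book_flight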
def evaluate_update(repository_version, repository_authorization):
    evaluations = backend().request_backend_start_evaluation(
        repository_version, repository_authorization
    )
    training_examples = []

    for evaluate in evaluations:
        training_examples.append(
            Message.build(
                text=evaluate.get("text"),
                intent=evaluate.get("intent"),
                entities=evaluate.get("entities"),
            )
        )

    test_data = TrainingData(training_examples=training_examples)
    interpreter = update_interpreters.get(
        repository_version, repository_authorization, rasa_version, use_cache=False
    )

    result = {
        "intent_evaluation": None,
        "entity_evaluation": None,
        "response_selection_evaluation": None,
    }

    intent_results, response_selection_results, entity_results = get_eval_data(
        interpreter, test_data
    )

    if intent_results:
        result["intent_evaluation"] = evaluate_intents(intent_results)

    if entity_results:
        extractors = get_entity_extractors(interpreter)
        result["entity_evaluation"] = evaluate_entities(entity_results, extractors)

    intent_evaluation = result.get("intent_evaluation")
    entity_evaluation = result.get("entity_evaluation")

    merged_logs = merge_intent_entity_log(intent_evaluation, entity_evaluation)
    log = get_formatted_log(merged_logs)

    charts = plot_and_save_charts(repository_version, intent_results)
    evaluate_result = backend().request_backend_create_evaluate_results(
        {
            "repository_version": repository_version,
            "matrix_chart": charts.get("matrix_chart"),
            "confidence_chart": charts.get("confidence_chart"),
            "log": json.dumps(log),
            "intentprecision": intent_evaluation.get("precision"),
            "intentf1_score": intent_evaluation.get("f1_score"),
            "intentaccuracy": intent_evaluation.get("accuracy"),
            "entityprecision": entity_evaluation.get("precision"),
            "entityf1_score": entity_evaluation.get("f1_score"),
            "entityaccuracy": entity_evaluation.get("accuracy"),
        },
        repository_authorization,
    )

    intent_reports = intent_evaluation.get("report", {})
    entity_reports = entity_evaluation.get("report", {})

    for intent_key in intent_reports.keys():
        if intent_key and intent_key not in excluded_itens:
            intent = intent_reports.get(intent_key)

            backend().request_backend_create_evaluate_results_intent(
                {
                    "evaluate_id": evaluate_result.get("evaluate_id"),
                    "precision": intent.get("precision"),
                    "recall": intent.get("recall"),
                    "f1_score": intent.get("f1-score"),
                    "support": intent.get("support"),
                    "intent_key": intent_key,
                },
                repository_authorization,
            )

    for entity_key in entity_reports.keys():
        if entity_key and entity_key not in excluded_itens:  # pragma: no cover
            entity = entity_reports.get(entity_key)

            backend().request_backend_create_evaluate_results_score(
                {
                    "evaluate_id": evaluate_result.get("evaluate_id"),
                    "repository_version": repository_version,
                    "precision": entity.get("precision"),
                    "recall": entity.get("recall"),
                    "f1_score": entity.get("f1-score"),
                    "support": entity.get("support"),
                    "entity_key": entity_key,
                },
                repository_authorization,
            )

    return {
        "id": evaluate_result.get("evaluate_id"),
        "version": evaluate_result.get("evaluate_version"),
        "cross_validation": False,
    }
def evaluate_crossval_update(
    repository_version_language, repository_authorization, aws_bucket_authentication, language
):
    update_request = backend().request_backend_get_current_configuration(
        repository_authorization
    )
    examples_list = get_examples_request(
        repository_version_language, repository_authorization
    )

    with PokeLogging() as pl:
        try:
            examples = []

            for example in examples_list:
                examples.append(
                    Message.build(
                        text=example.get("text"),
                        intent=example.get("intent"),
                        entities=example.get("entities"),
                    )
                )

            data = TrainingData(training_examples=examples)

            pipeline_builder = PipelineBuilder(update_request)
            pipeline_builder.print_pipeline()
            rasa_nlu_config = pipeline_builder.get_nlu_model()
            trainer = Trainer(rasa_nlu_config, ComponentBuilder(use_cache=False))

            result = {
                "intent_evaluation": None,
                "entity_evaluation": None,
                "response_selection_evaluation": None,
            }

            intent_test_metrics: IntentMetrics = defaultdict(list)
            entity_test_metrics: EntityMetrics = defaultdict(lambda: defaultdict(list))
            response_selection_test_metrics: ResponseSelectionMetrics = defaultdict(list)

            intent_results: List[IntentEvaluationResult] = []
            entity_results: List[EntityEvaluationResult] = []
            response_selection_test_results: List[ResponseSelectionEvaluationResult] = []

            entity_evaluation_possible = False
            extractors: Set[Text] = set()

            language_preprocessor = PreprocessingFactory(language).factory()

            for train, test in generate_folds(3, data):
                interpreter = trainer.train(train)

                test.training_examples = [
                    language_preprocessor.preprocess(x)
                    for x in test.training_examples
                ]

                # calculate test accuracy
                combine_result(
                    intent_test_metrics,
                    entity_test_metrics,
                    response_selection_test_metrics,
                    interpreter,
                    test,
                    intent_results,
                    entity_results,
                    response_selection_test_results,
                )

                if not extractors:
                    extractors = get_entity_extractors(interpreter)

                entity_evaluation_possible = (
                    entity_evaluation_possible
                    or _contains_entity_labels(entity_results)
                )

            if intent_results:
                result["intent_evaluation"] = evaluate_intents(intent_results)

            if entity_results:
                extractors = get_entity_extractors(interpreter)
                result["entity_evaluation"] = evaluate_entities(entity_results, extractors)

            intent_evaluation = result.get("intent_evaluation")
            entity_evaluation = result.get("entity_evaluation")

            merged_logs = merge_intent_entity_log(intent_evaluation, entity_evaluation)
            log = get_formatted_log(merged_logs)

            charts = plot_and_save_charts(
                repository_version_language, intent_results, aws_bucket_authentication
            )
            evaluate_result = backend().request_backend_create_evaluate_results(
                {
                    "repository_version": repository_version_language,
                    "matrix_chart": charts.get("matrix_chart"),
                    "confidence_chart": charts.get("confidence_chart"),
                    "log": json.dumps(log),
                    "intentprecision": intent_evaluation.get("precision"),
                    "intentf1_score": intent_evaluation.get("f1_score"),
                    "intentaccuracy": intent_evaluation.get("accuracy"),
                    "entityprecision": entity_evaluation.get("precision"),
                    "entityf1_score": entity_evaluation.get("f1_score"),
                    "entityaccuracy": entity_evaluation.get("accuracy"),
                    "cross_validation": True,
                },
                repository_authorization,
            )

            intent_reports = intent_evaluation.get("report", {})
            entity_reports = entity_evaluation.get("report", {})

            for intent_key in intent_reports.keys():
                if intent_key not in excluded_itens:
                    intent = intent_reports.get(intent_key)

                    backend().request_backend_create_evaluate_results_intent(
                        {
                            "evaluate_id": evaluate_result.get("evaluate_id"),
                            "precision": intent.get("precision"),
                            "recall": intent.get("recall"),
                            "f1_score": intent.get("f1-score"),
                            "support": intent.get("support"),
                            "intent_key": intent_key,
                        },
                        repository_authorization,
                    )

            # remove group entities when entities returned as "<entity>.<group_entity>"
            # (iterate over a copied key list so the dict can be mutated safely)
            for entity_key in list(entity_reports.keys()):
                if '.' in entity_key:
                    new_entity_key = entity_key.split('.')[0]
                    entity_reports[new_entity_key] = entity_reports[entity_key]
                    entity_reports.pop(entity_key, None)

            for entity_key in entity_reports.keys():
                if entity_key not in excluded_itens:  # pragma: no cover
                    entity = entity_reports.get(entity_key)

                    backend().request_backend_create_evaluate_results_score(
                        {
                            "evaluate_id": evaluate_result.get("evaluate_id"),
                            "repository_version": repository_version_language,
                            "precision": entity.get("precision"),
                            "recall": entity.get("recall"),
                            "f1_score": entity.get("f1-score"),
                            "support": entity.get("support"),
                            "entity_key": entity_key,
                        },
                        repository_authorization,
                    )

            return {
                "id": evaluate_result.get("evaluate_id"),
                "version": evaluate_result.get("evaluate_version"),
                "cross_validation": True,
            }
        except Exception as e:
            logger.exception(e)
            raise e
def train_update(update, by, repository_authorization):
    update_request = backend().request_backend_start_training_nlu(
        update, by, repository_authorization
    )

    examples_list = get_examples_request(update, repository_authorization)
    examples_label_list = get_examples_label_request(update, repository_authorization)

    with PokeLogging() as pl:
        try:
            examples = []
            label_examples = []

            get_examples = backend().request_backend_get_entities_and_labels_nlu(
                update,
                update_request.get("language"),
                json.dumps(
                    {
                        "examples": examples_list,
                        "label_examples_query": examples_label_list,
                        "update_id": update,
                    }
                ),
                repository_authorization,
            )

            for example in get_examples.get("examples"):
                examples.append(
                    Message.build(
                        text=example.get("text"),
                        intent=example.get("intent"),
                        entities=example.get("entities"),
                    )
                )

            for label_example in get_examples.get("label_examples"):
                label_examples.append(
                    Message.build(
                        text=label_example.get("text"),
                        entities=label_example.get("entities"),
                    )
                )

            rasa_nlu_config = get_rasa_nlu_config_from_update(update_request)
            trainer = Trainer(rasa_nlu_config, ComponentBuilder(use_cache=False))
            training_data = BothubTrainingData(
                label_training_examples=label_examples,
                training_examples=examples,
            )

            trainer.train(training_data)

            persistor = BothubPersistor(update, repository_authorization)
            trainer.persist(
                mkdtemp(),
                persistor=persistor,
                fixed_model_name=str(update_request.get("update_id")),
            )
        except Exception as e:
            logger.exception(e)
            backend().request_backend_trainfail_nlu(update, repository_authorization)
            raise e
        finally:
            backend().request_backend_traininglog_nlu(
                update, pl.getvalue(), repository_authorization
            )
def test__training_preprocess(self):
    preprocessors = [
        PreprocessingFactory(remove_accent=False).factory(),
        PreprocessingFactory('pt_br', remove_accent=False).factory(),
        PreprocessingFactory('en', remove_accent=False).factory(),
        PreprocessingFactory('es', remove_accent=False).factory(),
    ]

    for preprocessor in preprocessors:
        phrase = "i'`m GOING não tô é the 'gym"
        expected_phrase = "im going não tô é the gym"
        entities = [
            {"start": 0, "end": 4, "value": "i'`m", "entity": "me"},
            {"start": 24, "end": 28, "value": "'gym", "entity": "gym"},
        ]
        expected_entities = [
            {"start": 0, "end": 2, "value": "im", "entity": "me"},
            {"start": 22, "end": 25, "value": "gym", "entity": "gym"},
        ]

        message = Message.build(
            text=phrase,
            intent='test',
            entities=entities,
        )
        self.assertEqual(preprocessor.preprocess(message).text, expected_phrase)
        self.assertEqual(
            preprocessor.preprocess(message).data.get('entities'),
            expected_entities,
        )

        message = Message.build(
            text=phrase,
            intent='test',
            entities=None,
        )
        self.assertEqual(preprocessor.preprocess(message).text, expected_phrase)
        with self.assertRaises(KeyError):
            _ = preprocessor.preprocess(message).data['entities']
async def test_rebuild_original_text(
    text: str, entities: List[Dict[str, Any]], expected: str
) -> None:
    message = Message.build(text, "test_intent", entities)
    original_text = await PlaceholderImporter.rebuild_original_text(message)
    assert expected == original_text
def evaluate_crossval_update(
    repository_version, by, repository_authorization, from_queue='celery'
):
    update_request = backend().request_backend_start_training_nlu(
        repository_version, by, repository_authorization, from_queue
    )
    examples_list = get_examples_request(repository_version, repository_authorization)

    with PokeLogging() as pl:
        try:
            examples = []

            for example in examples_list:
                examples.append(
                    Message.build(
                        text=example.get("text"),
                        intent=example.get("intent"),
                        entities=example.get("entities"),
                    )
                )

            data = TrainingData(training_examples=examples)
            rasa_nlu_config = get_rasa_nlu_config(update_request)
            trainer = Trainer(rasa_nlu_config, ComponentBuilder(use_cache=False))

            result = {
                "intent_evaluation": None,
                "entity_evaluation": None,
                "response_selection_evaluation": None,
            }

            intent_train_metrics: IntentMetrics = defaultdict(list)
            intent_test_metrics: IntentMetrics = defaultdict(list)
            entity_train_metrics: EntityMetrics = defaultdict(lambda: defaultdict(list))
            entity_test_metrics: EntityMetrics = defaultdict(lambda: defaultdict(list))
            response_selection_train_metrics: ResponseSelectionMetrics = defaultdict(list)
            response_selection_test_metrics: ResponseSelectionMetrics = defaultdict(list)

            intent_results: List[IntentEvaluationResult] = []
            entity_results: List[EntityEvaluationResult] = []
            response_selection_test_results: List[ResponseSelectionEvaluationResult] = []

            entity_evaluation_possible = False
            extractors: Set[Text] = set()

            for train, test in generate_folds(3, data):
                interpreter = trainer.train(train)

                # calculate train accuracy
                combine_result(
                    intent_train_metrics,
                    entity_train_metrics,
                    response_selection_train_metrics,
                    interpreter,
                    train,
                )

                # calculate test accuracy
                combine_result(
                    intent_test_metrics,
                    entity_test_metrics,
                    response_selection_test_metrics,
                    interpreter,
                    test,
                    intent_results,
                    entity_results,
                    response_selection_test_results,
                )

                if not extractors:
                    extractors = get_entity_extractors(interpreter)

                entity_evaluation_possible = (
                    entity_evaluation_possible
                    or _contains_entity_labels(entity_results)
                )

            if intent_results:
                result["intent_evaluation"] = evaluate_intents(intent_results)

            if entity_results:
                extractors = get_entity_extractors(interpreter)
                result["entity_evaluation"] = evaluate_entities(entity_results, extractors)

            intent_evaluation = result.get("intent_evaluation")
            entity_evaluation = result.get("entity_evaluation")

            merged_logs = merge_intent_entity_log(intent_evaluation, entity_evaluation)
            log = get_formatted_log(merged_logs)

            charts = plot_and_save_charts(repository_version, intent_results)
            evaluate_result = backend().request_backend_create_evaluate_results(
                {
                    "repository_version": repository_version,
                    "matrix_chart": charts.get("matrix_chart"),
                    "confidence_chart": charts.get("confidence_chart"),
                    "log": json.dumps(log),
                    "intentprecision": intent_evaluation.get("precision"),
                    "intentf1_score": intent_evaluation.get("f1_score"),
                    "intentaccuracy": intent_evaluation.get("accuracy"),
                    "entityprecision": entity_evaluation.get("precision"),
                    "entityf1_score": entity_evaluation.get("f1_score"),
                    "entityaccuracy": entity_evaluation.get("accuracy"),
                },
                repository_authorization,
            )

            intent_reports = intent_evaluation.get("report", {})
            entity_reports = entity_evaluation.get("report", {})

            for intent_key in intent_reports.keys():
                if intent_key and intent_key not in excluded_itens:
                    intent = intent_reports.get(intent_key)

                    backend().request_backend_create_evaluate_results_intent(
                        {
                            "evaluate_id": evaluate_result.get("evaluate_id"),
                            "precision": intent.get("precision"),
                            "recall": intent.get("recall"),
                            "f1_score": intent.get("f1-score"),
                            "support": intent.get("support"),
                            "intent_key": intent_key,
                        },
                        repository_authorization,
                    )

            for entity_key in entity_reports.keys():
                if entity_key and entity_key not in excluded_itens:  # pragma: no cover
                    entity = entity_reports.get(entity_key)

                    backend().request_backend_create_evaluate_results_score(
                        {
                            "evaluate_id": evaluate_result.get("evaluate_id"),
                            "repository_version": repository_version,
                            "precision": entity.get("precision"),
                            "recall": entity.get("recall"),
                            "f1_score": entity.get("f1-score"),
                            "support": entity.get("support"),
                            "entity_key": entity_key,
                        },
                        repository_authorization,
                    )

            return {
                "id": evaluate_result.get("evaluate_id"),
                "version": evaluate_result.get("evaluate_version"),
                "cross_validation": True,
            }
        except Exception as e:
            logger.exception(e)
            backend().request_backend_trainfail_nlu(
                repository_version, repository_authorization
            )
            raise e
        finally:
            backend().request_backend_traininglog_nlu(
                repository_version, pl.getvalue(), repository_authorization
            )