Esempio n. 1
0
    def read_from_json(self, js, **kwargs):
        """Load training data stored in the rasa NLU data format.

        Validates ``js`` against the rasa NLU schema, converts every
        example section into ``Message`` objects, and wraps the result in
        a ``TrainingData`` instance.
        """
        from rasa.nlu.training_data import Message, TrainingData

        validate_rasa_nlu_data(js)

        nlu_data = js["rasa_nlu_data"]

        # Every section is optional; default to empty lists.
        common_examples = nlu_data.get("common_examples", [])
        intent_examples = nlu_data.get("intent_examples", [])
        entity_examples = nlu_data.get("entity_examples", [])
        regex_features = nlu_data.get("regex_features", [])
        lookup_tables = nlu_data.get("lookup_tables", [])
        entity_synonyms = transform_entity_synonyms(
            nlu_data.get("entity_synonyms", []))

        # The split example sections are deprecated in favour of
        # 'common_examples'.
        if intent_examples or entity_examples:
            logger.warning("DEPRECATION warning: your rasa data "
                           "contains 'intent_examples' "
                           "or 'entity_examples' which will be "
                           "removed in the future. Consider "
                           "putting all your examples "
                           "into the 'common_examples' section.")

        training_examples = [
            Message.build(ex["text"], ex.get("intent"), ex.get("entities"))
            for ex in common_examples + intent_examples + entity_examples
        ]

        return TrainingData(training_examples, entity_synonyms, regex_features,
                            lookup_tables)
Esempio n. 2
0
    def test_example(self):
        """Preprocessing should yield the same text whether or not the
        message carries intent/entity annotations."""
        text = "The new coronavirus doesn\u2019t affect young people."
        entities = [
            {
                "start": 8,
                "end": 19,
                "value": "coronavirus",
                "entity": "coronavirus"
            },
            {
                "start": 35,
                "end": 40,
                "value": "young",
                "entity": "young"
            }
        ]

        annotated = Message.build(
            text=text,
            intent="myth",
            entities=entities,
        )
        plain = Message(text=text)

        def run_preprocess(msg):
            # Fresh factory per call, accents preserved.
            return PreprocessingFactory(
                'en', remove_accent=False).factory().preprocess(msg)

        self.assertEqual(run_preprocess(annotated).text,
                         run_preprocess(plain).text)
Esempio n. 3
0
    def _parse_intent(self, data: Dict[Text, Any]) -> None:
        """Convert one intent section of the training file into messages.

        Unnamed intents are skipped with a warning. For each parsed
        example, entity synonyms are collected and a ``Message`` is
        appended to ``self.training_examples``.
        """
        from rasa.nlu.training_data import Message
        import rasa.nlu.training_data.entities_parser as entities_parser
        import rasa.nlu.training_data.synonyms_parser as synonyms_parser
        import rasa.nlu.constants as nlu_constants

        intent = data.get(KEY_INTENT, "")
        if not intent:
            # An intent without a name cannot be trained on; warn and bail.
            raise_warning(
                f"Issue found while processing '{self.filename}': "
                f"The intent has an empty name. "
                f"Intents should have a name defined under the {KEY_INTENT} key. "
                f"It will be skipped.",
                docs=DOCS_URL_TRAINING_DATA_NLU,
            )
            return

        examples = data.get(KEY_INTENT_EXAMPLES, "")
        for example, entities in self._parse_training_examples(examples, intent):
            synonyms_parser.add_synonyms_from_entities(
                example, entities, self.entity_synonyms
            )

            # Strip entity markup so only plain text reaches the model.
            message = Message.build(
                entities_parser.replace_entities(example), intent
            )
            if entities:
                message.set(nlu_constants.ENTITIES, entities)
            self.training_examples.append(message)
Esempio n. 4
0
def train_update(update, examples_data, label_examples_data, algorithm,
                 ner_spacy, similarity_type, language, connection):
    """Train an NLU model for ``update`` and persist it.

    Builds Rasa ``Message`` objects from the raw example payloads, trains
    a ``Trainer`` configured from the update's algorithm settings, and
    persists the resulting model through a ``BothubPersistor``.

    Raises:
        Exception: any training/persisting error is logged and re-raised.
    """
    with PokeLogging():
        try:
            # Intent examples: text + intent + entities.
            examples = [
                Message.build(
                    text=example.get("text"),
                    intent=example.get("intent"),
                    entities=example.get("entities"),
                )
                for example in examples_data
            ]
            # Label examples carry entities but no intent.
            label_examples = [
                Message.build(
                    text=label_example.get("text"),
                    entities=label_example.get("entities"),
                )
                for label_example in label_examples_data
            ]

            rasa_nlu_config = get_rasa_nlu_config_from_update(
                algorithm, ner_spacy, similarity_type, language)
            trainer = Trainer(rasa_nlu_config,
                              ComponentBuilder(use_cache=False))
            training_data = BothubTrainingData(
                label_training_examples=label_examples,
                training_examples=examples)

            trainer.train(training_data)

            persistor = BothubPersistor(update, connection)
            trainer.persist(
                mkdtemp(),
                persistor=persistor,
                fixed_model_name=str(update),
            )
        except Exception as e:
            logger.exception(e)
            # Bare raise preserves the original traceback (the previous
            # `raise e` and dead `finally: pass` were removed).
            raise
Esempio n. 5
0
def train_update(
    repository_version_language_id, by_user, repository_authorization, from_queue="celery"
):  # pragma: no cover
    """Train and persist an NLU model for one repository version/language.

    Fetches the training configuration and examples from the backend,
    trains the configured pipeline, and uploads the model. Failures are
    reported to the backend and re-raised; the captured training log is
    always shipped in the ``finally`` block.
    """
    update_request = backend().request_backend_start_training_nlu(
        repository_version_language_id, by_user, repository_authorization, from_queue
    )

    examples_list = get_examples_request(repository_version_language_id, repository_authorization)

    with PokeLogging() as pl:
        try:
            examples = [
                Message.build(
                    text=example.get("text"),
                    intent=example.get("intent"),
                    entities=example.get("entities"),
                )
                for example in examples_list
            ]

            update_request["dataset_size"] = len(examples)

            pipeline_builder = PipelineBuilder(update_request)
            pipeline_builder.print_pipeline()
            rasa_nlu_config = pipeline_builder.get_nlu_model()

            trainer = Trainer(rasa_nlu_config, ComponentBuilder(use_cache=False))
            training_data = TrainingData(
                training_examples=examples, lookup_tables=None
            )

            trainer.train(training_data)

            persistor = BothubPersistor(
                repository_version_language_id, repository_authorization, rasa_version
            )
            trainer.persist(
                mkdtemp(),
                persistor=persistor,
                fixed_model_name=f"{update_request.get('repository_version')}_"
                f"{update_request.get('total_training_end') + 1}_"
                f"{update_request.get('language')}",
            )
        except Exception as e:
            logger.exception(e)
            # Tell the backend training failed before re-raising.
            backend().request_backend_trainfail_nlu(
                repository_version_language_id, repository_authorization
            )
            raise  # bare raise keeps the original traceback
        finally:
            # Always ship the captured training log, success or failure.
            backend().request_backend_traininglog_nlu(
                repository_version_language_id, pl.getvalue(), repository_authorization
            )
    def _read_intent(self, intent_js, examples_js):
        """Build a ``TrainingData`` from an intent json and its examples."""
        from rasa.nlu.training_data import Message, TrainingData

        name = intent_js.get("name")

        messages = []
        for example in examples_js:
            # Each example's chunks are joined into text + entity offsets.
            text, entities = self._join_text_chunks(example["data"])
            messages.append(Message.build(text, name, entities))

        return TrainingData(messages)
Esempio n. 7
0
async def test_get_nlu_data(Faker: asynctest.MagicMock, load_data: asynctest.MagicMock) -> None:
    """PlaceholderImporter should substitute @name with faker-provided data."""
    faker_ = Faker()
    faker_.name.return_value = "Nikola Tesla"
    load_data.return_value = TrainingData(
        training_examples=[
            Message.build("hello", "intent_test"),
            Message.build("hello @name", "intent_test"),
            Message.build("hello"),
        ]
    )

    importer = PlaceholderImporter()
    importer.config = {"importers": [{"name": "rasam.PlaceholderImporter"}]}
    importer._nlu_files = ["test"]
    new_training_data = await importer.get_nlu_data()

    faker_.seed_instance.assert_called_once_with(importer.DEFAULT_FAKE_DATA_COUNT)
    load_data.assert_called_once_with("test", "en")

    expected_messages = [
        Message.build("hello", "intent_test"),
        Message.build("hello Nikola Tesla", "intent_test"),
        Message.build("hello"),
    ]
    pairs = zip(new_training_data.training_examples, expected_messages)
    for actual, expected in pairs:
        assert actual.get("intent") == expected.get("intent")
        assert actual.get("text") == expected.get("text")
Esempio n. 8
0
 async def replace_placeholders(self, example: Message, faker_: Faker,
                                matches: List[Tuple[Any, ...]],
                                count: int) -> AsyncIterator[Message]:
     """Yield ``count`` variants of ``example`` with placeholders replaced.

     For each variant, the placeholder matches are substituted (using
     ``faker_``) in both the plain text and the entity-annotated original
     text; entities are then re-extracted from the substituted annotated
     text and attached to a freshly built ``Message`` carrying the
     example's intent.
     """
     # Annotated (markdown-style) form of the example, entities included.
     original_text = await self.rebuild_original_text(example)
     for _ in range(count):
         # Substitute placeholders in the plain text (restarts from
         # example.text every iteration)...
         text = await self.replace_placeholders_in_text(
             example.text, faker_, matches)
         # ...and in the annotated text. NOTE(review): original_text is
         # reassigned here, so substitutions accumulate across iterations
         # while `text` always starts fresh — confirm this asymmetry is
         # intentional.
         original_text = await self.replace_placeholders_in_text(
             original_text, faker_, matches)
         # Recover entity annotations from the substituted annotated text.
         entities = MarkdownReader._find_entities_in_training_example(
             original_text)
         new_message = Message.build(text, example.get("intent"), entities)
         yield new_message
Esempio n. 9
0
    def parse_training_example(self, example: Text) -> "Message":
        """Extract entities and synonyms, and convert to plain text."""
        from rasa.nlu.training_data import Message

        entities = self._find_entities_in_training_example(example)
        # Collapse every annotated span down to just its entity text.
        plain_text = re.sub(
            ent_regex, lambda match: match.groupdict()["entity_text"], example
        )
        self._add_synonyms(plain_text, entities)

        message = Message.build(plain_text, self.current_title)
        if entities:
            message.set("entities", entities)
        return message
Esempio n. 10
0
def test_spacy_training_sample_alignment(spacy_nlp_component):
    """Docs produced for training data should align one-to-one with the
    messages, including the empty-text message."""
    from spacy.tokens import Doc

    training_data = TrainingData(training_examples=[
        Message.build(text="I have a feeling", intent="feeling"),
        Message.build(text="", intent="feeling"),
        Message.build(text="I am the last message", intent="feeling"),
    ])

    attribute_docs = spacy_nlp_component.docs_for_training_data(training_data)
    text_docs = attribute_docs["text"]

    # All three messages must yield spaCy Doc objects.
    for position in range(3):
        assert isinstance(text_docs[position], Doc)

    expected_tokens = [
        ["i", "have", "a", "feeling"],
        [],
        ["i", "am", "the", "last", "message"],
    ]
    for doc, tokens in zip(text_docs, expected_tokens):
        assert [t.text for t in doc] == tokens
Esempio n. 11
0
    def parse_training_example(self, example: Text) -> "Message":
        """Extract entities and synonyms, and convert to plain text."""
        from rasa.nlu.training_data import Message
        import rasa.nlu.training_data.entities_parser as entities_parser
        import rasa.nlu.training_data.synonyms_parser as synonyms_parser

        found_entities = entities_parser.find_entities_in_training_example(
            example)
        # Remove the entity markup before building the message.
        stripped_text = entities_parser.replace_entities(example)
        synonyms_parser.add_synonyms_from_entities(
            stripped_text, found_entities, self.entity_synonyms)

        message = Message.build(stripped_text, self.current_title)
        if found_entities:
            message.set("entities", found_entities)
        return message
Esempio n. 12
0
def test_sequence_length_overflow_train(input_sequence_length: int,
                                        model_name: Text,
                                        should_overflow: bool):
    """Sequence-length validation must raise RuntimeError exactly when the
    input exceeds the model's limit.

    Both branches of the original made the identical call; only the
    expectation differed, so the expectation is now a context manager
    (``pytest.raises`` vs ``nullcontext``) and the call appears once.
    """
    from contextlib import nullcontext

    component = HFTransformersNLP({"model_name": model_name},
                                  skip_model_load=True)
    message = Message.build(text=" ".join(["hi"] * input_sequence_length))

    expectation = (pytest.raises(RuntimeError) if should_overflow
                   else nullcontext())
    with expectation:
        component._validate_sequence_lengths([input_sequence_length],
                                             [message],
                                             "text",
                                             inference_mode=False)
Esempio n. 13
0
async def test_replace_placeholders(
    faker_: asynctest.MagicMock,
    test: str,
    text: str,
    fake_data: List[str],
    matches: List[Tuple[str, str, int]],
    count: int,
    expected: List[str],
) -> None:
    """Every generated message must match the expected text, and exactly
    ``count`` messages must be produced."""
    faker_.name.side_effect = fake_data
    importer = PlaceholderImporter()
    message = Message.build(text)

    produced = 0
    async for new_message in importer.replace_placeholders(message, faker_, matches, count):
        print(new_message.as_dict())
        assert new_message.text == expected[produced]
        produced += 1
    assert produced == count
Esempio n. 14
0
def test_lm_tokenizer_edge_cases(model_name, texts, expected_tokens,
                                 expected_indices):
    """Token text and (start, end) offsets must match the ground truth."""
    transformers_nlp = HFTransformersNLP({"model_name": model_name})
    lm_tokenizer = LanguageModelTokenizer()

    cases = zip(texts, expected_tokens, expected_indices)
    for text, gt_tokens, gt_indices in cases:
        message = Message.build(text=text)
        transformers_nlp.process(message)
        tokens = lm_tokenizer.tokenize(message, TEXT)

        assert [token.text for token in tokens] == gt_tokens
        assert [token.start for token in tokens] == [
            span[0] for span in gt_indices
        ]
        assert [token.end for token in tokens] == [
            span[1] for span in gt_indices
        ]
Esempio n. 15
0
def test_lm_featurizer_shape_values():
    """Smoke-test that BERT and GPT HFTransformersNLP components plus the
    featurizer can all train on the same data without errors.

    No assertions are made; the test passes if no exception is raised.
    """
    # NOTE(review): another function with this exact name is defined later
    # in this file and will shadow this one at collection time — confirm.
    # Only `texts` is used here; the expected values in samples[0] are
    # unused by this smoke test (the dead `transformers_config` local was
    # removed).
    model_name, texts, expected_shape, expected_sequence_vec, expected_cls_vec = samples[0]

    transformers_nlp_bert = HFTransformersNLP({"model_name": "bert"})
    transformers_nlp_gpt = HFTransformersNLP({"model_name": "gpt"})
    lm_featurizer = LanguageModelFeaturizer()

    td = TrainingData([Message.build(text=text) for text in texts])
    show_training_data(td)
    transformers_nlp_bert.train(td)
    show_training_data(td)
    transformers_nlp_gpt.train(td)
    show_training_data(td)
    lm_featurizer.train(td)
    show_training_data(td)
def test_lm_featurizer_shape_values():
    """Check featurizer output shapes and selected values for samples[3].

    Text features must match the expected shapes/values; no dense features
    may be attached to the intent attribute.
    """
    model_name, texts, expected_shape, expected_sequence_vec, expected_cls_vec = samples[3]

    transformers_nlp = HFTransformersNLP({"model_name": model_name})
    lm_featurizer = LanguageModelFeaturizer()

    messages = [Message.build(text=text) for text in texts]
    td = TrainingData(messages)
    show_training_data(td)
    transformers_nlp.train(td)
    show_training_data(td)
    lm_featurizer.train(td)
    show_training_data(td)

    for index, message in enumerate(messages):
        feature_vec = message.get(DENSE_FEATURE_NAMES[TEXT])
        # Last row is the sentence (CLS) vector; the rest is the sequence.
        sequence_vec = feature_vec[:-1]
        sentence_vec = feature_vec[-1]

        assert feature_vec.shape == expected_shape[index]

        # Value of the first dimension over the first few timesteps.
        gt_seq = expected_sequence_vec[index]
        assert np.allclose(
            sequence_vec[: len(gt_seq), 0], gt_seq, atol=1e-5
        )

        # First five dimensions of the sentence vector.
        assert np.allclose(
            sentence_vec[:5], expected_cls_vec[index], atol=1e-5
        )

        # The intent attribute must carry no dense features.
        assert message.get(DENSE_FEATURE_NAMES[INTENT]) is None
Esempio n. 17
0
def test_lm_featurizer_shape_values(model_name, texts, expected_shape,
                                    expected_sequence_vec, expected_cls_vec):
    """Verify dense feature shapes/values for TEXT and their absence for
    INTENT across the parameterized samples."""
    transformers_nlp = HFTransformersNLP({"model_name": model_name})
    lm_featurizer = LanguageModelFeaturizer()

    messages = [Message.build(text=text) for text in texts]
    td = TrainingData(messages)

    transformers_nlp.train(td)
    lm_featurizer.train(td)

    for index, message in enumerate(messages):
        seq_vec, sent_vec = message.get_dense_features(TEXT, [])

        expected_rows, expected_cols = expected_shape[index]
        # Sequence features drop the sentence row, hence rows - 1.
        assert seq_vec.shape[0] == expected_rows - 1
        assert seq_vec.shape[1] == expected_cols
        assert sent_vec.shape[0] == 1
        assert sent_vec.shape[1] == expected_cols

        # Value of the first dimension over the first few timesteps.
        gt_seq = expected_sequence_vec[index]
        assert np.allclose(seq_vec[:len(gt_seq), 0], gt_seq, atol=1e-5)

        # First five dimensions of the sentence vector.
        assert np.allclose(sent_vec[0][:5],
                           expected_cls_vec[index],
                           atol=1e-5)

        # Intent attribute must carry no dense features at all.
        intent_seq, intent_sent = message.get_dense_features(INTENT, [])
        assert intent_seq is None
        assert intent_sent is None
Esempio n. 18
0
def test_ckip_featurizer(mock_POS_class):
    """CKIPFeaturizer should emit word+POS, POS-only, or word-only NER
    features depending on the configured token_features."""
    mock_POS_inst = mock_POS_class.return_value
    mock_POS_inst.return_value = [[
        'Nd', 'Nd', 'VC', 'Di', 'Na', 'Na', 'VC', 'Di', 'Neu', 'Nf'
    ]]

    msg = Message.build(text="昨天晚上吃了牛肉燴飯花了120元", intent="eat_dinner")
    msg.set("tokens", [
        Token("昨天", 0),
        Token("晚上", 2),
        Token("吃", 4),
        Token("了", 5),
        Token("牛肉", 6),
        Token("燴飯", 8),
        Token("花", 10),
        Token("了", 11),
        Token("120", 12),
        Token("元", 15)
    ])

    from rukip.featurizer import CKIPFeaturizer

    # (component config, expected NER features) — table-driven variants.
    cases = [
        ({"model_path": "./data"},
         [['昨天', 'Nd'], ['晚上', 'Nd'], ['吃', 'VC'], ['了', 'Di'],
          ['牛肉', 'Na'], ['燴飯', 'Na'], ['花', 'VC'], ['了', 'Di'],
          ['120', 'Neu'], ['元', 'Nf']]),
        ({"model_path": "./data", "token_features": ["pos"]},
         [['Nd'], ['Nd'], ['VC'], ['Di'], ['Na'], ['Na'], ['VC'], ['Di'],
          ['Neu'], ['Nf']]),
        ({"model_path": "./data", "token_features": ["word"]},
         [['昨天'], ['晚上'], ['吃'], ['了'], ['牛肉'], ['燴飯'], ['花'],
          ['了'], ['120'], ['元']]),
    ]
    for component_config, expected in cases:
        ckip_featurizer = CKIPFeaturizer(component_config)
        assert ckip_featurizer.gen_ner_features(msg) == expected
Esempio n. 19
0
def test_lm_tokenizer_edge_cases(model_name, texts, expected_tokens,
                                 expected_indices, expected_num_token_ids):
    """Token text, offsets, and token-id count must match ground truth."""
    transformers_nlp = HFTransformersNLP({"model_name": model_name})
    lm_tokenizer = LanguageModelTokenizer()

    cases = zip(texts, expected_tokens, expected_indices,
                expected_num_token_ids)
    for text, gt_tokens, gt_indices, gt_num_indices in cases:
        message = Message.build(text=text)
        transformers_nlp.process(message)
        tokens = lm_tokenizer.tokenize(message, TEXT)
        token_ids = message.get(LANGUAGE_MODEL_DOCS[TEXT])[TOKEN_IDS]

        assert [token.text for token in tokens] == gt_tokens
        assert [token.start for token in tokens] == [
            span[0] for span in gt_indices
        ]
        assert [token.end for token in tokens] == [
            span[1] for span in gt_indices
        ]
        assert len(token_ids) == gt_num_indices
Esempio n. 20
0
    def read_from_json(self, js: Dict[Text, Any], **_) -> "TrainingData":
        """Load training data stored in the rasa NLU data format.

        Validates ``js`` against the rasa NLU schema and converts all
        example sections (plus gazette data) into a ``TrainingData``.
        """
        from rasa.nlu.training_data import Message, TrainingData
        import rasa.nlu.schemas.data_schema as schema
        import rasa.utils.validation as validation_utils

        validation_utils.validate_training_data(js,
                                                schema.rasa_nlu_data_schema())

        nlu_data = js["rasa_nlu_data"]
        common_examples = nlu_data.get("common_examples", [])
        intent_examples = nlu_data.get("intent_examples", [])
        entity_examples = nlu_data.get("entity_examples", [])
        regex_features = nlu_data.get("regex_features", [])
        lookup_tables = nlu_data.get("lookup_tables", [])
        gazette = nlu_data.get("gazette", [])

        entity_synonyms = transform_entity_synonyms(
            nlu_data.get("entity_synonyms", []))

        # 'intent_examples' / 'entity_examples' are deprecated sections.
        if intent_examples or entity_examples:
            raise_warning(
                "Your rasa data "
                "contains 'intent_examples' "
                "or 'entity_examples' which will be "
                "removed in the future. Consider "
                "putting all your examples "
                "into the 'common_examples' section.",
                FutureWarning,
                docs=DOCS_URL_TRAINING_DATA_NLU,
            )

        training_examples = [
            Message.build(ex["text"], ex.get("intent"), ex.get("entities"))
            for ex in common_examples + intent_examples + entity_examples
        ]

        return TrainingData(training_examples, entity_synonyms, regex_features,
                            lookup_tables, gazette)
Esempio n. 21
0
    def read_from_json(self, js: Dict[Text, Any], **_) -> "TrainingData":
        """Load training data stored in the rasa NLU data format.

        Only 'common_examples' are read; each example dict's keys map
        directly onto ``Message.build``'s keyword parameters.
        """
        from rasa.nlu.training_data import Message, TrainingData
        import rasa.nlu.schemas.data_schema as schema
        import rasa.utils.validation as validation_utils

        validation_utils.validate_training_data(js, schema.rasa_nlu_data_schema())

        nlu_data = js["rasa_nlu_data"]
        synonyms = transform_entity_synonyms(
            nlu_data.get("entity_synonyms", []))
        regex_features = nlu_data.get("regex_features", [])
        lookup_tables = nlu_data.get("lookup_tables", [])

        training_examples = [
            Message.build(**example)
            for example in nlu_data.get("common_examples", [])
        ]

        return TrainingData(
            training_examples, synonyms, regex_features, lookup_tables
        )
Esempio n. 22
0
    async def run(skill, language, asm):
        """Build NLU training messages, a Rasa domain dict and story lines
        from the intents stored in memory under ``skill + "_intents"``.

        Returns:
            tuple: ``(data, domain_data, stories)`` — list of ``Message``
            objects, the domain dictionary, and markdown story lines.
        """
        # NOTE(review): intents_number is never incremented anywhere in
        # this function, so the "random_" story loop at the bottom never
        # executes — confirm whether it should track len(intents).
        intents_number = 0
        data = []
        stories = []
        intent_stories = []
        domain_data = {"intents": [],
                       "actions": [],
                       "templates": {},
                       "config": {},
                       "entities": [],
                       "slots": {},
                       "forms": []}

        intents = await asm.memory.get_keys(skill + "_intents")

        for intent_id in intents:
            intent = await asm.memory.get(skill + "_intents", intent_id)

            # Slot-less intents trigger their utterance directly.
            if len(intent['slot']) == 0:
                i = {intent_id: {"use_entities": False, "triggers": 'utter_' + intent_id}}
            else:
                i = {intent_id: {"use_entities": False}}

            domain_data['intents'].append(i)
            domain_data['actions'].append('utter_' + intent_id)
            stories.append('## ' + intent_id)

            # Collect the intent's default response templates, if any.
            domain_data['templates']['utter_' + intent_id] = []
            if intent['responses'] is not None and 'default' in intent['responses']:
                for response in intent['responses']['default']:
                    domain_data['templates']['utter_' + intent_id].append({"text": response})

            # Preprocess each example sentence into an NLU Message.
            for example in intent['examples']:
                text = GenerateStories.preprocessor(example, language)
                msg = Message.build(text=text, intent=intent_id)
                data.append(msg)

            # Intents with slots get a form plus per-slot ask/error
            # templates, slot declarations, and a form-driven story.
            if len(intent['slot']) > 0:
                intent_story = []
                domain_data['forms'].append(intent_id + '_form')
                domain_data['slots']['requested_slot'] = {"type": "unfeaturized"}
                # NOTE(review): slot_def is built but never used after this
                # loop — presumably leftover; confirm before removing.
                slot_def = []
                for slot_item in intent['slot']:
                    domain_data['slots'][slot_item['name']] = {"type": "unfeaturized", "auto_fill": False}
                    domain_data['templates']['utter_ask_' + slot_item['name']] = \
                        [{"text": slot_item['question']}]
                    domain_data['templates']['utter_error_' + slot_item['name']] = \
                        [{"text": slot_item['response_error']}]
                    domain_data['entities'].append(slot_item['name'])
                    slot_def.append({"name": slot_item['name'],
                                     "required": slot_item['required'],
                                     "type": slot_item['type'],
                                     "validation_function": slot_item['validation_function']})
                # The same five-line form story is emitted five times.
                for y in range(5):
                    intent_story.append('* ' + intent_id)
                    intent_story.append('  - ' + intent_id + '_form')
                    intent_story.append('  - form{"name": "' + intent_id + '_form"}')
                    intent_story.append('  - form{"name": null}')
                    intent_story.append('  - utter_' + intent_id)

                intent_stories.append(intent_story)
                for item in intent_story:
                    stories.append(item)

        # Dead code while intents_number stays 0 (see note above).
        for x in range(intents_number * 20):
            stories.append('## random_' + str(x))
            for y in range(5):
                story = random.choice(intent_stories)
                for item in story:
                    stories.append(item)

        return data, domain_data, stories
Esempio n. 23
0
def evaluate_update(repository_version, repository_authorization):
    """Run an NLU evaluation for a repository version and report results.

    Fetches evaluation sentences from the backend, evaluates intents and
    entities with the cached interpreter, uploads charts, logs and
    per-intent / per-entity scores, and returns the created evaluate
    id/version.

    Returns:
        dict: evaluate id, version, and ``cross_validation=False``.
    """
    evaluations = backend().request_backend_start_evaluation(
        repository_version, repository_authorization)
    training_examples = []

    # Build one Message per evaluation sentence.
    for evaluate in evaluations:
        training_examples.append(
            Message.build(
                text=evaluate.get("text"),
                intent=evaluate.get("intent"),
                entities=evaluate.get("entities"),
            ))

    test_data = TrainingData(training_examples=training_examples)
    # use_cache=False: always load a fresh interpreter for evaluation.
    interpreter = update_interpreters.get(repository_version,
                                          repository_authorization,
                                          rasa_version,
                                          use_cache=False)

    result = {
        "intent_evaluation": None,
        "entity_evaluation": None,
        "response_selection_evaluation": None,
    }

    intent_results, response_selection_results, entity_results, = get_eval_data(
        interpreter, test_data)

    if intent_results:
        result["intent_evaluation"] = evaluate_intents(intent_results)

    if entity_results:
        extractors = get_entity_extractors(interpreter)
        result["entity_evaluation"] = evaluate_entities(
            entity_results, extractors)

    # NOTE(review): if there were no intent or entity results these stay
    # None, and the .get(...) calls below would raise AttributeError —
    # confirm the backend guarantees non-empty evaluation data.
    intent_evaluation = result.get("intent_evaluation")
    entity_evaluation = result.get("entity_evaluation")

    merged_logs = merge_intent_entity_log(intent_evaluation, entity_evaluation)
    log = get_formatted_log(merged_logs)

    charts = plot_and_save_charts(repository_version, intent_results)
    evaluate_result = backend().request_backend_create_evaluate_results(
        {
            "repository_version": repository_version,
            "matrix_chart": charts.get("matrix_chart"),
            "confidence_chart": charts.get("confidence_chart"),
            "log": json.dumps(log),
            "intentprecision": intent_evaluation.get("precision"),
            "intentf1_score": intent_evaluation.get("f1_score"),
            "intentaccuracy": intent_evaluation.get("accuracy"),
            "entityprecision": entity_evaluation.get("precision"),
            "entityf1_score": entity_evaluation.get("f1_score"),
            "entityaccuracy": entity_evaluation.get("accuracy"),
        },
        repository_authorization,
    )

    intent_reports = intent_evaluation.get("report", {})
    entity_reports = entity_evaluation.get("report", {})

    # Upload one score row per intent, skipping aggregate/excluded keys.
    for intent_key in intent_reports.keys():
        if intent_key and intent_key not in excluded_itens:
            intent = intent_reports.get(intent_key)

            backend().request_backend_create_evaluate_results_intent(
                {
                    "evaluate_id": evaluate_result.get("evaluate_id"),
                    "precision": intent.get("precision"),
                    "recall": intent.get("recall"),
                    "f1_score": intent.get("f1-score"),
                    "support": intent.get("support"),
                    "intent_key": intent_key,
                },
                repository_authorization,
            )

    # Same for each entity's scores.
    for entity_key in entity_reports.keys():
        if entity_key and entity_key not in excluded_itens:  # pragma: no cover
            entity = entity_reports.get(entity_key)

            backend().request_backend_create_evaluate_results_score(
                {
                    "evaluate_id": evaluate_result.get("evaluate_id"),
                    "repository_version": repository_version,
                    "precision": entity.get("precision"),
                    "recall": entity.get("recall"),
                    "f1_score": entity.get("f1-score"),
                    "support": entity.get("support"),
                    "entity_key": entity_key,
                },
                repository_authorization,
            )

    return {
        "id": evaluate_result.get("evaluate_id"),
        "version": evaluate_result.get("evaluate_version"),
        "cross_validation": False
    }
Esempio n. 24
0
def evaluate_crossval_update(repository_version_language,
                             repository_authorization,
                             aws_bucket_authentication, language):
    """Run a 3-fold cross-validation evaluation for a repository version.

    Trains one model per fold, accumulates intent/entity metrics on the
    held-out fold, uploads the result charts, and persists the aggregated
    scores plus the per-intent and per-entity reports via the backend API.

    Args:
        repository_version_language: backend id of the version/language pair.
        repository_authorization: auth token forwarded to every backend call.
        aws_bucket_authentication: credentials used when uploading charts.
        language: language code used to pick the text preprocessor.

    Returns:
        dict with the evaluate ``id``/``version`` and ``cross_validation=True``.

    Raises:
        Re-raises any exception after logging it.
    """
    update_request = backend().request_backend_get_current_configuration(
        repository_authorization)
    examples_list = get_examples_request(repository_version_language,
                                         repository_authorization)

    with PokeLogging() as pl:
        try:
            examples = []

            for example in examples_list:
                examples.append(
                    Message.build(
                        text=example.get("text"),
                        intent=example.get("intent"),
                        entities=example.get("entities"),
                    ))

            data = TrainingData(training_examples=examples)
            pipeline_builder = PipelineBuilder(update_request)
            pipeline_builder.print_pipeline()
            rasa_nlu_config = pipeline_builder.get_nlu_model()
            trainer = Trainer(rasa_nlu_config,
                              ComponentBuilder(use_cache=False))

            result = {
                "intent_evaluation": None,
                "entity_evaluation": None,
                "response_selection_evaluation": None,
            }

            intent_test_metrics: IntentMetrics = defaultdict(list)
            entity_test_metrics: EntityMetrics = defaultdict(
                lambda: defaultdict(list))
            response_selection_test_metrics: ResponseSelectionMetrics = defaultdict(
                list)

            intent_results: List[IntentEvaluationResult] = []
            entity_results: List[EntityEvaluationResult] = []
            response_selection_test_results: List[
                ResponseSelectionEvaluationResult] = ([])
            entity_evaluation_possible = False
            extractors: Set[Text] = set()

            language_preprocessor = PreprocessingFactory(language).factory()

            for train, test in generate_folds(3, data):

                interpreter = trainer.train(train)

                # Preprocess only the held-out fold so the evaluation input
                # matches what the deployed model would receive at runtime.
                test.training_examples = [
                    language_preprocessor.preprocess(x)
                    for x in test.training_examples
                ]

                # calculate test accuracy
                combine_result(
                    intent_test_metrics,
                    entity_test_metrics,
                    response_selection_test_metrics,
                    interpreter,
                    test,
                    intent_results,
                    entity_results,
                    response_selection_test_results,
                )

                if not extractors:
                    extractors = get_entity_extractors(interpreter)
                    entity_evaluation_possible = (
                        entity_evaluation_possible
                        or _contains_entity_labels(entity_results))

            if intent_results:
                result["intent_evaluation"] = evaluate_intents(intent_results)

            if entity_results:
                extractors = get_entity_extractors(interpreter)
                result["entity_evaluation"] = evaluate_entities(
                    entity_results, extractors)

            intent_evaluation = result.get("intent_evaluation")
            entity_evaluation = result.get("entity_evaluation")

            merged_logs = merge_intent_entity_log(intent_evaluation,
                                                  entity_evaluation)
            log = get_formatted_log(merged_logs)

            charts = plot_and_save_charts(repository_version_language,
                                          intent_results,
                                          aws_bucket_authentication)
            # Either evaluation may be None when no results were collected;
            # fall back to an empty dict so the .get() lookups below cannot
            # raise AttributeError.
            evaluate_result = backend(
            ).request_backend_create_evaluate_results(
                {
                    "repository_version": repository_version_language,
                    "matrix_chart": charts.get("matrix_chart"),
                    "confidence_chart": charts.get("confidence_chart"),
                    "log": json.dumps(log),
                    "intentprecision": (intent_evaluation or {}).get("precision"),
                    "intentf1_score": (intent_evaluation or {}).get("f1_score"),
                    "intentaccuracy": (intent_evaluation or {}).get("accuracy"),
                    "entityprecision": (entity_evaluation or {}).get("precision"),
                    "entityf1_score": (entity_evaluation or {}).get("f1_score"),
                    "entityaccuracy": (entity_evaluation or {}).get("accuracy"),
                    "cross_validation": True
                },
                repository_authorization,
            )

            intent_reports = (intent_evaluation or {}).get("report", {})
            entity_reports = (entity_evaluation or {}).get("report", {})

            for intent_key in intent_reports.keys():
                if intent_key not in excluded_itens:
                    intent = intent_reports.get(intent_key)

                    backend().request_backend_create_evaluate_results_intent(
                        {
                            "evaluate_id": evaluate_result.get("evaluate_id"),
                            "precision": intent.get("precision"),
                            "recall": intent.get("recall"),
                            "f1_score": intent.get("f1-score"),
                            "support": intent.get("support"),
                            "intent_key": intent_key,
                        },
                        repository_authorization,
                    )

            # Collapse grouped entities reported as "<entity>.<group_entity>"
            # into the base entity name. Iterate over a snapshot of the keys:
            # popping from the dict while iterating its live .keys() view
            # raises "RuntimeError: dictionary changed size during iteration".
            for entity_key in list(entity_reports.keys()):
                if '.' in entity_key:
                    new_entity_key = entity_key.split('.')[0]
                    entity_reports[new_entity_key] = entity_reports.pop(
                        entity_key)

            for entity_key in entity_reports.keys():
                if entity_key not in excluded_itens:  # pragma: no cover
                    entity = entity_reports.get(entity_key)

                    backend().request_backend_create_evaluate_results_score(
                        {
                            "evaluate_id": evaluate_result.get("evaluate_id"),
                            "repository_version": repository_version_language,
                            "precision": entity.get("precision"),
                            "recall": entity.get("recall"),
                            "f1_score": entity.get("f1-score"),
                            "support": entity.get("support"),
                            "entity_key": entity_key,
                        },
                        repository_authorization,
                    )

            return {
                "id": evaluate_result.get("evaluate_id"),
                "version": evaluate_result.get("evaluate_version"),
                "cross_validation": True,
            }

        except Exception as e:
            logger.exception(e)
            raise e
Esempio n. 25
0
def train_update(update, by, repository_authorization):
    """Train an NLU model for the given update and persist it.

    Fetches examples and label examples from the backend, builds the
    training data, trains with the configured pipeline and stores the
    resulting model through a BothubPersistor. On failure the backend is
    notified; the training log is always uploaded.
    """
    update_request = backend().request_backend_start_training_nlu(
        update, by, repository_authorization)

    examples_list = get_examples_request(update, repository_authorization)
    examples_label_list = get_examples_label_request(update,
                                                     repository_authorization)

    with PokeLogging() as pl:
        try:
            # Resolve entities/labels for every example in a single call.
            request_payload = json.dumps({
                "examples": examples_list,
                "label_examples_query": examples_label_list,
                "update_id": update,
            })
            get_examples = backend(
            ).request_backend_get_entities_and_labels_nlu(
                update,
                update_request.get("language"),
                request_payload,
                repository_authorization,
            )

            examples = [
                Message.build(
                    text=item.get("text"),
                    intent=item.get("intent"),
                    entities=item.get("entities"),
                ) for item in get_examples.get("examples")
            ]
            label_examples = [
                Message.build(
                    text=item.get("text"),
                    entities=item.get("entities"),
                ) for item in get_examples.get("label_examples")
            ]

            training_data = BothubTrainingData(
                label_training_examples=label_examples,
                training_examples=examples)
            rasa_nlu_config = get_rasa_nlu_config_from_update(update_request)
            trainer = Trainer(rasa_nlu_config,
                              ComponentBuilder(use_cache=False))

            trainer.train(training_data)

            # Persist the trained model under the update id.
            trainer.persist(
                mkdtemp(),
                persistor=BothubPersistor(update, repository_authorization),
                fixed_model_name=str(update_request.get("update_id")),
            )
        except Exception as e:
            logger.exception(e)
            backend().request_backend_trainfail_nlu(update,
                                                    repository_authorization)
            raise e
        finally:
            # Upload the captured training log regardless of the outcome.
            backend().request_backend_traininglog_nlu(
                update, pl.getvalue(), repository_authorization)
Esempio n. 26
0
    def test__training_preprocess(self):
        """Preprocessing lowercases and strips quote characters, and the
        entity offsets/values are shifted to match the cleaned text.

        Checked for the default factory and for pt_br, en and es.
        """
        factories = [PreprocessingFactory(remove_accent=False)]
        factories += [
            PreprocessingFactory(lang, remove_accent=False)
            for lang in ('pt_br', 'en', 'es')
        ]

        raw_text = "i'`m GOING não tô é the 'gym"
        cleaned_text = "im going não tô é the gym"
        raw_entities = [
            {"start": 0, "end": 4, "value": "i'`m", "entity": "me"},
            {"start": 24, "end": 28, "value": "'gym", "entity": "gym"},
        ]
        cleaned_entities = [
            {"start": 0, "end": 2, "value": "im", "entity": "me"},
            {"start": 22, "end": 25, "value": "gym", "entity": "gym"},
        ]

        for factory in factories:
            preprocessor = factory.factory()

            message = Message.build(
                text=raw_text,
                intent='test',
                entities=raw_entities,
            )
            self.assertEqual(
                preprocessor.preprocess(message).text,
                cleaned_text
            )
            self.assertEqual(
                preprocessor.preprocess(message).data.get('entities'),
                cleaned_entities
            )

            # Without entities only the text is rewritten; asking for the
            # 'entities' key must fail.
            message = Message.build(
                text=raw_text,
                intent='test',
                entities=None,
            )
            self.assertEqual(
                preprocessor.preprocess(message).text,
                cleaned_text
            )
            with self.assertRaises(KeyError):
                _ = preprocessor.preprocess(message).data['entities']
Esempio n. 27
0
async def test_rebuild_original_text(text: str, entities: List[Dict[str, Any]], expected: str) -> None:
    """Rebuilding the original text of a message must yield *expected*."""
    msg = Message.build(text, "test_intent", entities)
    rebuilt = await PlaceholderImporter.rebuild_original_text(msg)
    assert rebuilt == expected
Esempio n. 28
0
def evaluate_crossval_update(repository_version,
                             by,
                             repository_authorization,
                             from_queue='celery'):
    """Run a 3-fold cross-validation evaluation for a repository version.

    Trains one model per fold, accumulates train and test metrics,
    uploads the result charts, and persists the aggregated scores plus
    the per-intent and per-entity reports via the backend API.

    Args:
        repository_version: backend id of the repository version.
        by: id of the user that triggered the evaluation.
        repository_authorization: auth token forwarded to backend calls.
        from_queue: queue name reported to the backend (default 'celery').

    Returns:
        dict with the evaluate ``id``/``version`` and ``cross_validation=True``.

    Raises:
        Re-raises any exception after logging it and notifying the backend.
    """
    update_request = backend().request_backend_start_training_nlu(
        repository_version, by, repository_authorization, from_queue)
    examples_list = get_examples_request(repository_version,
                                         repository_authorization)

    with PokeLogging() as pl:
        try:
            examples = []

            for example in examples_list:
                examples.append(
                    Message.build(
                        text=example.get("text"),
                        intent=example.get("intent"),
                        entities=example.get("entities"),
                    ))

            data = TrainingData(training_examples=examples)
            rasa_nlu_config = get_rasa_nlu_config(update_request)
            trainer = Trainer(rasa_nlu_config,
                              ComponentBuilder(use_cache=False))

            result = {
                "intent_evaluation": None,
                "entity_evaluation": None,
                "response_selection_evaluation": None,
            }

            intent_train_metrics: IntentMetrics = defaultdict(list)
            intent_test_metrics: IntentMetrics = defaultdict(list)
            entity_train_metrics: EntityMetrics = defaultdict(
                lambda: defaultdict(list))
            entity_test_metrics: EntityMetrics = defaultdict(
                lambda: defaultdict(list))
            response_selection_train_metrics: ResponseSelectionMetrics = defaultdict(
                list)
            response_selection_test_metrics: ResponseSelectionMetrics = defaultdict(
                list)

            intent_results: List[IntentEvaluationResult] = []
            entity_results: List[EntityEvaluationResult] = []
            response_selection_test_results: List[
                ResponseSelectionEvaluationResult] = ([])
            entity_evaluation_possible = False
            extractors: Set[Text] = set()

            for train, test in generate_folds(3, data):
                interpreter = trainer.train(train)

                # calculate train accuracy
                combine_result(
                    intent_train_metrics,
                    entity_train_metrics,
                    response_selection_train_metrics,
                    interpreter,
                    train,
                )
                # calculate test accuracy
                combine_result(
                    intent_test_metrics,
                    entity_test_metrics,
                    response_selection_test_metrics,
                    interpreter,
                    test,
                    intent_results,
                    entity_results,
                    response_selection_test_results,
                )

                if not extractors:
                    extractors = get_entity_extractors(interpreter)
                    entity_evaluation_possible = (
                        entity_evaluation_possible
                        or _contains_entity_labels(entity_results))

            if intent_results:
                result["intent_evaluation"] = evaluate_intents(intent_results)

            if entity_results:
                extractors = get_entity_extractors(interpreter)
                result["entity_evaluation"] = evaluate_entities(
                    entity_results, extractors)

            intent_evaluation = result.get("intent_evaluation")
            entity_evaluation = result.get("entity_evaluation")

            merged_logs = merge_intent_entity_log(intent_evaluation,
                                                  entity_evaluation)
            log = get_formatted_log(merged_logs)

            charts = plot_and_save_charts(repository_version, intent_results)
            # Either evaluation may be None when no results were collected;
            # fall back to an empty dict so the .get() lookups below cannot
            # raise AttributeError.
            evaluate_result = backend(
            ).request_backend_create_evaluate_results(
                {
                    "repository_version": repository_version,
                    "matrix_chart": charts.get("matrix_chart"),
                    "confidence_chart": charts.get("confidence_chart"),
                    "log": json.dumps(log),
                    "intentprecision": (intent_evaluation or {}).get("precision"),
                    "intentf1_score": (intent_evaluation or {}).get("f1_score"),
                    "intentaccuracy": (intent_evaluation or {}).get("accuracy"),
                    "entityprecision": (entity_evaluation or {}).get("precision"),
                    "entityf1_score": (entity_evaluation or {}).get("f1_score"),
                    "entityaccuracy": (entity_evaluation or {}).get("accuracy"),
                },
                repository_authorization,
            )

            intent_reports = (intent_evaluation or {}).get("report", {})
            entity_reports = (entity_evaluation or {}).get("report", {})

            for intent_key in intent_reports.keys():
                if intent_key and intent_key not in excluded_itens:
                    intent = intent_reports.get(intent_key)

                    backend().request_backend_create_evaluate_results_intent(
                        {
                            "evaluate_id": evaluate_result.get("evaluate_id"),
                            "precision": intent.get("precision"),
                            "recall": intent.get("recall"),
                            "f1_score": intent.get("f1-score"),
                            "support": intent.get("support"),
                            "intent_key": intent_key,
                        },
                        repository_authorization,
                    )

            for entity_key in entity_reports.keys():
                if entity_key and entity_key not in excluded_itens:  # pragma: no cover
                    entity = entity_reports.get(entity_key)

                    backend().request_backend_create_evaluate_results_score(
                        {
                            "evaluate_id": evaluate_result.get("evaluate_id"),
                            "repository_version": repository_version,
                            "precision": entity.get("precision"),
                            "recall": entity.get("recall"),
                            "f1_score": entity.get("f1-score"),
                            "support": entity.get("support"),
                            "entity_key": entity_key,
                        },
                        repository_authorization,
                    )

            return {
                "id": evaluate_result.get("evaluate_id"),
                "version": evaluate_result.get("evaluate_version"),
                "cross_validation": True
            }

        except Exception as e:
            logger.exception(e)
            backend().request_backend_trainfail_nlu(repository_version,
                                                    repository_authorization)
            raise e
        finally:
            # Upload the captured evaluation log regardless of the outcome.
            backend().request_backend_traininglog_nlu(
                repository_version, pl.getvalue(), repository_authorization)