Code example #1
def test_nlu_warn_if_lookup_table_and_crf_extractor_pattern_feature_mismatch(
        nodes: List[SchemaNode], warns: bool):
    training_data = TrainingData(
        training_examples=[Message({TEXT: "hi"}),
                           Message({TEXT: "hi hi"})],
        lookup_tables=[{
            "elements": "this-is-no-file-and-that-does-not-matter"
        }],
    )
    assert training_data.lookup_tables is not None
    importer = DummyImporter(training_data=training_data)

    graph_schema = GraphSchema(
        {f"{idx}": node
         for idx, node in enumerate(nodes)})
    validator = DefaultV1RecipeValidator(graph_schema)

    if warns:
        match = (
            f"You have defined training data consisting of lookup tables, "
            f"but your NLU configuration's "
            f"'{CRFEntityExtractor.__name__}' does not include the "
            f"'{CRFEntityExtractorOptions.PATTERN}' feature")

        with pytest.warns(UserWarning, match=match):
            validator.validate(importer)
    else:
        with pytest.warns(None) as records:
            validator.validate(importer)
            assert len(records) == 0
Code example #2
def test_nlu_warn_if_training_examples_with_entity_roles_are_unused(
        component_type: Type[GraphComponent], role_instead_of_group: bool,
        warns: bool):
    messages = [
        Message({
            ENTITIES: [{
                ENTITY_ATTRIBUTE_TYPE: "dummy",
                (ENTITY_ATTRIBUTE_ROLE if role_instead_of_group
                 else ENTITY_ATTRIBUTE_GROUP): "dummy-2",
            }],
            TEXT: f"hi{i}",
            INTENT: "dummy",
        }) for i in range(2)
    ]
    training_data = TrainingData(training_examples=messages)
    warnings = ([
        "You have defined training data with entities that have roles/groups, "
        "but your NLU configuration"
    ] if warns else [])
    component_types = [WhitespaceTokenizer]
    if component_type:
        component_types.append(component_type)
    _test_validation_warnings_with_default_configs(
        training_data=training_data,
        component_types=component_types,
        warnings=warnings)
Code example #3
def test_nlu_warn_if_lookup_table_is_not_used(
    featurizer: Type[GraphComponent],
    consumer: Type[GraphComponent],
    warns_featurizer: bool,
    warns_consumer: bool,
):
    training_data = TrainingData(
        training_examples=[Message({TEXT: "hi"}),
                           Message({TEXT: "hi hi"})],
        lookup_tables=[{
            "elements": "this-is-no-file-and-that-does-not-matter"
        }],
    )
    assert training_data.lookup_tables is not None
    component_types = [WhitespaceTokenizer, featurizer, consumer]
    component_types = [type for type in component_types if type is not None]

    expected_warnings = []
    if warns_featurizer:
        warning = (
            "You have defined training data consisting of lookup tables, "
            "your NLU configuration does not include a featurizer using the "
            "lookup table.")
        expected_warnings.append(warning)
    if warns_consumer:
        warning = (
            "You have defined training data consisting of lookup tables, but "
            "your NLU configuration does not include any components "
            "that uses the features created from the lookup table. ")
        expected_warnings.append(warning)
    _test_validation_warnings_with_default_configs(
        training_data=training_data,
        component_types=component_types,
        warnings=expected_warnings,
    )
Code example #4
def test_nlu_warn_if_training_examples_with_intent_response_key_are_unused(
        component_type: Type[GraphComponent], warns: bool):
    messages = [
        Message({
            INTENT: "faq",
            INTENT_RESPONSE_KEY: "faq/dummy",
            TEXT: "hi",
            RESPONSE: "utter_greet",
        }),
        Message({
            INTENT: "faq",
            INTENT_RESPONSE_KEY: "faq/dummy",
            TEXT: "hi hi",
            RESPONSE: "utter_greet",
        }),
    ]
    training_data = TrainingData(training_examples=messages)
    warnings = (([
        "You have defined training data with examples "
        "for training a response selector, "
        "but your NLU configuration"
    ]) if warns else None)
    component_types = [WhitespaceTokenizer]
    if component_type:
        component_types.append(component_type)
    _test_validation_warnings_with_default_configs(
        training_data=training_data,
        component_types=component_types,
        warnings=warnings)
Code example #5
def test_nlu_warn_if_training_examples_with_entities_are_unused(
        component_type: Type[GraphComponent], warns: bool):
    messages = [
        Message({
            ENTITIES: [{
                ENTITY_ATTRIBUTE_TYPE: "dummy"
            }],
            INTENT: "dummy",
            TEXT: "hi"
        }),
        Message({
            ENTITIES: [{
                ENTITY_ATTRIBUTE_TYPE: "dummy"
            }],
            INTENT: "dummy",
            TEXT: "hi hi",
        }),
    ]
    training_data = TrainingData(training_examples=messages)
    warnings = (([
        "You have defined training data consisting of entity examples, "
        "but your NLU configuration"
    ]) if warns else None)
    component_types = [WhitespaceTokenizer]
    if component_type:
        component_types.append(component_type)
    _test_validation_warnings_with_default_configs(
        training_data=training_data,
        component_types=component_types,
        warnings=warnings)
Code example #6
File: importer.py Project: ugursistas/rasa
def _additional_training_data_from_default_actions() -> TrainingData:
    additional_messages_from_default_actions = [
        Message(data={ACTION_NAME: action_name})
        for action_name in rasa.shared.core.constants.DEFAULT_ACTION_NAMES
    ]

    return TrainingData(additional_messages_from_default_actions)
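
A hedged sketch of what the helper above produces, reusing the ACTION_NAME and DEFAULT_ACTION_NAMES constants imported in the snippet; each resulting message carries only an action name such as "action_listen":

# Illustrative check only; not part of the original module.
td = _additional_training_data_from_default_actions()
assert len(td.training_examples) == len(
    rasa.shared.core.constants.DEFAULT_ACTION_NAMES)
assert all(m.get(ACTION_NAME) for m in td.training_examples)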
Code example #7
File: rasa.py Project: attgua/Geco
    def read_from_json(self, js: Dict[Text, Any], **_) -> "TrainingData":
        """Loads training data stored in the rasa NLU data format."""
        import rasa.shared.nlu.training_data.schemas.data_schema as schema
        import rasa.shared.utils.validation as validation_utils

        validation_utils.validate_training_data(js, schema.rasa_nlu_data_schema())

        data = js["rasa_nlu_data"]
        common_examples = data.get("common_examples", [])
        entity_synonyms = data.get("entity_synonyms", [])
        regex_features = data.get("regex_features", [])
        lookup_tables = data.get("lookup_tables", [])

        entity_synonyms = transform_entity_synonyms(entity_synonyms)

        training_examples = []
        for ex in common_examples:
            # taking care of custom entries
            msg = Message.build(
                text=ex.pop(TEXT, ""),
                intent=ex.pop(INTENT, None),
                entities=ex.pop(ENTITIES, None),
                **ex,
            )
            training_examples.append(msg)

        return TrainingData(
            training_examples, entity_synonyms, regex_features, lookup_tables
        )
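
For orientation, a minimal sketch of the "rasa_nlu_data" payload the reader above consumes; only the keys the code actually reads are shown, and the authoritative schema is rasa_nlu_data_schema():

# Hypothetical input dict, mirroring the keys accessed in read_from_json().
example_js = {
    "rasa_nlu_data": {
        "common_examples": [
            {"text": "hi", "intent": "greet", "entities": []}
        ],
        "entity_synonyms": [],
        "regex_features": [],
        "lookup_tables": [],
    }
}
# reader.read_from_json(example_js) would yield a TrainingData with one example.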
Code example #8
File: importer.py Project: attgua/Geco
    async def get_nlu_data(self, language: Optional[Text] = "en") -> TrainingData:
        nlu_data = [importer.get_nlu_data(language) for importer in self._importers]
        nlu_data = await asyncio.gather(*nlu_data)

        return reduce(
            lambda merged, other: merged.merge(other), nlu_data, TrainingData()
        )
Code example #9
def test_count_vector_featurizer_action_attribute_featurization(
    sentence: Text,
    action_name: Text,
    action_text: Text,
    action_name_features: np.ndarray,
    response_features: np.ndarray,
):
    ftr = CountVectorsFeaturizer({
        "token_pattern": r"(?u)\b\w+\b",
        "additional_vocabulary_size": {
            "text": 0,
            "response": 0,
            "action_text": 0
        },
    })
    tk = WhitespaceTokenizer()

    train_message = Message(data={TEXT: sentence})
    # this is needed for a valid training example
    train_message.set(ACTION_NAME, action_name)
    train_message.set(ACTION_TEXT, action_text)

    # add a second example that has some response, so that the vocabulary for
    # response exists
    second_message = Message(data={TEXT: "hello"})
    second_message.set(ACTION_TEXT, "hi")
    second_message.set(ACTION_NAME, "greet")

    data = TrainingData([train_message, second_message])

    tk.train(data)
    ftr.train(data)

    action_name_seq_vecs, action_name_sen_vecs = train_message.get_sparse_features(
        ACTION_NAME, [])
    if action_name_seq_vecs:
        action_name_seq_vecs = action_name_seq_vecs.features
    if action_name_sen_vecs:
        action_name_sen_vecs = action_name_sen_vecs.features
    response_seq_vecs, response_sen_vecs = train_message.get_sparse_features(
        ACTION_TEXT, [])
    if response_seq_vecs:
        response_seq_vecs = response_seq_vecs.features
    if response_sen_vecs:
        response_sen_vecs = response_sen_vecs.features

    if action_name_features:
        assert action_name_seq_vecs.toarray()[0] == action_name_features
        assert action_name_sen_vecs is None
    else:
        assert action_name_seq_vecs is None
        assert action_name_sen_vecs is None

    if response_features:
        assert response_seq_vecs.toarray()[0] == response_features
        assert response_sen_vecs is not None
    else:
        assert response_seq_vecs is None
        assert response_sen_vecs is None
Code example #10
def test_process_does_not_overwrite_any_entities(
        create_or_load_extractor: Callable[..., RegexEntityExtractor]):

    pre_existing_entity = {
        ENTITY_ATTRIBUTE_TYPE: "person",
        ENTITY_ATTRIBUTE_VALUE: "Max",
        ENTITY_ATTRIBUTE_START: 0,
        ENTITY_ATTRIBUTE_END: 3,
        EXTRACTOR: "other extractor",
    }
    message = Message(data={TEXT: "Max lives in Berlin.", INTENT: "inform"})
    message.set(ENTITIES, [copy.deepcopy(pre_existing_entity)])

    training_data = TrainingData()
    training_data.training_examples = [
        Message(
            data={
                TEXT: "Hi Max!",
                INTENT: "greet",
                ENTITIES: [{
                    ENTITY_ATTRIBUTE_TYPE: "person",
                    ENTITY_ATTRIBUTE_VALUE: "Max"
                }],
            }),
        Message(
            data={
                TEXT: "I live in Berlin",
                INTENT: "inform",
                ENTITIES: [{
                    ENTITY_ATTRIBUTE_TYPE: "city",
                    ENTITY_ATTRIBUTE_VALUE: "Berlin"
                }],
            }),
    ]
    training_data.lookup_tables = [{
        "name":
        "city",
        "elements": ["London", "Berlin", "Amsterdam"]
    }]

    entity_extractor = create_or_load_extractor(config={})
    entity_extractor.train(training_data)
    entity_extractor.process([message])

    entities = message.get(ENTITIES)
    assert entities == [
        pre_existing_entity,
        {
            ENTITY_ATTRIBUTE_TYPE: "city",
            ENTITY_ATTRIBUTE_VALUE: "Berlin",
            ENTITY_ATTRIBUTE_START: 13,
            ENTITY_ATTRIBUTE_END: 19,
            EXTRACTOR: RegexEntityExtractor.__name__,
        },
    ]
Code example #11
def test_convert_training_examples(
    spacy_nlp: Any,
    text: Text,
    intent: Optional[Text],
    entities: Optional[List[Dict[Text, Any]]],
    attributes: List[Text],
    real_sparse_feature_sizes: Dict[Text, Dict[Text, List[int]]],
):
    message = Message(data={TEXT: text, INTENT: intent, ENTITIES: entities})

    tokenizer = SpacyTokenizer()
    count_vectors_featurizer = CountVectorsFeaturizer()
    spacy_featurizer = SpacyFeaturizer()

    message.set(SPACY_DOCS[TEXT], spacy_nlp(text))

    training_data = TrainingData([message])
    tokenizer.train(training_data)
    count_vectors_featurizer.train(training_data)
    spacy_featurizer.train(training_data)

    entity_tag_spec = [
        EntityTagSpec(
            "entity",
            {
                0: "O",
                1: "name",
                2: "location"
            },
            {
                "O": 0,
                "name": 1,
                "location": 2
            },
            3,
        )
    ]
    output, sparse_feature_sizes = model_data_utils.featurize_training_examples(
        [message],
        attributes=attributes,
        entity_tag_specs=entity_tag_spec,
    )

    assert len(output) == 1
    for attribute in attributes:
        assert attribute in output[0]
    for attribute in {INTENT, TEXT, ENTITIES} - set(attributes):
        assert attribute not in output[0]
    # we have sparse sentence, sparse sequence, dense sentence, and dense sequence
    # features in the list
    assert len(output[0][TEXT]) == 4
    if INTENT in attributes:
        # we will just have sparse sentence features
        assert len(output[0][INTENT]) == 1
    if ENTITIES in attributes:
        # we will just have sparse sentence features
        assert len(output[0][ENTITIES]) == len(entity_tag_spec)
    # check that it calculates sparse_feature_sizes correctly
    assert sparse_feature_sizes == real_sparse_feature_sizes
Code example #12
def test_count_vectors_featurizer_train():

    featurizer = CountVectorsFeaturizer.create({}, RasaNLUModelConfig())

    sentence = "Hey how are you today ?"
    message = Message(data={TEXT: sentence})
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")
    WhitespaceTokenizer().train(TrainingData([message]))

    featurizer.train(TrainingData([message]), RasaNLUModelConfig())

    expected = np.array([0, 1, 0, 0, 0])
    expected_cls = np.array([1, 1, 1, 1, 1])

    seq_vec, sen_vec = message.get_sparse_features(TEXT, [])
    if seq_vec:
        seq_vec = seq_vec.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert (5, 5) == seq_vec.shape
    assert (1, 5) == sen_vec.shape
    assert np.all(seq_vec.toarray()[0] == expected)
    assert np.all(sen_vec.toarray()[-1] == expected_cls)

    seq_vec, sen_vec = message.get_sparse_features(RESPONSE, [])
    if seq_vec:
        seq_vec = seq_vec.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert (5, 5) == seq_vec.shape
    assert (1, 5) == sen_vec.shape
    assert np.all(seq_vec.toarray()[0] == expected)
    assert np.all(sen_vec.toarray()[-1] == expected_cls)

    seq_vec, sen_vec = message.get_sparse_features(INTENT, [])
    if seq_vec:
        seq_vec = seq_vec.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert sen_vec is None
    assert (1, 1) == seq_vec.shape
    assert np.all(seq_vec.toarray()[0] == np.array([1]))
Code example #13
def training_data_from_paths(paths: Iterable[Text],
                             language: Text) -> TrainingData:
    from rasa.shared.nlu.training_data import loading

    training_data_sets = [
        loading.load_data(nlu_file, language) for nlu_file in paths
    ]
    return TrainingData().merge(*training_data_sets)
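
A minimal sketch of the merge semantics used above (and in examples #8, #15, and #28), assuming the TEXT and INTENT constants from rasa.shared.nlu.constants; merge combines training examples, entity synonyms, regex features, and lookup tables from several TrainingData objects into one:

# Illustrative only; distinct examples from both datasets end up in the result.
first = TrainingData(training_examples=[Message({TEXT: "hi", INTENT: "greet"})])
second = TrainingData(training_examples=[Message({TEXT: "bye", INTENT: "goodbye"})])
merged = TrainingData().merge(first, second)
assert len(merged.training_examples) == 2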
Code example #14
def fetch_sparse_features(txt, tokenizer, featurizer):
    message = Message(
        {TEXT: "my advices include to give advice and giving many greetings"})
    tokenizer.process(message)
    featurizer.train(TrainingData([message]))
    featurizer.process(message)
    seq_vecs, sen_vecs = message.get_sparse_features(TEXT, [])
    return seq_vecs.features.toarray()
Code example #15
File: importer.py Project: zoovu/rasa
    def get_nlu_data(self, language: Optional[Text] = "en") -> TrainingData:
        """Retrieves NLU training data (see parent class for full docstring)."""
        nlu_data = [
            importer.get_nlu_data(language) for importer in self._importers
        ]

        return reduce(lambda merged, other: merged.merge(other), nlu_data,
                      TrainingData())
Code example #16
File: test_telemetry.py Project: zackzou-YS/rasa
async def test_events_schema(monkeypatch: MonkeyPatch, default_agent: Agent,
                             config_path: Text):
    # this allows us to patch the printing part used in debug mode to collect the
    # reported events
    monkeypatch.setenv("RASA_TELEMETRY_DEBUG", "true")
    monkeypatch.setenv("RASA_TELEMETRY_ENABLED", "true")

    mock = Mock()
    monkeypatch.setattr(telemetry, "print_telemetry_event", mock)

    with open(TELEMETRY_EVENTS_JSON) as f:
        schemas = json.load(f)["events"]

    initial = asyncio.Task.all_tasks()
    # Generate all known backend telemetry events, and then use events.json to
    # validate their schema.
    training_data = TrainingDataImporter.load_from_config(config_path)
    async with telemetry.track_model_training(training_data, "rasa"):
        await asyncio.sleep(1)

    telemetry.track_telemetry_disabled()

    telemetry.track_data_split(0.5, "nlu")

    telemetry.track_validate_files(True)

    telemetry.track_data_convert("yaml", "nlu")

    telemetry.track_tracker_export(5, TrackerStore(domain=None), EventBroker())

    telemetry.track_interactive_learning_start(True, False)

    telemetry.track_server_start([CmdlineInput()], None, None, 42, True)

    telemetry.track_project_init("tests/")

    telemetry.track_shell_started("nlu")

    telemetry.track_rasa_x_local()

    telemetry.track_visualization()

    telemetry.track_core_model_test(5, True, default_agent)

    telemetry.track_nlu_model_test(TrainingData())

    pending = asyncio.Task.all_tasks() - initial
    await asyncio.gather(*pending)

    assert mock.call_count == 15

    for call in mock.call_args_list:
        event = call.args[0]
        # `metrics_id` automatically gets added to all events but is
        # not part of the schema so we need to remove it before validation
        del event["properties"]["metrics_id"]
        jsonschema.validate(instance=event["properties"],
                            schema=schemas[event["event"]])
Code example #17
def test_lookup_with_and_without_boundaries(
    sentence: Text,
    expected_sequence_features: List[List[float]],
    expected_sentence_features: List[float],
    labeled_tokens: List[float],
    use_word_boundaries: bool,
    spacy_nlp: Any,
):
    ftr = RegexFeaturizer({
        "use_word_boundaries": use_word_boundaries,
        "number_additional_patterns": 0
    })
    training_data = TrainingData()

    # we use lookups because the "use_word_boundaries" flag is only used when
    # producing patterns from lookup tables
    lookups = [{"name": "how", "elements": ["how"]}]
    training_data.lookup_tables = lookups
    ftr.train(training_data)

    # adds tokens to the message
    tokenizer = SpacyTokenizer()
    message = Message(data={TEXT: sentence})
    message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))
    tokenizer.process(message)

    (sequence_features,
     sentence_features) = ftr._features_for_patterns(message, TEXT)

    sequence_features = sequence_features.toarray()
    sentence_features = sentence_features.toarray()
    num_of_patterns = sum([len(lookup["elements"]) for lookup in lookups])
    assert sequence_features.shape == (
        len(message.get(TOKENS_NAMES[TEXT])),
        num_of_patterns,
    )
    num_of_lookup_tables = len(lookups)
    assert sentence_features.shape == (num_of_lookup_tables, num_of_patterns)

    # sequence_features should be {0,1} for each token: 1 if match, 0 if not
    assert np.allclose(sequence_features,
                       expected_sequence_features,
                       atol=1e-10)
    # sentence_features should be {0,1} for each lookup table: 1 if sentence
    # contains match from that table, 0 if not
    assert np.allclose(sentence_features,
                       expected_sentence_features,
                       atol=1e-10)

    # the tokenizer should have added tokens
    assert len(message.get(TOKENS_NAMES[TEXT], [])) > 0

    # the number of regex matches on each token should match
    for i, token in enumerate(message.get(TOKENS_NAMES[TEXT])):
        token_matches = token.get("pattern").values()
        num_matches = sum(token_matches)
        # labeled_tokens should list the token(s) which match a pattern
        assert num_matches == labeled_tokens.count(i)
Code example #18
File: importer.py Project: pranavdurai10/rasa
def _additional_training_data_from_default_actions() -> TrainingData:
    from rasa.core.actions import action

    additional_messages_from_default_actions = [
        Message.build_from_action(action_name=action_name)
        for action_name in action.default_action_names()
    ]

    return TrainingData(additional_messages_from_default_actions)
Code example #19
def test_train_process_and_load_with_empty_model(
        create_or_load_extractor: Callable[..., RegexEntityExtractor]):
    extractor = create_or_load_extractor({})
    with pytest.warns(UserWarning):
        extractor.train(TrainingData([]))
    with pytest.warns(UserWarning):
        extractor.process(Message(data={TEXT: "arbitrary"}))
    with pytest.warns(UserWarning):
        create_or_load_extractor({}, load=True)
Code example #20
 def __init__(
     self,
     training_data: Optional[TrainingData] = None,
     config: Optional[Dict[Text, Any]] = None,
     domain: Optional[Domain] = None,
 ) -> None:
     self.training_data = training_data or TrainingData([])
     self.config = config or {}
     self.domain = domain or Domain.empty()
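
The snippet above omits the surrounding class name; assuming it is the DummyImporter used in examples #1 and #25 (an assumption, not shown in the source), a hedged construction sketch looks like this, with every argument optional:

 # Hypothetical usage; omitted arguments fall back to an empty TrainingData,
 # an empty config dict, and Domain.empty().
 importer = DummyImporter(
     training_data=TrainingData([Message({TEXT: "hi", INTENT: "greet"})]))
 assert importer.config == {}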
Code example #21
def test_flexible_nlu_pipeline():
    message = Message(data={TEXT: "This is a test message.", "intent": "test"})
    training_data = TrainingData([message, message, message, message, message])

    tokenizer = WhitespaceTokenizer()
    tokenizer.train(training_data)

    featurizer = CountVectorsFeaturizer(
        component_config={FEATURIZER_CLASS_ALIAS: "cvf_word"}
    )
    featurizer.train(training_data)

    featurizer = CountVectorsFeaturizer(
        component_config={
            FEATURIZER_CLASS_ALIAS: "cvf_char",
            "min_ngram": 1,
            "max_ngram": 3,
            "analyzer": "char_wb",
        }
    )
    featurizer.train(training_data)

    featurizer = LexicalSyntacticFeaturizer({})
    featurizer.train(training_data)

    assert len(message.features) == 6
    assert message.features[0].origin == "cvf_word"
    assert message.features[0].type == FEATURE_TYPE_SEQUENCE
    assert message.features[1].origin == "cvf_word"
    assert message.features[1].type == FEATURE_TYPE_SENTENCE
    # cvf word is also extracted for the intent
    assert message.features[2].origin == "cvf_word"
    assert message.features[2].type == FEATURE_TYPE_SEQUENCE
    assert message.features[3].origin == "cvf_char"
    assert message.features[3].type == FEATURE_TYPE_SEQUENCE
    assert message.features[4].origin == "cvf_char"
    assert message.features[4].type == FEATURE_TYPE_SENTENCE
    assert message.features[5].origin == "LexicalSyntacticFeaturizer"
    assert message.features[5].type == FEATURE_TYPE_SEQUENCE

    sequence_feature_dim = (
        message.features[0].features.shape[1] + message.features[5].features.shape[1]
    )
    sentence_feature_dim = message.features[0].features.shape[1]

    classifier = DIETClassifier(
        component_config={FEATURIZERS: ["cvf_word", "LexicalSyntacticFeaturizer"]}
    )
    model_data = classifier.preprocess_train_data(training_data)

    assert len(model_data.get(TEXT).get(SENTENCE)) == 1
    assert len(model_data.get(TEXT).get(SEQUENCE)) == 1
    assert len(model_data.get(LABEL).get(SEQUENCE)) == 1
    assert model_data.get(LABEL).get(SENTENCE) is None
    assert model_data.get(TEXT).get(SEQUENCE)[0][0].shape == (5, sequence_feature_dim)
    assert model_data.get(TEXT).get(SENTENCE)[0][0].shape == (1, sentence_feature_dim)
    assert model_data.get(LABEL).get(SEQUENCE)[0][0].shape == (1, 1)
Code example #22
def test_persist_load_for_finetuning(
    create_featurizer: Callable[..., RegexFeaturizer],
    default_model_storage: ModelStorage,
    default_execution_context: ExecutionContext,
    resource: Resource,
    whitespace_tokenizer: WhitespaceTokenizer,
):
    patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]

    featurizer = create_featurizer()

    sentence = "hey how are you today 19.12.2019 ?"
    message = Message(data={TEXT: sentence})
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")
    training_data = TrainingData([message], regex_features=patterns)
    whitespace_tokenizer.process_training_data(training_data)

    featurizer.train(training_data)

    loaded_featurizer = RegexFeaturizer.load(
        RegexFeaturizer.get_default_config(),
        default_model_storage,
        resource,
        dataclasses.replace(default_execution_context, is_finetuning=True),
    )

    # Test component loaded in finetune mode and also with
    # same patterns as before and vocabulary statistics
    assert loaded_featurizer.known_patterns == featurizer.known_patterns
    assert loaded_featurizer.finetune_mode

    new_lookups = [{"name": "plates", "elements": "data/test/lookup_tables/plates.txt"}]

    training_data = TrainingData()
    training_data.lookup_tables = new_lookups
    loaded_featurizer.train(training_data)

    # Test merging of a new pattern to an already trained component.
    assert len(loaded_featurizer.known_patterns) == 4
Code example #23
def test_do_not_overwrite_any_entities():
    message = Message(data={TEXT: "Max lives in Berlin.", INTENT: "inform"})
    message.set(ENTITIES, [{
        "entity": "person",
        "value": "Max",
        "start": 0,
        "end": 3
    }])

    training_data = TrainingData()
    training_data.training_examples = [
        Message(
            data={
                TEXT: "Hi Max!",
                INTENT: "greet",
                ENTITIES: [{
                    "entity": "person",
                    "value": "Max"
                }],
            }),
        Message(
            data={
                TEXT: "I live in Berlin",
                INTENT: "inform",
                ENTITIES: [{
                    "entity": "city",
                    "value": "Berlin"
                }],
            }),
    ]
    training_data.lookup_tables = [{
        "name":
        "city",
        "elements": ["London", "Berlin", "Amsterdam"]
    }]

    entity_extractor = RegexEntityExtractor()
    entity_extractor.train(training_data)
    entity_extractor.process(message)

    entities = message.get(ENTITIES)
    assert entities == [
        {
            "entity": "person",
            "value": "Max",
            "start": 0,
            "end": 3
        },
        {
            "entity": "city",
            "value": "Berlin",
            "start": 13,
            "end": 19,
            "extractor": "RegexEntityExtractor",
        },
    ]
Code example #24
 async def get_nlu_data(self, languages=True) -> Dict[Text, TrainingData]:
     language = None
     if isinstance(languages, str):
         language = languages
         languages = [language]
     if not isinstance(languages, list):
         languages = self.nlu_config.keys()
     td = {}
     for lang in languages:
         try:
             td[lang] = utils.training_data_from_paths(
                 self.path_for_nlu_lang(lang),
                 lang,
             )
         except ValueError as e:
             if str(e).startswith("Unknown data format"):
                 td[lang] = TrainingData()
     if language: return td.get(language, TrainingData())
     return td
Code example #25
def test_nlu_training_data_validation():
    importer = DummyImporter(
        training_data=TrainingData([Message({
            TEXT: "some text",
            INTENT: ""
        })]))
    nlu_validator = DefaultV1RecipeValidator(GraphSchema({}))

    with pytest.warns(UserWarning, match="Found empty intent"):
        nlu_validator.validate(importer)
Code example #26
    def read_from_dict(self, yaml_content: Dict, **kwargs: Any) -> "TrainingData":
        if not validation.validate_training_data_format_version(
            yaml_content, self.filename
        ):
            return TrainingData()

        for key, value in yaml_content.items():
            if key == KEY_NLU:
                self._parse_nlu(value)
            elif key == KEY_RESPONSES:
                self.responses = value

        return TrainingData(
            self.training_examples,
            self.entity_synonyms,
            self.regex_features,
            self.lookup_tables,
            self.responses,
        )
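
For context, a hedged sketch of the dict shape this reader expects once the YAML file has been parsed; KEY_NLU and KEY_RESPONSES are assumed to be the usual "nlu" and "responses" keys of the Rasa training data format:

    # Hypothetical parsed-YAML content, illustrative only.
    yaml_content = {
        "version": "2.0",
        "nlu": [{"intent": "greet", "examples": "- hi\n- hello\n"}],
        "responses": {"utter_greet": [{"text": "Hi!"}]},
    }
    # reader.read_from_dict(yaml_content) would return a TrainingData whose
    # responses attribute holds the "utter_greet" entry.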
Code example #27
def training_data():
    # Create training data.
    return TrainingData(
        [
            Message({"text": "hello", "intent": "greet"}),
            Message({"text": "hi there", "intent": "greet"}),
            Message({"text": "ciao", "intent": "goodbye"}),
            Message({"text": "bye", "intent": "goodbye"}),
        ]
    )
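
A short usage sketch for the factory above, assuming TrainingData exposes the intents property and the deduplicated training_examples list of rasa's implementation:

# Illustrative only.
data = training_data()
assert data.intents == {"greet", "goodbye"}
assert len(data.training_examples) == 4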
Code example #28
File: importer.py Project: zoovu/rasa
    def get_nlu_data(self, language: Optional[Text] = "en") -> TrainingData:
        """Retrieves NLU training data (see parent class for full docstring)."""
        training_datasets = [
            _additional_training_data_from_default_actions(),
            self.importer.get_nlu_data(language),
            self._additional_training_data_from_stories(),
        ]

        return reduce(lambda merged, other: merged.merge(other),
                      training_datasets, TrainingData())
Code example #29
def test_non_word_boundaries(
    text: Text,
    lookup: List[Dict[Text, List[Text]]],
    non_word_boundary: List[Text],
    expected_entities: List[Dict[Text, Any]],
):
    message = Message(data={TEXT: text})
    training_data = TrainingData()
    training_data.lookup_tables = [lookup]
    training_data.training_examples = [
        Message(
            data={
                TEXT: "I love New York",
                INTENT: "inform",
                ENTITIES: [{
                    "entity": "city",
                    "value": "New York"
                }],
            }),
        Message(
            data={
                TEXT: "I live in Berlin",
                INTENT: "inform",
                ENTITIES: [{
                    "entity": "city",
                    "value": "Berlin"
                }],
            }),
        Message(
            data={
                TEXT: "I like apples",
                INTENT: "inform",
                ENTITIES: [{
                    "entity": "fruit",
                    "value": "apples"
                }],
            }),
        Message(
            data={
                TEXT: "oranges are my fave",
                INTENT: "inform",
                ENTITIES: [{
                    "entity": "fruit",
                    "value": "oranges"
                }],
            }),
    ]

    entity_extractor = FlashTextEntityExtractor(
        {"non_word_boundaries": non_word_boundary})
    entity_extractor.train(training_data)
    entity_extractor.process(message)

    entities = [e["value"] for e in message.get(ENTITIES)]
    assert entities == expected_entities
Code example #30
def test_cvf_incremental_training(
    initial_train_text: Text,
    additional_train_text: Text,
    initial_vocabulary_size: int,
    final_vocabulary_size: int,
    tmp_path: Path,
):
    tk = WhitespaceTokenizer()
    initial_cvf = CountVectorsFeaturizer()
    train_message = Message(data={"text": initial_train_text})
    data = TrainingData([train_message])

    tk.train(data)
    initial_cvf.train(data)

    # Check initial vocabulary size
    initial_vocab = initial_cvf.vectorizers["text"].vocabulary_
    assert len(initial_vocab) == initial_vocabulary_size

    # persist and load initial cvf
    file_dict = initial_cvf.persist("ftr", tmp_path)
    meta = initial_cvf.component_config.copy()
    meta.update(file_dict)
    new_cvf = CountVectorsFeaturizer.load(meta, tmp_path, should_finetune=True)

    # Check vocabulary size again
    assert len(new_cvf.vectorizers["text"].vocabulary_) == initial_vocabulary_size

    additional_train_message = Message(data={"text": additional_train_text})
    data = TrainingData([train_message, additional_train_message])
    tk.train(data)
    new_cvf.train(data)

    new_vocab = new_cvf.vectorizers["text"].vocabulary_

    # Check vocabulary size after finetuning
    assert len(new_vocab) == final_vocabulary_size

    # Check indices of initial vocabulary haven't changed in the new vocabulary
    for vocab_token, vocab_index in initial_vocab.items():
        assert vocab_token in new_vocab
        assert new_vocab.get(vocab_token) == vocab_index