Code example #1
def test_train_tokenizer(text, expected_tokens, expected_indices):
    tk = WhitespaceTokenizer()

    message = Message(text)
    message.set(RESPONSE_ATTRIBUTE, text)
    message.set(INTENT_ATTRIBUTE, text)

    training_data = TrainingData()
    training_data.training_examples = [message]

    tk.train(training_data)

    for attribute in [RESPONSE_ATTRIBUTE, TEXT_ATTRIBUTE]:
        tokens = training_data.training_examples[0].get(
            TOKENS_NAMES[attribute])

        assert [t.text for t in tokens] == expected_tokens
        assert [t.start for t in tokens] == [i[0] for i in expected_indices]
        assert [t.end for t in tokens] == [i[1] for i in expected_indices]

    # check intent attribute
    tokens = training_data.training_examples[0].get(
        TOKENS_NAMES[INTENT_ATTRIBUTE])

    assert [t.text for t in tokens] == [text]
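A parametrization sketch that could drive this test; the example values below are chosen for illustration, not taken from the project:

@pytest.mark.parametrize(
    "text, expected_tokens, expected_indices",
    [("Forecast for lunch",
      ["Forecast", "for", "lunch"],
      [(0, 8), (9, 12), (13, 18)])],
)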
Code example #2
def test_count_vectors_featurizer_train():

    featurizer = CountVectorsFeaturizer.create({}, RasaNLUModelConfig())

    sentence = "Hey how are you today ?"
    message = Message(sentence)
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")
    WhitespaceTokenizer().train(TrainingData([message]))

    featurizer.train(TrainingData([message]), RasaNLUModelConfig())

    expected = np.array([0, 1, 0, 0, 0])
    expected_cls = np.array([1, 1, 1, 1, 1])
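    # The featurizer's default analyzer lowercases word tokens, so the learned
    # vocabulary here is, alphabetically: are, hey, how, today, you. Row 0 of
    # the feature matrix is the first token "Hey" -> [0, 1, 0, 0, 0]; the last
    # row belongs to the appended __CLS__ token, which counts each word once.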

    vecs = message.get(SPARSE_FEATURE_NAMES[TEXT])

    assert (6, 5) == vecs.shape
    assert np.all(vecs.toarray()[0] == expected)
    assert np.all(vecs.toarray()[-1] == expected_cls)

    vecs = message.get(SPARSE_FEATURE_NAMES[RESPONSE])

    assert (6, 5) == vecs.shape
    assert np.all(vecs.toarray()[0] == expected)
    assert np.all(vecs.toarray()[-1] == expected_cls)

    vecs = message.get(SPARSE_FEATURE_NAMES[INTENT])

    assert (1, 1) == vecs.shape
    assert np.all(vecs.toarray()[0] == np.array([1]))
Code example #3
def test_process(
    text: Text,
    lookup: List[Dict[Text, List[Text]]],
    expected_entities: List[Dict[Text, Any]],
):
    message = Message(text)

    training_data = TrainingData()
    training_data.lookup_tables = lookup
    training_data.training_examples = [
        Message("Hi Max!",
                data={"entities": [{
                    "entity": "person",
                    "value": "Max"
                }]}),
        Message(
            "I live in Berlin",
            data={"entities": [{
                "entity": "city",
                "value": "Berlin"
            }]},
        ),
    ]

    entity_extractor = RegexEntityExtractor()
    entity_extractor.train(training_data)
    entity_extractor.process(message)

    entities = message.get(ENTITIES)
    assert entities == expected_entities
Code example #4
File: rasa_yaml.py Project: ravishankr/rasa
    def reads(self, string: Text, **kwargs: Any) -> "TrainingData":
        """Reads TrainingData in YAML format from a string.

        Args:
            string: String with YAML training data.
            **kwargs: Keyword arguments.

        Returns:
            New `TrainingData` object with parsed training data.
        """
        from rasa.nlu.training_data import TrainingData
        from rasa.validator import Validator

        self.validate(string)

        yaml_content = io_utils.read_yaml(string)

        if not Validator.validate_training_data_format_version(
                yaml_content, self.filename):
            return TrainingData()

        for key, value in yaml_content.items():  # pytype: disable=attribute-error
            if key == KEY_NLU:
                self._parse_nlu(value)
            elif key == KEY_RESPONSES:
                self.responses = value

        return TrainingData(
            self.training_examples,
            self.entity_synonyms,
            self.regex_features,
            self.lookup_tables,
            self.responses,
        )
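A minimal usage sketch for this reader; the import path and the `RasaYAMLReader` class name are assumptions based on Rasa 1.10+, and the version string is illustrative:

from rasa.nlu.training_data.formats import RasaYAMLReader  # assumed location

nlu_yaml = """
version: "2.0"
nlu:
- intent: greet
  examples: |
    - hey
    - hello
"""

training_data = RasaYAMLReader().reads(nlu_yaml)
assert len(training_data.training_examples) == 2  # one Message per example line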
Code example #5
def test_count_vectors_featurizer_train():

    featurizer = CountVectorsFeaturizer.create({}, RasaNLUModelConfig())

    sentence = "Hey how are you today ?"
    message = Message(sentence)
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")
    WhitespaceTokenizer().train(TrainingData([message]))

    featurizer.train(TrainingData([message]), RasaNLUModelConfig())

    expected = np.array([0, 1, 0, 0, 0])
    expected_cls = np.array([1, 1, 1, 1, 1])

    seq_vec, sen_vec = message.get_sparse_features(TEXT, [])

    assert (5, 5) == seq_vec.shape
    assert (1, 5) == sen_vec.shape
    assert np.all(seq_vec.toarray()[0] == expected)
    assert np.all(sen_vec.toarray()[-1] == expected_cls)

    seq_vec, sen_vec = message.get_sparse_features(RESPONSE, [])

    assert (5, 5) == seq_vec.shape
    assert (1, 5) == sen_vec.shape
    assert np.all(seq_vec.toarray()[0] == expected)
    assert np.all(sen_vec.toarray()[-1] == expected_cls)

    seq_vec, sen_vec = message.get_sparse_features(INTENT, [])

    assert sen_vec is None
    assert (1, 1) == seq_vec.shape
    assert np.all(seq_vec.toarray()[0] == np.array([1]))
Code example #6
File: utils.py Project: sanaayakurup/rasa-1
def training_data_from_paths(paths: Iterable[Text],
                             language: Text) -> TrainingData:
    from rasa.nlu.training_data import loading

    training_datas = [
        loading.load_data(nlu_file, language) for nlu_file in paths
    ]
    merged_training_data = TrainingData().merge(*training_datas)
    merged_training_data.fill_response_phrases()
    return merged_training_data
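A sketch of a call site for this helper; the file paths below are placeholders:

merged = training_data_from_paths(
    ["data/nlu_greetings.md", "data/nlu_faq.md"], language="en"
)
print(merged.intents)  # intent names across the merged files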
Code example #7
def test_regex_featurizer_train():

    patterns = [
        {
            "pattern": "[0-9]+",
            "name": "number",
            "usage": "intent"
        },
        {
            "pattern": "\\bhey*",
            "name": "hello",
            "usage": "intent"
        },
        {
            "pattern": "[0-1]+",
            "name": "binary",
            "usage": "intent"
        },
    ]

    featurizer = RegexFeaturizer.create({}, RasaNLUModelConfig())

    sentence = "hey how are you today 19.12.2019 ?"
    message = Message(sentence)
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")
    WhitespaceTokenizer().train(TrainingData([message]))

    featurizer.train(TrainingData([message], regex_features=patterns),
                     RasaNLUModelConfig())

    expected = np.array([0, 1, 0])
    expected_cls = np.array([1, 1, 1])

    seq_vecs, sen_vec = message.get_sparse_features(TEXT, [])

    assert (6, 3) == seq_vecs.shape
    assert (1, 3) == sen_vec.shape
    assert np.all(seq_vecs.toarray()[0] == expected)
    assert np.all(sen_vec.toarray()[-1] == expected_cls)

    seq_vecs, sen_vec = message.get_sparse_features(RESPONSE, [])

    assert (6, 3) == seq_vecs.shape
    assert (1, 3) == sen_vec.shape
    assert np.all(seq_vecs.toarray()[0] == expected)
    assert np.all(sen_vec.toarray()[-1] == expected_cls)

    seq_vecs, sen_vec = message.get_sparse_features(INTENT, [])

    assert seq_vecs is None
    assert sen_vec is None
Code example #8
def test_do_not_overwrite_any_entities():
    message = Message("Max lives in Berlin.")
    message.set(ENTITIES, [{
        "entity": "person",
        "value": "Max",
        "start": 0,
        "end": 3
    }])

    training_data = TrainingData()
    training_data.training_examples = [
        Message("Hi Max!",
                data={"entities": [{
                    "entity": "person",
                    "value": "Max"
                }]}),
        Message(
            "I live in Berlin",
            data={"entities": [{
                "entity": "city",
                "value": "Berlin"
            }]},
        ),
    ]
    training_data.lookup_tables = [{
        "name": "city",
        "elements": ["London", "Berlin", "Amsterdam"]
    }]

    entity_extractor = RegexEntityExtractor()
    entity_extractor.train(training_data)
    entity_extractor.process(message)

    entities = message.get(ENTITIES)
    assert entities == [
        {
            "entity": "person",
            "value": "Max",
            "start": 0,
            "end": 3
        },
        {
            "entity": "city",
            "value": "Berlin",
            "start": 13,
            "end": 19,
            "extractor": "RegexEntityExtractor",
        },
    ]
Code example #9
def test_extract_patterns(
    lookup_tables: Dict[Text, List[Text]],
    regex_features: Dict[Text, Text],
    expected_patterns: Dict[Text, Text],
):
    training_data = TrainingData()
    if lookup_tables:
        training_data.lookup_tables = [lookup_tables]
    if regex_features:
        training_data.regex_features = [regex_features]

    actual_patterns = pattern_utils.extract_patterns(training_data)

    assert actual_patterns == expected_patterns
Code example #10
def test_regex_featurizer_train():

    patterns = [
        {
            "pattern": "[0-9]+",
            "name": "number",
            "usage": "intent"
        },
        {
            "pattern": "\\bhey*",
            "name": "hello",
            "usage": "intent"
        },
        {
            "pattern": "[0-1]+",
            "name": "binary",
            "usage": "intent"
        },
    ]

    featurizer = RegexFeaturizer.create({}, RasaNLUModelConfig())

    sentence = "hey how are you today 19.12.2019 ?"
    message = Message(sentence)
    message.set(RESPONSE_ATTRIBUTE, sentence)
    message.set(INTENT_ATTRIBUTE, "intent")
    WhitespaceTokenizer().train(TrainingData([message]))

    featurizer.train(TrainingData([message], regex_features=patterns),
                     RasaNLUModelConfig())

    expected = np.array([0, 1, 0])
    expected_cls = np.array([1, 1, 1])

    vecs = message.get(SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE])

    assert (7, 3) == vecs.shape
    assert np.all(vecs.toarray()[0] == expected)
    assert np.all(vecs.toarray()[-1] == expected_cls)

    vecs = message.get(SPARSE_FEATURE_NAMES[RESPONSE_ATTRIBUTE])

    assert (7, 3) == vecs.shape
    assert np.all(vecs.toarray()[0] == expected)
    assert np.all(vecs.toarray()[-1] == expected_cls)

    vecs = message.get(SPARSE_FEATURE_NAMES[INTENT_ATTRIBUTE])

    assert vecs is None
Code example #11
def test_count_vector_featurizer_attribute_featurization(
        sentence, intent, response, intent_features, response_features):
    from rasa.nlu.featurizers.sparse_featurizer.count_vectors_featurizer import (
        CountVectorsFeaturizer,
    )

    ftr = CountVectorsFeaturizer({
        "token_pattern": r"(?u)\b\w+\b",
        "return_sequence": True
    })
    train_message = Message(sentence)

    # this is needed for a valid training example
    train_message.set("intent", intent)
    train_message.set("response", response)

    data = TrainingData([train_message])
    ftr.train(data)

    if intent_features:
        assert (train_message.get("intent_sparse_features").toarray()[0] ==
                intent_features)
    else:
        assert train_message.get("intent_sparse_features") is None

    if response_features:
        assert (train_message.get("response_sparse_features").toarray()[0] ==
                response_features)
    else:
        assert train_message.get("response_sparse_features") is None
Code example #12
File: test_featurizers.py Project: zijiannc/RASA_NLU
def test_count_vector_featurizer_using_tokens(tokens, expected):
    from rasa.nlu.featurizers.count_vectors_featurizer import CountVectorsFeaturizer

    ftr = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"})

    # using empty string instead of real text string to make sure
    # count vector only can come from `tokens` feature.
    # using `message.text` can not get correct result

    tokens_feature = [Token(i, 0) for i in tokens]

    train_message = Message("")
    train_message.set("tokens", tokens_feature)
    # this is needed for a valid training example
    train_message.set("intent", "bla")
    data = TrainingData([train_message])

    ftr.train(data)

    test_message = Message("")
    test_message.set("tokens", tokens_feature)

    ftr.process(test_message)

    assert np.all(test_message.get("text_features") == expected)
Code example #13
    def read(self, fn: Text, **kwargs: Any) -> "TrainingData":
        """Loads training data stored in the Dialogflow data format."""
        from rasa.nlu.training_data import TrainingData

        language = kwargs["language"]
        fformat = kwargs["fformat"]

        if fformat not in {DIALOGFLOW_INTENT, DIALOGFLOW_ENTITIES}:
            raise ValueError(
                "fformat must be either {}, or {}"
                "".format(DIALOGFLOW_INTENT, DIALOGFLOW_ENTITIES)
            )

        root_js = rasa.utils.io.read_json_file(fn)
        examples_js = self._read_examples_js(fn, language, fformat)

        if not examples_js:
            raise_warning(
                f"No training examples found for dialogflow file {fn}!",
                docs=DOCS_URL_MIGRATE_GOOGLE,
            )
            return TrainingData()
        elif fformat == DIALOGFLOW_INTENT:
            return self._read_intent(root_js, examples_js)
        else:  # path for DIALOGFLOW_ENTITIES
            return self._read_entities(root_js, examples_js)
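A hypothetical invocation of this reader; the path is a placeholder pointing into a Dialogflow export directory:

reader = DialogflowReader()
training_data = reader.read(
    "dialogflow_export/intents/greet.json",
    language="en",
    fformat=DIALOGFLOW_INTENT,
)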
Code example #14
File: rasa.py Project: psds01/rasa_custom
    def read_from_json(self, js, **kwargs):
        """Loads training data stored in the rasa NLU data format."""
        from rasa.nlu.training_data import Message, TrainingData

        validate_rasa_nlu_data(js)

        data = js["rasa_nlu_data"]
        common_examples = data.get("common_examples", [])
        intent_examples = data.get("intent_examples", [])
        entity_examples = data.get("entity_examples", [])
        entity_synonyms = data.get("entity_synonyms", [])
        regex_features = data.get("regex_features", [])
        lookup_tables = data.get("lookup_tables", [])

        entity_synonyms = transform_entity_synonyms(entity_synonyms)

        if intent_examples or entity_examples:
            logger.warning("DEPRECATION warning: your rasa data "
                           "contains 'intent_examples' "
                           "or 'entity_examples' which will be "
                           "removed in the future. Consider "
                           "putting all your examples "
                           "into the 'common_examples' section.")

        all_examples = common_examples + intent_examples + entity_examples
        training_examples = []
        for ex in all_examples:
            msg = Message.build(ex["text"], ex.get("intent"),
                                ex.get("entities"))
            training_examples.append(msg)

        return TrainingData(training_examples, entity_synonyms, regex_features,
                            lookup_tables)
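For reference, a minimal dict in the legacy `rasa_nlu_data` format that this method accepts could look as follows; this is a sketch, and `RasaReader` as the enclosing class is an assumption:

js = {
    "rasa_nlu_data": {
        "common_examples": [{
            "text": "show me mexican restaurants",
            "intent": "restaurant_search",
            "entities": [{"start": 8, "end": 15,
                          "value": "mexican", "entity": "cuisine"}],
        }]
    }
}
training_data = RasaReader().read_from_json(js)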
Code example #15
def test_count_vector_featurizer_using_tokens(tokens, expected):

    ftr = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"})

    # using empty string instead of real text string to make sure
    # count vector only can come from `tokens` feature.
    # using `message.text` can not get correct result

    tokens_feature = [Token(i, 0) for i in tokens]

    train_message = Message("")
    train_message.set(TOKENS_NAMES[TEXT_ATTRIBUTE], tokens_feature)

    data = TrainingData([train_message])

    ftr.train(data)

    test_message = Message("")
    test_message.set(TOKENS_NAMES[TEXT_ATTRIBUTE], tokens_feature)

    ftr.process(test_message)

    assert np.all(
        test_message.get(SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE]).toarray()[0] ==
        expected)
Code example #16
def test_count_vector_featurizer_using_tokens(tokens, expected):

    ftr = CountVectorsFeaturizer()

    # using empty string instead of real text string to make sure
    # count vector only can come from `tokens` feature.
    # using `message.text` can not get correct result

    tokens_feature = [Token(i, 0) for i in tokens]

    train_message = Message("")
    train_message.set(TOKENS_NAMES[TEXT], tokens_feature)

    data = TrainingData([train_message])

    ftr.train(data)

    test_message = Message("")
    test_message.set(TOKENS_NAMES[TEXT], tokens_feature)

    ftr.process(test_message)

    seq_vec, sen_vec = train_message.get_sparse_features(TEXT, [])
    assert np.all(seq_vec.toarray()[0] == expected)
    assert sen_vec is not None
Code example #17
def test_count_vector_featurizer_shared_vocab(sentence, intent, response,
                                              text_features, intent_features,
                                              response_features):
    ftr = CountVectorsFeaturizer({
        "token_pattern": r"(?u)\b\w+\b",
        "use_shared_vocab": True
    })
    tk = WhitespaceTokenizer()

    train_message = Message(sentence)
    # this is needed for a valid training example
    train_message.set(INTENT, intent)
    train_message.set(RESPONSE, response)

    data = TrainingData([train_message])
    tk.train(data)
    ftr.train(data)

    assert np.all(
        train_message.get(SPARSE_FEATURE_NAMES[TEXT]).toarray()[0] ==
        text_features)
    assert np.all(
        train_message.get(SPARSE_FEATURE_NAMES[INTENT]).toarray()[0] ==
        intent_features)
    assert np.all(
        train_message.get(SPARSE_FEATURE_NAMES[RESPONSE]).toarray()[0] ==
        response_features)
Code example #18
File: markdown.py Project: sysang/rasa
    def reads(self, s: Text, **kwargs: Any) -> "TrainingData":
        """Read markdown string and create TrainingData object"""
        from rasa.nlu.training_data import TrainingData

        s = self._strip_comments(s)
        for line in s.splitlines():
            line = decode_string(line.strip())
            header = self._find_section_header(line)
            if header:
                self._set_current_section(header[0], header[1])
            else:
                self._parse_item(line)
                self._load_files(line)

        if self._deprecated_synonym_format_was_used:
            raise_warning(
                "You are using the deprecated training data format to declare synonyms."
                " Please use the following format: \n"
                '[<entity-text>]{"entity": "<entity-type>", "value": '
                '"<entity-synonym>"}.'
                "\nYou can use the following command to update your training data file:"
                "\nsed -i -E 's/\\[([^)]+)\\]\\(([^)]+):([^)]+)\\)/[\\1]{"
                '"entity": "\\2", "value": "\\3"}/g\' nlu.md',
                category=FutureWarning,
                docs=DOCS_URL_TRAINING_DATA_NLU,
            )

        return TrainingData(
            self.training_examples,
            self.entity_synonyms,
            self.regex_features,
            self.lookup_tables,
        )
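A minimal Markdown training string this reader parses (a sketch; `MarkdownReader` as the enclosing class is an assumption):

md = """
## intent:greet
- hey
- hello there

## synonym:mexican
- mexikanisch
"""
training_data = MarkdownReader().reads(md)
assert "greet" in training_data.intents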
Code example #19
def test_whitespace_training(supervised_embeddings_config):
    examples = [
        Message(
            "Any Mexican restaurant will do",
            {
                "intent": "restaurant_search",
                "entities": [
                    {"start": 4, "end": 11, "value": "Mexican", "entity": "cuisine"}
                ],
            },
        ),
        Message(
            "I want Tacos!",
            {
                "intent": "restaurant_search",
                "entities": [
                    {"start": 7, "end": 12, "value": "Mexican", "entity": "cuisine"}
                ],
            },
        ),
    ]

    tk = WhitespaceTokenizer()

    tk.train(TrainingData(training_examples=examples), supervised_embeddings_config)

    assert examples[0].data.get(TOKENS_NAMES[TEXT])[0].text == "Any"
    assert examples[0].data.get(TOKENS_NAMES[TEXT])[1].text == "Mexican"
    assert examples[0].data.get(TOKENS_NAMES[TEXT])[2].text == "restaurant"
    assert examples[0].data.get(TOKENS_NAMES[TEXT])[3].text == "will"
    assert examples[0].data.get(TOKENS_NAMES[TEXT])[4].text == "do"
    assert examples[1].data.get(TOKENS_NAMES[TEXT])[0].text == "I"
    assert examples[1].data.get(TOKENS_NAMES[TEXT])[1].text == "want"
    assert examples[1].data.get(TOKENS_NAMES[TEXT])[2].text == "Tacos"
Code example #20
    def read_from_json(self, js: Dict[Text, Any], **kwargs: Any):
        """Loads training data stored in the WIT.ai data format."""
        from rasa.nlu.training_data import Message, TrainingData

        training_examples = []

        for s in js["data"]:
            entities = s.get("entities")
            if entities is None:
                continue
            text = s.get("text")
            intents = [e["value"] for e in entities if e["entity"] == 'intent']
            intent = intents[0].strip("\"") if intents else None

            entities = [
                e for e in entities
                if ("start" in e and "end" in e and e["entity"] != 'intent')
            ]
            for e in entities:
                # for some reason wit adds additional quotes around entities
                e["value"] = e["value"].strip("\"")

            data = {}
            if intent:
                data["intent"] = intent
            if entities is not None:
                data["entities"] = entities
            training_examples.append(Message(text, data))
        return TrainingData(training_examples)
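A minimal WIT.ai export in the shape this method expects (a sketch; `WitReader` as the enclosing class is an assumption). Note that the intent arrives as an entity named `intent`, and values carry the extra quotes stripped above:

js = {
    "data": [{
        "text": "set an alarm",
        "entities": [
            {"entity": "intent", "value": '"set_alarm"'},
            {"entity": "reminder", "value": '"alarm"', "start": 7, "end": 12},
        ],
    }]
}
training_data = WitReader().read_from_json(js)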
Code example #21
File: test_placeholder.py Project: amedat04/rasam
async def test_get_nlu_data(Faker: asynctest.MagicMock, load_data: asynctest.MagicMock) -> None:
    faker_ = Faker()
    faker_.name.return_value = "Nikola Tesla"
    training_data = TrainingData(
        training_examples=[
            Message.build("hello", "intent_test"),
            Message.build("hello @name", "intent_test"),
            Message.build("hello"),
        ]
    )
    load_data.return_value = training_data

    importer = PlaceholderImporter()
    importer.config = {"importers": [{"name": "rasam.PlaceholderImporter"}]}
    importer._nlu_files = ["test"]
    new_training_data = await importer.get_nlu_data()

    faker_.seed_instance.assert_called_once_with(importer.DEFAULT_FAKE_DATA_COUNT)
    load_data.assert_called_once_with("test", "en")
    message: Message
    expected_messages = [
        Message.build("hello", "intent_test"),
        Message.build("hello Nikola Tesla", "intent_test"),
        Message.build("hello"),
    ]
    for message, expected in zip(new_training_data.training_examples, expected_messages):
        assert message.get("intent") == expected.get("intent")
        assert message.get("text") == expected.get("text")
Code example #22
def test_count_vector_featurizer_attribute_featurization(
        sentence, intent, response, intent_features, response_features):
    ftr = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"})
    tk = WhitespaceTokenizer()

    train_message = Message(sentence)
    # this is needed for a valid training example
    train_message.set(INTENT, intent)
    train_message.set(RESPONSE, response)

    data = TrainingData([train_message])

    tk.train(data)
    ftr.train(data)

    if intent_features:
        assert np.all(train_message.get(
            SPARSE_FEATURE_NAMES[INTENT]).toarray()[0] == intent_features)
    else:
        assert train_message.get(SPARSE_FEATURE_NAMES[INTENT]) is None

    if response_features:
        assert np.all(train_message.get(
            SPARSE_FEATURE_NAMES[RESPONSE]).toarray()[0] == response_features)
    else:
        assert train_message.get(SPARSE_FEATURE_NAMES[RESPONSE]) is None
Code example #23
def test_count_vector_featurizer_response_attribute_featurization(
        sentence, intent, response, intent_features, response_features):
    ftr = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"})
    tk = WhitespaceTokenizer()

    train_message = Message(sentence)
    # this is needed for a valid training example
    train_message.set(INTENT, intent)
    train_message.set(RESPONSE, response)

    # add a second example that has some response, so that the vocabulary for
    # response exists
    second_message = Message("hello")
    second_message.set(RESPONSE, "hi")
    second_message.set(INTENT, "greet")

    data = TrainingData([train_message, second_message])

    tk.train(data)
    ftr.train(data)

    if intent_features:
        assert np.all(train_message.get(
            SPARSE_FEATURE_NAMES[INTENT]).toarray()[0] == intent_features)
    else:
        assert train_message.get(SPARSE_FEATURE_NAMES[INTENT]) is None

    if response_features:
        assert np.all(train_message.get(
            SPARSE_FEATURE_NAMES[RESPONSE]).toarray()[0] == response_features)
    else:
        assert train_message.get(SPARSE_FEATURE_NAMES[RESPONSE]) is None
Code example #24
def test_unintentional_synonyms_capitalized(component_builder):
    _config = utilities.base_test_conf("pretrained_embeddings_spacy")
    ner_syn = component_builder.create_component(_config.for_component(5),
                                                 _config)
    examples = [
        Message(
            "Any Mexican restaurant will do",
            {
                "intent":
                "restaurant_search",
                "entities": [{
                    "start": 4,
                    "end": 11,
                    "value": "Mexican",
                    "entity": "cuisine"
                }],
            },
        ),
        Message(
            "I want Tacos!",
            {
                "intent":
                "restaurant_search",
                "entities": [{
                    "start": 7,
                    "end": 12,
                    "value": "Mexican",
                    "entity": "cuisine"
                }],
            },
        ),
    ]
    ner_syn.train(TrainingData(training_examples=examples), _config)
    assert ner_syn.synonyms.get("mexican") is None
    assert ner_syn.synonyms.get("tacos") == "Mexican"
Code example #25
def test_convert_featurizer_train():
    featurizer = ConveRTFeaturizer.create({}, RasaNLUModelConfig())

    sentence = "Hey how are you today ?"
    message = Message(sentence)
    message.set(RESPONSE, sentence)
    tokens = ConveRTTokenizer().tokenize(message, attribute=TEXT)
    tokens = Tokenizer.add_cls_token(tokens, attribute=TEXT)
    message.set(TOKENS_NAMES[TEXT], tokens)
    message.set(TOKENS_NAMES[RESPONSE], tokens)

    featurizer.train(TrainingData([message]), RasaNLUModelConfig())

    expected = np.array(
        [2.2636216, -0.26475656, -1.1358104, -0.49751878, -1.3946456])
    expected_cls = np.array(
        [1.0251294, -0.04053932, -0.7018805, -0.82054937, -0.75054353])

    vecs = message.get(DENSE_FEATURE_NAMES[TEXT])

    assert len(tokens) == len(vecs)
    assert np.allclose(vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5)

    vecs = message.get(DENSE_FEATURE_NAMES[RESPONSE])

    assert len(tokens) == len(vecs)
    assert np.allclose(vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5)

    vecs = message.get(DENSE_FEATURE_NAMES[INTENT])

    assert vecs is None
Code example #26
def test_build_tag_id_dict():
    message_1 = Message("Germany is part of the European Union")
    message_1.set(
        BILOU_ENTITIES,
        ["U-location", "O", "O", "O", "O", "B-organisation", "L-organisation"],
    )

    message_2 = Message("Berlin is the capital of Germany")
    message_2.set(BILOU_ENTITIES, ["U-location", "O", "O", "O", "O", "U-location"])

    training_data = TrainingData([message_1, message_2])

    tag_id_dict = bilou_utils.build_tag_id_dict(training_data)

    assert tag_id_dict == {
        "O": 0,
        "B-location": 1,
        "I-location": 2,
        "U-location": 3,
        "L-location": 4,
        "B-organisation": 5,
        "I-organisation": 6,
        "U-organisation": 7,
        "L-organisation": 8,
    }
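The mapping asserted above follows directly from the BILOU scheme: "O" (outside) is fixed at index 0, and every entity type seen in the tags expands to four prefixed variants. A standalone sketch of that expansion:

def build_tag_id_dict_sketch(tag_types):
    # Each entity type gets B(egin), I(nside), U(nit, single-token) and
    # L(ast) variants; tokens outside any entity are tagged "O".
    tag_id_dict = {"O": 0}
    idx = 1
    for tag in tag_types:
        for prefix in ["B-", "I-", "U-", "L-"]:
            tag_id_dict[prefix + tag] = idx
            idx += 1
    return tag_id_dict

assert build_tag_id_dict_sketch(["location", "organisation"]) == {
    "O": 0,
    "B-location": 1, "I-location": 2, "U-location": 3, "L-location": 4,
    "B-organisation": 5, "I-organisation": 6,
    "U-organisation": 7, "L-organisation": 8,
}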
Code example #27
def test_count_vector_featurizer_attribute_featurization(
        sentence, intent, response, intent_features, response_features):
    ftr = CountVectorsFeaturizer()
    tk = WhitespaceTokenizer()

    train_message = Message(sentence)
    # this is needed for a valid training example
    train_message.set(INTENT, intent)
    train_message.set(RESPONSE, response)

    data = TrainingData([train_message])

    tk.train(data)
    ftr.train(data)

    intent_seq_vecs, intent_sen_vecs = train_message.get_sparse_features(
        INTENT, [])
    response_seq_vecs, response_sen_vecs = train_message.get_sparse_features(
        RESPONSE, [])
    if intent_features:
        assert np.all(intent_seq_vecs.toarray()[0] == intent_features)
        assert intent_sen_vecs is None
    else:
        assert intent_seq_vecs is None
        assert intent_sen_vecs is None

    if response_features:
        assert np.all(response_seq_vecs.toarray()[0] == response_features)
        assert response_sen_vecs is not None
    else:
        assert response_seq_vecs is None
        assert response_sen_vecs is None
Code example #28
    def preprocess_train_data(self, training_data: TrainingData) -> RasaModelData:
        """Prepares data for training.

        Performs sanity checks on training data, extracts encodings for labels.
        """

        if self.retrieval_intent:
            training_data = training_data.filter_by_intent(self.retrieval_intent)

        label_id_index_mapping = self._label_id_index_mapping(
            training_data, attribute=RESPONSE
        )

        if not label_id_index_mapping:
            # no labels are present to train
            return RasaModelData()

        self.index_label_id_mapping = self._invert_mapping(label_id_index_mapping)

        self._label_data = self._create_label_data(
            training_data, label_id_index_mapping, attribute=RESPONSE
        )

        model_data = self._create_model_data(
            training_data.intent_examples,
            label_id_index_mapping,
            label_attribute=RESPONSE,
        )

        self._check_input_dimension_consistency(model_data)

        return model_data
Code example #29
def test_spacy_featurizer_train(spacy_nlp):

    featurizer = SpacyFeaturizer.create({}, RasaNLUModelConfig())

    sentence = "Hey how are you today"
    message = Message(sentence)
    message.set(RESPONSE_ATTRIBUTE, sentence)
    message.set(INTENT_ATTRIBUTE, "intent")
    message.set(SPACY_DOCS[TEXT_ATTRIBUTE], spacy_nlp(sentence))
    message.set(SPACY_DOCS[RESPONSE_ATTRIBUTE], spacy_nlp(sentence))

    featurizer.train(TrainingData([message]), RasaNLUModelConfig())

    expected = np.array([-0.28451, 0.31007, -0.57039, -0.073056, -0.17322])
    expected_cls = np.array(
        [-0.196496, 0.3249364, -0.37408298, -0.10622784, 0.062756])

    vecs = message.get(DENSE_FEATURE_NAMES[TEXT_ATTRIBUTE])

    assert 6 == len(vecs)
    assert np.allclose(vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5)

    vecs = message.get(DENSE_FEATURE_NAMES[RESPONSE_ATTRIBUTE])

    assert 6 == len(vecs)
    assert np.allclose(vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5)

    vecs = message.get(DENSE_FEATURE_NAMES[INTENT_ATTRIBUTE])

    assert vecs is None
Code example #30
File: importer.py Project: delldu/Rasa
    async def get_nlu_data(self, language: Optional[Text] = "en") -> TrainingData:
        nlu_data = [importer.get_nlu_data(language) for importer in self._importers]
        nlu_data = await asyncio.gather(*nlu_data)

        return reduce(
            lambda merged, other: merged.merge(other), nlu_data, TrainingData()
        )
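The `reduce` above folds each importer's result into an initially empty `TrainingData`. The merge semantics in a short sketch, reusing the varargs form seen in code example #6:

a = TrainingData(training_examples=[Message.build("hi", "greet")])
b = TrainingData(training_examples=[Message.build("bye", "goodbye")])
merged = TrainingData().merge(a, b)
assert len(merged.training_examples) == 2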