Example #1
def load_rasa_data(filename):
    # type: (Text) -> TrainingData
    """Loads training data stored in the rasa NLU data format."""

    with io.open(filename, encoding="utf-8-sig") as f:
        data = json.loads(f.read())
    validate_rasa_nlu_data(data)
    common = data['rasa_nlu_data'].get("common_examples", list())
    intent = data['rasa_nlu_data'].get("intent_examples", list())
    entity = data['rasa_nlu_data'].get("entity_examples", list())

    return TrainingData(intent, entity, common)
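For reference, a minimal sketch of the JSON payload this loader expects; the file name and example content below are illustrative assumptions, not taken from the source.

# Minimal rasa NLU JSON payload for load_rasa_data (illustrative values).
import json

sample = {
    "rasa_nlu_data": {
        "common_examples": [{
            "text": "show me mexican restaurants",
            "intent": "restaurant_search",
            "entities": [{"start": 8, "end": 15,
                          "value": "mexican", "entity": "cuisine"}]
        }]
    }
}

with open("demo_rasa.json", "w", encoding="utf-8") as f:
    json.dump(sample, f)

training_data = load_rasa_data("demo_rasa.json")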
Example #2
def test_train_with_empty_data(component_builder):
    _config = utilities.base_test_conf("all_components")
    trainer = Trainer(_config, component_builder)
    trainer.train(TrainingData())
    persistor = create_persistor(_config)
    persisted_path = trainer.persist(_config['path'],
                                     persistor,
                                     model_name=_config['name'])
    loaded = utilities.load_interpreter_for_model(_config, persisted_path,
                                                  component_builder)
    assert loaded.pipeline
    assert loaded.parse("hello", time=None) is not None
Example #3
    def _read_intent(self, intent_js, examples_js):
        """Reads the intent and examples from respective jsons."""
        from rasa_nlu.training_data import Message, TrainingData

        intent = intent_js.get("name")

        training_examples = []
        for ex in examples_js:
            text, entities = self._join_text_chunks(ex['data'])
            training_examples.append(Message.build(text, intent, entities))

        return TrainingData(training_examples)
Example #4
def test_train_with_empty_data(language, pipeline, component_builder, tmpdir):
    _config = RasaNLUModelConfig({"pipeline": pipeline, "language": language})
    trainer = Trainer(_config, component_builder)
    trainer.train(TrainingData())
    persistor = create_persistor(_config)
    persisted_path = trainer.persist(tmpdir.strpath,
                                     persistor,
                                     project_name="my_project")
    loaded = Interpreter.load(persisted_path, component_builder)
    assert loaded.pipeline
    assert loaded.parse("hello") is not None
    assert loaded.parse("Hello today is Monday, again!") is not None
    def train(cfg_name, model_name):
        from rasa_nlu.train import create_trainer
        from rasa_nlu.train import create_persistor
        from rasa_nlu.training_data import TrainingData

        config = RasaNLUConfig(cfg_name)
        trainer = create_trainer(config)
        persistor = create_persistor(config)

        training_data = TrainingData(config['data'], config['backend'], nlp=trainer.nlp)
        trainer.train(training_data)
        trainer.persist(os.path.join("test_models", model_name), persistor, create_unique_subfolder=False)
Example #6
def load_luis_data(filename, tokenizer):
    # type: (str, Optional[Tokenizer]) -> TrainingData
    """Loads training data stored in the LUIS.ai data format."""

    warnings.warn(
        "LUIS data may not always be correctly imported because entity "
        "locations are specified by tokens. If you use a tokenizer which "
        "behaves differently from LUIS's, your entities might not be correct.")
    if not tokenizer:
        raise ValueError(
            "Cannot load luis data without a specified tokenizer "
            "(e.g. using the configuration value `luis_data_tokenizer`)")

    intent_examples = []
    entity_examples = []
    common_examples = []

    with io.open(filename, encoding="utf-8-sig") as f:
        data = json.loads(f.read())
    for s in data["utterances"]:
        text = s.get("text")
        tokens = [t for t in tokenizer.tokenize(text)]
        intent = s.get("intent")
        entities = []
        for e in s.get("entities") or []:
            i, ii = e["startPos"], e["endPos"] + 1
            # rebuild the entity span as a whitespace-tolerant regex over its tokens
            _regex = u"\\s*".join([re.escape(t) for t in tokens[i:ii]])
            expr = re.compile(_regex)
            m = expr.search(text)
            start, end = m.start(), m.end()
            val = text[start:end]
            entities.append({
                "entity": e["entity"],
                "value": val,
                "start": start,
                "end": end
            })

        if intent and entities:
            common_examples.append({
                "text": text,
                "intent": intent,
                "entities": entities
            })
        elif intent:
            intent_examples.append({"text": text, "intent": intent})
        elif entities:
            entity_examples.append({
                "text": text,
                "intent": intent,
                "entities": entities
            })
    return TrainingData(intent_examples, entity_examples, common_examples)
Example #7
def _write_nlu_to_file(export_nlu_path: Text,
                       evts: List[Dict[Text, Any]]) -> None:
    """Write the nlu data of the sender_id to the file paths."""
    from rasa_nlu.training_data import TrainingData

    msgs = _collect_messages(evts)

    # noinspection PyBroadException
    try:
        previous_examples = load_data(export_nlu_path)
    except Exception as e:
        logger.exception("An exception occurred while trying to load the "
                         "NLU data.")

        export_nlu_path = questionary.text(
            message="Could not load existing NLU data, please "
            "specify where to store NLU data learned in "
            "this session (this will overwrite any "
            "existing file). {}".format(str(e)),
            default=PATHS["backup"]).ask()

        if export_nlu_path is None:
            return

        previous_examples = TrainingData()

    nlu_data = previous_examples.merge(TrainingData(msgs))

    # need to guess the file format before opening the file for writing,
    # since opening it in write mode would truncate any existing content
    if _guess_format(export_nlu_path) in {"md", "unk"}:
        fformat = "md"
    else:
        fformat = "json"

    with open(export_nlu_path, 'w', encoding="utf-8") as f:
        if fformat == "md":
            f.write(nlu_data.as_markdown())
        else:
            f.write(nlu_data.as_json())
Example #8
def load_data(resource_name, language='en'):
    # type: (Text, Optional[Text]) -> TrainingData
    """Load training data from disk. Merges them if multiple files are found."""

    files = utils.list_files(resource_name)
    data_sets = [_load(f, language) for f in files]
    data_sets = [ds for ds in data_sets if ds]
    if len(data_sets) == 0:
        return TrainingData()
    elif len(data_sets) == 1:
        return data_sets[0]
    else:
        return data_sets[0].merge(*data_sets[1:])
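A usage sketch, assuming resource_name may also point at a directory, in which case utils.list_files picks up every training file inside it:

# Hypothetical usage: merging all training files found under a directory.
merged = load_data("data/examples")       # e.g. contains rasa.json and nlu.md
print(len(merged.training_examples))      # examples from all files combined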
Example #9
    def reads(self, s, **kwargs):
        """Read markdown string and create TrainingData object"""
        self.__init__()
        s = self._strip_comments(s)
        for line in s.splitlines():
            line = line.strip()
            header = self._find_section_header(line)
            if header:
                self._set_current_section(header[0], header[1])
            else:
                self._parse_item(line)

        return TrainingData(self.training_examples, self.entity_synonyms, self.regex_features)
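A sketch of the markdown this reader parses, assuming the surrounding class is rasa_nlu's MarkdownReader (an assumption about the class name): ## headers select the current section (intent, synonym, regex) and each - item becomes a training example, synonym, or pattern.

md = u"""
## intent:greet
- hey
- hello there

## synonym:mexican
- mexikanisch

## regex:zipcode
- [0-9]{5}
"""

reader = MarkdownReader()                     # assumed class name
training_data = reader.reads(md)
print(len(training_data.training_examples))   # 2 examples for intent 'greet'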
Example #10
def test_count_vector_featurizer(sentence, expected):
    from rasa_nlu.featurizers.count_vectors_featurizer import \
        CountVectorsFeaturizer

    ftr = CountVectorsFeaturizer({"token_pattern": r'(?u)\b\w+\b'})
    message = Message(sentence)
    message.set("intent", "bla")
    data = TrainingData([message])

    ftr.train(data)
    ftr.process(message)

    assert np.all(message.get("text_features")[0] == expected)
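CountVectorsFeaturizer wraps scikit-learn's CountVectorizer, so the expected vector can be derived with an equivalent sklearn call; a minimal sketch, assuming the token_pattern parameter is forwarded unchanged:

# Equivalent sklearn call (an assumption about the featurizer's internals).
from sklearn.feature_extraction.text import CountVectorizer

vec = CountVectorizer(token_pattern=r'(?u)\b\w+\b')
X = vec.fit_transform(["hello hello goodbye"])
print(sorted(vec.vocabulary_))   # ['goodbye', 'hello']
print(X.toarray())               # [[1 2]]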
Example #11
def load_markdown_data(filenames):
    # type: (List[Text]) -> TrainingData
    """Loads training data stored in markdown data format."""
    from rasa_nlu.utils.md_to_json import MarkdownToJson

    common_examples = list()
    known_synonyms = {}
    for filename in filenames:
        data = MarkdownToJson(filename)
        common_examples += data.common_examples
        known_synonyms = get_entity_synonyms_dict(data.entity_synonyms,
                                                  known_synonyms)
    return TrainingData(common_examples, known_synonyms)
Example #12
def load_data(resource_name, language='en'):
    # type: (Text, Optional[Text]) -> TrainingData
    """Loads training data from disk and merges them if multiple files are found."""

    files = utils.recursively_find_files(resource_name)
    data_sets = [_load(f, language) for f in files]
    # Dialogflow has files that we don't read directly, these return None
    data_sets = [ds for ds in data_sets if ds]
    if len(data_sets) == 0:
        return TrainingData()
    elif len(data_sets) == 1:
        return data_sets[0]
    else:
        return data_sets[0].merge(*data_sets[1:])
Example #13
def load_dialogflow_data(files, language):
    # type: (List[Text], Text) -> TrainingData
    """Loads training data stored in the Dialogflow data format."""

    training_examples = []
    entity_synonyms = {}
    for filename in files:
        data = _read_json_from_file(filename)
        # Language specific extensions
        usersays_file_ext = '_usersays_{}.json'.format(language)
        synonyms_file_ext = '_entries_{}.json'.format(language)
        if filename.endswith(usersays_file_ext):
            root_filename = filename.replace(usersays_file_ext, '.json')
            root_f_data = _read_json_from_file(root_filename)
            intent = root_f_data.get("name")

            for s in data:
                text = "".join([chunk["text"] for chunk in s.get("data")])
                # add entities to each token, if available
                entities = []
                for e in [chunk
                          for chunk in s.get("data")
                          if "alias" in chunk or "meta" in chunk]:
                    start = text.find(e["text"])
                    end = start + len(e["text"])
                    val = text[start:end]
                    entity_type = e["alias"] if "alias" in e else e["meta"]
                    if entity_type != u'@sys.ignore':
                        entities.append(
                            {
                                "entity": entity_type,
                                "value": val,
                                "start": start,
                                "end": end
                            }
                        )
                data = {}
                if intent:
                    data["intent"] = intent
                if entities is not None:
                    data["entities"] = entities
                training_examples.append(Message(text, data))

        elif filename.endswith(synonyms_file_ext):
            # create synonyms dictionary
            for entry in data:
                if "value" in entry and "synonyms" in entry:
                    for synonym in entry["synonyms"]:
                        entity_synonyms[synonym] = entry["value"]
    return TrainingData(training_examples, entity_synonyms)
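For orientation, a sketch of the Dialogflow export layout this loader walks (file names illustrative): each intent ships a root <name>.json plus a language-specific <name>_usersays_<lang>.json, and each entity a matching _entries_<lang>.json whose value/synonym pairs feed entity_synonyms.

intents/
    greet.json                  # root file; its "name" becomes the intent label
    greet_usersays_en.json      # utterances as text chunks, entities via "alias"/"meta"
entities/
    cuisine.json
    cuisine_entries_en.json     # {"value": ..., "synonyms": [...]} entries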
Example #14
def test_count_vector_featurizer(sentence, expected):
    from rasa_nlu.featurizers.count_vectors_featurizer import \
        CountVectorsFeaturizer

    ftr = CountVectorsFeaturizer({"token_pattern": r'(?u)\b\w+\b'})
    train_message = Message(sentence)
    # this is needed for a valid training example
    train_message.set("intent", "bla")
    data = TrainingData([train_message])
    ftr.train(data)

    test_message = Message(sentence)
    ftr.process(test_message)

    assert np.all(test_message.get("text_features") == expected)
Example #15
def _write_nlu_to_file(export_nlu_path, evts):
    # type: (Text, List[Dict[Text, Any]]) -> None
    """Write the nlu data of the sender_id to the file paths."""

    msgs = _collect_messages(evts)

    # noinspection PyBroadException
    try:
        previous_examples = load_data(export_nlu_path)

    except Exception:
        questions = [{
            "name": "export nlu",
            "type": "input",
            "message": "Could not load existing NLU data, please "
                       "specify where to store NLU data learned in "
                       "this session (this will overwrite any "
                       "existing file)",
            "default": PATHS["backup"]
        }]

        answers = prompt(questions)
        export_nlu_path = answers["export nlu"]
        previous_examples = TrainingData()

    nlu_data = previous_examples.merge(TrainingData(msgs))

    with io.open(export_nlu_path, 'w', encoding="utf-8") as f:
        if _guess_format(export_nlu_path) in ["md", "unk"]:
            f.write(nlu_data.as_markdown())
        else:
            f.write(nlu_data.as_json())
Example #16
def load_luis_data(filename):
    # type: (Text) -> TrainingData
    """Loads training data stored in the LUIS.ai data format."""

    intent_examples = []
    entity_examples = []
    common_examples = []

    with io.open(filename, encoding="utf-8-sig") as f:
        data = json.loads(f.read())

    # Simple check to ensure we support this luis data schema version
    if not data["luis_schema_version"].startswith("2"):
        raise Exception(
            "Invalid luis data schema version {}, should be 2.x.x. ".format(
                data["luis_schema_version"]) +
            "Make sure to use the latest luis version (e.g. by downloading your data again)."
        )

    for s in data["utterances"]:
        text = s.get("text")
        intent = s.get("intent")
        entities = []
        for e in s.get("entities") or []:
            start, end = e["startPos"], e["endPos"] + 1
            val = text[start:end]
            entities.append({
                "entity": e["entity"],
                "value": val,
                "start": start,
                "end": end
            })

        if intent and entities:
            common_examples.append({
                "text": text,
                "intent": intent,
                "entities": entities
            })
        elif intent:
            intent_examples.append({"text": text, "intent": intent})
        elif entities:
            entity_examples.append({
                "text": text,
                "intent": intent,
                "entities": entities
            })
    return TrainingData(intent_examples, entity_examples, common_examples)
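For reference, a minimal sketch of a LUIS v2 export accepted by this loader (values illustrative). Note that endPos is inclusive, which is why the loader adds 1 to obtain a Python slice end.

# Minimal LUIS v2 payload (illustrative values).
luis_export = {
    "luis_schema_version": "2.1.0",
    "utterances": [{
        "text": "book a flight to paris",
        "intent": "book_flight",
        "entities": [{"entity": "destination", "startPos": 17, "endPos": 21}]
    }]
}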
Example #17
def test_unintentional_synonyms_capitalized(component_builder):
    _config = utilities.base_test_conf("all_components")
    ner_syn = component_builder.create_component("ner_synonyms", _config)
    examples = [
        Message("Any Mexican restaurant will do", {
            "intent": "restaurant_search",
            "entities": [{"start": 4, "end": 11, "value": "Mexican", "entity": "cuisine"}]
        }),
        Message("I want Tacos!", {
            "intent": "restaurant_search",
            "entities": [{"start": 7, "end": 12, "value": "Mexican", "entity": "cuisine"}]
        })
    ]
    ner_syn.train(TrainingData(training_examples=examples), _config)
    assert ner_syn.synonyms.get("mexican") is None
    assert ner_syn.synonyms.get("tacos") == "Mexican"
Example #18
    def reads(self, s: Text, **kwargs: Any) -> 'TrainingData':
        """Read markdown string and create TrainingData object"""
        from rasa_nlu.training_data import TrainingData

        self.__init__()
        s = self._strip_comments(s)
        for line in s.splitlines():
            line = line.strip()
            header = self._find_section_header(line)
            if header:
                self._set_current_section(header[0], header[1])
            else:
                self._parse_item(line)
                self._load_files(line)
        return TrainingData(self.training_examples, self.entity_synonyms,
                            self.regex_features, self.lookup_tables)
Example #19
def load_api_data(files):
    # type: (List[Text]) -> TrainingData
    """Loads training data stored in the API.ai data format."""

    training_examples = []
    entity_synonyms = {}
    for filename in files:
        with io.open(filename, encoding="utf-8-sig") as f:
            data = json.loads(f.read())
        # get only intents, skip the rest. The property name is the target class
        if "userSays" in data:
            intent = data.get("name")
            for s in data["userSays"]:
                text = "".join([chunk["text"] for chunk in s.get("data")])
                # add entities to each token, if available
                entities = []
                for e in [
                        chunk for chunk in s.get("data")
                        if "alias" in chunk or "meta" in chunk
                ]:
                    start = text.find(e["text"])
                    end = start + len(e["text"])
                    val = text[start:end]
                    entities.append({
                        "entity": e["alias"] if "alias" in e else e["meta"],
                        "value": val,
                        "start": start,
                        "end": end
                    })
                data = {"text": text}
                if intent:
                    data["intent"] = intent
                if entities:
                    data["entities"] = entities
                training_examples.append(data)

        # create synonyms dictionary
        if "name" in data and "entries" in data:
            for entry in data["entries"]:
                if "value" in entry and "synonyms" in entry:
                    for synonym in entry["synonyms"]:
                        entity_synonyms[synonym] = entry["value"]
    return TrainingData(training_examples, entity_synonyms)
Example #20
def test_crf_extractor(spacy_nlp):
    from rasa_nlu.extractors.crf_entity_extractor import CRFEntityExtractor
    ext = CRFEntityExtractor()
    examples = [
        Message("anywhere in the west", {
            "intent": "restaurant_search",
            "entities": [{"start": 16, "end": 20,
                          "value": "west", "entity": "location"}],
            "spacy_doc": spacy_nlp("anywhere in the west")
        }),
        Message("central indian restaurant", {
            "intent": "restaurant_search",
            "entities": [{"start": 0, "end": 7,
                          "value": "central", "entity": "location"}],
            "spacy_doc": spacy_nlp("central indian restaurant")
        })
    ]
    config = {
        "entity_crf_BILOU_flag": True,
        "entity_crf_features": ext.crf_features
    }
    ext.train(TrainingData(training_examples=examples), config)
    sentence = 'anywhere in the west'
    crf_format = ext._from_text_to_crf(
        Message(sentence, {"spacy_doc": spacy_nlp(sentence)}))
    assert [word[0] for word in crf_format] == ['anywhere', 'in', 'the', 'west']
    feats = ext._sentence_to_features(crf_format)
    assert 'BOS' in feats[0]
    assert 'EOS' in feats[-1]
    assert '0:low:in' in feats[1]
    sentence = 'anywhere in the west'
    ext.extract_entities(Message(sentence, {"spacy_doc": spacy_nlp(sentence)}))
Example #21
def test_crf_extractor(spacy_nlp, ner_crf_pos_feature_config):
    from rasa_nlu.extractors.crf_entity_extractor import CRFEntityExtractor
    ext = CRFEntityExtractor(component_config=ner_crf_pos_feature_config)
    examples = [
        Message("anywhere in the west", {
            "intent": "restaurant_search",
            "entities": [{"start": 16, "end": 20,
                          "value": "west", "entity": "location"}],
            "spacy_doc": spacy_nlp("anywhere in the west")
        }),
        Message("central indian restaurant", {
            "intent": "restaurant_search",
            "entities": [
                {"start": 0, "end": 7, "value": "central",
                 "entity": "location", "extractor": "random_extractor"},
                {"start": 8, "end": 14, "value": "indian",
                 "entity": "cuisine", "extractor": "ner_crf"}
            ],
            "spacy_doc": spacy_nlp("central indian restaurant")
        })]

    # uses BILOU and the default features
    ext.train(TrainingData(training_examples=examples), RasaNLUModelConfig())
    sentence = 'anywhere in the west'
    doc = {"spacy_doc": spacy_nlp(sentence)}
    crf_format = ext._from_text_to_crf(Message(sentence, doc))
    assert [word[0] for word in crf_format] == ['anywhere', 'in', 'the', 'west']
    feats = ext._sentence_to_features(crf_format)
    assert 'BOS' in feats[0]
    assert 'EOS' in feats[-1]
    assert feats[1]['0:low'] == "in"
    sentence = 'anywhere in the west'
    ext.extract_entities(Message(sentence, {"spacy_doc": spacy_nlp(sentence)}))
    filtered = ext.filter_trainable_entities(examples)
    assert filtered[0].get('entities') == [
        {"start": 16, "end": 20, "value": "west", "entity": "location"}
    ], 'Entity without extractor remains'
    assert filtered[1].get('entities') == [
        {"start": 8, "end": 14,
         "value": "indian", "entity": "cuisine", "extractor": "ner_crf"}
    ], 'Only ner_crf entity annotation remains'
    assert examples[1].get('entities')[0] == {
        "start": 0, "end": 7,
        "value": "central", "entity": "location",
        "extractor": "random_extractor"
    }, 'Original examples are not mutated'
Example #22
def test_count_vector_featurizer(sentence, expected):
    from rasa_nlu.featurizers.count_vectors_featurizer import \
        CountVectorsFeaturizer

    ftr = CountVectorsFeaturizer({"min_ngram": 1,
                                  "max_ngram": 2,
                                  "analyzer": 'char'})
    train_message = Message(sentence)
    # this is needed for a valid training example
    train_message.set("intent", "bla")
    data = TrainingData([train_message])
    ftr.train(data)

    test_message = Message(sentence)
    ftr.process(test_message)

    assert np.all(test_message.get("text_features") == expected)
Example #23
def test_count_vector_featurizer_oov_token(sentence, expected):
    from rasa_nlu.featurizers.count_vectors_featurizer import \
        CountVectorsFeaturizer

    ftr = CountVectorsFeaturizer({
        "token_pattern": r'(?u)\b\w+\b',
        "OOV_token": '__oov__'
    })
    train_message = Message(sentence)
    train_message.set("intent", "bla")
    data = TrainingData([train_message])
    ftr.train(data)

    test_message = Message(sentence)
    ftr.process(test_message)

    assert np.all(test_message.get("text_features") == expected)
Example #24
def load_luis_data(filename):
    # type: (Text) -> TrainingData
    """Loads training data stored in the LUIS.ai data format."""

    training_examples = []
    regex_features = []

    with io.open(filename, encoding="utf-8-sig") as f:
        data = json.loads(f.read())

    # Simple check to ensure we support this luis data schema version
    if not data["luis_schema_version"].startswith("2"):
        raise Exception(
            "Invalid luis data schema version {}, should be 2.x.x. ".format(
                data["luis_schema_version"]) +
            "Make sure to use the latest luis version (e.g. by downloading your data again)."
        )

    for r in data.get("regex_features", []):
        if r.get("activated", False):
            regex_features.append({
                "name": r.get("name"),
                "pattern": r.get("pattern")
            })

    for s in data["utterances"]:
        text = s.get("text")
        intent = s.get("intent")
        entities = []
        for e in s.get("entities") or []:
            start, end = e["startPos"], e["endPos"] + 1
            val = text[start:end]
            entities.append({
                "entity": e["entity"],
                "value": val,
                "start": start,
                "end": end
            })

        data = {"entities": entities}
        if intent:
            data["intent"] = intent
        training_examples.append(Message(text, data))
    return TrainingData(training_examples, regex_features=regex_features)
Example #25
def load_data(resource_name: Text,
              language: Optional[Text] = 'en') -> 'TrainingData':
    """Load training data from disk.

    Merges them if loaded from disk and multiple files are found."""
    from rasa_nlu.training_data import TrainingData

    files = utils.list_files(resource_name)
    data_sets = [_load(f, language) for f in files]
    data_sets = [ds for ds in data_sets if ds]
    if len(data_sets) == 0:
        training_data = TrainingData()
    elif len(data_sets) == 1:
        training_data = data_sets[0]
    else:
        training_data = data_sets[0].merge(*data_sets[1:])

    training_data.validate()
    return training_data
Example #26
    def read_from_json(self, js: Dict[Text, Any],
                       **kwargs: Any) -> 'TrainingData':
        """Loads training data stored in the LUIS.ai data format."""
        from rasa_nlu.training_data import Message, TrainingData

        training_examples = []
        regex_features = []

        # Simple check to ensure we support this luis data schema version
        if not js["luis_schema_version"].startswith("2"):
            raise Exception("Invalid luis data schema version {}, "
                            "should be 2.x.x. "
                            "Make sure to use the latest luis version "
                            "(e.g. by downloading your data again)."
                            "".format(js["luis_schema_version"]))

        for r in js.get("regex_features", []):
            if r.get("activated", False):
                regex_features.append({
                    "name": r.get("name"),
                    "pattern": r.get("pattern")
                })

        for s in js["utterances"]:
            text = s.get("text")
            intent = s.get("intent")
            entities = []
            for e in s.get("entities") or []:
                start, end = e["startPos"], e["endPos"] + 1
                val = text[start:end]
                entities.append({
                    "entity": e["entity"],
                    "value": val,
                    "start": start,
                    "end": end
                })

            data = {"entities": entities}
            if intent:
                data["intent"] = intent
            training_examples.append(Message(text, data))
        return TrainingData(training_examples, regex_features=regex_features)
Example #27
def load_wit_data(filename):
    # type: (Text) -> TrainingData
    """Loads training data stored in the WIT.ai data format."""

    intent_examples = []
    entity_examples = []
    common_examples = []

    with io.open(filename, encoding="utf-8-sig") as f:
        data = json.loads(f.read())
    for s in data["data"]:
        entities = s.get("entities")
        if entities is None:
            continue
        text = s.get("text")
        intents = [e["value"] for e in entities if e["entity"] == 'intent']
        intent = intents[0].strip("\"") if intents else None

        entities = [
            e for e in entities
            if ("start" in e and "end" in e and e["entity"] != 'intent')
        ]
        for e in entities:
            e["value"] = e["value"].strip(
                "\""
            )  # for some reason wit adds additional quotes around entity values

        if intent and entities:
            common_examples.append({
                "text": text,
                "intent": intent,
                "entities": entities
            })
        elif intent:
            intent_examples.append({"text": text, "intent": intent})
        elif entities:
            entity_examples.append({
                "text": text,
                "intent": intent,
                "entities": entities
            })
    return TrainingData(intent_examples, entity_examples, common_examples)
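A sketch of the WIT.ai export shape this loader consumes (values illustrative). The intent arrives as just another entity with entity == 'intent' and quoted values, which is why it is extracted first and the quotes stripped.

# Minimal WIT.ai payload (illustrative values).
wit_export = {
    "data": [{
        "text": "set an alarm for 6am",
        "entities": [
            {"entity": "intent", "value": "\"set_alarm\""},
            {"entity": "time", "value": "\"6am\"", "start": 17, "end": 20}
        ]
    }]
}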
Example #28
def test_count_vector_featurizer_using_tokens(tokens, expected):
    from rasa_nlu.featurizers.count_vectors_featurizer import \
        CountVectorsFeaturizer

    ftr = CountVectorsFeaturizer({"token_pattern": r'(?u)\b\w+\b'})

    # use an empty string instead of a real text string to make sure the
    # count vector can only come from the `tokens` feature; relying on
    # `message.text` would not give the correct result
    message = Message("")

    tokens_feature = [Token(i, 0) for i in tokens]
    message.set("tokens", tokens_feature)
    message.set("intent", "bla")  # this is needed for a valid training example

    data = TrainingData([message])

    ftr.train(data)
    ftr.process(message)

    assert np.all(message.get("text_features")[0] == expected)
Example #29
def load_rasa_data(filename):
    # type: (Text) -> TrainingData
    """Loads training data stored in the rasa NLU data format."""

    with io.open(filename, encoding="utf-8-sig") as f:
        data = json.loads(f.read())
    validate_rasa_nlu_data(data)

    common = data['rasa_nlu_data'].get("common_examples", list())
    intent = data['rasa_nlu_data'].get("intent_examples", list())
    entity = data['rasa_nlu_data'].get("entity_examples", list())

    if intent or entity:
        logger.warning(
            "DEPRECATION warning: Data file contains 'intent_examples' "
            "or 'entity_examples' which will be removed in the future. "
            "Consider putting all your examples into the "
            "'common_examples' section.")

    all_examples = common + intent + entity
    return TrainingData(all_examples)
Example #30
    def read(self, fn: Text, **kwargs: Any) -> 'TrainingData':
        """Loads training data stored in the Dialogflow data format."""
        from rasa_nlu.training_data import TrainingData

        language = kwargs["language"]
        fformat = kwargs["fformat"]

        if fformat not in {DIALOGFLOW_INTENT, DIALOGFLOW_ENTITIES}:
            raise ValueError("fformat must be either {}, or {}"
                             "".format(DIALOGFLOW_INTENT, DIALOGFLOW_ENTITIES))

        root_js = utils.read_json_file(fn)
        examples_js = self._read_examples_js(fn, language, fformat)

        if not examples_js:
            logger.warning("No training examples found for dialogflow file {}!"
                           "".format(fn))
            return TrainingData()
        elif fformat == DIALOGFLOW_INTENT:
            return self._read_intent(root_js, examples_js)
        elif fformat == DIALOGFLOW_ENTITIES:
            return self._read_entities(root_js, examples_js)