Example #1
def _write_nlu_to_file(
    export_nlu_path: Text,
    evts: List[Dict[Text, Any]]
) -> None:
    """Write the nlu data of the sender_id to the file paths."""

    msgs = _collect_messages(evts)

    # noinspection PyBroadException
    try:
        previous_examples = load_data(export_nlu_path)
    except Exception as e:
        logger.exception("An exception occurred while trying to load the "
                         "NLU data.")

        export_nlu_path = questionary.text(
            message="Could not load existing NLU data, please "
                    "specify where to store NLU data learned in "
                    "this session (this will overwrite any "
                    "existing file). {}".format(str(e)),
            default=PATHS["backup"]).ask()

        if export_nlu_path is None:
            return

        previous_examples = TrainingData()

    nlu_data = previous_examples.merge(TrainingData(msgs))

    with io.open(export_nlu_path, 'w', encoding="utf-8") as f:
        if _guess_format(export_nlu_path) in {"md", "unk"}:
            f.write(nlu_data.as_markdown())
        else:
            f.write(nlu_data.as_json())
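The final branch falls back to Markdown whenever the target format is "md" or unknown, and writes JSON otherwise. Purely as an illustration of that decision, here is a hypothetical extension-based guesser; the real _guess_format in Rasa inspects file contents, so treat this as a sketch only:

def guess_format_by_extension(path):
    # hypothetical stand-in for _guess_format: decide by file extension alone
    if path.endswith(".md"):
        return "md"
    if path.endswith(".json"):
        return "json"
    return "unk"

assert guess_format_by_extension("nlu.md") == "md"
assert guess_format_by_extension("nlu.json") == "json"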
Example #2
def _write_nlu_to_file(
    export_nlu_path: Text,
    evts: List[Dict[Text, Any]]
) -> None:
    """Write the nlu data of the sender_id to the file paths."""

    msgs = _collect_messages(evts)

    # noinspection PyBroadException
    try:
        previous_examples = load_data(export_nlu_path)

    except Exception:
        questions = [{"name": "export nlu",
                      "type": "input",
                      "message": "Could not load existing NLU data, please "
                                 "specify where to store NLU data learned in "
                                 "this session (this will overwrite any "
                                 "existing file)",
                      "default": PATHS["backup"]}]

        answers = prompt(questions)
        export_nlu_path = answers["export nlu"]
        previous_examples = TrainingData()

    nlu_data = previous_examples.merge(TrainingData(msgs))

    with io.open(export_nlu_path, 'w', encoding="utf-8") as f:
        if _guess_format(export_nlu_path) in {"md", "unk"}:
            f.write(nlu_data.as_markdown())
        else:
            f.write(nlu_data.as_json())
Example #3
def test_repeated_entities():
    data = u"""
{
  "rasa_nlu_data": {
    "common_examples" : [
      {
        "text": "book a table today from 3 to 6 for 3 people",
        "intent": "unk",
        "entities": [
          {
            "entity": "description",
            "start": 35,
            "end": 36,
            "value": 3
          }
        ]
      }
    ]
  }
}"""
    with tempfile.NamedTemporaryFile(suffix="_tmp_training_data.json") as f:
        f.write(data.encode("utf-8"))
        f.flush()
        td = TrainingData(f.name, 'mitie', 'en')
        assert len(td.entity_examples) == 1
        example = td.entity_examples[0]
        entities = example["entities"]
        assert len(entities) == 1
        start, end = mitie_trainer_utils.find_entity(entities[0], example["text"])
        assert start == 9
        assert end == 10
Example #4
def test_nonascii_entities():
    data = u"""
{
  "luis_schema_version": "1.0",
  "utterances" : [
    {
      "text": "I am looking for a ßäæ ?€ö) item",
      "intent": "unk",
      "entities": [
        {
          "entity": "description",
          "startPos": 5,
          "endPos": 8
        }
      ]
    }
  ]
}"""
    with tempfile.NamedTemporaryFile(suffix="_tmp_training_data.json") as f:
        f.write(data.encode("utf-8"))
        f.flush()
        td = TrainingData(f.name, 'mitie', 'en')
        assert len(td.entity_examples) == 1
        example = td.entity_examples[0]
        entities = example["entities"]
        assert len(entities) == 1
        entity = entities[0]
        assert entity["value"] == u"ßäæ ?€ö)"
        assert entity["start"] == 19
        assert entity["end"] == 27
        assert entity["entity"] == "description"
Example #5
def load_train_data(data):
    validate_rasa_nlu_data(data)

    common = data['rasa_nlu_data'].get("common_examples", list())
    intent = data['rasa_nlu_data'].get("intent_examples", list())
    entity = data['rasa_nlu_data'].get("entity_examples", list())
    regex_features = data['rasa_nlu_data'].get("regex_features", list())
    synonyms = data['rasa_nlu_data'].get("entity_synonyms", list())

    entity_synonyms = get_entity_synonyms_dict(synonyms)

    if intent or entity:
        logger.warning(
            "DEPRECATION warning: Data file contains 'intent_examples' "
            "or 'entity_examples' which will be removed in the future. "
            "Consider putting all your examples into the "
            "'common_examples' section.")

    all_examples = common + intent + entity
    training_examples = []
    for e in all_examples:
        data = {}
        if e.get("intent"):
            data["intent"] = e["intent"]
        if e.get("entities") is not None:
            data["entities"] = e["entities"]
        training_examples.append(Message(e["text"], data))

    return TrainingData(training_examples, entity_synonyms, regex_features)
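For reference, a minimal in-memory payload in the shape load_train_data consumes might look like this (field names taken from the snippet above; the call itself is only sketched, since it depends on helpers from the surrounding module):

sample = {
    "rasa_nlu_data": {
        "common_examples": [
            {"text": "book a table for two",
             "intent": "restaurant_search",
             "entities": []}
        ],
        "entity_synonyms": [],
        "regex_features": []
    }
}
# td = load_train_data(sample)  # assuming validate_rasa_nlu_data et al. are in scope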
Example #6
def load_wit_data(filename):
    # type: (str) -> TrainingData
    """Loads training data stored in the WIT.ai data format."""

    intent_examples = []
    entity_examples = []
    common_examples = []

    with io.open(filename, encoding="utf-8-sig") as f:
        data = json.loads(f.read())
    for s in data["data"]:
        entities = s.get("entities")
        if entities is None:
            continue
        text = s.get("text")
        intents = [e["value"] for e in entities if e["entity"] == 'intent']
        intent = intents[0] if intents else None

        entities = [e for e in entities if ("start" in e and "end" in e)]
        for e in entities:
            e["value"] = e["value"][1:-1]

        if intent and entities:
            common_examples.append({"text": text, "intent": intent, "entities": entities})
        elif intent:
            intent_examples.append({"text": text, "intent": intent})
        elif entities:
            entity_examples.append({"text": text, "intent": intent, "entities": entities})
    return TrainingData(intent_examples, entity_examples, common_examples)
Example #7
def test_crf_extractor(spacy_nlp):
    from rasa_nlu.extractors.crf_entity_extractor import CRFEntityExtractor
    ext = CRFEntityExtractor()
    examples = [
        Message("anywhere in the west", {
            "intent": "restaurant_search",
            "entities": [{"start": 16, "end": 20, "value": "west", "entity": "location"}],
            "spacy_doc": spacy_nlp("anywhere in the west")
        }),
        Message("central indian restaurant", {
            "intent": "restaurant_search",
            "entities": [{"start": 0, "end": 7, "value": "central", "entity": "location"}],
            "spacy_doc": spacy_nlp("central indian restaurant")
        })]
    config = {"ner_crf": {"BILOU_flag": True, "features": ext.crf_features}}
    ext.train(TrainingData(training_examples=examples), config)
    sentence = 'anywhere in the west'
    crf_format = ext._from_text_to_crf(Message(sentence, {"spacy_doc": spacy_nlp(sentence)}))
    assert [word[0] for word in crf_format] == ['anywhere', 'in', 'the', 'west']
    feats = ext._sentence_to_features(crf_format)
    assert 'BOS' in feats[0]
    assert 'EOS' in feats[-1]
    assert feats[1]['0:low'] == "in"
    sentence = 'anywhere in the west'
    ext.extract_entities(Message(sentence, {"spacy_doc": spacy_nlp(sentence)}))
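The BILOU_flag in the config controls how multi-token entities are tagged during CRF training. Purely as an illustration of the scheme (Begin/Inside/Last/Outside/Unit), not of Rasa's internal API:

# BILOU tags for a made-up utterance with a multi-token "location" entity
tokens = ["show", "me", "flights", "to", "new", "york", "city"]
tags = ["O", "O", "O", "O", "B-location", "I-location", "L-location"]
assert len(tokens) == len(tags)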
Example #8
def load_data(filename):
    # type: (Text) -> TrainingData
    """Loads training data stored in the rasa NLU data format."""

    with io.open(filename, encoding="utf-8-sig") as f:
        data = json.loads(f.read())

    common = data['rasa_nlu_data'].get("common_examples", list())
    intent = data['rasa_nlu_data'].get("intent_examples", list())
    entity = data['rasa_nlu_data'].get("entity_examples", list())
    regex_features = data['rasa_nlu_data'].get("regex_features", list())
    synonyms = data['rasa_nlu_data'].get("entity_synonyms", list())
    entity_synonyms = get_entity_synonyms_dict(synonyms)

    all_examples = common + intent + entity
    training_examples = []
    for e in all_examples:
        data = e.copy()
        if "text" in data:
            del data["text"]
        training_examples.append(Message(e["text"], data))

    return TrainingData(training_examples, entity_synonyms, regex_features)
Example #9
def test_multiword_entities():
    data = u"""
{
  "rasa_nlu_data": {
    "common_examples" : [
      {
        "text": "show me flights to New York City",
        "intent": "unk",
        "entities": [
          {
            "entity": "destination",
            "start": 19,
            "end": 32,
            "value": "New York City"
          }
        ]
      }
    ]
  }
}"""
    filename = 'tmp_training_data.json'
    with open(filename, 'wb') as f:
        f.write(data.encode("utf-8"))
    td = TrainingData(filename, 'mitie', 'en')
    assert len(td.entity_examples) == 1
    example = td.entity_examples[0]
    entities = example["entities"]
    assert len(entities) == 1
    start, end = MITIETrainer.find_entity(entities[0], example["text"])
    assert start == 4
    assert end == 7
Example #10
def test_repeated_entities():
    data = u"""
{
  "rasa_nlu_data": {
    "common_examples" : [
      {
        "text": "book a table today from 3 to 6 for 3 people",
        "intent": "unk",
        "entities": [
          {
            "entity": "description",
            "start": 35,
            "end": 36,
            "value": 3
          }
        ]
      }
    ]
  }
}"""
    filename = 'tmp_training_data.json'
    with open(filename, 'wb') as f:
        f.write(data.encode("utf-8"))
    td = TrainingData(filename, 'mitie', 'en')
    assert len(td.entity_examples) == 1
    example = td.entity_examples[0]
    entities = example["entities"]
    assert len(entities) == 1
    start, end = MITIETrainer.find_entity(entities[0], example["text"])
    assert start == 9
    assert end == 10
Example #11
def load_rasa_data(filename):
    # type: (Text) -> TrainingData
    """Loads training data stored in the rasa NLU data format."""

    with io.open(filename, encoding="utf-8-sig") as f:
        data = json.loads(f.read())
    validate_rasa_nlu_data(data)

    common = data['rasa_nlu_data'].get("common_examples", list())
    intent = data['rasa_nlu_data'].get("intent_examples", list())
    entity = data['rasa_nlu_data'].get("entity_examples", list())
    synonyms = data['rasa_nlu_data'].get("entity_synonyms", list())

    # build entity_synonyms dictionary
    entity_synonyms = {}
    for s in synonyms:
        if "value" in s and "synonyms" in s:
            for synonym in s["synonyms"]:
                entity_synonyms[synonym] = s["value"]

    if intent or entity:
        logger.warning(
            "DEPRECATION warning: Data file contains 'intent_examples' "
            "or 'entity_examples' which will be removed in the future. "
            "Consider putting all your examples into the "
            "'common_examples' section.")

    all_examples = common + intent + entity
    return TrainingData(all_examples, entity_synonyms)
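The synonym loop above inverts the list-of-synonym-groups format into a flat lookup from surface form to canonical value; the same logic runs standalone (sample values borrowed from the synonyms test further down):

synonyms = [{"value": "Mexican", "synonyms": ["mexican", "tacos"]}]
entity_synonyms = {}
for s in synonyms:
    if "value" in s and "synonyms" in s:
        for synonym in s["synonyms"]:
            entity_synonyms[synonym] = s["value"]
assert entity_synonyms == {"mexican": "Mexican", "tacos": "Mexican"}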
Example #12
def load_wit_data(filename):
    # type: (Text) -> TrainingData
    """Loads training data stored in the WIT.ai data format."""

    training_examples = []

    data = _read_json_from_file(filename)
    for s in data["data"]:
        entities = s.get("entities")
        if entities is None:
            continue
        text = s.get("text")
        intents = [e["value"] for e in entities if e["entity"] == 'intent']
        intent = intents[0].strip("\"") if intents else None

        entities = [e
                    for e in entities
                    if ("start" in e and "end" in e and
                        e["entity"] != 'intent')]
        for e in entities:
            # for some reason wit adds additional quotes around entity values
            e["value"] = e["value"].strip("\"")

        data = {}
        if intent:
            data["intent"] = intent
        if entities is not None:
            data["entities"] = entities
        training_examples.append(Message(text, data))
    return TrainingData(training_examples)
Example #13
def test_multiword_entities():
    data = u"""
{
  "rasa_nlu_data": {
    "common_examples" : [
      {
        "text": "show me flights to New York City",
        "intent": "unk",
        "entities": [
          {
            "entity": "destination",
            "start": 19,
            "end": 32,
            "value": "New York City"
          }
        ]
      }
    ]
  }
}"""
    with tempfile.NamedTemporaryFile(suffix="_tmp_training_data.json") as f:
        f.write(data.encode("utf-8"))
        f.flush()
        td = TrainingData(f.name, 'mitie', 'en')
        assert len(td.entity_examples) == 1
        example = td.entity_examples[0]
        entities = example["entities"]
        assert len(entities) == 1
        start, end = mitie_trainer_utils.find_entity(entities[0], example["text"])
        assert start == 4
        assert end == 7
Example #14
    def do_GET(self):
        self._set_headers()
        data_file = sys.argv[1]
        training_data = TrainingData(data_file, 'mitie', 'en')
        data = create_html(training_data)
        self.wfile.write(data.encode('utf-8'))
        return
Example #15
    def read_from_json(self, js, **kwargs):
        """Loads training data stored in the rasa NLU data format."""
        validate_rasa_nlu_data(js)

        data = js['rasa_nlu_data']
        common_examples = data.get("common_examples", [])
        intent_examples = data.get("intent_examples", [])
        entity_examples = data.get("entity_examples", [])
        entity_synonyms = data.get("entity_synonyms", [])
        regex_features = data.get("regex_features", [])
        lookup_tables = data.get("lookup_tables", [])

        entity_synonyms = transform_entity_synonyms(entity_synonyms)

        if intent_examples or entity_examples:
            logger.warning("DEPRECATION warning: your rasa data "
                           "contains 'intent_examples' "
                           "or 'entity_examples' which will be "
                           "removed in the future. Consider "
                           "putting all your examples "
                           "into the 'common_examples' section.")

        all_examples = common_examples + intent_examples + entity_examples
        training_examples = []
        for ex in all_examples:
            msg = Message.build(ex['text'], ex.get("intent"),
                                ex.get("entities"))
            training_examples.append(msg)

        return TrainingData(training_examples, entity_synonyms, regex_features,
                            lookup_tables)
Example #16
def load_rasa_data(filename):
    # type: (Text) -> TrainingData
    """Loads training data stored in the rasa NLU data format."""

    data = _read_json_from_file(filename)
    validate_rasa_nlu_data(data)

    common = data['rasa_nlu_data'].get("common_examples", list())
    intent = data['rasa_nlu_data'].get("intent_examples", list())
    entity = data['rasa_nlu_data'].get("entity_examples", list())
    regex_features = data['rasa_nlu_data'].get("regex_features", list())
    synonyms = data['rasa_nlu_data'].get("entity_synonyms", list())

    entity_synonyms = get_entity_synonyms_dict(synonyms)

    if intent or entity:
        logger.warn("DEPRECATION warning: Data file contains 'intent_examples' "
                    "or 'entity_examples' which will be "
                    "removed in the future. Consider putting all your examples "
                    "into the 'common_examples' section.")

    all_examples = common + intent + entity
    training_examples = []
    for e in all_examples:
        data = e.copy()
        if "text" in data:
            del data["text"]
        training_examples.append(Message(e["text"], data))

    return TrainingData(training_examples, entity_synonyms, regex_features)
Example #17
def load_wit_data(filename):
    # type: (Text) -> TrainingData
    """Loads training data stored in the WIT.ai data format."""

    training_examples = []

    with io.open(filename, encoding="utf-8-sig") as f:
        data = json.loads(f.read())
    for s in data["data"]:
        entities = s.get("entities")
        if entities is None:
            continue
        text = s.get("text")
        intents = [e["value"] for e in entities if e["entity"] == 'intent']
        intent = intents[0].strip("\"") if intents else None

        entities = [
            e for e in entities
            if ("start" in e and "end" in e and e["entity"] != 'intent')
        ]
        for e in entities:
            # for some reason wit adds additional quotes around entity values
            e["value"] = e["value"].strip("\"")

        data = {"text": text}
        if intent:
            data["intent"] = intent
        if entities:
            data["entities"] = entities
        training_examples.append(data)
    return TrainingData(training_examples)
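wit.ai encodes the intent as just another entity and wraps values in literal quotes, which is why the loader filters on e["entity"] == 'intent' and calls .strip('"'). The core of that extraction, runnable on its own with made-up sample data:

entities = [
    {"entity": "intent", "value": "\"restaurant_search\""},
    {"entity": "cuisine", "value": "\"mexican\"", "start": 7, "end": 14},
]
intents = [e["value"] for e in entities if e["entity"] == 'intent']
intent = intents[0].strip("\"") if intents else None
assert intent == "restaurant_search"
assert entities[1]["value"].strip("\"") == "mexican"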
Example #18
def test_count_vector_featurizer_using_tokens(tokens, expected):
    from rasa_nlu.featurizers.count_vectors_featurizer import \
        CountVectorsFeaturizer

    ftr = CountVectorsFeaturizer({"token_pattern": r'(?u)\b\w+\b'})

    # use an empty string instead of real text to make sure the count
    # vector can only come from the `tokens` feature;
    # using `message.text` would not give the correct result

    tokens_feature = [Token(i, 0) for i in tokens]

    train_message = Message("")
    train_message.set("tokens", tokens_feature)
    # this is needed for a valid training example
    train_message.set("intent", "bla")
    data = TrainingData([train_message])

    ftr.train(data)

    test_message = Message("")
    test_message.set("tokens", tokens_feature)

    ftr.process(test_message)

    assert np.all(test_message.get("text_features") == expected)
Example #19
def load_markdown_data(filename):
    # type: (Text) -> TrainingData
    """Loads training data stored in markdown data format."""
    from rasa_nlu.utils.md_to_json import MarkdownToJson
    data = MarkdownToJson(filename)
    return TrainingData(data.common_examples,
                        get_entity_synonyms_dict(data.entity_synonyms))
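A small sample of the Markdown training format that MarkdownToJson parses, shown here as a string for orientation (standard Rasa NLU Markdown; the exact feature set depends on the rasa_nlu version):

nlu_md = u"""
## intent:restaurant_search
- show me [mexican](cuisine) restaurants
- anywhere in the [west](location)

## synonym:mexican
- tacos
"""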
Example #20
def test_unintentional_synonyms_capitalized(component_builder):
    _config = utilities.base_test_conf("all_components")
    ner_syn = component_builder.create_component("ner_synonyms", _config)
    examples = [
        Message("Any Mexican restaurant will do", {
            "intent": "restaurant_search",
            "entities": [{"start": 4, "end": 11,
                          "value": "Mexican", "entity": "cuisine"}]
        }),
        Message("I want Tacos!", {
            "intent": "restaurant_search",
            "entities": [{"start": 7, "end": 12,
                          "value": "Mexican", "entity": "cuisine"}]
        })
    ]
    ner_syn.train(TrainingData(training_examples=examples), _config)
    assert ner_syn.synonyms.get("mexican") is None
    assert ner_syn.synonyms.get("tacos") == "Mexican"
Example #21
def load_rasa_data(filename):
    # type: (Text) -> TrainingData
    """Loads training data stored in the rasa NLU data format."""

    with io.open(filename, encoding="utf-8-sig") as f:
        data = json.loads(f.read())
    validate_rasa_nlu_data(data)

    common = data['rasa_nlu_data'].get("common_examples", list())
    intent = data['rasa_nlu_data'].get("intent_examples", list())
    entity = data['rasa_nlu_data'].get("entity_examples", list())
    regex_features = data['rasa_nlu_data'].get("regex_features", list())
    synonyms = data['rasa_nlu_data'].get("entity_synonyms", list())

    entity_synonyms = get_entity_synonyms_dict(synonyms)

    if intent or entity:
        logger.warning(
            "DEPRECATION warning: Data file contains 'intent_examples' "
            "or 'entity_examples' which will be removed in the future. "
            "Consider putting all your examples into the "
            "'common_examples' section.")

    all_examples = common + intent + entity
    training_examples = []
    for e in all_examples:
        data = {}
        if e.get("intent"):
            data["intent"] = e["intent"]
        if e.get("entities") is not None:
            data["entities"] = e["entities"]
        training_examples.append(Message(e["text"], data))

    return TrainingData(training_examples, entity_synonyms, regex_features)
Example #22
    def read_from_json(self, js: Dict[Text, Any], **kwargs: Any):
        """Loads training data stored in the WIT.ai data format."""
        from rasa_nlu.training_data import Message, TrainingData

        training_examples = []

        for s in js["data"]:
            entities = s.get("entities")
            if entities is None:
                continue
            text = s.get("text")
            intents = [e["value"] for e in entities if e["entity"] == 'intent']
            intent = intents[0].strip("\"") if intents else None

            entities = [
                e for e in entities
                if ("start" in e and "end" in e and e["entity"] != 'intent')
            ]
            for e in entities:
                # for some reason wit adds additional quotes around entities
                e["value"] = e["value"].strip("\"")

            data = {}
            if intent:
                data["intent"] = intent
            if entities is not None:
                data["entities"] = entities
            training_examples.append(Message(text, data))
        return TrainingData(training_examples)
Example #23
def load_api_data(files):
    # type: ([str]) -> TrainingData
    """Loads training data stored in the API.ai data format."""

    intent_examples = []
    entity_examples = []
    common_examples = []
    entity_synonyms = {}
    for filename in files:
        with io.open(filename, encoding="utf-8-sig") as f:
            data = json.loads(f.read())
        # get only intents, skip the rest. The property name is the target class
        if "userSays" in data:
            intent = data.get("name")
            for s in data["userSays"]:
                text = "".join([chunk["text"] for chunk in s.get("data")])
                # add entities to each token, if available
                entities = []
                for e in [chunk for chunk in s.get("data")
                          if "alias" in chunk or "meta" in chunk]:
                    start = text.find(e["text"])
                    end = start + len(e["text"])
                    val = text[start:end]
                    entities.append({
                        "entity": e["alias"] if "alias" in e else e["meta"],
                        "value": val,
                        "start": start,
                        "end": end
                    })

                if intent and entities:
                    common_examples.append({
                        "text": text,
                        "intent": intent,
                        "entities": entities
                    })
                elif intent:
                    intent_examples.append({"text": text, "intent": intent})
                elif entities:
                    entity_examples.append({
                        "text": text,
                        "intent": intent,
                        "entities": entities
                    })

        # create synonyms dictionary
        if "name" in data and "entries" in data:
            for entry in data["entries"]:
                if "value" in entry and "synonyms" in entry:
                    for synonym in entry["synonyms"]:
                        entity_synonyms[synonym] = entry["value"]
    return TrainingData(intent_examples, entity_examples, common_examples,
                        entity_synonyms)
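Entity offsets in this loader come from plain substring search, so the first occurrence of the chunk text wins. The arithmetic in isolation, with made-up sample text:

text = "book a flight to Berlin"
chunk_text = "Berlin"
start = text.find(chunk_text)      # -1 would mean the chunk is not a substring
end = start + len(chunk_text)
assert (start, end) == (17, 23)
assert text[start:end] == chunk_text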
Example #24
def do_train(config):
    trainer = create_trainer(config)

    persistor = create_persistor(config)

    training_data = TrainingData(config.data, config.backend, config.language)
    trainer.train(training_data)
    trainer.persist(config.path, persistor)
Example #25
    def _read_entities(entity_js, examples_js):
        from rasa_nlu.training_data import TrainingData

        entity_synonyms = transform_entity_synonyms(examples_js)

        name = entity_js.get("name")
        lookup_tables = DialogflowReader._extract_lookup_tables(
            name, examples_js)
        return TrainingData([], entity_synonyms, [], lookup_tables)
Example #26
def load_data(resource_name, language='en'):
    # type: (Text, Optional[Text]) -> TrainingData
    """Load training data from disk.

    Merges them if loaded from disk and multiple files are found."""

    files = utils.list_files(resource_name)
    data_sets = [_load(f, language) for f in files]
    data_sets = [ds for ds in data_sets if ds]
    if len(data_sets) == 0:
        training_data = TrainingData()
    elif len(data_sets) == 1:
        training_data = data_sets[0]
    else:
        training_data = data_sets[0].merge(*data_sets[1:])

    training_data.validate()
    return training_data
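The fold at the end combines an arbitrary number of data sets through a single merge(*rest) call. The pattern with a stand-in class (only a sketch; TrainingData.merge itself also reconciles synonyms and regex features):

class ToyData(object):
    def __init__(self, examples):
        self.examples = list(examples)

    def merge(self, *others):
        # concatenate this data set's examples with every other data set's
        merged = list(self.examples)
        for other in others:
            merged.extend(other.examples)
        return ToyData(merged)

data_sets = [ToyData([1]), ToyData([2]), ToyData([3])]
combined = data_sets[0].merge(*data_sets[1:])
assert combined.examples == [1, 2, 3]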
Example #27
    def _read_intent(self, intent_js, examples_js):
        """Reads the intent and examples from respective jsons."""
        intent = intent_js.get("name")

        training_examples = []
        for ex in examples_js:
            text, entities = self._join_text_chunks(ex['data'])
            training_examples.append(Message.build(text, intent, entities))

        return TrainingData(training_examples)
Example #28
def test_train_with_empty_data(component_builder):
    _config = utilities.base_test_conf("all_components")
    trainer = Trainer(_config, component_builder)
    trainer.train(TrainingData())
    persistor = create_persistor(_config)
    persisted_path = trainer.persist(_config['path'], persistor, project_name=_config['name'])
    loaded = utilities.load_interpreter_for_model(_config, persisted_path, component_builder)
    assert loaded.pipeline
    assert loaded.parse("hello") is not None
    assert loaded.parse("Hello today is Monday, again!") is not None
Example #29
def test_train_with_empty_data(language, pipeline, component_builder, tmpdir):
    _config = RasaNLUModelConfig({"pipeline": pipeline, "language": language})
    trainer = Trainer(_config, component_builder)
    trainer.train(TrainingData())
    persistor = create_persistor(_config)
    persisted_path = trainer.persist(tmpdir.strpath, persistor,
                                     project_name="my_project")
    loaded = Interpreter.load(persisted_path, component_builder)
    assert loaded.pipeline
    assert loaded.parse("hello") is not None
    assert loaded.parse("Hello today is Monday, again!") is not None
Example #30
def do_train(config):
    """Loads the trainer and the data and runs the training of the specified model."""

    trainer = create_trainer(config)

    persistor = create_persistor(config)

    training_data = TrainingData(config.data, config.backend, config.language)
    trainer.train(training_data)
    trainer.persist(config.path, persistor)
    return trainer
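A hypothetical invocation, assuming a config object exposing the attributes the function reads (data, backend, language, path); the class name and file paths below are placeholders, not the real rasa_nlu config API:

class StubConfig(object):
    data = "data/demo-rasa.json"   # placeholder path to training data
    backend = "mitie"
    language = "en"
    path = "./models"

# trainer = do_train(StubConfig())  # would train the model and persist it to ./models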