Example #1
    def read_from_json(self, js, **kwargs):
        """Loads training data stored in the rasa NLU data format."""
        validate_rasa_nlu_data(js)

        data = js['rasa_nlu_data']
        common_examples = data.get("common_examples", [])
        intent_examples = data.get("intent_examples", [])
        entity_examples = data.get("entity_examples", [])
        entity_synonyms = data.get("entity_synonyms", [])
        regex_features = data.get("regex_features", [])
        regex_intent = data.get("regex_intent", [])

        entity_synonyms = transform_entity_synonyms(entity_synonyms)

        if intent_examples or entity_examples:
            logger.warn("DEPRECATION warning: your rasa data "
                        "contains 'intent_examples' "
                        "or 'entity_examples' which will be "
                        "removed in the future. Consider "
                        "putting all your examples "
                        "into the 'common_examples' section.")

        all_examples = common_examples + intent_examples + entity_examples
        training_examples = []
        for ex in all_examples:
            msg = Message.build(ex['text'], ex.get("intent"),
                                ex.get("entities"))
            training_examples.append(msg)

        return TrainingData(training_examples, entity_synonyms,
                            regex_features, regex_intent)
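
For reference, a minimal sketch of the JSON payload this reader expects; the field names come from the code above, while the concrete values are invented:

    js = {
        "rasa_nlu_data": {
            "common_examples": [
                {"text": "book a table for two",        # raw utterance
                 "intent": "request_restaurant",        # optional intent label
                 "entities": [{"start": 17, "end": 20,  # span of "two"
                               "value": "two",
                               "entity": "party_size"}]}
            ],
            "entity_synonyms": [],  # flattened by transform_entity_synonyms
            "regex_features": [],
            "regex_intent": []
        }
    }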
Example #2
    def train(self, training_data: TrainingData, config: RasaNLUModelConfig,
              **kwargs) -> None:
        self.component_config = config.for_component(self.name, self.defaults)

        _tag_schema = self.component_config.get("tag_schema", "BIO").upper()
        _embedding = self.component_config.get("embedding", "embedding")

        assert _tag_schema in ["BIO", "BIOES"], \
            "Only the `BIO` and `BIOES` tag schemas are supported"

        train_examples, dev_examples = training_data.train_test_split(
            train_frac=self.component_config.get("folds", 1.))

        if train_examples.entity_examples:
            filtered_entity_examples = self.filter_trainable_entities(
                train_examples.training_examples)
            filtered_dev_entity_examples = self.filter_trainable_entities(
                dev_examples.training_examples)

            # build features; the embedding and bert paths use different processors
            if _embedding == "embedding":
                self.processor = self._load_embedding_processor(
                    filtered_entity_examples, filtered_dev_entity_examples,
                    **self.component_config)
                self._train_embedding()

            elif _embedding == "bert":
                self.processor = self._load_bert_processor(
                    filtered_entity_examples, filtered_dev_entity_examples,
                    **self.component_config)
                self._train_bert()

            else:
                raise ValueError("Unknown processor for given `embedding`,"
                                 "only `embedding` or `bert` received")
Example #3
    def read_from_json(self, js, **kwargs):
        # type: (Text, Any) -> TrainingData
        """Loads training data stored in the WIT.ai data format."""

        training_examples = []

        for s in js["data"]:
            entities = s.get("entities")
            if entities is None:
                continue
            text = s.get("text")
            intents = [e["value"] for e in entities if e["entity"] == 'intent']
            intent = intents[0].strip("\"") if intents else None

            entities = [e
                        for e in entities
                        if ("start" in e and "end" in e and
                            e["entity"] != 'intent')]
            for e in entities:
                # WIT.ai wraps entity values in additional quotes; strip them
                e["value"] = e["value"].strip("\"")

            data = {}
            if intent:
                data["intent"] = intent
            if entities:
                data["entities"] = entities
            training_examples.append(Message(text, data))
        return TrainingData(training_examples)
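
The loop above implies a WIT.ai export of roughly the following shape, where the intent travels as a pseudo-entity and values arrive wrapped in extra quotes (values invented):

    js = {
        "data": [
            {"text": "I want to fly to Berlin",
             "entities": [
                 {"entity": "intent", "value": "\"book_flight\""},
                 {"entity": "location", "value": "\"Berlin\"",
                  "start": 17, "end": 23}
             ]}
        ]
    }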
Example #4
    def _read_intent(self, intent_js, examples_js):
        """Reads the intent and examples from respective jsons."""
        intent = intent_js.get("name")

        training_examples = []
        for ex in examples_js:
            text, entities = self._join_text_chunks(ex['data'])
            training_examples.append(Message.build(text, intent, entities))

        return TrainingData(training_examples)
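
`_join_text_chunks` is not shown here, but the `ex['data']` it consumes would be Dialogflow's chunked example text, roughly of this shape (a guess based on the Dialogflow export format; values invented):

    examples_js = [
        {"data": [
            {"text": "fly to "},
            {"text": "Berlin", "alias": "location", "meta": "@sys.geo-city"}
        ]}
    ]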
Example #5
    def prepare_dataset(self, training_data: TrainingData):
        """obtain training dataset and dev dataset features feed into model"""

        train_examples, test_examples = training_data.train_test_split(
            train_frac=self.folds)

        train_features, train_size = self._prepare_feature(train_examples)
        test_features, test_size = self._prepare_feature(test_examples)

        return train_features, test_features, train_size
Example #6
def load_data(resource_name, language='en'):
    # type: (Text, Optional[Text]) -> TrainingData
    """Load training data from disk.

    Merges them if loaded from disk and multiple files are found."""

    files = utils.list_files(resource_name)
    data_sets = [_load(f, language) for f in files]
    data_sets = [ds for ds in data_sets if ds]

    if len(data_sets) == 0:
        return TrainingData()
    elif len(data_sets) == 1:
        return data_sets[0]
    else:
        return data_sets[0].merge(*data_sets[1:])
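
A typical call site, assuming the module exposes this function as in older rasa_nlu releases (the import path and file layout are illustrative):

    from rasa_nlu.training_data import load_data

    training_data = load_data("data/nlu", language="en")
    print(len(training_data.training_examples))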
Example #7
    def train(self, training_data: TrainingData,
              config: Optional[RasaNLUModelConfig], **kwargs: Any) -> None:

        encoder, tokenizer = load_pretrained(mpath=self.pre_path,
                                             config=self.bert_config,
                                             model=self.bert_model)

        self._create_intent_dict(training_data)

        # if self.folds == 1.:
        #     train_examples, test_examples = training_data, TrainingData()
        # else:
        train_examples, test_examples = training_data.train_test_split(
            train_frac=self.folds)

        data_loader = NluClsDataLoader(
            message=train_examples.training_examples,
            tokenizer=tokenizer,
            max_len=self.max_seq_len,
            batch_size=self.batch_size,
            label_dict=self.int2idx)

        if test_examples.training_examples:
            test_data_loader = NluClsDataLoader(
                message=test_examples.training_examples,
                tokenizer=tokenizer,
                max_len=self.max_seq_len,
                batch_size=self.batch_size,
                label_dict=self.int2idx)
        else:
            test_data_loader = None

        train_pipeline = TrainingPipeLine(
            epochs=self.epochs,
            walking_epoch_visual=self.walking_epoch_visual,
            lr=self.lr,
            dropout=self.dropout,
            device=self.device,
            int2idx=self.int2idx,
            idx2int=self.idx2int)

        self.model = train_pipeline.train(encoder,
                                          data_loader=data_loader,
                                          test_loader=test_data_loader)
        self.eval_res = train_pipeline.eval_res
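
The instance attributes consumed above (`self.folds`, `self.max_seq_len`, `self.batch_size`, and so on) would typically be populated from the component configuration; a hedged sketch with invented values:

    defaults = {
        "folds": 0.8,        # train/test split fraction
        "max_seq_len": 128,  # tokenizer truncation length
        "batch_size": 32,
        "epochs": 5,
        "lr": 2e-5,
        "dropout": 0.1,
        "walking_epoch_visual": 1,  # assumed: evaluation/reporting interval
        "device": "cpu",
    }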
Example #8
    def read(self, fn, **kwargs):
        # type: (Text, Any) -> TrainingData
        """Loads training data stored in the Dialogflow data format."""

        language = kwargs["language"]
        fformat = kwargs["fformat"]

        if fformat not in {DIALOGFLOW_INTENT, DIALOGFLOW_ENTITIES}:
            raise ValueError("fformat must be either {}, or {}".format(DIALOGFLOW_INTENT, DIALOGFLOW_ENTITIES))

        root_js = utils.read_json_file(fn)
        examples_js = self._read_examples_js(fn, language, fformat)

        if not examples_js:
            logger.warning("No training examples found for dialogflow file {}!".format(fn))
            return TrainingData()
        elif fformat == DIALOGFLOW_INTENT:
            return self._read_intent(root_js, examples_js)
        elif fformat == DIALOGFLOW_ENTITIES:
            return self._read_entities(examples_js)
Example #9
    def read_from_json(self, js, **kwargs):
        # type: (Text, Any) -> TrainingData
        """Loads training data stored in the LUIS.ai data format."""

        training_examples = []
        regex_features = []

        # Simple check to ensure we support this luis data schema version
        if not js["luis_schema_version"].startswith("2"):
            raise Exception("Invalid luis data schema version {}, should be 2.x.x. "
                            "Make sure to use the latest luis version "
                            "(e.g. by downloading your data again)."
                            "".format(js["luis_schema_version"]))

        for r in js.get("regex_features", []):
            if r.get("activated", False):
                regex_features.append({"name": r.get("name"),
                                       "pattern": r.get("pattern")})

        for s in js["utterances"]:
            text = s.get("text")
            intent = s.get("intent")
            entities = []
            for e in s.get("entities") or []:
                start, end = e["startPos"], e["endPos"] + 1
                val = text[start:end]
                entities.append({"entity": e["entity"],
                                 "value": val,
                                 "start": start,
                                 "end": end})

            data = {"entities": entities}
            if intent:
                data["intent"] = intent
            training_examples.append(Message(text, data))
        return TrainingData(training_examples, regex_features=regex_features)
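
LUIS reports `endPos` as the inclusive index of the last character, hence the `+ 1` when converting to Python's exclusive slice end. A minimal utterance entry (field names from the code, values invented):

    s = {"text": "fly to Berlin",
         "intent": "book_flight",
         "entities": [{"entity": "location",
                       "startPos": 7, "endPos": 12}]}  # text[7:13] == "Berlin"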
Example #10
    def _read_entities(self, examples_js):
        entity_synonyms = transform_entity_synonyms(examples_js)
        return TrainingData([], entity_synonyms)
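
`transform_entity_synonyms` (also used in Example #1) presumably flattens entries of the form `{"value": ..., "synonyms": [...]}` into a synonym-to-value mapping; a sketch with invented values:

    examples_js = [
        {"value": "NYC", "synonyms": ["New York City", "new york"]}
    ]
    # expected result: {"New York City": "NYC", "new york": "NYC"}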