def read_from_json(self, js, **kwargs):
    """Loads training data stored in the rasa NLU data format.

    Args:
        js: parsed JSON dict containing a top-level 'rasa_nlu_data' key.
        **kwargs: unused; accepted for reader-interface compatibility.

    Returns:
        TrainingData built from the example, synonym and regex sections.
    """
    validate_rasa_nlu_data(js)

    data = js['rasa_nlu_data']
    common_examples = data.get("common_examples", [])
    intent_examples = data.get("intent_examples", [])
    entity_examples = data.get("entity_examples", [])
    entity_synonyms = data.get("entity_synonyms", [])
    regex_features = data.get("regex_features", [])
    regex_intent = data.get("regex_intent", [])

    # Normalize the raw synonym section into the internal mapping form.
    entity_synonyms = transform_entity_synonyms(entity_synonyms)

    if intent_examples or entity_examples:
        # `Logger.warn` is a deprecated alias in the stdlib logging
        # module; `Logger.warning` is the supported spelling.
        logger.warning("DEPRECATION warning: your rasa data "
                       "contains 'intent_examples' "
                       "or 'entity_examples' which will be "
                       "removed in the future. Consider "
                       "putting all your examples "
                       "into the 'common_examples' section.")

    all_examples = common_examples + intent_examples + entity_examples
    training_examples = []
    for ex in all_examples:
        msg = Message.build(ex['text'], ex.get("intent"),
                            ex.get("entities"))
        training_examples.append(msg)

    return TrainingData(training_examples, entity_synonyms,
                        regex_features, regex_intent)
def train(self,
          training_data: TrainingData,
          config: RasaNLUModelConfig,
          **kwargs) -> None:
    """Train the entity extractor on trainable entity examples.

    Splits the data into train/dev folds, filters to examples whose
    entities this component can train on, and dispatches to either the
    embedding-based or the BERT-based training path.

    Args:
        training_data: the full NLU training data.
        config: model configuration; this component reads
            `tag_schema`, `embedding` and `folds` from its section.
        **kwargs: unused; accepted for trainer-interface compatibility.

    Raises:
        ValueError: if `tag_schema` or `embedding` has an unsupported value.
    """
    self.component_config = config.for_component(self.name, self.defaults)
    _tag_schema = self.component_config.get("tag_schema", "BIO").upper()
    _embedding = self.component_config.get("embedding", "embedding")

    # Validate with a real exception: `assert` statements are stripped
    # when Python runs with -O, which would silently skip this check.
    if _tag_schema not in ("BIO", "BIOES"):
        raise ValueError("Only supported for `BIO` or `BIOES` tag schema")

    train_examples, dev_examples = training_data.train_test_split(
        train_frac=self.component_config.get("folds", 1.))

    if train_examples.entity_examples:
        filtered_entity_examples = self.filter_trainable_entities(
            train_examples.training_examples)
        filtered_dev_entity_examples = self.filter_trainable_entities(
            dev_examples.training_examples)
        # get features, where the embedding method differs from bert
        if _embedding == "embedding":
            self.processor = self._load_embedding_processor(
                filtered_entity_examples,
                filtered_dev_entity_examples,
                **self.component_config)
            self._train_embedding()
        elif _embedding == "bert":
            self.processor = self._load_bert_processor(
                filtered_entity_examples,
                filtered_dev_entity_examples,
                **self.component_config)
            self._train_bert()
        else:
            # Fixed message: original concatenation was missing a space
            # ("...`embedding`,only `embedding`...").
            raise ValueError("Unknown processor for given `embedding`, "
                             "only `embedding` or `bert` received")
def read_from_json(self, js, **kwargs):
    # type: (Text, Any) -> TrainingData
    """Loads training data stored in the WIT.ai data format."""
    training_examples = []

    for sample in js["data"]:
        raw_entities = sample.get("entities")
        if raw_entities is None:
            # Samples without an entity list carry no labels -> skip.
            continue
        text = sample.get("text")

        # WIT encodes the intent as just another entity named 'intent'.
        intent_values = [e["value"] for e in raw_entities
                         if e["entity"] == 'intent']
        intent = intent_values[0].strip("\"") if intent_values else None

        entities = [e for e in raw_entities
                    if "start" in e and "end" in e
                    and e["entity"] != 'intent']
        for e in entities:
            # for some reason wit adds additional quotes around entity values
            e["value"] = e["value"].strip("\"")

        data = {}
        if intent:
            data["intent"] = intent
        data["entities"] = entities
        training_examples.append(Message(text, data))

    return TrainingData(training_examples)
def _read_intent(self, intent_js, examples_js):
    """Build TrainingData for one intent from its intent/examples jsons."""
    intent_name = intent_js.get("name")
    # Each example's chunks are joined into (text, entities) pairs first,
    # then wrapped into Message objects labelled with this intent.
    joined = (self._join_text_chunks(example['data'])
              for example in examples_js)
    messages = [Message.build(text, intent_name, entities)
                for text, entities in joined]
    return TrainingData(messages)
def prepare_dataset(self, training_data: TrainingData):
    """Obtain training and dev feature sets to feed into the model.

    Args:
        training_data: the full NLU training data to split by `self.folds`.

    Returns:
        Tuple of (train_features, test_features, train_size).
        NOTE(review): the dev-set size is computed by `_prepare_feature`
        but intentionally discarded here; if callers ever need it,
        extend the return value — TODO confirm with callers.
    """
    train_examples, test_examples = training_data.train_test_split(
        train_frac=self.folds)
    train_features, train_size = self._prepare_feature(train_examples)
    # Dev-set size is unused; bind to `_` instead of a dead local.
    test_features, _ = self._prepare_feature(test_examples)
    return train_features, test_features, train_size
def load_data(resource_name, language='en'):
    # type: (Text, Optional[Text]) -> TrainingData
    """Load training data from disk.

    Merges them if loaded from disk and multiple files are found."""
    loaded = (_load(f, language) for f in utils.list_files(resource_name))
    data_sets = [ds for ds in loaded if ds]

    if not data_sets:
        # Nothing readable on disk -> empty training data.
        return TrainingData()
    if len(data_sets) == 1:
        return data_sets[0]
    first, *rest = data_sets
    return first.merge(*rest)
def train(self,
          training_data: TrainingData,
          config: Optional[RasaNLUModelConfig],
          **kwargs: Any) -> None:
    """Fine-tune the pretrained encoder on intent-labelled examples.

    Loads the pretrained model/tokenizer, builds the intent label
    dictionary, splits the data by `self.folds`, and runs the training
    pipeline; stores the trained model and its evaluation results on
    `self.model` / `self.eval_res`.
    """
    encoder, tokenizer = load_pretrained(mpath=self.pre_path,
                                         config=self.bert_config,
                                         model=self.bert_model)
    self._create_intent_dict(training_data)

    train_examples, test_examples = training_data.train_test_split(
        train_frac=self.folds)

    data_loader = NluClsDataLoader(
        message=train_examples.training_examples,
        tokenizer=tokenizer,
        max_len=self.max_seq_len,
        batch_size=self.batch_size,
        label_dict=self.int2idx)

    # Only build a dev loader when the split actually produced examples.
    test_data_loader = None
    if test_examples.training_examples:
        test_data_loader = NluClsDataLoader(
            message=test_examples.training_examples,
            tokenizer=tokenizer,
            max_len=self.max_seq_len,
            batch_size=self.batch_size,
            label_dict=self.int2idx)

    train_pipeline = TrainingPipeLine(
        epochs=self.epochs,
        walking_epoch_visual=self.walking_epoch_visual,
        lr=self.lr,
        dropout=self.dropout,
        device=self.device,
        int2idx=self.int2idx,
        idx2int=self.idx2int)
    self.model = train_pipeline.train(encoder,
                                      data_loader=data_loader,
                                      test_loader=test_data_loader)
    self.eval_res = train_pipeline.eval_res
def read(self, fn, **kwargs):
    # type: (Text, Any) -> TrainingData
    """Loads training data stored in the Dialogflow data format."""
    language = kwargs["language"]
    fformat = kwargs["fformat"]

    if fformat not in {DIALOGFLOW_INTENT, DIALOGFLOW_ENTITIES}:
        raise ValueError("fformat must be either {}, or {}"
                         "".format(DIALOGFLOW_INTENT, DIALOGFLOW_ENTITIES))

    root_js = utils.read_json_file(fn)
    examples_js = self._read_examples_js(fn, language, fformat)

    if not examples_js:
        logger.warning(
            "No training examples found for dialogflow file {}!".format(fn))
        return TrainingData()
    if fformat == DIALOGFLOW_INTENT:
        return self._read_intent(root_js, examples_js)
    # Only DIALOGFLOW_ENTITIES remains after the guard above.
    return self._read_entities(examples_js)
def read_from_json(self, js, **kwargs):
    # type: (Text, Any) -> TrainingData
    """Loads training data stored in the LUIS.ai data format.

    Args:
        js: parsed LUIS export dict (schema version 2.x.x expected).
        **kwargs: unused; accepted for reader-interface compatibility.

    Returns:
        TrainingData with one Message per utterance plus any activated
        regex features.

    Raises:
        Exception: if the LUIS schema major version is not 2.
    """
    training_examples = []
    regex_features = []

    # Simple check to ensure we support this luis data schema version.
    # Compare the major version component explicitly: a bare
    # startswith("2") would wrongly accept e.g. "20.0.0".
    schema_version = js["luis_schema_version"]
    if schema_version.split(".", 1)[0] != "2":
        raise Exception("Invalid luis data schema version {}, should be 2.x.x. "
                        "Make sure to use the latest luis version "
                        "(e.g. by downloading your data again)."
                        "".format(schema_version))

    for r in js.get("regex_features", []):
        if r.get("activated", False):
            regex_features.append({"name": r.get("name"),
                                   "pattern": r.get("pattern")})

    for s in js["utterances"]:
        text = s.get("text")
        intent = s.get("intent")
        entities = []
        for e in s.get("entities") or []:
            # LUIS endPos is inclusive; convert to exclusive slicing end.
            start, end = e["startPos"], e["endPos"] + 1
            val = text[start:end]
            entities.append({"entity": e["entity"],
                             "value": val,
                             "start": start,
                             "end": end})

        data = {"entities": entities}
        if intent:
            data["intent"] = intent
        training_examples.append(Message(text, data))

    return TrainingData(training_examples, regex_features=regex_features)
def _read_entities(self, examples_js):
    """Build TrainingData holding only entity synonyms (no examples)."""
    synonyms = transform_entity_synonyms(examples_js)
    return TrainingData([], synonyms)