def train_nlu():
    training_data = load_data('./data/nlu.md')
    trainer = Trainer(config.load("config.yml"))
    trainer.train(training_data)
    model_directory = trainer.persist('./models/nlu/', fixed_model_name="current")
    return model_directory

def train_nlu(data, configs, model_dir):
    training_data = load_data(data)
    trainer = Trainer(config.load(configs))
    trainer.train(training_data)
    model_directory = trainer.persist(model_dir, fixed_model_name="nlu")
    logger.info(f"Model trained. Stored in '{model_directory}'.")
    return model_directory

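A minimal invocation sketch for the parametrized helper above; the paths are placeholders and the usual Rasa NLU imports plus a module-level `logger` are assumed to be in scope.

if __name__ == "__main__":
    # Hypothetical paths; adjust to the project layout.
    train_nlu("./data/nlu.md", "./config.yml", "./models/nlu/")
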
def trained_nlu_model(request):
    cfg = RasaNLUModelConfig({"pipeline": "keyword"})
    trainer = Trainer(cfg)
    td = training_data.load_data(DEFAULT_DATA_PATH)
    trainer.train(td)
    model_path = trainer.persist(NLU_MODEL_PATH)

    nlu_data = data.get_nlu_directory(DEFAULT_DATA_PATH)
    output_path = os.path.join(NLU_MODEL_PATH, NLU_MODEL_NAME)
    new_fingerprint = model.model_fingerprint(NLU_DEFAULT_CONFIG_PATH, nlu_data=nlu_data)
    model.create_package_rasa(model_path, output_path, new_fingerprint)

    def fin():
        if os.path.exists(NLU_MODEL_PATH):
            shutil.rmtree(NLU_MODEL_PATH)
        if os.path.exists(output_path):
            shutil.rmtree(output_path)

    request.addfinalizer(fin)
    return output_path

def train(nlu_config: Union[Text, RasaNLUModelConfig],
          data: Text,
          path: Optional[Text] = None,
          project: Optional[Text] = None,
          fixed_model_name: Optional[Text] = None,
          storage: Optional[Text] = None,
          component_builder: Optional[ComponentBuilder] = None,
          training_data_endpoint: Optional[EndpointConfig] = None,
          **kwargs: Any) -> Tuple[Trainer, Interpreter, Optional[Text]]:
    """Loads the trainer and the data and runs the training of the model."""

    if isinstance(nlu_config, str):
        nlu_config = config.load(nlu_config)

    # Ensure we are training a model that we can save in the end
    # WARN: there is still a race condition if a model with the same name is
    # trained in another subprocess
    trainer = Trainer(nlu_config, component_builder)
    persistor = create_persistor(storage)
    if training_data_endpoint is not None:
        training_data = load_data_from_endpoint(training_data_endpoint,
                                                nlu_config.language)
    else:
        training_data = load_data(data, nlu_config.language)

    interpreter = trainer.train(training_data, **kwargs)

    if path:
        persisted_path = trainer.persist(path, persistor, project,
                                         fixed_model_name)
    else:
        persisted_path = None

    return trainer, interpreter, persisted_path

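A usage sketch for the `train()` helper above, assuming the surrounding module's imports; the file paths and model name are placeholders, and `persisted_path` is `None` when no `path` is given.

trainer, interpreter, persisted_path = train(
    "config.yml",          # a str here, so it is loaded via config.load()
    "data/nlu.md",
    path="models/nlu/",
    fixed_model_name="current",
)
print(interpreter.parse("hello"))  # quick smoke test of the trained interpreter
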
def _async_train(self, config, nlu_data, model_name):
    training_start = timer()
    with self.lock:
        self.training_status[model_name] = {
            "status": "TRAINING",
        }

    data = self.data_reader.read_from_json({'rasa_nlu_data': nlu_data})
    with self.interpreter_cache.lock:
        trainer = Trainer(RasaNLUModelConfig(config),
                          self.interpreter_cache.component_builder)
        interpreter = trainer.train(data)

    tempdir = tempfile.mkdtemp()
    trainer.persist(tempdir, None, "nlu")
    _model_package = create_package_rasa(tempdir, os.path.join("models", model_name))
    self.interpreter_cache.store(model_name, interpreter)

    with self.lock:
        training_end = timer()
        self.training_status[model_name] = {
            "status": "READY",
            "training_time": f"{training_end - training_start:.2f}"
        }

def train_nlu(data_path, configs, model_path):
    logging.basicConfig(filename=logfile, level=logging.DEBUG)
    training_data = load_data(data_path)
    trainer = Trainer(config.load(configs))
    trainer.train(training_data)
    model_directory = trainer.persist(model_path, fixed_model_name='nlu')
    run_evaluation(data_path, model_directory)

def load_training_data(data_file="../data/testData.json",
                       config_file="../configs/config_spacy.yml"):
    training_data = load_data(data_file)
    trainer = Trainer(config.load(config_file))
    trainer.train(training_data)
    # model_directory points to the folder the trained model was persisted to
    model_directory = trainer.persist('./projects/default/')
    return model_directory

def train_model(td_file, config_file, model_dir):
    """Trains a model using the training data and config.

    Creates the model and returns its path for evaluation.
    """
    td = load_data(td_file)
    trainer = Trainer(config.load(config_file))
    trainer.train(td)
    model_loc = trainer.persist(model_dir)
    return model_loc

def test_load_and_persist_without_train(language, pipeline, component_builder, tmpdir):
    _config = RasaNLUModelConfig({"pipeline": pipeline, "language": language})
    trainer = Trainer(_config, component_builder)
    persistor = create_persistor(_config)
    persisted_path = trainer.persist(tmpdir.strpath, persistor)
    loaded = Interpreter.load(persisted_path, component_builder)
    assert loaded.pipeline
    assert loaded.parse("hello") is not None
    assert loaded.parse("Hello today is Monday, again!") is not None

def test_nlu_interpreter():
    # training_data = load_data("data/chitchat_nlu.md")
    training_data = load_data("data")
    trainer = Trainer(config.load("config.yml"))
    interpreter = trainer.train(training_data)
    test_interpreter_dir = trainer.persist("./tests/models", project_name="nlu")
    parsing = interpreter.parse('hello')
    assert parsing['intent']['name'] == 'greet'
    assert test_interpreter_dir

def trained_nlu_model():
    cfg = RasaNLUModelConfig({"pipeline": "keyword"})
    trainer = Trainer(cfg)
    td = training_data.load_data(DEFAULT_DATA_PATH)
    trainer.train(td)
    model_path = trainer.persist("test_models", project_name="test_model_keyword")
    return model_path

def test_load_and_persist_without_train(language, pipeline, component_builder, tmpdir):
    _config = RasaNLUModelConfig({"pipeline": pipeline, "language": language})
    trainer = Trainer(_config, component_builder)
    persisted_path = trainer.persist(tmpdir.strpath)
    loaded = Interpreter.load(persisted_path, component_builder)
    assert loaded.pipeline
    assert loaded.parse("Rasa is great!") is not None

def train_eval_rasa_nlu_model(lang='en', cross=False, save=''):
    """Train a Rasa NLU model on Snips data built from brat annotation objects.

    :param lang: abbreviated language name
    :param cross: whether to run the cross-validation variant
    :param save: version suffix used in the result paths
    :rtype: None
    """
    from rasa.nlu.training_data import load_data
    from rasa.nlu.model import Trainer
    from rasa.nlu.components import ComponentBuilder
    from rasa.nlu import config
    from rasa.nlu.test import run_evaluation

    config_file = source_config / "config_rasa_converrt.yml"
    if cross:
        filename_results = source_result / "rasa_cross_semeval_2020_model_task1_{}".format(save)
        train_data_obj = BuildSnipsDataTask1(lang, cross=cross, vers=save)
        train_data = train_data_obj.build_rasa_data_task1()
        training_data = load_data(str(train_data[0]))
        builder = ComponentBuilder(use_cache=True)
        trainer = Trainer(config.load(str(config_file)), builder)
        print("--> Training patent data with Rasa...")
        trainer.train(training_data, num_threads=8, n_jobs=-1, verbose=True)
        print("--> Saving model trained with Rasa...")
        model_directory = trainer.persist(filename_results)

        print("--> Evaluating training data with Rasa metrics (cross-validation)...")
        import os
        from datetime import datetime

        filename_test = str(train_data[1])
        print(filename_test)
        dmtime = "test_{}_{}".format(save, datetime.now().strftime("%Y%m%d-%H%M%S"))
        out_test = source_result / "rasa_cross_evaluation_task1" / dmtime
        # Pick the most recently persisted "nlu_*" model directory for evaluation.
        model_directory = sorted(filename_results.glob("nlu_*"), key=os.path.getmtime)[-1]
        run_evaluation(filename_test, str(model_directory), output_directory=str(out_test))
    else:
        filename_results = source_result / "rasa_semeval_2020_model_task1_{}".format(save)
        train_data_obj = BuildSnipsDataTask1(lang, cross=cross, vers=save)
        train_file = train_data_obj.build_rasa_data_task1()
        training_data = load_data(train_file)
        builder = ComponentBuilder(use_cache=True)
        trainer = Trainer(config.load(str(config_file)), builder)
        print("--> Training patent data with Rasa...")
        trainer.train(training_data, num_threads=8, verbose=True, n_jobs=-1,
                      fixed_model_name="nlu")
        print("--> Saving model trained with Rasa...")
        model_directory = trainer.persist(filename_results)

def __init__(self):
    try:
        # Reuse a previously trained model if one exists.
        self.interpreter = Interpreter.load("./models/nlu/current")
    except Exception:
        # Otherwise train a fresh model and persist it for next time.
        training_data = load_data("./data/nlu.md")
        trainer = Trainer(config.load("config.yml"))
        self.interpreter = trainer.train(training_data)
        trainer.persist("./models/nlu", fixed_model_name="current")
    self.music_verbs = ['Riproduci', 'Suona', 'Fai partire', 'Avvia']

def test_run_cv_evaluation(
    pretrained_embeddings_spacy_config: RasaNLUModelConfig, monkeypatch: MonkeyPatch
):
    td = rasa.shared.nlu.training_data.loading.load_data(
        "data/examples/rasa/demo-rasa.json"
    )
    nlu_config = RasaNLUModelConfig(
        {
            "language": "en",
            "pipeline": [
                {"name": "WhitespaceTokenizer"},
                {"name": "CountVectorsFeaturizer"},
                {"name": "DIETClassifier", EPOCHS: 2},
            ],
        }
    )

    # mock training
    trainer = Trainer(nlu_config)
    trainer.pipeline = remove_pretrained_extractors(trainer.pipeline)
    mock = Mock(return_value=Interpreter(trainer.pipeline, None))
    monkeypatch.setattr(Trainer, "train", mock)

    n_folds = 2
    intent_results, entity_results, response_selection_results = cross_validate(
        td,
        n_folds,
        nlu_config,
        successes=False,
        errors=False,
        disable_plotting=True,
        report_as_dict=True,
    )

    assert len(intent_results.train["Accuracy"]) == n_folds
    assert len(intent_results.train["Precision"]) == n_folds
    assert len(intent_results.train["F1-score"]) == n_folds
    assert len(intent_results.test["Accuracy"]) == n_folds
    assert len(intent_results.test["Precision"]) == n_folds
    assert len(intent_results.test["F1-score"]) == n_folds
    assert all(key in intent_results.evaluation for key in ["errors", "report"])
    assert any(
        isinstance(intent_report, dict)
        and intent_report.get("confused_with") is not None
        for intent_report in intent_results.evaluation["report"].values()
    )
    for extractor_evaluation in entity_results.evaluation.values():
        assert all(key in extractor_evaluation for key in ["errors", "report"])

def test_train_with_empty_data(language, pipeline, component_builder, tmpdir):
    _config = RasaNLUModelConfig({"pipeline": pipeline, "language": language})
    trainer = Trainer(_config, component_builder)
    trainer.train(TrainingData())
    persistor = create_persistor(_config)
    persisted_path = trainer.persist(tmpdir.strpath, persistor, project_name="my_project")
    loaded = Interpreter.load(persisted_path, component_builder)
    assert loaded.pipeline
    assert loaded.parse("hello") is not None
    assert loaded.parse("Hello today is Monday, again!") is not None

def train_nlu():
    from rasa.nlu.training_data import load_data
    from rasa.nlu import config
    from rasa.nlu.model import Trainer

    training_data = load_data('data/nlu.md')
    trainer = Trainer(config.load("config.yml"))
    trainer.train(training_data)
    model_directory = trainer.persist('models/nlu/', fixed_model_name="current")
    return model_directory

def train(self):
    # loading the NLU training samples
    training_data = load_data(self.data)
    # trainer to educate our pipeline
    trainer = Trainer(config.load(self.pipeline))
    # train the model
    self.interpreter = trainer.train(training_data)
    # store it for future use
    self.model_directory = trainer.persist(
        "opennlu/data/model/rasa",
        fixed_model_name=self.name,
        persist_nlu_training_data=training_data)

def train_test(td_file, config_file, model_dir):
    # Helper: split the data into train/test sets, train on the train split,
    # and evaluate on the held-out test split.
    td = load_data(td_file)
    trainer = Trainer(config.load(config_file))
    train, test = td.train_test_split(train_frac=0.6)
    trainer.train(train)
    model_loc = trainer.persist(model_dir)
    with open("data/tmp/temp_test.json", "w", encoding="utf8") as f:
        f.write(test.as_json())
    with open("data/temp_train.json", "w", encoding="utf8") as f:
        f.write(train.as_json())
    evaluate_model("data/tmp/temp_test.json", model_loc)

async def train(
    nlu_config: Union[Text, Dict, RasaNLUModelConfig],
    data: Union[Text, "TrainingDataImporter"],
    path: Optional[Text] = None,
    fixed_model_name: Optional[Text] = None,
    storage: Optional[Text] = None,
    component_builder: Optional[ComponentBuilder] = None,
    training_data_endpoint: Optional[EndpointConfig] = None,
    persist_nlu_training_data: bool = False,
    model_to_finetune: Optional[Interpreter] = None,
    **kwargs: Any,
) -> Tuple[Trainer, Interpreter, Optional[Text]]:
    """Loads the trainer and the data and runs the training of the model."""
    from rasa.shared.importers.importer import TrainingDataImporter

    if not isinstance(nlu_config, RasaNLUModelConfig):
        nlu_config = config.load(nlu_config)

    # Ensure we are training a model that we can save in the end
    # WARN: there is still a race condition if a model with the same name is
    # trained in another subprocess
    trainer = Trainer(
        nlu_config, component_builder, model_to_finetune=model_to_finetune
    )
    persistor = create_persistor(storage)
    if training_data_endpoint is not None:
        training_data = await load_data_from_endpoint(
            training_data_endpoint, nlu_config.language
        )
    elif isinstance(data, TrainingDataImporter):
        training_data = await data.get_nlu_data(nlu_config.language)
    else:
        training_data = load_data(data, nlu_config.language)

    training_data.print_stats()
    if training_data.entity_roles_groups_used():
        rasa.shared.utils.common.mark_as_experimental_feature(
            "Entity Roles and Groups feature"
        )

    interpreter = trainer.train(training_data, **kwargs)

    if path:
        persisted_path = trainer.persist(
            path, persistor, fixed_model_name, persist_nlu_training_data
        )
    else:
        persisted_path = None

    return trainer, interpreter, persisted_path

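Because this variant is a coroutine, it has to be driven by an event loop. A minimal sketch, assuming the same module-level imports; the paths are placeholders.

import asyncio

async def main():
    trainer, interpreter, persisted_path = await train(
        "config.yml",
        "data/nlu",
        path="models/nlu/",
        persist_nlu_training_data=True,
    )
    print(persisted_path)

asyncio.run(main())
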
def __init__(self, agentName, botconfig, data, **kwargs):
    logger.info("Training Agent " + agentName + " in progress")
    trainingData = load_data(data)
    self.intents = list(trainingData.intents)
    self.entities = list(trainingData.entities)
    trainer = Trainer(config.load(botconfig))
    self.interpreter = trainer.train(trainingData)
    self.model_path = "./models/" + agentName + "/"
    persist_path = trainer.persist(self.model_path)
    self.tar_path = package_model(fingerprint=None,
                                  train_path=persist_path,
                                  output_directory=self.model_path)
    self.model_name = self.tar_path.replace(self.model_path, "")
    self.model_version = self.model_name[:self.model_name.index(".tar.gz")]

def train():
    td = load_data("{}/demo_rasa.json".format(prj_dir))
    _config = RasaNLUModelConfig(load_json("{}/config.json".format(prj_dir)))
    trainer = Trainer(_config)
    trainer.train(td)
    persisted_path = trainer.persist("{}/models".format(prj_dir))
    loaded = Interpreter.load(persisted_path)
    assert loaded.pipeline

    # Inference
    result = loaded.parse("i'm looking for a place in the north of town")
    result = loaded.parse("show me chinese restaurants")
    result = dict(filter(lambda item: item[0] not in ["intent_ranking"],
                         result.items()))
    show_dict(result)

def call():
    from rasa.nlu.training_data import load_data
    from rasa.nlu import config
    from rasa.nlu.components import ComponentBuilder
    from rasa.nlu.model import Trainer

    builder = ComponentBuilder(use_cache=True)
    training_data = load_data('./data/weapon.md')
    trainer = Trainer(config.load("./config.yml"), builder)
    trainer.train(training_data)
    model_directory = trainer.persist('./models', fixed_model_name="model")
    print('done')
    return model_directory

def train_test(td_file, config_file, model_dir, key="company", noise=0.1):
    """Trains a model on a train/test split of the data, adds noise to the
    test set, and evaluates the persisted model on it."""
    td = load_data(td_file)
    trainer = Trainer(config.load(config_file))
    train, test = td.train_test_split(train_frac=0.8)
    test = add_noise(test, key, noise=noise)
    trainer.train(train)
    tmp_fname = "data/tmp/temp_test.json"
    model_loc = trainer.persist(model_dir)
    with open(tmp_fname, "w", encoding="utf8") as f:
        f.write(test.as_json())
    evaluate_model(tmp_fname, model_loc)

def _get_tokenizer_from_nlu_config(
    nlu_config: Optional[RasaNLUModelConfig] = None,
) -> Optional[Tokenizer]:
    """Extracts the first Tokenizer in the NLU pipeline.

    Args:
        nlu_config: NLU Config.

    Returns:
        The first Tokenizer in the NLU pipeline, if any.
    """
    if not nlu_config:
        return None

    pipeline: List[Component] = Trainer(nlu_config, skip_validation=True).pipeline
    tokenizer: Optional[Tokenizer] = None
    for component in pipeline:
        if isinstance(component, Tokenizer):
            if tokenizer:
                rasa.shared.utils.io.raise_warning(
                    "The pipeline contains more than one tokenizer. "
                    "Only the first tokenizer will be used for story validation.",
                )
            else:
                # Keep only the first tokenizer, as the warning promises.
                tokenizer = component

    return tokenizer

def run_trial(space):
    """The objective function is pickled and transferred to the workers.

    Hence, this function has to contain all the imports we need.
    """
    data_dir = os.environ.get("INPUT_DATA_DIRECTORY", "./data")
    model_dir = os.environ.get("INPUT_MODEL_DIRECTORY", "./models")
    target_metric = os.environ.get("INPUT_TARGET_METRIC", "f1_score")

    if target_metric not in AVAILABLE_METRICS:
        logger.error("The metric '{}' is not in the available metrics. "
                     "Please use one of the available metrics: {}."
                     "".format(target_metric, AVAILABLE_METRICS))
        return {"loss": 1, "status": STATUS_FAIL}

    logger.debug("Search space: {}".format(space))

    # The epoch has to be an int since `tqdm` otherwise will cause an exception.
    if "epochs" in space:
        space["epochs"] = int(space["epochs"])

    with open(os.path.join(data_dir, "template_config.yml")) as f:
        config_yml = f.read().format(**space)

    config = read_yaml(config_yml)
    config = rasa.nlu.config.load(config)
    trainer = Trainer(config)
    training_data = load_data(os.path.join(data_dir, "train.md"))
    test_data_path = os.path.join(data_dir, "validation.md")

    # Wrap train and eval in try/except in case nlu_hyperopt proposes an
    # invalid combination of params.
    try:
        model = trainer.train(training_data)
        model_path = trainer.persist(model_dir)

        if target_metric is None or target_metric == "threshold_loss":
            loss = _get_threshold_loss(model, test_data_path)
        else:
            loss = _get_nlu_evaluation_loss(model_path, target_metric, test_data_path)
        return {"loss": loss, "status": STATUS_OK}
    except Exception as e:
        logger.error(e)
        return {"loss": 1, "status": STATUS_FAIL}

def cross_validate(
    data: TrainingData, n_folds: int, nlu_config: Union[RasaNLUModelConfig, Text]
) -> Tuple[CVEvaluationResult, CVEvaluationResult]:
    """Stratified cross validation on data.

    Args:
        data: Training Data
        n_folds: integer, number of cv folds
        nlu_config: nlu config file

    Returns:
        A pair of CVEvaluationResult (intents, entities); each maps a metric
        name to a list with one entry per fold.
    """
    from collections import defaultdict
    import tempfile

    if isinstance(nlu_config, str):
        nlu_config = config.load(nlu_config)

    trainer = Trainer(nlu_config)
    trainer.pipeline = remove_pretrained_extractors(trainer.pipeline)

    intent_train_results = defaultdict(list)
    intent_test_results = defaultdict(list)
    entity_train_results = defaultdict(lambda: defaultdict(list))
    entity_test_results = defaultdict(lambda: defaultdict(list))
    tmp_dir = tempfile.mkdtemp()

    for train, test in generate_folds(n_folds, data):
        interpreter = trainer.train(train)

        # calculate train accuracy
        intent_train_results, entity_train_results = combine_result(
            intent_train_results, entity_train_results, interpreter, train
        )
        # calculate test accuracy
        intent_test_results, entity_test_results = combine_result(
            intent_test_results, entity_test_results, interpreter, test
        )

    shutil.rmtree(tmp_dir, ignore_errors=True)

    return (
        CVEvaluationResult(dict(intent_train_results), dict(intent_test_results)),
        CVEvaluationResult(dict(entity_train_results), dict(entity_test_results)),
    )

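A sketch of how this `cross_validate()` might be called, assuming `load_data` from the same Rasa version and placeholder paths; the metric keys mirror the assertions in the `test_run_cv_evaluation` snippet above.

td = load_data("data/nlu.md")  # placeholder path
intent_cv, entity_cv = cross_validate(td, n_folds=5, nlu_config="config.yml")
# Each CVEvaluationResult holds per-fold lists, e.g. intent_cv.test["Accuracy"]
print(sum(intent_cv.test["Accuracy"]) / 5)
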
def test_invalid_many_tokenizers_in_config():
    nlu_config = {
        "pipeline": [{"name": "WhitespaceTokenizer"}, {"name": "SpacyTokenizer"}]
    }

    with pytest.raises(config.InvalidConfigError) as execinfo:
        Trainer(config.RasaNLUModelConfig(nlu_config))
    # "then" is intentional: the assertion matches the (misspelled) error
    # message raised by this version of Rasa.
    assert "More then one tokenizer is used" in str(execinfo.value)

def __init__(self, languages=None):
    if languages is None:
        languages = ['en', 'de']
    pipeline = [{"name": "WhitespaceTokenizer"},
                {"name": "CRFEntityExtractor"},
                {"name": "EntitySynonymMapper"},
                {"name": "CountVectorsFeaturizer"},
                {"name": "EmbeddingIntentClassifier"}]
    self.interpreters = {}
    for lang in languages:
        filepath = resource_filename(__name__, f'nlu_{lang}.md')
        training_data = load_data(filepath)
        trainer = Trainer(RasaNLUModelConfig({"pipeline": pipeline}))
        self.interpreters[lang] = trainer.train(training_data)

def test_invalid_many_tokenizers_in_config():
    nlu_config = {
        "pipeline": [{"name": "WhitespaceTokenizer"}, {"name": "SpacyTokenizer"}]
    }

    with pytest.raises(InvalidConfigException) as execinfo:
        Trainer(config.RasaNLUModelConfig(nlu_config))
    assert "The pipeline configuration contains more than one" in str(execinfo.value)