def test_run_cv_evaluation(
    pretrained_embeddings_spacy_config: RasaNLUModelConfig, monkeypatch: MonkeyPatch
):
    td = rasa.shared.nlu.training_data.loading.load_data(
        "data/examples/rasa/demo-rasa.json"
    )
    nlu_config = RasaNLUModelConfig(
        {
            "language": "en",
            "pipeline": [
                {"name": "WhitespaceTokenizer"},
                {"name": "CountVectorsFeaturizer"},
                {"name": "DIETClassifier", EPOCHS: 2},
            ],
        }
    )

    # mock training
    trainer = Trainer(nlu_config)
    trainer.pipeline = remove_pretrained_extractors(trainer.pipeline)
    mock = Mock(return_value=Interpreter(trainer.pipeline, None))
    monkeypatch.setattr(Trainer, "train", mock)

    n_folds = 2
    intent_results, entity_results, response_selection_results = cross_validate(
        td,
        n_folds,
        nlu_config,
        successes=False,
        errors=False,
        disable_plotting=True,
        report_as_dict=True,
    )

    assert len(intent_results.train["Accuracy"]) == n_folds
    assert len(intent_results.train["Precision"]) == n_folds
    assert len(intent_results.train["F1-score"]) == n_folds
    assert len(intent_results.test["Accuracy"]) == n_folds
    assert len(intent_results.test["Precision"]) == n_folds
    assert len(intent_results.test["F1-score"]) == n_folds

    assert all(key in intent_results.evaluation for key in ["errors", "report"])
    assert any(
        isinstance(intent_report, dict)
        and intent_report.get("confused_with") is not None
        for intent_report in intent_results.evaluation["report"].values()
    )
    for extractor_evaluation in entity_results.evaluation.values():
        assert all(key in extractor_evaluation for key in ["errors", "report"])
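# Hedged usage sketch (not part of the test suite): the same two-fold run as
# above but without the mocked `Trainer.train`, so models are actually trained
# on each fold. The helper name and the `config.yml` path are assumptions for
# illustration, not something the test defines.
def example_real_cross_validation() -> None:
    td = rasa.shared.nlu.training_data.loading.load_data(
        "data/examples/rasa/demo-rasa.json"
    )
    intent_results, entity_results, response_selection_results = cross_validate(
        td, 2, "config.yml", disable_plotting=True, report_as_dict=True
    )
    # Each metric list holds one value per fold, which is exactly what the
    # `len(...) == n_folds` assertions above rely on.
    print(intent_results.test["F1-score"])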
def cross_validate(
    data: TrainingData, n_folds: int, nlu_config: Union[RasaNLUModelConfig, Text]
) -> Tuple[CVEvaluationResult, CVEvaluationResult]:
    """Stratified cross validation on data.

    Args:
        data: Training Data
        n_folds: integer, number of cv folds
        nlu_config: nlu config file

    Returns:
        a tuple of intent and entity `CVEvaluationResult`s; each holds
        train/test dictionaries mapping a metric name to a list with one
        entry per fold
    """
    from collections import defaultdict
    import tempfile

    if isinstance(nlu_config, str):
        nlu_config = config.load(nlu_config)

    trainer = Trainer(nlu_config)
    trainer.pipeline = remove_pretrained_extractors(trainer.pipeline)

    intent_train_results = defaultdict(list)
    intent_test_results = defaultdict(list)
    entity_train_results = defaultdict(lambda: defaultdict(list))
    entity_test_results = defaultdict(lambda: defaultdict(list))
    # scratch directory; nothing is written to it in this variant, it is only
    # cleaned up again below
    tmp_dir = tempfile.mkdtemp()

    for train, test in generate_folds(n_folds, data):
        interpreter = trainer.train(train)

        # calculate train accuracy
        intent_train_results, entity_train_results = combine_result(
            intent_train_results, entity_train_results, interpreter, train
        )
        # calculate test accuracy
        intent_test_results, entity_test_results = combine_result(
            intent_test_results, entity_test_results, interpreter, test
        )

    shutil.rmtree(tmp_dir, ignore_errors=True)

    return (
        CVEvaluationResult(dict(intent_train_results), dict(intent_test_results)),
        CVEvaluationResult(dict(entity_train_results), dict(entity_test_results)),
    )
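# Hedged sketch of what a stratified fold generator like `generate_folds`
# could look like: split on intent labels with scikit-learn's StratifiedKFold
# so every fold keeps roughly the same intent distribution. The helper name,
# parameters, and fixed random seed are assumptions for illustration, not the
# library's actual implementation.
from sklearn.model_selection import StratifiedKFold

def example_generate_folds(n_folds, training_examples, intent_labels):
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
    for train_idx, test_idx in skf.split(training_examples, intent_labels):
        train = [training_examples[i] for i in train_idx]
        test = [training_examples[i] for i in test_idx]
        yield train, test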
def cross_validate(
    data: TrainingData,
    n_folds: int,
    nlu_config: Union[RasaNLUModelConfig, Text],
    report: Optional[Text] = None,
    successes: Optional[Text] = None,
    errors: Optional[Text] = "errors.json",
    confmat: Optional[Text] = None,
    histogram: Optional[Text] = None,
) -> Tuple[CVEvaluationResult, CVEvaluationResult]:
    """Stratified cross validation on data.

    Args:
        data: Training Data
        n_folds: integer, number of cv folds
        nlu_config: nlu config file
        report: path to folder where reports are stored
        successes: path to file that will contain success cases
        errors: path to file that will contain error cases
        confmat: path to file that will show the confusion matrix
        histogram: path to file that will show a histogram

    Returns:
        a tuple of intent and entity `CVEvaluationResult`s; each holds
        train/test dictionaries mapping a metric name to a list with one
        entry per fold
    """
    from collections import defaultdict

    if isinstance(nlu_config, str):
        nlu_config = config.load(nlu_config)

    if report:
        utils.create_dir(report)

    trainer = Trainer(nlu_config)
    trainer.pipeline = remove_pretrained_extractors(trainer.pipeline)

    intent_train_metrics = defaultdict(list)  # type: IntentMetrics
    intent_test_metrics = defaultdict(list)  # type: IntentMetrics
    entity_train_metrics = defaultdict(lambda: defaultdict(list))  # type: EntityMetrics
    entity_test_metrics = defaultdict(lambda: defaultdict(list))  # type: EntityMetrics

    intent_test_results = []  # type: List[IntentEvaluationResult]
    entity_test_results = []  # type: List[EntityEvaluationResult]
    intent_classifier_present = False
    extractors = set()  # type: Set[Text]

    for train, test in generate_folds(n_folds, data):
        interpreter = trainer.train(train)

        # calculate train accuracy
        combine_result(intent_train_metrics, entity_train_metrics, interpreter, train)
        # calculate test accuracy
        combine_result(
            intent_test_metrics,
            entity_test_metrics,
            interpreter,
            test,
            intent_test_results,
            entity_test_results,
        )

        if not extractors:
            extractors = get_entity_extractors(interpreter)

        if is_intent_classifier_present(interpreter):
            intent_classifier_present = True

    if intent_classifier_present:
        logger.info("Accumulated test folds intent evaluation results:")
        evaluate_intents(
            intent_test_results, report, successes, errors, confmat, histogram
        )

    if extractors:
        logger.info("Accumulated test folds entity evaluation results:")
        evaluate_entities(entity_test_results, extractors, report)

    return (
        CVEvaluationResult(dict(intent_train_metrics), dict(intent_test_metrics)),
        CVEvaluationResult(dict(entity_train_metrics), dict(entity_test_metrics)),
    )
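# Hedged usage sketch for the report-writing variant above; every output path
# below is an assumption chosen for illustration, not a default of the
# library (only `errors.json` defaults in the signature).
def example_cross_validate_with_reports(training_data: TrainingData) -> None:
    intent_cv, entity_cv = cross_validate(
        training_data,
        5,
        "config.yml",
        report="reports",            # per-label classification reports
        successes="successes.json",  # correctly predicted examples
        errors="errors.json",        # mispredicted examples
        confmat="confmat.png",       # confusion matrix plot
        histogram="histogram.png",   # confidence histogram plot
    )
    print(intent_cv.test["Accuracy"])  # one accuracy value per fold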
def main(out_directory, config_directory, dataset_directory, n_folds):
    start = timer()

    if not os.path.exists(out_directory):
        os.mkdir(out_directory)
    else:
        # pick a fresh directory name so earlier results are not overwritten
        count = 0
        out_directory_temp = out_directory
        while os.path.exists(out_directory_temp):
            out_directory_temp = out_directory.rstrip('/') + str(count) + '/'
            count += 1
        out_directory = out_directory_temp
        os.mkdir(out_directory)

    config_size = len(os.listdir(config_directory))
    count_config = 0
    for config_filename in os.listdir(config_directory):
        count_config += 1
        print('######################################')
        print('CURRENT CONFIG :', config_filename, ' PROGRESS:',
              count_config, '/', config_size)
        print('######################################')
        start_config = timer()

        if config_filename.endswith(".yml"):
            config_path = os.path.join(config_directory, config_filename)
            config_name = config_filename.split('.')[0]
            out_config_directory = out_directory + config_name + '/'
            if not os.path.exists(out_config_directory):
                os.mkdir(out_config_directory)
            datasets_dir_out = 'Datasets_Results/'
            if not os.path.exists(out_config_directory + datasets_dir_out):
                os.mkdir(out_config_directory + datasets_dir_out)

            nlu_config = config.load(config_path)
            trainer = Trainer(nlu_config)
            trainer.pipeline = remove_pretrained_extractors(trainer.pipeline)

            datasets_results = []
            datasets_names = []
            for dataset_filename in os.listdir(dataset_directory):
                if dataset_filename.endswith(".json") or dataset_filename.endswith(".md"):
                    dataset_path = os.path.join(dataset_directory, dataset_filename)
                    dataset_name = dataset_filename.split('.')[0]

                    cross_val_results = run_benchmark(dataset_path, n_folds, trainer)
                    # utils.write_json_to_file('new_result_test', cross_val_results)
                    dataset_result = sum_results(cross_val_results, collect_report=True)
                    utils.write_json_to_file(
                        out_config_directory + datasets_dir_out + dataset_name + '_Benchmark',
                        dataset_result)
                    datasets_results.append(dataset_result)
                    datasets_names.append(dataset_filename)

            save_result_by_group(datasets_results, n_folds, out_config_directory,
                                 datasets_names)

            overall_result = sum_results(datasets_results)
            end_config = timer()
            # record how long this config took, not the raw end timestamp
            overall_result['time'] = str(end_config - start_config)
            utils.write_json_to_file(
                out_config_directory + 'Datasets_Mean_Result', overall_result)

    end = timer()
    logger.info("Finished evaluation in: " + str(end - start) + " seconds")
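# Hedged entry-point sketch showing how `main` might be invoked from the
# command line; the flag names and defaults are assumptions, not part of the
# original script.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(
        description="Cross-validate every NLU config in a directory against every dataset."
    )
    parser.add_argument("--out", default="results/", help="output directory")
    parser.add_argument("--configs", default="configs/", help="directory with .yml NLU configs")
    parser.add_argument("--datasets", default="datasets/", help="directory with .json/.md datasets")
    parser.add_argument("--folds", type=int, default=5, help="number of cross-validation folds")
    args = parser.parse_args()

    main(args.out, args.configs, args.datasets, args.folds)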