import os
import time

import matplotlib.pyplot as plt
import pkg_resources
import sklearn.metrics

# utils_nlp and utils_plots are NeuroNER's own helper modules; remap_labels,
# plot_f1_vs_epoch, and save_results are assumed to be defined elsewhere in
# this module.
from neuroner import utils_nlp
from neuroner import utils_plots


def assess_model(y_pred, y_true, labels, target_names, labels_with_o,
                 target_names_with_o, dataset_type, stats_graph_folder,
                 epoch_number, parameters, evaluation_mode='bio', verbose=False):
    '''
    Compute and plot evaluation metrics (classification report, F1 scores,
    confusion matrix, accuracy) for one dataset split in one evaluation mode.
    '''
    results = {}
    assert len(y_true) == len(y_pred)

    # Classification report
    classification_report = sklearn.metrics.classification_report(
        y_true, y_pred, labels=labels, target_names=target_names,
        sample_weight=None, digits=4)
    utils_plots.plot_classification_report(
        classification_report,
        title='Classification report for epoch {0} in {1} ({2} evaluation)\n'.format(
            epoch_number, dataset_type, evaluation_mode),
        cmap='RdBu')
    plt.savefig(os.path.join(
        stats_graph_folder,
        'classification_report_for_epoch_{0:04d}_in_{1}_{2}_evaluation.{3}'.format(
            epoch_number, dataset_type, evaluation_mode,
            parameters['plot_format'])),
        dpi=300, format=parameters['plot_format'], bbox_inches='tight')
    plt.close()
    results['classification_report'] = classification_report

    # F1 scores (weighted, micro, and macro averages, plus one score per
    # label), expressed as percentages
    results['f1_score'] = {}
    for f1_average_style in ['weighted', 'micro', 'macro']:
        results['f1_score'][f1_average_style] = sklearn.metrics.f1_score(
            y_true, y_pred, average=f1_average_style, labels=labels) * 100
    results['f1_score']['per_label'] = [
        x * 100 for x in sklearn.metrics.precision_recall_fscore_support(
            y_true, y_pred, average=None, labels=labels)[2].tolist()]

    # Confusion matrix over all labels (including 'O'), plotted as a heatmap
    confusion_matrix = sklearn.metrics.confusion_matrix(
        y_true, y_pred, labels=labels_with_o)
    results['confusion_matrix'] = confusion_matrix.tolist()
    title = 'Confusion matrix for epoch {0} in {1} ({2} evaluation)\n'.format(
        epoch_number, dataset_type, evaluation_mode)
    xlabel = 'Predicted'
    ylabel = 'True'
    xticklabels = yticklabels = target_names_with_o
    utils_plots.heatmap(confusion_matrix, title, xlabel, ylabel, xticklabels,
                        yticklabels, figure_width=40, figure_height=20,
                        correct_orientation=True, fmt="%d",
                        remove_diagonal=True)
    plt.savefig(os.path.join(
        stats_graph_folder,
        'confusion_matrix_for_epoch_{0:04d}_in_{1}_{2}_evaluation.{3}'.format(
            epoch_number, dataset_type, evaluation_mode,
            parameters['plot_format'])),
        dpi=300, format=parameters['plot_format'], bbox_inches='tight')
    plt.close()

    # Accuracy, expressed as a percentage
    results['accuracy_score'] = sklearn.metrics.accuracy_score(y_true, y_pred) * 100

    return results
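# Minimal usage sketch for assess_model (illustrative only: the labels,
# folder, and `parameters` values below are assumptions, not taken from a
# real NeuroNER run):
#
#     parameters = {'plot_format': 'pdf'}
#     # Indices 1 and 2 are entity labels; index 0 is the 'O' (outside) label.
#     scores = assess_model(
#         y_pred=[1, 2, 0, 1], y_true=[1, 2, 0, 2],
#         labels=[1, 2], target_names=['PER', 'LOC'],
#         labels_with_o=[0, 1, 2], target_names_with_o=['O', 'PER', 'LOC'],
#         dataset_type='valid', stats_graph_folder='./stats',
#         epoch_number=3, parameters=parameters, evaluation_mode='bio')
#     print(scores['f1_score']['micro'], scores['accuracy_score'])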
def evaluate_model(results, dataset, y_pred_all, y_true_all, stats_graph_folder,
                   epoch_number, epoch_start_time, output_filepaths, parameters,
                   verbose=False):
    '''
    Evaluate the current epoch's predictions on every available dataset split,
    in each evaluation mode ('bio', 'token', 'binary'), then score the output
    files with the CoNLL evaluation script.
    '''
    results['execution_details']['num_epochs'] = epoch_number
    results['epoch'][epoch_number] = []
    result_update = {}

    for dataset_type in ['train', 'valid', 'test']:
        if dataset_type not in output_filepaths.keys():
            continue
        print('Generating plots for the {0} set'.format(dataset_type))
        result_update[dataset_type] = {}
        y_pred_original = y_pred_all[dataset_type]
        y_true_original = y_true_all[dataset_type]

        for evaluation_mode in ['bio', 'token', 'binary']:
            (y_pred, y_true, label_indices, label_names, label_indices_with_o,
             label_names_with_o) = remap_labels(y_pred_original, y_true_original,
                                                dataset,
                                                evaluation_mode=evaluation_mode)
            result_update[dataset_type][evaluation_mode] = assess_model(
                y_pred, y_true, label_indices, label_names,
                label_indices_with_o, label_names_with_o, dataset_type,
                stats_graph_folder, epoch_number, parameters,
                evaluation_mode=evaluation_mode, verbose=verbose)
            # Promote the main evaluation mode's scores to the top level of
            # this split's results
            if parameters['main_evaluation_mode'] == evaluation_mode:
                result_update[dataset_type].update(
                    result_update[dataset_type][evaluation_mode])

    result_update['time_elapsed_since_epoch_start'] = (
        time.time() - epoch_start_time)
    result_update['time_elapsed_since_train_start'] = (
        time.time() - results['execution_details']['train_start'])
    results['epoch'][epoch_number].append(result_update)

    # CoNLL evaluation: run the perl script shipped with the python package
    # on each split's output file and parse its scores
    for dataset_type in ['train', 'valid', 'test']:
        if dataset_type not in output_filepaths.keys():
            continue
        package_name = 'neuroner'
        root_dir = os.path.dirname(
            pkg_resources.resource_filename(package_name, '__init__.py'))
        print(root_dir)
        conll_evaluation_script = os.path.join(root_dir, 'conlleval')
        conll_output_filepath = '{0}_conll_evaluation.txt'.format(
            output_filepaths[dataset_type])
        shell_command = 'perl {0} < {1} > {2}'.format(
            conll_evaluation_script, output_filepaths[dataset_type],
            conll_output_filepath)
        print('shell_command: {0}'.format(shell_command))
        os.system(shell_command)

        # get_parsed_conll_output is expected to return a dict with (at least)
        # an 'all' entry holding the overall 'f1' and 'accuracy' values
        conll_parsed_output = utils_nlp.get_parsed_conll_output(
            conll_output_filepath)
        epoch_results = results['epoch'][epoch_number][0][dataset_type]
        epoch_results['conll'] = conll_parsed_output
        epoch_results['f1_conll'] = {}
        epoch_results['f1_conll']['micro'] = conll_parsed_output['all']['f1']

        # When CoNLL is the main evaluation mode, its scores override the
        # sklearn-based ones
        if parameters['main_evaluation_mode'] == 'conll':
            epoch_results['f1_score'] = {}
            epoch_results['f1_score']['micro'] = conll_parsed_output['all']['f1']
            epoch_results['accuracy_score'] = (
                conll_parsed_output['all']['accuracy'])

        utils_plots.plot_classification_report(
            epoch_results['conll'],
            title='Classification report for epoch {0} in {1} ({2} evaluation)\n'.format(
                epoch_number, dataset_type, 'conll'),
            cmap='RdBu', from_conll_json=True)
        plt.savefig(os.path.join(
            stats_graph_folder,
            'classification_report_for_epoch_{0:04d}_in_{1}_conll_evaluation.{2}'.format(
                epoch_number, dataset_type, parameters['plot_format'])),
            dpi=300, format=parameters['plot_format'], bbox_inches='tight')
        plt.close()
    # Plot metric-vs-epoch curves only when training with both train and
    # valid splits available
    if (parameters['train_model'] and 'train' in output_filepaths.keys()
            and 'valid' in output_filepaths.keys()):
        plot_f1_vs_epoch(results, stats_graph_folder, 'f1_score', parameters)
        plot_f1_vs_epoch(results, stats_graph_folder, 'accuracy_score', parameters)
        plot_f1_vs_epoch(results, stats_graph_folder, 'f1_conll', parameters)

    results['execution_details']['train_duration'] = (
        time.time() - results['execution_details']['train_start'])
    save_results(results, stats_graph_folder)
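# Sketch of how evaluate_model might be invoked once per epoch from a
# training loop (the variable names here are assumptions for illustration,
# not the actual caller):
#
#     epoch_start_time = time.time()
#     # ... train one epoch, writing per-split predictions to
#     # output_filepaths and collecting y_pred_all / y_true_all ...
#     evaluate_model(results, dataset, y_pred_all, y_true_all,
#                    stats_graph_folder, epoch_number, epoch_start_time,
#                    output_filepaths, parameters)
#     print(results['epoch'][epoch_number][0]['valid']['f1_conll']['micro'])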