Example 1
# Module-level imports needed by both examples (utils_plots and utils_nlp are
# part of the neuroner package; remap_labels, plot_f1_vs_epoch and save_results
# are helpers defined elsewhere in the same module).
import os
import time

import matplotlib.pyplot as plt
import pkg_resources
import sklearn.metrics

from neuroner import utils_nlp, utils_plots


def assess_model(y_pred, y_true, labels, target_names, labels_with_o, target_names_with_o, dataset_type, stats_graph_folder, epoch_number, parameters,
                 evaluation_mode='bio', verbose=False):
    results = {}
    assert len(y_true) == len(y_pred)

    # Classification report
    classification_report = sklearn.metrics.classification_report(y_true, y_pred, labels=labels, target_names=target_names, sample_weight=None, digits=4)

    utils_plots.plot_classification_report(classification_report,
                                           title='Classification report for epoch {0} in {1} ({2} evaluation)\n'.format(epoch_number, dataset_type,
                                                                                                                        evaluation_mode),
                                           cmap='RdBu')
    plt.savefig(os.path.join(stats_graph_folder, 'classification_report_for_epoch_{0:04d}_in_{1}_{2}_evaluation.{3}'.format(epoch_number, dataset_type,
                                                                                                                            evaluation_mode, parameters['plot_format'])),
                dpi=300, format=parameters['plot_format'], bbox_inches='tight')
    plt.close()
    results['classification_report'] = classification_report

    # F1 scores
    results['f1_score'] = {}
    for f1_average_style in ['weighted', 'micro', 'macro']:
        results['f1_score'][f1_average_style] = sklearn.metrics.f1_score(y_true, y_pred, average=f1_average_style, labels=labels)*100
    results['f1_score']['per_label'] = [x*100 for x in sklearn.metrics.precision_recall_fscore_support(y_true, y_pred, average=None, labels=labels)[2].tolist()]

    confusion_matrix = sklearn.metrics.confusion_matrix(y_true, y_pred, labels=labels_with_o)
    results['confusion_matrix'] = confusion_matrix.tolist()
    title = 'Confusion matrix for epoch {0} in {1} ({2} evaluation)\n'.format(epoch_number, dataset_type, evaluation_mode)
    xlabel = 'Predicted'
    ylabel = 'True'
    xticklabels = yticklabels = target_names_with_o
    utils_plots.heatmap(confusion_matrix, title, xlabel, ylabel, xticklabels, yticklabels,
                        figure_width=40, figure_height=20, correct_orientation=True, fmt="%d", remove_diagonal=True)
    plt.savefig(os.path.join(stats_graph_folder, 'confusion_matrix_for_epoch_{0:04d}_in_{1}_{2}_evaluation.{3}'.format(epoch_number, dataset_type,
                                                                                                                       evaluation_mode, parameters['plot_format'])),
                dpi=300, format=parameters['plot_format'], bbox_inches='tight')
    plt.close()

    # Accuracy
    results['accuracy_score'] = sklearn.metrics.accuracy_score(y_true, y_pred)*100

    return results
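A minimal usage sketch for assess_model (the label indices, names and parameter values below are invented placeholders; only the keys the function actually reads are assumed):

# Hypothetical toy call; 0='O', 1='B-PER', 2='I-PER' are made-up label indices.
y_true = [1, 2, 0, 0, 1]
y_pred = [1, 0, 0, 0, 1]
labels = [1, 2]                            # entity labels only
target_names = ['B-PER', 'I-PER']
labels_with_o = [0, 1, 2]                  # entity labels plus 'O'
target_names_with_o = ['O', 'B-PER', 'I-PER']
parameters = {'plot_format': 'pdf'}        # only key read by assess_model
os.makedirs('./stats', exist_ok=True)      # plots are saved into this folder

results = assess_model(y_pred, y_true, labels, target_names,
                       labels_with_o, target_names_with_o,
                       dataset_type='valid', stats_graph_folder='./stats',
                       epoch_number=0, parameters=parameters)
# results now contains 'classification_report', 'f1_score'
# ('weighted'/'micro'/'macro'/'per_label', all in percent),
# 'confusion_matrix' (raw counts) and 'accuracy_score' (percent).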
Example 2
def evaluate_model(results,
                   dataset,
                   y_pred_all,
                   y_true_all,
                   stats_graph_folder,
                   epoch_number,
                   epoch_start_time,
                   output_filepaths,
                   parameters,
                   verbose=False):
    results['execution_details']['num_epochs'] = epoch_number
    results['epoch'][epoch_number] = []
    result_update = {}

    for dataset_type in ['train', 'valid', 'test']:
        if dataset_type not in output_filepaths.keys():
            continue
        print('Generating plots for the {0} set'.format(dataset_type))
        result_update[dataset_type] = {}
        y_pred_original = y_pred_all[dataset_type]
        y_true_original = y_true_all[dataset_type]

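        # Evaluate under three label remappings (see remap_labels): 'bio' keeps the
        # BIO tags as-is, 'token' drops the B-/I- prefixes, and 'binary' collapses
        # all entity classes into a single named-entity label.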
        for evaluation_mode in ['bio', 'token', 'binary']:
            y_pred, y_true, label_indices, label_names, label_indices_with_o, label_names_with_o = remap_labels(
                y_pred_original,
                y_true_original,
                dataset,
                evaluation_mode=evaluation_mode)
            result_update[dataset_type][evaluation_mode] = assess_model(
                y_pred,
                y_true,
                label_indices,
                label_names,
                label_indices_with_o,
                label_names_with_o,
                dataset_type,
                stats_graph_folder,
                epoch_number,
                parameters,
                evaluation_mode=evaluation_mode,
                verbose=verbose)
            if parameters['main_evaluation_mode'] == evaluation_mode:
                result_update[dataset_type].update(
                    result_update[dataset_type][evaluation_mode])

    result_update['time_elapsed_since_epoch_start'] = time.time() - epoch_start_time
    result_update['time_elapsed_since_train_start'] = time.time() - results['execution_details']['train_start']
    results['epoch'][epoch_number].append(result_update)

    # CoNLL evaluation script
    for dataset_type in ['train', 'valid', 'test']:
        if dataset_type not in output_filepaths.keys():
            continue

        # Run the CoNLL Perl evaluation script bundled with the installed package
        # (rather than a local './conlleval' copy).
        package_name = 'neuroner'
        root_dir = os.path.dirname(
            pkg_resources.resource_filename(package_name, '__init__.py'))
        print(root_dir)
        conll_evaluation_script = os.path.join(root_dir, 'conlleval')

        conll_output_filepath = '{0}_conll_evaluation.txt'.format(
            output_filepaths[dataset_type])
        shell_command = 'perl {0} < {1} > {2}'.format(
            conll_evaluation_script, output_filepaths[dataset_type],
            conll_output_filepath)
        print('shell_command: {0}'.format(shell_command))
        os.system(shell_command)
        conll_parsed_output = utils_nlp.get_parsed_conll_output(
            conll_output_filepath)
        # Attach the parsed conlleval metrics to this epoch's results.
        epoch_results = results['epoch'][epoch_number][0][dataset_type]
        epoch_results['conll'] = conll_parsed_output
        epoch_results['f1_conll'] = {'micro': conll_parsed_output['all']['f1']}
        if parameters['main_evaluation_mode'] == 'conll':
            epoch_results['f1_score'] = {'micro': conll_parsed_output['all']['f1']}
            epoch_results['accuracy_score'] = conll_parsed_output['all']['accuracy']
            utils_plots.plot_classification_report(
                epoch_results['conll'],
                title='Classification report for epoch {0} in {1} (conll evaluation)\n'.format(epoch_number, dataset_type),
                cmap='RdBu',
                from_conll_json=True)
            plt.savefig(os.path.join(stats_graph_folder,
                                     'classification_report_for_epoch_{0:04d}_in_{1}_conll_evaluation.{2}'.format(
                                         epoch_number, dataset_type, parameters['plot_format'])),
                        dpi=300, format=parameters['plot_format'], bbox_inches='tight')
            plt.close()

    if parameters['train_model'] and 'train' in output_filepaths.keys() and 'valid' in output_filepaths.keys():
        plot_f1_vs_epoch(results, stats_graph_folder, 'f1_score', parameters)
        plot_f1_vs_epoch(results, stats_graph_folder, 'accuracy_score',
                         parameters)
        plot_f1_vs_epoch(results, stats_graph_folder, 'f1_conll', parameters)

    results['execution_details']['train_duration'] = time.time() - results['execution_details']['train_start']
    save_results(results, stats_graph_folder)
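For orientation, here is a rough sketch of the shared results dictionary that evaluate_model reads and fills in; the layout is inferred from the accesses above and all values are placeholders:

# Inferred shape of the results dictionary after one call (illustrative values only).
results = {
    'execution_details': {
        'train_start': 1234567890.0,        # set by the caller before training starts
        'num_epochs': 3,
        'train_duration': 812.5,
    },
    'epoch': {
        3: [{                               # one result_update dict per evaluated epoch
            'valid': {
                'bio': {...}, 'token': {...}, 'binary': {...},   # per-mode assess_model outputs
                'f1_score': {...},          # copied from parameters['main_evaluation_mode']
                'conll': {...},             # parsed conlleval output
                'f1_conll': {'micro': 91.2},
            },
            # 'train' and 'test' entries follow the same layout when present
            'time_elapsed_since_epoch_start': 42.0,
            'time_elapsed_since_train_start': 300.0,
        }],
    },
}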