def test_evaluate_nfold_fail_due_to_invalid_data():
    X = 'not a numpy array...'
    y = '...nor a pandas data structure'
    model = GaussianNB()
    num_folds = 3
    with pytest.raises(ValueError):
        evaluate_nfold(X, y, model, num_folds)
def test_evaluate_nfold_fail_due_to_invalid_bootstrapping_param(classification_data):
    X, y = classification_data
    num_folds = 3
    bootstrapping = 'non-Boolean'
    with pytest.raises(ValueError):
        evaluate_nfold(X, y, GaussianNB(), num_folds,
                       bootstrapping=bootstrapping)
def test_evaluate_nfold_with_numpy_arrays(classification_data):
    X, y = classification_data
    model = GaussianNB()
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.5,
                                                        random_state=1)
    model.fit(X_train, y_train)
    num_folds = 1

    scores = evaluate_nfold(X_test, y_test, model,
                            num_folds)
    assert len(scores) == num_folds
    assert all(0 <= score <= 1 for score in scores)

    num_folds = 3
    scores = evaluate_nfold(X_test, y_test, model,
                            num_folds)
    assert len(scores) == num_folds
    assert all(0 <= score <= 1 for score in scores)
def test_evaluate_nfold_bootstrapping(classification_data):
    X, y = classification_data
    model = GaussianNB()
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.5,
                                                        random_state=1)
    model.fit(X_train, y_train)
    num_folds = 3

    scores = evaluate_nfold(pd.DataFrame(X_test), pd.DataFrame(y_test),
                            model, num_folds, bootstrapping=True)
    assert len(scores) == num_folds
    assert all(0 <= score <= 1 for score in scores)
def main():

    start_time_main = time.time()

    print_info('Reading config files...', ':')
    run_config = config_file_to_dict(config_path + 'run_params.conf')
    data_config = config_file_to_dict(config_path + 'data_params.conf')
    model_config = config_file_to_dict(config_path + 'model_params.conf')
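
    # config_file_to_dict is assumed to return a nested dict keyed by config
    # section, so e.g. run_config[<run mode>] holds that section's settings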

    if run_mode_user in run_config:
        frac_train_sample = run_config[run_mode_user]['frac_train_sample']
        num_test_samples = run_config[run_mode_user]['num_test_samples']
        num_CV_folds = run_config[run_mode_user]['num_CV_folds']
        do_optimize_params = run_config[run_mode_user]['do_optimize_params']
        n_iter = run_config[run_mode_user]['n_iter']
        print_info('Chosen run mode is {}: {}'.format(
            run_mode_user, run_config[run_mode_user]))
    else:
        raise KeyError('{} is not a valid run mode setting ' \
                       '(use, e.g., "run_params")'.format(
            run_mode_user))

    # collection of performance measures to be applied to the test set(s)
    scoring_funcs = ['accuracy_score', 'precision_score', 'recall_score',
                     'f1_score']
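    # these names are assumed to map to the corresponding sklearn.metrics
    # functions inside evaluate_nfold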

    final_results_labels = [
        'dataset', 'model', 'model_params', 'num_test_sets', 'num_CV_folds',
        'elapsed_time_train', 'elapsed_time_test'
    ]
    final_results_labels += ['test_{}_1fold'.format(i) for i in scoring_funcs]
    final_results_labels += ['train_{}'.format(i) for i in scoring_funcs]
    final_results_labels += ['test_{}'.format(i) for i in scoring_funcs]
    final_results_labels += [
        'test_{}_bootstrap'.format(i) for i in scoring_funcs
    ]
    final_results_labels += [
        'test_{}_diff_max'.format(i) for i in scoring_funcs
    ]
    final_results_labels += [
        'test_{}_diff_max_bootstrap'.format(i) for i in scoring_funcs
    ]
    final_results_labels += [
        'test_{}_diff_mean'.format(i) for i in scoring_funcs
    ]
    final_results_labels += [
        'test_{}_diff_std'.format(i) for i in scoring_funcs
    ]
    final_results_labels += [
        'test_{}_diff_mean_bootstrap'.format(i) for i in scoring_funcs
    ]
    final_results_labels += [
        'test_{}_diff_std_bootstrap'.format(i) for i in scoring_funcs
    ]
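
    # resulting columns: bookkeeping fields (dataset, model, parameters, fold
    # counts, timings) plus, per scoring function, the single-fold test score,
    # train and test scores, bootstrapped test scores, and the max/mean/std of
    # the fold-to-fold score differences (with and without bootstrapping)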

    final_results = pd.DataFrame(columns=final_results_labels)

    # loop over all sections of the data params config file
    for d_cnt, d in enumerate(data_config):

        print_info(
            'Processing dataset: {} ({} of {})'.format(d, d_cnt + 1,
                                                       len(data_config)), '=',
            50)

        current_data_results = {}

        current_data_params = data_config[d]
        check_data_config_requirements(current_data_params)

        print_info('Loading data...', ':')
        data = load_data(current_data_params)

        print_info('Preparing target vector...', ':')
        X = data.drop(current_data_params['data_target_col'], axis=1)
        y = data[current_data_params['data_target_col']]

        y = parse_target_labels(
            y, current_data_params['data_target_positive_label'],
            current_data_params['data_target_negative_label'])

        del data

        print_info('Dimensions of feature matrix X: {}'.format(X.shape))
        print_info('Dimensions of target vector y:  {}'.format(y.shape))

        print_info('Splitting the data: splitting off the training sample...',
                   ':')
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=1 -
                                                            frac_train_sample)

        del X, y

        print_info('Preprocessing the data...', ':')
        pm = PreprocessingManager(om.get_session_folder())

        for func in current_data_params['data_preprocessing']:
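            # the boolean argument presumably tells the PreprocessingManager
            # whether to reuse the fit obtained on the training sample (True
            # for the test sample) instead of fitting it anew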
            X_train = getattr(pm, func)(X_train, False)
            X_test = getattr(pm, func)(X_test, True)

        print_class_counts(y_train, 'training', background=0, signal=1)
        print_class_counts(y_test, 'test', background=0, signal=1)

        # hyperparameter optimization, if required
        if num_CV_folds is None:
            print_info('Optimizing the number of cross-validation folds...',
                       ':')
            # .to_numpy() replaces the deprecated DataFrame.as_matrix()
            num_CV_folds = get_optimal_CV_n_folds(X_train.to_numpy(),
                                                  y_train.to_numpy())
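            # note: the optimized fold count stays in num_CV_folds and is
            # therefore reused for all remaining datasets in this run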

        for mod in model_config:

            print_info('Training model: {}'.format(mod), '-', 50)

            try:
                model_params = model_config[mod]
                model = supported_models[mod](**model_params)
            except KeyError:
                raise KeyError('Model {} not supported. Choose a valid input ' \
                               'from this list: {}'.format(mod, supported_models))

            fitkwargs = {'X': X_train, 'y': y_train}
            if do_optimize_params:
                print_info('Optimizing hyperparameters...', ':')
                model = hyperparameter_search(model, n_iter, num_CV_folds)
                if mod != 'GaussianNB':
                    fitkwargs['callback'] = DeltaXStopper(1e-2)
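                    # DeltaXStopper (from skopt.callbacks) ends the search once
                    # consecutive sampled parameter points differ by less than
                    # 1e-2; GaussianNB is skipped, presumably because it has no
                    # hyperparameters worth searching over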

            start_time_train = time.time()
            print_info('Fitting the model...', ':')
            model.fit(**fitkwargs)
            elapsed_time_train = time.time() - start_time_train

            model_parameters = get_search_results(model)

            # evaluate model on the training sample
            print_info('Evaluating the model on the training sample...', ':')
            for scoring_func in scoring_funcs:
                try:
                    model_scores_train = evaluate_nfold(X_train,
                                                        y_train,
                                                        model,
                                                        1,
                                                        scoring=scoring_func)
                    current_data_results['train_{}'.format(
                        scoring_func)] = model_scores_train[0]
                except ValueError:
                    warnings.warn('ValueError when evaluating with {}. ' \
                                  'Ignoring and continuing...'.format(
                                      scoring_func))

            # evaluate model on the test sample(s)
            print_info('Evaluating the model on the test sample(s)...', ':')

            # negative sentinel: overwritten by the first single-fold test score
            test_performance_1fold = -1

            for t in range(1, num_test_samples + 1):

                start_time_test = time.time()

                for scoring_func in scoring_funcs:
                    try:
                        model_scores_test = evaluate_nfold(
                            X_test,
                            y_test,
                            model,
                            t,
                            scoring=scoring_func,
                            bootstrapping=False)
                        model_scores_test_bootstrap = evaluate_nfold(
                            X_test,
                            y_test,
                            model,
                            t,
                            scoring=scoring_func,
                            bootstrapping=True)

                        # record the single-fold reference score only once, on
                        # the first successful evaluation
                        if test_performance_1fold < 0:
                            test_performance_1fold = model_scores_test[0]

                        current_data_results['test_{}_1fold'.format(
                            scoring_func)] = test_performance_1fold

                        current_data_results['test_{}'.format(
                            scoring_func)] = str(model_scores_test)
                        current_data_results['test_{}_bootstrap'.format(
                            scoring_func)] = str(model_scores_test_bootstrap)

                        current_data_results['test_{}_diff_max'.format(
                            scoring_func
                        )] = max(model_scores_test) - min(model_scores_test)

                        current_data_results[
                            'test_{}_diff_max_bootstrap'.format(
                                scoring_func
                            )] = max(model_scores_test_bootstrap) - min(
                                model_scores_test_bootstrap)

                        scores_mean, scores_std = performance_difference(
                            model_scores_test)
                        current_data_results['test_{}_diff_mean'.format(
                            scoring_func)] = scores_mean
                        current_data_results['test_{}_diff_std'.format(
                            scoring_func)] = scores_std

                        scores_mean_bootstrap, scores_std_bootstrap = performance_difference(
                            model_scores_test_bootstrap)
                        current_data_results[
                            'test_{}_diff_mean_bootstrap'.format(
                                scoring_func)] = scores_mean_bootstrap
                        current_data_results[
                            'test_{}_diff_std_bootstrap'.format(
                                scoring_func)] = scores_std_bootstrap

                    except ValueError:
                        warnings.warn('ValueError when evaluating with {}. ' \
                                      'Ignoring and continuing...'.format(
                                          scoring_func))
                        current_data_results['test_{}_1fold'.format(
                            scoring_func)] = -1
                        #current_data_results['test_{}'.format(
                        #    scoring_func)] = "-1"
                        #current_data_results['test_{}_bootstrap'.format(
                        #    scoring_func)] = "-1"
                        current_data_results['test_{}_diff_mean'.format(
                            scoring_func)] = -1
                        current_data_results['test_{}_diff_std'.format(
                            scoring_func)] = -1
                        current_data_results['test_{}_diff_mean_bootstrap'.
                                             format(scoring_func)] = -1
                        current_data_results['test_{}_diff_std_bootstrap'.
                                             format(scoring_func)] = -1

                print_info('Model score differences (mean, std) for {} ' \
                           'test sample folds: {:.5f}, {:.5f}'.format(
                               t, scores_mean, scores_std))

                model_params_string = ','.join(
                    '{}:{}'.format(key, val)
                    for key, val in sorted(model_parameters.items()))

                current_data_results['dataset'] = str(d)
                current_data_results['model'] = str(mod)
                current_data_results['model_params'] = model_params_string
                current_data_results['num_test_sets'] = t
                current_data_results['num_CV_folds'] = num_CV_folds
                current_data_results['elapsed_time_train'] = elapsed_time_train
                current_data_results['elapsed_time_test'] = time.time(
                ) - start_time_test

                # pd.concat replaces the deprecated/removed DataFrame.append
                final_results = pd.concat(
                    [final_results, pd.DataFrame([current_data_results])],
                    ignore_index=True)

        print_info('Creating results plots...', ':')
        scoring_func_plot = 'f1_score'

        train_differences = []

        current_data_plot_nsplits = final_results.query(
            '(dataset=="{}") & (model=="{}")'.format(d, mod))['num_test_sets']

        # explicit conversion to floats is necessary for np.isfinite,
        # which is called implicitly during plotting
        current_data_plot_xyvals = [
            current_data_plot_nsplits.values.astype(np.float32)
        ]
        current_data_plot_xyvals_bootstrap = [
            current_data_plot_nsplits.values.astype(np.float32)
        ]

        current_data_plot_xyvals_max = [
            current_data_plot_nsplits.values.astype(np.float32)
        ]
        current_data_plot_xyvals_max_bootstrap = [
            current_data_plot_nsplits.values.astype(np.float32)
        ]
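
        # each xyvals list starts with the x values (number of test splits);
        # the loop below appends, per model, a y array followed by its error
        # band (std of the score differences, or zeros for the max plots)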

        for mod in model_config:
            current_data_plot_xyvals.append(
                final_results.query('(dataset=="{}") & (model=="{}")'.format(
                    d, mod))['test_{}_diff_mean'.format(
                        scoring_func_plot)].values.astype(np.float32))
            current_data_plot_xyvals.append(
                final_results.query('(dataset=="{}") & (model=="{}")'.format(
                    d, mod))['test_{}_diff_std'.format(
                        scoring_func_plot)].values.astype(np.float32))

            current_data_plot_xyvals_max.append(
                final_results.query('(dataset=="{}") & (model=="{}")'.format(
                    d, mod))['test_{}_diff_max'.format(
                        scoring_func_plot)].values.astype(np.float32))
            current_data_plot_xyvals_max.append(
                np.zeros(current_data_plot_xyvals_max[-1].shape))

            current_data_plot_xyvals_max_bootstrap.append(
                final_results.query('(dataset=="{}") & (model=="{}")'.format(
                    d, mod))['test_{}_diff_max_bootstrap'.format(
                        scoring_func_plot)].values.astype(np.float32))
            current_data_plot_xyvals_max_bootstrap.append(
                np.zeros(current_data_plot_xyvals_max_bootstrap[-1].shape))

            train_differences.append(
                abs(final_results.query('(dataset=="{}") & '\
                                        '(model=="{}")'.format(
                                            d,mod))['train_{}'.format(
                                                scoring_func_plot)].iloc[0] -
                    final_results.query('(dataset=="{}") & (model=="{}")'.format(
                        d,mod))['test_{}_1fold'.format(scoring_func_plot)].iloc[0])
            )

            current_data_plot_xyvals_bootstrap.append(
                final_results.query('(dataset=="{}") & (model=="{}")'.format(
                    d, mod))['test_{}_diff_mean_bootstrap'.format(
                        scoring_func_plot)].values.astype(np.float32))
            current_data_plot_xyvals_bootstrap.append(
                final_results.query('(dataset=="{}") & (model=="{}")'.format(
                    d, mod))['test_{}_diff_std_bootstrap'.format(
                        scoring_func_plot)].values.astype(np.float32))

        xmax_list = [None]
        for i in range(10, 100, 10):
            if num_test_samples > i:
                xmax_list.append(i)

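        # xmax_list starts with None (full x range) and adds a zoom limit at
        # every multiple of ten below num_test_samples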
        for lim in xmax_list:
            current_data_plot = plot_performance_diff(
                *current_data_plot_xyvals,
                labels=[m for m in model_config],
                train_difference=train_differences,
                xmax=lim,
                xlabel='number of samples',
                ylabel='mean performance difference')
            plot_filename = '{}_performance-diff_num-splits_full'.format(d)
            if lim is not None:
                plot_filename += '_zoomed-{}'.format(lim)

            om.save(current_data_plot, plot_filename)

            current_data_plot = plot_performance_diff(
                *current_data_plot_xyvals,
                labels=[m for m in model_config],
                xmax=lim,
                xlabel='number of samples',
                ylabel='mean performance difference')
            plot_filename = '{}_performance-diff_num-splits'.format(d)
            if lim is not None:
                plot_filename += '_zoomed-{}'.format(lim)

            om.save(current_data_plot, plot_filename)

            current_data_plot = plot_performance_diff(
                *current_data_plot_xyvals_max,
                labels=[m for m in model_config],
                xmax=lim,
                xlabel='number of samples',
                ylabel='maximum performance difference')
            plot_filename = '{}_performance-diff_max_num-splits'.format(d)
            if lim is not None:
                plot_filename += '_zoomed-{}'.format(lim)

            om.save(current_data_plot, plot_filename)

            current_data_plot = plot_performance_diff(
                *current_data_plot_xyvals_max_bootstrap,
                labels=[m for m in model_config],
                xmax=lim,
                xlabel='number of samples',
                ylabel='maximum performance difference')
            plot_filename = '{}_performance-diff_max_num-splits_bootstrap'.format(
                d)
            if lim is not None:
                plot_filename += '_zoomed-{}'.format(lim)

            om.save(current_data_plot, plot_filename)

            current_data_plot = plot_performance_diff(
                *current_data_plot_xyvals_bootstrap,
                labels=[m for m in model_config],
                train_difference=train_differences,
                xmax=lim,
                xlabel='number of samples',
                ylabel='mean performance difference')
            plot_filename = '{}_performance-diff_num-splits_full_bootstrap'.format(
                d)
            if lim is not None:
                plot_filename += '_zoomed-{}'.format(lim)

            om.save(current_data_plot, plot_filename)

            current_data_plot = plot_performance_diff(
                *current_data_plot_xyvals_bootstrap,
                labels=[m for m in model_config],
                xmax=lim,
                xlabel='number of samples',
                ylabel='mean performance difference')
            plot_filename = '{}_performance-diff_num-splits_bootstrap'.format(
                d)
            if lim is not None:
                plot_filename += '_zoomed-{}'.format(lim)

            om.save(current_data_plot, plot_filename)

        print_info('Saving the final results...', ':')
        om.save(final_results, '{}_final-results'.format(d))

        final_results_dict = final_results.to_dict('dict')
        final_results_dict['relation'] = str(d)  # needed for ARFF
        final_results_dict['description'] = u''  # needed for ARFF
        om.save(final_results, '{}_final-results'.format(d), to_arff=True)

        print_info('\n')
        print_info(
            'Everything done. (Elapsed overall time: {} seconds)\n'.format(
                time.time() - start_time_main))
def test_evaluate_nfold_fail_due_to_invalid_num_folds(classification_data):
    X, y = classification_data
    num_folds = 'not an integer'
    with pytest.raises(ValueError):
        evaluate_nfold(X, y, GaussianNB(), num_folds)
def test_evaluate_nfold_fail_due_to_zero_folds(classification_data):
    X, y = classification_data
    model = GaussianNB()
    num_folds = 0
    with pytest.raises(ValueError):
        evaluate_nfold(X, y, model, num_folds)
def test_evaluate_nfold_fail_due_to_invalid_scoring_func(classification_data):
    X, y = classification_data
    num_folds = 3
    scoring_func = 'invalid function'
    with pytest.raises(ValueError):
        evaluate_nfold(X, y, GaussianNB(), num_folds, scoring=scoring_func)