Example #1
def generate_main_results():
    """Generate the main results of the experiment."""

    wide_optimal = (
        sort_tbl(
            calculate_wide_optimal(results),
            ovrs_order=OVRS_NAMES, clfs_order=CLFS_NAMES
        )
        .set_index(['Dataset', 'Classifier', 'Metric'])
        .apply(lambda row: make_bold(row, num_decimals=3), axis=1)
        .reset_index()
    )
    wide_optimal['Dataset'] = wide_optimal['Dataset'].apply(
        lambda x: x.title()
        if len(x.split(' ')) == 1
        else ''.join([w[0] for w in x.split(' ')])
    )

    mean_sem_scores = sort_tbl(
        generate_mean_std_tbl_bold(*calculate_mean_sem_scores(results), maximum=True, decimals=3),
        ovrs_order=OVRS_NAMES, clfs_order=CLFS_NAMES
    )
    mean_sem_perc_diff_scores = sort_tbl(
        generate_mean_std_tbl(*calculate_mean_sem_perc_diff_scores(results, ['SMOTE', 'K-SMOTE'])),
        ovrs_order=OVRS_NAMES, clfs_order=CLFS_NAMES
    )
    mean_sem_ranking = sort_tbl(
        generate_mean_std_tbl_bold(*calculate_mean_sem_ranking(results), maximum=False),
        ovrs_order=OVRS_NAMES, clfs_order=CLFS_NAMES
    )
    main_results_names = ('wide_optimal', 'mean_sem_scores', 'mean_sem_perc_diff_scores', 'mean_sem_ranking')

    return zip(main_results_names, (wide_optimal, mean_sem_scores, mean_sem_perc_diff_scores, mean_sem_ranking))
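A minimal usage sketch for the example above (not part of the original script): the function returns an iterator of (name, table) pairs, so the tables can be written out in a single loop. The directory name `analysis_path` is hypothetical, and `results` is assumed to be defined at module level, as the function body implies.

from os.path import join

analysis_path = 'analysis'  # hypothetical output directory
for name, table in generate_main_results():
    # Each table is a pandas DataFrame, so any writer works here.
    table.to_csv(join(analysis_path, f'{name}.csv'), index=False)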
Example #2
def generate_main_results(data_path, results_path):
    """Generate the main results of the experiment."""
    
    # Load dataset
    dataset = load_datasets(data_dir=data_path)[0]

    # Load results
    results = []
    for name in RESULTS_NAMES:
        file_path = join(results_path, f'{name}.pkl')
        results.append(pd.read_pickle(file_path))
        
    # Combine and select results
    results = combine_results(*results)
    results = select_results(results, oversamplers_names=OVRS_NAMES, classifiers_names=CLFS_NAMES)

    # Extract metrics names
    metrics_names, *_ = zip(*METRICS_MAPPING.items())

    # Dataset description
    dataset_description = describe_dataset(dataset)
    
    # Scores
    wide_optimal = calculate_wide_optimal(results).drop(columns='Dataset')
    
    # Ranking
    ranking = calculate_ranking(results).drop(columns='Dataset')
    ranking.iloc[:, 2:] = ranking.iloc[:, 2:].astype(int)
    
    # Percentage difference
    perc_diff_scores = []
    for oversampler in BASELINE_OVRS:
        perc_diff_scores_ovs = calculate_mean_sem_perc_diff_scores(results, [oversampler, 'G-SMOTE'])[0]
        perc_diff_scores_ovs = perc_diff_scores_ovs[['Difference']].rename(columns={'Difference': oversampler})
        perc_diff_scores.append(perc_diff_scores_ovs)
    perc_diff_scores = sort_tbl(
        pd.concat([ranking[['Classifier', 'Metric']], pd.concat(perc_diff_scores, axis=1)], axis=1),
        clfs_order=CLFS_NAMES, ovrs_order=OVRS_NAMES, metrics_order=metrics_names
    )
    perc_diff_scores.iloc[:, 2:] = round(perc_diff_scores.iloc[:, 2:], 2)
    
    # Wilcoxon test
    pvalues = []
    for ovr in OVRS_NAMES[:-1]:
        mask = (wide_optimal['Metric'] != 'accuracy') if ovr == 'NONE' else np.repeat(True, len(wide_optimal))
        pvalues.append(wilcoxon(wide_optimal.loc[mask, ovr], wide_optimal.loc[mask, 'G-SMOTE']).pvalue)
    wilcoxon_results = pd.DataFrame({
        'Oversampler': OVRS_NAMES[:-1],
        'p-value': pvalues,
        'Significance': np.array(pvalues) < ALPHA
    })
        
    # Format results
    main_results = [(MAIN_RESULTS_NAMES[0], dataset_description)]
    for name, result in zip(MAIN_RESULTS_NAMES[1:], (wide_optimal, ranking, perc_diff_scores, wilcoxon_results)):
        if name != 'wilcoxon_results':
            result = sort_tbl(result, clfs_order=CLFS_NAMES, ovrs_order=OVRS_NAMES, metrics_order=metrics_names)
            result['Metric'] = result['Metric'].apply(lambda metric: METRICS_MAPPING[metric])
        if name == 'wide_optimal':
            result.iloc[:, 2:] = result.iloc[:, 2:].apply(lambda row: make_bold(row, True, 3), axis=1)
        elif name == 'ranking':
            result.iloc[:, 2:] = result.iloc[:, 2:].apply(lambda row: make_bold(row, False, 0), axis=1)
        elif name == 'wilcoxon_results':
            result = generate_pvalues_tbl(result)
        main_results.append((name, result))

    return main_results
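The Wilcoxon block in the example above compares each oversampler against G-SMOTE over the optimal scores and flags significance at the `ALPHA` level. The following self-contained sketch reproduces that step on synthetic paired scores; the variable names and the 0.05 threshold are illustrative, not taken from the original script.

import numpy as np
import pandas as pd
from scipy.stats import wilcoxon

rng = np.random.default_rng(0)
scores_baseline = rng.uniform(0.6, 0.9, size=30)               # paired scores of a baseline oversampler
scores_control = scores_baseline + rng.normal(0.02, 0.01, 30)  # paired scores of the control oversampler
pvalue = wilcoxon(scores_baseline, scores_control).pvalue      # paired, two-sided test by default
alpha = 0.05
print(pd.DataFrame({
    'Oversampler': ['baseline'],
    'p-value': [pvalue],
    'Significance': [pvalue < alpha],
}))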
Example #3
def generate_statistical_results():
    """Generate the statistical results of the experiment."""

    # Combine experiments objects
    results = []
    for ratio in UNDERSAMPLING_RATIOS:

        # Generate results   
        partial_results = generate_results(ratio)

        # Extract results
        cols = partial_results.columns
        partial_results = partial_results.reset_index()
        partial_results['Dataset'] = partial_results['Dataset'].apply(lambda name: f'{name}({ratio})')
        partial_results.set_index(['Dataset', 'Oversampler', 'Classifier', 'params'], inplace=True)
        partial_results.columns = cols
        results.append(partial_results)

    # Combine results
    results = combine_results(*results)

    # Calculate statistical results
    friedman_test = sort_tbl(
        generate_pvalues_tbl(apply_friedman_test(results)),
        ovrs_order=OVERSAMPLERS_NAMES, clfs_order=CLASSIFIERS_NAMES
    )
    holms_test = sort_tbl(
        generate_pvalues_tbl(apply_holms_test(results, control_oversampler='G-SMOTE')),
        ovrs_order=OVERSAMPLERS_NAMES[:-1], clfs_order=CLASSIFIERS_NAMES
    )
    statistical_results_names = ('friedman_test', 'holms_test')
    statistical_results = zip(statistical_results_names, (friedman_test, holms_test))

    return statistical_results
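The reset-index/set-index dance inside the loop above only serves to append the undersampling ratio to each dataset name before the per-ratio results are combined. A self-contained sketch of that pattern on a toy frame (all column and index values are made up for illustration):

import pandas as pd

ratio = 50
partial_results = pd.DataFrame(
    {'mean_test_score': [0.81, 0.76]},
    index=pd.MultiIndex.from_tuples(
        [('iris', 'SMOTE', 'LR', 'p1'), ('wine', 'SMOTE', 'LR', 'p1')],
        names=['Dataset', 'Oversampler', 'Classifier', 'params'],
    ),
)
cols = partial_results.columns
partial_results = partial_results.reset_index()
partial_results['Dataset'] = partial_results['Dataset'].apply(lambda name: f'{name}({ratio})')
partial_results.set_index(['Dataset', 'Oversampler', 'Classifier', 'params'], inplace=True)
partial_results.columns = cols
print(partial_results)  # the Dataset level now reads 'iris(50)', 'wine(50)'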
Example #4
def generate_main_results():
    """Generate the main results of the experiment."""

    main_results = {}
    for ratio in UNDERSAMPLING_RATIOS:

        # Generate results
        results = generate_results(ratio)

        # Calculate results
        mean_sem_scores = sort_tbl(
            generate_mean_std_tbl(*calculate_mean_sem_scores(results)),
            ovrs_order=OVERSAMPLERS_NAMES,
            clfs_order=CLASSIFIERS_NAMES)
        mean_sem_perc_diff_scores = sort_tbl(
            generate_mean_std_tbl(*calculate_mean_sem_perc_diff_scores(
                results, ['NO OVERSAMPLING', 'G-SMOTE'])),
            ovrs_order=OVERSAMPLERS_NAMES,
            clfs_order=CLASSIFIERS_NAMES)
        mean_sem_ranking = sort_tbl(
            generate_mean_std_tbl(*calculate_mean_sem_ranking(results)),
            ovrs_order=OVERSAMPLERS_NAMES,
            clfs_order=CLASSIFIERS_NAMES)

        # Populate main results
        main_results_names = ('mean_sem_scores', 'mean_sem_perc_diff_scores',
                              'mean_sem_ranking')
        main_results[ratio] = zip(
            main_results_names,
            (mean_sem_scores, mean_sem_perc_diff_scores, mean_sem_ranking))

    return main_results
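A hedged usage note for the example above: the returned dictionary maps each undersampling ratio to a zip object, which is a single-use iterator and can only be walked once. Converting each value to a dict makes the tables reusable; `generate_main_results` and `UNDERSAMPLING_RATIOS` are assumed to be importable from the surrounding module.

main_results = generate_main_results()
# Materialize the single-use iterators into dictionaries of named tables.
main_results = {ratio: dict(named_tables) for ratio, named_tables in main_results.items()}
mean_sem_scores = main_results[UNDERSAMPLING_RATIOS[0]]['mean_sem_scores']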
Example #5
def generate_statistical_results():
    """Generate the statistical results of the experiment."""

    friedman_test = sort_tbl(
        generate_pvalues_tbl(apply_friedman_test(results)),
        ovrs_order=OVRS_NAMES, clfs_order=CLFS_NAMES
    )
    holms_test = sort_tbl(
        generate_pvalues_tbl_bold(apply_holms_test(results, control_oversampler='K-SMOTE')),
        ovrs_order=OVRS_NAMES[:-1], clfs_order=CLFS_NAMES
    )
    statistical_results_names = ('friedman_test', 'holms_test')
    statistical_results = zip(statistical_results_names, (friedman_test, holms_test))

    return statistical_results
Example #6
def generate_statistical_results():
    """Generate the statistical results of the experiment."""

    # Generate results
    results = generate_results()

    # Calculate statistical results
    friedman_test = sort_tbl(generate_pvalues_tbl(
        apply_friedman_test(results)),
                             ovrs_order=OVERSAMPLERS_NAMES,
                             clfs_order=CLASSIFIERS_NAMES)
    holms_test = sort_tbl(generate_pvalues_tbl(
        apply_holms_test(results, control_oversampler='G-SOMO')),
                          ovrs_order=OVERSAMPLERS_NAMES[:-1],
                          clfs_order=CLASSIFIERS_NAMES)

    # Generate statistical results
    statistical_results_names = ('friedman_test', 'holms_test')
    statistical_results = zip(statistical_results_names,
                              (friedman_test, holms_test))

    return statistical_results
Example #7
def generate_main_results():
    """Generate the main results of the experiment."""

    # Generate results
    results = generate_results()

    # Calculate results
    mean_sem_scores = sort_tbl(
        generate_mean_std_tbl(*calculate_mean_sem_scores(results)),
        ovrs_order=OVERSAMPLERS_NAMES,
        clfs_order=CLASSIFIERS_NAMES)
    keys = mean_sem_scores[['Classifier', 'Metric']]
    mean_sem_perc_diff_scores = []
    for oversampler in ('SMOTE', 'K-MEANS SMOTE', 'SOMO', 'G-SMOTE'):
        perc_diff_scores = sort_tbl(
            generate_mean_std_tbl(*calculate_mean_sem_perc_diff_scores(
                results, [oversampler, 'G-SOMO'])),
            ovrs_order=OVERSAMPLERS_NAMES,
            clfs_order=CLASSIFIERS_NAMES)
        perc_diff_scores = perc_diff_scores.rename(columns={
            'Difference': oversampler
        }).drop(columns=['Classifier', 'Metric'])
        mean_sem_perc_diff_scores.append(perc_diff_scores)
    mean_sem_perc_diff_scores = pd.concat(
        [keys, pd.concat(mean_sem_perc_diff_scores, axis=1)], axis=1)
    mean_sem_ranking = sort_tbl(
        generate_mean_std_tbl(*calculate_mean_sem_ranking(results)),
        ovrs_order=OVERSAMPLERS_NAMES,
        clfs_order=CLASSIFIERS_NAMES)

    # Generate main results
    main_results_names = ('mean_sem_scores', 'mean_sem_perc_diff_scores',
                          'mean_sem_ranking')
    main_results = zip(
        main_results_names,
        (mean_sem_scores, mean_sem_perc_diff_scores, mean_sem_ranking))

    return main_results
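The percentage-difference block above computes one 'Difference' column per baseline oversampler, renames it after that baseline, and glues the columns back onto the (Classifier, Metric) keys. A self-contained sketch of that concatenation pattern on toy data (all values are made up):

import pandas as pd

keys = pd.DataFrame({'Classifier': ['LR', 'LR'], 'Metric': ['f1', 'roc_auc']})
diff_smote = pd.DataFrame({'SMOTE': [1.2, 0.8]})   # 'Difference' column renamed after its baseline
diff_somo = pd.DataFrame({'SOMO': [2.1, 1.5]})
mean_sem_perc_diff_scores = pd.concat(
    [keys, pd.concat([diff_smote, diff_somo], axis=1)], axis=1
)
print(mean_sem_perc_diff_scores)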