Ejemplo n.º 1
0
def generate_main_results(data_path, results_path):
    """Generate the main results of the experiment.

    Loads the first dataset found under ``data_path`` and one pickled results
    table per entry of ``RESULTS_NAMES`` from ``results_path``, then builds the
    dataset description, the wide optimal scores, the ranking, the percentage
    differences against 'G-SMOTE' and the Wilcoxon test table.

    Returns a list of ``(name, table)`` tuples: ``MAIN_RESULTS_NAMES[0]`` paired
    with the dataset description, followed by ``MAIN_RESULTS_NAMES[1:]`` paired
    in order with the wide optimal, ranking, percentage difference and Wilcoxon
    tables.
    """

    # Load dataset (only the first one returned by the loader is used)
    dataset = load_datasets(data_dir=data_path)[0]

    # Load results: one pickle file per results name
    results = [pd.read_pickle(join(results_path, f'{name}.pkl')) for name in RESULTS_NAMES]

    # Combine and select results
    results = combine_results(*results)
    results = select_results(results, oversamplers_names=OVRS_NAMES, classifiers_names=CLFS_NAMES)

    # Extract metrics names (the keys of the mapping, in insertion order)
    metrics_names, *_ = zip(*METRICS_MAPPING.items())

    # Dataset description
    dataset_description = describe_dataset(dataset)

    # Scores
    wide_optimal = calculate_wide_optimal(results).drop(columns='Dataset')

    # Ranking
    ranking = calculate_ranking(results).drop(columns='Dataset')
    ranking.iloc[:, 2:] = ranking.iloc[:, 2:].astype(int)

    # Percentage difference of each baseline oversampler against 'G-SMOTE'
    perc_diff_scores = []
    for oversampler in BASELINE_OVRS:
        perc_diff_scores_ovs = calculate_mean_sem_perc_diff_scores(results, [oversampler, 'G-SMOTE'])[0]
        perc_diff_scores_ovs = perc_diff_scores_ovs[['Difference']].rename(columns={'Difference': oversampler})
        perc_diff_scores.append(perc_diff_scores_ovs)
    perc_diff_scores = sort_tbl(
        pd.concat([ranking[['Classifier', 'Metric']], pd.concat(perc_diff_scores, axis=1)], axis=1),
        clfs_order=CLFS_NAMES,
        ovrs_order=OVRS_NAMES,
        metrics_order=metrics_names,
    )
    perc_diff_scores.iloc[:, 2:] = round(perc_diff_scores.iloc[:, 2:], 2)

    # Wilcoxon test of every oversampler except the last one against 'G-SMOTE'
    pvalues = []
    for ovr in OVRS_NAMES[:-1]:
        # Rows of the 'accuracy' metric are left out when comparing against 'NONE'
        mask = (wide_optimal['Metric'] != 'accuracy') if ovr == 'NONE' else np.repeat(True, len(wide_optimal))
        pvalues.append(wilcoxon(wide_optimal.loc[mask, ovr], wide_optimal.loc[mask, 'G-SMOTE']).pvalue)
    wilcoxon_results = pd.DataFrame({'Oversampler': OVRS_NAMES[:-1], 'p-value': pvalues, 'Significance': np.array(pvalues) < ALPHA})

    # Format results
    main_results = [(MAIN_RESULTS_NAMES[0], dataset_description)]
    for name, result in zip(MAIN_RESULTS_NAMES[1:], (wide_optimal, ranking, perc_diff_scores, wilcoxon_results)):
        if name != 'wilcoxon_results':
            result = sort_tbl(result, clfs_order=CLFS_NAMES, ovrs_order=OVRS_NAMES, metrics_order=metrics_names)
            result['Metric'] = result['Metric'].apply(lambda metric: METRICS_MAPPING[metric])
        if name == 'wide_optimal':
            result.iloc[:, 2:] = result.iloc[:, 2:].apply(lambda row: make_bold(row, True, 3), axis=1)
        elif name == 'ranking':
            result.iloc[:, 2:] = result.iloc[:, 2:].apply(lambda row: make_bold(row, False, 0), axis=1)
        elif name == 'wilcoxon_results':
            # Bind the formatted table to `result` so that it is the one appended
            # below, whether or not the formatter modifies its argument in place.
            result = generate_pvalues_tbl(result)
        main_results.append((name, result))

    return main_results
Ejemplo n.º 2
0
def generate_statistical_results():
    """Compute the Friedman and Holm's test tables over all undersampling ratios.

    Returns a ``zip`` iterator (single use) that yields the pairs
    ``('friedman_test', table)`` and ``('holms_test', table)``.
    """

    def _suffix_dataset_names(table, ratio):
        """Append ``(ratio)`` to the ``Dataset`` values of ``table``."""
        # The column labels are saved and restored around the index round trip
        original_columns = table.columns
        table = table.reset_index()
        table['Dataset'] = table['Dataset'].apply(lambda name: f'{name}({ratio})')
        table = table.set_index(['Dataset', 'Oversampler', 'Classifier', 'params'])
        table.columns = original_columns
        return table

    # One relabelled results table per ratio, merged into a single object
    per_ratio_results = [
        _suffix_dataset_names(generate_results(ratio), ratio)
        for ratio in UNDERSAMPLING_RATIOS
    ]
    combined = combine_results(*per_ratio_results)

    # Friedman test over all oversamplers
    friedman_pvalues = generate_pvalues_tbl(apply_friedman_test(combined))
    friedman_test = sort_tbl(
        friedman_pvalues,
        ovrs_order=OVERSAMPLERS_NAMES,
        clfs_order=CLASSIFIERS_NAMES,
    )

    # Holm's test with 'G-SMOTE' as control; the last oversampler name is
    # excluded from the ordering
    holms_pvalues = generate_pvalues_tbl(apply_holms_test(combined, control_oversampler='G-SMOTE'))
    holms_test = sort_tbl(
        holms_pvalues,
        ovrs_order=OVERSAMPLERS_NAMES[:-1],
        clfs_order=CLASSIFIERS_NAMES,
    )

    return zip(('friedman_test', 'holms_test'), (friedman_test, holms_test))
Ejemplo n.º 3
0
def generate_results():
    """Load every results table named in RESULTS_NAMES and combine them."""

    # Each name maps to a '<name>.pkl' file under RESULTS_PATH
    loaded_tables = [
        pd.read_pickle(join(RESULTS_PATH, f'{name}.pkl'))
        for name in RESULTS_NAMES
    ]

    return combine_results(*loaded_tables)
Ejemplo n.º 4
0
def generate_results(ratio):
    """Load, combine and filter the results stored for the given ratio."""

    # Each name maps to a '<name>_<ratio>.pkl' file under RESULTS_PATH
    pickle_paths = (join(RESULTS_PATH, f'{name}_{ratio}.pkl') for name in RESULTS_NAMES)
    combined = combine_results(*map(pd.read_pickle, pickle_paths))

    # Keep only the classifiers listed in CLASSIFIERS_NAMES
    return select_results(combined, classifiers_names=CLASSIFIERS_NAMES)
Ejemplo n.º 5
0
# Script entry point: loads the datasets and experiment results, then writes
# the analysis tables as CSV files and calls the visualization helpers.
if __name__=='__main__':

    # Resolve the data, results and analysis directories
    data_path, results_path, analysis_path = generate_paths()

    # Load datasets
    datasets = load_datasets(data_dir=data_path)

    # Load results: one '<name>.pkl' file per entry of RESULTS_NAMES
    results = []
    for name in RESULTS_NAMES:
        file_path = join(results_path, f'{name}.pkl')
        results.append(pd.read_pickle(file_path))

    # Combine and select results
    # NOTE(review): `results` is not passed to any call below; presumably the
    # zero-argument helpers read it (and the other names bound here) as
    # module-level globals. Confirm before renaming or moving these bindings.
    results = combine_results(*results)
    results = select_results(results, oversamplers_names=OVRS_NAMES, classifiers_names=CLFS_NAMES)

    # Datasets description, written without the index column
    summarize_multiclass_datasets(datasets).to_csv(join(analysis_path, 'datasets_description.csv'), index=False)

    # Main results: replace each 'Metric' value through METRICS_MAPPING, then
    # write one '<name>.csv' file per table
    main_results = generate_main_results()
    for name, result in main_results:
        result['Metric'] = result['Metric'].map(METRICS_MAPPING)
        result.to_csv(join(analysis_path, f'{name}.csv'), index=False)

    # Visualizations
    make_mean_rank_bar_chart()
    make_score_heatmaps()
    make_resampling_example()