Example 1
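The examples below are excerpts from a larger plotting module and rely on imports and helpers that are not shown. A plausible preamble, assuming pu is a local plot-utilities module, helps a local helpers module, and load_result a pickle-based loader (all non-standard names here are assumptions), might look like this:

# Assumed module-level preamble (a sketch; plot_utils and helpers stand in
# for the project's own utility modules, load_result for its result loader).
import pickle
from pathlib import Path

import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import auc, precision_recall_curve
import datawig

import plot_utils as pu   # assumed: provides figure_setup() and get_fig_size()
import helpers as helps   # assumed: provides load_original_data()


def load_result(path):
    """Assumed helper: unpickle a list of result dictionaries from a .p file."""
    with open(path, 'rb') as f:
        return pickle.load(f)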
def plot_f1_cleaning_local(data, result_name: str):
    """
    Plot the per-column cleaning F1-score of the result stored at
    $data's $results_path with the name $result_name.
    """
    results = load_result(Path(f"{data.results_path}{result_name}.p"))
    local_results = [x for x in results if x.get('label') is not None]

    cleaning_results = [x for x in local_results if x['n_errors_in_dirty'] > 0]
    labels = [data.column_map[c['label']] for c in cleaning_results]
    perf_cleaning = [round(c['error_cleaning'], 2) for c in cleaning_results]

    global_f1_score = [
        x['global_error_cleaning'] for x in results
        if x.get('global_error_cleaning') is not None
    ][0]
    print(f'The run has a global f1-score on dataset {data.title} of '
          f'{round(global_f1_score, 5)}')
    pu.figure_setup()
    fig_size = pu.get_fig_size(25, 5)
    fig = plt.figure(figsize=list(fig_size))
    ax = fig.add_subplot(111)

    x = np.arange(len(labels))
    width = 0.35  # the width of the bars
    rects1 = ax.bar(x, perf_cleaning, width, label='Cleaning')

    ax.set_ylabel('Cleaning F1-Score')
    ax.set_title('Performance Cleaning')

    ax.set_xlabel('Columns with Errors')
    ax.set_xticks(x)
    ax.set_xticklabels(labels)

    ax.bar_label(rects1, padding=3)

    return (fig, ax)
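A minimal usage sketch for the local plotting functions, assuming a data configuration object that exposes results_path, column_map, and title (the dataset_config object and the result name below are hypothetical):

# Hypothetical usage: dataset_config and the result name are assumptions.
fig, ax = plot_f1_cleaning_local(dataset_config, 'my_cleaning_run')
fig.tight_layout()
fig.savefig('f1_cleaning_local.pdf')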
Example 2
def plot_prec_threshold(data, *args):
    global_results, prec_thresh = list(), list()

    p = Path(data.results_path)
    for r_path in p.glob('*.p'):
        result = load_result(r_path)
        # Entries carrying the run's global metrics.
        glob = list(filter(lambda x: x.get('global_error_detection'), result))
        global_results.append(glob)
        # The precision threshold is stored on the first result entry.
        prec_thresh.append(result[0].get('precision_threshold'))

    pu.figure_setup()
    fig_size = pu.get_fig_size(10, 4)
    fig = plt.figure(figsize=list(fig_size))
    ax = fig.add_subplot(111)

    clean = [r[0]['global_error_cleaning'] for r in global_results]
    detect = [r[0]['global_error_detection'] for r in global_results]
    ax.scatter(prec_thresh, clean, label='Error Cleaning Complete Dataset')
    ax.scatter(prec_thresh, detect, label='Error Detecting Complete Dataset')
    ax.legend()

    ax.set_title(
        'Effect Of Precision Threshold on Cleaning and Detection Performance')
    ax.set(xlabel='Precision Threshold', ylabel='F1 Score')
    return (fig, ax)
Example 3
def plot_auc_cleaning_global(data, *args):
    """
    Plot the trend of cleaning models over time using auc
    of the precision-recall curve.
    """
    df_clean = helps.load_original_data(data, load_dirty=False)
    df_dirty = helps.load_original_data(data, load_dirty=True)

    local_results, timestamps = list(), list()

    p = Path(data.results_path)
    for r_path in p.glob('*.p'):
        result = load_result(r_path)
        local_result = list(filter(lambda x: x.get('label'), result))
        # Each result file carries a single run timestamp; keep one value per
        # run so it lines up with the per-run AUC average computed below.
        ts = next((x.get('run_at_timestamp') for x in result
                   if x.get('run_at_timestamp')), None)

        local_results.append(local_result)
        timestamps.append(ts)

    prec, rec = {}, {}
    avg_areas_under_curve_per_run = list()

    for local_result in local_results:  # for each cleaning run
        aucs = list()  # one PR-AUC per class and per RHS of this run
        for r in local_result:  # for each RHS
            df_clean_y_true = df_clean.loc[:, r['label']]
            imputer = datawig.AutoGluonImputer.load(
                output_path='./', model_name=r['model_checksum'])

            df_probas = imputer.predict(df_dirty, return_probas=True)
            for i in imputer.predictor.class_labels:  # for each class
                prec[i], rec[i], _ = precision_recall_curve(
                    df_clean_y_true == i, df_probas.loc[:, i], pos_label=True)
                aucs.append(auc(rec[i], prec[i]))
        avg_areas_under_curve_per_run.append(np.average(aucs))
    pu.figure_setup()
    fig_size = pu.get_fig_size(10, 4)
    fig = plt.figure(figsize=list(fig_size))
    ax = fig.add_subplot(111)
    ax.scatter(timestamps, avg_areas_under_curve_per_run, label='AUC Cleaning')
    ax.legend()

    ax.set(xlabel='Timestamp', ylabel='AUC Cleaning Performance')
    return (fig, ax)
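The metric above is a macro-average of per-class areas under the precision-recall curve. A self-contained sketch of that computation on toy data (the labels and probabilities below are made up for illustration):

import numpy as np
from sklearn.metrics import auc, precision_recall_curve

# Made-up ground truth and predicted class probabilities for three classes.
y_true = np.array(['a', 'b', 'a', 'c', 'b', 'a'])
probas = {
    'a': np.array([0.7, 0.2, 0.6, 0.1, 0.3, 0.8]),
    'b': np.array([0.2, 0.6, 0.2, 0.2, 0.5, 0.1]),
    'c': np.array([0.1, 0.2, 0.2, 0.7, 0.2, 0.1]),
}

aucs = []
for label, scores in probas.items():
    # One-vs-rest precision-recall curve for this class.
    prec, rec, _ = precision_recall_curve(y_true == label, scores)
    aucs.append(auc(rec, prec))  # area under the PR curve

print(f'Macro-averaged PR-AUC: {np.average(aucs):.3f}')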
Example 4
def plot_f1_cleaning_detection_global(data, *args):
    """
    Plot the trend of cleaning models over time.
    """
    p = Path(data.results_path)
    all_results = []
    for r_path in p.glob('*.p'):
        all_results.extend(load_result(r_path))
    global_results = list(
        filter(lambda x: x.get('global_error_detection'), all_results))
    timestamps = list(
        map(lambda x: x.get('run_at_timestamp'),
            filter(lambda x: x.get('run_at_timestamp'), all_results)))

    detection = [x['global_error_detection'] for x in global_results]
    cleaning = [x['global_error_cleaning'] for x in global_results]
    prec_thresholds = [
        x['precision_threshold'] for x in all_results
        if x.get('precision_threshold') is not None
    ]

    print("Plotting Datapoints:")
    for x in zip(detection, cleaning, timestamps, prec_thresholds):
        print('~~~~~')
        print(f'Detection performance: {round(x[0], 5)}')
        print(f'Cleaning performance: {round(x[1], 5)}')
        print(f'Precision Threshold: {x[3]}')
        print(f'Timestamp: {x[2]}')

    pu.figure_setup()
    fig_size = pu.get_fig_size(10, 4)
    fig = plt.figure(figsize=list(fig_size))
    ax = fig.add_subplot(111)
    ax.scatter(timestamps, detection, label='F1 Error Detection')
    ax.scatter(timestamps, cleaning, label='F1 Data Cleaning')
    ax.legend()

    ax.set(xlabel='Timestamp', ylabel='F1 Score')
    return (fig, ax)
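If the run_at_timestamp values are stored as ISO-8601 strings rather than datetime objects, matplotlib will plot them as categorical labels in file order; a small preprocessing step before the scatter call could convert them (the string format assumed here is a guess):

from datetime import datetime

# Assumption: timestamps are ISO-8601 strings such as '2021-07-01T12:00:00'.
timestamps = [datetime.fromisoformat(t) for t in timestamps]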
Example 5
def plot_f1_cleaning_detection_local(data, result_name: str):
    """
    Plot the result stored at $data's $results_path with the name
    $result_name.
    """
    results = load_result(Path(f"{data.results_path}{result_name}.p"))

    local_results = list(filter(lambda x: x.get('label'), results))

    labels = [data.column_map[c['label']] for c in local_results]
    perf_error_detection = [
        round(c['error_detection'], 2) for c in local_results
    ]
    perf_cleaning = [round(c['error_cleaning'], 2) for c in local_results]

    pu.figure_setup()
    fig_size = pu.get_fig_size(25, 5)
    fig = plt.figure(figsize=list(fig_size))
    ax = fig.add_subplot(111)

    x = np.arange(len(labels))
    width = 0.35  # the width of the bars
    rects1 = ax.bar(x - width / 2, perf_cleaning, width, label='Cleaning')
    rects2 = ax.bar(x + width / 2,
                    perf_error_detection,
                    width,
                    label='Error Detection')

    ax.set_ylabel('F1-Score')
    ax.set_title(r'Performance Cleaning \& Error Detection')
    ax.set_xticks(x)
    ax.set_xticklabels(labels)
    ax.legend()

    ax.bar_label(rects1, padding=3)
    ax.bar_label(rects2, padding=3)

    return (fig, ax)