Example 1
def main(select_clusters, cluster_num, feature_num, scatter_num, epsilon, parallel_plots, box_plots, shap_plots, corr_plots):
    """
    """
    df, clusterer = dataset.default_dataset(paths=["../data/anonimized_io.csv"])
    # Remove many of the columns up front because SHAP sometimes picks them up even though
    # we do not want them as model inputs. Use a sorted list so pandas indexing is deterministic.
    input_columns = sorted({c for c in df.columns if 'perc' in c.lower() or 'LOG10' in c}.difference([
        "POSIX_LOG10_agg_perf_by_slowest", "LOG10_runtime",
        "POSIX_LOG10_SEEKS", "POSIX_LOG10_MODE", "POSIX_LOG10_STATS",
        'POSIX_ACCESS1_COUNT_PERC', 'POSIX_ACCESS2_COUNT_PERC',
        'POSIX_ACCESS3_COUNT_PERC', 'POSIX_ACCESS4_COUNT_PERC']))

    # 
    # Since almost any epsilon is going to give us more clusters than we want (mostly outliers), here we refine them to TOP_CLUSTERS
    #
    if select_clusters is None: 
        clusters = HDBSCAN_to_DBSCAN(clusterer.condensed_tree_.to_networkx(), df.shape[0], epsilon=epsilon)
        top_indexes = reversed(np.argsort([len(x) for x in clusters])[-cluster_num:])
        top_clusters = [clusters[idx] for idx in top_indexes]
    else:
        # Given indexes of nodes in the condensed tree, find the reachable leaves
        def get_leaves(G): 
            return {x for x in G.nodes() if G.out_degree(x)==0 and G.in_degree(x)==1}

        G = clusterer.condensed_tree_.to_networkx()
        top_clusters = [get_leaves(nx.dfs_tree(G, c)) for c in select_clusters]

    #
    # Plotting setup
    #
    fig = plt.figure()
    plt.subplots_adjust(left=0.05, right=0.95, top=0.95, bottom=0.05, wspace=0.6)
    rows =              int(parallel_plots)    + int(box_plots)    + int(shap_plots)    + int(corr_plots)     + int(scatter_num)
    height_ratios = [0.8] * parallel_plots + [0.3] * box_plots + [0.5] * shap_plots + [0.4] * corr_plots + [0.25] * scatter_num # Hardcoded for now 
    gs = fig.add_gridspec(rows, len(top_clusters), height_ratios=height_ratios, hspace=0.55)
    next_row = 0

    #
    # Plot per-cluster parallel-coordinate plots in the next row
    #
    if parallel_plots:
        logging.info("Plotting parallel plots")
        for c_idx, cluster in enumerate(top_clusters): 
            plot_parallel_coordinates(gs[next_row, c_idx], df.iloc[list(cluster)], dpi=fig.dpi_scale_trans, name=str(c_idx))

        next_row += 1


    #
    # Next, let's fit linear models and predict POSIX_agg_perf_by_slowest
    # We will plot a boxplot of the test set points 
    #
    if box_plots:
        logging.info("Training models")
        for c_idx, cluster in enumerate(top_clusters): 
            box_ax       = fig.add_subplot(gs[next_row, c_idx], sharey=box_ax if c_idx != 0 else None)   # noqa: F821
            scatter_axes = [fig.add_subplot(gs[next_row + 2 + s, c_idx]) for s in range(scatter_num)] if scatter_num > 0 else None
            corr_ax      = fig.add_subplot(gs[next_row + 2, c_idx]) if corr_plots else None
            shap_ax      = fig.add_subplot(gs[next_row + 1, c_idx]) if shap_plots else None
            train_and_plot_errors(box_ax, shap_ax, corr_ax, scatter_axes, df.iloc[list(cluster)][input_columns], df.iloc[list(cluster)].POSIX_LOG10_agg_perf_by_slowest, feature_num)

        next_row += 3

    plt.show()
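
HDBSCAN_to_DBSCAN, plot_parallel_coordinates, and train_and_plot_errors are project helpers that are not shown in this example. As a rough, hypothetical sketch of the flattening step only (not the project's helper), the hdbscan package can also cut the single-linkage tree at a fixed epsilon; the use of the single_linkage_tree_ API below is an assumption:

import numpy as np

def flat_clusters_at_epsilon(clusterer, epsilon, min_cluster_size=5):
    # Cut the single-linkage tree at distance `epsilon`; each point receives a
    # cluster label, with -1 marking noise (a DBSCAN*-style extraction).
    labels = clusterer.single_linkage_tree_.get_clusters(epsilon, min_cluster_size)
    # Group point indices by label, discard noise, and return largest clusters first.
    clusters = [np.where(labels == lab)[0] for lab in np.unique(labels) if lab != -1]
    return sorted(clusters, key=len, reverse=True)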
Example 2
def main():
    df, _ = dataset.default_dataset(paths=["../data/anonimized_io.csv"])

    fig, (ax1, ax2) = plt.subplots(1, 2, sharey=True, figsize=(10, 4))
    plt.rcParams["font.family"] = "Times New Roman"
    plt.rcParams["font.size"] = 14
    options = {"fontname": "Times New Roman", "fontsize": 18}
    options_small = {"fontname": "Times New Roman", "fontsize": 16}
    plt.subplots_adjust(wspace=0.05)

    # First panel: throughput vs. I/O volume. The features are log10-valued but the tick labels
    # below assume powers of two, so we multiply by 3 (approximating log2(10) ~= 3.32), matching
    # the old code that used log2 features.
    hexbin = ax1.hexbin(x=df.POSIX_LOG10_total_bytes*3, y=df.POSIX_LOG10_agg_perf_by_slowest*3, bins="log", linewidths=0.1, cmap="bone_r", gridsize=64, vmax=3*10**4)

    ax1.set_xticks([10, 20, 30, 40, 50])
    ax1.set_xticklabels(["KiB", "MiB", "GiB", "TiB", "PiB"], **options_small)
    ax1.set_yticks([-10, 0, 10, 20, 30])
    ax1.set_yticklabels(["KiB / s", "MiB / s", "GiB / s", "TiB / s", "PiB / s"], **options_small)
    ax1.set_ylabel("I/O Throughput", **options)
    ax1.set_xlabel("I/O Volume", **options)
    ax1.grid(alpha=0.3)

    # Second panel: throughput vs. number of processes
    hexbin = ax2.hexbin(x=df.LOG10_nprocs*3, y=df.POSIX_LOG10_agg_perf_by_slowest*3, bins="log", linewidths=0.1, cmap="bone_r", gridsize=64, vmax=3*10**4)

    ax2.set_xticks([0, 4, 8, 12, 16])
    ax2.set_xticklabels([1, 16, 256, 4096, 65536], **options_small)
    ax2.set_yticks([-10, 0, 10, 20, 30])
    ax2.set_yticklabels(["KiB / s", "MiB / s", "GiB / s", "TiB / s", "PiB / s"], **options_small)
    ax2.set_xlabel("Number of processes", **options)
    ax2.grid(alpha=0.3)

    cbar = plt.colorbar(hexbin, ax=ax2)
    cbar.set_label("Number of jobs", **options)

    plt.show()
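
For reference, the factor of three above approximates the exact base change between the log10-valued features and the power-of-two tick labels; the exact factor is log2(10) ≈ 3.32:

import math

log10_bytes = 6.0                        # a job that moved 10**6 bytes
exact = log10_bytes * math.log2(10)      # ~19.93 on the log2 axis, i.e. just under one MiB
approx = log10_bytes * 3                 # ~18.0, the approximation used in the plot above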
Example 3
def main():
    df, clusterer = dataset.default_dataset(
        paths=["../data/anonimized_io.csv"])
    print("Loaded dataset and HDBSCAN clusterer")

    ct = clusterer.condensed_tree_
    G = ct.to_networkx()
    print("Converted condensed tree to networkX graph")

    split_multidegree_nodes(G)
    print("Split multi-degree nodes into multiple 2-degree nodes")

    CG = build_condensed_graph(G, 3., 1000, dont_merge=[])
    print("Built condensed graph with {} nodes and {} edges".format(
        len(CG.nodes), len(CG.edges)))

    print("Drawing the tree")
    # Keep only the percentage-based columns; use a sorted list for deterministic column order
    log_columns = sorted(c for c in df.columns if 'perc' in c.lower())
    draw_circular_tree(CG, G, df[log_columns], log_columns)
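
dataset.default_dataset, split_multidegree_nodes, build_condensed_graph, and draw_circular_tree are project helpers not shown here. A minimal sketch, assuming a plain CSV of numeric features and the hdbscan package, of how an equivalent (df, clusterer) pair could be produced (the column filter and min_cluster_size are assumptions):

import hdbscan
import pandas as pd

def load_and_cluster(path, min_cluster_size=100):
    # Load the anonymized I/O log and keep only the numeric log10/percentage features.
    df = pd.read_csv(path)
    features = [c for c in df.columns if 'LOG10' in c or 'perc' in c.lower()]
    # Fit HDBSCAN; clusterer.condensed_tree_ then exposes to_networkx(), as used above.
    clusterer = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size).fit(df[features])
    return df, clusterer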
Example 4
def recursive_permutation_feature_elimination():
    """
    Repeatedly runs permutation feature importance, determines the least important features, drops them,
    and repeats the process until only one feature is left.
    """
    random.seed(0)
    np.random.seed(0)

    df, _ = dataset.default_dataset(paths=["../data/anonimized_io.csv"])

    # Extract IO throughput
    IO_log10_throughput = df.POSIX_LOG10_agg_perf_by_slowest
    df.drop(columns=[
        "POSIX_RAW_agg_perf_by_slowest", "POSIX_LOG10_agg_perf_by_slowest",
        "LOG10_runtime"
    ],
            inplace=True)

    # Drop nonessential features
    log_columns = list(
        set([x for x in df.columns if "LOG10" in x or "perc" in x.lower()]))
    df = df[log_columns]

    # Optionally take a subset of the dataset to speed up computation (disabled by default)
    # sample = random.sample(range(df.shape[0]), 100000)
    # df = df.iloc[sample]
    # IO_log10_throughput = IO_log10_throughput[sample]

    mape_results = []
    dropped_features = []

    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
        df, IO_log10_throughput, test_size=0.3, shuffle=True)

    while X_train.shape[1] > 0:
        print("Dataset size: {}".format(X_train.shape))

        model = xgb.XGBRegressor(obj=huber_approx_obj)
        model.fit(X_train, y_train, eval_metric=huber_approx_obj)

        mape_results.append(
            mean_absolute_percentage_error(y_test, model.predict(X_test)))
        print("Model achieved MAPE value of {}".format(mape_results[-1]))

        result = permutation_importance(model,
                                        X_train,
                                        y_train,
                                        n_repeats=5,
                                        n_jobs=8,
                                        random_state=0xdeadbeef)

        least_important_feature_index = np.argmin(result.importances_mean)
        dropped_features.append(X_train.columns[least_important_feature_index])

        print("Dropping feature {}".format(
            X_train.columns[least_important_feature_index]))
        X_train = X_train.drop(
            columns=X_train.columns[least_important_feature_index])
        X_test = X_test.drop(
            columns=X_test.columns[least_important_feature_index])

    return mape_results, dropped_features
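
huber_approx_obj and mean_absolute_percentage_error are defined elsewhere in the project. For context, a common pseudo-Huber objective and a plain MAPE could look like the sketch below; the argument order expected by XGBoost custom objectives depends on the xgboost version, so treat the signatures as assumptions:

import numpy as np

def huber_approx_obj(y_true, y_pred, delta=1.0):
    # Pseudo-Huber loss: return the gradient and hessian, as XGBoost custom objectives expect.
    d = y_pred - y_true
    scale = 1 + (d / delta) ** 2
    scale_sqrt = np.sqrt(scale)
    grad = d / scale_sqrt
    hess = 1 / (scale * scale_sqrt)
    return grad, hess

def mean_absolute_percentage_error(y_true, y_pred):
    # MAPE in percent; ignores the zero-denominator corner case.
    y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100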
Example 5
def main(top_apps=6, jobs_per_app=16):
    df, _ = dataset.default_dataset(paths=["../data/anonimized_io.csv"])
    columns = sorted({
        c for c in df.columns if 'perc' in c.lower() or 'log10' in c.lower()
    }.difference(["POSIX_LOG10_agg_perf_by_slowest"]))

    top_applications = Counter(df.apps_short).most_common()[:top_apps]
    top_applications = [x[0] for x in top_applications]  # get just the names

    # Sample a fixed number of jobs from each of the top applications
    new_df = pd.concat(
        [df[df.apps_short == app].sample(jobs_per_app) for app in top_applications])

    # Rename app names to preserve anonymity
    mapping = {
        top_applications[0]: "Climate",
        top_applications[1]: "Materials",
        top_applications[2]: "Cosmology",
        top_applications[3]: "Fluid dynamics",
        top_applications[4]: "Benchmark 1",
        top_applications[5]: "Benchmark 2"
    }

    new_df.apps_short = new_df.apps_short.map(mapping)
    top_applications = [mapping[x] for x in top_applications]

    # Calculate distances and wrap in a dataframe
    l1_distances = manhattan_distances(new_df[columns], new_df[columns])
    l2_distances = euclidean_distances(new_df[columns], new_df[columns])
    l1_distances = pd.DataFrame(l1_distances,
                                columns=new_df.index,
                                index=new_df.index)
    l2_distances = pd.DataFrame(l2_distances,
                                columns=new_df.index,
                                index=new_df.index)

    # Get colors for each row and column
    palette = sns.color_palette(n_colors=top_apps)
    lut = {app: color for app, color in zip(top_applications, palette)}
    row_colors = new_df.apps_short.map(lut)
    row_colors.name = ""

    # Draw
    cg1 = sns.clustermap(l1_distances,
                         row_colors=row_colors,
                         col_colors=row_colors,
                         row_cluster=True,
                         col_cluster=True,
                         cbar_pos=(.1, .1, .03, 0.6),
                         xticklabels=False,
                         yticklabels=False,
                         robust=True,
                         figsize=(8, 8))
    cg2 = sns.clustermap(l2_distances,
                         row_colors=row_colors,
                         col_colors=row_colors,
                         row_cluster=True,
                         col_cluster=True,
                         cbar_pos=(.1, .1, .03, 0.6),
                         xticklabels=False,
                         yticklabels=False,
                         robust=True,
                         figsize=(8, 8))

    # Hardcode labels for anonymity
    for label in top_applications:
        cg1.ax_col_dendrogram.bar(0, 0, color=lut[label], label=label)
        cg2.ax_col_dendrogram.bar(0, 0, color=lut[label], label=label)

    # Plot labels
    cg1.ax_col_dendrogram.legend(loc="center", ncol=top_apps // 3, fontsize=25)
    cg2.ax_col_dendrogram.legend(loc="center", ncol=top_apps // 3, fontsize=25)

    # Hide the dendrogram - this will also kill the legend, so we should generate two graphs and then stitch them
    # together
    cg1.ax_row_dendrogram.set_visible(False)
    cg1.ax_col_dendrogram.set_visible(False)
    cg2.ax_row_dendrogram.set_visible(False)
    cg2.ax_col_dendrogram.set_visible(False)

    # Save figures
    # cg1.savefig('pdfs/l1_distance.pdf', bbox_inches='tight')
    # cg2.savefig('pdfs/l2_distance.pdf', bbox_inches='tight')

    plt.show()
Example 6
def main(MIN_CLUSTER_SIZE=100):
    df, clusterer = dataset.default_dataset(
        paths=["../data/anonimized_io.csv"])
    input_columns = set([
        c for c in df.columns if 'perc' in c.lower() or 'LOG10' in c
    ]).difference(["POSIX_LOG10_agg_perf_by_slowest"])

    # These epsilons were hand-selected to show good gradients, since the dataset is finicky:
    # small changes in epsilon lead to large changes in the number of clusters.
    epsilons = [9.5, 7, 5, 2.1]
    cluster_sizes = [10, 79, 267, 1077]  # number of clusters obtained at each epsilon above

    results = get_cluster_results(df, input_columns, epsilons,
                                  MIN_CLUSTER_SIZE)
    global_train_errors, global_test_errors = prediction_error(
        df[input_columns], df.POSIX_LOG10_agg_perf_by_slowest)
    global_results = pd.DataFrame({
        "jobs_cluster_size":
        [df.shape[0]] * (len(global_train_errors) + len(global_test_errors)),
        "job_errors":
        list(global_train_errors) + list(global_test_errors),
        "job_in_test_set":
        [False] * len(global_train_errors) + [True] * len(global_test_errors),
        "job_eps":
        [1000] * (len(global_train_errors) + len(global_test_errors))
    })
    results = pd.concat([results, global_results])

    # Let's rescale the errors so that they represent real ratios and not logarithmic differences
    results.job_errors = 10**results.job_errors

    #
    # Plotting
    #
    fig = plt.figure(figsize=(15, 8))
    spec = fig.add_gridspec(ncols=2, nrows=2)

    ax2 = fig.add_subplot(spec[0, 0])
    ax4 = fig.add_subplot(spec[0, 1], sharey=ax2)

    seaborn.boxplot(data=results[results.job_in_test_set == False],
                    x="job_eps",
                    y="job_errors",
                    ax=ax2)
    ax2.set_title("Cluster Training Errors")
    ax2.set_xlabel("Number of clusters")
    ax2.set_xticklabels(list(reversed(cluster_sizes)) + ["Global"])
    ax2.set_yscale('log')

    seaborn.boxplot(data=results[results.job_in_test_set],
                    x="job_eps",
                    y="job_errors",
                    ax=ax4)
    ax4.set_title("Cluster Test Errors")
    ax4.set_xlabel("Number of clusters")
    ax4.set_xticklabels(list(reversed(cluster_sizes)) + ["Global"])

    ax2.set_ylabel("Average Prediction Ratio Error")
    ax4.set_ylabel("Average Prediction Ratio Error")
    ax2.set_yticks(np.arange(1, 3, 0.1), minor=True)
    ax4.set_yticks(np.arange(1, 3, 0.1), minor=True)
    ax2.grid(axis='x', which='both')
    ax4.grid(axis='x', which='both')

    # Plot cumulative histograms of the per-job errors: training set on the left, test set on the right.
    # Create the two axes once and reuse them so that each epsilon's curve lands on the same subplot.
    ax_cm_train = fig.add_subplot(spec[1, 0])
    ax_cm_test = fig.add_subplot(spec[1, 1])

    for eps in reversed([1000] + epsilons):
        x = results[np.logical_and(results.job_in_test_set == False,
                                   results.job_eps == eps)].job_errors
        x = x[x < 3]
        seaborn.distplot(x,
                         norm_hist=True,
                         hist_kws={
                             'cumulative': True,
                             'histtype': 'step',
                             'alpha': 1
                         },
                         ax=ax_cm_train)

        x = results[np.logical_and(results.job_in_test_set,
                                   results.job_eps == eps)].job_errors
        x = x[x < 3]
        seaborn.distplot(x,
                         norm_hist=True,
                         hist_kws={
                             'cumulative': True,
                             'histtype': 'step',
                             'alpha': 1
                         },
                         ax=ax_cm_test)

    for ax_cm in (ax_cm_train, ax_cm_test):
        ax_cm.set_xlim(1, 2)
        ax_cm.set_ylim(0, 1.2)
        ax_cm.legend(list(reversed(cluster_sizes)) + ["Global"])
        ax_cm.set_xlabel("Average Prediction Ratio Error")
        ax_cm.set_ylabel("Percentage of jobs")

    plt.show()
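
get_cluster_results and prediction_error are project helpers not shown here. As a purely hypothetical sketch of the per-job error they appear to produce (absolute differences of log10 throughput, which the 10**x rescaling above turns into multiplicative ratio errors), assuming a simple linear model and scikit-learn:

import numpy as np
import sklearn.linear_model
import sklearn.model_selection

def prediction_error(X, y_log10, test_size=0.3):
    # Fit a regressor on log10 throughput and return absolute log10 errors for
    # the train and test jobs; 10**error then gives the prediction ratio error.
    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
        X, y_log10, test_size=test_size, shuffle=True)
    model = sklearn.linear_model.LinearRegression().fit(X_train, y_train)
    train_errors = np.abs(model.predict(X_train) - y_train)
    test_errors = np.abs(model.predict(X_test) - y_test)
    return train_errors, test_errors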