def main(select_clusters, cluster_num, feature_num, scatter_num, epsilon,
         parallel_plots, box_plots, shap_plots, corr_plots):
    """
    Plot a grid of per-cluster diagnostics, one column per selected cluster.

    Rows are laid out top to bottom as: parallel coordinates, error box plot,
    SHAP, correlation, then `scatter_num` scatter rows. Disabled rows are
    skipped, so the remaining ones move up.

    Args:
        select_clusters: Condensed-tree node ids whose reachable leaves form
            the clusters to plot, or None to take the `cluster_num` largest
            clusters returned by `HDBSCAN_to_DBSCAN` at `epsilon`.
        cluster_num: Number of largest clusters kept when `select_clusters`
            is None.
        feature_num: Forwarded to `train_and_plot_errors`.
        scatter_num: Number of scatter rows per cluster.
        epsilon: Forwarded to `HDBSCAN_to_DBSCAN` when `select_clusters` is
            None.
        parallel_plots: Whether to draw the parallel-coordinates row.
        box_plots: Whether to train the per-cluster models and draw the box
            plots. The SHAP, correlation and scatter axes are only created
            and filled as part of this step.
        shap_plots: Whether to reserve (and fill) a SHAP row.
        corr_plots: Whether to reserve (and fill) a correlation row.
    """
    df, clusterer = dataset.default_dataset(paths=["../data/anonimized_io.csv"])

    # We remove a lot of the columns because SHAP sometimes picks them up.
    excluded_columns = [
        "POSIX_LOG10_agg_perf_by_slowest", "LOG10_runtime", "POSIX_LOG10_SEEKS",
        "POSIX_LOG10_MODE", "POSIX_LOG10_STATS",
        'POSIX_ACCESS1_COUNT_PERC', 'POSIX_ACCESS2_COUNT_PERC',
        'POSIX_ACCESS3_COUNT_PERC', 'POSIX_ACCESS4_COUNT_PERC',
    ]
    # A sorted list rather than a set: pandas no longer accepts a set as a
    # column indexer, and a list gives a stable column order.
    input_columns = sorted(
        {c for c in df.columns if 'perc' in c.lower() or 'LOG10' in c}
        .difference(excluded_columns))

    #
    # Since almost any epsilon is going to give us more clusters than we want
    # (mostly outliers), here we refine them to the cluster_num largest ones.
    #
    if select_clusters is None:
        clusters = HDBSCAN_to_DBSCAN(
            clusterer.condensed_tree_.to_networkx(), df.shape[0], epsilon=epsilon)
        top_indexes = reversed(np.argsort([len(x) for x in clusters])[-cluster_num:])
        top_clusters = [clusters[idx] for idx in top_indexes]
    # Given indexes of nodes in the condensed tree, find reachable leaves
    else:
        def get_leaves(graph):
            return {
                x for x in graph.nodes()
                if graph.out_degree(x) == 0 and graph.in_degree(x) == 1
            }

        G = clusterer.condensed_tree_.to_networkx()
        top_clusters = [get_leaves(nx.dfs_tree(G, c)) for c in select_clusters]

    #
    # Plotting setup
    #
    fig = plt.figure()
    plt.subplots_adjust(left=0.05, right=0.95, top=0.95, bottom=0.05, wspace=0.6)

    rows = (int(parallel_plots) + int(box_plots) + int(shap_plots)
            + int(corr_plots) + int(scatter_num))
    # Hardcoded for now
    height_ratios = ([0.8] * int(parallel_plots) + [0.3] * int(box_plots)
                     + [0.5] * int(shap_plots) + [0.4] * int(corr_plots)
                     + [0.25] * int(scatter_num))
    gs = fig.add_gridspec(rows, len(top_clusters), height_ratios=height_ratios, hspace=0.55)
    next_row = 0

    #
    # Plot cluster metadata in the first row
    #
    if parallel_plots:
        logging.info("Plotting parallel plots")
        for c_idx, cluster in enumerate(top_clusters):
            plot_parallel_coordinates(gs[next_row, c_idx], df.iloc[list(cluster)],
                                      dpi=fig.dpi_scale_trans, name=str(c_idx))
        next_row += 1

    #
    # Next, let's fit linear models and predict POSIX_agg_perf_by_slowest
    # We will plot a boxplot of the test set points
    #
    if box_plots:
        logging.info("Training models")

        # Derive each panel's row from the enabled flags so the rows follow
        # the same order as height_ratios. Fixed offsets put the first scatter
        # row on top of the correlation row and ran past the grid whenever an
        # optional row was disabled.
        shap_row = next_row + 1
        corr_row = shap_row + int(shap_plots)
        scatter_row = corr_row + int(corr_plots)

        box_ax = None
        for c_idx, cluster in enumerate(top_clusters):
            # Every box plot shares its y axis with the previous column's.
            box_ax = fig.add_subplot(gs[next_row, c_idx], sharey=box_ax)
            scatter_axes = (
                [fig.add_subplot(gs[scatter_row + s, c_idx]) for s in range(scatter_num)]
                if scatter_num > 0 else None)
            corr_ax = fig.add_subplot(gs[corr_row, c_idx]) if corr_plots else None
            shap_ax = fig.add_subplot(gs[shap_row, c_idx]) if shap_plots else None

            cluster_df = df.iloc[list(cluster)]
            train_and_plot_errors(box_ax, shap_ax, corr_ax, scatter_axes,
                                  cluster_df[input_columns],
                                  cluster_df.POSIX_LOG10_agg_perf_by_slowest,
                                  feature_num)
        next_row = scatter_row + int(scatter_num)

    plt.show()
def main():
    """
    Draw two side-by-side hexbin histograms of job I/O throughput.

    The left panel plots throughput against I/O volume, the right panel
    against the number of processes. Both panels share the y axis and the
    colorbar attached to the right panel.
    """
    df, _ = dataset.default_dataset(paths=["../data/anonimized_io.csv"])

    fig, (ax1, ax2) = plt.subplots(1, 2, sharey=True, figsize=(10, 4))
    plt.rcParams["font.family"] = "Times New Roman"
    plt.rcParams["font.size"] = 14
    options = {"fontname": "Times New Roman", "fontsize": 18}
    options_small = {"fontname": "Times New Roman", "fontsize": 16}
    plt.subplots_adjust(wspace=0.05)

    hexbin_style = {
        "bins": "log",
        "linewidths": 0.1,
        "cmap": "bone_r",
        "gridsize": 64,
        "vmax": 3 * 10**4,
    }

    # One label per tick. A fifth label ("PiB /s") never had a tick to attach
    # to, and current matplotlib raises ValueError on such a mismatch.
    throughput_ticks = [-10, 0, 10, 20]
    throughput_labels = ["KiB / s", "MiB / s", "GiB / s", "TiB / s"]

    # We multiply by three because the old code used log2 and not log10 features
    throughput = df.POSIX_LOG10_agg_perf_by_slowest * 3

    # Plotting the first figure
    ax1.hexbin(x=df.POSIX_LOG10_total_bytes * 3, y=throughput, **hexbin_style)
    ax1.set_xticks([10, 20, 30, 40, 50])
    ax1.set_xticklabels(["KiB", "MiB", "GiB", "TiB", "PiB"], **options_small)
    ax1.set_yticks(throughput_ticks)
    ax1.set_yticklabels(throughput_labels, **options_small)
    ax1.set_ylabel("I/O Throughput", **options)
    ax1.set_xlabel("I/O Volume", **options)
    ax1.grid(alpha=0.3)

    # Plotting the 2nd figure
    hexbin = ax2.hexbin(x=df.LOG10_nprocs * 3, y=throughput, **hexbin_style)
    ax2.set_xticks([0, 4, 8, 12, 16])
    ax2.set_xticklabels([1, 16, 256, 4096, 65536], **options_small)
    ax2.set_yticks(throughput_ticks)
    ax2.set_yticklabels(throughput_labels, **options_small)
    ax2.set_xlabel("Number of processes", **options)
    ax2.grid(alpha=0.3)

    # Both panels use the same color scaling, so one colorbar serves both.
    cbar = plt.colorbar(hexbin, ax=ax2)
    cbar.set_label("Number of jobs", **options)

    plt.show()
def main():
    """
    Build a condensed graph from the HDBSCAN condensed tree and draw it as a
    circular tree over the dataset's percentage columns.
    """
    df, clusterer = dataset.default_dataset(
        paths=["../data/anonimized_io.csv"])
    print("Loaded dataset and HDBSCAN clusterer")

    ct = clusterer.condensed_tree_
    G = ct.to_networkx()
    print("Converted condensed tree to networkX graph")

    # Called for its effect on G; the return value is not used.
    split_multidegree_nodes(G)
    print("Split multi-degree nodes into multiple 2-degree nodes")

    CG = build_condensed_graph(G, 3., 1000, dont_merge=[])
    print("Built condensed graph with {} nodes and {} edges".format(
        len(CG.nodes), len(CG.edges)))

    print("Drawing the tree")
    # One sorted list serves both the column selection and the name list, so
    # the two always agree on order. pandas no longer accepts a set as a
    # column indexer.
    log_columns = sorted({c for c in df.columns if 'perc' in c.lower()})
    draw_circular_tree(CG, G, df[log_columns], log_columns)
def recursive_permutation_feature_elimination():
    """
    Repeatedly runs permutation feature importance, determines the least
    important feature, drops it, and repeats the process until no features
    are left.

    Returns:
        A tuple `(mape_results, dropped_features)`. `mape_results[i]` is the
        test-set MAPE of the model trained before the i-th drop, and
        `dropped_features[i]` is the name of the column dropped after it.
    """
    random.seed(0)
    np.random.seed(0)

    df, _ = dataset.default_dataset(paths=["../data/anonimized_io.csv"])

    # Extract IO throughput
    IO_log10_throughput = df.POSIX_LOG10_agg_perf_by_slowest
    df.drop(columns=[
        "POSIX_RAW_agg_perf_by_slowest", "POSIX_LOG10_agg_perf_by_slowest",
        "LOG10_runtime"
    ], inplace=True)

    # Drop nonessential features. The names are sorted because iterating a
    # set of strings can change order between interpreter runs, which would
    # undo the seeding above (column order feeds the model and argmin ties).
    log_columns = sorted(
        {x for x in df.columns if "LOG10" in x or "perc" in x.lower()})
    df = df[log_columns]

    # Take a subset of the dataset to speed up computation
    # sample = random.sample(range(df.shape[0]), 100000)
    # df = df.iloc[sample]
    # IO_log10_throughput = IO_log10_throughput[sample]

    mape_results = []
    dropped_features = []

    # No random_state is passed, so the split draws from the global NumPy
    # generator seeded above.
    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
        df, IO_log10_throughput, test_size=0.3, shuffle=True)

    while X_train.shape[1] > 0:
        print("Dataset size: {}".format(X_train.shape))

        # NOTE(review): XGBRegressor documents `objective=` for a custom
        # objective; `obj=` is the keyword of xgb.train. Left unchanged
        # because huber_approx_obj's signature is not visible here - confirm
        # the custom loss is actually being applied.
        model = xgb.XGBRegressor(obj=huber_approx_obj)
        model.fit(X_train, y_train, eval_metric=huber_approx_obj)

        mape_results.append(
            mean_absolute_percentage_error(y_test, model.predict(X_test)))
        print("Model achieved MAPE value of {}".format(mape_results[-1]))

        result = permutation_importance(model,
                                        X_train,
                                        y_train,
                                        n_repeats=5,
                                        n_jobs=8,
                                        random_state=0xdeadbeef)

        least_important_feature = X_train.columns[
            np.argmin(result.importances_mean)]
        dropped_features.append(least_important_feature)
        print("Dropping feature {}".format(least_important_feature))

        # Train and test sets hold the same columns, so drop by name in both.
        X_train = X_train.drop(columns=least_important_feature)
        X_test = X_test.drop(columns=least_important_feature)

    return mape_results, dropped_features
def main(top_apps=6, jobs_per_app=16):
    """
    Draw clustered heatmaps of pairwise L1 and L2 distances between jobs
    sampled from the most common applications.

    Args:
        top_apps: Number of most frequent applications (by `apps_short`) to
            include.
        jobs_per_app: Number of jobs sampled from each selected application.
    """
    df, _ = dataset.default_dataset(paths=["../data/anonimized_io.csv"])

    # A sorted list rather than a set: pandas no longer accepts a set as a
    # column indexer.
    columns = sorted({
        c for c in df.columns if 'perc' in c.lower() or 'log10' in c.lower()
    }.difference(["POSIX_LOG10_agg_perf_by_slowest"]))

    # Get just the names of the most common applications
    top_applications = [
        name for name, _ in Counter(df.apps_short).most_common(top_apps)
    ]

    # DataFrame.append was removed in pandas 2.0; concatenate the samples.
    new_df = pd.concat([
        df[df.apps_short == app].sample(jobs_per_app)
        for app in top_applications
    ])

    # Rename app names to preserve anonymity. The first six keep their fixed
    # labels; further applications get generic numbered labels so top_apps is
    # not limited to exactly six.
    anonymous_names = [
        "Climate", "Materials", "Cosmology", "Fluid dynamics", "Benchmark 1",
        "Benchmark 2"
    ]
    anonymous_names += [
        "Application {}".format(i + 1)
        for i in range(len(anonymous_names), len(top_applications))
    ]
    mapping = dict(zip(top_applications, anonymous_names))
    new_df.apps_short = new_df.apps_short.map(mapping)
    top_applications = [mapping[x] for x in top_applications]

    # Calculate distances and wrap in a dataframe
    features = new_df[columns]
    l1_distances = pd.DataFrame(manhattan_distances(features, features),
                                columns=new_df.index,
                                index=new_df.index)
    l2_distances = pd.DataFrame(euclidean_distances(features, features),
                                columns=new_df.index,
                                index=new_df.index)

    # Get colors for each row and column
    palette = sns.color_palette(n_colors=top_apps)
    lut = dict(zip(top_applications, palette))
    row_colors = new_df.apps_short.map(lut)
    row_colors.name = ""

    # Draw
    clustermap_style = {
        "row_colors": row_colors,
        "col_colors": row_colors,
        "row_cluster": True,
        "col_cluster": True,
        "cbar_pos": (.1, .1, .03, 0.6),
        "xticklabels": False,
        "yticklabels": False,
        "robust": True,
        "figsize": (8, 8),
    }
    cg1 = sns.clustermap(l1_distances, **clustermap_style)
    cg2 = sns.clustermap(l2_distances, **clustermap_style)

    # Hardcode labels for anonymity: zero-size bars exist only to give the
    # legend one colored handle per application.
    for label in top_applications:
        cg1.ax_col_dendrogram.bar(0, 0, color=lut[label], label=label)
        cg2.ax_col_dendrogram.bar(0, 0, color=lut[label], label=label)

    # Plot labels; keep at least one legend column for small top_apps.
    legend_columns = max(1, top_apps // 3)
    cg1.ax_col_dendrogram.legend(loc="center", ncol=legend_columns, fontsize=25)
    cg2.ax_col_dendrogram.legend(loc="center", ncol=legend_columns, fontsize=25)

    # Hide the dendrogram - this will also kill the legend, so we should
    # generate two graphs and then stitch them together
    cg1.ax_row_dendrogram.set_visible(False)
    cg1.ax_col_dendrogram.set_visible(False)
    cg2.ax_row_dendrogram.set_visible(False)
    cg2.ax_col_dendrogram.set_visible(False)

    # Save figures
    # cg1.savefig('pdfs/l1_distance.pdf', bbox_inches='tight')
    # cg2.savefig('pdfs/l2_distance.pdf', bbox_inches='tight')
    plt.show()
def main(MIN_CLUSTER_SIZE=100):
    """
    Compare per-cluster and global prediction errors at several epsilons.

    The top row shows box plots of training and test errors per clustering
    granularity; the bottom row shows the matching cumulative histograms.

    Args:
        MIN_CLUSTER_SIZE: Forwarded to `get_cluster_results`.
    """
    df, _ = dataset.default_dataset(paths=["../data/anonimized_io.csv"])

    # A sorted list rather than a set: pandas no longer accepts a set as a
    # column indexer.
    input_columns = sorted({
        c for c in df.columns if 'perc' in c.lower() or 'LOG10' in c
    }.difference(["POSIX_LOG10_agg_perf_by_slowest"]))

    # Hand selected these to show good gradients, since the dataset is finicky.
    # small changes in epsilon lead to large changes in the number of clusters.
    epsilons = [9.5, 7, 5, 2.1]
    cluster_sizes = [10, 79, 267, 1077]

    results = get_cluster_results(df, input_columns, epsilons, MIN_CLUSTER_SIZE)

    global_train_errors, global_test_errors = prediction_error(
        df[input_columns], df.POSIX_LOG10_agg_perf_by_slowest)
    n_train = len(global_train_errors)
    n_test = len(global_test_errors)

    # The global model is stored under a sentinel epsilon of 1000 so it sorts
    # after every real epsilon on the x axis.
    global_results = pd.DataFrame({
        "jobs_cluster_size": [df.shape[0]] * (n_train + n_test),
        "job_errors": list(global_train_errors) + list(global_test_errors),
        "job_in_test_set": [False] * n_train + [True] * n_test,
        "job_eps": [1000] * (n_train + n_test)
    })

    # DataFrame.append was removed in pandas 2.0.
    results = pd.concat([results, global_results])

    # Let's rescale the errors so that they represent real ratios and not
    # logarithmic differences
    results.job_errors = 10**results.job_errors

    in_test = results.job_in_test_set.astype(bool)
    in_train = ~in_test

    # Epsilons ascending (most clusters first), with the global model last.
    tick_labels = list(reversed(cluster_sizes)) + ["Global"]

    #
    # Plotting
    #
    fig = plt.figure(figsize=(15, 8))
    spec = fig.add_gridspec(ncols=2, nrows=2)

    ax2 = fig.add_subplot(spec[0, 0])
    ax4 = fig.add_subplot(spec[0, 1], sharey=ax2)

    seaborn.boxplot(data=results[in_train], x="job_eps", y="job_errors", ax=ax2)
    ax2.set_title("Cluster Training Errors")
    ax2.set_xlabel("Number of clusters")
    ax2.set_xticklabels(tick_labels)
    ax2.set_yscale('log')

    seaborn.boxplot(data=results[in_test], x="job_eps", y="job_errors", ax=ax4)
    ax4.set_title("Cluster Test Errors")
    ax4.set_xlabel("Number of clusters")
    ax4.set_xticklabels(tick_labels)

    ax2.set_ylabel("Average Prediction Ratio Error")
    ax4.set_ylabel("Average Prediction Ratio Error")

    ax2.set_yticks(np.arange(1, 3, 0.1), minor=True)
    ax4.set_yticks(np.arange(1, 3, 0.1), minor=True)

    ax2.grid(axis='x', which='both')
    ax4.grid(axis='x', which='both')

    # Plot the cumulative histograms. The two axes are created once, before
    # the loop: current matplotlib returns a brand-new axes from every
    # add_subplot call, so creating them per epsilon would stack axes on top
    # of each other instead of overlaying the curves.
    cumulative_panels = [
        (fig.add_subplot(spec[1, 0]), in_train),
        (fig.add_subplot(spec[1, 1]), in_test),
    ]
    for ax_cm, _ in cumulative_panels:
        ax_cm.set_xlim(1, 2)
        ax_cm.set_ylim(0, 1.2)

    for eps in reversed([1000] + epsilons):
        for ax_cm, subset in cumulative_panels:
            x = results[subset & (results.job_eps == eps)].job_errors
            x = x[x < 3]
            # NOTE(review): distplot is deprecated in recent seaborn; kept so
            # the rendering stays the same.
            seaborn.distplot(x,
                             norm_hist=True,
                             hist_kws={
                                 'cumulative': True,
                                 'histtype': 'step',
                                 'alpha': 1
                             },
                             ax=ax_cm)

    for ax_cm, _ in cumulative_panels:
        ax_cm.legend(tick_labels)
        ax_cm.set_xlabel("Average Prediction Ratio Error")
        ax_cm.set_ylabel("Percentage of jobs")

    plt.show()