def run_factor_analysis(paths, savedir, cluster_range, algorithms):
    import gc

    # Load matrices
    assert len(paths) > 0
    matrices = []
    with stopwatch("matrix concatenation"):
        for path in paths:
            matrices.append(
                Matrix.load_matrix(os.path.join(path, "y_data_enc.npz")))

    # Combine matrix data if more than 1 matrix
    if len(matrices) > 1:
        matrix = Matrix.vstack(matrices, require_equal_columnlabels=True)
    else:
        matrix = matrices[0]
    del matrices
    gc.collect()

    with stopwatch("preprocessing"):
        # Filter out columns with near zero standard deviation
        # i.e., constant columns
        column_mask = ~stdev_zero(matrix.data, axis=0)
        filtered_columns = matrix.columnlabels[column_mask]
        matrix = matrix.filter(filtered_columns, 'columns')
        print "matrix shape after filter constant: ", matrix.data.shape

        # Scale the data
        standardizer = StandardScaler()
        matrix.data = standardizer.fit_transform(matrix.data)

        # Shuffle the data rows (experiments x metrics)
        exp_shuffle_indices = get_shuffle_indices(matrix.data.shape[0])
        matrix.data = matrix.data[exp_shuffle_indices]

        # Shrink the cluster range if # metrics < max # clusters
        max_clusters = matrix.data.shape[1] + 1
        if max_clusters < cluster_range[1]:
            cluster_range = (cluster_range[0], max_clusters)

    with stopwatch("factor analysis"):
        # Fit the model to calculate the components
        fa = FactorAnalysis()
        fa.fit(matrix.data)

    fa_mask = np.sum(fa.components_ != 0.0, axis=1) > 0.0
    variances = np.sum(np.abs(fa.components_[fa_mask]), axis=1)
    total_variance = np.sum(variances).squeeze()
    print "total variance: {}".format(total_variance)
    var_exp = np.array([np.sum(variances[:i + 1]) / total_variance * 100
                        for i in range(variances.shape[0])])
    factor_cutoff = np.count_nonzero(var_exp < REQUIRED_VARIANCE_EXPLAINED) + 1
    factor_cutoff = min(factor_cutoff, 10)
    print "factor cutoff: {}".format(factor_cutoff)
    for i, var in enumerate(variances):
        print i, var, np.sum(variances[:i + 1]), \
            np.sum(variances[:i + 1]) / total_variance

    components = np.transpose(fa.components_[:factor_cutoff]).copy()
    print "components shape: {}".format(components.shape)
    standardizer = StandardScaler()
    components = standardizer.fit_transform(components)

    # Shuffle factor analysis matrix rows (metrics x factors)
    metric_shuffle_indices = get_shuffle_indices(components.shape[0])
    components = components[metric_shuffle_indices]
    component_columnlabels = matrix.columnlabels[metric_shuffle_indices].copy()

    kmeans = KMeans_(components, cluster_range)
    kmeans.plot_results(savedir, components, component_columnlabels)

    # Compute optimal number of clusters K
    for algorithm in algorithms:
        with stopwatch("compute {} (factors={})".format(algorithm,
                                                        factor_cutoff)):
            kselection = KSelection.new(components, cluster_range,
                                        kmeans.cluster_map_, algorithm)
        print "{} optimal # of clusters: {}".format(
            algorithm, kselection.optimal_num_clusters_)
        kselection.plot_results(savedir)

    metric_clusters = {}
    featured_metrics = {}
    for n_clusters, (cluster_centers, labels, _) in kmeans.cluster_map_.iteritems():
        # For each cluster, calculate the distances of each metric from the
        # cluster center. We use the metric closest to the cluster center.
        mclusters = []
        mfeat_list = []
        for i in range(n_clusters):
            metric_labels = component_columnlabels[labels == i]
            component_rows = components[labels == i]
            centroid = np.expand_dims(cluster_centers[i], axis=0)
            dists = np.empty(component_rows.shape[0])
            for j, row in enumerate(component_rows):
                row = np.expand_dims(row, axis=0)
                dists[j] = cdist(row, centroid, 'euclidean').squeeze()
            order_by = np.argsort(dists)
            metric_labels = metric_labels[order_by]
            dists = dists[order_by]
            mclusters.append((i, metric_labels, dists))

            assert len(OPT_METRICS) > 0
            label_mask = np.zeros(metric_labels.shape[0])
            for opt_metric in OPT_METRICS:
                label_mask = np.logical_or(label_mask,
                                           metric_labels == opt_metric)
            if np.count_nonzero(label_mask) > 0:
                mfeat_list.extend(metric_labels[label_mask].tolist())
            elif len(metric_labels) > 0:
                mfeat_list.append(metric_labels[0])
        metric_clusters[n_clusters] = mclusters
        featured_metrics[n_clusters] = mfeat_list

    for n_clusters, mlist in sorted(featured_metrics.iteritems()):
        savepath = os.path.join(
            savedir, "featured_metrics_{}.txt".format(n_clusters))
        with open(savepath, "w") as f:
            f.write("\n".join(sorted(mlist)))

    for n_clusters, memberships in sorted(metric_clusters.iteritems()):
        cstr = ""
        for i, (cnum, lab, dist) in enumerate(memberships):
            assert i == cnum
            cstr += "---------------------------------------------\n"
            cstr += "CLUSTERS {}\n".format(i)
            cstr += "---------------------------------------------\n\n"
            for l, d in zip(lab, dist):
                cstr += "{}\t({})\n".format(l, d)
            cstr += "\n\n"
        savepath = os.path.join(
            savedir, "membership_{}.txt".format(n_clusters))
        with open(savepath, "w") as f:
            f.write(cstr)
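
# The sketch below is a minimal, self-contained illustration of the metric-pruning
# idea in run_factor_analysis() above: project each metric into factor space,
# cluster the projections with k-means, and keep only the metric closest to each
# cluster center. It uses plain scikit-learn/scipy calls instead of the Matrix,
# KMeans_, and KSelection wrappers; the function name, its parameters, and the
# fixed factor/cluster counts are illustrative assumptions, not part of this
# module's API.
def prune_metrics_sketch(metric_data, metric_names, n_factors=10, n_clusters=5):
    """Sketch only: metric_data is (n_experiments x n_metrics) and
    metric_names labels its columns."""
    import numpy as np
    from scipy.spatial.distance import cdist
    from sklearn.cluster import KMeans
    from sklearn.decomposition import FactorAnalysis

    fa = FactorAnalysis(n_components=n_factors)
    fa.fit(metric_data)
    # Rows of components_ are factors; transpose so each metric becomes a
    # point in factor space (n_metrics x n_factors).
    points = fa.components_.T
    kmeans = KMeans(n_clusters=n_clusters).fit(points)

    representatives = []
    for i in range(n_clusters):
        members = np.where(kmeans.labels_ == i)[0]
        if members.size == 0:
            continue
        centroid = kmeans.cluster_centers_[i].reshape(1, -1)
        dists = cdist(points[members], centroid, 'euclidean').ravel()
        # Keep the metric closest to the cluster center as the representative.
        representatives.append(metric_names[members[np.argmin(dists)]])
    return representatives
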
def run_lasso(dbms, basepaths, savedir, featured_metrics, knobs_to_ignore,
              include_polynomial_features=True):
    import gc

    # Load matrices
    assert len(basepaths) > 0
    Xs = []
    ys = []
    with stopwatch("matrix concatenation"):
        for basepath in basepaths:
            X_path = os.path.join(basepath, "X_data_enc.npz")
            y_path = os.path.join(basepath, "y_data_enc.npz")
            Xs.append(Matrix.load_matrix(X_path))
            ys.append(
                Matrix.load_matrix(y_path).filter(featured_metrics, "columns"))

    # Combine matrix data if more than 1 matrix
    if len(Xs) > 1:
        X = Matrix.vstack(Xs, require_equal_columnlabels=True)
        y = Matrix.vstack(ys, require_equal_columnlabels=True)
    else:
        X = Xs[0]
        y = ys[0]
    del Xs
    del ys
    gc.collect()

    with stopwatch("preprocessing"):
        # Filter out columns with near zero standard
        # deviation (i.e., constant columns)
        if y.shape[1] > 1:
            column_mask = ~stdev_zero(y.data, axis=0)
            filtered_columns = y.columnlabels[column_mask]
            y = y.filter(filtered_columns, 'columns')
        column_mask = ~stdev_zero(X.data, axis=0)
        removed_columns = X.columnlabels[~column_mask]
        print "removed columns = {}".format(removed_columns)
        filtered_columns = set(X.columnlabels[column_mask])
        filtered_columns -= set(knobs_to_ignore)
        filtered_columns = np.array(sorted(filtered_columns))
        X = X.filter(filtered_columns, 'columns')
        print "\ncolumnlabels:", X.columnlabels

        # Dummy-code categorical features
        n_values, cat_feat_indices, _ = dummy_encoder_helper(
            dbms, X.columnlabels)
        if len(cat_feat_indices) > 0:
            encoder = DummyEncoder(n_values, cat_feat_indices)
            encoder.fit(X.data, columnlabels=X.columnlabels)
            X = Matrix(encoder.transform(X.data),
                       X.rowlabels,
                       encoder.columnlabels)

        # Scale the data
        X_standardizer = StandardScaler()
        X.data = X_standardizer.fit_transform(X.data)
        y_standardizer = StandardScaler()
        y.data = y_standardizer.fit_transform(y.data)

        if include_polynomial_features:
            X_poly = PolynomialFeatures()
            X_data = X_poly.fit_transform(X.data)
            X_columnlabels = np.expand_dims(
                np.array(X.columnlabels, dtype=str), axis=0)
            X_columnlabels = X_poly.fit_transform(X_columnlabels).squeeze()
            X = Matrix(X_data, X.rowlabels, X_columnlabels)

        # Shuffle the data rows (experiments x metrics)
        shuffler = Shuffler(shuffle_rows=True, shuffle_columns=False)
        X = shuffler.fit_transform(X, copy=False)
        y = shuffler.transform(y, copy=False)
        assert np.array_equal(X.rowlabels, y.rowlabels)

    gc.collect()
    print "\nfeatured_metrics:", featured_metrics

    with stopwatch("lasso paths"):
        # Fit the model to compute the lasso regularization path
        alphas, coefs, _ = get_coef_range(X.data, y.data)

    # Save model
    np.savez(os.path.join(savedir, "lasso_path.npz"),
             alphas=alphas,
             coefs=coefs,
             feats=X.columnlabels,
             metrics=y.columnlabels)

    with stopwatch("lasso processing"):
        nfeats = X.columnlabels.shape[0]
        lasso = Lasso(alphas, X.columnlabels, coefs)
        print lasso.get_top_summary(nfeats, "")
        top_knobs = get_features_list(lasso.get_top_features(n=nfeats))
        print "\nfeat list length: {}".format(len(top_knobs))
        print "nfeats = {}".format(nfeats)
        top_knobs = lasso.get_top_features(nfeats)
        print top_knobs

        final_ordering = []
        for knob in top_knobs:
            if '#' in knob:
                # Collapse dummy-encoded categorical knobs (name#value)
                # back to the base knob name, added only once
                knob = knob.split('#')[0]
                if knob not in final_ordering:
                    final_ordering.append(knob)
            else:
                final_ordering.append(knob)
        final_ordering = np.append(final_ordering, removed_columns)

    with open(os.path.join(savedir, "featured_knobs.txt"), "w") as f:
        f.write("\n".join(final_ordering))
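
# A minimal sketch of the knob-ranking idea behind run_lasso(), using
# sklearn.linear_model.lasso_path directly instead of the get_coef_range()/Lasso
# helpers above. Knobs whose coefficients become non-zero at larger alpha values
# (i.e., earlier along the regularization path) are ranked as more important.
# The function name and its parameters are illustrative assumptions, not part of
# this module's API.
def rank_knobs_sketch(knob_data, metric_data, knob_names):
    """Sketch only: knob_data is (n_experiments x n_knobs), metric_data is a
    single standardized target of shape (n_experiments,), and knob_names
    labels the columns of knob_data."""
    import numpy as np
    from sklearn.linear_model import lasso_path

    # alphas come back in decreasing order; coefs has shape (n_knobs, n_alphas).
    alphas, coefs, _ = lasso_path(knob_data, metric_data)

    # For each knob, find the first (largest) alpha at which its coefficient
    # becomes non-zero; knobs that enter the path sooner rank higher.
    entry_step = np.full(len(knob_names), len(alphas), dtype=int)
    for j in range(len(knob_names)):
        nonzero = np.nonzero(coefs[j] != 0.0)[0]
        if nonzero.size > 0:
            entry_step[j] = nonzero[0]
    order = np.argsort(entry_step)
    return [knob_names[j] for j in order]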