def get_unique_matrix(X, y):
    # Collapse duplicate rows of X; the corresponding y rows are aggregated
    # with the column-wise median.
    X_unique, unique_indexes = X.unique_rows(return_index=True)
    assert np.array_equal(X_unique.columnlabels, X.columnlabels)
    y_unique = Matrix(y.data[unique_indexes],
                      y.rowlabels[unique_indexes],
                      y.columnlabels)

    # Relabel each row with a (column label, value) tuple so duplicates can be
    # matched back to their primary row.
    rowlabels = np.empty_like(X_unique.rowlabels, dtype=object)
    exp_set = set()
    for i, row in enumerate(X_unique.data):
        exp_label = tuple((l, r) for l, r in zip(X_unique.columnlabels, row))
        assert exp_label not in exp_set
        rowlabels[i] = exp_label
        exp_set.add(exp_label)
    y_unique.rowlabels = rowlabels
    X_unique.rowlabels = rowlabels

    if X_unique.data.shape != X.data.shape:
        print "\n\nDIFF(num_knobs={}): X_unique: {}, X: {}\n\n".format(
            X_unique.columnlabels.shape[0], X_unique.data.shape, X.data.shape)
        dup_map = {}
        dup_indexes = np.array([d for d in range(X.data.shape[0])
                                if d not in unique_indexes])
        for dup_idx in dup_indexes:
            dup_label = tuple((u'' + l, r) for l, r in
                              zip(X_unique.columnlabels, X.data[dup_idx]))
            primary_idx = [idx for idx, rl in enumerate(rowlabels)
                           if rl == dup_label]
            assert len(primary_idx) == 1
            primary_idx = primary_idx[0]
            if primary_idx not in dup_map:
                dup_map[primary_idx] = [y_unique.data[primary_idx]]
            dup_map[primary_idx].append(y.data[dup_idx])
        for idx, yvals in dup_map.iteritems():
            y_unique.data[idx] = np.median(np.vstack(yvals), axis=0)

    return X_unique, y_unique

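# Hypothetical usage sketch (not part of the original code): it assumes the
# OtterTune-style Matrix(data, rowlabels, columnlabels) constructor used by the
# other snippets in this section, along with an X.unique_rows() implementation;
# the knob/metric names below are made up for illustration.
#
#   X = Matrix(np.array([[1, 2], [1, 2], [3, 4]]),
#              np.arange(3), np.array([u'knob_a', u'knob_b']))
#   y = Matrix(np.array([[10.0], [20.0], [30.0]]),
#              np.arange(3), np.array([u'latency']))
#   X_unique, y_unique = get_unique_matrix(X, y)
#   # Rows 0 and 1 of X are identical, so their y values are merged into
#   # their median: y_unique.data -> [[15.0], [30.0]]
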
class SolutionStep(object):
    ROW = 0
    COL = 1

    def __init__(self, nonogram):
        self._uid = 1000
        self._idToIndex = {}
        self._nonogram = nonogram
        self._idToIndex[-1] = -1
        self._idToIndex[0] = 0
        self.row = [[self.__getIndex(x) for x in r] for r in nonogram.rows()]
        self.column = [[self.__getIndex(x) for x in c] for c in nonogram.columns()]
        y_size, x_size = self.shape()
        self._sol = Solution(shape=(x_size, y_size))
        self.matrix = Matrix(x=x_size, y=y_size, default=SolutionCell())

    def __getIndex(self, value):
        self._uid = self._uid + 1
        index = LayoutIndex(value)
        self._idToIndex[index.id()] = index
        return index

    def index(self, uid):
        return self._idToIndex[int(uid)]

    def shape(self):
        return (len(self.column), len(self.row))

    def solution(self):
        return self._sol

    def row_layout(self, i):
        return self.row[i]

    def column_layout(self, i):
        return self.column[i]

    def row_lineup(self, i):
        return [x.v[self.ROW] for x in self.matrix.row(i)]

    def col_lineup(self, i):
        return [x.v[self.COL] for x in self.matrix.column(i)]

    def set_row(self, i, j, item):
        self.matrix.row(i)[j][self.ROW] = item
        self._sol.item(i, j).v = item.color()

    def set_col(self, i, j, item):
        # Mirrors set_row: write into the j-th cell of column i.
        self.matrix.column(i)[j][self.COL] = item
        self._sol.item(j, i).v = item.color()

def test_ctor(self):
    m = Matrix.from_shape(3, 5)
    for i in xrange(5):
        self.assertEqual(m.row(i), 3 * [None])
    for i in xrange(3):
        self.assertEqual(m.column(i), 5 * [None])

def test_create_from_matrix(self):
    m = Matrix.from_matrix([
        [1, 2],
        [3, 4],
        [5, 6],
    ])
    self.assertEqual(m.row(0), [1, 2])
    self.assertEqual(m.row(1), [3, 4])
    self.assertEqual(m.row(2), [5, 6])
    self.assertEqual(m.column(0), [1, 3, 5])
    self.assertEqual(m.column(1), [2, 4, 6])

def __init__(self, matrix=None):
    self.matrix = Matrix(matrix=matrix)
    y_size = int(math.sqrt(self.matrix.shape[0]))
    x_size = int(math.sqrt(self.matrix.shape[1]))
    self._box_shape = (x_size, y_size)
    self._values = set([i + 1 for i in xrange(x_size * y_size)])

def _flip_horizontally(matrix):
    m = Matrix(matrix.num_rows(), matrix.num_columns())
    for i in xrange(m.num_rows()):
        for j in xrange(m.num_columns()):
            m[i][j] = matrix[m.num_rows() - i - 1][j]
    return m

def to_matrix(matrix_as_lists):
    result = Matrix(len(matrix_as_lists), len(matrix_as_lists[0]))
    for i in xrange(result.num_rows()):
        for j in xrange(result.num_columns()):
            result[i][j] = matrix_as_lists[i][j]
    return result

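# Hypothetical usage sketch (not part of the original module); it relies only
# on the to_matrix() and _flip_horizontally() helpers defined above and assumes
# they live in the same module.
if __name__ == "__main__":
    original = to_matrix([[1, 2],
                          [3, 4],
                          [5, 6]])
    flipped = _flip_horizontally(original)
    # Row order is reversed, so the last row of `original` becomes row 0.
    assert flipped[0][1] == 6
    assert flipped[2][0] == 1
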
def __init__(self, matrix=[], shape=None):
    if shape:
        x, y = shape
        self.matrix = Matrix.from_shape(x=x, y=y, default=-1)
    else:
        self.matrix = Matrix.from_matrix(matrix)

def test_change_value_in_column(self):
    m = Matrix.from_shape(3, 5)
    value = "new_value"
    m.column(1)[2].value = value
    self.assertEqual(m.row(2)[1].value, value)

def run_factor_analysis(paths, savedir, cluster_range, algorithms):
    import gc

    # Load matrices
    assert len(paths) > 0
    matrices = []
    with stopwatch("matrix concatenation"):
        for path in paths:
            matrices.append(
                Matrix.load_matrix(os.path.join(path, "y_data_enc.npz")))

    # Combine matrix data if more than 1 matrix
    if len(matrices) > 1:
        matrix = Matrix.vstack(matrices, require_equal_columnlabels=True)
    else:
        matrix = matrices[0]
    del matrices
    gc.collect()

    with stopwatch("preprocessing"):
        # Filter out columns with near zero standard deviation
        # i.e., constant columns
        column_mask = ~stdev_zero(matrix.data, axis=0)
        filtered_columns = matrix.columnlabels[column_mask]
        matrix = matrix.filter(filtered_columns, 'columns')
        print "matrix shape after filter constant: ", matrix.data.shape

        # Scale the data
        standardizer = StandardScaler()
        matrix.data = standardizer.fit_transform(matrix.data)

        # Shuffle the data rows (experiments x metrics)
        exp_shuffle_indices = get_shuffle_indices(matrix.data.shape[0])
        matrix.data = matrix.data[exp_shuffle_indices]

        # Shrink the cluster range if # metrics < max # clusters
        max_clusters = matrix.data.shape[1] + 1
        if max_clusters < cluster_range[1]:
            cluster_range = (cluster_range[0], max_clusters)

    with stopwatch("factor analysis"):
        # Fit the model to calculate the components
        fa = FactorAnalysis()
        fa.fit(matrix.data)

    fa_mask = np.sum(fa.components_ != 0.0, axis=1) > 0.0
    variances = np.sum(np.abs(fa.components_[fa_mask]), axis=1)
    total_variance = np.sum(variances).squeeze()
    print "total variance: {}".format(total_variance)
    var_exp = np.array([np.sum(variances[:i + 1]) / total_variance * 100
                        for i in range(variances.shape[0])])
    factor_cutoff = np.count_nonzero(var_exp < REQUIRED_VARIANCE_EXPLAINED) + 1
    factor_cutoff = min(factor_cutoff, 10)
    print "factor cutoff: {}".format(factor_cutoff)
    for i, var in enumerate(variances):
        print i, var, np.sum(
            variances[:i + 1]), np.sum(variances[:i + 1]) / total_variance

    components = np.transpose(fa.components_[:factor_cutoff]).copy()
    print "components shape: {}".format(components.shape)
    standardizer = StandardScaler()
    components = standardizer.fit_transform(components)

    # Shuffle factor analysis matrix rows (metrics x factors)
    metric_shuffle_indices = get_shuffle_indices(components.shape[0])
    components = components[metric_shuffle_indices]
    component_columnlabels = matrix.columnlabels[metric_shuffle_indices].copy()

    kmeans = KMeans_(components, cluster_range)
    kmeans.plot_results(savedir, components, component_columnlabels)

    # Compute optimal number of clusters K
    for algorithm in algorithms:
        with stopwatch("compute {} (factors={})".format(
                algorithm, factor_cutoff)):
            kselection = KSelection.new(components, cluster_range,
                                        kmeans.cluster_map_, algorithm)
            print "{} optimal # of clusters: {}".format(
                algorithm, kselection.optimal_num_clusters_)
            kselection.plot_results(savedir)

    metric_clusters = {}
    featured_metrics = {}
    for n_clusters, (cluster_centers, labels, _) in kmeans.cluster_map_.iteritems():
        # For each cluster, calculate the distances of each metric from the
        # cluster center. We use the metric closest to the cluster center.
        mclusters = []
        mfeat_list = []
        for i in range(n_clusters):
            metric_labels = component_columnlabels[labels == i]
            component_rows = components[labels == i]
            centroid = np.expand_dims(cluster_centers[i], axis=0)
            dists = np.empty(component_rows.shape[0])
            for j, row in enumerate(component_rows):
                row = np.expand_dims(row, axis=0)
                dists[j] = cdist(row, centroid, 'euclidean').squeeze()
            order_by = np.argsort(dists)
            metric_labels = metric_labels[order_by]
            dists = dists[order_by]
            mclusters.append((i, metric_labels, dists))

            assert len(OPT_METRICS) > 0
            label_mask = np.zeros(metric_labels.shape[0])
            for opt_metric in OPT_METRICS:
                label_mask = np.logical_or(label_mask,
                                           metric_labels == opt_metric)
            if np.count_nonzero(label_mask) > 0:
                mfeat_list.extend(metric_labels[label_mask].tolist())
            elif len(metric_labels) > 0:
                mfeat_list.append(metric_labels[0])

        metric_clusters[n_clusters] = mclusters
        featured_metrics[n_clusters] = mfeat_list

    for n_clusters, mlist in sorted(featured_metrics.iteritems()):
        savepath = os.path.join(savedir,
                                "featured_metrics_{}.txt".format(n_clusters))
        with open(savepath, "w") as f:
            f.write("\n".join(sorted(mlist)))

    for n_clusters, memberships in sorted(metric_clusters.iteritems()):
        cstr = ""
        for i, (cnum, lab, dist) in enumerate(memberships):
            assert i == cnum
            cstr += "---------------------------------------------\n"
            cstr += "CLUSTERS {}\n".format(i)
            cstr += "---------------------------------------------\n\n"
            for l, d in zip(lab, dist):
                cstr += "{}\t({})\n".format(l, d)
            cstr += "\n\n"
        savepath = os.path.join(savedir,
                                "membership_{}.txt".format(n_clusters))
        with open(savepath, "w") as f:
            f.write(cstr)

class Solution(SudokuDescr):

    def __init__(self, descr=None):
        SudokuDescr.__init__(self, matrix=descr.matrix)
        self.steps = []  # ((x, y), value)
        x, y = self.matrix.shape
        self.probability = Matrix(
            x=x, y=y, default_func=lambda ix, iy: Probability(self.values()))
        self.pending = []  # (x, y), value
        self.random = []
        indexes = set()
        for init in self.matrix:
            if init.v != 0:
                indexes.add(init.index())
                self.add_step(init.index(), init.v)
        new_pending = []
        for i in self.pending:
            if not i[0] in indexes:
                new_pending.append(i)
        self.pending = new_pending

    def probability_line(self, index):
        return [
            SolverMethod.belonged_box(index, self.probability, self.box_shape()),
            self.probability.row(index[1]),
            self.probability.column(index[0])
        ]

    def add_step(self, index, value):
        lines = self.probability_line(index)
        for l in lines:
            for i in l:
                if i.index() != index and i.v.erase(value, len(self.steps)):
                    self.add_pending(i.index(), i.v.value())
        self.probability.item_i(index).v.set(value, len(self.steps))
        self.steps.append((index, value))
        self.matrix.item_i(index).v = value

    def add_pending(self, index, value):
        self.pending.append((index, value))

    def add_random_choice(self, index, values):
        vs = copy(values)
        v = vs.pop()
        self.random.append((index, vs, len(self.steps)))
        self.add_step(index, v)

    def rollback_random(self):
        # revert to 0 in matrix
        r = self.random.pop()
        while not r[1]:
            r = self.random.pop()
        rev = r[2]
        while rev != len(self.steps):
            s = self.steps.pop()
            self.matrix.item_i(s[0]).v = 0
        for item in self.probability:
            item.v.rollback(rev)
        self.pending = []
        self.add_random_choice(r[0], r[1])

    def process_pending(self):
        while self.pending:
            item = self.pending.pop()
            self.add_step(item[0], item[1])

    def is_done(self):
        x, y = self.matrix.shape
        return x * y == len(self.steps)

def map_workload(self, X_client, y_client):
    # tuner = TunerContext()
    with stopwatch("workload mapping - preprocessing"):
        # # Recompute the GPR models if the # of knobs to tune has
        # # changed (incremental knob selection feature is enabled)
        # tuner_feat_knobs = tuner.featured_knobs
        # if not np.array_equal(tuner_feat_knobs, self.featured_knobs_):
        #     print ("# knobs: {} --> {}. Re-creating models"
        #            .format(tuner_feat_knobs.size,
        #                    self.featured_knobs_.size))
        #     assert tuner_feat_knobs.size != self.featured_knobs_.size
        #     assert tuner.incremental_knob_selection == True
        #     self.featured_knobs_ = tuner_feat_knobs
        #     self.initialize_models()
        #     gc.collect()

        # Filter by featured knobs & metrics
        X_client = X_client.filter(self.featured_knobs_, "columns")
        y_client = y_client.filter(self.featured_metrics_, "columns")

        # Generate unique X,y matrices
        X_client, y_client = get_unique_matrix(X_client, y_client)

        # Preprocessing steps
        if self.dummy_encoder_ is not None:
            X_client = Matrix(self.dummy_encoder_.transform(X_client.data),
                              X_client.rowlabels,
                              self.dummy_encoder_.columnlabels)
        X_client.data = self.X_scaler_.transform(X_client.data)

        # Create y_client scaler with prior and transform client data
        y_client_scaler = copy.deepcopy(self.y_scaler_)
        y_client_scaler.n_samples_seen_ = 1
        y_client_scaler.partial_fit(y_client.data)
        y_client.data = y_client_scaler.transform(y_client.data)

        # Bin and recenter client data
        y_client.data = self.y_binner_.transform(y_client.data)
        y_client.data = self.y_gp_scaler_.transform(y_client.data)

    # Compute workload scores in parallel
    njobs = len(self.workload_states_)
    iterable = [(i, wd, ws, X_client, y_client, njobs, self.verbose_)
                for i, (wd, ws) in enumerate(self.workload_states_.iteritems())]
    with stopwatch("workload mapping - predictions"):
        if self.pool_ is not None:
            wkld_scores = self.pool_.map(worker_score_workload, iterable)
        else:
            wkld_scores = []
            for item in iterable:
                wkld_scores.append(worker_score_workload(item))

    sorted_wkld_scores = sorted(wkld_scores, key=operator.itemgetter(1))
    print ""
    print "WORKLOAD SCORES"
    for wkld, score in sorted_wkld_scores:
        print "{0}: {1:.2f}".format(os.path.basename(wkld), score)
    return sorted_wkld_scores[0][0]

def initialize_models(self):
    if self.verbose_:
        print("Initializing models for # knobs={}\n".format(
            self.featured_knobs_.size))
    with stopwatch("workload mapping model creation"):
        n_values, cat_indices, params = prep.dummy_encoder_helper(
            self.dbms_name, self.featured_knobs_)
        if n_values.size > 0:
            self.dummy_encoder_ = prep.DummyEncoder(n_values, cat_indices)
        else:
            self.dummy_encoder_ = None
        self.X_scaler_ = StandardScaler()
        self.y_scaler_ = StandardScaler()
        data_map = {}
        for i, wd in enumerate(self.workload_dirs_):
            # Load and filter data
            Xpath = os.path.join(wd, "X_data_enc.npz")
            ypath = os.path.join(wd, "y_data_enc.npz")
            X = Matrix.load_matrix(Xpath)
            y = Matrix.load_matrix(ypath)
            X = X.filter(self.featured_knobs_, "columns")
            y = y.filter(self.featured_metrics_, "columns")
            assert np.array_equal(X.columnlabels, self.featured_knobs_)
            assert np.array_equal(y.columnlabels, self.featured_metrics_)
            assert np.array_equal(X.rowlabels, y.rowlabels)

            num_samples = X.shape[0]
            if num_samples > self.MAX_SAMPLES:
                print "Shrinking {} samples to {}".format(
                    num_samples, self.MAX_SAMPLES)
                rand_indices = prep.get_shuffle_indices(
                    num_samples)[:self.MAX_SAMPLES]
                X = Matrix(X.data[rand_indices],
                           X.rowlabels[rand_indices],
                           X.columnlabels)
                y = Matrix(y.data[rand_indices],
                           y.rowlabels[rand_indices],
                           y.columnlabels)
                num_samples = X.shape[0]
            assert num_samples <= self.MAX_SAMPLES
            assert num_samples == y.shape[0]

            # Dummy-code categorical knobs
            if self.dummy_encoder_ is not None:
                if i == 0:
                    # Just need to fit this once
                    self.dummy_encoder_.fit(X.data, columnlabels=X.columnlabels)
                X = Matrix(self.dummy_encoder_.transform(X.data),
                           X.rowlabels,
                           self.dummy_encoder_.columnlabels)
            self.X_scaler_.partial_fit(X.data)
            self.y_scaler_.partial_fit(y.data)
            data_map[wd] = (X, y)

        if self.dummy_encoder_ is not None:
            # Fix X_scaler wrt categorical features
            prep.fix_scaler(self.X_scaler_, self.dummy_encoder_, params)

        # Scale X/y
        all_ys = []
        for wd, (X, y) in data_map.iteritems():
            X.data = self.X_scaler_.transform(X.data)
            y.data = self.y_scaler_.transform(y.data)
            all_ys.append(y.data)

        # Concat all ys and compute deciles
        all_ys = np.vstack(all_ys)
        self.y_binner_ = prep.Bin(0, axis=0)
        self.y_binner_.fit(all_ys)
        del all_ys

        # Bin y by deciles and fit scaler
        self.y_gp_scaler_ = StandardScaler()
        for wd, (X, y) in data_map.iteritems():
            y.data = self.y_binner_.transform(y.data)
            self.y_gp_scaler_.partial_fit(y.data)

        # Recenter y-values
        for wd, (X, y) in data_map.iteritems():
            y.data = self.y_gp_scaler_.transform(y.data)

        njobs = len(data_map)
        iterable = [(i, wd, ws, njobs, self.verbose_)
                    for i, (wd, ws) in enumerate(data_map.iteritems())]
        if self.pool_ is not None:
            res = self.pool_.map(worker_create_model, iterable)
        else:
            res = []
            for item in iterable:
                res.append(worker_create_model(item))
        self.workload_states_ = dict(res)

def run_lasso(dbms, basepaths, savedir, featured_metrics, knobs_to_ignore,
              include_polynomial_features=True):
    import gc

    # Load matrices
    assert len(basepaths) > 0
    Xs = []
    ys = []
    with stopwatch("matrix concatenation"):
        for basepath in basepaths:
            X_path = os.path.join(basepath, "X_data_enc.npz")
            y_path = os.path.join(basepath, "y_data_enc.npz")
            Xs.append(Matrix.load_matrix(X_path))
            ys.append(
                Matrix.load_matrix(y_path).filter(featured_metrics, "columns"))

    # Combine matrix data if more than 1 matrix
    if len(Xs) > 1:
        X = Matrix.vstack(Xs, require_equal_columnlabels=True)
        y = Matrix.vstack(ys, require_equal_columnlabels=True)
    else:
        X = Xs[0]
        y = ys[0]
    del Xs
    del ys
    gc.collect()

    with stopwatch("preprocessing"):
        # Filter out columns with near zero standard
        # deviation (i.e., constant columns)
        if y.shape[1] > 1:
            column_mask = ~stdev_zero(y.data, axis=0)
            filtered_columns = y.columnlabels[column_mask]
            y = y.filter(filtered_columns, 'columns')
        column_mask = ~stdev_zero(X.data, axis=0)
        removed_columns = X.columnlabels[~column_mask]
        print "removed columns = {}".format(removed_columns)
        filtered_columns = set(X.columnlabels[column_mask])
        filtered_columns -= set(knobs_to_ignore)
        filtered_columns = np.array(sorted(filtered_columns))
        X = X.filter(filtered_columns, 'columns')
        print "\ncolumnlabels:", X.columnlabels

        # Dummy-code categorical features
        n_values, cat_feat_indices, _ = dummy_encoder_helper(
            dbms, X.columnlabels)
        if len(cat_feat_indices) > 0:
            encoder = DummyEncoder(n_values, cat_feat_indices)
            encoder.fit(X.data, columnlabels=X.columnlabels)
            X = Matrix(encoder.transform(X.data),
                       X.rowlabels,
                       encoder.columnlabels)

        # Scale the data
        X_standardizer = StandardScaler()
        X.data = X_standardizer.fit_transform(X.data)
        y_standardizer = StandardScaler()
        y.data = y_standardizer.fit_transform(y.data)
        if include_polynomial_features:
            X_poly = PolynomialFeatures()
            X_data = X_poly.fit_transform(X.data)
            X_columnlabels = np.expand_dims(
                np.array(X.columnlabels, dtype=str), axis=0)
            X_columnlabels = X_poly.fit_transform(X_columnlabels).squeeze()
            X = Matrix(X_data, X.rowlabels, X_columnlabels)

        # Shuffle the data rows (experiments x metrics)
        shuffler = Shuffler(shuffle_rows=True, shuffle_columns=False)
        X = shuffler.fit_transform(X, copy=False)
        y = shuffler.transform(y, copy=False)
        assert np.array_equal(X.rowlabels, y.rowlabels)
        gc.collect()

    print "\nfeatured_metrics:", featured_metrics

    with stopwatch("lasso paths"):
        # Fit the model to calculate the components
        alphas, coefs, _ = get_coef_range(X.data, y.data)

    # Save model
    np.savez(os.path.join(savedir, "lasso_path.npz"),
             alphas=alphas,
             coefs=coefs,
             feats=X.columnlabels,
             metrics=y.columnlabels)

    with stopwatch("lasso processing"):
        nfeats = X.columnlabels.shape[0]
        lasso = Lasso(alphas, X.columnlabels, coefs)
        print lasso.get_top_summary(nfeats, "")
        top_knobs = get_features_list(lasso.get_top_features(n=nfeats))
        print "\nfeat list length: {}".format(len(top_knobs))
        print "nfeats = {}".format(nfeats)
        top_knobs = lasso.get_top_features(nfeats)
        print top_knobs

        final_ordering = []
        for knob in top_knobs:
            if '#' in knob:
                knob = knob.split('#')[0]
                if knob not in final_ordering:
                    final_ordering.append(knob)
            else:
                final_ordering.append(knob)
        final_ordering = np.append(final_ordering, removed_columns)

    with open(os.path.join(savedir, "featured_knobs.txt"), "w") as f:
        f.write("\n".join(final_ordering))