def initialize_models(self):
    """Build the per-workload mapping models from the repository data.

    For every workload directory this loads the encoded X (knob) and y
    (metric) matrices, filters them down to the featured knobs/metrics,
    optionally dummy-encodes categorical knobs, fits shared scalers and a
    decile binner across ALL workloads, and finally fits one model per
    workload (in parallel when a worker pool is available).

    Side effects: sets self.dummy_encoder_, self.X_scaler_, self.y_scaler_,
    self.y_binner_, self.y_gp_scaler_, and self.workload_states_.

    NOTE(review): assumes each directory in self.workload_dirs_ contains
    "X_data_enc.npz" / "y_data_enc.npz" loadable by Matrix.load_matrix —
    confirm against the data-generation pipeline.
    """
    if self.verbose_:
        print("Initializing models for # knobs={}\n".format(
            self.featured_knobs_.size))
    with stopwatch("workload mapping model creation"):
        # Determine which featured knobs are categorical; params is later
        # used to correct the X scaler for the dummy-coded columns.
        n_values, cat_indices, params = prep.dummy_encoder_helper(
            self.dbms_name, self.featured_knobs_)
        if n_values.size > 0:
            self.dummy_encoder_ = prep.DummyEncoder(n_values, cat_indices)
        else:
            # No categorical knobs: skip dummy-coding entirely.
            self.dummy_encoder_ = None
        self.X_scaler_ = StandardScaler()
        self.y_scaler_ = StandardScaler()
        data_map = {}
        for i, wd in enumerate(self.workload_dirs_):
            # Load and filter data
            Xpath = os.path.join(wd, "X_data_enc.npz")
            ypath = os.path.join(wd, "y_data_enc.npz")
            X = Matrix.load_matrix(Xpath)
            y = Matrix.load_matrix(ypath)
            X = X.filter(self.featured_knobs_, "columns")
            y = y.filter(self.featured_metrics_, "columns")
            # Filtering must yield exactly the featured columns, with X and
            # y rows aligned one-to-one.
            assert np.array_equal(X.columnlabels, self.featured_knobs_)
            assert np.array_equal(y.columnlabels, self.featured_metrics_)
            assert np.array_equal(X.rowlabels, y.rowlabels)
            num_samples = X.shape[0]
            if num_samples > self.MAX_SAMPLES:
                # Randomly subsample oversized workloads down to the cap,
                # keeping X and y rows paired via the same indices.
                print "Shrinking {} samples to {}".format(
                    num_samples, self.MAX_SAMPLES)
                rand_indices = prep.get_shuffle_indices(
                    num_samples)[:self.MAX_SAMPLES]
                X = Matrix(X.data[rand_indices],
                           X.rowlabels[rand_indices],
                           X.columnlabels)
                y = Matrix(y.data[rand_indices],
                           y.rowlabels[rand_indices],
                           y.columnlabels)
                num_samples = X.shape[0]
            assert num_samples <= self.MAX_SAMPLES
            assert num_samples == y.shape[0]

            # Dummy-code categorical knobs
            if self.dummy_encoder_ is not None:
                if i == 0:
                    # Just need to fit this once
                    self.dummy_encoder_.fit(X.data,
                                            columnlabels=X.columnlabels)
                X = Matrix(self.dummy_encoder_.transform(X.data),
                           X.rowlabels,
                           self.dummy_encoder_.columnlabels)

            # Accumulate scaling statistics incrementally across workloads.
            self.X_scaler_.partial_fit(X.data)
            self.y_scaler_.partial_fit(y.data)
            data_map[wd] = (X, y)

        if self.dummy_encoder_ is not None:
            # Fix X_scaler wrt categorical features
            prep.fix_scaler(self.X_scaler_, self.dummy_encoder_, params)

        # Scale X/y (mutates each Matrix's data in place)
        all_ys = []
        for wd, (X, y) in data_map.iteritems():
            X.data = self.X_scaler_.transform(X.data)
            y.data = self.y_scaler_.transform(y.data)
            all_ys.append(y.data)

        # Concat all ys and compute deciles
        all_ys = np.vstack(all_ys)
        self.y_binner_ = prep.Bin(0, axis=0)
        self.y_binner_.fit(all_ys)
        del all_ys  # free the stacked copy before model fitting

        # Bin y by deciles and fit scaler
        self.y_gp_scaler_ = StandardScaler()
        for wd, (X, y) in data_map.iteritems():
            y.data = self.y_binner_.transform(y.data)
            self.y_gp_scaler_.partial_fit(y.data)

        # Recenter y-values
        for wd, (X, y) in data_map.iteritems():
            y.data = self.y_gp_scaler_.transform(y.data)

        # Fit one model per workload, in parallel if a pool is available;
        # worker_create_model returns (workload_dir, state) pairs.
        njobs = len(data_map)
        iterable = [(i, wd, ws, njobs, self.verbose_) for i, (wd, ws)
                    in enumerate(data_map.iteritems())]
        if self.pool_ is not None:
            res = self.pool_.map(worker_create_model, iterable)
        else:
            res = []
            for item in iterable:
                res.append(worker_create_model(item))
        self.workload_states_ = dict(res)
def map_workload(self, X_client, y_client):
    """Map the client's observed workload to the most similar known one.

    Preprocesses the client data with the SAME transformers fitted in
    initialize_models (dummy encoder, X scaler, binner, GP scaler), scores
    it against every stored workload state, and returns the directory of
    the best-scoring workload.

    Returns:
        The workload identifier (first tuple element) with the lowest
        score, since scores are sorted ascending by the score column.

    NOTE(review): assumes lower score == better match — confirm against
    worker_score_workload's scoring convention.
    """
    # tuner = TunerContext()
    with stopwatch("workload mapping - preprocessing"):
        # # Recompute the GPR models if the # of knobs to tune has
        # # changed (incremental knob selection feature is enabled)
        # tuner_feat_knobs = tuner.featured_knobs
        # if not np.array_equal(tuner_feat_knobs, self.featured_knobs_):
        #     print ("# knobs: {} --> {}. Re-creating models"
        #            .format(tuner_feat_knobs.size,
        #                    self.featured_knobs_.size))
        #     assert tuner_feat_knobs.size != self.featured_knobs_.size
        #     assert tuner.incremental_knob_selection == True
        #     self.featured_knobs_ = tuner_feat_knobs
        #     self.initialize_models()
        #     gc.collect()

        # Filter by featured knobs & metrics
        X_client = X_client.filter(self.featured_knobs_, "columns")
        y_client = y_client.filter(self.featured_metrics_, "columns")
        # Generate unique X,y matrices
        X_client, y_client = get_unique_matrix(X_client, y_client)
        # Preprocessing steps: mirror the training-side pipeline so the
        # client data lives in the same feature space as the models.
        if self.dummy_encoder_ is not None:
            X_client = Matrix(self.dummy_encoder_.transform(X_client.data),
                              X_client.rowlabels,
                              self.dummy_encoder_.columnlabels)
        X_client.data = self.X_scaler_.transform(X_client.data)
        # Create y_client scaler with prior and transform client data.
        # Deep-copying the fitted y scaler and resetting n_samples_seen_
        # lets the few client samples update the repository statistics
        # instead of being drowned out by them.
        y_client_scaler = copy.deepcopy(self.y_scaler_)
        y_client_scaler.n_samples_seen_ = 1
        y_client_scaler.partial_fit(y_client.data)
        y_client.data = y_client_scaler.transform(y_client.data)
        # Bin and recenter client data
        y_client.data = self.y_binner_.transform(y_client.data)
        y_client.data = self.y_gp_scaler_.transform(y_client.data)

        # Compute workload scores in parallel
        njobs = len(self.workload_states_)
        iterable = [(i, wd, ws, X_client, y_client, njobs, self.verbose_)
                    for i, (wd, ws)
                    in enumerate(self.workload_states_.iteritems())]
    with stopwatch("workload mapping - predictions"):
        if self.pool_ is not None:
            wkld_scores = self.pool_.map(worker_score_workload, iterable)
        else:
            wkld_scores = []
            for item in iterable:
                wkld_scores.append(worker_score_workload(item))
    # Sort ascending by score (tuple index 1) and report all candidates.
    sorted_wkld_scores = sorted(wkld_scores, key=operator.itemgetter(1))
    print ""
    print "WORKLOAD SCORES"
    for wkld, score in sorted_wkld_scores:
        print "{0}: {1:.2f}".format(os.path.basename(wkld), score)
    return sorted_wkld_scores[0][0]
def run_lasso(dbms, basepaths, savedir, featured_metrics, knobs_to_ignore,
              include_polynomial_features=True):
    """Rank knobs by importance via lasso paths and save the ordering.

    Loads and stacks the encoded X/y matrices from every path in
    `basepaths`, drops constant columns and ignored knobs, dummy-codes
    categorical features, standardizes, optionally adds polynomial
    interaction features, computes the lasso regularization path, and
    writes both the raw path ("lasso_path.npz") and the final ranked knob
    list ("featured_knobs.txt") into `savedir`.

    Args:
        dbms: DBMS identifier passed to dummy_encoder_helper.
        basepaths: non-empty list of directories holding
            "X_data_enc.npz" / "y_data_enc.npz".
        savedir: output directory for the saved artifacts.
        featured_metrics: metric column labels to keep in y.
        knobs_to_ignore: knob column labels excluded from ranking.
        include_polynomial_features: if True, expand X with
            PolynomialFeatures (degree-2 interactions) before the lasso.
    """
    import gc
    # Load matrices
    assert len(basepaths) > 0
    Xs = []
    ys = []
    with stopwatch("matrix concatenation"):
        for basepath in basepaths:
            X_path = os.path.join(basepath, "X_data_enc.npz")
            y_path = os.path.join(basepath, "y_data_enc.npz")
            Xs.append(Matrix.load_matrix(X_path))
            ys.append(
                Matrix.load_matrix(y_path).filter(featured_metrics,
                                                  "columns"))
        # Combine matrix data if more than 1 matrix
        if len(Xs) > 1:
            X = Matrix.vstack(Xs, require_equal_columnlabels=True)
            y = Matrix.vstack(ys, require_equal_columnlabels=True)
        else:
            X = Xs[0]
            y = ys[0]
        # Drop the per-path matrices; only the stacked copies are needed.
        del Xs
        del ys
        gc.collect()
    with stopwatch("preprocessing"):
        # Filter out columns with near zero standard
        # deviation (i.e., constant columns)
        if y.shape[1] > 1:
            column_mask = ~stdev_zero(y.data, axis=0)
            filtered_columns = y.columnlabels[column_mask]
            y = y.filter(filtered_columns, 'columns')
        column_mask = ~stdev_zero(X.data, axis=0)
        # Constant knobs are removed here but re-appended (unranked) to
        # the final ordering at the end of this function.
        removed_columns = X.columnlabels[~column_mask]
        print "removed columns = {}".format(removed_columns)
        filtered_columns = set(X.columnlabels[column_mask])
        filtered_columns -= set(knobs_to_ignore)
        filtered_columns = np.array(sorted(filtered_columns))
        X = X.filter(filtered_columns, 'columns')
        print "\ncolumnlabels:", X.columnlabels
        # Dummy-code categorical features
        n_values, cat_feat_indices, _ = dummy_encoder_helper(
            dbms, X.columnlabels)
        if len(cat_feat_indices) > 0:
            encoder = DummyEncoder(n_values, cat_feat_indices)
            encoder.fit(X.data, columnlabels=X.columnlabels)
            X = Matrix(encoder.transform(X.data),
                       X.rowlabels,
                       encoder.columnlabels)
        # Scale the data
        X_standardizer = StandardScaler()
        X.data = X_standardizer.fit_transform(X.data)
        y_standardizer = StandardScaler()
        y.data = y_standardizer.fit_transform(y.data)
        if include_polynomial_features:
            X_poly = PolynomialFeatures()
            X_data = X_poly.fit_transform(X.data)
            # Run the same transform over the string labels so each
            # generated feature column gets a matching composite label.
            X_columnlabels = np.expand_dims(np.array(X.columnlabels,
                                                     dtype=str), axis=0)
            X_columnlabels = X_poly.fit_transform(X_columnlabels).squeeze()
            X = Matrix(X_data, X.rowlabels, X_columnlabels)
        # Shuffle the data rows (experiments x metrics)
        shuffler = Shuffler(shuffle_rows=True, shuffle_columns=False)
        X = shuffler.fit_transform(X, copy=False)
        y = shuffler.transform(y, copy=False)
        assert np.array_equal(X.rowlabels, y.rowlabels)
        gc.collect()
    print "\nfeatured_metrics:", featured_metrics
    with stopwatch("lasso paths"):
        # Fit the model to calculate the components
        alphas, coefs, _ = get_coef_range(X.data, y.data)
        # Save model
        np.savez(os.path.join(savedir, "lasso_path.npz"),
                 alphas=alphas,
                 coefs=coefs,
                 feats=X.columnlabels,
                 metrics=y.columnlabels)
    with stopwatch("lasso processing"):
        nfeats = X.columnlabels.shape[0]
        lasso = Lasso(alphas, X.columnlabels, coefs)
        print lasso.get_top_summary(nfeats, "")
        top_knobs = get_features_list(lasso.get_top_features(n=nfeats))
        print "\nfeat list length: {}".format(len(top_knobs))
        print "nfeats = {}".format(nfeats)
        top_knobs = lasso.get_top_features(nfeats)
        print top_knobs
        # Collapse composite feature names ("knobA#knobB") back to their
        # first base knob, de-duplicating while preserving rank order.
        final_ordering = []
        for knob in top_knobs:
            if '#' in knob:
                knob = knob.split('#')[0]
                if knob not in final_ordering:
                    final_ordering.append(knob)
            else:
                final_ordering.append(knob)
        # Append the constant (removed) knobs at the bottom of the ranking.
        final_ordering = np.append(final_ordering, removed_columns)
    with open(os.path.join(savedir, "featured_knobs.txt"), "w") as f:
        f.write("\n".join(final_ordering))