Esempio n. 1
0
    def initialize_models(self):
        """Fit the shared preprocessing objects and build per-workload states.

        For each directory in ``self.workload_dirs_`` the method loads
        ``X_data_enc.npz`` and ``y_data_enc.npz``, keeps only the featured
        knob / metric columns, caps the row count at ``self.MAX_SAMPLES``
        (rows picked via ``prep.get_shuffle_indices``) and, when
        ``prep.dummy_encoder_helper`` reports categorical knobs, dummy-encodes
        the knob matrix.  The scalers are fitted incrementally over all
        workloads, the metrics are then scaled, binned and re-scaled, and
        finally ``worker_create_model`` is invoked once per workload (through
        ``self.pool_`` when one is configured, otherwise sequentially).

        Attributes assigned on ``self``:
            dummy_encoder_: ``prep.DummyEncoder`` or ``None`` when
                ``n_values`` is empty.
            X_scaler_, y_scaler_: ``StandardScaler`` fitted with
                ``partial_fit`` across all workloads.
            y_binner_: ``prep.Bin(0, axis=0)`` fitted on the stacked, scaled
                metrics of every workload.
            y_gp_scaler_: ``StandardScaler`` fitted on the binned metrics.
            workload_states_: ``dict`` built from the worker results, so each
                result must be a ``(key, value)`` pair.

        Only single-argument ``print(...)`` calls and ``dict.items()`` are
        used, so the method parses and behaves the same on Python 2 and 3.
        """
        if self.verbose_:
            print("Initializing models for # knobs={}\n".format(
                self.featured_knobs_.size))
        with stopwatch("workload mapping model creation"):
            n_values, cat_indices, params = prep.dummy_encoder_helper(
                self.dbms_name, self.featured_knobs_)
            if n_values.size > 0:
                self.dummy_encoder_ = prep.DummyEncoder(n_values, cat_indices)
            else:
                self.dummy_encoder_ = None
            self.X_scaler_ = StandardScaler()
            self.y_scaler_ = StandardScaler()
            data_map = {}
            for i, wd in enumerate(self.workload_dirs_):
                # Load and filter data
                Xpath = os.path.join(wd, "X_data_enc.npz")
                ypath = os.path.join(wd, "y_data_enc.npz")
                X = Matrix.load_matrix(Xpath)
                y = Matrix.load_matrix(ypath)
                X = X.filter(self.featured_knobs_, "columns")
                y = y.filter(self.featured_metrics_, "columns")
                # Sanity checks: columns must come back in the featured
                # order and X/y must describe the same rows.
                assert np.array_equal(X.columnlabels, self.featured_knobs_)
                assert np.array_equal(y.columnlabels, self.featured_metrics_)
                assert np.array_equal(X.rowlabels, y.rowlabels)
                num_samples = X.shape[0]
                if num_samples > self.MAX_SAMPLES:
                    print("Shrinking {} samples to {}".format(
                        num_samples, self.MAX_SAMPLES))
                    # The same row indices are applied to X and y so the
                    # two matrices stay aligned.
                    rand_indices = prep.get_shuffle_indices(
                        num_samples)[:self.MAX_SAMPLES]
                    X = Matrix(X.data[rand_indices], X.rowlabels[rand_indices],
                               X.columnlabels)
                    y = Matrix(y.data[rand_indices], y.rowlabels[rand_indices],
                               y.columnlabels)
                num_samples = X.shape[0]
                assert num_samples <= self.MAX_SAMPLES
                assert num_samples == y.shape[0]

                # Dummy-code categorical knobs
                if self.dummy_encoder_ is not None:
                    if i == 0:
                        # Just need to fit this once
                        self.dummy_encoder_.fit(X.data,
                                                columnlabels=X.columnlabels)
                    X = Matrix(self.dummy_encoder_.transform(X.data),
                               X.rowlabels, self.dummy_encoder_.columnlabels)

                # Accumulate scaling statistics across workloads; the actual
                # transform happens after every workload has been seen.
                self.X_scaler_.partial_fit(X.data)
                self.y_scaler_.partial_fit(y.data)
                data_map[wd] = (X, y)

            if self.dummy_encoder_ is not None:
                # Fix X_scaler wrt categorical features
                prep.fix_scaler(self.X_scaler_, self.dummy_encoder_, params)

            # Scale X/y
            all_ys = []
            for wd, (X, y) in data_map.items():
                X.data = self.X_scaler_.transform(X.data)
                y.data = self.y_scaler_.transform(y.data)
                all_ys.append(y.data)

            # Concat all ys and fit the binner on them (deciles, per the
            # original author's note)
            all_ys = np.vstack(all_ys)
            self.y_binner_ = prep.Bin(0, axis=0)
            self.y_binner_.fit(all_ys)
            # Drop the stacked copy as soon as the binner is fitted.
            del all_ys

            # Bin y and fit the second scaler on the binned values
            self.y_gp_scaler_ = StandardScaler()
            for wd, (X, y) in data_map.items():
                y.data = self.y_binner_.transform(y.data)
                self.y_gp_scaler_.partial_fit(y.data)

            # Recenter y-values (separate pass: the scaler must have seen
            # every workload before it is applied)
            for wd, (X, y) in data_map.items():
                y.data = self.y_gp_scaler_.transform(y.data)

            njobs = len(data_map)
            iterable = [(i, wd, ws, njobs, self.verbose_)
                        for i, (wd, ws) in enumerate(data_map.items())]
            if self.pool_ is not None:
                res = self.pool_.map(worker_create_model, iterable)
            else:
                res = [worker_create_model(item) for item in iterable]
            self.workload_states_ = dict(res)
Esempio n. 2
0
    def map_workload(self, X_client, y_client):
        """Score every stored workload against the client data; return the best.

        The client matrices are filtered to the featured knobs / metrics,
        de-duplicated with ``get_unique_matrix`` and pushed through the
        preprocessing objects held on ``self`` (``dummy_encoder_``,
        ``X_scaler_``, ``y_binner_``, ``y_gp_scaler_``).  The metrics are
        scaled with a deep copy of ``self.y_scaler_`` that is additionally
        ``partial_fit`` on the client data, so ``self.y_scaler_`` itself is
        left untouched.  ``worker_score_workload`` is then run once per entry
        of ``self.workload_states_`` (through ``self.pool_`` when one is
        configured, otherwise sequentially).

        Args:
            X_client: matrix object of client knob data; must provide
                ``filter``, ``data`` and ``rowlabels``.
            y_client: matrix object of client metric data with the same
                interface.

        Returns:
            The first element of the worker result with the smallest score
            (results are sorted ascending on their second element).

        Raises:
            IndexError: if no worker results were produced (for example when
                ``self.workload_states_`` is empty).

        Only single-argument ``print(...)`` calls and ``dict.items()`` are
        used, so the method parses and behaves the same on Python 2 and 3.
        """
        with stopwatch("workload mapping - preprocessing"):
            # NOTE: an earlier revision re-created the models at this point
            # (through a TunerContext and self.initialize_models(), followed
            # by gc.collect()) whenever the tuner's featured knobs differed
            # from self.featured_knobs_, i.e. when incremental knob selection
            # was enabled. That path is currently disabled.

            # Filter by featured knobs & metrics
            X_client = X_client.filter(self.featured_knobs_, "columns")
            y_client = y_client.filter(self.featured_metrics_, "columns")

            # Generate unique X,y matrices
            X_client, y_client = get_unique_matrix(X_client, y_client)

            # Preprocessing steps
            if self.dummy_encoder_ is not None:
                X_client = Matrix(self.dummy_encoder_.transform(X_client.data),
                                  X_client.rowlabels,
                                  self.dummy_encoder_.columnlabels)
            X_client.data = self.X_scaler_.transform(X_client.data)

            # Create y_client scaler with prior and transform client data.
            # The copy's sample count is reset to 1 before the incremental
            # fit -- presumably so the stored statistics weigh as a single
            # observation against the client rows; TODO confirm.
            y_client_scaler = copy.deepcopy(self.y_scaler_)
            y_client_scaler.n_samples_seen_ = 1
            y_client_scaler.partial_fit(y_client.data)
            y_client.data = y_client_scaler.transform(y_client.data)

            # Bin and recenter client data
            y_client.data = self.y_binner_.transform(y_client.data)
            y_client.data = self.y_gp_scaler_.transform(y_client.data)

            # Build one work item per stored workload
            njobs = len(self.workload_states_)
            iterable = [
                (i, wd, ws, X_client, y_client, njobs, self.verbose_)
                for i, (wd, ws) in enumerate(self.workload_states_.items())
            ]

        with stopwatch("workload mapping - predictions"):
            # Compute workload scores, in parallel when a pool is available
            if self.pool_ is not None:
                wkld_scores = self.pool_.map(worker_score_workload, iterable)
            else:
                wkld_scores = [worker_score_workload(item)
                               for item in iterable]

        # Ascending by score: the best match ends up first.
        sorted_wkld_scores = sorted(wkld_scores, key=operator.itemgetter(1))

        print("")
        print("WORKLOAD SCORES")
        for wkld, score in sorted_wkld_scores:
            print("{0}: {1:.2f}".format(os.path.basename(wkld), score))

        return sorted_wkld_scores[0][0]
Esempio n. 3
0
def run_lasso(dbms,
              basepaths,
              savedir,
              featured_metrics,
              knobs_to_ignore,
              include_polynomial_features=True):
    """Compute a Lasso path over the knob data and save a knob ordering.

    The knob (``X_data_enc.npz``) and metric (``y_data_enc.npz``) matrices
    found under every entry of ``basepaths`` are stacked, constant columns
    and ignored knobs are dropped, categorical knobs are dummy-encoded, both
    matrices are standardized, polynomial features are optionally added and
    the rows are shuffled.  ``get_coef_range`` is then run on the result.

    Two files are written into ``savedir``:
        * ``lasso_path.npz`` with arrays ``alphas``, ``coefs``, ``feats``
          and ``metrics``;
        * ``featured_knobs.txt`` with one knob name per line: the ordering
          returned by ``Lasso.get_top_features`` (names containing ``#`` are
          cut at the first ``#`` and de-duplicated), followed by the
          constant knob columns that were removed before fitting.

    Args:
        dbms: identifier forwarded to ``dummy_encoder_helper``.
        basepaths: non-empty sequence of directories holding the two
            ``.npz`` matrices.
        savedir: directory receiving the output files.
        featured_metrics: metric column labels to keep in ``y``.
        knobs_to_ignore: knob column labels excluded from ``X``.
        include_polynomial_features: when true, expand ``X`` (data and
            column labels) with ``PolynomialFeatures``.

    Returns:
        None.

    Raises:
        AssertionError: if ``basepaths`` is empty, or if the row labels of
            ``X`` and ``y`` differ after shuffling.

    Only single-argument ``print(...)`` calls are used, so the function
    parses and prints the same text on Python 2 and Python 3.
    """
    import gc

    # Load matrices
    assert len(basepaths) > 0
    Xs = []
    ys = []

    with stopwatch("matrix concatenation"):
        for basepath in basepaths:
            X_path = os.path.join(basepath, "X_data_enc.npz")
            y_path = os.path.join(basepath, "y_data_enc.npz")

            Xs.append(Matrix.load_matrix(X_path))
            ys.append(
                Matrix.load_matrix(y_path).filter(featured_metrics, "columns"))

        # Combine matrix data if more than 1 matrix
        if len(Xs) > 1:
            X = Matrix.vstack(Xs, require_equal_columnlabels=True)
            y = Matrix.vstack(ys, require_equal_columnlabels=True)
        else:
            X = Xs[0]
            y = ys[0]
        # Release the per-directory lists before the heavier steps below.
        del Xs
        del ys
        gc.collect()

    with stopwatch("preprocessing"):
        # Filter out columns with near zero standard
        # deviation (i.e., constant columns)
        if y.shape[1] > 1:
            column_mask = ~stdev_zero(y.data, axis=0)
            filtered_columns = y.columnlabels[column_mask]
            y = y.filter(filtered_columns, 'columns')
        column_mask = ~stdev_zero(X.data, axis=0)
        # Remember the constant knobs: they are appended to the end of the
        # final ordering instead of being lost.
        removed_columns = X.columnlabels[~column_mask]
        print("removed columns = {}".format(removed_columns))
        filtered_columns = set(X.columnlabels[column_mask])
        filtered_columns -= set(knobs_to_ignore)
        filtered_columns = np.array(sorted(filtered_columns))
        X = X.filter(filtered_columns, 'columns')
        print("\ncolumnlabels: {}".format(X.columnlabels))

        # Dummy-code categorical features
        n_values, cat_feat_indices, _ = dummy_encoder_helper(
            dbms, X.columnlabels)
        if len(cat_feat_indices) > 0:
            encoder = DummyEncoder(n_values, cat_feat_indices)
            encoder.fit(X.data, columnlabels=X.columnlabels)
            X = Matrix(encoder.transform(X.data), X.rowlabels,
                       encoder.columnlabels)

        # Scale the data
        X_standardizer = StandardScaler()
        X.data = X_standardizer.fit_transform(X.data)
        y_standardizer = StandardScaler()
        y.data = y_standardizer.fit_transform(y.data)
        if include_polynomial_features:
            X_poly = PolynomialFeatures()
            X_data = X_poly.fit_transform(X.data)
            # The same transformer is applied to the (1 x n) label row so
            # that the expanded data gets matching column labels.
            X_columnlabels = np.expand_dims(np.array(X.columnlabels,
                                                     dtype=str),
                                            axis=0)
            X_columnlabels = X_poly.fit_transform(X_columnlabels).squeeze()
            X = Matrix(X_data, X.rowlabels, X_columnlabels)

        # Shuffle the data rows (experiments x metrics); y reuses the
        # shuffler fitted on X so both get the same permutation.
        shuffler = Shuffler(shuffle_rows=True, shuffle_columns=False)
        X = shuffler.fit_transform(X, copy=False)
        y = shuffler.transform(y, copy=False)
        assert np.array_equal(X.rowlabels, y.rowlabels)
        gc.collect()

    print("\nfeatured_metrics: {}".format(featured_metrics))

    with stopwatch("lasso paths"):
        # Fit the model to calculate the components
        alphas, coefs, _ = get_coef_range(X.data, y.data)
    # Save model
    np.savez(os.path.join(savedir, "lasso_path.npz"),
             alphas=alphas,
             coefs=coefs,
             feats=X.columnlabels,
             metrics=y.columnlabels)

    with stopwatch("lasso processing"):
        nfeats = X.columnlabels.shape[0]
        lasso = Lasso(alphas, X.columnlabels, coefs)
        print(lasso.get_top_summary(nfeats, ""))
        # Only the length of this list is reported; the ordering itself is
        # taken from the raw get_top_features() result below.
        feat_list = get_features_list(lasso.get_top_features(n=nfeats))
        print("\nfeat list length: {}".format(len(feat_list)))
        print("nfeats = {}".format(nfeats))
        top_knobs = lasso.get_top_features(nfeats)
        print(top_knobs)
        final_ordering = []
        for knob in top_knobs:
            if '#' in knob:
                # Keep only the part before the first '#', and list each
                # such base name once.
                knob = knob.split('#')[0]
                if knob in final_ordering:
                    continue
            final_ordering.append(knob)
        final_ordering = np.append(final_ordering, removed_columns)
    with open(os.path.join(savedir, "featured_knobs.txt"), "w") as f:
        f.write("\n".join(final_ordering))