예제 #1
0
def get_unique_matrix(X, y):
    """Collapse duplicate rows of ``X`` and merge the matching rows of ``y``.

    The unique rows of ``X`` are obtained from ``X.unique_rows``.  Each unique
    row is relabelled with a tuple of ``(columnlabel, value)`` pairs, and the
    same labels are assigned to the corresponding rows of ``y``.  When ``X``
    contained duplicates, the ``y`` row kept for a configuration is replaced
    by the column-wise median over all ``y`` rows sharing that configuration.

    Args:
        X: matrix exposing ``unique_rows``, ``data``, ``rowlabels`` and
            ``columnlabels``.
        y: matrix whose rows line up with the rows of ``X``.

    Returns:
        Tuple ``(X_unique, y_unique)`` sharing the same row labels.

    Raises:
        AssertionError: if the column labels change during deduplication, if
            two unique rows yield the same label, or if a duplicate row
            cannot be matched to a unique row.
    """
    X_unique, unique_indexes = X.unique_rows(return_index=True)
    assert np.array_equal(X_unique.columnlabels, X.columnlabels)
    y_unique = Matrix(y.data[unique_indexes], y.rowlabels[unique_indexes],
                      y.columnlabels)

    # Label every unique row and remember where each label lives, so that
    # duplicates can later be resolved with a dict lookup instead of a scan.
    rowlabels = np.empty_like(X_unique.rowlabels, dtype=object)
    label_to_index = {}
    for i, row in enumerate(X_unique.data):
        exp_label = tuple(zip(X_unique.columnlabels, row))
        assert exp_label not in label_to_index
        rowlabels[i] = exp_label
        label_to_index[exp_label] = i
    y_unique.rowlabels = rowlabels
    X_unique.rowlabels = rowlabels

    if X_unique.data.shape != X.data.shape:
        print("\n\nDIFF(num_knobs={}): X_unique: {}, X: {}\n\n".format(
            X_unique.columnlabels.shape[0], X_unique.data.shape,
            X.data.shape))
        # Row positions of X that were dropped as duplicates (ascending).
        dup_indexes = np.setdiff1d(np.arange(X.data.shape[0]),
                                   unique_indexes)
        dup_map = {}
        for dup_idx in dup_indexes:
            # Built exactly like the labels above so the lookup matches.
            dup_label = tuple(zip(X_unique.columnlabels, X.data[dup_idx]))
            assert dup_label in label_to_index
            primary_idx = label_to_index[dup_label]
            if primary_idx not in dup_map:
                # Seed the group with the y row that was kept.
                dup_map[primary_idx] = [y_unique.data[primary_idx]]
            dup_map[primary_idx].append(y.data[dup_idx])
        for idx, yvals in dup_map.items():
            y_unique.data[idx] = np.median(np.vstack(yvals), axis=0)
    return X_unique, y_unique
예제 #2
0
    def __init__(self, nonogram):
        """Set up the solver state for ``nonogram``.

        Every entry yielded by ``nonogram.rows()`` and ``nonogram.columns()``
        is passed through ``self.__getIndex``; the results are stored in
        ``self.row`` and ``self.column``.  An empty ``Solution`` and a
        ``Matrix`` of the same dimensions are then created.
        """
        self._nonogram = nonogram
        self._uid = 1000
        # The ids -1 and 0 are pre-registered and map to themselves.
        self._idToIndex = {-1: -1, 0: 0}

        to_index = self.__getIndex

        # Rows are converted before columns, matching the original call order.
        row_indexes = []
        for line in nonogram.rows():
            row_indexes.append([to_index(item) for item in line])
        self.row = row_indexes

        column_indexes = []
        for line in nonogram.columns():
            column_indexes.append([to_index(item) for item in line])
        self.column = column_indexes

        # shape() yields (y, x); the consumers below take x first.
        size_y, size_x = self.shape()
        self._sol = Solution(shape=(size_x, size_y))
        self.matrix = Matrix(x=size_x, y=size_y, default=SolutionCell())
예제 #3
0
파일: solution.py 프로젝트: zifter/nonogram
    def __init__(self, descr=None):
        """Initialise the solution state from a puzzle description.

        Args:
            descr: object exposing a ``matrix`` attribute, which is forwarded
                to ``SudokuDescr.__init__``.  When omitted, ``matrix=None``
                is forwarded instead of failing on ``None.matrix``.
        """
        matrix = descr.matrix if descr is not None else None
        SudokuDescr.__init__(self, matrix=matrix)
        self.steps = []  # ((x, y), value)
        x, y = self.matrix.shape
        self.probability = Matrix(
            x=x, y=y,
            default_func=lambda ix, iy: Probability(self.values()))
        self.pending = []  # ((x, y), value)
        self.random = []

        # Register every pre-filled (non-zero) cell as a step.
        indexes = set()
        for init in self.matrix:
            if init.v != 0:
                index = init.index()
                indexes.add(index)
                self.add_step(index, init.v)

        # Drop pending entries that target a pre-filled cell.
        # NOTE(review): presumably add_step() is what fills self.pending --
        # confirm against its implementation.
        self.pending = [item for item in self.pending
                        if item[0] not in indexes]
예제 #4
0
 def __init__(self, matrix=None):
     """Wrap ``matrix`` and derive the box shape and the set of values.

     The box dimensions are the truncated square roots of the matrix
     dimensions; the values are the integers 1..(x_size * y_size).

     Args:
         matrix: forwarded unchanged to ``Matrix(matrix=...)``.
     """
     self.matrix = Matrix(matrix=matrix)
     # int() truncates, so a non-square dimension is silently rounded down.
     y_size = int(math.sqrt(self.matrix.shape[0]))
     x_size = int(math.sqrt(self.matrix.shape[1]))
     self._box_shape = (x_size, y_size)
     self._values = set(range(1, x_size * y_size + 1))
예제 #5
0
    def map_workload(self, X_client, y_client):
        """Score the client's data against every stored workload.

        The client matrices are filtered to the featured knobs/metrics,
        deduplicated with ``get_unique_matrix`` and pushed through the same
        preprocessing objects that ``self`` holds (dummy encoder, scalers,
        binner).  ``worker_score_workload`` is then run once per entry of
        ``self.workload_states_``, either through ``self.pool_`` or serially.

        Args:
            X_client: client knob matrix.
            y_client: client metric matrix.

        Returns:
            The first element of the result with the lowest score, i.e. the
            workload identifier reported by ``worker_score_workload``.

        Raises:
            IndexError: if ``self.workload_states_`` is empty, because there
                is no score to return.
        """
        # NOTE: a step that re-created the models when the tuner's featured
        # knobs changed (incremental knob selection) was commented out here
        # and is currently disabled.

        with stopwatch("workload mapping - preprocessing"):
            # Filter by featured knobs & metrics
            X_client = X_client.filter(self.featured_knobs_, "columns")
            y_client = y_client.filter(self.featured_metrics_, "columns")

            # Generate unique X,y matrices
            X_client, y_client = get_unique_matrix(X_client, y_client)

            # Preprocessing steps
            if self.dummy_encoder_ is not None:
                X_client = Matrix(self.dummy_encoder_.transform(X_client.data),
                                  X_client.rowlabels,
                                  self.dummy_encoder_.columnlabels)
            X_client.data = self.X_scaler_.transform(X_client.data)

            # Create y_client scaler with prior and transform client data.
            # NOTE(review): resetting n_samples_seen_ to 1 presumably makes
            # the prior statistics count as a single sample -- confirm.
            y_client_scaler = copy.deepcopy(self.y_scaler_)
            y_client_scaler.n_samples_seen_ = 1
            y_client_scaler.partial_fit(y_client.data)
            y_client.data = y_client_scaler.transform(y_client.data)

            # Bin and recenter client data
            y_client.data = self.y_binner_.transform(y_client.data)
            y_client.data = self.y_gp_scaler_.transform(y_client.data)

            # Build one work item per stored workload
            njobs = len(self.workload_states_)
            iterable = [
                (i, wd, ws, X_client, y_client, njobs, self.verbose_)
                for i, (wd, ws) in enumerate(self.workload_states_.items())
            ]

        with stopwatch("workload mapping - predictions"):
            if self.pool_ is not None:
                wkld_scores = self.pool_.map(worker_score_workload, iterable)
            else:
                wkld_scores = [worker_score_workload(item)
                               for item in iterable]

        # Ascending by score: the first entry is the one returned.
        sorted_wkld_scores = sorted(wkld_scores, key=operator.itemgetter(1))

        print("")
        print("WORKLOAD SCORES")
        for wkld, score in sorted_wkld_scores:
            print("{0}: {1:.2f}".format(os.path.basename(wkld), score))

        return sorted_wkld_scores[0][0]
예제 #6
0
    def initialize_models(self):
        """Load all workload data, fit the preprocessing and build the models.

        For every directory in ``self.workload_dirs_`` the encoded X/y
        matrices are loaded, filtered to the featured knobs/metrics, capped at
        ``self.MAX_SAMPLES`` rows and (optionally) dummy-encoded.  The X/y
        scalers are fitted incrementally over all workloads, after which the
        y data is scaled, binned and rescaled.  Finally
        ``worker_create_model`` is run per workload and the results are
        stored in ``self.workload_states_``.

        Sets ``dummy_encoder_``, ``X_scaler_``, ``y_scaler_``, ``y_binner_``,
        ``y_gp_scaler_`` and ``workload_states_`` on ``self``.

        Raises:
            AssertionError: if the loaded matrices do not line up with the
                featured knobs/metrics or with each other.
        """
        if self.verbose_:
            print("Initializing models for # knobs={}\n".format(
                self.featured_knobs_.size))
        with stopwatch("workload mapping model creation"):
            n_values, cat_indices, params = prep.dummy_encoder_helper(
                self.dbms_name, self.featured_knobs_)
            if n_values.size > 0:
                self.dummy_encoder_ = prep.DummyEncoder(n_values, cat_indices)
            else:
                self.dummy_encoder_ = None
            self.X_scaler_ = StandardScaler()
            self.y_scaler_ = StandardScaler()
            data_map = {}
            for i, wd in enumerate(self.workload_dirs_):
                # Load and filter data
                Xpath = os.path.join(wd, "X_data_enc.npz")
                ypath = os.path.join(wd, "y_data_enc.npz")
                X = Matrix.load_matrix(Xpath)
                y = Matrix.load_matrix(ypath)
                X = X.filter(self.featured_knobs_, "columns")
                y = y.filter(self.featured_metrics_, "columns")
                assert np.array_equal(X.columnlabels, self.featured_knobs_)
                assert np.array_equal(y.columnlabels, self.featured_metrics_)
                assert np.array_equal(X.rowlabels, y.rowlabels)

                # Cap the number of samples, picking the same shuffled rows
                # from X and y so they stay aligned.
                num_samples = X.shape[0]
                if num_samples > self.MAX_SAMPLES:
                    print("Shrinking {} samples to {}".format(
                        num_samples, self.MAX_SAMPLES))
                    rand_indices = prep.get_shuffle_indices(
                        num_samples)[:self.MAX_SAMPLES]
                    X = Matrix(X.data[rand_indices], X.rowlabels[rand_indices],
                               X.columnlabels)
                    y = Matrix(y.data[rand_indices], y.rowlabels[rand_indices],
                               y.columnlabels)
                num_samples = X.shape[0]
                assert num_samples <= self.MAX_SAMPLES
                assert num_samples == y.shape[0]

                # Dummy-code categorical knobs
                if self.dummy_encoder_ is not None:
                    if i == 0:
                        # Just need to fit this once
                        self.dummy_encoder_.fit(X.data,
                                                columnlabels=X.columnlabels)
                    X = Matrix(self.dummy_encoder_.transform(X.data),
                               X.rowlabels, self.dummy_encoder_.columnlabels)

                self.X_scaler_.partial_fit(X.data)
                self.y_scaler_.partial_fit(y.data)
                data_map[wd] = (X, y)

            if self.dummy_encoder_ is not None:
                # Fix X_scaler wrt categorical features
                prep.fix_scaler(self.X_scaler_, self.dummy_encoder_, params)

            # Scale X/y
            all_ys = []
            for X, y in data_map.values():
                X.data = self.X_scaler_.transform(X.data)
                y.data = self.y_scaler_.transform(y.data)
                all_ys.append(y.data)

            # Concat all ys and compute deciles
            all_ys = np.vstack(all_ys)
            self.y_binner_ = prep.Bin(0, axis=0)
            self.y_binner_.fit(all_ys)
            del all_ys

            # Bin y by deciles and fit scaler.  The scaler must see every
            # workload before any of them is transformed, hence two passes.
            self.y_gp_scaler_ = StandardScaler()
            for _, y in data_map.values():
                y.data = self.y_binner_.transform(y.data)
                self.y_gp_scaler_.partial_fit(y.data)

            # Recenter y-values
            for _, y in data_map.values():
                y.data = self.y_gp_scaler_.transform(y.data)

            # One work item per workload; ``ws`` is that workload's (X, y).
            njobs = len(data_map)
            iterable = [(i, wd, ws, njobs, self.verbose_)
                        for i, (wd, ws) in enumerate(data_map.items())]
            if self.pool_ is not None:
                res = self.pool_.map(worker_create_model, iterable)
            else:
                res = [worker_create_model(item) for item in iterable]
            self.workload_states_ = dict(res)
예제 #7
0
def run_lasso(dbms,
              basepaths,
              savedir,
              featured_metrics,
              knobs_to_ignore,
              include_polynomial_features=True):
    """Compute lasso paths over the knob data and save a knob ordering.

    Loads ``X_data_enc.npz`` / ``y_data_enc.npz`` from every base path, stacks
    them, removes constant columns and ignored knobs, dummy-encodes
    categorical features, standardizes the data and (optionally) expands it
    with polynomial features.  The lasso path is written to
    ``<savedir>/lasso_path.npz`` and the resulting knob ordering to
    ``<savedir>/featured_knobs.txt``.

    Args:
        dbms: passed to ``dummy_encoder_helper`` to describe the knobs.
        basepaths: non-empty sequence of directories holding the matrices.
        savedir: directory the two output files are written to.
        featured_metrics: column labels to keep in the y matrices.
        knobs_to_ignore: column labels to drop from X.
        include_polynomial_features: when true, expand X with
            ``PolynomialFeatures`` before fitting.

    Raises:
        AssertionError: if ``basepaths`` is empty or the shuffled X and y
            row labels do not match.
    """
    import gc

    # Load matrices
    assert len(basepaths) > 0
    Xs = []
    ys = []

    with stopwatch("matrix concatenation"):
        for basepath in basepaths:
            X_path = os.path.join(basepath, "X_data_enc.npz")
            y_path = os.path.join(basepath, "y_data_enc.npz")

            Xs.append(Matrix.load_matrix(X_path))
            ys.append(
                Matrix.load_matrix(y_path).filter(featured_metrics, "columns"))

        # Combine matrix data if more than 1 matrix
        if len(Xs) > 1:
            X = Matrix.vstack(Xs, require_equal_columnlabels=True)
            y = Matrix.vstack(ys, require_equal_columnlabels=True)
        else:
            X = Xs[0]
            y = ys[0]
        # Release the per-path matrices before the heavy processing below.
        del Xs
        del ys
        gc.collect()

    with stopwatch("preprocessing"):
        # Filter out columns with near zero standard
        # deviation (i.e., constant columns)
        if y.shape[1] > 1:
            column_mask = ~stdev_zero(y.data, axis=0)
            filtered_columns = y.columnlabels[column_mask]
            y = y.filter(filtered_columns, 'columns')
        column_mask = ~stdev_zero(X.data, axis=0)
        removed_columns = X.columnlabels[~column_mask]
        print("removed columns = {}".format(removed_columns))
        filtered_columns = set(X.columnlabels[column_mask])
        filtered_columns -= set(knobs_to_ignore)
        filtered_columns = np.array(sorted(filtered_columns))
        X = X.filter(filtered_columns, 'columns')
        print("\ncolumnlabels: {}".format(X.columnlabels))

        # Dummy-code categorical features
        n_values, cat_feat_indices, _ = dummy_encoder_helper(
            dbms, X.columnlabels)
        if len(cat_feat_indices) > 0:
            encoder = DummyEncoder(n_values, cat_feat_indices)
            encoder.fit(X.data, columnlabels=X.columnlabels)
            X = Matrix(encoder.transform(X.data), X.rowlabels,
                       encoder.columnlabels)

        # Scale the data
        X_standardizer = StandardScaler()
        X.data = X_standardizer.fit_transform(X.data)
        y_standardizer = StandardScaler()
        y.data = y_standardizer.fit_transform(y.data)
        if include_polynomial_features:
            X_poly = PolynomialFeatures()
            X_data = X_poly.fit_transform(X.data)
            # The same transformer is applied to a single row holding the
            # column labels to obtain labels for the expanded columns.
            X_columnlabels = np.expand_dims(np.array(X.columnlabels,
                                                     dtype=str),
                                            axis=0)
            X_columnlabels = X_poly.fit_transform(X_columnlabels).squeeze()
            X = Matrix(X_data, X.rowlabels, X_columnlabels)

        # Shuffle the data rows (experiments x metrics)
        shuffler = Shuffler(shuffle_rows=True, shuffle_columns=False)
        X = shuffler.fit_transform(X, copy=False)
        y = shuffler.transform(y, copy=False)
        assert np.array_equal(X.rowlabels, y.rowlabels)
        gc.collect()

    print("\nfeatured_metrics: {}".format(featured_metrics))

    with stopwatch("lasso paths"):
        # Fit the model to calculate the components
        alphas, coefs, _ = get_coef_range(X.data, y.data)
    # Save model
    np.savez(os.path.join(savedir, "lasso_path.npz"),
             alphas=alphas,
             coefs=coefs,
             feats=X.columnlabels,
             metrics=y.columnlabels)

    with stopwatch("lasso processing"):
        nfeats = X.columnlabels.shape[0]
        lasso = Lasso(alphas, X.columnlabels, coefs)
        print(lasso.get_top_summary(nfeats, ""))
        # Only the length of this list is reported.
        feat_list = get_features_list(lasso.get_top_features(n=nfeats))
        print("\nfeat list length: {}".format(len(feat_list)))
        print("nfeats = {}".format(nfeats))
        top_knobs = lasso.get_top_features(nfeats)
        print(top_knobs)
        final_ordering = []
        for knob in top_knobs:
            if '#' in knob:
                # Keep only the part before '#', and only its first
                # occurrence in the ordering.
                knob = knob.split('#')[0]
                if knob in final_ordering:
                    continue
            final_ordering.append(knob)
        # Columns removed as constant are appended after the ranked knobs.
        final_ordering = np.append(final_ordering, removed_columns)
    with open(os.path.join(savedir, "featured_knobs.txt"), "w") as f:
        f.write("\n".join(final_ordering))