Exemple #1
0
def get_unique_matrix(X, y):
    """Collapse duplicate rows of ``X`` and aggregate the matching ``y`` rows.

    ``X.unique_rows(return_index=True)`` supplies the de-duplicated matrix
    and the indexes of the rows that were kept. The same indexes select the
    rows of ``y``. Both returned matrices get new row labels: each label is
    a tuple of ``(column label, value)`` pairs describing the row of
    ``X_unique``.

    When ``X`` contained duplicates, the ``y`` row of every kept
    configuration is replaced by the element-wise median of its own ``y``
    row and the ``y`` rows of all of its duplicates.

    Args:
        X: matrix object exposing ``unique_rows``, ``data``, ``rowlabels``
            and ``columnlabels``.
        y: matrix object whose rows correspond one-to-one to the rows of
            ``X``.

    Returns:
        The pair ``(X_unique, y_unique)``.
    """
    X_unique, unique_indexes = X.unique_rows(return_index=True)
    assert np.array_equal(X_unique.columnlabels, X.columnlabels)
    y_unique = Matrix(y.data[unique_indexes], y.rowlabels[unique_indexes],
                      y.columnlabels)

    # Label each unique row and remember its position, so that duplicates
    # can be matched with one dictionary lookup instead of a scan over all
    # row labels.
    rowlabels = np.empty_like(X_unique.rowlabels, dtype=object)
    label_to_index = {}
    for i, row in enumerate(X_unique.data):
        exp_label = tuple((l, r) for l, r in zip(X_unique.columnlabels, row))
        assert exp_label not in label_to_index
        rowlabels[i] = exp_label
        label_to_index[exp_label] = i
    y_unique.rowlabels = rowlabels
    X_unique.rowlabels = rowlabels

    if X_unique.data.shape != X.data.shape:
        print("\n\nDIFF(num_knobs={}): X_unique: {}, X: {}\n\n".format(
            X_unique.columnlabels.shape[0], X_unique.data.shape, X.data.shape))
        # A set gives O(1) membership tests; `in` on a numpy array compares
        # against every element.
        kept_indexes = set(np.asarray(unique_indexes).tolist())
        dup_map = {}
        for dup_idx in range(X.data.shape[0]):
            if dup_idx in kept_indexes:
                continue
            dup_label = tuple((u'' + l, r) for l, r in
                              zip(X_unique.columnlabels, X.data[dup_idx]))
            # Every dropped row must be identical to exactly one kept row.
            assert dup_label in label_to_index
            primary_idx = label_to_index[dup_label]
            if primary_idx not in dup_map:
                # Seed the group with the y row of the kept configuration.
                dup_map[primary_idx] = [y_unique.data[primary_idx]]
            dup_map[primary_idx].append(y.data[dup_idx])
        for idx, yvals in dup_map.items():
            y_unique.data[idx] = np.median(np.vstack(yvals), axis=0)
    return X_unique, y_unique
Exemple #2
0
class SolutionStep(object):
    """Working state for one step of solving a nonogram.

    Holds the row and column layouts (the nonogram's values wrapped in
    ``LayoutIndex`` objects), a ``Matrix`` of ``SolutionCell`` objects and a
    ``Solution`` that receives the color of every item placed.
    """

    # Positions used to index a cell for the row-wise and column-wise value.
    ROW = 0
    COL = 1

    def __init__(self, nonogram):
        """Build the layouts, the solution and the cell matrix.

        Args:
            nonogram: object providing ``rows()`` and ``columns()``, each
                yielding iterables of layout values.
        """
        self._uid = 1000
        self._idToIndex = {}
        self._nonogram = nonogram

        # The ids -1 and 0 resolve to themselves rather than to a
        # LayoutIndex (presumably sentinel ids -- confirm with LayoutIndex).
        self._idToIndex[-1] = -1
        self._idToIndex[0] = 0

        self.row = [[self.__getIndex(x) for x in r] for r in nonogram.rows()]
        self.column = [[self.__getIndex(x) for x in c]
                       for c in nonogram.columns()]

        # shape() returns (len(self.column), len(self.row)); the first value
        # is bound to y_size and the second to x_size.
        y_size, x_size = self.shape()
        self._sol = Solution(shape=(x_size, y_size))
        # NOTE(review): a single SolutionCell instance is passed as the
        # default; whether Matrix copies it per cell is not visible here.
        self.matrix = Matrix(x=x_size, y=y_size, default=SolutionCell())

    def __getIndex(self, value):
        """Wrap ``value`` in a LayoutIndex and register it under its id."""
        self._uid = self._uid + 1
        index = LayoutIndex(value)
        self._idToIndex[index.id()] = index

        return index

    def index(self, uid):
        """Return the object registered under ``int(uid)``.

        Raises:
            KeyError: if no object was registered under that id.
        """
        return self._idToIndex[int(uid)]

    def shape(self):
        """Return ``(number of column layouts, number of row layouts)``."""
        return (len(self.column), len(self.row))

    def solution(self):
        """Return the ``Solution`` updated by ``set_row`` and ``set_col``."""
        return self._sol

    def row_layout(self, i):
        """Return the list of layout indexes of row ``i``."""
        return self.row[i]

    def column_layout(self, i):
        """Return the list of layout indexes of column ``i``."""
        return self.column[i]

    def row_lineup(self, i):
        """Return the row-wise value of every cell in matrix row ``i``."""
        return [x.v[self.ROW] for x in self.matrix.row(i)]

    def col_lineup(self, i):
        """Return the column-wise value of every cell in matrix column ``i``."""
        return [x.v[self.COL] for x in self.matrix.column(i)]

    def set_row(self, i, j, item):
        """Store ``item`` as the row-wise value of cell ``j`` in row ``i``.

        The item's color is also written to the solution at ``(i, j)``.
        """
        self.matrix.row(i)[j][self.ROW] = item
        self._sol.item(i, j).v = item.color()

    def set_col(self, i, j, item):
        """Store ``item`` as the column-wise value of cell ``j`` in column ``i``.

        The item's color is also written to the solution at ``(j, i)``.
        """
        # Mirror set_row: pick cell j of column i, then set its COL slot.
        # The previous code indexed the column itself with COL, ignoring j.
        self.matrix.column(i)[j][self.COL] = item
        self._sol.item(j, i).v = item.color()
Exemple #3
0
    def __init__(self, nonogram):
        """Set up layouts, solution and cell matrix for ``nonogram``.

        ``nonogram.rows()`` and ``nonogram.columns()`` are iterated once
        each; every value they yield is passed through ``__getIndex``.
        """
        self._uid = 1000
        # The ids -1 and 0 are pre-registered and map to themselves.
        self._idToIndex = {-1: -1, 0: 0}
        self._nonogram = nonogram

        row_layouts = []
        for row_values in nonogram.rows():
            row_layouts.append([self.__getIndex(v) for v in row_values])
        self.row = row_layouts

        column_layouts = []
        for column_values in nonogram.columns():
            column_layouts.append([self.__getIndex(v) for v in column_values])
        self.column = column_layouts

        # shape() is unpacked as (y, x); the solution and the matrix are
        # then created with the two extents in (x, y) order.
        y_extent, x_extent = self.shape()
        self._sol = Solution(shape=(x_extent, y_extent))
        self.matrix = Matrix(x=x_extent, y=y_extent, default=SolutionCell())
Exemple #4
0
    def test_ctor(self):
        """A matrix built by from_shape(3, 5) has 5 rows of 3 and 3 columns of 5 None values."""
        m = Matrix.from_shape(3, 5)

        row_count, column_count = 5, 3

        for row_no in range(row_count):
            self.assertEqual(m.row(row_no), [None] * column_count)

        for column_no in range(column_count):
            self.assertEqual(m.column(column_no), [None] * row_count)
Exemple #5
0
    def __init__(self, descr=None):
        """Start a solution from the puzzle description ``descr``.

        Every non-zero cell of the matrix is replayed through ``add_step``.
        Pending entries that target one of those given cells are dropped
        afterwards.
        """
        SudokuDescr.__init__(self, matrix=descr.matrix)
        self.steps = []  # ((x, y), value)
        width, height = self.matrix.shape
        self.probability = Matrix(x=width, y=height, default_func=lambda ix, iy: Probability(self.values()))
        self.pending = []  # (x, y), value
        self.random = []

        given = set()
        for cell in self.matrix:
            if cell.v == 0:
                continue
            given.add(cell.index())
            self.add_step(cell.index(), cell.v)

        # Keep only the pending entries whose index is not a given cell.
        self.pending = [entry for entry in self.pending
                        if entry[0] not in given]
Exemple #6
0
    def test_create_from_matrix(self):
        """from_matrix exposes the nested lists through row() and column()."""
        m = Matrix.from_matrix([[1, 2], [3, 4], [5, 6]])

        expected_rows = [[1, 2], [3, 4], [5, 6]]
        for row_no, expected in enumerate(expected_rows):
            self.assertEqual(m.row(row_no), expected)

        expected_columns = [[1, 3, 5], [2, 4, 6]]
        for column_no, expected in enumerate(expected_columns):
            self.assertEqual(m.column(column_no), expected)
Exemple #7
0
 def __init__(self, matrix=None):
     """Wrap ``matrix`` in a Matrix and derive box shape and value set.

     Each box side is the integer square root of the matching matrix
     dimension. The allowed values are 1 .. x_size * y_size.
     """
     self.matrix = Matrix(matrix=matrix)
     dims = self.matrix.shape
     y_size = int(math.sqrt(dims[0]))
     x_size = int(math.sqrt(dims[1]))
     self._box_shape = (x_size, y_size)
     self._values = set(range(1, x_size * y_size + 1))
Exemple #8
0
def _flip_horizontally(matrix):
    """Return a new Matrix holding the rows of ``matrix`` in reverse order.

    Row ``i`` of the result is row ``num_rows - 1 - i`` of the input; the
    order of elements within each row is unchanged.
    """
    m = Matrix(matrix.num_rows(), matrix.num_columns())
    for target_row in range(m.num_rows()):
        for col in range(m.num_columns()):
            source_row = m.num_rows() - 1 - target_row
            m[target_row][col] = matrix[source_row][col]
    return m
def to_matrix(matrix_as_lists):
    """Copy a list of equally long lists into a new Matrix.

    The Matrix is created with ``len(matrix_as_lists)`` rows and as many
    columns as the first inner list has elements.
    """
    row_total = len(matrix_as_lists)
    column_total = len(matrix_as_lists[0])
    result = Matrix(row_total, column_total)
    for r in range(result.num_rows()):
        for c in range(result.num_columns()):
            result[r][c] = matrix_as_lists[r][c]
    return result
Exemple #10
0
 def __init__(self, matrix=None, shape=None):
     """Create the backing matrix from a shape or from nested lists.

     Args:
         matrix: nested lists handed to ``Matrix.from_matrix`` when no
             shape is given. Omitting it (or passing None) is the same as
             passing an empty list.
         shape: optional ``(x, y)`` pair. When truthy, a matrix of that
             shape filled with -1 is created and ``matrix`` is ignored.
     """
     if shape:
         x, y = shape
         self.matrix = Matrix.from_shape(x=x, y=y, default=-1)
     else:
         # A fresh list per call: a `[]` default would be one list object
         # shared by every instance created without arguments.
         self.matrix = Matrix.from_matrix([] if matrix is None else matrix)
Exemple #11
0
    def test_change_value_in_column(self):
        """A value written through column() is visible through row()."""
        m = Matrix.from_shape(3, 5)

        marker = "new_value"
        cell_via_column = m.column(1)[2]
        cell_via_column.value = marker

        cell_via_row = m.row(2)[1]
        self.assertEqual(cell_via_row.value, marker)
def run_factor_analysis(paths, savedir, cluster_range, algorithms):
    """Cluster metrics with factor analysis + k-means and save the results.

    Loads ``y_data_enc.npz`` from every directory in ``paths``, stacks the
    matrices, drops constant columns, standardizes and shuffles the rows,
    fits a ``FactorAnalysis`` model and clusters the (metric x factor)
    loadings with ``KMeans_``. For every cluster count in
    ``kmeans.cluster_map_`` two text files are written to ``savedir``:
    ``featured_metrics_<k>.txt`` and ``membership_<k>.txt``. Plots are
    produced through ``kmeans.plot_results`` and ``kselection.plot_results``.

    Args:
        paths: non-empty sequence of directories containing
            ``y_data_enc.npz``.
        savedir: directory that receives the output files.
        cluster_range: pair indexed as ``[0]`` and ``[1]``; the upper bound
            is lowered to (number of metrics + 1) when it exceeds that.
        algorithms: iterable of algorithm identifiers handed to
            ``KSelection.new``.

    Returns:
        None. Results are printed and written to ``savedir``.
    """
    import gc

    # Load matrices
    assert len(paths) > 0
    matrices = []

    with stopwatch("matrix concatenation"):
        for path in paths:
            matrices.append(
                Matrix.load_matrix(os.path.join(path, "y_data_enc.npz")))
        # Combine matrix data if more than 1 matrix
        if len(matrices) > 1:
            matrix = Matrix.vstack(matrices, require_equal_columnlabels=True)
        else:
            matrix = matrices[0]
        # Drop the list of inputs so the individual matrices can be freed.
        del matrices
        gc.collect()

    with stopwatch("preprocessing"):
        # Filter out columns with near zero standard deviation
        # i.e., constant columns
        column_mask = ~stdev_zero(matrix.data, axis=0)
        filtered_columns = matrix.columnlabels[column_mask]
        matrix = matrix.filter(filtered_columns, 'columns')
        print "matrix shape after filter constant: ", matrix.data.shape

        # Scale the data
        standardizer = StandardScaler()
        matrix.data = standardizer.fit_transform(matrix.data)

        # Shuffle the data rows (experiments x metrics)
        # NOTE(review): only matrix.data is reordered here; matrix.rowlabels
        # is not touched in this function.
        exp_shuffle_indices = get_shuffle_indices(matrix.data.shape[0])
        matrix.data = matrix.data[exp_shuffle_indices]

        # Shrink the cluster range if # metrics < max # clusters
        max_clusters = matrix.data.shape[1] + 1
        if max_clusters < cluster_range[1]:
            cluster_range = (cluster_range[0], max_clusters)

    with stopwatch("factor analysis"):
        # Fit the model to calculate the components
        fa = FactorAnalysis()
        fa.fit(matrix.data)
    # Keep only the factors that have at least one non-zero loading.
    fa_mask = np.sum(fa.components_ != 0.0, axis=1) > 0.0
    # Per kept factor: sum of absolute loadings, used below as its weight.
    variances = np.sum(np.abs(fa.components_[fa_mask]), axis=1)
    total_variance = np.sum(variances).squeeze()
    print "total variance: {}".format(total_variance)
    # Cumulative share (in percent) of the total after each factor.
    var_exp = np.array([np.sum(variances[:i+1]) / total_variance * 100 \
                        for i in range(variances.shape[0])])
    # Number of leading factors whose cumulative share is still below the
    # threshold, plus one; capped at 10.
    factor_cutoff = np.count_nonzero(var_exp < REQUIRED_VARIANCE_EXPLAINED) + 1
    factor_cutoff = min(factor_cutoff, 10)
    print "factor cutoff: {}".format(factor_cutoff)
    for i, var in enumerate(variances):
        print i, var, np.sum(
            variances[:i + 1]), np.sum(variances[:i + 1]) / total_variance

    # NOTE(review): the cutoff was derived from the masked components but
    # the slice below is taken from the unmasked fa.components_.
    components = np.transpose(fa.components_[:factor_cutoff]).copy()
    print "components shape: {}".format(components.shape)
    standardizer = StandardScaler()
    components = standardizer.fit_transform(components)

    # Shuffle factor analysis matrix rows (metrics x factors)
    metric_shuffle_indices = get_shuffle_indices(components.shape[0])
    components = components[metric_shuffle_indices]
    # Reorder the labels with the same indices so they stay aligned.
    component_columnlabels = matrix.columnlabels[metric_shuffle_indices].copy()

    kmeans = KMeans_(components, cluster_range)
    kmeans.plot_results(savedir, components, component_columnlabels)

    # Compute optimal number of clusters K
    for algorithm in algorithms:
        with stopwatch("compute {} (factors={})".format(
                algorithm, factor_cutoff)):
            kselection = KSelection.new(components, cluster_range,
                                        kmeans.cluster_map_, algorithm)
        print "{} optimal # of clusters: {}".format(
            algorithm, kselection.optimal_num_clusters_)
        kselection.plot_results(savedir)

    # Both dicts are keyed by the number of clusters.
    metric_clusters = {}
    featured_metrics = {}
    for n_clusters, (cluster_centers, labels,
                     _) in kmeans.cluster_map_.iteritems():

        # For each cluster, calculate the distances of each metric from the
        # cluster center. We use the metric closest to the cluster center.
        mclusters = []
        mfeat_list = []
        for i in range(n_clusters):
            metric_labels = component_columnlabels[labels == i]
            component_rows = components[labels == i]
            centroid = np.expand_dims(cluster_centers[i], axis=0)
            dists = np.empty(component_rows.shape[0])
            for j, row in enumerate(component_rows):
                row = np.expand_dims(row, axis=0)
                dists[j] = cdist(row, centroid, 'euclidean').squeeze()
            # Sort the cluster members by increasing distance to the center.
            order_by = np.argsort(dists)
            metric_labels = metric_labels[order_by]
            dists = dists[order_by]
            mclusters.append((i, metric_labels, dists))
            assert len(OPT_METRICS) > 0
            # Mark the members that are listed in OPT_METRICS.
            label_mask = np.zeros(metric_labels.shape[0])
            for opt_metric in OPT_METRICS:
                label_mask = np.logical_or(label_mask,
                                           metric_labels == opt_metric)
            # Metrics from OPT_METRICS take precedence; otherwise the member
            # closest to the center represents the cluster. Empty clusters
            # contribute nothing.
            if np.count_nonzero(label_mask) > 0:
                mfeat_list.extend(metric_labels[label_mask].tolist())
            elif len(metric_labels) > 0:
                mfeat_list.append(metric_labels[0])
        metric_clusters[n_clusters] = mclusters
        featured_metrics[n_clusters] = mfeat_list

    # One file per cluster count with the sorted featured metric names.
    for n_clusters, mlist in sorted(featured_metrics.iteritems()):
        savepath = os.path.join(savedir,
                                "featured_metrics_{}.txt".format(n_clusters))
        with open(savepath, "w") as f:
            f.write("\n".join(sorted(mlist)))

    # One file per cluster count listing every cluster's members together
    # with their distance to the cluster center.
    for n_clusters, memberships in sorted(metric_clusters.iteritems()):
        cstr = ""
        for i, (cnum, lab, dist) in enumerate(memberships):
            assert i == cnum
            cstr += "---------------------------------------------\n"
            cstr += "CLUSTERS {}\n".format(i)
            cstr += "---------------------------------------------\n\n"

            for l, d in zip(lab, dist):
                cstr += "{}\t({})\n".format(l, d)
            cstr += "\n\n"

        savepath = os.path.join(savedir,
                                "membership_{}.txt".format(n_clusters))
        with open(savepath, "w") as f:
            f.write(cstr)
Exemple #13
0
class Solution(SudokuDescr):
    """Mutable solving state for a sudoku description.

    Tracks the applied steps, a matrix of per-cell ``Probability`` objects,
    the queue of pending assignments and the stack of random choices that
    can be rolled back.
    """

    def __init__(self, descr=None):
        """Start from ``descr`` and replay every non-zero cell as a step."""
        SudokuDescr.__init__(self, matrix=descr.matrix)
        self.steps = []  # ((x, y), value)
        width, height = self.matrix.shape
        self.probability = Matrix(x=width, y=height, default_func=lambda ix, iy: Probability(self.values()))
        self.pending = []  # (x, y), value
        self.random = []

        given = set()
        for cell in self.matrix:
            if cell.v == 0:
                continue
            given.add(cell.index())
            self.add_step(cell.index(), cell.v)

        # Pending entries that point at a given cell are discarded.
        self.pending = [entry for entry in self.pending
                        if entry[0] not in given]

    def probability_line(self, index):
        """Return the box, row and column of probability cells for ``index``."""
        box = SolverMethod.belonged_box(index, self.probability, self.box_shape())
        row = self.probability.row(index[1])
        column = self.probability.column(index[0])
        return [box, row, column]

    def add_step(self, index, value):
        """Assign ``value`` at ``index`` and erase it from the related cells.

        Every other cell for which ``erase`` reports True is queued as a
        pending assignment.
        """
        for line in self.probability_line(index):
            for cell in line:
                if cell.index() != index:
                    if cell.v.erase(value, len(self.steps)):
                        self.add_pending(cell.index(), cell.v.value())

        self.probability.item_i(index).v.set(value, len(self.steps))
        self.steps.append((index, value))
        self.matrix.item_i(index).v = value

    def add_pending(self, index, value):
        """Queue the assignment ``(index, value)``."""
        self.pending.append((index, value))

    def add_random_choice(self, index, values):
        """Take one value from a copy of ``values`` and apply it at ``index``.

        The remaining values and the current step count are pushed on the
        random stack so that the choice can be rolled back.
        """
        remaining = copy(values)
        chosen = remaining.pop()
        self.random.append((index,  remaining, len(self.steps)))
        self.add_step(index, chosen)

    def rollback_random(self):
        """Undo the latest random choice that still has untried values."""
        # revert to 0 in matrix
        choice = self.random.pop()
        while not choice[1]:
            choice = self.random.pop()
        index, remaining, revision = choice[0], choice[1], choice[2]

        # Undo steps until the step count recorded with the choice is reached.
        while revision != len(self.steps):
            undone = self.steps.pop()
            self.matrix.item_i(undone[0]).v = 0

        for cell in self.probability:
            cell.v.rollback(revision)

        self.pending = []
        self.add_random_choice(index, remaining)

    def process_pending(self):
        """Apply pending assignments, most recently queued first, until none remain."""
        while self.pending:
            index, value = self.pending.pop()
            self.add_step(index, value)

    def is_done(self):
        """Return True when the number of steps equals the number of cells."""
        width, height = self.matrix.shape
        return width * height == len(self.steps)
Exemple #14
0
    def map_workload(self, X_client, y_client):
        """Score every known workload against the client data; return the best.

        The client matrices are filtered to the featured knobs/metrics,
        de-duplicated with ``get_unique_matrix`` and pushed through the
        fitted encoder, scalers and binner. ``worker_score_workload`` is
        then run for every entry of ``self.workload_states_`` (through
        ``self.pool_`` when one is set).

        Args:
            X_client: matrix of the client's knob data.
            y_client: matrix of the client's metric data.

        Returns:
            The first element of the score tuple with the smallest score.
            That element is passed to ``os.path.basename`` when printing.
        """
        #         tuner = TunerContext()

        with stopwatch("workload mapping - preprocessing"):
            # NOTE: the commented-out block below is disabled code.
            #             # Recompute the GPR models if the # of knobs to tune has
            #             # changed (incremental knob selection feature is enabled)
            #             tuner_feat_knobs = tuner.featured_knobs
            #             if not np.array_equal(tuner_feat_knobs, self.featured_knobs_):
            #                 print ("# knobs: {} --> {}. Re-creating models"
            #                        .format(tuner_feat_knobs.size,
            #                                self.featured_knobs_.size))
            #                 assert tuner_feat_knobs.size != self.featured_knobs_.size
            #                 assert tuner.incremental_knob_selection == True
            #                 self.featured_knobs_ = tuner_feat_knobs
            #                 self.initialize_models()
            #                 gc.collect()

            # Filter by featured knobs & metrics
            X_client = X_client.filter(self.featured_knobs_, "columns")
            y_client = y_client.filter(self.featured_metrics_, "columns")

            # Generate unique X,y matrices
            X_client, y_client = get_unique_matrix(X_client, y_client)

            # Preprocessing steps
            if self.dummy_encoder_ is not None:
                X_client = Matrix(self.dummy_encoder_.transform(X_client.data),
                                  X_client.rowlabels,
                                  self.dummy_encoder_.columnlabels)
            X_client.data = self.X_scaler_.transform(X_client.data)

            # Create y_client scaler with prior and transform client data
            # The deep copy leaves self.y_scaler_ untouched by partial_fit.
            y_client_scaler = copy.deepcopy(self.y_scaler_)
            # NOTE(review): resetting the sample count to 1 presumably makes
            # the stored statistics weigh like one sample -- confirm.
            y_client_scaler.n_samples_seen_ = 1
            y_client_scaler.partial_fit(y_client.data)
            y_client.data = y_client_scaler.transform(y_client.data)

            # Bin and recenter client data
            y_client.data = self.y_binner_.transform(y_client.data)
            y_client.data = self.y_gp_scaler_.transform(y_client.data)

            # Compute workload scores in parallel
            # One argument tuple per workload state; a single tuple is what
            # worker_score_workload receives.
            njobs = len(self.workload_states_)
            iterable = [(i, wd, ws, X_client, y_client, njobs, self.verbose_) \
                    for i,(wd,ws) in enumerate(self.workload_states_.iteritems())]

        with stopwatch("workload mapping - predictions"):
            if self.pool_ is not None:
                wkld_scores = self.pool_.map(worker_score_workload, iterable)
            else:
                # No pool configured: score the workloads one after another.
                wkld_scores = []
                for item in iterable:
                    wkld_scores.append(worker_score_workload(item))

        # Ascending by score, so the lowest score comes first.
        sorted_wkld_scores = sorted(wkld_scores, key=operator.itemgetter(1))

        print ""
        print "WORKLOAD SCORES"
        for wkld, score in sorted_wkld_scores:
            print "{0}: {1:.2f}".format(os.path.basename(wkld), score)

        return sorted_wkld_scores[0][0]
Exemple #15
0
    def initialize_models(self):
        """Load all workload data, fit the preprocessors and build the models.

        For every directory in ``self.workload_dirs_`` the encoded X/y
        matrices are loaded, filtered to the featured knobs/metrics and
        reduced to at most ``self.MAX_SAMPLES`` rows. The method sets
        ``self.dummy_encoder_``, ``self.X_scaler_``, ``self.y_scaler_``,
        ``self.y_binner_`` and ``self.y_gp_scaler_``, then stores the result
        of ``worker_create_model`` for every workload in
        ``self.workload_states_``.
        """
        if self.verbose_:
            print("Initializing models for # knobs={}\n".format(
                self.featured_knobs_.size))
        with stopwatch("workload mapping model creation"):
            n_values, cat_indices, params = prep.dummy_encoder_helper(
                self.dbms_name, self.featured_knobs_)
            # The encoder is only created when the helper reports values.
            if n_values.size > 0:
                self.dummy_encoder_ = prep.DummyEncoder(n_values, cat_indices)
            else:
                self.dummy_encoder_ = None
            self.X_scaler_ = StandardScaler()
            self.y_scaler_ = StandardScaler()
            # Maps workload directory -> (X, y) matrices of that workload.
            data_map = {}
            for i, wd in enumerate(self.workload_dirs_):
                # Load and filter data
                Xpath = os.path.join(wd, "X_data_enc.npz")
                ypath = os.path.join(wd, "y_data_enc.npz")
                X = Matrix.load_matrix(Xpath)
                y = Matrix.load_matrix(ypath)
                X = X.filter(self.featured_knobs_, "columns")
                y = y.filter(self.featured_metrics_, "columns")
                assert np.array_equal(X.columnlabels, self.featured_knobs_)
                assert np.array_equal(y.columnlabels, self.featured_metrics_)
                assert np.array_equal(X.rowlabels, y.rowlabels)
                num_samples = X.shape[0]
                if num_samples > self.MAX_SAMPLES:
                    print "Shrinking {} samples to {}".format(
                        num_samples, self.MAX_SAMPLES)
                    # The same indices are applied to X and y, so the rows
                    # stay paired.
                    rand_indices = prep.get_shuffle_indices(
                        num_samples)[:self.MAX_SAMPLES]
                    X = Matrix(X.data[rand_indices], X.rowlabels[rand_indices],
                               X.columnlabels)
                    y = Matrix(y.data[rand_indices], y.rowlabels[rand_indices],
                               y.columnlabels)
                num_samples = X.shape[0]
                assert num_samples <= self.MAX_SAMPLES
                assert num_samples == y.shape[0]

                # Dummy-code categorical knobs
                if self.dummy_encoder_ is not None:
                    if i == 0:
                        # Just need to fit this once
                        self.dummy_encoder_.fit(X.data,
                                                columnlabels=X.columnlabels)
                    X = Matrix(self.dummy_encoder_.transform(X.data),
                               X.rowlabels, self.dummy_encoder_.columnlabels)

                # The scalers accumulate statistics over all workloads.
                self.X_scaler_.partial_fit(X.data)
                self.y_scaler_.partial_fit(y.data)
                data_map[wd] = (X, y)

            if self.dummy_encoder_ is not None:
                # Fix X_scaler wrt categorical features
                prep.fix_scaler(self.X_scaler_, self.dummy_encoder_, params)

            # Scale X/y
            # The matrices stored in data_map are modified in place.
            all_ys = []
            for wd, (X, y) in data_map.iteritems():
                X.data = self.X_scaler_.transform(X.data)
                y.data = self.y_scaler_.transform(y.data)
                all_ys.append(y.data)

            # Concat all ys and compute deciles
            all_ys = np.vstack(all_ys)
            self.y_binner_ = prep.Bin(0, axis=0)
            self.y_binner_.fit(all_ys)
            del all_ys

            # Bin y by deciles and fit scaler
            self.y_gp_scaler_ = StandardScaler()
            for wd, (X, y) in data_map.iteritems():
                y.data = self.y_binner_.transform(y.data)
                self.y_gp_scaler_.partial_fit(y.data)

            # Recenter y-values
            for wd, (X, y) in data_map.iteritems():
                y.data = self.y_gp_scaler_.transform(y.data)

            # One argument tuple per workload; `ws` is the (X, y) pair stored
            # in data_map for that directory.
            njobs = len(data_map)
            iterable = [(i,wd,ws,njobs,self.verbose_) for i,(wd,ws) \
                        in enumerate(data_map.iteritems())]
            if self.pool_ is not None:
                res = self.pool_.map(worker_create_model, iterable)
            else:
                # No pool configured: build the models one after another.
                res = []
                for item in iterable:
                    res.append(worker_create_model(item))
            # dict(res) requires each worker result to be a (key, value) pair.
            self.workload_states_ = dict(res)
Exemple #16
0
def run_lasso(dbms,
              basepaths,
              savedir,
              featured_metrics,
              knobs_to_ignore,
              include_polynomial_features=True):
    """Rank knobs with a lasso path and save the ranking to ``savedir``.

    Loads ``X_data_enc.npz`` / ``y_data_enc.npz`` from every directory in
    ``basepaths``, removes constant and ignored columns, dummy-encodes
    categorical features, standardizes, optionally adds polynomial
    features, shuffles the rows and computes the coefficient path with
    ``get_coef_range``. Two files are written to ``savedir``:
    ``lasso_path.npz`` and ``featured_knobs.txt``.

    Args:
        dbms: identifier handed to ``dummy_encoder_helper``.
        basepaths: non-empty sequence of data directories.
        savedir: directory that receives the output files.
        featured_metrics: metric labels used to filter the y matrices.
        knobs_to_ignore: knob labels excluded from X.
        include_polynomial_features: when true, X is expanded with
            ``PolynomialFeatures`` before the lasso path is computed.

    Returns:
        None. Results are printed and written to ``savedir``.
    """
    import gc

    # Load matrices
    assert len(basepaths) > 0
    Xs = []
    ys = []

    with stopwatch("matrix concatenation"):
        for basepath in basepaths:
            X_path = os.path.join(basepath, "X_data_enc.npz")
            y_path = os.path.join(basepath, "y_data_enc.npz")

            Xs.append(Matrix.load_matrix(X_path))
            ys.append(
                Matrix.load_matrix(y_path).filter(featured_metrics, "columns"))

        # Combine matrix data if more than 1 matrix
        if len(Xs) > 1:
            X = Matrix.vstack(Xs, require_equal_columnlabels=True)
            y = Matrix.vstack(ys, require_equal_columnlabels=True)
        else:
            X = Xs[0]
            y = ys[0]
        # Drop the input lists so the individual matrices can be freed.
        del Xs
        del ys
        gc.collect()

    with stopwatch("preprocessing"):
        # Filter out columns with near zero standard
        # deviation (i.e., constant columns)
        # y is only filtered when it has more than one column.
        if y.shape[1] > 1:
            column_mask = ~stdev_zero(y.data, axis=0)
            filtered_columns = y.columnlabels[column_mask]
            y = y.filter(filtered_columns, 'columns')
        column_mask = ~stdev_zero(X.data, axis=0)
        # Remember the constant knobs; they are appended to the final
        # ordering at the end.
        removed_columns = X.columnlabels[~column_mask]
        print "removed columns = {}".format(removed_columns)
        filtered_columns = set(X.columnlabels[column_mask])
        filtered_columns -= set(knobs_to_ignore)
        # Sorting gives the remaining columns a deterministic order.
        filtered_columns = np.array(sorted(filtered_columns))
        X = X.filter(filtered_columns, 'columns')
        print "\ncolumnlabels:", X.columnlabels

        # Dummy-code categorical features
        n_values, cat_feat_indices, _ = dummy_encoder_helper(
            dbms, X.columnlabels)
        if len(cat_feat_indices) > 0:
            encoder = DummyEncoder(n_values, cat_feat_indices)
            encoder.fit(X.data, columnlabels=X.columnlabels)
            X = Matrix(encoder.transform(X.data), X.rowlabels,
                       encoder.columnlabels)

        # Scale the data
        X_standardizer = StandardScaler()
        X.data = X_standardizer.fit_transform(X.data)
        y_standardizer = StandardScaler()
        y.data = y_standardizer.fit_transform(y.data)
        if include_polynomial_features:
            X_poly = PolynomialFeatures()
            X_data = X_poly.fit_transform(X.data)
            # The column labels go through the same transformer as a single
            # row of strings to obtain labels for the expanded features.
            # NOTE(review): this needs a PolynomialFeatures implementation
            # that accepts string input -- confirm which one is imported.
            X_columnlabels = np.expand_dims(np.array(X.columnlabels,
                                                     dtype=str),
                                            axis=0)
            X_columnlabels = X_poly.fit_transform(X_columnlabels).squeeze()
            X = Matrix(X_data, X.rowlabels, X_columnlabels)

        # Shuffle the data rows (experiments x metrics)
        # The shuffler is fitted on X and reused for y so rows stay paired.
        shuffler = Shuffler(shuffle_rows=True, shuffle_columns=False)
        X = shuffler.fit_transform(X, copy=False)
        y = shuffler.transform(y, copy=False)
        assert np.array_equal(X.rowlabels, y.rowlabels)
        gc.collect()

    print "\nfeatured_metrics:", featured_metrics

    with stopwatch("lasso paths"):
        # Fit the model to calculate the components
        alphas, coefs, _ = get_coef_range(X.data, y.data)
    # Save model
    np.savez(os.path.join(savedir, "lasso_path.npz"),
             alphas=alphas,
             coefs=coefs,
             feats=X.columnlabels,
             metrics=y.columnlabels)

    with stopwatch("lasso processing"):
        nfeats = X.columnlabels.shape[0]
        lasso = Lasso(alphas, X.columnlabels, coefs)
        print lasso.get_top_summary(nfeats, "")
        # This first value of top_knobs is only used for the printed length;
        # it is overwritten a few lines below.
        top_knobs = get_features_list(lasso.get_top_features(n=nfeats))
        print "\nfeat list length: {}".format(len(top_knobs))
        print "nfeats = {}".format(nfeats)
        top_knobs = lasso.get_top_features(nfeats)
        print top_knobs
        final_ordering = []
        for knob in top_knobs:
            # Labels containing '#' are reduced to the part before it and
            # added only once; other labels are appended as they come.
            if '#' in knob:
                knob = knob.split('#')[0]
                if knob not in final_ordering:
                    final_ordering.append(knob)
            else:
                final_ordering.append(knob)
        # The constant (removed) knobs come last.
        final_ordering = np.append(final_ordering, removed_columns)
    with open(os.path.join(savedir, "featured_knobs.txt"), "w") as f:
        f.write("\n".join(final_ordering))