# Example no. 1
# 0
    def initialize_models(self):
        """Preprocess every workload's data and build one model per workload.

        For each directory in ``self.workload_dirs_`` this loads
        ``X_data_enc.npz`` and ``y_data_enc.npz``, keeps only the columns in
        ``self.featured_knobs_`` / ``self.featured_metrics_``, caps the number
        of rows at ``self.MAX_SAMPLES`` (rows picked via
        ``prep.get_shuffle_indices``) and dummy-codes the knobs when
        ``prep.dummy_encoder_helper`` reports categorical ones. The data is
        then standardized, binned, re-standardized and handed to
        ``worker_create_model`` (through ``self.pool_`` when one is set,
        serially otherwise).

        Sets ``dummy_encoder_``, ``X_scaler_``, ``y_scaler_``, ``y_binner_``,
        ``y_gp_scaler_`` and ``workload_states_`` on ``self``.
        """
        if self.verbose_:
            print("Initializing models for # knobs={}\n".format(
                self.featured_knobs_.size))
        with stopwatch("workload mapping model creation"):
            n_values, cat_indices, params = prep.dummy_encoder_helper(
                self.dbms_name, self.featured_knobs_)
            if n_values.size > 0:
                self.dummy_encoder_ = prep.DummyEncoder(n_values, cat_indices)
            else:
                self.dummy_encoder_ = None
            self.X_scaler_ = StandardScaler()
            self.y_scaler_ = StandardScaler()
            data_map = {}
            for i, wd in enumerate(self.workload_dirs_):
                # Load and filter data
                X = Matrix.load_matrix(os.path.join(wd, "X_data_enc.npz"))
                y = Matrix.load_matrix(os.path.join(wd, "y_data_enc.npz"))
                X = X.filter(self.featured_knobs_, "columns")
                y = y.filter(self.featured_metrics_, "columns")
                assert np.array_equal(X.columnlabels, self.featured_knobs_)
                assert np.array_equal(y.columnlabels, self.featured_metrics_)
                assert np.array_equal(X.rowlabels, y.rowlabels)
                num_samples = X.shape[0]
                if num_samples > self.MAX_SAMPLES:
                    # Single-argument call form prints the same text on
                    # Python 2 and Python 3.
                    print("Shrinking {} samples to {}".format(
                        num_samples, self.MAX_SAMPLES))
                    # The same row subset is applied to X and y so the two
                    # matrices stay aligned.
                    rand_indices = prep.get_shuffle_indices(
                        num_samples)[:self.MAX_SAMPLES]
                    X = Matrix(X.data[rand_indices], X.rowlabels[rand_indices],
                               X.columnlabels)
                    y = Matrix(y.data[rand_indices], y.rowlabels[rand_indices],
                               y.columnlabels)
                num_samples = X.shape[0]
                assert num_samples <= self.MAX_SAMPLES
                assert num_samples == y.shape[0]

                # Dummy-code categorical knobs
                if self.dummy_encoder_ is not None:
                    if i == 0:
                        # Just need to fit this once
                        self.dummy_encoder_.fit(X.data,
                                                columnlabels=X.columnlabels)
                    X = Matrix(self.dummy_encoder_.transform(X.data),
                               X.rowlabels, self.dummy_encoder_.columnlabels)

                # Scalers are fitted incrementally so their statistics cover
                # the data of all workloads.
                self.X_scaler_.partial_fit(X.data)
                self.y_scaler_.partial_fit(y.data)
                data_map[wd] = (X, y)

            if self.dummy_encoder_ is not None:
                # Fix X_scaler wrt categorical features
                prep.fix_scaler(self.X_scaler_, self.dummy_encoder_, params)

            # Scale X/y
            all_ys = []
            for X, y in data_map.values():
                X.data = self.X_scaler_.transform(X.data)
                y.data = self.y_scaler_.transform(y.data)
                all_ys.append(y.data)

            # Concat all ys and fit the binner on them (deciles, per the
            # original comment -- the binning itself lives in prep.Bin)
            all_ys = np.vstack(all_ys)
            self.y_binner_ = prep.Bin(0, axis=0)
            self.y_binner_.fit(all_ys)
            del all_ys

            # Bin y and fit a second scaler on the binned values
            self.y_gp_scaler_ = StandardScaler()
            for _, y in data_map.values():
                y.data = self.y_binner_.transform(y.data)
                self.y_gp_scaler_.partial_fit(y.data)

            # Recenter y-values (only possible once the scaler above has seen
            # every workload, hence the separate pass)
            for _, y in data_map.values():
                y.data = self.y_gp_scaler_.transform(y.data)

            njobs = len(data_map)
            jobs = [(i, wd, ws, njobs, self.verbose_)
                    for i, (wd, ws) in enumerate(data_map.items())]
            if self.pool_ is not None:
                res = self.pool_.map(worker_create_model, jobs)
            else:
                res = [worker_create_model(job) for job in jobs]
            self.workload_states_ = dict(res)
def run_factor_analysis(paths, savedir, cluster_range, algorithms):
    """Cluster metrics by their factor-analysis loadings and save the results.

    Args:
        paths: non-empty sequence of directories, each containing a
            ``y_data_enc.npz`` matrix; the matrices are stacked row-wise.
        savedir: directory that receives the plots produced by the
            ``plot_results`` calls plus the ``featured_metrics_<k>.txt`` and
            ``membership_<k>.txt`` files.
        cluster_range: indexable pair forwarded to ``KMeans_`` and
            ``KSelection.new``; its second element is capped at
            ``(# metrics) + 1``.
        algorithms: iterable of algorithm identifiers passed to
            ``KSelection.new``.

    Returns:
        None. Results are written to ``savedir`` and progress is printed.
    """
    import gc

    # Load matrices
    assert len(paths) > 0

    with stopwatch("matrix concatenation"):
        matrices = [Matrix.load_matrix(os.path.join(path, "y_data_enc.npz"))
                    for path in paths]
        # Combine matrix data if more than 1 matrix
        if len(matrices) > 1:
            matrix = Matrix.vstack(matrices, require_equal_columnlabels=True)
        else:
            matrix = matrices[0]
        del matrices
        gc.collect()

    with stopwatch("preprocessing"):
        # Filter out columns with near zero standard deviation
        # i.e., constant columns
        column_mask = ~stdev_zero(matrix.data, axis=0)
        filtered_columns = matrix.columnlabels[column_mask]
        matrix = matrix.filter(filtered_columns, 'columns')
        print("matrix shape after filter constant:  {}".format(
            matrix.data.shape))

        # Scale the data
        standardizer = StandardScaler()
        matrix.data = standardizer.fit_transform(matrix.data)

        # Shuffle the data rows (experiments x metrics)
        exp_shuffle_indices = get_shuffle_indices(matrix.data.shape[0])
        matrix.data = matrix.data[exp_shuffle_indices]

        # Shrink the cluster range if # metrics < max # clusters
        max_clusters = matrix.data.shape[1] + 1
        if max_clusters < cluster_range[1]:
            cluster_range = (cluster_range[0], max_clusters)

    with stopwatch("factor analysis"):
        # Fit the model to calculate the components
        fa = FactorAnalysis()
        fa.fit(matrix.data)

    # Drop all-zero components once and use the SAME filtered array both for
    # the variance bookkeeping and for the final slice, so that factor_cutoff
    # always indexes the rows it was computed from.
    nonzero_components = fa.components_[np.any(fa.components_ != 0.0, axis=1)]
    variances = np.sum(np.abs(nonzero_components), axis=1)
    total_variance = np.sum(variances).squeeze()
    print("total variance: {}".format(total_variance))
    cumulative = np.cumsum(variances)
    var_exp = cumulative / total_variance * 100
    factor_cutoff = np.count_nonzero(var_exp < REQUIRED_VARIANCE_EXPLAINED) + 1
    factor_cutoff = min(factor_cutoff, 10)
    print("factor cutoff: {}".format(factor_cutoff))
    for i, (var, cum) in enumerate(zip(variances, cumulative)):
        # str()-join reproduces the spacing of a multi-argument print.
        print(" ".join(str(v) for v in (i, var, cum, cum / total_variance)))

    components = np.transpose(nonzero_components[:factor_cutoff]).copy()
    print("components shape: {}".format(components.shape))
    standardizer = StandardScaler()
    components = standardizer.fit_transform(components)

    # Shuffle factor analysis matrix rows (metrics x factors)
    metric_shuffle_indices = get_shuffle_indices(components.shape[0])
    components = components[metric_shuffle_indices]
    component_columnlabels = matrix.columnlabels[metric_shuffle_indices].copy()

    kmeans = KMeans_(components, cluster_range)
    kmeans.plot_results(savedir, components, component_columnlabels)

    # Compute optimal number of clusters K
    for algorithm in algorithms:
        with stopwatch("compute {} (factors={})".format(
                algorithm, factor_cutoff)):
            kselection = KSelection.new(components, cluster_range,
                                        kmeans.cluster_map_, algorithm)
        print("{} optimal # of clusters: {}".format(
            algorithm, kselection.optimal_num_clusters_))
        kselection.plot_results(savedir)

    metric_clusters = {}
    featured_metrics = {}
    for n_clusters, (cluster_centers, labels,
                     _) in kmeans.cluster_map_.items():

        # For each cluster, calculate the distances of each metric from the
        # cluster center. We use the metric closest to the cluster center.
        mclusters = []
        mfeat_list = []
        for i in range(n_clusters):
            in_cluster = labels == i
            metric_labels = component_columnlabels[in_cluster]
            component_rows = components[in_cluster]
            centroid = np.expand_dims(cluster_centers[i], axis=0)
            if component_rows.shape[0] > 0:
                # One call yields an (n_rows, 1) distance matrix.
                dists = cdist(component_rows, centroid, 'euclidean')[:, 0]
            else:
                dists = np.empty(0)
            order_by = np.argsort(dists)
            metric_labels = metric_labels[order_by]
            dists = dists[order_by]
            mclusters.append((i, metric_labels, dists))
            assert len(OPT_METRICS) > 0
            # Boolean from the start so it is always a valid index mask.
            label_mask = np.zeros(metric_labels.shape[0], dtype=bool)
            for opt_metric in OPT_METRICS:
                label_mask = np.logical_or(label_mask,
                                           metric_labels == opt_metric)
            if np.count_nonzero(label_mask) > 0:
                # Optimization metrics present in a cluster represent it.
                mfeat_list.extend(metric_labels[label_mask].tolist())
            elif len(metric_labels) > 0:
                # Otherwise fall back to the metric closest to the centroid.
                mfeat_list.append(metric_labels[0])
        metric_clusters[n_clusters] = mclusters
        featured_metrics[n_clusters] = mfeat_list

    for n_clusters, mlist in sorted(featured_metrics.items()):
        savepath = os.path.join(savedir,
                                "featured_metrics_{}.txt".format(n_clusters))
        with open(savepath, "w") as f:
            f.write("\n".join(sorted(mlist)))

    for n_clusters, memberships in sorted(metric_clusters.items()):
        parts = []
        for i, (cnum, lab, dist) in enumerate(memberships):
            assert i == cnum
            parts.append("---------------------------------------------\n")
            parts.append("CLUSTERS {}\n".format(i))
            parts.append("---------------------------------------------\n\n")
            parts.extend("{}\t({})\n".format(label, d)
                         for label, d in zip(lab, dist))
            parts.append("\n\n")

        savepath = os.path.join(savedir,
                                "membership_{}.txt".format(n_clusters))
        with open(savepath, "w") as f:
            f.write("".join(parts))
# Example no. 3
# 0
def run_lasso(dbms,
              basepaths,
              savedir,
              featured_metrics,
              knobs_to_ignore,
              include_polynomial_features=True):
    """Rank knobs along the lasso path and save the resulting ordering.

    Args:
        dbms: identifier forwarded to ``dummy_encoder_helper``.
        basepaths: non-empty sequence of directories, each containing
            ``X_data_enc.npz`` and ``y_data_enc.npz``; matrices from several
            directories are stacked row-wise.
        savedir: directory that receives ``lasso_path.npz`` and
            ``featured_knobs.txt``.
        featured_metrics: column labels used to filter every y matrix.
        knobs_to_ignore: knob labels dropped from X before fitting.
        include_polynomial_features: when true, X (and its column labels) are
            expanded with ``PolynomialFeatures`` after scaling.

    Returns:
        None. Results are written to ``savedir`` and progress is printed.
    """
    import gc

    # Load matrices
    assert len(basepaths) > 0
    Xs = []
    ys = []

    with stopwatch("matrix concatenation"):
        for basepath in basepaths:
            X_path = os.path.join(basepath, "X_data_enc.npz")
            y_path = os.path.join(basepath, "y_data_enc.npz")

            Xs.append(Matrix.load_matrix(X_path))
            ys.append(
                Matrix.load_matrix(y_path).filter(featured_metrics, "columns"))

        # Combine matrix data if more than 1 matrix
        if len(Xs) > 1:
            X = Matrix.vstack(Xs, require_equal_columnlabels=True)
            y = Matrix.vstack(ys, require_equal_columnlabels=True)
        else:
            X = Xs[0]
            y = ys[0]
        del Xs
        del ys
        gc.collect()

    with stopwatch("preprocessing"):
        # Filter out columns with near zero standard
        # deviation (i.e., constant columns)
        if y.shape[1] > 1:
            column_mask = ~stdev_zero(y.data, axis=0)
            filtered_columns = y.columnlabels[column_mask]
            y = y.filter(filtered_columns, 'columns')
        column_mask = ~stdev_zero(X.data, axis=0)
        # Constant knobs are remembered so they can be appended to the end
        # of the final ordering.
        removed_columns = X.columnlabels[~column_mask]
        print("removed columns = {}".format(removed_columns))
        filtered_columns = set(X.columnlabels[column_mask])
        filtered_columns -= set(knobs_to_ignore)
        filtered_columns = np.array(sorted(filtered_columns))
        X = X.filter(filtered_columns, 'columns')
        print("\ncolumnlabels: {}".format(X.columnlabels))

        # Dummy-code categorical features
        n_values, cat_feat_indices, _ = dummy_encoder_helper(
            dbms, X.columnlabels)
        if len(cat_feat_indices) > 0:
            encoder = DummyEncoder(n_values, cat_feat_indices)
            encoder.fit(X.data, columnlabels=X.columnlabels)
            X = Matrix(encoder.transform(X.data), X.rowlabels,
                       encoder.columnlabels)

        # Scale the data
        X_standardizer = StandardScaler()
        X.data = X_standardizer.fit_transform(X.data)
        y_standardizer = StandardScaler()
        y.data = y_standardizer.fit_transform(y.data)
        if include_polynomial_features:
            X_poly = PolynomialFeatures()
            X_data = X_poly.fit_transform(X.data)
            # The labels go through the same transform as a 1 x n_features
            # row so they line up with the expanded data columns.
            X_columnlabels = np.expand_dims(np.array(X.columnlabels,
                                                     dtype=str),
                                            axis=0)
            X_columnlabels = X_poly.fit_transform(X_columnlabels).squeeze()
            X = Matrix(X_data, X.rowlabels, X_columnlabels)

        # Shuffle the data rows (experiments x metrics)
        shuffler = Shuffler(shuffle_rows=True, shuffle_columns=False)
        X = shuffler.fit_transform(X, copy=False)
        y = shuffler.transform(y, copy=False)
        assert np.array_equal(X.rowlabels, y.rowlabels)
        gc.collect()

    print("\nfeatured_metrics: {}".format(featured_metrics))

    with stopwatch("lasso paths"):
        # Fit the model to calculate the components
        alphas, coefs, _ = get_coef_range(X.data, y.data)
    # Save model
    np.savez(os.path.join(savedir, "lasso_path.npz"),
             alphas=alphas,
             coefs=coefs,
             feats=X.columnlabels,
             metrics=y.columnlabels)

    with stopwatch("lasso processing"):
        nfeats = X.columnlabels.shape[0]
        lasso = Lasso(alphas, X.columnlabels, coefs)
        print(lasso.get_top_summary(nfeats, ""))
        # Only the length of this list is reported; the ordering below is
        # built from the raw top-features result.
        feature_list = get_features_list(lasso.get_top_features(n=nfeats))
        print("\nfeat list length: {}".format(len(feature_list)))
        print("nfeats = {}".format(nfeats))
        top_knobs = lasso.get_top_features(nfeats)
        print(top_knobs)
        final_ordering = []
        for knob in top_knobs:
            if '#' in knob:
                # Labels of the form "<knob>#<suffix>" collapse to "<knob>",
                # which is listed only once, at its first occurrence.
                knob = knob.split('#')[0]
                if knob in final_ordering:
                    continue
            final_ordering.append(knob)
        final_ordering = np.append(final_ordering, removed_columns)
    with open(os.path.join(savedir, "featured_knobs.txt"), "w") as f:
        f.write("\n".join(final_ordering))