def run_factor_analysis(paths, savedir, cluster_range, algorithms):
    """Cluster metrics by their factor loadings and write the reports.

    Steps, in execution order:

    1. Load ``y_data_enc.npz`` from every directory in ``paths`` and combine
       the matrices with ``Matrix.vstack`` when there is more than one.
    2. Drop the columns flagged by ``stdev_zero``, standardize the data and
       shuffle its rows.
    3. Fit ``FactorAnalysis`` and keep the leading factors needed to reach
       ``REQUIRED_VARIANCE_EXPLAINED`` percent of the summed absolute
       loadings, capped at 10 factors.
    4. Run ``KMeans_`` on the standardized, shuffled (metrics x factors)
       loadings, plot it, then run and plot every K-selection algorithm.
    5. For every clustering in ``kmeans.cluster_map_`` write
       ``featured_metrics_<k>.txt`` and ``membership_<k>.txt`` to
       ``savedir``.

    Args:
        paths: Non-empty sequence of directories, each holding a
            ``y_data_enc.npz`` file.
        savedir: Directory that receives the plots and text reports.
        cluster_range: Indexable pair; element 1 is lowered to
            ``(number of metrics + 1)`` when it exceeds that value.
        algorithms: Iterable of algorithm identifiers handed to
            ``KSelection.new``.

    Raises:
        AssertionError: If ``paths`` or ``OPT_METRICS`` is empty.
    """
    import gc

    assert len(paths) > 0
    # Loop-invariant precondition of the featured-metric selection below;
    # checked up front so a bad configuration fails before any fitting.
    assert len(OPT_METRICS) > 0

    with stopwatch("matrix concatenation"):
        matrices = [
            Matrix.load_matrix(os.path.join(path, "y_data_enc.npz"))
            for path in paths
        ]
        # Combine matrix data if more than 1 matrix
        if len(matrices) > 1:
            matrix = Matrix.vstack(matrices, require_equal_columnlabels=True)
        else:
            matrix = matrices[0]
        # Drop the per-path references so they can be collected right away.
        del matrices
        gc.collect()

    with stopwatch("preprocessing"):
        # Filter out columns with near zero standard deviation
        # i.e., constant columns
        column_mask = ~stdev_zero(matrix.data, axis=0)
        filtered_columns = matrix.columnlabels[column_mask]
        matrix = matrix.filter(filtered_columns, 'columns')
        # Two spaces after the colon reproduce the Python 2
        # ``print "...: ", shape`` output exactly.
        print("matrix shape after filter constant:  " +
              str(matrix.data.shape))

        # Scale the data
        standardizer = StandardScaler()
        matrix.data = standardizer.fit_transform(matrix.data)

        # Shuffle the data rows (experiments x metrics)
        exp_shuffle_indices = get_shuffle_indices(matrix.data.shape[0])
        matrix.data = matrix.data[exp_shuffle_indices]

        # Shrink the cluster range if # metrics < max # clusters
        max_clusters = matrix.data.shape[1] + 1
        if max_clusters < cluster_range[1]:
            cluster_range = (cluster_range[0], max_clusters)

    with stopwatch("factor analysis"):
        # Fit the model to calculate the components
        fa = FactorAnalysis()
        fa.fit(matrix.data)

    # Per-factor weight = sum of absolute loadings, over the factors that
    # have at least one non-zero loading.
    fa_mask = np.sum(fa.components_ != 0.0, axis=1) > 0.0
    variances = np.sum(np.abs(fa.components_[fa_mask]), axis=1)
    total_variance = np.sum(variances).squeeze()
    print("total variance: {}".format(total_variance))

    # One running sum replaces the repeated np.sum(variances[:i + 1]).
    cumulative = np.cumsum(variances)
    var_exp = cumulative / total_variance * 100
    max_factors = 10
    factor_cutoff = np.count_nonzero(var_exp < REQUIRED_VARIANCE_EXPLAINED) + 1
    factor_cutoff = min(factor_cutoff, max_factors)
    print("factor cutoff: {}".format(factor_cutoff))
    for i, (var, running) in enumerate(zip(variances, cumulative)):
        fields = (i, var, running, running / total_variance)
        print(" ".join(str(field) for field in fields))

    # NOTE(review): the weights above are computed on the fa_mask-filtered
    # rows, while this slice indexes the unfiltered fa.components_. The two
    # line up only if all-zero rows never precede the cutoff -- confirm.
    components = np.transpose(fa.components_[:factor_cutoff]).copy()
    print("components shape: {}".format(components.shape))
    standardizer = StandardScaler()
    components = standardizer.fit_transform(components)

    # Shuffle factor analysis matrix rows (metrics x factors); the labels
    # are permuted with the same indices so they stay aligned.
    metric_shuffle_indices = get_shuffle_indices(components.shape[0])
    components = components[metric_shuffle_indices]
    component_columnlabels = matrix.columnlabels[metric_shuffle_indices].copy()

    kmeans = KMeans_(components, cluster_range)
    kmeans.plot_results(savedir, components, component_columnlabels)

    # Compute optimal number of clusters K
    for algorithm in algorithms:
        with stopwatch("compute {} (factors={})".format(
                algorithm, factor_cutoff)):
            kselection = KSelection.new(components, cluster_range,
                                        kmeans.cluster_map_, algorithm)
        print("{} optimal # of clusters: {}".format(
            algorithm, kselection.optimal_num_clusters_))
        kselection.plot_results(savedir)

    metric_clusters = {}
    featured_metrics = {}
    cluster_items = kmeans.cluster_map_.items()
    for n_clusters, (cluster_centers, labels, _) in cluster_items:

        # For each cluster, calculate the distances of each metric from the
        # cluster center. We use the metric closest to the cluster center.
        mclusters = []
        mfeat_list = []
        for i in range(n_clusters):
            in_cluster = labels == i
            metric_labels = component_columnlabels[in_cluster]
            component_rows = components[in_cluster]
            centroid = np.expand_dims(cluster_centers[i], axis=0)
            if component_rows.shape[0] > 0:
                # One call for the whole cluster: (m, 1) -> (m,)
                dists = cdist(component_rows, centroid, 'euclidean').ravel()
            else:
                dists = np.empty(0)

            # Closest metric first.
            order_by = np.argsort(dists)
            metric_labels = metric_labels[order_by]
            dists = dists[order_by]
            mclusters.append((i, metric_labels, dists))

            # Metrics listed in OPT_METRICS take precedence over the
            # metric closest to the centroid.
            label_mask = np.zeros(metric_labels.shape[0], dtype=bool)
            for opt_metric in OPT_METRICS:
                label_mask = np.logical_or(label_mask,
                                           metric_labels == opt_metric)
            if np.count_nonzero(label_mask) > 0:
                mfeat_list.extend(metric_labels[label_mask].tolist())
            elif len(metric_labels) > 0:
                mfeat_list.append(metric_labels[0])
        metric_clusters[n_clusters] = mclusters
        featured_metrics[n_clusters] = mfeat_list

    for n_clusters, mlist in sorted(featured_metrics.items()):
        savepath = os.path.join(savedir,
                                "featured_metrics_{}.txt".format(n_clusters))
        with open(savepath, "w") as f:
            f.write("\n".join(sorted(mlist)))

    rule = "---------------------------------------------\n"
    for n_clusters, memberships in sorted(metric_clusters.items()):
        parts = []
        for i, (cnum, lab, dist) in enumerate(memberships):
            assert i == cnum
            parts.append(rule)
            parts.append("CLUSTERS {}\n".format(i))
            parts.append(rule + "\n")
            parts.extend("{}\t({})\n".format(l, d) for l, d in zip(lab, dist))
            parts.append("\n\n")

        savepath = os.path.join(savedir,
                                "membership_{}.txt".format(n_clusters))
        with open(savepath, "w") as f:
            f.write("".join(parts))

# Example #2
def run_lasso(dbms,
              basepaths,
              savedir,
              featured_metrics,
              knobs_to_ignore,
              include_polynomial_features=True):
    """Rank knobs along the lasso path and write the ordering to disk.

    Steps, in execution order:

    1. Load ``X_data_enc.npz`` and ``y_data_enc.npz`` from every directory
       in ``basepaths``; the y matrices are restricted to
       ``featured_metrics``. Multiple matrices are combined with
       ``Matrix.vstack``.
    2. Drop the columns flagged by ``stdev_zero`` (for y only when it has
       more than one column) and the X columns named in
       ``knobs_to_ignore``; dummy-encode categorical features; standardize
       X and y; optionally expand X with ``PolynomialFeatures``; shuffle
       the rows of X and y with the same ``Shuffler``.
    3. Compute the lasso path with ``get_coef_range`` and save it to
       ``lasso_path.npz`` in ``savedir``.
    4. Take the features in ``Lasso.get_top_features`` order, collapse
       labels containing ``#`` to the text before it, append the removed
       constant columns, and write one name per line to
       ``featured_knobs.txt`` in ``savedir``.

    Args:
        dbms: Passed to ``dummy_encoder_helper`` together with the X
            column labels.
        basepaths: Non-empty sequence of directories holding the two
            ``.npz`` files.
        savedir: Directory that receives ``lasso_path.npz`` and
            ``featured_knobs.txt``.
        featured_metrics: Column labels used to filter the y matrices.
        knobs_to_ignore: Iterable of X column labels to exclude.
        include_polynomial_features: When true, X and its column labels
            are expanded with ``PolynomialFeatures`` after scaling.

    Raises:
        AssertionError: If ``basepaths`` is empty, or the row labels of X
            and y differ after shuffling.
    """
    import gc

    # Load matrices
    assert len(basepaths) > 0
    Xs = []
    ys = []

    with stopwatch("matrix concatenation"):
        for basepath in basepaths:
            X_path = os.path.join(basepath, "X_data_enc.npz")
            y_path = os.path.join(basepath, "y_data_enc.npz")

            Xs.append(Matrix.load_matrix(X_path))
            ys.append(
                Matrix.load_matrix(y_path).filter(featured_metrics, "columns"))

        # Combine matrix data if more than 1 matrix
        if len(Xs) > 1:
            X = Matrix.vstack(Xs, require_equal_columnlabels=True)
            y = Matrix.vstack(ys, require_equal_columnlabels=True)
        else:
            X = Xs[0]
            y = ys[0]
        # Drop the per-path references so they can be collected right away.
        del Xs
        del ys
        gc.collect()

    with stopwatch("preprocessing"):
        # Filter out columns with near zero standard
        # deviation (i.e., constant columns)
        if y.shape[1] > 1:
            column_mask = ~stdev_zero(y.data, axis=0)
            filtered_columns = y.columnlabels[column_mask]
            y = y.filter(filtered_columns, 'columns')
        column_mask = ~stdev_zero(X.data, axis=0)
        # Remembered so the constant knobs can be appended to the final
        # ordering after the ranked ones.
        removed_columns = X.columnlabels[~column_mask]
        print("removed columns = {}".format(removed_columns))
        filtered_columns = set(X.columnlabels[column_mask])
        filtered_columns -= set(knobs_to_ignore)
        filtered_columns = np.array(sorted(filtered_columns))
        X = X.filter(filtered_columns, 'columns')
        print("\ncolumnlabels: " + str(X.columnlabels))

        # Dummy-code categorical features
        n_values, cat_feat_indices, _ = dummy_encoder_helper(
            dbms, X.columnlabels)
        if len(cat_feat_indices) > 0:
            encoder = DummyEncoder(n_values, cat_feat_indices)
            encoder.fit(X.data, columnlabels=X.columnlabels)
            X = Matrix(encoder.transform(X.data), X.rowlabels,
                       encoder.columnlabels)

        # Scale the data
        X_standardizer = StandardScaler()
        X.data = X_standardizer.fit_transform(X.data)
        y_standardizer = StandardScaler()
        y.data = y_standardizer.fit_transform(y.data)
        if include_polynomial_features:
            X_poly = PolynomialFeatures()
            X_data = X_poly.fit_transform(X.data)
            # The labels go through the same transform, as a single-row
            # string array, to get one label per generated column.
            X_columnlabels = np.expand_dims(np.array(X.columnlabels,
                                                     dtype=str),
                                            axis=0)
            X_columnlabels = X_poly.fit_transform(X_columnlabels).squeeze()
            X = Matrix(X_data, X.rowlabels, X_columnlabels)

        # Shuffle the data rows (experiments x metrics)
        shuffler = Shuffler(shuffle_rows=True, shuffle_columns=False)
        X = shuffler.fit_transform(X, copy=False)
        y = shuffler.transform(y, copy=False)
        assert np.array_equal(X.rowlabels, y.rowlabels)
        gc.collect()

    print("\nfeatured_metrics: " + str(featured_metrics))

    with stopwatch("lasso paths"):
        # Fit the model to calculate the components
        alphas, coefs, _ = get_coef_range(X.data, y.data)
    # Save model
    np.savez(os.path.join(savedir, "lasso_path.npz"),
             alphas=alphas,
             coefs=coefs,
             feats=X.columnlabels,
             metrics=y.columnlabels)

    with stopwatch("lasso processing"):
        nfeats = X.columnlabels.shape[0]
        lasso = Lasso(alphas, X.columnlabels, coefs)
        print(lasso.get_top_summary(nfeats, ""))
        # Only the length of this list is reported; the ordering below is
        # built from the raw get_top_features() result.
        feature_list = get_features_list(lasso.get_top_features(n=nfeats))
        print("\nfeat list length: {}".format(len(feature_list)))
        print("nfeats = {}".format(nfeats))
        top_knobs = lasso.get_top_features(nfeats)
        print(top_knobs)
        final_ordering = []
        for knob in top_knobs:
            if '#' in knob:
                # Presumably "<knob>#<value>" labels produced by the dummy
                # encoding -- verify against DummyEncoder. Only the first
                # occurrence of each base name is kept.
                knob = knob.split('#')[0]
                if knob not in final_ordering:
                    final_ordering.append(knob)
            else:
                final_ordering.append(knob)
        # Constant columns carry no ranking information; list them last.
        final_ordering = np.append(final_ordering, removed_columns)
    with open(os.path.join(savedir, "featured_knobs.txt"), "w") as f:
        f.write("\n".join(final_ordering))