Ejemplo n.º 1
0
def _get_fc_info(args, emb_type):
    """Fit a contrastive learner on the selected rows versus the remaining rows.

    Parameters
    ----------
    args: dict-like
        Must provide ``args[emb_type]`` and ``args['dataKey']`` (both are
        forwarded to ``_load_data_by_emb_type``) and ``args['selected']``,
        a sequence of per-row selection flags.
    emb_type: hashable
        Key used to look up the first loader argument in ``args``.
    Returns
    -------
    tuple
        ``(fcs, selected)`` where ``fcs`` is the ``fcs`` attribute of the
        fitted ``CL`` object and ``selected`` is the boolean numpy array
        built from ``args['selected']``.
    """
    data = _load_data_by_emb_type(args[emb_type], args['dataKey'])
    is_selected = np.array(args['selected'], dtype=bool)

    # ccPCA with sign adjustment
    contrastive = CL(learner=CCPCA(n_components=1))

    # First matrix: the selected rows; second matrix: every other row.
    chosen_rows = data[is_selected, :]
    other_rows = data[~is_selected, :]
    contrastive.fit(chosen_rows,
                    other_rows,
                    var_thres_ratio=0.5,
                    max_log_alpha=2)

    return (contrastive.fcs, is_selected)
Ejemplo n.º 2
0
    def set_learner(self, learner):
        """Set a contrastive representation learning method.

        Parameters
        ----------
        learner: Class object for contrastive learning.
            Contrastive representation learning class object. Any class
            object that (1) has fit as a class method, (2) can take two
            matrices as the first parameters of fit, and (3) has
            get_feat_contribs as a class method (e.g., ccPCA,
            https://github.com/takanori-fujiwara/ccpca).
            If None, ccPCA is set as a learner.
        Returns
        -------
        self.
        """
        # Fall back to a default-constructed ccPCA when no learner is given.
        self.learner = CCPCA() if learner is None else learner
        # The documented contract is "Returns self"; return it so that calls
        # can be chained (callers that ignore the result are unaffected).
        return self
Ejemplo n.º 3
0
# Scatter the first embedding (X_r), one colour per class label 0, 1, 2.
plt.figure()
colors = ['navy', 'turquoise', 'darkorange']
lw = 2

for color, i, target_name in zip(colors, range(3), range(3)):
    members = y == i  # rows whose label equals i
    plt.scatter(X_r[members, 0],
                X_r[members, 1],
                color=color, alpha=.8, lw=lw, label=target_name)
plt.legend(loc='best', shadow=False, scatterpoints=1)
plt.title('cPCA of IRIS dataset (alpha=2.15)')
plt.show()

# Fit ccPCA with the rows labelled 0 as the first matrix and all remaining
# rows as the second one, then project every sample.
ccpca = CCPCA()
label0_rows, remaining_rows = X[y == 0], X[y != 0]
ccpca.fit(label0_rows,
          remaining_rows,
          var_thres_ratio=0.5,
          max_log_alpha=0.5)
X_r2 = ccpca.transform(X)

# Same scatter layout for the ccPCA embedding; the title reports the alpha
# value the fit selected.
plt.figure()
for color, i, target_name in zip(colors, range(3), range(3)):
    members = y == i
    plt.scatter(X_r2[members, 0],
                X_r2[members, 1],
                color=color, alpha=.8, lw=lw, label=target_name)
plt.legend(loc='best', shadow=False, scatterpoints=1)
plt.title(f'ccPCA of IRIS dataset (alpha ={ccpca.get_best_alpha()!s})')
plt.show()
def _label_to_letter(label):
    """Return the display letter for an integer class label.

    ``-1`` maps to ``"Z"``; any other label ``k`` maps to ``chr(65 + k)``
    (0 -> "A", 1 -> "B", ...).
    """
    if label == -1:
        return "Z"
    return chr(65 + label)


def _write_single_column_csv(path, header, values):
    """Write ``header`` and then one ``str(value)`` per line to ``path``."""
    with open(path, "w") as out:
        out.write(header + "\n")
        out.writelines(str(value) + "\n" for value in values)


def _write_heatmap_csv(path, feature_labels, class_labels, contrib_mat):
    """Write ``contrib_mat`` in long form, one ``feature,label,contribution`` row per cell."""
    with open(path, "w") as out:
        out.write("feature,label,contribution\n")
        for i, feature in enumerate(feature_labels):
            for j, label in enumerate(class_labels):
                out.write(
                    str(feature) + "," + str(label) + "," +
                    str(contrib_mat[i, j]) + "\n")


def getHeatmap(dataset_name):
    """Compute per-label ccPCA feature contributions and dump them as CSV files.

    The dataset is read from ``../data/`` when ``dataset_name`` contains
    ``"_updated"`` and from ``./sample_data/`` otherwise. For each unique
    label a one-component ccPCA is fitted (rows with that label versus all
    other rows) and its scaled feature contributions are collected into a
    (features x labels) matrix. Three files are then written:
    ``<name>_labels.csv``, ``<name>_features.csv`` and
    ``<name>_heatmap.csv``.

    For ``"mnist_updated"`` and ``"fashion_mnist_updated"`` the matrix is
    written as computed; for every other dataset it is first sign-flipped
    (``OptSignFlip``) and reordered (``MatReorder``), and labels/features are
    written in the reordered order.

    Parameters
    ----------
    dataset_name: str
        Base name of the dataset; ``<name>.csv`` and
        ``<name>.featurenames.csv`` must exist in the source directory.
    Returns
    -------
    None.
    """
    #dataset_name = request.args.get("dataset_name")
    print(dataset_name)

    is_updated = "_updated" in dataset_name
    src_prefix = ("../data/" if is_updated else "./sample_data/") + str(
        dataset_name)

    # Open the data file in a context manager so the handle is closed as soon
    # as loadtxt has consumed it.
    with open(src_prefix + ".csv", "rb") as data_file:
        data = np.loadtxt(data_file, delimiter=",", skiprows=1)
    feature_names = np.genfromtxt(src_prefix + ".featurenames.csv",
                                  delimiter=",",
                                  dtype='str',
                                  skip_header=1)

    # The third-from-last column holds the label; the last three columns are
    # never used as features. "_updated" files additionally skip column 0
    # (presumably an id/index column -- TODO confirm).
    X = data[:, 1:-3] if is_updated else data[:, :-3]
    y = np.int_(data[:, -3])
    unique_labels = np.unique(y)

    print(X.shape)
    _, n_feats = X.shape
    n_labels = len(unique_labels)
    first_cpc_mat = np.zeros((n_feats, n_labels))
    feat_contrib_mat = np.zeros((n_feats, n_labels))

    # 1. get the scaled feature contributions and first cPC for each label
    ccpca = CCPCA(n_components=1)
    for i, target_label in enumerate(unique_labels):
        ccpca.fit(X[y == target_label],
                  X[y != target_label],
                  var_thres_ratio=0.2,
                  n_alphas=80,
                  max_log_alpha=0.2)
        first_cpc_mat[:, i] = ccpca.get_first_component()
        feat_contrib_mat[:, i] = ccpca.get_scaled_feat_contribs()

    if dataset_name in ("mnist_updated", "fashion_mnist_updated"):
        # NOTE(review): hard-coded absolute output directory, kept as-is
        # because consumers of these files are outside this block.
        out_prefix = "/home/user/Desktop/heatmap/data/" + dataset_name

        # Column i of the matrix belongs to unique_labels[i], so name columns
        # from the label values themselves (indexing unique_labels by the
        # label value is wrong whenever labels are not exactly 0..n-1).
        xlabel_names = [_label_to_letter(label) for label in unique_labels]
        _write_single_column_csv(out_prefix + "_labels.csv", "label",
                                 xlabel_names)
        print(xlabel_names)

        ylabel_names = feature_names.tolist()
        _write_single_column_csv(out_prefix + "_features.csv", "feature",
                                 ylabel_names)

        _write_heatmap_csv(out_prefix + "_heatmap.csv", ylabel_names,
                           xlabel_names, feat_contrib_mat)
    else:
        out_prefix = "../data/" + dataset_name

        # 2. apply optimal sign flipping
        OptSignFlip().opt_sign_flip(first_cpc_mat, feat_contrib_mat)

        # 3. apply hierarchical clustering with optimal-leaf-ordering
        mr = MatReorder()
        feat_contrib_mat = mr.fit_transform(feat_contrib_mat)

        # 4. apply aggregation
        # The aggregated matrix is not used by the outputs below; the call is
        # kept in case aggregate_rows updates state on `mr` -- TODO confirm
        # and drop if it is side-effect free.
        mr.aggregate_rows(feat_contrib_mat, n_feats, agg_method='abs_max')

        # cluster names, in the reordered column order
        xlabel_names = [
            _label_to_letter(unique_labels[col]) for col in mr.order_col_
        ]
        _write_single_column_csv(out_prefix + "_labels.csv", "label",
                                 xlabel_names)

        # feature names, in the reordered row order
        ylabel_names = np.array(feature_names)[mr.order_row_].tolist()
        _write_single_column_csv(out_prefix + "_features.csv", "feature",
                                 ylabel_names)

        _write_heatmap_csv(out_prefix + "_heatmap.csv", ylabel_names,
                           xlabel_names, feat_contrib_mat)
def wordCloudGen():
    """Write ccPCA feature contributions for label 0 to ``../data/featContrib.csv``.

    Reads the dataset named by the ``datasetName`` request argument (from
    ``../data/`` when the name contains ``"_updated"``, otherwise from
    ``./sample_data/``), fits a two-component ccPCA contrasting the rows
    labelled 0 against all other rows, and stores the feature names stacked
    with the feature contributions as a CSV file.

    Returns
    -------
    Response object created by ``make_response`` with the
    ``Access-Control-Allow-Origin: *`` header set. A missing ``datasetName``
    or one containing a path separator yields a 400 response instead of an
    unhandled exception.
    """
    import pandas as pd
    import numpy as np
    from ccpca import CCPCA

    dataset_name = request.args.get("datasetName")
    print(dataset_name)

    # The name comes straight from the request and is spliced into file
    # paths below, so refuse missing values and anything that could leave
    # the data directories.
    # NOTE(review): assumes Flask's make_response(body, status) -- confirm.
    if (not dataset_name or "/" in dataset_name or "\\" in dataset_name):
        resp = make_response("invalid datasetName", 400)
        resp.headers['Access-Control-Allow-Origin'] = '*'
        return resp

    is_updated = "_updated" in dataset_name
    src_prefix = ("../data/" if is_updated else "./sample_data/") + dataset_name

    # Context manager so the data file handle is closed once it is parsed.
    with open(src_prefix + ".csv", "rb") as data_file:
        data = np.loadtxt(data_file, delimiter=",", skiprows=1)
    feature_names = np.genfromtxt(src_prefix + ".featurenames.csv",
                                  delimiter=",",
                                  dtype='str',
                                  skip_header=1)
    print(feature_names)

    # The third-from-last column holds the label; the last three columns are
    # never used as features. "_updated" files additionally skip column 0
    # (presumably an id/index column -- TODO confirm).
    X = data[:, 1:-3] if is_updated else data[:, :-3]
    y = np.int_(data[:, -3])

    target_label = 0

    ccpca = CCPCA(n_components=2)
    ccpca.fit(X[y == target_label],
              X[y != target_label],
              var_thres_ratio=0.5,
              n_alphas=40,
              max_log_alpha=0.5)
    cpca_fcs = ccpca.get_feat_contribs()

    # A former second stage refitted ccPCA once per label and ran sign
    # flipping and matrix reordering, but none of its results reached the
    # output; it has been removed.

    print(feature_names)
    combined = np.vstack((feature_names, cpca_fcs)).T
    print(combined)
    pd.DataFrame(combined).to_csv("../data/featContrib.csv")
    resp = make_response()
    resp.headers['Access-Control-Allow-Origin'] = '*'
    return resp
Ejemplo n.º 6
0
Archivo: sample.py Proyecto: wzpy/ccpca
# Scatter the first embedding (X_r), one colour per class label 0, 1, 2.
plt.figure()
colors = ['navy', 'turquoise', 'darkorange']
lw = 2

for color, i, target_name in zip(colors, range(3), range(3)):
    members = y == i  # rows whose label equals i
    plt.scatter(X_r[members, 0],
                X_r[members, 1],
                color=color, alpha=.8, lw=lw, label=target_name)
plt.legend(loc='best', shadow=False, scatterpoints=1)
plt.title(f'cPCA of IRIS dataset (alpha={cpca.get_best_alpha()})')
plt.show()

ccpca = CCPCA()

# Alternative: apply fit and transform separately
# ccpca.fit(X[y == 0], X[y != 0], var_thres_ratio=0.5, max_log_alpha=0.5)
# X_r2 = ccpca.transform(X)

# Used here: fit and transform in a single call, with the rows labelled 0 as
# the first matrix and all remaining rows as the second one.
label0_rows, remaining_rows = X[y == 0], X[y != 0]
X_r2 = ccpca.fit_transform(label0_rows,
                           remaining_rows,
                           var_thres_ratio=0.5,
                           max_log_alpha=0.5)

plt.figure()
for color, i, target_name in zip(colors, [0, 1, 2], [0, 1, 2]):
    plt.scatter(X_r2[y == i, 0],
                X_r2[y == i, 1],