Example #1
    @classmethod
    def setUpClass(cls):
        file_name = "datasets/cstr.mat"
        matlab_dict = loadmat(file_name)
        X = matlab_dict['fea']  # numpy.ndarray
        model = CoclustMod(n_clusters=4)
        model.fit(X)
        cls.model = model
Example #2
    @classmethod
    def setUpClass(cls):
        file_name = "datasets/classic3.mat"
        matlab_dict = loadmat(file_name)
        X = matlab_dict['A']  # scipy.sparse.csc.csc_matrix
        model = CoclustMod(n_clusters=3)
        model.fit(X)
        cls.model = model
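The two fixtures above only fit the model; the assertions they feed are not shown here. A minimal sketch of plausible follow-up tests (the class name, test names, and checks are illustrative assumptions, not taken from the original suite):

import unittest
import numpy as np
from scipy.io import loadmat
from coclust.coclustering import CoclustMod

class CoclustModCstrTest(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        # same fixture as Example #1
        X = loadmat("datasets/cstr.mat")['fea']
        model = CoclustMod(n_clusters=4)
        model.fit(X)
        cls.model = model

    def test_row_labels_in_range(self):
        # every row label should name one of the 4 requested clusters
        self.assertTrue(set(self.model.row_labels_) <= set(range(4)))

    def test_modularity_is_finite(self):
        # the criterion optimised by CoclustMod is exposed as model.modularity
        self.assertTrue(np.isfinite(self.model.modularity))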
Example #5
    def run_coclust(self):
        # co-clustering
        model = CoclustMod(n_clusters=4)
        model.fit(
            self.doc_term_mat
        )  # No errors? Is this right? Gensim types have plug-and-play support?

        top_term_plt = plot_cluster_top_terms(in_data=self.doc_term_mat,
                                              all_terms=self.vocab,
                                              nb_top_terms=5,
                                              model=model,
                                              do_plot=False)

        # print(get_term_graph(X=doc_term_mat,
        #                      model=model,
        #                      terms=vocab,
        #                      n_cluster=2,
        #                      n_top_terms=10,
        #                      n_neighbors=2,
        #                      stopwords=[]))

        clus_sz_plt = plot_cluster_sizes(model=model, do_plot=False)
        mat_plot = plot_reorganized_matrix(X=self.doc_term_mat,
                                           model=model,
                                           do_plot=False)

        return (top_term_plt, clus_sz_plt, mat_plot)
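run_coclust above is a method fragment; a hedged sketch of how the same coclust calls can be driven on a tiny corpus (the corpus, the CountVectorizer handling, and the cluster counts are illustrative assumptions; the visualization helpers and their do_plot flag are the ones the fragment already uses):

from sklearn.feature_extraction.text import CountVectorizer
from coclust.coclustering import CoclustMod
from coclust.visualization import (plot_cluster_top_terms,
                                   plot_cluster_sizes,
                                   plot_reorganized_matrix)

docs = ["graph theory and complex networks", "random graphs and networks",
        "sparse matrix factorization", "matrix algebra and factorization"]
vec = CountVectorizer()
doc_term_mat = vec.fit_transform(docs)     # sparse document-term matrix
vocab = list(vec.get_feature_names_out())  # terms aligned with the columns (scikit-learn >= 1.0)

model = CoclustMod(n_clusters=2)
model.fit(doc_term_mat)

top_term_plt = plot_cluster_top_terms(in_data=doc_term_mat, all_terms=vocab,
                                      nb_top_terms=3, model=model, do_plot=False)
clus_sz_plt = plot_cluster_sizes(model=model, do_plot=False)
mat_plot = plot_reorganized_matrix(X=doc_term_mat, model=model, do_plot=False)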
Example #6
    def cluster(self, data):
        global weighted_edge_list, matrix, model, row_order, column_order, rowMap, colMap, subModels, row_sums_map, column_sums_map
        subModels = {}
        # num_clusters = 9
        weighted_edge_list = data[[
            "RECHTSTRAEGER", "MEDIUM_MEDIENINHABER", "EURO"
        ]]
        weighted_edge_list = weighted_edge_list.groupby(
            by=["RECHTSTRAEGER", "MEDIUM_MEDIENINHABER"]).sum().reset_index()

        # nx.from_pandas_dataframe was removed in NetworkX 2.0; use nx.from_pandas_edgelist there
        G = nx.from_pandas_dataframe(weighted_edge_list,
                                     "RECHTSTRAEGER",
                                     "MEDIUM_MEDIENINHABER",
                                     "EURO",
                                     create_using=nx.DiGraph())
        row_order = np.sort(np.unique(weighted_edge_list["RECHTSTRAEGER"]))
        column_order = np.sort(
            np.unique(weighted_edge_list["MEDIUM_MEDIENINHABER"]))
        matrix_real = biadjacency_matrix(G,
                                         row_order,
                                         column_order=column_order,
                                         weight="EURO")
        matrix = matrix_real.toarray()
        row_sums = matrix.sum(axis=1).round(2)
        row_sums_map = dict(zip(row_order, row_sums))
        row_sums_map = {k: float(v) for k, v in row_sums_map.items()}
        column_sums = matrix.sum(axis=0).round(2)
        column_sums_map = dict(zip(column_order, column_sums))
        column_sums_map = {k: float(v) for k, v in column_sums_map.items()}

        # num_clusters is expected to be defined at module level (see the commented-out default above)
        model = CoclustMod(min(min(matrix.shape), num_clusters),
                           random_state=0)  # n_init=500
        model.fit(matrix)

        # test: send a different list
        rowMap = dict(zip(row_order, list(map(str, model.row_labels_))))
        colMap = dict(zip(column_order, list(map(str, model.column_labels_))))
        ret = []

        wel = weighted_edge_list.copy()
        wel["RECHTSTRAEGER"].update(wel["RECHTSTRAEGER"].map(rowMap))
        wel["MEDIUM_MEDIENINHABER"].update(
            wel["MEDIUM_MEDIENINHABER"].map(colMap))
        # DataFrame.as_matrix() was removed in pandas 1.0; use .to_numpy() there (same applies in subcluster3 below)
        ret = wel.as_matrix().tolist()

        clusters = self.getElementsbyCluster()

        return {"data": ret, "clusters": clusters}
Example #7
    def subcluster3(self, clusterID):
        global subModels

        clusterID_array = [int(x) for x in clusterID.split('.')]
        # print(clusterID_array)
        # print("subModels",subModels)
        subMatrix = model.get_submatrix(matrix, clusterID_array[0])
        sub_row_order = row_order[model.get_indices(clusterID_array[0])[0]]
        sub_column_order = column_order[model.get_indices(
            clusterID_array[0])[1]]

        for i, cID in enumerate(clusterID_array[1:]):
            smID = '.'.join(str(x) for x in clusterID_array[:(i + 1)])
            print("smID", smID)
            sm = subModels[smID]
            subMatrix = sm.get_submatrix(subMatrix, cID)
            sub_row_order = sub_row_order[sm.get_indices(cID)[0]]
            sub_column_order = sub_column_order[sm.get_indices(cID)[1]]

        zeros_cols = np.where(~subMatrix.any(axis=0))[0]
        zeros_rows = np.where(~subMatrix.any(axis=1))[0]
        subMatrix = np.delete(subMatrix, zeros_cols, 1)
        subMatrix = np.delete(subMatrix, zeros_rows, 0)
        sub_row_order = np.delete(sub_row_order, zeros_rows)
        sub_column_order = np.delete(sub_column_order, zeros_cols)

        num_clusters2 = min(min(subMatrix.shape), num_clusters)

        subModel = CoclustMod(num_clusters2, random_state=0)

        subModels[clusterID] = subModel
        # print("subModels",subModels)
        subModel.fit(subMatrix)

        for i, label in enumerate(subModel.row_labels_):
            rowMap[sub_row_order[i]] = str(clusterID) + "." + str(label)

        for i, label in enumerate(subModel.column_labels_):
            colMap[sub_column_order[i]] = str(clusterID) + "." + str(label)

        # ret = []
        # wel = weighted_edge_list.copy()
        # wel["RECHTSTRAEGER"].update(wel["RECHTSTRAEGER"].map(rowMap))
        # wel["MEDIUM_MEDIENINHABER"].update(wel["MEDIUM_MEDIENINHABER"].map(colMap))

        rowLabelSet = set(
            [str(clusterID) + "." + str(x) for x in subModel.row_labels_])
        colLabelSet = set(
            [str(clusterID) + "." + str(x) for x in subModel.column_labels_])
        #---

        # "Sonstige" (German for "other") marks entries that fall outside the selected sub-cluster
        rowMap2 = {
            k: (v if v in rowLabelSet else "Sonstige")
            for k, v in rowMap.items()
        }
        colMap2 = {
            k: (v if v in colLabelSet else "Sonstige")
            for k, v in colMap.items()
        }

        wel = weighted_edge_list.copy()
        # print(rowLabelSet)

        wel["RECHTSTRAEGER"].update(wel["RECHTSTRAEGER"].map(rowMap2))
        wel["MEDIUM_MEDIENINHABER"].update(
            wel["MEDIUM_MEDIENINHABER"].map(colMap2))

        idc = wel[(
            wel["RECHTSTRAEGER"].astype(str).str[:len(clusterID)] != clusterID)
                  & (wel["MEDIUM_MEDIENINHABER"].astype(
                      str).str[:len(clusterID)] != clusterID)].index
        wel = wel.drop(idc)

        wel2 = weighted_edge_list.copy()
        wel2 = wel2.drop(idc)
        row_sums_map2 = wel2.groupby(
            by=["RECHTSTRAEGER"]).sum().to_dict()["EURO"]
        row_sums_map2 = {k: float(v) for k, v in row_sums_map2.items()}
        column_sums_map2 = wel2.groupby(
            by=["MEDIUM_MEDIENINHABER"]).sum().to_dict()["EURO"]
        column_sums_map2 = {k: float(v) for k, v in column_sums_map2.items()}

        ret = []
        ret = wel.as_matrix().tolist()

        # clusters = self.getElementsbyCluster()
        inv_rowMap2 = {}
        for k, v in rowMap2.items():
            inv_rowMap2.setdefault(v, []).append(k)

        inv_colMap2 = {}
        for k, v in colMap2.items():
            inv_colMap2.setdefault(v, []).append(k)

        clusters = {}
        for label in inv_rowMap2:
            clusters[label] = {
                "rows": {
                    k: row_sums_map2[k]
                    for k in inv_rowMap2[label] if k in row_sums_map2
                },
                "columns": {
                    k: column_sums_map2[k]
                    for k in inv_colMap2[label] if k in column_sums_map2
                }
            }

        return {"data": ret, "clusters": clusters}
Example #8
from scipy.io import loadmat
from coclust.coclustering import CoclustMod

file_name = "/home/sayon/Dropbox/MyModules/Canvass/cclust_package/datasets/som.mat"
matlab_dict = loadmat(file_name)
X = matlab_dict['fea']

model = CoclustMod(n_clusters=4)
model.fit(X)

print(model.modularity)
predicted_row_labels = model.row_labels_
predicted_column_labels = model.column_labels_
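Example #8 stops at the predicted labels. A common follow-up, sketched here under the assumption that the .mat file also carries a ground-truth vector (the 'gnd' key is hypothetical and not shown in the original), is to score the row partition with normalized mutual information:

from sklearn.metrics import normalized_mutual_info_score

# hypothetical key: many bundled datasets store true document labels, e.g. under 'gnd'
true_row_labels = matlab_dict['gnd'].ravel()
print(normalized_mutual_info_score(true_row_labels, predicted_row_labels))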
Example #9
plt.matshow(euclidean_distances(Feafile.values, Feafile.values))
plt.colorbar()
plt.title('Show the Euclidean distance matrix')
plt.show()

# %% Combined usage
# The following example shows how easy it is to run several coclust algorithms on the same dataset.
import matplotlib.pyplot as plt
import numpy as np, scipy.sparse as sp, scipy.io as io
from sklearn.metrics import confusion_matrix
from coclust.coclustering import (CoclustMod, CoclustSpecMod, CoclustInfo)
from coclust.visualization import plot_reorganized_matrix

X = Feafile.values
model_1 = CoclustMod(n_clusters=4, n_init=4)
model_1.fit(X)
model_2 = CoclustSpecMod(n_clusters=4, n_init=4)
model_2.fit(X)
model_3 = CoclustInfo(n_row_clusters=3, n_col_clusters=4, n_init=4)
model_3.fit(X)
plt.figure()

plt.suptitle('Three reorganized matrices for the same dataset')  # figure-level title, so the subplots do not overwrite it
plt.subplot(131)
plot_reorganized_matrix(X, model_1)
plt.subplot(132)
plot_reorganized_matrix(X, model_2)
plt.subplot(133)
plot_reorganized_matrix(X, model_3)
plt.show()
# The reorganized matrices give a first visual grasp of what to expect from each of the three algorithms.
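Beyond the visual comparison, the three fitted models can also be compared numerically. A small sketch relying only on row_labels_, which all three estimators expose (the bincount summary is an illustrative choice):

import numpy as np

for name, m in [("CoclustMod", model_1),
                ("CoclustSpecMod", model_2),
                ("CoclustInfo", model_3)]:
    # size of each row cluster found by the model
    print(name, np.bincount(np.asarray(m.row_labels_)))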
Example #10
    @classmethod
    def setUpClass(cls):
        model = CoclustMod(n_clusters=3)
        X = np.diag(range(1, 200))
        model.fit(X)
        cls.model = model
Example #12
def genes_articles():
    print(request.json)
    tfidf = request.json["tfidf"]
    distance = request.json["distance"]
    coclust = int(request.json["coclust"])
    selected_genes = [v["label"] for v in request.json["genes"]]
    nb_cluster = int(request.json["nb"])

    genes_articles_str = [
        ' '.join(str(x) for x in genesjson[g]) for g in selected_genes
    ]

    if tfidf:
        vec = TfidfVectorizer()
    else:
        vec = CountVectorizer()

    X = vec.fit_transform(genes_articles_str)
    nb = nb_cluster
    if nb_cluster == 0 and coclust != 3:
        xn = X.shape[0]
        step = round(xn / 10) if round(xn / 10) > 0 else 1
        rng = range(1, xn, step)
        # _, modularities = best_modularity_partition(X, rng, n_rand_init=1)
        # nb = rng[np.argmax(modularities)]
        modularities = []
        for x in rng:
            print(x)
            m = CoclustMod(n_clusters=x, n_init=1).fit(X)
            modularities.append(m.modularity)
        nb = rng[np.argmax(modularities)]

    if coclust == 1:
        model = CoclustMod(n_clusters=nb, random_state=0)
    if coclust == 2:
        model = CoclustSpecMod(n_clusters=nb, random_state=0)
    if coclust == 3:
        model = CoclustInfo()

    dt = X.toarray()
    model.fit(dt)
    fit_data = dt[np.argsort(model.row_labels_)]
    fit_data = fit_data[:, np.argsort(model.column_labels_)]

    plt.figure(figsize=(22, 5))
    if nb_cluster == 0 and coclust != 3:
        plt.subplot(131)
        plt.plot(rng, modularities, 'ro-')
        plt.xlabel("Number of cluster")
        plt.ylabel("Modularity")
        plt.title("Max modularity for " + str(nb) + " clusters (" +
                  str(round(np.max(modularities), 3)) + ")")
        plt.axvline(x=nb, color='r', linestyle='-')
    plt.subplot(132)
    sns.heatmap(dt,
                cmap="BuPu",
                yticklabels=False,
                xticklabels=False,
                cbar=False)
    plt.title("Heatmap on Original Data")
    plt.subplot(133)
    sns.heatmap(fit_data,
                cmap="BuPu",
                yticklabels=False,
                xticklabels=False,
                cbar=False)
    plt.title("CoclustMod %i clusters" % nb)
    plt.savefig("img-ga1.jpg", bbox_inches='tight', pad_inches=0)

    # hierarchical clustering
    Z = linkage(dt, 'single', 'euclidean')

    plt.figure(figsize=(15, 7))
    plt.xlabel('')
    plt.ylabel('distance')
    dendrogram(Z, labels=selected_genes)
    plt.savefig("img-ga2.jpg", bbox_inches='tight', pad_inches=0)
    plt.close('all')

    return jsonify({"tab": 1})
Example #13
def genes_termes():
    tfidf = request.json["tfidf"]
    coclust = int(request.json["coclust"])
    selected_genes = [v["label"] for v in request.json["genes"]]
    nb_cluster = int(request.json["nb"])

    # get id articles from all genes
    l = [genesjson[g] for g in selected_genes]
    genes_articles = list(set([item for sublist in l for item in sublist]))
    # get text from article
    articles_text = [
        ' '.join(asthmajson[str(i)]["text"]) for i in genes_articles
    ]

    if tfidf:
        vec = TfidfVectorizer(max_df=0.7, min_df=0.01)
    else:
        vec = CountVectorizer(max_df=0.7, min_df=0.01)

    dt = vec.fit_transform(articles_text)

    matrix_article_terms = dt.toarray()
    matrix_genes_terms = defaultdict(
        lambda: np.zeros(matrix_article_terms.shape[1]).astype(np.float64))
    for idx, row in enumerate(matrix_article_terms):
        article = asthmajson[str(genes_articles[idx])]
        for ge in article["genes"]:
            if ge in selected_genes:
                matrix_genes_terms[ge] += row
    list_matrix_genes_terms = [v for k, v in matrix_genes_terms.items()]
    list_genes = [k for k, v in matrix_genes_terms.items()]
    #df3 = pd.DataFrame(list_matrix_genes_terms, columns=vec.get_feature_names())

    if coclust == 1:
        model = CoclustMod(n_clusters=nb_cluster, random_state=0)
    if coclust == 2:
        model = CoclustSpecMod(n_clusters=nb_cluster, random_state=0)
    if coclust == 3:
        model = CoclustInfo()

    dt = np.array(list_matrix_genes_terms)
    m1 = model.fit(dt)
    fit_data = dt[np.argsort(model.row_labels_)]
    fit_data = fit_data[:, np.argsort(model.column_labels_)]

    plt.figure(figsize=(20, 8))
    plt.subplot(121)
    sns.heatmap(np.log(dt + 1),
                cmap="BuPu",
                yticklabels=False,
                xticklabels=False,
                cbar=False)
    plt.title("Heatmap on Original Data", fontdict={'fontsize': 20})

    plt.subplot(122)
    sns.heatmap(np.log(fit_data + 1),
                cmap="BuPu",
                yticklabels=False,
                xticklabels=False,
                cbar=False)
    plt.title("CoclustMod %i clusters" % nb_cluster, fontdict={'fontsize': 20})
    plt.savefig("img-gt1.jpg", bbox_inches='tight', pad_inches=0)

    # Top terms by cluster
    nb_clust = np.unique(model.column_labels_)
    # get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out() on newer versions
    nm = np.array(vec.get_feature_names())
    dt_sum = np.sum(dt, axis=0)
    col_label = np.array(model.column_labels_)
    cluster = []
    for c in nb_clust:
        idx = np.argsort(-dt_sum[col_label == c])
        col = nm[np.array(model.column_labels_) == c]
        value = dt_sum[col_label == c][idx]
        name = col[idx]
        cluster.append({"name": list(name[0:8]), "value": list(value[0:8])})

    # hierarchical clustering
    Z = linkage(dt, 'single', 'euclidean')
    plt.figure(figsize=(15, 7))
    # plt.title('Hierarchical Clustering - Hamming')
    plt.xlabel('')
    plt.ylabel('distance')
    dendrogram(Z, labels=list_genes)
    plt.savefig("img-gt2.jpg", bbox_inches='tight', pad_inches=0)
    plt.close('all')

    return jsonify({"tab": 2, "cluster": cluster})