def setUpClass(cls):
    """Fit a 3-cluster CoclustMod model on the classic3 matrix once for the whole test class."""
    data = loadmat("datasets/classic3.mat")
    doc_term = data['A']  # scipy sparse CSC matrix
    clusterer = CoclustMod(n_clusters=3)
    clusterer.fit(doc_term)
    cls.model = clusterer
def setUpClass(cls):
    """Fit a 4-cluster CoclustMod model on the cstr matrix once for the whole test class."""
    data = loadmat("datasets/cstr.mat")
    features = data['fea']  # dense numpy.ndarray
    clusterer = CoclustMod(n_clusters=4)
    clusterer.fit(features)
    cls.model = clusterer
def run_coclust(self):
    """Co-cluster the stored doc-term matrix and return the three diagnostic plots.

    All plots are built with do_plot=False, so nothing is rendered here.
    """
    clusterer = CoclustMod(n_clusters=4)
    # No errors? Is this right? Gensim types have plug-and-play support?
    clusterer.fit(self.doc_term_mat)
    top_terms = plot_cluster_top_terms(in_data=self.doc_term_mat,
                                       all_terms=self.vocab,
                                       nb_top_terms=5,
                                       model=clusterer,
                                       do_plot=False)
    sizes = plot_cluster_sizes(model=clusterer, do_plot=False)
    reorganized = plot_reorganized_matrix(X=self.doc_term_mat,
                                          model=clusterer,
                                          do_plot=False)
    return (top_terms, sizes, reorganized)
def cluster(self, data):
    """Co-cluster the payment edge list in *data* and return labelled rows plus cluster sums.

    Populates module-level state (weighted_edge_list, matrix, model, row/column
    orders, label maps and sum maps) that the subcluster helpers read later.
    Returns {"data": edge rows with entities replaced by cluster labels,
             "clusters": per-cluster membership from getElementsbyCluster()}.
    """
    global weighted_edge_list, matrix, model, row_order, column_order, rowMap, colMap, subModels, row_sums_map, column_sums_map
    subModels = {}
    # num_clusters = 9
    # Aggregate duplicate (payer, medium) edges by summing their EURO amounts.
    weighted_edge_list = data[["RECHTSTRAEGER", "MEDIUM_MEDIENINHABER", "EURO"]]
    weighted_edge_list = weighted_edge_list.groupby(
        by=["RECHTSTRAEGER", "MEDIUM_MEDIENINHABER"]).sum().reset_index()
    # NOTE(review): nx.from_pandas_dataframe was removed in networkx 2.0
    # (from_pandas_edgelist is its replacement) -- confirm the pinned version.
    G = nx.from_pandas_dataframe(weighted_edge_list, "RECHTSTRAEGER",
                                 "MEDIUM_MEDIENINHABER", "EURO",
                                 create_using=nx.DiGraph())
    row_order = np.sort(np.unique(weighted_edge_list["RECHTSTRAEGER"]))
    column_order = np.sort(np.unique(weighted_edge_list["MEDIUM_MEDIENINHABER"]))
    matrix_real = biadjacency_matrix(G, row_order,
                                     column_order=column_order,
                                     weight="EURO")
    matrix = matrix_real.toarray()
    # Per-row / per-column EURO totals keyed by entity name (plain floats
    # so the result is JSON-serializable).
    row_sums = matrix.sum(axis=1).round(2)
    row_sums_map = {k: float(v) for k, v in zip(row_order, row_sums)}
    column_sums = matrix.sum(axis=0).round(2)
    column_sums_map = {k: float(v) for k, v in zip(column_order, column_sums)}
    # Never request more clusters than the smaller matrix dimension allows.
    model = CoclustMod(min(min(matrix.shape), num_clusters), random_state=0)  # n_init=500
    model.fit(matrix)
    rowMap = dict(zip(row_order, map(str, model.row_labels_)))
    colMap = dict(zip(column_order, map(str, model.column_labels_)))
    # Replace entity names by their cluster labels in the returned rows.
    wel = weighted_edge_list.copy()
    wel["RECHTSTRAEGER"].update(wel["RECHTSTRAEGER"].map(rowMap))
    wel["MEDIUM_MEDIENINHABER"].update(
        wel["MEDIUM_MEDIENINHABER"].map(colMap))
    # BUG FIX: DataFrame.as_matrix() was removed in pandas 1.0; use .values
    # (consistent with the sibling cluster() implementation in this file).
    ret = wel.values.tolist()
    clusters = self.getElementsbyCluster()
    return {"data": ret, "clusters": clusters}
def cluster(self, data, tempMem, num_clusters):
    """Co-cluster the weighted edge list in *data*, storing all state in *tempMem*.

    The first, second-to-last and last columns of *data* are taken as the row
    entity, the column entity and the edge weight, respectively.  Intermediate
    results (matrix, model, orders, label maps, sum maps) are kept in the
    tempMem dict so follow-up sub-clustering calls can reuse them.
    Returns {"data": relabelled edge rows, "clusters": per-cluster sums}.
    """
    tempMem["subModels"] = {}
    # BUG FIX: on Python 3 dict.keys() returns a view that does not support
    # positional indexing -- materialize it as a list before subscripting.
    dataKeys = list(data.keys())
    tempMem["firstGroupIndex"] = dataKeys[0]
    tempMem["secondGroupIndex"] = dataKeys[len(dataKeys) - 2]
    tempMem["valueIndex"] = dataKeys[len(dataKeys) - 1]
    # num_clusters = 9
    # Aggregate duplicate (row, column) edges by summing their weights.
    tempMem["weighted_edge_list"] = data[[tempMem["firstGroupIndex"],
                                          tempMem["secondGroupIndex"],
                                          tempMem["valueIndex"]]]
    tempMem["weighted_edge_list"] = tempMem["weighted_edge_list"].groupby(
        by=[tempMem["firstGroupIndex"],
            tempMem["secondGroupIndex"]]).sum().reset_index()
    # NOTE(review): nx.from_pandas_dataframe was removed in networkx 2.0
    # (from_pandas_edgelist is its replacement) -- confirm the pinned version.
    G = nx.from_pandas_dataframe(tempMem["weighted_edge_list"],
                                 tempMem["firstGroupIndex"],
                                 tempMem["secondGroupIndex"],
                                 tempMem["valueIndex"],
                                 create_using=nx.DiGraph())
    tempMem["row_order"] = np.sort(
        np.unique(tempMem["weighted_edge_list"][tempMem["firstGroupIndex"]]))
    tempMem["column_order"] = np.sort(
        np.unique(tempMem["weighted_edge_list"][tempMem["secondGroupIndex"]]))
    matrix_real = biadjacency_matrix(G, tempMem["row_order"],
                                     column_order=tempMem["column_order"],
                                     weight=tempMem["valueIndex"])
    tempMem["matrix"] = matrix_real.toarray()
    # Per-row / per-column weight totals keyed by entity (plain floats so the
    # result is JSON-serializable).
    row_sums = tempMem["matrix"].sum(axis=1).round(2)
    tempMem["row_sums_map"] = {
        k: float(v) for k, v in zip(tempMem["row_order"], row_sums)
    }
    column_sums = tempMem["matrix"].sum(axis=0).round(2)
    tempMem["column_sums_map"] = {
        k: float(v) for k, v in zip(tempMem["column_order"], column_sums)
    }
    # Never request more clusters than the smaller matrix dimension allows.
    tempMem["model"] = CoclustMod(min(min(tempMem["matrix"].shape), num_clusters),
                                  random_state=0)  # n_init=500
    tempMem["model"].fit(tempMem["matrix"])
    tempMem["rowMap"] = dict(
        zip(tempMem["row_order"], map(str, tempMem["model"].row_labels_)))
    tempMem["colMap"] = dict(
        zip(tempMem["column_order"], map(str, tempMem["model"].column_labels_)))
    # Replace entity names by their cluster labels in the returned rows.
    wel = tempMem["weighted_edge_list"].copy()
    wel[tempMem["firstGroupIndex"]].update(
        wel[tempMem["firstGroupIndex"]].map(tempMem["rowMap"]))
    wel[tempMem["secondGroupIndex"]].update(
        wel[tempMem["secondGroupIndex"]].map(tempMem["colMap"]))
    ret = wel.values.tolist()
    clusters = self.getElementsbyCluster(tempMem)
    return {"data": ret, "clusters": clusters}
def subcluster3(self, clusterID):
    """Recursively co-cluster the sub-matrix addressed by a dotted *clusterID*.

    clusterID like "2.0.1" walks from the top-level model through the cached
    sub-models.  Fits a new CoclustMod on the selected sub-matrix, extends the
    global rowMap/colMap with dotted labels, and returns
    {"data": relabelled edge rows restricted to this cluster,
     "clusters": per-label row/column weight sums}.
    Entities outside the requested cluster are lumped under "Sonstige".
    """
    global subModels
    clusterID_array = [int(x) for x in clusterID.split('.')]
    # Descend from the top-level model through the chain of cached sub-models.
    subMatrix = model.get_submatrix(matrix, clusterID_array[0])
    sub_row_order = row_order[model.get_indices(clusterID_array[0])[0]]
    sub_column_order = column_order[model.get_indices(clusterID_array[0])[1]]
    for i, cID in enumerate(clusterID_array[1:]):
        smID = '.'.join(str(x) for x in clusterID_array[:(i + 1)])
        print("smID", smID)
        sm = subModels[smID]
        subMatrix = sm.get_submatrix(subMatrix, cID)
        sub_row_order = sub_row_order[sm.get_indices(cID)[0]]
        sub_column_order = sub_column_order[sm.get_indices(cID)[1]]
    # Drop all-zero rows/columns (entities with no weight inside this cluster).
    zeros_cols = np.where(~subMatrix.any(axis=0))[0]
    zeros_rows = np.where(~subMatrix.any(axis=1))[0]
    subMatrix = np.delete(subMatrix, zeros_cols, 1)
    subMatrix = np.delete(subMatrix, zeros_rows, 0)
    sub_row_order = np.delete(sub_row_order, zeros_rows)
    sub_column_order = np.delete(sub_column_order, zeros_cols)
    num_clusters2 = min(min(subMatrix.shape), num_clusters)
    subModel = CoclustMod(num_clusters2, random_state=0)
    subModels[clusterID] = subModel
    subModel.fit(subMatrix)
    # Extend the global label maps with dotted sub-cluster labels.
    for i, label in enumerate(subModel.row_labels_):
        rowMap[sub_row_order[i]] = str(clusterID) + "." + str(label)
    for i, label in enumerate(subModel.column_labels_):
        colMap[sub_column_order[i]] = str(clusterID) + "." + str(label)
    rowLabelSet = set(
        [str(clusterID) + "." + str(x) for x in subModel.row_labels_])
    colLabelSet = set(
        [str(clusterID) + "." + str(x) for x in subModel.column_labels_])
    # Everything not belonging to this sub-cluster is relabelled "Sonstige".
    rowMap2 = {k: (v if v in rowLabelSet else "Sonstige")
               for k, v in rowMap.items()}
    colMap2 = {k: (v if v in colLabelSet else "Sonstige")
               for k, v in colMap.items()}
    wel = weighted_edge_list.copy()
    wel["RECHTSTRAEGER"].update(wel["RECHTSTRAEGER"].map(rowMap2))
    wel["MEDIUM_MEDIENINHABER"].update(
        wel["MEDIUM_MEDIENINHABER"].map(colMap2))
    # Keep only edges touching the requested cluster (prefix match on label).
    idc = wel[(wel["RECHTSTRAEGER"].astype(str).str[:len(clusterID)] != clusterID)
              & (wel["MEDIUM_MEDIENINHABER"].astype(str).str[:len(clusterID)] != clusterID)].index
    wel = wel.drop(idc)
    wel2 = weighted_edge_list.copy()
    wel2 = wel2.drop(idc)
    row_sums_map2 = wel2.groupby(by=["RECHTSTRAEGER"]).sum().to_dict()["EURO"]
    row_sums_map2 = {k: float(v) for k, v in row_sums_map2.items()}
    column_sums_map2 = wel2.groupby(
        by=["MEDIUM_MEDIENINHABER"]).sum().to_dict()["EURO"]
    column_sums_map2 = {k: float(v) for k, v in column_sums_map2.items()}
    # BUG FIX: DataFrame.as_matrix() was removed in pandas 1.0; use .values.
    ret = wel.values.tolist()
    # Invert the label maps: cluster label -> list of member entities.
    inv_rowMap2 = {}
    for k, v in rowMap2.items():
        inv_rowMap2.setdefault(v, []).append(k)
    inv_colMap2 = {}
    for k, v in colMap2.items():
        inv_colMap2.setdefault(v, []).append(k)
    clusters = {}
    for label in inv_rowMap2:
        clusters[label] = {
            "rows": {
                k: row_sums_map2[k]
                for k in inv_rowMap2[label] if k in row_sums_map2
            },
            # BUG FIX: a label with rows but no columns raised KeyError on
            # inv_colMap2[label]; default to an empty member list instead.
            "columns": {
                k: column_sums_map2[k]
                for k in inv_colMap2.get(label, []) if k in column_sums_map2
            }
        }
    return {"data": ret, "clusters": clusters}
# Benchmark the three co-clustering algorithms on one dataset and append the
# NMI / ARI / accuracy scores of each run to a per-model result file.
df = pd.read_csv(label_file)
# Encode the string labels as integer factor codes.
y = np.unique(df['Label'], return_inverse=True)[1]
mat = io.loadmat(mat_file)['X']
print(mat.shape)
no_cluster = len(np.unique(y))
print(no_cluster)
# All models use the same cluster count, derived from the ground-truth labels.
algo_pipeline = [
    (CoclustInfo(n_row_clusters=no_cluster, n_col_clusters=no_cluster,
                 n_init=10, max_iter=200), "CoclustInfo"),
    (CoclustMod(n_clusters=no_cluster, n_init=10, max_iter=200), "CoclustMod"),
    (CoclustSpecMod(n_clusters=no_cluster, n_init=10, max_iter=200), "CoclustSpecMod"),
]
for model, model_name in algo_pipeline:
    res_nmi, res_ari, res_acc = execute_algo(model, model_name, mat, y)
    # Save results
    out_dir = result_path + "/" + data_version + "/"
    makedir(out_dir)
    out_file = out_dir + dataset + "_" + mat_version + "_" + model_name + ".txt"
    content = str(res_nmi) + ", " + str(res_ari) + ", " + str(res_acc) + "\n"
    # BUG FIX: the file handle was opened but never written to or closed
    # (resource leak, scores silently lost); use a context manager and
    # actually append the computed line.
    with open(out_file, "a") as myfile:
        myfile.write(content)
from scipy.io import loadmat
from coclust.coclustering import CoclustMod

# Load the "som" dataset; the feature matrix is stored under the 'fea' key.
DATA_PATH = "/home/sayon/Dropbox/MyModules/Canvass/cclust_package/datasets/som.mat"
X = loadmat(DATA_PATH)['fea']

# Fit a 4-way co-clustering and report the achieved modularity.
model = CoclustMod(n_clusters=4)
model.fit(X)
print(model.modularity)

predicted_row_labels = model.row_labels_
predicted_column_labels = model.column_labels_
# Visualize the pairwise Euclidean distances between rows of the dataset.
plt.matshow(euclidean_distances(Feafile.values, Feafile.values))
plt.colorbar()
plt.title('Show the Euclidean distance matrix')
plt.show()

# %%Combined usage
# The following example shows how easy coclust is to run several algorithms on the same dataset
import matplotlib.pyplot
import numpy as np, scipy.sparse as sp, scipy.io as io
from sklearn.metrics import confusion_matrix
from coclust.coclustering import (CoclustMod, CoclustSpecMod, CoclustInfo)
from coclust.visualization import plot_reorganized_matrix

X = Feafile.values

# Fit the three co-clustering variants on the same matrix.
model_1 = CoclustMod(n_clusters=4, n_init=4)
model_1.fit(X)
model_2 = CoclustSpecMod(n_clusters=4, n_init=4)
model_2.fit(X)
model_3 = CoclustInfo(n_row_clusters=3, n_col_clusters=4, n_init=4)
model_3.fit(X)

# One subplot per model: the three reorganized matrices side by side.
plt.figure()
plt.title(' plot three reorganized matrices for the dataset')
for position, fitted in zip((131, 132, 133), (model_1, model_2, model_3)):
    plt.subplot(position)
    plot_reorganized_matrix(X, fitted)
plt.show()
def setUpClass(cls):
    """Fit a 3-cluster CoclustMod model on a 199x199 diagonal matrix, shared by all tests."""
    diagonal = np.diag(range(1, 200))
    clusterer = CoclustMod(n_clusters=3)
    clusterer.fit(diagonal)
    cls.model = clusterer
def genes_articles():
    """Flask endpoint: co-cluster a genes x articles matrix and save plots.

    Reads from request.json: tfidf (bool), distance, coclust (1=CoclustMod,
    2=CoclustSpecMod, 3=CoclustInfo), genes (list of {"label": ...}), nb
    (cluster count; 0 triggers a modularity-based search).
    Writes img-ga1.jpg (modularity curve + heatmaps) and img-ga2.jpg
    (dendrogram), then returns {"tab": 1}.
    """
    print(request.json)
    tfidf = request.json["tfidf"]
    distance = request.json["distance"]  # read but not used below
    coclust = int(request.json["coclust"])
    selected_genes = [v["label"] for v in request.json["genes"]]
    nb_cluster = int(request.json["nb"])
    # One pseudo-document per gene: its article ids joined into a string.
    genes_articles_str = [
        ' '.join(str(x) for x in genesjson[g]) for g in selected_genes
    ]
    if tfidf:
        vec = TfidfVectorizer()
    else:
        vec = CountVectorizer()
    X = vec.fit_transform(genes_articles_str)
    nb = nb_cluster
    if nb_cluster == 0 and coclust != 3:
        # Search the cluster count that maximizes modularity, sampling about
        # ten candidate values across the number of rows.
        xn = X.shape[0]
        step = round(xn / 10) if round(xn / 10) > 0 else 1
        rng = range(1, xn, step)
        # _, modularities = best_modularity_partition(X, rng, n_rand_init=1)
        # nb = rng[np.argmax(modularities)]
        modularities = []
        for x in rng:
            print(x)
            m = CoclustMod(n_clusters=x, n_init=1).fit(X)
            modularities.append(m.modularity)
        nb = rng[np.argmax(modularities)]
    # NOTE(review): if coclust is outside {1, 2, 3}, `model` is never bound
    # and model.fit below raises NameError -- confirm the client always
    # sends a valid value.
    if coclust == 1:
        model = CoclustMod(n_clusters=nb, random_state=0)
    if coclust == 2:
        model = CoclustSpecMod(n_clusters=nb, random_state=0)
    if coclust == 3:
        model = CoclustInfo()
    dt = X.toarray()
    model.fit(dt)
    # Reorder rows/columns by cluster label to expose the block structure.
    fit_data = dt[np.argsort(model.row_labels_)]
    fit_data = fit_data[:, np.argsort(model.column_labels_)]
    plt.figure(figsize=(22, 5))
    if nb_cluster == 0 and coclust != 3:
        # First panel: the modularity curve from the cluster-count search.
        plt.subplot(131)
        plt.plot(rng, modularities, 'ro-')
        plt.xlabel("Number of cluster")
        plt.ylabel("Modularity")
        plt.title("Max modularity for " + str(nb) + " clusters (" +
                  str(round(np.max(modularities), 3)) + ")")
        plt.axvline(x=nb, color='r', linestyle='-')
    # NOTE(review): original indentation was lost; the heatmap panels are
    # assumed to be drawn unconditionally -- confirm against upstream.
    plt.subplot(132)
    sns.heatmap(dt, cmap="BuPu", yticklabels=False, xticklabels=False,
                cbar=False)
    plt.title("Heatmap on Original Data")
    plt.subplot(133)
    sns.heatmap(fit_data, cmap="BuPu", yticklabels=False, xticklabels=False,
                cbar=False)
    plt.title("CoclustMod %i clusters" % nb)
    plt.savefig("img-ga1.jpg", bbox_inches='tight', pad_inches=0)
    # hierarchical clustering
    Z = linkage(dt, 'single', 'euclidean')
    plt.figure(figsize=(15, 7))
    plt.xlabel('')
    plt.ylabel('distance')
    dendrogram(Z, labels=selected_genes)
    plt.savefig("img-ga2.jpg", bbox_inches='tight', pad_inches=0)
    plt.close('all')
    return jsonify({"tab": 1})
def genes_termes():
    """Flask endpoint: co-cluster a genes x terms matrix and save plots.

    Reads from request.json: tfidf (bool), coclust (1=CoclustMod,
    2=CoclustSpecMod, 3=CoclustInfo), genes (list of {"label": ...}),
    nb (cluster count).  Builds a gene-by-term matrix by summing the term
    vectors of each gene's articles, co-clusters it, writes img-gt1.jpg
    (heatmaps) and img-gt2.jpg (dendrogram), and returns
    {"tab": 2, "cluster": top terms per column cluster}.
    """
    tfidf = request.json["tfidf"]
    coclust = int(request.json["coclust"])
    selected_genes = [v["label"] for v in request.json["genes"]]
    nb_cluster = int(request.json["nb"])
    # get id articles from all genes
    l = [genesjson[g] for g in selected_genes]
    genes_articles = list(set([item for sublist in l for item in sublist]))
    # get text from article
    articles_text = [
        ' '.join(asthmajson[str(i)]["text"]) for i in genes_articles
    ]
    if tfidf:
        vec = TfidfVectorizer(max_df=0.7, min_df=0.01)
    else:
        vec = CountVectorizer(max_df=0.7, min_df=0.01)
    dt = vec.fit_transform(articles_text)
    matrix_article_terms = dt.toarray()
    # Accumulate each gene's term vector as the sum over its articles.
    matrix_genes_terms = defaultdict(
        lambda: np.zeros(matrix_article_terms.shape[1]).astype(np.float64))
    for idx, row in enumerate(matrix_article_terms):
        article = asthmajson[str(genes_articles[idx])]
        for ge in article["genes"]:
            if ge in selected_genes:
                matrix_genes_terms[ge] += row
    list_matrix_genes_terms = [v for k, v in matrix_genes_terms.items()]
    list_genes = [k for k, v in matrix_genes_terms.items()]
    #df3 = pd.DataFrame(list_matrix_genes_terms, columns=vec.get_feature_names())
    # NOTE(review): `model` is unbound if coclust is outside {1, 2, 3} --
    # confirm the client always sends a valid value.
    if coclust == 1:
        model = CoclustMod(n_clusters=nb_cluster, random_state=0)
    if coclust == 2:
        model = CoclustSpecMod(n_clusters=nb_cluster, random_state=0)
    if coclust == 3:
        model = CoclustInfo()
    # `dt` is rebound here: from the article-term sparse matrix to the dense
    # gene-term matrix actually clustered below.
    dt = np.array(list_matrix_genes_terms)
    m1 = model.fit(dt)
    # Reorder rows/columns by cluster label to expose the block structure.
    fit_data = dt[np.argsort(model.row_labels_)]
    fit_data = fit_data[:, np.argsort(model.column_labels_)]
    plt.figure(figsize=(20, 8))
    plt.subplot(121)
    # log(1 + x) compresses the dynamic range of the counts for display.
    sns.heatmap(np.log(dt + 1), cmap="BuPu", yticklabels=False,
                xticklabels=False, cbar=False)
    plt.title("Heatmap on Original Data", fontdict={'fontsize': 20})
    plt.subplot(122)
    sns.heatmap(np.log(fit_data + 1), cmap="BuPu", yticklabels=False,
                xticklabels=False, cbar=False)
    plt.title("CoclustMod %i clusters" % nb_cluster,
              fontdict={'fontsize': 20})
    plt.savefig("img-gt1.jpg", bbox_inches='tight', pad_inches=0)
    # Top terms by cluster
    nb_clust = np.unique(model.column_labels_)
    # NOTE(review): get_feature_names() was removed in scikit-learn 1.2
    # (get_feature_names_out is the replacement) -- confirm pinned version.
    nm = np.array(vec.get_feature_names())
    dt_sum = np.sum(dt, axis=0)
    col_label = np.array(model.column_labels_)
    cluster = []
    for c in nb_clust:
        # Terms of cluster c, sorted by descending total weight; keep top 8.
        idx = np.argsort(-dt_sum[col_label == c])
        col = nm[np.array(model.column_labels_) == c]
        value = dt_sum[col_label == c][idx]
        name = col[idx]
        cluster.append({"name": list(name[0:8]), "value": list(value[0:8])})
    # hierarchical clustering
    Z = linkage(dt, 'single', 'euclidean')
    plt.figure(figsize=(15, 7))
    # plt.title('Hierarchical Clustering - Hamming')
    plt.xlabel('')
    plt.ylabel('distance')
    dendrogram(Z, labels=list_genes)
    plt.savefig("img-gt2.jpg", bbox_inches='tight', pad_inches=0)
    plt.close('all')
    return jsonify({"tab": 2, "cluster": cluster})