def cluster(y, X, n_clusters):
    """Aggregate the last axis of ``y`` and ``X`` into ``n_clusters`` groups.

    With ``n_clusters == 1`` everything is collapsed into a single group:
    ``y`` is summed and ``X`` is averaged over the last axis.

    Otherwise the columns of ``y`` are grouped by fitting spectral
    co-clustering to their Kendall correlation matrix.  ``y`` is summed
    within each group; ``X`` is averaged within each group using weights
    equal to the group's ``y`` summed over axis 0 divided by the group's
    total ``y``.

    Returns
    -------
    tuple
        ``(y_agg, X_agg, clusters)``.  ``clusters`` is a list with one array
        of column indices per group, or the integer ``1`` in the
        single-group case.
    """
    # Trivial case first: one group spanning every column.
    if n_clusters == 1:
        y_total = np.sum(y, axis=-1)[..., np.newaxis]
        X_mean = np.mean(X, axis=-1)[..., np.newaxis]
        return y_total, X_mean, 1

    # NOTE: an earlier variant masked leading constant runs of ``y`` with NaN
    # before computing correlations; that step is currently disabled.
    y_float = onp.array(y, dtype='float64')
    kendall = pd.DataFrame(y_float).corr(method='kendall')

    model = SpectralCoclustering(n_clusters=n_clusters)
    model.fit(kendall)
    clusters = [model.get_indices(k)[0] for k in range(n_clusters)]

    def aggregate(values, reducer, weights=None):
        """Reduce each group's columns with ``reducer`` and stack the results."""
        pieces = []
        for k, members in enumerate(clusters):
            block = values[..., members]
            if weights is not None:
                reduced = reducer(block, axis=-1, weights=weights[k])
            else:
                reduced = reducer(block, axis=-1)
            pieces.append(reduced[..., np.newaxis])
        return np.concatenate(pieces, axis=-1)

    def market_share(values):
        """Per group: column totals over axis 0 divided by the group total."""
        shares = []
        for members in clusters:
            block = values[..., members]
            shares.append(np.sum(block, axis=0) / np.sum(block))
        return shares

    y_ = aggregate(y, np.sum)
    # Within-group market shares serve as the averaging weights for X.
    y_weights = market_share(y)
    X_ = aggregate(X, np.average, y_weights)
    return y_, X_, clusters
def test_spectral_coclustering():
    """Check Dhillon's spectral co-clustering on a simple planted problem.

    Every combination in the parameter grid is fitted on both a dense and a
    CSR version of a thresholded, non-negative 30x30 matrix containing three
    biclusters.
    """
    seed = 0
    grid = {
        "svd_method": ["randomized", "arpack"],
        "n_svd_vecs": [None, 20],
        "mini_batch": [False, True],
        "init": ["k-means++"],
        "n_init": [10],
    }

    S, rows, cols = make_biclusters((30, 30), 3, noise=0.5, random_state=seed)
    # The matrix must be non-negative before it is made sparse.
    S -= S.min()
    # Threshold some values to zero.
    S = np.where(S < 1, 0, S)

    for matrix in (S, csr_matrix(S)):
        for params in ParameterGrid(grid):
            model = SpectralCoclustering(
                n_clusters=3, random_state=seed, **params
            )
            model.fit(matrix)

            assert model.rows_.shape == (3, 30)
            # Each row and each column is assigned to exactly one bicluster.
            assert_array_equal(model.rows_.sum(axis=0), np.ones(30))
            assert_array_equal(model.columns_.sum(axis=0), np.ones(30))
            assert consensus_score(model.biclusters_, (rows, cols)) == 1

            _test_shape_indices(model)
class Spectral(object):
    """Spectral co-clustering experiment on one of the known datasets."""

    def __init__(self, dataset):
        """Fetch a dataset and drop its empty (all-zero) feature columns.

        Arguments:
            dataset {str} -- dataset name; one of 'classic3', 'cstr', 'mnist'

        Raises:
            KeyError -- if ``dataset`` is not one of the known names
        """
        data_map = {'classic3': 1, 'cstr': 3, 'mnist': 2}
        self.dataset = dataset
        print("Fetching ", dataset)
        self.data, self.labels = get_data_set(data_map[dataset])

        # Build the empty-column mask once and reuse it for the deletion.
        empty_features = ~self.data.any(axis=0)
        if empty_features.any():
            print("Found empty features. deleting...")
            self.data = np.delete(
                self.data, np.where(empty_features)[0], axis=1)

    def view_dataset(self, title, data, markersize=0.001):
        """Plot the sparsity pattern of a data matrix.

        Arguments:
            title {str} -- title of plot
            data {np.array} -- dataset to plot

        Keyword Arguments:
            markersize {float} -- size of datapoints (default: {0.001})
        """
        plt.spy(data, markersize=markersize)
        plt.title(title)
        plt.show()

    def shuffle_data(self):
        """Shuffle ``self.data`` and ``self.labels`` together and plot the data."""
        print("Shuffling")
        self.data, self.labels = shuffle(self.data, self.labels)
        self.view_dataset(data=self.data, title='shuffled data')

    def form_biclusters(self):
        """Fit spectral bi-clusters on ``self.data``.

        The number of clusters is the number of distinct values in
        ``self.labels``.  The fitted model is stored in ``self.bicluster``.
        """
        n_clusters = len(np.unique(self.labels))
        print("Generating {} clusters".format(n_clusters))
        # `n_jobs` is no longer accepted by SpectralCoclustering in current
        # scikit-learn releases, so it is not passed.
        self.bicluster = SpectralCoclustering(n_clusters=n_clusters)
        self.bicluster.fit(self.data)

    def get_accuracy(self):
        """Print and return the NMI between bicluster row labels and data labels.

        Requires :meth:`form_biclusters` to have been called first.

        Returns:
            float -- normalized mutual information score
        """
        nmi = normalized_mutual_info_score(self.bicluster.row_labels_,
                                           self.labels)
        print("Accuracy is ", nmi)
        return nmi

    def show_clusters(self):
        """Plot the data with rows and columns sorted by bicluster label."""
        row_order = np.argsort(self.bicluster.row_labels_)
        col_order = np.argsort(self.bicluster.column_labels_)
        fit_data = self.data[row_order][:, col_order]
        self.view_dataset(data=fit_data, title='co-clusters')
def spect(input_data, n):
    """Co-cluster ``input_data`` into ``n`` groups and report the result.

    Prints the predicted row labels and their accuracy score against the
    module-level ``labels``, records the score (as a string) in the
    module-level ``acc`` under the key ``"SPECT<n>"``, and hands the
    prediction to ``tsnePlot``.
    """
    model = SpectralCoclustering(n_clusters=n)
    model.fit(input_data)
    pred = model.row_labels_
    print(pred)

    # The score is a pure function of (pred, labels); format it once and reuse.
    score_text = str(accuracy_score(pred, labels))
    print("ACC: " + score_text)
    acc["SPECT" + str(n)] = score_text

    tsnePlot(pred, n, input_data, 'SPECT')
def compute_coclustering(
    fit_data,
    num_clusters=1,
    tol_bicluster=0.005,  # sparsity otherwise annoyingly causes underflows w/ sklearn
):
    """Co-cluster ``fit_data`` and return orderings that group the clusters.

    ``num_clusters == 1`` is treated as "choose automatically": the number of
    clusters becomes ``min(number of rows, 5)``.  ``tol_bicluster`` is added
    to every entry of ``fit_data`` before fitting.

    Returns
    -------
    tuple
        ``(row_order, col_order, sorted_row_labels, sorted_col_labels)`` where
        the orders are argsorts of the row / column cluster labels and the
        label arrays are given in that sorted order.
    """
    n_clusters = min(fit_data.shape[0], 5) if num_clusters == 1 else num_clusters

    model = SpectralCoclustering(n_clusters=n_clusters, random_state=0)
    model.fit(fit_data + tol_bicluster)

    row_labels = model.row_labels_
    col_labels = model.column_labels_
    row_order = np.argsort(row_labels)
    col_order = np.argsort(col_labels)
    return row_order, col_order, row_labels[row_order], col_labels[col_order]
class Cluster:
    """Select a scikit-learn clustering model, fit it and score the labels.

    Call one of the selector methods (``kmeans``, ``agglomerative``, ...) to
    set ``self.model``, then ``fit_model`` or ``goodness``.
    """

    def __init__(self, n_clusters, feature_vectors):
        self.n_clusters = n_clusters
        self.feature_vectors = feature_vectors

    def kmeans(self):
        """Select k-means."""
        self.model = KMeans(n_clusters=self.n_clusters)

    def agglomerative(self, linkage, affinity):
        """Select agglomerative clustering with the given linkage/affinity."""
        # NOTE(review): newer scikit-learn releases rename `affinity` to
        # `metric` — confirm against the pinned scikit-learn version.
        self.model = AgglomerativeClustering(
            n_clusters=self.n_clusters, linkage=linkage, affinity=affinity)

    def birch(self):
        """Select Birch."""
        # acc is 0.87
        self.model = Birch(n_clusters=self.n_clusters)

    def spectral(self, affinity, n_neighbors=None):
        """Select spectral clustering.

        ``n_neighbors`` is forwarded only when given, so the estimator keeps
        its own default instead of receiving an invalid ``None``.
        """
        options = {'n_clusters': self.n_clusters, 'affinity': affinity}
        if n_neighbors is not None:
            options['n_neighbors'] = n_neighbors
        self.model = SpectralClustering(**options)

    def spectral_biclustering(self):
        """Select spectral biclustering."""
        self.model = SpectralBiclustering(n_clusters=self.n_clusters)

    def spectral_coclustering(self):
        """Select spectral co-clustering."""
        self.model = SpectralCoclustering(n_clusters=self.n_clusters)

    def fit_model(self):
        """Fit the selected model and store its labels in ``predicted_labels``.

        Models exposing ``labels_`` use it directly; bi/co-clustering models
        fall back to ``row_labels_``.  Any other error propagates to the
        caller instead of being swallowed.
        """
        self.model.fit(self.feature_vectors)
        if hasattr(self.model, 'labels_'):
            self.predicted_labels = self.model.labels_
        else:
            # spectral_biclustering and Coclustering
            print(self.model.row_labels_.shape)
            self.predicted_labels = self.model.row_labels_

    def save_result(self, file_path):
        """Write the predicted labels to ``file_path``, one integer per line."""
        np.savetxt(str(file_path),
                   self.predicted_labels.astype(int), fmt='%i')

    def goodness(self, true_labels, base_precision, improved_precision,
                 verbose=False):
        """Fit the model and return the NMI against ``true_labels``.

        When ``verbose`` is true, also prints the point score
        ``(nmi - base_precision) / improved_precision + 1`` truncated to an
        integer.  The score is only computed when it is printed, so a zero
        ``improved_precision`` no longer breaks the non-verbose path.
        """
        self.fit_model()
        # evaluate performance
        normalized_mutual_info = normalized_mutual_info_score(
            true_labels, self.predicted_labels)
        if verbose:
            points = (normalized_mutual_info -
                      base_precision) / improved_precision + 1
            print('current project can get {:d} points'.format(int(points)))
        return normalized_mutual_info
def visualizeCorr(sgp, args):
    """Show correlation heat maps for ``sgp``.

    If the parent directory in ``args.file`` is ``simulation``, the reference
    correlation stored in the module-level ``data`` is plotted, followed by
    the matrix produced by ``sgp.deepkernel`` on the stored inputs.

    Otherwise the matrix from ``sgp.deepkernel.indv_kernel`` is co-clustered
    into ``args.number_cluster`` groups and plotted with rows and columns
    both ordered by the row cluster labels.
    """
    heatmap_style = dict(
        cmap='YlGnBu',
        square=True,
        robust=True,
        xticklabels=False,
        yticklabels=False,
    )
    sgp.cpu()

    if args.file.split('/')[-2] != 'simulation':
        from sklearn.cluster import SpectralCoclustering

        kernel_input = torch.arange(len(idMap))
        indv_corr = sgp.deepkernel.indv_kernel(
            kernel_input).detach().cpu().numpy()

        model = SpectralCoclustering(
            n_clusters=args.number_cluster, random_state=0)
        model.fit(indv_corr)

        # The permutation itself is never used; the call is retained only so
        # the global NumPy random state advances exactly as it did before.
        np.random.permutation(np.arange(len(indv_corr)))

        order = np.argsort(model.row_labels_)
        sns.heatmap(indv_corr[order][:, order], **heatmap_style)
        plt.show()
        return

    final_corr = data['corr']
    allX = torch.tensor(data['data']).type(torch.float)
    allIid = data['iid'].reshape(-1)

    # Reference correlation.
    plt.figure()
    sns.heatmap(final_corr, **heatmap_style)

    # Correlation produced by the model's kernel.
    corr = sgp.deepkernel(allX, allIid).detach().cpu().numpy()
    plt.figure()
    sns.heatmap(corr, **heatmap_style)
    plt.show()
def bicluster_correlation_matrix(X, n_clusters=10, figsize=None):
    """
    Group similar variables together by running Spectral coclustering
    algorithm on a dataset's correlation matrix. See https://bit.ly/2QgXZB2
    for more details. Spectral coclustering finds groups of similar
    (row, column) subsets where each column can only belong to a single
    bicluster. This is different than "checkerboard" biclustering.

    Parameters
    ------------
    X: {pd.DataFrame} numeric feature data. Shape {observations} x {features}
    n_clusters: {int} number of biclusters to construct
    figsize: {2-tuple of int} pyplot Figure size. Default [10, 10].

    Returns
    ------------
    coclust: {fitted sklearn.cluster.SpectralCoclustering object}
    """
    # -- downsample to at most 100k observations for speed, then estimate the
    # -- correlation matrix on a median-imputed copy of the sample.
    max_rows = 100000
    n_rows = X.shape[0]
    sample_idx = np.random.choice(n_rows, size=min(max_rows, n_rows),
                                  replace=False)
    num_df = X.iloc[sample_idx]
    cor_mat = num_df.fillna(num_df.median()).corr()

    # -- run coclustering.
    coclust = SpectralCoclustering(n_clusters=n_clusters, random_state=666)
    coclust.fit(cor_mat)

    # -- re-order correlation matrix by cluster indices.
    biclust_dat = cor_mat.iloc[np.argsort(coclust.row_labels_)]
    biclust_dat = biclust_dat.iloc[:, np.argsort(coclust.column_labels_)]

    # -- display biclustering pattern.
    fig = plt.figure(figsize=figsize if figsize else [10, 10])
    ax = fig.add_subplot(111)
    # matshow returns the image artist, not the Axes; keep `ax` bound to the
    # Axes so the title and tick calls below go to the right object.
    ax.matshow(biclust_dat, cmap='cool')
    ax.set_title(
        f'Correlation matrix post-biclustering: {n_clusters} clusters')
    ax.set_yticks(range(biclust_dat.shape[0]))
    ax.set_yticklabels(biclust_dat.index.tolist())
    plt.show()

    return coclust
noise=5, shuffle=False, random_state=0) plt.matshow(data, cmap=plt.cm.Blues) plt.title("Original dataset") # shuffle clusters rng = np.random.RandomState(0) row_idx = rng.permutation(data.shape[0]) col_idx = rng.permutation(data.shape[1]) data = data[row_idx][:, col_idx] plt.matshow(data, cmap=plt.cm.Blues) plt.title("Shuffled dataset") model = SpectralCoclustering(n_clusters=5, random_state=0) model.fit(data) score = consensus_score(model.biclusters_, (rows[:, row_idx], columns[:, col_idx])) print("consensus score: {:.3f}".format(score)) fit_data = data[np.argsort(model.row_labels_)] fit_data = fit_data[:, np.argsort(model.column_labels_)] plt.matshow(fit_data, cmap=plt.cm.Blues) plt.title("After biclustering; rearranged to show biclusters") #plt.show()
from bokeh.models import HoverTool, ColumnDataSource
from bokeh.plotting import figure, output_file, show
from bokeh.io import output_notebook
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import SpectralCoclustering

# Load the whisky table and attach the region read from a separate file.
whisky = pd.read_csv('whiskies.txt')
whisky['Region'] = pd.read_csv('regions.txt')

# Columns 2..13 hold the flavor scores.
flavors = whisky.iloc[:, 2:14]
corr_flavors = flavors.corr()              # flavor-by-flavor correlation
corr_whisky = flavors.transpose().corr()   # whisky-by-whisky correlation

# Co-cluster the whisky correlation matrix into six groups.
model = SpectralCoclustering(n_clusters=6, random_state=0)
model.fit(corr_whisky)

# Record each whisky's group, then reorder the table so groups are contiguous.
whisky['Group'] = pd.Series(model.row_labels_, index=whisky.index)
group_order = np.argsort(model.row_labels_)
whisky = whisky.iloc[group_order].reset_index(drop=True)

# Whisky-by-whisky correlations of the reordered table, as a plain array.
correlations = np.array(whisky.iloc[:, 2:14].transpose().corr())

# First, we import a tool to allow text to pop up on a plot when the cursor
# hovers over it.  Also, we import a data structure used to store arguments
# of what to plot in Bokeh.  Finally, we will use numpy for this section as well!
from bokeh.models import HoverTool, ColumnDataSource
import numpy as np
clustering.column_labels_  # doctest: +SKIP
clustering


# In[12]:

from sklearn.metrics import consensus_score
from matplotlib import pyplot as plt

# Row/column permutations for shuffling the clusters (seeded for repeatability).
rng = np.random.RandomState(0)
row_idx = rng.permutation(X.shape[0])
col_idx = rng.permutation(X.shape[1])

# From here on: our own experiment data!
model = SpectralCoclustering(n_clusters=7, random_state=0)
model.fit(X)


# In[19]:

fig = plt.figure(figsize=(40, 55))
ax = fig.add_subplot(1, 1, 1)

# Reorder rows, then columns, so members of one bicluster sit together.
row_order = np.argsort(model.row_labels_)
col_order = np.argsort(model.column_labels_)
fit_data = X[row_order][:, col_order]

# Draw on the explicitly sized Axes rather than via the pyplot shortcuts.
ax.matshow(fit_data, cmap=plt.cm.Blues)
ax.set_title("After biclustering")
print(corr_whisky) plt.figure(figsize = (10,10)) plt.pcolor(corr_whisky) plt.axis("tight") plt.colorbar() plt.show() ''' # Spectral co clustering from sklearn.cluster import SpectralCoclustering model = SpectralCoclustering(n_clusters= 6, random_state= 0) model.fit(corr_whisky) # Data from the correlation matrix # Every row corresponds to the cluster, every column # to the data parameter print( np.sum(model.rows_, axis= 1) ) # Sumamos las columnas # How many clusters belonging from each element print( np.sum(model.rows_, axis= 0) ) # Each element from the array positions belongs from the number # from this position print(model.row_labels_) # Comparing the correlation tables whisky['Group'] = pd.Series(model.row_labels_, index = whisky.index)
y_true = newsgroups.target vectorizer = NumberNormalizingVectorizer(stop_words='english', min_df=5) cocluster = SpectralCoclustering(n_clusters=len(categories), svd_method='arpack', random_state=0) kmeans = MiniBatchKMeans(n_clusters=len(categories), batch_size=20000, random_state=0) print("Vectorizing...") X = vectorizer.fit_transform(newsgroups.data) print("Coclustering...") start_time = time() cocluster.fit(X) y_cocluster = cocluster.row_labels_ print("Done in {:.2f}s. V-measure: {:.4f}".format( time() - start_time, v_measure_score(y_cocluster, y_true))) print("MiniBatchKMeans...") start_time = time() y_kmeans = kmeans.fit_predict(X) print("Done in {:.2f}s. V-measure: {:.4f}".format( time() - start_time, v_measure_score(y_kmeans, y_true))) feature_names = vectorizer.get_feature_names() document_names = list(newsgroups.target_names[i] for i in newsgroups.target) def bicluster_ncut(i):
# Print the shape of the genre grid and the number of known genres.
print(visGrid.shape)
print(len(Genre_ID_to_name.keys()))

# Heat map of co-occurring movie genres; both axes follow the order of nr_ids.
annot_lookup = [Genre_ID_to_name[nr_ids[i]] for i in range(len(nr_ids))]
sns.heatmap(visGrid, xticklabels=annot_lookup, yticklabels=annot_lookup)
plt.title("Heat map of Co-occurring Movie Genres")
plt.show()

# Bi-clustering to show genres that occur together and genres that don't
# occur together.
model = SpectralCoclustering(n_clusters=5)
model.fit(visGrid)

# Rows and columns are reordered independently (by row labels and by column
# labels respectively), so each axis needs its own list of tick labels.
row_order = np.argsort(model.row_labels_)
col_order = np.argsort(model.column_labels_)
fit_data = visGrid[row_order][:, col_order]

annot_lookup_sorted = [Genre_ID_to_name[nr_ids[i]] for i in row_order]
annot_lookup_sorted_cols = [Genre_ID_to_name[nr_ids[i]] for i in col_order]

sns.heatmap(fit_data, xticklabels=annot_lookup_sorted_cols,
            yticklabels=annot_lookup_sorted, annot=False)
plt.title("After biclustering; rearranged to show biclusters")
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import SpectralCoclustering
from bokeh.models import HoverTool, ColumnDataSource
from bokeh.plotting import figure, output_file, show

# NOTE(review): `np` is used below but is not imported in the lines visible
# here — confirm that numpy is imported earlier in the file.

# Load the whisky table; the region column comes from a separate file.
# NOTE(review): the regions path is an absolute, machine-specific location.
whisky = pd.read_csv(r"whiskies.csv", index_col=0)
whisky["Region"] = pd.read_csv(
    r"C:/Users/Daria/Documents/PythonScripts/regions.txt")

# Columns 2..13 hold the flavor scores.
flavors = whisky.iloc[:, 2:14]
correlation_flavors = flavors.corr()
correlation_whisky = flavors.transpose().corr()

model = SpectralCoclustering(n_clusters=6, random_state=0)
model.fit(correlation_whisky)

# Inspection expressions (their values are discarded when run as a script).
np.sum(model.rows_, 1)  # number of whiskies in each of the 6 clusters
np.sum(model.rows_, 0)  # number of clusters each whisky belongs to
model.row_labels_       # cluster label of each observation

# Reorder the table so that whiskies of the same cluster are contiguous.
cluster_order = np.argsort(model.row_labels_)
whisky = whisky.iloc[cluster_order].reset_index(drop=True)

corr = whisky.iloc[:, 2:14].transpose().corr()  # DataFrame
correl = np.array(corr)                         # NumPy array

# Left panel: correlations in the original (unsorted) order.
plt.figure(figsize=(14, 7))
plt.subplot(121)
plt.pcolor(correlation_whisky)
plt.colorbar()
plt.title('Original')
value = 100 * float(item) matrix[row_index][column_index] = value if value < min_list[column_index]: min_list[column_index] = value if value > max_list[column_index]: max_list[column_index] = value if value != 0: ave_list[column_index] += 1 # 此时ave_list复用表示非零元素个数 column_index += 1 print(unsta_max) print("row_num", row_num) # 跑对角线双聚类 每个基因打上0.。4 的标签 model = SpectralCoclustering(n_clusters=10, random_state=0) model.fit(matrix) for i in range(len(row_dict)): print(i, '.', row_list[i], ':', model.row_labels_[i]) print(model.column_labels_) fit_data = matrix[np.argsort(model.row_labels_)] fit_data = fit_data[:, np.argsort(model.column_labels_)] # 临床变量随机森林 con_num = file9.readline().split().__len__() - 1 print("con_num:", con_num) lines = file9.readlines() sam_num = lines.__len__() print("sam_num:", sam_num) x_train = np.empty(shape=(sam_num, con_num), dtype=np.int) y_train = np.empty(sam_num, dtype=int)
def test_spectralcoclustering_parameter_validation(params, type_err, err_msg):
    """Check that `SpectralCoclustering.fit` rejects invalid parameters.

    Fitting an estimator built from ``params`` on a 5x5 integer matrix must
    raise ``type_err`` with a message matching ``err_msg``.
    """
    X = np.arange(25).reshape((5, 5))
    estimator = SpectralCoclustering(**params)

    with pytest.raises(type_err, match=err_msg):
        estimator.fit(X)
plt.figure(figsize=(10, 10)) plt.pcolor(corrFlavours) plt.colorbar() plt.savefig("CorrFlavours.pdf") corrWhisky = pd.DataFrame.corr(flavours.transpose()) plt.figure(figsize=(10, 10)) plt.pcolor(corrWhisky) plt.axis("tight") plt.colorbar() plt.savefig("CorrWhisky.pdf") # -- 4.1.4 Clustering Whiskies by Flavour Profile model = SpectralCoclustering(n_clusters=6, random_state=0) model.fit(corrWhisky) # print(model.rows_) # output -> array with dimensions number of row clusters * number of rows # ^^ CORRELATION MATRIX ^^ # Each row in this array identifies a cluster, here ranging from 0 to 5 # Each column identifies a row in the correlation matrix, here ranging from 0 to 85 """ [[False False False False False False False False False False False False False False False False False False False True False False False False False False False False False False False False False False False False False False False True False False False False False False False False False False False False False False False True False False False False False False False False False False True False True False False False False False False False False False False False False False False False False False] [False False False False False True False False False False False True False True False False True False True False False False False False
class DocumentClustering:
    """Cluster documents and inspect / plot the result.

    Documents are represented either by a TF-IDF matrix built in
    :meth:`make_matrix` or by a precomputed dense matrix passed to it.
    :meth:`cluster` accepts the names ``'k-means'``, ``'agglo'`` and
    ``'spectral_cocluster'``.
    """

    def __init__(self, k=5):
        self.name = 'k-means'
        self.k = k
        self.X = None
        self.clustering = None
        self.vectorizer = None
        self.dataset_size = 0
        self.doc2vec_matrix = False

    def _feature_names(self):
        """Return the vectorizer's terms as a list.

        Prefers ``get_feature_names_out`` and falls back to the older
        ``get_feature_names`` so both scikit-learn APIs are supported.
        """
        getter = getattr(self.vectorizer, 'get_feature_names_out', None)
        if getter is None:
            getter = self.vectorizer.get_feature_names
        return list(getter())

    def _dense_X(self):
        """Return ``self.X`` densified if it is sparse, unchanged otherwise."""
        if hasattr(self.X, 'toarray'):
            return self.X.toarray()
        return self.X

    def make_matrix(self, documents=None, n_components=-1,
                    doc2vec_matrix=None):
        """Build the feature matrix ``self.X``.

        Arguments:
            documents -- raw documents for TF-IDF vectorization; used when
                ``doc2vec_matrix`` is not a NumPy array
            n_components -- if not -1, reduce ``self.X`` with truncated SVD
                to this many components and re-normalize the rows
            doc2vec_matrix -- precomputed document vectors (``np.ndarray``)
        """
        if isinstance(doc2vec_matrix, np.ndarray):
            self.X = doc2vec_matrix
            self.dataset_size = len(doc2vec_matrix)
            self.doc2vec_matrix = True
        else:
            self.vectorizer = TfidfVectorizer()
            self.X = self.vectorizer.fit_transform(documents)
            self.dataset_size = len(documents)
            self.doc2vec_matrix = False

        if n_components != -1:
            # Clamp to the number of available features.  The matrix shape is
            # used so this also works when no vectorizer exists (doc2vec).
            n_features = self.X.shape[1]
            if n_components > n_features:
                n_components = n_features
            print('n_components ' + str(n_components))

            # Vectorizer results are normalized, which makes KMeans behave as
            # spherical k-means for better results. Since LSA/SVD results are
            # not normalized, we have to redo the normalization.
            print("Performing dimensionality reduction using LSA")
            t0 = time()
            svd = TruncatedSVD(n_components)
            normalizer = Normalizer(copy=False)
            lsa = make_pipeline(svd, normalizer)
            self.X = lsa.fit_transform(self.X)
            print("done in %fs" % (time() - t0))

            explained_variance = svd.explained_variance_ratio_.sum()
            print("Explained variance of the SVD step: {}%".format(
                int(explained_variance * 100)))
            print()

    def cluster(self, cluster_name):
        """Fit the algorithm named ``cluster_name`` on ``self.X``.

        The name is stripped of surrounding whitespace and the stripped form
        is used for every comparison.  Unknown names leave ``self.clustering``
        untouched.
        """
        self.name = cluster_name.strip()
        print('cluster_name ' + self.name)

        if self.name == 'k-means':
            print('cluster_name: ' + self.name)
            self.clustering = KMeans(n_clusters=self.k, init='k-means++',
                                     max_iter=500, n_init=1)
        elif self.name == 'agglo':
            # Only non-default options are passed: the remaining arguments of
            # the previous call were all estimator defaults, and the
            # `affinity` keyword is not accepted by newer scikit-learn.
            self.clustering = AgglomerativeClustering(n_clusters=self.k,
                                                      linkage='ward')
        elif self.name == 'spectral_cocluster':
            self.clustering = SpectralCoclustering(n_clusters=self.k,
                                                   svd_method='arpack',
                                                   random_state=0)
        else:
            return

        print("Clustering sparse data with %s" % self.clustering)
        t0 = time()
        if self.name == 'agglo':
            # Agglomerative clustering is fitted on a dense matrix; densify
            # only if the data is still sparse so repeated calls are safe.
            self.X = self._dense_X()
        self.clustering.fit(self.X)
        print("done in %0.3fs" % (time() - t0))
        print()

    def print_results(self):
        """Print a textual summary of the last clustering."""
        print(self.name)
        if self.name == 'k-means':
            for document_id, cluster_label in enumerate(
                    self.clustering.labels_):
                print(str(cluster_label) + " -- " + str(document_id))

            # NOTE(review): centroid columns are indexed into the vectorizer
            # terms, which presumably only lines up when no LSA reduction was
            # applied — confirm.
            order_centroids = self.clustering.cluster_centers_.argsort(
            )[:, ::-1]
            terms = self._feature_names()
            for i in range(self.k):
                print("Cluster %d:" % i, end='')
                for ind in order_centroids[i, :10]:
                    print(' %s' % terms[ind], end='')
                print()

        elif self.name == 'agglo':
            clusters = {}
            for document_id, cluster_label in enumerate(
                    self.clustering.labels_):
                clusters.setdefault(cluster_label, []).append(document_id)

            results = self.get_cluster_top_keywords(clusters)
            for _cluster in results:
                key_terms = results[_cluster]
                print("Cluster " + str(_cluster) + " : " +
                      str(len(clusters[_cluster])) + " documents")
                print(key_terms)
                print()

        elif self.name == 'spectral_cocluster':
            target_number = 10
            bicluster_ncuts = [self.bicluster_ncut(i) for i in range(self.k)]
            best_idx = np.argsort(bicluster_ncuts)[:target_number]
            feature_names = self._feature_names()

            print()
            print("Best biclusters:")
            print("----------------")
            for idx, cluster in enumerate(best_idx):
                n_rows, n_cols = self.clustering.get_shape(cluster)
                cluster_docs, cluster_words = self.clustering.get_indices(
                    cluster)
                if not len(cluster_docs) or not len(cluster_words):
                    continue

                # categories: the counter is keyed by document index, so each
                # entry counts a single document.
                counter = defaultdict(int)
                for i in cluster_docs:
                    counter[str(i)] += 1
                cat_string = ", ".join(
                    "{:.0f}% {}".format(float(c) / n_rows * 100, name)
                    for name, c in self.most_common(counter)[:3])

                # words: score = weight inside the bicluster's documents minus
                # weight in all other documents.
                out_of_cluster_docs = self.clustering.row_labels_ != cluster
                out_of_cluster_docs = np.where(out_of_cluster_docs)[0]
                word_col = self.X[:, cluster_words]
                word_scores = np.array(
                    word_col[cluster_docs, :].sum(axis=0) -
                    word_col[out_of_cluster_docs, :].sum(axis=0))
                word_scores = word_scores.ravel()
                important_words = [feature_names[cluster_words[i]]
                                   for i in word_scores.argsort()[:-11:-1]]

                print("bicluster {} : {} documents, {} words".format(
                    idx, n_rows, n_cols))
                print("categories   : {}".format(cat_string))
                print("words        : {}\n".format(
                    ', '.join(important_words)))

    def bicluster_ncut(self, i):
        """Return the normalized cut of bicluster ``i`` (lower is better).

        Empty or zero-weight biclusters get the largest finite float so they
        sort last.
        """
        import sys

        rows, cols = self.clustering.get_indices(i)
        # Test for emptiness by length: `np.any` on index arrays would wrongly
        # treat a bicluster that contains only index 0 as empty.
        if len(rows) == 0 or len(cols) == 0:
            return sys.float_info.max

        row_complement = np.nonzero(np.logical_not(
            self.clustering.rows_[i]))[0]
        col_complement = np.nonzero(np.logical_not(
            self.clustering.columns_[i]))[0]
        # Note: the following is identical to X[rows[:, np.newaxis],
        # cols].sum() but much faster in scipy <= 0.16
        weight = self.X[rows][:, cols].sum()
        if weight == 0:
            return sys.float_info.max
        cut = (self.X[row_complement][:, cols].sum() +
               self.X[rows][:, col_complement].sum())
        return cut / weight

    def most_common(self, d):
        """Items of a defaultdict(int) with the highest values.

        Returns ``(key, count)`` pairs sorted by descending count.
        """
        return sorted(d.items(), key=operator.itemgetter(1), reverse=True)

    def get_cluster_top_keywords(self, clusters, keywords_per_cluster=10):
        """Return the top k words for each cluster.

        The feature weights of all documents in a cluster are summed and the
        ``keywords_per_cluster`` terms with the largest totals are kept.

        Arguments:
            clusters {dict} -- maps cluster id to a list of document indices

        Keyword Arguments:
            keywords_per_cluster {int} -- The k words to show for each
                cluster (default: {10})

        Returns:
            dict of lists -- {cluster_id: ['top', 'k', 'words', 'for',
                'cluster']}
        """
        terms = np.array(self._feature_names())
        out = {}
        for cluster, doc_ids in clusters.items():
            # Combine all documents of the cluster into one weight vector;
            # asarray/ravel handles both dense arrays and sparse matrices.
            cluster_weights = np.asarray(
                self.X[list(doc_ids)].sum(axis=0)).ravel()
            top = np.argsort(cluster_weights)[::-1][:keywords_per_cluster]
            out[cluster] = terms[top].tolist()
        return out

    def visualize(self):
        """Scatter-plot the first two principal components, colored by cluster.

        Does nothing for an unknown algorithm name.
        """
        if self.name == 'spectral_cocluster':
            labels = self.clustering.row_labels_
        elif self.name in ('agglo', 'k-means'):
            labels = self.clustering.labels_
        else:
            return

        # PCA needs dense input; densify only when the matrix is sparse (it is
        # already dense after LSA, agglomerative clustering or doc2vec).
        pca_t = PCA().fit_transform(self._dense_X())
        plt.scatter(pca_t[:, 0], pca_t[:, 1], c=labels, cmap='rainbow')
        plt.show()
def type_consistent_cocluster(topic_word_dict0,
                              ename2embed_bert,
                              n_cluster_min,
                              print_cls=False,
                              save_file=None):
    """Keep the topics that fall into well-covered topic/word-cluster blocks.

    Steps:
      1. Drop every entity without an entry in ``ename2embed_bert``.
      2. Cluster the remaining entity embeddings with affinity propagation
         and build a (topic x embedding-cluster) indicator matrix.
      3. Co-cluster that matrix, trying ``n_cluster_min`` up to
         ``n_cluster_min + 9`` clusters and stopping early once some
         bicluster has a mean value (coverage) of at least 0.7.
      4. Return the topics whose bicluster coverage is at least
         ``min(best coverage, 0.4)``.

    Arguments:
        topic_word_dict0 -- maps topic to an iterable of entity names; topic
            keys must be convertible with ``int()`` to row indices
        ename2embed_bert -- maps entity name to a sequence whose first element
            is the embedding vector
        n_cluster_min -- smallest number of co-clusters to try
        print_cls, save_file -- accepted for backward compatibility; unused

    Returns:
        dict -- topic -> list of kept entity names, for the retained topics
    """
    # 1. Keep only entities that have an embedding (original order preserved).
    topic_word_dict = {
        topic: [ename for ename in topic_word_dict0[topic]
                if ename in ename2embed_bert]
        for topic in topic_word_dict0
    }
    all_words = [ename for enames in topic_word_dict.values()
                 for ename in enames]
    all_embed = [ename2embed_bert[ename][0] for ename in all_words]

    # Each word is attributed to the first topic that lists it.
    first_topic = {}
    for topic, enames in topic_word_dict.items():
        for ename in enames:
            first_topic.setdefault(ename, topic)

    # 2. Affinity propagation over the embeddings.
    clustering = AffinityPropagation().fit(all_embed)
    labels = clustering.labels_
    n_clusters = max(labels) + 1

    n_topics = len(topic_word_dict)
    col_vectors = np.zeros((n_topics, n_clusters), dtype=float)
    for ename, label in zip(all_words, labels):
        # Negative labels (no cluster assigned) do not mark any cell.
        if label >= 0:
            col_vectors[int(first_topic[ename]), label] = 1
    # Small constant offset on every cell — presumably to keep the matrix
    # strictly positive for the spectral normalization; confirm.
    col_vectors += 0.1

    # 3. Search for a co-cluster count with a sufficiently dense bicluster.
    for n_cluster in range(n_cluster_min, n_cluster_min + 10):
        model = SpectralCoclustering(n_clusters=n_cluster, random_state=0)
        model.fit(col_vectors)

        coverage_list = []
        for ind in range(n_cluster):
            block = col_vectors[model.row_labels_ == ind]
            block = block[:, model.column_labels_ == ind]
            # An empty bicluster covers nothing; score it 0.0 instead of the
            # NaN a 0/0 mean would give, which would poison max()/min() below.
            coverage_list.append(float(block.mean()) if block.size else 0.0)

        if max(coverage_list) >= 0.7:
            break

    # 4. Keep topics belonging to biclusters that reach the threshold.
    coverage_thre = min(max(coverage_list), 0.4)
    new_topic_word_dict = {}
    for ind in range(n_cluster):
        if coverage_list[ind] < coverage_thre:
            continue
        for topic in topic_word_dict:
            if model.row_labels_[int(topic)] == ind:
                new_topic_word_dict[topic] = list(topic_word_dict[topic])
    return new_topic_word_dict