def cluster(y, X, n_clusters):
    """Aggregate the trailing axis of ``y`` and ``X`` into ``n_clusters`` groups.

    With ``n_clusters == 1`` everything is collapsed into a single group:
    ``y`` is summed and ``X`` is averaged (unweighted) over the last axis,
    and the third return value is the integer ``1``.

    Otherwise the columns of ``y`` are grouped by spectral co-clustering of
    their Kendall correlation matrix.  ``y`` is summed within each group and
    ``X`` is averaged within each group, weighted by the within-group market
    shares computed from ``y``.

    Returns:
        tuple: ``(y_, X_, clusters)`` where ``clusters`` is a list with one
        index array per group (or ``1`` in the single-group case).
    """
    # Guard clause: nothing to fit when everything belongs to one group.
    if n_clusters == 1:
        y_total = np.sum(y, axis=-1)[..., np.newaxis]
        X_mean = np.mean(X, axis=-1)[..., np.newaxis]
        return y_total, X_mean, 1

    # Group columns by co-clustering their pairwise Kendall correlations.
    y_float = onp.array(y, dtype='float64')
    kendall = pd.DataFrame(y_float).corr(method='kendall')
    model = SpectralCoclustering(n_clusters=n_clusters)
    model.fit(kendall)
    clusters = [model.get_indices(k)[0] for k in range(n_clusters)]

    def reduce_by_cluster(x, fn, weights=None):
        # Apply ``fn`` over the last axis of each group's columns and stack
        # the per-group results along a new trailing axis.
        pieces = []
        for k, members in enumerate(clusters):
            extra = {} if weights is None else {'weights': weights[k]}
            reduced = fn(x[..., members], axis=-1, **extra)
            pieces.append(reduced[..., np.newaxis])
        return np.concatenate(pieces, axis=-1)

    def market_shares(x):
        # Share of each member in its group's grand total.
        return [np.sum(x[..., members], axis=0) / np.sum(x[..., members])
                for members in clusters]

    y_ = reduce_by_cluster(y, np.sum)
    # Compute within-group market shares
    y_weights = market_shares(y)
    X_ = reduce_by_cluster(X, np.average, y_weights)
    return y_, X_, clusters
def rearrange_confusion_matrix(cm, n_clusters):
    """Reorder a confusion matrix so co-clustered classes sit next to each other.

    A ``SpectralCoclustering`` model with ``n_clusters`` clusters is fitted
    to ``cm``; the row indices of the clusters are concatenated in cluster
    order and used to permute both the rows and the columns of ``cm``.

    Returns:
        tuple: ``(cm_clustered, idx)`` -- the permuted matrix (integer dtype,
        same shape as ``cm``) and the index order that was applied.
    """
    from sklearn.cluster import SpectralCoclustering

    model = SpectralCoclustering(n_clusters=n_clusters).fit(cm)
    idx = np.concatenate(
        [model.get_indices(label)[0] for label in range(n_clusters)])

    # Fill one output row at a time: row ``new_row`` of the result is row
    # ``old_row`` of ``cm`` with its columns taken in the same order.
    cm_clustered = np.zeros(cm.shape, dtype=int)
    for new_row, old_row in enumerate(idx):
        cm_clustered[new_row, :idx.size] = cm[old_row, idx]
    return cm_clustered, idx
def cocluster(np_sums, matrix_diags, vectorizer, n_clusters=20):
    '''
    Perform the coclustering and print every bicluster.

    Args:
        np_sums: array-like matrix to co-cluster; rows are looked up in
            ``matrix_diags`` and columns in the vectorizer's feature names.
        matrix_diags: sequence mapping a row index to its diagnosis label.
        vectorizer: fitted vectorizer exposing ``get_feature_names_out()``
            (scikit-learn >= 1.0) or the legacy ``get_feature_names()``.
        n_clusters: number of biclusters to fit (default 20, the value that
            used to be hard-coded).

    Returns:
        None. The row labels and column n-grams of each bicluster are
        written to stdout.
    '''
    x = np.array(np_sums)
    clustering = SpectralCoclustering(n_clusters=n_clusters,
                                      random_state=0).fit(x)

    # Resolve the vocabulary once: it is loop-invariant, and rebuilding it
    # for every column of every cluster is needlessly quadratic.  Prefer the
    # current API; ``get_feature_names`` no longer exists in recent
    # scikit-learn releases.
    get_names = getattr(vectorizer, 'get_feature_names_out', None)
    if get_names is None:
        get_names = vectorizer.get_feature_names
    feature_names = list(get_names())

    for i in range(n_clusters):
        row_nums, col_nums = clustering.get_indices(i)
        row_words = [matrix_diags[num] for num in row_nums]
        col_words = [feature_names[num] for num in col_nums]
        print("Cluster: ", i)
        print("===========")
        print("Diagnoses: ", row_words)
        print()
        print("n-grams: ", col_words)
        print()
Like Counter.most_common in Python >=2.7. """ return sorted(d.items(), key=operator.itemgetter(1), reverse=True) bicluster_ncuts = list( bicluster_ncut(i) for i in range(len(newsgroups.target_names))) best_idx = np.argsort(bicluster_ncuts)[:5] print() print("Best biclusters:") print("----------------") for idx, cluster in enumerate(best_idx): n_rows, n_cols = cocluster.get_shape(cluster) cluster_docs, cluster_words = cocluster.get_indices(cluster) if not len(cluster_docs) or not len(cluster_words): continue # categories counter = defaultdict(int) for i in cluster_docs: counter[document_names[i]] += 1 cat_string = ", ".join("{:.0f}% {}".format(float(c) / n_rows * 100, name) for name, c in most_common(counter)[:3]) # words out_of_cluster_docs = cocluster.row_labels_ != cluster out_of_cluster_docs = np.where(out_of_cluster_docs)[0] word_col = X[:, cluster_words] word_scores = np.array(word_col[cluster_docs, :].sum(axis=0) -
class DocumentClustering:
    """Cluster a document collection and inspect the result.

    Documents are turned into a TF-IDF matrix (or a precomputed dense matrix
    is supplied), optionally reduced with LSA, and clustered with one of
    ``'k-means'``, ``'agglo'`` or ``'spectral_cocluster'``.
    """

    def __init__(self, k=5):
        self.name = 'k-means'        # name of the active clustering method
        self.k = k                   # number of clusters
        self.X = None                # document matrix (sparse or dense)
        self.clustering = None       # fitted estimator
        self.vectorizer = None       # TF-IDF vectorizer, if one was used
        self.dataset_size = 0        # number of documents
        self.doc2vec_matrix = False  # True when X was supplied precomputed

    # ------------------------------------------------------------------
    # internal helpers
    # ------------------------------------------------------------------
    def _feature_names(self):
        """Return the vectorizer vocabulary as a list.

        Prefers ``get_feature_names_out`` and falls back to the legacy
        ``get_feature_names`` so both old and new scikit-learn work.
        """
        getter = getattr(self.vectorizer, 'get_feature_names_out', None)
        if getter is None:
            getter = self.vectorizer.get_feature_names
        return list(getter())

    def _dense_X(self):
        """Return ``self.X`` as a dense array without modifying it.

        ``self.X`` is sparse straight out of the vectorizer but already
        dense after LSA or when a precomputed matrix was supplied, so
        ``toarray`` is only called when it exists.
        """
        if hasattr(self.X, 'toarray'):
            return self.X.toarray()
        return self.X

    # ------------------------------------------------------------------
    # building the matrix
    # ------------------------------------------------------------------
    def make_matrix(self, documents=None, n_components=-1, doc2vec_matrix=None):
        """Build ``self.X`` from raw documents or from a precomputed matrix.

        Args:
            documents: iterable of raw text documents; used when
                ``doc2vec_matrix`` is not a NumPy array.
            n_components: number of LSA components; ``-1`` disables the
                dimensionality reduction. Values above the number of
                columns of the matrix are clamped to it.
            doc2vec_matrix: optional ``np.ndarray`` of precomputed document
                vectors that replaces the TF-IDF step.
        """
        if isinstance(doc2vec_matrix, np.ndarray):
            self.X = doc2vec_matrix
            self.dataset_size = len(doc2vec_matrix)
            self.doc2vec_matrix = True
        else:
            self.vectorizer = TfidfVectorizer()
            self.X = self.vectorizer.fit_transform(documents)
            self.dataset_size = len(documents)
            # Reset the flag so a previous precomputed matrix does not make
            # later steps skip the sparse-to-dense conversion.
            self.doc2vec_matrix = False

        if n_components == -1:
            return

        # Clamp against the matrix width instead of the vectorizer
        # vocabulary: the vectorizer does not exist for precomputed input.
        n_components = min(n_components, self.X.shape[1])
        print('n_components ' + str(n_components))

        # Vectorizer results are normalized, which makes KMeans behave as
        # spherical k-means for better results. Since LSA/SVD results are
        # not normalized, we have to redo the normalization.
        print("Performing dimensionality reduction using LSA")
        t0 = time()
        svd = TruncatedSVD(n_components)
        normalizer = Normalizer(copy=False)
        lsa = make_pipeline(svd, normalizer)
        self.X = lsa.fit_transform(self.X)
        print("done in %fs" % (time() - t0))

        explained_variance = svd.explained_variance_ratio_.sum()
        print("Explained variance of the SVD step: {}%".format(
            int(explained_variance * 100)))
        print()

    # ------------------------------------------------------------------
    # clustering
    # ------------------------------------------------------------------
    def cluster(self, cluster_name):
        """Fit the clustering method called ``cluster_name`` on ``self.X``.

        Recognised names (surrounding whitespace is ignored) are
        ``'k-means'``, ``'agglo'`` and ``'spectral_cocluster'``. Any other
        name only updates ``self.name`` and fits nothing.
        """
        self.name = cluster_name.strip()
        print('cluster_name ' + self.name)

        if self.name == 'k-means':
            print('cluster_name: ' + self.name)
            self.clustering = KMeans(n_clusters=self.k, init='k-means++',
                                     max_iter=500, n_init=1)
        elif self.name == 'agglo':
            # Only non-default arguments are passed: the previously spelled
            # out defaults included ``affinity``, which recent scikit-learn
            # releases no longer accept.
            self.clustering = AgglomerativeClustering(n_clusters=self.k,
                                                      linkage='ward')
        elif self.name == 'spectral_cocluster':
            self.clustering = SpectralCoclustering(n_clusters=self.k,
                                                   svd_method='arpack',
                                                   random_state=0)
        else:
            return

        print("Clustering sparse data with %s" % self.clustering)
        t0 = time()
        if self.name == 'agglo':
            # Agglomerative clustering is fitted on a dense matrix; keep
            # the dense form because the keyword report indexes it by row.
            self.X = self._dense_X()
        self.clustering.fit(self.X)
        print("done in %0.3fs" % (time() - t0))
        print()

    # ------------------------------------------------------------------
    # reporting
    # ------------------------------------------------------------------
    def print_results(self):
        """Print a textual summary of the fitted clustering."""
        print(self.name)
        if self.name == 'k-means':
            self._print_kmeans()
        elif self.name == 'agglo':
            self._print_agglo()
        elif self.name == 'spectral_cocluster':
            self._print_biclusters()

    def _print_kmeans(self):
        """Print each document's label and the top terms of every centroid."""
        for document_id, cluster_label in enumerate(self.clustering.labels_):
            print(str(cluster_label) + " -- " + str(document_id))

        # NOTE(review): when LSA was applied in make_matrix the centroid
        # columns are SVD components rather than vocabulary terms -- confirm
        # whether the terms printed here are meaningful in that case.
        order_centroids = self.clustering.cluster_centers_.argsort()[:, ::-1]
        terms = self._feature_names()
        for i in range(self.k):
            print("Cluster %d:" % i, end='')
            for ind in order_centroids[i, :10]:
                print(' %s' % terms[ind], end='')
            print()

    def _print_agglo(self):
        """Print the size and the top keywords of every agglomerative cluster."""
        clusters = {}
        for document_id, cluster_label in enumerate(self.clustering.labels_):
            clusters.setdefault(cluster_label, []).append(document_id)

        results = self.get_cluster_top_keywords(clusters)
        for _cluster, key_terms in results.items():
            print("Cluster " + str(_cluster) + " : " +
                  str(len(clusters[_cluster])) + " documents")
            print(key_terms)
            print()

    def _print_biclusters(self):
        """Print the biclusters with the lowest normalized cut."""
        target_number = 10
        bicluster_ncuts = [self.bicluster_ncut(i) for i in range(self.k)]
        best_idx = np.argsort(bicluster_ncuts)[:target_number]
        feature_names = self._feature_names()

        print()
        print("Best biclusters:")
        print("----------------")
        for idx, cluster in enumerate(best_idx):
            n_rows, n_cols = self.clustering.get_shape(cluster)
            cluster_docs, cluster_words = self.clustering.get_indices(cluster)
            if not len(cluster_docs) or not len(cluster_words):
                continue

            # categories
            counter = defaultdict(int)
            for i in cluster_docs:
                counter[str(i)] += 1
            cat_string = ", ".join(
                "{:.0f}% {}".format(float(c) / n_rows * 100, name)
                for name, c in self.most_common(counter)[:3])

            # words: score = weight inside the bicluster minus weight of
            # the same words in documents outside of it
            out_of_cluster_docs = self.clustering.row_labels_ != cluster
            out_of_cluster_docs = np.where(out_of_cluster_docs)[0]
            word_col = self.X[:, cluster_words]
            word_scores = np.array(
                word_col[cluster_docs, :].sum(axis=0) -
                word_col[out_of_cluster_docs, :].sum(axis=0))
            word_scores = word_scores.ravel()
            important_words = [feature_names[cluster_words[i]]
                               for i in word_scores.argsort()[:-11:-1]]

            print("bicluster {} : {} documents, {} words".format(
                idx, n_rows, n_cols))
            print("categories : {}".format(cat_string))
            print("words : {}\n".format(', '.join(important_words)))

    def bicluster_ncut(self, i):
        """Return the normalized cut of bicluster ``i`` (lower is better).

        Empty biclusters, and biclusters whose internal weight is zero,
        get ``sys.float_info.max`` so they sort last.
        """
        import sys

        rows, cols = self.clustering.get_indices(i)
        # Test the number of indices, not their truthiness: ``np.any`` is
        # False for a bicluster that only contains index 0.
        if len(rows) == 0 or len(cols) == 0:
            return sys.float_info.max

        row_complement = np.nonzero(
            np.logical_not(self.clustering.rows_[i]))[0]
        col_complement = np.nonzero(
            np.logical_not(self.clustering.columns_[i]))[0]
        # Note: the following is identical to X[rows[:, np.newaxis],
        # cols].sum() but much faster in scipy <= 0.16
        weight = self.X[rows][:, cols].sum()
        if weight == 0:
            # Avoid dividing by zero for a bicluster without any weight.
            return sys.float_info.max
        cut = (self.X[row_complement][:, cols].sum() +
               self.X[rows][:, col_complement].sum())
        return cut / weight

    def most_common(self, d):
        """Return the ``(key, value)`` items of ``d`` sorted by descending value."""
        return sorted(d.items(), key=operator.itemgetter(1), reverse=True)

    def get_cluster_top_keywords(self, clusters, keywords_per_cluster=10):
        """Return the top k words for each cluster.

        Arguments:
            clusters {dict} -- maps a cluster id to the row indices in
                ``self.X`` of the documents in that cluster

        Keyword Arguments:
            keywords_per_cluster {int} -- The k words to show for each
                cluster (default: {10})

        Returns:
            dict of lists -- Returns a dict of
                {cluster_id: ['top', 'k', 'words', 'for', 'cluster']}
        """
        terms = np.array(self._feature_names())
        out = {}
        for cluster, doc_ids in clusters.items():
            # Combine all documents of the cluster into a single weight
            # vector, so the ranking reflects the whole cluster rather than
            # only its first document.
            weights = np.asarray(
                self.X[list(doc_ids)].sum(axis=0)).ravel()
            top = np.argsort(weights)[::-1][:keywords_per_cluster]
            out[cluster] = terms[top].tolist()
        return out

    def visualize(self):
        """Scatter-plot the first two principal components, coloured by cluster.

        ``self.X`` is densified on the fly when needed and is left
        unchanged, so the method can be called repeatedly.
        """
        if self.name == 'spectral_cocluster':
            labels = self.clustering.row_labels_
        elif self.name in ('agglo', 'k-means'):
            labels = self.clustering.labels_
        else:
            return

        pca_t = PCA().fit_transform(self._dense_X())
        plt.scatter(pca_t[:, 0], pca_t[:, 1], c=labels, cmap='rainbow')
        plt.show()