"""Items of a defaultdict(int) with the highest values. Like Counter.most_common in Python >=2.7. """ return sorted(d.items(), key=operator.itemgetter(1), reverse=True) bicluster_ncuts = list( bicluster_ncut(i) for i in range(len(newsgroups.target_names))) best_idx = np.argsort(bicluster_ncuts)[:5] print() print("Best biclusters:") print("----------------") for idx, cluster in enumerate(best_idx): n_rows, n_cols = cocluster.get_shape(cluster) cluster_docs, cluster_words = cocluster.get_indices(cluster) if not len(cluster_docs) or not len(cluster_words): continue # categories counter = defaultdict(int) for i in cluster_docs: counter[document_names[i]] += 1 cat_string = ", ".join("{:.0f}% {}".format(float(c) / n_rows * 100, name) for name, c in most_common(counter)[:3]) # words out_of_cluster_docs = cocluster.row_labels_ != cluster out_of_cluster_docs = np.where(out_of_cluster_docs)[0] word_col = X[:, cluster_words]
class DocumentClustering:
    def __init__(self, k=5):
        self.name = 'k-means'
        self.k = k
        self.X = None
        self.clustering = None
        self.vectorizer = None
        self.dataset_size = 0
        self.doc2vec_matrix = False

    def make_matrix(self, documents=None, n_components=-1, doc2vec_matrix=None):
        if not isinstance(doc2vec_matrix, np.ndarray):
            self.vectorizer = TfidfVectorizer()
            # self.vectorizer = CountVectorizer()
            self.X = self.vectorizer.fit_transform(documents)
            self.dataset_size = len(documents)
        else:
            self.X = doc2vec_matrix
            self.dataset_size = len(doc2vec_matrix)
            self.doc2vec_matrix = True
        # LSA only applies to the tf-idf matrix; a doc2vec matrix is
        # already dense and low-dimensional (and has no vectorizer).
        if n_components != -1 and not self.doc2vec_matrix:
            # get_feature_names() on scikit-learn < 1.0
            n_features = len(self.vectorizer.get_feature_names_out())
            if n_components > n_features:
                n_components = n_features
            print('n_components ' + str(n_components))
            # Vectorizer results are normalized, which makes KMeans behave as
            # spherical k-means for better results. Since LSA/SVD results are
            # not normalized, we have to redo the normalization.
            print("Performing dimensionality reduction using LSA")
            t0 = time()
            svd = TruncatedSVD(n_components)
            normalizer = Normalizer(copy=False)
            lsa = make_pipeline(svd, normalizer)
            self.X = lsa.fit_transform(self.X)
            print("done in %fs" % (time() - t0))
            explained_variance = svd.explained_variance_ratio_.sum()
            print("Explained variance of the SVD step: {}%".format(
                int(explained_variance * 100)))
            print()

    def cluster(self, cluster_name):
        self.name = cluster_name.strip()
        print('cluster_name ' + self.name)
        if self.name == 'k-means':
            self.clustering = KMeans(n_clusters=self.k, init='k-means++',
                                     max_iter=500, n_init=1)
            print("Clustering sparse data with %s" % self.clustering)
            t0 = time()
            self.clustering.fit(self.X)
            print("done in %0.3fs" % (time() - t0))
            print()
        elif self.name == 'agglo':
            # 'affinity' was renamed to 'metric' in newer scikit-learn releases.
            self.clustering = AgglomerativeClustering(
                n_clusters=self.k, affinity='euclidean', memory=None,
                connectivity=None, compute_full_tree='auto', linkage='ward',
                distance_threshold=None)
            print("Clustering dense data with %s" % self.clustering)
            t0 = time()
            # AgglomerativeClustering needs a dense matrix; tf-idf output is
            # sparse, while a doc2vec or LSA matrix is already dense.
            if hasattr(self.X, 'toarray'):
                self.X = self.X.toarray()
            self.clustering.fit(self.X)
            print("done in %0.3fs" % (time() - t0))
            print()
        elif self.name == 'spectral_cocluster':
            self.clustering = SpectralCoclustering(n_clusters=self.k,
                                                   svd_method='arpack',
                                                   random_state=0)
            print("Clustering sparse data with %s" % self.clustering)
            t0 = time()
            self.clustering.fit(self.X)
            print("done in %0.3fs" % (time() - t0))
            print()

    def print_results(self):
        # Print the clustering result for the fitted model.
        print(self.name)
        if self.name == 'k-means':
            cluster_labels = self.clustering.labels_
            clusters = {}
            for document_id, cluster_label in enumerate(cluster_labels):
                if cluster_label not in clusters:
                    clusters[cluster_label] = []
                clusters[cluster_label].append(document_id)
                print(str(cluster_label) + " -- " + str(document_id))
            # Top terms per centroid are only available when the documents
            # went through the tf-idf pipeline.
            if self.vectorizer is not None:
                order_centroids = self.clustering.cluster_centers_.argsort()[:, ::-1]
                terms = self.vectorizer.get_feature_names_out()
                for i in range(self.k):
                    print("Cluster %d:" % i, end='')
                    for ind in order_centroids[i, :10]:
                        print(' %s' % terms[ind], end='')
                    print()
        elif self.name == 'agglo':
            cluster_labels = self.clustering.labels_
            clusters = {}
            for document_id, cluster_label in enumerate(cluster_labels):
                if cluster_label not in clusters:
                    clusters[cluster_label] = []
                clusters[cluster_label].append(document_id)
                # print(str(cluster_label) + " -- " + str(document_id))
            results = self.get_cluster_top_keywords(clusters)
            for _cluster in results:
                key_terms = results[_cluster]
                print("Cluster " + str(_cluster) + " : " +
                      str(len(clusters[_cluster])) + " documents")
                print(key_terms)
                print()
        elif self.name == 'spectral_cocluster':
            target_number = 10
            bicluster_ncuts = list(self.bicluster_ncut(i)
                                   for i in range(self.k))
            best_idx = np.argsort(bicluster_ncuts)[:target_number]
            feature_names = self.vectorizer.get_feature_names_out()
            print()
            print("Best biclusters:")
            print("----------------")
            for idx, cluster in enumerate(best_idx):
                n_rows, n_cols = self.clustering.get_shape(cluster)
                cluster_docs, cluster_words = self.clustering.get_indices(cluster)
                if not len(cluster_docs) or not len(cluster_words):
                    continue

                # categories: without document category names, fall back to
                # the document index as the label
                counter = defaultdict(int)
                for i in cluster_docs:
                    counter[str(i)] += 1
                cat_string = ", ".join(
                    "{:.0f}% {}".format(float(c) / n_rows * 100, name)
                    for name, c in self.most_common(counter)[:3])

                # words
                out_of_cluster_docs = self.clustering.row_labels_ != cluster
                out_of_cluster_docs = np.where(out_of_cluster_docs)[0]
                word_col = self.X[:, cluster_words]
                word_scores = np.array(
                    word_col[cluster_docs, :].sum(axis=0) -
                    word_col[out_of_cluster_docs, :].sum(axis=0))
                word_scores = word_scores.ravel()
                important_words = list(feature_names[cluster_words[i]]
                                       for i in word_scores.argsort()[:-11:-1])

                print("bicluster {} : {} documents, {} words".format(
                    idx, n_rows, n_cols))
                print("categories : {}".format(cat_string))
                print("words : {}\n".format(', '.join(important_words)))

    def bicluster_ncut(self, i):
        """Normalized-cut score of bicluster i; lower is better."""
        rows, cols = self.clustering.get_indices(i)
        if not (np.any(rows) and np.any(cols)):
            return sys.float_info.max
        row_complement = np.nonzero(np.logical_not(self.clustering.rows_[i]))[0]
        col_complement = np.nonzero(np.logical_not(self.clustering.columns_[i]))[0]
        # Note: the following is identical to X[rows[:, np.newaxis],
        # cols].sum() but much faster in scipy <= 0.16.
        weight = self.X[rows][:, cols].sum()
        cut = (self.X[row_complement][:, cols].sum() +
               self.X[rows][:, col_complement].sum())
        return cut / weight

    def most_common(self, d):
        """Items of a defaultdict(int) with the highest values.
        Like Counter.most_common in Python >= 2.7."""
        return sorted(d.items(), key=operator.itemgetter(1), reverse=True)

    def get_cluster_top_keywords(self, clusters, keywords_per_cluster=10):
        """Shows the top k words for each cluster.

        Keyword Arguments:
            keywords_per_cluster {int} -- The k words to show for each
                cluster (default: {10})

        Returns:
            dict of lists -- {cluster_id: ['top', 'k', 'words', 'for', 'cluster']}
        """
        terms = np.array(self.vectorizer.get_feature_names_out())
        out = {}
        docs_for_cluster = {}
        # clusters maps a cluster id to the indices of the documents it
        # contains, e.g. len(clusters[6]) == 508.
        for cluster in clusters:
            # Stack the vectors of all documents in the cluster, then combine
            # them into a single mean vector so the keywords reflect the whole
            # cluster rather than just its first document.
            docs_for_cluster[cluster] = np.array(
                [self.X[i] for i in clusters[cluster]])
            centroid = docs_for_cluster[cluster].mean(axis=0).ravel()
            # Highest-weighted feature words of the cluster centroid.
            out[cluster] = terms[np.argsort(centroid)[::-1]
                                 [:keywords_per_cluster]].tolist()
        return out

    def visualize(self):
        # Project the documents to 2-D with PCA and colour each point by the
        # cluster label assigned to it.
        if self.name == 'spectral_cocluster':
            # Densify only if X is still a sparse tf-idf matrix.
            X_dense = self.X.toarray() if hasattr(self.X, 'toarray') else self.X
            pca_t = PCA(n_components=2).fit_transform(X_dense)
            plt.scatter(pca_t[:, 0], pca_t[:, 1],
                        c=self.clustering.row_labels_, cmap='rainbow')
            plt.show()
        elif self.name == 'agglo':
            # X was densified before fitting AgglomerativeClustering.
            pca_t = PCA(n_components=2).fit_transform(self.X)
            plt.scatter(pca_t[:, 0], pca_t[:, 1],
                        c=self.clustering.labels_, cmap='rainbow')
            plt.show()
        elif self.name == 'k-means':
            if hasattr(self.X, 'toarray'):
                self.X = self.X.toarray()
            pca_t = PCA(n_components=2).fit_transform(self.X)
            plt.scatter(pca_t[:, 0], pca_t[:, 1],
                        c=self.clustering.labels_, cmap='rainbow')
            plt.show()
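
# A minimal usage sketch. The sample documents and parameter choices below
# are illustrative assumptions, not part of the class; any list of strings
# works. With a doc2vec matrix you would instead call
# make_matrix(doc2vec_matrix=your_ndarray).
if __name__ == '__main__':
    sample_docs = [
        "the cat sat on the mat",
        "dogs and cats make good pets",
        "stock markets fell sharply on Monday",
        "investors worry about rising interest rates",
    ]
    dc = DocumentClustering(k=2)
    dc.make_matrix(documents=sample_docs)  # tf-idf, no LSA reduction
    dc.cluster('k-means')                  # or 'agglo' / 'spectral_cocluster'
    dc.print_results()
    dc.visualize()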