def tsne_kmeans_clusters(tfidf, num_clusters=None):
    """Save a t-SNE projection of KMeans clusters for each candidate k.

    Vectorizer results are normalized, which makes KMeans behave as
    spherical k-means for better results. Since LSA/SVD results are not
    normalized, we have to redo the normalization.

    Args:
        tfidf: Document-term matrix; ``tfidf.shape[0]`` is used as the
            document count in the output file names.
        num_clusters: Iterable of cluster counts to try. Defaults to
            ``[3, 5, 7, 9, 11]``.

    Returns:
        None. One PNG per cluster count is written under
        ``images/tsne_projections`` and progress is printed to stdout.
    """
    # Local import keeps this block self-contained regardless of the
    # module-level import list.
    import os

    # A list literal in the signature would be shared across calls.
    if num_clusters is None:
        num_clusters = [3, 5, 7, 9, 11]

    print(
        '\nUse sklearn tSNE to visualize viability of cluster estimates to inform n topic choices: {}'
        .format(num_clusters))

    cluster_counts = list(num_clusters)
    if not cluster_counts:
        # Nothing to plot, so skip the (expensive) decomposition entirely.
        return

    output_dir = 'images/tsne_projections'
    # savefig does not create missing directories.
    os.makedirs(output_dir, exist_ok=True)

    # The LSA reduction does not depend on k and is deterministic
    # (random_state=0), so compute it once instead of once per iteration.
    svd = TruncatedSVD(n_components=50, n_iter=10, random_state=0)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    reduced = lsa.fit_transform(tfidf)
    n_docs = tfidf.shape[0]

    for k in cluster_counts:
        # Timing now covers clustering + projection only; the shared LSA
        # step above is excluded.
        start = datetime.now()

        # Apply kmeans to the reduced corpus to get labels.
        clusters = KMeans(n_clusters=k, init='k-means++')
        clusters.fit(reduced)

        tsne = TSNEVisualizer(decompose=None)
        tsne.fit(reduced, ["cluster {}".format(c) for c in clusters.labels_])
        tsne.finalize()

        filename = '{}/tSNE_wKMeans_SVD_{}_clusters_{}_docs.png'.format(
            output_dir, k, n_docs)
        plt.savefig(filename)
        # Close the figure so successive iterations do not draw on top of
        # each other or accumulate open figures.
        plt.close()

        end = datetime.now()
        print(' ' + filename)
        print(" Time taken: {}".format(end - start))
def generate_tsne(title, X, labels):
    """Draw a t-SNE projection of ``X`` and save it under ``OUTPUT``.

    Args:
        title: Used both as the axes title and as the output file name
            (joined onto the module-level ``OUTPUT`` directory).
        X: Feature matrix passed to ``TSNEVisualizer.fit``.
        labels: Per-sample labels passed to ``TSNEVisualizer.fit``.

    Returns:
        None. The figure is written to disk and then closed.
    """
    fig, ax1 = plt.subplots(1, 1, figsize=(4, 2))
    try:
        title_dic = {'fontsize': 7, 'fontweight': 'bold'}
        # Concatenate two colormaps to get 21 colours in total.
        palette = resolve_colors(11, 'Spectral_r') + resolve_colors(10, 'BrBG_r')

        tsne = TSNEVisualizer(ax1, colors=palette, decompose=None)
        tsne.fit(X, labels)
        tsne.finalize()
        # Set our own title after finalize() so it is the one that is saved.
        tsne.ax.set_title(title, title_dic)

        # Save through the figure handle rather than pyplot's "current
        # figure" so the right canvas is written even if that state changed.
        fig.savefig(os.path.join(OUTPUT, title))
    finally:
        # Always release the figure; otherwise repeated calls keep every
        # figure alive in pyplot's registry.
        plt.close(fig)
def visualise_with_yellowbrick(feature_matrix, labels_tfidf):
    """Display a Yellowbrick t-SNE projection of the clustered messages.

    Args:
        feature_matrix: Feature matrix passed to ``TSNEVisualizer.fit``.
        labels_tfidf: Per-sample cluster labels; converted to a NumPy
            array before fitting.

    Returns:
        None. The plot is rendered through the visualizer.
    """
    tsne = TSNEVisualizer(title="Chat Messages Clusters", alpha=0.7)
    tsne.fit(feature_matrix, np.array(labels_tfidf))

    # Yellowbrick 1.0 renamed poof() to show() and deprecated the old name;
    # prefer show() and fall back to poof() on older releases. Both call
    # finalize() themselves, so no explicit finalize() is needed here.
    render = getattr(tsne, "show", None) or tsne.poof
    render()