def tsne_kmeans_clusters(tfidf, num_clusters=None):
    '''
    Save one t-SNE projection per candidate KMeans cluster count.

    The input matrix is reduced once with an LSA pipeline (TruncatedSVD to
    50 components followed by a Normalizer). For every k in `num_clusters`
    a KMeans model is fitted on the reduced matrix, its labels are used to
    colour a yellowbrick TSNEVisualizer plot, and the plot is saved as a
    PNG under images/tsne_projections/.

    Vectorizer results are normalized, which makes KMeans behave as
    spherical k-means for better results. Since LSA/SVD results are
    not normalized, we have to redo the normalization.

    Args:
        tfidf: document-term matrix passed to the LSA pipeline; its
            `shape[0]` is used as the document count in the file name.
            NOTE(review): presumably a TF-IDF matrix with at least 50
            features, since n_components is fixed at 50 -- confirm.
        num_clusters: iterable of cluster counts to try. Defaults to
            [3, 5, 7, 9, 11] when omitted or None.

    Returns:
        None. The plots are written to disk and progress (file name and
        elapsed time per k) is printed.
    '''
    # Local import so this function does not depend on a file-level `os`
    # import being present.
    import os

    # Resolve the default here rather than in the signature to avoid a
    # shared mutable default argument.
    if num_clusters is None:
        num_clusters = [3, 5, 7, 9, 11]

    print(
        '\nUse sklearn tSNE to visualize viability of cluster estimates to inform n topic choices: {}'
        .format(num_clusters))

    # The LSA reduction does not depend on k and is seeded
    # (random_state=0), so it is computed once and reused for every
    # cluster count instead of being refit inside the loop.
    svd = TruncatedSVD(n_components=50, n_iter=10, random_state=0)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    reduced = lsa.fit_transform(tfidf)

    # savefig does not create missing directories; make sure the target
    # exists so the first save cannot fail on a fresh checkout.
    output_dir = 'images/tsne_projections'
    os.makedirs(output_dir, exist_ok=True)
    n_docs = tfidf.shape[0]

    for k in num_clusters:
        start = datetime.now()

        # apply kmeans to the reduced corpus to get labels
        clusters = KMeans(n_clusters=k, init='k-means++')
        clusters.fit(reduced)

        # decompose=None: the data has already been reduced above, so
        # (presumably) the visualizer's own decomposition step is skipped.
        tsne = TSNEVisualizer(decompose=None)
        tsne.fit(reduced, ["cluster {}".format(c) for c in clusters.labels_])
        tsne.finalize()

        filename = '{}/tSNE_wKMeans_SVD_{}_clusters_{}_docs.png'.format(
            output_dir, k, n_docs)
        plt.savefig(filename)
        # Close the current figure so the next k starts from a clean one.
        plt.close()

        end = datetime.now()
        print('            ' + filename)
        print("            Time taken: {}".format(end - start))
Example #2
0
def generate_tsne(title, X, labels):
    '''
    Plot a t-SNE projection of X coloured by labels and save it under OUTPUT.

    Args:
        title: text shown as the axes title; it is also used verbatim as
            the file name inside the OUTPUT directory.
        X: feature matrix handed to TSNEVisualizer.fit.
        labels: per-sample labels handed to TSNEVisualizer.fit.

    Returns:
        None. The figure is saved with plt.savefig and left open.
    '''
    _, axis = plt.subplots(1, 1, figsize=(4, 2))

    # Two colour maps are concatenated into one palette (11 + 10 entries
    # requested from resolve_colors).
    palette = resolve_colors(11, 'Spectral_r') + resolve_colors(10, 'BrBG_r')

    visualizer = TSNEVisualizer(axis, colors=palette, decompose=None)
    visualizer.fit(X, labels)
    visualizer.finalize()

    # Set the title after finalize() so it is the one that ends up on
    # the axes.
    visualizer.ax.set_title(title, {'fontsize': 7, 'fontweight': 'bold'})

    plt.savefig(os.path.join(OUTPUT, title))
def visualise_with_yellowbrick(feature_matrix, labels_tfidf):
    '''
    Show a yellowbrick t-SNE plot titled "Chat Messages Clusters".

    Args:
        feature_matrix: feature matrix handed to TSNEVisualizer.fit.
        labels_tfidf: per-sample labels; converted with np.array before
            being passed to the visualizer.

    Returns:
        None. The plot is rendered through the visualizer's poof() call.
    '''
    visualizer = TSNEVisualizer(title="Chat Messages Clusters", alpha=0.7)
    label_array = np.array(labels_tfidf)
    visualizer.fit(feature_matrix, label_array)
    visualizer.finalize()
    visualizer.poof()