def tsne_kmeans_clusters(tfidf, num_clusters=None):
    '''
    Save one t-SNE projection of K-Means clusters per candidate cluster count.

    The TF-IDF matrix is reduced once with LSA (TruncatedSVD to 50
    components followed by L2 normalization); K-Means is then fitted on the
    reduced matrix for every requested ``k`` and the labelled points are
    drawn with yellowbrick's TSNEVisualizer and written to
    ``images/tsne_projections/``.

    Vectorizer results are normalized, which makes KMeans behave as
    spherical k-means for better results. Since LSA/SVD results are
    not normalized, we have to redo the normalization.

    Parameters
    ----------
    tfidf : matrix with a ``shape`` attribute
        Document-term matrix; ``tfidf.shape[0]`` is used as the document
        count in the output filename.
    num_clusters : iterable of int, optional
        Cluster counts to try. Defaults to ``[3, 5, 7, 9, 11]``.
    '''
    # Avoid a mutable default argument shared between calls.
    if num_clusters is None:
        num_clusters = [3, 5, 7, 9, 11]

    print(
        '\nUse sklearn tSNE to visualize viability of cluster estimates to inform n topic choices: {}'
        .format(num_clusters))

    # The reduction does not depend on k and is seeded, so it is computed
    # once instead of being refitted identically on every iteration.
    svd = TruncatedSVD(n_components=50, n_iter=10, random_state=0)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    reduced = lsa.fit_transform(tfidf)

    for k in num_clusters:
        start = datetime.now()

        # apply kmeans to the reduced corpus to get labels
        clusters = KMeans(n_clusters=k, init='k-means++')
        clusters.fit(reduced)

        # decompose=None: the input is already reduced by the LSA step above
        tsne = TSNEVisualizer(decompose=None)
        tsne.fit(reduced, ["cluster {}".format(c) for c in clusters.labels_])

        tsne.finalize()
        filename = r'images/tsne_projections/tSNE_wKMeans_SVD_' + str(
            k) + '_clusters_' + str(tfidf.shape[0]) + '_docs.png'
        plt.savefig(filename)
        plt.close()

        end = datetime.now()
        print('            ' + filename)
        print("            Time taken: {}".format(end - start))
Example #2
0
def tsne_visualization(dataset_object, num_examples):
    """
    Draw a T-SNE projection of a random sample of feature vectors and save it.

    Parameters
    ----------
    dataset_object: tv_net.dataset.Dataset
        dataset object containing feature vectors and class names
    num_examples: int
        number of examples to plot
    """
    # Shuffle first so the leading slice is not dominated by a single class.
    dataset_object.shuffle_examples()

    sample = dataset_object.items[:num_examples]
    feature_vectors = np.array([entry.feature_vector for entry in sample])
    label_list = [entry.class_name for entry in sample]

    title = 'T-SNE of feature vectors extracted from baseline classifier - using random sample of {} images'.format(
        num_examples)
    output_path = os.path.join(dataset_object.config.OUTPUT_DIR,
                               'visualizations', 'feature_vector_tsne.png')

    visualizer = TSNEVisualizer(colormap='rainbow', title=title)
    visualizer.fit(feature_vectors, label_list)

    # The first call writes the image to disk; the second displays it.
    visualizer.show(outpath=output_path)
    visualizer.show()
Example #3
0
    def result(self):
        """Plot a 2-D t-SNE scatter of the full data and a text t-SNE view.

        The scatter plot is saved to ``self.file_png``; afterwards the
        ``data()`` frame is TF-IDF vectorized and shown with TSNEVisualizer.
        """
        data_df = self.clean_data.data()
        all_data_df = self.clean_data.getSpambase_data()
        target_df = self.clean_data.target()

        # Project the complete data set to two dimensions.
        transformed = TSNE(learning_rate=100).fit_transform(all_data_df)

        # Dump both coordinate columns, then scatter them coloured by target.
        x_axis = transformed[:, 0]
        pprint.pprint(x_axis)
        y_axis = transformed[:, 1]
        pprint.pprint(y_axis)

        plt.scatter(x_axis, y_axis, c=target_df)
        plt.savefig(self.file_png)

        # TF-IDF vectors of the text data, drawn with yellowbrick.
        docs = TfidfVectorizer().fit_transform(data_df)

        visualizer = TSNEVisualizer()
        visualizer.fit(docs, target_df)
        visualizer.poof()
def tsne():
    """Render the t-SNE projection of the hobbies corpus as "corpus_tsne"."""
    corpus = load_hobbies()
    vectors = TfidfVectorizer().fit_transform(corpus.data)

    visualizer = TSNEVisualizer(ax=newfig())
    visualizer.fit(vectors, corpus.target)
    savefig(visualizer, "corpus_tsne")
Example #5
0
def text_cluster_tsne(text_vector,
                      TextVectorizer=TfidfVectorizer,
                      text_kwargs=text_kwargs,
                      n_clusters=10,
                      labels=None):
    '''Vectorize text units and show them as a TSNE visualization.

    Points are labelled with the supplied labels, or with KMeans cluster
    names ("cluster_<id>") when no labels are passed.

    ARGS:
        text_vector <np.array>: Vector of text units.  Must be type str.
    KWARGS:
        TextVectorizer <sklearn.feature_extraction.text>: Transformer.
        text_kwargs <dict>: kwargs to pass to TextVectorizer
        n_clusters <int>: If not using labels, number of clusters in KMeans
        labels <np.array>: True categorical labels.  Discrete.
    RETURNS:
        None, prints visualizations to the console.
    '''
    docs = TextVectorizer(**text_kwargs).fit_transform(text_vector)
    visualizer = TSNEVisualizer()

    if labels is not None:
        # caller supplied the labels
        point_labels = labels
    else:
        # no labels given: fall back to KMeans cluster assignments
        kmeans = KMeans(n_clusters=n_clusters)
        kmeans.fit(docs)
        point_labels = ["cluster_{}".format(c) for c in kmeans.labels_]

    visualizer.fit(docs, point_labels)
    sns.despine()
    visualizer.poof()
Example #6
0
def perform_tsne(X,
                 Y,
                 vec=None,
                 outpath="",
                 clusterLabels=False,
                 savePlot=False):
    """Vectorize ``X`` and show (or save) its t-SNE projection labelled by ``Y``.

    Parameters
    ----------
    X : iterable
        Documents handed to ``vec.fit_transform``.
    Y : iterable
        One label per document. With ``clusterLabels=True`` each value is
        rendered as ``"c<value>"`` (e.g. K-Means ``labels_``).
    vec : vectorizer, optional
        Object exposing ``fit_transform``. Defaults to a TfidfVectorizer that
        uses ``identity`` as preprocessor and tokenizer.
    outpath : str
        Destination passed to ``poof`` when ``savePlot`` is true.
    clusterLabels : bool
        Prefix every label with ``"c"``.
    savePlot : bool
        Pass ``outpath`` to ``poof`` instead of calling it without arguments.
    """
    # Identity test: `== None` would dispatch to a custom __eq__ on `vec`.
    if vec is None:
        vec = TfidfVectorizer(preprocessor=identity, tokenizer=identity)

    docs = vec.fit_transform(X)
    labels = ["c{}".format(c) for c in Y] if clusterLabels else Y

    tsne = TSNEVisualizer()
    tsne.fit(docs, labels)

    if savePlot:
        tsne.poof(outpath=outpath)
    else:
        tsne.poof()
Example #7
0
def tsne_pack(c, l):
    """Show a t-SNE plot of the columns of the global ``df`` whose name contains ``c``.

    Parameters
    ----------
    c : str
        Substring used to select columns via ``df.filter(like=c)``; it is
        also inserted into the plot title.
    l : iterable
        Labels passed to ``TSNEVisualizer.fit`` alongside the selected columns.
    """
    my_title = "t-SNE Plot of " + c + " feature"
    data = df.filter(like=c)
    # The previous TF-IDF fit on the global `corpus` was never used by the
    # plot, so it has been dropped.
    tsne = TSNEVisualizer(title=my_title)
    tsne.fit(data, l)
    tsne.poof()
Example #8
0
 def clusters_tsne(self, labels: pd.Series, title: str = 'title'):
     """Show a seeded t-SNE plot of ``self.vectors`` and return its figure.

     The figure is resized to 15 x 15 and given ``title`` as its suptitle
     after the visualizer has been shown.
     """
     visualizer = TSNEVisualizer(random_state=42)
     visualizer.fit(self.vectors, labels)

     shown_axes = visualizer.show()
     fig = shown_axes.figure
     fig.set_figheight(15)
     fig.set_figwidth(15)
     fig.suptitle(title)
     return fig
Example #9
0
def tsne(docs, target, outpath, **kwargs):
    """Draw the t-SNE projection of ``docs`` on a fresh figure and write it to ``outpath``.

    Extra keyword arguments are forwarded to ``TSNEVisualizer``.
    """
    # Dedicated figure with a single axes for this plot
    ax = plt.figure().add_subplot(111)

    # Fit the projection on that axes and write it out
    projection = TSNEVisualizer(ax=ax, **kwargs)
    projection.fit(docs, target)
    projection.poof(outpath=outpath)
Example #10
0
def analyse_2_step_model():
    """Save silhouette and t-SNE plots for the first 1000 test predictions."""
    # "test_prepared.npy" is our single point of truth
    X_test = np.load("test_prepared.npy").item()

    sample = X_test[0:1000]
    predicted = test_entire_model()[0:1000]

    vis_shilouette(sample, predicted)
    plt.savefig("silhouette.png")

    # One colour per distinct predicted cluster
    palette = cm.get_cmap('jet', len(set(predicted)))
    visualizer = TSNEVisualizer(colormap=palette)
    visualizer.fit(sample[0:1000], ["c{}".format(c) for c in predicted])
    visualizer.poof(outpath="tsne.png")
Example #11
0
def tsne(c, l):
    """Show a t-SNE plot titled "t-SNE Plot of final model".

    Parameters
    ----------
    c : array-like or sparse matrix
        Vectors to project; passed unchanged to ``TSNEVisualizer.fit``.
    l : iterable
        One label per row of ``c``.
    """
    # The local is deliberately not called `tsne`: shadowing the function
    # name made the driver calls below invoke the visualizer instance.
    visualizer = TSNEVisualizer(title="t-SNE Plot of final model")
    visualizer.fit(c, l)
    visualizer.poof()


if __name__ == "__main__":
    # Former notebook cells. They were indented into the function body,
    # where they ran after `tsne` had been rebound to the visualizer.
    # %%time
    figure(figsize=(20, 10))
    tsne(final, label_bias3)

    # %%time
    figure(figsize=(20, 10))
    tsne(final, label_fact)
def plot_tsne_clusters(corpus, fileids=None, labels=None):
    """Show a t-SNE plot of the TF-IDF vectors of the corpus' normalized titles.

    When ``labels`` is given, each value is drawn as ``"c<value>"``;
    otherwise the points are plotted without labels.
    """
    from yellowbrick.text import TSNEVisualizer
    from sklearn.feature_extraction.text import TfidfVectorizer

    tagged_titles = corpus.title_tagged(fileids=fileids)
    text_normalizer = Corpus_Vectorizer.TextNormalizer()
    # Flatten the normalized titles into a stream of sentences.
    sentences = (sent
                 for title in text_normalizer.transform(tagged_titles)
                 for sent in title)
    vectors = TfidfVectorizer().fit_transform(sentences)

    visualizer = TSNEVisualizer()
    if labels is not None:
        visualizer.fit(vectors, ["c{}".format(c) for c in labels])
    else:
        visualizer.fit(vectors)
    visualizer.poof()
Example #13
0
def tsne(ax, classes=True):
    """Fit a TSNEVisualizer for the hobbies corpus on ``ax`` and return it.

    With ``classes=False`` the points are fitted without targets and the
    title is prefixed with "Unlabeled ".
    """
    from sklearn.feature_extraction.text import TfidfVectorizer
    from yellowbrick.text import TSNEVisualizer

    paths, targets = load_data("hobbies", text=True)

    title = "t-SNE Projection of the Hobbies Corpus"
    if not classes:
        targets = None
        title = "Unlabeled " + title

    # input='filename': the loaded items are read from disk by the vectorizer
    vectorizer = TfidfVectorizer(input='filename', stop_words='english')
    vectors = vectorizer.fit_transform(paths)

    visualizer = TSNEVisualizer(ax=ax)
    visualizer.title = title
    visualizer.fit(vectors, targets)
    return visualizer
Example #14
0
def generate_tsne(title, X, labels):
    """Draw a small t-SNE plot of ``X`` and save it as ``OUTPUT/<title>``."""
    fig, ax1 = plt.subplots(1, 1, figsize=(4, 2))

    # Two palettes concatenated: 11 + 10 colours.
    palette = resolve_colors(11, 'Spectral_r') + resolve_colors(10, 'BrBG_r')

    visualizer = TSNEVisualizer(ax1, colors=palette, decompose=None)
    visualizer.fit(X, labels)
    visualizer.finalize()

    # Set the title after finalize(), as the original did.
    visualizer.ax.set_title(title, {'fontsize': 7, 'fontweight': 'bold'})

    plt.savefig(os.path.join(OUTPUT, title))
Example #15
0
def cluster(corpus, k):
    """Show a t-SNE plot of TF-IDF vectors for ``(label, text)`` pairs.

    ``k`` is accepted but not used by this function.
    """
    targets = [pair[0] for pair in corpus]
    texts = [pair[1] for pair in corpus]

    # English stop words plus a hand-picked list of extra terms
    extra_stop_words = [
        'wall', 'president', 'trump', 'loss', 'yes', 'sorry', 'mr', 'build',
        'thank', 'people'
    ]
    stop_words = list(set(stopwords.words('english'))) + extra_stop_words

    features = TfidfVectorizer(stop_words=stop_words).fit_transform(texts)

    visualizer = TSNEVisualizer()
    visualizer.fit(features, targets)
    visualizer.show()
Example #16
0
    def tsne_plot(self, outpath, sample_size=1000, tfidf=True):
        """
        Creates a png file at `outpath` with t-SNE visualization.
        `sample_size` determines the size of the random sample from each label.
        Uses TfidfVectorizer by default;
        if `tfidf` is set to False, CountVectorizer is used.

        Returns None in every case; when no tokenizer is loaded a message is
        printed and nothing is plotted.
        -----------------------------------------------------------------------
        More info:
        https://www.scikit-yb.org/en/latest/api/text/tsne.html
        https://lvdmaaten.github.io/tsne/
        """

        if self.tokenizer is None:
            print('No tokenizer was loaded.')
            return None

        # Draw a fixed-seed sample per label. A plain loop (not a
        # comprehension) keeps `label` in this frame for `@label` in query().
        samples = []
        for label in self.labels:
            samples.append(
                self.data.query("Label == @label").sample(sample_size,
                                                          random_state=19))

        # DataFrame.append was removed in pandas 2.0; concatenate in one go.
        if samples:
            df = pd.concat(samples, ignore_index=True)
        else:
            df = pd.DataFrame(columns=self.data.columns)

        # vectorize
        if tfidf:
            vectorizer = TfidfVectorizer(tokenizer=self.tokenizer.tokenize)
        else:
            vectorizer = CountVectorizer(tokenizer=self.tokenizer.tokenize)
        X = vectorizer.fit_transform(df.Text)
        y = df.Label

        # create the visualizer and draw the vectors
        tsne = TSNEVisualizer()
        tsne.fit(X, y)
        tsne.show(outpath=outpath)

        return None
Example #17
0
def analyse_results():
    """Create silhouette and t-SNE plots for every result file in "results".

    Result files that already have a "<name>tsne.png" next to them are
    skipped unless "rerun" is present in ``sys.argv``.
    """
    rerun = False
    if "rerun" in sys.argv:
        print("Redo everything")
        rerun = True

    X_test = np.load("test_prepared.npy").item()

    results, names = [], []
    for filename in os.listdir("results"):
        if not filename.endswith(".npy"):
            continue
        stem = filename[:-4]
        already_plotted = stem + "tsne.png" in os.listdir("results")
        if already_plotted and not rerun:
            continue
        results.append(np.load("results/" + filename))
        names.append(stem)

    total = len(results)
    for position, (name, predicted) in enumerate(zip(names, results), 1):
        print("iteration " + str(position) + " of " + str(total) + " : " +
              name)

        vis_shilouette(X_test, predicted)
        plt.savefig("results/" + name + "silhouette.png")

        # Start the t-SNE plot on a fresh figure.
        plt.close()
        plt.figure()

        head = predicted[0:5000]
        palette = cm.get_cmap('jet', len(set(head)))
        # fixed random_state makes the projection deterministic
        visualizer = TSNEVisualizer(colormap=palette,
                                    alpha=0.5,
                                    random_state=45)
        visualizer.fit(X_test[0:5000], ["c{}".format(c) for c in head])
        visualizer.poof(outpath="results/" + name + "tsne.png",
                        clear_figure=True)
# freq_dist_viz(vectorizer, df_train['Lyrics'],
#               "images/tfid_stopwords_train.png")
# freq_dist_viz(vectorizer, df_test['Lyrics'], "images/tfid_stopwords_test.png")


def get_sentence_embedding(w2v_model, sentence, dim=3000):
    """Return the mean word vector of a whitespace-tokenized sentence.

    Parameters
    ----------
    w2v_model : object
        Model whose ``wv.get_vector(word)`` returns the vector of a word and
        raises ``KeyError`` for unknown words.
    sentence : str
        Text to embed; split on whitespace.
    dim : int, optional
        Length of the word vectors (default 3000, the previously hard-coded
        size).

    Returns
    -------
    numpy.ndarray
        Vector of length ``dim``. Unknown words contribute a zero vector but
        still count towards the average. A sentence without any words yields
        the zero vector instead of the NaN vector a 0/0 division produced.
    """
    words = sentence.split()
    embedding = np.zeros(dim)
    if not words:
        return embedding

    for word in words:
        try:
            vector = w2v_model.wv.get_vector(word)
        except KeyError:
            # Out-of-vocabulary word: adds nothing to the sum.
            continue
        embedding += vector

    return embedding / len(words)


# Load the Word2Vec model saved on disk.
w2v_model = Word2Vec.load("word2vec_models/word2vec.model")

# Genre labels and one averaged embedding per lyric of the training set.
labels = df_train['Genre']
docs = np.array([
    get_sentence_embedding(w2v_model, lyric) for lyric in df_train['Lyrics']
])

# Project the embeddings with t-SNE; the path is handed to poof().
tsne = TSNEVisualizer()
tsne.fit(docs, labels)
tsne.poof("images/w2v_tsne.png")
Example #19
0
#!/usr/bin/env python3

import pickle
from yellowbrick.text import TSNEVisualizer

# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only point these paths at files produced by a trusted pipeline.
# NOTE(review): `agora` is not used anywhere in the visible script, and the
# path has a .csv extension although it is read as a pickle - confirm.
with open('data/agorb.csv', 'rb') as file:
    agora = pickle.load(file)

# X is later passed to TSNEVisualizer.fit as the data to project.
with open('data/tno/tfidf_vectors_webiq.pkl', 'rb') as file:
    X = pickle.load(file)

# c is later passed to TSNEVisualizer.fit as the labels for X.
with open('data/tno/categorieen.pkl', 'rb') as file:
    c = pickle.load(file)

# Fit the t-SNE projection of X labelled by c and display it.
tsne = TSNEVisualizer()
tsne.fit(X, c)
tsne.show()
Example #20
0
def _kmeans_score_curves(X, cluster_counts):
    """Fit K-Means once per candidate k and collect three clustering scores.

    Returns three lists aligned with ``cluster_counts``: the silhouette,
    Calinski-Harabasz and Davies-Bouldin scores of each fitted labelling.
    """
    sil_scores, cal_har_scores, dav_bou_scores = [], [], []
    for num_clusters in cluster_counts:
        k_means = KMeans(n_clusters=num_clusters, random_state=27)
        k_means.fit(X)
        prediction = k_means.predict(X)
        sil_scores.append(silhouette_score(X, prediction))
        cal_har_scores.append(calinski_harabasz_score(X, prediction))
        dav_bou_scores.append(davies_bouldin_score(X, prediction))
    return sil_scores, cal_har_scores, dav_bou_scores


def _plot_score_curves(axis, cluster_counts, sil_scores, cal_har_scores,
                       dav_bou_scores):
    """Plot the three score curves on ``axis``, rescaled to share one y-range."""
    # Scale these for plotting
    cal_har_scaled = [x / 500 for x in cal_har_scores]
    dav_bou_scaled = [x / 5 for x in dav_bou_scores]

    axis.plot(cluster_counts,
              sil_scores,
              'b-',
              label='Silhouette',
              linewidth=1)
    axis.plot(cluster_counts,
              cal_har_scaled,
              'r--',
              label='Calinski-Harabasz / 500',
              linewidth=1)
    axis.plot(cluster_counts,
              dav_bou_scaled,
              'g-.',
              label='Davies-Bouldin / 5',
              linewidth=1)
    axis.set(xlabel='K (# of clusters)', ylabel='Scores')
    axis.set_title('Clustering Scores')
    axis.legend()


def _plot_tsne_panel(axis, X, num_clusters):
    """Cluster ``X`` with K-Means, draw its t-SNE projection on ``axis``.

    Returns the cluster label predicted for every row of ``X``.
    """
    k_means = KMeans(n_clusters=num_clusters, random_state=27)
    k_means.fit(X)
    prediction = k_means.predict(X)
    tsne = TSNEVisualizer(decompose_by=X.shape[1] - 1,
                          ax=axis,
                          random_state=27)
    tsne.fit(X, ["c{}".format(c) for c in prediction])
    axis.set_title('tSNE Projection (clusters = {0})'.format(num_clusters))
    axis.set_xticklabels([])
    axis.set_yticklabels([])
    return prediction


def _report_boosting(X_train, y_train, X_test, variant, dataset):
    """Print fit time, predict time and 10-fold CV score of an AdaBoost model.

    ``variant`` and ``dataset`` only select the wording of the printed lines.
    """
    boosting_learner = AdaBoostClassifier(learning_rate=1, n_estimators=100)

    boost_fit_t = time()
    boosting_learner.fit(X_train, y_train)
    boost_fit_time = time() - boost_fit_t
    print('Boosting {} fit time ({}): '.format(variant, dataset) +
          str(boost_fit_time))

    # The predictions themselves are not used; only the call is timed.
    boost_pred_t = time()
    boosting_learner.predict(X_test)
    boost_pred_time = time() - boost_pred_t
    print('Boosting {} predict time ({}): '.format(variant, dataset) +
          str(boost_pred_time))

    boost_score = cross_val_score(boosting_learner, X_train, y_train, cv=10)
    print('Boosting {} cross validation score ({}): '.format(
        variant, dataset) + str(np.mean(boost_score)))


def main(X_train_smart, X_test_smart, y_train_smart, y_test_smart,
         X_train_bank, X_test_bank, y_train_bank, y_test_bank, args):
    """Analyse K-Means on the smart-grid and bank-loan data sets.

    For both training sets this sweeps k over 2..24 and plots three
    clustering scores, draws a t-SNE projection of the clustering with the
    k read from ``experiment_best.json``, shows the figure, and then prints
    AdaBoost timings / cross-validation scores trained on the true targets
    ("baseline") and on the K-Means cluster ids ("DR + cluster").

    ``args.dimensionality`` selects a sub-section of the JSON file when it
    is not None. ``y_test_smart`` and ``y_test_bank`` are accepted for
    interface compatibility but are not used. Returns None.
    """
    num_clusters_list = np.arange(2, 25)
    smart_scores = _kmeans_score_curves(X_train_smart, num_clusters_list)
    bank_scores = _kmeans_score_curves(X_train_bank, num_clusters_list)

    with open('experiment_best.json') as f:
        params = json.load(f)
    if args.dimensionality is not None:
        params = params[args.dimensionality[0]]
    num_clusters_smart = params['k_means']['smart']
    num_clusters_bank = params['k_means']['bank']

    plt.rc("font", size=8)
    plt.rc("axes", titlesize=12)
    plt.rc("axes", labelsize=10)
    plt.rc("xtick", labelsize=8)
    plt.rc("ytick", labelsize=8)
    plt.rc("legend", fontsize=8)
    plt.rc("figure", titlesize=11)
    fig, ax = plt.subplots(1, 4, figsize=(15, 4))
    fig.suptitle(
        'K-Means Clusters - # of clusters Analysis (Left: Smart Grid, Right: Bank Loan)',
        fontsize=14)

    # Panels 0-1: smart grid; panels 2-3: bank loan.
    _plot_score_curves(ax[0], num_clusters_list, *smart_scores)
    prediction_smart = _plot_tsne_panel(ax[1], X_train_smart,
                                        num_clusters_smart)
    _plot_score_curves(ax[2], num_clusters_list, *bank_scores)
    prediction_bank = _plot_tsne_panel(ax[3], X_train_bank, num_clusters_bank)

    plt.show()

    # Boosting validation
    # Smart grid
    _report_boosting(X_train_smart, y_train_smart, X_test_smart, 'baseline',
                     'smart')
    _report_boosting(X_train_smart, prediction_smart, X_test_smart,
                     'DR + cluster', 'smart')

    # Bank loan
    _report_boosting(X_train_bank, y_train_bank, X_test_bank, 'baseline',
                     'bank')
    _report_boosting(X_train_bank, prediction_bank, X_test_bank,
                     'DR + cluster', 'bank')

    return
Example #21
0
                    n_jobs=num_threads)
    kmeans = kmeans.fit(X)
    pickle.dump(kmeans,
                open(f"cache/{thread_id}/{thread_id}_fitted_kmeans.pk", 'wb'))

# Cluster id of every document, used as the t-SNE point label.
y = kmeans.labels_

# Single definition of the cache location used for both load and store.
tsne_cache_path = f"cache/{thread_id}/{thread_id}_tsne_fitted.pk"

if os.path.exists(tsne_cache_path):
    print("Loading pre-trained tsne comments.")
    # NOTE(review): pickle.load can execute arbitrary code; only load cache
    # files that this program wrote itself.
    with open(tsne_cache_path, 'rb') as cache_file:
        tsne = pickle.load(cache_file)
else:
    tsne = TSNEVisualizer(decompose_by=tsne_svd,
                          n_iter=tsne_iterations,
                          verbose=2)
    tsne = tsne.fit(X, y)
    # `with` closes the handle; it was previously left open.
    with open(tsne_cache_path, 'wb') as cache_file:
        pickle.dump(tsne, cache_file)

tsne.colors = """#33000e, #660029, #bf0080, #660080, #0088ff, #00708c, #008066, #4cbf00, #735c00, #ff8800, #995200, #402200, #ff4400, #590000, #ff4073, #ff40f2, #7736d9, #101040, #233f8c, #36ced9, #36d98d, #538020, #b6bf30, #b22d2d, #733960, #8959b3, #1a2033, #46628c, #73bfe6, #1a2e33, #204020, #ffd580, #f29979, #8c5946, #cc99c2, #bfbfff, #698c8a, #eaffbf, #8c8569, #4d4439, #bfa38f, #e6acac""".split(
    ', ')
# Redraw the fitted vectors with one "<document>\n<author>" annotation each.
tsne.draw(tsne.vecs,
          y,
          point_annotations=[
              f"{document}\n{author}"
              for document, author in zip(documents, authors)
          ])

with open(f'cache/{thread_id}/{thread_id}_clustered.tsv', 'w') as file:
    csv_writer = csv.writer(file,
                            delimiter='\t',
def plotData(base, labels=[-1,0,1]):
    """Show a t-SNE plot of case-preserving token counts of ``base['Tweet']``."""
    tweet_counts = CountVectorizer(lowercase=False).fit_transform(base['Tweet'])

    visualizer = TSNEVisualizer()
    visualizer.fit(tweet_counts, labels)
    visualizer.poof()
Example #23
0
        color = "#000000"
        colormap.append(color)

# Look up every label in mycolormap and collect the results.
for label in labels:
    big_colormap.append(mycolormap[label])

# Time the t-SNE fit together with its first rendering.
t6 = time.time()

tsne = TSNEVisualizer(colormap='RdYlGn')
tsne.fit(tfidf_matrix, labels)
tsne.poof()

t7 = time.time()

print("time for TSNE and vis: {}".format(t7 - t6))

# Render once more, outside the timed section.
tsne.poof()





Example #24
0
           cbar=False,
           fmt='g')

# ---- Visualising clusters ----

# Dendrogram for the TF-IDF features
from scipy.cluster.hierarchy import dendrogram, linkage

np.set_printoptions(precision=6, suppress=True)
H_cluster = linkage(tfidf_matrix, 'ward')

plt.title('Dendogram')
plt.xlabel('Data')
plt.ylabel('Distance bewteen data points')
# truncate_mode='lastp' with p=13 shows only the last 13 merged clusters;
# show_contracted gives a distribution impression in truncated branches.
dendrogram(H_cluster,
           truncate_mode='lastp',
           p=13,
           leaf_rotation=90.0,
           leaf_font_size=12.0,
           show_contracted=True)
plt.show()

# Scatter plot to visualise the k-means clusters
from yellowbrick.text import TSNEVisualizer

tsne = TSNEVisualizer()
tsne.fit(tfidf_matrix, ["c{}".format(c) for c in labels])
tsne.poof()
Example #25
0
    matrix = np.zeros([len(t), len(liste_galaxies)])

    dirGalaxies = shelve.open(path + '/BDs/listeGalaxies')

    for galaxie in range(len(liste_galaxies)):
        for node in dirGalaxies[str(liste_galaxies[galaxie])]:
            matrix[index[node]][galaxie] += 1
        
        matrix[:,galaxie] = matrix[:,galaxie] / len(dirGalaxies[str(liste_galaxies[galaxie])])

    dirGalaxies.close()
    
    label = np.array([i for i in range(len(t))])
    tsne = TSNEVisualizer(decompose='svd',decompose_by=15)
    tsne.fit(matrix, label)
    print(tsne.transformer_)
    tsne.poof()

    svd = TruncatedSVD(n_components=15)
    svd_matrix = svd.fit_transform(matrix)
    tsne = ts.TSNE()
    y = tsne.fit_transform(svd_matrix)
    kmeans = Kmeans(5,200,0.1)
    kmeans.fit(y)
    for i in range(kmeans.nb_cluster):
        print("Cluster ",i)
        print((np.where(kmeans.which_cluster == i))[0])
        print()
    plt.scatter(y[:, 0], y[:, 1], c=kmeans.which_cluster.reshape(-1,1), s=50, cmap='viridis')
    plt.title("Resultat du clustering")
Example #26
0
        files=files,
        data=data,
        target=target,
    )


# Vectorize the hobbies corpus with TF-IDF.
corpus = load_corpus('hobbies')
tfidf = TfidfVectorizer()

docs = tfidf.fit_transform(corpus.data)
labels = corpus.target

# First view: the bare document vectors in 2D, without labels.
tsne = TSNEVisualizer()
tsne.fit(docs)
tsne.poof()


# Second view: the same vectors coloured by their k-means cluster.
from sklearn.cluster import KMeans

clusters = KMeans(n_clusters=5)
clusters.fit(docs)

cluster_names = ["c{}".format(c) for c in clusters.labels_]
tsne = TSNEVisualizer()
tsne.fit(docs, cluster_names)
tsne.poof()


# Create a visualizer to see how the classes are distributed
Example #27
0
from sklearn.cluster import KMeans
from yellowbrick.text import TSNEVisualizer

# LDA prediction
pred_score = predict_lda(scene_docs)
result = pd.DataFrame(pred_score)

# Group the prediction scores into ten k-means clusters.
clusters = KMeans(n_clusters=10)
clusters.fit(result.values)

# Draw the scores on a 10x10 figure, labelled by cluster id.
plt.figure(figsize=(10, 10))
tsne = TSNEVisualizer()
tsne.fit(result.values, ["c{}".format(c) for c in clusters.labels_])
tsne.poof()
Example #28
0
def load_corpus():
    """Build and return the Corpus for "all_posts01.txt"."""
    return Corpus("all_posts01.txt")


corpus = load_corpus()

from sklearn.cluster import KMeans

# TF-IDF limited to 10000 features; terms in more than half of the
# documents or in fewer than two documents are dropped.
vectorizer = TfidfVectorizer(max_df=0.5,
                             max_features=10000,
                             min_df=2,
                             use_idf=True)
docs = vectorizer.fit_transform(corpus.documents)

print(docs)

# Cluster the documents with a single k-means initialisation.
true_k = 500
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
model.fit(docs)

print("Top terms per cluster:")
# Feature indices of every centroid, highest weight first.
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()

# All points share the single legend entry "documents".
tsne = TSNEVisualizer(labels=["documents"])
tsne.fit(docs)
tsne.poof()
Example #29
0
from sklearn.feature_extraction.text import CountVectorizer
# One binary-count column per ingredient: every row's ingredient list is
# joined with commas and split again on commas by the tokenizer.
vect = CountVectorizer(tokenizer=lambda x: [i.strip() for i in x.split(',')], lowercase=False)
dummies = vect.fit_transform(df['ingredients'].apply(','.join))

# NOTE(review): get_feature_names() is the older scikit-learn spelling;
# newer releases provide get_feature_names_out() - confirm the pinned version.
df = pd.DataFrame(dummies.todense(), columns=vect.get_feature_names())
print("Vocab Length: ", len(vect.get_feature_names()))
print("All Data Shape: ", df.shape)
df.index = df_index

# Predictors are the feature columns, so report shape[1]; shape[0] is the
# number of rows.
print("Number of Predictors: ", df.shape[1])
df.head()

# Create the visualizer and draw the vectors (first 7000 training rows)
plt.figure(figsize=[15, 9])
tsne = TSNEVisualizer()
tsne.fit(df.loc[traindex, :][:7000], y[:7000])
tsne.poof()

X = df.loc[traindex, :]
print("Number of Cuisine Types: ", y.nunique())
print("X Shape: ", X.shape)
test_df = df.loc[testdex, :]
print("Test DF Shape: ", test_df.shape)
# Free the combined frame once the train/test splits have been taken.
del df
gc.collect()

LogisticRegression().get_params().keys()

model = LogisticRegression(multi_class='ovr')
score = cross_validate(model, X, y, return_train_score=False)
score["test_score"].mean()
def TSNE_graph(X_train, y_train):
    """Fit a TSNEVisualizer on the training data and display the plot."""
    visualizer = TSNEVisualizer()
    visualizer.fit(X_train, y_train)
    visualizer.poof()