Example #1
def perform_tsne(X,
                 Y,
                 vec=None,
                 outpath="",
                 clusterLabels=False,
                 savePlot=False):
    if vec is None:
        # `identity` is assumed to be a pass-through helper defined elsewhere
        vec = TfidfVectorizer(preprocessor=identity, tokenizer=identity)

    docs = vec.fit_transform(X)
    labels = Y

    # from yellowbrick.text import TSNEVisualizer
    tsne = TSNEVisualizer()

    if clusterLabels:
        tsne.fit(docs,
                 ["c{}".format(c) for c in Y])  # where Y=clusters.labels_
    else:
        tsne.fit(docs, labels)

    if savePlot:
        # tsne.finalize()
        tsne.poof(outpath=outpath)
    else:
        tsne.poof()
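A minimal usage sketch for the helper above (not part of the original snippet). It assumes `identity` is a simple pass-through callable, that the documents are already tokenized, and that the toy vocabulary stays larger than the visualizer's default of 50 SVD components; the corpus, labels, and file name are illustrative only.

# Hypothetical usage of perform_tsne on a synthetic, pre-tokenized corpus.
from sklearn.feature_extraction.text import TfidfVectorizer
from yellowbrick.text import TSNEVisualizer


def identity(tokens):
    # pass tokens through unchanged so TfidfVectorizer skips its own tokenization
    return tokens


toy_docs = [["word{}".format((i + j) % 120) for j in range(10)] for i in range(100)]
toy_labels = ["even" if i % 2 == 0 else "odd" for i in range(100)]
perform_tsne(toy_docs, toy_labels, savePlot=True, outpath="tsne_example.png")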
Example #2
    def result(self):
        data_df = self.clean_data.data()
        all_data_df = self.clean_data.getSpambase_data()
        target_df = self.clean_data.target()

        # Defining Model
        model = TSNE(learning_rate=100)

        # Fitting Model
        transformed = model.fit_transform(all_data_df)

        # Plotting the 2D t-SNE embedding
        x_axis = transformed[:, 0]
        pprint.pprint(x_axis)
        y_axis = transformed[:, 1]
        pprint.pprint(y_axis)

        plt.scatter(x_axis, y_axis, c=target_df)
        #plt.show()
        plt.savefig(self.file_png)

        # Create the visualizer and draw the vectors
        tfidf = TfidfVectorizer()
        docs = tfidf.fit_transform(data_df)

        tsne = TSNEVisualizer()
        tsne.fit(docs, target_df)
        tsne.poof()
Example #3
def text_cluster_tsne(text_vector,
                      TextVectorizer=TfidfVectorizer,
                      text_kwargs=text_kwargs,
                      n_clusters=10,
                      labels=None):
    '''Uses a TextVectorizer to transform the text contained (at the sentence
    or paragraph level) in the text_vector arg and produces a TSNE visualization.
    If labels are not passed, the points in the final plot are labelled with
    the clusters produced by KMeans.

    ARGS:
        text_vector <np.array>: Vector of text units.  Must be type str.
    KWARGS:
        TextVectorizer <sklearn.feature_extraction.text>: Transformer.
        text_kwargs <dict>: kwargs to pass to TextVectorizer
        n_clusters <int>: If not using labels, number of clusters in KMeans
        labels <np.array>: True categorical labels.  Discrete.
    RETURNS:
        None, prints visualizations to the console.
    '''
    txt_vctzr = TextVectorizer(**text_kwargs)
    docs = txt_vctzr.fit_transform(text_vector)
    tsne = TSNEVisualizer()

    if labels is None:
        # derive clusters if labels not provided
        clusters = KMeans(n_clusters=n_clusters)
        clusters.fit(docs)
        tsne.fit(docs, ["cluster_{}".format(c) for c in clusters.labels_])
    else:
        # otherwise use labels
        tsne.fit(docs, labels)
    sns.despine()
    tsne.poof()
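A hedged usage sketch for text_cluster_tsne (not from the original source). The `text_kwargs` dict is passed explicitly because the default refers to a module-level variable this snippet does not define, and the snippet's own imports (TfidfVectorizer, KMeans, TSNEVisualizer, seaborn as sns) are assumed to be in scope; the corpus is synthetic and keeps more than 50 distinct terms so the visualizer's default SVD step has enough features.

# Hypothetical call: cluster a synthetic corpus into 3 KMeans groups and
# render the t-SNE projection coloured by cluster id.
toy_sentences = [
    " ".join("word{}".format((i + j) % 120) for j in range(10))
    for i in range(100)
]
text_cluster_tsne(toy_sentences, text_kwargs={"lowercase": True}, n_clusters=3)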
Example #4
def tsne_pack(c, l):
    # NOTE: relies on the module-level globals `df` and `corpus`
    my_title = "t-SNE Plot of " + c + " feature"
    data = df.filter(like=c)
    tfidf = TfidfVectorizer()
    new_values = tfidf.fit_transform(corpus)  # computed but never used below
    tsne = TSNEVisualizer(title=my_title)
    tsne.fit(data, l)
    tsne.poof()
Example #5
def tsne(docs, target, outpath, **kwargs):
    # Create a new figure and axes
    fig = plt.figure()
    ax = fig.add_subplot(111)

    # Visualize the frequency distribution
    visualizer = TSNEVisualizer(ax=ax, **kwargs)
    visualizer.fit(docs, target)
    visualizer.poof(outpath=outpath)
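A short, hypothetical call sequence for the wrapper above, assuming TfidfVectorizer is imported as in the other examples. The `title` keyword is forwarded to TSNEVisualizer, as in Example #4; the corpus, labels, and output path are illustrative only.

# Hypothetical usage: vectorize a toy corpus, then hand the document-term
# matrix, labels, and an output path to the wrapper above.
toy_corpus = [
    " ".join("word{}".format((i + j) % 120) for j in range(10))
    for i in range(100)
]
toy_target = ["even" if i % 2 == 0 else "odd" for i in range(100)]
toy_docs = TfidfVectorizer().fit_transform(toy_corpus)
tsne(toy_docs, toy_target, "tsne_example.png", title="t-SNE of a toy corpus")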
Example #6
def analyse_2_step_model():
    X_test = np.load(
        "test_prepared.npy").item()  # this is our Single point of truth
    #test_silhouette(30, X_test)

    test = X_test[0:1000]
    prediction = test_entire_model()[0:1000]

    vis_shilouette(test, prediction)
    plt.savefig("silhouette.png")

    tsne = TSNEVisualizer(colormap=cm.get_cmap('jet', len(set(prediction))))
    tsne.fit(test[0:1000], ["c{}".format(c) for c in prediction])
    tsne.poof(outpath="tsne.png")
Example #7
def tsne(c, l):
    # NOTE: relies on the module-level global `corpus`
    my_title = "t-SNE Plot of final model"
    data = c
    tfidf = TfidfVectorizer()
    new_values = tfidf.fit_transform(corpus)  # computed but never used below
    tsne = TSNEVisualizer(title=my_title)
    tsne.fit(data, l)
    tsne.poof()

# The calls below appear to be notebook cells that invoke the helper above;
# `figure` (matplotlib.pyplot.figure), `final`, `label_bias3`, and `label_fact`
# are assumed to be imported/defined elsewhere.
# %%time
figure(figsize=(20, 10))
tsne(final, label_bias3)

# %%time
figure(figsize=(20, 10))
tsne(final, label_fact)
Example #8
def plot_tsne_clusters(corpus, fileids=None, labels=None):
    from yellowbrick.text import TSNEVisualizer
    from sklearn.feature_extraction.text import TfidfVectorizer

    words = corpus.title_tagged(fileids=fileids)
    normalizer = Corpus_Vectorizer.TextNormalizer()
    normed = (sent for title in normalizer.transform(words) for sent in title)
    # normed = (dd for dd in normalizer.transform(docs))
    tfidf = TfidfVectorizer()
    procd = tfidf.fit_transform(normed)

    tsne = TSNEVisualizer()
    if labels is None:
        tsne.fit(procd)
    else:
        tsne.fit(procd, ["c{}".format(c) for c in labels])
    tsne.poof()
Example #9
def analyse_results():
    rerun = False
    if "rerun" in sys.argv:
        print("Redo everything")
        rerun = True

    X_test = np.load("test_prepared.npy").item()

    results = []
    names = []

    for filename in os.listdir("results"):
        if filename.endswith(".npy"):
            if filename[:-4] + "tsne.png" in os.listdir(
                    "results") and not rerun:
                continue

            results.append(np.load("results/" + filename))
            names.append(filename[:-4])

    for i in range(len(results)):
        print("iteration " + str(i + 1) + " of " + str(len(results)) + " : " +
              names[i])

        vis_shilouette(X_test, results[i])
        plt.savefig("results/" + names[i] + "silhouette.png")

        plt.close()
        plt.figure()

        tsne = TSNEVisualizer(colormap=cm.get_cmap(
            'jet', len(set(results[i][0:5000]))),
                              alpha=0.5,
                              random_state=45)  # make it deterministic
        tsne.fit(X_test[0:5000], ["c{}".format(c) for c in results[i][0:5000]])
        tsne.poof(outpath="results/" + names[i] + "tsne.png",
                  clear_figure=True)
Example #10
vect = CountVectorizer(tokenizer=lambda x: [i.strip() for i in x.split(',')], lowercase=False)
dummies = vect.fit_transform(df['ingredients'].apply(','.join)) 

df = pd.DataFrame(dummies.todense(), columns=vect.get_feature_names())
print("Vocab Length: ", len(vect.get_feature_names()))
print("All Data Shape: ", df.shape)
df.index = df_index

print("Number of Predictors: ", df.shape[0])
df.head()

# Create the visualizer and draw the vectors
plt.figure(figsize = [15,9])
tsne = TSNEVisualizer()
tsne.fit(df.loc[traindex,:][:7000], y[:7000])
tsne.poof()

X = df.loc[traindex,:]
print("Number of Cuisine Types: ", y.nunique())
print("X Shape: ", X.shape)
test_df = df.loc[testdex,:]
print("Test DF Shape: ", test_df.shape)
del df; gc.collect();

LogisticRegression().get_params().keys()

model = LogisticRegression(multi_class='ovr')
score = cross_validate(model, X, y, return_train_score=False)
score["test_score"].mean()

model.fit(X,y)
Example #11
# freq_dist_viz(vectorizer, df_train['Lyrics'],
#               "images/tfid_stopwords_train.png")
# freq_dist_viz(vectorizer, df_test['Lyrics'], "images/tfid_stopwords_test.png")


def get_sentence_embedding(w2v_model, sentence):
    embedding = np.zeros(3000)

    for word in sentence.split():
        try:
            vector = w2v_model.wv.get_vector(word)
        except KeyError:
            vector = np.zeros(3000)
        embedding += vector

    return embedding / len(sentence.split())


w2v_model = Word2Vec.load("word2vec_models/word2vec.model")
docs = np.array([
    get_sentence_embedding(w2v_model, sentence)
    for sentence in df_train['Lyrics']
])
# tfidf = TfidfVectorizer()
# docs = tfidf.fit_transform(X)
labels = df_train['Genre']

tsne = TSNEVisualizer()
tsne.fit(docs, labels)
tsne.poof("images/w2v_tsne.png")
Example #12
def plotData(base, labels=[-1, 0, 1]):
    # NOTE: `labels` must provide one entry per tweet in `base`
    vectorizer = CountVectorizer(lowercase=False)
    tweets = vectorizer.fit_transform(base['Tweet'])
    tsne = TSNEVisualizer()
    tsne.fit(tweets, labels)
    tsne.poof()
Example #13
def TSNE_graph(X_train, y_train):
    tsne = TSNEVisualizer()
    tsne.fit(X_train, y_train)
    tsne.poof()
Example #14
        data=data,
        target=target,
    )


# Load the data and create document vectors
corpus = load_corpus('hobbies')
tfidf  = TfidfVectorizer()

docs   = tfidf.fit_transform(corpus.data)
labels = corpus.target

# Create a visualizer to simply see the vectors plotted in 2D
tsne = TSNEVisualizer()
tsne.fit(docs)
tsne.poof()


# Create a visualizer to see how k-means clustering grouped the docs
from sklearn.cluster import KMeans

clusters = KMeans(n_clusters=5)
clusters.fit(docs)

tsne = TSNEVisualizer()
tsne.fit(docs, ["c{}".format(c) for c in clusters.labels_])
tsne.poof()


# Create a visualizer to see how the classes are distributed
tsne = TSNEVisualizer()
Example #15
def visualise_with_yellowbrick(feature_matrix, labels_tfidf):
    tsne = TSNEVisualizer(title="Chat Messages Clusters", alpha=0.7)
    tsne.fit(feature_matrix, np.array(labels_tfidf))
    tsne.finalize()
    tsne.poof()