def perform_tsne(X, Y, vec=None, outpath="", clusterLabels=False, savePlot=False):
    """Project documents into 2D with t-SNE and plot them.

    ARGS:
        X: iterable of documents; raw or pre-tokenized depending on `vec`.
        Y: labels for the documents. When clusterLabels is True these are
           cluster ids (e.g. KMeans ``.labels_``) and get a "c<N>" prefix.
    KWARGS:
        vec: a fit_transform-capable vectorizer. Defaults to a
             TfidfVectorizer whose preprocessor/tokenizer are the identity
             function, i.e. X is assumed already preprocessed/tokenized.
        outpath: file path for the saved figure when savePlot is True.
        clusterLabels: treat Y as cluster ids rather than class labels.
        savePlot: save the figure to `outpath` instead of displaying it.
    """
    # Fix: use `is None`, not `== None` — equality would dispatch to the
    # vectorizer's __eq__; an identity check is what is intended here.
    if vec is None:
        vec = TfidfVectorizer(preprocessor=identity, tokenizer=identity)
    docs = vec.fit_transform(X)
    labels = Y
    tsne = TSNEVisualizer()
    if clusterLabels:
        # where Y=clusters.labels_
        tsne.fit(docs, ["c{}".format(c) for c in Y])
    else:
        tsne.fit(docs, labels)
    if savePlot:
        # tsne.finalize()
        tsne.poof(outpath=outpath)
    else:
        tsne.poof()
def result(self):
    """Visualize the dataset with t-SNE in two ways.

    First embeds the full numeric feature matrix with sklearn's TSNE and
    saves a matplotlib scatter (colored by target) to ``self.file_png``;
    then tf-idf-vectorizes the text data and draws it with yellowbrick's
    TSNEVisualizer.
    """
    text_docs = self.clean_data.data()
    feature_matrix = self.clean_data.getSpambase_data()
    targets = self.clean_data.target()

    # Embed the numeric features into two dimensions.
    embedder = TSNE(learning_rate=100)
    embedded = embedder.fit_transform(feature_matrix)

    # Scatter the two embedding axes, colored by target class.
    xs = embedded[:, 0]
    pprint.pprint(xs)
    ys = embedded[:, 1]
    pprint.pprint(ys)
    plt.scatter(xs, ys, c=targets)
    #plt.show()
    plt.savefig(self.file_png)

    # Second view: tf-idf vectors drawn by yellowbrick.
    vectorizer = TfidfVectorizer()
    doc_vectors = vectorizer.fit_transform(text_docs)
    viz = TSNEVisualizer()
    viz.fit(doc_vectors, targets)
    viz.poof()
def text_cluster_tsne(text_vector,
                      TextVectorizer=TfidfVectorizer,
                      text_kwargs=text_kwargs,
                      n_clusters=10,
                      labels=None):
    '''Vectorize a text vector and render a t-SNE visualization of it.

    When no labels are supplied, KMeans clusters computed on the vectorized
    documents are used as the plot's categorical labels instead.

    ARGS:
        text_vector <np.array>: Vector of text units. Must be type str.
    KWARGS:
        TextVectorizer <sklearn.feature_extraction.text>: Transformer.
        text_kwargs <dict>: kwargs to pass to TextVectorizer
        n_clusters <int>: If not using labels, number of clusters in KMeans
        labels <np.array>: True categorical labels. Discrete.
    RETURNS: None, prints visualizations to the console.
    '''
    vectorizer = TextVectorizer(**text_kwargs)
    doc_matrix = vectorizer.fit_transform(text_vector)

    viz = TSNEVisualizer()
    if labels is not None:
        # True labels were provided — use them directly.
        viz.fit(doc_matrix, labels)
    else:
        # No labels: derive pseudo-labels by clustering the documents.
        km = KMeans(n_clusters=n_clusters)
        km.fit(doc_matrix)
        viz.fit(doc_matrix, ["cluster_{}".format(c) for c in km.labels_])
    sns.despine()
    viz.poof()
def tsne_pack(c, l):
    """Draw a t-SNE plot of the global ``df`` columns matching a feature name.

    ARGS:
        c <str>: feature-name substring; selects columns via df.filter(like=c).
        l: label vector passed to TSNEVisualizer.fit.
    """
    my_title = "t-SNE Plot of " + c + " feature"
    data = df.filter(like=c)
    # NOTE(review): removed dead code — the original built a TfidfVectorizer
    # over the global `corpus` into `new_values`, which was never used.
    tsne = TSNEVisualizer(title=my_title)
    tsne.fit(data, l)
    tsne.poof()
def tsne(docs, target, outpath, **kwargs):
    """Fit a TSNEVisualizer on docs/target and save the figure to outpath.

    Extra keyword arguments are forwarded to the TSNEVisualizer constructor.
    """
    # Draw onto a dedicated figure/axes rather than the implicit current one.
    figure = plt.figure()
    axes = figure.add_subplot(111)

    viz = TSNEVisualizer(ax=axes, **kwargs)
    viz.fit(docs, target)
    viz.poof(outpath=outpath)
def analyse_2_step_model():
    """Visualize 2-step model predictions on the first 1000 test points.

    Saves a silhouette plot to silhouette.png and a t-SNE scatter (one color
    per predicted cluster) to tsne.png.
    """
    # this is our Single point of truth
    X_test = np.load("test_prepared.npy").item()
    #test_silhouette(30, X_test)
    sample = X_test[0:1000]
    preds = test_entire_model()[0:1000]

    vis_shilouette(sample, preds)
    plt.savefig("silhouette.png")

    # One colormap entry per distinct predicted cluster id.
    viz = TSNEVisualizer(colormap=cm.get_cmap('jet', len(set(preds))))
    viz.fit(sample[0:1000], ["c{}".format(c) for c in preds])
    viz.poof(outpath="tsne.png")
def tsne(c, l):
    """Render a t-SNE plot of data `c` labeled by `l` for the final model."""
    title = "t-SNE Plot of final model"
    matrix = c
    # NOTE(review): this tf-idf over the global `corpus` is computed but its
    # result is never used — kept as-is to preserve original behavior.
    tfidf = TfidfVectorizer()
    new_values = tfidf.fit_transform(corpus)
    viz = TSNEVisualizer(title=title)
    viz.fit(matrix, l)
    viz.poof()


# %%time
figure(figsize=(20, 10))
tsne(final, label_bias3)

# %%time
figure(figsize=(20, 10))
tsne(final, label_fact)
def plot_tsne_clusters(corpus, fileids=None, labels=None):
    """Draw a t-SNE plot of the corpus titles, tf-idf vectorized.

    When labels are given each point is tagged "c<label>"; otherwise the
    visualizer is fit without labels.
    """
    from yellowbrick.text import TSNEVisualizer
    from sklearn.feature_extraction.text import TfidfVectorizer

    titles = corpus.title_tagged(fileids=fileids)
    normalizer = Corpus_Vectorizer.TextNormalizer()
    # Flatten: one normalized sentence at a time across all titles.
    sentences = (sent for title in normalizer.transform(titles)
                 for sent in title)
    # normed = (dd for dd in normalizer.transform(docs))

    vectorizer = TfidfVectorizer()
    matrix = vectorizer.fit_transform(sentences)

    viz = TSNEVisualizer()
    if labels is None:
        viz.fit(matrix)
    else:
        viz.fit(matrix, ["c{}".format(c) for c in labels])
    viz.poof()
def analyse_results():
    """Render silhouette and t-SNE plots for every prediction file in results/.

    Loads test data from test_prepared.npy and each results/*.npy array;
    result files whose "<name>tsne.png" already exists are skipped unless
    "rerun" was passed on the command line.
    """
    rerun = False
    if ("rerun" in sys.argv):
        print("Redo everything")
        rerun = True
    # NOTE(review): .item() implies test_prepared.npy stores a pickled object
    # wrapper rather than a plain array — confirm the file format.
    X_test = np.load("test_prepared.npy").item()
    results = []
    names = []
    for filename in os.listdir("results"):
        if filename.endswith(".npy"):
            # Skip arrays whose t-SNE plot is already on disk (unless rerun).
            if filename[:-4] + "tsne.png" in os.listdir(
                    "results") and not rerun:
                continue
            results.append(np.load("results/" + filename))
            names.append(filename[:-4])
    for i in range(len(results)):
        print("iteration " + str(i + 1) + " of " + str(len(results)) + " : " +
              names[i])
        vis_shilouette(X_test, results[i])
        plt.savefig("results/" + names[i] + "silhouette.png")
        plt.close()
        plt.figure()
        # One colormap entry per distinct cluster id in the first 5000 preds;
        # only the first 5000 points are plotted to keep t-SNE tractable.
        tsne = TSNEVisualizer(colormap=cm.get_cmap(
            'jet', len(set(results[i][0:5000]))),
                              alpha=0.5,
                              random_state=45)  # make it deterministic
        tsne.fit(X_test[0:5000],
                 ["c{}".format(c) for c in results[i][0:5000]])
        tsne.poof(outpath="results/" + names[i] + "tsne.png",
                  clear_figure=True)
# Bag-of-ingredients encoding: one indicator column per distinct ingredient.
# The tokenizer splits the comma-joined ingredient string back apart.
vect = CountVectorizer(tokenizer=lambda x: [i.strip() for i in x.split(',')],
                       lowercase=False)
dummies = vect.fit_transform(df['ingredients'].apply(','.join))
df = pd.DataFrame(dummies.todense(),columns=vect.get_feature_names())
print("Vocab Length: ", len(vect.get_feature_names()))
print("All Data Shape: ", df.shape)
df.index= df_index
print("Number of Predictors: ", df.shape[0])
df.head()

# Create the visualizer and draw the vectors
plt.figure(figsize = [15,9])
tsne = TSNEVisualizer()
# NOTE(review): only the first 7000 training rows are plotted — presumably to
# keep t-SNE tractable; confirm the cutoff is intentional.
tsne.fit(df.loc[traindex,:][:7000], y[:7000])
tsne.poof()

# Split back into train/test portions via the traindex/testdex indices.
X = df.loc[traindex,:]
print("Number of Cuisine Types: ", y.nunique())
print("X Shape: ", X.shape)
test_df = df.loc[testdex,:]
print("Test DF Shape: ", test_df.shape)
# Free the combined frame before modelling to reduce peak memory.
del df; gc.collect();

LogisticRegression().get_params().keys()
# One-vs-rest logistic regression over the cuisine classes, scored by CV.
model = LogisticRegression(multi_class= 'ovr')
score = cross_validate(model, X, y, return_train_score=False)
score["test_score"].mean()
model.fit(X,y)
# freq_dist_viz(vectorizer, df_train['Lyrics'],
#               "images/tfid_stopwords_train.png")
# freq_dist_viz(vectorizer, df_test['Lyrics'], "images/tfid_stopwords_test.png")


def get_sentence_embedding(w2v_model, sentence):
    """Return the mean word2vec vector (dim 3000) of the words in `sentence`.

    Words missing from the model vocabulary contribute zero vectors.
    Fix: an empty/whitespace-only sentence now returns a zero vector instead
    of raising ZeroDivisionError; `sentence.split()` is computed once.
    """
    words = sentence.split()
    if not words:
        # Guard: the original divided by len(sentence.split()) == 0 here.
        return np.zeros(3000)
    embedding = np.zeros(3000)
    for word in words:
        try:
            vector = w2v_model.wv.get_vector(word)
        except KeyError:
            # Out-of-vocabulary word: contributes nothing to the average.
            vector = np.zeros(3000)
        embedding += vector
    return embedding / len(words)


w2v_model = Word2Vec.load("word2vec_models/word2vec.model")
# Embed every training lyric as the mean of its word vectors.
docs = np.array([
    get_sentence_embedding(w2v_model, sentence)
    for sentence in df_train['Lyrics']
])
# tfidf = TfidfVectorizer()
# docs = tfidf.fit_transform(X)
labels = df_train['Genre']
tsne = TSNEVisualizer()
tsne.fit(docs, labels)
tsne.poof("images/w2v_tsne.png")
def plotData(base, labels=None):
    """Draw a t-SNE plot of the tweets in `base`.

    ARGS:
        base: DataFrame-like with a 'Tweet' column of raw tweet text.
    KWARGS:
        labels: label vector for the points; defaults to [-1, 0, 1].

    Fix: the original used a mutable default argument (labels=[-1,0,1]);
    a None sentinel preserves the same default behavior safely.
    """
    if labels is None:
        labels = [-1, 0, 1]
    vectorizer = CountVectorizer(lowercase=False)
    tweets = vectorizer.fit_transform(base['Tweet'])
    tsne = TSNEVisualizer()
    tsne.fit(tweets, labels)
    tsne.poof()
def TSNE_graph(X_train, y_train):
    """Display a t-SNE projection of X_train colored by y_train."""
    visualizer = TSNEVisualizer()
    visualizer.fit(X_train, y_train)
    visualizer.poof()
# NOTE(review): the next two lines close a call whose opening statement lies
# above this chunk — left untouched; cannot verify the full call from here.
    data=data, target=target,
)

# Load the data and create document vectors
corpus = load_corpus('hobbies')
tfidf = TfidfVectorizer()
docs = tfidf.fit_transform(corpus.data)
labels = corpus.target

# Create a visualizer to simply see the vectors plotted in 2D
tsne = TSNEVisualizer()
tsne.fit(docs)
tsne.poof()

# Create a visualizer to see how k-means clustering grouped the docs
from sklearn.cluster import KMeans

clusters = KMeans(n_clusters=5)
clusters.fit(docs)
# Tag each document with its cluster id ("c0".."c4") for the plot legend.
tsne = TSNEVisualizer()
tsne.fit(docs, ["c{}".format(c) for c in clusters.labels_])
tsne.poof()

# Create a visualizer to see how the classes are distributed
# NOTE(review): this visualizer's fit/poof calls continue past this chunk.
tsne = TSNEVisualizer()
def visualise_with_yellowbrick(feature_matrix, labels_tfidf):
    """Plot chat-message clusters with a titled, semi-transparent t-SNE view."""
    viz = TSNEVisualizer(title="Chat Messages Clusters", alpha=0.7)
    # Labels are coerced to an ndarray before fitting.
    viz.fit(feature_matrix, np.array(labels_tfidf))
    viz.finalize()
    viz.poof()