def tsne_visualization(dataset_object, num_examples):
    """Produce and save a t-SNE visualization of feature vectors for a dataset.

    Parameters
    ----------
    dataset_object: tv_net.dataset.Dataset
        dataset object containing feature vectors and class names
    num_examples: int
        number of examples to plot
    """
    # Shuffle so the leading slice is not dominated by a single class.
    dataset_object.shuffle_examples()

    # Slice once and reuse, rather than re-slicing for vectors and labels.
    sample = dataset_object.items[:num_examples]
    feature_vectors = np.array([item.feature_vector for item in sample])
    label_list = [item.class_name for item in sample]

    title = 'T-SNE of feature vectors extracted from baseline classifier - using random sample of {} images'.format(
        num_examples)
    tsne = TSNEVisualizer(colormap='rainbow', title=title)
    tsne.fit(feature_vectors, label_list)

    # Create the output directory up front; matplotlib's savefig (used by
    # yellowbrick under the hood) raises FileNotFoundError if it is missing.
    output_dir = os.path.join(dataset_object.config.OUTPUT_DIR, 'visualizations')
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, 'feature_vector_tsne.png')

    tsne.show(outpath=output_path)
    tsne.show()  # have to repeat to show and save
def clusters_tsne(self, labels: pd.Series, title: str = 'title'):
    """Draw a t-SNE scatter of ``self.vectors`` colored by *labels*.

    Parameters
    ----------
    labels: pd.Series
        one cluster label per row of ``self.vectors``
    title: str
        figure-level title placed via ``suptitle``

    Returns
    -------
    matplotlib.figure.Figure
        the rendered figure, resized to 15x15 inches
    """
    visualizer = TSNEVisualizer(random_state=42)
    visualizer.fit(self.vectors, labels)

    figure = visualizer.show().figure
    figure.set_figwidth(15)
    figure.set_figheight(15)
    figure.suptitle(title)
    return figure
def cluster(corpus, k):
    """TF-IDF vectorize (label, text) pairs and show a t-SNE visualization.

    ``corpus`` is an iterable of ``(label, text)`` pairs. NOTE(review): the
    ``k`` parameter is accepted but never used by this function.
    """
    labels = [pair[0] for pair in corpus]
    documents = [pair[1] for pair in corpus]

    # Extend the standard English stopword list with domain-specific terms.
    domain_terms = [
        'wall', 'president', 'trump', 'loss', 'yes', 'sorry', 'mr',
        'build', 'thank', 'people'
    ]
    stop_words = list(set(stopwords.words('english'))) + domain_terms

    vectorizer = TfidfVectorizer(stop_words=stop_words)
    vectorizer.fit(documents)
    features = vectorizer.transform(documents)

    visualizer = TSNEVisualizer()
    visualizer.fit(features, labels)
    visualizer.show()
def tsne_plot(self, outpath, sample_size=1000, tfidf=True):
    """
    Creates a png file at `outpath` with t-SNE visualization.

    `sample_size` determines the size of the random sample from each label.
    Uses TfidfVectorizer by default; if `tfidf` is set to False,
    CountVectorizer is used.
    -----------------------------------------------------------------------
    More info:
    https://www.scikit-yb.org/en/latest/api/text/tsne.html
    https://lvdmaaten.github.io/tsne/
    """
    if self.tokenizer is None:
        print('No tokenizer was loaded.')
        return None

    # Draw a fixed-size random sample for each label. Collect the samples
    # and concatenate once: DataFrame.append was removed in pandas 2.0, and
    # appending inside a loop re-copies the frame on every iteration.
    samples = [
        self.data.query("Label == @label").sample(sample_size, random_state=19)
        for label in self.labels
    ]
    if samples:
        df = pd.concat(samples, ignore_index=True)
    else:
        # Preserve original behavior with no labels: an empty frame with
        # the source columns (pd.concat would raise on an empty list).
        df = pd.DataFrame(columns=self.data.columns)

    # vectorize
    if tfidf:
        vectorizer = TfidfVectorizer(tokenizer=self.tokenizer.tokenize)
    else:
        vectorizer = CountVectorizer(tokenizer=self.tokenizer.tokenize)
    X = vectorizer.fit_transform(df.Text)
    y = df.Label

    # create the visualizer and draw the vectors
    tsne = TSNEVisualizer()
    tsne.fit(X, y)
    tsne.show(outpath=outpath)
    return None
#!/usr/bin/env python3
"""Load pickled vectors and category labels, then display a t-SNE plot."""
import pickle

from yellowbrick.text import TSNEVisualizer

# NOTE(review): despite the .csv extension this file is read as a pickle —
# confirm the file really contains pickled data.
with open('data/agorb.csv', 'rb') as fh:
    agora = pickle.load(fh)

# TF-IDF feature matrix.
with open('data/tno/tfidf_vectors_webiq.pkl', 'rb') as fh:
    X = pickle.load(fh)

# Category labels, one per row of X.
with open('data/tno/categorieen.pkl', 'rb') as fh:
    c = pickle.load(fh)

tsne = TSNEVisualizer()
tsne.fit(X, c)
tsne.show()