Example #1
0
def tsne_visualization(dataset_object, num_examples):
    """
    Produce and save a T-SNE visualization of feature vectors for a dataset.

    Parameters
    ----------
    dataset_object: tv_net.dataset.Dataset
        dataset object containing feature vectors and class names
    num_examples: int
        number of examples to plot
    """
    # Shuffle first so the sampled subset isn't dominated by a single class.
    dataset_object.shuffle_examples()

    sampled_items = dataset_object.items[:num_examples]
    feature_vectors = np.array([item.feature_vector for item in sampled_items])
    label_list = [item.class_name for item in sampled_items]

    title = 'T-SNE of feature vectors extracted from baseline classifier - using random sample of {} images'.format(
        num_examples)
    visualizer = TSNEVisualizer(colormap='rainbow', title=title)
    visualizer.fit(feature_vectors, label_list)
    output_path = os.path.join(dataset_object.config.OUTPUT_DIR,
                               'visualizations', 'feature_vector_tsne.png')
    visualizer.show(outpath=output_path)
    visualizer.show()  # have to repeat to show and save
Example #2
0
 def clusters_tsne(self, labels: pd.Series, title: str = 'title'):
     """Fit a t-SNE projection of ``self.vectors`` colored by *labels*.

     Returns the enlarged matplotlib figure with *title* as its suptitle.
     """
     visualizer = TSNEVisualizer(random_state=42)
     visualizer.fit(self.vectors, labels)
     figure = visualizer.show().figure
     # Enlarge the default figure so cluster labels stay readable.
     figure.set_figheight(15)
     figure.set_figwidth(15)
     figure.suptitle(title)
     return figure
Example #3
0
def cluster(corpus, k):
    """TF-IDF vectorize (label, text) pairs and show a t-SNE plot.

    `corpus` is an iterable of (label, text) pairs.
    NOTE(review): parameter ``k`` is unused here — presumably an intended
    cluster count; confirm with callers before removing it.
    """
    labels = [pair[0] for pair in corpus]
    texts = [pair[1] for pair in corpus]

    english_stops = list(set(stopwords.words('english')))
    # Domain-specific terms to drop alongside the standard English stopwords.
    trump = [
        'wall', 'president', 'trump', 'loss', 'yes', 'sorry', 'mr', 'build',
        'thank', 'people'
    ]
    stop_list = english_stops + trump

    vectorizer = TfidfVectorizer(stop_words=stop_list)
    vectorizer.fit(texts)
    features = vectorizer.transform(texts)

    visualizer = TSNEVisualizer()
    visualizer.fit(features, labels)
    visualizer.show()
Example #4
0
    def tsne_plot(self, outpath, sample_size=1000, tfidf=True):
        """
        Creates a png file at `outpath` with t-SNE visualization.
        `sample_size` determines the size of the random sample from each label.
        Uses TfidfVectorizer by default;
        if `tfidf` is set to False, CountVectorizer is used.
        -----------------------------------------------------------------------
        More info:
        https://www.scikit-yb.org/en/latest/api/text/tsne.html
        https://lvdmaaten.github.io/tsne/
        """

        if self.tokenizer is None:
            print('No tokenizer was loaded.')
            return None

        # Draw a fixed-seed sample of `sample_size` rows for each label.
        # pd.concat replaces DataFrame.append, which was deprecated in
        # pandas 1.4 and removed in pandas 2.0.
        samples = [
            self.data
                .query("Label == @label")
                .sample(sample_size, random_state=19)
            for label in self.labels
        ]
        df = pd.concat(samples, ignore_index=True)

        # vectorize
        if tfidf:
            vectorizer = TfidfVectorizer(tokenizer=self.tokenizer.tokenize)
        else:
            vectorizer = CountVectorizer(tokenizer=self.tokenizer.tokenize)
        X = vectorizer.fit_transform(df.Text)
        y = df.Label

        # create the visualizer and draw the vectors
        tsne = TSNEVisualizer()
        tsne.fit(X, y)
        tsne.show(outpath=outpath)

        return None
Example #5
0
#!/usr/bin/env python3

import pickle
from yellowbrick.text import TSNEVisualizer

# NOTE(review): pickle.load assumes these files are trusted — unpickling
# untrusted data can execute arbitrary code.
with open('data/agorb.csv', 'rb') as file:
    agora = pickle.load(file)

# Pre-computed TF-IDF vectors and their category labels.
with open('data/tno/tfidf_vectors_webiq.pkl', 'rb') as file:
    X = pickle.load(file)

with open('data/tno/categorieen.pkl', 'rb') as file:
    c = pickle.load(file)

# Fit and display the t-SNE projection of the vectors, colored by category.
visualizer = TSNEVisualizer()
visualizer.fit(X, c)
visualizer.show()