def generate_vectors(json_input_filename, w2v_dim, perplexity, theta, pca_dims, dim=2):
    vectors = []
    most_dominant_labels = []
    image_ids = []
    label_map = utils.load_json(json_input_filename, w2v_dim)
    for image_id, label in label_map.items():
        label_vectors = []
        label_scores = []
        label_desc = []
        for val in label:
            label_vectors.append(val['word2vec'])
            label_scores.append(val['score'])
            label_desc.append(str(''.join(c for c in val['description'] if c in string.printable)))
        output_vec = word2vec.linear_combination_vectors(vectors=label_vectors, coefficients=label_scores)

        vectors.append(output_vec)
        most_dominant_labels.append(label_desc[0])
        image_ids.append(image_id)

    embeddings = []
    for result in bh_tsne(vectors,
                          perplexity=perplexity,
                          initial_dims=pca_dims,
                          theta=theta,
                          no_dims=dim):
        embeddings.append(result)

    embeddings = utils.scale_max_abs(embeddings)
    return embeddings, most_dominant_labels, image_ids
Example #2
def main():
    docs = get_text()  # list of READMEs and descriptions
    docs_preprocess = [preprocess(doc) for doc in docs]  # stem each document
    tfidf_matrix = tfidf_vectorizer(
        docs_preprocess)  # convert to tf-idf matrix
    svd_vect = svd_vectorizer(tfidf_matrix, n_components=200,
                              n_iter=150)  # reduce dimensions
    # Run t-distributed Stochastic Neighbor Embedding (t-SNE; Barnes-Hut implementation)
    # Timings: sklearn - 1k: 15.9195120335, 2k - 41.7645118237, 4k - 185.737361908
    #          t-sne - 1k: 15.7083182335, 2k - 38.8270409107, 4k - 78.0439789295
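    # For reference, a hedged sketch of the scikit-learn call behind the "sklearn"
    # timings above (the actual benchmark code is not part of this example):
    #   from sklearn.manifold import TSNE
    #   embedded_sklearn = TSNE(n_components=2, perplexity=40).fit_transform(svd_vect)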
    embedded = []
    for res in bh_tsne(svd_vect, no_dims=2, perplexity=40, verbose=True):
        embedded.append(res)
    embedded = np.array(embedded)
    # We can use this as input to identify clusters of projects
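    # A minimal sketch of that clustering step (not part of the original example;
    # KMeans and the cluster count here are assumptions):
    #   from sklearn.cluster import KMeans
    #   project_clusters = KMeans(n_clusters=10).fit_predict(embedded)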

    # Plot t-SNE
    fig, ax = plt.subplots(figsize=(10, 10))
    plt.setp(ax, xticks=(), yticks=())
    fig.subplots_adjust(left=0.0,
                        bottom=0.0,
                        right=1.0,
                        top=0.9,
                        wspace=0.0,
                        hspace=0.0)
    ax.scatter(embedded[:, 0], embedded[:, 1], marker='x')
    #        c=newsgroups.target, marker="x")
    #    fig.savefig('tsne.pdf', format = 'pdf')
    plt.show()
Example #3
def run_bhtsne(data_set, theta=0.5, perplexity=50):
    """ Runs the bh-tsne on the given data

            :type data_set: numpy array
            :param data_set: Numpy array on which bh-tsne shall be run

            :type theta: float
            :param theta: Specifies the theta parameter

            :type perplexity: int
            :param perplexity: Specifies the perplexity
            """

    n = data_set.shape[0]
    print('Running Barnes-Hut t-SNE on %d data points...' % n)
    data_bhtsne = np.zeros((n, 2))

    for dat, temp in zip(
            bh_tsne(np.copy(data_set), theta=theta, perplexity=perplexity),
            data_bhtsne):
        temp[...] = dat

    print('\nNormalizing...')
    min_vals = np.min(data_bhtsne, axis=0)
    data_bhtsne = data_bhtsne - min_vals
    max_vals = np.max(data_bhtsne, axis=0)
    data_bhtsne = data_bhtsne / max_vals

    return data_bhtsne
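
A minimal usage sketch for run_bhtsne above; the random input array, its shape, and the parameter values are illustrative assumptions rather than part of the original example:

    import numpy as np
    X = np.random.rand(500, 64)  # 500 points, 64 features (arbitrary)
    embedding = run_bhtsne(X, theta=0.5, perplexity=30)
    print(embedding.shape)       # (500, 2); each axis normalized to [0, 1]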
Example #4
def LDA_run():
    global g_lda, g_vec, g_coords, g_clust
    articles = load_data_folder(text_dir)
    #articles = {key:bag_of_wordify(articles[key]) for key in articles}
    lda_keys = list(articles.keys())
    corpus = [bag_of_wordify(articles[key]) for key in articles]
    wdict = corpora.Dictionary(corpus)
    bow_corpus = [wdict.doc2bow(text) for text in corpus]
    tfidf = models.tfidfmodel.TfidfModel(bow_corpus, normalize=True)
    tfidf_corpus = [tfidf[doc] for doc in bow_corpus]
    NUM_TOPICS = 100

    printl("Training LDA model")

    lda_model = LDA_train(wdict, articles, tfidf_corpus, NUM_TOPICS)
    print("Converting to vector representation")
    wordvec = LDA2Vec(lda_model, tfidf_corpus)
    g_vec = wordvec

    printl("running tsne")

    coords = [coord for coord in bhtsne.bh_tsne(wordvec)]
    print("running kmeans")
    if classes is None:
        clusters = kmeans_clusters(list(articles.keys()), wordvec)
    else:
        # Avoid kmeans if classes are already provided
        clusters = classes
    output_write(list(articles.keys()), coords, clusters)
    return lda_model
Example #5
def analyze(path, h5_file):
    category = os.path.basename(h5_file.replace('.h5', ''))
    print('Processing category {}'.format(category))
    data = pd.read_hdf(h5_file, 'data')
    try:
        tsne = np.array([y for y in bh_tsne(np.vstack(data.state))])
        plt.scatter(tsne[:, 0], tsne[:, 1])
        plt.title(category)
        plt.savefig('{}/{}.png'.format(path, category), dpi=300)
    except Exception as e:
        print(e)
Example #6
def word2vec_run():
    raw_art = load_data_folder(text_dir)
    s = []
    for sentence in gen_load(text_dir):
        s += [sentence]
    W2V = models.Word2Vec
    # All this should be configurable
    w2v = W2V(s, workers=4, window=5, min_count=3, size=WORDVEC_SIZE)
    wordvec = word2vectorize(w2v, raw_art)
    coords = [coord for coord in bhtsne.bh_tsne(wordvec)]
    clusters = kmeans_clusters(list(raw_art.keys()), wordvec)
    output_write(list(raw_art.keys()), coords, clusters)
    return coords
Example #7
def plot_matrix(A, title=None, labels=None, vocab=None, fig=None):
    # cmap = [[1, 0, 0], [0, 1, 0], [0, 0, 1], [0.25, 0.75, 0], [0.25, 0, 0.75], [0, 0.5, 0.5], [0.75, 0.25, 0], [0.75, 0, 0.25], [0, 0.75, 0.25], [0, 0.25, 0.75]]
    cmap = [
        [1, 0, 0],
        [0, 1, 0],
        [0, 0, 1],
        [1, 1, 0],
        [0, 1, 1],
    ]  # , [1, 0, 1], [1, 0.5, 0], [0.5, 0, 1], [0.5, 1, 0], [0.98, 0.39, 0]]
    cl = len(cmap)
    markers = ["o", "d", ">", (5, 1)]
    ml = len(markers)
    if not vocab:
        vocab = range(A.shape[0])
    res = np.array([x for x in bh_tsne(A, verbose=True)])
    if not fig:
        plt.figure()
    else:
        plt.figure(fig)
    if title:
        plt.title(title)
    # if all(labels) != None:
    #    plt.scatter(res[:, 0], res[:, 1], s=20, c=labels, alpha=0.5)
    # else:
    #    plt.scatter(res[:, 0], res[:, 1], s=20, alpha=0.5)
    for col in range(A.shape[1]):
        top_word = np.argmax(A[:, col])
        mk = (col // cl) % ml
        colors = np.zeros((A.shape[0], 4))
        colors[:, 0] = cmap[col % cl][0]
        colors[:, 1] = cmap[col % cl][1]
        colors[:, 2] = cmap[col % cl][2]
        colors[:, -1] = A[:, col] / A[top_word, col]
        plt.scatter(res[:, 0], res[:, 1], c=colors, marker=markers[mk], s=30, edgecolor="none")
        plt.scatter(
            res[top_word, 0],
            res[top_word, 1],
            c=cmap[col % cl],
            marker=markers[mk],
            s=30,
            edgecolor="none",
            label=u"тема #" + str(col),
        )
    if vocab is not None:
        af = AnnoteFinder(res[:, 0], res[:, 1], vocab, xtol=0.1, ytol=0.1)
        plt.connect("button_press_event", af)
    plt.legend(scatterpoints=1, loc="best", ncol=3, fontsize=9)
    plt.draw()
    return res
Example #8
def train(dataset='mnist.pkl.gz'):
    dataset = load_data(dataset)
    data = dataset[0][0].astype('float64')

    start_time = timeit.default_timer()

    results = np.zeros((data.shape[0], 2))
    print('... training Barnes-Hut t-SNE')
    for res, save in zip(bh_tsne(np.copy(data), theta=0.5), results):
        save[...] = res

    end_time = timeit.default_timer()
    print(('The code for file  ' + os.path.split(__file__)[1] + ' ran for %.2fs' % (end_time - start_time)), file=sys.stderr)

    with open(os.path.join(os.path.split(__file__)[0], 'data.pkl'), 'wb') as f:
        pickle.dump(results, f)

    results = results - np.min(results, axis=0)
    results = results / np.max(results, axis=0)
Example #11
def reduce_tsne(D, to_dim=2):
    print('Reducing with t-SNE')
    return array([x for x in bh_tsne(D, verbose=True)])
Example #12
def tsne(biembedsfn):
    data = get_data(biembedsfn)
    bhtsne.bh_tsne(data, no_dims=2)
Example #13
words = [l[1] for l in lines]
ids = [l[0] for l in lines]
del lines
import gc
gc.collect()

##############################################################
# tsne
import sys
sys.path.append('/home/ycao/third_party_src/bhtsne')
from bhtsne import bh_tsne

num_dims = 2
pca_dims = 50
perplexity = 50
theta = .5

tsne_out = list(bh_tsne(X, num_dims, pca_dims, perplexity, theta,
                        verbose=True))

joblib.dump({
    'tsne': tsne_out,
    'words': words,
    'ids': ids
},
            'proc_dir/tsne_out.pickle',
            compress=9)
import pdb
pdb.set_trace()
Example #14
import bhtsne as bh
import numpy as np
from sklearn import datasets
import matplotlib.pyplot as plt

iris = datasets.load_iris()
res = bh.bh_tsne(samples=iris['data'], perplexity=5, theta=0.15, verbose=True)
fctr = list(res)[0]
z = [y for (x, y) in fctr]
z = np.asarray(z)
z -= z.min(axis=0)
z /= z.max(axis=0)

plt.scatter(z[:, 0], z[:, 1])
for label, x, y in zip(iris['target'], z[:, 0], z[:, 1]):
    plt.annotate(label,
                 xy=(x, y),
                 xytext=(-10, 10),
                 textcoords='offset points',
                 ha='right',
                 va='bottom',
                 bbox=dict(boxstyle='round,pad=0.15', fc='yellow', alpha=0.3),
                 arrowprops=dict(arrowstyle='->',
                                 connectionstyle='arc3,rad=0'))
plt.show()
Example #15
        plt.title(category)
        plt.savefig('{}/{}.png'.format(path, category), dpi=300)
    except Exception as e:
        print(e)

files = glob.glob('{}/*.h5'.format(args.data_folder))

if args.all:
    data = []
    labels = []
    for i, h5_file in enumerate(files):
        _data = pd.read_hdf(h5_file, 'data').state[:args.n]
        data.extend(_data)
        labels.extend([i]*len(_data))

    if args.components:
        pca = PCA(n_components=args.components)
        X = pca.fit_transform(np.vstack(data))
    else:
        X = np.vstack(data)
        
    tsne = np.array([y for y in bh_tsne(X)])  # TODO: tell bh_tsne not to apply PCA

    plt.scatter(tsne[:,0], tsne[:,1], c=labels)
    plt.savefig('{}/global_{}_components.png'.format(args.png_folder, args.components), dpi=300)
    
else:
    par_analyze = partial(analyze, args.png_folder)
    pool = mp.Pool()
    pool.map(par_analyze, files)