def save_embed_plot(X, labels, fname):
    """Embed ``X`` with ``tsne`` and save an annotated scatter plot.

    Every row of the embedding is drawn as a point and captioned with the
    matching entry of ``labels`` (text size 5, placed directly on the
    point).  The figure is written to ``fname`` at 1200 dpi.

    The ``tsne`` call passes ``2``, the module-level ``word_vector_dim`` and
    ``20.0`` positionally -- presumably output dims, initial dims and
    perplexity; TODO confirm against the ``tsne`` signature in use.
    """
    embedding = tsne(X, 2, word_vector_dim, 20.0)
    xs = embedding[:, 0]
    ys = embedding[:, 1]

    figure = Plot.figure()
    Plot.scatter(xs, ys, 1)
    for caption, px, py in zip(labels, xs, ys):
        Plot.annotate(
            caption,
            xy=(px, py),
            xytext=(0, 0),
            textcoords='offset points',
            size=5,
        )
    figure.savefig(fname, dpi=1200)
def visualizeLatentState(X, rs, gen_params, rec_params):
    """Plot a joint t-SNE embedding of inputs and their decoded samples.

    ``X`` is encoded with ``nn_predict_gaussian``, a latent is sampled with
    ``sample_diag_gaussian``, and the latent is decoded through
    ``sigmoid(neural_net_predict(...))``.  Only the first half of the decoder
    output columns is kept.  The inputs and the decodings (scaled by 10) are
    stacked, embedded with ``tsne`` and saved as a red/blue scatter plot in
    ``hidden.jpg``.

    Args:
        X: input rows; must have as many columns as the kept half of the
            decoder output, because both are stacked with ``np.vstack``.
        rs: forwarded to ``sample_diag_gaussian`` (presumably a random
            state -- confirm against that helper).
        gen_params: forwarded to ``neural_net_predict`` (decoder).
        rec_params: forwarded to ``nn_predict_gaussian`` (encoder).
    """
    q_means, q_log_stds = nn_predict_gaussian(rec_params, X)
    latents = sample_diag_gaussian(q_means, q_log_stds, rs)
    gen = sigmoid(neural_net_predict(gen_params, latents))

    # Floor division keeps the slice bound an int.  Plain `/` yields a float
    # on Python 3 (and on Python 2 with `from __future__ import division`),
    # which raises TypeError when used as a slice index.
    gen = gen[:, :gen.shape[1] // 2]
    print(gen.shape)
    print(X.shape)

    y = tsne(np.vstack((X, gen * 10)))

    # X was stacked first, so its row count is the boundary between the two
    # groups in the embedding.
    n_inputs = X.shape[0]
    plt.figure()
    plt.clf()
    plt.scatter(y[:n_inputs, 0], y[:n_inputs, 1], color='red')
    plt.scatter(y[n_inputs:, 0], y[n_inputs:, 1], color='blue')
    plt.legend(['X', 'Xdecoded'],)
    plt.savefig('hidden.jpg')
def visualize_codes(net, dataloader=test_loader, batches=4):
    """Encode a few batches, embed the codes with t-SNE and plot them by class.

    Args:
        net: model exposing ``encode(batch)``.
        dataloader: iterable yielding ``(inputs, labels)`` pairs; defaults to
            the module-level ``test_loader``.
        batches: maximum number of batches to draw from ``dataloader``.

    Returns:
        ``(X, Y, GT)``: the concatenated codes, their 2-D t-SNE embedding and
        the concatenated ground-truth labels.
    """
    # One legend entry per label id 0..9, in label order (the names look like
    # the Fashion-MNIST classes -- confirm against the dataset in use).
    class_names = (
        'tops', 'trousers', 'pullovers', 'dresses', 'coats',
        'sandals', 'shirts', 'sneakers', 'bags', 'boots',
    )

    codes = []
    truths = []
    # Iterate the loader once.  Calling next(iter(dataloader)) on every pass
    # creates a fresh iterator each time, so a sequential loader would hand
    # back its first batch over and over.  zip() with range() stops after
    # `batches` batches, or earlier if the loader runs out.
    for _, (to_encode, truth) in zip(range(batches), dataloader):
        truths.append(truth.numpy())
        encoded = net.encode(Variable(to_encode))
        codes.append(encoded.data.numpy())

    X = np.concatenate(codes, axis=0)
    GT = np.concatenate(truths, axis=0)
    Y = tsne(X, no_dims=2, initial_dims=8)

    for class_index, class_name in enumerate(class_names):
        points = Y[np.where(GT == class_index)]
        plt.scatter(points[:, 0], points[:, 1], label=class_name)

    plt.title('visualization of codes')
    plt.legend()
    plt.show()
    return X, Y, GT
def plot_clusters(matrix, listy, no_dims=2, initial_dims=100, perplexity=10):
    """Draw a t-SNE scatter plot of ``matrix`` with one callout per point.

    A new figure is opened, ``matrix`` is embedded with ``tsne`` using the
    given ``no_dims`` / ``initial_dims`` / ``perplexity``, and the first two
    embedding columns are plotted.  Each point gets a yellow rounded callout
    with an arrow, captioned with the matching entry of ``listy``.  Axis
    ticks are removed.  Nothing is returned and the figure is not shown or
    saved here.
    """
    plt.figure()
    embedding = tsne(
        matrix,
        no_dims=no_dims,
        initial_dims=initial_dims,
        perplexity=perplexity,
    )
    xs = embedding[:, 0]
    ys = embedding[:, 1]
    plt.scatter(xs, ys)

    for caption, px, py in zip(listy, xs, ys):
        # Fresh style dicts per call, exactly as the keyword form builds them.
        box_style = dict(boxstyle='round,pad=0.5', fc='yellow', alpha=0.5)
        arrow_style = dict(arrowstyle='->', connectionstyle='arc3,rad=0')
        plt.annotate(
            caption,
            xy=(px, py),
            xytext=(-20, 20),
            textcoords='offset points',
            ha='right',
            va='bottom',
            bbox=box_style,
            arrowprops=arrow_style,
            fontsize='x-large',
        )

    current_axes = plt.gca()
    current_axes.axes.get_xaxis().set_ticks([])
    current_axes.axes.get_yaxis().set_ticks([])
''' plt.figure() print final_lang.shape X = pca(cosangles) plt.scatter(X[:,0],X[:,1])#,len(languages),np.r_[1:len(languages)]) for label, x, y in zip(languages, X[:, 0], X[:, 1]): plt.annotate( label, xy = (x, y), xytext = (-20, 20), textcoords = 'offset points', ha = 'right', va = 'bottom', bbox = dict(boxstyle = 'round,pad=0.5', fc = 'yellow', alpha = 0.5), arrowprops = dict(arrowstyle = '->', connectionstyle = 'arc3,rad=0')) ''' # plot language points plt.figure() Y = tsne(cosangles,no_dims=2,initial_dims=100,perplexity=8) plt.scatter(Y[:,0],Y[:,1])#,len(languages),np.r_[1:len(languages)]) for label, x, y in zip(languages, Y[:, 0], Y[:, 1]): plt.annotate( label, xy = (x, y), xytext = (-20, 20), textcoords = 'offset points', ha = 'right', va = 'bottom', bbox = dict(boxstyle = 'round,pad=0.5', fc = 'yellow', alpha = 0.5), arrowprops = dict(arrowstyle = '->', connectionstyle = 'arc3,rad=0'), fontsize='x-large') frame = plt.gca() frame.axes.get_xaxis().set_ticks([]) frame.axes.get_yaxis().set_ticks([]) plt.show()
from sklearn.datasets import load_breast_cancer
from tsne import *
import pandas as pd
from pylab import *
import seaborn as sns
from functools import reduce
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# Load the scikit-learn breast-cancer dataset: features, targets, class names.
data = load_breast_cancer()
x, y, label_names = data['data'], data['target'], data['target_names']
# Rescale the features with a MinMaxScaler fitted on the full dataset.
scaler = MinMaxScaler()
scaler.fit(x)
x = scaler.transform(x)
# Called for its side effects only; the return value is discarded.
# NOTE(review): this `tsne` takes (x, y, label_names=...), presumably to draw
# a labelled embedding -- confirm against the local tsne module.
tsne(x, y, label_names=label_names)
# Hold out 10% of the rows; the targets are reshaped into a single column.
trainx, testx, trainy, testy = train_test_split(x, reshape(y, [-1, 1]), test_size=0.1)


def variance(x, u):
    """Return ``(x - u).dot(x - u)`` transposed, with ``x`` reshaped to one column.

    NOTE(review): the callers visible here pass one sample row as ``x`` and a
    class mean as ``u``; the resulting shape depends on how ``x - u``
    broadcasts -- confirm it is the intended scatter term.
    """
    x = reshape(x, [-1, 1])
    return (x - u).dot(x - u).T


def lda2(x, y):
    # Split the samples by the label held in y's single column
    # (assumes y is shaped (n, 1), as produced by the reshape above -- confirm).
    x0, x1 = mat(x[y.T[0] == 0]), mat(x[y.T[0] == 1])
    # Per-class mean over rows.
    u0, u1 = x0.mean(axis=0), x1.mean(axis=0)
    # Per-class sum of variance(sample, class_mean) over all samples of the class.
    sigma0, sigma1 = reduce(lambda x, y: x + y, [variance(i, u0) for i in x0]), reduce(lambda x, y: x + y, [variance(i, u1) for i in x1])
vectors.append(vec[vocab_index[u]]) if y==1 and yp==1: color.append(1) elif y==1 and yp!=1: color.append(2) elif y!=1 and yp==1: color.append(3) else: color.append(4) count+=1 if count==vec_limit: break return numpy.array(vectors), color, tag, prec def save_embed_plot((X,color,tag,prec),fname): Y = tsne(X, no_dims = 2, initial_dims = 50, perplexity = 30.0) with open("/mnt/filer01/word2vec/degree_distribution/adopter_pred_files/single_topic_vis/"+fname+".pickle","wb") as fd: pickle.dump(Y,fd) fig = Plot.figure() init = [] tp = [] fn = [] fp = [] tn = [] for i,c in enumerate(color): if c==0: init.append(i) elif c==1: tp.append(i) elif c==2: fn.append(i)
# Set visibility of most, least and mid frequency hashtags by setting text size.
def get_tag_size_label(tlist):
    """Return per-tag text sizes and labels, aligned with ``tag_labels``.

    For every entry of the module-level ``tag_labels``: if it is in
    ``tlist`` it gets size 2 and its latin-1 decoded text as label,
    otherwise size 0 and an empty label (which hides it in the plot).

    Returns:
        ``(size, label)`` where ``size`` is a list and ``label`` an array.

    NOTE(review): ``t.decode`` requires byte strings; on Python 3 this only
    works if ``tag_labels`` holds ``bytes`` -- confirm.
    """
    size = []
    label = []
    for t in tag_labels:
        if t in tlist:
            size.append(2)
            label.append(t.decode('latin-1'))
        else:
            size.append(0)
            label.append('')
    return size, array(label)


# The embedding is computed once at import time and shared by every plot.
X = array(hist_feature)
Y = tsne(X, 2, 50, 30.0)


def save_embed_plot(sizes_and_labels, fname):
    """Save a scatter plot of the module-level embedding ``Y`` to ``fname``.

    Args:
        sizes_and_labels: a ``(tag_sizes, labels)`` pair, e.g. the return
            value of ``get_tag_size_label``.  It is still passed as a single
            positional argument; it is unpacked in the body because tuple
            parameters in a ``def`` are a syntax error on Python 3.
        fname: output path handed to ``fig.savefig``.
    """
    tag_sizes, labels = sizes_and_labels
    fig = Plot.figure()
    # Marker size 0: only the text annotations are visible.
    Plot.scatter(Y[:, 0], Y[:, 1], 0)
    for label, x, y, s in zip(labels, Y[:, 0], Y[:, 1], tag_sizes):
        Plot.annotate(label, xy=(x, y), xytext=(0, 0), textcoords='offset points', size=s)
    Plot.axis('off')
    fig.savefig(fname, dpi=800, bbox_inches='tight')
Plot.scatter(Y_tn[:, 0], Y_tn[:, 1], s=10, c='c', alpha=0.4, label='true negatives', edgecolor='none') Plot.axis('off') Plot.legend(prop={'size': 8}) Plot.title('#' + tag + ', P@100: ' + str(prec) + ', ' + clf) fig.savefig(fname + '.png', dpi=400, bbox_inches='tight') if __name__ == "__main__": print "Run Y = tsne.tsne(X, no_dims, perplexity) to perform t-SNE on your dataset." for i in range(0, train_ex_limit): X, color, color_log, tag, prec, prec_log = get_user_vectors(i) if use_tsne == True: Y = tsne(X, no_dims=2, initial_dims=50, perplexity=30.0) else: Y = pca(X, no_dims=2) # with open("/mnt/filer01/word2vec/degree_distribution/adopter_pred_files/single_topic_vis/"+fname+".pickle","wb") as fd: # pickle.dump(Y,fd) save_embed_plot(Y, color, tag, prec, 'RF', 'embed_adopters_topic_rf' + str(i)) save_embed_plot(Y, color_log, tag, prec_log, 'LR', 'embed_adopters_topic_lr' + str(i)) #cc 0.0589, candidate set recall 280 out of 4751 cand size 6312 #cc 0.219, candidate set recall 516 out of 2347 cand size 4702 #cc 0.56, candidate set recall 658 out of 1162 cand size 4075
for t in stasks: dstask[ds][t] = ctr print '%d: ds%03d task%03d' % (ctr, ds, t) ctr = ctr + 1 # make colormap cmap = {} ctr = 0 for i in list(s): cmap[i] = ctr ctr += 1 colors = [cmap[i] for i in copedata[:, 0]] X = X[usedata == 1, :] t = tsne(X, no_dims=2, initial_dims=15, perplexity=10.0, max_iter=1000) plt.clf() plt.scatter(t[:, 0], t[:, 1], s=0) # create axes f = open(basedir + 'tasklabels.txt', 'w') for i in range(len(t)): x, y = t[i, :] plt.text(x, y, '%d' % dstask[copedata[i, 0]][copedata[i, 1]]) #,color=colors[i]) f.write('%d\n' % dstask[copedata[i, 0]][copedata[i, 1]]) f.close() # print legend: plt.savefig(basedir + 'tsne_fig.pdf', format='pdf')
The perplexity is 2 to the entropy of the probability distribution. It measures how many neighbors each data point will be connected to. When I raise the perplexity the images have more clusters. Theta measures the accuracy of the algorithm. It is the angle the data points are to each other. Large theta speeds up the algorithm but reduces the accuracy and small theta slows down the algorithm but increases the accuracy. """
#read all the classified files into a list
#not only read the classified files into a list, but also keep it open for appending
#randomarray=np.random.random(255, size=(1000, 784))
# Embed a 1000 x 784 array of uniform random values with bh_sne and scale the
# result by 10.  This runs at import time, before the __main__ guard.
randomarray=np.random.random((1000, 784))
coordinates = bh_sne(randomarray, perplexity = 30, theta = .1) * 10
print coordinates
if __name__ == '__main__':
    # Send log records to stdout.
    streamhandler = logging.StreamHandler(sys.stdout)
    # NOTE(review): in the standard logging module DEBUG is 10 and INFO is 20,
    # so these two branches map 10 -> INFO and 20 -> DEBUG, the reverse of the
    # numeric levels.  Confirm this is what --logging_level is meant to do.
    if args.logging_level==10:
        streamhandler.setLevel(logging.INFO)
        log.setLevel(logging.INFO)
    if args.logging_level==20:
        streamhandler.setLevel(logging.DEBUG)
        log.setLevel(logging.DEBUG)
    # NOTE(review): this file handler is created (opening the file "logging")
    # but in the code visible here it is never given the formatter nor added
    # to `log`.
    filehandler = logging.FileHandler("logging")
    #formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    formatter = logging.Formatter("%(asctime)s [%(threadName)-12.12s] [%(levelname)-5.5s] %(message)s")
    streamhandler.setFormatter(formatter)
    log.addHandler(streamhandler)
    # `tsne` is defined outside this excerpt and is called without arguments.
    tsne()
def draw(learning_rate, n_examples, repeats):
    """Compute t-SNE embeddings of hidden features from saved checkpoints.

    Builds six learners (two ``CrossPropAlt`` variants and four
    ``BackPropClissification`` variants), restores the checkpoint of every
    stage in ``candidate_stages``, evaluates each learner's ``feature``
    tensor on the first ``n_examples`` training rows, and runs ``tsne`` on
    the features of the learners listed in ``candidate_methods``.  The
    resulting ``{(stage, method_ind): (embedding, class_index)}`` dict is
    pickled to ``tmp/tsne_dim_<target_dim>.bin``.  Nothing is returned.

    Args:
        learning_rate: passed to every learner and optimizer constructor.
        n_examples: number of leading rows of ``train_x_total`` /
            ``train_y_total`` to use.
        repeats: number of shuffle-and-embed passes per stage.

    Relies on module-level names not defined in this function (``dim_in``,
    ``dim_hidden``, ``dim_out``, ``train_x_total``, ``train_y_total``,
    ``tag``, ``logger``, ``tsne``, ...).

    NOTE(review): ``tsne_data`` is keyed by ``(stage, method_ind)`` only, so
    with ``repeats > 1`` each pass overwrites the result of the previous one.
    NOTE(review): rows are selected with ``np.arange(2500)``, which assumes
    ``n_examples >= 2500`` -- confirm.
    """
    gate = Tanh()
    runs = 1
    cp_alt = CrossPropAlt(dim_in, dim_hidden, dim_out, learning_rate, gate,
                          output_layer='CE', lam=0, name='cp')
    cp_alt_lam = CrossPropAlt(dim_in, dim_hidden, dim_out, learning_rate, gate,
                              output_layer='CE', lam=0.5, name='cp-lam')
    bp = BackPropClissification(dim_in, dim_hidden, dim_out, learning_rate, gate, name='bp')
    bp_mom = BackPropClissification(dim_in, dim_hidden, dim_out, learning_rate, gate, name='bp-mom',
                                    optimizer=tf.train.MomentumOptimizer(
                                        learning_rate=learning_rate, momentum=0.9))
    bp_adam = BackPropClissification(
        dim_in, dim_hidden, dim_out, learning_rate, gate, name='bp-adam',
        optimizer=tf.train.AdamOptimizer(learning_rate=learning_rate))
    bp_rms = BackPropClissification(
        dim_in, dim_hidden, dim_out, learning_rate, gate, name='bp-rms',
        optimizer=tf.train.RMSPropOptimizer(learning_rate=learning_rate))
    # Index order matters: candidate_methods below refers to positions here
    # (0 is cp_alt, 2 is bp).
    methods = [cp_alt, cp_alt_lam, bp, bp_adam, bp_rms, bp_mom]

    for run in range(runs):
        train_x = train_x_total[:n_examples, :]
        train_y = train_y_total[:n_examples, :]
        # y1 and y2 are the targets with their columns rotated left by one
        # and by two positions respectively.
        y0 = train_y
        y1 = np.concatenate([train_y[:, 1:], train_y[:, :1]], 1)
        y2 = np.concatenate([y1[:, 1:], y1[:, :1]], 1)
        # Per-stage data: the inputs are the same for every stage, only the
        # targets change.
        train_xs = [train_x] * 6
        train_ys = [y0, y1, y2, y0, y1, y2]

        # np.random.seed(0)
        # x0 = train_x
        # perm = np.arange(dim_in)
        # np.random.shuffle(perm)
        # x1 = train_x[:, perm]
        # np.random.shuffle(perm)
        # x2 = train_x[:, perm]
        #
        # train_xs = [x0, x1, x2, x0, x1, x2]
        # train_ys = [train_y] * 6
        # features = np.zeros((stages, len(methods), n_examples, dim_hidden))
        saver = tf.train.Saver()
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            figure_index = 0
            candidate_stages = [0, 1, 2, 3]
            candidate_methods = [0, 2]
            target_dim = 2
            tsne_data = dict()
            for stage in candidate_stages:
                # Load the weights saved for this stage.
                saver.restore(sess, 'tmp/saved/ffn_model/%s_stage_%d' % (tag, stage))
                features = np.zeros((len(methods), n_examples, dim_hidden))
                train_x = train_xs[stage]
                train_y = train_ys[stage]
                # Evaluate the feature tensor of every method in batches of
                # 1000 rows and collect the results in `features`.
                batch_size = 1000
                cur_example = 0
                while cur_example < n_examples:
                    logger.info('store features... stage %d, example %d' % (stage, cur_example))
                    end_example = min(n_examples, cur_example + batch_size)
                    for method_ind, method in enumerate(methods):
                        cur_features = sess.run(
                            method.feature,
                            feed_dict={
                                method.x: train_x[cur_example:end_example, :],
                                method.target: train_y[cur_example:end_example, :]
                            })
                        features[method_ind, cur_example:end_example, :] = cur_features
                    cur_example = end_example

                # All 2500 indices are used on every pass, so the shuffle only
                # changes the row order handed to tsne, not the subset.
                sample_indices = np.arange(2500)
                for repeat in range(repeats):
                    np.random.shuffle(sample_indices)
                    for method_ind in candidate_methods:
                        x_to_plot = features[method_ind, sample_indices, :]
                        # Class index = position of the largest target entry.
                        y_to_plot = np.argmax(train_y[sample_indices, :], axis=1)
                        print x_to_plot.shape, y_to_plot.shape
                        # Positional arguments 50 and 20.0 are presumably
                        # initial_dims and perplexity -- confirm against tsne.
                        x_prime = tsne(x_to_plot, target_dim, 50, 20.0)
                        tsne_data[(stage, method_ind)] = (x_prime, y_to_plot)
                        # fig = plt.figure(figure_index)
                        # figure_index += 1
                        # ax = Axes3D(fig)
                        # ax.scatter(x_prime[:, 0], x_prime[:, 1], x_prime[:, 2], c=y_to_plot)
                        # plt.scatter(x_prime[:, 0], x_prime[:, 1], 20, y_to_plot)
                        # plt.title('%s_%s_stage_%d' % (tag, labels[method_ind], stage))
                        # plt.show()
                        # plt.savefig('figure/%s_repeat_%d_%s_stage_%d.png' % (tag, repeat, labels[method_ind], stage))
                        # plt.close()
                        # plt.show()
            # Persist every embedding computed above in one pickle file.
            with open('tmp/tsne_dim_%d.bin' % target_dim, 'wb') as f:
                pickle.dump(tsne_data, f)
from sklearn.datasets import load_breast_cancer
from tsne import *
import pandas as pd
from pylab import *
import seaborn as sns
from functools import reduce
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# Load the scikit-learn breast-cancer dataset: features, targets, class names.
data = load_breast_cancer()
x, y, label_names = data['data'], data['target'], data['target_names']
# Rescale the features with a MinMaxScaler fitted on the full dataset.
scaler = MinMaxScaler()
scaler.fit(x)
x = scaler.transform(x)
# Called for its side effects only; the return value is discarded.
# NOTE(review): this `tsne` takes (x, y, label_names=...), presumably to draw
# a labelled embedding -- confirm against the local tsne module.
tsne(x, y, label_names=label_names)
# Hold out 10% of the rows; the targets are reshaped into a single column.
trainx, testx, trainy, testy = train_test_split(x, reshape(y, [-1, 1]), test_size=0.1)


def variance(x, u):
    """Return ``(x - u).dot(x - u)`` transposed, with ``x`` reshaped to one column.

    NOTE(review): the resulting shape depends on how ``x - u`` broadcasts for
    the ``u`` passed by the caller -- confirm it is the intended scatter term.
    """
    x = reshape(x, [-1, 1])
    return (x - u).dot(x - u).T


def lda2(x, y):
    # Split the samples by the label held in y's single column
    # (assumes y is shaped (n, 1), as produced by the reshape above -- confirm).
    x0, x1 = mat(x[y.T[0] == 0]), mat(x[y.T[0] == 1])
    # Per-class mean over rows.
    u0, u1 = x0.mean(axis=0), x1.mean(axis=0)
import matplotlib.pyplot as plt
import matplotlib.font_manager as mplfont
import os
from sklearn import manifold
from sklearn.metrics import euclidean_distances
from tsne import *

# NOTE(review): `N` is not imported in the lines visible here; presumably
# numpy imported as N earlier in the file -- confirm.
outdir='/corral-repl/utexas/poldracklab/openfmri/analyses/paper_analysis_Dec2012/clustering'
#X=N.loadtxt('/corral-repl/utexas/poldracklab/openfmri/analyses/paper_analysis_Dec2012/ICA/datarun1_icarun1_200comp.txt')
X=N.load('/corral-repl/utexas/poldracklab/openfmri/analyses/paper_analysis_Dec2012/data_prep/zstat_run1.npy')
#clf = manifold.MDS(n_components=2, n_init=1, max_iter=1000)
#t=clf.fit_transform(euclidean_distances(X))
# Embed X in 2-D with t-SNE; the MDS alternative above is disabled.
t=tsne(X,no_dims=2, initial_dims=30,perplexity=10.0, max_iter=1000)
# First column of the key file is used as the task number of each row
# (presumably row-aligned with X -- confirm).
taskinfo=N.loadtxt('/corral-repl/utexas/poldracklab/openfmri/analyses/paper_analysis_Dec2012/data_prep/data_key_run1.txt')
tasknums=N.unique(taskinfo[:,0])
# compute scatter for each task
t_eucdist={}   # task number -> mean distance of its points to their centroid
mean_t_obs={}  # task number -> centroid of its points in the embedding
for k in tasknums:
    # Row indices of the observations that belong to task k.
    obs=N.where(taskinfo[:,0]==k)[0]
    t_obs=t[obs,:]
    mean_t_obs[k]=N.mean(t_obs,0)
    # Mean Euclidean distance (over the two embedding columns) to the centroid.
    t_eucdist[k]=N.mean(N.sqrt((t_obs[:,0]-mean_t_obs[k][0])**2 + (t_obs[:,1]-mean_t_obs[k][1])**2 ))
next = np.array([ float(n) for n in l[:-2].split(" ")[half:]]) prevsong = prev.reshape((96, 50)) nextsong = next.reshape((96, 50)) prevsong = np.sum(prevsong, axis=1) nextsong = np.sum(nextsong, axis=1) arr.append(np.array(list(prevsong) + list(nextsong))) ''' read in labels ''' with open('labels.txt', 'r') as l: labels = [word.rstrip() for word in l.readlines()] matr = np.array( [ np.array(entry) for entry in arr] ) ''' plot scatter ''' Y = tsne(matr) import matplotlib.pyplot as plt plt.scatter(Y[:, 0], Y[:, 1], 20) for label, x, y in zip(labels, Y[:, 0], Y[:, 1]): plt.annotate(label, xy=(x, y), xytext=(-10, 10), textcoords="offset points", bbox = dict(boxstyle='round', fc="yellow")) plt.savefig("test.ps", format='eps', dpi=1000) plt.show()
from blocks.model import Model
# Build and run the training loop.  `algorithm`, `train_dataset`, `cost` and
# `extensions` are defined outside this excerpt.
main_loop = MainLoop(
    algorithm=algorithm,
    data_stream=DataStream.default_stream(
        dataset=train_dataset,
        # Second argument is 1 -- presumably the batch size; confirm against
        # the SequentialScheme signature.
        iteration_scheme=SequentialScheme(train_dataset.num_instances(), 1)
    ),
    model=Model(cost),
    extensions=extensions
)
main_loop.run()
from tsne import *
import matplotlib.pyplot as plt
# Load a saved weight matrix from disk and embed its rows in 2-D.
# Positional arguments 50 and 20.0 are presumably initial_dims and
# perplexity -- confirm against the tsne signature.
W1 = numpy.load("layer1_20.npy")
Y = tsne(W1, 2, 50, 20.0)
fig, ax = plt.subplots()
ax.scatter(Y[:,0], Y[:,1])
# Caption point i with the i-th word; assumes the rows of W1 follow the order
# of train_dataset.bag_words -- confirm.
for i, word in enumerate(train_dataset.bag_words):
    x,y = Y[i]
    ax.annotate(word, (x,y))
plt.show()
# Set visibility of most, least and mid frequency hashtags by setting text size.
def get_tag_size_label(tlist):
    """Return per-tag text sizes and labels, aligned with ``tag_labels``.

    For every entry of the module-level ``tag_labels``: if it is in
    ``tlist`` it gets size 2 and its latin-1 decoded text as label,
    otherwise size 0 and an empty label (which hides it in the plot).

    Returns:
        ``(size, label)`` where ``size`` is a list and ``label`` an array.

    NOTE(review): ``t.decode`` requires byte strings; on Python 3 this only
    works if ``tag_labels`` holds ``bytes`` -- confirm.
    """
    size = []
    label = []
    for t in tag_labels:
        if t in tlist:
            size.append(2)
            label.append(t.decode('latin-1'))
        else:
            size.append(0)
            label.append('')
    return size, array(label)


# The embedding is computed once at import time and shared by every plot.
X = array(hist_feature)
Y = tsne(X, 2, 50, 30.0)


def save_embed_plot(sizes_and_labels, fname):
    """Save a scatter plot of the module-level embedding ``Y`` to ``fname``.

    Args:
        sizes_and_labels: a ``(tag_sizes, labels)`` pair, e.g. the return
            value of ``get_tag_size_label``.  It is still passed as a single
            positional argument; it is unpacked in the body because tuple
            parameters in a ``def`` are a syntax error on Python 3.
        fname: output path handed to ``fig.savefig``.
    """
    tag_sizes, labels = sizes_and_labels
    fig = Plot.figure()
    # Marker size 0: only the text annotations are visible.
    Plot.scatter(Y[:, 0], Y[:, 1], 0)
    for label, x, y, s in zip(labels, Y[:, 0], Y[:, 1], tag_sizes):
        Plot.annotate(label, xy=(x, y), xytext=(0, 0), textcoords='offset points', size=s)
    Plot.axis('off')
    fig.savefig(fname, dpi=800, bbox_inches='tight')


if __name__ == "__main__":
    # Parenthesised single-string form prints identically on Python 2 and 3.
    print("Run Y = tsne.tsne(X, no_dims, perplexity) to perform t-SNE on your dataset.")
    save_embed_plot(get_tag_size_label(most_freq), 'embed_tag_mostfreq.png')