def plot_TSNE(labels, features, num_class):
    """Embed ``features`` in 2-D with t-SNE and show a class-coloured scatter.

    Args:
        labels: Per-sample values used to colour the scatter points
            (presumably integer class indices -- confirm with the caller).
        features: Feature matrix handed to ``tsne``.
        num_class: Number of classes; sizes the discrete colormap and the
            colorbar ticks.

    The colorbar tick labels are the first ``num_class`` keys of the
    module-level ``label_modelnet`` mapping.
    """
    Y = tsne(features, no_dims=2, initial_dims=512, perplexity=20.0,
             max_iter=1000)

    fig = plt.figure()
    ax = fig.add_subplot(111)
    sct = ax.scatter(Y[:, 0], Y[:, 1], s=20, c=labels,
                     cmap=discrete_cmap(num_class))
    # Colour limits are a property of the mappable. Colorbar.set_clim no
    # longer exists in current matplotlib, so set them on the scatter
    # before the colorbar is built from it.
    sct.set_clim(-0.5, num_class - 0.5)
    cbar = plt.colorbar(sct, ticks=range(num_class))

    # One tick label per class, in the mapping's iteration order.
    tick_names = list(label_modelnet.keys())[:max(num_class, 0)]
    cbar.set_ticklabels(tick_names)

    plt.tight_layout()
    plt.show()
def runTsne():
    """Run t-SNE on up to 10 images from each sub-directory of ``base_dir``.

    Each image is converted to RGB, resized to 200x200 and flattened to one
    row. Points are coloured by the index of the directory they came from.
    Reads the module-level ``base_dir``; shows the plot and returns nothing.
    """
    size = 200
    per_class_limit = 10  # at most this many images per directory
    row_length = size * size * 3  # 3 for the colour channels
    # ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter.
    resample = getattr(Image, 'LANCZOS', None)
    if resample is None:
        resample = Image.ANTIALIAS

    labels = []
    pixel_rows = []
    class_idx = 0
    for class_name in os.listdir(base_dir):  # this is one row
        dirPath = os.path.join(base_dir, class_name)
        if not os.path.isdir(dirPath):
            # Stray files next to the class directories are ignored.
            continue
        for imgName in os.listdir(dirPath)[:per_class_limit]:  # the columns
            with Image.open(os.path.join(dirPath, imgName)) as img:
                # Force three channels so every row has the same length.
                resized = img.convert('RGB').resize((size, size), resample)
                pixel_rows.append(np.array(resized).reshape(-1))
            # Scatter needs numeric colour values, not directory names.
            labels.append(class_idx)
            print((len(pixel_rows), row_length))
        class_idx += 1

    # Build the matrix once instead of re-concatenating for every image.
    images = np.array(pixel_rows, dtype=np.float64).reshape(-1, row_length)
    labels = np.array(labels)

    print("Running Tsne on " + str(len(labels)) + " data points")
    print(images.shape)
    Y = tsne.tsne(images, 2, 50, 30)
    pylab.scatter(Y[:, 0], Y[:, 1], 20, labels)
    pylab.show()
def eval_all_pointcloud(sess, ops, num_votes=1, topk=1):
    """Collect the ``pc_maxpool`` feature of every test cloud and plot it with t-SNE.

    Args:
        sess: Session object whose ``run`` evaluates ``ops['loss']`` and
            ``ops['net']``.
        ops: Mapping holding the placeholders ``pointclouds_pl``,
            ``labels_pl``, ``is_training_pl`` and the tensors ``loss``/``net``.
        num_votes: Number of rotated copies evaluated per point cloud. One
            feature vector is recorded per vote.
        topk: Unused; kept for interface compatibility.

    Reads the module-level ``TEST_FILES`` and ``NUM_POINT``. Shows a scatter
    plot of the 2-D embedding with every tenth point annotated by its label.
    """
    is_training = False
    global_features = []
    labels = np.array([])

    for fn, test_file in enumerate(TEST_FILES):
        log_string('----' + str(fn) + '----')
        current_data, current_label = provider.loadDataFile(test_file)
        current_data = current_data[:, 0:NUM_POINT, :]
        # atleast_1d keeps the slicing below valid for single-sample files.
        current_label = np.atleast_1d(np.squeeze(current_label))
        labels = np.append(labels, current_label)
        print(labels)
        print(current_data.shape)
        file_size = current_data.shape[0]
        print(file_size)

        for pc_idx in range(file_size):
            for vote_idx in range(num_votes):
                rotated_data = provider.rotate_point_cloud_by_angle(
                    current_data[pc_idx:pc_idx + 1, :, :],
                    vote_idx / float(num_votes) * np.pi * 2)
                feed_dict = {
                    ops['pointclouds_pl']: rotated_data,
                    ops['labels_pl']: current_label[pc_idx:pc_idx + 1],
                    ops['is_training_pl']: is_training,
                }
                _, net_val = sess.run([ops['loss'], ops['net']],
                                      feed_dict=feed_dict)
                global_features.append(np.squeeze(net_val['pc_maxpool']))

    global_features = np.array(global_features)
    # One feature row exists per (cloud, vote); repeat each label so the two
    # arrays stay aligned when num_votes > 1 (no-op for the default of 1).
    labels = np.repeat(labels, num_votes)

    print("global_features ::  %s" % (global_features.shape,))
    print("labels ::  %s" % (labels.shape,))

    global_features = tsne.tsne(global_features, 2, global_features.shape[1])
    Plot.scatter(global_features[:, 0], global_features[:, 1], 30,
                 c=4 * labels, cmap='jet')
    for i, txt in enumerate(labels):
        if i % 10 == 0:
            Plot.annotate(txt, (global_features[i, 0], global_features[i, 1]))
    Plot.show()
def tsne(self):
    """Embed all stored vectors in 2-D with t-SNE and draw their text labels.

    Reads ``self.vecs`` (key -> list of vectors), ``self.labels`` (key -> list
    of label strings, parallel to ``self.vecs[key]``) and ``self.colors``
    (key -> text colour). Stores the embedding in ``self.t``.

    Returns:
        The ``plt`` module, after ``plt.show()`` has been called.
    """
    vecs = []
    for value in self.vecs.values():
        vecs += value
    vecs = np.array(vecs, dtype='float64')  # TSNE expects float type values

    # Arguments after the data: output dimensions (2 = 2-D), the third
    # argument (named ``initial_dims`` by keyword callers of this module),
    # and the perplexity. Perplexity modifies the repulsion between vectors:
    # a high value spreads nodes evenly, a low value groups them.
    self.t = tsne.tsne(vecs, 2, 2, 4)

    vec_group_start = 0
    # Same iteration order as above, so offsets line up with rows of self.t.
    for key, value in self.vecs.items():
        color = self.colors[key]
        for j in range(len(value)):
            index = vec_group_start + j
            label = self.labels[key][j]
            # Marker-less plot call; presumably kept so the axes autoscale
            # around the text labels -- confirm before removing.
            plt.plot(self.t[index][0], self.t[index][1])
            plt.text(self.t[index][0], self.t[index][1], label, color=color,
                     horizontalalignment='center')
        vec_group_start += len(value)

    plt.show()
    return plt
def main(args):
    """Compute (or reload) a 2-D t-SNE embedding and draw the figures.

    ``args`` supplies ``output_prefix``, ``load_tsne_output``, ``input_file``,
    ``transpose``, ``dict_file`` and ``separate_plot``. The embedding is
    cached as tab-separated ``label<TAB>x<TAB>y`` lines in
    ``<output_prefix>.txt``; a directory named ``output_prefix`` is created
    for the figures if it does not exist.
    """
    rc('font', **{'family': 'serif'})

    basedir = args.output_prefix
    if not os.path.exists(basedir):
        os.makedirs(basedir)
    output_txt_file = basedir + '.txt'

    reuse_cached = args.load_tsne_output and os.path.exists(output_txt_file)
    if reuse_cached:
        labels = []
        coords = []
        with open(output_txt_file, 'r') as cache:
            for row in cache.readlines():
                fields = row.strip().split('\t')
                labels.append(fields[0])
                coords.append([float(fields[1]), float(fields[2])])
        Y = np.asarray(coords)
    else:
        X = np.loadtxt(args.input_file)
        if args.transpose:
            X = X.transpose()
        Y = tsne(X, 2, 50, 20.0)
        labels = load_dict(args.dict_file)
        with open(output_txt_file, 'w') as cache:
            for name, point in zip(labels, Y):
                cache.write(name + '\t' + str(point[0]) + '\t'
                            + str(point[1]) + '\n')

    if not args.separate_plot:
        draw_figures_all(args, Y, labels, basedir)
    else:
        draw_figures_separate(args, Y, labels, basedir)
def get_samples(args):
    """Load samples (and optionally their 2-D projection) from pickle files.

    With ``args.load_proj`` set, ``args.load_input`` must hold one pickled
    ``(samples, proj_samples)`` pair. Otherwise ``args.file_in`` is read as a
    stream of pickled records until EOF or until ``args.max_in`` samples were
    collected; with ``args.load_lengths`` each record is a dict providing
    ``['states'][0][0]`` and ``['length']``.

    With ``args.plot`` set, a missing projection is computed with t-SNE and
    passed to ``plot_samples``.

    Returns:
        ``(samples, proj_samples, lengths)``; ``proj_samples`` is None when
        it was neither loaded nor computed, ``lengths`` is None when
        ``args.load_proj`` is set.
    """
    # NOTE(review): unpickling executes arbitrary code; only use on trusted
    # input files.
    proj_samples = None
    lengths = None
    if args.load_proj:
        with open(args.load_input, 'rb') as f:
            samples, proj_samples = cPickle.load(f)
        print('Loaded {} pickled samples from {}'.format(
            len(samples), args.load_input))
    else:
        samples = []
        lengths = []
        with open(args.file_in, 'rb') as f:
            unpickler = cPickle.Unpickler(f)
            while True:
                try:
                    saved = unpickler.load()
                except EOFError:
                    break
                if args.load_lengths:
                    sample = np.array(saved['states'][0][0], dtype=float)
                    samples.append(sample)
                    lengths.append(saved['length'])
                else:
                    samples.append(np.array(saved)[0])
                if args.max_in and len(samples) >= args.max_in:
                    break
        print('Unpickled {} samples from {}'.format(
            len(samples), args.file_in))

    if args.plot:
        # Explicit None test: truth-testing a loaded numpy array raises.
        if proj_samples is None:
            from tsne import tsne
            proj_samples = tsne(samples, max_iter=200)
        plot_samples(proj_samples)
    return samples, proj_samples, lengths
def run():
    """Plot a t-SNE embedding of the UiO FreeSurfer rows, coloured by target.

    Reads ``freesurfer.db`` from the working directory, normalises the data
    (except ``Age``) and, for each of ``Gender`` and ``Age``, shows a scatter
    plot. The embedding is cached in ``Y.txt``: it is computed only if that
    file does not exist and is reused otherwise.
    """
    # Setup connection to SQL database
    connection = sqlite3.connect('freesurfer.db')
    try:
        cursor = connection.cursor()
        # Extract pandas data frame from SQL query
        cursor.execute('SELECT * FROM FreeSurfer WHERE Center = \'UiO\' AND (Diagnosis = \'SZ\' OR Diagnosis = \'BD\' OR Diagnosis = \'HC\')')
        data = cursor.fetchall()
        data = to_data_frame(data, get_columns(cursor))
    finally:
        # Everything needed has been fetched; release the database handle.
        connection.close()

    data = normalize(data, exclude=['Age'])
    for target in ['Gender', 'Age']:
        # Run t-SNE procedure and visualize clusters
        X, _ = get_xy(data, target,
                      exclude=['Diagnosis', 'Age', 'Center', 'Gender'])
        labels = get_labels(data, target)
        X = X.T
        if not os.path.isfile('Y.txt'):
            Y = tsne.tsne(X, 2, 50, 20.0)
            save_csv('Y.txt', Y)
        else:
            Y = load_csv('Y.txt')
        label_colors = labels_to_colors(labels)
        pylab.scatter(x=Y[:, 0], y=Y[:, 1], s=20, c=label_colors)
        pylab.title(target)
        pylab.show()
def similarity(songs, keys, filter_func=lambda x: True):
    """Embed per-group song averages in 2-D and dump them as JSON.

    Args:
        songs: Song collection passed through to ``avg_by``.
        keys: Grouping key(s) passed to ``avg_by``; also used as the series
            name and in the output file name ``<keys>_similarities.json``.
        filter_func: Predicate forwarded to ``avg_by``.

    Each key of the mapping returned by ``avg_by`` must yield the song title
    and then the song URL when iterated.
    """
    avgs = avg_by(songs, keys, filter_func=filter_func)
    print(len(avgs))
    # Materialise the values: on Python 3 a dict view cannot be turned into
    # a 2-D array directly.
    vals = list(avgs.values())
    print(vals)
    Y = tsne.tsne(np.array(vals), 2, 50, 20.0)

    points = []
    for k, c in zip(avgs, Y):
        # iter()/next() works for tuples, lists and iterator keys alike.
        fields = iter(k)
        points.append({
            'x': c[0],
            'y': c[1],
            'song_title': next(fields),
            'song_url': next(fields),
        })

    series = {'name': keys, 'data': points}
    filename = "%s_similarities.json" % (keys)
    with open(filename, "w") as outfile:
        json.dump(series, outfile, indent=4)
def plot_clustering_2d(encodings, myCluster, output, **kw):
    """Save a 2-D scatter plot of clustered encodings to ``<output>.png``.

    Does nothing when ``myCluster`` equals 0. ``kw['sof']`` selects the
    orientation: ``'sample'`` uses ``encodings`` as given, anything else
    transposes it first. The first row and column of the encodings and the
    first column of ``myCluster`` are dropped (presumably headers/names --
    confirm with the caller).
    """
    if myCluster == 0:
        return

    table = np.array(encodings)
    if kw['sof'] != 'sample':
        table = table.T
    data = table[1:, 1:].astype(float)
    labels = np.array(myCluster)[0:, 1:].reshape(-1, )

    # NOTE(review): the PCA fallback only triggers if RuntimeWarning is
    # raised as an exception (i.e. warnings are configured as errors).
    try:
        Y = tsne.tsne(data, 2, 50, 20.0)
    except RuntimeWarning:
        Y = pca.pca(data, n_components=2)

    df = pd.DataFrame({'X': Y[:, 0], 'Y': Y[:, 1], 'L': labels})
    plt.figure(0)
    mySet = set(labels)
    if len(mySet) > 5:
        # Too many clusters for a legend: colour by label value instead.
        plt.scatter(Y[:, 0], Y[:, 1], 20, labels)
    else:
        for cluster in mySet:
            members = df.loc[df.loc[:, "L"] == cluster, :]
            plt.scatter(np.array(members.X), np.array(members.Y), 20,
                        label="Cluster_%s" % cluster)
        plt.legend(loc='best')
    plt.savefig('%s.png' % output)
    plt.close(0)
def display_data(word_vectors, words, target_words=None):
    """Plot a t-SNE embedding of selected word vectors to ``word_vectors.png``.

    Args:
        word_vectors: 2-D array, one row per entry of ``words``.
        words: Sequence of words, parallel to the rows of ``word_vectors``.
        target_words: Optional path to a file with one word per line; only
            the first 2000 lines are used and words missing from ``words``
            are skipped. Without it, up to 1000 random rows are plotted.
    """
    if target_words:
        with open(target_words) as handle:
            wanted = [line.strip().lower() for line in handle][:2000]
        # First occurrence wins, matching list.index(), without the
        # per-word linear scan.
        first_index = {}
        for position, word in enumerate(words):
            first_index.setdefault(word, position)
        rows = [first_index[word] for word in wanted if word in first_index]
    else:
        # Never ask for more rows than exist (sampling without replacement).
        sample_size = min(1000, len(word_vectors))
        rows = np.random.choice(len(word_vectors), size=sample_size,
                                replace=False)
    target_matrix = word_vectors[rows, :]

    reduced_matrix = tsne(target_matrix, 2)

    Plot.figure(figsize=(200, 200), dpi=100)
    # Symmetric limits from the largest magnitude, so points on the negative
    # side are not clipped.
    max_x = np.amax(np.abs(reduced_matrix[:, 0]))
    max_y = np.amax(np.abs(reduced_matrix[:, 1]))
    Plot.xlim((-max_x, max_x))
    Plot.ylim((-max_y, max_y))
    Plot.scatter(reduced_matrix[:, 0], reduced_matrix[:, 1], 20)

    for row_id, word_row in enumerate(rows):
        x = reduced_matrix[row_id, 0]
        y = reduced_matrix[row_id, 1]
        Plot.annotate(words[word_row], (x, y))
    Plot.savefig("word_vectors.png")
def visualizeWord(mod1, mod2, word, n):
    """Show how ``word`` moved between two embedding models.

    The word's old vector (from ``mod1``) and new vector (from ``mod2``) are
    embedded with t-SNE together with the ``n`` nearest ``mod2`` neighbours
    of each, and the result is passed to ``showEvolution``. Prints a message
    instead if the word is missing from either vocabulary.

    Both models must use the same vector dimensionality.
    """
    if not (word in mod1.vocab and word in mod2.vocab):
        print(word + " is not in the vocabulary.")
        return

    # find old emplacement of word
    pt1 = mod1[word]
    # find neighbours of that place
    label1 = [
        label for label, p in mod2.most_similar(positive=[pt1], topn=n + 1)
    ]
    # The old position is not tied to the word itself, so the word may show
    # up among its own neighbours; drop it (or the extra neighbour).
    if word in label1:
        label1.remove(word)
    else:
        label1 = label1[:-1]
    data1 = mod2[label1]

    # recent word and its neighbours
    pt2 = mod2[word]
    label2 = [label for label, p in mod2.most_similar(word, topn=n)]
    data2 = mod2[label2]

    # apply tsne on the two data before resplitting them; the vector size is
    # taken from the data instead of being fixed at 300.
    dims = pt1.shape[-1]
    res = tsne.tsne(np.concatenate(
        [pt1.reshape(1, dims), pt2.reshape(1, dims), data1, data2]),
        no_dims=2, initial_dims=dims)

    pt1 = res[0].reshape(1, 2)
    pt2 = res[1].reshape(1, 2)
    # Split by the number of neighbours actually returned, which can be
    # smaller than n for a small vocabulary.
    split = 2 + len(label1)
    data1 = res[2:split]
    data2 = res[split:]
    showEvolution(pt1, pt2, word, data1, data2, label1, label2)
def embed(words, matrix, classes, usermodel, fname):
    """Save a class-coloured t-SNE plot of ``matrix`` with word annotations.

    Args:
        words: Labels, one per row of ``matrix``; only the part before the
            first ``_`` is drawn, with ``::`` shown as a space.
        matrix: 2-D array of vectors to embed.
        classes: Class label per word; each class gets one rainbow colour
            and one legend entry.
        usermodel, fname: Combined into the output file name
            ``<root>data/images/tsneplots/<usermodel>_<fname>.png``.

    Uses the module-level ``root`` and ``font``.
    """
    perplexity = 5.0  # Should be smaller than the number of points!
    dimensionality = matrix.shape[1]
    embedding = tsne(matrix, 2, dimensionality, perplexity)
    sys.stderr.write('2-d embedding finished\n')

    class_set = list(set(classes))
    colors = plot.cm.rainbow(np.linspace(0, 1, len(class_set)))
    color_of = {label: colors[i] for i, label in enumerate(class_set)}

    seen = set()
    for word, class_label, x, y in zip(words, classes,
                                       embedding[:, 0], embedding[:, 1]):
        color = color_of[class_label]
        # Only the first point of each class carries a legend label.
        legend_label = class_label if class_label not in seen else ""
        plot.scatter(x, y, 20, marker='.', color=color, label=legend_label)
        seen.add(class_label)

        lemma = word.split('_')[0].replace('::', ' ')
        # Integer division keeps the Python 2 result on Python 3.
        mid = (len(lemma) // 2) * 10
        # TODO Should really think about how to adapt this variable to the
        # real plot size
        plot.annotate(lemma, xy=(x - mid, y), size='x-large', weight='bold',
                      fontproperties=font, color=color)

    # Booleans instead of 'off': current matplotlib treats the string as
    # truthy, which would leave the ticks switched on.
    plot.tick_params(axis='x', which='both', bottom=False, top=False,
                     labelbottom=False)
    plot.tick_params(axis='y', which='both', left=False, right=False,
                     labelleft=False)
    plot.legend(loc=4)
    plot.savefig(root + 'data/images/tsneplots/' + usermodel + '_' + fname
                 + '.png', dpi=150, bbox_inches='tight')
    plot.close()
def t_sne(activation_size, csv_file, nc_file):
    """Plot a t-SNE embedding of per-sequence averaged activations.

    Activations are parsed from ``csv_file`` and averaged per sequence;
    labels and sequence lengths come from ``nc_file``. All intermediate
    results are published as module-level globals.
    """
    global timestep_activations, seq_activations, timestep_tru_labels, sequence_lengths, seq_labels, Y

    # ------------------------------------------------------------------
    # t-SNE code
    #
    # tasks:
    #   V 1. write code to average activations of a sequence for seq
    #        representations
    #   V 2. run t-SNE on softmax activations
    #   V 3. create bogus nc files to use to extract last layer activations
    # other ideas:
    #   1. take last activation label for improved accuracy + better
    #      sequence representation?
    #   3. use mode of 2nd half of sequence for improved accuracy
    #   4. use rnnlib to classify entire sequences instead of currennt.
    # ------------------------------------------------------------------

    # activations
    timestep_activations = parse_csv(csv_file, activation_size)
    seq_activations = np.array(
        average_seq_activations(timestep_activations), dtype=np.float64)

    # labels
    timestep_tru_labels, sequence_lengths = get_labels_lengths_from_nc(nc_file)
    seq_labels = np.array(
        restore_seq_label_list(timestep_tru_labels, sequence_lengths))

    Y = tsne.tsne(seq_activations)
    Plot.scatter(Y[:, 0], Y[:, 1], 20, seq_labels)
    Plot.show()
def visualise_context(network, dataset, device, title=None):
    """Plot a low-dimensional representation of the context space.

    Args:
        network: Model exposing ``statistic_network``, ``eval()``,
            ``train()`` and a ``training`` flag (assumed to be a
            ``torch.nn.Module`` -- confirm).
        dataset: Iterable of batches with ``'dataset'`` and ``'label'``
            tensors.
        device: Device the batch tensors are moved to.
        title: Plot title; a default title is used when None.

    The network is put in eval mode for the forward passes and returned to
    the mode it was in before the call, even if an error occurs.
    """
    was_training = network.training
    try:
        with to.no_grad():
            network.eval()
            contexts = []
            labels = []
            for batch in dataset:
                # Extract means only, so the 0th element
                contexts.append(
                    network.statistic_network.forward(
                        batch['dataset'].to(device),
                        batch['label'].to(device))[0])
                labels.append(batch['label'])
            contexts = to.cat(contexts, dim=0)
            labels = to.cat(labels, dim=0)
            data = tsne.tsne(contexts.cpu().numpy(), no_dims=2,
                             initial_dims=contexts.shape[1], perplexity=30.0)

        if title is None:
            plt.title(
                "2D t-SNE Plot of Test Data Contexts\nafter 1000 Iterations (Numeric Labels, Fully Supervised, Zeros Removed)"
            )
        else:
            plt.title(title)
        plt.scatter(data[:, 0], data[:, 1],
                    c=labels.argmax(dim=1).cpu().numpy())
        plt.show()
    finally:
        # Restore the caller's mode rather than forcing training mode.
        network.train(was_training)
def run_all(tags, filepref):
    """Train a Word2Vec model on ``texts`` and write t-SNE output files.

    All outputs are named from ``filepref``: the saved model
    (``filepref + modelName``), ``_labeled.txt``, ``_data.json`` and
    ``_coords.tsv``. ``_scores.csv`` and ``_words.csv`` are read back after
    ``make_score_files`` has run.

    Returns:
        ``(Y, labels, datadict, model)``.
    """
    # min_count=3: a word must occur at least 3 times in the corpus, which
    # reduces noise. For the tag-limited variant use instead:
    #   gensim.models.Word2Vec(MySentences('books', tags=tags),
    #                          min_count=2, size=200, workers=2)
    # To reuse a saved model, load it with gensim.models.Word2Vec.load(path).
    corpus = DirOfPlainTextCorpus('texts')
    model = gensim.models.Word2Vec(corpus, min_count=3, workers=4)
    model.save(filepref + modelName)

    # does the text tagging and word replacement
    print(tags)
    labeled_path = filepref + '_labeled.txt'
    json_path = filepref + '_data.json'
    datadict = build_dict_write_file(model, labeled_path, json_path, tags)

    # tsne input files part
    make_score_files(model, datadict, filepref)

    # the actual tsne graph bit
    X = np.loadtxt(filepref + "_scores.csv")
    labels = np.genfromtxt(filepref + "_words.csv", dtype=str)
    Y = tsne.tsne(X, 2, 50, 20.0)  # see tsne.py in repo
    do_tsne_files(filepref + '_coords.tsv', Y, labels, datadict,
                  axis_off=True)

    return Y, labels, datadict, model
def visualize(wordEmbeddings,
              coords_path="/home/bhanu/workspace/RNTN/scripts/embeddings2d_phrase_vis.txt"):
    """Visualize a set of examples using t-SNE.

    Args:
        wordEmbeddings: Mapping from title string to embedding vector.
        coords_path: Text file receiving one ``title x y`` row per entry,
            with whitespace in titles replaced by ``_``. Defaults to the
            previously hard-coded location.

    Also renders ``./embeddings.png`` through the ``render`` module; an
    ``IOError`` while rendering is reported and does not stop the
    coordinates from being written.
    """
    PERPLEXITY = 30
    # Materialise keys and values once so they stay aligned and usable with
    # numpy on Python 3, where dict views are not sequences.
    titles = list(wordEmbeddings.keys())
    titlesStr = ["_".join(y.strip().split()) for y in titles]
    x = numpy.vstack(list(wordEmbeddings.values()))
    filename = "./embeddings.png"

    # Computed outside the try block: the coordinates are needed below even
    # when rendering fails.
    from tsne import tsne
    out = tsne(x, no_dims=2, perplexity=PERPLEXITY)

    try:
        import render
        render.render([(title, point[0], point[1])
                       for title, point in zip(titles, out)], filename)
    except IOError:
        print("ERROR visualizing %s" % filename)

    data = numpy.column_stack((titlesStr, out))
    numpy.savetxt(coords_path, data, "%s")
def display_data(word_vectors, words, target_words=None):
    """Plot a t-SNE embedding of selected word vectors to ``word_vectors.png``.

    Args:
        word_vectors: 2-D array, one row per entry of ``words``.
        words: Sequence of words, parallel to the rows of ``word_vectors``.
        target_words: Optional path to a file with one word per line; only
            the first 2000 lines are used and words missing from ``words``
            are skipped. Without it, up to 1000 random rows are plotted.
    """
    if target_words:
        with open(target_words) as handle:
            requested = [line.strip().lower() for line in handle][:2000]
        # Map each word to its first position (same answer as list.index)
        # so the lookup is not a linear scan per requested word.
        position_of = {}
        for position, word in enumerate(words):
            position_of.setdefault(word, position)
        rows = [position_of[word] for word in requested
                if word in position_of]
    else:
        # Sampling without replacement cannot exceed the number of rows.
        n_rows = len(word_vectors)
        rows = np.random.choice(n_rows, size=min(1000, n_rows),
                                replace=False)
    target_matrix = word_vectors[rows, :]

    reduced_matrix = tsne(target_matrix, 2)

    Plot.figure(figsize=(200, 200), dpi=100)
    # Use the largest absolute coordinate so the symmetric limits also
    # cover points far out on the negative side.
    max_x = np.amax(np.abs(reduced_matrix[:, 0]))
    max_y = np.amax(np.abs(reduced_matrix[:, 1]))
    Plot.xlim((-max_x, max_x))
    Plot.ylim((-max_y, max_y))
    Plot.scatter(reduced_matrix[:, 0], reduced_matrix[:, 1], 20)

    for row_id, word_row in enumerate(rows):
        x = reduced_matrix[row_id, 0]
        y = reduced_matrix[row_id, 1]
        Plot.annotate(words[word_row], (x, y))
    Plot.savefig("word_vectors.png")
def plot_with_tsne(vectors, words, color_coding=None,
                   outfile_name="tsne_solution"):
    """Project word vectors to 2-D with t-SNE, annotate, save and show them.

    Args:
        vectors: Word embeddings; converted to a numpy array if needed.
        words: Labels annotated at the projected positions, in row order.
        color_coding: Optional per-point colour values for the scatter.
        outfile_name: Output file name without the ``.png`` suffix.
    """
    # Make sure the vectors are a numpy array.
    points = vectors if isinstance(vectors, np.ndarray) else np.array(vectors)

    # NOTE(review): initial_dims is derived from the number of words, not
    # from the vector size -- confirm this is intended.
    Y = tsne.tsne(X=points, no_dims=2, initial_dims=int(len(words) / 2),
                  perplexity=5.0, max_iter=1000)
    xs = Y[:, 0]
    ys = Y[:, 1]

    # Scatter, coloured only when a colour coding was supplied.
    if color_coding is None:
        plt.scatter(xs, ys)
    else:
        plt.scatter(xs, ys, c=color_coding)

    # Write each word next to its point.
    for label, x, y in zip(words, xs, ys):
        plt.annotate(label, xy=(x, y), xytext=(0, 0),
                     textcoords='offset points', size=9)

    plt.savefig(outfile_name + ".png", format='png')
    plt.show()
def similarity(songs, keys, filter_func=lambda x: True):
    """Embed per-group song averages in 2-D and dump them as JSON.

    Args:
        songs: Song collection passed through to ``avg_by``.
        keys: Grouping key(s) passed to ``avg_by``; also used as the series
            name and in the output file name ``<keys>_similarities.json``.
        filter_func: Predicate forwarded to ``avg_by``.

    Each key of the mapping returned by ``avg_by`` must yield the song title
    and then the song URL when iterated.
    """
    avgs = avg_by(songs, keys, filter_func=filter_func)
    print(len(avgs))
    # A list, not a dict view: np.array() needs a real sequence on Python 3.
    vals = list(avgs.values())
    print(vals)
    Y = tsne.tsne(np.array(vals), 2, 50, 20.0)

    points = []
    for k, c in zip(avgs, Y):
        # The built-in next() replaces the Python-2-only .next() method and
        # also accepts tuple or list keys via iter().
        parts = iter(k)
        title = next(parts)
        url = next(parts)
        points.append({
            'x': c[0],
            'y': c[1],
            'song_title': title,
            'song_url': url,
        })

    series = {'name': keys, 'data': points}
    filename = "%s_similarities.json" % (keys)
    with open(filename, "w") as outfile:
        json.dump(series, outfile, indent=4)
def dimension_reduction(data, method='tsne', label=None, plot=False):
    """Standardise ``data`` and reduce it to two dimensions.

    Args:
        data: Feature matrix accepted by ``StandardScaler``.
        method: One of ``'tsne'`` (scikit-learn), ``'isomap'``, ``'MDS'`` or
            ``'tsne_v2'`` (the stand-alone ``tsne`` function).
        label: Per-row class labels with values 0, 1 and 2; required when
            ``plot`` is true.
        plot: When true, show a scatter plot with one marker style per
            class. The axis limits are taken from class 0 only.

    Returns:
        The 2-D embedding.

    Raises:
        ValueError: For an unknown ``method``, or when ``plot`` is requested
            without ``label``.
    """
    n_components = 2
    # Every method here is distance based, so the features are standardised
    # (zero mean, unit variance) first.
    scaler = StandardScaler().fit(data)
    data = scaler.transform(data)

    # t-SNE is the usual choice for showing high-dimensional data in 2-D;
    # the other methods compute the embedding X with a different mapping.
    if method == 'tsne':
        model = TSNE(n_components=n_components, perplexity=20,
                     early_exaggeration=20, method='exact', learning_rate=100,
                     n_iter=1000, random_state=250, verbose=2)
        X = model.fit_transform(data)
    elif method == 'isomap':
        model = Isomap(n_components=n_components, n_neighbors=20)
        X = model.fit_transform(data)
    elif method == 'MDS':
        model = MDS(n_components=n_components, verbose=2, n_init=1,
                    max_iter=500)
        X = model.fit_transform(data)
    elif method == 'tsne_v2':
        X = tsne(data, 2, 44, 50.0)
    else:
        # Previously an unknown method fell through with X undefined.
        raise ValueError("unknown dimension reduction method: %r" % (method,))

    print(len(X))  # number of embedded rows
    print(X)       # the 2-D embedding itself

    if plot:
        if label is None:
            raise ValueError("label is required when plot=True")
        # Boolean masks below need an array, not a plain list.
        label = np.asarray(label)
        fig, ax = plt.subplots()
        ax.scatter(X[label == 0, 0], X[label == 0, 1], c='darkblue',
                   alpha=0.25, marker='^')
        ax.scatter(X[label == 1, 0], X[label == 1, 1], c='darkred',
                   alpha=0.75, marker='x')
        ax.scatter(X[label == 2, 0], X[label == 2, 1], c='green',
                   alpha=0.25, marker='o')
        ax.set_xlim([np.min(X[label == 0, 0]), np.max(X[label == 0, 0])])
        ax.set_ylim([np.min(X[label == 0, 1]), np.max(X[label == 0, 1])])
        plt.show()
    return X
def data_visualize(data, triplet, all_info):
    """Show three t-SNE scatter plots, coloured by each triplet position.

    Args:
        data: Matrix embedded with t-SNE; assumed to hold one row per entry
            of ``triplet``, in the same order -- confirm with the caller.
        triplet: Sequence of 3-element records; the value at position 0, 1
            and 2 in turn decides a point's colour.
        all_info: Unused here; kept for interface compatibility.
    """
    print("Run tsne")
    Y = tsne.tsne(data, 2, 50, 20.0)

    for pos in range(3):
        print('draw pos %s' % (pos,))
        all_possible_label = set(tri[pos] for tri in triplet)
        colors = cm.rainbow(np.linspace(0, 1, len(all_possible_label)))
        for label_idx, c in zip(all_possible_label, colors):
            # One boolean mask replaces deleting non-matching rows one by
            # one, which copied the whole array on every deletion.
            matches = np.array([tri[pos] == label_idx for tri in triplet],
                               dtype=bool)
            selected = Y[:len(matches)][matches]
            plt.scatter(selected[:, 0], selected[:, 1], marker='o', color=c)
        plt.show()
def tsne_(y_train, title, pred, title_second):
    """Save two t-SNE scatter plots of ``x_train``: true labels and predictions.

    The embedding is computed once from the module-level ``x_train``. The
    first figure is coloured by ``y_train`` and saved under ``title``, the
    second by ``pred`` and saved under ``title_second``. The module-level
    ``N``, ``tick`` and ``target_names`` configure the colorbar.
    """
    def _draw(fig, colour_values, heading):
        # Fill the current figure with one coloured scatter and save it
        # under the heading.
        plt.scatter(Y[:, 0], Y[:, 1], 20, colour_values, edgecolors='face',
                    alpha=1, cmap=plt.cm.get_cmap('jet', N))
        cbar = plt.colorbar(ticks=tick)
        plt.clim(-0.5, N - 0.5)
        cbar.ax.set_yticklabels(target_names)  # vertically$
        plt.title(heading)
        plt.xlabel('t-SNE dimension - 1')
        plt.ylabel('t-SNE dimension - 2')
        fig.tight_layout()
        plt.savefig(heading)

    first_fig, _ = plt.subplots()
    Y = tsne.tsne(x_train, no_dims=2, initial_dims=784, perplexity=30.0)
    _draw(first_fig, y_train, title)

    second_fig, _ = plt.subplots()
    _draw(second_fig, pred, title_second)
def do_tsne(self):
    """
    This function does t-SNE on the positive, negative, and unknown data provided

    The three data sets are stacked (unknown, positive, negative), embedded
    into ``self.tsne_data`` and then split back into their attributes.
    With ``self.use_tsne_python`` the stand-alone ``tsne`` module is tried;
    a failure there is logged and leaves ``self.tsne_data`` unchanged.
    Otherwise scikit-learn's ``TSNE`` is used, optionally after a PCA
    reduction to ``self.pca_preprocess_red`` components.

    :return: None
    """
    all_data = np.vstack((self.data_points, self.positive_data,
                          self.negative_data))
    # This is to save memory and potentially prevent memory error
    del self.data_points
    del self.positive_data
    del self.negative_data

    if self.use_tsne_python:
        try:
            # Try importing and using the tsne_python implementation
            import tsne
            # Pass perplexity by keyword: the third positional slot is
            # presumably ``initial_dims`` in tsne_python's signature, so the
            # old positional call set the wrong parameter -- TODO confirm
            # against the installed tsne module.
            self.tsne_data = tsne.tsne(all_data, 2,
                                       perplexity=self.tsne_perplexity)
        except Exception:
            # Still best effort, but no longer silent.
            logger.exception("tsne_python failed; tsne_data was not updated.")
    else:
        tsne_input = all_data
        if self.pca_preprocess:
            # This is to reduce memory requirement
            logger.info("Pre-processing with PCA...")
            tsne_input = PCA(
                n_components=self.pca_preprocess_red).fit_transform(all_data)
        self.tsne_data = TSNE(perplexity=self.tsne_perplexity,
                              early_exaggeration=self.early_exaggeration,
                              random_state=self.tsne_seed,
                              init=self.tsne_init,
                              learning_rate=self.tsne_learning_rate,
                              verbose=True).fit_transform(tsne_input)

    logger.info("t-SNE complete.")
    logger.debug("Rearranging data...")
    # Split the stacked matrix back into its three original parts.
    chops = (self.num_points, self.num_positive, self.num_negative)
    self.data_points, self.positive_data, self.negative_data = basic.chop(
        all_data, chops)
    del all_data
def main():
    """Run t-SNE on 1000 sampled points and send the class-coloured plot.

    The plot is written to ``result.png`` and handed to
    ``elice_utils.send_image``. Labels are assumed to be digit classes
    0-9 (ten colours are prepared) -- confirm against ``sample_data``.
    """
    # load data
    print("Loading data...")
    X, labels = sample_data(1000)

    # run tsne
    print(
        "Run Y = tsne.tsne(X, no_dims, initial_dims, perplexity) to perform t-SNE on your dataset."
    )
    Y = tsne.tsne(X, 2, 50, 30.0)

    # plot the results
    handles = []
    names = []
    colors = cm.rainbow(Math.linspace(0, 1, 10))
    for i in sorted(set(labels)):
        # One boolean mask for both coordinates keeps x and y the same
        # shape, including when a class has a single member.
        members = (labels == i)
        handle = Plot.scatter(Y[members, 0], Y[members, 1], 20,
                              color=colors[int(i)])
        handles.append(handle)
        # Label each handle with its own class, so the legend stays correct
        # when some classes are absent from the sample.
        names.append(int(i))
    Plot.legend(handles, names, loc='center left', ncol=1, fontsize=8,
                bbox_to_anchor=(1, 0.5))
    Plot.savefig("result.png")
    elice_utils.send_image("result.png")
    return
def plot_with_labels(self, plot_only=100, title="Like2Vec meets TensorFlow",
                     filename='tsne.png', num_tsne_dims=2, perplexity=5.0,
                     verbose=False):
    """
    randomly chooses some of the users or items and plots them using tsne

    INPUT:
        plot_only : the number of users or items you would like plotted,
                    int; capped at the number of available embeddings
        title : the title of the plot generated, str
        filename : the name you would like the file saved under, str
        num_tsne_dims : number of dimensions, int; only the first two are
                        drawn
        perplexity : the perplexity used in generating tsne, recommended to
                     be between 5.0-50.0, double
        verbose : whether or not to print the progress of tsne
    OUTPUT:
        Your plot will be saved under the name and in the location you
        passed in filename
    """
    n_rows = self.final_embeddings.shape[0]
    # Sampling without replacement cannot return more rows than exist.
    n_selected = min(plot_only, n_rows)
    selected_rows = np.sort(
        np.random.choice(range(n_rows), n_selected, replace=False))
    labels = [self.labels[i] for i in selected_rows]

    low_dim_embs = tsne(self.final_embeddings[selected_rows], num_tsne_dims,
                        self.final_embeddings.shape[1], perplexity, verbose)
    # Explicit check instead of assert, which is stripped under -O.
    if low_dim_embs.shape[0] < len(labels):
        raise ValueError("More labels than embeddings")

    plt.figure(figsize=(18, 18))  # in inches
    for i, label in enumerate(labels):
        # Take the first two coordinates so num_tsne_dims > 2 still plots.
        x, y = low_dim_embs[i, :2]
        plt.scatter(x, y)
        plt.annotate(label, xy=(x, y), xytext=(5, 2),
                     textcoords='offset points', ha='right', va='bottom')
    plt.title(title)
    plt.savefig(filename)
def plot_words(V, labels=None, color='b', mark='o', fa='bottom'):
    """Scatter a 2-D t-SNE embedding of ``V`` and annotate it with labels.

    Args:
        V: Matrix of vectors to embed.
        labels: Optional labels, one per row; UTF-8 byte strings are decoded
            and text strings are used as they are. Without labels only the
            points are drawn.
        color: Marker colour.
        mark: Marker style.
        fa: Vertical alignment of the annotation text.
    """
    W = tsne(V, 2)
    plt.scatter(W[:, 0], W[:, 1], c=color, marker=mark, s=50.0)
    if labels is None:
        # The default used to crash in zip(); nothing to annotate.
        return
    for label, x, y in zip(labels, W[:, 0], W[:, 1]):
        # Only byte strings need decoding; text has no usable .decode() on
        # Python 3.
        text = label.decode('utf8') if isinstance(label, bytes) else label
        plt.annotate(text, xy=(x, y), xytext=(-1, 1),
                     textcoords='offset points', ha='center', va=fa,
                     bbox=dict(boxstyle='round,pad=0.1', fc='white', alpha=0))
def egfr_test():
    """Plot a t-SNE embedding of EGFR actives against up to 10000 decoys.

    SMILES strings are read from the two hard-coded DUD-E ``.ism`` files,
    featurised with ``dataset_from_mols`` (Morgan fingerprints) and embedded
    in 2-D. Actives are labelled 1, decoys 0.
    """
    active_smi = '/home/xtalpi/datasets/mol_data/dude/egfr/actives_final.ism'
    decoys_smi = '/home/xtalpi/datasets/mol_data/dude/egfr/decoys_final.ism'
    columns = ['smiles', 'id', 'chemblid']
    df_active = pd.read_csv(active_smi, sep=' ', names=columns)
    df_decoys = pd.read_csv(decoys_smi, sep=' ', names=columns)

    n2 = 10000
    active_smiles = df_active['smiles'].tolist()
    decoy_smiles = df_decoys['smiles'].tolist()[:n2]
    smiles_list = active_smiles + decoy_smiles
    # Count the decoys actually taken, so labels and molecules stay aligned
    # when the file holds fewer than n2 rows.
    labels = [1] * len(active_smiles) + [0] * len(decoy_smiles)
    print(len(smiles_list), len(labels))

    dataset = dataset_from_mols(smiles_list, featurizer_type='morgan',
                                transformer_type='morgan', batch_size=32)
    data = np.asarray(dataset.data[0], dtype=np.float32)
    Y = tsne.tsne(data, 2, 50, 20.)
    print(data.shape, Y.shape)
    pylab.scatter(Y[:, 0], Y[:, 1], 20, labels)
    pylab.show()
def dimension_reduction(data, index, method='tsne', label=None, plot=False):
    """Standardise ``data``, reduce it to 2-D and optionally report outliers.

    Args:
        data: Feature matrix accepted by ``StandardScaler``.
        index: Row indices checked for outliers when plotting.
        method: One of ``'tsne'`` (scikit-learn), ``'isomap'``, ``'MDS'`` or
            ``'tsne_v2'`` (the stand-alone ``tsne`` function).
        label: Per-row class labels with values 0, 1 and 2; required when
            ``plot`` is true.
        plot: When true, show the scatter plot, annotate every indexed point
            lying outside the open square (-20, 20) x (-20, 20) and write
            those points to ``output_Archive.csv``. Their names are looked
            up in the module-level ``name``.

    Returns:
        The 2-D embedding.

    Raises:
        ValueError: For an unknown ``method``, or when ``plot`` is requested
            without ``label``.
    """
    n_components = 2
    # Every method here is distance based, so the features are standardised
    # (zero mean, unit variance) first.
    scaler = StandardScaler().fit(data)
    data = scaler.transform(data)

    # t-SNE is the usual choice for showing high-dimensional data in 2-D;
    # the other methods compute the embedding X with a different mapping.
    if method == 'tsne':
        model = TSNE(n_components=n_components, perplexity=20,
                     early_exaggeration=20, method='exact', learning_rate=100,
                     n_iter=1000, random_state=250, verbose=2)
        X = model.fit_transform(data)
    elif method == 'isomap':
        model = Isomap(n_components=n_components, n_neighbors=20)
        X = model.fit_transform(data)
    elif method == 'MDS':
        model = MDS(n_components=n_components, verbose=2, n_init=1,
                    max_iter=500)
        X = model.fit_transform(data)
    elif method == 'tsne_v2':
        X = tsne(data, 2, 44, 50.0)
    else:
        # Previously an unknown method fell through with X undefined.
        raise ValueError("unknown dimension reduction method: %r" % (method,))

    print(len(X))  # number of embedded rows
    print(X)       # the 2-D embedding itself

    if plot:
        if label is None:
            raise ValueError("label is required when plot=True")
        # Boolean masks below need an array, not a plain list.
        label = np.asarray(label)
        fig, ax = plt.subplots()
        ax.scatter(X[label == 0, 0], X[label == 0, 1], c='darkblue',
                   alpha=0.25, marker='^')
        ax.scatter(X[label == 1, 0], X[label == 1, 1], c='darkred',
                   alpha=0.75, marker='x')
        ax.scatter(X[label == 2, 0], X[label == 2, 1], c='green',
                   alpha=0.25, marker='o')
        ax.set_xlim([np.min(X[label == 0, 0]), np.max(X[label == 0, 0])])
        ax.set_ylim([np.min(X[label == 0, 1]), np.max(X[label == 0, 1])])

        idxList = []
        nameList = []
        for ind in index:
            inside = (-20 < X[ind, 0] < 20) and (-20 < X[ind, 1] < 20)
            if not inside:
                print(ind)
                idxList.append(ind)
                nameList.append(name[ind])
                ax.annotate(str(ind), xy=(X[ind, 0], X[ind, 1]))
        print(idxList)
        print(nameList)
        plt.show()

        outPut = {'Index': idxList, 'Video_Name': nameList}
        print(outPut)
        output_Archive = pd.DataFrame(outPut)
        output_Archive.to_csv('output_Archive.csv')
    return X
def __init__(self, wrd_embedding_corpora):
    """Combine the corpora's embeddings and compute their joint t-SNE map.

    Args:
        wrd_embedding_corpora: Sequence of corpora; element ``[1]`` of each
            entry is read as its embedding matrix (element ``[0]`` is
            presumably the word list -- confirm with the caller).
    """
    self.wrd_embedding_corpora = wrd_embedding_corpora

    # The first corpus defines the embedding width for all of them.
    first_matrix = wrd_embedding_corpora[0][1]
    self.embedding_dims = first_matrix.shape[1]
    self.num_corpus = len(wrd_embedding_corpora)

    # The helpers read the attributes set above, so the order matters.
    combined = self._concat_wrd_embeddings()
    self.master_wrd_embedding = combined
    self.master_tsne_embedding = tsne(combined)
    self.tsne_embedding_corpora = self._flatten_tsne_embeddings()
def SBM_visual_tsne(labels, X):
    """Show a 2-D t-SNE scatter of ``X`` coloured by ``labels``.

    Returns:
        The 2-D embedding produced by ``tsne.tsne``.
    """
    # Imported here so the module loads without tsne/pylab installed.
    import tsne
    import pylab as Plot

    embedding = tsne.tsne(X, 2)

    Plot.figure()
    first_axis = embedding[:, 0]
    second_axis = embedding[:, 1]
    Plot.scatter(first_axis, second_axis, 20, labels)
    Plot.show()

    return embedding
def tsne_plot(self):
    """Show the first layer's weights as words placed on a 2-D t-SNE map.

    Each entry of ``self.vocab`` is drawn as text at the embedded position
    of the weight row with the same index.
    """
    weights = self.model.layers[0].weight.numpy()
    mapped_X = tsne.tsne(weights)

    plt.figure()
    for position, word in enumerate(self.vocab):
        plt.text(mapped_X[position, 0], mapped_X[position, 1], word)

    # Fit the axes to the embedding's extent.
    first_axis = mapped_X[:, 0]
    plt.xlim(first_axis.min(), first_axis.max())
    second_axis = mapped_X[:, 1]
    plt.ylim(second_axis.min(), second_axis.max())
    plt.show()
def plot_tsne(self):
    '''
    Plot the t-Distributed Stochastic Neighbor Embedding (t-SNE) distribution of the data

    The data is NaN-cleaned, mean-centred and standardised, then embedded
    with ``calc_tsne`` (fast version) or, if that fails, with the pure
    Python ``tsne``. If both fail the error is logged and nothing is drawn.
    Each class is drawn once per distinct opacity value; the highest
    opacity provides the legend entry.
    '''
    self.subplot.clear()
    self.data = np.nan_to_num(self.data)  # Eliminate NaNs
    centered = self.mean_center(self.data)
    standardized = self.standardization(centered)

    # Calculate t-SNE of the data and mask it (python t-SNE version if
    # Intel IPP is not installed). ``except Exception`` keeps the fallback
    # but lets KeyboardInterrupt/SystemExit through.
    try:
        from calc_tsne import calc_tsne
        U = calc_tsne(standardized, 2, 50, 20.0)
    except Exception:
        logging.warning('''Could not use fast t-SNE. You may need to install the Intel Integrated Performance Libraries. Will use normal t-SNE instead.''')
        try:
            from tsne import tsne
            U = tsne(standardized, 2, 50, 20.0)
        except Exception:
            logging.error('''Both t-SNE versions failed. Your dataset may be too large for t-SNE to handle. Will not plot t-SNE results.''')
            return

    self.Scores = U[:, 0:2]
    if self.class_masks is None or self.class_names is None:
        self.class_masks, self.class_names = self.create_class_masks()
    self.masked_X, self.masked_Y = self.mask_data(
        len(self.class_names), self.class_masks, self.Scores)

    # Plot the masked t-SNE results in the Scores canvas
    self.color_set = self.set_colormap(self.class_names)
    handles = []
    labels = []

    # Determine the different opacities for the objects. This is set to 1
    # if no opacities have been specified.
    if self.object_opacity is None:
        self.object_opacity = np.ones([self.masked_X.shape[0], 1])
        self.object_accuracies = False
    elif self.object_accuracies is None:
        self.object_accuracies = True
    opacities = np.unique(self.object_opacity)
    nOpacity = len(opacities)

    # For each class and opacity combination plot the corresponding objects
    for i in range(len(self.class_names)):
        cell_count = np.shape(np.nonzero(self.masked_X[:, i]))
        for j in range(nOpacity):
            showObjects = np.where(self.object_opacity == opacities[j])
            subHandle = self.subplot.scatter(
                self.masked_X[showObjects, i],
                self.masked_Y[showObjects, i],
                8, c=self.color_set[i, :],
                linewidth=0.25,  # numeric, not the string "0.25"
                alpha=0.25 + 0.75 * opacities[j])
            # The highest opacity objects are added to the legend
            if opacities[j] == np.max(opacities):
                handles.append(subHandle)
                labels.append(self.class_names[i] + ': '
                              + str(cell_count[1]))

    self.leg = self.subplot.legend(handles, labels, loc=4, fancybox=True,
                                   handlelength=1)
    self.leg.get_frame().set_alpha(0.25)
    self.subplot.axhline(0, -100000, 100000, c='k', lw=0.1)
    self.subplot.axvline(0, -100000, 100000, c='k', lw=0.1)
    self.figure.canvas.draw()
    self.motion_event_active = True
def load_training_annotation(filepath, verbose=False):
    """Embed pickled joint annotations with t-SNE and pickle the result.

    Args:
        filepath: Pickle file holding an ``(image_names, joints)`` pair.
            ``joints`` must convert to a 3-D array; the last two axes are
            flattened into one feature row per sample.
        verbose: Unused; kept for interface compatibility.

    At most the first 10000 rows are embedded. The embedding is written to
    ``filepath`` with its last four characters (presumably the ``.pkl``
    suffix) replaced by ``_embed.pkl``. Nothing is returned.
    """
    # NOTE(review): unpickling executes arbitrary code; only use on trusted
    # files.
    output_filepath = filepath[:-4] + '_embed.pkl'
    # ``with`` closes the input file, which was previously left open.
    with open(filepath, "rb") as source:
        image_names, joints = pickle.load(source)

    joints = np.array(joints)
    flat_joints = joints.reshape(
        (joints.shape[0], joints.shape[1] * joints.shape[2]))
    joints_embed = tsne(flat_joints[0:10000, :])

    with open(output_filepath, 'wb') as pf:
        pickle.dump(joints_embed, pf)
def tsne_plot(self):
    """Show the learned word embeddings as words on a 2-D t-SNE map.

    Each entry of ``self.vocab`` is drawn as text at the embedded position
    of the row of ``self.params.word_embedding_weights`` with the same
    index.
    """
    embedding = tsne.tsne(self.params.word_embedding_weights)

    pylab.figure()
    for row, word in enumerate(self.vocab):
        pylab.text(embedding[row, 0], embedding[row, 1], word)

    # Fit the axes to the embedding's extent.
    x_coords = embedding[:, 0]
    pylab.xlim(x_coords.min(), x_coords.max())
    y_coords = embedding[:, 1]
    pylab.ylim(y_coords.min(), y_coords.max())
    pylab.show()
def tsne_plot(self):
    """Draw the learned word embeddings as words on a 2-D t-SNE map.

    The figure is created and filled but not shown; the caller decides when
    to display or save it.
    """
    points = tsne.tsne(self.params.word_embedding_weights)

    pylab.figure()
    for idx, token in enumerate(self.vocab):
        pylab.text(points[idx, 0], points[idx, 1], token)

    # Limit both axes to the range covered by the embedding.
    horizontal = points[:, 0]
    pylab.xlim(horizontal.min(), horizontal.max())
    vertical = points[:, 1]
    pylab.ylim(vertical.min(), vertical.max())
def closest_k_points_tsne(embeddings, word, k):
    """Return 2-D t-SNE coordinates for ``word`` and its ``k`` nearest neighbours.

    Args:
        embeddings: Object providing ``nearest_neighbors(word, top_k=...)``
            (returning a list of words) and ``get(word)`` (returning a
            vector with ``tolist()``).
        word: Query word; it is placed last in the result.
        k: Number of neighbours to request.

    Returns:
        A list of dicts with keys ``'label'``, ``'x'`` and ``'y'``, one per
        neighbour followed by one for ``word``.
    """
    neighbours = embeddings.nearest_neighbors(word, top_k=k) + [word]
    # A real list: on Python 3 map() is lazy and np.array() would wrap the
    # iterator object instead of building the matrix.
    vectors = [embeddings.get(name).tolist() for name in neighbours]
    tsne_reps = tsne(np.array(vectors))
    return [
        {'label': name, 'x': tsne_reps[i][0], 'y': tsne_reps[i][1]}
        for i, name in enumerate(neighbours)
    ]
def tSNE_analysis(dp):
    """Show a t-SNE scatter of one 50-image training batch from ``dp``.

    The batch images are reshaped to 50 rows of 4096 values. The labels are
    mapped through ``0.5 + 0.5 * label`` before being used as colour values
    (presumably -1/+1 labels becoming 0/1 -- confirm with the data
    provider).
    """
    BATCHSIZE = 50
    batch = dp.get_train_batch(BATCHSIZE)

    flat_images = np.reshape(batch[0], [BATCHSIZE, 4096])
    colour_values = 0.5 + 0.5 * batch[1]

    Y = tsne.tsne(flat_images, 2, 20, 20.0)
    plt.scatter(Y[:, 0], Y[:, 1], c=colour_values, cmap=plt.get_cmap("brg"))
    plt.colorbar()
    plt.show()
def embed(words, matrix, usermodel):
    """Save an annotated 2-D t-SNE plot of ``matrix``.

    Each point is annotated with the part of its word before the first
    ``_``. The image goes to
    ``<root>static/tsneplots/<usermodel>_<md5>.png``, where the MD5 digest
    is taken over the words joined by ``_``. Uses the module-level ``root``
    and ``font``.
    """
    Y = tsne(matrix, 2, 300, 5.0)
    print('2-d embedding finished')

    xs = Y[:, 0]
    ys = Y[:, 1]
    Plot.scatter(xs, ys, 20, marker='.')
    for label, x, y in zip(words, xs, ys):
        lemma = label.split('_')[0]
        Plot.annotate(lemma, xy=(x - 20, y), size='x-large', weight='bold',
                      fontproperties=font)

    # The file name depends only on the word list.
    joined = '_'.join(words).encode('ascii', 'backslashreplace')
    fname = hashlib.md5(joined).hexdigest()
    Plot.savefig(root + 'static/tsneplots/' + usermodel + '_' + fname
                 + '.png', dpi=150, bbox_inches='tight')
    Plot.close()
def tsne_plot(self):
    """Save a t-SNE word map of the learned representations to disk.

    Words from ``self.vocab`` are written at the first two coordinates of the
    t-SNE mapping of ``self.params.word_embedding_weights`` and the figure is
    saved to ``../a1-writeup/Images/1.png`` rather than shown.
    """
    points = tsne.tsne(self.params.word_embedding_weights)
    horizontal, vertical = points[:, 0], points[:, 1]

    pylab.figure()
    for idx, word in enumerate(self.vocab):
        pylab.text(points[idx, 0], points[idx, 1], word)
    pylab.xlim(horizontal.min(), horizontal.max())
    pylab.ylim(vertical.min(), vertical.max())

    # TODO: change back to pylab.show()
    pylab.savefig('../a1-writeup/Images/1.png')
def feature_tsne(f_S, f_T):
    """Embed two feature sets jointly with t-SNE and return them separately.

    ``f_S`` and ``f_T`` are stacked along axis 0 and embedded in one t-SNE run
    (2 output dims, ``initial_dims`` equal to the column count of ``f_S``,
    perplexity 30).

    Returns:
        ``[fr_S, fr_T]``: the embedded rows belonging to ``f_S`` and to
        ``f_T`` respectively.
    """
    n_first = np.shape(f_S)[0]
    n_second = np.shape(f_T)[0]
    feature_dim = np.shape(f_S)[1]

    stacked = np.append(f_S, f_T, axis=0)
    embedded = tsne.tsne(X=stacked, no_dims=2,
                         initial_dims=feature_dim, perplexity=30.0)

    first_part = embedded[0:n_first, :]
    second_part = embedded[n_first:n_first + n_second, :]
    return [first_part, second_part]
def embed(words, matrix, usermodel):
    """Save an annotated 2-D t-SNE scatter plot of ``matrix`` as a PNG.

    t-SNE is run with perplexity 5 and ``initial_dims`` equal to the number
    of columns of ``matrix``.  Each point is labelled with the part of the
    matching entry of ``words`` before the first underscore.  The output file
    name combines ``usermodel`` with the MD5 hex digest of the joined words
    and is written under ``root + 'static/tsneplots/'``.
    """
    coords = tsne(matrix, 2, matrix.shape[1], 5.0)
    sys.stderr.write('2-d embedding finished\n')

    xs, ys = coords[:, 0], coords[:, 1]
    plot.scatter(xs, ys, 20, marker='.')
    # Distinct loop names keep the embedding array from being shadowed.
    for label, px, py in zip(words, xs, ys):
        plot.annotate(label.split('_')[0], xy=(px - 20, py), size='x-large',
                      weight='bold', fontproperties=font)

    digest = hashlib.md5()
    digest.update('_'.join(words).encode('ascii', 'backslashreplace'))
    target = (root + 'static/tsneplots/' + usermodel + '_'
              + digest.hexdigest() + '.png')
    plot.savefig(target, dpi=150, bbox_inches='tight')
    plot.close()
def plot_2d_classes(value):
    """Interactively scatter-plot a t-SNE embedding of the loaded HoG vectors.

    If fewer than 10 labels are loaded, ``load_hog(1)`` is called first and
    the function returns early when that does not report success (``1``).
    The user is then asked how many elements to plot; that many entries of
    ``hog_list`` are embedded and coloured by the index of their label among
    the sorted unique labels.

    Args:
        value: Unused; kept so existing callers keep working.
    """
    global labels
    if 10 > len(labels):
        print("Labels and HoG dont seem to have been loaded")
        print("Trying to load them from disk")
        if not load_hog(1) == 1:
            print("Could not load HoG, quitting")
            return

    nm_elements = int(raw_input('Plot this many elements (up to ' +
                                str(len(labels)) + ') : '))

    # One dict lookup per label instead of scanning every class per label.
    classes = np.unique(labels).tolist()
    class_index = dict((name, pos) for pos, name in enumerate(classes))
    new_labels = [class_index[single_label]
                  for single_label in labels[:nm_elements]
                  if single_label in class_index]

    y = tsne.tsne(np.array(hog_list[:nm_elements]))
    plot.scatter(y[:, 0], y[:, 1], 20, new_labels)
    plot.show()
def tsne_viz(
        mat=None,
        rownames=None,
        indices=None,
        colors=None,
        output_filename=None,
        figheight=40,
        figwidth=50,
        display_progress=False):
    """2d plot of mat using tsne, with the points labeled by rownames.

    Args:
        mat: Matrix passed to ``tsne``.
        rownames: Labels for the rows of ``mat``.
        indices: Optional indices into ``mat`` and ``rownames`` selecting the
            rows to display; all rows when None or empty.
        colors: Optional label colors, aligned with the displayed rows;
            all black when None or empty.
        output_filename: If given, save the image there instead of showing it.
        figheight: Figure height passed to ``set_figheight``.
        figwidth: Figure width passed to ``set_figwidth``.
        display_progress: If True, let ``tsne`` print its iteration info.
    """
    # Explicit None/length checks: `not x` raises on multi-element arrays.
    if colors is None or len(colors) == 0:
        colors = ['black' for _ in range(len(rownames))]

    if display_progress:
        tsnemat = tsne(mat)
    else:
        # Silence tsne's iteration info; always restore stdout and close
        # the devnull handle, even if tsne raises.
        saved_stdout = sys.stdout
        with open(os.devnull, 'w') as devnull:
            sys.stdout = devnull
            try:
                tsnemat = tsne(mat)
            finally:
                sys.stdout = saved_stdout

    # Plot coordinates:
    if indices is None or len(indices) == 0:
        indices = list(range(len(rownames)))
    vocab = np.array(rownames)[indices]
    xvals = tsnemat[indices, 0]
    yvals = tsnemat[indices, 1]

    # Plotting: honour the size parameters (previously hard-coded to the
    # same values as their defaults).
    fig, ax = plt.subplots(nrows=1, ncols=1)
    fig.set_figheight(figheight)
    fig.set_figwidth(figwidth)
    # Invisible plot call so the axes are scaled to the data.
    ax.plot(xvals, yvals, marker='', linestyle='')

    # Text labels:
    for word, x, y, color in zip(vocab, xvals, yvals, colors):
        ax.annotate(word, (x, y), fontsize=8, color=color)

    # Output:
    if output_filename:
        plt.savefig(output_filename, bbox_inches='tight')
    else:
        plt.show()
def plotTsne():
    """Scatter-plot a t-SNE embedding of words from five word-list files.

    Every line of each file is split on single spaces; a line contributes its
    second field as a word when its first field, read as a number, exceeds
    ``w2vThreshold``.  Each word keeps the colour of the first file it was
    seen in.  Words missing from the word2vec model are skipped.  The plot is
    saved to ``scatter.png``.
    """
    w2vThreshold = 2
    filenames = ['Haupt.txt', 'Super.txt', 'Kinder.txt', 'Bundes.txt',
                 'Finanz.txt']
    w2vPath = '../NLP2-Project2/models/mono_500_de.bin'
    dimensions = 500
    colours = ['#f02720', '#ff7f0f', '#32a251', '#1f77b4', '#ab6ad5']

    # Keep word and colour paired. The old code indexed a colour list by the
    # iteration position of a set, which does not line up with the words.
    colour_by_word = {}
    ordered_words = []
    for i, fname in enumerate(filenames):
        with codecs.open(fname, 'rb', encoding='utf-8') as f:
            for l in f:
                clean = l.strip().split(' ')
                if len(clean) < 2:
                    continue
                # assumes the first field is a numeric score/count; TODO confirm
                try:
                    score = float(clean[0])
                except ValueError:
                    continue
                # Compare numerically: str > int is always True on Python 2
                # and a TypeError on Python 3.
                if score > w2vThreshold and clean[1] not in colour_by_word:
                    colour_by_word[clean[1]] = colours[i]
                    ordered_words.append(clean[1])

    model = loadW2VModel(w2vPath)

    # Collect rows and stack once instead of regrowing the matrix per word.
    vectors = []
    labels = []
    for w in ordered_words:
        try:
            rep = model[w]
        except KeyError:
            continue
        vectors.append(rep)
        labels.append(colour_by_word[w])

    if vectors:
        X = Math.vstack(vectors)
    else:
        X = Math.empty((0, dimensions))

    Y = tsne(X, 2, dimensions, 20.0, max_iter=1000)
    pylab.scatter(Y[:, 0], Y[:, 1], 18, marker='o', c=labels,
                  edgecolor='None')
    pylab.savefig('scatter.png')
def plot(title, embeddings, labels, use_tsne):
    """Plot labelled 2-D projections of ``embeddings``.

    The projection is t-SNE when ``use_tsne`` is truthy, otherwise a
    2-component PCA.  When ``title`` is truthy the annotated figure is first
    saved as ``fig-<title>.eps`` with extra-small labels and cleared; the
    figure is then redrawn with default-size labels, shown and cleared.
    """
    if use_tsne:
        low_dim_embs = tsne.tsne(embeddings, 2, len(embeddings), 50.0)
    else:
        low_dim_embs = PCA(n_components=2).fit_transform(embeddings)

    def draw_points(**annotate_options):
        # One 'x' marker plus a text annotation per labelled point.
        for label, x, y in zip(labels, low_dim_embs[:, 0], low_dim_embs[:, 1]):
            plt.plot(x, y, 'x')
            plt.annotate(label, xy=(x, y), **annotate_options)

    if title:
        draw_points(fontsize='xx-small')
        eps_path = 'fig-%s.eps' % title
        plt.savefig(eps_path, format='eps', dpi=1200)
        plt.clf()

    draw_points()
    plt.show()
    plt.clf()
def visualize(wordEmbeddings):
    """Embed a set of examples with t-SNE and write the 2-D result to disk.

    Args:
        wordEmbeddings: Mapping from a title string to its embedding vector.

    The titles (whitespace runs replaced by ``_``) are stacked next to their
    2-D t-SNE coordinates and saved as text.  If t-SNE raises ``IOError`` an
    error line is printed and nothing is written.
    """
    PERPLEXITY = 30
    # Materialise the views: on Python 3, keys()/values() are not sequences.
    titles = list(wordEmbeddings.keys())
    titlesStr = ["_".join(y.strip().split()) for y in titles]
    x = numpy.vstack(list(wordEmbeddings.values()))

    filename = "embeddings.png"
    try:
        from tsne import tsne
        out = tsne(x, no_dims=2, perplexity=PERPLEXITY)
    except IOError:
        print("ERROR visualizing %s" % filename)
        # Without a result there is nothing to save; falling through used
        # to raise NameError on the undefined `out`.
        return

    data = numpy.column_stack((titlesStr, out))
    numpy.savetxt(
        "/home/bhanu/workspace/RNTN/data/results/embeddings2d_phrase_vis.txt",
        data, "%s")
def tsne_viz(
        mat=None,
        rownames=None,
        indices=None,
        colors=None,
        output_filename=None,
        figheight=40,
        figwidth=50,
        display_progress=False):
    """2d plot of mat using tsne, with the points labeled by rownames.

    Args:
        mat: Matrix passed to ``tsne``.
        rownames: Labels for the rows of ``mat``.
        indices: Optional indices into ``mat`` and ``rownames`` selecting the
            rows to display; all rows when None or empty.
        colors: Optional label colors, aligned with the displayed rows;
            all black when None or empty.
        output_filename: If given, save the image there instead of showing it.
        figheight: Currently ignored (see the note at the plotting step).
        figwidth: Currently ignored (see the note at the plotting step).
        display_progress: If True, let ``tsne`` print its iteration info.
    """
    # Explicit None/length checks: `not x` raises on multi-element arrays.
    if colors is None or len(colors) == 0:
        colors = ['black' for _ in range(len(mat))]

    if display_progress:
        tsnemat = tsne(mat)
    else:
        # Silence tsne's iteration info; always restore stdout and close
        # the devnull handle, even if tsne raises.
        saved_stdout = sys.stdout
        with open(os.devnull, 'w') as devnull:
            sys.stdout = devnull
            try:
                tsnemat = tsne(mat)
            finally:
                sys.stdout = saved_stdout

    # Plot coordinates:
    if indices is None or len(indices) == 0:
        indices = list(range(len(mat)))
    vocab = np.array(rownames)[indices]
    xvals = tsnemat[indices, 0]
    yvals = tsnemat[indices, 1]

    # Plotting:
    fig, ax = plt.subplots(nrows=1, ncols=1)
    # NOTE(review): the size is fixed at 100 x 500 and figheight/figwidth are
    # ignored; left as is because the parameter defaults (40 x 50) would
    # change the output of existing callers.
    fig.set_figheight(100)
    fig.set_figwidth(500)
    # Invisible plot call so the axes are scaled to the data.
    ax.plot(xvals, yvals, marker='', linestyle='')

    # Text labels:
    for word, x, y, color in zip(vocab, xvals, yvals, colors):
        ax.annotate(word, (x, y), fontsize=8, color=color)

    print("Output:")
    if output_filename:
        plt.savefig(output_filename, bbox_inches='tight')
    else:
        plt.show()
def get_area_centroids_2D():
    """Return lateralized area names and their t-SNE-determined 2D centroids.

    Returns:
        A tuple ``(areas_LR, centroids_2D)``: the names of ``AREAS`` suffixed
        with ``_L`` followed by the same names suffixed with ``_R``, and the
        symmetrically aligned 2D embedding of the left- then right-hemisphere
        centroids.
    """
    # Structure ids for every area.
    s_ids = [ONTO.structure_by_acronym(area).structure_id for area in AREAS]

    # Centroids per hemisphere, left block first.
    left_centroids = [
        ONTO.get_mask_from_id_left_hemisphere_nonzero(s_id).centroid
        for s_id in s_ids]
    right_centroids = [
        ONTO.get_mask_from_id_right_hemisphere_nonzero(s_id).centroid
        for s_id in s_ids]
    centroids = np.concatenate(
        [np.array(left_centroids), np.array(right_centroids)], 0)

    # Names in the same order as the centroid rows.
    areas_LR = [area + suffix for suffix in ('_L', '_R') for area in AREAS]

    # Embed with t-SNE, then align symmetrically.
    centroids_2D = sym_align(tsne.tsne(centroids, 2, max_iter=1000), areas_LR)
    return areas_LR, centroids_2D
def test_tsne():
    """Embed the global digit data with t-SNE and scatter-plot it by label.

    All rows of ``digits`` are used when ``options.all_data`` is set,
    otherwise only the first 250.  Points whose label equals 0..9 are drawn
    as ten separately coloured, legend-labelled groups.  The figure is shown
    only when ``options.show_graph`` is set.
    """
    global digits, labels
    nrows = digits.shape[0]
    # Smaller number of rows for debugging.
    if not options.all_data:
        nrows = 250

    X = digits[range(nrows), ...]
    L = labels[range(nrows), ...]
    Y = tsne.tsne(X, 2, 50, 20.0, use_pca=True, max_iter=1000)

    for digit in range(10):
        members = [pos for pos in range(len(L)) if L[pos] == digit]
        colour = Plot.get_cmap()(0.1 * digit)
        Plot.scatter(Y[members, 0], Y[members, 1], 20, colour,
                     label="%d" % digit)

    Plot.xticks([])
    Plot.yticks([])
    Plot.legend(loc='upper left', scatterpoints=1)
    if options.show_graph:
        Plot.show()
saver.save(sess, os.getcwd()+"/training/train",global_step=epoch) else: saver.restore(sess, tf.train.latest_checkpoint(os.getcwd()+"/training/")) rand = 50 x = train_data[rand:rand+64,:,:] y = train_labels[rand:rand+64,:] preds = sess.run([model.prediction], {data: x, target: y, dropout: 1})[0] labels = ["4Head","AMPEnergy","AMPEnergyCherry","ANELE","ArgieB8","ArsonNoSexy","AsianGlow","AthenaPMS","BabyRage","BatChest","BCouch","BCWarrior","BibleThump","BiersDerp","BigBrother","BionicBunion","BlargNaut","bleedPurple","BloodTrail","BORT","BrainSlug","BrokeBack","BudBlast","BuddhaBar","BudStar","ChefFrank","cmonBruh","CoolCat","CorgiDerp","CougarHunt","DAESuppy","DalLOVE","DansGame","DatSheffy","DBstyle","deExcite","deIlluminati","DendiFace","DogFace","DOOMGuy","DoritosChip","duDudu","EagleEye","EleGiggle","FailFish","FPSMarksman","FrankerZ","FreakinStinkin","FUNgineer","FunRun","FutureMan","FuzzyOtterOO","GingerPower","GrammarKing","HassaanChop","HassanChop","HeyGuys","HotPokket","HumbleLife","ItsBoshyTime","Jebaited","JKanStyle","JonCarnage","KAPOW","Kappa","KappaClaus","KappaPride","KappaRoss","KappaWealth","Keepo","KevinTurtle","Kippa","Kreygasm","Mau5","mcaT","MikeHogu","MingLee","MKXRaiden","MKXScorpion","MrDestructoid","MVGame","NinjaTroll","NomNom","NoNoSpot","NotATK","NotLikeThis","OhMyDog","OMGScoots","OneHand","OpieOP","OptimizePrime","OSfrog","OSkomodo","OSsloth","panicBasket","PanicVis","PartyTime","PazPazowitz","PeoplesChamp","PermaSmug","PeteZaroll","PeteZarollTie","PicoMause","PipeHype","PJSalt","PJSugar","PMSTwin","PogChamp","Poooound","PraiseIt","PRChase","PunchTrees","PuppeyFace","RaccAttack","RalpherZ","RedCoat","ResidentSleeper","riPepperonis","RitzMitz","RuleFive","SeemsGood","ShadyLulu","ShazBotstix","ShibeZ","SmoocherZ","SMOrc","SMSkull","SoBayed","SoonerLater","SriHead","SSSsss","StinkyCheese","StoneLightning","StrawBeary","SuperVinlin","SwiftRage","TBCheesePull","TBTacoLeft","TBTacoRight","TF2John","TheRinger","TheTarFu","TheThing","ThunBeast"
,"TinyFace","TooSpicy","TriHard","TTours","twitchRaid","TwitchRPG","UleetBackup","UncleNox","UnSane","VaultBoy","VoHiYo","Volcania","WholeWheat","WinWaker","WTRuck","WutFace","YouWHY"] labels2 = [x for pair in zip(labels,labels) for x in pair] embeds = sess.run([model.embeddingvar])[0] print np.shape(embeds) embeds = np.repeat(embeds.T,2,axis=0) print np.shape(embeds) Y = tsne.tsne(embeds, 2, 200, 20.0); print np.shape(Y) # # Plot.scatter(Y[:,0], Y[:,1], 20); # # Plot.show(); plt.scatter( Y[:, 0], Y[:, 1], marker = 'o') # for label, x, y in zip(labels2, Y[:, 0], Y[:, 1]): plt.annotate( label, xy = (x, y), xytext = (-20, 20), textcoords = 'offset points', ha = 'right', va = 'bottom', bbox = dict(boxstyle = 'round,pad=0.5', fc = 'yellow', alpha = 0.5), arrowprops = dict(arrowstyle = '->', connectionstyle = 'arc3,rad=0')) plt.show()
# coding: utf-8
# Usage: <script> DOC_MATRIX_PICKLE OUTPUT_BASENAME
# Loads a pickled document matrix, embeds it in 2-D with t-SNE and saves a
# black scatter plot to OUTPUT_BASENAME.pdf.
import tsne
import cPickle as pkl
import numpy as np
import sys
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

# Document matrix pickle object.
# Pickles are binary data, so open in 'rb'; text mode can corrupt them.
# NOTE: unpickling executes arbitrary code; only load trusted files.
with open(sys.argv[1], 'rb') as doc_mat_pkl:
    vectors = pkl.load(doc_mat_pkl)

Y = tsne.tsne(vectors.astype(np.float64), no_dims=2, perplexity=5)

# Output name for the pdf, sans the filetype
pp = PdfPages(sys.argv[2] + '.pdf')
fig = plt.figure(figsize=(16, 12))
plt.scatter(Y[:, 0], Y[:, 1], c='k')
pp.savefig()
pp.close()
plt.close()
# Smoke run: embed a random matrix with t-SNE, printing progress.
import numpy as np
from tsne import tsne

# 1000 rows x 30 columns of random values from np.random.rand.
X = np.random.rand(1000,30)
Y = tsne(X, verbose = True)
import numpy as Math
import pylab as Plot
import tsne as visualize
from PIL import Image

# Functions in tsne taken from http://lvdmaaten.github.io/tsne/

# Load in labels and vectors for corresponding labels
vector_matrix = Math.loadtxt("vectors.txt")
# Close the label file explicitly instead of leaking the handle.
with open("labels.txt") as label_file:
    labels = [line.strip() for line in label_file]
# Row of the first occurrence of each label.
rows = [labels.index(word) for word in labels]
target_matrix = vector_matrix[rows, :]

# Run the t-SNE reducing to 2 dimensions
reduced_matrix = visualize.tsne(vector_matrix, 2)

# Plot the figure
Plot.figure(figsize=(20, 20), dpi=10)
# Per-column maxima, computed once; the limits are made symmetric around 0.
column_max = Math.amax(reduced_matrix, axis=0)
max_x = column_max[0]
max_y = column_max[1]
Plot.xlim((-max_x, max_x))
Plot.ylim((-max_y, max_y))

Plot.scatter(reduced_matrix[:, 0], reduced_matrix[:, 1], 20)

# Add labels
# NOTE(review): the visible loop only computes the word and its coordinates;
# whatever consumes them is not part of this snippet.
for row_id in range(0, len(rows)):
    target_word = labels[rows[row_id]]
    x = reduced_matrix[row_id, 0]
    y = reduced_matrix[row_id, 1]
# Take apart the bands and time intervals: for each of the four offsets
# 0, 14, 28, 42 copy a 6-wide slice of every subject's data into coh.
for i,val in enumerate(np.arange(0,56,14)):
    for j in np.arange(0,no_sbj,1):
        coh[i,:,:,j] = data[j,val:val+6,:]

# Differentiate the data in time (first difference along axis 0)
diff_coh = np.diff(coh,1,0)
diff_coh_res = np.reshape(diff_coh,[18,36*no_sbj],'F').T
# NOTE(review): unlike diff_coh_res this reshape uses the default (C) order;
# confirm that the difference is intended.
coh_res = np.reshape(coh,[24,36*no_sbj]).T

# t-SNE on dataset
[mapped,C] = tsne.tsne(diff_coh_res, no_dims, init_dims, perplexity)
#labels = kmeans2(mapped,no_clust,10)
colors = cm.jet(np.linspace(0, 1, no_clstr))

plt.figure(0)
plt.scatter(mapped[:,0],mapped[:,1],c = colors[ident[:,0],:], s = 150,alpha = 0.7)
# Call show(); the bare attribute reference `plt.show` did nothing.
plt.show()

#for i in np.arange(no_sbj):
#    plt.figure(i)
#    plt.matshow(diff_coh_res[i*36:i*36 + 36,:])
#    plt.colorbar()
#    plt.text(0.5,36.5,r'$\delta$',fontsize=25)
#    plt.text(3.5,36.5,r'$\theta$',fontsize=25)
# For each of the four offsets 0, 14, 28, 42 copy a 6-wide slice of the
# averaged data (transposed) into coh.
for i,val in enumerate(np.arange(0,56,14)):
    coh[i,:,:] = average[:,val:val+6].T

# Differentiate the data in time (first difference along axis 0)
diff_coh = np.diff(coh,1,0)
diff_coh_res = np.reshape(diff_coh,[18,no_pairs],'F').T
coh_res = np.reshape(coh,[24,no_pairs],'F').T

silhouette = np.zeros([no_perm,no_pairs])

for i in np.arange(no_perm):
    print('Permutation number:' + str(i))
    C_min = 1e10
    dif_perm = diff_coh_res[np.random.permutation(no_pairs)]
    #coh_perm = diff_coh_res

    # Keep the t-SNE map with the lowest error over no_map runs.
    for j in np.arange(no_map):
        [mapped,C] = tsne.tsne(dif_perm, no_dims, init_dims, perplexity)
        if C < C_min:
            win_map = mapped #a map with the lowest error
            C_min = C

    # Score k-means clusterings of the winning map for each k.
    for k in np.arange(2,no_pairs):
        kmeans_obj = KMeans(k)
        kmeans_obj.fit(win_map)
        labels = kmeans_obj.labels_
        silhouette[i,k] = crit(win_map,labels)

#plt.figure
#plt.plot(np.mean(silhouette.T,1),color = 'red')
#plt.show

# Call figure(); the bare attribute reference `plt.figure` did nothing.
plt.figure()
plt.plot(silhouette.T,color = 'red')
# -*- coding: utf-8 -*- # <nbformat>3.0</nbformat> # <codecell> %pylab import tsne as tsne # <codecell> X = np.loadtxt("mnist2500_X.txt"); labels = np.loadtxt("mnist2500_labels.txt"); # <codecell> print X.shape print labels.shape print np.min(X), np.max(X) # <codecell> tsne.tsne(
from __future__ import unicode_literals import numpy as Math import pylab as Plot import argparse import tsne if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument('-l', action='store', dest='labelfile', required=True, help='Path of label file') parser.add_argument('-v', action='store', dest='vectorFile', required=True, help='Embedding vector') parser.add_argument('-d', action='store', type=int, dest='demension', required=True, help='Demension of vector') parser.add_argument('-p', action='store', type=int, dest='perplexity', default=20, help='Perplexity, usually between 20 to 50') r = parser.parse_args() X = Math.loadtxt(r.vectorFile) with open(r.labelfile, 'r') as f: labels = f.read().upper().splitlines() Y = tsne.tsne(X, 2, r.demension, r.perplexity) fig, ax = Plot.subplots() ax.scatter(Y[:, 0], Y[:, 1], 20) for i, txt in enumerate(labels): ax.annotate(txt, (Y[:, 0][i], Y[:, 1][i])) Plot.show()
def tsne(fdarray, new_label = 'tsne', channels = None, transform = 'arcsinh',
         sample = 6000, verbose = False, backgate = True):
    """Run t-SNE/viSNE over one or more FlowData objects and store the result.

    Args:
        fdarray: A FlowData object or a collection of them (normalised with
            ``util.make_list``).
        new_label: Prefix of the two channels written back; the coordinates
            are stored under ``new_label + '1'`` and ``new_label + '2'``.
        channels: Channels to embed.  When None, the channels common to the
            ``isotopes`` of every dataset are used.
        transform: ``'arcsinh'`` applies ``arcsinh(5 * x)`` to the copied
            data; any other value leaves it untransformed.
        sample: Maximum number of points drawn (without replacement) from
            each dataset; smaller datasets are used in full.
        verbose: Forwarded to ``lib_tsne.tsne``.
        backgate: When True, every point that was not sampled receives the
            t-SNE coordinates of its nearest sampled neighbour; otherwise
            its coordinates stay NaN.
    """
    fdarray = util.make_list(fdarray)

    # Default channel set: intersection of the isotope channels of all files.
    if channels is None:
        channels = list(set.intersection(
            *[set(fd.isotopes) for fd in fdarray]))

    # Private (n_points x n_channels) copy of the selected channels per file.
    points = [np.vstack([fd[ch] for ch in channels]).T for fd in fdarray]

    if transform == 'arcsinh':
        for pts in points:
            # Second argument is the output array: transform the copy in place.
            np.arcsinh(5*pts, pts)

    # Row indices to embed: a random subsample, or every row for small files.
    sample_masks = []
    for pts in points:
        n_rows = pts.shape[0]
        if sample < n_rows:
            chosen = np.random.choice(n_rows, sample, replace = False)
        else:
            chosen = np.array(range(n_rows))
        sample_masks.append(chosen)

    # One large matrix holding the sampled rows of every file, in file order.
    X = np.vstack([pts[mask,:] for mask, pts in zip(sample_masks, points)])

    # Perform t-SNE
    Y = lib_tsne.tsne(X, verbose = verbose)
    assert Y is not None, ('t-SNE failed to return')

    # Cut Y back into one piece per file at the cumulative sample counts.
    splits = np.cumsum(
        np.array([mask.shape[0] for mask in sample_masks], dtype = int))
    Y_split = np.split(Y, splits, axis = 0)

    # Scatter the embedded rows into full-size arrays; unsampled rows are NaN.
    tsne_coords = []
    for pts, mask, Yspt in zip(points, sample_masks, Y_split):
        Z = np.zeros((pts.shape[0], 2))*float('NaN')
        Z[mask,:] = Yspt
        tsne_coords.append(Z)

    # If a point didn't get sampled, place its t-SNE coordinates at its
    # nearest neighbor among the sampled points.
    if backgate:
        kd = KDTree(X)
        for j, (pts, coords) in enumerate(zip(points, tsne_coords)):
            # Rows still holding NaN were not assigned values by t-SNE.
            nan_points = np.argwhere(np.isnan(coords[:,0]))
            d, near = kd.query(pts[nan_points], 1)
            # `near` indexes rows of X, which correspond to rows of Y.
            coords[nan_points, :] = Y[near,:]
            tsne_coords[j] = coords

    # Add the coordinates to each FlowData structure.
    for fd, coords in zip(fdarray, tsne_coords):
        fd[new_label+'1'] = coords[:,0]
        fd[new_label+'2'] = coords[:,1]