def visualize():
    training_data, validation_data, test_data = mnist_loader.load_data_wrapper()
    # Unzipping gives tuples, but we want arrays of values.
    training_input = [x.transpose()[0] for x in zip(*training_data)[0]]
    test_input = [x.transpose()[0] for x in zip(*test_data)[0]]
    # Get the y values.
    test_target = [y for y in zip(*test_data)[1]]
    # Apply SVD to the training input.
    u, s, v = np.linalg.svd(training_input, full_matrices=False)
    print u.shape
    print s.shape
    print v.shape
    print "Generating embeddings..."
    #print v[0]
    print v[0].shape
    embeddings = [np.dot(test_inp, np.transpose(v[:10][:])) for test_inp in test_input]
    print embeddings[0].shape
    # Do dimensionality reduction into 2 dimensions.
    print "Performing dimensionality reduction using t-sne..."
    tsne = TSNE()
    reduced_vecs = tsne.fit_transform(embeddings)
    print reduced_vecs[0]
    # Graph all of the points, where points corresponding to the same digit will have the same color.
    colors = ['r', 'b', 'g', 'c', 'm', 'k', 'y', (.2, .2, .2), (.4, 0, .5), (.8, .2, 0)]
    red_patch = mpatches.Patch(color='red', label='1')
    patches = [mpatches.Patch(color=colors[i], label='%i' % i) for i in range(len(colors))]
    plt.legend(handles=patches)
    for i in range(len(reduced_vecs)):
        plt.plot([reduced_vecs[i][0]], [reduced_vecs[i][1]], 'o', color=colors[test_target[i]])
    plt.show()
def add_tsne_features(x_train, x_test):
    print('add_tsne_features <<')

    x_train_data = x_train.data_
    x_test_data = x_test.data_
    x = np.vstack((x_train_data, x_test_data))

    print('applying pca...')
    pca = PCA(n_components=25)
    x_pca = pca.fit_transform(x)

    print('applying t-SNE...')
    tsne_model = TSNE(n_components=2, random_state=0)
    x_tsne = tsne_model.fit_transform(x_pca)

    x_train_data = np.hstack((x_train_data, x_tsne[:x_train_data.shape[0], :]))
    x_test_data = np.hstack((x_test_data, x_tsne[-x_test_data.shape[0]:, :]))

    assert(x_train.columns_ == x_test.columns_)
    columns = x_train.columns_ + ['tsne_1', 'tsne_2']
    x_train = DataSet(x_train.ids_, columns, x_train_data)
    x_test = DataSet(x_test.ids_, columns, x_test_data)

    print('add_tsne_features >>')
    return x_train, x_test
def sendTSNE(self, people):
    d = self.getData()
    if d is None:
        return
    else:
        (X, y) = d

    X_pca = PCA(n_components=50).fit_transform(X, X)
    tsne = TSNE(n_components=2, init='random', random_state=0)
    X_r = tsne.fit_transform(X_pca)

    yVals = list(np.unique(y))
    colors = cm.rainbow(np.linspace(0, 1, len(yVals)))

    # print(yVals)

    plt.figure()
    for c, i in zip(colors, yVals):
        name = "Unknown" if i == -1 else people[i]
        plt.scatter(X_r[y == i, 0], X_r[y == i, 1], c=c, label=name)
    plt.legend()

    imgdata = StringIO.StringIO()
    plt.savefig(imgdata, format='png')
    imgdata.seek(0)

    content = 'data:image/png;base64,' + \
        urllib.quote(base64.b64encode(imgdata.buf))
    msg = {
        "type": "TSNE_DATA",
        "content": content
    }
    self.sendMessage(json.dumps(msg))
def display_closestwords_tsnescatterplot(model, word):
    arr = np.empty((0, 300), dtype='f')
    word_labels = [word]

    # get close words
    close_words = model.similar_by_word(word)

    # add the vector for each of the closest words to the array
    arr = np.append(arr, np.array([model[word]]), axis=0)
    for wrd_score in close_words:
        wrd_vector = model[wrd_score[0]]
        word_labels.append(wrd_score[0])
        arr = np.append(arr, np.array([wrd_vector]), axis=0)

    # find tsne coords for 2 dimensions
    tsne = TSNE(n_components=2, random_state=0)
    np.set_printoptions(suppress=True)
    Y = tsne.fit_transform(arr)

    x_coords = Y[:, 0]
    y_coords = Y[:, 1]

    # display scatter plot
    plt.scatter(x_coords, y_coords)
    for label, x, y in zip(word_labels, x_coords, y_coords):
        plt.annotate(label, xy=(x, y), xytext=(0, 0), textcoords='offset points')
    plt.xlim(x_coords.min() + 0.00005, x_coords.max() + 0.00005)
    plt.ylim(y_coords.min() + 0.00005, y_coords.max() + 0.00005)
    plt.show()
def make_tsne_plot(model, rel_wds, plot_lims, title):
    dim = 30
    X, keys = make_data_matrix(model)

    # first we actually do PCA to reduce the
    # dimensionality to make tSNE easier to calculate
    X_std = StandardScaler().fit_transform(X)
    sklearn_pca = PCA(n_components=2)
    X = sklearn_pca.fit_transform(X_std)[:, :dim]

    # do downsample
    k = 5000
    sample = []
    important_words = []
    r_wds = [word[0] for word in rel_wds]
    for i, key in enumerate(keys):
        if key in r_wds:
            sample.append(i)
    sample = np.concatenate((np.array(sample),
                             np.random.choice(len(keys), k - 10, replace=False),
                             ))
    X = X[sample, :]
    keys = [keys[i] for i in sample]

    # Do tSNE
    tsne = TSNE(n_components=2, random_state=0, metric="cosine")
    X_transf = tsne.fit_transform(X)

    k_means = KMeans(n_clusters=8)
    labels = k_means.fit_predict(X_transf)

    scatter_plot(X_transf[:, 0], X_transf[:, 1], rel_wds, labels, title, keys, plot_lims)
def plot_features(subject, data_path, model_path, test_labels, dataset='test'):
    with open(model_path + '/' + subject + '.pickle', 'rb') as f:
        state_dict = cPickle.load(f)
    cnn = ConvNet(state_dict['params'])
    cnn.set_weights(state_dict['weights'])
    scalers = state_dict['scalers']

    if dataset == 'test':
        d = load_test_data(data_path, subject)
        x = d['x']
        y = test_labels['preictal']
    elif dataset == 'train':
        d = load_train_data(data_path, subject)
        x, y = d['x'], d['y']
    else:
        raise ValueError('dataset')

    x, _ = scale_across_time(x, x_test=None, scalers=scalers) if state_dict['params']['scale_time'] \
        else scale_across_features(x, x_test=None, scalers=scalers)

    cnn.batch_size.set_value(x.shape[0])
    get_features = theano.function([cnn.x, Param(cnn.training_mode, default=0)],
                                   cnn.feature_extractor.output,
                                   allow_input_downcast=True)
    logits_test = get_features(x)

    model = TSNE(n_components=2, random_state=0)
    z = model.fit_transform(np.float64(logits_test))
    plt.scatter(z[:, 0], z[:, 1], s=60, c=y)
    plt.show()
def plot_data(data, has_label=True):
    import numpy as np
    import seaborn as sns
    from sklearn.manifold import TSNE
    from sklearn.decomposition import PCA

    if not has_label:
        data = data.copy()
        data['label'] = np.zeros([len(data), 1])

    LIMIT = 4000
    if data.shape[0] > LIMIT:
        dt = data.sample(n=LIMIT, replace=False)
        X = dt.ix[:, :-1]
        labels = dt.ix[:, -1]
    else:
        X = data.ix[:, :-1]
        labels = data.ix[:, -1]

    tsne_model = TSNE(n_components=2, random_state=0)
    np.set_printoptions(suppress=True)
    points1 = tsne_model.fit_transform(X)
    df1 = pd.DataFrame(data=np.column_stack([points1, labels]), columns=["x", "y", "class"])
    sns.lmplot("x", "y", data=df1, hue='class', fit_reg=False,
               palette=sns.color_palette('colorblind'))
    sns.plt.title('TSNE')

    pca = PCA(n_components=2)
    pca.fit(X)
    points2 = pca.transform(X)
    df2 = pd.DataFrame(data=np.column_stack([points2, labels]), columns=["x", "y", "class"])
    sns.lmplot("x", "y", data=df2, hue='class', fit_reg=False,
               palette=sns.color_palette('colorblind'))
    sns.plt.title('PCA')
def plot_mfi(self, outputfile='embeddings.pdf', nb_clusters=8, weights='NA'):
    # collect embeddings for mfi:
    X = np.asarray([self.w2v_model[w] for w in self.mfi
                    if w in self.w2v_model], dtype='float32')
    # dimension reduction:
    tsne = TSNE(n_components=2)
    coor = tsne.fit_transform(X)  # unsparsify

    plt.clf()
    sns.set_style('dark')
    sns.plt.rcParams['axes.linewidth'] = 0.4
    fig, ax1 = sns.plt.subplots()

    labels = self.mfi
    # first plot slices:
    x1, x2 = coor[:, 0], coor[:, 1]
    ax1.scatter(x1, x2, 100, edgecolors='none', facecolors='none')
    # clustering on top (add some colouring):
    clustering = AgglomerativeClustering(linkage='ward',
                                         affinity='euclidean',
                                         n_clusters=nb_clusters)
    clustering.fit(coor)
    # add names:
    for x, y, name, cluster_label in zip(x1, x2, labels, clustering.labels_):
        ax1.text(x, y, name, ha='center', va="center",
                 color=plt.cm.spectral(cluster_label / 10.),
                 fontdict={'family': 'Arial', 'size': 8})
    # control aesthetics:
    ax1.set_xlabel('')
    ax1.set_ylabel('')
    ax1.set_xticklabels([])
    ax1.set_xticks([])
    ax1.set_yticklabels([])
    ax1.set_yticks([])
    sns.plt.savefig(outputfile, bbox_inches=0)
def perform_AE(X, dim=2, tsne=False):
    y = np.zeros(shape=X.shape[0], dtype=int)

    if tsne:
        hidden_layers = [X.shape[1], 500, 100, 32]
        encoder_weights, decoder_weights = pretrain(X, hidden_layers)
        X_32d = ae(X, encoder_weights, decoder_weights, hidden_layers)
        ae_tsne = TSNE(n_components=dim, learning_rate=800, verbose=1)
        X_2d = ae_tsne.fit_transform(X_32d)
        method = 'ae_tsne_scaled'
    ### END - if tsne
    else:
        hidden_layers = [X.shape[1], 500, 100, 20, dim]
        encoder_weights, decoder_weights = pretrain(X, hidden_layers)
        X_2d = ae(X, encoder_weights, decoder_weights, hidden_layers)
        method = 'ae_scaled'
    ### END - else

    print('***** ' + method + ' *****')
    cluster(X_2d, method)
    np.save("{0}_{1}_X_2d".format(species, method), X_2d)
def plot_phonemes(path):
    phoneme_embeddings = dict()
    for line in codecs.open(path, "r"):
        line = line.split(",")
        key = line[0][1:-1]
        emb = line[1:]
        emb[-1] = emb[-1][:-1]
        emb = np.array([float(e) for e in emb])
        phoneme_embeddings[key] = emb

    phoneme_embeddings = DataFrame(phoneme_embeddings, columns=phoneme_embeddings.keys())
    print(phoneme_embeddings.columns)

    m = TSNE()
    phoneme_embeddings_tsne = m.fit_transform(phoneme_embeddings.transpose())
    print(len(phoneme_embeddings_tsne))

    for p, emb in zip(phoneme_embeddings.columns, phoneme_embeddings_tsne):
        c = "black"
        if regex.search("^[aeiou3E][*]?$", p):
            c = "red"
            plt.annotate(p, (emb[0], emb[1]), color=c)
        if regex.search("^.*w~$", p):
            c = "blue"
            plt.annotate(p, (emb[0], emb[1]), color=c)
        if regex.search("^.*y~$", p):
            c = "yellow"
            plt.annotate(p, (emb[0], emb[1]), color=c)
        if regex.search("^.*h~$", p):
            c = "brown"
            plt.annotate(p, (emb[0], emb[1]), color=c)
        if regex.search("^.*\"$", p):
            c = "green"
            plt.annotate(p, (emb[0], emb[1]), color=c)
def t_sne_view(norm_table, subj_cond, cohorts, image_type):
    # t-SNE analysis: Use stochastic neighbor embedding to reduce dimensionality of
    # data set to two dimensions in a non-linear, distance dependent fashion

    # Perform PCA data reduction if dimensionality of feature space is large:
    if len(norm_table.columns) > 12:
        pca = PCA(n_components=12)
        pca.fit(norm_table.as_matrix())
        raw_data = pca.transform(norm_table.as_matrix())
    else:
        raw_data = norm_table.as_matrix()

    # Transform data into a two-dimensional embedded space:
    tsne = TSNE(n_components=2, perplexity=40.0, early_exaggeration=2.0,
                learning_rate=100.0, init='pca')
    tsne_data = tsne.fit_transform(raw_data)

    # Prepare for normalization and view:
    cols = ['t-SNE', 'Cluster Visualization']
    tsne_table = pd.DataFrame(tsne_data, index=norm_table.index, columns=cols)

    # The output is no longer centered or normalized, so shift & scale it before display:
    tsne_avg = ppmi.data_stats(tsne_table, subj_cond, cohorts)
    tsne_norm_table = ppmi.normalize_table(tsne_table, tsne_avg)

    # Send out to graphics rendering engine:
    if (image_type == 'Gauss'):
        return scg.scatter_gauss(tsne_norm_table[cols[0]], tsne_norm_table[cols[1]], subj_cond)
    elif (image_type == 'Scatter'):
        return scg.scatter_plain(tsne_norm_table[cols[0]], tsne_norm_table[cols[1]], subj_cond)
def main():
    embedding = WordEmbedding(embeddingpath(default_embeddingconfig))

    for old, new in spelling_changes:
        print(old, '--', new)
        print(embedding.nearest_words([old]))
        print()
    print()

    war, ist = tense_changes[0]
    tensediff = embedding[ist] - embedding[war]
    for past, present in tense_changes[1:]:
        print(past, '+ tensediff:', *embedding.nearest_words([embedding[past] + tensediff]))
        print('Should be:', present)
        print()

    # word_diffs = [embedding[new] - embedding[old] for (old, new) in word_changes]
    spelling_diffs = [embedding[new] - embedding[old] for (old, new) in spelling_changes[10:20]]
    tense_diffs = [embedding[present] - embedding[past] for (past, present) in tense_changes]

    def metric(u, v):
        return max(distance.cosine(u, v), 0)

    while True:
        try:
            model = TSNE(n_components=2, metric=metric)
            reduced = model.fit_transform(spelling_diffs + tense_diffs)
            print(reduced)
            return
        except Exception:
            pass
def vizualize_clusters(X, y, py, hist=None):
    """
    Using T-SNE to visualize the site clusters.
    Plot and save the scatter (and the histogram).
    """
    model = TSNE(n_components=2, random_state=0)
    fig = model.fit_transform(X, y)
    fig1 = model.fit_transform(X, py)

    pyplot.figure(figsize=(16, 8))
    pyplot.subplot(121)
    classes = list(set(y))
    for c, color in zip(classes, plt.colors.cnames.iteritems()):
        indeces = [i for i, p in enumerate(y) if p == c]
        pyplot.scatter(fig[indeces, 0], fig[indeces, 1], marker="o", c=color[0])

    pyplot.subplot(122)
    clusters = list(set(py))
    for c, color in zip(clusters, plt.colors.cnames.iteritems()):
        indeces = [i for i, p in enumerate(py) if p == c]
        pyplot.scatter(fig1[indeces, 0], fig1[indeces, 1], marker="o", c=color[0])
    # pyplot.show()
    pyplot.savefig("clusters" + "_scatter.png")

    if hist is not None:
        pyplot.figure(figsize=(4, 4))
        pyplot.xticks(clusters)
        pyplot.bar(clusters, hist, align="center")
        # pyplot.show()
        pyplot.savefig("clusters" + "_hist.png")
def plot_mean_activation_and_stuff(some_probs, Y, do_tsne=False):
    pyplot.clf()
    probs = numpy.float32(some_probs)
    xv = numpy.arange(probs.shape[1])  # probs.var(axis=0)
    yv = probs.mean(axis=0)
    pyplot.axis([-0.1, probs.shape[1], 0, 1])
    for k in range(probs.shape[1]):
        pyplot.plot(xv[k] * numpy.ones(probs.shape[0]), probs[:, k], 'o', ms=4.,
                    markeredgecolor=(1, 0, 0, 0.01),
                    markerfacecolor=(1, 0, 0, 0.01),)
    pyplot.plot(xv, yv, 'bo')
    pyplot.show(block=False)
    if do_video:
        pyplot.savefig(video.stdin, format='jpeg')
        video.stdin.flush()
    pyplot.savefig('epoch_probs.png')

    if not do_tsne:
        return
    try:
        from sklearn.manifold import TSNE
        tsne = TSNE(random_state=0)
        ps = tsne.fit_transform(numpy.float64(probs[:400]))
        pyplot.clf()
        Y = numpy.int32(Y)[:400]
        for i, c, s in zip(range(10),
                           list('bgrcmyk') + [(.4, .3, .9), (.9, .4, .3), (.3, .9, .4)],
                           'ov' * 5):
            sub = ps[Y == i]
            pyplot.plot(sub[:, 0], sub[:, 1], s, color=c, ms=3, mec=c)
        pyplot.show(block=False)
        pyplot.savefig('probs_embed.png')
    except ImportError:
        print "cant do tsne"
def visualize_latent_rep(args, model, x_latent):
    print("pca_on=%r pca_comp=%d tsne_comp=%d tsne_perplexity=%f tsne_lr=%f" % (
        args.use_pca,
        args.pca_components,
        args.tsne_components,
        args.tsne_perplexity,
        args.tsne_lr
    ))

    if args.use_pca:
        pca = PCA(n_components=args.pca_components)
        x_latent = pca.fit_transform(x_latent)

    figure(figsize=(6, 6))
    scatter(x_latent[:, 0], x_latent[:, 1], marker='.')
    show()

    tsne = TSNE(n_components=args.tsne_components,
                perplexity=args.tsne_perplexity,
                learning_rate=args.tsne_lr,
                n_iter=args.tsne_iterations,
                verbose=4)
    x_latent_proj = tsne.fit_transform(x_latent)
    del x_latent

    figure(figsize=(6, 6))
    scatter(x_latent_proj[:, 0], x_latent_proj[:, 1], marker='.')
    show()
def infer(FLAGS):
    """
    Inference.
    """
    # Retrieve embeddings for docs
    words = ["tennis", "wimbledon", "icecream", "cake", "bear", "pie"]

    # Get index in doc embeddings
    with open(os.path.join(basedir, FLAGS.data_dir, "doc_to_idx.json"), 'r') as f:
        doc_to_idx = json.load(f)

    # Load the trained model
    model = torch.load(os.path.join(basedir, FLAGS.data_dir, "model.pt"))
    doc_embeddings = model.doc_embeddings.weight.data
    my_embeddings = np.array(
        [doc_embeddings[doc_to_idx[word]].numpy() for word in words])

    # Use TSNE model to reduce dimensionality
    model = TSNE(n_components=2, random_state=0)
    points = model.fit_transform(my_embeddings)

    # Visualize
    for i, word in enumerate(words):
        x, y = points[i, 0] * 1e4, points[i, 1] * 1e4
        plt.scatter(x, y)
        plt.annotate(word, xy=(x, y), xytext=(25, 5),
                     textcoords='offset points',
                     ha='right', va='bottom')
    plt.show()
def labtest_TSNE(PID):
    data = [patients[pid]['tests'] for pid in PID]
    X = pp.scale(data)
    tsne = TSNE(n_components=2, perplexity=30.0, learning_rate=1000.0, n_iter=1000,
                n_iter_without_progress=30, min_grad_norm=1e-07, angle=0.5)
    pos = tsne.fit(X).embedding_
    return pos
def project_in_2D(distance_mat, method='mds'):
    """
    Project SDRs onto a 2D space using manifold learning algorithms
    :param distance_mat: A square matrix with pairwise distances
    :param method: Select method from 'mds' and 'tSNE'
    :return: an array with dimension (numSDRs, 2). It contains the 2D projections
             of each SDR
    """
    seed = np.random.RandomState(seed=3)

    if method == 'mds':
        mds = MDS(n_components=2, max_iter=3000, eps=1e-9,
                  random_state=seed,
                  dissimilarity="precomputed", n_jobs=1)
        pos = mds.fit(distance_mat).embedding_

        nmds = MDS(n_components=2, metric=False, max_iter=3000, eps=1e-12,
                   dissimilarity="precomputed", random_state=seed, n_jobs=1,
                   n_init=1)
        pos = nmds.fit_transform(distance_mat, init=pos)
    elif method == 'tSNE':
        tsne = TSNE(n_components=2, init='pca', random_state=0)
        pos = tsne.fit_transform(distance_mat)
    else:
        raise NotImplementedError
    return pos
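A minimal usage sketch for project_in_2D above, assuming the MDS/TSNE imports from sklearn.manifold used by the snippet are in scope; the random input data is purely illustrative.

import numpy as np

points = np.random.rand(100, 16)                        # 100 hypothetical SDR-like vectors
diff = points[:, None, :] - points[None, :, :]
distance_mat = np.sqrt((diff ** 2).sum(axis=-1))        # square, symmetric pairwise-distance matrix
pos_mds = project_in_2D(distance_mat, method='mds')     # (100, 2) MDS projection
pos_tsne = project_in_2D(distance_mat, method='tSNE')   # (100, 2) t-SNE projection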
def dim_survey(X, entry_id):
    # convert to numpy
    X = np.array(X)

    # run the reduction.
    X_pca = PCA(n_components=3).fit_transform(X)
    X_tsne = TSNE(n_components=3).fit_transform(X)
    X_ica = FastICA(n_components=3).fit_transform(X)

    # connect to db.
    with mongoctx() as db:
        # update the stuff.
        db['entry'].update(
            {
                '_id': ObjectId(entry_id)
            },
            {
                '$set': {
                    'pca': X_pca.tolist(),
                    'tsne': X_tsne.tolist(),
                    'ica': X_ica.tolist(),
                }
            }
        )
def topic_dimen_reduce(words, word2vec):
    dictionary, matrix = terms_analysis.get_words_matrix(words, word2vec)
    pca = PCA(n_components=50)
    pca_matrix = pca.fit_transform(matrix)
    tsne = TSNE(n_components=2)
    t_matrix = tsne.fit_transform(pca_matrix)
    return dictionary, t_matrix
def main():
    Xtrain, Ytrain, Xtest, Ytest = getKaggleMNIST()

    dbn = DBN([1000, 750, 500], UnsupervisedModel=AutoEncoder)
    # dbn = DBN([1000, 750, 500, 10])
    output = dbn.fit(Xtrain, pretrain_epochs=2)
    print "output.shape", output.shape

    # sample before using t-SNE because it requires lots of RAM
    sample_size = 600
    tsne = TSNE()
    reduced = tsne.fit_transform(output[:sample_size])
    plt.scatter(reduced[:, 0], reduced[:, 1], s=100, c=Ytrain[:sample_size], alpha=0.5)
    plt.title("t-SNE visualization")
    plt.show()

    # t-SNE on raw data
    reduced = tsne.fit_transform(Xtrain[:sample_size])
    plt.scatter(reduced[:, 0], reduced[:, 1], s=100, c=Ytrain[:sample_size], alpha=0.5)
    plt.title("t-SNE visualization")
    plt.show()

    pca = PCA()
    reduced = pca.fit_transform(output)
    plt.scatter(reduced[:, 0], reduced[:, 1], s=100, c=Ytrain, alpha=0.5)
    plt.title("PCA visualization")
    plt.show()
def apply_tSNE30(proj_data, proj_weights=None):
    model = TSNE(n_components=2, perplexity=30.0, metric="euclidean",
                 learning_rate=100, early_exaggeration=4.0,
                 random_state=RANDOM_SEED)
    norm_data = normalize_columns(proj_data)
    result = model.fit_transform(norm_data.T)
    return result
def tsnePlot(plotname, modelName, word, dest):
    """Plots a tsne graph of words most similar to the word passed in the argument
    (as represented in the model previously calculated)"""
    model = word2vec.Word2Vec.load(modelName)
    words = [model.most_similar(word)[i][0] for i in range(0, len(model.most_similar(word)))]
    words.append(word)

    # nested list containing 100 dimensional word vectors of each most-similar word
    word_vectors = [model[word] for word in words]
    word_vectors = np.array(word_vectors)

    tsne_model = TSNE(n_components=2, random_state=0)
    Y = tsne_model.fit_transform(word_vectors)

    sb.plt.plot(Y[:, 0], Y[:, 1], 'o')
    for word, x, y in zip(words, Y[:, 0], Y[:, 1]):
        sb.plt.annotate(word, (x, y), size=12)
        #sb.plt.pause(10)

    plotname = plotname + ".png"
    if not os.path.exists(dest):
        os.makedirs(dest)
    path = os.path.join(dest, plotname)
    sb.plt.savefig(path)
def plotTSNEDecisionBoundaries():
    tsne = TSNE()
    tsne_data = tsne.fit_transform(feature_set)
    x_min, x_max = tsne_data[:, 0].min() - 1, tsne_data[:, 0].max() + 1
    y_min, y_max = tsne_data[:, 1].min() - 1, tsne_data[:, 1].max() + 1
    step_size = 2.0
    xx, yy = np.meshgrid(np.arange(x_min, x_max, step_size),
                         np.arange(y_min, y_max, step_size))

    for index, classifier in enumerate(classifiers):
        plt.subplot(2, 3, index + 1)
        plt.subplots_adjust(wspace=0.5, hspace=0.5)

        classifier.fit(tsne_data, class_labels)
        Z = classifier.predict(zip(xx.ravel(), yy.ravel()))
        Z = Z.reshape(xx.shape)

        plt.contourf(xx, yy, Z, cmap=plt.cm.Paired, alpha=0.7)
        plt.scatter(tsne_data[:, 0], tsne_data[:, 1], c=class_labels,
                    cmap=plt.cm.rainbow, alpha=0.6)
        plt.xlabel("Feature 1")
        plt.ylabel("Feature 2")
        plt.xlim(x_min, x_max)
        plt.ylim(y_min, y_max)
        plt.xticks(())
        plt.yticks(())
        plt.title(classifier_names[index])

    plt.show()
def tsne_plot(model):
    """Creates a TSNE model and plots it."""
    labels = []
    tokens = []

    for word in model.wv.vocab:
        tokens.append(model[word])
        labels.append(word)

    tsne_model = TSNE(perplexity=40, n_components=2, init='pca', n_iter=2500, random_state=23)
    new_values = tsne_model.fit_transform(tokens)

    x = []
    y = []
    for value in new_values:
        x.append(value[0])
        y.append(value[1])

    plt.figure(figsize=(16, 16))
    for i in range(len(x)):
        plt.scatter(x[i], y[i])
        plt.annotate(labels[i],
                     xy=(x[i], y[i]),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')
    plt.show()
def plotly_js_viz(word_2_vec_model):
    tsne_model = TSNE(n_components=2, random_state=5)
    data = tsne_model.fit_transform(word_2_vec_model.syn0)
    xd = list(data[:, 0])
    yd = list(data[:, 1])
    names_our = word_2_vec_model.index2word
    plot([Scatter(x=xd, y=yd, mode="markers", text=names_our)])
def visualization(result, word_dict):
    tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
    plot_only = 500
    low_dim_embs = tsne.fit_transform(result[0:500])
    labels = [word_dict[i] for i in range(500)]
    plot_with_labels(low_dim_embs, labels)
def reduce_dimentionality(self):
    self.vectors = []
    for key in self.selected_words:
        self.vectors.append(self.model[key])

    tnse_model = TSNE(n_components=2, random_state=0)
    np.set_printoptions(suppress=True)
    self.reduced_vectors = tnse_model.fit_transform(self.vectors)
def PlotTSNE(data, labels):  # Takes the data and the labels
    # Visualize the results on TSNE reduced data
    print "BUSY IN TSNE"
    model = TSNE(n_components=2, random_state=0)
    reduced_data = model.fit_transform(data)
    print "DATA REDUCED"

    # Plot the decision boundary. For that, we will assign a color to each
    x_min, x_max = reduced_data[:, 0].min(), reduced_data[:, 0].max()
    y_min, y_max = reduced_data[:, 1].min(), reduced_data[:, 1].max()
    plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)

    # Adds labels to the plot
    for label, x, y in zip(labels, reduced_data[:, 0], reduced_data[:, 1]):
        plt.annotate(
            label,
            xy=(x, y),
            xytext=(-20, 20),
            textcoords='offset points',
            ha='right',
            va='bottom',
            bbox=dict(boxstyle='round,pad=0.5', fc='green', alpha=0.5),
            arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0'))

    plt.title('TSNE Plot')
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.xticks(())
    plt.yticks(())
    plt.show()
def performDimensionalityReduction(context_vector, n_component, perplexity):
    '''
    Applies TSNE on the feature vector of each of the word instances and creates
    one model for each word type
    '''
    feature_vector_data = defaultdict(dict)
    word_type_model = {}

    for word_type, word_type_data in context_vector.iteritems():
        feature_vector_word_type = OrderedDict()

        # Reading in all the feature vectors for the given word type
        for data_type, instance_details in word_type_data.iteritems():
            for instance, context_details in instance_details.iteritems():
                # Training data will have the sense id's while test data will have ['<UNKNOWN>']
                senses = context_details.get('Sense')
                for sense in senses:
                    feature_vector_word_type[(instance, sense, data_type)] = context_details["Feature_Vector"]

        # Applying TSNE on all the feature vectors
        feature_vector_array = np.array(feature_vector_word_type.values())
        model = TSNE(n_components=n_component, random_state=0, perplexity=perplexity, metric="cosine")
        model.fit(feature_vector_array)

        # Storing the model since it will be needed to fit the test data
        word_type_model[word_type] = model

        # Converting to a structure of {WordType: {(instanceID, senseID): FeatureVector ... }}
        for i in range(len(feature_vector_word_type)):
            feature_vector_data[word_type][feature_vector_word_type.keys()[i]] = list(model.embedding_[i])

    return feature_vector_word_type, word_type_model
        'model': SVC(random_state=0, probability=True, kernel='rbf'),
        'methods': ['predict', 'predict_proba'],
        'dataset': 'classifier',
    },
    {
        'model': SVC(random_state=0, probability=True, kernel='linear'),
        'methods': ['predict', 'predict_proba'],
        'dataset': 'sparse',
    },
    {
        'model': SVC(random_state=0, probability=True, kernel='rbf'),
        'methods': ['predict', 'predict_proba'],
        'dataset': 'sparse',
    },
    {
        'model': TSNE(random_state=0),
        'methods': ['fit_transform'],
        'dataset': 'classifier',
    },
    {
        'model': KMeans(random_state=0, init="k-means++"),
        'methods': ['predict'],
        'dataset': 'blobs',
    },
    {
        'model': KMeans(random_state=0, init="random"),
        'methods': ['predict'],
        'dataset': 'blobs',
    },
    {
        'model': KMeans(random_state=0, init="k-means++"),
dist_neg = distances[identical == 0]

# plt.figure(figsize=(12,4))
#
# plt.subplot(121)
# plt.hist(dist_pos)
# plt.axvline(x=opt_tau, linestyle='--', lw=1, c='lightgrey', label='Threshold')
# plt.title('Distances (pos. pairs)')
# plt.legend();
#
# plt.subplot(122)
# plt.hist(dist_neg)
# plt.axvline(x=opt_tau, linestyle='--', lw=1, c='lightgrey', label='Threshold')
# plt.title('Distances (neg. pairs)')
# plt.legend();
####****Till Here****####

####****This code is used to plot the learning done by our model. Basically plots the points representing each image on a graph****####
targets = np.array([m.name for m in metadata])

from sklearn.manifold import TSNE  # sklearn stands for sci-kit learn library in python used for mathematical operations used in machine learning

X_embedded = TSNE(n_components=2).fit_transform(embedded)

for i, t in enumerate(set(targets)):
    idx = targets == t
    plt.scatter(X_embedded[idx, 0], X_embedded[idx, 1], label=t)

plt.legend(bbox_to_anchor=(1, 1))
plt.show()
####****Till Here****####
def sortTSNE(dataSpikes):
    (numSpikes, numChans, numSamples) = shape(dataSpikes.samples)
    allWaves = dataSpikes.samples.reshape(numSpikes, numChans * numSamples)

    model = TSNE(n_components=2, method='barnes_hut', verbose=20, n_iter=1000)
    Y = model.fit_transform(allWaves)
# Write logs
if (iteration < 5) or (iteration % 100 == 99):
    lib.plot.flush(outf, logfile)
lib.plot.tick()

# Generation and reconstruction
if iteration % 5000 == 4999:
    generate_image(iteration, _data)
    reconstruct_image(iteration)

# Latent space visualization
if iteration % 50000 == 49999:
    z_dev, z_mean_dev, y_dev = [], [], []
    for xb, yb in dev_gen():
        zb = session.run(q_z, feed_dict={real_x: xb})
        z_dev.append(zb)
        y_dev.append(yb)
    z_dev_2D = TSNE().fit_transform(np.vstack(z_dev))
    lib.visualization.scatter(
        data=z_dev_2D,
        label=np.hstack(y_dev),
        dir=outf,
        file_name='{}_mnist_manifold_{}.png'.format(MODE, iteration))

# Save model
if iteration == ITERS - 1:
    save_path = saver.save(
        session,
        os.path.join(outf, '{}_mnist_model_{}.ckpt'.format(MODE, iteration)))
# load risk factor docs
riskfactors_old = loadFacetDocs('./data/risk_abstracts.csv')
riskfactors = getRisks('./data/risk_sha.csv')

riskvectors = []
for factor in riskfactors:
    riskvectors.append(get_doc_vector(model, factor[1]))

print('start plotting')
# perplexity of 5 and learning rate of 500 gives good results
tsne = TSNE(n_components=2, perplexity=5, learning_rate=500)
num_abstract = np.array(model.docvecs.vectors_docs).shape[0]

#printSNE1(model.docvecs.vectors_docs)
#printSNE2(model.docvecs.vectors_docs, num_abstract)
#printSNE2(np.concatenate((model.docvecs.vectors_docs, riskvectors)), num_abstract)
#printClusterTasks(abstract_vectors, labels, k)
printTasksRisks(abstract_vectors, labels, k)

#print("Check dit")
#print(abstract_vectors[0:2])
#print(list_of_tasks[0:2])
#print(riskfactors[0:2])
# Step 7: Visualize the embeddings.
def plot_with_labels(low_dim_embs, labels, filename='tsne.png'):
    assert low_dim_embs.shape[0] >= len(labels), \
        "More labels than embeddings"
    plt.figure(figsize=(18, 18))  # in inches
    for i, label in enumerate(labels):
        x, y = low_dim_embs[i, :]
        plt.scatter(x, y)
        plt.annotate(label,
                     xy=(x, y),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')
    plt.savefig(filename)


try:
    from sklearn.manifold import TSNE
    import matplotlib.pyplot as plt

    tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
    plot_only = 500
    low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only, :])
    labels = [reverse_dictionary[i] for i in xrange(plot_only)]
    plot_with_labels(low_dim_embs, labels)
except ImportError:
    print("Please install sklearn and matplotlib to visualize embeddings.")
# In non-text mode, plot all characters that appear at least once in the corpus
charpoints = [
    i for i in range(256)
    if ((HIDE_OTHER_TYPES or char_type(i) in ALLOWED_TYPES)
        #and (char_type(i) not in ['non-ascii', 'unused'])  # hacky
        and (ALLOW_RARE or is_frequent(i)))
    or (not TEXT_MODE and char_counts[i] > 50)
]

if MODE == 'SNE':
    X_sne = TSNE(
        perplexity=4,
        n_iter=2000,
        learning_rate=25,
        n_iter_without_progress=100,  # the goggles do nothing
        #method='exact',
        early_exaggeration=4,
        verbose=2,
        random_state=8,
    ).fit_transform(embedding[charpoints])
elif MODE == 'tSVD':
    X_sne = sklearn.decomposition.TruncatedSVD(n_components=2).fit_transform(
        embedding[charpoints])
elif MODE == 'PCA':
    X_sne = sklearn.decomposition.PCA(n_components=2).fit_transform(
        embedding[charpoints])
else:
    assert False, "unrecognized mode"

plt.figure(figsize=(10, 10))
x_min, x_max = np.min(X_sne, 0), np.max(X_sne, 0)
        if not len(vecs):
            break
        m.append(idx)
        vecs = np.array(vecs)
        tweets_w2v_avg.append(np.mean(vecs, axis=0))
        point_labels.append(film_label[k][labels_w2v[idx]])
        break

tweets_w2v = tweets_w2v[m]
tweets_w2v_avg = np.array(tweets_w2v_avg)
print("Selected {} tweets".format(len(tweets_w2v_avg)))

# In[15]:

tsne = TSNE(n_components=2, random_state=0)
np.set_printoptions(suppress=True)
Y = tsne.fit_transform(tweets_w2v_avg)

# In[16]:

plt.scatter(Y[:, 0], Y[:, 1])

# In[20]:

# plt.scatter(Y[:, 0], Y[:, 1], c=c)
# for label, x, y in zip(tweets_w2v, Y[:, 0], Y[:, 1]):
#     plt.annotate(label, xy=(x, y), xytext=(0, 0), textcoords='offset points')

# In[21]:
def show_images(images, labels):
    _, figs = plt.subplots(1, len(images), figsize=(28, 28))
    for f, img, lbl in zip(figs, images, labels):
        f.imshow(img, cmap='gray')
        f.set_title(lbl)
        f.axes.get_xaxis().set_visible(False)
        f.axes.get_yaxis().set_visible(False)
    plt.show()


tsne = TSNE(n_components=hyper_params['n_components'],
            min_grad_norm=1e-5,
            init='pca',
            method='exact',
            angle=0.45,
            early_exaggeration=5,
            n_iter=1000)
pca = PCA(n_components=hyper_params['n_components'])
reduction_model = pca

all_code = np.concatenate([train_data, test_data], axis=0)
reduction_model.fit(all_code.reshape(all_code.shape[0], -1))
reduct_code = reduction_model.transform(all_code.reshape(all_code.shape[0], -1))
Q_code = reduct_code[:train_data.shape[0]]
Q1_code = reduct_code[train_data.shape[0]:]
train_len = Q_code.shape[0]
            genotypes[counter].append(2)
        if j == './.':
            genotypes[counter].append(9)
    counter += 1

# TRANSFORM TO tSNE
X = np.asarray(genotypes)
pca_for_tSNE = PCA(n_components=15).fit_transform(genotypes)
print(np.sum(PCA(n_components=10).fit(genotypes).explained_variance_ratio_))
X_embedded = TSNE(verbose=0, early_exaggeration=12.0, n_components=2, learning_rate=100.0,
                  n_iter=1000, perplexity=10.0).fit_transform(pca_for_tSNE)
#print(X_embedded.shape)

# PLOTTING
plt.figure(figsize=(100, 60))
COLORPALLETE = get_colors(len(set(true_labels)))
COLORZ_TO_LABELS = {}
uniquelabels = [x for x in set(true_labels)]
for j in range(0, len(uniquelabels)):
    COLORZ_TO_LABELS[uniquelabels[j]] = COLORPALLETE[j]
colors = [COLORZ_TO_LABELS[x] for x in true_labels]
def fit_tsne(x, n_components=2, init='pca', *args, **kwargs):
    return TSNE(n_components=n_components, init=init, *args, **kwargs).fit_transform(x)
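A minimal usage sketch for the fit_tsne wrapper above, assuming numpy and sklearn.manifold.TSNE are importable; the random input and parameter values are purely illustrative.

import numpy as np

x = np.random.rand(200, 50)   # 200 hypothetical 50-dimensional feature vectors
# extra keyword arguments are forwarded to the TSNE constructor
emb = fit_tsne(x, n_components=2, init='pca', perplexity=30, random_state=0)
print(emb.shape)              # (200, 2)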
'''
INSTRUCTIONS

* Import TSNE from sklearn.manifold.
* Create a TSNE instance called model with learning_rate=50.
* Apply the .fit_transform() method of model to normalized_movements. Assign the result to tsne_features.
* Select column 0 and column 1 of tsne_features.
* Make a scatter plot of the t-SNE features xs and ys. Specify the additional keyword argument alpha=0.5.
* Code to label each point with its company name has been written for you using plt.annotate(), so just hit 'Submit Answer' to see the visualization!
'''

# Import TSNE
from sklearn.manifold import TSNE

# Create a TSNE instance: model
model = TSNE(learning_rate=50)

# Apply fit_transform to normalized_movements: tsne_features
tsne_features = model.fit_transform(normalized_movements)

# Select the 0th feature: xs
xs = tsne_features[:, 0]

# Select the 1th feature: ys
ys = tsne_features[:, 1]

# Scatter plot
plt.scatter(xs, ys, alpha=0.5)

# Annotate the points
for x, y, company in zip(xs, ys, companies):
minCount = 20
s = 250
w = 4

skip_model = Word2Vec(data, min_count=minCount, iter=5, size=s, window=w, sg=1)
skip_model.save('SkipGramFile')
print("size = %d" % s)
print("window = %d" % w)

store_model = g.Doc2Vec.load('SkipGramFile')
vocab = list(store_model.wv.vocab)
X = store_model[vocab]

# Represent as a two-dimensional graph
tsne = TSNE(n_components=2)
X_tsne = tsne.fit_transform(X)
#print(len(X_tsne))

# Build the table
df = pd.DataFrame(X_tsne, index=vocab[:], columns=['x', 'y'])
df.shape
#print(df)

# Draw the plot
fig = plt.figure()
fig.set_size_inches(40, 20)
ax = fig.add_subplot(1, 1, 1)
ax.scatter(df['x'], df['y'])

for word, pos in df.iterrows():
    ax.annotate(word, pos, fontsize=30)
data = pickle.load(open(dir + 'sentence_text.p', 'rb'))
document = data[1]

# compiled sentences
compiled_sentences = data[0]
for i in range(len(data)):
    compiled_sentences += data[i]
print(compiled_sentences)

model = gensim.models.Word2Vec(compiled_sentences, min_count=5)
print(model.similarity('Greece', 'January'))
print(
    model.most_similar(positive=['woman', 'bailout'],
                       negative=['finance'],
                       topn=1))

X_tot = list()
for word in model.wv.vocab:
    X_tot.append(model.wv[word])
X_tot = np.array(X_tot)

## visualize embedding using t-sne
X_embedded = TSNE(n_components=2, verbose=True, perplexity=40).fit_transform(X_tot)
X_embedded.shape

## plot the t-sne result
plt.figure()
plt.scatter(X_embedded[:, 0], X_embedded[:, 1])
plt.show()
if args.device == 'cuda':
    torch.cuda.set_device(args.gpu)
    torch.cuda.manual_seed(args.seed)
    torch.manual_seed(args.seed)
else:
    torch.manual_seed(args.seed)
np.random.seed(args.seed)
random.seed(args.seed)

x = np.load('/Users/tanyue/Desktop/saved/protos/' + args.alg + '_protos.npy', allow_pickle=True)
y = np.load('/Users/tanyue/Desktop/saved/protos/' + args.alg + '_labels.npy', allow_pickle=True)
# d = np.load('../protos/' + args.alg + '_idx.npy', allow_pickle=True)

tsne = TSNE()
x = tsne.fit_transform(x)
# x = x[:,0:2]
y = y.reshape((-1, 1))
# d = d.reshape((-1, 1))

# visualize(args, x, y, d)
visualize(args, x, y)

# from mlxtend.plotting import plot_decision_regions
# from sklearn.svm import SVC
# from mlxtend.data import iris_data
# clf = SVC(random_state=0, probability=True)
#
# X, y = iris_data()
# X = X[:,[0, 2]]
# for node, _ in model.most_similar('gk'):
#     # Show only players
#     if len(node) > 3:
#         print(node)
#
# for node, _ in model.most_similar('real_madrid'):
#     print(node)
#
# for node, _ in model.most_similar('paulo_dybala'):
#     print(node)

# Visualization
player_nodes = [x for x in model.vocab if len(x) > 3 and x not in clubs]
embeddings = np.array([model[x] for x in player_nodes])
tsne = TSNE(n_components=2, random_state=7, perplexity=15)
embeddings_2d = tsne.fit_transform(embeddings)

# Assign colors to players
team_colors = {
    'real_madrid': 'lightblue',
    'chelsea': 'b',
    'manchester_utd': 'r',
    'manchester_city': 'teal',
    'juventus': 'gainsboro',
    'napoli': 'deepskyblue',
    'fc_bayern': 'tomato'
}
data['color'] = data['club'].apply(lambda x: team_colors[x])
player_colors = dict(zip(data['name'], data['color']))
def main():
    csvdata = read_points()
    X = len(csvdata[0])
    Y = len(csvdata)
    New_matrix = np.zeros([Y, X])
    for y in range(Y):
        for x in range(X):
            New_matrix[y, x] = csvdata[y][x]

    # matrix holding the three derived (difference) statistics
    add_matrix = np.zeros([Y, 3])
    for y in range(Y):
        add_matrix[y, 0] = New_matrix[y, 0] - New_matrix[y, 1]
        add_matrix[y, 1] = New_matrix[y, 2] - New_matrix[y, 3]
        add_matrix[y, 2] = New_matrix[y, 4] - New_matrix[y, 5]

    temp_mean = add_matrix[:, 0].mean()
    temp_mean1 = add_matrix[:, 1].mean()
    temp_mean2 = add_matrix[:, 2].mean()
    col_std = np.std(add_matrix, axis=0)
    col_mean = np.mean(add_matrix, axis=0)

    # csvdata needs to be centered and standardized
    for y in range(Y):
        for x in range(0, 3):
            add_matrix[y, x] -= col_mean[x]
            add_matrix[y, x] /= col_std[x]

    # for y in range(Y):
    #     temp = str(add_matrix[y,0]) + "," + str(add_matrix[y,1]) + "," + str(add_matrix[y,2]) + "\n"
    #     saveresult(temp)

    predict_matrix = np.array([(7898765467.24, 3235676823.00, 3957177004.54, 3444000321.55, 5432112345.77, 2900000089.12),
                               (133241575988.56, 39872238928.11, 14551119352.78, 3164290276.21, 3444305407.86, 1015886389.47),
                               (93805217949.67, 34975605193.08, 2326015727.05, 1922978314.70, 2273603448.91, 4777927001.24)])
    predict_subtract = np.zeros([3, 3])
    for i in range(3):
        predict_subtract[i, 0] = predict_matrix[i, 0] - predict_matrix[i, 1]
        predict_subtract[i, 1] = predict_matrix[i, 2] - predict_matrix[i, 3]
        predict_subtract[i, 2] = predict_matrix[i, 4] - predict_matrix[i, 5]
    for i in range(3):
        for x in range(0, 3):
            predict_subtract[i, x] -= col_mean[x]
            predict_subtract[i, x] /= col_std[x]

    matrix2list = add_matrix.tolist()
    print(matrix2list)
    print(add_matrix)

    # dimensionality reduction for display before clustering
    # Dimensionality_reduction(matrix2list)
    X_pca = PCA(n_components=2).fit_transform(add_matrix)
    t1 = 20
    t2 = 15
    gc = ca.Canopy(X_pca)
    gc.setThreshold(t1, t2)
    canopies = gc.clustering()
    # showCanopy(canopies, X_pca, t1, t2)

    all_vrc = []
    all_silh = []
    sub = []
    for k in range(20):
        # k-means clustering
        if k == 0 or k == 1:
            continue
        clf = KMeans(n_clusters=k, init='k-means++')
        y_pred = clf.fit_predict(matrix2list)
        add_pred = clf.predict(predict_subtract.tolist())
        # print(clf)
        # print(y_pred)
        sub.append(k)
        VRC = metrics.calinski_harabaz_score(add_matrix, y_pred)
        all_vrc.append(VRC)
        silh = metrics.silhouette_score(add_matrix, y_pred, metric='euclidean')
        all_silh.append(silh)
        print("k= ", k)
        print('VRC variance ratio:', VRC)
        # print('silhouette coefficient:%10.3f' % silh)
        print('silhouette coefficient:', silh)

        lines = len(y_pred)
        static_result = np.zeros([1, k])
        first = 0
        second = 0
        third = 0
        for item in y_pred:
            for i in range(0, k):
                if item == i:
                    static_result[0, i] += 1
        for i, j in enumerate(y_pred):
            if j == 1:
                print(i, j)
        for i in range(0, k):
            print("Share of class " + str(i) + " data: " + str(static_result[0, i] * 100 / lines) + "%")

        # dimensionality reduction to display the data
        X_tsne = TSNE(learning_rate=100).fit_transform(matrix2list)
        X_pca = PCA().fit_transform(matrix2list)
        # dimensionality reduction for the single points
        signal_pca = PCA().fit_transform(predict_subtract.tolist())
        plt.close()
        fig = plt.figure()
        plt.ion()  # interactive mode on
        plt.subplot(121)
        plt.title("T-SNE")
        plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=y_pred)
        plt.subplot(122)
        plt.title("PCA")
        plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y_pred)
        plt.scatter(signal_pca[:, 0], signal_pca[:, 1], c='r')
        plt.pause(1)

    plt.figure(figsize=(10, 5))
    plt.subplot(121)
    plt.title("VRC")
    # plt.scatter(sub, all_vrc, marker='o')
    plt.plot(all_vrc)
    plt.subplot(122)
    plt.title("silh")
    # plt.scatter(sub, all_silh, marker='x')
    plt.plot(all_silh)
    plt.show()

    print("Share of class 1 data: " + str(((first * 100) / lines)) + "%")
    print("Share of class 2 data: " + str(((second * 100) / lines)) + "%")
    print("Share of class 3 data: " + str(((third * 100) / lines)) + "%")
    temp = "Class 1 share of the total: " + str(((first * 100) / lines)) + "%\n" + \
           "Class 2 share of the total: " + str(((second * 100) / lines)) + "%\n" + \
           "Class 3 share of the total: " + str(((third * 100) / lines)) + "%\n"
    saveresult(temp)

    X_tsne = TSNE(learning_rate=100).fit_transform(matrix2list)
    X_pca = PCA().fit_transform(matrix2list)
    plt.figure(figsize=(10, 5))
    plt.subplot(121)
    plt.title("T-SNE")
    plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=y_pred)
    plt.subplot(122)
    plt.title("PCA")
    plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y_pred)
    plt.show()
plot_conf_matrix(y_test, y_predSGD, "Stochastic Gradient Descent")

# ### Feature Importance

# ### Clustering using Dimensionality Reduction.

# In[77]:

from sklearn.manifold import TSNE
from sklearn.decomposition import PCA, TruncatedSVD
import matplotlib.patches as mpatches

# In[78]:

# T-SNE with Original Data.
X_reduced_tsne = TSNE(n_components=2, random_state=42).fit_transform(X.values)

# In[79]:

plt.scatter(X_reduced_tsne[:, 0], X_reduced_tsne[:, 1], c=(y == 0), cmap='coolwarm',
            label='No Fraud', linewidths=2)
plt.scatter(X_reduced_tsne[:, 0], X_reduced_tsne[:, 1], c=(y == 1), cmap='rainbow',
            label='Fraud', linewidths=2)
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import pylab
import numpy as np

d = np.load('feature.npy').item()
X = d['feature']
labels = d['label']

data_pca_tsne = TSNE(n_components=2).fit_transform(X)
cls_num = -45

# pylab.figure()
pylab.scatter(data_pca_tsne[cls_num * 5:, 0], data_pca_tsne[cls_num * 5:, 1], 10,
              np.zeros_like(labels[cls_num * 5:]))
pylab.scatter(data_pca_tsne[:cls_num * 5, 0], data_pca_tsne[:cls_num * 5, 1], 10,
              labels[:cls_num * 5])
pylab.savefig('tsne.pdf')
y = dataset.iloc[:, 0]
labels, numbers = pd.factorize(y)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X = sc.fit_transform(X)

# Applying PCA
from sklearn.decomposition import PCA
pca = PCA(n_components=19)
Xpca = pca.fit_transform(X)

# Applying tSNE
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2)
Xnew = tsne.fit_transform(Xpca)

# Scatter Plot
cdict = {
    0: 'yellow',
    1: 'red',
    2: 'blue',
    3: 'green',
    4: 'black',
    5: 'orange',
    6: 'pink'
}
fig, ax = plt.subplots()
for g in np.unique(labels):
    ix = np.where(labels == g)
def run_pca(self, vals, clrs):
    my_mat = np.matrix(vals)
    #my_pca = PCA(n_components = 20).fit(my_mat.getT())
    my_pca = PCA().fit(my_mat.getT())
    my_pts = my_pca.transform(my_mat.getT())
    coefs = my_pca.components_

    top_coefs = []
    top_dict = []
    for i, sc in enumerate(coefs):
        sranked = sorted([(sj, self.f_names[j]) for j, sj in enumerate(sc)], reverse=True)
        sHalf = int(len(sranked) / 2.0)
        sForward = sranked[0:sHalf - 2]
        sBack = sranked[-1::-1][0:sHalf - 2]
        kh, kl, listH, listL = 0, 0, [], []
        dictH, dictL = dd(int), dd(int)

        for z, (scr, ch) in enumerate(sForward):
            #print 'coef',i,'POS','rank stuff',z,scr,ch
            ns = self.summarize(ch)
            dictH[ns] += 1
            if ns not in [x[1] for x in listH]:
                listH.append((scr, ns))
                if len(listH) > 40:
                    break
            if z > 40:
                break

        for z, (scr, ch) in enumerate(sBack):
            #print 'coef',i,'NEG','rank stuff',z,scr,ch
            ns = self.summarize(ch)
            dictL[ns] += 1
            if ns not in [x[1] for x in listL]:
                listL.append((scr, ns))
                if len(listL) > 40:
                    break
            if z > 40:
                break

        top_coefs.append([listH, listL])
        top_dict.append([dictH, dictL])
        if i > 0:
            break

    for n in range(len(my_pts)):
        p = my_pts[n]
        c = clrs[n]
        if c == 'magenta':
            #print p[0],p[1],c
            if p[1] < 0.85:
                my_pts[n][1] += 1
        if c == 'cyan':
            #print p[0],p[1],c
            if p[0] < 1:
                my_pts[n][0] += 1

    tsne = TSNE(n_components=2, verbose=0, perplexity=100, n_iter=5000)
    ts = tsne.fit_transform(my_pts)

    return my_pts, ts, top_coefs, top_dict
def word2vec_basic(log_dir): """Example of building, training and visualizing a word2vec model.""" # Create the directory for TensorBoard variables if there is not. if not os.path.exists(log_dir): os.makedirs(log_dir) # # Step 1: Download the data. # # Note: Source website does not support HTTPS right now. # url = 'http://mattmahoney.net/dc/' # # # pylint: disable=redefined-outer-name # def maybe_download(filename, expected_bytes): # """Download a file if not present, and make sure it's the right size.""" # if not os.path.exists(filename): # filename, _ = urllib.request.urlretrieve(url + filename,filename) # #获取文件的相关属性信息 # statinfo = os.stat(filename) # #判断文件大小是否相等 # if statinfo.st_size == expected_bytes: # print('Found and verified', filename) # else: # print(statinfo.st_size) # raise Exception('Failed to verify ' + filename +'. Can you get to it with a browser?') # return filename #filename = maybe_download('text8.zip',31344016) filename = r'D:\程序\Text-classification-CNN\text8.zip' # Read the data into a list of strings. def read_data(filename): """Extract the first file enclosed in a zip file as a list of words.""" with zipfile.ZipFile(filename) as f: data = tf.compat.as_str(f.read(f.namelist()[0])).split() return data vocabulary = read_data(filename) #list()列表 print('Data size', len(vocabulary)) # Step 2: Build the dictionary and replace rare words with UNK token. vocabulary_size = 50000 def build_dataset(words, n_words): """Process raw inputs into a dataset.""" count = [['UNK', -1]] #二维数组 count.extend(collections.Counter(words).most_common(n_words - 1)) #截取前49999个高频词 #dictionary为{},key为word value为index dictionary = {word: index for index, (word, _) in enumerate(count) } #不需要但又必须定义的变量点以为“_” data = [] unk_count = 0 for word in words: index = dictionary.get(word, 0) if index == 0: # dictionary['UNK'] 最后统计下所有的低频词UNK的个数 unk_count += 1 data.append(index) count[0][1] = unk_count reversed_dictionary = dict( zip(dictionary.values(), dictionary.keys())) #反转字典 key为index value为word return data, count, dictionary, reversed_dictionary # Filling 4 global variables:(获取训练集中的信息,保存在下面的全局变量中) # data - list of codes (integers from 0 to vocabulary_size-1). # This is the original text but words are replaced by their codes # count - map of words(strings) to count of occurrences # dictionary - map of words(strings) to their codes(integers) # reverse_dictionary - map of codes(integers) to words(strings) data, count, unused_dictionary, reverse_dictionary = build_dataset( vocabulary, vocabulary_size) del vocabulary # Hint to reduce memory. print('Most common words (+UNK)', count[:5]) print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]]) # Step 3: Function to generate a training batch for the skip-gram model. 
def generate_batch(batch_size, num_skips, skip_window): global data_index #代表目前训练数据段的其实位置 assert batch_size % num_skips == 0 #断言 如果batch_size % num_skips!=0 程序报错中断 assert num_skips <= 2 * skip_window batch = np.ndarray(shape=(batch_size), dtype=np.int32) #batch内为8个word的数字索引 labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32) span = 2 * skip_window + 1 # [ skip_window target skip_window ] buffer = collections.deque(maxlen=span) # pylint: disable=redefined-builtin(双向队列) if data_index + span > len(data): data_index = 0 buffer.extend(data[data_index:data_index + span]) #data[0:3] 训练数据中索引为1,2,3的word,数据只在这有,其余的都为索引 data_index += span for i in range(batch_size // num_skips): #整除 4 i:0,1,2,3 context_words = [w for w in range(span) if w != skip_window] #获取上下文的word [0,2] words_to_use = random.sample(context_words, num_skips) #[0,2] ? for j, context_word in enumerate( words_to_use): #j:0,1 context:0,2 batch[i * num_skips + j] = buffer[ skip_window] #batch[n] n:"0,1","2,3","4,5","6,7"每两个batch(target_word)内数据相同 labels[i * num_skips + j, 0] = buffer[ context_word] #labels[n,0] n:0,1,2,3,4,5,6,7 设计的还是很巧妙的 #labels稍微复杂一些,labels[0,0]=buffer[0] labels[1,0]=buffer[2] labels[2,0]=buffer[1] lables[3,0]=buffer[3]... if data_index == len(data): buffer.extend(data[0:span]) data_index = span else: buffer.append(data[data_index]) #这里会改buffer内的值 data_index += 1 # Backtrack a little bit to avoid skipping words in the end of a batch data_index = (data_index - span) % len(data) return batch, labels #batch_size训练一批单词的个数、num_skips batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1) for i in range(8): #测试下效果 print(batch[i], reverse_dictionary[batch[i]], '->', labels[i, 0], reverse_dictionary[labels[i, 0]]) # Step 4: Build and train a skip-gram model. batch_size = 128 embedding_size = 128 # Dimension of the embedding vector. skip_window = 1 # How many words to consider left and right. num_skips = 2 # How many times to reuse an input to generate a label. num_sampled = 64 # Number of negative examples to sample. 负采样:减小计算量,达到较好效果的一种方式 # We pick a random validation set to sample nearest neighbors. Here we limit # the validation samples to the words that have a low numeric ID, which by # construction are also the most frequent. These 3 variables are used only for # displaying model accuracy, they don't affect calculation. valid_size = 16 # Random set of words to evaluate similarity on. valid_window = 100 # Only pick dev samples in the head of the distribution. valid_examples = np.random.choice(valid_window, valid_size, replace=False) graph = tf.Graph() with graph.as_default(): # Input data. with tf.name_scope('inputs'): train_inputs = tf.placeholder(tf.int32, shape=[batch_size]) train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1]) valid_dataset = tf.constant(valid_examples, dtype=tf.int32) #验证集 # Ops and variables pinned to the CPU because of missing GPU implementation with tf.device('/gpu:0'): #'/cpu:0' # Look up embeddings for inputs. 
with tf.name_scope('embeddings'): embeddings = tf.Variable( #定义 词向量 维度为:50000*128 50000个词,每个词128个维度 tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0)) #抽取要训练的词,train_inputs就是要训练的词,训练哪些就从embeddings中抽取出来 #embedding_lookup(params, ids),比如说ids=[1,7,4],就是返回params中的1,7,4行组成的tensor embed = tf.nn.embedding_lookup(embeddings, train_inputs) # Construct the variables for the NCE loss 噪声对比工具(负采样) with tf.name_scope('weights'): nce_weights = tf.Variable( tf.truncated_normal([vocabulary_size, embedding_size], stddev=1.0 / math.sqrt(embedding_size))) with tf.name_scope('biases'): nce_biases = tf.Variable(tf.zeros([vocabulary_size])) # Compute the average NCE loss for the batch. # tf.nce_loss automatically draws a new sample of the negative labels each # time we evaluate the loss. # Explanation of the meaning of NCE loss and why choosing NCE over tf.nn.sampled_softmax_loss: # http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model/ # http://papers.nips.cc/paper/5165-learning-word-embeddings-efficiently-with-noise-contrastive-estimation.pdf with tf.name_scope('loss'): loss = tf.reduce_mean( #二次代价函数 tf.nn.nce_loss( #nce负采样 weights=nce_weights, biases=nce_biases, labels=train_labels, inputs=embed, num_sampled=num_sampled, num_classes=vocabulary_size)) # Add the loss value as a scalar to summary. tf.summary.scalar('loss', loss) # Construct the SGD optimizer using a learning rate of 1.0. with tf.name_scope('optimizer'): optimizer = tf.train.GradientDescentOptimizer(1.0).minimize( loss) #梯度下降法 # Compute the cosine similarity between minibatch examples and all 余弦相似度比较测试集数据 # embeddings. norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keepdims=True)) normalized_embeddings = embeddings / norm valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset) similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True) # Merge all summaries. merged = tf.summary.merge_all() # Add variable initializer. init = tf.global_variables_initializer() # Create a saver. saver = tf.train.Saver() # Step 5: Begin training. num_steps = 100001 with tf.Session(graph=graph) as session: # Open a writer to write summaries. writer = tf.summary.FileWriter(log_dir, session.graph) # We must initialize all variables before we use them. init.run() print('Initialized') average_loss = 0 for step in xrange(num_steps): batch_inputs, batch_labels = generate_batch( batch_size, num_skips, skip_window) feed_dict = { train_inputs: batch_inputs, train_labels: batch_labels } # Define metadata variable. run_metadata = tf.RunMetadata() # We perform one update step by evaluating the optimizer op (including it # in the list of returned values for session.run() # Also, evaluate the merged op to get all summaries from the returned # "summary" variable. Feed metadata variable to session for visualizing # the graph in TensorBoard. _, summary, loss_val = session.run([optimizer, merged, loss], feed_dict=feed_dict, run_metadata=run_metadata) average_loss += loss_val # Add returned summaries to writer in each step. writer.add_summary(summary, step) # Add metadata to visualize the graph for the last run. if step == (num_steps - 1): writer.add_run_metadata(run_metadata, 'step%d' % step) if step % 2000 == 0: if step > 0: average_loss /= 2000 # The average loss is an estimate of the loss over the last 2000 # batches. 
print('Average loss at step ', step, ': ', average_loss) average_loss = 0 # Note that this is expensive (~20% slowdown if computed every 500 steps) if step % 10000 == 0: sim = similarity.eval() for i in xrange(valid_size): valid_word = reverse_dictionary[valid_examples[i]] top_k = 8 # number of nearest neighbors nearest = (-sim[i, :]).argsort()[1:top_k + 1] log_str = 'Nearest to %s:' % valid_word #打印16个测试集中前8个比较相似的词,好的词向量模型 比较相似的词的余弦距离也是比较相近的 print( log_str, ', '.join([ reverse_dictionary[nearest[k]] for k in range(top_k) ])) final_embeddings = normalized_embeddings.eval() # Write corresponding labels for the embeddings. with open(log_dir + '/metadata.tsv', 'w') as f: for i in xrange(vocabulary_size): f.write(reverse_dictionary[i] + '\n') # Save the model for checkpoints. saver.save(session, os.path.join(log_dir, 'model.ckpt')) # Create a configuration for visualizing embeddings with the labels in # TensorBoard. config = projector.ProjectorConfig() embedding_conf = config.embeddings.add() embedding_conf.tensor_name = embeddings.name embedding_conf.metadata_path = os.path.join(log_dir, 'metadata.tsv') projector.visualize_embeddings(writer, config) writer.close() # Step 6: Visualize the embeddings. # pylint: disable=missing-docstring # Function to draw visualization of distance between embeddings. def plot_with_labels(low_dim_embs, labels, filename='tsne.png'): assert low_dim_embs.shape[0] >= len( labels), 'More labels than embeddings' plt.figure(figsize=(18, 18)) # in inches for i, label in enumerate(labels): x, y = low_dim_embs[i, :] plt.scatter(x, y) plt.annotate(label, xy=(x, y), xytext=(5, 2), textcoords='offset points', ha='right', va='bottom') plt.savefig(filename) try: # pylint: disable=g-import-not-at-top from sklearn.manifold import TSNE #把词向量通过TSNE降维的方式给画出来 import matplotlib #matplotlib.use("Agg") matplotlib.use("Pdf") import matplotlib.pyplot as plt tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000, method='exact') plot_only = 500 low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only, :]) labels = [reverse_dictionary[i] for i in xrange(plot_only)] plot_with_labels(low_dim_embs, labels) except ImportError as ex: print( 'Please install sklearn, matplotlib, and scipy to show embeddings.' ) print(ex)
# restored original order (not sorted by length)
for i, predict_f in enumerate(predict_fs):
    predict_features[sorted_indexes[i] + batch_val] = predict_f
predict_features = torch.stack(predict_features)

# get test label csv content
dict = getVideoList(os.path.join(test_label_path))
action_labels = (dict['Action_labels'])

# tSNE to visualize
#x_t = (t_features.cpu()).numpy()
#y_t = (t_label.cpu()).numpy()
X = np.array(predict_features.tolist())
Y = np.array(dict['Action_labels']).astype(int)

tsne = TSNE(n_components=2, random_state=0)

# Project the data in 2D
X_2d = tsne.fit_transform(X)

# Visualize the data
target_names = ['0others', '1Inspect/Read', '2Open', '3Take', '4Cut', '5Put',
                '6Close', '7Move_around', '8Divide/Pull_apart', '9Pour', '10Transfer']
target_ids = range(len(target_names))  # 0~10 digits

fig1 = plt.figure(figsize=(12, 10)).suptitle('tSNE plot of cnn_based action recognition')
colors = 'r', 'g', 'b', 'c', 'm', 'y', 'k', 'peru', 'orange', 'purple', 'indigo'
for i, c, label in zip(target_ids, colors, target_names):
    plt.scatter(X_2d[Y == i, 0], X_2d[Y == i, 1], c=c, label=label)
W_outer = tf.Variable(tf.random_normal([emb_dims, vocab_size], mean=0.0, stddev=0.02, dtype=tf.float32))
b_outer = tf.Variable(tf.random_normal([vocab_size], mean=0.0, stddev=0.02, dtype=tf.float32))

hidden = tf.add(tf.matmul(x, W), b)
logits = tf.add(tf.matmul(hidden, W_outer), b_outer)

cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)

epochs, batch_size = 100, 10
batch = len(x_train) // batch_size

# iterate n_iter times
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print 'was here'
    for epoch in xrange(epochs):
        batch_index = 0
        for batch_num in xrange(batch):
            x_batch = x_train[batch_index: batch_index + batch_size]
            y_batch = y_train[batch_index: batch_index + batch_size]
            sess.run(optimizer, feed_dict={x: x_batch, y: y_batch})
            print('epoch:', epoch, 'loss :', sess.run(cost, feed_dict={x: x_batch, y: y_batch}))
    W_embed_trained = sess.run(W)

W_embedded = TSNE(n_components=2).fit_transform(W_embed_trained)
plt.figure(figsize=(10, 10))
for i in xrange(len(W_embedded)):
    plt.text(W_embedded[i, 0], W_embedded[i, 1], ind2word[i])
plt.xlim(-150, 150)
plt.ylim(-150, 150)
km = KMeans(n_clusters=k_cluster, random_state=0)
km.fit(tweet_vecs)
predictions_km = km.predict(tweet_vecs)

# birch
n_clusters = 7
brc = Birch(branching_factor=500, n_clusters=n_clusters, threshold=0.5, compute_labels=True)
brc.fit(tweet_vecs)
predictions = brc.predict(tweet_vecs)
#pdb.set_trace()

# tsne
model = TSNE(n_components=2, random_state=0)
tsne_vecs = model.fit_transform(tweet_vecs)

# visualize
ALL_COLORS = ['red', 'blue', "green", "orange", "yellow", "purple", "black",
              "brown", 'cyan', "gold", "grey"]


def get_colors(labels):
    colors = []
    for i in labels:
        if i > 11:
            print("Require more color")
    if subtitle != None:
        plt.suptitle(subtitle)
    plt.show()


# Getting a batch from training and validation data for visualization
x_train, y_train = get_batch(train_set, 32)
x_val, y_val = get_batch(valid_set, 32)
x_train = x_train.reshape(-1, 784)
x_val = x_val.reshape(-1, 784)

# Generating and visualizing t-SNE embeddings of the raw data
# of the first 512 samples.
tsne = TSNE()
train_tsne_embeds = tsne.fit_transform(x_train)
scatter(train_tsne_embeds, y_train, "Samples from Training Data")

eval_tsne_embeds = tsne.fit_transform(x_val)
scatter(eval_tsne_embeds, y_val, "Samples from Validation Data")

###
# Defining the quadruplet Loss function and Embedding model
###

# import tensorflow as tf

def all_diffs(a, b):
    # Returns a tensor of all combinations of a - b
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import KMeans
from knock67 import country_vector

country, country_name = country_vector()
tsne = TSNE(n_components=2, random_state=2021, perplexity=30, n_iter=1000)
embedded = tsne.fit_transform(country)
kmeans = KMeans(n_clusters=5, random_state=2021).fit_predict(country)

plt.figure(figsize=(10, 10))
colors = ["r", "g", "b", "c", "m"]
for i in range(embedded.shape[0]):
    plt.scatter(embedded[i][0], embedded[i][1], marker='.', color=colors[kmeans[i]])
    plt.annotate(country_name[i], xy=(embedded[i][0], embedded[i][1]), color=colors[kmeans[i]])
plt.show()
def plot_with_labels(low_dim_embs, labels, filename='tsne.png'):
    plt.figure(figsize=(18, 18))  # in inches
    for i, label in enumerate(labels):
        x, y = low_dim_embs[i, :]
        plt.scatter(x, y)
        plt.annotate(label,
                     xy=(x, y),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')
    plt.savefig(filename)


tsne = TSNE(perplexity=30.0, n_components=2, n_iter=5000)
low_dim_embqedding = tsne.fit_transform(data1)
plot_with_labels(low_dim_embqedding, labels1)

target_y = np_utils.to_categorical(irish_Data["target"])

model = Sequential()
model.add(Dense(units=64, input_shape=(4, ), activation='tanh'))
model.add(Dense(units=3, activation='softmax'))
print(model.summary())

model.compile(optimizer='sgd', loss='categorical_crossentropy', metrics=['accuracy'])
def tsne(
    s: pd.Series,
    n_components=2,
    perplexity=30.0,
    learning_rate=200.0,
    n_iter=1000,
    random_state=None,
    n_jobs=-1,
) -> pd.Series:
    """
    Performs TSNE on the given pandas series.

    t-distributed Stochastic Neighbor Embedding (t-SNE) is a machine learning
    algorithm used to visualize high-dimensional data in fewer dimensions.

    In natural language processing, the high-dimensional data is usually a
    document-term matrix (so in texthero usually a Series after applying
    :meth:`texthero.representation.tfidf` or some other first representation
    function that assigns a scalar (a weight) to each word) that is hard to
    visualize as there might be many terms. With t-SNE, every document gets a
    new, low-dimensional (n_components entries) vector in such a way that the
    differences / similarities between documents are preserved.

    Parameters
    ----------
    s : Pandas Series

    n_components : int, default is 2.
        Number of components to keep (dimensionality of output vectors).
        If n_components is not set or None, all components are kept.

    perplexity : float, optional (default: 30)
        The perplexity is related to the number of nearest neighbors that is
        used in other manifold learning algorithms. Larger datasets usually
        require a larger perplexity. Consider selecting a value between 5 and
        50. Different values can result in significantly different results.

    learning_rate : float, optional (default: 200.0)
        The learning rate for t-SNE is usually in the range [10.0, 1000.0].
        If the learning rate is too high, the data may look like a 'ball' with
        any point approximately equidistant from its nearest neighbours. If the
        learning rate is too low, most points may look compressed in a dense
        cloud with few outliers. If the cost function gets stuck in a bad local
        minimum increasing the learning rate may help.

    n_iter : int, optional (default: 1000)
        Maximum number of iterations for the optimization. Should be at
        least 250.

    random_state : int, default=None
        Determines the random number generator. Pass an int for reproducible
        results across multiple function calls.

    n_jobs : int, optional, default=-1
        The number of parallel jobs to run for neighbors search. ``-1`` means
        using all processors.

    Returns
    -------
    Pandas Series with the vector calculated by t-SNE for the document in
    every cell.

    Examples
    --------
    >>> import texthero as hero
    >>> import pandas as pd
    >>> s = pd.Series(["Football, Sports, Soccer", "Music, Violin, Orchestra", "Football, Music"])
    >>> s = s.pipe(hero.clean).pipe(hero.tokenize).pipe(hero.term_frequency)
    >>> hero.tsne(s, random_state=42)  # doctest: +SKIP
    0      [-18.833383560180664, -276.800537109375]
    1     [-210.60179138183594, 143.00535583496094]
    2    [-478.27984619140625, -232.97410583496094]
    dtype: object

    See also
    --------
    `t-SNE on Wikipedia <https://en.wikipedia.org/wiki/T-distributed_stochastic_neighbor_embedding>`_

    """
    tsne = TSNE(
        n_components=n_components,
        perplexity=perplexity,
        learning_rate=learning_rate,
        n_iter=n_iter,
        random_state=random_state,
        n_jobs=n_jobs,
    )
    return pd.Series(tsne.fit_transform(list(s)).tolist(), index=s.index)
    if not os.path.exists(os.path.dirname(save_path)):
        os.makedirs(os.path.dirname(save_path))
    fig = plt.gcf()
    fig.savefig(save_path, dpi=300)
    print('png saved in: ', save_path)


sns.set(rc={'figure.figsize': (11.7, 8.27)})
palette = sns.color_palette("bright", 2)

modal = 'audio'
work_dir = '/m_fusion_data/'
path = work_dir + f'representation/{modal}.npz'
print(path)
data = np.load(path)
print(data.files)

class_name = ['non-sarcastic', 'sarcastic']
X = data['repre']
y4 = data['label']
y4 = [class_name[yi] for yi in y4]

tsne = TSNE()
X_embedded = tsne.fit_transform(X)
# print(y4)
print(X_embedded.shape)

sns.scatterplot(X_embedded[:, 0], X_embedded[:, 1], hue=y4, legend='full', palette=palette)
plt.title(modal, fontsize=20)
plt.legend(fontsize=20)
# plt.show()
path = work_dir + f'representation/img/{modal}.png'
save_plot(path)