def test_set_rand_seed(self):
    """Check that two runs with the same ``rand_seed`` agree on the first point.

    Each coordinate of the first embedded point is divided by 5 and rounded
    before comparison. Both embeddings are drawn into one figure that is
    saved under ``PLOTS_DIR``.
    """
    features = load_iris().data
    first_run = tsne(features, rand_seed=999)
    second_run = tsne(features, rand_seed=999)

    # Compare x, then y, of point 0 on the coarse grid.
    for axis in (0, 1):
        self.assertEqual(round(first_run[0][axis] / 5),
                         round(second_run[0][axis] / 5))

    for embedding, colour in ((first_run, 'b'), (second_run, 'r')):
        plt.scatter(embedding[:, 0], embedding[:, 1], c=colour)
    plt.savefig(PLOTS_DIR + '/iris_set_rand_seed.png')
    # The figure is only displayed when SHOW_PLOTS is present in the environment.
    if os.environ.get('SHOW_PLOTS') is not None:
        plt.show()
    plt.close()
def test_without_seed_positions(self):
    """Plot embeddings of the truncated and the full iris data, without seeding.

    The iris data minus its last ten rows and the full data are embedded
    independently (same ``rand_seed``, no ``seed_positions``) and drawn into
    one figure saved under ``PLOTS_DIR``. The test makes no assertions; it
    only produces the plot.
    """
    # Each array comes from its own load_iris() call so the two tsne() runs
    # never share a buffer.
    X_a = load_iris().data[:-10]
    X_b = load_iris().data
    Y_a = tsne(X_a, rand_seed=999)
    Y_b = tsne(X_b, rand_seed=999)

    plt.scatter(Y_a[:, 0], Y_a[:, 1], c='b')
    # Full-data run: rows shared with X_a in red, the ten extra rows in green.
    plt.scatter(Y_b[:-10, 0], Y_b[:-10, 1], c='r')
    plt.scatter(Y_b[-10:, 0], Y_b[-10:, 1], c='g')
    plt.savefig(PLOTS_DIR + '/iris_without_seed_positions.png')
    if os.environ.get('SHOW_PLOTS') is not None:
        plt.show()
    plt.close()
def create_docvec_scatter(matrix, document_list):
    """Create a scatter plot of document vectors.

    The matrix is embedded into two dimensions with ``bhtsne.tsne`` and the
    points are drawn with seaborn, coloured by ``document_list["category1"]``.

    :param matrix: matrix of document vectors; must provide ``astype``
    :param document_list: mapping/frame whose ``"category1"`` entry holds one
        category per row of ``matrix`` (assumed same order - TODO confirm)
    :return: the 2-D embedding returned by ``bhtsne.tsne``
    """
    # ``float`` is float64; it avoids the ``sp.float64`` alias, which SciPy
    # only re-exported from NumPy and has deprecated.
    embedding = bhtsne.tsne(matrix.astype(float), dimensions=2,
                            perplexity=30.0, theta=0.5, rand_seed=-1)

    doc_tsne = pd.DataFrame({"x": embedding[:, 0], "y": embedding[:, 1]},
                            columns=["x", "y"])
    doc_tsne["category"] = list(document_list["category1"])

    sns.set_style("darkgrid")
    sns.set(font="IPAexGothic")
    # NOTE(review): newer seaborn releases call this keyword ``height``;
    # kept as ``size`` to match the version this file was written for.
    sns.lmplot(data=doc_tsne, x="x", y="y", hue="category",
               fit_reg=False, size=10)

    # Show the figure.
    plt.show()
    return embedding
def evaluate(self):
    """Fit one cluster model per value in ``self.k_range`` and report scores.

    For every cluster count the model built by ``self._cluster_model`` is
    fitted on ``self.data``; its labels colour a scatter plot of a t-SNE
    embedding of the data, and silhouette and Dunn scores are printed. After
    the loop the figure is shown, ``self._plot_elbow_method`` is called with
    the per-sample minimum centre distances, and the best silhouette and
    Dunn entries are printed. Nothing is returned.
    """
    # dimensionality reduction using Barnes-Hut implementation of t-SNE
    data_2d = tsne(np.array(self.data, dtype=np.float64))
    # Scores keyed by cluster count.
    silhouette = {}
    dunn_score = {}
    # NOTE(review): rand_score and nmi_score are filled below but never
    # printed or returned by this method.
    rand_score = {}
    nmi_score = {}
    # data for using elbow method
    centres_list = []
    k_distance_list = []
    cluster_index_list = []
    distance_list = []
    # Three-digit subplot code: first cell of a 3x3 grid.
    # NOTE(review): incremented once per cluster count, so more than nine
    # entries in k_range would step past 339 - confirm k_range is short enough.
    subplot_counter = 331
    for c in self.k_range:
        print("Number of clusters c:", c)
        # create model
        model = self._cluster_model(self.model_name, c)
        model.fit(self.data)
        labels = model.labels_
        # visualization evaluation - 2D Scatter plot of data
        plt.subplot(subplot_counter)
        plt.scatter(data_2d[:, 0], data_2d[:, 1], c=labels)
        subplot_counter += 1
        # elbow method calculations AIC
        centres_list.append(model.cluster_centers_)
        # Distance of every sample to every cluster centre.
        k_dist = cdist(self.data, model.cluster_centers_, 'euclidean')
        k_distance_list.append(k_dist)
        cluster_index_list.append(np.argmin(k_dist, axis=1))
        distance_list.append(np.min(k_dist, axis=1))
        # internal evaluation methods not using true labels
        silhouette[c] = silhouette_score(np.array(self.data), labels, metric='euclidean')
        print("silhoutete with cluster number: ", silhouette[c], c)
        dunn_score[c] = self.dunn_fast(self.data, labels)
        print("dunn with cluster number: ", dunn_score[c], c)
        # external evaluation methods using true labels (supervised datasets)
        # NOTE(review): truth-testing fails for a multi-element NumPy array;
        # presumably supervised_labels is a list or None - confirm.
        if self.supervised_labels:
            rand_score[c] = adjusted_rand_score(self.supervised_labels, labels)
            nmi_score[c] = normalized_mutual_info_score(
                self.supervised_labels, labels)
    # All subplots were drawn into the current figure; display it once.
    plt.show()
    self._plot_elbow_method(distance_list)
    # Best (cluster count, score) pair for each internal metric.
    print(max(silhouette.items(), key=operator.itemgetter(1)))
    print(max(dunn_score.items(), key=operator.itemgetter(1)))
    print("Silhoutete ", silhouette)
    print("Dunn ", dunn_score)
def fit_and_plot(self, X, label_list, cluster_list):
    """Embed ``X`` with Barnes-Hut t-SNE and save a per-cluster scatter plot.

    Points are grouped by comparing each entry of ``cluster_list`` with the
    integers ``0 .. len(np.unique(label_list)) - 1``. The figure is written
    to ``_fig/tsne/clustering_result.png``; nothing is returned.

    :param X: data matrix providing ``astype``
    :param label_list: labels; only the number of distinct values is used
    :param cluster_list: one cluster id per row of ``X``
    """
    # ``float`` is float64; avoids the deprecated ``scipy.float64`` alias.
    data_tsne = bhtsne.tsne(
        X.astype(float),
        dimensions=self.dimensions,
        perplexity=self.perplexity,
        theta=self.theta,
        rand_seed=self.rand_seed)
    xmin, xmax = data_tsne[:, 0].min(), data_tsne[:, 0].max()
    ymin, ymax = data_tsne[:, 1].min(), data_tsne[:, 1].max()

    # Split the embedding per label.
    n_columns = data_tsne.shape[1]
    data_dict = {}
    for label in range(len(np.unique(label_list))):
        members = [data_tsne[idx]
                   for idx, cluster in enumerate(cluster_list)
                   if cluster == label]
        # reshape keeps an empty group two-dimensional, so the column
        # slicing below works instead of raising IndexError.
        data_dict[str(label)] = np.array(members).reshape(-1, n_columns)
    print([len(data) for data in data_dict.values()])

    plt.figure(figsize=(15, 10))
    for m, data in data_dict.items():
        # NOTE(review): ``cmap(int(m))`` looks like a single colour, but it is
        # passed as ``cmap=``; presumably ``color=`` was intended - confirm.
        plt.scatter(data[:, 0], data[:, 1], cmap=cmap(int(m)),
                    label='label.{}'.format(m))
    plt.legend()
    plt.axis([xmin, xmax, ymin, ymax])
    plt.xlabel('component 0')
    plt.ylabel('component 1')
    plt.title('t-SNE visualization')

    savedir = os.path.join('_fig', 'tsne')
    # makedirs also creates the missing parent ``_fig``; os.mkdir would fail.
    if not os.path.isdir(savedir):
        os.makedirs(savedir)
    plt.savefig(os.path.join(savedir, 'clustering_result.png'))
    plt.close()
def fit_transform(self, X):
    """Return the Barnes-Hut t-SNE embedding of ``X``.

    The dimensions, perplexity, theta, random seed and iteration limit are
    read from this object's attributes and forwarded to ``bhtsne.tsne``.

    :param X: data matrix providing ``astype``
    :return: whatever ``bhtsne.tsne`` returns for the converted data
    """
    # ``float`` is float64; avoids the deprecated ``sp.float64`` alias.
    return bhtsne.tsne(
        X.astype(float),
        dimensions=self.dimensions,
        perplexity=self.perplexity,
        theta=self.theta,
        rand_seed=self.rand_seed,
        max_iter=self.max_iter)
def _dim_reduct(self, vecs):
    """Reduce ``vecs`` to two dimensions with ``bhtsne.tsne``.

    Fixed settings: perplexity 5.0, theta 0.5, ``rand_seed=-1``.
    """
    as_float64 = vecs.astype(np.float64)
    tsne_options = dict(dimensions=2, perplexity=5.0, theta=0.5, rand_seed=-1)
    return bhtsne.tsne(as_float64, **tsne_options)
def fit_transform(self, x):
    """Embed ``x`` with ``bhtsne.tsne`` using this object's settings.

    ``self.n_dim``, ``self.perplexity``, ``self.theta`` and ``self.seed`` are
    forwarded as ``dimensions``, ``perplexity``, ``theta`` and ``rand_seed``.
    """
    as_float64 = x.astype(np.float64)
    return bhtsne.tsne(
        as_float64,
        dimensions=self.n_dim,
        perplexity=self.perplexity,
        theta=self.theta,
        rand_seed=self.seed,
    )
def fit_transform(self, data: 'np.ndarray | pd.DataFrame') -> 'np.ndarray | pd.DataFrame':
    """
    fit the tSNE model to data given the parameters provided during
    initialization and transform the output

    The result is also stored on ``self.tsne``.

    :param data: n observation x k feature data array
    :return np.ndarray or pd.DataFrame: tsne results; a DataFrame sharing
      the input's index when a DataFrame was passed, otherwise the raw array
    """
    # TODO (from the original author): PCA needs to be run before t-SNE.
    data_ = data.values if isinstance(data, pd.DataFrame) else data

    res = bhtsne.tsne(data_.astype(float), dimensions=self.n_components,
                      **self.kwargs)

    if isinstance(data, pd.DataFrame):
        # Only a two-column result can be named x/y; other widths keep
        # pandas' default column labels instead of raising a shape error.
        columns = ['x', 'y'] if res.shape[1] == 2 else None
        self.tsne = pd.DataFrame(res, index=data.index, columns=columns)
    else:
        self.tsne = res
    return self.tsne
def create_wordvec_scatter(matrix, vocab):
    """Create a scatter plot of word vectors.

    The matrix is embedded into two dimensions with ``bhtsne.tsne``; the
    first 241 points are plotted and annotated with the matching entries of
    ``vocab``.

    :param matrix: matrix of word vectors; must provide ``astype``
    :param vocab: list of words, in the same order as the rows of ``matrix``
        (assumed - TODO confirm)
    :return: None
    """
    # ``float`` is float64; avoids the deprecated ``sp.float64`` alias.
    embedding = bhtsne.tsne(matrix.astype(float), dimensions=2,
                            perplexity=30.0, theta=0.5, rand_seed=-1)

    plt.figure(figsize=(32, 24))  # figure size

    shown = embedding[0:241]
    # x must be column 0 only; passing both columns gave scatter() and
    # annotate() a 2-vector where a scalar coordinate was expected.
    xs, ys = shown[:, 0], shown[:, 1]
    plt.scatter(xs, ys)

    # zip stops at the shorter of vocab and the plotted points.
    for label, x, y in zip(vocab, xs, ys):
        plt.annotate(label, xy=(x, y), xytext=(0, 0),
                     textcoords='offset points')
    plt.show()
def tsne(self, n_dims=2, perplexity=30, theta=0.5, rand_seed=0, **kwargs):
    '''Embed the dataset's samples with the bhtsne t-SNE implementation.

    Args:
        n_dims (int): Number of dimensions to use.
        perplexity (float): Perplexity of the algorithm.
        theta (float): A number between 0 and 1. Higher is faster but
            less accurate (via the Barnes-Hut approximation).
        rand_seed (int): Random seed. -1 randomizes each run.
        **kwargs: Named arguments passed to the t-SNE algorithm.

    Returns:
        pandas.DataFrame: indexed by the columns of ``dataset.counts``, with
        columns named 'dimension 1', 'dimension 2', ...

    Raises:
        ValueError: if ``n_samples - 1 < 3 * perplexity``.
    '''
    from bhtsne import tsne as run_bhtsne

    n_samples = self.dataset.n_samples
    if 3 * perplexity > n_samples - 1:
        raise ValueError('Perplexity too high, reduce to <= {:}'.format(
            (n_samples - 1.) / 3))

    counts = self.dataset.counts.copy()

    # this version does not require pre-whitening
    embedding = run_bhtsne(
        data=counts.values.T,
        dimensions=n_dims,
        perplexity=perplexity,
        theta=theta,
        rand_seed=rand_seed,
        **kwargs)

    dimension_names = ['dimension ' + str(i + 1) for i in range(n_dims)]
    return pd.DataFrame(embedding, index=counts.columns,
                        columns=dimension_names)
def get_tsne(matrix, dimensions=2):
    """Return the ``bhtsne.tsne`` embedding of ``matrix``.

    Fixed settings: perplexity 30.0, theta 0.5, ``rand_seed=-1``.

    :param matrix: data matrix providing ``astype``
    :param dimensions: number of output dimensions
    :return: the array returned by ``bhtsne.tsne``
    """
    # ``float`` is float64; avoids the deprecated ``sp.float64`` alias.
    return bhtsne.tsne(matrix.astype(float), dimensions=dimensions,
                       perplexity=30.0, theta=0.5, rand_seed=-1)
def fit_transform(self, data: 'np.ndarray | pd.DataFrame') -> 'np.ndarray | pd.DataFrame':
    """
    fit the tSNE model to data given the parameters provided during
    initialization and transform the output

    Optional steps, controlled by attributes of this object: NaN/inf values
    are replaced by ``self.fillna`` when it is not None, and the data is
    reduced with PCA (stored on ``self.pca``) when ``self.run_pca`` is set.
    The result is also stored on ``self.tsne``.

    :param data: n observation x k feature data array
    :return np.ndarray or pd.DataFrame: tsne results; a DataFrame sharing
      the input's index when a DataFrame was passed, otherwise the raw array
    """
    data_ = data.values if isinstance(data, pd.DataFrame) else data

    if self.fillna is not None:
        # Fill a float copy: writing into ``data_`` directly would silently
        # modify the caller's array / DataFrame.
        data_ = np.array(data_, dtype=float)
        data_[~np.isfinite(data_)] = self.fillna

    if self.run_pca:
        self.pca = PCA(n_components=self.n_pca_components)
        data_ = self.pca.fit_transform(data_)

    res = bhtsne.tsne(data_.astype(float), dimensions=self.n_components,
                      **self.kwargs)

    if isinstance(data, pd.DataFrame):
        self.tsne = pd.DataFrame(res, index=data.index)
    else:
        self.tsne = res
    return self.tsne
def compress_to_2dim(np_mat, split_pos_list, seed=-1, perplexity=30.0):
    """Embed ``np_mat`` in two dimensions and split the result row-wise.

    The input is converted to a float64 array, passed to ``bhtsne.tsne``
    (theta fixed at 0.5) and the embedding is cut along axis 0 at
    ``split_pos_list`` with ``np.split``.

    :return: list of arrays as produced by ``np.split``
    """
    as_float64 = np.asarray(np_mat, dtype=np.float64)
    embedded = bhtsne.tsne(
        data=as_float64,
        dimensions=2,
        perplexity=perplexity,
        theta=0.5,
        rand_seed=seed,
    )
    return np.split(embedded, split_pos_list, axis=0)
def run_tsne(data, n_dim=2, perplexity=150, **kwargs):
    """Run tSNE

    :param data: Dataframe of cells X genes. Typically multiscale space
        diffusion components
    :param n_dim: Number of dimensions for tSNE embedding
    :param perplexity: Perplexity forwarded to ``bhtsne.tsne``
    :param kwargs: Further keyword arguments forwarded to ``bhtsne.tsne``
    :return: tSNE embedding of the data as a DataFrame sharing ``data``'s
        index; columns are 'x', 'y' (and 'z') for up to three dimensions,
        'dim1', 'dim2', ... otherwise
    """
    embedding = bhtsne.tsne(data.values.astype(float), dimensions=n_dim,
                            perplexity=perplexity, **kwargs)
    tsne = pd.DataFrame(embedding, index=data.index)

    # Name the columns after the actual width; a fixed ['x', 'y'] raised a
    # length mismatch for any n_dim other than 2.
    width = tsne.shape[1]
    axis_names = ['x', 'y', 'z']
    if width <= len(axis_names):
        tsne.columns = axis_names[:width]
    else:
        tsne.columns = ['dim{}'.format(i + 1) for i in range(width)]
    return tsne
def tsne(self, n_dims=2, perplexity=30, theta=0.5, rand_seed=0, **kwargs):
    '''Embed the dataset's samples with t-SNE.

    bhtsne is used when the installed scikit-learn is older than 0.19;
    otherwise ``sklearn.manifold.TSNE`` is used (``**kwargs`` are only
    forwarded on the bhtsne path).

    Args:
        n_dims (int): Number of dimensions to use.
        perplexity (float): Perplexity of the algorithm.
        theta (float): A number between 0 and 1. Higher is faster but less
            accurate (via the Barnes-Hut approximation).
        rand_seed (int): Random seed. -1 randomizes each run.
        **kwargs: Named arguments passed to the t-SNE algorithm.

    Returns:
        pandas.DataFrame: indexed by the columns of ``dataset.counts``, with
        columns named 'dimension 1', 'dimension 2', ...

    Raises:
        ValueError: if ``n_samples - 1 < 3 * perplexity``.
    '''
    # scikit-learn's <0.19 has a bug
    import sklearn
    major, minor = sklearn.__version__.split('.')[:2]
    use_bhtsne = int(major) == 0 and int(minor) < 19
    if use_bhtsne:
        from bhtsne import tsne
    else:
        from sklearn.manifold import TSNE

    n_samples = self.dataset.n_samples
    if 3 * perplexity > n_samples - 1:
        raise ValueError('Perplexity too high, reduce to <= {:}'.format(
            (n_samples - 1.) / 3))

    counts_matrix = self.dataset.counts.values
    if use_bhtsne:
        # this version does not require pre-whitening
        embedding = tsne(
            data=counts_matrix.T,
            dimensions=n_dims,
            perplexity=perplexity,
            theta=theta,
            rand_seed=rand_seed,
            **kwargs)
    else:
        estimator = TSNE(
            n_components=n_dims,
            perplexity=perplexity,
            method='barnes_hut' if theta > 0 else 'exact',
            angle=theta,
            random_state=rand_seed,
        )
        embedding = estimator.fit_transform(counts_matrix.T)

    dimension_names = ['dimension ' + str(i + 1) for i in range(n_dims)]
    return pd.DataFrame(embedding, index=self.dataset.counts.columns,
                        columns=dimension_names)
def word_vec_average(document_list, word2vec_model):
    """Build one vector per document and embed them in 2-D with t-SNE.

    Every entry of ``document_list["news"]`` is turned into a 200-feature
    vector by ``plain_word2vec_document_vector``; the stacked vectors are
    then passed to ``bhtsne.tsne``.

    :param document_list: mapping/frame whose ``"news"`` entry is iterable
        and has a ``size`` attribute
    :param word2vec_model: model handed to ``plain_word2vec_document_vector``
    :return: the 2-D embedding returned by ``bhtsne.tsne``
    """
    num_features = 200
    sentences = document_list["news"]
    plainDocVec_all = np.zeros((sentences.size, num_features),
                               dtype="float32")
    for row, sentence in enumerate(sentences):
        plainDocVec_all[row] = plain_word2vec_document_vector(
            sentence, word2vec_model, num_features)

    # ``float`` is float64; avoids the deprecated ``sp.float64`` alias.
    return bhtsne.tsne(plainDocVec_all.astype(float), dimensions=2,
                       perplexity=30.0, theta=0.5, rand_seed=-1)
def show_tSNE(vecs, labels):
    """Embed ``vecs`` in 2-D with t-SNE and plot the points with labels.

    The figure is saved as ``NLP100_69_result.png`` and then shown.

    :param vecs: sequence of vectors, convertible with ``np.array``
    :param labels: one annotation text per vector, in the same order
    """
    # Fit/transform the vectors with t-SNE.
    embedded = bhtsne.tsne(np.array(vecs).astype(np.float64), dimensions=2)

    # Create the figure (20 x 20 inches).
    plt.figure(figsize=(20, 20))

    # Scatter plot (``embedded`` holds one 2-D point per row).
    plt.scatter(embedded[:, 0], embedded[:, 1])

    # Annotate each point with its label. The ``labels`` argument is used;
    # the original looped over ``countries``, a name this function never
    # defines.
    for label, point in zip(labels, embedded):
        plt.annotate(label, (point[0], point[1]))

    # Save the figure.
    plt.savefig('NLP100_69_result.png')
    plt.show()
def _load_flat_features(path, split):
    """Load ``<path>/<split>_lstm_feature.npy`` as a 2-D float64 DataFrame with NaNs set to 0."""
    features = np.load('{}/{}_lstm_feature.npy'.format(path, split))
    features = np.reshape(features, (features.shape[0], -1))
    return pd.DataFrame(features).fillna(0).astype('float64')


def plot(ifPlot=False, path=None, train_motor=None, test_motor=None):
    """Embed sampled train/test LSTM features with t-SNE and plot both halves.

    Up to 2000 rows are sampled from each of the train and test feature
    files under ``path``, embedded together with ``tsne``, and drawn side by
    side (train left, test right). The train part of the embedding is saved
    to ``<path>/lstm_feature_tsne.npy`` and the figure to
    ``<path>/lstm_feature_tsne.jpg``.

    The parameter order is kept from the original signature, which was a
    syntax error (non-default parameters after ``ifPlot=False``); the later
    parameters therefore now carry ``None`` defaults.

    :param ifPlot: show the figure interactively when true
    :param path: directory holding the ``.npy`` inputs; required
    :param train_motor: forwarded to ``plot_tsne_for_different_motor_left``
    :param test_motor: forwarded to ``plot_tsne_for_different_motor_right``
    :raises TypeError: if ``path`` is not given
    """
    if path is None:
        raise TypeError("plot() requires a 'path' argument")

    train_lstm_feature = _load_flat_features(path, 'train')
    test_lstm_feature = _load_flat_features(path, 'test')
    train_labels = np.load('{}/train_label.npy'.format(path))
    test_labels = np.load('{}/test_label.npy'.format(path))

    sample_size = 2000
    # Never ask random.sample for more rows than exist.
    n_train = min(sample_size, train_lstm_feature.shape[0])
    n_test = min(sample_size, test_lstm_feature.shape[0])
    train_sample = random.sample(range(0, train_lstm_feature.shape[0]), n_train)
    test_sample = random.sample(range(0, test_lstm_feature.shape[0]), n_test)

    # Train rows first, then test rows, so the embedding can be split at n_train.
    fit_feature = pd.concat((train_lstm_feature.iloc[train_sample],
                             test_lstm_feature.iloc[test_sample]), axis=0)
    tsne_result = tsne(fit_feature)

    _, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
    np.save('{}/lstm_feature_tsne.npy'.format(path), tsne_result[:n_train])
    plot_tsne_for_different_motor_left(
        tsne_result[:n_train],
        np.reshape(train_labels[train_sample], (n_train)),
        axes[0], train_motor)
    plot_tsne_for_different_motor_right(
        tsne_result[n_train:],
        np.reshape(test_labels[test_sample], (n_test)),
        axes[1], test_motor)
    plt.savefig('{}/lstm_feature_tsne.jpg'.format(path))
    if ifPlot:
        plt.show()
def embed(edges, components, file_name):
    """Return a t-SNE embedding of ``edges``, cached as a pickle in ``file_name``.

    If ``file_name`` exists its pickled content is returned unchanged.
    Otherwise ``edges`` is converted to float64, embedded with ``tsne`` into
    ``components`` dimensions (seeded with ``RANDOM_STATE``), printed,
    pickled to ``file_name`` and returned.

    :param edges: array providing ``astype``
    :param components: number of embedding dimensions
    :param file_name: path of the pickle cache
    """
    if os.path.isfile(file_name):
        # Pickles are binary: read with "rb" to match the "wb" used below.
        # NOTE: unpickling can execute arbitrary code; only point this at
        # cache files this program wrote itself.
        with open(file_name, "rb") as f:
            return cPkl.load(f)

    # An earlier variant (previously kept here as commented-out code) used a
    # TSNE estimator with a precomputed metric on ``amax(edges) - edges``.
    distance_edges = edges.astype(np.float64)
    embeddings = tsne(distance_edges, dimensions=components,
                      rand_seed=RANDOM_STATE)
    print(embeddings)
    print(embeddings.shape)

    with open(file_name, "wb") as f:
        cPkl.dump(embeddings, f, cPkl.HIGHEST_PROTOCOL)
    return embeddings
def test_seed_positions(self):
    """Check that seeding a second run with earlier output keeps point 0 in place.

    The iris data minus its last ten rows is embedded first. That embedding,
    extended with small random positions for the ten missing rows, is passed
    as ``seed_positions`` for a run on the full data. The first point of
    both runs must agree after dividing each coordinate by 20 and rounding.
    A comparison plot is saved under ``PLOTS_DIR``.
    """
    X_a = load_iris().data[:-10]
    X_b = load_iris().data
    Y_a = tsne(X_a, rand_seed=999)

    # Generate random positions for last 10 items
    n_missing = X_b.shape[0] - Y_a.shape[0]
    remainder_positions = np.array([
        [random.uniform(0, 1) * 0.0001, random.uniform(0, 1) * 0.0001]
        for _ in range(n_missing)
    ])

    # Append them to previous TSNE output and use as seed_positions in next plot
    seed_positions = np.vstack((Y_a, remainder_positions))
    Y_b = tsne(X_b, seed_positions=seed_positions)

    self.assertEqual(round(Y_a[0][0] / 20), round(Y_b[0][0] / 20))
    self.assertEqual(round(Y_a[0][1] / 20), round(Y_b[0][1] / 20))

    plt.scatter(Y_a[:, 0], Y_a[:, 1], c='b')
    plt.scatter(Y_b[:-10, 0], Y_b[:-10, 1], c='r')
    plt.scatter(Y_b[-10:, 0], Y_b[-10:, 1], c='g')
    plt.savefig(PLOTS_DIR + '/iris_seed_positions.png')
    if os.environ.get('SHOW_PLOTS') is not None:
        plt.show()
    plt.close()
def visualize_data(data, labels, model_path, max_iter=1000):
    """Plot a t-SNE embedding of the network's predictions, one colour per class.

    A network built by ``create_base_network`` is loaded with the weights at
    ``model_path``; its predictions for ``data`` are embedded with ``tsne``
    and the points of every class ``0 .. labels.max()`` are drawn in a
    random colour.
    """
    network = create_base_network(data.shape[1:])
    network.load_weights(model_path)

    predictions = network.predict(data, 96, verbose=1)
    points_2d = tsne(predictions.astype(np.float64), max_iter=max_iter)

    class_count = labels.max() + 1
    for class_id in range(class_count):
        members = points_2d[labels == class_id]
        plt.plot(
            members[:, 0],
            members[:, 1],
            color=np.random.rand(3),
            marker='o',
            linestyle='',
            alpha=0.7,
        )
    plt.show()
def create_features(self):
    """Add three t-SNE columns (``TSNE_1``..``TSNE_3``) to ``self.train``/``self.test``.

    The selected columns of the module-level ``train`` and ``test`` frames
    are stacked (train rows first), embedded into three dimensions with
    ``tsne`` and the embedding is split back at ``len(train)``.
    """
    # NOTE(review): 'var_133' is listed twice; left untouched because
    # changing the feature set would change the embedding - confirm intent.
    feats = [
        'var_13', 'var_108', 'var_33', 'var_184', 'var_165', 'var_9',
        'var_169', 'var_166', 'var_127', 'var_76', 'var_6', 'var_22',
        'var_1', 'var_133', 'var_133', 'var_99', 'var_80', 'var_34',
        'var_110', 'var_177'
    ]
    whole = pd.concat([train[feats], test[feats]], axis=0).values
    tsne_array = tsne(data=whole, dimensions=3, perplexity=10.0, theta=0.5,
                      rand_seed=1000)

    n_train = len(train)
    for i in range(tsne_array.shape[1]):
        col_ = 'TSNE_{}'.format(i + 1)
        self.train[col_] = tsne_array[:n_train, i]
        # Test rows start right after the train rows in ``whole``; slicing
        # from len(test) was only correct when both frames had equal length.
        self.test[col_] = tsne_array[n_train:, i]
def write_projection(representation_file, output_file, method='PCA'):
    '''Project the output of a representational layer and write it to a .tsv file.

    Args:
        representation_file: tab-separated numeric input file.
        output_file: path of the tab-separated file to write.
        method: 'PCA' (as many components as input columns) or 'tSNE'
            (bhtsne, five dimensions).

    Raises:
        ValueError: if ``method`` is neither 'PCA' nor 'tSNE'.
    '''
    # Reject unknown methods up front; previously they reached np.savetxt
    # with ``transformed_X`` unbound and failed with a NameError.
    if method not in ('PCA', 'tSNE'):
        raise ValueError(
            "unknown method {!r}; expected 'PCA' or 'tSNE'".format(method))

    X = np.loadtxt(representation_file, delimiter='\t')
    if method == 'PCA':
        from sklearn.decomposition import PCA
        model = PCA(n_components=X.shape[1])
        transformed_X = model.fit_transform(X)
    else:
        from bhtsne import tsne
        transformed_X = tsne(X, dimensions=5)
    np.savetxt(output_file, transformed_X, delimiter='\t')
def compress_from_path(lst, resize=(64, 64), seed=-1):
    """Embed the images named in ``lst`` with t-SNE, returned per input group.

    ``lst`` is a list of groups of image paths. All images are resized to
    ``resize``, flattened to one vector each and embedded together with
    ``bhtsne.tsne``; the embedding is then split back into one array per
    group.
    """
    # Flatten the 2-level path list into a single list.
    all_paths = flatten(lst)

    # Load and resize every image.
    resized_images = [Image.open(path).resize(resize) for path in all_paths]

    # One flat float64 vector per image (t-SNE expects 1-D samples).
    img_vec = np.asarray([np.ravel(img) for img in resized_images],
                         np.float64)

    # Run tSNE (dimensional compression).
    compressed_vec = bhtsne.tsne(img_vec, rand_seed=seed)

    # Split positions are the running totals of the group sizes, leaving out
    # the final total.
    group_sizes = [len(group) for group in lst]
    split_pos = []
    running_total = 0
    for size in group_sizes[:-1]:
        running_total += size
        split_pos.append(running_total)

    # One embedded array per input group.
    return np.split(compressed_vec, split_pos, axis=0)
def fit_transform(self, X):
    """Embed ``X`` with the t-SNE implementation named by ``self.variant``.

    Known variants: "bhtsne", "multicore", "sklearn", "optsne", "fitsne"
    and "cuda". Every implementation receives ``self.perplexity``. Any other
    variant yields ``None``.
    """
    # (variant name, zero-argument runner) pairs, checked in order; only the
    # selected runner is ever called.
    runners = (
        ("bhtsne",
         lambda: bhtsne.tsne(X, perplexity=self.perplexity)),
        ("multicore",
         lambda: MulticoreTSNE(n_jobs=4, perplexity=self.perplexity).fit_transform(X)),
        ("sklearn",
         lambda: skTSNE(perplexity=self.perplexity).fit_transform(X)),
        ("optsne",
         lambda: OptSNE(perplexity=self.perplexity).fit_transform(X)),
        ("fitsne",
         lambda: FItSNE(X, perplexity=self.perplexity)),
        ("cuda",
         lambda: CudaTSNE(perplexity=self.perplexity).fit_transform(X)),
    )
    for name, run in runners:
        if self.variant == name:
            return run()
    return None
def plotTSNE(self, ax, tsneData, dataLabels, perplexity, theta=0.5):
    '''Plot a t-SNE embedding of ``tsneData`` on ``ax`` and decorate the axes.

    The data is embedded with ``tsne``, drawn as a scatter plot coloured by
    ``dataLabels`` (HSV colormap), all spines, ticks and tick labels are
    hidden, and a black vertical and horizontal line are drawn through the
    centre of the axis limits. The figure is then shown.
    '''
    lineWidth = 1.2
    tsneData = tsneData.astype('float64')
    tsneEmbedded = tsne(tsneData, perplexity=perplexity, theta=theta)

    plt.hsv()  # color model
    ax.scatter(tsneEmbedded[:, 0], tsneEmbedded[:, 1], s=45, c=dataLabels,
               edgecolors='w', linewidth=0.30, alpha=0.75)

    for side in ('top', 'right', 'bottom', 'left'):
        ax.spines[side].set_visible(False)

    # Real booleans: the string 'off' is truthy, so current Matplotlib would
    # turn the ticks ON instead of hiding them.
    ax.tick_params(axis='x', which='both', bottom=False, top=False,
                   labelbottom=False)
    ax.tick_params(axis='y', which='both', left=False, right=False,
                   labelleft=False)

    # Cross-hair through the middle of the current limits. The previous
    # ``(max - abs(min)) / 2`` only equalled the midpoint for a negative
    # lower limit.
    xmin, xmax = ax.get_xlim()
    ax.axvline((xmin + xmax) / 2.0, color='black', linewidth=lineWidth)
    ymin, ymax = ax.get_ylim()
    ax.axhline((ymin + ymax) / 2.0, color='black', linewidth=lineWidth)

    plt.tight_layout()
    plt.show()
def test_iris(self):
    """Run mean shift on the raw and on the t-SNE-embedded iris data.

    ``mean_shift`` must report 2 clusters on the raw data and 2 or 3 on the
    embedding. The embedding (coloured by target) and the cluster centres
    are drawn and saved under ``PLOTS_DIR``.
    """
    dataset = load_iris()
    features = dataset.data
    self.assertEqual(mean_shift(features)[0], 2)

    embedding = tsne(features)
    plt.scatter(embedding[:, 0], embedding[:, 1], c=dataset.target)

    num_clusters, cluster_centers = mean_shift(embedding)
    self.assertTrue(num_clusters > 1)
    self.assertTrue(num_clusters < 4)

    # Mark every cluster centre with a red cross.
    for index in range(num_clusters):
        centre = cluster_centers[index]
        plt.plot(centre[0], centre[1], 'x',
                 markerfacecolor='r', markeredgecolor='r', markersize=16)

    plt.savefig(PLOTS_DIR + '/iris.png')
    if os.environ.get('SHOW_PLOTS') is not None:
        plt.show()
    plt.close()
def compress_from_path(lst, class_num, model_path, size, layer, seed=-1):
    """Embed network features of the files in ``lst`` with t-SNE, per group.

    ``lst`` is a list of groups of paths. A ``FeatureExtractor`` produces
    the ``layer`` features for every path; the flattened features are
    embedded together with ``bhtsne.tsne`` and the embedding is split back
    into one array per group.
    """
    # Flatten the 2-level path list into a single list.
    all_paths = flatten(lst)

    # Create the network object (``gpu`` is read from the enclosing module).
    extractor = FeatureExtractor(class_num, model_path, size, gpu)

    # Collect the features of every path.
    vec = np.asarray([extractor.get_flat_feat(path, layer)
                      for path in all_paths])

    # One flat float64 vector per sample (t-SNE expects 1-D samples).
    vec = np.asarray([np.ravel(feature) for feature in vec], np.float64)

    # Run tSNE (dimensional compression).
    compressed_vec = bhtsne.tsne(vec, rand_seed=seed)

    # Split positions are the running totals of the group sizes, leaving out
    # the final total.
    group_sizes = [len(group) for group in lst]
    split_pos = []
    running_total = 0
    for group_size in group_sizes[:-1]:
        running_total += group_size
        split_pos.append(running_total)

    # One embedded array per input group.
    return np.split(compressed_vec, split_pos, axis=0)
def bhtsne(vectors, vecs_with_center, args):
    """Write 2-D and 3-D t-SNE plots and CSV files for the given vectors.

    ``vectors`` is first reduced to 50 PCA components. Four embeddings are
    then computed with ``tsne`` and written below ``args['path']``:

    * 2-D of the PCA-reduced vectors (``bhtsne.*``),
    * 2-D of ``vecs_with_center.values`` (``bhtsne_with_center.*``),
    * 3-D of the PCA-reduced vectors (``bhtsne_3d.*``),
    * 3-D of ``vecs_with_center.values`` (``bhtsne_with_center_3d.*``).

    Each embedding is saved as CSV, SVG and PNG, and the SVG is passed to
    ``pvtm_utils.svg_to_pdf``. In the "with center" plots, rows up to
    ``len(vectors)`` are drawn differently from the remaining rows.

    :param vectors: input vectors, reduced with PCA before embedding
    :param vecs_with_center: object with a ``values`` attribute; presumably
        the vectors followed by centre rows - TODO confirm
    :param args: mapping providing ``"tsne_perplexity"`` and ``'path'``
    """
    # if args.bhtsne or not(args.timeline or args.bhtsne or args.wordclouds):
    # bhtnse
    pca = PCA(n_components=50)
    vectors = pca.fit_transform(vectors)
    print('Bhtsne..')
    # --- 2-D embedding of the PCA-reduced vectors ---
    Y = tsne(vectors, perplexity=args["tsne_perplexity"])
    pd.DataFrame(Y).to_csv('{}/bhtsne.csv'.format(args['path']))
    plt.scatter(Y[:, 0], Y[:, 1], s=0.3)
    plt.savefig('{}/bhtsne.svg'.format(args['path']), bbox_inches='tight')
    plt.savefig('{}/bhtsne.png'.format(args['path']), bbox_inches='tight')
    # NOTE(review): same Y as bhtsne.csv above, written a second time.
    pd.DataFrame(Y).to_csv('{}/bhtsne_2d.csv'.format(args['path']))
    pvtm_utils.svg_to_pdf('{}/bhtsne.svg'.format(args['path']))
    plt.close()
    print('Bhtsne with center..')
    # --- 2-D embedding of vecs_with_center ---
    Y = tsne(vecs_with_center.values, perplexity=args["tsne_perplexity"])
    pd.DataFrame(Y).to_csv('{}/bhtsne_with_center.csv'.format(args['path']))
    # First len(vectors) rows as small dots, remaining rows as red crosses.
    plt.scatter(Y[:len(vectors), 0], Y[:len(vectors), 1], s=0.3)
    plt.scatter(Y[len(vectors):, 0], Y[len(vectors):, 1], s=0.8, c='r', marker='x')
    plt.savefig('{}/bhtsne_with_center.svg'.format(args['path']), bbox_inches='tight')
    plt.savefig('{}/bhtsne_with_center.png'.format(args['path']), bbox_inches='tight')
    # NOTE(review): same Y as bhtsne_with_center.csv above, written again.
    pd.DataFrame(Y).to_csv('{}/bhtsne_with_center_2d.csv'.format(args['path']))
    pvtm_utils.svg_to_pdf('{}/bhtsne_with_center.svg'.format(args['path']))
    plt.close()
    print('3D tsne...')
    # --- 3-D embedding of the PCA-reduced vectors ---
    Y = tsne(vectors, dimensions=3, perplexity=args["tsne_perplexity"])
    fig = pyplot.figure(frameon=False, figsize=(8, 5))
    ax = Axes3D(fig)
    # NOTE(review): both calls draw ALL points (blue s=1, then red s=20);
    # looks copied from the "with center" plot below - confirm intent.
    ax.scatter(Y[:, 0], Y[:, 1], Y[:, 2], s=1, c='b', marker='^')
    ax.scatter(Y[:, 0], Y[:, 1], Y[:, 2], s=20, c='r', marker='^')
    # pyplot.axis('off')
    # Axis limits: data range of the first len(vectors) rows, pulled in by 4
    # on each side.
    xmax, ymax, zmax = Y[:len(vectors), 0].max(), Y[:len(vectors), 1].max(), Y[:len(vectors), 2].max()
    xmin, ymin, zmin = Y[:len(vectors), 0].min(), Y[:len(vectors), 1].min(), Y[:len(vectors), 2].min()
    ax.set_xlim(xmin + 4, xmax - 4)
    ax.set_ylim(ymin + 4, ymax - 4)
    ax.set_zlim(zmin + 4, zmax - 4)
    pyplot.savefig('{}/bhtsne_3d.svg'.format(args['path']), bbox_inches='tight')
    pyplot.savefig('{}/bhtsne_3d.png'.format(args['path']), bbox_inches='tight')
    pvtm_utils.svg_to_pdf('{}/bhtsne_3d.svg'.format(args['path']))
    pd.DataFrame(Y).to_csv('{}/bhtsne_3d.csv'.format(args['path']))
    # --- 3-D embedding of vecs_with_center ---
    # NOTE(review): unlike the 2-D sections, the 3-D figures are never closed.
    Y = tsne(vecs_with_center.values, dimensions=3, perplexity=args["tsne_perplexity"])
    fig = pyplot.figure(frameon=False, figsize=(8, 5))
    ax = Axes3D(fig)
    # First len(vectors) rows in blue (s=1), remaining rows in red (s=20).
    ax.scatter(Y[:len(vectors), 0], Y[:len(vectors), 1], Y[:len(vectors), 2], s=1, c='b', marker='^')
    ax.scatter(Y[len(vectors):, 0], Y[len(vectors):, 1], Y[len(vectors):, 2], s=20, c='r', marker='^')
    # pyplot.axis('off')
    xmax, ymax, zmax = Y[:len(vectors), 0].max(), Y[:len(vectors), 1].max(), Y[:len(vectors), 2].max()
    xmin, ymin, zmin = Y[:len(vectors), 0].min(), Y[:len(vectors), 1].min(), Y[:len(vectors), 2].min()
    ax.set_xlim(xmin + 4, xmax - 4)
    ax.set_ylim(ymin + 4, ymax - 4)
    ax.set_zlim(zmin + 4, zmax - 4)
    pyplot.savefig('{}/bhtsne_with_center_3d.svg'.format(args['path']), bbox_inches='tight')
    pyplot.savefig('{}/bhtsne_with_center_3d.png'.format(args['path']), bbox_inches='tight')
    pvtm_utils.svg_to_pdf('{}/bhtsne_with_center_3d.svg'.format(args['path']))
    pd.DataFrame(Y).to_csv('{}/bhtsne_with_center_3d.csv'.format(args['path']))
def compose(builders, sizes, articles=None, output_dir=OUTPUT_CSV, vector=None, keys=None, indices=None, sset=None, classes=None):
    """Compose a combination of features, embed it with t-SNE and write the result.

    When ``vector`` or ``keys`` is missing, the feature matrix is built by
    calling every builder on the articles (loaded from ``text_dir`` if none
    were given) and concatenating the outputs column-wise. The matrix is
    embedded with ``tsne``, halving the perplexity from 32 after every
    failure; if that never succeeds, scikit-learn's t-SNE (``sk_tsne``) is
    tried the same way. Keys, coordinates and clusters are then handed to
    ``output_write``.

    :param builders: callables ``builder(articles, size, text_dir, sset)``
        returning ``(features, model)``
    :param sizes: one size per builder
    :param articles: mapping of key to article, or None to load from disk
    :param output_dir: forwarded to ``output_write``
    :param vector: precomputed feature matrix (used together with ``keys``)
    :param keys: keys matching the rows of ``vector``
    :param indices: forwarded to ``output_write``
    :param sset: key subset handed to the builders; reset when features are built
    :param classes: precomputed clusters; k-means is used when None
    :return: the feature matrix
    :raises RuntimeError: if neither t-SNE implementation succeeds
    """
    global gvec, cached_model, cached_tsne

    # ``is None`` instead of ``== None``: comparing an array with ``==``
    # is element-wise and cannot be used as a condition.
    if vector is None or keys is None:
        # Vector isn't provided: build it from the articles.
        sset = None
        if not articles:
            articles = load_data_folder(text_dir)
            gvec = articles
        else:
            sset = set(articles.keys())
        keys = list(articles.keys())

        vector = None
        trained_models = []
        for builder, size in zip(builders, sizes):
            tmp, model = builder(articles, size, text_dir, sset)
            trained_models.append(model)
            # The first builder starts the matrix and later ones add columns.
            # The former ``type(vector) != type(None)`` test was always true,
            # so every builder overwrote the previous result.
            if vector is None:
                vector = tmp
            else:
                vector = np.append(vector, tmp, axis=1)
        print(vector.shape)
        gvec = vector

    coords = None
    perplexity = 32
    # Integer halving reaches 0 after a few steps; true division would keep
    # producing ever smaller positive floats.
    while coords is None and perplexity > 0:
        try:
            printl("trying perplexity", perplexity)
            coords = list(tsne(vector, verbose=True, perplexity=perplexity))
        except Exception as e:
            # Deliberately broad: any failure triggers a retry with a lower
            # perplexity. (``e.message`` does not exist on Python 3.)
            print(type(e))
            print(e)
            perplexity //= 2

    if coords is None:
        print("T-SNE failed - all perplexity settings not working")
        print("Use Scikit-Learn's implementation of TSNE")
        perplexity = 32
        while coords is None and perplexity > 0:
            try:
                model = sk_tsne(n_components=2, random_state=0,
                                perplexity=perplexity)
                t1 = time.time()
                output = model.fit_transform(vector)
                coords = list(output)
                t2 = time.time()
                print("Time taken:", t2 - t1, "seconds")
            except Exception as e:
                print(type(e))
                print(e)
                perplexity //= 2

    if coords is None:
        # Previously this fell through to a NameError on ``coords``.
        raise RuntimeError("t-SNE failed for every perplexity setting")

    clusters = kmeans_clusters(keys, vector) if classes is None else classes
    output_write(keys, coords, clusters=clusters, indices=indices,
                 output_dir=output_dir)

    # Caching
    if not sset:
        cached_model = vector
        cached_tsne = coords
    return vector
import numpy as np
import os
from bhtsne import tsne

ROOT = os.path.dirname(os.path.realpath(__file__))

# Load the encoded themes (first array stored in the .npz archive).
enc_themes = np.load(ROOT + '/cache/gru32_themes128.npz')['arr_0']
print(enc_themes.shape)

# Alternative input: the un-encoded themes from '/cache/themes128.npz',
# reshaped to (number of themes, -1). Not used at the moment.

# Embed into 2 dimensions with perplexity 50 and store the result next to
# the inputs.
Y = tsne(enc_themes, 2, 50.0)
np.savez_compressed(ROOT + '/cache/tsne2_themes128.npz', Y)