def learn_umap(data, **kwargs):
    """Calculates UMAP transformation for given matrix features.

    Parameters
    ----------
    data : pd.DataFrame
        Feature matrix; ``data.values`` is embedded and ``data.index``
        is preserved on the result.
    kwargs : optional
        Parameters for ``umap.UMAP()``. Keys that ``umap.UMAP`` does not
        accept are silently dropped.

    Returns
    -------
    pd.DataFrame
        UMAP embedding indexed like ``data``.
    """
    # Keep only kwargs that umap.UMAP actually accepts.
    _umap_filter = umap.UMAP().get_params()
    kwargs = {k: v for k, v in kwargs.items() if k in _umap_filter}
    # Historical defaults, now overridable: previously min_dist=1 and
    # random_state=0 were hard-coded alongside **kwargs, which raised
    # TypeError (duplicate keyword) whenever a caller passed either one.
    kwargs.setdefault('min_dist', 1)
    kwargs.setdefault('random_state', 0)
    embedding = umap.UMAP(**kwargs).fit_transform(data.values)
    return pd.DataFrame(embedding, index=data.index.values)
def run_umap(x, y, item, n_neighbors_list, min_dist=0.05, verbose=True):
    """Fit UMAP on ``x`` once per neighbor count and plot each embedding.

    For every ``k`` in ``n_neighbors_list`` a plot named
    ``umap_result<k>neighbors`` is produced via ``draw_plot``.
    """
    for n_neighbors in n_neighbors_list:
        print("UMAP NEIGHBOR NUMBER: ", n_neighbors)
        reducer = umap_.UMAP(n_neighbors=n_neighbors,
                             min_dist=min_dist,
                             verbose=verbose)
        embedded = reducer.fit_transform(x)
        outname = "umap_result" + str(n_neighbors) + "neighbors"
        draw_plot(embedded, y, item, outname)
def find_clusters(embeddings, min_cluster_size):
    """Cluster embedding vectors in a reduced-dimensional space.

    The vectors (length 768) are first compressed with PCA (at most 50
    components), then mapped to 2-D with UMAP, and finally clustered with
    HDBSCAN, following the idea in:
    https://umap-learn.readthedocs.io/en/latest/faq.html — section
    "From a more practical standpoint".

    Args:
        embeddings (:obj:`DataFrame[float]`): DataFrame of embedding vectors
        min_cluster_size (:obj:`int`): Minimal cluster size

    Returns:
        :obj:`numpy array[int64]`: Cluster labels for each data point
    """
    # PCA cannot use more components than there are samples.
    compressed = PCA(n_components=min(len(embeddings), 50)).fit_transform(embeddings)
    planar = umap.UMAP(n_components=2).fit_transform(compressed)
    clusterer = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size)
    clusterer.fit(planar)
    return clusterer.labels_
def _update(self, umap_data):
    """Updates UMAP object properties from ``umap_data`` :obj:`dict`.

    Parameters
    ----------
    umap_data : :obj:`dict`
    """
    # pylint: disable=undefined-variable
    # Plain array-valued entries are copied straight onto the instance.
    for key in ('z', 'R', 'R_desc', 'E_true', 'F_true', 'E_pred', 'F_pred'):
        setattr(self, key, umap_data[key])
    self.data_info = umap_data['data_info'].tolist()
    # Scalar hyperparameters are stored as 0-d arrays; [()] unwraps them.
    for key in ('n_neighbors', 'min_dist', 'random_state'):
        setattr(self, key, umap_data[key][()])
    self.reducer = umap.UMAP(n_neighbors=self.n_neighbors,
                             min_dist=self.min_dist,
                             random_state=self.random_state)
    self.embedding = umap_data['embedding']
def process_umap(exact_pdh, pca_comp, scale=500):
    """Embed the first ``pca_comp`` columns with UMAP and scatter-plot them.

    Marker sizes are the squared frequency ratio relative to the first
    entry of the ``freq`` column, scaled by ``scale``. Returns the 2-D
    UMAP coordinates.
    """
    projector = uma.UMAP()
    projection = projector.fit_transform(exact_pdh[list(range(pca_comp))])
    freqs = exact_pdh['freq']
    weights = (freqs / freqs[0]) ** 2
    plt.scatter(projection[:, 0], projection[:, 1], s=scale * weights)
    return projection
def generate_base64(terms, vectors):
    """Embed ``vectors`` with UMAP, label each point with its term, and
    return the resulting plot as a base64-encoded PNG string.

    Parameters
    ----------
    terms : sequence of str
        Labels drawn next to each point; must align with ``vectors``.
    vectors : array-like
        High-dimensional vectors to embed.

    Returns
    -------
    str
        Base64-encoded PNG of the annotated scatter plot.
    """
    new_values = umap.UMAP(n_neighbors=5, min_dist=0.3,
                           metric='correlation').fit_transform(vectors)
    plt.figure(figsize=(16, 16))
    # One scatter call per point keeps matplotlib's per-call color cycling,
    # so each term keeps its own color (matches the original rendering).
    for term, (px, py) in zip(terms, new_values):
        plt.scatter(px, py)
        plt.annotate(term,
                     xy=(px, py),
                     xytext=(5, 2),
                     textcoords='offset points',
                     fontproperties=myfont,
                     ha='right',
                     va='bottom')
    save_file = BytesIO()
    plt.savefig(save_file, format='png')
    # Bug fix: close the figure so repeated calls don't leak figures/memory.
    plt.close()
    return base64.b64encode(save_file.getvalue()).decode('utf8')
def vis_high_dims_data_umap_2(X, y, show_label_flg=False):
    """Visualize high-dimensional data with a supervised UMAP projection.

    :param X: features
    :param y: labels (also passed to ``fit_transform`` to supervise UMAP)
    :param show_label_flg: if True, delegate plotting to ``plot_with_labels``
    :return: None (shows a matplotlib figure)
    """
    res_umap = umap.UMAP(n_neighbors=50, min_dist=0.8, metric='correlation',
                         random_state=42).fit_transform(X, y)
    if not show_label_flg:
        fig, ax = plt.subplots(figsize=(12, 7))
        # Bug fix: fig.colorbar() needs the scatter's mappable, not the Axes
        # object that was previously passed.
        sc = ax.scatter(res_umap[:, 0], res_umap[:, 1], c=y,
                        cmap=plt.cm.get_cmap("jet", 8), alpha=0.8)
        cbar = fig.colorbar(sc, ticks=[1, 2, 3, 4, 5, 6, 7, 8])
        # Vertical colorbar labels live on the y-axis; set_xticklabels was a
        # no-op left over from a horizontal-colorbar variant.
        cbar.ax.set_yticklabels([
            'Google', 'Twitter', 'Youtube', 'Outlook', 'Github', 'Facebook',
            'Slack', 'Bing'
        ])
        plt.setp(ax, xticks=[], yticks=[])
        plt.show()
    else:
        plot_with_labels(X, y, res_umap, "UMAP", min_dist=2.0)
def vis_high_dims_data_umap(X, y, show_label_flg=False):
    """Visualize high-dimensional data with a supervised UMAP projection.

    :param X: features
    :param y: labels (also passed to ``fit_transform`` to supervise UMAP)
    :param show_label_flg: if True, delegate plotting to ``plot_with_labels``
    :return: None (writes ``UMAP.jpg`` in the working directory)
    """
    res_umap = umap.UMAP(n_neighbors=30, min_dist=0.12, spread=1.8,
                         metric='correlation').fit_transform(X, y)
    if not show_label_flg:
        plt.figure(figsize=(10, 5))
        plt.scatter(res_umap[:, 0], res_umap[:, 1], c=y,
                    cmap=plt.cm.get_cmap("jet", 7), alpha=0.7)
        plt.colorbar(ticks=range(7))
        plt.title('umap results')
        # Bug fix: this UMAP figure was previously saved as "t_SNE.jpg",
        # silently overwriting any real t-SNE output.
        plt.savefig("UMAP.jpg", dpi=400)
    else:
        plot_with_labels(X, y, res_umap, "UMAP", min_dist=2.0)
def NDR(data, method, dim, n_neighbors=100):
    """Dispatch to a (non)linear dimensionality-reduction algorithm.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        Input data to embed.
    method : str
        One of 'standard_LLE', 'hessian_LLE', 'ltsa_LLE', 'modified_LLE',
        'IsoMap', 't-SNE', 'MDS', 'Spectral_Embedding', 'UMAP', 'PCA'.
    dim : int
        Target dimensionality of the embedding.
    n_neighbors : int, optional
        Neighborhood size for the neighbor-based methods (default 100).

    Returns
    -------
    numpy.ndarray
        The embedded data of shape (n_samples, dim).

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    # The four LLE entries differ only in the solver name.
    lle_variants = {
        'standard_LLE': 'standard',
        'hessian_LLE': 'hessian',
        'ltsa_LLE': 'ltsa',
        'modified_LLE': 'modified',
    }
    if method in lle_variants:
        embedding = manifold.LocallyLinearEmbedding(
            n_neighbors=n_neighbors, n_components=dim,
            method=lle_variants[method]).fit_transform(data)
    elif method == 'IsoMap':
        embedding = manifold.Isomap(
            n_neighbors=n_neighbors, n_components=dim).fit_transform(data)
    elif method == 't-SNE':
        embedding = manifold.TSNE(
            n_components=dim, init='pca', random_state=0,
            method='exact').fit_transform(data)
    elif method == 'MDS':
        embedding = manifold.MDS(
            n_components=dim, max_iter=100, n_init=1).fit_transform(data)
    elif method == 'Spectral_Embedding':
        embedding = manifold.SpectralEmbedding(
            n_components=dim, n_neighbors=n_neighbors).fit_transform(data)
    elif method == 'UMAP':
        embedding = umap.UMAP(
            n_components=dim, n_neighbors=n_neighbors).fit_transform(data)
    elif method == 'PCA':
        embedding = PCA(n_components=dim,
                        svd_solver='auto').fit_transform(data)
    else:
        # Bug fix: an unknown method previously fell through and raised a
        # confusing NameError on the return statement.
        raise ValueError(f"Unknown NDR method: {method!r}")
    return embedding
def embed(self):
    """Embed the descriptors and derivatives in two dimensions.

    Will set the ``reducer`` attribute if it does not exist or update it
    if ``n_neighbors``, ``min_dist``, or ``random_state`` have changed.

    Returns
    -------
    :obj:`numpy.ndarray`
        A 2D array with rows being each structure (in the order they are
        provided in data) with their reduced dimension coordinates being
        the columns.

    Notes
    -----
    We recommend first tuning ``n_neighbors`` to provide a balance of
    clustering and overlap/uniformness. Then tune ``min_dist`` to be the
    smallest number that allows you to qualitatively determine number of
    points in a clustered region. For more information on these parameters
    see https://umap-learn.readthedocs.io/en/latest/parameters.html.
    """
    # pylint: disable=undefined-variable
    # Reinitializing the reducer every call is cheap and keeps the
    # random_state consistent, so the embedding stays reproducible.
    self.reducer = umap.UMAP(n_neighbors=self.n_neighbors,
                             min_dist=self.min_dist,
                             random_state=self.random_state)
    self.embedding = self.reducer.fit_transform(self.R_desc)
    return self.embedding
def get_embeddings(input_features):
    """Project ``input_features`` to 2-D with UMAP (fixed seed 28).

    Returns a DataFrame with columns 'UMAP 1' and 'UMAP 2'.
    """
    projection = umap.UMAP(random_state=28).fit_transform(input_features)
    # NOTE(review): 'UMAP 1' takes column 1 and 'UMAP 2' column 0 — the
    # axes are swapped relative to the obvious mapping; confirm intended.
    return pd.DataFrame({
        'UMAP 1': projection[:, 1],
        'UMAP 2': projection[:, 0],
    })
def umap_paint(X_topics, umap_para):
    """Scatter-plot a 2-D UMAP embedding of ``X_topics``.

    ``umap_para`` must provide 'n_neighbors', 'min_dist' and 'random_state'.
    """
    reducer = umap.UMAP(n_neighbors=umap_para['n_neighbors'],
                        min_dist=umap_para['min_dist'],
                        random_state=umap_para['random_state'])
    points = reducer.fit_transform(X_topics)
    plt.figure(figsize=(7, 5))
    plt.scatter(points[:, 0], points[:, 1], s=10, edgecolor='none')
    plt.show()
def umap_analysis():
    """Standardize per-track audio features and visualize them in 2-D with UMAP.

    Loads ``features_3_sec.csv`` from the module-level ``general_path``,
    z-scores every feature column, embeds the features with UMAP and shows
    a seaborn scatter plot colored by the 'label' (genre) column.
    """
    from sklearn import preprocessing
    import umap.umap_ as umap  # pip install umap-learn, pip install ipywidgets
    featrues_filename = 'features_3_sec.csv'
    # Alternative feature files, kept for experimentation:
    #featrues_filename = 'data_adv_3_sec_no_var_hccho.csv'
    #featrues_filename = 'data_adv_3_sec_hccho.csv'
    data = pd.read_csv(f'{general_path}/{featrues_filename}')
    # Optional subsample: keep only 1 of the 10 clips per source file.
    #data = data[data.filename.apply(lambda x: x.split(".")[-2]=='0')].copy().reset_index(drop=True)
    print(data.shape, data.head())
    # Drop the first column (track identifier), keep every feature column.
    data = data.iloc[0:, 1:]
    print(data.head(5))
    y = data['label']  # genre variable.
    X = data.loc[:, data.columns != 'label']  # select all columns but not the labels
    #### NORMALIZE X ####
    cols = X.columns
    standard_scaler = preprocessing.StandardScaler()
    np_scaled = standard_scaler.fit_transform(X)
    X = pd.DataFrame(np_scaled, columns=cols)
    print(X.shape, X.iloc[:, :2])
    # 'spread' sets the overall scale of the embedding. min_dist <= spread;
    # a smaller min_dist packs points tighter, a larger one spreads them out.
    umap_embedding = umap.UMAP(n_neighbors=20,
                               spread=1,
                               min_dist=0.1,
                               n_epochs=5000,
                               metric='correlation',
                               n_components=2,
                               verbose=True).fit_transform(
                                   X)  # returns numpy array (N, n_components)
    umapDf = pd.DataFrame(data=umap_embedding,
                          columns=['umap component 1', 'umap component 2'])
    # concatenate with target label
    finalDf = pd.concat([umapDf, y], axis=1)
    sns.scatterplot(x="umap component 1",
                    y="umap component 2",
                    data=finalDf,
                    hue="label",
                    alpha=0.7,
                    s=10)
    plt.title('umap on Genres', fontsize=12)
    plt.xticks(fontsize=7)
    plt.yticks(fontsize=7)
    plt.xlabel("umap Component 1", fontsize=7)
    plt.ylabel("umap Component 2", fontsize=7)
    plt.show()
    print('Done')
def visualise_clusters(feature_vectors_list):
    """Embed feature vectors in 2-D with UMAP and scatter-plot them with
    the module-level ``images`` overlaid at each point.

    Parameters
    ----------
    feature_vectors_list : array-like
        Feature vectors, one row per image in ``images``.
    """
    reducer = umap.UMAP()
    embedding_fv = reducer.fit_transform(feature_vectors_list)
    # (removed a no-op `embedding_fv.shape` expression statement)
    x_data = [[a, b] for (a, b) in zip(embedding_fv[:, 0], embedding_fv[:, 1])]
    visualize_scatter_with_images(x_data, images=images, image_zoom=0.3)
def umap_fn(x, y=None, random_state=RS, **kwargs):
    """Reduce ``x`` with UMAP, printing wall-clock timing.

    WARNING: y shouldn't actually be passed in unless
    for supervised clustering purposes.

    Parameters
    ----------
    x : array-like
        Data to embed.
    y : array-like, optional
        Targets for supervised UMAP.
    random_state : int, optional
        Seed for reproducibility; defaults to the module-level ``RS``.
    kwargs : optional
        Forwarded to ``umap.UMAP``.
    """
    start = datetime.now()
    print("UMAP dimensionality reduction started at {}".format(
        start.strftime("%H:%M:%S")))
    # Bug fix: honor the random_state parameter; previously the module-level
    # RS was always used, silently ignoring the argument.
    x_umap = umap.UMAP(random_state=random_state, **kwargs).fit_transform(x, y)
    print("UMAP took {} to finish".format(datetime.now() - start))
    return x_umap
def umap_iplot(x, df_text, preds):
    """Fit UMAP on ``x`` and show an interactive plot labeled by ``preds``,
    with ``df_text`` as hover text. Returns the fitted mapper.
    """
    hover_data = pd.DataFrame({'index': preds, 'label': df_text})
    mapper = umap.UMAP().fit(x)
    figure = uplot.interactive(mapper,
                               labels=preds,
                               hover_data=hover_data,
                               point_size=2)
    uplot.show(figure)
    return mapper
def apply_umap(self):
    """Embed this cluster's profiles with UMAP and cluster them with HDBSCAN.

    Fills ``self.clusterlabel``, ``self.probabilities`` and
    ``self.allCluster``; for top-level runs also writes per-cluster FASTA
    files plus ``cluster.txt`` to ``self.outdir``.

    Returns 1 (and copies the input file as ``*_repr.fa``) when clustering
    fails or yields a single cluster; returns None otherwise.
    """
    # Only profiles whose index is present in this (sub)cluster's sequences.
    profiles = [(idx, profile) for idx, profile in Clusterer.d_profiles.items() if idx in self.d_sequences]
    vector = [x[1] for x in profiles]
    # Sub-clustering uses a tighter neighborhood than the top-level pass.
    if self.subCluster:
        neighbors, dist = 5, 0.0
    else:
        neighbors, dist = 50, 0.25
    try:
        clusterable_embedding = umap.UMAP(
            n_neighbors=neighbors,
            min_dist=dist,
            n_components=20,
            random_state=42,
            metric='cosine',
        ).fit_transform(vector)
        clusterer = hdbscan.HDBSCAN()
        clusterer.fit(clusterable_embedding)
        self.clusterlabel = clusterer.labels_
        self.probabilities = clusterer.probabilities_
        # A single label means clustering degenerated; reuse the TypeError
        # path (UMAP/HDBSCAN also raise TypeError on too-small input).
        if len(set(self.clusterlabel)) == 1:
            raise TypeError
    except TypeError:
        # Fallback: no usable clustering — export the input unchanged as
        # the representative set and signal failure with 1.
        import shutil
        shutil.copyfile(
            self.sequenceFile,
            f'{self.outdir}/{os.path.splitext(os.path.basename(self.sequenceFile))[0]}_repr.fa'
        )
        return 1
    self.allCluster = list(zip([x[0] for x in profiles], self.clusterlabel))
    # Only the top-level run writes cluster summaries and FASTA files.
    if not self.subCluster:
        with open(f'{self.outdir}/cluster.txt', 'w') as outStream:
            for i in set(self.clusterlabel):
                with open(f'{self.outdir}/cluster{i}.fa', 'w') as fastaOut:
                    outStream.write(f"Cluster: {i}\n")
                    for idx, label in self.allCluster:
                        if label == i:
                            # Track genes-of-interest to their cluster id.
                            if idx in Clusterer.goiHeader:
                                Clusterer.goi2Cluster[
                                    Clusterer.id2header[idx]] = i
                            outStream.write(
                                f"{Clusterer.id2header[idx]}\n")
                            # Sequences are stored concatenated with an
                            # 'X'*10 spacer; keep only the first part.
                            fastaOut.write(
                                f">{Clusterer.id2header[idx]}\n{self.d_sequences[idx].split('X'*10)[0]}\n"
                            )
                    outStream.write("\n")
def plot_umap(X_scaled, class_labels, image_save_directory, y):
    """Plot (and save) UMAP embeddings of ``X_scaled`` via the ``vis`` helpers.

    Always computes an unsupervised embedding; when ``y`` is provided, a
    supervised embedding is computed and saved as 'UMAP_Supervised' as well.
    The unsupervised plot is saved as 'UMAP_Unsupervised' in either case.

    Parameters
    ----------
    X_scaled : array-like
        Scaled feature matrix to embed.
    class_labels : dict
        Class-id -> display-name mapping; values are used for the legend.
    image_save_directory : str
        Directory passed to ``vis.save_figure``.
    y : array-like or None
        Class labels; None skips the supervised plot.
    """
    # Use a supervised / unsupervised analysis to make the clusters
    sns.set(style='white', context='poster')
    # import umap
    # %time #Time of the whole cell
    embeddingUnsupervised = umap.UMAP(n_neighbors=5, random_state=42,
                                      init='random').fit_transform(X_scaled)
    # %time #Time of the whole cell
    if y is not None:
        embeddingSupervised = umap.UMAP(n_neighbors=5, random_state=42,
                                        init='random').fit_transform(X_scaled, y=y)
        vis.plotUmap(embeddingSupervised, y, list(class_labels.values()),
                     'Dataset supervised clustering')
        # save_figure grabs the current figure, so it must run right after
        # the corresponding plotUmap call.
        vis.save_figure(plt.gcf(),
                        image_save_directory=image_save_directory,
                        filename='UMAP_Supervised')
        print("Plot UMAP supervised")
        vis.plotUmap(embeddingUnsupervised, y, list(class_labels.values()),
                     'Dataset unsupervised clustering', cmapString='RdYlGn')
        print("Plot UMAP unsupervised with class labels")
    else:
        warnings.warn("No y values.")
        vis.plotUmap(embeddingUnsupervised, None, None,
                     'Dataset unsupervised clustering', cmapString='RdYlGn')
        print("Plot UMAP unsupervised without class labels")
    vis.save_figure(plt.gcf(),
                    image_save_directory=image_save_directory,
                    filename='UMAP_Unsupervised')
    print("Plot UMAP unsupervised")
def load_umap_embeddings(self):
    """Compute and cache 2-D UMAP embeddings for this language's word
    vectors; does nothing when they are already cached."""
    if self.umap_embeddings is not None:
        return
    logger.info('Running UMAP for {} words'.format(self.lang))
    reducer = umap.UMAP(n_neighbors=10,
                        min_dist=0.005,
                        metric='correlation')
    projected = reducer.fit_transform(self.embeddings)
    self.umap_embeddings = [row.tolist() for row in projected]
def process_umap(exact_pdh, pca_comp, scale=500):
    """Embed the first ``pca_comp`` columns with UMAP, store the 2-D
    coordinates back into ``exact_pdh`` as 'u1'/'u2', and scatter-plot them.

    NOTE(review): another ``process_umap`` with the same name exists in this
    module; the later definition shadows the earlier one.
    """
    reducer = uma.UMAP()
    coords = reducer.fit_transform(exact_pdh[list(range(pca_comp))])
    freqs = exact_pdh['freq']
    sizes = (freqs / freqs[0]) ** 2
    u1 = coords[:, 0]
    u2 = coords[:, 1]
    exact_pdh['u1'] = u1
    exact_pdh['u2'] = u2
    plt.scatter(u1, u2, s=scale * sizes)
    return None
def umap_reducer(self):
    """Optionally reduce the confounders to 2 UMAP dimensions.

    When ``self.umap_reduce == "yes"``: fits UMAP (seed 153) on all
    confounder columns except 'ID', writes a scatter plot
    (``<prefix>.umap_plot.png``), the reduced data
    (``<prefix>.umap_data_reduction.csv``) and the fitted reducer
    (``<prefix>.umap_clustering.joblib``), and replaces
    ``self.confounders_df`` with the reduced frame.

    Returns
    -------
    pandas.DataFrame
        ``self.confounders_df`` (reduced when UMAP ran, unchanged otherwise).
    """
    if (self.umap_reduce == "yes"):
        # Keep IDs aside; UMAP runs on the remaining confounder columns.
        IDs = self.confounders_df['ID']
        IDs_df = pd.DataFrame(IDs)
        to_umap = self.confounders_df.drop(columns=['ID'])
        reducer = umap.UMAP(random_state=153)
        embedding = reducer.fit_transform(to_umap)
        embedding1 = pd.DataFrame(embedding[:, 0])
        embedding2 = pd.DataFrame(embedding[:, 1])
        # Re-attach IDs to the two embedding columns, then drop the helper
        # INDEX column produced by reset_index().
        out_data = pd.concat([
            IDs_df.reset_index(),
            embedding1.reset_index(drop=True),
            embedding2.reset_index(drop=True)
        ],
                             axis=1,
                             ignore_index=True)
        out_data.columns = [
            'INDEX', 'ID', 'UMAP_embedding1', "UMAP_embedding2"
        ]
        out_data = out_data.drop(columns=['INDEX'])
        # Plot
        print(f"Exporting UMAP plot...")
        fig, ax = plt.subplots(figsize=(12, 10))
        plt.scatter(embedding[:, 0], embedding[:, 1], cmap="cool")
        plt.title("Data Reduction to 2 Dimensions by UMAP", fontsize=18)
        plot_out = self.run_prefix + '.umap_plot.png'
        plt.savefig(plot_out, dpi=600)
        print(
            f"The UMAP plot has been exported and can be found here: {plot_out}"
        )
        out_file = self.runplot_out = self.run_prefix + '.umap_data_reduction.csv'
        out_data.to_csv(out_file, index=False)
        print(
            f"The reduced UMAP 2 dimensions per sample .csv file can be found here: {out_file}"
        )
        # NOTE(review): this refits the reducer a second time just to dump
        # it — presumably so the persisted object matches a plain fit().
        exported_reducer = reducer.fit(to_umap)
        algo_out = self.runplot_out = self.run_prefix + '.umap_clustering.joblib'
        dump(exported_reducer, algo_out)
        self.confounders_df = out_data
        print(f"The UMAP .joblib file can be found here: {algo_out}")
    return self.confounders_df
def gen_projections(features, method='tsne', n_components=2):
    """Project ``features`` into a low-dimensional space.

    Parameters
    ----------
    features : array-like
        Data to embed.
    method : str
        Either 'tsne' or 'umap'.
    n_components : int
        Dimension of the embedded space.

    Returns
    -------
    numpy.ndarray
        The reduced features.

    Raises
    ------
    ValueError
        If ``method`` is not 'tsne' or 'umap'.
    """
    # Validate with a real exception: `assert` is stripped under `python -O`.
    if method not in ['tsne', 'umap']:
        raise ValueError(f'{method} error')
    if method == 'tsne':
        # n_components: dimension of the embedded space.
        reducer = TSNE(n_components=n_components)
    if method == 'umap':
        reducer = umap.UMAP(n_components=n_components)
    print(f'generating embeddings...')
    features_reduced = reducer.fit_transform(features)
    print(f'generating embeddings completed!')
    return features_reduced
def umap_visualization(data, target, color=None):
    """Scatter-plot a 2-D UMAP embedding of ``data``, colored by the
    ``target`` column (or by an explicit ``color`` sequence)."""
    embedding = umap.UMAP().fit_transform(data)
    color_values = color if color else data[target]
    palette = sns.color_palette()
    plt.figure(figsize=(40, 30), dpi=80)
    plt.scatter(embedding[:, 0],
                embedding[:, 1],
                c=[palette[v] for v in color_values])
    plt.gca().set_aspect('equal', 'datalim')
    plt.title('UMAP projection of the Cover Type dataset', fontsize=24)
def visualize(model):
    """Visualize the result for the topic model by 2D embedding (UMAP).

    :param model: Topic_Model object
    """
    # LDA results are not embedded; nothing to show.
    if model.method == 'LDA':
        return
    print('Calculating UMAP projection ...')
    vec_umap = umap.UMAP().fit_transform(model.vec[model.method])
    print('Calculating UMAP projection. Done!')
    plot_proj(vec_umap, model.cluster_model.labels_)
def visualize_dimensionality_reduction(cell_data, columns, category,
                                       color_map="Spectral",
                                       algorithm="UMAP",
                                       save_dir=None):
    """Plots the dimensionality reduction of specified population columns

    Args:
        cell_data (pandas.DataFrame):
            Dataframe containing columns for dimensionality reduction and
            category columns
        columns (list):
            List of column names that are included for dimensionality
            reduction
        category (str):
            Name of column in dataframe containing population or patient data
        color_map (str):
            Name of MatPlotLib ColorMap used, default is Spectral
        algorithm (str):
            Name of dimensionality reduction algorithm, default is UMAP;
            must be one of "UMAP", "PCA", "tSNE"
        save_dir (str):
            Directory to save plots, default is None

    Raises:
        ValueError: if ``algorithm`` is not "UMAP", "PCA" or "tSNE"
    """
    # Rows with NaNs would break the reducers; drop them up front.
    cell_data = cell_data.dropna()
    if algorithm not in ["UMAP", "PCA", "tSNE"]:
        raise ValueError(f"The algorithm specified must be one of the following: "
                         f"{['UMAP', 'PCA', 'tSNE']}")
    graph_title = "%s projection of data" % algorithm
    if algorithm == "UMAP":
        reducer = umap.UMAP()
        # Only UMAP standardizes the columns before reduction.
        column_data = cell_data[columns].values
        scaled_column_data = StandardScaler().fit_transform(column_data)
        embedding = reducer.fit_transform(scaled_column_data)
        plot_dim_reduced_data(embedding[:, 0],
                              embedding[:, 1],
                              fig_id=1,
                              hue=cell_data[category],
                              cell_data=cell_data,
                              title=graph_title,
                              save_dir=save_dir,
                              save_file="UMAPVisualization.png")
    elif algorithm == "PCA":
        pca = PCA()
        pca_result = pca.fit_transform(cell_data[columns].values)
        plot_dim_reduced_data(pca_result[:, 0],
                              pca_result[:, 1],
                              fig_id=2,
                              hue=cell_data[category],
                              cell_data=cell_data,
                              title=graph_title,
                              save_dir=save_dir,
                              save_file="PCAVisualization.png")
    elif algorithm == "tSNE":
        tsne = TSNE()
        tsne_results = tsne.fit_transform(cell_data[columns].values)
        plot_dim_reduced_data(tsne_results[:, 0],
                              tsne_results[:, 1],
                              fig_id=3,
                              hue=cell_data[category],
                              cell_data=cell_data,
                              title=graph_title,
                              save_dir=save_dir,
                              save_file="tSNEVisualization.png")
def fit_umap(self, feature_list, n_neighbors=5, n_jobs=4):
    """Run 2-D Euclidean UMAP (seed 42) on ``feature_list`` and return the
    embedding, printing the elapsed wall-clock time."""
    started = time.time()
    reducer = umap.UMAP(n_neighbors=n_neighbors,
                        random_state=42,
                        n_components=2,
                        verbose=1,
                        n_jobs=n_jobs,
                        metric='euclidean')
    embedded = reducer.fit_transform(feature_list)
    print('UMAP done! Time elapsed: {} seconds'.format(time.time() - started))
    return embedded
def plot_tsne_and_umap(self, training_set, labels, save_path,
                       model_name="tsne"):
    """
    Run T-SNE or UMAP on training set and save a seaborn scatter plot.

    :param training_set: samples to embed (one row per sample)
    :param labels: labels of training set (used as hue; must be ints,
        since the palette size is ``max(labels) + 1``)
    :param save_path: file path the figure is written to
    :param model_name: "tsne" or "umap"
    :raises ValueError: if ``model_name`` is neither "tsne" nor "umap"
    :return: the seaborn axes object
    """
    if model_name == "tsne":
        tsne_train = manifold.TSNE(n_components=2,
                                   perplexity=30.0,
                                   early_exaggeration=12.0,
                                   learning_rate=200.0,
                                   n_iter=1000,
                                   n_iter_without_progress=300,
                                   min_grad_norm=1e-07,
                                   metric='euclidean',
                                   init='random',
                                   verbose=0,
                                   random_state=None,
                                   method='barnes_hut',
                                   angle=0.5)
        results = tsne_train.fit_transform(training_set)
    elif model_name == "umap":
        umap_train = umap.UMAP(n_neighbors=10,
                               min_dist=0.3,
                               metric='correlation')
        results = umap_train.fit_transform(training_set)
    else:
        raise ValueError("Model name could not be recognized")
    # NOTE(review): the keys say 'tsne-2d-*' even for the UMAP branch; they
    # are only internal plot-axis names, not method-specific.
    df_subset_up = {}
    df_subset_up['tsne-2d-one'] = results[:, 0]
    df_subset_up['tsne-2d-two'] = results[:, 1]
    df_subset_up['y'] = labels
    plt.figure(figsize=(16, 10))
    sns_plot = sns.scatterplot(x="tsne-2d-one",
                               y="tsne-2d-two",
                               hue="y",
                               palette=sns.color_palette(
                                   "hls", max(labels) + 1),
                               data=df_subset_up,
                               legend="full",
                               alpha=0.3)
    fig = sns_plot.get_figure()
    fig.savefig(save_path)
    return sns_plot
def plot_umap(Z, labels):
    """Standardize ``Z``, embed it with UMAP and return a seaborn scatter
    figure colored by ``labels``.

    Parameters
    ----------
    Z : array-like
        Feature matrix, one row per sample.
    labels : sequence
        Per-row hue labels.

    Returns
    -------
    matplotlib.figure.Figure
        The figure containing the scatter plot (axes decorations stripped).
    """
    reducer = umap.UMAP()
    Z_scaled = StandardScaler().fit_transform(Z)
    embedding = reducer.fit_transform(Z_scaled)
    # Fix: pass x/y as keywords — positional data arguments to seaborn
    # plotting functions are deprecated (and later removed).
    ax = sns.scatterplot(x=embedding[:, 0],
                         y=embedding[:, 1],
                         hue=labels,
                         palette=sns.color_palette(
                             'muted', n_colors=len(np.unique(labels))))
    ax.set(xlabel='UMAP0', ylabel='UMAP1', xticklabels=[], yticklabels=[])
    ax.set_xticks([])
    ax.set_yticks([])
    return ax.get_figure()
def _learn_umap(data, **kwargs):
    """ Calculates UMAP transformation for given matrix features.

    Parameters
    -------
    data: np.array
        Array of features.
    kwargs: optional
        Parameters for ``umap.UMAP()``

    Returns
    -------
    Calculated UMAP transform

    Return type
    -------
    np.ndarray
    """
    # Drop any kwargs that umap.UMAP would reject.
    valid_params = umap.UMAP().get_params()
    filtered = {name: value for name, value in kwargs.items()
                if name in valid_params}
    projected = umap.UMAP(random_state=0, **filtered).fit_transform(data.values)
    return pd.DataFrame(projected, index=data.index.values)
def plot_UMAP(generated_data: torch.Tensor, labels: np.ndarray,
              n_classes: int, model_name: str, path: str) -> None:
    """Embed generated samples with UMAP and save a class-colored scatter.

    Parameters
    ----------
    generated_data : torch.Tensor
        Samples to embed, one row per sample. (Annotation fixed: was
        ``torch.tensor``/``np.array``, which are callables, not types.)
    labels : np.ndarray
        Class label per sample; flattened before use.
    n_classes : int
        Number of distinct classes to plot.
    model_name : str
        Used in the plot title and output file name.
    path : str
        Directory in which ``<model_name>_UMAP.png`` is written.
    """
    reducer = umap.UMAP(random_state=42)
    embedding = reducer.fit_transform(generated_data)
    labels = np.array(labels).flatten()
    fig, ax = plt.subplots(figsize=(12, 10))
    for cls in range(n_classes):
        indices = np.where(labels == cls)[0]
        plt.scatter(embedding[indices, 0],
                    embedding[indices, 1],
                    s=5,
                    label=cls,
                    color=COLORS[cls % 10])
    plt.title("{} Generated MNIST Data".format(model_name), fontsize=18)
    plt.legend(markerscale=2)
    plt.savefig(os.path.join(path, '{}_UMAP.png'.format(model_name)))
    # Bug fix: close the figure so repeated calls don't accumulate open
    # figures (matplotlib warns and leaks memory otherwise).
    plt.close(fig)