import numpy as np
import matplotlib.pyplot as plt
from openTSNE import TSNE


def activations_tsne_plot(activations, labels, ds):
    """Compute embeddings using t-SNE and plot them, one subplot per layer."""
    tsne = TSNE(
        perplexity=30,
        metric="euclidean",
        n_jobs=8,
        random_state=42,
        verbose=False,
    )
    fig, axes = plt.subplots(nrows=1, ncols=len(activations), figsize=(25, 5))
    embs = []
    for idx, acts in enumerate(activations):
        print(f"Learning embeddings for layer {idx}...")
        embeddings = tsne.fit(acts)
        for i, actual_label in enumerate(ds.classes):
            indices = np.squeeze(np.argwhere(labels == i))
            axes[idx].scatter(embeddings[indices, 0], embeddings[indices, 1],
                              label=actual_label, s=2)
        axes[idx].legend()
        axes[idx].set_title(f"Activations in layer {idx}")
        embs.append(embeddings)
    fig.tight_layout()
    return embs
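# A minimal smoke test for activations_tsne_plot, assuming synthetic data:
# the activation matrices, labels, and the two-class `fake_ds` stand-in below
# are all made up for illustration.
from types import SimpleNamespace

fake_acts = [np.random.rand(300, 64) for _ in range(3)]  # three fake layers
fake_labels = np.random.randint(0, 2, size=300)          # binary labels
fake_ds = SimpleNamespace(classes=["cat", "dog"])        # mimics a torchvision dataset
embs = activations_tsne_plot(fake_acts, fake_labels, fake_ds)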
def execute(self):
    X = np.array(self.X)
    # openTSNE returns a TSNEEmbedding (an ndarray subclass), so it can be
    # converted to a plain list for serialization.
    X2 = TSNE(n_components=self.p, random_state=7, perplexity=33).fit(X)
    return X2.tolist()
def plot_tsne(source_data, source_name, target_data, target_name, plot_directory):
    fig, ax = plt.subplots()
    perplexities = [100]
    for i, perplexity in enumerate(perplexities):
        tsne = TSNE(n_components=2, initialization='pca', random_state=0,
                    perplexity=perplexity, n_iter=1000, neighbors='approx')
        # Fit on the source data, then project the target data into the same
        # embedding space; fitting t-SNE separately on each set would produce
        # two unrelated coordinate systems that cannot be overlaid.
        x_source_transformed = tsne.fit(source_data)
        x_target_transformed = x_source_transformed.transform(target_data)
        ax.set_title('Perplexity=%d' % perplexity)
        ax.scatter(x_source_transformed[:, 0], x_source_transformed[:, 1],
                   c='r', label='source')
        ax.scatter(x_target_transformed[:, 0], x_target_transformed[:, 1],
                   c='b', label='target')
        ax.xaxis.set_major_formatter(NullFormatter())
        ax.yaxis.set_major_formatter(NullFormatter())
        ax.axis('tight')
        ax.legend()
    plt.savefig(plot_directory + 'tsne_source' + source_name + '_target' +
                target_name + '.png', dpi=500)
def hc_tsne(
    X,
    initialization,
    tree,
    alpha=1e-3,
    weights=(0.5, 0.5, 0.0),
    margin=0.5,
    loss_logger=None,
    **tsne_kwargs,
):
    """Run openTSNE with a custom `negative_gradient_method`, in which the
    hierarchical constraints are encoded in a regularization term.

    Args:
        X: ndarray (N, D)
        initialization: initialization embedding in 2D, (N, 2)
        tree: hierarchical constraints represented in tree form (using the anytree lib)
        alpha: contribution of the regularization term in the new objective function
        weights: weights of the different elements in the regularization
        margin: margin in the triplet loss. The real margin m is calculated
            as `margin * dist(anchor, negative)`
        loss_logger: logger object (containing a dict) to store the loss at each iteration
        **tsne_kwargs: openTSNE params

    Returns:
        Z: new embedding model; can be used as an (N, 2) array, or as a tsne
            object for embedding new data points.
    """
    # From the tree-like constraints, create a regularization term by
    # using the defined hierarchical triplet loss.
    tree_regularizer = partial(
        hierarchical_triplet_loss, tree=tree, margin=margin, weights=weights
    )

    # Run openTSNE with the custom negative gradient function.
    tsne = TSNE(
        initialization=initialization,
        callbacks=ErrorLogger(),  # use this to evaluate the KL loss every 10 iterations
        negative_gradient_method=partial(
            my_kl_divergence_bh,
            list_regularizers=[(alpha, tree_regularizer)],
            logger=loss_logger,
        ),
        **tsne_kwargs,
    )
    Z = tsne.fit(X)

    # Now clear the regularizers from the tsne object so they are not used
    # when embedding new samples (of the test set).
    Z.gradient_descent_params["negative_gradient_method"] = "bh"
    return Z
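# Hedged usage sketch for hc_tsne. The toy anytree hierarchy, the arrays
# X / X_test, and the parameter values are illustrative assumptions, and the
# sketch presumes the helpers referenced above (hierarchical_triplet_loss,
# my_kl_divergence_bh) are importable.
import numpy as np
from anytree import Node
from openTSNE import initialization

X = np.random.rand(200, 16)
X_test = np.random.rand(20, 16)
root = Node("root")
animals = Node("animals", parent=root)
Node("cat", parent=animals)
Node("dog", parent=animals)

init = initialization.pca(X, random_state=42)
Z = hc_tsne(X, initialization=init, tree=root, alpha=1e-3, perplexity=30)
Z_test = Z.transform(X_test)  # regularizers were cleared, so plain BH is used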
def compute_tsne(A):
    adata = A.copy()
    tsne = TSNE(perplexity=30, metric="euclidean", callbacks=None,
                n_jobs=10, random_state=42, n_iter=750)
    # Embed the truncated-SVD representation stored in `varm`.
    adata.varm['TSNE10'] = tsne.fit(adata.varm['TSVD'])
    return adata
def tsne(x, n=100000):
    from openTSNE import TSNE
    from openTSNE.callbacks import ErrorLogger

    x_in = x[:n, :]
    tsne = TSNE(
        perplexity=500,
        metric="euclidean",
        callbacks=ErrorLogger(),
        n_iter=2000,
        n_jobs=4,
    )
    x_embedded = tsne.fit(x_in)
    return x_embedded
def fetch_algorithm(self):
    if self.algorithm == "umap":
        reducer = umap.UMAP(random_state=42)
    else:
        reducer = TSNE(random_state=42)
    return reducer
def setproyection(self, proyection_type="TSNE", **kwargs):
    r"""
    Compute a 2D projection of the data.

    Parameters
    ----------
    proyection_type: str
        Type of projection. There are three options: TSNE, implemented with
        openTSNE; skTSNE, implemented by sklearn; and PCA, implemented by
        sklearn.
    kwargs: dict
        Arguments for the projection (perplexity, etc.)
    """
    if self.emb.shape[1] == 2:
        X_proyected = self.emb.values
    elif proyection_type == "PCA":
        X_proyected = PCA(n_components=2, **kwargs).fit_transform(self.emb)
    elif proyection_type == "skTSNE":
        X_proyected = skTSNE(n_components=2, **kwargs).fit_transform(self.emb)
    elif proyection_type == "TSNE":
        X_proyected = TSNE(n_components=2, n_jobs=8, **kwargs).fit(self.emb.values)
    self.ids = self.emb.index
    self.proyected = pd.DataFrame(X_proyected, columns=["xdim", "ydim"],
                                  index=self.ids)
def get_embedding_code(self, widgets):
    params = self.get_current_params(widgets)
    params['n_iter'] = int(widgets['_iteration'].value)
    tsne = TSNE(**params, min_grad_norm=0)
    # Since TSNE is a subclass of sklearn's BaseEstimator, repr(tsne)
    # provides the code to reproduce the resulting embedding. We remove
    # whitespace and linebreaks so we can later break the text as we like.
    expression = repr(tsne)
    expression = re.sub('\n', '', expression)
    expression = re.sub(' +', ' ', expression)
    prefix = 'tsne = '
    assignment_line = prefix + expression
    chars_until_params = len(prefix) + len('TSNE(')
    tw = TextWrapper(subsequent_indent=' ' * chars_until_params, width=80)
    assignment_line = tw.fill(assignment_line)
    code = ('# pip install openTSNE\n'
            'from openTSNE import TSNE\n'
            f'{assignment_line}\n'
            'tsne.fit(X)')
    # Return as IPython.display.Code:
    # -> repr will print the code in a fixed-width font
    # -> str will print the actual string containing \n
    return Code(code)
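# To illustrate the repr() trick above: because openTSNE's TSNE follows the
# sklearn estimator protocol, its repr reproduces the constructor call. The
# exact output depends on the installed versions; the shown output is
# illustrative.
from openTSNE import TSNE

print(repr(TSNE(perplexity=25, random_state=42)))
# e.g. -> TSNE(perplexity=25, random_state=42)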
class OpenTsne(Transformer):
    """
    This transformer transforms all vectors in an
    [EmbeddingSet][whatlies.embeddingset.EmbeddingSet] by means of t-SNE. This
    implementation uses
    [open-tsne](https://opentsne.readthedocs.io/en/latest/tsne_algorithm.html).

    Important:
        OpenTSNE is a faster variant of TSNE but it only allows for at most 2
        components. You may also notice that it is relatively slow. This
        unfortunately is a fact of TSNE.

        This embedding transformation might require you to manually install
        extra dependencies unless you installed via either;

        ```
        pip install whatlies[opentsne]
        pip install whatlies[all]
        ```

    Arguments:
        n_components: the number of components to create/add
        kwargs: keyword arguments passed to the OpenTsne implementation,
            includes things like `perplexity`
            [link](https://opentsne.readthedocs.io/en/latest/api/index.html)

    Usage:

    ```python
    from whatlies.language import SpacyLanguage
    from whatlies.transformers import OpenTsne

    words = ["prince", "princess", "nurse", "doctor", "banker", "man", "woman",
             "cousin", "neice", "king", "queen", "dude", "guy", "gal", "fire",
             "dog", "cat", "mouse", "red", "blue", "green", "yellow", "water",
             "person", "family", "brother", "sister"]

    lang = SpacyLanguage("en_core_web_md")
    emb = lang[words]
    emb.transform(OpenTsne(2)).plot_interactive_matrix('tsne_0', 'tsne_1')
    ```
    """

    def __init__(self, n_components=2, **kwargs):
        super().__init__()
        self.n_components = n_components
        self.kwargs = kwargs
        self.tfm = TSNE(n_components=n_components, **kwargs)

    def fit(self, embset):
        names, X = embset.to_names_X()
        self.emb = self.tfm.fit(X)
        self.is_fitted = True
        return self

    def transform(self, embset):
        names, X = embset.to_names_X()
        new_vecs = np.array(self.emb.transform(X))
        names_out = names + [f"tsne_{i}" for i in range(self.n_components)]
        vectors_out = np.concatenate([new_vecs, np.eye(self.n_components)])
        new_dict = new_embedding_dict(names_out, vectors_out, embset)
        return EmbeddingSet(new_dict, name=f"{embset.name}.tsne_{self.n_components}()")
def __init__(self, n_components=None, random_state=None, initialization="pca",
             perplexity=30, n_jobs=6):
    self.n_components = n_components
    self.random_state = random_state
    self.tsne = OpenTSNE(n_components=self.n_components,
                         random_state=self.random_state,
                         initialization=initialization,
                         perplexity=perplexity,
                         n_jobs=n_jobs)
def reduce_dimension(embeddings, reduction='pca'):
    if reduction == 'pca':
        pca = PCA(n_components=2)
        embeddings = pca.fit_transform(embeddings)
    elif reduction == 'tsne':
        otsne = OTSNE(initialization='pca', n_jobs=8,
                      callbacks=ErrorLogger(),
                      negative_gradient_method='bh')
        embeddings = otsne.fit(embeddings)
    elif reduction == 'none':
        pass
    else:
        raise ValueError(f"Unknown reduction method: {reduction!r}")
    return embeddings
class TSNEWrapper:
    def __init__(self, params, random_seed):
        self.tsneer = TSNE(n_components=params['embed_dim'],
                           random_state=random_seed)

    def fit(self, data):
        self.embedding = self.tsneer.fit(data)

    def transform(self, data):
        # openTSNE embeddings can project previously unseen points into the
        # fitted space.
        new_embedded_data = self.embedding.transform(data)
        return new_embedded_data
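# Possible usage of TSNEWrapper (the params dict and arrays are assumptions):
# fit on training data, then map held-out rows into the same 2D space.
import numpy as np

wrapper = TSNEWrapper({'embed_dim': 2}, random_seed=42)
X_train = np.random.rand(500, 32)
X_test = np.random.rand(50, 32)
wrapper.fit(X_train)
X_test_2d = wrapper.transform(X_test)
print(X_test_2d.shape)  # (50, 2)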
def run_transformation(self, X, y, transformation_params, callback):
    class CallbackAdapter:
        def __init__(self, callback, early_exaggeration_iter):
            self.callback = callback
            self.exaggeration_phase = early_exaggeration_iter > 0
            self.early_exaggeration_iter = early_exaggeration_iter

        def __call__(self, iteration, error, embedding):
            if not self.exaggeration_phase:
                iteration += self.early_exaggeration_iter
            if self.exaggeration_phase and iteration == self.early_exaggeration_iter:
                self.exaggeration_phase = False
            self.callback(
                'embedding', iteration,
                dict(embedding=embedding.view(np.ndarray),
                     error_metrics=dict(kl_divergence=error)))

    callback_adapter = CallbackAdapter(
        callback, transformation_params['early_exaggeration_iter'])

    tsne = TSNE(
        **transformation_params,
        min_grad_norm=0,  # never stop on small gradients
        n_iter=10000000,  # TODO
        callbacks=callback_adapter,
        callbacks_every_iters=1)

    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', category=NumbaWarning)
        callback(
            'start', 0,
            dict(error_metrics=[
                dict(name='kl_divergence', label='KL divergence:')
            ]))
        callback('status', 0, dict(message='Initializing TSNE'))
        tsne.fit(X)
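# For reference, a bare openTSNE callback has the same signature the adapter
# above implements: it is called with (iteration, error, embedding). A minimal
# sketch (the printing behavior is just an example):
def print_progress(iteration, error, embedding):
    print(f"iter {iteration}: KL divergence = {error:.4f}")

# tsne = TSNE(callbacks=print_progress, callbacks_every_iters=50)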
def calculate_dim_red(self, n_components=2):
    self.embedding_train = None
    sc.pp.highly_variable_genes(self.data, n_top_genes=500)
    sc.pp.pca(self.data, n_comps=self.n_comps, zero_center=True)
    X_pca = self.data.obsm['X_pca']
    tSNE_init = X_pca[:, :2]
    print('feature selection and PCA compression finished')
    if self.UMAP:
        import umap
        reducer = umap.UMAP(n_components=n_components)
        X_embedded = reducer.fit_transform(X_pca)
        self.results['UMAP1'] = X_embedded[:, 0].tolist()
        if n_components == 2:
            self.results['UMAP2'] = X_embedded[:, 1].tolist()
        print('UMAP finished')
    if self.tSNE:
        from openTSNE import TSNE
        from openTSNE.callbacks import ErrorLogger
        tsne = TSNE(perplexity=30, callbacks=ErrorLogger(),
                    initialization='pca', random_state=42,
                    early_exaggeration_iter=50, n_components=2)
        embedding_train = tsne.fit(X_pca)
        self.embedding_train = embedding_train
        self.results['tSNE1'] = embedding_train.T[0].tolist()
        self.results['tSNE2'] = embedding_train.T[1].tolist()
        print('tSNE finished')
    return self.data, self.results
class TSNEm:
    def __init__(self, n_components=None, random_state=None,
                 initialization="pca", perplexity=30, n_jobs=6):
        self.n_components = n_components
        self.random_state = random_state
        self.tsne = OpenTSNE(n_components=self.n_components,
                             random_state=self.random_state,
                             initialization=initialization,
                             perplexity=perplexity,
                             n_jobs=n_jobs)

    def fit_transform(self, X):
        embeddings = self.tsne.fit(X)
        self.embeddings = embeddings
        return embeddings

    def transform(self, x):
        return self.embeddings.transform(x)
def dimension_reduction(data):
    # Get the true labels.
    m, n = data.obs.shape
    if n > 0:
        labels = pd.unique(data.obs.iloc[:, n - 1])
    else:
        labels = pd.unique(data.obs.index)
    if len(labels) != 0:
        num_cluster = len(labels)
    else:
        num_cluster = m

    # Map colors.
    cmap = plt.get_cmap('Spectral')
    colors = [cmap(i) for i in np.linspace(0, 1, num_cluster)]
    color_list = []
    for i in range(m):
        if n > 0:
            color_list.append(
                colors[np.where(data.obs.iloc[i, n - 1] == labels)[0][0]])
        else:
            color_list.append(colors[i])

    print("Preprocessing: Executing Dimension Reduction...")

    # t-SNE
    from openTSNE import TSNE
    tsne_embedded = TSNE().fit(data.X)
    fig = plt.figure(figsize=(16, 7))
    warnings.filterwarnings("ignore", module="matplotlib")
    plt.scatter(tsne_embedded[:, 0], tsne_embedded[:, 1], c=color_list, s=1.5)
    plt.title('t-SNE visualization')

    # UMAP
    import umap
    umap_embedded = umap.UMAP(n_neighbors=5, min_dist=0.3,
                              metric='correlation').fit_transform(data.X)
    fig = plt.figure(figsize=(16, 7))
    plt.scatter(umap_embedded[:, 0], umap_embedded[:, 1], c=color_list, s=1.5)
    plt.title('UMAP visualization')

    return tsne_embedded, umap_embedded
def dimension_reduction(data):
    # Get the true labels.
    m, n = data.obs.shape
    if n > 0:
        labels = pd.unique(data.obs.iloc[:, n - 1])
    else:
        labels = pd.unique(data.obs.index)
    if len(labels) != 0:
        num_cluster = len(labels)
    else:
        num_cluster = m

    # t-SNE
    from openTSNE import TSNE
    tsne_embedded = TSNE().fit(data.X)

    # UMAP
    import umap
    umap_embedded = umap.UMAP(n_neighbors=5, min_dist=0.3,
                              metric='correlation').fit_transform(data.X)

    return tsne_embedded, umap_embedded
# seed = i + 41
seed = seed_lst[i]
# seed = 42

start1 = time.time()
reducer = umap.UMAP(metric='precomputed', n_neighbors=k, random_state=seed)
embedding_hub = reducer.fit_transform(X)
elapsed_time1 = time.time() - start1

start2 = time.time()
reducer = umap.UMAP(n_neighbors=k, random_state=seed)
embedding_org = reducer.fit_transform(X)
elapsed_time2 = time.time() - start2

start3 = time.time()
embedding_TSNE = TSNE().fit(X)
elapsed_time3 = time.time() - start3

emb_org_list.append(embedding_org)
emb_hub_list.append(embedding_hub)
time_org_list.append(elapsed_time2)
time_hub_list.append(elapsed_time1)
print('org: ', elapsed_time2)
print('hub: ', elapsed_time1)
print('TSNE:', elapsed_time3)

time_org = np.array(time_org_list)
time_hub = np.array(time_hub_list)
mean_time_org = np.mean(time_org)
def _tsne_projection(data, num_tsne_components=2, num_pca_components=50):
    # Run PCA first to speed up the t-SNE.
    pca = PCA(n_components=num_pca_components)
    pca_data = pca.fit_transform(data)
    tsne = TSNE(n_components=num_tsne_components)
    data_embedded = tsne.fit(pca_data)
    return data_embedded
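# Quick sketch of the PCA-then-t-SNE pipeline above on synthetic data
# (shapes are illustrative; in practice `data` is a real feature matrix):
import numpy as np

rng = np.random.default_rng(0)
demo = rng.normal(size=(500, 128))
emb = _tsne_projection(demo, num_tsne_components=2, num_pca_components=50)
print(emb.shape)  # (500, 2)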
def get_results(data_path, model_path, different_layer, look_embedding, tsne,
                distance, random_mode, multihead):
    results = []
    tokenizer = tokenizer_class.from_pretrained(model_path)
    if different_layer:
        if multihead:
            configer = BertConfig.from_pretrained(model_path)
            configer.__setattr__('get_multihead', True)
            configer.__setattr__('output_hidden_states', True)
            model = model_class.from_pretrained(model_path, config=configer)
        else:
            model = model_class.from_pretrained(model_path,
                                                output_hidden_states=True)
        # Build the output file name from the random-baseline flag, the
        # fine-tuning source, and the distance metric.
        random_part = 'diff_random_res_' if random_mode else 'diff_res_'
        if 'squad_bert_base' in model_path:
            model_part = 'squad_finetune_'
        elif 'nq_bert_base' in model_path:
            model_part = 'nq_finetune_'
        elif 'bert-base-uncased' in model_path:
            model_part = ''
        suffix = 'cos_2.txt' if distance == 'cos' else 'euc.txt'
        output_path = data_path.split('.')[0] + random_part + model_part + suffix
    else:
        model = model_class.from_pretrained(model_path)
        output_path = data_path.split('.')[0] + '_res_squad_finetune_euc.txt'

    with open(data_path, 'r', encoding='utf-8') as f:
        data = json.load(f)['data']
    print(len(data))
    count = 0
    for qp_pair in tqdm(data):
        question, query_idx, paragraph, para_idx = get_query_para(qp_pair)
        count += 1
        if random_mode:
            if 'WSC' in data_path or 'coreference' in data_path:
                random_index_1, random_index_2 = get_random_para_WSC(
                    paragraph, tokenizer)
                result = bert(question, random_index_1, paragraph,
                              random_index_2, model, tokenizer,
                              different_layer, look_embedding, tsne, distance,
                              multihead)
            else:
                # 'sentence_ranking' data and all other random-mode data are
                # handled identically.
                random_para, random_index = get_random_para(
                    data, question, tokenizer)
                result = bert(question, query_idx, paragraph, para_idx, model,
                              tokenizer, different_layer, look_embedding,
                              tsne, distance, multihead, random_para,
                              random_index)
        else:
            result = bert(question, query_idx, paragraph, para_idx, model,
                          tokenizer, different_layer, look_embedding, tsne,
                          distance, multihead)
        if not result:
            print(count)
            continue
        results.append(result[0])

    if tsne:
        from openTSNE import TSNE
        import pickle
        # `tsne_arrays` is assumed to have been collected elsewhere.
        pickle.dump(
            np.array(tsne_arrays),
            open('/data/home/t-jicai/caijie/analyse_bert/embedding_vector_tsne.pickle',
                 'wb'))
        res = TSNE().fit(np.array(tsne_arrays))
        with open('/data/home/t-jicai/caijie/analyse_bert/embedding_vector_tsne.txt',
                  'w', encoding='utf-8') as fout:
            for r in res:
                fout.write(str(r))
                fout.write('\n')

    with open(output_path, 'w', encoding='utf-8') as fout:
        for res in results:
            fout.write(str(res))
            fout.write('\n')
from sklearn.datasets import load_digits
from openTSNE import TSNE
from matplotlib import pyplot as plt

digits = load_digits()
X, y = digits["data"], digits["target"]

embedding = TSNE().fit(X)
print(embedding[:5])  # peek at the first five 2D coordinates

target_ids = range(len(digits.target_names))
plt.figure(figsize=(6, 5))
colors = 'r', 'g', 'b', 'c', 'm', 'y', 'k', 'w', 'orange', 'purple'
for i, c, label in zip(target_ids, colors, digits.target_names):
    plt.scatter(embedding[y == i, 0], embedding[y == i, 1], c=c, label=label)
plt.legend()
plt.show()
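# The TSNEEmbedding returned by fit() can also project new samples into the
# existing space; a minimal sketch reusing the digits data from above:
new_points = embedding.transform(X[:10])
print(new_points.shape)  # (10, 2)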
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from openTSNE import TSNE
from openTSNE.callbacks import ErrorLogger
from openTSNE import utils

df = pd.read_csv("train.csv")
df = df[:100]
label = df.label
df.drop("label", axis=1, inplace=True)

standardized_data = StandardScaler().fit_transform(df)
print(standardized_data.shape)

tsne = TSNE(
    perplexity=30,
    metric="euclidean",
    callbacks=ErrorLogger(),
    n_jobs=8,
    random_state=42,
)
embedding_train = tsne.fit(standardized_data)
utils.plot(embedding_train, label, colors=utils.MACOSKO_COLORS)
def make_data_faster(dataset_shortname):
    k_folder = '/home/single_cell_analysis/kallisto_out_single_bustools_dev/kallisto_' + dataset_shortname
    if dataset_shortname in ["pbmc_1k_v3", "pbmc_10k_v3", "neuron_10k_v3"]:
        dataset_shortname = (dataset_shortname.split("_")[0] +
                             dataset_shortname.split("_")[1] + "_" +
                             dataset_shortname.split("_")[2])
    c_folder = '/home/single_cell_analysis/cellranger_out/cellranger3_' + dataset_shortname + '_out/outs/filtered_feature_bc_matrix'
    c_raw_folder = '/home/single_cell_analysis/cellranger_out/cellranger3_' + dataset_shortname + '_out/outs/raw_feature_bc_matrix'

    # Load the Cell Ranger raw matrix.
    c_raw = anndata.AnnData(
        scipy.io.mmread(os.path.join(c_raw_folder, 'matrix.mtx.gz')).tocsr().T)
    c_barcodes = pd.read_csv(os.path.join(c_raw_folder, 'barcodes.tsv.gz'),
                             index_col=0, header=None, names=['barcode'])
    c_barcodes.index = c_barcodes.index.str.slice(0, 16, 1)
    c_raw.obs = c_barcodes
    c_raw.var = pd.read_csv(os.path.join(c_raw_folder, 'features.tsv.gz'),
                            header=None, index_col=0,
                            names=['ensembl_id', 'gene_name', 'kind'],
                            sep='\t')
    print('Loaded c raw mtx:', c_raw.X.shape)
    del c_barcodes

    # Load the Cell Ranger filtered matrix.
    c = anndata.AnnData(
        scipy.io.mmread(os.path.join(c_folder, 'matrix.mtx.gz')).tocsr().T)
    c_barcodes = pd.read_csv(os.path.join(c_folder, 'barcodes.tsv.gz'),
                             index_col=0, header=None, names=['barcode'])
    c_barcodes.index = c_barcodes.index.str.slice(0, 16, 1)
    c.obs = c_barcodes
    c.var = pd.read_csv(os.path.join(c_folder, 'features.tsv.gz'),
                        header=None, index_col=0,
                        names=['ensembl_id', 'gene_name', 'kind'], sep='\t')
    print('Loaded c filtered mtx:', c.X.shape)
    del c_barcodes

    # Load the kallisto raw matrix.
    k_raw = anndata.AnnData(
        scipy.io.mmread(os.path.join(k_folder, 'genes.mtx')).tocsr())
    k_raw.obs = pd.read_csv(os.path.join(k_folder, 'genes.barcodes.txt'),
                            index_col=0, header=None, names=['barcode'])
    k_raw.var = pd.read_csv(os.path.join(k_folder, 'genes.genes.txt'),
                            header=None, index_col=0, names=['ensembl_id'],
                            sep='\t')
    print('Loaded k raw mtx:', k_raw.X.shape)

    # Truncate the Ensembl version number off the kallisto labels.
    k_raw.var['full_emsembl_id'] = k_raw.var.index
    k_raw.var.index = k_raw.var['full_emsembl_id'].str.slice(0, 18)
    if dataset_shortname in ['hgmm1k_v2', 'hgmm1k_v3', 'hgmm10k_v3']:
        k_raw.var.index = k_raw.var['full_emsembl_id']

    # Do this as late as possible.
    k = k_raw[c.obs.index.values]
    print('Loaded k filtered mtx:', k.X.shape)

    c_raw.obs['counts'] = c_raw.X.sum(1)
    c_raw.obs['ngenes'] = np.array((c_raw.X > 0).sum(1))
    c_raw = c_raw[c_raw.obs['counts'] > 0]
    c_raw.layers['log1p'] = np.log1p(c_raw.X)
    c_raw.obs['log10counts'] = np.log10(c_raw.obs['counts'])
    print('Cell Ranger raw:', c_raw.shape)

    # Count UMIs and genes, and log-transform the raw kallisto barcodes;
    # first remove kallisto barcodes with 0 gene counts.
    k_raw.obs['counts'] = k_raw.X.sum(1)
    k_raw.obs['ngenes'] = np.array((k_raw.X > 0).sum(1))
    k_raw = k_raw[k_raw.obs['counts'] > 0]
    k_raw.layers['log1p'] = np.log1p(k_raw.X)
    k_raw.obs['log10counts'] = np.log10(k_raw.obs['counts'])
    print('kallisto raw:', k_raw.shape)

    c.obs['counts'] = c.X.sum(1)
    c.obs['ngenes'] = np.array((c.X > 0).sum(1))
    c = c[c.obs['counts'] > 0]
    c.layers['log1p'] = np.log1p(c.X)
    c.obs['log10counts'] = np.log10(c.obs['counts'])
    print('Cell Ranger filtered:', c.shape)

    # Count UMIs and genes, and log-transform the filtered kallisto barcodes;
    # first remove kallisto barcodes with 0 gene counts.
    k.obs['counts'] = k.X.sum(1)
    k.obs['ngenes'] = np.array((k.X > 0).sum(1))
    k = k[k.obs['counts'] > 0]
    k.layers['log1p'] = np.log1p(k.X)
    k.obs['log10counts'] = np.log10(k.obs['counts'])
    print('kallisto filtered:', k.shape)

    joint_obs = k_raw.obs.join(c_raw.obs, how='outer', lsuffix='-kallisto',
                               rsuffix='-tenx')
    joint_obs = joint_obs.fillna(0)
    print('Total barcodes seen')
    print(len(joint_obs))

    # Barcodes seen by both.
    common_obs = k_raw.obs.join(c_raw.obs, how='inner', lsuffix='-kallisto',
                                rsuffix='-tenx')
    print('Barcodes seen by both')
    print(len(common_obs))

    kobs = k_raw.obs.join(c_raw.obs, how='left', lsuffix='-kallisto',
                          rsuffix='-tenx')
    kobs = kobs.sort_values(by=['counts-kallisto'], ascending=False)
    print('Barcodes seen by kallisto missed by Cell Ranger')
    print(len(joint_obs) - len(kobs))

    # Just the Cell Ranger observations.
    tobs = c_raw.obs.copy()
    tobs = tobs.sort_values('counts', ascending=False)
    print('Barcodes seen by Cell Ranger missed by kallisto')
    print(len(joint_obs) - len(tobs))

    # Compute correlations between kallisto and Cell Ranger.
    # Handy and fast functions for computing correlation on sparse matrices.
    def sparse_M_std(X):
        n = X.shape[1]
        return np.sqrt(n * X.multiply(X).sum(1) -
                       np.multiply(X.sum(1), X.sum(1)))

    def sparse_M_corr(X, Y):
        X_std = sparse_M_std(X)
        Y_std = sparse_M_std(Y)
        XY_std = np.multiply(X_std, Y_std)
        n = X.shape[1]
        XY_cov = n * X.multiply(Y).sum(1) - np.multiply(X.sum(1), Y.sum(1))
        R = np.divide(XY_cov, XY_std)
        return np.squeeze(np.asarray(R))

    raw_counts_correlation = sparse_M_corr(
        k_raw[common_obs.index].layers['log1p'],
        c_raw[common_obs.index].layers['log1p'])
    filtered_counts_correlation = sparse_M_corr(
        k_raw[c.obs.index].layers['log1p'],
        c_raw[c.obs.index].layers['log1p'])
    print('Correlations computed!')

    tsvd = TruncatedSVD(n_components=10)
    TSVD = tsvd.fit_transform(k.layers['log1p'])
    k.obsm['TSVD'] = TSVD
    print('TSVD variance ratios:\n', list(tsvd.explained_variance_ratio_))
    print(datetime.datetime.now())

    tsvd = TruncatedSVD(n_components=10)
    TSVD = tsvd.fit_transform(c.layers['log1p'])
    c.obsm['TSVD'] = TSVD
    print('TSVD variance ratios:\n', list(tsvd.explained_variance_ratio_))
    print(datetime.datetime.now())

    print('Calculating L1 distances...')
    # Taking the Manhattan distance between matrices.
    dnck = manhattan_distances(c.layers['log1p'], k.layers['log1p'])
    dnkk = manhattan_distances(k.layers['log1p'], k.layers['log1p'])
    print(datetime.datetime.now())

    # nck are the kallisto-Cell Ranger distances.
    nck = np.diagonal(dnck)
    # nkk are the kallisto-kallisto distances (nearest non-self neighbor).
    nkk = []
    for row in dnkk:
        val = np.partition(row, 1)[1]
        nkk.append(val)
    print('L1 distances done!')
    print(datetime.datetime.now())

    # Perform t-SNE on the top 10 truncated-SVD components of the kallisto
    # filtered matrix.
    print('Doing t-SNE')
    print(datetime.datetime.now())
    tsne = TSNE(perplexity=30, metric="euclidean",
                callbacks=openTSNE.callbacks.ErrorLogger(), n_jobs=8,
                random_state=42, n_iter=750)
    k.obsm['TSNE10'] = tsne.fit(k.obsm['TSVD'])
    print('kallisto TSNE-10 done.')
    print(datetime.datetime.now())

    # Perform t-SNE on the top 10 truncated-SVD components of the Cell Ranger
    # filtered matrix.
    print('Doing t-SNE on top 10 PC for Cell Ranger')
    print(datetime.datetime.now())
    tsne = TSNE(perplexity=30, metric="euclidean",
                callbacks=openTSNE.callbacks.ErrorLogger(), n_jobs=8,
                random_state=42, n_iter=750)
    c.obsm['TSNE10'] = tsne.fit(c.obsm['TSVD'])
    print('Cell Ranger TSNE-10 done.')
    print(datetime.datetime.now())

    c_raw.write(
        os.path.join("./write_data/" + dataset_shortname + '_tenx_raw.h5ad'))
    k_raw.write(
        os.path.join("./write_data/" + dataset_shortname + '_kallisto_raw.h5ad'))
    k.write(
        os.path.join("./write_data/" + dataset_shortname + '_kallisto.h5ad'))
    c.write(os.path.join("./write_data/" + dataset_shortname + '_tenx.h5ad'))
    with open(os.path.join("./write_data/" + dataset_shortname + '_kobs.pkl'),
              'wb') as handle:
        pickle.dump(kobs, handle, protocol=pickle.HIGHEST_PROTOCOL)

    with open(os.path.join("./write_data/" + dataset_shortname + '_tobs.pkl'),
              'wb') as handle:
        pickle.dump(tobs, handle, protocol=pickle.HIGHEST_PROTOCOL)

    with open(os.path.join("./write_data/" + dataset_shortname + '_common_obs.pkl'),
              'wb') as handle:
        pickle.dump(common_obs, handle, protocol=pickle.HIGHEST_PROTOCOL)

    with open(os.path.join("./write_data/" + dataset_shortname + '_joint_obs.pkl'),
              'wb') as handle:
        pickle.dump(joint_obs, handle, protocol=pickle.HIGHEST_PROTOCOL)

    with open(os.path.join("./write_data/" + dataset_shortname + '_nkk.pkl'),
              'wb') as handle:
        pickle.dump(nkk, handle, protocol=pickle.HIGHEST_PROTOCOL)

    with open(os.path.join("./write_data/" + dataset_shortname + '_nck.pkl'),
              'wb') as handle:
        pickle.dump(nck, handle, protocol=pickle.HIGHEST_PROTOCOL)

    with open(os.path.join("./write_data/" + dataset_shortname +
                           '_raw_counts_correlation.pkl'), 'wb') as handle:
        pickle.dump(raw_counts_correlation, handle,
                    protocol=pickle.HIGHEST_PROTOCOL)

    with open(os.path.join("./write_data/" + dataset_shortname +
                           '_filtered_counts_correlation.pkl'), 'wb') as handle:
        pickle.dump(filtered_counts_correlation, handle,
                    protocol=pickle.HIGHEST_PROTOCOL)
def __init__(self, params, random_seed):
    self.tsneer = TSNE(n_components=params['embed_dim'],
                       random_state=random_seed)
from bokeh.plotting import figure, show, curdoc
from bokeh.io import output_notebook
from bokeh.layouts import row, gridplot
from bokeh.models import BoxSelectTool, LassoSelectTool

TOOLS = "pan,wheel_zoom,box_select,lasso_select,reset"
output_notebook()

train_data = pd.read_csv(data_dict + 'train.csv')
# Read the learned embedding.
embed_data = pd.read_csv('xxx.csv')
# Dimension reduction.
embedding = TSNE(perplexity=24, random_state=10).fit(embed_data.iloc[:, 1:])

data_group = {
    'vote==0': (train_data.iloc[:, 1] >= -1) & (train_data.iloc[:, 1] <= 0),
    '1<=vote<=2': (train_data.iloc[:, 1] >= 1) & (train_data.iloc[:, 1] <= 2),
    '3<=vote<=4': (train_data.iloc[:, 1] >= 3) & (train_data.iloc[:, 1] <= 4),
    'vote==5': (train_data.iloc[:, 1] >= 5) & (train_data.iloc[:, 1] <= 99),
}

p_1 = figure(tools=TOOLS, plot_width=400, plot_height=400, min_border=10,
             min_border_left=50, toolbar_location="above",
             title="1 <= votes <= 4")
# negative_gradient_method='bh')  # tail of a call whose start is truncated in the source
embedding_umap = reducer.fit_transform(data_no_label)
t1 = time()
print('UMAP running time is: ' + str(t1 - t0) + ' s')
fig, ax = plt.subplots()
scatter = ax.scatter(
    embedding_umap[:, 0],
    embedding_umap[:, 1],
    c=[sns.color_palette(n_colors=20)[x] for x in label_group])
plt.axis('off')

# tSNE
t0 = time()
embedding_tsne = TSNE(n_components=2, random_state=42, n_jobs=-1,
                      negative_gradient_method='bh').fit(data_no_label)
t1 = time()
print('t-SNE running time is: ' + str(t1 - t0) + ' s')
fig, ax = plt.subplots()
scatter = ax.scatter(
    embedding_tsne[:, 0],
    embedding_tsne[:, 1],
    c=[sns.color_palette(n_colors=20)[x] for x in label_group])
plt.axis('off')

# MDS
t0 = time()
embedding_mds = MDS(n_components=2, n_jobs=-1).fit_transform(data_no_label)
t1 = time()
# (Leading values of this array are truncated in the source.)
learning_rate = np.array([
    15.24742297, 23.48066375, 37.34107189, 58.27652395, 87.24048423,
    137.33961493, 211.00561713, 374.36120544, 576.90813121, 983.37544116
])
perplexity = np.arange(5, 55, 5)

for i in range(len(learning_rate)):
    for j in range(len(perplexity)):
        # Read the data.
        x, label = get_data(args.data)
        # Run t-SNE.
        y = TSNE(n_components=args.dim,
                 perplexity=perplexity[j],
                 learning_rate=learning_rate[i],
                 n_jobs=-1,
                 verbose=True).fit(x)
        # Save as csv.
        path = os.path.join(os.getcwd(), "visualization", "public", "results",
                            args.data)
        save_csv(path,
                 alg_name=f"tsne_{perplexity[j]}_{learning_rate[i]}",
                 data=y,
                 label=label)
# else:
#     y = TSNE(n_components=args.dim, perplexity=perplexity[j],
#              learning_rate=learning_rate[i], n_jobs=-1,
# (the `else` branch above is truncated mid-call in the source)
# (Tail of a plotting helper whose definition is truncated in the source.)
legend_kwargs_.update(legend_kwargs)
ax.legend(handles=legend_handles, **legend_kwargs_)
matplotlib.pyplot.show()


if __name__ == '__main__':
    data_dir = "D:\\2020BUAA\\dataset\\JNU"
    pic_data = os.path.join(data_dir, "JNU_data_0-1.pk")
    with open(pic_data, 'rb') as file_1:
        txt_all_data = pickle.load(file_1)
    source_train_X, source_train_y = txt_all_data[0]
    source_val_X, source_val_y = txt_all_data[1]
    target_train_X, target_train_y = txt_all_data[2]
    target_val_X, target_val_y = txt_all_data[3]
    x, y = source_val_X, source_val_y

    tsne = TSNE(
        perplexity=30,
        n_iter=100,
        metric="euclidean",
        callbacks=ErrorLogger(),
        n_jobs=8,
        random_state=42,
    )
    embedding = tsne.fit(x)
    viz_plot(embedding, y, colors=MOUSE_10X_COLORS, draw_centers=False)
redux = UMAP(n_components=10)
df.index = df['name']
df.drop(columns='name', inplace=True)

d = {}
for name, v in df.iterrows():
    v_ = [float(i) for i in v]
    vn = normalize(v_)
    d[name] = vn

# redux = UMAP(n_components=n_components)
# projection = redux.fit_transform(list(d.values()))
redux = TSNE(n_components=n_components)
projection = redux.fit(np.array(list(d.values())))

# dfmeta[dfmeta.id == '19142e05-7365-4b55-abcc-9ba0dec235d2'].country__autocolor.item()
with open('embedding.csv', 'w+') as out:
    header = [f'c{str(i+1)}' for i in range(n_components)]
    out.write('name,country,study,' + ','.join(header) + '\n')  # header
    for leaf, v in zip(d.keys(), projection):
        study = dfmeta[dfmeta.id == leaf].study__autocolor.item()
        country = dfmeta[dfmeta.id == leaf].country__autocolor.item()
        out.write(
            f'{leaf},{country},{study},{",".join([str(i) for i in v])}\n')

with open('embedding_raw.csv', 'w+') as out:
    header = [f'c{str(i+1)}' for i in range(n_components)]
    # (truncated in the source)