from os import path
from typing import List

import pandas as pd
import torch
from openTSNE import TSNEEmbedding, affinity, initialization


def tsne_emb(model_path: str, proc_dir: str, layer_name: str,
             perplexities: List[int], n_iter: int):
    # Load the checkpoint on CPU and pull out the named weight matrix.
    model = torch.load(model_path, map_location=torch.device('cpu'))['model']
    w = model[layer_name].numpy()

    # Multiscale affinities blend several perplexities into one kernel.
    affinities_multiscale_mixture = affinity.Multiscale(
        w, perplexities=perplexities, metric="cosine",
        n_jobs=-1, random_state=3)
    init = initialization.pca(w, random_state=42)
    embedding = TSNEEmbedding(
        init, affinities_multiscale_mixture,
        negative_gradient_method="fft",
        n_jobs=-1, random_state=4, verbose=True)
    embedding = embedding.optimize(
        n_iter=n_iter, exaggeration=None, momentum=0.8, learning_rate="auto")

    # Persist the 2-D coordinates and the final KL divergence.
    df = pd.DataFrame(embedding, columns=['x', 'y'])
    df.to_csv(path.join(proc_dir, 'tsne_emb.csv'), index=False)
    with open(path.join(proc_dir, 'kl_divergence'), 'w') as f:
        f.write(f'{embedding.kl_divergence:.4f}')
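# Usage sketch (assumption): tsne_emb expects a checkpoint that stores a state
# dict under the 'model' key with a 2-D weight tensor at `layer_name`. The
# path, directory, and key below are hypothetical, not taken from the source.
tsne_emb(
    model_path="checkpoint.pt",       # hypothetical checkpoint path
    proc_dir="processed",             # hypothetical output directory
    layer_name="embedding.weight",    # hypothetical state-dict key
    perplexities=[30, 100],
    n_iter=500,
)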
def test_1(self):
    init = initialization.pca(self.x)
    aff = affinity.PerplexityBasedNN(self.x, perplexity=30)
    embedding = openTSNE.TSNEEmbedding(init, aff)
    embedding.optimize(25, exaggeration=12, momentum=0.5, inplace=True)
    embedding.optimize(50, exaggeration=1, momentum=0.8, inplace=True)
    self.eval_embedding(embedding, self.y)

    new_embedding = embedding.transform(self.x)
    self.eval_embedding(new_embedding, self.y, "transform")
def test_2(self):
    init = initialization.pca(self.x)
    aff = affinity.MultiscaleMixture(self.x, perplexities=[5, 30])
    embedding = openTSNE.TSNEEmbedding(init, aff)
    embedding.optimize(25, exaggeration=12, momentum=0.5, inplace=True)
    embedding.optimize(50, exaggeration=1, momentum=0.8, inplace=True)
    self.eval_embedding(embedding, self.y)

    new_embedding = embedding.transform(self.x)
    self.eval_embedding(new_embedding, self.y, "transform")
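# Both tests rely on a fixture (self.x, self.y) and a helper eval_embedding
# defined elsewhere in the suite. A minimal sketch of such a fixture follows;
# every name and check in it is an assumption, not taken from the original.
import unittest

import numpy as np
import openTSNE
from openTSNE import affinity, initialization


class TestTSNEEmbeddingFlow(unittest.TestCase):

    @classmethod
    def setUpClass(cls):
        # Toy Gaussian blobs stand in for real test data.
        rng = np.random.RandomState(42)
        cls.x = np.vstack([rng.normal(loc=c, size=(50, 10)) for c in (0, 5, 10)])
        cls.y = np.repeat([0, 1, 2], 50)

    def eval_embedding(self, embedding, y, stage="fit"):
        # Minimal sanity check: finite 2-D coordinates, one row per sample.
        arr = np.asarray(embedding)
        self.assertEqual(arr.shape, (len(y), 2))
        self.assertTrue(np.all(np.isfinite(arr)))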
import logging
from typing import Callable, Union

import numpy as np
from openTSNE import TSNEEmbedding, affinity, initialization


def art_of_tsne(X: np.ndarray,
                metric: Union[str, Callable] = "euclidean",
                exaggeration: float = -1,
                perplexity: int = 30,
                n_jobs: int = -1) -> TSNEEmbedding:
    """
    Implementation of Dmitry Kobak and Philipp Berens' "The art of using t-SNE
    for single-cell transcriptomics", based on openTSNE.
    See https://doi.org/10.1038/s41467-019-13056-x

    Args:
        X             The data matrix of shape (n_cells, n_genes), i.e. (n_samples, n_features)
        metric        Any metric allowed by PyNNDescent (default: 'euclidean')
        exaggeration  The exaggeration to use for the embedding (-1 selects a size-dependent default)
        perplexity    The perplexity to use for the embedding
        n_jobs        Number of parallel jobs (-1 uses all cores)

    Returns:
        The embedding as an opentsne.TSNEEmbedding object (which can be cast to an np.ndarray)
    """
    n = X.shape[0]
    if n > 100_000:
        if exaggeration == -1:
            exaggeration = 1 + n / 333_333
        # Subsample, optimize, then add the remaining cells and optimize again
        logging.info(f"Creating subset of {n // 40} elements")
        # Subsample and run a regular art_of_tsne on the subset
        indices = np.random.permutation(n)
        reverse = np.argsort(indices)
        X_sample, X_rest = X[indices[:n // 40]], X[indices[n // 40:]]
        logging.info("Embedding subset")
        Z_sample = art_of_tsne(X_sample)
        logging.info(f"Preparing partial initial embedding of the {n - n // 40} remaining elements")
        if isinstance(Z_sample.affinities, affinity.Multiscale):
            rest_init = Z_sample.prepare_partial(X_rest, k=1, perplexities=[1 / 3, 1 / 3])
        else:
            rest_init = Z_sample.prepare_partial(X_rest, k=1, perplexity=1 / 3)
        logging.info("Combining the initial embeddings, and standardizing")
        init_full = np.vstack((Z_sample, rest_init))[reverse]
        init_full = init_full / (np.std(init_full[:, 0]) * 10000)
        logging.info("Creating affinities")
        affinities = affinity.PerplexityBasedNN(X, perplexity=perplexity, metric=metric,
                                                method="approx", n_jobs=n_jobs)
        logging.info("Creating TSNE embedding")
        Z = TSNEEmbedding(init_full, affinities,
                          negative_gradient_method="fft", n_jobs=n_jobs)
        logging.info("Optimizing, stage 1")
        Z.optimize(n_iter=250, inplace=True, exaggeration=12, momentum=0.5,
                   learning_rate=n / 12, n_jobs=n_jobs)
        logging.info("Optimizing, stage 2")
        Z.optimize(n_iter=750, inplace=True, exaggeration=exaggeration, momentum=0.8,
                   learning_rate=n / 12, n_jobs=n_jobs)
    elif n > 3_000:
        if exaggeration == -1:
            exaggeration = 1
        # Use multiscale perplexity
        affinities_multiscale_mixture = affinity.Multiscale(
            X, perplexities=[perplexity, n / 100],
            metric=metric, method="approx", n_jobs=n_jobs)
        init = initialization.pca(X)
        Z = TSNEEmbedding(init, affinities_multiscale_mixture,
                          negative_gradient_method="fft", n_jobs=n_jobs)
        Z.optimize(n_iter=250, inplace=True, exaggeration=12, momentum=0.5,
                   learning_rate=n / 12, n_jobs=n_jobs)
        Z.optimize(n_iter=750, inplace=True, exaggeration=exaggeration, momentum=0.8,
                   learning_rate=n / 12, n_jobs=n_jobs)
    else:
        if exaggeration == -1:
            exaggeration = 1
        # Just a plain TSNE with high learning rate
        lr = max(200, n / 12)
        aff = affinity.PerplexityBasedNN(X, perplexity=perplexity, metric=metric,
                                         method="approx", n_jobs=n_jobs)
        init = initialization.pca(X)
        Z = TSNEEmbedding(init, aff, learning_rate=lr, n_jobs=n_jobs,
                          negative_gradient_method="fft")
        Z.optimize(250, exaggeration=12, momentum=0.5, inplace=True, n_jobs=n_jobs)
        Z.optimize(750, exaggeration=exaggeration, momentum=0.8, inplace=True, n_jobs=n_jobs)
    return Z
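# Usage sketch on synthetic data; the size is illustrative and chosen to hit
# the middle (3_000 < n <= 100_000) branch, which uses multiscale affinities.
import numpy as np

X = np.random.RandomState(0).normal(size=(5_000, 50)).astype(np.float32)
Z = art_of_tsne(X, metric="cosine")
coords = np.asarray(Z)   # a TSNEEmbedding can be viewed as a plain ndarray
print(coords.shape)      # (5000, 2)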
norm = raw * 256.0 / sum(raw)
profile[0, ] = norm

## 2. Run t-SNE
seed = 211
threads = int(cores)
affinities_multiscale_mixture = affinity.Multiscale(
    profile,
    perplexities=[30, 300],
    metric="cosine",
    n_jobs=threads,
    random_state=seed,
)
init = initialization.pca(profile, random_state=seed)
embedding = TSNEEmbedding(
    init,
    affinities_multiscale_mixture,
    negative_gradient_method="fft",
    n_jobs=threads,
)
embedding1 = embedding.optimize(n_iter=250, exaggeration=6, momentum=0.5)
embedding2 = embedding1.optimize(n_iter=750, exaggeration=1, momentum=0.8)
embedding_multiscale = embedding2.view(np.ndarray)

## 3. Save matrix
df = pd.DataFrame(embedding_multiscale, columns=['Dim1', 'Dim2'], index=ctgs)
df.to_csv(outfile, sep='\t', float_format='%.3f')
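# Follow-up sketch (assumption): reload the saved matrix for a quick scatter
# plot. `outfile` is the path written above; the PNG name is hypothetical.
import matplotlib.pyplot as plt
import pandas as pd

coords = pd.read_csv(outfile, sep='\t', index_col=0)  # index holds ctgs names
plt.figure(figsize=(6, 6))
plt.scatter(coords['Dim1'], coords['Dim2'], s=2)
plt.xlabel('Dim1')
plt.ylabel('Dim2')
plt.savefig('tsne_multiscale.png', dpi=150)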