def activations_tsne_plot(activations, labels, ds):
    """Compute embeddings using t-SNE and plot them."""
    tsne = TSNE(
        perplexity=30,
        metric="euclidean",
        n_jobs=8,
        random_state=42,
        verbose=False,
    )
    fig, axes = plt.subplots(nrows=1, ncols=len(activations), figsize=(25, 5))
    embs = []
    for idx, acts in enumerate(activations):
        print("Learning embeddings for layer " + str(idx) + "...")
        embeddings = tsne.fit(acts)
        for i, actual_label in enumerate(ds.classes):
            indices = np.argwhere(labels == i)
            indices = np.squeeze(indices)
            axes[idx].scatter(embeddings[indices, 0], embeddings[indices, 1], label=actual_label, s=2)
        axes[idx].legend()
        axes[idx].set_title("Activations in layer " + str(idx))
        embs.append(embeddings)
    fig.tight_layout()
    return embs
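# A minimal usage sketch for activations_tsne_plot. The dataset class, activation
# shapes and label values below are hypothetical stand-ins, not part of the original code.
import numpy as np
import matplotlib.pyplot as plt


class ToyDataset:
    classes = ["class_a", "class_b"]


rng = np.random.default_rng(0)
labels = rng.integers(0, 2, size=200)
# two hypothetical layers with 64- and 32-dimensional activations per sample
activations = [rng.normal(size=(200, 64)), rng.normal(size=(200, 32))]
embs = activations_tsne_plot(activations, labels, ToyDataset())
plt.show()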
def plot_tsne(source_data, source_name, target_data, target_name, plot_directory):
    fig, ax = plt.subplots()
    perplexities = [100]
    for i, perplexity in enumerate(perplexities):
        tsne = TSNE(n_components=2, initialization='pca', random_state=0,
                    perplexity=perplexity, n_iter=1000, neighbors='approx')
        x_source_transformed = tsne.fit(source_data)
        x_target_transformed = tsne.fit(target_data)
        ax.set_title('Perplexity=%d' % perplexity)
        ax.scatter(x_source_transformed[:, 0], x_source_transformed[:, 1], c='r', label='source')
        ax.scatter(x_target_transformed[:, 0], x_target_transformed[:, 1], c='b', label='target')
        ax.xaxis.set_major_formatter(NullFormatter())
        ax.yaxis.set_major_formatter(NullFormatter())
        ax.axis('tight')
        ax.legend()
    plt.savefig(plot_directory + 'tsne_source' + source_name + '_target' + target_name + '.png', dpi=500)
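# Minimal usage sketch for plot_tsne with synthetic data (the arrays and output
# directory are hypothetical). Note that fitting t-SNE separately on source and
# target produces two unrelated coordinate systems; if both sets should share one
# map, fit on the source and call .transform() on the target instead.
import os
import numpy as np

os.makedirs("plots", exist_ok=True)
rng = np.random.default_rng(42)
source = rng.normal(size=(500, 32))
target = rng.normal(size=(500, 32)) + 1.0
plot_tsne(source, "src", target, "tgt", "plots/")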
class OpenTsne(Transformer):
    """
    This transformer transforms all vectors in an [EmbeddingSet][whatlies.embeddingset.EmbeddingSet]
    by means of t-SNE. This implementation uses
    [open-tsne](https://opentsne.readthedocs.io/en/latest/tsne_algorithm.html).

    Important:
        OpenTSNE is a faster variant of TSNE but it only allows for at most 2 components.
        You may also notice that it is relatively slow. This unfortunately is a fact of TSNE.

        This embedding transformation might require you to manually install extra dependencies
        unless you installed via either;

        ```
        pip install whatlies[opentsne]
        pip install whatlies[all]
        ```

    Arguments:
        n_components: the number of components to create/add
        kwargs: keyword arguments passed to the OpenTsne implementation, includes things like `perplexity` [link](https://opentsne.readthedocs.io/en/latest/api/index.html)

    Usage:

    ```python
    from whatlies.language import SpacyLanguage
    from whatlies.transformers import OpenTsne

    words = ["prince", "princess", "nurse", "doctor", "banker", "man", "woman",
             "cousin", "neice", "king", "queen", "dude", "guy", "gal", "fire",
             "dog", "cat", "mouse", "red", "blue", "green", "yellow", "water",
             "person", "family", "brother", "sister"]

    lang = SpacyLanguage("en_core_web_md")
    emb = lang[words]

    emb.transform(OpenTsne(2)).plot_interactive_matrix('tsne_0', 'tsne_1')
    ```
    """

    def __init__(self, n_components=2, **kwargs):
        super().__init__()
        self.n_components = n_components
        self.kwargs = kwargs
        self.tfm = TSNE(n_components=n_components, **kwargs)

    def fit(self, embset):
        names, X = embset.to_names_X()
        self.emb = self.tfm.fit(X)
        self.is_fitted = True
        return self

    def transform(self, embset):
        names, X = embset.to_names_X()
        new_vecs = np.array(self.emb.transform(X))
        names_out = names + [f"tsne_{i}" for i in range(self.n_components)]
        vectors_out = np.concatenate([new_vecs, np.eye(self.n_components)])
        new_dict = new_embedding_dict(names_out, vectors_out, embset)
        return EmbeddingSet(new_dict, name=f"{embset.name}.tsne_{self.n_components}()")
class TSNEWrapper:
    def __init__(self, params, random_seed):
        self.tsneer = TSNE(n_components=params['embed_dim'], random_state=random_seed)

    def fit(self, data):
        self.embedding = self.tsneer.fit(data)

    def transform(self, data):
        new_embedded_data = self.embedding.transform(data)
        return new_embedded_data
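# Usage sketch for TSNEWrapper (the params dict and data are hypothetical):
# fit() learns a reference embedding, transform() places new rows into it.
import numpy as np

params = {'embed_dim': 2}
wrapper = TSNEWrapper(params, random_seed=0)

rng = np.random.default_rng(0)
train, new = rng.normal(size=(300, 20)), rng.normal(size=(50, 20))
wrapper.fit(train)
projected = wrapper.transform(new)
print(projected.shape)  # (50, 2)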
def hc_tsne(
    X,
    initialization,
    tree,
    alpha=1e-3,
    weights=(0.5, 0.5, 0.0),
    margin=0.5,
    loss_logger=None,
    **tsne_kwargs,
):
    """Run openTSNE with custom `negative_gradient_method`, in which the
    hierarchical constraints are encoded in a regularization term.

    Args:
        X: ndarray (N, D)
        initialization: initialization embedding in 2D, (N, 2)
        tree: hierarchical constraints represented in tree form (using anytree lib)
        alpha: contribution of regularization term in the new objective function
        weights: weights of different elements in the regularization
        margin: margin in the triplet loss. The real margin m is calculated as
            `margin * dist(anchor, negative)`
        loss_logger: logger object (containing a dict) to store loss at each iter.
        **tsne_kwargs: openTSNE params

    Returns:
        Z: new embedding model, can be used as (N, 2) array, or tsne object for
            embedding new datapoints.
    """
    # from the tree-like constraints, create a regularization term by
    # using the defined hierarchical triplet loss.
    tree_regularizer = partial(
        hierarchical_triplet_loss, tree=tree, margin=margin, weights=weights
    )

    # run openTSNE with custom negative gradient function
    tsne = TSNE(
        initialization=initialization,
        callbacks=ErrorLogger(),  # use this to evaluate kl_loss at every 10 iterations
        negative_gradient_method=partial(
            my_kl_divergence_bh,
            list_regularizers=[(alpha, tree_regularizer)],
            logger=loss_logger,
        ),
        **tsne_kwargs,
    )
    Z = tsne.fit(X)

    # now clear the regularizers from tsne object so we will not use them for
    # embedding new samples (of test set)
    Z.gradient_descent_params["negative_gradient_method"] = "bh"
    return Z
def compute_tsne(A):
    adata = A.copy()
    # tsne = TSNE(perplexity=30, metric="euclidean", callbacks=openTSNE.callbacks.ErrorLogger(), n_jobs=8, random_state=42, n_iter=750)
    tsne = TSNE(perplexity=30, metric="euclidean", callbacks=None, n_jobs=10, random_state=42, n_iter=750)
    adata.varm['TSNE10'] = tsne.fit(adata.varm['TSVD'])
    return adata
def tsne(x, n=100000):
    from openTSNE import TSNE
    from openTSNE.callbacks import ErrorLogger

    x_in = x[:n, :]
    tsne = TSNE(
        perplexity=500,
        metric="euclidean",
        callbacks=ErrorLogger(),
        n_iter=2000,
        n_jobs=4,
    )
    x_embedded = tsne.fit(x_in)
    return x_embedded
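# Hypothetical call of the helper above: only the first `n` rows are embedded, and
# perplexity=500 implies the input should contain well over 1500 rows.
import numpy as np

x = np.random.default_rng(0).normal(size=(5000, 50))
emb = tsne(x, n=5000)
print(emb.shape)  # (5000, 2)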
def run_transformation(self, X, y, transformation_params, callback):
    class CallbackAdapter:
        def __init__(self, callback, early_exaggeration_iter):
            self.callback = callback
            self.exaggeration_phase = early_exaggeration_iter > 0
            self.early_exaggeration_iter = early_exaggeration_iter

        def __call__(self, iteration, error, embedding):
            if not self.exaggeration_phase:
                iteration += self.early_exaggeration_iter
            if self.exaggeration_phase and iteration == self.early_exaggeration_iter:
                self.exaggeration_phase = False
            self.callback(
                'embedding', iteration,
                dict(embedding=embedding.view(np.ndarray),
                     error_metrics=dict(kl_divergence=error)))

    callback_adapter = CallbackAdapter(
        callback, transformation_params['early_exaggeration_iter'])

    tsne = TSNE(
        **transformation_params,
        min_grad_norm=0,  # never stop
        n_iter=10000000,  # TODO
        callbacks=callback_adapter,
        callbacks_every_iters=1)

    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', category=NumbaWarning)
        callback(
            'start', 0,
            dict(error_metrics=[
                dict(name='kl_divergence', label='KL divergence:')
            ]))
        callback('status', 0, dict(message='Initializing TSNE'))
        tsne.fit(X)
def reduce_dimension(embeddings, reduction='pca'):
    if reduction == 'pca':
        pca = PCA(n_components=2)
        embeddings = pca.fit_transform(embeddings)
    elif reduction == 'tsne':
        otsne = OTSNE(initialization='pca', n_jobs=8,
                      callbacks=ErrorLogger(), negative_gradient_method='bh')
        embeddings = otsne.fit(embeddings)
        # stsne = STSNE()
        # embeddings = stsne.fit_transform(embeddings)
    elif reduction == 'none':
        pass
    else:
        raise Exception
    return embeddings
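# Minimal usage sketch for reduce_dimension with random data (hypothetical shapes):
import numpy as np

embs = np.random.default_rng(0).normal(size=(500, 64))
embs_2d = reduce_dimension(embs, reduction='tsne')
print(embs_2d.shape)  # (500, 2)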
class TSNEm:
    def __init__(self, n_components=None, random_state=None, initialization="pca",
                 perplexity=30, n_jobs=6):
        self.n_components = n_components
        self.random_state = random_state
        self.tsne = OpenTSNE(n_components=self.n_components,
                             random_state=self.random_state,
                             initialization=initialization,
                             perplexity=perplexity,
                             n_jobs=n_jobs)

    def fit_transform(self, X):
        embeddings = self.tsne.fit(X)
        self.embeddings = embeddings
        return embeddings

    def transform(self, x):
        return self.embeddings.transform(x)
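# Usage sketch for TSNEm (data shapes are hypothetical): fit_transform() learns the
# map, transform() reuses the stored embedding for new samples.
import numpy as np

rng = np.random.default_rng(42)
X_train, X_new = rng.normal(size=(400, 30)), rng.normal(size=(20, 30))

tsnem = TSNEm(n_components=2, random_state=42)
train_2d = tsnem.fit_transform(X_train)  # (400, 2)
new_2d = tsnem.transform(X_new)          # (20, 2), same coordinate system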
def calculate_dim_red(self, n_components=2):
    self.embedding_train = None
    sc.pp.highly_variable_genes(self.data, n_top_genes=500)
    sc.pp.pca(self.data, n_comps=self.n_comps, zero_center=True)
    X_pca = self.data.obsm['X_pca']
    tSNE_init = X_pca[:, :2]
    print('feature selection and PCA compression finished')

    if self.UMAP:
        import umap
        reducer = umap.UMAP(n_components=n_components)
        X_embedded = reducer.fit_transform(X_pca)
        self.results['UMAP1'] = X_embedded[:, 0].tolist()
        if n_components == 2:
            self.results['UMAP2'] = X_embedded[:, 1].tolist()
        print('UMAP finished')

    if self.tSNE:
        from openTSNE import TSNE
        from openTSNE.callbacks import ErrorLogger
        tsne = TSNE(perplexity=30, callbacks=ErrorLogger(), initialization='pca',
                    random_state=42, early_exaggeration_iter=50, n_components=2)
        embedding_train = tsne.fit(X_pca)
        self.embedding_train = embedding_train
        self.results['tSNE1'] = embedding_train.T[0].tolist()
        self.results['tSNE2'] = embedding_train.T[1].tolist()
        print('tSNE finished')

    return self.data, self.results
def _tsne_projection(data, num_tsne_components=2, num_pca_components=50):
    pca = PCA(n_components=num_pca_components)  # PCA first to speed up the t-SNE
    pca_data = pca.fit_transform(data)
    tsne = TSNE(n_components=num_tsne_components)
    data_embedded = tsne.fit(pca_data)
    return data_embedded
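# Usage sketch for _tsne_projection (hypothetical data): PCA compresses to 50
# dimensions before t-SNE maps the result to 2.
import numpy as np

data = np.random.default_rng(1).normal(size=(300, 100))
emb = _tsne_projection(data)
print(emb.shape)  # (300, 2)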
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from openTSNE import TSNE
from openTSNE.callbacks import ErrorLogger
from openTSNE import utils

df = pd.read_csv("train.csv")
df = df[:100]
label = df.label
df.drop("label", axis=1, inplace=True)

standardized_data = StandardScaler().fit_transform(df)
print(standardized_data.shape)

tsne = TSNE(
    perplexity=30,
    metric="euclidean",
    callbacks=ErrorLogger(),
    n_jobs=8,
    random_state=42,
)
embedding_train = tsne.fit(standardized_data)
utils.plot(embedding_train, label, colors=utils.MACOSKO_COLORS)
def tsne(X, initialization="pca", **tsne_kwargs):
    """Original openTSNE"""
    tsne = TSNE(
        initialization=initialization,
        negative_gradient_method="bh",
        **tsne_kwargs,
    )
    return tsne.fit(X)
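# Hypothetical call of the wrapper above; extra openTSNE keyword arguments pass
# straight through to TSNE.
import numpy as np

X = np.random.default_rng(0).normal(size=(1000, 20))
Z = tsne(X, perplexity=50, n_jobs=4, random_state=0)
print(Z.shape)  # (1000, 2)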
    legend_kwargs_.update(legend_kwargs)
    ax.legend(handles=legend_handles, **legend_kwargs_)
    matplotlib.pyplot.show()


if __name__ == '__main__':
    data_dir = r"D:\2020BUAA\dataset\JNU"
    pic_data = os.path.join(data_dir, "JNU_data_0-1.pk")
    with open(pic_data, 'rb') as file_1:
        txt_all_data = pickle.load(file_1)
    source_train_X, source_train_y = txt_all_data[0]
    source_val_X, source_val_y = txt_all_data[1]
    target_train_X, target_train_y = txt_all_data[2]
    target_val_X, target_val_y = txt_all_data[3]

    x, y = source_val_X, source_val_y
    tsne = TSNE(
        perplexity=30,
        n_iter=100,
        metric="euclidean",
        callbacks=ErrorLogger(),
        n_jobs=8,
        random_state=42,
    )
    embedding = tsne.fit(x)
    viz_plot(embedding, y, colors=MOUSE_10X_COLORS, draw_centers=False)
def make_data_faster(dataset_shortname):
    k_folder = '/home/single_cell_analysis/kallisto_out_single_bustools_dev/kallisto_' + dataset_shortname
    if dataset_shortname in ["pbmc_1k_v3", "pbmc_10k_v3", "neuron_10k_v3"]:
        dataset_shortname = dataset_shortname.split("_")[0] + dataset_shortname.split("_")[1] + "_" + dataset_shortname.split("_")[2]
    c_folder = '/home/single_cell_analysis/cellranger_out/cellranger3_' + dataset_shortname + '_out/outs/filtered_feature_bc_matrix'
    c_raw_folder = '/home/single_cell_analysis/cellranger_out/cellranger3_' + dataset_shortname + '_out/outs/raw_feature_bc_matrix'

    # load Cell Ranger raw matrix
    c_raw = anndata.AnnData(scipy.io.mmread(os.path.join(c_raw_folder, 'matrix.mtx.gz')).tocsr().T)
    c_barcodes = pd.read_csv(os.path.join(c_raw_folder, 'barcodes.tsv.gz'), index_col=0, header=None, names=['barcode'])
    c_barcodes.index = c_barcodes.index.str.slice(0, 16, 1)
    c_raw.obs = c_barcodes
    c_raw.var = pd.read_csv(os.path.join(c_raw_folder, 'features.tsv.gz'), header=None, index_col=0, names=['ensembl_id', 'gene_name', 'kind'], sep='\t')
    print('Loaded c raw mtx:', c_raw.X.shape)
    del c_barcodes

    # load c filtered matrix
    c = anndata.AnnData(scipy.io.mmread(os.path.join(c_folder, 'matrix.mtx.gz')).tocsr().T)
    c_barcodes = pd.read_csv(os.path.join(c_folder, 'barcodes.tsv.gz'), index_col=0, header=None, names=['barcode'])
    c_barcodes.index = c_barcodes.index.str.slice(0, 16, 1)
    c.obs = c_barcodes
    c.var = pd.read_csv(os.path.join(c_folder, 'features.tsv.gz'), header=None, index_col=0, names=['ensembl_id', 'gene_name', 'kind'], sep='\t')
    print('Loaded c filtered mtx:', c.X.shape)
    del c_barcodes

    ## load kallisto raw matrix
    k_raw = anndata.AnnData(scipy.io.mmread(os.path.join(k_folder, 'genes.mtx')).tocsr())
    k_raw.obs = pd.read_csv(os.path.join(k_folder, 'genes.barcodes.txt'), index_col=0, header=None, names=['barcode'])
    k_raw.var = pd.read_csv(os.path.join(k_folder, 'genes.genes.txt'), header=None, index_col=0, names=['ensembl_id'], sep='\t')
    print('Loaded k raw mtx:', k_raw.X.shape)

    # truncates the ensembl version number off the kallisto labels
    k_raw.var['full_emsembl_id'] = k_raw.var.index
    k_raw.var.index = k_raw.var['full_emsembl_id'].str.slice(0, 18)
    if dataset_shortname in ['hgmm1k_v2', 'hgmm1k_v3', 'hgmm10k_v3']:
        k_raw.var.index = k_raw.var['full_emsembl_id']

    # do this as late as possible
    k = k_raw[c.obs.index.values]
    print('Loaded k filtered mtx:', k.X.shape)

    # count UMIs, genes, log transform raw Cell Ranger barcodes
    c_raw.obs['counts'] = c_raw.X.sum(1)
    c_raw.obs['ngenes'] = np.array((c_raw.X > 0).sum(1))
    c_raw = c_raw[c_raw.obs['counts'] > 0]
    c_raw.layers['log1p'] = np.log1p(c_raw.X)
    c_raw.obs['log10counts'] = np.log10(c_raw.obs['counts'])
    print('Cell Ranger raw:', c_raw.shape)

    # count UMIs, genes, log transform raw kallisto barcodes
    # first remove kallisto barcodes with 0 gene counts
    k_raw.obs['counts'] = k_raw.X.sum(1)
    k_raw.obs['ngenes'] = np.array((k_raw.X > 0).sum(1))
    k_raw = k_raw[k_raw.obs['counts'] > 0]
    k_raw.layers['log1p'] = np.log1p(k_raw.X)
    k_raw.obs['log10counts'] = np.log10(k_raw.obs['counts'])
    print('kallisto raw:', k_raw.shape)

    c.obs['counts'] = c.X.sum(1)
    c.obs['ngenes'] = np.array((c.X > 0).sum(1))
    c = c[c.obs['counts'] > 0]
    c.layers['log1p'] = np.log1p(c.X)
    c.obs['log10counts'] = np.log10(c.obs['counts'])
    print('Cell Ranger filtered:', c.shape)

    # count UMIs, genes, log transform filtered kallisto barcodes
    # first remove kallisto barcodes with 0 gene counts
    k.obs['counts'] = k.X.sum(1)
    k.obs['ngenes'] = np.array((k.X > 0).sum(1))
    k = k[k.obs['counts'] > 0]
    k.layers['log1p'] = np.log1p(k.X)
    k.obs['log10counts'] = np.log10(k.obs['counts'])
    print('kallisto filtered:', k.shape)

    joint_obs = k_raw.obs.join(c_raw.obs, how='outer', lsuffix='-kallisto', rsuffix='-tenx')
    joint_obs = joint_obs.fillna(0)
    print('Total barcodes seen')
    print(len(joint_obs))

    # barcodes seen by both
    common_obs = k_raw.obs.join(c_raw.obs, how='inner', lsuffix='-kallisto', rsuffix='-tenx')
    print('Barcodes seen by both')
    print(len(common_obs))

    kobs = k_raw.obs.join(c_raw.obs, how='left', lsuffix='-kallisto', rsuffix='-tenx')
    kobs = kobs.sort_values(by=['counts-kallisto'], ascending=False)
    print('Barcodes seen by kallisto missed by Cell Ranger')
    print(len(joint_obs) - len(kobs))

    # just Cell Ranger observations
    tobs = c_raw.obs.copy()
    tobs = tobs.sort_values('counts', ascending=False)
    print('Barcodes seen by Cell Ranger missed by kallisto')
    print(len(joint_obs) - len(tobs))

    # ## Compute correlations between kallisto and Cell Ranger
    # handy and fast function for computing correlation on sparse matrices
    def sparse_M_std(X):
        n = X.shape[1]
        return np.sqrt(n * X.multiply(X).sum(1) - np.multiply(X.sum(1), X.sum(1)))

    def sparse_M_corr(X, Y):
        X_std = sparse_M_std(X)
        Y_std = sparse_M_std(Y)
        XY_std = np.multiply(X_std, Y_std)

        n = X.shape[1]
        XY_cov = n * X.multiply(Y).sum(1) - np.multiply(X.sum(1), Y.sum(1))
        R = np.divide(XY_cov, XY_std)
        return np.squeeze(np.asarray(R))

    raw_counts_correlation = sparse_M_corr(k_raw[common_obs.index].layers['log1p'],
                                           c_raw[common_obs.index].layers['log1p'])
    filtered_counts_correlation = sparse_M_corr(k_raw[c.obs.index].layers['log1p'],
                                                c_raw[c.obs.index].layers['log1p'])
    print('Correlations computed!')

    tsvd = TruncatedSVD(n_components=10)
    TSVD = tsvd.fit_transform(k.layers['log1p'])
    k.obsm['TSVD'] = TSVD
    print('TSVD variance ratios:\n', list(tsvd.explained_variance_ratio_))
    print(datetime.datetime.now())

    tsvd = TruncatedSVD(n_components=10)
    TSVD = tsvd.fit_transform(c.layers['log1p'])
    c.obsm['TSVD'] = TSVD
    print('TSVD variance ratios:\n', list(tsvd.explained_variance_ratio_))
    print(datetime.datetime.now())

    print('Calculating L1 distances...')
    # taking manhattan distance between matrices
    dnck = manhattan_distances(c.layers['log1p'], k.layers['log1p'])
    dnkk = manhattan_distances(k.layers['log1p'], k.layers['log1p'])
    print(datetime.datetime.now())

    # nck are the kallisto-cellranger distances
    nck = np.diagonal(dnck)

    # nkk are the kallisto-kallisto distances
    nkk = []
    for row in dnkk:
        val = np.partition(row, 1)[1]
        nkk.append(val)
    print('L1 distances done!')
    print(datetime.datetime.now())

    # Perform TSNE on top 10 truncated SVD components of kallisto filtered matrix
    print('Doing t-SNE')
    print(datetime.datetime.now())
    tsne = TSNE(perplexity=30, metric="euclidean", callbacks=openTSNE.callbacks.ErrorLogger(),
                n_jobs=8, random_state=42, n_iter=750)
    k.obsm['TSNE10'] = tsne.fit(k.obsm['TSVD'])
    print('kallisto TSNE-10 done.')
    print(datetime.datetime.now())

    # Perform TSNE on top 10 truncated SVD components of Cell Ranger filtered matrix
    print('Doing t-SNE on top 10 PC for Cell Ranger')
    # print(datetime.datetime.now())
    tsne = TSNE(perplexity=30, metric="euclidean", callbacks=openTSNE.callbacks.ErrorLogger(),
                n_jobs=8, random_state=42, n_iter=750)
    c.obsm['TSNE10'] = tsne.fit(c.obsm['TSVD'])
    print('Cell Ranger TSNE-10 done.')
    print(datetime.datetime.now())

    c_raw.write(os.path.join("./write_data/" + dataset_shortname + '_tenx_raw.h5ad'))
    k_raw.write(os.path.join("./write_data/" + dataset_shortname + '_kallisto_raw.h5ad'))
    k.write(os.path.join("./write_data/" + dataset_shortname + '_kallisto.h5ad'))
    c.write(os.path.join("./write_data/" + dataset_shortname + '_tenx.h5ad'))

    with open(os.path.join("./write_data/" + dataset_shortname + '_kobs.pkl'), 'wb') as handle:
        pickle.dump(kobs, handle, protocol=pickle.HIGHEST_PROTOCOL)
    with open(os.path.join("./write_data/" + dataset_shortname + '_tobs.pkl'), 'wb') as handle:
        pickle.dump(tobs, handle, protocol=pickle.HIGHEST_PROTOCOL)
    with open(os.path.join("./write_data/" + dataset_shortname + '_common_obs.pkl'), 'wb') as handle:
        pickle.dump(common_obs, handle, protocol=pickle.HIGHEST_PROTOCOL)
    with open(os.path.join("./write_data/" + dataset_shortname + '_joint_obs.pkl'), 'wb') as handle:
        pickle.dump(joint_obs, handle, protocol=pickle.HIGHEST_PROTOCOL)
    with open(os.path.join("./write_data/" + dataset_shortname + '_nkk.pkl'), 'wb') as handle:
        pickle.dump(nkk, handle, protocol=pickle.HIGHEST_PROTOCOL)
    with open(os.path.join("./write_data/" + dataset_shortname + '_nck.pkl'), 'wb') as handle:
        pickle.dump(nck, handle, protocol=pickle.HIGHEST_PROTOCOL)
    with open(os.path.join("./write_data/" + dataset_shortname + '_raw_counts_correlation.pkl'), 'wb') as handle:
        pickle.dump(raw_counts_correlation, handle, protocol=pickle.HIGHEST_PROTOCOL)
    with open(os.path.join("./write_data/" + dataset_shortname + '_filtered_counts_correlation.pkl'), 'wb') as handle:
        pickle.dump(filtered_counts_correlation, handle, protocol=pickle.HIGHEST_PROTOCOL)
                                         'cp_dose', 'cp_time']))

n_components = 4
tsne = TSNE(
    n_components=n_components,
    # https://github.com/pavlin-policar/openTSNE/issues/121
    negative_gradient_method='bh',
    perplexity=30,
    metric='euclidean',
    verbose=True,
    n_jobs=10,
    random_state=42
)
embedding = tsne.fit(x_train)
# can embed new data:
# embedded_test = embedding.transform(np.array(test_features.drop(columns=[...])))

np.savetxt(f"tsne{n_components}dims.csv", embedding, delimiter=',',
           header=",".join([f'X{i}' for i in range(embedding.shape[1])]))

# ## Advanced embedding. https://opentsne.readthedocs.io/en/latest/examples/02_advanced_usage/02_advanced_usage.html
# affinities_train = PerplexityBasedNN(
#     x_train,
#     perplexity=30,
#     metric='euclidean',
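# The commented block above refers to openTSNE's "advanced usage" API. A minimal,
# hedged sketch of that flow (the variable names are assumptions; n_components > 2
# requires the 'bh' gradient method, as noted in the issue linked above):
from openTSNE import TSNEEmbedding, affinity, initialization

affinities_train = affinity.PerplexityBasedNN(
    x_train, perplexity=30, metric="euclidean", n_jobs=10, random_state=42,
)
init_train = initialization.pca(x_train, n_components=n_components, random_state=42)
embedding_adv = TSNEEmbedding(
    init_train, affinities_train, negative_gradient_method="bh", n_jobs=10,
)
# early exaggeration phase, then the standard optimization phase
embedding_adv = embedding_adv.optimize(n_iter=250, exaggeration=12, momentum=0.5)
embedding_adv = embedding_adv.optimize(n_iter=750, momentum=0.8)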
from openTSNE import TSNE
from openTSNE.callbacks import ErrorLogger

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_digits
import matplotlib.pyplot as plt

#%%
x, y = load_digits(return_X_y=True)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.33, random_state=42)
print("%d training samples" % x_train.shape[0])
print("%d test samples" % x_test.shape[0])

tsne = TSNE(
    perplexity=30,
    metric="euclidean",
    callbacks=ErrorLogger(),
    n_jobs=4,
    random_state=42,
)
embedding_train = tsne.fit(x_train)
embedding_test = embedding_train.transform(x_test)
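# A rough sanity check (not part of the original snippet): score a k-NN classifier on
# the 2-D coordinates to see how well the embedding separates the digit classes for
# points mapped in with embedding_train.transform().
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=10)
knn.fit(np.asarray(embedding_train), y_train)
print("kNN accuracy in t-SNE space: %.3f" % knn.score(np.asarray(embedding_test), y_test))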
validation_posterior = Posterior(vae, validation_cells_dataset, use_cuda=False)
print(X_.shape)
result_dict['validation_error'] = validation_posterior.reconstruction_error()

# Get expression rate parameterization from representation space
Z_hat = vae.sample_from_posterior_z(torch.from_numpy(cells_dataset.X.toarray()).float())
Z_hat = np.array(Z_hat.detach()).astype(np.double)

tsne = TSNE(callbacks=ErrorLogger(), initialization='random',
            negative_gradient_method='fft', callbacks_every_iters=100,
            n_iter=2000, neighbors='approx')
YY = tsne.fit(Z_hat)

df = pd.DataFrame(index=tmp_adata.obs.index)
df['ss_depth'] = result_dict['ss_depth']
df['ss_cells'] = result_dict['ss_cells']
df['validation_error'] = result_dict['validation_error']
df['tsne_0'] = YY[:, 0]
df['tsne_1'] = YY[:, 1]

# out_file = f'scvi_output_{ds}/{ds}_c{ss_cells}_d{ss_depth}.csv'
# if not os.path.exists(os.path.dirname(out_file)):
#     os.makedirs(os.path.dirname(out_file))

df.to_csv(input.SCVI_PARTIAL_SUMMARY)

# # combines all separate depths into a single csv file
# all_results = pd.concat(results_list).reset_index()
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    ###############
    parser.add_argument("--data_dir", default=None, type=str, required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")
    parser.add_argument("--pretrain_model", default='bert-case-uncased', type=str, required=True,
                        help="Pre-trained model")
    parser.add_argument("--num_labels_task", default=None, type=int, required=True,
                        help="num_labels_task")
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--do_train", default=False, action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval", default=False, action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_lower_case", default=False, action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--eval_batch_size", default=32, type=int,
                        help="Total batch size for training.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda", default=False, action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16', default=False, action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale', type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")
    parser.add_argument("--weight_decay", default=0.0, type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float,
                        help="Max gradient norm.")
    parser.add_argument('--fp16_opt_level', type=str, default='O1',
                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
                             "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument("--task", default=2, type=int, required=True,
                        help="Choose Task")
    parser.add_argument("--choose_eval_test_both", default=2, type=int,
                        help="choose test dev both")
    ###############
    args = parser.parse_args()

    #print(args.do_train, args.do_eval)
    #exit()

    processors = Processor_1
    num_labels = args.num_labels_task

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
        print(n_gpu)
        print(device)
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {}, n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    #args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    '''
    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    '''
    os.makedirs(args.output_dir, exist_ok=True)

    tokenizer = RobertaTokenizer.from_pretrained(args.pretrain_model)

    train_examples = None
    num_train_steps = None
    aspect_list = None
    sentiment_list = None
    processor = processors()
    num_labels = num_labels
    #train_examples, aspect_list, sentiment_list = processor.get_train_examples(args.data_dir)

    filenames = os.listdir(args.output_dir)
    filenames = [x for x in filenames if "pytorch_model.bin_test_best" in x]
    print(filenames)

    file_mark = []
    #model_performace_dev = dict()
    model_performace_test = dict()
    for x in filenames:
        ###
        #eval:0
        #test:1
        if args.choose_eval_test_both == 0:
            file_mark.append([x, True])
        elif args.choose_eval_test_both == 1:
            file_mark.append([x, False])
        else:
            file_mark.append([x, True])
            file_mark.append([x, False])

    ####
    ####
    train_examples, aspect_list, sentiment_list = processor.get_test_examples(args.data_dir)
    test_examples, _, _ = processor.get_test_examples(args.data_dir)
    #eval_examples, _, _ = processor.get_dev_examples(args.data_dir)

    if args.task == 1:
        num_labels = len(aspect_list)
    elif args.task == 2:
        num_labels = len(sentiment_list)
    else:
        print("What's task?")
        exit()

    test = convert_examples_to_features(test_examples, aspect_list, sentiment_list,
                                        args.max_seq_length, tokenizer, args.task)
    #dev = convert_examples_to_features(
    #    eval_examples, aspect_list, sentiment_list, args.max_seq_length, tokenizer, args.task)
    ###

    for x, mark in file_mark:
        #mark: eval-True; test-False
        #choose_eval_test_both: eval-0, test-1, both-2
        if mark == True:  #dev
            continue
        print(x, mark)
        output_model_file = os.path.join(args.output_dir, x)
        #model = RobertaForSequenceClassification.from_pretrained(args.pretrain_model, num_labels=num_labels, output_hidden_states=False, output_attentions=False, return_dict=True)
        model = RobertaForMaskedLMDomainTask.from_pretrained(
            args.pretrain_model,
            output_hidden_states=False,
            output_attentions=False,
            return_dict=True,
            num_labels=args.num_labels_task)
        model.load_state_dict(torch.load(output_model_file), strict=False)  # strict False: ignore non-matching keys
        model.to(device)

        #######################################
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        #no_decay = ['bias', 'LayerNorm.weight']
        no_grad = [
            'bert.encoder.layer.11.output.dense_ent',
            'bert.encoder.layer.11.output.LayerNorm_ent'
        ]
        param_optimizer = [(n, p) for n, p in param_optimizer if not any(nd in n for nd in no_grad)]
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
             'weight_decay': args.weight_decay},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0}
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
        #scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=int(t_total*0.1), num_training_steps=t_total)

        if args.fp16:
            try:
                from apex import amp
            except ImportError:
                raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
                exit()
            model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

        # multi-gpu training (should be after apex fp16 initialization)
        if n_gpu > 1:
            model = torch.nn.DataParallel(model)

        # Distributed training (should be after apex fp16 initialization)
        if args.local_rank != -1:
            model = torch.nn.parallel.DistributedDataParallel(
                model,
                device_ids=[args.local_rank],
                output_device=args.local_rank,
                find_unused_parameters=True)
        #######################################

        #param_optimizer = [para[0] for para in model.named_parameters()]
        #param_optimizer = [para for para in model.named_parameters()][-2]
        #print(param_optimizer)

        if mark:
            eval_features = dev
            print(0)
        else:
            eval_features = test
            print(1)

        logger.info("***** Running evaluation *****")
        #logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Num examples = %d", len(eval_features))
        logger.info("  Batch size = %d", args.eval_batch_size)

        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_attention_mask = torch.tensor([f.attention_mask for f in eval_features], dtype=torch.long)
        if args.task == 1:
            print("Executing the task 1")
        elif args.task == 2:
            all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
        else:
            print("Wrong here2")
        all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
        all_aspect_ids = torch.tensor([f.aspect_id for f in eval_features], dtype=torch.long)

        if args.task == 1:
            eval_data = TensorDataset(all_input_ids, all_attention_mask, all_label_ids, all_aspect_ids)
        elif args.task == 2:
            eval_data = TensorDataset(all_input_ids, all_attention_mask, all_segment_ids, all_label_ids, all_aspect_ids)
        else:
            print("Wrong here1")

        if args.local_rank == -1:
            eval_sampler = RandomSampler(eval_data)
        else:
            eval_sampler = DistributedSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

        if mark:
            output_eval_file = os.path.join(args.output_dir, "eval_results_{}.txt".format(x.split("_")[-1]))
            output_file_pred = os.path.join(args.output_dir, "eval_pred_{}.txt".format(x.split("_")[-1]))
            output_file_glod = os.path.join(args.output_dir, "eval_gold_{}.txt".format(x.split("_")[-1]))
        else:
            output_eval_file = os.path.join(args.output_dir, "test_results_{}.txt".format(x.split("_")[-1]))
            output_file_pred = os.path.join(args.output_dir, "test_pred_{}.txt".format(x.split("_")[-1]))
            output_file_glod = os.path.join(args.output_dir, "test_gold_{}.txt".format(x.split("_")[-1]))

        fpred = open(output_file_pred, "w")
        fgold = open(output_file_glod, "w")

        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0

        sentiment_map = sorted(list(set(sentiment_list)))
        aspect_map = sorted(list(set(aspect_list)))
        sentiment_map = {label: i for i, label in enumerate(sentiment_map)}
        aspect_map = {label: i for i, label in enumerate(aspect_map)}
        print(sentiment_map)
        print(aspect_map)
        #exit()

        #data_dict = {'laptop':{'negative':[],'neutral':[],'positive':[]},'restaurant':{'negative':[],'neutral':[],'positive':[]}}
        #aspect, sentiment, tensor
        all_aspect_list = list()
        all_sentiment_list = list()
        all_tensor_list = list()
        restaurant_aspect_list = list()
        restaurant_sentiment_list = list()
        restaurant_tensor_list = list()
        laptop_aspect_list = list()
        laptop_sentiment_list = list()
        laptop_tensor_list = list()

        for step, batch in enumerate(tqdm(eval_dataloader, desc="Iteration")):
            #batch = tuple(t.to(device) if i != 3 else t for i, t in enumerate(batch))
            batch = tuple(t.to(device) for i, t in enumerate(batch))

            if args.task == 1:
                input_ids, attention_mask, label_ids, aspect_ids = batch
            elif args.task == 2:
                input_ids, attention_mask, segment_ids, label_ids, aspect_ids = batch
            else:
                print("Wrong here3")

            if args.task == 1:
                #loss, logits, hidden_states, attentions
                '''
                output = model(input_ids=input_ids, token_type_ids=None, attention_mask=attention_mask, labels=label_ids)
                logits = output.logits
                tmp_eval_loss = output.loss
                '''
                #
                #tmp_eval_loss, logits = model(input_ids_org=input_ids, sentence_label=label_ids, attention_mask=attention_mask, func="task_class")
                with torch.no_grad():
                    rep_domain, rep_task = model(input_ids_org=input_ids, sentence_label=label_ids, attention_mask=attention_mask, func="in_domain_task_rep")
                #logits = output.logits
                #tmp_eval_loss = output.loss
            elif args.task == 2:
                #loss, logits, hidden_states, attentions
                '''
                output = model(input_ids=input_ids, token_type_ids=None, attention_mask=attention_mask, labels=label_ids)
                logits = output.logits
                tmp_eval_loss = output.loss
                '''
                #
                with torch.no_grad():
                    rep_domain, rep_task = model(input_ids_org=input_ids, sentence_label=label_ids, attention_mask=attention_mask, func="in_domain_task_rep")
            else:
                print("Wrong!!")

            #print(rep_domain.shape)
            #print(rep_task.shape)
            rep = torch.cat([rep_task, rep_domain], -1).to("cpu")
            #print(rep.shape)

            #label_ids:{'negative': 0, 'neutral': 1, 'positive': 2}
            #aspect_ids:{'laptop': 0, 'restaurant': 1}
            #sentiment_map={"laptop_negative":1,"laptop_neutral":3,"laptop_positive":5, "restaurant_negative":0,"restaurant_neutral":2,"restaurant_positive":4}
            sentiment_map = {
                "l_neg": 1,
                "l_ne": 3,
                "l_pos": 5,
                "Negative": 0,
                "Neutral": 2,
                "Postive": 4
            }
            #sentiment_map={"laptop_negative":0,"laptop_positive":2, "restaurant_negative":1,"restaurant_positive":3}

            for index, tensor in enumerate(rep):
                #aspect, sentiment, tensor
                #if label_ids[index] == 1:  #neutral
                #    continue
                if aspect_ids[index] == 0:
                    if label_ids[index] == 0:
                        #data_dict['laptop']['negative'].append(tensor)
                        laptop_sentiment_list.append(torch.tensor(1))
                        all_sentiment_list.append(torch.tensor(1))
                    elif label_ids[index] == 1:
                        #data_dict['laptop']['neutral'].append(tensor)
                        laptop_sentiment_list.append(torch.tensor(3))
                        all_sentiment_list.append(torch.tensor(3))
                    elif label_ids[index] == 2:
                        #data_dict['laptop']['positive'].append(tensor)
                        laptop_sentiment_list.append(torch.tensor(5))
                        all_sentiment_list.append(torch.tensor(5))
                    laptop_aspect_list.append(aspect_ids[index])
                    #laptop_sentiment_list.append(label_ids[index])
                    laptop_tensor_list.append(tensor)
                else:
                    if label_ids[index] == 0:
                        #data_dict['restaurant']['negative'].append(tensor)
                        restaurant_sentiment_list.append(torch.tensor(0))
                        all_sentiment_list.append(torch.tensor(0))
                    elif label_ids[index] == 1:
                        #data_dict['restaurant']['neutral'].append(tensor)
                        restaurant_sentiment_list.append(torch.tensor(2))
                        all_sentiment_list.append(torch.tensor(2))
                    elif label_ids[index] == 2:
                        #data_dict['restaurant']['positive'].append(tensor)
                        restaurant_sentiment_list.append(torch.tensor(4))
                        all_sentiment_list.append(torch.tensor(4))
                    restaurant_aspect_list.append(aspect_ids[index])
                    #restaurant_sentiment_list.append(label_ids[index])
                    restaurant_tensor_list.append(tensor)
                all_aspect_list.append(aspect_ids[index])
                #all_sentiment_list.append(label_ids[index])
                all_tensor_list.append(tensor)

        #########
        laptop_aspect_list = torch.stack(laptop_aspect_list).to("cpu").numpy()
        laptop_sentiment_list = torch.stack(laptop_sentiment_list).to("cpu").numpy()
        laptop_tensor_list = torch.stack(laptop_tensor_list).to("cpu").numpy()

        restaurant_aspect_list = torch.stack(restaurant_aspect_list).to("cpu").numpy()
        restaurant_sentiment_list = torch.stack(restaurant_sentiment_list).to("cpu").numpy()
        restaurant_tensor_list = torch.stack(restaurant_tensor_list).to("cpu").numpy()

        all_aspect_list = torch.stack(all_aspect_list).to("cpu").numpy()
        all_sentiment_list = torch.stack(all_sentiment_list).to("cpu").numpy()
        all_tensor_list = torch.stack(all_tensor_list).to("cpu").numpy()
        #########

        #########
        print(laptop_aspect_list.shape)
        print(laptop_sentiment_list.shape)
        print(laptop_tensor_list.shape)
        print("===")
        print(restaurant_aspect_list.shape)
        print(restaurant_sentiment_list.shape)
        print(restaurant_tensor_list.shape)
        print("===")
        print(all_aspect_list.shape)
        #print(all_sentiment_list)
        print(all_sentiment_list.shape)
        print(all_tensor_list.shape)
        print("===")
        #########

        #with open(args.output_dir+".json", "w") as outfile:
        #    json.dump(data_dict, outfile)

        #####Start to draw########
        #emb = TSNE(n_components=2, perplexity=15, learning_rate=10).fit_transform(all_tensor_list)
        #print(emb.shape)
        '''
        = TSNE(n_components=2, perplexity=15, learning_rate=10).fit_transform(X)
        = TSNE(n_components=2, perplexity=15, learning_rate=10).fit_transform(X)
        = TSNE(n_components=2, perplexity=15, learning_rate=10).fit_transform(X)
        = TSNE(n_components=2, perplexity=15, learning_rate=10).fit_transform(X)
        = TSNE(n_components=2, perplexity=15, learning_rate=10).fit_transform(X)
        = TSNE(n_components=2, perplexity=15, learning_rate=10).fit_transform(X)
        '''
        #tsne = TSNE(perplexity=30,metric="euclidean",callbacks=ErrorLogger(),n_jobs=64,random_state=42)
        '''
        tsne = TSNE(
            perplexity=30,
            n_iter=50,
            metric="euclidean",
            callbacks=ErrorLogger(),
            n_jobs=64,
            random_state=42,
        )
        embedding_train = tsne.fit(all_tensor_list)
        '''
        #plot(all_tensor_list, all_sentiment_list)

        #cosine
        #perplexity
        #400-->1200
        #64
        tsne = TSNE(
            perplexity=64,
            n_iter=1200,
            metric="euclidean",
            callbacks=ErrorLogger(),
            n_jobs=64,
            random_state=42,
            learning_rate='auto',
            initialization='pca',
            n_components=2,
        )

        ###
        #embedding_train = tsne.fit(all_tensor_list)
        #utils_.plot(x=embedding_train, y=all_aspect_list, colors=utils_.MOUSE_10X_COLORS, label_map=aspect_map)
        #utils_.plot(x=embedding_train, y=all_sentiment_list, colors=utils_.MOUSE_10X_COLORS, label_map=sentiment_map)
        ###

        ###
        embedding_train = tsne.fit(restaurant_tensor_list)
        utils_.plot(x=embedding_train, y=restaurant_sentiment_list, colors=utils_.MOUSE_10X_COLORS, label_map=sentiment_map)
        ###

        ###
        #embedding_train = tsne.fit(laptop_tensor_list)
        #utils_.plot(x=embedding_train, y=laptop_sentiment_list, colors=utils_.MOUSE_10X_COLORS, label_map=sentiment_map)
        ###

        #plt.savefig(args.output_dir+'.pdf')
        plt.title("Semi-supervised contrastive learning")
        #plt.title("Fine-tune (Standard)")
        #plt.title("Fine-tune (Few-shot)")
        #plt.title("Supervised contrastive learning")
        #plt.title("Common fine-tuning")
        plt.savefig('output.pdf')
                     columns=bumon_name[idx_none_target],
                     values='n_items',
                     aggfunc='sum')\
        .fillna(0)
_df.head()

#%%
user_category_ratio_df = _df.div(_df.sum(axis=1), axis=0).fillna(0)
user_category_ratio_arr = user_category_ratio_df.values
user_category_ratio_arr = np.clip(user_category_ratio_arr, 0, 1).astype(np.float32)

#%%
from openTSNE import TSNE

tsne = TSNE()
embedding = tsne.fit(user_category_ratio_arr)

# %%
vis_x = embedding[:, 0]
vis_y = embedding[:, 1]
max_idx = np.argmax(user_category_ratio_arr, axis=1)

plt.scatter(vis_x, vis_y, c=max_idx, cmap=plt.cm.get_cmap("jet", 124), marker='.')
plt.colorbar(ticks=range(124))
plt.clim(-0.5, 123.5)
plt.show()
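# %%
# Hypothetical follow-up: openTSNE embeddings can place new rows into the existing
# map via .transform(); here the first 10 users are re-projected as an illustration.
new_users = user_category_ratio_arr[:10]
new_points = embedding.transform(new_users)

plt.scatter(vis_x, vis_y, c='lightgrey', marker='.', s=5)
plt.scatter(new_points[:, 0], new_points[:, 1], c='red', marker='x')
plt.show()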
def activations_tsne_plot_save(activations_zero, activations_first, activations_second,
                               activations_third, labels, ds, filename):
    """Compute embeddings using t-SNE and save their plots."""
    tsne = TSNE(
        perplexity=30,
        metric="euclidean",
        n_jobs=8,
        random_state=42,
        verbose=False,
    )

    # embeddings_zero
    print("Learning embeddings for original dataset")
    embeddings_zero = tsne.fit(activations_zero)
    fig, axes = plt.subplots(figsize=(12, 8))
    for i, actual_label in enumerate(ds.classes):
        indices = np.argwhere(labels == i)
        indices = np.squeeze(indices)
        axes.scatter(embeddings_zero[indices, 0], embeddings_zero[indices, 1], label=actual_label, s=12)
    axes.legend(markerscale=3, fontsize=12)
    plt.savefig(filename + "_l0.png")
    plt.close()

    # embeddings_first
    embeddings_first = []
    print("Learning embeddings for first layer")
    for j, acts in enumerate(activations_first):
        embedding = tsne.fit(acts)
        embeddings_first.append(embedding)
        fig, axes = plt.subplots(figsize=(12, 8))
        for i, actual_label in enumerate(ds.classes):
            indices = np.argwhere(labels == i)
            indices = np.squeeze(indices)
            axes.scatter(embedding[indices, 0], embedding[indices, 1], label=actual_label, s=12)
        axes.legend(markerscale=3, fontsize=12)
        plt.savefig(filename + "_l1_f" + str(j) + ".png")
        plt.close()

    # embeddings_second
    embeddings_second = []
    print("Learning embeddings for second layer")
    for j, acts in enumerate(activations_second):
        embedding = tsne.fit(acts)
        embeddings_second.append(embedding)
        fig, axes = plt.subplots(figsize=(12, 8))
        for i, actual_label in enumerate(ds.classes):
            indices = np.argwhere(labels == i)
            indices = np.squeeze(indices)
            axes.scatter(embedding[indices, 0], embedding[indices, 1], label=actual_label, s=12)
        axes.legend(markerscale=3, fontsize=12)
        plt.savefig(filename + "_l2_f" + str(j) + ".png")
        plt.close()

    # embeddings_third
    print("Learning embeddings for third layer")
    embeddings_third = tsne.fit(activations_third)
    fig, axes = plt.subplots(figsize=(12, 8))
    for i, actual_label in enumerate(ds.classes):
        indices = np.argwhere(labels == i)
        indices = np.squeeze(indices)
        axes.scatter(embeddings_third[indices, 0], embeddings_third[indices, 1], label=actual_label, s=12)
    axes.legend(markerscale=3, fontsize=12)
    plt.savefig(filename + "_l3.png")
    plt.close()

    return embeddings_zero, embeddings_first, embeddings_second, embeddings_third
y = data["CellType1"].astype(str)

print("Data set contains %d samples with %d features" % x.shape)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.33, random_state=42)
print("%d training samples" % x_train.shape[0])
print("%d test samples" % x_test.shape[0])

tsne = TSNE(
    perplexity=30,
    metric="euclidean",
    callbacks=ErrorLogger(),
    n_jobs=8,
    random_state=42,
)

# embedding_train = tsne.fit(x_train)
embedding_test = tsne.fit(x_test)

fig, ax = plt.subplots(figsize=(8, 8))
#utils.plot(embedding_train, y_train, colors=utils.MACOSKO_COLORS, alpha=0.25, ax=ax)
utils.plot(embedding_test, y_test, colors=utils.MACOSKO_COLORS, alpha=0.75, ax=ax)
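# Hypothetical variant of the commented-out lines above: fit on the training split
# and map the held-out cells into the same coordinate system instead of fitting a
# separate t-SNE on the test split.
embedding_train = tsne.fit(x_train)
embedding_test = embedding_train.transform(x_test)

fig, ax = plt.subplots(figsize=(8, 8))
utils.plot(embedding_train, y_train, colors=utils.MACOSKO_COLORS, alpha=0.25, ax=ax)
utils.plot(embedding_test, y_test, colors=utils.MACOSKO_COLORS, alpha=0.75, ax=ax)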