def activations_tsne_plot(activations, labels, ds):
    """Compute embeddings using t-SNE and plot them."""
    tsne = TSNE(
        perplexity=30,
        metric="euclidean",
        n_jobs=8,
        random_state=42,
        verbose=False,
    )
    fig, axes = plt.subplots(nrows=1, ncols=len(activations), figsize=(25,5))

    embs = []
    for idx, acts in enumerate(activations):
        print("Learning embeddings for layer " + str(idx) + "...")
        embeddings = tsne.fit(acts)

        for i, actual_label in enumerate(ds.classes):
            indices = np.argwhere(labels == i)
            indices = np.squeeze(indices)

            axes[idx].scatter(embeddings[indices, 0], embeddings[indices, 1],
                              label=actual_label, s=2)
            axes[idx].legend()
            axes[idx].set_title("Activations in layer " + str(idx))
            
        embs.append(embeddings)

    fig.tight_layout()
    return embs
Example #2
def plot_tsne(source_data, source_name, target_data, target_name,
              plot_directory):
    fig, ax = plt.subplots()
    perplexities = [100]
    for i, perplexity in enumerate(perplexities):
        tsne = TSNE(n_components=2,
                    initialization='pca',
                    random_state=0,
                    perplexity=perplexity,
                    n_iter=1000,
                    neighbors='approx')
        x_source_transformed = tsne.fit(source_data)
        x_target_transformed = tsne.fit(target_data)
        ax.set_title('Perplexity=%d' % perplexity)
        ax.scatter(x_source_transformed[:, 0],
                   x_source_transformed[:, 1],
                   c='r',
                   label='source')
        ax.scatter(x_target_transformed[:, 0],
                   x_target_transformed[:, 1],
                   c='b',
                   label='target')
        ax.xaxis.set_major_formatter(NullFormatter())
        ax.yaxis.set_major_formatter(NullFormatter())
        ax.axis('tight')
        ax.legend()
        plt.savefig(plot_directory + 'tsne_source' + source_name + '_target' +
                    target_name + '.png',
                    dpi=500)
Example #3
class OpenTsne(Transformer):
    """
    This transformer transforms all vectors in an [EmbeddingSet][whatlies.embeddingset.EmbeddingSet]
    by means of t-SNE. This implementation uses
    [open-tsne](https://opentsne.readthedocs.io/en/latest/tsne_algorithm.html).

    Important:
        OpenTSNE is a faster variant of TSNE, but it only allows for at most 2 components.
        You may also notice that it is still relatively slow. This, unfortunately, is a fact of t-SNE.

        This embedding transformation might require you to manually install extra dependencies
        unless you installed it via either:

        ```
        pip install whatlies[opentsne]
        pip install whatlies[all]
        ```

    Arguments:
        n_components: the number of components to create/add
        kwargs: keyword arguments passed to the OpenTsne implementation, includes things like `perplexity` [link](https://opentsne.readthedocs.io/en/latest/api/index.html)

    Usage:

    ```python
    from whatlies.language import SpacyLanguage
    from whatlies.transformers import OpenTsne

    words = ["prince", "princess", "nurse", "doctor", "banker", "man", "woman",
             "cousin", "neice", "king", "queen", "dude", "guy", "gal", "fire",
             "dog", "cat", "mouse", "red", "blue", "green", "yellow", "water",
             "person", "family", "brother", "sister"]

    lang = SpacyLanguage("en_core_web_md")
    emb = lang[words]

    emb.transform(OpenTsne(2)).plot_interactive_matrix('tsne_0', 'tsne_1')
    ```
    """

    def __init__(self, n_components=2, **kwargs):
        super().__init__()
        self.n_components = n_components
        self.kwargs = kwargs
        self.tfm = TSNE(n_components=n_components, **kwargs)

    def fit(self, embset):
        names, X = embset.to_names_X()
        self.emb = self.tfm.fit(X)
        self.is_fitted = True
        return self

    def transform(self, embset):
        names, X = embset.to_names_X()
        new_vecs = np.array(self.emb.transform(X))
        names_out = names + [f"tsne_{i}" for i in range(self.n_components)]
        vectors_out = np.concatenate([new_vecs, np.eye(self.n_components)])
        new_dict = new_embedding_dict(names_out, vectors_out, embset)
        return EmbeddingSet(new_dict, name=f"{embset.name}.tsne_{self.n_components}()")
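
The `kwargs` mentioned in the docstring are forwarded to openTSNE unchanged, so options such as `perplexity` can be tuned per call; a hedged sketch, reusing the `emb` object from the usage block above rather than a verbatim whatlies example:

# Extra keyword arguments are passed straight through to openTSNE's TSNE constructor.
emb.transform(OpenTsne(2, perplexity=10)).plot_interactive_matrix('tsne_0', 'tsne_1')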
Example #4
class TSNEWrapper:
    def __init__(self, params, random_seed):
        self.tsneer = TSNE(n_components=params['embed_dim'],
                           random_state=random_seed)

    def fit(self, data):
        self.embedding = self.tsneer.fit(data)

    def transform(self, data):
        new_embedded_data = self.embedding.transform(data)
        return new_embedded_data
Example #5
def hc_tsne(
    X,
    initialization,
    tree,
    alpha=1e-3,
    weights=(0.5, 0.5, 0.0),
    margin=0.5,
    loss_logger=None,
    **tsne_kwargs,
):
    """Run openTSNE with custom `negative_gradient_method`, in which the
    hierarchical constraints are encoded in a regularization term.

    Args:
        X: ndarray (N, D)
        initialization: initialization embedding in 2D, (N, 2)
        tree: hierarchical constraints represented in tree form (using anytree lib)
        alpha: contribution of regularization term in the new objective function
        weights: weights of different elements in the regularization
        margin: margin in the triplet loss.
            The real margin m is calculated as `margin * dist(anchor, negative)`
        loss_logger: logger object (containing a dict) to store loss at each iter.
        **tsne_kwargs: openTSNE params

    Returns:
        Z: new embedding model, can be used as (N, 2) array,
            or tsne object for embedding new datapoints.
    """
    # from the tree-like constraints, create a regularization term by
    #   using the defined hierarchical triplet loss.
    tree_regularizer = partial(
        hierarchical_triplet_loss, tree=tree, margin=margin, weights=weights
    )

    # run openTSNE with custom negative gradient function
    tsne = TSNE(
        initialization=initialization,
        callbacks=ErrorLogger(),  # use this to evaluate kl_loss at every 10 iterations
        negative_gradient_method=partial(
            my_kl_divergence_bh,
            list_regularizers=[(alpha, tree_regularizer)],
            logger=loss_logger,
        ),
        **tsne_kwargs,
    )

    Z = tsne.fit(X)

    # now clear the regularizers from tsne object so we will not use them for embedding
    # new samples (of test set)
    Z.gradient_descent_params["negative_gradient_method"] = "bh"
    return Z
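
A hedged usage sketch for `hc_tsne` as described in the docstring above; `build_label_tree` is a hypothetical helper (the real constraints come from an anytree structure), and the data here is synthetic:

import numpy as np
from openTSNE import initialization

X_train = np.random.rand(1000, 50)
X_test = np.random.rand(100, 50)
init = initialization.pca(X_train, n_components=2, random_state=42)
# tree = build_label_tree(labels)   # anytree hierarchy over the class labels (hypothetical helper)
# Z = hc_tsne(X_train, initialization=init, tree=tree, alpha=1e-3, perplexity=30)
# Z_test = Z.transform(X_test)      # embed unseen points; the regularizers were cleared, so plain "bh" is used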
Example #6
def compute_tsne(A):
    adata = A.copy()

    #tsne = TSNE(perplexity=30, metric="euclidean", callbacks=openTSNE.callbacks.ErrorLogger(),n_jobs=8, random_state=42, n_iter=750 )
    tsne = TSNE(perplexity=30,
                metric="euclidean",
                callbacks=None,
                n_jobs=10,
                random_state=42,
                n_iter=750)
    adata.varm['TSNE10'] = tsne.fit(adata.varm['TSVD'])

    return adata
Example #7
def tsne(x, n=100000):
    from openTSNE import TSNE
    from openTSNE.callbacks import ErrorLogger

    x_in = x[:n, :]
    tsne = TSNE(
        perplexity=500,
        metric="euclidean",
        callbacks=ErrorLogger(),
        n_iter=2000,
        n_jobs=4,
    )
    x_embedded = tsne.fit(x_in)
    return x_embedded
Example #8
    def run_transformation(self, X, y, transformation_params, callback):
        class CallbackAdapter:
            def __init__(self, callback, early_exaggeration_iter):
                self.callback = callback
                self.exaggeration_phase = early_exaggeration_iter > 0
                self.early_exaggeration_iter = early_exaggeration_iter

            def __call__(self, iteration, error, embedding):
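                # openTSNE restarts its iteration counter after the early-exaggeration
                # phase; offset it here so downstream consumers see one continuous count.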
                if not self.exaggeration_phase:
                    iteration += self.early_exaggeration_iter
                if self.exaggeration_phase and iteration == self.early_exaggeration_iter:
                    self.exaggeration_phase = False

                self.callback(
                    'embedding', iteration,
                    dict(embedding=embedding.view(np.ndarray),
                         error_metrics=dict(kl_divergence=error)))

        callback_adapter = CallbackAdapter(
            callback, transformation_params['early_exaggeration_iter'])

        tsne = TSNE(
            **transformation_params,
            min_grad_norm=0,  # never stop
            n_iter=10000000,  # TODO
            callbacks=callback_adapter,
            callbacks_every_iters=1)

        with warnings.catch_warnings():
            warnings.filterwarnings('ignore', category=NumbaWarning)
            callback(
                'start', 0,
                dict(error_metrics=[
                    dict(name='kl_divergence', label='KL divergence:')
                ]))
            callback('status', 0, dict(message='Initializing TSNE'))
            tsne.fit(X)
Example #9
def reduce_dimension(embeddings, reduction='pca'):
    if reduction == 'pca':
        pca = PCA(n_components=2)
        embeddings = pca.fit_transform(embeddings)
    elif reduction == 'tsne':
        otsne = OTSNE(initialization='pca',
                      n_jobs=8,
                      callbacks=ErrorLogger(),
                      negative_gradient_method='bh')
        embeddings = otsne.fit(embeddings)


#         stsne = STSNE()
#         embeddings = stsne.fit_transform(embeddings)
    elif reduction == 'none':
        pass
    else:
        raise ValueError(f"Unknown reduction method: {reduction}")
    return embeddings
Example #10
class TSNEm:
    def __init__(self, n_components=None, random_state=None,
                 initialization="pca", perplexity=30, n_jobs=6):
        self.n_components = n_components
        self.random_state = random_state
        self.tsne = OpenTSNE(n_components=self.n_components,
                             random_state=self.random_state,
                             initialization=initialization,
                             perplexity=perplexity,
                             n_jobs=n_jobs)


    def fit_transform(self, X):
        embeddings = self.tsne.fit(X)
        self.embeddings = embeddings
        return embeddings


    def transform(self, x):
        return self.embeddings.transform(x)
Example #11
        def calculate_dim_red():
            self.embedding_train = None
            sc.pp.highly_variable_genes(self.data, n_top_genes=500)
            sc.pp.pca(self.data, n_comps=self.n_comps, zero_center=True)
            X_pca = self.data.obsm['X_pca']
            tSNE_init = X_pca[:, :2]
            print('feature selection and PCA compression finished ')

            if self.UMAP:
                import umap
                reducer = umap.UMAP(n_components=n_components)
                X_embedded = reducer.fit_transform(X_pca)
                self.results['UMAP1'] = X_embedded[:, 0].tolist()
                if n_components == 2:
                    self.results['UMAP2'] = X_embedded[:, 1].tolist()
                print('UMAP finished')

            if self.tSNE:
                from openTSNE import TSNE
                from openTSNE.callbacks import ErrorLogger

                tsne = TSNE(perplexity=30,
                            callbacks=ErrorLogger(),
                            initialization='pca',
                            random_state=42,
                            early_exaggeration_iter=50,
                            n_components=2)

                embedding_train = tsne.fit(X_pca)
                self.embedding_train = embedding_train
                

                self.results['tSNE1'] = embedding_train.T[0].tolist()
                self.results['tSNE2'] = embedding_train.T[1].tolist()
                print('tSNE finished')
            return self.data, self.results
Example #12
def _tsne_projection(data, num_tsne_components=2, num_pca_components=50):
    pca = PCA(n_components=num_pca_components)  # PCA first to speed up the t-SNE
    pca_data = pca.fit_transform(data)
    tsne = TSNE(n_components=num_tsne_components)
    data_embedded = tsne.fit(pca_data)
    return data_embedded
Example #13
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from openTSNE import TSNE
from openTSNE.callbacks import ErrorLogger
from openTSNE import utils

df = pd.read_csv("train.csv")
df = df[:100]
label = df.label
df.drop("label", axis=1, inplace=True)
standardized_data = StandardScaler().fit_transform(df)
print(standardized_data.shape)

tsne = TSNE(
    perplexity=30,
    metric="euclidean",
    callbacks=ErrorLogger(),
    n_jobs=8,
    random_state=42,
)

embedding_train = tsne.fit(standardized_data)
utils.plot(embedding_train, label, colors=utils.MACOSKO_COLORS)
Example #14
def tsne(X, initialization="pca", **tsne_kwargs):
    """Original openTSNE"""
    tsne = TSNE(
        initialization=initialization, negative_gradient_method="bh", **tsne_kwargs,
    )
    return tsne.fit(X)
Example #15
            legend_kwargs_.update(legend_kwargs)
        ax.legend(handles=legend_handles, **legend_kwargs_)

    matplotlib.pyplot.show()


if __name__ == '__main__':
    data_dir = "D:\\2020BUAA\dataset\JNU"
    pic_data = os.path.join(data_dir, "JNU_data_0-1.pk")

    with open(pic_data, 'rb') as file_1:
        txt_all_data = pickle.load(file_1)

    source_train_X, source_train_y = txt_all_data[0]
    source_val_X, source_val_y = txt_all_data[1]
    target_train_X, target_train_y = txt_all_data[2]
    target_val_X, target_val_y = txt_all_data[3]

    x, y = source_val_X, source_val_y

    tsne = TSNE(
        perplexity=30,
        n_iter=100,
        metric="euclidean",
        callbacks=ErrorLogger(),
        n_jobs=8,
        random_state=42,
    )
    embedding = tsne.fit(x)
    viz_plot(embedding, y, colors=MOUSE_10X_COLORS, draw_centers=False)
Example #16
def make_data_faster(dataset_shortname):
    k_folder = '/home/single_cell_analysis/kallisto_out_single_bustools_dev/kallisto_' + dataset_shortname
    if dataset_shortname in ["pbmc_1k_v3", "pbmc_10k_v3", "neuron_10k_v3"]:
        dataset_shortname = dataset_shortname.split(
            "_")[0] + dataset_shortname.split(
                "_")[1] + "_" + dataset_shortname.split("_")[2]
    c_folder = '/home/single_cell_analysis/cellranger_out/cellranger3_' + dataset_shortname + '_out/outs/filtered_feature_bc_matrix'
    c_raw_folder = '/home/single_cell_analysis/cellranger_out/cellranger3_' + dataset_shortname + '_out/outs/raw_feature_bc_matrix'

    c_raw = anndata.AnnData(
        scipy.io.mmread(os.path.join(c_raw_folder, 'matrix.mtx.gz')).tocsr().T)
    c_barcodes = pd.read_csv(os.path.join(c_raw_folder, 'barcodes.tsv.gz'),
                             index_col=0,
                             header=None,
                             names=['barcode'])
    c_barcodes.index = c_barcodes.index.str.slice(0, 16, 1)
    c_raw.obs = c_barcodes
    c_raw.var = pd.read_csv(os.path.join(c_raw_folder, 'features.tsv.gz'),
                            header=None,
                            index_col=0,
                            names=['ensembl_id', 'gene_name', 'kind'],
                            sep='\t')
    print('Loaded c raw mtx:', c_raw.X.shape)

    del c_barcodes

    # load c filtered matrix
    c = anndata.AnnData(
        scipy.io.mmread(os.path.join(c_folder, 'matrix.mtx.gz')).tocsr().T)
    c_barcodes = pd.read_csv(os.path.join(c_folder, 'barcodes.tsv.gz'),
                             index_col=0,
                             header=None,
                             names=['barcode'])
    c_barcodes.index = c_barcodes.index.str.slice(0, 16, 1)
    c.obs = c_barcodes
    c.var = pd.read_csv(os.path.join(c_folder, 'features.tsv.gz'),
                        header=None,
                        index_col=0,
                        names=['ensembl_id', 'gene_name', 'kind'],
                        sep='\t')
    print('Loaded c filtered mtx:', c.X.shape)

    del c_barcodes

    ## load kallisto raw matrix
    k_raw = anndata.AnnData(
        scipy.io.mmread(os.path.join(k_folder, 'genes.mtx')).tocsr())
    k_raw.obs = pd.read_csv(os.path.join(k_folder, 'genes.barcodes.txt'),
                            index_col=0,
                            header=None,
                            names=['barcode'])
    k_raw.var = pd.read_csv(os.path.join(k_folder, 'genes.genes.txt'),
                            header=None,
                            index_col=0,
                            names=['ensembl_id'],
                            sep='\t')
    print('Loaded k raw mtx:', k_raw.X.shape)

    # truncates the ensembl version number off the kallisto labels
    k_raw.var['full_emsembl_id'] = k_raw.var.index
    k_raw.var.index = k_raw.var['full_emsembl_id'].str.slice(0, 18)

    if dataset_shortname in ['hgmm1k_v2', 'hgmm1k_v3', 'hgmm10k_v3']:
        k_raw.var.index = k_raw.var['full_emsembl_id']

        # do this as late as possible
    k = k_raw[c.obs.index.values]
    print('Loaded k filtered mtx:', k.X.shape)

    c_raw.obs['counts'] = c_raw.X.sum(1)
    c_raw.obs['ngenes'] = np.array((c_raw.X > 0).sum(1))
    c_raw = c_raw[c_raw.obs['counts'] > 0]
    c_raw.layers['log1p'] = np.log1p(c_raw.X)
    c_raw.obs['log10counts'] = np.log10(c_raw.obs['counts'])
    print('Cell Ranger raw:', c_raw.shape)

    # count UMIs, genes, log transform raw kallisto barcodes
    # first remove kallisto barcodes with 0 gene counts

    k_raw.obs['counts'] = k_raw.X.sum(1)
    k_raw.obs['ngenes'] = np.array((k_raw.X > 0).sum(1))
    k_raw = k_raw[k_raw.obs['counts'] > 0]
    k_raw.layers['log1p'] = np.log1p(k_raw.X)
    k_raw.obs['log10counts'] = np.log10(k_raw.obs['counts'])
    print('kallisto raw:', k_raw.shape)

    c.obs['counts'] = c.X.sum(1)
    c.obs['ngenes'] = np.array((c.X > 0).sum(1))
    c = c[c.obs['counts'] > 0]
    c.layers['log1p'] = np.log1p(c.X)
    c.obs['log10counts'] = np.log10(c.obs['counts'])
    print('Cell Ranger filtered:', c.shape)

    # count UMIs, genes, log transform filtered kallisto barcodes
    # first remove kallisto barcodes with 0 gene counts

    k.obs['counts'] = k.X.sum(1)
    k.obs['ngenes'] = np.array((k.X > 0).sum(1))
    k = k[k.obs['counts'] > 0]
    k.layers['log1p'] = np.log1p(k.X)
    k.obs['log10counts'] = np.log10(k.obs['counts'])
    print('kallisto filtered:', k.shape)

    joint_obs = k_raw.obs.join(c_raw.obs,
                               how='outer',
                               lsuffix='-kallisto',
                               rsuffix='-tenx')
    joint_obs = joint_obs.fillna(0)
    print('Total barcodes seen')
    print(len(joint_obs))

    # barcodes seen by both
    common_obs = k_raw.obs.join(c_raw.obs,
                                how='inner',
                                lsuffix='-kallisto',
                                rsuffix='-tenx')
    print('Barcodes seen by both')
    print(len(common_obs))

    kobs = k_raw.obs.join(c_raw.obs,
                          how='left',
                          lsuffix='-kallisto',
                          rsuffix='-tenx')
    kobs = kobs.sort_values(by=['counts-kallisto'], ascending=False)
    print('Barcodes seen by kallisto missed by Cell Ranger')
    print(len(joint_obs) - len(kobs))

    # just Cell Ranger observations
    tobs = c_raw.obs.copy()
    tobs = tobs.sort_values('counts', ascending=False)
    print('Barcodes seen by Cell Ranger missed by kallisto')
    print(len(joint_obs) - len(tobs))

    # ## Compute correlations between kallisto and Cell Ranger
    # handy and fast function for computing correlation on sparse matrices
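    # Row-wise Pearson correlation computed directly on sparse matrices:
    # corr(x, y) = (n*sum(xy) - sum(x)*sum(y)) / (std(x) * std(y)),
    # with std(x) = sqrt(n*sum(x^2) - sum(x)^2), so nothing is densified.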
    def sparse_M_std(X):
        n = X.shape[1]
        return np.sqrt(n * X.multiply(X).sum(1) -
                       np.multiply(X.sum(1), X.sum(1)))

    def sparse_M_corr(X, Y):
        X_std = sparse_M_std(X)
        Y_std = sparse_M_std(Y)
        XY_std = np.multiply(X_std, Y_std)

        n = X.shape[1]
        XY_cov = n * X.multiply(Y).sum(1) - np.multiply(X.sum(1), Y.sum(1))
        R = np.divide(XY_cov, XY_std)
        return np.squeeze(np.asarray(R))

    raw_counts_correlation = sparse_M_corr(
        k_raw[common_obs.index].layers['log1p'],
        c_raw[common_obs.index].layers['log1p'])
    filtered_counts_correlation = sparse_M_corr(
        k_raw[c.obs.index].layers['log1p'], c_raw[c.obs.index].layers['log1p'])
    print('Correlations computed!')

    tsvd = TruncatedSVD(n_components=10)
    TSVD = tsvd.fit_transform(k.layers['log1p'])
    k.obsm['TSVD'] = TSVD
    k.obsm['TSVD']
    print('TSVD variance ratios:\n', list(tsvd.explained_variance_ratio_))
    print(datetime.datetime.now())

    tsvd = TruncatedSVD(n_components=10)
    TSVD = tsvd.fit_transform(c.layers['log1p'])
    c.obsm['TSVD'] = TSVD
    c.obsm['TSVD']
    print('TSVD variance ratios:\n', list(tsvd.explained_variance_ratio_))
    print(datetime.datetime.now())

    print('Calculating L1 distances...')

    # taking manhattan distance between matrices
    dnck = manhattan_distances(c.layers['log1p'], k.layers['log1p'])
    dnkk = manhattan_distances(k.layers['log1p'], k.layers['log1p'])
    print(datetime.datetime.now())

    # nck are the kallisto-cellranger distances
    nck = np.diagonal(dnck)

    # nkk are the kallisto-kallisto distances
    nkk = []
    for row in dnkk:
        val = np.partition(row, 1)[1]
        nkk.append(val)
    print('L1 distances done!')
    print(datetime.datetime.now())

    print('Doing t-SNE')
    print(datetime.datetime.now())
    tsne = TSNE(perplexity=30,
                metric="euclidean",
                callbacks=openTSNE.callbacks.ErrorLogger(),
                n_jobs=8,
                random_state=42,
                n_iter=750)
    k.obsm['TSNE10'] = tsne.fit(k.obsm['TSVD'])
    print('kallisto TSNE-10 done.')
    print(datetime.datetime.now())

    # Perform TSNE on top 10 truncated SVD components of Cell Ranger filtered matrix

    print('Doing t-SNE on top 10 PC for Cell Ranger')
    #
    print(datetime.datetime.now())
    tsne = TSNE(perplexity=30,
                metric="euclidean",
                callbacks=openTSNE.callbacks.ErrorLogger(),
                n_jobs=8,
                random_state=42,
                n_iter=750)
    c.obsm['TSNE10'] = tsne.fit(c.obsm['TSVD'])
    print('Cell Ranger TSNE-10 done.')
    print(datetime.datetime.now())

    c_raw.write(
        os.path.join("./write_data/" + dataset_shortname + '_tenx_raw.h5ad'))
    k_raw.write(
        os.path.join("./write_data/" + dataset_shortname +
                     '_kallisto_raw.h5ad'))
    k.write(
        os.path.join("./write_data/" + dataset_shortname + '_kallisto.h5ad'))
    c.write(os.path.join("./write_data/" + dataset_shortname + '_tenx.h5ad'))

    with open(os.path.join("./write_data/" + dataset_shortname + '_kobs.pkl'),
              'wb') as handle:
        pickle.dump(kobs, handle, protocol=pickle.HIGHEST_PROTOCOL)

    with open(os.path.join("./write_data/" + dataset_shortname + '_tobs.pkl'),
              'wb') as handle:
        pickle.dump(tobs, handle, protocol=pickle.HIGHEST_PROTOCOL)

    with open(
            os.path.join("./write_data/" + dataset_shortname +
                         '_common_obs.pkl'), 'wb') as handle:
        pickle.dump(common_obs, handle, protocol=pickle.HIGHEST_PROTOCOL)

    with open(
            os.path.join("./write_data/" + dataset_shortname +
                         '_joint_obs.pkl'), 'wb') as handle:
        pickle.dump(joint_obs, handle, protocol=pickle.HIGHEST_PROTOCOL)

    with open(os.path.join("./write_data/" + dataset_shortname + '_nkk.pkl'),
              'wb') as handle:
        pickle.dump(nkk, handle, protocol=pickle.HIGHEST_PROTOCOL)

    with open(os.path.join("./write_data/" + dataset_shortname + '_nck.pkl'),
              'wb') as handle:
        pickle.dump(nck, handle, protocol=pickle.HIGHEST_PROTOCOL)

    with open(
            os.path.join("./write_data/" + dataset_shortname +
                         '_raw_counts_correlation.pkl'), 'wb') as handle:
        pickle.dump(raw_counts_correlation,
                    handle,
                    protocol=pickle.HIGHEST_PROTOCOL)

    with open(
            os.path.join("./write_data/" + dataset_shortname +
                         '_filtered_counts_correlation.pkl'), 'wb') as handle:
        pickle.dump(filtered_counts_correlation,
                    handle,
                    protocol=pickle.HIGHEST_PROTOCOL)
Example #17
    'cp_dose',
    'cp_time'
  ]))

n_components = 4
tsne = TSNE(
    n_components=n_components, # https://github.com/pavlin-policar/openTSNE/issues/121
    negative_gradient_method='bh',
    perplexity=30,
    metric='euclidean',
    verbose=True,
    n_jobs=10,
    random_state=42
    )

embedding = tsne.fit(x_train)


# can embed new data:
# embedded_test = embedding.transform(np.array(test_features.drop(columns=[...])))

np.savetxt(f"tsne{n_components}dims.csv", embedding, delimiter=',', header=",".join([f'X{i}' for i in range(embedding.shape[1])]))



# ## Advanced embedding. https://opentsne.readthedocs.io/en/latest/examples/02_advanced_usage/02_advanced_usage.html

# affinities_train = PerplexityBasedNN(
#       x_train,
#       perplexity=30,
#       metric='euclidean',
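
A hedged sketch of the advanced workflow the commented block above points at, following the openTSNE documentation (precomputed affinities plus an explicit initialization); the parameter values are illustrative, not taken from this script:

from openTSNE import TSNEEmbedding, affinity, initialization

affinities_train = affinity.PerplexityBasedNN(
    x_train,
    perplexity=30,
    metric='euclidean',
    n_jobs=10,
    random_state=42,
)
init_train = initialization.pca(x_train, random_state=42)
embedding_train = TSNEEmbedding(init_train, affinities_train,
                                negative_gradient_method='fft')
# Early-exaggeration phase, then the regular optimization phase.
embedding_train = embedding_train.optimize(n_iter=250, exaggeration=12, momentum=0.5)
embedding_train = embedding_train.optimize(n_iter=500, momentum=0.8)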
Example #18
from openTSNE import TSNE
from openTSNE.callbacks import ErrorLogger

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_digits

import matplotlib.pyplot as plt

#%%

x, y = load_digits(return_X_y=True)
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=.33,
                                                    random_state=42)

print("%d training samples" % x_train.shape[0])
print("%d test samples" % x_test.shape[0])

tsne = TSNE(
    perplexity=30,
    metric="euclidean",
    callbacks=ErrorLogger(),
    n_jobs=4,
    random_state=42,
)

embedding_train = tsne.fit(x_train)

embedding_test = embedding_train.transform(x_test)
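# embedding_train is a TSNEEmbedding object, so transform() can place the held-out
# digits into the map that was fitted on the training split.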
Example #19
        validation_posterior = Posterior(vae, validation_cells_dataset, use_cuda=False)
        print(X_.shape)
        result_dict['validation_error'] = validation_posterior.reconstruction_error()

        # Get expression rate parameterization from representation space
        Z_hat = vae.sample_from_posterior_z(torch.from_numpy(cells_dataset.X.toarray()).float())
        Z_hat = np.array(Z_hat.detach()).astype(np.double)

        tsne = TSNE(callbacks=ErrorLogger(),
                    initialization='random',
                    negative_gradient_method='fft',
                    callbacks_every_iters=100,
                    n_iter=2000,
                    neighbors='approx')

        YY = tsne.fit(Z_hat)

        df = pd.DataFrame(index=tmp_adata.obs.index)
        df['ss_depth'] = result_dict['ss_depth']
        df['ss_cells'] = result_dict['ss_cells']
        df['validation_error'] = result_dict['validation_error']
        df['tsne_0'] = YY[:, 0]
        df['tsne_1'] = YY[:, 1]

#         out_file = f'scvi_output_{ds}/{ds}_c{ss_cells}_d{ss_depth}.csv'
#         if not os.path.exists(os.path.dirname(out_file)):
#             os.makedirs(os.path.dirname(out_file))

        df.to_csv(input.SCVI_PARTIAL_SUMMARY)
#         # combines all separate depths into a single csv file
#         all_results = pd.concat(results_list).reset_index()
Example #20
def main():
    parser = argparse.ArgumentParser()
    ## Required parameters
    ###############
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )
    parser.add_argument("--pretrain_model",
                        default='bert-base-uncased',
                        type=str,
                        required=True,
                        help="Pre-trained model")
    parser.add_argument("--num_labels_task",
                        default=None,
                        type=int,
                        required=True,
                        help="num_labels_task")
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_train",
                        default=False,
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        default=False,
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--do_lower_case",
        default=False,
        action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--eval_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        default=False,
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument(
        '--fp16',
        default=False,
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument("--weight_decay",
                        default=0.0,
                        type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument(
        '--fp16_opt_level',
        type=str,
        default='O1',
        help=
        "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument("--task",
                        default=2,
                        type=int,
                        required=True,
                        help="Choose Task")
    parser.add_argument("--choose_eval_test_both",
                        default=2,
                        type=int,
                        help="choose test dev both")
    ###############

    args = parser.parse_args()
    #print(args.do_train, args.do_eval)
    #exit()

    processors = Processor_1

    num_labels = args.num_labels_task

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
        print(n_gpu)
        print(device)
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {}, n_gpu: {}, distributed training: {}, 16-bits training: {}"
        .format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    #args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")
    '''
    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    '''
    os.makedirs(args.output_dir, exist_ok=True)

    tokenizer = RobertaTokenizer.from_pretrained(args.pretrain_model)

    train_examples = None
    num_train_steps = None
    aspect_list = None
    sentiment_list = None
    processor = processors()
    num_labels = num_labels
    #train_examples, aspect_list, sentiment_list = processor.get_train_examples(args.data_dir)

    filenames = os.listdir(args.output_dir)
    filenames = [x for x in filenames if "pytorch_model.bin_test_best" in x]
    print(filenames)

    file_mark = []
    #model_performace_dev = dict()
    model_performace_test = dict()
    for x in filenames:
        ###
        #eval:0 #test:1
        if args.choose_eval_test_both == 0:
            file_mark.append([x, True])
        elif args.choose_eval_test_both == 1:
            file_mark.append([x, False])
        else:
            file_mark.append([x, True])
            file_mark.append([x, False])

    ####
    ####
    train_examples, aspect_list, sentiment_list = processor.get_test_examples(
        args.data_dir)
    test_examples, _, _ = processor.get_test_examples(args.data_dir)
    #eval_examples, _, _ = processor.get_dev_examples(args.data_dir)
    if args.task == 1:
        num_labels = len(aspect_list)
    elif args.task == 2:
        num_labels = len(sentiment_list)
    else:
        print("What's task?")
        exit()
    test = convert_examples_to_features(test_examples, aspect_list,
                                        sentiment_list, args.max_seq_length,
                                        tokenizer, args.task)

    #dev = convert_examples_to_features(
    #eval_examples, aspect_list, sentiment_list, args.max_seq_length, tokenizer, args.task)
    ###

    for x, mark in file_mark:
        #mark: eval-True; test-False
        #choose_eval_test_both: eval-0, test-1, both-2
        if mark == True:  #dev
            continue
        print(x, mark)
        output_model_file = os.path.join(args.output_dir, x)

        #model = RobertaForSequenceClassification.from_pretrained(args.pretrain_model, num_labels=num_labels, output_hidden_states=False, output_attentions=False, return_dict=True)
        model = RobertaForMaskedLMDomainTask.from_pretrained(
            args.pretrain_model,
            output_hidden_states=False,
            output_attentions=False,
            return_dict=True,
            num_labels=args.num_labels_task)
        model.load_state_dict(torch.load(output_model_file), strict=False)
        #strict False: ignore non-matching keys
        model.to(device)

        #######################################
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        #no_decay = ['bias', 'LayerNorm.weight']
        no_grad = [
            'bert.encoder.layer.11.output.dense_ent',
            'bert.encoder.layer.11.output.LayerNorm_ent'
        ]
        param_optimizer = [(n, p) for n, p in param_optimizer
                           if not any(nd in n for nd in no_grad)]
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            args.weight_decay
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          eps=args.adam_epsilon)
        #scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=int(t_total*0.1), num_training_steps=t_total)
        if args.fp16:
            try:
                from apex import amp
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
                )
                exit()

            model, optimizer = amp.initialize(model,
                                              optimizer,
                                              opt_level=args.fp16_opt_level)

        # multi-gpu training (should be after apex fp16 initialization)
        if n_gpu > 1:
            model = torch.nn.DataParallel(model)

        # Distributed training (should be after apex fp16 initialization)
        if args.local_rank != -1:
            model = torch.nn.parallel.DistributedDataParallel(
                model,
                device_ids=[args.local_rank],
                output_device=args.local_rank,
                find_unused_parameters=True)
        #######################################

        #param_optimizer = [para[0] for para in model.named_parameters()]
        #param_optimizer = [para for para in model.named_parameters()][-2]
        #print(param_optimizer)

        if mark:
            eval_features = dev
            print(0)
        else:
            eval_features = test
            print(1)

        logger.info("***** Running evaluation *****")
        #logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Num examples = %d", len(eval_features))
        logger.info("  Batch size = %d", args.eval_batch_size)

        all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                     dtype=torch.long)
        all_attention_mask = torch.tensor(
            [f.attention_mask for f in eval_features], dtype=torch.long)
        if args.task == 1:
            print("Excuting the task 1")
        elif args.task == 2:
            all_segment_ids = torch.tensor(
                [f.segment_ids for f in eval_features], dtype=torch.long)
        else:
            print("Wrong here2")

        all_label_ids = torch.tensor([f.label_id for f in eval_features],
                                     dtype=torch.long)
        all_aspect_ids = torch.tensor([f.aspect_id for f in eval_features],
                                      dtype=torch.long)

        if args.task == 1:
            eval_data = TensorDataset(all_input_ids, all_attention_mask,
                                      all_label_ids, all_aspect_ids)
        elif args.task == 2:
            eval_data = TensorDataset(all_input_ids, all_attention_mask,
                                      all_segment_ids, all_label_ids,
                                      all_aspect_ids)
        else:
            print("Wrong here1")

        if args.local_rank == -1:
            eval_sampler = RandomSampler(eval_data)
        else:
            eval_sampler = DistributedSampler(eval_data)
        eval_dataloader = DataLoader(eval_data,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)

        if mark:
            output_eval_file = os.path.join(
                args.output_dir,
                "eval_results_{}.txt".format(x.split("_")[-1]))
            output_file_pred = os.path.join(
                args.output_dir, "eval_pred_{}.txt".format(x.split("_")[-1]))
            output_file_glod = os.path.join(
                args.output_dir, "eval_gold_{}.txt".format(x.split("_")[-1]))
        else:
            output_eval_file = os.path.join(
                args.output_dir,
                "test_results_{}.txt".format(x.split("_")[-1]))
            output_file_pred = os.path.join(
                args.output_dir, "test_pred_{}.txt".format(x.split("_")[-1]))
            output_file_glod = os.path.join(
                args.output_dir, "test_gold_{}.txt".format(x.split("_")[-1]))

        fpred = open(output_file_pred, "w")
        fgold = open(output_file_glod, "w")

        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0

        sentiment_map = sorted(list(set(sentiment_list)))
        aspect_map = sorted(list(set(aspect_list)))
        sentiment_map = {label: i for i, label in enumerate(sentiment_map)}
        aspect_map = {label: i for i, label in enumerate(aspect_map)}

        print(sentiment_map)
        print(aspect_map)
        #exit()

        #data_dict = {'laptop':{'negative':[],'neutral':[],'positive':[]},'restaurant':{'negative':[],'neutral':[],'positive':[]}}

        #aspect, sentiment, tensor
        all_aspect_list = list()
        all_sentiment_list = list()
        all_tensor_list = list()

        restaurant_aspect_list = list()
        restaurant_sentiment_list = list()
        restaurant_tensor_list = list()

        laptop_aspect_list = list()
        laptop_sentiment_list = list()
        laptop_tensor_list = list()

        for step, batch in enumerate(tqdm(eval_dataloader, desc="Iteration")):
            #batch = tuple(t.to(device) if i != 3 else t for i, t in enumerate(batch))
            batch = tuple(t.to(device) for i, t in enumerate(batch))

            if args.task == 1:
                input_ids, attention_mask, label_ids, aspect_ids = batch
            elif args.task == 2:
                input_ids, attention_mask, segment_ids, label_ids, aspect_ids = batch
            else:
                print("Wrong here3")

            if args.task == 1:
                #loss, logits, hidden_states, attentions
                '''
                output = model(input_ids=input_ids, token_type_ids=None, attention_mask=attention_mask, labels=label_ids)
                logits = output.logits
                tmp_eval_loss = output.loss
                '''
                #
                #tmp_eval_loss, logits = model(input_ids_org=input_ids, sentence_label=label_ids, attention_mask=attention_mask, func="task_class")
                with torch.no_grad():
                    rep_domain, rep_task = model(input_ids_org=input_ids,
                                                 sentence_label=label_ids,
                                                 attention_mask=attention_mask,
                                                 func="in_domain_task_rep")
                #logits = output.logits
                #tmp_eval_loss = output.loss
            elif args.task == 2:
                #loss, logits, hidden_states, attentions
                '''
                output = model(input_ids=input_ids, token_type_ids=None, attention_mask=attention_mask, labels=label_ids)
                logits = output.logits
                tmp_eval_loss = output.loss
                '''
                #
                with torch.no_grad():
                    rep_domain, rep_task = model(input_ids_org=input_ids,
                                                 sentence_label=label_ids,
                                                 attention_mask=attention_mask,
                                                 func="in_domain_task_rep")
            else:
                print("Wrong!!")

            #print(rep_domain.shape)
            #print(rep_task.shape)
            rep = torch.cat([rep_task, rep_domain], -1).to("cpu")
            #print(rep.shape)

            #label_ids:{'negative': 0, 'neutral': 1, 'positive': 2}
            #aspect_ids:{'laptop': 0, 'restaurant': 1}
            #sentiment_map={"laptop_negative":1,"laptop_neutral":3,"laptop_positive":5, "restaurant_negative":0,"restaurant_neutral":2,"restaurant_positive":4}
            sentiment_map = {
                "l_neg": 1,
                "l_ne": 3,
                "l_pos": 5,
                "Negative": 0,
                "Neutral": 2,
                "Postive": 4
            }
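            # Combined label space used only for plotting: odd ids (1, 3, 5) are laptop
            # sentiments, even ids (0, 2, 4) are restaurant sentiments (see the branches below).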
            #sentiment_map={"laptop_negative":0,"laptop_positive":2, "restaurant_negative":1,"restaurant_positive":3}

            for index, tensor in enumerate(rep):
                #aspect, sentiment, tensor
                #if label_ids[index] == 1: #netural
                #    continue
                if aspect_ids[index] == 0:
                    if label_ids[index] == 0:
                        #data_dict['laptop']['negative'].append(tensor)
                        laptop_sentiment_list.append(torch.tensor(1))
                        all_sentiment_list.append(torch.tensor(1))
                    elif label_ids[index] == 1:
                        #data_dict['laptop']['neutral'].append(tensor)
                        laptop_sentiment_list.append(torch.tensor(3))
                        all_sentiment_list.append(torch.tensor(3))
                    elif label_ids[index] == 2:
                        #data_dict['laptop']['positive'].append(tensor)
                        laptop_sentiment_list.append(torch.tensor(5))
                        all_sentiment_list.append(torch.tensor(5))
                    laptop_aspect_list.append(aspect_ids[index])
                    #laptop_sentiment_list.append(label_ids[index])
                    laptop_tensor_list.append(tensor)
                else:
                    if label_ids[index] == 0:
                        #data_dict['restaurant']['negative'].append(tensor)
                        restaurant_sentiment_list.append(torch.tensor(0))
                        all_sentiment_list.append(torch.tensor(0))
                    elif label_ids[index] == 1:
                        #data_dict['restaurant']['neutral'].append(tensor)
                        restaurant_sentiment_list.append(torch.tensor(2))
                        all_sentiment_list.append(torch.tensor(2))
                    elif label_ids[index] == 2:
                        #data_dict['restaurant']['positive'].append(tensor)
                        restaurant_sentiment_list.append(torch.tensor(4))
                        all_sentiment_list.append(torch.tensor(4))
                    restaurant_aspect_list.append(aspect_ids[index])
                    #restaurant_sentiment_list.append(label_ids[index])
                    restaurant_tensor_list.append(tensor)

                all_aspect_list.append(aspect_ids[index])
                #all_sentiment_list.append(label_ids[index])
                all_tensor_list.append(tensor)

        #########
        laptop_aspect_list = torch.stack(laptop_aspect_list).to("cpu").numpy()
        laptop_sentiment_list = torch.stack(laptop_sentiment_list).to(
            "cpu").numpy()
        laptop_tensor_list = torch.stack(laptop_tensor_list).to("cpu").numpy()

        restaurant_aspect_list = torch.stack(restaurant_aspect_list).to(
            "cpu").numpy()
        restaurant_sentiment_list = torch.stack(restaurant_sentiment_list).to(
            "cpu").numpy()
        restaurant_tensor_list = torch.stack(restaurant_tensor_list).to(
            "cpu").numpy()

        all_aspect_list = torch.stack(all_aspect_list).to("cpu").numpy()
        all_sentiment_list = torch.stack(all_sentiment_list).to("cpu").numpy()
        all_tensor_list = torch.stack(all_tensor_list).to("cpu").numpy()
        #########

        #########
        print(laptop_aspect_list.shape)
        print(laptop_sentiment_list.shape)
        print(laptop_tensor_list.shape)
        print("===")

        print(restaurant_aspect_list.shape)
        print(restaurant_sentiment_list.shape)
        print(restaurant_tensor_list.shape)
        print("===")

        print(all_aspect_list.shape)
        #print(all_sentiment_list)
        print(all_sentiment_list.shape)
        print(all_tensor_list.shape)
        print("===")
        #########

        #with open(args.output_dir+".json", "w") as outfile:
        #    json.dump(data_dict, outfile)
        #####Start to draw########
        #emb = TSNE(n_components=2, perplexity=15, learning_rate=10).fit_transform(all_tensor_list)
        #print(emb.shape)
        '''
         = TSNE(n_components=2, perplexity=15, learning_rate=10).fit_transform(X)
         = TSNE(n_components=2, perplexity=15, learning_rate=10).fit_transform(X)
         = TSNE(n_components=2, perplexity=15, learning_rate=10).fit_transform(X)
         = TSNE(n_components=2, perplexity=15, learning_rate=10).fit_transform(X)
         = TSNE(n_components=2, perplexity=15, learning_rate=10).fit_transform(X)
         = TSNE(n_components=2, perplexity=15, learning_rate=10).fit_transform(X)
        '''
        #tsne = TSNE(perplexity=30,metric="euclidean",callbacks=ErrorLogger(),n_jobs=64,random_state=42)
        '''
        tsne = TSNE(
            perplexity=30,
            n_iter=50,
            metric="euclidean",
            callbacks=ErrorLogger(),
            n_jobs=64,
            random_state=42,
        )
        embedding_train = tsne.fit(all_tensor_list)
        '''

        #plot(all_tensor_list, all_sentiment_list)
        #cosine
        #perplexity
        #400-->1200
        #64
        tsne = TSNE(
            perplexity=64,
            n_iter=1200,
            metric="euclidean",
            callbacks=ErrorLogger(),
            n_jobs=64,
            random_state=42,
            learning_rate='auto',
            initialization='pca',
            n_components=2,
        )
        ###
        #embedding_train = tsne.fit(all_tensor_list)
        #utils_.plot(x=embedding_train, y=all_aspect_list, colors=utils_.MOUSE_10X_COLORS, label_map=aspect_map)
        #utils_.plot(x=embedding_train, y=all_sentiment_list, colors=utils_.MOUSE_10X_COLORS, label_map=sentiment_map)
        ###

        ###
        embedding_train = tsne.fit(restaurant_tensor_list)
        utils_.plot(x=embedding_train,
                    y=restaurant_sentiment_list,
                    colors=utils_.MOUSE_10X_COLORS,
                    label_map=sentiment_map)
        ###

        ###
        #embedding_train = tsne.fit(laptop_tensor_list)
        #utils_.plot(x=embedding_train, y=laptop_sentiment_list, colors=utils_.MOUSE_10X_COLORS, label_map=sentiment_map)
        ###
        #plt.savefig(args.output_dir+'.pdf')
        plt.title("Semi-supervised contrastive learning")
        #plt.title("Fine-tune (Standard)")
        #plt.title("Fine-tune (Few-shot)")
        #plt.title("Supervised contrastive learning")
        #plt.title("Common fine-tuning")
        plt.savefig('output.pdf')
Example #21
        columns=bumon_name[idx_none_target],
        values='n_items',
        aggfunc='sum')\
        .fillna(0)

_df.head()
#%%
user_category_ratio_df = _df.div(_df.sum(axis=1), axis=0).fillna(0)
user_category_ratio_arr = user_category_ratio_df.values
user_category_ratio_arr = np.clip(user_category_ratio_arr, 0,
                                  1).astype(np.float32)

#%%
from openTSNE import TSNE
tsne = TSNE()
embedding = tsne.fit(user_category_ratio_arr)

# %%

vis_x = embedding[:, 0]
vis_y = embedding[:, 1]
max_idx = np.argmax(user_category_ratio_arr, axis=1)
plt.scatter(vis_x,
            vis_y,
            c=max_idx,
            cmap=plt.cm.get_cmap("jet", 124),
            marker='.')
plt.colorbar(ticks=range(124))
plt.clim(-0.5, 123.5)
plt.show()
Example #22
def activations_tsne_plot_save(activations_zero, activations_first,
                               activations_second, activations_third, labels,
                               ds, filename):
    """Compute embeddings using t-SNE and save their plots."""
    tsne = TSNE(
        perplexity=30,
        metric="euclidean",
        n_jobs=8,
        random_state=42,
        verbose=False,
    )

    # embeddings_zero
    print("Learning embeddings for original dataset")
    embeddings_zero = tsne.fit(activations_zero)

    fig, axes = plt.subplots(figsize=(12, 8))
    for i, actual_label in enumerate(ds.classes):
        indices = np.argwhere(labels == i)
        indices = np.squeeze(indices)

        axes.scatter(embeddings_zero[indices, 0],
                     embeddings_zero[indices, 1],
                     label=actual_label,
                     s=12)
        axes.legend(markerscale=3, fontsize=12)

    plt.savefig(filename + "_l0.png")
    plt.close()

    # embeddings_first
    embeddings_first = []
    print("Learning embeddings for first layer")
    for j, acts in enumerate(activations_first):
        embedding = tsne.fit(acts)
        embeddings_first.append(embedding)

        fig, axes = plt.subplots(figsize=(12, 8))
        for i, actual_label in enumerate(ds.classes):
            indices = np.argwhere(labels == i)
            indices = np.squeeze(indices)

            axes.scatter(embedding[indices, 0],
                         embedding[indices, 1],
                         label=actual_label,
                         s=12)
            axes.legend(markerscale=3, fontsize=12)

        plt.savefig(filename + "_l1_f" + str(j) + ".png")
        plt.close()

    # embeddings_second
    embeddings_second = []
    print("Learning embeddings for second layer")
    for j, acts in enumerate(activations_second):
        embedding = tsne.fit(acts)
        embeddings_second.append(embedding)

        fig, axes = plt.subplots(figsize=(12, 8))
        for i, actual_label in enumerate(ds.classes):
            indices = np.argwhere(labels == i)
            indices = np.squeeze(indices)

            axes.scatter(embedding[indices, 0],
                         embedding[indices, 1],
                         label=actual_label,
                         s=12)
            axes.legend(markerscale=3, fontsize=12)

        plt.savefig(filename + "_l2_f" + str(j) + ".png")
        plt.close()

    # embeddings_third
    print("Learning embeddings for third layer")
    embeddings_third = tsne.fit(activations_third)

    fig, axes = plt.subplots(figsize=(12, 8))
    for i, actual_label in enumerate(ds.classes):
        indices = np.argwhere(labels == i)
        indices = np.squeeze(indices)

        axes.scatter(embeddings_third[indices, 0],
                     embeddings_third[indices, 1],
                     label=actual_label,
                     s=12)
        axes.legend(markerscale=3, fontsize=12)

    plt.savefig(filename + "_l3.png")
    plt.close()

    return embeddings_zero, embeddings_first, embeddings_second, embeddings_third
Example #23
y = data["CellType1"].astype(str)

print("Data set contains %d samples with %d features" % x.shape)

x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=.33,
                                                    random_state=42)

print("%d training samples" % x_train.shape[0])
print("%d test samples" % x_test.shape[0])

tsne = TSNE(
    perplexity=30,
    metric="euclidean",
    callbacks=ErrorLogger(),
    n_jobs=8,
    random_state=42,
)

# embedding_train = tsne.fit(x_train)
embedding_test = tsne.fit(x_test)

fig, ax = plt.subplots(figsize=(8, 8))
#utils.plot(embedding_train, y_train, colors=utils.MACOSKO_COLORS, alpha=0.25, ax=ax)
utils.plot(embedding_test,
           y_test,
           colors=utils.MACOSKO_COLORS,
           alpha=0.75,
           ax=ax)