Example #1
0
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer
import phate
import scattertext as st
from scipy.sparse.linalg import svds

# Load the 2012 convention speeches sample corpus and parse each document
# with scattertext's lightweight whitespace tokenizer.
convention_df = st.SampleCorpora.ConventionData2012.get_data()
convention_df['parse'] = convention_df['text'].apply(
    st.whitespace_nlp_with_sentences)
# Build a unigram corpus keyed by party, with stopwords removed.
corpus = (st.CorpusFromParsedDocuments(
    convention_df, category_col='party',
    parsed_col='parse').build().get_stoplisted_unigram_corpus())
# Attach speaker names as per-document metadata.
corpus = corpus.add_doc_names_as_metadata(corpus.get_df()['speaker'])

# TF-IDF-weight the term-document matrix, then embed it in 2-D with PHATE.
# `.T` turns the (n_docs, 2) embedding into two coordinate rows.
embeddings = TfidfTransformer().fit_transform(corpus.get_term_doc_mat())
projection_raw = phate.PHATE().fit_transform(embeddings).T
projection = pd.DataFrame({
    'term': corpus.get_metadata(),
    'x': projection_raw[0],
    'y': projection_raw[1]
}).set_index('term')

# Binary score: 1 for documents in the 'democrat' category, 0 otherwise.
category = 'democrat'
scores = (corpus.get_category_ids() == corpus.get_categories().index(category)
          ).astype(int)
html = st.produce_pca_explorer(corpus,
                               category=category,
                               category_name='Democratic',
                               not_category_name='Republican',
                               metadata=convention_df['speaker'],
                               width_in_pixels=1000,
Example #2
0
def spectral(data, label, gamma_):
    """Spectral-cluster `data` and score against `label` with the ARI.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature matrix, one row per sample.
    label : pandas.DataFrame
        Ground-truth cluster labels, one row per sample.
    gamma_ : float
        If non-zero, run a single "test" clustering with this RBF gamma
        and plot a PHATE embedding colored by the predictions.
        If zero, run a "train" grid search over gamma on 10 random
        subsamples and report the per-subsample best ARI/gamma.

    Notes
    -----
    Assumes `np`, `SpectralClustering`, `adjusted_rand_score`, and `phate`
    are importable in the enclosing module.
    """
    data_shape = data.shape
    label_shape = label.shape

    data = data.to_numpy()
    label = label.to_numpy()

    if gamma_ != 0:  # test: single run with the supplied gamma
        spectral_cluster = SpectralClustering(n_clusters=10, gamma=gamma_)
        y_pred = spectral_cluster.fit_predict(data)

        true = np.squeeze(label)
        ARI = adjusted_rand_score(true, y_pred)

        print('Testing ARI: ', ARI)

        phate_operator = phate.PHATE(t=25)
        spectral_phate = phate_operator.fit_transform(data)
        phate.plot.scatter2d(spectral_phate, c=y_pred)

    else:  # train: grid-search gamma on random subsamples
        index = np.arange(data_shape[0])
        np.random.shuffle(index)

        sample = 10
        subsample_size = int(data_shape[0] / sample)

        ARI_subsample = np.zeros(sample)
        gamma_subsample = np.zeros(sample)

        for i in range(sample):
            print('current:', i)
            start = int(i * subsample_size)
            end = int((i + 1) * subsample_size)

            # subsampling: copy the i-th shuffled slice of rows
            # (`row_idx` instead of `id`, which shadows the builtin)
            selected_data = np.zeros((subsample_size, data_shape[1]))
            selected_label = np.zeros((subsample_size, label_shape[1]))

            for location, row_idx in enumerate(index[start:end]):
                selected_data[location, :] = data[row_idx, :]
                selected_label[location] = label[row_idx]

            # adjusted_rand_score expects a 1-D label vector; squeeze once
            # here instead of redundantly on every gamma iteration.
            selected_label = np.squeeze(selected_label)

            best_ARI_i = 0
            best_gamma = 0
            best_y_pred = np.zeros((subsample_size, label_shape[1]))

            for gamma_value in np.arange(0, 2, 0.2):
                spectral_cluster = SpectralClustering(
                    n_clusters=10, gamma=gamma_value)  # affinity: default 'rbf'
                y_pred = spectral_cluster.fit_predict(selected_data)

                current_ARI = adjusted_rand_score(selected_label, y_pred)

                if current_ARI > best_ARI_i:
                    best_ARI_i = current_ARI
                    best_gamma = gamma_value
                    best_y_pred = y_pred

            ARI_subsample[i] = best_ARI_i
            gamma_subsample[i] = best_gamma

            # plot phate for the last subsample only
            if i == int(sample - 1):
                phate_operator = phate.PHATE(t=25)
                spectral_phate = phate_operator.fit_transform(selected_data)
                phate.plot.scatter2d(spectral_phate, c=best_y_pred)

        print('ARI: ', ARI_subsample)
        print('The average ARI: ', np.average(ARI_subsample))
        print('Gamma: ', gamma_subsample)
        print('The average gamma: ', np.average(gamma_subsample))
Example #3
0
def phate(
    adata: AnnData,
    n_components: int = 2,
    k: int = 5,
    a: int = 15,
    n_landmark: int = 2000,
    t: Union[int, str] = 'auto',
    gamma: float = 1.0,
    n_pca: int = 100,
    knn_dist: str = 'euclidean',
    mds_dist: str = 'euclidean',
    mds: Literal['classic', 'metric', 'nonmetric'] = 'metric',
    n_jobs: Optional[int] = None,
    random_state: Optional[Union[int, RandomState]] = None,
    verbose: Union[bool, int, None] = None,
    copy: bool = False,
    **kwargs,
) -> Optional[AnnData]:
    """\
    PHATE [Moon17]_.

    Potential of Heat-diffusion for Affinity-based Trajectory Embedding (PHATE)
    embeds high dimensional single-cell data into two or three dimensions for
    visualization of biological progressions.

    For more information and access to the object-oriented interface, read the
    `PHATE documentation <https://phate.readthedocs.io/>`__.  For
    tutorials, bug reports, and R/MATLAB implementations, visit the `PHATE
    GitHub page <https://github.com/KrishnaswamyLab/PHATE/>`__. For help
    using PHATE, go `here <https://krishnaswamylab.org/get-help>`__.

    Parameters
    ----------
    adata
        Annotated data matrix.
    n_components
        number of dimensions in which the data will be embedded
    k
        number of nearest neighbors on which to build kernel
    a
        sets decay rate of kernel tails.
        If None, alpha decaying kernel is not used
    n_landmark
        number of landmarks to use in fast PHATE
    t
        power to which the diffusion operator is powered
        sets the level of diffusion. If 'auto', t is selected
        according to the knee point in the Von Neumann Entropy of
        the diffusion operator
    gamma
        Informational distance constant between -1 and 1.
        `gamma=1` gives the PHATE log potential, `gamma=0` gives
        a square root potential.
    n_pca
        Number of principal components to use for calculating
        neighborhoods. For extremely large datasets, using
        n_pca < 20 allows neighborhoods to be calculated in
        log(n_samples) time.
    knn_dist
        recommended values: 'euclidean' and 'cosine'
        Any metric from `scipy.spatial.distance` can be used
        distance metric for building kNN graph
    mds_dist
        recommended values: 'euclidean' and 'cosine'
        Any metric from `scipy.spatial.distance` can be used
        distance metric for MDS
    mds
        Selects which MDS algorithm is used for dimensionality reduction.
    n_jobs
        The number of jobs to use for the computation.
        If `None`, `sc.settings.n_jobs` is used.
        If -1 all CPUs are used. If 1 is given, no parallel computing code is
        used at all, which is useful for debugging.
        For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for
        n_jobs = -2, all CPUs but one are used
    random_state
        Random seed. Defaults to the global `numpy` random number generator
    verbose
        If `True` or an `int`/`Verbosity` ≥ 2/`hint`, print status messages.
        If `None`, `sc.settings.verbosity` is used.
    copy
        Return a copy instead of writing to `adata`.
    kwargs
        Additional arguments to `phate.PHATE`

    Returns
    -------
    Depending on `copy`, returns or updates `adata` with the following fields.

    **X_phate** : `np.ndarray`, (`adata.obs`, shape=[n_samples, n_components], dtype `float`)
        PHATE coordinates of data.

    Examples
    --------
    >>> from anndata import AnnData
    >>> import scanpy.external as sce
    >>> import phate
    >>> tree_data, tree_clusters = phate.tree.gen_dla(
    ...     n_dim=100,
    ...     n_branch=20,
    ...     branch_length=100,
    ... )
    >>> tree_data.shape
    (2000, 100)
    >>> adata = AnnData(tree_data)
    >>> sce.tl.phate(adata, k=5, a=20, t=150)
    >>> adata.obsm['X_phate'].shape
    (2000, 2)
    >>> sce.pl.phate(adata)
    """
    start = logg.info('computing PHATE')
    adata = adata.copy() if copy else adata
    # Resolve verbosity: fall back to the global scanpy setting when `verbose`
    # is None, then map non-bool levels to a bool (level >= 2 means verbose).
    verbosity = settings.verbosity if verbose is None else verbose
    verbose = verbosity if isinstance(verbosity, bool) else verbosity >= 2
    # Default the worker count from the global scanpy settings.
    n_jobs = settings.n_jobs if n_jobs is None else n_jobs
    try:
        # Deferred import: `phate` is an optional dependency of scanpy.
        import phate
    except ImportError:
        raise ImportError(
            'You need to install the package `phate`: please run `pip install '
            '--user phate` in a terminal.')
    # Fit PHATE on the data matrix and embed into `n_components` dimensions.
    X_phate = phate.PHATE(
        n_components=n_components,
        k=k,
        a=a,
        n_landmark=n_landmark,
        t=t,
        gamma=gamma,
        n_pca=n_pca,
        knn_dist=knn_dist,
        mds_dist=mds_dist,
        mds=mds,
        n_jobs=n_jobs,
        random_state=random_state,
        verbose=verbose,
        **kwargs,
    ).fit_transform(adata)
    # update AnnData instance
    adata.obsm['X_phate'] = X_phate  # annotate samples with PHATE coordinates
    logg.info(
        '    finished',
        time=start,
        deep=('added\n'
              "    'X_phate', PHATE coordinates (adata.obsm)"),
    )
    return adata if copy else None
Example #4
0
def phate(adata,
          n_components=2,
          k=5,
          a=15,
          n_landmark=2000,
          t='auto',
          gamma=1,
          n_pca=100,
          knn_dist='euclidean',
          mds_dist='euclidean',
          mds='metric',
          n_jobs=None,
          random_state=None,
          verbose=None,
          copy=False,
          **kwargs):
    """PHATE [Moon17]_.

    Potential of Heat-diffusion for Affinity-based Trajectory Embedding (PHATE)
    embeds high dimensional single-cell data into two or three dimensions for
    visualization of biological progressions.

    For more information and access to the object-oriented interface, read the
    `PHATE documentation <https://phate.readthedocs.io/>`__.  For
    tutorials, bug reports, and R/MATLAB implementations, visit the `PHATE
    GitHub page <https://github.com/KrishnaswamyLab/PHATE/>`__. For help
    using PHATE, go `here <https://krishnaswamylab.org/get-help>`__.

    Parameters
    ----------
    adata : :class:`~anndata.AnnData`
        Annotated data matrix.
    n_components : `int`, optional (default: 2)
        number of dimensions in which the data will be embedded
    k : `int`, optional (default: 5)
        number of nearest neighbors on which to build kernel
    a : `int`, optional (default: 15)
        sets decay rate of kernel tails.
        If None, alpha decaying kernel is not used
    n_landmark : `int`, optional (default: 2000)
        number of landmarks to use in fast PHATE
    t : `int` or 'auto', optional (default: 'auto')
        power to which the diffusion operator is powered
        sets the level of diffusion. If 'auto', t is selected
        according to the knee point in the Von Neumann Entropy of
        the diffusion operator
    gamma : float, optional, default: 1
        Informational distance constant between -1 and 1.
        `gamma=1` gives the PHATE log potential, `gamma=0` gives
        a square root potential.
    n_pca : `int`, optional (default: 100)
        Number of principal components to use for calculating
        neighborhoods. For extremely large datasets, using
        n_pca < 20 allows neighborhoods to be calculated in
        log(n_samples) time.
    knn_dist : string, optional (default: 'euclidean')
        recommended values: 'euclidean' and 'cosine'
        Any metric from `scipy.spatial.distance` can be used
        distance metric for building kNN graph
    mds_dist : string, optional (default: 'euclidean')
        recommended values: 'euclidean' and 'cosine'
        Any metric from `scipy.spatial.distance` can be used
        distance metric for MDS
    mds : {'classic', 'metric', 'nonmetric'}, optional (default: 'metric')
        Selects which MDS algorithm is used for dimensionality reduction
    n_jobs : `int` or `None`, optional (default: `sc.settings.n_jobs`)
        The number of jobs to use for the computation.
        If `None`, `sc.settings.n_jobs` is used.
        If -1 all CPUs are used. If 1 is given, no parallel computing code is
        used at all, which is useful for debugging.
        For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for
        n_jobs = -2, all CPUs but one are used
    random_state : `int`, `numpy.RandomState` or `None`, optional (default: `None`)
        Random seed. Defaults to the global `numpy` random number generator
    verbose : `bool`, `int` or `None`, optional (default: `sc.settings.verbosity`)
        If `True` or an integer `>= 2`, print status messages.
        If `None`, `sc.settings.verbosity` is used.
    copy : `bool` (default: `False`)
        Return a copy instead of writing to `adata`.
    kwargs : additional arguments to `phate.PHATE`

    Returns
    -------
    Depending on `copy`, returns or updates `adata` with the following fields.

    X_phate : `np.ndarray`, (`adata.obs`, shape=[n_samples, n_components], dtype `float`)
        PHATE coordinates of data.

    Examples
    --------
    >>> import scanpy.api as sc
    >>> import phate
    >>> tree_data, tree_clusters = phate.tree.gen_dla(n_dim=100,
                                                      n_branch=20,
                                                      branch_length=100)
    >>> tree_data.shape
    (2000, 100)
    >>> adata = sc.AnnData(tree_data)
    >>> sc.tl.phate(adata, k=5, a=20, t=150)
    >>> adata.obsm['X_phate'].shape
    (2000, 2)
    >>> sc.pl.phate(adata)
    """
    logg.info('computing PHATE', r=True)
    adata = adata.copy() if copy else adata
    # Resolve verbosity only when the caller did not pass `verbose` explicitly.
    # (Previously an explicit `verbose` argument was clobbered whenever
    # `settings.verbosity` was a str/int, because the isinstance check
    # inspected the global setting instead of the resolved value.)
    if verbose is None:
        verbose = (_settings_verbosity_greater_or_equal_than(2)
                   if isinstance(settings.verbosity, (str, int))
                   else settings.verbosity)
    # Default the worker count from the global scanpy settings.
    n_jobs = settings.n_jobs if n_jobs is None else n_jobs
    try:
        # Deferred import: `phate` is an optional dependency.
        import phate
    except ImportError:
        raise ImportError(
            'You need to install the package `phate`: please run `pip install '
            '--user phate` in a terminal.')
    # Fit PHATE on the data matrix and embed into `n_components` dimensions.
    X_phate = phate.PHATE(n_components=n_components,
                          k=k,
                          a=a,
                          n_landmark=n_landmark,
                          t=t,
                          gamma=gamma,
                          n_pca=n_pca,
                          knn_dist=knn_dist,
                          mds_dist=mds_dist,
                          mds=mds,
                          n_jobs=n_jobs,
                          random_state=random_state,
                          verbose=verbose,
                          **kwargs).fit_transform(adata)
    logg.info(
        '    finished',
        time=True,
        end=' ' if _settings_verbosity_greater_or_equal_than(3) else '\n')
    # update AnnData instance
    adata.obsm['X_phate'] = X_phate  # annotate samples with PHATE coordinates
    logg.hint('added\n' '    \'X_phate\', PHATE coordinates (adata.obsm)')
    return adata if copy else None
Example #5
0
def test_simple():
    """Smoke-test PHATE on a synthetic DLA tree: embedding must be (n, 2).

    NOTE(review): everything from the first `if True:` below looks like a
    fragment of an unrelated single-cell analysis script spliced into this
    function — it references names never defined here (`adata`, `pdfp`,
    `start`, `gt`, `sparse`, `rs`, `os`, `datetime`, `time`) and would raise
    NameError if executed. Confirm against the original sources before use.
    """
    tree_data, tree_clusters = phate.tree.gen_dla()
    phate_operator = phate.PHATE(k=15, t=100)
    tree_phate = phate_operator.fit_transform(tree_data)
    # PHATE defaults to a 2-D embedding with one row per input sample.
    assert tree_phate.shape == (tree_data.shape[0], 2)
    if True:
        # save adata obj with batch correction
        adata.write(os.path.join(pdfp, 'mouse_200614.h5ad'))
        print('\n... saved @' +
              datetime.datetime.now().strftime('%y%m%d.%H:%M:%S'))
    print('... sc embeddings in {:.2f}-min'.format((time.time() - start) / 60))

    # compute PHATE
    # Build a graph from the kNN connectivities plus self-loops (identity
    # diagonal), marked as a precomputed adjacency matrix.
    G = gt.Graph(data=adata.uns['neighbors']['connectivities'] +
                 sparse.diags([1] * adata.shape[0], format='csr'),
                 precomputed='adjacency',
                 use_pygsp=True)
    G.knn_max = None

    # Run PHATE directly on the precomputed kernel matrix G.K.
    phate_op = phate.PHATE(knn_dist='precomputed',
                           gamma=0,
                           n_jobs=-1,
                           random_state=rs)
    adata.obsm['X_phate'] = phate_op.fit_transform(G.K)

    if True:
        # save adata obj with batch correction
        adata.write(os.path.join(pdfp, 'mouse_200614.h5ad'))
        print('\n... saved @' +
              datetime.datetime.now().strftime('%y%m%d.%H:%M:%S'))
    print('... full PHATE in {:.2f}-min'.format((time.time() - start) / 60))

    if True:
        # MELD
        # Encode genotype as +1 (SCA1) / -1 (other) for MELD.
        adata.obs['res_sca1'] = [
            1 if i == 'SCA1' else -1 for i in adata.obs['genotype']
        ]
Example #7
0
def run_phate_from_file(
        filename,
        # data loading params
        sparse=True,
        gene_names=None,
        cell_names=None,
        cell_axis=None,
        gene_labels=None,
        allow_duplicates=None,
        genome=None,
        metadata_channels=None,
        # filtering params
        min_library_size=2000,
        min_cells_per_gene=10,
        # normalization params
        library_size_normalize=True,
        transform='sqrt',
        pseudocount=None,
        cofactor=None,
        # kernel params
        knn=5,
        decay=15,
        n_pca=100,
        knn_dist='euclidean',
        n_jobs=1,
        random_state=42,
        verbose=1,
        # phate params
        n_components=2,
        t_phate='auto',
        gamma=1,
        mds_dist='euclidean',
        mds='metric',
        # output params
        output='phate.csv',
        validate=False):
    """Run PHATE on a file

    Parameters
    ----------
    filename : str
        Allowed types: csv, tsv, mtx, hdf5/h5 (10X format),
        directory/zip (10X format)
    sparse : bool (recommended: True for scRNAseq, False for CyTOF)
        Force data sparsity. If `None`, sparsity is determined by data type.
    gene_names : str, list or bool
        Allowed values:
        - if filetype is csv or fcs, `True` says gene names are data
        headers, `str` gives a path to a separate csv or tsv file containing
        gene names, list gives an array of gene names, `False` means
        no gene names are given
        - if filetype is mtx, `str` gives a path to a separate csv or tsv file
        containing gene names, list gives an array of gene names, or `False`
        means no gene names are given
        - if filetype is hdf5, h5, directory or zip, must be `None`.
    cell_names : str, list or bool
        Allowed values:
        - if filetype is csv or fcs, `True` says cell names are data
        headers, `str` gives a path to a separate csv or tsv file containing
        cell names, list gives an array of cell names, `False` means
        no cell names are given
        - if filetype is mtx, `str` gives a path to a separate csv or tsv file
        containing cell names, list gives an array of cell names, or `False`
        means no gene names are given
        - if filetype is hdf5, h5, directory or zip, must be `None`.
    cell_axis : {'row', 'column'}
        States whether cells are on rows or columns. If cell_axis=='row',
        data is of shape [n_cells, n_genes]. If cell_axis=='column', data is of
        shape [n_genes, n_cells]. Only valid for filetype mtx and csv
    gene_labels : {'symbol', 'id', 'both'}
        Choice of gene labels for 10X data. Recommended: 'both'
        Only valid for directory, zip, hdf5, h5
    allow_duplicates : bool
        Allow duplicate gene names in 10X data. Recommended: True
        Only valid for directory, zip, hdf5, h5
    genome : str
        Genome name. Only valid for hdf5, h5
    metadata_channels : list of str (recommended: ['Time', 'Event_length', 'DNA1', 'DNA2', 'Cisplatin', 'beadDist', 'bead1'])
        Names of channels in fcs data which are not real measurements.
        Only valid if datatype is fcs.
    min_library_size : int or `None`, optional (default: 2000)
        Cutoff for library size normalization. If `None`,
        library size filtering is not used
    min_cells_per_gene : int or `None`, optional (default: 10)
        Minimum non-zero cells for a gene to be used. If `None`,
        genes are not removed
    library_size_normalize : `bool`, optional (default: True)
        Use library size normalization
    transform : {'sqrt', 'log', 'arcsinh', None}
        How to transform the data. If `None`, no transformation is done
    pseudocount : float (recommended: 1)
        Number of pseudocounts to add to genes prior to log transformation
    cofactor : float (recommended: 5)
        Factor by which to divide genes prior to arcsinh transformation
    knn : int, optional, default: 5
        number of nearest neighbors on which to build kernel
    decay : int, optional, default: 15
        sets decay rate of kernel tails.
        If None, alpha decaying kernel is not used
    n_pca : int, optional, default: 100
        Number of principal components to use for calculating
        neighborhoods. For extremely large datasets, using
        n_pca < 20 allows neighborhoods to be calculated in
        roughly log(n_samples) time.
    knn_dist : string, optional, default: 'euclidean'
        recommended values: 'euclidean', 'cosine'
        Any metric from `scipy.spatial.distance` can be used
        distance metric for building kNN graph.
    n_jobs : integer, optional, default: 1
        The number of jobs to use for the computation.
        If -1 all CPUs are used. If 1 is given, no parallel computing code is
        used at all, which is useful for debugging.
        For n_jobs below -1, (n_cpus + 1 + n_jobs) are used. Thus for
        n_jobs = -2, all CPUs but one are used
    random_state : integer or numpy.RandomState, optional, default: 42
        The generator used to initialize random PCA
        If an integer is given, it fixes the seed
        Defaults to the global `numpy` random number generator
    verbose : `int` or `boolean`, optional (default: 1)
        If `True` or `> 0`, print status messages
    n_components : int, optional, default: 2
        number of dimensions in which the data will be embedded for PHATE
    mds_dist : string, optional, default: 'euclidean'
        recommended values: 'euclidean' and 'cosine'
        Any metric from `scipy.spatial.distance` can be used
        distance metric for MDS
    mds : string, optional, default: 'metric'
        choose from ['classic', 'metric', 'nonmetric'].
        Selects which MDS algorithm is used for dimensionality reduction
    gamma : float, optional, default: 1
        Informational distance constant between -1 and 1.
        `gamma=1` gives the PHATE log potential, `gamma=0` gives
        a square root potential.
    t_phate : int or 'auto', optional, default: 'auto'
        power to which the diffusion operator is powered for PHATE.
        This sets the level of diffusion. If 'auto', t is selected
        according to the knee point in the Von Neumann Entropy of
        the diffusion operator
    output : str, optional (default: 'phate.csv')
        Output CSV file to save low-dimensional embedding
    validate : bool, optional (default: False)
        If True, compare the saved embedding against a reference CSV fetched
        from the phate-docker repository and assert (near-)equality
    """
    # check arguments
    filetype = check_filetype(filename)
    load_fn, load_kws = check_load_args(filetype,
                                        sparse=sparse,
                                        gene_names=gene_names,
                                        cell_names=cell_names,
                                        cell_axis=cell_axis,
                                        gene_labels=gene_labels,
                                        allow_duplicates=allow_duplicates,
                                        genome=genome,
                                        metadata_channels=metadata_channels)
    transform_fn, transform_kws = check_transform_args(transform=transform,
                                                       pseudocount=pseudocount,
                                                       cofactor=cofactor)

    # set up logging
    # https://github.com/scottgigante/tasklogger
    tasklogger.set_level(verbose)

    # load data
    # example: scprep.io.load_csv("data.csv")
    # https://scprep.readthedocs.io/en/stable/reference.html#module-scprep.io
    tasklogger.log_info("Loading data from {}...".format(filename))
    data = load_fn(filename, **load_kws)
    data = scprep.sanitize.check_numeric(data, copy=True)
    tasklogger.log_info("Loaded {} cells and {} genes.".format(
        data.shape[0], data.shape[1]))

    # filter data
    # https://scprep.readthedocs.io/en/stable/reference.html#module-scprep.filter
    if min_library_size is not None:
        tasklogger.log_info("Filtering cells by library size >= {}...".format(
            min_library_size))
        data = scprep.filter.filter_library_size(data, cutoff=min_library_size)
        tasklogger.log_info("Retained {} cells.".format(data.shape[0]))
    if min_cells_per_gene is not None:
        tasklogger.log_info(
            "Filtering genes by min cells >= {}...".format(min_cells_per_gene))
        data = scprep.filter.filter_rare_genes(data,
                                               min_cells=min_cells_per_gene)
        tasklogger.log_info("Retained {} genes.".format(data.shape[1]))

    # normalize data
    # https://scprep.readthedocs.io/en/stable/reference.html#module-scprep.normalize
    if library_size_normalize:
        tasklogger.log_info("Library size normalizing data...")
        data = scprep.normalize.library_size_normalize(data)

    # transform data
    # example: data = scprep.transform.sqrt(data)
    # https://scprep.readthedocs.io/en/stable/reference.html#module-scprep.transform
    if transform is not None:
        tasklogger.log_info("Applying {} transform...".format(transform))
        data = transform_fn(data, **transform_kws)

    # run PHATE
    # https://phate.readthedocs.io/
    phate_op = phate.PHATE(knn=knn,
                           decay=decay,
                           t=t_phate,
                           n_pca=n_pca,
                           knn_dist=knn_dist,
                           n_jobs=n_jobs,
                           random_state=random_state,
                           verbose=verbose,
                           n_components=n_components,
                           gamma=gamma,
                           mds_dist=mds_dist,
                           mds=mds)
    phate_data = phate_op.fit_transform(data)

    # save as csv
    # Label columns PHATE1..PHATEn; keep the input's row index if it has one.
    phate_data = pd.DataFrame(
        phate_data,
        columns=["PHATE{}".format(i + 1) for i in range(n_components)],
        index=data.index
        if hasattr(data, 'index') else np.arange(phate_data.shape[0]))
    if cell_axis in ['col', 'column']:
        phate_data = phate_data.T
    tasklogger.log_info("Saving data to {}...".format(output))
    phate_data.to_csv(output)
    # NOTE(review): the ".format(output)" below is a no-op — the message
    # contains no placeholder.
    tasklogger.log_info("Complete.".format(output))
    if validate:
        correct_phate_data = scprep.io.load_csv(
            'https://raw.githubusercontent.com/KrishnaswamyLab/phate-docker/'
            'master/phate-validate.csv',
            sparse=False)
        try:
            # Exact match first; fall back to a tight numerical tolerance.
            np.testing.assert_equal(scprep.utils.toarray(phate_data),
                                    scprep.utils.toarray(correct_phate_data))
            tasklogger.log_debug(
                "Validation complete, output is equal to expected")
        except AssertionError:
            np.testing.assert_allclose(
                scprep.utils.toarray(phate_data),
                scprep.utils.toarray(correct_phate_data),
                atol=1e-14)
            tasklogger.log_debug(
                "Validation complete, output is numerically equivalent to expected"
            )
Example #8
0
import scattertext as st
import phate  # was missing: used below in st.CategoryProjector(projector=phate.PHATE())

# Load the Rotten Tomatoes sample corpus and rename the raw category labels.
movie_df = st.SampleCorpora.RottenTomatoes.get_data()
movie_df.category = movie_df.category \
    .apply(lambda x: {'rotten': 'Negative', 'fresh': 'Positive', 'plot': 'Plot'}[x])

# Build a stop-listed unigram corpus with one category per movie.
corpus = st.CorpusFromPandas(
    movie_df,
    category_col='movie_name',
    text_col='text',
    nlp=st.whitespace_nlp_with_sentences
).build().get_stoplisted_unigram_corpus()

# Pair plot of movie categories, projected into 2-D with PHATE.
html = st.produce_pairplot(
    corpus,
    category_projector=st.CategoryProjector(projector=phate.PHATE()),
    metadata=movie_df['category'] + ': ' + movie_df['movie_name'],
    #scaler=st.Scalers.scale_0_to_1,
    #show_halo=False,
    #d3_url_struct=st.D3URLs(
    #    d3_scale_chromatic_url='scattertext/data/viz/scripts/d3-scale-chromatic.v1.min.js',
    #    d3_url='scattertext/data/viz/scripts/d3.min.js'
    #),
    default_to_term_comparison=False
)

file_name = 'movie_pair_plot_phates.html'
# Context manager ensures the file handle is flushed and closed.
with open(file_name, 'wb') as out_file:
    out_file.write(html.encode('utf-8'))
print('./' + file_name)
        embedding = umap.UMAP().fit_transform(subset_sc_df.loc[:,
                                                               morph_features])

        # Combine results with single cell dataframe
        embedding_df = pd.concat(
            [
                subset_sc_df.drop(morph_features,
                                  axis="columns").reset_index(drop=True),
                pd.DataFrame(embedding, columns=["umap_0", "umap_1"]),
            ],
            axis="columns",
        )
        all_sc_umap_embeddings.append(embedding_df.assign(grit_gene=gene))

        # Apply PHATE
        phate_operator = phate.PHATE(n_jobs=-2)
        phate_operator.set_params(decay=20, t="auto", gamma=0, verbose=0)

        Y_phate = phate_operator.fit_transform(
            subset_sc_df.loc[:, morph_features])

        # Combine results with single cell dataframe
        phate_embedding_df = pd.concat(
            [
                subset_sc_df.drop(morph_features,
                                  axis="columns").reset_index(drop=True),
                pd.DataFrame(Y_phate, columns=["phate_0", "phate_1"]),
            ],
            axis="columns",
        )
        all_sc_phate_embeddings.append(
Example #10
0
def PHATE(data, verbose=False, n_jobs=-1, **kwargs):
    """Embed `data` with PHATE and return the coordinates.

    Thin convenience wrapper: quiet by default, all CPUs, and any extra
    keyword arguments are forwarded to `phate.PHATE`.
    """
    operator = phate.PHATE(verbose=verbose, n_jobs=n_jobs, **kwargs)
    return operator.fit_transform(data)
def _save_phate_scatter(ad_filt, color, out_path):
    """Scatter `ad_filt` in its PHATE coordinates, colored by `color`, and
    save the figure as a PNG at `out_path`."""
    fig, ax = plt.subplots(1, 1, figsize=(8, 6))
    ax = sc.pl.scatter(ad_filt,
                       x='PHATE 1',
                       y='PHATE 2',
                       color=color,
                       ax=ax,
                       legend_loc='right margin',
                       show=False)
    ax.set_xticks([])
    ax.set_yticks([])
    plt.tight_layout()
    fig.savefig(out_path,
                format='png',
                dpi=150
                #bbox_inches='tight'
                )


def main():
    """Compute a PHATE embedding per tumor and save scatter plots colored by
    cluster and by each biomarker gene.

    CLI: positional args are <data_root> <cancer_biomarker>
    <comma-separated cell_type_biomarkers>; -o/--out_dir sets the output dir.
    """
    usage = ""  # TODO
    parser = OptionParser(usage=usage)
    parser.add_option("-o", "--out_dir", help="Directory to write output")
    (options, args) = parser.parse_args()

    data_root = args[0]
    cancer_biomarker = args[1]
    cell_type_biomarkers = args[2].split(',')
    out_dir = options.out_dir

    sc.settings.verbosity = 3
    sc.logging.print_versions()
    sc.settings.set_figure_params(dpi=80)

    data = H5COUNTS(join(data_root, DATA_F))
    data.preprocess_data()
    data.add_clustering_results(path=join(data_root, 'interim/'))

    for tumor in TUMORS:
        ad = data.tumor_to_ad[tumor]

        # Keep only cells that received a cluster assignment.
        obs_filt = ad.obs.loc[ad.obs['cluster'].notnull()]
        indices = [int(x) for x in obs_filt.index]
        X_filt = ad.X.iloc[indices]
        X_filt = X_filt.set_index(obs_filt.index)
        ad_filt = AnnData(X=X_filt, obs=obs_filt, var=ad.var)

        # Embed with PHATE and store coordinates + cluster in obs so that
        # sc.pl.scatter can plot them directly.
        phate_operator = phate.PHATE(n_jobs=-2, random_state=1)
        X_phate = phate_operator.fit_transform(ad_filt.X)
        ad_filt.obs = pd.DataFrame(
            data=[[x, y, cluster]
                  for (x, y), cluster in zip(X_phate, ad_filt.obs['cluster'])],
            columns=['PHATE 1', 'PHATE 2', 'cluster'])

        # Color points by cluster
        _save_phate_scatter(
            ad_filt, 'cluster',
            join(out_dir, '{}_color_by_cluster.png'.format(tumor)))

        # Color by genes
        for gene in [cancer_biomarker] + cell_type_biomarkers:
            _save_phate_scatter(
                ad_filt, gene,
                join(out_dir, '{}_color_by_{}.png'.format(tumor, gene)))
Example #12
0
def test_tree():
    """Smoke-test PHATE on a synthetic DLA tree across MDS variants.

    Verifies that classic MDS, metric MDS, and the gamma=0 (square-root
    potential) setting all produce embeddings of shape (n_samples, 2),
    then checks that precomputed distance/affinity inputs reproduce the
    raw-data embeddings up to a small tolerance.
    """
    # generate DLA (diffusion-limited aggregation) tree; the second return
    # value C (branch labels) is unused in this test
    M, C = phate.tree.gen_dla(n_dim=50,
                              n_branch=4,
                              branch_length=50,
                              rand_multiplier=2,
                              seed=37,
                              sigma=4)

    # instantiate phate_operator (n_landmark=None disables landmarking,
    # so the full diffusion operator is used)
    phate_operator = phate.PHATE(
        n_components=2,
        decay=10,
        knn=5,
        t=30,
        mds="classic",
        knn_dist="euclidean",
        mds_dist="euclidean",
        n_jobs=-2,
        n_landmark=None,
        verbose=False,
    )

    # run phate with classic MDS
    print("DLA tree, classic MDS")
    Y_cmds = phate_operator.fit_transform(M)
    assert Y_cmds.shape == (M.shape[0], 2)

    # run phate with metric MDS
    # change the MDS embedding without recalculating diffusion potential
    phate_operator.set_params(mds="metric")
    print("DLA tree, metric MDS (log)")
    Y_mmds = phate_operator.fit_transform(M)
    assert Y_mmds.shape == (M.shape[0], 2)

    # run phate with gamma=0, i.e. metric MDS on the square-root potential
    # (the previous comment said "nonmetric MDS", which did not match the
    # code or the print statement below)
    phate_operator.set_params(gamma=0)
    print("DLA tree, metric MDS (sqrt)")
    Y_sqrt = phate_operator.fit_transform(M)
    assert Y_sqrt.shape == (M.shape[0], 2)

    # precomputed inputs: pairwise distance matrix, and the kernel cached
    # on the operator's graph from the fits above
    D = squareform(pdist(M))
    K = phate_operator.graph.kernel
    # knn_dist="precomputed" leaves PHATE to decide whether the input is a
    # distance or affinity matrix -- presumably from its structure; the
    # explicit variants below pin each interpretation down
    phate_operator.set_params(knn_dist="precomputed",
                              random_state=42,
                              verbose=False)
    phate_precomputed_D = phate_operator.fit_transform(D)
    phate_precomputed_K = phate_operator.fit_transform(K)

    phate_operator.set_params(knn_dist="precomputed_distance")
    phate_precomputed_distance = phate_operator.fit_transform(D)

    phate_operator.set_params(knn_dist="precomputed_affinity")
    phate_precomputed_affinity = phate_operator.fit_transform(K)

    # explicit precomputed variants must agree with the auto-detected ones
    np.testing.assert_allclose(phate_precomputed_K,
                               phate_precomputed_affinity,
                               atol=5e-4)
    np.testing.assert_allclose(phate_precomputed_D,
                               phate_precomputed_distance,
                               atol=5e-4)
    return 0
Example #13
0
             x="Model",
             y=['Accuracy', 'F1', 'Recall', 'Precision'],
             barmode='group',
             height=400)

fig.update_yaxes(title_text="Model Metrics")
fig.update_layout(title_text="Model Performance")
fig.show()
# -

# ### PHATE

# !pip install phate

import phate
# Embed the prepared training features with PHATE for 2-D visualization;
# fixed random_state makes the embedding reproducible
p = phate.PHATE(random_state=42)
X_phate = p.fit_transform(X_train_prepared)
X_phate.shape
fig, ax = plt.subplots(figsize=(6, 4))
# Color each point by its bankruptcy label
phate.plot.scatter2d(p, c=y_train['Bankrupt?'], ax=ax, alpha=0.5)

# +
# Silence noisy warnings emitted during the model fitting below
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

from sklearn.exceptions import ConvergenceWarning
simplefilter("ignore", category=ConvergenceWarning)

# Baseline model comparison on the PHATE coordinates instead of the raw
# feature matrix
models = get_model()
names, results, result_df = bl_performance(X_phate, y_train, models)
# -
Example #14
0
def compute_diffusion_potential(data,
                                N,
                                decay,
                                gamma,
                                knn,
                                landmarks=2000,
                                n_jobs=10,
                                random_state=None):
    """Compute a PCA-compressed diffusion potential of ``data`` via PHATE.

    Fits a PHATE operator (without PCA preprocessing, ``n_pca=None``) to
    ``data``, reduces the resulting diffusion potential to 25 principal
    components, and keeps only the components explaining more than 1% of
    the total variance.

    Parameters
    ----------
    data : array-like, shape=[n_samples, n_features]
        Input data passed to ``phate.PHATE.fit``.
    N : unused
        Kept for interface compatibility; not read by this function.
    decay : int or None
        Alpha-decay parameter forwarded to ``phate.PHATE``.
    gamma : float
        Informational-distance constant forwarded to ``phate.PHATE``.
    knn : int
        Number of nearest neighbors forwarded to ``phate.PHATE``.
    landmarks : int or None, optional, default: 2000
        Number of landmarks for PHATE. Disabled (set to None) when it
        exceeds the number of samples, where it could only hurt.
    n_jobs : int, optional, default: 10
        Number of parallel jobs forwarded to ``phate.PHATE``.
    random_state : integer or numpy.RandomState, optional, default: None
        The generator used to initialize PHATE and PCA.
        If an integer is given, it fixes the seed.
        Defaults to the global `numpy` random number generator

    Returns
    -------
    tuple of (numpy.ndarray, phate.PHATE, sklearn.decomposition.PCA)
        The variance-filtered PCA projection of the diffusion potential,
        the fitted PHATE operator, and the fitted PCA model.
    """
    with tasklogger.log_task("diffusion potential"):

        # Landmarking is only a speed-up; disable it when more landmarks
        # than samples are requested.
        if landmarks is not None and landmarks > data.shape[0]:
            landmarks = None

        diff_op = phate.PHATE(
            verbose=False,
            n_landmark=landmarks,
            decay=decay,
            gamma=gamma,
            n_pca=None,
            knn=knn,
            n_jobs=n_jobs,
            random_state=random_state,
        )
        diff_op.fit(data)

        pca = sklearn.decomposition.PCA(n_components=25,
                                        random_state=random_state)
        diff_potential_pca = pca.fit_transform(diff_op.diff_potential)

    # Keep only the components that explain more than 1% of the variance.
    informative = (pca.explained_variance_ /
                   np.sum(pca.explained_variance_) > 0.01)
    return (
        diff_potential_pca[:, informative],
        diff_op,
        pca,
    )
Example #15
0
def PHATE(X, *args, is_graph=False, knn_dist='euclidean', verbose=0, **kwargs):
    """Embed ``X`` with PHATE and return the embedding.

    When ``knn_dist`` is explicitly passed as None and ``X`` is a graph
    (``is_graph=True``), the kernel is treated as precomputed.
    """
    if knn_dist is None and is_graph:
        knn_dist = 'precomputed'
    operator = phate.PHATE(*args, knn_dist=knn_dist, verbose=verbose, **kwargs)
    return operator.fit_transform(X)
Example #16
0
def run_phate(
        filename,
        # data loading params
        sparse=None,
        gene_names=None,
        cell_names=None,
        cell_axis=None,
        delimiter=None,
        gene_labels=None,
        allow_duplicates=None,
        genome=None,
        metadata_channels=None,
        # filtering params
        min_library_size=2000,
        min_cells_per_gene=10,
        # normalization params
        library_size_normalize=True,
        transform='sqrt',
        pseudocount=None,
        cofactor=None,
        **phate_kws):
    """Run PHATE on a file

    Parameters
    ----------
    filename : str
        Allowed types: csv, tsv, mtx, hdf5/h5 (10X format),
        directory/zip (10X format)
    sparse : bool (recommended: True for scRNAseq, False for CyTOF)
        Force data sparsity. If `None`, sparsity is determined by data type.
    gene_names : str, list or bool
        Allowed values:
        - if filetype is csv or fcs, `True` says gene names are data
        headers, `str` gives a path to a separate csv or tsv file containing
        gene names, list gives an array of gene names, `False` means
        no gene names are given
        - if filetype is mtx, `str` gives a path to a separate csv or tsv file
        containing gene names, list gives an array of gene names, or `False`
        means no gene names are given
        - if filetype is hdf5, h5, directory or zip, must be `None`.
    cell_names : str, list or bool
        Allowed values:
        - if filetype is csv or fcs, `True` says cell names are data
        headers, `str` gives a path to a separate csv or tsv file containing
        cell names, list gives an array of cell names, `False` means
        no cell names are given
        - if filetype is mtx, `str` gives a path to a separate csv or tsv file
        containing cell names, list gives an array of cell names, or `False`
        means no gene names are given
        - if filetype is hdf5, h5, directory or zip, must be `None`.
    cell_axis : {'row', 'column'}
        States whether cells are on rows or columns. If cell_axis=='row',
        data is of shape [n_cells, n_genes]. If cell_axis=='column', data is of
        shape [n_genes, n_cells]. Only valid for filetype mtx and csv
    gene_labels : {'symbol', 'id', 'both'}
        Choice of gene labels for 10X data. Recommended: 'both'
        Only valid for directory, zip, hdf5, h5
    allow_duplicates : bool
        Allow duplicate gene names in 10X data. Recommended: True
        Only valid for directory, zip, hdf5, h5
    genome : str
        Genome name. Only valid for hdf5, h5
    metadata_channels : list of str (recommended: ['Time', 'Event_length', 'DNA1', 'DNA2', 'Cisplatin', 'beadDist', 'bead1'])
        Names of channels in fcs data which are not real measurements.
        Only valid if datatype is fcs.
    min_library_size : int or `None`, optional (default: 2000)
        Cutoff for library size normalization. If `None`,
        library size filtering is not used
    min_cells_per_gene : int or `None`, optional (default: 10)
        Minimum non-zero cells for a gene to be used. If `None`,
        genes are not removed
    library_size_normalize : `bool`, optional (default: True)
        Use library size normalization
    transform : {'sqrt', 'log', 'arcsinh', None}
        How to transform the data. If `None`, no transformation is done
    pseudocount : float (recommended: 1)
        Number of pseudocounts to add to genes prior to log transformation
    cofactor : float (recommended: 5)
        Factor by which to divide genes prior to arcsinh transformation
    **phate_kws : keyword arguments for PHATE

    Returns
    -------
    phate_data : array-like
        The PHATE embedding of the loaded, filtered and transformed data
    phate_op : phate.PHATE
        The fitted PHATE operator
    """
    # check arguments
    if os.path.isdir(filename):
        filetype = 'dir'
    elif os.path.isfile(filename):
        filetype = filename.split('.')[-1]
    else:
        raise RuntimeError("file {} not found".format(filename))

    # Map every loader-related argument name to the value the caller
    # passed. This replaces the previous eval(arg) lookups, which were
    # fragile and an unsafe pattern in general.
    # NOTE(review): 'delimiter' is validated but never placed in any
    # load_kws below, so passing it always fails validation -- confirm
    # whether load_csv/load_tsv should receive it.
    load_args = {
        'gene_names': gene_names,
        'cell_names': cell_names,
        'cell_axis': cell_axis,
        'delimiter': delimiter,
        'sparse': sparse,
        'gene_labels': gene_labels,
        'allow_duplicates': allow_duplicates,
        'metadata_channels': metadata_channels,
    }
    if filetype == 'zip':
        load_fn = scpreprocess.io.load_10X_zip
        load_kws = {
            'sparse': sparse,
            'gene_labels': gene_labels,
            'allow_duplicates': allow_duplicates
        }
    elif filetype == 'dir':
        load_fn = scpreprocess.io.load_10X
        load_kws = {
            'sparse': sparse,
            'gene_labels': gene_labels,
            'allow_duplicates': allow_duplicates
        }
    elif filetype in ['hdf5', 'h5']:
        load_fn = scpreprocess.io.load_10X_HDF5
        load_kws = {
            'sparse': sparse,
            'gene_labels': gene_labels,
            'allow_duplicates': allow_duplicates,
            'genome': genome
        }
    elif filetype == 'tsv':
        load_fn = scpreprocess.io.load_tsv
        load_kws = {
            'sparse': sparse,
            'gene_names': gene_names,
            'cell_names': cell_names,
            'cell_axis': cell_axis
        }
    elif filetype == 'csv':
        load_fn = scpreprocess.io.load_csv
        load_kws = {
            'sparse': sparse,
            'gene_names': gene_names,
            'cell_names': cell_names,
            'cell_axis': cell_axis
        }
    elif filetype == 'mtx':
        load_fn = scpreprocess.io.load_mtx
        load_kws = {
            'sparse': sparse,
            'gene_names': gene_names,
            'cell_names': cell_names,
            'cell_axis': cell_axis
        }
    elif filetype == 'fcs':
        load_fn = scpreprocess.io.load_fcs
        load_kws = {
            'sparse': sparse,
            'gene_names': gene_names,
            'cell_names': cell_names,
            'metadata_channels': metadata_channels
        }
    else:
        raise RuntimeError("filetype {} not recognized. Expected 'csv', "
                           "'tsv', 'mtx', 'zip', 'hdf5', 'h5', 'fcs' or a "
                           "directory".format(filetype))
    # Arguments used by the selected loader must be supplied; all others
    # must be left at None. 'sparse' is optional for every loader.
    for arg, value in load_args.items():
        if arg == 'sparse':
            # allow None: sparsity is auto-detected from the data type
            continue
        elif arg in load_kws:
            assert value is not None, \
                "Expected {} not None for filetype {}".format(arg, filetype)
        else:
            assert value is None, \
                "Expected {} to be None for filetype {}. Got {}".format(
                    arg, filetype, value)

    transform_args = {'pseudocount': pseudocount, 'cofactor': cofactor}
    if transform == 'sqrt':
        transform_fn = scpreprocess.transform.sqrt_transform
        transform_kws = {}
    elif transform == 'log':
        transform_fn = scpreprocess.transform.log_transform
        # Bug fix: pseudocount belongs to the log transform (per this
        # function's own docstring); it was previously swapped with
        # arcsinh's cofactor.
        transform_kws = {'pseudocount': pseudocount}
    elif transform == 'arcsinh':
        transform_fn = scpreprocess.transform.arcsinh_transform
        transform_kws = {'cofactor': cofactor}
    elif transform is None:
        # no transformation: transform_fn is never used below
        transform_kws = {}
    else:
        raise RuntimeError("transformation {} not recognized. "
                           "Choose from ['sqrt', 'log', 'arcsinh', "
                           "None]".format(transform))
    for arg, value in transform_args.items():
        if arg in transform_kws:
            assert value is not None, \
                "Expected {} not None for {} transformation".format(
                    arg, transform)
        else:
            assert value is None, \
                "Expected {} to be None for {} transformation. Got {}".format(
                    arg, transform, value)

    # load, filter, normalize and transform the data
    data = load_fn(filename, **load_kws)
    if min_library_size is not None:
        data = scpreprocess.filter.filter_library_size(data,
                                                       cutoff=min_library_size)
    if min_cells_per_gene is not None:
        data = scpreprocess.filter.remove_rare_genes(data,
                                                     cutoff=min_cells_per_gene)
    if library_size_normalize:
        data = scpreprocess.normalize.library_size_normalize(data)
    if transform is not None:
        data = transform_fn(data, **transform_kws)

    phate_op = phate.PHATE(**phate_kws)

    phate_data = phate_op.fit_transform(data)
    return phate_data, phate_op
        os.path.abspath(os.sep),
        "data",
        "lab",
        "DataSets",
        "Krause_2018_primary_parathyroid_adenoma",
        "ParaY9_HHT_cellranger",
        "filtered_gene_bc_matrices_h5.h5",  # raw_gene_bc_matrices
    ),
    gene_labels='both',
    allow_duplicates=True)

# Standard scRNA-seq preprocessing: rare-gene filtering, library-size
# normalization, variance-stabilizing square-root transform
data = scprep.filter.remove_rare_genes(data, min_cells=3)
data = scprep.normalize.library_size_normalize(data)
data = scprep.transform.sqrt(data)

# 2-D PHATE embedding, saved to disk for later plotting
ph = phate.PHATE(n_components=2)
phate_data = ph.fit_transform(data)
np.save("{}Phate2d.npy".format(data_name), phate_data)

# Re-embed in 3-D; transform() with no argument re-uses the diffusion
# potential already computed during the 2-D fit
ph.set_params(n_components=3)
phate3_data = ph.transform()
np.save("{}Phate3d.npy".format(data_name), phate3_data)

# MAGIC imputation; the imputed matrix is discarded -- presumably only
# the fitted operator/graph is needed downstream (TODO confirm)
mg = magic.MAGIC()
_ = mg.fit_transform(data)

# reduce memory footprint: drop the large cached matrices held by the
# MAGIC graph before keeping the operator around
del mg.graph.data
del mg.graph.data_nu
del mg.graph._kernel
del mg.graph._diff_op