Example #1
    def download(self, fpath: Optional[PathLike] = None, **kwargs: Any) -> Any:
        """Download the dataset into ``fpath``."""
        fpath = str(self.path if fpath is None else fpath)
        if not fpath.endswith(self._extension):
            fpath += self._extension

        if os.path.isfile(fpath):
            logg.debug(f"Loading dataset `{self.name}` from `{fpath}`")
        else:
            logg.debug(
                f"Downloading dataset `{self.name}` from `{self.url}` as `{fpath}`"
            )

        dirname = Path(fpath).parent
        try:
            if not dirname.is_dir():
                logg.info(f"Creating directory `{dirname}`")
                dirname.mkdir(parents=True, exist_ok=True)
        except OSError as e:
            logg.error(f"Unable to create directory `{dirname}`. Reason `{e}`")

        data = self._download(fpath=fpath, backup_url=self.url, **kwargs)

        if self.shape is not None and data.shape != self.shape:
            raise ValueError(
                f"Expected the data to have shape `{self.shape}`, found `{data.shape}`."
            )

        return data
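The path handling in this snippet (append the missing extension, then make sure the parent directory exists before downloading) is a reusable pattern on its own. A minimal self-contained sketch, with a hypothetical helper name and default extension:

from pathlib import Path


def _resolve_download_path(fpath: str, extension: str = ".h5ad") -> str:
    # Hypothetical helper mirroring the pattern above: append the expected
    # extension and make sure the parent directory exists before downloading.
    if not fpath.endswith(extension):
        fpath += extension
    Path(fpath).parent.mkdir(parents=True, exist_ok=True)
    return fpath


# e.g. _resolve_download_path("/tmp/datasets/pbmc") -> "/tmp/datasets/pbmc.h5ad"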
Example #2
    def compute_distances(self) -> sp.csr_matrix:
        """Compute the distances between clonotypes. `prepare` must have
        been ran previously. Returns a clonotype x clonotype sparse
        distance matrix."""
        start = logging.info(
            "Computing clonotype x clonotype distances.")  # type: ignore
        n_clonotypes = self.clonotypes.shape[0]

        # only use multiprocessing for sufficiently large datasets
        # for small datasets the overhead is too large for a benefit
        if self.n_jobs == 1 or n_clonotypes <= 2 * self.chunksize:
            dist_rows = tqdm(
                (self._dist_for_clonotype(i) for i in range(n_clonotypes)),
                total=n_clonotypes,
            )
        else:
            logging.info(
                "NB: Computation happens in chunks. The progressbar only advances "
                "when a chunk has finished. ")  # type: ignore

            dist_rows = process_map(
                self._dist_for_clonotype,
                range(n_clonotypes),
                max_workers=self.n_jobs
                if self.n_jobs is not None else cpu_count(),
                chunksize=2000,
                tqdm_class=tqdm,
            )

        dist = sp.vstack(dist_rows)
        dist.eliminate_zeros()
        logging.hint("Done computing clonotype x clonotype distances. ",
                     time=start)
        return dist  # type: ignore
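The dispatch between a plain generator and `process_map` is what avoids process-pool overhead for small inputs while still parallelizing large ones. A standalone sketch of that pattern, assuming a picklable top-level worker function (`_row` is a stand-in, not part of the original class):

from multiprocessing import cpu_count

from tqdm.auto import tqdm
from tqdm.contrib.concurrent import process_map


def _row(i):
    # Stand-in for the per-clonotype worker (e.g. `_dist_for_clonotype`);
    # it must be a picklable top-level function for `process_map`.
    return i * i


def compute_rows(n_rows, n_jobs=None, chunksize=2000):
    # Small inputs: a serial generator wrapped in tqdm avoids pool start-up overhead.
    if n_jobs == 1 or n_rows <= 2 * chunksize:
        return list(tqdm((_row(i) for i in range(n_rows)), total=n_rows))
    # Large inputs: fan out in chunks; the bar only advances per finished chunk.
    return process_map(
        _row,
        range(n_rows),
        max_workers=n_jobs if n_jobs is not None else cpu_count(),
        chunksize=chunksize,
    )


if __name__ == "__main__":
    print(compute_rows(10, n_jobs=1)[:5])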
Example #3
    def add_image(self, layer: str) -> bool:
        """
        Add a new :mod:`napari` image layer.

        Parameters
        ----------
        layer
            Layer in the underlying :class:`ImageContainer` which contains the image.

        Returns
        -------
        `True` if the layer has been added, otherwise `False`.
        """
        if layer in self.view.layernames:
            self._handle_already_present(layer)
            return False

        img: np.ndarray = self.model.container.data[layer].transpose(
            "y", "x", ...).values
        if img.shape[-1] > 4:
            logg.warning(f"Unable to show image of shape `{img.shape}`")
            return False

        logg.info(f"Creating image `{layer}` layer")
        self.view.viewer.add_image(
            img_as_float(img),
            name=layer,
            rgb=True,
            colormap=self.model.cmap,
            blending=self.model.blending,
        )

        return True
Example #4
    def compute_distances(
        self, n_jobs: Union[int, None] = None,
    ):
        """Computes the distances between CDR3 sequences 

        Parameters
        ----------
        j_jobs
            Number of CPUs to use for alignment and levenshtein distance. 
            Default: use all CPUS. 
        """
        for arm, arm_dict in self.index_dict.items():
            arm_dict["dist_mat"] = tcr_dist(
                arm_dict["unique_seqs"],
                metric=self.metric,
                cutoff=self.cutoff,
                n_jobs=n_jobs,
            )
            logging.info("Finished computing {} pairwise distances.".format(arm))

        coords, values = zip(*self._cell_dist_mat_reduce())
        rows, cols = zip(*coords)
        dist_mat = coo_matrix(
            (values, (rows, cols)), shape=(self.adata.n_obs, self.adata.n_obs)
        )
        logging.info("Finished constructing cell x cell distance matrix. ")
        dist_mat.eliminate_zeros()
        self._dist_mat = dist_mat.tocsr()
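The `coo_matrix((values, (rows, cols)), shape=...)` constructor is what turns the streamed `(coords, value)` pairs into a cell x cell matrix. A toy sketch with made-up pairs:

from scipy.sparse import coo_matrix

# Toy (row, col) -> distance pairs, standing in for `_cell_dist_mat_reduce()` output.
pairs = [((0, 1), 2.0), ((1, 3), 1.0), ((2, 2), 1.0)]

coords, values = zip(*pairs)
rows, cols = zip(*coords)
dist_mat = coo_matrix((values, (rows, cols)), shape=(4, 4)).tocsr()
dist_mat.eliminate_zeros()  # drop explicitly stored zeros before downstream use
print(dist_mat.toarray())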
Example #5
    def __init__(
        self,
        adata: AnnData,
        *,
        metric: Union[Literal["alignment", "identity", "levenshtein"],
                      DistanceCalculator] = "identity",
        cutoff: float = 10,
        receptor_arms: Literal["TRA", "TRB", "all", "any"] = "all",
        dual_tcr: Literal["primary_only", "all", "any"] = "primary_only",
        sequence: Literal["aa", "nt"] = "aa",
    ):
        """Class to compute Neighborhood graphs of CDR3 sequences. 

        For documentation of the parameters, see :func:`tcr_neighbors`. 
        """
        start = logging.info("Initializing TcrNeighbors object...")
        if metric == "identity" and cutoff != 0:
            raise ValueError("Identity metric only works with cutoff == 0")
        if metric != "identity" and cutoff == 0:
            logging.warning(f"Running with {metric} metric, but cutoff == 0.")
        if sequence == "nt" and metric == "alignment":
            raise ValueError(
                "Using nucleotide sequences with alignment metric is not supported. "
            )
        self.adata = adata
        self.metric = metric
        self.cutoff = cutoff
        self.receptor_arms = receptor_arms
        self.dual_tcr = dual_tcr
        self.sequence = sequence
        self._build_index_dict()
        self._dist_mat = None
        logging.info("Finished initalizing TcrNeighbors object. ", time=start)
Example #6
    def filter_low(self, value):
        if value is True:
            self.data_df = self.palantir.preprocess.filter_counts_data(self.data_df)
            adata.uns['palantir_norm_data'] = self.data_df
            logg.info('data filtered for low counts:\n\t'
                      'cell_min_molecules=1000\n\tgenes_min_cells=10',
                      r=True)
Example #7
def test_timing(monkeypatch, capsys, logging_state):
    s.logfile = sys.stderr
    counter = 0

    class IncTime:
        @staticmethod
        def now(tz):
            nonlocal counter
            counter += 1
            return datetime(2000,
                            1,
                            1,
                            second=counter,
                            microsecond=counter,
                            tzinfo=tz)

    monkeypatch.setattr(l, 'datetime', IncTime)
    s.verbosity = Verbosity.debug

    l.hint('1')
    assert counter == 1 and capsys.readouterr().err == '--> 1\n'
    start = l.info('2')
    assert counter == 2 and capsys.readouterr().err == '2\n'
    l.hint('3')
    assert counter == 3 and capsys.readouterr().err == '--> 3\n'
    l.info('4', time=start)
    assert counter == 4 and capsys.readouterr().err == '4 (0:00:02)\n'
    l.info('5 {time_passed}', time=start)
    assert counter == 5 and capsys.readouterr().err == '5 0:00:03\n'
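The test relies on replacing the module-level `datetime` with a fake clock that advances one second per call, so elapsed-time output is deterministic. A sketch of the same trick against a hypothetical `mymodule` that calls `datetime.now(tz)` internally:

from datetime import datetime, timezone

import mymodule  # hypothetical module that calls `mymodule.datetime.now(tz)` internally


def test_deterministic_clock(monkeypatch):
    counter = 0

    class IncTime:
        # Each call advances the fake clock by one second, so elapsed-time
        # output is predictable regardless of real wall-clock time.
        @staticmethod
        def now(tz):
            nonlocal counter
            counter += 1
            return datetime(2000, 1, 1, second=counter, tzinfo=tz)

    monkeypatch.setattr(mymodule, "datetime", IncTime)
    assert mymodule.datetime.now(timezone.utc).second == 1
    assert mymodule.datetime.now(timezone.utc).second == 2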
Example #8
    def compute_eigen(
        self,
        n_comps: int = 15,
        sym: Optional[bool] = None,
        sort: Literal['decrease', 'increase'] = 'decrease',
    ):
        """\
        Compute eigen decomposition of transition matrix.

        Parameters
        ----------
        n_comps
            Number of eigenvalues/vectors to be computed, set `n_comps = 0` if
            you need all eigenvectors.
        sym
            Instead of computing the eigendecomposition of the asymmetric
            transition matrix, compute the eigendecomposition of the symmetric
            Ktilde matrix.

        Returns
        -------
        Writes the following attributes.

        eigen_values : numpy.ndarray
            Eigenvalues of transition matrix.
        eigen_basis : numpy.ndarray
            Matrix of eigenvectors (stored in columns). `.eigen_basis` is the
            projection of the data matrix on the right eigenvectors, that is,
            the projection on the diffusion components. These are simply the
            components of the right eigenvectors and can directly be used for
            plotting.
        """
        np.set_printoptions(precision=10)
        if self._transitions_sym is None:
            raise ValueError('Run `.compute_transitions` first.')
        matrix = self._transitions_sym
        # compute the spectrum
        if n_comps == 0:
            evals, evecs = scipy.linalg.eigh(matrix)
        else:
            n_comps = min(matrix.shape[0] - 1, n_comps)
            # ncv = max(2 * n_comps + 1, int(np.sqrt(matrix.shape[0])))
            ncv = None
            which = 'LM' if sort == 'decrease' else 'SM'
            # it pays off to increase the stability with a bit more precision
            matrix = matrix.astype(np.float64)
            evals, evecs = scipy.sparse.linalg.eigsh(matrix,
                                                     k=n_comps,
                                                     which=which,
                                                     ncv=ncv)
            evals, evecs = evals.astype(np.float32), evecs.astype(np.float32)
        if sort == 'decrease':
            evals = evals[::-1]
            evecs = evecs[:, ::-1]
        logg.info('    eigenvalues of transition matrix\n'
                  '    {}'.format(str(evals).replace('\n', '\n    ')))
        if self._number_connected_components > len(evals) / 2:
            logg.warning('Transition matrix has many disconnected components!')
        self._eigen_values = evals
        self._eigen_basis = evecs
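`scipy.sparse.linalg.eigsh` returns eigenvalues in ascending order, which is why the code flips them for `sort='decrease'`. A minimal sketch on a toy symmetric matrix:

import numpy as np
import scipy.sparse
from scipy.sparse.linalg import eigsh

# Symmetric toy matrix standing in for the symmetrized transition matrix.
rng = np.random.default_rng(0)
A = rng.random((50, 50))
T = scipy.sparse.csr_matrix((A + A.T) / 2)

evals, evecs = eigsh(T.astype(np.float64), k=5, which="LM")  # largest magnitude
# eigsh returns eigenvalues in ascending order; flip for the 'decrease' sorting
evals, evecs = evals[::-1], evecs[:, ::-1]
print(evals)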
Example #9
def define_clonotypes(
    adata: AnnData,
    *,
    key_added: str = "clone_id",
    distance_key: Union[str, None] = None,
    **kwargs,
) -> Optional[Tuple[pd.Series, pd.Series, dict]]:
    """
    Define :term:`clonotypes <Clonotype>` based on :term:`CDR3` nucleic acid
    sequence identity.

    As opposed to :func:`~scirpy.tl.define_clonotype_clusters` which employs
    a more flexible definition of :term:`clonotype clusters <Clonotype cluster>`,
    this function stringently defines clonotypes based on nucleic acid sequence
    identity. Technically, this function is an alias to :func:`~scirpy.tl.define_clonotype_clusters`
    with different default parameters.

    {clonotype_definition}

    Parameters
    ----------
    adata
        Annotated data matrix
    {common_doc}
    {within_group}
    key_added
        The column name under which the clonotype clusters and cluster sizes
        will be stored in `adata.obs` and under which the clonotype network will be
        stored in `adata.uns`
    inplace
        If `True`, adds the results to anndata, otherwise return them.
    {paralellism}

    {return_values}

    """
    if distance_key is None and "ir_dist_nt_identity" not in adata.uns:
        # For the case of "clonotypes" we want to compute the distance automatically
        # if it doesn't exist yet. Since it's just a sparse ID matrix, this
        # should be instant.
        logging.info(
            "ir_dist for sequence='nt' and metric='identity' not found. "
            "Computing with default parameters.")  # type: ignore
        ir_dist(adata,
                metric="identity",
                sequence="nt",
                key_added=distance_key)

    return define_clonotype_clusters(
        adata,
        key_added=key_added,
        sequence="nt",
        metric="identity",
        partitions="connected",
        **kwargs,
    )
Example #10
    def add_img(
        self,
        img: Input_t,
        layer: Optional[str] = None,
        channel_dim: str = "channels",
        lazy: bool = True,
        chunks: Optional[int] = None,
        **kwargs: Any,
    ) -> None:
        """
        Add a new image to the container.

        Parameters
        ----------
        img
            In memory array or path to on-disk *TIFF*/*JPEG* image.
        %(img_layer)s
        channel_dim
            Name of the channel dimension.
        lazy
            Whether to use :mod:`rasterio` or :mod:`dask` to lazily load image.
        chunks
            Chunk size for :mod:`dask`, used in call to :func:`xarray.open_rasterio` for *TIFF* images.

        Returns
        -------
        Nothing, just adds a new ``layer`` to :attr:`data`.

        Raises
        ------
        ValueError
            If loading from a file/store with an unknown format.
        NotImplementedError
            If loading a specific data type has not been implemented.

        Notes
        -----
        Lazy loading via :mod:`dask` is not supported for on-disk *JPEG* files; they will be loaded into memory.
        Multi-page *TIFFs* will be loaded in one :class:`xarray.DataArray`, with concatenated channel dimensions.
        """
        layer = self._get_next_image_id("image") if layer is None else layer
        img = self._load_img(img, chunks=chunks, layer=layer, **kwargs)

        if img is not None:  # not reading a .nc file
            if TYPE_CHECKING:
                assert isinstance(img, xr.DataArray)
            img = img.rename({img.dims[-1]: channel_dim})

            logg.info(
                f"{'Overwriting' if layer in self else 'Adding'} image layer `{layer}`"
            )
            self.data[layer] = img

        if not lazy:
            # load in memory
            self.data.load()
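The `img.rename({img.dims[-1]: channel_dim})` step simply relabels whatever the last dimension happens to be called. A small sketch on a toy `xarray.DataArray`:

import numpy as np
import xarray as xr

# A 100x100 RGB image whose last dimension got a generic name.
img = xr.DataArray(np.zeros((100, 100, 3)), dims=("y", "x", "dim_2"))

# Relabel whatever the last dimension is called, mirroring
# `img.rename({img.dims[-1]: channel_dim})` above.
channel_dim = "channels"
img = img.rename({img.dims[-1]: channel_dim})
print(img.dims)  # ('y', 'x', 'channels')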
Example #11
    def _cell_dist_mat_reduce(self):
        """Compute the distance matrix by using custom reduction functions.
        More flexible than `_build_cell_dist_mat_min`, but requires more memory.
        Reduce dual is called before reduce arms.
        """
        coord_dict = dict()

        def _add_to_dict(d, c1, c2, cell_row, cell_col, value):
            """Add a value to the nested coord dict"""
            try:
                tmp_dict = d[(cell_row, cell_col)]
                try:
                    tmp_dict2 = tmp_dict[arm]
                    try:
                        if (c1, c2) in tmp_dict2:
                            # can be in arbitrary order apparently
                            assert (c2, c1) not in tmp_dict2
                            tmp_dict2[(c2, c1)] = value
                        tmp_dict2[(c1, c2)] = value
                    except KeyError:
                        tmp_dict2 = {(c1, c2): value}
                except KeyError:
                    tmp_dict[arm] = {(c1, c2): value}
            except KeyError:
                d[(cell_row, cell_col)] = {arm: {(c1, c2): value}}

        for arm, arm_info in self.index_dict.items():
            dist_mat, seq_to_cell, chain_inds = (
                arm_info["dist_mat"],
                arm_info["seq_to_cell"],
                arm_info["chain_inds"],
            )
            start = logging.info(
                f"Started comstructing {arm} coord-dictionary...")
            for row, col, value in tqdm(zip(dist_mat.row, dist_mat.col,
                                            dist_mat.data),
                                        total=dist_mat.nnz):
                for c1, c2 in itertools.product(chain_inds, repeat=2):
                    for cell_row, cell_col in itertools.product(
                            seq_to_cell[c1][row], seq_to_cell[c2][col]):
                        # fill upper diagonal. Important: these are dist-mat row,cols
                        # not cell-mat row cols. This is required, because the
                        # itertools.product returns all combinations for the diagonal
                        # but not for the other values.
                        _add_to_dict(coord_dict, c1, c2, cell_row, cell_col,
                                     value)
                        if row != col:
                            _add_to_dict(coord_dict, c1, c2, cell_col,
                                         cell_row, value)

            logging.info(f"Finished constructing {arm} coord-dictionary",
                         time=start)

        yield from self._reduce_coord_dict(coord_dict)
Example #12
def test_formats(capsys, logging_state):
    s.logfile = sys.stderr
    s.verbosity = Verbosity.debug
    l.error('0')
    assert capsys.readouterr().err == 'ERROR: 0\n'
    l.warning('1')
    assert capsys.readouterr().err == 'WARNING: 1\n'
    l.info('2')
    assert capsys.readouterr().err == '2\n'
    l.hint('3')
    assert capsys.readouterr().err == '--> 3\n'
    l.debug('4')
    assert capsys.readouterr().err == '    4\n'
Example #13
    def add_points(self,
                   vec: Union[np.ndarray, pd.Series],
                   layer_name: str,
                   key: Optional[str] = None) -> bool:
        """
        Add a new :mod:`napari` points layer.

        Parameters
        ----------
        vec
            Values to plot. If :class:`pandas.Series`, it is expected to be categorical.
        layer_name
            Name of the layer to add.
        key
            Key in :attr:`anndata.AnnData.obs` from which the data was taken.
            Only used when ``vec`` is :class:`pandas.Series`.

        Returns
        -------
        `True` if the layer has been added, otherwise `False`.
        """
        if layer_name in self.view.layernames:
            self._handle_already_present(layer_name)
            return False

        logg.info(f"Creating point `{layer_name}` layer")
        properties = self._get_points_properties(vec, key=key)
        layer: Points = self.view.viewer.add_points(
            self.model.coordinates,
            name=layer_name,
            size=self.model.spot_diameter,
            opacity=1,
            edge_width=1,
            blending=self.model.blending,
            face_colormap=self.model.cmap,
            edge_colormap=self.model.cmap,
            symbol=self.model.symbol.v,
            **properties,
        )
        # https://github.com/napari/napari/issues/2019
        # TODO: uncomment the 2 lines below once a solution is found for contrasting colors
        # we could use the selected points where the cluster labels are positioned as a black BG
        # layer._text._color = properties["colors"]
        # layer._text.events.color()
        self._hide_points_controls(layer,
                                   is_categorical=is_categorical_dtype(vec))

        layer.editable = False
        layer.events.select.connect(self._move_layer_to_front)

        return True
Example #14
def _self_loops(self_transitions, velo_graph):
    # set the diagonal elements.
    if self_transitions is not None:
        logg.info(f"Self transitions using {self_transitions!r}")
    if self_transitions == "scvelo":
        confidence = velo_graph.max(1).A.flatten()
        ub = np.percentile(confidence, 98)
        self_prob = np.clip(ub - confidence, 0, 1)
        velo_graph.setdiag(self_prob)
    if self_transitions == "velocyto":
        self_prob = velo_graph.max(1).A.flatten()
        velo_graph.setdiag(self_prob)

    return velo_graph
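The "scvelo" branch turns low-confidence rows (cells whose strongest outgoing transition is weak) into strong self-loops via the 98th percentile. A toy sketch of that diagonal computation on a random sparse graph:

import numpy as np
from scipy.sparse import random as sparse_random

# Random sparse graph standing in for the velocity graph.
graph = sparse_random(100, 100, density=0.05, format="csr", random_state=0)

# "scvelo"-style self loops: cells whose strongest outgoing transition is weak
# receive a correspondingly large self-transition probability.
confidence = graph.max(axis=1).toarray().flatten()
upper = np.percentile(confidence, 98)
graph.setdiag(np.clip(upper - confidence, 0, 1))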
Example #15
def clone_degree(self: Dandelion,
                 weight: Union[None, str] = None,
                 verbose: bool = True) -> Dandelion:
    """
    Calculates node degree in BCR network.

    Parameters
    ----------
    self : Dandelion
        `Dandelion` object after `tl.generate_network` has been run.
    weight : str, optional
        Attribute name for retrieving edge weight in graph. `None` means edge weights are ignored. See `networkx.Graph.degree`.
    verbose : bool
        Whether or not to show logging information.

    Returns
    -------
    Dandelion object with metadata updated with node degree information.
    """
    if verbose:
        start = logg.info('Calculating node degree')
    if self.__class__ == Dandelion:
        try:
            G = self.graph[0]
        except:
            dist = np.sum([
                self.distance[x].toarray()
                for x in self.distance if type(self.distance[x]) is csr_matrix
            ],
                          axis=0)
            A = csr_matrix(dist)
            G = nx.Graph()
            G.add_weighted_edges_from(
                zip(list(self.metadata.index), list(self.metadata.index),
                    A.data))

        if len(G) == 0:
            raise AttributeError(
                'Graph not found. Please run tl.generate_network.')
        else:
            cd = pd.DataFrame.from_dict(G.degree(weight=weight))
            cd.set_index(0, inplace=True)
            self.metadata['clone_degree'] = pd.Series(cd[1])
            if verbose:
                logg.info(' finished',
                          time=start,
                          deep=('Updated Dandelion metadata\n'))
    else:
        raise TypeError('Input object must be of {}'.format(Dandelion))
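The degree computation boils down to turning `G.degree(weight=...)` into a pandas Series aligned with the metadata index. A toy sketch (node names and metadata here are made up):

import networkx as nx
import pandas as pd

# Toy graph whose nodes are cell barcodes, as in the network above.
G = nx.Graph()
G.add_weighted_edges_from([("cell1", "cell2", 0.5), ("cell2", "cell3", 1.0)])

# G.degree() yields (node, degree) pairs; dict() makes them Series-friendly,
# and reindexing aligns the values with an existing metadata table.
metadata = pd.DataFrame(index=["cell1", "cell2", "cell3", "cell4"])
degrees = pd.Series(dict(G.degree(weight="weight")))
metadata["clone_degree"] = degrees.reindex(metadata.index)
print(metadata)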
Example #16
def clone_centrality(self: Dandelion, verbose: bool = True) -> Dandelion:
    """
    Calculates node closeness centrality in BCR network.

    Parameters
    ----------
    self : Dandelion
        `Dandelion` object after `tl.generate_network` has been run.
    verbose : bool
        Whether or not to show logging information.

    Returns
    -------
    Dandelion object with metadata updated with node closeness centrality information.
    """
    if verbose:
        start = logg.info('Calculating node closeness centrality')
    if self.__class__ == Dandelion:
        try:
            G = self.graph[0]
        except:
            dist = np.sum([
                self.distance[x].toarray()
                for x in self.distance if type(self.distance[x]) is csr_matrix
            ],
                          axis=0)
            A = csr_matrix(dist)
            G = nx.Graph()
            G.add_weighted_edges_from(
                zip(list(self.metadata.index), list(self.metadata.index),
                    A.data))

        if len(G) == 0:
            raise AttributeError(
                'Graph not found. Please run tl.generate_network.')
        else:
            cc = nx.closeness_centrality(G)
            cc = pd.DataFrame.from_dict(cc,
                                        orient='index',
                                        columns=['clone_centrality'])
            self.metadata['clone_centrality'] = pd.Series(
                cc['clone_centrality'])
            if verbose:
                logg.info(' finished',
                          time=start,
                          deep=('Updated Dandelion metadata\n'))
    else:
        raise TypeError('Input object must be of {}'.format(Dandelion))
Example #17
    def export(self, _: Viewer) -> None:
        """Export shapes into :class:`AnnData` object."""
        for layer in self.view.layers:
            if not isinstance(layer, Shapes) or not layer.selected:
                continue
            if not len(layer.data):
                logg.warning(
                    f"Shape layer `{layer.name}` has no visible shapes")
                continue

            key = f"{layer.name}_{self.model.key_added}"

            logg.info(
                f"Adding `adata.obs[{key!r}]`\n       `adata.uns[{key!r}]['meshes']`"
            )
            self._save_shapes(layer, key=key)
            self._update_obs_items(key)
Example #18
    def _prepare(self, adata: AnnData):
        """Initialize the DoubleLookupNeighborFinder and all required lookup tables"""
        start = logging.info("Initializing lookup tables. ")
        self._make_clonotype_table(adata)
        self._make_chain_count()
        self.neighbor_finder = DoubleLookupNeighborFinder(self.clonotypes)
        self._add_distance_matrices(adata)
        self._add_lookup_tables()
        logging.hint("Done initializing lookup tables.", time=start)
Example #19
def lsi(data: Union[AnnData, MuData], scale_embeddings=True, n_comps=50):
    """
    Run Latent Semantic Indexing

    Parameters
    ----------
    data
        AnnData object or MuData object with 'atac' modality
    scale_embeddings : bool (default: True)
        Scale embeddings to zero mean and unit variance
    n_comps : int (default: 50)
        Number of components to calculate with SVD
    """
    if isinstance(data, AnnData):
        adata = data
    elif isinstance(data, MuData) and "atac" in data.mod:
        adata = data.mod["atac"]
    else:
        raise TypeError(
            "Expected AnnData or MuData object with 'atac' modality")

    # In the unlikely scenario that there are fewer than 50 features, set n_comps to that value
    n_comps = min(n_comps, adata.X.shape[1])

    logging.info("Performing SVD")
    cell_embeddings, svalues, peaks_loadings = svds(adata.X, k=n_comps)

    # Re-order components in the descending order
    cell_embeddings = cell_embeddings[:, ::-1]
    svalues = svalues[::-1]
    peaks_loadings = peaks_loadings[::-1, :]

    if scale_embeddings:
        cell_embeddings = (cell_embeddings - cell_embeddings.mean(axis=0)
                           ) / cell_embeddings.std(axis=0)

    stdev = svalues / np.sqrt(adata.X.shape[0] - 1)

    adata.obsm["X_lsi"] = cell_embeddings
    adata.uns["lsi"] = {"stdev": stdev}
    adata.varm["LSI"] = peaks_loadings.T

    return None
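`scipy.sparse.linalg.svds` returns components in ascending order of singular value, hence the flipping before the embeddings are scaled. A compact sketch on a random sparse matrix:

import numpy as np
from scipy.sparse import random as sparse_random
from scipy.sparse.linalg import svds

X = sparse_random(200, 1000, density=0.05, format="csr", random_state=0)
n_comps = min(50, X.shape[1])

# svds orders components by *ascending* singular value, so flip them.
u, s, vt = svds(X, k=n_comps)
u, s, vt = u[:, ::-1], s[::-1], vt[::-1, :]

# Optional scaling of the cell embeddings to zero mean / unit variance.
embeddings = (u - u.mean(axis=0)) / u.std(axis=0)
stdev = s / np.sqrt(X.shape[0] - 1)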
Example #20
        def process(self):
            """
            A method to run `palantir` on input Data Frame
            """

            # Principal component analysis
            logg.info('PCA in progress ...')

            self.pca_projections, self.var_r = self.palantir.utils.run_pca(
                self.data_df)

            adata.uns['palantir_pca_results'] = {}
            adata.uns['palantir_pca_results'][
                'pca_projections'] = self.pca_projections
            adata.uns['palantir_pca_results']['variance_ratio'] = self.var_r

            # Diffusion maps
            logg.info('Diffusion maps in progress ...')

            self.dm_res = self.palantir.utils.run_diffusion_maps(
                self.pca_projections)
            self.ms_data = self.palantir.utils.determine_multiscale_space(
                self.dm_res)

            adata.uns['palantir_diff_maps'] = self.dm_res
            adata.uns['palantir_ms_data'] = self.ms_data

            # tSNE visualization
            logg.info('tSNE in progress ...')

            self.tsne = self.palantir.utils.run_tsne(self.ms_data)

            adata.uns['palantir_tsne'] = self.tsne

            # MAGIC imputation
            logg.info('imputation in progress ...')

            self.imp_df = self.palantir.utils.run_magic_imputation(
                self.data_df, self.dm_res)

            adata.uns['palantir_imp_df'] = self.imp_df

            logg.info('End of processing, start plotting.')
Example #21
def scale_array(
    X,
    *,
    zero_center: bool = True,
    max_value: Optional[float] = None,
    copy: bool = False,
    return_mean_std: bool = False,
):
    if copy:
        X = X.copy()
    if not zero_center and max_value is not None:
        logg.info(  # Be careful of what? This should be more specific
            "... be careful when using `max_value` " "without `zero_center`."
        )

    if np.issubdtype(X.dtype, np.integer):
        logg.info(
            '... as scaling leads to float results, integer '
            'input is cast to float, returning copy.'
        )
        X = X.astype(float)

    mean, var = _get_mean_var(X)
    std = np.sqrt(var)
    std[std == 0] = 1
    if issparse(X):
        if zero_center:
            raise ValueError("Cannot zero-center sparse matrix.")
        sparsefuncs.inplace_column_scale(X, 1 / std)
    else:
        if zero_center:
            X -= mean
        X /= std

    # do the clipping
    if max_value is not None:
        logg.debug(f"... clipping at max_value {max_value}")
        X[X > max_value] = max_value

    if return_mean_std:
        return X, mean, std
    else:
        return X
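For dense input, the scaling above reduces to column-wise standardization with optional clipping. A minimal NumPy sketch of the same arithmetic (note the population standard deviation here; the `_get_mean_var` helper above may use a different variance estimator):

import numpy as np


def scale_dense(X, zero_center=True, max_value=None):
    # Column-wise standardization with optional clipping; constant columns
    # keep a std of 1 to avoid division by zero.
    X = np.asarray(X, dtype=float).copy()
    std = X.std(axis=0)
    std[std == 0] = 1
    if zero_center:
        X -= X.mean(axis=0)
    X /= std
    if max_value is not None:
        X[X > max_value] = max_value
    return X


print(scale_dense([[1.0, 2.0], [3.0, 2.0]]))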
Example #22
    def _reduce_coord_dict(self, coord_dict):
        """Applies reduction functions to the coord dict.
        Yield (coords, value) pairs."""
        start = logging.info("Constructing cell x cell distance matrix...")
        reduce_dual = (self._reduce_dual_all
                       if self.dual_ir == "all" else self._reduce_dual_any)
        reduce_arms = (self._reduce_arms_all if self.receptor_arms == "all"
                       else self._reduce_arms_any)
        for (cell_row, cell_col), entry in tqdm(coord_dict.items(),
                                                total=len(coord_dict)):
            reduced_dual = (reduce_dual(value_dict, chain, cell_row, cell_col)
                            for chain, value_dict in entry.items())
            reduced = reduce_arms(
                reduced_dual,
                cell_row,
                cell_col,
            )
            yield (cell_row, cell_col), reduced
        logging.info("Finished constructing cell x cell distance matrix. ",
                     time=start)
Example #23
    def compute_partition(self) -> None:
        """
        Compute communication classes for the Markov chain.

        Returns
        -------
        None
            Nothing, but updates the following fields:
                - :paramref:`recurrent_classes`
                - :paramref:`transient_classes`
                - :paramref:`irreducible`
        """

        start = logg.info("Computing communication classes")

        rec_classes, trans_classes = partition(self._T)

        self._is_irreducible = len(rec_classes) == 1 and len(
            trans_classes) == 0

        if not self._is_irreducible:
            self._trans_classes = _make_cat(trans_classes, self._n_states,
                                            self._adata.obs_names)
            self._rec_classes = _make_cat(rec_classes, self._n_states,
                                          self._adata.obs_names)
            self._adata.obs[f"{self._rc_key}_rec_classes"] = self._rec_classes
            self._adata.obs[
                f"{self._rc_key}_trans_classes"] = self._trans_classes
            logg.info(
                f"Found `{(len(rec_classes))}` recurrent and `{len(trans_classes)}` transient classes\n"
                f"Adding `.recurrent_classes`\n"
                f"       `.transient_classes`\n"
                f"       `.irreducible`\n"
                f"    Finish",
                time=start,
            )
        else:
            logg.warning(
                "The transition matrix is irreducible - cannot further partition it\n    Finish",
                time=start,
            )
Example #24
    def __init__(
        self,
        adata: AnnData,
        *,
        metric: Union[Literal["alignment", "identity", "levenshtein",
                              "hamming"], DistanceCalculator, ] = "identity",
        cutoff: Union[int, None] = None,
        receptor_arms: Literal["VJ", "VDJ", "all", "any"] = "all",
        dual_ir: Literal["primary_only", "all", "any"] = "primary_only",
        sequence: Literal["aa", "nt"] = "aa",
    ):
        """Class to compute Neighborhood graphs of CDR3 sequences.

        For documentation of the parameters, see :func:`ir_neighbors`.
        """
        start = logging.info("Initializing IrNeighbors object...")
        if metric == "identity" and cutoff != 0:
            raise ValueError("Identity metric only works with cutoff == 0")
        if metric != "identity" and cutoff == 0:
            logging.warning(f"Running with {metric} metric, but cutoff == 0. ")
        if sequence == "nt" and metric == "alignment":
            raise ValueError(
                "Using nucleotide sequences with alignment metric is not supported. "
            )
        if receptor_arms not in ["VJ", "VDJ", "all", "any"]:
            raise ValueError(
                "Invalid value for `receptor_arms`. Note that starting with v0.5 "
                "`TRA` and `TRB` are not longer valid values.")
        if dual_ir not in ["primary_only", "all", "any"]:
            raise ValueError("Invalid value for `dual_ir")
        if sequence not in ["aa", "nt"]:
            raise ValueError("Invalid value for `sequence`")
        self.adata = adata
        self.metric = metric
        self.cutoff = cutoff
        self.receptor_arms = receptor_arms
        self.dual_ir = dual_ir
        self.sequence = sequence
        self._build_index_dict()
        self._dist_mat = None
        logging.info("Finished initalizing IrNeighbors object. ", time=start)
Example #25
        def __init__(self,
                     adata,
                     func=None,
                     normalize=False,
                     log_transform=False,
                     filter_low=False
                     ):
            """
            Parameters
            ----------
            adata : AnnData, or Dataframe of cells X genes
            func : function wrapper to import palantir (not to be used)
            normalize : `bool` (default: `False`)
                property setter passed to palantir to normalize using palantir method
                `palantir.preprocess.normalize_counts`.
            log_transform : `bool` (default: `False`)
                property setter passed to palantir. Some datasets show better signal in the log
                scale. Applied using `palantir.preprocess.log_transform`
            filter_low : `bool` (default: `False`)
                property setter passed to palantir to remove low molecule count cells and low detection genes
            """

            # instantiate variables
            self.func = func
            self.adata = adata
            self._normalize = normalize
            self._log_transform = log_transform
            self._filter_low = filter_low

            try:
                # for AnnData
                self.data_df = self.adata.to_df()
            except AttributeError:
                # assume the data is a cell X genes Dataframe
                logg.info('Assuming the data is a cell X genes Dataframe',
                          r=True)

            # load palantir
            self.__call__()
            logg.info('palantir loaded ...', r=True)
Example #26
def _root_final(
    adata: AnnData,
    final: bool = True,
    cluster_key: Optional[str] = None,
    weight_connectivities: Optional[float] = None,
    percentile: int = 98,
    n_matches_min: Optional[int] = 1,
    n_start_end: Optional[int] = None,
    show_plots: bool = False,
    copy: bool = False,
) -> Optional[AnnData]:

    key = RcKey.FORWARD if final else RcKey.BACKWARD
    logg.info(f"Computing `{key}`")
    adata = adata.copy() if copy else adata

    # compute kernel object
    kernel = transition_matrix(adata,
                               backward=not final,
                               weight_connectivities=weight_connectivities)

    # create MarkovChain object
    mc = MarkovChain(kernel)

    # run the computation
    mc.compute_eig()
    mc.compute_approx_rcs(
        percentile=percentile,
        n_matches_min=n_matches_min,
        use=n_start_end,
        n_clusters_kmeans=n_start_end,
        cluster_key=cluster_key,
    )

    if show_plots:
        mc.plot_real_spectrum()
        mc.plot_eig_embedding(abs_value=True, perc=[0, 98], use=n_start_end)
        mc.plot_eig_embedding(left=False, use=n_start_end)

    return adata if copy else None
Example #27
    def compute_transition_matrix(
        self, density_normalize: bool = True, **kwargs
    ) -> "ConnectivityKernel":
        """
        Compute transition matrix based on transcriptomic similarity.

        Uses symmetric, weighted KNN graph to compute symmetric transition matrix. The connectivities are computed
        using :func:`scanpy.pp.neighbors`. Depending on the parameters used there, they can be UMAP connectivities or
        gaussian-kernel-based connectivities with adaptive kernel width.

        Parameters
        ----------
        density_normalize
            Whether or not to use the underlying KNN graph for density normalization.

        Returns
        -------
        :class:`ConnectivityKernel`
            Returns self and makes :paramref:`transition_matrix` available.
        """

        start = logg.info("Computing transition matrix based on connectivities")

        params = dict(dnorm=density_normalize)
        if params == self._params:
            assert self.transition_matrix is not None, _ERROR_EMPTY_CACHE_MSG
            logg.debug(_LOG_USING_CACHE)
            logg.info("    Finish", time=start)
            return self

        self._params = params
        conn = self._conn.copy()

        if density_normalize:
            conn = self.density_normalize(conn)
        logg.info("    Finish", time=start)

        self.transition_matrix = csr_matrix(conn)

        return self
Example #28
def cell_similarity(adata: AnnData,
                    key_added: Optional[str] = 'cell_similarity',
                    sim_type: Optional[str] = 'hub-promoted',
                    use_weights: Optional[bool] = True,
                    copy: bool = False,
                    **neighbors_kwds) -> Optional[AnnData]:
    """\
    Calculate cell similarity score based on the kNN graph. Higher scores
    are associated to cells mostly close to similar cells.
    
    Parameters
    ----------
    adata
        Annotated data matrix. 
    key_added
        The name of the entry in adata.obs with calculated values.
    copy
        Return a copy instead of writing to adata.
    sim_type:
    	Similarity function. Can be one in 'dice', 'salton', 'hub-promoted','hub-suppressed', 'jaccard', 'inv-log-weight', 'resource-allocation','leight-holme-newman'. For more information check here https://graph-tool.skewed.de/static/doc/topology.html?highlight=distance#graph_tool.topology.vertex_similarity
    state
        A separate block state object

    Returns
    -------
    Depending on `copy`, returns or updates `adata` with stability values 
    in adata.obs['cell_stability']
"""
    from .._utils import get_graph_tool_from_adata
    logg.info("Adding cell similarity scores")
    g = get_graph_tool_from_adata(adata,
                                  use_weights=use_weights,
                                  **neighbors_kwds)
    n_cells = g.num_vertices()
    S = gt.vertex_similarity(g, sim_type=sim_type).get_2d_array(range(n_cells))
    D = np.dot(S, S)
    D = np.diag(D / np.max(D))  # take the scaled diagonal
    adata.obs[f'{key_added}'] = D
    return adata if copy else None
Example #29
    def compute_transitions(self, density_normalize: bool = True):
        """\
        Compute transition matrix.

        Parameters
        ----------
        density_normalize
            The density rescaling of Coifman and Lafon (2006): Then only the
            geometry of the data matters, not the sampled density.

        Returns
        -------
        Makes attributes `.transitions_sym` and `.transitions` available.
        """
        start = logg.info('computing transitions')
        W = self._connectivities
        # density normalization as of Coifman et al. (2005)
        # ensures that kernel matrix is independent of sampling density
        if density_normalize:
            # q[i] is an estimate for the sampling density at point i
            # it's also the degree of the underlying graph
            q = np.asarray(W.sum(axis=0))
            if not issparse(W):
                Q = np.diag(1.0 / q)
            else:
                Q = scipy.sparse.spdiags(1.0 / q, 0, W.shape[0], W.shape[0])
            K = Q @ W @ Q
        else:
            K = W

        # z[i] is the square root of the row sum of K
        z = np.sqrt(np.asarray(K.sum(axis=0)))
        if not issparse(K):
            self.Z = np.diag(1.0 / z)
        else:
            self.Z = scipy.sparse.spdiags(1.0 / z, 0, K.shape[0], K.shape[0])
        self._transitions_sym = self.Z @ K @ self.Z
        logg.info('    finished', time=start)
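The normalization can be written densely: `Q` and `Z` are diagonal, so the sandwich products reduce to elementwise division by outer products. A small NumPy sketch, assuming a symmetric affinity matrix `W`:

import numpy as np

# Dense sketch: W is a symmetric affinity/connectivity matrix.
rng = np.random.default_rng(0)
W = rng.random((5, 5))
W = (W + W.T) / 2

q = W.sum(axis=0)             # sampling-density estimate per point
K = W / np.outer(q, q)        # density normalization: Q @ W @ Q with Q = diag(1/q)
z = np.sqrt(K.sum(axis=0))    # square roots of the row sums of K
T_sym = K / np.outer(z, z)    # symmetric transitions: Z @ K @ Z with Z = diag(1/z)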
Example #30
def scale_sparse(
    X,
    *,
    zero_center: bool = True,
    max_value: Optional[float] = None,
    copy: bool = False,
    return_mean_std: bool = False,
):
    # need to add the following here to make inplace logic work
    if zero_center:
        logg.info(
            "... as `zero_center=True`, sparse input is "
            "densified and may lead to large memory consumption"
        )
        X = X.toarray()
        copy = False  # Since the data has been copied
    return scale_array(
        X,
        zero_center=zero_center,
        copy=copy,
        max_value=max_value,
        return_mean_std=return_mean_std,
    )