Python MultimodalData Examples, pegasusio.MultimodalData Python Examples

Example #1

0

Show file

File: aggr_data.py Project: slowkow/pegasusio

 def aggregate(self) -> MultimodalData:
     """ Aggregate all data together """
     data = MultimodalData()
     for key in list(self.aggr):
         unidata = self._aggregate_unidata(self.aggr.pop(key))
         data.add_data(unidata)
     return data

Example #2

0

Show file

def set_group_attribute(data: MultimodalData, attribute_string: str) -> None:
    """Set group attributes used in batch correction.

    Batch correction assumes the differences in gene expression between channels are due to batch effects. However, in many cases, we know that channels can be partitioned into several groups and each group is biologically different from others. In this case, *pegasus* will only perform batch correction for channels within each group.

    Parameters
    ----------
    data: ``pegasusio.MultimodalData``
        Annotated data matrix with rows for cells and columns for genes.

    attribute_string: ``str``
        Attributes used to construct groups:

        * ``None``
            Assume all channels are from one group.

        * ``attr``
            Define groups by sample attribute ``attr``, which is a keyword in ``data.obs``.

        * ``att1+att2+...+attrn``
            Define groups by the Cartesian product of these *n* attributes, which are keywords in ``data.obs``.

        * ``attr=value_11,...value_1n_1;value_21,...value_2n_2;...;value_m1,...,value_mn_m``
            In this form, there will be *(m+1)* groups. A cell belongs to group *i* (*i > 1*) if and only if its sample attribute ``attr``, which is a keyword in ``data.obs``, has a value among ``value_i1``, ... ``value_in_i``. A cell belongs to group 0 if it does not belong to any other groups.

    Returns
    -------
    None

        Update ``data.obs``:

        * ``data.obs["Group"]``: Group ID for each cell.

    Examples
    --------

    >>> pg.set_group_attribute(data, attr_string = "Individual")

    >>> pg.set_group_attribute(data, attr_string = "Individual+assignment")

    >>> pg.set_group_attribute(data, attr_string = "Channel=1,3,5;2,4,6,8")
    """

    if attribute_string.find("=") >= 0:
        attr, value_str = attribute_string.split("=")
        assert attr in data.obs.columns
        values = value_str.split(";")
        data.obs["Group"] = "0"
        for group_id, value in enumerate(values):
            vals = value.split(",")
            idx = np.isin(data.obs[attr], vals)
            data.obs.loc[idx, "Group"] = str(group_id + 1)
    elif attribute_string.find("+") >= 0:
        attrs = attribute_string.split("+")
        assert np.isin(attrs, data.obs.columns).sum() == len(attrs)
        data.obs["Group"] = data.obs[attrs].apply(lambda x: "+".join(x),
                                                  axis=1)
    else:
        assert attribute_string in data.obs.columns
        data.obs["Group"] = data.obs[attribute_string]

Example #3

0

Show file

def diffmap(
    data: MultimodalData,
    n_components: int = 100,
    rep: str = "pca",
    solver: str = "eigsh",
    random_state: int = 0,
    max_t: float = 5000,
) -> None:
    """Calculate Diffusion Map.

    Parameters
    ----------
    data: ``pegasusio.MultimodalData``
        Annotated data matrix with rows for cells and columns for genes.

    n_components: ``int``, optional, default: ``100``
        Number of diffusion components to calculate.

    rep: ``str``, optional, default: ``"pca"``
        Embedding Representation of data used for calculating the Diffusion Map. By default, use PCA coordinates.

    solver: ``str``, optional, default: ``"eigsh"``
        Solver for eigen decomposition:
            * ``"eigsh"``: default setting. Use *scipy* `eigsh <https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.linalg.eigsh.html>`_ as the solver to find eigenvalus and eigenvectors using the Implicitly Restarted Lanczos Method.
            * ``"randomized"``: Use *scikit-learn* `randomized_svd <https://scikit-learn.org/stable/modules/generated/sklearn.utils.extmath.randomized_svd.html>`_ as the solver to calculate a truncated randomized SVD.

    random_state: ``int``, optional, default: ``0``
        Random seed set for reproducing results.

    max_t: ``float``, optional, default: ``5000``
        pegasus tries to determine the best t to sum up to between ``[1, max_t]``.

    Returns
    -------
    ``None``

    Update ``data.obsm``:
        * ``data.obsm["X_diffmap"]``: Diffusion Map matrix of the data.

    Update ``data.uns``:
        * ``data.uns["diffmap_evals"]``: Eigenvalues corresponding to Diffusion Map matrix.

    Examples
    --------
    >>> pg.diffmap(data)
    """

    rep = update_rep(rep)
    Phi_pt, Lambda, Phi = calculate_diffusion_map(
        W_from_rep(data, rep),
        n_components=n_components,
        solver=solver,
        random_state=random_state,
        max_t=max_t,
    )

    data.obsm["X_diffmap"] = Phi_pt
    data.uns["diffmap_evals"] = Lambda
    data.obsm["X_phi"] = Phi

Example #4

0

Show file

def get_neighbors(
    data: MultimodalData,
    K: int = 100,
    rep: str = "pca",
    n_jobs: int = -1,
    random_state: int = 0,
    full_speed: bool = False,
) -> Tuple[List[int], List[float]]:
    """Find K nearest neighbors for each data point and return the indices and distances arrays.

    Parameters
    ----------

    data : `pegasusio.MultimodalData`
        An AnnData object.
    K : `int`, optional (default: 100)
        Number of neighbors, including the data point itself.
    rep : `str`, optional (default: 'pca')
        Representation used to calculate kNN. If `None` use data.X
    n_jobs : `int`, optional (default: -1)
        Number of threads to use. -1 refers to all available threads
    random_state: `int`, optional (default: 0)
        Random seed for random number generator.
    full_speed: `bool`, optional (default: False)
        If full_speed, use multiple threads in constructing hnsw index. However, the kNN results are not reproducible. If not full_speed, use only one thread to make sure results are reproducible.

    Returns
    -------

    kNN indices and distances arrays.

    Examples
    --------
    >>> indices, distances = tools.get_neighbors(data)
    """

    rep = update_rep(rep)
    indices_key = rep + "_knn_indices"
    distances_key = rep + "_knn_distances"

    if knn_is_cached(data, indices_key, distances_key, K):
        indices = data.uns[indices_key]
        distances = data.uns[distances_key]
        logger.info("Found cached kNN results, no calculation is required.")
    else:
        indices, distances = calculate_nearest_neighbors(
            X_from_rep(data, rep),
            K=K,
            n_jobs=effective_n_jobs(n_jobs),
            random_state=random_state,
            full_speed=full_speed,
        )
        data.uns[indices_key] = indices
        data.uns[distances_key] = distances

    return indices, distances

Example #5

0

Show file

File: text_utils.py Project: klarman-cell-observatory/pegasusio

def write_mtx_file(data: MultimodalData, output_directory: str, precision: int = 2):
    """ Write output to mtx files in output_directory
    """
    output_dir = os.path.abspath(output_directory)
    os.makedirs(output_dir, exist_ok = True)

    for key in data.list_data():
        _write_mtx(data.get_data(key), os.path.join(output_dir, key), precision)

    logger.info("Mtx files are written.")

Example #6

0

Show file

def _check_group(data: MultimodalData) -> None:
    if "Group" not in data.obs:
        data.obs["Group"] = pd.Categorical.from_codes(codes=np.zeros(
            data.shape[0], dtype=np.int32),
                                                      categories=["one_group"])
    elif not is_categorical_dtype(data.obs["Group"]):
        data.obs["Group"] = pd.Categorical(data.obs["Group"].values)

    if "Groups" not in data.uns:
        data.uns["Groups"] = data.obs["Group"].cat.categories.values

Example #7

0

Show file

File: zarr_utils.py Project: klarman-cell-observatory/pegasusio

 def write_multimodal_data(self, data: MultimodalData, overwrite: bool = True) -> None:
     """ Write MultimodalData
     """
     if overwrite:
         for key in data.list_data():
             self.write_unimodal_data(self.root, key, data.get_data(key), overwrite = overwrite)
     else:
         for key in data.data.deleted:
             del self.root[key]
         for key in data.data.accessed:
             self.write_unimodal_data(self.root, key, data.get_data(key), overwrite = key in data.data.modified)            
     self.root.attrs['_selected'] = data._selected

Example #8

0

Show file

def write_loom_file(data: MultimodalData, output_file: str) -> None:
    """ Write a MultimodalData to loom file. Will assert data only contain one type of experiment.
    """
    keys = data.list_data()
    if len(keys) > 1:
        raise ValueError(
            f"Data contain multiple modalities: {','.join(keys)}!")
    data.select_data(keys[0])
    matrices = data.list_keys()
    assert "X" in matrices
    if len(matrices) == 0:
        raise ValueError("Could not write empty matrix to a loom file!")

    def _process_attrs(key_name: str, attrs: pd.DataFrame,
                       attrs_multi: dict) -> Dict[str, object]:
        res_dict = {key_name: attrs.index.values}
        for key in attrs.columns:
            res_dict[key] = np.array(attrs[key].values)
        for key, value in attrs_multi.items():
            if value.ndim > 1:  # value.ndim == 1 refers to np.recarray, which will not be written to a loom file.
                res_dict[key] = value if value.shape[1] > 1 else value[:, 0]
        return res_dict

    row_attrs = _process_attrs("Gene", data.var, data.varm)
    col_attrs = _process_attrs("CellID", data.obs, data.obsm)

    accession_key = "featureid" if "featureid" in row_attrs else (
        "gene_ids" if "gene_ids" in row_attrs else None)
    if accession_key is not None:
        row_attrs["Accession"] = row_attrs.pop(accession_key)

    layers = {}
    for matkey in matrices:
        layers["" if matkey == "X" else matkey] = data.get_matrix(matkey).T

    file_attrs = {}
    for key, value in data.uns.items():
        if isinstance(value, str):
            file_attrs[key] = value

    import loompy
    loompy.create(output_file,
                  layers,
                  row_attrs,
                  col_attrs,
                  file_attrs=file_attrs)

    logger.info(f"{output_file} is written.")

Example #9

0

Show file

def load_10x_h5_file_v2(h5_in: h5py.Group) -> MultimodalData:
    """Load 10x v2 format matrix from hdf5 file

    Parameters
    ----------

    h5_in : h5py.Group
        An instance of h5py.Group class that is connected to a 10x v2 formatted hdf5 file.

    Returns
    -------

    A MultimodalData object containing (genome, UnimodalData) pair per genome.

    Examples
    --------
    >>> io.load_10x_h5_file_v2(h5_in)
    """
    data = MultimodalData()
    for genome in h5_in.keys():
        group = h5_in[genome]

        M, N = group["shape"][...]
        mat = csr_matrix(
            (
                group["data"][...],
                group["indices"][...],
                group["indptr"][...],
            ),
            shape=(N, M),
        )

        barcodes = group["barcodes"][...].astype(str)
        ids = group["genes"][...].astype(str)
        names = group["gene_names"][...].astype(str)

        unidata = UnimodalData({"barcodekey": barcodes}, {
            "featurekey": names,
            "featureid": ids
        }, {"X": mat}, {
            "modality": "rna",
            "genome": genome
        })
        unidata.separate_channels()

        data.add_data(unidata)

    return data

Example #10

0

Show file

File: preprocessing.py Project: michael-alperovich/pegasus

def log_norm(
    data: MultimodalData,
    norm_count: float = 1e5,
    backup_matrix: str = "raw.X",
) -> None:
    """Normalization, and then apply natural logarithm to the data.

    Parameters
    ----------
    data: ``pegasusio.MultimodalData``
        Use current selected modality in data, which should contain one RNA expression matrix.

    norm_count: ``int``, optional, default: ``1e5``.
        Total counts of one cell after normalization.

    backup_matrix: ``str``, optional, default: ``raw.X``.
        The key name of the backup count matrix, usually the raw counts.

    Returns
    -------
    ``None``

    Update ``data.X`` with count matrix after log-normalization. In addition, back up the original count matrix as ``backup_matrix``.

    In case of rerunning normalization while ``backup_matrix`` already exists, use ``backup_matrix`` instead of ``data.X`` for normalization.

    Examples
    --------
    >>> pg.log_norm(data)
    """
    if isinstance(data, MultimodalData):
        data = data.current_data()

    assert data.get_modality() == "rna"

    if backup_matrix not in data.list_keys():
        data.add_matrix(backup_matrix, data.X)
        data.X = data.X.astype(np.float32)  # force copy
    else:
        # The case of rerunning log_norm. Use backup matrix as source.
        data.X = data.get_matrix(backup_matrix).astype(
            np.float32)  # force copy
        logger.warning(
            "Rerun log-normalization. Use the raw counts in backup instead.")

    data.obs["scale"] = normalize_by_count(data.X, data.var["robust"].values,
                                           norm_count, True)
    data.uns["norm_count"] = norm_count

Example #11

0

Show file

def select_hvf_pegasus(data: MultimodalData,
                       consider_batch: bool,
                       n_top: int = 2000,
                       span: float = 0.02) -> None:
    """ Select highly variable features using the pegasus method
    """
    if "robust" not in data.var:
        raise ValueError(
            "Please run `identify_robust_genes` to identify robust genes")

    estimate_feature_statistics(data, consider_batch)

    robust_idx = data.var["robust"].values
    hvf_index = np.zeros(robust_idx.sum(), dtype=bool)

    mean = data.var.loc[robust_idx, "mean"]
    var = data.var.loc[robust_idx, "var"]

    span_value = span
    while True:
        lobj = fit_loess(mean, var, span=span_value, degree=2)
        if lobj is not None:
            break
        span_value += 0.01
    if span_value > span:
        logger.warning(
            "Leoss span is adjusted from {:.2f} to {:.2f} to avoid fitting errors."
            .format(span, span_value))

    rank1 = np.zeros(hvf_index.size, dtype=int)
    rank2 = np.zeros(hvf_index.size, dtype=int)

    delta = var - lobj.outputs.fitted_values
    fc = var / lobj.outputs.fitted_values

    rank1[np.argsort(delta)[::-1]] = range(hvf_index.size)
    rank2[np.argsort(fc)[::-1]] = range(hvf_index.size)
    hvf_rank = rank1 + rank2

    hvf_index[np.argsort(hvf_rank)[:n_top]] = True

    data.var["hvf_loess"] = 0.0
    data.var.loc[robust_idx, "hvf_loess"] = lobj.outputs.fitted_values

    data.var["hvf_rank"] = -1
    data.var.loc[robust_idx, "hvf_rank"] = hvf_rank
    data.var["highly_variable_features"] = False
    data.var.loc[robust_idx, "highly_variable_features"] = hvf_index

Example #12

0

Show file

def reduce_diffmap_to_3d(data: MultimodalData, random_state: int = 0) -> None:
    """Reduce high-dimensional Diffusion Map matrix to 3-dimentional.

    Parameters
    ----------
    data: ``pegasusio.MultimodalData``
        Annotated data matrix with rows for cells and columns for genes.


    random_state: ``int``, optional, default: ``0``
        Random seed set for reproducing results.

    Returns
    -------
    ``None``

    Update ``data.obsm``:
        * ``data.obsm["X_diffmap_pca"]``: 3D Diffusion Map matrix of data.

    Examples
    --------
    >>> pg.reduce_diffmap_to_3d(data)
    """

    if "X_diffmap" not in data.obsm.keys():
        raise ValueError("Please run diffmap first!")

    pca = PCA(n_components=3, random_state=random_state)
    data.obsm["X_diffmap_pca"] = pca.fit_transform(data.obsm["X_diffmap"])

Example #13

0

Show file

def estimate_adjustment_matrices(data: MultimodalData) -> bool:
    """ Estimate adjustment matrices
    """

    if "plus" in data.varm.keys() or "muls" in data.varm.keys():
        # This only happens if this is for subclustering. Thus do not calculate factors, using factors calculated from parent for batch correction.
        assert "plus" in data.varm.keys() and "muls" in data.varm.keys()
        return True

    if ("gmeans" not in data.varm) or ("gstds" not in data.varm):
        estimate_feature_statistics(data, True)

    if data.uns["Channels"].size == 1:
        logger.warning(
            "Warning: data only contains 1 channel. Batch correction disabled!"
        )
        return False

    nchannel = data.uns["Channels"].size

    plus = np.zeros((data.shape[1], nchannel))
    muls = np.zeros((data.shape[1], nchannel))

    ncells = data.uns["ncells"]
    means = data.varm["means"]
    partial_sum = data.varm["partial_sum"]
    gmeans = data.varm["gmeans"]
    gstds = data.varm["gstds"]
    c2gid = data.uns["c2gid"]
    for i in range(data.uns["Channels"].size):
        if ncells[i] > 1:
            muls[:, i] = (partial_sum[:, i] / (ncells[i] - 1.0)) ** 0.5
        outliers = muls[:, i] < 1e-6
        normals = np.logical_not(outliers)
        muls[outliers, i] = 1.0
        muls[normals, i] = gstds[normals, c2gid[i]] / muls[normals, i]
        plus[:, i] = gmeans[:, c2gid[i]] - muls[:, i] * means[:, i]

    data.varm["plus"] = plus
    data.varm["muls"] = muls

    return True

Example #14

0

Show file

def select_hvf_seurat(
    data: MultimodalData,
    consider_batch: bool,
    n_top: int,
    min_disp: float,
    max_disp: float,
    min_mean: float,
    max_mean: float,
    n_jobs: int,
) -> None:
    """ Select highly variable features using Seurat method.
    """

    robust_idx = data.var["robust"].values
    X = data.X[:, robust_idx]

    hvf_rank = (select_hvf_seurat_multi(
        X,
        data.uns["Channels"],
        data.obs["Channel"],
        n_top,
        n_jobs=n_jobs,
        min_disp=min_disp,
        max_disp=max_disp,
        min_mean=min_mean,
        max_mean=max_mean,
    ) if consider_batch else select_hvf_seurat_single(
        X,
        n_top=n_top,
        min_disp=min_disp,
        max_disp=max_disp,
        min_mean=min_mean,
        max_mean=max_mean,
    ))

    hvf_index = hvf_rank >= 0

    data.var["hvf_rank"] = -1
    data.var.loc[robust_idx, "hvf_rank"] = hvf_rank
    data.var["highly_variable_features"] = False
    data.var.loc[robust_idx, "highly_variable_features"] = hvf_index

Example #15

0

Show file

def filter_data(data: MultimodalData, focus_list: List[str] = None) -> None:
    """ Filter data based on qc_metrics calculated in ``pg.qc_metrics``.

    Parameters
    ----------
    data: ``pegasusio.MultimodalData``
        Use current selected modality in data, which should contain one RNA expression matrix.
    focus_list: ``List[str]``, optional, default None
        UnimodalData objects with keys in focus_list were qc_metrics marked. Filter them and make sure other modalities' barcodes are consistent with filtered barcodes. If focus_list is None and self._selected's modality is "rna", focus_list = [self._selected]

    Returns
    -------
    ``None``

    Update ``data`` with cells after filtration.

    Examples
    --------
    >>> pg.filter_data(data)
    """
    data.filter_data(focus_list=focus_list, cache_passqc=True)

Example #16

0

Show file

File: text_utils.py Project: klarman-cell-observatory/pegasusio

def load_mtx_file(path: str, genome: str = None, modality: str = None) -> MultimodalData:
    """Load gene-count matrix from Market Matrix files (10x v2, v3 and HCA DCP formats)

    Parameters
    ----------

    path : `str`
        Path to mtx files. The directory implied by path should either contain matrix, feature and barcode information, or folders containing these information.
    genome : `str`, optional (default: None)
        Genome name of the matrix. If None, genome will be inferred from path.
    modality: `str`, optional (default: None)
        Modality, choosing from 'rna', 'citeseq', 'hashing', 'tcr', 'bcr', 'crispr' or 'atac'. If None, use 'rna' as default.

    Returns
    -------

    A MultimodalData object containing (genome, UnimodalData) pairs.

    Examples
    --------
    >>> io.load_mtx_file('example.mtx.gz', genome = 'mm10')
    """
    if not os.path.exists(path):
        raise FileNotFoundError(f"{path} does not exist!")

    orig_file = path
    if os.path.isdir(orig_file):
        path = orig_file.rstrip('/')
        file_name = _locate_mtx_file(path)
    else:
        if (not orig_file.endswith(".mtx")) and (not orig_file.endswith(".mtx.gz")):
            raise ValueError(f"File {orig_file} does not end with suffix .mtx or .mtx.gz!")
        path, file_name = os.path.split(orig_file)

    data = MultimodalData()

    if modality is None:
        modality = "rna"

    if file_name is not None:
        if genome is None:
            genome = "unknown"
        data.add_data(
            load_one_mtx_file(
                path,
                file_name,
                genome,
                modality
            ),
        )
    else:
        for dir_entry in os.scandir(path):
            if dir_entry.is_dir():
                file_name = _locate_mtx_file(dir_entry.path)
                if file_name is None:
                    raise ValueError(f"Folder {dir_entry.path} does not contain a mtx file!")
                dgenome, dmodality = _parse_dir_name(dir_entry.name, modality)
                data.add_data(load_one_mtx_file(dir_entry.path, file_name, dgenome, dmodality))

    return data

Example #17

0

Show file

File: preprocessing.py Project: klarman-cell-observatory/pegasus

def identify_robust_genes(data: MultimodalData,
                          percent_cells: float = 0.05) -> None:
    """ Identify robust genes as candidates for HVG selection and remove genes that are not expressed in any cells.

    Parameters
    ----------
    data: ``pegasusio.MultimodalData``
        Use current selected modality in data, which should contain one RNA expression matrix.
    percent_cells: ``float``, optional, default: ``0.05``
       Only assign genes to be ``robust`` that are expressed in at least ``percent_cells`` % of cells.

    Returns
    -------
    ``None``

    Update ``data.var``:

        * ``n_cells``: Total number of cells in which each gene is measured.
        * ``percent_cells``: Percent of cells in which each gene is measured.
        * ``robust``: Boolean type indicating if a gene is robust based on the QC metrics.
        * ``highly_variable_features``: Boolean type indicating if a gene is a highly variable feature. By default, set all robust genes as highly variable features.

    Examples
    --------
    >>> pg.identify_robust_genes(data, percent_cells = 0.05)
    """
    if isinstance(data, MultimodalData):
        data = data.current_data()

    prior_n = data.shape[1]

    if issparse(data.X):
        data.var["n_cells"] = data.X.getnnz(axis=0)
        data._inplace_subset_var(data.var["n_cells"] > 0)
        data.var["percent_cells"] = (data.var["n_cells"] / data.shape[0]) * 100
        data.var["robust"] = data.var["percent_cells"] >= percent_cells
    else:
        data.var["robust"] = True

    data.var["highly_variable_features"] = data.var[
        "robust"]  # default all robust genes are "highly" variable
    logger.info(
        f"After filtration, {data.shape[1]}/{prior_n} genes are kept. Among {data.shape[1]} genes, {data.var['robust'].sum()} genes are robust."
    )

Example #18

0

Show file

File: zarr_utils.py Project: klarman-cell-observatory/pegasusio

    def read_multimodal_data(self, attach_zarrobj = False) -> MultimodalData:
        """ Read MultimodalData
        """
        data = MultimodalData()
        for key, group in self.root.groups():
            unidata = self.read_unimodal_data(group)
            data.add_data(unidata)

        if self.root.attrs.get('_selected', None) is not None:
            data.select_data(self.root.attrs['_selected'])

        if attach_zarrobj:
            data._zarrobj = self

        return data

Example #19

0

Show file

File: doublet_detection.py Project: slowkow/pegasus

def mark_doublets(
    data: MultimodalData,
    demux_attr: Optional[str] = 'demux_type',
    dbl_clusts: Optional[str] = None,
) -> None:
    """Convert doublet prediction into doublet annotations that Pegasus can recognize. In addition, clusters in dbl_clusts will be marked as doublets.

    Must run ``infer_doublets`` first.

    Parameters
    ----------
    data: ``pegasusio.MultimodalData``
        Annotated data matrix with rows for cells and columns for genes.

    demux_attr: ``str``, optional, default: ``demux_type``
        Attribute indicating singlets/doublets that Pegasus can recognize. Currently this is 'demux_type', which is also used for hashing.

    dbl_clusts: ``str``, optional, default: None
        Indicate which clusters should be marked as all doublets. It takes the format of 'clust:value1,value2,...', where 'clust' refers to the cluster attribute.

    Returns
    -------
    ``None``

    Update ``data.obs``:
        * ``data.obs[demux_attr]``: Singlet/doublet annotation.

    Examples
    --------
    >>> pg.mark_doublets(data, dbl_clusts='Annotation:B/T doublets')
    """
    codes = data.obs["pred_dbl"].values.astype(np.int32)
    if dbl_clusts is not None:
        cluster, value_str = dbl_clusts.split(':')
        idx = np.isin(data.obs[cluster], value_str.split(','))
        codes[idx] = 1
    data.obs[demux_attr] = pd.Categorical.from_codes(
        codes, categories=["singlet", "doublet"])

Example #20

0

Show file

def correct_batch_effects(data: MultimodalData,
                          keyword: str,
                          features: str = None) -> None:
    """ Apply calculated plus and muls to correct batch effects for a dense matrix
    """
    X = data.uns[keyword]
    m = X.shape[1]
    if features is not None:
        selected = data.var[features].values
        plus = data.varm["plus"][selected, :]
        muls = data.varm["muls"][selected, :]
    else:
        selected = np.ones(data.shape[1], dtype=bool)
        plus = data.varm["plus"]
        muls = data.varm["muls"]

    for i, channel in enumerate(data.uns["Channels"]):
        idx = np.isin(data.obs["Channel"], channel)
        if idx.sum() == 0:
            continue
        X[idx] = X[idx] * np.reshape(muls[:, i], newshape=(1, m)) + np.reshape(
            plus[:, i], newshape=(1, m))
    data.uns["_tmp_ls_" + str(features)] = True

Example #21

0

Show file

File: cyto_utils.py Project: slowkow/pegasusio

def load_fcs_file(input_fcs: str, genome: str = None) -> MultimodalData:
    """Load Cyto data from a FCS file, support v2.0, v3.0 and v3.1.

    Parameters
    ----------

    input_fcs : `str`
        The FCS file.
    genome : `str`, optional (default None)
        The genome reference. If None, use "unknown" instead.

    Returns
    -------

    A MultimodalData object containing a (genome, CytoData) pair.

    Examples
    --------
    >>> io.load_fcs_file('example.fcs', genome = 'GRCh38')
    """
    try:
        from pegasusio.cylib.io import read_fcs
    except ModuleNotFoundError:
        print("No module named 'pegasusio.cylib.io'")

    if not os.path.isfile(input_fcs):
        raise FileNotFoundError(f"File {input_fcs} does not exist!")
    feature_metadata, matrix, metadata = read_fcs(input_fcs)
    barcode_metadata = {"barcodekey": [f"event{i}" for i in range(1, matrix.shape[0] + 1)]}
    genome = "unknown" if genome is None else genome
    metadata["genome"] = genome
    metadata["modality"] = "cyto"

    cytodata = CytoData(barcode_metadata, feature_metadata, {"raw.data": matrix}, metadata)
    data = MultimodalData(cytodata)

    return data

Example #22

0

Show file

def log_norm(data: MultimodalData,
             norm_count: float = 1e5,
             backup_matrix: str = "raw.X") -> None:
    """Normalization, and then apply natural logarithm to the data.

    Parameters
    ----------
    data: ``pegasusio.MultimodalData``
        Use current selected modality in data, which should contain one RNA expression matrix.

    norm_count: ``int``, optional, default: ``1e5``.
        Total count of cells after normalization.

    backup_matrix: ``str``, optional, default: ``raw.X``.
        Where to back up the count matrix.

    Returns
    -------
    ``None``

    Update ``data.X`` with count matrix after log-normalization. In addition, back up the original count matrix as ``backup_matrix``.

    Examples
    --------
    >>> pg.log_norm(data)
    """
    if isinstance(data, MultimodalData):
        data = data.current_data()

    assert data.get_modality() == "rna"

    data.add_matrix(backup_matrix, data.X)
    data.X = data.X.astype(np.float32)  # force copy

    from pegasus.cylib.fast_utils import normalize_by_count
    data.obs["scale"] = normalize_by_count(data.X, data.var["robust"].values,
                                           norm_count, True)
    data.uns["norm_count"] = norm_count

Example #23

0

Show file

def calc_kBET(
    data: MultimodalData,
    attr: str,
    rep: str = "pca",
    K: int = 25,
    alpha: float = 0.05,
    n_jobs: int = -1,
    random_state: int = 0,
    temp_folder: str = None,
    use_cache: bool = True,
) -> Tuple[float, float, float]:
    """Calculate the kBET metric of the data regarding a specific sample attribute and embedding.

    The kBET metric is defined in [Büttner18]_, which measures if cells from different samples mix well in their local neighborhood.

    Parameters
    ----------
    data: ``pegasusio.MultimodalData``
        Annotated data matrix with rows for cells and columns for genes.

    attr: ``str``
        The sample attribute to consider. Must exist in ``data.obs``.

    rep: ``str``, optional, default: ``"pca"``
        The embedding representation to be used. The key ``'X_' + rep`` must exist in ``data.obsm``. By default, use PCA coordinates.

    K: ``int``, optional, default: ``25``
        Number of nearest neighbors, using L2 metric.

    alpha: ``float``, optional, default: ``0.05``
        Acceptance rate threshold. A cell is accepted if its kBET p-value is greater than or equal to ``alpha``.

    n_jobs: ``int``, optional, default: ``-1``
        Number of threads used. If ``-1``, use all physical CPU cores.

    random_state: ``int``, optional, default: ``0``
        Random seed set for reproducing results.

    temp_folder: ``str``, optional, default: ``None``
        Temporary folder for joblib execution.

    use_cache: ``bool``, optional, default: ``True``
        If use cache results for kNN.

    Returns
    -------
    stat_mean: ``float``
        Mean kBET chi-square statistic over all cells.

    pvalue_mean: ``float``
        Mean kBET p-value over all cells.

    accept_rate: ``float``
        kBET Acceptance rate of the sample.

    Examples
    --------
    >>> pg.calc_kBET(data, attr = 'Channel')

    >>> pg.calc_kBET(data, attr = 'Channel', rep = 'umap')
    """
    assert attr in data.obs
    if data.obs[attr].dtype.name != "category":
        data.obs[attr] = pd.Categorical(data.obs[attr])

    ideal_dist = (data.obs[attr].value_counts(normalize=True,
                                              sort=False).values
                  )  # ideal no batch effect distribution
    nsample = data.shape[0]
    nbatch = ideal_dist.size

    attr_values = data.obs[attr].values.copy()
    attr_values.categories = range(nbatch)

    indices, distances = get_neighbors(
        data,
        K=K,
        rep=rep,
        n_jobs=n_jobs,
        random_state=random_state,
        use_cache=use_cache,
    )
    knn_indices = np.concatenate(
        (np.arange(nsample).reshape(-1, 1), indices[:, 0:K - 1]),
        axis=1)  # add query as 1-nn

    # partition into chunks
    n_jobs = min(eff_n_jobs(n_jobs), nsample)
    starts = np.zeros(n_jobs + 1, dtype=int)
    quotient = nsample // n_jobs
    remainder = nsample % n_jobs
    for i in range(n_jobs):
        starts[i + 1] = starts[i] + quotient + (1 if i < remainder else 0)

    from joblib import Parallel, delayed, parallel_backend
    with parallel_backend("loky", inner_max_num_threads=1):
        kBET_arr = np.concatenate(
            Parallel(n_jobs=n_jobs, temp_folder=temp_folder)(
                delayed(calc_kBET_for_one_chunk)(knn_indices[
                    starts[i]:starts[i + 1], :], attr_values, ideal_dist, K)
                for i in range(n_jobs)))

    res = kBET_arr.mean(axis=0)
    stat_mean = res[0]
    pvalue_mean = res[1]
    accept_rate = (kBET_arr[:, 1] >= alpha).sum() / nsample

    return (stat_mean, pvalue_mean, accept_rate)

Example #24

0

Show file

def neighbors(
    data: MultimodalData,
    K: int = 100,
    rep: "str" = "pca",
    n_comps: int = None,
    n_jobs: int = -1,
    random_state: int = 0,
    full_speed: bool = False,
    use_cache: bool = True,
    dist: str = "l2",
) -> None:
    """Compute k nearest neighbors and affinity matrix, which will be used for diffmap and graph-based community detection algorithms.

    The kNN calculation uses `hnswlib <https://github.com/nmslib/hnswlib>`_ introduced by [Malkov16]_.

    Parameters
    ----------

    data: ``pegasusio.MultimodalData``
        Annotated data matrix with rows for cells and columns for genes.

    K: ``int``, optional, default: ``100``
        Number of neighbors, including the data point itself.

    rep: ``str``, optional, default: ``"pca"``
        Embedding representation used to calculate kNN. If ``None``, use ``data.X``; otherwise, keyword ``'X_' + rep`` must exist in ``data.obsm``.

    n_comps: `int`, optional (default: None)
        Number of components to be used in the `rep`. If n_comps == None, use all components; otherwise, use the minimum of n_comps and rep's dimensions.

    n_jobs: ``int``, optional, default: ``-1``
        Number of threads to use. If ``-1``, use all physical CPU cores.

    random_state: ``int``, optional, default: ``0``
        Random seed set for reproducing results.

    full_speed: ``bool``, optional, default: ``False``
        * If ``True``, use multiple threads in constructing ``hnsw`` index. However, the kNN results are not reproducible.
        * Otherwise, use only one thread to make sure results are reproducible.

    use_cache: ``bool``, optional, default: ``True``
        * If ``True`` and found cached knn results, Pegasus will use cached results and do not recompute.
        * Otherwise, compute kNN irrespective of caching status.

    dist: ``str``, optional (default: ``"l2"``)
        Distance metric to use. By default, use squared L2 distance. Available options, inner product ``"ip"`` or cosine similarity ``"cosine"``.

    Returns
    -------
    ``None``

    Update ``data.obsm``:
        * ``data.obsm[rep + "_knn_indices"]``: kNN index matrix. Row i is the index list of kNN of cell i (excluding itself), sorted from nearest to farthest.
        * ``data.obsm[rep + "_knn_distances"]``: kNN distance matrix. Row i is the distance list of kNN of cell i (excluding itselt), sorted from smallest to largest.

    Update ``data.obsp``:
        * ``data.obsp["W_" + rep]``: kNN graph of the data in terms of affinity matrix.

    Examples
    --------
    >>> pg.neighbors(data)
    """

    # calculate kNN
    rep = update_rep(rep)
    indices, distances = get_neighbors(
        data,
        K=K,
        rep=rep,
        n_comps=n_comps,
        n_jobs=n_jobs,
        random_state=random_state,
        full_speed=full_speed,
        use_cache=use_cache,
        dist=dist,
    )

    # calculate affinity matrix
    W = calculate_affinity_matrix(indices[:, 0:K - 1], distances[:, 0:K - 1])
    data.obsp["W_" + rep] = W
    # pop out jump method values
    data.uns.pop(f"{rep}_jump_values", None)
    data.uns.pop(f"{rep}_optimal_k", None)

Example #25

0

Show file

def get_neighbors(
    data: MultimodalData,
    K: int = 100,
    rep: str = "pca",
    n_comps: int = None,
    n_jobs: int = -1,
    random_state: int = 0,
    full_speed: bool = False,
    use_cache: bool = True,
    dist: str = "l2",
) -> Tuple[List[int], List[float]]:
    """Find K nearest neighbors for each data point and return the indices and distances arrays.

    Parameters
    ----------

    data : `pegasusio.MultimodalData`
        An AnnData object.
    K : `int`, optional (default: 100)
        Number of neighbors, including the data point itself.
    rep : `str`, optional (default: 'pca')
        Representation used to calculate kNN. If `None` use data.X
    n_comps: `int`, optional (default: None)
        Number of components to be used in the `rep`. If n_comps == None, use all components; otherwise, use the minimum of n_comps and rep's dimensions.
    n_jobs : `int`, optional (default: -1)
        Number of threads to use. -1 refers to using all physical CPU cores.
    random_state: `int`, optional (default: 0)
        Random seed for random number generator.
    full_speed: `bool`, optional (default: False)
        If full_speed, use multiple threads in constructing hnsw index. However, the kNN results are not reproducible. If not full_speed, use only one thread to make sure results are reproducible.
    use_cache: `bool`, optional (default: True)
        If use_cache and found cached knn results, will not recompute.
    dist: `str`, optional (default: 'l2')
        Distance metric to use. By default, use squared L2 distance. Available options, inner product 'ip' or cosine similarity 'cosine'.

    Returns
    -------

    kNN indices and distances arrays.

    Examples
    --------
    >>> indices, distances = tools.get_neighbors(data)
    """

    rep = update_rep(rep)
    indices_key = rep + "_knn_indices"
    distances_key = rep + "_knn_distances"

    if use_cache and knn_is_cached(data, indices_key, distances_key, K):
        indices = data.obsm[indices_key]
        distances = data.obsm[distances_key]
        logger.info("Found cached kNN results, no calculation is required.")
    else:
        indices, distances = calculate_nearest_neighbors(
            X_from_rep(data, rep, n_comps),
            K=K,
            n_jobs=eff_n_jobs(n_jobs),
            random_state=random_state,
            full_speed=full_speed,
            dist=dist,
        )
        data.obsm[indices_key] = indices
        data.register_attr(indices_key, "knn")
        data.obsm[distances_key] = distances
        data.register_attr(distances_key, "knn")

    return indices, distances

Example #26

0

Show file

def net_fle(
    data: MultimodalData,
    file_name: str = None,
    n_jobs: int = -1,
    rep: str = "diffmap",
    K: int = 50,
    full_speed: bool = False,
    target_change_per_node: float = 2.0,
    target_steps: int = 5000,
    is3d: bool = False,
    memory: int = 8,
    random_state: int = 0,
    select_frac: float = 0.1,
    select_K: int = 25,
    select_alpha: float = 1.0,
    net_alpha: float = 0.1,
    polish_target_steps: int = 1500,
    out_basis: str = "net_fle",
) -> None:
    """Construct Net-Force-directed (FLE) graph.

    Net-FLE is an approximated FLE graph using Deep Learning model to improve the speed.

    In specific, the deep model used is MLPRegressor_, the *scikit-learn* implementation of Multi-layer Perceptron regressor.

    See [Li20]_ for details.

    .. _MLPRegressor: https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPRegressor.html

    Parameters
    ----------
    data: ``pegasusio.MultimodalData``
        Annotated data matrix with rows for cells and columns for genes.

    file_name: ``str``, optional, default: ``None``
        Temporary file to store the coordinates as the input to forceatlas2. If ``None``, use ``tempfile.mkstemp`` to generate file name.

    n_jobs: ``int``, optional, default: ``-1``
        Number of threads to use. If ``-1``, use all available threads.

    rep: ``str``, optional, default: ``"diffmap"``
        Representation of data used for the calculation. By default, use Diffusion Map coordinates. If ``None``, use the count matrix ``data.X``.

    K: ``int``, optional, default: ``50``
        Number of nearest neighbors to be considered during the computation.

    full_speed: ``bool``, optional, default: ``False``
        * If ``True``, use multiple threads in constructing ``hnsw`` index. However, the kNN results are not reproducible.
        * Otherwise, use only one thread to make sure results are reproducible.

    target_change_per_node: ``float``, optional, default: ``2.0``
        Target change per node to stop ForceAtlas2.

    target_steps: ``int``, optional, default: ``5000``
        Maximum number of iterations before stopping the ForceAtlas2 algorithm.

    is3d: ``bool``, optional, default: ``False``
        If ``True``, calculate 3D force-directed layout.

    memory: ``int``, optional, default: ``8``
        Memory size in GB for the Java FA2 component. By default, use 8GB memory.

    random_state: ``int``, optional, default: ``0``
        Random seed set for reproducing results.

    select_frac: ``float``, optional, default: ``0.1``
        Down sampling fraction on the cells.

    select_K: ``int``, optional, default: ``25``
        Number of neighbors to be used to estimate local density for each data point for down sampling.

    select_alpha: ``float``, optional, default: ``1.0``
        Weight the down sample to be proportional to ``radius ** select_alpha``.

    net_alpha: ``float``, optional, default: ``0.1``
        L2 penalty (regularization term) parameter of the deep regressor.

    polish_target_steps: ``int``, optional, default: ``1500``
        After running the deep regressor to predict new coordinate, Number of ForceAtlas2 iterations.

    out_basis: ``str``, optional, default: ``"net_fle"``
        Key name for calculated FLE coordinates to store.

    Returns
    -------
    ``None``

    Update ``data.obsm``:
        * ``data.obsm['X_' + out_basis]``: Net FLE coordinates of the data.

    Update ``data.obs``:
        * ``data.obs['ds_selected']``: Boolean array to indicate which cells are selected during the down sampling phase.

    Examples
    --------
    >>> pg.net_fle(data)
    """

    if file_name is None:
        if file_name is None:
            import tempfile

            _, file_name = tempfile.mkstemp()

    n_jobs = effective_n_jobs(n_jobs)
    rep = update_rep(rep)

    if ("W_" + rep) not in data.uns:
        neighbors(
            data,
            K=K,
            rep=rep,
            n_jobs=n_jobs,
            random_state=random_state,
            full_speed=full_speed,
        )

    indices_key = rep + "_knn_indices"
    distances_key = rep + "_knn_distances"

    if not knn_is_cached(data, indices_key, distances_key, select_K):
        raise ValueError("Please run neighbors first!")

    selected = select_cells(
        data.uns[distances_key],
        select_frac,
        K=select_K,
        alpha=select_alpha,
        random_state=random_state,
    )

    X_full = X_from_rep(data, rep)
    X = X_full[selected, :]

    ds_indices_key = "ds_" + rep + "_knn_indices"
    ds_distances_key = "ds_" + rep + "_knn_distances"
    indices, distances = calculate_nearest_neighbors(X,
                                                     K=K,
                                                     n_jobs=n_jobs,
                                                     random_state=random_state,
                                                     full_speed=full_speed)
    data.uns[ds_indices_key] = indices
    data.uns[ds_distances_key] = distances

    W = calculate_affinity_matrix(indices, distances)

    X_fle = calc_force_directed_layout(
        W,
        file_name + ".small",
        n_jobs,
        target_change_per_node,
        target_steps,
        is3d,
        memory,
        random_state,
    )

    data.uns["X_" + out_basis + "_small"] = X_fle
    data.obs["ds_diffmap_selected"] = selected

    n_components = 2 if not is3d else 3
    Y_init = np.zeros((data.shape[0], n_components), dtype=np.float64)
    Y_init[selected, :] = X_fle
    Y_init[~selected, :] = net_train_and_predict(X,
                                                 X_fle,
                                                 X_full[~selected, :],
                                                 net_alpha,
                                                 random_state,
                                                 verbose=True)

    data.obsm["X_" + out_basis + "_pred"] = Y_init

    data.obsm["X_" + out_basis] = calc_force_directed_layout(
        W_from_rep(data, rep),
        file_name,
        n_jobs,
        target_change_per_node,
        polish_target_steps,
        is3d,
        memory,
        random_state,
        init=Y_init,
    )

Example #27

0

Show file

def net_umap(
    data: MultimodalData,
    rep: str = "pca",
    n_jobs: int = -1,
    n_components: int = 2,
    n_neighbors: int = 15,
    min_dist: float = 0.5,
    spread: float = 1.0,
    random_state: int = 0,
    select_frac: float = 0.1,
    select_K: int = 25,
    select_alpha: float = 1.0,
    full_speed: bool = False,
    net_alpha: float = 0.1,
    polish_learning_rate: float = 10.0,
    polish_n_epochs: int = 30,
    out_basis: str = "net_umap",
) -> None:
    """Calculate Net-UMAP embedding of cells.

    Net-UMAP is an approximated UMAP embedding using Deep Learning model to improve the speed.

    In specific, the deep model used is MLPRegressor_, the *scikit-learn* implementation of Multi-layer Perceptron regressor.

    See [Li20]_ for details.

    .. _MLPRegressor: https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPRegressor.html

    Parameters
    ----------
    data: ``pegasusio.MultimodalData``
        Annotated data matrix with rows for cells and columns for genes.

    rep: ``str``, optional, default: ``"pca"``
        Representation of data used for the calculation. By default, use PCA coordinates. If ``None``, use the count matrix ``data.X``.

    n_components: ``int``, optional, default: ``2``
        Dimension of calculated UMAP coordinates. By default, generate 2-dimensional data for 2D visualization.

    n_neighbors: ``int``, optional, default: ``15``
        Number of nearest neighbors considered during the computation.

    min_dist: ``float``, optional, default: ``0.5``
        The effective minimum distance between embedded data points.

    spread: ``float``, optional, default: ``1.0``
        The effective scale of embedded data points.

    random_state: ``int``, optional, default: ``0``
        Random seed set for reproducing results.

    select_frac: ``float``, optional, default: ``0.1``
        Down sampling fraction on the cells.

    select_K: ``int``, optional, default: ``25``
        Number of neighbors to be used to estimate local density for each data point for down sampling.

    select_alpha: ``float``, optional, default: ``1.0``
        Weight the down sample to be proportional to ``radius ** select_alpha``.

    full_speed: ``bool``, optional, default: ``False``
        * If ``True``, use multiple threads in constructing ``hnsw`` index. However, the kNN results are not reproducible.
        * Otherwise, use only one thread to make sure results are reproducible.

    net_alpha: ``float``, optional, default: ``0.1``
        L2 penalty (regularization term) parameter of the deep regressor.

    polish_learning_frac: ``float``, optional, default: ``10.0``
        After running the deep regressor to predict new coordinates, use ``polish_learning_frac`` * ``n_obs`` as the learning rate to polish the coordinates.

    polish_n_iter: ``int``, optional, default: ``30``
        Number of iterations for polishing UMAP run.

    out_basis: ``str``, optional, default: ``"net_umap"``
        Key name for calculated UMAP coordinates to store.

    Returns
    -------
    ``None``

    Update ``data.obsm``:
        * ``data.obsm['X_' + out_basis]``: Net UMAP coordinates of the data.

    Update ``data.obs``:
        * ``data.obs['ds_selected']``: Boolean array to indicate which cells are selected during the down sampling phase.

    Examples
    --------
    >>> pg.net_umap(data)
    """

    rep = update_rep(rep)
    indices_key = rep + "_knn_indices"
    distances_key = rep + "_knn_distances"

    if not knn_is_cached(data, indices_key, distances_key, select_K):
        raise ValueError("Please run neighbors first!")

    n_jobs = effective_n_jobs(n_jobs)

    selected = select_cells(
        data.uns[distances_key],
        select_frac,
        K=select_K,
        alpha=select_alpha,
        random_state=random_state,
    )
    X_full = X_from_rep(data, rep)
    X = X_full[selected, :]

    ds_indices_key = "ds_" + rep + "_knn_indices"  # ds refers to down-sampling
    ds_distances_key = "ds_" + rep + "_knn_distances"
    indices, distances = calculate_nearest_neighbors(
        X,
        K=n_neighbors,
        n_jobs=n_jobs,
        random_state=random_state,
        full_speed=full_speed,
    )
    data.uns[ds_indices_key] = indices
    data.uns[ds_distances_key] = distances

    knn_indices = np.insert(data.uns[ds_indices_key][:, 0:n_neighbors - 1],
                            0,
                            range(X.shape[0]),
                            axis=1)
    knn_dists = np.insert(data.uns[ds_distances_key][:, 0:n_neighbors - 1],
                          0,
                          0.0,
                          axis=1)

    X_umap = calc_umap(
        X,
        n_components,
        n_neighbors,
        min_dist,
        spread,
        random_state,
        knn_indices=knn_indices,
        knn_dists=knn_dists,
    )

    data.uns["X_" + out_basis + "_small"] = X_umap
    data.obs["ds_selected"] = selected

    Y_init = np.zeros((data.shape[0], n_components), dtype=np.float64)
    Y_init[selected, :] = X_umap
    Y_init[~selected, :] = net_train_and_predict(X,
                                                 X_umap,
                                                 X_full[~selected, :],
                                                 net_alpha,
                                                 random_state,
                                                 verbose=True)

    data.obsm["X_" + out_basis + "_pred"] = Y_init

    knn_indices = np.insert(data.uns[indices_key][:, 0:n_neighbors - 1],
                            0,
                            range(data.shape[0]),
                            axis=1)
    knn_dists = np.insert(data.uns[distances_key][:, 0:n_neighbors - 1],
                          0,
                          0.0,
                          axis=1)

    data.obsm["X_" + out_basis] = calc_umap(
        X_full,
        n_components,
        n_neighbors,
        min_dist,
        spread,
        random_state,
        init=Y_init,
        n_epochs=polish_n_epochs,
        learning_rate=polish_learning_rate,
        knn_indices=knn_indices,
        knn_dists=knn_dists,
    )

Example #28

0

Show file

def net_tsne(
    data: MultimodalData,
    rep: str = "pca",
    n_jobs: int = -1,
    n_components: int = 2,
    perplexity: float = 30,
    early_exaggeration: int = 12,
    learning_rate: float = 1000,
    random_state: int = 0,
    select_frac: float = 0.1,
    select_K: int = 25,
    select_alpha: float = 1.0,
    net_alpha: float = 0.1,
    polish_learning_frac: float = 0.33,
    polish_n_iter: int = 150,
    out_basis: str = "net_tsne",
) -> None:
    """Calculate Net-tSNE embedding of cells.

    Net-tSNE is an approximated tSNE embedding using Deep Learning model to improve the calculation speed.

    In specific, the deep model used is MLPRegressor_, the *scikit-learn* implementation of Multi-layer Perceptron regressor.

    See [Li20]_ for details.

    .. _MLPRegressor: https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPRegressor.html

    Parameters
    ----------
    data: ``pegasusio.MultimodalData``
        Annotated data matrix with rows for cells (``n_obs``) and columns for genes (``n_feature``).

    rep: ``str``, optional, default: ``"pca"``
        Representation of data used for the calculation. By default, use PCA coordinates. If ``None``, use the count matrix ``data.X``.

    n_jobs: ``int``, optional, default: ``-1``
        Number of threads to use. If ``-1``, use all available threads.

    n_components: ``int``, optional, default: ``2``
        Dimension of calculated tSNE coordinates. By default, generate 2-dimensional data for 2D visualization.

    perplexity: ``float``, optional, default: ``30``
        The perplexity is related to the number of nearest neighbors used in other manifold learning algorithms. Larger datasets usually require a larger perplexity.

    early_exaggeration: ``int``, optional, default: ``12``
        Controls how tight natural clusters in the original space are in the embedded space, and how much space will be between them.

    learning_rate: ``float``, optional, default: ``1000``
        The learning rate can be a critical parameter, which should be between 100 and 1000.

    random_state: ``int``, optional, default: ``0``
        Random seed set for reproducing results.

    select_frac: ``float``, optional, default: ``0.1``
        Down sampling fraction on the cells.

    select_K: ``int``, optional, default: ``25``
        Number of neighbors to be used to estimate local density for each data point for down sampling.

    select_alpha: ``float``, optional, default: ``1.0``
        Weight the down sample to be proportional to ``radius ** select_alpha``.

    net_alpha: ``float``, optional, default: ``0.1``
        L2 penalty (regularization term) parameter of the deep regressor.

    polish_learning_frac: ``float``, optional, default: ``0.33``
        After running the deep regressor to predict new coordinates, use ``polish_learning_frac`` * ``n_obs`` as the learning rate to polish the coordinates.

    polish_n_iter: ``int``, optional, default: ``150``
        Number of iterations for polishing tSNE run.

    out_basis: ``str``, optional, default: ``"net_tsne"``
        Key name for the approximated tSNE coordinates calculated.

    Returns
    -------
    ``None``

    Update ``data.obsm``:
        * ``data.obsm['X_' + out_basis]``: Net tSNE coordinates of the data.

    Update ``data.obs``:
        * ``data.obs['ds_selected']``: Boolean array to indicate which cells are selected during the down sampling phase.

    Examples
    --------
    >>> pg.net_tsne(data)
    """

    rep = update_rep(rep)
    indices_key = rep + "_knn_indices"
    distances_key = rep + "_knn_distances"

    if not knn_is_cached(data, indices_key, distances_key, select_K):
        raise ValueError("Please run neighbors first!")

    n_jobs = effective_n_jobs(n_jobs)

    selected = select_cells(
        data.uns[distances_key],
        select_frac,
        K=select_K,
        alpha=select_alpha,
        random_state=random_state,
    )

    X_full = X_from_rep(data, rep)
    X = X_full[selected, :]
    X_tsne = calc_tsne(
        X,
        n_jobs,
        n_components,
        perplexity,
        early_exaggeration,
        learning_rate,
        random_state,
    )

    data.uns["X_" + out_basis + "_small"] = X_tsne
    data.obs["ds_selected"] = selected

    Y_init = np.zeros((data.shape[0], n_components), dtype=np.float64)
    Y_init[selected, :] = X_tsne
    Y_init[~selected, :] = net_train_and_predict(X,
                                                 X_tsne,
                                                 X_full[~selected, :],
                                                 net_alpha,
                                                 random_state,
                                                 verbose=True)

    data.obsm["X_" + out_basis + "_pred"] = Y_init

    polish_learning_rate = polish_learning_frac * data.shape[0]
    data.obsm["X_" + out_basis] = calc_tsne(
        X_full,
        n_jobs,
        n_components,
        perplexity,
        early_exaggeration,
        polish_learning_rate,
        random_state,
        init=Y_init,
        n_iter=polish_n_iter,
        n_iter_early_exag=0,
    )

Example #29

0

Show file

def fle(
    data: MultimodalData,
    file_name: str = None,
    n_jobs: int = -1,
    rep: str = "diffmap",
    K: int = 50,
    full_speed: bool = False,
    target_change_per_node: float = 2.0,
    target_steps: int = 5000,
    is3d: bool = False,
    memory: int = 8,
    random_state: int = 0,
    out_basis: str = "fle",
) -> None:
    """Construct the Force-directed (FLE) graph.

    This implementation uses forceatlas2-python_ package, which is a Python wrapper of ForceAtlas2_.

    See [Jacomy14]_ for details on FLE.

    .. _forceatlas2-python: https://github.com/klarman-cell-observatory/forceatlas2-python
    .. _ForceAtlas2: https://github.com/klarman-cell-observatory/forceatlas2

    Parameters
    ----------
    data: ``pegasusio.MultimodalData``
        Annotated data matrix with rows for cells and columns for genes.

    file_name: ``str``, optional, default: ``None``
        Temporary file to store the coordinates as the input to forceatlas2. If ``None``, use ``tempfile.mkstemp`` to generate file name.

    n_jobs: ``int``, optional, default: ``-1``
        Number of threads to use. If ``-1``, use all available threads.

    rep: ``str``, optional, default: ``"diffmap"``
        Representation of data used for the calculation. By default, use Diffusion Map coordinates. If ``None``, use the count matrix ``data.X``.

    K: ``int``, optional, default: ``50``
        Number of nearest neighbors to be considered during the computation.

    full_speed: ``bool``, optional, default: ``False``
        * If ``True``, use multiple threads in constructing ``hnsw`` index. However, the kNN results are not reproducible.
        * Otherwise, use only one thread to make sure results are reproducible.

    target_change_per_node: ``float``, optional, default: ``2.0``
        Target change per node to stop ForceAtlas2.

    target_steps: ``int``, optional, default: ``5000``
        Maximum number of iterations before stopping the ForceAtlas2 algorithm.

    is3d: ``bool``, optional, default: ``False``
        If ``True``, calculate 3D force-directed layout.

    memory: ``int``, optional, default: ``8``
        Memory size in GB for the Java FA2 component. By default, use 8GB memory.

    random_state: ``int``, optional, default: ``0``
        Random seed set for reproducing results.

    out_basis: ``str``, optional, default: ``"fle"``
        Key name for calculated FLE coordinates to store.

    Returns
    -------
    ``None``

    Update ``data.obsm``:
        * ``data.obsm['X_' + out_basis]``: FLE coordinates of the data.

    Examples
    --------
    >>> pg.fle(data)
    """

    if file_name is None:
        import tempfile

        _, file_name = tempfile.mkstemp()

    n_jobs = effective_n_jobs(n_jobs)
    rep = update_rep(rep)

    if ("W_" + rep) not in data.uns:
        neighbors(
            data,
            K=K,
            rep=rep,
            n_jobs=n_jobs,
            random_state=random_state,
            full_speed=full_speed,
        )

    data.obsm["X_" + out_basis] = calc_force_directed_layout(
        W_from_rep(data, rep),
        file_name,
        n_jobs,
        target_change_per_node,
        target_steps,
        is3d,
        memory,
        random_state,
    )

Example #30

0

Show file

def umap(
    data: MultimodalData,
    rep: str = "pca",
    n_components: int = 2,
    n_neighbors: int = 15,
    min_dist: float = 0.5,
    spread: float = 1.0,
    random_state: int = 0,
    out_basis: str = "umap",
) -> None:
    """Calculate UMAP embedding of cells.

    This function uses umap-learn_ package. See [McInnes18]_ for details on UMAP.

    .. _umap-learn: https://github.com/lmcinnes/umap

    Parameters
    ----------
    data: ``pegasusio.MultimodalData``
        Annotated data matrix with rows for cells and columns for genes.

    rep: ``str``, optional, default: ``"pca"``
        Representation of data used for the calculation. By default, use PCA coordinates. If ``None``, use the count matrix ``data.X``.

    n_components: ``int``, optional, default: ``2``
        Dimension of calculated UMAP coordinates. By default, generate 2-dimensional data for 2D visualization.

    n_neighbors: ``int``, optional, default: ``15``
        Number of nearest neighbors considered during the computation.

    min_dist: ``float``, optional, default: ``0.5``
        The effective minimum distance between embedded data points.

    spread: ``float``, optional, default: ``1.0``
        The effective scale of embedded data points.

    random_state: ``int``, optional, default: ``0``
        Random seed set for reproducing results.

    out_basis: ``str``, optional, default: ``"umap"``
        Key name for calculated UMAP coordinates to store.

    Returns
    -------
    ``None``

    Update ``data.obsm``:
        * ``data.obsm['X_' + out_basis]``: UMAP coordinates of the data.

    Examples
    --------
    >>> pg.umap(data)
    """
    start = time.time()

    rep = update_rep(rep)
    indices_key = rep + "_knn_indices"
    distances_key = rep + "_knn_distances"

    X = X_from_rep(data, rep)
    if not knn_is_cached(data, indices_key, distances_key, n_neighbors):
        if indices_key in data.uns and n_neighbors > data.uns[
                indices_key].shape[1] + 1:
            logger.warning(
                f"Reduce K for neighbors in UMAP from {n_neighbors} to {data.uns[indices_key].shape[1] + 1}"
            )
            n_neighbors = data.uns[indices_key].shape[1] + 1
        else:
            raise ValueError("Please run neighbors first!")

    knn_indices = np.insert(data.uns[indices_key][:, 0:n_neighbors - 1],
                            0,
                            range(data.shape[0]),
                            axis=1)
    knn_dists = np.insert(data.uns[distances_key][:, 0:n_neighbors - 1],
                          0,
                          0.0,
                          axis=1)
    data.obsm["X_" + out_basis] = calc_umap(
        X,
        n_components,
        n_neighbors,
        min_dist,
        spread,
        random_state,
        knn_indices=knn_indices,
        knn_dists=knn_dists,
    )

    end = time.time()
    logger.info("UMAP is calculated. Time spent = {:.2f}s.".format(end -
                                                                   start))