Example #1
0
def write_loom_file(data: MultimodalData, output_file: str) -> None:
    """Write a single-modality MultimodalData object to a loom file.

    Parameters
    ----------
    data: ``MultimodalData``
        Must contain exactly one modality. That modality is selected via
        ``data.select_data`` (a side effect on ``data``) and must contain a
        matrix keyed ``"X"``, which becomes the main loom layer.

    output_file: ``str``
        Destination passed to ``loompy.create``.

    Returns
    -------
    ``None``

    Raises
    ------
    ValueError
        If ``data`` holds no modality, holds more than one modality, has no
        matrices in the selected modality, or lacks the ``"X"`` matrix.
    """
    keys = data.list_data()
    if len(keys) == 0:
        # Guard explicitly; otherwise keys[0] below fails with a bare IndexError.
        raise ValueError("Data contain no modality to write to a loom file!")
    if len(keys) > 1:
        raise ValueError(
            f"Data contain multiple modalities: {','.join(keys)}!")
    data.select_data(keys[0])

    matrices = data.list_keys()
    # The emptiness check must come before the "X" check, otherwise it can
    # never fire. Exceptions (not assert) are used so that validation still
    # happens when Python runs with -O.
    if len(matrices) == 0:
        raise ValueError("Could not write empty matrix to a loom file!")
    if "X" not in matrices:
        raise ValueError(
            "Could not write a loom file without the main matrix 'X'!")

    def _process_attrs(key_name: str, attrs: pd.DataFrame,
                       attrs_multi: dict) -> Dict[str, object]:
        """Flatten a data frame plus its multi-dimensional annotations into
        one attribute dict; the frame's index is stored under ``key_name``."""
        res_dict = {key_name: attrs.index.values}
        for key in attrs.columns:
            res_dict[key] = np.array(attrs[key].values)
        for key, value in attrs_multi.items():
            if value.ndim > 1:  # value.ndim == 1 refers to np.recarray, which will not be written to a loom file.
                # Single-column 2-D arrays are squeezed to 1-D.
                res_dict[key] = value if value.shape[1] > 1 else value[:, 0]
        return res_dict

    row_attrs = _process_attrs("Gene", data.var, data.varm)
    col_attrs = _process_attrs("CellID", data.obs, data.obsm)

    # Expose the feature ID column under the name "Accession", preferring
    # "featureid" over "gene_ids" when both exist.
    accession_key = "featureid" if "featureid" in row_attrs else (
        "gene_ids" if "gene_ids" in row_attrs else None)
    if accession_key is not None:
        row_attrs["Accession"] = row_attrs.pop(accession_key)

    # "X" is stored under the empty layer name. Matrices are transposed so
    # that rows line up with row_attrs (built from data.var) -- assumes
    # get_matrix returns obs x var; TODO confirm.
    layers = {
        "" if matkey == "X" else matkey: data.get_matrix(matkey).T
        for matkey in matrices
    }

    # Only string-valued entries of data.uns are written as file attributes.
    file_attrs = {
        key: value
        for key, value in data.uns.items() if isinstance(value, str)
    }

    import loompy
    loompy.create(output_file,
                  layers,
                  row_attrs,
                  col_attrs,
                  file_attrs=file_attrs)

    logger.info(f"{output_file} is written.")
def log_norm(
    data: MultimodalData,
    norm_count: float = 1e5,
    backup_matrix: str = "raw.X",
) -> None:
    """Normalize counts per cell and apply the natural logarithm.

    Parameters
    ----------
    data: ``pegasusio.MultimodalData``
        The currently selected modality is used; it must be of modality ``"rna"``.

    norm_count: ``float``, optional, default: ``1e5``.
        Total counts of one cell after normalization.

    backup_matrix: ``str``, optional, default: ``raw.X``.
        Key under which the original count matrix (usually the raw counts) is kept.

    Returns
    -------
    ``None``

    ``data.X`` is replaced by the log-normalized matrix, and the matrix it held before is stored under ``backup_matrix``.
    The value returned by ``normalize_by_count`` is saved in ``data.obs["scale"]`` and ``norm_count`` in ``data.uns["norm_count"]``.

    If ``backup_matrix`` already exists (i.e. normalization is being rerun), the backup is used as the source instead of the current ``data.X``.

    Examples
    --------
    >>> pg.log_norm(data)
    """
    if isinstance(data, MultimodalData):
        data = data.current_data()

    assert data.get_modality() == "rna"

    is_rerun = backup_matrix in data.list_keys()
    if is_rerun:
        # Rerunning log_norm: start again from the backed-up counts.
        counts = data.get_matrix(backup_matrix)
    else:
        # First run: keep the current matrix as the backup before replacing X.
        data.add_matrix(backup_matrix, data.X)
        counts = data.X

    # astype yields a new float32 matrix, so the backup is not shared with X.
    data.X = counts.astype(np.float32)

    if is_rerun:
        logger.warning(
            "Rerun log-normalization. Use the raw counts in backup instead.")

    # NOTE(review): normalize_by_count presumably modifies data.X in place and
    # the trailing True presumably enables the log transform -- confirm
    # against its definition.
    robust_flags = data.var["robust"].values
    scale = normalize_by_count(data.X, robust_flags, norm_count, True)
    data.obs["scale"] = scale
    data.uns["norm_count"] = norm_count