Python MultimodalData.MultimodalDataの例、pegasusio.MultimodalData.MultimodalData Pythonの例

コード例 #1

0

ファイルを表示

ファイル: aggr_data.py プロジェクト: slowkow/pegasusio

 def aggregate(self) -> MultimodalData:
     """Merge every pending entry of ``self.aggr`` into one MultimodalData.

     Each key currently in ``self.aggr`` is popped, its value is passed to
     ``self._aggregate_unidata`` and the result is added to the returned
     object.
     """
     merged = MultimodalData()
     # Snapshot the keys first: entries are popped while we walk them.
     pending = list(self.aggr)
     for name in pending:
         entry = self.aggr.pop(name)
         merged.add_data(self._aggregate_unidata(entry))
     return merged

コード例 #2

0

ファイルを表示

ファイル: text_utils.py プロジェクト: klarman-cell-observatory/pegasusio

def load_mtx_file(path: str, genome: str = None, modality: str = None) -> MultimodalData:
    """Read gene-count matrices stored as Market Matrix files (10x v2, v3 and HCA DCP formats).

    Parameters
    ----------

    path : `str`
        A ``.mtx`` / ``.mtx.gz`` file, a directory that holds the matrix, feature and barcode files, or a directory whose sub-folders hold them.
    genome : `str`, optional (default: None)
        Genome name used when a single matrix is loaded; "unknown" is used if None. When sub-folders are loaded, each genome comes from ``_parse_dir_name`` instead.
    modality: `str`, optional (default: None)
        Modality, choosing from 'rna', 'citeseq', 'hashing', 'tcr', 'bcr', 'crispr' or 'atac'. If None, use 'rna' as default.

    Returns
    -------

    A MultimodalData object holding what ``load_one_mtx_file`` returned for each matrix.

    Raises
    ------
    FileNotFoundError
        If path does not exist.
    ValueError
        If path is a file without a .mtx / .mtx.gz suffix, or a sub-folder contains no mtx file.

    Examples
    --------
    >>> io.load_mtx_file('example.mtx.gz', genome = 'mm10')
    """
    if not os.path.exists(path):
        raise FileNotFoundError(f"{path} does not exist!")

    source = path
    if os.path.isdir(source):
        path = source.rstrip('/')
        mtx_name = _locate_mtx_file(path)
    elif source.endswith((".mtx", ".mtx.gz")):
        path, mtx_name = os.path.split(source)
    else:
        raise ValueError(f"File {source} does not end with suffix .mtx or .mtx.gz!")

    result = MultimodalData()
    modality = "rna" if modality is None else modality

    if mtx_name is not None:
        # A single matrix was given directly or found right inside the directory.
        single_genome = "unknown" if genome is None else genome
        result.add_data(load_one_mtx_file(path, mtx_name, single_genome, modality))
        return result

    # No matrix at the top level: every sub-folder must provide one.
    for entry in os.scandir(path):
        if not entry.is_dir():
            continue
        sub_mtx = _locate_mtx_file(entry.path)
        if sub_mtx is None:
            raise ValueError(f"Folder {entry.path} does not contain a mtx file!")
        sub_genome, sub_modality = _parse_dir_name(entry.name, modality)
        result.add_data(load_one_mtx_file(entry.path, sub_mtx, sub_genome, sub_modality))

    return result

コード例 #3

0

ファイルを表示

ファイル: zarr_utils.py プロジェクト: klarman-cell-observatory/pegasusio

    def read_multimodal_data(self, attach_zarrobj = False) -> MultimodalData:
        """Build a MultimodalData from the groups stored under ``self.root``.

        Every group yielded by ``self.root.groups()`` is read with
        ``self.read_unimodal_data`` and added to the result. If the root
        attributes carry a non-None ``'_selected'`` entry, it is passed to
        ``select_data``. When ``attach_zarrobj`` is true, this object is
        stored on the result as ``_zarrobj``.
        """
        result = MultimodalData()
        for _name, grp in self.root.groups():
            result.add_data(self.read_unimodal_data(grp))

        selected = self.root.attrs.get('_selected', None)
        if selected is not None:
            result.select_data(selected)

        if attach_zarrobj:
            result._zarrobj = self

        return result

コード例 #4

0

ファイルを表示

def load_10x_h5_file_v2(h5_in: h5py.Group) -> MultimodalData:
    """Read a 10x v2 formatted hdf5 handle, one UnimodalData per top-level key.

    Parameters
    ----------

    h5_in : h5py.Group
        An instance of h5py.Group class that is connected to a 10x v2 formatted hdf5 file.

    Returns
    -------

    A MultimodalData object containing (genome, UnimodalData) pair per genome.

    Examples
    --------
    >>> io.load_10x_h5_file_v2(h5_in)
    """
    result = MultimodalData()
    for genome in h5_in.keys():
        grp = h5_in[genome]

        # The stored shape is swapped when the arrays are interpreted as a CSR matrix.
        first_dim, second_dim = grp["shape"][...]
        sparse_parts = (
            grp["data"][...],
            grp["indices"][...],
            grp["indptr"][...],
        )
        counts = csr_matrix(sparse_parts, shape=(second_dim, first_dim))

        barcode_metadata = {"barcodekey": grp["barcodes"][...].astype(str)}
        gene_ids = grp["genes"][...].astype(str)
        gene_names = grp["gene_names"][...].astype(str)
        feature_metadata = {"featurekey": gene_names, "featureid": gene_ids}
        info = {"modality": "rna", "genome": genome}

        unidata = UnimodalData(barcode_metadata, feature_metadata, {"X": counts}, info)
        unidata.separate_channels()
        result.add_data(unidata)

    return result

コード例 #5

0

ファイルを表示

ファイル: cyto_utils.py プロジェクト: slowkow/pegasusio

def load_fcs_file(input_fcs: str, genome: str = None) -> MultimodalData:
    """Load Cyto data from a FCS file, support v2.0, v3.0 and v3.1.

    Parameters
    ----------

    input_fcs : `str`
        The FCS file.
    genome : `str`, optional (default None)
        The genome reference. If None, use "unknown" instead.

    Returns
    -------

    A MultimodalData object containing a (genome, CytoData) pair.

    Raises
    ------
    FileNotFoundError
        If input_fcs is not an existing file.
    ModuleNotFoundError
        If ``pegasusio.cylib.io`` cannot be imported.

    Examples
    --------
    >>> io.load_fcs_file('example.fcs', genome = 'GRCh38')
    """
    if not os.path.isfile(input_fcs):
        raise FileNotFoundError(f"File {input_fcs} does not exist!")

    try:
        from pegasusio.cylib.io import read_fcs
    except ModuleNotFoundError as err:
        # Fail with a clear error here; printing and continuing would only
        # crash later on the unbound name ``read_fcs``.
        raise ModuleNotFoundError("No module named 'pegasusio.cylib.io'") from err

    feature_metadata, matrix, metadata = read_fcs(input_fcs)
    # One synthetic key per matrix row: event1, event2, ...
    barcode_metadata = {"barcodekey": [f"event{i}" for i in range(1, matrix.shape[0] + 1)]}
    genome = "unknown" if genome is None else genome
    metadata["genome"] = genome
    metadata["modality"] = "cyto"

    cytodata = CytoData(barcode_metadata, feature_metadata, {"raw.data": matrix}, metadata)
    data = MultimodalData(cytodata)

    return data

コード例 #6

0

ファイルを表示

ファイル: text_utils.py プロジェクト: klarman-cell-observatory/pegasusio

def load_csv_file(
    input_csv: str,
    sep: str = ",",
    genome: str = None,
    modality: str = None,
) -> MultimodalData:
    """Load count matrix from a CSV-style file, such as CSV file or DGE style tsv file.

    If ``cells.csv`` and ``genes.csv`` (optionally with a ``.gz`` suffix) sit
    next to input_csv, they are loaded as barcode and feature metadata.

    Parameters
    ----------

    input_csv : `str`
        The CSV file, gzipped or not, containing the count matrix.
    sep: `str`, optional (default: ',')
        Separator between fields, either ',' or '\\t'.
    genome : `str`, optional (default None)
        The genome reference. If None, use "unknown" instead.
    modality: `str`, optional (default None)
        Modality. If None, use "rna" instead.

    Returns
    -------

    A MultimodalData object containing a (genome, UnimodalData) pair.

    Raises
    ------
    FileNotFoundError
        If input_csv does not exist.
    ModuleNotFoundError
        If ``pegasusio.cylib.io`` cannot be imported.

    Examples
    --------
    >>> io.load_csv_file('example_ADT.csv')
    >>> io.load_csv_file('example.umi.dge.txt.gz', genome = 'GRCh38', sep = '\\t')
    """
    if not os.path.exists(input_csv):
        raise FileNotFoundError(f"File {input_csv} does not exist!")

    try:
        from pegasusio.cylib.io import read_csv
    except ModuleNotFoundError as err:
        # Fail with a clear error here; printing and continuing would only
        # crash later on the unbound name ``read_csv``.
        raise ModuleNotFoundError("No module named 'pegasusio.cylib.io'") from err

    barcode_metadata = feature_metadata = None

    input_csv = os.path.abspath(input_csv)
    path = os.path.dirname(input_csv)
    fname = os.path.basename(input_csv)

    # Optional side files; fall back to the gzipped name when the plain one is absent.
    barcode_file = os.path.join(path, "cells.csv")
    if not os.path.isfile(barcode_file):
        barcode_file += ".gz"
    feature_file = os.path.join(path, "genes.csv")
    if not os.path.isfile(feature_file):
        feature_file += ".gz"

    if os.path.isfile(barcode_file) and os.path.isfile(feature_file):
        barcode_metadata, format_type = _load_barcode_metadata(barcode_file, sep = sep)
        feature_metadata, format_type = _load_feature_metadata(feature_file, format_type, sep = sep)
        assert format_type == "HCA DCP"

    if input_csv.endswith(".gz"):
        # Stream the decompressed content through a FIFO. The FIFO lives in a
        # private temporary directory so concurrent loads of equally named
        # files cannot collide, and it is removed even if reading fails.
        with tempfile.TemporaryDirectory() as fifo_dir:
            csv_fifo = os.path.join(fifo_dir, fname + ".fifo")
            os.mkfifo(csv_fifo)
            gunzip = subprocess.Popen(f"gunzip -c {shlex.quote(input_csv)} > {shlex.quote(csv_fifo)}", shell = True)
            try:
                row_ind, col_ind, data, shape, rowkey, rownames, colnames = read_csv(csv_fifo, sep)
            except BaseException:
                # The writer may still be blocked on the FIFO; stop it so wait() returns.
                gunzip.kill()
                raise
            finally:
                # Reap the child process instead of leaving it behind.
                gunzip.wait()
    else:
        row_ind, col_ind, data, shape, rowkey, rownames, colnames = read_csv(input_csv, sep)

    if rowkey == "cellkey":
        # HCA format
        assert (barcode_metadata is not None) and (feature_metadata is not None) and (barcode_metadata.shape[0] == shape[0]) and (feature_metadata.shape[0] == shape[1]) and \
               ((barcode_metadata["barcodekey"].values != np.array(rownames)).sum() == 0) and ((feature_metadata["featureid"].values != np.array(colnames)).sum() == 0)
        mat = csr_matrix((data, (row_ind, col_ind)), shape = shape)
    else:
        # Row and column indices are swapped here, so the result has shape (shape[1], shape[0]).
        mat = csr_matrix((data, (col_ind, row_ind)), shape = (shape[1], shape[0]))
        if barcode_metadata is None:
            barcode_metadata = {"barcodekey": colnames}
        else:
            assert (barcode_metadata.shape[0] == shape[1]) and ((barcode_metadata["barcodekey"].values != np.array(colnames)).sum() == 0)
        if feature_metadata is None:
            feature_metadata = {"featurekey": rownames}
        else:
            assert (feature_metadata.shape[0] == shape[0]) and ((feature_metadata["featurekey"].values != np.array(rownames)).sum() == 0)

    genome = genome if genome is not None else "unknown"
    modality = modality if modality is not None else "rna"

    if modality == "citeseq":
        unidata = CITESeqData(barcode_metadata, feature_metadata, {"raw.count": mat}, {"genome": genome, "modality": modality})
    else:
        unidata = UnimodalData(barcode_metadata, feature_metadata, {"X": mat}, {"genome": genome, "modality": modality})

    data = MultimodalData(unidata)

    return data

コード例 #7

0

ファイルを表示

def read_input(
    input_file: str,
    file_type: str = None,
    mode: str = "r",
    genome: str = None,
    modality: str = None,
    black_list: Set[str] = None,
    select_data: Set[str] = None,
    select_genome: Set[str] = None,
    select_modality: Set[str] = None,
) -> MultimodalData:
    """Load input data into memory as a MultimodalData object.

    Supported formats are 'zarr', 'h5ad', 'loom', '10x', 'mtx', 'csv', 'tsv', 'fcs' (for flow/mass cytometry data) and 'nanostring' (Nanostring GeoMx spatial data).

    Parameters
    ----------

    input_file : `str`
        Input file name. For 'nanostring', a list-like holding the matrix file, the segment file and optionally an annotation file.
    file_type : `str`, optional (default: None)
        File type, choosing from 'zarr', 'h5ad', 'loom', '10x', 'mtx', 'csv', 'tsv', 'fcs' (for flow/mass cytometry data) or 'nanostring'. If None, inferred from input_file.
    mode: `str`, optional (default: 'r')
        File open mode, options are 'r' or 'a'. If mode == 'a', file_type must be zarr.
    genome : `str`, optional (default: None)
        Genome name forwarded to the h5ad, loom, fcs, nanostring, mtx, csv and tsv loaders.
    modality : `str`, optional (default: None)
        Default modality, choosing from 'rna', 'atac', 'tcr', 'bcr', 'crispr', 'hashing', 'citeseq', 'cyto' (flow cytometry / mass cytometry) or 'nanostring'. If None, use 'rna' as default.
    black_list : `Set[str]`, optional (default: None)
        Attributes in the black list will be popped out.
    select_data: `Set[str]`, optional (default: None)
        Only select data with keys in select_data. Select_data, select_genome and select_modality are mutually exclusive.
    select_genome: `Set[str]`, optional (default: None)
        Only select data with genomes in select_genome. Select_data, select_genome and select_modality are mutually exclusive.
    select_modality: `Set[str]`, optional (default: None)
        Only select data with modalities in select_modality. Select_data, select_genome and select_modality are mutually exclusive.

    Returns
    -------

    A MultimodalData object.

    Raises
    ------
    ValueError
        If mode is 'a' and the file type is not 'zarr'.

    Examples
    --------
    >>> data = io.read_input('example_10x.h5')
    >>> data = io.read_input('example.h5ad')
    >>> data = io.read_input('example_ADT.csv', genome = 'hashing_HTO', modality = 'hashing')
    """

    def _expand(name):
        # Resolve environment variables first, then a leading '~'.
        return os.path.expanduser(os.path.expandvars(name))

    input_file = [_expand(x) for x in input_file] if is_list_like(input_file) else _expand(input_file)

    if file_type is None:
        file_type, _, _ = infer_file_type(input_file)

    if mode == "a":
        if file_type != "zarr":
            raise ValueError("Only Zarr file can have mode 'a'!")
        zarr_file = ZarrFile(input_file, mode=mode)
        data = zarr_file.read_multimodal_data(attach_zarrobj=True)
    elif file_type == "zarr":
        zarr_file = ZarrFile(input_file)
        data = zarr_file.read_multimodal_data()
    elif file_type == "h5ad":
        adata = anndata.read_h5ad(input_file)
        data = MultimodalData(adata, genome=genome, modality=modality)
    elif file_type == "loom":
        data = load_loom_file(input_file, genome=genome, modality=modality)
    elif file_type == "10x":
        data = load_10x_h5_file(input_file)
    elif file_type == "fcs":
        data = load_fcs_file(input_file, genome=genome)
    elif file_type == "nanostring":
        input_matrix, segment_file = input_file[0], input_file[1]
        annotation_file = input_file[2] if len(input_file) > 2 else None
        data = load_nanostring_files(
            input_matrix,
            segment_file,
            annotation_file=annotation_file,
            genome=genome,
        )
    elif file_type == "mtx":
        data = load_mtx_file(input_file, genome=genome, modality=modality)
    else:
        assert file_type == "csv" or file_type == "tsv"
        if is_vdj_file(input_file, file_type):
            data = load_10x_vdj_file(input_file, genome=genome, modality=modality)
        else:
            delimiter = "," if file_type == "csv" else "\t"
            data = load_csv_file(input_file, sep=delimiter, genome=genome, modality=modality)

    data.subset_data(select_data, select_genome, select_modality)
    data.kick_start()
    data.scan_black_list(black_list)

    logger.info(f"{file_type} file '{input_file}' is loaded.")

    return data

コード例 #8

0

ファイルを表示

def write_output(data: Union[MultimodalData, UnimodalData],
                 output_file: str,
                 file_type: str = None,
                 is_sparse: bool = True,
                 precision: int = 2) -> None:
    """ Write data back to disk.

    This function is used to write data back to disk.

    Parameters
    ----------

    data : MultimodalData or UnimodalData
        data to write back. A UnimodalData is wrapped into a MultimodalData first.
    output_file : `str`
        output file name. Note that for mtx files, output_file specifies a directory. For scp format, file_type must be specified.
    file_type : `str`, optional (default: None)
        File type can be 'zarr' (as folder), 'zarr.zip' (as a ZIP file), 'h5ad', 'loom', 'mtx' or 'scp'. If file_type is None, it will be inferred based on output_file.
    is_sparse : `bool`, optional (default: True)
        Only used for writing out SCP-compatible files, if write expression as a sparse matrix.
    precision : `int`, optional (default: 2)
        Precision after decimal point for values in mtx and scp expression matrix.

    Returns
    -------
    `None`

    Raises
    ------
    ValueError
        If the given or inferred file type is not supported.

    Examples
    --------
    >>> io.write_output(data, 'test.zarr')
    """
    if isinstance(data, UnimodalData):
        data = MultimodalData(data)

    output_file = os.path.expanduser(os.path.expandvars(output_file))

    def _infer_output_file_type(file_name: str) -> str:
        # Works on its own argument rather than on the enclosing variable.
        if file_name.endswith(".zarr"):
            return "zarr"
        elif file_name.endswith(".zarr.zip"):
            return "zarr.zip"
        elif file_name.endswith(".h5ad"):
            return "h5ad"
        elif file_name.endswith(".loom"):
            return "loom"
        else:
            # No '.' at all means a directory name, which is treated as mtx output.
            _, sep, suf = file_name.rpartition(".")
            return "mtx" if sep == "" else suf

    if file_type is None:
        file_type = _infer_output_file_type(output_file)
    if file_type not in {"zarr", "zarr.zip", "h5ad", "loom", "mtx", "scp"}:
        raise ValueError(f"Unsupported output file type '{file_type}'!")

    # for each unidata, remove uns keys starting with '_tmp' and store these values to _tmp_multi
    _tmp_multi = data._clean_tmp()

    try:
        if file_type.startswith("zarr"):
            zf = ZarrFile(
                output_file,
                mode="w",
                storage_type="ZipStore" if file_type == "zarr.zip" else None)
            zf.write_multimodal_data(data)
            del zf
        elif file_type == "h5ad":
            data.to_anndata().write(output_file, compression="gzip")
        elif file_type == "loom":
            write_loom_file(data, output_file)
        elif file_type == "mtx":
            write_mtx_file(data, output_file, precision=precision)
        else:
            assert file_type == "scp"
            write_scp_file(data,
                           output_file,
                           is_sparse=is_sparse,
                           precision=precision)
    finally:
        # Restore the temporary keys even when writing fails, so the caller's
        # object is left as it was handed in.
        data._addback_tmp(_tmp_multi)

    logger.info(f"{file_type} file '{output_file}' is written.")

コード例 #9

0

ファイルを表示

def load_10x_vdj_file(input_csv: str, genome: str = None, modality: str = None) -> MultimodalData:
    """Load VDJ data from a 10x CSV file

    Only rows whose ``productive`` column is true are used.

    Parameters
    ----------

    input_csv : `str`
        The CSV file, gzipped or not, containing the count matrix.
    genome : `str`, optional (default None)
        The genome reference. If None, use "unknown" instead.
    modality: `str`, optional (default None)
        Modality. It should be automatically detected from the CSV file. If not None and the detected modality is not the same as the one users' provide, report an error.

    Returns
    -------

    A MultimodalData object containing a (genome, VDJData) pair.

    Raises
    ------
    ModuleNotFoundError
        If ``pegasusio.cylib.funcs`` cannot be imported.
    ValueError
        If no chain other than "Multi" is present, the detected chain is unknown, or the detected modality differs from the one provided.

    Examples
    --------
    >>> io.load_10x_vdj_file('vdj_t_all_contig_annotations.csv', genome = 'GRCh38_tcr')
    """
    try:
        from pegasusio.cylib.funcs import convert_10x_vdj_to_vdjdata
    except ModuleNotFoundError as err:
        # Fail before parsing the CSV; printing and continuing would only
        # crash later on the unbound name.
        raise ModuleNotFoundError("No module named 'pegasusio.cylib.funcs'") from err

    df = pd.read_csv(input_csv)
    # The column may have been parsed as booleans or as the strings "True"/"False".
    productive = df["productive"] == (True if df["productive"].dtype.kind == "b" else "True")
    # Stable sort by barcode; building a new frame avoids mutating a filtered slice in place.
    df = df[productive].sort_values(by = "barcode", kind = "mergesort")

    # The most frequent chain other than "Multi" decides the modality.
    chains = [x for x in df["chain"].value_counts().index if x != "Multi"]
    if not chains:
        raise ValueError(f"No chain other than 'Multi' found among productive contigs in '{input_csv}'!")
    feature_name = chains[0]

    if feature_name in VDJData._features["tcr"]:
        modal = "tcr"
    elif feature_name in VDJData._features["bcr"]:
        modal = "bcr"
    else:
        raise ValueError(f"Unknown feature '{feature_name}' detected!")

    if (modality is not None) and (modality != modal):
        raise ValueError(f"Detected modality '{modal}' does not match user-provided modality '{modality}'!")
    modality = modal

    # Set up feature keys: the first contig keeps the bare chain name, later ones get a numeric suffix.
    feature_metadata = {"featurekey": [x + (str(y + 1) if y > 0 else "") for x, y in itertools.product(VDJData._features[modality], range(VDJData._n_contigs))]}
    fid2pos = {value: i for i, value in enumerate(feature_metadata["featurekey"])}

    n_barcodes = df["barcode"].nunique()

    barcodes, is_cell, mats, strarrs = convert_10x_vdj_to_vdjdata(df["barcode"].values,
                                                                  df[VDJData._matrix_keywords[0:4] + ["is_cell"]].values.astype(np.int32),
                                                                  df[VDJData._matrix_keywords[4:] + ["chain"]].values,
                                                                  fid2pos, n_barcodes, VDJData._n_contigs)

    barcode_metadata = {"barcodekey": barcodes, "is_cell": is_cell}

    matrices = {}
    for i, keyword in enumerate(VDJData._matrix_keywords):
        mat = mats[i]
        if keyword == "high_confidence":
            mat = mat.astype(np.bool_)
        matrices[keyword] = csr_matrix(mat)

    genome = "unknown" if genome is None else genome
    metadata = {"genome": genome, "modality": modality}
    for i, keyword in enumerate(VDJData._uns_keywords):
        metadata[keyword] = strarrs[i]

    vdjdata = VDJData(barcode_metadata, feature_metadata, matrices, metadata)
    vdjdata.separate_channels()
    data = MultimodalData(vdjdata)

    return data

コード例 #10

0

ファイルを表示

def load_10x_h5_file_v3(h5_in: h5py.Group) -> MultimodalData:
    """Read a 10x v3 formatted hdf5 handle, splitting features by genome and feature type.

    "Gene Expression", "CRISPR Guide Capture" and "Antibody Capture" features map to the
    'rna', 'crispr' and 'citeseq' modalities; any other feature type becomes 'custom'.

    Parameters
    ----------

    h5_in : h5py.Group
        An instance of h5py.Group class that is connected to a 10x v3 formatted hdf5 file.

    Returns
    -------

    A MultimodalData object containing one UnimodalData (CITESeqData for 'citeseq') per (genome, feature type) group.

    Examples
    --------
    >>> io.load_10x_h5_file_v3(h5_in)
    """
    # The stored shape is swapped when the arrays are interpreted as a CSR matrix.
    first_dim, second_dim = h5_in["matrix/shape"][...]
    sparse_parts = (
        h5_in["matrix/data"][...],
        h5_in["matrix/indices"][...],
        h5_in["matrix/indptr"][...],
    )
    full_matrix = csr_matrix(sparse_parts, shape=(second_dim, first_dim))
    barcodes = h5_in["matrix/barcodes"][...].astype(str)

    features = pd.DataFrame(
        data={
            "genome": h5_in["matrix/features/genome"][...].astype(str),
            "feature_type": h5_in["matrix/features/feature_type"][...].astype(str),
            "id": h5_in["matrix/features/id"][...].astype(str),
            "name": h5_in["matrix/features/name"][...].astype(str),
        })

    # Features with an empty genome borrow the only named genome, if there is exactly one.
    named_genomes = [g for g in features["genome"].unique() if g != ""]
    default_genome = named_genomes[0] if len(named_genomes) == 1 else None

    modality_of = {
        "Gene Expression": "rna",
        "CRISPR Guide Capture": "crispr",
        "Antibody Capture": "citeseq",
    }

    result = MultimodalData()
    grouped = features.groupby(by=["genome", "feature_type"])
    for (genome_label, feature_type), members in grouped:
        barcode_metadata = {"barcodekey": barcodes}
        feature_metadata = {
            "featurekey": members["name"].values,
            "featureid": members["id"].values,
        }
        submatrix = full_matrix[:, grouped.groups[(genome_label, feature_type)]]

        if genome_label == "" and default_genome is not None:
            genome = default_genome
        else:
            genome = genome_label
        modality = modality_of.get(feature_type, "custom")
        info = {"genome": genome, "modality": modality}

        if modality == "citeseq":
            unidata = CITESeqData(barcode_metadata, feature_metadata, {"raw.count": submatrix}, info)
        else:
            unidata = UnimodalData(barcode_metadata, feature_metadata, {"X": submatrix}, info)
        unidata.separate_channels()

        result.add_data(unidata)

    return result

コード例 #11

0

ファイルを表示

def load_loom_file(input_loom: str,
                   genome: str = None,
                   modality: str = None) -> MultimodalData:
    """Read a LOOM file into a MultimodalData holding a single UnimodalData.

    Parameters
    ----------

    input_loom : `str`
        The LOOM file, containing the count matrix.
    genome : `str`, optional (default None)
        The genome reference. If None, the loom's own genome attribute is kept, or "unknown" is used when it has none. If not None, it overwrites the attribute.
    modality: `str`, optional (default None)
        Modality. If None, the loom's own modality attribute is kept; without one, ``experiment_type`` is used when it is a known modality, otherwise "rna". If not None, it overwrites the attribute.

    Returns
    -------

    A MultimodalData object containing a (genome, UnimodalData) pair.

    Raises
    ------
    ValueError
        If a column or row attribute has fewer than one dimension.

    Examples
    --------
    >>> io.load_loom_file('example.loom', genome = 'GRCh38')
    """
    # Attribute names that are renamed on load.
    col_trans = {"CellID": "barcodekey", "obs_names": "barcodekey"}
    row_trans = {
        "Gene": "featurekey",
        "var_names": "featurekey",
        "Accession": "featureid",
        "gene_ids": "featureid",
    }

    import loompy
    with loompy.connect(input_loom) as ds:
        # 1-D attributes become metadata columns; higher-dimensional ones are kept as multiarrays.
        barcode_metadata, barcode_multiarrays = {}, {}
        for key, arr in ds.col_attrs.items():
            key = col_trans.get(key, key)
            if arr.ndim < 1:
                raise ValueError(
                    f"Detected column attribute '{key}' has ndim = {arr.ndim}!"
                )
            target = barcode_metadata if arr.ndim == 1 else barcode_multiarrays
            target[key] = arr

        feature_metadata, feature_multiarrays = {}, {}
        for key, arr in ds.row_attrs.items():
            key = row_trans.get(key, key)
            if arr.ndim < 1:
                raise ValueError(
                    f"Detected row attribute '{key}' has ndim = {arr.ndim}!")
            target = feature_metadata if arr.ndim == 1 else feature_multiarrays
            target[key] = arr

        # The unnamed layer is stored as "X"; every layer is transposed to CSR.
        matrices = {
            ("X" if name == "" else name): layer.sparse().T.tocsr()
            for name, layer in ds.layers.items()
        }

        metadata = dict(ds.attrs)
        if genome is None:
            metadata.setdefault("genome", "unknown")
        else:
            metadata["genome"] = genome

        if modality is not None:
            metadata["modality"] = modality
        elif "modality" not in metadata:
            experiment_type = metadata.get("experiment_type", "none")
            if experiment_type in modalities:
                metadata["modality"] = metadata.pop("experiment_type")
            else:
                metadata["modality"] = "rna"

        unidata = UnimodalData(barcode_metadata, feature_metadata, matrices,
                               metadata, barcode_multiarrays,
                               feature_multiarrays)

    return MultimodalData(unidata)

コード例 #12

0

ファイルを表示

ファイル: nanostring_utils.py プロジェクト: klarman-cell-observatory/pegasusio

def load_nanostring_files(input_matrix: str,
                          segment_file: str,
                          annotation_file: str = None,
                          genome: str = None) -> MultimodalData:
    """Load Nanostring GeoMx input files.

    Parameters
    ----------

    input_matrix : `str`
        Input Q3 normalized data matrix.
    segment_file: `str`
        Segment file containing segmentation information for each ROI. If segment_file == 'protein', load GeoMx protein results from nCounter.
    annotation_file: `str`, optional (default None)
        An optional annotation file providing tissue type information etc.
    genome : `str`, optional (default None)
        The genome reference. If None, use "unknown" instead.

    Returns
    -------

    A MultimodalData object containing a (genome, NanostringData) pair.

    Raises
    ------
    FileNotFoundError
        If input_matrix, segment_file (unless it is 'protein') or a given annotation_file does not exist.

    Examples
    --------
    >>> io.load_nanostring_files('q3norm.txt', 'segments.txt', genome = 'GRCh38')
    """
    is_protein = segment_file == "protein"

    if not os.path.isfile(input_matrix):
        raise FileNotFoundError(f"File {input_matrix} does not exist!")

    genome = "unknown" if genome is None else genome
    metadata = {"genome": genome, "modality": "nanostring"}
    barcode_multiarrays = None

    if not is_protein:
        df = pd.read_csv(input_matrix, sep='\t', header=0, index_col=0)

        barcodekey = pd.Index([x.replace('.', '-') for x in df.columns.values])
        barcode_metadata = {
            "barcodekey": barcodekey.values
        }  # I guess the original matrix is processed in R because '-' -> '.'.
        feature_metadata = {"featurekey": df.index.values}
        matrices = {
            "Q3Norm": np.transpose(df.values)
        }  # float64, do we need to convert it to float32?
        cur_matrix = "Q3Norm"

        if not os.path.isfile(segment_file):
            raise FileNotFoundError(f"File {segment_file} does not exist!")
        df = pd.read_csv(segment_file, sep='\t', header=0, index_col=0)

        idx = barcodekey.isin(df.index)
        if idx.sum() < barcodekey.size:
            logger.warning(
                f"Cannot find {barcodekey[~idx]} from the segment property file! Number of AOIs reduces to {idx.sum()}."
            )
            barcodekey = barcodekey[idx]
            barcode_metadata["barcodekey"] = barcodekey.values
            # Drop the same AOIs from the matrix so its rows stay aligned with the barcodes.
            matrices["Q3Norm"] = matrices["Q3Norm"][idx]
        if idx.sum() < df.shape[0]:
            logger.warning(
                f"Sample IDs {','.join(x for x in df.index[~df.index.isin(barcodekey)])} from the segment property file are not located in the matrix file!"
            )
        df = df.reindex(barcodekey)

        # Text columns: missing values become the string "None".
        for key in [
                "primer plate well", "slide name", "scan name", "panel",
                "segment", "aoi"
        ]:
            if key in df.columns:
                df.loc[df[key].isna(), key] = "None"
                barcode_metadata[key] = df[key].values
        if "roi" in df.columns:
            # ROI numbers are stored as strings; missing ones pass through -1 and end up as "None".
            rois = df["roi"].copy()
            rois[rois.isna()] = -1.0
            rois = rois.astype(int).astype(str)
            rois[rois == "-1"] = "None"
            barcode_metadata["roi"] = rois.values
        # Numeric columns: missing values become zero.
        for key in ["area", "SequencingSaturation"]:
            if key in df.columns:
                df.loc[df[key].isna(), key] = 0.0
                barcode_metadata[key] = df[key].values.astype(np.float32)
        for key in [
                "RawReads", "TrimmedReads", "StitchedReads", "AlignedReads",
                "DeduplicatedReads"
        ]:
            if key in df.columns:
                df.loc[df[key].isna(), key] = 0
                barcode_metadata[key] = df[key].values.astype(np.int32)
    else:
        df = pd.read_csv(input_matrix, sep=',', header=None, index_col=0)

        # Per-AOI values are read from column label 4 onwards.
        barcodekey = pd.Index(df.loc["Segment displayed name", 4:].values)
        barcode_metadata = {
            "barcodekey":
            barcodekey.values,
            "segment":
            pd.Categorical(df.loc["Segment (Name/ Label)", 4:].values),
            "AOI surface area":
            df.loc["AOI surface area", 4:].values.astype(np.float64),
            "AOI nuclei count":
            df.loc["AOI nuclei count", 4:].values.astype(np.int32)
        }
        # Probe rows start right after the "#Probe Group" row.
        probe_pos = df.index.get_loc("#Probe Group") + 1
        df_probes = df.iloc[probe_pos:]
        rawmat = np.transpose(df_probes.loc[:, 4:].values.astype(np.float64))
        idx_signal = df_probes[2] == "Endogenous"
        idx_control = df_probes[2] == "Control"
        idx_negative = df_probes[2] == "Negative"

        feature_metadata = {
            "featurekey": df_probes.loc[idx_signal, 3].values,
            "ProbeAnnotation": df_probes.index[idx_signal].values
        }
        matrices = {"RawData": rawmat[:, idx_signal]}  # float64
        metadata["control_names"] = df_probes.loc[idx_control, 3].values
        metadata["negative_names"] = df_probes.loc[idx_negative, 3].values
        barcode_multiarrays = {
            "controls": rawmat[:, idx_control],
            "negatives": rawmat[:, idx_negative]
        }

        cur_matrix = "RawData"

    if annotation_file is not None:
        if not os.path.isfile(annotation_file):
            raise FileNotFoundError(f"File {annotation_file} does not exist!")
        df = pd.read_csv(annotation_file, sep='\t', header=0, index_col=0)
        # Every remaining barcode must be annotated.
        assert barcodekey.isin(df.index).sum() == barcodekey.size
        df = df.reindex(barcodekey)
        for key in df.columns:
            barcode_metadata[key] = pd.Categorical(df[key].values)

    nanodata = NanostringData(barcode_metadata,
                              feature_metadata,
                              matrices,
                              metadata,
                              barcode_multiarrays=barcode_multiarrays,
                              cur_matrix=cur_matrix)
    data = MultimodalData(nanodata)

    return data