def aggregate(self) -> MultimodalData:
    """Aggregate all pending entries of ``self.aggr`` into one object.

    Every entry is popped from ``self.aggr`` as it is processed, passed
    through ``self._aggregate_unidata`` and the result is added to the
    returned container.

    Returns
    -------
    A MultimodalData object holding one aggregated result per key that was
    present in ``self.aggr`` when the method was called.
    """
    merged = MultimodalData()

    # Snapshot the keys first: entries are removed while we iterate.
    pending_keys = tuple(self.aggr)
    for name in pending_keys:
        merged.add_data(self._aggregate_unidata(self.aggr.pop(name)))

    return merged
def load_mtx_file(path: str, genome: str = None, modality: str = None) -> MultimodalData:
    """Load gene-count matrix from Market Matrix files (10x v2, v3 and HCA DCP formats)

    Parameters
    ----------

    path : `str`
        Path to mtx files. Either a file ending in ``.mtx`` / ``.mtx.gz``, or
        a directory. The directory should either contain matrix, feature and
        barcode information itself, or contain folders holding this
        information.
    genome : `str`, optional (default: None)
        Genome name of the matrix. Only used when a single matrix is loaded
        (``path`` is a mtx file, or a directory in which a mtx file is
        located directly); in that case ``None`` is recorded as ``"unknown"``.
        When matrices are loaded from sub-folders, the genome of each one is
        taken from ``_parse_dir_name`` applied to the folder name instead.
    modality: `str`, optional (default: None)
        Modality, choosing from 'rna', 'citeseq', 'hashing', 'tcr', 'bcr', 'crispr' or 'atac'. If None, use 'rna' as default.

    Returns
    -------

    A MultimodalData object containing (genome, UnimodalData) pairs.

    Raises
    ------

    FileNotFoundError
        If ``path`` does not exist.
    ValueError
        If ``path`` is a file without a ``.mtx`` / ``.mtx.gz`` suffix, or if a
        sub-folder of ``path`` does not contain a mtx file.

    Examples
    --------
    >>> io.load_mtx_file('example.mtx.gz', genome = 'mm10')
    """
    if not os.path.exists(path):
        raise FileNotFoundError(f"{path} does not exist!")

    orig_file = path
    if os.path.isdir(orig_file):
        path = orig_file.rstrip('/')
        # None here means the matrices live one level down, in sub-folders.
        file_name = _locate_mtx_file(path)
    else:
        if not orig_file.endswith((".mtx", ".mtx.gz")):
            raise ValueError(f"File {orig_file} does not end with suffix .mtx or .mtx.gz!")
        path, file_name = os.path.split(orig_file)

    data = MultimodalData()

    if modality is None:
        modality = "rna"

    if file_name is not None:
        if genome is None:
            genome = "unknown"
        data.add_data(load_one_mtx_file(path, file_name, genome, modality))
    else:
        # Use scandir as a context manager so the directory handle is released
        # even when a sub-folder without a mtx file makes us raise mid-scan.
        with os.scandir(path) as dir_entries:
            for dir_entry in dir_entries:
                if dir_entry.is_dir():
                    file_name = _locate_mtx_file(dir_entry.path)
                    if file_name is None:
                        raise ValueError(f"Folder {dir_entry.path} does not contain a mtx file!")
                    dgenome, dmodality = _parse_dir_name(dir_entry.name, modality)
                    data.add_data(load_one_mtx_file(dir_entry.path, file_name, dgenome, dmodality))

    return data
def read_multimodal_data(self, attach_zarrobj: bool = False) -> MultimodalData:
    """Read all groups under ``self.root`` into one MultimodalData object.

    Parameters
    ----------
    attach_zarrobj : `bool`, optional (default: False)
        If True, store this reader on the returned object as ``_zarrobj``.

    Returns
    -------
    A MultimodalData object with one entry per group of ``self.root``. If the
    root attributes carry a non-None ``'_selected'`` value, that value is
    passed to ``select_data`` on the result.
    """
    result = MultimodalData()

    # Only the group objects are needed; their names are ignored.
    for _, grp in self.root.groups():
        result.add_data(self.read_unimodal_data(grp))

    selected = self.root.attrs.get('_selected', None)
    if selected is not None:
        result.select_data(selected)

    if attach_zarrobj:
        result._zarrobj = self

    return result
def load_10x_h5_file_v2(h5_in: h5py.Group) -> MultimodalData:
    """Load 10x v2 format matrix from hdf5 file

    Each top-level key of ``h5_in`` is treated as a genome name and yields one
    UnimodalData object with modality ``"rna"``.

    Parameters
    ----------

    h5_in : h5py.Group
        An instance of h5py.Group class that is connected to a 10x v2 formatted hdf5 file.

    Returns
    -------

    A MultimodalData object containing (genome, UnimodalData) pair per genome.

    Examples
    --------
    >>> io.load_10x_h5_file_v2(h5_in)
    """
    result = MultimodalData()

    for genome in h5_in:
        grp = h5_in[genome]

        # The file stores the shape as (M, N); the CSR matrix is built with
        # the two dimensions swapped, i.e. as an N x M matrix.
        n_cols, n_rows = grp["shape"][...]
        sparse_parts = (
            grp["data"][...],
            grp["indices"][...],
            grp["indptr"][...],
        )
        counts = csr_matrix(sparse_parts, shape=(n_rows, n_cols))

        barcode_metadata = {"barcodekey": grp["barcodes"][...].astype(str)}
        feature_ids = grp["genes"][...].astype(str)
        feature_names = grp["gene_names"][...].astype(str)
        feature_metadata = {"featurekey": feature_names, "featureid": feature_ids}

        unidata = UnimodalData(
            barcode_metadata,
            feature_metadata,
            {"X": counts},
            {"modality": "rna", "genome": genome},
        )
        unidata.separate_channels()
        result.add_data(unidata)

    return result
def load_10x_h5_file_v3(h5_in: h5py.Group) -> MultimodalData:
    """Load 10x v3 format matrix from hdf5 file, allowing detection of crispr and citeseq libraries

    Features are grouped by (genome, feature_type) and each group becomes one
    data object. Feature types 'Gene Expression', 'CRISPR Guide Capture' and
    'Antibody Capture' map to modalities 'rna', 'crispr' and 'citeseq'; any
    other feature type maps to 'custom'. Features whose genome is the empty
    string are assigned the file's single non-empty genome when exactly one
    exists.

    Parameters
    ----------

    h5_in : h5py.Group
        An instance of h5py.Group class that is connected to a 10x v3 formatted hdf5 file.

    Returns
    -------

    A MultimodalData object containing (genome, UnimodalData) pair per genome.

    Examples
    --------
    >>> io.load_10x_h5_file_v3(h5_in)
    """
    # The file stores the shape as (M, N); the CSR matrix is built as N x M.
    n_cols, n_rows = h5_in["matrix/shape"][...]
    sparse_parts = (
        h5_in["matrix/data"][...],
        h5_in["matrix/indices"][...],
        h5_in["matrix/indptr"][...],
    )
    bigmat = csr_matrix(sparse_parts, shape=(n_rows, n_cols))
    barcodes = h5_in["matrix/barcodes"][...].astype(str)

    features = pd.DataFrame(
        data={
            "genome": h5_in["matrix/features/genome"][...].astype(str),
            "feature_type": h5_in["matrix/features/feature_type"][...].astype(str),
            "id": h5_in["matrix/features/id"][...].astype(str),
            "name": h5_in["matrix/features/name"][...].astype(str),
        }
    )

    # A lone non-empty genome doubles as the default for genome-less features.
    named_genomes = [g for g in features["genome"].unique() if g != ""]
    default_genome = named_genomes[0] if len(named_genomes) == 1 else None

    modality_of = {
        "Gene Expression": "rna",
        "CRISPR Guide Capture": "crispr",
        "Antibody Capture": "citeseq",
    }

    result = MultimodalData()
    grouped = features.groupby(by=["genome", "feature_type"])
    for key, members in grouped:
        genome_name, feature_type = key[0], key[1]

        barcode_metadata = {"barcodekey": barcodes}
        feature_metadata = {
            "featurekey": members["name"].values,
            "featureid": members["id"].values,
        }
        # Columns of the big matrix that belong to this group.
        mat = bigmat[:, grouped.groups[key]]

        if genome_name == "" and default_genome is not None:
            genome = default_genome
        else:
            genome = genome_name

        modality = modality_of.get(feature_type, "custom")
        metadata = {"genome": genome, "modality": modality}

        if modality == "citeseq":
            unidata = CITESeqData(barcode_metadata, feature_metadata, {"raw.count": mat}, metadata)
        else:
            unidata = UnimodalData(barcode_metadata, feature_metadata, {"X": mat}, metadata)

        unidata.separate_channels()
        result.add_data(unidata)

    return result
def pseudobulk(
    data: MultimodalData,
    sample: str,
    attrs: Optional[Union[List[str], str]] = None,
    mat_key: Optional[str] = "counts",
    cluster: Optional[str] = None,
) -> UnimodalData:
    """Generate Pseudo-bulk count matrices.

    Parameters
    -----------
    data: ``MultimodalData`` or ``UnimodalData`` object
        Annotated data matrix with rows for cells and columns for genes.

    sample: ``str``
        Specify the cell attribute used for aggregating pseudo-bulk data.
        Key must exist in ``data.obs``.

    attrs: ``str`` or ``List[str]``, optional, default: ``None``
        Specify additional cell attributes to remain in the pseudo bulk data.
        If set, all attributes' keys must exist in ``data.obs``.
        Notice that for a categorical attribute, each pseudo-bulk's value is the one of highest frequency among its cells,
        and for a numeric attribute, each pseudo-bulk's value is the mean among its cells.

    mat_key: ``str``, optional, default: ``counts``
        Specify the single-cell count matrix used for aggregating pseudo-bulk counts:
        the matrix with key ``mat_key`` is fetched from ``data``.

    cluster: ``str``, optional, default: ``None``
        If set, additionally generate pseudo-bulk matrices per cluster specified in ``data.obs[cluster]``.

    Returns
    -------
    A UnimodalData object ``udata`` containing pseudo-bulk information:
        * It has the following count matrices:

          * ``counts``: The pseudo-bulk count matrix over all cells (also the current matrix).

          * If ``cluster`` is set, one matrix named ``<cluster>_<cls>.X`` per category ``<cls>`` of ``data.obs[cluster]``, aggregated over the cells of that cluster only.

        * ``udata.obs``: It contains pseudo-bulk attributes aggregated from the corresponding single-cell attributes.

        * ``udata.var``: Gene names are maintained, and ``featureid`` is carried over when present in ``data.var``.

    Update ``data``:
        * Add the returned UnimodalData object above to ``data`` with key ``<sample>-pseudobulk``, where ``<sample>`` is replaced by the actual value of ``sample`` argument.

    Raises
    ------
    AssertionError
        If ``sample``, any entry of ``attrs``, or ``cluster`` is not a column of ``data.obs``.

    Examples
    --------
    >>> pg.pseudobulk(data, sample="Channel")
    """
    X = data.get_matrix(mat_key)

    # Validate every requested obs key before doing any aggregation work.
    assert sample in data.obs.columns, f"Sample key '{sample}' must exist in data.obs!"

    if isinstance(attrs, str):
        attrs = [attrs]
    if attrs is not None:
        for attr in attrs:
            assert (
                attr in data.obs.columns
            ), f"Cell attribute key '{attr}' must exist in data.obs!"

    if cluster is not None:
        assert (
            cluster in data.obs.columns
        ), f"Cluster key '{cluster}' must exist in data.obs!"

    # astype("category") leaves an already-categorical column's categories
    # untouched, so no dtype check is needed (same approach as for `cluster`).
    bulk_list = data.obs[sample].astype("category").cat.categories

    df_barcode = data.obs.reset_index()

    mat_dict = {"counts": get_pseudobulk_count(X, df_barcode, sample, bulk_list)}

    # One row of pseudo-bulk attributes per sample category.
    bulk_attr_list = []
    for bulk in bulk_list:
        df_bulk = df_barcode.loc[df_barcode[sample] == bulk]
        if attrs is not None:
            bulk_attr = df_bulk[attrs].apply(set_bulk_value, axis=0)
            bulk_attr["barcodekey"] = bulk
        else:
            bulk_attr = pd.Series({"barcodekey": bulk})
        bulk_attr_list.append(bulk_attr)

    df_pseudobulk = pd.DataFrame(bulk_attr_list)

    df_feature = pd.DataFrame(index=data.var_names)
    if "featureid" in data.var.columns:
        df_feature["featureid"] = data.var["featureid"]

    if cluster is not None:
        cluster_list = data.obs[cluster].astype("category").cat.categories
        for cls in cluster_list:
            mat_dict[f"{cluster}_{cls}.X"] = get_pseudobulk_count(
                X, df_barcode.loc[df_barcode[cluster] == cls], sample, bulk_list
            )

    udata = UnimodalData(
        barcode_metadata=df_pseudobulk,
        feature_metadata=df_feature,
        matrices=mat_dict,
        genome=sample,
        modality="pseudobulk",
        cur_matrix="counts",
    )

    data.add_data(udata)

    return udata