def make_st_dataset(cnt_pths : List[str],
                    topn_genes : int = None,
                    min_counts : int = 0,
                    min_spots : int = 0,
                    filter_genes : bool = False,
                    transpose : bool = False,
                    keep_barcodes: bool = True,
                    )-> CountData :
    """
    Generate CountData object for ST-data

    Parameter:
    ---------

    cnt_pths : List[str]
        list of paths to ST-data. The file extension of the
        first path decides how all paths are read.
    topn_genes : int
        number of top expressed genes to include in analysis.
        If None, all genes are kept.
    min_counts : int
        minimal number of observed counts assigned to a
        specific spot/cell for it to be included
    min_spots : int
        minimal number of occurances of a gene among all
        spots/cells for it to be included in analysis
    filter_genes : bool
        exclude MALAT1 and RP genes from analysis
    transpose : bool
        transpose data. Only forwarded to the reader when
        the files are not h5ad files.
    keep_barcodes: bool
        if original rownames should be used or whether
        rownames should be based on spatial coordinates.
        Only applicable when using h5ad files.

    Returns:
    -------

    CountData object for the ST data

    """

    # create joint matrix for count data; reader is chosen
    # from the extension of the first path only
    st_ext = utils.get_extenstion(cnt_pths[0])

    if st_ext == "h5ad":
        cnt = utils.read_h5ad_st(cnt_pths, keep_barcodes)
    else:
        cnt = utils.make_joint_matrix(cnt_pths, transpose)

    # select top N genes if specified
    if topn_genes is not None:
        # total counts per column (gene)
        genesize = cnt.values.sum(axis = 0)
        # never request more genes than are present
        n_keep = min(topn_genes, genesize.shape[0])
        # column positions ordered by decreasing total count
        order = np.argsort(genesize)[::-1]
        cnt = cnt.iloc[:, order[0:n_keep]]

    dataset = CountData(cnt)

    # filter genes based on name
    if filter_genes:
        dataset.filter_genes()

    # filter data based on quality; skipped entirely when
    # both thresholds are non-positive
    if min_counts > 0 or min_spots > 0:
        dataset.filter_bad(min_counts = min_counts,
                           min_occurance = min_spots,
                           )

    return dataset
def make_sc_dataset(cnt_pth : str,
                    lbl_pth : str,
                    topn_genes : int = None,
                    gene_list_pth : str = None,
                    filter_genes : bool = False,
                    lbl_colname : str = 'bio_celltype',
                    min_counts : int = 300,
                    min_cells : int = 0,
                    transpose : bool = False,
                    upper_bound : int = None,
                    lower_bound : int = None,
                    ):
    """
    Generate CountData object for SC-data

    Parameter:
    ---------

    cnt_pth : str
        path to SC count data
    lbl_pth : str
        path to SC label data
    topn_genes : int
        number of top expressed genes to include.
        If None, all genes are kept.
    gene_list_pth : str
        path to a text file with one gene name per line;
        only genes present in this list are kept
    filter_genes : bool
        if True, apply the name based gene filter of the
        CountData object (CountData.filter_genes)
    lbl_colname : str
        name of column containing labels. For non-h5ad
        input, None selects the first column of the
        label file.
    min_counts : int
        minimal number of observed counts assigned to a
        specific spot/cell for it to be included
    min_cells : int
        minimal number of occurances of a gene among all
        cells for it to be included
    transpose : bool
        transpose data. Only applied to non-h5ad input.
    lower_bound : int
        lower bound for the number of cells to include
        from each type
    upper_bound : int
        upper bound for the number of cells to include
        from each type

    Returns:
    -------

    CountData object for the SC data

    Raises:
    ------

    SystemExit
        if count and label data have no index entries
        in common

    """

    sc_ext = utils.get_extenstion(cnt_pth)

    if sc_ext == 'h5ad' :
        cnt,lbl = utils.read_h5ad_sc(cnt_pth,
                                     lbl_colname,
                                     lbl_pth,
                                     )
    else:
        cnt = utils.read_file(cnt_pth, sc_ext)
        if transpose:
            cnt = cnt.T
        lbl = utils.read_file(lbl_pth)

        # get labels
        # NOTE(review): assumed to apply only to non-h5ad
        # input since read_h5ad_sc already receives
        # lbl_colname - confirm
        if lbl_colname is None:
            lbl = lbl.iloc[:,0]
        else:
            lbl = lbl.loc[:,lbl_colname]

    # match count and label data
    inter = cnt.index.intersection(lbl.index)
    if inter.shape[0] < 1:
        print("[ERROR] : single cell count and annotation"\
              " data did not match. Exiting.",
              file = sys.stderr,
              )
        # the message announces an exit; continuing would
        # only propagate an empty data set downstream
        sys.exit(1)

    cnt = cnt.loc[inter,:]
    lbl = lbl.loc[inter]

    # subsample cells per type if any bound is given
    if upper_bound is not None or lower_bound is not None:
        cnt,lbl = utils.subsample_data(cnt,
                                       lbl,
                                       lower_bound,
                                       upper_bound,
                                       )

    # select top N expressed genes
    if topn_genes is not None:
        # total counts per column (gene)
        genesize = cnt.values.sum(axis = 0)
        # never request more genes than are present
        n_keep = min(topn_genes, genesize.shape[0])
        # column positions ordered by decreasing total count
        order = np.argsort(genesize)[::-1]
        cnt = cnt.iloc[:, order[0:n_keep]]

    # only use genes in specific genes list
    # if specified
    if gene_list_pth is not None:
        # read-only mode; the list is never written to
        with open(gene_list_pth, 'r') as fopen:
            gene_list = pd.Index([x.replace('\n','') for x in fopen])

        sel = cnt.columns.intersection(gene_list)
        cnt = cnt.loc[:,sel]

    # create sc data set
    dataset = CountData(cnt = cnt,
                        lbl = lbl)

    # filter genes based on names
    if filter_genes:
        dataset.filter_genes()

    # filter data based on quality; skipped entirely when
    # both thresholds are non-positive
    if min_counts > 0 or min_cells > 0:
        dataset.filter_bad(min_counts = min_counts,
                           min_occurance = min_cells,
                           )

    return dataset