def _merge_rna(paths, metadata, save_dir, id_col="lane_id", parallel=True):
    """
    Load one cellranger matrix per path, concatenate them and build the
    matching per-cell metadata table.

    Args:
        paths: cellranger directories, one per matrix; each is passed to
            `Counts.from_cellranger`
        metadata: optional DataFrame whose i-th row describes the i-th entry
            of `paths`. Columns whose name starts with ``path_`` are dropped
            and every remaining row is repeated once per cell of its matrix.
            Pass None to get a metadata table with no columns.
        save_dir: optional output directory; must support the ``/`` operator
            (e.g. `pathlib.Path`). When truthy, ``meta.tsv`` and
            ``rna.pickle`` are written there.
        id_col: metadata column whose value replaces the last character of
            each cell identifier. Ignored when the column is absent.
        parallel: if True, load the matrices through joblib
            ``Parallel(n_jobs=-2)``; otherwise load them sequentially

    Returns:
        tuple ``(rna, meta)``: the concatenated counts and a DataFrame
        indexed by ``rna.cell_ids``

    Raises:
        ValueError: if the matrices differ in their second dimension, or if
            cell identifiers are not unique after concatenation
    """
    # TODO: significant memory leakage -- maybe make an optional kwarg
    if parallel:
        pool = Parallel(n_jobs=-2)
        rna_list = pool(
            delayed(Counts.from_cellranger)(path) for path in paths)
    else:
        rna_list = [Counts.from_cellranger(path) for path in paths]

    # every matrix must have the same width to be stacked
    widths = [counts.shape[1] for counts in rna_list]
    if len(set(widths)) > 1:
        raise ValueError(
            f"Can't merge matrices with mixed shapes: {set(widths)}. Details: {list(zip(paths, widths))}"
        )
    rna = Counts.concatenate(rna_list)

    meta = None
    if metadata is not None:
        metadata_cols = [
            col for col in metadata.columns if not col.startswith("path_")
        ]
        metadata = metadata[metadata_cols]
        # expand one row per matrix into one row per cell
        cells_per_matrix = [counts.shape[0] for counts in rna_list]
        meta = metadata.loc[metadata.index.repeat(
            cells_per_matrix)].reset_index(drop=True)
        # only reachable with metadata present, so `in` never sees None
        if id_col in metadata:
            # swap the trailing character of each identifier for the id value
            rna.index = rna.index.str.slice(0, -1) + meta[id_col].astype(str)

    if rna.index.duplicated().any():
        raise ValueError(
            "cell identifiers must be unique. Consider using metadata with `lane_id` column or specify a custom "
            "`id_col`")

    if meta is None:
        meta = pd.DataFrame(index=rna.cell_ids)
        meta.index.name = None
    else:
        meta = pd.DataFrame(meta)
        meta.index = rna.cell_ids

    if save_dir:
        os.makedirs(save_dir, exist_ok=True)
        meta.to_csv(save_dir / "meta.tsv", sep="\t")
        # TODO: move create_rds val to config
        rna.save(save_dir / "rna.pickle", save_rds=True)
    return rna, meta
def test_normalize_cf_at(test_normalize_fix):
    """Functionality not yet implemented"""
    # Deliberately disabled: the bare return below keeps the remaining checks
    # from running until the functionality exists.
    return
    forest = test_normalize_fix
    expected = Counts.load(forest["normalize"].path_map["rna"])
    forest = forest.at("normalize")
    # data exposed at "normalize" should match the counts stored for it
    assert forest.rna.shape == expected.shape
    assert len(forest.meta) == len(expected)
    assert len(forest.rna.features) == len(expected.features)
def test_normalize_cf_goto(test_subset_fix):
    """
    Check that `goto_process` switches the data exposed through `meta` and
    `rna` between the "root" and "normalize" processes.
    """
    forest = test_subset_fix
    expected = Counts.load(forest["normalize"].path_map["rna"])

    # at root: 600 cells in both metadata and counts
    forest.goto_process("root")
    assert len(forest.meta) == 600
    assert len(forest.rna) == 600

    # at normalize: 59 cells, matching the counts stored for that process
    forest = forest.goto_process("normalize")
    assert len(forest.meta) == 59
    assert len(forest.rna) == 59
    assert forest.rna.shape == expected.shape
    assert len(forest.rna.features) == len(expected.features)
def _get_test_data_slice(n_cells, n_genes, keep_raw=False):
    """
    Build the sliced test dataset inside the ``data`` directory located two
    levels above this file.

    Writes ``sample_metadata.tsv``, then cuts the downloaded matrix into two
    consecutive samples of `n_cells` cells x `n_genes` genes and stores them
    as v2, v3 and gzipped v3 cellranger directories.

    Args:
        n_cells: number of cells in each of the two samples
        n_genes: number of genes kept in each sample
        keep_raw: if True, leave the extracted matrices and the downloaded
            archive in place instead of deleting them
    """
    data_root = Path(__file__).parent.parent / "data"
    os.makedirs(data_root, exist_ok=True)

    # create sample metadata; it points at the gzipped v3 samples
    gz_subdirs = ["v3_gz/sample_1", "v3_gz/sample_2"]
    rna_paths = [str(data_root / subdir) for subdir in gz_subdirs]
    sample_metadata = pd.DataFrame({
        "entity_id": ["sample_1", "sample_2"],
        "path_rna": rna_paths,
    })
    sample_metadata.to_csv(data_root / "sample_metadata.tsv",
                           sep="\t",
                           index=False)

    # pull and unzip data from 10X
    download_data(data_root)
    gz_root = data_root / "v3_gz"

    # cut data into two consecutive samples of n_cells cells x n_genes genes
    raw_dir = data_root / "filtered_gene_bc_matrices/hg19/"
    file_names = os.listdir(raw_dir)
    full_rna = Counts.from_cellranger(raw_dir)
    samples = (
        full_rna[:n_cells, :n_genes],
        full_rna[n_cells:2 * n_cells, :n_genes],
    )

    # save a v2 chemistry version
    v2_dirs = (data_root / "v2/sample_1/", data_root / "v2/sample_2/")
    for out_dir in v2_dirs:
        os.makedirs(out_dir, exist_ok=True)
    for sample, out_dir in zip(samples, v2_dirs):
        sample.to_cellranger(out_dir, gz=False, chemistry="v2")

    # save a v3 chemistry version (features.tsv with third column)
    file_names.remove("genes.tsv")
    file_names.append("features.tsv")
    v3_dirs = (data_root / "v3/sample_1/", data_root / "v3/sample_2/")
    for out_dir in v3_dirs:
        os.makedirs(out_dir, exist_ok=True)
    for sample, out_dir in zip(samples, v3_dirs):
        sample.to_cellranger(out_dir, gz=False, chemistry="v3")

    # save a gzipped version, built from the v3 output
    gz_dirs = (gz_root / "sample_1/", gz_root / "sample_2/")
    for out_dir in gz_dirs:
        os.makedirs(out_dir, exist_ok=True)
    for v3_dir, gz_dir in zip(v3_dirs, gz_dirs):
        compress_move(file_names, v3_dir, gz_dir)

    # remove downloads
    if keep_raw:
        return
    shutil.rmtree(data_root / "filtered_gene_bc_matrices")
    os.remove(data_root / "pbmc3k_filtered_gene_bc_matrices.tar.gz")
def _merge_rna(paths, metadata, save_dir):
    """
    Load a cellranger matrix from every directory in `paths`, concatenate
    them and build the matching per-cell metadata.

    Args:
        paths: cellranger directories, each passed to `Counts.from_cellranger`
        metadata: optional DataFrame whose i-th row describes the i-th entry
            of `paths`; columns starting with ``path_`` are dropped and each
            row is repeated once per cell of its matrix
        save_dir: optional output directory supporting the ``/`` operator;
            when truthy, ``meta.tsv`` and ``rna.pickle`` are written there

    Returns:
        tuple ``(rna, meta)``; `meta` is the expanded metadata indexed by
        ``rna.cell_ids``, or ``rna.cell_ids`` itself when `metadata` is None
    """
    matrices = [Counts.from_cellranger(path) for path in paths]

    if metadata is None:
        meta = None
    else:
        kept_cols = [
            name for name in metadata.columns if not name.startswith("path_")
        ]
        metadata = metadata[kept_cols]
        # one metadata row per matrix becomes one row per cell
        n_cells = [matrix.shape[0] for matrix in matrices]
        repeated = metadata.index.repeat(n_cells)
        meta = metadata.loc[repeated].reset_index(drop=True)

    rna = Counts.concatenate(matrices)
    if meta is None:
        # NOTE(review): `meta.to_csv` below assumes `rna.cell_ids` has a
        # `to_csv` method -- confirm its type.
        meta = rna.cell_ids
    else:
        meta.index = rna.cell_ids

    if save_dir:
        os.makedirs(save_dir, exist_ok=True)
        meta.to_csv(save_dir / "meta.tsv", sep="\t")
        # TODO: move create_rds val to config
        rna.save(save_dir / "rna.pickle", create_rds=True)
    return rna, meta
def get_test_data_full():
    """
    Similar to `get_test_data`, but keeps the full dataset without slicing
    and saves it to data/full as an uncompressed v3 cellranger directory.
    The extracted matrices and the downloaded archive are removed afterwards.
    """
    data_root = Path(__file__).parent.parent / "data"
    download_data(data_root)

    full_dir = data_root / "full"
    full_dir.mkdir(exist_ok=True)

    raw_dir = data_root / "filtered_gene_bc_matrices/hg19/"
    full_rna = Counts.from_cellranger(raw_dir)
    full_rna.to_cellranger(full_dir, gz=False, chemistry="v3")

    # remove downloads
    shutil.rmtree(data_root / "filtered_gene_bc_matrices")
    os.remove(data_root / "pbmc3k_filtered_gene_bc_matrices.tar.gz")
def test_from_cellranger_fix(sample_1):
    """Load counts from the `sample_1` cellranger directory and return them."""
    return Counts.from_cellranger(sample_1)
def test_load(test_save_fix):
    """Load counts from the location given by `test_save_fix` and return them."""
    return Counts.load(test_save_fix)
def test_from_cellranger_gz(sample_1_gz):
    """Load counts from the `sample_1_gz` cellranger directory and return them."""
    return Counts.from_cellranger(sample_1_gz)
def test_from_cellranger_v2(sample_1_v2):
    """Load counts from the `sample_1_v2` cellranger directory and return them."""
    return Counts.from_cellranger(sample_1_v2)
def get_test_data():
    """
    Get sample data from 10X for testing. The data comes in the format of v2
    chemistry, and a v3 version is artificially created, as well as v3 .gz
    version.

    Everything is written under the ``data`` directory located two levels
    above this file: ``sample_metadata.tsv`` plus the ``v2``, ``v3`` and
    ``v3_gz`` sample directories. The downloaded archive and the extracted
    matrices are removed at the end.

    Returns:
        None
    """
    data_dir = Path(__file__).parent.parent / "data"
    # the directory must exist before the metadata and archive are written
    os.makedirs(data_dir, exist_ok=True)

    # create sample metadata
    subdirs = ["v3_gz/sample_1", "v3_gz/sample_2"]
    sample_metadata = pd.DataFrame({
        "sample": ["sample_1", "sample_2"],
        "path_rna": [str(data_dir / x) for x in subdirs]
    })
    sample_metadata.to_csv(data_dir / "sample_metadata.tsv",
                           sep="\t",
                           index=False)

    # pull and unzip data from 10X
    data_dir_gzip = data_dir / "v3_gz"
    download_path = data_dir / "pbmc3k_filtered_gene_bc_matrices.tar.gz"
    urllib.request.urlretrieve(
        DATA_URL,
        filename=download_path,
    )
    # NOTE(review): members are extracted without an extraction filter;
    # consider `filter="data"` where the running Python supports it.
    # The context manager closes the archive even if extraction fails.
    with tarfile.open(download_path, "r:gz") as tar:
        tar.extractall(data_dir)

    # cut data into two samples of 200 cells x 100 genes
    src = data_dir / "filtered_gene_bc_matrices/hg19/"
    files = os.listdir(src)
    rna = Counts.from_cellranger(src)
    rna_1 = rna[:200, :100]
    rna_2 = rna[200:400, :100]

    # save a v2 chemistry version
    dst_1_v2 = data_dir / "v2/sample_1/"
    dst_2_v2 = data_dir / "v2/sample_2/"
    os.makedirs(dst_1_v2, exist_ok=True)
    os.makedirs(dst_2_v2, exist_ok=True)
    rna_1.to_cellranger(dst_1_v2, gz=False, chemistry="v2")
    rna_2.to_cellranger(dst_2_v2, gz=False, chemistry="v2")

    # save a v3 chemistry version (features.tsv with third column)
    files.remove("genes.tsv")
    files.append("features.tsv")
    dst_1_v3 = data_dir / "v3/sample_1/"
    dst_2_v3 = data_dir / "v3/sample_2/"
    os.makedirs(dst_1_v3, exist_ok=True)
    os.makedirs(dst_2_v3, exist_ok=True)
    rna_1.to_cellranger(dst_1_v3, gz=False, chemistry="v3")
    rna_2.to_cellranger(dst_2_v3, gz=False, chemistry="v3")

    # save a gzipped version, built from the v3 output
    dst_1_gz = data_dir_gzip / "sample_1/"
    dst_2_gz = data_dir_gzip / "sample_2/"
    os.makedirs(dst_1_gz, exist_ok=True)
    os.makedirs(dst_2_gz, exist_ok=True)
    compress_move(files, dst_1_v3, dst_1_gz)
    compress_move(files, dst_2_v3, dst_2_gz)

    # remove downloads
    shutil.rmtree(data_dir / "filtered_gene_bc_matrices")
    os.remove(data_dir / "pbmc3k_filtered_gene_bc_matrices.tar.gz")