Esempio n. 1
0
    def _merge_rna(paths, metadata, save_dir, id_col="lane_id", parallel=True):
        """Load one count matrix per path and merge them into a single matrix.

        Args:
            paths: iterable of locations, each handed to
                ``Counts.from_cellranger``.
            metadata: optional table describing the matrices, or ``None``.
                Its rows are repeated by the number of cells of each loaded
                matrix, so it is expected to hold one row per entry of
                ``paths`` in the same order. Columns whose name starts with
                ``"path_"`` are dropped.
            save_dir: directory to write ``meta.tsv`` and ``rna.pickle`` into
                (created if missing). Falsy values skip saving. Both ``str``
                and path objects are accepted.
            id_col: metadata column used to make cell identifiers unique. When
                present, the last character of every cell identifier is
                replaced by the string value of this column.
            parallel: load the matrices through ``Parallel``/``delayed``
                instead of a plain sequential loop.

        Returns:
            Tuple ``(rna, meta)``: the concatenated counts and a DataFrame
            indexed by ``rna.cell_ids``.

        Raises:
            ValueError: if the matrices differ in their number of columns, or
                if cell identifiers are not unique after merging.
        """
        # TODO: significant memory leakage -- maybe make an optional kwarg
        if parallel:
            pool = Parallel(n_jobs=-2)
            rna_list = pool(
                delayed(Counts.from_cellranger)(path) for path in paths)
        else:
            rna_list = [Counts.from_cellranger(path) for path in paths]

        # Every matrix must have the same number of columns to be stacked.
        widths = [counts.shape[1] for counts in rna_list]
        if len(set(widths)) > 1:
            raise ValueError(
                f"Can't merge matrices with mixed shapes: {set(widths)}. Details: {list(zip(paths, widths))}"
            )
        rna = Counts.concatenate(rna_list)

        meta = None
        if metadata is not None:
            metadata_cols = [
                col for col in metadata.columns if not col.startswith("path_")
            ]
            metadata = metadata[metadata_cols]
            # Expand the per-matrix rows to one row per cell.
            cells_per_matrix = [counts.shape[0] for counts in rna_list]
            meta = metadata.loc[metadata.index.repeat(
                cells_per_matrix)].reset_index(drop=True)
            if id_col in metadata:
                # Swap the trailing character of each identifier for the
                # per-row id so identifiers from different matrices differ.
                rna.index = rna.index.str.slice(0,
                                                -1) + meta[id_col].astype(str)

        if rna.index.duplicated().any():
            raise ValueError(
                "cell identifiers must be unique. Consider using metadata with `lane_id` column or specify a custom "
                "`id_col`")

        if meta is None:
            meta = pd.DataFrame(index=rna.cell_ids)
            meta.index.name = None
        else:
            meta = pd.DataFrame(meta)
            meta.index = rna.cell_ids

        if save_dir:
            if isinstance(save_dir, str):
                # The `/` joins below need a path object; a plain string
                # would raise TypeError.
                from pathlib import Path
                save_dir = Path(save_dir)
            os.makedirs(save_dir, exist_ok=True)

            meta.to_csv(save_dir / "meta.tsv", sep="\t")
            # TODO: move create_rds val to config
            rna.save(save_dir / "rna.pickle", save_rds=True)
        return rna, meta
Esempio n. 2
0
def test_normalize_cf_at(test_normalize_fix):
    """Placeholder test: the functionality is not yet implemented."""
    # Return straight away; the checks below are kept for when the feature
    # exists and are currently never executed.
    return

    flow = test_normalize_fix
    expected = Counts.load(flow["normalize"].path_map["rna"])
    at_normalize = flow.at("normalize")
    assert at_normalize.rna.shape == expected.shape
    assert len(at_normalize.meta) == len(expected)
    assert len(at_normalize.rna.features) == len(expected.features)
Esempio n. 3
0
def test_normalize_cf_goto(test_subset_fix):
    """Check the data seen after ``goto_process`` at "root" and "normalize"."""
    flow = test_subset_fix
    # Reference counts saved by the "normalize" process.
    expected = Counts.load(flow["normalize"].path_map["rna"])

    # At the root process both tables are expected to hold 600 entries.
    flow.goto_process("root")
    assert len(flow.meta) == 600
    assert len(flow.rna) == 600

    # At "normalize" 59 entries are expected, matching the saved counts.
    flow = flow.goto_process("normalize")
    assert len(flow.meta) == 59
    assert len(flow.rna) == 59
    assert flow.rna.shape == expected.shape
    assert len(flow.rna.features) == len(expected.features)
Esempio n. 4
0
def _get_test_data_slice(n_cells, n_genes, keep_raw=False):
    """Build the sliced test datasets under the ``data`` directory.

    Two samples of ``n_cells`` rows and ``n_genes`` columns are cut from the
    downloaded matrix and written as v2, v3 and gzipped v3 directories,
    alongside a ``sample_metadata.tsv`` that points at the gzipped samples.

    Args:
        n_cells: number of rows taken for each of the two samples.
        n_genes: number of columns kept in each sample.
        keep_raw: when true, the extracted ``filtered_gene_bc_matrices``
            directory is left in place; the downloaded tarball is removed
            either way.
    """
    data_dir = Path(__file__).parent.parent / "data"
    os.makedirs(data_dir, exist_ok=True)

    # sample metadata pointing at the gzipped v3 samples
    gz_subdirs = ["v3_gz/sample_1", "v3_gz/sample_2"]
    rna_paths = [str(data_dir / subdir) for subdir in gz_subdirs]
    sample_metadata = pd.DataFrame({
        "entity_id": ["sample_1", "sample_2"],
        "path_rna": rna_paths,
    })
    sample_metadata.to_csv(
        data_dir / "sample_metadata.tsv", sep="\t", index=False)

    # fetch the raw data
    data_dir_gzip = data_dir / "v3_gz"
    download_data(data_dir)

    # cut the matrix into two consecutive samples of n_cells x n_genes
    src = data_dir / "filtered_gene_bc_matrices/hg19/"
    files = os.listdir(src)
    rna = Counts.from_cellranger(src)
    samples = (
        rna[:n_cells, :n_genes],
        rna[n_cells:2 * n_cells, :n_genes],
    )

    # v2 chemistry copies
    v2_dirs = (data_dir / "v2/sample_1/", data_dir / "v2/sample_2/")
    for out_dir in v2_dirs:
        os.makedirs(out_dir, exist_ok=True)
    for sample, out_dir in zip(samples, v2_dirs):
        sample.to_cellranger(out_dir, gz=False, chemistry="v2")

    # v3 chemistry copies (features.tsv with third column); the file list is
    # adjusted so it names the files of the v3 layout
    files.remove("genes.tsv")
    files.append("features.tsv")
    v3_dirs = (data_dir / "v3/sample_1/", data_dir / "v3/sample_2/")
    for out_dir in v3_dirs:
        os.makedirs(out_dir, exist_ok=True)
    for sample, out_dir in zip(samples, v3_dirs):
        sample.to_cellranger(out_dir, gz=False, chemistry="v3")

    # gzipped copies, produced from the v3 directories
    gz_dirs = (data_dir_gzip / "sample_1/", data_dir_gzip / "sample_2/")
    for out_dir in gz_dirs:
        os.makedirs(out_dir, exist_ok=True)
    for v3_dir, gz_dir in zip(v3_dirs, gz_dirs):
        compress_move(files, v3_dir, gz_dir)

    # remove downloads
    if not keep_raw:
        shutil.rmtree(data_dir / "filtered_gene_bc_matrices")
    os.remove(data_dir / "pbmc3k_filtered_gene_bc_matrices.tar.gz")
Esempio n. 5
0
 def _merge_rna(paths, metadata, save_dir):
     """Load one count matrix per path, concatenate them and build cell metadata.

     Returns the tuple ``(rna, meta)``. When ``metadata`` is given, its
     columns not starting with ``"path_"`` are repeated once per cell of each
     matrix and indexed by ``rna.cell_ids``; otherwise ``meta`` is
     ``rna.cell_ids`` itself. If ``save_dir`` is truthy, both results are
     written into it.
     """
     matrices = []
     for dir_ in paths:
         matrices.append(Counts.from_cellranger(dir_))

     if metadata is None:
         meta = None
     else:
         kept = [
             name for name in metadata.columns
             if not name.startswith("path_")
         ]
         metadata = metadata[kept]
         # One metadata row per cell: repeat each row by its matrix's size.
         sizes = [matrix.shape[0] for matrix in matrices]
         repeated = metadata.index.repeat(sizes)
         meta = metadata.loc[repeated].reset_index(drop=True)

     rna = Counts.concatenate(matrices)
     if meta is None:
         # NOTE(review): here `meta` is whatever `cell_ids` returns, yet the
         # save step below calls `meta.to_csv` -- confirm that type has it.
         meta = rna.cell_ids
     else:
         meta.index = rna.cell_ids

     if save_dir:
         os.makedirs(save_dir, exist_ok=True)
         meta_path = save_dir / "meta.tsv"
         meta.to_csv(meta_path, sep="\t")
         # TODO: move create_rds val to config
         rna_path = save_dir / "rna.pickle"
         rna.save(rna_path, create_rds=True)
     return rna, meta
Esempio n. 6
0
def get_test_data_full():
    """
    Fetch the dataset and re-save it whole, without slicing, to data/full
    as an uncompressed v3-chemistry directory. The extracted source folder
    and the downloaded tarball are deleted afterwards.
    """
    root = Path(__file__).parent.parent / "data"
    download_data(root)

    extracted = root / "filtered_gene_bc_matrices/hg19/"
    target = root / "full"
    target.mkdir(exist_ok=True)

    counts = Counts.from_cellranger(extracted)
    counts.to_cellranger(target, gz=False, chemistry="v3")

    # remove downloads
    shutil.rmtree(root / "filtered_gene_bc_matrices")
    os.remove(root / "pbmc3k_filtered_gene_bc_matrices.tar.gz")
Esempio n. 7
0
def test_from_cellranger_fix(sample_1):
    """Read ``sample_1`` with ``Counts.from_cellranger`` and return the result."""
    return Counts.from_cellranger(sample_1)
Esempio n. 8
0
def test_load(test_save_fix):
    """Load counts from ``test_save_fix`` with ``Counts.load`` and return them."""
    return Counts.load(test_save_fix)
Esempio n. 9
0
def test_from_cellranger_gz(sample_1_gz):
    """Read ``sample_1_gz`` with ``Counts.from_cellranger`` and return the result."""
    return Counts.from_cellranger(sample_1_gz)
Esempio n. 10
0
def test_from_cellranger_v2(sample_1_v2):
    """Read ``sample_1_v2`` with ``Counts.from_cellranger`` and return the result."""
    return Counts.from_cellranger(sample_1_v2)
Esempio n. 11
0
def get_test_data():
    """
    Get sample data from 10X for testing. The data comes in the format of v2
    chemistry, and a v3 version is artificially created, as well as v3 .gz
    version.

    Everything is written below the ``data`` directory two levels above this
    file: ``sample_metadata.tsv`` plus ``v2/``, ``v3/`` and ``v3_gz/``, each
    holding ``sample_1`` and ``sample_2``. The downloaded tarball and the
    extracted source folder are deleted at the end.

    Returns:
        None
    """
    # create sample metadata
    data_dir = Path(__file__).parent.parent / "data"
    # The directory must exist before anything is written into it.
    os.makedirs(data_dir, exist_ok=True)
    subdirs = ["v3_gz/sample_1", "v3_gz/sample_2"]
    sample_metadata = pd.DataFrame({
        "sample": ["sample_1", "sample_2"],
        "path_rna": [str(data_dir / x) for x in subdirs]
    })
    sample_metadata.to_csv(data_dir / "sample_metadata.tsv",
                           sep="\t",
                           index=False)

    # pull and unzip data from 10X
    data_dir_gzip = data_dir / "v3_gz"
    download_path = data_dir / "pbmc3k_filtered_gene_bc_matrices.tar.gz"
    urllib.request.urlretrieve(
        DATA_URL,
        filename=download_path,
    )
    # Context manager closes the archive even if extraction fails.
    # NOTE(review): extractall trusts the member paths of the downloaded
    # archive; consider an extraction filter where the Python version has one.
    with tarfile.open(download_path, "r:gz") as tar:
        tar.extractall(data_dir)

    # cut data into two samples of 200 cells x 100 genes
    src = data_dir / "filtered_gene_bc_matrices/hg19/"
    files = os.listdir(src)
    rna = Counts.from_cellranger(src)
    rna_1 = rna[:200, :100]
    rna_2 = rna[200:400, :100]

    # save a v2 chemistry version
    dst_1_v2 = data_dir / "v2/sample_1/"
    dst_2_v2 = data_dir / "v2/sample_2/"
    os.makedirs(dst_1_v2, exist_ok=True)
    os.makedirs(dst_2_v2, exist_ok=True)
    rna_1.to_cellranger(dst_1_v2, gz=False, chemistry="v2")
    rna_2.to_cellranger(dst_2_v2, gz=False, chemistry="v2")

    # save a v3 chemistry version (features.tsv with third column)
    # The file list is adjusted so it names the files of the v3 layout.
    files.remove("genes.tsv")
    files.append("features.tsv")
    dst_1_v3 = data_dir / "v3/sample_1/"
    dst_2_v3 = data_dir / "v3/sample_2/"
    os.makedirs(dst_1_v3, exist_ok=True)
    os.makedirs(dst_2_v3, exist_ok=True)
    rna_1.to_cellranger(dst_1_v3, gz=False, chemistry="v3")
    rna_2.to_cellranger(dst_2_v3, gz=False, chemistry="v3")

    # save a gzipped version
    dst_1_gz = data_dir_gzip / "sample_1/"
    dst_2_gz = data_dir_gzip / "sample_2/"
    os.makedirs(dst_1_gz, exist_ok=True)
    os.makedirs(dst_2_gz, exist_ok=True)
    compress_move(files, dst_1_v3, dst_1_gz)
    compress_move(files, dst_2_v3, dst_2_gz)

    # remove downloads
    shutil.rmtree(data_dir / "filtered_gene_bc_matrices")
    os.remove(download_path)