Example #1
0
def test_align_and_extract_umis(new_pipegraph):
    """Check that AnnotateFastqBarcodes adds XC/XM tags to the aligned reads.

    The same check runs once per sample folder (plain and ``_gz``), each
    in a fresh pipegraph, and inspects the first read returned by
    ``fetch()`` on the post-processed BAM.
    """
    from mbf_align.post_process import AnnotateFastqBarcodes

    sample_folders = [
        get_sample_path(Path(relative))
        for relative in (
            "mbf_align/sample_extract_barcodes",
            "mbf_align/sample_extract_barcodes_gz",
        )
    ]
    for sample_folder in sample_folders:
        new_pipegraph.new_pipegraph()
        genome = get_human_22_fake_genome()

        mbf_qualitycontrol.prune_qc(lambda _: False)
        raw_sample = Sample(
            "test",
            str(sample_folder),
            False,
            pairing="only_second",
            vid="AA123",
        )
        aligned = AlignedSample(
            "test", str(sample_folder / "test.bam"), genome, False, "AA123"
        )

        # Tag name -> [start, stop] positions handed to the annotator.
        tag_positions = {"XC": [0, 4], "XM": [7, 7 + 4]}
        annotated = aligned.post_process(
            AnnotateFastqBarcodes(raw_sample, tag_positions)
        )
        ppg.run_pipegraph()

        bam = annotated.get_bam()
        first_read = next(bam.fetch())
        print(first_read.tags)
        assert first_read.get_tag("XC") == "AGTC"
        assert first_read.get_tag("XM") == "TGAC"
Example #2
0
    def test_quick_run(self, new_pipegraph, per_test_store):
        """Call peaks with PeakZilla on the sample input/background BAMs.

        Verifies the number of rows in the resulting regions and that the
        vids of both aligned samples end up in the result's ``vid``.
        """
        from mbf_sampledata import get_sample_path, get_human_22_fake_genome

        from mbf_align.lanes import AlignedSample
        import mbf_qualitycontrol

        new_pipegraph.quiet = False
        mbf_qualitycontrol.disable_qc()

        treatment_bam = get_sample_path("mbf_externals/input.bam")
        background_bam = get_sample_path("mbf_externals/background.bam")
        genome = get_human_22_fake_genome()

        treatment = AlignedSample(
            "input", treatment_bam, genome, is_paired=False, vid="AA000"
        )
        background = AlignedSample(
            "background", background_bam, genome, is_paired=False, vid="AA001"
        )

        peak_caller = PeakZilla()
        caller_options = {"-c": "1.01", "-s": "0.1"}
        peaks = peak_caller.call_peaks(treatment, background, caller_options)
        peaks.write()
        ppg.util.global_pipegraph.run()

        assert len(peaks.df) == 37
        for expected_vid in ("AA000", "AA001"):
            assert expected_vid in peaks.vid
Example #3
0
    def test_subtract_subset(self, new_pipegraph):
        """Run ``subtract_bam`` on the chr22 ChIP-seq BAM and its subset.

        The output BAM is written to ``output.bam`` in the current
        directory; the test sums the per-reference ``total`` counts from
        its index statistics and compares them with the known value.
        """
        from mbf_sampledata import get_sample_path
        from mbf_bam import subtract_bam

        full_bam = get_sample_path("mbf_align/chipseq_chr22.bam")
        subset_bam = get_sample_path(
            "mbf_align/chipseq_chr22_subset_plus_unmapped.bam"
        )
        output = "output.bam"
        print(full_bam, full_bam.exists())
        print(subset_bam, subset_bam.exists())
        subtract_bam(output, str(full_bam.absolute()), str(subset_bam.absolute()))

        should = 80495
        # Use a context manager so the BAM handle is closed even when the
        # assertion below fails.
        with pysam.Samfile(output) as bam:
            total = sum(stats.total for stats in bam.get_index_statistics())
        assert should == total
 def test_extended_minus_background(self, new_pipegraph):
     """SmoothExtendedReadsMinusBackground yields zeros for a self-paired lane.

     The lane is registered under its own name in the mapping passed to
     the smoother, and the result for a 1000 bp window is expected to be
     all zeros. The lane's load job must be among the dependencies.
     """
     genome = get_human_22_fake_genome()
     bam_path = mbf_sampledata.get_sample_path("mbf_align/chipseq_chr22.bam")
     lane1 = mbf_align.lanes.AlignedSample("one", bam_path, genome, False, None)

     window_start = 41842000
     window = pd.DataFrame(
         {
             "chr": ["chr22"],
             "start": [window_start],
             "stop": [window_start + 1000],
         }
     )

     extend = 10
     smoother = smooth.SmoothExtendedReadsMinusBackground(
         {lane1.name: lane1}, extend
     )
     calculated = smoother.calc(window, lane1)

     expected = np.zeros((1, 1000))
     assert (expected == calculated).all()
     assert lane1.load() in smoother.get_dependencies(lane1)
    def test_volcano_plot(self):
        """A treated-vs-untreated TTest comparison produces one volcano QC plot.

        All QC jobs whose job_id does not contain "volcano" are pruned;
        exactly one unpruned job must remain and its first output file is
        compared against the reference image.
        """
        ppg.util.global_pipegraph.quiet = False
        import mbf_sampledata

        counts_path = mbf_sampledata.get_sample_path(
            "mbf_comparisons/pasillaCount_deseq2.tsv.gz"
        )
        counts = pd.read_csv(counts_path, sep=" ")
        counts.columns = [str(name) for name in counts.columns]

        groups = {
            "treated": [c for c in counts.columns if c.startswith("treated")],
            "untreated": [c for c in counts.columns if c.startswith("untreated")],
        }
        ddf = DelayedDataFrame("pasilla", counts)
        comparisons = Comparisons(ddf, groups)
        comp = comparisons.a_vs_b("treated", "untreated", TTest())
        comp.filter([("log2FC", "|>=", 2.0), ("FDR", "<=", 0.05)])

        prune_qc(lambda job: "volcano" in job.job_id)
        run_pipegraph()

        remaining = [job for job in list(get_qc_jobs()) if not job._pruned]
        print(remaining)
        assert len(remaining) == 1
        assert_image_equal(remaining[0].filenames[0])
    def test_by_annotator(self, new_pipegraph_no_qc):
        """order.ByAnnotator orders regions by the values an annotator returns.

        A fake annotator returns [1, 3, 2] for three regions; the expected
        resulting order is [0, 2, 1].
        """
        genome = get_human_22_fake_genome()
        start = 17750239
        df = pd.DataFrame(
            [
                {
                    "chr": "chr22",
                    "start": start + offset,
                    "stop": start + offset + 1000,
                }
                for offset in (0, 20000, 30000)
            ]
        )
        bam_path = mbf_sampledata.get_sample_path("mbf_align/chipseq_chr22.bam")
        lane1 = mbf_align.lanes.AlignedSample("one", bam_path, genome, False, None)
        lanes = {lane1.name: lane1}
        # All-zero signal: three regions, four bins each.
        raw_data = {lane1.name: np.array([[0, 0, 0, 0] for _ in range(3)])}
        plot_regions = mbf_genomics.regions.GenomicRegions(
            "testregions", lambda: df, [], genome
        )

        class FakeAnno(mbf_genomics.annotator.Annotator):
            columns = ["colA"]

            def calc(self, df):
                return pd.Series([1, 3, 2])

        o = order.ByAnnotator(FakeAnno())
        first_dependency = o.get_dependencies(plot_regions, lanes)[0]
        ppg.JobGeneratingJob("shu", lambda: None).depends_on(first_dependency)
        ppg.run_pipegraph()
        plot_regions._load()

        norm_data = norm.AsIs().calc(lanes, raw_data)
        res_order, clusters = o.calc(plot_regions, lanes, raw_data, norm_data)
        assert (res_order == [0, 2, 1]).all()
    def test_smooth(self, new_pipegraph_no_qc):
        """Plot a two-lane heatmap with SmoothExtendedReads and compare the image.

        Regions are re-centred with RegionFromCenter(1000), plotted with
        AsIs normalisation and FirstLaneSum ordering into ``test.png``.
        """
        genome = get_human_22_fake_genome()
        intervals = [
            ("chr22", 36925 * 1000 - 1000, 36925 * 1000 + 1000),
            ("chr22", 31485 * 1000 - 2000, 31485 * 1000 + 2000),
            ("chr22", 41842 * 1000, (41842 * 1000) + 1),
        ]
        df = pd.DataFrame(intervals, columns=["chr", "start", "stop"])
        plot_regions = mbf_genomics.regions.GenomicRegions(
            "testregions", lambda: df, [], genome
        )

        # Both lanes read the same BAM; only their names differ.
        lane1, lane2 = (
            mbf_align.lanes.AlignedSample(
                lane_name,
                mbf_sampledata.get_sample_path("mbf_align/chipseq_chr22.bam"),
                genome,
                False,
                None,
            )
            for lane_name in ("one", "two")
        )

        heatmap = mbf_heatmap.chipseq.Heatmap(
            plot_regions,
            [lane1, lane2],
            region_strategy=regions.RegionFromCenter(1000),
            smoothing_strategy=smooth.SmoothExtendedReads(),
        )
        fn = "test.png"
        heatmap.plot(fn, norm.AsIs(), order.FirstLaneSum())
        ppg.run_pipegraph()
        assert_image_equal(fn)
Example #8
0
 def test_rename_raises_on_no_replacement(self, new_pipegraph):
     """An empty rename mapping makes the reheader job fail.

     The pipegraph run must raise ppg.RuntimeError, no output file may be
     left behind, and the job's exception must mention
     "No replacement happened".
     """
     ppg.util.global_pipegraph.quiet = False
     source_bam = get_sample_path("mbf_align/ex2.bam")
     target = "out.bam"
     job = job_reheader_and_rename_chromosomes(source_bam, target, {})

     with pytest.raises(ppg.RuntimeError):
         ppg.run_pipegraph()

     assert not Path(target).exists()
     assert "No replacement happened" in str(job.exception)
def get_human_22_fake_genome():
    """Build a MockGenome for chr22 from the bundled msgpack sample tables.

    Returns:
        A ``MockGenome`` whose gene and transcript tables come from
        ``mbf_align/hs_22_genes.msgpack.gz`` and
        ``mbf_align/hs_22_transcripts.msgpack.gz`` (index reset, ``chr``
        column set to ``"chr22"``), with a chr22 length of 50,818,468.
    """
    import gzip

    def _load_table(sample_name):
        # Close the gzip handle as soon as the table is read instead of
        # leaving it to the garbage collector.
        with gzip.GzipFile(mbf_sampledata.get_sample_path(sample_name)) as handle:
            # NOTE(review): pandas.read_msgpack was deprecated in pandas 0.25
            # and removed in 1.0; this needs an older pandas or re-encoded
            # sample files.
            return pd.read_msgpack(handle).reset_index()

    genes = _load_table("mbf_align/hs_22_genes.msgpack.gz")
    tr = _load_table("mbf_align/hs_22_transcripts.msgpack.gz")
    genes["chr"] = "chr22"
    tr["chr"] = "chr22"
    return MockGenome(
        df_genes=genes, df_transcripts=tr, chr_lengths={"chr22": 50_818_468}
    )
    def test_deseq2_with_and_without_additional_columns(self):
        """DESeq2Unpaired results differ when 'other' samples feed the variance.

        Two extra columns (copies of treated2fb / untreated2fb) are added,
        columns containing "3" form an "other" group, and the same
        treated-vs-untreated comparison is run with and without
        ``include_other_samples_for_variance``. The resulting p and log2FC
        columns must not match.
        """
        import mbf_sampledata

        counts = pd.read_csv(
            mbf_sampledata.get_sample_path(
                "mbf_comparisons/pasillaCount_deseq2.tsv.gz"
            ),
            sep=" ",
        )
        counts.columns = [str(name) for name in counts.columns]
        print(counts.columns)
        counts = counts.assign(
            treated_fake=counts.treated2fb,
            untreated_fake=counts.untreated2fb,
        )

        gts = {
            "treated": [
                name
                for name in counts.columns
                if name.startswith("treated") and "3" not in name
            ],
            "untreated": [
                name
                for name in counts.columns
                if name.startswith("untreated") and "3" not in name
            ],
            "other": [name for name in counts.columns if "3" in name],
        }
        assert len(gts["other"]) == 2
        grouped_columns = sum((len(members) for members in gts.values()))
        # The one column left out of every group is the gene id.
        assert grouped_columns + 1 == len(counts.columns)

        ddf = DelayedDataFrame("ex", counts)
        comparisons = Comparisons(ddf, gts)
        with_other = comparisons.a_vs_b(
            "treated",
            "untreated",
            DESeq2Unpaired(),
            include_other_samples_for_variance=True,
        )
        without_other = comparisons.a_vs_b(
            "treated",
            "untreated",
            DESeq2Unpaired(),
            include_other_samples_for_variance=False,
        )
        for annotator in (with_other, without_other):
            force_load(ddf.add_annotator(annotator))

        df = ddf.df
        print(df.head())
        df.to_csv("test.csv")
        # A fairly weak check: it only shows that the flag changes the output.
        assert (
            df[with_other["p"]] != pytest.approx(df[without_other["p"]])
        ).all()
        assert (
            df[with_other["log2FC"]] != pytest.approx(df[without_other["log2FC"]])
        ).all()
Example #11
0
 def test_rename(self, new_pipegraph):
     """Renaming chr1/chr2 via the reheader job rewrites the BAM references.

     After the pipegraph run ``out.bam`` must exist and its reference
     names must be exactly "shu" and "sha".
     """
     ppg.util.global_pipegraph.quiet = False
     source_bam = get_sample_path("mbf_align/ex2.bam")
     output = "out.bam"
     job_reheader_and_rename_chromosomes(
         source_bam, output, {"chr1": "shu", "chr2": "sha"}
     )
     ppg.run_pipegraph()
     assert Path(output).exists()
     # Read the references inside a context manager so the BAM handle is
     # closed even if the assertion fails.
     with pysam.Samfile(output) as bam:
         references = set(bam.references)
     assert references == {"shu", "sha"}
    def test_by_column(self, new_pipegraph_no_qc):
        """order.ByAnnotator can order by an existing column via ``func``.

        The regions carry colA values a, c, b; ``func`` maps them to their
        code points and the expected resulting order is [0, 2, 1].
        """
        genome = get_human_22_fake_genome()
        start = 17750239
        df = pd.DataFrame(
            [
                {
                    "chr": "chr22",
                    "start": start + offset,
                    "stop": start + offset + 1000,
                    "colA": label,
                }
                for offset, label in ((0, "a"), (20000, "c"), (30000, "b"))
            ]
        )
        bam_path = mbf_sampledata.get_sample_path("mbf_align/chipseq_chr22.bam")
        lane1 = mbf_align.lanes.AlignedSample("one", bam_path, genome, False, None)
        lanes = {lane1.name: lane1}
        o = order.ByAnnotator("colA", func=lambda x: [ord(y) for y in x])
        # All-zero signal: three regions, four bins each.
        raw_data = {lane1.name: np.array([[0, 0, 0, 0] for _ in range(3)])}
        plot_regions = mbf_genomics.regions.GenomicRegions(
            "testregions", lambda: df, [], genome
        )

        load_job = plot_regions.load()
        ppg.JobGeneratingJob("shu", lambda: None).depends_on(load_job)
        ppg.run_pipegraph()
        plot_regions._load()

        norm_data = norm.AsIs().calc(lanes, raw_data)
        res_order, clusters = o.calc(plot_regions, lanes, raw_data, norm_data)
        assert (res_order == [0, 2, 1]).all()
    def test_extended(self, new_pipegraph):
        """SmoothExtendedReads(10) matches a hand-built profile for known reads.

        The expected profile adds 1 over each known read's span plus a
        10 bp extension: before the read position for reverse reads, after
        the read end for forward reads.
        """
        genome = get_human_22_fake_genome()
        bam_path = mbf_sampledata.get_sample_path("mbf_align/chipseq_chr22.bam")
        lane1 = mbf_align.lanes.AlignedSample("one", bam_path, genome, False, None)

        start = 41842000
        window = pd.DataFrame(
            {
                "chr": ["chr22"],
                "start": [start],
                "stop": [start + 1000],
            }
        )
        extend = 10
        calculated = smooth.SmoothExtendedReads(extend).calc(window, lane1)

        # (position, is_reverse, cigar) of the reads known to lie in the window.
        known = [
            (41842170, True, [(0, 36)]),
            (41842241, False, [(0, 36)]),
            (41842399, False, [(0, 36)]),
            (41842416, False, [(0, 36)]),
            (41842602, True, [(0, 36)]),
            (41842687, False, [(0, 36)]),
            (41842689, True, [(0, 36)]),
            (41842730, True, [(0, 36)]),
            (41842750, False, [(0, 36)]),
            (41842770, True, [(0, 36)]),
            (41842796, True, [(0, 36)]),
            (41842942, False, [(0, 36)]),
            (41842985, False, [(0, 36)]),
        ]

        should = np.zeros(1000)
        for absolute_pos, is_reverse, cigar in known:
            offset = absolute_pos - start
            print(offset)
            read_length = cigar[0][1]
            if is_reverse:
                # Extend downstream, i.e. towards lower coordinates.
                first, last = offset - extend, offset + read_length
            else:
                first, last = offset, offset + read_length + extend
            should[first:last] += 1
        should = should.reshape((1, 1000))

        assert should.shape == calculated.shape
        mismatch = should != calculated
        if mismatch.any():
            # Print every differing position to make a failure debuggable.
            for ii in np.flatnonzero(mismatch[0]):
                print(ii, should[0, ii], calculated[0, ii])
        assert (should == calculated).all()
    def test_simple(self, new_pipegraph_no_qc):
        """Plot a two-lane heatmap with the pass-through strategies.

        Uses RegionAsIs, SmoothRaw, AsIs normalisation and AsIs ordering,
        writes ``test.png`` and compares it against the reference image.
        """
        genome = get_human_22_fake_genome()
        start = 17750239
        df = pd.DataFrame(
            [
                {
                    "chr": "chr22",
                    "start": start + offset,
                    "stop": start + offset + 1000,
                }
                for offset in (0, 20000, 30000)
            ]
        )
        plot_regions = mbf_genomics.regions.GenomicRegions(
            "testregions", lambda: df, [], genome
        )

        # Both lanes read the same BAM; only their names differ.
        lane1, lane2 = (
            mbf_align.lanes.AlignedSample(
                lane_name,
                mbf_sampledata.get_sample_path("mbf_align/chipseq_chr22.bam"),
                genome,
                False,
                None,
            )
            for lane_name in ("one", "two")
        )

        heatmap = mbf_heatmap.chipseq.Heatmap(
            plot_regions,
            [lane1, lane2],
            region_strategy=regions.RegionAsIs(),
            smoothing_strategy=smooth.SmoothRaw(),
        )
        fn = "test.png"
        heatmap.plot(fn, norm.AsIs(), order.AsIs())
        ppg.run_pipegraph()
        assert_image_equal(fn)
Example #15
0
def test_get_reads_in_exon():
    """Every read returned by ``get_reads_in_exons`` overlaps the checked span.

    Uses gene ENSG00000128228 of the fake chr22 genome and the spliced
    RNA-seq sample BAM; each returned read must overlap the interval
    [21642301, 21644299) by at least one base.
    """
    import mbf_sampledata
    import pysam

    genome = mbf_sampledata.get_human_22_fake_genome()
    bam_path = mbf_sampledata.get_sample_path("mbf_align/rnaseq_spliced_chr22.bam")
    start = 21642302 - 1
    stop = 21644299
    # Keep the BAM open while the reads are inspected and close it
    # afterwards, even when an assertion fails.
    with pysam.Samfile(bam_path) as bam:
        g = genome.genes["ENSG00000128228"]
        reads = g.get_reads_in_exons(bam)
        assert reads
        for r in reads:
            ov = r.get_overlap(start, stop)
            assert ov > 0
    def _get_tuch_data(self):
        import mbf_sampledata
        import mbf_r
        import rpy2.robjects as ro

        path = mbf_sampledata.get_sample_path("mbf_comparisons/TuchEtAlS1.csv")
        # directly from the manual.
        # plus minus """To make
        # this file, we downloaded Table S1 from Tuch et al. [39], deleted some unnecessary columns
        # and edited the column headings slightly:"""
        ro.r("""load_data = function(path) {
                rawdata <- read.delim(path, check.names=FALSE, stringsAsFactors=FALSE)
                library(edgeR)
                y <- DGEList(counts=rawdata[,3:8], genes=rawdata[,1:2])
                library(org.Hs.eg.db)
                idfound <- y$genes$idRefSeq %in% mappedRkeys(org.Hs.egREFSEQ)
                y <- y[idfound,]
                egREFSEQ <- toTable(org.Hs.egREFSEQ)
                m <- match(y$genes$idRefSeq, egREFSEQ$accession)
                y$genes$EntrezGene <- egREFSEQ$gene_id[m]
                egSYMBOL <- toTable(org.Hs.egSYMBOL)
                m <- match(y$genes$EntrezGene, egSYMBOL$gene_id)
                y$genes$Symbol <- egSYMBOL$symbol[m]

                o <- order(rowSums(y$counts), decreasing=TRUE)
                y <- y[o,]
                d <- duplicated(y$genes$Symbol)
                y <- y[!d,]

                cbind(y$genes, y$counts)
            }
""")
        df = mbf_r.convert_dataframe_from_r(ro.r("load_data")(str(path)))
        df.columns = [
            "idRefSeq",
            "nameOfGene",
            "EntrezGene",
            "Symbol",
            "8.N",
            "8.T",
            "33.N",
            "33.T",
            "51.N",
            "51.T",
        ]
        assert len(df) == 10519
        return df
    def test_deseq2(self):
        import mbf_sampledata

        pasilla_data = pd.read_csv(
            mbf_sampledata.get_sample_path(
                "mbf_comparisons/pasillaCount_deseq2.tsv.gz"),
            sep=" ",
        )
        # pasilla_data = pasilla_data.set_index('Gene')
        pasilla_data.columns = [str(x) for x in pasilla_data.columns]

        gts = {
            "treated":
            [x for x in pasilla_data.columns if x.startswith("treated")],
            "untreated":
            [x for x in pasilla_data.columns if x.startswith("untreated")],
        }
        ddf = DelayedDataFrame("ex", pasilla_data)
        c = Comparisons(ddf, gts)
        a = c.a_vs_b("treated", "untreated", DESeq2Unpaired())
        force_load(ddf.add_annotator(a))
        run_pipegraph()
        check = """# This is deseq2 version specific data- probably needs fixing if upgrading deseq2
## baseMean log2FoldChange lfcSE stat pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
## FBgn0039155 453 -3.72 0.160 -23.2 1.63e-119 1.35e-115
## FBgn0029167 2165 -2.08 0.103 -20.3 1.43e-91 5.91e-88
## FBgn0035085 367 -2.23 0.137 -16.3 6.38e-60 1.75e-56
## FBgn0029896 258 -2.21 0.159 -13.9 5.40e-44 1.11e-40
## FBgn0034736 118 -2.56 0.185 -13.9 7.66e-44 1.26e-40
"""
        df = ddf.df.sort_values(a["FDR"])
        df = df.set_index("Gene")
        for row in check.split("\n"):
            row = row.strip()
            if row and not row[0] == "#":
                row = row.split()
                self.assertAlmostEqual(df.ix[row[0]][a["log2FC"]],
                                       float(row[2]),
                                       places=2)
                self.assertAlmostEqual(df.ix[row[0]][a["p"]],
                                       float(row[5]),
                                       places=2)
                self.assertAlmostEqual(df.ix[row[0]][a["FDR"]],
                                       float(row[6]),
                                       places=2)
    def test_correlation(self):
        """Creating a Comparisons object registers one correlation QC plot.

        All QC jobs whose job_id does not contain "correlation" are
        pruned; exactly one unpruned job must remain and its first output
        file is compared against the reference image.
        """
        ppg.util.global_pipegraph.quiet = False
        import mbf_sampledata

        counts_path = mbf_sampledata.get_sample_path(
            "mbf_comparisons/pasillaCount_deseq2.tsv.gz"
        )
        counts = pd.read_csv(counts_path, sep=" ")
        counts.columns = [str(name) for name in counts.columns]

        groups = {
            "treated": [c for c in counts.columns if c.startswith("treated")],
            "untreated": [c for c in counts.columns if c.startswith("untreated")],
        }
        ddf = DelayedDataFrame("pasilla", counts)
        # Only constructed for its side effect; the object itself is unused.
        Comparisons(ddf, groups)

        prune_qc(lambda job: "correlation" in job.job_id)
        run_pipegraph()

        remaining = [job for job in list(get_qc_jobs()) if not job._pruned]
        print(remaining)
        assert len(remaining) == 1
        assert_image_equal(remaining[0].filenames[0])
Example #19
0
 def get_alignment_stats(self, bam_filename):
     """Return fixed fake stats after checking which BAM was passed in.

     Asserts that ``bam_filename`` resolves to the spliced RNA-seq sample
     BAM and returns ``{"Hello": 23}``.
     """
     actual = Path(bam_filename).resolve()
     expected = get_sample_path("mbf_align/rnaseq_spliced_chr22.bam").resolve()
     assert actual == expected
     return {"Hello": 23}
    def test_ithlane_max(self, new_pipegraph):
        """Exercise order.IthLaneMax with lane indices and lane objects.

        Covers the error cases (lane name passed to the constructor,
        missing raw data, lane absent from the lane mapping) and three
        orderings; in each, the expected order lists the rows by ascending
        row maximum of the selected lane.
        """
        genome = get_human_22_fake_genome()
        start = 17750239
        df = pd.DataFrame(
            [
                {
                    "chr": "chr22",
                    "start": start + offset,
                    "stop": start + offset + 1000,
                }
                for offset in (0, 20000, 30000)
            ]
        )
        # Both lanes read the same BAM; only their names differ.
        lane1, lane2 = (
            mbf_align.lanes.AlignedSample(
                lane_name,
                mbf_sampledata.get_sample_path("mbf_align/chipseq_chr22.bam"),
                genome,
                False,
                None,
            )
            for lane_name in ("one", "two")
        )

        # A plain lane name is not accepted by the constructor.
        with pytest.raises(AttributeError):
            order.IthLaneMax(lane1.name)

        o = order.IthLaneMax(1)
        lane1_signal = np.array(
            [
                [0, 0, 5, 0],
                [2, 1, 1, 1],
                [1, 0, 0, 0],
            ]
        )
        raw_data = {lane1.name: lane1_signal}

        print(raw_data)
        print(raw_data[lane1.name].max(axis=1))
        lanes = {lane1.name: lane1, lane2.name: lane2}
        norm_data = norm.AsIs().calc(lanes, raw_data)
        plot_regions = mbf_genomics.regions.GenomicRegions(
            "testregions", lambda: df, [], genome
        )

        # Index 1 selects the second lane, for which no raw data exists yet.
        with pytest.raises(KeyError):
            o.calc(plot_regions, dict(lanes), raw_data, norm_data)

        # Selecting lane2 by object fails when it is not in the lane mapping.
        o = order.IthLaneMax(lane2)
        with pytest.raises(KeyError):
            o.calc(plot_regions, {lane1.name: lane1}, raw_data, norm_data)

        raw_data[lane2.name] = raw_data[lane1.name].copy()
        res_order, clusters = o.calc(plot_regions, lanes, raw_data, norm_data)
        assert clusters is None
        # Remember: from top to bottom in the later plot.
        assert (res_order == [2, 1, 0]).all()

        raw_data[lane2.name] = np.array(
            [
                [0, 0, 0, 0],
                [5, 1, 1, 0],
                [1, 0, 0, 4],
            ]
        )
        o = order.IthLaneMax(0)
        res_order, clusters = o.calc(plot_regions, lanes, raw_data, norm_data)
        # Remember: from top to bottom in the later plot.
        assert (res_order == [2, 1, 0]).all()

        o = order.IthLaneMax(1)
        res_order, clusters = o.calc(plot_regions, lanes, raw_data, norm_data)
        # Remember: from top to bottom in the later plot.
        assert (res_order == [0, 2, 1]).all()