Ejemplo n.º 1
0
    def register_qc_biotypes(self):
        """Register a QC bar plot of read counts per gene biotype.

        Counts reads per gene via a GeneUnstranded annotator, sums them per
        biotype, and renders a horizontal bar chart (one bar per biotype).
        """
        output_filename = self.result_dir / f"{self.name}_reads_per_biotype.png"

        from mbf_genomics.genes import Genes
        from mbf_genomics.genes.anno_tag_counts import GeneUnstranded

        genes = Genes(self.genome)
        anno = GeneUnstranded(self)

        def plot(output_filename):
            print(genes.df.columns)  # debug aid: show which annotator columns exist
            return (dp(genes.df).groupby("biotype").summarize(
                (anno.columns[0],
                 lambda x: x.sum(), "read count")).mutate(sample=self.name).p9(
                 ).theme_bw().annotation_stripes().add_bar(
                     "biotype", "read count",
                     stat="identity").scale_y_continuous(
                         labels=lambda xs: ["%.2g" % x for x in xs])
                    # .turn_x_axis_labels()
                    .coord_flip().title(self.name).render(
                        output_filename,
                        width=6,
                        # grow the plot with the number of biotypes
                        height=2 + len(genes.df.biotype.unique()) * 0.25,
                    ))

        return register_qc(
            ppg.FileGeneratingJob(output_filename,
                                  plot).depends_on(genes.add_annotator(anno)))
Ejemplo n.º 2
0
 def test_pruning_plotjob(self, new_pipegraph):
     """Pruning a registered PlotJob also prunes its cache and table jobs."""
     jobA = register_qc(ppg.PlotJob("c.png", lambda: None, lambda: None))
     assert not jobA._pruned
     prune_qc()
     assert jobA._pruned
     # dependent jobs of the PlotJob must be pruned along with it
     assert jobA.cache_job._pruned
     assert jobA.table_job._pruned
Ejemplo n.º 3
0
    def test_registration_and_pruning(self, new_pipegraph):
        """register_qc rejects non-jobs, tracks every registered job, and
        prune_qc prunes by keep-predicate (or everything when called bare)."""
        with pytest.raises(TypeError):
            register_qc("shu")
        jobA = ppg.FileGeneratingJob("a",
                                     lambda: Path("a").write_text("hello"))
        register_qc(jobA)
        print(list(get_qc_jobs()))
        assert jobA in list(get_qc_jobs())
        assert not jobA._pruned
        # NOTE(review): job id is "c" but the callback writes "b" - looks like
        # a copy-paste slip; harmless here since the job never actually runs.
        jobc = register_qc(
            ppg.FileGeneratingJob("c", lambda: Path("b").write_text("hello")))

        def check_prune(job):
            # keep-predicate: True for jobs whose id ends in c/C
            return job.job_id.lower()[-1] == "c"

        # prune everything *except* jobs matching check_prune
        prune_qc(check_prune)
        assert jobc in list(get_qc_jobs())
        assert not jobc._pruned
        jobB = register_qc(
            ppg.FileGeneratingJob("b", lambda: Path("b").write_text("hello")))
        assert jobB in list(get_qc_jobs())
        # the active filter also applies to jobs registered afterwards
        assert jobB._pruned
        jobC = register_qc(
            ppg.FileGeneratingJob("C", lambda: Path("b").write_text("hello")))
        assert not jobC._pruned
        assert len(list(get_qc_jobs())) == 4
        # bare prune_qc(): prune every registered QC job
        prune_qc()
        assert jobA._pruned
        assert jobB._pruned
        assert jobc._pruned
        assert jobC._pruned
        for j in get_qc_jobs():
            assert j._pruned
Ejemplo n.º 4
0
    def register_qc_complexity(self):
        """Register a QC plot of library complexity / PCR duplication.

        Counts how often each repetition level occurs among read start
        positions (via mbf_bam.calculate_duplicate_distribution) and plots
        Count vs. Repetition count on a log2 y-axis.  The title reports the
        PCR bottleneck coefficient (unique positions / total reads) together
        with a severity label.
        """
        output_filename = self.result_dir / f"{self.name}_complexity.png"

        def calc():
            import mbf_bam

            counts = mbf_bam.calculate_duplicate_distribution(
                str(self.bam_filename), str(self.index_filename)
            )
            return pd.DataFrame(
                {
                    "source": self.name,
                    "Repetition count": list(counts.keys()),
                    "Count": list(counts.values()),
                }
            )

        def plot(df):
            import numpy as np

            unique_count = df["Count"].sum()
            total_count = (df["Count"] * df["Repetition count"]).sum()
            # PCR bottleneck coefficient, ENCODE-style severity thresholds
            pcb = float(unique_count) / total_count
            if pcb >= 0.9:  # pragma: no cover
                severity = "none"
            elif pcb >= 0.8:  # pragma: no cover
                severity = "mild"
            elif pcb >= 0.5:  # pragma: no cover
                severity = "moderate"
            else:
                severity = "severe"
            title = (
                "Genomic positions with repetition count reads\nTotal read count: %i\nPCR Bottleneck coefficient: %.2f (%s)"
                % (total_count, pcb, severity)
            )
            return (
                dp(df)
                .p9()
                .theme_bw()
                .add_point("Repetition count", "Count")
                .add_line("Repetition count", "Count")
                .scale_y_continuous(
                    trans="log2",
                    breaks=[2 ** x for x in range(1, 24)],
                    # bugfix: the axis is log2 and the breaks are powers of two,
                    # so the exponent must come from log2, not the natural log
                    labels=lambda breaks: ["2^%0.f" % np.log2(b) for b in breaks],
                )
                .title(title)
                .pd
            )

        return register_qc(
            ppg.PlotJob(output_filename, calc, plot)
            .depends_on(self.load())
            .use_cores(-1)
        )
Ejemplo n.º 5
0
    def register_qc_splicing(self):
        """How many reads were spliced? How many of those splices were known splice sites,
        how many were novel"""
        output_filename = self.result_dir / f"{self.name}_splice_sites.png"

        def calc():
            from mbf_bam import count_introns

            bam_filename, bam_index_name = self.get_bam_names()
            counts_per_chromosome = count_introns(bam_filename, bam_index_name)
            # known (start, stop) splice sites per chromosome, from the genome's genes
            known_splice_sites_by_chr = {
                chr: set()
                for chr in self.genome.get_chromosome_lengths()
            }
            for gene in self.genome.genes.values():
                for start, stop in zip(*gene.introns_all):
                    known_splice_sites_by_chr[gene.chr].add((start, stop))
            total_counts = collections.Counter()
            known_count = 0
            unknown_count = 0
            for chr, counts in counts_per_chromosome.items():
                for k, v in counts.items():
                    # keys with k[0] == 0xFFFFFFFF are sentinel entries encoding
                    # "reads with N splices" (N = 0xFFFFFFFF - k[1]) rather than
                    # a concrete (start, stop) splice site
                    if k[0] == 0xFFFFFFFF:
                        intron_counts = 0xFFFFFFFF - k[1]
                        total_counts[intron_counts] += v
                    else:
                        if k in known_splice_sites_by_chr[chr]:
                            known_count += v
                        else:
                            unknown_count += v
            # long-form frame with two facets ("side"): known/unknown splice
            # sites, and the reads-per-splice-count distribution
            result = {"side": [], "x": [], "count": []}
            result["side"].append("splice sites")
            result["x"].append("unknown")
            result["count"].append(unknown_count)
            result["side"].append("splice sites")
            result["x"].append("known")
            result["count"].append(known_count)

            for x, count in total_counts.items():
                result["side"].append("reads with x splices")
                result["x"].append(x)
                result["count"].append(count)

            return pd.DataFrame(result)

        def plot(df):
            # two vertically stacked free-scale facets, one per "side"
            return (dp(df).p9().theme_bw().add_bar(
                "x", "count", stat="identity").facet_wrap(
                    "side", scales="free", ncol=1).scale_y_continuous(
                        labels=lambda xs: ["%.2g" % x for x in xs]).title(
                            self.name).theme(
                                panel_spacing_y=0.2).render(output_filename))

        return register_qc(
            ppg.PlotJob(output_filename, calc,
                        plot).depends_on(self.load()).use_cores(-1))
Ejemplo n.º 6
0
    def register_qc_pca(self):
        """Register a QC job rendering a 2-component PCA of all samples.

        Rows (genes) are min/max scaled to 0..1, NaN rows dropped, and the
        samples (columns) projected onto the first two components.  Writes
        the png plus a .tsv of the projected coordinates.
        """
        output_filename = self.result_dir / "pca.png"

        def plot():
            import sklearn.decomposition as decom

            pca = decom.PCA(n_components=2, whiten=False)
            data = self.get_df()
            # min max scaling 0..1 per gene
            data = data.sub(data.min(axis=1), axis=0)
            data = data.div(data.max(axis=1), axis=0)

            data = data[~pd.isnull(data).any(axis=1)]  # can't do PCA on NaN values
            pca.fit(data.T)
            xy = pca.transform(data.T)
            title = "PCA %s (%s)\nExplained variance: x %.2f%%, y %.2f%%" % (
                self.ddf.name,
                self.find_variable_name(),
                pca.explained_variance_ratio_[0] * 100,
                pca.explained_variance_ratio_[1] * 100,
            )
            plot_df = pd.DataFrame(
                {
                    "x": xy[:, 0],
                    "y": xy[:, 1],
                    "label": [self.get_plot_name(c) for (a, c) in self.samples],
                    "group": [
                        self.sample_column_to_group[c] for (a, c) in self.samples
                    ],
                }
            )
            p = dp(plot_df).p9().theme_bw().add_scatter("x", "y", color="group")
            # only label the points when there are few enough samples to stay legible
            if data.shape[1] < 15:
                p = p.add_text(
                    "x",
                    "y",
                    "label",
                    _alpha=0.5,
                    # _adjust_text={
                    # "expand_points": (2, 2),
                    # "arrowprops": {"arrowstyle": "->", "color": "darkgrey"},
                    # },
                )
            p = (
                p.scale_color_many_categories()
                .title(title)
                .render(output_filename, width=8, height=6, dpi=72)
            )
            plot_df.to_csv(output_filename.with_suffix(".tsv"), sep="\t")

        return register_qc(
            ppg.MultiFileGeneratingJob(
                [output_filename, output_filename.with_suffix(".tsv")], plot
            ).depends_on(self.deps())
        )
Ejemplo n.º 7
0
    def register_qc_correlation(self):
        """Register a QC heatmap of pairwise Pearson correlation between samples.

        Samples are ordered by (group, name); a narrow scatter column on the
        left shows each sample's group color.
        """
        output_filename = self.result_dir / "pearson_correlation.png"

        def plot(output_filename):
            data = self.get_df()
            # min/max scale each row to 0..1
            data = data.sub(data.min(axis=1), axis=0)
            data = data.div(data.max(axis=1), axis=0)
            # data -= data.min()  # min max scaling 0..1 per gene
            # data /= data.max()
            data = data[
                ~pd.isnull(data).any(axis=1)
            ]  # can't do correlation on NaN values
            sample_names = [self.get_plot_name(x) for x in data.columns]
            sample_groups = [self.sample_column_to_group[x] for x in data.columns]
            data.columns = sample_names

            # axis ordering: group indicator column first, then samples by (group, name)
            order_pdf = pd.DataFrame(
                {"sample": sample_names, "group": sample_groups}
            ).sort_values(["group", "sample"])
            ordered_names = ["group"] + list(order_pdf["sample"])
            sample_count = data.shape[1]
            pdf = (
                data.corr().transpose().assign(group=0).transpose()
            )  # value doesn't matter, this just reserves space on the plot
            pdf = pd.melt(pdf.reset_index(), "index")
            (
                dp(pdf)
                .categorize("index", ordered_names)
                .categorize("variable", ordered_names)
                .p9()
                .add_tile("index", "variable", fill="value")
                .scale_fill_gradient2(
                    "blue", "white", "red", limits=[-1, 1], midpoint=0
                )
                # colored squares marking each sample's group in the reserved column
                .add_scatter(
                    _x=1, y="sample", color="group", _shape="s", data=order_pdf, _size=3
                )
                .scale_color_many_categories()
                .hide_x_axis_title()
                .hide_y_axis_title()
                .turn_x_axis_labels()
                .render(
                    output_filename,
                    # scale the plot with the number of samples
                    width=1 + 0.15 * sample_count,
                    height=0.15 * sample_count,
                )
            )

        return register_qc(
            ppg.FileGeneratingJob(output_filename, plot).depends_on(self.deps())
        )
Ejemplo n.º 8
0
    def register_qc_distribution(self, genes):
        """Register a collecting QC job plotting the raw read distribution
        (one boxplot per collected lane, log10 y-axis)."""
        output_filename = genes.result_dir / self.qc_folder / "read_distribution.png"
        output_filename.parent.mkdir(exist_ok=True)

        def plot(output_filename, elements):
            df = genes.df
            # one column per collected element, named after its aligned lane
            df = dp(df).select(
                {x.aligned_lane.name: x.columns[0]
                 for x in elements}).pd
            if len(df) == 0:
                # no genes at all: render a "no data" placeholder plot
                df = pd.DataFrame({"x": [0], "y": [0], "text": "no data"})
                dp(df).p9().add_text("x", "y",
                                     "text").render(output_filename).pd
            else:
                plot_df = dp(df).melt(var_name="sample", value_name="count").pd

                plot = dp(plot_df).p9().theme_bw()
                print(df)  # debug aid

                # df.to_pickle(output_filename + '.pickle')
                if ((df > 0).sum(axis=0) > 1).any() and len(df) > 1:
                    # plot = plot.geom_violin(
                    # dp.aes(x="sample", y="count"), width=0.5, bw=0.1
                    # )
                    pass  # oh so slow as of 20201019
                if len(plot_df["sample"].unique()) > 1:
                    plot = plot.annotation_stripes(fill_range=True)
                if (plot_df["count"] > 0).any():
                    # can't have a log boxplot with all nans (log(0))
                    plot = plot.scale_y_continuous(
                        trans="log10",
                        name=self.qc_distribution_scale_y_name,
                        breaks=[1, 10, 100, 1000, 10000, 100_000, 1e6, 1e7],
                    )

                return (plot.add_boxplot(
                    x="sample",
                    y="count",
                    _width=0.1,
                    _fill=None,
                    _color="blue").turn_x_axis_labels().title(
                        "Raw read distribution").hide_x_axis_title().
                        render_args(limitsize=False).render(
                            output_filename,
                            width=0.2 * len(elements) + 1,
                            height=4))

        return register_qc(
            QCCollectingJob(output_filename, plot).depends_on(
                genes.add_annotator(self)).add(self))
Ejemplo n.º 9
0
    def register_qc_pca(self, genes):
        """Register a 2-component PCA scatter plot of the collected samples.

        Columns are min/max scaled to 0..1 and NaN rows dropped before the
        PCA.  Degenerate cases (a single collected sample, or no rows left
        after NaN filtering) render a placeholder plot at the origin instead.
        """
        # fixed: needless f-string (no placeholders)
        output_filename = genes.result_dir / self.qc_folder / "pca.png"

        def plot(output_filename, elements):
            import sklearn.decomposition as decom

            if len(elements) == 1:
                # PCA needs at least two samples - fake a single point
                xy = np.array([[0], [0]]).transpose()
                title = "PCA %s - fake / single sample" % genes.name
            else:
                pca = decom.PCA(n_components=2, whiten=False)
                data = genes.df[[x.columns[0] for x in elements]]
                data -= data.min()  # min max scaling 0..1
                data /= data.max()
                data = data[~pd.isnull(data).any(
                    axis=1)]  # can't do PCA on NaN values
                if len(data):
                    pca.fit(data.T)
                    xy = pca.transform(data.T)
                    title = "PCA %s\nExplained variance: x %.2f%%, y %.2f%%" % (
                        genes.name,
                        pca.explained_variance_ratio_[0] * 100,
                        pca.explained_variance_ratio_[1] * 100,
                    )
                else:
                    xy = np.array([[0] * len(elements),
                                   [0] * len(elements)]).transpose()
                    title = "PCA %s - fake / no rows" % genes.name

            plot_df = pd.DataFrame({
                "x": xy[:, 0],
                "y": xy[:, 1],
                "label": [x.plot_name for x in elements]
            })
            # (removed leftover debug print of plot_df)
            (dp(plot_df).p9().theme_bw().add_scatter("x", "y").add_text(
                "x",
                "y",
                "label",
                # cool, this can go into an endless loop...
                # _adjust_text={
                # "expand_points": (2, 2),
                # "arrowprops": {"arrowstyle": "->", "color": "red"},
                # },
            ).scale_color_many_categories().title(title).render(
                output_filename, width=8, height=6))

        return register_qc(
            QCCollectingJob(output_filename, plot).depends_on(
                genes.add_annotator(self)).add(self))
Ejemplo n.º 10
0
    def register_qc_volcano(self, genes, filtered=None, filter_func=None):
        """perform a volcano plot

        Plots log2FC vs. p (reversed log10 y-axis), shades the nominally
        significant regions (|log2FC| > 1 and p < 0.05) and colors points
        by filter_func's verdict ("tbd." when no filter_func is given).
        The plot lands in filtered.result_dir when filtered is passed,
        otherwise in genes.result_dir.
        """
        if filtered is None:
            output_filename = genes.result_dir / "volcano.png"
        else:
            output_filename = filtered.result_dir / "volcano.png"

        def plot(output_filename):
            df = (dp(genes.df).mutate(significant=filter_func(genes.df) if
                                      filter_func is not None else "tbd.").pd)

            # counts of significant genes below / above log2FC == 0 for the title
            no_sig_lower = (df["significant"] & (df[self["log2FC"]] < 0)).sum()
            no_sig_higher = (df["significant"] &
                             (df[self["log2FC"]] > 0)).sum()

            (dp(df).p9().scale_color_many_categories(
                name="regulated", shift=3).scale_y_continuous(
                    name="p",
                    trans=dp.reverse_transform("log10"),
                    labels=lambda xs: ["%.2g" % x for x in xs],
                ).add_vline(xintercept=1, _color="blue").add_vline(
                    xintercept=-1, _color="blue").add_hline(yintercept=0.05,
                                                            _color="blue").
             add_rect(  # shade 'simply' significant regions
                 xmin="xmin",
                 xmax="xmax",
                 ymin="ymin",
                 ymax="ymax",
                 _fill="lightgrey",
                 data=pd.DataFrame({
                     "xmin": [-np.inf, 1],
                     "xmax": [-1, np.inf],
                     "ymin": [0, 0],
                     "ymax": [0.05, 0.05],
                 }),
                 _alpha=0.8,
             ).add_scatter(
                 self["log2FC"], self["p"], color="significant").title(
                     f"# regulated down/ up: {no_sig_lower} / {no_sig_higher}")
             # .coord_trans(x="reverse", y="reverse")  #broken as of 2019-01-31
             .render(output_filename, width=8, height=6, dpi=300))

        return register_qc(
            ppg.FileGeneratingJob(output_filename, plot).depends_on(
                genes.add_annotator(self),
                # re-run the plot when the filter function changes
                ppg.FunctionInvariant(
                    str(output_filename) + "_filter_func", filter_func),
            ))
Ejemplo n.º 11
0
    def register_qc_fastqc(self):
        """Run FASTQC on this lane's prepared input files and register it as a QC job."""
        from mbf_externals import FASTQC
        from mbf_qualitycontrol import register_qc

        fastqc = FASTQC()
        output_dir = self.result_dir / "FASTQC"
        temp_job = self.prepare_input()
        # prepare_input() returns either a single job or a list of jobs
        try:
            filenames = temp_job.filenames
        except AttributeError:  # a job list - collect filenames from each member
            filenames = [fn for member in temp_job for fn in member.filenames]

        fastqc_job = fastqc.run(output_dir, filenames)
        return register_qc(fastqc_job.depends_on(temp_job))
Ejemplo n.º 12
0
    def register_qc(self, new_lane):
        """Register a stacked bar plot of reads kept vs. lost by the subtraction.

        One bar per collected lane; "lost" is the difference between the
        parent lane's and the subtracted lane's mapped read counts.
        """
        output_filename = (
            new_lane.result_dir / ".." / "alignment_substract.png"
        ).resolve()
        print(output_filename)

        def calc_and_plot(output_filename, lanes):
            parts = []
            for lane in lanes:
                total = lane.parent.mapped_reads()
                kept = lane.mapped_reads()
                parts.append(
                    pd.DataFrame(
                        {
                            "what": ["kept", "lost"],
                            "count": [kept, total - kept],
                            "sample": lane.name,
                        }
                    )
                )
            df = pd.concat(parts)
            plot = dp(df).categorize("what", ["lost", "kept"]).p9().theme_bw()
            plot = plot.annotation_stripes().add_bar(
                "sample", "count", fill="what", position="stack", stat="identity"
            )
            plot = plot.title(lanes[0].genome.name + " substraction")
            plot = plot.turn_x_axis_labels().scale_y_continuous(
                labels=lambda xs: ["%.2g" % x for x in xs]
            )
            # width grows with the number of lanes
            return plot.render_args(width=len(parts) * 0.2 + 1, height=5).render(
                output_filename
            )

        return register_qc(
            QCCollectingJob(output_filename, calc_and_plot)
            .depends_on(new_lane.load())
            .add(new_lane)
        )  # since everybody says self.load, we get them all
Ejemplo n.º 13
0
    def register_qc_alignment_stats(self):
        """Register a stacked bar plot of per-lane alignment statistics.

        Collects get_alignment_stats() from every lane that registers itself
        and stacks the categories per sample; "Uniquely mapped reads number"
        is always ordered last.
        """
        output_filename = self.result_dir / ".." / "alignment_statistics.png"

        def calc_and_plot(output_filename, lanes):
            frames = []
            for lane in lanes:
                stats = lane.get_alignment_stats()
                frames.append(
                    pd.DataFrame(
                        {
                            "what": list(stats.keys()),
                            "count": list(stats.values()),
                            "sample": lane.name,
                        }
                    )
                )
            df = pd.concat(frames)
            # alphabetical category order, but keep the uniquely-mapped
            # category at the end of the stack
            umrn = "Uniquely mapped reads number"
            order = sorted(df["what"].unique())
            if umrn in order:
                order.remove(umrn)
                order.append(umrn)
            plot = (
                dp(df).categorize("what", order).p9().theme_bw().annotation_stripes()
            )
            plot = plot.add_bar(
                "sample", "count", fill="what", position="stack", stat="identity"
            ).title(lanes[0].genome.name)
            plot = plot.turn_x_axis_labels().scale_y_continuous(
                labels=lambda xs: ["%.2g" % x for x in xs]
            )
            return plot.render_args(
                width=len(frames) * 0.2 + 1, height=5, limitsize=False
            ).render(output_filename)

        return register_qc(
            QCCollectingJob(output_filename, calc_and_plot)
            .depends_on(self.load())
            .add(self)
        )  # since everybody says self.load, we get them all
Ejemplo n.º 14
0
    def register_qc_distribution(self):
        """Register a QC violin/boxplot of the per-sample value distribution,
        samples ordered by group, log10 y-axis."""
        output_filename = self.result_dir / "distribution.png"

        def plot(output_filename):
            df = self.get_df()
            sample_count = df.shape[1]
            sample_names = [self.get_plot_name(x) for x in df.columns]
            sample_groups = [self.sample_column_to_group[x] for x in df.columns]
            # two-level columns so melt() yields both sample and group
            df.columns = pd.MultiIndex.from_tuples(
                zip(sample_names, sample_groups), names=("sample", "group")
            )
            # x-axis order: samples sorted by their group
            order = [
                x[0]
                for x in sorted(zip(sample_names, sample_groups), key=lambda v: v[1])
            ]
            return (
                dp(df)
                .melt(value_name="y")
                .categorize("sample", order)
                .p9()
                .theme_bw()
                .annotation_stripes()
                .geom_violin(dp.aes("sample", "y"), width=0.5)
                .add_boxplot(x="sample", y="y", _width=0.1, _fill=None, color="group")
                .scale_color_many_categories()
                .scale_y_continuous(trans="log10", name=self.find_variable_name())
                .turn_x_axis_labels()
                .hide_x_axis_title()
                .render(
                    output_filename,
                    height=5,
                    # width grows with the number of samples
                    width=1 + 0.25 * sample_count,
                    limitsize=False,
                )
            )

        return register_qc(
            ppg.FileGeneratingJob(output_filename, plot).depends_on(self.deps())
        )
Ejemplo n.º 15
0
    def register_qc_subchromosomal(self):
        """Subchromosom distribution plot - good to detect amplified regions
        or ancient virus awakening"""
        import mbf_genomics

        output_filename = (self.result_dir /
                           f"{self.name}_subchromosomal_distribution.png")

        class IntervalStrategyWindows(
                mbf_genomics.genes.anno_tag_counts._IntervalStrategy):
            """For QC purposes, spawn all chromosomes with
            windows of the definied size

            See mbf_align.lanes.AlignedLane.register_qc_subchromosomal

            """
            def __init__(self, window_size):
                self.window_size = window_size

            def _get_interval_tuples_by_chr(self, genome):
                # one tuple ("chr_start", 0, [start], [stop]) per window
                result = {}
                for chr, length in genome.get_chromosome_lengths().items():
                    result[chr] = []
                    for ii in range(0, length, self.window_size):
                        result[chr].append(("%s_%i" % (chr, ii), 0, [ii],
                                            [ii + self.window_size]))
                return result

        def calc():
            from mbf_bam import count_reads_unstranded

            # 250kb windows across the whole genome
            interval_strategy = IntervalStrategyWindows(250_000)
            intervals = interval_strategy._get_interval_tuples_by_chr(
                self.genome)

            bam_filename, bam_index_name = self.get_bam_names()
            counts = count_reads_unstranded(
                bam_filename,
                bam_index_name,
                intervals,
                intervals,
                each_read_counts_once=True,
            )
            true_chromosomes = set(self.genome.get_true_chromosomes())
            result = {"chr": [], "window": [], "count": []}
            for key, count in counts.items():
                if not key.startswith("_"):
                    # must handle both 2R_1234
                    # and Unmapped_scaffold_29_D1705_1234
                    *c, window = key.split("_")
                    chr = "_".join(c)
                    if chr in true_chromosomes:  # pragma: no branch
                        window = int(window)
                        result["chr"].append(chr)
                        result["window"].append(window)
                        result["count"].append(count)
            return pd.DataFrame(result)

        def plot(df):
            import natsort

            df["count"] += 1  # so we don't crash in the log scale if all values are 0 for a chr
            # NOTE(review): X is presumably the dppd proxy for the current frame
            # (== df here); confirm it is imported at module level
            return (dp(df).categorize(
                "chr",
                natsort.natsorted(X["chr"].unique())).p9().theme_bw().add_line(
                    "window", "count", _alpha=0.3).scale_y_log10().facet_wrap(
                        "chr", scales="free",
                        ncol=1).hide_x_axis_labels().title(
                            self.name).render_args(width=6,
                                                   height=2 +
                                                   len(df["chr"].unique()) * 1,
                                                   limitsize=False).pd)

        return register_qc(
            ppg.PlotJob(output_filename, calc,
                        plot).depends_on(self.load()).use_cores(-1))
Ejemplo n.º 16
0
    def register_qc_gene_strandedness(self):  # noqa: C901
        from mbf_genomics.genes.anno_tag_counts import _IntervalStrategy

        class IntervalStrategyExonIntronClassification(_IntervalStrategy):
            """For QC purposes, defines all intron/exon intervals tagged
            with nothing but intron/exon

            See mbf_align.lanes.AlignedLane.register_qc_gene_strandedness

            """
            def _get_interval_tuples_by_chr(self, genome):
                from mbf_nested_intervals import IntervalSet

                coll = {chr: [] for chr in genome.get_chromosome_lengths()}
                for g in genome.genes.values():
                    exons = g.exons_overlapping
                    if len(exons[0]) == 0:  # pragma: no cover
                        exons = g.exons_merged
                    for start, stop in zip(*exons):
                        coll[g.chr].append(
                            (start, stop, 0b0101 if g.strand == 1 else 0b0110))
                    for start, stop in zip(*g.introns_strict):
                        coll[g.chr].append(
                            (start, stop, 0b1001 if g.strand == 1 else 0b1010))
                result = {}
                for chr, tups in coll.items():
                    iset = IntervalSet.from_tuples_with_id(tups)
                    # iset = iset.merge_split()
                    iset = iset.merge_hull()
                    if iset.any_overlapping():
                        raise NotImplementedError("Should not be reached")
                    result[chr] = []
                    for start, stop, ids in iset.to_tuples_with_id():
                        ids = set(ids)
                        if len(ids) == 1:
                            id = list(ids)[0]
                            if id == 0b0101:
                                tag = "exon"
                                strand = +1
                            elif id == 0b0110:
                                tag = "exon"
                                strand = -1
                            elif id == 0b1001:
                                tag = "intron"
                                strand = +1
                            elif id == 0b1010:
                                tag = "intron"
                                strand = -1
                            else:  # pragma: no cover
                                raise NotImplementedError(
                                    "Should not be reached")
                        else:
                            down = 0
                            for i in ids:
                                down |= i
                            if down & 0b1100 == 0b1100:
                                tag = "both"
                            elif down & 0b0100 == 0b0100:
                                tag = "exon"
                            else:  # pragma: no cover  haven't observed this case in the wild yet.
                                tag = (  # pragma: no cover
                                    "intron"  # pragma: no cover
                                )  # pragma: no cover  haven't observed this case in the wild yet.
                            if down & 0b11 == 0b11:
                                tag += "_undecidable"
                                strand = (
                                    1
                                )  # doesn't matter, but must be one or the other
                            elif down & 0b01:
                                strand = 1
                            else:
                                strand -= 1

                        result[chr].append((tag, strand, [start], [stop]))
                return result

        output_filename = self.result_dir / f"{self.name}_strandedness.png"

        def calc():
            """Count reads per exon/intron/both region class, split by whether
            their strand matches the annotated gene strand.

            Returns a DataFrame with columns what / count / sample suitable
            for the strandedness bar plot.
            """
            from mbf_genomics.genes.anno_tag_counts import IntervalStrategyGene
            from mbf_bam import count_reads_stranded

            classified_intervals = IntervalStrategyExonIntronClassification()._get_interval_tuples_by_chr(
                self.genome
            )
            bam_path, bai_path = self.get_bam_names()
            fwd, rev = count_reads_stranded(
                bam_path,
                bai_path,
                classified_intervals,
                IntervalStrategyGene()._get_interval_tuples_by_chr(self.genome),
                each_read_counts_once=True,
            )
            rows = []  # (what, count) pairs, assembled into the frame below
            for key in fwd.keys() | rev.keys():
                if key.endswith("_undecidable"):
                    # strand could not be resolved - report the combined count
                    rows.append((key, fwd.get(key, 0) + rev.get(key, 0)))
                elif not key.startswith("_"):
                    rows.append((key + "_correct", fwd.get(key, 0)))
                    rows.append((key + "_reversed", rev.get(key, 0)))
                elif key == "_outside":
                    # reads falling outside any annotated gene
                    rows.append(("outside", fwd.get(key, 0)))
                # other "_"-prefixed bookkeeping keys are deliberately dropped

            return pd.DataFrame(
                {
                    "what": [what for what, _ in rows],
                    "count": [count for _, count in rows],
                    "sample": self.name,
                }
            )

        def plot(df):
            """Render the strandedness counts as a dodged bar chart,
            with the categories in a fixed, readable order."""
            category_order = [
                "exon_correct",
                "exon_reversed",
                "exon_undecidable",
                "intron_correct",
                "intron_reversed",
                "intron_undecidable",
                "both_correct",
                "both_reversed",
                "both_undecidable",
                "outside",
            ]
            ordered = dp(df).mutate(
                what=pd.Categorical(df["what"], category_order)
            )
            chart = (
                ordered.p9()
                .add_bar("sample", "count", fill="what", position="dodge")
                .scale_y_continuous(labels=lambda ticks: ["%.2g" % t for t in ticks])
                .turn_x_axis_labels()
            )
            return chart.pd

        return register_qc(
            ppg.PlotJob(output_filename, calc,
                        plot).depends_on(self.load()).use_cores(-1))
Ejemplo n.º 17
0
    def register_qc_ma_plot(self, genes, filtered, filter_func):
        """Register a QC job rendering an MA plot for a filtered gene set.

        Not a straight annotator.register_qc function, but called by .filter.

        Parameters
        ----------
        genes:
            The unfiltered genes object carrying the per-sample count columns.
        filtered:
            The filtered genes object; the plot is written to its result_dir.
        filter_func : callable
            Receives genes.df and returns a boolean mask marking the
            'significant' genes, which get highlighted in the plot.

        Returns the registered FileGeneratingJob.
        """
        output_filename = filtered.result_dir / "ma_plot.png"

        def plot(output_filename):
            from statsmodels.nonparametric.smoothers_lowess import lowess

            df = genes.df[
                list(self.sample_columns(self.comp[0]))
                + list(self.sample_columns(self.comp[1]))
            ]
            df = df.assign(significant=filter_func(genes.df))
            pdf = []
            loes_pdfs = []
            # One facet per pairwise combination of sample columns.
            for a, b in itertools.combinations(
                [x for x in df.columns if not "significant" == x], 2
            ):
                np_a = np.log2(df[a] + self.laplace_offset)
                np_b = np.log2(df[b] + self.laplace_offset)
                A = (np_a + np_b) / 2  # mean log2 intensity
                M = np_a - np_b  # log2 ratio
                local_pdf = pd.DataFrame(
                    {
                        "A": A,
                        "M": M,
                        "a": self.comparisons.get_plot_name(a),
                        "b": self.comparisons.get_plot_name(b),
                        "significant": df["significant"],
                    }
                ).sort_values("M")
                pdf.append(local_pdf)
                # Lowess trend per facet - should hug M == 0 for well
                # normalized samples.
                fitted = lowess(M, A, is_sorted=False)
                loes_pdfs.append(
                    pd.DataFrame(
                        {
                            "a": self.comparisons.get_plot_name(a),
                            "b": self.comparisons.get_plot_name(b),
                            "A": fitted[:, 0],
                            "M": fitted[:, 1],
                        }
                    )
                )
            pdf = pd.concat(pdf)
            pdf = pdf.assign(
                ab=[a + ":" + b for (a, b) in zip(pdf["a"], pdf["b"])]
            )
            loes_pdf = pd.concat(loes_pdfs)
            loes_pdf = loes_pdf.assign(
                ab=[a + ":" + b for (a, b) in zip(loes_pdf["a"], loes_pdf["b"])]
            )
            (
                dp(pdf)
                .p9()
                .theme_bw(10)
                # guide lines at no change and +/- twofold change
                .add_hline(yintercept=0, _color="lightblue")
                .add_hline(yintercept=1, _color="lightblue")
                .add_hline(yintercept=-1, _color="lightblue")
                .scale_color_many_categories(name="significant", shift=3)
                .add_point("A", "M", color="significant", _size=1, _alpha=0.3)
                .add_line("A", "M", _color="blue", data=loes_pdf)
                .facet_wrap(["ab"])
                .title(
                    f"MA {filtered.name}\n{self.comparisons.find_variable_name()}"
                )
                .render(output_filename, width=8, height=6)
            )

        return register_qc(
            ppg.FileGeneratingJob(output_filename, plot)
            .depends_on(genes.add_annotator(self))
            .depends_on(self.comparisons.deps)
        )