Example 1
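A Venn-diagram helper from a genomics pipeline: a single callable renders both the up- and down-regulated plots, and the job declares both PNGs as its outputs.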
def plot_venn_from_genes_with_comparisons(output_prefix,
                                          a_dict,
                                          id_column="gene_stable_id"):
    if len(a_dict) not in (2, 3):
        raise ValueError("Max support 3 sets currently")

    def plot():
        up = {}
        down = {}
        for name, genes_ddf in sorted(a_dict.items()):
            df = genes_ddf.df
            stable_ids = df[id_column]
            column = genes_ddf.venn_annotator["log2FC"]
            up[name] = set(stable_ids[df[column] > 0])
            down[name] = set(stable_ids[df[column] < 0])
        plt.figure(figsize=(4, 4))
        venn.venn(up)
        plt.savefig(str(output_prefix) + ".up.png", dpi=72)
        plt.figure(figsize=(4, 4))
        venn.venn(down)
        plt.savefig(str(output_prefix) + ".down.png", dpi=72)

    return (ppg.MultiFileGeneratingJob(
        [str(output_prefix) + ".up.png",
         str(output_prefix) + ".down.png"], plot).depends_on([
             x.add_annotator(x.venn_annotator) for x in a_dict.values()
         ]).depends_on(ppg.ParameterInvariant(output_prefix, id_column)))
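
All of these snippets share the shape seen above: pass the complete list of output paths together with a single callable that writes every one of them, then chain depends_on() calls for invariants and upstream jobs. A minimal sketch of that pattern, assuming a pipegraph has already been created with ppg.new_pipegraph(); the file names and the work function here are placeholders, not part of the example above:

from pathlib import Path
import pypipegraph as ppg

def write_outputs():
    # The single callable is responsible for every declared output file.
    Path("out").mkdir(exist_ok=True)
    Path("out/a.txt").write_text("first")
    Path("out/b.txt").write_text("second")

job = ppg.MultiFileGeneratingJob(["out/a.txt", "out/b.txt"], write_outputs)
job.depends_on(ppg.ParameterInvariant("out_params", "v1"))  # rerun when the parameters change
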
Example 2
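A unit test asserting that output filenames must be strings or pathlib.Path objects; ints and bytes raise a TypeError.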
    def test_multifilegenerating_job_requires_string_filenames(self):
        import pathlib

        x = lambda: 5  # noqa:E731
        ppg.MultiFileGeneratingJob(["a"], x)
        ppg.MultiFileGeneratingJob([pathlib.Path("a")], x)

        def inner():
            ppg.MultiFileGeneratingJob([0], x)  # an int is not a valid filename

        assertRaises(TypeError, inner)

        def inner():
            ppg.MultiFileGeneratingJob([b"a"], x)  # bytes is not a string type

        assertRaises(TypeError, inner)
Example 3
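Compressing aligner input for later reference: the output names mirror a temp job's filenames, and the payload is gzipped in 10 MiB blocks.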
    def save_input(self):
        """Store the filtered input also in filename for later reference"""
        import gzip

        temp_job = self.prepare_input()
        output_dir = self.result_dir / "aligner_input"
        output_dir.mkdir(exist_ok=True)
        output_names = [
            output_dir / (Path(x).name + ".gz") for x in temp_job.filenames
        ]
        pairs = list(zip(temp_job.filenames, output_names))  # materialize; zip() is a one-shot iterator

        def do_store():
            block_size = 10 * 1024 * 1024  # copy in 10 MiB blocks
            for input_filename, output_filename in pairs:
                with open(input_filename, "rb") as op:
                    with gzip.GzipFile(output_filename, "wb") as op_out:
                        block = op.read(block_size)
                        while block:
                            op_out.write(block)
                            block = op.read(block_size)

        return ppg.MultiFileGeneratingJob(output_names,
                                          do_store).depends_on(temp_job)
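
The manual read/write loop can also be expressed with shutil.copyfileobj from the standard library, which performs the same block-wise copy; a sketch under the same assumptions (pairs as built above):

import gzip
import shutil

def do_store():
    for input_filename, output_filename in pairs:
        with open(input_filename, "rb") as src:
            with gzip.open(output_filename, "wb") as dst:
                # copyfileobj streams fixed-size blocks, like the loop above
                shutil.copyfileobj(src, dst, length=10 * 1024 * 1024)
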
Example 4
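A stub aligner for tests: align() fakes a BAM by JSON-dumping its inputs, writes a matching .bai, and registers a parameter dependency.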
            def align_job(
                self,
                input_fastq,
                paired_end_filename,
                index_basename,
                output_bam_filename,
                parameters,
            ):
                def align():
                    with open(output_bam_filename, "w") as op:
                        json.dump(
                            [
                                open(input_fastq).read(200),
                                open(paired_end_filename).read(200)
                                if paired_end_filename
                                else "",
                                index_basename,
                                str(parameters),
                            ],
                            op,
                        )
                    with open(str(output_bam_filename) + ".bai", "w") as op:
                        op.write("Done")

                job = ppg.MultiFileGeneratingJob(
                    [output_bam_filename, str(output_bam_filename) + ".bai"], align
                )
                job.depends_on_params("")
                return job
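
The depends_on_params("") call ties the job to its (here empty) parameter set as an invariant. Example 6 below shows the same stub with that call commented out ("that's the line we check"), so the surrounding test can verify the omission is detected.
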
Example 5
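A QC job that renders a PCA scatter plot of per-gene min-max scaled data and writes the plotted data frame as a TSV alongside the PNG.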
    def register_qc_pca(self):
        output_filename = self.result_dir / "pca.png"

        def plot():
            import sklearn.decomposition as decom

            pca = decom.PCA(n_components=2, whiten=False)
            data = self.get_df()
            # min max scaling 0..1 per gene
            data = data.sub(data.min(axis=1), axis=0)
            data = data.div(data.max(axis=1), axis=0)

            data = data[~pd.isnull(data).any(axis=1)]  # can't do PCA on NaN values
            pca.fit(data.T)
            xy = pca.transform(data.T)
            title = "PCA %s (%s)\nExplained variance: x %.2f%%, y %.2f%%" % (
                self.ddf.name,
                self.find_variable_name(),
                pca.explained_variance_ratio_[0] * 100,
                pca.explained_variance_ratio_[1] * 100,
            )
            plot_df = pd.DataFrame(
                {
                    "x": xy[:, 0],
                    "y": xy[:, 1],
                    "label": [self.get_plot_name(c) for (a, c) in self.samples],
                    "group": [
                        self.sample_column_to_group[c] for (a, c) in self.samples
                    ],
                }
            )
            p = dp(plot_df).p9().theme_bw().add_scatter("x", "y", color="group")
            if data.shape[1] < 15:
                p = p.add_text(
                    "x",
                    "y",
                    "label",
                    _alpha=0.5,
                    # _adjust_text={
                    # "expand_points": (2, 2),
                    # "arrowprops": {"arrowstyle": "->", "color": "darkgrey"},
                    # },
                )
            p = (
                p.scale_color_many_categories()
                .title(title)
                .render(output_filename, width=8, height=6, dpi=72)
            )
            plot_df.to_csv(output_filename.with_suffix(".tsv"), sep="\t")

        return register_qc(
            ppg.MultiFileGeneratingJob(
                [output_filename, output_filename.with_suffix(".tsv")], plot
            ).depends_on(self.deps())
        )
Example 6
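The negative counterpart to Example 4: the depends_on_params line is deliberately commented out so the surrounding test can detect the omission.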
 def align_job(
     self,
     input_fastq,
     paired_end_filename,
     index_basename,
     output_bam_filename,
     parameters,
 ):
     job = ppg.MultiFileGeneratingJob(
         [output_bam_filename, str(output_bam_filename) + ".bai"], lambda: 5
     )
     # job.depends_on_params("") # that's the line we check
     return job
Example 7
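A variant-calling wrapper that wires caller dependencies, an optional preprocessing MultiFileGeneratingJob, lane loading jobs, and a FunctionInvariant on the caller's run method.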
    def call(self):
        """
        Creates the VCF-generating job.

        Creates a pypipegraph Job that does the variant calling and takes care
        of preprocessing and dependencies.

        Returns
        -------
        pypipegraph.FileGeneratingJob
            The job that does the variant calling.
        """
        run_callable = self.caller.run()

        def run():
            run_callable(self.input_samples, self.output_file)

        job = ppg.FileGeneratingJob(self.output_file, run, empty_ok=False)
        # job can depend on preprocessor dependencies, caller dependencies and the preprocessor job
        job.depends_on(self.caller.get_dependencies())
        lanes_loaded = [
            input_sample.load() for sample_list in self.input_samples
            for input_sample in sample_list
        ]
        # If a SNP caller needs some preparation, this is the place to do it.
        if self.caller.preprocessor is not None:
            job.depends_on(self.caller.preprocessor.get_dependencies())
            job.depends_on(self.caller.preprocessor.prerequisite_jobs())
            preprocessor_output = self.caller.preprocessor.get_preprocessed_output(
                self.input_samples)
            if len(preprocessor_output) > 0:
                preprocessor_job = ppg.MultiFileGeneratingJob(
                    preprocessor_output,
                    self.caller.preprocessor.preprocess(self.input_samples),
                )
                preprocessor_job.depends_on(lanes_loaded)
                preprocessor_job.depends_on(
                    self.caller.preprocessor.get_dependencies()).depends_on(
                        self.caller.preprocessor.prerequisite_jobs())
                job.depends_on(preprocessor_job)
        job.depends_on(lanes_loaded)
        job.depends_on(
            ppg.FunctionInvariant(
                f"{self.caller.__class__.__name__}.run",
                self.caller.__class__.run,
            ))
        for sample_list in self.input_samples:
            for input_sample in sample_list:
                job.depends_on(input_sample.load())
        job.cores_needed = self.caller.get_cores_needed()
        return job
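
Note the explicit FunctionInvariant on self.caller.__class__.run: the job's own callable merely wraps run_callable, so without this invariant a change to the caller's implementation would not invalidate an already generated VCF.
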
Example 8
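One download job per URL/target pair; each job downloads to a temporary file before moving it into place and records the source URL in a .url sidecar file.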
    def download_files(self):
        result = []
        for url, target_fn in zip(self.urls, self.target_files):

            def download(url=url, target_fn=target_fn):
                Path(target_fn).parent.mkdir(exist_ok=True, parents=True)
                target_fn.with_name(target_fn.name + ".url").write_text(url)
                with open(str(target_fn) + "_temp", "wb") as op:
                    download_file(url, op)
                shutil.move(str(target_fn) + "_temp", target_fn)

            job = ppg.MultiFileGeneratingJob(
                [target_fn,
                 target_fn.with_name(target_fn.name + ".url")], download)
            result.append(job)
        return result
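
The def download(url=url, target_fn=target_fn) signature is the standard idiom for binding loop variables at definition time; a plain closure would see only the last url and target_fn once the loop has finished. A minimal demonstration of the difference:

# Late binding: each closure reads i when it is called, after the loop ended.
late = [lambda: i for i in range(3)]
print([f() for f in late])   # [2, 2, 2]

# Default arguments capture the value current at definition time.
early = [lambda i=i: i for i in range(3)]
print([f() for f in early])  # [0, 1, 2]
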
Example 9
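Index creation for a BAM produced by a MultiFileGeneratingJob: loading the lane yields a second job that builds sample.bam.bai and depends on the BAM job.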
    def test_creating_index_for_mfg(self):
        def gen():
            shutil.copy(get_sample_data(Path("mbf_align/ex2.bam")),
                        "sample.bam")

        ppg.util.global_pipegraph.quiet = False

        job = ppg.MultiFileGeneratingJob(["sample.bam"], gen)
        genome = object()
        lane = mbf_align.AlignedSample("test_lane", job, genome, False,
                                       "AA123")
        assert isinstance(lane.load()[1], ppg.FileGeneratingJob)
        assert lane.load()[0] in lane.load()[1].prerequisites
        ppg.run_pipegraph()
        assert Path("sample.bam").exists()
        assert Path("sample.bam.bai").exists()
Example 10
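Reheadering a BAM and renaming its chromosomes; the job covers both the output BAM and its index and is invalidated when the input file or the worker function changes.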
def job_reheader_and_rename_chromosomes(input_bam_path, output_bam_path,
                                        replacements):
    input_bam_path = Path(input_bam_path)
    output_bam_path = Path(output_bam_path)

    def do_replace(replacements=replacements):
        reheader_and_rename_chromosomes(input_bam_path, output_bam_path,
                                        replacements)

    output_bam_path.parent.mkdir(exist_ok=True, parents=True)
    return ppg.MultiFileGeneratingJob(
        [output_bam_path,
         output_bam_path.with_suffix(".bam.bai")], do_replace).depends_on(
             ppg.FileInvariant(input_bam_path),
             ppg.FunctionInvariant("mbf_bam.reheader_and_rename_chromosomes",
                                   reheader_and_rename_chromosomes),
         )
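
Path.with_suffix() replaces only the final suffix but accepts a multi-dot replacement, which is what makes the index name above come out right:

from pathlib import Path

print(Path("out.bam").with_suffix(".bam.bai"))  # out.bam.bai
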
Example 11
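An integration test threading a MultiFileGeneratingJob through every dependency type: file, multi-file, function and parameter invariants, temp-file jobs, and cached loading jobs.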
    def test_accepts(self):
        import pathlib

        write("aaa", "hello")
        write("bbb", "hello")
        write("ccc", "hello")
        a = ppg.FileTimeInvariant(pathlib.Path("aaa"))
        a1 = ppg.MultiFileInvariant([pathlib.Path("bbb"), "ccc"])
        b = ppg.FileGeneratingJob(
            pathlib.Path("b"),
            lambda of: write(of, "bb" + read("aaa") + read("bbb") + read("ccc")),
        )
        b.depends_on(a)
        b.depends_on(a1)

        dd = Dummy()

        def mf():
            write("c", "cc" + read("g"))
            write("d", "dd" + read("h") + dd.attr)
            write("e", "ee" + read("i") + read("j"))

        c = ppg.MultiFileGeneratingJob([pathlib.Path("c"), "d", pathlib.Path("e")], mf)
        c.depends_on(b)
        d = ppg.FunctionInvariant(pathlib.Path("f"), lambda x: x + 1)
        c.depends_on(d)
        e = ppg.ParameterInvariant(pathlib.Path("c"), "hello")
        c.depends_on(e)
        f = ppg.TempFileGeneratingJob(pathlib.Path("g"), lambda: write("g", "gg"))
        c.depends_on(f)

        def tmf():
            write("h", "hh")
            write("i", "ii")

        g = ppg.MultiTempFileGeneratingJob([pathlib.Path("h"), "i"], tmf)
        c.depends_on(g)

        def tpf():
            write("j", "jjjj")
            write("k", "kkkk")

        h = ppg.TempFilePlusGeneratingJob(pathlib.Path("j"), pathlib.Path("k"), tpf)
        c.depends_on(h)

        i = ppg.CachedDataLoadingJob(
            pathlib.Path("l"), lambda: write("l", "llll"), lambda res: res
        )
        c.depends_on(i)

        m = ppg.CachedAttributeLoadingJob(pathlib.Path("m"), dd, "attr", lambda: "55")
        c.depends_on(m)
        ppg.run_pipegraph()
        assert read("aaa") == "hello"
        assert read("b") == "bbhellohellohello"
        assert read("c") == "ccgg"
        assert read("d") == "ddhh55"
        assert read("e") == "eeiijjjj"
        assert not (os.path.exists("g"))
        assert not (os.path.exists("h"))
        assert not (os.path.exists("i"))
        assert not (os.path.exists("j"))
        assert read("k") == "kkkk"
Example 12
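AlignedSample refuses a MultiFileGeneratingJob whose output list contains no .bam file.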
 def test_lane_raises_on_multifilegeneratingJobWithNoBAM(self):
     mfg = ppg.MultiFileGeneratingJob(["a.sam"], lambda: 5)
     genome = object()
     with pytest.raises(ValueError):
         mbf_align.AlignedSample("test_lane", mfg, genome, False, "AA123")
Example 13
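An export driver: each exporter method becomes a MultiFileGeneratingJob writing per-unit Parquet files plus a .units index and a .desc description; a final job dumps the collected wide-column mapping as JSON.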
def run_exports(gen_additional_jobs=None, handle_ppg=True, settings='ovca'):
    if settings == 'ovca':
        apply_ovca_settings()
    else:
        raise ValueError("unknow setting value", settings)

    old = Path(os.getcwd()).absolute()
    os.chdir("/project")
    if handle_ppg:
        ppg.new_pipegraph()
    # os.chdir(old)
    to_wide_columns = {}
    jobs = []
    for cls in exporting_classes:
        instance = cls()
        if hasattr(instance, "exports"):
            instance.exports()

        out_prefix = getattr(instance, "out_prefix", "")
        for method_name in dir(instance):
            method = getattr(instance, method_name)
            if hasattr(method, "_output_name"):
                print(cls.__name__, method.__name__)
                output_filename = ("/project/processed/" + out_prefix +
                                   method._output_name + ".units")
                cwd = str(Path(method._abs_filename).parent)

                def write(output_filename=output_filename,
                          method=method,
                          cwd=cwd,
                          out_prefix=out_prefix):  # bind loop variables at definition time
                    os.chdir(cwd)
                    df = method()
                    os.chdir("/project")
                    check_dataframe(out_prefix + method._output_name, df)
                    Path(output_filename).parent.mkdir(exist_ok=True,
                                                       parents=True)
                    if "unit" in df:
                        for ii, (unit, sub_df) in enumerate(
                                df.groupby("unit", sort=True)):
                            try:
                                sub_df.to_parquet(
                                    output_filename[:output_filename.
                                                    rfind(".")] + "." +
                                    str(ii) + ".parquet")
                            except:
                                sub_df.to_pickle("debug.pickle")
                                raise

                        Path(output_filename).write_text(
                            json.dumps(sorted(df.unit.unique())))
                    else:
                        df.to_parquet(
                            output_filename[:output_filename.rfind(".")] +
                            ".0.parquet")
                        Path(output_filename).write_text(json.dumps(["nounit"
                                                                     ]))
                    Path(output_filename + ".desc").write_text(
                        method._description)

                job = ppg.MultiFileGeneratingJob(
                    [output_filename, output_filename + ".desc"], write)
                job.depends_on(
                    ppg.FunctionInvariant(output_filename + "_inner_func",
                                          method))
                if method._input_files:
                    job.depends_on(ppg.MultiFileInvariant(method._input_files))
                if method._deps:
                    if hasattr(method._deps, "__call__"):
                        deps = method._deps(method.__self__)
                    else:
                        deps = method._deps
                    job.depends_on(deps)

                print(output_filename)
                print("")
                os.chdir("/project")
                jobs.append(job)
                to_wide_columns[out_prefix +
                                method._output_name] = method._wide_columns

    def dump_to_wide_columns(output_filename):
        Path(output_filename).write_text(json.dumps(to_wide_columns))

    jobs.append(
        ppg.FileGeneratingJob(
            "/project/processed/_to_wide_columns.json",
            dump_to_wide_columns).depends_on(
                ppg.ParameterInvariant(
                    "/project/processed/_to_wide_columns.json",
                    ppg.util.freeze(to_wide_columns),
                )))

    old = Path(os.getcwd()).absolute()
    if handle_ppg:
        os.chdir("/project")
        ppg.run_pipegraph()
    os.chdir(old)
    return jobs