Example #1
0
    def test_job_creation_after_pipegraph_run_raises(self):
        """Once run_pipegraph() finished, creating a new job raises ValueError."""
        ppg.new_pipegraph(quiet=True, dump_graph=False)
        ppg.run_pipegraph()

        def make_job_too_late():
            ppg.FileGeneratingJob("A", lambda: None)

        assertRaises(ValueError, make_job_too_late)
Example #2
0
    def test_no_rerun_if_ignore_code_changes_and_plot_changes(self):
        """With ignore_code_changes(), a changed plot function triggers no rerun."""
        import pydataframe

        def calc():
            append('out/calc', 'A')
            return pydataframe.DataFrame({
                "X": list(range(0, 100)),
                'Y': list(range(50, 150)),
            })

        def plot(df):
            append('out/plot', 'B')
            return pyggplot.Plot(df).add_scatter('X', 'Y')

        output_png = 'out/test.png'
        job = ppg.PlotJob(output_png, calc, plot)
        ppg.run_pipegraph()
        self.assertNotEqual(magic(output_png).find('PNG image'), -1)
        self.assertEqual(read('out/calc'), 'A')
        self.assertEqual(read('out/plot'), 'B')

        ppg.new_pipegraph(rc_gen(), quiet=True)

        def plot2(df):
            append('out/plot', 'B')
            return pyggplot.Plot(df).add_scatter('Y', 'X')

        job = ppg.PlotJob(output_png, calc, plot2)
        job.ignore_code_changes()
        ppg.run_pipegraph()
        # second run: neither calc nor plot was re-executed
        self.assertNotEqual(magic(output_png).find('PNG image'), -1)
        self.assertEqual(read('out/calc'), 'A')
        self.assertEqual(read('out/plot'), 'B')
    def test_reruns_just_plot_if_plot_changed(self):
        """Changing only the plot function reruns plot (B -> BB) but not calc."""
        import pydataframe

        def calc():
            append('out/calc', 'A')
            return pydataframe.DataFrame({
                "X": list(range(0, 100)),
                'Y': list(range(50, 150)),
            })

        def plot(df):
            append('out/plot', 'B')
            return pyggplot.Plot(df).add_scatter('X', 'Y')

        target = 'out/test.png'
        job = ppg.PlotJob(target, calc, plot)
        ppg.run_pipegraph()
        self.assertNotEqual(magic(target).find('PNG image'), -1)
        self.assertEqual(read('out/calc'), 'A')
        self.assertEqual(read('out/plot'), 'B')

        ppg.new_pipegraph(rc_gen(), quiet=True)

        def plot2(df):
            append('out/plot', 'B')
            return pyggplot.Plot(df).add_scatter('Y', 'X')

        job = ppg.PlotJob(target, calc, plot2)
        ppg.run_pipegraph()
        # calc came from the cache, plot ran a second time
        self.assertNotEqual(magic(target).find('PNG image'), -1)
        self.assertEqual(read('out/calc'), 'A')
        self.assertEqual(read('out/plot'), 'BB')
Example #4
0
    def test_job_creation_after_pipegraph_run_raises(self):
        """Adding a job to an already-run pipegraph is an error."""
        ppg.new_pipegraph(quiet=True, dump_graph=False)
        ppg.run_pipegraph()
        assertRaises(
            ValueError,
            lambda: ppg.FileGeneratingJob("A", lambda: None),
        )
Example #5
0
        def test_reruns_just_plot_if_plot_changed(self):
            """A changed plot function reruns only the plot step (B -> BB)."""

            def calc():
                append("out/calc", "A")
                return pd.DataFrame(
                    {"X": list(range(0, 100)), "Y": list(range(50, 150))}
                )

            def plot(df):
                append("out/plot", "B")
                return pyggplot.Plot(df).add_scatter("X", "Y")

            target = "out/test.png"
            ppg.PlotJob(target, calc, plot)
            ppg.run_pipegraph()
            self.assertNotEqual(magic(target).find(b"PNG image"), -1)
            self.assertEqual(read("out/calc"), "A")
            self.assertEqual(read("out/plot"), "B")

            ppg.new_pipegraph(rc_gen(), quiet=True)

            def plot2(df):
                append("out/plot", "B")
                return pyggplot.Plot(df).add_scatter("Y", "X")

            ppg.PlotJob(target, calc, plot2)
            ppg.run_pipegraph()
            # calc cached; plot executed again, appending a second 'B'
            self.assertNotEqual(magic(target).find(b"PNG image"), -1)
            self.assertEqual(read("out/calc"), "A")
            self.assertEqual(read("out/plot"), "BB")
Example #6
0
    def test_unpickle_bug_prevents_single_job_from_unpickling(self):
        """A broken invariant pickle reruns only the affected job, not everything."""

        def do_a():
            write("out/A", "A")
            append("out/As", "A")

        ppg.FileGeneratingJob("out/A", do_a)

        def do_b():
            write("out/B", "A")
            append("out/Bs", "A")

        job_B = ppg.FileGeneratingJob("out/B", do_b)
        cd = CantDepickle()
        job_B.depends_on(ppg.ParameterInvariant("C", (cd,)))
        ppg.run_pipegraph()
        for filename in ("out/A", "out/As", "out/B", "out/Bs"):
            assert read(filename) == "A"
        print("second run")
        ppg.new_pipegraph(dump_graph=False)

        ppg.FileGeneratingJob("out/A", do_a)
        job_B = ppg.FileGeneratingJob("out/B", do_b)
        job_B.depends_on(ppg.ParameterInvariant("C", (cd,)))
        with pytest.raises(ppg.RuntimeError):
            ppg.run_pipegraph()
        assert read("out/A") == "A"
        assert read("out/As") == "A"
        assert read("out/B") == "A"
        # B reran because its invariant could not be loaded
        assert read("out/Bs") == "AA"
Example #7
0
 def test_jobs_concurrent_jobs_run_concurrently(self):
     """Two single-core jobs on a two-core coordinator must overlap in time.

     Overlap is detected via the jobs' recorded start/stop timestamps.
     """
     ppg.new_pipegraph(
         ppg.resource_coordinators.LocalSystem(max_cores_to_use=2),
         quiet=True,
         dump_graph=False,
     )
     jobA = ppg.FileGeneratingJob("out/A", lambda: write("out/A", "A"))
     jobB = ppg.FileGeneratingJob("out/B", lambda: write("out/B", "B"))
     jobA.cores_needed = 1
     jobB.cores_needed = 1
     ppg.run_pipegraph()
     assert read("out/A") == "A"
     assert read("out/B") == "B"
     # Bug fix: verify the jobs actually ran *before* comparing start times -
     # the original compared first and would raise TypeError (None < float)
     # on Python 3 instead of the intended ValueError.
     if jobA.start_time is None:
         raise ValueError("JobA did not run")
     if jobB.start_time is None:
         raise ValueError("JobB did not run")
     if jobA.start_time < jobB.start_time:
         first_job, second_job = jobA, jobB
     else:
         first_job, second_job = jobB, jobA
     print(
         "times",
         first_job.start_time,
         first_job.stop_time,
         second_job.start_time,
         second_job.stop_time,
     )
     # concurrency: the earlier job must still be running when the later starts
     assert first_job.stop_time > second_job.start_time
Example #8
0
    def test_reruns_both_if_calc_changed(self):
        """Changing calc reruns calc AND plot (A -> AA, B -> BB)."""
        import pydataframe

        def calc():
            append('out/calc', 'A')
            return pydataframe.DataFrame({"X": list(range(0, 100)), 'Y': list(range(50, 150))})

        def plot(df):
            append('out/plot', 'B')
            return pyggplot.Plot(df).add_scatter('X', 'Y')

        target = 'out/test.png'
        job = ppg.PlotJob(target, calc, plot)
        ppg.run_pipegraph()
        self.assertNotEqual(magic(target).find('PNG image'), -1)
        self.assertEqual(read('out/calc'), 'A')
        self.assertEqual(read('out/plot'), 'B')

        ppg.new_pipegraph(rc_gen(), quiet=True)

        def calc2():
            append('out/calc', 'A')
            x = 5
            return pydataframe.DataFrame({"X": list(range(0, 100)), 'Y': list(range(50, 150))})

        job = ppg.PlotJob(target, calc2, plot)
        ppg.run_pipegraph()
        # both stages ran again, appending to both files
        self.assertNotEqual(magic(target).find('PNG image'), -1)
        self.assertEqual(read('out/calc'), 'AA')
        self.assertEqual(read('out/plot'), 'BB')
Example #9
0
    def test_run_may_be_called_only_once(self):
        """A second run_pipegraph() on the same graph raises ValueError."""
        ppg.new_pipegraph(quiet=True, dump_graph=False)
        ppg.run_pipegraph()
        assertRaises(ValueError, lambda: ppg.run_pipegraph())
Example #10
0
    def test_run_may_be_called_only_once(self):
        """Running the same pipegraph twice is rejected."""
        ppg.new_pipegraph(quiet=True, dump_graph=False)
        ppg.run_pipegraph()

        def run_again():
            ppg.run_pipegraph()

        assertRaises(ValueError, run_again)
Example #11
0
    def test_can_not_run_twice(self):
        """The second run must fail with a descriptive ValueError."""
        ppg.new_pipegraph(dump_graph=False)
        ppg.run_pipegraph()
        try:
            ppg.run_pipegraph()
        except ValueError as e:
            print(e)
            assert "Each pipegraph may be run only once." in str(e)
        else:
            assert False  # "Exception not correctly raised"
Example #12
0
    def test_can_not_run_twice(self):
        """Re-running a finished pipegraph raises ValueError with a clear message."""
        ppg.new_pipegraph(dump_graph=False)
        ppg.run_pipegraph()
        raised_correctly = False
        try:
            ppg.run_pipegraph()
        except ValueError as e:
            print(e)
            raised_correctly = "Each pipegraph may be run only once." in str(e)
        assert raised_correctly  # "Exception not correctly raised"
Example #13
0
 def test_basic(self):
     """A PlotJob renders calc()'s DataFrame into a PNG file."""
     ppg.new_pipegraph(rc_gen(), quiet=False)
     import pydataframe

     def calc():
         columns = {"X": list(range(0, 100)), 'Y': list(range(50, 150))}
         return pydataframe.DataFrame(columns)

     def plot(df):
         return pyggplot.Plot(df).add_scatter('X', 'Y')

     job = ppg.PlotJob('out/test.png', calc, plot)
     ppg.run_pipegraph()
     self.assertNotEqual(magic('out/test.png').find('PNG image'), -1)
Example #14
0
    def test_can_not_add_jobs_after_run(self):
        """Adding a job after the graph ran raises a descriptive ValueError."""
        ppg.new_pipegraph(dump_graph=False)
        ppg.run_pipegraph()
        try:
            ppg.FileGeneratingJob("out/A", lambda: write("out/A", "A"))
        except ValueError as e:
            print(e)
            assert (
                "This pipegraph was already run. You need to create a new one for more jobs"
                in str(e))
        else:
            assert False  # , "Exception not correctly raised")
Example #15
0
    def test_indirect_cicle(self):
        """A dependency cycle spanning three jobs is detected as CycleError."""
        ppg.new_pipegraph(quiet=True, dump_graph=False)
        job_a = ppg.FileGeneratingJob("A", lambda: write("A", "A"))
        job_b = ppg.FileGeneratingJob("B", lambda: write("B", "A"))
        job_c = ppg.FileGeneratingJob("C", lambda: write("C", "A"))
        # C -> B -> A -> C closes the cycle
        job_c.depends_on(job_b)
        job_b.depends_on(job_a)
        job_a.depends_on(job_c)
        assertRaises(ppg.CycleError, lambda: ppg.run_pipegraph())
Example #16
0
    def test_can_not_add_jobs_after_run(self):
        """Job creation on an already-run pipegraph must be rejected."""
        ppg.new_pipegraph(dump_graph=False)
        ppg.run_pipegraph()
        expected = (
            "This pipegraph was already run. You need to create a new one for more jobs"
        )
        try:
            ppg.FileGeneratingJob("out/A", lambda: write("out/A", "A"))
        except ValueError as e:
            print(e)
            assert expected in str(e)
        else:
            assert False  # , "Exception not correctly raised")
Example #17
0
 def test_non_default_status_filename(self):
     """invariant_status_filename redirects the status db to a custom path."""
     try:
         forget_job_status("shu.dat")
         forget_job_status()
         ppg.new_pipegraph(
             quiet=True, invariant_status_filename="shu.dat", dump_graph=False
         )
         ppg.FileGeneratingJob("out/A", lambda: write("out/A", "A"))
         ppg.run_pipegraph()
         assert os.path.exists("shu.dat")
         # the default status file must NOT have been created
         assert not os.path.exists(ppg.graph.invariant_status_filename_default)
     finally:
         forget_job_status("shu.dat")
Example #18
0
 def test_non_default_status_filename(self):
     """A custom invariant status file is used instead of the default one."""
     custom = "shu.dat"
     try:
         forget_job_status(custom)
         forget_job_status()
         ppg.new_pipegraph(quiet=True,
                           invariant_status_filename=custom,
                           dump_graph=False)
         ppg.FileGeneratingJob("out/A", lambda: write("out/A", "A"))
         ppg.run_pipegraph()
         assert os.path.exists(custom)
         assert not os.path.exists(
             ppg.graph.invariant_status_filename_default)
     finally:
         forget_job_status(custom)
Example #19
0
        def test_basic(self):
            """PlotJob turns the calc() frame into a PNG via plot()."""
            ppg.new_pipegraph(rc_gen(), quiet=False)

            def calc():
                return pd.DataFrame(
                    {"X": list(range(0, 100)), "Y": list(range(50, 150))}
                )

            def plot(df):
                return pyggplot.Plot(df).add_scatter("X", "Y")

            target = "out/test.png"
            ppg.PlotJob(target, calc, plot)
            ppg.run_pipegraph()
            self.assertNotEqual(magic(target).find(b"PNG image"), -1)
    def test_basic(self):
        """End-to-end: calc -> plot -> PNG file on disk."""
        ppg.new_pipegraph(rc_gen(), quiet=False)
        import pydataframe

        def calc():
            frame = {"X": list(range(0, 100)), 'Y': list(range(50, 150))}
            return pydataframe.DataFrame(frame)

        def plot(df):
            return pyggplot.Plot(df).add_scatter('X', 'Y')

        job = ppg.PlotJob('out/test.png', calc, plot)
        ppg.run_pipegraph()
        self.assertNotEqual(magic('out/test.png').find('PNG image'), -1)
Example #21
0
 def transmit_pipegraph(self, jobs):
     # Install a pickled job set (sent by the remote master) into a fresh
     # local pipegraph.  Returns an RPC-friendly status dict instead of
     # raising.  NOTE: Python 2 code (`except Exception, e` syntax).
     global global_pipegraph
     try:
         pypipegraph.new_pipegraph(pypipegraph.resource_coordinators.DummyResourceCoordinator())
         global_pipegraph = pypipegraph.util.global_pipegraph
         jobs = cPickle.loads(jobs) #which fills the global pipegraph...
         logger.info("received pipegraph")
         logger.info("job len %i" % len(jobs))
         for name in jobs:
             logger.info("adding %s" % name)
             global_pipegraph.add_job(jobs[name])
         logger.info("Loaded pipegraph. Num jobs: %i" % len(global_pipegraph.jobs))
         # flag the graph as running - presumably required before executing
         # the transmitted jobs; confirm against the caller's protocol
         global_pipegraph.running = True
         return {'ok': True, 'exception': ''}
     except Exception, e:
         # report failure back over RPC rather than raising across the wire
         logger.info("Pipegraph loading failed")
         logger.info(traceback.format_tb())
         return {"ok": False, 'exception': str(e)}
Example #22
0
 def transmit_pipegraph(self, jobs):
     # Unpickle a job set into a fresh global pipegraph and register every
     # job; answers with a status dict so the RPC caller never sees a raise.
     # NOTE: Python 2 code (`except Exception, e` syntax).
     global global_pipegraph
     try:
         pypipegraph.new_pipegraph(
             pypipegraph.resource_coordinators.DummyResourceCoordinator())
         global_pipegraph = pypipegraph.util.global_pipegraph
         jobs = cPickle.loads(jobs)  #which fills the global pipegraph...
         logger.info("received pipegraph")
         logger.info("job len %i" % len(jobs))
         for name in jobs:
             logger.info("adding %s" % name)
             global_pipegraph.add_job(jobs[name])
         logger.info("Loaded pipegraph. Num jobs: %i" %
                     len(global_pipegraph.jobs))
         # flag the graph as running - presumably required before executing
         # the transmitted jobs; confirm against the caller's protocol
         global_pipegraph.running = True
         return {'ok': True, 'exception': ''}
     except Exception, e:
         # report failure back over RPC rather than raising across the wire
         logger.info("Pipegraph loading failed")
         logger.info(traceback.format_tb())
         return {"ok": False, 'exception': str(e)}
Example #23
0
        def np(quiet=True, **kwargs):
            """Create a fresh test pipegraph, building the sandbox dirs once.

            Closes over ``first`` (a one-element mutable flag) and
            ``target_path`` from the enclosing scope - defined outside this
            view; TODO confirm their exact shape.
            """
            if not first[0]:
                # first call only: create the working-directory layout
                Path(target_path).mkdir(parents=True, exist_ok=True)
                os.chdir(target_path)
                Path("logs").mkdir()
                Path("cache").mkdir()
                Path("results").mkdir()
                Path("out").mkdir()
                import logging

                # silence pypipegraph's logger below WARNING for the tests
                h = logging.getLogger("pypipegraph")
                h.setLevel(logging.WARNING)
                first[0] = True

            rc = ppg.resource_coordinators.LocalSystem(1)
            ppg.new_pipegraph(rc, quiet=quiet, dump_graph=False, **kwargs)
            ppg.util.global_pipegraph.result_dir = Path("results")
            g = ppg.util.global_pipegraph
            # expose this factory on the graph so tests can re-create one
            g.new_pipegraph = np
            return g
    def test_no_rerun_if_calc_change_but_ignore_codechanges(self):
        """ignore_code_changes() suppresses the rerun a changed calc would cause."""
        import pydataframe

        def calc():
            append('out/calc', 'A')
            return pydataframe.DataFrame({
                "X": list(range(0, 100)),
                'Y': list(range(50, 150))
            })

        def plot(df):
            append('out/plot', 'B')
            return pyggplot.Plot(df).add_scatter('X', 'Y')

        target = 'out/test.png'
        job = ppg.PlotJob(target, calc, plot)
        ppg.run_pipegraph()
        self.assertNotEqual(magic(target).find('PNG image'), -1)
        self.assertEqual(read('out/calc'), 'A')
        self.assertEqual(read('out/plot'), 'B')

        ppg.new_pipegraph(rc_gen(), quiet=True)

        def calc2():
            append('out/calc', 'A')
            x = 5  # the code change that should be ignored
            return pydataframe.DataFrame({
                "X": list(range(0, 100)),
                'Y': list(range(50, 150))
            })

        job = ppg.PlotJob(target, calc2, plot)
        job.ignore_code_changes()
        ppg.run_pipegraph()
        # nothing reran: both files keep their first-run content
        self.assertNotEqual(magic(target).find('PNG image'), -1)
        self.assertEqual(read('out/calc'), 'A')
        self.assertEqual(read('out/plot'), 'B')
Example #25
0
def get_genome(name=None):
    """Return an InteractiveFileBasedGenome, building the cached genome once.

    The first call runs a throw-away pipegraph to download the genome files;
    subsequent calls reuse the module-global ``ppg_genome``.
    """
    global ppg_genome
    cache_dir = Path(__file__).parent / "run" / "genome_cache"
    if ppg_genome is None:
        previous_graph = ppg.util.global_pipegraph
        ppg.new_pipegraph()
        genome = get_Candidatus_carsonella_ruddii_pv(
            name, cache_dir=cache_dir  # , ignore_code_changes=True
        )
        genome.download_genome()
        # genome.job_genes()
        # genome.job_transcripts()
        ppg_genome = genome
        ppg.run_pipegraph()
        # restore whatever graph the caller had active
        ppg.util.global_pipegraph = previous_graph
    lookups = ppg_genome._filename_lookups
    return InteractiveFileBasedGenome(
        name,
        lookups["genome.fasta"],
        lookups["cdna.fasta"],
        lookups["proteins.fasta"],
        lookups["genes.gtf"],
        ppg_genome.cache_dir,
    )
Example #26
0
        def test_no_rerun_if_calc_change_but_ignore_codechanges(self):
            """With ignore_code_changes(), a modified calc reruns nothing."""

            def calc():
                append("out/calc", "A")
                return pd.DataFrame(
                    {"X": list(range(0, 100)), "Y": list(range(50, 150))}
                )

            def plot(df):
                append("out/plot", "B")
                return pyggplot.Plot(df).add_scatter("X", "Y")

            target = "out/test.png"
            job = ppg.PlotJob(target, calc, plot)
            ppg.run_pipegraph()
            self.assertNotEqual(magic(target).find(b"PNG image"), -1)
            self.assertEqual(read("out/calc"), "A")
            self.assertEqual(read("out/plot"), "B")

            ppg.new_pipegraph(rc_gen(), quiet=True)

            def calc2():
                append("out/calc", "A")
                x = 5  # noqa: E157,F841
                return pd.DataFrame(
                    {"X": list(range(0, 100)), "Y": list(range(50, 150))}
                )

            job = ppg.PlotJob(target, calc2, plot)
            job.ignore_code_changes()
            ppg.run_pipegraph()
            # both outputs unchanged - nothing was rerun
            self.assertNotEqual(magic(target).find(b"PNG image"), -1)
            self.assertEqual(read("out/calc"), "A")
            self.assertEqual(read("out/plot"), "B")
Example #27
0
import pypipegraph
import urllib2
import hashlib

# Create the global pipegraph; all jobs defined below register with it.
pypipegraph.new_pipegraph()

output_filename = "result.tab"  # where to store the final counts


# each call to download_job will return a job that downloads just this url.
def download_job(url):
    """Return a FileGeneratingJob that downloads *url* into a unique file."""
    # the md5 of the url gives each target file a unique, stable name
    target_file = "website_%s" % hashlib.md5(url).hexdigest()

    def do_download():
        request = urllib2.urlopen(url)
        data = request.read()
        request.close()
        with open(target_file, "wb") as file_handle:
            file_handle.write(data)

    return pypipegraph.FileGeneratingJob(target_file, do_download)


def retrieve_urls():
    # now I said we were downloading these, but to make the tutorial independand,
    # we'll fake that bit, ok?
    # just pretend ;).
    return [
Example #28
0
import pypipegraph
import urllib2
import hashlib
pypipegraph.new_pipegraph()

output_filename = 'result.tab'  # where to store the final counts


# each call to download_job will return a job that downloads just this url.
def download_job(url):
    """Create a job that fetches *url* into a file named after its md5 hash.

    NOTE: Python 2 code - ``hashlib.md5(url)`` expects a byte string and
    ``urllib2`` does not exist on Python 3.
    """
    target_file = 'website_%s' % hashlib.md5(
        url).hexdigest()  # we need a unique name for each target file.

    def do_download():
        # runs later, when the pipegraph executes the job
        request = urllib2.urlopen(url)
        data = request.read()
        request.close()
        file_handle = open(target_file, 'wb')
        file_handle.write(data)
        file_handle.close()

    return pypipegraph.FileGeneratingJob(target_file, do_download)


def retrieve_urls():
    # now I said we were downloading these, but to make the tutorial independand,
    # we'll fake that bit, ok?
    # just pretend ;).
    return [
        'http://code.google.com/p/pypipegraph',
        'http://code.google.com/p/pypipegraph/w/list'
def install_bioconductor():
    """Build and run a pipegraph that installs Bioconductor/CRAN packages.

    Driven entirely by environment variables (BIOCONDUCTOR_VERSION, CRAN_MODE,
    URL_*, BIOCONDUCTOR_WHITELIST); logs to /anysnake/bioconductor/ppg.log and
    writes a sentinel file when finished.
    """
    bc_version = os.environ["BIOCONDUCTOR_VERSION"]
    cran_mode = os.environ["CRAN_MODE"]
    sources = ["cran", "software", "annotation", "experiment"]
    # replace the name list with {source_name: package_info} mappings
    sources = {
        x: load_packages(x, os.environ["URL_%s" % x.upper()]).get() for x in sources
    }
    if bc_version in manual_overwrite:
        # apply hand-maintained URL fixes for this Bioconductor release
        for src_name, src in manual_overwrite[bc_version].items():
            for pkg_name, url in src.items():
                sources[src_name][pkg_name]["url"] = url

    pkgs = list(sources.values())

    whitelist = os.environ["BIOCONDUCTOR_WHITELIST"].split(":")

    logging.basicConfig(
        filename="/anysnake/bioconductor/ppg.log", level=logging.INFO, filemode="w"
    )
    cpus = int(ppg.util.CPUs() * 1.25)  # rule of thumb to achieve maximum throughput
    ppg.new_pipegraph(
        invariant_status_filename="/anysnake/bioconductor/.ppg_status",
        resource_coordinator=ppg.resource_coordinators.LocalSystem(
            max_cores_to_use=cpus, interactive=False
        ),
    )
    jobs, prune_because_of_missing_preqs = build_jobs(pkgs)
    # now we have jobs for *every* R package
    # which we now need to filter down

    # annotation/experiment data and anything with missing prerequisites is out
    to_prune = set()
    to_prune.update(sources["annotation"].keys())
    to_prune.update(sources["experiment"].keys())
    to_prune.update(prune_because_of_missing_preqs)
    prune(jobs, to_prune)

    if cran_mode == "minimal":
        # keep only the CRAN packages that a software package actually needs
        prune(jobs, sources["cran"])
        already_unpruned = set()
        for k in sources["software"]:
            for j in jobs[k]:
                unprune(j, already_unpruned)
        prune(jobs, to_prune)

    # re-enable everything on the whitelist (or all software for "_full_")
    already_unpruned = set()
    for k in whitelist:
        if k in jobs:
            for j in jobs[k]:
                unprune(j, already_unpruned)
    if "_full_" in whitelist:
        for k in sources["software"]:
            for j in jobs[k]:
                unprune(j, already_unpruned)

    # still need to apply the blacklist, no matter whether __full__ was set!
    to_prune = set()
    to_prune.update(windows_only_packages(pkgs))
    to_prune.update(blacklist)
    if bc_version in blacklist_per_version:
        to_prune.update(blacklist_per_version[bc_version])
    prune(jobs, to_prune)

    ppg.util.global_pipegraph.connect_graph()
    ppg.run_pipegraph()
    for j in ppg.util.global_pipegraph.job_uniquifier.values():
        if j._pruned:
            print("pruned", j.job_id, "because of", j._pruned)
    write_done_sentinel(cran_mode, whitelist)
Example #30
0
def run_exports(gen_additional_jobs=None, handle_ppg=True, settings='ovca'):
    """Collect every exported method and write its DataFrame(s) as parquet.

    When handle_ppg is True a new pipegraph is created and run here;
    otherwise the caller owns the graph lifecycle.  Returns the created jobs.
    NOTE(review): gen_additional_jobs is accepted but never used in this body.
    """
    if settings == 'ovca':
        apply_ovca_settings()
    else:
        raise ValueError("unknow setting value", settings)

    old = Path(os.getcwd()).absolute()
    os.chdir("/project")
    if handle_ppg:
        ppg.new_pipegraph()
    # os.chdir(old)
    to_wide_columns = {}
    jobs = []
    for cls in exporting_classes:
        instance = cls()
        if hasattr(instance, "exports"):
            instance.exports()

        out_prefix = getattr(instance, "out_prefix", "")
        # any method carrying _output_name metadata becomes an export job
        for method_name in dir(instance):
            method = getattr(instance, method_name)
            if hasattr(method, "_output_name"):
                print(cls.__name__, method.__name__)
                output_filename = ("/project/processed/" + out_prefix +
                                   method._output_name + ".units")
                cwd = str(Path(method._abs_filename).parent)

                # default arguments freeze the current loop values (late
                # binding would otherwise make every job use the last method)
                def write(output_filename=output_filename,
                          method=method,
                          cwd=cwd):
                    os.chdir(cwd)
                    df = method()
                    os.chdir("/project")
                    check_dataframe(out_prefix + method._output_name, df)
                    Path(output_filename).parent.mkdir(exist_ok=True,
                                                       parents=True)
                    if "unit" in df:
                        # one parquet file per unit, plus the .units index
                        for ii, (unit, sub_df) in enumerate(
                                df.groupby("unit", sort=True)):
                            try:
                                sub_df.to_parquet(
                                    output_filename[:output_filename.
                                                    rfind(".")] + "." +
                                    str(ii) + ".parquet")
                            except:  # NOTE(review): bare except - keeps debug.pickle on any failure, then re-raises
                                sub_df.to_pickle("debug.pickle")
                                raise

                        Path(output_filename).write_text(
                            json.dumps(sorted(df.unit.unique())))
                    else:
                        df.to_parquet(
                            output_filename[:output_filename.rfind(".")] +
                            ".0.parquet")
                        Path(output_filename).write_text(json.dumps(["nounit"
                                                                     ]))
                    Path(output_filename + ".desc").write_text(
                        method._description)

                job = ppg.MultiFileGeneratingJob(
                    [output_filename, output_filename + ".desc"], write)
                job.depends_on(
                    ppg.FunctionInvariant(output_filename + "_inner_func",
                                          method))
                if method._input_files:
                    job.depends_on(ppg.MultiFileInvariant(method._input_files))
                if method._deps:
                    if hasattr(method._deps, "__call__"):
                        deps = method._deps(method.__self__)
                    else:
                        deps = method._deps
                    job.depends_on(deps)

                print(output_filename)
                print("")
                os.chdir("/project")
                jobs.append(job)
                to_wide_columns[out_prefix +
                                method._output_name] = method._wide_columns

    def dump_to_wide_columns(output_filename):
        Path(output_filename).write_text(json.dumps(to_wide_columns))

    jobs.append(
        ppg.FileGeneratingJob(
            "/project/processed/_to_wide_columns.json",
            dump_to_wide_columns).depends_on(
                ppg.ParameterInvariant(
                    "/project/processed/_to_wide_columns.json",
                    ppg.util.freeze(to_wide_columns),
                )))

    old = Path(os.getcwd()).absolute()
    if handle_ppg:
        os.chdir("/project")
        ppg.run_pipegraph()
    os.chdir(old)
    return jobs
Example #31
0
 def inner():
     # Build a two-job cycle; note both jobs deliberately use the id "A".
     ppg.new_pipegraph(quiet=True, dump_graph=False)
     first = ppg.FileGeneratingJob("A", lambda: write("A", "A"))
     second = ppg.FileGeneratingJob("A", lambda: write("B", "A"))
     first.depends_on(second)
     second.depends_on(first)
def main(argv):
    """Build dot graphs of Spanish translation pairs for one dictionary source.

    argv: [script, data_path, bibtex_key_or_component].  Relies on
    module-level names (loaded_data, filename_corpusreader,
    filename_combined_graph, Graph, read, write, escape_string, ...)
    defined elsewhere in this file - not visible here.
    """
    if len(argv) < 3:
        print("call: translations_spanish_graph.py data_path (bibtex_key|component)")
        sys.exit(1)

    # This creates a global Pipegraph object
    # All new jobs will automatically register with it.
    pypipegraph.new_pipegraph()

    # rebuild outputs when any input csv changes
    invariants_csv_files = []
    for file in glob.glob(os.path.join(argv[1], "*.csv")):
        invariants_csv_files.append(pypipegraph.FileTimeInvariant(file))

    dictdata_ids = []

    def load_dictdata_ids():
        # resolve argv[2] first as a bibtex key, then as a component name
        cr = loaded_data["cr"]
        dictdata_ids = cr.dictdata_ids_for_bibtex_key(argv[2])
        if len(dictdata_ids) == 0:
            dictdata_ids = cr.dictdata_ids_for_component(argv[2])
            if len(dictdata_ids) == 0:
                print("did not find any dictionary data for the bibtex_key or component {0}.".format(argv[2]))
                sys.exit(1)
        return dictdata_ids

    def create_corpusreader():
        cr = CorpusReaderDict(argv[1])
        return cr

    def set_corpusreader(value):
        loaded_data["cr"] = value
        loaded_data["dictdata_ids"] = load_dictdata_ids()

    cr_loading_job = pypipegraph.CachedDataLoadingJob(filename_corpusreader, create_corpusreader, set_corpusreader)
    cr_loading_job.depends_on(invariants_csv_files)

    def generate_dictdata_graph_job(dictdata_id):
        # job factory: one .dot file per dictdata id
        cr = loaded_data["cr"]
        dictdata_string = cr.dictdata_string_id_for_dictata_id(dictdata_id)
        target_file = "{0}.dot".format(dictdata_string)

        # now, we need a function that downloads from url and stores to target_file
        def generate_dictdata_graph():
            gr = Graph()
            src_language_iso = cr.src_language_iso_for_dictdata_id(dictdata_id)
            tgt_language_iso = cr.tgt_language_iso_for_dictdata_id(dictdata_id)
            if src_language_iso != 'spa' and tgt_language_iso != 'spa':
                raise(NoSpanishException)

            language_iso = None
            if tgt_language_iso == 'spa':
                language_iso = src_language_iso
            else:
                language_iso = tgt_language_iso

            bibtex_key = dictdata_string.split("_")[0]

            for head, translation in cr.heads_with_translations_for_dictdata_id(dictdata_id):
                if src_language_iso == 'spa':
                    # normalize so `head` is always the non-Spanish side
                    (head, translation) = (translation, head)

                head_with_source = escape_string("{0}|{1}".format(head, bibtex_key))
                translation = escape_string(translation)

                #translation_with_language = "{0}|{1}".format(translation, language_iso)

                #if head_with_source not in gr:
                gr.add_node(head_with_source, attr_dict={ "lang": language_iso, "source": bibtex_key })

                #if translation not in gr:
                gr.add_node(translation, attr_dict={ "lang": "spa" })

                #if not gr.has_edge((head_with_source, translation)):
                gr.add_edge(head_with_source, translation)

            output = codecs.open(target_file, "w", "utf-8")
            output.write(write(gr))
            output.close()
        return pypipegraph.FileGeneratingJob(target_file, generate_dictdata_graph)

    def gen_jobs():
        # runs after the corpus reader loaded; creates the per-dictdata jobs
        cr = loaded_data["cr"]
        jobs_generate_dot = [generate_dictdata_graph_job(dictdata_id) for dictdata_id in loaded_data["dictdata_ids"]
                            if cr.src_language_iso_for_dictdata_id(dictdata_id) == "spa" or
                                cr.tgt_language_iso_for_dictdata_id(dictdata_id) == "spa"]
        for job in jobs_generate_dot:
            job.depends_on(cr_loading_job)
        def combine_graphs():
            # merge every per-dictdata graph into the combined dot file
            gr = None
            for dictdata_id in loaded_data["dictdata_ids"]:
                #dictdata_string = cr.dictdata_string_id_for_dictata_id(dictdata_id)
                #target_file = "{0}.dot".format(dictdata_string)
                j = generate_dictdata_graph_job(dictdata_id)
                target_file = j.job_id
                IN = codecs.open(target_file, "r", "utf-8")
                if gr == None:
                    gr = read(IN.read())
                else:
                    gr2 = read(IN.read())
                    for node in gr2:
                        gr.add_node(node, gr2.node[node])
                    for n1, n2 in gr2.edges_iter():
                        gr.add_edge(n1, n2, gr2.edge[n1][n2])
                IN.close()
            OUT = codecs.open(filename_combined_graph, "w", "utf-8")
            OUT.write(write(gr))
            OUT.close()

        job_combine_graphs = pypipegraph.FileGeneratingJob(filename_combined_graph, combine_graphs)
        job_combine_graphs.depends_on(jobs_generate_dot)

    pypipegraph.JobGeneratingJob("makejobs", gen_jobs).depends_on(cr_loading_job)


    pypipegraph.run_pipegraph()
                               ExonSmartStrandedPython)

# Benchmark setup: work in a dedicated directory so pipegraph state is isolated.
work_dir = Path("_benchmark_read_counting")
work_dir.mkdir(exist_ok=True)
os.chdir(work_dir)

bam_name = (Path("results") / "aligned" / "STAR_2.6.1d" /
            "Drosophila_melanogaster_94" / "ERR2984187" / "ERR2984187.bam")

if not bam_name.exists():
    # leverage pipeline to get some sample data

    import mbf_align
    import mbf_externals

    ppg.new_pipegraph()

    genome = mbf_genomes.EnsemblGenome("Drosophila_melanogaster", 94)
    aligner = mbf_externals.aligners.STAR()

    # just some random drosophila lane.
    samples = {"ERR2984187": "ERR2984187"}
    raw = {
        name: mbf_align.Sample(
            name,
            mbf_align.strategies.FASTQsFromAccession(err),
            reverse_reads=False,
            pairing="only_first",
        )
        for name, err in samples.items()
    }
if __name__ == "__main__":
    # Demo script: a failing long job depending on a succeeding short job,
    # to exercise pypipegraph's failure handling.
    import time
    import os
    import sys

    sys.path.append("../../")
    import pypipegraph as ppg

    ppg.new_pipegraph()

    def run_long():
        # deliberately fails after 20s
        time.sleep(20)
        raise ValueError()

    def run_short():
        time.sleep(5)
        with open("short.dat", "wb") as op:
            # bug fix: the file is opened in binary mode, so write bytes -
            # the original op.write("DONO") raised TypeError on Python 3
            op.write(b"DONO")

    # start from a clean slate so FileGeneratingJobs actually run
    if os.path.exists("short.dat"):
        os.unlink("short.dat")
    if os.path.exists("long.dat"):
        os.unlink("long.dat")

    job1 = ppg.FileGeneratingJob("long.dat", run_long)
    job2 = ppg.FileGeneratingJob("short.dat", run_short)
    job1.depends_on(job2)

    ppg.run_pipegraph()