def test_name_must_be_str(self):
     # Constructor validation: the job name must be a str (TypeError
     # otherwise), and both the calculating callback and the storing
     # callback must be callables (ValueError otherwise).
     with pytest.raises(TypeError):
         ppg.CachedDataLoadingJob(123, lambda: 123, lambda: 5)
     with pytest.raises(ValueError):
         ppg.CachedDataLoadingJob("123", 123, lambda: 5)
     with pytest.raises(ValueError):
         ppg.CachedDataLoadingJob("123", lambda: 5, 123)
        def gen():
            # Job-generating callback: builds a cached loading job for
            # "out/B". NOTE(review): calc / store / dump come from an
            # enclosing scope not visible in this fragment.
            calc_job = ppg.CachedDataLoadingJob("out/B", calc, store)

            def gen2():
                # Nested second-generation callback: the file generator for
                # "out/A" depends on the cached job created one level up.
                dump_job = ppg.FileGeneratingJob("out/A", dump)
                dump_job.depends_on(calc_job)

            ppg.JobGeneratingJob("out/D", gen2)
    def _anno_cache_and_calc(self, anno):
        """Build the CachedDataLoadingJob that calculates an annotator's
        columns and merges them into self.ddf.df.

        The calc half runs the annotator (calc_ddf if present, else calc),
        normalizing a single-column Series into a DataFrame. The load half
        validates the declared columns against what was calculated and
        combines the result with the existing DataFrame.
        """

        def calc():
            # Annotators must declare their output columns as a list.
            if not isinstance(anno.columns, list):
                raise ValueError("Columns was not a list")

            # calc_ddf receives the whole DDF object, calc just the DataFrame.
            if hasattr(anno, "calc_ddf"):
                df = anno.calc_ddf(self.ddf)
            else:
                df = anno.calc(self.ddf.df)
            # Convenience: a Series is accepted when exactly one column was
            # declared; wrap it into a one-column DataFrame.
            if isinstance(df, pd.Series) and len(anno.columns) == 1:
                df = pd.DataFrame({anno.columns[0]: df})
            if not isinstance(df, pd.DataFrame):
                raise ValueError(
                    "result was no dataframe (or series and len(anno.columns) == 1)"
                )
            return df

        def load(df):
            # Declared and actually-produced column sets must match exactly.
            s_should = set(anno.columns)
            if not len(s_should):
                raise ValueError("anno.columns was empty")
            s_actual = set(df.columns)
            if s_should != s_actual:
                raise ValueError(
                    "Annotator declared different columns from those actualy calculated: %s"
                    % (s_should.symmetric_difference(s_actual))
                )
            # Refuse to overwrite columns that already exist on the DDF.
            if set(df.columns).intersection(self.ddf.df.columns):
                raise ValueError(
                    "Annotator created columns that were already present.",
                    self.ddf.name,
                    anno.get_cache_name(),
                    set(df.columns).intersection(self.ddf.df.columns),
                )
            self.ddf.df = _combine_annotator_df_and_old_df(df, self.ddf.df)

        # One cache subdirectory per annotator class.
        (self.ddf.cache_dir / anno.__class__.__name__).mkdir(exist_ok=True)
        job = ppg.CachedDataLoadingJob(
            self.ddf.cache_dir / anno.__class__.__name__ / anno.get_cache_name(),
            calc,
            load,
        )
        ppg.Job.depends_on(
            job, self.load()
        )  # both the load and the calc need our ddf.df
        job.depends_on(
            self.load(),
            # Invalidate the cache whenever the annotator's calc code changes.
            ppg.FunctionInvariant(
                self.ddf.cache_dir / (anno.get_cache_name() + "_calc_func"),
                anno.calc if hasattr(anno, "calc") else anno.calc_ddf,
            ),
        )
        # Annotator-to-annotator dependencies, via the registered anno jobs.
        for d in anno.dep_annos():
            if d is not None:
                job.depends_on(self.ddf.anno_jobs[d.get_cache_name()])
        job.depends_on(anno.deps(self.ddf))
        # Annotators may request multiple cores for the calculation half.
        job.lfg.cores_needed = getattr(anno, "cores_needed", 1)
        return job
    def test_cached_dataloading_job_does_not_load_its_preqs_on_cached(
        self, new_pipegraph
    ):
        """First run builds everything. After deleting only the final output,
        a second run must rerun the FileGeneratingJob and the cached job's
        load step - but neither the upstream DataLoadingJob nor the calc."""
        obj = Dummy()

        def fill_a():
            obj.a = "A"
            append("out/A", "A")

        def compute():
            append("out/B", "B")
            return obj.a * 2

        def unload(value):
            obj.c = value
            append("out/Cx", "C")  # not C, that's the cached file, you know...

        def dump_result():
            write("out/D", obj.c)

        def wire_graph():
            # Same graph both times: D <- cached(C) <- dataload(A)
            loader = ppg.DataLoadingJob("out/A", fill_a)
            cached = ppg.CachedDataLoadingJob("out/C", compute, unload)
            final = ppg.FileGeneratingJob("out/D", dump_result)
            final.depends_on(cached)
            cached.depends_on(loader)

        wire_graph()
        ppg.run_pipegraph()
        assert read("out/D") == "AA"  # final result was written
        assert read("out/A") == "A"  # the dl job ran
        assert read("out/B") == "B"  # the calc job ran
        assert read("out/Cx") == "C"  # the load step ran
        os.unlink("out/D")  # so the filegen and the loadjob of cached should rerun...
        new_pipegraph.new_pipegraph()
        wire_graph()
        ppg.run_pipegraph()
        assert read("out/D") == "AA"  # result was rewritten
        assert read("out/A") == "A"  # the dl job did not rerun
        assert read("out/B") == "B"  # the calc did not rerun
        assert read("out/Cx") == "CC"  # the load step ran a second time
# Example #5
# 0
    def load(self):
        """Return the CachedDataLoadingJob that reads this DDF's cached
        DataFrame from disk and installs it on ``self.ddf``."""

        def _install(df):
            # Remember which columns existed before any annotator ran.
            self.ddf.df = df
            self.ddf.non_annotator_columns = self.ddf.df.columns

        cache_path = self.ddf.cache_dir / "calc"
        job = ppg.CachedDataLoadingJob(cache_path, self.loading_function, _install)
        func_invariant = ppg.FunctionInvariant(
            self.ddf.__class__.__name__ + "_" + self.ddf.name + "_load",
            self.loading_function,
        )
        job.depends_on(self.deps).depends_on(func_invariant)
        return job
    def test_no_dependand_still_calc(self):
        """A CachedDataLoadingJob without any downstream consumer still
        writes its cache file when the pipegraph runs."""
        dummy = Dummy()

        def compute():
            return ", ".join(str(i) for i in range(0, 100))

        def keep(value):
            dummy.a = value

        ppg.CachedDataLoadingJob("out/mycalc", compute, keep)
        # job.ignore_code_changes() #or it would run anyway... hm.
        assert not os.path.exists("out/mycalc")
        ppg.run_pipegraph()
        assert os.path.exists("out/mycalc")
    def test_simple(self):
        """End to end: the calc result is cached, loaded back, and consumed
        by a FileGeneratingJob."""
        dummy = Dummy()

        def compute():
            return ", ".join(str(i) for i in range(0, 100))

        def keep(value):
            dummy.a = value

        cached = ppg.CachedDataLoadingJob("out/mycalc", compute, keep)
        target = "out/A"

        def dump():
            write(target, dummy.a)

        ppg.FileGeneratingJob(target, dump).depends_on(cached)
        ppg.run_pipegraph()
        assert read(target) == ", ".join(str(i) for i in range(0, 100))
# Example #8
# 0
    def test_cached_jobs_get_depencies_only_on_the_lazy_filegenerator_not_on_the_loading_job(
            self):
        """depends_on on a CachedDataLoadingJob must attach the prerequisite
        to the internal lazy file generator (.lfg), not to the load job."""
        dummy = Dummy()

        def compute():
            return list(range(0, dummy.b))

        def unload(value):
            dummy.a = value

        cached = ppg.CachedDataLoadingJob("a", compute, unload)

        def provide_b():
            return 100

        attr_job = ppg.AttributeLoadingJob("b", dummy, "b", provide_b)
        cached.depends_on(attr_job)
        assert attr_job not in cached.prerequisites
        assert attr_job in cached.lfg.prerequisites
    def test_preqrequisites_end_up_on_lfg(self):
        """Prerequisites added to the cached job must land on its .lfg,
        never on the loading half."""
        dummy = Dummy()

        def compute():
            return ", ".join(str(i) for i in range(0, 100))

        def keep(value):
            dummy.a = value

        cached = ppg.CachedDataLoadingJob("out/mycalc", compute, keep)
        target = "out/A"

        def dump():
            write(target, dummy.a)

        ppg.FileGeneratingJob(target, dump).depends_on(cached)
        prereq = ppg.FileGeneratingJob("out/B", dump)
        cached.depends_on(prereq)
        assert prereq not in cached.prerequisites
        assert prereq in cached.lfg.prerequisites
    def test_cant_unpickle(self):
        """A corrupted cache file must surface as a ValueError whose message
        mentions an unpickling error when the graph runs."""
        dummy = Dummy()

        def compute():
            return ", ".join(str(i) for i in range(0, 100))

        def keep(value):
            dummy.a = value

        cached = ppg.CachedDataLoadingJob("out/mycalc", compute, keep)
        cached.ignore_code_changes()
        # Clobber the cache with garbage that pickle cannot read.
        write("out/mycalc", "no unpickling this")
        target = "out/A"

        def dump():
            write(target, dummy.a)

        ppg.FileGeneratingJob(target, dump).depends_on(cached)
        with pytest.raises(ValueError):
            ppg.run_pipegraph()
        assert isinstance(cached.exception, ValueError)
        assert "Unpickling error" in str(cached.exception)
# Example #11
# 0
    def _anno_cache_and_calc(self, anno):
        """Build the CachedDataLoadingJob that calculates an annotator's
        columns and concatenates them onto self.ddf.df.

        The calc half runs the annotator (calc_ddf if present, else calc),
        normalizing a single-column Series into a DataFrame. The load half
        validates the declared columns, realigns a RangeIndex result to the
        DDF's index, and concatenates column-wise.
        """

        def calc():
            # Annotators must declare their output columns as a list.
            if not isinstance(anno.columns, list):
                raise ValueError("Columns was not a list")

            # calc_ddf receives the whole DDF object, calc just the DataFrame.
            if hasattr(anno, "calc_ddf"):
                df = anno.calc_ddf(self.ddf)
            else:
                df = anno.calc(self.ddf.df)
            # Convenience: a Series is accepted when exactly one column was
            # declared; wrap it into a one-column DataFrame.
            if isinstance(df, pd.Series) and len(anno.columns) == 1:
                df = pd.DataFrame({anno.columns[0]: df})
            if not isinstance(df, pd.DataFrame):
                raise ValueError(
                    "result was no dataframe (or series and len(anno.columns) == 1)"
                )
            return df

        def load(df):
            # Declared and actually-produced column sets must match exactly.
            s_should = set(anno.columns)
            if not len(s_should):
                raise ValueError("anno.columns was empty")
            s_actual = set(df.columns)
            if s_should != s_actual:
                raise ValueError(
                    "Annotator declared different columns from those actualy calculated: %s"
                    % (s_should.symmetric_difference(s_actual)))
            # Refuse to overwrite columns that already exist on the DDF.
            if set(df.columns).intersection(self.ddf.df.columns):
                raise ValueError(
                    "Annotator created columns that were already present.",
                    self.ddf.name,
                    anno.get_cache_name(),
                    set(df.columns),
                )
            # A default RangeIndex carries no alignment information: if the
            # lengths match, assume row order mirrors the DDF and adopt its
            # index; otherwise we cannot align and must fail.
            if isinstance(df.index, pd.RangeIndex):
                if len(df) == len(
                        self.ddf.df):  # assume it's simply ordered by the df
                    df.index = self.ddf.df.index
                else:
                    raise ValueError(
                        "Length and index mismatch between DataFrame and Annotator result - "
                        "Annotator must return either a DF with a compatible index "
                        "or at least one with the same length (and a RangeIndex)"
                    )

            self.ddf.df = pd.concat([self.ddf.df, df], axis=1)

        # One cache subdirectory per annotator class.
        (self.ddf.cache_dir / anno.__class__.__name__).mkdir(exist_ok=True)
        job = ppg.CachedDataLoadingJob(
            self.ddf.cache_dir / anno.__class__.__name__ /
            anno.get_cache_name(),
            calc,
            load,
        )
        ppg.Job.depends_on(
            job, self.load())  # both the load and the calc need our ddf.df
        job.depends_on(
            self.load(),
            # Invalidate the cache whenever the annotator's calc code changes.
            ppg.FunctionInvariant(
                self.ddf.cache_dir / (anno.get_cache_name() + "_calc_func"),
                anno.calc if hasattr(anno, "calc") else anno.calc_ddf,
            ),
        )
        # Annotator-to-annotator dependencies, via the registered anno jobs.
        for d in anno.dep_annos():
            if d is not None:
                job.depends_on(self.ddf.anno_jobs[d.get_cache_name()])
        job.depends_on(anno.deps(self.ddf))
        # Annotators may request multiple cores for the calculation half.
        job.lfg.cores_needed = getattr(anno, "cores_needed", 1)
        return job
# Example #12
# 0
    def test_accepts(self):
        """Smoke test: every job class accepts pathlib.Path objects (mixed
        freely with plain str ids), and the resulting graph runs correctly,
        including temp-file cleanup."""
        import pathlib

        write("aaa", "hello")
        write("bbb", "hello")
        write("ccc", "hello")
        a = ppg.FileTimeInvariant(pathlib.Path("aaa"))
        a1 = ppg.MultiFileInvariant([pathlib.Path("bbb"), "ccc"])
        b = ppg.FileGeneratingJob(
            pathlib.Path("b"),
            lambda of: write(of, "bb" + read("aaa") + read("bbb") + read("ccc")),
        )
        b.depends_on(a)
        b.depends_on(a1)

        dd = Dummy()

        # Multi-file generator consuming the temp files and the cached attr.
        def mf():
            write("c", "cc" + read("g"))
            write("d", "dd" + read("h") + dd.attr)
            write("e", "ee" + read("i") + read("j"))

        c = ppg.MultiFileGeneratingJob([pathlib.Path("c"), "d", pathlib.Path("e")], mf)
        c.depends_on(b)
        d = ppg.FunctionInvariant(pathlib.Path("f"), lambda x: x + 1)
        c.depends_on(d)
        e = ppg.ParameterInvariant(pathlib.Path("c"), "hello")
        c.depends_on(e)
        f = ppg.TempFileGeneratingJob(pathlib.Path("g"), lambda: write("g", "gg"))
        c.depends_on(f)

        def tmf():
            write("h", "hh")
            write("i", "ii")

        g = ppg.MultiTempFileGeneratingJob([pathlib.Path("h"), "i"], tmf)
        c.depends_on(g)

        # TempFilePlus: "j" is temporary, "k" is kept.
        def tpf():
            write("j", "jjjj")
            write("k", "kkkk")

        h = ppg.TempFilePlusGeneratingJob(pathlib.Path("j"), pathlib.Path("k"), tpf)
        c.depends_on(h)

        i = ppg.CachedDataLoadingJob(
            pathlib.Path("l"), lambda: write("l", "llll"), lambda res: res
        )
        c.depends_on(i)

        m = ppg.CachedAttributeLoadingJob(pathlib.Path("m"), dd, "attr", lambda: "55")
        c.depends_on(m)
        ppg.run_pipegraph()
        assert read("aaa") == "hello"
        assert read("b") == "bbhellohellohello"
        assert read("c") == "ccgg"
        assert read("d") == "ddhh55"
        assert read("e") == "eeiijjjj"
        # Temp files must have been cleaned up after their consumers ran.
        assert not (os.path.exists("g"))
        assert not (os.path.exists("h"))
        assert not (os.path.exists("i"))
        assert not (os.path.exists("j"))
        assert read("k") == "kkkk"
 def inner():
     # Name argument is an int, not a str - expected to raise.
     ppg.CachedDataLoadingJob(5, lambda: 1, lambda value: 55)
 def inner():
     # Third argument (the load callback) is a str, not a callable -
     # expected to raise.
     ppg.CachedDataLoadingJob("out/a", lambda value: 55, "shu")
# Example #15
# 0
 def test_use_cores(self):
     # use_cores returns the job itself (fluent interface) and forwards
     # the requested core count to the internal lazy file generator.
     ca = ppg.CachedDataLoadingJob("out/C", lambda: 55, lambda x: None)
     assert ca.use_cores(5) is ca
     assert ca.lfg.cores_needed == 5