Esempio n. 1
0
    def test_filtering_by_definition(self):
        """Filter with (column, operator, value) tuples for every column spec.

        The column is given as a native column name, as the column name of
        an annotator, as an annotator object, and as a key that is resolved
        through ``column_lookup``.
        """
        frame = DelayedDataFrame(
            "shu", lambda: pd.DataFrame({"A": [1, 2], "B": ["c", "d"]})
        )
        anno_c = XAnno("C", [1, 2])
        frame += anno_c
        # Note: this annotator is never added to ``frame`` explicitly.
        anno_d = XAnno("D", [4, 5])

        # native column
        by_native = frame.filter("a1", ("A", "==", 1))
        # column name only - the matching annotator has to be searched for
        by_anno_column = frame.filter("a2", ("C", "==", 2))
        # annotator object that has not been added to the frame
        by_new_anno = frame.filter("a4", (anno_d, "==", 5))
        # annotator object that was already added to the frame
        by_added_anno = frame.filter("a3", (anno_c, "==", 1))
        # column_lookup maps the key to a column name
        by_lookup_name = frame.filter(
            "a6", ("X", "==", 2), column_lookup={"X": "C"}
        )
        # column_lookup maps the key to an annotator object
        by_lookup_anno = frame.filter(
            "a7", ("X", "==", 2), column_lookup={"X": anno_c}
        )

        if not ppg.inside_ppg():
            e1 = XAnno("E", [6, 7])
            e2 = XAnno("E", [6, 8])
            assert find_annos_from_column("E") == [e1, e2]
            rejected_specs = (
                # column name is no longer unique (two annotators provide "E")
                ("E", "==", 5),
                # an (annotator, name) tuple as the column is expected to fail
                ((anno_c, "D"), "==", 5),
            )
            for spec in rejected_specs:
                with pytest.raises(KeyError):
                    frame.filter("a5", spec)

        # (filtered frame, expected content of its "A" column)
        expectations = [
            (by_native, [1]),
            (by_anno_column, [2]),
            (by_added_anno, [1]),
            (by_new_anno, [2]),
            (by_lookup_name, [2]),
            (by_lookup_anno, [2]),
        ]
        for filtered, _ in expectations:
            force_load(filtered.annotate())
        run_pipegraph()

        for filtered, expected_a in expectations:
            assert (filtered.df["A"] == expected_a).all()
Esempio n. 2
0
    def test_filtering(self):
        """Annotators added to the parent show up in the child, not vice versa."""

        # Provides the constant column "aa".
        class A(Annotator):
            cache_name = "A"
            columns = ["aa"]

            def calc(self, df):
                column = self.columns[0]
                return pd.DataFrame({column: "a"}, index=df.index)

        # Builds "ab" from "aa", so it declares A as a dependency.
        class B(Annotator):
            cache_name = "B"
            columns = ["ab"]

            def calc(self, df):
                return df["aa"] + "b"

            def dep_annos(self):
                return [A()]

        parent = DelayedDataFrame(
            "shu", lambda: pd.DataFrame({"A": [1, 2], "B": ["c", "d"]})
        )
        parent += Constant("C", "c")
        assert "C" in parent.df.columns

        # An annotator already present on the parent is visible in the child.
        child = parent.filter("sha", lambda df: df["A"] == 1)
        assert "C" in child.df.columns

        # An annotator added to the parent after filtering reaches the child too.
        parent += A()
        assert "aa" in parent.df.columns
        assert "aa" in child.df.columns

        # An annotator added to the child must not leak back into the parent.
        child += B()
        assert "ab" in child.df.columns
        assert "ab" not in parent.df.columns
Esempio n. 3
0
 def test_filtering_by_definition_operators(self):
     """Check each comparison operator accepted in a filter definition.

     Judging by the expected results, the ``|``-prefixed operators compare
     on the absolute value of the column. An unknown operator must raise
     ``ValueError``.
     """
     frame = DelayedDataFrame("shu", pd.DataFrame({"A": [-1, 0, 1, 2, 3, 4]}))
     # (filter name, operator, threshold, rows of "A" expected to survive)
     cases = [
         ("a1", "==", 0, [0]),
         ("a2", ">=", 3, [3, 4]),
         ("a3", "<=", 0, [-1, 0]),
         ("a4", ">", 3, [4]),
         ("a5", "<", 0, [-1]),
         ("a6", "|>", 0, [-1, 1, 2, 3, 4]),
         ("a7", "|>=", 1, [-1, 1, 2, 3, 4]),
         ("a8", "|<", 2, [-1, 0, 1]),
         ("a9", "|<=", 2, [-1, 0, 1, 2]),
     ]
     for name, op, threshold, expected in cases:
         kept = frame.filter(name, [("A", op, threshold)]).df["A"]
         assert (kept == expected).all()
     with pytest.raises(ValueError):
         frame.filter("a10", [("A", "xx", 2)])
Esempio n. 4
0
    def test_filteringC(self):
        """An annotator handed only to ``filter`` ends up on parent and child."""
        ppg.util.global_pipegraph.quiet = False

        parent = DelayedDataFrame(
            "shu", lambda: pd.DataFrame({"A": [1, 2], "B": ["c", "d"]})
        )
        # "C" is deliberately not added to ``parent`` with ``+=``; the
        # annotator is only passed along with the filter function.
        len_anno = LenAnno("C")
        child = parent.filter("sha", lambda df: df["C"] == 2, len_anno, set())
        child.write()
        ppg.run_pipegraph()

        for frame in (parent, child):
            assert "C" in frame.df
Esempio n. 5
0
    def test_filtering_on_annotator(self):
        """Filtering on an annotator column requires passing that annotator."""

        # Fills "aa" with alternating "a"/"b" values, one per row.
        class A(Annotator):
            cache_name = "A"
            columns = ["aa"]

            def calc(self, df):
                row_count = len(df)
                # Repeat the pattern often enough, then cut to the row count.
                pattern = ["a", "b"] * int(row_count / 2 + 1)
                return pd.DataFrame(
                    {self.columns[0]: pattern[:row_count]},
                    index=df.index,
                )

        parent = DelayedDataFrame(
            "shu", lambda: pd.DataFrame({"A": [1, 2], "B": ["c", "d"]})
        )
        # Without the annotator the column "aa" does not exist.
        with pytest.raises(KeyError):
            parent.filter("sha", lambda df: df["aa"] == "a")
        child = parent.filter("sha", lambda df: df["aa"] == "a", [A()])

        shared_anno = Constant("C", "c")
        parent += shared_anno
        child += shared_anno
        assert (child.df["A"] == [1]).all()
Esempio n. 6
0
    def test_filteringA(self):
        """An annotator added to the parent after filtering reaches the child.

        The child is expected to carry the parent's values ("C2"), although
        the child itself only keeps the rows where ``A == 1``.
        """
        ppg.util.global_pipegraph.quiet = False

        parent = DelayedDataFrame(
            "shu", lambda: pd.DataFrame({"A": [1, 2], "B": ["c", "d"]})
        )
        child = parent.filter("sha", lambda df: df["A"] == 1)
        parent += LenAnno("C")
        child.write()
        ppg.run_pipegraph()

        assert "C" in child.df.columns
        assert "C" in parent.df.columns
        assert (child.df["C"] == "C2").all()
        assert (parent.df["C"] == "C2").all()
Esempio n. 7
0
    def test_filtering_result_dir(self):
        """A ``result_dir`` passed to ``filter`` becomes the child's result dir."""
        parent = DelayedDataFrame(
            "shu", lambda: pd.DataFrame({"A": [1, 2], "B": ["c", "d"]})
        )
        child = parent.filter("sha", lambda df: df["A"] == 1, result_dir="shu2")
        # Compare absolute paths so the check holds whether the frame keeps
        # the directory as a relative or an absolute path.
        assert child.result_dir.absolute() == Path("shu2").absolute()
Esempio n. 8
0
 def test_multi_level(self):
     """Annotators propagate through two levels of filtering.

     The values asserted at the end ("count3", "count22") indicate that
     each ``LenAnno`` is evaluated on the frame it was added to and then
     carried down to the filtered descendants.
     """
     top = DelayedDataFrame(
         "shu",
         lambda: pd.DataFrame(
             {"A": [1, 2, 3], "B": ["a", "b", "c"], "idx": ["x", "y", "z"]}
         ).set_index("idx"),
     )
     # First level: the filter column comes from an annotator passed along.
     middle = top.filter("sha", lambda df: df["C"] == 4, Constant("C", 4))
     middle += LenAnno("count")
     # Second level: filter on a native column of the source frame.
     bottom = middle.filter("shc", lambda df: df["A"] >= 2)
     bottom += LenAnno("count2")
     bottom.write()
     ppg.run_pipegraph()

     result = bottom.df
     assert len(result) == 2
     assert (result["A"] == [2, 3]).all()
     assert (result["count"] == "count3").all()
     assert (result["count2"] == "count22").all()
Esempio n. 9
0
    def test_filtering_on_annotator_missing(self):
        """Filtering on a column nobody provides fails once the graph runs.

        Creating the filter does not raise here; the ``KeyError`` only shows
        up on the child's load job when the pipegraph is executed.
        """
        a = DelayedDataFrame(
            "shu", lambda: pd.DataFrame({"A": [1, 2], "B": ["c", "d"]})
        )
        # "aaA" is neither a native column nor supplied by any annotator.
        b = a.filter("sha", lambda df: df["aaA"] == "a")
        load_job = b.load()
        a.write()
        print("run now")
        with pytest.raises(ppg.RuntimeError):
            ppg.run_pipegraph()
        # The original error is kept on the job that actually ran the filter.
        assert "KeyError" in repr(load_job.lfg.exception)
Esempio n. 10
0
    def test_filter_and_clone_without_annos(self):
        """A clone without annotators drops "C" and writes what its df holds."""
        ppg.util.global_pipegraph.quiet = False

        parent = DelayedDataFrame(
            "shu", lambda: pd.DataFrame({"A": [1, 2], "B": ["c", "d"]})
        )
        # "C" is deliberately not added to ``parent`` with ``+=``; the
        # annotator is only passed along with the filter function.
        filtered = parent.filter(
            "sha", lambda df: df["C"] == 2, LenAnno("C"), set()
        )
        filtered.write()
        # A second positional argument is expected to be rejected.
        with pytest.raises(ValueError):
            filtered.clone_without_annotators("shc", "hello")
        bare = filtered.clone_without_annotators("shc", result_dir="dir_c")
        output_path = bare.write()[1]
        ppg.run_pipegraph()

        assert "C" in parent.df
        assert "C" in filtered.df
        assert "C" not in bare.df

        # The written table must match the in-memory frame column by column.
        on_disk = pd.read_csv(output_path, sep="\t")
        assert set(bare.df.columns) == set(on_disk.columns)
        for column in bare.df.columns:
            assert (bare.df[column] == on_disk[column]).all()
Esempio n. 11
0
    def test_filtering2(self):
        """Annotators added to a child stay there and are not recalculated.

        ``counts`` records how often each ``calc`` runs so the test can pin
        the exact number of evaluations.
        """
        counts = collections.Counter()

        # Provides the constant column "aa" and counts its evaluations.
        class A(Annotator):
            cache_name = "A"
            columns = ["aa"]

            def calc(self, df):
                counts["A"] += 1
                column = self.columns[0]
                return pd.DataFrame({column: "a"}, index=df.index)

        # Builds "ab" from "aa", so it declares A as a dependency.
        class B(Annotator):
            cache_name = "B"
            columns = ["ab"]

            def calc(self, df):
                counts["B"] += 1
                return df["aa"] + "b"

            def dep_annos(self):
                return [A()]

        parent = DelayedDataFrame(
            "shu", lambda: pd.DataFrame({"A": [1, 2], "B": ["c", "d"]})
        )
        child = parent.filter("sha", lambda df: df["A"] == 1)

        # B on the child pulls in A as a dependency - on the child only.
        child += B()
        assert "aa" in child.df.columns
        assert "ab" in child.df.columns
        assert "aa" not in parent.df.columns
        assert "ab" not in parent.df.columns
        assert counts["A"] == 1

        # Adding A to the parent evaluates it exactly once more.
        parent += A()
        assert "aa" in parent.df.columns
        assert counts["A"] == 2
        assert "ab" not in parent.df.columns

        # Adding B to the parent must not re-run A; B has now run twice in
        # total (once for the child, once for the parent).
        parent += B()
        assert "ab" in parent.df.columns
        assert counts["A"] == 2
        assert counts["B"] == 2
Esempio n. 12
0
    def test_filteringB(self):
        """Parent annotators reach the child; child annotators stay local.

        "C" is added to the parent and must appear in both frames with the
        parent's value, while "D" is added to the child and must never show
        up on the parent, neither as a job nor as a column.
        """
        ppg.util.global_pipegraph.quiet = False

        parent = DelayedDataFrame(
            "shu", lambda: pd.DataFrame({"A": [1, 2], "B": ["c", "d"]})
        )
        child = parent.filter("sha", lambda df: df["A"] == 1)
        parent += LenAnno("C")
        child += LenAnno("D")
        # No job for "D" on the parent before the run ...
        assert LenAnno("D").get_cache_name() not in parent.anno_jobs
        child.write()
        ppg.run_pipegraph()
        # ... and none after it either.
        assert LenAnno("D").get_cache_name() not in parent.anno_jobs

        assert "C" in child.df.columns
        assert "C" in parent.df.columns
        assert "D" not in parent.df.columns
        assert len(parent.df) == 2
        assert len(child.df) == 1
        assert (child.df["C"] == "C2").all()
        assert (child.df["D"] == "D1").all()
        assert (parent.df["C"] == "C2").all()
        assert "D" not in parent.df.columns