def test_filtering_by_definition(self):
    """Filter with ``(column, operator, value)`` definitions.

    The column part of the definition is given as a plain column name, as
    an annotator object, or as an alias resolved through ``column_lookup``.
    """
    frame = DelayedDataFrame(
        "shu", lambda: pd.DataFrame({"A": [1, 2], "B": ["c", "d"]})
    )
    anno_c = XAnno("C", [1, 2])
    frame += anno_c
    anno_d = XAnno("D", [4, 5])  # never added to `frame` explicitly

    # Column that is part of the loaded DataFrame itself.
    by_native = frame.filter("a1", ("A", "==", 1))
    # Column name that has to be resolved to its annotator.
    by_anno_column = frame.filter("a2", ("C", "==", 2))
    # Annotator object that has NOT been added to the frame yet.
    by_new_anno = frame.filter("a4", (anno_d, "==", 5))
    # Annotator object that has already been added to the frame.
    by_known_anno = frame.filter("a3", (anno_c, "==", 1))
    # Alias mapped to a column name.
    by_alias_name = frame.filter(
        "a6", ("X", "==", 2), column_lookup={"X": "C"}
    )
    # Alias mapped to an annotator object.
    by_alias_anno = frame.filter(
        "a7", ("X", "==", 2), column_lookup={"X": anno_c}
    )

    if not ppg.inside_ppg():
        first_e = XAnno("E", [6, 7])
        second_e = XAnno("E", [6, 8])
        assert find_annos_from_column("E") == [first_e, second_e]
        # The column name "E" is no longer unique, so it cannot be resolved.
        with pytest.raises(KeyError):
            frame.filter("a5", ("E", "==", 5))
        # NOTE(review): presumably fails because "D" is not a column of
        # `anno_c` - confirm against the filter implementation.
        with pytest.raises(KeyError):
            frame.filter("a5", ((anno_c, "D"), "==", 5))

    # (filtered frame, expected content of its "A" column), in load order.
    expectations = (
        (by_native, [1]),
        (by_anno_column, [2]),
        (by_known_anno, [1]),
        (by_new_anno, [2]),
        (by_alias_name, [2]),
        (by_alias_anno, [2]),
    )
    for filtered, _ in expectations:
        force_load(filtered.annotate())
    run_pipegraph()
    for filtered, expected_a in expectations:
        assert (filtered.df["A"] == expected_a).all()
def test_filtering(self):
    """Check how annotators propagate between a frame and its filtered child.

    Annotators added to the parent show up in the child; an annotator added
    only to the child must not show up in the parent.
    """

    class A(Annotator):
        cache_name = "A"
        columns = ["aa"]

        def calc(self, df):
            return pd.DataFrame({self.columns[0]: "a"}, index=df.index)

    class B(Annotator):
        cache_name = "B"
        columns = ["ab"]

        def calc(self, df):
            # Builds on the "aa" column provided by A (see dep_annos).
            return df["aa"] + "b"

        def dep_annos(self):
            return [A()]

    a = DelayedDataFrame(
        "shu", lambda: pd.DataFrame({"A": [1, 2], "B": ["c", "d"]})
    )
    a += Constant("C", "c")
    assert "C" in a.df.columns

    # Annotator added to the parent before filtering is visible in the child.
    b = a.filter("sha", lambda df: df["A"] == 1)
    assert "C" in b.df.columns

    # Annotator added to the parent after filtering is visible in both.
    a += A()
    assert "aa" in a.df.columns
    assert "aa" in b.df.columns

    # Annotator added to the child only stays out of the parent.
    b += B()
    assert "ab" in b.df.columns
    assert "ab" not in a.df.columns
def test_filtering_by_definition_operators(self):
    """Exercise every operator accepted in a filter definition.

    Judging by the expected values, the ``|``-prefixed operators compare the
    absolute value of the column. An unknown operator raises ``ValueError``.
    """
    frame = DelayedDataFrame("shu", pd.DataFrame({"A": [-1, 0, 1, 2, 3, 4]}))

    # (filter name, operator, comparison value, expected "A" column)
    cases = (
        ("a1", "==", 0, [0]),
        ("a2", ">=", 3, [3, 4]),
        ("a3", "<=", 0, [-1, 0]),
        ("a4", ">", 3, [4]),
        ("a5", "<", 0, [-1]),
        ("a6", "|>", 0, [-1, 1, 2, 3, 4]),
        ("a7", "|>=", 1, [-1, 1, 2, 3, 4]),
        ("a8", "|<", 2, [-1, 0, 1]),
        ("a9", "|<=", 2, [-1, 0, 1, 2]),
    )
    for filter_name, op, value, expected in cases:
        filtered = frame.filter(filter_name, [("A", op, value)])
        assert (filtered.df["A"] == expected).all()

    with pytest.raises(ValueError):
        frame.filter("a10", [("A", "xx", 2)])
def test_filteringC(self):
    """Filter on a column whose annotator is only handed to ``filter``.

    After the pipegraph ran, the annotated column "C" must be present in
    both the parent and the filtered frame.
    """
    ppg.util.global_pipegraph.quiet = False
    parent = DelayedDataFrame(
        "shu", lambda: pd.DataFrame({"A": [1, 2], "B": ["c", "d"]})
    )
    # The annotator is deliberately not added to `parent` via `+=`; it is
    # only passed along with the filter.
    # NOTE(review): the trailing set() is presumably an (empty) collection
    # of extra dependencies - confirm against the filter signature.
    child = parent.filter(
        "sha",
        lambda df: df["C"] == 2,
        LenAnno("C"),
        set(),
    )
    child.write()
    ppg.run_pipegraph()
    for frame in (parent, child):
        assert "C" in frame.df
def test_filtering_on_annotator(self):
    """Filtering on an annotated column requires passing its annotator.

    Without the annotator the filter raises ``KeyError``; with it, the
    filter succeeds and keeps the row where "aa" equals "a".
    """

    class A(Annotator):
        cache_name = "A"
        columns = ["aa"]

        def calc(self, df):
            # Alternating "a", "b", ... values, cut to the frame's length.
            n_rows = len(df)
            alternating = ["a", "b"] * int(n_rows / 2 + 1)
            return pd.DataFrame(
                {self.columns[0]: alternating[:n_rows]},
                index=df.index,
            )

    parent = DelayedDataFrame(
        "shu", lambda: pd.DataFrame({"A": [1, 2], "B": ["c", "d"]})
    )

    def keep_a(df):
        return df["aa"] == "a"

    # "aa" is unknown to the frame as long as no annotator provides it.
    with pytest.raises(KeyError):
        parent.filter("sha", keep_a)

    child = parent.filter("sha", keep_a, [A()])
    constant = Constant("C", "c")
    parent += constant
    child += constant
    assert (child.df["A"] == [1]).all()
def test_filteringA(self):
    """An annotator added to the parent after filtering reaches the child.

    Both frames end up with the same "C2" value in column "C".
    """
    ppg.util.global_pipegraph.quiet = False
    parent = DelayedDataFrame(
        "shu", lambda: pd.DataFrame({"A": [1, 2], "B": ["c", "d"]})
    )
    child = parent.filter("sha", lambda df: df["A"] == 1)
    parent += LenAnno("C")
    child.write()
    ppg.run_pipegraph()

    # Child first, then parent - same order as the checks always had.
    frames = (child, parent)
    for frame in frames:
        assert "C" in frame.df.columns
    for frame in frames:
        assert (frame.df["C"] == "C2").all()
def test_filtering_result_dir(self):
    """An explicit ``result_dir`` passed to ``filter`` is used by the child."""
    # The unused call counter and the never-instantiated annotator class
    # that used to live here were dead code and have been removed.
    a = DelayedDataFrame(
        "shu", lambda: pd.DataFrame({"A": [1, 2], "B": ["c", "d"]})
    )
    b = a.filter("sha", lambda df: df["A"] == 1, result_dir="shu2")
    # Compare absolute paths so the check does not depend on how the
    # frame stores the directory internally.
    assert b.result_dir.absolute() == Path("shu2").absolute()
def test_multi_level(self):
    """Chain two filters and annotate each level separately.

    The grandchild must carry the annotation added to its parent as well
    as its own.
    """

    def load_frame():
        raw = pd.DataFrame(
            {"A": [1, 2, 3], "B": ["a", "b", "c"], "idx": ["x", "y", "z"]}
        )
        return raw.set_index("idx")

    top = DelayedDataFrame("shu", load_frame)

    # First level: filter on a constant column supplied with the filter.
    middle = top.filter("sha", lambda df: df["C"] == 4, Constant("C", 4))
    middle_count = LenAnno("count")
    middle += middle_count

    # Second level: filter on a native column, then annotate again.
    bottom = middle.filter("shc", lambda df: df["A"] >= 2)
    bottom_count = LenAnno("count2")
    bottom += bottom_count

    bottom.write()
    ppg.run_pipegraph()

    result = bottom.df
    assert len(result) == 2
    assert (result["A"] == [2, 3]).all()
    # "count" was added on the middle frame, "count2" on the bottom frame.
    assert (result["count"] == "count3").all()
    assert (result["count2"] == "count22").all()
def test_filtering_on_annotator_missing(self):
    """Filtering on a non-existent column fails when the pipegraph runs.

    Creating the filter succeeds; the ``KeyError`` only surfaces inside the
    load job and is reported through ``ppg.RuntimeError``.
    """
    # An unused annotator class and a leftover debug print were removed
    # from this test; neither took part in the checks below.
    a = DelayedDataFrame(
        "shu", lambda: pd.DataFrame({"A": [1, 2], "B": ["c", "d"]})
    )
    # "aaA" is provided by neither the DataFrame nor any annotator.
    b = a.filter("sha", lambda df: df["aaA"] == "a")
    load_job = b.load()
    a.write()
    with pytest.raises(ppg.RuntimeError):
        ppg.run_pipegraph()
    # Must stay outside the `with` block: code after the raising call
    # inside it would never execute.
    assert "KeyError" in repr(load_job.lfg.exception)
def test_filter_and_clone_without_annos(self):
    """Clone a filtered frame without its annotators.

    The clone must lack the annotated column "C", and the file it writes
    must match its in-memory DataFrame column by column.
    """
    ppg.util.global_pipegraph.quiet = False
    parent = DelayedDataFrame(
        "shu", lambda: pd.DataFrame({"A": [1, 2], "B": ["c", "d"]})
    )
    # The annotator is only passed along with the filter, it is not added
    # to `parent` via `+=`.
    child = parent.filter(
        "sha",
        lambda df: df["C"] == 2,
        LenAnno("C"),
        set(),
    )
    child.write()

    # A second positional argument is rejected.
    with pytest.raises(ValueError):
        child.clone_without_annotators("shc", "hello")
    clone = child.clone_without_annotators("shc", result_dir="dir_c")
    # write() returns a sequence whose second element is the output file.
    output_file = clone.write()[1]
    ppg.run_pipegraph()

    assert "C" in parent.df
    assert "C" in child.df
    assert "C" not in clone.df

    on_disk = pd.read_csv(output_file, sep="\t")
    assert set(clone.df.columns) == set(on_disk.columns)
    for column in clone.df.columns:
        assert (clone.df[column] == on_disk[column]).all()
def test_filtering2(self):
    """Count annotator calculations across a parent and its filtered child.

    Annotators are first added to the child, then to the parent. The
    counters verify that each annotator is calculated once per frame and
    never recalculated.
    """
    counts = collections.Counter()

    class A(Annotator):
        cache_name = "A"
        columns = ["aa"]

        def calc(self, df):
            counts["A"] += 1
            return pd.DataFrame({self.columns[0]: "a"}, index=df.index)

    class B(Annotator):
        cache_name = "B"
        columns = ["ab"]

        def calc(self, df):
            counts["B"] += 1
            return df["aa"] + "b"

        def dep_annos(self):
            return [A()]

    a = DelayedDataFrame(
        "shu", lambda: pd.DataFrame({"A": [1, 2], "B": ["c", "d"]})
    )
    b = a.filter("sha", lambda df: df["A"] == 1)

    # Adding B to the child pulls in its dependency A on the child only.
    b += B()
    assert "aa" in b.df.columns
    assert "ab" in b.df.columns
    assert "aa" not in a.df.columns
    assert "ab" not in a.df.columns
    assert counts["A"] == 1

    # Adding A to the parent calculates it once more, for the parent.
    a += A()
    assert "aa" in a.df.columns
    assert counts["A"] == 2  # one calculation per frame, no recalculation
    assert "ab" not in a.df.columns

    # Adding B to the parent must reuse the parent's existing A.
    a += B()
    assert "ab" in a.df.columns
    assert counts["A"] == 2  # unchanged: A was not recalculated
    assert counts["B"] == 2  # one calculation per frame
def test_filteringB(self):
    """Annotators added to a filtered child must not leak into the parent.

    "C" is added to the parent and has to appear in both frames; "D" is
    added to the child only and has to stay out of the parent, both before
    and after the pipegraph ran.
    """
    ppg.util.global_pipegraph.quiet = False
    a = DelayedDataFrame(
        "shu", lambda: pd.DataFrame({"A": [1, 2], "B": ["c", "d"]})
    )
    b = a.filter("sha", lambda df: df["A"] == 1)
    a += LenAnno("C")
    b += LenAnno("D")

    child_only_cache_name = LenAnno("D").get_cache_name()
    assert child_only_cache_name not in a.anno_jobs
    b.write()
    ppg.run_pipegraph()
    # Running the graph must not have registered "D" on the parent either.
    assert child_only_cache_name not in a.anno_jobs

    assert "C" in b.df.columns
    assert "C" in a.df.columns
    assert "D" not in a.df.columns
    assert len(a.df) == 2
    assert len(b.df) == 1
    # "C" was added on the two-row parent, "D" on the one-row child.
    assert (b.df["C"] == "C2").all()
    assert (b.df["D"] == "D1").all()
    assert (a.df["C"] == "C2").all()