def test_annos_dependening_none(self):
    """Annotate with B, whose ``dep_annos`` mixes ``None`` entries with ``A()``.

    The test expects both the dependency column ("aa") and the dependent
    column ("ab") to be present afterwards, with "ab" equal to "aa" + "b".
    """

    class A(Annotator):
        cache_name = "hello"
        columns = ["aa"]

        def calc(self, df):
            # Constant "a" column, aligned to the incoming frame's index.
            return pd.DataFrame({self.columns[0]: "a"}, index=df.index)

    class B(Annotator):
        cache_name = "hello2"
        columns = ["ab"]

        def calc(self, df):
            # Reads the "aa" column that A declares.
            return df["aa"] + "b"

        def dep_annos(self):
            # None placeholders surround the one real dependency.
            return [None, A(), None]

    ddf = DelayedDataFrame(
        "shu", lambda: pd.DataFrame({"A": [1, 2], "B": ["c", "d"]})
    )
    ddf += B()
    ddf.annotate()

    for column in ("ab", "aa"):
        assert column in ddf.df.columns
    expected_ab = ddf.df["aa"] + "b"
    assert (ddf.df["ab"] == expected_ab).all()
def test_annos_same_column_different_anno(self):
    """A second, distinct annotator declaring an existing column must raise.

    Two annotators with different column names are added and annotated
    first; then a new instance reusing the column name "hello2" is added
    and a ``ValueError`` is expected.
    """
    instantiations = [0]  # mutable cell so __init__ can bump the counter

    class CountingConstant(Annotator):
        def __init__(self, column_name, value):
            instantiations[0] += 1
            self.columns = [column_name]
            self.value = value

        def calc(self, df):
            return pd.DataFrame({self.columns[0]: self.value}, index=df.index)

    ddf = DelayedDataFrame(
        "shu", lambda: pd.DataFrame({"A": [1, 2], "B": ["c", "d"]})
    )

    first = CountingConstant("hello", "c")
    ddf += first
    ddf.annotate()
    assert "hello" in ddf.df.columns
    assert instantiations[0] == 1

    second = CountingConstant("hello2", "c")
    ddf += second
    ddf.annotate()
    assert "hello2" in ddf.df.columns
    assert instantiations[0] == 2

    # Same column name as `second`, but a different object and value.
    clashing = CountingConstant("hello2", "d")
    assert second is not clashing
    with pytest.raises(ValueError):
        ddf += clashing
def test_annotator(self):
    """Adding a ``Constant`` annotator yields a column filled with its value."""
    ddf = DelayedDataFrame(
        "shu", lambda: pd.DataFrame({"A": [1, 2], "B": ["c", "d"]})
    )
    ddf += Constant("column", "value")
    ddf.annotate()

    annotated = ddf.df
    assert "column" in annotated.columns
    assert (annotated["column"] == "value").all()
def test_annos_dependening(self):
    """Annotator B depends on A; after the pipegraph run both columns exist.

    Only B is added explicitly; A is declared through ``B.dep_annos``.
    """

    class A(Annotator):
        cache_name = "hello"
        columns = ["aa"]

        def calc(self, df):
            # Constant "a" column, aligned to the incoming frame's index.
            return pd.DataFrame({self.columns[0]: "a"}, index=df.index)

    class B(Annotator):
        cache_name = "hello2"
        columns = ["ab"]

        def calc(self, df):
            # Reads the "aa" column that A declares.
            return df["aa"] + "b"

        def dep_annos(self):
            return [A()]

    ddf = DelayedDataFrame(
        "shu", lambda: pd.DataFrame({"A": [1, 2], "B": ["c", "d"]})
    )
    ddf += B()

    # Hang the annotation jobs below a job so the pipegraph run includes them.
    trigger = ppg.JobGeneratingJob("shu", lambda: 55)
    trigger.depends_on(ddf.annotate())
    ppg.run_pipegraph()

    for column in ("ab", "aa"):
        assert column in ddf.df.columns
    expected_ab = ddf.df["aa"] + "b"
    assert (ddf.df["ab"] == expected_ab).all()
def test_annotator_basic(self):
    """A ``Constant`` annotator run through the pipegraph fills its column."""
    ddf = DelayedDataFrame(
        "shu", lambda: pd.DataFrame({"A": [1, 2], "B": ["c", "d"]})
    )
    ddf += Constant("aa", "aa")

    annotation_jobs = ddf.annotate()
    force_load(annotation_jobs)
    ppg.run_pipegraph()

    assert (ddf.df["aa"] == "aa").all()
def test_DynamicColumNames(self):
    """An annotator exposing ``columns`` as a property is annotated normally."""
    ddf = DelayedDataFrame(
        "shu", lambda: pd.DataFrame({"A": [1, 2], "B": ["c", "d"]})
    )

    class Dynamic(Annotator):
        @property
        def columns(self):
            # Column names are computed on access, not stored as an attribute.
            return ["a"]

        def calc(self, df):
            return pd.DataFrame({"a": ["x", "y"]})

    ddf += Dynamic()
    ddf.annotate()

    expected = pd.DataFrame({"A": [1, 2], "B": ["c", "d"], "a": ["x", "y"]})
    assert_frame_equal(ddf.df, expected)
def test_missing_external_genome(self):
    """``Description()`` without a genome must fail the pipegraph run.

    The frame is built from a plain DataFrame and no genome is passed to
    the annotator; the exception recorded on the annotator's job is
    expected to carry the explanatory message.
    """
    gene_ids = pd.DataFrame({"gene_stable_id": ["a", "c", "b"]})
    ddf = DelayedDataFrame("ex", gene_ids)
    anno = genes.annotators.Description()
    ddf += anno
    force_load(ddf.annotate())

    with pytest.raises(ppg.RuntimeError):
        ppg.run_pipegraph()

    failed_job = ddf.anno_jobs[anno.get_cache_name()]
    message = str(failed_job.lfg.exception)
    assert "ddf had no .genome and no genome was passed to Description" in message
def test_annos_added_only_once(self):
    """Adding the very same annotator instance a second time must not fail."""
    instantiations = [0]  # mutable cell so __init__ can bump the counter

    class CountingConstant(Annotator):
        def __init__(self, column_name, value):
            instantiations[0] += 1
            self.columns = [column_name]
            self.value = value

        def calc(self, df):
            return pd.DataFrame({self.columns[0]: self.value}, index=df.index)

    ddf = DelayedDataFrame(
        "shu", lambda: pd.DataFrame({"A": [1, 2], "B": ["c", "d"]})
    )
    anno = CountingConstant("hello", "c")
    ddf += anno
    ddf.annotate()

    assert "hello" in ddf.df.columns
    assert instantiations[0] == 1

    # Re-adding the identical instance; the original note says this is ignored.
    ddf += anno
def test_annotator_coliding_with_non_anno_column(self):
    """An annotator whose column already exists in the raw frame must fail.

    The loaded frame already has a column "A"; a ``Constant`` annotator
    targeting "A" is added and the pipegraph run is expected to raise.
    """

    def load():
        raw = pd.DataFrame(
            {"A": [1, 2, 3], "B": ["a", "b", "c"], "idx": ["x", "y", "z"]}
        )
        return raw.set_index("idx")

    ddf = DelayedDataFrame("shu", load)
    ddf += Constant("A", "aa")
    # Grab the entry before annotate() is called, as the original does.
    job_getter = ddf.anno_jobs["A"]

    trigger = ppg.JobGeneratingJob("shu", lambda: 55)
    trigger.depends_on(ddf.annotate())
    with pytest.raises(ppg.RuntimeError):
        ppg.run_pipegraph()

    assert "were already present" in str(job_getter().exception)
def test_anno_returing_right_length_but_wrong_start_range_index(self):
    """A result of the right length but with a shifted RangeIndex must fail.

    The annotator returns three values indexed 5..7 for a three-row frame;
    the run is expected to raise with "Index mismatch" recorded on the job.
    """
    ddf = DelayedDataFrame("shu", lambda: pd.DataFrame({"A": [1, 2, 3]}))

    class BadAnno(Annotator):
        columns = ["X"]

        def calc(self, df):
            # Correct length, but the index starts at 5.
            shifted_index = pd.RangeIndex(5, 5 + 3)
            return pd.Series(["a", "b", "c"], index=shifted_index)

    ddf += BadAnno()
    force_load(ddf.annotate())
    job_getter = ddf.anno_jobs["X"]

    with pytest.raises(ppg.RuntimeError):
        ppg.run_pipegraph()

    assert "Index mismatch" in str(job_getter().exception)
def test_anno_not_returning_enough_rows_and_no_index_range_index_on_df(self):
    """An annotator returning too few rows must fail the pipegraph run.

    The frame has three rows while ``calc`` returns a single-row frame.
    """

    class BrokenAnno(Annotator):
        columns = ["X"]

        def calc(self, df):
            # One row only, regardless of the input length.
            return pd.DataFrame({"X": [1]})

    ddf = DelayedDataFrame(
        "shu", lambda: pd.DataFrame({"A": [1, 2, 3], "B": ["a", "b", "c"]})
    )
    ddf += BrokenAnno()
    # Grab the entry before annotate() is called, as the original does.
    job_getter = ddf.anno_jobs["X"]

    trigger = ppg.JobGeneratingJob("shu", lambda: 55)
    trigger.depends_on(ddf.annotate())
    with pytest.raises(ppg.RuntimeError):
        ppg.run_pipegraph()

    assert "Length and index mismatch " in str(job_getter().exception)
def test_anno_returning_series(self):
    """An annotator may return a plain ``pd.Series`` for its single column.

    The frame is indexed by "idx" while the returned series carries a
    default index; the test expects column "C" to hold 0, 1, 2.
    """

    def load():
        raw = pd.DataFrame(
            {"A": [1, 2, 3], "B": ["a", "b", "c"], "idx": ["x", "y", "z"]}
        )
        return raw.set_index("idx")

    ddf = DelayedDataFrame("shu", load)

    class SeriesAnno(Annotator):
        columns = ["C"]

        def calc(self, df):
            return pd.Series(list(range(len(df))))

    ddf += SeriesAnno()

    trigger = ppg.JobGeneratingJob("shu", lambda: 55)
    trigger.depends_on(ddf.annotate())
    ppg.run_pipegraph()

    assert (ddf.df["C"] == [0, 1, 2]).all()
def test_annotator_raising(self):
    """An exception raised in ``calc`` surfaces on the annotator's job."""
    ddf = DelayedDataFrame(
        "shu", lambda: pd.DataFrame({"A": [1, 2], "B": ["c", "d"]})
    )

    class RaiseAnno(Annotator):
        columns = ["aa"]
        cache_name = "empty"

        def calc(self, df):
            raise ValueError("hello")

    anno1 = RaiseAnno()
    ddf += anno1
    force_load(ddf.annotate())

    with pytest.raises(ppg.RuntimeError):
        ppg.run_pipegraph()

    # Looked up through a fresh instance, as in the original test.
    cache_name = RaiseAnno().get_cache_name()
    anno_job = ddf.anno_jobs[cache_name]
    assert "hello" in str(anno_job.lfg.exception)
def test_DynamicColumNames(self):
    """Property-based ``columns`` also work when run through the pipegraph."""
    ddf = DelayedDataFrame(
        "shu", lambda: pd.DataFrame({"A": [1, 2], "B": ["c", "d"]})
    )

    class Dynamic(Annotator):
        @property
        def columns(self):
            # Column names are computed on access, not stored as an attribute.
            return ["a"]

        def calc(self, df):
            return pd.DataFrame({"a": ["x", "y"]})

    ddf += Dynamic()
    # Result is discarded; presumably this only checks that the lookup
    # by cache name does not fail -- TODO confirm intent.
    ddf.anno_jobs[Dynamic().get_cache_name()]

    annotation_jobs = ddf.annotate()
    force_load(annotation_jobs)
    ppg.run_pipegraph()

    expected = pd.DataFrame({"A": [1, 2], "B": ["c", "d"], "a": ["x", "y"]})
    assert_frame_equal(ddf.df, expected)
def test_annotator_missing_columns(self):
    """An annotator that never defines ``columns`` must fail the run.

    The recorded exception's repr is expected to mention ``AttributeError``.
    """
    ddf = DelayedDataFrame(
        "shu", lambda: pd.DataFrame({"A": [1, 2], "B": ["c", "d"]})
    )

    class MissingColumnNames(Annotator):
        # Deliberately no `columns` attribute.
        cache_name = "MissingColumnNames"

        def calc(self, df):
            return pd.DataFrame({})

        def __repr__(self):
            return "MissingColumnNames()"

    ddf += MissingColumnNames()
    job_getter = ddf.anno_jobs["MissingColumnNames"]
    force_load(ddf.annotate())

    with pytest.raises(ppg.RuntimeError):
        ppg.run_pipegraph()

    assert "AttributeError" in repr(job_getter().lfg.exception)
def test_anno_returning_string(self):
    """``calc`` returning a plain string must fail the pipegraph run.

    The annotator declares two columns but returns "abc"; the recorded
    exception is expected to say the result was no dataframe.
    """

    def load():
        raw = pd.DataFrame(
            {"A": [1, 2, 3], "B": ["a", "b", "c"], "idx": ["x", "y", "z"]}
        )
        return raw.set_index("idx")

    ddf = DelayedDataFrame("shu", load)

    class SeriesAnno(Annotator):
        columns = ["C", "D"]

        def calc(self, df):
            return "abc"

    ddf += SeriesAnno()
    # Grab the entry before annotate() is called, as the original does.
    job_getter = ddf.anno_jobs["C"]

    trigger = ppg.JobGeneratingJob("shu", lambda: 55)
    trigger.depends_on(ddf.annotate())
    with pytest.raises(ppg.RuntimeError):
        ppg.run_pipegraph()

    assert "result was no dataframe" in str(job_getter().lfg.exception)
def test_lying_about_columns(self):
    """Returning a column other than the declared one must fail the run.

    The annotator declares "C" but its ``calc`` produces "D".
    """

    def load():
        raw = pd.DataFrame(
            {"A": [1, 2, 3], "B": ["a", "b", "c"], "idx": ["x", "y", "z"]}
        )
        return raw.set_index("idx")

    ddf = DelayedDataFrame("shu", load)

    class SeriesAnno(Annotator):
        columns = ["C"]

        def calc(self, df):
            # Mismatch on purpose: "D" instead of the declared "C".
            return pd.DataFrame({"D": [0, 1, 2]})

    ddf += SeriesAnno()
    # Grab the entry before annotate() is called, as the original does.
    job_getter = ddf.anno_jobs["C"]

    trigger = ppg.JobGeneratingJob("shu", lambda: 55)
    trigger.depends_on(ddf.annotate())
    with pytest.raises(ppg.RuntimeError):
        ppg.run_pipegraph()

    assert "declared different " in str(job_getter().exception)
def test_annotator_columns_not_list(self):
    """``columns`` set to a string instead of a list must fail the run.

    The recorded exception message is expected to mention "list".
    """

    class BrokenAnno(Annotator):
        def __init__(self):
            # A bare string, not a list of column names.
            self.columns = "shu"

        def calc(self, df):
            n_rows = len(df)
            cell = "%s%i" % (self.columns[0], n_rows)
            return pd.DataFrame({self.columns[0]: [cell] * n_rows})

    ddf = DelayedDataFrame(
        "shu", lambda: pd.DataFrame({"A": [1, 2], "B": ["c", "d"]})
    )
    ddf += BrokenAnno()
    # Looked up through a fresh instance, as in the original test.
    job_getter = ddf.anno_jobs[BrokenAnno().get_cache_name()]
    force_load(ddf.annotate())

    with pytest.raises(ppg.RuntimeError):
        ppg.run_pipegraph()

    assert "list" in str(job_getter().lfg.exception)
def test_annotator_empty_columns(self):
    """An annotator with an empty ``columns`` list must fail the run.

    Also checks that calling the ``anno_jobs`` entry twice yields the
    same object.
    """
    ddf = DelayedDataFrame(
        "shu", lambda: pd.DataFrame({"A": [1, 2], "B": ["c", "d"]})
    )

    class EmptyColumnNames(Annotator):
        columns = []
        cache_name = "empty"

        def calc(self, df):
            return pd.DataFrame({"shu": [1, 2]})

        def __repr__(self):
            return "EmptyColumNames()"

    ddf += EmptyColumnNames()
    force_load(ddf.annotate())
    # Looked up through a fresh instance, as in the original test.
    anno_job_cb = ddf.anno_jobs[EmptyColumnNames().get_cache_name()]

    with pytest.raises(ppg.RuntimeError):
        ppg.run_pipegraph()

    # Repeated calls must hand back the identical object.
    assert anno_job_cb() is anno_job_cb()
    assert "anno.columns was empty" in repr(anno_job_cb().exception)
def test_external_genome(self):
    """``Description(genome)`` pulls descriptions from the supplied genome.

    The frame lists gene ids in the order a, c, b; after sorting by id the
    descriptions are expected to match the genome's metadata order.
    """
    gene_table = pd.DataFrame({
        "stable_id": ["a", "b", "c"],
        "chr": "1",
        "tss": [0, 100, 1000],
        "tes": [10, 101, 1010],
    })
    gene_meta = pd.DataFrame({
        "gene_stable_id": ["a", "b", "c"],
        "description": ["hello", "world", "!"],
    }).set_index("gene_stable_id")
    genome = MockGenome(gene_table, df_genes_meta=gene_meta)

    ddf = DelayedDataFrame("ex", pd.DataFrame({"gene_stable_id": ["a", "c", "b"]}))
    anno = genes.annotators.Description(genome)
    ddf += anno
    force_load(ddf.annotate())
    ppg.run_pipegraph()

    assert "description" in ddf.df.columns
    by_gene_id = ddf.df.sort_values("gene_stable_id")
    assert (by_gene_id["description"] == ["hello", "world", "!"]).all()