def test_annos_dependening_none(self):
    """``dep_annos`` may list ``None`` next to a real dependency.

    B returns ``[None, A(), None]`` from ``dep_annos``; after annotating,
    both B's column and A's column must be present on the frame.
    """

    class A(Annotator):
        cache_name = "hello"
        columns = ["aa"]

        def calc(self, df):
            return pd.DataFrame({self.columns[0]: "a"}, index=df.index)

    class B(Annotator):
        cache_name = "hello2"
        columns = ["ab"]

        def calc(self, df):
            # Builds on the "aa" column that A provides.
            return df["aa"] + "b"

        def dep_annos(self):
            return [None, A(), None]

    def load():
        return pd.DataFrame({"A": [1, 2], "B": ["c", "d"]})

    ddf = DelayedDataFrame("shu", load)
    ddf += B()
    ddf.annotate()
    for column in ("ab", "aa"):
        assert column in ddf.df.columns
    assert (ddf.df["ab"] == (ddf.df["aa"] + "b")).all()
def test_ttest_paired(self):
    """Paired t-test, A vs. B: compare p and FDR against pinned values."""
    data = pd.DataFrame(
        {
            "A.R1": [0, 0, 0, 0],
            "A.R2": [0, 0, 0, 0],
            "A.R3": [0, 0.001, 0.001, 0.001],
            "B.R1": [0.95, 0, 0.56, 0],
            "B.R2": [0.99, 0, 0.56, 0],
            "B.R3": [0.98, 0, 0.57, 0.5],
            "C.R1": [0.02, 0.73, 0.59, 0],
            "C.R2": [0.03, 0.75, 0.57, 0],
            "C.R3": [0.05, 0.7, 0.58, 1],
        }
    )
    ddf = DelayedDataFrame("ex1", data)
    # Group the (sorted) column names by their first character.
    groups = {}
    for prefix, members in itertools.groupby(
        sorted(data.columns), lambda name: name[0]
    ):
        groups[prefix] = list(members)
    comparisons = Comparisons(ddf, groups)
    comparison = comparisons.a_vs_b("A", "B", TTestPaired())
    force_load(ddf.add_annotator(comparison))
    run_pipegraph()

    expected_p = [
        8.096338300746213e-07,
        0.42264973081037427,
        0.041378369826042816,
        0.42264973081037427,
    ]
    for row, expected in enumerate(expected_p):
        assert ddf.df[comparison["p"]].iloc[row] == pytest.approx(
            expected, abs=1e-4
        )
    assert ddf.df[comparison["FDR"]].values == pytest.approx(
        [3.238535e-06, 4.226497e-01, 8.275674e-02, 4.226497e-01], abs=1e-4
    )
def test_edgeR(self):
    """Unpaired edgeR, T vs. N: pin log2FC / FDR / p for PTHLH and PTGFR."""
    counts = self._get_tuch_data()
    ddf = DelayedDataFrame("ex1", counts)
    gts = {
        "T": [name for name in counts.columns if ".T" in name],
        "N": [name for name in counts.columns if ".N" in name],
    }
    comparisons = Comparisons(ddf, gts)
    a = comparisons.a_vs_b("T", "N", EdgeRUnpaired())
    force_load(ddf.add_annotator(a))
    run_pipegraph()

    def values_for(gene, key):
        # Result column `key` of the comparison, restricted to rows of `gene`.
        return ddf.df[ddf.df.nameOfGene == gene][a[key]].values

    # Reference values are from the last run - the manual has no simple
    # a vs b comparison. At least we will notice if this changes.
    assert values_for("PTHLH", "log2FC") == approx([4.003122])
    assert values_for("PTHLH", "FDR") == approx([1.332336e-11])
    assert values_for("PTHLH", "p") == approx([5.066397e-15])

    by_gene = ddf.df.set_index("nameOfGene")
    # Uses element 1 of every gts entry - presumably Comparisons has turned
    # the entries into (annotator, column) pairs by now; TODO confirm.
    t_columns = [entry[1] for entry in gts["T"]]
    n_columns = [entry[1] for entry in gts["N"]]
    pthlh = by_gene.loc["PTHLH"]
    assert pthlh[t_columns].sum() > pthlh[n_columns].sum()

    assert values_for("PTGFR", "log2FC") == approx([-5.127508])
    assert values_for("PTGFR", "FDR") == approx([6.470885e-10])
    assert values_for("PTGFR", "p") == approx([3.690970e-13])
    ptgfr = by_gene.loc["PTGFR"]
    assert ptgfr[t_columns].sum() < ptgfr[n_columns].sum()
def test_annos_dependening(self):
    """Adding B, whose ``dep_annos`` returns ``[A()]``, yields both columns.

    The annotation jobs are attached to a job-generating job and the
    pipegraph is run before the frame is inspected.
    """

    class A(Annotator):
        cache_name = "hello"
        columns = ["aa"]

        def calc(self, df):
            return pd.DataFrame({self.columns[0]: "a"}, index=df.index)

    class B(Annotator):
        cache_name = "hello2"
        columns = ["ab"]

        def calc(self, df):
            return df["aa"] + "b"

        def dep_annos(self):
            return [A()]

    def load():
        return pd.DataFrame({"A": [1, 2], "B": ["c", "d"]})

    ddf = DelayedDataFrame("shu", load)
    ddf += B()
    trigger = ppg.JobGeneratingJob("shu", lambda: 55)
    trigger.depends_on(ddf.annotate())
    ppg.run_pipegraph()
    for column in ("ab", "aa"):
        assert column in ddf.df.columns
    assert (ddf.df["ab"] == (ddf.df["aa"] + "b")).all()
def test_edgeR_paired(self):
    """Paired edgeR, T vs. N: pin log2FC / FDR / p for PTHLH and PTGFR."""
    counts = self._get_tuch_data()
    ddf = DelayedDataFrame("ex1", counts)
    # Sample columns are taken in sorted order for both groups.
    gts = {
        "T": [name for name in sorted(counts.columns) if ".T" in name],
        "N": [name for name in sorted(counts.columns) if ".N" in name],
    }
    comparisons = Comparisons(ddf, gts)
    a = comparisons.a_vs_b("T", "N", EdgeRPaired())
    force_load(ddf.add_annotator(a))
    run_pipegraph()

    def values_for(gene, key):
        # Result column `key` of the comparison, restricted to rows of `gene`.
        return ddf.df[ddf.df.nameOfGene == gene][a[key]].values

    # Reference values are from the last run - the manual has no simple
    # a vs b comparison. At least we will notice if this changes.
    assert values_for("PTHLH", "log2FC") == approx([3.97], abs=1e-3)
    assert values_for("PTHLH", "FDR") == approx([4.27e-18])
    assert values_for("PTHLH", "p") == approx([8.13e-22])

    by_gene = ddf.df.set_index("nameOfGene")
    # Uses element 1 of every gts entry - presumably Comparisons has turned
    # the entries into (annotator, column) pairs by now; TODO confirm.
    t_columns = [entry[1] for entry in gts["T"]]
    n_columns = [entry[1] for entry in gts["N"]]
    pthlh = by_gene.loc["PTHLH"]
    assert pthlh[t_columns].sum() > pthlh[n_columns].sum()

    assert values_for("PTGFR", "log2FC") == approx([-5.18], abs=1e-2)
    assert values_for("PTGFR", "FDR") == approx([3.17e-19])
    assert values_for("PTGFR", "p") == approx([3.01e-23])
    ptgfr = by_gene.loc["PTGFR"]
    assert ptgfr[t_columns].sum() < ptgfr[n_columns].sum()
def test_nested_anno_dependencies(self):
    """Dependencies of a dependency are annotated as well.

    ``Nesting`` depends on a Constant and on ``Nested``, which in turn
    depends on another Constant; all four columns must appear.
    """

    class Nested(Annotator):
        columns = ["b"]

        def calc(self, df):
            return pd.Series([10] * len(df))

        def dep_annos(self):
            return [Constant("Nestedconst", 5)]

    class Nesting(Annotator):
        columns = ["a"]

        def calc(self, df):
            return pd.Series([15] * len(df))

        def dep_annos(self):
            return [Constant("Nestingconst", 5), Nested()]

    anno = Nesting()

    def load():
        return pd.DataFrame({"A": [1, 2, 3], "B": ["a", "b", "c"]})

    ddf = DelayedDataFrame("shu", load)
    ddf += anno
    ddf.write()
    ppg.run_pipegraph()

    expected_by_column = (
        ("a", 15),
        ("b", 10),
        ("Nestedconst", 5),
        ("Nestingconst", 5),
    )
    for column, expected in expected_by_column:
        assert (ddf.df[column] == expected).all()
def test_filtering(self):
    """Check how annotators propagate between a frame and its filtered child.

    Columns added to the parent (before or after filtering) must show up in
    the child, while a column added to the child must not show up in the
    parent.
    """

    class A(Annotator):
        cache_name = "A"
        columns = ["aa"]

        def calc(self, df):
            return pd.DataFrame({self.columns[0]: "a"}, index=df.index)

    class B(Annotator):
        cache_name = "B"
        columns = ["ab"]

        def calc(self, df):
            return df["aa"] + "b"

        def dep_annos(self):
            return [A()]

    a = DelayedDataFrame(
        "shu", lambda: pd.DataFrame({"A": [1, 2], "B": ["c", "d"]})
    )
    a += Constant("C", "c")
    assert "C" in a.df.columns

    # Annotator added before filtering: visible in the child.
    b = a.filter("sha", lambda df: df["A"] == 1)
    assert "C" in b.df.columns

    # Annotator added to the parent after filtering: visible in both.
    a += A()
    assert "aa" in a.df.columns
    assert "aa" in b.df.columns

    # Annotator added to the child only: must not appear on the parent.
    b += B()
    assert "ab" in b.df.columns
    assert "ab" not in a.df.columns
def test_multi_plus_filter(self, clear_annotators):
    """Run ``filter`` on a Log2FC comparison with many filter specifications.

    Each entry of ``to_test`` pairs a filter spec (a single tuple or a list
    of tuples) with either the expected log2FC values of the filtered frame
    or the exception type the filter call must raise.
    """
    d = DelayedDataFrame(
        "ex1",
        pd.DataFrame(
            {
                "a1": [1 / 0.99, 2 / 0.99, 3 / 0.99],
                "a2": [1 * 0.99, 2 * 0.99, 3 * 0.99],
                "b1": [2 * 0.99, 8 * 0.99, (16 * 3) * 0.99],
                "b2": [2 / 0.99, 8 / 0.99, (16 * 3) / 0.99],
                "delta": [10, 20, 30],
            }
        ),
    )
    comparisons = Comparisons(d, {"a": ["a1", "a2"], "b": ["b1", "b2"]})
    a = comparisons.a_vs_b("a", "b", Log2FC(), laplace_offset=0)
    anno1 = Constant("shu1", 5)
    anno2 = Constant("shu2", 5)  # noqa: F841
    anno3 = Constant("shu3", 5)  # noqa: F841
    error_types = (ValueError, KeyError)
    to_test = [
        (("log2FC", "==", -1.0), [-1.0]),
        (("log2FC", ">", -2.0), [-1.0]),
        (("log2FC", "<", -2.0), [-4.0]),
        (("log2FC", ">=", -2.0), [-1.0, -2.0]),
        (("log2FC", "<=", -2.0), [-2.0, -4.0]),
        (("log2FC", "|>", 2.0), [-4.0]),
        (("log2FC", "|<", 2.0), [-1.0]),
        (("log2FC", "|>=", 2.0), [-2.0, -4.0]),
        (("log2FC", "|<=", 2.0), [-1.0, -2.0]),
        ((a["log2FC"], "<", -2.0), [-4.0]),
        (("log2FC", "|", -2.0), ValueError),
        ([("log2FC", "|>=", 2.0), ("log2FC", "<=", 0)], [-2.0, -4.0]),
        ((anno1, ">=", 5), [-1, -2.0, -4.0]),
        (((anno1, 0), ">=", 5), [-1, -2.0, -4.0]),
        (("shu2", ">=", 5), [-1, -2.0, -4.0]),
        (("delta", ">", 10), [-2.0, -4.0]),
    ]
    if not ppg.inside_ppg():
        # can't test for missing columns in ppg.
        to_test.append((("log2FC_no_such_column", "<", -2.0), KeyError))

    filtered = {}
    for ii, (spec, expected) in enumerate(to_test):
        name = "new%i" % ii
        if expected in error_types:
            with pytest.raises(expected):
                a.filter([spec], name)
            continue
        key = tuple(spec)
        # A single tuple is wrapped into a list; a list of tuples is passed on.
        filter_arg = [spec] if isinstance(spec, tuple) else spec
        filtered[key] = a.filter(filter_arg, name)
        assert filtered[key].name == name
        force_load(filtered[key].annotate(), filtered[key].name)

    force_load(d.add_annotator(a), "somethingsomethingjob")
    run_pipegraph()

    column = a["log2FC"]
    assert (d.df[column] == [-1.0, -2.0, -4.0]).all()
    for spec, expected in to_test:
        if expected in error_types:
            continue
        try:
            assert filtered[tuple(spec)].df[column].values == approx(expected)
        except AssertionError:
            # Show which filter spec failed before re-raising.
            print(spec)
            raise
def test_annotator(self):
    """A Constant annotator adds its column, filled with its value."""

    def load():
        return pd.DataFrame({"A": [1, 2], "B": ["c", "d"]})

    ddf = DelayedDataFrame("shu", load)
    ddf += Constant("column", "value")
    ddf.annotate()
    assert "column" in ddf.df.columns
    assert (ddf.df["column"] == "value").all()
def test_annotator_basic(self):
    """After a pipegraph run, the Constant column holds its value in every row."""

    def load():
        return pd.DataFrame({"A": [1, 2], "B": ["c", "d"]})

    ddf = DelayedDataFrame("shu", load)
    ddf += Constant("aa", "aa")
    force_load(ddf.annotate())
    ppg.run_pipegraph()
    assert (ddf.df["aa"] == "aa").all()
def test_deseq2_with_and_without_additional_columns(self):
    """DESeq2 results must differ with ``include_other_samples_for_variance``.

    The pasilla counts get two duplicated "fake" sample columns; every
    column with a "3" in its name goes into a third group, "other". The same
    treated-vs-untreated comparison is then created once with and once
    without the other samples.
    """
    import mbf_sampledata

    sample_path = mbf_sampledata.get_sample_path(
        "mbf_comparisons/pasillaCount_deseq2.tsv.gz"
    )
    pasilla_data = pd.read_csv(sample_path, sep=" ")
    # pasilla_data = pasilla_data.set_index('Gene')
    pasilla_data.columns = [str(name) for name in pasilla_data.columns]
    print(pasilla_data.columns)
    pasilla_data = pasilla_data.assign(
        treated_fake=pasilla_data.treated2fb,
        untreated_fake=pasilla_data.untreated2fb,
    )

    treated = [
        name
        for name in pasilla_data.columns
        if name.startswith("treated") and "3" not in name
    ]
    untreated = [
        name
        for name in pasilla_data.columns
        if name.startswith("untreated") and "3" not in name
    ]
    other = [name for name in pasilla_data.columns if "3" in name]
    gts = {"treated": treated, "untreated": untreated, "other": other}
    assert len(gts["other"]) == 2
    # The one column not covered by any group is the gene id ("GeneId").
    grouped_count = sum((len(members) for members in gts.values()))
    assert grouped_count + 1 == len(pasilla_data.columns)

    ddf = DelayedDataFrame("ex", pasilla_data)
    comparisons = Comparisons(ddf, gts)
    with_other = comparisons.a_vs_b(
        "treated",
        "untreated",
        DESeq2Unpaired(),
        include_other_samples_for_variance=True,
    )
    without_other = comparisons.a_vs_b(
        "treated",
        "untreated",
        DESeq2Unpaired(),
        include_other_samples_for_variance=False,
    )
    force_load(ddf.add_annotator(with_other))
    force_load(ddf.add_annotator(without_other))
    # run_pipegraph()
    result = ddf.df
    print(result.head())
    result.to_csv("test.csv")
    # this is a fairly weak test, but it shows that it at least does *something*
    for key in ("p", "log2FC"):
        assert (
            result[with_other[key]] != pytest.approx(result[without_other[key]])
        ).all()
def test_missing_external_genome(self):
    """Description without any genome makes the run fail with a clear message."""
    frame = pd.DataFrame({"gene_stable_id": ["a", "c", "b"]})
    g = DelayedDataFrame("ex", frame)
    anno = genes.annotators.Description()
    g += anno
    force_load(g.annotate())
    with pytest.raises(ppg.RuntimeError):
        ppg.run_pipegraph()
    # The message is read from the exception stored on the annotator's job.
    failure = g.anno_jobs[anno.get_cache_name()].lfg.exception
    assert (
        "ddf had no .genome and no genome was passed to Description"
        in str(failure)
    )
def test_write(self):
    """``write()`` yields a job whose first file exists after the run.

    The file is read back as tab-separated text and must equal the input.
    """
    test_df = pd.DataFrame({"A": [1, 2]})
    ddf = DelayedDataFrame("shu", lambda: test_df)
    write_job = ddf.write()[0]
    ppg.run_pipegraph()
    output_file = write_job.filenames[0]
    assert Path(output_file).exists()
    round_tripped = pd.read_csv(output_file, sep="\t")
    assert_frame_equal(round_tripped, test_df)
def test_simple(self):
    """Log2FC of column a vs. column b with no laplace offset.

    Note that ``Log2FC`` is handed over as the class itself, not as an
    instance.
    """
    frame = pd.DataFrame({"a": [1, 2, 3], "b": [2, 8, 16 * 3]})
    ddf = DelayedDataFrame("ex1", frame)
    comparisons = Comparisons(ddf, {"a": ["a"], "b": ["b"]})
    comparison = comparisons.a_vs_b("a", "b", Log2FC, laplace_offset=0)
    assert ddf.has_annotator(comparison)
    force_load(ddf.add_annotator(comparison), "fl1")
    run_pipegraph()
    assert (ddf.df[comparison["log2FC"]] == [-1.0, -2.0, -4.0]).all()
def test_filteringC(self):
    """Filter on a column supplied by an annotator passed to ``filter``.

    ``LenAnno("C")`` is never added to the parent explicitly, yet "C" must
    be present on both the parent and the filtered frame after the run.
    """
    ppg.util.global_pipegraph.quiet = False

    def load():
        return pd.DataFrame({"A": [1, 2], "B": ["c", "d"]})

    parent = DelayedDataFrame("shu", load)
    # a += LenAnno("C")
    child = parent.filter("sha", lambda df: df["C"] == 2, LenAnno("C"), set())
    child.write()
    ppg.run_pipegraph()
    for ddf in (parent, child):
        assert "C" in ddf.df
def test_create(self):
    """The frame has no ``df`` attribute until its load job has run."""
    test_df = pd.DataFrame({"A": [1, 2]})
    ddf = DelayedDataFrame("shu", lambda: test_df)
    assert not hasattr(ddf, "df")
    force_load(ddf.load(), False)
    ppg.run_pipegraph()
    assert_frame_equal(ddf.df, test_df)
    assert ddf.non_annotator_columns == "A"
def test_simple_from_anno_plus_column_pos(self):
    """Groups may be given as ``(annotator, column position)`` tuples.

    Comparing a constant 5 against a constant 10 gives a log2FC of -1 in
    every row.
    """
    frame = pd.DataFrame({"a": [1, 2, 3], "b": [2, 8, 16 * 3]})
    ddf = DelayedDataFrame("ex1", frame)
    five = Constant("five", 5)
    ten = Constant("ten", 10)
    comparisons = Comparisons(ddf, {"a": [(five, 0)], "b": [(ten, 0)]})
    comparison = comparisons.a_vs_b("a", "b", Log2FC(), laplace_offset=0)
    force_load(ddf.add_annotator(comparison), "fl1")
    run_pipegraph()
    assert (ddf.df[comparison["log2FC"]] == [-1, -1, -1]).all()
def test_write_excel(self):
    """Writing to an ``.xls`` name gives a file readable with ``pd.read_excel``."""
    test_df = pd.DataFrame({"A": [1, 2]})
    ddf = DelayedDataFrame("shu", lambda: test_df, result_dir="sha")
    # The result directory already exists once the frame has been created.
    assert Path("sha").exists()
    assert_frame_equal(ddf.df, test_df)
    assert ddf.non_annotator_columns == "A"
    output_path = ddf.write("sha.xls")[1]
    assert output_path.exists()
    round_tripped = pd.read_excel(output_path)
    assert_frame_equal(round_tripped, test_df)
def test_annotator_coliding_with_non_anno_column(self):
    """An annotator whose column name already exists in the frame must fail."""

    def load():
        frame = pd.DataFrame(
            {"A": [1, 2, 3], "B": ["a", "b", "c"], "idx": ["x", "y", "z"]}
        )
        return frame.set_index("idx")

    ddf = DelayedDataFrame("shu", load)
    # "A" is already a regular column of the loaded frame.
    ddf += Constant("A", "aa")
    lj = ddf.anno_jobs["A"]
    trigger = ppg.JobGeneratingJob("shu", lambda: 55)
    trigger.depends_on(ddf.annotate())
    with pytest.raises(ppg.RuntimeError):
        ppg.run_pipegraph()
    assert "were already present" in str(lj().exception)
def test_filteringA(self):
    """An annotator added to the parent after filtering reaches the child.

    Both frames must carry column "C" with the value "C2" in every row.
    """
    ppg.util.global_pipegraph.quiet = False

    def load():
        return pd.DataFrame({"A": [1, 2], "B": ["c", "d"]})

    parent = DelayedDataFrame("shu", load)
    child = parent.filter("sha", lambda df: df["A"] == 1)
    parent += LenAnno("C")
    child.write()
    ppg.run_pipegraph()
    for ddf in (child, parent):
        assert "C" in ddf.df.columns
    for ddf in (child, parent):
        assert (ddf.df["C"] == "C2").all()
def test_write(self):
    """``write()`` without a name puts a tab-separated file below ``sha``."""
    test_df = pd.DataFrame({"A": [1, 2]})
    ddf = DelayedDataFrame("shu", lambda: test_df, result_dir="sha")
    # The result directory already exists once the frame has been created.
    assert Path("sha").exists()
    assert_frame_equal(ddf.df, test_df)
    assert ddf.non_annotator_columns == "A"
    output_path = ddf.write()[1]
    assert "/sha" in str(output_path.parent)
    assert output_path.exists()
    round_tripped = pd.read_csv(output_path, sep="\t")
    assert_frame_equal(round_tripped, test_df)
def test_write_excel2(self):
    """Write a 257-column frame to an ``.xls`` name and read it back as TSV.

    The written file is parsed with ``pd.read_csv(sep="\\t")`` - presumably
    because the frame is too wide for the xls format; TODO confirm.
    """
    data = {"A%i" % i: [1, 1] for i in range(0, 257)}
    test_df = pd.DataFrame(data)
    ddf = DelayedDataFrame("shu", lambda: test_df, result_dir="sha")
    output_path = ddf.write("sha.xls")[1]
    assert output_path.exists()
    round_tripped = pd.read_csv(output_path, sep="\t")
    assert_frame_equal(round_tripped, test_df)
def test_anno_returing_right_length_but_wrong_start_range_index(self):
    """A result of the right length but with a shifted RangeIndex is rejected."""

    def load():
        return pd.DataFrame({"A": [1, 2, 3]})

    ddf = DelayedDataFrame("shu", load)

    class BadAnno(Annotator):
        columns = ["X"]

        def calc(self, df):
            # Three values, but indexed 5..7 instead of matching the frame.
            shifted_index = pd.RangeIndex(5, 5 + 3)
            return pd.Series(["a", "b", "c"], index=shifted_index)

    ddf += BadAnno()
    force_load(ddf.annotate())
    lj = ddf.anno_jobs["X"]
    with pytest.raises(ppg.RuntimeError):
        ppg.run_pipegraph()
    assert "Index mismatch" in str(lj().exception)
def normed_ddf(self, input_ddf):
    """Return a DelayedDataFrame holding the normalized columns of *input_ddf*.

    The new frame is named ``<input name>_heatmap_<strategy name>`` and uses
    the result directory of *input_ddf*. Its loader selects the columns
    named by the second element of each ``self.columns`` entry and passes
    them through ``self.normalization_strategy.calc``.

    Inside a pipegraph the frame depends on the annotator jobs of every
    entry whose first element is not None, on the strategy's own deps, on
    the load job of *input_ddf* and on a FunctionInvariant over the
    strategy's ``calc``. Outside a pipegraph it has no dependencies.
    """
    strategy = self.normalization_strategy

    def load():
        # Column names are resolved when the frame is loaded.
        selected = input_ddf.df[[ac[1] for ac in self.columns]]
        return self.normalization_strategy.calc(
            selected, [ac[1] for ac in self.columns]
        )

    output_name = input_ddf.name + "_heatmap_" + strategy.name
    deps = []
    if ppg.inside_ppg():
        for ac in self.columns:
            if ac[0] is not None:
                deps.append(self.ddf.add_annotator(ac[0]))
        deps.append(strategy.deps())
        deps.append(input_ddf.load())
        deps.append(ppg.FunctionInvariant(output_name + '_calc', strategy.calc))
    return DelayedDataFrame(output_name, load, deps, input_ddf.result_dir)
def test_anno_not_returning_enough_rows_and_no_index_range_index_on_df(self):
    """An annotator returning one row for a three-row frame is rejected."""

    class BrokenAnno(Annotator):
        columns = ["X"]

        def calc(self, df):
            # One row only, regardless of the size of df.
            return pd.DataFrame({"X": [1]})

    def load():
        return pd.DataFrame({"A": [1, 2, 3], "B": ["a", "b", "c"]})

    ddf = DelayedDataFrame("shu", load)
    ddf += BrokenAnno()
    lj = ddf.anno_jobs["X"]
    trigger = ppg.JobGeneratingJob("shu", lambda: 55)
    trigger.depends_on(ddf.annotate())
    with pytest.raises(ppg.RuntimeError):
        ppg.run_pipegraph()
    assert "Length and index mismatch " in str(lj().exception)
def test_volcano_plot(self):
    """Exactly one unpruned QC job is left over and its image matches.

    QC jobs are pruned via ``prune_qc`` with a predicate testing for
    "volcano" in the job id; the first file of the one remaining job is
    compared with ``assert_image_equal``.
    """
    ppg.util.global_pipegraph.quiet = False
    import mbf_sampledata

    sample_path = mbf_sampledata.get_sample_path(
        "mbf_comparisons/pasillaCount_deseq2.tsv.gz"
    )
    counts = pd.read_csv(sample_path, sep=" ")
    # pasilla_data = pasilla_data.set_index('Gene')
    counts.columns = [str(name) for name in counts.columns]
    treated = [name for name in counts.columns if name.startswith("treated")]
    untreated = [
        name for name in counts.columns if name.startswith("untreated")
    ]
    pasilla_data = DelayedDataFrame("pasilla", counts)
    groups = {"treated": treated, "untreated": untreated}
    comp = Comparisons(pasilla_data, groups).a_vs_b(
        "treated", "untreated", TTest()
    )
    comp.filter([("log2FC", "|>=", 2.0), ("FDR", "<=", 0.05)])
    prune_qc(lambda job: "volcano" in job.job_id)
    run_pipegraph()
    remaining = [job for job in list(get_qc_jobs()) if not job._pruned]
    print(remaining)
    assert len(remaining) == 1
    assert_image_equal(remaining[0].filenames[0])
def test_filtering_result_dir(self):
    """``filter(..., result_dir=...)`` sets the result dir of the child."""
    counts = collections.Counter()

    # NOTE: annotator A is defined here but never attached in this test.
    class A(Annotator):
        cache_name = "A"
        columns = ["aa"]

        def calc(self, df):
            counts["A"] += 1
            return pd.DataFrame({self.columns[0]: "a"}, index=df.index)

    def load():
        return pd.DataFrame({"A": [1, 2], "B": ["c", "d"]})

    parent = DelayedDataFrame("shu", load)
    child = parent.filter("sha", lambda df: df["A"] == 1, result_dir="shu2")
    expected_dir = Path("shu2").absolute()
    assert child.result_dir.absolute() == expected_dir
def test_deseq2(self):
    """Unpaired DESeq2, treated vs. untreated, on the pasilla counts.

    The results are meant to be compared against the reference table in
    ``check`` (columns: gene, baseMean, log2FoldChange, lfcSE, stat, pvalue,
    padj).
    """
    import mbf_sampledata

    pasilla_data = pd.read_csv(
        mbf_sampledata.get_sample_path(
            "mbf_comparisons/pasillaCount_deseq2.tsv.gz"),
        sep=" ",
    )
    # pasilla_data = pasilla_data.set_index('Gene')
    pasilla_data.columns = [str(x) for x in pasilla_data.columns]
    gts = {
        "treated": [x for x in pasilla_data.columns if x.startswith("treated")],
        "untreated": [x for x in pasilla_data.columns if x.startswith("untreated")],
    }
    ddf = DelayedDataFrame("ex", pasilla_data)
    c = Comparisons(ddf, gts)
    a = c.a_vs_b("treated", "untreated", DESeq2Unpaired())
    force_load(ddf.add_annotator(a))
    run_pipegraph()
    check = """# This is deseq2 version specific data- probably needs fixing if upgrading deseq2
## baseMean log2FoldChange lfcSE stat pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
## FBgn0039155 453 -3.72 0.160 -23.2 1.63e-119 1.35e-115
## FBgn0029167 2165 -2.08 0.103 -20.3 1.43e-91 5.91e-88
## FBgn0035085 367 -2.23 0.137 -16.3 6.38e-60 1.75e-56
## FBgn0029896 258 -2.21 0.159 -13.9 5.40e-44 1.11e-40
## FBgn0034736 118 -2.56 0.185 -13.9 7.66e-44 1.26e-40
"""
    df = ddf.df.sort_values(a["FDR"])
    df = df.set_index("Gene")
    # NOTE(review): every line of `check` starts with '#', so the loop below
    # skips all of them and currently asserts nothing. The field indices used
    # (0, 2, 5, 6) only line up once the leading '##' is removed - confirm
    # whether the reference rows were meant to be active.
    for row in check.split("\n"):
        row = row.strip()
        if not row or row.startswith("#"):
            continue
        fields = row.split()
        gene = fields[0]
        # Label-based lookup via .loc; DataFrame.ix no longer exists in
        # pandas >= 1.0.
        self.assertAlmostEqual(
            df.loc[gene, a["log2FC"]], float(fields[2]), places=2)
        self.assertAlmostEqual(
            df.loc[gene, a["p"]], float(fields[5]), places=2)
        self.assertAlmostEqual(
            df.loc[gene, a["FDR"]], float(fields[6]), places=2)
def test_double_comparison_with_different_strategies(self):
    """Two A-vs-B comparisons with different strategies on the same frame.

    A paired and an unpaired t-test are both added; each one's p and FDR
    columns are compared against their own pinned values.
    """
    data = pd.DataFrame(
        {
            "A.R1": [0, 0, 0, 0],
            "A.R2": [0, 0, 0, 0],
            "A.R3": [0, 0.001, 0.001, 0.001],
            "B.R1": [0.95, 0, 0.56, 0],
            "B.R2": [0.99, 0, 0.56, 0],
            "B.R3": [0.98, 0, 0.57, 0.5],
            "C.R1": [0.02, 0.73, 0.59, 0],
            "C.R2": [0.03, 0.75, 0.57, 0],
            "C.R3": [0.05, 0.7, 0.58, 1],
        }
    )
    ddf = DelayedDataFrame("ex1", data)
    # Group the (sorted) column names by their first character.
    groups = {}
    for prefix, members in itertools.groupby(
        sorted(data.columns), lambda name: name[0]
    ):
        groups[prefix] = list(members)
    comparisons = Comparisons(ddf, groups)
    paired = comparisons.a_vs_b("A", "B", TTestPaired())
    force_load(ddf.add_annotator(paired))
    unpaired = comparisons.a_vs_b("A", "B", TTest())
    force_load(ddf.add_annotator(unpaired))
    run_pipegraph()

    paired_p = [
        8.096338300746213e-07,
        0.42264973081037427,
        0.041378369826042816,
        0.42264973081037427,
    ]
    for row, expected in enumerate(paired_p):
        assert ddf.df[paired["p"]].iloc[row] == pytest.approx(
            expected, abs=1e-4
        )
    assert ddf.df[paired["FDR"]].values == pytest.approx(
        [3.238535e-06, 4.226497e-01, 8.275674e-02, 4.226497e-01], abs=1e-4
    )

    # Values from row 1 onwards were calculated with scipy to double check.
    unpaired_p = [
        8.096e-07,
        0.42264973081037427,
        0.04157730613277929,
        0.703158104919873,
    ]
    for row, expected in enumerate(unpaired_p):
        assert ddf.df[unpaired["p"]].iloc[row] == pytest.approx(
            expected, abs=1e-4
        )
    assert ddf.df[unpaired["FDR"]].values == pytest.approx(
        [3.238535e-06, 5.635329e-01, 8.315462e-02, 7.031581e-01], abs=1e-4
    )
def test_write_mangle(self):
    """``write`` applies the mangler it is given before writing the file.

    The file read back as tab-separated text must equal the mangled frame.
    """
    test_df = pd.DataFrame({"A": [1, 2], "B": ["c", "d"]})
    ddf = DelayedDataFrame("shu", lambda: test_df)
    assert_frame_equal(ddf.df, test_df)
    assert (ddf.non_annotator_columns == ["A", "B"]).all()

    def mangle(df):
        # Drop column A, then keep only the rows where B equals "c".
        without_a = df.drop("A", axis=1)
        return without_a[without_a.B == "c"]

    output_path = ddf.write("test.csv", mangle)[1]
    assert output_path.exists()
    round_tripped = pd.read_csv(output_path, sep="\t")
    assert_frame_equal(round_tripped, mangle(test_df))