Code example #1
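Exercises good_pan_dat_object, copy_pan_dat and _same_data on the netflow schema, including the messages produced when a table attribute is deleted or a required column is dropped, plus a comparison against a generic ('*') schema.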
    def testSimple(self):
        if not self.canRun:
            return
        pdf = PanDatFactory(**netflowSchema())
        _dat = netflowPandasData()
        dat = pdf.PanDat(**{t:getattr(_dat, t) for t in pdf.all_tables})
        self.assertTrue(pdf.good_pan_dat_object(dat))

        dat2 = pdf.copy_pan_dat(dat)
        self.assertTrue(pdf._same_data(dat, dat2))
        self.assertTrue(pdf.good_pan_dat_object(dat2))
        delattr(dat2, "nodes")
        msg = []
        self.assertFalse(pdf.good_pan_dat_object(dat2, msg.append))
        self.assertTrue(msg[-1] == "nodes not an attribute.")

        dat3 = pdf.copy_pan_dat(dat)
        dat3.cost.drop("commodity", axis=1, inplace=True)
        self.assertFalse(pdf.good_pan_dat_object(dat3, msg.append))
        self.assertTrue("The following are (table, field) pairs missing from the data" in msg[-1])

        dat4 = pdf.copy_pan_dat(dat)
        dat4.cost["cost"] += 1
        self.assertFalse(pdf._same_data(dat, dat4))

        pdf2 = PanDatFactory(**{t:'*' for t in pdf.all_tables})
        dat5 = pdf2.copy_pan_dat(dat)
        self.assertTrue(pdf._same_data(dat, dat5))
        self.assertTrue(pdf2._same_data(dat, dat5))
        dat.commodities = dat.commodities.append(dat.commodities[dat.commodities["name"] == "Pencils"])
        dat.arcs = dat.arcs.append(dat.arcs[dat.arcs["destination"] == "Boston"])
        self.assertFalse(pdf2._same_data(dat, dat5))
        self.assertFalse(pdf._same_data(dat, dat5))
Code example #2
File: testpandat_utils.py Project: adampkehoe/ticdat
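Checks find_duplicates on the sillyMeSchema tables, covering the default table output as well as the as_table=False and keep=False variants.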
    def testFindDups(self):
        pdf = PanDatFactory(**sillyMeSchema())
        tdf = TicDatFactory(**{k: [[], list(pkfs) + list(dfs)]
                               for k, (pkfs, dfs) in sillyMeSchema().items()})
        rows = [(1, 2, 3, 4), (1, 20, 30, 40), (10, 20, 30, 40)]
        ticDat = tdf.TicDat(**{t: rows for t in tdf.all_tables})
        panDat = pdf.copy_pan_dat(copy_to_pandas_with_reset(tdf, ticDat))
        dups = pdf.find_duplicates(panDat)
        self.assertTrue(set(dups) == {'a'} and set(dups['a']['aField']) == {1})
        dups = pdf.find_duplicates(panDat, as_table=False, keep=False)
        self.assertTrue(set(dups) == {'a'} and dups['a'].value_counts()[True] == 2)
        dups = pdf.find_duplicates(panDat, as_table=False)
        self.assertTrue(set(dups) == {'a'} and dups['a'].value_counts()[True] == 1)
        rows = [(1, 2, 3, 4), (1, 20, 30, 40), (10, 20, 30, 40), (1, 2, 3, 40)]
        ticDat = tdf.TicDat(**{t: rows for t in tdf.all_tables})
        panDat = pdf.copy_pan_dat(copy_to_pandas_with_reset(tdf, ticDat))
        dups = pdf.find_duplicates(panDat, keep=False)
        self.assertTrue(set(dups) == {'a', 'b'} and set(dups['a']['aField']) == {1})
        dups = pdf.find_duplicates(panDat, as_table=False, keep=False)
        self.assertTrue({k: v.value_counts()[True] for k, v in dups.items()} == {'a': 3, 'b': 2})
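
For reference, a minimal self-contained sketch of the same find_duplicates pattern; the "parts" table, its fields and the sample rows are illustrative, not taken from the test suite.

# Minimal sketch of duplicate detection (illustrative schema, not from the tests above).
from pandas import DataFrame
from ticdat import PanDatFactory

pdf = PanDatFactory(parts=[["Name"], ["Cost"]])  # one primary-key field, one data field
dat = pdf.PanDat(parts=DataFrame({"Name": ["bolt", "bolt", "nut"],
                                  "Cost": [1.0, 2.0, 3.0]}))
dups = pdf.find_duplicates(dat)  # dict mapping table name to the rows whose key is duplicated
assert set(dups) == {"parts"}    # only "parts" has a repeated primary key ("bolt")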
Code example #3
File: testpandat_utils.py Project: adampkehoe/ticdat
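Builds multi-field foreign keys from three child tables to a parent table, verifies the reported cardinalities, and confirms that find_foreign_key_failures flags the orphaned rows and remove_foreign_key_failures restores the original data.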
    def testXToManyTwo(self):
        input_schema = PanDatFactory(parent=[["F1", "F2"], ["F3"]],
                                     child_one=[["F1", "F2", "F3"], []],
                                     child_two=[["F1", "F2"], ["F3"]],
                                     child_three=[[], ["F1", "F2", "F3"]])
        for t in ["child_one", "child_two", "child_three"]:
            input_schema.add_foreign_key(t, "parent",
                                         [["F1"] * 2, ["F2"] * 2, ["F3"] * 2])
        self.assertTrue({fk.cardinality
                         for fk in input_schema.foreign_keys} ==
                        {"one-to-one", "many-to-one"})

        rows = [[1, 2, 3], [1, 2.1, 3], [4, 5, 6], [4, 5.1, 6], [7, 8, 9]]
        tdf = TicDatFactory(**input_schema.schema())
        dat = tdf.TicDat(parent=rows,
                         child_one=rows,
                         child_two=rows,
                         child_three=rows)
        self.assertTrue(
            all(len(getattr(dat, t)) == 5 for t in input_schema.all_tables))
        orig_pan_dat = input_schema.copy_pan_dat(
            copy_to_pandas_with_reset(tdf, dat))
        self.assertFalse(input_schema.find_foreign_key_failures(orig_pan_dat))
        dat.child_one[1, 2, 4] = {}
        dat.child_two[1, 2.2] = 3
        dat.child_three.append([1, 2, 4])
        new_pan_dat = input_schema.copy_pan_dat(
            copy_to_pandas_with_reset(tdf, dat))
        fk_fails = input_schema.find_foreign_key_failures(new_pan_dat)
        self.assertTrue(len(fk_fails) == 3)
        input_schema.remove_foreign_key_failures(new_pan_dat)
        self.assertFalse(input_schema.find_foreign_key_failures(new_pan_dat))
        self.assertTrue(input_schema._same_data(orig_pan_dat, new_pan_dat))

        input_schema = PanDatFactory(parent=[["F1", "F2"], ["F3"]],
                                     child_one=[["F1", "F2", "F3"], []],
                                     child_two=[["F1", "F2"], ["F3"]],
                                     child_three=[[], ["F1", "F2", "F3"]])
        for t in ["child_one", "child_two", "child_three"]:
            input_schema.add_foreign_key(t, "parent", [["F1"] * 2, ["F3"] * 2])
        tdf = TicDatFactory(**input_schema.schema())
        dat = tdf.TicDat(parent=rows,
                         child_one=rows,
                         child_two=rows,
                         child_three=rows)
        self.assertTrue(
            all(len(getattr(dat, t)) == 5 for t in input_schema.all_tables))
        orig_pan_dat = input_schema.copy_pan_dat(
            copy_to_pandas_with_reset(tdf, dat))
        self.assertFalse(input_schema.find_foreign_key_failures(orig_pan_dat))
        dat.child_one[1, 2, 4] = {}
        dat.child_two[1, 2.2] = 4
        dat.child_three.append([1, 2, 4])
        new_pan_dat = input_schema.copy_pan_dat(
            copy_to_pandas_with_reset(tdf, dat))
        self.assertTrue(
            len(input_schema.find_foreign_key_failures(new_pan_dat)) == 3)
        input_schema.remove_foreign_key_failures(new_pan_dat)
        self.assertFalse(input_schema.find_foreign_key_failures(new_pan_dat))
        self.assertTrue(input_schema._same_data(orig_pan_dat, new_pan_dat))
Code example #4
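Covers set_data_type together with find_data_type_failures and replace_data_type_failures on the diet and netflow schemas, including nullability, numeric ranges and the strings_allowed option.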
    def testDataTypes(self):
        if not self.canRun:
            return
        tdf = TicDatFactory(**dietSchema())
        pdf = PanDatFactory(**dietSchema())

        ticdat = tdf.TicDat()
        ticdat.foods["a"] = 12
        ticdat.foods["b"] = None
        ticdat.categories["1"] = {"maxNutrition":100, "minNutrition":40}
        ticdat.categories["2"] = [10,20]
        for f, p in itertools.product(ticdat.foods, ticdat.categories):
            ticdat.nutritionQuantities[f,p] = 5
        ticdat.nutritionQuantities['a', 2] = 12

        pandat = pdf.copy_pan_dat(copy_to_pandas_with_reset(tdf, ticdat))

        self.assertFalse(pdf.find_data_type_failures(pandat))
        pandat_copy = pdf.replace_data_type_failures(pdf.copy_pan_dat(pandat))
        self.assertTrue(pdf._same_data(pandat, pandat_copy, epsilon=0.00001))

        pdf = PanDatFactory(**dietSchema())
        pdf.set_data_type("foods", "cost", nullable=False)
        pdf.set_data_type("nutritionQuantities", "qty", min=5, inclusive_min=False, max=12, inclusive_max=True)
        failed = pdf.find_data_type_failures(pandat)
        self.assertTrue(set(failed) == {('foods', 'cost'), ('nutritionQuantities', 'qty')})
        self.assertTrue(set(failed['foods', 'cost']["name"]) == {'b'})
        self.assertTrue(set({(v["food"], v["category"])
                             for v in failed['nutritionQuantities', 'qty'].T.to_dict().values()}) ==
                            {('b', '1'), ('a', '2'), ('a', '1'), ('b', '2')})

        failed = pdf.find_data_type_failures(pandat, as_table=False)
        self.assertTrue(4 == failed['nutritionQuantities', 'qty'].value_counts()[True])
        fixed = pdf.replace_data_type_failures(pdf.copy_pan_dat(pandat), {("nutritionQuantities", "qty"): 5.15})
        self.assertTrue(set(fixed.foods["cost"]) == {0.0, 12.0})
        self.assertTrue(set(fixed.nutritionQuantities["qty"]) == {5.15, 12.0})

        tdf = TicDatFactory(**netflowSchema())
        tdf.enable_foreign_key_links()
        addNetflowForeignKeys(tdf)
        pdf = PanDatFactory(**netflowSchema())
        ticdat = tdf.copy_tic_dat(netflowData())
        for n in ticdat.nodes["Detroit"].arcs_source:
            ticdat.arcs["Detroit", n] = n
        pandat = pdf.copy_pan_dat(copy_to_pandas_with_reset(tdf, ticdat))
        self.assertFalse(pdf.find_data_type_failures(pandat))

        pdf = PanDatFactory(**netflowSchema())
        pdf.set_data_type("arcs", "capacity", strings_allowed="*")
        self.assertFalse(pdf.find_data_type_failures(pandat))

        pdf = PanDatFactory(**netflowSchema())
        pdf.set_data_type("arcs", "capacity", strings_allowed=["Boston", "Seattle", "lumberjack"])
        failed = pdf.find_data_type_failures(pandat)
        self.assertTrue(set(failed) == {('arcs', 'capacity')})
        self.assertTrue(set({(v["source"], v["destination"])
                             for v in failed['arcs', 'capacity'].T.to_dict().values()}) == {("Detroit", "New York")})
        pdf.replace_data_type_failures(pandat)
        self.assertTrue(set(pandat.arcs["capacity"]) == {120, 'Boston', 0, 'Seattle'})
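
For reference, a minimal self-contained sketch of the set_data_type / find_data_type_failures / replace_data_type_failures calls used above; the "parts" table and its values are illustrative, not from the tests.

# Minimal sketch of the data-type checks (illustrative schema, not from the tests above).
from pandas import DataFrame
from ticdat import PanDatFactory

pdf = PanDatFactory(parts=[["Name"], ["Cost"]])
pdf.set_data_type("parts", "Cost", min=0, max=100, inclusive_min=True, inclusive_max=True)
dat = pdf.PanDat(parts=DataFrame({"Name": ["bolt", "nut"], "Cost": [1.0, 250.0]}))
failed = pdf.find_data_type_failures(dat)  # keyed by (table, field); here ('parts', 'Cost')
pdf.replace_data_type_failures(dat)        # rewrites the out-of-range Cost value in place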
Code example #5
File: testpandat_utils.py Project: adampkehoe/ticdat
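Focuses on data-type failures caused by null primary-key values: replace_data_type_failures combined with clone and set_default_value rewrites them, after which find_duplicates reports the duplicate keys the rewrite creates.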
    def testDataTypes_two(self):
        tdf = TicDatFactory(**dietSchema())
        pdf = PanDatFactory(**tdf.schema())

        def makeIt():
            rtn = tdf.TicDat()
            rtn.foods["a"] = 12
            rtn.foods["b"] = None
            rtn.foods[None] = 101
            rtn.categories["1"] = {"maxNutrition": 100, "minNutrition": 40}
            rtn.categories["2"] = [10, 20]
            for f, p in itertools.product(rtn.foods, rtn.categories):
                rtn.nutritionQuantities[f, p] = 5
            rtn.nutritionQuantities['a', 2] = 12
            return tdf.copy_to_pandas(rtn, drop_pk_columns=False)

        dat = makeIt()
        errs = pdf.find_data_type_failures(dat)
        self.assertTrue(len(errs) == 2 and not pdf.find_duplicates(dat))
        dat_copied = pdf.copy_pan_dat(dat)
        pdf.replace_data_type_failures(dat)
        self.assertTrue(pdf._same_data(dat, dat_copied, epsilon=0.00001))
        pdf2 = pdf.clone()
        pdf2.set_default_value("foods", "name", "a")
        pdf2.set_default_value("nutritionQuantities", "food", "a")
        pdf2.replace_data_type_failures(dat_copied)
        self.assertFalse(pdf._same_data(dat, dat_copied, epsilon=0.00001))
        self.assertFalse(pdf.find_data_type_failures(dat_copied))
        dups = pdf.find_duplicates(dat_copied)
        self.assertTrue(
            len(dups) == 2 and len(dups["foods"]) == 1
            and len(dups["nutritionQuantities"]) == 2)

        from pandas import isnull

        def noneify(iter_of_tuples):
            return {
                tuple(None if isnull(_) else _ for _ in tuple_)
                for tuple_ in iter_of_tuples
            }

        self.assertTrue(
            noneify(errs['nutritionQuantities', 'food'].itertuples(
                index=False)) == {(None, "1", 5), (None, "2", 5)})
        self.assertTrue(
            noneify(errs['foods',
                         'name'].itertuples(index=False)) == {(None, 101)})
        pdf = PanDatFactory(**tdf.schema())
        pdf.set_data_type("foods", "name", nullable=True, strings_allowed='*')
        pdf.set_data_type("nutritionQuantities",
                          "food",
                          nullable=True,
                          strings_allowed='*')
        self.assertFalse(pdf.find_data_type_failures(dat))
        pdf.set_data_type("foods", "cost", nullable=False)
        errs = pdf.find_data_type_failures(dat)
        self.assertTrue(len(errs) == 1)
        self.assertTrue(
            noneify(errs['foods',
                         'cost'].itertuples(index=False)) == {('b', None)})
Code example #6
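Exercises many-to-many and one-to-many foreign keys, comparing the find_foreign_key_failures output across the verbosity and as_table options before removing the failing rows with remove_foreign_key_failures.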
    def testXToMany(self):
        input_schema = PanDatFactory (roster = [["Name"],["Grade", "Arrival Inning", "Departure Inning",
                                                          "Min Innings Played", "Max Innings Played"]],
                                      positions = [["Position"],["Position Importance", "Position Group",
                                                                 "Consecutive Innings Only"]],
                                      innings = [["Inning"],["Inning Group"]],
                                      position_constraints = [["Position Group", "Inning Group", "Grade"],
                                                              ["Min Players", "Max Players"]])
        input_schema.add_foreign_key("position_constraints", "roster", ["Grade", "Grade"])
        input_schema.add_foreign_key("position_constraints", "positions", ["Position Group", "Position Group"])
        input_schema.add_foreign_key("position_constraints", "innings", ["Inning Group", "Inning Group"])

        self.assertTrue({fk.cardinality for fk in input_schema.foreign_keys} == {"many-to-many"})

        tdf = TicDatFactory(**input_schema.schema())
        dat = tdf.TicDat()
        for i,p in enumerate(["bob", "joe", "fred", "alice", "lisa", "joean", "ginny"]):
            dat.roster[p]["Grade"] = (i%3)+1
        dat.roster["dummy"]["Grade"]  = "whatevers"
        for i,p in enumerate(["pitcher", "catcher", "1b", "2b", "ss", "3b", "lf", "cf", "rf"]):
            dat.positions[p]["Position Group"] = "PG %s"%((i%4)+1)
        for i in range(1, 10):
            dat.innings[i]["Inning Group"] = "before stretch" if i < 7 else "after stretch"
        dat.innings[0] = {}
        for pg, ig, g in itertools.product(["PG %s"%i for i in range(1,5)], ["before stretch", "after stretch"],
                                           [1, 2, 3]):
            dat.position_constraints[pg, ig, g] = {}

        orig_pan_dat = input_schema.copy_pan_dat(copy_to_pandas_with_reset(tdf, dat))
        self.assertFalse(input_schema.find_foreign_key_failures(orig_pan_dat))

        dat.position_constraints["no", "no", "no"] = dat.position_constraints[1, 2, 3] = {}
        new_pan_dat = input_schema.copy_pan_dat(copy_to_pandas_with_reset(tdf, dat))
        self.assertFalse(input_schema._same_data(orig_pan_dat, new_pan_dat))
        fk_fails = input_schema.find_foreign_key_failures(new_pan_dat)
        fk_fails_2 = input_schema.find_foreign_key_failures(new_pan_dat, verbosity="Low")
        fk_fails_3 = input_schema.find_foreign_key_failures(new_pan_dat, verbosity="Low", as_table=False)
        self.assertTrue({tuple(k)[:2] + (tuple(k[2]),): len(v) for k,v in fk_fails.items()} ==
                        {k:len(v) for k,v in fk_fails_2.items()} ==
                        {k:v.count(True) for k,v in fk_fails_3.items()} ==
                        {('position_constraints', 'innings', ("Inning Group", "Inning Group")): 2,
                         ('position_constraints', 'positions', ("Position Group", "Position Group")): 2,
                         ('position_constraints', 'roster', ("Grade", "Grade")): 1})
        input_schema.remove_foreign_key_failures(new_pan_dat)
        self.assertFalse(input_schema.find_foreign_key_failures(new_pan_dat))
        self.assertTrue(input_schema._same_data(orig_pan_dat, new_pan_dat))

        input_schema = PanDatFactory(table_one=[["One", "Two"], []],
                                     table_two=[["One"], ["Two"]])
        input_schema.add_foreign_key("table_two", "table_one", ["One", "One"])
        self.assertTrue({fk.cardinality for fk in input_schema.foreign_keys} == {"one-to-many"})

        tdf = TicDatFactory(**input_schema.schema())
        dat = tdf.TicDat(table_one = [[1,2], [3,4], [5,6], [7,8]], table_two = {1:2, 3:4, 5:6})

        orig_pan_dat = input_schema.copy_pan_dat(copy_to_pandas_with_reset(tdf, dat))
        self.assertFalse(input_schema.find_foreign_key_failures(orig_pan_dat))
        dat.table_two[9]=10
        new_pan_dat = input_schema.copy_pan_dat(copy_to_pandas_with_reset(tdf, dat))
        fk_fails = input_schema.find_foreign_key_failures(new_pan_dat)
        self.assertTrue({tuple(k)[:2]:len(v) for k,v in fk_fails.items()} == {('table_two', 'table_one'): 1})
        input_schema.remove_foreign_key_failures(new_pan_dat)
        self.assertFalse(input_schema.find_foreign_key_failures(new_pan_dat))
        self.assertTrue(input_schema._same_data(orig_pan_dat, new_pan_dat))
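
For reference, a minimal self-contained sketch of the add_foreign_key / find_foreign_key_failures / remove_foreign_key_failures cycle; the parent and child tables are illustrative, not from the tests.

# Minimal sketch of the foreign-key checks (illustrative schema, not from the tests above).
from pandas import DataFrame
from ticdat import PanDatFactory

pdf = PanDatFactory(parent=[["Name"], []], child=[["Id"], ["Name"]])
pdf.add_foreign_key("child", "parent", ["Name", "Name"])
dat = pdf.PanDat(parent=DataFrame({"Name": ["a"]}),
                 child=DataFrame({"Id": [1, 2], "Name": ["a", "zzz"]}))
assert pdf.find_foreign_key_failures(dat)      # the child row pointing at "zzz" is flagged
pdf.remove_foreign_key_failures(dat)           # drops that row in place
assert not pdf.find_foreign_key_failures(dat)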
Code example #7
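Registers row predicates with add_data_row_predicate on the diet and netflow schemas and checks the failure tables returned by find_data_row_failures, for both explicit and generic ('*') schemas.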
    def testDataPredicates(self):
        if not self.canRun:
            return
        tdf = TicDatFactory(**dietSchema())
        pdf = PanDatFactory(**dietSchema())

        ticdat = tdf.TicDat()
        ticdat.foods["a"] = 12
        ticdat.foods["b"] = None
        ticdat.categories["1"] = {"maxNutrition":100, "minNutrition":40}
        ticdat.categories["2"] = [21,20]
        for f, p in itertools.product(ticdat.foods, ticdat.categories):
            ticdat.nutritionQuantities[f,p] = 5


        pandat = pdf.copy_pan_dat(copy_to_pandas_with_reset(tdf, ticdat))
        self.assertFalse(pdf.find_duplicates(pandat))
        self.assertFalse(pdf.find_data_row_failures(pandat))

        ticdat.nutritionQuantities['a', 2] = 12
        ticdat.categories["3"] = ['a', 100]
        pandat_2 = pdf.copy_pan_dat(copy_to_pandas_with_reset(tdf, ticdat))

        def perform_predicate_checks(sch):
            pdf = PanDatFactory(**sch)
            pdf.add_data_row_predicate("foods", lambda row: numericish(row["cost"]) and not isnan(row["cost"]), "cost")
            good_qty = lambda qty : 5 < qty <= 12
            pdf.add_data_row_predicate("nutritionQuantities", lambda row: good_qty(row["qty"]), "qty")
            pdf.add_data_row_predicate("categories",
                                       lambda row: row["maxNutrition"] >= row["minNutrition"],
                                       "minmax")
            failed = pdf.find_data_row_failures(pandat)
            self.assertTrue(set(failed) == {('foods', 'cost'), ('nutritionQuantities', 'qty'), ('categories', 'minmax')})
            self.assertTrue(set(failed['foods', 'cost']["name"]) == {'b'})
            self.assertTrue(set({(v["food"], v["category"])
                                 for v in failed['nutritionQuantities', 'qty'].T.to_dict().values()}) ==
                                {('b', '1'), ('a', '2'), ('a', '1'), ('b', '2')})
            self.assertTrue(set(failed['categories', 'minmax']["name"]) == {'2'})
            failed = pdf.find_data_row_failures(pandat, as_table=False)
            self.assertTrue(4 == failed['nutritionQuantities', 'qty'].value_counts()[True])
            failed = pdf.find_data_row_failures(pandat_2)
            self.assertTrue(set(failed['categories', 'minmax']["name"]) == {'2', '3'})

        perform_predicate_checks(dietSchema())
        perform_predicate_checks({t:'*' for t in dietSchema()})

        tdf = TicDatFactory(**netflowSchema())
        tdf.enable_foreign_key_links()
        addNetflowForeignKeys(tdf)
        pdf = PanDatFactory(**netflowSchema())
        ticdat = tdf.copy_tic_dat(netflowData())
        for n in ticdat.nodes["Detroit"].arcs_source:
            ticdat.arcs["Detroit", n] = n
        pandat = pdf.copy_pan_dat(copy_to_pandas_with_reset(tdf, ticdat))
        self.assertFalse(pdf.find_duplicates(pandat))
        self.assertFalse(pdf.find_data_row_failures(pandat))

        pdf = PanDatFactory(**netflowSchema())
        pdf.add_data_row_predicate("arcs", lambda row: True, "capacity")
        self.assertFalse(pdf.find_data_row_failures(pandat))

        pdf = PanDatFactory(**netflowSchema())
        good_capacity = lambda capacity: numericish(capacity) or capacity in ["Boston", "Seattle", "lumberjack"]
        pdf.add_data_row_predicate("arcs", lambda row: good_capacity(row["capacity"]), "capacity")
        failed = pdf.find_data_row_failures(pandat)
        self.assertTrue(set(failed) == {('arcs', 'capacity')})
        self.assertTrue(set({(v["source"], v["destination"])
                             for v in failed['arcs', 'capacity'].T.to_dict().values()}) == {("Detroit", "New York")})
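
For reference, a minimal self-contained sketch of add_data_row_predicate and find_data_row_failures; the "parts" table and its predicate are illustrative, not from the tests.

# Minimal sketch of the row-predicate checks (illustrative schema, not from the tests above).
from pandas import DataFrame
from ticdat import PanDatFactory

pdf = PanDatFactory(parts=[["Name"], ["Cost"]])
pdf.add_data_row_predicate("parts", lambda row: row["Cost"] >= 0, "non negative cost")
dat = pdf.PanDat(parts=DataFrame({"Name": ["bolt", "nut"], "Cost": [1.0, -2.0]}))
failed = pdf.find_data_row_failures(dat)  # keyed by (table, predicate name)
assert set(failed) == {("parts", "non negative cost")}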
Code example #8
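Tests predicate_kwargs_maker (including how many times it is invoked), exception_handling="Handled as Failure", clone, and predicates that return an error string via predicate_failure_response="Error Message".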
    def testDataRowPredicatesTwo(self):
        tdf = TicDatFactory(**dietSchema())
        pdf = PanDatFactory(**dietSchema())
        num_calls=[0]
        mess_it_up=[]
        def pre_processor(dat):
            num_calls[0] += 1
            if mess_it_up:
                dat.messing_it_up+=1
            return {t:len(getattr(dat, t)) for t in tdf.all_tables}
        pdf.add_data_row_predicate("foods", lambda row, y: y==12, predicate_kwargs_maker=lambda dat: {"y":12})
        pdf.add_data_row_predicate("categories", lambda row, nutritionQuantities, foods, categories:
                               row["name"] == "fat" or categories == 4,
                               predicate_name="catfat", predicate_kwargs_maker=pre_processor)
        pdf.add_data_row_predicate("foods", lambda row, nutritionQuantities, foods, categories:
                               row["name"] == "pizza" or foods == 9,
                               predicate_name= "foodza", predicate_kwargs_maker=pre_processor)
        def dummy_kwargs_maker(dat):
            if pdf.good_pan_dat_object(dat):
                return {"x":1}
        for t in tdf.all_tables:
            pdf.add_data_row_predicate(t, lambda row, x: x==1, predicate_name=f"dummy_{t}",
                                       predicate_kwargs_maker=dummy_kwargs_maker)
        pandat = pdf.copy_pan_dat(copy_to_pandas_with_reset(tdf, tdf.copy_tic_dat(dietData())))
        self.assertFalse(pdf.find_data_row_failures(pandat))
        self.assertTrue(num_calls[0] == 1)
        pandat.foods = pandat.foods[pandat.foods["name"] != "pizza"].copy()
        pandat.categories = pandat.categories[pandat.categories["name"] != "fat"].copy()
        fails = pdf.find_data_row_failures(pandat)
        self.assertTrue(num_calls[0] == 2)
        self.assertTrue(set(map(tuple, fails)) == {('categories', 'catfat'), ('foods', 'foodza')})
        self.assertTrue(set(fails['categories', 'catfat']["name"]) == set(dietData().categories).difference(["fat"]))
        self.assertTrue(set(fails['foods', 'foodza']["name"]) == set(dietData().foods).difference(["pizza"]))

        mess_it_up.append(1)
        ex = []
        try:
            pdf.find_data_row_failures(pandat)
        except Exception as e:
            ex[:] = [str(e.__class__)]
        self.assertTrue("AttributeError" in ex[0])
        fails = pdf.find_data_row_failures(pandat, exception_handling="Handled as Failure")
        self.assertTrue(set(map(tuple, fails)) == {('categories', 'catfat'), ('foods', 'foodza')})
        self.assertTrue(num_calls[0] == 4)
        for v in fails.values():
            self.assertTrue(v.primary_key == '*' and "no attribute" in v.error_message)
        pdf = pdf.clone()
        fails = pdf.find_data_row_failures(pandat, exception_handling="Handled as Failure")
        self.assertTrue(set(map(tuple, fails)) == {('categories', 'catfat'), ('foods', 'foodza')})
        mess_it_up=[]
        def fail_on_bad_name(row, bad_name):
            if row["name"] == bad_name:
                return f"{bad_name} is bad"
            return True
        pdf.add_data_row_predicate("foods", fail_on_bad_name, predicate_name="baddy",
                                   predicate_kwargs_maker=lambda dat: {"bad_name": sorted(dat.foods["name"])[0]},
                                   predicate_failure_response="Error Message")
        pandat = pdf.copy_pan_dat(copy_to_pandas_with_reset(tdf, tdf.copy_tic_dat(dietData())))
        fails = pdf.find_data_row_failures(pandat)
        self.assertTrue(set(map(tuple, fails)) == {('foods', 'baddy')})
        self.assertTrue(len(fails['foods', 'baddy']) == 1)
        self.assertTrue(list(fails['foods', 'baddy']["Error Message"])[0] == "chicken is bad")
Code example #9
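A broader variant of the row-predicate test: each predicate is mirrored by an "Error Message" version, exceptions raised inside predicates are checked under both exception-handling modes, and the failure table is shown to pick an unused "Error Message (N)" column name.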
    def testDataPredicates(self):
        # this test won't run properly if the -O flag is applied
        if not self.canRun:
            return
        tdf = TicDatFactory(**dietSchema())
        pdf = PanDatFactory(**dietSchema())

        ticdat = tdf.TicDat()
        ticdat.foods["a"] = 12
        ticdat.foods["b"] = None
        ticdat.categories["1"] = {"maxNutrition":100, "minNutrition":40}
        ticdat.categories["2"] = [21,20]
        for f, p in itertools.product(ticdat.foods, ticdat.categories):
            ticdat.nutritionQuantities[f,p] = 5


        pandat = pdf.copy_pan_dat(copy_to_pandas_with_reset(tdf, ticdat))
        self.assertFalse(pdf.find_duplicates(pandat))
        self.assertFalse(pdf.find_data_row_failures(pandat))

        ticdat.nutritionQuantities['a', 2] = 12
        ticdat.categories["3"] = ['a', 100]
        pandat_2 = pdf.copy_pan_dat(copy_to_pandas_with_reset(tdf, ticdat))

        def perform_predicate_checks(sch):
            pdf = PanDatFactory(**sch)
            pdf.add_data_row_predicate("foods", lambda row: numericish(row["cost"]) and not isnan(row["cost"]), "cost")
            good_qty = lambda qty : 5 < qty <= 12
            pdf.add_data_row_predicate("nutritionQuantities", lambda row: good_qty(row["qty"]), "qty")
            pdf.add_data_row_predicate("categories",
                                       lambda row: row["maxNutrition"] >= row["minNutrition"],
                                       "minmax")
            pdf2 = PanDatFactory(**sch)
            def make_error_message_predicate(f, name):
                def error_message_predicate(row):
                    rtn = f(row)
                    if rtn:
                        return True
                    return f"{name} failed!"
                return error_message_predicate
            for t, preds in pdf._data_row_predicates.items():
                for p_name, rpi in preds.items():
                    pdf2.add_data_row_predicate(t, make_error_message_predicate(rpi.predicate, p_name),
                                                predicate_name=p_name, predicate_failure_response="Error Message")
            failed = pdf.find_data_row_failures(pandat)
            failed2 = pdf2.find_data_row_failures(pandat)
            self.assertTrue(set(failed) == set(failed2) == {('foods', 'cost'),
                                            ('nutritionQuantities', 'qty'), ('categories', 'minmax')})
            self.assertTrue(set(failed['foods', 'cost']["name"]) == set(failed2['foods', 'cost']["name"]) == {'b'})
            for f in [failed, failed2]:
                self.assertTrue(set({(v["food"], v["category"])
                                     for v in f['nutritionQuantities', 'qty'].T.to_dict().values()}) ==
                                    {('b', '1'), ('a', '2'), ('a', '1'), ('b', '2')})
                self.assertTrue(set(f['categories', 'minmax']["name"]) == {'2'})
            for t, n in failed2:
                self.assertTrue(set(failed2[t, n]["Error Message"]) == {f'{n} failed!'})
            for _pdf in [pdf, pdf2]:
                failed = _pdf.find_data_row_failures(pandat, as_table=False)
                self.assertTrue(4 == failed['nutritionQuantities', 'qty'].value_counts()[True])
                ex = []
                try:
                    _pdf.find_data_row_failures(pandat_2)
                except Exception as e:
                    ex[:] = [str(e.__class__)]
                self.assertTrue("TypeError" in ex[0])
                failed = _pdf.find_data_row_failures(pandat_2, exception_handling="Handled as Failure")
                self.assertTrue(set(failed['categories', 'minmax']["name"]) == {'2', '3'})
            failed = pdf2.find_data_row_failures(pandat_2, exception_handling="Handled as Failure")
            df = failed['categories', 'minmax']
            err_str = list(df[df['name'] == '3']["Error Message"])[0]
            self.assertTrue(err_str=="Exception<'>=' not supported between instances of 'int' and 'str'>")

        perform_predicate_checks(dietSchema())
        perform_predicate_checks({t:'*' for t in dietSchema()})

        tdf = TicDatFactory(**netflowSchema())
        tdf.enable_foreign_key_links()
        addNetflowForeignKeys(tdf)
        pdf = PanDatFactory(**netflowSchema())
        ticdat = tdf.copy_tic_dat(netflowData())
        for n in ticdat.nodes["Detroit"].arcs_source:
            ticdat.arcs["Detroit", n] = n
        pandat = pdf.copy_pan_dat(copy_to_pandas_with_reset(tdf, ticdat))
        self.assertFalse(pdf.find_duplicates(pandat))
        self.assertFalse(pdf.find_data_row_failures(pandat))

        pdf = PanDatFactory(**netflowSchema())
        pdf.add_data_row_predicate("arcs", lambda row: True, "capacity")
        self.assertFalse(pdf.find_data_row_failures(pandat))

        pdf = PanDatFactory(**netflowSchema())
        good_capacity = lambda capacity: numericish(capacity) or capacity in ["Boston", "Seattle", "lumberjack"]
        pdf.add_data_row_predicate("arcs", lambda row: good_capacity(row["capacity"]), "capacity")
        failed = pdf.find_data_row_failures(pandat)
        self.assertTrue(set(failed) == {('arcs', 'capacity')})
        self.assertTrue(set({(v["source"], v["destination"])
                             for v in failed['arcs', 'capacity'].T.to_dict().values()}) == {("Detroit", "New York")})

        pdf = PanDatFactory(table=[[],["Field", "Error Message", "Error Message (1)"]])
        pdf.add_data_row_predicate("table", predicate=lambda row: f"Oops {row['Field']}" if row["Field"] > 1 else True,
                                   predicate_name="silly", predicate_failure_response="Error Message")
        df = DataFrame({"Field":[2, 1], "Error Message":["what", "go"], "Error Message (1)": ["now", "go"]})
        fails = pdf.find_data_row_failures(pdf.PanDat(table=df))
        df = fails["table", "silly"]
        self.assertTrue(list(df.columns) == ["Field", "Error Message", "Error Message (1)", "Error Message (2)"])
        self.assertTrue(set(df["Field"]) == {2} and set(df["Error Message (2)"]) == {'Oops 2'})
Code example #10
File: testpandat_io.py Project: nandi6uc/ticdat
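Round-trips the diet and netflow data through csv.write_directory and csv.create_pan_dat, covering generic ('*') schemas, directories with missing tables, and the decimal="," option.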
    def testCsvSimple(self):
        if not self.can_run:
            return
        tdf = TicDatFactory(**dietSchema())
        pdf = PanDatFactory(**dietSchema())
        ticDat = tdf.freeze_me(
            tdf.TicDat(
                **{t: getattr(dietData(), t)
                   for t in tdf.primary_key_fields}))
        panDat = pan_dat_maker(dietSchema(), ticDat)
        dirPath = os.path.join(_scratchDir, "diet_csv")
        pdf.csv.write_directory(panDat, dirPath)
        panDat2 = pdf.csv.create_pan_dat(dirPath)
        self.assertTrue(pdf._same_data(panDat, panDat2))
        pdf2 = PanDatFactory(**{t: '*' for t in pdf.all_tables})
        panDat2 = pdf2.csv.create_pan_dat(dirPath)
        self.assertTrue(pdf._same_data(panDat, panDat2))

        pdf2 = PanDatFactory(**{
            k: v
            for k, v in dietSchema().items() if k != "nutritionQuantities"
        })
        panDat2 = pdf2.copy_pan_dat(panDat)
        dirPath = os.path.join(_scratchDir, "diet_missing_csv")
        pdf2.csv.write_directory(panDat2, dirPath, makeCleanDir(dirPath))
        panDat3 = pdf.csv.create_pan_dat(dirPath)
        self.assertTrue(pdf2._same_data(panDat2, panDat3))
        self.assertTrue(all(hasattr(panDat3, x) for x in pdf.all_tables))
        self.assertFalse(len(panDat3.nutritionQuantities))
        self.assertTrue(len(panDat3.categories) and len(panDat3.foods))

        pdf2 = PanDatFactory(
            **{k: v
               for k, v in dietSchema().items() if k == "categories"})
        panDat2 = pdf2.copy_pan_dat(panDat)
        pdf2.csv.write_directory(panDat2, dirPath, makeCleanDir(dirPath))
        panDat3 = pdf.csv.create_pan_dat(dirPath)
        self.assertTrue(pdf2._same_data(panDat2, panDat3))
        self.assertTrue(all(hasattr(panDat3, x) for x in pdf.all_tables))
        self.assertFalse(
            len(panDat3.nutritionQuantities) or len(panDat3.foods))
        self.assertTrue(len(panDat3.categories))

        tdf = TicDatFactory(**netflowSchema())
        pdf = PanDatFactory(**netflowSchema())
        ticDat = tdf.freeze_me(
            tdf.TicDat(
                **
                {t: getattr(netflowData(), t)
                 for t in tdf.primary_key_fields}))
        panDat = pan_dat_maker(netflowSchema(), ticDat)
        dirPath = os.path.join(_scratchDir, "netflow_csv")
        pdf.csv.write_directory(panDat, dirPath)
        panDat2 = pdf.csv.create_pan_dat(dirPath)
        self.assertTrue(pdf._same_data(panDat, panDat2))
        pdf2 = PanDatFactory(**{t: '*' for t in pdf.all_tables})
        pdf2.csv.write_directory(panDat, dirPath)
        panDat2 = pdf2.csv.create_pan_dat(dirPath)
        self.assertTrue(pdf._same_data(panDat, panDat2))

        tdf = TicDatFactory(**dietSchema())
        pdf = PanDatFactory(**dietSchema())
        ticDat = tdf.freeze_me(
            tdf.TicDat(
                **{t: getattr(dietData(), t)
                   for t in tdf.primary_key_fields}))
        panDat = pan_dat_maker(dietSchema(), ticDat)
        dirPath = os.path.join(_scratchDir, "diet_csv")
        pdf.csv.write_directory(panDat, dirPath, decimal=",")
        panDat2 = pdf.csv.create_pan_dat(dirPath)
        self.assertFalse(pdf._same_data(panDat, panDat2))
        panDat2 = pdf.csv.create_pan_dat(dirPath, decimal=",")
        self.assertTrue(pdf._same_data(panDat, panDat2))