def testDietCleaningOpalytisThree(self):
    """Opalytics row cleaning with both a data-row predicate and foreign keys.

    With raw_data=True the data presumably round-trips unchanged; with
    raw_data=False the rows failing the "categories" predicate (and the rows
    orphaned by the diet foreign keys) appear to be purged.  The tail of the
    test reproduces that purge manually on the TicDat side to prove the two
    agree.
    NOTE(review): "Opalytis" looks like a typo for "Opalytics", but renaming
    would change test discovery, so it is left as-is.
    """
    tdf = TicDatFactory(**dietSchema())
    # Predicate: keep only categories with maxNutrition >= 66.
    tdf.add_data_row_predicate("categories", lambda row: row["maxNutrition"] >= 66)
    addDietForeignKeys(tdf)
    ticDat = tdf.copy_tic_dat(dietData())
    # Mirror the same schema/predicate/foreign-key setup on the pandas side.
    pdf = PanDatFactory(**tdf.schema())
    pdf.add_data_row_predicate("categories", lambda row: row["maxNutrition"] >= 66)
    addDietForeignKeys(pdf)
    input_set = create_inputset_mock(tdf, ticDat)
    # raw_data=True: no cleaning, so the copy matches the original exactly.
    panDat = pdf.opalytics.create_pan_dat(input_set, raw_data=True)
    self.assertTrue(tdf._same_data(pdf.copy_to_tic_dat(panDat), ticDat))
    # raw_data=False: cleaning kicks in, so the purged copy no longer matches.
    panDatPurged = pdf.opalytics.create_pan_dat(input_set, raw_data=False)
    self.assertFalse(tdf._same_data(pdf.copy_to_tic_dat(panDatPurged), ticDat))
    # Removing just the predicate-failing "fat" row is not enough -- the
    # foreign-key orphans it leaves behind still differ from the purged copy.
    ticDat.categories.pop("fat")
    self.assertFalse(tdf._same_data(pdf.copy_to_tic_dat(panDatPurged), ticDat))
    # After also removing the foreign-key failures, both sides agree.
    tdf.remove_foreign_key_failures(ticDat)
    self.assertTrue(tdf._same_data(pdf.copy_to_tic_dat(panDatPurged), ticDat))
def test_data_row_max_failures(self):
    """Check find_data_row_failures honors the max_failures cap across tables.

    Two single-field tables get the default data type, and each gets a row
    predicate wired to that data type:
      * table_one uses a plain boolean predicate (fails on invalid values);
      * table_two uses an "Error Message" predicate that fails with "Oops"
        exactly when the value IS valid (the inverted condition).
    The constructed data gives each table exactly 10 failing rows (confirmed
    by the first, uncapped assertion), which pins down how max_failures
    truncates the reported failures.
    """
    pdf = PanDatFactory(table_one=[["Field"], []], table_two=[[], ["Field"]])
    for t in ["table_one", "table_two"]:
        pdf.set_data_type(t, "Field")
    for table, dts in pdf.data_types.items():
        for field, dt in dts.items():
            # dt=dt binds the current data type at definition time.  The
            # original code captured `dt` by closure, so every predicate saw
            # the *last* loop value -- a latent late-binding bug that was
            # benign only because both tables use identical default types.
            if table == "table_one":
                pdf.add_data_row_predicate(
                    table, lambda row, dt=dt: dt.valid_data(row["Field"]))
            else:
                pdf.add_data_row_predicate(
                    table,
                    lambda row, dt=dt: True if not dt.valid_data(row["Field"]) else "Oops",
                    predicate_failure_response="Error Message")
    dat = pdf.PanDat(
        table_one=DataFrame({"Field": list(range(1, 11)) + [-_ for _ in range(1, 11)]}),
        table_two=DataFrame({"Field": [10.1] * 10 + [-2] * 10}))
    # No cap: both tables report all 10 of their failing rows.
    errs = pdf.find_data_row_failures(dat)
    self.assertTrue(len(errs) == 2 and all(len(_) == 10 for _ in errs.values()))
    # Cap of 11: one table exhausts 10, leaving room for only 1 from the other.
    errs = pdf.find_data_row_failures(dat, max_failures=11)
    self.assertTrue(len(errs) == 2)
    self.assertTrue(any(len(_) == 10 for _ in errs.values()) and
                    any(len(_) == 1 for _ in errs.values()))
    # Cap of 10: fully consumed by one table; the other is not reported at all.
    errs = pdf.find_data_row_failures(dat, max_failures=10)
    self.assertTrue(len(errs) == 1 and all(len(_) == 10 for _ in errs.values()))
    # Cap of 9: a single table, truncated to 9 rows.
    errs = pdf.find_data_row_failures(dat, max_failures=9)
    self.assertTrue(len(errs) == 1 and all(len(_) == 9 for _ in errs.values()))
def testSillyCleaningOpalyticsTwo(self):
    """Opalytics row cleaning driven purely by a data-row predicate.

    Loading with raw_data=True keeps every row; loading with raw_data=False
    drops the "c" rows whose cData4 is not 'd'.  Popping those same rows from
    the TicDat copy by hand shows both cleanings agree.
    """
    tic_factory = TicDatFactory(**sillyMeSchema())
    tic_factory.add_data_row_predicate("c", lambda row: row["cData4"] == 'd')
    tic_dat = tic_factory.TicDat(**sillyMeData())
    inputset = create_inputset_mock(tic_factory, tic_dat)

    pan_factory = PanDatFactory(**sillyMeSchema())
    pan_factory.add_data_row_predicate("c", lambda row: row["cData4"] == 'd')

    # Raw load: nothing is cleaned, so the round-trip matches exactly.
    raw_copy = pan_factory.opalytics.create_pan_dat(inputset, raw_data=True)
    self.assertTrue(tic_factory._same_data(pan_factory.copy_to_tic_dat(raw_copy), tic_dat))

    # Cleaned load: the predicate-failing rows are gone, so it no longer matches...
    cleaned_copy = pan_factory.opalytics.create_pan_dat(inputset, raw_data=False)
    self.assertFalse(tic_factory._same_data(pan_factory.copy_to_tic_dat(cleaned_copy), tic_dat))

    # ...until the last and first "c" rows are removed from the TicDat side too.
    tic_dat.c.pop()
    tic_dat.c.pop(0)
    self.assertTrue(tic_factory._same_data(pan_factory.copy_to_tic_dat(cleaned_copy), tic_dat))
def testVariousCoverages(self):
    """Miscellaneous coverage: defaults, foreign-key clearing, predicate
    removal through clone(), and a circular foreign-key chain."""
    # --- set_default_values round-trips into _default_values ---
    diet_factory = PanDatFactory(**dietSchema())
    expected_defaults = dict(
        categories={"minNutrition": 0, "maxNutrition": float("inf")},
        foods={"cost": 0},
        nutritionQuantities={"qty": 0})
    diet_factory.set_default_values(**expected_defaults)
    self.assertTrue(diet_factory._default_values == expected_defaults)

    # --- clear_foreign_keys("arcs") leaves only the cost/inflow keys ---
    net_factory = PanDatFactory(**netflowSchema())
    addNetflowForeignKeys(net_factory)
    net_factory.clear_foreign_keys("arcs")
    self.assertTrue({fk[0] for fk in net_factory._foreign_keys} == {"cost", "inflow"})

    # --- predicates: add two, remove the unnamed one (name 0), then clone ---
    net_factory.add_data_row_predicate("arcs", lambda row: True)
    net_factory.add_data_row_predicate("arcs", lambda row: True, "dummy")
    net_factory.add_data_row_predicate("arcs", None, 0)
    net_factory = net_factory.clone()
    self.assertTrue(set(net_factory._data_row_predicates["arcs"]) == {"dummy"})

    # --- a three-table circular foreign-key chain is accepted ---
    cyclic_factory = PanDatFactory(pdf_table_one=[["A Field"], []],
                                   pdf_table_two=[["B Field"], []],
                                   pdf_table_three=[["C Field"], []])
    cyclic_factory.add_foreign_key("pdf_table_one", "pdf_table_two", ["A Field", "B Field"])
    cyclic_factory.add_foreign_key("pdf_table_two", "pdf_table_three", ["B Field", "C Field"])
    cyclic_factory.add_foreign_key("pdf_table_three", "pdf_table_one", ["C Field", "A Field"])
def perform_predicate_checks(sch):
    """Run the predicate checks twice: once with boolean predicates (pdf) and
    once with equivalent "Error Message" predicates (pdf2) built by wrapping
    the originals, asserting both report the same failures.

    Relies on free variables from the enclosing scope: `self`, `pandat`
    (clean-ish data) and `pandat_2` (data with a string in a numeric field,
    which makes the naive predicates raise TypeError).
    """
    pdf = PanDatFactory(**sch)
    # Naive predicates: they assume numeric fields and will raise on bad types.
    pdf.add_data_row_predicate("foods", lambda row: numericish(row["cost"]) and not isnan(row["cost"]), "cost")
    good_qty = lambda qty: 5 < qty <= 12
    pdf.add_data_row_predicate("nutritionQuantities", lambda row: good_qty(row["qty"]), "qty")
    pdf.add_data_row_predicate("categories", lambda row: row["maxNutrition"] >= row["minNutrition"], "minmax")
    pdf2 = PanDatFactory(**sch)
    def make_error_message_predicate(f, name):
        # Wrap a boolean predicate into the "Error Message" protocol:
        # truthy -> True, falsy -> a "<name> failed!" message string.
        def error_message_predicate(row):
            rtn = f(row)
            if rtn:
                return True
            return f"{name} failed!"
        return error_message_predicate
    # Re-register every pdf predicate on pdf2 in error-message form.
    for t, preds in pdf._data_row_predicates.items():
        for p_name, rpi in preds.items():
            pdf2.add_data_row_predicate(t, make_error_message_predicate(rpi.predicate, p_name),
                                        predicate_name=p_name,
                                        predicate_failure_response="Error Message")
    failed = pdf.find_data_row_failures(pandat)
    failed2 = pdf2.find_data_row_failures(pandat)
    # Both styles flag the same (table, predicate) pairs and the same rows.
    self.assertTrue(set(failed) == set(failed2) ==
                    {('foods', 'cost'), ('nutritionQuantities', 'qty'), ('categories', 'minmax')})
    self.assertTrue(set(failed['foods', 'cost']["name"]) ==
                    set(failed2['foods', 'cost']["name"]) == {'b'})
    for f in [failed, failed2]:
        self.assertTrue(set({(v["food"], v["category"])
                             for v in f['nutritionQuantities', 'qty'].T.to_dict().values()}) ==
                        {('b', '1'), ('a', '2'), ('a', '1'), ('b', '2')})
        self.assertTrue(set(f['categories', 'minmax']["name"]) == {'2'})
    # The wrapped predicates report the expected message text.
    for t, n in failed2:
        self.assertTrue(set(failed2[t, n]["Error Message"]) == {f'{n} failed!'})
    for _pdf in [pdf, pdf2]:
        # as_table=False returns boolean series rather than row tables.
        failed = _pdf.find_data_row_failures(pandat, as_table=False)
        self.assertTrue(4 == failed['nutritionQuantities', 'qty'].value_counts()[True])
        # pandat_2 contains a string where a number is expected: by default the
        # predicate's TypeError propagates...
        ex = []
        try:
            _pdf.find_data_row_failures(pandat_2)
        except Exception as e:
            ex[:] = [str(e.__class__)]
        self.assertTrue("TypeError" in ex[0])
        # ...but "Handled as Failure" converts the exception into a failed row.
        failed = _pdf.find_data_row_failures(pandat_2, exception_handling="Handled as Failure")
        self.assertTrue(set(failed['categories', 'minmax']["name"]) == {'2', '3'})
    # For the error-message factory, the handled exception text is surfaced
    # verbatim in the Error Message column.
    failed = pdf2.find_data_row_failures(pandat_2, exception_handling="Handled as Failure")
    df = failed['categories', 'minmax']
    err_str = list(df[df['name'] == '3']["Error Message"])[0]
    self.assertTrue(err_str == "Exception<'>=' not supported between instances of 'int' and 'str'>")
def perform_predicate_checks(sch):
    """Exercise find_data_row_failures with type-robust predicates.

    Each predicate checks numericish() before comparing, so no exception is
    expected even on messy data.  Uses `pandat` and `self` from the enclosing
    scope.
    """
    factory = PanDatFactory(**sch)

    def cost_ok(row):
        # Cost passes only when it is a real number and not NaN.
        return numericish(row["cost"]) and not isnan(row["cost"])

    def qty_ok(row):
        qty = row["qty"]
        return numericish(qty) and 5 < qty <= 12

    def minmax_ok(row):
        lo = row["minNutrition"]
        hi = row["maxNutrition"]
        return numericish(lo) and numericish(hi) and hi >= lo

    factory.add_data_row_predicate("foods", cost_ok, "cost")
    factory.add_data_row_predicate("nutritionQuantities", qty_ok, "qty")
    factory.add_data_row_predicate("categories", minmax_ok, "minmax")

    failures = factory.find_data_row_failures(pandat)
    self.assertTrue(set(failures) ==
                    {('foods', 'cost'), ('nutritionQuantities', 'qty'), ('categories', 'minmax')})
    self.assertTrue(set(failures['foods', 'cost']["name"]) == {'b'})
    bad_qty_keys = {(v["food"], v["category"])
                    for v in failures['nutritionQuantities', 'qty'].T.to_dict().values()}
    self.assertTrue(bad_qty_keys == {('b', '1'), ('a', '2'), ('a', '1'), ('b', '2')})
    self.assertTrue(set(failures['categories', 'minmax']["name"]) == {'2'})

    # Same check in boolean-series form (as_table=False).
    failures = factory.find_data_row_failures(pandat, as_table=False)
    self.assertTrue(4 == failures['nutritionQuantities', 'qty'].value_counts()[True])
def testDataPredicates(self):
    """Data-row predicates on diet data (naive, exception-prone predicates)
    followed by a netflow predicate section.

    The inner perform_predicate_checks is run against both a real diet schema
    and a generic ('*') schema to show predicates work either way.
    """
    if not self.canRun:
        return
    tdf = TicDatFactory(**dietSchema())
    pdf = PanDatFactory(**dietSchema())
    # Build a small diet data set; food "b" has cost None so it should fail
    # the cost predicate below.
    ticdat = tdf.TicDat()
    ticdat.foods["a"] = 12
    ticdat.foods["b"] = None
    ticdat.categories["1"] = {"maxNutrition": 100, "minNutrition": 40}
    ticdat.categories["2"] = [21, 20]
    for f, p in itertools.product(ticdat.foods, ticdat.categories):
        ticdat.nutritionQuantities[f, p] = 5
    pandat = pdf.copy_pan_dat(copy_to_pandas_with_reset(tdf, ticdat))
    # With no predicates registered there are no duplicates or failures.
    self.assertFalse(pdf.find_duplicates(pandat))
    self.assertFalse(pdf.find_data_row_failures(pandat))
    # pandat_2 adds a duplicate-ish qty row (int key 2 vs string '2') and a
    # category whose minNutrition is the *string* 'a'.
    ticdat.nutritionQuantities['a', 2] = 12
    ticdat.categories["3"] = ['a', 100]
    pandat_2 = pdf.copy_pan_dat(copy_to_pandas_with_reset(tdf, ticdat))
    def perform_predicate_checks(sch):
        # Naive predicates: these assume numeric fields (good_qty compares
        # directly), which is fine for pandat and pandat_2's category '3'
        # still shows up as a minmax failure.
        pdf = PanDatFactory(**sch)
        pdf.add_data_row_predicate("foods", lambda row: numericish(row["cost"]) and not isnan(row["cost"]), "cost")
        good_qty = lambda qty: 5 < qty <= 12
        pdf.add_data_row_predicate("nutritionQuantities", lambda row: good_qty(row["qty"]), "qty")
        pdf.add_data_row_predicate("categories", lambda row: row["maxNutrition"] >= row["minNutrition"], "minmax")
        failed = pdf.find_data_row_failures(pandat)
        self.assertTrue(set(failed) == {('foods', 'cost'), ('nutritionQuantities', 'qty'), ('categories', 'minmax')})
        self.assertTrue(set(failed['foods', 'cost']["name"]) == {'b'})
        self.assertTrue(set({(v["food"], v["category"]) for v in failed['nutritionQuantities', 'qty'].T.to_dict().values()}) ==
                        {('b', '1'), ('a', '2'), ('a', '1'), ('b', '2')})
        self.assertTrue(set(failed['categories', 'minmax']["name"]) == {'2'})
        # Boolean-series form of the qty failures.
        failed = pdf.find_data_row_failures(pandat, as_table=False)
        self.assertTrue(4 == failed['nutritionQuantities', 'qty'].value_counts()[True])
        failed = pdf.find_data_row_failures(pandat_2)
        self.assertTrue(set(failed['categories', 'minmax']["name"]) == {'2', '3'})
    perform_predicate_checks(dietSchema())
    perform_predicate_checks({t: '*' for t in dietSchema()})
    # --- netflow section: predicate on the arcs capacity field ---
    tdf = TicDatFactory(**netflowSchema())
    tdf.enable_foreign_key_links()
    addNetflowForeignKeys(tdf)
    pdf = PanDatFactory(**netflowSchema())
    ticdat = tdf.copy_tic_dat(netflowData())
    # Overwrite each Detroit-sourced arc's capacity with the destination name
    # (a string), creating data for the capacity predicate to flag.
    for n in ticdat.nodes["Detroit"].arcs_source:
        ticdat.arcs["Detroit", n] = n
    pandat = pdf.copy_pan_dat(copy_to_pandas_with_reset(tdf, ticdat))
    self.assertFalse(pdf.find_duplicates(pandat))
    self.assertFalse(pdf.find_data_row_failures(pandat))
    # An always-True predicate produces no failures.
    pdf = PanDatFactory(**netflowSchema())
    pdf.add_data_row_predicate("arcs", lambda row: True, "capacity")
    self.assertFalse(pdf.find_data_row_failures(pandat))
    # A capacity is "good" if numeric or one of the whitelisted strings; only
    # the Detroit->New York arc fails.
    pdf = PanDatFactory(**netflowSchema())
    good_capacity = lambda capacity: numericish(capacity) or capacity in ["Boston", "Seattle", "lumberjack"]
    pdf.add_data_row_predicate("arcs", lambda row: good_capacity(row["capacity"]), "capacity")
    failed = pdf.find_data_row_failures(pandat)
    self.assertTrue(set(failed) == {('arcs', 'capacity')})
    self.assertTrue(set({(v["source"], v["destination"]) for v in failed['arcs', 'capacity'].T.to_dict().values()}) ==
                    {("Detroit", "New York")})
def testDataRowPredicatesTwo(self):
    """Predicates with predicate_kwargs_maker: caching, exception handling,
    clone() survival, and Error Message responses.

    num_calls counts invocations of pre_processor to verify the kwargs maker
    is called once per find_data_row_failures run (shared across the two
    predicates that use it), not once per predicate.  mess_it_up, when
    non-empty, makes pre_processor raise AttributeError.
    """
    tdf = TicDatFactory(**dietSchema())
    pdf = PanDatFactory(**dietSchema())
    num_calls = [0]
    mess_it_up = []
    def pre_processor(dat):
        # Returns {table: row count}; raises AttributeError when mess_it_up
        # is non-empty (dat has no attribute messing_it_up).
        num_calls[0] += 1
        if mess_it_up:
            dat.messing_it_up += 1
        return {t: len(getattr(dat, t)) for t in tdf.all_tables}
    # Predicate whose kwargs maker supplies a constant y=12.
    pdf.add_data_row_predicate("foods", lambda row, y: y == 12,
                               predicate_kwargs_maker=lambda dat: {"y": 12})
    # Two predicates share pre_processor; their kwargs are the table counts.
    pdf.add_data_row_predicate("categories",
                               lambda row, nutritionQuantities, foods, categories:
                               row["name"] == "fat" or categories == 4,
                               predicate_name="catfat", predicate_kwargs_maker=pre_processor)
    pdf.add_data_row_predicate("foods",
                               lambda row, nutritionQuantities, foods, categories:
                               row["name"] == "pizza" or foods == 9,
                               predicate_name="foodza", predicate_kwargs_maker=pre_processor)
    def dummy_kwargs_maker(dat):
        # Returns kwargs only for a good PanDat object (None otherwise).
        if pdf.good_pan_dat_object(dat):
            return {"x": 1}
    for t in tdf.all_tables:
        pdf.add_data_row_predicate(t, lambda row, x: x == 1, predicate_name=f"dummy_{t}",
                                   predicate_kwargs_maker=dummy_kwargs_maker)
    pandat = pdf.copy_pan_dat(copy_to_pandas_with_reset(tdf, tdf.copy_tic_dat(dietData())))
    # Full diet data: every predicate passes; pre_processor ran exactly once.
    self.assertFalse(pdf.find_data_row_failures(pandat))
    self.assertTrue(num_calls[0] == 1)
    # Drop "pizza" and "fat" so catfat/foodza fail for every remaining row.
    pandat.foods = pandat.foods[pandat.foods["name"] != "pizza"].copy()
    pandat.categories = pandat.categories[pandat.categories["name"] != "fat"].copy()
    fails = pdf.find_data_row_failures(pandat)
    self.assertTrue(num_calls[0] == 2)
    self.assertTrue(set(map(tuple, fails)) == {('categories', 'catfat'), ('foods', 'foodza')})
    self.assertTrue(set(fails['categories', 'catfat']["name"]) ==
                    set(dietData().categories).difference(["fat"]))
    self.assertTrue(set(fails['foods', 'foodza']["name"]) ==
                    set(dietData().foods).difference(["pizza"]))
    # Now make pre_processor raise: by default the AttributeError propagates.
    mess_it_up.append(1)
    ex = []
    try:
        pdf.find_data_row_failures(pandat)
    except Exception as e:
        ex[:] = [str(e.__class__)]
    self.assertTrue("AttributeError" in ex[0])
    # "Handled as Failure" converts the kwargs-maker exception into wholesale
    # failures (primary_key '*') carrying the exception text.
    fails = pdf.find_data_row_failures(pandat, exception_handling="Handled as Failure")
    self.assertTrue(set(map(tuple, fails)) == {('categories', 'catfat'), ('foods', 'foodza')})
    self.assertTrue(num_calls[0] == 4)
    for v in fails.values():
        self.assertTrue(v.primary_key == '*' and "no attribute" in v.error_message)
    # Predicates (and their kwargs makers) survive clone().
    pdf = pdf.clone()
    fails = pdf.find_data_row_failures(pandat, exception_handling="Handled as Failure")
    self.assertTrue(set(map(tuple, fails)) == {('categories', 'catfat'), ('foods', 'foodza')})
    # Rebinding the name also flips pre_processor's check back to the happy path.
    mess_it_up = []
    def fail_on_bad_name(row, bad_name):
        # Error-message predicate: returns a message string on failure.
        if row["name"] == bad_name:
            return f"{bad_name} is bad"
        return True
    # The kwargs maker picks the alphabetically-first food name as "bad".
    pdf.add_data_row_predicate("foods", fail_on_bad_name, predicate_name="baddy",
                               predicate_kwargs_maker=lambda dat: {"bad_name": sorted(dat.foods["name"])[0]},
                               predicate_failure_response="Error Message")
    pandat = pdf.copy_pan_dat(copy_to_pandas_with_reset(tdf, tdf.copy_tic_dat(dietData())))
    fails = pdf.find_data_row_failures(pandat)
    self.assertTrue(set(map(tuple, fails)) == {('foods', 'baddy')})
    self.assertTrue(len(fails['foods', 'baddy']) == 1)
    # "chicken" is presumably the alphabetically-first diet food.
    self.assertTrue(list(fails['foods', 'baddy']["Error Message"])[0] == "chicken is bad")
def testDataPredicates(self):
    # this test won't run properly if the -O flag is applied
    """Data-row predicates on diet data, run in two equivalent styles
    (boolean predicates vs "Error Message" predicates), plus a netflow
    predicate section and an Error Message column de-duplication check.
    """
    if not self.canRun:
        return
    tdf = TicDatFactory(**dietSchema())
    pdf = PanDatFactory(**dietSchema())
    # Small diet data set; food "b" has cost None so it fails the cost check.
    ticdat = tdf.TicDat()
    ticdat.foods["a"] = 12
    ticdat.foods["b"] = None
    ticdat.categories["1"] = {"maxNutrition": 100, "minNutrition": 40}
    ticdat.categories["2"] = [21, 20]
    for f, p in itertools.product(ticdat.foods, ticdat.categories):
        ticdat.nutritionQuantities[f, p] = 5
    pandat = pdf.copy_pan_dat(copy_to_pandas_with_reset(tdf, ticdat))
    self.assertFalse(pdf.find_duplicates(pandat))
    self.assertFalse(pdf.find_data_row_failures(pandat))
    # pandat_2 adds a category whose minNutrition is the *string* 'a', which
    # makes the naive minmax predicate raise TypeError.
    ticdat.nutritionQuantities['a', 2] = 12
    ticdat.categories["3"] = ['a', 100]
    pandat_2 = pdf.copy_pan_dat(copy_to_pandas_with_reset(tdf, ticdat))
    def perform_predicate_checks(sch):
        pdf = PanDatFactory(**sch)
        # Naive boolean predicates (assume numeric fields).
        pdf.add_data_row_predicate("foods", lambda row: numericish(row["cost"]) and not isnan(row["cost"]), "cost")
        good_qty = lambda qty: 5 < qty <= 12
        pdf.add_data_row_predicate("nutritionQuantities", lambda row: good_qty(row["qty"]), "qty")
        pdf.add_data_row_predicate("categories", lambda row: row["maxNutrition"] >= row["minNutrition"], "minmax")
        pdf2 = PanDatFactory(**sch)
        def make_error_message_predicate(f, name):
            # Wrap a boolean predicate into the "Error Message" protocol:
            # truthy -> True, falsy -> "<name> failed!".
            def error_message_predicate(row):
                rtn = f(row)
                if rtn:
                    return True
                return f"{name} failed!"
            return error_message_predicate
        # Mirror every pdf predicate onto pdf2 in error-message form.
        for t, preds in pdf._data_row_predicates.items():
            for p_name, rpi in preds.items():
                pdf2.add_data_row_predicate(t, make_error_message_predicate(rpi.predicate, p_name),
                                            predicate_name=p_name,
                                            predicate_failure_response="Error Message")
        failed = pdf.find_data_row_failures(pandat)
        failed2 = pdf2.find_data_row_failures(pandat)
        # Both styles flag the same (table, predicate) pairs and rows.
        self.assertTrue(set(failed) == set(failed2) ==
                        {('foods', 'cost'), ('nutritionQuantities', 'qty'), ('categories', 'minmax')})
        self.assertTrue(set(failed['foods', 'cost']["name"]) ==
                        set(failed2['foods', 'cost']["name"]) == {'b'})
        for f in [failed, failed2]:
            self.assertTrue(set({(v["food"], v["category"])
                                 for v in f['nutritionQuantities', 'qty'].T.to_dict().values()}) ==
                            {('b', '1'), ('a', '2'), ('a', '1'), ('b', '2')})
            self.assertTrue(set(f['categories', 'minmax']["name"]) == {'2'})
        # The wrapped predicates report the expected message text.
        for t, n in failed2:
            self.assertTrue(set(failed2[t, n]["Error Message"]) == {f'{n} failed!'})
        for _pdf in [pdf, pdf2]:
            # Boolean-series form of the qty failures.
            failed = _pdf.find_data_row_failures(pandat, as_table=False)
            self.assertTrue(4 == failed['nutritionQuantities', 'qty'].value_counts()[True])
            # By default the predicate's TypeError on pandat_2 propagates...
            ex = []
            try:
                _pdf.find_data_row_failures(pandat_2)
            except Exception as e:
                ex[:] = [str(e.__class__)]
            self.assertTrue("TypeError" in ex[0])
            # ...but "Handled as Failure" converts it into a failed row.
            failed = _pdf.find_data_row_failures(pandat_2, exception_handling="Handled as Failure")
            self.assertTrue(set(failed['categories', 'minmax']["name"]) == {'2', '3'})
        # Error-message factory surfaces the handled exception text verbatim.
        failed = pdf2.find_data_row_failures(pandat_2, exception_handling="Handled as Failure")
        df = failed['categories', 'minmax']
        err_str = list(df[df['name'] == '3']["Error Message"])[0]
        self.assertTrue(err_str == "Exception<'>=' not supported between instances of 'int' and 'str'>")
    perform_predicate_checks(dietSchema())
    perform_predicate_checks({t: '*' for t in dietSchema()})
    # --- netflow section: predicate on the arcs capacity field ---
    tdf = TicDatFactory(**netflowSchema())
    tdf.enable_foreign_key_links()
    addNetflowForeignKeys(tdf)
    pdf = PanDatFactory(**netflowSchema())
    ticdat = tdf.copy_tic_dat(netflowData())
    # Overwrite each Detroit-sourced arc's capacity with the destination name
    # (a string), creating data for the capacity predicate to flag.
    for n in ticdat.nodes["Detroit"].arcs_source:
        ticdat.arcs["Detroit", n] = n
    pandat = pdf.copy_pan_dat(copy_to_pandas_with_reset(tdf, ticdat))
    self.assertFalse(pdf.find_duplicates(pandat))
    self.assertFalse(pdf.find_data_row_failures(pandat))
    # An always-True predicate produces no failures.
    pdf = PanDatFactory(**netflowSchema())
    pdf.add_data_row_predicate("arcs", lambda row: True, "capacity")
    self.assertFalse(pdf.find_data_row_failures(pandat))
    # Capacity is "good" if numeric or a whitelisted string; only the
    # Detroit->New York arc fails.
    pdf = PanDatFactory(**netflowSchema())
    good_capacity = lambda capacity: numericish(capacity) or capacity in ["Boston", "Seattle", "lumberjack"]
    pdf.add_data_row_predicate("arcs", lambda row: good_capacity(row["capacity"]), "capacity")
    failed = pdf.find_data_row_failures(pandat)
    self.assertTrue(set(failed) == {('arcs', 'capacity')})
    self.assertTrue(set({(v["source"], v["destination"]) for v in failed['arcs', 'capacity'].T.to_dict().values()}) ==
                    {("Detroit", "New York")})
    # --- Error Message column de-duplication: when the table already has
    # "Error Message" and "Error Message (1)" columns, the failure report
    # appends "Error Message (2)" rather than clobbering the originals. ---
    pdf = PanDatFactory(table=[[], ["Field", "Error Message", "Error Message (1)"]])
    pdf.add_data_row_predicate("table",
                               predicate=lambda row: f"Oops {row['Field']}" if row["Field"] > 1 else True,
                               predicate_name="silly", predicate_failure_response="Error Message")
    df = DataFrame({"Field": [2, 1], "Error Message": ["what", "go"],
                    "Error Message (1)": ["now", "go"]})
    fails = pdf.find_data_row_failures(pdf.PanDat(table=df))
    df = fails["table", "silly"]
    self.assertTrue(list(df.columns) == ["Field", "Error Message", "Error Message (1)", "Error Message (2)"])
    self.assertTrue(set(df["Field"]) == {2} and set(df["Error Message (2)"]) == {'Oops 2'})