def make_pdf():
    """Build a PanDatFactory with four parameters and rebuild it from its full schema.

    The factory has a ``data_table`` table (key ``a``; data ``b``, ``c``) and a
    ``parameters`` table (key ``a``; data ``b``). The returned object is not the
    factory configured here but a new one created from ``schema(True)``.
    """
    factory = PanDatFactory(
        data_table=[["a"], ["b", "c"]],
        parameters=[["a"], ["b"]],
    )
    # (name, second positional argument, keyword rules), registered in this order.
    parameter_specs = (
        ("Something", 100, {"max": 100, "inclusive_max": True}),
        ("Another thing", 5, {"must_be_int": True}),
        ("Untyped thing", "whatever", {"enforce_type_rules": False}),
        ("Last", "boo", {"number_allowed": False, "strings_allowed": "*"}),
    )
    for name, default_value, rules in parameter_specs:
        factory.add_parameter(name, default_value, **rules)
    full_schema = factory.schema(True)
    return PanDatFactory.create_from_full_schema(full_schema)
def test_fk_max_failures(self):
    """Check the effect of ``max_failures`` on ``find_foreign_key_failures``.

    Only ``nutritionQuantities`` is populated (ten rows), and the result is
    inspected for the default call and for ``max_failures`` of 11, 10 and 9.
    """
    tdf = TicDatFactory(**dietSchema())
    addDietForeignKeys(tdf)
    rows = [[f"food_{i}", f"cat_{i}", 10] for i in range(10)]
    dat = tdf.TicDat(nutritionQuantities=rows)
    pan_dat = tdf.copy_to_pandas(dat, drop_pk_columns=False)
    full_schema = tdf.schema(include_ancillary_info=True)
    pdf = PanDatFactory.create_from_full_schema(full_schema)

    # (extra keyword arguments, expected number of entries, expected set of
    # entry lengths). NOTE(review): presumably each entry corresponds to one
    # foreign key added by addDietForeignKeys -- confirm against that helper.
    scenarios = (
        ({}, 2, {10}),
        ({"max_failures": 11}, 2, {10, 1}),
        ({"max_failures": 10}, 1, {10}),
        ({"max_failures": 9}, 1, {9}),
    )
    for kwargs, expected_count, expected_lengths in scenarios:
        failures = pdf.find_foreign_key_failures(pan_dat, **kwargs)
        self.assertTrue(
            len(failures) == expected_count
            and set(map(len, failures.values())) == expected_lengths
        )
def testRoundTrips(self):
    """Round-trip data between TicDat and PanDat objects and compare the results.

    Covers the diet and netflow schemas, then duplicate primary keys, and
    finally ``None`` / ``nan`` values in key fields and in data fields.
    Does nothing when ``self.canRun`` is falsy.
    """
    if not self.canRun:
        return

    # TicDat -> pandas -> TicDat should reproduce the original data.
    round_trip_cases = (
        (dietSchema, dietData, None),
        (netflowSchema, netflowData, addNetflowForeignKeys),
    )
    for schema_fn, data_fn, add_foreign_keys in round_trip_cases:
        tdf = TicDatFactory(**schema_fn())
        tdf.enable_foreign_key_links()
        if add_foreign_keys is not None:
            add_foreign_keys(tdf)
        table_data = {t: getattr(data_fn(), t) for t in tdf.primary_key_fields}
        oldDat = tdf.freeze_me(tdf.TicDat(**table_data))
        full_schema = tdf.schema(include_ancillary_info=True)
        pdf = PanDatFactory.create_from_full_schema(full_schema)
        pan_dat = tdf.copy_to_pandas(oldDat, drop_pk_columns=False)
        self.assertTrue(pdf.good_pan_dat_object(pan_dat))
        tic_dat = pdf.copy_to_tic_dat(pan_dat)
        self.assertTrue(tdf._same_data(oldDat, tic_dat))

    # Two-field primary key (a, b); the key (1, 10) appears in two rows.
    pdf = PanDatFactory(table=[["a", "b"], ["c"]])
    frame = utils.DataFrame({
        "a": [1, 2, 1, 1],
        "b": [10, 10, 10, 11],
        "c": [101, 102, 103, 104],
    })
    pan_dat = pdf.PanDat(table=frame)
    duplicates = pdf.find_duplicates(pan_dat, keep=False)
    self.assertTrue(len(duplicates["table"]) == 2)
    tic_dat = pdf.copy_to_tic_dat(pan_dat)
    # The converted table is expected to hold one row fewer than the frame.
    self.assertTrue(len(tic_dat.table) == len(pan_dat.table) - 1)

    # None inside a key field and inside a data field.
    tdf = TicDatFactory(**pdf.schema())
    tic_dat = tdf.TicDat(table=[[1, 2, 3], [None, 2, 3], [2, 1, None]])
    self.assertTrue(len(tic_dat.table) == 3)
    as_pandas = tdf.copy_to_pandas(tic_dat, drop_pk_columns=False)
    tic_dat_two = pdf.copy_to_tic_dat(as_pandas)
    self.assertFalse(tdf._same_data(tic_dat, tic_dat_two))
    tic_dat3 = tdf.TicDat(
        table=[[1, 2, 3], [float("nan"), 2, 3], [2, 1, float("nan")]])
    # Still not equal: _same_data isn't smart enough to check against nan in
    # the keys, because float("nan") != float("nan").
    self.assertFalse(tdf._same_data(tic_dat3, tic_dat_two))

    # Single-field primary key, so None / nan now appear only in data fields.
    pdf = PanDatFactory(table=[["a"], ["b", "c"]])
    tdf = TicDatFactory(**pdf.schema())
    tic_dat = tdf.TicDat(table=[[1, 2, 3], [2, None, 3], [2, 1, None]])
    as_pandas = tdf.copy_to_pandas(tic_dat, drop_pk_columns=False)
    tic_dat_two = pdf.copy_to_tic_dat(as_pandas)
    self.assertFalse(tdf._same_data(tic_dat, tic_dat_two))
    tic_dat3 = tdf.TicDat(
        table=[[1, 2, 3], [2, float("nan"), 3], [2, 1, float("nan")]])
    # With nans_are_same_for_data_rows=True, _same_data accepts nan values in
    # data fields as matching.
    self.assertTrue(
        tdf._same_data(tic_dat3, tic_dat_two, nans_are_same_for_data_rows=True))