def testDupsOpalytics(self):
    if not self.can_run:
        return
    for hack in [True, False]:
        tdf = TicDatFactory(one=[["a"], ["b", "c"]],
                            two=[["a", "b"], ["c"]],
                            three=[["a", "b", "c"], []])
        tdf2 = TicDatFactory(**{t: [[], ["a", "b", "c"]] for t in tdf.all_tables})
        td = tdf2.TicDat(**{t: [[1, 2, 1], [1, 2, 2], [2, 1, 3], [2, 2, 3],
                                [1, 2, 2], ["new", 1, 2]]
                            for t in tdf.all_tables})
        inputset = create_inputset_mock(tdf2, td, hack)
        pdf = PanDatFactory(**tdf.schema())
        panDat = pdf.opalytics.create_pan_dat(inputset, raw_data=True)
        self.assertTrue(all(len(getattr(panDat, t)) == 6 for t in tdf.all_tables))
        panDat = pdf.opalytics.create_pan_dat(inputset, raw_data=False)
        self.assertTrue(all(len(getattr(panDat, t)) < 6 for t in tdf.all_tables))
        td_1 = tdf.TicDat(**{t: [[1, 2, 1], [1, 2, 2], [2, 1, 3], [2, 2, 3],
                                 [1, 2, 2], ["new", 1, 2]]
                             for t in tdf.all_tables})
        td_2 = pdf.copy_to_tic_dat(panDat)
        self.assertTrue(all(set(getattr(td_1, t)) == set(getattr(td_2, t))
                            for t in tdf.all_tables))

def testDietCleaningOpalytics(self):
    sch = dietSchema()
    sch["categories"][-1].append("_active")
    tdf1 = TicDatFactory(**dietSchema())
    tdf2 = TicDatFactory(**sch)
    ticDat2 = tdf2.copy_tic_dat(dietData())
    for v in ticDat2.categories.values():
        v["_active"] = True
    ticDat2.categories["fat"]["_active"] = False
    ticDat1 = tdf1.copy_tic_dat(dietData())

    input_set = create_inputset_mock_with_active_hack(tdf2, ticDat2)
    pdf1 = PanDatFactory(**tdf1.schema())
    panDat = pdf1.opalytics.create_pan_dat(input_set, raw_data=True)
    self.assertTrue(tdf1._same_data(pdf1.copy_to_tic_dat(panDat), ticDat1))

    panDatPurged = pdf1.opalytics.create_pan_dat(input_set)
    self.assertFalse(tdf1._same_data(pdf1.copy_to_tic_dat(panDatPurged), ticDat1))
    ticDat1.categories.pop("fat")
    tdf1.remove_foreign_key_failures(ticDat1)
    self.assertTrue(tdf1._same_data(pdf1.copy_to_tic_dat(panDatPurged), ticDat1))

def test_nullables(self):
    core_path = os.path.join(_scratchDir, "nullables")
    pdf = PanDatFactory(table_with_stuffs=[["field one"], ["field two"]])
    pdf.set_data_type("table_with_stuffs", "field one")
    pdf.set_data_type("table_with_stuffs", "field two", number_allowed=False,
                      strings_allowed='*', nullable=True)
    dat = TicDatFactory(**pdf.schema()).TicDat(
        table_with_stuffs=[[101, "022"], [202, None], [303, "111"]])
    dat = TicDatFactory(**pdf.schema()).copy_to_pandas(dat, drop_pk_columns=False)
    self.assertFalse(pdf.find_data_type_failures(dat))

    for attr, path in [["csv", core_path + "_csv"], ["xls", core_path + ".xlsx"],
                       ["sql", core_path + ".db"], ["json", core_path + ".json"]]:
        f_or_d = "directory" if attr == "csv" else "file"
        write_func, write_kwargs = utils._get_write_function_and_kwargs(pdf, path, f_or_d)
        write_func(dat, path, **write_kwargs)
        dat_1 = utils._get_dat_object(pdf, "create_pan_dat", path, f_or_d, False)
        self.assertTrue(pdf._same_data(dat, dat_1, nans_are_same_for_data_rows=True))

def testSqlSimple(self):
    if not self.can_run:
        return
    tdf = TicDatFactory(**dietSchema())
    pdf = PanDatFactory(**dietSchema())
    ticDat = tdf.freeze_me(tdf.TicDat(**{t: getattr(dietData(), t)
                                         for t in tdf.primary_key_fields}))
    panDat = pan_dat_maker(dietSchema(), ticDat)
    filePath = os.path.join(_scratchDir, "diet.db")
    pdf.sql.write_file(panDat, filePath)
    sqlPanDat = pdf.sql.create_pan_dat(filePath)
    self.assertTrue(pdf._same_data(panDat, sqlPanDat))
    pdf2 = PanDatFactory(**{t: '*' for t in pdf.all_tables})
    pdf2.sql.write_file(panDat, filePath)
    sqlPanDat = pdf2.sql.create_pan_dat(filePath)
    self.assertTrue(pdf._same_data(panDat, sqlPanDat))

    tdf = TicDatFactory(**netflowSchema())
    pdf = PanDatFactory(**netflowSchema())
    ticDat = tdf.freeze_me(tdf.TicDat(**{t: getattr(netflowData(), t)
                                         for t in tdf.primary_key_fields}))
    panDat = pan_dat_maker(netflowSchema(), ticDat)
    filePath = os.path.join(_scratchDir, "netflow.db")
    pdf.sql.write_file(panDat, filePath)
    panDat2 = pdf.sql.create_pan_dat(filePath)
    self.assertTrue(pdf._same_data(panDat, panDat2))
    pdf2 = PanDatFactory(**{t: '*' for t in pdf.all_tables})
    sqlPanDat = pdf2.sql.create_pan_dat(filePath)
    self.assertTrue(pdf._same_data(panDat, sqlPanDat))

def testFindDups(self):
    pdf = PanDatFactory(**sillyMeSchema())
    tdf = TicDatFactory(**{k: [[], list(pkfs) + list(dfs)]
                           for k, (pkfs, dfs) in sillyMeSchema().items()})
    rows = [(1, 2, 3, 4), (1, 20, 30, 40), (10, 20, 30, 40)]
    ticDat = tdf.TicDat(**{t: rows for t in tdf.all_tables})
    panDat = pdf.copy_pan_dat(copy_to_pandas_with_reset(tdf, ticDat))
    dups = pdf.find_duplicates(panDat)
    self.assertTrue(set(dups) == {'a'} and set(dups['a']['aField']) == {1})
    dups = pdf.find_duplicates(panDat, as_table=False, keep=False)
    self.assertTrue(set(dups) == {'a'} and dups['a'].value_counts()[True] == 2)
    dups = pdf.find_duplicates(panDat, as_table=False)
    self.assertTrue(set(dups) == {'a'} and dups['a'].value_counts()[True] == 1)

    rows = [(1, 2, 3, 4), (1, 20, 30, 40), (10, 20, 30, 40), (1, 2, 3, 40)]
    ticDat = tdf.TicDat(**{t: rows for t in tdf.all_tables})
    panDat = pdf.copy_pan_dat(copy_to_pandas_with_reset(tdf, ticDat))
    dups = pdf.find_duplicates(panDat, keep=False)
    self.assertTrue(set(dups) == {'a', 'b'} and set(dups['a']['aField']) == {1})
    dups = pdf.find_duplicates(panDat, as_table=False, keep=False)
    self.assertTrue({k: v.value_counts()[True] for k, v in dups.items()} == {'a': 3, 'b': 2})

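# Hedged illustration (not part of the original suite): a minimal, self-contained sketch of
# the find_duplicates pattern exercised in testFindDups. The table and field names here are
# hypothetical; pandas is assumed to be importable.
def _find_duplicates_sketch():
    from pandas import DataFrame
    pdf = PanDatFactory(items=[["name"], ["value"]])
    # "a" repeats in the primary-key field, so it should be reported as a duplicate.
    dat = pdf.PanDat(items=DataFrame({"name": ["a", "a", "b"], "value": [1, 2, 3]}))
    dups = pdf.find_duplicates(dat)
    assert set(dups) == {"items"} and set(dups["items"]["name"]) == {"a"}
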
def testSpacesOpalytics(self):
    if not self.can_run:
        return
    for hack, raw_data in list(itertools.product(*(([True, False],) * 2))):
        tdf = TicDatFactory(**spacesSchema())
        ticDat = tdf.TicDat(**spacesData())
        inputset = create_inputset_mock(tdf, ticDat, hack)
        pdf = PanDatFactory(**tdf.schema())
        panDat = pdf.opalytics.create_pan_dat(inputset, raw_data=raw_data)
        self.assertTrue(tdf._same_data(ticDat, pdf.copy_to_tic_dat(panDat)))

def testMissingOpalyticsTable(self):
    if not self.can_run:
        return
    tdf = TicDatFactory(**dietSchema())
    ticDat = tdf.freeze_me(tdf.copy_tic_dat(dietData()))
    inputset = create_inputset_mock(tdf, ticDat)
    pdf = PanDatFactory(**dict(dietSchema(), missing_table=[["a"], ["b"]]))
    panDat = pdf.opalytics.create_pan_dat(inputset)
    ticDat2 = pdf.copy_to_tic_dat(panDat)
    self.assertTrue(tdf._same_data(ticDat, ticDat2))
    self.assertFalse(ticDat2.missing_table)

def testDietCleaningOpalytisThree(self):
    tdf = TicDatFactory(**dietSchema())
    tdf.add_data_row_predicate("categories", lambda row: row["maxNutrition"] >= 66)
    addDietForeignKeys(tdf)
    ticDat = tdf.copy_tic_dat(dietData())

    pdf = PanDatFactory(**tdf.schema())
    pdf.add_data_row_predicate("categories", lambda row: row["maxNutrition"] >= 66)
    addDietForeignKeys(pdf)

    input_set = create_inputset_mock(tdf, ticDat)
    panDat = pdf.opalytics.create_pan_dat(input_set, raw_data=True)
    self.assertTrue(tdf._same_data(pdf.copy_to_tic_dat(panDat), ticDat))

    panDatPurged = pdf.opalytics.create_pan_dat(input_set, raw_data=False)
    self.assertFalse(tdf._same_data(pdf.copy_to_tic_dat(panDatPurged), ticDat))
    ticDat.categories.pop("fat")
    self.assertFalse(tdf._same_data(pdf.copy_to_tic_dat(panDatPurged), ticDat))
    tdf.remove_foreign_key_failures(ticDat)
    self.assertTrue(tdf._same_data(pdf.copy_to_tic_dat(panDatPurged), ticDat))

def testDietCleaningOpalyticsTwo(self):
    tdf = TicDatFactory(**dietSchema())
    addDietForeignKeys(tdf)
    tdf.set_data_type("categories", "maxNutrition", min=66, inclusive_max=True)
    ticDat = tdf.copy_tic_dat(dietData())
    input_set = create_inputset_mock(tdf, ticDat)

    pdf = PanDatFactory(**dietSchema())
    addDietForeignKeys(pdf)
    pdf.set_data_type("categories", "maxNutrition", min=66, inclusive_max=True)

    panDat = pdf.opalytics.create_pan_dat(input_set, raw_data=True)
    self.assertTrue(tdf._same_data(pdf.copy_to_tic_dat(panDat), ticDat))

    panDatPurged = pdf.opalytics.create_pan_dat(input_set, raw_data=False)
    self.assertFalse(tdf._same_data(pdf.copy_to_tic_dat(panDatPurged), ticDat))
    ticDat.categories.pop("fat")
    self.assertFalse(tdf._same_data(pdf.copy_to_tic_dat(panDatPurged), ticDat))
    tdf.remove_foreign_key_failures(ticDat)
    self.assertTrue(tdf._same_data(pdf.copy_to_tic_dat(panDatPurged), ticDat))

def testDateTimeTwo(self):
    file = os.path.join(_scratchDir, "datetime_pd.xls")
    df = utils.pd.DataFrame({"a": list(map(utils.pd.Timestamp,
                                           ["June 13 1960 4:30PM", "Dec 11 1970 1AM",
                                            "Sept 11 2001 9:30AM"]))})
    df.to_excel(file, "Cool Runnings")
    pdf = PanDatFactory(cool_runnings=[["a"], []])
    pdf.set_data_type("cool_runnings", "a", datetime=True)
    dat = pdf.xls.create_pan_dat(file)
    self.assertTrue(set(dat.cool_runnings["a"]) == set(df["a"]))

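# Hedged sketch (not in the original file): set_data_type(..., datetime=True) is expected to
# accept pandas Timestamps directly, so an in-memory PanDat built from Timestamps should show
# no data type failures. The helper and table names below are illustrative assumptions.
def _datetime_type_sketch():
    pdf = PanDatFactory(events=[["when"], []])
    pdf.set_data_type("events", "when", datetime=True)
    dat = pdf.PanDat(events=utils.pd.DataFrame({"when": [utils.pd.Timestamp("Dec 11 1970 1AM")]}))
    assert not pdf.find_data_type_failures(dat)
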
def testSillyCleaningOpalyticsOne(self):
    tdf = TicDatFactory(**sillyMeSchema())
    tdf.set_data_type("c", "cData4", number_allowed=False, strings_allowed=['d'])
    ticDat = tdf.TicDat(**sillyMeData())
    input_set = create_inputset_mock(tdf, ticDat)

    pdf = PanDatFactory(**sillyMeSchema())
    pdf.set_data_type("c", "cData4", number_allowed=False, strings_allowed=['d'])

    panDat = pdf.opalytics.create_pan_dat(input_set, raw_data=True)
    self.assertTrue(tdf._same_data(pdf.copy_to_tic_dat(panDat), ticDat))

    panDatPurged = pdf.opalytics.create_pan_dat(input_set, raw_data=False)
    self.assertFalse(tdf._same_data(pdf.copy_to_tic_dat(panDatPurged), ticDat))
    ticDat.c.pop()
    ticDat.c.pop(0)
    self.assertTrue(tdf._same_data(pdf.copy_to_tic_dat(panDatPurged), ticDat))

# Helper for the data-row-predicate checks; `self` and `pandat` are expected to come from the
# enclosing scope.
def perform_predicate_checks(sch):
    pdf = PanDatFactory(**sch)
    pdf.add_data_row_predicate("foods",
                               lambda row: numericish(row["cost"]) and not isnan(row["cost"]),
                               "cost")
    good_qty = lambda qty: numericish(qty) and 5 < qty <= 12
    pdf.add_data_row_predicate("nutritionQuantities", lambda row: good_qty(row["qty"]), "qty")
    pdf.add_data_row_predicate("categories",
                               lambda row: all(map(numericish, [row["minNutrition"],
                                                                row["maxNutrition"]]))
                               and row["maxNutrition"] >= row["minNutrition"],
                               "minmax")
    failed = pdf.find_data_row_failures(pandat)
    self.assertTrue(set(failed) == {('foods', 'cost'), ('nutritionQuantities', 'qty'),
                                    ('categories', 'minmax')})
    self.assertTrue(set(failed['foods', 'cost']["name"]) == {'b'})
    self.assertTrue(set({(v["food"], v["category"]) for v in
                         failed['nutritionQuantities', 'qty'].T.to_dict().values()}) ==
                    {('b', '1'), ('a', '2'), ('a', '1'), ('b', '2')})
    self.assertTrue(set(failed['categories', 'minmax']["name"]) == {'2'})
    failed = pdf.find_data_row_failures(pandat, as_table=False)
    self.assertTrue(4 == failed['nutritionQuantities', 'qty'].value_counts()[True])

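# Hedged, self-contained sketch (not part of the original tests) of the
# add_data_row_predicate / find_data_row_failures pairing used by perform_predicate_checks.
# Table, field, and predicate names below are illustrative assumptions.
def _row_predicate_sketch():
    from pandas import DataFrame
    pdf = PanDatFactory(foods=[["name"], ["cost"]])
    # Flag any row whose cost is not strictly positive.
    pdf.add_data_row_predicate("foods", lambda row: row["cost"] > 0, "positive cost")
    dat = pdf.PanDat(foods=DataFrame({"name": ["a", "b"], "cost": [1.0, -2.0]}))
    failed = pdf.find_data_row_failures(dat)
    # Failures are keyed by (table, predicate name), mirroring the assertions above.
    assert set(failed) == {("foods", "positive cost")}
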
def testJsonCross(self):
    if not self.can_run:
        return
    tdf = TicDatFactory(**dietSchema())
    pdf = PanDatFactory(**dietSchema())
    ticDat = tdf.freeze_me(tdf.TicDat(**{t: getattr(dietData(), t)
                                         for t in tdf.primary_key_fields}))
    panDat = pan_dat_maker(dietSchema(), ticDat)
    filePath = os.path.join(_scratchDir, "diet_cross.json")
    pdf.json.write_file(panDat, filePath)
    ticDat2 = tdf.json.create_tic_dat(filePath, from_pandas=True)
    self.assertTrue(tdf._same_data(ticDat, ticDat2, epsilon=0.0001))
    tdf.json.write_file(ticDat, filePath, allow_overwrite=True, to_pandas=True)
    panDat2 = pdf.json.create_pan_dat(filePath)
    self.assertTrue(pdf._same_data(panDat, panDat2, epsilon=0.0001))

def testDietOpalytics(self):
    if not self.can_run:
        return
    for hack, raw_data, activeEnabled in list(itertools.product(*(([True, False],) * 3))):
        tdf = TicDatFactory(**dietSchema())
        ticDat = tdf.freeze_me(tdf.copy_tic_dat(dietData()))
        inputset = create_inputset_mock(tdf, ticDat, hack, activeEnabled)
        pdf = PanDatFactory(**dietSchema())
        panDat = pdf.opalytics.create_pan_dat(inputset)
        self.assertFalse(pdf.find_duplicates(panDat))
        ticDat2 = pdf.copy_to_tic_dat(panDat)
        self.assertTrue(tdf._same_data(ticDat, ticDat2))

        tdf2 = TicDatFactory(**{k: [pks, list(dfs) + ["dmy"]]
                                for k, (pks, dfs) in tdf.schema().items()})
        _dat = tdf2.copy_tic_dat(ticDat)
        panDat = pdf.opalytics.create_pan_dat(create_inputset_mock(tdf2, _dat, hack))
        self.assertTrue(tdf._same_data(ticDat, pdf.copy_to_tic_dat(panDat)))

        pdf2 = PanDatFactory(**tdf2.schema())
        ex = self.firesException(lambda: pdf2.opalytics.create_pan_dat(inputset,
                                                                       raw_data=raw_data))
        self.assertTrue(all(_ in ex for _ in ["(table, field) pairs missing"] +
                            ["'%s', 'dmy'" % _ for _ in pdf2.all_tables]))

def testXlsSpacey(self):
    if not self.can_run:
        return
    tdf = TicDatFactory(**spacesSchema())
    pdf = PanDatFactory(**spacesSchema())
    ticDat = tdf.TicDat(**spacesData())
    panDat = pan_dat_maker(spacesSchema(), ticDat)
    ext = ".xlsx"
    filePath = os.path.join(_scratchDir, "spaces_2%s" % ext)
    pdf.xls.write_file(panDat, filePath, case_space_sheet_names=True)
    panDat2 = pdf.xls.create_pan_dat(filePath)
    self.assertTrue(pdf._same_data(panDat, panDat2))

    tdf = TicDatFactory(**netflowSchema())
    pdf = PanDatFactory(**netflowSchema())
    ticDat = tdf.freeze_me(tdf.TicDat(**{t: getattr(netflowData(), t)
                                         for t in tdf.primary_key_fields}))
    panDat = pan_dat_maker(netflowSchema(), ticDat)
    filePath = os.path.join(_scratchDir, "spaces_2_2%s" % ext)
    pdf.xls.write_file(panDat, filePath, case_space_sheet_names=True)
    panDat2 = pdf.xls.create_pan_dat(filePath)
    self.assertTrue(pdf._same_data(panDat, panDat2))

def testJsonSpacey(self):
    if not self.can_run:
        return
    tdf = TicDatFactory(**spacesSchema())
    pdf = PanDatFactory(**spacesSchema())
    ticDat = tdf.TicDat(**spacesData())
    panDat = pan_dat_maker(spacesSchema(), ticDat)
    ext = ".json"
    filePath = os.path.join(_scratchDir, "spaces_2%s" % ext)
    pdf.json.write_file(panDat, filePath, case_space_table_names=True)
    panDat2 = pdf.json.create_pan_dat(filePath)
    self.assertTrue(pdf._same_data(panDat, panDat2))
    panDat3 = pdf.json.create_pan_dat(pdf.json.write_file(panDat, "", case_space_table_names=True))
    self.assertTrue(pdf._same_data(panDat, panDat3))

    tdf = TicDatFactory(**netflowSchema())
    pdf = PanDatFactory(**netflowSchema())
    ticDat = tdf.freeze_me(tdf.TicDat(**{t: getattr(netflowData(), t)
                                         for t in tdf.primary_key_fields}))
    panDat = pan_dat_maker(netflowSchema(), ticDat)
    filePath = os.path.join(_scratchDir, "spaces_2_2%s" % ext)
    pdf.json.write_file(panDat, filePath, case_space_table_names=True)
    panDat2 = pdf.json.create_pan_dat(filePath)
    self.assertTrue(pdf._same_data(panDat, panDat2))
    panDat3 = pdf.json.create_pan_dat(pdf.json.write_file(panDat, "", case_space_table_names=True))
    self.assertTrue(pdf._same_data(panDat, panDat3))
    dicted = json.loads(pdf.json.write_file(panDat, "", orient='columns'))
    panDat4 = pdf.PanDat(**dicted)
    self.assertTrue(pdf._same_data(panDat, panDat4, epsilon=1e-5))

def testSillyCleaningOpalyticsThree(self):
    tdf = TicDatFactory(**sillyMeSchema())
    tdf.add_data_row_predicate("c", lambda row: row["cData4"] != 4)
    tdf.add_data_row_predicate("c", lambda row: row["cData4"] != 24)
    ticDat = tdf.TicDat(**sillyMeData())
    input_set = create_inputset_mock(tdf, ticDat)

    pdf = PanDatFactory(**sillyMeSchema())
    pdf.add_data_row_predicate("c", lambda row: row["cData4"] != 4)
    pdf.add_data_row_predicate("c", lambda row: row["cData4"] != 24)

    panDat = pdf.opalytics.create_pan_dat(input_set, raw_data=True)
    self.assertTrue(tdf._same_data(pdf.copy_to_tic_dat(panDat), ticDat))

    panDatPurged = pdf.opalytics.create_pan_dat(input_set, raw_data=False)
    self.assertFalse(tdf._same_data(pdf.copy_to_tic_dat(panDatPurged), ticDat))
    ticDat.c.pop()
    ticDat.c.pop(0)
    self.assertTrue(tdf._same_data(pdf.copy_to_tic_dat(panDatPurged), ticDat))

def test_data_type_max_failures(self):
    pdf = PanDatFactory(table_one=[["Field"], []], table_two=[[], ["Field"]])
    for t in ["table_one", "table_two"]:
        pdf.set_data_type(t, "Field")
    dat = pdf.PanDat(table_one=DataFrame({"Field": list(range(1, 11)) +
                                                   [-_ for _ in range(1, 11)]}),
                     table_two=DataFrame({"Field": [10.1] * 10 + [-2] * 10}))
    errs = pdf.find_data_type_failures(dat)
    self.assertTrue(len(errs) == 2 and all(len(_) == 10 for _ in errs.values()))
    errs = pdf.find_data_type_failures(dat, max_failures=11)
    self.assertTrue(len(errs) == 2)
    self.assertTrue(any(len(_) == 10 for _ in errs.values()) and
                    any(len(_) == 1 for _ in errs.values()))
    errs = pdf.find_data_type_failures(dat, max_failures=10)
    self.assertTrue(len(errs) == 1 and all(len(_) == 10 for _ in errs.values()))
    errs = pdf.find_data_type_failures(dat, max_failures=9)
    self.assertTrue(len(errs) == 1 and all(len(_) == 9 for _ in errs.values()))

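# Hedged sketch (not in the original file) of the default data type behavior that
# test_data_type_max_failures relies on: set_data_type with no keyword arguments appears to
# accept non-negative numbers only, so negative entries surface in find_data_type_failures.
# The table and helper names are illustrative.
def _default_data_type_sketch():
    from pandas import DataFrame
    pdf = PanDatFactory(measurements=[["Field"], []])
    pdf.set_data_type("measurements", "Field")
    dat = pdf.PanDat(measurements=DataFrame({"Field": [1, 2, -3]}))
    failures = pdf.find_data_type_failures(dat)
    # Failures are keyed by (table, field); only the -3 row should be flagged.
    assert set(failures) == {("measurements", "Field")}
    assert len(failures["measurements", "Field"]) == 1
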
def testSqlSpaceyTwo(self):
    if not self.can_run:
        return
    self.assertTrue(pandatio.sql, "this unit test requires SQLite installed")

    tdf = TicDatFactory(**spacesSchema())
    pdf = PanDatFactory(**spacesSchema())
    ticDat = tdf.TicDat(**{"a_table": {1: [1, 2, "3"],
                                       22.2: (12, 0.12, "something"),
                                       0.23: (11, 12, "thirt")},
                           "b_table": {(1, 2, "foo"): 1, (1012.22, 4, "0012"): 12},
                           "c_table": (("this", 2, 3, 4), ("that", 102.212, 3, 5.5),
                                       ("another", 5, 12.5, 24))})
    panDat = pan_dat_maker(spacesSchema(), ticDat)
    ext = ".db"
    filePath = os.path.join(_scratchDir, "spaces_2%s" % ext)
    with pandatio.sql.connect(filePath) as con:
        pdf.sql.write_file(panDat, db_file_path=None, con=con, case_space_table_names=True)
    with pandatio.sql.connect(filePath) as con:
        panDat2 = pdf.sql.create_pan_dat(db_file_path=None, con=con)
    self.assertTrue(pdf._same_data(panDat, panDat2))

    tdf = TicDatFactory(**netflowSchema())
    pdf = PanDatFactory(**netflowSchema())
    ticDat = tdf.freeze_me(tdf.TicDat(**{t: getattr(netflowData(), t)
                                         for t in tdf.primary_key_fields}))
    panDat = pan_dat_maker(netflowSchema(), ticDat)
    filePath = os.path.join(_scratchDir, "spaces_2_2%s" % ext)
    with pandatio.sql.connect(filePath) as con:
        pdf.sql.write_file(panDat, db_file_path="", con=con, case_space_table_names=True)
    with pandatio.sql.connect(filePath) as con:
        panDat2 = pdf.sql.create_pan_dat(None, con)
    self.assertTrue(pdf._same_data(panDat, panDat2))

def test_fk_max_failures(self):
    tdf = TicDatFactory(**dietSchema())
    addDietForeignKeys(tdf)
    dat = tdf.TicDat(nutritionQuantities=[[f"food_{_}", f"cat_{_}", 10] for _ in range(10)])
    pan_dat = tdf.copy_to_pandas(dat, drop_pk_columns=False)
    pdf = PanDatFactory.create_from_full_schema(tdf.schema(include_ancillary_info=True))
    errs = pdf.find_foreign_key_failures(pan_dat)
    self.assertTrue(len(errs) == 2 and all(len(_) == 10 for _ in errs.values()))
    errs = pdf.find_foreign_key_failures(pan_dat, max_failures=11)
    self.assertTrue(len(errs) == 2 and set(map(len, errs.values())) == {10, 1})
    errs = pdf.find_foreign_key_failures(pan_dat, max_failures=10)
    self.assertTrue(len(errs) == 1 and all(len(_) == 10 for _ in errs.values()))
    errs = pdf.find_foreign_key_failures(pan_dat, max_failures=9)
    self.assertTrue(len(errs) == 1 and all(len(_) == 9 for _ in errs.values()))

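# Hedged, minimal sketch (not part of the original suite) of the add_foreign_key /
# find_foreign_key_failures / remove_foreign_key_failures cycle that test_fk_max_failures
# exercises at scale. Table and field names are illustrative.
def _foreign_key_sketch():
    from pandas import DataFrame
    pdf = PanDatFactory(parents=[["name"], []], children=[["name"], ["parent"]])
    pdf.add_foreign_key("children", "parents", ["parent", "name"])
    dat = pdf.PanDat(parents=DataFrame({"name": ["p1"]}),
                     children=DataFrame({"name": ["c1", "c2"], "parent": ["p1", "nope"]}))
    # "c2" references a parent that does not exist, so a failure is reported and removable.
    assert pdf.find_foreign_key_failures(dat)
    pdf.remove_foreign_key_failures(dat)
    assert not pdf.find_foreign_key_failures(dat)
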
def test_missing_tables(self):
    core_path = os.path.join(_scratchDir, "missing_tables")
    pdf_1 = PanDatFactory(this=[["Something"], ["Another"]])
    pdf_2 = PanDatFactory(**dict(pdf_1.schema(), that=[["What", "Ever"], []]))
    dat = pdf_1.PanDat(this={"Something": ["a", "b", "c"], "Another": [2, 3, 5]})
    for attr, path in [["sql", core_path + ".db"], ["csv", core_path + "_csv"],
                       ["json", core_path + ".json"], ["xls", core_path + ".xlsx"]]:
        func = "write_directory" if attr == "csv" else "write_file"
        getattr(getattr(pdf_1, attr), func)(dat, path)
        dat_1 = getattr(pdf_2, attr).create_pan_dat(path)
        self.assertTrue(pdf_1._same_data(dat, dat_1))

def testCsvSpacey(self):
    if not self.can_run:
        return
    self.assertTrue(pandatio.sql, "this unit test requires SQLite installed")

    tdf = TicDatFactory(**spacesSchema())
    pdf = PanDatFactory(**spacesSchema())
    ticDat = tdf.TicDat(**{"a_table": {1: [1, 2, "3"],
                                       22.2: (12, 0.12, "something"),
                                       0.23: (11, 12, "thirt")},
                           "b_table": {(1, 2, "foo"): 1, (1012.22, 4, "0012"): 12},
                           "c_table": (("this", 2, 3, 4), ("that", 102.212, 3, 5.5),
                                       ("another", 5, 12.5, 24))})
    panDat = pan_dat_maker(spacesSchema(), ticDat)
    dirPath = os.path.join(_scratchDir, "spaces_2_csv")
    pdf.csv.write_directory(panDat, dirPath, case_space_table_names=True)
    panDat2 = pdf.csv.create_pan_dat(dirPath)
    self.assertTrue(pdf._same_data(panDat, panDat2))

    tdf = TicDatFactory(**netflowSchema())
    pdf = PanDatFactory(**netflowSchema())
    ticDat = tdf.freeze_me(tdf.TicDat(**{t: getattr(netflowData(), t)
                                         for t in tdf.primary_key_fields}))
    panDat = pan_dat_maker(netflowSchema(), ticDat)
    dirPath = os.path.join(_scratchDir, "spaces_2_2_csv")
    pdf.csv.write_directory(panDat, dirPath, case_space_table_names=True, sep=":")
    panDat2 = pdf.csv.create_pan_dat(dirPath, sep=":")
    self.assertTrue(pdf._same_data(panDat, panDat2))

def testVariousCoverages(self):
    pdf = PanDatFactory(**dietSchema())
    _d = dict(categories={"minNutrition": 0, "maxNutrition": float("inf")},
              foods={"cost": 0}, nutritionQuantities={"qty": 0})
    pdf.set_default_values(**_d)
    self.assertTrue(pdf._default_values == _d)

    pdf = PanDatFactory(**netflowSchema())
    addNetflowForeignKeys(pdf)
    pdf.clear_foreign_keys("arcs")
    self.assertTrue({_[0] for _ in pdf._foreign_keys} == {"cost", "inflow"})

    pdf.add_data_row_predicate("arcs", lambda row: True)
    pdf.add_data_row_predicate("arcs", lambda row: True, "dummy")
    pdf.add_data_row_predicate("arcs", None, 0)
    pdf = pdf.clone()
    self.assertTrue(set(pdf._data_row_predicates["arcs"]) == {"dummy"})

def testDataTypes(self):
    if not self.canRun:
        return
    tdf = TicDatFactory(**dietSchema())
    pdf = PanDatFactory(**dietSchema())

    ticdat = tdf.TicDat()
    ticdat.foods["a"] = 12
    ticdat.foods["b"] = None
    ticdat.categories["1"] = {"maxNutrition": 100, "minNutrition": 40}
    ticdat.categories["2"] = [10, 20]
    for f, p in itertools.product(ticdat.foods, ticdat.categories):
        ticdat.nutritionQuantities[f, p] = 5
    ticdat.nutritionQuantities['a', 2] = 12

    pandat = pdf.copy_pan_dat(copy_to_pandas_with_reset(tdf, ticdat))

    self.assertFalse(pdf.find_data_type_failures(pandat))
    pandat_copy = pdf.replace_data_type_failures(pdf.copy_pan_dat(pandat))
    self.assertTrue(pdf._same_data(pandat, pandat_copy, epsilon=0.00001))

    pdf = PanDatFactory(**dietSchema())
    pdf.set_data_type("foods", "cost", nullable=False)
    pdf.set_data_type("nutritionQuantities", "qty", min=5, inclusive_min=False,
                      max=12, inclusive_max=True)
    failed = pdf.find_data_type_failures(pandat)
    self.assertTrue(set(failed) == {('foods', 'cost'), ('nutritionQuantities', 'qty')})
    self.assertTrue(set(failed['foods', 'cost']["name"]) == {'b'})
    self.assertTrue(set({(v["food"], v["category"]) for v in
                         failed['nutritionQuantities', 'qty'].T.to_dict().values()}) ==
                    {('b', '1'), ('a', '2'), ('a', '1'), ('b', '2')})

    failed = pdf.find_data_type_failures(pandat, as_table=False)
    self.assertTrue(4 == failed['nutritionQuantities', 'qty'].value_counts()[True])

    fixed = pdf.replace_data_type_failures(pdf.copy_pan_dat(pandat),
                                           {("nutritionQuantities", "qty"): 5.15})
    self.assertTrue(set(fixed.foods["cost"]) == {0.0, 12.0})
    self.assertTrue(set(fixed.nutritionQuantities["qty"]) == {5.15, 12.0})

    tdf = TicDatFactory(**netflowSchema())
    tdf.enable_foreign_key_links()
    addNetflowForeignKeys(tdf)
    pdf = PanDatFactory(**netflowSchema())
    ticdat = tdf.copy_tic_dat(netflowData())
    for n in ticdat.nodes["Detroit"].arcs_source:
        ticdat.arcs["Detroit", n] = n
    pandat = pdf.copy_pan_dat(copy_to_pandas_with_reset(tdf, ticdat))
    self.assertFalse(pdf.find_data_type_failures(pandat))

    pdf = PanDatFactory(**netflowSchema())
    pdf.set_data_type("arcs", "capacity", strings_allowed="*")
    self.assertFalse(pdf.find_data_type_failures(pandat))

    pdf = PanDatFactory(**netflowSchema())
    pdf.set_data_type("arcs", "capacity", strings_allowed=["Boston", "Seattle", "lumberjack"])
    failed = pdf.find_data_type_failures(pandat)
    self.assertTrue(set(failed) == {('arcs', 'capacity')})
    self.assertTrue(set({(v["source"], v["destination"]) for v in
                         failed['arcs', 'capacity'].T.to_dict().values()}) ==
                    {("Detroit", "New York")})

    pdf.replace_data_type_failures(pandat)
    self.assertTrue(set(pandat.arcs["capacity"]) == {120, 'Boston', 0, 'Seattle'})

# Helper factory: declares several typed parameters, then round-trips the factory through
# create_from_full_schema(schema(True)) so those declarations carry over to the returned object.
def make_pdf():
    pdf = PanDatFactory(data_table=[["a"], ["b", "c"]], parameters=[["a"], ["b"]])
    pdf.add_parameter("Something", 100, max=100, inclusive_max=True)
    pdf.add_parameter("Another thing", 5, must_be_int=True)
    pdf.add_parameter("Untyped thing", "whatever", enforce_type_rules=False)
    pdf.add_parameter("Last", 'boo', number_allowed=False, strings_allowed='*')
    return PanDatFactory.create_from_full_schema(pdf.schema(True))

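# Hedged usage sketch (not part of the original file): building a PanDat against the factory
# returned by make_pdf, with the parameters table carrying (name, value) rows for two of the
# parameters declared above. The helper name and the sample values are illustrative.
def _make_pdf_usage_sketch():
    from pandas import DataFrame
    pdf = make_pdf()
    return pdf.PanDat(data_table=DataFrame({"a": [1], "b": [2], "c": [3]}),
                      parameters=DataFrame({"a": ["Something", "Another thing"],
                                            "b": [90, 2]}))
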
def testDictConstructions(self):
    tdf = TicDatFactory(**dietSchema())
    pdf = PanDatFactory(**dietSchema())
    ticDat = tdf.freeze_me(tdf.TicDat(**{t: getattr(dietData(), t)
                                         for t in tdf.primary_key_fields}))
    panDat = pan_dat_maker(dietSchema(), ticDat)
    panDat2 = pdf.PanDat(**{t: getattr(panDat, t).to_dict() for t in pdf.all_tables})
    panDat3 = pdf.PanDat(**{t: getattr(panDat, t).to_dict(orient="list")
                            for t in pdf.all_tables})
    panDat3_1 = pdf.PanDat(**{t: list(map(list, getattr(panDat, t).itertuples(index=False)))
                              for t in pdf.all_tables})

    self.assertTrue(all(pdf._same_data(panDat, _) for _ in [panDat2, panDat3, panDat3_1]))

    panDat.foods["extra"] = 12
    panDat4 = pdf.PanDat(**{t: getattr(panDat, t).to_dict(orient="list")
                            for t in pdf.all_tables})
    self.assertTrue(pdf._same_data(panDat, panDat4))
    self.assertTrue(set(panDat4.foods["extra"]) == {12})

    tdf = TicDatFactory(**netflowSchema())
    pdf = PanDatFactory(**netflowSchema())
    ticDat = tdf.freeze_me(tdf.TicDat(**{t: getattr(netflowData(), t)
                                         for t in tdf.primary_key_fields}))
    panDat = pan_dat_maker(netflowSchema(), ticDat)
    panDat2 = pdf.PanDat(**{t: getattr(panDat, t).to_dict() for t in pdf.all_tables})
    panDat3 = pdf.PanDat(**{t: getattr(panDat, t).to_dict(orient="records")
                            for t in pdf.all_tables})
    self.assertTrue(all(pdf._same_data(panDat, _) for _ in [panDat2, panDat3]))

    panDat.cost["extra"] = "boger"
    panDat4 = pdf.PanDat(**{t: getattr(panDat, t).to_dict(orient="list")
                            for t in pdf.all_tables})
    self.assertTrue(pdf._same_data(panDat, panDat4))
    self.assertTrue(set(panDat4.cost["extra"]) == {"boger"})

def testAdditionalFKs(self):
    pdf = PanDatFactory(pt1=[["F1"], []], pt2=[["F2"], []], pt3=[["F1", "F2"], []],
                        pt4=[["F1"], ["F2"]], pt5=[[], ["F1", "F2"]])
    for c in ["pt3", "pt4", "pt5"]:
        pdf.add_foreign_key(c, "pt1", ["F1", "F1"])
        pdf.add_foreign_key(c, "pt2", ["F2", "F2"])
    tdf = TicDatFactory(**pdf.schema())

    def pan_dat_(_):
        rtn = pdf.copy_pan_dat(copy_to_pandas_with_reset(tdf, _))
        self.assertFalse(pdf.find_duplicates(rtn))
        return rtn

    ticDat = tdf.TicDat(pt1=[1, 2, 3, 4], pt2=[5, 6, 7, 8])
    for f1, f2 in itertools.product(range(1, 5), range(5, 9)):
        ticDat.pt3[f1, f2] = {}
        ticDat.pt4[f1] = f2
        ticDat.pt5.append((f1, f2))
    origDat = tdf.copy_tic_dat(ticDat, freeze_it=True)
    self.assertFalse(pdf.find_foreign_key_failures(pan_dat_(origDat)))

    ticDat.pt3["no", 6] = ticDat.pt3[1, "no"] = {}
    ticDat.pt4["no"] = 6
    ticDat.pt4["nono"] = 6.01
    panDat = pan_dat_(ticDat)
    fails1 = pdf.find_foreign_key_failures(panDat)
    self.assertTrue(fails1)
    pdf.remove_foreign_key_failures(panDat)
    self.assertFalse(pdf.find_foreign_key_failures(panDat))
    self.assertTrue(pdf._same_data(panDat, pan_dat_(origDat)))

    orig_lens = {t: len(getattr(origDat, t)) for t in tdf.all_tables}
    ticDat.pt3["no", 6] = ticDat.pt3[1, "no"] = {}
    ticDat.pt4["no"] = 6
    ticDat.pt4["nono"] = 6.01
    ticDat.pt5.append(("no", 6))
    ticDat.pt5.append((1, "no"))
    panDat = pan_dat_(ticDat)
    fails2 = pdf.find_foreign_key_failures(panDat)
    self.assertTrue(set(fails1) != set(fails2) and set(fails1).issubset(fails2))
    pdf.remove_foreign_key_failures(panDat)
    self.assertFalse(pdf.find_foreign_key_failures(panDat))
    self.assertTrue({t: len(getattr(panDat, t)) for t in tdf.all_tables} == orig_lens)

def testBasicFKs(self):
    for cloning in [True, False, "*"]:
        clone_me_maybe = (lambda x: x.clone(tdf.all_tables if cloning == "*" else None)
                          if cloning else x)
        pdf = PanDatFactory(plants=[["name"], ["stuff", "otherstuff"]],
                            lines=[["name"], ["plant", "weird stuff"]],
                            line_descriptor=[["name"], ["booger"]],
                            products=[["name"], ["gover"]],
                            production=[["line", "product"], ["min", "max"]],
                            pureTestingTable=[[], ["line", "plant", "product", "something"]],
                            extraProduction=[["line", "product"], ["extramin", "extramax"]],
                            weirdProduction=[["line1", "line2", "product"],
                                             ["weirdmin", "weirdmax"]])
        pdf.add_foreign_key("production", "lines", ("line", "name"))
        pdf.add_foreign_key("production", "products", ("product", "name"))
        pdf.add_foreign_key("lines", "plants", ("plant", "name"))
        pdf.add_foreign_key("line_descriptor", "lines", ("name", "name"))
        for f in set(pdf.data_fields["pureTestingTable"]).difference({"something"}):
            pdf.add_foreign_key("pureTestingTable", "%ss" % f, (f, "name"))
        pdf.add_foreign_key("extraProduction", "production",
                            (("line", "line"), ("product", "product")))
        pdf.add_foreign_key("weirdProduction", "production",
                            (("line1", "line"), ("product", "product")))
        pdf.add_foreign_key("weirdProduction", "extraProduction",
                            (("line2", "line"), ("product", "product")))
        self._testPdfReproduction(pdf)
        pdf = clone_me_maybe(pdf)
        tdf = TicDatFactory(**pdf.schema())
        goodDat = tdf.TicDat()
        goodDat.plants["Cleveland"] = ["this", "that"]
        goodDat.plants["Newark"]["otherstuff"] = 1
        goodDat.products["widgets"] = goodDat.products["gadgets"] = "shizzle"

        for i, p in enumerate(goodDat.plants):
            goodDat.lines[i]["plant"] = p

        for i, (pl, pd) in enumerate(itertools.product(goodDat.lines, goodDat.products)):
            goodDat.production[pl, pd] = {"min": 1, "max": 10 + i}

        badDat1 = tdf.copy_tic_dat(goodDat)
        badDat1.production["notaline", "widgets"] = [0, 1]
        badDat2 = tdf.copy_tic_dat(badDat1)

        def pan_dat_(_):
            rtn = pdf.copy_pan_dat(copy_to_pandas_with_reset(tdf, _))
            self.assertFalse(pdf.find_duplicates(rtn))
            return rtn

        fk, fkm = ForeignKey, ForeignKeyMapping
        fk_fails1 = pdf.find_foreign_key_failures(pan_dat_(badDat1))
        fk_fails2 = pdf.find_foreign_key_failures(pan_dat_(badDat2))
        self.assertTrue(set(fk_fails1) == set(fk_fails2) ==
                        {fk('production', 'lines', fkm('line', 'name'), 'many-to-one')})
        self.assertTrue(set(pdf.find_foreign_key_failures(pan_dat_(badDat1), verbosity="Low")) ==
                        set(pdf.find_foreign_key_failures(pan_dat_(badDat2), verbosity="Low")) ==
                        {('production', 'lines', ('line', 'name'))})
        for row_fails in [next(iter(_.values())) for _ in [fk_fails1, fk_fails2]]:
            self.assertTrue(set(row_fails["line"]) == {"notaline"} and
                            set(row_fails["product"]) == {"widgets"})

        badDat1.lines["notaline"]["plant"] = badDat2.lines["notaline"]["plant"] = "notnewark"
        fk_fails1 = pdf.find_foreign_key_failures(pan_dat_(badDat1))
        fk_fails2 = pdf.find_foreign_key_failures(pan_dat_(badDat2))
        self.assertTrue(set(fk_fails1) == set(fk_fails2) ==
                        {fk('lines', 'plants', fkm('plant', 'name'), 'many-to-one')})
        for row_fails in [next(iter(_.values())) for _ in [fk_fails1, fk_fails2]]:
            self.assertTrue(set(row_fails["name"]) == {"notaline"} and
                            set(row_fails["plant"]) == {"notnewark"})

        for bad in [badDat1, badDat2]:
            bad_pan = pdf.remove_foreign_key_failures(pan_dat_(bad))
            self.assertFalse(pdf.find_foreign_key_failures(bad_pan))
            self.assertTrue(pdf._same_data(bad_pan, pan_dat_(goodDat)))

        _ = len(goodDat.lines)
        for i, p in enumerate(list(goodDat.plants.keys()) + list(goodDat.plants.keys())):
            goodDat.lines[i + _]["plant"] = p
        for l in goodDat.lines:
            if i % 2:
                goodDat.line_descriptor[l] = i + 10

        for i, (l, pl, pdct) in enumerate(sorted(itertools.product(goodDat.lines,
                                                                   goodDat.plants,
                                                                   goodDat.products))):
            goodDat.pureTestingTable.append((l, pl, pdct, i))
        self.assertFalse(pdf.find_foreign_key_failures(pan_dat_(goodDat)))

        badDat = tdf.copy_tic_dat(goodDat)
        badDat.pureTestingTable.append(("j", "u", "nk", "ay"))
        fk_fails = pdf.find_foreign_key_failures(pan_dat_(badDat))
        self.assertTrue(set(fk_fails) ==
                        {fk('pureTestingTable', 'plants', fkm('plant', 'name'), 'many-to-one'),
                         fk('pureTestingTable', 'products', fkm('product', 'name'), 'many-to-one'),
                         fk('pureTestingTable', 'lines', fkm('line', 'name'), 'many-to-one')})
        for df in fk_fails.values():
            df = df.T
            c = df.columns[0]
            self.assertTrue({'ay', 'j', 'nk', 'u'} == set(df[c]))

def testXToManyTwo(self):
    input_schema = PanDatFactory(parent=[["F1", "F2"], ["F3"]],
                                 child_one=[["F1", "F2", "F3"], []],
                                 child_two=[["F1", "F2"], ["F3"]],
                                 child_three=[[], ["F1", "F2", "F3"]])
    for t in ["child_one", "child_two", "child_three"]:
        input_schema.add_foreign_key(t, "parent", [["F1"] * 2, ["F2"] * 2, ["F3"] * 2])
    self.assertTrue({fk.cardinality for fk in input_schema.foreign_keys} ==
                    {"one-to-one", "many-to-one"})

    rows = [[1, 2, 3], [1, 2.1, 3], [4, 5, 6], [4, 5.1, 6], [7, 8, 9]]
    tdf = TicDatFactory(**input_schema.schema())
    dat = tdf.TicDat(parent=rows, child_one=rows, child_two=rows, child_three=rows)
    self.assertTrue(all(len(getattr(dat, t)) == 5 for t in input_schema.all_tables))
    orig_pan_dat = input_schema.copy_pan_dat(copy_to_pandas_with_reset(tdf, dat))
    self.assertFalse(input_schema.find_foreign_key_failures(orig_pan_dat))
    dat.child_one[1, 2, 4] = {}
    dat.child_two[1, 2.2] = 3
    dat.child_three.append([1, 2, 4])
    new_pan_dat = input_schema.copy_pan_dat(copy_to_pandas_with_reset(tdf, dat))
    fk_fails = input_schema.find_foreign_key_failures(new_pan_dat)
    self.assertTrue(len(fk_fails) == 3)
    input_schema.remove_foreign_key_failures(new_pan_dat)
    self.assertFalse(input_schema.find_foreign_key_failures(new_pan_dat))
    self.assertTrue(input_schema._same_data(orig_pan_dat, new_pan_dat))

    input_schema = PanDatFactory(parent=[["F1", "F2"], ["F3"]],
                                 child_one=[["F1", "F2", "F3"], []],
                                 child_two=[["F1", "F2"], ["F3"]],
                                 child_three=[[], ["F1", "F2", "F3"]])
    for t in ["child_one", "child_two", "child_three"]:
        input_schema.add_foreign_key(t, "parent", [["F1"] * 2, ["F3"] * 2])
    tdf = TicDatFactory(**input_schema.schema())
    dat = tdf.TicDat(parent=rows, child_one=rows, child_two=rows, child_three=rows)
    self.assertTrue(all(len(getattr(dat, t)) == 5 for t in input_schema.all_tables))
    orig_pan_dat = input_schema.copy_pan_dat(copy_to_pandas_with_reset(tdf, dat))
    self.assertFalse(input_schema.find_foreign_key_failures(orig_pan_dat))
    dat.child_one[1, 2, 4] = {}
    dat.child_two[1, 2.2] = 4
    dat.child_three.append([1, 2, 4])
    new_pan_dat = input_schema.copy_pan_dat(copy_to_pandas_with_reset(tdf, dat))
    self.assertTrue(len(input_schema.find_foreign_key_failures(new_pan_dat)) == 3)
    input_schema.remove_foreign_key_failures(new_pan_dat)
    self.assertFalse(input_schema.find_foreign_key_failures(new_pan_dat))
    self.assertTrue(input_schema._same_data(orig_pan_dat, new_pan_dat))

def testXToMany(self):
    input_schema = PanDatFactory(roster=[["Name"],
                                         ["Grade", "Arrival Inning", "Departure Inning",
                                          "Min Innings Played", "Max Innings Played"]],
                                 positions=[["Position"],
                                            ["Position Importance", "Position Group",
                                             "Consecutive Innings Only"]],
                                 innings=[["Inning"], ["Inning Group"]],
                                 position_constraints=[["Position Group", "Inning Group", "Grade"],
                                                       ["Min Players", "Max Players"]])
    input_schema.add_foreign_key("position_constraints", "roster", ["Grade", "Grade"])
    input_schema.add_foreign_key("position_constraints", "positions",
                                 ["Position Group", "Position Group"])
    input_schema.add_foreign_key("position_constraints", "innings",
                                 ["Inning Group", "Inning Group"])
    self.assertTrue({fk.cardinality for fk in input_schema.foreign_keys} == {"many-to-many"})

    tdf = TicDatFactory(**input_schema.schema())
    dat = tdf.TicDat()
    for i, p in enumerate(["bob", "joe", "fred", "alice", "lisa", "joean", "ginny"]):
        dat.roster[p]["Grade"] = (i % 3) + 1
    dat.roster["dummy"]["Grade"] = "whatevers"
    for i, p in enumerate(["pitcher", "catcher", "1b", "2b", "ss", "3b", "lf", "cf", "rf"]):
        dat.positions[p]["Position Group"] = "PG %s" % ((i % 4) + 1)
    for i in range(1, 10):
        dat.innings[i]["Inning Group"] = "before stretch" if i < 7 else "after stretch"
    dat.innings[0] = {}
    for pg, ig, g in itertools.product(["PG %s" % i for i in range(1, 5)],
                                       ["before stretch", "after stretch"], [1, 2, 3]):
        dat.position_constraints[pg, ig, g] = {}

    orig_pan_dat = input_schema.copy_pan_dat(copy_to_pandas_with_reset(tdf, dat))
    self.assertFalse(input_schema.find_foreign_key_failures(orig_pan_dat))

    dat.position_constraints["no", "no", "no"] = dat.position_constraints[1, 2, 3] = {}
    new_pan_dat = input_schema.copy_pan_dat(copy_to_pandas_with_reset(tdf, dat))
    self.assertFalse(input_schema._same_data(orig_pan_dat, new_pan_dat))
    fk_fails = input_schema.find_foreign_key_failures(new_pan_dat)
    fk_fails_2 = input_schema.find_foreign_key_failures(new_pan_dat, verbosity="Low")
    fk_fails_3 = input_schema.find_foreign_key_failures(new_pan_dat, verbosity="Low",
                                                        as_table=False)
    self.assertTrue({tuple(k)[:2] + (tuple(k[2]),): len(v) for k, v in fk_fails.items()} ==
                    {k: len(v) for k, v in fk_fails_2.items()} ==
                    {k: v.count(True) for k, v in fk_fails_3.items()} ==
                    {('position_constraints', 'innings', ("Inning Group", "Inning Group")): 2,
                     ('position_constraints', 'positions', ("Position Group", "Position Group")): 2,
                     ('position_constraints', 'roster', ("Grade", "Grade")): 1})
    input_schema.remove_foreign_key_failures(new_pan_dat)
    self.assertFalse(input_schema.find_foreign_key_failures(new_pan_dat))
    self.assertTrue(input_schema._same_data(orig_pan_dat, new_pan_dat))

    input_schema = PanDatFactory(table_one=[["One", "Two"], []],
                                 table_two=[["One"], ["Two"]])
    input_schema.add_foreign_key("table_two", "table_one", ["One", "One"])
    self.assertTrue({fk.cardinality for fk in input_schema.foreign_keys} == {"one-to-many"})
    tdf = TicDatFactory(**input_schema.schema())
    dat = tdf.TicDat(table_one=[[1, 2], [3, 4], [5, 6], [7, 8]], table_two={1: 2, 3: 4, 5: 6})
    orig_pan_dat = input_schema.copy_pan_dat(copy_to_pandas_with_reset(tdf, dat))
    self.assertFalse(input_schema.find_foreign_key_failures(orig_pan_dat))
    dat.table_two[9] = 10
    new_pan_dat = input_schema.copy_pan_dat(copy_to_pandas_with_reset(tdf, dat))
    fk_fails = input_schema.find_foreign_key_failures(new_pan_dat)
    self.assertTrue({tuple(k)[:2]: len(v) for k, v in fk_fails.items()} ==
                    {('table_two', 'table_one'): 1})
    input_schema.remove_foreign_key_failures(new_pan_dat)
    self.assertFalse(input_schema.find_foreign_key_failures(new_pan_dat))
    self.assertTrue(input_schema._same_data(orig_pan_dat, new_pan_dat))