def testDataTypes_two(self):
    """Exercise data-type failure detection when key fields contain nulls.

    Builds a diet data set that includes a ``None`` food name and a ``None``
    cost, copies it to pandas with the primary-key columns retained, and then
    checks ``find_data_type_failures``, ``replace_data_type_failures`` and
    ``find_duplicates`` under three configurations: the schema as-is, a clone
    with "a" as the default for the food-name fields, and a fresh factory
    whose food-name fields are nullable.
    """
    from pandas import isnull

    tdf = TicDatFactory(**dietSchema())
    pdf = PanDatFactory(**tdf.schema())

    def makeIt():
        rtn = tdf.TicDat()
        rtn.foods["a"] = 12
        rtn.foods["b"] = None
        rtn.foods[None] = 101  # a null primary key
        rtn.categories["1"] = {"maxNutrition": 100, "minNutrition": 40}
        rtn.categories["2"] = [10, 20]
        for f, p in itertools.product(rtn.foods, rtn.categories):
            rtn.nutritionQuantities[f, p] = 5
        # Note the integer key 2 here, as opposed to the string "2" above.
        rtn.nutritionQuantities['a', 2] = 12
        return tdf.copy_to_pandas(rtn, drop_pk_columns=False)

    def noneify(iter_of_tuples):
        # Map every null-like cell to None so rows compare as plain tuples.
        return {tuple(None if isnull(cell) else cell for cell in row)
                for row in iter_of_tuples}

    dat = makeIt()
    errs = pdf.find_data_type_failures(dat)
    self.assertEqual(len(errs), 2)
    self.assertFalse(pdf.find_duplicates(dat))

    # With the unmodified factory, replacing failures is expected to leave
    # the data equal to the untouched copy.
    dat_copied = pdf.copy_pan_dat(dat)
    pdf.replace_data_type_failures(dat)
    self.assertTrue(pdf._same_data(dat, dat_copied, epsilon=0.00001))

    # With "a" as the default for the food-name fields, replacement changes
    # the copy, clears the type failures and introduces duplicates.
    pdf2 = pdf.clone()
    pdf2.set_default_value("foods", "name", "a")
    pdf2.set_default_value("nutritionQuantities", "food", "a")
    pdf2.replace_data_type_failures(dat_copied)
    self.assertFalse(pdf._same_data(dat, dat_copied, epsilon=0.00001))
    self.assertFalse(pdf.find_data_type_failures(dat_copied))
    dups = pdf.find_duplicates(dat_copied)
    self.assertEqual(len(dups), 2)
    self.assertEqual(len(dups["foods"]), 1)
    self.assertEqual(len(dups["nutritionQuantities"]), 2)

    # `errs` still refers to the failures found before any replacement.
    self.assertEqual(
        noneify(errs['nutritionQuantities', 'food'].itertuples(index=False)),
        {(None, "1", 5), (None, "2", 5)})
    self.assertEqual(
        noneify(errs['foods', 'name'].itertuples(index=False)),
        {(None, 101)})

    # A fresh factory with nullable food names reports no failures ...
    pdf = PanDatFactory(**tdf.schema())
    pdf.set_data_type("foods", "name", nullable=True, strings_allowed='*')
    pdf.set_data_type("nutritionQuantities", "food", nullable=True,
                      strings_allowed='*')
    self.assertFalse(pdf.find_data_type_failures(dat))

    # ... until cost is made non-nullable, which flags the "b" row.
    pdf.set_data_type("foods", "cost", nullable=False)
    errs = pdf.find_data_type_failures(dat)
    self.assertEqual(len(errs), 1)
    self.assertEqual(
        noneify(errs['foods', 'cost'].itertuples(index=False)),
        {('b', None)})
def test_datetime(self):
    """Round-trip datetime fields and parameters through each file format.

    Declares datetime data types (one nullable) on a two-field table and two
    datetime parameters, then writes the data as csv, xls, sql and json,
    reads it back, and checks that the re-read values are datetime-like (or
    null where a null was written).
    """
    core_path = os.path.join(_scratchDir, "parameters")
    datetime_types = (datetime.datetime, numpy.datetime64)

    pdf = PanDatFactory(table_with_stuffs=[["field one"], ["field two"]],
                        parameters=[["a"], ["b"]])
    pdf.add_parameter("p1", "Dec 15 1970", datetime=True)
    pdf.add_parameter("p2", None, datetime=True, nullable=True)
    pdf.set_data_type("table_with_stuffs", "field one", datetime=True)
    pdf.set_data_type("table_with_stuffs", "field two", datetime=True,
                      nullable=True)

    dat = TicDatFactory(**pdf.schema()).TicDat(
        table_with_stuffs=[
            [dateutil.parser.parse("July 11 1972"), None],
            [datetime.datetime.now(), dateutil.parser.parse("Sept 11 2011")],
        ],
        parameters=[["p1", "7/11/1911"], ["p2", None]])
    dat = TicDatFactory(**pdf.schema()).copy_to_pandas(
        dat, drop_pk_columns=False)
    self.assertFalse(pdf.find_data_type_failures(dat))
    self.assertFalse(pdf.find_data_row_failures(dat))

    for attr, path in [["csv", core_path + "_csv"],
                       ["xls", core_path + ".xlsx"],
                       ["sql", core_path + ".db"],
                       ["json", core_path + ".json"]]:
        # csv is written as a directory; every other format is one file.
        func = "write_directory" if attr == "csv" else "write_file"
        getattr(getattr(pdf, attr), func)(dat, path)
        dat_1 = getattr(pdf, attr).create_pan_dat(path)

        # The re-read data is expected NOT to match per _same_data.
        # NOTE(review): presumably the round trip alters the representation
        # of the datetime values - confirm the intended reason.
        self.assertFalse(pdf._same_data(dat, dat_1))
        self.assertFalse(pdf.find_data_type_failures(dat_1))
        self.assertFalse(pdf.find_data_row_failures(dat_1))

        dat_1 = pdf.copy_to_tic_dat(dat_1)
        self.assertEqual(set(dat_1.parameters), {'p1', 'p2'})

        p1_value = dat_1.parameters["p1"]["b"]
        self.assertIsInstance(p1_value, datetime_types)
        self.assertFalse(pd.isnull(p1_value))
        self.assertTrue(pd.isnull(dat_1.parameters["p2"]["b"]))

        # Primary keys ("field one") must all be non-null datetimes.
        self.assertTrue(all(
            isinstance(key, datetime_types) and not pd.isnull(key)
            for key in dat_1.table_with_stuffs))

        # Data cells ("field two") are datetimes or some flavor of null.
        cells = [cell
                 for row in dat_1.table_with_stuffs.values()
                 for cell in row.values()]
        self.assertTrue(all(
            isinstance(cell, datetime_types) or cell is None
            or utils.safe_apply(math.isnan)(cell)
            for cell in cells))
        # Both a null and a non-null cell must survive the round trip.
        self.assertEqual({pd.isnull(cell) for cell in cells}, {True, False})
def test_nullables(self):
    """Round-trip a nullable, strings-only field through each file format.

    "field two" accepts any string or null but no numbers. The data is
    written as csv, xls, sql and json, read back, and compared to the
    original with NaNs in data rows treated as equal.
    """
    base_path = os.path.join(_scratchDir, "nullables")

    pdf = PanDatFactory(table_with_stuffs=[["field one"], ["field two"]])
    pdf.set_data_type("table_with_stuffs", "field one")
    pdf.set_data_type("table_with_stuffs", "field two",
                      number_allowed=False, strings_allowed='*',
                      nullable=True)

    rows = [[101, "022"], [202, None], [303, "111"]]
    tic_dat = TicDatFactory(**pdf.schema()).TicDat(table_with_stuffs=rows)
    dat = TicDatFactory(**pdf.schema()).copy_to_pandas(
        tic_dat, drop_pk_columns=False)
    self.assertFalse(pdf.find_data_type_failures(dat))

    suffix_by_format = (("csv", "_csv"), ("xls", ".xlsx"),
                        ("sql", ".db"), ("json", ".json"))
    for fmt, suffix in suffix_by_format:
        target = base_path + suffix
        # Only csv is stored as a directory; the others are single files.
        if fmt == "csv":
            kind = "directory"
        else:
            kind = "file"
        writer, writer_kwargs = utils._get_write_function_and_kwargs(
            pdf, target, kind)
        writer(dat, target, **writer_kwargs)
        reread = utils._get_dat_object(pdf, "create_pan_dat", target, kind,
                                       False)
        self.assertTrue(
            pdf._same_data(dat, reread, nans_are_same_for_data_rows=True))
def testDataTypes(self):
    """Check data-type failure detection and replacement on two schemas.

    Part one uses the diet schema: no failures under the default factory,
    then failures once ``cost`` is non-nullable and ``qty`` is restricted to
    the interval (5, 12]. Part two uses the netflow schema with string
    values placed in ``arcs.capacity``, under different ``strings_allowed``
    settings.
    """
    if not self.canRun:
        # Returns silently (reported as a pass) when the suite cannot run.
        return

    # ---- diet schema -----------------------------------------------------
    tdf = TicDatFactory(**dietSchema())
    pdf = PanDatFactory(**dietSchema())

    ticdat = tdf.TicDat()
    ticdat.foods["a"] = 12
    ticdat.foods["b"] = None
    ticdat.categories["1"] = {"maxNutrition": 100, "minNutrition": 40}
    ticdat.categories["2"] = [10, 20]
    for f, p in itertools.product(ticdat.foods, ticdat.categories):
        ticdat.nutritionQuantities[f, p] = 5
    # Note the integer key 2 here, as opposed to the string "2" above.
    ticdat.nutritionQuantities['a', 2] = 12

    pandat = pdf.copy_pan_dat(copy_to_pandas_with_reset(tdf, ticdat))

    self.assertFalse(pdf.find_data_type_failures(pandat))
    pandat_copy = pdf.replace_data_type_failures(pdf.copy_pan_dat(pandat))
    self.assertTrue(pdf._same_data(pandat, pandat_copy, epsilon=0.00001))

    pdf = PanDatFactory(**dietSchema())
    pdf.set_data_type("foods", "cost", nullable=False)
    pdf.set_data_type("nutritionQuantities", "qty", min=5,
                      inclusive_min=False, max=12, inclusive_max=True)
    failed = pdf.find_data_type_failures(pandat)
    self.assertEqual(set(failed),
                     {('foods', 'cost'), ('nutritionQuantities', 'qty')})
    self.assertEqual(set(failed['foods', 'cost']["name"]), {'b'})
    # The four qty == 5 rows violate the exclusive minimum of 5.
    self.assertEqual(
        {(v["food"], v["category"])
         for v in failed['nutritionQuantities', 'qty'].T.to_dict().values()},
        {('b', '1'), ('a', '2'), ('a', '1'), ('b', '2')})

    # as_table=False yields a boolean series instead of the failing rows.
    failed = pdf.find_data_type_failures(pandat, as_table=False)
    self.assertEqual(
        failed['nutritionQuantities', 'qty'].value_counts()[True], 4)

    # Explicit replacement value for qty; cost has no override here.
    fixed = pdf.replace_data_type_failures(
        pdf.copy_pan_dat(pandat), {("nutritionQuantities", "qty"): 5.15})
    self.assertEqual(set(fixed.foods["cost"]), {0.0, 12.0})
    self.assertEqual(set(fixed.nutritionQuantities["qty"]), {5.15, 12.0})

    # ---- netflow schema --------------------------------------------------
    tdf = TicDatFactory(**netflowSchema())
    tdf.enable_foreign_key_links()
    addNetflowForeignKeys(tdf)
    pdf = PanDatFactory(**netflowSchema())
    ticdat = tdf.copy_tic_dat(netflowData())
    # Overwrite each Detroit arc's row with its destination name, so the
    # capacity column holds strings for those rows.
    for n in ticdat.nodes["Detroit"].arcs_source:
        ticdat.arcs["Detroit", n] = n
    pandat = pdf.copy_pan_dat(copy_to_pandas_with_reset(tdf, ticdat))
    self.assertFalse(pdf.find_data_type_failures(pandat))

    pdf = PanDatFactory(**netflowSchema())
    pdf.set_data_type("arcs", "capacity", strings_allowed="*")
    self.assertFalse(pdf.find_data_type_failures(pandat))

    # With an explicit whitelist, only the "New York" string is rejected.
    pdf = PanDatFactory(**netflowSchema())
    pdf.set_data_type("arcs", "capacity",
                      strings_allowed=["Boston", "Seattle", "lumberjack"])
    failed = pdf.find_data_type_failures(pandat)
    self.assertEqual(set(failed), {('arcs', 'capacity')})
    self.assertEqual(
        {(v["source"], v["destination"])
         for v in failed['arcs', 'capacity'].T.to_dict().values()},
        {("Detroit", "New York")})
    pdf.replace_data_type_failures(pandat)
    self.assertEqual(set(pandat.arcs["capacity"]),
                     {120, 'Boston', 0, 'Seattle'})
def test_data_type_max_failures(self):
    """Check that ``max_failures`` caps the failures reported across fields.

    Each of the two tables holds twenty values in "Field", half of them
    negative. With no cap, ten failures are expected per table; the capped
    calls check how a total budget of 11, 10 and 9 failures is distributed.
    """
    pdf = PanDatFactory(table_one=[["Field"], []],
                        table_two=[[], ["Field"]])
    for t in ["table_one", "table_two"]:
        pdf.set_data_type(t, "Field")
    dat = pdf.PanDat(
        table_one=DataFrame(
            {"Field": list(range(1, 11)) + [-_ for _ in range(1, 11)]}),
        table_two=DataFrame({"Field": [10.1] * 10 + [-2] * 10}))

    def failure_counts(errs):
        # Sorted failure-row counts, one entry per reported (table, field).
        return sorted(len(rows) for rows in errs.values())

    # No cap: both fields reported, ten failures each.
    errs = pdf.find_data_type_failures(dat)
    self.assertEqual(failure_counts(errs), [10, 10])

    # A cap of 11: one field is reported in full, the other gets one row.
    errs = pdf.find_data_type_failures(dat, max_failures=11)
    self.assertEqual(failure_counts(errs), [1, 10])

    # A cap of 10 is used up by a single field.
    errs = pdf.find_data_type_failures(dat, max_failures=10)
    self.assertEqual(failure_counts(errs), [10])

    # A cap of 9 truncates even that single field.
    errs = pdf.find_data_type_failures(dat, max_failures=9)
    self.assertEqual(failure_counts(errs), [9])