def testFindDups(self):
    pdf = PanDatFactory(**sillyMeSchema())
    tdf = TicDatFactory(**{k: [[], list(pkfs) + list(dfs)]
                           for k, (pkfs, dfs) in sillyMeSchema().items()})
    rows = [(1, 2, 3, 4), (1, 20, 30, 40), (10, 20, 30, 40)]
    ticDat = tdf.TicDat(**{t: rows for t in tdf.all_tables})
    panDat = pdf.copy_pan_dat(copy_to_pandas_with_reset(tdf, ticDat))
    dups = pdf.find_duplicates(panDat)
    self.assertTrue(set(dups) == {'a'} and set(dups['a']['aField']) == {1})
    # keep=False flags every member of a duplicate group, not just the extras
    dups = pdf.find_duplicates(panDat, as_table=False, keep=False)
    self.assertTrue(set(dups) == {'a'} and dups['a'].value_counts()[True] == 2)
    dups = pdf.find_duplicates(panDat, as_table=False)
    self.assertTrue(set(dups) == {'a'} and dups['a'].value_counts()[True] == 1)

    rows = [(1, 2, 3, 4), (1, 20, 30, 40), (10, 20, 30, 40), (1, 2, 3, 40)]
    ticDat = tdf.TicDat(**{t: rows for t in tdf.all_tables})
    panDat = pdf.copy_pan_dat(copy_to_pandas_with_reset(tdf, ticDat))
    dups = pdf.find_duplicates(panDat, keep=False)
    self.assertTrue(set(dups) == {'a', 'b'} and set(dups['a']['aField']) == {1})
    dups = pdf.find_duplicates(panDat, as_table=False, keep=False)
    self.assertTrue({k: v.value_counts()[True] for k, v in dups.items()} ==
                    {'a': 3, 'b': 2})

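# A minimal illustrative sketch (plain pandas, not part of the original suite)
# of the keep semantics exercised above, which match pandas.DataFrame.duplicated:
# the default flags all but the first occurrence, while keep=False flags every
# member of a duplicate group. The column name "aField" is just for illustration.
def _sketch_duplicated_keep_semantics():
    import pandas as pd
    df = pd.DataFrame({"aField": [1, 1, 10]})
    assert list(df.duplicated()) == [False, True, False]  # default keep='first'
    assert list(df.duplicated(keep=False)) == [True, True, False]
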
def testXToManyTwo(self):
    input_schema = PanDatFactory(parent=[["F1", "F2"], ["F3"]],
                                 child_one=[["F1", "F2", "F3"], []],
                                 child_two=[["F1", "F2"], ["F3"]],
                                 child_three=[[], ["F1", "F2", "F3"]])
    for t in ["child_one", "child_two", "child_three"]:
        input_schema.add_foreign_key(t, "parent",
                                     [["F1"] * 2, ["F2"] * 2, ["F3"] * 2])
    self.assertTrue({fk.cardinality for fk in input_schema.foreign_keys} ==
                    {"one-to-one", "many-to-one"})
    rows = [[1, 2, 3], [1, 2.1, 3], [4, 5, 6], [4, 5.1, 6], [7, 8, 9]]
    tdf = TicDatFactory(**input_schema.schema())
    dat = tdf.TicDat(parent=rows, child_one=rows, child_two=rows,
                     child_three=rows)
    self.assertTrue(all(len(getattr(dat, t)) == 5
                        for t in input_schema.all_tables))
    orig_pan_dat = input_schema.copy_pan_dat(copy_to_pandas_with_reset(tdf, dat))
    self.assertFalse(input_schema.find_foreign_key_failures(orig_pan_dat))
    # introduce one orphan row per child table
    dat.child_one[1, 2, 4] = {}
    dat.child_two[1, 2.2] = 3
    dat.child_three.append([1, 2, 4])
    new_pan_dat = input_schema.copy_pan_dat(copy_to_pandas_with_reset(tdf, dat))
    fk_fails = input_schema.find_foreign_key_failures(new_pan_dat)
    self.assertTrue(len(fk_fails) == 3)
    input_schema.remove_foreign_key_failures(new_pan_dat)
    self.assertFalse(input_schema.find_foreign_key_failures(new_pan_dat))
    self.assertTrue(input_schema._same_data(orig_pan_dat, new_pan_dat))

    # same exercise, but with foreign keys on only two of the three fields
    input_schema = PanDatFactory(parent=[["F1", "F2"], ["F3"]],
                                 child_one=[["F1", "F2", "F3"], []],
                                 child_two=[["F1", "F2"], ["F3"]],
                                 child_three=[[], ["F1", "F2", "F3"]])
    for t in ["child_one", "child_two", "child_three"]:
        input_schema.add_foreign_key(t, "parent", [["F1"] * 2, ["F3"] * 2])
    tdf = TicDatFactory(**input_schema.schema())
    dat = tdf.TicDat(parent=rows, child_one=rows, child_two=rows,
                     child_three=rows)
    self.assertTrue(all(len(getattr(dat, t)) == 5
                        for t in input_schema.all_tables))
    orig_pan_dat = input_schema.copy_pan_dat(copy_to_pandas_with_reset(tdf, dat))
    self.assertFalse(input_schema.find_foreign_key_failures(orig_pan_dat))
    dat.child_one[1, 2, 4] = {}
    dat.child_two[1, 2.2] = 4
    dat.child_three.append([1, 2, 4])
    new_pan_dat = input_schema.copy_pan_dat(copy_to_pandas_with_reset(tdf, dat))
    self.assertTrue(len(input_schema.find_foreign_key_failures(new_pan_dat)) == 3)
    input_schema.remove_foreign_key_failures(new_pan_dat)
    self.assertFalse(input_schema.find_foreign_key_failures(new_pan_dat))
    self.assertTrue(input_schema._same_data(orig_pan_dat, new_pan_dat))

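# A small sketch (reusing this module's imports and helpers, not part of the
# original suite) of the foreign-key mapping format used above: each
# [native_field, foreign_field] pair maps a child column onto a parent column,
# and find_foreign_key_failures reports child rows with no matching parent.
def _sketch_foreign_key_mapping():
    pdf = PanDatFactory(parent=[["F1"], ["F3"]], child=[[], ["F1", "F3"]])
    pdf.add_foreign_key("child", "parent", [["F1", "F1"]])
    tdf = TicDatFactory(**pdf.schema())
    dat = tdf.TicDat(parent=[[1, 3]], child=[[1, 3], [2, 3]])  # (2, 3) is an orphan
    pan_dat = pdf.copy_pan_dat(copy_to_pandas_with_reset(tdf, dat))
    assert len(pdf.find_foreign_key_failures(pan_dat)) == 1
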
def testNetflow(self):
    if not self.can_run:
        return
    tdf = TicDatFactory(**netflowSchema())
    addNetflowForeignKeys(tdf)
    ordered = tdf.sql._ordered_tables()
    # parent tables must be written before their children
    self.assertTrue(ordered.index("nodes") <
                    min(ordered.index(_) for _ in ("arcs", "cost", "inflow")))
    self.assertTrue(ordered.index("commodities") <
                    min(ordered.index(_) for _ in ("cost", "inflow")))
    ticDat = tdf.freeze_me(tdf.TicDat(**{t: getattr(netflowData(), t)
                                         for t in tdf.primary_key_fields}))
    self._test_generic_copy(ticDat, tdf)
    self._test_generic_copy(ticDat, tdf, ["arcs", "nodes"])
    filePath = os.path.join(_scratchDir, "netflow.sql")
    tdf.sql.write_db_data(ticDat, filePath)
    self.assertFalse(tdf.sql.find_duplicates(filePath))
    sqlTicDat = tdf.sql.create_tic_dat(filePath, freeze_it=True)
    self.assertTrue(tdf._same_data(ticDat, sqlTicDat))

    def changeIt():
        sqlTicDat.inflow['Pencils', 'Boston']["quantity"] = 12
    self.assertTrue(self.firesException(changeIt))
    self.assertTrue(tdf._same_data(ticDat, sqlTicDat))

    sqlTicDat = tdf.sql.create_tic_dat(filePath)
    self.assertTrue(tdf._same_data(ticDat, sqlTicDat))
    self.assertFalse(self.firesException(changeIt))
    self.assertFalse(tdf._same_data(ticDat, sqlTicDat))

    pkHacked = netflowSchema()
    pkHacked["nodes"][0] = ["nimrod"]
    tdfHacked = TicDatFactory(**pkHacked)
    ticDatHacked = tdfHacked.TicDat(**{t: getattr(ticDat, t)
                                       for t in tdf.all_tables})
    tdfHacked.sql.write_db_data(ticDatHacked, makeCleanPath(filePath))
    self.assertFalse(tdfHacked.sql.find_duplicates(filePath))
    self.assertTrue(self.firesException(
        lambda: tdfHacked.sql.write_db_data(ticDat, filePath)))
    tdfHacked.sql.write_db_data(ticDat, filePath, allow_overwrite=True)
    self.assertTrue("Unable to recognize field name in table nodes" in
                    self.firesException(lambda: tdf.sql.create_tic_dat(filePath)))

    ticDatNew = tdf.TicDat(**{t: getattr(netflowData(), t)
                              for t in tdf.primary_key_fields})
    ticDatNew.cost['Pencils', 'booger', 'wooger'] = 10
    ticDatNew.cost['junker', 'Detroit', 'New York'] = 20
    ticDatNew.cost['bunker', 'Detroit', 'New Jerk'] = 20
    ticDatNew.arcs['booger', 'wooger'] = 112
    self.assertTrue(
        {f[:2] + f[2][:1]: set(v.native_pks)
         for f, v in tdf.find_foreign_key_failures(ticDatNew).items()} ==
        {('arcs', 'nodes', 'destination'): {('booger', 'wooger')},
         ('arcs', 'nodes', 'source'): {('booger', 'wooger')},
         ('cost', 'commodities', 'commodity'): {('bunker', 'Detroit', 'New Jerk'),
                                                ('junker', 'Detroit', 'New York')},
         ('cost', 'nodes', 'destination'): {('bunker', 'Detroit', 'New Jerk'),
                                            ('Pencils', 'booger', 'wooger')},
         ('cost', 'nodes', 'source'): {('Pencils', 'booger', 'wooger')}})

    ticDat3 = tdf.TicDat(**{t: getattr(netflowData(), t)
                            for t in tdf.primary_key_fields})
    ticDat3.arcs['Detroit', 'Boston'] = float("inf")
    ticDat3.arcs['Denver', 'Boston'] = float("inf")
    self.assertFalse(tdf._same_data(ticDat3, ticDat))
    tdf.sql.write_db_data(ticDat3, makeCleanPath(filePath))
    ticDat4 = tdf.sql.create_tic_dat(filePath)
    self.assertTrue(tdf._same_data(ticDat3, ticDat4))

def testNulls(self):
    tdf = TicDatFactory(table=[["field one"], ["field two"]])
    dat = tdf.TicDat(table=[[None, 100], [200, "this"], ["that", 300],
                            [300, None], [400, "that"]])
    file_path = os.path.join(_scratchDir, "nulls.accdb")
    tdf.mdb.write_file(dat, file_path)
    dat_1 = tdf.mdb.create_tic_dat(file_path)
    self.assertTrue(tdf._same_data(dat, dat_1))

    # with an infinity io flag of None, nulls read back as +inf when the data
    # type treats +inf as a legal (inclusive) maximum
    tdf = TicDatFactory(table=[["field one"], ["field two"]])
    for f in ["field one", "field two"]:
        tdf.set_data_type("table", f, max=float("inf"), inclusive_max=True)
    tdf.set_infinity_io_flag(None)
    dat_inf = tdf.TicDat(table=[[float("inf"), 100], [200, "this"],
                                ["that", 300], [300, float("inf")],
                                [400, "that"]])
    dat_1 = tdf.mdb.create_tic_dat(file_path)
    self.assertTrue(tdf._same_data(dat_inf, dat_1))
    tdf.mdb.write_file(dat_inf, makeCleanPath(file_path))
    dat_1 = tdf.mdb.create_tic_dat(file_path)
    self.assertTrue(tdf._same_data(dat_inf, dat_1))

    # ... and as -inf when -inf is the legal (inclusive) minimum
    tdf = TicDatFactory(table=[["field one"], ["field two"]])
    for f in ["field one", "field two"]:
        tdf.set_data_type("table", f, min=-float("inf"), inclusive_min=True)
    tdf.set_infinity_io_flag(None)
    dat_1 = tdf.mdb.create_tic_dat(file_path)
    self.assertFalse(tdf._same_data(dat_inf, dat_1))
    dat_inf = tdf.TicDat(table=[[float("-inf"), 100], [200, "this"],
                                ["that", 300], [300, -float("inf")],
                                [400, "that"]])
    self.assertTrue(tdf._same_data(dat_inf, dat_1))

def testNetflow(self):
    if not self.canRun:
        return
    tdf = TicDatFactory(**netflowSchema())
    tdf.enable_foreign_key_links()
    addNetflowForeignKeys(tdf)
    oldDat = tdf.freeze_me(tdf.TicDat(**{t: getattr(netflowData(), t)
                                         for t in tdf.primary_key_fields}))
    self._test_generic_free_copy(oldDat, tdf)
    self._test_generic_free_copy(oldDat, tdf, ["arcs", "nodes"])
    ticDat = tdf.copy_to_pandas(oldDat, ["arcs", "cost"])
    self.assertTrue(all(hasattr(ticDat, t) == (t in ["arcs", "cost"])
                        for t in tdf.all_tables))
    self.assertTrue(len(ticDat.arcs.capacity.sloc["Boston", :]) ==
                    len(oldDat.nodes["Boston"].arcs_source) == 0)
    self.assertTrue(len(ticDat.arcs.capacity.sloc[:, "Boston"]) ==
                    len(oldDat.nodes["Boston"].arcs_destination) == 2)
    self.assertTrue(all(ticDat.arcs.capacity.sloc[:, "Boston"][src] == r["capacity"]
                        for src, r in oldDat.nodes["Boston"].arcs_destination.items()))
    ticDat = tdf.copy_to_pandas(oldDat, drop_pk_columns=True)
    rebornTicDat = tdf.TicDat(**{t: getattr(ticDat, t) for t in tdf.all_tables})
    # because we have single pk field tables, dropping the pk columns is problematic
    self.assertFalse(tdf._same_data(rebornTicDat, oldDat))
    # but with the default argument all is well
    ticDat = tdf.copy_to_pandas(oldDat)
    rebornTicDat = tdf.TicDat(**{t: getattr(ticDat, t) for t in tdf.all_tables})
    self.assertTrue(tdf._same_data(rebornTicDat, oldDat))
    self.assertTrue(set(ticDat.inflow.columns) == {"quantity"})
    self.assertTrue(set(ticDat.nodes.columns) == {"name"})

def test_numericish_text(self):
    dir_path = os.path.join(_scratchDir, "numericish")
    tdf = TicDatFactory(parameters=[["Key"], ["Value"]])
    dat = tdf.TicDat(parameters=[["a", "100"], ["b", "010"], [3, "200"],
                                 ["d", "020"]])

    def round_trip():
        tdf.csv.write_directory(dat, makeCleanDir(dir_path))
        return tdf.csv.create_tic_dat(dir_path)
    # with no data types set, numeric-looking strings come back as numbers
    dat2 = round_trip()
    self.assertFalse(tdf._same_data(dat, dat2))

    tdf = TicDatFactory(parameters=[["Key"], ["Value"]])
    tdf.set_data_type("parameters", "Key", strings_allowed='*',
                      number_allowed=True)
    tdf.set_default_value("parameters", "Value", "")
    dat2 = round_trip()
    self.assertTrue(tdf._same_data(dat, dat2))

    tdf = TicDatFactory(parameters=[["Key"], ["Value"]])
    tdf.set_data_type("parameters", "Value", strings_allowed='*',
                      number_allowed=False)
    dat = tdf.TicDat(parameters=[["a", "100"], ["b", "010"], ["c", "200"],
                                 ["d", "020"]])
    dat2 = round_trip()
    self.assertTrue(tdf._same_data(dat, dat2))

def testSqlSimple(self):
    if not self.can_run:
        return
    tdf = TicDatFactory(**dietSchema())
    pdf = PanDatFactory(**dietSchema())
    ticDat = tdf.freeze_me(tdf.TicDat(**{t: getattr(dietData(), t)
                                         for t in tdf.primary_key_fields}))
    panDat = pan_dat_maker(dietSchema(), ticDat)
    filePath = os.path.join(_scratchDir, "diet.db")
    pdf.sql.write_file(panDat, filePath)
    sqlPanDat = pdf.sql.create_pan_dat(filePath)
    self.assertTrue(pdf._same_data(panDat, sqlPanDat))
    # a generic-table ('*') factory should round trip the same data
    pdf2 = PanDatFactory(**{t: '*' for t in pdf.all_tables})
    pdf2.sql.write_file(panDat, filePath)
    sqlPanDat = pdf2.sql.create_pan_dat(filePath)
    self.assertTrue(pdf._same_data(panDat, sqlPanDat))

    tdf = TicDatFactory(**netflowSchema())
    pdf = PanDatFactory(**netflowSchema())
    ticDat = tdf.freeze_me(tdf.TicDat(**{t: getattr(netflowData(), t)
                                         for t in tdf.primary_key_fields}))
    panDat = pan_dat_maker(netflowSchema(), ticDat)
    filePath = os.path.join(_scratchDir, "netflow.db")
    pdf.sql.write_file(panDat, filePath)
    panDat2 = pdf.sql.create_pan_dat(filePath)
    self.assertTrue(pdf._same_data(panDat, panDat2))
    pdf2 = PanDatFactory(**{t: '*' for t in pdf.all_tables})
    sqlPanDat = pdf2.sql.create_pan_dat(filePath)
    self.assertTrue(pdf._same_data(panDat, sqlPanDat))

def _test_generic_copy(self, ticDat, tdf, skip_tables=None):
    assert all(tdf.primary_key_fields.get(t) for t in tdf.all_tables)
    path = makeCleanDir(os.path.join(_scratchDir, "generic_copy"))
    replace_name = lambda f: "name_" if f == "name" else f
    clean_tdf = TicDatFactory(**{t: [list(map(replace_name, pks)), dfs]
                                 for t, (pks, dfs) in tdf.schema().items()})
    # every table not in skip_tables becomes a generic ('*') table
    temp_tdf = TicDatFactory(**{t: v if t in (skip_tables or []) else '*'
                                for t, v in clean_tdf.schema().items()})
    temp_dat = temp_tdf.TicDat(**{t: getattr(ticDat, t)
                                  for t in (skip_tables or [])})
    for t in temp_tdf.generic_tables:
        setattr(temp_dat, t,
                getattr(clean_tdf.copy_to_pandas(ticDat, drop_pk_columns=False), t))
    temp_tdf.csv.write_directory(temp_dat, path)
    self.assertFalse(temp_tdf.csv.find_duplicates(path))
    read_dat = temp_tdf.csv.create_tic_dat(path)
    generic_free_dat, _ = utils.create_generic_free(read_dat, temp_tdf)
    # rebuild a fully-typed TicDat from the generic-free copy ...
    check_dat = clean_tdf.TicDat()
    for t in temp_tdf.generic_tables:
        for r in getattr(generic_free_dat, t):
            pks = clean_tdf.primary_key_fields[t]
            getattr(check_dat, t)[r[pks[0]] if len(pks) == 1 else
                                  tuple(r[_] for _ in pks)] = \
                {df: r[df] for df in clean_tdf.data_fields.get(t, [])}
    for t in (skip_tables or []):
        for k, v in getattr(generic_free_dat, t).items():
            getattr(check_dat, t)[k] = v
    # ... and verify it matches the original
    self.assertTrue(clean_tdf._same_data(check_dat, clean_tdf.copy_tic_dat(ticDat)))

def testXlsSpacey(self):
    if not self.can_run:
        return
    tdf = TicDatFactory(**spacesSchema())
    pdf = PanDatFactory(**spacesSchema())
    ticDat = tdf.TicDat(**spacesData())
    panDat = pan_dat_maker(spacesSchema(), ticDat)
    ext = ".xlsx"
    filePath = os.path.join(_scratchDir, "spaces_2%s" % ext)
    pdf.xls.write_file(panDat, filePath, case_space_sheet_names=True)
    panDat2 = pdf.xls.create_pan_dat(filePath)
    self.assertTrue(pdf._same_data(panDat, panDat2))

    tdf = TicDatFactory(**netflowSchema())
    pdf = PanDatFactory(**netflowSchema())
    ticDat = tdf.freeze_me(tdf.TicDat(**{t: getattr(netflowData(), t)
                                         for t in tdf.primary_key_fields}))
    panDat = pan_dat_maker(netflowSchema(), ticDat)
    filePath = os.path.join(_scratchDir, "spaces_2_2%s" % ext)
    pdf.xls.write_file(panDat, filePath, case_space_sheet_names=True)
    panDat2 = pdf.xls.create_pan_dat(filePath)
    self.assertTrue(pdf._same_data(panDat, panDat2))

def test_empty_text_none(self):
    # this is a naive data scientist who isn't using the parameters functionality
    filePath = os.path.join(_scratchDir, "empty.xls")
    tdf = TicDatFactory(parameters=[["Key"], ["Value"]])
    dat_n = tdf.TicDat(parameters=[[None, 100], ["b", 10.01], ["three", 200],
                                   ["d", None]])
    dat_s = tdf.TicDat(parameters=[["", 100], ["b", 10.01], ["three", 200],
                                   ["d", ""]])

    def round_trip():
        tdf.xls.write_file(dat_n, filePath, allow_overwrite=True)
        return tdf.xls.create_tic_dat(filePath)
    # by default, None round trips as empty text
    dat2 = round_trip()
    self.assertTrue(tdf._same_data(dat_s, dat2) and
                    not tdf._same_data(dat_n, dat2))

    tdf = TicDatFactory(parameters=[["Key"], ["Value"]])
    tdf.set_data_type("parameters", "Key", nullable=True)
    # this default alone will mess with number reading
    tdf.set_default_value("parameters", "Value", None)
    dat2 = round_trip()
    self.assertTrue(not tdf._same_data(dat_s, dat2) and
                    tdf._same_data(dat_n, dat2))

    tdf = TicDatFactory(parameters='*')
    dat = tdf.xls.create_tic_dat(filePath)
    self.assertTrue(dat.parameters.shape == (4, 2))

def testDupsOpalytics(self):
    if not self.can_run:
        return
    for hack in [True, False]:
        tdf = TicDatFactory(one=[["a"], ["b", "c"]],
                            two=[["a", "b"], ["c"]],
                            three=[["a", "b", "c"], []])
        tdf2 = TicDatFactory(**{t: [[], ["a", "b", "c"]]
                                for t in tdf.all_tables})
        td = tdf2.TicDat(**{t: [[1, 2, 1], [1, 2, 2], [2, 1, 3], [2, 2, 3],
                                [1, 2, 2], ["new", 1, 2]]
                            for t in tdf.all_tables})
        inputset = create_inputset_mock(tdf2, td, hack)
        pdf = PanDatFactory(**tdf.schema())
        # raw_data=True keeps the duplicate rows, raw_data=False scrubs them
        panDat = pdf.opalytics.create_pan_dat(inputset, raw_data=True)
        self.assertTrue(all(len(getattr(panDat, t)) == 6
                            for t in tdf.all_tables))
        panDat = pdf.opalytics.create_pan_dat(inputset, raw_data=False)
        self.assertTrue(all(len(getattr(panDat, t)) < 6
                            for t in tdf.all_tables))
        td_1 = tdf.TicDat(**{t: [[1, 2, 1], [1, 2, 2], [2, 1, 3], [2, 2, 3],
                                 [1, 2, 2], ["new", 1, 2]]
                             for t in tdf.all_tables})
        td_2 = pdf.copy_to_tic_dat(panDat)
        self.assertTrue(all(set(getattr(td_1, t)) == set(getattr(td_2, t))
                            for t in tdf.all_tables))

def test_empty_text_none(self):
    dir_path = os.path.join(_scratchDir, "empty_text")
    tdf = TicDatFactory(parameters=[["Key"], ["Value"]])
    dat_n = tdf.TicDat(parameters=[[None, 100], ["b", 10.01], ["three", 200],
                                   ["d", None]])
    dat_s = tdf.TicDat(parameters=[["", 100], ["b", 10.01], ["three", 200],
                                   ["d", ""]])

    def round_trip():
        tdf.csv.write_directory(dat_n, makeCleanDir(dir_path))
        return tdf.csv.create_tic_dat(dir_path)
    dat2 = round_trip()
    self.assertTrue(tdf._same_data(dat_s, dat2) and
                    not tdf._same_data(dat_n, dat2))

    tdf = TicDatFactory(parameters=[["Key"], ["Value"]])
    tdf.set_data_type("parameters", "Key", nullable=True)
    # this default alone will mess with number reading
    tdf.set_default_value("parameters", "Value", None)
    dat2 = round_trip()
    self.assertFalse(tdf._same_data(dat_s, dat2) or
                     tdf._same_data(dat_n, dat2))
    self.assertTrue(any(r["Value"] is None for r in dat2.parameters.values()))

    tdf = TicDatFactory(parameters=[["Key"], ["Value"]])
    tdf.set_data_type("parameters", "Key", nullable=True)
    tdf.set_data_type("parameters", "Value", nullable=True, must_be_int=True)
    dat2 = round_trip()
    self.assertTrue(not tdf._same_data(dat_s, dat2) and
                    tdf._same_data(dat_n, dat2))

def testDefaults(self):
    tdf = TicDatFactory(one=[["a"], ["b", "c"]],
                        two=[["a", "b"], ["c"]],
                        three=[["a", "b", "c"], []])
    dat = tdf.TicDat(one=[[1, 2, 3], [4, 5, 6]],
                     two=[[1, 2, 3], [4, 5, 6]],
                     three=[[1, 2, 3], [4, 5, 6]])
    filePath = makeCleanPath(os.path.join(_scratchDir, "defaults.sql"))
    tdf.sql.write_sql_file(dat, filePath)

    # read back with a schema that has an extra data field; the field is
    # populated with the (zero) default
    tdf2 = TicDatFactory(one=[["a"], ["b", "c"]],
                         two=[["a", "b"], ["c"]],
                         three=[["a", "b", "c"], ["d"]])
    dat2 = tdf2.TicDat(one=dat.one, two=dat.two,
                       three={k: {} for k in dat.three})
    dat22 = tdf2.sql.create_tic_dat_from_sql(filePath)
    self.assertTrue(tdf2._same_data(dat2, dat22))

    # same exercise with a non-trivial default
    tdf2 = TicDatFactory(one=[["a"], ["b", "c"]],
                         two=[["a", "b"], ["c"]],
                         three=[["a", "b", "c"], ["d"]])
    tdf2.set_default_value("three", "d", float("inf"))
    dat2_b = tdf2.TicDat(one=dat.one, two=dat.two,
                         three={k: {} for k in dat.three})
    dat22_b = tdf2.sql.create_tic_dat_from_sql(filePath)
    self.assertTrue(tdf2._same_data(dat2_b, dat22_b))
    self.assertFalse(tdf2._same_data(dat2, dat2_b))

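# A short sketch (not part of the original suite) of the default-value behavior
# testDefaults relies on: a row built without a value for a data field picks up
# the factory default, which set_default_value overrides.
def _sketch_default_values():
    tdf = TicDatFactory(three=[["a", "b", "c"], ["d"]])
    tdf.set_default_value("three", "d", float("inf"))
    dat = tdf.TicDat(three={(1, 2, 3): {}})  # no value supplied for "d"
    assert dat.three[1, 2, 3]["d"] == float("inf")
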
def testDictConstructions(self):
    tdf = TicDatFactory(**dietSchema())
    pdf = PanDatFactory(**dietSchema())
    ticDat = tdf.freeze_me(tdf.TicDat(**{t: getattr(dietData(), t)
                                         for t in tdf.primary_key_fields}))
    panDat = pan_dat_maker(dietSchema(), ticDat)
    panDat2 = pdf.PanDat(**{t: getattr(panDat, t).to_dict()
                            for t in pdf.all_tables})
    panDat3 = pdf.PanDat(**{t: getattr(panDat, t).to_dict(orient="list")
                            for t in pdf.all_tables})
    panDat3_1 = pdf.PanDat(**{t: list(map(list,
                                          getattr(panDat, t).itertuples(index=False)))
                              for t in pdf.all_tables})
    self.assertTrue(all(pdf._same_data(panDat, _)
                        for _ in [panDat2, panDat3, panDat3_1]))
    # extra columns survive dict-based construction
    panDat.foods["extra"] = 12
    panDat4 = pdf.PanDat(**{t: getattr(panDat, t).to_dict(orient="list")
                            for t in pdf.all_tables})
    self.assertTrue(pdf._same_data(panDat, panDat4))
    self.assertTrue(set(panDat4.foods["extra"]) == {12})

    tdf = TicDatFactory(**netflowSchema())
    pdf = PanDatFactory(**netflowSchema())
    ticDat = tdf.freeze_me(tdf.TicDat(**{t: getattr(netflowData(), t)
                                         for t in tdf.primary_key_fields}))
    panDat = pan_dat_maker(netflowSchema(), ticDat)
    panDat2 = pdf.PanDat(**{t: getattr(panDat, t).to_dict()
                            for t in pdf.all_tables})
    panDat3 = pdf.PanDat(**{t: getattr(panDat, t).to_dict(orient="records")
                            for t in pdf.all_tables})
    self.assertTrue(all(pdf._same_data(panDat, _)
                        for _ in [panDat2, panDat3]))
    panDat.cost["extra"] = "boger"
    panDat4 = pdf.PanDat(**{t: getattr(panDat, t).to_dict(orient="list")
                            for t in pdf.all_tables})
    self.assertTrue(pdf._same_data(panDat, panDat4))
    self.assertTrue(set(panDat4.cost["extra"]) == {"boger"})

def testNetflow(self):
    if not self.can_run:
        return
    tdf = TicDatFactory(**netflowSchema())
    ticDat = tdf.freeze_me(tdf.TicDat(**{t: getattr(netflowData(), t)
                                         for t in tdf.primary_key_fields}))
    self._test_generic_copy(ticDat, tdf)
    self._test_generic_copy(ticDat, tdf, ["arcs", "nodes"])
    filePath = os.path.join(_scratchDir, "netflow.xls")
    tdf.xls.write_file(ticDat, filePath)
    xlsTicDat = tdf.xls.create_tic_dat(filePath, freeze_it=True)
    self.assertTrue(tdf._same_data(ticDat, xlsTicDat))
    tdf.xls.write_file(ticDat, filePath + "x")
    self.assertTrue(tdf._same_data(ticDat, tdf.xls.create_tic_dat(filePath + "x")))

    def changeIt():
        xlsTicDat.inflow['Pencils', 'Boston']["quantity"] = 12
    self.assertTrue(self.firesException(changeIt))
    self.assertTrue(tdf._same_data(ticDat, xlsTicDat))

    xlsTicDat = tdf.xls.create_tic_dat(filePath)
    self.assertTrue(tdf._same_data(ticDat, xlsTicDat))
    self.assertFalse(self.firesException(changeIt))
    self.assertFalse(tdf._same_data(ticDat, xlsTicDat))
    self.assertFalse(tdf.xls.find_duplicates(filePath))

    pkHacked = netflowSchema()
    pkHacked["nodes"][0] = ["nimrod"]
    tdfHacked = TicDatFactory(**pkHacked)
    self.assertTrue(self.firesException(
        lambda: tdfHacked.xls.write_file(ticDat, filePath)))
    tdfHacked.xls.write_file(ticDat, filePath, allow_overwrite=True)
    self.assertTrue("nodes : name" in self.firesException(
        lambda: tdf.xls.create_tic_dat(filePath)))

    ticDat = tdf.TicDat(**{t: getattr(netflowData(), t)
                           for t in tdf.primary_key_fields})
    ticDat.arcs["Detroit", "Boston"] = float("inf")
    ticDat.cost['Pencils', 'Detroit', 'Boston'] = -float("inf")
    tdf.xls.write_file(ticDat, makeCleanPath(filePath))
    xlsTicDat = tdf.xls.create_tic_dat(filePath, freeze_it=True)
    self.assertTrue(tdf._same_data(ticDat, xlsTicDat))
    tdf.xls.write_file(ticDat, filePath + "x", allow_overwrite=True)
    self.assertTrue(tdf._same_data(ticDat, tdf.xls.create_tic_dat(filePath + "x")))
    self.assertFalse(tdf._same_data(
        ticDat,
        tdf.xls.create_tic_dat(filePath + "x", treat_inf_as_infinity=False)))

def testIssue45(self):
    pdf = PanDatFactory(data=[["a"], ["b"]])
    tdf = TicDatFactory(**pdf.schema())
    dat_nums = tdf.copy_to_pandas(tdf.TicDat(data=[[1, 2], [3, 4], [22, 44]]),
                                  drop_pk_columns=False)
    dat_strs = tdf.copy_to_pandas(
        tdf.TicDat(data=[["1", "2"], ["3", "4"], ["022", "0044"]]),
        drop_pk_columns=False)
    files = [os.path.join(_scratchDir, _)
             for _ in ["dat_nums.xlsx", "dat_strs.xlsx"]]
    pdf.xls.write_file(dat_nums, files[0])
    pdf.xls.write_file(dat_strs, files[1])
    dat_nums_2, dat_strs_2 = [pdf.xls.create_pan_dat(_) for _ in files]
    self.assertTrue(pdf._same_data(dat_nums, dat_nums_2))
    # this is pandas pushing things to be numeric
    self.assertFalse(pdf._same_data(dat_strs, dat_strs_2))
    self.assertTrue(pdf._same_data(dat_nums, dat_strs_2))

    pdf = PanDatFactory(data=[["a"], ["b"]])
    pdf.set_data_type("data", "a", number_allowed=False, strings_allowed='*')
    dat_mixed = tdf.copy_to_pandas(
        tdf.TicDat(data=[["1", 2], ["3", 4], ["022", 44]]),
        drop_pk_columns=False)
    dat_nums_2, dat_strs_2 = [pdf.xls.create_pan_dat(_) for _ in files]
    self.assertFalse(pdf._same_data(dat_nums, dat_nums_2))
    self.assertFalse(pdf._same_data(dat_strs, dat_strs_2))
    self.assertFalse(pdf._same_data(dat_nums_2, dat_mixed))
    self.assertTrue(pdf._same_data(dat_strs_2, dat_mixed))

    pdf = PanDatFactory(data=[["a"], ["b"]])
    csv_dirs = [os.path.join(_scratchDir, _)
                for _ in ["dat_nums_csv", "dat_strs_csv"]]
    pdf.csv.write_directory(dat_nums, csv_dirs[0])
    pdf.csv.write_directory(dat_strs, csv_dirs[1])
    dat_nums_2, dat_strs_2 = [pdf.csv.create_pan_dat(_) for _ in csv_dirs]
    self.assertTrue(pdf._same_data(dat_nums, dat_nums_2))
    # this is pandas pushing things to be numeric
    self.assertFalse(pdf._same_data(dat_strs, dat_strs_2))
    self.assertTrue(pdf._same_data(dat_nums, dat_strs_2))

    pdf = PanDatFactory(data=[["a"], ["b"]])
    pdf.set_data_type("data", "a", number_allowed=False, strings_allowed='*')
    dat_nums_2, dat_strs_2 = [pdf.csv.create_pan_dat(_) for _ in csv_dirs]
    self.assertFalse(pdf._same_data(dat_nums, dat_nums_2))
    self.assertFalse(pdf._same_data(dat_strs, dat_strs_2))
    self.assertFalse(pdf._same_data(dat_nums_2, dat_strs_2))
    self.assertTrue(pdf._same_data(dat_strs_2, dat_mixed))

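# The "pandas pushing things to be numeric" comments above come down to type
# inference on read; a plain-pandas sketch (not ticdat-specific):
def _sketch_pandas_numeric_coercion():
    import io
    import pandas as pd
    csv_text = "a\n022\n"
    assert pd.read_csv(io.StringIO(csv_text))["a"].iloc[0] == 22  # inferred numeric
    assert pd.read_csv(io.StringIO(csv_text), dtype=str)["a"].iloc[0] == "022"
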
def testBooleansAndNulls(self):
    tdf = TicDatFactory(table=[["field one"], ["field two"]])
    dat = tdf.TicDat(table=[[None, 100], [200, True], [False, 300],
                            [300, None], [400, False]])
    file_one = os.path.join(_scratchDir, "boolDefaults_1.json")
    file_two = os.path.join(_scratchDir, "boolDefaults_2.json")
    tdf.json.write_file(dat, file_one, verbose=True)
    tdf.json.write_file(dat, file_two, verbose=False)
    dat_1 = tdf.json.create_tic_dat(file_one)
    dat_2 = tdf.json.create_tic_dat(file_two)
    self.assertTrue(tdf._same_data(dat, dat_1))
    self.assertTrue(tdf._same_data(dat, dat_2))

    tdf = TicDatFactory(table=[["field one"], ["field two"]])
    for f in ["field one", "field two"]:
        tdf.set_data_type("table", f, max=float("inf"), inclusive_max=True)
    tdf.set_infinity_io_flag(None)
    dat_inf = tdf.TicDat(table=[[float("inf"), 100], [200, True],
                                [False, 300], [300, float("inf")],
                                [400, False]])
    dat_1 = tdf.json.create_tic_dat(file_one)
    dat_2 = tdf.json.create_tic_dat(file_two)
    self.assertTrue(tdf._same_data(dat_inf, dat_1))
    self.assertTrue(tdf._same_data(dat_inf, dat_2))
    tdf.json.write_file(dat_inf, file_one, verbose=True, allow_overwrite=True)
    tdf.json.write_file(dat_inf, file_two, verbose=False, allow_overwrite=True)
    dat_1 = tdf.json.create_tic_dat(file_one)
    dat_2 = tdf.json.create_tic_dat(file_two)
    self.assertTrue(tdf._same_data(dat_inf, dat_1))
    self.assertTrue(tdf._same_data(dat_inf, dat_2))

    tdf = TicDatFactory(table=[["field one"], ["field two"]])
    for f in ["field one", "field two"]:
        tdf.set_data_type("table", f, min=-float("inf"), inclusive_min=True)
    tdf.set_infinity_io_flag(None)
    dat_1 = tdf.json.create_tic_dat(file_one)
    dat_2 = tdf.json.create_tic_dat(file_two)
    self.assertFalse(tdf._same_data(dat_inf, dat_1))
    self.assertFalse(tdf._same_data(dat_inf, dat_2))
    dat_inf = tdf.TicDat(table=[[float("-inf"), 100], [200, True],
                                [False, 300], [300, -float("inf")],
                                [400, False]])
    self.assertTrue(tdf._same_data(dat_inf, dat_1))
    self.assertTrue(tdf._same_data(dat_inf, dat_2))

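# What the infinity round trips above demonstrate, reduced to an illustrative
# rule (an assumption drawn only from these assertions, not from the ticdat
# internals): with set_infinity_io_flag(None), infinities are written as nulls,
# and a null reads back as whichever infinity the field's data type admits as
# an inclusive bound.
def _sketch_null_infinity_rule(value, inf_is_inclusive_max, neg_inf_is_inclusive_min):
    if value is None and inf_is_inclusive_max:
        return float("inf")
    if value is None and neg_inf_is_inclusive_min:
        return float("-inf")
    return value
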
def testSillyTwoTables(self):
    if not self.can_run:
        return
    for hack, raw_data in list(product(*(([True, False],) * 2))):
        tdf = TicDatFactory(**sillyMeSchema())
        ticDat = tdf.TicDat(**sillyMeData())
        self.assertTrue(tdf._same_data(ticDat, tdf.opalytics.create_tic_dat(
            create_inputset_mock(tdf, ticDat, hack), raw_data=raw_data)))
        ticDat = tdf.TicDat(**sillyMeDataTwoTables())
        self.assertTrue(tdf._same_data(ticDat, tdf.opalytics.create_tic_dat(
            create_inputset_mock(tdf, ticDat, hack), raw_data=raw_data)))

def testIssue45(self):
    tdf = TicDatFactory(data=[["a"], ["b"]])
    dat_nums = tdf.TicDat(data=[[1, 2], [3, 4], [22, 44]])
    dat_strs = tdf.TicDat(data=[["1", "2"], ["3", "4"], ["022", "0044"]])
    files = [os.path.join(_scratchDir, _)
             for _ in ["dat_nums.xlsx", "dat_strs.xlsx"]]
    tdf.xls.write_file(dat_nums, files[0])
    tdf.xls.write_file(dat_strs, files[1])
    dat_nums_2, dat_strs_2 = [tdf.xls.create_tic_dat(_) for _ in files]
    self.assertTrue(tdf._same_data(dat_nums, dat_nums_2))
    self.assertTrue(tdf._same_data(dat_strs, dat_strs_2))

def testSqlSpaceyTwo(self):
    if not self.can_run:
        return
    self.assertTrue(pandatio.sql, "this unit test requires SQLite installed")
    tdf = TicDatFactory(**spacesSchema())
    pdf = PanDatFactory(**spacesSchema())
    ticDat = tdf.TicDat(**{"a_table": {1: [1, 2, "3"],
                                       22.2: (12, 0.12, "something"),
                                       0.23: (11, 12, "thirt")},
                           "b_table": {(1, 2, "foo"): 1,
                                       (1012.22, 4, "0012"): 12},
                           "c_table": (("this", 2, 3, 4),
                                       ("that", 102.212, 3, 5.5),
                                       ("another", 5, 12.5, 24))})
    panDat = pan_dat_maker(spacesSchema(), ticDat)
    ext = ".db"
    filePath = os.path.join(_scratchDir, "spaces_2%s" % ext)
    with pandatio.sql.connect(filePath) as con:
        pdf.sql.write_file(panDat, db_file_path=None, con=con,
                           case_space_table_names=True)
    with pandatio.sql.connect(filePath) as con:
        panDat2 = pdf.sql.create_pan_dat(db_file_path=None, con=con)
    self.assertTrue(pdf._same_data(panDat, panDat2))

    tdf = TicDatFactory(**netflowSchema())
    pdf = PanDatFactory(**netflowSchema())
    ticDat = tdf.freeze_me(tdf.TicDat(**{t: getattr(netflowData(), t)
                                         for t in tdf.primary_key_fields}))
    panDat = pan_dat_maker(netflowSchema(), ticDat)
    filePath = os.path.join(_scratchDir, "spaces_2_2%s" % ext)
    with pandatio.sql.connect(filePath) as con:
        pdf.sql.write_file(panDat, db_file_path="", con=con,
                           case_space_table_names=True)
    with pandatio.sql.connect(filePath) as con:
        panDat2 = pdf.sql.create_pan_dat(None, con)
    self.assertTrue(pdf._same_data(panDat, panDat2))

def testXlsSimple(self):
    if not self.can_run:
        return
    tdf = TicDatFactory(**dietSchema())
    pdf = PanDatFactory(**dietSchema())
    ticDat = tdf.freeze_me(tdf.TicDat(**{t: getattr(dietData(), t)
                                         for t in tdf.primary_key_fields}))
    panDat = pan_dat_maker(dietSchema(), ticDat)
    filePath = os.path.join(_scratchDir, "diet.xlsx")
    pdf.xls.write_file(panDat, filePath)
    xlsPanDat = pdf.xls.create_pan_dat(filePath)
    self.assertTrue(pdf._same_data(panDat, xlsPanDat))

    # a factory missing one table can still read the full file
    pdf_shrunk = PanDatFactory(**{k: v for k, v in dietSchema().items()
                                  if k != "nutritionQuantities"})
    self.assertTrue(len(pdf_shrunk.all_tables) == len(pdf.all_tables) - 1)
    xlsPanDatShrunk = pdf_shrunk.xls.create_pan_dat(filePath)
    self.assertTrue(pdf_shrunk._same_data(panDat, xlsPanDatShrunk))
    filePathShrunk = os.path.join(_scratchDir, "diet_shrunk.xlsx")
    self.assertTrue(self.firesException(
        lambda: pdf.xls.create_pan_dat(filePathShrunk)))
    pdf_shrunk.xls.write_file(panDat, filePathShrunk)
    xlsPanDatShrunk = pdf.xls.create_pan_dat(filePathShrunk)
    self.assertTrue(pdf_shrunk._same_data(panDat, xlsPanDatShrunk))

    pdf2 = PanDatFactory(**{t: '*' for t in pdf.all_tables})
    pdf2.xls.write_file(panDat, filePath)
    xlsPanDat = pdf2.xls.create_pan_dat(filePath)
    self.assertTrue(pdf._same_data(panDat, xlsPanDat))

    tdf = TicDatFactory(**netflowSchema())
    pdf = PanDatFactory(**netflowSchema())
    ticDat = tdf.freeze_me(tdf.TicDat(**{t: getattr(netflowData(), t)
                                         for t in tdf.primary_key_fields}))
    panDat = pan_dat_maker(netflowSchema(), ticDat)
    filePath = os.path.join(_scratchDir, "netflow.xlsx")
    pdf.xls.write_file(panDat, filePath)
    panDat2 = pdf.xls.create_pan_dat(filePath)
    self.assertTrue(pdf._same_data(panDat, panDat2))
    pdf2 = PanDatFactory(**{t: '*' for t in pdf.all_tables})
    xlsPanDat = pdf2.xls.create_pan_dat(filePath)
    self.assertTrue(pdf._same_data(panDat, xlsPanDat))

def testCsvSimple(self):
    if not self.can_run:
        return
    tdf = TicDatFactory(**dietSchema())
    pdf = PanDatFactory(**dietSchema())
    ticDat = tdf.freeze_me(tdf.TicDat(**{t: getattr(dietData(), t)
                                         for t in tdf.primary_key_fields}))
    panDat = pan_dat_maker(dietSchema(), ticDat)
    dirPath = os.path.join(_scratchDir, "diet_csv")
    pdf.csv.write_directory(panDat, dirPath)
    panDat2 = pdf.csv.create_pan_dat(dirPath)
    self.assertTrue(pdf._same_data(panDat, panDat2))
    pdf2 = PanDatFactory(**{t: '*' for t in pdf.all_tables})
    panDat2 = pdf2.csv.create_pan_dat(dirPath)
    self.assertTrue(pdf._same_data(panDat, panDat2))

    tdf = TicDatFactory(**netflowSchema())
    pdf = PanDatFactory(**netflowSchema())
    ticDat = tdf.freeze_me(tdf.TicDat(**{t: getattr(netflowData(), t)
                                         for t in tdf.primary_key_fields}))
    panDat = pan_dat_maker(netflowSchema(), ticDat)
    dirPath = os.path.join(_scratchDir, "netflow_csv")
    pdf.csv.write_directory(panDat, dirPath)
    panDat2 = pdf.csv.create_pan_dat(dirPath)
    self.assertTrue(pdf._same_data(panDat, panDat2))
    pdf2 = PanDatFactory(**{t: '*' for t in pdf.all_tables})
    pdf2.csv.write_directory(panDat, dirPath)
    panDat2 = pdf2.csv.create_pan_dat(dirPath)
    self.assertTrue(pdf._same_data(panDat, panDat2))

    # the decimal separator used for writing must also be used for reading
    tdf = TicDatFactory(**dietSchema())
    pdf = PanDatFactory(**dietSchema())
    ticDat = tdf.freeze_me(tdf.TicDat(**{t: getattr(dietData(), t)
                                         for t in tdf.primary_key_fields}))
    panDat = pan_dat_maker(dietSchema(), ticDat)
    dirPath = os.path.join(_scratchDir, "diet_csv")
    pdf.csv.write_directory(panDat, dirPath, decimal=",")
    panDat2 = pdf.csv.create_pan_dat(dirPath)
    self.assertFalse(pdf._same_data(panDat, panDat2))
    panDat2 = pdf.csv.create_pan_dat(dirPath, decimal=",")
    self.assertTrue(pdf._same_data(panDat, panDat2))

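# A plain-pandas sketch (not ticdat-specific) of the decimal="," round trip
# exercised above: writer and reader must agree on the separator, or floats
# come back mangled.
def _sketch_decimal_round_trip():
    import io
    import pandas as pd
    buf = io.StringIO()
    # the float is rendered as 1,5 (quoted as needed to survive the comma sep)
    pd.DataFrame({"x": [1.5]}).to_csv(buf, index=False, decimal=",")
    buf.seek(0)
    assert pd.read_csv(buf, decimal=",")["x"].iloc[0] == 1.5
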
def testColumnsWithoutData(self):
    tdf = TicDatFactory(data=[["a"], ["b"]])
    for x in ["", "x"]:
        file = os.path.join(_scratchDir, "no_data.xls" + x)
        tdf.xls.write_file(tdf.TicDat(), file)
        dat = tdf.xls.create_tic_dat(file)
        self.assertFalse(dat._len_dict())

def testWeirdDiets(self):
    if not self.can_run:
        return
    filePath = os.path.join(_scratchDir, "weirdDiet.db")
    tdf = TicDatFactory(**dietSchema())
    ticDat = tdf.freeze_me(tdf.TicDat(**{t: getattr(dietData(), t)
                                         for t in tdf.primary_key_fields}))

    tdf2 = TicDatFactory(**dietSchemaWeirdCase())
    dat2 = copyDataDietWeirdCase(ticDat)
    tdf2.sql.write_db_data(dat2, filePath, allow_overwrite=True)
    self.assertFalse(tdf2.sql.find_duplicates(filePath))
    sqlTicDat = tdf.sql.create_tic_dat(filePath)
    self.assertTrue(tdf._same_data(ticDat, sqlTicDat))

    tdf3 = TicDatFactory(**dietSchemaWeirdCase2())
    dat3 = copyDataDietWeirdCase2(ticDat)
    tdf3.sql.write_db_data(dat3, makeCleanPath(filePath))
    with sql.connect(filePath) as con:
        con.execute("ALTER TABLE nutrition_quantities "
                    "RENAME TO [nutrition quantities]")
    sqlTicDat2 = tdf3.sql.create_tic_dat(filePath)
    self.assertTrue(tdf3._same_data(dat3, sqlTicDat2))
    # an ambiguous table name (with and without the space) should fail
    with sql.connect(filePath) as con:
        con.execute("create table nutrition_quantities(boger)")
    self.assertTrue(self.firesException(lambda: tdf3.sql.create_tic_dat(filePath)))

def testNetflow(self):
    if not _can_unit_test:
        return
    tdf = TicDatFactory(**netflowSchema())
    addNetflowForeignKeys(tdf)
    ticDat = tdf.freeze_me(tdf.TicDat(**{t: getattr(netflowData(), t)
                                         for t in tdf.all_tables}))
    filePath = "netflow.accdb"
    self.assertFalse(tdf.mdb.find_duplicates(filePath))
    mdbTicDat = tdf.mdb.create_tic_dat(filePath, freeze_it=True)
    self.assertTrue(tdf._same_data(ticDat, mdbTicDat))

    def changeIt():
        mdbTicDat.inflow['Pencils', 'Boston']["quantity"] = 12
    self.assertTrue(self.firesException(changeIt))
    self.assertTrue(tdf._same_data(ticDat, mdbTicDat))

    mdbTicDat = tdf.mdb.create_tic_dat(filePath)
    self.assertTrue(tdf._same_data(ticDat, mdbTicDat))
    self.assertFalse(self.firesException(changeIt))
    self.assertFalse(tdf._same_data(ticDat, mdbTicDat))

    pkHacked = netflowSchema()
    pkHacked["nodes"][0] = ["nimrod"]
    tdfHacked = TicDatFactory(**pkHacked)
    self.assertTrue("Unable to recognize field nimrod in table nodes" in
                    self.firesException(
                        lambda: tdfHacked.mdb.create_tic_dat(filePath)))

def testSpacey(self):
    if not _can_unit_test:
        return
    tdf = TicDatFactory(**spacesSchema())
    spacesData = {
        "a_table": {1: {"a Data 3": 3, "a Data 2": 2, "a Data 1": 1},
                    22: (1.1, 12, 12),
                    0.23: (11, 12, 11)},
        "b_table": {("1", "2", "3"): 1, ("a", "b", "b"): 12},
        "c_table": (("1", "2", "3", 4),
                    {"c Data 4": 55, "c Data 2": "b", "c Data 3": "c",
                     "c Data 1": "a"},
                    ("a", "b", "12", 24))}
    dat = tdf.TicDat(**spacesData)
    # reads from a pre-built read-only .accdb test file
    filePath = "spaces.accdb"
    self.assertFalse(tdf.mdb.find_duplicates(filePath))
    dat2 = tdf.mdb.create_tic_dat(filePath, freeze_it=True)
    self.assertTrue(tdf._same_data(dat, dat2))

def testWeirdDiets(self):
    if not _can_accdb_unit_test:
        return
    filePath = os.path.join(_scratchDir, "weirdDiet.accdb")
    tdf = TicDatFactory(**dietSchema())
    ticDat = tdf.freeze_me(tdf.TicDat(**{t: getattr(dietData(), t)
                                         for t in tdf.primary_key_fields}))

    tdf2 = TicDatFactory(**dietSchemaWeirdCase())
    dat2 = copyDataDietWeirdCase(ticDat)
    tdf2.mdb.write_file(dat2, filePath, allow_overwrite=True)
    accdbTicDat = tdf.mdb.create_tic_dat(filePath)
    self.assertTrue(tdf._same_data(ticDat, accdbTicDat))

    tdf3 = TicDatFactory(**dietSchemaWeirdCase2())
    dat3 = copyDataDietWeirdCase2(ticDat)
    tdf3.mdb.write_file(dat3, makeCleanPath(filePath))
    with py.connect(_connection_str(filePath)) as con:
        con.cursor().execute("SELECT * INTO [nutrition quantities] "
                             "FROM nutrition_quantities").commit()
        con.cursor().execute("DROP TABLE nutrition_quantities").commit()
    accdbTicDat2 = tdf3.mdb.create_tic_dat(filePath)
    self.assertTrue(tdf3._same_data(dat3, accdbTicDat2))
    # an ambiguous table name (with and without the space) should fail
    with py.connect(_connection_str(filePath)) as con:
        con.cursor().execute("create table nutrition_quantities (boger int)").commit()
    self.assertTrue(self.firesException(lambda: tdf3.mdb.create_tic_dat(filePath)))

def testSpacey(self):
    if not _can_accdb_unit_test:
        return
    tdf = TicDatFactory(**spacesSchema())
    spacesData = {
        "a_table": {1: {"a Data 3": 3, "a Data 2": 2, "a Data 1": 1},
                    22: (1.1, 12, 12),
                    0.23: (11, 12, 11)},
        "b_table": {("1", "2", "3"): 1, ("a", "b", "b"): 12},
        "c_table": (("1", "2", "3", 4),
                    {"c Data 4": 55, "c Data 2": "b", "c Data 3": "c",
                     "c Data 1": "a"},
                    ("a", "b", "12", 24))}
    dat = tdf.TicDat(**spacesData)
    filePath = makeCleanPath(os.path.join(_scratchDir, "spacey.accdb"))
    tdf.mdb.write_schema(filePath,
                         a_table={"a Field": "double"},
                         c_table={"c Data 1": "text", "c Data 2": "text",
                                  "c Data 3": "text", "c Data 4": "int"})
    tdf.mdb.write_file(dat, filePath)
    self.assertFalse(tdf.mdb.find_duplicates(filePath))
    dat2 = tdf.mdb.create_tic_dat(filePath, freeze_it=True)
    self.assertTrue(tdf._same_data(dat, dat2))
    # rename each table to its space-separated variant and re-read
    with py.connect(_connection_str(filePath)) as con:
        for t in tdf.all_tables:
            con.cursor().execute("SELECT * INTO [%s] FROM %s" %
                                 (t.replace("_", " "), t)).commit()
            con.cursor().execute("DROP TABLE %s" % t).commit()
    # shutil.copy(filePath, "spaces.accdb")  # uncomment to make readonly test file as .accdb
    dat3 = tdf.mdb.create_tic_dat(filePath, freeze_it=True)
    self.assertTrue(tdf._same_data(dat, dat3))

def testDiet(self):
    if not _can_accdb_unit_test:
        return
    tdf = TicDatFactory(**dietSchema())
    ticDat = tdf.freeze_me(tdf.TicDat(**{t: getattr(dietData(), t)
                                         for t in tdf.primary_key_fields}))
    filePath = makeCleanPath(os.path.join(_scratchDir, "diet.accdb"))
    tdf.mdb.write_file(ticDat, filePath)
    # shutil.copy(filePath, "diet.accdb")  # uncomment to make readonly test file as .accdb
    self.assertFalse(tdf.mdb.find_duplicates(filePath))
    accdbTicDat = tdf.mdb.create_tic_dat(filePath)
    self.assertTrue(tdf._same_data(ticDat, accdbTicDat))

    def changeit():
        accdbTicDat.categories["calories"]["minNutrition"] = 12
    changeit()
    self.assertFalse(tdf._same_data(ticDat, accdbTicDat))

    self.assertTrue(self.firesException(
        lambda: tdf.mdb.write_file(ticDat, filePath)))
    tdf.mdb.write_file(ticDat, filePath, allow_overwrite=True)
    accdbTicDat = tdf.mdb.create_tic_dat(filePath, freeze_it=True)
    self.assertTrue(tdf._same_data(ticDat, accdbTicDat))
    self.assertTrue(self.firesException(changeit))
    self.assertTrue(tdf._same_data(ticDat, accdbTicDat))

def testDups(self):
    if not _can_accdb_unit_test:
        return
    tdf = TicDatFactory(one=[["a"], ["b", "c"]],
                        two=[["a", "b"], ["c"]],
                        three=[["a", "b", "c"], []])
    tdf2 = TicDatFactory(**{t: [[], ["a", "b", "c"]] for t in tdf.all_tables})
    td = tdf2.TicDat(**{t: [[1, 2, 1], [1, 2, 2], [2, 1, 3], [2, 2, 3],
                            [1, 2, 2], [11, 1, 2]]
                        for t in tdf.all_tables})
    f = makeCleanPath(os.path.join(_scratchDir, "testDups.accdb"))
    tdf2.mdb.write_file(td, f)
    # shutil.copy(f, "dups.accdb")  # uncomment to make readonly test file as .accdb
    dups = tdf.mdb.find_duplicates(f)
    self.assertTrue(dups == {'three': {(1, 2, 2): 2},
                             'two': {(1, 2): 3},
                             'one': {1: 3, 2: 2}})