def _xls_write(self, tic_dat, file_path, tbl_name_mapping):
    verify(xlwt, "Can't write .xls files because xlwt package isn't installed.")
    tdf = self.tic_dat_factory
    book = xlwt.Workbook()
    for t in sorted(sorted(tdf.all_tables),
                    key=lambda x: len(tdf.primary_key_fields.get(x, ()))):
        sheet = book.add_sheet(tbl_name_mapping[t][:_longest_sheet])
        for i, f in enumerate(tdf.primary_key_fields.get(t, ()) +
                              tdf.data_fields.get(t, ())):
            sheet.write(0, i, f)
        _t = getattr(tic_dat, t)
        if utils.dictish(_t):
            for row_ind, (p_key, data) in enumerate(_t.items()):
                for field_ind, cell in enumerate(
                        (p_key if containerish(p_key) else (p_key,)) +
                        tuple(data[_f] for _f in tdf.data_fields.get(t, ()))):
                    sheet.write(row_ind + 1, field_ind, cell)
        else:
            for row_ind, data in enumerate(_t if containerish(_t) else _t()):
                for field_ind, cell in enumerate(
                        tuple(data[_f] for _f in tdf.data_fields[t])):
                    sheet.write(row_ind + 1, field_ind, cell)
    if os.path.exists(file_path):
        os.remove(file_path)
    book.save(file_path)
def write_directory(self, tic_dat, dir_path, allow_overwrite=False, dialect='excel',
                    write_header=True):
    """
    write the ticDat data to a collection of csv files

    :param tic_dat: the data object

    :param dir_path: the directory in which to write the csv files

    :param allow_overwrite: boolean - are we allowed to overwrite existing files?

    :param dialect: the csv dialect. Consult csv documentation for details.

    :param write_header: Boolean. Should the header information be written as the
                         first row?

    :return:
    """
    verify(csv, "csv needs to be installed to use this subroutine")
    verify(dialect in csv.list_dialects(), "Invalid dialect %s" % dialect)
    verify(not os.path.isfile(dir_path), "A file is not a valid directory path")
    if self.tic_dat_factory.generic_tables:
        dat, tdf = create_generic_free(tic_dat, self.tic_dat_factory)
        return tdf.csv.write_directory(dat, dir_path, allow_overwrite, dialect,
                                       write_header)
    tdf = self.tic_dat_factory
    msg = []
    if not self.tic_dat_factory.good_tic_dat_object(tic_dat,
                                                    lambda m: msg.append(m)):
        raise TicDatError("Not a valid TicDat object for this schema : " +
                          " : ".join(msg))
    if not allow_overwrite:
        for t in tdf.all_tables:
            f = os.path.join(dir_path, t + ".csv")
            verify(not os.path.exists(f),
                   "The %s path exists and overwrite is not allowed" % f)
    if not os.path.isdir(dir_path):
        os.mkdir(dir_path)
    for t in tdf.all_tables:
        f = os.path.join(dir_path, t + ".csv")
        with open(f, 'w') as csvfile:
            writer = csv.DictWriter(
                csvfile, dialect=dialect,
                fieldnames=tdf.primary_key_fields.get(t, ()) +
                           tdf.data_fields.get(t, ()))
            if write_header:
                writer.writeheader()
            _t = getattr(tic_dat, t)
            if dictish(_t):
                for p_key, data_row in _t.items():
                    primaryKeyDict = {f: v for f, v in zip(
                        tdf.primary_key_fields[t],
                        p_key if containerish(p_key) else (p_key,))}
                    writer.writerow(dict(data_row, **primaryKeyDict))
            else:
                for data_row in (_t if containerish(_t) else _t()):
                    writer.writerow(dict(data_row))
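# Hedged usage sketch (illustrative, not from the source): writing a small TicDat
# to a directory of csv files via the csv factory attached to a TicDatFactory.
# The schema, data and directory name below are invented for demonstration.
from ticdat import TicDatFactory

_tdf_demo = TicDatFactory(foods=[["Name"], ["Cost"]],
                          categories=[["Name"], ["Min Nutrition", "Max Nutrition"]])
_dat_demo = _tdf_demo.TicDat(foods={"pizza": [2.5], "milk": [1.1]},
                             categories={"protein": [30, float("inf")]})
# creates foods.csv and categories.csv, headers in the first row
_tdf_demo.csv.write_directory(_dat_demo, "diet_csv_data", allow_overwrite=True)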
def _xlsx_write(self, tic_dat, file_path, tbl_name_mapping):
    verify(xlsx, "Can't write .xlsx files because xlsxwriter package isn't installed.")
    tdf = self.tic_dat_factory
    if os.path.exists(file_path):
        os.remove(file_path)
    book = xlsx.Workbook(file_path)
    def clean_for_write(t, f, x):
        if self.tic_dat_factory.infinity_io_flag != "N/A" or \
           (t == "parameters" and self.tic_dat_factory.parameters):
            return self.tic_dat_factory._infinity_flag_write_cell(t, f, x)
        if x in [float("inf"), -float("inf")] or isinstance(x, datetime.datetime):
            return str(x)
        return x
    for t in sorted(sorted(tdf.all_tables),
                    key=lambda x: len(tdf.primary_key_fields.get(x, ()))):
        all_flds = self.tic_dat_factory.primary_key_fields.get(t, ()) + \
                   self.tic_dat_factory.data_fields.get(t, ())
        sheet = book.add_worksheet(tbl_name_mapping[t][:_longest_sheet])
        for i, f in enumerate(tdf.primary_key_fields.get(t, ()) +
                              tdf.data_fields.get(t, ())):
            sheet.write(0, i, f)
        _t = getattr(tic_dat, t)
        if utils.dictish(_t):
            for row_ind, (p_key, data) in enumerate(_t.items()):
                for field_ind, cell in enumerate(
                        (p_key if containerish(p_key) else (p_key,)) +
                        tuple(data[_f] for _f in tdf.data_fields.get(t, ()))):
                    sheet.write(row_ind + 1, field_ind,
                                clean_for_write(t, all_flds[field_ind], cell))
        else:
            for row_ind, data in enumerate(_t if containerish(_t) else _t()):
                for field_ind, cell in enumerate(
                        tuple(data[_f] for _f in tdf.data_fields[t])):
                    sheet.write(row_ind + 1, field_ind,
                                clean_for_write(t, all_flds[field_ind], cell))
    book.close()
def set_data_type(self, table, field, number_allowed=True,
                  inclusive_min=True, inclusive_max=False, min=0, max=float("inf"),
                  must_be_int=False, strings_allowed=(), nullable=False):
    """
    sets the data type for a field. By default, fields don't have types.
    Adding a data type doesn't block data of the wrong type from being entered.
    Data types are useful for recognizing errant data entries with
    find_data_type_failures(). Errant data entries can be replaced with
    replace_data_type_failures().

    :param table: a table in the schema

    :param field: a data field for this table

    :param number_allowed: boolean : does this field allow numbers?

    :param inclusive_min: boolean : if numbers allowed, is the min inclusive?

    :param inclusive_max: boolean : if numbers allowed, is the max inclusive?

    :param min: if numbers allowed, the minimum value

    :param max: if numbers allowed, the maximum value

    :param must_be_int: boolean : if numbers allowed, must the number be integral?

    :param strings_allowed: if a collection - then a list of the strings allowed.
                            The empty collection prohibits strings.
                            If a "*", then any string is accepted.

    :param nullable: boolean : can this value contain null (aka None, aka nan,
                     since pandas treats null as nan)

    :return:
    """
    verify(not self._has_been_used,
           "The data types can't be changed after a PanDatFactory has been used.")
    verify(table in self.all_tables, "Unrecognized table name %s" % table)
    verify(table not in self.generic_tables, "Cannot set data type for generic table")
    verify(field in self.data_fields[table] + self.primary_key_fields[table],
           "%s does not refer to a field for %s" % (field, table))
    verify((strings_allowed == '*') or
           (containerish(strings_allowed) and
            all(utils.stringish(x) for x in strings_allowed)),
           """The strings_allowed argument should be a container of strings, or the single '*' character.""")
    if utils.containerish(strings_allowed):
        strings_allowed = tuple(strings_allowed)  # defensive copy
    if number_allowed:
        verify(utils.numericish(max), "max should be numeric")
        verify(utils.numericish(min), "min should be numeric")
        verify(max >= min, "max cannot be smaller than min")
        self._data_types[table][field] = TypeDictionary(
            number_allowed=True, strings_allowed=strings_allowed,
            nullable=bool(nullable), min=min, max=max,
            inclusive_min=bool(inclusive_min), inclusive_max=bool(inclusive_max),
            must_be_int=bool(must_be_int))
    else:
        self._data_types[table][field] = TypeDictionary(
            number_allowed=False, strings_allowed=strings_allowed,
            nullable=bool(nullable), min=0, max=float("inf"),
            inclusive_min=True, inclusive_max=True, must_be_int=False)
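# Hedged usage sketch (illustrative, not from the source): typing a quantity field
# so that find_data_type_failures() can flag negative or non-numeric entries.
# The schema and field names below are invented for demonstration.
from ticdat import PanDatFactory

_pdf_demo = PanDatFactory(nutrition_quantities=[["Food", "Category"], ["Qty"]])
# Qty must be a non-negative number; strings and nulls are rejected
_pdf_demo.set_data_type("nutrition_quantities", "Qty", number_allowed=True,
                        min=0, inclusive_min=True, max=float("inf"),
                        inclusive_max=False, strings_allowed=(), nullable=False)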
def assertTicDatTablesSame(t1, t2, _goodTicDatTable,
                           _assertTrue=assertTrue, _assertFalse=assertFalse):
    _assertTrue(set(t1) == set(t2))
    _assertTrue(_goodTicDatTable(t1) and _goodTicDatTable(t2))
    if not dictish(t1) and not dictish(t2):
        return
    if dictish(t1) != dictish(t2) and dictish(t2):
        t1, t2 = t2, t1
    if not dictish(t2):
        _assertTrue(all(containerish(x) and len(x) == 0 for x in t1.values()))
        return
    for k1, v1 in t1.items():
        v2 = t2[k1]
        if dictish(v1) != dictish(v2) and dictish(v2):
            v2, v1 = v1, v2
        if dictish(v1) and dictish(v2):
            _assertTrue(set(v1) == set(v2))
            for _k1 in v1:
                _assertTrue(v1[_k1] == v2[_k1])
        elif dictish(v1) and containerish(v2):
            _assertTrue(sorted(v1.values()) == sorted(v2))
        elif dictish(v1):
            # next(iter(...)) rather than v1.values()[0], which fails on Python 3
            _assertTrue(len(v1) == 1 and next(iter(v1.values())) == v2)
        else:
            if containerish(v1) != containerish(v2) and containerish(v2):
                v2, v1 = v1, v2
            if containerish(v1) and containerish(v2):
                _assertTrue(len(v1) == len(v2))
                _assertTrue(all(v1[x] == v2[x] for x in range(len(v1))))
            elif containerish(v1):
                _assertTrue(len(v1) == 1 and v1[0] == v2)
            else:
                _assertTrue(v1 == v2)
def copy_to_ampl(self, pan_dat, field_renamings=None, excluded_tables=None):
    """
    copies the pan_dat object into a new pan_dat object populated with
    amplpy.DataFrame objects. performs a deep copy

    :param pan_dat: a PanDat object

    :param field_renamings: dict or None. If fields are to be renamed in the copy,
                            then a mapping from (table_name, field_name) ->
                            new_field_name.
                            If a data field is to be omitted, then new_field can
                            be falsey.
                            table_name cannot refer to an excluded table. (see below)
                            field_name doesn't have to refer to an element of
                            self.data_fields[t], but it does have to refer to a
                            column in the pan_dat.table_name DataFrame.

    :param excluded_tables: If truthy, a list of tables to be excluded from the copy.
                            Tables without primary key fields are always excluded.

    :return: a deep copy of the pan_dat argument into amplpy.DataFrames
    """
    verify(amplpy, "amplpy needs to be installed in order to enable AMPL functionality")
    msg = []
    verify(self.good_pan_dat_object(pan_dat, msg.append),
           "pan_dat not a good object for this factory : %s" % "\n".join(msg))
    verify(not excluded_tables or
           (containerish(excluded_tables) and
            set(excluded_tables).issubset(self.all_tables)),
           "bad excluded_tables argument")
    copy_tables = {t for t in self.all_tables
                   if self.primary_key_fields[t]}.difference(excluded_tables or [])
    field_renamings = field_renamings or {}
    verify(dictish(field_renamings), "invalid field_renamings argument")
    for k, v in field_renamings.items():
        verify(containerish(k) and len(k) == 2 and k[0] in copy_tables and
               k[1] in getattr(pan_dat, k[0]).columns and
               ((v and utils.stringish(v)) or
                (not bool(v) and k[1] not in self.primary_key_fields[k[0]])),
               "invalid field_renamings argument %s:%s" % (k, v))
    class AmplPanDat(object):
        def __repr__(self):
            return "td:" + tuple(copy_tables).__repr__()
    rtn = AmplPanDat()
    for t in copy_tables:
        rename = lambda f: field_renamings.get((t, f), f)
        df_ampl = amplpy.DataFrame(index=tuple(map(rename,
                                                   self.primary_key_fields[t])))
        for f in self.primary_key_fields[t]:
            df_ampl.setColumn(rename(f), list(getattr(pan_dat, t)[f]))
        for f in {f for _t, f in field_renamings if _t == t}.union(self.data_fields[t]):
            if rename(f):
                df_ampl.addColumn(rename(f), list(getattr(pan_dat, t)[f]))
        setattr(rtn, t, df_ampl)
    return rtn
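# Hedged usage sketch (illustrative, not from the source): copying a PanDat into
# amplpy DataFrames, renaming the "Cost" data field to lower case on the way out.
# Assumes amplpy is installed; the schema and data are invented for demonstration.
from ticdat import PanDatFactory

_pdf_demo = PanDatFactory(foods=[["Name"], ["Cost"]])
_dat_demo = _pdf_demo.PanDat(foods={"Name": ["pizza", "milk"], "Cost": [2.5, 1.1]})
_ampl_dat = _pdf_demo.copy_to_ampl(_dat_demo,
                                   field_renamings={("foods", "Cost"): "cost"})
# _ampl_dat.foods is now an amplpy.DataFrame indexed by "Name" with column "cost"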
def assertTicDatTablesSame(t1, t2, _goodTicDatTable,
                           _assertTrue=assertTrue, _assertFalse=assertFalse):
    _assertTrue(set(t1) == set(t2))
    _assertTrue(_goodTicDatTable(t1) and _goodTicDatTable(t2))
    if not dictish(t1) and not dictish(t2):
        return
    if dictish(t1) != dictish(t2) and dictish(t2):
        t1, t2 = t2, t1
    if not dictish(t2):
        _assertTrue(all(containerish(x) and len(x) == 0 for x in t1.values()))
        return
    for k1, v1 in t1.items():
        v2 = t2[k1]
        if dictish(v1) != dictish(v2) and dictish(v2):
            v2, v1 = v1, v2
        if dictish(v1) and dictish(v2):
            _assertTrue(set(v1) == set(v2))
            for _k1 in v1:
                _assertTrue(v1[_k1] == v2[_k1])
        elif dictish(v1) and containerish(v2):
            _assertTrue(sorted(map(str, v1.values())) == sorted(map(str, v2)))
        elif dictish(v1):
            # next(iter(...)) rather than v1.values()[0], which fails on Python 3
            _assertTrue(len(v1) == 1 and next(iter(v1.values())) == v2)
        else:
            if containerish(v1) != containerish(v2) and containerish(v2):
                v2, v1 = v1, v2
            if containerish(v1) and containerish(v2):
                _assertTrue(len(v1) == len(v2))
                _assertTrue(all(v1[x] == v2[x] for x in range(len(v1))))
            elif containerish(v1):
                _assertTrue(len(v1) == 1 and v1[0] == v2)
            else:
                _assertTrue(v1 == v2)
def convert_to_dicts_that_can_be_turned_into_DataFrames(tdf, dat,
                                                        field_renamings=None):
    '''
    utility routine to help de-ticdat-ify small examples so that they can then be
    passed to the amplpy team in a more easily understood notebook example with
    hard coded data.

    the inner dicts returned below can each be passed as an argument to
    pandas.DataFrame, and from there the `set_ampl_data` logic can be broken out
    explicitly

    :param tdf: a TicDatFactory

    :param dat: a TicDat object created by tdf

    :param field_renamings: the same argument used by copy_to_ampl

    :return:
    '''
    field_renamings = field_renamings or {}  # normalize None so the assert below holds
    assert utils.dictish(field_renamings) and \
           all(utils.containerish(k) and len(k) == 2 and k[0] in tdf.all_tables and
               k[1] in tdf.primary_key_fields[k[0]] + tdf.data_fields[k[0]] and
               utils.stringish(v) and
               v not in tdf.primary_key_fields[k[0]] + tdf.data_fields[k[0]]
               for k, v in field_renamings.items()), "invalid field_renamings argument"
    dat = tdf.copy_to_pandas(dat, drop_pk_columns=False)
    def do_renames(t, df):
        for f in tdf.primary_key_fields[t] + tdf.data_fields[t]:
            if (t, f) in field_renamings:
                df[field_renamings[t, f]] = df[f]
                df.drop(f, axis=1, inplace=True)
        return df
    rtn = {t: do_renames(t, getattr(dat, t).reset_index(drop=True)).to_dict()
           for t in tdf.all_tables}
    return rtn
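# Hedged usage sketch (illustrative, not from the source): turning a tiny TicDat
# into plain dicts that reconstruct as DataFrames, so the data can be hard coded
# into a notebook. Schema and data are invented for demonstration.
from ticdat import TicDatFactory
import pandas as pd

_tdf_demo = TicDatFactory(foods=[["Name"], ["Cost"]])
_dat_demo = _tdf_demo.TicDat(foods={"pizza": [2.5]})
_dicts = convert_to_dicts_that_can_be_turned_into_DataFrames(_tdf_demo, _dat_demo)
_foods_df = pd.DataFrame(_dicts["foods"])  # columns "Name" and "Cost"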
def make_json_dict(tdf, tic_dat, verbose=False, use_infinity_io_flag_if_provided=False):
    assert tdf.good_tic_dat_object(tic_dat)
    def write_cell(t, f, x):
        if isinstance(x, datetime.datetime):
            return str(x)
        return x if not use_infinity_io_flag_if_provided else \
            tdf._infinity_flag_write_cell(t, f, x)
    jdict = defaultdict(list)
    for t in tdf.all_tables:
        all_fields = tdf.primary_key_fields.get(t, ()) + tdf.data_fields.get(t, ())
        def make_row(row):
            assert containerish(row) and len(row) == len(all_fields)
            row = [write_cell(t, f, x) for f, x in zip(all_fields, row)]
            return {f: v for f, v in zip(all_fields, row)} if verbose else row
        appender = lambda row: jdict[t].append(make_row(row))
        tbl = getattr(tic_dat, t)
        if tdf.primary_key_fields.get(t):
            for pk, data_row in tbl.items():
                appender((list(pk) if containerish(pk) else [pk]) +
                         [data_row[df] for df in tdf.data_fields[t]])
        else:
            for data_row in tbl:
                appender([data_row[df] for df in tdf.data_fields[t]])
    return jdict
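# Hedged sketch (illustrative, not from the source): with verbose=False each row
# comes back as a list ordered primary key fields first, then data fields; with
# verbose=True each row is a field -> value dict. Schema and data are invented.
from ticdat import TicDatFactory

_tdf_demo = TicDatFactory(foods=[["Name"], ["Cost"]])
_dat_demo = _tdf_demo.TicDat(foods={"pizza": [2.5]})
assert dict(make_json_dict(_tdf_demo, _dat_demo)) == {"foods": [["pizza", 2.5]]}
assert dict(make_json_dict(_tdf_demo, _dat_demo, verbose=True)) == \
       {"foods": [{"Name": "pizza", "Cost": 2.5}]}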
def _get_data(self, tic_dat, as_sql):
    rtn = []
    for t in self.tic_dat_factory.all_tables:
        _t = getattr(tic_dat, t)
        if dictish(_t):
            primarykeys = tuple(self.tic_dat_factory.primary_key_fields[t])
            for pkrow, sqldatarow in _t.items():
                _items = list(sqldatarow.items())
                fields = primarykeys + tuple(x[0] for x in _items)
                datarow = ((pkrow,) if len(primarykeys) == 1 else pkrow) + \
                          tuple(x[1] for x in _items)
                assert len(datarow) == len(fields)
                datarow = tuple(self._write_data_cell(t, f, x)
                                for f, x in zip(fields, datarow))
                # stmt rather than str, to avoid shadowing the builtin
                stmt = "INSERT INTO [%s] (%s) VALUES (%s)" % (
                    t, ",".join(_brackets(fields)),
                    ",".join("%s" if as_sql else "?" for _ in fields))
                if as_sql:
                    rtn.append((stmt % tuple(map(_insert_format, datarow))) + ";")
                else:
                    rtn.append((stmt, datarow))
        else:
            for sqldatarow in (_t if containerish(_t) else _t()):
                k, v = zip(*sqldatarow.items())
                stmt = "INSERT INTO [%s] (%s) VALUES (%s)" % (
                    t, ",".join(_brackets(k)),
                    ",".join(["%s" if as_sql else "?"] * len(sqldatarow)))
                if as_sql:
                    rtn.append((stmt % tuple(map(_insert_format, v))) + ";")
                else:
                    rtn.append((stmt, v))
    return tuple(rtn)
def testSilly(self):
    if not self.can_run:
        return
    tdf = TicDatFactory(**sillyMeSchema())
    ticDat = tdf.TicDat(**sillyMeData())
    schema2 = sillyMeSchema()
    schema2["b"][0] = ("bField2", "bField1", "bField3")
    schema3 = sillyMeSchema()
    schema3["a"][1] = ("aData2", "aData3", "aData1")
    schema4 = sillyMeSchema()
    schema4["a"][1] = ("aData1", "aData3")
    schema5 = sillyMeSchema()
    _tuple = lambda x: tuple(x) if utils.containerish(x) else (x,)
    for t in ("a", "b"):
        schema5[t][1] = _tuple(schema5[t][1]) + _tuple(schema5[t][0])
    schema5["a"][0], schema5["b"][0] = (), []
    schema6 = sillyMeSchema()
    schema6["d"] = [["dField"], ()]
    tdf2, tdf3, tdf4, tdf5, tdf6 = (TicDatFactory(**x) for x in
                                    (schema2, schema3, schema4, schema5, schema6))
    tdf5.set_generator_tables(("a", "c"))
    tdf5 = tdf5.clone()
    filePath = os.path.join(_scratchDir, "silly.db")
    tdf.sql.write_db_data(ticDat, filePath)
    self.assertFalse(tdf.sql.find_duplicates(filePath))
    ticDat2 = tdf2.sql.create_tic_dat(filePath)
    self.assertFalse(tdf._same_data(ticDat, ticDat2))
    ticDat3 = tdf3.sql.create_tic_dat(filePath)
    self.assertTrue(tdf._same_data(ticDat, ticDat3))
    ticDat4 = tdf4.sql.create_tic_dat(filePath)
    for t in ["a", "b"]:
        for k, v in getattr(ticDat4, t).items():
            for _k, _v in v.items():
                self.assertTrue(getattr(ticDat, t)[k][_k] == _v)
            if set(v) == set(getattr(ticDat, t)[k]):
                self.assertTrue(t == "b")
            else:
                self.assertTrue(t == "a")
    ticDat5 = tdf5.sql.create_tic_dat(filePath)
    self.assertTrue(tdf5._same_data(tdf._keyless(ticDat), ticDat5))
    self.assertTrue(callable(ticDat5.a) and callable(ticDat5.c) and
                    not callable(ticDat5.b))
    self.assertTrue("table d" in self.firesException(
        lambda: tdf6.sql.create_tic_dat(filePath)))
    ticDat.a["theboger"] = (1, None, 12)
    tdf.sql.write_db_data(ticDat, makeCleanPath(filePath))
    ticDatNone = tdf.sql.create_tic_dat(filePath, freeze_it=True)
    self.assertTrue(tdf._same_data(ticDat, ticDatNone))
    self.assertTrue(ticDatNone.a["theboger"]["aData2"] == None)
def _xlsx_write(self, tic_dat, file_path, tbl_name_mapping):
    verify(xlsx, "Can't write .xlsx files because xlsxwriter package isn't installed.")
    tdf = self.tic_dat_factory
    if os.path.exists(file_path):
        os.remove(file_path)
    book = xlsx.Workbook(file_path)
    def clean_inf(x):
        if x == float("inf"):
            return "inf"
        if x == -float("inf"):
            return "-inf"
        return x
    for t in sorted(sorted(tdf.all_tables),
                    key=lambda x: len(tdf.primary_key_fields.get(x, ()))):
        sheet = book.add_worksheet(tbl_name_mapping[t])
        for i, f in enumerate(tdf.primary_key_fields.get(t, ()) +
                              tdf.data_fields.get(t, ())):
            sheet.write(0, i, f)
        _t = getattr(tic_dat, t)
        if utils.dictish(_t):
            for row_ind, (p_key, data) in enumerate(_t.items()):
                for field_ind, cell in enumerate(
                        (p_key if containerish(p_key) else (p_key,)) +
                        tuple(data[_f] for _f in tdf.data_fields.get(t, ()))):
                    sheet.write(row_ind + 1, field_ind, clean_inf(cell))
        else:
            for row_ind, data in enumerate(_t if containerish(_t) else _t()):
                for field_ind, cell in enumerate(
                        tuple(data[_f] for _f in tdf.data_fields[t])):
                    sheet.write(row_ind + 1, field_ind, clean_inf(cell))
    book.close()
def create_lingo_text(tdf, tic_dat, infinity=INFINITY):
    """
    Generate a Lingo .ldt string from a TicDat object

    :param tdf: A TicDatFactory defining the schema

    :param tic_dat: A TicDat object consistent with tdf

    :param infinity: A number used to represent infinity in lingo

    :return: A string consistent with the Lingo .ldt format
    """
    msg = []
    verify(tdf.good_tic_dat_object(tic_dat, msg.append),
           "tic_dat not a good object for this factory : %s" % "\n".join(msg))
    verify(not tdf.generator_tables, "doesn't work with generator tables.")
    verify(not tdf.generic_tables,
           "doesn't work with generic tables. (not yet - will add ASAP as needed)")
    dict_with_lists = defaultdict(list)
    dict_tables = {t for t, pk in tdf.primary_key_fields.items() if pk}
    prepend = getattr(tdf, "lingo_prepend", "")
    for t in dict_tables:
        for k, r in getattr(tic_dat, t).items():
            row = list(k) if containerish(k) else [k]
            for f in tdf.data_fields.get(t, []):
                row.append(r[f])
            dict_with_lists[t].append(row)
    for t in set(tdf.all_tables).difference(dict_tables):
        for r in getattr(tic_dat, t):
            row = [r[f] for f in tdf.data_fields[t]]
            dict_with_lists[t].append(row)
    rtn = "data:\n"
    for t in _sorted_tables(tdf):
        rtn += "%s" % (prepend + t)
        for field in tdf.data_fields[t]:
            rtn += ',' + prepend + t + "_" + field.replace(" ", "_").lower()
        rtn += "=\n"
        for row in dict_with_lists[t]:
            rtn += "\t"
            for field in row:
                if stringish(field):
                    rtn += field + " "
                else:
                    # parenthesized so the trailing space applies to both branches
                    rtn += (str(infinity) if float('inf') == field
                            else str(field)) + " "
            rtn += "\n"
        rtn += ";\n"
    rtn += "enddata"
    return rtn
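# Hedged usage sketch (illustrative, not from the source): rendering a one-table
# TicDat as a Lingo .ldt string. Schema and data are invented for demonstration;
# assumes the TicDatFactory's lingo_prepend defaults to the empty string.
from ticdat import TicDatFactory

_tdf_demo = TicDatFactory(foods=[["Name"], ["Cost"]])
_dat_demo = _tdf_demo.TicDat(foods={"pizza": [2.5]})
_ldt = create_lingo_text(_tdf_demo, _dat_demo)
print(_ldt)  # "data:" ... "foods,foods_cost=" ... "enddata"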
def create_opl_text(tdf, tic_dat, infinity=INFINITY):
    """
    Generate an OPL .dat string from a TicDat object

    :param tdf: A TicDatFactory defining the schema

    :param tic_dat: A TicDat object consistent with tdf

    :param infinity: A number used to represent infinity in OPL

    :return: A string consistent with the OPL .dat format
    """
    msg = []
    verify(tdf.good_tic_dat_object(tic_dat, msg.append),
           "tic_dat not a good object for this factory : %s" % "\n".join(msg))
    verify(not tdf.generator_tables, "doesn't work with generator tables.")
    verify(not tdf.generic_tables,
           "doesn't work with generic tables. (not yet - will add ASAP as needed)")
    dict_with_lists = defaultdict(list)
    dict_tables = {t for t, pk in tdf.primary_key_fields.items() if pk}
    for t in dict_tables:
        for k, r in getattr(tic_dat, t).items():
            row = list(k) if containerish(k) else [k]
            for f in tdf.data_fields.get(t, []):
                row.append(r[f])
            dict_with_lists[t].append(row)
    for t in set(tdf.all_tables).difference(dict_tables):
        for r in getattr(tic_dat, t):
            row = [r[f] for f in tdf.data_fields[t]]
            dict_with_lists[t].append(row)
    rtn = ""
    for i, (t, l) in enumerate(dict_with_lists.items()):
        rtn += "\n" if i > 0 else ""
        rtn += "%s = {" % (tdf.opl_prepend + t)
        if len(l[0]) > 1:
            rtn += "\n"
        for x in range(len(l)):
            r = l[x]
            if len(r) > 1:
                rtn += "<"
            # j rather than i, to avoid shadowing the outer table index
            for j, v in enumerate(r):
                rtn += ('"%s"' % v if stringish(v) else
                        (str(infinity) if float('inf') == v else str(v))) + \
                       (", " if j < len(r) - 1 else "")
            if len(r) == 1 and len(l) - 1 != x:
                rtn += ', '
            if len(r) > 1:
                rtn += ">\n"
        rtn += "};\n"
    return rtn
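# Hedged usage sketch (illustrative, not from the source): rendering the same
# one-table TicDat as an OPL .dat string. Assumes the TicDatFactory's opl_prepend
# defaults to the empty string; schema and data are invented for demonstration.
from ticdat import TicDatFactory

_tdf_demo = TicDatFactory(foods=[["Name"], ["Cost"]])
_dat_demo = _tdf_demo.TicDat(foods={"pizza": [2.5]})
print(create_opl_text(_tdf_demo, _dat_demo))  # foods = { <"pizza", 2.5> };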
def testSilly(self):
    tdf = TicDatFactory(**sillyMeSchema())
    ticDat = tdf.TicDat(**sillyMeData())
    schema2 = sillyMeSchema()
    schema2["b"][0] = ("bField2", "bField1", "bField3")
    schema3 = sillyMeSchema()
    schema3["a"][1] = ("aData2", "aData3", "aData1")
    schema4 = sillyMeSchema()
    schema4["a"][1] = ("aData1", "aData3")
    schema5 = sillyMeSchema()
    _tuple = lambda x: tuple(x) if utils.containerish(x) else (x,)
    for t in ("a", "b"):
        schema5[t][1] = _tuple(schema5[t][1]) + _tuple(schema5[t][0])
    schema5["a"][0], schema5["b"][0] = (), []
    schema6 = sillyMeSchema()
    schema6["d"] = [["dField"], ()]
    tdf2, tdf3, tdf4, tdf5, tdf6 = (TicDatFactory(**x) for x in
                                    (schema2, schema3, schema4, schema5, schema6))
    tdf5.set_generator_tables(("a", "c"))
    filePath = os.path.join(_scratchDir, "silly.db")
    tdf.sql.write_db_data(ticDat, filePath)
    ticDat2 = tdf2.sql.create_tic_dat(filePath)
    self.assertFalse(tdf._same_data(ticDat, ticDat2))
    ticDat3 = tdf3.sql.create_tic_dat(filePath)
    self.assertTrue(tdf._same_data(ticDat, ticDat3))
    ticDat4 = tdf4.sql.create_tic_dat(filePath)
    for t in ["a", "b"]:
        for k, v in getattr(ticDat4, t).items():
            for _k, _v in v.items():
                self.assertTrue(getattr(ticDat, t)[k][_k] == _v)
            if set(v) == set(getattr(ticDat, t)[k]):
                self.assertTrue(t == "b")
            else:
                self.assertTrue(t == "a")
    ticDat5 = tdf5.sql.create_tic_dat(filePath)
    self.assertTrue(tdf5._same_data(tdf._keyless(ticDat), ticDat5))
    self.assertTrue(callable(ticDat5.a) and callable(ticDat5.c) and
                    not callable(ticDat5.b))
    self.assertTrue("table d" in self.firesException(
        lambda: tdf6.sql.create_tic_dat(filePath)))
    ticDat.a["theboger"] = (1, None, 12)
    tdf.sql.write_db_data(ticDat, makeCleanPath(filePath))
    ticDatNone = tdf.sql.create_tic_dat(filePath, freeze_it=True)
    self.assertTrue(tdf._same_data(ticDat, ticDatNone))
    self.assertTrue(ticDatNone.a["theboger"]["aData2"] == None)
def make_json_dict(tdf, tic_dat, verbose=False):
    assert tdf.good_tic_dat_object(tic_dat)
    jdict = defaultdict(list)
    for t in tdf.all_tables:
        all_fields = tdf.primary_key_fields.get(t, ()) + tdf.data_fields.get(t, ())
        def make_row(row):
            assert containerish(row) and len(row) == len(all_fields)
            return {f: v for f, v in zip(all_fields, row)} if verbose else row
        appender = lambda row: jdict[t].append(make_row(row))
        tbl = getattr(tic_dat, t)
        if tdf.primary_key_fields.get(t):
            for pk, data_row in tbl.items():
                appender((list(pk) if containerish(pk) else [pk]) +
                         [data_row[df] for df in tdf.data_fields[t]])
        else:
            for data_row in tbl:
                appender([data_row[df] for df in tdf.data_fields[t]])
    return jdict
def _try_create_space_case_mapping(tdf, ticdat):
    '''
    :param tdf: a TicDatFactory

    :param ticdat: a ticdat for the tdf

    :return: {"mapping": mapping} if a good mapping can be made,
             else {"failures": failures}
    '''
    assert tdf.good_tic_dat_object(ticdat), "ticdat not a good object for the tdf"
    rtn = defaultdict(set)
    for t in tdf.all_tables:
        if tdf.primary_key_fields.get(t):
            for ks in getattr(ticdat, t):
                for k in (ks if containerish(ks) else [ks]):
                    if stringish(k):
                        newk = ''.join(c.upper() if c.isalnum() else '_' for c in k)
                        rtn[newk].add(k)
    failures = {k: tuple(sorted(v)) for k, v in rtn.items() if len(v) > 1}
    if failures:
        return {"failures": failures}
    return {"mapping": {k: next(iter(v)) for k, v in rtn.items()}}
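# Hedged sketch (illustrative, not from the source): primary key strings are
# upper-cased with non-alphanumerics mapped to "_", so "Cool Dog" normalizes to
# "COOL_DOG". If two distinct keys collide under that normalization, the routine
# reports them under "failures" instead of returning a mapping.
from ticdat import TicDatFactory

_tdf_demo = TicDatFactory(foods=[["Name"], ["Cost"]])
_dat_demo = _tdf_demo.TicDat(foods={"Cool Dog": [2.5]})
assert _try_create_space_case_mapping(_tdf_demo, _dat_demo) == \
       {"mapping": {"COOL_DOG": "Cool Dog"}}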
def make_row(row):
    # closes over all_fields, write_cell, t and verbose from make_json_dict
    assert containerish(row) and len(row) == len(all_fields)
    row = [write_cell(t, f, x) for f, x in zip(all_fields, row)]
    return {f: v for f, v in zip(all_fields, row)} if verbose else row
def make_row(row):
    # closes over all_fields and verbose from make_json_dict
    assert containerish(row) and len(row) == len(all_fields)
    return {f: v for f, v in zip(all_fields, row)} if verbose else row
def __init__(self, **init_fields):
    """
    create a PanDatFactory

    :param init_fields: a mapping of tables to primary key fields and data fields.
        Each field listing consists of two sub lists ... first the primary key
        fields, then the data fields.

        ex: PanDatFactory(categories=[["name"], ["Min Nutrition", "Max Nutrition"]],
                          foods=[["Name"], ["Cost"]],
                          nutritionQuantities=[["Food", "Category"], ["Qty"]])

        Use '*' instead of a pair of lists for generic tables.

        ex: PanDatFactory(typical_table=[["Primary Key Field"], ["Data Field"]],
                          generic_table='*')

    :return: a PanDatFactory
    """
    verify(DataFrame and pd, "Need to install pandas in order to create a PanDatFactory")
    self._has_been_used = False
    verify(not any(x.startswith("_") for x in init_fields),
           "table names shouldn't start with underscore")
    verify(not any(" " in x for x in init_fields),
           "table names shouldn't have white space")
    verify(len(init_fields) == len({_.lower() for _ in init_fields}),
           "there are case insensitive duplicate table names")
    for k, v in init_fields.items():
        verify(v == '*' or (containerish(v) and len(v) == 2 and
                            all(containerish(_) for _ in v)),
               ("Table %s needs to indicate it is a generic table by using '*'\n" +
                "or specify two sublists, one for primary key fields and one for " +
                "data fields") % k)
        if v != '*':
            verify(all(utils.stringish(s) for _ in v for s in _),
                   "The field names for %s need to be strings" % k)
            verify(v[0] or v[1], "No field names specified for table %s" % k)
            verify(len(set(v[0]).union(v[1])) == len(v[0]) + len(v[1]),
                   "There are duplicate field names for table %s" % k)
            verify(len({_.lower() for _ in list(v[0]) + list(v[1])}) ==
                   len(v[0]) + len(v[1]),
                   "There are case insensitive duplicate field names for %s" % k)
    self.generic_tables = frozenset(k for k, v in init_fields.items() if v == '*')
    self._primary_key_fields = FrozenDict({k: tuple(v[0]) for k, v in
                                           init_fields.items() if v != '*'})
    self._data_fields = FrozenDict({k: tuple(v[1]) for k, v in
                                    init_fields.items() if v != '*'})
    self._default_values = clt.defaultdict(dict)
    for tbl, flds in self._data_fields.items():
        for fld in flds:
            self._default_values[tbl][fld] = 0
    self._data_types = clt.defaultdict(dict)
    self._data_row_predicates = clt.defaultdict(dict)
    self._foreign_keys = clt.defaultdict(set)
    self.all_tables = frozenset(init_fields)
    superself = self
    class PanDat(object):
        def __repr__(self):
            tlen = lambda t: len(getattr(self, t)) \
                if isinstance(getattr(self, t), DataFrame) else None
            return "pd: {" + ", ".join("%s: %s" % (t, tlen(t))
                                       for t in superself.all_tables) + "}"
        def __init__(self, **init_tables):
            superself._trigger_has_been_used()
            for t in init_tables:
                verify(t in superself.all_tables, "Unexpected table name %s" % t)
                tbl = safe_apply(DataFrame)(init_tables[t])
                if tbl is None and dictish(init_tables[t]) and \
                   all(map(stringish, init_tables[t])):
                    tbl = safe_apply(DataFrame)(**init_tables[t])
                verify(isinstance(tbl, DataFrame),
                       "Failed to provide a valid DataFrame or DataFrame " +
                       "construction argument for %s" % t)
                setattr(self, t, tbl.copy())
                df = getattr(self, t)
                if list(df.columns) == list(range(len(df.columns))) and \
                   len(df.columns) >= len(superself._all_fields(t)):
                    df.rename(columns={f1: f2 for f1, f2 in
                                       zip(df.columns, superself._all_fields(t))},
                              inplace=True)
            for t in set(superself.all_tables).difference(init_tables):
                setattr(self, t,
                        DataFrame({f: [] for f in utils.all_fields(superself, t)}))
            missing_fields = {(t, f) for t in superself.all_tables
                              for f in superself._all_fields(t)
                              if f not in getattr(self, t).columns}
            verify(not missing_fields,
                   "The following are (table, field) pairs missing from the data.\n%s"
                   % missing_fields)
            for t in superself.all_tables:
                af = list(superself._all_fields(t))
                df = getattr(self, t)
                if list(df.columns)[:len(af)] != af:
                    extra_cols = [_ for _ in list(df.columns) if _ not in af]
                    setattr(self, t, df[af + extra_cols])
                    assert list(getattr(self, t)) == af + extra_cols
    self.PanDat = PanDat
    self.xls = pandatio.XlsPanFactory(self)
    self.sql = pandatio.SqlPanFactory(self)
    self.csv = pandatio.CsvPanFactory(self)
    self.json = pandatio.JsonPanFactory(self)
    self.opalytics = pandatio.OpalyticsPanFactory(self)
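# Hedged usage sketch (illustrative, not from the source): constructing a
# PanDatFactory and then a PanDat from DataFrame construction arguments.
# Table and field names are invented for demonstration.
from ticdat import PanDatFactory

_pdf_demo = PanDatFactory(foods=[["Name"], ["Cost"]])
_dat_demo = _pdf_demo.PanDat(foods={"Name": ["pizza", "milk"], "Cost": [2.5, 1.1]})
print(_dat_demo)  # pd: {foods: 2}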
def write_file(self, tic_dat, mdb_file_path, allow_overwrite=False):
    """
    write the ticDat data to an Access database file

    :param tic_dat: the data object to write

    :param mdb_file_path: the file path of the Access database to populate

    :param allow_overwrite: boolean - are we allowed to overwrite pre-existing data

    :return:

    caveats : Numbers with absolute values larger than 1e+100 will be written as
              1e+100 or -1e+100

    NB - thrown Exceptions of the form "Data type mismatch in criteria expression"
         generally result either from Access's inability to store different data
         types in the same field, or from a mismatch between the data object and
         the default field types ticdat uses when creating an Access schema.
         For the latter, feel free to call the write_schema function on the data
         file first with explicitly identified field types.
    """
    _standard_verify(self.tic_dat_factory.generic_tables)
    msg = []
    if not self.tic_dat_factory.good_tic_dat_object(tic_dat,
                                                    lambda m: msg.append(m)):
        raise TicDatError("Not a valid TicDat object for this schema : " +
                          " : ".join(msg))
    verify(not os.path.isdir(mdb_file_path),
           "A directory is not a valid Access file path")
    if not os.path.exists(mdb_file_path):
        self.write_schema(mdb_file_path)
    table_names = self._check_tables_fields(mdb_file_path,
                                            self.tic_dat_factory.all_tables)
    with _connect(_connection_str(mdb_file_path)) as con:
        for t in self.tic_dat_factory.all_tables:
            verify(table_names[t] == t,
                   "Failed to find table %s in path %s" % (t, mdb_file_path))
            if not allow_overwrite:
                with con.cursor() as cur:
                    cur.execute("Select * from %s" % t)
                    verify(not any(True for _ in cur.fetchall()),
                           "allow_overwrite is False, but there are already data "
                           "records in %s" % t)
            if allow_overwrite:
                con.cursor().execute("Delete from %s" % t).commit()
            _t = getattr(tic_dat, t)
            if dictish(_t):
                primary_keys = tuple(self.tic_dat_factory.primary_key_fields[t])
                for pk_row, sql_data_row in _t.items():
                    _items = tuple(sql_data_row.items())
                    fields = _brackets(primary_keys + tuple(x[0] for x in _items))
                    data_row = ((pk_row,) if len(primary_keys) == 1 else pk_row) + \
                               tuple(_write_data(x[1]) for x in _items)
                    assert len(data_row) == len(fields)
                    # stmt rather than str, to avoid shadowing the builtin
                    stmt = "INSERT INTO %s (%s) VALUES (%s)" % \
                           (t, ",".join(fields), ",".join("?" for _ in fields))
                    con.cursor().execute(stmt, data_row).commit()
            else:
                for sql_data_row in (_t if containerish(_t) else _t()):
                    stmt = "INSERT INTO %s (%s) VALUES (%s)" % (
                        t, ",".join(_brackets(sql_data_row.keys())),
                        ",".join(["?"] * len(sql_data_row)))
                    con.cursor().execute(
                        stmt, tuple(map(_write_data, sql_data_row.values())))
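# Hedged usage sketch (illustrative, not from the source): writing a TicDat to an
# Access file through the mdb factory, creating the schema first with an explicit
# field type, as the NB above suggests. Requires an Access ODBC driver; the file
# name, schema and data are invented for demonstration.
from ticdat import TicDatFactory

_tdf_demo = TicDatFactory(foods=[["Name"], ["Cost"]])
_dat_demo = _tdf_demo.TicDat(foods={"pizza": [2.5]})
_tdf_demo.mdb.write_schema("diet.mdb", foods={"Name": "text"})
_tdf_demo.mdb.write_file(_dat_demo, "diet.mdb")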
def testSilly(self):
    tdf = TicDatFactory(**sillyMeSchema())
    ticDat = tdf.TicDat(**sillyMeData())
    filePath = os.path.join(_scratchDir, "silly.mdb")
    self.assertTrue(firesException(
        lambda: tdf.mdb.write_file(ticDat, makeCleanPath(filePath))))
    def sillyMeCleanData():
        return {
            "a": {"1": (1, 2, "3"), "b": (12, 12.2, "twelve"),
                  "c": (11, 12, "thirt")},
            "b": {(1, 2, "3"): 1, (3, 4, "b"): 12},
            "c": ((1, "2", 3, 4), (0.2, "b", 0.3, 0.4), (1.2, "b", 12, 24))
        }
    ticDat = tdf.TicDat(**sillyMeCleanData())
    self.assertTrue(firesException(
        lambda: tdf.mdb.write_file(ticDat, makeCleanPath(filePath))))
    def makeCleanSchema():
        tdf.mdb.write_schema(makeCleanPath(filePath), a={"aData3": "text"},
                             b={"bField1": "int", "bField2": "int"},
                             c={"cData2": "text"})
        return filePath
    tdf.mdb.write_file(ticDat, makeCleanSchema())
    mdbTicDat = tdf.mdb.create_tic_dat(filePath)
    self.assertTrue(tdf._same_data(ticDat, mdbTicDat))
    schema2 = sillyMeSchema()
    schema2["b"][0] = ("bField2", "bField1", "bField3")
    schema3 = sillyMeSchema()
    schema3["a"][1] = ("aData2", "aData3", "aData1")
    schema4 = sillyMeSchema()
    schema4["a"][1] = ("aData1", "aData3")
    schema5 = sillyMeSchema()
    _tuple = lambda x: tuple(x) if utils.containerish(x) else (x,)
    for t in ("a", "b"):
        schema5[t][1] = _tuple(schema5[t][1]) + _tuple(schema5[t][0])
    schema5["a"][0], schema5["b"][0] = (), []
    schema6 = sillyMeSchema()
    schema6["d"] = [["dField"], ()]
    tdf2, tdf3, tdf4, tdf5, tdf6 = (TicDatFactory(**x) for x in
                                    (schema2, schema3, schema4, schema5, schema6))
    tdf5.set_generator_tables(("a", "c"))
    ticDat2 = tdf2.mdb.create_tic_dat(filePath)
    self.assertFalse(tdf._same_data(ticDat, ticDat2))
    ticDat3 = tdf3.mdb.create_tic_dat(filePath)
    self.assertTrue(tdf._same_data(ticDat, ticDat3))
    ticDat4 = tdf4.mdb.create_tic_dat(filePath)
    for t in ["a", "b"]:
        for k, v in getattr(ticDat4, t).items():
            for _k, _v in v.items():
                self.assertTrue(getattr(ticDat, t)[k][_k] == _v)
            if set(v) == set(getattr(ticDat, t)[k]):
                self.assertTrue(t == "b")
            else:
                self.assertTrue(t == "a")
    ticDat5 = tdf5.mdb.create_tic_dat(filePath)
    self.assertTrue(tdf5._same_data(tdf._keyless(ticDat), ticDat5))
    self.assertTrue(callable(ticDat5.a) and callable(ticDat5.c) and
                    not callable(ticDat5.b))
    self.assertTrue("table d" in self.firesException(
        lambda: tdf6.mdb.create_tic_dat(filePath)))
    ticDat.a["theboger"] = (1, None, "twelve")
    tdf.mdb.write_file(ticDat, makeCleanSchema())
    ticDatNone = tdf.mdb.create_tic_dat(filePath, freeze_it=True)
    self.assertTrue(tdf._same_data(ticDat, ticDatNone))
    self.assertTrue(ticDatNone.a["theboger"]["aData2"] == None)
def testSilly(self):
    tdf = TicDatFactory(**sillyMeSchema())
    ticDat = tdf.TicDat(**sillyMeData())
    schema2 = sillyMeSchema()
    schema2["b"][0] = ("bField2", "bField1", "bField3")
    schema3 = sillyMeSchema()
    schema3["a"][1] = ("aData2", "aData3", "aData1")
    schema4 = sillyMeSchema()
    schema4["a"][1] = ("aData1", "aData3")
    schema5 = sillyMeSchema()
    _tuple = lambda x: tuple(x) if utils.containerish(x) else (x,)
    for t in ("a", "b"):
        schema5[t][1] = _tuple(schema5[t][1]) + _tuple(schema5[t][0])
    schema5["a"][0], schema5["b"][0] = (), []
    schema6 = sillyMeSchema()
    schema6["d"] = [["dField"], ()]
    tdf2, tdf3, tdf4, tdf5, tdf6 = (TicDatFactory(**x) for x in
                                    (schema2, schema3, schema4, schema5, schema6))
    tdf5.set_generator_tables(("a", "c"))
    filePath = os.path.join(_scratchDir, "silly.xls")
    tdf.xls.write_file(ticDat, filePath)
    ticDat2 = tdf2.xls.create_tic_dat(filePath)
    self.assertFalse(tdf._same_data(ticDat, ticDat2))
    ticDat3 = tdf3.xls.create_tic_dat(filePath)
    self.assertTrue(tdf._same_data(ticDat, ticDat3))
    ticDat4 = tdf4.xls.create_tic_dat(filePath)
    for t in ["a", "b"]:
        for k, v in getattr(ticDat4, t).items():
            for _k, _v in v.items():
                self.assertTrue(getattr(ticDat, t)[k][_k] == _v)
            if set(v) == set(getattr(ticDat, t)[k]):
                self.assertTrue(t == "b")
            else:
                self.assertTrue(t == "a")
    ticDat5 = tdf5.xls.create_tic_dat(filePath)
    self.assertTrue(tdf5._same_data(tdf._keyless(ticDat), ticDat5))
    self.assertTrue(callable(ticDat5.a) and callable(ticDat5.c) and
                    not callable(ticDat5.b))
    ticDat6 = tdf6.xls.create_tic_dat(filePath)
    self.assertTrue(tdf._same_data(ticDat, ticDat6))
    self.assertTrue(firesException(lambda: tdf6._same_data(ticDat, ticDat6)))
    self.assertTrue(hasattr(ticDat6, "d") and utils.dictish(ticDat6.d))
    def writeData(data, write_header=True):
        import xlwt
        book = xlwt.Workbook()
        for t in tdf.all_tables:
            sheet = book.add_sheet(t)
            if write_header:
                for i, f in enumerate(tdf.primary_key_fields.get(t, ()) +
                                      tdf.data_fields.get(t, ())):
                    sheet.write(0, i, f)
            for rowInd, row in enumerate(data):
                for fieldInd, cellValue in enumerate(row):
                    sheet.write(rowInd + (1 if write_header else 0), fieldInd,
                                cellValue)
        if os.path.exists(filePath):
            os.remove(filePath)
        book.save(filePath)
    writeData([(1, 2, 3, 4), (1, 20, 30, 40), (10, 20, 30, 40)])
    ticDatMan = tdf.xls.create_tic_dat(filePath, freeze_it=True)
    self.assertTrue(len(ticDatMan.a) == 2 and len(ticDatMan.b) == 3)
    self.assertTrue(ticDatMan.b[1, 20, 30]["bData"] == 40)
    rowCount = tdf.xls.get_duplicates(filePath)
    self.assertTrue(set(rowCount) == {'a'} and set(rowCount["a"]) == {1} and
                    rowCount["a"][1] == 2)
    writeData([(1, 2, 3, 4), (1, 20, 30, 40), (10, 20, 30, 40)],
              write_header=False)
    self.assertTrue(self.firesException(
        lambda: tdf.xls.create_tic_dat(filePath, freeze_it=True)))
    ticDatMan = tdf.xls.create_tic_dat(filePath, freeze_it=True,
                                       headers_present=False)
    self.assertTrue(len(ticDatMan.a) == 2 and len(ticDatMan.b) == 3)
    self.assertTrue(ticDatMan.b[1, 20, 30]["bData"] == 40)
    rowCount = tdf.xls.get_duplicates(filePath, headers_present=False)
    self.assertTrue(set(rowCount) == {'a'} and set(rowCount["a"]) == {1} and
                    rowCount["a"][1] == 2)
    ticDat.a["theboger"] = (1, None, 12)
    tdf.xls.write_file(ticDat, filePath, allow_overwrite=True)
    ticDatNone = tdf.xls.create_tic_dat(filePath, freeze_it=True)
    # THIS IS A FLAW - but a minor one. None's are hard to represent.
    # It is turning into the empty string here.
    # not sure how to handle this, but documenting for now.
    self.assertFalse(tdf._same_data(ticDat, ticDatNone))
    self.assertTrue(ticDatNone.a["theboger"]["aData2"] == "")
    writeData([(1, 2, 3, 4), (1, 20, 30, 40), (10, 20, 30, 40), (1, 20, 30, 12)])
    rowCount = tdf.xls.get_duplicates(filePath)
    self.assertTrue(set(rowCount) == {'a', 'b'} and set(rowCount["a"]) == {1} and
                    rowCount["a"][1] == 3)
    self.assertTrue(set(rowCount["b"]) == {(1, 20, 30)} and
                    rowCount["b"][1, 20, 30] == 2)
def apply_mapping(k):
    if containerish(k):
        return tuple(map(apply_mapping, k))
    return mapping.get(k, k)
def testSilly(self):
    if not _can_accdb_unit_test:
        return
    tdf = TicDatFactory(**sillyMeSchema())
    ticDat = tdf.TicDat(**sillyMeData())
    filePath = os.path.join(_scratchDir, "silly.accdb")
    self.assertTrue(firesException(
        lambda: tdf.mdb.write_file(ticDat, makeCleanPath(filePath))))
    def sillyMeCleanData():
        return {
            "a": {"1": (1, 2, "3"), "b": (12, 12.2, "twelve"),
                  "c": (11, 12, "thirt")},
            "b": {(1, 2, "3"): 1, (3, 4, "b"): 12},
            "c": ((1, "2", 3, 4), (0.2, "b", 0.3, 0.4), (1.2, "b", 12, 24))
        }
    ticDat = tdf.TicDat(**sillyMeCleanData())
    self.assertTrue(firesException(
        lambda: tdf.mdb.write_file(ticDat, makeCleanPath(filePath))))
    def makeCleanSchema():
        tdf.mdb.write_schema(makeCleanPath(filePath), a={"aData3": "text"},
                             b={"bField1": "int", "bField2": "int"},
                             c={"cData2": "text"})
        return filePath
    tdf.mdb.write_file(ticDat, makeCleanSchema())
    self.assertFalse(tdf.mdb.find_duplicates(filePath))
    accdbTicDat = tdf.mdb.create_tic_dat(filePath)
    self.assertTrue(tdf._same_data(ticDat, accdbTicDat))
    schema2 = sillyMeSchema()
    schema2["b"][0] = ("bField2", "bField1", "bField3")
    schema3 = sillyMeSchema()
    schema3["a"][1] = ("aData2", "aData3", "aData1")
    schema4 = sillyMeSchema()
    schema4["a"][1] = ("aData1", "aData3")
    schema5 = sillyMeSchema()
    _tuple = lambda x: tuple(x) if utils.containerish(x) else (x,)
    for t in ("a", "b"):
        schema5[t][1] = _tuple(schema5[t][1]) + _tuple(schema5[t][0])
    schema5["a"][0], schema5["b"][0] = (), []
    schema6 = sillyMeSchema()
    schema6["d"] = [["dField"], ()]
    tdf2, tdf3, tdf4, tdf5, tdf6 = (TicDatFactory(**x) for x in
                                    (schema2, schema3, schema4, schema5, schema6))
    tdf5.set_generator_tables(("a", "c"))
    ticDat2 = tdf2.mdb.create_tic_dat(filePath)
    self.assertFalse(tdf._same_data(ticDat, ticDat2))
    ticDat3 = tdf3.mdb.create_tic_dat(filePath)
    self.assertTrue(tdf._same_data(ticDat, ticDat3))
    ticDat4 = tdf4.mdb.create_tic_dat(filePath)
    for t in ["a", "b"]:
        for k, v in getattr(ticDat4, t).items():
            for _k, _v in v.items():
                self.assertTrue(getattr(ticDat, t)[k][_k] == _v)
            if set(v) == set(getattr(ticDat, t)[k]):
                self.assertTrue(t == "b")
            else:
                self.assertTrue(t == "a")
    ticDat5 = tdf5.mdb.create_tic_dat(filePath)
    self.assertTrue(tdf5._same_data(tdf._keyless(ticDat), ticDat5))
    self.assertTrue(callable(ticDat5.a) and callable(ticDat5.c) and
                    not callable(ticDat5.b))
    self.assertTrue("table d" in self.firesException(
        lambda: tdf6.mdb.create_tic_dat(filePath)))
    ticDat.a["theboger"] = (1, None, "twelve")
    tdf.mdb.write_file(ticDat, makeCleanSchema())
    ticDatNone = tdf.mdb.create_tic_dat(filePath, freeze_it=True)
    self.assertTrue(tdf._same_data(ticDat, ticDatNone))
    self.assertTrue(ticDatNone.a["theboger"]["aData2"] == None)
def doTest(headersPresent):
    tdf = TicDatFactory(**sillyMeSchema())
    ticDat = tdf.TicDat(**sillyMeData())
    schema2 = sillyMeSchema()
    schema2["b"][0] = ("bField2", "bField1", "bField3")
    schema3 = sillyMeSchema()
    schema3["a"][1] = ("aData2", "aData3", "aData1")
    schema4 = sillyMeSchema()
    schema4["a"][1] = ("aData1", "aData3")
    schema5 = sillyMeSchema()
    _tuple = lambda x: tuple(x) if utils.containerish(x) else (x,)
    for t in ("a", "b"):
        schema5[t][1] = _tuple(schema5[t][1]) + _tuple(schema5[t][0])
    schema5["a"][0], schema5["b"][0] = (), []
    schema5b = sillyMeSchema()
    for t in ("a", "b"):
        schema5b[t][1] = _tuple(schema5b[t][0]) + _tuple(schema5b[t][1])
    schema5b["a"][0], schema5b["b"][0] = (), []
    schema6 = sillyMeSchema()
    schema6["d"] = [("dField",), []]
    tdf2, tdf3, tdf4, tdf5, tdf5b, tdf6 = \
        (TicDatFactory(**x) for x in
         (schema2, schema3, schema4, schema5, schema5b, schema6))
    tdf5.set_generator_tables(["a", "c"])
    tdf5b.set_generator_tables(("a", "c"))
    dirPath = makeCleanDir(os.path.join(_scratchDir, "silly"))
    tdf.csv.write_directory(ticDat, dirPath, write_header=headersPresent)
    ticDat2 = tdf2.csv.create_tic_dat(dirPath, headers_present=headersPresent)
    (self.assertFalse if headersPresent else self.assertTrue)(
        tdf._same_data(ticDat, ticDat2))
    ticDat3 = tdf3.csv.create_tic_dat(dirPath, headers_present=headersPresent)
    (self.assertTrue if headersPresent else self.assertFalse)(
        tdf._same_data(ticDat, ticDat3))
    if headersPresent:
        ticDat4 = tdf4.csv.create_tic_dat(dirPath, headers_present=headersPresent)
        for t in ("a", "b"):
            for k, v in getattr(ticDat4, t).items():
                for _k, _v in v.items():
                    self.assertTrue(getattr(ticDat, t)[k][_k] == _v)
                if set(v) == set(getattr(ticDat, t)[k]):
                    self.assertTrue(t == "b")
                else:
                    self.assertTrue(t == "a")
    else:
        self.assertTrue(self.firesException(
            lambda: tdf4.csv.create_tic_dat(dirPath,
                                            headers_present=headersPresent)))
    ticDat5 = tdf5.csv.create_tic_dat(dirPath, headers_present=headersPresent)
    (self.assertTrue if headersPresent else self.assertFalse)(
        tdf5._same_data(tdf._keyless(ticDat), ticDat5))
    self.assertTrue(callable(ticDat5.a) and callable(ticDat5.c) and
                    not callable(ticDat5.b))
    ticDat5b = tdf5b.csv.create_tic_dat(dirPath, headers_present=headersPresent)
    self.assertTrue(tdf5b._same_data(tdf._keyless(ticDat), ticDat5b))
    self.assertTrue(callable(ticDat5b.a) and callable(ticDat5b.c) and
                    not callable(ticDat5b.b))
    ticDat6 = tdf6.csv.create_tic_dat(dirPath, headers_present=headersPresent)
    self.assertTrue(tdf._same_data(ticDat, ticDat6))
    self.assertTrue(firesException(lambda: tdf6._same_data(ticDat, ticDat6)))
    self.assertTrue(hasattr(ticDat6, "d") and utils.dictish(ticDat6.d))
    allDataTdf = TicDatFactory(**{t: [[], tdf.primary_key_fields.get(t, ()) +
                                      tdf.data_fields.get(t, ())]
                                  for t in tdf.all_tables})
    def writeData(data):
        td = allDataTdf.TicDat(a=data, b=data, c=data)
        allDataTdf.csv.write_directory(td, dirPath, allow_overwrite=True,
                                       write_header=headersPresent)
    writeData([(1, 2, 3, 4), (1, 20, 30, 40), (10, 20, 30, 40)])
    ticDatMan = tdf.csv.create_tic_dat(dirPath, headers_present=headersPresent,
                                       freeze_it=True)
    self.assertTrue(len(ticDatMan.a) == 2 and len(ticDatMan.b) == 3)
    self.assertTrue(ticDatMan.b[(1, 20, 30)]["bData"] == 40)
    rowCount = tdf.csv.get_duplicates(dirPath, headers_present=headersPresent)
    self.assertTrue(set(rowCount) == {'a'} and set(rowCount["a"]) == {1} and
                    rowCount["a"][1] == 2)
    writeData([(1, 2, 3, 4), (1, 20, 30, 40), (10, 20, 30, 40), (1, 20, 30, 12)])
    rowCount = tdf.csv.get_duplicates(dirPath, headers_present=headersPresent)
    self.assertTrue(set(rowCount) == {'a', 'b'} and set(rowCount["a"]) == {1} and
                    rowCount["a"][1] == 3)
    self.assertTrue(set(rowCount["b"]) == {(1, 20, 30)} and
                    rowCount["b"][1, 20, 30] == 2)
def testSilly(self):
    if not self.can_run:
        return
    tdf = TicDatFactory(**sillyMeSchema())
    ticDat = tdf.TicDat(**sillyMeData())
    schema2 = sillyMeSchema()
    schema2["b"][0] = ("bField2", "bField1", "bField3")
    schema3 = sillyMeSchema()
    schema3["a"][1] = ("aData2", "aData3", "aData1")
    schema4 = sillyMeSchema()
    schema4["a"][1] = ("aData1", "aData3")
    schema5 = sillyMeSchema()
    _tuple = lambda x: tuple(x) if utils.containerish(x) else (x,)
    for t in ("a", "b"):
        schema5[t][1] = _tuple(schema5[t][1]) + _tuple(schema5[t][0])
    schema5["a"][0], schema5["b"][0] = (), []
    schema6 = sillyMeSchema()
    schema6["d"] = [["dField"], ()]
    tdf2, tdf3, tdf4, tdf5, tdf6 = (TicDatFactory(**x) for x in
                                    (schema2, schema3, schema4, schema5, schema6))
    tdf5.set_generator_tables(("a", "c"))
    filePath = os.path.join(_scratchDir, "silly.xls")
    tdf.xls.write_file(ticDat, filePath)
    ticDat2 = tdf2.xls.create_tic_dat(filePath)
    self.assertFalse(tdf._same_data(ticDat, ticDat2))
    ticDat3 = tdf3.xls.create_tic_dat(filePath)
    self.assertTrue(tdf._same_data(ticDat, ticDat3))
    ticDat4 = tdf4.xls.create_tic_dat(filePath)
    for t in ["a", "b"]:
        for k, v in getattr(ticDat4, t).items():
            for _k, _v in v.items():
                self.assertTrue(getattr(ticDat, t)[k][_k] == _v)
            if set(v) == set(getattr(ticDat, t)[k]):
                self.assertTrue(t == "b")
            else:
                self.assertTrue(t == "a")
    ticDat5 = tdf5.xls.create_tic_dat(filePath, treat_inf_as_infinity=False)
    self.assertTrue(tdf5._same_data(tdf._keyless(ticDat), ticDat5))
    self.assertTrue(callable(ticDat5.a) and callable(ticDat5.c) and
                    not callable(ticDat5.b))
    ticDat6 = tdf6.xls.create_tic_dat(filePath)
    self.assertTrue(tdf._same_data(ticDat, ticDat6))
    self.assertTrue(firesException(lambda: tdf6._same_data(ticDat, ticDat6)))
    self.assertTrue(hasattr(ticDat6, "d") and utils.dictish(ticDat6.d))
    def writeData(data, write_header="same"):
        assert filePath.endswith(".xls")
        assert not write_header or write_header in ("lower", "same", "duped")
        import xlwt
        book = xlwt.Workbook()
        for t in tdf.all_tables:
            sheet = book.add_sheet(t)
            if write_header:
                all_fields = tdf.primary_key_fields.get(t, ()) + \
                             tdf.data_fields.get(t, ())
                for i, f in enumerate((2 if write_header == "duped" else 1) *
                                      all_fields):
                    sheet.write(0, i, f.lower()
                                if write_header == "lower" or i >= len(all_fields)
                                else f)
            for rowInd, row in enumerate(data):
                for fieldInd, cellValue in enumerate(
                        (2 if write_header == "duped" else 1) * row):
                    sheet.write(rowInd + (1 if write_header else 0), fieldInd,
                                cellValue)
        if os.path.exists(filePath):
            os.remove(filePath)
        book.save(filePath)
        if write_header in ["lower", "same"]:
            # will use pandas to generate the xlsx file version
            file_path_x = filePath + "x"
            if os.path.exists(file_path_x):
                os.remove(file_path_x)
            writer = utils.pd.ExcelWriter(file_path_x)
            for t, (pks, dfs) in tdf.schema().items():
                fields = pks + dfs
                if write_header == "lower":
                    fields = [_.lower() for _ in fields]
                d = {f: [] for f in fields}
                for row in data:
                    for f, c in zip(fields, row):
                        d[f].append(c)
                utils.pd.DataFrame(d).to_excel(writer, t, index=False)
            writer.save()
    writeData([(1, 2, 3, 4), (1, 20, 30, 40), (10, 20, 30, 40)],
              write_header="duped")
    self.assertTrue(self.firesException(
        lambda: tdf.xls.create_tic_dat(filePath, freeze_it=True)))
    writeData([(1, 2, 3, 4), (1, 20, 30, 40), (10, 20, 30, 40)])
    ticDatMan = tdf.xls.create_tic_dat(filePath, freeze_it=True)
    self.assertTrue(len(ticDatMan.a) == 2 and len(ticDatMan.b) == 3)
    self.assertTrue(ticDatMan.b[1, 20, 30]["bData"] == 40)
    for f in [filePath, filePath + "x"]:
        rowCount = tdf.xls.find_duplicates(f)
        self.assertTrue(set(rowCount) == {'a'} and set(rowCount["a"]) == {1} and
                        rowCount["a"][1] == 2)
    writeData([(1, 2, 3, 4), (1, 20, 30, 40), (10, 20, 30, 40)],
              write_header="lower")
    ticDatMan = tdf.xls.create_tic_dat(filePath, freeze_it=True)
    self.assertTrue(len(ticDatMan.a) == 2 and len(ticDatMan.b) == 3)
    self.assertTrue(ticDatMan.b[1, 20, 30]["bData"] == 40)
    for f in [filePath, filePath + "x"]:
        rowCount = tdf.xls.find_duplicates(f)
        self.assertTrue(set(rowCount) == {'a'} and set(rowCount["a"]) == {1} and
                        rowCount["a"][1] == 2)
    writeData([(1, 2, 3, 4), (1, 20, 30, 40), (10, 20, 30, 40)],
              write_header=False)
    self.assertTrue(self.firesException(
        lambda: tdf.xls.create_tic_dat(filePath, freeze_it=True)))
    ticDatMan = tdf.xls.create_tic_dat(filePath, freeze_it=True,
                                       headers_present=False)
    self.assertTrue(len(ticDatMan.a) == 2 and len(ticDatMan.b) == 3)
    self.assertTrue(ticDatMan.b[1, 20, 30]["bData"] == 40)
    rowCount = tdf.xls.find_duplicates(filePath, headers_present=False)
    self.assertTrue(set(rowCount) == {'a'} and set(rowCount["a"]) == {1} and
                    rowCount["a"][1] == 2)
    ticDat.a["theboger"] = (1, None, 12)
    tdf.xls.write_file(ticDat, filePath, allow_overwrite=True)
    ticDatNone = tdf.xls.create_tic_dat(filePath, freeze_it=True)
    # THIS IS A FLAW - but a minor one. None's are hard to represent.
    # It is turning into the empty string here.
    # not sure how to handle this, but documenting for now.
    self.assertFalse(tdf._same_data(ticDat, ticDatNone))
    self.assertTrue(ticDatNone.a["theboger"]["aData2"] == "")
    # the workaround for this flaw is to set the data type to be nullable but
    # not allow the empty string
    tdfwa = TicDatFactory(**sillyMeSchema())
    tdfwa.set_data_type("a", "aData2", nullable=True)
    ticDatNone = tdfwa.xls.create_tic_dat(filePath, freeze_it=True)
    self.assertTrue(tdf._same_data(ticDat, ticDatNone))
    self.assertTrue(ticDatNone.a["theboger"]["aData2"] == None)
    # checking the same thing with .xlsx - using openpyxl, None is indeed
    # recovered even without tdfwa munging!
    tdf.xls.write_file(ticDat, filePath + "x", allow_overwrite=True)
    ticDatNone = tdf.xls.create_tic_dat(filePath + "x", freeze_it=True)
    self.assertTrue(tdf._same_data(ticDat, ticDatNone))
    self.assertTrue(ticDatNone.a["theboger"]["aData2"] == None)
    ticDatNone = tdfwa.xls.create_tic_dat(filePath + "x", freeze_it=True)
    self.assertTrue(tdf._same_data(ticDat, ticDatNone))
    self.assertTrue(ticDatNone.a["theboger"]["aData2"] == None)
    writeData([(1, 2, 3, 4), (1, 20, 30, 40), (10, 20, 30, 40), (1, 20, 30, 12)])
    for f in [filePath, filePath + "x"]:
        rowCount = tdf.xls.find_duplicates(f)
        self.assertTrue(set(rowCount) == {'a', 'b'} and
                        set(rowCount["a"]) == {1} and rowCount["a"][1] == 3)
        self.assertTrue(set(rowCount["b"]) == {(1, 20, 30)} and
                        rowCount["b"][1, 20, 30] == 2)
def doTest(headersPresent):
    tdf = TicDatFactory(**sillyMeSchema())
    ticDat = tdf.TicDat(**sillyMeData())
    schema2 = sillyMeSchema()
    schema2["b"][0] = ("bField2", "bField1", "bField3")
    schema3 = sillyMeSchema()
    schema3["a"][1] = ("aData2", "aData3", "aData1")
    schema4 = sillyMeSchema()
    schema4["a"][1] = ("aData1", "aData3")
    schema5 = sillyMeSchema()
    _tuple = lambda x: tuple(x) if utils.containerish(x) else (x,)
    for t in ("a", "b"):
        schema5[t][1] = _tuple(schema5[t][1]) + _tuple(schema5[t][0])
    schema5["a"][0], schema5["b"][0] = (), []
    schema5b = sillyMeSchema()
    for t in ("a", "b"):
        schema5b[t][1] = _tuple(schema5b[t][0]) + _tuple(schema5b[t][1])
    schema5b["a"][0], schema5b["b"][0] = (), []
    schema6 = sillyMeSchema()
    schema6["d"] = [("dField",), []]
    tdf2, tdf3, tdf4, tdf5, tdf5b, tdf6 = \
        (TicDatFactory(**x) for x in
         (schema2, schema3, schema4, schema5, schema5b, schema6))
    tdf5.set_generator_tables(["a", "c"])
    tdf5b.set_generator_tables(("a", "c"))
    dirPath = makeCleanDir(os.path.join(_scratchDir, "silly"))
    tdf.csv.write_directory(ticDat, dirPath, write_header=headersPresent)
    ticDat2 = tdf2.csv.create_tic_dat(dirPath, headers_present=headersPresent)
    (self.assertFalse if headersPresent else self.assertTrue)(
        tdf._same_data(ticDat, ticDat2))
    ticDat3 = tdf3.csv.create_tic_dat(dirPath, headers_present=headersPresent)
    (self.assertTrue if headersPresent else self.assertFalse)(
        tdf._same_data(ticDat, ticDat3))
    if headersPresent:
        ticDat4 = tdf4.csv.create_tic_dat(dirPath, headers_present=headersPresent)
        for t in ("a", "b"):
            for k, v in getattr(ticDat4, t).items():
                for _k, _v in v.items():
                    self.assertTrue(getattr(ticDat, t)[k][_k] == _v)
                if set(v) == set(getattr(ticDat, t)[k]):
                    self.assertTrue(t == "b")
                else:
                    self.assertTrue(t == "a")
    else:
        self.assertTrue(self.firesException(
            lambda: tdf4.csv.create_tic_dat(dirPath,
                                            headers_present=headersPresent)))
    ticDat5 = tdf5.csv.create_tic_dat(dirPath, headers_present=headersPresent)
    (self.assertTrue if headersPresent else self.assertFalse)(
        tdf5._same_data(tdf._keyless(ticDat), ticDat5))
    self.assertTrue(callable(ticDat5.a) and callable(ticDat5.c) and
                    not callable(ticDat5.b))
    ticDat5b = tdf5b.csv.create_tic_dat(dirPath, headers_present=headersPresent)
    self.assertTrue(tdf5b._same_data(tdf._keyless(ticDat), ticDat5b))
    self.assertTrue(callable(ticDat5b.a) and callable(ticDat5b.c) and
                    not callable(ticDat5b.b))
    ticDat6 = tdf6.csv.create_tic_dat(dirPath, headers_present=headersPresent)
    self.assertTrue(tdf._same_data(ticDat, ticDat6))
    self.assertTrue(firesException(lambda: tdf6._same_data(ticDat, ticDat6)))
    self.assertTrue(hasattr(ticDat6, "d") and utils.dictish(ticDat6.d))
    allDataTdf = TicDatFactory(**{t: [[], tdf.primary_key_fields.get(t, ()) +
                                      tdf.data_fields.get(t, ())]
                                  for t in tdf.all_tables})
    def writeData(data):
        td = allDataTdf.TicDat(a=data, b=data, c=data)
        allDataTdf.csv.write_directory(td, dirPath, allow_overwrite=True,
                                       write_header=headersPresent)
    writeData([(1, 2, 3, 4), (1, 20, 30, 40), (10, 20, 30, 40)])
    ticDatMan = tdf.csv.create_tic_dat(dirPath, headers_present=headersPresent,
                                       freeze_it=True)
    self.assertTrue(len(ticDatMan.a) == 2 and len(ticDatMan.b) == 3)
    self.assertTrue(ticDatMan.b[(1, 20, 30)]["bData"] == 40)
    rowCount = tdf.csv.find_duplicates(dirPath, headers_present=headersPresent)
    self.assertTrue(set(rowCount) == {'a'} and set(rowCount["a"]) == {1} and
                    rowCount["a"][1] == 2)
    writeData([(1, 2, 3, 4), (1, 20, 30, 40), (10, 20, 30, 40), (1, 20, 30, 12)])
    rowCount = tdf.csv.find_duplicates(dirPath, headers_present=headersPresent)
    self.assertTrue(set(rowCount) == {'a', 'b'} and set(rowCount["a"]) == {1} and
                    rowCount["a"][1] == 3)
    self.assertTrue(set(rowCount["b"]) == {(1, 20, 30)} and
                    rowCount["b"][1, 20, 30] == 2)