def change_fields_with_reserved_keywords(tdf, reserved_keywords, undo=False):
    """Clone tdf with field names adjusted to dodge reserved keywords.

    When undo is falsy, every field whose lower-cased name appears in
    reserved_keywords is renamed with a leading underscore (and a field
    already starting with '_' is rejected). When undo is truthy, one
    leading underscore is stripped from each field name instead. Default
    values and data types are re-registered under the renamed fields, and
    any opl/ampl/lingo prepend attributes present on tdf are copied over.

    :param tdf: a ticdat.TicDatFactory
    :param reserved_keywords: container of lower-case keyword strings
    :param undo: reverse a previous renaming instead of applying one
    :return: a new ticdat.TicDatFactory with the adjusted schema
    """
    schema = tdf.schema()
    renames = {}  # (table, original field name) -> field name after renaming
    for tbl, (pks, dfs) in schema.items():
        # Both field lists are mutated in place; the mutated schema is what
        # seeds the returned factory.
        for flds in (pks, dfs):
            for i, fld in enumerate(flds):
                if undo:
                    if fld.startswith('_'):
                        flds[i] = fld[1:]
                else:
                    verify(not fld.startswith('_'),
                           ("Field names cannot start with '_', in table %s : " +
                            "field is %s") % (tbl, fld))
                    if fld.lower() in reserved_keywords:
                        flds[i] = '_' + fld
                renames[tbl, fld] = flds[i]
    rtn = ticdat.TicDatFactory(**schema)
    # Carry per-field settings across under the (possibly new) names.
    for (tbl, old_fld), new_fld in renames.items():
        if old_fld in tdf.default_values.get(tbl, ()):
            rtn.set_default_value(tbl, new_fld,
                                  tdf.default_values[tbl][old_fld])
        if old_fld in tdf.data_types.get(tbl, ()):
            rtn.set_data_type(tbl, new_fld,
                              *(tdf.data_types[tbl][old_fld]))
    for attr in ('opl_prepend', 'ampl_prepend', 'lingo_prepend'):
        if hasattr(tdf, attr):
            setattr(rtn, attr, getattr(tdf, attr))
    return rtn
def find_duplicates_from_dict_ticdat(tdf, dict_ticdat):
    """Count duplicated primary keys in a dict-of-containers data set.

    :param tdf: a ticdat.TicDatFactory
    :param dict_ticdat: dict mapping table-name strings to row containers
    :return: {table: {pk_value: count}} restricted to keys seen more than
             once; tables with no duplicates are omitted entirely. Returns
             None when tdf defines no primary key fields at all.
    """
    assert isinstance(tdf, ticdat.TicDatFactory)
    assert dictish(dict_ticdat) and all(map(stringish, dict_ticdat)) and \
           all(map(containerish, dict_ticdat.values()))
    pk_flds = {t: f for t, f in tdf.primary_key_fields.items() if f}
    if not pk_flds:
        return
    # Re-read the data through a schema that demotes primary keys to data
    # fields, so duplicated rows survive long enough to be counted.
    sub_schema = {t: v for t, v in tdf.schema().items() if t in pk_flds}
    free_tdf = ticdat.TicDatFactory(
        **{t: [[], pks + dfs] for t, (pks, dfs) in sub_schema.items()})
    td = free_tdf.TicDat(
        **{t: v for t, v in dict_ticdat.items() if t in pk_flds})
    rtn = {t: defaultdict(int) for t in pk_flds}
    for t, flds in list(pk_flds.items()):
        for row in getattr(td, t):
            key = tuple(row[f] for f in flds)
            if len(key) == 1:
                key = key[0]  # single-field keys are reported unwrapped
            rtn[t][key] += 1
        rtn[t] = {k: n for k, n in rtn[t].items() if n > 1}
        if not rtn[t]:
            del rtn[t]
    return rtn
def create_generic_free(td, tdf):
    """Rebuild (td, tdf) with every populated generic table made concrete.

    Generic tables holding at least one row become ordinary tables whose
    data fields are the columns actually present; empty generic tables are
    dropped from the schema. If tdf has no generic tables the inputs are
    returned untouched.

    :param td: a TicDat object that is good for tdf
    :param tdf: a ticdat.TicDatFactory
    :return: (new TicDat, new TicDatFactory) pair free of generic tables
    """
    assert tdf.good_tic_dat_object(td)
    if not tdf.generic_tables:
        return td, tdf
    schema = {t: v for t, v in tdf.schema().items()
              if t not in tdf.generic_tables}
    for gt in tdf.generic_tables:
        if len(getattr(td, gt)):
            # Promote to a pk-less table over the columns actually present.
            schema[gt] = [[], list(getattr(td, gt).columns)]
    new_tdf = ticdat.TicDatFactory(**schema)
    new_td = new_tdf.TicDat(**{t: getattr(td, t) for t in new_tdf.all_tables})
    return new_td, new_tdf
tl._forceguout() # for testing purposes, using underscore is ok assert chk1 and chk2 verify( sum(td.childTable[k]["dummy"] for k in tl.slice("*", "*", 3)) == chk1 * 2.) verify( sum(td.childTable[k]["dummy"] for k in tl.slice("*", 3, 2)) == chk2 * 2.) verify(sum(td.childTable[k]["dummy"] for k in tl.slice("*", 3, 1)) == 0) verify(sum(td.childTable[k]["dummy"] for k in tl.slice("*", 3, 3)) == 0) assert tl._archived_slicings # make a simple schema tdf = ticdat.TicDatFactory(p1=[["id"], []], p2=[["id"], []], childTable=[["p1_1", "p1_2", "p2"], ["dummy"]]) tdf.set_default_value("childTable", "dummy", 2.) smallTd = tdf.TicDat() smallChk = populateTd(smallTd, 30, 20) smallSmartTupleList = tuplelist(smallTd.childTable) smallDumbTupleList = DumbTupleList(smallTd.childTable) smallSlicer = ticdat.Slicer(smallTd.childTable) smallChildDf = tdf.copy_to_pandas(smallTd, ["childTable"]).childTable checkChildDfLen(smallChildDf, *smallChk) checkTupleListLen(smallSmartTupleList, *smallChk) checkTupleListLen(smallDumbTupleList, *smallChk) checkSlicerLen(smallSlicer, *smallChk) checkChildDfSum(smallChildDf, *smallChk) checkTupleListSum(smallSmartTupleList, smallTd, *smallChk)
def create_duplicate_focused_tdf(tdf):
    """Build a pk-free factory suitable for detecting duplicate rows.

    Keeps only the tables of tdf that have primary key fields, demoting
    those fields to data fields so that repeated keys can coexist.

    :param tdf: a ticdat.TicDatFactory
    :return: a new TicDatFactory, or None when tdf has no primary key fields
    """
    pk_tables = {t: flds for t, flds in tdf.primary_key_fields.items() if flds}
    if pk_tables:
        return ticdat.TicDatFactory(
            **{t: [[], flds] for t, flds in pk_tables.items()})
def find_denormalized_sub_table_failures(table, pk_fields, data_fields):
    """Check table for an improperly denormalized sub-table.

    Looks for a denormalized sub-table indexed by pk_fields carrying the
    data fields data_fields.

    :param table: a pandas DataFrame, or a container of consistent
                  {field_name: value} dictionaries.
    :param pk_fields: a field name, or a container of field names, that
                      index the sub-table (they need not be actual primary
                      key fields of table).
    :param data_fields: a field name, or a container of field names, acting
                        as the sub-table's data fields (they need not be
                        actual data fields of table).
    :return: dict keyed by the pk_fields values associated with improperly
             denormalized rows. Each value is itself a dict mapping a data
             field to the tuple of distinct values observed for it across
             rows sharing that key. Inner dicts keep only tuples of length
             > 1, and keys with no such tuples are dropped, so a properly
             denormalized (pk_fields, data_fields) sub-table yields an
             empty dict.
    """
    if stringish(pk_fields):
        pk_fields = (pk_fields,)
    if stringish(data_fields):
        data_fields = (data_fields,)
    verify(containerish(pk_fields) and all(map(stringish, pk_fields)),
           "pk_fields needs to be either a field name or a container of field names")
    verify(containerish(data_fields) and all(map(stringish, data_fields)),
           "data_fields needs to be either a field name or a container of field names")
    verify(len(set(pk_fields).union(data_fields)) ==
           len(pk_fields) + len(data_fields),
           "there are duplicate field names amongst pk_fields, data_fields")
    if DataFrame and isinstance(table, DataFrame):
        # Funnel DataFrames through a TicDat and recurse on the dict form.
        verify(hasattr(table, "columns"), "table missing columns")
        for f in tuple(pk_fields) + tuple(data_fields):
            verify(f in table.columns, "%s isn't a column for table" % f)
        helper_tdf = ticdat.TicDatFactory(
            t=[[], tuple(pk_fields) + tuple(data_fields)])
        return find_denormalized_sub_table_failures(
            helper_tdf.TicDat(t=table).t, pk_fields, data_fields)
    verify(containerish(table) and all(map(dictish, table)),
           "table needs to either be a pandas.DataFrame or a container of {field_name:value} dictionaries")
    seen = defaultdict(lambda: defaultdict(set))
    all_fields = tuple(pk_fields) + tuple(data_fields)
    for row in table:
        for f in all_fields:
            verify(f in row,
                   "%s isn't a key for one of the inner dictionaries of table" % f)
            verify(hasattr(row[f], "__hash__"),
                   "the values for field %s all need to be hashable" % f)
        pk = (row[pk_fields[0]] if len(pk_fields) == 1
              else tuple(row[f] for f in pk_fields))
        for f in data_fields:
            seen[pk][f].add(row[f])
    # Prune: keep only data fields with more than one distinct value, then
    # only keys that retain at least one such field.
    rtn = {}
    for pk, by_field in seen.items():
        offenders = {f: tuple(vals) for f, vals in by_field.items()
                     if len(vals) > 1}
        if offenders:
            rtn[pk] = offenders
    return rtn