class XlsPanFactory(freezable_factory(object, "_isFrozen")):
    """
    Primary class for reading/writing Excel files with panDat objects.
    Don't create this object explicitly. A XlsPanFactory will automatically be associated
    with the xls attribute of the parent PanDatFactory.
    """
    def __init__(self, pan_dat_factory):
        """
        Don't create this object explicitly. A XlsPanFactory will automatically be associated
        with the xls attribute of the parent PanDatFactory.

        :param pan_dat_factory: the parent factory whose schema drives reading and writing

        :return:
        """
        self.pan_dat_factory = pan_dat_factory
        self._isFrozen = True

    def create_pan_dat(self, xls_file_path, fill_missing_fields=False):
        """
        Create a PanDat object from an Excel file

        :param xls_file_path: An Excel file containing sheets whose names match
                              the table names in the schema.

        :param fill_missing_fields: boolean. If truthy, missing fields will be filled in
                                    with their default value. Otherwise, missing fields
                                    throw an Exception.

        :return: a PanDat object populated by the matching sheets.

        caveats: Missing sheets resolve to an empty table, but missing fields on matching sheets
                 throw an Exception (unless fill_missing_fields is truthy).
                 Table names are matched to sheets with case-space insensitivity, but spaces and
                 case are respected for field names. (ticdat supports whitespace in field names
                 but not table names).
                 Note that if you save a DataFrame to excel and then recover it, the type of data
                 might change. For example

                    df = pd.DataFrame({"a":["100", "200", "300"]})
                    df.to_excel("something.xlsx")
                    df2 = pd.read_excel("something.xlsx")

                 results in a numeric column in df2. To address this, you need to
                 use set_data_type for your PanDatFactory. This problem is even worse with

                    df = pd.DataFrame({"a":["0100", "1200", "2300"]})
        """
        rtn = {}
        # _get_sheet_names has already resolved each matched table to a single sheet
        for t, s in self._get_sheet_names(xls_file_path).items():
            # dtype hints from the factory protect text-like columns from numeric coercion
            rtn[t] = pd.read_excel(xls_file_path, sheet_name=s,
                                   dtype=self.pan_dat_factory._dtypes_for_pandas_read(t))
        missing_tables = {t for t in self.pan_dat_factory.all_tables if t not in rtn}
        if missing_tables:
            # a missing sheet is a soft failure - it resolves to an empty table downstream
            print("The following table names could not be found in the %s file.\n%s\n" %
                  (xls_file_path, "\n".join(missing_tables)))
        missing_fields = {(t, f) for t in rtn for f in all_fields(self.pan_dat_factory, t)
                          if f not in rtn[t].columns}
        if fill_missing_fields:
            for t, f in missing_fields:
                rtn[t][f] = self.pan_dat_factory.default_values[t][f]
        verify(fill_missing_fields or not missing_fields,
               "The following are (table, field) pairs missing from the %s file.\n%s" %
               (xls_file_path, missing_fields))
        return _clean_pandat_creator(self.pan_dat_factory, rtn)

    def _get_sheet_names(self, xls_file_path):
        # Map each schema table to the single sheet whose name matches it with
        # case-space insensitivity; raise on ambiguous (duplicated) matches.
        sheets = defaultdict(list)
        try:
            xl = pd.ExcelFile(xls_file_path)
        except Exception as e:
            raise TicDatError("Unable to open %s as xls file : %s" % (xls_file_path, e))
        for table, sheet in product(self.pan_dat_factory.all_tables, xl.sheet_names):
            # only the first _longest_sheet characters are compared because Excel
            # truncates long sheet names
            if table.lower()[:_longest_sheet] == sheet.lower().replace(' ', '_')[:_longest_sheet]:
                sheets[table].append(sheet)
        duplicated_sheets = tuple(_t for _t, _s in sheets.items() if len(_s) > 1)
        verify(not duplicated_sheets,
               "The following sheet names were duplicated : " + ",".join(duplicated_sheets))
        sheets = FrozenDict({k: v[0] for k, v in sheets.items()})
        return sheets

    def write_file(self, pan_dat, file_path, case_space_sheet_names=False):
        """
        write the panDat data to an excel file

        :param pan_dat: the PanDat object to write

        :param file_path: The file path of the excel file to create

        :param case_space_sheet_names: boolean - make best guesses how to add spaces and upper case
                                       characters to sheet names

        :return:

        caveats: The row names (index) isn't written.
        """
        msg = []
        verify(self.pan_dat_factory.good_pan_dat_object(pan_dat, msg.append),
               "pan_dat not a good object for this factory : %s" % "\n".join(msg))
        pan_dat = self.pan_dat_factory._pre_write_adjustment(pan_dat)
        verify(not os.path.isdir(file_path), "A directory is not a valid xls file path")
        # only prettify sheet names when doing so cannot create name collisions
        case_space_sheet_names = case_space_sheet_names and \
                                 len(set(self.pan_dat_factory.all_tables)) == \
                                 len(set(map(case_space_to_pretty, self.pan_dat_factory.all_tables)))
        # bug fix: the original called writer.save(), which was removed in pandas 2.0;
        # the context manager form (ExcelWriter.__exit__ -> close) works on both old
        # and new pandas and also releases the file handle on error
        with pd.ExcelWriter(file_path) as writer:
            for t in self.pan_dat_factory.all_tables:
                getattr(pan_dat, t).to_excel(writer,
                                             case_space_to_pretty(t) if case_space_sheet_names else t,
                                             index=False)
class JsonPanFactory(freezable_factory(object, "_isFrozen")):
    """
    Primary class for reading/writing json data with PanDat objects.
    Don't create this object explicitly. A JsonPanFactory will automatically be associated
    with the json attribute of the parent PanDatFactory.
    """
    def __init__(self, pan_dat_factory):
        """
        Don't create this object explicitly. A JsonPanFactory will automatically be associated
        with the json attribute of the parent PanDatFactory.

        :param pan_dat_factory: the parent factory whose schema drives reading and writing

        :return:
        """
        self.pan_dat_factory = pan_dat_factory
        # probe the installed pandas: "modern" pandas accepts an "index" argument
        # on DataFrame.to_json, older versions do not (see write_file)
        to_json_args = inspect.getfullargspec(pd.DataFrame.to_json).args
        assert "orient" in to_json_args
        self._modern_pandas = "index" in to_json_args
        self._isFrozen = True

    def create_pan_dat(self, path_or_buf, fill_missing_fields=False, orient='split', **kwargs):
        """
        Create a PanDat object from a JSON file or string

        :param path_or_buf: a valid JSON string or file-like

        :param fill_missing_fields: boolean. If truthy, missing fields will be filled in
                                    with their default value. Otherwise, missing fields
                                    throw an Exception.

        :param orient: Indication of expected JSON string format. See pandas.read_json
                       for more details.

        :param kwargs: additional named arguments to pass to pandas.read_json

        :return: a PanDat object populated by the matching tables.

        caveats: Missing tables always resolve to an empty table.
                 Table names are matched with case-space insensitivity, but spaces
                 are respected for field names.
                 (ticdat supports whitespace in field names but not table names).
                 Note that if you save a DataFrame to json and then recover it, the type of data
                 might change. Specifically, text that looks numeric might be recovered as a
                 number, to include the loss of leading zeros. To address this, you need to
                 either use set_data_type for your PanDatFactory, or specify "dtype" in kwargs.
                 (The former is obviously better).
        """
        if stringish(path_or_buf) and os.path.exists(path_or_buf):
            verify(os.path.isfile(path_or_buf),
                   "%s appears to be a directory and not a file." % path_or_buf)
            with open(path_or_buf, "r") as f:
                loaded_dict = json.load(f)
        else:
            verify(stringish(path_or_buf), "%s isn't a string" % path_or_buf)
            loaded_dict = json.loads(path_or_buf)
        verify(dictish(loaded_dict), "the json.load result doesn't resolve to a dictionary")
        verify(all(map(dictish, loaded_dict.values())),
               "the json.load result doesn't resolve to a dictionary whose values are themselves dictionaries")
        tbl_names = self._get_table_names(loaded_dict)
        verify("orient" not in kwargs, "orient should be passed as a non-kwargs argument")
        rtn = {}
        for t, f in tbl_names.items():
            kwargs_ = dict(kwargs)
            if "dtype" not in kwargs_:
                # unless the caller overrides it, let the schema's data types guard
                # against unwanted numeric coercion
                kwargs_["dtype"] = self.pan_dat_factory._dtypes_for_pandas_read(t)
            rtn[t] = pd.read_json(json.dumps(loaded_dict[f]), orient=orient, **kwargs_)
        missing_fields = {(t, f) for t in rtn for f in all_fields(self.pan_dat_factory, t)
                          if f not in rtn[t].columns}
        if fill_missing_fields:
            for t, f in missing_fields:
                rtn[t][f] = self.pan_dat_factory.default_values[t][f]
        verify(fill_missing_fields or not missing_fields,
               "The following (table, field) pairs are missing fields.\n%s" %
               [(t, f) for t, f in missing_fields])
        missing_tables = sorted(set(self.pan_dat_factory.all_tables).difference(rtn))
        if missing_tables:
            # bug fix: this message previously said "SQLite database" (copy-paste from
            # SqlPanFactory) even though this is the json reader
            print("The following table names could not be found in the json file/string.\n%s\n" %
                  "\n".join(missing_tables))
        return _clean_pandat_creator(self.pan_dat_factory, rtn, json_read=True)

    def _get_table_names(self, loaded_dict):
        # Map each schema table to the unique dictionary key that matches it with
        # case-space insensitivity; raise on ambiguous matches.
        rtn = {}
        for table in self.pan_dat_factory.all_tables:
            rtn[table] = [c for c in loaded_dict
                          if c.lower().replace(" ", "_") == table.lower()]
            verify(len(rtn[table]) <= 1,
                   "Multiple dictionary key choices found for table %s" % table)
            if rtn[table]:
                rtn[table] = rtn[table][0]
            else:
                rtn.pop(table)
        return rtn

    def write_file(self, pan_dat, json_file_path, case_space_table_names=False, orient='split',
                   index=False, indent=2, sort_keys=False, **kwargs):
        """
        write the PanDat data to a json file (or json string)

        :param pan_dat: the PanDat object to write

        :param json_file_path: the json file into which the data is to be written. If falsey, will
                               return a JSON string

        :param case_space_table_names: boolean - make best guesses how to add spaces and upper case
                                       characters to table names

        :param orient: Indication of expected JSON string format. See pandas.to_json for more
                       details.

        :param index: boolean - whether or not to write the index.

        :param indent: 2. See json.dumps

        :param sort_keys: See json.dumps

        :param kwargs: additional named arguments to pass to pandas.to_json

        :return:
        """
        msg = []
        verify(self.pan_dat_factory.good_pan_dat_object(pan_dat, msg.append),
               "pan_dat not a good object for this factory : %s" % "\n".join(msg))
        verify("orient" not in kwargs, "orient should be passed as a non-kwargs argument")
        verify("index" not in kwargs, "index should be passed as a non-kwargs argument")
        pan_dat = self.pan_dat_factory._pre_write_adjustment(pan_dat)
        if self._modern_pandas:
            # FYI - pandas Exception: ValueError: 'index=False' is only valid when
            # 'orient' is 'split' or 'table'
            kwargs["index"] = index if orient in ("split", "table") else True
        # only prettify table names when doing so cannot create name collisions
        case_space_table_names = case_space_table_names and \
                                 len(set(self.pan_dat_factory.all_tables)) == \
                                 len(set(map(case_space_to_pretty, self.pan_dat_factory.all_tables)))
        rtn = {}
        from ticdat.pandatfactory import _faster_df_apply
        for t in self.pan_dat_factory.all_tables:
            # use inf/-inf sentinel strings so json round trips preserve infinities
            df = getattr(pan_dat, t).copy(deep=True).replace(float("inf"), "inf").\
                 replace(-float("inf"), "-inf")
            for f in df.columns:
                dt = self.pan_dat_factory.data_types.get(t, {}).get(f, None)
                if dt and dt.datetime:
                    # pandas can be a real PIA when trying to mix types in a column
                    def fixed(row):  # this might not always fix things
                        if isinstance(row[f], (pd.Timestamp, numpy.datetime64)):
                            return str(row[f])
                        if pd.isnull(row[f]):
                            return None
                        return row[f]
                    df[f] = _faster_df_apply(df, fixed)
            k = case_space_to_pretty(t) if case_space_table_names else t
            rtn[k] = json.loads(df.to_json(path_or_buf=None, orient=orient, **kwargs))
            if orient == 'split' and not index:
                rtn[k].pop("index", None)
        if json_file_path:
            with open(json_file_path, "w") as f:
                json.dump(rtn, f, indent=indent, sort_keys=sort_keys)
        else:
            return json.dumps(rtn, indent=indent, sort_keys=sort_keys)
class CsvPanFactory(freezable_factory(object, "_isFrozen")):
    """
    Primary class for reading/writing csv files with PanDat objects.
    Don't create this object explicitly. A CsvPanFactory will automatically be associated
    with the csv attribute of the parent PanDatFactory.
    """
    def __init__(self, pan_dat_factory):
        """
        Don't create this object explicitly. A CsvPanFactory will automatically be associated
        with the csv attribute of the parent PanDatFactory.

        :param pan_dat_factory: the parent factory whose schema drives reading and writing

        :return:
        """
        self.pan_dat_factory = pan_dat_factory
        self._isFrozen = True

    def create_pan_dat(self, dir_path, fill_missing_fields=False, **kwargs):
        """
        Create a PanDat object from a directory of csv files.

        :param dir_path: the directory containing the .csv files.

        :param fill_missing_fields: boolean. If truthy, missing fields will be filled in
                                    with their default value. Otherwise, missing fields
                                    throw an Exception.

        :param kwargs: additional named arguments to pass to pandas.read_csv

        :return: a PanDat object populated by the matching tables.

        caveats: Missing tables always throw an Exception.
                 Table names are matched with case-space insensitivity, but spaces
                 are respected for field names.
                 (ticdat supports whitespace in field names but not table names).
                 Note that a csv round trip can change the type of data. For example,
                 a column of text like "100", "200", "300" (and, even worse, "0100",
                 "1200", "2300") is recovered as numbers. To address this, either use
                 set_data_type for your PanDatFactory, or specify "dtype" in kwargs.
                 (The former is obviously better).
        """
        verify(os.path.isdir(dir_path), "%s not a directory path" % dir_path)
        table_files = self._get_table_names(dir_path)
        rtn = {}
        for table, csv_path in table_files.items():
            read_args = dict(kwargs)
            if "dtype" not in read_args:
                # unless the caller overrides it, let the schema's data types guard
                # against unwanted numeric coercion
                read_args["dtype"] = self.pan_dat_factory._dtypes_for_pandas_read(table)
            rtn[table] = pd.read_csv(csv_path, **read_args)
        missing_tables = set(self.pan_dat_factory.all_tables).difference(rtn)
        if missing_tables:
            print("The following table names could not be found in the %s directory.\n%s\n" %
                  (dir_path, "\n".join(missing_tables)))
        missing_fields = {(table, field) for table in rtn
                          for field in all_fields(self.pan_dat_factory, table)
                          if field not in rtn[table].columns}
        if fill_missing_fields:
            for table, field in missing_fields:
                rtn[table][field] = self.pan_dat_factory.default_values[table][field]
        verify(fill_missing_fields or not missing_fields,
               "The following (table, file_name, field) triplets are missing fields.\n%s" %
               [(table, os.path.basename(table_files[table]), field)
                for table, field in missing_fields])
        return _clean_pandat_creator(self.pan_dat_factory, rtn)

    def _get_table_names(self, dir_path):
        # Resolve each schema table to the unique csv file in dir_path whose
        # (lower-cased, space-as-underscore) name matches "<table>.csv".
        rtn = {}
        for table in self.pan_dat_factory.all_tables:
            target_name = "%s.csv" % table.lower()
            candidates = []
            for file_name in os.listdir(dir_path):
                full_path = os.path.join(dir_path, file_name)
                if os.path.isfile(full_path) and \
                   file_name.lower().replace(" ", "_") == target_name:
                    candidates.append(full_path)
            verify(len(candidates) <= 1,
                   "Multiple possible csv files found for table %s" % table)
            if candidates:
                rtn[table] = candidates[0]
        return rtn

    def write_directory(self, pan_dat, dir_path, case_space_table_names=False, index=False,
                        **kwargs):
        """
        write the PanDat data to a collection of csv files

        :param pan_dat: the PanDat object to write

        :param dir_path: the directory in which to write the csv files

        :param case_space_table_names: boolean - make best guesses how to add spaces and upper case
                                       characters to table names

        :param index: boolean - whether or not to write the index.

        :param kwargs: additional named arguments to pass to pandas.to_csv

        :return:

        caveats: The row names (index) isn't written (unless kwargs indicates it should be).
        """
        verify(not os.path.isfile(dir_path), "A file is not a valid directory path")
        msg = []
        verify(self.pan_dat_factory.good_pan_dat_object(pan_dat, msg.append),
               "pan_dat not a good object for this factory : %s" % "\n".join(msg))
        pan_dat = self.pan_dat_factory._pre_write_adjustment(pan_dat)
        verify("index" not in kwargs, "index should be passed as a non-kwargs argument")
        kwargs["index"] = index
        # only prettify table names when doing so cannot create name collisions
        prettify = case_space_table_names and \
                   len(set(self.pan_dat_factory.all_tables)) == \
                   len(set(map(case_space_to_pretty, self.pan_dat_factory.all_tables)))
        if not os.path.isdir(dir_path):
            os.mkdir(dir_path)
        for table in self.pan_dat_factory.all_tables:
            file_name = (case_space_to_pretty(table) if prettify else table) + ".csv"
            getattr(pan_dat, table).to_csv(os.path.join(dir_path, file_name), **kwargs)
class SqlPanFactory(freezable_factory(object, "_isFrozen")):
    """
    Primary class for reading/writing SQLite files with PanDat objects.
    Don't create this object explicitly. A SqlPanFactory will automatically be associated
    with the sql attribute of the parent PanDatFactory.
    """
    def __init__(self, pan_dat_factory):
        """
        Don't create this object explicitly. A SqlPanFactory will automatically be associated
        with the sql attribute of the parent PanDatFactory.

        :param pan_dat_factory: the parent factory whose schema drives reading and writing

        :return:
        """
        self.pan_dat_factory = pan_dat_factory
        self._isFrozen = True

    def create_pan_dat(self, db_file_path, con=None, fill_missing_fields=False):
        """
        Create a PanDat object from a SQLite database file

        :param db_file_path: A SQLite DB File. Set to falsey if using con argument

        :param con: A connection object that can be passed to pandas read_sql.
                    Set to falsey if using db_file_path argument.

        :param fill_missing_fields: boolean. If truthy, missing fields will be filled in
                                    with their default value. Otherwise, missing fields
                                    throw an Exception.

        :return: a PanDat object populated by the matching tables.

        caveats: Missing tables always resolve to an empty table, but missing fields on matching
                 tables throw an exception (unless fill_missing_fields is truthy).
                 Table names are matched with case-space insensitivity, but spaces
                 are respected for field names.
                 (ticdat supports whitespace in field names but not table names).
        """
        verify(bool(db_file_path) != bool(con),
               "use either the con argument or the db_file_path argument but not both")
        if db_file_path:
            verify(os.path.exists(db_file_path) and not os.path.isdir(db_file_path),
                   "%s not a file path" % db_file_path)
        rtn = {}
        # either open (and later close) a connection to the file, or wrap the
        # caller-provided connection in a do-nothing context manager
        con_maker = lambda: _sql_con(db_file_path) if db_file_path else _DummyContextManager(con)
        with con_maker() as _:
            con_ = con or _
            for t, s in self._get_table_names(con_).items():
                rtn[t] = pd.read_sql(sql="Select * from [%s]" % s, con=con_)
        missing_fields = {(t, f) for t in rtn for f in all_fields(self.pan_dat_factory, t)
                          if f not in rtn[t].columns}
        if fill_missing_fields:
            for t, f in missing_fields:
                rtn[t][f] = self.pan_dat_factory.default_values[t][f]
        verify(fill_missing_fields or not missing_fields,
               "The following are (table, field) pairs missing from the %s file.\n%s" %
               (db_file_path, missing_fields))
        missing_tables = sorted(set(self.pan_dat_factory.all_tables).difference(rtn))
        if missing_tables:
            print("The following table names could not be found in the SQLite database.\n%s\n" %
                  "\n".join(missing_tables))
        return _clean_pandat_creator(self.pan_dat_factory, rtn)

    def _get_table_names(self, con):
        # Resolve each schema table to the unique database table that matches one of
        # its underscore/space spelling variants; raise on ambiguous matches.
        rtn = {}
        def try_name(name):
            # probe for existence by attempting a select
            try:
                con.execute("Select * from [%s]" % name)
            # bug fix: was a bare except, which also swallowed KeyboardInterrupt/SystemExit
            except Exception:
                return False
            return True
        for table in self.pan_dat_factory.all_tables:
            rtn[table] = [t for t in all_underscore_replacements(table) if try_name(t)]
            verify(len(rtn[table]) <= 1,
                   "Multiple possible tables found for table %s" % table)
            if rtn[table]:
                rtn[table] = rtn[table][0]
            else:
                rtn.pop(table)
        return rtn

    def write_file(self, pan_dat, db_file_path, con=None, if_exists='replace',
                   case_space_table_names=False):
        """
        write the PanDat data to a SQLite database

        :param pan_dat: the PanDat object to write

        :param db_file_path: The file path of the SQLite file to create.
                             Set to falsey if using con argument.

        :param con: A connection object that can be passed to pandas to_sql.
                    Set to falsey if using db_file_path argument

        :param if_exists: ‘fail’, ‘replace’ or ‘append’. How to behave if the table already exists

        :param case_space_table_names: boolean - make best guesses how to add spaces and upper case
                                       characters to table names

        :return:

        caveats: The row names (index) isn't written. The default pandas schema generation is used,
                 and thus foreign key relationships aren't written.
        """
        # The code to generate foreign keys is written and tested as part of TicDatFactory, and
        # thus this shortcoming could be easily rectified if need be).
        # note - pandas has an unfortunate tendency to push types into SQLite columns. This can
        # result in writing-reading round trips converting your numbers to text if they are mixed
        # type columns.
        verify(bool(db_file_path) != bool(con),
               "use either the con argument or the db_file_path argument but not both")
        msg = []
        verify(self.pan_dat_factory.good_pan_dat_object(pan_dat, msg.append),
               "pan_dat not a good object for this factory : %s" % "\n".join(msg))
        pan_dat = self.pan_dat_factory._pre_write_adjustment(pan_dat)
        if db_file_path:
            verify(not os.path.isdir(db_file_path), "A directory is not a valid SQLLite file path")
        # only prettify table names when doing so cannot create name collisions
        case_space_table_names = case_space_table_names and \
                                 len(set(self.pan_dat_factory.all_tables)) == \
                                 len(set(map(case_space_to_pretty, self.pan_dat_factory.all_tables)))
        con_maker = lambda: _sql_con(db_file_path) if db_file_path else _DummyContextManager(con)
        with con_maker() as _:
            con_ = con or _
            for t in self.pan_dat_factory.all_tables:
                getattr(pan_dat, t).to_sql(name=case_space_to_pretty(t)
                                           if case_space_table_names else t,
                                           con=con_, if_exists=if_exists, index=False)
class JsonTicFactory(freezable_factory(object, "_isFrozen")):
    """
    Primary class for reading/writing json files with TicDat objects.
    You need the json package to be installed to use it.
    """
    def __init__(self, tic_dat_factory):
        """
        Don't call this function explicitly. A JsonTicFactory will
        automatically be associated with the json attribute of the parent
        TicDatFactory.

        :param tic_dat_factory:

        :return:
        """
        self.tic_dat_factory = tic_dat_factory
        self._isFrozen = True

    def create_tic_dat(self, json_file_path, freeze_it=False, from_pandas=False):
        """
        Create a TicDat object from a json file

        :param json_file_path: A json file path. It should encode a dictionary
                               with table names as keys. Could also be an
                               actual JSON string

        :param freeze_it: boolean. should the returned object be frozen?

        :param from_pandas: boolean. If truthy, then use pandas json readers. See
                            PanDatFactory json readers for more details.

        :return: a TicDat object populated by the matching tables.

        caveats: Table names matches are case insensitive and also
                 underscore-space insensitive.
                 Tables that don't find a match are interpreted as an empty table.
                 Dictionary keys that don't match any table are ignored.
        """
        _standard_verify(self.tic_dat_factory)
        if from_pandas:
            # delegate to the pandas-based json reader, then convert back to TicDat
            from ticdat import PanDatFactory
            pdf = PanDatFactory.create_from_full_schema(
                self.tic_dat_factory.schema(include_ancillary_info=True))
            _rtn = pdf.json.create_pan_dat(json_file_path)
            return pdf.copy_to_tic_dat(_rtn)
        jdict = self._create_jdict(json_file_path)
        tic_dat_dict = self._create_tic_dat_dict(jdict)
        missing_tables = set(self.tic_dat_factory.all_tables).difference(tic_dat_dict)
        if missing_tables:
            # missing tables are a soft failure - they resolve to empty tables
            print("The following table names could not be found in the json file/string\n%s\n" %
                  "\n".join(missing_tables))
        rtn = self.tic_dat_factory.TicDat(**tic_dat_dict)
        # apply any infinity-flagging / parameter-table conventions after the raw read
        rtn = self.tic_dat_factory._parameter_table_post_read_adjustment(rtn)
        if freeze_it:
            return self.tic_dat_factory.freeze_me(rtn)
        return rtn

    def find_duplicates(self, json_file_path, from_pandas=False):
        """
        Find the row counts for duplicated rows.

        :param json_file_path: A json file path. It should encode a dictionary
                               with table names as keys.

        :param from_pandas: boolean. If truthy, then use pandas json readers. See
                            PanDatFactory json readers for more details.

        :return: A dictionary whose keys are table names for the primary-ed key tables.
                 Each value of the return dictionary is itself a dictionary.
                 The inner dictionary is keyed by the primary key values encountered in the table,
                 and the value is the count of records in the json entry with this primary key.
                 Row counts smaller than 2 are pruned off, as they aren't duplicates
        """
        _standard_verify(self.tic_dat_factory)
        if from_pandas:
            # read via pandas and flatten each table into a list of row tuples so the
            # dict-based duplicate finder can be reused
            from ticdat import PanDatFactory
            pdf = PanDatFactory.create_from_full_schema(
                self.tic_dat_factory.schema(include_ancillary_info=True))
            _rtn = pdf.json.create_pan_dat(json_file_path)
            jdict = {t: [tuple(_) for _ in getattr(_rtn, t).itertuples(index=False)]
                     for t in pdf.all_tables}
        else:
            jdict = self._create_jdict(json_file_path)
        rtn = find_duplicates_from_dict_ticdat(self.tic_dat_factory, jdict)
        return rtn or {}

    def _create_jdict(self, path_or_buf):
        # Load json from either a file path or a raw json string, and verify the
        # result is a dictionary of containers. Returns the loaded dictionary.
        if stringish(path_or_buf) and os.path.exists(path_or_buf):
            reasonble_string = path_or_buf
            verify(os.path.isfile(path_or_buf), "json_file_path is not a valid file path.")
            try:
                with open(path_or_buf, "r") as fp:
                    jdict = json.load(fp)
            except Exception as e:
                raise TicDatError("Unable to interpret %s as json file : %s" %
                                  (path_or_buf, e))
        else:
            verify(stringish(path_or_buf), "%s isn't a string" % path_or_buf)
            # only echo a prefix of the string in error messages, since the full json
            # string could be arbitrarily long
            reasonble_string = path_or_buf[:10]
            try:
                jdict = json.loads(path_or_buf)
            except Exception as e:
                raise TicDatError("Unable to interpret %s as json string : %s" %
                                  (reasonble_string, e))
        verify(dictish(jdict), "%s failed to load a dictionary" % reasonble_string)
        verify(all(map(stringish, jdict)),
               "The dictionary loaded from %s isn't indexed by strings" % reasonble_string)
        verify(all(map(containerish, jdict.values())),
               "The dictionary loaded from %s doesn't have containers as values" %
               reasonble_string)
        return jdict

    def _create_tic_dat_dict(self, jdict):
        # Match json dictionary keys to schema tables (case and underscore-space
        # insensitive) and run every cell through the factory's read-cell adjustment.
        tdf = self.tic_dat_factory
        rtn = {}
        table_keys = defaultdict(list)
        for t in tdf.all_tables:
            for t2 in jdict:
                if stringish(t2) and t.lower() == t2.replace(" ", "_").lower():
                    table_keys[t].append(t2)
            if len(table_keys[t]) >= 1:
                verify(len(table_keys[t]) < 2,
                       "Found duplicate matching keys for table %s" % t)
                rtn[t] = jdict[table_keys[t][0]]
        orig_rtn, rtn = rtn, {}
        for t, rows in orig_rtn.items():
            # rows may be dicts (verbose format, keyed by field name) or lists
            # (terse format, ordered primary key fields then data fields)
            all_fields = tdf.primary_key_fields.get(t, ()) + tdf.data_fields.get(t, ())
            rtn[t] = []
            for row in rows:
                if dictish(row):
                    rtn[t].append({f: tdf._general_read_cell(t, f, x)
                                   for f, x in row.items()})
                else:
                    rtn[t].append([tdf._general_read_cell(t, f, x)
                                   for f, x in zip(all_fields, row)])
        return rtn

    def write_file(self, tic_dat, json_file_path, allow_overwrite=False, verbose=False,
                   to_pandas=False):
        """
        write the ticDat data to a json file (or json string)

        :param tic_dat: the data object to write (typically a TicDat)

        :param json_file_path: The file path of the json file to create. If empty string, then
                               return a JSON string.

        :param allow_overwrite: boolean - are we allowed to overwrite an existing file?

        :param verbose: boolean. Verbose mode writes the data rows as dicts keyed by field name.
                        Otherwise, they are lists.

        :param to_pandas: boolean. if truthy, then use the PanDatFactory method of writing to json.

        :return:
        """
        _standard_verify(self.tic_dat_factory)
        verify(not (to_pandas and verbose), "verbose argument is inconsistent with to_pandas")
        verify(not (json_file_path and os.path.exists(json_file_path) and not allow_overwrite),
               "%s exists and allow_overwrite is not enabled" % json_file_path)
        if to_pandas:
            # delegate to the pandas-based json writer
            from ticdat import PanDatFactory
            pdf = PanDatFactory.create_from_full_schema(
                self.tic_dat_factory.schema(include_ancillary_info=True))
            return pdf.json.write_file(self.tic_dat_factory.copy_to_pandas(
                tic_dat, drop_pk_columns=False), json_file_path)
        msg = []
        if not self.tic_dat_factory.good_tic_dat_object(tic_dat, lambda m: msg.append(m)):
            raise TicDatError("Not a valid TicDat object for this schema : " + " : ".join(msg))
        jdict = make_json_dict(self.tic_dat_factory, tic_dat, verbose,
                               use_infinity_io_flag_if_provided=True)
        if not json_file_path:
            return json.dumps(jdict, sort_keys=True, indent=2)
        with open(json_file_path, "w") as fp:
            json.dump(jdict, fp, sort_keys=True, indent=2)
class OpalyticsTicFactory(freezable_factory(object, "_isFrozen")):
    """
    Primary class for reading ticDat objects from the Opalytics Cloud Platform.
    Not expected to be used outside of Opalytics Cloud hosted notebooks.
    """
    def __init__(self, tic_dat_factory):
        """
        Don't call this function explicitly. A OpalyticsTicFactory will automatically be associated
        with the opalytics attribute of the parent TicDatFactory.

        :param tic_dat_factory:

        :return:
        """
        self.tic_dat_factory = tic_dat_factory
        # a parallel schema used purely to detect duplicate primary keys
        self._duplicate_focused_tdf = create_duplicate_focused_tdf(tic_dat_factory)
        self._isFrozen = True

    def _find_table_matchings(self, inputset):
        # map each schema table to the inputset.schema entries that match it
        # (case insensitive, spaces treated as underscores)
        rtn = defaultdict(list)
        for t, x in product(self.tic_dat_factory.all_tables, inputset.schema):
            if stringish(x) and t.lower() == x.lower().replace(" ", "_"):
                rtn[t].append(x)
        return rtn

    def _good_inputset(self, inputset, message_writer=lambda x: x):
        # Sanity check that inputset looks like an Opalytics inputset and that every
        # schema table resolves to exactly one inputset table.
        # bug fix: the original tested `not hasattr(...) and dictish(...)`, which never
        # flagged a bad attribute and raised AttributeError when the attribute was absent
        if not (hasattr(inputset, "schema") and dictish(inputset.schema)):
            message_writer("Failed to find dictish schema attribute")
            return False
        if not (hasattr(inputset, "getTable") and callable(inputset.getTable)):
            message_writer("Failed to find calleable getTable attribute")
            return False
        table_matchings = self._find_table_matchings(inputset)
        badly_matched = {t for t, v in table_matchings.items() if len(v) != 1}
        if badly_matched:
            message_writer("Following tables could not be uniquely resolved in inputset.schema\n%s" %
                           badly_matched)
            return False
        return True

    def find_duplicates(self, inputset, raw_data=False):
        """
        Find the row counts for duplicated rows.

        :param inputset: An opalytics inputset consistent with this TicDatFactory

        :param raw_data: boolean. should data cleaning be skipped? See create_tic_dat.

        :return: A dictionary whose keys are table names for the primary-ed key tables.
                 Each value of the return dictionary is itself a dictionary.
                 The inner dictionary is keyed by the primary key values encountered in the table,
                 and the value is the count of records in the table with this primary key.
                 Row counts smaller than 2 are pruned off, as they aren't duplicates.
        """
        message = []
        verify(self._good_inputset(inputset, message.append),
               "inputset is inconsistent with this TicDatFactory : %s" % (message or [None])[0])
        if not self._duplicate_focused_tdf:
            return {}
        tdf = self._duplicate_focused_tdf
        return find_duplicates(tdf.opalytics.create_tic_dat(inputset, raw_data=raw_data), tdf)

    def _table_as_lists(self, t, df):
        # Convert the DataFrame for table t into a list of row tuples ordered primary
        # key fields first, then data fields. Rows whose "_active" column is falsey
        # (when such a column is present) are skipped.
        verify(isinstance(df, DataFrame), "table %s isn't a DataFrame" % t)
        all_fields = set(self.tic_dat_factory.primary_key_fields[t]).\
                     union(self.tic_dat_factory.data_fields[t])
        # bug fix: the original never applied the %s substitution to this message
        verify("_active" not in all_fields,
               ("Table %s has a field named '_active'.\n" +
                "This conflicts with internal data processing.\n" +
                " Don't use '_active' for in your TicDatFactory definition if you want to use this reader.")
               % t)
        for f in all_fields:
            verify(f in df.columns, "field %s can't be found in the DataFrame for %s" % (f, t))
        # map each field to its column position for fast tuple indexing
        all_fields = {f: list(df.columns).index(f) for f in all_fields}
        has_active = "_active" in df.columns
        active_index = list(df.columns).index("_active") if has_active else None
        rtn = []
        for row in df.itertuples(index=False, name=None):
            if not has_active or row[active_index]:
                rtn.append(tuple(row[all_fields[f]] for f in
                                 self.tic_dat_factory.primary_key_fields[t] +
                                 self.tic_dat_factory.data_fields[t]))
        return rtn

    def create_tic_dat(self, inputset, raw_data=False, freeze_it=False):
        """
        Create a TicDat object from an opalytics inputset

        :param inputset: An opalytics inputset consistent with this TicDatFactory

        :param raw_data: boolean. should data cleaning be skipped? On the Opalytics Cloud Platform
                         cleaned data will be passed to instant apps. Data cleaning involves
                         removing data type failures, data row predicate failures, foreign key
                         failures and deactivated records.

        :param freeze_it: boolean. should the returned object be frozen?

        :return: a TicDat object populated by the tables as they are rendered by inputset
        """
        message = []
        verify(self._good_inputset(inputset, message.append),
               "inputset is inconsistent with this TicDatFactory : %s" % (message or [None])[0])
        verify(DataFrame, "pandas needs to be installed to use the opalytics functionality")
        # bug fix: the original condition `not generator_tables or generic_tables` let
        # schemas with generic tables slip through, contradicting the message below
        verify(not (self.tic_dat_factory.generator_tables or self.tic_dat_factory.generic_tables),
               "The opalytics data reader is not yet working for generic tables nor generator tables")
        tms = {k: v[0] for k, v in self._find_table_matchings(inputset).items()}
        ia = {}
        # inspect.getargspec was removed in Python 3.11; getfullargspec is already used
        # elsewhere in this module and is a drop-in replacement here
        if "includeActive" in inspect.getfullargspec(inputset.getTable)[0]:
            ia = {"includeActive": not raw_data}
        tl = lambda t: self._table_as_lists(t, inputset.getTable(tms[t], **ia))
        rtn = self.tic_dat_factory.TicDat(**{t: tl(t) for t in self.tic_dat_factory.all_tables})
        if not raw_data:
            def removing():
                # one cleaning pass; returns truthy if anything was found (and removed),
                # so the caller loops until a fixed point is reached
                dtfs = self.tic_dat_factory.find_data_type_failures(rtn)
                for (t, f), (bvs, pks) in dtfs.items():
                    if pks is None:  # i.e. no primary keys
                        # bug fix: collect the bad rows first - the original removed rows
                        # while iterating the same list, which can skip adjacent bad rows
                        bad_drs = [dr for dr in getattr(rtn, t) if dr[f] in bvs]
                        for dr in bad_drs:
                            getattr(rtn, t).remove(dr)
                    else:
                        for k in pks:
                            getattr(rtn, t).pop(k, None)  # could be popped for two fields
                drfs = self.tic_dat_factory.find_data_row_failures(rtn)
                cant_remove_again = set()
                for (t, pn), row_posns in drfs.items():
                    if self.tic_dat_factory.primary_key_fields[t]:
                        for k in row_posns:
                            getattr(rtn, t).pop(k, None)  # could be popped for two predicates
                    elif t not in cant_remove_again:
                        bad_drs = [dr for i, dr in enumerate(getattr(rtn, t)) if i in row_posns]
                        for dr in bad_drs:
                            getattr(rtn, t).remove(dr)
                        # once we start removing data rows by row index, the remaining row
                        # indicies become invalid, so we need to ignore any more such
                        # indicies for this table
                        cant_remove_again.add(t)
                fkfs = self.tic_dat_factory.find_foreign_key_failures(rtn)
                if fkfs:
                    self.tic_dat_factory.remove_foreign_keys_failures(rtn)
                return dtfs or drfs or fkfs
            while removing():
                pass
        if freeze_it:
            return self.tic_dat_factory.freeze_me(rtn)
        return rtn
class XlsTicFactory(freezable_factory(object, "_isFrozen")):
    """
    Primary class for reading/writing Excel files with TicDat objects.
    Your system will need the xlrd package to read .xls and .xlsx files,
    the xlwt package to write .xls files, and the xlsxwriter package to
    write .xlsx files.
    Don't create this object explicitly. A XlsTicDatFactory will
    automatically be associated with the xls attribute of the parent
    TicDatFactory.
    """
    def __init__(self, tic_dat_factory):
        """
        Don't create this object explicitly. A XlsTicDatFactory will
        automatically be associated with the xls attribute of the parent
        TicDatFactory.

        :param tic_dat_factory: the parent TicDatFactory whose schema drives
                                all reading/writing performed by this object.
        :return:
        """
        self.tic_dat_factory = tic_dat_factory
        # freezable_factory honors this flag; no further attributes may be set.
        self._isFrozen = True
    def create_tic_dat(self, xls_file_path, row_offsets=None, headers_present=True,
                       treat_inf_as_infinity=True, freeze_it=False):
        """
        Create a TicDat object from an Excel file

        :param xls_file_path: An Excel file containing sheets whose names match
                              the table names in the schema.
        :param row_offsets: (optional) A mapping from table names to initial
                            number of rows to skip
        :param headers_present: Boolean. Does the first row of data contain the
                                column headers?
        :param treat_inf_as_infinity: Boolean. Treat the "inf" string
                                      (case insensitive) as infinity. Similar for "-inf"
        :param freeze_it: boolean. should the returned object be frozen?
        :return: a TicDat object populated by the matching sheets.

        caveats: Missing sheets resolve to an empty table, but missing fields
                 on matching sheets throw an Exception.
                 Sheet names are considered case insensitive, and white space is replaced
                 with underscore for table name matching.
                 Field names are considered case insensitive, but white space is respected.
                 (ticdat supports whitespace in field names but not table names).
                 The following two caveats apply only if data_types are used.
                 --> Any field for which an empty string is invalid data and None is
                     valid data will replace the empty string with None.
                 --> Any field for which must_be_int is true will replace numeric data
                     that satisfies int(x)==x with int(x). In other words, the ticdat
                     equivalent of pandas.read_excel convert_float is to set
                     must_be_int to true in data_types.
        """
        self._verify_differentiable_sheet_names()
        verify(xlrd, "xlrd needs to be installed to use this subroutine")
        tdf = self.tic_dat_factory
        verify(not (treat_inf_as_infinity and tdf.generator_tables),
               "treat_inf_as_infinity not implemented for generator tables")
        verify(headers_present or not tdf.generic_tables,
               "headers need to be present to read generic tables")
        verify(utils.DataFrame or not tdf.generic_tables,
               "Strange absence of pandas despite presence of generic tables")
        if self.tic_dat_factory.generic_tables:
            # generic tables are delegated to pandas below, which supports none of
            # these customizations
            verify(headers_present and treat_inf_as_infinity and not row_offsets,
                   "headers_present, treat_inf_as_infinity and row_offsets must all be at default values\n" +
                   "to use generic tables")
        rtn = self._create_tic_dat_dict(xls_file_path, row_offsets or {},
                                        headers_present, treat_inf_as_infinity)
        if self.tic_dat_factory.generic_tables:
            if xls_file_path.endswith(".xls"):
                print("** Warning : pandas doesn't always play well with older Excel formats.")
            # read the generic (schema-less) tables via a throwaway PanDatFactory
            pdf = PanDatFactory(**{t: '*' for t in self.tic_dat_factory.generic_tables})
            pandat = pdf.xls.create_pan_dat(xls_file_path)
            for t in self.tic_dat_factory.generic_tables:
                rtn[t] = getattr(pandat, t)
        rtn = tdf._parameter_table_post_read_adjustment(tdf.TicDat(**rtn))
        if freeze_it:
            return self.tic_dat_factory.freeze_me(rtn)
        return rtn
    def _verify_differentiable_sheet_names(self):
        # Excel truncates sheet names; fail loudly if two table names collide
        # once truncated to the maximum sheet-name length.
        rtn = defaultdict(set)
        for t in self.tic_dat_factory.all_tables:
            rtn[t[:_longest_sheet]].add(t)
        rtn = [v for k, v in rtn.items() if len(v) > 1]
        verify(not rtn,
               "The following tables collide when names are truncated to %s characters.\n%s" %
               (_longest_sheet, sorted(map(sorted, rtn))))
    def _get_sheets_and_fields(self, xls_file_path, all_tables, row_offsets, headers_present,
                               print_missing_tables=False):
        # Resolve each table name to exactly one sheet (case-insensitive,
        # spaces->underscores, truncated comparison) and each field to exactly
        # one column index. Returns (sheets, field_indicies, book.datemode).
        verify(utils.stringish(xls_file_path) and os.path.exists(xls_file_path),
               "xls_file_path argument %s is not a valid file path." % xls_file_path)
        try:
            book = xlrd.open_workbook(xls_file_path)
        except Exception as e:
            raise TicDatError("Unable to open %s as xls file : %s" % (xls_file_path, e))
        sheets = defaultdict(list)
        for table, sheet in product(all_tables, book.sheets()):
            if table.lower()[:_longest_sheet] == sheet.name.lower().replace(' ', '_')[:_longest_sheet]:
                sheets[table].append(sheet)
        duplicated_sheets = tuple(_t for _t, _s in sheets.items() if len(_s) > 1)
        verify(not duplicated_sheets,
               "The following sheet names were duplicated : " + ",".join(duplicated_sheets))
        sheets = FrozenDict({k: v[0] for k, v in sheets.items()})
        missing_tables = {t for t in all_tables if t not in sheets}
        if missing_tables and print_missing_tables:
            # missing sheets are a warning (empty table), not an error
            print("The following table names could not be found in the %s file.\n%s\n" %
                  (xls_file_path, "\n".join(missing_tables)))
        field_indicies, missing_fields, dup_fields = {}, {}, {}
        for table, sheet in sheets.items():
            field_indicies[table], missing_fields[table], dup_fields[table] = \
                self._get_field_indicies(table, sheet, row_offsets[table], headers_present)
        verify(not any(_ for _ in missing_fields.values()),
               "The following field names could not be found : \n" +
               "\n".join("%s : " % t + ",".join(bf) for t, bf in missing_fields.items() if bf))
        verify(not any(_ for _ in dup_fields.values()),
               "The following field names were duplicated : \n" +
               "\n".join("%s : " % t + ",".join(bf) for t, bf in dup_fields.items() if bf))
        return sheets, field_indicies, book.datemode
    def _create_generator_obj(self, xlsFilePath, table, row_offset, headers_present,
                              treat_inf_as_infinity):
        # Returns a zero-argument callable that, when invoked, re-opens the
        # workbook and yields one data tuple per row (lazy, re-iterable).
        tdf = self.tic_dat_factory
        ho = 1 if headers_present else 0
        def tableObj():
            sheets, field_indicies, datemode = self._get_sheets_and_fields(
                xlsFilePath, (table,), {table: row_offset}, headers_present)
            if table in sheets:
                sheet = sheets[table]
                # shortest populated column bounds the logical table length
                table_len = min(len(sheet.col_values(field_indicies[table][field]))
                                for field in tdf.data_fields[table])
                for x in (sheet.row_values(i) for i in range(table_len)[row_offset + ho:]):
                    yield self._sub_tuple(table, tdf.data_fields[table], field_indicies[table],
                                          treat_inf_as_infinity, datemode)(x)
        return tableObj
    def _create_tic_dat_dict(self, xls_file_path, row_offsets, headers_present,
                             treat_inf_as_infinity):
        # Build the {table_name: rows} dict that TicDat(**rtn) will consume.
        # Generic tables are left out (read via PanDatFactory by the caller);
        # generator tables become lazy callables.
        tiai = treat_inf_as_infinity
        verify(utils.dictish(row_offsets) and
               set(row_offsets).issubset(self.tic_dat_factory.all_tables) and
               all(utils.numericish(x) and (x >= 0) for x in row_offsets.values()),
               "row_offsets needs to map from table names to non negative row offset")
        row_offsets = dict({t: 0 for t in self.tic_dat_factory.all_tables}, **row_offsets)
        tdf = self.tic_dat_factory
        rtn = {}
        sheets, field_indicies, dm = self._get_sheets_and_fields(
            xls_file_path, set(tdf.all_tables).difference(tdf.generator_tables),
            row_offsets, headers_present, print_missing_tables=True)
        ho = 1 if headers_present else 0
        for tbl, sheet in sheets.items():
            fields = tdf.primary_key_fields.get(tbl, ()) + tdf.data_fields.get(tbl, ())
            assert fields or tbl in self.tic_dat_factory.generic_tables
            indicies = field_indicies[tbl]
            table_len = min(len(sheet.col_values(indicies[field]))
                            for field in (fields or indicies))
            if tdf.primary_key_fields.get(tbl, ()):
                # primary-keyed table -> {pk_tuple: data_tuple}
                tableObj = {self._sub_tuple(tbl, tdf.primary_key_fields[tbl], indicies, tiai, dm)(x):
                            self._sub_tuple(tbl, tdf.data_fields.get(tbl, ()), indicies, tiai, dm)(x)
                            for x in (sheet.row_values(i)
                                      for i in range(table_len)[row_offsets[tbl] + ho:])}
            elif tbl in tdf.generic_tables:
                tableObj = None  # will be read via PanDatFactory
            else:
                # keyless table -> list of data tuples
                tableObj = [self._sub_tuple(tbl, tdf.data_fields.get(tbl, ()), indicies, tiai, dm)(x)
                            for x in (sheet.row_values(i)
                                      for i in range(table_len)[row_offsets[tbl] + ho:])]
            if tableObj is not None:
                rtn[tbl] = tableObj
        for tbl in tdf.generator_tables:
            rtn[tbl] = self._create_generator_obj(xls_file_path, tbl, row_offsets[tbl],
                                                  headers_present, tiai)
        return rtn
    def find_duplicates(self, xls_file_path, row_offsets={}, headers_present=True):
        """
        Find the row counts for duplicated rows.

        :param xls_file_path: An Excel file containing sheets whose names match
                              the table names in the schema (non primary key tables ignored).
        :param row_offsets: (optional) A mapping from table names to initial
                            number of rows to skip (non primary key tables ignored)
        :param headers_present: Boolean. Does the first row of data contain the
                                column headers?

        caveats: Missing sheets resolve to an empty table, but missing primary fields
                 on matching sheets throw an Exception.
                 Sheet names are considered case insensitive.

        :return: A dictionary whose keys are the table names for the primary key tables.
                 Each value of the return dictionary is itself a dictionary.
                 The inner dictionary is keyed by the primary key values encountered
                 in the table, and the value is the count of records in the
                 Excel sheet with this primary key.
                 Row counts smaller than 2 are pruned off, as they aren't duplicates
        """
        self._verify_differentiable_sheet_names()
        verify(xlrd, "xlrd needs to be installed to use this subroutine")
        verify(utils.dictish(row_offsets) and
               set(row_offsets).issubset(self.tic_dat_factory.all_tables) and
               all(utils.numericish(x) and (x >= 0) for x in row_offsets.values()),
               "row_offsets needs to map from table names to non negative row offset")
        row_offsets = dict({t: 0 for t in self.tic_dat_factory.all_tables}, **row_offsets)
        tdf = self.tic_dat_factory
        pk_tables = tuple(t for t, _ in tdf.primary_key_fields.items() if _)
        rtn = {t: defaultdict(int) for t in pk_tables}
        sheets, fieldIndicies, dm = self._get_sheets_and_fields(xls_file_path, pk_tables,
                                                                row_offsets, headers_present)
        ho = 1 if headers_present else 0
        for table, sheet in sheets.items():
            fields = tdf.primary_key_fields[table] + tdf.data_fields.get(table, ())
            indicies = fieldIndicies[table]
            table_len = min(len(sheet.col_values(indicies[field])) for field in fields)
            for x in (sheet.row_values(i) for i in range(table_len)[row_offsets[table] + ho:]):
                rtn[table][self._sub_tuple(table, tdf.primary_key_fields[table], indicies,
                                           treat_inf_as_infinity=True, datemode=dm)(x)] += 1
        for t in list(rtn.keys()):
            # prune the non-duplicates (count < 2) and then empty tables
            rtn[t] = {k: v for k, v in rtn[t].items() if v > 1}
            if not rtn[t]:
                del (rtn[t])
        return rtn
    def _sub_tuple(self, table, fields, field_indicies, treat_inf_as_infinity, datemode):
        # Returns a row -> value(-tuple) converter applying the ticdat cell
        # coercion rules (nullable empties, inf strings, must_be_int, datetimes,
        # infinity_io_flag). A single-field tuple is unwrapped to a scalar.
        assert set(fields).issubset(field_indicies)
        if self.tic_dat_factory.infinity_io_flag != "N/A" or \
           (table == "parameters" and self.tic_dat_factory.parameters):
            # the infinity_io_flag machinery supersedes inf-string parsing
            treat_inf_as_infinity = False
        def _read_cell(x, field):
            # reminder - data fields have a default default of zero, primary keys don't get a default default
            dv = self.tic_dat_factory.default_values.get(table, {}).get(field,
                                                                        ["LIST", "NOT", "POSSIBLE"])
            dt = self.tic_dat_factory.data_types.get(table, {}).get(field)
            rtn = x[field_indicies[field]]
            if rtn == "" and ((dt and dt.nullable) or (not dt and dv is None)):
                return None
            if treat_inf_as_infinity and utils.stringish(rtn) and rtn.lower() in ["inf", "-inf"]:
                return float(rtn.lower())
            if utils.numericish(rtn) and utils.safe_apply(int)(rtn) == rtn and dt and dt.must_be_int:
                rtn = int(rtn)
            if rtn == "":
                try_rtn = self.tic_dat_factory._general_read_cell(table, field, None)  # None as infinity flagging
                if utils.numericish(try_rtn):
                    return try_rtn
            if utils.numericish(rtn) and dt and dt.datetime:
                # Excel stores dates as numbers; decode with the workbook's datemode
                rtn = utils.safe_apply(lambda: xlrd.xldate_as_tuple(rtn, datemode))()
                if rtn is not None:
                    f = datetime.datetime
                    if utils.pd:
                        f = utils.pd.Timestamp
                    return f(year=rtn[0], month=rtn[1], day=rtn[2], hour=rtn[3],
                             minute=rtn[4], second=rtn[5])
            return self.tic_dat_factory._general_read_cell(table, field, rtn)
        def rtn(x):
            if len(fields) == 1:
                return _read_cell(x, fields[0])
            return tuple(_read_cell(x, field) for field in fields)
        return rtn
    def _get_field_indicies(self, table, sheet, row_offset, headers_present):
        # Returns (found {field: column_index}, missing fields, duplicated fields).
        fields = self.tic_dat_factory.primary_key_fields.get(table, ()) + \
                 self.tic_dat_factory.data_fields.get(table, ())
        if not headers_present:
            # positional matching: schema order defines the columns
            row_len = len(sheet.row_values(row_offset)) if sheet.nrows > 0 else len(fields)
            return ({f: i for i, f in enumerate(fields) if i < row_len},
                    [f for i, f in enumerate(fields) if i >= row_len], [])
        if sheet.nrows - row_offset <= 0:
            # entirely empty sheet -> every field is "missing"
            return {}, fields, []
        if table in self.tic_dat_factory.generic_tables:
            temp_rtn = defaultdict(list)
            for ind, val in enumerate(sheet.row_values(row_offset)):
                temp_rtn[val].append(ind)
        else:
            # header matching is exact-or-case-insensitive
            temp_rtn = {field: list() for field in fields}
            for field, (ind, val) in product(fields, enumerate(sheet.row_values(row_offset))):
                if field == val or (all(map(utils.stringish, (field, val))) and
                                    field.lower() == val.lower()):
                    temp_rtn[field].append(ind)
        return ({field: inds[0] for field, inds in temp_rtn.items() if len(inds) == 1},
                [field for field, inds in temp_rtn.items() if len(inds) == 0],
                [field for field, inds in temp_rtn.items() if len(inds) > 1])
    def write_file(self, tic_dat, file_path, allow_overwrite=False, case_space_sheet_names=False):
        """
        write the ticDat data to an excel file

        :param tic_dat: the data object to write (typically a TicDat)
        :param file_path: The file path of the excel file to create
                          Needs to end in either ".xls" or ".xlsx"
                          The latter is capable of writing out larger tables,
                          but the former handles infinity seamlessly.
                          If ".xlsx", then be advised that +/- float("inf") will be replaced
                          with "inf"/"-inf", unless infinity_io_flag is being applied.
        :param allow_overwrite: boolean - are we allowed to overwrite an existing file?
        :param case_space_sheet_names: boolean - make best guesses how to add spaces and upper
                                       case characters to sheet names
        :return:

        caveats: None may be written out as an empty string. This reflects the behavior of xlwt.
        """
        self._verify_differentiable_sheet_names()
        verify(utils.stringish(file_path) and
               (file_path.endswith(".xls") or file_path.endswith(".xlsx")),
               "file_path argument needs to end in .xls or .xlsx")
        msg = []
        if not self.tic_dat_factory.good_tic_dat_object(tic_dat, lambda m: msg.append(m)):
            raise TicDatError("Not a valid ticDat object for this schema : " + " : ".join(msg))
        verify(not os.path.isdir(file_path), "A directory is not a valid xls file path")
        verify(allow_overwrite or not os.path.exists(file_path),
               "The %s path exists and overwrite is not allowed" % file_path)
        if self.tic_dat_factory.generic_tables:
            # re-route through a generic-free clone of the schema
            dat, tdf = utils.create_generic_free(tic_dat, self.tic_dat_factory)
            return tdf.xls.write_file(dat, file_path, allow_overwrite, case_space_sheet_names)
        # only prettify sheet names if the prettified names remain unique
        case_space_sheet_names = case_space_sheet_names and \
            len(set(self.tic_dat_factory.all_tables)) == \
            len(set(map(case_space_to_pretty, self.tic_dat_factory.all_tables)))
        tbl_name_mapping = {t: case_space_to_pretty(t) if case_space_sheet_names else t
                            for t in self.tic_dat_factory.all_tables}
        if file_path.endswith(".xls"):
            self._xls_write(tic_dat, file_path, tbl_name_mapping)
        else:
            self._xlsx_write(tic_dat, file_path, tbl_name_mapping)
    def _xls_write(self, tic_dat, file_path, tbl_name_mapping):
        # .xls writer (xlwt backend); infinity handled natively by xlwt.
        verify(xlwt, "Can't write .xls files because xlwt package isn't installed.")
        tdf = self.tic_dat_factory
        def clean_for_write(t, f, x):
            if isinstance(x, datetime.datetime):
                return str(x)
            return self.tic_dat_factory._infinity_flag_write_cell(t, f, x)
        book = xlwt.Workbook()
        for t in sorted(sorted(tdf.all_tables),
                        key=lambda x: len(tdf.primary_key_fields.get(x, ()))):
            all_flds = self.tic_dat_factory.primary_key_fields.get(t, ()) + \
                       self.tic_dat_factory.data_fields.get(t, ())
            sheet = book.add_sheet(tbl_name_mapping[t][:_longest_sheet])
            for i, f in enumerate(tdf.primary_key_fields.get(t, ()) +
                                  tdf.data_fields.get(t, ())):
                sheet.write(0, i, f)
            _t = getattr(tic_dat, t)
            if utils.dictish(_t):
                # primary-keyed table: one row per {pk: data} entry
                for row_ind, (p_key, data) in enumerate(_t.items()):
                    for field_ind, cell in enumerate((p_key if containerish(p_key) else (p_key,)) +
                                                     tuple(data[_f] for _f in tdf.data_fields.get(t, ()))):
                        sheet.write(row_ind + 1, field_ind,
                                    clean_for_write(t, all_flds[field_ind], cell))
            else:
                # keyless (or generator) table
                for row_ind, data in enumerate(_t if containerish(_t) else _t()):
                    for field_ind, cell in enumerate(tuple(data[_f] for _f in tdf.data_fields[t])):
                        sheet.write(row_ind + 1, field_ind,
                                    clean_for_write(t, all_flds[field_ind], cell))
        if os.path.exists(file_path):
            os.remove(file_path)
        book.save(file_path)
    def _xlsx_write(self, tic_dat, file_path, tbl_name_mapping):
        # .xlsx writer (xlsxwriter backend); +/-inf must be stringified unless
        # the infinity_io_flag machinery is in play.
        verify(xlsx, "Can't write .xlsx files because xlsxwriter package isn't installed.")
        tdf = self.tic_dat_factory
        if os.path.exists(file_path):
            os.remove(file_path)
        book = xlsx.Workbook(file_path)
        def clean_for_write(t, f, x):
            if self.tic_dat_factory.infinity_io_flag != "N/A" or \
               (t == "parameters" and self.tic_dat_factory.parameters):
                return self.tic_dat_factory._infinity_flag_write_cell(t, f, x)
            if x in [float("inf"), -float("inf")] or isinstance(x, datetime.datetime):
                return str(x)
            return x
        for t in sorted(sorted(tdf.all_tables),
                        key=lambda x: len(tdf.primary_key_fields.get(x, ()))):
            all_flds = self.tic_dat_factory.primary_key_fields.get(t, ()) + \
                       self.tic_dat_factory.data_fields.get(t, ())
            sheet = book.add_worksheet(tbl_name_mapping[t][:_longest_sheet])
            for i, f in enumerate(tdf.primary_key_fields.get(t, ()) +
                                  tdf.data_fields.get(t, ())):
                sheet.write(0, i, f)
            _t = getattr(tic_dat, t)
            if utils.dictish(_t):
                for row_ind, (p_key, data) in enumerate(_t.items()):
                    for field_ind, cell in enumerate((p_key if containerish(p_key) else (p_key,)) +
                                                     tuple(data[_f] for _f in tdf.data_fields.get(t, ()))):
                        sheet.write(row_ind + 1, field_ind,
                                    clean_for_write(t, all_flds[field_ind], cell))
            else:
                for row_ind, data in enumerate(_t if containerish(_t) else _t()):
                    for field_ind, cell in enumerate(tuple(data[_f] for _f in tdf.data_fields[t])):
                        sheet.write(row_ind + 1, field_ind,
                                    clean_for_write(t, all_flds[field_ind], cell))
        book.close()
class CsvPanFactory(freezable_factory(object, "_isFrozen")):
    """
    Primary class for reading/writing csv files with PanDat objects.
    Don't create this object explicitly. A CsvPanFactory will
    automatically be associated with the csv attribute of the parent
    PanDatFactory.
    """
    def __init__(self, pan_dat_factory):
        """
        Don't create this object explicitly. A CsvPanFactory will
        automatically be associated with the csv attribute of the parent
        PanDatFactory.

        :param pan_dat_factory: the parent PanDatFactory whose schema drives
                                all reading/writing performed by this object.
        :return:
        """
        self.pan_dat_factory = pan_dat_factory
        # freezable_factory honors this flag; no further attributes may be set.
        self._isFrozen = True
    def create_pan_dat(self, dir_path, fill_missing_fields=False, **kwargs):
        """
        Create a PanDat object from the csv files in a directory

        :param dir_path: the directory containing the .csv files.
        :param fill_missing_fields: boolean. If truthy, missing fields will be filled in
                                    with their default value. Otherwise, missing fields
                                    throw an Exception.
        :param kwargs: additional named arguments to pass to pandas.read_csv
        :return: a PanDat object populated by the matching tables.

        caveats: Missing tables always throw an Exception.
                 Table names are matched with case-space insensitivity, but spaces
                 are respected for field names.
                 (ticdat supports whitespace in field names but not table names).
        """
        verify(os.path.isdir(dir_path), "%s not a directory path"%dir_path)
        tbl_names = self._get_table_names(dir_path)
        rtn = {t: pd.read_csv(f, **kwargs) for t,f in tbl_names.items()}
        missing_fields = {(t, f) for t in rtn for f in all_fields(self.pan_dat_factory, t)
                          if f not in rtn[t].columns}
        if fill_missing_fields:
            for t,f in missing_fields:
                rtn[t][f] = self.pan_dat_factory.default_values[t][f]
        verify(fill_missing_fields or not missing_fields,
               "The following (table, file_name, field) triplets are missing fields.\n%s" %
               [(t, os.path.basename(tbl_names[t]), f) for t,f in missing_fields])
        rtn = self.pan_dat_factory.PanDat(**rtn)
        msg = []
        assert self.pan_dat_factory.good_pan_dat_object(rtn, msg.append), str(msg)
        return rtn
    def _get_table_names(self, dir_path):
        # Map each schema table to exactly one csv file in dir_path, matching
        # file names case-insensitively with spaces treated as underscores.
        # Raises via verify if a table has zero or multiple candidate files.
        rtn = {}
        for table in self.pan_dat_factory.all_tables:
            rtn[table] = [path for f in os.listdir(dir_path) for path in [os.path.join(dir_path, f)]
                          if os.path.isfile(path) and
                          f.lower().replace(" ", "_") == "%s.csv"%table.lower()]
            verify(len(rtn[table]) >= 1, "Unable to recognize table %s" % table)
            verify(len(rtn[table]) <= 1, "Multiple possible csv files found for table %s" % table)
            rtn[table] = rtn[table][0]
        return rtn
    def write_directory(self, pan_dat, dir_path, case_space_table_names=False, index=False,
                        **kwargs):
        """
        write the PanDat data to a collection of csv files

        :param pan_dat: the PanDat object to write
        :param dir_path: the directory in which to write the csv files
                         Set to falsey if using con argument.
        :param case_space_table_names: boolean - make best guesses how to add spaces and upper
                                       case characters to table names
        :param index: boolean - whether or not to write the index.
        :param kwargs: additional named arguments to pass to pandas.to_csv
        :return:

        caveats: The row names (index) isn't written (unless kwargs indicates it should be).
        """
        verify(not os.path.isfile(dir_path), "A file is not a valid directory path")
        msg = []
        verify(self.pan_dat_factory.good_pan_dat_object(pan_dat, msg.append),
               "pan_dat not a good object for this factory : %s"%"\n".join(msg))
        verify("index" not in kwargs, "index should be passed as a non-kwargs argument")
        kwargs["index"] = index
        # only prettify table names if the prettified names remain unique
        case_space_table_names = case_space_table_names and \
            len(set(self.pan_dat_factory.all_tables)) == \
            len(set(map(case_space_to_pretty, self.pan_dat_factory.all_tables)))
        if not os.path.isdir(dir_path) :
            os.mkdir(dir_path)
        for t in self.pan_dat_factory.all_tables :
            f = os.path.join(dir_path, (case_space_to_pretty(t) if case_space_table_names else t)
                             + ".csv")
            getattr(pan_dat, t).to_csv(f, **kwargs)
class CsvTicFactory(freezable_factory(object, "_isFrozen")) :
    """
    Primary class for reading/writing csv files with TicDat objects.
    Your system will need the csv package if you want to use this class.
    Don't create this object explicitly. A CsvTicFactory will
    automatically be associated with the csv attribute of the parent
    """
    def __init__(self, tic_dat_factory):
        """
        Don't create this object explicitly. A CsvTicDatFactory will
        automatically be associated with the csv attribute of the parent
        TicDatFactory.

        :param tic_dat_factory: the parent TicDatFactory whose schema drives
                                all reading/writing performed by this object.
        :return:
        """
        self.tic_dat_factory = tic_dat_factory
        # freezable_factory honors this flag; no further attributes may be set.
        self._isFrozen = True
    def create_tic_dat(self, dir_path, dialect='excel', headers_present = True,
                       freeze_it = False, encoding=None):
        """
        Create a TicDat object from the csv files in a directory

        :param dir_path: the directory containing the .csv files.
        :param dialect: the csv dialect. Consult csv documentation for details.
        :param headers_present: Boolean. Does the first row of data contain the column headers?
        :param encoding: see docstring for the Python.open function
        :param freeze_it: boolean. should the returned object be frozen?
        :return: a TicDat object populated by the matching files.

        caveats: Missing files resolve to an empty table, but missing fields on
                 matching files throw an Exception.
                 By default, data field values (but not primary key values) will be coerced
                 into floats if possible. This includes coercing "inf" and "-inf" into
                 +/- float("inf") The rules over which fields are/are-not coerced can be
                 controlled via hints from the default values or data types.
                 Default values and data types can also be used to control whether or not
                 the empty string should be coerced into None.
                 The infinity_io_flag rules are applied subsequent to this coercion.
                 Note - pandas doesn't really do a fantastic job handling date types either,
                 since it coerces all columns to be the same type. If that's the behavior you
                 want you can use PanDatFactory.
                 JSON is just a better file format than csv for a variety of reasons,
                 to include typing of data.
        """
        verify(csv, "csv needs to be installed to use this subroutine")
        tdf = self.tic_dat_factory
        verify(headers_present or not tdf.generic_tables,
               "headers need to be present to read generic tables")
        verify(DataFrame or not tdf.generic_tables,
               "Strange absence of pandas despite presence of generic tables")
        rtn = self.tic_dat_factory.TicDat(**self._create_tic_dat(dir_path, dialect,
                                                                 headers_present, encoding))
        rtn = self.tic_dat_factory._parameter_table_post_read_adjustment(rtn)
        if freeze_it:
            return self.tic_dat_factory.freeze_me(rtn)
        return rtn
    def _read_cell(self, table, field, x):
        # Apply the ticdat csv coercion rules to a single raw string cell:
        # empty-string -> None when hinted nullable, float coercion when hinted
        # numeric, int demotion under must_be_int, then infinity_io handling.
        def _inner_rtn(x):
            if table == "parameters" and self.tic_dat_factory.parameters:
                # parameter tables defer all typing to the parameters machinery
                return x
            # reminder - data fields have a default default of zero, primary keys don't get a default default
            dv = self.tic_dat_factory.default_values.get(table, {}).get(field,
                                                                        ["LIST", "NOT", "POSSIBLE"])
            dt = self.tic_dat_factory.data_types.get(table, {}).get(field)
            if x == "" and ((dt and dt.nullable) or (not dt and dv is None) or
                            numericish(self.tic_dat_factory._general_read_cell(table, field, None))):
                return None
            should_try_float = (dt and dt.number_allowed) or (not dt and numericish(dv)) or \
                               (table in self.tic_dat_factory.generic_tables)
            if should_try_float:
                try:
                    x = float(x)
                    if int(x) == x and dt and dt.must_be_int:
                        x = int(x)
                except:
                    # not parseable as a number (or int(inf) overflow) - keep as-is
                    return x
            return x
        return self.tic_dat_factory._general_read_cell(table, field, _inner_rtn(x))
    def _create_tic_dat(self, dir_path, dialect, headers_present, encoding):
        # Build the {table_name: rows} dict for TicDat(**...); tables with no
        # matching file are reported and omitted (i.e. resolve to empty).
        verify(dialect in csv.list_dialects(), "Invalid dialect %s"%dialect)
        verify(os.path.isdir(dir_path), "Invalid directory path %s"%dir_path)
        rtn = {t : self._create_table(dir_path, t, dialect, headers_present, encoding)
               for t in self.tic_dat_factory.all_tables}
        missing_tables = {t for t in self.tic_dat_factory.all_tables if not rtn[t]}
        if missing_tables:
            print ("The following table names could not be found in the %s directory.\n%s\n"%
                   (dir_path,"\n".join(missing_tables)))
        return {k:v for k,v in rtn.items() if v}
    def find_duplicates(self, dir_path, dialect='excel', headers_present = True,
                        encoding=None):
        """
        Find the row counts for duplicated rows.

        :param dir_path: the directory containing .csv files.
        :param dialect: the csv dialect. Consult csv documentation for details.
        :param headers_present: Boolean. Does the first row of data contain the column headers?
        :param encoding: see docstring for the Python.open function
        :return: A dictionary whose keys are the table names for the primary key tables.
                 Each value of the return dictionary is itself a dictionary.
                 The inner dictionary is keyed by the primary key values encountered
                 in the table, and the value is the count of records in the
                 Excel sheet with this primary key.
                 Row counts smaller than 2 are pruned off, as they aren't duplicates

        caveats: Missing files resolve to an empty table, but missing fields (data or primary key)
                 on matching files throw an Exception.
        """
        verify(csv, "csv needs to be installed to use this subroutine")
        verify(dialect in csv.list_dialects(), "Invalid dialect %s"%dialect)
        verify(os.path.isdir(dir_path), "Invalid directory path %s"%dir_path)
        tdf = self.tic_dat_factory
        # only primary-keyed tables with an actual file on disk are examined
        rtn = {t:defaultdict(int) for t,_ in tdf.primary_key_fields.items()
               if _ and self._get_file_path(dir_path, t)}
        for t in rtn:
            with open(self._get_file_path(dir_path, t), encoding=encoding) as csvfile:
                for r in self._get_data(csvfile, t, dialect, headers_present):
                    p_key = r[tdf.primary_key_fields[t][0]] \
                        if len(tdf.primary_key_fields[t])==1 else \
                        tuple(r[_] for _ in tdf.primary_key_fields[t])
                    rtn[t][p_key] += 1
        for t in list(rtn.keys()):
            # prune the non-duplicates (count < 2) and then empty tables
            rtn[t] = {k:v for k,v in rtn[t].items() if v > 1}
            if not rtn[t]:
                del(rtn[t])
        return rtn
    def _get_file_path(self, dir_path, table):
        # Locate the (at most one) csv file matching table, case-insensitively
        # with spaces treated as underscores. Returns None if no match.
        rtn = [path for f in os.listdir(dir_path) for path in [os.path.join(dir_path, f)]
               if os.path.isfile(path) and
               f.lower().replace(" ", "_") == "%s.csv"%table.lower()]
        verify(len(rtn) <= 1, "duplicate .csv files found for %s"%table)
        if rtn:
            return rtn[0]
    def _get_data(self, csvfile, table, dialect, headers_present):
        # Yield one {field: coerced_value} dict per csv row. With headers,
        # field names are matched to header cells case-insensitively; without,
        # schema order defines the columns.
        tdf = self.tic_dat_factory
        fieldnames=tdf.primary_key_fields.get(table, ()) + tdf.data_fields.get(table, ())
        assert fieldnames or table in self.tic_dat_factory.generic_tables
        for row in csv.DictReader(csvfile, dialect = dialect,
                                  **({"fieldnames":fieldnames} if not headers_present else {})):
            if not headers_present:
                verify(len(row) == len(fieldnames),
                       "Need %s columns for table %s"%(len(fieldnames), table))
                yield {f: self._read_cell(table, f, row[f]) for f in fieldnames}
            else:
                key_matching = defaultdict(list)
                for k,f in product(row.keys(), fieldnames or row.keys()):
                    if k.lower() ==f.lower():
                        key_matching[f].append(k)
                # generic tables (empty fieldnames) take their fields from the header
                fieldnames = fieldnames or row.keys()
                for f in fieldnames:
                    verify(f in key_matching,
                           "Unable to find field name %s for table %s"%(f, table))
                    verify(len(key_matching[f]) <= 1,
                           "Duplicate field names found for field %s table %s"%(f, table))
                yield {f: self._read_cell(table, f, row[key_matching[f][0]]) for f in fieldnames}
    def _create_table(self, dir_path, table, dialect, headers_present, encoding):
        # Read one table from its csv file. Returns a dict for primary-keyed
        # tables, a list otherwise, a generator-factory for generator tables,
        # or None when no file exists for the table.
        file_path = self._get_file_path(dir_path, table)
        if not (file_path and os.path.isfile(file_path)) :
            return
        tdf = self.tic_dat_factory
        if table in tdf.generator_tables:
            # lazy: the file is re-opened each time the generator is iterated
            def rtn() :
                with open(file_path, encoding=encoding) as csvfile:
                    for r in self._get_data(csvfile, table, dialect, headers_present):
                        yield tuple(r[_] for _ in tdf.data_fields[table])
        else:
            rtn = {} if tdf.primary_key_fields.get(table) else []
            with open(file_path, encoding=encoding) as csvfile:
                for r in self._get_data(csvfile, table, dialect, headers_present) :
                    if tdf.primary_key_fields.get(table) :
                        p_key = r[tdf.primary_key_fields[table][0]] \
                            if len(tdf.primary_key_fields[table]) == 1 else \
                            tuple(r[_] for _ in tdf.primary_key_fields[table])
                        rtn[p_key] = tuple(r[_] for _ in tdf.data_fields.get(table,()))
                    elif table in tdf.generic_tables:
                        rtn.append(r)
                    else:
                        rtn.append(tuple(r[_] for _ in tdf.data_fields[table]))
        return rtn
    def write_directory(self, tic_dat, dir_path, allow_overwrite = False, dialect='excel',
                        write_header = True, case_space_table_names = False):
        """
        write the ticDat data to a collection of csv files

        :param tic_dat: the data object
        :param dir_path: the directory in which to write the csv files
        :param allow_overwrite: boolean - are we allowed to overwrite existing files?
        :param dialect: the csv dialect. Consult csv documentation for details.
        :param write_header: Boolean. Should the header information be written as the
                             first row?
        :param case_space_table_names: boolean - make best guesses how to add spaces and upper
                                       case characters to table names
        :return:
        """
        verify(csv, "csv needs to be installed to use this subroutine")
        verify(dialect in csv.list_dialects(), "Invalid dialect %s"%dialect)
        verify(not os.path.isfile(dir_path), "A file is not a valid directory path")
        if self.tic_dat_factory.generic_tables:
            # NOTE(review): this recursion does not forward case_space_table_names,
            # unlike the analogous generic-tables re-route in XlsTicFactory.write_file
            # - confirm whether that is intentional.
            dat, tdf = create_generic_free(tic_dat, self.tic_dat_factory)
            return tdf.csv.write_directory(dat, dir_path, allow_overwrite, dialect, write_header)
        tdf = self.tic_dat_factory
        msg = []
        if not self.tic_dat_factory.good_tic_dat_object(tic_dat, lambda m : msg.append(m)) :
            raise TicDatError("Not a valid TicDat object for this schema : " + " : ".join(msg))
        if not allow_overwrite:
            for t in tdf.all_tables :
                f = os.path.join(dir_path, t + ".csv")
                verify(not os.path.exists(f),
                       "The %s path exists and overwrite is not allowed"%f)
        if not os.path.isdir(dir_path) :
            os.mkdir(dir_path)
        # only prettify table names if the prettified names remain unique
        case_space_table_names = case_space_table_names and \
            len(set(self.tic_dat_factory.all_tables)) == \
            len(set(map(case_space_to_pretty, self.tic_dat_factory.all_tables)))
        for t in tdf.all_tables :
            f = os.path.join(dir_path, (case_space_to_pretty(t) if case_space_table_names else t)
                             + ".csv")
            with open(f, 'w', newline='') as csvfile:
                writer = csv.DictWriter(csvfile,dialect=dialect, fieldnames=
                    tdf.primary_key_fields.get(t, ()) + tdf.data_fields.get(t, ()))
                writer.writeheader() if write_header else None
                # (sic) helper name is misspelled; applies infinity_io flagging per cell
                def infinty_io_dict(d):
                    return {f: self.tic_dat_factory._infinity_flag_write_cell(t, f, x)
                            for f,x in d.items()}
                _t = getattr(tic_dat, t)
                if dictish(_t) :
                    # primary-keyed table: merge pk values back into the row dict
                    for p_key, data_row in _t.items() :
                        primaryKeyDict = {f:v for f,v in
                                          zip(tdf.primary_key_fields[t],
                                              p_key if containerish(p_key) else (p_key,))}
                        writer.writerow(infinty_io_dict(dict(data_row, **primaryKeyDict)))
                else :
                    for data_row in (_t if containerish(_t) else _t()) :
                        writer.writerow(infinty_io_dict(data_row))
class SQLiteTicFactory(freezable_factory(object, "_isFrozen")):
    """
    Primary class for reading/writing SQLite files with TicDat objects.
    Don't create this object explicitly. A SQLiteTicFactory will
    automatically be associated with the sql attribute of the parent
    TicDatFactory. You need the sqlite3 package to be installed to use it.
    """
    def __init__(self, tic_dat_factory):
        """
        Don't call this function explicitly. A SQLiteTicFactory will
        automatically be associated with the sql attribute of the parent
        TicDatFactory.

        :param tic_dat_factory:

        :return:
        """
        self.tic_dat_factory = tic_dat_factory
        # schema restricted to the primary-keyed tables; used by find_duplicates
        self._duplicate_focused_tdf = create_duplicate_focused_tdf(tic_dat_factory)
        self._isFrozen = True
    def _Rtn(self, freeze_it):
        # Returns a TicDat-building callable that applies the parameters-table
        # post-read adjustment and (optionally) freezes the result.
        def rtn(*args, **kwargs):
            rtn = self.tic_dat_factory.TicDat(*args, **kwargs)
            rtn = self.tic_dat_factory._parameter_table_post_read_adjustment(rtn)
            if freeze_it:
                return self.tic_dat_factory.freeze_me(rtn)
            return rtn
        return rtn
    def create_tic_dat(self, db_file_path, freeze_it=False):
        """
        Create a TicDat object from a SQLite database file

        :param db_file_path: A SQLite db with a consistent schema.

        :param freeze_it: boolean. should the returned object be frozen?

        :return: a TicDat object populated by the matching tables.

        caveats : "inf" and "-inf" (case insensitive) are read as floats, unless the
                  infinity_io_flag is being applied. "true"/"false" (case insensitive)
                  are read as booleans. Tables that don't find a match are interpreted
                  as an empty table. Missing fields on matching tables throw an exception.
        """
        verify(sql, "sqlite3 needs to be installed to use this subroutine")
        return self._Rtn(freeze_it)(**self._create_tic_dat(db_file_path))
    def create_tic_dat_from_sql(self, sql_file_path, includes_schema=False, freeze_it=False):
        """
        Create a TicDat object from an SQLite sql text file

        :param sql_file_path: A text file containing SQLite compatible SQL statements
                              delimited by ;

        :param includes_schema: boolean - does the sql_file_path contain schema
                                generating SQL?

        :param freeze_it: boolean. should the returned object be frozen?

        :return: a TicDat object populated by the db created from the SQL.
                 See create_tic_dat for caveats.
        """
        verify(sql, "sqlite3 needs to be installed to use this subroutine")
        return self._Rtn(freeze_it)(
            **self._create_tic_dat_from_sql(sql_file_path, includes_schema))
    def find_duplicates(self, db_file_path):
        """
        Find the row counts for duplicated rows.

        :param db_file_path: A SQLite db with a consistent schema.

        :return: A dictionary whose keys are table names for the primary key tables.
                 Each value of the return dictionary is itself a dictionary.
                 The inner dictionary is keyed by the primary key values encountered
                 in the table, and the value is the count of records in the SQLite
                 table with this primary key.
                 Row counts smaller than 2 are pruned off, as they aren't duplicates
        """
        verify(sql, "sqlite3 needs to be installed to use this subroutine")
        if not self._duplicate_focused_tdf:
            return {}
        return find_duplicates(
            self._duplicate_focused_tdf.sql.create_tic_dat(db_file_path),
            self._duplicate_focused_tdf)
    def _fks(self):
        # map each native table to the tuple of foreign keys it originates
        rtn = defaultdict(set)
        for fk in self.tic_dat_factory.foreign_keys:
            rtn[fk.native_table].add(fk)
        return FrozenDict({k: tuple(v) for k, v in rtn.items()})
    def _create_tic_dat_from_sql(self, sql_file_path, includes_schema):
        # Replay the SQL statements into an in-memory db and read the data back.
        verify(os.path.exists(sql_file_path), "%s isn't a valid file path" % sql_file_path)
        verify(not self.tic_dat_factory.generator_tables,
               "recovery of generator tables from sql files not yet implemented")
        tdf = self.tic_dat_factory
        with sql.connect(":memory:") as con:
            if not includes_schema:
                for stmt in self._get_schema_sql(
                        set(tdf.all_tables).difference(tdf.generic_tables)):
                    con.execute(stmt)
            with open(sql_file_path, "r") as f:
                for stmt in f.read().split(";"):
                    con.execute(stmt)
            return self._create_tic_dat_from_con(
                con, {t: t for t in self.tic_dat_factory.all_tables})
    def _get_table_names(self, db_file_path, tables):
        # Resolve each schema table name to the actual (space/underscore variant)
        # table name present in the db; tables with no match are omitted.
        rtn = {}
        with sql.connect(db_file_path) as con:
            def try_name(name):
                try:
                    con.execute("Select * from [%s]" % name)
                except Exception:  # a missing table raises sqlite3.OperationalError
                    return False
                return True
            for table in tables:
                rtn[table] = [t for t in all_underscore_replacements(table) if try_name(t)]
                verify(len(rtn[table]) <= 1,
                       "Duplicate tables found for table %s in SQLite file %s"
                       % (table, db_file_path))
                if rtn[table]:
                    rtn[table] = rtn[table][0]
                else:
                    rtn.pop(table)
        return rtn
    def _check_tables_fields(self, db_file_path, tables):
        # Verify the file opens as SQLite and that every expected field is
        # selectable from every matched table; returns the table-name mapping.
        tdf = self.tic_dat_factory
        TDE = TicDatError
        verify(os.path.exists(db_file_path), "%s isn't a valid file path" % db_file_path)
        try:
            with sql.connect(db_file_path) as _:
                pass
        except Exception as e:
            raise TDE("Unable to open %s as SQLite file : %s" % (db_file_path, e))
        table_names = self._get_table_names(db_file_path, tables)
        with sql.connect(db_file_path) as con:
            for table, sql_table in table_names.items():
                for field in tdf.primary_key_fields.get(table, ()) + \
                             tdf.data_fields.get(table, ()):
                    try:
                        con.execute("Select [%s] from [%s]" % (field, sql_table))
                    except Exception:
                        raise TDE("Unable to recognize field %s in table %s for file %s"
                                  % (field, table, db_file_path))
        return table_names
    def _read_data_cell(self, t, f, x):
        # Undo the stringified writes: "inf"/"-inf" -> float (unless the
        # infinity_io_flag applies), "true"/"false" -> bool, then the general
        # infinity-flag read conversion.
        if stringish(x) and x.lower() in ("inf", "-inf") and \
           self.tic_dat_factory.infinity_io_flag == "N/A" and \
           not (t == "parameters" and self.tic_dat_factory.parameters):
            return float(x)
        if stringish(x) and x.lower() == "true":
            return True
        if stringish(x) and x.lower() == "false":
            return False
        return self.tic_dat_factory._general_read_cell(t, f, x)
    def _create_gen_obj(self, db_file_path, table, table_name):
        # Build a callable returning a generator that streams the rows of a
        # generator table straight from the db.
        tdf = self.tic_dat_factory
        def tableObj():
            assert (not tdf.primary_key_fields.get(table)) and (tdf.data_fields.get(table))
            with sql.connect(db_file_path) as con:
                for row in con.execute("Select %s from [%s]" % (
                        ", ".join(_brackets(tdf.data_fields[table])), table_name)):
                    yield [self._read_data_cell(table, f, x)
                           for f, x in zip(tdf.data_fields[table], row)]
        return tableObj
    def _create_tic_dat(self, db_file_path):
        tdf = self.tic_dat_factory
        table_names = self._check_tables_fields(db_file_path, tdf.all_tables)
        with sql.connect(db_file_path) as con:
            rtn = self._create_tic_dat_from_con(con, table_names)
        for table in tdf.generator_tables:
            if table in table_names:
                rtn[table] = self._create_gen_obj(db_file_path, table, table_names[table])
        return rtn
    def _create_tic_dat_from_con(self, con, table_names):
        # Read every non-generator table from an open connection into the
        # dict/list structures the TicDat constructor expects.
        missing_tables = sorted(
            set(self.tic_dat_factory.all_tables).difference(table_names))
        if missing_tables:
            print("The following table names could not be found in the SQLite database.\n%s\n"
                  % "\n".join(missing_tables))
        tdf = self.tic_dat_factory
        rtn = {}
        for table in set(tdf.all_tables).difference(
                tdf.generator_tables).difference(missing_tables):
            fields = tdf.primary_key_fields.get(table, ()) + tdf.data_fields.get(table, ())
            if not fields:
                # generic tables take their field names from the db itself
                assert table in tdf.generic_tables
                fields = tuple(x[1] for x in con.execute("PRAGMA table_info(%s)" % table))
            rtn[table] = {} if tdf.primary_key_fields.get(table, ()) else []
            for row in con.execute("Select %s from [%s]" % (
                    ", ".join(_brackets(fields)), table_names[table])):
                if table in tdf.generic_tables:
                    rtn[table].append({f: self._read_data_cell(table, f, d)
                                       for f, d in zip(fields, row)})
                else:
                    pkfs = tdf.primary_key_fields.get(table, ())
                    pk = tuple(self._read_data_cell(table, f, x)
                               for f, x in zip(pkfs, row[:len(pkfs)]))
                    data = [self._read_data_cell(table, f, x)
                            for f, x in zip(fields[len(pkfs):], row[len(pkfs):])]
                    if dictish(rtn[table]):
                        rtn[table][pk[0] if len(pk) == 1 else tuple(pk)] = data
                    else:
                        rtn[table].append(data)
        return rtn
    def _ordered_tables(self):
        # topological sort so that foreign tables are created before native ones
        rtn = []
        fks = self._fks()
        def processTable(t):
            if t not in rtn:
                for fk in fks.get(t, ()):
                    processTable(fk.foreign_table)
                rtn.append(t)
        list(map(processTable, self.tic_dat_factory.all_tables))
        return tuple(rtn)
    def _get_schema_sql(self, tables):
        # Emit CREATE TABLE statements (with defaults, FKs and PKs) for `tables`.
        assert not set(self.tic_dat_factory.generic_tables).intersection(tables)
        rtn = []
        fks = self._fks()
        default_ = lambda t, f: self.tic_dat_factory.default_values[t][f]
        def data_type(t, f):
            if t == "parameters" and self.tic_dat_factory.parameters:
                return "" # the TEXT data type doesn't seem to have much value for my purposes.
            def_ = default_(t, f)
            if numericish(def_):
                if safe_apply(int)(def_) == def_:
                    return "INT"
                return "FLOAT"
            return "" # the TEXT data type doesn't seem to have much value for my purposes.
        for t in [_ for _ in self._ordered_tables() if _ in tables]:
            create_stmt = "Create TABLE [%s] (\n" % t
            strl = _brackets(self.tic_dat_factory.primary_key_fields.get(t, ())) + \
                   ["[%s] " % f + data_type(t, f) + " default [%s]" % default_(t, f)
                    for f in self.tic_dat_factory.data_fields.get(t, ())]
            for fk in fks.get(t, ()):
                nativefields, foreignfields = zip(*(fk.nativetoforeignmapping().items()))
                strl.append("FOREIGN KEY(%s) REFERENCES [%s](%s)" % (
                    ",".join(_brackets(nativefields)), fk.foreign_table,
                    ",".join(_brackets(foreignfields))))
            if self.tic_dat_factory.primary_key_fields.get(t):
                strl.append("PRIMARY KEY(%s)" % (",".join(
                    _brackets(self.tic_dat_factory.primary_key_fields[t]))))
            create_stmt += ",\n".join(strl) + "\n);"
            rtn.append(create_stmt)
        return tuple(rtn)
    def _write_data_cell(self, t, f, x):
        # booleans round-trip as the strings "True"/"False"
        if x is True or x is False:
            return str(x)
        return self.tic_dat_factory._infinity_flag_write_cell(t, f, x)
    def _get_data(self, tic_dat, as_sql):
        # Build the INSERT statements for every table; either fully rendered SQL
        # strings (as_sql) or (statement, parameters) pairs for executemany-style use.
        rtn = []
        for t in self.tic_dat_factory.all_tables:
            _t = getattr(tic_dat, t)
            if dictish(_t):
                primarykeys = tuple(self.tic_dat_factory.primary_key_fields[t])
                for pkrow, sqldatarow in _t.items():
                    _items = list(sqldatarow.items())
                    fields = primarykeys + tuple(x[0] for x in _items)
                    datarow = ((pkrow,) if len(primarykeys) == 1 else pkrow) + \
                              tuple(x[1] for x in _items)
                    assert len(datarow) == len(fields)
                    datarow = tuple(self._write_data_cell(t, f, x)
                                    for f, x in zip(fields, datarow))
                    stmt = "INSERT INTO [%s] (%s) VALUES (%s)" % (
                        t, ",".join(_brackets(fields)),
                        ",".join("%s" if as_sql else "?" for _ in fields))
                    if as_sql:
                        rtn.append((stmt % tuple(map(_insert_format, datarow))) + ";")
                    else:
                        rtn.append((stmt, datarow))
            else:
                for sqldatarow in (_t if containerish(_t) else _t()):
                    k, v = zip(*sqldatarow.items())
                    stmt = "INSERT INTO [%s] (%s) VALUES (%s)" % \
                           (t, ",".join(_brackets(k)),
                            ",".join(["%s" if as_sql else "?"] * len(sqldatarow)))
                    if as_sql:
                        rtn.append((stmt % tuple(map(_insert_format, v))) + ";")
                    else:
                        rtn.append((stmt, v))
        return tuple(rtn)
    def write_db_schema(self, db_file_path):
        """
        :param db_file_path: the file path of the SQLite database to create

        :return:
        """
        verify(not self.tic_dat_factory.generic_tables,
               "generic_tables are not compatible with write_db_schema. " +
               "Use write_db_data instead.")
        with _sql_con(db_file_path, foreign_keys=False) as con:
            for stmt in self._get_schema_sql(self.tic_dat_factory.all_tables):
                con.execute(stmt)
    def write_db_data(self, tic_dat, db_file_path, allow_overwrite=False):
        """
        write the ticDat data to an SQLite database file

        :param tic_dat: the data object to write

        :param db_file_path: the file path of the SQLite database to populate

        :param allow_overwrite: boolean - are we allowed to overwrite pre-existing data

        :return:

        caveats : True, False are written as "True", "False".
                  Also see infinity_io_flag __doc__
        """
        verify(sql, "sqlite3 needs to be installed to use this subroutine")
        msg = []
        if not self.tic_dat_factory.good_tic_dat_object(tic_dat, lambda m: msg.append(m)):
            raise TicDatError("Not a valid TicDat object for this schema : " + " : ".join(msg))
        verify(not os.path.isdir(db_file_path), "A directory is not a valid SQLite file path")
        if self.tic_dat_factory.generic_tables:
            # generic tables are handled by round-tripping through a generic-free schema
            dat, tdf = create_generic_free(tic_dat, self.tic_dat_factory)
            return tdf.sql.write_db_data(dat, db_file_path, allow_overwrite)
        if not os.path.exists(db_file_path):
            self.write_db_schema(db_file_path)
        table_names = self._check_tables_fields(db_file_path,
                                                self.tic_dat_factory.all_tables)
        with _sql_con(db_file_path, foreign_keys=False) as con:
            for t in self.tic_dat_factory.all_tables:
                verify(table_names.get(t) == t,
                       "Failed to find table %s in path %s" % (t, db_file_path))
                verify(allow_overwrite or
                       not any(True for _ in con.execute("Select * from [%s]" % t)),
                       "allow_overwrite is False, but there are already data records in %s" % t)
                if allow_overwrite:
                    con.execute("Delete from [%s]" % t)
            for sql_str, data in self._get_data(tic_dat, as_sql=False):
                con.execute(sql_str, list(data))
    def write_sql_file(self, tic_dat, sql_file_path, include_schema=False,
                       allow_overwrite=False):
        """
        write the sql for the ticDat data to a text file

        :param tic_dat: the data object to write

        :param sql_file_path: the path of the text file to hold the sql statements
                              for the data

        :param include_schema: boolean - should we write the schema sql first?

        :param allow_overwrite: boolean - are we allowed to overwrite pre-existing file

        :return:

        caveats : float("inf"), float("-inf") are written as "inf", "-inf" (unless
                  infinity_io_flag is being applied).
                  True/False are written as "True", "False"
        """
        verify(sql, "sqlite3 needs to be installed to use this subroutine")
        verify(allow_overwrite or not os.path.exists(sql_file_path),
               "The %s path exists and overwrite is not allowed" % sql_file_path)
        must_schema = set(self.tic_dat_factory.all_tables if include_schema else [])
        if self.tic_dat_factory.generic_tables:
            gt = self.tic_dat_factory.generic_tables
            dat, tdf = create_generic_free(tic_dat, self.tic_dat_factory)
            return tdf.sql._write_sql_file(dat, sql_file_path, must_schema.union(gt))
        return self._write_sql_file(tic_dat, sql_file_path, must_schema)
    def _write_sql_file(self, tic_dat, sql_file_path, schema_tables):
        with open(sql_file_path, "w") as f:
            for stmt in self._get_schema_sql(schema_tables) + \
                        self._get_data(tic_dat, as_sql=True):
                f.write(stmt + "\n")
class CsvTicFactory(freezable_factory(object, "_isFrozen")):
    """
    Primary class for reading/writing csv files with ticDat objects.
    Your system will need the csv package if you want to use this class.
    """
    def __init__(self, tic_dat_factory):
        """
        Don't create this object explicitly. A CsvTicDatFactory will
        automatically be associated with the csv attribute of the parent
        TicDatFactory.

        :param tic_dat_factory:

        :return:
        """
        self.tic_dat_factory = tic_dat_factory
        self._isFrozen = True
    def create_tic_dat(self, dir_path, dialect='excel', headers_present=True,
                       freeze_it=False):
        """
        Create a TicDat object from the csv files in a directory

        :param dir_path: the directory containing the .csv files.

        :param dialect: the csv dialect. Consult csv documentation for details.

        :param headers_present: Boolean. Does the first row of data contain
                                the column headers?

        :param freeze_it: boolean. should the returned object be frozen?

        :return: a TicDat object populated by the matching files.

        caveats: Missing files resolve to an empty table, but missing fields
                 on matching files throw an Exception.
                 Data field values (but not primary key values) will be coerced
                 into floats if possible.
        """
        verify(csv, "csv needs to be installed to use this subroutine")
        tdf = self.tic_dat_factory
        verify(headers_present or not tdf.generic_tables,
               "headers need to be present to read generic tables")
        verify(DataFrame or not tdf.generic_tables,
               "Strange absence of pandas despite presence of generic tables")
        rtn = self.tic_dat_factory.TicDat(
            **self._create_tic_dat(dir_path, dialect, headers_present))
        if freeze_it:
            return self.tic_dat_factory.freeze_me(rtn)
        return rtn
    def _create_tic_dat(self, dir_path, dialect, headers_present):
        verify(dialect in csv.list_dialects(), "Invalid dialect %s"%dialect)
        verify(os.path.isdir(dir_path), "Invalid directory path %s"%dir_path)
        rtn = {t: self._create_table(dir_path, t, dialect, headers_present)
               for t in self.tic_dat_factory.all_tables}
        missing_tables = {t for t in self.tic_dat_factory.all_tables if not rtn[t]}
        if missing_tables:
            print("The following table names could not be found in the %s directory.\n%s\n"
                  % (dir_path, "\n".join(missing_tables)))
        # missing tables are simply omitted; TicDat() fills them in as empty
        return {k: v for k, v in rtn.items() if v}
    def find_duplicates(self, dir_path, dialect='excel', headers_present=True):
        """
        Find the row counts for duplicated rows.

        :param dir_path: the directory containing .csv files.

        :param dialect: the csv dialect. Consult csv documentation for details.

        :param headers_present: Boolean. Does the first row of data contain
                                the column headers?

        :return: A dictionary whose keys are the table names for the primary key tables.
                 Each value of the return dictionary is itself a dictionary.
                 The inner dictionary is keyed by the primary key values encountered
                 in the table, and the value is the count of records in the
                 Excel sheet with this primary key.
                 Row counts smaller than 2 are pruned off, as they aren't duplicates

        caveats: Missing files resolve to an empty table, but missing fields
                 (data or primary key) on matching files throw an Exception.
        """
        verify(csv, "csv needs to be installed to use this subroutine")
        verify(dialect in csv.list_dialects(), "Invalid dialect %s"%dialect)
        verify(os.path.isdir(dir_path), "Invalid directory path %s"%dir_path)
        tdf = self.tic_dat_factory
        rtn = {t: defaultdict(int) for t, _ in tdf.primary_key_fields.items()
               if _ and self._get_file_path(dir_path, t)}
        for t in rtn:
            with open(self._get_file_path(dir_path, t)) as csvfile:
                for r in self._get_data(csvfile, t, dialect, headers_present):
                    p_key = r[tdf.primary_key_fields[t][0]] \
                            if len(tdf.primary_key_fields[t]) == 1 else \
                            tuple(r[_] for _ in tdf.primary_key_fields[t])
                    rtn[t][p_key] += 1
        for t in list(rtn.keys()):
            # only keep genuine duplicates
            rtn[t] = {k: v for k, v in rtn[t].items() if v > 1}
            if not rtn[t]:
                del rtn[t]
        return rtn
    def _get_file_path(self, dir_path, table):
        # case/space-insensitive match of "<table>.csv" within dir_path
        rtn = [path for f in os.listdir(dir_path) for path in [os.path.join(dir_path, f)]
               if os.path.isfile(path) and
               f.lower().replace(" ", "_") == "%s.csv"%table.lower()]
        verify(len(rtn) <= 1, "duplicate .csv files found for %s"%table)
        if rtn:
            return rtn[0]
    def _get_data(self, csvfile, table, dialect, headers_present):
        # Yield one {field: value} dict per csv row, float-coercing where possible.
        # With headers present, file column names are matched case-insensitively.
        tdf = self.tic_dat_factory
        fieldnames = tdf.primary_key_fields.get(table, ()) + tdf.data_fields.get(table, ())
        assert fieldnames or table in self.tic_dat_factory.generic_tables
        for row in csv.DictReader(csvfile, dialect=dialect,
                **({"fieldnames": fieldnames} if not headers_present else {})):
            if not headers_present:
                verify(len(row) == len(fieldnames),
                       "Need %s columns for table %s"%(len(fieldnames), table))
                yield {f: _try_float(row[f]) for f in fieldnames}
            else:
                key_matching = defaultdict(list)
                for k, f in product(row.keys(), fieldnames or row.keys()):
                    if k.lower() == f.lower():
                        key_matching[f].append(k)
                # generic tables take their fields from the file itself
                fieldnames = fieldnames or row.keys()
                for f in fieldnames:
                    verify(f in key_matching,
                           "Unable to find field name %s for table %s"%(f, table))
                    verify(len(key_matching[f]) <= 1,
                           "Duplicate field names found for field %s table %s"%(f, table))
                yield {f: _try_float(row[key_matching[f][0]]) for f in fieldnames}
    def _create_table(self, dir_path, table, dialect, headers_present):
        # Read one csv file into the structure appropriate for the table:
        # generator-returning callable, pk-keyed dict, or list of rows.
        # Returns None when no matching file exists.
        file_path = self._get_file_path(dir_path, table)
        if not (file_path and os.path.isfile(file_path)):
            return
        tdf = self.tic_dat_factory
        if table in tdf.generator_tables:
            def rtn():
                with open(file_path) as csvfile:
                    for r in self._get_data(csvfile, table, dialect, headers_present):
                        yield tuple(r[_] for _ in tdf.data_fields[table])
        else:
            rtn = {} if tdf.primary_key_fields.get(table) else []
            with open(file_path) as csvfile:
                for r in self._get_data(csvfile, table, dialect, headers_present):
                    if tdf.primary_key_fields.get(table):
                        p_key = r[tdf.primary_key_fields[table][0]] \
                                if len(tdf.primary_key_fields[table]) == 1 else \
                                tuple(r[_] for _ in tdf.primary_key_fields[table])
                        rtn[p_key] = tuple(r[_] for _ in tdf.data_fields.get(table, ()))
                    elif table in tdf.generic_tables:
                        rtn.append(r)
                    else:
                        rtn.append(tuple(r[_] for _ in tdf.data_fields[table]))
        return rtn
    def write_directory(self, tic_dat, dir_path, allow_overwrite=False, dialect='excel',
                        write_header=True):
        """
        write the ticDat data to a collection of csv files

        :param tic_dat: the data object

        :param dir_path: the directory in which to write the csv files

        :param allow_overwrite: boolean - are we allowed to overwrite existing files?

        :param dialect: the csv dialect. Consult csv documentation for details.

        :param write_header: Boolean. Should the header information be written
                             as the first row?

        :return:
        """
        verify(csv, "csv needs to be installed to use this subroutine")
        verify(dialect in csv.list_dialects(), "Invalid dialect %s"%dialect)
        verify(not os.path.isfile(dir_path), "A file is not a valid directory path")
        if self.tic_dat_factory.generic_tables:
            # generic tables are handled by round-tripping through a generic-free schema
            dat, tdf = create_generic_free(tic_dat, self.tic_dat_factory)
            return tdf.csv.write_directory(dat, dir_path, allow_overwrite, dialect,
                                           write_header)
        tdf = self.tic_dat_factory
        msg = []
        if not self.tic_dat_factory.good_tic_dat_object(tic_dat, lambda m: msg.append(m)):
            raise TicDatError("Not a valid TicDat object for this schema : " + " : ".join(msg))
        if not allow_overwrite:
            for t in tdf.all_tables:
                f = os.path.join(dir_path, t + ".csv")
                verify(not os.path.exists(f),
                       "The %s path exists and overwrite is not allowed"%f)
        if not os.path.isdir(dir_path):
            os.mkdir(dir_path)
        for t in tdf.all_tables:
            f = os.path.join(dir_path, t + ".csv")
            # newline='' per the csv module docs - otherwise the csv writer's own
            # "\r\n" line endings are doubled on Windows
            with open(f, 'w', newline='') as csvfile:
                writer = csv.DictWriter(csvfile, dialect=dialect, fieldnames=
                            tdf.primary_key_fields.get(t, ()) + tdf.data_fields.get(t, ()))
                if write_header:
                    writer.writeheader()
                _t = getattr(tic_dat, t)
                if dictish(_t):
                    for p_key, data_row in _t.items():
                        primaryKeyDict = {f: v for f, v in zip(tdf.primary_key_fields[t],
                                          p_key if containerish(p_key) else (p_key,))}
                        writer.writerow(dict(data_row, **primaryKeyDict))
                else:
                    for data_row in (_t if containerish(_t) else _t()):
                        writer.writerow(dict(data_row))
class XlsTicFactory(freezable_factory(object, "_isFrozen")):
    """
    Primary class for reading/writing Excel files with ticDat objects.
    Your system will need the xlrd package to read .xls and .xlsx files,
    the xlwt package to write .xls files, and the xlsxwriter package
    to write .xlsx files.
    """
    def __init__(self, tic_dat_factory):
        """
        Don't create this object explicitly. A XlsTicDatFactory will
        automatically be associated with the xls attribute of the parent
        TicDatFactory.

        :param tic_dat_factory:

        :return:
        """
        self.tic_dat_factory = tic_dat_factory
        self._isFrozen = True
    def create_tic_dat(self, xls_file_path, row_offsets={}, headers_present=True,
                       treat_large_as_inf=False, freeze_it=False):
        """
        Create a TicDat object from an Excel file

        :param xls_file_path: An Excel file containing sheets whose names match
                              the table names in the schema.

        :param row_offsets: (optional) A mapping from table names to initial
                            number of rows to skip

        :param headers_present: Boolean. Does the first row of data contain the
                                column headers?

        :param treat_large_as_inf: Boolean. Treat numbers >= 1e100 as infinity
                                   Generally only needed for .xlsx files that were
                                   themselves created by ticdat (see write_file docs)

        :param freeze_it: boolean. should the returned object be frozen?

        :return: a TicDat object populated by the matching sheets.

        caveats: Missing sheets resolve to an empty table, but missing fields
                 on matching sheets throw an Exception.
                 Sheet names are considered case insensitive, and white space is
                 replaced with underscore for table name matching.
                 Field names are considered case insensitive, but white space is
                 respected. (ticdat supports whitespace in field names but not
                 table names).
                 The following two caveats apply only if data_types are used.
                 --> Any field for which an empty string is invalid data and None
                     is valid data will replace the empty string with None.
                 --> Any field for which must_be_int is true will replace numeric
                     data that satisfies int(x)==x with int(x). In other words,
                     the ticdat equivalent of pandas.read_excel convert_float is
                     to set must_be_int to true in data_types.
        """
        self._verify_differentiable_sheet_names()
        verify(xlrd, "xlrd needs to be installed to use this subroutine")
        tdf = self.tic_dat_factory
        verify(not (treat_large_as_inf and tdf.generator_tables),
               "treat_large_as_inf not implemented for generator tables")
        verify(headers_present or not tdf.generic_tables,
               "headers need to be present to read generic tables")
        verify(utils.DataFrame or not tdf.generic_tables,
               "Strange absence of pandas despite presence of generic tables")
        rtn = tdf.TicDat(**self._create_tic_dat_dict(
            xls_file_path, row_offsets, headers_present))
        # fields where '' is invalid but None is valid get '' replaced with None
        replaceable = defaultdict(dict)
        for t, dfs in tdf.data_types.items():
            replaceable[t] = {df for df, dt in dfs.items()
                              if (not dt.valid_data('')) and dt.valid_data(None)}
        for t in set(tdf.all_tables).difference(tdf.generator_tables,
                                                tdf.generic_tables):
            _t = getattr(rtn, t)
            for r in _t.values() if utils.dictish(_t) else _t:
                for f, v in r.items():
                    if f in replaceable[t] and v == '':
                        r[f] = None
                    elif treat_large_as_inf:
                        # undo the +/- 1e100 stand-ins written by _xlsx_write
                        if v >= _xlsx_hack_inf:
                            r[f] = float("inf")
                        if v <= -_xlsx_hack_inf:
                            r[f] = -float("inf")
        if freeze_it:
            return self.tic_dat_factory.freeze_me(rtn)
        return rtn
    def _verify_differentiable_sheet_names(self):
        # sheet names are truncated to _longest_sheet characters; table names
        # must remain distinct after truncation
        rtn = defaultdict(set)
        for t in self.tic_dat_factory.all_tables:
            rtn[t[:_longest_sheet]].add(t)
        rtn = [v for k, v in rtn.items() if len(v) > 1]
        verify(not rtn,
               "The following tables collide when names are truncated to %s characters.\n%s"
               % (_longest_sheet, sorted(map(sorted, rtn))))
    def _get_sheets_and_fields(self, xls_file_path, all_tables, row_offsets,
                               headers_present):
        # Open the workbook and resolve, for each table, the matching sheet and
        # the column index of each field; complains about missing/duplicated names.
        verify(utils.stringish(xls_file_path) and os.path.exists(xls_file_path),
               "xls_file_path argument %s is not a valid file path." % xls_file_path)
        try:
            book = xlrd.open_workbook(xls_file_path)
        except Exception as e:
            # NOTE: was "e.message", which doesn't exist in Python 3 and masked
            # the real failure with an AttributeError
            raise TicDatError("Unable to open %s as xls file : %s" % (xls_file_path, e))
        sheets = defaultdict(list)
        for table, sheet in product(all_tables, book.sheets()):
            if table.lower()[:_longest_sheet] == \
               sheet.name.lower().replace(' ', '_')[:_longest_sheet]:
                sheets[table].append(sheet)
        duplicated_sheets = tuple(_t for _t, _s in sheets.items() if len(_s) > 1)
        verify(not duplicated_sheets,
               "The following sheet names were duplicated : " + ",".join(duplicated_sheets))
        sheets = FrozenDict({k: v[0] for k, v in sheets.items()})
        field_indicies, missing_fields, dup_fields = {}, {}, {}
        for table, sheet in sheets.items():
            field_indicies[table], missing_fields[table], dup_fields[table] = \
                self._get_field_indicies(table, sheet, row_offsets[table], headers_present)
        verify(not any(_ for _ in missing_fields.values()),
               "The following field names could not be found : \n" +
               "\n".join("%s : " % t + ",".join(bf)
                         for t, bf in missing_fields.items() if bf))
        verify(not any(_ for _ in dup_fields.values()),
               "The following field names were duplicated : \n" +
               "\n".join("%s : " % t + ",".join(bf)
                         for t, bf in dup_fields.items() if bf))
        return sheets, field_indicies
    def _create_generator_obj(self, xlsFilePath, table, row_offset, headers_present):
        # Build a callable returning a generator that re-reads the sheet on demand.
        tdf = self.tic_dat_factory
        ho = 1 if headers_present else 0
        def tableObj():
            sheets, field_indicies = self._get_sheets_and_fields(
                xlsFilePath, (table,), {table: row_offset}, headers_present)
            if table in sheets:
                sheet = sheets[table]
                table_len = min(len(sheet.col_values(field_indicies[table][field]))
                                for field in tdf.data_fields[table])
                for x in (sheet.row_values(i)
                          for i in range(table_len)[row_offset + ho:]):
                    yield self._sub_tuple(table, tdf.data_fields[table],
                                          field_indicies[table])(x)
        return tableObj
    def _create_tic_dat_dict(self, xls_file_path, row_offsets, headers_present):
        verify(utils.dictish(row_offsets) and
               set(row_offsets).issubset(self.tic_dat_factory.all_tables) and
               all(utils.numericish(x) and (x >= 0) for x in row_offsets.values()),
               "row_offsets needs to map from table names to non negative row offset")
        row_offsets = dict({t: 0 for t in self.tic_dat_factory.all_tables},
                           **row_offsets)
        tdf = self.tic_dat_factory
        rtn = {}
        sheets, field_indicies = self._get_sheets_and_fields(
            xls_file_path, set(tdf.all_tables).difference(tdf.generator_tables),
            row_offsets, headers_present)
        ho = 1 if headers_present else 0
        for tbl, sheet in sheets.items():
            fields = tdf.primary_key_fields.get(tbl, ()) + tdf.data_fields.get(tbl, ())
            assert fields or tbl in self.tic_dat_factory.generic_tables
            indicies = field_indicies[tbl]
            # shortest populated column bounds the usable row range
            table_len = min(len(sheet.col_values(indicies[field]))
                            for field in (fields or indicies))
            if tdf.primary_key_fields.get(tbl, ()):
                tableObj = {self._sub_tuple(tbl, tdf.primary_key_fields[tbl], indicies)(x):
                            self._sub_tuple(tbl, tdf.data_fields.get(tbl, ()), indicies)(x)
                            for x in (sheet.row_values(i)
                                      for i in range(table_len)[row_offsets[tbl] + ho:])}
            elif tbl in tdf.generic_tables:
                tableObj = [{f: x[i] for f, i in field_indicies[tbl].items()}
                            for x in (sheet.row_values(i)
                                      for i in range(table_len)[row_offsets[tbl] + ho:])]
            else:
                tableObj = [self._sub_tuple(tbl, tdf.data_fields.get(tbl, ()), indicies)(x)
                            for x in (sheet.row_values(i)
                                      for i in range(table_len)[row_offsets[tbl] + ho:])]
            rtn[tbl] = tableObj
        for tbl in tdf.generator_tables:
            rtn[tbl] = self._create_generator_obj(xls_file_path, tbl, row_offsets[tbl],
                                                  headers_present)
        return rtn
    def find_duplicates(self, xls_file_path, row_offsets={}, headers_present=True):
        """
        Find the row counts for duplicated rows.

        :param xls_file_path: An Excel file containing sheets whose names match
                              the table names in the schema (non primary key tables
                              ignored).

        :param row_offsets: (optional) A mapping from table names to initial
                            number of rows to skip (non primary key tables ignored)

        :param headers_present: Boolean. Does the first row of data contain the
                                column headers?

        caveats: Missing sheets resolve to an empty table, but missing primary
                 fields on matching sheets throw an Exception.
                 Sheet names are considered case insensitive.

        :return: A dictionary whose keys are the table names for the primary key
                 tables. Each value of the return dictionary is itself a dictionary.
                 The inner dictionary is keyed by the primary key values encountered
                 in the table, and the value is the count of records in the
                 Excel sheet with this primary key.
                 Row counts smaller than 2 are pruned off, as they aren't duplicates
        """
        self._verify_differentiable_sheet_names()
        verify(xlrd, "xlrd needs to be installed to use this subroutine")
        verify(utils.dictish(row_offsets) and
               set(row_offsets).issubset(self.tic_dat_factory.all_tables) and
               all(utils.numericish(x) and (x >= 0) for x in row_offsets.values()),
               "row_offsets needs to map from table names to non negative row offset")
        row_offsets = dict({t: 0 for t in self.tic_dat_factory.all_tables},
                           **row_offsets)
        tdf = self.tic_dat_factory
        pk_tables = tuple(t for t, _ in tdf.primary_key_fields.items() if _)
        rtn = {t: defaultdict(int) for t in pk_tables}
        sheets, fieldIndicies = self._get_sheets_and_fields(
            xls_file_path, pk_tables, row_offsets, headers_present)
        ho = 1 if headers_present else 0
        for table, sheet in sheets.items():
            fields = tdf.primary_key_fields[table] + tdf.data_fields.get(table, ())
            indicies = fieldIndicies[table]
            table_len = min(len(sheet.col_values(indicies[field])) for field in fields)
            for x in (sheet.row_values(i)
                      for i in range(table_len)[row_offsets[table] + ho:]):
                rtn[table][self._sub_tuple(table, tdf.primary_key_fields[table],
                                           indicies)(x)] += 1
        for t in list(rtn.keys()):
            # only keep genuine duplicates
            rtn[t] = {k: v for k, v in rtn[t].items() if v > 1}
            if not rtn[t]:
                del rtn[t]
        return rtn
    def _sub_tuple(self, table, fields, field_indicies):
        # Returns a function mapping a raw sheet row to either a bare value
        # (single field) or a tuple, applying the must_be_int coercion.
        assert set(fields).issubset(field_indicies)
        data_types = self.tic_dat_factory.data_types
        def _convert_float(x, field):
            rtn = x[field_indicies[field]]
            if utils.numericish(rtn) and utils.safe_apply(int)(rtn) == rtn and \
               table in data_types and field in data_types[table] and \
               data_types[table][field].must_be_int:
                return int(rtn)
            return rtn
        def rtn(x):
            if len(fields) == 1:
                return _convert_float(x, fields[0])
            return tuple(_convert_float(x, field) for field in fields)
        return rtn
    def _get_field_indicies(self, table, sheet, row_offset, headers_present):
        # Returns (found {field: col index}, missing fields, duplicated fields).
        fields = self.tic_dat_factory.primary_key_fields.get(table, ()) + \
                 self.tic_dat_factory.data_fields.get(table, ())
        if not headers_present:
            # positional matching: schema order determines the column of each field
            row_len = len(sheet.row_values(row_offset)) if sheet.nrows > 0 else len(fields)
            return ({f: i for i, f in enumerate(fields) if i < row_len},
                    [f for i, f in enumerate(fields) if i >= row_len], [])
        if sheet.nrows - row_offset <= 0:
            return {}, fields, []
        if table in self.tic_dat_factory.generic_tables:
            temp_rtn = defaultdict(list)
            for ind, val in enumerate(sheet.row_values(row_offset)):
                temp_rtn[val].append(ind)
        else:
            temp_rtn = {field: list() for field in fields}
            for field, (ind, val) in product(fields,
                                             enumerate(sheet.row_values(row_offset))):
                if field == val or (all(map(utils.stringish, (field, val))) and
                                    field.lower() == val.lower()):
                    temp_rtn[field].append(ind)
        return ({field: inds[0] for field, inds in temp_rtn.items() if len(inds) == 1},
                [field for field, inds in temp_rtn.items() if len(inds) == 0],
                [field for field, inds in temp_rtn.items() if len(inds) > 1])
    def write_file(self, tic_dat, file_path, allow_overwrite=False):
        """
        write the ticDat data to an excel file

        :param tic_dat: the data object to write (typically a TicDat)

        :param file_path: The file path of the excel file to create
                          Needs to end in either ".xls" or ".xlsx"
                          The latter is capable of writing out larger tables,
                          but the former handles infinity seamlessly.
                          If ".xlsx", then be advised that +/- float("inf") will
                          be replaced with +/- 1e+100

        :param allow_overwrite: boolean - are we allowed to overwrite an existing file?

        :return:

        caveats: None may be written out as an empty string. This reflects the
                 behavior of xlwt.
        """
        self._verify_differentiable_sheet_names()
        verify(utils.stringish(file_path) and
               (file_path.endswith(".xls") or file_path.endswith(".xlsx")),
               "file_path argument needs to end in .xls or .xlsx")
        msg = []
        if not self.tic_dat_factory.good_tic_dat_object(tic_dat, lambda m: msg.append(m)):
            raise TicDatError("Not a valid ticDat object for this schema : " + " : ".join(msg))
        verify(not os.path.isdir(file_path), "A directory is not a valid xls file path")
        verify(allow_overwrite or not os.path.exists(file_path),
               "The %s path exists and overwrite is not allowed" % file_path)
        if self.tic_dat_factory.generic_tables:
            # generic tables are handled by round-tripping through a generic-free schema
            dat, tdf = utils.create_generic_free(tic_dat, self.tic_dat_factory)
            return tdf.xls.write_file(dat, file_path, allow_overwrite)
        if file_path.endswith(".xls"):
            self._xls_write(tic_dat, file_path)
        else:
            self._xlsx_write(tic_dat, file_path)
    def _xls_write(self, tic_dat, file_path):
        verify(xlwt, "Can't write .xls files because xlwt package isn't installed.")
        tdf = self.tic_dat_factory
        book = xlwt.Workbook()
        # secondary sort by pk-count keeps the sheet ordering deterministic
        for t in sorted(sorted(tdf.all_tables),
                        key=lambda x: len(tdf.primary_key_fields.get(x, ()))):
            sheet = book.add_sheet(t[:_longest_sheet])
            for i, f in enumerate(tdf.primary_key_fields.get(t, ()) +
                                  tdf.data_fields.get(t, ())):
                sheet.write(0, i, f)
            _t = getattr(tic_dat, t)
            if utils.dictish(_t):
                for row_ind, (p_key, data) in enumerate(_t.items()):
                    for field_ind, cell in enumerate(
                            (p_key if containerish(p_key) else (p_key,)) +
                            tuple(data[_f] for _f in tdf.data_fields.get(t, ()))):
                        sheet.write(row_ind + 1, field_ind, cell)
            else:
                for row_ind, data in enumerate(_t if containerish(_t) else _t()):
                    for field_ind, cell in enumerate(
                            tuple(data[_f] for _f in tdf.data_fields[t])):
                        sheet.write(row_ind + 1, field_ind, cell)
        if os.path.exists(file_path):
            os.remove(file_path)
        book.save(file_path)
    def _xlsx_write(self, tic_dat, file_path):
        verify(xlsx, "Can't write .xlsx files because xlsxwriter package isn't installed.")
        tdf = self.tic_dat_factory
        if os.path.exists(file_path):
            os.remove(file_path)
        book = xlsx.Workbook(file_path)
        def clean_inf(x):
            # xlsx can't represent infinity; use the +/- 1e100 stand-ins that
            # create_tic_dat(treat_large_as_inf=True) knows how to undo
            if x == float("inf"):
                return _xlsx_hack_inf
            if x == -float("inf"):
                return -_xlsx_hack_inf
            return x
        for t in sorted(sorted(tdf.all_tables),
                        key=lambda x: len(tdf.primary_key_fields.get(x, ()))):
            sheet = book.add_worksheet(t)
            for i, f in enumerate(tdf.primary_key_fields.get(t, ()) +
                                  tdf.data_fields.get(t, ())):
                sheet.write(0, i, f)
            _t = getattr(tic_dat, t)
            if utils.dictish(_t):
                for row_ind, (p_key, data) in enumerate(_t.items()):
                    for field_ind, cell in enumerate(
                            (p_key if containerish(p_key) else (p_key,)) +
                            tuple(data[_f] for _f in tdf.data_fields.get(t, ()))):
                        sheet.write(row_ind + 1, field_ind, clean_inf(cell))
            else:
                for row_ind, data in enumerate(_t if containerish(_t) else _t()):
                    for field_ind, cell in enumerate(
                            tuple(data[_f] for _f in tdf.data_fields[t])):
                        sheet.write(row_ind + 1, field_ind, clean_inf(cell))
        book.close()
class OpalyticsPanFactory(freezable_factory(object, "_isFrozen")):
    """
    Primary class for reading PanDat objects from the Opalytics Cloud Platform.
    Not expected to be used outside of Opalytics Cloud hosted notebooks.
    Don't create this object explicitly. An OpalyticsPanFactory will
    automatically be associated with the opalytics attribute of the parent
    PanDatFactory.
    """
    def __init__(self, pan_dat_factory):
        """
        Don't create this object explicitly. A JsonPanFactory will
        automatically be associated with the json attribute of the parent
        PanDatFactory.

        :param pan_dat_factory:
        :return:
        """
        self.pan_dat_factory = pan_dat_factory
        self._isFrozen = True

    def _find_table_matchings(self, inputset):
        # Map each schema table name to the inputset.schema keys that match it
        # under case insensitivity (spaces in the inputset key become underscores).
        rtn = defaultdict(list)
        for t, x in product(self.pan_dat_factory.all_tables, inputset.schema):
            if stringish(x) and t.lower() == x.lower().replace(" ", "_"):
                rtn[t].append(x)
        return rtn

    def _good_inputset(self, inputset, message_writer=lambda x: x):
        # Sanity-check the duck-typed inputset; report problems via message_writer.
        # Bug fix: the original `not hasattr(...) and check(...)` could never flag a
        # present-but-malformed attribute, and raised AttributeError when absent.
        if not (hasattr(inputset, "schema") and dictish(inputset.schema)):
            message_writer("Failed to find dictish schema attribute")
            return False
        if not (hasattr(inputset, "getTable") and callable(inputset.getTable)):
            message_writer("Failed to find callable getTable attribute")
            return False
        table_matchings = self._find_table_matchings(inputset)
        badly_matched = {t for t, v in table_matchings.items() if len(v) != 1}
        if badly_matched:
            message_writer(
                "Following tables could not be uniquely resolved in inputset.schema\n%s"
                % badly_matched)
            return False
        return True

    def create_pan_dat(self, inputset, raw_data=False, freeze_it=False):
        """
        Create a PanDat object from an opalytics inputset

        :param inputset: An opalytics inputset consistent with this PanDatFactory
        :param raw_data: boolean. should data cleaning be skipped? On the Opalytics Cloud Platform
                         cleaned data will be passed to instant apps. Data cleaning involves
                         removing data type failures, data row predicate failures, foreign key
                         failures, duplicated rows and deactivated records.
        :return: a PanDat object populated by the tables as they are rendered by inputset
        """
        # NOTE(review): freeze_it is accepted but not acted upon here — confirm intent.
        message = []
        verify(self._good_inputset(inputset, message.append),
               "inputset is inconsistent with this PanDatFactory : %s"
               % (message or [None])[0])
        for t in self.pan_dat_factory.all_tables:
            all_fields = set(self.pan_dat_factory.primary_key_fields[t]).\
                         union(self.pan_dat_factory.data_fields[t])
            # bug fix: the message now actually interpolates the table name
            verify("_active" not in all_fields,
                   ("Table %s has a field named '_active'.\n" +
                    "This conflicts with internal data processing.\n" +
                    "Don't use '_active' in your PanDatFactory definition if you want to use this reader.")
                   % t)
        tms = {k: v[0] for k, v in self._find_table_matchings(inputset).items()}
        ia = {}
        # inspect.getargspec was removed in Python 3.11; prefer getfullargspec when present
        _argspec = getattr(inspect, "getfullargspec", None) or inspect.getargspec
        if "includeActive" in _argspec(inputset.getTable)[0]:
            ia = {"includeActive": not raw_data}
        rtn = self.pan_dat_factory.PanDat(
            **{t: inputset.getTable(tms[t], **ia) for t in tms})
        # strip the internal '_active' marker column, keeping only active rows
        for t in self.pan_dat_factory.all_tables:
            df = getattr(rtn, t)
            if "_active" in df.columns:
                df = df[df["_active"]].drop('_active', axis=1)
                setattr(rtn, t, df)
        if not raw_data:
            def removing():
                # One cleaning pass; returns truthy when any rows were removed so
                # the caller can iterate to a fixed point.
                removal_occured = set()
                for (t, _), brs in list(self.pan_dat_factory.find_data_type_failures(
                            rtn, as_table=False).items()) + \
                        list(self.pan_dat_factory.find_data_row_failures(
                            rtn, as_table=False).items()) + \
                        [((t, None), brs) for t, brs in self.pan_dat_factory.find_duplicates(
                            rtn, as_table=False).items()]:
                    if t not in removal_occured:
                        removal_occured.add(t)
                        setattr(rtn, t, getattr(rtn, t)[[not _ for _ in brs]])
                fkfs = self.pan_dat_factory.find_foreign_key_failures(rtn)
                if fkfs:
                    self.pan_dat_factory.remove_foreign_key_failures(rtn)
                return removal_occured or fkfs
            while removing():
                pass
        return rtn
class JsonTicFactory(freezable_factory(object, "_isFrozen")):
    """
    Primary class for reading/writing json files with ticDat objects.
    You need the json package to be installed to use it.
    """
    def __init__(self, tic_dat_factory):
        """
        Don't call this function explicitly. A JsonTicFactory will
        automatically be associated with the json attribute of the parent
        TicDatFactory.

        :param tic_dat_factory:
        :return:
        """
        self.tic_dat_factory = tic_dat_factory
        self._isFrozen = True

    def create_tic_dat(self, json_file_path, freeze_it=False):
        """
        Create a TicDat object from a json file

        :param json_file_path: A json file path. It should encode a dictionary
                               with table names as keys.
        :param freeze_it: boolean. should the returned object be frozen?
        :return: a TicDat object populated by the matching tables.
        caveats: Table names matches are case insensitive and also
                 underscore-space insensitive.
                 Tables that don't find a match are interpreted as an empty table.
                 Dictionary keys that don't match any table are ignored.
        """
        _standard_verify(self.tic_dat_factory)
        jdict = self._create_jdict(json_file_path)
        tic_dat_dict = self._create_tic_dat_dict(jdict)
        missing_tables = set(
            self.tic_dat_factory.all_tables).difference(tic_dat_dict)
        if missing_tables:
            # missing tables are tolerated (resolve to empty) but worth flagging
            print(
                "The following table names could not be found in the %s file.\n%s\n"
                % (json_file_path, "\n".join(missing_tables)))
        rtn = self.tic_dat_factory.TicDat(**tic_dat_dict)
        if freeze_it:
            return self.tic_dat_factory.freeze_me(rtn)
        return rtn

    def find_duplicates(self, json_file_path):
        """
        Find the row counts for duplicated rows.

        :param json_file_path: A json file path. It should encode a dictionary
                               with table names as keys.
        :return: A dictionary whose keys are table names for the primary-ed key tables.
                 Each value of the return dictionary is itself a dictionary.
                 The inner dictionary is keyed by the primary key values encountered in the table,
                 and the value is the count of records in the json entry with this primary key.
                 Row counts smaller than 2 are pruned off, as they aren't duplicates
        """
        _standard_verify(self.tic_dat_factory)
        jdict = self._create_jdict(json_file_path)
        rtn = find_duplicates_from_dict_ticdat(self.tic_dat_factory, jdict)
        return rtn or {}

    def _create_jdict(self, json_file_path):
        # Load and validate the raw json payload: must be a dict of string keys
        # mapping to containers.
        verify(os.path.isfile(json_file_path),
               "json_file_path is not a valid file path.")
        try:
            with open(json_file_path, "r") as fp:
                jdict = json.load(fp)
        except Exception as e:
            # bug fix: Python 3 exceptions have no .message attribute
            raise TicDatError("Unable to interpret %s as json file : %s"
                              % (json_file_path, e))
        verify(dictish(jdict),
               "%s failed to load a dictionary" % json_file_path)
        verify(
            all(map(stringish, jdict)),
            "The dictionary loaded from %s isn't indexed by strings"
            % json_file_path)
        verify(
            all(map(containerish, jdict.values())),
            "The dictionary loaded from %s doesn't have containers as values"
            % json_file_path)
        return jdict

    def _create_tic_dat_dict(self, jdict):
        # Match json keys to schema tables (case and underscore-space insensitive),
        # rejecting ambiguous matches.
        tdf = self.tic_dat_factory
        rtn = {}
        table_keys = defaultdict(list)
        for t in tdf.all_tables:
            for t2 in jdict:
                if stringish(t2) and t.lower() == t2.replace(" ", "_").lower():
                    table_keys[t].append(t2)
            if len(table_keys[t]) >= 1:
                verify(len(table_keys[t]) < 2,
                       "Found duplicate matching keys for table %s" % t)
                rtn[t] = jdict[table_keys[t][0]]
        return rtn

    def write_file(self, tic_dat, json_file_path, allow_overwrite=False,
                   verbose=False):
        """
        write the ticDat data to a json file

        :param tic_dat: the data object to write (typically a TicDat)
        :param json_file_path: The file path of the json file to create.
        :param allow_overwrite: boolean - are we allowed to overwrite an existing file?
        :param verbose: boolean. Verbose mode writes the data rows as dicts
                        keyed by field name. Otherwise, they are lists.
        :return:
        """
        _standard_verify(self.tic_dat_factory)
        verify(not (os.path.exists(json_file_path) and not allow_overwrite),
               "%s exists and allow_overwrite is not enabled" % json_file_path)
        msg = []
        if not self.tic_dat_factory.good_tic_dat_object(
                tic_dat, lambda m: msg.append(m)):
            raise TicDatError("Not a valid TicDat object for this schema : " +
                              " : ".join(msg))
        jdict = make_json_dict(self.tic_dat_factory, tic_dat, verbose)
        with open(json_file_path, "w") as fp:
            json.dump(jdict, fp, sort_keys=True, indent=2)
class MdbTicFactory(freezable_factory(object, "_isFrozen")):
    """
    Primary class for reading/writing Access/MDB files with ticDat objects.
    Your system will need the required pypyodbc package if you want to actually
    do something with it.
    """
    def __init__(self, tic_dat_factory):
        """
        Don't create this object explicitly. A MdbTicDatFactory will
        automatically be associated with the mdb attribute of the parent
        TicDatFactory.

        :param tic_dat_factory:
        :return:
        """
        self.tic_dat_factory = tic_dat_factory
        self._duplicate_focused_tdf = create_duplicate_focused_tdf(
            tic_dat_factory)
        self._isFrozen = True

    def create_tic_dat(self, mdb_file_path, freeze_it=False):
        """
        Create a TicDat object from an Access MDB file

        :param mdb_file_path: An Access db with a consistent schema.
        :param freeze_it: boolean. should the returned object be frozen?
        :return: a TicDat object populated by the matching tables.
        caveats : Numbers with absolute values larger than 1e+100 will
                  be read as float("inf") or float("-inf")
        """
        _standard_verify(self.tic_dat_factory.generic_tables)
        rtn = self.tic_dat_factory.TicDat(
            **self._create_tic_dat(mdb_file_path))
        if freeze_it:
            return self.tic_dat_factory.freeze_me(rtn)
        return rtn

    def find_duplicates(self, mdb_file_path):
        """
        Find the row counts for duplicated rows.

        :param mdb_file_path: An Access db with a consistent schema.
        :return: A dictionary whose keys are table names for the primary-ed key tables.
                 Each value of the return dictionary is itself a dictionary.
                 The inner dictionary is keyed by the primary key values encountered in the table,
                 and the value is the count of records in the mdb table with this primary key.
                 Row counts smaller than 2 are pruned off, as they aren't duplicates
        """
        _standard_verify(self.tic_dat_factory.generic_tables)
        if not self._duplicate_focused_tdf:
            return {}
        return find_duplicates(
            self._duplicate_focused_tdf.mdb.create_tic_dat(mdb_file_path),
            self._duplicate_focused_tdf)

    def _get_table_names(self, db_file_path, tables):
        # Resolve each schema table to the actual Access table name, allowing
        # underscore/space variations; fail on zero or multiple matches.
        rtn = {}
        with _connect(_connection_str(db_file_path)) as con:
            def try_name(name):
                with con.cursor() as cur:
                    try:
                        cur.execute("Select * from [%s]" % name)
                    except Exception:  # narrowed from bare except
                        return False
                return True
            for table in tables:
                rtn[table] = [
                    t for t in all_underscore_replacements(table)
                    if try_name(t)
                ]
                verify(
                    len(rtn[table]) >= 1,
                    "Unable to recognize table %s in MS Access file %s"
                    % (table, db_file_path))
                verify(
                    len(rtn[table]) <= 1,
                    "Duplicate tables found for table %s in MS Access file %s"
                    % (table, db_file_path))
                rtn[table] = rtn[table][0]
        return rtn

    def _check_tables_fields(self, mdb_file_path, tables):
        # Verify the file opens and every schema field exists (case-insensitively)
        # in the corresponding Access table; returns the resolved table names.
        tdf = self.tic_dat_factory
        verify(os.path.exists(mdb_file_path),
               "%s isn't a valid file path" % mdb_file_path)
        try:
            with _connect(_connection_str(mdb_file_path)) as _:
                pass
        except Exception as e:
            # bug fix: Python 3 exceptions have no .message attribute
            raise TicDatError("Unable to open %s as MS Access file : %s"
                              % (mdb_file_path, e))
        table_names = self._get_table_names(mdb_file_path, tables)
        with _connect(_connection_str(mdb_file_path)) as con:
            for table in tables:
                with con.cursor() as cur:
                    cur.execute("Select * from [%s]" % table_names[table])
                    fields = set(_[0].lower() for _ in cur.description)
                    for field in tdf.primary_key_fields.get(
                            table, ()) + tdf.data_fields.get(table, ()):
                        verify(
                            field.lower() in fields,
                            "Unable to recognize field %s in table %s for file %s"
                            % (field, table, mdb_file_path))
        return table_names

    def _create_gen_obj(self, mdbFilePath, table, table_name):
        # Build a generator factory for a (keyless) generator table; rows are
        # read lazily each time the returned callable is invoked.
        tdf = self.tic_dat_factory
        def tableObj():
            assert (not tdf.primary_key_fields.get(table)) and (
                tdf.data_fields.get(table))
            with _connect(_connection_str(mdbFilePath)) as con:
                with con.cursor() as cur:
                    cur.execute("Select %s from [%s]" % (", ".join(
                        _brackets(tdf.data_fields[table])), table_name))
                    for row in cur.fetchall():
                        yield list(map(_read_data, row))
        return tableObj

    def _create_tic_dat(self, mdbFilePath):
        # Read every table into the dict-of-tables form TicDat(**...) expects.
        tdf = self.tic_dat_factory
        table_names = self._check_tables_fields(mdbFilePath, tdf.all_tables)
        rtn = {}
        with _connect(_connection_str(mdbFilePath)) as con:
            for table in set(tdf.all_tables).difference(tdf.generator_tables):
                fields = tdf.primary_key_fields.get(table, ()) + \
                         tdf.data_fields.get(table, ())
                # primary-keyed tables become dicts; keyless tables become lists
                rtn[table] = {} if tdf.primary_key_fields.get(table, ()) else []
                with con.cursor() as cur:
                    cur.execute(
                        "Select %s from [%s]"
                        % (", ".join(_brackets(fields)), table_names[table]))
                    for row in cur.fetchall():
                        pk = row[:len(tdf.primary_key_fields.get(table, ()))]
                        data = list(map(
                            _read_data,
                            row[len(tdf.primary_key_fields.get(table, ())):]))
                        if dictish(rtn[table]):
                            rtn[table][pk[0] if len(pk) == 1 else
                                       tuple(pk)] = data
                        else:
                            rtn[table].append(data)
        for table in tdf.generator_tables:
            rtn[table] = self._create_gen_obj(mdbFilePath, table,
                                              table_names[table])
        return rtn

    @property
    def can_write_new_file(self):
        """
        :return: True if this environment can write to a new mdb database
                 files, False otherwise
        """
        return _write_new_file_works

    def write_schema(self, mdb_file_path, **field_types):
        """
        :param mdb_file_path: The file path of the mdb database to create

        :param field_types: Named arguments are table names. Argument values
                            are mapping of field name to field type.
                            Allowable field types are text, double and int
                            If missing, primary key fields are text, and data
                            fields are double
        :return:
        """
        verify(
            not self.tic_dat_factory.generic_tables,
            "generic_tables are not compatible with write_schema. " +
            "Use write_file instead.")
        # NOTE(review): passing generic_tables here mirrors the other mdb methods,
        # but JsonTicFactory passes the factory itself — confirm _standard_verify's contract.
        _standard_verify(self.tic_dat_factory.generic_tables)
        verify(dictish(field_types), "field_types should be a dict")
        for k, v in field_types.items():
            verify(k in self.tic_dat_factory.all_tables,
                   "%s isn't a table name" % k)
            verify(dictish(v),
                   "Need a mapping from field names to field types for %s" % k)
            for fld, type_ in v.items():
                verify(
                    fld in self.tic_dat_factory.primary_key_fields.get(k, ()) +
                    self.tic_dat_factory.data_fields.get(k, ()),
                    "%s isn't a field name for table %s" % (fld, k))
                verify(
                    type_ in ("text", "double", "int"),
                    "For table %s, field %s, %s isn't one of (text, double, int)"
                    % (k, fld, type_))
        get_fld_type = lambda tbl, fld, default: field_types.get(tbl, {}).get(
            fld, default)
        if not os.path.exists(mdb_file_path):
            verify(
                mdb_file_path.endswith(".mdb")
                or mdb_file_path.endswith(".accdb"),
                "For file creation, specify either an .mdb or .accdb file name"
            )
            if mdb_file_path.endswith(".mdb"):
                verify(
                    py,
                    "pypyodbc needs to be installed to create a new .mdb file")
                verify(self.can_write_new_file,
                       "Creating a new file not enabled for this OS")
                py.win_create_mdb(mdb_file_path)
            else:
                # .accdb files are created by copying a bundled blank database
                blank_accdb = os.path.join(_code_dir(), "blank.accdb")
                verify(
                    os.path.exists(blank_accdb)
                    and os.path.isfile(blank_accdb),
                    "You need to run accdb_create_setup.py as a post pip install operation "
                    + "to configure writing to new .accdb files.")
                shutil.copy(blank_accdb, mdb_file_path)
        with _connect(_connection_str(mdb_file_path)) as con:
            for t in self.tic_dat_factory.all_tables:
                # renamed from `str` to avoid shadowing the builtin
                create_sql = "Create TABLE [%s] (\n" % t
                strl = ["[%s] %s" % (f, get_fld_type(t, f, "text"))
                        for f in self.tic_dat_factory.primary_key_fields.get(t, ())] + \
                       ["[%s] %s" % (f, get_fld_type(t, f, "double"))
                        for f in self.tic_dat_factory.data_fields.get(t, ())]
                if self.tic_dat_factory.primary_key_fields.get(t):
                    strl.append("PRIMARY KEY(%s)" % ",".join(
                        _brackets(self.tic_dat_factory.primary_key_fields[t])))
                create_sql += ",\n".join(strl) + "\n);"
                con.cursor().execute(create_sql).commit()

    def write_file(self, tic_dat, mdb_file_path, allow_overwrite=False):
        """
        write the ticDat data to an Access (MDB/ACCDB) database file

        :param tic_dat: the data object to write

        :param mdb_file_path: the file path of the Access database to populate

        :param allow_overwrite: boolean - are we allowed to overwrite
                                pre-existing data

        :return:

        caveats : Numbers with absolute values larger than 1e+100 will
                  be written as 1e+100 or -1e+100

        NB - thrown Exceptions of the form "Data type mismatch in criteria
        expression" generally result either from Access's inability to store
        different data types in the same field, or from a mismatch between the
        data object and the default field types ticdat uses when creating an
        Access schema. For the latter, feel free to call the write_schema
        function on the data file first with explicitly identified field types.
        """
        _standard_verify(self.tic_dat_factory.generic_tables)
        msg = []
        if not self.tic_dat_factory.good_tic_dat_object(
                tic_dat, lambda m: msg.append(m)):
            raise TicDatError("Not a valid TicDat object for this schema : " +
                              " : ".join(msg))
        verify(not os.path.isdir(mdb_file_path),
               "A directory is not a valid Access file path")
        if not os.path.exists(mdb_file_path):
            self.write_schema(mdb_file_path)
        table_names = self._check_tables_fields(
            mdb_file_path, self.tic_dat_factory.all_tables)
        with _connect(_connection_str(mdb_file_path)) as con:
            for t in self.tic_dat_factory.all_tables:
                verify(
                    table_names[t] == t,
                    "Failed to find table %s in path %s" % (t, mdb_file_path))
                if not allow_overwrite:
                    with con.cursor() as cur:
                        cur.execute("Select * from %s" % t)
                        verify(
                            not any(True for _ in cur.fetchall()),
                            "allow_overwrite is False, but there are already data records in %s"
                            % t)
                if allow_overwrite:
                    con.cursor().execute("Delete from %s" % t).commit()
                _t = getattr(tic_dat, t)
                if dictish(_t):
                    primary_keys = tuple(
                        self.tic_dat_factory.primary_key_fields[t])
                    for pk_row, sql_data_row in _t.items():
                        _items = tuple(sql_data_row.items())
                        fields = _brackets(primary_keys +
                                           tuple(x[0] for x in _items))
                        data_row = ((pk_row,) if len(primary_keys) == 1 else pk_row) + \
                                   tuple(_write_data(x[1]) for x in _items)
                        assert len(data_row) == len(fields)
                        insert_sql = "INSERT INTO %s (%s) VALUES (%s)" % \
                                     (t, ",".join(fields),
                                      ",".join("?" for _ in fields))
                        con.cursor().execute(insert_sql, data_row).commit()
                else:
                    for sql_data_row in (_t if containerish(_t) else _t()):
                        insert_sql = "INSERT INTO %s (%s) VALUES (%s)" % (
                            t, ",".join(_brackets(sql_data_row.keys())),
                            ",".join(["?"] * len(sql_data_row)))
                        # bug fix: commit was missing on this branch, unlike the
                        # primary-keyed branch above
                        con.cursor().execute(
                            insert_sql,
                            tuple(map(_write_data,
                                      sql_data_row.values()))).commit()
class _PostgresFactory(freezable_factory(object, "_isFrozen")):
    """Shared base for the postgres readers/writers; wraps a ticdat factory."""
    def __init__(self, tdf):
        self.tdf = tdf
        self._isFrozen = True

    def _check_good_pgtd_compatible_table_field_names(self):
        # Postgres folds identifiers; insist on names that survive that folding.
        all_fields = lambda t: self.tdf.primary_key_fields.get(t, ()) + \
                               self.tdf.data_fields.get(t, ())
        for t in self.tdf.all_tables:
            # play nice with the table/field names or don't play at all
            verify(
                _pg_name(t) == t,
                f"Table {t} doesn't obey a postgres friendly naming convention."
                + f"It should have been named {_pg_name(t)}\n" +
                "This is a postgres specific requirement. See pgsql doc string for more info."
            )
            verify(
                len(all_fields(t)) == len(set(map(_pg_name, all_fields(t)))),
                f"Table {t} has field names that collide with each other under case/space insensitivity.\n"
                +
                "This is a postgres specific requirement. See pgsql doc string for more info."
            )
            # a little testing indicated that the problem is with reserved words
            # as fields, but not tables
            reserved_word_collision = {
                _ for _ in all_fields(t) if _.lower() in _the_reserved_words
            }
            verify(
                not reserved_word_collision,
                f"The following field names from table {t} collide with PostGres "
                + f"reserved words {reserved_word_collision}")

    def check_tables_fields(self, engine, schema, error_on_missing_table=False):
        '''
        throws a TicDatError if there there isn't a postgres schema in engine
        with the proper tables and fields.

        :param engine: has an .execute method

        :param schema: string that represents a postgres schema

        :param error_on_missing_table: boolean - should an error be thrown for
                                       missing tables? If falsey, then print a
                                       warning instead.

        :return: A list of missing tables. Will raise TicDatError if there are
                 missing tables and error_on_missing_table is truthy.
        '''
        tdf = self.tdf
        verify(
            schema in [
                row[0] for row in engine.execute(
                    "select schema_name from information_schema.schemata")
            ], f"Schema {schema} is missing from engine {engine}")
        pg_tables = [
            row[0] for row in engine.execute(
                f"select table_name from information_schema.tables where table_schema ='{schema}'"
            )
        ]
        missing_tables = []
        for table in tdf.all_tables:
            if table in pg_tables:
                pg_fields = [
                    row[0] for row in engine.execute(
                        f"""SELECT column_name FROM information_schema.columns
                            WHERE table_schema = '{schema}' AND table_name = '{table}'"""
                    )
                ]
                for field in tdf.primary_key_fields.get(table, ()) + \
                        tdf.data_fields.get(table, ()):
                    matches = [f for f in pg_fields if f == _pg_name(field)]
                    verify(
                        len(matches) == 1,
                        f"Unable to recognize {table}.{_pg_name(field)} in postgres schema {schema}"
                    )
            else:
                missing_tables.append(table)
        verify(
            not (missing_tables and error_on_missing_table),
            f"Unable to recognize tables {missing_tables} in postgres schema {schema}"
        )
        if missing_tables:
            print(
                "The following table names could not be found in the %s schema.\n%s\n"
                % (schema, "\n".join(missing_tables)))
        return missing_tables

    def _fks(self):
        # Group the factory's foreign keys by native table.
        rtn = defaultdict(set)
        for fk in self.tdf.foreign_keys:
            rtn[fk.native_table].add(fk)
        return FrozenDict({k: tuple(v) for k, v in rtn.items()})

    def _ordered_tables(self):
        # Topological order: every foreign table precedes its native tables, so
        # CREATE TABLE statements can be executed in sequence.
        rtn = []
        fks = self._fks()
        def processTable(t):
            if t not in rtn:
                for fk in fks.get(t, ()):
                    processTable(fk.foreign_table)
                rtn.append(t)
        list(map(processTable, self.tdf.all_tables))
        return tuple(rtn)

    def _get_schema_sql(self, tables, schema, forced_field_types):
        # Emit one CREATE TABLE statement per requested table, in FK-safe order.
        rtn = []
        fks = self._fks()

        def get_fld_type(t, f, default_type):
            # forced types win; the parameters table is always text; else infer
            # from the declared ticdat data type, falling back to default_type.
            if (t, f) in forced_field_types:
                return forced_field_types[t, f]
            if t == "parameters" and self.tdf.parameters:
                return "text"
            fld_type = self.tdf.data_types.get(t, {}).get(f)
            if not fld_type:
                return default_type
            if fld_type.datetime:
                return "timestamp"
            verify(
                not (fld_type.number_allowed and fld_type.strings_allowed),
                f"Select one of string or numeric for {t}.{f} if declaring type and using postgres"
            )
            if fld_type.strings_allowed:
                return 'text'
            if fld_type.number_allowed:
                if fld_type.must_be_int:
                    return 'integer'
                else:
                    return 'float'
            else:
                # bug fix: the exception was previously constructed but never
                # raised, silently returning None as the field type
                raise TicDatError(
                    f"Allow one of text or numeric for {t}.{f} if declaring type and using postgres"
                )

        def db_default(t, f):
            rtn = self.tdf.default_values[t][f]
            if forced_field_types.get((t, f)) in ("bool", "boolean"):
                return bool(rtn)
            if rtn is None:
                return "NULL"
            return rtn

        def nullable(t, f):
            fld_type = self.tdf.data_types.get(t, {}).get(f)
            if not fld_type:
                return True
            # with no infinity flag, numeric nulls stand in for +/-inf
            if fld_type.number_allowed and self.tdf.infinity_io_flag is None:
                return True
            return fld_type.nullable

        def default_sql_str(t, f):
            fld_type = self.tdf.data_types.get(t, {}).get(f)
            if fld_type and fld_type.datetime:
                return ""
            return f" DEFAULT {db_default(t, f)}"

        for t in [_ for _ in self._ordered_tables() if _ in tables]:
            # renamed from `str` to avoid shadowing the builtin
            create_sql = f"CREATE TABLE {schema}.{t} (\n"
            strl = [f"{_pg_name(f)} " + get_fld_type(t, f, 'text')
                    for f in self.tdf.primary_key_fields.get(t, ())] + \
                   [f"{_pg_name(f)} " + get_fld_type(t, f, 'float') +
                    (f"{' NOT NULL' if not nullable(t, f) else ''}") +
                    default_sql_str(t, f)
                    for f in self.tdf.data_fields.get(t, ())]
            if self.tdf.primary_key_fields.get(t):
                strl.append(
                    f"PRIMARY KEY ({','.join(map(_pg_name, self.tdf.primary_key_fields[t]))})"
                )
            for fk in fks.get(t, ()):
                nativefields, foreignfields = zip(
                    *(fk.nativetoforeignmapping().items()))
                strl.append(
                    f"FOREIGN KEY ({','.join(map(_pg_name, nativefields))}) REFERENCES "
                    +
                    f"{schema}.{fk.foreign_table} ({','.join(map(_pg_name, foreignfields))})"
                )
            create_sql += ",\n".join(strl) + "\n);"
            rtn.append(create_sql)
        return tuple(rtn)

    def write_schema(self, engine, schema, forced_field_types=None,
                     include_ancillary_info=True):
        """
        :param engine: typically a sqlalchemy database engine with drivertype
                       postgres (really just needs an .execute)

        :param schema: a string naming the postgres schema to populate (will
                       create if needed)

        :param forced_field_types: A dictionary mapping (table, field) to a
                                   field type. Absent forcing, types are
                                   inferred from tic_dat_factory.data_types if
                                   possible, and set via the assumption that PK
                                   fields are text and data fields are floats
                                   if not.

        :param include_ancillary_info: boolean. If False, no primary key or
                                       foreign key info will be written

        :return:
        """
        self._check_good_pgtd_compatible_table_field_names()
        forced_field_types = forced_field_types or {}
        all_fields = lambda t: self.tdf.primary_key_fields.get(t, ()) + \
                               self.tdf.data_fields.get(t, ())
        good_forced_field_type_entry = lambda k, v: isinstance(k, tuple) and len(k) == 2 \
            and k[1] in all_fields(k[0]) and v in \
            ["text", "integer", "float", "bool", "boolean", "timestamp"]
        verify(
            dictish(forced_field_types) and all(
                good_forced_field_type_entry(k, v)
                for k, v in forced_field_types.items()),
            "bad forced_field_types argument")
        if not include_ancillary_info:
            # collapse the schema to data-fields-only and force former PKs to text
            from ticdat import TicDatFactory
            tdf = TicDatFactory(**{
                t: [[], pks + dfs]
                for t, (pks, dfs) in self.tdf.schema().items()
            })
            for t, dts in self.tdf.data_types.items():
                for f, dt in dts.items():
                    tdf.set_data_type(t, f, *dt)
            forced_field_types_ = {
                (t, f): "text"
                for t, (pks, dfs) in self.tdf.schema().items() for f in pks
                if f not in tdf.data_types.get(t, {})
            }
            forced_field_types_.update(forced_field_types)
            return PostgresTicFactory(tdf).write_schema(
                engine, schema, forced_field_types_)
        verify(not getattr(self.tdf, "generic_tables", None),
               "TicDat for postgres does not yet support generic tables")
        if schema not in [
                row[0] for row in engine.execute(
                    "select schema_name from information_schema.schemata")
        ]:
            engine.execute(sa.schema.CreateSchema(schema))
        # renamed loop variable from `str` to avoid shadowing the builtin
        for create_sql in self._get_schema_sql(self.tdf.all_tables, schema,
                                               forced_field_types):
            engine.execute(create_sql)

    def _handle_prexisting_rows(self, engine, schema, pre_existing_rows):
        # Apply per-table 'delete'/'append' policy before writing new rows.
        verify(isinstance(pre_existing_rows, dict),
               "pre_existing_rows needs to dict")
        verify(
            set(pre_existing_rows).issubset(self.tdf.all_tables),
            "bad pre_existing_rows keys")
        verify(
            set(pre_existing_rows.values()).issubset({'delete', 'append'}),
            "bad pre_existing_rows values")
        pre_existing_rows = dict({t: "delete" for t in self.tdf.all_tables},
                                 **pre_existing_rows)
        # need to iterate from leaves (children) upwards to avoid breaking
        # foreign keys with delete
        for t in reversed(self._ordered_tables()):
            if pre_existing_rows[t] == "delete":
                try:
                    engine.execute(
                        f"truncate table {schema}.{t}"
                    )  # postgres truncate will fail on FKs re:less
                except Exception as e:
                    assert "foreign key" in str(
                        e), "truncate should only fail due to foreign key issues"
                    engine.execute(f"DELETE FROM {schema}.{t}")
class JsonPanFactory(freezable_factory(object, "_isFrozen")):
    """
    Primary class for reading/writing json data with PanDat objects.
    Don't create this object explicitly. A JsonPanFactory will
    automatically be associated with the json attribute of the parent
    PanDatFactory.
    """
    def __init__(self, pan_dat_factory):
        """
        Don't create this object explicitly. A JsonPanFactory will
        automatically be associated with the json attribute of the
        parent PanDatFactory.

        :param pan_dat_factory:

        :return:
        """
        self.pan_dat_factory = pan_dat_factory
        # inspect.getargspec was deprecated and removed in Python 3.11; inspect.signature
        # also sees keyword-only parameters, which newer pandas uses for to_json arguments.
        to_json_args = set(inspect.signature(pd.DataFrame.to_json).parameters)
        assert "orient" in to_json_args
        # older pandas lacks the index argument to to_json entirely
        self._modern_pandas = "index" in to_json_args
        self._isFrozen = True
    def create_pan_dat(self, path_or_buf, fill_missing_fields=False, orient='split', **kwargs):
        """
        Create a PanDat object from a JSON string or file

        :param path_or_buf: a valid JSON string or file-like

        :param fill_missing_fields: boolean. If truthy, missing fields will be filled in
                                    with their default value. Otherwise, missing fields
                                    throw an Exception.

        :param orient: Indication of expected JSON string format. See pandas.read_json for more details.

        :param kwargs: additional named arguments to pass to pandas.read_json

        :return: a PanDat object populated by the matching tables.

        caveats: Missing tables always throw an Exception.
                 Table names are matched with case-space insensitivity, but spaces
                 are respected for field names.
                 (ticdat supports whitespace in field names but not table names).
                 +- "inf", "-inf" strings will be converted to +-float("inf")
        """
        if os.path.exists(path_or_buf):
            verify(os.path.isfile(path_or_buf), "%s appears to be a directory and not a file." % path_or_buf)
            with open(path_or_buf, "r") as f:
                loaded_dict = json.load(f)
        else:
            verify(stringish(path_or_buf), "%s isn't a string" % path_or_buf)
            loaded_dict = json.loads(path_or_buf)
        verify(dictish(loaded_dict), "path_or_buf to json.load as a dict")
        verify(all(map(dictish, loaded_dict.values())),
               "the json.load result doesn't resolve to a dictionary whose values are themselves dictionaries")
        tbl_names = self._get_table_names(loaded_dict)
        verify("orient" not in kwargs, "orient should be passed as a non-kwargs argument")
        # tbl_names maps each schema table name to the (possibly case-spaced) json key
        rtn = {t: pd.read_json(json.dumps(loaded_dict[f]), orient=orient, **kwargs)
               for t, f in tbl_names.items()}
        missing_fields = {(t, f) for t in rtn for f in all_fields(self.pan_dat_factory, t)
                          if f not in rtn[t].columns}
        if fill_missing_fields:
            for t, f in missing_fields:
                rtn[t][f] = self.pan_dat_factory.default_values[t][f]
        verify(fill_missing_fields or not missing_fields,
               "The following (table, field) pairs are missing fields.\n%s" %
               [(t, f) for t, f in missing_fields])
        # undo the +-infinity stringification performed by write_file; replace matches
        # whole cell values (regex=False), so "inf" doesn't clobber "-inf"
        for v in rtn.values():
            v.replace("inf", float("inf"), inplace=True)
            v.replace("-inf", -float("inf"), inplace=True)
        rtn = self.pan_dat_factory.PanDat(**rtn)
        msg = []
        assert self.pan_dat_factory.good_pan_dat_object(rtn, msg.append), str(msg)
        return rtn
    def _get_table_names(self, loaded_dict):
        # Match each schema table to exactly one key of loaded_dict, with
        # case-space insensitivity. Raises on missing or ambiguous matches.
        rtn = {}
        for table in self.pan_dat_factory.all_tables:
            rtn[table] = [c for c in loaded_dict if c.lower().replace(" ", "_") == table.lower()]
            verify(len(rtn[table]) >= 1, "Unable to recognize table %s" % table)
            verify(len(rtn[table]) <= 1, "Multiple dictionary key choices found for table %s" % table)
            rtn[table] = rtn[table][0]
        return rtn
    def write_file(self, pan_dat, json_file_path, case_space_table_names=False, orient='split',
                   index=False, indent=None, sort_keys=False, **kwargs):
        """
        write the PanDat data to a json file (or json string)

        :param pan_dat: the PanDat object to write

        :param json_file_path: the json file into which the data is to be written.
                               If falsey, will return a JSON string

        :param case_space_table_names: boolean - make best guesses how to add spaces and upper case
                                       characters to table names

        :param orient: Indication of expected JSON string format. See pandas.to_json for more details.

        :param index: boolean - whether or not to write the index.

        :param indent: None. See json.dumps

        :param sort_keys: See json.dumps

        :param kwargs: additional named arguments to pass to pandas.to_json

        :return:

        caveats: +-float("inf") will be converted to "inf", "-inf"
        """
        msg = []
        verify(self.pan_dat_factory.good_pan_dat_object(pan_dat, msg.append),
               "pan_dat not a good object for this factory : %s"%"\n".join(msg))
        verify("orient" not in kwargs, "orient should be passed as a non-kwargs argument")
        verify("index" not in kwargs, "index should be passed as a non-kwargs argument")
        if self._modern_pandas:
            # FYI - pandas Exception: ValueError: 'index=False' is only valid when 'orient' is 'split' or 'table'
            kwargs["index"] = index if orient in ("split", "table") else True
        # only pretty-print table names if the prettified names remain collision-free
        case_space_table_names = case_space_table_names and \
                                 len(set(self.pan_dat_factory.all_tables)) == \
                                 len(set(map(case_space_to_pretty, self.pan_dat_factory.all_tables)))
        rtn = {}
        for t in self.pan_dat_factory.all_tables:
            # stringify +-infinity so the payload is strictly-valid JSON
            df = getattr(pan_dat, t).replace(float("inf"), "inf").replace(-float("inf"), "-inf")
            k = case_space_to_pretty(t) if case_space_table_names else t
            rtn[k] = json.loads(df.to_json(path_or_buf=None, orient=orient, **kwargs))
            if orient == 'split' and not index:
                rtn[k].pop("index", None)
        if json_file_path:
            with open(json_file_path, "w") as f:
                json.dump(rtn, f, indent=indent, sort_keys=sort_keys)
        else:
            return json.dumps(rtn, indent=indent, sort_keys=sort_keys)