def _create_jdict(self, path_or_buf):
    if stringish(path_or_buf) and os.path.exists(path_or_buf):
        reasonable_string = path_or_buf
        verify(os.path.isfile(path_or_buf), "json_file_path is not a valid file path.")
        try:
            with open(path_or_buf, "r") as fp:
                jdict = json.load(fp)
        except Exception as e:
            raise TicDatError("Unable to interpret %s as json file : %s" % (path_or_buf, e))
    else:
        verify(stringish(path_or_buf), "%s isn't a string" % path_or_buf)
        reasonable_string = path_or_buf[:10]
        try:
            jdict = json.loads(path_or_buf)
        except Exception as e:
            raise TicDatError("Unable to interpret %s as json string : %s" % (reasonable_string, e))
    verify(dictish(jdict), "%s failed to load a dictionary" % reasonable_string)
    verify(all(map(stringish, jdict)),
           "The dictionary loaded from %s isn't indexed by strings" % reasonable_string)
    verify(all(map(containerish, jdict.values())),
           "The dictionary loaded from %s doesn't have containers as values" % reasonable_string)
    return jdict
def get_fld_type(t, f, default_type):
    if (t, f) in forced_field_types:
        return forced_field_types[t, f]
    if t == "parameters" and self.tdf.parameters:
        return "text"
    fld_type = self.tdf.data_types.get(t, {}).get(f)
    if not fld_type:
        return default_type
    if fld_type.datetime:
        return "timestamp"
    verify(not (fld_type.number_allowed and fld_type.strings_allowed),
           f"Select one of string or numeric for {t}.{f} if declaring type and using postgres")
    if fld_type.strings_allowed:
        return 'text'
    if fld_type.number_allowed:
        if fld_type.must_be_int:
            return 'integer'
        else:
            return 'float'
    else:
        raise TicDatError(f"Allow one of text or numeric for {t}.{f} if declaring type and using postgres")
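# Illustrative sketch, not part of the original source: how ticdat type declarations map to
# postgres column types under the branches above. The schema, table, and field names are
# hypothetical; set_data_type is the standard TicDatFactory way of populating the data_types
# dict that get_fld_type consults.
def _example_postgres_type_mapping():
    from ticdat import TicDatFactory
    tdf = TicDatFactory(loads=[["Name"], ["Weight", "Arrival", "Notes"]])
    tdf.set_data_type("loads", "Weight", must_be_int=True)   # numeric, ints only -> 'integer'
    tdf.set_data_type("loads", "Arrival", datetime=True)     # datetime           -> 'timestamp'
    tdf.set_data_type("loads", "Notes", number_allowed=False,
                      strings_allowed='*')                   # strings only       -> 'text'
    return tdf  # a field with no declared type falls back to default_type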
def write_file(self, tic_dat, json_file_path, allow_overwrite=False, verbose=False):
    """
    write the ticDat data to a json file

    :param tic_dat: the data object to write (typically a TicDat)

    :param json_file_path: The file path of the json file to create.

    :param allow_overwrite: boolean - are we allowed to overwrite an existing file?

    :param verbose: boolean. Verbose mode writes the data rows as dicts keyed by field name.
                    Otherwise, they are lists.

    :return:
    """
    _standard_verify(self.tic_dat_factory)
    verify(not (os.path.exists(json_file_path) and not allow_overwrite),
           "%s exists and allow_overwrite is not enabled" % json_file_path)
    msg = []
    if not self.tic_dat_factory.good_tic_dat_object(tic_dat, lambda m: msg.append(m)):
        raise TicDatError("Not a valid TicDat object for this schema : " + " : ".join(msg))
    jdict = make_json_dict(self.tic_dat_factory, tic_dat, verbose)
    with open(json_file_path, "w") as fp:
        json.dump(jdict, fp, sort_keys=True, indent=2)
def write_directory(self, tic_dat, dir_path, allow_overwrite=False, dialect='excel',
                    write_header=True):
    """
    write the ticDat data to a collection of csv files

    :param tic_dat: the data object

    :param dir_path: the directory in which to write the csv files

    :param allow_overwrite: boolean - are we allowed to overwrite existing files?

    :param dialect: the csv dialect. Consult csv documentation for details.

    :param write_header: Boolean. Should the header information be written as the first row?

    :return:
    """
    verify(csv, "csv needs to be installed to use this subroutine")
    verify(dialect in csv.list_dialects(), "Invalid dialect %s" % dialect)
    verify(not os.path.isfile(dir_path), "A file is not a valid directory path")
    if self.tic_dat_factory.generic_tables:
        dat, tdf = create_generic_free(tic_dat, self.tic_dat_factory)
        return tdf.csv.write_directory(dat, dir_path, allow_overwrite, dialect, write_header)
    tdf = self.tic_dat_factory
    msg = []
    if not self.tic_dat_factory.good_tic_dat_object(tic_dat, lambda m: msg.append(m)):
        raise TicDatError("Not a valid TicDat object for this schema : " + " : ".join(msg))
    if not allow_overwrite:
        for t in tdf.all_tables:
            f = os.path.join(dir_path, t + ".csv")
            verify(not os.path.exists(f), "The %s path exists and overwrite is not allowed" % f)
    if not os.path.isdir(dir_path):
        os.mkdir(dir_path)
    for t in tdf.all_tables:
        f = os.path.join(dir_path, t + ".csv")
        with open(f, 'w') as csvfile:
            writer = csv.DictWriter(csvfile, dialect=dialect,
                                    fieldnames=tdf.primary_key_fields.get(t, ()) +
                                               tdf.data_fields.get(t, ()))
            writer.writeheader() if write_header else None
            _t = getattr(tic_dat, t)
            if dictish(_t):
                for p_key, data_row in _t.items():
                    primaryKeyDict = {f: v for f, v in zip(tdf.primary_key_fields[t],
                                      p_key if containerish(p_key) else (p_key,))}
                    writer.writerow(dict(data_row, **primaryKeyDict))
            else:
                for data_row in (_t if containerish(_t) else _t()):
                    writer.writerow(dict(data_row))
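# Illustrative usage sketch, not part of the original source. The schema and data are
# hypothetical; the tdf.csv accessor is the same one used by the generic-table branch above.
def _example_write_csv_directory(dir_path="diet_csv_data"):
    from ticdat import TicDatFactory
    tdf = TicDatFactory(categories=[["Name"], ["Min Nutrition", "Max Nutrition"]])
    dat = tdf.TicDat(categories={"protein": [91, float("inf")], "fat": [0, 65]})
    # writes one csv file per table into dir_path, with headers, overwriting existing files
    tdf.csv.write_directory(dat, dir_path, allow_overwrite=True)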
def write_db_data(self, tic_dat, db_file_path, allow_overwrite=False):
    """
    write the ticDat data to an SQLite database file

    :param tic_dat: the data object to write

    :param db_file_path: the file path of the SQLite database to populate

    :param allow_overwrite: boolean - are we allowed to overwrite pre-existing data

    :return:

    caveats : float("inf"), float("-inf") are written as "inf", "-inf"
    """
    verify(sql, "sqlite3 needs to be installed to use this subroutine")
    msg = []
    if not self.tic_dat_factory.good_tic_dat_object(tic_dat, lambda m: msg.append(m)):
        raise TicDatError("Not a valid TicDat object for this schema : " + " : ".join(msg))
    verify(not os.path.isdir(db_file_path), "A directory is not a valid SQLite file path")
    if self.tic_dat_factory.generic_tables:
        dat, tdf = create_generic_free(tic_dat, self.tic_dat_factory)
        return tdf.sql.write_db_data(dat, db_file_path, allow_overwrite)
    if not os.path.exists(db_file_path):
        self.write_db_schema(db_file_path)
    table_names = self._check_tables_fields(db_file_path, self.tic_dat_factory.all_tables)
    with _sql_con(db_file_path, foreign_keys=False) as con:
        for t in self.tic_dat_factory.all_tables:
            verify(table_names[t] == t, "Failed to find table %s in path %s" % (t, db_file_path))
            verify(allow_overwrite or not any(True for _ in con.execute("Select * from %s" % t)),
                   "allow_overwrite is False, but there are already data records in %s" % t)
            con.execute("Delete from %s" % t) if allow_overwrite else None
        for sql_str, data in self._get_data(tic_dat, as_sql=False):
            con.execute(sql_str, list(data))
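# Illustrative usage sketch, not part of the original source. The schema is hypothetical;
# the tdf.sql accessor is the same one used by the generic-table branch above.
def _example_write_sqlite(db_file_path="diet.db"):
    from ticdat import TicDatFactory
    tdf = TicDatFactory(categories=[["Name"], ["Min Nutrition", "Max Nutrition"]])
    dat = tdf.TicDat(categories={"protein": [91, float("inf")]})
    # creates the SQLite file (and its schema) if needed, then populates it;
    # per the caveat above, float("inf") round-trips as the string "inf"
    tdf.sql.write_db_data(dat, db_file_path, allow_overwrite=True)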
def write_data(self, tic_dat, engine, schema, dsn=None, pre_existing_rows=None):
    """
    write the ticDat data to a PostGres database

    :param tic_dat: the data object to write

    :param engine: a sqlalchemy database engine with drivertype postgres

    :param schema: the postgres schema to write to (call self.write_schema explicitly as needed)

    :param dsn: optional - if truthy, a dict that can be unpacked as arguments to
                psycopg2.connect. Will speed up bulk writing compared to engine.execute.
                If truthy and not a dict, then will be passed directly to psycopg2.connect
                as the sole argument.

    :param pre_existing_rows: if provided, a dict mapping table name to either "delete" or
                              "append". Default behavior is "delete".

    :return:
    """
    verify(sa, "sqlalchemy needs to be installed to use this subroutine")
    verify(engine.name == 'postgresql',
           "a sqlalchemy engine with drivername='postgresql' is required")
    verify(not dsn or psycopg2, "need psycopg2 to use the faster dsn write option")
    self._check_good_pgtd_compatible_table_field_names()
    msg = []
    if not self.tdf.good_tic_dat_object(tic_dat, lambda m: msg.append(m)):
        raise TicDatError("Not a valid TicDat object for this schema : " + " : ".join(msg))
    verify(not self.tdf.generic_tables, "TicDat for postgres does not yet support generic tables")
    self.check_tables_fields(engine, schema,
                             error_on_missing_table=True)  # call self.write_schema as needed
    self._handle_prexisting_rows(engine, schema, pre_existing_rows or {})
    if dsn:
        connect_kwargs = dsn if dsn and dictish(dsn) else {}
        connect_args = [dsn] if dsn and not dictish(dsn) else []
        with psycopg2.connect(*connect_args, **connect_kwargs) as db:
            with db.cursor() as cursor:
                for k, v in self._get_data(tic_dat, schema, dump_format="dict").items():
                    psycopg2.extras.execute_values(cursor, k, v)
    else:
        all_dat = self._get_data(tic_dat, schema)
        if len(all_dat) > 1000:
            print("***pgtd.py not using most efficient data writing technique**")
        for sql_str, data in all_dat:
            engine.execute(sql_str, data)
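# Illustrative usage sketch, not part of the original source. The connection string, schema
# name, and the pgsql accessor name are assumptions, and write_schema is assumed to accept the
# engine and schema (per the "call self.write_schema explicitly as needed" note above).
def _example_write_postgres():
    import sqlalchemy as sa
    from ticdat import TicDatFactory
    tdf = TicDatFactory(categories=[["Name"], ["Min Nutrition", "Max Nutrition"]])
    dat = tdf.TicDat(categories={"protein": [91, 200]})
    engine = sa.create_engine("postgresql://user:password@localhost/mydb")  # hypothetical
    tdf.pgsql.write_schema(engine, "my_schema")  # create the postgres schema/tables first
    tdf.pgsql.write_data(dat, engine, "my_schema",
                         pre_existing_rows={"categories": "delete"})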
def write_file(self, tic_dat, file_path, allow_overwrite=False, case_space_sheet_names=False):
    """
    write the ticDat data to an excel file

    :param tic_dat: the data object to write (typically a TicDat)

    :param file_path: The file path of the excel file to create
                      Needs to end in either ".xls" or ".xlsx"
                      The latter is capable of writing out larger tables,
                      but the former handles infinity seamlessly.
                      If ".xlsx", then be advised that +/- float("inf") will be replaced
                      with "inf"/"-inf", unless infinity_io_flag is being applied.

    :param allow_overwrite: boolean - are we allowed to overwrite an existing file?

    :param case_space_sheet_names: boolean - make best guesses how to add spaces and upper case
                                   characters to sheet names

    :return:

    caveats: None may be written out as an empty string. This reflects the behavior of xlwt.
    """
    self._verify_differentiable_sheet_names()
    verify(utils.stringish(file_path) and (file_path.endswith(".xls") or file_path.endswith(".xlsx")),
           "file_path argument needs to end in .xls or .xlsx")
    msg = []
    if not self.tic_dat_factory.good_tic_dat_object(tic_dat, lambda m: msg.append(m)):
        raise TicDatError("Not a valid ticDat object for this schema : " + " : ".join(msg))
    verify(not os.path.isdir(file_path), "A directory is not a valid xls file path")
    verify(allow_overwrite or not os.path.exists(file_path),
           "The %s path exists and overwrite is not allowed" % file_path)
    if self.tic_dat_factory.generic_tables:
        dat, tdf = utils.create_generic_free(tic_dat, self.tic_dat_factory)
        return tdf.xls.write_file(dat, file_path, allow_overwrite, case_space_sheet_names)
    case_space_sheet_names = case_space_sheet_names and \
        len(set(self.tic_dat_factory.all_tables)) == \
        len(set(map(case_space_to_pretty, self.tic_dat_factory.all_tables)))
    tbl_name_mapping = {t: case_space_to_pretty(t) if case_space_sheet_names else t
                        for t in self.tic_dat_factory.all_tables}
    if file_path.endswith(".xls"):
        self._xls_write(tic_dat, file_path, tbl_name_mapping)
    else:
        self._xlsx_write(tic_dat, file_path, tbl_name_mapping)
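# Illustrative usage sketch, not part of the original source. The two-word table name is
# hypothetical and is there so case_space_sheet_names has something to prettify (roughly
# "nutrition_quantities" -> "Nutrition Quantities" as the sheet name).
def _example_write_xlsx(file_path="diet.xlsx"):
    from ticdat import TicDatFactory
    tdf = TicDatFactory(nutrition_quantities=[["Food", "Category"], ["Quantity"]])
    dat = tdf.TicDat(nutrition_quantities={("pizza", "fat"): [12]})
    tdf.xls.write_file(dat, file_path, allow_overwrite=True, case_space_sheet_names=True)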
def write_file(self, tic_dat, json_file_path, allow_overwrite=False, verbose=False,
               to_pandas=False):
    """
    write the ticDat data to a json file (or json string)

    :param tic_dat: the data object to write (typically a TicDat)

    :param json_file_path: The file path of the json file to create. If empty string, then
                           return a JSON string.

    :param allow_overwrite: boolean - are we allowed to overwrite an existing file?

    :param verbose: boolean. Verbose mode writes the data rows as dicts keyed by field name.
                    Otherwise, they are lists.

    :param to_pandas: boolean. if truthy, then use the PanDatFactory method of writing to json.

    :return:
    """
    _standard_verify(self.tic_dat_factory)
    verify(not (to_pandas and verbose), "verbose argument is inconsistent with to_pandas")
    verify(not (json_file_path and os.path.exists(json_file_path) and not allow_overwrite),
           "%s exists and allow_overwrite is not enabled" % json_file_path)
    if to_pandas:
        from ticdat import PanDatFactory
        pdf = PanDatFactory.create_from_full_schema(
            self.tic_dat_factory.schema(include_ancillary_info=True))
        return pdf.json.write_file(
            self.tic_dat_factory.copy_to_pandas(tic_dat, drop_pk_columns=False),
            json_file_path)
    msg = []
    if not self.tic_dat_factory.good_tic_dat_object(tic_dat, lambda m: msg.append(m)):
        raise TicDatError("Not a valid TicDat object for this schema : " + " : ".join(msg))
    jdict = make_json_dict(self.tic_dat_factory, tic_dat, verbose,
                           use_infinity_io_flag_if_provided=True)
    if not json_file_path:
        return json.dumps(jdict, sort_keys=True, indent=2)
    with open(json_file_path, "w") as fp:
        json.dump(jdict, fp, sort_keys=True, indent=2)
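# Illustrative usage sketch, not part of the original source. Passing an empty string for
# json_file_path returns the JSON text instead of writing a file, per the docstring above.
# The schema and file name are hypothetical.
def _example_write_json():
    from ticdat import TicDatFactory
    tdf = TicDatFactory(categories=[["Name"], ["Min Nutrition", "Max Nutrition"]])
    dat = tdf.TicDat(categories={"protein": [91, 200]})
    json_text = tdf.json.write_file(dat, "", verbose=True)  # rows as field-keyed dicts
    tdf.json.write_file(dat, "diet.json", allow_overwrite=True, to_pandas=True)
    return json_text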
def _create_jdict(self, json_file_path):
    verify(os.path.isfile(json_file_path), "json_file_path is not a valid file path.")
    try:
        with open(json_file_path, "r") as fp:
            jdict = json.load(fp)
    except Exception as e:
        raise TicDatError("Unable to interpret %s as json file : %s" % (json_file_path, e))
    verify(dictish(jdict), "%s failed to load a dictionary" % json_file_path)
    verify(all(map(stringish, jdict)),
           "The dictionary loaded from %s isn't indexed by strings" % json_file_path)
    verify(all(map(containerish, jdict.values())),
           "The dictionary loaded from %s doesn't have containers as values" % json_file_path)
    return jdict
def _get_sheet_names(self, xls_file_path):
    sheets = defaultdict(list)
    try:
        xl = pd.ExcelFile(xls_file_path)
    except Exception as e:
        raise TicDatError("Unable to open %s as xls file : %s" % (xls_file_path, e))
    for table, sheet in product(self.pan_dat_factory.all_tables, xl.sheet_names):
        if table.lower()[:_longest_sheet] == sheet.lower().replace(' ', '_')[:_longest_sheet]:
            sheets[table].append(sheet)
    duplicated_sheets = tuple(_t for _t, _s in sheets.items() if len(_s) > 1)
    verify(not duplicated_sheets,
           "The following sheet names were duplicated : " + ",".join(duplicated_sheets))
    sheets = FrozenDict({k: v[0] for k, v in sheets.items()})
    return sheets
def _get_sheets_and_fields(self, xls_file_path, all_tables, row_offsets, headers_present,
                           print_missing_tables=False):
    verify(utils.stringish(xls_file_path) and os.path.exists(xls_file_path),
           "xls_file_path argument %s is not a valid file path." % xls_file_path)
    try:
        book = xlrd.open_workbook(xls_file_path)
    except Exception as e:
        raise TicDatError("Unable to open %s as xls file : %s" % (xls_file_path, e))
    sheets = defaultdict(list)
    for table, sheet in product(all_tables, book.sheets()):
        if table.lower()[:_longest_sheet] == sheet.name.lower().replace(' ', '_')[:_longest_sheet]:
            sheets[table].append(sheet)
    duplicated_sheets = tuple(_t for _t, _s in sheets.items() if len(_s) > 1)
    verify(not duplicated_sheets,
           "The following sheet names were duplicated : " + ",".join(duplicated_sheets))
    sheets = FrozenDict({k: v[0] for k, v in sheets.items()})
    missing_tables = {t for t in all_tables if t not in sheets}
    if missing_tables and print_missing_tables:
        print("The following table names could not be found in the %s file.\n%s\n" %
              (xls_file_path, "\n".join(missing_tables)))
    field_indicies, missing_fields, dup_fields = {}, {}, {}
    for table, sheet in sheets.items():
        field_indicies[table], missing_fields[table], dup_fields[table] = \
            self._get_field_indicies(table, sheet, row_offsets[table], headers_present)
    verify(not any(_ for _ in missing_fields.values()),
           "The following field names could not be found : \n" +
           "\n".join("%s : " % t + ",".join(bf) for t, bf in missing_fields.items() if bf))
    verify(not any(_ for _ in dup_fields.values()),
           "The following field names were duplicated : \n" +
           "\n".join("%s : " % t + ",".join(bf) for t, bf in dup_fields.items() if bf))
    return sheets, field_indicies, book.datemode
def _get_sheets_and_fields(self, xls_file_path, all_tables, row_offsets, headers_present,
                           print_missing_tables=False):
    verify(utils.stringish(xls_file_path) and os.path.exists(xls_file_path),
           "xls_file_path argument %s is not a valid file path." % xls_file_path)
    try:
        book = xlrd.open_workbook(xls_file_path) if xls_file_path.endswith(".xls") else \
               openpyxl.load_workbook(xls_file_path, data_only=True)
    except Exception as e:
        raise TicDatError("Unable to open %s as xls file : %s" % (xls_file_path, e))
    sheet_name = lambda sheet: sheet.name if xls_file_path.endswith(".xls") else sheet.title
    sheets = defaultdict(list)
    book_sheets = lambda: book.sheets() if xls_file_path.endswith(".xls") else book.worksheets
    for table, sheet in product(all_tables, book_sheets()):
        if table.lower()[:_longest_sheet] == sheet_name(sheet).lower().replace(' ', '_')[:_longest_sheet]:
            sheets[table].append(sheet)
    duplicated_sheets = tuple(_t for _t, _s in sheets.items() if len(_s) > 1)
    verify(not duplicated_sheets,
           "The following sheet names were duplicated : " + ",".join(duplicated_sheets))
    wrapped_sheet = lambda sheet: _XlrdSheetWrapper(sheet, book.datemode) if xls_file_path.endswith(".xls") else \
        _OpenPyxlSheetWrapper(sheet, prune_trailing_empty_rows=
                              self.tic_dat_factory.xlsx_trailing_empty_rows == "prune")
    sheets = FrozenDict({k: wrapped_sheet(v[0]) for k, v in sheets.items()})
    missing_tables = {t for t in all_tables if t not in sheets}
    if missing_tables and print_missing_tables:
        print("The following table names could not be found in the %s file.\n%s\n" %
              (xls_file_path, "\n".join(missing_tables)))
    field_indicies, missing_fields, dup_fields = {}, {}, {}
    for table, sheet in sheets.items():
        field_indicies[table], missing_fields[table], dup_fields[table] = \
            self._get_field_indicies(table, sheet, row_offsets[table], headers_present)
    verify(not any(_ for _ in missing_fields.values()),
           "The following field names could not be found : \n" +
           "\n".join("%s : " % t + ",".join(bf) for t, bf in missing_fields.items() if bf))
    verify(not any(_ for _ in dup_fields.values()),
           "The following field names were duplicated : \n" +
           "\n".join("%s : " % t + ",".join(bf) for t, bf in dup_fields.items() if bf))
    return sheets, field_indicies
def write_file(self, tic_dat, file_path, allow_overwrite=False):
    """
    write the ticDat data to an excel file

    :param tic_dat: the data object to write (typically a TicDat)

    :param file_path: The file path of the excel file to create
                      Needs to end in either ".xls" or ".xlsx"
                      The latter is capable of writing out larger tables,
                      but the former handles infinity seamlessly.
                      If ".xlsx", then be advised that +/- float("inf") will be replaced
                      with +/- 1e+100

    :param allow_overwrite: boolean - are we allowed to overwrite an existing file?

    :return:

    caveats: None may be written out as an empty string. This reflects the behavior of xlwt.
    """
    self._verify_differentiable_sheet_names()
    verify(utils.stringish(file_path) and (file_path.endswith(".xls") or file_path.endswith(".xlsx")),
           "file_path argument needs to end in .xls or .xlsx")
    msg = []
    if not self.tic_dat_factory.good_tic_dat_object(tic_dat, lambda m: msg.append(m)):
        raise TicDatError("Not a valid ticDat object for this schema : " + " : ".join(msg))
    verify(not os.path.isdir(file_path), "A directory is not a valid xls file path")
    verify(allow_overwrite or not os.path.exists(file_path),
           "The %s path exists and overwrite is not allowed" % file_path)
    if self.tic_dat_factory.generic_tables:
        dat, tdf = utils.create_generic_free(tic_dat, self.tic_dat_factory)
        return tdf.xls.write_file(dat, file_path, allow_overwrite)
    if file_path.endswith(".xls"):
        self._xls_write(tic_dat, file_path)
    else:
        self._xlsx_write(tic_dat, file_path)
def write_file(self, tic_dat, mdb_file_path, allow_overwrite=False):
    """
    write the ticDat data to an Access (mdb) database file

    :param tic_dat: the data object to write

    :param mdb_file_path: the file path of the Access database to populate

    :param allow_overwrite: boolean - are we allowed to overwrite pre-existing data

    :return:

    caveats : Numbers with absolute values larger than 1e+100 will be written as
              1e+100 or -1e+100

    NB - thrown Exceptions of the form "Data type mismatch in criteria expression" generally
         result either from Access's inability to store different data types in the same field,
         or from a mismatch between the data object and the default field types ticdat uses when
         creating an Access schema. For the latter, feel free to call the write_schema function
         on the data file first with explicitly identified field types.
    """
    _standard_verify(self.tic_dat_factory.generic_tables)
    msg = []
    if not self.tic_dat_factory.good_tic_dat_object(tic_dat, lambda m: msg.append(m)):
        raise TicDatError("Not a valid TicDat object for this schema : " + " : ".join(msg))
    verify(not os.path.isdir(mdb_file_path), "A directory is not a valid Access file path")
    if not os.path.exists(mdb_file_path):
        self.write_schema(mdb_file_path)
    table_names = self._check_tables_fields(mdb_file_path, self.tic_dat_factory.all_tables)
    with _connect(_connection_str(mdb_file_path)) as con:
        for t in self.tic_dat_factory.all_tables:
            verify(table_names[t] == t, "Failed to find table %s in path %s" % (t, mdb_file_path))
            if not allow_overwrite:
                with con.cursor() as cur:
                    cur.execute("Select * from %s" % t)
                    verify(not any(True for _ in cur.fetchall()),
                           "allow_overwrite is False, but there are already data records in %s" % t)
            con.cursor().execute("Delete from %s" % t).commit() if allow_overwrite else None
            _t = getattr(tic_dat, t)
            if dictish(_t):
                primary_keys = tuple(self.tic_dat_factory.primary_key_fields[t])
                for pk_row, sql_data_row in _t.items():
                    _items = tuple(sql_data_row.items())
                    fields = _brackets(primary_keys + tuple(x[0] for x in _items))
                    data_row = ((pk_row,) if len(primary_keys) == 1 else pk_row) + \
                               tuple(_write_data(x[1]) for x in _items)
                    assert len(data_row) == len(fields)
                    insert_str = "INSERT INTO %s (%s) VALUES (%s)" % \
                                 (t, ",".join(fields), ",".join("?" for _ in fields))
                    con.cursor().execute(insert_str, data_row).commit()
            else:
                for sql_data_row in (_t if containerish(_t) else _t()):
                    insert_str = "INSERT INTO %s (%s) VALUES (%s)" % \
                                 (t, ",".join(_brackets(sql_data_row.keys())),
                                  ",".join(["?"] * len(sql_data_row)))
                    con.cursor().execute(insert_str, tuple(map(_write_data, sql_data_row.values())))
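# Illustrative usage sketch, not part of the original source. It needs an Access-capable ODBC
# setup, and the mdb accessor name is an assumption; the schema is hypothetical. Per the
# docstring above, write_schema can be called first with explicit field types if the defaults
# cause "Data type mismatch" errors.
def _example_write_access(mdb_file_path="diet.mdb"):
    from ticdat import TicDatFactory
    tdf = TicDatFactory(categories=[["Name"], ["Min Nutrition", "Max Nutrition"]])
    dat = tdf.TicDat(categories={"protein": [91, 200]})
    tdf.mdb.write_file(dat, mdb_file_path, allow_overwrite=True)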
def create_pan_dat(self, xls_file_path, fill_missing_fields=False):
    """
    Create a PanDat object from an Excel file

    :param xls_file_path: An Excel file containing sheets whose names match
                          the table names in the schema.

    :param fill_missing_fields: boolean. If truthy, missing fields will be filled in
                                with their default value. Otherwise, missing fields
                                throw an Exception.

    :return: a PanDat object populated by the matching sheets.

    caveats: Missing sheets resolve to an empty table, but missing fields on matching sheets
             throw an Exception (unless fill_missing_fields is truthy).
             Table names are matched to sheets with case-space insensitivity, but spaces
             and case are respected for field names. (ticdat supports whitespace in field
             names but not table names).

             Note that if you save a DataFrame to excel and then recover it, the type of data
             might change. For example

                df = pd.DataFrame({"a": ["100", "200", "300"]})
                df.to_excel("something.xlsx")
                df2 = pd.read_excel("something.xlsx")

             results in a numeric column in df2. To address this, you need to use
             set_data_type for your PanDatFactory. This problem is even worse with

                df = pd.DataFrame({"a": ["0100", "1200", "2300"]})
    """
    rtn = {}
    try:
        xl = pd.ExcelFile(xls_file_path)
    except Exception as e:
        raise TicDatError("Unable to open %s as xls file : %s" % (xls_file_path, e))
    for t, s in self._get_sheet_names(xl).items():
        rtn[t] = pd.read_excel(xl, s, dtype=self.pan_dat_factory._dtypes_for_pandas_read(t))
    missing_tables = {t for t in self.pan_dat_factory.all_tables if t not in rtn}
    if missing_tables:
        print("The following table names could not be found in the %s file.\n%s\n" %
              (xls_file_path, "\n".join(missing_tables)))
    missing_fields = {(t, f) for t in rtn for f in all_fields(self.pan_dat_factory, t)
                      if f not in rtn[t].columns}
    if fill_missing_fields:
        for t, f in missing_fields:
            rtn[t][f] = self.pan_dat_factory.default_values[t][f]
    verify(fill_missing_fields or not missing_fields,
           "The following are (table, field) pairs missing from the %s file.\n%s" %
           (xls_file_path, missing_fields))
    xl.close()
    rtn = _clean_pandat_creator(self.pan_dat_factory, rtn, print_missing_tables=True)
    if self.pan_dat_factory.xlsx_trailing_empty_rows == "prune":
        from ticdat.pandatfactory import remove_trailing_all_nan
        for t in self.pan_dat_factory.all_tables:
            setattr(rtn, t, remove_trailing_all_nan(getattr(rtn, t)))
    return rtn
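# Illustrative usage sketch, not part of the original source, showing the set_data_type
# workaround mentioned in the docstring above for string columns that Excel would otherwise
# hand back as numbers. The schema, field names, file name, and the xls accessor name are
# assumptions.
def _example_read_xlsx_with_string_column(xls_file_path="something.xlsx"):
    from ticdat import PanDatFactory
    pdf = PanDatFactory(parts=[["Part Number"], ["Description"]])
    # keep "0100"-style part numbers as text rather than letting pandas coerce them to numbers
    pdf.set_data_type("parts", "Part Number", number_allowed=False, strings_allowed='*')
    return pdf.xls.create_pan_dat(xls_file_path)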