def __init__(self, **init_tables):
    """
    Build the object by storing one DataFrame attribute per schema table.

    :param init_tables: keyword arguments mapping a table name to either a
                        DataFrame or to something a DataFrame can be
                        constructed from (a positional argument, or a dict
                        of string keys used as DataFrame keyword arguments).

    Tables of the enclosing factory (``superself``) that are not passed in
    are created as empty DataFrames. verify fails on an unexpected table
    name, on an argument that cannot be turned into a DataFrame, or when a
    schema field is missing from a table. Columns are finally reordered so
    that the schema fields come first, followed by any extra columns.
    """
    superself._trigger_has_been_used()

    for name, raw in init_tables.items():
        verify(name in superself.all_tables, "Unexpected table name %s"%name)
        frame = safe_apply(DataFrame)(raw)
        if frame is None and dictish(raw) and all(map(stringish, raw)):
            # positional construction failed; retry with the dict as keyword arguments
            frame = safe_apply(DataFrame)(**raw)
        verify(isinstance(frame, DataFrame),
               "Failed to provide a valid DataFrame or DataFrame construction argument for %s"%name)
        setattr(self, name, frame.copy())
        stored = getattr(self, name)
        # columns that are just 0..n-1 are treated as unnamed and take the schema field names
        unnamed = list(stored.columns) == list(range(len(stored.columns)))
        if unnamed and len(stored.columns) >= len(superself._all_fields(name)):
            stored.rename(columns=dict(zip(stored.columns, superself._all_fields(name))),
                          inplace=True)

    for name in set(superself.all_tables).difference(init_tables):
        setattr(self, name, DataFrame({f:[] for f in utils.all_fields(superself, name)}))

    missing_fields = set()
    for name in superself.all_tables:
        for field in superself._all_fields(name):
            if field not in getattr(self, name).columns:
                missing_fields.add((name, field))
    verify(not missing_fields,
           "The following are (table, field) pairs missing from the data.\n%s"%missing_fields)

    for name in superself.all_tables:
        ordered = list(superself._all_fields(name))
        frame = getattr(self, name)
        if list(frame.columns)[:len(ordered)] == ordered:
            continue  # schema fields already lead the column order
        extras = [col for col in list(frame.columns) if col not in ordered]
        setattr(self, name, frame[ordered + extras])
        assert list(getattr(self, name)) == ordered + extras
def create_pan_dat(self, dir_path, fill_missing_fields=False, **kwargs):
    """
    Create a PanDat object from a directory of csv files.

    :param dir_path: the directory containing the .csv files.

    :param fill_missing_fields: boolean. If truthy, missing fields will be filled in
                                with their default value. Otherwise, missing fields
                                throw an Exception.

    :param kwargs: additional named arguments to pass to pandas.read_csv. If "dtype"
                   is not among them, it is supplied per table from the
                   PanDatFactory.

    :return: a PanDat object populated by the matching tables.

    caveats: Missing tables are reported with a print and then left to
             _clean_pandat_creator.
             Table names are matched with case-space insensitivity, but spaces
             are respected for field names.
             (ticdat supports whitespace in field names but not table names).

    Note that if you save a DataFrame to csv and then recover it, the type of data might change.
    For example
        df = pd.DataFrame({"a":["100", "200", "300"]})
        df.to_csv("something.csv")
        df2 = pd.read_csv("something.csv")
    results in a numeric column in df2. To address this, you need to either use set_data_type for your
    PanDatFactory, or specify "dtype" in kwargs. (The former is obviously better).

    This problem is even worse with df = pd.DataFrame({"a":["0100", "1200", "2300"]})
    """
    verify(os.path.isdir(dir_path), "%s not a directory path" % dir_path)
    csv_paths = self._get_table_names(dir_path)

    rtn = {}
    for table, csv_path in csv_paths.items():
        if "dtype" in kwargs:
            read_args = dict(kwargs)
        else:
            read_args = dict(kwargs, dtype=self.pan_dat_factory._dtypes_for_pandas_read(table))
        rtn[table] = pd.read_csv(csv_path, **read_args)

    missing_tables = {t for t in self.pan_dat_factory.all_tables if t not in rtn}
    if missing_tables:
        print("The following table names could not be found in the %s directory.\n%s\n" %
              (dir_path, "\n".join(missing_tables)))

    missing_fields = set()
    for table, frame in rtn.items():
        for field in all_fields(self.pan_dat_factory, table):
            if field not in frame.columns:
                missing_fields.add((table, field))

    if fill_missing_fields:
        for table, field in missing_fields:
            rtn[table][field] = self.pan_dat_factory.default_values[table][field]

    triplets = [(table, os.path.basename(csv_paths[table]), field)
                for table, field in missing_fields]
    verify(fill_missing_fields or not missing_fields,
           "The following (table, file_name, field) triplets are missing fields.\n%s" % triplets)
    return _clean_pandat_creator(self.pan_dat_factory, rtn)
def create_pan_dat(self, dir_path, fill_missing_fields=False, **kwargs):
    """
    Create a PanDat object from a directory of csv files.

    :param dir_path: the directory containing the .csv files.

    :param fill_missing_fields: boolean. If truthy, missing fields will be filled in
                                with their default value. Otherwise, missing fields
                                throw an Exception.

    :param kwargs: additional named arguments to pass to pandas.read_csv

    :return: a PanDat object populated by the matching tables. The result is
             checked with good_pan_dat_object inside an assert before being
             returned.

    caveats: Table names are matched with case-space insensitivity, but spaces
             are respected for field names.
             (ticdat supports whitespace in field names but not table names).
    """
    verify(os.path.isdir(dir_path), "%s not a directory path"%dir_path)
    csv_paths = self._get_table_names(dir_path)

    frames = {}
    for table, csv_path in csv_paths.items():
        frames[table] = pd.read_csv(csv_path, **kwargs)

    missing_fields = set()
    for table, frame in frames.items():
        for field in all_fields(self.pan_dat_factory, table):
            if field not in frame.columns:
                missing_fields.add((table, field))

    if fill_missing_fields:
        for table, field in missing_fields:
            frames[table][field] = self.pan_dat_factory.default_values[table][field]

    triplets = [(table, os.path.basename(csv_paths[table]), field)
                for table, field in missing_fields]
    verify(fill_missing_fields or not missing_fields,
           "The following (table, file_name, field) triplets are missing fields.\n%s" % triplets)

    pan_dat = self.pan_dat_factory.PanDat(**frames)
    problems = []
    assert self.pan_dat_factory.good_pan_dat_object(pan_dat, problems.append), str(problems)
    return pan_dat
def create_pan_dat(self, db_file_path, con=None, fill_missing_fields=False):
    """
    Create a PanDat object from a SQLite database file

    :param db_file_path: A SQLite DB File. Set to falsey if using con argument

    :param con: A connection object that can be passed to pandas read_sql.
                Set to falsey if using db_file_path argument.

    :param fill_missing_fields: boolean. If truthy, missing fields will be filled in
                                with their default value. Otherwise, missing fields
                                throw an Exception.

    :return: a PanDat object populated by the matching tables.

    caveats: Exactly one of db_file_path and con must be truthy.
             Missing tables are reported with a print and then left to
             _clean_pandat_creator, but missing fields on matching tables throw
             an exception (unless fill_missing_fields is truthy).
             Table names are matched with case-space insensitivity, but spaces
             are respected for field names.
             (ticdat supports whitespace in field names but not table names).
    """
    verify(bool(db_file_path) != bool(con),
           "use either the con argument or the db_file_path argument but not both")
    if db_file_path:
        verify(os.path.exists(db_file_path) and not os.path.isdir(db_file_path),
               "%s not a file path" % db_file_path)

    # a caller-supplied connection is wrapped so both cases share one with-statement
    if db_file_path:
        manager = _sql_con(db_file_path)
    else:
        manager = _DummyContextManager(con)

    frames = {}
    with manager as opened:
        live_con = con or opened
        for table, sql_name in self._get_table_names(live_con).items():
            frames[table] = pd.read_sql(sql="Select * from [%s]" % sql_name, con=live_con)

    missing_fields = set()
    for table, frame in frames.items():
        for field in all_fields(self.pan_dat_factory, table):
            if field not in frame.columns:
                missing_fields.add((table, field))

    if fill_missing_fields:
        for table, field in missing_fields:
            frames[table][field] = self.pan_dat_factory.default_values[table][field]
    verify(fill_missing_fields or not missing_fields,
           "The following are (table, field) pairs missing from the %s file.\n%s" %
           (db_file_path, missing_fields))

    missing_tables = sorted(set(self.pan_dat_factory.all_tables).difference(frames))
    if missing_tables:
        print("The following table names could not be found in the SQLite database.\n%s\n" %
              "\n".join(missing_tables))
    return _clean_pandat_creator(self.pan_dat_factory, frames)
def create_pan_dat(self, xls_file_path, fill_missing_fields=False):
    """
    Create a PanDat object from an Excel file

    :param xls_file_path: An Excel file containing sheets whose names match
                          the table names in the schema.

    :param fill_missing_fields: boolean. If truthy, missing fields will be filled in
                                with their default value. Otherwise, missing fields
                                throw an Exception.

    :return: a PanDat object populated by the matching sheets.

    caveats: Missing sheets are reported with a print and then left to
             _clean_pandat_creator, but missing fields on matching sheets throw
             an Exception (unless fill_missing_fields is truthy).
             Table names are matched to sheets with case-space insensitivity, but
             spaces and case are respected for field names.
             (ticdat supports whitespace in field names but not table names).

    Note that if you save a DataFrame to excel and then recover it, the type of data might change.
    For example
        df = pd.DataFrame({"a":["100", "200", "300"]})
        df.to_excel("something.xlsx")
        df2 = pd.read_excel("something.xlsx")
    results in a numeric column in df2. To address this, you need to use set_data_type for your
    PanDatFactory.

    This problem is even worse with df = pd.DataFrame({"a":["0100", "1200", "2300"]})
    """
    frames = {}
    for table, sheet in self._get_sheet_names(xls_file_path).items():
        # dtype comes from the factory so declared types survive the read
        read_dtype = self.pan_dat_factory._dtypes_for_pandas_read(table)
        frames[table] = pd.read_excel(xls_file_path, sheet, dtype=read_dtype)

    missing_tables = {t for t in self.pan_dat_factory.all_tables if t not in frames}
    if missing_tables:
        print("The following table names could not be found in the %s file.\n%s\n" %
              (xls_file_path, "\n".join(missing_tables)))

    missing_fields = set()
    for table, frame in frames.items():
        for field in all_fields(self.pan_dat_factory, table):
            if field not in frame.columns:
                missing_fields.add((table, field))

    if fill_missing_fields:
        for table, field in missing_fields:
            frames[table][field] = self.pan_dat_factory.default_values[table][field]
    verify(fill_missing_fields or not missing_fields,
           "The following are (table, field) pairs missing from the %s file.\n%s" %
           (xls_file_path, missing_fields))
    return _clean_pandat_creator(self.pan_dat_factory, frames, print_missing_tables=True)
def create_pan_dat(self, path_or_buf, fill_missing_fields=False, orient='split', **kwargs):
    """
    Create a PanDat object from a JSON file or a JSON string

    :param path_or_buf: either the path of an existing JSON file, or a string
                        holding the JSON itself. Anything that is not a string
                        is rejected.

    :param fill_missing_fields: boolean. If truthy, missing fields will be filled in
                                with their default value. Otherwise, missing fields
                                throw an Exception.

    :param orient: Indication of expected JSON string format. See pandas.read_json
                   for more details.

    :param kwargs: additional named arguments to pass to pandas.read_json
                   ("orient" must not be among them)

    :return: a PanDat object populated by the matching tables. The result is
             checked with good_pan_dat_object inside an assert before being
             returned.

    caveats: The JSON must decode to a dictionary whose values are themselves
             dictionaries.
             Table names are matched with case-space insensitivity, but spaces
             are respected for field names.
             (ticdat supports whitespace in field names but not table names).
             +- "inf", "-inf" strings will be converted to +-float("inf")
    """
    # Check the type before touching the file system: os.path.exists raises
    # TypeError for most non-strings and treats an int as a file descriptor.
    if stringish(path_or_buf) and os.path.exists(path_or_buf):
        verify(os.path.isfile(path_or_buf),
               "%s appears to be a directory and not a file." % path_or_buf)
        with open(path_or_buf, "r") as f:
            loaded_dict = json.load(f)
    else:
        # one-element tuple keeps the formatting safe when path_or_buf is itself a tuple
        verify(stringish(path_or_buf), "%s isn't a string" % (path_or_buf,))
        loaded_dict = json.loads(path_or_buf)

    verify(dictish(loaded_dict), "the json.load result doesn't resolve to a dictionary")
    verify(all(map(dictish, loaded_dict.values())),
           "the json.load result doesn't resolve to a dictionary whose values are themselves dictionaries")

    tbl_names = self._get_table_names(loaded_dict)
    verify("orient" not in kwargs, "orient should be passed as a non-kwargs argument")
    rtn = {t: pd.read_json(json.dumps(loaded_dict[f]), orient=orient, **kwargs)
           for t, f in tbl_names.items()}

    missing_fields = {(t, f) for t in rtn for f in all_fields(self.pan_dat_factory, t)
                      if f not in rtn[t].columns}
    if fill_missing_fields:
        for t, f in missing_fields:
            rtn[t][f] = self.pan_dat_factory.default_values[t][f]
    verify(fill_missing_fields or not missing_fields,
           "The following (table, field) pairs are missing fields.\n%s" %
           [(t, f) for t, f in missing_fields])

    # infinity travels through JSON as text, so turn it back into floats
    for v in rtn.values():
        v.replace("inf", float("inf"), inplace=True)
        v.replace("-inf", -float("inf"), inplace=True)

    rtn = self.pan_dat_factory.PanDat(**rtn)
    msg = []
    assert self.pan_dat_factory.good_pan_dat_object(rtn, msg.append), str(msg)
    return rtn
def create_pan_dat(self, db_file_path, con=None, fill_missing_fields=False):
    """
    Create a PanDat object from a SQLite database file

    :param db_file_path: A SQLite DB File. Set to falsey if using con argument

    :param con: sqlalchemy.engine.Engine or sqlite3.Connection.
                Set to falsey if using db_file_path argument.

    :param fill_missing_fields: boolean. If truthy, missing fields will be filled in
                                with their default value. Otherwise, missing fields
                                throw an Exception.

    :return: a PanDat object populated by the matching tables. The result is
             checked with good_pan_dat_object inside an assert before being
             returned.

    caveats: Exactly one of db_file_path and con must be truthy.
             Table names are matched with case-space insensitivity, but spaces
             are respected for field names.
             (ticdat supports whitespace in field names but not table names).
    """
    verify(bool(db_file_path) != bool(con),
           "use either the con argument or the db_file_path argument but not both")
    if db_file_path:
        verify(os.path.exists(db_file_path) and not os.path.isdir(db_file_path),
               "%s not a file path"%db_file_path)

    # a caller-supplied connection is wrapped so both cases share one with-statement
    if db_file_path:
        manager = _sql_con(db_file_path)
    else:
        manager = _DummyContextManager(con)

    frames = {}
    with manager as opened:
        live_con = con or opened
        for table, sql_name in self._get_table_names(live_con).items():
            frames[table] = pd.read_sql(sql="Select * from [%s]"%sql_name, con=live_con)

    missing_fields = set()
    for table, frame in frames.items():
        for field in all_fields(self.pan_dat_factory, table):
            if field not in frame.columns:
                missing_fields.add((table, field))

    if fill_missing_fields:
        for table, field in missing_fields:
            frames[table][field] = self.pan_dat_factory.default_values[table][field]
    verify(fill_missing_fields or not missing_fields,
           "The following are (table, field) pairs missing from the %s file.\n%s" %
           (db_file_path, missing_fields))

    pan_dat = self.pan_dat_factory.PanDat(**frames)
    problems = []
    assert self.pan_dat_factory.good_pan_dat_object(pan_dat, problems.append), str(problems)
    return pan_dat
def create_pan_dat(self, xls_file_path, fill_missing_fields=False):
    """
    Create a PanDat object from an Excel file

    :param xls_file_path: An Excel file containing sheets whose names match
                          the table names in the schema.

    :param fill_missing_fields: boolean. If truthy, missing fields will be filled in
                                with their default value. Otherwise, missing fields
                                throw an Exception.

    :return: a PanDat object populated by the matching sheets. The result is
             checked with good_pan_dat_object inside an assert before being
             returned.

    caveats: Missing sheets are reported with a print, but missing fields on
             matching sheets throw an Exception (unless fill_missing_fields
             is truthy).
             Table names are matched to sheets with case-space insensitivity, but
             spaces and case are respected for field names.
             (ticdat supports whitespace in field names but not table names).
    """
    frames = {}
    for table, sheet in self._get_sheet_names(xls_file_path).items():
        frames[table] = pd.read_excel(xls_file_path, sheet)

    missing_tables = {t for t in self.pan_dat_factory.all_tables if t not in frames}
    if missing_tables:
        print("The following table names could not be found in the %s file.\n%s\n"%
              (xls_file_path, "\n".join(missing_tables)))

    missing_fields = set()
    for table, frame in frames.items():
        for field in all_fields(self.pan_dat_factory, table):
            if field not in frame.columns:
                missing_fields.add((table, field))

    if fill_missing_fields:
        for table, field in missing_fields:
            frames[table][field] = self.pan_dat_factory.default_values[table][field]
    verify(fill_missing_fields or not missing_fields,
           "The following are (table, field) pairs missing from the %s file.\n%s" %
           (xls_file_path, missing_fields))

    pan_dat = self.pan_dat_factory.PanDat(**frames)
    problems = []
    assert self.pan_dat_factory.good_pan_dat_object(pan_dat, problems.append), str(problems)
    return pan_dat
def create_pan_dat(self, path_or_buf, fill_missing_fields=False, orient='split', **kwargs):
    """
    Create a PanDat object from a JSON file or string

    :param path_or_buf: either the path of an existing JSON file, or a string
                        holding the JSON itself. Anything that is not a string
                        is rejected.

    :param fill_missing_fields: boolean. If truthy, missing fields will be filled in
                                with their default value. Otherwise, missing fields
                                throw an Exception.

    :param orient: Indication of expected JSON string format. See pandas.read_json
                   for more details.

    :param kwargs: additional named arguments to pass to pandas.read_json
                   ("orient" must not be among them). If "dtype" is not among
                   them, it is supplied per table from the PanDatFactory.

    :return: a PanDat object populated by the matching tables.

    caveats: Missing tables are reported with a print and then left to
             _clean_pandat_creator.
             The JSON must decode to a dictionary whose values are themselves
             dictionaries.
             Table names are matched with case-space insensitivity, but spaces
             are respected for field names.
             (ticdat supports whitespace in field names but not table names).

    Note that if you save a DataFrame to json and then recover it, the type of data might change.
    Specifically, text that looks numeric might be recovered as a number, to include the loss of leading zeros.
    To address this, you need to either use set_data_type for your PanDatFactory, or specify "dtype" in kwargs.
    (The former is obviously better).
    """
    if stringish(path_or_buf) and os.path.exists(path_or_buf):
        verify(os.path.isfile(path_or_buf),
               "%s appears to be a directory and not a file." % path_or_buf)
        with open(path_or_buf, "r") as f:
            loaded_dict = json.load(f)
    else:
        # one-element tuple keeps the formatting safe when path_or_buf is itself a tuple
        verify(stringish(path_or_buf), "%s isn't a string" % (path_or_buf,))
        loaded_dict = json.loads(path_or_buf)

    verify(dictish(loaded_dict), "the json.load result doesn't resolve to a dictionary")
    verify(all(map(dictish, loaded_dict.values())),
           "the json.load result doesn't resolve to a dictionary whose values are themselves dictionaries")

    tbl_names = self._get_table_names(loaded_dict)
    verify("orient" not in kwargs, "orient should be passed as a non-kwargs argument")

    rtn = {}
    for t, f in tbl_names.items():
        kwargs_ = dict(kwargs)
        if "dtype" not in kwargs_:
            # a caller-supplied dtype wins; otherwise use the factory's declared types
            kwargs_["dtype"] = self.pan_dat_factory._dtypes_for_pandas_read(t)
        rtn[t] = pd.read_json(json.dumps(loaded_dict[f]), orient=orient, **kwargs_)

    missing_fields = {(t, f) for t in rtn for f in all_fields(self.pan_dat_factory, t)
                      if f not in rtn[t].columns}
    if fill_missing_fields:
        for t, f in missing_fields:
            rtn[t][f] = self.pan_dat_factory.default_values[t][f]
    verify(fill_missing_fields or not missing_fields,
           "The following (table, field) pairs are missing fields.\n%s" %
           [(t, f) for t, f in missing_fields])

    missing_tables = sorted(set(self.pan_dat_factory.all_tables).difference(rtn))
    if missing_tables:
        # this reader handles JSON; the message used to name the SQLite database
        print("The following table names could not be found in the JSON data.\n%s\n" %
              "\n".join(missing_tables))
    return _clean_pandat_creator(self.pan_dat_factory, rtn, json_read=True)