def write_data_dictionary(output_dir):
    """
    Write table_name, number of rows, columns, and bytes for each checkpointed table.

    Writes a human-readable data_dict.txt (shape and dtypes per table) and a
    data_dict.csv summary (table_name, rows, columns, bytes) into output_dir.

    Parameters
    ----------
    output_dir: str
    """
    pd.options.display.max_columns = 500
    pd.options.display.max_rows = 100

    output_tables = pipeline.checkpointed_tables()

    records = []

    # write data dictionary for all checkpointed_tables
    with open(os.path.join(output_dir, 'data_dict.txt'), 'w') as file:
        for table_name in output_tables:
            df = inject.get_table(table_name, None).to_frame()

            # BUG FIX: original used Python 2 `print >> file` syntax,
            # which is a SyntaxError under Python 3
            print("\n### %s %s" % (table_name, df.shape), file=file)
            print(df.dtypes, file=file)

            rows, columns = df.shape
            # renamed from `bytes` to avoid shadowing the builtin
            num_bytes = df.memory_usage(index=True).sum()
            records.append((table_name, rows, columns, num_bytes))

    df = pd.DataFrame.from_records(
        records, columns=['table_name', 'rows', 'columns', 'bytes'])
    df.sort_values(by='table_name', inplace=True)
    df.to_csv(os.path.join(output_dir, 'data_dict.csv'))
def previous_write_data_dictionary(output_dir):
    """
    Write table_name, number of rows, columns, and bytes for each checkpointed table.

    Parameters
    ----------
    output_dir: str
    """
    model_settings = config.read_model_settings('write_data_dictionary')
    txt_format = model_settings.get('txt_format', 'data_dict.txt')
    # NOTE(review): csv_format is read but never used in this version
    csv_format = model_settings.get('csv_format', 'data_dict.csv')

    # guard clause: nothing to do when no text output file is configured
    if not txt_format:
        return

    out_path = config.output_file_path(txt_format)

    pd.options.display.max_columns = 500
    pd.options.display.max_rows = 100

    checkpointed = pipeline.checkpointed_tables()

    # write data dictionary for all checkpointed_tables
    with open(out_path, 'w') as output_file:
        for name in checkpointed:
            frame = inject.get_table(name, None).to_frame()
            print("\n### %s %s" % (name, frame.shape), file=output_file)
            print('index:', frame.index.name, frame.index.dtype, file=output_file)
            print(frame.dtypes, file=output_file)
def write_data_dictionary(output_dir):
    """
    Append a short schema dump (shape and dtypes) for each checkpointed table
    to data_dict.csv in output_dir.

    NOTE(review): the file is opened in append mode ('a') and the content is
    free-form text, not CSV — presumably intentional for accumulating across
    runs; confirm against callers.

    Parameters
    ----------
    output_dir: str
    """
    output_tables = pipeline.checkpointed_tables()

    # write data dictionary for all checkpointed_tables
    with open(os.path.join(output_dir, 'data_dict.csv'), 'a') as file:
        for table_name in output_tables:
            df = inject.get_table(table_name, None).to_frame()

            # BUG FIX: original used Python 2 `print >> file` syntax,
            # which is a SyntaxError under Python 3
            print("\n### %s (%s)\n" % (table_name, df.shape), df.dtypes, file=file)
def write_tables(output_dir):
    """
    Write checkpointed pipeline tables as csv files in output_dir, optionally
    filtered by the 'output_tables' setting (action 'include' or 'skip').

    Parameters
    ----------
    output_dir: str

    Raises
    ------
    RuntimeError
        If the output_tables action is neither 'include' nor 'skip'.
    """
    output_tables_settings_name = 'output_tables'
    output_tables_settings = setting(output_tables_settings_name)
    output_tables = pipeline.checkpointed_tables()

    if output_tables_settings is not None:
        action = output_tables_settings.get('action')
        tables = output_tables_settings.get('tables')

        if action not in ['include', 'skip']:
            # BUG FIX: original raised a bare string, which is a TypeError in Python 3
            raise RuntimeError(
                "expected %s action '%s' to be either 'include' or 'skip'" %
                (output_tables_settings_name, action))

        if action == 'include':
            output_tables = tables
        elif action == 'skip':
            output_tables = [t for t in output_tables if t not in tables]

    # should provide option to also write checkpoints?
    # output_tables.append("checkpoints.csv")

    for table_name in output_tables:
        table = inject.get_table(table_name, None)

        if table is None:
            # logger.warn is deprecated in favor of logger.warning
            logger.warning("Skipping '%s': Table not found." % table_name)
            continue

        df = table.to_frame()
        file_name = "%s.csv" % table_name
        logger.info("writing output file %s" % file_name)
        file_path = os.path.join(output_dir, file_name)

        # include the index only when it carries a name
        write_index = df.index.name is not None
        df.to_csv(file_path, index=write_index)
def write_data_dictionary(output_dir):
    """
    Write table_name, number of rows, columns, and bytes for each checkpointed table.

    Parameters
    ----------
    output_dir: str
    """
    pd.options.display.max_columns = 500
    pd.options.display.max_rows = 100

    checkpointed = pipeline.checkpointed_tables()

    # binary mode on Python 2, text mode on Python 3
    file_mode = 'w' if sys.version_info >= (3,) else 'wb'

    # write data dictionary for all checkpointed_tables
    with open(config.output_file_path('data_dict.txt'), file_mode) as out:
        for name in checkpointed:
            frame = inject.get_table(name, None).to_frame()
            print("\n### %s %s" % (name, frame.shape), file=out)
            print('index:', frame.index.name, frame.index.dtype, file=out)
            print(frame.dtypes, file=out)
def write_data_dictionary(output_dir):
    """
    Write table_name, number of rows, columns, and bytes for each checkpointed table.

    Parameters
    ----------
    output_dir: str
    """
    pd.options.display.max_columns = 500
    pd.options.display.max_rows = 100

    # Python 2 needs binary mode; Python 3 uses text mode
    open_mode = 'wb' if sys.version_info < (3,) else 'w'
    dict_path = config.output_file_path('data_dict.txt')

    # write data dictionary for all checkpointed_tables
    with open(dict_path, open_mode) as output_file:
        for table_name in pipeline.checkpointed_tables():
            df = inject.get_table(table_name, None).to_frame()
            header = "\n### %s %s" % (table_name, df.shape)
            print(header, file=output_file)
            print('index:', df.index.name, df.index.dtype, file=output_file)
            print(df.dtypes, file=output_file)
def write_data_dictionary(output_dir):
    """
    Write table schema for all tables

    model settings
        txt_format: output text file name (default data_dict.txt) or empty to suppress txt output
        csv_format: output csv file name (default data_dict.csv) or empty to suppress csv output
        schema_tables: list of tables to include in output (defaults to all checkpointed tables)

    for each table, write column names, dtype, and checkpoint added

    text format writes individual table schemas to a single text file
    csv format writes all tables together with an additional table_name column

    Parameters
    ----------
    output_dir: str
    """
    model_settings = config.read_model_settings('write_data_dictionary')
    txt_format = model_settings.get('txt_format', 'data_dict.txt')
    csv_format = model_settings.get('csv_format', 'data_dict.csv')

    if not (csv_format or txt_format):
        # no f-string needed: the message has no placeholders
        logger.warning("write_data_dictionary step invoked but neither "
                       "'txt_format' nor 'csv_format' specified")
        return

    table_names = pipeline.checkpointed_tables()

    # use table_names list from model_settings, if provided
    schema_tables = model_settings.get('tables', None)
    if schema_tables:
        table_names = [c for c in schema_tables if c in table_names]

    # initialize schema as dict of dataframe[table_name, column_name, dtype, checkpoint]
    schema = dict()
    final_shapes = dict()
    for table_name in table_names:
        df = pipeline.get_table(table_name)

        final_shapes[table_name] = df.shape

        # surface a named index as a regular column so it appears in the schema
        if df.index.name and df.index.name not in df.columns:
            df = df.reset_index()
        info = df.dtypes.astype(str).to_frame('dtype').reset_index().rename(
            columns={'index': 'column_name'})
        info['checkpoint'] = ''

        info.insert(loc=0, column='table_name', value=table_name)
        schema[table_name] = info

    # annotate schema.info with name of checkpoint columns were first seen
    for _, row in pipeline.get_checkpoints().iterrows():
        checkpoint_name = row[pipeline.CHECKPOINT_NAME]

        for table_name in table_names:
            # no change to table in this checkpoint
            if row[table_name] != checkpoint_name:
                continue

            # get the checkpointed version of the table
            df = pipeline.get_table(table_name, checkpoint_name)

            if df.index.name and df.index.name not in df.columns:
                df = df.reset_index()

            info = schema.get(table_name, None)

            # tag any new columns with checkpoint name
            prev_columns = info[info.checkpoint != ''].column_name.values
            new_cols = [c for c in df.columns.values if c not in prev_columns]
            # TYPO FIX: was `is_new_column_this_checkpoont`
            is_new_column_this_checkpoint = info.column_name.isin(new_cols)
            info.checkpoint = np.where(
                is_new_column_this_checkpoint, checkpoint_name, info.checkpoint)

            schema[table_name] = info

    schema_df = pd.concat(schema.values())

    if csv_format:
        schema_df.to_csv(config.output_file_path(csv_format), header=True, index=False)

    if txt_format:
        with open(config.output_file_path(txt_format), 'w') as output_file:

            # get max schema column widths from omnibus table
            col_width = {c: schema_df[c].str.len().max() + 2 for c in schema_df}

            for table_name in table_names:
                info = schema.get(table_name, None)

                columns_to_print = ['column_name', 'dtype', 'checkpoint']
                info = info[columns_to_print].copy()

                # normalize schema columns widths across all table schemas
                # for unified output formatting
                for c in info:
                    info[c] = info[c].str.pad(col_width[c], side='right')
                info.columns = [c.ljust(col_width[c]) for c in info.columns]
                info = info.to_string(index=False)

                print(f"###\n### {table_name} {final_shapes[table_name]}\n###\n",
                      file=output_file)
                print(f"{info}\n", file=output_file)
def write_tables(output_dir):
    """
    Write pipeline tables as csv files (in output directory) as specified
    by output_tables list in settings file.

    'output_tables' can specify either a list of output tables to include or to skip
    if no output_tables list is specified, then no checkpointed tables will be written

    To write all output tables EXCEPT the households and persons tables:

    ::

      output_tables:
        action: skip
        tables:
          - households
          - persons

    To write ONLY the households table:

    ::

      output_tables:
        action: include
        tables:
           - households

    To write tables into a single HDF5 store instead of individual CSVs, use the h5_store flag:

    ::

      output_tables:
        h5_store: True
        action: include
        tables:
           - households

    Parameters
    ----------
    output_dir: str

    Raises
    ------
    RuntimeError
        If the output_tables action is neither 'include' nor 'skip'.
    """
    output_tables_settings_name = 'output_tables'
    output_tables_settings = setting(output_tables_settings_name)

    if output_tables_settings is None:
        logger.info("No output_tables specified in settings file. Nothing to write.")
        return

    action = output_tables_settings.get('action')
    tables = output_tables_settings.get('tables')
    prefix = output_tables_settings.get('prefix', 'final_')
    h5_store = output_tables_settings.get('h5_store', False)
    sort = output_tables_settings.get('sort', False)

    checkpointed_tables = pipeline.checkpointed_tables()
    if action == 'include':
        output_tables_list = tables
    elif action == 'skip':
        output_tables_list = [t for t in checkpointed_tables if t not in tables]
    else:
        # BUG FIX: original raised a bare string, which is a TypeError in Python 3
        raise RuntimeError(
            "expected %s action '%s' to be either 'include' or 'skip'" %
            (output_tables_settings_name, action))

    for table_name in output_tables_list:

        if table_name == 'checkpoints':
            df = pipeline.get_checkpoints()
        else:
            if table_name not in checkpointed_tables:
                logger.warning("Skipping '%s': Table not found." % table_name)
                continue
            df = pipeline.get_table(table_name)

            if sort:
                traceable_table_indexes = inject.get_injectable('traceable_table_indexes', {})

                if df.index.name in traceable_table_indexes:
                    df = df.sort_index()
                    logger.debug(
                        f"write_tables sorting {table_name} on index {df.index.name}")
                else:
                    # find all registered columns we can use to sort this table
                    # (they are ordered appropriately in traceable_table_indexes)
                    sort_columns = [c for c in traceable_table_indexes if c in df.columns]
                    if len(sort_columns) > 0:
                        df = df.sort_values(by=sort_columns)
                        logger.debug(
                            f"write_tables sorting {table_name} on columns {sort_columns}")
                    else:
                        logger.debug(
                            f"write_tables sorting {table_name} on unrecognized index "
                            f"{df.index.name}")
                        df = df.sort_index()

        if h5_store:
            file_path = config.output_file_path('%soutput_tables.h5' % prefix)
            df.to_hdf(file_path, key=table_name, mode='a', format='fixed')
        else:
            file_name = "%s%s.csv" % (prefix, table_name)
            file_path = config.output_file_path(file_name)

            # include the index if it has a name or is a MultiIndex
            write_index = df.index.name is not None or isinstance(df.index, pd.MultiIndex)
            df.to_csv(file_path, index=write_index)
def write_tables(output_dir):
    """
    Write pipeline tables as csv files (in output directory) as specified
    by output_tables list in settings file.

    'output_tables' can specify either a list of output tables to include or to skip
    if no output_tables list is specified, then no checkpointed tables will be written

    To write all output tables EXCEPT the households and persons tables:

    ::

      output_tables:
        action: skip
        tables:
          - households
          - persons

    To write ONLY the households table:

    ::

      output_tables:
        action: include
        tables:
           - households

    To write tables into a single HDF5 store instead of individual CSVs, use the h5_store flag:

    ::

      output_tables:
        h5_store: True
        action: include
        tables:
           - households

    Parameters
    ----------
    output_dir: str

    Raises
    ------
    RuntimeError
        If the output_tables action is neither 'include' nor 'skip'.
    """
    output_tables_settings_name = 'output_tables'
    output_tables_settings = setting(output_tables_settings_name)

    if output_tables_settings is None:
        logger.info("No output_tables specified in settings file. Nothing to write.")
        return

    action = output_tables_settings.get('action')
    tables = output_tables_settings.get('tables')
    prefix = output_tables_settings.get('prefix', 'final_')
    h5_store = output_tables_settings.get('h5_store', False)

    if action not in ['include', 'skip']:
        # BUG FIX: original raised a bare string, which is a TypeError in Python 3
        raise RuntimeError(
            "expected %s action '%s' to be either 'include' or 'skip'" %
            (output_tables_settings_name, action))

    checkpointed_tables = pipeline.checkpointed_tables()
    if action == 'include':
        output_tables_list = tables
    elif action == 'skip':
        output_tables_list = [t for t in checkpointed_tables if t not in tables]

    for table_name in output_tables_list:

        if table_name == 'checkpoints':
            df = pipeline.get_checkpoints()
        else:
            if table_name not in checkpointed_tables:
                logger.warning("Skipping '%s': Table not found." % table_name)
                continue
            df = pipeline.get_table(table_name)

        if h5_store:
            file_path = config.output_file_path('%soutput_tables.h5' % prefix)
            df.to_hdf(file_path, key=table_name, mode='a', format='fixed')
        else:
            file_name = "%s%s.csv" % (prefix, table_name)
            file_path = config.output_file_path(file_name)

            # include the index if it has a name or is a MultiIndex
            write_index = df.index.name is not None or isinstance(df.index, pd.MultiIndex)
            df.to_csv(file_path, index=write_index)
MODELS = setting('models')

# If you provide a resume_after argument to pipeline.run
# the pipeline manager will attempt to load checkpointed tables from the checkpoint store
# and resume pipeline processing on the next submodel step after the specified checkpoint
resume_after = setting('resume_after', None)

if resume_after:
    # BUG FIX: original used the Python 2 print statement, a SyntaxError in Python 3
    print("resume_after", resume_after)

pipeline.run(models=MODELS, resume_after=resume_after)

print("\n#### run completed")

# write final versions of all checkpointed dataframes to CSV files to review results
for table_name in pipeline.checkpointed_tables():
    file_name = "final_%s_table.csv" % table_name
    file_path = os.path.join(orca.get_injectable("output_dir"), file_name)
    pipeline.get_table(table_name).to_csv(file_path)

# write checkpoints
file_path = os.path.join(orca.get_injectable("output_dir"), "checkpoints.csv")
pipeline.get_checkpoints().to_csv(file_path)

# tables will no longer be available after pipeline is closed
pipeline.close_pipeline()

t0 = print_elapsed_time("all models", t0)
def write_tables(output_dir):
    """
    Write pipeline tables as csv files (in output directory) as specified
    by output_tables list in settings file.

    Pipeline tables are intermediate computational tables, not to be
    confused with the synthetic population tables written by the
    write_synthetic_population step.

    'output_tables' can specify either a list of output tables to include or to skip
    if no output_tables list is specified, then no checkpointed tables will be written

    Intermediate tables likely to be of particular interest or utility are the controls
    and weights tables for the various geographies. For example, if one of your
    geographies is TRACT, then:
    TRACT_controls has control totals for every TRACT (and aggregated subzone) controls.
    TRACT_weights has balanced_weight and integer_weight for every TRACT.

    To write all output tables EXCEPT the households and persons tables:

    ::

      output_tables:
        action: skip
        tables:
          - households
          - persons

    To write ONLY the expanded_household_ids table:

    ::

      output_tables:
        action: include
        tables:
           - expanded_household_ids

    Parameters
    ----------
    output_dir: str

    Raises
    ------
    RuntimeError
        If the output_tables action is neither 'include' nor 'skip'.
    """
    output_tables_settings_name = 'output_tables'
    output_tables_settings = setting(output_tables_settings_name)
    output_tables_list = pipeline.checkpointed_tables()

    if output_tables_settings is None:
        logger.info("No output_tables specified in settings file. Nothing to write.")
        return

    action = output_tables_settings.get('action')
    tables = output_tables_settings.get('tables')

    if action not in ['include', 'skip']:
        raise RuntimeError(
            "expected %s action '%s' to be either 'include' or 'skip'" %
            (output_tables_settings_name, action))

    if action == 'include':
        output_tables_list = tables
    elif action == 'skip':
        output_tables_list = [t for t in output_tables_list if t not in tables]

    # should provide option to also write checkpoints?
    # output_tables_list.append("checkpoints.csv")

    for table_name in output_tables_list:
        table = inject.get_table(table_name, None)

        if table is None:
            # BUG FIX: logger.warn is deprecated in favor of logger.warning
            logger.warning("Skipping '%s': Table not found." % table_name)
            continue

        df = table.to_frame()
        file_name = "%s.csv" % table_name
        logger.info("writing output file %s" % file_name)
        file_path = os.path.join(output_dir, file_name)

        # include the index only when it carries a name
        write_index = df.index.name is not None
        df.to_csv(file_path, index=write_index)
def write_tables(output_dir):
    """
    Write pipeline tables as csv files (in output directory) as specified
    by output_tables list in settings file.

    'output_tables' can specify either a list of output tables to include or to skip
    if no output_tables list is specified, then no checkpointed tables will be written

    To write all output tables EXCEPT the households and persons tables:

    ::

      output_tables:
        action: skip
        tables:
          - households
          - persons

    To write ONLY the households table:

    ::

      output_tables:
        action: include
        tables:
           - households

    Parameters
    ----------
    output_dir: str

    Raises
    ------
    RuntimeError
        If the output_tables action is neither 'include' nor 'skip'.
    """
    output_tables_settings_name = 'output_tables'
    output_tables_settings = setting(output_tables_settings_name)

    if output_tables_settings is None:
        logger.info("No output_tables specified in settings file. Nothing to write.")
        return

    action = output_tables_settings.get('action')
    tables = output_tables_settings.get('tables')
    prefix = output_tables_settings.get('prefix', 'final_')

    if action not in ['include', 'skip']:
        # BUG FIX: original raised a bare string, which is a TypeError in Python 3
        raise RuntimeError(
            "expected %s action '%s' to be either 'include' or 'skip'" %
            (output_tables_settings_name, action))

    checkpointed_tables = pipeline.checkpointed_tables()
    if action == 'include':
        output_tables_list = tables
    elif action == 'skip':
        output_tables_list = [t for t in checkpointed_tables if t not in tables]

    for table_name in output_tables_list:

        if table_name == 'checkpoints':
            df = pipeline.get_checkpoints()
        else:
            if table_name not in checkpointed_tables:
                logger.warning("Skipping '%s': Table not found." % table_name)
                continue
            df = pipeline.get_table(table_name)

        file_name = "%s%s.csv" % (prefix, table_name)
        file_path = config.output_file_path(file_name)

        # include the index if it has a name or is a MultiIndex
        # BUG FIX: pd.core.index.MultiIndex is a private path removed in modern
        # pandas; pd.MultiIndex is the public equivalent
        write_index = df.index.name is not None or isinstance(df.index, pd.MultiIndex)
        df.to_csv(file_path, index=write_index)
def write_tables(output_dir):
    """
    Write pipeline tables as csv files (in output directory) as specified
    by output_tables list in settings file.

    Pipeline tables are intermediate computational tables, not to be
    confused with the synthetic population tables written by the
    write_synthetic_population step.

    'output_tables' can specify either a list of output tables to include or to skip
    if no output_tables list is specified, then no checkpointed tables will be written

    Intermediate tables likely to be of particular interest or utility are the controls
    and weights tables for the various geographies. For example, if one of your
    geographies is TRACT, then:
    TRACT_controls has control totals for every TRACT (and aggregated subzone) controls.
    TRACT_weights has balanced_weight and integer_weight for every TRACT.

    Also writes a summary_melt.csv combining the control/result/diff columns
    of every table that has them, melted to long format.

    To write all output tables EXCEPT the households and persons tables:

    ::

      output_tables:
        action: skip
        tables:
          - households
          - persons

    To write ONLY the expanded_household_ids table:

    ::

      output_tables:
        action: include
        tables:
           - expanded_household_ids

    Parameters
    ----------
    output_dir: str

    Raises
    ------
    RuntimeError
        If the output_tables action is neither 'include' nor 'skip'.
    """
    output_tables_settings_name = 'output_tables'
    output_tables_settings = setting(output_tables_settings_name)
    output_tables_list = pipeline.checkpointed_tables()

    if output_tables_settings is None:
        logger.info("No output_tables specified in settings file. Nothing to write.")
        return

    action = output_tables_settings.get('action')
    tables = output_tables_settings.get('tables')

    if action not in ['include', 'skip']:
        raise RuntimeError(
            "expected %s action '%s' to be either 'include' or 'skip'" %
            (output_tables_settings_name, action))

    if action == 'include':
        output_tables_list = tables
    elif action == 'skip':
        output_tables_list = [t for t in output_tables_list if t not in tables]

    logger.debug("output_tables_list: %s" % str(output_tables_list))

    # should provide option to also write checkpoints?
    # output_tables_list.append("checkpoints.csv")

    # columns: geography, id, variable, control, result, diff
    summary_melt_df = pd.DataFrame()

    for table_name in output_tables_list:
        table = inject.get_table(table_name, None)

        if table is None:
            # BUG FIX: logger.warn is deprecated in favor of logger.warning
            logger.warning("Skipping '%s': Table not found." % table_name)
            continue

        df = table.to_frame()
        file_name = "%s.csv" % table_name
        logger.info("writing output file %s" % file_name)
        file_path = os.path.join(output_dir, file_name)

        write_index = df.index.name is not None
        df.to_csv(file_path, index=write_index)

        try:
            # create the melt
            # find the control variables
            control_vars = []
            for column in list(df.columns.values):
                if column[-8:] == "_control":
                    control_vars.append(column[:-8])
            logger.debug("control variables for melt %s" % str(control_vars))

            control_col_names = list("%s_control" % cv for cv in control_vars)
            result_col_names = list("%s_result" % cv for cv in control_vars)
            diff_col_names = list("%s_diff" % cv for cv in control_vars)

            control_melt_df = df.melt(
                id_vars=["geography", "id"], value_vars=control_col_names,
                value_name="control"
            ).replace(to_replace=dict(zip(control_col_names, control_vars)))
            result_melt_df = df.melt(
                id_vars=["geography", "id"], value_vars=result_col_names,
                value_name="result"
            ).replace(to_replace=dict(zip(result_col_names, control_vars)))
            diff_melt_df = df.melt(
                id_vars=["geography", "id"], value_vars=diff_col_names,
                value_name="diff"
            ).replace(to_replace=dict(zip(diff_col_names, control_vars)))

            melt_df = pd.merge(left=control_melt_df, right=result_melt_df,
                               how="left", on=["geography", "id", "variable"])
            melt_df = pd.merge(left=melt_df, right=diff_melt_df,
                               how="left", on=["geography", "id", "variable"])
            # BUG FIX: DataFrame.append was removed in pandas 2.0; use pd.concat
            summary_melt_df = pd.concat([summary_melt_df, melt_df])
        except Exception:
            # deliberately best-effort: tables without geography/id/control
            # columns are simply skipped.
            # BUG FIX: narrowed from a bare `except:`, which would also
            # swallow KeyboardInterrupt/SystemExit
            pass

    if len(summary_melt_df) > 0:
        file_name = "summary_melt.csv"
        logger.info("writing output file %s" % file_name)
        file_path = os.path.join(output_dir, file_name)
        # NOTE(review): reuses `df` (the last table written) to decide index
        # output, as in the original — presumably intentional; verify
        write_index = df.index.name is not None
        summary_melt_df.to_csv(file_path, index=write_index)
def write_tables(output_dir):
    """
    Write pipeline tables as csv files (in output directory) as specified
    by output_tables list in settings file.

    'output_tables' can specify either a list of output tables to include or to skip
    if no output_tables list is specified, then no checkpointed tables will be written

    To write all output tables EXCEPT the households and persons tables:

    ::

      output_tables:
        action: skip
        tables:
          - households
          - persons

    To write ONLY the households table:

    ::

      output_tables:
        action: include
        tables:
           - households

    Parameters
    ----------
    output_dir: str

    Raises
    ------
    RuntimeError
        If the output_tables action is neither 'include' nor 'skip'.
    """
    output_tables_settings_name = 'output_tables'
    output_tables_settings = setting(output_tables_settings_name)
    output_tables_list = pipeline.checkpointed_tables()

    if output_tables_settings is None:
        logger.info("No output_tables specified in settings file. Nothing to write.")
        return

    action = output_tables_settings.get('action')
    tables = output_tables_settings.get('tables')
    prefix = output_tables_settings.get('prefix', 'final_')

    if action not in ['include', 'skip']:
        # BUG FIX: original raised a bare string, which is a TypeError in Python 3
        raise RuntimeError(
            "expected %s action '%s' to be either 'include' or 'skip'" %
            (output_tables_settings_name, action))

    if action == 'include':
        output_tables_list = tables
    elif action == 'skip':
        output_tables_list = [t for t in output_tables_list if t not in tables]

    # should provide option to also write checkpoints?
    # output_tables_list.append("checkpoints.csv")

    for table_name in output_tables_list:
        table = inject.get_table(table_name, None)

        if table is None:
            # logger.warn is deprecated in favor of logger.warning
            logger.warning("Skipping '%s': Table not found." % table_name)
            continue

        df = table.to_frame()
        file_name = "%s%s.csv" % (prefix, table_name)
        logger.info("writing output file %s" % file_name)
        file_path = os.path.join(output_dir, file_name)

        write_index = df.index.name is not None
        df.to_csv(file_path, index=write_index)

    # write checkpoints when 'include' lists them, or 'skip' does not
    if (action == 'include') == ('checkpoints' in tables):
        # write checkpoints
        file_name = "%s%s.csv" % (prefix, 'checkpoints')
        pipeline.get_checkpoints().to_csv(os.path.join(output_dir, file_name))