def build_network(settings):
    """
    Build a Pandana network from CSV files
    """
    logger.info('building pandana network')

    network_settings_file = settings['network_settings_file']
    if not network_settings_file:
        logger.error("Please specify 'network_settings_file' in settings")
        return

    network_settings = config.read_model_settings(network_settings_file)
    logger.debug('using settings %s' % network_settings)

    nodes = pd.read_csv(config.data_file_path(network_settings['nodes']))
    links = pd.read_csv(config.data_file_path(network_settings['links']))

    nodes.index = nodes[network_settings['nodes-id']]

    network = pdna.Network(nodes[network_settings['nodes-x']],
                           nodes[network_settings['nodes-y']],
                           links[network_settings['links-a']],
                           links[network_settings['links-b']],
                           links[[network_settings['links-impedance']]],
                           twoway=network_settings['twoway'])

    network.save_hdf5(config.output_file_path('pandana_network.h5'))

    return network

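# Illustrative only (not from the source): a sketch of the network_settings that
# build_network() reads, shown as a Python dict. All file and column names below
# are hypothetical placeholders.
example_network_settings = {
    'nodes': 'network_nodes.csv',    # node CSV in data_dir
    'links': 'network_links.csv',    # link CSV in data_dir
    'nodes-id': 'node_id',           # node id column
    'nodes-x': 'x',                  # node x-coordinate column
    'nodes-y': 'y',                  # node y-coordinate column
    'links-a': 'from_node',          # link from-node column
    'links-b': 'to_node',            # link to-node column
    'links-impedance': 'distance',   # impedance column passed to pandana
    'twoway': True,                  # whether links are bidirectional
}
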
def read_saved_shadow_prices(self, model_settings):
    """
    Read saved shadow_prices from csv file in data_dir (so-called warm start).

    Returns None if no saved shadow price file name is specified, or the named file is not found.

    Parameters
    ----------
    model_settings : dict

    Returns
    -------
    shadow_prices : pandas.DataFrame or None
    """
    shadow_prices = None

    # - load saved shadow_prices
    saved_shadow_price_file_name = model_settings.get('SAVED_SHADOW_PRICE_TABLE_NAME')
    if saved_shadow_price_file_name:
        # FIXME - where should we look for this file?
        file_path = config.data_file_path(saved_shadow_price_file_name, mandatory=False)
        if file_path:
            shadow_prices = pd.read_csv(file_path, index_col=0)
            self.saved_shadow_price_file_path = file_path  # informational
            logger.info("loaded saved_shadow_prices from %s" % file_path)
        else:
            logger.warning("Could not find saved_shadow_prices file %s" % saved_shadow_price_file_name)

    return shadow_prices

def override_hh_ids(settings):

    hh_ids_filename = settings.get('hh_ids', None)
    if hh_ids_filename is None:
        return None

    file_path = config.data_file_path(hh_ids_filename, mandatory=False)
    if not file_path:
        logger.error("hh_ids file name '%s' specified in settings not found" % hh_ids_filename)
        return None

    df = pd.read_csv(file_path, comment='#')

    if 'household_id' not in df.columns:
        logger.error("No 'household_id' column in hh_ids file %s" % hh_ids_filename)
        return None

    household_ids = df.household_id.astype(int).unique()

    if len(household_ids) == 0:
        logger.error("No households in hh_ids file %s" % hh_ids_filename)
        return None

    logger.info("Using hh_ids list with %s households from file %s" %
                (len(household_ids), hh_ids_filename))

    return household_ids

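# Illustrative only (not from the source): override_hh_ids() expects a CSV with a
# 'household_id' column; lines beginning with '#' are skipped. Hypothetical file contents:
#
#   # households to simulate
#   household_id
#   1000001
#   1000002
#   1000003
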
def read_pois_table(buffer_zones_settings, network, constants):

    poi_fname = config.data_file_path(buffer_zones_settings['pois'])
    poi_df = pd.read_csv(poi_fname, index_col=False)

    # attach the nearest network node to each point of interest, based on its x/y coordinates
    poi_df['net_node_id'] = network.get_node_ids(
        poi_df[constants['pois-x']].values, poi_df[constants['pois-y']].values)

    return poi_df

def skim_dict(data_dir, settings):

    omx_file_path = config.data_file_path(settings["skims_file"])
    tags_to_load = settings['skim_time_periods']['labels']

    logger.info("loading skim_dict from %s" % (omx_file_path, ))

    # select the skims to load
    skim_info = get_skim_info(omx_file_path, tags_to_load)

    logger.debug("omx_shape %s skim_dtype %s" % (skim_info['omx_shape'], skim_info['dtype']))

    skim_buffers = inject.get_injectable('data_buffers', None)
    if skim_buffers:
        logger.info('Using existing skim_buffers for skims')
    else:
        skim_buffers = buffers_for_skims(skim_info, shared=False)
        load_skims(omx_file_path, skim_info, skim_buffers)

    skim_data = skim_data_from_buffers(skim_buffers, skim_info)

    block_names = list(skim_info['blocks'].keys())
    for i in range(len(skim_data)):
        block_name = block_names[i]
        block_data = skim_data[i]
        logger.info("block_name %s bytes %s (%s)" %
                    (block_name, block_data.nbytes, util.GB(block_data.nbytes)))

    # create skim dict
    skim_dict = skim.SkimDict(skim_data, skim_info)
    skim_dict.offset_mapper.set_offset_int(-1)

    return skim_dict

def initialize_settings(self):

    assert not self.settings_initialized

    settings = config.read_model_settings(ESTIMATION_SETTINGS_FILE_NAME)
    self.enabled = settings.get('enable', 'True')
    self.bundles = settings.get('bundles', [])
    self.model_estimation_table_types = settings.get('model_estimation_table_types', {})
    self.estimation_table_recipes = settings.get('estimation_table_recipes', {})

    if self.enabled:
        self.survey_tables = settings.get('survey_tables', {})
        for table_name, table_info in self.survey_tables.items():
            assert 'file_name' in table_info, \
                "No file name specified for survey_table '%s' in %s" % \
                (table_name, ESTIMATION_SETTINGS_FILE_NAME)
            file_path = config.data_file_path(table_info['file_name'], mandatory=True)
            assert os.path.exists(file_path), \
                "File for survey table '%s' not found: %s" % (table_name, file_path)
            df = pd.read_csv(file_path)
            index_col = table_info.get('index_col')
            if index_col is not None:
                assert index_col in df.columns, \
                    "Index col '%s' not in survey_table '%s' in file: %s" % \
                    (index_col, table_name, file_path)
                df.set_index(index_col, inplace=True)

            # add the table df to survey_tables
            table_info['df'] = df

    self.settings_initialized = True

def tap_skim_dict(data_dir, settings):

    logger.info("loading tap_skim_dict")

    cache_skim_key_values = settings['skim_time_periods']['labels']
    skim_dict = askim.SkimDict()

    for skims_file in settings["tap_skims_files"]:
        skims_file_path = config.data_file_path(skims_file)
        with omx.open_file(skims_file_path) as omx_file:
            add_to_skim_dict(skim_dict, omx_file, cache_skim_key_values)

    return skim_dict

def test_mini_pipeline_run2():

    # the important thing here is that we should get
    # exactly the same results as for test_mini_pipeline_run
    # when we restart pipeline

    setup_dirs()
    inject_settings(households_sample_size=HOUSEHOLDS_SAMPLE_SIZE,
                    read_skim_cache=True)

    # should be able to get this BEFORE pipeline is opened
    checkpoints_df = pipeline.get_checkpoints()
    prev_checkpoint_count = len(checkpoints_df.index)

    # print "checkpoints_df\n%s" % checkpoints_df[['checkpoint_name']]
    assert prev_checkpoint_count == 9

    pipeline.open_pipeline('auto_ownership_simulate')

    regress_mini_auto()

    # try to run a model already in pipeline
    with pytest.raises(RuntimeError) as excinfo:
        pipeline.run_model('auto_ownership_simulate')
    assert "run model 'auto_ownership_simulate' more than once" in str(excinfo.value)

    # and these new ones
    pipeline.run_model('cdap_simulate')
    pipeline.run_model('mandatory_tour_frequency')

    regress_mini_mtf()

    # should be able to get this before pipeline is closed (from existing open store)
    checkpoints_df = pipeline.get_checkpoints()
    assert len(checkpoints_df.index) == prev_checkpoint_count

    # - write list of override_hh_ids to override_hh_ids.csv in data for use in next test
    num_hh_ids = 10
    hh_ids = pipeline.get_table("households").head(num_hh_ids).index.values
    hh_ids = pd.DataFrame({'household_id': hh_ids})

    hh_ids_path = config.data_file_path('override_hh_ids.csv')
    hh_ids.to_csv(hh_ids_path, index=False, header=True)

    pipeline.close_pipeline()
    inject.clear_cache()
    close_handlers()

def get_trips_df(model_settings):
    """
    Default to pipeline trips table unless user provides a CSV
    """
    filename = model_settings.get('input_table', None)

    if not filename:
        logger.info("using 'trips' pipeline table for balancing step")
        trips_df = pipeline.get_table('trips')
        return trips_df.reset_index()

    logger.info('using %s for balancing step' % filename)
    fpath = config.data_file_path(filename, mandatory=True)

    return pd.read_csv(fpath, header=0, comment='#')

def read_zone_indexed_csv_file(file_name):

    logger.info('reading file \'%s\'' % file_name)

    fpath = config.data_file_path(file_name, mandatory=True)
    zone_df = pd.read_csv(fpath, header=0, comment='#')

    if ZONE_LABEL in zone_df.columns:
        zone_index = ZONE_LABEL  # str
    else:
        # use row numbers for zone ids. convert to 1-based zone ids simply by adding 1
        zone_index = zone_df.index + 1  # Series

    zone_df.set_index(zone_index, drop=True, inplace=True)
    zone_df.index.name = ZONE_LABEL

    return zone_df

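# Minimal usage sketch (assumes ZONE_LABEL names the zone id column, e.g. 'zone'):
# if the CSV lacks that column, rows get 1-based zone ids instead of 0-based row numbers.
# zone_df = read_zone_indexed_csv_file('zone_data.csv')  # hypothetical file name
# assert zone_df.index.name == ZONE_LABEL
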
def read_input_table(table_name):

    filename = setting('input_store', None)

    if not filename:
        logger.error("input store file name not specified in settings")
        raise RuntimeError("store file name not specified in settings")

    input_store_path = config.data_file_path(filename)

    if not os.path.exists(input_store_path):
        logger.error("store file not found: %s" % input_store_path)
        raise RuntimeError("store file not found: %s" % input_store_path)

    df = pd.read_hdf(input_store_path, table_name)

    return df

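# Minimal usage sketch, assuming the settings name an HDF5 input store
# (e.g. input_store: input_data.h5, a hypothetical file name):
# households_df = read_input_table('households')  # reads the 'households' table from the store
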
def load_skim_info(self):
    """
    Read skim info from omx files into SkimInfo, and store in self.skims_info dict keyed by skim_tag.

    ONE_ZONE and TWO_ZONE systems have only TAZ skims
    THREE_ZONE systems have both TAZ and TAP skims
    """
    assert self.skim_dict_factory is not None

    # load taz skim_info
    self.skims_info['taz'] = self.skim_dict_factory.load_skim_info('taz')

    if self.zone_system == THREE_ZONE:
        # load tap skim_info
        self.skims_info['tap'] = self.skim_dict_factory.load_skim_info('tap')

    if self.zone_system == THREE_ZONE:
        # load this here rather than in load_data as it is required during multiprocessing to size TVPBCache
        self.tap_df = pd.read_csv(config.data_file_path(self.setting('tap'), mandatory=True))
        self.tvpb = pathbuilder.TransitVirtualPathBuilder(self)  # dependent on self.tap_df

def test_mini_pipeline_run3():

    # test that hh_ids setting overrides household sampling

    setup_dirs()
    inject_settings(hh_ids='override_hh_ids.csv')

    households = inject.get_table('households').to_frame()

    override_hh_ids = pd.read_csv(config.data_file_path('override_hh_ids.csv'))

    print("\noverride_hh_ids\n%s" % override_hh_ids)
    print("\nhouseholds\n%s" % households.index)

    assert households.shape[0] == override_hh_ids.shape[0]
    assert households.index.isin(override_hh_ids.household_id).all()

    inject.clear_cache()
    close_handlers()

def _read_skims_from_omx(self, skim_info, skim_data):
    """
    read skims from omx file into skim_data
    """
    skim_tag = skim_info.skim_tag
    omx_keys = skim_info.omx_keys
    omx_manifest = skim_info.omx_manifest  # dict mapping { omx_key: skim_name }

    for omx_file_name in skim_info.omx_file_names:

        omx_file_path = config.data_file_path(omx_file_name)
        num_skims_loaded = 0

        logger.info(f"_read_skims_from_omx {omx_file_path}")

        # read skims into skim_data
        with omx.open_file(omx_file_path) as omx_file:
            for skim_key, omx_key in omx_keys.items():

                if omx_manifest[omx_key] == omx_file_name:

                    offset = skim_info.block_offsets[skim_key]
                    logger.debug(f"_read_skims_from_omx file {omx_file_name} omx_key {omx_key} "
                                 f"skim_key {skim_key} to offset {offset}")

                    if skim_dictionary.ROW_MAJOR_LAYOUT:
                        a = skim_data[offset, :, :]
                    else:
                        a = skim_data[:, :, offset]

                    # this will trigger omx readslice to read and copy data to skim_data's buffer
                    omx_data = omx_file[omx_key]
                    a[:] = omx_data[:]

                    num_skims_loaded += 1

        logger.info(f"_read_skims_from_omx loaded {num_skims_loaded} skims from {omx_file_name}")

def test_mini_pipeline_run3():

    # test that hh_ids setting overrides household sampling

    configs_dir = os.path.join(os.path.dirname(__file__), 'configs')
    setup_dirs(configs_dir)
    inject_settings(configs_dir, hh_ids='override_hh_ids.csv')

    households = inject.get_table('households').to_frame()

    override_hh_ids = pd.read_csv(config.data_file_path('override_hh_ids.csv'))

    print("\noverride_hh_ids\n", override_hh_ids)
    print("\nhouseholds\n", households.index)

    assert households.shape[0] == override_hh_ids.shape[0]
    assert households.index.isin(override_hh_ids.household_id).all()

    inject.clear_cache()
    close_handlers()

def read_network_file(settings):
    """
    Read network from saved HDF5 file
    """
    network_fname = settings['saved_network']
    if not network_fname:
        logger.error("Please specify 'saved_network' file in settings")
        return

    network_fpath = config.data_file_path(network_fname, mandatory=False) or \
        config.output_file_path(network_fname)

    if not os.path.exists(network_fpath):
        logger.error('No network file %s found' % network_fname)
        return

    logger.info('Reading network from %s' % network_fpath)
    network = pdna.Network.from_hdf5(network_fpath)

    return network

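# Minimal usage sketch: read_network_file() looks in data_dir first and falls back
# to the output dir, which is where build_network() saves 'pandana_network.h5'.
# settings = {'saved_network': 'pandana_network.h5'}  # hypothetical settings
# network = read_network_file(settings)  # returns None and logs an error if the file is missing
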
def read_from_table_info(table_info):
    """
    Read input text files and return cleaned up DataFrame.

    table_info is a dictionary that specifies the following input params.

    See input_table_list in settings.yaml in the example folder for a working example

    +--------------+----------------------------------------------------------+
    | key          | description                                              |
    +==============+==========================================================+
    | tablename    | name of pipeline table in which to store dataframe      |
    +--------------+----------------------------------------------------------+
    | filename     | name of csv file to read (in data_dir)                   |
    +--------------+----------------------------------------------------------+
    | column_map   | list of input columns to rename from_name: to_name       |
    +--------------+----------------------------------------------------------+
    | index_col    | name of column to set as dataframe index column          |
    +--------------+----------------------------------------------------------+
    | drop_columns | list of column names of columns to drop                  |
    +--------------+----------------------------------------------------------+
    | h5_tablename | name of target table in HDF5 file                        |
    +--------------+----------------------------------------------------------+
    """

    input_store = config.setting('input_store', None)
    create_input_store = config.setting('create_input_store', default=False)

    tablename = table_info.get('tablename')
    data_filename = table_info.get('filename', input_store)
    h5_tablename = table_info.get('h5_tablename') or tablename
    drop_columns = table_info.get('drop_columns', None)
    column_map = table_info.get('column_map', None)
    keep_columns = table_info.get('keep_columns', None)
    rename_columns = table_info.get('rename_columns', None)
    csv_dtypes = table_info.get('dtypes', {})

    # don't require a redundant index_col directive for canonical tables
    # but allow explicit disabling of assignment of index col for canonical tables, in which case, presumably,
    # the canonical index will be assigned in a subsequent initialization step (e.g. initialize_tours)
    canonical_index_col = canonical_table_index_name(tablename)

    # if there is an explicit index_col entry in table_info
    if 'index_col' in table_info:

        # honor explicit index_col unless it conflicts with canonical name
        index_col = table_info['index_col']

        if canonical_index_col:
            if index_col:
                # if there is a non-empty index_col directive, it should be for canonical_table_index_name
                assert index_col == canonical_index_col, \
                    f"{tablename} index_col {table_info.get('index_col')} should be {canonical_index_col}"
            else:
                logger.info(f"Not assigning canonical index_col {tablename}.{canonical_index_col} "
                            f"because settings file index_col directive is explicitly None.")

        # if there is an index_col directive for a canonical table, it should be for canonical_table_index_name

    else:
        # otherwise default is to use canonical index name for known tables, and no index for unknown tables
        index_col = canonical_index_col

    assert tablename is not None, 'no tablename provided'
    assert data_filename is not None, 'no input file provided'

    data_file_path = config.data_file_path(data_filename)

    df = _read_input_file(data_file_path, h5_tablename=h5_tablename, csv_dtypes=csv_dtypes)

    # logger.debug('raw %s table columns: %s' % (tablename, df.columns.values))
    logger.debug('raw %s table size: %s' % (tablename, util.df_size(df)))

    if create_input_store:
        h5_filepath = config.output_file_path('input_data.h5')
        logger.info('writing %s to %s' % (h5_tablename, h5_filepath))
        df.to_hdf(h5_filepath, key=h5_tablename, mode='a')

        csv_dir = config.output_file_path('input_data')
        if not os.path.exists(csv_dir):
            os.makedirs(csv_dir)  # make directory if needed
        df.to_csv(os.path.join(csv_dir, '%s.csv' % tablename), index=False)

    if drop_columns:
        logger.debug("dropping columns: %s" % drop_columns)
        df.drop(columns=drop_columns, inplace=True, errors='ignore')

    if column_map:
        warnings.warn("table_info option 'column_map' renamed 'rename_columns'. "
                      "Support for 'column_map' will be removed in future versions.",
                      FutureWarning)
        logger.debug("renaming columns: %s" % column_map)
        df.rename(columns=column_map, inplace=True)

    # rename columns first, so keep_columns can be a stable list of expected/required columns
    if rename_columns:
        logger.debug("renaming columns: %s" % rename_columns)
        df.rename(columns=rename_columns, inplace=True)

    # set index
    if index_col is not None:
        if index_col in df.columns:
            assert not df.duplicated(index_col).any()
            if canonical_index_col:
                # we expect canonical indexes to be integer-valued
                assert (df[index_col] == df[index_col].astype(int)).all(), \
                    f"Index col '{index_col}' has non-integer values"
                df[index_col] = df[index_col].astype(int)
            df.set_index(index_col, inplace=True)
        else:
            # FIXME not sure we want to do this. More likely they omitted index col than that they want to name it?
            # df.index.names = [index_col]
            logger.error(f"index_col '{index_col}' specified in configs but not in {tablename} table!")
            logger.error(f"{tablename} columns are: {list(df.columns)}")
            raise RuntimeError(f"index_col '{index_col}' not in {tablename} table!")

    if keep_columns:
        logger.debug("keeping columns: %s" % keep_columns)
        if not set(keep_columns).issubset(set(df.columns)):
            logger.error(f"Required columns missing from {tablename} table: "
                         f"{list(set(keep_columns).difference(set(df.columns)))}")
            logger.error(f"{tablename} table has columns: {list(df.columns)}")
            raise RuntimeError(f"Required columns missing from {tablename} table")

        df = df[keep_columns]

    if df.columns.duplicated().any():
        duplicate_column_names = df.columns[df.columns.duplicated(keep=False)].unique().to_list()
        assert not df.columns.duplicated().any(), \
            f"duplicate columns names in {tablename}: {duplicate_column_names}"

    logger.debug('%s table columns: %s' % (tablename, df.columns.values))
    logger.debug('%s table size: %s' % (tablename, util.df_size(df)))
    logger.debug('%s index name: %s' % (tablename, df.index.name))

    return df

def load_skim_info(self, skim_tag):
    """
    Read omx files for skim <skim_tag> (e.g. 'TAZ') and build skim_info dict

    Parameters
    ----------
    skim_tag: str
    """
    self.omx_file_names = self.network_los.omx_file_names(skim_tag)

    # ignore any 3D skims not in skim_time_periods
    # specifically, load all skims except those with key2 not in dim3_tags_to_load
    skim_time_periods = self.network_los.skim_time_periods
    dim3_tags_to_load = skim_time_periods and skim_time_periods['labels']

    self.omx_manifest = {}  # dict mapping { omx_key: skim_name }

    for omx_file_name in self.omx_file_names:

        omx_file_path = config.data_file_path(omx_file_name)

        # logger.debug(f"load_skim_info {skim_tag} reading {omx_file_path}")

        with omx.open_file(omx_file_path) as omx_file:

            # fixme call to omx_file.shape() failing in windows p3.5
            if self.omx_shape is None:
                self.omx_shape = tuple(int(i) for i in omx_file.shape())  # sometimes omx shape are floats!
            else:
                assert (self.omx_shape == tuple(int(i) for i in omx_file.shape()))

            for skim_name in omx_file.listMatrices():
                assert skim_name not in self.omx_manifest, \
                    f"duplicate skim '{skim_name}' found in {self.omx_manifest[skim_name]} and {omx_file}"
                self.omx_manifest[skim_name] = omx_file_name

            for m in omx_file.listMappings():
                if self.offset_map is None:
                    self.offset_map_name = m
                    self.offset_map = omx_file.mapentries(self.offset_map_name)
                    assert len(self.offset_map) == self.omx_shape[0]
                else:
                    # don't really expect more than one, but ok if they are all the same
                    if not (self.offset_map == omx_file.mapentries(m)):
                        raise RuntimeError(f"Multiple mappings in omx file: {self.offset_map_name} != {m}")

    # - omx_keys dict maps skim key to omx_key
    # DISTWALK: DISTWALK
    # ('DRV_COM_WLK_BOARDS', 'AM'): DRV_COM_WLK_BOARDS__AM, ...
    self.omx_keys = dict()
    for skim_name in self.omx_manifest.keys():
        key1, sep, key2 = skim_name.partition('__')

        # - ignore composite tags not in dim3_tags_to_load
        if dim3_tags_to_load and sep and key2 not in dim3_tags_to_load:
            continue

        skim_key = (key1, key2) if sep else key1
        self.omx_keys[skim_key] = skim_name

    self.num_skims = len(self.omx_keys)

    # - key1_subkeys dict maps key1 to dict of subkeys with that key1
    # DIST: {'DIST': 0}
    # DRV_COM_WLK_BOARDS: {'MD': 1, 'AM': 0, 'PM': 2}, ...
    key1_subkeys = dict()
    for skim_key, omx_key in self.omx_keys.items():
        if isinstance(skim_key, tuple):
            key1, key2 = skim_key
        else:
            key1 = key2 = skim_key
        key2_dict = key1_subkeys.setdefault(key1, {})
        key2_dict[key2] = len(key2_dict)

    key1_block_offsets = dict()
    offset = 0
    for key1, v in key1_subkeys.items():
        num_subkeys = len(v)
        key1_block_offsets[key1] = offset
        offset += num_subkeys

    # - block_offsets dict maps skim_key to offset of omx matrix
    # DIST: 0,
    # ('DRV_COM_WLK_BOARDS', 'AM'): 3,
    # ('DRV_COM_WLK_BOARDS', 'MD'): 4, ...
    self.block_offsets = dict()
    for skim_key in self.omx_keys:
        if isinstance(skim_key, tuple):
            key1, key2 = skim_key
        else:
            key1 = key2 = skim_key
        key1_offset = key1_block_offsets[key1]
        key2_relative_offset = key1_subkeys.get(key1).get(key2)
        self.block_offsets[skim_key] = key1_offset + key2_relative_offset

    if skim_dictionary.ROW_MAJOR_LAYOUT:
        self.skim_data_shape = (self.num_skims, self.omx_shape[0], self.omx_shape[1])
    else:
        self.skim_data_shape = self.omx_shape + (self.num_skims, )

    # list of base keys (key1 values)
    self.base_keys = tuple(k for k in key1_block_offsets.keys())

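# Illustrative only: how omx matrix names map to skim keys and block offsets in
# load_skim_info() above. A plain 2D skim keeps its name; a 3D skim encodes its
# time period after '__' (matrix names reuse examples from the comments above):
#
#   'DISTWALK'                  -> skim_key 'DISTWALK'
#   'DRV_COM_WLK_BOARDS__AM'    -> skim_key ('DRV_COM_WLK_BOARDS', 'AM')
#   'DRV_COM_WLK_BOARDS__MD'    -> skim_key ('DRV_COM_WLK_BOARDS', 'MD')
#
# All subkeys sharing the same key1 occupy consecutive block offsets, e.g.
#   {'DISTWALK': 0, ('DRV_COM_WLK_BOARDS', 'AM'): 1, ('DRV_COM_WLK_BOARDS', 'MD'): 2}
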
def read_from_table_info(table_info):
    """
    Read input text files and return cleaned up DataFrame.

    table_info is a dictionary that specifies the following input params.

    See input_table_list in settings.yaml in the example folder for a working example

    +--------------+----------------------------------------------------------+
    | key          | description                                              |
    +==============+==========================================================+
    | tablename    | name of pipeline table in which to store dataframe      |
    +--------------+----------------------------------------------------------+
    | filename     | name of csv file to read (in data_dir)                   |
    +--------------+----------------------------------------------------------+
    | column_map   | list of input columns to rename from_name: to_name       |
    +--------------+----------------------------------------------------------+
    | index_col    | name of column to set as dataframe index column          |
    +--------------+----------------------------------------------------------+
    | drop_columns | list of column names of columns to drop                  |
    +--------------+----------------------------------------------------------+
    | h5_tablename | name of target table in HDF5 file                        |
    +--------------+----------------------------------------------------------+
    """

    input_store = config.setting('input_store', None)
    create_input_store = config.setting('create_input_store', default=False)

    tablename = table_info.get('tablename')
    data_filename = table_info.get('filename', input_store)
    h5_tablename = table_info.get('h5_tablename') or tablename
    drop_columns = table_info.get('drop_columns', None)
    column_map = table_info.get('column_map', None)
    keep_columns = table_info.get('keep_columns', None)
    rename_columns = table_info.get('rename_columns', None)
    index_col = table_info.get('index_col', None)

    assert tablename is not None, 'no tablename provided'
    assert data_filename is not None, 'no input file provided'

    data_file_path = config.data_file_path(data_filename)

    df = _read_input_file(data_file_path, h5_tablename=h5_tablename)

    logger.debug('raw %s table columns: %s' % (tablename, df.columns.values))
    logger.debug('raw %s table size: %s' % (tablename, util.df_size(df)))

    if create_input_store:
        h5_filepath = config.output_file_path('input_data.h5')
        logger.info('writing %s to %s' % (h5_tablename, h5_filepath))
        df.to_hdf(h5_filepath, key=h5_tablename, mode='a')

        csv_dir = config.output_file_path('input_data')
        if not os.path.exists(csv_dir):
            os.makedirs(csv_dir)  # make directory if needed
        df.to_csv(os.path.join(csv_dir, '%s.csv' % tablename), index=False)

    if drop_columns:
        logger.debug("dropping columns: %s" % drop_columns)
        df.drop(columns=drop_columns, inplace=True, errors='ignore')

    if column_map:
        warnings.warn("table_info option 'column_map' renamed 'rename_columns'. "
                      "Support for 'column_map' will be removed in future versions.",
                      FutureWarning)
        logger.debug("renaming columns: %s" % column_map)
        df.rename(columns=column_map, inplace=True)

    # rename columns first, so keep_columns can be a stable list of expected/required columns
    if rename_columns:
        logger.info("renaming columns: %s" % rename_columns)
        df.rename(columns=rename_columns, inplace=True)

    # set index
    if index_col is not None:
        if index_col in df.columns:
            assert not df.duplicated(index_col).any()
            df.set_index(index_col, inplace=True)
        else:
            df.index.names = [index_col]

    if keep_columns:
        logger.info("keeping columns: %s" % keep_columns)
        df = df[keep_columns]

    logger.debug('%s table columns: %s' % (tablename, df.columns.values))
    logger.debug('%s table size: %s' % (tablename, util.df_size(df)))
    logger.info('%s index name: %s' % (tablename, df.index.name))

    return df

def load_data(self):
    """
    Load tables and skims from files specified in network_los settings
    """

    # load maz tables
    if self.zone_system in [TWO_ZONE, THREE_ZONE]:

        # maz
        file_name = self.setting('maz')
        self.maz_taz_df = pd.read_csv(config.data_file_path(file_name, mandatory=True))
        self.maz_taz_df = self.maz_taz_df[['MAZ', 'TAZ']].sort_values(by='MAZ')  # only fields we need

        self.maz_ceiling = self.maz_taz_df.MAZ.max() + 1

        # maz_to_maz_df
        maz_to_maz_tables = self.setting('maz_to_maz.tables')
        maz_to_maz_tables = [maz_to_maz_tables] if isinstance(maz_to_maz_tables, str) else maz_to_maz_tables
        for file_name in maz_to_maz_tables:

            df = pd.read_csv(config.data_file_path(file_name, mandatory=True))

            df['i'] = df.OMAZ * self.maz_ceiling + df.DMAZ
            df.set_index('i', drop=True, inplace=True, verify_integrity=True)
            logger.debug(f"loading maz_to_maz table {file_name} with {len(df)} rows")

            # FIXME - don't really need these columns, but if we do want them,
            # we would need to merge them in since files may have different numbers of rows
            df.drop(columns=['OMAZ', 'DMAZ'], inplace=True)

            # besides, we only want data columns so we can coerce to same type as skims
            df = df.astype(np.dtype(self.skim_dtype_name))

            if self.maz_to_maz_df is None:
                self.maz_to_maz_df = df
            else:
                self.maz_to_maz_df = pd.concat([self.maz_to_maz_df, df], axis=1)

    # load tap tables
    if self.zone_system == THREE_ZONE:

        # tap_df should already have been loaded by load_skim_info because,
        # during multiprocessing, it is required by TapTapUidCalculator to size TVPBCache
        # self.tap_df = pd.read_csv(config.data_file_path(self.setting('tap'), mandatory=True))
        assert self.tap_df is not None

        # maz_to_tap_dfs - different sized sparse arrays with different columns, so we keep them separate
        for mode, maz_to_tap_settings in self.setting('maz_to_tap').items():

            assert 'table' in maz_to_tap_settings, \
                f"Expected setting maz_to_tap.{mode}.table not found in {LOS_SETTINGS_FILE_NAME}"

            file_name = maz_to_tap_settings['table']
            df = pd.read_csv(config.data_file_path(file_name, mandatory=True))

            # trim tap set
            # if provided, use tap_line_distance_col together with tap_lines table to trim the near tap set
            # to only include the nearest tap to origin when more than one tap serves the same line
            distance_col = maz_to_tap_settings.get('tap_line_distance_col')
            if distance_col:

                if self.tap_lines_df is None:
                    # load tap_lines on demand (required if they specify tap_line_distance_col)
                    tap_lines_file_name = self.setting('tap_lines')
                    self.tap_lines_df = pd.read_csv(config.data_file_path(tap_lines_file_name, mandatory=True))

                    # csv file has one row per TAP with space-delimited list of lines served by that TAP
                    #   TAP   LINES
                    #   6020  GG_024b_SB GG_068_RT GG_228_WB GG_023X_RT
                    # stack to create dataframe with one column 'line' indexed by TAP with one row per line served
                    #   TAP   line
                    #   6020  GG_024b_SB
                    #   6020  GG_068_RT
                    #   6020  GG_228_WB
                    self.tap_lines_df = \
                        self.tap_lines_df.set_index('TAP').LINES.str.split(expand=True)\
                        .stack().droplevel(1).to_frame('line')

                old_len = len(df)

                # NOTE - merge will remove unused taps (not appearing in tap_lines)
                df = pd.merge(df, self.tap_lines_df, left_on='TAP', right_index=True)

                # find nearest TAP to MAZ that serves line
                df = df.sort_values(by=distance_col).drop_duplicates(subset=['MAZ', 'line'])

                # we don't need to remember which lines are served by which TAPs
                df = df.drop(columns='line').drop_duplicates(subset=['MAZ', 'TAP']).sort_values(['MAZ', 'TAP'])

                logger.debug(f"trimmed maz_to_tap table {file_name} from {old_len} to {len(df)} rows")
                logger.debug(f"maz_to_tap table {file_name} max {distance_col} {df[distance_col].max()}")

                max_dist = maz_to_tap_settings.get('max_dist', None)
                if max_dist:
                    old_len = len(df)
                    df = df[df[distance_col] <= max_dist]
                    logger.debug(f"trimmed maz_to_tap table {file_name} from {old_len} to {len(df)} rows "
                                 f"based on max_dist {max_dist}")

                if TRACE_TRIMMED_MAZ_TO_TAP_TABLES:
                    tracing.write_csv(df, file_name=f"trimmed_{maz_to_tap_settings['table']}", transpose=False)

            df.set_index(['MAZ', 'TAP'], drop=True, inplace=True, verify_integrity=True)
            logger.debug(f"loaded maz_to_tap table {file_name} with {len(df)} rows")

            assert mode not in self.maz_to_tap_dfs
            self.maz_to_tap_dfs[mode] = df

    mem.trace_memory_info('#MEM network_los.load_data before create_skim_dicts')

    # create taz skim dict
    assert 'taz' not in self.skim_dicts
    self.skim_dicts['taz'] = self.create_skim_dict('taz')
    # make sure skim has all tap_ids
    # FIXME - weird that there is no list of tazs?

    # create MazSkimDict facade
    if self.zone_system in [TWO_ZONE, THREE_ZONE]:
        # create MazSkimDict facade skim_dict
        # (must have already loaded dependencies: taz skim_dict, maz_to_maz_df, and maz_taz_df)
        assert 'maz' not in self.skim_dicts
        self.skim_dicts['maz'] = self.create_skim_dict('maz')
        # make sure skim has all maz_ids
        assert set(self.maz_taz_df['MAZ'].values).issubset(set(self.skim_dicts['maz'].zone_ids))

    # create tap skim dict
    if self.zone_system == THREE_ZONE:
        assert 'tap' not in self.skim_dicts
        self.skim_dicts['tap'] = self.create_skim_dict('tap')
        # make sure skim has all tap_ids
        assert set(self.tap_df['TAP'].values).issubset(set(self.skim_dicts['tap'].zone_ids))

    mem.trace_memory_info("network_los.load_data after create_skim_dicts")

def read_from_table_info(table_info):
    """
    Read input text files and return cleaned up DataFrame.

    table_info is a dictionary that specifies the following input params.

    See input_table_list in settings.yaml in the example folder for a working example

    +--------------+----------------------------------------------------------+
    | key          | description                                              |
    +==============+==========================================================+
    | tablename    | name of pipeline table in which to store dataframe      |
    +--------------+----------------------------------------------------------+
    | filename     | name of csv file to read (in data_dir)                   |
    +--------------+----------------------------------------------------------+
    | column_map   | list of input columns to rename from_name: to_name       |
    +--------------+----------------------------------------------------------+
    | index_col    | name of column to set as dataframe index column          |
    +--------------+----------------------------------------------------------+
    | drop_columns | list of column names of columns to drop                  |
    +--------------+----------------------------------------------------------+
    | h5_tablename | name of target table in HDF5 file                        |
    +--------------+----------------------------------------------------------+
    """

    input_store = config.setting('input_store', None)
    create_input_store = config.setting('create_input_store', default=False)

    tablename = table_info.get('tablename')
    data_filename = table_info.get('filename', input_store)
    h5_tablename = table_info.get('h5_tablename') or tablename
    drop_columns = table_info.get('drop_columns', None)
    column_map = table_info.get('column_map', None)
    index_col = table_info.get('index_col', None)

    assert tablename is not None, 'no tablename provided'
    assert data_filename is not None, 'no input file provided'

    data_file_path = config.data_file_path(data_filename)

    df = _read_input_file(data_file_path, h5_tablename=h5_tablename)

    logger.info('%s table columns: %s' % (tablename, df.columns.values))
    logger.info('%s table size: %s' % (tablename, util.df_size(df)))

    if create_input_store:
        h5_filepath = config.output_file_path('input_data.h5')
        logger.info('writing %s to %s' % (h5_tablename, h5_filepath))
        df.to_hdf(h5_filepath, key=h5_tablename, mode='a')

    if drop_columns:
        for c in drop_columns:
            logger.info("dropping column '%s'" % c)
            del df[c]

    if column_map:
        df.rename(columns=column_map, inplace=True)

    # set index
    if index_col is not None:
        if index_col in df.columns:
            assert not df.duplicated(index_col).any()
            df.set_index(index_col, inplace=True)
        else:
            df.index.names = [index_col]

    logger.info('%s index name: %s' % (tablename, df.index.name))

    return df