def _destination_sample(
        primary_purpose, trips, alternatives,
        model_settings, size_term_matrix, skims,
        alt_dest_col_name, estimator,
        chunk_size, chunk_tag, trace_label):
    """

    Note: trips with no viable destination receive no sample rows
    (because we call interaction_sample with allow_zero_probs=True)
    All other trips will have one or more rows with pick_count summing to sample_size

    returns
        choices: pandas.DataFrame

                   alt_dest      prob  pick_count
        trip_id
        102829169      2898  0.002333           1
        102829169      2901  0.004976           1
        102829169      3193  0.002628           1
    """

    spec = simulate.spec_for_segment(model_settings, spec_id='DESTINATION_SAMPLE_SPEC',
                                     segment_name=primary_purpose, estimator=estimator)

    sample_size = model_settings['SAMPLE_SIZE']
    if config.setting('disable_destination_sampling', False) or \
            (estimator and estimator.want_unsampled_alternatives):
        # FIXME interaction_sample will return unsampled complete alternatives
        # with probs and pick_count
        logger.info("Estimation mode for %s using unsampled alternatives "
                    "short_circuit_choices" % trace_label)
        sample_size = 0

    locals_dict = config.get_model_constants(model_settings).copy()

    # size_terms of destination zones are purpose-specific, and trips have various purposes
    # so the relevant size_term for each interaction_sample row
    # cannot be determined until after choosers are joined with alternatives
    # (unless we iterate over trip.purpose - which we could, though we are already
    # iterating over trip_num)
    # so, instead, expressions determine row-specific size_term by a call to:
    # size_terms.get(df.alt_dest, df.purpose)
    locals_dict.update({'size_terms': size_term_matrix})
    locals_dict.update(skims)

    log_alt_losers = config.setting('log_alt_losers', False)

    choices = interaction_sample(
        choosers=trips,
        alternatives=alternatives,
        sample_size=sample_size,
        alt_col_name=alt_dest_col_name,
        log_alt_losers=log_alt_losers,
        allow_zero_probs=True,
        spec=spec,
        skims=skims,
        locals_d=locals_dict,
        chunk_size=chunk_size,
        chunk_tag=chunk_tag,
        trace_label=trace_label)

    return choices
def add_geography_columns(incidence_table, households_df, crosswalk_df):
    """
    Add seed and meta geography columns to incidence_table

    Parameters
    ----------
    incidence_table
    households_df
    crosswalk_df

    Returns
    -------
    incidence_table : pandas.DataFrame
        incidence_table with seed and meta geography columns added
    """

    geographies = setting('geographies')
    meta_geography = geographies[0]
    seed_geography = setting('seed_geography')

    # add seed_geography col to incidence table
    incidence_table[seed_geography] = households_df[seed_geography]

    # add meta column to incidence table
    seed_to_meta = \
        crosswalk_df[[seed_geography, meta_geography]] \
        .groupby(seed_geography, as_index=True).min()[meta_geography]
    incidence_table[meta_geography] = incidence_table[seed_geography].map(seed_to_meta)

    return incidence_table
def _location_sample(
        segment_name, choosers, alternatives,
        skims, estimator, model_settings,
        alt_dest_col_name, chunk_size, chunk_tag, trace_label):
    """
    select a sample of alternative locations.

    Logsum calculations are expensive, so we build a table of persons * all zones
    and then select a sample subset of potential locations

    The sample subset is generated by making multiple choices (<sample_size> number of choices)
    which results in a sample containing up to <sample_size> choices for each chooser
    (e.g. person) and a pick_count indicating how many times that choice was selected
    for that chooser.

    person_id,  dest_zone_id, rand,            pick_count
    23750,      14,           0.565502716034,  4
    23750,      16,           0.711135838871,  6
    ...
    23751,      12,           0.408038878552,  1
    23751,      14,           0.972732479292,  2
    """
    assert not choosers.empty

    logger.info("Running %s with %d persons" % (trace_label, len(choosers.index)))

    sample_size = model_settings["SAMPLE_SIZE"]
    if config.setting('disable_destination_sampling', False) or \
            (estimator and estimator.want_unsampled_alternatives):
        # FIXME interaction_sample will return unsampled complete alternatives
        # with probs and pick_count
        logger.info("Estimation mode for %s using unsampled alternatives "
                    "short_circuit_choices" % trace_label)
        sample_size = 0

    locals_d = {
        'skims': skims,
        'segment_size': segment_name
    }
    constants = config.get_model_constants(model_settings)
    locals_d.update(constants)

    spec = simulate.spec_for_segment(model_settings, spec_id='SAMPLE_SPEC',
                                     segment_name=segment_name, estimator=estimator)

    # here since presumably we want this when called for either sample or presample
    log_alt_losers = config.setting('log_alt_losers', False)

    choices = interaction_sample(
        choosers,
        alternatives,
        spec=spec,
        sample_size=sample_size,
        alt_col_name=alt_dest_col_name,
        log_alt_losers=log_alt_losers,
        skims=skims,
        locals_d=locals_d,
        chunk_size=chunk_size,
        chunk_tag=chunk_tag,
        trace_label=trace_label)

    return choices
def load_settings(self):
    """
    Read setting file and initialize object variables
    (see class docstring for list of object variables)
    """

    try:
        self.los_settings = config.read_settings_file(self.los_settings_file_name, mandatory=True)
    except config.SettingsFileNotFound as e:

        print(f"los_settings_file_name {self.los_settings_file_name} not found - trying global settings")
        print(f"skims_file: {config.setting('skims_file')}")
        print(f"skim_time_periods: {config.setting('skim_time_periods')}")
        print(f"source_file_paths: {config.setting('source_file_paths')}")
        print(f"inject.get_injectable('configs_dir') {inject.get_injectable('configs_dir')}")

        # look for legacy 'skims_file' setting in global settings file
        if config.setting('skims_file'):

            warnings.warn("Support for 'skims_file' setting in global settings file will be removed. "
                          "Use 'taz_skims' in network_los.yaml config file instead.", FutureWarning)

            # in which case, we also expect to find skim_time_periods in settings file
            skim_time_periods = config.setting('skim_time_periods')
            assert skim_time_periods is not None, "'skim_time_periods' setting not found."
            warnings.warn("Support for 'skim_time_periods' setting in global settings file will be removed. "
                          "Put 'skim_time_periods' in network_los.yaml config file instead.", FutureWarning)

            self.los_settings = {
                'taz_skims': config.setting('skims_file'),
                'zone_system': ONE_ZONE,
                'skim_time_periods': skim_time_periods
            }
        else:
            raise e

    # validate skim_time_periods
    self.skim_time_periods = self.setting('skim_time_periods')
    if 'hours' in self.skim_time_periods:
        self.skim_time_periods['periods'] = self.skim_time_periods.pop('hours')
        warnings.warn('support for `skim_time_periods` key `hours` will be removed in '
                      'future versions. Use `periods` instead.', FutureWarning)
    assert 'periods' in self.skim_time_periods, "'periods' key not found in network_los.skim_time_periods"
    assert 'labels' in self.skim_time_periods, "'labels' key not found in network_los.skim_time_periods"

    self.zone_system = self.setting('zone_system')
    assert self.zone_system in [ONE_ZONE, TWO_ZONE, THREE_ZONE], \
        f"Network_LOS: unrecognized zone_system: {self.zone_system}"

    if self.zone_system in [TWO_ZONE, THREE_ZONE]:
        # maz_to_maz_settings
        self.max_blend_distance = self.setting('maz_to_maz.max_blend_distance', default={})
        if isinstance(self.max_blend_distance, int):
            self.max_blend_distance = {'DEFAULT': self.max_blend_distance}
        self.blend_distance_skim_name = self.setting('maz_to_maz.blend_distance_skim_name', default=None)

    # validate skim_time_periods
    self.skim_time_periods = self.setting('skim_time_periods')
    assert {'periods', 'labels'}.issubset(set(self.skim_time_periods.keys()))
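# Illustrative sketch (not from the source): the minimal shape of the network_los settings
# that load_settings() above validates. The file name and period breakpoints are made-up
# examples; the keys ('taz_skims', 'zone_system', 'skim_time_periods' with 'periods' and
# 'labels') are the ones the method checks for.
example_los_settings = {
    'zone_system': 1,                       # assumed ONE_ZONE; TWO/THREE_ZONE enable maz_to_maz settings
    'taz_skims': 'skims.omx',               # replaces the legacy global 'skims_file' setting
    'skim_time_periods': {
        # 'hours' is the deprecated key name; 'periods' is the expected key going forward
        'periods': [0, 6, 11, 16, 20, 24],  # hypothetical breakpoints defining time-of-day bins
        'labels': ['EA', 'AM', 'MD', 'PM', 'EV'],
    },
}

# load_settings() asserts that both 'periods' and 'labels' are present:
assert {'periods', 'labels'}.issubset(example_los_settings['skim_time_periods'].keys())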
def add_geography_columns(incidence_table, households_df, crosswalk_df):
    """
    Add seed and meta geography columns to incidence_table

    Parameters
    ----------
    incidence_table
    households_df
    crosswalk_df

    Returns
    -------
    incidence_table : pandas.DataFrame
        incidence_table with seed and meta geography columns added
    """

    geographies = setting('geographies')
    meta_geography = geographies[0]
    seed_geography = setting('seed_geography')

    # add seed_geography col to incidence table
    incidence_table[seed_geography] = households_df[seed_geography]

    # add meta column to incidence table (unless it's already there)
    if seed_geography != meta_geography:
        tmp = crosswalk_df[list({seed_geography, meta_geography})]
        seed_to_meta = tmp.groupby(seed_geography, as_index=True).min()[meta_geography]
        incidence_table[meta_geography] = incidence_table[seed_geography].map(seed_to_meta)

    return incidence_table
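# Illustrative sketch (not from the source): how the crosswalk-to-meta mapping above behaves.
# Geography names and ids are made-up examples (a PUMA-like seed zone nested in a REGION-like
# meta zone); the groupby().min() collapses the crosswalk to one meta zone per seed zone.
import pandas as pd

crosswalk = pd.DataFrame({'PUMA': [100, 100, 200], 'REGION': [1, 1, 2]})
seed_to_meta = crosswalk[['PUMA', 'REGION']].groupby('PUMA', as_index=True).min()['REGION']
# seed_to_meta maps each seed zone to its (single) meta zone: {100: 1, 200: 2}

incidence = pd.DataFrame({'PUMA': [100, 200, 100]})
incidence['REGION'] = incidence['PUMA'].map(seed_to_meta)   # -> [1, 2, 1]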
def run(args):
    """
    Run bca4abm. Specify a project folder using the '--working_dir' option,
    or point to the config, data, and output folders directly with
    '--config', '--data', and '--output'.
    """

    if args.working_dir and os.path.exists(args.working_dir):
        os.chdir(args.working_dir)

    if args.config:
        inject.add_injectable('configs_dir', args.config)

    if args.data:
        inject.add_injectable('data_dir', args.data)

    if args.output:
        inject.add_injectable('output_dir', args.output)

    for injectable in ['configs_dir', 'data_dir', 'output_dir']:
        try:
            dir_path = inject.get_injectable(injectable)
        except RuntimeError:
            sys.exit('Error: please specify either a --working_dir '
                     "containing 'configs', 'data', and 'output' folders "
                     'or all three of --config, --data, and --output')
        if not os.path.exists(dir_path):
            sys.exit("Could not find %s '%s'" % (injectable, os.path.abspath(dir_path)))

    if args.pipeline:
        inject.add_injectable('pipeline_file_name', args.pipeline)

    if args.resume:
        override_setting('resume_after', args.resume)

    tracing.config_logger()
    tracing.delete_csv_files()  # only modifies output_dir
    warnings.simplefilter('always')
    logging.captureWarnings(capture=True)

    t0 = tracing.print_elapsed_time()

    # If you provide a resume_after argument to pipeline.run
    # the pipeline manager will attempt to load checkpointed tables from the checkpoint store
    # and resume pipeline processing on the next submodel step after the specified checkpoint
    resume_after = setting('resume_after', None)

    if resume_after:
        print('resume_after: %s' % resume_after)

    pipeline.run(models=setting('models'), resume_after=resume_after)

    # tables will no longer be available after pipeline is closed
    pipeline.close_pipeline()

    t0 = tracing.print_elapsed_time('all models', t0)
def meta_summary(incidence_df, control_spec, top_geography, top_id, sub_geographies, hh_id_col):

    if setting('NO_INTEGERIZATION_EVER', False):
        seed_weight_cols = ['preliminary_balanced_weight', 'balanced_weight']
        sub_weight_cols = ['balanced_weight']
    else:
        seed_weight_cols = ['preliminary_balanced_weight', 'balanced_weight', 'integer_weight']
        sub_weight_cols = ['balanced_weight', 'integer_weight']

    incidence_df = incidence_df[incidence_df[top_geography] == top_id]

    control_cols = control_spec.target.values

    controls_df = get_control_table(top_geography)

    # controls for this geography as series
    controls = controls_df[control_cols].loc[top_id]

    incidence = incidence_df[control_cols]

    summary = pd.DataFrame(index=control_cols)
    summary.index.name = 'control_name'

    summary['control_value'] = controls

    seed_geography = setting('seed_geography')
    seed_weights_df = get_weight_table(seed_geography)

    for c in seed_weight_cols:
        if c in seed_weights_df:
            summary_col_name = '%s_%s' % (top_geography, c)
            summary[summary_col_name] = \
                incidence.multiply(seed_weights_df[c], axis="index").sum(axis=0)

    for g in sub_geographies:

        sub_weights = get_weight_table(g)

        if sub_weights is None:
            continue

        sub_weights = sub_weights[sub_weights[top_geography] == top_id]

        sub_weights = sub_weights[[hh_id_col] + sub_weight_cols].groupby(hh_id_col).sum()

        for c in sub_weight_cols:
            summary['%s_%s' % (g, c)] = \
                incidence.multiply(sub_weights[c], axis="index").sum(axis=0)

    return summary
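# Illustrative sketch (not from the source): the weighted-sum pattern used in meta_summary above.
# An incidence table (rows = households, columns = controls) multiplied row-wise by household
# weights and summed over rows gives modeled control totals. Ids, controls, and weights below
# are made-up examples.
import pandas as pd

incidence = pd.DataFrame({'num_hh': [1, 1, 1], 'hh_size_1': [1, 0, 0]},
                         index=[101, 102, 103])            # hypothetical household ids
weights = pd.Series([10.0, 5.0, 2.5], index=[101, 102, 103])

modeled_totals = incidence.multiply(weights, axis="index").sum(axis=0)
# num_hh -> 17.5, hh_size_1 -> 10.0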
def load_tables(table_list_name, data_dir=None):

    table_list = setting(table_list_name)
    if table_list is None:
        raise RuntimeError("I expected to find table list '%s' with table_info in settings."
                           % table_list_name)

    if data_dir is None:
        data_dir = setting('data_dir', inject.get_injectable('data_dir'))

    for table_name, table_info in table_list.items():

        df = read_table(table_name, table_info, data_dir)

        inject.add_table(table_name, df)
def run(args):
    """
    Run the models. Specify a project folder using the '--working_dir' option,
    or point to the config, data, and output folders directly with
    '--config', '--data', and '--output'. Both '--config' and '--data' can be
    specified multiple times. Directories listed first take precedence.
    """

    from activitysim import abm  # register injectables

    tracing.config_logger(basic=True)
    handle_standard_args(args)  # possibly update injectables

    tracing.config_logger(basic=False)  # update using possibly new logging configs
    config.filter_warnings()
    logging.captureWarnings(capture=True)

    log_settings()

    t0 = tracing.print_elapsed_time()

    # If you provide a resume_after argument to pipeline.run
    # the pipeline manager will attempt to load checkpointed tables from the checkpoint store
    # and resume pipeline processing on the next submodel step after the specified checkpoint
    resume_after = config.setting('resume_after', None)

    # cleanup if not resuming
    if not resume_after:
        cleanup_output_files()
    elif config.setting('cleanup_trace_files_on_resume', False):
        tracing.delete_trace_files()

    if config.setting('multiprocess', False):
        logger.info('run multiprocess simulation')

        from activitysim.core import mp_tasks
        run_list = mp_tasks.get_run_list()
        injectables = {k: inject.get_injectable(k) for k in INJECTABLES}
        mp_tasks.run_multiprocess(run_list, injectables)
    else:
        logger.info('run single process simulation')

        pipeline.run(models=config.setting('models'), resume_after=resume_after)
        pipeline.close_pipeline()
        chunk.log_write_hwm()

    tracing.print_elapsed_time('all models', t0)
def preload_injectables():
    """
    preload bulky injectables up front - stuff that isn't inserted into the pipeline
    """

    logger.info("preload_injectables")

    inject.add_step('track_skim_usage', track_skim_usage)
    inject.add_step('write_data_dictionary', write_data_dictionary)
    inject.add_step('write_tables', write_tables)

    table_list = config.setting('input_table_list')

    # default ActivitySim table names and indices
    if table_list is None:
        logger.warning(
            "No 'input_table_list' found in settings. This will be a "
            "required setting in upcoming versions of ActivitySim.")

        new_settings = inject.get_injectable('settings')
        new_settings['input_table_list'] = DEFAULT_TABLE_LIST
        inject.add_injectable('settings', new_settings)

    # FIXME undocumented feature
    if config.setting('write_raw_tables'):

        # write raw input tables as csv (before annotation)
        csv_dir = config.output_file_path('raw_tables')
        if not os.path.exists(csv_dir):
            os.makedirs(csv_dir)  # make directory if needed

        table_names = [t['tablename'] for t in table_list]
        for t in table_names:
            df = inject.get_table(t).to_frame()
            if t == 'households':
                df.drop(columns='chunk_id', inplace=True)
            df.to_csv(os.path.join(csv_dir, '%s.csv' % t), index=True)

    t0 = tracing.print_elapsed_time()

    # FIXME - still want to do this?
    # if inject.get_injectable('skim_dict', None) is not None:
    #     t0 = tracing.print_elapsed_time("preload skim_dict", t0, debug=True)
    #
    # if inject.get_injectable('skim_stack', None) is not None:
    #     t0 = tracing.print_elapsed_time("preload skim_stack", t0, debug=True)

    return True
def preload_injectables():
    """
    preload bulky injectables up front - stuff that isn't inserted into the pipeline
    """

    logger.info("preload_injectables")

    inject.add_step('track_skim_usage', track_skim_usage)
    inject.add_step('write_data_dictionary', write_data_dictionary)
    inject.add_step('write_tables', write_tables)

    table_list = config.setting('input_table_list')

    # default ActivitySim table names and indices
    if table_list is None:
        logger.warning(
            "No 'input_table_list' found in settings. This will be a "
            "required setting in upcoming versions of ActivitySim.")

        new_settings = inject.get_injectable('settings')
        new_settings['input_table_list'] = DEFAULT_TABLE_LIST
        inject.add_injectable('settings', new_settings)

    t0 = tracing.print_elapsed_time()

    # FIXME - still want to do this?
    # if inject.get_injectable('skim_dict', None) is not None:
    #     t0 = tracing.print_elapsed_time("preload skim_dict", t0, debug=True)
    #
    # if inject.get_injectable('skim_stack', None) is not None:
    #     t0 = tracing.print_elapsed_time("preload skim_stack", t0, debug=True)

    return True
def read_input_table(tablename):
    """Reads input table name and returns cleaned DataFrame.

    Uses settings found in input_table_list in settings.yaml

    Parameters
    ----------
    tablename : string

    Returns
    -------
    pandas DataFrame
    """
    table_list = config.setting('input_table_list')
    assert table_list is not None, 'no input_table_list found in settings'

    table_info = None
    for info in table_list:
        if info['tablename'] == tablename:
            table_info = info

    assert table_info is not None, \
        'could not find info for tablename %s in settings.yaml' % tablename

    return read_from_table_info(table_info)
def read_input_table(tablename, required=True):
    """Reads input table name and returns cleaned DataFrame.

    Uses settings found in input_table_list in global settings file

    Parameters
    ----------
    tablename : string
    required : bool
        whether to raise if the table is not found in input_table_list

    Returns
    -------
    pandas DataFrame, or None if the table is optional and not configured
    """
    table_list = config.setting('input_table_list')
    assert table_list is not None, 'no input_table_list found in settings'

    table_info = None
    for info in table_list:
        if info['tablename'] == tablename:
            table_info = info

    if table_info is not None:
        df = read_from_table_info(table_info)
    else:
        if required:
            raise RuntimeError(f"could not find info for tablename {tablename} in settings file")
        df = None

    return df
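# Illustrative usage sketch (not from the source): reading a required table and an optional
# one with the function above. Whether a given table name is configured depends entirely on
# the model's input_table_list; 'households' and 'vehicles' here are example names only.
households = read_input_table('households')                # raises if not in input_table_list
vehicles = read_input_table('vehicles', required=False)    # returns None if not configured
if vehicles is None:
    # fall back to whatever behavior the model defines for a missing optional table
    pass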
def load_shared_data(self, shared_data_buffers):
    """
    Load omx skim data into shared_data buffers
    Only called when multiprocessing - BEFORE any models are run or any call to load_data()

    Parameters
    ----------
    shared_data_buffers: dict of multiprocessing.RawArray keyed by skim_tag
    """

    assert self.multiprocess()
    # assert self.skim_dict_factory.supports_shared_data_for_multiprocessing

    if self.skim_dict_factory.supports_shared_data_for_multiprocessing:
        for skim_tag in self.skims_info.keys():
            assert skim_tag in shared_data_buffers, \
                "load_shared_data expected allocated shared_data_buffers"
            self.skim_dict_factory.load_skims_to_buffer(self.skims_info[skim_tag],
                                                        shared_data_buffers[skim_tag])

    if self.zone_system == THREE_ZONE:
        assert self.tvpb is not None

        if self.rebuild_tvpb_cache and not config.setting('resume_after', None):
            # delete old cache at start of new run so that stale cache is not loaded
            # by load_data_to_buffer
            # when singleprocess, this call is made (later in program flow) in the
            # initialize_los step
            self.tvpb.tap_cache.cleanup()

        self.tvpb.tap_cache.load_data_to_buffer(shared_data_buffers[self.tvpb.tap_cache.cache_tag])
def write_summaries(output_dir):

    summary_settings_name = 'output_summaries'
    summary_file_name = 'summaries.txt'

    summary_settings = setting(summary_settings_name)

    if summary_settings is None:
        logger.info(f"No {summary_settings_name} specified in settings file. Nothing to write.")
        return

    summary_dict = summary_settings

    mode = 'wb' if sys.version_info < (3,) else 'w'
    with open(config.output_file_path(summary_file_name), mode) as output_file:

        for table_name, column_names in summary_dict.items():

            df = pipeline.get_table(table_name)

            for c in column_names:
                n = 100
                empty = (df[c] == '') | df[c].isnull()

                print(f"\n### {table_name}.{c} type: {df.dtypes[c]} rows: {len(df)} "
                      f"({empty.sum()} empty)\n\n",
                      file=output_file)
                print(df[c].value_counts().nlargest(n), file=output_file)
def write_skim_cache(skim_info, skim_data):
    """
    write skim data from skim_data to canonically named cache file(s) in output directory
    """

    skim_cache_dir = config.setting('skim_cache_dir', default_skim_cache_dir())
    logger.info(f"load_skims writing skims data to cache directory {skim_cache_dir}")

    omx_name = skim_info['omx_name']
    dtype = np.dtype(skim_info['dtype'])

    blocks = skim_info['blocks']
    block = 0
    for block_name, block_size in blocks.items():
        skim_cache_file_name = build_skim_cache_file_name(omx_name, block)
        skim_cache_path = os.path.join(skim_cache_dir, skim_cache_file_name)

        block_data = skim_data[block]

        logger.info(f"load_skims writing block_name {block_name} {block_data.shape} "
                    f"to {skim_cache_file_name}")

        data = np.memmap(skim_cache_path, shape=block_data.shape, dtype=dtype, mode='w+')
        data[::] = block_data

        block += 1
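# Illustrative sketch (not from the source): the complementary read path, assuming the same
# canonical cache file naming produced by write_skim_cache above and the same skim_info /
# skim_data structures. build_skim_cache_file_name is the helper used in the writer; the
# function name and signature below are otherwise hypothetical.
import os
import numpy as np

def read_skim_cache_sketch(skim_info, skim_data, skim_cache_dir):
    dtype = np.dtype(skim_info['dtype'])
    for block, block_name in enumerate(skim_info['blocks']):
        cache_path = os.path.join(skim_cache_dir,
                                  build_skim_cache_file_name(skim_info['omx_name'], block))
        block_data = skim_data[block]
        # open the cache file read-only and copy it into the allocated skim buffer
        data = np.memmap(cache_path, shape=block_data.shape, dtype=dtype, mode='r')
        block_data[::] = data[::]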
def scenario_dir():

    scenarios_dir = setting('scenarios_dir')
    assert scenarios_dir is not None, "scenarios_dir not defined in settings file"
    if not os.path.exists(scenarios_dir):
        raise RuntimeError("scenarios_dir not found: %s" % scenarios_dir)

    scenario_name = setting('scenario_name')
    assert scenario_name is not None, "scenario_name not defined in settings file"

    scenario_dir_path = os.path.join(scenarios_dir, scenario_name)
    assert os.path.exists(scenario_dir_path), "scenario_dir not found: %s" % scenario_dir_path

    return scenario_dir_path
def scenarios_dir():

    scenarios_dir = setting('scenarios_dir', 'scenarios')

    if not os.path.exists(scenarios_dir):
        raise RuntimeError("scenarios_dir: directory does not exist: %s" % scenarios_dir)

    return scenarios_dir
def _destination_sample(
        spec_segment_name, choosers, destination_size_terms,
        skims, estimator, model_settings,
        alt_dest_col_name, chunk_size, chunk_tag, trace_label):

    model_spec = simulate.spec_for_segment(model_settings, spec_id='SAMPLE_SPEC',
                                           segment_name=spec_segment_name, estimator=estimator)

    logger.info("running %s with %d tours", trace_label, len(choosers))

    sample_size = model_settings['SAMPLE_SIZE']
    if config.setting('disable_destination_sampling', False) or \
            (estimator and estimator.want_unsampled_alternatives):
        # FIXME interaction_sample will return unsampled complete alternatives
        # with probs and pick_count
        logger.info("Estimation mode for %s using unsampled alternatives "
                    "short_circuit_choices" % trace_label)
        sample_size = 0

    locals_d = {'skims': skims}

    constants = config.get_model_constants(model_settings)
    if constants is not None:
        locals_d.update(constants)

    log_alt_losers = config.setting('log_alt_losers', False)

    choices = interaction_sample(
        choosers,
        alternatives=destination_size_terms,
        sample_size=sample_size,
        alt_col_name=alt_dest_col_name,
        log_alt_losers=log_alt_losers,
        spec=model_spec,
        skims=skims,
        locals_d=locals_d,
        chunk_size=chunk_size,
        chunk_tag=chunk_tag,
        trace_label=trace_label)

    # remember person_id in chosen alts so we can merge with persons in subsequent steps
    # (broadcasts person_id onto all alternatives sharing the same tour_id index value)
    choices['person_id'] = choosers.person_id

    return choices
def multiprocess(self):
    """
    return True if this is a multiprocessing run
    (even if it is a main or single-process subprocess)

    Returns
    -------
    bool
    """
    is_multiprocess = config.setting('multiprocess', False)
    return is_multiprocess
def multi_integerize(incidence_df, sub_zone_weights, sub_controls_df,
                     control_spec, total_hh_control_col,
                     parent_geography, parent_id,
                     sub_geography, sub_control_zones):
    """

    Parameters
    ----------
    incidence_df : pandas.DataFrame
        full incidence_df for all hh samples in seed zone
    sub_zone_weights : pandas.DataFrame
        balanced subzone household sample weights to integerize
    sub_controls_df : pandas.DataFrame
        sub_geography controls (one row per zone indexed by sub_zone id)
    control_spec : pandas.DataFrame
        full control spec with columns 'target', 'seed_table', 'importance', ...
    total_hh_control_col : str
        name of total_hh column (so we can preferentially match this control)
    parent_geography : str
        parent geography zone name
    parent_id : int
        parent geography zone id
    sub_geography : str
        subzone geography name (e.g. 'TAZ')
    sub_control_zones : pandas.Series
        index is zone id and value is zone label (e.g. TAZ_101)
        for use in sub_controls_df column names

    Returns
    -------
    integer_weights_df : pandas.DataFrame
        canonical form weight table, with columns for 'balanced_weight', 'integer_weight'
        plus columns for household id, parent and sub_geography zone ids
    """

    trace_label = "%s_%s" % (parent_geography, parent_id)

    if setting('NO_INTEGERIZATION_EVER', False):
        integerizer = do_no_integerizing
    elif use_simul_integerizer():
        integerizer = do_simul_integerizing
    else:
        integerizer = do_sequential_integerizing

    integer_weights_df = integerizer(
        trace_label=trace_label,
        incidence_df=incidence_df,
        sub_weights=sub_zone_weights,
        sub_controls_df=sub_controls_df,
        control_spec=control_spec,
        total_hh_control_col=total_hh_control_col,
        sub_geography=sub_geography,
        sub_control_zones=sub_control_zones,
    )

    return integer_weights_df
def run_destination_sample(
        spec_segment_name, tours, persons_merged,
        model_settings, network_los, destination_size_terms,
        estimator, chunk_size, trace_label):

    # FIXME - MEMORY HACK - only include columns actually used in spec (omit them pre-merge)
    chooser_columns = model_settings['SIMULATE_CHOOSER_COLUMNS']
    persons_merged = persons_merged[[c for c in persons_merged.columns if c in chooser_columns]]
    tours = tours[[c for c in tours.columns if c in chooser_columns or c == 'person_id']]
    choosers = pd.merge(tours, persons_merged,
                        left_on='person_id', right_index=True, how='left')

    # interaction_sample requires that choosers.index.is_monotonic_increasing
    if not choosers.index.is_monotonic_increasing:
        logger.debug(f"run_destination_sample {trace_label} sorting choosers "
                     f"because not monotonic_increasing")
        choosers = choosers.sort_index()

    # by default, enable presampling for multizone systems, unless they disable it in settings file
    pre_sample_taz = not (network_los.zone_system == los.ONE_ZONE)
    if pre_sample_taz and not config.setting('want_dest_choice_presampling', True):
        pre_sample_taz = False
        logger.info(f"Disabled destination zone presampling for {trace_label} "
                    f"because 'want_dest_choice_presampling' setting is False")

    if pre_sample_taz:

        logger.info("Running %s destination_presample with %d tours" % (trace_label, len(tours)))

        choices = destination_presample(
            spec_segment_name,
            choosers,
            model_settings,
            network_los,
            destination_size_terms,
            estimator,
            chunk_size,
            trace_label)

    else:
        choices = destination_sample(
            spec_segment_name,
            choosers,
            model_settings,
            network_los,
            destination_size_terms,
            estimator,
            chunk_size,
            trace_label)

    # remember person_id in chosen alts so we can merge with persons in subsequent steps
    # (broadcasts person_id onto all alternatives sharing the same tour_id index value)
    choices['person_id'] = tours.person_id

    return choices
def load_skims(omx_file_path, skim_info, skim_buffers):

    read_cache = config.setting('read_skim_cache')
    write_cache = config.setting('write_skim_cache')
    assert not (read_cache and write_cache), \
        "read_skim_cache and write_skim_cache are both True in settings file. " \
        "I am assuming this is a mistake"

    skim_data = skim_data_from_buffers(skim_buffers, skim_info)

    t0 = tracing.print_elapsed_time()

    if read_cache:
        read_skim_cache(skim_info, skim_data)
        t0 = tracing.print_elapsed_time("read_skim_cache", t0)
    else:
        read_skims_from_omx(skim_info, skim_data, omx_file_path)
        t0 = tracing.print_elapsed_time("read_skims_from_omx", t0)

    if write_cache:
        write_skim_cache(skim_info, skim_data)
        t0 = tracing.print_elapsed_time("write_skim_cache", t0)
def log_settings():

    settings = [
        'households_sample_size',
        'chunk_size',
        'multiprocess',
        'num_processes',
        'resume_after',
    ]

    for k in settings:
        logger.info('setting %s: %s' % (k, config.setting(k)))
def input_pre_processor():

    # - load generic data
    data_dir = setting('data_dir', inject.get_injectable('data_dir'))
    load_tables('input_tables', data_dir)

    # - load scenario input data
    scenario_input_dir = os.path.join(scenario_dir(), 'inputs')
    load_tables('scenario_input_tables', scenario_input_dir)

    for table_name in pipeline.orca_dataframe_tables():
        df = inject.get_table(table_name, None).to_frame()
def merge_seed_data(expanded_household_ids, seed_data_df, seed_columns, trace_label):

    seed_geography = setting('seed_geography')
    hh_col = setting('household_id_col')

    df_columns = seed_data_df.columns.values

    # warn of any columns that aren't in seed_data_df
    for c in seed_columns:
        if c not in df_columns and c != hh_col:
            logger.warning("column '%s' not in %s" % (c, trace_label))

    # remove any columns that aren't in seed_data_df
    df_columns = [c for c in seed_columns if c in df_columns]

    # seed_geography column in seed_data_df is redundant (already in expanded_household_ids table)
    if seed_geography in df_columns:
        df_columns.remove(seed_geography)

    # join to seed_data on either index or hh_col (for persons)
    right_index = (seed_data_df.index.name == hh_col)
    right_on = hh_col if hh_col in seed_data_df.columns and not right_index else None
    assert right_index or right_on

    if right_on and hh_col not in df_columns:
        df_columns.append(hh_col)

    merged_df = pd.merge(
        how="left",
        left=expanded_household_ids,
        right=seed_data_df[df_columns],
        left_on=hh_col,
        right_index=right_index,
        right_on=right_on
    )

    if hh_col not in seed_columns:
        del merged_df[hh_col]

    return merged_df
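# Illustrative sketch (not from the source): the two join modes selected above. When the seed
# table is indexed by the household id column, merge on the right index; when the id is an
# ordinary column, merge on that column. Column names and values are made-up examples.
import pandas as pd

expanded = pd.DataFrame({'hh_id': [1, 1, 2]})
seed_indexed = pd.DataFrame({'hh_income': [50, 75]}, index=pd.Index([1, 2], name='hh_id'))
seed_column = seed_indexed.reset_index()

# right_index form (seed table indexed by hh_id)
merged_a = pd.merge(expanded, seed_indexed, how='left', left_on='hh_id', right_index=True)
# right_on form (hh_id present as an ordinary column)
merged_b = pd.merge(expanded, seed_column, how='left', left_on='hh_id', right_on='hh_id')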
def input_pre_processor():
    """
    Read input text files and save them as pipeline tables for use in subsequent steps.

    The files to read are specified by table_list, an array of dicts that specify the
    input file name and the name of the pipeline table, along with keys allowing the
    specification of pre-processing steps.

    By default, reads table_list from 'input_table_list' in settings.yaml,
    unless an alternate table_list name is specified as a model step argument 'table_list'.
    (This allows alternate/additional input files to be read for repop)

    In the case of repop, this step is being run after an initial run has completed,
    in which case the input_table_list may specify replacement tables.
    (e.g. lowest geography controls that will replace the previous low controls dataframe.)

    See input_table_list in settings.yaml in the example folder for a working example

    +--------------+----------------------------------------------------------+
    | key          | description                                              |
    +==============+==========================================================+
    | tablename    | name of pipeline table in which to store dataframe       |
    +--------------+----------------------------------------------------------+
    | filename     | name of csv file to read (in data_dir)                   |
    +--------------+----------------------------------------------------------+
    | column_map   | list of input columns to rename from_name: to_name       |
    +--------------+----------------------------------------------------------+
    | index_col    | name of column to set as dataframe index column          |
    +--------------+----------------------------------------------------------+
    | drop_columns | list of column names of columns to drop                  |
    +--------------+----------------------------------------------------------+

    """

    # alternate table list name may have been provided as a model argument
    table_list_name = inject.get_step_arg('table_list', default='input_table_list')
    table_list = config.setting(table_list_name)

    assert table_list is not None, "no table list '%s' found in settings." % table_list_name

    logger.info('Using table list: %s' % table_list)

    for table_info in table_list:

        tablename = table_info.get('tablename')
        df = input.read_from_table_info(table_info)
        logger.info('registering table %s' % tablename)

        # add (or replace) pipeline table
        repop = inject.get_step_arg('repop', default=False)
        inject.add_table(tablename, df, replace=repop)
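# Illustrative sketch (not from the source): the shape of an input_table_list entry as it
# would appear once the settings YAML is loaded. File, table, and column names are made-up
# examples; the keys are the ones documented in the docstring above.
example_input_table_list = [
    {
        'tablename': 'households',
        'filename': 'seed_households.csv',
        'index_col': 'hh_id',
        'column_map': {'hhnum': 'hh_id'},   # rename from_name -> to_name
        'drop_columns': ['FILLER'],
    },
]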
def person_trips_processor(
        trips_with_demographics,
        person_trips_spec,
        person_trips_settings,
        coc_column_names,
        settings,
        chunk_size,
        trace_hh_id):
    """
    Compute disaggregate trips benefits
    """

    trips_df = trips_with_demographics.to_frame()

    logger.info("Running person_trips_processor with %d trips (chunk size = %s)"
                % (len(trips_with_demographics), chunk_size))

    # eval person_trips_spec in context of trips_with_demographics
    locals_dict = config.get_model_constants(person_trips_settings)
    locals_dict.update(config.setting('globals'))
    locals_dict['trips'] = trips_df

    trace_rows = trace_hh_id and trips_df['household_id'] == trace_hh_id

    coc_summary, trace_results, trace_assigned_locals = \
        bca.eval_and_sum(assignment_expressions=person_trips_spec,
                         df=trips_df,
                         locals_dict=locals_dict,
                         df_alias='trips',
                         group_by_column_names=coc_column_names,
                         chunk_size=chunk_size,
                         trace_rows=trace_rows)

    result_prefix = 'PT_'
    add_result_columns("coc_results", coc_summary, result_prefix)
    add_summary_results(coc_summary, prefix=result_prefix, spec=person_trips_spec)

    if trace_hh_id:

        if trace_results is not None:
            # FIXME - moved this into assign_variables
            # add trips_df columns to trace_results
            # trace_results = pd.concat([trips_df[trace_rows], trace_results], axis=1)

            tracing.write_csv(trace_results,
                              file_name="person_trips",
                              index_label='trip_id',
                              column_labels=['label', 'trip'])

        if trace_assigned_locals:
            tracing.write_csv(trace_assigned_locals, file_name="person_trips_locals")
def aggregate_zone_processor(zones, trace_od):
    """
    zones: orca table

    zone data for base and build scenario dat files combined into a single dataframe
    with column names prefixed with base_ or build_, indexed by ZONE
    """

    trace_label = 'aggregate_zone'
    model_settings = config.read_model_settings('aggregate_zone.yaml')
    spec_file_name = model_settings.get('spec_file_name', 'aggregate_zone.csv')
    aggregate_zone_spec = bca.read_assignment_spec(spec_file_name)

    zones_df = zones.to_frame()

    logger.info("Running aggregate_zone_processor with %d zones" % (len(zones_df.index),))

    if trace_od:
        trace_orig, trace_dest = trace_od
        trace_od_rows = (zones_df.index == trace_orig) | (zones_df.index == trace_dest)
    else:
        trace_od_rows = None

    # locals whose values will be accessible to the execution context
    # when the expressions in spec are applied to choosers
    locals_dict = config.get_model_constants(model_settings)
    locals_dict.update(config.setting('globals'))

    # eval_variables evaluates each of the expressions in spec
    # in the context of each row of the choosers dataframe
    results, trace_results, trace_assigned_locals = \
        assign.assign_variables(aggregate_zone_spec,
                                zones_df,
                                locals_dict,
                                df_alias='zones',
                                trace_rows=trace_od_rows)

    pipeline.replace_table('aggregate_zone_summary', results)

    if trace_results is not None:
        tracing.write_csv(trace_results,
                          file_name="aggregate_zone",
                          index_label='zone',
                          column_labels=['label', 'zone'])

    if trace_assigned_locals:
        tracing.write_csv(trace_assigned_locals, file_name="aggregate_zone_locals")
def trip_destination_sample(
        primary_purpose,
        trips,
        alternatives,
        model_settings,
        size_term_matrix,
        skim_hotel,
        estimator,
        chunk_size, trace_hh_id,
        trace_label):
    """

    Returns
    -------
    destination_sample: pandas.dataframe
        choices_df from interaction_sample with (up to) sample_size alts for each chooser row
        index (non unique) is trip_id from trips (duplicated for each alt)
        and columns dest_zone_id, prob, and pick_count

        dest_zone_id: int
            alt identifier from alternatives[<alt_col_name>]
        prob: float
            the probability of the chosen alternative
        pick_count : int
            number of duplicate picks for chooser, alt
    """
    trace_label = tracing.extend_trace_label(trace_label, 'trip_destination_sample')

    assert len(trips) > 0
    assert len(alternatives) > 0

    # by default, enable presampling for multizone systems, unless they disable it in settings file
    network_los = inject.get_injectable('network_los')
    pre_sample_taz = not (network_los.zone_system == los.ONE_ZONE)
    if pre_sample_taz and not config.setting('want_dest_choice_presampling', True):
        pre_sample_taz = False
        logger.info(f"Disabled destination zone presampling for {trace_label} "
                    f"because 'want_dest_choice_presampling' setting is False")

    if pre_sample_taz:

        logger.info("Running %s trip_destination_presample with %d trips" % (trace_label, len(trips)))

        choices = destination_presample(
            primary_purpose,
            trips,
            alternatives,
            model_settings,
            size_term_matrix,
            skim_hotel,
            network_los,
            estimator,
            chunk_size, trace_hh_id,
            trace_label)

    else:
        choices = destination_sample(
            primary_purpose,
            trips,
            alternatives,
            model_settings,
            size_term_matrix,
            skim_hotel,
            estimator,
            chunk_size,
            trace_label)

    return choices
def run():

    config.handle_standard_args()

    # specify None for a pseudo random base seed
    # inject.add_injectable('rng_base_seed', 0)

    tracing.config_logger()
    config.filter_warnings()

    tracing.delete_csv_files()

    # If you provide a resume_after argument to pipeline.run
    # the pipeline manager will attempt to load checkpointed tables from the checkpoint store
    # and resume pipeline processing on the next submodel step after the specified checkpoint
    resume_after = setting('resume_after', None)

    if resume_after:
        print("resume_after", resume_after)

    pipeline.run(models=setting('models'), resume_after=resume_after)

    # tables will no longer be available after pipeline is closed
    pipeline.close_pipeline()
def log_settings(injectables):

    settings = [
        'households_sample_size',
        'chunk_size',
        'multiprocess',
        'num_processes',
        'resume_after',
    ]

    for k in settings:
        logger.info("setting %s: %s" % (k, config.setting(k)))

    for k in injectables:
        logger.info("injectable %s: %s" % (k, inject.get_injectable(k)))
def read_input_table(table_name):

    filename = setting('input_store', None)

    if not filename:
        logger.error("input store file name not specified in settings")
        raise RuntimeError("store file name not specified in settings")

    input_store_path = config.data_file_path(filename)

    if not os.path.exists(input_store_path):
        logger.error("store file not found: %s" % input_store_path)
        raise RuntimeError("store file not found: %s" % input_store_path)

    df = pd.read_hdf(input_store_path, table_name)

    return df
def write_tables(output_dir):
    """
    Write pipeline tables as csv files (in output directory) as specified by
    output_tables list in settings file.

    'output_tables' can specify either a list of output tables to include or to skip
    if no output_tables list is specified, then no checkpointed tables will be written

    To write all output tables EXCEPT the households and persons tables:

    ::

      output_tables:
        action: skip
        tables:
          - households
          - persons

    To write ONLY the households table:

    ::

      output_tables:
        action: include
        tables:
          - households

    Parameters
    ----------
    output_dir: str

    """

    output_tables_settings_name = 'output_tables'

    output_tables_settings = setting(output_tables_settings_name)

    if output_tables_settings is None:
        logger.info("No output_tables specified in settings file. Nothing to write.")
        return

    action = output_tables_settings.get('action')
    tables = output_tables_settings.get('tables')
    prefix = output_tables_settings.get('prefix', 'final_')

    if action not in ['include', 'skip']:
        raise RuntimeError("expected %s action '%s' to be either 'include' or 'skip'"
                           % (output_tables_settings_name, action))

    checkpointed_tables = pipeline.checkpointed_tables()
    if action == 'include':
        output_tables_list = tables
    elif action == 'skip':
        output_tables_list = [t for t in checkpointed_tables if t not in tables]

    for table_name in output_tables_list:

        if table_name == 'checkpoints':
            df = pipeline.get_checkpoints()
        else:
            if table_name not in checkpointed_tables:
                logger.warning("Skipping '%s': Table not found." % table_name)
                continue
            df = pipeline.get_table(table_name)

        file_name = "%s%s.csv" % (prefix, table_name)
        file_path = config.output_file_path(file_name)

        # include the index if it has a name or is a MultiIndex
        write_index = df.index.name is not None or isinstance(df.index, pd.MultiIndex)

        df.to_csv(file_path, index=write_index)
def add_size_tables():
    """
    inject tour_destination_size_terms tables for each model_selector (e.g. school, workplace)

    Size tables are pandas dataframes with locations counts for model_selector by zone and segment
    tour_destination_size_terms

    if using shadow pricing, we scale size_table counts to sample population
    (in which case, they have to be created while single-process)

    Scaling is problematic as it breaks household result replicability across sample sizes
    It also changes the magnitude of the size terms so if they are used as utilities in
    expression files, their importance will diminish relative to other utilities as the
    sample size decreases.

    Scaling makes most sense for a full sample in conjunction with shadow pricing, where
    shadow prices can be adjusted iteratively to bring modelled counts into line with
    desired (size table) counts.
    """

    use_shadow_pricing = bool(config.setting('use_shadow_pricing'))

    shadow_settings = config.read_model_settings('shadow_pricing.yaml')
    shadow_pricing_models = shadow_settings['shadow_pricing_models']

    # probably ought not scale if not shadow_pricing (breaks partial sample replicability)
    # but this allows compatibility with existing CTRAMP behavior...
    scale_size_table = shadow_settings.get('SCALE_SIZE_TABLE', False)

    if shadow_pricing_models is None:
        logger.warning('shadow_pricing_models list not found in shadow_pricing settings')
        return

    # shadow_pricing_models is dict of {<model_selector>: <model_name>}
    # since these are scaled to model size, they have to be created while single-process

    for model_selector, model_name in shadow_pricing_models.items():

        model_settings = config.read_model_settings(model_name)

        assert model_selector == model_settings['MODEL_SELECTOR']

        segment_ids = model_settings['SEGMENT_IDS']
        chooser_table_name = model_settings['CHOOSER_TABLE_NAME']
        chooser_segment_column = model_settings['CHOOSER_SEGMENT_COLUMN_NAME']

        choosers_df = inject.get_table(chooser_table_name).to_frame()
        if 'CHOOSER_FILTER_COLUMN_NAME' in model_settings:
            choosers_df = \
                choosers_df[choosers_df[model_settings['CHOOSER_FILTER_COLUMN_NAME']] != 0]

        # - raw_desired_size
        land_use = inject.get_table('land_use')
        size_terms = inject.get_injectable('size_terms')
        raw_size = tour_destination_size_terms(land_use, size_terms, model_selector)
        assert set(raw_size.columns) == set(segment_ids.keys())

        if use_shadow_pricing or scale_size_table:

            inject.add_table('raw_' + size_table_name(model_selector), raw_size)

            # - scale size_table counts to sample population
            # scaled_size = zone_size * (total_segment_modeled / total_segment_desired)

            # segment scale factor (modeled / desired) keyed by segment_name
            segment_scale_factors = {}
            for c in raw_size:
                # number of zone demographics desired destination choices
                segment_desired_size = raw_size[c].astype(np.float64).sum()

                # number of synthetic population choosers in segment
                segment_chooser_count = \
                    (choosers_df[chooser_segment_column] == segment_ids[c]).sum()

                segment_scale_factors[c] = \
                    segment_chooser_count / np.maximum(segment_desired_size, 1)

                logger.info("add_desired_size_tables %s segment %s "
                            "desired %s modeled %s scale_factor %s" %
                            (chooser_table_name, c,
                             segment_desired_size,
                             segment_chooser_count,
                             segment_scale_factors[c]))

            # FIXME - should we be rounding?
            scaled_size = (raw_size * segment_scale_factors).round()
        else:
            scaled_size = raw_size

        inject.add_table(size_table_name(model_selector), scaled_size)
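# Illustrative sketch (not from the source): the size-term scaling arithmetic used above, with
# made-up numbers. If a segment's size terms sum to 1000 desired destinations but the sampled
# synthetic population contains only 250 choosers in that segment, every zone's size term is
# scaled by 250 / 1000 = 0.25 and then rounded. Zone ids and the 'work_low' segment name are
# hypothetical.
import numpy as np
import pandas as pd

raw_size = pd.DataFrame({'work_low': [400.0, 600.0]}, index=[1, 2])          # size terms by zone
segment_chooser_count = 250
segment_desired_size = raw_size['work_low'].sum()                             # 1000.0
scale_factor = segment_chooser_count / np.maximum(segment_desired_size, 1)    # 0.25
scaled_size = raw_size.mul(pd.Series({'work_low': scale_factor}), axis='columns').round()
# scaled_size['work_low'] -> [100.0, 150.0]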
def __init__(self, model_settings, num_processes, shared_data=None, shared_data_lock=None):
    """
    Presence of shared_data is used as a flag for multiprocessing
    If we are multiprocessing, shared_data should be a multiprocessing.RawArray buffer
    to aggregate modeled_size across all sub-processes, and shared_data_lock should be
    a multiprocessing.Lock object to coordinate access to that buffer.

    Optionally load saved shadow_prices from data_dir if config setting use_shadow_pricing
    and shadow_setting LOAD_SAVED_SHADOW_PRICES are both True

    Parameters
    ----------
    model_settings : dict
    shared_data : numpy array wrapping multiprocessing.RawArray, or None (if single process)
    shared_data_lock : multiprocessing.Lock, or None (if single process)
    """

    self.num_processes = num_processes
    self.use_shadow_pricing = bool(config.setting('use_shadow_pricing'))
    self.saved_shadow_price_file_path = None  # set by read_saved_shadow_prices if loaded

    self.model_selector = model_settings['MODEL_SELECTOR']

    full_model_run = config.setting('households_sample_size') == 0
    if self.use_shadow_pricing and not full_model_run:
        logging.warning("deprecated combination of use_shadow_pricing and not full_model_run")

    self.segment_ids = model_settings['SEGMENT_IDS']

    # - modeled_size (set by call to set_choices/synchronize_choices)
    self.modeled_size = None

    if self.use_shadow_pricing:
        self.shadow_settings = config.read_model_settings('shadow_pricing.yaml')

        for k in self.shadow_settings:
            logger.debug("shadow_settings %s: %s" % (k, self.shadow_settings.get(k)))

    # - destination_size_table (desired_size)
    self.desired_size = inject.get_table(size_table_name(self.model_selector)).to_frame()

    # - shared_data
    if shared_data is not None:
        assert shared_data.shape[0] == self.desired_size.shape[0]
        assert shared_data.shape[1] == self.desired_size.shape[1] + 1  # tally column
        assert shared_data_lock is not None
    self.shared_data = shared_data
    self.shared_data_lock = shared_data_lock

    # - load saved shadow_prices (if available) and set max_iterations accordingly
    if self.use_shadow_pricing:
        self.shadow_prices = None
        self.shadow_price_method = self.shadow_settings['SHADOW_PRICE_METHOD']
        assert self.shadow_price_method in ['daysim', 'ctramp']

        if self.shadow_settings['LOAD_SAVED_SHADOW_PRICES']:
            # read_saved_shadow_prices logs error and returns None if file not found
            self.shadow_prices = self.read_saved_shadow_prices(model_settings)

        if self.shadow_prices is None:
            self.max_iterations = self.shadow_settings.get('MAX_ITERATIONS', 5)
        else:
            self.max_iterations = self.shadow_settings.get('MAX_ITERATIONS_SAVED', 1)

        # initial_shadow_price if we did not load
        if self.shadow_prices is None:
            # initial value depends on method
            initial_shadow_price = 1.0 if self.shadow_price_method == 'ctramp' else 0.0
            self.shadow_prices = \
                pd.DataFrame(data=initial_shadow_price,
                             columns=self.desired_size.columns,
                             index=self.desired_size.index)
    else:
        self.max_iterations = 1

    self.num_fail = pd.DataFrame(index=self.desired_size.columns)
    self.max_abs_diff = pd.DataFrame(index=self.desired_size.columns)
    self.max_rel_diff = pd.DataFrame(index=self.desired_size.columns)
# inject.add_injectable('data_dir', '/Users/jeff.doyle/work/activitysim-data/mtc_tm1/data')
inject.add_injectable('data_dir', ['ancillary_data', data_dir])
# inject.add_injectable('data_dir', ['ancillary_data', '../activitysim/abm/test/data'])

inject.add_injectable('configs_dir', ['configs', '../example/configs'])

injectables = config.handle_standard_args()

tracing.config_logger()
config.filter_warnings()

log_settings(injectables)

t0 = tracing.print_elapsed_time()

# cleanup if not resuming
if not config.setting('resume_after', False):
    cleanup_output_files()

run_list = mp_tasks.get_run_list()

if run_list['multiprocess']:
    # do this after config.handle_standard_args, as command line args may override injectables
    injectables = list(set(injectables) | set(['data_dir', 'configs_dir', 'output_dir']))
    injectables = {k: inject.get_injectable(k) for k in injectables}
else:
    injectables = None

run(run_list, injectables)

# pipeline.open_pipeline('_')