def initialize_households():
    """
    Load, annotate and preload the household-level tables.

    All work happens inside a base-level ``chunk.chunk_log`` context so each
    dataframe touched here is reported to the chunk logger.
    """
    trace_label = 'initialize_households'

    with chunk.chunk_log(trace_label, base=True):

        chunk.log_rss(f"{trace_label}.inside-yield")

        # fetch each table, check it is not a view, report it, then release it
        # (the second log_df call with None tells the chunk logger it is gone)
        for table_name in ('households', 'persons'):
            table_df = inject.get_table(table_name).to_frame()
            assert not table_df._is_view
            chunk.log_df(trace_label, table_name, table_df)
            del table_df
            chunk.log_df(trace_label, table_name, None)

        settings = config.read_model_settings('initialize_households.yaml', mandatory=True)
        annotate_tables(settings, trace_label)

        # - initialize shadow_pricing size tables after annotating household and person tables
        # since these are scaled to model size, they have to be created while single-process
        # (add_size_tables can also be run as a stand-alone model step; the setting turns it off here)
        if settings.get('add_size_tables', True):
            shadow_pricing.add_size_tables()

        # - preload person_windows
        windows_df = inject.get_table('person_windows').to_frame()
        chunk.log_df(trace_label, "person_windows", windows_df)
def auto_ownership_simulate(households_merged,
                            auto_ownership_spec,
                            auto_ownership_settings,
                            trace_hh_id):
    """
    Auto ownership is a standard model which predicts how many cars a household
    with given characteristics owns

    The choices are added to the households table as column 'auto_ownership'.
    """
    model_name = 'auto_ownership'

    logger.info("Running auto_ownership_simulate with %d households" % len(households_merged))

    nest_spec = config.get_logit_model_settings(auto_ownership_settings)
    constants = config.get_model_constants(auto_ownership_settings)

    # trace_label is only set to the model name when a trace household is given
    # (otherwise the falsy trace_hh_id value itself is passed through)
    choices = asim.simple_simulate(
        choosers=households_merged.to_frame(),
        spec=auto_ownership_spec,
        nest_spec=nest_spec,
        locals_d=constants,
        trace_label=trace_hh_id and 'auto_ownership',
        trace_choice_name='auto_ownership')

    tracing.print_summary(model_name, choices, value_counts=True)

    inject.add_column('households', 'auto_ownership', choices)

    pipeline.add_dependent_columns('households', 'households_autoown')

    if not trace_hh_id:
        return

    # trace the choice together with the dependent columns
    dependent_columns = inject.get_table('households_autoown').columns
    trace_columns = ['auto_ownership'] + dependent_columns
    households_df = inject.get_table('households').to_frame()
    tracing.trace_df(households_df,
                     label='auto_ownership',
                     columns=trace_columns,
                     warn_if_empty=True)
def test_persons_merged_table():
    """persons_merged has person and household columns and lines up with persons."""
    merged_df = inject.get_table('persons_merged').to_frame()
    merged_columns = merged_df.columns

    assert 'person_gender' in merged_columns
    assert 'hh_income' in merged_columns

    # person_type must match the persons table row for row
    persons_df = inject.get_table('persons').to_frame()
    same_person_type = merged_df.person_type == persons_df.person_type
    assert same_person_type.all()

    row_count = merged_df.shape[0]
    assert row_count == 27
def initialize_households():
    """
    Annotate the tables listed in initialize_households.yaml, create the
    shadow pricing size tables, and preload person_windows.
    """
    step_name = 'initialize_households'

    settings = config.read_model_settings('initialize_households.yaml', mandatory=True)
    annotate_tables(settings, step_name)

    # - initialize shadow_pricing size tables after annotating household and person tables
    # since these are scaled to model size, they have to be created while single-process
    shadow_pricing.add_size_tables()

    # - preload person_windows (the frame itself is not needed, only the load)
    started = tracing.print_elapsed_time()
    inject.get_table('person_windows').to_frame()
    tracing.print_elapsed_time("preload person_windows", started, debug=True)
def initialize_households():
    """
    Run table annotation for the initialize_households step, then build the
    shadow pricing size tables and force person_windows to load.
    """
    label = 'initialize_households'

    annotate_tables(
        config.read_model_settings('initialize_households.yaml', mandatory=True),
        label)

    # - initialize shadow_pricing size tables after annotating household and person tables
    # since these are scaled to model size, they have to be created while single-process
    shadow_pricing.add_size_tables()

    # - preload person_windows and report how long the load took
    tick = tracing.print_elapsed_time()
    person_windows = inject.get_table('person_windows')
    person_windows.to_frame()
    tracing.print_elapsed_time("preload person_windows", tick, debug=True)
def __init__(self, size_term_selector):
    """
    Compute the destination size terms for ``size_term_selector`` once, at
    construction, so they can be requested later for various segments
    (tour_type or purpose).
    """
    # arguments are fetched in the same order as before: land_use, then size_terms
    self.destination_size_terms = tour_destination_size_terms(
        inject.get_table('land_use'),
        inject.get_injectable('size_terms'),
        size_term_selector)
def create_controls(spec):
    """
    Evaluate the control expressions in ``spec`` against the 'all_acs' table.

    Each expression is evaluated with the 'all_acs' dataframe available as
    ``df``. When a target name occurs more than once, only the result of its
    last expression is kept.

    Parameters
    ----------
    spec : object with ``geog``, ``target`` and ``expression`` attributes
        The three attributes are iterated in lockstep
        (presumably a pandas.DataFrame - confirm against the caller).

    Returns
    -------
    variables : pandas.DataFrame
        One column per distinct target, left-merged on index with the
        'state', 'county', 'tract' and 'block group' columns of 'all_acs',
        plus a 'block_group_id' column built by concatenating county, tract
        and block group as strings.
    """
    # local import keeps this block self-contained
    from collections import OrderedDict

    locals_d = {'df': inject.get_table('all_acs').to_frame()}

    # NOTE(review): eval() runs arbitrary code, so spec expressions must come
    # from a trusted source.
    results = []
    for _geog, target, expression in zip(spec.geog, spec.target, spec.expression):
        values = to_series(eval(expression, globals(), locals_d), target=target)
        results.append((target, values))

    # keep only the last result for each target, preserving the relative
    # order of those last occurrences
    variables = []
    seen = set()
    for target_name, values in reversed(results):
        if target_name not in seen:
            variables.append((target_name, values))
            seen.add(target_name)
    variables.reverse()

    # DataFrame.from_items was deprecated in pandas 0.23 and removed in 1.0;
    # from_dict on an ordered mapping builds the same frame on old and new pandas
    variables = pd.DataFrame.from_dict(OrderedDict(variables))

    geography_columns = ['state', 'county', 'tract', 'block group']
    variables = variables.merge(locals_d['df'][geography_columns],
                                how='left', left_index=True, right_index=True)

    variables['block_group_id'] = (
        variables['county'].astype('str')
        + variables['tract'].astype('str')
        + variables['block group'].astype('str'))

    return variables
def __init__(self, size_term_selector):
    """
    Precompute destination size terms for the given selector.

    Done once here so that size_terms can later be requested for various
    segments (tour_type or purpose) without recomputing them.
    """
    land_use_table = inject.get_table('land_use')
    size_terms_table = inject.get_injectable('size_terms')

    destination_size_terms = tour_destination_size_terms(
        land_use_table, size_terms_table, size_term_selector)
    self.destination_size_terms = destination_size_terms
def create_mandatory_tours():
    """
    Build mandatory tours for persons with a non-null mandatory_tour_frequency,
    annotate them, and append them to the pipeline "tours" table.
    """
    # FIXME - move this to body?

    persons_table = inject.get_table('persons')
    configs_dir = inject.get_injectable('configs_dir')

    needed_columns = ["mandatory_tour_frequency", "is_worker", "school_taz", "workplace_taz"]
    persons_df = persons_table.to_frame(columns=needed_columns)

    # only persons that were assigned a mandatory tour frequency generate tours
    has_frequency = ~persons_df.mandatory_tour_frequency.isnull()
    persons_df = persons_df[has_frequency]

    alternatives = inject.get_injectable('mandatory_tour_frequency_alternatives')

    tours = process_mandatory_tours(persons_df, alternatives)

    expressions.assign_columns(
        df=tours,
        model_settings='annotate_tours_with_dest',
        configs_dir=configs_dir,
        trace_label='create_mandatory_tours')

    # append to the pipeline table, then register for tracing and random numbers
    pipeline.extend_table("tours", tours)
    tracing.register_traceable_table('tours', tours)
    pipeline.get_rn_generator().add_channel(tours, 'tours')
def test_disaggregate_trips_table():
    """disaggregate_trips exposes build and base auto times and has 250 rows."""
    trips_df = inject.get_table('disaggregate_trips').to_frame()
    trip_columns = trips_df.columns

    assert 'build_auto_time' in trip_columns
    assert 'base_auto_time' in trip_columns

    row_count, _ = trips_df.shape
    assert row_count == 250
def append_tour_leg_trip_mode_choice_logsums(tours):
    """Creates trip mode choice logsum column in tours table for each tour mode and leg

    The logsums come from the 'trips' table, pivoted so that there is one row
    per tour_id and one column per (tour_mode, outbound) combination.

    Parameters
    ----------
    tours : pd.DataFrame
        indexed by tour_id

    Returns
    -------
    tours : pd.DataFrame
        Adds two * n_modes logsum columns to each tour row, e.g. "logsum_DRIVE_outbound".
        Tours that have no rows in the trips table get missing values in these columns.
    """
    trips = inject.get_table('trips').to_frame()

    trip_dir_mode_logsums = trips.pivot(
        index='tour_id',
        columns=['tour_mode', 'outbound'],
        values='trip_mode_choice_logsum')

    # flatten the (mode, outbound) column pairs into single column names
    trip_dir_mode_logsums.columns = [
        '_'.join(['logsum', mode, 'outbound' if outbound else 'inbound'])
        for mode, outbound in trip_dir_mode_logsums.columns
    ]

    # reindex returns a new frame, so the result has to be assigned;
    # previously it was discarded and this line had no effect
    trip_dir_mode_logsums = trip_dir_mode_logsums.reindex(tours.index)

    tours = pd.merge(tours, trip_dir_mode_logsums, left_index=True, right_index=True)

    return tours
def write_data_dictionary(output_dir):
    """
    Write table_name, number of rows, columns, and bytes for each checkpointed table

    Two files are written to output_dir:
    data_dict.txt with the shape and column dtypes of each table, and
    data_dict.csv with one summary row per table, sorted by table name.

    Parameters
    ----------
    output_dir: str
    """
    # widen the pandas display limits so long dtype listings are not truncated
    # (this changes the global pandas options)
    pd.options.display.max_columns = 500
    pd.options.display.max_rows = 100

    output_tables = pipeline.checkpointed_tables()

    records = []

    # write data dictionary for all checkpointed_tables
    with open(os.path.join(output_dir, 'data_dict.txt'), 'w') as output_file:
        for table_name in output_tables:
            df = inject.get_table(table_name, None).to_frame()

            # explicit write() calls emit the same text as the Python 2
            # "print >> file" statements they replace, and also work on
            # Python 3, where "print >> file" raises a TypeError
            output_file.write("\n### %s %s\n" % (table_name, df.shape))
            output_file.write(str(df.dtypes) + "\n")

            n_rows, n_columns = df.shape
            n_bytes = df.memory_usage(index=True).sum()
            records.append((table_name, n_rows, n_columns, n_bytes))

    summary = pd.DataFrame.from_records(
        records, columns=['table_name', 'rows', 'columns', 'bytes'])
    summary.sort_values(by='table_name', inplace=True)
    summary.to_csv(os.path.join(output_dir, 'data_dict.csv'))
def annotate_tables(model_settings, trace_label):
    """
    Rename and/or annotate each table listed under 'annotate_tables' in
    model_settings, writing every processed table back to the pipeline.

    Parameters
    ----------
    model_settings : dict
        read for 'annotate_tables': a list of dicts with 'tablename' and
        optional 'column_map' and 'annotate' entries
    trace_label : str
    """
    table_infos = model_settings.get('annotate_tables', [])

    if not table_infos:
        logger.warning("annotate_tables setting is empty - nothing to do!")

    t0 = tracing.print_elapsed_time()

    for info in table_infos:

        tablename = info['tablename']
        table_df = inject.get_table(tablename).to_frame()

        # - rename columns
        renames = info.get('column_map', None)
        if renames:
            logger.info("renaming %s columns %s" % (tablename, renames,))
            table_df.rename(columns=renames, inplace=True)

        # - annotate
        annotate_settings = info.get('annotate', None)
        if annotate_settings:
            logger.info("annotated %s SPEC %s" % (tablename, annotate_settings['SPEC'],))
            expressions.assign_columns(
                df=table_df,
                model_settings=annotate_settings,
                trace_label=trace_label)

        # fixme - narrow?

        # - write table to pipeline
        pipeline.replace_table(tablename, table_df)
def previous_write_data_dictionary(output_dir):
    """
    Write table_name, shape, index and column dtypes for each checkpointed table

    The output file name comes from the 'txt_format' entry of the
    'write_data_dictionary' model settings (default 'data_dict.txt').
    Nothing is written when that entry is falsy.

    Parameters
    ----------
    output_dir: str
        not used here; the path is resolved through config.output_file_path
    """
    settings = config.read_model_settings('write_data_dictionary')
    txt_format = settings.get('txt_format', 'data_dict.txt')
    csv_format = settings.get('csv_format', 'data_dict.csv')  # read, but not used below

    if not txt_format:
        return

    txt_path = config.output_file_path(txt_format)

    # widen the pandas display limits (global options) before printing dtypes
    pd.options.display.max_columns = 500
    pd.options.display.max_rows = 100

    table_names = pipeline.checkpointed_tables()

    # write data dictionary for all checkpointed_tables
    with open(txt_path, 'w') as output_file:
        for table_name in table_names:
            table_df = inject.get_table(table_name, None).to_frame()
            index = table_df.index

            print("\n### %s %s" % (table_name, table_df.shape), file=output_file)
            print('index:', index.name, index.dtype, file=output_file)
            print(table_df.dtypes, file=output_file)
def get_tazs(self):
    """
    Return the TAZ ids as a numpy array.

    For a one-zone system these are the index values of the 'land_use' table,
    otherwise the unique TAZ values of self.maz_taz_df.
    """
    # FIXME - should compute on init?
    one_zone = self.zone_system == ONE_ZONE
    tazs = (inject.get_table('land_use').index.values
            if one_zone
            else self.maz_taz_df.TAZ.unique())

    assert isinstance(tazs, np.ndarray)
    return tazs
def test_trips_with_demographics_table():
    """trips_with_demographics has trip, person and household columns and 250 rows."""
    trips_df = inject.get_table('trips_with_demographics').to_frame()
    trip_columns = trips_df.columns

    # checked one at a time, in this order, so a failure names the first missing column
    assert 'build_auto_time' in trip_columns
    assert 'base_auto_time' in trip_columns
    assert 'person_age' in trip_columns
    assert 'hh_income' in trip_columns

    row_count = trips_df.shape[0]
    assert row_count == 250
def annotate_jtp(model_settings, trace_label):
    """
    Annotate the persons table using the 'annotate_persons' entry of
    model_settings and write the result back to the pipeline.
    """
    # - annotate persons
    persons_df = inject.get_table('persons').to_frame()

    annotate_settings = model_settings.get('annotate_persons')
    annotate_label = tracing.extend_trace_label(trace_label, 'annotate_persons')
    expressions.assign_columns(
        df=persons_df,
        model_settings=annotate_settings,
        trace_label=annotate_label)

    pipeline.replace_table("persons", persons_df)
def annotate_tables(model_settings, trace_label):
    """
    Rename and/or annotate each table listed under 'annotate_tables' in
    model_settings, reporting memory use to the chunk logger along the way,
    and write every processed table back to the pipeline.

    Parameters
    ----------
    model_settings : dict
        read for 'annotate_tables': a list of dicts with 'tablename' and
        optional 'column_map' (deprecated) and 'annotate' entries
    trace_label : str
    """
    trace_label = tracing.extend_trace_label(trace_label, 'annotate_tables')

    chunk.log_rss(trace_label)

    table_infos = model_settings.get('annotate_tables', [])

    if not table_infos:
        logger.warning(f"{trace_label} - annotate_tables setting is empty - nothing to do!")

    assert isinstance(table_infos, list), \
        f"annotate_tables settings should be a list but is {type(table_infos)}"

    t0 = tracing.print_elapsed_time()

    for info in table_infos:

        tablename = info['tablename']

        chunk.log_rss(f"{trace_label}.pre-get_table.{tablename}")

        table_df = inject.get_table(tablename).to_frame()
        chunk.log_df(trace_label, tablename, table_df)

        # - rename columns (deprecated option, still honoured with a warning)
        renames = info.get('column_map', None)
        if renames:
            warnings.warn(
                "Setting 'column_map' has been changed to 'rename_columns'. "
                "Support for 'column_map' in annotate_tables will be removed in future versions.",
                FutureWarning)

            logger.info(f"{trace_label} - renaming {tablename} columns {renames}")
            table_df.rename(columns=renames, inplace=True)

        # - annotate
        annotate_settings = info.get('annotate', None)
        if annotate_settings:
            logger.info(
                f"{trace_label} - annotating {tablename} SPEC {annotate_settings['SPEC']}")
            expressions.assign_columns(
                df=table_df,
                model_settings=annotate_settings,
                trace_label=trace_label)

        chunk.log_df(trace_label, tablename, table_df)

        # - write table to pipeline
        pipeline.replace_table(tablename, table_df)

        # release the frame and tell the chunk logger it is gone
        del table_df
        chunk.log_df(trace_label, tablename, None)
def write_data_dictionary(output_dir):
    """
    Append the shape and column dtypes of each checkpointed table to
    data_dict.csv in output_dir.

    Parameters
    ----------
    output_dir: str
    """
    output_tables = pipeline.checkpointed_tables()

    # write data dictionary for all checkpointed_tables
    # (the file is opened in append mode, so repeated calls accumulate)
    with open(os.path.join(output_dir, 'data_dict.csv'), 'a') as output_file:
        for table_name in output_tables:
            df = inject.get_table(table_name, None).to_frame()

            # explicit write() calls replace the Python 2 "print >> file, a, b"
            # statement, which raises a TypeError on Python 3. The Python 2
            # statement emitted no separating space here because the header
            # ends with a newline, so header and dtypes are written back to back
            output_file.write("\n### %s (%s)\n" % (table_name, df.shape))
            output_file.write(str(df.dtypes) + "\n")
def __init__(self, size_term_selector):
    """
    Keep a reference to the land_use table and compute the destination size
    terms for ``size_term_selector`` once, so that size_terms can be requested
    later for various segments (tour_type or purpose).

    Asserts that no computed size term is missing.
    """
    self.land_use = inject.get_table('land_use')

    size_terms = inject.get_injectable('size_terms')
    self.destination_size_terms = tour_destination_size_terms(
        self.land_use, size_terms, size_term_selector)

    has_missing = self.destination_size_terms.isna().any(axis=None)
    assert not has_missing
def test_read_persons_table():
    """persons is read from persons.csv with exactly the mapped columns and 27 rows."""
    table_settings = config.read_model_settings('tables.yaml')
    assert table_settings.get('persons') == 'persons.csv'

    # expect all of and only the columns specified by persons_column_map values
    persons_df = inject.get_table('persons').to_frame()
    expected_columns = list(table_settings['persons_column_map'].values())
    assert expect_columns(persons_df, expected_columns)

    row_count = persons_df.shape[0]
    assert row_count == 27
def add_result_columns(base_dfname, from_df, prefix=''):
    """
    Assign the columns of from_df into the pipeline table base_dfname.

    Parameters
    ----------
    base_dfname : str
        name of the table to update
    from_df : pandas.DataFrame
        columns to assign; the caller's frame is not modified
    prefix : str
        when non-empty, prepended to every column name of a copy of from_df
    """
    target_df = inject.get_table(base_dfname).to_frame()

    source_df = from_df
    if prefix:
        # rename on a copy so the caller's frame keeps its column names
        source_df = from_df.copy()
        source_df.columns = [prefix + name for name in source_df.columns.values]

    assign_in_place(target_df, source_df)

    pipeline.replace_table(base_dfname, target_df)
def test_read_households_table():
    """households has every base and build mapped column and 9 rows."""
    table_settings = config.read_model_settings('tables.yaml')

    households_df = inject.get_table('households').to_frame()

    base_columns = list(table_settings['base_households_column_map'].values())
    assert not missing_columns(households_df, base_columns)

    build_columns = list(table_settings['build_households_column_map'].values())
    assert not missing_columns(households_df, build_columns)

    row_count = households_df.shape[0]
    assert row_count == 9
def input_pre_processor():
    """
    Load the generic and scenario input tables, then materialize every
    dataframe table known to the pipeline.
    """
    # - load generic data
    default_data_dir = inject.get_injectable('data_dir')
    data_dir = setting('data_dir', default_data_dir)
    load_tables('input_tables', data_dir)

    # - load scenario input data
    scenario_input_dir = os.path.join(scenario_dir(), 'inputs')
    load_tables('scenario_input_tables', scenario_input_dir)

    # to_frame() is called for each table; the resulting frame is not used here
    for name in pipeline.orca_dataframe_tables():
        table = inject.get_table(name, None)
        table.to_frame()
def _create_od_alts_from_dest_size_terms(size_terms_df,
                                         segment_name,
                                         od_id_col=None,
                                         origin_id_col='origin',
                                         dest_id_col='destination',
                                         origin_filter=None,
                                         origin_attr_cols=None):
    """
    Extend destination size terms to create dataframe representing the
    cartesian product of tour origins and destinations. Actual "Size Terms"
    will still only be associated with the destinations, but individual
    attributes of the origins can be preserved.

    Origins are the rows of the 'land_use' table, optionally restricted by
    the query string origin_filter. The result is indexed by
    "<origin>_<destination>" under the name od_id_col, or under the name
    returned by get_od_id_col when od_id_col is None. segment_name is not
    used in the body.
    """
    land_use = inject.get_table('land_use').to_frame(columns=origin_attr_cols)

    origins = land_use.query(origin_filter) if origin_filter else land_use

    # repeat every destination row once per origin, then tile the origin ids
    # so that each destination is paired with each origin
    n_origins = len(origins)
    od_alts = size_terms_df.reindex(size_terms_df.index.repeat(n_origins))
    n_destinations = od_alts.index.nunique()
    od_alts[origin_id_col] = list(origins.index.values) * n_destinations

    od_alts.reset_index(inplace=True)
    if dest_id_col not in od_alts.columns:
        od_alts.rename(columns={land_use.index.name: dest_id_col}, inplace=True)

    od_index_name = (get_od_id_col(origin_id_col, dest_id_col)
                     if od_id_col is None
                     else od_id_col)

    origin_str = od_alts[origin_id_col].astype(str)
    dest_str = od_alts[dest_id_col].astype(str)
    od_alts[od_index_name] = origin_str + '_' + dest_str
    od_alts.set_index(od_index_name, inplace=True)

    # manually add origin attributes to output since these can't be generated by
    # the destination-based size term calculator
    if origin_attr_cols:
        # note: this renames and resets the index of the land_use frame in place
        land_use.index.name = origin_id_col
        land_use.reset_index(inplace=True)
        od_alts.reset_index(inplace=True)
        origin_attrs = land_use[origin_attr_cols + [origin_id_col]]
        od_alts = pd.merge(
            od_alts, origin_attrs,
            on=origin_id_col, how='left').set_index(od_index_name)

    return od_alts
def mandatory_tour_frequency(persons_merged,
                             mandatory_tour_frequency_spec,
                             mandatory_tour_frequency_settings,
                             chunk_size,
                             trace_hh_id):
    """
    This model predicts the frequency of making mandatory trips (see the
    alternatives above) - these trips include work and school in some combination.

    The choice is added to the persons table as column
    'mandatory_tour_frequency', after which the mandatory tours are created.
    """
    trace_label = 'mandatory_tour_frequency'

    persons_df = persons_merged.to_frame()

    # filter based on results of CDAP
    is_mandatory = persons_df.cdap_activity == 'M'
    choosers = persons_df[is_mandatory]
    logger.info("Running mandatory_tour_frequency with %d persons" % len(choosers))

    nest_spec = config.get_logit_model_settings(mandatory_tour_frequency_settings)
    constants = config.get_model_constants(mandatory_tour_frequency_settings)

    choices = simulate.simple_simulate(
        choosers,
        spec=mandatory_tour_frequency_spec,
        nest_spec=nest_spec,
        locals_d=constants,
        chunk_size=chunk_size,
        trace_label=trace_label,
        trace_choice_name='mandatory_tour_frequency')

    # convert indexes to alternative names
    alternative_names = mandatory_tour_frequency_spec.columns[choices.values]
    choices = pd.Series(alternative_names, index=choices.index)
    choices = choices.reindex(persons_merged.local.index)

    tracing.print_summary('mandatory_tour_frequency', choices, value_counts=True)

    inject.add_column("persons", "mandatory_tour_frequency", choices)

    create_mandatory_tours(trace_hh_id)

    # add mandatory_tour-dependent columns (e.g. tour counts) to persons
    pipeline.add_dependent_columns("persons", "persons_mtf")

    if trace_hh_id:
        # column restriction is currently disabled, so all persons columns are traced
        trace_columns = ['mandatory_tour_frequency']
        tracing.trace_df(inject.get_table('persons').to_frame(),
                         label="mandatory_tour_frequency.persons",
                         # columns=trace_columns,
                         warn_if_empty=True)
def cdap_simulate(persons_merged,
                  cdap_settings,
                  cdap_indiv_spec,
                  cdap_interaction_coefficients,
                  cdap_fixed_relative_proportions,
                  chunk_size, trace_hh_id):
    """
    CDAP stands for Coordinated Daily Activity Pattern, which is a choice of
    high-level activity pattern for each person, in a coordinated way with other
    members of a person's household.

    Because Python requires vectorization of computation, there are some specialized
    routines in the cdap directory of activitysim for this purpose.  This module
    simply applies those utilities using the simulation framework.

    The chosen cdap_activity and cdap_rank are added as columns to the persons
    table, followed by the dependent persons_cdap and households_cdap columns.
    """
    persons_df = persons_merged.to_frame()

    constants = config.get_model_constants(cdap_settings)

    logger.info("Running cdap_simulate with %d persons" % len(persons_df.index))

    choices = run_cdap(
        persons=persons_df,
        cdap_indiv_spec=cdap_indiv_spec,
        cdap_interaction_coefficients=cdap_interaction_coefficients,
        cdap_fixed_relative_proportions=cdap_fixed_relative_proportions,
        locals_d=constants,
        chunk_size=chunk_size,
        trace_hh_id=trace_hh_id,
        trace_label='cdap')

    tracing.print_summary('cdap_activity', choices.cdap_activity, value_counts=True)

    # single-argument parenthesised print is valid, and prints the same text,
    # on both Python 2 and Python 3 (the bare print statement is a syntax
    # error on Python 3)
    print(pd.crosstab(persons_df.ptype, choices.cdap_activity, margins=True))

    # align choices with the full persons_merged index before adding columns
    choices = choices.reindex(persons_merged.index)

    inject.add_column("persons", "cdap_activity", choices.cdap_activity)
    inject.add_column("persons", "cdap_rank", choices.cdap_rank)

    pipeline.add_dependent_columns("persons", "persons_cdap")
    pipeline.add_dependent_columns("households", "households_cdap")

    if trace_hh_id:
        tracing.trace_df(inject.get_table('persons_merged').to_frame(),
                         label="cdap",
                         columns=['ptype', 'cdap_rank', 'cdap_activity'],
                         warn_if_empty=True)
def preload_injectables():
    """
    preload bulky injectables up front - stuff that isn't inserted into the pipeline

    Registers the track_skim_usage, write_data_dictionary and write_tables
    steps, installs DEFAULT_TABLE_LIST as 'input_table_list' when the settings
    do not define one, and optionally dumps the raw input tables as csv.

    Returns
    -------
    bool
        always True
    """
    logger.info("preload_injectables")

    inject.add_step('track_skim_usage', track_skim_usage)
    inject.add_step('write_data_dictionary', write_data_dictionary)
    inject.add_step('write_tables', write_tables)

    table_list = config.setting('input_table_list')

    # default ActivitySim table names and indices
    if table_list is None:
        logger.warning(
            "No 'input_table_list' found in settings. This will be a "
            "required setting in upcoming versions of ActivitySim.")

        new_settings = inject.get_injectable('settings')
        new_settings['input_table_list'] = DEFAULT_TABLE_LIST
        inject.add_injectable('settings', new_settings)

        # use the default locally as well; otherwise the raw-table dump below
        # would fail trying to iterate over None
        table_list = DEFAULT_TABLE_LIST

    # FIXME undocumented feature
    if config.setting('write_raw_tables'):

        # write raw input tables as csv (before annotation)
        csv_dir = config.output_file_path('raw_tables')
        if not os.path.exists(csv_dir):
            os.makedirs(csv_dir)  # make directory if needed

        table_names = [t['tablename'] for t in table_list]
        for t in table_names:
            df = inject.get_table(t).to_frame()
            if t == 'households':
                # chunk_id is left out of the raw households dump
                df.drop(columns='chunk_id', inplace=True)
            df.to_csv(os.path.join(csv_dir, '%s.csv' % t), index=True)

    # FIXME - still want to do this?
    # (skim_dict / skim_stack preloading used to happen here)

    return True
def test_read_build_trips_table():
    """build_trips is read from buildtrips_normal.csv with the mapped columns and 127 rows."""
    table_settings = config.read_model_settings('tables.yaml')
    assert table_settings.get('buildtrips') == 'buildtrips_normal.csv'

    trips_df = inject.get_table('build_trips').to_frame()

    # expect all of and only the columns named by the two buildtrips column
    # maps, plus three extra columns
    mapped = mapped_columns(
        table_settings['buildtrips_column_map'],
        table_settings['buildtrips_baselos_column_map'])
    raw_columns = mapped + ['build', 'base', 'person_id']
    assert expect_columns(trips_df, raw_columns)

    row_count = trips_df.shape[0]
    assert row_count == 127
def write_estimation_specs(estimator, model_settings, settings_file):
    """
    write model settings, spec, and coefficients to estimation data bundle,
    along with the size_terms and land_use tables

    Parameters
    ----------
    estimator
        object providing write_model_settings, write_spec,
        write_coefficients and write_table
    model_settings
    settings_file
    """
    estimator.write_model_settings(model_settings, settings_file)

    # the SAMPLE_SPEC is not written, only the SPEC
    estimator.write_spec(model_settings, tag='SPEC')

    coefficients = simulate.read_model_coefficients(model_settings)
    estimator.write_coefficients(coefficients)

    size_terms = inject.get_injectable('size_terms')
    estimator.write_table(size_terms, 'size_terms', append=False)

    land_use_df = inject.get_table('land_use').to_frame()
    estimator.write_table(land_use_df, 'landuse', append=False)
def initialize():
    """
    Because random seed is set differently for each step, the sampling of households depends
    on which step they are initially loaded in so we force them to load here and they get
    stored to the pipeline,
    """
    tick = tracing.print_elapsed_time()

    # load order matters: land_use, households, persons, person_windows
    for table_name in ('land_use', 'households', 'persons', 'person_windows'):
        inject.get_table(table_name).to_frame()
        tick = tracing.print_elapsed_time("preload %s" % table_name, tick, debug=True)
def create_non_mandatory_tours():
    """
    We have now generated non-mandatory tours, but they are attributes of the
    person table.  Now we create a "tours" table which has one row per tour
    that has been generated (and the person id it is associated with)
    """
    persons_table = inject.get_table('persons')
    alternatives = inject.get_injectable('non_mandatory_tour_frequency_alts')

    # persons without a non-mandatory tour frequency generate no tours
    frequencies = persons_table.non_mandatory_tour_frequency.dropna()
    tours = process_non_mandatory_tours(frequencies, alternatives)

    # append to the pipeline table, then register for tracing and random numbers
    pipeline.extend_table("tours", tours)
    tracing.register_traceable_table('tours', tours)
    pipeline.get_rn_generator().add_channel(tours, 'tours')
def annotate_tables(model_settings, trace_label):
    """
    Rename and/or annotate each table listed under 'annotate_tables' in
    model_settings, writing every processed table back to the pipeline.

    Parameters
    ----------
    model_settings : dict
        read for 'annotate_tables': a list of dicts with 'tablename' and
        optional 'column_map' (deprecated) and 'annotate' entries
    trace_label : str
    """
    annotate_tables = model_settings.get('annotate_tables', [])

    if not annotate_tables:
        logger.warning("annotate_tables setting is empty - nothing to do!")

    for table_info in annotate_tables:

        tablename = table_info['tablename']

        df = inject.get_table(tablename).to_frame()

        # - rename columns (deprecated option, still honoured with a warning)
        column_map = table_info.get('column_map', None)
        if column_map:

            # the adjacent literals are concatenated, so each fragment needs its
            # own trailing space (the message used to read "movedto settings.yaml")
            warnings.warn(
                "annotate_tables option 'column_map' renamed 'rename_columns' and moved "
                "to settings.yaml. Support for 'column_map' in annotate_tables will be "
                "removed in future versions.",
                FutureWarning)

            logger.info("renaming %s columns %s" % (tablename, column_map,))
            df.rename(columns=column_map, inplace=True)

        # - annotate
        annotate = table_info.get('annotate', None)
        if annotate:
            logger.info("annotated %s SPEC %s" % (tablename, annotate['SPEC'],))
            expressions.assign_columns(
                df=df,
                model_settings=annotate,
                trace_label=trace_label)

        # fixme - narrow?

        # - write table to pipeline
        pipeline.replace_table(tablename, df)
def add_null_results(trace_label, mandatory_tour_frequency_settings):
    """
    Write the results of a skipped model run: an empty tours table and a
    persons table whose mandatory_tour_frequency is the empty string for
    everyone, annotated with the 'annotate_persons' settings.
    """
    logger.info("Skipping %s: add_null_results", trace_label)

    persons_df = inject.get_table('persons').to_frame()
    persons_df['mandatory_tour_frequency'] = ''

    # empty tours table carrying only the expected column names and index name
    empty_tours = pd.DataFrame()
    for column in ('tour_category', 'tour_type', 'person_id'):
        empty_tours[column] = None
    empty_tours.index.name = 'tour_id'
    pipeline.replace_table("tours", empty_tours)

    annotate_settings = mandatory_tour_frequency_settings.get('annotate_persons')
    annotate_label = tracing.extend_trace_label(trace_label, 'annotate_persons')
    expressions.assign_columns(
        df=persons_df,
        model_settings=annotate_settings,
        trace_label=annotate_label)

    pipeline.replace_table("persons", persons_df)
def get_shadow_pricing_info():
    """
    return dict with info about dtype and shapes of desired and modeled size tables

    block shape is (num_zones, num_segments + 1)

    Returns
    -------
    shadow_pricing_info: dict
        dtype: <sp_dtype>,
        block_shapes: dict {<model_selector>: <block_shape>}
    """
    land_use = inject.get_table('land_use')
    size_terms = inject.get_injectable('size_terms')

    shadow_settings = config.read_model_settings('shadow_pricing.yaml')

    # shadow_pricing_models is dict of {<model_selector>: <model_name>}
    selectors = shadow_settings['shadow_pricing_models']

    block_shapes = OrderedDict()
    for model_selector in selectors:

        n_zones = len(land_use)
        selector_terms = size_terms[size_terms.model_selector == model_selector]
        n_segments = len(selector_terms)

        # extra tally column for TALLY_CHECKIN and TALLY_CHECKOUT semaphores
        block_shapes[block_name(model_selector)] = (n_zones, n_segments + 1)

    shadow_pricing_info = {
        'dtype': np.int64,
        'block_shapes': block_shapes,
    }

    for key, value in shadow_pricing_info.items():
        logger.debug("shadow_pricing_info %s: %s" % (key, value))

    return shadow_pricing_info
def test_mini_pipeline_run3():
    """The hh_ids setting overrides household sampling."""
    here = os.path.dirname(__file__)
    configs_dir = os.path.join(here, 'configs')
    setup_dirs(configs_dir)
    inject_settings(configs_dir, hh_ids='override_hh_ids.csv')

    households_df = inject.get_table('households').to_frame()

    override_path = config.data_file_path('override_hh_ids.csv')
    override_df = pd.read_csv(override_path)

    print("\noverride_hh_ids\n", override_df)

    print("\nhouseholds\n", households_df.index)

    # same number of households as override ids, and every household is one of them
    assert households_df.shape[0] == override_df.shape[0]
    in_override = households_df.index.isin(override_df.household_id)
    assert in_override.all()

    inject.clear_cache()
    close_handlers()
def write_data_dictionary(output_dir):
    """
    Write table_name, shape, index and column dtypes for each checkpointed table
    to data_dict.txt

    Parameters
    ----------
    output_dir: str
        not used here; the path is resolved through config.output_file_path
    """
    # widen the pandas display limits (global options) before printing dtypes
    pd.options.display.max_columns = 500
    pd.options.display.max_rows = 100

    table_names = pipeline.checkpointed_tables()

    # write data dictionary for all checkpointed_tables

    # binary mode on Python 2, text mode on Python 3
    is_python2 = sys.version_info < (3,)
    mode = 'wb' if is_python2 else 'w'

    txt_path = config.output_file_path('data_dict.txt')
    with open(txt_path, mode) as output_file:
        for table_name in table_names:
            table_df = inject.get_table(table_name, None).to_frame()
            index = table_df.index

            print("\n### %s %s" % (table_name, table_df.shape), file=output_file)
            print('index:', index.name, index.dtype, file=output_file)
            print(table_df.dtypes, file=output_file)
def run_trip_destination(
        trips,
        tours_merged,
        chunk_size, trace_hh_id,
        trace_label):
    """
    trip destination - main functionality separated from model step so it can be called iteratively

    Run the trip_destination model, assigning destinations for each (intermediate) trip
    (last trips already have a destination - either the tour primary destination or Home)

    Set trip destination and origin columns, and a boolean failed flag for any failed trips
    (destination for flagged failed trips will be set to -1)

    Parameters
    ----------
    trips
    tours_merged
    chunk_size
    trace_hh_id
    trace_label

    Returns
    -------
    trips
        sorted by index, with destination, origin and failed columns set
    """
    model_settings = config.read_model_settings('trip_destination.yaml')
    preprocessor_settings = model_settings.get('preprocessor', None)
    logsum_settings = config.read_model_settings(model_settings['LOGSUM_SETTINGS'])

    land_use = inject.get_table('land_use')
    size_terms = inject.get_injectable('size_terms')

    # - initialize trip origin and destination to those of half-tour
    # (we will sequentially adjust intermediate trips origin and destination as we choose them)
    tour_destination = reindex(tours_merged.destination, trips.tour_id).astype(int)
    tour_origin = reindex(tours_merged.origin, trips.tour_id).astype(int)
    trips['destination'] = np.where(trips.outbound, tour_destination, tour_origin)
    trips['origin'] = np.where(trips.outbound, tour_origin, tour_destination)
    trips['failed'] = False

    # next_trip_id is the id of the following row, or 0 for the last trip of a leg
    trips = trips.sort_index()
    trips['next_trip_id'] = np.roll(trips.index, -1)
    trips.next_trip_id = trips.next_trip_id.where(trips.trip_num < trips.trip_count, 0)

    # - filter tours_merged (AFTER copying destination and origin columns to trips)
    # tours_merged is used for logsums, we filter it here upfront to save space and time
    tours_merged_cols = logsum_settings['TOURS_MERGED_CHOOSER_COLUMNS']
    if 'REDUNDANT_TOURS_MERGED_CHOOSER_COLUMNS' in model_settings:
        redundant_cols = model_settings['REDUNDANT_TOURS_MERGED_CHOOSER_COLUMNS']
        tours_merged_cols = [col for col in tours_merged_cols if col not in redundant_cols]

    tours_merged = tours_merged[tours_merged_cols]

    # - skims
    skims = wrap_skims(model_settings)

    # - size_terms and alternatives
    alternatives = tour_destination_size_terms(land_use, size_terms, 'trip')

    # DataFrameMatrix allows us to treat the dataframe as a virtual 2-D array, indexed by
    # TAZ and purpose, e.g. size_terms.get(df.dest_taz, df.purpose) returns a series of
    # size_terms for each chooser's dest_taz and purpose with chooser index
    size_term_matrix = DataFrameMatrix(alternatives)

    # don't need size terms in alternatives, just TAZ index
    alternatives = alternatives.drop(alternatives.columns, axis=1)
    alternatives.index.name = model_settings['ALT_DEST']

    # - process intermediate trips in ascending trip_num order
    is_intermediate = trips.trip_num < trips.trip_count
    if is_intermediate.any():

        intermediate_trip_nums = trips[is_intermediate].trip_num
        first_trip_num = intermediate_trip_nums.min()
        last_trip_num = intermediate_trip_nums.max()

        # iterate over trips in ascending trip_num order
        for trip_num in range(first_trip_num, last_trip_num + 1):

            nth_trips = trips[is_intermediate & (trips.trip_num == trip_num)]
            nth_trace_label = tracing.extend_trace_label(trace_label, 'trip_num_%s' % trip_num)

            # - annotate nth_trips
            if preprocessor_settings:
                expressions.assign_columns(
                    df=nth_trips,
                    model_settings=preprocessor_settings,
                    locals_dict=config.get_model_constants(model_settings),
                    trace_label=nth_trace_label)

            logger.info("Running %s with %d trips", nth_trace_label, nth_trips.shape[0])

            # - choose destination for nth_trips, segmented by primary_purpose
            segment_choices = [
                choose_trip_destination(
                    primary_purpose,
                    trips_segment,
                    alternatives,
                    tours_merged,
                    model_settings,
                    size_term_matrix, skims,
                    chunk_size, trace_hh_id,
                    trace_label=tracing.extend_trace_label(nth_trace_label, primary_purpose))
                for primary_purpose, trips_segment in nth_trips.groupby('primary_purpose')
            ]

            destinations = pd.concat(segment_choices)

            # trips that received no choice are flagged as failed and their
            # origin is handed on to the next trip
            # NOTE(review): Index.any() tests element truthiness, not emptiness, so a
            # lone failed trip id of 0 would be missed - confirm ids are never 0
            failed_trip_ids = nth_trips.index.difference(destinations.index)
            if failed_trip_ids.any():
                logger.warning("%s sidelining %s trips without viable destination alternatives" %
                               (nth_trace_label, failed_trip_ids.shape[0]))
                next_trip_ids = nth_trips.next_trip_id.reindex(failed_trip_ids)
                trips.loc[failed_trip_ids, 'failed'] = True
                trips.loc[failed_trip_ids, 'destination'] = -1
                trips.loc[next_trip_ids, 'origin'] = trips.loc[failed_trip_ids].origin.values

            # - assign choices to these trips destinations and to next trips origin
            assign_in_place(trips, destinations.to_frame('destination'))
            destinations.index = nth_trips.next_trip_id.reindex(destinations.index)
            assign_in_place(trips, destinations.to_frame('origin'))

    # helper column is not part of the result
    del trips['next_trip_id']

    return trips
def joint_tour_scheduling(
        tours,
        persons_merged,
        tdd_alts,
        chunk_size,
        trace_hh_id):
    """
    Choose a departure time and duration for every joint tour.

    Settings are read from 'joint_tour_scheduling.yaml' and the spec from
    'tour_scheduling_joint.csv'. The choices returned by
    vectorize_joint_tour_scheduling are merged into the tours dataframe,
    which then replaces the pipeline "tours" table; the returned timetable's
    replace_table() is called as well.

    Returns None. If there are no joint tours, tracing.no_results is called
    and the function returns without touching any table.
    """
    trace_label = 'joint_tour_scheduling'

    model_settings = config.read_model_settings('joint_tour_scheduling.yaml')
    model_spec = simulate.read_model_spec(file_name='tour_scheduling_joint.csv')

    tours = tours.to_frame()
    is_joint = tours.tour_category == 'joint'
    joint_tours = tours[is_joint]

    # - bail out early when there is nothing to schedule
    num_joint_tours = len(joint_tours)
    if num_joint_tours == 0:
        tracing.no_results(trace_label)
        return

    # looked up through inject.get_table rather than taken as an argument,
    # because this table won't exist if there are no joint_tours
    joint_tour_participants = inject.get_table('joint_tour_participants').to_frame()

    persons_merged = persons_merged.to_frame()

    logger.info("Running %s with %d joint tours", trace_label, num_joint_tours)

    # Persons (not households) are the choosers here: each joint tour has a
    # (somewhat arbitrarily assigned) "primary person" whose characteristics
    # appear in the spec, and persons_merged carries the household attributes
    # alongside the person attributes.
    in_joint_tour_household = persons_merged.num_hh_joint_tours > 0
    persons_merged = persons_merged[in_joint_tour_household]

    # A household's joint tours can each have different participants, and so
    # different joint tour masks (free time of all participants). Processing
    # therefore has to be either chunked by joint_tour_num with timetables
    # built per household, or timetables have to be built per unique joint_tour.

    constants = config.get_model_constants(model_settings)

    # - run preprocessor to annotate choosers
    preprocessor_settings = model_settings.get('preprocessor', None)
    if preprocessor_settings:
        locals_d = dict(constants) if constants is not None else {}

        expressions.assign_columns(
            df=joint_tours,
            model_settings=preprocessor_settings,
            locals_dict=locals_d,
            trace_label=trace_label)

    tdd_choices, timetable = vectorize_joint_tour_scheduling(
        joint_tours,
        joint_tour_participants,
        persons_merged,
        tdd_alts,
        spec=model_spec,
        model_settings=model_settings,
        chunk_size=chunk_size,
        trace_label=trace_label)

    timetable.replace_table()

    assign_in_place(tours, tdd_choices)
    pipeline.replace_table("tours", tours)

    if trace_hh_id:
        # re-slice so the trace shows the freshly assigned choices
        scheduled_joint_tours = tours[tours.tour_category == 'joint']
        tracing.trace_df(scheduled_joint_tours,
                         label="joint_tour_scheduling",
                         slicer='household_id')
def cdap_simulate(persons_merged, persons, households,
                  cdap_indiv_spec,
                  cdap_interaction_coefficients,
                  cdap_fixed_relative_proportions,
                  chunk_size, trace_hh_id):
    """
    Run the Coordinated Daily Activity Pattern (CDAP) model.

    CDAP picks a high-level activity pattern for each person, coordinated
    with the other members of that person's household. The vectorized
    machinery lives in the cdap module; this step wires it into the
    simulation framework:

    - pre-builds (and caches) the cdap spec for each household size from 2 to
      cdap.MAX_HHSIZE, writing each to the output directory when the
      'locutor' injectable is truthy
    - calls cdap.run_cdap on persons_merged
    - stores cdap_activity and cdap_rank on the persons table, annotates it,
      and replaces the pipeline "persons" table
    - annotates the households table and replaces the pipeline "households"
      table
    """
    trace_label = 'cdap'
    model_settings = config.read_model_settings('cdap.yaml')

    persons_merged = persons_merged.to_frame()

    constants = config.get_model_constants(model_settings)

    cdap_interaction_coefficients = cdap.preprocess_interaction_coefficients(
        cdap_interaction_coefficients)

    # Specs are normally built on demand and cached as injectables. Building
    # them up front lets us dump them to the output directory (and, when
    # multiprocessing, the locutor might not otherwise see every household size).
    logger.info("Pre-building cdap specs")
    for hhsize in range(2, cdap.MAX_HHSIZE + 1):
        spec = cdap.build_cdap_spec(cdap_interaction_coefficients, hhsize, cache=True)
        if inject.get_injectable('locutor', False):
            spec_file_path = config.output_file_path('cdap_spec_%s.csv' % hhsize)
            spec.to_csv(spec_file_path, index=True)

    logger.info("Running cdap_simulate with %d persons", len(persons_merged.index))

    choices = cdap.run_cdap(
        persons=persons_merged,
        cdap_indiv_spec=cdap_indiv_spec,
        cdap_interaction_coefficients=cdap_interaction_coefficients,
        cdap_fixed_relative_proportions=cdap_fixed_relative_proportions,
        locals_d=constants,
        chunk_size=chunk_size,
        trace_hh_id=trace_hh_id,
        trace_label=trace_label)

    # - copy the results onto the persons table, then annotate it
    persons = persons.to_frame()

    choices = choices.reindex(persons.index)
    for result_column in ('cdap_activity', 'cdap_rank'):
        persons[result_column] = choices[result_column]

    persons_trace_label = tracing.extend_trace_label(trace_label, 'annotate_persons')
    expressions.assign_columns(
        df=persons,
        model_settings=model_settings.get('annotate_persons'),
        trace_label=persons_trace_label)

    pipeline.replace_table("persons", persons)

    # - annotate the households table
    households = households.to_frame()
    households_trace_label = tracing.extend_trace_label(trace_label, 'annotate_households')
    expressions.assign_columns(
        df=households,
        model_settings=model_settings.get('annotate_households'),
        trace_label=households_trace_label)
    pipeline.replace_table("households", households)

    tracing.print_summary('cdap_activity', persons.cdap_activity, value_counts=True)

    activity_by_ptype = pd.crosstab(persons.ptype, persons.cdap_activity, margins=True)
    logger.info("cdap crosstabs:\n%s" % activity_by_ptype)

    if trace_hh_id:
        # re-read persons_merged so the trace reflects the replaced persons table
        traced_persons = inject.get_table('persons_merged').to_frame()
        tracing.trace_df(traced_persons,
                         label="cdap",
                         columns=['ptype', 'cdap_rank', 'cdap_activity'],
                         warn_if_empty=True)
def add_size_tables():
    """
    Inject a tour destination size table for each shadow-pricing model_selector
    (e.g. school, workplace).

    Size tables are pandas dataframes of location counts for a model_selector,
    by zone (rows) and segment (columns), as returned by
    tour_destination_size_terms.

    When the 'use_shadow_pricing' setting or the shadow_pricing.yaml
    'SCALE_SIZE_TABLE' setting is truthy, the counts are scaled to the sample
    population (which is why the tables have to be created while
    single-process). In that case the unscaled table is also injected, under
    the size table name prefixed with 'raw_'.

    Scaling is problematic as it breaks household result replicability across
    sample sizes. It also changes the magnitude of the size terms, so if they
    are used as utilities in expression files, their importance will diminish
    relative to other utilities as the sample size decreases.

    Scaling makes most sense for a full sample in conjunction with shadow
    pricing, where shadow prices can be adjusted iteratively to bring modelled
    counts into line with desired (size table) counts.

    Returns None. If shadow_pricing.yaml has no 'shadow_pricing_models' entry
    (key missing or null), a warning is logged and no tables are injected.
    """

    use_shadow_pricing = bool(config.setting('use_shadow_pricing'))

    shadow_settings = config.read_model_settings('shadow_pricing.yaml')

    # .get() rather than [] so that a missing key reaches the warning below
    # instead of raising KeyError before the check can run
    shadow_pricing_models = shadow_settings.get('shadow_pricing_models')

    if shadow_pricing_models is None:
        logger.warning('shadow_pricing_models list not found in shadow_pricing settings')
        return

    # probably ought not scale if not shadow_pricing (breaks partial sample replicability)
    # but this allows compatibility with existing CTRAMP behavior...
    scale_size_table = shadow_settings.get('SCALE_SIZE_TABLE', False)

    # shadow_pricing_models is a dict of {model_selector: model_name}
    # since these are scaled to model size, they have to be created while single-process

    for model_selector, model_name in iteritems(shadow_pricing_models):

        model_settings = config.read_model_settings(model_name)

        assert model_selector == model_settings['MODEL_SELECTOR']

        segment_ids = model_settings['SEGMENT_IDS']
        chooser_table_name = model_settings['CHOOSER_TABLE_NAME']
        chooser_segment_column = model_settings['CHOOSER_SEGMENT_COLUMN_NAME']

        choosers_df = inject.get_table(chooser_table_name).to_frame()
        if 'CHOOSER_FILTER_COLUMN_NAME' in model_settings:
            # keep only choosers whose filter column is non-zero
            chooser_filter_column = model_settings['CHOOSER_FILTER_COLUMN_NAME']
            choosers_df = choosers_df[choosers_df[chooser_filter_column] != 0]

        # - raw_desired_size
        land_use = inject.get_table('land_use')
        size_terms = inject.get_injectable('size_terms')
        raw_size = tour_destination_size_terms(land_use, size_terms, model_selector)
        assert set(raw_size.columns) == set(segment_ids.keys())

        if use_shadow_pricing or scale_size_table:

            inject.add_table('raw_' + size_table_name(model_selector), raw_size)

            # - scale size_table counts to sample population
            # scaled_size = zone_size * (total_segment_modeled / total_segment_desired)

            # segment scale factor (modeled / desired) keyed by segment_name
            segment_scale_factors = {}
            for c in raw_size:
                # number of zone demographics desired destination choices
                segment_desired_size = raw_size[c].astype(np.float64).sum()

                # number of synthetic population choosers in segment
                segment_chooser_count = \
                    (choosers_df[chooser_segment_column] == segment_ids[c]).sum()

                # np.maximum(..., 1) guards against dividing by a zero desired size
                segment_scale_factors[c] = \
                    segment_chooser_count / np.maximum(segment_desired_size, 1)

                logger.info("add_desired_size_tables %s segment %s "
                            "desired %s modeled %s scale_factor %s",
                            chooser_table_name, c,
                            segment_desired_size,
                            segment_chooser_count,
                            segment_scale_factors[c])

            # FIXME - should we be rounding?
            scaled_size = (raw_size * segment_scale_factors).round()
        else:
            scaled_size = raw_size

        inject.add_table(size_table_name(model_selector), scaled_size)
def __init__(self, model_settings, num_processes, shared_data=None, shared_data_lock=None):
    """
    Set up shadow-pricing state for one model_selector.

    Presence of shared_data is used as a flag for multiprocessing.
    If we are multiprocessing, shared_data should be a multiprocessing.RawArray
    buffer to aggregate modeled_size across all sub-processes, and
    shared_data_lock should be a multiprocessing.Lock object to coordinate
    access to that buffer.

    Optionally load saved shadow_prices from data_dir if config setting
    use_shadow_pricing and shadow_setting LOAD_SAVED_SHADOW_PRICES are both True.

    Parameters
    ----------
    model_settings : dict
        must provide 'MODEL_SELECTOR' and 'SEGMENT_IDS'
    num_processes : int
        stored as self.num_processes; not otherwise used here
    shared_data : numpy array wrapping multiprocessing.RawArray or None (if single process)
        must have one row per desired_size row and one more column than
        desired_size (the extra column is a tally column)
    shared_data_lock : multiprocessing.Lock or None (if single process)
        required whenever shared_data is given
    """

    self.num_processes = num_processes
    self.use_shadow_pricing = bool(config.setting('use_shadow_pricing'))
    self.saved_shadow_price_file_path = None  # set by read_saved_shadow_prices if loaded

    self.model_selector = model_settings['MODEL_SELECTOR']

    full_model_run = config.setting('households_sample_size') == 0
    if self.use_shadow_pricing and not full_model_run:
        # use the module logger (as everywhere else in this method) rather than
        # the root-level logging.warning, which can implicitly configure logging
        logger.warning("deprecated combination of use_shadow_pricing and not full_model_run")

    self.segment_ids = model_settings['SEGMENT_IDS']

    # - modeled_size (set by call to set_choices/synchronize_choices)
    self.modeled_size = None

    if self.use_shadow_pricing:
        self.shadow_settings = config.read_model_settings('shadow_pricing.yaml')
        for k in self.shadow_settings:
            logger.debug("shadow_settings %s: %s", k, self.shadow_settings.get(k))

    # - destination_size_table (desired_size)
    self.desired_size = inject.get_table(size_table_name(self.model_selector)).to_frame()

    # - shared_data
    if shared_data is not None:
        assert shared_data.shape[0] == self.desired_size.shape[0]
        assert shared_data.shape[1] == self.desired_size.shape[1] + 1  # tally column
        assert shared_data_lock is not None
    # stored unconditionally: None marks the single-process case
    self.shared_data = shared_data
    self.shared_data_lock = shared_data_lock

    # - load saved shadow_prices (if available) and set max_iterations accordingly
    if self.use_shadow_pricing:
        self.shadow_prices = None
        self.shadow_price_method = self.shadow_settings['SHADOW_PRICE_METHOD']
        assert self.shadow_price_method in ['daysim', 'ctramp']

        if self.shadow_settings['LOAD_SAVED_SHADOW_PRICES']:
            # read_saved_shadow_prices logs error and returns None if file not found
            self.shadow_prices = self.read_saved_shadow_prices(model_settings)

        if self.shadow_prices is None:
            # nothing loaded: use the MAX_ITERATIONS budget and start from the
            # method-dependent initial shadow price
            self.max_iterations = self.shadow_settings.get('MAX_ITERATIONS', 5)

            initial_shadow_price = 1.0 if self.shadow_price_method == 'ctramp' else 0.0
            self.shadow_prices = \
                pd.DataFrame(data=initial_shadow_price,
                             columns=self.desired_size.columns,
                             index=self.desired_size.index)
        else:
            # saved prices were loaded: use the MAX_ITERATIONS_SAVED budget
            self.max_iterations = self.shadow_settings.get('MAX_ITERATIONS_SAVED', 1)
    else:
        self.max_iterations = 1

    # empty per-segment frames, indexed by the desired_size column names
    self.num_fail = pd.DataFrame(index=self.desired_size.columns)
    self.max_abs_diff = pd.DataFrame(index=self.desired_size.columns)
    self.max_rel_diff = pd.DataFrame(index=self.desired_size.columns)
def mandatory_tour_frequency(persons_merged,
                             chunk_size,
                             trace_hh_id):
    """
    Predict how many mandatory (work and/or school) tours each person makes.

    Only persons whose cdap_activity is 'M' are choosers. The chosen
    alternative is turned into tour rows that are appended to the pipeline
    "tours" table, and the choice is recorded (then annotated) on the persons
    table, which replaces the pipeline "persons" table.

    Returns None. If there are no choosers, add_null_results is called instead.
    """
    trace_label = 'mandatory_tour_frequency'

    model_settings = config.read_model_settings('mandatory_tour_frequency.yaml')
    model_spec = simulate.read_model_spec(file_name='mandatory_tour_frequency.csv')

    alternatives_file_path = \
        config.config_file_path('mandatory_tour_frequency_alternatives.csv')
    alternatives = simulate.read_model_alts(alternatives_file_path, set_index='alt')

    # choosers are the persons CDAP assigned a mandatory ('M') pattern
    all_persons = persons_merged.to_frame()
    choosers = all_persons[all_persons.cdap_activity == 'M']

    logger.info("Running mandatory_tour_frequency with %d persons", len(choosers))

    # - if no mandatory tours
    if len(choosers) == 0:
        add_null_results(trace_label, model_settings)
        return

    # - preprocessor
    preprocessor_settings = model_settings.get('preprocessor', None)
    if preprocessor_settings:
        expressions.assign_columns(
            df=choosers,
            model_settings=preprocessor_settings,
            locals_dict={},
            trace_label=trace_label)

    nest_spec = config.get_logit_model_settings(model_settings)
    constants = config.get_model_constants(model_settings)

    choices = simulate.simple_simulate(
        choosers=choosers,
        spec=model_spec,
        nest_spec=nest_spec,
        locals_d=constants,
        chunk_size=chunk_size,
        trace_label=trace_label,
        trace_choice_name='mandatory_tour_frequency')

    # convert indexes to alternative names,
    # then spread the result over the persons_merged index
    alternative_names = model_spec.columns[choices.values]
    choices = pd.Series(alternative_names, index=choices.index)
    choices = choices.reindex(persons_merged.local.index)

    # - create mandatory tours
    # Turn the chosen mandatory tour frequency alternative into an actual
    # dataframe of tours. The resulting format is the same as for
    # non_mandatory_tours, except that the trip types are "work" and "school".
    choosers['mandatory_tour_frequency'] = choices
    mandatory_tours = process_mandatory_tours(
        persons=choosers,
        mandatory_tour_frequency_alts=alternatives
    )

    pipeline.extend_table("tours", mandatory_tours)
    tracing.register_traceable_table('tours', mandatory_tours)
    pipeline.get_rn_generator().add_channel('tours', mandatory_tours)

    # - annotate persons
    persons = inject.get_table('persons').to_frame()

    # choices only cover persons with cdap_activity == 'M',
    # so reindex and blank-fill everyone else
    person_choices = choices.reindex(persons.index)
    persons['mandatory_tour_frequency'] = person_choices.fillna('').astype(str)

    annotate_trace_label = tracing.extend_trace_label(trace_label, 'annotate_persons')
    expressions.assign_columns(
        df=persons,
        model_settings=model_settings.get('annotate_persons'),
        trace_label=annotate_trace_label)

    pipeline.replace_table("persons", persons)

    tracing.print_summary('mandatory_tour_frequency', persons.mandatory_tour_frequency,
                          value_counts=True)

    if trace_hh_id:
        tracing.trace_df(mandatory_tours,
                         label="mandatory_tour_frequency.mandatory_tours",
                         warn_if_empty=True)

        tracing.trace_df(persons,
                         label="mandatory_tour_frequency.persons",
                         warn_if_empty=True)