def annotate_tables(model_settings, trace_label):

    annotate_tables = model_settings.get('annotate_tables', [])

    if not annotate_tables:
        logger.warning("annotate_tables setting is empty - nothing to do!")

    t0 = tracing.print_elapsed_time()

    for table_info in annotate_tables:

        tablename = table_info['tablename']
        df = inject.get_table(tablename).to_frame()

        # - rename columns
        column_map = table_info.get('column_map', None)
        if column_map:
            logger.info("renaming %s columns %s" % (tablename, column_map,))
            df.rename(columns=column_map, inplace=True)

        # - annotate
        annotate = table_info.get('annotate', None)
        if annotate:
            logger.info("annotating %s SPEC %s" % (tablename, annotate['SPEC'],))
            expressions.assign_columns(
                df=df,
                model_settings=annotate,
                trace_label=trace_label)

        # FIXME - narrow?

        # - write table to pipeline
        pipeline.replace_table(tablename, df)
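# A minimal sketch of the annotate_tables settings structure this step reads.
# The table name, column_map entries, and SPEC name below are hypothetical,
# not from any shipped configuration; only the keys ('tablename', 'column_map',
# 'annotate', 'SPEC') are what the code above actually accesses:
#
# annotate_tables:
#   - tablename: persons
#     column_map:
#       AGEP: age
#     annotate:
#       SPEC: annotate_persons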
def trip_purpose(
        trips,
        chunk_size,
        trace_hh_id):
    """
    trip purpose model step - calls run_trip_purpose to run the actual model

    adds purpose column to trips
    """
    trace_label = "trip_purpose"

    trips_df = trips.to_frame()

    choices = run_trip_purpose(
        trips_df,
        chunk_size=chunk_size,
        trace_hh_id=trace_hh_id,
        trace_label=trace_label
    )

    trips_df['purpose'] = choices

    # we should have assigned a purpose to all trips
    assert not trips_df.purpose.isnull().any()

    pipeline.replace_table("trips", trips_df)

    if trace_hh_id:
        tracing.trace_df(trips_df,
                         label=trace_label,
                         slicer='trip_id',
                         index_label='trip_id',
                         warn_if_empty=True)
def free_parking(
        persons_merged, persons, households,
        skim_dict, skim_stack,
        chunk_size, trace_hh_id, locutor):
    """
    Predict whether each worker has free parking available at their workplace.
    """

    trace_label = 'free_parking'
    model_settings = config.read_model_settings('free_parking.yaml')

    choosers = persons_merged.to_frame()
    choosers = choosers[choosers.workplace_taz > -1]
    logger.info("Running %s with %d persons", trace_label, len(choosers))

    constants = config.get_model_constants(model_settings)

    # - preprocessor
    preprocessor_settings = model_settings.get('preprocessor', None)
    if preprocessor_settings:

        locals_d = {}
        if constants is not None:
            locals_d.update(constants)

        expressions.assign_columns(
            df=choosers,
            model_settings=preprocessor_settings,
            locals_dict=locals_d,
            trace_label=trace_label)

    model_spec = simulate.read_model_spec(file_name='free_parking.csv')
    nest_spec = config.get_logit_model_settings(model_settings)

    choices = simulate.simple_simulate(
        choosers=choosers,
        spec=model_spec,
        nest_spec=nest_spec,
        locals_d=constants,
        chunk_size=chunk_size,
        trace_label=trace_label,
        trace_choice_name='free_parking_at_work')

    persons = persons.to_frame()

    free_parking_alt = model_settings['FREE_PARKING_ALT']
    choices = (choices == free_parking_alt)

    # reindex since we only ran the model for persons with a workplace
    persons['free_parking_at_work'] = \
        choices.reindex(persons.index).fillna(0).astype(bool)

    pipeline.replace_table("persons", persons)

    tracing.print_summary('free_parking', persons.free_parking_at_work, value_counts=True)

    if trace_hh_id:
        tracing.trace_df(persons,
                         label=trace_label,
                         warn_if_empty=True)
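# The reindex().fillna().astype(bool) idiom above backfills persons who were
# not in the chooser set (non-workers here) with False. A minimal sketch of
# the pattern with made-up ids (illustration only, not part of the model):
import pandas as pd

persons_index = pd.Index([1, 2, 3, 4], name='person_id')
worker_choices = pd.Series([True, False], index=[2, 4])  # only workers were modeled

backfilled = worker_choices.reindex(persons_index).fillna(0).astype(bool)
assert backfilled.tolist() == [False, True, False, False]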
def atwork_subtour_destination(
        tours,
        persons_merged,
        skim_dict,
        skim_stack,
        land_use, size_terms,
        chunk_size, trace_hh_id):

    persons_merged = persons_merged.to_frame()

    tours = tours.to_frame()
    subtours = tours[tours.tour_category == 'atwork']

    # - if no atwork subtours
    if subtours.shape[0] == 0:
        tracing.no_results('atwork_subtour_destination')
        return

    # interaction_sample_simulate insists choosers appear in same order as alts
    subtours = subtours.sort_index()

    destination_size_terms = tour_destination_size_terms(land_use, size_terms, 'atwork')

    destination_sample = atwork_subtour_destination_sample(
        subtours,
        persons_merged,
        skim_dict,
        destination_size_terms,
        chunk_size, trace_hh_id)

    destination_sample = atwork_subtour_destination_logsums(
        persons_merged,
        destination_sample,
        skim_dict, skim_stack,
        chunk_size, trace_hh_id)

    choices = atwork_subtour_destination_simulate(
        subtours,
        persons_merged,
        destination_sample,
        skim_dict,
        destination_size_terms,
        chunk_size, trace_hh_id)

    subtours['destination'] = choices

    assign_in_place(tours, subtours[['destination']])

    pipeline.replace_table("tours", tours)

    tracing.print_summary('subtour destination', subtours.destination, describe=True)

    if trace_hh_id:
        tracing.trace_df(tours,
                         label='atwork_subtour_destination',
                         columns=['destination'])
def non_mandatory_tour_destination(
        tours,
        persons_merged,
        skim_dict, skim_stack,
        chunk_size,
        trace_hh_id):
    """
    Given the tour generation from the above, each tour needs to have a
    destination, so in this case tours are the choosers (with the associated
    person that's making the tour)
    """

    trace_label = 'non_mandatory_tour_destination'
    model_settings = config.read_model_settings('non_mandatory_tour_destination.yaml')

    tours = tours.to_frame()
    persons_merged = persons_merged.to_frame()

    # choosers are tours - in a sense tours are choosing their destination
    non_mandatory_tours = tours[tours.tour_category == 'non_mandatory']

    # - if no non_mandatory tours
    if non_mandatory_tours.shape[0] == 0:
        tracing.no_results(trace_label)
        return

    choices = tour_destination.run_tour_destination(
        tours,
        persons_merged,
        model_settings,
        skim_dict,
        skim_stack,
        chunk_size, trace_hh_id, trace_label)

    non_mandatory_tours['destination'] = choices

    assign_in_place(tours, non_mandatory_tours[['destination']])

    pipeline.replace_table("tours", tours)

    if trace_hh_id:
        tracing.trace_df(tours[tours.tour_category == 'non_mandatory'],
                         label="non_mandatory_tour_destination",
                         slicer='person_id',
                         index_label='tour',
                         columns=None,
                         warn_if_empty=True)
def trip_destination(
        trips,
        tours_merged,
        chunk_size, trace_hh_id):
    """
    Choose a destination for all 'intermediate' trips based on trip purpose.

    Final trips already have a destination (the primary tour destination for
    outbound trips, and home for inbound trips.)
    """
    trace_label = 'trip_destination'
    model_settings = config.read_model_settings('trip_destination.yaml')
    CLEANUP = model_settings.get('CLEANUP', True)

    trips_df = trips.to_frame()
    tours_merged_df = tours_merged.to_frame()

    logger.info("Running %s with %d trips", trace_label, trips_df.shape[0])

    trips_df = run_trip_destination(
        trips_df,
        tours_merged_df,
        chunk_size, trace_hh_id,
        trace_label)

    if trips_df.failed.any():
        logger.warning("%s %s failed trips", trace_label, trips_df.failed.sum())
        file_name = "%s_failed_trips" % trace_label
        logger.info("writing failed trips to %s", file_name)
        tracing.write_csv(trips_df[trips_df.failed], file_name=file_name, transpose=False)

    if CLEANUP:
        trips_df = cleanup_failed_trips(trips_df)
    elif trips_df.failed.any():
        logger.warning("%s keeping %s sidelined failed trips" %
                       (trace_label, trips_df.failed.sum()))

    pipeline.replace_table("trips", trips_df)

    if trace_hh_id:
        tracing.trace_df(trips_df,
                         label=trace_label,
                         slicer='trip_id',
                         index_label='trip_id',
                         warn_if_empty=True)
def replace_table(self):
    """
    Save or replace windows_df DataFrame to pipeline with saved table name
    (specified when object instantiated.)

    This is a convenience function in case caller instantiates object in one
    context (e.g. dependency injection) where it knows the pipeline table name,
    but wants to checkpoint the table in another context where it does not
    know that name.
    """

    assert self.windows_table_name is not None

    # get windows_df from bottleneck function in case updates to self.person_window
    # do not write through to pandas dataframe
    pipeline.replace_table(self.windows_table_name, self.get_windows_df())
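# Hypothetical usage sketch (names illustrative): a scheduling step that
# receives the timetable object via injection can checkpoint it without
# knowing the underlying pipeline table name, as joint_tour_scheduling does
# further below:
#
#     tdd_choices, timetable = vectorize_joint_tour_scheduling(...)
#     timetable.replace_table()  # persists windows_df under its saved name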
def initialize_landuse():

    trace_label = 'initialize_landuse'

    model_settings = config.read_model_settings('initialize_landuse.yaml', mandatory=True)

    annotate_tables(model_settings, trace_label)

    # create accessibility
    land_use = pipeline.get_table('land_use')

    accessibility_df = pd.DataFrame(index=land_use.index)

    # - write table to pipeline
    pipeline.replace_table("accessibility", accessibility_df)
def add_null_results(trace_label, mandatory_tour_frequency_settings):

    logger.info("Skipping %s: add_null_results", trace_label)

    persons = inject.get_table('persons').to_frame()
    persons['mandatory_tour_frequency'] = ''

    tours = pd.DataFrame()
    tours['tour_category'] = None
    tours['tour_type'] = None
    tours['person_id'] = None
    tours.index.name = 'tour_id'
    pipeline.replace_table("tours", tours)

    expressions.assign_columns(
        df=persons,
        model_settings=mandatory_tour_frequency_settings.get('annotate_persons'),
        trace_label=tracing.extend_trace_label(trace_label, 'annotate_persons'))

    pipeline.replace_table("persons", persons)
def auto_ownership_simulate(households,
                            households_merged,
                            chunk_size,
                            trace_hh_id):
    """
    Auto ownership is a standard model which predicts how many cars a household
    with given characteristics owns
    """
    trace_label = 'auto_ownership_simulate'
    model_settings = config.read_model_settings('auto_ownership.yaml')

    logger.info("Running %s with %d households", trace_label, len(households_merged))

    model_spec = simulate.read_model_spec(file_name='auto_ownership.csv')

    nest_spec = config.get_logit_model_settings(model_settings)
    constants = config.get_model_constants(model_settings)

    choices = simulate.simple_simulate(
        choosers=households_merged.to_frame(),
        spec=model_spec,
        nest_spec=nest_spec,
        locals_d=constants,
        chunk_size=chunk_size,
        trace_label=trace_label,
        trace_choice_name='auto_ownership')

    households = households.to_frame()

    # no need to reindex as we used all households
    households['auto_ownership'] = choices

    pipeline.replace_table("households", households)

    tracing.print_summary('auto_ownership', households.auto_ownership, value_counts=True)

    if trace_hh_id:
        tracing.trace_df(households,
                         label='auto_ownership',
                         warn_if_empty=True)
def atwork_subtour_scheduling(
        tours,
        persons_merged,
        tdd_alts,
        skim_dict,
        chunk_size,
        trace_hh_id):
    """
    This model predicts the departure time and duration of each activity for
    at-work subtours
    """

    trace_label = 'atwork_subtour_scheduling'
    model_settings = config.read_model_settings('tour_scheduling_atwork.yaml')
    model_spec = simulate.read_model_spec(file_name='tour_scheduling_atwork.csv')

    persons_merged = persons_merged.to_frame()

    tours = tours.to_frame()
    subtours = tours[tours.tour_category == 'atwork']

    # - if no atwork subtours
    if subtours.shape[0] == 0:
        tracing.no_results(trace_label)
        return

    logger.info("Running %s with %d tours", trace_label, len(subtours))

    # preprocessor
    constants = config.get_model_constants(model_settings)
    od_skim_wrapper = skim_dict.wrap('origin', 'destination')
    do_skim_wrapper = skim_dict.wrap('destination', 'origin')
    skims = {
        "od_skims": od_skim_wrapper,
        "do_skims": do_skim_wrapper,
    }
    annotate_preprocessors(
        subtours, constants, skims,
        model_settings, trace_label)

    # parent_tours table with columns ['tour_id', 'tdd'] index = tour_id
    parent_tour_ids = subtours.parent_tour_id.astype(int).unique()
    parent_tours = pd.DataFrame({'tour_id': parent_tour_ids}, index=parent_tour_ids)
    parent_tours = parent_tours.merge(tours[['tdd']], left_index=True, right_index=True)

    tdd_choices = vectorize_subtour_scheduling(
        parent_tours,
        subtours,
        persons_merged,
        tdd_alts, model_spec,
        model_settings,
        chunk_size=chunk_size,
        trace_label=trace_label)

    assign_in_place(tours, tdd_choices)
    pipeline.replace_table("tours", tours)

    if trace_hh_id:
        tracing.trace_df(tours[tours.tour_category == 'atwork'],
                         label="atwork_subtour_scheduling",
                         slicer='person_id',
                         index_label='tour_id',
                         columns=None)

    if DUMP:
        subtours = tours[tours.tour_category == 'atwork']
        parent_tours = tours[tours.index.isin(subtours.parent_tour_id)]

        tracing.dump_df(DUMP, subtours, trace_label, 'sub_tours')
        tracing.dump_df(DUMP, parent_tours, trace_label, 'parent_tours')

        parent_tours['parent_tour_id'] = parent_tours.index
        subtours = pd.concat([parent_tours, subtours])
        tracing.dump_df(DUMP,
                        tt.tour_map(parent_tours, subtours, tdd_alts,
                                    persons_id_col='parent_tour_id'),
                        trace_label, 'tour_map')
def joint_tour_frequency(households, persons, chunk_size, trace_hh_id):
    """
    This model predicts the frequency of making fully joint trips
    (see the alternatives above).
    """
    trace_label = 'joint_tour_frequency'
    model_settings_file_name = 'joint_tour_frequency.yaml'

    model_settings = config.read_model_settings(model_settings_file_name)

    alternatives = simulate.read_model_alts(
        'joint_tour_frequency_alternatives.csv', set_index='alt')

    # - only interested in households with more than one cdap travel_active person and
    # - at least one non-preschooler
    households = households.to_frame()
    multi_person_households = households[households.participates_in_jtf_model].copy()

    # - only interested in persons in multi_person_households
    # FIXME - gratuitous pathological efficiency move, just let yaml specify persons?
    persons = persons.to_frame()
    persons = persons[persons.household_id.isin(multi_person_households.index)]

    logger.info("Running joint_tour_frequency with %d multi-person households" %
                multi_person_households.shape[0])

    # - preprocessor
    preprocessor_settings = model_settings.get('preprocessor', None)
    if preprocessor_settings:

        locals_dict = {
            'persons': persons,
            'hh_time_window_overlap': hh_time_window_overlap
        }

        expressions.assign_columns(
            df=multi_person_households,
            model_settings=preprocessor_settings,
            locals_dict=locals_dict,
            trace_label=trace_label)

    estimator = estimation.manager.begin_estimation('joint_tour_frequency')

    model_spec = simulate.read_model_spec(file_name=model_settings['SPEC'])
    coefficients_df = simulate.read_model_coefficients(model_settings)
    model_spec = simulate.eval_coefficients(model_spec, coefficients_df, estimator)

    nest_spec = config.get_logit_model_settings(model_settings)
    constants = config.get_model_constants(model_settings)

    if estimator:
        estimator.write_spec(model_settings)
        estimator.write_model_settings(model_settings, model_settings_file_name)
        estimator.write_coefficients(coefficients_df)
        estimator.write_choosers(multi_person_households)

    choices = simulate.simple_simulate(
        choosers=multi_person_households,
        spec=model_spec,
        nest_spec=nest_spec,
        locals_d=constants,
        chunk_size=chunk_size,
        trace_label=trace_label,
        trace_choice_name='joint_tour_frequency',
        estimator=estimator)

    # convert indexes to alternative names
    choices = pd.Series(model_spec.columns[choices.values], index=choices.index)

    if estimator:
        estimator.write_choices(choices)
        choices = estimator.get_survey_values(choices, 'households', 'joint_tour_frequency')
        estimator.write_override_choices(choices)
        estimator.end_estimation()

    # - create joint_tours based on joint_tour_frequency choices

    # - we need a person_id in order to generate the tour index (and for register_traceable_table)
    # - but we don't know the tour participants yet
    # - so we arbitrarily choose the first person in the household
    # - to be point person for the purpose of generating an index and setting origin
    temp_point_persons = persons.loc[persons.PNUM == 1]
    temp_point_persons['person_id'] = temp_point_persons.index
    temp_point_persons = temp_point_persons.set_index('household_id')
    temp_point_persons = temp_point_persons[['person_id', 'home_taz']]

    joint_tours = process_joint_tours(choices, alternatives, temp_point_persons)

    tours = pipeline.extend_table("tours", joint_tours)

    tracing.register_traceable_table('tours', joint_tours)
    pipeline.get_rn_generator().add_channel('tours', joint_tours)

    # - annotate households

    # we expect there to be an alt with no tours - which we can use to backfill non-travelers
    # (select the index label of the all-zero alternative, not just the first index label)
    no_tours_alt = alternatives.index[alternatives.sum(axis=1) == 0][0]
    households['joint_tour_frequency'] = \
        choices.reindex(households.index).fillna(no_tours_alt).astype(str)

    households['num_hh_joint_tours'] = joint_tours.groupby('household_id').size().\
        reindex(households.index).fillna(0).astype(np.int8)

    pipeline.replace_table("households", households)

    tracing.print_summary('joint_tour_frequency', households.joint_tour_frequency,
                          value_counts=True)

    if trace_hh_id:
        tracing.trace_df(households,
                         label="joint_tour_frequency.households")

        tracing.trace_df(joint_tours,
                         label="joint_tour_frequency.joint_tours",
                         slicer='household_id')
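# simple_simulate returns positional alternative indexes; indexing the spec's
# columns with them yields alternative names, as done above. Minimal sketch
# with hypothetical alternative names (illustration only):
import pandas as pd

spec_columns = pd.Index(['0_tours', '1_Shop', '1_Eat'])
choices = pd.Series([0, 2, 1], index=pd.Index([10, 11, 12], name='household_id'))

named_choices = pd.Series(spec_columns[choices.values], index=choices.index)
assert named_choices.tolist() == ['0_tours', '1_Eat', '1_Shop']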
def compute_accessibility(accessibility, skim_dict, land_use, trace_od):

    """
    Compute accessibility for each zone in land use file using expressions from accessibility_spec

    The actual results depend on the expressions in accessibility_spec, but this is initially
    intended to permit implementation of the mtc accessibility calculation as implemented by
    Accessibility.job

    Compute measures of accessibility used by the automobile ownership model.
    The accessibility measure first multiplies an employment variable by a mode-specific decay
    function.  The product reflects the difficulty of accessing the activities the farther
    (in terms of round-trip travel time) the jobs are from the location in question. The products
    to each destination zone are next summed over each origin zone, and the logarithm of the
    product mutes large differences.  The decay function on the walk accessibility measure is
    steeper than automobile or transit.  The minimum accessibility is zero.
    """

    trace_label = 'compute_accessibility'
    model_settings = config.read_model_settings('accessibility.yaml')
    assignment_spec = assign.read_assignment_spec(config.config_file_path('accessibility.csv'))

    accessibility_df = accessibility.to_frame()

    constants = config.get_model_constants(model_settings)
    land_use_columns = model_settings.get('land_use_columns', [])

    land_use_df = land_use.to_frame()

    orig_zones = accessibility_df.index.values
    dest_zones = land_use_df.index.values

    orig_zone_count = len(orig_zones)
    dest_zone_count = len(dest_zones)

    logger.info("Running %s with %d dest zones %d orig zones" %
                (trace_label, dest_zone_count, orig_zone_count))

    # create OD dataframe
    od_df = pd.DataFrame(
        data={
            'orig': np.repeat(np.asanyarray(accessibility_df.index), dest_zone_count),
            'dest': np.tile(np.asanyarray(land_use_df.index), orig_zone_count)
        }
    )

    if trace_od:
        trace_orig, trace_dest = trace_od
        trace_od_rows = (od_df.orig == trace_orig) & (od_df.dest == trace_dest)
    else:
        trace_od_rows = None

    # merge land_use_columns into od_df
    land_use_df = land_use_df[land_use_columns]
    od_df = pd.merge(od_df, land_use_df, left_on='dest', right_index=True).sort_index()

    locals_d = {
        'log': np.log,
        'exp': np.exp,
        'skim_od': AccessibilitySkims(skim_dict, orig_zones, dest_zones),
        'skim_do': AccessibilitySkims(skim_dict, orig_zones, dest_zones, transpose=True)
    }
    if constants is not None:
        locals_d.update(constants)

    results, trace_results, trace_assigned_locals \
        = assign.assign_variables(assignment_spec, od_df, locals_d, trace_rows=trace_od_rows)

    for column in results.columns:
        data = np.asanyarray(results[column])
        data.shape = (orig_zone_count, dest_zone_count)
        accessibility_df[column] = np.log(np.sum(data, axis=1) + 1)

    # - write table to pipeline
    pipeline.replace_table("accessibility", accessibility_df)

    if trace_od:

        if not trace_od_rows.any():
            logger.warning("trace_od not found origin = %s, dest = %s" %
                           (trace_orig, trace_dest))
        else:

            # add OD columns to trace results
            df = pd.concat([od_df[trace_od_rows], trace_results], axis=1)

            # dump the trace results table (with _temp variables) to aid debugging
            tracing.trace_df(df,
                             label='accessibility',
                             index_label='skim_offset',
                             slicer='NONE',
                             warn_if_empty=True)

            if trace_assigned_locals:
                tracing.write_csv(trace_assigned_locals, file_name="accessibility_locals")
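# The np.repeat / np.tile pairing above enumerates the full origin-destination
# cross product with orig varying slowest, so a row-major reshape of any result
# column recovers an (orig, dest) matrix. Toy sketch (illustration only):
import numpy as np
import pandas as pd

orig_zones = np.array([1, 2])
dest_zones = np.array([10, 20, 30])

od_df = pd.DataFrame({
    'orig': np.repeat(orig_zones, len(dest_zones)),  # 1 1 1 2 2 2
    'dest': np.tile(dest_zones, len(orig_zones)),    # 10 20 30 10 20 30
})

data = np.arange(6.0)  # stand-in for one assigned result column
matrix = data.reshape(len(orig_zones), len(dest_zones))
row_sums = np.log(matrix.sum(axis=1) + 1)  # one accessibility value per orig zone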
def balance_trips(zones, trace_od):
    """Improve the match between destination zone trip totals (given by the
    DEST_TARGETS in the balance_trips config file) and the trip counts
    calculated during the destination choice step.

    The config file should contain the following parameters:

        dest_zone_trip_targets:
            total: <aggregate destination zone trip counts>

            OR

            <segment_1>: totals for segment 1 (optional)
            <segment_2>: totals for segment 2 (optional)
            <segment_3>: totals for segment 3 (optional)

        (These are optional)
        max_iterations: maximum number of iterations to pass to the balancer
        balance_closure: float precision to stop balancing totals
        input_table: path to CSV to use instead of trips table.

    The config file can also have an orig_zone_trip_targets to manually
    specify origin zone totals instead of using the logsums calculated
    by the destination choice step.

    Parameters
    ----------
    zones : DataFrameWrapper
        zone attributes
    trace_od : list or dict

    Returns
    -------
    Nothing. Balances trips table and writes trace tables
    """

    logger.info('running trip balancing step ...')

    model_settings = config.read_model_settings(YAML_FILENAME)

    trips_df = get_trips_df(model_settings)
    trace_rows = trace.trace_filter(trips_df, trace_od)
    tracing.write_csv(trips_df[trace_rows], file_name='trips_unbalanced', transpose=False)

    trips_df = trips_df.melt(
        id_vars=['orig', 'dest'], var_name='segment', value_name='trips')

    dest_targets = model_settings.get(DEST_TARGETS)
    orig_targets = model_settings.get(ORIG_TARGETS)

    max_iterations = model_settings.get('max_iterations', 50)
    closure = model_settings.get('balance_closure', 0.001)

    aggregates, dimensions = calculate_aggregates(trips_df, zones.to_frame(),
                                                  dest_targets, orig_targets)

    balancer = Balancer(trips_df.reset_index(),
                        aggregates,
                        dimensions,
                        weight_col='trips',
                        max_iteration=max_iterations,
                        closure=closure)
    balanced_df = balancer.balance()

    balanced_trips = balanced_df.set_index(['orig', 'dest', 'segment'])['trips'].unstack()
    tracing.write_csv(balanced_trips.reset_index()[trace_rows],
                      file_name='trips_balanced', transpose=False)

    pipeline.replace_table('trips', balanced_trips)

    logger.info('finished balancing trips.')
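# balance_trips reshapes the trips table from wide (one column per segment) to
# long for the Balancer, then pivots back with unstack. A toy round trip with
# made-up segment names (illustration only):
import pandas as pd

wide = pd.DataFrame({'orig': [1, 1], 'dest': [10, 20],
                     'work': [5.0, 2.0], 'school': [1.0, 0.0]})

long = wide.melt(id_vars=['orig', 'dest'], var_name='segment', value_name='trips')
# ... the balancer adjusts the 'trips' weights here ...
back = long.set_index(['orig', 'dest', 'segment'])['trips'].unstack()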
def joint_tour_composition(
        tours, households, persons,
        chunk_size, trace_hh_id):
    """
    This model predicts the makeup of the travel party (adults, children, or mixed).
    """
    trace_label = 'joint_tour_composition'

    model_settings = config.read_model_settings('joint_tour_composition.yaml')
    model_spec = simulate.read_model_spec(file_name='joint_tour_composition.csv')

    tours = tours.to_frame()
    joint_tours = tours[tours.tour_category == 'joint']

    # - if no joint tours
    if joint_tours.shape[0] == 0:
        add_null_results(trace_label, tours)
        return

    # - only interested in households with joint_tours
    households = households.to_frame()
    households = households[households.num_hh_joint_tours > 0]

    persons = persons.to_frame()
    persons = persons[persons.household_id.isin(households.index)]

    logger.info("Running joint_tour_composition with %d joint tours" % joint_tours.shape[0])

    # - run preprocessor
    preprocessor_settings = model_settings.get('preprocessor', None)
    if preprocessor_settings:

        locals_dict = {
            'persons': persons,
            'hh_time_window_overlap': hh_time_window_overlap
        }

        expressions.assign_columns(
            df=households,
            model_settings=preprocessor_settings,
            locals_dict=locals_dict,
            trace_label=trace_label)

    joint_tours_merged = pd.merge(joint_tours, households,
                                  left_on='household_id', right_index=True, how='left')

    # - simple_simulate
    nest_spec = config.get_logit_model_settings(model_settings)
    constants = config.get_model_constants(model_settings)

    choices = simulate.simple_simulate(
        choosers=joint_tours_merged,
        spec=model_spec,
        nest_spec=nest_spec,
        locals_d=constants,
        chunk_size=chunk_size,
        trace_label=trace_label,
        trace_choice_name='composition')

    # convert indexes to alternative names
    choices = pd.Series(model_spec.columns[choices.values], index=choices.index)

    # add composition column to tours for tracing
    joint_tours['composition'] = choices

    # reindex since we ran model on a subset of households
    tours['composition'] = choices.reindex(tours.index).fillna('').astype(str)
    pipeline.replace_table("tours", tours)

    tracing.print_summary('joint_tour_composition', joint_tours.composition,
                          value_counts=True)

    if trace_hh_id:
        tracing.trace_df(joint_tours,
                         label="joint_tour_composition.joint_tours",
                         slicer='household_id')
def joint_tour_scheduling(
        tours,
        persons_merged,
        tdd_alts,
        chunk_size,
        trace_hh_id):
    """
    This model predicts the departure time and duration of each joint tour
    """
    trace_label = 'joint_tour_scheduling'
    model_settings = config.read_model_settings('joint_tour_scheduling.yaml')
    model_spec = simulate.read_model_spec(file_name='tour_scheduling_joint.csv')

    tours = tours.to_frame()
    joint_tours = tours[tours.tour_category == 'joint']

    # - if no joint tours
    if joint_tours.shape[0] == 0:
        tracing.no_results(trace_label)
        return

    # use inject.get_table as this won't exist if there are no joint_tours
    joint_tour_participants = inject.get_table('joint_tour_participants').to_frame()

    persons_merged = persons_merged.to_frame()

    logger.info("Running %s with %d joint tours", trace_label, joint_tours.shape[0])

    # it may seem peculiar that we are concerned with persons rather than households
    # but every joint tour is (somewhat arbitrarily) assigned a "primary person"
    # some of whose characteristics are used in the spec
    # and we get household attributes along with person attributes in persons_merged
    persons_merged = persons_merged[persons_merged.num_hh_joint_tours > 0]

    # since a household's joint tours each potentially have different participants
    # they may also have different joint tour masks (free time of all participants)
    # so we have to either chunk processing by joint_tour_num and build timetable by household
    # or build timetables by unique joint_tour
    constants = config.get_model_constants(model_settings)

    # - run preprocessor to annotate choosers
    preprocessor_settings = model_settings.get('preprocessor', None)
    if preprocessor_settings:

        locals_d = {}
        if constants is not None:
            locals_d.update(constants)

        expressions.assign_columns(
            df=joint_tours,
            model_settings=preprocessor_settings,
            locals_dict=locals_d,
            trace_label=trace_label)

    tdd_choices, timetable = vectorize_joint_tour_scheduling(
        joint_tours, joint_tour_participants,
        persons_merged,
        tdd_alts,
        spec=model_spec,
        model_settings=model_settings,
        chunk_size=chunk_size,
        trace_label=trace_label)

    timetable.replace_table()

    assign_in_place(tours, tdd_choices)
    pipeline.replace_table("tours", tours)

    # updated df for tracing
    joint_tours = tours[tours.tour_category == 'joint']

    if trace_hh_id:
        tracing.trace_df(joint_tours,
                         label="joint_tour_scheduling",
                         slicer='household_id')
def trip_purpose_and_destination(
        trips,
        tours_merged,
        chunk_size,
        trace_hh_id):

    trace_label = "trip_purpose_and_destination"
    model_settings = config.read_model_settings('trip_purpose_and_destination.yaml')

    # for consistency, read sample_table_name setting from trip_destination settings file
    trip_destination_model_settings = config.read_model_settings('trip_destination.yaml')
    sample_table_name = trip_destination_model_settings.get('DEST_CHOICE_SAMPLE_TABLE_NAME')
    want_sample_table = config.setting('want_dest_choice_sample_tables') and \
        sample_table_name is not None

    MAX_ITERATIONS = model_settings.get('MAX_ITERATIONS', 5)

    trips_df = trips.to_frame()
    tours_merged_df = tours_merged.to_frame()

    if trips_df.empty:
        logger.info("%s - no trips. Nothing to do." % trace_label)
        return

    # FIXME could allow MAX_ITERATIONS=0 to allow for cleanup-only run
    # in which case, we would need to drop bad trips, WITHOUT failing bad_trip leg_mates
    assert (MAX_ITERATIONS > 0)

    # if trip_destination has been run before, keep only failed trips (and leg_mates) to retry
    if 'destination' in trips_df:

        if 'failed' not in trips_df.columns:
            # trip_destination model cleaned up any failed trips
            logger.info("%s - no failed column from prior model run." % trace_label)
            return

        elif not trips_df.failed.any():
            # 'failed' column but no failed trips from prior run of trip_destination
            logger.info("%s - no failed trips from prior model run." % trace_label)
            trips_df.drop(columns='failed', inplace=True)
            pipeline.replace_table("trips", trips_df)
            return

        else:
            logger.info("trip_destination has already been run. Rerunning failed trips")
            flag_failed_trip_leg_mates(trips_df, 'failed')
            trips_df = trips_df[trips_df.failed]
            tours_merged_df = tours_merged_df[tours_merged_df.index.isin(trips_df.tour_id)]
            logger.info("Rerunning %s failed trips and leg-mates" % trips_df.shape[0])

            # drop any previously saved samples of failed trips
            if want_sample_table and pipeline.is_table(sample_table_name):
                logger.info("Dropping any previously saved samples of failed trips")
                save_sample_df = pipeline.get_table(sample_table_name)
                save_sample_df.drop(trips_df.index, level='trip_id', inplace=True)
                pipeline.replace_table(sample_table_name, save_sample_df)
                del save_sample_df

    processed_trips = []
    save_samples = []
    i = 0
    TRIP_RESULT_COLUMNS = ['purpose', 'destination', 'origin', 'failed']
    while True:

        i += 1

        for c in TRIP_RESULT_COLUMNS:
            if c in trips_df:
                del trips_df[c]

        trips_df, save_sample_df = run_trip_purpose_and_destination(
            trips_df,
            tours_merged_df,
            chunk_size=chunk_size,
            trace_hh_id=trace_hh_id,
            trace_label=tracing.extend_trace_label(trace_label, "i%s" % i))

        # if testing, make sure at least one trip fails
        if config.setting('testing_fail_trip_destination', False) \
                and (i == 1) and not trips_df.failed.any():
            fail_o = trips_df[trips_df.trip_num < trips_df.trip_count].origin.max()
            trips_df.failed = (trips_df.origin == fail_o) & \
                              (trips_df.trip_num < trips_df.trip_count)

        num_failed_trips = trips_df.failed.sum()

        # if there were no failed trips, we are done
        if num_failed_trips == 0:
            processed_trips.append(trips_df[TRIP_RESULT_COLUMNS])
            if save_sample_df is not None:
                save_samples.append(save_sample_df)
            break

        logger.warning("%s %s failed trips in iteration %s" %
                       (trace_label, num_failed_trips, i))
        file_name = "%s_i%s_failed_trips" % (trace_label, i)
        logger.info("writing failed trips to %s" % file_name)
        tracing.write_csv(trips_df[trips_df.failed], file_name=file_name, transpose=False)

        # if max iterations reached, add remaining trips to processed_trips and give up
        # note that we do this BEFORE failing leg_mates so resulting trip legs are complete
        if i >= MAX_ITERATIONS:
            logger.warning("%s too many iterations %s" % (trace_label, i))
            processed_trips.append(trips_df[TRIP_RESULT_COLUMNS])
            if save_sample_df is not None:
                save_sample_df.drop(trips_df[trips_df.failed].index,
                                    level='trip_id', inplace=True)
                save_samples.append(save_sample_df)
            break

        # otherwise, if any trips failed, then their leg-mates trips must also fail
        flag_failed_trip_leg_mates(trips_df, 'failed')

        # add the good trips to processed_trips
        processed_trips.append(trips_df[~trips_df.failed][TRIP_RESULT_COLUMNS])

        # and keep the failed ones to retry
        trips_df = trips_df[trips_df.failed]
        tours_merged_df = tours_merged_df[tours_merged_df.index.isin(trips_df.tour_id)]

        # add trip samples of processed_trips to processed_samples
        if save_sample_df is not None:
            # drop failed trip samples
            save_sample_df.drop(trips_df.index, level='trip_id', inplace=True)
            save_samples.append(save_sample_df)

    # - assign result columns to trips
    processed_trips = pd.concat(processed_trips)

    if len(save_samples) > 0:
        save_sample_df = pd.concat(save_samples)
        logger.info("adding %s samples to %s" % (len(save_sample_df), sample_table_name))
        pipeline.extend_table(sample_table_name, save_sample_df)

    logger.info("%s %s failed trips after %s iterations" %
                (trace_label, processed_trips.failed.sum(), i))

    trips_df = trips.to_frame()
    assign_in_place(trips_df, processed_trips)

    trips_df = cleanup_failed_trips(trips_df)

    pipeline.replace_table("trips", trips_df)

    # check to make sure we wrote sample file if requested
    if want_sample_table and len(trips_df) > 0:
        assert pipeline.is_table(sample_table_name)
        # since we have saved samples for all successful trips
        # once we discard failed trips, we should have samples for all trips
        save_sample_df = pipeline.get_table(sample_table_name)
        # expect samples only for intermediate trip destinations
        assert \
            len(save_sample_df.index.get_level_values(0).unique()) == \
            len(trips_df[trips_df.trip_num < trips_df.trip_count])
        del save_sample_df

    if trace_hh_id:
        tracing.trace_df(trips_df,
                         label=trace_label,
                         slicer='trip_id',
                         index_label='trip_id',
                         warn_if_empty=True)
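# Saved destination-choice samples are keyed by trip_id at one index level, so
# samples for trips being retried can be dropped by label at that level, as
# done above. Minimal sketch (illustration only):
import pandas as pd

idx = pd.MultiIndex.from_tuples([(1, 100), (1, 101), (2, 100)],
                                names=['trip_id', 'alt_dest'])
save_sample_df = pd.DataFrame({'prob': [0.4, 0.6, 1.0]}, index=idx)

save_sample_df = save_sample_df.drop([1], level='trip_id')
assert save_sample_df.index.get_level_values('trip_id').tolist() == [2]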
def trip_mode_choice(
        trips,
        tours_merged,
        skim_dict, skim_stack,
        chunk_size, trace_hh_id):
    """
    Trip mode choice - compute trip_mode (same values as for tour_mode) for each trip.

    Modes for each primary tour purpose are calculated separately because they have
    different coefficient values (stored in trip_mode_choice_coeffs.csv coefficient file.)

    Adds trip_mode column to trip table
    """
    trace_label = 'trip_mode_choice'
    model_settings = config.read_model_settings('trip_mode_choice.yaml')

    model_spec = simulate.read_model_spec(file_name=model_settings['SPEC'])
    omnibus_coefficients = \
        assign.read_constant_spec(config.config_file_path(model_settings['COEFFS']))

    trips_df = trips.to_frame()
    logger.info("Running %s with %d trips", trace_label, trips_df.shape[0])

    tours_merged = tours_merged.to_frame()
    tours_merged = tours_merged[model_settings['TOURS_MERGED_CHOOSER_COLUMNS']]

    nest_spec = config.get_logit_model_settings(model_settings)

    tracing.print_summary('primary_purpose',
                          trips_df.primary_purpose, value_counts=True)

    # - trips_merged - merge trips and tours_merged
    trips_merged = pd.merge(
        trips_df,
        tours_merged,
        left_on='tour_id',
        right_index=True,
        how="left")
    assert trips_merged.index.equals(trips.index)

    # setup skim keys
    assert ('trip_period' not in trips_merged)
    trips_merged['trip_period'] = skim_time_period_label(trips_merged.depart)

    orig_col = 'origin'
    dest_col = 'destination'

    odt_skim_stack_wrapper = skim_stack.wrap(left_key=orig_col, right_key=dest_col,
                                             skim_key='trip_period')
    od_skim_wrapper = skim_dict.wrap('origin', 'destination')

    skims = {
        "odt_skims": odt_skim_stack_wrapper,
        "od_skims": od_skim_wrapper,
    }

    constants = config.get_model_constants(model_settings)
    constants.update({
        'ORIGIN': orig_col,
        'DESTINATION': dest_col
    })

    choices_list = []
    for primary_purpose, trips_segment in trips_merged.groupby('primary_purpose'):

        segment_trace_label = tracing.extend_trace_label(trace_label, primary_purpose)

        logger.info("trip_mode_choice tour_type '%s' (%s trips)" %
                    (primary_purpose, len(trips_segment.index), ))

        # name index so tracing knows how to slice
        assert trips_segment.index.name == 'trip_id'

        locals_dict = assign.evaluate_constants(omnibus_coefficients[primary_purpose],
                                                constants=constants)
        locals_dict.update(constants)

        annotate_preprocessors(
            trips_segment, locals_dict, skims,
            model_settings, segment_trace_label)

        locals_dict.update(skims)

        choices = simulate.simple_simulate(
            choosers=trips_segment,
            spec=model_spec,
            nest_spec=nest_spec,
            skims=skims,
            locals_d=locals_dict,
            chunk_size=chunk_size,
            trace_label=segment_trace_label,
            trace_choice_name='trip_mode_choice')

        alts = model_spec.columns
        choices = choices.map(dict(zip(range(len(alts)), alts)))

        # tracing.print_summary('trip_mode_choice %s choices' % primary_purpose,
        #                       choices, value_counts=True)

        if trace_hh_id:
            # trace the coefficients
            tracing.trace_df(pd.Series(locals_dict),
                             label=tracing.extend_trace_label(segment_trace_label, 'constants'),
                             transpose=False,
                             slicer='NONE')

            # so we can trace with annotations
            trips_segment['trip_mode'] = choices
            tracing.trace_df(trips_segment,
                             label=tracing.extend_trace_label(segment_trace_label, 'trip_mode'),
                             slicer='tour_id',
                             index_label='tour_id',
                             warn_if_empty=True)

        choices_list.append(choices)

        # FIXME - force garbage collection
        force_garbage_collect()

    choices = pd.concat(choices_list)

    trips_df = trips.to_frame()
    trips_df['trip_mode'] = choices

    tracing.print_summary('tour_modes',
                          trips_merged.tour_mode, value_counts=True)

    tracing.print_summary('trip_mode_choice choices',
                          choices, value_counts=True)

    assert not trips_df.trip_mode.isnull().any()

    pipeline.replace_table("trips", trips_df)

    if trace_hh_id:
        tracing.trace_df(trips_df,
                         label=tracing.extend_trace_label(trace_label, 'trip_mode'),
                         slicer='trip_id',
                         index_label='trip_id',
                         warn_if_empty=True)
def non_mandatory_tour_frequency(persons, persons_merged,
                                 chunk_size,
                                 trace_hh_id):
    """
    This model predicts the frequency of making non-mandatory trips
    (alternatives for this model come from a separate csv file which is
    configured by the user) - these trips include escort, shopping, othmaint,
    othdiscr, eatout, and social trips in various combination.
    """

    trace_label = 'non_mandatory_tour_frequency'
    model_settings = config.read_model_settings('non_mandatory_tour_frequency.yaml')
    model_spec = simulate.read_model_spec(file_name='non_mandatory_tour_frequency.csv')

    alternatives = simulate.read_model_alts(
        config.config_file_path('non_mandatory_tour_frequency_alternatives.csv'),
        set_index=None)

    choosers = persons_merged.to_frame()

    # FIXME kind of tacky both that we know to add this here and del it below
    # 'tot_tours' is used in model_spec expressions
    alternatives['tot_tours'] = alternatives.sum(axis=1)

    # - preprocessor
    preprocessor_settings = model_settings.get('preprocessor', None)
    if preprocessor_settings:

        locals_dict = {
            'person_max_window': person_max_window
        }

        expressions.assign_columns(
            df=choosers,
            model_settings=preprocessor_settings,
            locals_dict=locals_dict,
            trace_label=trace_label)

    # filter based on results of CDAP
    choosers = choosers[choosers.cdap_activity.isin(['M', 'N'])]

    logger.info("Running non_mandatory_tour_frequency with %d persons", len(choosers))

    constants = config.get_model_constants(model_settings)

    choices_list = []
    # segment by person type and pick the right spec for each person type
    for ptype, segment in choosers.groupby('ptype'):

        name = PTYPE_NAME[ptype]

        # pick the spec column for the segment
        spec = model_spec[[name]]

        # drop any zero-valued rows
        spec = spec[spec[name] != 0]

        logger.info("Running segment '%s' of size %d", name, len(segment))

        choices = interaction_simulate(
            segment,
            alternatives,
            spec=spec,
            locals_d=constants,
            chunk_size=chunk_size,
            trace_label='non_mandatory_tour_frequency.%s' % name,
            trace_choice_name='non_mandatory_tour_frequency')

        choices_list.append(choices)

        # FIXME - force garbage collection?
        # force_garbage_collect()

    choices = pd.concat(choices_list)

    del alternatives['tot_tours']  # del tot_tours column we added above

    # - add non_mandatory_tour_frequency column to persons
    persons = persons.to_frame()
    # need to reindex as we only handled persons with cdap_activity in ['M', 'N']
    # (we expect there to be an alt with no tours - which we can use to backfill non-travelers;
    # select the index label of the all-zero alternative, not just the first index label)
    no_tours_alt = alternatives.index[alternatives.sum(axis=1) == 0][0]
    persons['non_mandatory_tour_frequency'] = \
        choices.reindex(persons.index).fillna(no_tours_alt).astype(np.int8)

    """
    We have now generated non-mandatory tours, but they are attributes of the person table
    Now we create a "tours" table which has one row per tour that has been generated
    (and the person id it is associated with)
    """

    # - get counts of each of the alternatives (so we can extend)
    # (choices is just the index values for the chosen alts)
    """
               escort  shopping  othmaint  othdiscr  eatout  social
    parent_id
    2588676         2         0         0         1       1       0
    2588677         0         1         0         1       0       0
    """
    tour_counts = alternatives.loc[choices]
    tour_counts.index = choices.index  # assign person ids to the index

    prev_tour_count = tour_counts.sum().sum()

    # - extend_tour_counts
    tour_counts = extend_tour_counts(choosers, tour_counts, alternatives,
                                     trace_hh_id,
                                     tracing.extend_trace_label(trace_label,
                                                                'extend_tour_counts'))

    extended_tour_count = tour_counts.sum().sum()
    logger.info("extend_tour_counts increased nmtf tour count by %s from %s to %s" %
                (extended_tour_count - prev_tour_count,
                 prev_tour_count, extended_tour_count))

    # - create the non_mandatory tours
    non_mandatory_tours = process_non_mandatory_tours(persons, tour_counts)
    assert len(non_mandatory_tours) == extended_tour_count

    pipeline.extend_table("tours", non_mandatory_tours)

    tracing.register_traceable_table('tours', non_mandatory_tours)
    pipeline.get_rn_generator().add_channel('tours', non_mandatory_tours)

    expressions.assign_columns(
        df=persons,
        model_settings=model_settings.get('annotate_persons'),
        trace_label=trace_label)

    pipeline.replace_table("persons", persons)

    tracing.print_summary('non_mandatory_tour_frequency',
                          persons.non_mandatory_tour_frequency, value_counts=True)

    if trace_hh_id:
        tracing.trace_df(non_mandatory_tours,
                         label="non_mandatory_tour_frequency.non_mandatory_tours",
                         warn_if_empty=True)

        tracing.trace_df(choosers,
                         label="non_mandatory_tour_frequency.choosers",
                         warn_if_empty=True)

        tracing.trace_df(persons,
                         label="non_mandatory_tour_frequency.annotated_persons",
                         warn_if_empty=True)
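# choices hold the labels of the chosen alternatives, so .loc expands each
# person's choice into per-tour-type counts, re-keyed by person_id, as done
# above. Toy sketch with made-up alternatives (illustration only):
import pandas as pd

alternatives = pd.DataFrame({'escort': [0, 2], 'shopping': [0, 1]},
                            index=pd.Index([0, 1], name='alt'))
choices = pd.Series([1, 0, 1], index=pd.Index([5, 6, 7], name='person_id'))

tour_counts = alternatives.loc[choices]
tour_counts.index = choices.index  # assign person ids to the index
assert tour_counts.loc[5, 'escort'] == 2
assert tour_counts.sum().sum() == 6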
def atwork_subtour_mode_choice(
        tours,
        persons_merged,
        skim_dict, skim_stack,
        chunk_size,
        trace_hh_id):
    """
    At-work subtour mode choice simulate
    """

    trace_label = 'atwork_subtour_mode_choice'

    model_settings = config.read_model_settings('tour_mode_choice.yaml')

    spec = tour_mode_choice_spec(model_settings)

    tours = tours.to_frame()
    subtours = tours[tours.tour_category == 'atwork']

    # - if no atwork subtours
    if subtours.shape[0] == 0:
        tracing.no_results(trace_label)
        return

    subtours_merged = \
        pd.merge(subtours, persons_merged.to_frame(),
                 left_on='person_id', right_index=True, how='left')

    nest_spec = config.get_logit_model_settings(model_settings)
    constants = config.get_model_constants(model_settings)

    logger.info("Running %s with %d subtours" % (trace_label, subtours_merged.shape[0]))

    tracing.print_summary('%s tour_type' % trace_label,
                          subtours_merged.tour_type, value_counts=True)

    # setup skim keys
    orig_col_name = 'workplace_taz'
    dest_col_name = 'destination'
    out_time_col_name = 'start'
    in_time_col_name = 'end'
    odt_skim_stack_wrapper = skim_stack.wrap(left_key=orig_col_name, right_key=dest_col_name,
                                             skim_key='out_period')
    dot_skim_stack_wrapper = skim_stack.wrap(left_key=dest_col_name, right_key=orig_col_name,
                                             skim_key='in_period')
    od_skim_stack_wrapper = skim_dict.wrap(orig_col_name, dest_col_name)

    skims = {
        "odt_skims": odt_skim_stack_wrapper,
        "dot_skims": dot_skim_stack_wrapper,
        "od_skims": od_skim_stack_wrapper,
        'orig_col_name': orig_col_name,
        'dest_col_name': dest_col_name,
        'out_time_col_name': out_time_col_name,
        'in_time_col_name': in_time_col_name
    }

    choices = run_tour_mode_choice_simulate(
        subtours_merged,
        spec, tour_purpose='atwork', model_settings=model_settings,
        skims=skims,
        constants=constants,
        nest_spec=nest_spec,
        chunk_size=chunk_size,
        trace_label=trace_label,
        trace_choice_name='tour_mode_choice')

    tracing.print_summary('%s choices' % trace_label, choices, value_counts=True)

    assign_in_place(tours, choices.to_frame('tour_mode'))

    pipeline.replace_table("tours", tours)

    if trace_hh_id:
        tracing.trace_df(tours[tours.tour_category == 'atwork'],
                         label=tracing.extend_trace_label(trace_label, 'tour_mode'),
                         slicer='tour_id',
                         index_label='tour_id')

    force_garbage_collect()
def tour_mode_choice_simulate(tours, persons_merged,
                              network_los,
                              chunk_size,
                              trace_hh_id):
    """
    Tour mode choice simulate
    """
    trace_label = 'tour_mode_choice'
    model_settings_file_name = 'tour_mode_choice.yaml'
    model_settings = config.read_model_settings(model_settings_file_name)

    logsum_column_name = model_settings.get('MODE_CHOICE_LOGSUM_COLUMN_NAME')
    mode_column_name = 'tour_mode'  # FIXME - should be passed in?

    primary_tours = tours.to_frame()
    assert not (primary_tours.tour_category == 'atwork').any()

    logger.info("Running %s with %d tours" % (trace_label, primary_tours.shape[0]))

    tracing.print_summary('tour_types',
                          primary_tours.tour_type, value_counts=True)

    persons_merged = persons_merged.to_frame()
    primary_tours_merged = pd.merge(primary_tours, persons_merged, left_on='person_id',
                                    right_index=True, how='left', suffixes=('', '_r'))

    constants = {}
    # model_constants can appear in expressions
    constants.update(config.get_model_constants(model_settings))

    skim_dict = network_los.get_default_skim_dict()

    # setup skim keys
    orig_col_name = 'home_zone_id'
    dest_col_name = 'destination'

    out_time_col_name = 'start'
    in_time_col_name = 'end'
    odt_skim_stack_wrapper = skim_dict.wrap_3d(orig_key=orig_col_name, dest_key=dest_col_name,
                                               dim3_key='out_period')
    dot_skim_stack_wrapper = skim_dict.wrap_3d(orig_key=dest_col_name, dest_key=orig_col_name,
                                               dim3_key='in_period')
    odr_skim_stack_wrapper = skim_dict.wrap_3d(orig_key=orig_col_name, dest_key=dest_col_name,
                                               dim3_key='in_period')
    dor_skim_stack_wrapper = skim_dict.wrap_3d(orig_key=dest_col_name, dest_key=orig_col_name,
                                               dim3_key='out_period')
    od_skim_stack_wrapper = skim_dict.wrap(orig_col_name, dest_col_name)

    skims = {
        "odt_skims": odt_skim_stack_wrapper,
        "dot_skims": dot_skim_stack_wrapper,
        "odr_skims": odr_skim_stack_wrapper,  # dot return skims for e.g. TNC bridge return fare
        "dor_skims": dor_skim_stack_wrapper,  # odt return skims for e.g. TNC bridge return fare
        "od_skims": od_skim_stack_wrapper,
        'orig_col_name': orig_col_name,
        'dest_col_name': dest_col_name,
        'out_time_col_name': out_time_col_name,
        'in_time_col_name': in_time_col_name
    }

    if network_los.zone_system == los.THREE_ZONE:
        # fixme - is this a lightweight object?
        tvpb = network_los.tvpb

        tvpb_logsum_odt = tvpb.wrap_logsum(orig_key=orig_col_name, dest_key=dest_col_name,
                                           tod_key='out_period',
                                           segment_key='demographic_segment',
                                           cache_choices=True,
                                           trace_label=trace_label, tag='tvpb_logsum_odt')
        tvpb_logsum_dot = tvpb.wrap_logsum(orig_key=dest_col_name, dest_key=orig_col_name,
                                           tod_key='in_period',
                                           segment_key='demographic_segment',
                                           cache_choices=True,
                                           trace_label=trace_label, tag='tvpb_logsum_dot')

        skims.update({
            'tvpb_logsum_odt': tvpb_logsum_odt,
            'tvpb_logsum_dot': tvpb_logsum_dot
        })

        # TVPB constants can appear in expressions
        constants.update(network_los.setting('TVPB_SETTINGS.tour_mode_choice.CONSTANTS'))

    estimator = estimation.manager.begin_estimation('tour_mode_choice')
    if estimator:
        estimator.write_coefficients(simulate.read_model_coefficients(model_settings))
        estimator.write_coefficients_template(
            simulate.read_model_coefficient_template(model_settings))
        estimator.write_spec(model_settings)
        estimator.write_model_settings(model_settings, model_settings_file_name)
        # (run_tour_mode_choice_simulate writes choosers post-annotation)

    # FIXME should normalize handling of tour_type and tour_purpose
    # mtctm1 school tour_type includes univ, which has different coefficients from elementary and HS
    # we should either add this column when tours created or add univ to tour_types
    not_university = (primary_tours_merged.tour_type != 'school') | \
        ~primary_tours_merged.is_university
    primary_tours_merged['tour_purpose'] = \
        primary_tours_merged.tour_type.where(not_university, 'univ')

    choices_list = []
    for tour_purpose, tours_segment in primary_tours_merged.groupby('tour_purpose'):

        logger.info("tour_mode_choice_simulate tour_type '%s' (%s tours)" %
                    (tour_purpose, len(tours_segment.index), ))

        if network_los.zone_system == los.THREE_ZONE:
            tvpb_logsum_odt.extend_trace_label(tour_purpose)
            tvpb_logsum_dot.extend_trace_label(tour_purpose)

        # name index so tracing knows how to slice
        assert tours_segment.index.name == 'tour_id'

        choices_df = run_tour_mode_choice_simulate(
            tours_segment,
            tour_purpose, model_settings,
            mode_column_name=mode_column_name,
            logsum_column_name=logsum_column_name,
            network_los=network_los,
            skims=skims,
            constants=constants,
            estimator=estimator,
            chunk_size=chunk_size,
            trace_label=tracing.extend_trace_label(trace_label, tour_purpose),
            trace_choice_name='tour_mode_choice')

        tracing.print_summary('tour_mode_choice_simulate %s choices_df' % tour_purpose,
                              choices_df.tour_mode, value_counts=True)

        choices_list.append(choices_df)

        # FIXME - force garbage collection
        force_garbage_collect()

    choices_df = pd.concat(choices_list)

    # add cached tvpb_logsum tap choices for modes specified in tvpb_mode_path_types
    if network_los.zone_system == los.THREE_ZONE:

        tvpb_mode_path_types = model_settings.get('tvpb_mode_path_types')
        for mode, path_types in tvpb_mode_path_types.items():
            for direction, skim in zip(['od', 'do'], [tvpb_logsum_odt, tvpb_logsum_dot]):

                path_type = path_types[direction]
                skim_cache = skim.cache[path_type]

                logger.info("adding cached tap choices for mode %s direction %s path_type %s" %
                            (mode, direction, path_type))

                for c in skim_cache:
                    dest_col = f'{direction}_{c}'
                    if dest_col not in choices_df:
                        choices_df[dest_col] = \
                            0 if pd.api.types.is_numeric_dtype(skim_cache[c]) else ''
                    choices_df[dest_col].where(choices_df.tour_mode != mode,
                                               skim_cache[c], inplace=True)

    if estimator:
        estimator.write_choices(choices_df.tour_mode)
        choices_df.tour_mode = estimator.get_survey_values(choices_df.tour_mode,
                                                           'tours', 'tour_mode')
        estimator.write_override_choices(choices_df.tour_mode)
        estimator.end_estimation()

    tracing.print_summary('tour_mode_choice_simulate all tour type choices',
                          choices_df.tour_mode, value_counts=True)

    # so we can trace with annotations
    assign_in_place(primary_tours, choices_df)

    # update tours table with mode choice (and optionally logsums)
    all_tours = tours.to_frame()
    assign_in_place(all_tours, choices_df)

    pipeline.replace_table("tours", all_tours)

    if trace_hh_id:
        tracing.trace_df(primary_tours,
                         label=tracing.extend_trace_label(trace_label, mode_column_name),
                         slicer='tour_id',
                         index_label='tour_id',
                         warn_if_empty=True)
def meta_control_factoring(settings, control_spec, incidence_table):
    """
    Apply simple factoring to summed household fractional weights based on
    original meta control values relative to summed household fractional
    weights by meta zone.

    The resulting factored meta control weights will be new meta controls,
    appended as additional columns to the seed control table, for final
    balancing.

    Parameters
    ----------
    settings : dict (settings.yaml as dict)
    control_spec : pipeline table
    incidence_table : pipeline table

    Returns
    -------
    """

    # FIXME - if there is only one seed zone in the meta zone, just copy meta control values?

    incidence_df = incidence_table.to_frame()
    control_spec = control_spec.to_frame()

    geographies = settings.get('geographies')
    seed_geography = settings.get('seed_geography')
    meta_geography = geographies[0]

    # - if there are no meta controls, then we don't have to do anything
    if not (control_spec.geography == meta_geography).any():
        logger.warning("meta_control_factoring: no meta targets so nothing to do")
        return

    meta_controls_df = get_control_table(meta_geography)
    dump_table("meta_controls_df", meta_controls_df)

    # slice control_spec to select only the rows for meta level controls
    meta_controls_spec = control_spec[control_spec.geography == meta_geography]
    meta_control_targets = meta_controls_spec['target']
    logger.info("meta_control_factoring %s targets" % len(meta_control_targets))

    dump_table("meta_controls_spec", meta_controls_spec)
    dump_table("meta_control_targets", meta_control_targets)

    # seed level weights of all households (rows aligned with incidence_df rows)
    seed_weights_df = get_weight_table(seed_geography)
    assert len(incidence_df.index) == len(seed_weights_df.index)

    # expand person weights by incidence (incidence will simply be 1 for household targets)
    hh_level_weights = incidence_df[[seed_geography, meta_geography]].copy()
    for target in meta_control_targets:
        hh_level_weights[target] = \
            incidence_df[target] * seed_weights_df['preliminary_balanced_weight']

    dump_table("hh_level_weights", hh_level_weights)

    # weights of meta targets at seed level
    factored_seed_weights = \
        hh_level_weights.groupby([seed_geography, meta_geography], as_index=False).sum()
    factored_seed_weights.set_index(seed_geography, inplace=True)
    dump_table("factored_seed_weights", factored_seed_weights)

    # weights of meta targets summed from seed level to meta level
    factored_meta_weights = factored_seed_weights.groupby(meta_geography, as_index=True).sum()
    dump_table("factored_meta_weights", factored_meta_weights)

    # only the meta level controls from meta_controls table
    meta_controls_df = meta_controls_df[meta_control_targets]
    dump_table("meta_controls_df", meta_controls_df)

    # compute the scaling factors to be applied to the seed-level totals:
    meta_factors = pd.DataFrame(index=meta_controls_df.index)
    for target in meta_control_targets:
        meta_factors[target] = meta_controls_df[target] / factored_meta_weights[target]
    dump_table("meta_factors", meta_factors)

    # compute seed-level controls from meta-level controls
    seed_level_meta_controls = pd.DataFrame(index=factored_seed_weights.index)
    for target in meta_control_targets:
        # meta level scaling_factor for this meta_control
        scaling_factor = factored_seed_weights[meta_geography].map(meta_factors[target])

        # scale the seed_level_meta_controls by meta_level scaling_factor
        seed_level_meta_controls[target] = factored_seed_weights[target] * scaling_factor

        # FIXME - why round scaled factored seed_weights to int prior to final seed balancing?
        seed_level_meta_controls[target] = seed_level_meta_controls[target].round().astype(int)

    dump_table("seed_level_meta_controls", seed_level_meta_controls)

    # create final balancing controls
    # add newly created seed_level_meta_controls to the existing set of seed level controls
    seed_controls_df = get_control_table(seed_geography)

    assert len(seed_controls_df.index) == len(seed_level_meta_controls.index)
    seed_controls_df = pd.concat([seed_controls_df, seed_level_meta_controls], axis=1)

    # ensure columns are in right order for orca-extended table
    seed_controls_df = seed_controls_df[control_spec.target]
    assert (seed_controls_df.columns == control_spec.target).all()
    dump_table("seed_controls_df", seed_controls_df)

    pipeline.replace_table(control_table_name(seed_geography), seed_controls_df)
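# The factoring above boils down to: factor = meta_target / summed_seed_weight,
# mapped onto each seed zone through its meta zone. Toy sketch with made-up
# zones and a single target (illustration only):
import pandas as pd

factored_seed_weights = pd.DataFrame(
    {'meta_zone': [1, 1, 2], 'hh_target': [40.0, 60.0, 50.0]},
    index=pd.Index([10, 11, 12], name='seed_zone'))

meta_targets = pd.Series({1: 120.0, 2: 25.0})  # meta-level control totals
meta_sums = factored_seed_weights.groupby('meta_zone')['hh_target'].sum()
factors = meta_targets / meta_sums  # 1.2 for meta zone 1, 0.5 for meta zone 2

scaling = factored_seed_weights['meta_zone'].map(factors)
seed_controls = (factored_seed_weights['hh_target'] * scaling).round().astype(int)
assert seed_controls.tolist() == [48, 72, 25]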
def trip_scheduling(
        trips,
        tours,
        chunk_size,
        trace_hh_id):

    """
    Trip scheduling assigns depart times for trips within the start, end limits of the tour.

    The algorithm is simplistic:

    The first outbound trip starts at the tour start time, and subsequent outbound trips are
    processed in trip_num order, to ensure that subsequent trips do not depart before the
    trip that precedes them.

    Inbound trips are handled similarly, except in reverse order, starting with the last trip,
    and working backwards to ensure that inbound trips do not depart after the trip that
    succeeds them.

    The probability spec assigns probabilities for depart times, but those possible departs must
    be clipped to disallow depart times outside the tour limits, the departs of prior trips, and
    in the case of work tours, the start/end times of any atwork subtours.

    Scheduling can fail if the probability table assigns zero probabilities to all the available
    depart times in a trip's depart window. (This could be avoided by giving every window a small
    probability, rather than zero, but the existing mtctm1 prob spec does not do this. I believe
    this is due to its having been generated from a small household travel survey sample that
    lacked any departs for some time periods.)

    Rescheduling the trips that fail (along with their inbound or outbound leg-mates) can
    sometimes fix this problem, if it was caused by an earlier trip's depart choice blocking
    a subsequent trip's ability to schedule a depart within the resulting window. But it can
    also happen if a tour is very short (e.g. one time period) and the prob spec assigns a
    zero probability for that tour hour.

    Therefore we need to handle trips that could not be scheduled. There are two ways
    (at least) to solve this problem:

    1) CHOOSE_MOST_INITIAL
    simply assign a depart time to the trip, even if it has a zero probability. It makes
    most sense, in this case, to assign the 'most initial' depart time, so that subsequent
    trips are minimally impacted. This can be done in the final iteration, thus affecting
    only the trips that could not be scheduled by the standard approach

    2) drop_and_cleanup
    drop trips that could not be scheduled, and adjust their leg mates, as is done for failed
    trips in trip_destination.

    For now we are choosing among these approaches with a manifest constant, but this could
    be made a model setting...
    """
    trace_label = "trip_scheduling"

    model_settings = config.read_model_settings('trip_scheduling.yaml')
    assert 'DEPART_ALT_BASE' in model_settings
    failfix = model_settings.get(FAILFIX, FAILFIX_DEFAULT)

    probs_spec = pd.read_csv(
        config.config_file_path('trip_scheduling_probs.csv'), comment='#')

    trips_df = trips.to_frame()
    tours = tours.to_frame()

    # add tour-based chunk_id so we can chunk all trips in tour together
    trips_df['chunk_id'] = \
        reindex(pd.Series(list(range(tours.shape[0])), tours.index), trips_df.tour_id)

    max_iterations = model_settings.get('MAX_ITERATIONS', 1)
    assert max_iterations > 0

    choices_list = []
    i = 0
    while (i < max_iterations) and not trips_df.empty:

        i += 1
        last_iteration = (i == max_iterations)

        trace_label_i = tracing.extend_trace_label(trace_label, "i%s" % i)
        logger.info("%s scheduling %s trips", trace_label_i, trips_df.shape[0])

        choices = \
            run_trip_scheduling(
                trips_df,
                tours,
                probs_spec,
                model_settings,
                last_iteration=last_iteration,
                trace_hh_id=trace_hh_id,
                chunk_size=chunk_size,
                trace_label=trace_label_i)

        # boolean series of trips whose individual trip scheduling failed
        failed = choices.reindex(trips_df.index).isnull()
        logger.info("%s %s failed", trace_label_i, failed.sum())

        if not last_iteration:
            # boolean series of trips whose leg scheduling failed
            failed_cohorts = failed_trip_cohorts(trips_df, failed)
            trips_df = trips_df[failed_cohorts]
            choices = choices[~failed_cohorts]

        choices_list.append(choices)

    trips_df = trips.to_frame()

    choices = pd.concat(choices_list)
    choices = choices.reindex(trips_df.index)
    if choices.isnull().any():
        logger.warning("%s of %s trips could not be scheduled after %s iterations" %
                       (choices.isnull().sum(), trips_df.shape[0], i))

        if failfix != FAILFIX_DROP_AND_CLEANUP:
            raise RuntimeError("%s setting '%s' not enabled in settings" %
                               (FAILFIX, FAILFIX_DROP_AND_CLEANUP))

        trips_df['failed'] = choices.isnull()
        trips_df = cleanup_failed_trips(trips_df)
        choices = choices.reindex(trips_df.index)

    trips_df['depart'] = choices

    assert not trips_df.depart.isnull().any()

    pipeline.replace_table("trips", trips_df)
def stop_frequency( tours, tours_merged, stop_frequency_alts, skim_dict, chunk_size, trace_hh_id): """ stop frequency model For each tour, choose a number of intermediate inbound stops and outbound stops. Create a trip table with inbound and outbound trips. Thus, a tour with stop_frequency '2out_0in' will have two outbound and zero inbound stops, and four corresponding trips: three outbound, and one inbound. Adds stop_frequency str column to tours table. Creates trips table with columns: :: - person_id - household_id - tour_id - primary_purpose - atwork - trip_num - outbound - trip_count """ trace_label = 'stop_frequency' model_settings = config.read_model_settings('stop_frequency.yaml') tours = tours.to_frame() tours_merged = tours_merged.to_frame() assert not tours_merged.household_id.isnull().any() assert not (tours_merged.origin == -1).any() assert not (tours_merged.destination == -1).any() nest_spec = config.get_logit_model_settings(model_settings) constants = config.get_model_constants(model_settings) # - run preprocessor to annotate tours_merged preprocessor_settings = model_settings.get('preprocessor', None) if preprocessor_settings: # hack: preprocessor adds origin column in place if it does not exist already od_skim_stack_wrapper = skim_dict.wrap('origin', 'destination') skims = [od_skim_stack_wrapper] locals_dict = { "od_skims": od_skim_stack_wrapper } if constants is not None: locals_dict.update(constants) simulate.set_skim_wrapper_targets(tours_merged, skims) # this should be pre-slice as some expressions may count tours by type annotations = expressions.compute_columns( df=tours_merged, model_settings=preprocessor_settings, locals_dict=locals_dict, trace_label=trace_label) assign_in_place(tours_merged, annotations) tracing.print_summary('stop_frequency segments', tours_merged.primary_purpose, value_counts=True) choices_list = [] for segment_type, choosers in tours_merged.groupby('primary_purpose'): logger.info("%s running segment %s with %s chooser rows" % (trace_label, segment_type, choosers.shape[0])) spec = simulate.read_model_spec(file_name='stop_frequency_%s.csv' % segment_type) assert spec is not None, "spec for segment_type %s not found" % segment_type choices = simulate.simple_simulate( choosers=choosers, spec=spec, nest_spec=nest_spec, locals_d=constants, chunk_size=chunk_size, trace_label=tracing.extend_trace_label(trace_label, segment_type), trace_choice_name='stops') # convert indexes to alternative names choices = pd.Series(spec.columns[choices.values], index=choices.index) choices_list.append(choices) choices = pd.concat(choices_list) tracing.print_summary('stop_frequency', choices, value_counts=True) # add stop_frequency choices to tours table assign_in_place(tours, choices.to_frame('stop_frequency')) if 'primary_purpose' not in tours.columns: assign_in_place(tours, tours_merged[['primary_purpose']]) pipeline.replace_table("tours", tours) # create trips table trips = process_trips(tours, stop_frequency_alts) trips = pipeline.extend_table("trips", trips) tracing.register_traceable_table('trips', trips) pipeline.get_rn_generator().add_channel('trips', trips) if trace_hh_id: tracing.trace_df(tours, label="stop_frequency.tours", slicer='person_id', columns=None) tracing.trace_df(trips, label="stop_frequency.trips", slicer='person_id', columns=None) if preprocessor_settings: # annotations exist only if the preprocessor ran tracing.trace_df(annotations, label="stop_frequency.annotations", columns=None) tracing.trace_df(tours_merged, label="stop_frequency.tours_merged", slicer='person_id', columns=None)
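# A stop_frequency alternative name encodes the intermediate stops per leg, and the
# docstring's '2out_0in' -> 4 trips arithmetic follows directly from it: each leg
# gets stops + 1 trips. A minimal sketch of that decoding (the real mapping comes
# from the stop_frequency_alts table, not from parsing the alternative name):

import re


def decode_stop_frequency(alt):
    """Return (outbound_trips, inbound_trips) for an alt name like '2out_0in'."""
    match = re.fullmatch(r'(\d+)out_(\d+)in', alt)
    assert match, "unexpected stop_frequency alt name %s" % alt
    out_stops, in_stops = int(match.group(1)), int(match.group(2))
    return out_stops + 1, in_stops + 1  # n intermediate stops -> n + 1 trips


assert decode_stop_frequency('2out_0in') == (3, 1)  # four trips in total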
def cdap_simulate(persons_merged, persons, households, cdap_indiv_spec, cdap_interaction_coefficients, cdap_fixed_relative_proportions, chunk_size, trace_hh_id): """ CDAP stands for Coordinated Daily Activity Pattern, which is a choice of high-level activity pattern for each person, in a coordinated way with other members of a person's household. Because Python requires vectorization of computation, there are some specialized routines in the cdap directory of activitysim for this purpose. This module simply applies those utilities using the simulation framework. """ trace_label = 'cdap' model_settings = config.read_model_settings('cdap.yaml') persons_merged = persons_merged.to_frame() constants = config.get_model_constants(model_settings) cdap_interaction_coefficients = \ cdap.preprocess_interaction_coefficients(cdap_interaction_coefficients) # specs are built just-in-time on demand and cached as injectables # prebuilding here allows us to write them to the output directory # (also when multiprocessing locutor might not see all household sizes) logger.info("Pre-building cdap specs") for hhsize in range(2, cdap.MAX_HHSIZE + 1): spec = cdap.build_cdap_spec(cdap_interaction_coefficients, hhsize, cache=True) if inject.get_injectable('locutor', False): spec.to_csv(config.output_file_path('cdap_spec_%s.csv' % hhsize), index=True) logger.info("Running cdap_simulate with %d persons", len(persons_merged.index)) choices = cdap.run_cdap( persons=persons_merged, cdap_indiv_spec=cdap_indiv_spec, cdap_interaction_coefficients=cdap_interaction_coefficients, cdap_fixed_relative_proportions=cdap_fixed_relative_proportions, locals_d=constants, chunk_size=chunk_size, trace_hh_id=trace_hh_id, trace_label=trace_label) # - assign results to persons table and annotate persons = persons.to_frame() choices = choices.reindex(persons.index) persons['cdap_activity'] = choices.cdap_activity persons['cdap_rank'] = choices.cdap_rank expressions.assign_columns( df=persons, model_settings=model_settings.get('annotate_persons'), trace_label=tracing.extend_trace_label(trace_label, 'annotate_persons')) pipeline.replace_table("persons", persons) # - annotate households table households = households.to_frame() expressions.assign_columns( df=households, model_settings=model_settings.get('annotate_households'), trace_label=tracing.extend_trace_label(trace_label, 'annotate_households')) pipeline.replace_table("households", households) tracing.print_summary('cdap_activity', persons.cdap_activity, value_counts=True) logger.info("cdap crosstabs:\n%s" % pd.crosstab(persons.ptype, persons.cdap_activity, margins=True)) if trace_hh_id: tracing.trace_df(inject.get_table('persons_merged').to_frame(), label="cdap", columns=['ptype', 'cdap_rank', 'cdap_activity'], warn_if_empty=True)
def cdap_simulate(persons_merged, persons, households, chunk_size, trace_hh_id): """ CDAP stands for Coordinated Daily Activity Pattern, which is a choice of high-level activity pattern for each person, in a coordinated way with other members of a person's household. Because Python requires vectorization of computation, there are some specialized routines in the cdap directory of activitysim for this purpose. This module simply applies those utilities using the simulation framework. """ trace_label = 'cdap' model_settings = config.read_model_settings('cdap.yaml') person_type_map = model_settings.get('PERSON_TYPE_MAP', {}) cdap_indiv_spec = simulate.read_model_spec( file_name=model_settings['INDIV_AND_HHSIZE1_SPEC']) # Rules and coefficients for generating interaction specs for different household sizes cdap_interaction_coefficients = \ pd.read_csv(config.config_file_path('cdap_interaction_coefficients.csv'), comment='#') """ spec to compute/specify the relative proportions of each activity (M, N, H) that should be used to choose activities for additional household members not handled by CDAP This spec is handled much like an activitysim logit utility spec, EXCEPT that the values computed are relative proportions, not utilities (i.e. values are not exponentiated before being normalized to probabilities summing to 1.0) """ cdap_fixed_relative_proportions = \ simulate.read_model_spec(file_name=model_settings['FIXED_RELATIVE_PROPORTIONS_SPEC']) persons_merged = persons_merged.to_frame() # add household-based chunk_id so we can chunk all persons in a household together assert 'chunk_id' not in persons_merged.columns unique_household_ids = persons_merged.household_id.unique() household_chunk_ids = pd.Series(range(len(unique_household_ids)), index=unique_household_ids) persons_merged['chunk_id'] = reindex(household_chunk_ids, persons_merged.household_id) constants = config.get_model_constants(model_settings) cdap_interaction_coefficients = \ cdap.preprocess_interaction_coefficients(cdap_interaction_coefficients) # specs are built just-in-time on demand and cached as injectables # prebuilding here allows us to write them to the output directory # (also when multiprocessing locutor might not see all household sizes) logger.info("Pre-building cdap specs") for hhsize in range(2, cdap.MAX_HHSIZE + 1): spec = cdap.build_cdap_spec(cdap_interaction_coefficients, hhsize, cache=True) if inject.get_injectable('locutor', False): spec.to_csv(config.output_file_path('cdap_spec_%s.csv' % hhsize), index=True) estimator = estimation.manager.begin_estimation('cdap') if estimator: estimator.write_model_settings(model_settings, 'cdap.yaml') estimator.write_spec(model_settings, tag='INDIV_AND_HHSIZE1_SPEC') estimator.write_spec(model_settings=model_settings, tag='FIXED_RELATIVE_PROPORTIONS_SPEC') estimator.write_table(cdap_interaction_coefficients, 'interaction_coefficients', index=False, append=False) estimator.write_choosers(persons_merged) for hhsize in range(2, cdap.MAX_HHSIZE + 1): spec = cdap.get_cached_spec(hhsize) estimator.write_table(spec, 'spec_%s' % hhsize, append=False) logger.info("Running cdap_simulate with %d persons", len(persons_merged.index)) choices = cdap.run_cdap( persons=persons_merged, person_type_map=person_type_map, cdap_indiv_spec=cdap_indiv_spec, cdap_interaction_coefficients=cdap_interaction_coefficients, cdap_fixed_relative_proportions=cdap_fixed_relative_proportions, locals_d=constants, chunk_size=chunk_size, trace_hh_id=trace_hh_id, trace_label=trace_label) if estimator:
estimator.write_choices(choices) choices = estimator.get_survey_values(choices, 'persons', 'cdap_activity') estimator.write_override_choices(choices) estimator.end_estimation() # - assign results to persons table and annotate persons = persons.to_frame() choices = choices.reindex(persons.index) persons['cdap_activity'] = choices expressions.assign_columns( df=persons, model_settings=model_settings.get('annotate_persons'), trace_label=tracing.extend_trace_label(trace_label, 'annotate_persons')) pipeline.replace_table("persons", persons) # - annotate households table households = households.to_frame() expressions.assign_columns( df=households, model_settings=model_settings.get('annotate_households'), trace_label=tracing.extend_trace_label(trace_label, 'annotate_households')) pipeline.replace_table("households", households) tracing.print_summary('cdap_activity', persons.cdap_activity, value_counts=True) logger.info( "cdap crosstabs:\n%s" % pd.crosstab(persons.ptype, persons.cdap_activity, margins=True))
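# Both cdap_simulate variants lean on the reindex-based chunk_id pattern: build a
# dense integer id per household, then broadcast it onto persons so chunking keeps
# household members together. A minimal standalone sketch with toy data (plain
# pandas reindex here stands in for activitysim's util reindex helper):

import pandas as pd

persons = pd.DataFrame({'household_id': [1, 1, 2, 3, 3, 3]},
                       index=pd.Index([10, 11, 12, 13, 14, 15], name='person_id'))

unique_hh = persons.household_id.unique()
hh_chunk_ids = pd.Series(range(len(unique_hh)), index=unique_hh)

# broadcast the household chunk_id to every person in that household
persons['chunk_id'] = hh_chunk_ids.reindex(persons.household_id).values

print(persons)  # persons in the same household share a chunk_id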
def iterate_location_choice( model_settings, persons_merged, persons, households, skim_dict, skim_stack, estimator, chunk_size, trace_hh_id, locutor, trace_label): """ iterate run_location_choice updating shadow pricing until convergence criteria satisfied or max_iterations reached. (If use_shadow_pricing not enabled, then just iterate once) Parameters ---------- model_settings : dict persons_merged : injected table persons : injected table households : injected table skim_dict : skim.SkimDict skim_stack : skim.SkimStack chunk_size : int trace_hh_id : int locutor : bool whether this process is the privileged logger of shadow_pricing when multiprocessing trace_label : str Returns ------- adds choice column model_settings['DEST_CHOICE_COLUMN_NAME'] adds logsum column model_settings['DEST_CHOICE_LOGSUM_COLUMN_NAME'] - if provided adds annotations to persons table """ # column containing segment id chooser_segment_column = model_settings['CHOOSER_SEGMENT_COLUMN_NAME'] # boolean to filter out persons not needing location modeling (e.g. is_worker, is_student) chooser_filter_column = model_settings['CHOOSER_FILTER_COLUMN_NAME'] dest_choice_column_name = model_settings['DEST_CHOICE_COLUMN_NAME'] logsum_column_name = model_settings.get('DEST_CHOICE_LOGSUM_COLUMN_NAME') sample_table_name = model_settings.get('DEST_CHOICE_SAMPLE_TABLE_NAME') want_sample_table = config.setting('want_dest_choice_sample_tables') and sample_table_name is not None persons_merged_df = persons_merged.to_frame() persons_merged_df = persons_merged_df[persons_merged[chooser_filter_column]] persons_merged_df.sort_index(inplace=True) # interaction_sample expects chooser index to be monotonic increasing spc = shadow_pricing.load_shadow_price_calculator(model_settings) max_iterations = spc.max_iterations assert not (spc.use_shadow_pricing and estimator) logger.debug("%s max_iterations: %s" % (trace_label, max_iterations)) for iteration in range(1, max_iterations + 1): if spc.use_shadow_pricing and iteration > 1: spc.update_shadow_prices() choices_df, save_sample_df = run_location_choice( persons_merged_df, skim_dict, skim_stack, shadow_price_calculator=spc, want_logsums=logsum_column_name is not None, want_sample_table=want_sample_table, estimator=estimator, model_settings=model_settings, chunk_size=chunk_size, trace_hh_id=trace_hh_id, trace_label=tracing.extend_trace_label(trace_label, 'i%s' % iteration)) # choices_df is a pandas DataFrame with columns 'choice' and (optionally) 'logsum' if choices_df is None: break spc.set_choices( choices=choices_df['choice'], segment_ids=persons_merged_df[chooser_segment_column].reindex(choices_df.index)) if locutor: spc.write_trace_files(iteration) if spc.use_shadow_pricing and spc.check_fit(iteration): logger.info("%s converged after iteration %s" % (trace_label, iteration,)) break # - shadow price table if locutor: if spc.use_shadow_pricing and 'SHADOW_PRICE_TABLE' in model_settings: inject.add_table(model_settings['SHADOW_PRICE_TABLE'], spc.shadow_prices) if 'MODELED_SIZE_TABLE' in model_settings: inject.add_table(model_settings['MODELED_SIZE_TABLE'], spc.modeled_size) persons_df = persons.to_frame() # add the choice values to the dest_choice_column in persons dataframe # We only chose school locations for the subset of persons who go to school # so we backfill the empty choices with -1 to code as no school location # names for location choice and (optional) logsums columns NO_DEST_TAZ = -1 persons_df[dest_choice_column_name] = \ choices_df['choice'].reindex(persons_df.index).fillna(NO_DEST_TAZ).astype(int) # add the
dest_choice_logsum column to persons dataframe if logsum_column_name: persons_df[logsum_column_name] = \ choices_df['logsum'].reindex(persons_df.index).astype('float') if save_sample_df is not None: # might be None for tiny samples even if sample_table_name was specified assert len(save_sample_df.index.get_level_values(0).unique()) == len(choices_df) # lest they try to put school and workplace samples into the same table if pipeline.is_table(sample_table_name): raise RuntimeError("dest choice sample table %s already exists" % sample_table_name) pipeline.extend_table(sample_table_name, save_sample_df) # - annotate persons table if 'annotate_persons' in model_settings: expressions.assign_columns( df=persons_df, model_settings=model_settings.get('annotate_persons'), trace_label=tracing.extend_trace_label(trace_label, 'annotate_persons')) pipeline.replace_table("persons", persons_df) if trace_hh_id: tracing.trace_df(persons_df, label=trace_label, warn_if_empty=True) # - annotate households table if 'annotate_households' in model_settings: households_df = households.to_frame() expressions.assign_columns( df=households_df, model_settings=model_settings.get('annotate_households'), trace_label=tracing.extend_trace_label(trace_label, 'annotate_households')) pipeline.replace_table("households", households_df) if trace_hh_id: tracing.trace_df(households_df, label=trace_label, warn_if_empty=True) if logsum_column_name: tracing.print_summary(logsum_column_name, choices_df['logsum'], value_counts=True) return persons_df
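# The NO_DEST_TAZ backfill above is a recurring pattern: a model chooses for a
# subset of choosers, and the result is re-aligned to the full table with a
# sentinel value for everyone else. A minimal sketch with toy data:

import pandas as pd

NO_DEST_TAZ = -1

persons_idx = pd.Index([1, 2, 3, 4], name='person_id')
choices = pd.Series({2: 17, 4: 23})  # e.g. only students got a school location

school_taz = choices.reindex(persons_idx).fillna(NO_DEST_TAZ).astype(int)
print(school_taz)  # persons 1 and 3 coded -1: no school location chosen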
def mandatory_tour_frequency(persons_merged, chunk_size, trace_hh_id): """ This model predicts the frequency of making mandatory tours (see the alternatives above) - these tours include work and school in some combination. """ trace_label = 'mandatory_tour_frequency' model_settings = config.read_model_settings('mandatory_tour_frequency.yaml') model_spec = simulate.read_model_spec(file_name='mandatory_tour_frequency.csv') alternatives = simulate.read_model_alts( config.config_file_path('mandatory_tour_frequency_alternatives.csv'), set_index='alt') choosers = persons_merged.to_frame() # filter based on results of CDAP choosers = choosers[choosers.cdap_activity == 'M'] logger.info("Running mandatory_tour_frequency with %d persons", len(choosers)) # - if no mandatory tours if choosers.shape[0] == 0: add_null_results(trace_label, model_settings) return # - preprocessor preprocessor_settings = model_settings.get('preprocessor', None) if preprocessor_settings: locals_dict = {} expressions.assign_columns( df=choosers, model_settings=preprocessor_settings, locals_dict=locals_dict, trace_label=trace_label) nest_spec = config.get_logit_model_settings(model_settings) constants = config.get_model_constants(model_settings) choices = simulate.simple_simulate( choosers=choosers, spec=model_spec, nest_spec=nest_spec, locals_d=constants, chunk_size=chunk_size, trace_label=trace_label, trace_choice_name='mandatory_tour_frequency') # convert indexes to alternative names choices = pd.Series( model_spec.columns[choices.values], index=choices.index).reindex(persons_merged.local.index) # - create mandatory tours """ This reprocesses the choice of index of the mandatory tour frequency alternatives into an actual dataframe of tours. Ending format is the same as for non_mandatory_tours except tour types are "work" and "school" """ choosers['mandatory_tour_frequency'] = choices mandatory_tours = process_mandatory_tours( persons=choosers, mandatory_tour_frequency_alts=alternatives ) tours = pipeline.extend_table("tours", mandatory_tours) tracing.register_traceable_table('tours', mandatory_tours) pipeline.get_rn_generator().add_channel('tours', mandatory_tours) # - annotate persons persons = inject.get_table('persons').to_frame() # need to reindex as we only handled persons with cdap_activity == 'M' persons['mandatory_tour_frequency'] = choices.reindex(persons.index).fillna('').astype(str) expressions.assign_columns( df=persons, model_settings=model_settings.get('annotate_persons'), trace_label=tracing.extend_trace_label(trace_label, 'annotate_persons')) pipeline.replace_table("persons", persons) tracing.print_summary('mandatory_tour_frequency', persons.mandatory_tour_frequency, value_counts=True) if trace_hh_id: tracing.trace_df(mandatory_tours, label="mandatory_tour_frequency.mandatory_tours", warn_if_empty=True) tracing.trace_df(persons, label="mandatory_tour_frequency.persons", warn_if_empty=True)
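# simple_simulate returns positional alternative indexes; steps like the one above
# convert them to alternative names by indexing into the spec's columns. A minimal
# sketch of that conversion with a toy spec:

import pandas as pd

spec_columns = pd.Index(['work1', 'work2', 'school1', 'work_and_school'])
choices = pd.Series([0, 2, 3], index=pd.Index([10, 11, 12], name='person_id'))

# positional index -> alternative name, preserving the chooser index
choices = pd.Series(spec_columns[choices.values], index=choices.index)
print(choices)  # person 10 -> 'work1', 11 -> 'school1', 12 -> 'work_and_school'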
def add_null_results(trace_label, tours): logger.info("Skipping %s: add_null_results" % trace_label) tours['composition'] = '' pipeline.replace_table("tours", tours)
def tour_mode_choice_simulate(tours, persons_merged, skim_dict, skim_stack, chunk_size, trace_hh_id): """ Tour mode choice simulate """ trace_label = 'tour_mode_choice' model_settings = config.read_model_settings('tour_mode_choice.yaml') spec = tour_mode_choice_spec(model_settings) primary_tours = tours.to_frame() assert not (primary_tours.tour_category == 'atwork').any() persons_merged = persons_merged.to_frame() nest_spec = config.get_logit_model_settings(model_settings) constants = config.get_model_constants(model_settings) logger.info("Running %s with %d tours" % (trace_label, primary_tours.shape[0])) tracing.print_summary('tour_types', primary_tours.tour_type, value_counts=True) primary_tours_merged = pd.merge(primary_tours, persons_merged, left_on='person_id', right_index=True, how='left', suffixes=('', '_r')) # setup skim keys orig_col_name = 'TAZ' dest_col_name = 'destination' out_time_col_name = 'start' in_time_col_name = 'end' odt_skim_stack_wrapper = skim_stack.wrap(left_key=orig_col_name, right_key=dest_col_name, skim_key='out_period') dot_skim_stack_wrapper = skim_stack.wrap(left_key=dest_col_name, right_key=orig_col_name, skim_key='in_period') od_skim_stack_wrapper = skim_dict.wrap(orig_col_name, dest_col_name) skims = { "odt_skims": odt_skim_stack_wrapper, "dot_skims": dot_skim_stack_wrapper, "od_skims": od_skim_stack_wrapper, 'orig_col_name': orig_col_name, 'dest_col_name': dest_col_name, 'out_time_col_name': out_time_col_name, 'in_time_col_name': in_time_col_name } choices_list = [] for tour_type, segment in primary_tours_merged.groupby('tour_type'): logger.info("tour_mode_choice_simulate tour_type '%s' (%s tours)" % (tour_type, len(segment.index), )) # name index so tracing knows how to slice assert segment.index.name == 'tour_id' choices = run_tour_mode_choice_simulate( segment, spec, tour_type, model_settings, skims=skims, constants=constants, nest_spec=nest_spec, chunk_size=chunk_size, trace_label=tracing.extend_trace_label(trace_label, tour_type), trace_choice_name='tour_mode_choice') tracing.print_summary('tour_mode_choice_simulate %s choices' % tour_type, choices, value_counts=True) choices_list.append(choices) # FIXME - force garbage collection force_garbage_collect() choices = pd.concat(choices_list) tracing.print_summary('tour_mode_choice_simulate all tour type choices', choices, value_counts=True) # so we can trace with annotations primary_tours['tour_mode'] = choices # but only keep mode choice col all_tours = tours.to_frame() # uncomment to save annotations to table # assign_in_place(all_tours, annotations) assign_in_place(all_tours, choices.to_frame('tour_mode')) pipeline.replace_table("tours", all_tours) if trace_hh_id: tracing.trace_df(primary_tours, label=tracing.extend_trace_label(trace_label, 'tour_mode'), slicer='tour_id', index_label='tour_id', warn_if_empty=True)
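# Several steps above end with assign_in_place(all_tours, ...) rather than a plain
# column assignment: only the rows present in the right-hand frame are updated, and
# the full table keeps its other rows and columns. A minimal sketch of the
# row-aligned update for an existing column using DataFrame.update (an assumption
# about assign_in_place semantics; the real helper in activitysim's core utilities
# also handles adding new columns):

import pandas as pd

all_tours = pd.DataFrame({'tour_mode': ['WALK', 'WALK', 'WALK']},
                         index=pd.Index([1, 2, 3], name='tour_id'))
choices = pd.Series(['DRIVE', 'BIKE'], index=pd.Index([1, 3], name='tour_id'))

all_tours.update(choices.to_frame('tour_mode'))  # only tours 1 and 3 change
print(all_tours)  # tour 2 keeps its original mode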
def trip_mode_choice(trips, tours_merged, network_los, chunk_size, trace_hh_id): """ Trip mode choice - compute trip_mode (same values as for tour_mode) for each trip. Modes for each primary tour purpose are calculated separately because they have different coefficient values (stored in the trip_mode_choice_coefficients.csv coefficient file). Adds trip_mode column to trips table """ trace_label = 'trip_mode_choice' model_settings_file_name = 'trip_mode_choice.yaml' model_settings = config.read_model_settings(model_settings_file_name) logsum_column_name = model_settings.get('MODE_CHOICE_LOGSUM_COLUMN_NAME') mode_column_name = 'trip_mode' trips_df = trips.to_frame() logger.info("Running %s with %d trips", trace_label, trips_df.shape[0]) tours_merged = tours_merged.to_frame() tours_merged = tours_merged[model_settings['TOURS_MERGED_CHOOSER_COLUMNS']] tracing.print_summary('primary_purpose', trips_df.primary_purpose, value_counts=True) # - trips_merged - merge trips and tours_merged trips_merged = pd.merge(trips_df, tours_merged, left_on='tour_id', right_index=True, how="left") assert trips_merged.index.equals(trips.index) # setup skim keys assert ('trip_period' not in trips_merged) trips_merged['trip_period'] = network_los.skim_time_period_label( trips_merged.depart) orig_col = 'origin' dest_col = 'destination' constants = {} constants.update(config.get_model_constants(model_settings)) constants.update({'ORIGIN': orig_col, 'DESTINATION': dest_col}) skim_dict = network_los.get_default_skim_dict() odt_skim_stack_wrapper = skim_dict.wrap_3d(orig_key=orig_col, dest_key=dest_col, dim3_key='trip_period') dot_skim_stack_wrapper = skim_dict.wrap_3d(orig_key=dest_col, dest_key=orig_col, dim3_key='trip_period') od_skim_wrapper = skim_dict.wrap('origin', 'destination') skims = { "odt_skims": odt_skim_stack_wrapper, "dot_skims": dot_skim_stack_wrapper, "od_skims": od_skim_wrapper, } if network_los.zone_system == los.THREE_ZONE: # fixme - is this a lightweight object?
tvpb = network_los.tvpb tvpb_logsum_odt = tvpb.wrap_logsum(orig_key=orig_col, dest_key=dest_col, tod_key='trip_period', segment_key='demographic_segment', cache_choices=True, trace_label=trace_label, tag='tvpb_logsum_odt') skims.update({ 'tvpb_logsum_odt': tvpb_logsum_odt, # 'tvpb_logsum_dot': tvpb_logsum_dot }) # TVPB constants can appear in expressions constants.update( network_los.setting('TVPB_SETTINGS.tour_mode_choice.CONSTANTS')) estimator = estimation.manager.begin_estimation('trip_mode_choice') if estimator: estimator.write_coefficients(model_settings=model_settings) estimator.write_coefficients_template(model_settings=model_settings) estimator.write_spec(model_settings) estimator.write_model_settings(model_settings, model_settings_file_name) model_spec = simulate.read_model_spec(file_name=model_settings['SPEC']) nest_spec = config.get_logit_model_settings(model_settings) choices_list = [] for primary_purpose, trips_segment in trips_merged.groupby( 'primary_purpose'): segment_trace_label = tracing.extend_trace_label( trace_label, primary_purpose) logger.info("trip_mode_choice primary_purpose '%s' (%s trips)" % ( primary_purpose, len(trips_segment.index), )) # name index so tracing knows how to slice assert trips_segment.index.name == 'trip_id' if network_los.zone_system == los.THREE_ZONE: tvpb_logsum_odt.extend_trace_label(primary_purpose) # tvpb_logsum_dot.extend_trace_label(primary_purpose) coefficients = simulate.get_segment_coefficients( model_settings, primary_purpose) locals_dict = {} locals_dict.update(constants) locals_dict.update(coefficients) expressions.annotate_preprocessors(trips_segment, locals_dict, skims, model_settings, segment_trace_label) if estimator: # write choosers after annotation estimator.write_choosers(trips_segment) locals_dict.update(skims) choices = mode_choice_simulate( choosers=trips_segment, spec=simulate.eval_coefficients(model_spec, coefficients, estimator), nest_spec=simulate.eval_nest_coefficients(nest_spec, coefficients, segment_trace_label), skims=skims, locals_d=locals_dict, chunk_size=chunk_size, mode_column_name=mode_column_name, logsum_column_name=logsum_column_name, trace_label=segment_trace_label, trace_choice_name='trip_mode_choice', estimator=estimator) if trace_hh_id: # trace the coefficients tracing.trace_df(pd.Series(locals_dict), label=tracing.extend_trace_label(segment_trace_label, 'constants'), transpose=False, slicer='NONE') # so we can trace with annotations assign_in_place(trips_segment, choices) tracing.trace_df(trips_segment, label=tracing.extend_trace_label(segment_trace_label, 'trip_mode'), slicer='tour_id', index_label='tour_id', warn_if_empty=True) choices_list.append(choices) choices_df = pd.concat(choices_list) # add cached tvpb_logsum tap choices for modes specified in tvpb_mode_path_types if network_los.zone_system == los.THREE_ZONE: tvpb_mode_path_types = model_settings.get('tvpb_mode_path_types') for mode, path_type in tvpb_mode_path_types.items(): skim_cache = tvpb_logsum_odt.cache[path_type] for c in skim_cache: if c not in choices_df: choices_df[c] = np.nan if pd.api.types.is_numeric_dtype(skim_cache[c]) else '' choices_df[c].where(choices_df[mode_column_name] != mode, skim_cache[c], inplace=True) if estimator: estimator.write_choices(choices_df.trip_mode) choices_df.trip_mode = estimator.get_survey_values( choices_df.trip_mode, 'trips', 'trip_mode') estimator.write_override_choices(choices_df.trip_mode) estimator.end_estimation() # update trips table with choices (and potentially logsums) trips_df = trips.to_frame() assign_in_place(trips_df, choices_df) tracing.print_summary('trip_modes', trips_merged.tour_mode, value_counts=True) tracing.print_summary('trip_mode_choice choices', trips_df[mode_column_name], value_counts=True) assert not trips_df[mode_column_name].isnull().any() pipeline.replace_table("trips", trips_df) if trace_hh_id: tracing.trace_df(trips_df, label=tracing.extend_trace_label(trace_label, 'trip_mode'), slicer='trip_id', index_label='trip_id', warn_if_empty=True)
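# The THREE_ZONE block above back-fills cached TVPB tap/path columns only for rows
# whose chosen mode uses that path_type, via Series.where: where keeps the existing
# value wherever the condition is True and takes the other value elsewhere.
# A minimal sketch of that masking with toy data (hypothetical mode names):

import pandas as pd

choices_df = pd.DataFrame({'trip_mode': ['WALK_TRANSIT', 'DRIVE', 'WALK_TRANSIT']})
choices_df['btap'] = pd.NA
cached_btap = pd.Series([101, 102, 103])  # hypothetical cached skim column

# keep NA where the mode is not WALK_TRANSIT, take the cached tap where it is
choices_df['btap'] = choices_df['btap'].where(
    choices_df['trip_mode'] != 'WALK_TRANSIT', cached_btap)
print(choices_df)  # btap filled only for the WALK_TRANSIT trips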
def setup_data_structures(settings, configs_dir, households, persons): """ Setup geographic correspondence (crosswalk), control sets, and incidence tables. Control tables for the target geographies should already have been read in by running input_pre_processor. The zone control table contains one row for each zone, with columns specifying control field totals for that zone. This step reads in the global control file, which specifies which control fields in the control table should be used for balancing, along with their importance and the recipe (seed table and expression) for determining household incidence for that control. If GROUP_BY_INCIDENCE_SIGNATURE setting is enabled, then incidence table rows are household group ids and an additional household_groups table is created mapping hh group ids to actual hh_ids. Parameters ---------- settings: dict contents of settings.yaml as dict configs_dir: str households: pipeline table persons: pipeline table creates pipeline tables: crosswalk controls geography-specific controls incidence_table household_groups (if GROUP_BY_INCIDENCE_SIGNATURE setting is enabled) modifies tables: households persons """ seed_geography = setting('seed_geography') households_df = households.to_frame() persons_df = persons.to_frame() crosswalk_df = build_crosswalk_table() inject.add_table('crosswalk', crosswalk_df) control_spec = read_control_spec(setting('control_file_name', 'controls.csv'), configs_dir) inject.add_table('control_spec', control_spec) geographies = settings['geographies'] for g in geographies: controls = build_control_table(g, control_spec, crosswalk_df) inject.add_table(control_table_name(g), controls) households_df, persons_df = filter_households(households_df, persons_df, crosswalk_df) pipeline.replace_table('households', households_df) pipeline.replace_table('persons', persons_df) incidence_table = \ build_incidence_table(control_spec, households_df, persons_df, crosswalk_df) incidence_table = add_geography_columns(incidence_table, households_df, crosswalk_df) # add sample_weight col to incidence table hh_weight_col = setting('household_weight_col') incidence_table['sample_weight'] = households_df[hh_weight_col] if setting('GROUP_BY_INCIDENCE_SIGNATURE') and not setting('NO_INTEGERIZATION_EVER', False): group_incidence_table, household_groups \ = build_grouped_incidence_table(incidence_table, control_spec, seed_geography) inject.add_table('household_groups', household_groups) inject.add_table('incidence_table', group_incidence_table) else: inject.add_table('incidence_table', incidence_table)
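# GROUP_BY_INCIDENCE_SIGNATURE collapses households with identical incidence rows
# into one group so the balancer solves for far fewer rows. A minimal sketch of the
# grouping idea only (hypothetical control column names; the real implementation
# is build_grouped_incidence_table):

import pandas as pd

incidence = pd.DataFrame({'hh_size_1': [1, 0, 1, 0], 'workers_2': [0, 1, 0, 1]},
                         index=pd.Index([100, 101, 102, 103], name='hh_id'))

# households 100/102 and 101/103 have identical incidence signatures
signature = incidence.astype(str).agg('_'.join, axis=1)
group_ids = signature.map({s: i for i, s in enumerate(signature.unique())})

household_groups = group_ids.to_frame('group_id').reset_index()
print(household_groups)  # maps 4 hh_ids onto 2 incidence groups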
def trip_purpose_and_destination( trips, tours_merged, chunk_size, trace_hh_id): trace_label = "trip_purpose_and_destination" model_settings = config.read_model_settings('trip_purpose_and_destination.yaml') MAX_ITERATIONS = model_settings.get('MAX_ITERATIONS', 5) trips_df = trips.to_frame() tours_merged_df = tours_merged.to_frame() if trips_df.empty: logger.info("%s - no trips. Nothing to do." % trace_label) return # FIXME could allow MAX_ITERATIONS=0 to allow for cleanup-only run # in which case, we would need to drop bad trips, WITHOUT failing bad_trip leg_mates assert (MAX_ITERATIONS > 0) # if trip_destination has been run before, keep only failed trips (and leg_mates) to retry if 'destination' in trips_df: if trips_df.failed.any(): logger.info('trip_destination has already been run. Rerunning failed trips') flag_failed_trip_leg_mates(trips_df, 'failed') trips_df = trips_df[trips_df.failed] tours_merged_df = tours_merged_df[tours_merged_df.index.isin(trips_df.tour_id)] logger.info('Rerunning %s failed trips and leg-mates' % trips_df.shape[0]) else: # no failed trips from prior run of trip_destination logger.info("%s - no failed trips from prior model run." % trace_label) del trips_df['failed'] pipeline.replace_table("trips", trips_df) return results = [] i = 0 RESULT_COLUMNS = ['purpose', 'destination', 'origin', 'failed'] while True: i += 1 for c in RESULT_COLUMNS: if c in trips_df: del trips_df[c] trips_df = run_trip_purpose_and_destination( trips_df, tours_merged_df, chunk_size, trace_hh_id, trace_label=tracing.extend_trace_label(trace_label, "i%s" % i)) num_failed_trips = trips_df.failed.sum() # if there were no failed trips, we are done if num_failed_trips == 0: results.append(trips_df[RESULT_COLUMNS]) break logger.warning("%s %s failed trips in iteration %s" % (trace_label, num_failed_trips, i)) file_name = "%s_i%s_failed_trips" % (trace_label, i) logger.info("writing failed trips to %s" % file_name) tracing.write_csv(trips_df[trips_df.failed], file_name=file_name, transpose=False) # if max iterations reached, add remaining trips to results and give up # note that we do this BEFORE failing leg_mates so resulting trip legs are complete if i >= MAX_ITERATIONS: logger.warning("%s too many iterations %s" % (trace_label, i)) results.append(trips_df[RESULT_COLUMNS]) break # otherwise, if any trips failed, then their leg-mates trips must also fail flag_failed_trip_leg_mates(trips_df, 'failed') # add the good trips to results results.append(trips_df[~trips_df.failed][RESULT_COLUMNS]) # and keep the failed ones to retry trips_df = trips_df[trips_df.failed] tours_merged_df = tours_merged_df[tours_merged_df.index.isin(trips_df.tour_id)] # - assign result columns to trips results = pd.concat(results) logger.info("%s %s failed trips after %s iterations" % (trace_label, results.failed.sum(), i)) trips_df = trips.to_frame() assign_in_place(trips_df, results) trips_df = cleanup_failed_trips(trips_df) pipeline.replace_table("trips", trips_df) if trace_hh_id: tracing.trace_df(trips_df, label=trace_label, slicer='trip_id', index_label='trip_id', warn_if_empty=True)
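# flag_failed_trip_leg_mates propagates failure to a trip's leg-mates so a whole
# tour leg is retried (or dropped) together. A minimal sketch of that propagation,
# assuming the leg is identified by (tour_id, outbound), using groupby-transform:

import pandas as pd

trips = pd.DataFrame({
    'tour_id': [1, 1, 1, 2],
    'outbound': [True, True, False, True],
    'failed': [False, True, False, False],
})

# a leg fails if any of its trips failed; leg-mates of a failed trip fail too
trips['failed'] = trips.groupby(['tour_id', 'outbound'])['failed'].transform('any')
print(trips)  # both outbound trips of tour 1 are now flagged as failed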
def iterate_location_choice( model_settings, persons_merged, persons, households, skim_dict, skim_stack, chunk_size, trace_hh_id, locutor, trace_label): """ iterate run_location_choice updating shadow pricing until convergence criteria satisfied or max_iterations reached. (If use_shadow_pricing not enabled, then just iterate once) Parameters ---------- model_settings : dict persons_merged : injected table persons : injected table skim_dict : skim.SkimDict skim_stack : skim.SkimStack chunk_size : int trace_hh_id : int locutor : bool whether this process is the privileged logger of shadow_pricing when multiprocessing trace_label : str Returns ------- adds choice column model_settings['DEST_CHOICE_COLUMN_NAME'] and annotations to persons table """ # column containing segment id chooser_segment_column = model_settings['CHOOSER_SEGMENT_COLUMN_NAME'] # boolean to filter out persons not needing location modeling (e.g. is_worker, is_student) chooser_filter_column = model_settings['CHOOSER_FILTER_COLUMN_NAME'] persons_merged_df = persons_merged.to_frame() persons_merged_df = persons_merged_df[persons_merged[chooser_filter_column]] spc = shadow_pricing.load_shadow_price_calculator(model_settings) max_iterations = spc.max_iterations logger.debug("%s max_iterations: %s" % (trace_label, max_iterations)) choices = None for iteration in range(1, max_iterations + 1): if spc.use_shadow_pricing and iteration > 1: spc.update_shadow_prices() choices = run_location_choice( persons_merged_df, skim_dict, skim_stack, spc, model_settings, chunk_size, trace_hh_id, trace_label=tracing.extend_trace_label(trace_label, 'i%s' % iteration)) choices_df = choices.to_frame('dest_choice') choices_df['segment_id'] = \ persons_merged_df[chooser_segment_column].reindex(choices_df.index) spc.set_choices(choices_df) if locutor: spc.write_trace_files(iteration) if spc.use_shadow_pricing and spc.check_fit(iteration): logger.info("%s converged after iteration %s" % (trace_label, iteration,)) break # - shadow price table if locutor: if spc.use_shadow_pricing and 'SHADOW_PRICE_TABLE' in model_settings: inject.add_table(model_settings['SHADOW_PRICE_TABLE'], spc.shadow_prices) if 'MODELED_SIZE_TABLE' in model_settings: inject.add_table(model_settings['MODELED_SIZE_TABLE'], spc.modeled_size) dest_choice_column_name = model_settings['DEST_CHOICE_COLUMN_NAME'] tracing.print_summary(dest_choice_column_name, choices, value_counts=True) persons_df = persons.to_frame() # We only chose school locations for the subset of persons who go to school # so we backfill the empty choices with -1 to code as no school location NO_DEST_TAZ = -1 persons_df[dest_choice_column_name] = \ choices.reindex(persons_df.index).fillna(NO_DEST_TAZ).astype(int) # - annotate persons table if 'annotate_persons' in model_settings: expressions.assign_columns( df=persons_df, model_settings=model_settings.get('annotate_persons'), trace_label=tracing.extend_trace_label(trace_label, 'annotate_persons')) pipeline.replace_table("persons", persons_df) if trace_hh_id: tracing.trace_df(persons_df, label=trace_label, warn_if_empty=True) # - annotate households table if 'annotate_households' in model_settings: households_df = households.to_frame() expressions.assign_columns( df=households_df, model_settings=model_settings.get('annotate_households'), trace_label=tracing.extend_trace_label(trace_label, 'annotate_households')) pipeline.replace_table("households", households_df) if trace_hh_id: tracing.trace_df(households_df, label=trace_label, warn_if_empty=True) return persons_df
def repop_setup_data_structures(configs_dir, households, persons): """ Setup geographic correspondence (crosswalk), control sets, and incidence tables for repop run. A new lowest-level geography control table should already have been read in by rerunning input_pre_processor with a table_list override. The control table contains one row for each zone, with columns specifying control field totals for that zone. This step reads in the repop control file, which specifies which control fields in the control table should be used for balancing, along with their importance and the recipe (seed table and expression) for determining household incidence for that control. Parameters ---------- configs_dir : str households: pipeline table persons: pipeline table Returns ------- """ seed_geography = setting('seed_geography') geographies = setting('geographies') low_geography = geographies[-1] # replace crosswalk table crosswalk_df = build_crosswalk_table() pipeline.replace_table('crosswalk', crosswalk_df) # replace control_spec control_file_name = setting('repop_control_file_name', 'repop_controls.csv') control_spec = read_control_spec(control_file_name, configs_dir) # repop control spec should only specify controls for lowest level geography assert control_spec.geography.unique() == [low_geography] pipeline.replace_table('control_spec', control_spec) # build incidence_table with repop controls and households in repop zones # filter households (dropping any not in crosswalk) in order to build incidence_table # We DO NOT REPLACE households and persons as we need full tables to synthesize population # (There is no problem, however, with overwriting the incidence_table and household_groups # because the expand_households step has ALREADY created the expanded_household_ids table # for the original simulated population. ) households_df = households.to_frame() persons_df = persons.to_frame() households_df, persons_df = filter_households(households_df, persons_df, crosswalk_df) incidence_table = build_incidence_table(control_spec, households_df, persons_df, crosswalk_df) incidence_table = add_geography_columns(incidence_table, households_df, crosswalk_df) # add sample_weight col to incidence table hh_weight_col = setting('household_weight_col') incidence_table['sample_weight'] = households_df[hh_weight_col] # rebuild control tables with only the low level controls (aggregated at higher levels) for g in geographies: controls = build_control_table(g, control_spec, crosswalk_df) pipeline.replace_table(control_table_name(g), controls) if setting('GROUP_BY_INCIDENCE_SIGNATURE') and not setting('NO_INTEGERIZATION_EVER', False): group_incidence_table, household_groups \ = build_grouped_incidence_table(incidence_table, control_spec, seed_geography) pipeline.replace_table('household_groups', household_groups) pipeline.replace_table('incidence_table', group_incidence_table) else: pipeline.replace_table('incidence_table', incidence_table)
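# The comment above notes that the rebuilt control tables carry the low-level
# controls "aggregated at higher levels". A minimal sketch of that roll-up with a
# toy crosswalk (hypothetical geography names; build_control_table does the real
# work, including honoring the control spec):

import pandas as pd

crosswalk = pd.DataFrame({'TAZ': [1, 2, 3, 4], 'PUMA': [10, 10, 20, 20]})
taz_controls = pd.DataFrame({'num_hh': [50, 75, 60, 40]},
                            index=pd.Index([1, 2, 3, 4], name='TAZ'))

# sum TAZ-level control totals up to the PUMA geography
puma_controls = taz_controls.join(crosswalk.set_index('TAZ')).groupby('PUMA').sum()
print(puma_controls)  # PUMA 10 -> 125 households, PUMA 20 -> 100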