def read_spec_file(model_settings, segments):
    """Read expressions file from a csv using instructions from model settings.

    Parameters
    ----------
    model_settings: dict, from yaml
    segments : dict, origin zone trip segments

    Returns
    -------
    pandas DataFrame
    """
    spec_file_name = model_settings.get('spec_file_name')
    spec_file_path = config.config_file_path(spec_file_name, mandatory=True)

    logger.info("reading spec file '%s'" % spec_file_name)
    cfg = pd.read_csv(spec_file_path, comment='#')

    expected_header = ['description', 'target', 'expression', *segments.keys()]
    if sorted(cfg.columns.values) != sorted(expected_header):
        raise RuntimeError("Spec file requires header %s" % expected_header)

    cfg.target = cfg.target.str.strip()
    cfg.expression = cfg.expression.str.strip()

    return cfg
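# Illustrative sketch (not part of the codebase): the header contract that
# read_spec_file enforces. The segment names ('work', 'school') and the
# expression row are hypothetical; a real spec needs one column per segment
# key plus the three fixed columns, in any order.
import io

import pandas as pd

example_spec = io.StringIO(
    "description,target,expression,work,school\n"
    "size term,size_term,employment + households,1.0,0.5\n"
)
cfg = pd.read_csv(example_spec, comment='#')
expected_header = ['description', 'target', 'expression', 'work', 'school']
assert sorted(cfg.columns.values) == sorted(expected_header)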
def tour_mode_choice_coefficients_spec(model_settings):
    assert 'COEFFS' in model_settings
    coeffs_file_name = model_settings['COEFFS']

    file_path = config.config_file_path(coeffs_file_name)
    return pd.read_csv(file_path, comment='#', index_col='Expression')
def compute_accessibility(land_use, accessibility, network_los, chunk_size, trace_od):
    """
    Compute accessibility for each zone in land use file using expressions from accessibility_spec

    The actual results depend on the expressions in accessibility_spec, but this is initially
    intended to permit implementation of the mtc accessibility calculation as implemented by
    Accessibility.job

    Compute measures of accessibility used by the automobile ownership model.
    The accessibility measure first multiplies an employment variable by a mode-specific decay
    function. The product reflects the difficulty of accessing the activities the farther
    (in terms of round-trip travel time) the jobs are from the location in question. The products
    to each destination zone are next summed over each origin zone, and the logarithm of the
    product mutes large differences. The decay function on the walk accessibility measure is
    steeper than automobile or transit. The minimum accessibility is zero.
    """

    trace_label = 'compute_accessibility'
    model_settings = config.read_model_settings('accessibility.yaml')
    assignment_spec = assign.read_assignment_spec(config.config_file_path('accessibility.csv'))

    accessibility_df = accessibility.to_frame()
    if len(accessibility_df.columns) > 0:
        logger.warning(
            f"accessibility table is not empty. Columns:{list(accessibility_df.columns)}")
        raise RuntimeError("accessibility table is not empty.")

    constants = config.get_model_constants(model_settings)

    # only include the land_use columns needed by spec, as specified by land_use_columns model_settings
    land_use_columns = model_settings.get('land_use_columns', [])
    land_use_df = land_use.to_frame()
    land_use_df = land_use_df[land_use_columns]

    logger.info(
        f"Running {trace_label} with {len(accessibility_df.index)} orig zones {len(land_use_df)} dest zones")

    accessibilities_list = []

    for i, chooser_chunk, chunk_trace_label in \
            chunk.adaptive_chunked_choosers(accessibility_df, chunk_size, trace_label):

        accessibilities = \
            compute_accessibilities_for_zones(chooser_chunk, land_use_df, assignment_spec,
                                              constants, network_los, trace_od, trace_label)
        accessibilities_list.append(accessibilities)

    accessibility_df = pd.concat(accessibilities_list)

    logger.info(f"{trace_label} computed accessibilities {accessibility_df.shape}")

    # - write table to pipeline
    pipeline.replace_table("accessibility", accessibility_df)
def extension_probs(): f = config.config_file_path('non_mandatory_tour_frequency_extension_probs.csv') df = pd.read_csv(f, comment='#') # convert cum probs to individual probs df['2_tours'] = df['2_tours'] - df['1_tours'] df['1_tours'] = df['1_tours'] - df['0_tours'] return df
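# Illustrative sketch with hypothetical numbers: the cumulative-to-individual
# probability conversion done in extension_probs above. Note the order matters:
# '2_tours' must be differenced before '1_tours' is overwritten.
import pandas as pd

probs = pd.DataFrame({'0_tours': [0.6], '1_tours': [0.9], '2_tours': [1.0]})
probs['2_tours'] = probs['2_tours'] - probs['1_tours']  # 1.0 - 0.9 -> 0.1
probs['1_tours'] = probs['1_tours'] - probs['0_tours']  # 0.9 - 0.6 -> 0.3
# '0_tours' was already an individual probability; 0.6 + 0.3 + 0.1 sums to 1.0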
def test_bad_coefficients(): coefficients = pd.read_csv(config.config_file_path('cdap_interaction_coefficients.csv'), comment='#') coefficients = cdap.preprocess_interaction_coefficients(coefficients) coefficients.loc[2, 'activity'] = 'AA' with pytest.raises(RuntimeError) as excinfo: coefficients = cdap.preprocess_interaction_coefficients(coefficients) assert "Expect only M, N, or H" in str(excinfo.value)
def write_spec(self, model_settings=None, file_name=None, tag='SPEC'): if model_settings is not None: assert file_name is None file_name = model_settings[tag] input_path = config.config_file_path(file_name) output_path = self.file_path(table_name=tag, file_type='csv') shutil.copy(input_path, output_path) self.debug("estimate.write_spec: %s" % output_path)
def tdd_alts(): # right now this file just contains the start and end hour f = config.config_file_path('tour_departure_and_duration_alternatives.csv') df = pd.read_csv(f) df['duration'] = df.end - df.start # - NARROW df = df.astype(np.int8) return df
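# Illustrative sketch (hypothetical periods): the derive-and-narrow pattern used
# in tdd_alts above; np.int8 is safe because time period codes fit in -128..127.
import numpy as np
import pandas as pd

alts = pd.DataFrame({'start': [5, 8], 'end': [9, 8]})
alts['duration'] = alts.end - alts.start  # [4, 0]
alts = alts.astype(np.int8)               # narrow all columns to int8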
def write_spec(self, model_settings=None, file_name=None, tag='SPEC', bundle_directory=False): if model_settings is not None: assert file_name is None file_name = model_settings[tag] input_path = config.config_file_path(file_name) table_name = tag # more readable than full spec file_name output_path = self.output_file_path(table_name, 'csv', bundle_directory) shutil.copy(input_path, output_path) self.debug("estimate.write_spec: %s" % output_path)
def compute_tap_tap_time(self, recipe, access_df, egress_df, chooser_attributes, trace_label, trace):

    trace_label = tracing.extend_trace_label(trace_label, 'compute_tap_tap_time')

    model_constants = self.network_los.setting(f'TVPB_SETTINGS.{recipe}.CONSTANTS')
    tap_tap_settings = self.network_los.setting(f'TVPB_SETTINGS.{recipe}.tap_tap_settings')

    with memo("#TVPB CACHE compute_tap_tap_utilities all_transit_paths"):
        transit_df = self.all_transit_paths(access_df, egress_df, chooser_attributes,
                                            trace_label, trace)
        # note: transit_df index is arbitrary
        chunk.log_df(trace_label, "transit_df", transit_df)

    locals_d = {'los': self.network_los}
    locals_d.update(model_constants)

    assignment_spec = assign.read_assignment_spec(
        file_name=config.config_file_path(tap_tap_settings['SPEC']))

    results, _, _ = assign.assign_variables(assignment_spec, transit_df, locals_d)
    assert len(results.columns) == 1
    transit_df['transit'] = results

    # filter out unavailable btap_atap pairs
    logger.debug(
        f"{(transit_df['transit'] <= 0).sum()} unavailable tap_tap pairs out of {len(transit_df)}")
    transit_df = transit_df[transit_df.transit > 0]

    transit_df.drop(columns=chooser_attributes.columns, inplace=True)

    chunk.log_df(trace_label, "transit_df", None)

    if trace:
        self.trace_df(transit_df, trace_label, 'transit_df')

    return transit_df
def initialize_tvpb_calc_row_size(choosers, network_los, trace_label):
    """
    rows_per_chunk calculator for initialize_tvpb
    """

    sizer = chunk.RowSizeEstimator(trace_label)

    model_settings = \
        network_los.setting('TVPB_SETTINGS.tour_mode_choice.tap_tap_settings')
    attributes_as_columns = \
        network_los.setting('TVPB_SETTINGS.tour_mode_choice.tap_tap_settings.attributes_as_columns', [])

    # one element per chooser column
    sizer.add_elements(len(choosers.columns), 'choosers')

    # one element per attribute carried as a column
    sizer.add_elements(len(attributes_as_columns), 'attributes_as_columns')

    preprocessor_settings = model_settings.get('PREPROCESSOR')
    if preprocessor_settings:

        preprocessor_spec_name = preprocessor_settings.get('SPEC', None)

        if not preprocessor_spec_name.endswith(".csv"):
            preprocessor_spec_name = f'{preprocessor_spec_name}.csv'

        expressions_spec = assign.read_assignment_spec(
            config.config_file_path(preprocessor_spec_name))

        sizer.add_elements(expressions_spec.shape[0], 'preprocessor')

    # expression_values for each spec row
    spec = simulate.read_model_spec(file_name=model_settings['SPEC'])
    sizer.add_elements(spec.shape[0], 'expression_values')

    # one utility column per spec alternative
    sizer.add_elements(spec.shape[1], 'utilities')

    row_size = sizer.get_hwm()
    return row_size
def test_build_cdap_spec_hhsize2(people, model_settings): hhsize = 2 cdap_indiv_and_hhsize1 = simulate.read_model_spec( file_name='cdap_indiv_and_hhsize1.csv') interaction_coefficients = pd.read_csv( config.config_file_path('cdap_interaction_coefficients.csv'), comment='#') interaction_coefficients = cdap.preprocess_interaction_coefficients( interaction_coefficients) person_type_map = model_settings.get('PERSON_TYPE_MAP', {}) cdap.assign_cdap_rank(people, person_type_map) indiv_utils = cdap.individual_utilities(people, cdap_indiv_and_hhsize1, locals_d=None) choosers = cdap.hh_choosers(indiv_utils, hhsize=hhsize) spec = cdap.build_cdap_spec(interaction_coefficients, hhsize=hhsize, cache=False) vars = simulate.eval_variables(spec.index, choosers) utils = simulate.compute_utilities(vars, spec) expected = pd.DataFrame( [ [0, 3, 0, 3, 7, 3, 0, 3, 0], # household 3 [0, 0, 1, 1, 1, 2, 0, 0, 2], # household 4 ], index=[3, 4], columns=['HH', 'HM', 'HN', 'MH', 'MM', 'MN', 'NH', 'NM', 'NN']).astype('float') pdt.assert_frame_equal(utils, expected, check_names=False)
def read_control_spec(data_filename):

    # read the csv file
    data_file_path = config.config_file_path(data_filename)
    if not os.path.exists(data_file_path):
        raise RuntimeError(
            "initial_seed_balancing - control file not found: %s" % (data_file_path,))

    logger.info("Reading control file %s" % data_file_path)
    control_spec = pd.read_csv(data_file_path, comment='#')

    geographies = setting('geographies')

    if 'geography' not in control_spec.columns:
        raise RuntimeError("missing geography column in controls file")

    for g in control_spec.geography.unique():
        if g not in geographies:
            raise RuntimeError("unknown geography '%s' in control file" % g)

    return control_spec
def test_build_cdap_spec_hhsize2(people, model_settings): hhsize = 2 cdap_indiv_and_hhsize1 = simulate.read_model_spec(file_name='cdap_indiv_and_hhsize1.csv') interaction_coefficients = pd.read_csv(config.config_file_path('cdap_interaction_coefficients.csv'), comment='#') interaction_coefficients = cdap.preprocess_interaction_coefficients(interaction_coefficients) person_type_map = model_settings.get('PERSON_TYPE_MAP', {}) with chunk.chunk_log('test_build_cdap_spec_hhsize2', base=True): cdap.assign_cdap_rank(people, person_type_map) indiv_utils = cdap.individual_utilities(people, cdap_indiv_and_hhsize1, locals_d=None) choosers = cdap.hh_choosers(indiv_utils, hhsize=hhsize) spec = cdap.build_cdap_spec(interaction_coefficients, hhsize=hhsize, cache=False) # pandas.dot depends on column names of expression_values matching spec index values # expressions should have been uniquified when spec was read assert spec.index.is_unique vars = simulate.eval_variables(spec.index, choosers) assert (spec.index.values == vars.columns.values).all() # spec = spec.astype(np.float64) utils = vars.dot(spec) expected = pd.DataFrame([ [0, 3, 0, 3, 7, 3, 0, 3, 0], # household 3 [0, 0, 1, 1, 1, 2, 0, 0, 2], # household 4 ], index=[3, 4], columns=['HH', 'HM', 'HN', 'MH', 'MM', 'MN', 'NH', 'NM', 'NN']).astype('float') pdt.assert_frame_equal(utils, expected, check_names=False)
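# Illustrative sketch of the alignment requirement noted in the test above
# (hypothetical expressions and coefficients): vars.dot(spec) matches vars'
# column names against spec's index, which is why the spec index must be unique.
import pandas as pd

spec = pd.DataFrame({'HH': [1.0, 2.0]}, index=['expr_a', 'expr_b'])
vals = pd.DataFrame({'expr_a': [1.0], 'expr_b': [0.5]})
utils = vals.dot(spec)
assert utils.loc[0, 'HH'] == 1.0 * 1.0 + 0.5 * 2.0  # == 2.0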
def tdd_alt_segments(): # tour_purpose,time_period,start,end # work,EA,3,5 # work,AM,6,8 # ... # school,PM,15,17 # school,EV,18,22 file_path = config.config_file_path( 'tour_departure_and_duration_segments.csv', mandatory=False) if file_path: df = pd.read_csv(file_path, comment='#') # - NARROW df['start'] = df['start'].astype(np.int8) df['end'] = df['end'].astype(np.int8) else: df = None return df
def size_terms(): f = config.config_file_path('destination_choice_size_terms.csv') return pd.read_csv(f, comment='#', index_col='segment')
def run_trip_purpose(trips_df, estimator, chunk_size, trace_hh_id, trace_label):
    """
    trip purpose - main functionality separated from model step so it can be called iteratively

    For each intermediate stop on a tour (i.e. a trip other than the last trip outbound or
    inbound), each trip is assigned a purpose based on an observed frequency distribution.
    The distribution should always be segmented by tour purpose and tour direction. By default
    it is also segmented by person type. The join columns can be overridden using the
    "probs_join_cols" parameter in the model settings. The model will also attempt to segment
    by trip depart time if depart time ranges are specified in the probability lookup table.

    Returns
    -------
    purpose: pandas.Series of purpose (str) indexed by trip_id
    """

    # uniform across trip_purpose
    chunk_tag = 'trip_purpose'

    model_settings_file_name = 'trip_purpose.yaml'
    model_settings = config.read_model_settings(model_settings_file_name)

    probs_join_cols = model_settings.get('probs_join_cols', PROBS_JOIN_COLUMNS)

    spec_file_name = model_settings.get('PROBS_SPEC', 'trip_purpose_probs.csv')
    probs_spec = pd.read_csv(config.config_file_path(spec_file_name), comment='#')
    # FIXME for now, not really doing estimation for probabilistic model - just overwriting choices
    # besides, it isn't clear that named coefficients would be helpful if we had some form of estimation
    # coefficients_df = simulate.read_model_coefficients(model_settings)
    # probs_spec = map_coefficients(probs_spec, coefficients_df)

    if estimator:
        estimator.write_spec(model_settings, tag='PROBS_SPEC')
        estimator.write_model_settings(model_settings, model_settings_file_name)
        # estimator.write_coefficients(coefficients_df, model_settings)

    result_list = []

    # - last trip of outbound tour gets primary_purpose
    last_trip = (trips_df.trip_num == trips_df.trip_count)
    purpose = trips_df.primary_purpose[last_trip & trips_df.outbound]
    result_list.append(purpose)
    logger.info("assign purpose to %s last outbound trips", purpose.shape[0])

    # - last trip of inbound tour gets home (or work for atwork subtours)
    purpose = trips_df.primary_purpose[last_trip & ~trips_df.outbound]
    purpose = pd.Series(np.where(purpose == 'atwork', 'work', 'home'), index=purpose.index)
    result_list.append(purpose)
    logger.info("assign purpose to %s last inbound trips", purpose.shape[0])

    # - intermediate stops (non-last trips) purpose assigned by probability table
    trips_df = trips_df[~last_trip]
    logger.info("assign purpose to %s intermediate trips", trips_df.shape[0])

    preprocessor_settings = model_settings.get('preprocessor', None)
    if preprocessor_settings:
        locals_dict = config.get_model_constants(model_settings)
        expressions.assign_columns(
            df=trips_df,
            model_settings=preprocessor_settings,
            locals_dict=locals_dict,
            trace_label=trace_label)

    use_depart_time = model_settings.get('use_depart_time', True)

    for i, trips_chunk, chunk_trace_label in \
            chunk.adaptive_chunked_choosers(trips_df, chunk_size, chunk_tag, trace_label):

        choices = choose_intermediate_trip_purpose(
            trips_chunk,
            probs_spec,
            estimator,
            probs_join_cols=probs_join_cols,
            use_depart_time=use_depart_time,
            trace_hh_id=trace_hh_id,
            trace_label=chunk_trace_label)

        result_list.append(choices)

        chunk.log_df(trace_label, 'result_list', result_list)

    if len(result_list) > 1:
        choices = pd.concat(result_list)

    return choices
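# Illustrative sketch (hypothetical trips): how the last inbound trip's purpose
# is derived above -- 'home' for ordinary tours, 'work' for atwork subtours.
import numpy as np
import pandas as pd

primary_purpose = pd.Series(['shopping', 'atwork'], index=[1001, 1002])
purpose = pd.Series(np.where(primary_purpose == 'atwork', 'work', 'home'),
                    index=primary_purpose.index)
# trip 1001 -> 'home', trip 1002 -> 'work'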
def compute_maz_tap_utilities(self, recipe, maz_od_df, chooser_attributes, leg, mode,
                              trace_label, trace):

    trace_label = tracing.extend_trace_label(trace_label, f'maz_tap_utils.{leg}')

    with chunk.chunk_log(trace_label):

        maz_tap_settings = \
            self.network_los.setting(f'TVPB_SETTINGS.{recipe}.maz_tap_settings.{mode}')
        chooser_columns = maz_tap_settings['CHOOSER_COLUMNS']
        attribute_columns = list(chooser_attributes.columns) if chooser_attributes is not None else []
        model_constants = self.network_los.setting(f'TVPB_SETTINGS.{recipe}.CONSTANTS')

        if leg == 'access':
            maz_col = 'omaz'
            tap_col = 'btap'
        else:
            maz_col = 'dmaz'
            tap_col = 'atap'

        # maz_to_tap access/egress utilities
        # deduped utilities_df - one row per chooser for each boarding tap (btap) accessible from omaz
        utilities_df = self.network_los.maz_to_tap_dfs[mode]

        utilities_df = utilities_df[chooser_columns]. \
            reset_index(drop=False). \
            rename(columns={'MAZ': maz_col, 'TAP': tap_col})
        utilities_df = pd.merge(
            maz_od_df[['idx', maz_col]].drop_duplicates(),
            utilities_df,
            on=maz_col, how='inner')

        # add any supplemental chooser attributes (e.g. demographic_segment, tod)
        for c in attribute_columns:
            utilities_df[c] = reindex(chooser_attributes[c], utilities_df['idx'])

        chunk.log_df(trace_label, "utilities_df", utilities_df)

        if self.units_for_recipe(recipe) == 'utility':

            utilities_df[leg] = compute_utilities(
                self.network_los,
                maz_tap_settings,
                utilities_df,
                model_constants=model_constants,
                trace_label=trace_label,
                trace=trace,
                trace_column_names=['idx', maz_col, tap_col] if trace else None)

            chunk.log_df(trace_label, "utilities_df", utilities_df)  # annotated

        else:

            assignment_spec = \
                assign.read_assignment_spec(file_name=config.config_file_path(maz_tap_settings['SPEC']))

            results, _, _ = assign.assign_variables(assignment_spec, utilities_df, model_constants)
            assert len(results.columns) == 1
            utilities_df[leg] = results

            chunk.log_df(trace_label, "utilities_df", utilities_df)

        if trace:
            self.trace_df(utilities_df, trace_label, 'utilities_df')

        # drop utility computation columns ('tod', 'demographic_segment' and maz_to_tap_df time/distance columns)
        utilities_df.drop(columns=attribute_columns + chooser_columns, inplace=True)

    return utilities_df
def atwork_subtour_frequency(tours, persons_merged, chunk_size, trace_hh_id):
    """
    This model predicts the frequency of making at-work subtours
    (alternatives for this model come from a separate csv file which is
    configured by the user).
    """

    trace_label = 'atwork_subtour_frequency'

    model_settings = config.read_model_settings('atwork_subtour_frequency.yaml')
    model_spec = simulate.read_model_spec(file_name='atwork_subtour_frequency.csv')

    alternatives = simulate.read_model_alts(
        config.config_file_path('atwork_subtour_frequency_alternatives.csv'),
        set_index='alt')

    tours = tours.to_frame()
    persons_merged = persons_merged.to_frame()

    work_tours = tours[tours.tour_type == 'work']

    # - if no work_tours
    if len(work_tours) == 0:
        add_null_results(trace_label, tours)
        return

    # merge persons into work_tours
    work_tours = pd.merge(work_tours, persons_merged,
                          left_on='person_id', right_index=True)

    logger.info("Running atwork_subtour_frequency with %d work tours", len(work_tours))

    nest_spec = config.get_logit_model_settings(model_settings)
    constants = config.get_model_constants(model_settings)

    # - preprocessor
    preprocessor_settings = model_settings.get('preprocessor', None)
    if preprocessor_settings:
        assign_columns(
            df=work_tours,
            model_settings=preprocessor_settings,
            trace_label=trace_label)

    choices = simulate.simple_simulate(
        choosers=work_tours,
        spec=model_spec,
        nest_spec=nest_spec,
        locals_d=constants,
        chunk_size=chunk_size,
        trace_label=trace_label,
        trace_choice_name='atwork_subtour_frequency')

    # convert indexes to alternative names
    choices = pd.Series(model_spec.columns[choices.values], index=choices.index)

    tracing.print_summary('atwork_subtour_frequency', choices, value_counts=True)

    # add atwork_subtour_frequency column to tours
    # reindex since we are working with a subset of tours
    tours['atwork_subtour_frequency'] = choices.reindex(tours.index)
    pipeline.replace_table("tours", tours)

    # - create atwork_subtours based on atwork_subtour_frequency choice names
    work_tours = tours[tours.tour_type == 'work']
    assert not work_tours.atwork_subtour_frequency.isnull().any()

    subtours = process_atwork_subtours(work_tours, alternatives)

    tours = pipeline.extend_table("tours", subtours)

    tracing.register_traceable_table('tours', subtours)
    pipeline.get_rn_generator().add_channel('tours', subtours)

    if trace_hh_id:
        tracing.trace_df(tours, label='atwork_subtour_frequency.tours')
def cdap_simulate(persons_merged, persons, households, chunk_size, trace_hh_id):
    """
    CDAP stands for Coordinated Daily Activity Pattern, which is a choice of
    high-level activity pattern for each person, in a coordinated way with other
    members of a person's household.

    Because Python requires vectorization of computation, there are some specialized
    routines in the cdap directory of activitysim for this purpose. This module
    simply applies those utilities using the simulation framework.
    """

    trace_label = 'cdap'
    model_settings = config.read_model_settings('cdap.yaml')

    cdap_indiv_spec = simulate.read_model_spec(file_name=model_settings['INDIV_AND_HHSIZE1_SPEC'])

    # Rules and coefficients for generating interaction specs for different household sizes
    cdap_interaction_coefficients = \
        pd.read_csv(config.config_file_path('cdap_interaction_coefficients.csv'), comment='#')

    """
    spec to compute/specify the relative proportions of each activity (M, N, H)
    that should be used to choose activities for additional household members
    not handled by CDAP

    This spec is handled much like an activitysim logit utility spec,
    EXCEPT that the values computed are relative proportions, not utilities
    (i.e. values are not exponentiated before being normalized to probabilities
    summing to 1.0)
    """
    cdap_fixed_relative_proportions = \
        simulate.read_model_spec(file_name=model_settings['FIXED_RELATIVE_PROPORTIONS_SPEC'])

    persons_merged = persons_merged.to_frame()

    constants = config.get_model_constants(model_settings)

    cdap_interaction_coefficients = \
        cdap.preprocess_interaction_coefficients(cdap_interaction_coefficients)

    # specs are built just-in-time on demand and cached as injectables
    # prebuilding here allows us to write them to the output directory
    # (also when multiprocessing locutor might not see all household sizes)
    logger.info("Pre-building cdap specs")
    for hhsize in range(2, cdap.MAX_HHSIZE + 1):
        spec = cdap.build_cdap_spec(cdap_interaction_coefficients, hhsize, cache=True)
        if inject.get_injectable('locutor', False):
            spec.to_csv(config.output_file_path('cdap_spec_%s.csv' % hhsize), index=True)

    estimator = estimation.manager.begin_estimation('cdap')
    if estimator:
        estimator.write_model_settings(model_settings, 'cdap.yaml')
        estimator.write_spec(model_settings, tag='INDIV_AND_HHSIZE1_SPEC')
        estimator.write_spec(model_settings=model_settings, tag='FIXED_RELATIVE_PROPORTIONS_SPEC')
        estimator.write_table(cdap_interaction_coefficients, 'interaction_coefficients',
                              index=False, append=False)
        estimator.write_choosers(persons_merged)
        for hhsize in range(2, cdap.MAX_HHSIZE + 1):
            spec = cdap.get_cached_spec(hhsize)
            estimator.write_table(spec, 'spec_%s' % hhsize, append=False)

    logger.info("Running cdap_simulate with %d persons", len(persons_merged.index))

    choices = cdap.run_cdap(
        persons=persons_merged,
        cdap_indiv_spec=cdap_indiv_spec,
        cdap_interaction_coefficients=cdap_interaction_coefficients,
        cdap_fixed_relative_proportions=cdap_fixed_relative_proportions,
        locals_d=constants,
        chunk_size=chunk_size,
        trace_hh_id=trace_hh_id,
        trace_label=trace_label)

    if estimator:
        estimator.write_choices(choices)
        choices = estimator.get_survey_values(choices, 'persons', 'cdap_activity')
        estimator.write_override_choices(choices)
        estimator.end_estimation()

    # - assign results to persons table and annotate
    persons = persons.to_frame()

    choices = choices.reindex(persons.index)
    persons['cdap_activity'] = choices

    expressions.assign_columns(
        df=persons,
        model_settings=model_settings.get('annotate_persons'),
        trace_label=tracing.extend_trace_label(trace_label, 'annotate_persons'))

    pipeline.replace_table("persons", persons)

    # - annotate households table
    households = households.to_frame()
    expressions.assign_columns(
        df=households,
        model_settings=model_settings.get('annotate_households'),
        trace_label=tracing.extend_trace_label(trace_label, 'annotate_households'))
    pipeline.replace_table("households", households)

    tracing.print_summary('cdap_activity', persons.cdap_activity, value_counts=True)
    logger.info("cdap crosstabs:\n%s" %
                pd.crosstab(persons.ptype, persons.cdap_activity, margins=True))
def non_mandatory_tour_frequency(persons, persons_merged, chunk_size, trace_hh_id):
    """
    This model predicts the frequency of making non-mandatory trips
    (alternatives for this model come from a separate csv file which is
    configured by the user) - these trips include escort, shopping, othmaint,
    othdiscr, eatout, and social trips in various combinations.
    """

    trace_label = 'non_mandatory_tour_frequency'
    model_settings = config.read_model_settings('non_mandatory_tour_frequency.yaml')
    model_spec = simulate.read_model_spec(file_name='non_mandatory_tour_frequency.csv')

    alternatives = simulate.read_model_alts(
        config.config_file_path('non_mandatory_tour_frequency_alternatives.csv'),
        set_index=None)

    choosers = persons_merged.to_frame()

    # FIXME kind of tacky both that we know to add this here and del it below
    # 'tot_tours' is used in model_spec expressions
    alternatives['tot_tours'] = alternatives.sum(axis=1)

    # - preprocessor
    preprocessor_settings = model_settings.get('preprocessor', None)
    if preprocessor_settings:

        locals_dict = {
            'person_max_window': person_max_window
        }

        expressions.assign_columns(
            df=choosers,
            model_settings=preprocessor_settings,
            locals_dict=locals_dict,
            trace_label=trace_label)

    # filter based on results of CDAP
    choosers = choosers[choosers.cdap_activity.isin(['M', 'N'])]

    logger.info("Running non_mandatory_tour_frequency with %d persons", len(choosers))

    constants = config.get_model_constants(model_settings)

    choices_list = []
    # segment by person type and pick the right spec for each person type
    for ptype, segment in choosers.groupby('ptype'):

        name = PTYPE_NAME[ptype]

        # pick the spec column for the segment
        spec = model_spec[[name]]

        # drop any zero-valued rows
        spec = spec[spec[name] != 0]

        logger.info("Running segment '%s' of size %d", name, len(segment))

        choices = interaction_simulate(
            segment,
            alternatives,
            spec=spec,
            locals_d=constants,
            chunk_size=chunk_size,
            trace_label='non_mandatory_tour_frequency.%s' % name,
            trace_choice_name='non_mandatory_tour_frequency')

        choices_list.append(choices)

        # FIXME - force garbage collection?
        # force_garbage_collect()

    choices = pd.concat(choices_list)

    del alternatives['tot_tours']  # del tot_tours column we added above

    # - add non_mandatory_tour_frequency column to persons
    persons = persons.to_frame()
    # need to reindex as we only handled persons with cdap_activity in ['M', 'N']
    # (we expect there to be an alt with no tours - which we can use to backfill non-travelers)
    no_tours_alt = alternatives[alternatives.sum(axis=1) == 0].index[0]
    persons['non_mandatory_tour_frequency'] = \
        choices.reindex(persons.index).fillna(no_tours_alt).astype(np.int8)

    """
    We have now generated non-mandatory tours, but they are attributes of the person table
    Now we create a "tours" table which has one row per tour that has been generated
    (and the person id it is associated with)
    """

    # - get counts of each of the alternatives (so we can extend)
    # (choices is just the index values for the chosen alts)
    """
               escort  shopping  othmaint  othdiscr  eatout  social
    parent_id
    2588676         2         0         0         1       1       0
    2588677         0         1         0         1       0       0
    """
    tour_counts = alternatives.loc[choices]
    tour_counts.index = choices.index  # assign person ids to the index

    prev_tour_count = tour_counts.sum().sum()

    # - extend_tour_counts
    tour_counts = extend_tour_counts(choosers, tour_counts, alternatives,
                                     trace_hh_id,
                                     tracing.extend_trace_label(trace_label, 'extend_tour_counts'))

    extended_tour_count = tour_counts.sum().sum()

    logger.info("extend_tour_counts increased nmtf tour count by %s from %s to %s" %
                (extended_tour_count - prev_tour_count, prev_tour_count, extended_tour_count))

    # - create the non_mandatory tours
    non_mandatory_tours = process_non_mandatory_tours(persons, tour_counts)
    assert len(non_mandatory_tours) == extended_tour_count

    pipeline.extend_table("tours", non_mandatory_tours)

    tracing.register_traceable_table('tours', non_mandatory_tours)
    pipeline.get_rn_generator().add_channel('tours', non_mandatory_tours)

    expressions.assign_columns(
        df=persons,
        model_settings=model_settings.get('annotate_persons'),
        trace_label=trace_label)

    pipeline.replace_table("persons", persons)

    tracing.print_summary('non_mandatory_tour_frequency',
                          persons.non_mandatory_tour_frequency, value_counts=True)

    if trace_hh_id:
        tracing.trace_df(non_mandatory_tours,
                         label="non_mandatory_tour_frequency.non_mandatory_tours",
                         warn_if_empty=True)

        tracing.trace_df(choosers,
                         label="non_mandatory_tour_frequency.choosers",
                         warn_if_empty=True)

        tracing.trace_df(persons,
                         label="non_mandatory_tour_frequency.annotated_persons",
                         warn_if_empty=True)
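# Illustrative sketch (hypothetical alternatives): turning chosen alternative row
# labels into per-person tour counts, as done above with alternatives.loc[choices],
# and locating the zero-tour alternative used to backfill non-travelers.
import pandas as pd

alternatives = pd.DataFrame({'escort': [0, 1, 0], 'shopping': [0, 0, 1]})
choices = pd.Series([1, 0], index=[100, 101])  # person_id -> chosen alt label

tour_counts = alternatives.loc[choices]
tour_counts.index = choices.index  # re-key rows by person_id

no_tours_alt = alternatives[alternatives.sum(axis=1) == 0].index[0]  # alt 0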
def trip_mode_choice(trips, tours_merged, network_los, chunk_size, trace_hh_id):
    """
    Trip mode choice - compute trip_mode (same values as for tour_mode) for each trip.

    Modes for each primary tour purpose are calculated separately because they have
    different coefficient values (stored in trip_mode_choice_coeffs.csv coefficient file.)

    Adds trip_mode column to trip table
    """

    trace_label = 'trip_mode_choice'
    model_settings = config.read_model_settings('trip_mode_choice.yaml')

    logsum_column_name = model_settings.get('MODE_CHOICE_LOGSUM_COLUMN_NAME')
    mode_column_name = 'trip_mode'

    model_spec = \
        simulate.read_model_spec(file_name=model_settings['SPEC'])
    omnibus_coefficients = \
        assign.read_constant_spec(config.config_file_path(model_settings['COEFFICIENTS']))

    trips_df = trips.to_frame()
    logger.info("Running %s with %d trips", trace_label, trips_df.shape[0])

    tours_merged = tours_merged.to_frame()
    tours_merged = tours_merged[model_settings['TOURS_MERGED_CHOOSER_COLUMNS']]

    nest_spec = config.get_logit_model_settings(model_settings)

    tracing.print_summary('primary_purpose', trips_df.primary_purpose, value_counts=True)

    # - trips_merged - merge trips and tours_merged
    trips_merged = pd.merge(
        trips_df,
        tours_merged,
        left_on='tour_id',
        right_index=True,
        how="left")
    assert trips_merged.index.equals(trips.index)

    # setup skim keys
    assert ('trip_period' not in trips_merged)
    trips_merged['trip_period'] = network_los.skim_time_period_label(trips_merged.depart)

    orig_col = 'origin'
    dest_col = 'destination'

    constants = {}
    constants.update(config.get_model_constants(model_settings))
    constants.update({
        'ORIGIN': orig_col,
        'DESTINATION': dest_col
    })

    skim_dict = network_los.get_default_skim_dict()

    odt_skim_stack_wrapper = skim_dict.wrap_3d(orig_key=orig_col, dest_key=dest_col,
                                               dim3_key='trip_period')
    dot_skim_stack_wrapper = skim_dict.wrap_3d(orig_key=dest_col, dest_key=orig_col,
                                               dim3_key='trip_period')
    od_skim_wrapper = skim_dict.wrap('origin', 'destination')

    skims = {
        "odt_skims": odt_skim_stack_wrapper,
        "dot_skims": dot_skim_stack_wrapper,
        "od_skims": od_skim_wrapper,
    }

    if network_los.zone_system == los.THREE_ZONE:
        # fixme - is this a lightweight object?
        tvpb = network_los.tvpb

        tvpb_logsum_odt = tvpb.wrap_logsum(orig_key=orig_col, dest_key=dest_col,
                                           tod_key='trip_period', segment_key='demographic_segment',
                                           cache_choices=True,
                                           trace_label=trace_label, tag='tvpb_logsum_odt')
        skims.update({
            'tvpb_logsum_odt': tvpb_logsum_odt,
            # 'tvpb_logsum_dot': tvpb_logsum_dot
        })

        # TVPB constants can appear in expressions
        constants.update(network_los.setting('TVPB_SETTINGS.tour_mode_choice.CONSTANTS'))

    choices_list = []
    for primary_purpose, trips_segment in trips_merged.groupby('primary_purpose'):

        segment_trace_label = tracing.extend_trace_label(trace_label, primary_purpose)

        logger.info("trip_mode_choice tour_type '%s' (%s trips)" %
                    (primary_purpose, len(trips_segment.index), ))

        # name index so tracing knows how to slice
        assert trips_segment.index.name == 'trip_id'

        if network_los.zone_system == los.THREE_ZONE:
            tvpb_logsum_odt.extend_trace_label(primary_purpose)
            # tvpb_logsum_dot.extend_trace_label(primary_purpose)

        locals_dict = assign.evaluate_constants(omnibus_coefficients[primary_purpose],
                                                constants=constants)
        locals_dict.update(constants)

        expressions.annotate_preprocessors(
            trips_segment, locals_dict, skims,
            model_settings, segment_trace_label)

        locals_dict.update(skims)

        choices = mode_choice_simulate(
            choosers=trips_segment,
            spec=model_spec,
            nest_spec=nest_spec,
            skims=skims,
            locals_d=locals_dict,
            chunk_size=chunk_size,
            mode_column_name=mode_column_name,
            logsum_column_name=logsum_column_name,
            trace_label=trace_label,
            trace_choice_name='trip_mode_choice')

        if trace_hh_id:
            # trace the coefficients
            tracing.trace_df(pd.Series(locals_dict),
                             label=tracing.extend_trace_label(segment_trace_label, 'constants'),
                             transpose=False,
                             slicer='NONE')

            # so we can trace with annotations
            assign_in_place(trips_segment, choices)

            tracing.trace_df(trips_segment,
                             label=tracing.extend_trace_label(segment_trace_label, 'trip_mode'),
                             slicer='tour_id',
                             index_label='tour_id',
                             warn_if_empty=True)

        choices_list.append(choices)

        # FIXME - force garbage collection
        force_garbage_collect()

    choices_df = pd.concat(choices_list)

    # add cached tvpb_logsum tap choices for modes specified in tvpb_mode_path_types
    if network_los.zone_system == los.THREE_ZONE:

        tvpb_mode_path_types = model_settings.get('tvpb_mode_path_types')
        for mode, path_type in tvpb_mode_path_types.items():

            skim_cache = tvpb_logsum_odt.cache[path_type]

            logger.debug(f"adding cached tap choices for mode {mode} path_type {path_type}")

            for c in skim_cache:

                dest_col = c

                if dest_col not in choices_df:
                    choices_df[dest_col] = np.nan

                choices_df[dest_col].where(choices_df[mode_column_name] != mode,
                                           skim_cache[c], inplace=True)

    # update trips table with choices (and optionally logsums)
    trips_df = trips.to_frame()
    assign_in_place(trips_df, choices_df)

    tracing.print_summary('tour_modes', trips_merged.tour_mode, value_counts=True)

    tracing.print_summary('trip_mode_choice choices',
                          trips_df[mode_column_name], value_counts=True)

    assert not trips_df[mode_column_name].isnull().any()

    pipeline.replace_table("trips", trips_df)

    if trace_hh_id:
        tracing.trace_df(trips_df,
                         label=tracing.extend_trace_label(trace_label, 'trip_mode'),
                         slicer='trip_id',
                         index_label='trip_id',
                         warn_if_empty=True)
def stop_frequency_alts(): # alt file for building trips even though simulation is simple_simulate not interaction_simulate file_path = config.config_file_path('stop_frequency_alternatives.csv') df = pd.read_csv(file_path, comment='#') df.set_index('alt', inplace=True) return df
def compute_accessibility(accessibility, network_los, land_use, trace_od):
    """
    Compute accessibility for each zone in land use file using expressions from accessibility_spec

    The actual results depend on the expressions in accessibility_spec, but this is initially
    intended to permit implementation of the mtc accessibility calculation as implemented by
    Accessibility.job

    Compute measures of accessibility used by the automobile ownership model.
    The accessibility measure first multiplies an employment variable by a mode-specific decay
    function. The product reflects the difficulty of accessing the activities the farther
    (in terms of round-trip travel time) the jobs are from the location in question. The products
    to each destination zone are next summed over each origin zone, and the logarithm of the
    product mutes large differences. The decay function on the walk accessibility measure is
    steeper than automobile or transit. The minimum accessibility is zero.
    """

    trace_label = 'compute_accessibility'
    model_settings = config.read_model_settings('accessibility.yaml')
    assignment_spec = assign.read_assignment_spec(config.config_file_path('accessibility.csv'))

    accessibility_df = accessibility.to_frame()

    logger.info("Running %s with %d dest zones" % (trace_label, len(accessibility_df)))

    constants = config.get_model_constants(model_settings)

    land_use_columns = model_settings.get('land_use_columns', [])
    land_use_df = land_use.to_frame()
    land_use_df = land_use_df[land_use_columns]

    # don't assume they are the same: accessibility may be sliced if we are multiprocessing
    orig_zones = accessibility_df.index.values
    dest_zones = land_use_df.index.values

    orig_zone_count = len(orig_zones)
    dest_zone_count = len(dest_zones)

    logger.info("Running %s with %d dest zones %d orig zones" %
                (trace_label, dest_zone_count, orig_zone_count))

    # create OD dataframe
    od_df = pd.DataFrame(
        data={
            'orig': np.repeat(orig_zones, dest_zone_count),
            'dest': np.tile(dest_zones, orig_zone_count)
        }
    )

    if trace_od:
        trace_orig, trace_dest = trace_od
        trace_od_rows = (od_df.orig == trace_orig) & (od_df.dest == trace_dest)
    else:
        trace_od_rows = None

    # merge land_use_columns into od_df
    od_df = pd.merge(od_df, land_use_df, left_on='dest', right_index=True).sort_index()

    locals_d = {
        'log': np.log,
        'exp': np.exp,
        'network_los': network_los,
    }

    skim_dict = network_los.get_default_skim_dict()
    locals_d['skim_od'] = skim_dict.wrap('orig', 'dest').set_df(od_df)
    locals_d['skim_do'] = skim_dict.wrap('dest', 'orig').set_df(od_df)

    if network_los.zone_system == los.THREE_ZONE:
        locals_d['tvpb'] = TransitVirtualPathBuilder(network_los)

    if constants is not None:
        locals_d.update(constants)

    results, trace_results, trace_assigned_locals \
        = assign.assign_variables(assignment_spec, od_df, locals_d, trace_rows=trace_od_rows)

    for column in results.columns:
        data = np.asanyarray(results[column])
        data.shape = (orig_zone_count, dest_zone_count)  # (o, d)
        accessibility_df[column] = np.log(np.sum(data, axis=1) + 1)

    logger.info(f"{trace_label} added {len(results.columns)} columns")

    # - write table to pipeline
    pipeline.replace_table("accessibility", accessibility_df)

    if trace_od:

        if not trace_od_rows.any():
            logger.warning(f"trace_od not found origin = {trace_orig}, dest = {trace_dest}")
        else:

            # add OD columns to trace results
            df = pd.concat([od_df[trace_od_rows], trace_results], axis=1)

            # dump the trace results table (with _temp variables) to aid debugging
            tracing.trace_df(df,
                             label='accessibility',
                             index_label='skim_offset',
                             slicer='NONE',
                             warn_if_empty=True)

            if trace_assigned_locals:
                tracing.write_csv(trace_assigned_locals, file_name="accessibility_locals")
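# Illustrative sketch (hypothetical zones): the OD cross-product and the
# log-of-row-sums aggregation used by the accessibility calculations -- each
# origin gets np.log(sum over destinations + 1), so zero access maps to zero.
import numpy as np
import pandas as pd

orig_zones = np.array([1, 2])
dest_zones = np.array([10, 20, 30])

od_df = pd.DataFrame({
    'orig': np.repeat(orig_zones, len(dest_zones)),  # 1 1 1 2 2 2
    'dest': np.tile(dest_zones, len(orig_zones)),    # 10 20 30 10 20 30
})

result = np.arange(6, dtype=float)                       # stand-in for one spec target
data = result.reshape(len(orig_zones), len(dest_zones))  # (orig, dest)
accessibility = np.log(data.sum(axis=1) + 1)             # one value per origin zone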
def compute_logsums(primary_purpose,
                    trips,
                    destination_sample,
                    tours_merged,
                    model_settings,
                    skims,
                    chunk_size,
                    trace_hh_id,
                    trace_label):
    """
    Calculate mode choice logsums using the same recipe as for trip_mode_choice, but do
    it twice for each alternative since we need out-of-direction logsum
    (i.e. origin to alt_dest, and alt_dest to half-tour destination)

    Returns
    -------
    adds od_logsum and dp_logsum columns to trips (in place)
    """
    trace_label = tracing.extend_trace_label(trace_label, 'compute_logsums')
    logger.info("Running %s with %d samples", trace_label, destination_sample.shape[0])

    # - trips_merged - merge trips and tours_merged
    trips_merged = pd.merge(
        trips,
        tours_merged,
        left_on='tour_id',
        right_index=True,
        how="left")
    assert trips_merged.index.equals(trips.index)

    # - choosers - merge destination_sample and trips_merged
    # re/set index because pandas merge does not preserve left index if it has duplicate values!
    choosers = pd.merge(destination_sample,
                        trips_merged.reset_index(),
                        left_index=True,
                        right_on='trip_id',
                        how="left",
                        suffixes=('', '_r')).set_index('trip_id')
    assert choosers.index.equals(destination_sample.index)

    logsum_settings = config.read_model_settings(model_settings['LOGSUM_SETTINGS'])
    omnibus_coefficient_spec = \
        assign.read_constant_spec(config.config_file_path(logsum_settings['COEFFS']))
    coefficient_spec = omnibus_coefficient_spec[primary_purpose]

    constants = config.get_model_constants(logsum_settings)
    locals_dict = assign.evaluate_constants(coefficient_spec, constants=constants)
    locals_dict.update(constants)

    # - od_logsums
    od_skims = {
        'ORIGIN': model_settings['TRIP_ORIGIN'],
        'DESTINATION': model_settings['ALT_DEST'],
        "odt_skims": skims['odt_skims'],
        "od_skims": skims['od_skims'],
    }
    destination_sample['od_logsum'] = compute_ood_logsums(
        choosers,
        logsum_settings,
        od_skims,
        locals_dict,
        chunk_size,
        trace_label=tracing.extend_trace_label(trace_label, 'od'))

    # - dp_logsums
    dp_skims = {
        'ORIGIN': model_settings['ALT_DEST'],
        'DESTINATION': model_settings['PRIMARY_DEST'],
        "odt_skims": skims['dpt_skims'],
        "od_skims": skims['dp_skims'],
    }
    destination_sample['dp_logsum'] = compute_ood_logsums(
        choosers,
        logsum_settings,
        dp_skims,
        locals_dict,
        chunk_size,
        trace_label=tracing.extend_trace_label(trace_label, 'dp'))
def compute_columns(df, model_settings, locals_dict={}, trace_label=None):
    """
    Evaluate expressions_spec in context of df, with optional additional pipeline tables in locals

    Parameters
    ----------
    df : pandas DataFrame
        or if None, expect name of pipeline table to be specified by DF in model_settings
    model_settings : dict or str
        dict with keys:
            DF - df_alias and (additionally, if df is None) name of pipeline table to load as df
            SPEC - name of expressions file (csv suffix optional), which need not match the
                   model settings name
            TABLES - list of pipeline tables to load and make available as (read only) locals
        str: name of yaml file in configs_dir to load dict from
    locals_dict : dict
        dict of locals (e.g. utility functions) to add to the execution environment
    trace_label

    Returns
    -------
    results: pandas.DataFrame
        one column for each expression (except temps with ALL_CAP target names)
        same index as df
    """

    if isinstance(model_settings, str):
        model_settings_name = model_settings
        model_settings = config.read_model_settings('%s.yaml' % model_settings)
        assert model_settings, "Found no model settings for %s" % model_settings_name
    else:
        model_settings_name = 'dict'
        assert isinstance(model_settings, dict)

    assert 'DF' in model_settings, \
        "Expected to find 'DF' in %s" % model_settings_name

    df_name = model_settings.get('DF')
    helper_table_names = model_settings.get('TABLES', [])
    expressions_spec_name = model_settings.get('SPEC', None)

    assert expressions_spec_name is not None, \
        "Expected to find 'SPEC' in %s" % model_settings_name

    trace_label = tracing.extend_trace_label(trace_label or '', expressions_spec_name)

    if not expressions_spec_name.endswith(".csv"):
        expressions_spec_name = '%s.csv' % expressions_spec_name
    logger.debug(f"{trace_label} compute_columns using expression spec file {expressions_spec_name}")
    expressions_spec = assign.read_assignment_spec(config.config_file_path(expressions_spec_name))

    assert expressions_spec.shape[0] > 0, \
        "Expected to find some assignment expressions in %s" % expressions_spec_name

    tables = {t: inject.get_table(t).to_frame() for t in helper_table_names}

    # if df was passed in, df might be a slice, or any other table, but DF is its local alias
    assert df_name not in tables, "Did not expect to find df '%s' in TABLES" % df_name
    tables[df_name] = df

    # be nice and also give it to them as df?
    tables['df'] = df

    _locals_dict = assign.local_utilities()
    _locals_dict.update(locals_dict)
    _locals_dict.update(tables)

    # FIXME a number of asim model preprocessors want skim_dict - should they request it in model_settings.TABLES?
    _locals_dict.update({
        # 'los': inject.get_injectable('network_los', None),
        'skim_dict': inject.get_injectable('skim_dict', None),
    })

    results, trace_results, trace_assigned_locals \
        = assign.assign_variables(expressions_spec,
                                  df,
                                  _locals_dict,
                                  trace_rows=tracing.trace_targets(df))

    if trace_results is not None:
        tracing.trace_df(trace_results, label=trace_label, slicer='NONE')

    if trace_assigned_locals:
        tracing.write_csv(trace_assigned_locals, file_name="%s_locals" % trace_label)

    return results
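# Illustrative sketch (hypothetical names): the minimal model_settings dict that
# compute_columns expects when df is passed in directly rather than loaded from
# the pipeline.
example_model_settings = {
    'DF': 'persons',             # alias the expressions use to refer to df
    'SPEC': 'annotate_persons',  # expressions file; '.csv' suffix optional
    'TABLES': ['households'],    # optional read-only pipeline tables
}
# results = compute_columns(persons_df, example_model_settings,
#                           trace_label='annotate_persons')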
def trip_purpose_probs(): f = config.config_file_path('trip_purpose_probs.csv') df = pd.read_csv(f, comment='#') return df
def compute_tap_tap_time(self, recipe, access_df, egress_df, chooser_attributes,
                         path_info, trace_label, trace):

    trace_label = tracing.extend_trace_label(trace_label, 'compute_tap_tap_time')

    with chunk.chunk_log(trace_label):

        model_constants = self.network_los.setting(f'TVPB_SETTINGS.{recipe}.CONSTANTS')
        tap_tap_settings = self.network_los.setting(f'TVPB_SETTINGS.{recipe}.tap_tap_settings')

        with memo("#TVPB CACHE compute_tap_tap_utilities all_transit_paths"):
            transit_df = self.all_transit_paths(access_df, egress_df, chooser_attributes,
                                                trace_label, trace)
            # note: transit_df index is arbitrary
            chunk.log_df(trace_label, "transit_df", transit_df)

        # some expressions may want to know access mode
        locals_dict = path_info.copy()
        locals_dict['los'] = self.network_los
        locals_dict.update(model_constants)

        assignment_spec = assign.read_assignment_spec(
            file_name=config.config_file_path(tap_tap_settings['SPEC']))

        DEDUPE = True
        if DEDUPE:

            # assign uid for reduping
            max_atap = transit_df.atap.max() + 1
            transit_df['uid'] = transit_df.btap * max_atap + transit_df.atap

            # dedupe
            chooser_attribute_columns = list(chooser_attributes.columns)
            unique_transit_df = \
                transit_df.loc[~transit_df.uid.duplicated(),
                               ['btap', 'atap', 'uid'] + chooser_attribute_columns]
            unique_transit_df.set_index('uid', inplace=True)
            chunk.log_df(trace_label, "unique_transit_df", unique_transit_df)

            logger.debug(
                f"#TVPB CACHE deduped transit_df from {len(transit_df)} to {len(unique_transit_df)}")

            # assign_variables
            results, _, _ = assign.assign_variables(assignment_spec, unique_transit_df, locals_dict)
            assert len(results.columns) == 1
            unique_transit_df['transit'] = results

            # redupe results back into transit_df
            with memo("#TVPB compute_tap_tap_time redupe transit_df"):
                transit_df['transit'] = reindex(unique_transit_df.transit, transit_df.uid)

            del transit_df['uid']
            del unique_transit_df
            chunk.log_df(trace_label, "transit_df", transit_df)
            chunk.log_df(trace_label, "unique_transit_df", None)

        else:
            results, _, _ = assign.assign_variables(assignment_spec, transit_df, locals_dict)
            assert len(results.columns) == 1
            transit_df['transit'] = results

        # filter out unavailable btap_atap pairs
        logger.debug(
            f"{(transit_df['transit'] <= 0).sum()} unavailable tap_tap pairs out of {len(transit_df)}")
        transit_df = transit_df[transit_df.transit > 0]

        transit_df.drop(columns=chooser_attributes.columns, inplace=True)

        chunk.log_df(trace_label, "transit_df", None)

    if trace:
        self.trace_df(transit_df, trace_label, 'transit_df')

    return transit_df
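# Illustrative sketch of the DEDUPE scheme above (hypothetical tap ids): encode
# each (btap, atap) pair as one integer uid, evaluate once per unique pair, then
# broadcast ("redupe") results back to every original row. Plain Series.map is
# used here in place of activitysim's reindex helper.
import pandas as pd

transit_df = pd.DataFrame({'btap': [1, 1, 2], 'atap': [10, 10, 11]})
max_atap = transit_df.atap.max() + 1                              # 12
transit_df['uid'] = transit_df.btap * max_atap + transit_df.atap  # 22 22 35

unique_df = transit_df.loc[~transit_df.uid.duplicated()].set_index('uid')
unique_df['transit'] = [30.0, 45.0]  # stand-in for assign_variables results

transit_df['transit'] = transit_df.uid.map(unique_df.transit)  # 30 30 45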
def best_transit_path_spec(): return assign.read_assignment_spec(config.config_file_path('best_transit_path.csv'))
def compute_accessibility(accessibility, skim_dict, land_use, trace_od):
    """
    Compute accessibility for each zone in land use file using expressions from accessibility_spec

    The actual results depend on the expressions in accessibility_spec, but this is initially
    intended to permit implementation of the mtc accessibility calculation as implemented by
    Accessibility.job

    Compute measures of accessibility used by the automobile ownership model.
    The accessibility measure first multiplies an employment variable by a mode-specific decay
    function. The product reflects the difficulty of accessing the activities the farther
    (in terms of round-trip travel time) the jobs are from the location in question. The products
    to each destination zone are next summed over each origin zone, and the logarithm of the
    product mutes large differences. The decay function on the walk accessibility measure is
    steeper than automobile or transit. The minimum accessibility is zero.
    """

    trace_label = 'compute_accessibility'
    model_settings = config.read_model_settings('accessibility.yaml')
    assignment_spec = assign.read_assignment_spec(config.config_file_path('accessibility.csv'))

    accessibility_df = accessibility.to_frame()

    logger.info("Running %s with %d dest zones" % (trace_label, len(accessibility_df)))

    constants = config.get_model_constants(model_settings)
    land_use_columns = model_settings.get('land_use_columns', [])

    land_use_df = land_use.to_frame()

    orig_zones = accessibility_df.index.values
    dest_zones = land_use_df.index.values

    orig_zone_count = len(orig_zones)
    dest_zone_count = len(dest_zones)

    logger.info("Running %s with %d dest zones %d orig zones" %
                (trace_label, dest_zone_count, orig_zone_count))

    # create OD dataframe
    od_df = pd.DataFrame(
        data={
            'orig': np.repeat(orig_zones, dest_zone_count),
            'dest': np.tile(dest_zones, orig_zone_count)
        }
    )

    if trace_od:
        trace_orig, trace_dest = trace_od
        trace_od_rows = (od_df.orig == trace_orig) & (od_df.dest == trace_dest)
    else:
        trace_od_rows = None

    # merge land_use_columns into od_df
    land_use_df = land_use_df[land_use_columns]
    od_df = pd.merge(od_df, land_use_df, left_on='dest', right_index=True).sort_index()

    locals_d = {
        'log': np.log,
        'exp': np.exp,
        'skim_od': AccessibilitySkims(skim_dict, orig_zones, dest_zones),
        'skim_do': AccessibilitySkims(skim_dict, orig_zones, dest_zones, transpose=True)
    }
    if constants is not None:
        locals_d.update(constants)

    results, trace_results, trace_assigned_locals \
        = assign.assign_variables(assignment_spec, od_df, locals_d, trace_rows=trace_od_rows)

    for column in results.columns:
        data = np.asanyarray(results[column])
        data.shape = (orig_zone_count, dest_zone_count)
        accessibility_df[column] = np.log(np.sum(data, axis=1) + 1)

    # - write table to pipeline
    pipeline.replace_table("accessibility", accessibility_df)

    if trace_od:

        if not trace_od_rows.any():
            logger.warning("trace_od not found origin = %s, dest = %s" % (trace_orig, trace_dest))
        else:

            # add OD columns to trace results
            df = pd.concat([od_df[trace_od_rows], trace_results], axis=1)

            # dump the trace results table (with _temp variables) to aid debugging
            tracing.trace_df(df,
                             label='accessibility',
                             index_label='skim_offset',
                             slicer='NONE',
                             warn_if_empty=True)

            if trace_assigned_locals:
                tracing.write_csv(trace_assigned_locals, file_name="accessibility_locals")
def trip_scheduling(trips, tours, chunk_size, trace_hh_id):
    """
    Trip scheduling assigns depart times for trips within the start, end limits of the tour.

    The algorithm is simplistic:

    The first outbound trip starts at the tour start time, and subsequent outbound trips are
    processed in trip_num order, to ensure that subsequent trips do not depart before the
    trip that precedes them.

    Inbound trips are handled similarly, except in reverse order, starting with the last trip,
    and working backwards to ensure that inbound trips do not depart after the trip that
    succeeds them.

    The probability spec assigns probabilities for depart times, but those possible departs must
    be clipped to disallow depart times outside the tour limits, the departs of prior trips, and
    in the case of work tours, the start/end times of any atwork subtours.

    Scheduling can fail if the probability table assigns zero probabilities to all the available
    depart times in a trip's depart window. (This could be avoided by giving every window a small
    probability, rather than zero, but the existing mtctm1 prob spec does not do this. I believe
    this is due to its having been generated from a small household travel survey sample that
    lacked any departs for some time periods.)

    Rescheduling the trips that fail (along with their inbound or outbound leg-mates) can
    sometimes fix this problem, if it was caused by an earlier trip's depart choice blocking a
    subsequent trip's ability to schedule a depart within the resulting window. But it can also
    happen if a tour is very short (e.g. one time period) and the prob spec has a zero
    probability for that tour hour.

    Therefore we need to handle trips that could not be scheduled. There are two ways (at least)
    to solve this problem:

    1) CHOOSE_MOST_INITIAL
    simply assign a depart time to the trip, even if it has a zero probability. It makes most
    sense, in this case, to assign the 'most initial' depart time, so that subsequent trips
    are minimally impacted. This can be done in the final iteration, thus affecting only the
    trips that could not be scheduled by the standard approach

    2) drop_and_cleanup
    drop trips that could not be scheduled, and adjust their leg mates, as is done for failed
    trips in trip_destination.

    For now we are choosing among these approaches with a manifest constant, but this could
    be made a model setting...
    """
    trace_label = "trip_scheduling"

    model_settings = config.read_model_settings('trip_scheduling.yaml')
    assert 'DEPART_ALT_BASE' in model_settings
    failfix = model_settings.get(FAILFIX, FAILFIX_DEFAULT)

    probs_spec = pd.read_csv(
        config.config_file_path('trip_scheduling_probs.csv'), comment='#')

    trips_df = trips.to_frame()
    tours = tours.to_frame()

    # add tour-based chunk_id so we can chunk all trips in tour together
    trips_df['chunk_id'] = \
        reindex(pd.Series(list(range(tours.shape[0])), tours.index), trips_df.tour_id)

    max_iterations = model_settings.get('MAX_ITERATIONS', 1)
    assert max_iterations > 0

    choices_list = []
    i = 0
    while (i < max_iterations) and not trips_df.empty:

        i += 1
        last_iteration = (i == max_iterations)

        trace_label_i = tracing.extend_trace_label(trace_label, "i%s" % i)
        logger.info("%s scheduling %s trips", trace_label_i, trips_df.shape[0])

        choices = \
            run_trip_scheduling(
                trips_df,
                tours,
                probs_spec,
                model_settings,
                last_iteration=last_iteration,
                trace_hh_id=trace_hh_id,
                chunk_size=chunk_size,
                trace_label=trace_label_i)

        # boolean series of trips whose individual trip scheduling failed
        failed = choices.reindex(trips_df.index).isnull()
        logger.info("%s %s failed", trace_label_i, failed.sum())

        if not last_iteration:
            # boolean series of trips whose leg scheduling failed
            failed_cohorts = failed_trip_cohorts(trips_df, failed)
            trips_df = trips_df[failed_cohorts]
            choices = choices[~failed_cohorts]

        choices_list.append(choices)

    trips_df = trips.to_frame()

    choices = pd.concat(choices_list)
    choices = choices.reindex(trips_df.index)
    if choices.isnull().any():
        logger.warning("%s of %s trips could not be scheduled after %s iterations" %
                       (choices.isnull().sum(), trips_df.shape[0], i))

        if failfix != FAILFIX_DROP_AND_CLEANUP:
            raise RuntimeError("%s setting '%s' not enabled in settings" %
                               (FAILFIX, FAILFIX_DROP_AND_CLEANUP))

        trips_df['failed'] = choices.isnull()
        trips_df = cleanup_failed_trips(trips_df)
        choices = choices.reindex(trips_df.index)

    trips_df['depart'] = choices

    assert not trips_df.depart.isnull().any()

    pipeline.replace_table("trips", trips_df)
def trip_mode_choice(
        trips,
        tours_merged,
        skim_dict, skim_stack,
        chunk_size, trace_hh_id):
    """
    Trip mode choice - compute trip_mode (same values as for tour_mode) for each trip.

    Modes for each primary tour purpose are calculated separately because they have
    different coefficient values (stored in trip_mode_choice_coeffs.csv coefficient file.)

    Adds trip_mode column to trip table
    """
    trace_label = 'trip_mode_choice'
    model_settings = config.read_model_settings('trip_mode_choice.yaml')

    model_spec = \
        simulate.read_model_spec(file_name=model_settings['SPEC'])
    omnibus_coefficients = \
        assign.read_constant_spec(config.config_file_path(model_settings['COEFFS']))

    trips_df = trips.to_frame()
    logger.info("Running %s with %d trips", trace_label, trips_df.shape[0])

    tours_merged = tours_merged.to_frame()
    tours_merged = tours_merged[model_settings['TOURS_MERGED_CHOOSER_COLUMNS']]

    nest_spec = config.get_logit_model_settings(model_settings)

    tracing.print_summary('primary_purpose',
                          trips_df.primary_purpose, value_counts=True)

    # - trips_merged - merge trips and tours_merged
    trips_merged = pd.merge(
        trips_df,
        tours_merged,
        left_on='tour_id',
        right_index=True,
        how="left")
    assert trips_merged.index.equals(trips.index)

    # setup skim keys
    assert ('trip_period' not in trips_merged)
    trips_merged['trip_period'] = skim_time_period_label(trips_merged.depart)

    orig_col = 'origin'
    dest_col = 'destination'

    odt_skim_stack_wrapper = skim_stack.wrap(left_key=orig_col, right_key=dest_col,
                                             skim_key='trip_period')
    od_skim_wrapper = skim_dict.wrap('origin', 'destination')

    skims = {
        "odt_skims": odt_skim_stack_wrapper,
        "od_skims": od_skim_wrapper,
    }

    constants = config.get_model_constants(model_settings)
    constants.update({
        'ORIGIN': orig_col,
        'DESTINATION': dest_col
    })

    choices_list = []
    for primary_purpose, trips_segment in trips_merged.groupby('primary_purpose'):

        segment_trace_label = tracing.extend_trace_label(trace_label, primary_purpose)

        logger.info("trip_mode_choice tour_type '%s' (%s trips)" %
                    (primary_purpose, len(trips_segment.index), ))

        # name index so tracing knows how to slice
        assert trips_segment.index.name == 'trip_id'

        locals_dict = assign.evaluate_constants(omnibus_coefficients[primary_purpose],
                                                constants=constants)
        locals_dict.update(constants)

        annotate_preprocessors(
            trips_segment, locals_dict, skims,
            model_settings, segment_trace_label)

        locals_dict.update(skims)

        choices = simulate.simple_simulate(
            choosers=trips_segment,
            spec=model_spec,
            nest_spec=nest_spec,
            skims=skims,
            locals_d=locals_dict,
            chunk_size=chunk_size,
            trace_label=segment_trace_label,
            trace_choice_name='trip_mode_choice')

        alts = model_spec.columns
        choices = choices.map(dict(list(zip(list(range(len(alts))), alts))))

        # tracing.print_summary('trip_mode_choice %s choices' % primary_purpose,
        #                       choices, value_counts=True)

        if trace_hh_id:
            # trace the coefficients
            tracing.trace_df(pd.Series(locals_dict),
                             label=tracing.extend_trace_label(segment_trace_label, 'constants'),
                             transpose=False,
                             slicer='NONE')

            # so we can trace with annotations
            trips_segment['trip_mode'] = choices

            tracing.trace_df(trips_segment,
                             label=tracing.extend_trace_label(segment_trace_label, 'trip_mode'),
                             slicer='tour_id',
                             index_label='tour_id',
                             warn_if_empty=True)

        choices_list.append(choices)

        # FIXME - force garbage collection
        force_garbage_collect()

    choices = pd.concat(choices_list)

    trips_df = trips.to_frame()
    trips_df['trip_mode'] = choices

    tracing.print_summary('tour_modes',
                          trips_merged.tour_mode, value_counts=True)

    tracing.print_summary('trip_mode_choice choices',
                          choices, value_counts=True)

    assert not trips_df.trip_mode.isnull().any()

    pipeline.replace_table("trips", trips_df)

    if trace_hh_id:
        tracing.trace_df(trips_df,
                         label=tracing.extend_trace_label(trace_label, 'trip_mode'),
                         slicer='trip_id',
                         index_label='trip_id',
                         warn_if_empty=True)
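
# A minimal sketch (hypothetical values) of the positional-index-to-name mapping
# used in trip_mode_choice above: simple_simulate returns the column position of
# the chosen alternative in the spec, and map() recovers the mode name.
def _choice_mapping_sketch():
    import pandas as pd

    alts = ['DRIVEALONE', 'SHARED2', 'WALK']  # hypothetical spec columns
    choices = pd.Series([2, 0, 1], index=pd.Index([1, 2, 3], name='trip_id'))

    # same idiom as choices.map(dict(list(zip(list(range(len(alts))), alts))))
    named = choices.map(dict(zip(range(len(alts)), alts)))

    # trip 1 -> 'WALK', trip 2 -> 'DRIVEALONE', trip 3 -> 'SHARED2'
    return named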
def mandatory_tour_frequency(persons_merged, chunk_size, trace_hh_id):
    """
    This model predicts the frequency of making mandatory trips
    (see the alternatives above) - these trips include work and school
    in some combination.
    """
    trace_label = 'mandatory_tour_frequency'

    model_settings = config.read_model_settings('mandatory_tour_frequency.yaml')
    model_spec = simulate.read_model_spec(file_name='mandatory_tour_frequency.csv')
    alternatives = simulate.read_model_alts(
        config.config_file_path('mandatory_tour_frequency_alternatives.csv'),
        set_index='alt')

    choosers = persons_merged.to_frame()
    # filter based on results of CDAP
    choosers = choosers[choosers.cdap_activity == 'M']
    logger.info("Running mandatory_tour_frequency with %d persons", len(choosers))

    # - if no mandatory tours
    if choosers.shape[0] == 0:
        add_null_results(trace_label, model_settings)
        return

    # - preprocessor
    preprocessor_settings = model_settings.get('preprocessor', None)
    if preprocessor_settings:

        locals_dict = {}

        expressions.assign_columns(
            df=choosers,
            model_settings=preprocessor_settings,
            locals_dict=locals_dict,
            trace_label=trace_label)

    nest_spec = config.get_logit_model_settings(model_settings)
    constants = config.get_model_constants(model_settings)

    choices = simulate.simple_simulate(
        choosers=choosers,
        spec=model_spec,
        nest_spec=nest_spec,
        locals_d=constants,
        chunk_size=chunk_size,
        trace_label=trace_label,
        trace_choice_name='mandatory_tour_frequency')

    # convert indexes to alternative names
    choices = pd.Series(
        model_spec.columns[choices.values],
        index=choices.index).reindex(persons_merged.local.index)

    # - create mandatory tours
    """
    This reprocesses the choice of index of the mandatory tour frequency
    alternatives into an actual dataframe of tours.  Ending format is
    the same as for non_mandatory_tours except trip types are "work" and "school"
    """
    choosers['mandatory_tour_frequency'] = choices
    mandatory_tours = process_mandatory_tours(
        persons=choosers,
        mandatory_tour_frequency_alts=alternatives
    )

    tours = pipeline.extend_table("tours", mandatory_tours)
    tracing.register_traceable_table('tours', mandatory_tours)
    pipeline.get_rn_generator().add_channel('tours', mandatory_tours)

    # - annotate persons
    persons = inject.get_table('persons').to_frame()

    # need to reindex as we only handled persons with cdap_activity == 'M'
    persons['mandatory_tour_frequency'] = \
        choices.reindex(persons.index).fillna('').astype(str)

    expressions.assign_columns(
        df=persons,
        model_settings=model_settings.get('annotate_persons'),
        trace_label=tracing.extend_trace_label(trace_label, 'annotate_persons'))

    pipeline.replace_table("persons", persons)

    tracing.print_summary('mandatory_tour_frequency', persons.mandatory_tour_frequency,
                          value_counts=True)

    if trace_hh_id:
        tracing.trace_df(mandatory_tours,
                         label="mandatory_tour_frequency.mandatory_tours",
                         warn_if_empty=True)

        tracing.trace_df(persons,
                         label="mandatory_tour_frequency.persons",
                         warn_if_empty=True)
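
# A minimal sketch (hypothetical ids) of the reindex-and-fill step above: only
# persons with cdap_activity == 'M' receive a choice, so reindexing to the full
# persons index leaves NaN for everyone else, which fillna('') blanks out.
def _mtf_reindex_sketch():
    import pandas as pd

    choices = pd.Series(['work1', 'work_and_school'],
                        index=pd.Index([10, 30], name='person_id'))
    persons_index = pd.Index([10, 20, 30, 40], name='person_id')

    mtf = choices.reindex(persons_index).fillna('').astype(str)
    # persons 20 and 40 made no mandatory tours and get ''
    return mtf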
def trip_scheduling(trips, tours, chunk_size, trace_hh_id):
    """
    Trip scheduling assigns depart times for trips within the start, end limits of the tour.

    The algorithm is simplistic:

    The first outbound trip starts at the tour start time, and subsequent outbound trips are
    processed in trip_num order, to ensure that subsequent trips do not depart before the
    trip that precedes them.

    Inbound trips are handled similarly, except in reverse order, starting with the last trip,
    and working backwards to ensure that inbound trips do not depart after the trip that
    succeeds them.

    The probability spec assigns probabilities for depart times, but those possible departs
    must be clipped to disallow depart times outside the tour limits, the departs of prior
    trips, and in the case of work tours, the start/end times of any atwork subtours.

    Scheduling can fail if the probability table assigns zero probabilities to all the
    available depart times in a trip's depart window. (This could be avoided by giving every
    window a small probability, rather than zero, but the existing mtctm1 prob spec does not
    do this. I believe this is due to its having been generated from a small household travel
    survey sample that lacked any departs for some time periods.)

    Rescheduling the trips that fail (along with their inbound or outbound leg-mates) can
    sometimes fix this problem, if it was caused by an earlier trip's depart choice blocking
    a subsequent trip's ability to schedule a depart within the resulting window. But it can
    also happen if a tour is very short (e.g. one time period) and the prob spec has a zero
    probability for that tour hour.

    Therefore we need to handle trips that could not be scheduled. There are two ways
    (at least) to solve this problem:

    1) choose_most_initial
    simply assign a depart time to the trip, even if it has a zero probability. It makes
    most sense, in this case, to assign the 'most initial' depart time, so that subsequent
    trips are minimally impacted. This can be done in the final iteration, thus affecting
    only the trips that could not be scheduled by the standard approach

    2) drop_and_cleanup
    drop trips that could not be scheduled, and adjust their leg mates, as is done for
    failed trips in trip_destination.

    Which option is applied is determined by the FAILFIX model setting
    """
    trace_label = "trip_scheduling"
    model_settings_file_name = 'trip_scheduling.yaml'
    model_settings = config.read_model_settings(model_settings_file_name)

    trips_df = trips.to_frame()
    tours = tours.to_frame()

    # add columns 'tour_hour', 'earliest', 'latest' to trips
    set_tour_hour(trips_df, tours)

    # trip_scheduling is a probabilistic model and we don't support estimation,
    # but we do need to override choices in estimation mode
    estimator = estimation.manager.begin_estimation('trip_scheduling')
    if estimator:
        estimator.write_spec(model_settings, tag='PROBS_SPEC')
        estimator.write_model_settings(model_settings, model_settings_file_name)
        chooser_cols_for_estimation = [
            'person_id', 'household_id', 'tour_id', 'trip_num', 'trip_count',
            'primary_purpose', 'outbound', 'earliest', 'latest', 'tour_hour',
        ]
        estimator.write_choosers(trips_df[chooser_cols_for_estimation])

    probs_spec = pd.read_csv(
        config.config_file_path('trip_scheduling_probs.csv'), comment='#')
    # FIXME for now, not really doing estimation for probabilistic model - just overwriting choices
    # besides, it isn't clear that named coefficients would be helpful if we had some form of estimation
    # coefficients_df = simulate.read_model_coefficients(model_settings)
    # probs_spec = map_coefficients(probs_spec, coefficients_df)

    # add tour-based chunk_id so we can chunk all trips in tour together
    trips_df['chunk_id'] = reindex(
        pd.Series(list(range(len(tours))), tours.index), trips_df.tour_id)

    assert 'DEPART_ALT_BASE' in model_settings
    failfix = model_settings.get(FAILFIX, FAILFIX_DEFAULT)

    max_iterations = model_settings.get('MAX_ITERATIONS', 1)
    assert max_iterations > 0

    choices_list = []

    for chunk_i, trips_chunk, chunk_trace_label in \
            chunk.adaptive_chunked_choosers_by_chunk_id(trips_df, chunk_size, trace_label, trace_label):

        i = 0
        while (i < max_iterations) and not trips_chunk.empty:

            # only chunk log first iteration since memory use declines with each iteration
            with chunk.chunk_log(trace_label) if i == 0 else chunk.chunk_log_skip():

                i += 1
                is_last_iteration = (i == max_iterations)

                trace_label_i = tracing.extend_trace_label(trace_label, "i%s" % i)
                logger.info("%s scheduling %s trips within chunk %s",
                            trace_label_i, trips_chunk.shape[0], chunk_i)

                choices = \
                    run_trip_scheduling(
                        trips_chunk,
                        tours,
                        probs_spec,
                        model_settings,
                        estimator=estimator,
                        is_last_iteration=is_last_iteration,
                        trace_hh_id=trace_hh_id,
                        chunk_size=chunk_size,
                        chunk_tag=trace_label,
                        trace_label=trace_label_i)

                # boolean series of trips whose individual trip scheduling failed
                failed = choices.reindex(trips_chunk.index).isnull()
                logger.info("%s %s failed", trace_label_i, failed.sum())

                if not is_last_iteration:
                    # boolean series of trips whose leg scheduling failed
                    failed_cohorts = failed_trip_cohorts(trips_chunk, failed)
                    trips_chunk = trips_chunk[failed_cohorts]
                    choices = choices[~failed_cohorts]

                choices_list.append(choices)

    trips_df = trips.to_frame()

    choices = pd.concat(choices_list)
    choices = choices.reindex(trips_df.index)

    if estimator:
        estimator.write_choices(choices)
        choices = estimator.get_survey_values(choices, 'trips', 'depart')  # override choices
        estimator.write_override_choices(choices)
        estimator.end_estimation()
        assert not choices.isnull().any()

    if choices.isnull().any():
        logger.warning(
            "%s of %s trips could not be scheduled after %s iterations" %
            (choices.isnull().sum(), trips_df.shape[0], i))

        if failfix != FAILFIX_DROP_AND_CLEANUP:
            raise RuntimeError("%s setting '%s' not enabled in settings" %
                               (FAILFIX, FAILFIX_DROP_AND_CLEANUP))

        trips_df['failed'] = choices.isnull()
        trips_df = cleanup_failed_trips(trips_df)
        choices = choices.reindex(trips_df.index)

    trips_df['depart'] = choices

    assert not trips_df.depart.isnull().any()

    pipeline.replace_table("trips", trips_df)
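
# A minimal sketch (hypothetical spec layout) of the depart-window clipping the
# docstring above describes: probabilities outside a trip's [earliest, latest]
# window are zeroed before sampling, and a window with no remaining probability
# mass is exactly the failure case the iteration loop retries. The column
# layout and the function name are illustrative assumptions, not the model's
# actual helpers.
def _clip_probs_sketch():
    import pandas as pd

    # one row of depart probabilities for hours 5..8 (columns are depart hours)
    probs = pd.DataFrame([[0.1, 0.4, 0.4, 0.1]], columns=[5, 6, 7, 8])

    earliest, latest = 6, 7  # trip's feasible depart window

    clipped = probs.copy()
    for hour in clipped.columns:
        if hour < earliest or hour > latest:
            clipped[hour] = 0.0

    row_sums = clipped.sum(axis=1)
    if (row_sums == 0).any():
        # zero mass left in the window: scheduling fails for this trip
        return None

    return clipped.div(row_sums, axis=0)  # renormalized probabilities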
def buffer_zones_spec(buffer_zones_settings):
    spec_path = config.config_file_path(
        buffer_zones_settings['buffer_zones_spec'])
    return buffer.read_buffer_spec(spec_path)
def cdap_interaction_coefficients():
    """
    Rules and coefficients for generating interaction specs for different household sizes
    """
    f = config.config_file_path('cdap_interaction_coefficients.csv')
    return pd.read_csv(f, comment='#')