def vectorize_tour_scheduling(tours, alts, spec, constants=None,
                              chunk_size=0, trace_label=None):
    """
    The purpose of this method is fairly straightforward - it takes tours
    and schedules them into time slots. Alternatives should be specified so
    as to define those time slots (usually with start and end times).

    The difficulty of doing this in Python is that subsequent tours are
    dependent on certain characteristics of previous tours for the same
    person. This is a problem with Python's vectorization requirement, so
    this method does all the 1st tours, then all the 2nd tours, and so forth.

    This method also adds variables that can be used in the spec which have
    to do with the previous tours per person. Every column in the
    alternatives table is appended with the suffix "_previous" and made
    available. So if your alternatives table has columns for start and end,
    then start_previous and end_previous will be set to the start and end of
    the most recent tour for a person. The first time through,
    start_previous and end_previous are undefined, so make sure to protect
    with a tour_num >= 2 in the variable computation.

    Parameters
    ----------
    tours : DataFrame
        DataFrame of tours containing tour attributes, as well as a person_id
        column to define the nth tour for each person.
    alts : DataFrame
        DataFrame of alternatives which represent time slots. Will be passed
        to interaction_simulate in batches for each nth tour.
    spec : DataFrame
        The spec which will be passed to interaction_simulate.
    constants : dict, optional
        Constants passed to interaction_simulate as locals_d. Defaults to an
        empty dict. (Was a mutable default argument ``{}``, which is shared
        across calls and can leak state between callers - now a None sentinel.)
    chunk_size : int
        passed through to interaction_simulate
    trace_label : str, optional
        label for tracing/logging; falsy disables per-batch logging

    Returns
    -------
    choices : Series
        A Series of choices where the index is the index of the tours
        DataFrame and the values are the index of the alts DataFrame.
    """

    # avoid the shared-mutable-default-argument pitfall
    if constants is None:
        constants = {}

    max_num_trips = tours.groupby('person_id').size().max()

    # groupby().size().max() is NaN iff tours is empty - return empty result
    if np.isnan(max_num_trips):
        s = pd.Series()
        s.index.name = 'tour_id'
        return s

    # because this is Python, we have to vectorize everything by doing the
    # "nth" trip for each person in a for loop (in other words, because each
    # trip is dependent on the time windows left by the previous decision) -
    # hopefully this will work out ok!
    choices = []

    # keep a series of the most recent tour choice for each person,
    # seeded with the first alternative's index value
    previous_tour_by_personid = pd.Series(
        pd.Series(alts.index).iloc[0], index=tours.person_id.unique())

    for i in range(max_num_trips):

        # this reset_index / set_index stuff keeps the index as the tours
        # index rather that switching to person_id as the index which is
        # what happens when you groupby person_id
        index_name = tours.index.name or 'index'
        nth_tours = tours.reset_index().\
            groupby('person_id').nth(i).reset_index().set_index(index_name)
        nth_tours.index.name = 'tour_id'

        if trace_label:
            logger.info("%s running %d #%d tour choices" %
                        (trace_label, len(nth_tours), i + 1))

        # tour num can be set by the user, but if it isn't we set it here
        if "tour_num" not in nth_tours:
            nth_tours["tour_num"] = i + 1

        # append the "_previous" columns derived from the prior choice
        nth_tours = nth_tours.join(get_previous_tour_by_tourid(
            nth_tours.person_id, previous_tour_by_personid, alts))

        tour_trace_label = tracing.extend_trace_label(trace_label, 'tour_%s' % i)

        nth_choices = interaction_simulate(
            nth_tours,
            alts.copy(),
            spec,
            locals_d=constants,
            chunk_size=chunk_size,
            trace_label=tour_trace_label
        )

        choices.append(nth_choices)

        # remember this choice so the (i+1)th batch can see it
        previous_tour_by_personid.loc[nth_tours.person_id] = nth_choices.values

    choices = pd.concat(choices)

    # return the concatenated choices
    return choices
def non_mandatory_tour_frequency(persons, persons_merged, chunk_size, trace_hh_id):
    """
    This model predicts the frequency of making non-mandatory trips
    (alternatives for this model come from a separate csv file which is
    configured by the user) - these trips include escort, shopping, othmaint,
    othdiscr, eatout, and social trips in various combination.

    Parameters
    ----------
    persons, persons_merged :
        pipeline tables exposing .to_frame() (presumably injected table
        wrappers - TODO confirm against the pipeline decorator)
    chunk_size : int
        passed through to interaction_simulate
    trace_hh_id :
        household id to trace (falsy disables tracing)

    Side effects: extends the "tours" pipeline table, replaces the "persons"
    table, and registers the new tours as a traceable table / rng channel.
    """

    trace_label = 'non_mandatory_tour_frequency'
    model_settings_file_name = 'non_mandatory_tour_frequency.yaml'

    model_settings = config.read_model_settings(model_settings_file_name)

    # FIXME kind of tacky both that we know to add this here and del it below
    # 'tot_tours' is used in model_spec expressions
    alternatives = simulate.read_model_alts(
        'non_mandatory_tour_frequency_alternatives.csv', set_index=None)
    alternatives['tot_tours'] = alternatives.sum(axis=1)

    # filter based on results of CDAP - only persons with a Mandatory ('M')
    # or Non-mandatory ('N') daily activity pattern are choosers
    choosers = persons_merged.to_frame()
    choosers = choosers[choosers.cdap_activity.isin(['M', 'N'])]

    # - preprocessor: annotate choosers with expression-file columns, if configured
    preprocessor_settings = model_settings.get('preprocessor', None)
    if preprocessor_settings:
        locals_dict = {'person_max_window': person_max_window}
        expressions.assign_columns(df=choosers,
                                   model_settings=preprocessor_settings,
                                   locals_dict=locals_dict,
                                   trace_label=trace_label)

    logger.info("Running non_mandatory_tour_frequency with %d persons", len(choosers))

    constants = config.get_model_constants(model_settings)
    model_spec = simulate.read_model_spec(file_name=model_settings['SPEC'])
    spec_segments = model_settings.get('SPEC_SEGMENTS', {})

    # segment by person type and pick the right spec for each person type
    choices_list = []
    for segment_settings in spec_segments:

        segment_name = segment_settings['NAME']
        ptype = segment_settings['PTYPE']

        # pick the spec column for the segment
        segment_spec = model_spec[[segment_name]]

        chooser_segment = choosers[choosers.ptype == ptype]

        logger.info("Running segment '%s' of size %d", segment_name, len(chooser_segment))

        if len(chooser_segment) == 0:
            # skip empty segments
            continue

        estimator = estimation.manager.begin_estimation(
            model_name=segment_name, bundle_name='non_mandatory_tour_frequency')

        coefficients_df = simulate.read_model_coefficients(segment_settings)
        segment_spec = simulate.eval_coefficients(segment_spec, coefficients_df, estimator)

        if estimator:
            # dump the estimation bundle inputs for this segment
            estimator.write_spec(model_settings, bundle_directory=True)
            estimator.write_model_settings(model_settings, model_settings_file_name,
                                           bundle_directory=True)
            # preserving coefficients file name makes bringing back updated
            # coefficients more straightforward
            estimator.write_coefficients(coefficients_df, segment_settings)
            estimator.write_choosers(chooser_segment)
            estimator.write_alternatives(alternatives, bundle_directory=True)

            # FIXME #interaction_simulate_estimation_requires_chooser_id_in_df_column
            # should we do it here or have interaction_simulate do it?
            # chooser index must be duplicated in column or it will be omitted from interaction_dataset
            # estimation requires that chooser_id is either in index or a column of interaction_dataset
            # so it can be reformatted (melted) and indexed by chooser_id and alt_id
            assert chooser_segment.index.name == 'person_id'
            assert 'person_id' not in chooser_segment.columns
            chooser_segment['person_id'] = chooser_segment.index

            # FIXME set_alt_id - do we need this for interaction_simulate estimation bundle tables?
            estimator.set_alt_id('alt_id')
            estimator.set_chooser_id(chooser_segment.index.name)

        choices = interaction_simulate(
            chooser_segment,
            alternatives,
            spec=segment_spec,
            locals_d=constants,
            chunk_size=chunk_size,
            trace_label='non_mandatory_tour_frequency.%s' % segment_name,
            trace_choice_name='non_mandatory_tour_frequency',
            estimator=estimator)

        if estimator:
            # record modeled choices, then override them with survey values
            estimator.write_choices(choices)
            choices = estimator.get_survey_values(
                choices, 'persons', 'non_mandatory_tour_frequency')
            estimator.write_override_choices(choices)
            estimator.end_estimation()

        choices_list.append(choices)

        # FIXME - force garbage collection?
        force_garbage_collect()

    del alternatives['tot_tours']  # del tot_tours column we added above

    # The choice value 'non_mandatory_tour_frequency' assigned by interaction_simulate
    # is the index value of the chosen alternative in the alternatives table.
    choices = pd.concat(choices_list).sort_index()

    # add non_mandatory_tour_frequency column to persons
    persons = persons.to_frame()

    # we expect there to be an alt with no tours - which we can use to backfill non-travelers
    # NOTE(review): .index[0] is the label of the FIRST alternative regardless of
    # the == 0 test - this only backfills correctly if the zero-tour alt happens
    # to be the first row of the alternatives table; verify
    no_tours_alt = (alternatives.sum(axis=1) == 0).index[0]

    # need to reindex as we only handled persons with cdap_activity in ['M', 'N']
    persons['non_mandatory_tour_frequency'] = \
        choices.reindex(persons.index).fillna(no_tours_alt).astype(np.int8)

    """
    We have now generated non-mandatory tour frequencies, but they are attributes of the person table
    Now we create a "tours" table which has one row per tour that has been generated
    (and the person id it is associated with)

    But before we do that, we run an additional probablilistic step to extend/increase tour counts
    beyond the strict limits of the tour_frequency alternatives chosen above (which are currently limited
    to at most 2 escort tours and 1 each of shopping, othmaint, othdiscr, eatout, and social tours)

    The choice value 'non_mandatory_tour_frequency' assigned by interaction_simulate
    is simply the index value of the chosen alternative in the alternatives table.

    get counts of each of the tour type alternatives (so we can extend)
               escort  shopping  othmaint  othdiscr    eatout    social
    parent_id
    2588676         2         0         0         1         1         0
    2588677         0         1         0         1         0         0
    """

    # counts of each of the tour type alternatives (so we can extend)
    modeled_tour_counts = alternatives.loc[choices]
    modeled_tour_counts.index = choices.index  # assign person ids to the index

    # - extend_tour_counts - probabalistic
    extended_tour_counts = extend_tour_counts(
        choosers, modeled_tour_counts.copy(), alternatives, trace_hh_id,
        tracing.extend_trace_label(trace_label, 'extend_tour_counts'))

    num_modeled_tours = modeled_tour_counts.sum().sum()
    num_extended_tours = extended_tour_counts.sum().sum()
    logger.info("extend_tour_counts increased tour count by %s from %s to %s" %
                (num_extended_tours - num_modeled_tours,
                 num_modeled_tours, num_extended_tours))

    """
    create the non_mandatory tours based on extended_tour_counts
    """
    # NOTE(review): 'estimator' here is the loop variable left over from the
    # last spec segment - it is undefined if spec_segments was empty or every
    # segment was skipped; confirm that cannot happen
    if estimator:
        override_tour_counts = estimation.manager.get_survey_values(
            extended_tour_counts,
            table_name='persons',
            column_names=['_%s' % c for c in extended_tour_counts.columns])
        override_tour_counts = override_tour_counts.rename(
            columns={('_%s' % c): c for c in extended_tour_counts.columns})
        logger.info(
            "estimation get_survey_values override_tour_counts %s changed cells" %
            (override_tour_counts != extended_tour_counts).sum().sum())
        extended_tour_counts = override_tour_counts

    """
    create the non_mandatory tours based on extended_tour_counts
    """
    non_mandatory_tours = process_non_mandatory_tours(persons, extended_tour_counts)
    assert len(non_mandatory_tours) == extended_tour_counts.sum().sum()

    if estimator:
        # make sure they created the right tours
        survey_tours = estimation.manager.get_survey_table('tours').sort_index()
        non_mandatory_survey_tours = survey_tours[survey_tours.tour_category == 'non_mandatory']
        assert len(non_mandatory_survey_tours) == len(non_mandatory_tours)
        assert non_mandatory_survey_tours.index.equals(non_mandatory_tours.sort_index().index)

        # make sure they created tours with the expected tour_ids
        columns = ['person_id', 'household_id', 'tour_type', 'tour_category']
        survey_tours = estimation.manager.get_survey_values(
            non_mandatory_tours, table_name='tours', column_names=columns)

        tours_differ = (non_mandatory_tours[columns] != survey_tours[columns]).any(axis=1)

        if tours_differ.any():
            print("tours_differ\n%s" % tours_differ)
            print("%s of %s tours differ" % (tours_differ.sum(), len(tours_differ)))
            print("differing survey_tours\n%s" % survey_tours[tours_differ])
            print("differing modeled_tours\n%s" % non_mandatory_tours[columns][tours_differ])

        assert (not tours_differ.any())

    # persist the new tours and register them for tracing / random-number channels
    pipeline.extend_table("tours", non_mandatory_tours)
    tracing.register_traceable_table('tours', non_mandatory_tours)
    pipeline.get_rn_generator().add_channel('tours', non_mandatory_tours)

    # annotate persons per the model settings expressions
    expressions.assign_columns(
        df=persons,
        model_settings=model_settings.get('annotate_persons'),
        trace_label=trace_label)

    pipeline.replace_table("persons", persons)

    tracing.print_summary('non_mandatory_tour_frequency',
                          persons.non_mandatory_tour_frequency, value_counts=True)

    if trace_hh_id:
        tracing.trace_df(
            non_mandatory_tours,
            label="non_mandatory_tour_frequency.non_mandatory_tours",
            warn_if_empty=True)

        tracing.trace_df(
            choosers,
            label="non_mandatory_tour_frequency.choosers",
            warn_if_empty=True)

        tracing.trace_df(
            persons,
            label="non_mandatory_tour_frequency.annotated_persons",
            warn_if_empty=True)
def non_mandatory_tour_frequency(persons_merged,
                                 non_mandatory_tour_frequency_alts,
                                 non_mandatory_tour_frequency_spec,
                                 non_mandatory_tour_frequency_settings,
                                 chunk_size,
                                 trace_hh_id):
    """
    Predict how often each active person makes non-mandatory trips.

    The alternatives come from a separate user-configured csv file and cover
    escort, shopping, othmaint, othdiscr, eatout, and social trips in
    various combination.
    """

    timer = print_elapsed_time()

    persons_df = persons_merged.to_frame()

    # spec expressions reference a 'tot_tours' column, so derive it up front
    non_mandatory_tour_frequency_alts['tot_tours'] = \
        non_mandatory_tour_frequency_alts.sum(axis=1)

    # filter based on results of CDAP
    choosers = persons_df[persons_df.cdap_activity.isin(['M', 'N'])]

    logger.info("Running non_mandatory_tour_frequency with %d persons" % len(choosers))

    constants = config.get_model_constants(non_mandatory_tour_frequency_settings)

    # segment by person type and pick the right spec for each person type
    segment_choices = []
    for segment_name, segment_choosers in choosers.groupby('ptype_cat'):

        logger.info("Running segment '%s' of size %d" % (segment_name, len(segment_choosers)))

        segment_choices.append(interaction_simulate(
            segment_choosers,
            non_mandatory_tour_frequency_alts,
            # each segment runs against its own spec column
            spec=non_mandatory_tour_frequency_spec[[segment_name]],
            locals_d=constants,
            chunk_size=chunk_size,
            trace_label='non_mandatory_tour_frequency.%s' % segment_name,
            trace_choice_name='non_mandatory_tour_frequency'))

        timer = print_elapsed_time("non_mandatory_tour_frequency.%s" % segment_name,
                                   timer, debug=True)

        # FIXME - force garbage collection
        # force_garbage_collect()

    choices = pd.concat(segment_choices)

    tracing.print_summary('non_mandatory_tour_frequency', choices, value_counts=True)

    # FIXME - no need to reindex?
    # FIXME - how about the persons not processed
    inject.add_column("persons", "non_mandatory_tour_frequency", choices)

    create_non_mandatory_tours(trace_hh_id)

    # add non_mandatory_tour-dependent columns (e.g. tour counts) to persons
    pipeline.add_dependent_columns("persons", "persons_nmtf")

    if trace_hh_id:
        trace_columns = ['non_mandatory_tour_frequency']
        tracing.trace_df(inject.get_table('persons').to_frame(),
                         label="non_mandatory_tour_frequency.persons",
                         # columns=trace_columns,
                         warn_if_empty=True)
def non_mandatory_tour_destination_choice(
        tours, persons_merged, skim_dict,
        non_mandatory_tour_destination_choice_spec,
        non_mandatory_tour_destination_choice_settings,
        destination_size_terms,
        configs_dir, chunk_size, trace_hh_id):
    """
    Given the tour generation from the above, each tour needs to have a
    destination, so in this case tours are the choosers (with the associated
    person that's making the tour).

    Parameters are injected pipeline tables/objects; `tours` and
    `persons_merged` expose .to_frame(). Side effects: replaces the "tours"
    pipeline table with destination (and annotation) columns filled in for
    non-mandatory tours.
    """

    trace_label = 'non_mandatory_tour_destination'

    tours = tours.to_frame()
    persons_merged = persons_merged.to_frame()

    alternatives = destination_size_terms.to_frame()

    spec = non_mandatory_tour_destination_choice_spec

    # choosers are tours - in a sense tours are choosing their destination
    # .copy() so that assigning the 'destination' column below does not
    # mutate a view of `tours` (SettingWithCopy hazard); `tours` itself is
    # updated explicitly via assign_in_place afterwards
    non_mandatory_tours = tours[tours.non_mandatory].copy()

    choosers = pd.merge(non_mandatory_tours, persons_merged,
                        left_on='person_id', right_index=True)

    constants = config.get_model_constants(
        non_mandatory_tour_destination_choice_settings)

    sample_size = non_mandatory_tour_destination_choice_settings["SAMPLE_SIZE"]

    # create wrapper with keys for this lookup - in this case there is a TAZ in the choosers
    # and a TAZ in the alternatives which get merged during interaction
    # the skims will be available under the name "skims" for any @ expressions
    skims = skim_dict.wrap("TAZ", "TAZ_r")

    locals_d = {'skims': skims}
    if constants is not None:
        locals_d.update(constants)

    logger.info(
        "Running non_mandatory_tour_destination_choice with %d non_mandatory_tours" %
        len(choosers.index))

    choices_list = []
    # segment by trip type and pick the right spec for each person type
    for name, segment in choosers.groupby('tour_type'):

        # FIXME - there are two options here escort with kids and without
        kludge_name = name
        if name == "escort":
            # use the module logger (was logging.error, which bypasses this
            # module's logger configuration and logs to the root logger)
            logger.error(
                "destination_choice escort not implemented - running shopping instead")
            kludge_name = "shopping"

        # the segment is now available to switch between size terms
        locals_d['segment'] = kludge_name

        # FIXME - no point in considering impossible alternatives
        alternatives_segment = alternatives[alternatives[kludge_name] > 0]

        logger.info("Running segment '%s' of %d tours %d alternatives" %
                    (name, len(segment), len(alternatives_segment)))

        # name index so tracing knows how to slice
        segment.index.name = 'tour_id'

        choices = interaction_simulate(
            segment,
            alternatives_segment,
            spec[[kludge_name]],
            skims=skims,
            locals_d=locals_d,
            sample_size=sample_size,
            chunk_size=chunk_size,
            trace_label=tracing.extend_trace_label(trace_label, name))

        choices_list.append(choices)

    choices = pd.concat(choices_list)

    non_mandatory_tours['destination'] = choices

    results = expressions.compute_columns(
        df=non_mandatory_tours,
        model_settings='annotate_tours_with_dest',
        configs_dir=configs_dir,
        trace_label=trace_label)

    assign_in_place(tours, non_mandatory_tours[['destination']])
    assign_in_place(tours, results)

    pipeline.replace_table("tours", tours)

    if trace_hh_id:
        tracing.trace_df(tours[tours.non_mandatory],
                         label="non_mandatory_tour_destination",
                         slicer='person_id',
                         index_label='tour',
                         columns=None,
                         warn_if_empty=True)
def destination_choice(non_mandatory_tours_merged, skim_dict,
                       destination_choice_spec,
                       destination_choice_settings,
                       destination_size_terms,
                       chunk_size, trace_hh_id):
    """
    Given the tour generation from the above, each tour needs to have a
    destination, so in this case tours are the choosers (with the associated
    person that's making the tour).

    Parameters are injected pipeline tables/objects exposing .to_frame().
    Side effect: adds a "destination" column (chosen TAZ) to the
    non_mandatory_tours table via orca.
    """

    # choosers are tours - in a sense tours are choosing their destination
    choosers = non_mandatory_tours_merged.to_frame()
    alternatives = destination_size_terms.to_frame()
    spec = destination_choice_spec.to_frame()

    constants = config.get_model_constants(destination_choice_settings)

    sample_size = destination_choice_settings["SAMPLE_SIZE"]

    # create wrapper with keys for this lookup - in this case there is a TAZ in the choosers
    # and a TAZ in the alternatives which get merged during interaction
    # the skims will be available under the name "skims" for any @ expressions
    skims = skim_dict.wrap("TAZ", "TAZ_r")

    locals_d = {'skims': skims}
    if constants is not None:
        locals_d.update(constants)

    logger.info("Running destination_choice with %d non_mandatory_tours" %
                len(choosers.index))

    choices_list = []
    # segment by trip type and pick the right spec for each person type
    for name, segment in choosers.groupby('tour_type'):

        # FIXME - there are two options here escort with kids and without
        kludge_name = name
        if name == "escort":
            # use the module logger (was logging.error, which bypasses this
            # module's logger configuration and logs to the root logger)
            logger.error(
                "destination_choice escort not implemented - running shopping instead")
            kludge_name = "shopping"

        # the segment is now available to switch between size terms
        locals_d['segment'] = kludge_name

        # FIXME - no point in considering impossible alternatives
        alternatives_segment = alternatives[alternatives[kludge_name] > 0]

        logger.info("Running segment '%s' of %d tours %d alternatives" %
                    (name, len(segment), len(alternatives_segment)))

        # name index so tracing knows how to slice
        segment.index.name = 'tour_id'

        choices = interaction_simulate(
            segment,
            alternatives_segment,
            spec[[kludge_name]],
            skims=skims,
            locals_d=locals_d,
            sample_size=sample_size,
            chunk_size=chunk_size,
            trace_label='destination.%s' % name)

        choices_list.append(choices)

    choices = pd.concat(choices_list)

    # FIXME - can there be null destinations?
    if choices.isnull().any():
        logger.error("destination_choice had %s null destinations" %
                     choices.isnull().sum())
        assert choices.isnull().sum() == 0

    tracing.print_summary('destination', choices, describe=True)

    # every trip now has a destination which is the index from the
    # alternatives table - in this case it's the destination taz
    orca.add_column("non_mandatory_tours", "destination", choices)

    if trace_hh_id:
        tracing.trace_df(orca.get_table('non_mandatory_tours').to_frame(),
                         label="destination",
                         slicer='person_id',
                         index_label='tour',
                         columns=None,
                         warn_if_empty=True)
def non_mandatory_tour_frequency(persons, persons_merged, chunk_size, trace_hh_id):
    """
    This model predicts the frequency of making non-mandatory trips
    (alternatives for this model come from a separate csv file which is
    configured by the user) - these trips include escort, shopping, othmaint,
    othdiscr, eatout, and social trips in various combination.

    Parameters
    ----------
    persons, persons_merged :
        pipeline tables exposing .to_frame() (presumably injected table
        wrappers - TODO confirm against the pipeline decorator)
    chunk_size : int
        passed through to interaction_simulate
    trace_hh_id :
        household id to trace (falsy disables tracing)

    Side effects: extends the "tours" pipeline table and replaces "persons".
    """

    trace_label = 'non_mandatory_tour_frequency'

    model_settings = config.read_model_settings('non_mandatory_tour_frequency.yaml')
    model_spec = simulate.read_model_spec(file_name='non_mandatory_tour_frequency.csv')

    alternatives = simulate.read_model_alts(
        config.config_file_path('non_mandatory_tour_frequency_alternatives.csv'),
        set_index=None)

    choosers = persons_merged.to_frame()

    # FIXME kind of tacky both that we know to add this here and del it below
    # 'tot_tours' is used in model_spec expressions
    alternatives['tot_tours'] = alternatives.sum(axis=1)

    # - preprocessor: annotate choosers with expression-file columns, if configured
    preprocessor_settings = model_settings.get('preprocessor', None)
    if preprocessor_settings:

        locals_dict = {
            'person_max_window': person_max_window
        }

        expressions.assign_columns(
            df=choosers,
            model_settings=preprocessor_settings,
            locals_dict=locals_dict,
            trace_label=trace_label)

    # filter based on results of CDAP
    choosers = choosers[choosers.cdap_activity.isin(['M', 'N'])]

    logger.info("Running non_mandatory_tour_frequency with %d persons", len(choosers))

    constants = config.get_model_constants(model_settings)

    choices_list = []
    # segment by person type and pick the right spec for each person type
    for ptype, segment in choosers.groupby('ptype'):

        name = PTYPE_NAME[ptype]

        # pick the spec column for the segment
        spec = model_spec[[name]]

        # drop any zero-valued rows
        spec = spec[spec[name] != 0]

        logger.info("Running segment '%s' of size %d", name, len(segment))

        choices = interaction_simulate(
            segment,
            alternatives,
            spec=spec,
            locals_d=constants,
            chunk_size=chunk_size,
            trace_label='non_mandatory_tour_frequency.%s' % name,
            trace_choice_name='non_mandatory_tour_frequency')

        choices_list.append(choices)

        # FIXME - force garbage collection?
        # force_garbage_collect()

    choices = pd.concat(choices_list)

    del alternatives['tot_tours']  # del tot_tours column we added above

    # - add non_mandatory_tour_frequency column to persons
    persons = persons.to_frame()

    # need to reindex as we only handled persons with cdap_activity in ['M', 'N']
    # (we expect there to be an alt with no tours - which we can use to backfill non-travelers)
    # NOTE(review): .index[0] is the label of the FIRST alternative regardless of
    # the == 0 test - this only works if the zero-tour alt is row 0; verify
    no_tours_alt = (alternatives.sum(axis=1) == 0).index[0]
    persons['non_mandatory_tour_frequency'] = \
        choices.reindex(persons.index).fillna(no_tours_alt).astype(np.int8)

    """
    We have now generated non-mandatory tours, but they are attributes of the person table
    Now we create a "tours" table which has one row per tour that has been generated
    (and the person id it is associated with)
    """

    # - get counts of each of the alternatives (so we can extend)
    # (choices is just the index values for the chosen alts)
    """
               escort  shopping  othmaint  othdiscr    eatout    social
    parent_id
    2588676         2         0         0         1         1         0
    2588677         0         1         0         1         0         0
    """
    tour_counts = alternatives.loc[choices]
    tour_counts.index = choices.index  # assign person ids to the index

    prev_tour_count = tour_counts.sum().sum()

    # - extend_tour_counts
    tour_counts = extend_tour_counts(choosers, tour_counts, alternatives,
                                     trace_hh_id,
                                     tracing.extend_trace_label(trace_label,
                                                                'extend_tour_counts'))
    extended_tour_count = tour_counts.sum().sum()

    # use the module logger (was logging.info, which bypasses this module's
    # logger configuration and logs to the root logger)
    logger.info("extend_tour_counts increased nmtf tour count by %s from %s to %s" %
                (extended_tour_count - prev_tour_count,
                 prev_tour_count, extended_tour_count))

    # - create the non_mandatory tours
    non_mandatory_tours = process_non_mandatory_tours(persons, tour_counts)
    assert len(non_mandatory_tours) == extended_tour_count

    # persist the new tours and register them for tracing / random-number channels
    pipeline.extend_table("tours", non_mandatory_tours)
    tracing.register_traceable_table('tours', non_mandatory_tours)
    pipeline.get_rn_generator().add_channel('tours', non_mandatory_tours)

    # annotate persons per the model settings expressions
    expressions.assign_columns(
        df=persons,
        model_settings=model_settings.get('annotate_persons'),
        trace_label=trace_label)

    pipeline.replace_table("persons", persons)

    tracing.print_summary('non_mandatory_tour_frequency',
                          persons.non_mandatory_tour_frequency, value_counts=True)

    if trace_hh_id:
        tracing.trace_df(non_mandatory_tours,
                         label="non_mandatory_tour_frequency.non_mandatory_tours",
                         warn_if_empty=True)

        tracing.trace_df(choosers,
                         label="non_mandatory_tour_frequency.choosers",
                         warn_if_empty=True)

        tracing.trace_df(persons,
                         label="non_mandatory_tour_frequency.annotated_persons",
                         warn_if_empty=True)