def run_trip_purpose_and_destination(
        trips_df,
        tours_merged_df,
        chunk_size,
        trace_hh_id,
        trace_label):
    """
    Assign a purpose to every trip, then choose a destination for each trip.

    Returns the trips dataframe with a 'purpose' column set and destinations chosen.
    """
    assert not trips_df.empty

    purpose_label = tracing.extend_trace_label(trace_label, 'purpose')
    trips_df['purpose'] = run_trip_purpose(
        trips_df,
        chunk_size=chunk_size,
        trace_hh_id=trace_hh_id,
        trace_label=purpose_label)

    destination_label = tracing.extend_trace_label(trace_label, 'destination')
    trips_df = run_trip_destination(
        trips_df,
        tours_merged_df,
        chunk_size,
        trace_hh_id,
        trace_label=destination_label)

    return trips_df
def run_trip_purpose_and_destination(
        trips_df,
        tours_merged_df,
        chunk_size,
        trace_hh_id,
        trace_label):
    """
    Assign a purpose to every trip, then choose a destination for each trip.

    Returns
    -------
    (trips_df, save_sample_df)
        trips with 'purpose' set and destinations chosen, plus the
        destination sample dataframe produced by run_trip_destination.
    """
    assert not trips_df.empty

    purpose_label = tracing.extend_trace_label(trace_label, 'purpose')
    trips_df['purpose'] = run_trip_purpose(
        trips_df,
        chunk_size=chunk_size,
        trace_hh_id=trace_hh_id,
        trace_label=purpose_label)

    destination_label = tracing.extend_trace_label(trace_label, 'destination')
    trips_df, save_sample_df = run_trip_destination(
        trips_df,
        tours_merged_df,
        chunk_size,
        trace_hh_id,
        trace_label=destination_label)

    return trips_df, save_sample_df
def run_trip_scheduling(
        trips,
        tours,
        probs_spec,
        model_settings,
        estimator,
        is_last_iteration,
        chunk_size,
        chunk_tag,
        trace_hh_id,
        trace_label):
    """
    Schedule (assign departure times to) trips, chunked by tour.

    Each chunk is split into outbound and inbound legs, which are scheduled
    independently by schedule_trips_in_leg; per-leg choices are concatenated
    and returned as a single series indexed by trip_id.

    Improvements over the previous version: the duplicated outbound/inbound
    branches are collapsed into one loop, and the placeholder-less f-strings
    passed to chunk.log_df are plain string literals.
    """

    # only non-initial trips require scheduling, segment handing first such trip in tour will use most space
    # is_outbound_chooser = (trips.trip_num > 1) & trips.outbound & (trips.primary_purpose != 'atwork')
    # is_inbound_chooser = (trips.trip_num < trips.trip_count) & ~trips.outbound & (trips.primary_purpose != 'atwork')
    # num_choosers = (is_inbound_chooser | is_outbound_chooser).sum()

    result_list = []
    for i, trips_chunk, chunk_trace_label \
            in chunk.adaptive_chunked_choosers_by_chunk_id(trips, chunk_size, trace_label, chunk_tag):

        # outbound legs first, then inbound - same order as before
        for outbound, leg_name in ((True, 'outbound'), (False, 'inbound')):

            leg_mask = trips_chunk.outbound if outbound else ~trips_chunk.outbound
            if not leg_mask.any():
                continue

            leg_chunk = trips_chunk[leg_mask]
            leg_trace_label = tracing.extend_trace_label(chunk_trace_label, leg_name)
            choices = \
                schedule_trips_in_leg(
                    outbound=outbound,
                    trips=leg_chunk,
                    probs_spec=probs_spec,
                    model_settings=model_settings,
                    is_last_iteration=is_last_iteration,
                    trace_hh_id=trace_hh_id,
                    trace_label=leg_trace_label)
            result_list.append(choices)

            chunk.log_df(trace_label, 'result_list', result_list)

    choices = pd.concat(result_list)

    return choices
def run_trip_scheduling(trips_chunk, tours, probs_spec, model_settings,
                        estimator, is_last_iteration, chunk_size,
                        trace_hh_id, trace_label):
    """
    Schedule (assign departure times to) one chunk of trips.

    Outbound legs are scheduled first; their chosen departure times then
    constrain the earliest departure of the initial inbound trips (via
    update_tour_earliest) before inbound legs are scheduled.

    Returns a concatenated series of departure-time choices indexed by trip_id.
    Note: trips_chunk is mutated in place (tour hour / stop num columns and
    tour-earliest updates).
    """

    # annotate trips with their parent tour's start/end hour and stop sequence number
    set_tour_hour(trips_chunk, tours)
    set_stop_num(trips_chunk)

    # only non-initial trips require scheduling, segment handing first such trip in tour will use most space
    # is_outbound_chooser = (trips.trip_num > 1) & trips.outbound & (trips.primary_purpose != 'atwork')
    # is_inbound_chooser = (trips.trip_num < trips.trip_count) & ~trips.outbound & (trips.primary_purpose != 'atwork')
    # num_choosers = (is_inbound_chooser | is_outbound_chooser).sum()

    result_list = []
    if trips_chunk.outbound.any():
        leg_chunk = trips_chunk[trips_chunk.outbound]
        leg_trace_label = tracing.extend_trace_label(trace_label, 'outbound')
        choices = \
            schedule_trips_in_leg(
                outbound=True,
                trips=leg_chunk,
                probs_spec=probs_spec,
                model_settings=model_settings,
                is_last_iteration=is_last_iteration,
                trace_hh_id=trace_hh_id,
                trace_label=leg_trace_label)
        result_list.append(choices)

        chunk.log_df(trace_label, f'result_list', result_list)

        # departure time of last outbound trips must constrain
        # departure times for initial inbound trips
        update_tour_earliest(trips_chunk, choices)

    if (~trips_chunk.outbound).any():
        leg_chunk = trips_chunk[~trips_chunk.outbound]
        leg_trace_label = tracing.extend_trace_label(trace_label, 'inbound')
        choices = \
            schedule_trips_in_leg(
                outbound=False,
                trips=leg_chunk,
                probs_spec=probs_spec,
                model_settings=model_settings,
                is_last_iteration=is_last_iteration,
                trace_hh_id=trace_hh_id,
                trace_label=leg_trace_label)
        result_list.append(choices)

        chunk.log_df(trace_label, f'result_list', result_list)

    choices = pd.concat(result_list)

    return choices
def run_trip_scheduling(trips, tours, probs_spec, model_settings,
                        last_iteration, chunk_size, trace_hh_id, trace_label):
    """
    Schedule (assign departure times to) trips, using the legacy
    rows-per-chunk chunking API (chunked_choosers_by_chunk_id with explicit
    log_open/log_close bracketing).

    Outbound and inbound legs of each chunk are scheduled independently;
    choices are concatenated and returned as a single series indexed by trip_id.
    """

    # annotate trips with the hour their parent tour starts/ends
    set_tour_hour(trips, tours)

    rows_per_chunk, effective_chunk_size = \
        trip_scheduling_rpc(chunk_size, trips, probs_spec, trace_label)

    result_list = []
    for i, num_chunks, trips_chunk in chunk.chunked_choosers_by_chunk_id(trips, rows_per_chunk):

        # only bother with a per-chunk trace label when there is more than one chunk
        if num_chunks > 1:
            chunk_trace_label = tracing.extend_trace_label(trace_label, 'chunk_%s' % i)
            logger.info("%s of %s size %d" % (chunk_trace_label, num_chunks, len(trips_chunk)))
        else:
            chunk_trace_label = trace_label

        # outbound leg
        leg_trace_label = tracing.extend_trace_label(chunk_trace_label, 'outbound')
        chunk.log_open(leg_trace_label, chunk_size, effective_chunk_size)
        choices = \
            schedule_trips_in_leg(
                outbound=True,
                trips=trips_chunk[trips_chunk.outbound],
                probs_spec=probs_spec,
                model_settings=model_settings,
                last_iteration=last_iteration,
                trace_hh_id=trace_hh_id,
                trace_label=leg_trace_label)
        result_list.append(choices)
        chunk.log_close(leg_trace_label)

        # inbound leg
        leg_trace_label = tracing.extend_trace_label(chunk_trace_label, 'inbound')
        chunk.log_open(leg_trace_label, chunk_size, effective_chunk_size)
        choices = \
            schedule_trips_in_leg(
                outbound=False,
                trips=trips_chunk[~trips_chunk.outbound],
                probs_spec=probs_spec,
                model_settings=model_settings,
                last_iteration=last_iteration,
                trace_hh_id=trace_hh_id,
                trace_label=leg_trace_label)
        result_list.append(choices)
        chunk.log_close(leg_trace_label)

    choices = pd.concat(result_list)

    return choices
def trace_df(self, df, trace_label, extension):
    """Write the whole of *df* (unsliced, untransposed) to the trace output under trace_label.extension."""
    assert len(df) > 0
    label = tracing.extend_trace_label(trace_label, extension)
    tracing.trace_df(df, label=label, slicer='NONE', transpose=False)
def get_tvpb_best_transit_time(self, orig, dest, tod):
    """
    Return the best (walk-to-transit-to-walk) transit time for each orig/dest pair
    at time-of-day *tod*, for use by the accessibility model.

    If a trace_od injectable is set and matches any od pair, the matching paths
    are rebuilt once more with tracing enabled (side effect only; result unchanged).
    """

    # FIXME lots of pathological knowledge here as we are only called by accessibility directly from expressions

    trace_label = tracing.extend_trace_label('accessibility.tvpb_best_time', tod)
    recipe = 'accessibility'
    path_type = 'WTW'  # walk-transit-walk

    with chunk.chunk_log(trace_label):
        result = \
            self.build_virtual_path(recipe, path_type, orig, dest, tod,
                                    demographic_segment=None, want_choices=False,
                                    trace_label=trace_label)

        trace_od = inject.get_injectable("trace_od", None)
        if trace_od:
            # orig/dest are presumably array-like here, so this is an elementwise mask
            filter_targets = (orig == trace_od[0]) & (dest == trace_od[1])
            if filter_targets.any():
                # re-run just the traced targets with trace=True (output goes to trace files)
                self.build_virtual_path(recipe, path_type, orig, dest, tod,
                                        demographic_segment=None, want_choices=False,
                                        trace_label=trace_label,
                                        filter_targets=filter_targets, trace=True)

    return result
def best_paths(self, recipe, path_type, maz_od_df, access_df, egress_df,
               transit_df, trace_label, trace=False):
    """
    Assemble full access+transit+egress paths and keep only the best ones.

    For each chooser (row of maz_od_df, keyed by 'seq'), joins all compatible
    access, egress, and tap-tap legs, totals the utility/time per transit set,
    keeps up to max_paths_per_tap_set per set, then up to
    max_paths_across_tap_sets overall.

    Returns a dataframe of surviving paths with columns 'seq', 'path_set'
    and the recipe's units column ('time' or utility).
    Note: mutates maz_od_df by adding a 'seq' column.
    """

    trace_label = tracing.extend_trace_label(trace_label, 'best_paths')

    path_settings = self.network_los.setting(f'TVPB_SETTINGS.{recipe}.path_types.{path_type}')
    max_paths_per_tap_set = path_settings.get('max_paths_per_tap_set', 1)
    max_paths_across_tap_sets = path_settings.get('max_paths_across_tap_sets', 1)

    units = self.units_for_recipe(recipe)
    # for time-based recipes lower is better; for utilities higher is better
    smaller_is_better = (units in ['time'])

    maz_od_df['seq'] = maz_od_df.index

    # maz_od_df has one row per chooser
    # inner join to add rows for each access, egress, and transit segment combination
    path_df = maz_od_df. \
        merge(access_df, on=['idx', 'omaz'], how='inner'). \
        merge(egress_df, on=['idx', 'dmaz'], how='inner'). \
        merge(transit_df, on=['idx', 'atap', 'btap'], how='inner')

    chunk.log_df(trace_label, "path_df", path_df)

    # transit sets are the transit_df non-join columns
    transit_sets = [c for c in transit_df.columns if c not in ['idx', 'atap', 'btap']]

    if trace:
        # be nice and show both tap_tap set utility and total_set = access + set + egress
        for c in transit_sets:
            path_df[f'total_{c}'] = path_df[c] + path_df['access'] + path_df['egress']
        self.trace_df(path_df, trace_label, 'best_paths.full')
        for c in transit_sets:
            del path_df[f'total_{c}']

    # fold access and egress into each transit set's total, then drop the leg columns
    for c in transit_sets:
        path_df[c] = path_df[c] + path_df['access'] + path_df['egress']
    path_df.drop(columns=['access', 'egress'], inplace=True)

    # choose best paths by tap set
    best_paths_list = []
    for c in transit_sets:
        # keep the top max_paths_per_tap_set rows per chooser, ranked by this set's total
        keep = path_df.index.isin(
            path_df[['seq', c]].sort_values(by=c, ascending=smaller_is_better).
            groupby(['seq']).head(max_paths_per_tap_set).index
        )
        # NOTE(review): best_paths_for_set is a slice of path_df; the assignments
        # below may emit SettingWithCopyWarning - presumably benign here, confirm
        best_paths_for_set = path_df[keep]
        best_paths_for_set['path_set'] = c  # remember the path set
        best_paths_for_set[units] = path_df[keep][c]
        best_paths_for_set.drop(columns=transit_sets, inplace=True)
        best_paths_list.append(best_paths_for_set)

    path_df = pd.concat(best_paths_list).sort_values(by=['seq', units],
                                                     ascending=[True, smaller_is_better])

    # choose best paths overall by seq
    # (the second sort is redundant with the one above, but kept as-is)
    path_df = path_df.sort_values(by=['seq', units], ascending=[True, smaller_is_better])
    path_df = path_df[path_df.index.isin(path_df.groupby(['seq']).head(max_paths_across_tap_sets).index)]

    if trace:
        self.trace_df(path_df, trace_label, 'best_paths')

    return path_df
def __init__(self, pathbuilder, orig_key, dest_key, tod_key, segment_key,
             cache_choices, trace_label, tag):
    """
    Wrapper around a transit-virtual-pathbuilder logsum lookup.

    The *_key arguments name the chooser columns holding origin, destination,
    time-of-day, and demographic segment; cache_choices enables a per-wrapper
    choice cache.
    """
    # validate arguments up front (guard clauses)
    assert hasattr(pathbuilder, 'get_tvpb_logsum')
    assert isinstance(orig_key, str)
    assert isinstance(dest_key, str)
    assert isinstance(tod_key, str)
    assert isinstance(segment_key, str)

    self.tvpb = pathbuilder
    self.orig_key = orig_key
    self.dest_key = dest_key
    self.tod_key = tod_key
    self.segment_key = segment_key
    self.df = None

    self.cache_choices = cache_choices
    self.cache = {} if cache_choices else None

    # fall back to a generic label if extend_trace_label yields nothing
    self.base_trace_label = tracing.extend_trace_label(trace_label, tag) or f'tvpb_logsum.{tag}'
    self.trace_label = self.base_trace_label
    self.tag = tag

    self.chunk_overhead = None
def get_tvpb_logsum(self, path_type, orig, dest, tod, demographic_segment,
                    want_choices, trace_label=None):
    """
    Return a dataframe of tour-mode-choice virtual-path logsums (and, when
    want_choices, the chosen path_num) for each orig/dest pair.

    If a trace_hh_id injectable is set, matching targets are rebuilt once more
    with tracing enabled, re-using the first run's choices so the trace does
    not consume additional random numbers.
    """

    # assume they have given us a more specific name (since there may be more than one active wrapper)
    trace_label = trace_label or 'get_tvpb_logsum'
    trace_label = tracing.extend_trace_label(trace_label, path_type)

    recipe = 'tour_mode_choice'

    with chunk.chunk_log(trace_label):
        logsum_df = \
            self.build_virtual_path(recipe, path_type, orig, dest, tod, demographic_segment,
                                    want_choices=want_choices, trace_label=trace_label)

        trace_hh_id = inject.get_injectable("trace_hh_id", None)
        if trace_hh_id:
            filter_targets = tracing.trace_targets(orig)
            # choices from preceding run (because random numbers)
            override_choices = logsum_df['path_num'] if want_choices else None
            if filter_targets.any():
                # re-run only traced targets; side effect is trace output, result discarded
                self.build_virtual_path(recipe, path_type, orig, dest, tod, demographic_segment,
                                        want_choices=want_choices, override_choices=override_choices,
                                        trace_label=trace_label,
                                        filter_targets=filter_targets, trace=True)

    return logsum_df
def __init__(self, model_settings, network_los, trace_label):
    """Capture model settings and network level-of-service context for skim lookups."""
    self.model_settings = model_settings
    self.network_los = network_los
    self.zone_system = network_los.zone_system
    self.trace_label = tracing.extend_trace_label(trace_label, 'skim_hotel')
def annotate_tables(model_settings, trace_label):
    """
    Apply optional column renames and annotation expressions to pipeline tables.

    Driven by the 'annotate_tables' list in model_settings; each entry names a
    pipeline table, an optional 'column_map'/'rename_columns' mapping, and an
    optional 'annotate' expression spec. Each table is rewritten in place via
    pipeline.replace_table.
    """

    trace_label = tracing.extend_trace_label(trace_label, 'annotate_tables')

    chunk.log_rss(trace_label)

    # renamed local (was 'annotate_tables') so it no longer shadows this function
    annotate_table_list = model_settings.get('annotate_tables', [])

    if not annotate_table_list:
        logger.warning(
            f"{trace_label} - annotate_tables setting is empty - nothing to do!"
        )

    assert isinstance(annotate_table_list, list), \
        f"annotate_tables settings should be a list but is {type(annotate_table_list)}"

    # NOTE(review): t0 is never used - print_elapsed_time() presumably just returns a timestamp
    t0 = tracing.print_elapsed_time()

    for table_info in annotate_table_list:

        tablename = table_info['tablename']

        chunk.log_rss(f"{trace_label}.pre-get_table.{tablename}")

        df = inject.get_table(tablename).to_frame()
        chunk.log_df(trace_label, tablename, df)

        # - rename columns (legacy 'column_map' spelling)
        column_map = table_info.get('column_map', None)
        if column_map:

            warnings.warn(
                f"Setting 'column_map' has been changed to 'rename_columns'. "
                f"Support for 'column_map' in annotate_tables will be removed in future versions.",
                FutureWarning)

            logger.info(
                f"{trace_label} - renaming {tablename} columns {column_map}")
            df.rename(columns=column_map, inplace=True)

        # - annotate
        annotate = table_info.get('annotate', None)
        if annotate:
            logger.info(
                f"{trace_label} - annotating {tablename} SPEC {annotate['SPEC']}"
            )
            expressions.assign_columns(df=df, model_settings=annotate,
                                       trace_label=trace_label)

        chunk.log_df(trace_label, tablename, df)

        # - write table to pipeline
        pipeline.replace_table(tablename, df)

        del df
        chunk.log_df(trace_label, tablename, None)
def annotate_jtp(model_settings, trace_label):
    """Annotate the persons table per the model's 'annotate_persons' spec and write it back."""
    persons = inject.get_table('persons').to_frame()
    annotate_label = tracing.extend_trace_label(trace_label, 'annotate_persons')
    expressions.assign_columns(
        df=persons,
        model_settings=model_settings.get('annotate_persons'),
        trace_label=annotate_label)
    pipeline.replace_table("persons", persons)
def trip_destination_simulate(
        primary_purpose,
        trips,
        destination_sample,
        model_settings,
        want_logsums,
        size_term_matrix, skims,
        chunk_size, trace_hh_id,
        trace_label):
    """
    Chose destination from destination_sample (with od_logsum and dp_logsum columns added)

    Returns
    -------
    destinations - pandas.DataFrame
        'choice' column holds the chosen destination alt; when want_logsums,
        a logsum column is also present. Rows whose only feasible choice was
        the NO_DESTINATION sentinel are dropped.
    """
    trace_label = tracing.extend_trace_label(trace_label, 'trip_dest_simulate')

    spec = get_spec_for_purpose(model_settings, 'DESTINATION_SPEC', primary_purpose)

    alt_dest_col_name = model_settings['ALT_DEST_COL_NAME']

    logger.info("Running trip_destination_simulate with %d trips", len(trips))

    # expression evaluation context: model constants, size terms, and skims
    locals_dict = config.get_model_constants(model_settings).copy()
    locals_dict.update({
        'size_terms': size_term_matrix
    })
    locals_dict.update(skims)

    destinations = interaction_sample_simulate(
        choosers=trips,
        alternatives=destination_sample,
        spec=spec,
        choice_column=alt_dest_col_name,
        want_logsums=want_logsums,
        allow_zero_probs=True,
        zero_prob_choice_val=NO_DESTINATION,
        skims=skims,
        locals_d=locals_dict,
        chunk_size=chunk_size,
        trace_label=trace_label,
        trace_choice_name='trip_dest')

    if not want_logsums:
        # for consistency, always return a dataframe with canonical column name
        assert isinstance(destinations, pd.Series)
        destinations = destinations.to_frame('choice')

    # drop any failed zero_prob destinations
    if (destinations.choice == NO_DESTINATION).any():
        # logger.debug("dropping %s failed destinations", (destinations == NO_DESTINATION).sum())
        destinations = destinations[destinations.choice != NO_DESTINATION]

    return destinations
def location_presample(segment_name, persons_merged, network_los,
                       dest_size_terms, estimator, model_settings,
                       chunk_size, chunk_tag, trace_label):
    """
    Two-stage destination sampling for multi-zone systems: sample destination
    TAZs using TAZ-aggregated size terms, then pick a MAZ within each sampled
    TAZ in proportion to MAZ size-term shares.

    Returns a dataframe of sampled MAZ alternatives with the model's
    ALT_DEST_COL_NAME column plus prob and pick_count.
    """

    trace_label = tracing.extend_trace_label(trace_label, 'presample')

    logger.info(f"{trace_label} location_presample")

    alt_dest_col_name = model_settings['ALT_DEST_COL_NAME']
    assert DEST_TAZ != alt_dest_col_name

    MAZ_size_terms, TAZ_size_terms = aggregate_size_terms(dest_size_terms, network_los)

    # convert MAZ zone_id to 'TAZ' in choosers (persons_merged)
    # persons_merged[HOME_TAZ] = persons_merged[HOME_MAZ].map(maz_to_taz)
    assert HOME_MAZ in persons_merged
    assert HOME_TAZ in persons_merged  # 'TAZ' should already be in persons_merged from land_use

    # FIXME - MEMORY HACK - only include columns actually used in spec
    # FIXME we don't actually require that land_use provide a TAZ crosswalk
    # FIXME maybe we should add it for multi-zone (from maz_taz) if missing?
    chooser_columns = model_settings['SIMULATE_CHOOSER_COLUMNS']
    # substitute the TAZ column for the MAZ column since we sample at TAZ level
    chooser_columns = [HOME_TAZ if c == HOME_MAZ else c for c in chooser_columns]
    choosers = persons_merged[chooser_columns]

    # create wrapper with keys for this lookup - in this case there is a HOME_TAZ in the choosers
    # and a DEST_TAZ in the alternatives which get merged during interaction
    # the skims will be available under the name "skims" for any @ expressions
    skim_dict = network_los.get_skim_dict('taz')
    skims = skim_dict.wrap(HOME_TAZ, DEST_TAZ)

    taz_sample = _location_sample(segment_name, choosers, TAZ_size_terms, skims,
                                  estimator, model_settings, DEST_TAZ,
                                  chunk_size, chunk_tag, trace_label)

    # print(f"taz_sample\n{taz_sample}")
    #            dest_TAZ      prob  pick_count
    # person_id
    # 55227             7  0.009827           1
    # 55227            10  0.000656           1
    # 55227            18  0.014871           1
    # 55227            20  0.035548           3

    # choose a MAZ for each DEST_TAZ choice, choice probability based on MAZ size_term fraction of TAZ total
    maz_choices = tour_destination.choose_MAZ_for_TAZ(taz_sample, MAZ_size_terms, trace_label)

    assert DEST_MAZ in maz_choices
    maz_choices = maz_choices.rename(columns={DEST_MAZ: alt_dest_col_name})

    return maz_choices
def schedule_tours(
        tours, persons_merged, alts, spec, constants, timetable,
        previous_tour, window_id_col, chunk_size, tour_trace_label):
    """
    chunking wrapper for _schedule_tours

    While interaction_sample_simulate provides chunking support, the merged tours,
    persons dataframe and the tdd_interaction_dataset are very big, so we want to create
    them inside the chunking loop to minimize memory footprint. So we implement the
    chunking loop here, and pass a chunk_size of 0 to interaction_sample_simulate
    to disable its chunking support.

    Returns a series of tour-departure-duration choices, one per input tour.
    """

    # return _schedule_tours(tours, persons_merged, alts, spec, constants, timetable,
    #                        previous_tour, window_id_col, chunk_size, tour_trace_label)

    logger.info("%s schedule_tours running %d tour choices" % (tour_trace_label, len(tours)))

    # persons_merged columns plus 2 previous tour columns
    # NOTE(review): extra_chooser_columns is not used below - presumably
    # calc_rows_per_chunk accounts for these itself; confirm before removing
    extra_chooser_columns = persons_merged.shape[1] + 2

    rows_per_chunk = \
        calc_rows_per_chunk(chunk_size, tours, persons_merged, alts,
                            trace_label=tour_trace_label)

    logger.info("chunk_size %s rows_per_chunk %s" % (chunk_size, rows_per_chunk))

    result_list = []
    for i, num_chunks, chooser_chunk \
            in chunk.chunked_choosers(tours, rows_per_chunk):

        logger.info("Running chunk %s of %s size %d" % (i, num_chunks, len(chooser_chunk)))

        chunk_trace_label = tracing.extend_trace_label(tour_trace_label, 'chunk_%s' % i) \
            if num_chunks > 1 else tour_trace_label

        choices = _schedule_tours(chooser_chunk, persons_merged,
                                  alts, spec, constants, timetable,
                                  previous_tour, window_id_col,
                                  tour_trace_label=chunk_trace_label)

        result_list.append(choices)

        force_garbage_collect()

    # FIXME: this will require 2X RAM
    # if necessary, could append to hdf5 store on disk:
    # http://pandas.pydata.org/pandas-docs/stable/io.html#id2
    if len(result_list) > 1:
        choices = pd.concat(result_list)

    # BUG FIX: was `assert len(choices.index == len(tours.index))`, which asserts the
    # length of a boolean array (truthy whenever tours is non-empty) rather than
    # comparing the two lengths
    assert len(choices.index) == len(tours.index)

    return choices
def trip_destination_sample(
        primary_purpose,
        trips,
        alternatives,
        model_settings,
        size_term_matrix, skims,
        chunk_size, trace_hh_id,
        trace_label):
    """
    Sample candidate destinations for trips of the given primary_purpose.

    Returns
    -------
    destination_sample: pandas.dataframe
        choices_df from interaction_sample with (up to) sample_size alts for each chooser row
        index (non unique) is trip_id from trips (duplicated for each alt)
        and columns dest_taz, prob, and pick_count

        dest_taz: int
            alt identifier (dest_taz) from alternatives[<alt_col_name>]
        prob: float
            the probability of the chosen alternative
        pick_count : int
            number of duplicate picks for chooser, alt
    """
    trace_label = tracing.extend_trace_label(trace_label, 'trip_destination_sample')

    spec = get_spec_for_purpose(model_settings, 'DESTINATION_SAMPLE_SPEC', primary_purpose)

    sample_size = model_settings["SAMPLE_SIZE"]
    alt_dest_col_name = model_settings["ALT_DEST"]

    logger.info("Running %s with %d trips", trace_label, trips.shape[0])

    # expression evaluation context: model constants, size terms, and skims
    locals_dict = config.get_model_constants(model_settings).copy()
    locals_dict.update({
        'size_terms': size_term_matrix
    })
    locals_dict.update(skims)

    destination_sample = interaction_sample(
        choosers=trips,
        alternatives=alternatives,
        sample_size=sample_size,
        alt_col_name=alt_dest_col_name,
        allow_zero_probs=True,
        spec=spec,
        skims=skims,
        locals_d=locals_dict,
        chunk_size=chunk_size,
        trace_label=trace_label)

    return destination_sample
def compute_utilities(network_los, model_settings, choosers, model_constants,
                      trace_label, trace=False, trace_column_names=None):
    """
    Compute utilities for *choosers* against the model spec named in
    model_settings['SPEC'].

    Spec alt columns may reference names in model_constants (mapped in, since
    this path does not support coefficient files); an optional PREPROCESSOR
    annotates a copy of choosers before evaluation.

    Returns the utilities dataframe from simulate.eval_utilities.
    """

    # note: the two literals below were pointless f-strings (no placeholders)
    with chunk.chunk_log('tvpb compute_utilities'):
        trace_label = tracing.extend_trace_label(trace_label, 'compute_utils')

        logger.debug(
            f"{trace_label} Running compute_utilities with {choosers.shape[0]} choosers"
        )

        locals_dict = {'np': np, 'los': network_los}
        locals_dict.update(model_constants)

        # we don't grok coefficients, but allow them to use constants in spec alt columns
        spec = simulate.read_model_spec(file_name=model_settings['SPEC'])
        for c in spec.columns:
            if c != simulate.SPEC_LABEL_NAME:
                spec[c] = spec[c].map(lambda s: model_constants.get(s, s)).astype(float)

        with chunk.chunk_log('compute_utilities'):

            # - run preprocessor to annotate choosers
            preprocessor_settings = model_settings.get('PREPROCESSOR')
            if preprocessor_settings:

                # don't want to alter caller's dataframe
                choosers = choosers.copy()

                expressions.assign_columns(
                    df=choosers,
                    model_settings=preprocessor_settings,
                    locals_dict=locals_dict,
                    trace_label=trace_label)

            utilities = simulate.eval_utilities(
                spec,
                choosers,
                locals_d=locals_dict,
                trace_all_rows=trace,
                trace_label=trace_label,
                trace_column_names=trace_column_names)

    return utilities
def trip_destination_sample(primary_purpose, trips, alternatives,
                            model_settings, size_term_matrix, skim_hotel,
                            estimator, chunk_size, trace_hh_id, trace_label):
    """
    Sample candidate destinations for trips, optionally presampling at the
    TAZ level for multi-zone systems.

    Returns
    -------
    destination_sample: pandas.dataframe
        choices_df from interaction_sample with (up to) sample_size alts for each chooser row
        index (non unique) is trip_id from trips (duplicated for each alt)
        and columns dest_zone_id, prob, and pick_count

        dest_zone_id: int
            alt identifier from alternatives[<alt_col_name>]
        prob: float
            the probability of the chosen alternative
        pick_count : int
            number of duplicate picks for chooser, alt
    """
    trace_label = tracing.extend_trace_label(trace_label, 'trip_destination_sample')

    assert len(trips) > 0
    assert len(alternatives) > 0

    # by default, enable presampling for multizone systems, unless they disable it in settings file
    network_los = inject.get_injectable('network_los')
    pre_sample_taz = not (network_los.zone_system == los.ONE_ZONE)
    if pre_sample_taz and not config.setting('want_dest_choice_presampling', True):
        pre_sample_taz = False
        logger.info(f"Disabled destination zone presampling for {trace_label} "
                    f"because 'want_dest_choice_presampling' setting is False")

    if pre_sample_taz:
        # two-stage: sample TAZs first, then MAZs within chosen TAZs
        logger.info("Running %s trip_destination_presample with %d trips" % (trace_label, len(trips)))

        choices = destination_presample(
            primary_purpose,
            trips,
            alternatives,
            model_settings,
            size_term_matrix,
            skim_hotel,
            network_los,
            estimator,
            chunk_size, trace_hh_id,
            trace_label)

    else:
        # single-stage sampling directly over the full alternatives
        choices = destination_sample(
            primary_purpose,
            trips,
            alternatives,
            model_settings,
            size_term_matrix,
            skim_hotel,
            estimator,
            chunk_size,
            trace_label)

    return choices
def initialize_tours(network_los, households, persons, trace_hh_id):
    """
    Read the input tours table, slice it to the sampled households, annotate
    it, patch tour ids, and register it with the pipeline, random-number
    generator, and tracer.

    Raises RuntimeError if any tour references a person not in the persons table.
    """

    trace_label = 'initialize_tours'

    tours = read_input_table("tours")

    # FIXME can't use households_sliced injectable as flag like persons table does in case of resume_after.
    # FIXME could just always slice...
    # (was the identical 'households_sample_size' test OR'd with itself - collapsed to one check)
    slice_happened = inject.get_injectable('households_sample_size', 0) > 0
    if slice_happened:
        logger.info("slicing tours %s" % (tours.shape,))
        # keep all tours whose persons are in the sampled households
        tours = tours[tours.person_id.isin(persons.index)]

    # annotate before patching tour_id to allow addition of REQUIRED_TOUR_COLUMNS defined above
    model_settings = config.read_model_settings('initialize_tours.yaml', mandatory=True)
    expressions.assign_columns(
        df=tours,
        model_settings=model_settings.get('annotate_tours'),
        trace_label=tracing.extend_trace_label(trace_label, 'annotate_tours'))

    skip_patch_tour_ids = model_settings.get('skip_patch_tour_ids', False)
    if not skip_patch_tour_ids:  # was an `if ...: pass / else:` construct
        tours = patch_tour_ids(tours)
    assert tours.index.name == 'tour_id'

    # replace table function with dataframe
    inject.add_table('tours', tours)

    pipeline.get_rn_generator().add_channel('tours', tours)

    tracing.register_traceable_table('tours', tours)

    logger.debug(f"{len(tours.household_id.unique())} unique household_ids in tours")
    logger.debug(f"{len(households.index.unique())} unique household_ids in households")
    assert not tours.index.duplicated().any()

    tours_without_persons = ~tours.person_id.isin(persons.index)
    if tours_without_persons.any():
        # message fix: denominator is the tours count, not the persons count
        logger.error(f"{tours_without_persons.sum()} tours out of {len(tours)} without persons\n"
                     f"{pd.Series({'person_id': tours_without_persons.index.values})}")
        raise RuntimeError(f"{tours_without_persons.sum()} tours with bad person_id")

    if trace_hh_id:
        tracing.trace_df(tours, label='initialize_tours', warn_if_empty=True)
def trip_destination_simulate(
        primary_purpose,
        trips,
        destination_sample,
        model_settings,
        size_term_matrix, skims,
        chunk_size, trace_hh_id,
        trace_label):
    """
    Chose destination from destination_sample (with od_logsum and dp_logsum columns added)

    Returns
    -------
    choices - pandas.Series
        destination alt chosen; rows whose only feasible choice was the
        NO_DESTINATION sentinel are dropped.
    """
    trace_label = tracing.extend_trace_label(trace_label, 'trip_destination_simulate')

    spec = get_spec_for_purpose(model_settings, 'DESTINATION_SPEC', primary_purpose)

    alt_dest_col_name = model_settings["ALT_DEST"]

    logger.info("Running trip_destination_simulate with %d trips", len(trips))

    # expression evaluation context: model constants, size terms, and skims
    locals_dict = config.get_model_constants(model_settings).copy()
    locals_dict.update({
        'size_terms': size_term_matrix
    })
    locals_dict.update(skims)

    destinations = interaction_sample_simulate(
        choosers=trips,
        alternatives=destination_sample,
        spec=spec,
        choice_column=alt_dest_col_name,
        allow_zero_probs=True,
        zero_prob_choice_val=NO_DESTINATION,
        skims=skims,
        locals_d=locals_dict,
        chunk_size=chunk_size,
        trace_label=trace_label,
        trace_choice_name='trip_dest')

    # drop any failed zero_prob destinations
    if (destinations == NO_DESTINATION).any():
        # logger.debug("dropping %s failed destinations", destinations == NO_DESTINATION).sum()
        destinations = destinations[destinations != NO_DESTINATION]

    return destinations
def od_presample(spec_segment_name, choosers, model_settings, network_los,
                 destination_size_terms, estimator, chunk_size, trace_label):
    """
    TAZ-level presampling for joint origin-destination choice.

    Samples (origin MAZ, destination TAZ) pairs encoded as a combined id
    column, splits the pair back into its components, then picks a
    destination MAZ within each sampled TAZ proportional to MAZ size terms.

    Returns a dataframe with the model's CHOOSER_ORIG_COL_NAME and
    ALT_DEST_COL_NAME columns.
    """

    trace_label = tracing.extend_trace_label(trace_label, 'presample')
    chunk_tag = 'tour_od.presample'

    logger.info(f"{trace_label} od_presample")

    alt_od_col_name = get_od_id_col(ORIG_MAZ, DEST_TAZ)

    MAZ_size_terms, TAZ_size_terms = aggregate_size_terms(destination_size_terms, network_los)

    # create wrapper with keys for this lookup - in this case there is a ORIG_TAZ
    # in the choosers and a DEST_TAZ in the alternatives which get merged during
    # interaction the skims will be available under the name "skims" for any @ expressions
    skim_dict = network_los.get_skim_dict('taz')
    skims = skim_dict.wrap(ORIG_TAZ, DEST_TAZ)

    orig_MAZ_dest_TAZ_sample = _od_sample(
        spec_segment_name,
        choosers,
        network_los,
        TAZ_size_terms,
        ORIG_MAZ,
        DEST_TAZ,
        skims,
        estimator,
        model_settings,
        alt_od_col_name,
        chunk_size,
        chunk_tag,
        trace_label)

    # split the combined "<omaz>_<dtaz>" id back into its integer components
    orig_MAZ_dest_TAZ_sample[ORIG_MAZ] = orig_MAZ_dest_TAZ_sample[
        alt_od_col_name].str.split('_').str[0].astype(int)
    orig_MAZ_dest_TAZ_sample[DEST_TAZ] = orig_MAZ_dest_TAZ_sample[
        alt_od_col_name].str.split('_').str[1].astype(int)

    # choose a MAZ for each DEST_TAZ choice, choice probability based on
    # MAZ size_term fraction of TAZ total
    maz_choices = choose_MAZ_for_TAZ(
        orig_MAZ_dest_TAZ_sample, MAZ_size_terms, trace_label,
        addtl_col_for_unique_key=ORIG_MAZ)

    # outputs
    assert DEST_MAZ in maz_choices

    alt_dest_col_name = model_settings['ALT_DEST_COL_NAME']
    chooser_orig_col_name = model_settings['CHOOSER_ORIG_COL_NAME']
    maz_choices = maz_choices.rename(columns={
        DEST_MAZ: alt_dest_col_name,
        ORIG_MAZ: chooser_orig_col_name
    })

    return maz_choices
def destination_presample(primary_purpose, trips, alternatives,
                          model_settings, size_term_matrix, skim_hotel,
                          network_los, estimator, chunk_size, trace_hh_id,
                          trace_label):
    """
    TAZ-level presampling for trip destination choice (multi-zone systems).

    Maps trip origins/primary destinations from MAZ to TAZ, samples
    destination TAZs against TAZ-aggregated size terms and alternatives,
    then chooses a MAZ within each sampled TAZ proportional to MAZ
    size-term shares.

    Returns a sample dataframe with the model's ALT_DEST_COL_NAME column.
    """

    trace_label = tracing.extend_trace_label(trace_label, 'presample')

    chunk_tag = 'trip_destination.presample'  # distinguish from trip_destination.sample

    alt_dest_col_name = model_settings['ALT_DEST_COL_NAME']

    # MAZ -> TAZ crosswalk as a series
    maz_taz = network_los.maz_taz_df[['MAZ', 'TAZ']].set_index('MAZ').TAZ

    TAZ_size_term_matrix = aggregate_size_term_matrix(size_term_matrix, maz_taz)

    TRIP_ORIGIN = model_settings['TRIP_ORIGIN']
    PRIMARY_DEST = model_settings['PRIMARY_DEST']

    # copy so the caller's trips keep their MAZ origin/destination columns
    trips = trips.copy()
    trips[TRIP_ORIGIN] = trips[TRIP_ORIGIN].map(maz_taz)
    trips[PRIMARY_DEST] = trips[PRIMARY_DEST].map(maz_taz)

    # alternatives is just an empty dataframe indexed by maz with index name <alt_dest_col_name>
    # but logically, we are aggregating so lets do it, as there is no particular gain in being clever
    alternatives = alternatives.groupby(alternatives.index.map(maz_taz)).sum()

    skims = skim_hotel.sample_skims(presample=True)

    taz_sample = _destination_sample(primary_purpose, trips, alternatives,
                                     model_settings, TAZ_size_term_matrix,
                                     skims, alt_dest_col_name, estimator,
                                     chunk_size, chunk_tag=chunk_tag,
                                     trace_label=trace_label)

    # choose a MAZ for each DEST_TAZ choice, choice probability based on MAZ size_term fraction of TAZ total
    maz_sample = choose_MAZ_for_TAZ(taz_sample, size_term_matrix, trips,
                                    network_los, alt_dest_col_name,
                                    trace_label)

    assert alt_dest_col_name in maz_sample

    return maz_sample
def parking_destination_simulate(segment_name, trips, destination_sample,
                                 model_settings, skims, chunk_size,
                                 trace_hh_id, trace_label):
    """
    Chose destination from destination_sample (with od_logsum and dp_logsum columns added)

    Returns
    -------
    choices - pandas.Series
        destination alt chosen; rows whose only feasible choice was the
        NO_DESTINATION sentinel are dropped.
    """
    # NOTE(review): label says 'trip_destination_simulate' although this is the
    # parking-location model - looks like a copy-paste; confirm before renaming
    # since trace output files are keyed on it
    trace_label = tracing.extend_trace_label(trace_label, 'trip_destination_simulate')

    spec = get_spec_for_segment(model_settings, 'SPECIFICATION', segment_name)

    alt_dest_col_name = model_settings['ALT_DEST_COL_NAME']

    logger.info("Running trip_destination_simulate with %d trips", len(trips))

    locals_dict = config.get_model_constants(model_settings).copy()
    locals_dict.update(skims)

    parking_locations = interaction_sample_simulate(
        choosers=trips,
        alternatives=destination_sample,
        spec=spec,
        choice_column=alt_dest_col_name,
        want_logsums=False,
        allow_zero_probs=True,
        zero_prob_choice_val=NO_DESTINATION,
        skims=skims,
        locals_d=locals_dict,
        chunk_size=chunk_size,
        trace_label=trace_label,
        trace_choice_name='parking_loc')

    # drop any failed zero_prob destinations
    if (parking_locations == NO_DESTINATION).any():
        logger.debug("dropping %s failed parking locations",
                     (parking_locations == NO_DESTINATION).sum())
        parking_locations = parking_locations[parking_locations != NO_DESTINATION]

    return parking_locations
def destination_presample(spec_segment_name, choosers, model_settings,
                          network_los, destination_size_terms, estimator,
                          chunk_size, trace_label):
    """
    TAZ-level presampling for tour destination choice (multi-zone systems).

    Samples destination TAZs using TAZ-aggregated size terms, then picks a
    MAZ within each sampled TAZ proportional to MAZ size-term shares.

    Returns a sample dataframe with the model's ALT_DEST_COL_NAME column.
    """

    trace_label = tracing.extend_trace_label(trace_label, 'presample')
    chunk_tag = 'tour_destination.presample'

    # log-message fix: previously said "location_presample" (copy-paste from the location model)
    logger.info(f"{trace_label} destination_presample")

    alt_dest_col_name = model_settings['ALT_DEST_COL_NAME']
    assert DEST_TAZ != alt_dest_col_name

    MAZ_size_terms, TAZ_size_terms = aggregate_size_terms(destination_size_terms, network_los)

    orig_maz = model_settings['CHOOSER_ORIG_COL_NAME']
    assert orig_maz in choosers
    if ORIG_TAZ not in choosers:
        # NOTE(review): this adds a column to the caller's choosers dataframe in place
        choosers[ORIG_TAZ] = map_maz_to_taz(choosers[orig_maz], network_los)

    # create wrapper with keys for this lookup - in this case there is a HOME_TAZ in the choosers
    # and a DEST_TAZ in the alternatives which get merged during interaction
    # the skims will be available under the name "skims" for any @ expressions
    skim_dict = network_los.get_skim_dict('taz')
    skims = skim_dict.wrap(ORIG_TAZ, DEST_TAZ)

    taz_sample = _destination_sample(spec_segment_name, choosers, TAZ_size_terms,
                                     skims, estimator, model_settings, DEST_TAZ,
                                     chunk_size, chunk_tag=chunk_tag,
                                     trace_label=trace_label)

    # choose a MAZ for each DEST_TAZ choice, choice probability based on MAZ size_term fraction of TAZ total
    maz_choices = choose_MAZ_for_TAZ(taz_sample, MAZ_size_terms, trace_label)

    assert DEST_MAZ in maz_choices
    maz_choices = maz_choices.rename(columns={DEST_MAZ: alt_dest_col_name})

    return maz_choices
def add_null_results(trace_label, mandatory_tour_frequency_settings):
    """Write an empty tours table and blank mandatory_tour_frequency when there are no choosers."""
    logger.info("Skipping %s: add_null_results", trace_label)

    persons = inject.get_table('persons').to_frame()
    persons['mandatory_tour_frequency'] = ''

    # an empty tours table with the expected columns and index name
    tours = pd.DataFrame(columns=['tour_category', 'tour_type', 'person_id'])
    tours.index.name = 'tour_id'
    pipeline.replace_table("tours", tours)

    expressions.assign_columns(
        df=persons,
        model_settings=mandatory_tour_frequency_settings.get('annotate_persons'),
        trace_label=tracing.extend_trace_label(trace_label, 'annotate_persons'))

    pipeline.replace_table("persons", persons)
def compute_tap_tap_time(self, recipe, access_df, egress_df, chooser_attributes, trace_label, trace):
    """
    Evaluate the tap_tap assignment spec over all candidate (btap, atap) pairs
    and return only the available (positive-time) pairs.

    Parameters
    ----------
    recipe : str
        key under TVPB_SETTINGS selecting CONSTANTS and tap_tap_settings
    access_df, egress_df : pandas.DataFrame
        candidate access/egress legs with 'idx' and 'btap'/'atap' columns
    chooser_attributes : pandas.DataFrame
        chooser columns broadcast onto each candidate path (dropped before return)
    trace_label : str
    trace : bool

    Returns
    -------
    transit_df : pandas.DataFrame
        one row per available (idx, btap, atap) with computed 'transit' time;
        rows with transit <= 0 are filtered out as unavailable
    """
    trace_label = tracing.extend_trace_label(trace_label, 'compute_tap_tap_time')

    model_constants = self.network_los.setting(f'TVPB_SETTINGS.{recipe}.CONSTANTS')
    tap_tap_settings = self.network_los.setting(f'TVPB_SETTINGS.{recipe}.tap_tap_settings')

    with memo("#TVPB CACHE compute_tap_tap_utilities all_transit_paths"):
        transit_df = self.all_transit_paths(access_df, egress_df, chooser_attributes,
                                            trace_label, trace)
        # note: transit_df index is arbitrary
        chunk.log_df(trace_label, "transit_df", transit_df)

    locals_d = {'los': self.network_los}
    locals_d.update(model_constants)

    assignment_spec = assign.read_assignment_spec(
        file_name=config.config_file_path(tap_tap_settings['SPEC']))

    results, _, _ = assign.assign_variables(assignment_spec, transit_df, locals_d)

    # BUG FIX: was `assert len(results.columns == 1)` which asserted the *length*
    # of a boolean array (truthy for any non-empty spec) - we require exactly one
    # result column because it is assigned to the single 'transit' column below
    assert len(results.columns) == 1
    transit_df['transit'] = results

    # filter out unavailable btap_atap pairs
    logger.debug(
        f"{(transit_df['transit'] <= 0).sum()} unavailable tap_tap pairs out of {len(transit_df)}")
    transit_df = transit_df[transit_df.transit > 0]

    # chooser attributes were only needed to evaluate the spec
    transit_df.drop(columns=chooser_attributes.columns, inplace=True)

    chunk.log_df(trace_label, "transit_df", None)

    if trace:
        self.trace_df(transit_df, trace_label, 'transit_df')

    return transit_df
def add_null_results(trace_label, mandatory_tour_frequency_settings):
    """
    Write placeholder results when this model step is skipped: every person gets an
    empty mandatory_tour_frequency, the tours table is replaced with an empty (but
    correctly shaped) frame, and annotate_persons expressions still run so that
    downstream columns exist.
    """
    logger.info("Skipping %s: add_null_results", trace_label)

    persons = inject.get_table('persons').to_frame()
    persons['mandatory_tour_frequency'] = ''

    # empty tours table with the columns/index name downstream steps expect
    tours = pd.DataFrame()
    tours['tour_category'] = None
    tours['tour_type'] = None
    tours['person_id'] = None
    tours.index.name = 'tour_id'
    pipeline.replace_table("tours", tours)

    expressions.assign_columns(
        df=persons,
        model_settings=mandatory_tour_frequency_settings.get('annotate_persons'),
        trace_label=tracing.extend_trace_label(trace_label, 'annotate_persons'))

    pipeline.replace_table("persons", persons)
def all_transit_paths(self, access_df, egress_df, chooser_attributes, trace_label, trace):
    """
    Build the deduplicated table of candidate transit paths: one row per chooser
    ('idx') for every boarding tap (btap) / alighting tap (atap) combination,
    excluding same-tap pairs, with chooser attributes broadcast onto each row.
    """
    trace_label = tracing.extend_trace_label(trace_label, 'all_transit_paths')

    # one row per chooser for each (btap, atap) combination
    boardings = access_df[['idx', 'btap']]
    alightings = egress_df[['idx', 'atap']]
    paths = pd.merge(boardings, alightings, on='idx').drop_duplicates()

    # a transit trip must not board and alight at the same tap
    paths = paths[paths['atap'] != paths['btap']]

    # broadcast chooser attributes onto each candidate path via 'idx'
    for attribute in list(chooser_attributes.columns):
        paths[attribute] = reindex(chooser_attributes[attribute], paths['idx'])

    paths = paths.reset_index(drop=True)

    if trace:
        self.trace_df(paths, trace_label, 'all_transit_df')

    return paths
def non_mandatory_tour_frequency(persons, persons_merged, chunk_size, trace_hh_id):
    """
    This model predicts the frequency of making non-mandatory trips
    (alternatives for this model come from a separate csv file which is
    configured by the user) - these trips include escort, shopping, othmaint,
    othdiscr, eatout, and social trips in various combination.

    Side effects: extends the pipeline "tours" table with the generated
    non-mandatory tours and replaces the "persons" table with the
    non_mandatory_tour_frequency and annotation columns added.
    """

    trace_label = 'non_mandatory_tour_frequency'
    model_settings = config.read_model_settings('non_mandatory_tour_frequency.yaml')
    model_spec = simulate.read_model_spec(file_name='non_mandatory_tour_frequency.csv')

    alternatives = simulate.read_model_alts(
        config.config_file_path('non_mandatory_tour_frequency_alternatives.csv'),
        set_index=None)

    choosers = persons_merged.to_frame()

    # FIXME kind of tacky both that we know to add this here and del it below
    # 'tot_tours' is used in model_spec expressions
    alternatives['tot_tours'] = alternatives.sum(axis=1)

    # - preprocessor
    preprocessor_settings = model_settings.get('preprocessor', None)
    if preprocessor_settings:

        locals_dict = {
            'person_max_window': person_max_window
        }

        expressions.assign_columns(
            df=choosers,
            model_settings=preprocessor_settings,
            locals_dict=locals_dict,
            trace_label=trace_label)

    # filter based on results of CDAP
    choosers = choosers[choosers.cdap_activity.isin(['M', 'N'])]

    logger.info("Running non_mandatory_tour_frequency with %d persons", len(choosers))

    constants = config.get_model_constants(model_settings)

    choices_list = []
    # segment by person type and pick the right spec for each person type
    for ptype, segment in choosers.groupby('ptype'):

        name = PTYPE_NAME[ptype]

        # pick the spec column for the segment
        spec = model_spec[[name]]

        # drop any zero-valued rows
        spec = spec[spec[name] != 0]

        logger.info("Running segment '%s' of size %d", name, len(segment))

        choices = interaction_simulate(
            segment,
            alternatives,
            spec=spec,
            locals_d=constants,
            chunk_size=chunk_size,
            trace_label='non_mandatory_tour_frequency.%s' % name,
            trace_choice_name='non_mandatory_tour_frequency')

        choices_list.append(choices)

        # FIXME - force garbage collection?
        # force_garbage_collect()

    choices = pd.concat(choices_list)

    del alternatives['tot_tours']  # del tot_tours column we added above

    # - add non_mandatory_tour_frequency column to persons
    persons = persons.to_frame()

    # need to reindex as we only handled persons with cdap_activity in ['M', 'N']
    # (we expect there to be an alt with no tours - which we can use to backfill non-travelers)
    # BUG FIX: previously `(alternatives.sum(axis=1) == 0).index[0]` took the *first*
    # alternative unconditionally (the boolean mask has the same index as alternatives),
    # which only happened to work when the zero-tour alternative was the first row.
    # idxmax returns the first alternative that actually has no tours.
    zero_tour_alts = (alternatives.sum(axis=1) == 0)
    assert zero_tour_alts.any(), \
        "expected an alternative with no tours to backfill non-travelers"
    no_tours_alt = zero_tour_alts.idxmax()
    persons['non_mandatory_tour_frequency'] = \
        choices.reindex(persons.index).fillna(no_tours_alt).astype(np.int8)

    """
    We have now generated non-mandatory tours, but they are attributes of the person table
    Now we create a "tours" table which has one row per tour that has been generated
    (and the person id it is associated with)
    """

    # - get counts of each of the alternatives (so we can extend)
    # (choices is just the index values for the chosen alts)
    """
               escort  shopping  othmaint  othdiscr    eatout    social
    parent_id
    2588676         2         0         0         1         1         0
    2588677         0         1         0         1         0         0
    """
    tour_counts = alternatives.loc[choices]
    tour_counts.index = choices.index  # assign person ids to the index

    prev_tour_count = tour_counts.sum().sum()

    # - extend_tour_counts
    tour_counts = extend_tour_counts(choosers, tour_counts, alternatives,
                                     trace_hh_id,
                                     tracing.extend_trace_label(trace_label, 'extend_tour_counts'))

    extended_tour_count = tour_counts.sum().sum()

    # BUG FIX: was logging.info (root logger) - use the module-level logger like
    # the rest of this function so messages honor the configured logger
    logger.info("extend_tour_counts increased nmtf tour count by %s from %s to %s" %
                (extended_tour_count - prev_tour_count, prev_tour_count, extended_tour_count))

    # - create the non_mandatory tours
    non_mandatory_tours = process_non_mandatory_tours(persons, tour_counts)
    assert len(non_mandatory_tours) == extended_tour_count

    pipeline.extend_table("tours", non_mandatory_tours)
    tracing.register_traceable_table('tours', non_mandatory_tours)
    pipeline.get_rn_generator().add_channel('tours', non_mandatory_tours)

    expressions.assign_columns(
        df=persons,
        model_settings=model_settings.get('annotate_persons'),
        trace_label=trace_label)

    pipeline.replace_table("persons", persons)

    tracing.print_summary('non_mandatory_tour_frequency',
                          persons.non_mandatory_tour_frequency, value_counts=True)

    if trace_hh_id:
        tracing.trace_df(non_mandatory_tours,
                         label="non_mandatory_tour_frequency.non_mandatory_tours",
                         warn_if_empty=True)

        tracing.trace_df(choosers,
                         label="non_mandatory_tour_frequency.choosers",
                         warn_if_empty=True)

        tracing.trace_df(persons,
                         label="non_mandatory_tour_frequency.annotated_persons",
                         warn_if_empty=True)
def cdap_simulate(persons_merged, persons, households,
                  cdap_indiv_spec,
                  cdap_interaction_coefficients,
                  cdap_fixed_relative_proportions,
                  chunk_size, trace_hh_id):
    """
    CDAP stands for Coordinated Daily Activity Pattern, which is a choice of
    high-level activity pattern for each person, in a coordinated way with other
    members of a person's household.

    Because Python requires vectorization of computation, there are some specialized
    routines in the cdap directory of activitysim for this purpose.  This module
    simply applies those utilities using the simulation framework.

    Side effects: replaces the pipeline "persons" table (with cdap_activity,
    cdap_rank and annotation columns added) and the "households" table (with
    annotation columns added).
    """

    trace_label = 'cdap'
    model_settings = config.read_model_settings('cdap.yaml')

    persons_merged = persons_merged.to_frame()

    constants = config.get_model_constants(model_settings)

    cdap_interaction_coefficients = \
        cdap.preprocess_interaction_coefficients(cdap_interaction_coefficients)

    # specs are built just-in-time on demand and cached as injectables
    # prebuilding here allows us to write them to the output directory
    # (also when multiprocessing locutor might not see all household sizes)
    logger.info("Pre-building cdap specs")
    for hhsize in range(2, cdap.MAX_HHSIZE + 1):
        spec = cdap.build_cdap_spec(cdap_interaction_coefficients, hhsize, cache=True)
        # only the 'locutor' process writes files, to avoid multiprocess collisions
        if inject.get_injectable('locutor', False):
            spec.to_csv(config.output_file_path('cdap_spec_%s.csv' % hhsize), index=True)

    logger.info("Running cdap_simulate with %d persons", len(persons_merged.index))

    choices = cdap.run_cdap(
        persons=persons_merged,
        cdap_indiv_spec=cdap_indiv_spec,
        cdap_interaction_coefficients=cdap_interaction_coefficients,
        cdap_fixed_relative_proportions=cdap_fixed_relative_proportions,
        locals_d=constants,
        chunk_size=chunk_size,
        trace_hh_id=trace_hh_id,
        trace_label=trace_label)

    # - assign results to persons table and annotate
    persons = persons.to_frame()

    # align choices (computed for persons_merged) to the full persons index
    choices = choices.reindex(persons.index)
    persons['cdap_activity'] = choices.cdap_activity
    persons['cdap_rank'] = choices.cdap_rank

    expressions.assign_columns(
        df=persons,
        model_settings=model_settings.get('annotate_persons'),
        trace_label=tracing.extend_trace_label(trace_label, 'annotate_persons'))

    pipeline.replace_table("persons", persons)

    # - annotate households table
    households = households.to_frame()
    expressions.assign_columns(
        df=households,
        model_settings=model_settings.get('annotate_households'),
        trace_label=tracing.extend_trace_label(trace_label, 'annotate_households'))
    pipeline.replace_table("households", households)

    tracing.print_summary('cdap_activity', persons.cdap_activity, value_counts=True)
    logger.info("cdap crosstabs:\n%s" %
                pd.crosstab(persons.ptype, persons.cdap_activity, margins=True))

    if trace_hh_id:
        tracing.trace_df(inject.get_table('persons_merged').to_frame(),
                         label="cdap",
                         columns=['ptype', 'cdap_rank', 'cdap_activity'],
                         warn_if_empty=True)
def _compute_logsums(alt_tdd, tours_merged, tour_purpose, model_settings, trace_label):
    """
    compute logsums for tours using skims for alt_tdd out_period and in_period

    Parameters
    ----------
    alt_tdd : pandas.DataFrame
        alternative time-of-day rows (one chooser row per alternative)
    tours_merged : pandas.DataFrame
        tour attributes joined onto each alt_tdd row
    tour_purpose : str
        selects the destination column and the coefficient spec segment
    model_settings : dict
    trace_label : str

    Returns
    -------
    logsums : pandas.Series
        mode choice logsum for each alt_tdd row
    """

    trace_label = tracing.extend_trace_label(trace_label, 'logsums')

    logsum_settings = config.read_model_settings(model_settings['LOGSUM_SETTINGS'])

    choosers = alt_tdd.join(tours_merged, how='left', rsuffix='_chooser')
    # NOTE(review): the format string "%d choosers%s alts" looks like it is missing
    # a separator between the two counts - confirm intended log output
    logger.info("%s compute_logsums for %d choosers%s alts" %
                (trace_label, choosers.shape[0], alt_tdd.shape[0]))

    # - setup skims

    skim_dict = inject.get_injectable('skim_dict')
    skim_stack = inject.get_injectable('skim_stack')

    orig_col_name = 'TAZ'
    # destination column name depends on the tour purpose
    dest_col_name = model_settings.get('DESTINATION_FOR_TOUR_PURPOSE').get(tour_purpose)

    odt_skim_stack_wrapper = skim_stack.wrap(left_key=orig_col_name, right_key=dest_col_name,
                                             skim_key='out_period')
    dot_skim_stack_wrapper = skim_stack.wrap(left_key=dest_col_name, right_key=orig_col_name,
                                             skim_key='in_period')
    od_skim_stack_wrapper = skim_dict.wrap(orig_col_name, dest_col_name)

    skims = {
        "odt_skims": odt_skim_stack_wrapper,
        "dot_skims": dot_skim_stack_wrapper,
        "od_skims": od_skim_stack_wrapper,
        'orig_col_name': orig_col_name,
        'dest_col_name': dest_col_name,
    }

    # - locals_dict
    constants = config.get_model_constants(logsum_settings)

    omnibus_coefficient_spec = get_coeffecients_spec(logsum_settings)
    coefficient_spec = omnibus_coefficient_spec[tour_purpose]
    coefficients = assign.evaluate_constants(coefficient_spec, constants=constants)

    locals_dict = {}
    locals_dict.update(coefficients)
    locals_dict.update(constants)
    locals_dict.update(skims)

    # - run preprocessor to annotate choosers
    # allow specification of alternate preprocessor for nontour choosers
    preprocessor = model_settings.get('LOGSUM_PREPROCESSOR', 'preprocessor')
    preprocessor_settings = logsum_settings[preprocessor]

    if preprocessor_settings:

        simulate.set_skim_wrapper_targets(choosers, skims)

        expressions.assign_columns(
            df=choosers,
            model_settings=preprocessor_settings,
            locals_dict=locals_dict,
            trace_label=trace_label)

    # - compute logsums
    logsum_spec = get_logsum_spec(logsum_settings)
    nest_spec = config.get_logit_model_settings(logsum_settings)

    logsums = simulate.simple_simulate_logsums(
        choosers,
        logsum_spec,
        nest_spec,
        skims=skims,
        locals_d=locals_dict,
        chunk_size=0,
        trace_label=trace_label)

    return logsums
def run_location_choice(
        persons_merged_df,
        skim_dict, skim_stack,
        spc,
        model_settings,
        chunk_size, trace_hh_id, trace_label
        ):
    """
    Run the three-part location choice algorithm to generate a location choice for each chooser

    Handle the various segments separately and in turn for simplicity of expression files

    Parameters
    ----------
    persons_merged_df : pandas.DataFrame
        persons table merged with households and land_use
    skim_dict : skim.SkimDict
    skim_stack : skim.SkimStack
    spc : ShadowPriceCalculator
        to get size terms
    model_settings : dict
    chunk_size : int
    trace_hh_id : int
    trace_label : str

    Returns
    -------
    pandas.Series
        location choices (zone ids) indexed by persons_merged_df.index
        (an empty Series if no segment had any choosers)
    """

    chooser_segment_column = model_settings['CHOOSER_SEGMENT_COLUMN_NAME']

    # maps segment names to compact (integer) ids
    segment_ids = model_settings['SEGMENT_IDS']

    choices_list = []
    for segment_name, segment_id in iteritems(segment_ids):

        choosers = persons_merged_df[persons_merged_df[chooser_segment_column] == segment_id]

        # size_term and shadow price adjustment - one row per zone
        dest_size_terms = spc.dest_size_terms(segment_name)

        if choosers.shape[0] == 0:
            logger.info("%s skipping segment %s: no choosers", trace_label, segment_name)
            continue

        # - location_sample: sample candidate destinations for each chooser
        location_sample_df = \
            run_location_sample(
                segment_name,
                choosers,
                skim_dict,
                dest_size_terms,
                model_settings,
                chunk_size,
                tracing.extend_trace_label(trace_label, 'sample.%s' % segment_name))

        # - location_logsums: annotate the sample with mode choice logsums
        location_sample_df = \
            run_location_logsums(
                segment_name,
                choosers,
                skim_dict, skim_stack,
                location_sample_df,
                model_settings,
                chunk_size,
                trace_hh_id,
                tracing.extend_trace_label(trace_label, 'logsums.%s' % segment_name))

        # - location_simulate: choose one destination from the annotated sample
        choices = \
            run_location_simulate(
                segment_name,
                choosers,
                location_sample_df,
                skim_dict,
                dest_size_terms,
                model_settings,
                chunk_size,
                tracing.extend_trace_label(trace_label, 'simulate.%s' % segment_name))

        choices_list.append(choices)

        # FIXME - want to do this here?
        del location_sample_df
        force_garbage_collect()

    return pd.concat(choices_list) if len(choices_list) > 0 else pd.Series()
def run_cdap(
        persons,
        cdap_indiv_spec,
        cdap_interaction_coefficients,
        cdap_fixed_relative_proportions,
        locals_d,
        chunk_size=0, trace_hh_id=None, trace_label=None):
    """
    Choose individual activity patterns for persons.

    Parameters
    ----------
    persons : pandas.DataFrame
        Table of persons data. Must contain at least a household ID, household size,
        person type category, and age, plus any columns used in cdap_indiv_spec
    cdap_indiv_spec : pandas.DataFrame
        CDAP spec for individuals without taking any interactions into account.
    cdap_interaction_coefficients : pandas.DataFrame
        Rules and coefficients for generating interaction specs for different household sizes
    cdap_fixed_relative_proportions : pandas.DataFrame
        Spec to for the relative proportions of each activity (M, N, H)
        to choose activities for additional household members not handled by CDAP
    locals_d : Dict
        This is a dictionary of local variables that will be the environment
        for an evaluation of an expression that begins with @ in either the
        cdap_indiv_spec or cdap_fixed_relative_proportions expression files
    chunk_size: int
        Chunk size or 0 for no chunking
    trace_hh_id : int
        hh_id to trace or None if no hh tracing
    trace_label : str
        label for tracing or None if no tracing

    Returns
    -------
    choices : pandas.DataFrame

        dataframe is indexed on _persons_index_ and has two columns:

        cdap_activity : str
            activity for that person expressed as 'M', 'N', 'H'
        cdap_rank : int
            activities for persons with cdap_rank <= MAX_HHSIZE are determined by cdap
            'extra' household members activities are assigned by cdap_fixed_relative_proportions
    """

    trace_label = tracing.extend_trace_label(trace_label, 'cdap')

    rows_per_chunk, effective_chunk_size = \
        calc_rows_per_chunk(chunk_size, persons, trace_label=trace_label)

    result_list = []
    # chunk persons (keeping household members together) and run cdap on each chunk
    # (the original comment said "segment by person type" - that happens inside _run_cdap)
    for i, num_chunks, persons_chunk in chunk.chunked_choosers_by_chunk_id(persons, rows_per_chunk):

        logger.info("Running chunk %s of %s with %d persons" %
                    (i, num_chunks, len(persons_chunk)))

        chunk_trace_label = tracing.extend_trace_label(trace_label, 'chunk_%s' % i)

        chunk.log_open(chunk_trace_label, chunk_size, effective_chunk_size)

        choices = _run_cdap(persons_chunk,
                            cdap_indiv_spec,
                            cdap_interaction_coefficients,
                            cdap_fixed_relative_proportions,
                            locals_d,
                            trace_hh_id, chunk_trace_label)

        chunk.log_close(chunk_trace_label)

        result_list.append(choices)

    # FIXME: this will require 2X RAM
    # if necessary, could append to hdf5 store on disk:
    # http://pandas.pydata.org/pandas-docs/stable/io.html#id2
    if len(result_list) > 1:
        choices = pd.concat(result_list)
    elif not result_list:
        # BUG FIX: with no persons at all, `choices` was previously unbound and the
        # return raised NameError; return an empty frame with the documented columns
        choices = pd.DataFrame(columns=['cdap_activity', 'cdap_rank'])
    # (with exactly one chunk, `choices` is already the single chunk's result)

    return choices
def run_tour_destination(
        tours,
        persons_merged,
        model_settings,
        skim_dict,
        skim_stack,
        chunk_size, trace_hh_id, trace_label):
    """
    Three-part (sample, logsums, simulate) destination choice for tours, run
    separately for each segment listed in model_settings['SEGMENTS'].

    Parameters
    ----------
    tours : pandas.DataFrame
        must contain model_settings['CHOOSER_SEGMENT_COLUMN_NAME'] column
    persons_merged : pandas.DataFrame
    model_settings : dict
    skim_dict : skim.SkimDict
    skim_stack : skim.SkimStack
    chunk_size : int
    trace_hh_id : int
    trace_label : str

    Returns
    -------
    pandas.Series
        chosen destination zone ids indexed by the chooser tours' index
        (an empty Series if no segment had any choosers)
    """

    size_term_calculator = SizeTermCalculator(model_settings['SIZE_TERM_SELECTOR'])

    chooser_segment_column = model_settings['CHOOSER_SEGMENT_COLUMN_NAME']

    # list of segment names to process in turn
    segments = model_settings['SEGMENTS']

    # interaction_sample_simulate insists choosers appear in same order as alts
    tours = tours.sort_index()

    choices_list = []
    for segment_name in segments:

        choosers = tours[tours[chooser_segment_column] == segment_name]

        # size_term segment is segment_name
        segment_destination_size_terms = size_term_calculator.dest_size_terms_df(segment_name)

        if choosers.shape[0] == 0:
            logger.info("%s skipping segment %s: no choosers", trace_label, segment_name)
            continue

        # - destination_sample
        spec_segment_name = segment_name  # spec_segment_name is segment_name
        location_sample_df = \
            run_destination_sample(
                spec_segment_name,
                choosers,
                persons_merged,
                model_settings,
                skim_dict,
                segment_destination_size_terms,
                chunk_size,
                tracing.extend_trace_label(trace_label, 'sample.%s' % segment_name))

        # - destination_logsums
        tour_purpose = segment_name  # tour_purpose is segment_name
        location_sample_df = \
            run_destination_logsums(
                tour_purpose,
                persons_merged,
                location_sample_df,
                model_settings,
                skim_dict, skim_stack,
                chunk_size, trace_hh_id,
                tracing.extend_trace_label(trace_label, 'logsums.%s' % segment_name))

        # - destination_simulate
        spec_segment_name = segment_name  # spec_segment_name is segment_name
        choices = \
            run_destination_simulate(
                spec_segment_name,
                choosers,
                persons_merged,
                location_sample_df,
                model_settings,
                skim_dict,
                segment_destination_size_terms,
                chunk_size,
                tracing.extend_trace_label(trace_label, 'simulate.%s' % segment_name))

        choices_list.append(choices)

        # FIXME - want to do this here?
        del location_sample_df
        force_garbage_collect()

    return pd.concat(choices_list) if len(choices_list) > 0 else pd.Series()
def compute_logsums(
        primary_purpose,
        trips,
        destination_sample,
        tours_merged,
        model_settings,
        skims,
        chunk_size, trace_hh_id,
        trace_label):
    """
    Calculate mode choice logsums using the same recipe as for trip_mode_choice, but do it
    twice for each alternative since we need out-of-direction logsum
    (i.e. origin to alt_dest, and alt_dest to half-tour destination)

    Returns
    -------
    adds od_logsum and dp_logsum columns to destination_sample (in place)
    (the original docstring said "to trips" - the code assigns to destination_sample)
    """

    trace_label = tracing.extend_trace_label(trace_label, 'compute_logsums')
    logger.info("Running %s with %d samples", trace_label, destination_sample.shape[0])

    # - trips_merged - merge trips and tours_merged
    trips_merged = pd.merge(
        trips,
        tours_merged,
        left_on='tour_id',
        right_index=True,
        how="left")
    assert trips_merged.index.equals(trips.index)

    # - choosers - merge destination_sample and trips_merged
    # re/set index because pandas merge does not preserve left index if it has duplicate values!
    choosers = pd.merge(destination_sample,
                        trips_merged.reset_index(),
                        left_index=True,
                        right_on='trip_id',
                        how="left",
                        suffixes=('', '_r')).set_index('trip_id')
    assert choosers.index.equals(destination_sample.index)

    logsum_settings = config.read_model_settings(model_settings['LOGSUM_SETTINGS'])

    omnibus_coefficient_spec = \
        assign.read_constant_spec(config.config_file_path(logsum_settings['COEFFS']))
    coefficient_spec = omnibus_coefficient_spec[primary_purpose]

    constants = config.get_model_constants(logsum_settings)
    locals_dict = assign.evaluate_constants(coefficient_spec, constants=constants)
    locals_dict.update(constants)

    # - od_logsums: trip origin to alternative destination
    od_skims = {
        'ORIGIN': model_settings['TRIP_ORIGIN'],
        'DESTINATION': model_settings['ALT_DEST'],
        "odt_skims": skims['odt_skims'],
        "od_skims": skims['od_skims'],
    }
    destination_sample['od_logsum'] = compute_ood_logsums(
        choosers,
        logsum_settings,
        od_skims,
        locals_dict,
        chunk_size,
        trace_label=tracing.extend_trace_label(trace_label, 'od'))

    # - dp_logsums: alternative destination to tour primary destination
    dp_skims = {
        'ORIGIN': model_settings['ALT_DEST'],
        'DESTINATION': model_settings['PRIMARY_DEST'],
        "odt_skims": skims['dpt_skims'],
        "od_skims": skims['dp_skims'],
    }
    destination_sample['dp_logsum'] = compute_ood_logsums(
        choosers,
        logsum_settings,
        dp_skims,
        locals_dict,
        chunk_size,
        trace_label=tracing.extend_trace_label(trace_label, 'dp'))
def tour_mode_choice_simulate(tours, persons_merged, skim_dict, skim_stack,
                              chunk_size, trace_hh_id):
    """
    Tour mode choice simulate

    Chooses a mode for each non-atwork tour, segmented by tour_type, and writes
    the choice back to the pipeline "tours" table as a 'tour_mode' column.
    """
    trace_label = 'tour_mode_choice'
    model_settings = config.read_model_settings('tour_mode_choice.yaml')

    spec = tour_mode_choice_spec(model_settings)

    primary_tours = tours.to_frame()

    # atwork subtours are handled by a separate model step
    assert not (primary_tours.tour_category == 'atwork').any()

    persons_merged = persons_merged.to_frame()

    nest_spec = config.get_logit_model_settings(model_settings)
    constants = config.get_model_constants(model_settings)

    logger.info("Running %s with %d tours" % (trace_label, primary_tours.shape[0]))

    tracing.print_summary('tour_types', primary_tours.tour_type, value_counts=True)

    primary_tours_merged = pd.merge(primary_tours, persons_merged, left_on='person_id',
                                    right_index=True, how='left', suffixes=('', '_r'))

    # setup skim keys
    orig_col_name = 'TAZ'
    dest_col_name = 'destination'
    out_time_col_name = 'start'
    in_time_col_name = 'end'
    odt_skim_stack_wrapper = skim_stack.wrap(left_key=orig_col_name, right_key=dest_col_name,
                                             skim_key='out_period')
    dot_skim_stack_wrapper = skim_stack.wrap(left_key=dest_col_name, right_key=orig_col_name,
                                             skim_key='in_period')
    od_skim_stack_wrapper = skim_dict.wrap(orig_col_name, dest_col_name)

    skims = {
        "odt_skims": odt_skim_stack_wrapper,
        "dot_skims": dot_skim_stack_wrapper,
        "od_skims": od_skim_stack_wrapper,
        'orig_col_name': orig_col_name,
        'dest_col_name': dest_col_name,
        'out_time_col_name': out_time_col_name,
        'in_time_col_name': in_time_col_name
    }

    choices_list = []
    for tour_type, segment in primary_tours_merged.groupby('tour_type'):

        logger.info("tour_mode_choice_simulate tour_type '%s' (%s tours)" %
                    (tour_type, len(segment.index), ))

        # name index so tracing knows how to slice
        assert segment.index.name == 'tour_id'

        choices = run_tour_mode_choice_simulate(
            segment,
            spec, tour_type, model_settings,
            skims=skims,
            constants=constants,
            nest_spec=nest_spec,
            chunk_size=chunk_size,
            trace_label=tracing.extend_trace_label(trace_label, tour_type),
            trace_choice_name='tour_mode_choice')

        tracing.print_summary('tour_mode_choice_simulate %s choices' % tour_type,
                              choices, value_counts=True)

        choices_list.append(choices)

        # FIXME - force garbage collection
        force_garbage_collect()

    choices = pd.concat(choices_list)

    tracing.print_summary('tour_mode_choice_simulate all tour type choices',
                          choices, value_counts=True)

    # so we can trace with annotations
    primary_tours['tour_mode'] = choices

    # but only keep mode choice col
    all_tours = tours.to_frame()
    # uncomment to save annotations to table
    # assign_in_place(all_tours, annotations)
    assign_in_place(all_tours, choices.to_frame('tour_mode'))

    pipeline.replace_table("tours", all_tours)

    if trace_hh_id:
        tracing.trace_df(primary_tours,
                         label=tracing.extend_trace_label(trace_label, 'tour_mode'),
                         slicer='tour_id',
                         index_label='tour_id',
                         warn_if_empty=True)
def stop_frequency(
        tours, tours_merged,
        stop_frequency_alts,
        skim_dict,
        chunk_size,
        trace_hh_id):
    """
    stop frequency model

    For each tour, shoose a number of intermediate inbound stops and outbound stops.
    Create a trip table with inbound and outbound trips.

    Thus, a tour with stop_frequency '2out_0in' will have two outbound and zero inbound stops,
    and four corresponding trips: three outbound, and one inbound.

    Adds stop_frequency str column to trips, with fields

    creates trips table with columns:

    ::

        - person_id
        - household_id
        - tour_id
        - primary_purpose
        - atwork
        - trip_num
        - outbound
        - trip_count
    """

    trace_label = 'stop_frequency'
    model_settings = config.read_model_settings('stop_frequency.yaml')

    tours = tours.to_frame()
    tours_merged = tours_merged.to_frame()

    assert not tours_merged.household_id.isnull().any()
    assert not (tours_merged.origin == -1).any()
    assert not (tours_merged.destination == -1).any()

    nest_spec = config.get_logit_model_settings(model_settings)
    constants = config.get_model_constants(model_settings)

    # - run preprocessor to annotate tours_merged
    # BUG FIX: initialize so the trace block below doesn't raise NameError
    # when tracing is on but no preprocessor is configured
    annotations = None
    preprocessor_settings = model_settings.get('preprocessor', None)
    if preprocessor_settings:

        # hack: preprocessor adds origin column in place if it does not exist already
        od_skim_stack_wrapper = skim_dict.wrap('origin', 'destination')
        skims = [od_skim_stack_wrapper]

        locals_dict = {
            "od_skims": od_skim_stack_wrapper
        }
        if constants is not None:
            locals_dict.update(constants)

        simulate.set_skim_wrapper_targets(tours_merged, skims)

        # this should be pre-slice as some expressions may count tours by type
        annotations = expressions.compute_columns(
            df=tours_merged,
            model_settings=preprocessor_settings,
            locals_dict=locals_dict,
            trace_label=trace_label)

        assign_in_place(tours_merged, annotations)

    tracing.print_summary('stop_frequency segments',
                          tours_merged.primary_purpose, value_counts=True)

    choices_list = []
    for segment_type, choosers in tours_merged.groupby('primary_purpose'):

        # BUG FIX: was logging.info (root logger) - use the module logger
        logger.info("%s running segment %s with %s chooser rows" %
                    (trace_label, segment_type, choosers.shape[0]))

        spec = simulate.read_model_spec(file_name='stop_frequency_%s.csv' % segment_type)

        assert spec is not None, "spec for segment_type %s not found" % segment_type

        choices = simulate.simple_simulate(
            choosers=choosers,
            spec=spec,
            nest_spec=nest_spec,
            locals_d=constants,
            chunk_size=chunk_size,
            trace_label=tracing.extend_trace_label(trace_label, segment_type),
            trace_choice_name='stops')

        # convert indexes to alternative names
        choices = pd.Series(spec.columns[choices.values], index=choices.index)

        choices_list.append(choices)

    choices = pd.concat(choices_list)

    tracing.print_summary('stop_frequency', choices, value_counts=True)

    # add stop_frequency choices to tours table
    assign_in_place(tours, choices.to_frame('stop_frequency'))

    if 'primary_purpose' not in tours.columns:
        assign_in_place(tours, tours_merged[['primary_purpose']])

    pipeline.replace_table("tours", tours)

    # create trips table
    trips = process_trips(tours, stop_frequency_alts)
    trips = pipeline.extend_table("trips", trips)
    tracing.register_traceable_table('trips', trips)
    pipeline.get_rn_generator().add_channel('trips', trips)

    if trace_hh_id:
        tracing.trace_df(tours,
                         label="stop_frequency.tours",
                         slicer='person_id',
                         columns=None)

        tracing.trace_df(trips,
                         label="stop_frequency.trips",
                         slicer='person_id',
                         columns=None)

        if annotations is not None:
            # BUG FIX: annotations only exists when a preprocessor ran
            tracing.trace_df(annotations,
                             label="stop_frequency.annotations",
                             columns=None)

        tracing.trace_df(tours_merged,
                         label="stop_frequency.tours_merged",
                         slicer='person_id',
                         columns=None)
def run_trip_destination(
        trips,
        tours_merged,
        chunk_size, trace_hh_id,
        trace_label):
    """
    trip destination - main functionality separated from model step so it can be called iteratively

    Run the trip_destination model, assigning destinations for each (intermediate) trip
    (last trips already have a destination - either the tour primary destination or Home)

    Set trip destination and origin columns, and a boolean failed flag for any failed trips
    (destination for flagged failed trips will be set to -1)

    Parameters
    ----------
    trips : pandas.DataFrame
        trips table (mutated in place: destination, origin, failed columns set/updated)
    tours_merged : pandas.DataFrame
        tours merged with persons/households, used for origin/destination defaults and logsums
    chunk_size : int
    trace_hh_id : int
    trace_label : str

    Returns
    -------
    trips : pandas.DataFrame
        trips with updated destination/origin columns and a boolean 'failed' column
    """

    model_settings = config.read_model_settings('trip_destination.yaml')
    preprocessor_settings = model_settings.get('preprocessor', None)
    logsum_settings = config.read_model_settings(model_settings['LOGSUM_SETTINGS'])

    land_use = inject.get_table('land_use')
    size_terms = inject.get_injectable('size_terms')

    # - initialize trip origin and destination to those of half-tour
    # (we will sequentially adjust intermediate trips origin and destination as we choose them)
    tour_destination = reindex(tours_merged.destination, trips.tour_id).astype(int)
    tour_origin = reindex(tours_merged.origin, trips.tour_id).astype(int)
    trips['destination'] = np.where(trips.outbound, tour_destination, tour_origin)
    trips['origin'] = np.where(trips.outbound, tour_origin, tour_destination)
    trips['failed'] = False

    trips = trips.sort_index()
    # next_trip_id links each trip to its successor so a chosen destination can be
    # propagated forward as the next trip's origin; 0 marks the last trip of a leg
    trips['next_trip_id'] = np.roll(trips.index, -1)
    trips.next_trip_id = trips.next_trip_id.where(trips.trip_num < trips.trip_count, 0)

    # - filter tours_merged (AFTER copying destination and origin columns to trips)
    # tours_merged is used for logsums, we filter it here upfront to save space and time
    tours_merged_cols = logsum_settings['TOURS_MERGED_CHOOSER_COLUMNS']
    if 'REDUNDANT_TOURS_MERGED_CHOOSER_COLUMNS' in model_settings:
        redundant_cols = model_settings['REDUNDANT_TOURS_MERGED_CHOOSER_COLUMNS']
        tours_merged_cols = [c for c in tours_merged_cols if c not in redundant_cols]
    tours_merged = tours_merged[tours_merged_cols]

    # - skims
    skims = wrap_skims(model_settings)

    # - size_terms and alternatives
    alternatives = tour_destination_size_terms(land_use, size_terms, 'trip')

    # DataFrameMatrix alows us to treat dataframe as virtual a 2-D array, indexed by TAZ, purpose
    # e.g. size_terms.get(df.dest_taz, df.purpose)
    # returns a series of size_terms for each chooser's dest_taz and purpose with chooser index
    size_term_matrix = DataFrameMatrix(alternatives)

    # don't need size terms in alternatives, just TAZ index
    alternatives = alternatives.drop(alternatives.columns, axis=1)
    alternatives.index.name = model_settings['ALT_DEST']

    # - process intermediate trips in ascending trip_num order
    intermediate = trips.trip_num < trips.trip_count
    if intermediate.any():

        first_trip_num = trips[intermediate].trip_num.min()
        last_trip_num = trips[intermediate].trip_num.max()

        # iterate over trips in ascending trip_num order
        for trip_num in range(first_trip_num, last_trip_num + 1):

            nth_trips = trips[intermediate & (trips.trip_num == trip_num)]
            nth_trace_label = tracing.extend_trace_label(trace_label, 'trip_num_%s' % trip_num)

            # - annotate nth_trips
            if preprocessor_settings:
                expressions.assign_columns(
                    df=nth_trips,
                    model_settings=preprocessor_settings,
                    locals_dict=config.get_model_constants(model_settings),
                    trace_label=nth_trace_label)

            logger.info("Running %s with %d trips", nth_trace_label, nth_trips.shape[0])

            # - choose destination for nth_trips, segmented by primary_purpose
            choices_list = []
            for primary_purpose, trips_segment in nth_trips.groupby('primary_purpose'):
                choices = choose_trip_destination(
                    primary_purpose,
                    trips_segment,
                    alternatives,
                    tours_merged,
                    model_settings,
                    size_term_matrix, skims,
                    chunk_size, trace_hh_id,
                    trace_label=tracing.extend_trace_label(nth_trace_label, primary_purpose))
                choices_list.append(choices)

            destinations = pd.concat(choices_list)

            # trips for which no viable destination was found are flagged failed;
            # the failed trip's origin is carried forward to the following trip
            failed_trip_ids = nth_trips.index.difference(destinations.index)
            if failed_trip_ids.any():
                logger.warning("%s sidelining %s trips without viable destination alternatives" %
                               (nth_trace_label, failed_trip_ids.shape[0]))
                next_trip_ids = nth_trips.next_trip_id.reindex(failed_trip_ids)
                trips.loc[failed_trip_ids, 'failed'] = True
                trips.loc[failed_trip_ids, 'destination'] = -1
                trips.loc[next_trip_ids, 'origin'] = trips.loc[failed_trip_ids].origin.values

            # - assign choices to these trips destinations and to next trips origin
            assign_in_place(trips, destinations.to_frame('destination'))
            destinations.index = nth_trips.next_trip_id.reindex(destinations.index)
            assign_in_place(trips, destinations.to_frame('origin'))

    del trips['next_trip_id']

    return trips
def extra_hh_member_choices(persons, cdap_fixed_relative_proportions, locals_d,
                            trace_hh_id, trace_label):
    """
    Choose an activity pattern (M, N, or H) for each 'extra' household member
    not handled by the CDAP model proper.

    Following the CTRAMP HouseholdCoordinatedDailyActivityPatternModel, "a
    separate, simple cross-sectional distribution is looked up for the
    remaining household members".

    The cdap_fixed_relative_proportions spec is evaluated like an activitysim
    logit utility spec, EXCEPT that the computed values are relative
    proportions, not utilities (i.e. values are not exponentiated before being
    normalized to probabilities summing to 1.0).

    Parameters
    ----------
    persons : pandas.DataFrame
        Table of persons data indexed on _persons_index_.
        We expect, at least, columns [_hh_id_, _ptype_]
    cdap_fixed_relative_proportions : pandas.DataFrame
        spec to compute/specify the relative proportions of each activity
        (M, N, H) used to choose activities for additional household members
        not handled by CDAP.
    locals_d : Dict
        dictionary of local variables that eval_variables adds to the
        environment for an evaluation of an expression that begins with @

    Returns
    -------
    choices : pandas.Series
        list of alternatives chosen for all extra members, indexed by
        _persons_index_
    """

    trace_label = tracing.extend_trace_label(trace_label, 'extra_hh_member_choices')

    # extra household members are those ranked beyond the CDAP model's limit
    extra_persons = persons[persons['cdap_rank'] > MAX_HHSIZE]

    # guard clause: nothing to choose for households with no extra members
    if extra_persons.empty:
        return pd.Series()

    # evaluate the expression file against the extra-member choosers
    expr_values = simulate.eval_variables(
        cdap_fixed_relative_proportions.index, extra_persons, locals_d)

    # spec yields relative proportions by ptype, not utilities
    rel_props = expr_values.dot(cdap_fixed_relative_proportions)

    # normalize each row so the proportions become probabilities summing to 1.0
    # (no exponentiation - these are proportions, not logit utilities)
    probs = rel_props.div(rel_props.sum(axis=1), axis=0)

    # monte-carlo draw: chosen_col_idx is a series (indexed on _persons_index_)
    # holding the 0-based column index of the chosen alternative in probs
    chosen_col_idx, rands = logit.make_choices(probs, trace_label=trace_label)

    # map column index back to the activity name (M, N, or H)
    choices = pd.Series(probs.columns[chosen_col_idx].values, index=probs.index)

    if trace_hh_id:
        tracing.trace_df(rel_props, '%s.extra_hh_member_choices_proportions' % trace_label,
                         column_labels=['expression', 'person'])
        tracing.trace_df(probs, '%s.extra_hh_member_choices_probs' % trace_label,
                         column_labels=['expression', 'person'])
        tracing.trace_df(choices, '%s.extra_hh_member_choices_choices' % trace_label,
                         column_labels=['expression', 'person'])
        tracing.trace_df(rands, '%s.extra_hh_member_choices_rands' % trace_label,
                         columns=[None, 'rand'])

    return choices
def trip_mode_choice(
        trips,
        tours_merged,
        skim_dict, skim_stack,
        chunk_size, trace_hh_id):
    """
    Trip mode choice - compute trip_mode (same values as for tour_mode) for each trip.

    Modes for each primary tour purpose are calculated separately because they have
    different coefficient values (stored in trip_mode_choice_coeffs.csv coefficient file.)

    Adds trip_mode column to trip table and replaces the pipeline "trips" table.
    """

    trace_label = 'trip_mode_choice'
    model_settings = config.read_model_settings('trip_mode_choice.yaml')
    model_spec = \
        simulate.read_model_spec(file_name=model_settings['SPEC'])
    # per-purpose coefficient sets, keyed by primary_purpose
    omnibus_coefficients = \
        assign.read_constant_spec(config.config_file_path(model_settings['COEFFS']))

    trips_df = trips.to_frame()
    logger.info("Running %s with %d trips", trace_label, trips_df.shape[0])

    tours_merged = tours_merged.to_frame()
    # only keep the tour columns the spec actually references (memory)
    tours_merged = tours_merged[model_settings['TOURS_MERGED_CHOOSER_COLUMNS']]

    nest_spec = config.get_logit_model_settings(model_settings)

    tracing.print_summary('primary_purpose',
                          trips_df.primary_purpose, value_counts=True)

    # - trips_merged - merge trips and tours_merged so trip choosers carry tour attributes
    trips_merged = pd.merge(
        trips_df,
        tours_merged,
        left_on='tour_id',
        right_index=True,
        how="left")
    # left merge on tour_id must not reorder or drop trips
    assert trips_merged.index.equals(trips.index)

    # setup skim keys - trip_period drives the time-of-day skim dimension
    assert ('trip_period' not in trips_merged)
    trips_merged['trip_period'] = skim_time_period_label(trips_merged.depart)

    orig_col = 'origin'
    dest_col = 'destination'

    odt_skim_stack_wrapper = skim_stack.wrap(left_key=orig_col, right_key=dest_col,
                                             skim_key='trip_period')
    od_skim_wrapper = skim_dict.wrap('origin', 'destination')

    skims = {
        "odt_skims": odt_skim_stack_wrapper,
        "od_skims": od_skim_wrapper,
    }

    constants = config.get_model_constants(model_settings)
    constants.update({
        'ORIGIN': orig_col,
        'DESTINATION': dest_col
    })

    # run the mode choice model once per primary_purpose segment, since each
    # segment has its own coefficient set
    choices_list = []
    for primary_purpose, trips_segment in trips_merged.groupby('primary_purpose'):

        segment_trace_label = tracing.extend_trace_label(trace_label, primary_purpose)

        logger.info("trip_mode_choice tour_type '%s' (%s trips)" %
                    (primary_purpose, len(trips_segment.index), ))

        # name index so tracing knows how to slice
        assert trips_segment.index.name == 'trip_id'

        # resolve this segment's coefficients into the eval namespace
        locals_dict = assign.evaluate_constants(omnibus_coefficients[primary_purpose],
                                                constants=constants)
        locals_dict.update(constants)

        annotate_preprocessors(
            trips_segment, locals_dict, skims,
            model_settings, segment_trace_label)

        locals_dict.update(skims)

        choices = simulate.simple_simulate(
            choosers=trips_segment,
            spec=model_spec,
            nest_spec=nest_spec,
            skims=skims,
            locals_d=locals_dict,
            chunk_size=chunk_size,
            trace_label=segment_trace_label,
            trace_choice_name='trip_mode_choice')

        # map positional choice indexes back to mode names (spec columns)
        alts = model_spec.columns
        choices = choices.map(dict(list(zip(list(range(len(alts))), alts))))

        # tracing.print_summary('trip_mode_choice %s choices' % primary_purpose,
        #                       choices, value_counts=True)

        if trace_hh_id:
            # trace the coefficients
            tracing.trace_df(pd.Series(locals_dict),
                             label=tracing.extend_trace_label(segment_trace_label, 'constants'),
                             transpose=False,
                             slicer='NONE')

            # so we can trace with annotations
            trips_segment['trip_mode'] = choices

            tracing.trace_df(trips_segment,
                             label=tracing.extend_trace_label(segment_trace_label, 'trip_mode'),
                             slicer='tour_id',
                             index_label='tour_id',
                             warn_if_empty=True)

        choices_list.append(choices)

        # FIXME - force garbage collection
        force_garbage_collect()

    choices = pd.concat(choices_list)

    # re-read trips (without merged tour columns) and attach the chosen modes
    trips_df = trips.to_frame()
    trips_df['trip_mode'] = choices

    tracing.print_summary('tour_modes',
                          trips_merged.tour_mode, value_counts=True)

    tracing.print_summary('trip_mode_choice choices',
                          choices, value_counts=True)

    # every trip must have received a mode
    assert not trips_df.trip_mode.isnull().any()

    pipeline.replace_table("trips", trips_df)

    if trace_hh_id:
        tracing.trace_df(trips_df,
                         label=tracing.extend_trace_label(trace_label, 'trip_mode'),
                         slicer='trip_id',
                         index_label='trip_id',
                         warn_if_empty=True)
def atwork_subtour_mode_choice(
        tours,
        persons_merged,
        skim_dict, skim_stack,
        chunk_size,
        trace_hh_id):
    """
    At-work subtour mode choice simulate.

    Selects a tour_mode for each 'atwork' tour and writes it back to the
    pipeline "tours" table (tour_mode column on atwork rows only).
    """

    trace_label = 'atwork_subtour_mode_choice'

    # subtours reuse the main tour mode choice settings/spec
    model_settings = config.read_model_settings('tour_mode_choice.yaml')

    spec = tour_mode_choice_spec(model_settings)

    tours = tours.to_frame()
    subtours = tours[tours.tour_category == 'atwork']

    # - if no atwork subtours, record empty result and bail
    if subtours.shape[0] == 0:
        tracing.no_results(trace_label)
        return

    # merge person attributes referenced by spec expressions
    subtours_merged = \
        pd.merge(subtours, persons_merged.to_frame(),
                 left_on='person_id', right_index=True, how='left')

    nest_spec = config.get_logit_model_settings(model_settings)
    constants = config.get_model_constants(model_settings)

    logger.info("Running %s with %d subtours" %
                (trace_label, subtours_merged.shape[0]))

    tracing.print_summary('%s tour_type' % trace_label,
                          subtours_merged.tour_type, value_counts=True)

    # setup skim keys - subtours depart from the workplace, not home
    orig_col_name = 'workplace_taz'
    dest_col_name = 'destination'
    out_time_col_name = 'start'
    in_time_col_name = 'end'
    odt_skim_stack_wrapper = skim_stack.wrap(left_key=orig_col_name, right_key=dest_col_name,
                                             skim_key='out_period')
    # return leg: destination back to workplace
    dot_skim_stack_wrapper = skim_stack.wrap(left_key=dest_col_name, right_key=orig_col_name,
                                             skim_key='in_period')
    od_skim_stack_wrapper = skim_dict.wrap(orig_col_name, dest_col_name)

    skims = {
        "odt_skims": odt_skim_stack_wrapper,
        "dot_skims": dot_skim_stack_wrapper,
        "od_skims": od_skim_stack_wrapper,
        'orig_col_name': orig_col_name,
        'dest_col_name': dest_col_name,
        'out_time_col_name': out_time_col_name,
        'in_time_col_name': in_time_col_name
    }

    choices = run_tour_mode_choice_simulate(
        subtours_merged,
        spec, tour_purpose='atwork', model_settings=model_settings,
        skims=skims,
        constants=constants,
        nest_spec=nest_spec,
        chunk_size=chunk_size,
        trace_label=trace_label,
        trace_choice_name='tour_mode_choice')

    tracing.print_summary('%s choices' % trace_label, choices, value_counts=True)

    # write chosen mode only onto the atwork rows of the full tours table
    assign_in_place(tours, choices.to_frame('tour_mode'))

    pipeline.replace_table("tours", tours)

    if trace_hh_id:
        tracing.trace_df(tours[tours.tour_category == 'atwork'],
                         label=tracing.extend_trace_label(trace_label, 'tour_mode'),
                         slicer='tour_id',
                         index_label='tour_id')

    force_garbage_collect()
def vectorize_subtour_scheduling(parent_tours, subtours, persons_merged, alts, spec,
                                 model_settings,
                                 chunk_size=0, trace_label=None):
    """
    Like vectorize_tour_scheduling but specifically for atwork subtours

    subtours have a few peculiarities necessitating separate treatment:

    Timetable has to be initialized to set all timeperiods outside
    parent tour footprint as unavailable. So atwork subtour timewindows are
    limited to the footprint of the parent work tour. And parent_tour_id'
    column of tours is used instead of parent_id as timetable row_id.

    Parameters
    ----------
    parent_tours : DataFrame
        parent tours of the subtours (because we need to know the tdd of the
        parent tour to assign_subtour_mask of timetable indexed by parent_tour id
    subtours : DataFrame
        atwork subtours to schedule
    persons_merged : DataFrame
        DataFrame of persons containing attributes referenced
        by expressions in spec
    alts : DataFrame
        DataFrame of alternatives which represent time slots.  Will be passed to
        interaction_simulate in batches for each nth tour.
    spec : DataFrame
        The spec which will be passed to interaction_simulate.
        (all subtours share same spec regardless of subtour type)
    model_settings : dict
    chunk_size
    trace_label

    Returns
    -------
    tdd : DataFrame
        start, end, duration columns from alts, plus 'tdd' (the chosen alt
        index), indexed by the subtours index.
    """
    if not trace_label:
        trace_label = 'vectorize_non_mandatory_tour_scheduling'

    assert len(subtours.index) > 0
    assert 'tour_num' in subtours.columns
    assert 'tour_type' in subtours.columns

    # a subtour's timetable window belongs to its parent work tour
    timetable_window_id_col = 'parent_tour_id'
    tour_owner_id_col = 'parent_tour_id'
    segment = None

    # timetable with a window for each parent tour
    parent_tour_windows = tt.create_timetable_windows(parent_tours, alts)
    timetable = tt.TimeTable(parent_tour_windows, alts)

    # mask the periods outside parent tour footprint so subtours can only be
    # scheduled inside the parent work tour's time window
    timetable.assign_subtour_mask(parent_tours.tour_id, parent_tours.tdd)

    # print timetable.windows
    """
    [[7 7 7 0 0 0 0 0 0 0 0 7 7 7 7 7 7 7 7 7 7]
     [7 0 0 0 0 0 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7]
     [7 7 7 7 7 0 0 0 0 0 0 0 0 0 0 7 7 7 7 7 7]
     [7 7 0 0 0 0 0 0 0 7 7 7 7 7 7 7 7 7 7 7 7]]
    """

    choice_list = []

    # keep a series of the the most recent tours for each person
    # initialize with first trip from alts
    previous_tour_by_parent_tour_id = \
        pd.Series(alts.index[0], index=subtours['parent_tour_id'].unique())

    # tours must be scheduled in increasing trip_num order
    # second trip of type must be in group immediately following first
    # this ought to have been ensured when tours are created (tour_frequency.process_tours)

    for tour_num, nth_tours in subtours.groupby('tour_num', sort=True):

        tour_trace_label = tracing.extend_trace_label(trace_label, 'tour_%s' % (tour_num,))

        # no more than one tour per timetable window per call to schedule_tours
        assert not nth_tours.parent_tour_id.duplicated().any()

        choices = \
            schedule_tours(nth_tours,
                           persons_merged, alts, spec, segment,
                           model_settings, timetable, timetable_window_id_col,
                           previous_tour_by_parent_tour_id, tour_owner_id_col,
                           chunk_size, tour_trace_label)

        choice_list.append(choices)

    choices = pd.concat(choice_list)

    # add the start, end, and duration from tdd_alts
    # use positional indexing into alts.values rather than .loc (alts has rangeindex)
    # assert (alts.index == list(range(alts.shape[0]))).all()
    tdd = pd.DataFrame(data=alts.values[choices.values],
                       columns=alts.columns,
                       index=choices.index)

    # tdd = alts.loc[choices]
    # tdd.index = choices.index

    # include the index of the choice in the tdd alts table
    tdd['tdd'] = choices

    # print "\nfinal timetable.windows\n", timetable.windows
    """
    [[7 7 7 0 0 0 0 2 7 7 4 7 7 7 7 7 7 7 7 7 7]
     [7 0 2 7 4 0 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7]
     [7 7 7 7 7 2 4 0 0 0 0 0 0 0 0 7 7 7 7 7 7]
     [7 7 0 2 7 7 4 0 0 7 7 7 7 7 7 7 7 7 7 7 7]]
    """
    # we dont need to call replace_table() for this nonce timetable
    # because subtours are occuring during persons timetable scheduled time

    return tdd
def iterate_location_choice(
        model_settings,
        persons_merged, persons, households,
        skim_dict, skim_stack,
        chunk_size, trace_hh_id, locutor,
        trace_label):
    """
    iterate run_location_choice updating shadow pricing until convergence
    criteria satisfied or max_iterations reached.

    (If use_shadow_pricing not enabled, then just iterate once)

    Parameters
    ----------
    model_settings : dict
    persons_merged : injected table
    persons : injected table
    households : injected table
    skim_dict : skim.SkimDict
    skim_stack : skim.SkimStack
    chunk_size : int
    trace_hh_id : int
    locutor : bool
        whether this process is the privileged logger of shadow_pricing
        when multiprocessing
    trace_label : str

    Returns
    -------
    persons_df : pandas.DataFrame
        adds choice column model_settings['DEST_CHOICE_COLUMN_NAME']
        and annotations to persons table
    """

    # column containing segment id
    chooser_segment_column = model_settings['CHOOSER_SEGMENT_COLUMN_NAME']

    # boolean to filter out persons not needing location modeling (e.g. is_worker, is_student)
    chooser_filter_column = model_settings['CHOOSER_FILTER_COLUMN_NAME']

    persons_merged_df = persons_merged.to_frame()

    persons_merged_df = persons_merged_df[persons_merged[chooser_filter_column]]

    spc = shadow_pricing.load_shadow_price_calculator(model_settings)
    max_iterations = spc.max_iterations

    # NOTE(review): uses root `logging` here (and below) rather than the
    # module-level `logger` used elsewhere in this file - confirm intended
    logging.debug("%s max_iterations: %s" % (trace_label, max_iterations))

    choices = None
    for iteration in range(1, max_iterations + 1):

        # first iteration runs with unadjusted prices; updates start at iteration 2
        if spc.use_shadow_pricing and iteration > 1:
            spc.update_shadow_prices()

        choices = run_location_choice(
            persons_merged_df,
            skim_dict, skim_stack,
            spc,
            model_settings,
            chunk_size, trace_hh_id,
            trace_label=tracing.extend_trace_label(trace_label, 'i%s' % iteration))

        choices_df = choices.to_frame('dest_choice')
        choices_df['segment_id'] = \
            persons_merged_df[chooser_segment_column].reindex(choices_df.index)

        # feed this iteration's choices back into the shadow price calculator
        spc.set_choices(choices_df)

        if locutor:
            spc.write_trace_files(iteration)

        # stop early once modeled size matches target size closely enough
        if spc.use_shadow_pricing and spc.check_fit(iteration):
            logging.info("%s converged after iteration %s" % (trace_label, iteration,))
            break

    # - shadow price table
    if locutor:
        if spc.use_shadow_pricing and 'SHADOW_PRICE_TABLE' in model_settings:
            inject.add_table(model_settings['SHADOW_PRICE_TABLE'], spc.shadow_prices)
        if 'MODELED_SIZE_TABLE' in model_settings:
            inject.add_table(model_settings['MODELED_SIZE_TABLE'], spc.modeled_size)

    dest_choice_column_name = model_settings['DEST_CHOICE_COLUMN_NAME']
    tracing.print_summary(dest_choice_column_name, choices, value_counts=True)

    persons_df = persons.to_frame()

    # We only chose school locations for the subset of persons who go to school
    # so we backfill the empty choices with -1 to code as no school location
    NO_DEST_TAZ = -1
    persons_df[dest_choice_column_name] = \
        choices.reindex(persons_df.index).fillna(NO_DEST_TAZ).astype(int)

    # - annotate persons table
    if 'annotate_persons' in model_settings:
        expressions.assign_columns(
            df=persons_df,
            model_settings=model_settings.get('annotate_persons'),
            trace_label=tracing.extend_trace_label(trace_label, 'annotate_persons'))

        pipeline.replace_table("persons", persons_df)

        if trace_hh_id:
            tracing.trace_df(persons_df,
                             label=trace_label,
                             warn_if_empty=True)

    # - annotate households table
    if 'annotate_households' in model_settings:

        households_df = households.to_frame()
        expressions.assign_columns(
            df=households_df,
            model_settings=model_settings.get('annotate_households'),
            trace_label=tracing.extend_trace_label(trace_label, 'annotate_households'))
        pipeline.replace_table("households", households_df)

        if trace_hh_id:
            tracing.trace_df(households_df,
                             label=trace_label,
                             warn_if_empty=True)

    return persons_df
def vectorize_tour_scheduling(tours, alts, spec, constants=None,
                              chunk_size=0, trace_label=None):
    """
    The purpose of this method is fairly straightforward - it takes tours
    and schedules them into time slots.  Alternatives should be specified so
    as to define those time slots (usually with start and end times).

    The difficulty of doing this in Python is that subsequent tours are
    dependent on certain characteristics of previous tours for the same
    person.  This is a problem with Python's vectorization requirement,
    so this method does all the 1st tours, then all the 2nd tours, and so forth.

    This method also adds variables that can be used in the spec which have
    to do with the previous tours per person.  Every column in the
    alternatives table is appended with the suffix "_previous" and made
    available.  So if your alternatives table has columns for start and end,
    then start_previous and end_previous will be set to the start and end of
    the most recent tour for a person.  The first time through,
    start_previous and end_previous are undefined, so make sure to protect
    with a tour_num >= 2 in the variable computation.

    Parameters
    ----------
    tours : DataFrame
        DataFrame of tours containing tour attributes, as well as a person_id
        column to define the nth tour for each person.
    alts : DataFrame
        DataFrame of alternatives which represent time slots.  Will be passed to
        interaction_simulate in batches for each nth tour.
    spec : DataFrame
        The spec which will be passed to interaction_simulate.
    constants : dict, optional
        constants made available as locals to interaction_simulate
        (defaults to an empty dict; a fresh dict is built per call to avoid
        the shared-mutable-default pitfall of the previous `constants={}`)
    chunk_size : int
    trace_label : str

    Returns
    -------
    choices : Series
        A Series of choices where the index is the index of the tours
        DataFrame and the values are the index of the alts DataFrame.
    """

    # BUG FIX: previous signature used a mutable default argument (constants={})
    # which is shared across calls; use a None sentinel instead
    if constants is None:
        constants = {}

    max_num_trips = tours.groupby('person_id').size().max()

    # if no tours at all, groupby().size().max() is NaN - return empty choices
    if np.isnan(max_num_trips):
        s = pd.Series()
        s.index.name = 'tour_id'
        return s

    # because this is Python, we have to vectorize everything by doing the
    # "nth" trip for each person in a for loop (in other words, because each
    # trip is dependent on the time windows left by the previous decision) -
    # hopefully this will work out ok!

    choices = []

    # keep a series of the the most recent tours for each person,
    # initialized with the first alternative for every person
    previous_tour_by_personid = pd.Series(
        pd.Series(alts.index).iloc[0], index=tours.person_id.unique())

    for i in range(max_num_trips):

        # this reset_index / set_index stuff keeps the index as the tours
        # index rather that switching to person_id as the index which is
        # what happens when you groupby person_id
        index_name = tours.index.name or 'index'
        nth_tours = tours.reset_index().\
            groupby('person_id').nth(i).reset_index().set_index(index_name)
        nth_tours.index.name = 'tour_id'

        if trace_label:
            logger.info("%s running %d #%d tour choices" %
                        (trace_label, len(nth_tours), i + 1))

        # tour num can be set by the user, but if it isn't we set it here
        if "tour_num" not in nth_tours:
            nth_tours["tour_num"] = i + 1

        # append "_previous"-suffixed columns describing each person's
        # most recently scheduled tour
        nth_tours = nth_tours.join(
            get_previous_tour_by_tourid(nth_tours.person_id,
                                        previous_tour_by_personid,
                                        alts))

        tour_trace_label = tracing.extend_trace_label(trace_label, 'tour_%s' % i)

        nth_choices = asim.interaction_simulate(
            nth_tours,
            alts.copy(),
            spec,
            locals_d=constants,
            chunk_size=chunk_size,
            trace_label=tour_trace_label)

        choices.append(nth_choices)

        # remember these choices so the (n+1)th pass sees them as "_previous"
        previous_tour_by_personid.loc[nth_tours.person_id] = nth_choices.values

    choices = pd.concat(choices)

    # return the concatenated choices
    return choices
def schedule_tours(
        tours, persons_merged, alts, spec, logsum_tour_purpose,
        model_settings,
        timetable, timetable_window_id_col,
        previous_tour, tour_owner_id_col,
        chunk_size, tour_trace_label):
    """
    chunking wrapper for _schedule_tours

    While interaction_sample_simulate provides chunking support, the merged
    tours, persons dataframe and the tdd_interaction_dataset are very big, so
    we want to create them inside the chunking loop to minimize memory
    footprint.  So we implement the chunking loop here, and pass a chunk_size
    of 0 to interaction_sample_simulate to disable its chunking support.

    Parameters
    ----------
    tours : DataFrame
        tours to schedule (at most one per timetable window)
    persons_merged, alts, spec, logsum_tour_purpose, model_settings :
        passed through to _schedule_tours
    timetable : TimeTable
    timetable_window_id_col : str or None
        column identifying each tour's timetable window; when None, the tours
        index itself is the window id
    previous_tour : Series
    tour_owner_id_col : str
    chunk_size : int
    tour_trace_label : str

    Returns
    -------
    choices : Series
        chosen alt index per tour, indexed like tours
    """

    if not tours.index.is_monotonic_increasing:
        # BUG FIX: the original left the %s placeholder uninterpolated
        logger.info("schedule_tours %s tours not monotonic_increasing - sorting df" %
                    tour_trace_label)
        tours = tours.sort_index()

    logger.info("%s schedule_tours running %d tour choices" % (tour_trace_label, len(tours)))

    # no more than one tour per timetable_window per call
    if timetable_window_id_col is None:
        assert not tours.index.duplicated().any()
    else:
        assert not tours[timetable_window_id_col].duplicated().any()

    rows_per_chunk, effective_chunk_size = \
        calc_rows_per_chunk(chunk_size, tours, persons_merged, alts,
                            trace_label=tour_trace_label)

    result_list = []
    for i, num_chunks, chooser_chunk \
            in chunk.chunked_choosers(tours, rows_per_chunk):

        logger.info("Running chunk %s of %s size %d" % (i, num_chunks, len(chooser_chunk)))

        chunk_trace_label = tracing.extend_trace_label(tour_trace_label, 'chunk_%s' % i) \
            if num_chunks > 1 else tour_trace_label

        chunk.log_open(chunk_trace_label, chunk_size, effective_chunk_size)

        choices = _schedule_tours(chooser_chunk, persons_merged,
                                  alts, spec, logsum_tour_purpose,
                                  model_settings,
                                  timetable, timetable_window_id_col,
                                  previous_tour, tour_owner_id_col,
                                  tour_trace_label=chunk_trace_label)

        chunk.log_close(chunk_trace_label)

        result_list.append(choices)

        mem.force_garbage_collect()

    # FIXME: this will require 2X RAM
    # if necessary, could append to hdf5 store on disk:
    # http://pandas.pydata.org/pandas-docs/stable/io.html#id2
    if len(result_list) > 1:
        choices = pd.concat(result_list)

    # BUG FIX: the original asserted len(choices.index == len(tours.index)),
    # i.e. the length of a boolean array (always truthy); compare lengths instead
    assert len(choices.index) == len(tours.index)

    return choices
def school_location_simulate(persons_merged, school_location_sample,
                             school_location_spec, school_location_settings,
                             skim_dict,
                             destination_size_terms,
                             chunk_size, trace_hh_id):
    """
    School location model on school_location_sample annotated with mode_choice logsum
    to select a school_taz from sample alternatives

    Adds the school_taz column to the pipeline "persons" table (backfilled
    with -1 for persons who do not attend school).
    """
    choosers = persons_merged.to_frame()
    school_location_sample = school_location_sample.to_frame()
    destination_size_terms = destination_size_terms.to_frame()

    trace_label = 'school_location_simulate'
    alt_col_name = school_location_settings["ALT_COL_NAME"]

    constants = config.get_model_constants(school_location_settings)

    # create wrapper with keys for this lookup - in this case there is a TAZ in the choosers
    # and a TAZ in the alternatives which get merged during interaction
    # the skims will be available under the name "skims" for any @ expressions
    skims = skim_dict.wrap("TAZ", alt_col_name)

    locals_d = {
        'skims': skims,
    }
    if constants is not None:
        locals_d.update(constants)

    # FIXME - MEMORY HACK - only include columns actually used in spec
    chooser_columns = school_location_settings['SIMULATE_CHOOSER_COLUMNS']
    choosers = choosers[chooser_columns]
    tracing.dump_df(DUMP, choosers, 'school_location_simulate', 'choosers')

    # run simulate separately for each school-type segment, since each
    # segment has its own spec column and its own pre-sampled alternatives
    choices_list = []
    for school_type in ['university', 'highschool', 'gradeschool']:

        locals_d['segment'] = school_type

        choosers_segment = choosers[choosers["is_" + school_type]]
        alts_segment = school_location_sample[school_location_sample['school_type'] == school_type]

        # alternatives are pre-sampled and annotated with logsums and pick_count
        # but we have to merge additional alt columns into alt sample list
        alts_segment = \
            pd.merge(alts_segment, destination_size_terms,
                     left_on=alt_col_name, right_index=True, how="left")

        tracing.dump_df(DUMP, alts_segment, trace_label, '%s_alternatives' % school_type)

        choices = interaction_sample_simulate(
            choosers_segment,
            alts_segment,
            spec=school_location_spec[[school_type]],
            choice_column=alt_col_name,
            skims=skims,
            locals_d=locals_d,
            chunk_size=chunk_size,
            trace_label=tracing.extend_trace_label(trace_label, school_type),
            trace_choice_name='school_location')

        choices_list.append(choices)

    choices = pd.concat(choices_list)

    # We only chose school locations for the subset of persons who go to school
    # so we backfill the empty choices with -1 to code as no school location
    choices = choices.reindex(persons_merged.index).fillna(-1).astype(int)

    tracing.dump_df(DUMP, choices, trace_label, 'choices')

    tracing.print_summary('school_taz', choices, describe=True)

    inject.add_column("persons", "school_taz", choices)

    pipeline.add_dependent_columns("persons", "persons_school")

    if trace_hh_id:
        trace_columns = ['school_taz'] + inject.get_table('persons_school').columns
        tracing.trace_df(inject.get_table('persons_merged').to_frame(),
                         label="school_location",
                         columns=trace_columns,
                         warn_if_empty=True)
def vectorize_tour_scheduling(tours, persons_merged, alts, spec, segment_col,
                              model_settings,
                              chunk_size=0, trace_label=None):
    """
    The purpose of this method is fairly straightforward - it takes tours
    and schedules them into time slots.  Alternatives should be specified so
    as to define those time slots (usually with start and end times).

    schedule_tours adds variables that can be used in the spec which have
    to do with the previous tours per person.  Every column in the
    alternatives table is appended with the suffix "_previous" and made
    available.  So if your alternatives table has columns for start and end,
    then start_previous and end_previous will be set to the start and end of
    the most recent tour for a person.  The first time through,
    start_previous and end_previous are undefined, so make sure to protect
    with a tour_num >= 2 in the variable computation.

    Parameters
    ----------
    tours : DataFrame
        DataFrame of tours containing tour attributes, as well as a person_id
        column to define the nth tour for each person.
    persons_merged : DataFrame
        DataFrame of persons containing attributes referenced
        by expressions in spec
    alts : DataFrame
        DataFrame of alternatives which represent time slots.  Will be passed to
        interaction_simulate in batches for each nth tour.
    spec : DataFrame
        The spec which will be passed to interaction_simulate.
        (or dict of specs keyed on tour_type if tour_types is not None)
    segment_col : str or None
        column of tours to segment by when spec is a dict (None otherwise)
    model_settings : dict

    Returns
    -------
    tdd : DataFrame
        A DataFrame of tdd alternative values (start, end, duration, plus the
        chosen alt index in column 'tdd') indexed like tours.
    timetable : TimeTable
        persons timetable updated with tours (caller should replace_table for it to persist)
    """

    trace_label = tracing.extend_trace_label(trace_label, 'vectorize_tour_scheduling')

    assert len(tours.index) > 0
    assert 'tour_num' in tours.columns
    assert 'tour_type' in tours.columns

    # tours must be scheduled in increasing trip_num order
    # second trip of type must be in group immediately following first
    # this ought to have been ensured when tours are created (tour_frequency.process_tours)

    timetable = inject.get_injectable("timetable")

    choice_list = []

    # keep a series of the the most recent tours for each person
    # initialize with first trip from alts
    previous_tour_by_personid = pd.Series(alts.index[0], index=tours.person_id.unique())

    timetable_window_id_col = 'person_id'
    tour_owner_id_col = 'person_id'

    # no more than one tour per person per call to schedule_tours
    # segregate scheduling by tour_type if multiple specs passed in dict keyed by tour_type

    for tour_num, nth_tours in tours.groupby('tour_num', sort=True):

        tour_trace_label = tracing.extend_trace_label(trace_label, 'tour_%s' % (tour_num,))

        if isinstance(spec, dict):

            assert segment_col is not None

            for spec_segment in spec:

                segment_trace_label = tracing.extend_trace_label(tour_trace_label, spec_segment)

                in_segment = nth_tours[segment_col] == spec_segment

                if not in_segment.any():
                    # BUG FIX: original logged the literal "skipping empty segment %s"
                    # without interpolating the segment name
                    logger.info("skipping empty segment %s" % spec_segment)
                    continue

                # assume segmentation of spec and logsum coefficients are aligned
                logsum_tour_purpose = spec_segment

                choices = \
                    schedule_tours(nth_tours[in_segment],
                                   persons_merged, alts,
                                   spec[spec_segment], logsum_tour_purpose,
                                   model_settings,
                                   timetable, timetable_window_id_col,
                                   previous_tour_by_personid, tour_owner_id_col,
                                   chunk_size,
                                   segment_trace_label)

                choice_list.append(choices)

        else:

            # unsegmented spec indicates no logsums
            # caller could use single-element spec dict if logsum support desired,
            # but this case nor required for mtctm1
            assert segment_col is None
            logsum_segment = None

            choices = \
                schedule_tours(nth_tours,
                               persons_merged, alts,
                               spec, logsum_segment,
                               model_settings,
                               timetable, timetable_window_id_col,
                               previous_tour_by_personid, tour_owner_id_col,
                               chunk_size,
                               tour_trace_label)

            choice_list.append(choices)

    choices = pd.concat(choice_list)

    # add the start, end, and duration from tdd_alts
    # use np instead of (slower) loc[] since alts has rangeindex
    tdd = pd.DataFrame(data=alts.values[choices.values],
                       columns=alts.columns,
                       index=choices.index)

    # tdd = alts.loc[choices]
    # tdd.index = choices.index

    # include the index of the choice in the tdd alts table
    tdd['tdd'] = choices

    return tdd, timetable
def mandatory_tour_frequency(persons_merged,
                             chunk_size,
                             trace_hh_id):
    """
    This model predicts the frequency of making mandatory trips (see the
    alternatives above) - these trips include work and school in some combination.

    Extends the pipeline "tours" table with the generated mandatory tours and
    annotates the "persons" table with a mandatory_tour_frequency column.
    """

    trace_label = 'mandatory_tour_frequency'

    model_settings = config.read_model_settings('mandatory_tour_frequency.yaml')
    model_spec = simulate.read_model_spec(file_name='mandatory_tour_frequency.csv')
    alternatives = simulate.read_model_alts(
        config.config_file_path('mandatory_tour_frequency_alternatives.csv'), set_index='alt')

    choosers = persons_merged.to_frame()
    # filter based on results of CDAP - only persons with a Mandatory pattern choose
    choosers = choosers[choosers.cdap_activity == 'M']
    logger.info("Running mandatory_tour_frequency with %d persons", len(choosers))

    # - if no mandatory tours, register empty results and bail
    if choosers.shape[0] == 0:
        add_null_results(trace_label, model_settings)
        return

    # - preprocessor (optional annotation of choosers before simulation)
    preprocessor_settings = model_settings.get('preprocessor', None)
    if preprocessor_settings:

        locals_dict = {}

        expressions.assign_columns(
            df=choosers,
            model_settings=preprocessor_settings,
            locals_dict=locals_dict,
            trace_label=trace_label)

    nest_spec = config.get_logit_model_settings(model_settings)
    constants = config.get_model_constants(model_settings)

    choices = simulate.simple_simulate(
        choosers=choosers,
        spec=model_spec,
        nest_spec=nest_spec,
        locals_d=constants,
        chunk_size=chunk_size,
        trace_label=trace_label,
        trace_choice_name='mandatory_tour_frequency')

    # convert indexes to alternative names
    choices = pd.Series(
        model_spec.columns[choices.values],
        index=choices.index).reindex(persons_merged.local.index)

    # - create mandatory tours
    """
    This reprocesses the choice of index of the mandatory tour frequency
    alternatives into an actual dataframe of tours.  Ending format is
    the same as got non_mandatory_tours except trip types are "work" and "school"
    """
    choosers['mandatory_tour_frequency'] = choices
    mandatory_tours = process_mandatory_tours(
        persons=choosers,
        mandatory_tour_frequency_alts=alternatives
    )

    # NOTE: `tours` return value is unused; extend_table is called for its
    # pipeline side effect
    tours = pipeline.extend_table("tours", mandatory_tours)
    tracing.register_traceable_table('tours', mandatory_tours)
    pipeline.get_rn_generator().add_channel('tours', mandatory_tours)

    # - annotate persons
    persons = inject.get_table('persons').to_frame()

    # need to reindex as we only handled persons with cdap_activity == 'M'
    persons['mandatory_tour_frequency'] = choices.reindex(persons.index).fillna('').astype(str)

    expressions.assign_columns(
        df=persons,
        model_settings=model_settings.get('annotate_persons'),
        trace_label=tracing.extend_trace_label(trace_label, 'annotate_persons'))

    pipeline.replace_table("persons", persons)

    tracing.print_summary('mandatory_tour_frequency', persons.mandatory_tour_frequency,
                          value_counts=True)

    if trace_hh_id:
        tracing.trace_df(mandatory_tours,
                         label="mandatory_tour_frequency.mandatory_tours",
                         warn_if_empty=True)

        tracing.trace_df(persons,
                         label="mandatory_tour_frequency.persons",
                         warn_if_empty=True)
def vectorize_joint_tour_scheduling(
        joint_tours, joint_tour_participants,
        persons_merged,
        alts,
        spec, model_settings,
        chunk_size=0, trace_label=None):
    """
    Like vectorize_tour_scheduling but specifically for joint tours

    joint tours have a few peculiarities necessitating separate treatment:

    Timetable has to be initialized to set all timeperiods...

    Parameters
    ----------
    joint_tours : DataFrame
        DataFrame of tours containing tour attributes, as well as a household_id
        column and a tour_num column giving the nth joint tour for each household.
    joint_tour_participants : DataFrame
        participant rows with person_id and tour_id linking persons to joint tours
    persons_merged : DataFrame
        DataFrame of persons containing attributes referenced
        by expressions in spec
    alts : DataFrame
        DataFrame of alternatives which represent time slots.  Will be passed to
        interaction_simulate in batches for each nth tour.
    spec : DataFrame
        The spec which will be passed to interaction_simulate.
        (or dict of specs keyed on tour_type if tour_types is not None)
    model_settings : dict
    chunk_size : int
        passed through to schedule_tours
    trace_label : str or None

    Returns
    -------
    tdd : DataFrame
        start/end/duration columns from alts for each tour choice, indexed by
        tour id, plus a 'tdd' column holding the chosen alt index
    persons_timetable : TimeTable
        timetable updated with joint tours (caller should replace_table for it to persist)
    """
    trace_label = tracing.extend_trace_label(trace_label, 'vectorize_joint_tour_scheduling')

    assert len(joint_tours.index) > 0
    assert 'tour_num' in joint_tours.columns
    assert 'tour_type' in joint_tours.columns

    # joint tours are owned by the household, not an individual person
    timetable_window_id_col = None
    tour_owner_id_col = 'household_id'
    segment = None

    persons_timetable = inject.get_injectable("timetable")

    choice_list = []

    # keep a series of the the most recent tours for each person
    # initialize with first trip from alts
    previous_tour_by_householdid = pd.Series(alts.index[0], index=joint_tours.household_id.unique())

    # tours must be scheduled in increasing trip_num order
    # second trip of type must be in group immediately following first
    # this ought to have been ensured when tours are created (tour_frequency.process_tours)

    # print "participant windows before scheduling\n", \
    #     persons_timetable.slice_windows_by_row_id(joint_tour_participants.person_id)

    for tour_num, nth_tours in joint_tours.groupby('tour_num', sort=True):

        tour_trace_label = tracing.extend_trace_label(trace_label, 'tour_%s' % (tour_num,))

        # no more than one tour per household per call to schedule_tours
        assert not nth_tours.household_id.duplicated().any()

        nth_participants = \
            joint_tour_participants[joint_tour_participants.tour_id.isin(nth_tours.index)]

        # build a tour-level timetable from the intersection of participants' windows
        timetable = build_joint_tour_timetables(
            nth_tours, nth_participants,
            persons_timetable, alts)

        choices = \
            schedule_tours(nth_tours,
                           persons_merged, alts,
                           spec, segment,
                           model_settings,
                           timetable, timetable_window_id_col,
                           previous_tour_by_householdid, tour_owner_id_col,
                           chunk_size, tour_trace_label)

        # - update timetables of all joint tour participants
        persons_timetable.assign(
            nth_participants.person_id,
            reindex(choices, nth_participants.tour_id))

        choice_list.append(choices)

    choices = pd.concat(choice_list)

    # add the start, end, and duration from tdd_alts
    # positional lookup assumes alts has a 0-based RangeIndex:
    # assert (alts.index == list(range(alts.shape[0]))).all()
    tdd = pd.DataFrame(data=alts.values[choices.values],
                       columns=alts.columns,
                       index=choices.index)

    # tdd = alts.loc[choices]
    # tdd.index = choices.index
    tdd.index = choices.index

    # include the index of the choice in the tdd alts table
    tdd['tdd'] = choices

    # print "participant windows after scheduling\n", \
    #     persons_timetable.slice_windows_by_row_id(joint_tour_participants.person_id)

    return tdd, persons_timetable
def extend_tour_counts(persons, tour_counts, alternatives, trace_hh_id, trace_label):
    """
    extend tour counts based on a probability table

    counts can only be extended if original count is between 1 and 4
    and tours can only be extended if their count is at the max possible
    (e.g. 2 for escort, 1 otherwise) so escort might be increased to 3 or 4
    and other tour types might be increased to 2 or 3

    Parameters
    ----------
    persons: pandas dataframe
        (need this for join columns)
    tour_counts: pandas dataframe
        one row per person, once column per tour_type
    alternatives
        alternatives from nmtv interaction_simulate
        only need this to know max possible frequency for a tour type
    trace_hh_id
    trace_label

    Returns
    -------
    extended tour_counts

    tour_counts looks like this:
               escort  shopping  othmaint  othdiscr  eatout  social
    parent_id
    2588676         2         0         0         1       1       0
    2588677         0         1         0         1       0       0
    """
    assert tour_counts.index.name == persons.index.name

    prob_cols = ['0_tours', '1_tours', '2_tours']
    join_cols = ['ptype', 'has_mandatory_tour', 'has_joint_tour']
    tour_type_col = 'nonmandatory_tour_type'

    probs_spec = extension_probs()
    persons = persons[join_cols]

    # only extend if there are 1 - 4 non_mandatory tours to start with
    eligible = tour_counts.sum(axis=1).between(1, 4)
    if not eligible.any():
        return tour_counts

    have_trace_targets = trace_hh_id and tracing.has_trace_targets(eligible)

    for offset, tour_type in enumerate(alternatives.columns):

        # probs_spec nonmandatory_tour_type column is 1-based
        spec_tour_type = offset + 1
        tt_trace_label = tracing.extend_trace_label(trace_label, tour_type)

        # - only extend tour if frequency is max possible frequency for this tour type
        at_max = eligible & (tour_counts[tour_type] == alternatives[tour_type].max())
        maxed_idx = tour_counts.index[at_max]
        if len(maxed_idx) == 0:
            continue

        # - get extension probs for tour_type
        choosers = pd.merge(
            persons.loc[maxed_idx],
            probs_spec[probs_spec[tour_type_col] == spec_tour_type],
            on=join_cols,
            how='left').set_index(maxed_idx)
        assert choosers.index.name == tour_counts.index.name

        # - random choice of extension magnitude based on relative probs
        choices, rands = logit.make_choices(
            choosers[prob_cols],
            trace_label=tt_trace_label,
            trace_choosers=choosers)

        # - extend tour_count (0-based prob alternative choice equals magnitude of extension)
        if choices.any():
            tour_counts.loc[choices.index, tour_type] += choices

        if have_trace_targets:
            tracing.trace_df(choices,
                             tracing.extend_trace_label(tt_trace_label, 'choices'),
                             columns=[None, 'choice'])
            tracing.trace_df(rands,
                             tracing.extend_trace_label(tt_trace_label, 'rands'),
                             columns=[None, 'rand'])

    return tour_counts
def school_location_logsums(
        persons_merged,
        land_use, skim_dict, skim_stack,
        school_location_sample,
        configs_dir, chunk_size, trace_hh_id):
    """
    add logsum column to existing school_location_sample table

    logsum is calculated by running the mode_choice model for each sample
    (person, dest_taz) pair in school_location_sample, and computing the
    logsum of all the utilities

    +-------+--------------+----------------+------------+----------------+
    | PERID | dest_TAZ     | rand           | pick_count | logsum (added) |
    +=======+==============+================+============+================+
    | 23750 | 14           | 0.565502716034 | 4          | 1.85659498857  |
    +-------+--------------+----------------+------------+----------------+
    + 23750 | 16           | 0.711135838871 | 6          | 1.92315598631  |
    +-------+--------------+----------------+------------+----------------+
    + ...   |              |                |            |                |
    +-------+--------------+----------------+------------+----------------+
    | 23751 | 12           | 0.408038878552 | 1          | 2.40612135416  |
    +-------+--------------+----------------+------------+----------------+
    | 23751 | 14           | 0.972732479292 | 2          | 1.44009018355  |
    +-------+--------------+----------------+------------+----------------+
    """
    trace_label = 'school_location_logsums'

    school_location_settings = config.read_model_settings(configs_dir, 'school_location.yaml')

    alt_col_name = school_location_settings["ALT_COL_NAME"]
    chooser_col_name = 'TAZ'

    # FIXME - just using settings from tour_mode_choice
    logsum_settings = config.read_model_settings(configs_dir, 'tour_mode_choice.yaml')

    persons_merged = persons_merged.to_frame()
    school_location_sample = school_location_sample.to_frame()

    logger.info("Running school_location_sample with %s rows" % len(school_location_sample))

    # FIXME - MEMORY HACK - only include columns actually used in spec
    chooser_columns = school_location_settings['LOGSUM_CHOOSER_COLUMNS']
    persons_merged = persons_merged[chooser_columns]

    tracing.dump_df(DUMP, persons_merged, trace_label, 'persons_merged')

    # compute logsums segment-by-segment, one school type at a time
    logsums_list = []
    for school_type in ['university', 'highschool', 'gradeschool']:

        logsums_spec = mode_choice_logsums_spec(configs_dir, school_type)

        # sample rows for this school type, merged with chooser attributes
        # (sample stays on the LEFT so the result keeps the sample's index)
        choosers = school_location_sample[school_location_sample['school_type'] == school_type]

        choosers = pd.merge(
            choosers,
            persons_merged,
            left_index=True,
            right_index=True,
            how="left")

        choosers['in_period'] = skim_time_period_label(school_location_settings['IN_PERIOD'])
        choosers['out_period'] = skim_time_period_label(school_location_settings['OUT_PERIOD'])

        # FIXME - should do this in expression file?
        choosers['dest_topology'] = reindex(land_use.TOPOLOGY, choosers[alt_col_name])
        choosers['dest_density_index'] = reindex(land_use.density_index, choosers[alt_col_name])

        tracing.dump_df(DUMP, choosers,
                        tracing.extend_trace_label(trace_label, school_type),
                        'choosers')

        logsums = compute_logsums(
            choosers, logsums_spec, logsum_settings,
            skim_dict, skim_stack, chooser_col_name, alt_col_name, chunk_size,
            trace_hh_id,
            tracing.extend_trace_label(trace_label, school_type))

        logsums_list.append(logsums)

    logsums = pd.concat(logsums_list)

    # add_column series should have an index matching the table to which it is being added
    # logsums does, since school_location_sample was on left side of merge creating choosers
    inject.add_column("school_location_sample", "mode_choice_logsum", logsums)
def trip_purpose_and_destination(
        trips,
        tours_merged,
        chunk_size,
        trace_hh_id):
    """
    Iteratively assign purpose and destination to trips, retrying failed trips.

    Runs run_trip_purpose_and_destination up to MAX_ITERATIONS times; after each
    pass, trips that failed destination choice (and their leg-mates) are retried.
    Surviving results are written back to the pipeline 'trips' table.

    Parameters
    ----------
    trips : orca/inject table wrapper
    tours_merged : orca/inject table wrapper
    chunk_size : int
    trace_hh_id : int or None

    Side effects
    ------------
    replaces the pipeline 'trips' table (after cleanup_failed_trips)
    """
    trace_label = "trip_purpose_and_destination"
    model_settings = config.read_model_settings('trip_purpose_and_destination.yaml')

    MAX_ITERATIONS = model_settings.get('MAX_ITERATIONS', 5)

    trips_df = trips.to_frame()
    tours_merged_df = tours_merged.to_frame()

    if trips_df.empty:
        logger.info("%s - no trips. Nothing to do." % trace_label)
        return

    # FIXME could allow MAX_ITERATIONS=0 to allow for cleanup-only run
    # in which case, we would need to drop bad trips, WITHOUT failing bad_trip leg_mates
    assert (MAX_ITERATIONS > 0)

    # if trip_destination has been run before, keep only failed trips (and leg_mates) to retry
    if 'destination' in trips_df:

        if trips_df.failed.any():
            logger.info('trip_destination has already been run. Rerunning failed trips')
            flag_failed_trip_leg_mates(trips_df, 'failed')
            trips_df = trips_df[trips_df.failed]
            tours_merged_df = tours_merged_df[tours_merged_df.index.isin(trips_df.tour_id)]
            logger.info('Rerunning %s failed trips and leg-mates' % trips_df.shape[0])
        else:
            # no failed trips from prior run of trip_destination
            logger.info("%s - no failed trips from prior model run." % trace_label)
            del trips_df['failed']
            pipeline.replace_table("trips", trips_df)
            return

    results = []
    i = 0
    RESULT_COLUMNS = ['purpose', 'destination', 'origin', 'failed']
    while True:

        i += 1

        # drop stale result columns so this iteration assigns them afresh
        for c in RESULT_COLUMNS:
            if c in trips_df:
                del trips_df[c]

        # BUG FIX: run_trip_purpose_and_destination returns a
        # (trips_df, save_sample_df) tuple; the original assigned the tuple to
        # trips_df, so trips_df.failed below would raise. Unpack both values.
        trips_df, save_sample_df = run_trip_purpose_and_destination(
            trips_df,
            tours_merged_df,
            chunk_size,
            trace_hh_id,
            trace_label=tracing.extend_trace_label(trace_label, "i%s" % i))

        num_failed_trips = trips_df.failed.sum()

        # if there were no failed trips, we are done
        if num_failed_trips == 0:
            results.append(trips_df[RESULT_COLUMNS])
            break

        logger.warning("%s %s failed trips in iteration %s" % (trace_label, num_failed_trips, i))
        file_name = "%s_i%s_failed_trips" % (trace_label, i)
        logger.info("writing failed trips to %s" % file_name)
        tracing.write_csv(trips_df[trips_df.failed], file_name=file_name, transpose=False)

        # if max iterations reached, add remaining trips to results and give up
        # note that we do this BEFORE failing leg_mates so resulting trip legs are complete
        if i >= MAX_ITERATIONS:
            logger.warning("%s too many iterations %s" % (trace_label, i))
            results.append(trips_df[RESULT_COLUMNS])
            break

        # otherwise, if any trips failed, then their leg-mates trips must also fail
        flag_failed_trip_leg_mates(trips_df, 'failed')

        # add the good trips to results
        results.append(trips_df[~trips_df.failed][RESULT_COLUMNS])

        # and keep the failed ones to retry
        trips_df = trips_df[trips_df.failed]
        tours_merged_df = tours_merged_df[tours_merged_df.index.isin(trips_df.tour_id)]

    # - assign result columns to trips
    results = pd.concat(results)

    logger.info("%s %s failed trips after %s iterations" % (trace_label, results.failed.sum(), i))

    trips_df = trips.to_frame()
    assign_in_place(trips_df, results)

    trips_df = cleanup_failed_trips(trips_df)

    pipeline.replace_table("trips", trips_df)

    if trace_hh_id:
        tracing.trace_df(trips_df,
                         label=trace_label,
                         slicer='trip_id',
                         index_label='trip_id',
                         warn_if_empty=True)
def school_location_sample(
        persons_merged,
        school_location_sample_spec,
        school_location_settings,
        skim_dict,
        destination_size_terms,
        chunk_size,
        trace_hh_id):
    """
    build a table of persons * all zones to select a sample of alternative school locations.

    PERID,  dest_TAZ, rand,            pick_count
    23750,  14,       0.565502716034,  4
    23750,  16,       0.711135838871,  6
    ...
    23751,  12,       0.408038878552,  1
    23751,  14,       0.972732479292,  2
    """
    trace_label = 'school_location_sample'

    persons_df = persons_merged.to_frame()
    size_terms_df = destination_size_terms.to_frame()

    constants = config.get_model_constants(school_location_settings)
    sample_size = school_location_settings["SAMPLE_SIZE"]
    alt_col_name = school_location_settings["ALT_COL_NAME"]

    logger.info("Running school_location_simulate with %d persons" % len(persons_df))

    # create wrapper with keys for this lookup - in this case there is a TAZ in the choosers
    # and a TAZ in the alternatives which get merged during interaction
    # the skims will be available under the name "skims" for any @ expressions
    skims = skim_dict.wrap("TAZ", "TAZ_r")

    locals_d = {'skims': skims}
    if constants is not None:
        locals_d.update(constants)

    # FIXME - MEMORY HACK - only include columns actually used in spec
    persons_df = persons_df[school_location_settings['SIMULATE_CHOOSER_COLUMNS']]

    segment_choices = []
    for school_type in ['university', 'highschool', 'gradeschool']:

        locals_d['segment'] = school_type

        seg_choosers = persons_df[persons_df["is_" + school_type]]

        # FIXME - no point in considering impossible alternatives
        seg_alts = size_terms_df[size_terms_df[school_type] > 0]

        logger.info("school_type %s: %s persons %s alternatives" %
                    (school_type, len(seg_choosers), len(seg_alts)))

        if len(seg_choosers.index) == 0:
            continue

        choices = interaction_sample(
            seg_choosers,
            seg_alts,
            sample_size=sample_size,
            alt_col_name=alt_col_name,
            spec=school_location_sample_spec[[school_type]],
            skims=skims,
            locals_d=locals_d,
            chunk_size=chunk_size,
            trace_label=tracing.extend_trace_label(trace_label, school_type))

        # tag each sampled row with the segment it came from
        choices['school_type'] = school_type
        segment_choices.append(choices)

    inject.add_table('school_location_sample', pd.concat(segment_choices))
def run_trip_purpose(
        trips_df,
        chunk_size,
        trace_hh_id,
        trace_label):
    """
    trip purpose - main functionality separated from model step so it can be called iteratively

    For each intermediate stop on a tour (i.e. trip other than the last trip outbound or inbound)
    Each trip is assigned a purpose based on an observed frequency distribution
    The distribution is segmented by tour purpose, tour direction, person type,
    and, optionally, trip depart time .

    Parameters
    ----------
    trips_df : pandas.DataFrame
        trips with trip_num, trip_count, outbound, and primary_purpose columns
    chunk_size : int
    trace_hh_id : int or None
    trace_label : str

    Returns
    -------
    purpose: pandas.Series of purpose (str) indexed by trip_id
    """

    model_settings = config.read_model_settings('trip_purpose.yaml')
    probs_spec = trip_purpose_probs()

    result_list = []

    # - last trip of outbound tour gets primary_purpose
    last_trip = (trips_df.trip_num == trips_df.trip_count)
    purpose = trips_df.primary_purpose[last_trip & trips_df.outbound]
    result_list.append(purpose)
    logger.info("assign purpose to %s last outbound trips", purpose.shape[0])

    # - last trip of inbound tour gets home (or work for atwork subtours)
    # NOTE(review): 'Work'/'Home' are capitalized here while primary_purpose
    # values like 'atwork' are lowercase - confirm downstream consumers expect
    # this capitalization
    purpose = trips_df.primary_purpose[last_trip & ~trips_df.outbound]
    purpose = pd.Series(np.where(purpose == 'atwork', 'Work', 'Home'), index=purpose.index)
    result_list.append(purpose)
    logger.info("assign purpose to %s last inbound trips", purpose.shape[0])

    # - intermediate stops (non-last trips) purpose assigned by probability table
    trips_df = trips_df[~last_trip]
    logger.info("assign purpose to %s intermediate trips", trips_df.shape[0])

    # optional preprocessor may add columns referenced by the probability spec
    preprocessor_settings = model_settings.get('preprocessor', None)
    if preprocessor_settings:
        locals_dict = config.get_model_constants(model_settings)
        expressions.assign_columns(
            df=trips_df,
            model_settings=preprocessor_settings,
            locals_dict=locals_dict,
            trace_label=trace_label)

    rows_per_chunk, effective_chunk_size = \
        trip_purpose_rpc(chunk_size, trips_df, probs_spec, trace_label=trace_label)

    for i, num_chunks, trips_chunk in chunk.chunked_choosers(trips_df, rows_per_chunk):

        logger.info("Running chunk %s of %s size %d", i, num_chunks, len(trips_chunk))

        chunk_trace_label = tracing.extend_trace_label(trace_label, 'chunk_%s' % i) \
            if num_chunks > 1 else trace_label

        # log_open/log_close must bracket the choice call for chunk accounting
        chunk.log_open(chunk_trace_label, chunk_size, effective_chunk_size)

        choices = choose_intermediate_trip_purpose(
            trips_chunk,
            probs_spec,
            trace_hh_id,
            trace_label=chunk_trace_label)

        chunk.log_close(chunk_trace_label)

        result_list.append(choices)

    # result_list always holds at least the two last-trip series appended above,
    # so this concat always runs; the guard only skips a needless single-element concat
    if len(result_list) > 1:
        choices = pd.concat(result_list)

    return choices