def process_joint_tours(joint_tour_frequency, joint_tour_frequency_alts, point_persons):
    """
    Expand household joint_tour_frequency choices into one row per joint tour.

    Parameters
    ----------
    joint_tour_frequency : pandas.Series
        joint_tour_frequency choice for each household (output of the joint
        tour frequency model), indexed by household_id
    joint_tour_frequency_alts : pandas.DataFrame
        uniquely indexed by joint_tour_frequency value, with tour counts for
        the tours to be generated for that choice
    point_persons : pandas.DataFrame
        indexed by household_id, with (at least) person_id and home_zone_id columns

    Returns
    -------
    tours : pandas.DataFrame
        one row per generated joint tour, indexed by a stable tour identifier,
        with household_id, tour_type, tour_type_num and tour_num columns
        (tour_num is 1 or 2 depending on whether it is the first or second
        joint tour made by the household)
    """
    assert not joint_tour_frequency.isnull().any()

    tours = process_tours(
        joint_tour_frequency.dropna(),
        joint_tour_frequency_alts,
        tour_category='joint',
        parent_col='household_id')

    assert not tours.index.duplicated().any()
    assert point_persons.index.name == 'household_id'

    # temporarily assign a point person and origin to each tour so that
    # set_tour_index can build a stable (predictable) tour_id
    tours['person_id'] = reindex(point_persons.person_id, tours.household_id)
    tours['origin'] = reindex(point_persons.home_zone_id, tours.household_id)

    set_tour_index(tours, is_joint=True)

    return tours
def create_simple_trips(tours, households, persons, trace_hh_id):
    """
    Create a simple trips table: one outbound and one inbound trip per tour.

    Parameters
    ----------
    tours : table wrapper for tours (indexed by tour_id)
    households : table wrapper with a TAZ column, indexed by household_id
    persons : table wrapper with a household_id column, indexed by person_id
    trace_hh_id : int or None
        if truthy, trace the resulting trips table

    Registers the resulting 'trips' table with orca/pipeline; returns nothing.
    """

    logger.info("Running simple trips table creation with %d tours" % len(tours.index))

    tours_df = tours.to_frame()

    # we now have a tour_id column
    tours_df.reset_index(inplace=True)

    tours_df['household_id'] = reindex(persons.household_id, tours_df.person_id)
    tours_df['TAZ'] = reindex(households.TAZ, tours_df.household_id)

    # create inbound and outbound records
    trips = pd.concat([tours_df, tours_df], ignore_index=True)

    # first half are outbound, second half are inbound
    # integer division: np.repeat requires an int count (a float raises on modern numpy)
    half_trip_count = len(trips.index) // 2
    trips['INBOUND'] = np.repeat([False, True], half_trip_count)

    # trip_num for outbound trips = 1, inbound trips = 2
    trips['trip_num'] = np.repeat([1, 2], half_trip_count)

    # set key fields from tour fields: 'TAZ', 'destination', 'start', 'end'
    # use .loc indexing rather than chained assignment, which operates on a
    # temporary copy and may silently fail to update the frame
    inbound = trips.INBOUND

    trips['OTAZ'] = trips.TAZ
    trips.loc[inbound, 'OTAZ'] = trips.destination[inbound]

    trips['DTAZ'] = trips.destination
    trips.loc[inbound, 'DTAZ'] = trips.TAZ[inbound]

    trips['start_trip'] = trips.start
    trips.loc[inbound, 'start_trip'] = trips.end[inbound]

    trips['end_trip'] = trips.end
    trips.loc[inbound, 'end_trip'] = trips.start[inbound]

    # create a stable (predictable) index based on tour_id and trip_num
    possible_trips_count = 2
    trips['trip_id'] = (trips.tour_id * possible_trips_count) + (trips.trip_num - 1)
    trips.set_index('trip_id', inplace=True, verify_integrity=True)

    trip_columns = ['tour_id', 'INBOUND', 'trip_num', 'OTAZ', 'DTAZ', 'start_trip', 'end_trip']
    trips = trips[trip_columns]

    orca.add_table("trips", trips)
    tracing.register_traceable_table('trips', trips)
    pipeline.get_rn_generator().add_channel(trips, 'trips')

    if trace_hh_id:
        tracing.trace_df(trips,
                         label="trips",
                         warn_if_empty=True)
def process_joint_tours(joint_tour_frequency, joint_tour_frequency_alts, point_persons):
    """
    Expand household joint_tour_frequency choices into one row per joint tour.

    Parameters
    ----------
    joint_tour_frequency : pandas.Series
        joint_tour_frequency choice for each household (output of the joint
        tour frequency model), indexed by household_id
    joint_tour_frequency_alts : pandas.DataFrame
        uniquely indexed by joint_tour_frequency value, with tour counts for
        the tours to be generated for that choice
    point_persons : pandas.DataFrame
        indexed by household_id, with (at least) person_id and home_taz columns

    Returns
    -------
    tours : pandas.DataFrame
        one row per generated joint tour, indexed by a stable tour identifier,
        with household_id, tour_type, tour_type_num and tour_num columns
        (tour_num is 1 or 2 depending on whether it is the first or second
        joint tour made by the household)
    """
    assert not joint_tour_frequency.isnull().any()

    tours = process_tours(
        joint_tour_frequency.dropna(),
        joint_tour_frequency_alts,
        tour_category='joint',
        parent_col='household_id')

    # temporarily assign a point person and origin to each tour so that
    # set_tour_index can build a stable (predictable) tour_id
    tours['person_id'] = reindex(point_persons.person_id, tours.household_id)
    tours['origin'] = reindex(point_persons.home_taz, tours.household_id)

    set_tour_index(tours, is_joint=True)

    return tours
def process_non_mandatory_tours(persons, tour_counts):
    """
    Expand non_mandatory_tour_frequency choices into one row per generated tour.

    Parameters
    ----------
    persons : pandas.DataFrame
        persons table with household_id and home_taz columns, indexed by person_id
    tour_counts : pandas.DataFrame
        counts of tours to generate per person, with columns named for tour
        purposes (e.g. escort, shopping, othmaint, othdiscr, eatout, social)

    Returns
    -------
    tours : pandas.DataFrame
        one row per generated tour, indexed by a stable tour identifier, with
        person_id, household_id, origin and tour_type columns (tour_type comes
        from the column names of the tour_counts frame)
    """
    tours = create_tours(tour_counts, tour_category='non_mandatory')

    # carry the tour-maker's household and home zone onto each tour
    tours['household_id'] = reindex(persons.household_id, tours.person_id)
    tours['origin'] = reindex(persons.home_taz, tours.person_id)

    # assign stable (predictable) tour_id
    set_tour_index(tours)

    return tours
def process_non_mandatory_tours(persons, tour_counts):
    """
    Expand non_mandatory_tour_frequency choices into one row per generated tour.

    Parameters
    ----------
    persons : pandas.DataFrame
        persons table with household_id and home_zone_id columns, indexed by person_id
    tour_counts : pandas.DataFrame
        counts of tours to generate per person, with columns named for tour
        purposes (e.g. escort, shopping, othmaint, othdiscr, eatout, social)

    Returns
    -------
    tours : pandas.DataFrame
        one row per generated tour, indexed by a stable tour identifier, with
        person_id, household_id, origin and tour_type columns (tour_type comes
        from the column names of the tour_counts frame)
    """
    tours = create_tours(tour_counts, tour_category='non_mandatory')

    # carry the tour-maker's household and home zone onto each tour
    tours['household_id'] = reindex(persons.household_id, tours.person_id)
    tours['origin'] = reindex(persons.home_zone_id, tours.person_id)

    # assign stable (predictable) tour_id
    set_tour_index(tours)

    return tours
def destination_in_cbd(non_mandatory_tours, land_use, settings):
    """
    Return a boolean Series (indexed like non_mandatory_tours) indicating
    whether each tour's destination zone is in the CBD, i.e. whether the
    destination zone's area_type is below settings['cbd_threshold'].
    """
    # guard: the destination column does not exist until the destination
    # choice model has filled it in
    if "destination" not in non_mandatory_tours.columns:
        return pd.Series(False, index=non_mandatory_tours.index)

    dest_area_type = reindex(land_use.area_type, non_mandatory_tours.destination)
    return dest_area_type < settings['cbd_threshold']
def patch_trip_ids(tours, trips):
    """
    Replace survey trip ids with the asim standard trip_id and patch the
    survey tour_id foreign key with the asim standard tour_id.

    Returns a new trips DataFrame with the asim trip_id as a column.
    """
    # tour_id is a column, not index
    assert ASIM_TOUR_ID in tours

    # map each survey tour id to its asim tour id and patch the foreign key
    survey_to_asim_tour_id = pd.Series(tours[ASIM_TOUR_ID].values,
                                       index=tours[SURVEY_TOUR_ID].values)
    trips[ASIM_TOUR_ID] = reindex(survey_to_asim_tour_id, trips[SURVEY_TOUR_ID])

    # if ordering is ambiguous because trips depart in the same time slot,
    # fall back on SURVEY_TRIP_ID, hoping that it increases with time
    if 'trip_num' not in trips:
        ordered_trips = trips.sort_values(by=['tour_id', 'outbound', 'depart', SURVEY_TRIP_ID])
        trips['trip_num'] = ordered_trips.groupby(['tour_id', 'outbound']).cumcount() + 1

    cid.set_trip_index(trips)
    assert trips.index.name == ASIM_TRIP_ID

    return trips.reset_index().rename(columns={'trip_id': ASIM_TRIP_ID})
def create_logsum_trips(tours, segment_column_name, model_settings, trace_label):
    """
    Construct table of trips from half-tours (1 inbound, 1 outbound) for each tour-mode.

    Parameters
    ----------
    tours : pandas.DataFrame
    segment_column_name : str
        column in tours table used for segmenting model spec
    model_settings : dict
    trace_label : str

    Returns
    -------
    pandas.DataFrame
        Table of trips: 2 per tour per tour mode, with O/D and purpose
        inherited from the tour
    """
    stop_frequency_alts = inject.get_injectable('stop_frequency_alts')

    # no intermediate stops
    stop_freq = '0out_0in'
    tours['stop_frequency'] = stop_freq
    tours['primary_purpose'] = tours['tour_purpose']

    trips = trip.initialize_from_tours(tours, stop_frequency_alts)
    trips['stop_frequency'] = stop_freq

    # outbound trips depart at tour start, inbound trips at tour end
    outbound = trips['outbound']
    trips['depart'] = reindex(tours.start, trips.tour_id)
    trips.loc[~outbound, 'depart'] = reindex(tours.end, trips.loc[~outbound, 'tour_id'])

    # actual segment doesn't matter. just need to grab one
    # to get a set of coefficients from the spec
    segment_name = tours.iloc[0][segment_column_name]
    tour_mode_alts = get_alts_from_segmented_nested_logit(
        model_settings, segment_name, trace_label)

    # replicate the trips table once per tour mode
    mode_frames = [trips.assign(tour_mode=tour_mode) for tour_mode in tour_mode_alts]
    logsum_trips = pd.concat(mode_frames, ignore_index=True)
    assert len(logsum_trips) == len(trips) * len(tour_mode_alts)
    logsum_trips.index.name = 'trip_id'

    return logsum_trips
def set_tour_hour(trips, tours): """ add columns 'tour_hour', 'earliest', 'latest' to trips Parameters ---------- trips: pd.DataFrame tours: pd.DataFrame Returns ------- modifies trips in place """ # all trips must depart between tour start and end trips['earliest'] = reindex(tours.start, trips.tour_id) trips['latest'] = reindex(tours.end, trips.tour_id) # tour_hour is start for outbound trips, and end for inbound trips trips['tour_hour'] = np.where(trips.outbound, trips['earliest'], trips['latest']).astype(np.int8) # subtours indexed by parent_tour_id subtours = tours.loc[ tours.primary_purpose == 'atwork', ['tour_num', 'tour_count', 'parent_tour_id', 'start', 'end']] subtours.parent_tour_id = subtours.parent_tour_id.astype(np.int64) subtours = subtours.set_index('parent_tour_id') subtours = subtours.astype( np.int16) # remaining columns are all small ints # bool series trip_has_subtours = trips.tour_id.isin(subtours.index) outbound = trip_has_subtours & trips.outbound trips.loc[outbound, 'latest'] = \ reindex(subtours[subtours.tour_num == 1]['start'], trips[outbound].tour_id) inbound = trip_has_subtours & ~trips.outbound trips.loc[inbound, 'earliest'] = \ reindex(subtours[subtours.tour_num == subtours.tour_count]['end'], trips[inbound].tour_id)
def trip_departure_choice(
        trips,
        trips_merged,
        skim_dict,
        chunk_size,
        trace_hh_id):
    """
    Choose a departure time for each trip (stage two of trip scheduling).

    Reads the trip_departure_choice.yaml model settings and spec, optionally
    runs a preprocessor with O-D and D-O skim wrappers available, applies the
    stage-two model, and replaces the pipeline 'trips' table with the trips
    table plus the choice columns.

    Parameters
    ----------
    trips : table wrapper for the trips table
    trips_merged : table wrapper for trips merged with tour/person columns
    skim_dict : skim dictionary used to build od/do skim wrappers
    chunk_size : int
    trace_hh_id : int or None  (unused here beyond the standard signature)
    """
    trace_label = 'trip_departure_choice'
    model_settings = config.read_model_settings('trip_departure_choice.yaml')

    spec = simulate.read_model_spec(file_name=model_settings['SPECIFICATION'])

    trips_merged_df = trips_merged.to_frame()

    # add tour-based chunk_id so we can chunk all trips in tour together
    tour_ids = trips_merged[TOUR_ID].unique()
    trips_merged_df['chunk_id'] = reindex(
        pd.Series(list(range(len(tour_ids))), tour_ids), trips_merged_df.tour_id)

    # stash the next power of ten above the max tour_id on the function object
    # (used elsewhere to build composite ids without collisions)
    max_tour_id = trips_merged[TOUR_ID].max()
    trip_departure_choice.MAX_TOUR_ID = int(np.power(10, np.ceil(np.log10(max_tour_id))))

    locals_d = config.get_model_constants(model_settings).copy()

    preprocessor_settings = model_settings.get('PREPROCESSOR', None)
    tour_legs = get_tour_legs(trips_merged_df)
    pipeline.get_rn_generator().add_channel('tour_legs', tour_legs)

    if preprocessor_settings:
        # make origin->destination and destination->origin skims available
        # to preprocessor expressions as od_skims / do_skims
        od_skim = skim_dict.wrap('origin', 'destination')
        do_skim = skim_dict.wrap('destination', 'origin')

        skims = [od_skim, do_skim]

        simulate.set_skim_wrapper_targets(trips_merged_df, skims)

        locals_d.update({
            "od_skims": od_skim,
            "do_skims": do_skim,
        })

        expressions.assign_columns(
            df=trips_merged_df,
            model_settings=preprocessor_settings,
            locals_dict=locals_d,
            trace_label=trace_label)

    choices = apply_stage_two_model(spec, trips_merged_df, chunk_size, trace_label)

    # append choice columns to the trips table, preserving row count,
    # and require that every trip ended up with a depart time
    trips_df = trips.to_frame()
    trip_length = len(trips_df)
    trips_df = pd.concat([trips_df, choices], axis=1)
    assert len(trips_df) == trip_length
    assert trips_df[trips_df['depart'].isnull()].empty

    pipeline.replace_table("trips", trips_df)
def mandatory_tour_scheduling(tours,
                              persons_merged,
                              tdd_alts,
                              chunk_size,
                              trace_hh_id):
    """
    This model predicts the departure time and duration of each activity for mandatory tours

    Runs the scheduling model on tours with tour_category == 'mandatory',
    merges the chosen time window columns back into the tours table, and
    replaces the pipeline 'tours' table.
    """
    model_name = 'mandatory_tour_scheduling'
    trace_label = model_name

    persons_merged = persons_merged.to_frame()

    tours = tours.to_frame()
    mandatory_tours = tours[tours.tour_category == 'mandatory']

    # - if no mandatory_tours
    if mandatory_tours.shape[0] == 0:
        tracing.no_results(model_name)
        return

    # - add tour segmentation column
    # mtctm1 segments mandatory_scheduling spec by tour_type
    # (i.e. there are different specs for work and school tour_types)
    # mtctm1 logsum coefficients are segmented by primary_purpose
    # (i.e. there are different logsum coefficients for work, school, univ primary_purposes
    # for simplicity managing these different segmentation schemes,
    # we conflate them by segmenting tour processing to align with primary_purpose
    tour_segment_col = 'mandatory_tour_seg'
    assert tour_segment_col not in mandatory_tours

    # school tours made by university students are re-labelled 'univ'
    is_university_tour = \
        (mandatory_tours.tour_type == 'school') & \
        reindex(persons_merged.is_university, mandatory_tours.person_id)
    mandatory_tours[tour_segment_col] = \
        mandatory_tours.tour_type.where(~is_university_tour, 'univ')

    choices = run_tour_scheduling(model_name,
                                  mandatory_tours,
                                  persons_merged,
                                  tdd_alts,
                                  tour_segment_col,
                                  chunk_size,
                                  trace_hh_id)

    assign_in_place(tours, choices)
    pipeline.replace_table("tours", tours)

    # updated df for tracing
    mandatory_tours = tours[tours.tour_category == 'mandatory']

    tracing.dump_df(DUMP,
                    tt.tour_map(persons_merged, mandatory_tours, tdd_alts),
                    trace_label, 'tour_map')

    if trace_hh_id:
        tracing.trace_df(mandatory_tours,
                         label=trace_label,
                         slicer='person_id',
                         index_label='tour',
                         columns=None,
                         warn_if_empty=True)
def normal_for_df(self, df, mu=0, sigma=1, broadcast=False):
    """
    Return a single floating point normal random number in range (-inf, inf)
    for each row in df using the appropriate random channel for each row.

    Subsequent calls (in the same step) will return the next rand for each df row

    The resulting array will be the same length (and order) as df
    This method is designed to support alternative selection from a probability array

    The columns in df are ignored; the index name and values are used to determine
    which random number sequence to use.

    We assume that we can identify the channel to use based on the name of df.index
    This channel should have already been registered by a call to add_channel (q.v.)

    If "true pseudo random" behavior is desired (i.e. NOT repeatable)
    the set_base_seed method (q.v.) may be used to globally reseed all random streams.

    Parameters
    ----------
    df : pandas.DataFrame
        df with index name and values corresponding to a registered channel

    mu : float or array of floats with one value per df row
    sigma : float or array of floats with one value per df row
    broadcast : bool, default False
        if True, draw one standard normal per unique index value, broadcast
        it onto every df row sharing that index, then scale by sigma/mu

    Returns
    -------
    rands : 1-D ndarray the same length as df (or Series with same index as df)
        a single float in normal distribution for each row in df
    """

    channel = self.get_channel_for_df(df)

    if broadcast:
        # draw standard normals (mu=0, sigma=1) keyed on the unique index
        # values, broadcast onto the (possibly repeated) alternative rows,
        # then scale row-wise — mu/sigma may be per-row arrays
        alts_df = df
        df = df.index.unique().to_series()
        rands = channel.normal_for_df(df, self.step_name, mu=0, sigma=1, lognormal=False)
        rands = reindex(pd.Series(rands, index=df.index), alts_df.index)
        rands = rands * sigma + mu
    else:
        rands = channel.normal_for_df(df, self.step_name, mu, sigma, lognormal=False)

    return rands
def infer_atwork_subtour_frequency(configs_dir, tours):
    """
    Infer the atwork_subtour_frequency alternative nickname for each tour
    from the atwork subtours actually present in the (survey) tours table.

    Parameters
    ----------
    configs_dir : str
        directory containing atwork_subtour_frequency_alternatives.csv
    tours : pandas.DataFrame
        tours table with tour_type, tour_category, parent_tour_id and
        ASIM_TOUR_ID columns

    Returns
    -------
    pandas.Series
        atwork_subtour_frequency nickname aligned with tours[ASIM_TOUR_ID];
        '' for non-work tours, NaN where the observed subtour counts match
        no alternative
    """

    # first column is 'atwork_subtour_frequency' nickname, remaining columns are trip type counts
    alts = pd.read_csv(os.path.join(configs_dir, 'atwork_subtour_frequency_alternatives.csv'),
                       comment='#')

    # get trip_types, ignoring first column
    tour_types = list(alts.drop(columns=alts.columns[0]).columns)
    alts['alt_id'] = alts.index

    #             alt  eat  business  maint  alt_id
    # 0   no_subtours    0         0      0       0
    # 1           eat    1         0      0       1
    # 2     business1    0         1      0       2
    # 3         maint    0         0      1       3
    # 4     business2    0         2      0       4
    # 5  eat_business    1         1      0       5

    work_tours = tours[tours.tour_type == 'work']
    work_tours = work_tours[[ASIM_TOUR_ID]]

    subtours = tours[tours.tour_category == 'atwork']
    subtours = subtours[['tour_id', 'tour_type', 'parent_tour_id']]

    # actual tour counts (may exceed counts envisioned by alts)
    tour_counts = pd.DataFrame(index=work_tours[ASIM_TOUR_ID])
    for tour_type in tour_types:
        # count subtours of this type by parent_tour_id
        tour_type_count = subtours[subtours.tour_type == tour_type].groupby('parent_tour_id').size()
        # backfill with 0 count
        tour_counts[tour_type] = tour_type_count.reindex(tour_counts.index).fillna(0).astype(np.int8)

    # determine alt id corresponding to constrained_tour_counts
    # need to do index waltz because pd.merge doesn't preserve index in this case
    tour_counts = \
        pd.merge(tour_counts.reset_index(), alts,
                 left_on=tour_types, right_on=tour_types,
                 how='left').set_index(tour_counts.index.name)

    atwork_subtour_frequency = tour_counts.alt

    # did we end up with any tour frequencies not in alts?
    if atwork_subtour_frequency.isna().any():
        bad_tour_frequencies = atwork_subtour_frequency.isna()
        logger.warning("WARNING Bad atwork subtour frequencies for %s work tours"
                       % bad_tour_frequencies.sum())
        logger.warning("WARNING Bad atwork subtour frequencies: num_tours\n%s"
                       % tour_counts[bad_tour_frequencies])
        # distinct label: this one dumps the offending subtours, not the counts
        logger.warning("WARNING Bad atwork subtour frequencies: subtours\n%s"
                       % subtours[subtours.parent_tour_id.isin(tour_counts[bad_tour_frequencies].index)].
                       sort_values('parent_tour_id'))

    # align with the full tours table; non-work tours get ''
    atwork_subtour_frequency = reindex(atwork_subtour_frequency, tours[ASIM_TOUR_ID]).fillna('')

    return atwork_subtour_frequency
def all_transit_paths(self, access_df, egress_df, chooser_attributes, trace_label, trace):
    """
    Build the deduplicated table of candidate transit paths: one row per
    chooser (idx) for each boarding (btap) / alighting (atap) tap pair,
    annotated with the chooser attribute columns.
    """
    trace_label = tracing.extend_trace_label(trace_label, 'all_transit_paths')

    # one row per chooser for each boarding/alighting tap pair
    transit_df = pd.merge(
        access_df[['idx', 'btap']],
        egress_df[['idx', 'atap']],
        on='idx').drop_duplicates()

    # don't want transit trips that start and stop in same tap
    transit_df = transit_df[transit_df.atap != transit_df.btap]

    # broadcast each chooser attribute onto its tap-pair rows
    for attribute_column in list(chooser_attributes.columns):
        transit_df[attribute_column] = reindex(chooser_attributes[attribute_column],
                                               transit_df['idx'])

    transit_df = transit_df.reset_index(drop=True)

    if trace:
        self.trace_df(transit_df, trace_label, 'all_transit_df')

    return transit_df
def school_location_logsums(persons_merged, land_use, skim_dict, skim_stack,
                            school_location_sample, configs_dir, chunk_size, trace_hh_id):
    """
    add logsum column to existing school_location_sample table

    logsum is calculated by running the mode_choice model for each sample
    (person, dest_taz) pair in school_location_sample, and computing the
    logsum of all the utilities

    <added>
    PERID,  dest_TAZ, rand,            pick_count, logsum
    23750,  14,       0.565502716034,  4           1.85659498857
    23750,  16,       0.711135838871,  6           1.92315598631
    ...
    23751,  12,       0.408038878552,  1           2.40612135416
    23751,  14,       0.972732479292,  2           1.44009018355
    """

    trace_label = 'school_location_logsums'

    # extract logsums_spec from omnibus_spec
    # omnibus_spec = orca.get_injectable('tour_mode_choice_spec')
    # for tour_type in ['school', 'university']:
    #     logsums_spec = get_segment_and_unstack(omnibus_spec, tour_type)
    #     tracing.dump_df(DUMP, logsums_spec, trace_label, 'logsums_spec_%s' % tour_type)

    school_location_settings = config.read_model_settings(configs_dir, 'school_location.yaml')

    alt_col_name = school_location_settings["ALT_COL_NAME"]

    # FIXME - just using settings from tour_mode_choice
    logsum_settings = config.read_model_settings(configs_dir, 'tour_mode_choice.yaml')

    persons_merged = persons_merged.to_frame()
    school_location_sample = school_location_sample.to_frame()

    logger.info("Running school_location_sample with %s rows" % len(school_location_sample))

    # FIXME - MEMORY HACK - only include columns actually used in spec
    chooser_columns = school_location_settings['LOGSUM_CHOOSER_COLUMNS']
    persons_merged = persons_merged[chooser_columns]

    tracing.dump_df(DUMP, persons_merged, trace_label, 'persons_merged')

    # compute logsums segment by segment and stack the results
    logsums_list = []
    for school_type in ['university', 'highschool', 'gradeschool']:

        logsums_spec = mode_choice_logsums_spec(configs_dir, school_type)

        # choosers are this segment's sample rows, left-merged with person columns
        choosers = school_location_sample[school_location_sample['school_type'] == school_type]

        choosers = pd.merge(
            choosers,
            persons_merged,
            left_index=True,
            right_index=True,
            how="left")

        choosers['in_period'] = time_period_label(school_location_settings['IN_PERIOD'])
        choosers['out_period'] = time_period_label(school_location_settings['OUT_PERIOD'])

        # FIXME - should do this in expression file?
        choosers['dest_topology'] = reindex(land_use.TOPOLOGY, choosers[alt_col_name])
        choosers['dest_density_index'] = reindex(land_use.density_index, choosers[alt_col_name])

        tracing.dump_df(DUMP, choosers, trace_label, '%s_choosers' % school_type)

        logsums = compute_logsums(
            choosers, logsums_spec, logsum_settings,
            skim_dict, skim_stack, alt_col_name, chunk_size, trace_hh_id, trace_label)

        logsums_list.append(logsums)

    logsums = pd.concat(logsums_list)

    # add_column series should have an index matching the table to which it is being added
    # logsums does, since school_location_sample was on left side of merge creating choosers
    orca.add_column("school_location_sample", "mode_choice_logsum", logsums)
def run_destination_simulate(
        spec_segment_name,
        tours,
        persons_merged,
        destination_sample,
        model_settings,
        skim_dict,
        destination_size_terms,
        chunk_size, trace_label):
    """
    run destination_simulate on tour_destination_sample
    annotated with mode_choice logsum to select a destination from sample alternatives

    Parameters
    ----------
    spec_segment_name : str
        column of the model spec to use for this segment
    tours : pandas.DataFrame
    persons_merged : pandas.DataFrame
        indexed by person_id; merged onto tours via tours.person_id
    destination_sample : pandas.DataFrame
        pre-sampled alternatives annotated with logsums and pick_count
    model_settings : dict
    skim_dict : skim dictionary
    destination_size_terms : pandas.DataFrame
        size_term by zone, joined onto the sampled alternatives
    chunk_size : int
    trace_label : str

    Returns
    -------
    choices : pandas.Series
        chosen destination for each tour
    """
    model_spec_file_name = model_settings['SPEC']
    model_spec = simulate.read_model_spec(file_name=model_spec_file_name)
    model_spec = model_spec[[spec_segment_name]]

    # merge persons into tours
    choosers = pd.merge(tours,
                        persons_merged,
                        left_on='person_id', right_index=True, how='left')

    # FIXME - MEMORY HACK - only include columns actually used in spec
    chooser_columns = model_settings['SIMULATE_CHOOSER_COLUMNS']
    choosers = choosers[chooser_columns]

    alt_dest_col_name = model_settings["ALT_DEST_COL_NAME"]
    origin_col_name = model_settings['CHOOSER_ORIG_COL_NAME']

    # alternatives are pre-sampled and annotated with logsums and pick_count
    # but we have to merge size_terms column into alt sample list
    destination_sample['size_term'] = \
        reindex(destination_size_terms.size_term, destination_sample[alt_dest_col_name])

    tracing.dump_df(DUMP, destination_sample, trace_label, 'alternatives')

    constants = config.get_model_constants(model_settings)

    logger.info("Running tour_destination_simulate with %d persons", len(choosers))

    # create wrapper with keys for this lookup - in this case there is a TAZ in the choosers
    # and a TAZ in the alternatives which get merged during interaction
    # the skims will be available under the name "skims" for any @ expressions
    skims = skim_dict.wrap(origin_col_name, alt_dest_col_name)

    locals_d = {
        'skims': skims,
    }
    if constants is not None:
        locals_d.update(constants)

    tracing.dump_df(DUMP, choosers, trace_label, 'choosers')

    choices = interaction_sample_simulate(
        choosers,
        destination_sample,
        spec=model_spec,
        choice_column=alt_dest_col_name,
        skims=skims,
        locals_d=locals_d,
        chunk_size=chunk_size,
        trace_label=trace_label,
        trace_choice_name='destination')

    return choices
def process_trips(tours, stop_frequency_alts):
    """
    Expand tours into a proto-trips table based on each tour's stop_frequency
    choice: one trip per leg segment, split into outbound and inbound legs.

    Returns a trips DataFrame indexed by a stable trip_id derived from
    tour_id and a canonical per-leg trip number.
    """

    MAX_TRIPS_PER_LEG = 4  # max number of trips per leg (inbound or outbound) of tour
    OUTBOUND_ALT = 'out'
    assert OUTBOUND_ALT in stop_frequency_alts.columns

    # get the actual alternatives for each person - have to go back to the
    # stop_frequency_alts dataframe to get this - the stop_frequency choice
    # column has the index values for the chosen alternative
    trips = stop_frequency_alts.loc[tours.stop_frequency]

    # assign tour ids to the index
    trips.index = tours.index

    """
    ::

      tours.stop_frequency    =>    proto trips table
      ________________________________________________________
                stop_frequency      |                out  in
      tour_id                       |     tour_id
      954910          1out_1in      |     954910       1   1
      985824          0out_1in      |     985824       0   1
    """

    # reformat with the columns given below
    trips = trips.stack().reset_index()
    trips.columns = ['tour_id', 'direction', 'trip_count']

    # tours legs have one more leg than stop
    trips.trip_count += 1

    # prefer direction as boolean
    trips['outbound'] = trips.direction == OUTBOUND_ALT

    """
           tour_id direction  trip_count  outbound
    0       954910       out           2      True
    1       954910        in           2     False
    2       985824       out           1      True
    3       985824        in           2     False
    """

    # now do a repeat and a take, so if you have two trips of given type you
    # now have two rows, and zero trips yields zero rows
    trips = trips.take(np.repeat(trips.index.values, trips.trip_count.values))
    trips = trips.reset_index(drop=True)

    # number trips 1..n within each (tour, direction) leg
    grouped = trips.groupby(['tour_id', 'outbound'])
    trips['trip_num'] = grouped.cumcount() + 1

    # inherit tour-level attributes onto each trip
    trips['person_id'] = reindex(tours.person_id, trips.tour_id)
    trips['household_id'] = reindex(tours.household_id, trips.tour_id)
    trips['primary_purpose'] = reindex(tours.primary_purpose, trips.tour_id)

    # reorder columns and drop 'direction'
    trips = trips[['person_id', 'household_id', 'tour_id', 'primary_purpose',
                   'trip_num', 'outbound', 'trip_count']]

    """
      person_id  household_id  tour_id  primary_purpose trip_num  outbound  trip_count
    0     32927         32927   954910             work        1      True           2
    1     32927         32927   954910             work        2      True           2
    2     32927         32927   954910             work        1     False           2
    3     32927         32927   954910             work        2     False           2
    4     33993         33993   985824             univ        1      True           1
    5     33993         33993   985824             univ        1     False           2
    6     33993         33993   985824             univ        2     False           2
    """

    # canonical_trip_num: 1st trip out = 1, 2nd trip out = 2, 1st in = 5, etc.
    # (inbound leg offsets trip_num by MAX_TRIPS_PER_LEG)
    canonical_trip_num = (~trips.outbound * MAX_TRIPS_PER_LEG) + trips.trip_num
    trips['trip_id'] = trips.tour_id * (2 * MAX_TRIPS_PER_LEG) + canonical_trip_num

    trips.set_index('trip_id', inplace=True, verify_integrity=True)

    return trips
def joint_tour_participation(
        tours, persons_merged,
        chunk_size,
        trace_hh_id):
    """
    Predicts for each eligible person to participate or not participate in each joint tour.

    Builds a candidates table (one row per eligible person per joint tour),
    runs a simple simulate (optionally in estimation mode), then writes the
    chosen participants table and patches person_id / number_of_participants
    back onto the joint tours in the pipeline 'tours' table.
    """
    trace_label = 'joint_tour_participation'
    model_settings_file_name = 'joint_tour_participation.yaml'
    model_settings = config.read_model_settings(model_settings_file_name)

    tours = tours.to_frame()
    joint_tours = tours[tours.tour_category == 'joint']

    # - if no joint tours
    if joint_tours.shape[0] == 0:
        add_null_results(model_settings, trace_label)
        return

    persons_merged = persons_merged.to_frame()

    # - create joint_tour_participation_candidates table
    candidates = joint_tour_participation_candidates(joint_tours, persons_merged)
    tracing.register_traceable_table('joint_tour_participants', candidates)
    pipeline.get_rn_generator().add_channel('joint_tour_participants', candidates)

    logger.info("Running joint_tours_participation with %d potential participants (candidates)" %
                candidates.shape[0])

    # - preprocessor
    preprocessor_settings = model_settings.get('preprocessor', None)
    if preprocessor_settings:

        locals_dict = {
            'person_time_window_overlap': person_time_window_overlap,
            'persons': persons_merged
        }

        expressions.assign_columns(
            df=candidates,
            model_settings=preprocessor_settings,
            locals_dict=locals_dict,
            trace_label=trace_label)

    # - simple_simulate
    estimator = estimation.manager.begin_estimation('joint_tour_participation')

    model_spec = simulate.read_model_spec(file_name=model_settings['SPEC'])
    coefficients_df = simulate.read_model_coefficients(model_settings)
    model_spec = simulate.eval_coefficients(model_spec, coefficients_df, estimator)

    nest_spec = config.get_logit_model_settings(model_settings)
    constants = config.get_model_constants(model_settings)

    if estimator:
        estimator.write_model_settings(model_settings, model_settings_file_name)
        estimator.write_spec(model_settings)
        estimator.write_coefficients(coefficients_df, model_settings)
        estimator.write_choosers(candidates)

    # add tour-based chunk_id so we can chunk all trips in tour together
    assert 'chunk_id' not in candidates.columns
    unique_household_ids = candidates.household_id.unique()
    household_chunk_ids = pd.Series(range(len(unique_household_ids)), index=unique_household_ids)
    candidates['chunk_id'] = reindex(household_chunk_ids, candidates.household_id)

    choices = simulate.simple_simulate_by_chunk_id(
        choosers=candidates,
        spec=model_spec,
        nest_spec=nest_spec,
        locals_d=constants,
        chunk_size=chunk_size,
        trace_label=trace_label,
        trace_choice_name='participation',
        custom_chooser=participants_chooser,
        estimator=estimator)

    # choice is boolean (participate or not)
    choice_col = model_settings.get('participation_choice', 'participate')
    # NOTE(review): the assert message below is never %-formatted with choice_col
    assert choice_col in model_spec.columns, \
        "couldn't find participation choice column '%s' in spec"
    PARTICIPATE_CHOICE = model_spec.columns.get_loc(choice_col)

    participate = (choices == PARTICIPATE_CHOICE)

    if estimator:
        estimator.write_choices(choices)

        # we override the 'participate' boolean series, instead of raw alternative index in 'choices' series
        # its value depends on whether the candidate's 'participant_id' is in the joint_tour_participant index
        survey_participants_df = estimator.get_survey_table('joint_tour_participants')
        participate = pd.Series(choices.index.isin(survey_participants_df.index.values), index=choices.index)

        # but estimation software wants to know the choices value (alternative index)
        choices = participate.replace({True: PARTICIPATE_CHOICE, False: 1-PARTICIPATE_CHOICE})
        # estimator.write_override_choices(participate)  # write choices as boolean participate
        estimator.write_override_choices(choices)  # write choices as int alt indexes

        estimator.end_estimation()

    # satisfaction indexed by tour_id
    tour_satisfaction = get_tour_satisfaction(candidates, participate)

    assert tour_satisfaction.all()

    candidates['satisfied'] = reindex(tour_satisfaction, candidates.tour_id)

    PARTICIPANT_COLS = ['tour_id', 'household_id', 'person_id']
    participants = candidates[participate][PARTICIPANT_COLS].copy()

    # assign participant_num
    # FIXME do we want something smarter than the participant with the lowest person_id?
    participants['participant_num'] = \
        participants.sort_values(by=['tour_id', 'person_id']).\
        groupby('tour_id').cumcount() + 1

    pipeline.replace_table("joint_tour_participants", participants)

    # drop channel as we aren't using any more (and it has candidates that weren't chosen)
    pipeline.get_rn_generator().drop_channel('joint_tour_participants')

    # - assign joint tour 'point person' (participant_num == 1)
    point_persons = participants[participants.participant_num == 1]
    joint_tours['person_id'] = point_persons.set_index('tour_id').person_id

    # update number_of_participants which was initialized to 1
    joint_tours['number_of_participants'] = participants.groupby('tour_id').size()

    assign_in_place(tours, joint_tours[['person_id', 'number_of_participants']])

    pipeline.replace_table("tours", tours)

    # - run annotations
    annotate_jtp(model_settings, trace_label)

    if trace_hh_id:
        tracing.trace_df(participants,
                         label="joint_tour_participation.participants")

        tracing.trace_df(joint_tours,
                         label="joint_tour_participation.joint_tours")
def get_survey_values(self, model_values, table_name, column_names):
    """
    Look up survey (override) values for the rows identified by model_values.

    Parameters
    ----------
    model_values : pd.Series, pd.DataFrame, or pd.Index
        Only the index is used (or the Index itself); it identifies the rows
        whose survey values are wanted.
    table_name : str
        Name of the survey table registered with the estimation manager.
    column_names : str or list of str
        Column(s) to fetch from the survey table.

    Returns
    -------
    pd.Series if column_names was a single str, else pd.DataFrame,
    indexed the same as model_values.

    Raises
    ------
    RuntimeError
        If requested columns are missing from the survey table, or if any
        destination row has no matching survey value.
    """

    assert isinstance(model_values, (pd.Series, pd.DataFrame, pd.Index)), \
        "get_survey_values model_values has unrecognized type %s" % type(model_values)

    # we only need the index of model_values; accept a bare Index directly
    dest_index = model_values if isinstance(model_values, (pd.Index)) else model_values.index

    # read override_df table
    survey_df = manager.get_survey_table(table_name)

    assert survey_df is not None, \
        "get_survey_values: table '%s' not found" % (table_name,)

    # remember whether caller passed a single column name so we can return
    # a Series (single name) vs a DataFrame (list of names) at the end
    column_name = column_names if isinstance(column_names, str) else None
    if column_name:
        column_names = [column_name]

    if not set(column_names).issubset(set(survey_df.columns)):
        missing_columns = list(set(column_names) - set(survey_df.columns))
        logger.error("missing columns (%s) in survey table %s" % (missing_columns, table_name))
        print("survey table columns: %s" % (survey_df.columns,))
        raise RuntimeError("missing columns (%s) in survey table %s" % (missing_columns, table_name))

    # redundant with the RuntimeError above, but kept as a belt-and-braces check
    assert set(column_names).issubset(set(survey_df.columns)), \
        f"missing columns ({list(set(column_names) - set(survey_df.columns))}) " \
        f"in survey table {table_name} {list(survey_df.columns)}"

    # Resolve how to join the survey table to dest_index. Conventionally the
    # activitysim index may appear in the survey table as:
    #   1) the survey table's own index (same name),
    #   2) an ordinary column with the same name, or
    #   3) a column prefixed 'asim_' (for now tour_id is asim_tour_id in survey_df)
    asim_df_index_name = dest_index.name
    if asim_df_index_name == survey_df.index.name:
        # survey table has same index as activitysim
        survey_df_index_column = 'index'
    elif asim_df_index_name in survey_df.columns:
        # survey table has activitysim index as column
        survey_df_index_column = asim_df_index_name
    elif 'asim_%s' % asim_df_index_name in survey_df.columns:
        # survey table has activitysim index as column with asim_ prefix
        survey_df_index_column = 'asim_%s' % asim_df_index_name
    else:
        # NOTE(review): lookup proceeds with survey_df_index_column = None,
        # which will fail below in the non-'index' branch — presumably this
        # path is a logged soft error; confirm intended behavior
        logger.error("get_survey_values:index '%s' not in survey table" % dest_index.name)
        # raise RuntimeError("index '%s' not in survey table %s" % (dest_index.name, table_name)
        survey_df_index_column = None

    logger.debug("get_survey_values: reindexing using %s.%s" % (table_name, survey_df_index_column))

    values = pd.DataFrame(index=dest_index)
    for c in column_names:
        if survey_df_index_column == 'index':
            survey_values = survey_df[c]
        else:
            # re-key the survey column by the activitysim id column before lookup
            survey_values = pd.Series(
                survey_df[c].values, index=survey_df[survey_df_index_column])

        survey_values = reindex(survey_values, dest_index)

        # shouldn't be any choices we can't override
        missing_values = survey_values.isna()
        if missing_values.any():
            logger.error("missing survey_values for %s\n%s" % (c, dest_index[missing_values]))
            logger.error("couldn't get_survey_values for %s in %s\n" % (c, table_name))
            raise RuntimeError("couldn't get_survey_values for %s in %s\n" % (c, table_name))

        values[c] = survey_values

    # Series for a single requested column, DataFrame otherwise
    return values[column_name] if column_name else values
def joint_tour_participation(
        tours, persons_merged,
        chunk_size,
        trace_hh_id):
    """
    Predicts for each eligible person to participate or not participate in each joint tour.

    Runs a simple-simulate participation model over a candidates table (one row per
    eligible person per joint tour), using participants_chooser to guarantee each
    tour's composition requirement is satisfied. Writes/updates the pipeline tables
    'joint_tour_participants' and 'tours' (point person_id and number_of_participants).

    Parameters
    ----------
    tours : orca/pipeline table wrapper for the tours table
    persons_merged : orca/pipeline table wrapper for persons merged with household attributes
    chunk_size : int
    trace_hh_id : int or None
        household id to trace, if any
    """
    trace_label = 'joint_tour_participation'
    model_settings = config.read_model_settings('joint_tour_participation.yaml')
    model_spec = simulate.read_model_spec(file_name='joint_tour_participation.csv')

    tours = tours.to_frame()
    joint_tours = tours[tours.tour_category == 'joint']

    # - if no joint tours
    if joint_tours.shape[0] == 0:
        add_null_results(model_settings, trace_label)
        return

    persons_merged = persons_merged.to_frame()

    # - create joint_tour_participation_candidates table
    candidates = joint_tour_participation_candidates(joint_tours, persons_merged)
    tracing.register_traceable_table('joint_tour_participants', candidates)
    pipeline.get_rn_generator().add_channel('joint_tour_participants', candidates)

    logger.info("Running joint_tours_participation with %d potential participants (candidates)" %
                candidates.shape[0])

    # - preprocessor
    preprocessor_settings = model_settings.get('preprocessor', None)
    if preprocessor_settings:

        locals_dict = {
            'person_time_window_overlap': person_time_window_overlap,
            'persons': persons_merged
        }

        expressions.assign_columns(
            df=candidates,
            model_settings=preprocessor_settings,
            locals_dict=locals_dict,
            trace_label=trace_label)

    # - simple_simulate
    nest_spec = config.get_logit_model_settings(model_settings)
    constants = config.get_model_constants(model_settings)

    choices = simulate.simple_simulate(
        choosers=candidates,
        spec=model_spec,
        nest_spec=nest_spec,
        locals_d=constants,
        chunk_size=chunk_size,
        trace_label=trace_label,
        trace_choice_name='participation',
        custom_chooser=participants_chooser)

    # choice is boolean (participate or not)
    choice_col = model_settings.get('participation_choice', 'participate')
    # BUGFIX: interpolate choice_col into the assert message (was a bare '%s')
    assert choice_col in model_spec.columns, \
        "couldn't find participation choice column '%s' in spec" % choice_col
    PARTICIPATE_CHOICE = model_spec.columns.get_loc(choice_col)

    participate = (choices == PARTICIPATE_CHOICE)

    # satisfaction indexed by tour_id
    tour_satisfaction = get_tour_satisfaction(candidates, participate)

    # participants_chooser rechooses until every tour's composition is satisfied
    assert tour_satisfaction.all()

    candidates['satisfied'] = reindex(tour_satisfaction, candidates.tour_id)

    PARTICIPANT_COLS = ['tour_id', 'household_id', 'person_id']
    participants = candidates[participate][PARTICIPANT_COLS].copy()

    # assign participant_num
    # FIXME do we want something smarter than the participant with the lowest person_id?
    participants['participant_num'] = \
        participants.sort_values(by=['tour_id', 'person_id']).\
        groupby('tour_id').cumcount() + 1

    pipeline.replace_table("joint_tour_participants", participants)

    # drop channel as we aren't using any more (and it has candidates that weren't chosen)
    pipeline.get_rn_generator().drop_channel('joint_tour_participants')

    # - assign joint tour 'point person' (participant_num == 1)
    point_persons = participants[participants.participant_num == 1]
    joint_tours['person_id'] = point_persons.set_index('tour_id').person_id

    # update number_of_participants which was initialized to 1
    joint_tours['number_of_participants'] = participants.groupby('tour_id').size()

    assign_in_place(tours, joint_tours[['person_id', 'number_of_participants']])

    pipeline.replace_table("tours", tours)

    # - run annotations
    annotate_jtp(model_settings, trace_label)

    if trace_hh_id:
        tracing.trace_df(participants,
                         label="joint_tour_participation.participants")

        tracing.trace_df(joint_tours,
                         label="joint_tour_participation.joint_tours")
def participants_chooser(probs, choosers, spec, trace_label):
    """
    custom alternative to logit.make_choices for simulate.simple_simulate

    Choosing participants for mixed tours is trickier than adult or child tours because we
    need at least one adult and one child participant in a mixed tour. We call
    logit.make_choices and then check to see if the tour satisfies this requirement, and
    rechoose for any that fail until all are satisfied.

    In principle, this should always occur eventually, but we fail after MAX_ITERATIONS,
    just in case there is some failure in program logic (haven't seen this occur.)

    Parameters
    ----------
    probs : pandas.DataFrame
        Rows for choosers and columns for the alternatives from which they are choosing.
        Values are expected to be valid probabilities across each row, e.g. they should sum to 1.
    choosers : pandas.dataframe
        simple_simulate choosers df
    spec : pandas.DataFrame
        simple_simulate spec df
        We only need spec so we can know the column index of the 'participate' alternative
        indicating that the participant has been chosen to participate in the tour
    trace_label : str

    Returns - same as logit.make_choices
    -------
    choices, rands
        choices, rands as returned by logit.make_choices (in same order as probs)

    Raises
    ------
    RuntimeError
        if MAX_ITERATIONS is exceeded without satisfying all tours
    """

    assert probs.index.equals(choosers.index)

    # choice is boolean (participate or not)
    model_settings = config.read_model_settings('joint_tour_participation.yaml')

    choice_col = model_settings.get('participation_choice', 'participate')
    # BUGFIX: interpolate choice_col into the assert message (was a bare '%s')
    assert choice_col in spec.columns, \
        "couldn't find participation choice column '%s' in spec" % choice_col
    PARTICIPATE_CHOICE = spec.columns.get_loc(choice_col)
    MAX_ITERATIONS = model_settings.get('max_participation_choice_iterations', 5000)

    trace_label = tracing.extend_trace_label(trace_label, 'participants_chooser')

    candidates = choosers.copy()
    choices_list = []
    rands_list = []

    num_tours_remaining = len(candidates.tour_id.unique())
    logger.info('%s %s joint tours to satisfy.', trace_label, num_tours_remaining,)

    # 'iteration' rather than 'iter' to avoid shadowing the builtin
    iteration = 0
    while candidates.shape[0] > 0:

        iteration += 1

        if iteration > MAX_ITERATIONS:
            # dump the still-unsatisfied candidates for diagnosis, then fail hard
            logger.warning('%s max iterations exceeded (%s).', trace_label, MAX_ITERATIONS)
            diagnostic_cols = ['tour_id', 'household_id', 'composition', 'adult']
            unsatisfied_candidates = candidates[diagnostic_cols].join(probs)
            tracing.write_csv(unsatisfied_candidates,
                              file_name='%s.UNSATISFIED' % trace_label, transpose=False)
            print(unsatisfied_candidates.head(20))
            # raise (not `assert False`, which is stripped under python -O)
            raise RuntimeError('%s max iterations exceeded (%s).' % (trace_label, MAX_ITERATIONS))

        choices, rands = logit.make_choices(probs, trace_label=trace_label, trace_choosers=choosers)

        participate = (choices == PARTICIPATE_CHOICE)

        # satisfaction indexed by tour_id
        tour_satisfaction = get_tour_satisfaction(candidates, participate)
        num_tours_satisfied_this_iter = tour_satisfaction.sum()

        if num_tours_satisfied_this_iter > 0:

            num_tours_remaining -= num_tours_satisfied_this_iter

            # broadcast per-tour satisfaction back onto the candidate rows
            satisfied = reindex(tour_satisfaction, candidates.tour_id)

            choices_list.append(choices[satisfied])
            rands_list.append(rands[satisfied])

            # remove candidates of satisfied tours
            probs = probs[~satisfied]
            candidates = candidates[~satisfied]

        logger.info('%s iteration %s : %s joint tours satisfied %s remaining' %
                    (trace_label, iteration, num_tours_satisfied_this_iter, num_tours_remaining,))

    choices = pd.concat(choices_list)
    rands = pd.concat(rands_list)

    # reindex choices and rands to match the original choosers order
    # (previously rands was reindexed twice - once sufficed)
    choices = choices.reindex(choosers.index)
    rands = rands.reindex(choosers.index)

    assert choices.index.equals(choosers.index)
    assert rands.index.equals(choosers.index)

    logger.info('%s %s iterations to satisfy all joint tours.', trace_label, iteration,)

    return choices, rands
def process_trips(tours, stop_frequency_alts):
    """
    Expand each tour's stop_frequency choice into a proto-trips table.

    Each tour contributes one leg per direction ('out'/'in'), and each leg
    contributes (stops + 1) trips. Trip ids are stable, derived from tour_id
    and the trip's position within its leg.
    """

    # max number of trips per leg (inbound or outbound) of tour
    MAX_TRIPS_PER_LEG = 4
    OUTBOUND_ALT = 'out'
    assert OUTBOUND_ALT in stop_frequency_alts.columns

    # look up the chosen alternative row (stop counts by direction) for each tour,
    # re-keyed by tour id - the stop_frequency choice column holds alt index values
    trips = stop_frequency_alts.loc[tours.stop_frequency]
    trips.index = tours.index

    # wide (out/in columns) -> long (tour_id, direction, trip_count)
    trips = trips.stack().reset_index()
    trips.columns = ['tour_id', 'direction', 'trip_count']

    # a leg always has one more trip than it has stops
    trips.trip_count += 1

    # prefer direction as boolean
    trips['outbound'] = trips.direction == OUTBOUND_ALT

    # replicate each leg row trip_count times (zero trips yields zero rows),
    # then number the trips within each (tour, direction) leg
    trips = trips.take(np.repeat(trips.index.values, trips.trip_count.values))
    trips = trips.reset_index(drop=True)
    trips['trip_num'] = trips.groupby(['tour_id', 'outbound']).cumcount() + 1

    # broadcast tour attributes onto the trips
    for tour_col in ('person_id', 'household_id', 'primary_purpose'):
        trips[tour_col] = reindex(tours[tour_col], trips.tour_id)

    # reorder columns and drop 'direction'
    trips = trips[['person_id', 'household_id', 'tour_id', 'primary_purpose',
                   'trip_num', 'outbound', 'trip_count']]

    # canonical_trip_num: 1st trip out = 1, 2nd trip out = 2, 1st in = 5, etc.
    # inbound trips are offset by MAX_TRIPS_PER_LEG within the tour's id range
    leg_offset = (~trips.outbound * MAX_TRIPS_PER_LEG)
    trips['trip_id'] = trips.tour_id * (2 * MAX_TRIPS_PER_LEG) + (leg_offset + trips.trip_num)
    trips.set_index('trip_id', inplace=True, verify_integrity=True)

    return trips
def run_trip_destination(
        trips,
        tours_merged,
        estimator,
        chunk_size,
        trace_hh_id,
        trace_label,
        fail_some_trips_for_testing=False):
    """
    trip destination - main functionality separated from model step so it can be called iteratively

    Run the trip_destination model, assigning destinations for each (intermediate) trip
    (last trips already have a destination - either the tour primary destination or Home)

    Set trip destination and origin columns, and a boolean failed flag for any failed trips
    (destination for flagged failed trips will be set to -1)

    Parameters
    ----------
    trips : pd.DataFrame
    tours_merged : pd.DataFrame
    estimator : Estimator or None
        if not None, survey trips are validated and choices written for estimation
    chunk_size : int
    trace_hh_id : int or None
    trace_label : str
    fail_some_trips_for_testing : bool
        if True, deliberately drop one chosen destination per trip_num pass
        so the failed-trip handling path gets exercised in tests

    Returns
    -------
    trips : pd.DataFrame
        with destination/origin/failed columns set
    save_sample_df : pd.DataFrame or None
        concatenated destination samples if a sample table was requested, else None
    """
    model_settings_file_name = 'trip_destination.yaml'
    model_settings = config.read_model_settings(model_settings_file_name)
    preprocessor_settings = model_settings.get('preprocessor', None)
    logsum_settings = config.read_model_settings(model_settings['LOGSUM_SETTINGS'])

    logsum_column_name = model_settings.get('DEST_CHOICE_LOGSUM_COLUMN_NAME')
    want_logsums = logsum_column_name is not None

    sample_table_name = model_settings.get('DEST_CHOICE_SAMPLE_TABLE_NAME')
    want_sample_table = config.setting('want_dest_choice_sample_tables') and sample_table_name is not None

    land_use = inject.get_table('land_use')
    size_terms = inject.get_injectable('size_terms')
    network_los = inject.get_injectable('network_los')

    # next_trip_id links each intermediate trip to its successor on the same leg
    # (0 for final trips, which have no successor)
    trips = trips.sort_index()
    trips['next_trip_id'] = np.roll(trips.index, -1)
    trips.next_trip_id = trips.next_trip_id.where(trips.trip_num < trips.trip_count, 0)

    # - initialize trip origin and destination to those of half-tour
    # (we will sequentially adjust intermediate trips origin and destination as we choose them)
    tour_destination = reindex(tours_merged.destination, trips.tour_id).astype(np.int64)
    tour_origin = reindex(tours_merged.origin, trips.tour_id).astype(np.int64)
    trips['destination'] = np.where(trips.outbound, tour_destination, tour_origin)
    trips['origin'] = np.where(trips.outbound, tour_origin, tour_destination)
    trips['failed'] = False

    if estimator:
        # need to check or override non-intermediate trip destination
        # should check consistency of survey trips origin, destination with parent tour and subsequent/prior trip?
        # FIXME if not consistent, do we fail or override? (seems weird to override them to bad values?)

        # expect all the same trips
        survey_trips = estimator.get_survey_table('trips').sort_index()
        assert survey_trips.index.equals(trips.index)

        first = (survey_trips.trip_num == 1)
        last = (survey_trips.trip_num == trips.trip_count)

        # expect survey's outbound first trip origin to be same as half tour origin
        assert (survey_trips.origin[survey_trips.outbound & first]
                == tour_origin[survey_trips.outbound & first]).all()

        # expect outbound last trip destination to be same as half tour destination
        assert (survey_trips.destination[survey_trips.outbound & last]
                == tour_destination[survey_trips.outbound & last]).all()

        # expect inbound first trip origin to be same as half tour destination
        assert (survey_trips.origin[~survey_trips.outbound & first]
                == tour_destination[~survey_trips.outbound & first]).all()

        # expect inbound last trip destination to be same as half tour origin
        assert (survey_trips.destination[~survey_trips.outbound & last]
                == tour_origin[~survey_trips.outbound & last]).all()

    # - filter tours_merged (AFTER copying destination and origin columns to trips)
    # tours_merged is used for logsums, we filter it here upfront to save space and time
    tours_merged_cols = logsum_settings['TOURS_MERGED_CHOOSER_COLUMNS']
    redundant_cols = model_settings.get('REDUNDANT_TOURS_MERGED_CHOOSER_COLUMNS', [])
    if redundant_cols:
        tours_merged_cols = [c for c in tours_merged_cols if c not in redundant_cols]
    tours_merged = tours_merged[tours_merged_cols]

    # - skims
    skim_hotel = SkimHotel(model_settings, network_los, trace_label)

    # - size_terms and alternatives
    alternatives = tour_destination_size_terms(land_use, size_terms, 'trip')

    # DataFrameMatrix alows us to treat dataframe as virtual a 2-D array, indexed by zone_id, purpose
    # e.g. size_terms.get(df.dest_zone_id, df.purpose)
    # returns a series of size_terms for each chooser's dest_zone_id and purpose with chooser index
    size_term_matrix = DataFrameMatrix(alternatives)

    # don't need size terms in alternatives, just zone_id index
    alternatives = alternatives.drop(alternatives.columns, axis=1)
    alternatives.index.name = model_settings['ALT_DEST_COL_NAME']

    sample_list = []

    # - process intermediate trips in ascending trip_num order
    # (so each trip's origin can be set from the previous trip's chosen destination)
    intermediate = trips.trip_num < trips.trip_count
    if intermediate.any():

        first_trip_num = trips[intermediate].trip_num.min()
        last_trip_num = trips[intermediate].trip_num.max()

        # iterate over trips in ascending trip_num order
        for trip_num in range(first_trip_num, last_trip_num + 1):

            nth_trips = trips[intermediate & (trips.trip_num == trip_num)]
            nth_trace_label = tracing.extend_trace_label(trace_label, 'trip_num_%s' % trip_num)

            locals_dict = {'network_los': network_los}
            locals_dict.update(config.get_model_constants(model_settings))

            # - annotate nth_trips
            if preprocessor_settings:
                expressions.assign_columns(
                    df=nth_trips,
                    model_settings=preprocessor_settings,
                    locals_dict=locals_dict,
                    trace_label=nth_trace_label)

            logger.info("Running %s with %d trips", nth_trace_label, nth_trips.shape[0])

            # - choose destination for nth_trips, segmented by primary_purpose
            choices_list = []
            for primary_purpose, trips_segment in nth_trips.groupby('primary_purpose'):
                choices, destination_sample = choose_trip_destination(
                    primary_purpose,
                    trips_segment,
                    alternatives,
                    tours_merged,
                    model_settings,
                    want_logsums,
                    want_sample_table,
                    size_term_matrix,
                    skim_hotel,
                    estimator,
                    chunk_size,
                    trace_hh_id,
                    trace_label=tracing.extend_trace_label(nth_trace_label, primary_purpose))

                choices_list.append(choices)
                if want_sample_table:
                    assert destination_sample is not None
                    sample_list.append(destination_sample)

            destinations_df = pd.concat(choices_list)

            if fail_some_trips_for_testing:
                # deliberately sideline one chosen trip to exercise the failure path
                if len(destinations_df) > 0:
                    destinations_df = destinations_df.drop(destinations_df.index[0])

            # trips with no viable destination are flagged failed and their
            # successor trip's origin is reset to the failed trip's origin
            failed_trip_ids = nth_trips.index.difference(destinations_df.index)
            if failed_trip_ids.any():
                logger.warning("%s sidelining %s trips without viable destination alternatives" %
                               (nth_trace_label, failed_trip_ids.shape[0]))
                next_trip_ids = nth_trips.next_trip_id.reindex(failed_trip_ids)
                trips.loc[failed_trip_ids, 'failed'] = True
                trips.loc[failed_trip_ids, 'destination'] = -1
                trips.loc[next_trip_ids, 'origin'] = trips.loc[failed_trip_ids].origin.values

            if len(destinations_df) == 0:
                assert failed_trip_ids.all()
                # NOTE: primary_purpose here is the last value from the groupby loop above
                logger.warning(f"all {len(nth_trips)} {primary_purpose} trip_num {trip_num} trips failed")

            if len(destinations_df) > 0:
                # - assign choices to this trip's destinations
                # if estimator, then the choices will already have been overridden by trip_destination_simulate
                # because we need to overwrite choices before any failed choices are suppressed
                assign_in_place(trips, destinations_df.choice.to_frame('destination'))
                if want_logsums:
                    assert 'logsum' in destinations_df.columns
                    assign_in_place(trips, destinations_df.logsum.to_frame(logsum_column_name))

                # - assign choice to next trip's origin
                destinations_df.index = nth_trips.next_trip_id.reindex(destinations_df.index)
                assign_in_place(trips, destinations_df.choice.to_frame('origin'))

    del trips['next_trip_id']

    if len(sample_list) > 0:
        save_sample_df = pd.concat(sample_list)
    else:
        # this could happen if no intermediate trips, or if no saved sample desired
        save_sample_df = None

    return trips, save_sample_df
def run_destination_simulate(
        spec_segment_name,
        tours,
        persons_merged,
        destination_sample,
        want_logsums,
        model_settings,
        network_los,
        destination_size_terms,
        estimator,
        chunk_size, trace_label):
    """
    run destination_simulate on tour_destination_sample
    annotated with mode_choice logsum to select a destination from sample alternatives
    """

    segment_spec = simulate.spec_for_segment(model_settings, spec_id='SPEC',
                                             segment_name=spec_segment_name,
                                             estimator=estimator)

    # FIXME - MEMORY HACK - only include columns actually used in spec (omit them pre-merge)
    chooser_columns = model_settings['SIMULATE_CHOOSER_COLUMNS']
    wanted_person_cols = [c for c in persons_merged.columns if c in chooser_columns]
    wanted_tour_cols = [c for c in tours.columns
                        if c in chooser_columns or c == 'person_id']

    choosers = pd.merge(tours[wanted_tour_cols],
                        persons_merged[wanted_person_cols],
                        left_on='person_id', right_index=True, how='left')

    # interaction_sample requires that choosers.index.is_monotonic_increasing
    if not choosers.index.is_monotonic_increasing:
        logger.debug(
            f"run_destination_simulate {trace_label} sorting choosers because not monotonic_increasing"
        )
        choosers = choosers.sort_index()

    if estimator:
        estimator.write_choosers(choosers)

    alt_dest_col_name = model_settings['ALT_DEST_COL_NAME']
    origin_col_name = model_settings['CHOOSER_ORIG_COL_NAME']

    # alternatives are pre-sampled and annotated with logsums and pick_count
    # but we have to merge size_terms column into alt sample list
    destination_sample['size_term'] = reindex(destination_size_terms.size_term,
                                              destination_sample[alt_dest_col_name])

    tracing.dump_df(DUMP, destination_sample, trace_label, 'alternatives')

    constants = config.get_model_constants(model_settings)

    logger.info("Running tour_destination_simulate with %d persons", len(choosers))

    # create wrapper with keys for this lookup - in this case there is a home_zone_id
    # in the choosers and a zone_id in the alternatives which get merged during interaction
    # the skims will be available under the name "skims" for any @ expressions
    skims = network_los.get_default_skim_dict().wrap(origin_col_name, alt_dest_col_name)

    locals_d = {'skims': skims}
    if constants is not None:
        locals_d.update(constants)

    tracing.dump_df(DUMP, choosers, trace_label, 'choosers')

    choices = interaction_sample_simulate(
        choosers,
        destination_sample,
        spec=segment_spec,
        choice_column=alt_dest_col_name,
        want_logsums=want_logsums,
        skims=skims,
        locals_d=locals_d,
        chunk_size=chunk_size,
        trace_label=trace_label,
        trace_choice_name='destination',
        estimator=estimator)

    if not want_logsums:
        # for consistency, always return a dataframe with canonical column name
        assert isinstance(choices, pd.Series)
        choices = choices.to_frame('choice')

    return choices
def home_taz(households, persons):
    """Broadcast each household's home TAZ onto its member persons."""
    household_home_taz = households.home_taz
    return reindex(household_home_taz, persons.household_id)
def run_trip_destination(
        trips,
        tours_merged,
        chunk_size, trace_hh_id,
        trace_label):
    """
    trip destination - main functionality separated from model step so it can be called iteratively

    Run the trip_destination model, assigning destinations for each (intermediate) trip
    (last trips already have a destination - either the tour primary destination or Home)

    Set trip destination and origin columns, and a boolean failed flag for any failed trips
    (destination for flagged failed trips will be set to -1)

    Parameters
    ----------
    trips : pd.DataFrame
    tours_merged : pd.DataFrame
    chunk_size : int
    trace_hh_id : int or None
    trace_label : str

    Returns
    -------
    trips : pd.DataFrame
        with destination/origin/failed columns set

    """

    model_settings = config.read_model_settings('trip_destination.yaml')
    preprocessor_settings = model_settings.get('preprocessor', None)
    logsum_settings = config.read_model_settings(model_settings['LOGSUM_SETTINGS'])

    land_use = inject.get_table('land_use')
    size_terms = inject.get_injectable('size_terms')

    # - initialize trip origin and destination to those of half-tour
    # (we will sequentially adjust intermediate trips origin and destination as we choose them)
    tour_destination = reindex(tours_merged.destination, trips.tour_id).astype(int)
    tour_origin = reindex(tours_merged.origin, trips.tour_id).astype(int)
    trips['destination'] = np.where(trips.outbound, tour_destination, tour_origin)
    trips['origin'] = np.where(trips.outbound, tour_origin, tour_destination)
    trips['failed'] = False

    # next_trip_id links each intermediate trip to its successor on the same leg
    # (0 for final trips, which have no successor)
    trips = trips.sort_index()
    trips['next_trip_id'] = np.roll(trips.index, -1)
    trips.next_trip_id = trips.next_trip_id.where(trips.trip_num < trips.trip_count, 0)

    # - filter tours_merged (AFTER copying destination and origin columns to trips)
    # tours_merged is used for logsums, we filter it here upfront to save space and time
    tours_merged_cols = logsum_settings['TOURS_MERGED_CHOOSER_COLUMNS']
    if 'REDUNDANT_TOURS_MERGED_CHOOSER_COLUMNS' in model_settings:
        redundant_cols = model_settings['REDUNDANT_TOURS_MERGED_CHOOSER_COLUMNS']
        tours_merged_cols = [c for c in tours_merged_cols if c not in redundant_cols]
    tours_merged = tours_merged[tours_merged_cols]

    # - skims
    skims = wrap_skims(model_settings)

    # - size_terms and alternatives
    alternatives = tour_destination_size_terms(land_use, size_terms, 'trip')

    # DataFrameMatrix alows us to treat dataframe as virtual a 2-D array, indexed by TAZ, purpose
    # e.g. size_terms.get(df.dest_taz, df.purpose)
    # returns a series of size_terms for each chooser's dest_taz and purpose with chooser index
    size_term_matrix = DataFrameMatrix(alternatives)

    # don't need size terms in alternatives, just TAZ index
    alternatives = alternatives.drop(alternatives.columns, axis=1)
    alternatives.index.name = model_settings['ALT_DEST']

    # - process intermediate trips in ascending trip_num order
    # (so each trip's origin can be set from the previous trip's chosen destination)
    intermediate = trips.trip_num < trips.trip_count
    if intermediate.any():

        first_trip_num = trips[intermediate].trip_num.min()
        last_trip_num = trips[intermediate].trip_num.max()

        # iterate over trips in ascending trip_num order
        for trip_num in range(first_trip_num, last_trip_num + 1):

            nth_trips = trips[intermediate & (trips.trip_num == trip_num)]
            nth_trace_label = tracing.extend_trace_label(trace_label, 'trip_num_%s' % trip_num)

            # - annotate nth_trips
            if preprocessor_settings:
                expressions.assign_columns(
                    df=nth_trips,
                    model_settings=preprocessor_settings,
                    locals_dict=config.get_model_constants(model_settings),
                    trace_label=nth_trace_label)

            logger.info("Running %s with %d trips", nth_trace_label, nth_trips.shape[0])

            # - choose destination for nth_trips, segmented by primary_purpose
            choices_list = []
            for primary_purpose, trips_segment in nth_trips.groupby('primary_purpose'):
                choices = choose_trip_destination(
                    primary_purpose,
                    trips_segment,
                    alternatives, tours_merged,
                    model_settings,
                    size_term_matrix, skims,
                    chunk_size, trace_hh_id,
                    trace_label=tracing.extend_trace_label(nth_trace_label, primary_purpose))

                choices_list.append(choices)

            destinations = pd.concat(choices_list)

            # trips with no viable destination are flagged failed and their
            # successor trip's origin is reset to the failed trip's origin
            failed_trip_ids = nth_trips.index.difference(destinations.index)
            if failed_trip_ids.any():
                logger.warning("%s sidelining %s trips without viable destination alternatives" %
                               (nth_trace_label, failed_trip_ids.shape[0]))
                next_trip_ids = nth_trips.next_trip_id.reindex(failed_trip_ids)
                trips.loc[failed_trip_ids, 'failed'] = True
                trips.loc[failed_trip_ids, 'destination'] = -1
                trips.loc[next_trip_ids, 'origin'] = trips.loc[failed_trip_ids].origin.values

            # - assign choices to these trips destinations and to next trips origin
            assign_in_place(trips, destinations.to_frame('destination'))
            destinations.index = nth_trips.next_trip_id.reindex(destinations.index)
            assign_in_place(trips, destinations.to_frame('origin'))

    del trips['next_trip_id']

    return trips
def workplace_in_cbd(persons, land_use, settings):
    """True for persons whose workplace zone's area_type is below the cbd_threshold setting."""
    area_type_at_workplace = reindex(land_use.area_type, persons.workplace_taz)
    cbd_threshold = settings['cbd_threshold']
    return area_type_at_workplace < cbd_threshold
def dest_topology(tours, land_use):
    """Broadcast land_use TOPOLOGY onto tours by each tour's destination zone."""
    topology_by_zone = land_use.TOPOLOGY
    return reindex(topology_by_zone, tours.destination)
def vectorize_joint_tour_scheduling(
        joint_tours, joint_tour_participants,
        persons_merged,
        alts, spec,
        model_settings,
        chunk_size=0, trace_label=None):
    """
    Like vectorize_tour_scheduling but specifically for joint tours

    joint tours have a few peculiarities necessitating separate treatment:

    Timetable has to be initialized to set all timeperiods...

    Parameters
    ----------
    joint_tours : DataFrame
        DataFrame of tours containing tour attributes, as well as a person_id
        column to define the nth tour for each person.
    joint_tour_participants : DataFrame
        one row per person participating in each joint tour (tour_id, person_id)
    persons_merged : DataFrame
        DataFrame of persons containing attributes referenced by expressions in spec
    alts : DataFrame
        DataFrame of alternatives which represent time slots.  Will be passed to
        interaction_simulate in batches for each nth tour.
    spec : DataFrame
        The spec which will be passed to interaction_simulate.
        (or dict of specs keyed on tour_type if tour_types is not None)
    model_settings : dict
    chunk_size : int
    trace_label : str or None

    Returns
    -------
    tdd : DataFrame
        rows from alts for each chosen time slot, indexed like joint_tours,
        plus a 'tdd' column holding the chosen alt index
    persons_timetable : TimeTable
        timetable updated with joint tours (caller should replace_table for it to persist)
    """

    trace_label = tracing.extend_trace_label(trace_label, 'vectorize_joint_tour_scheduling')

    assert len(joint_tours.index) > 0
    assert 'tour_num' in joint_tours.columns
    assert 'tour_type' in joint_tours.columns

    # joint tours are scheduled at the household level, not per-person
    timetable_window_id_col = None
    tour_owner_id_col = 'household_id'
    segment = None

    persons_timetable = inject.get_injectable("timetable")

    choice_list = []

    # keep a series of the the most recent tours for each person
    # initialize with first trip from alts
    previous_tour_by_householdid = pd.Series(alts.index[0], index=joint_tours.household_id.unique())

    # tours must be scheduled in increasing trip_num order
    # second trip of type must be in group immediately following first
    # this ought to have been ensured when tours are created (tour_frequency.process_tours)

    # print "participant windows before scheduling\n", \
    #     persons_timetable.slice_windows_by_row_id(joint_tour_participants.person_id)

    for tour_num, nth_tours in joint_tours.groupby('tour_num', sort=True):

        tour_trace_label = tracing.extend_trace_label(trace_label, 'tour_%s' % (tour_num,))

        # no more than one tour per household per call to schedule_tours
        assert not nth_tours.household_id.duplicated().any()

        nth_participants = \
            joint_tour_participants[joint_tour_participants.tour_id.isin(nth_tours.index)]

        # build a joint timetable whose windows reflect all participants' availability
        timetable = build_joint_tour_timetables(
            nth_tours, nth_participants,
            persons_timetable, alts)

        choices = \
            schedule_tours(nth_tours,
                           persons_merged, alts,
                           spec, segment,
                           model_settings,
                           timetable, timetable_window_id_col,
                           previous_tour_by_householdid, tour_owner_id_col,
                           chunk_size, tour_trace_label)

        # - update timetables of all joint tour participants
        # (each participant's windows are blocked by the tour's chosen time slot)
        persons_timetable.assign(
            nth_participants.person_id,
            reindex(choices, nth_participants.tour_id))

        choice_list.append(choices)

    choices = pd.concat(choice_list)

    # add the start, end, and duration from tdd_alts
    # assert (alts.index == list(range(alts.shape[0]))).all()
    # positional lookup relies on alts having a 0..n-1 RangeIndex (see assert above)
    tdd = pd.DataFrame(data=alts.values[choices.values],
                       columns=alts.columns,
                       index=choices.index)

    # tdd = alts.loc[choices]
    # tdd.index = choices.index

    # NOTE(review): index already set in the constructor above; this re-assignment looks redundant
    tdd.index = choices.index

    # include the index of the choice in the tdd alts table
    tdd['tdd'] = choices

    # print "participant windows after scheduling\n", \
    #     persons_timetable.slice_windows_by_row_id(joint_tour_participants.person_id)

    return tdd, persons_timetable
def trip_scheduling(trips, tours, chunk_size, trace_hh_id):
    """
    Trip scheduling assigns depart times for trips within the start, end limits of the tour.

    The algorithm is simplistic:

    The first outbound trip starts at the tour start time, and subsequent outbound trips are
    processed in trip_num order, to ensure that subsequent trips do not depart before the
    trip that preceeds them.

    Inbound trips are handled similarly, except in reverse order, starting with the last trip,
    and working backwards to ensure that inbound trips do not depart after the trip that
    succeeds them.

    The probability spec assigns probabilities for depart times, but those possible departs must
    be clipped to disallow depart times outside the tour limits, the departs of prior trips, and
    in the case of work tours, the start/end times of any atwork subtours.

    Scheduling can fail if the probability table assigns zero probabilities to all the available
    depart times in a trip's depart window. (This could be avoided by giving every window a small
    probability, rather than zero, but the existing mtctm1 prob spec does not do this. I believe
    this is due to the its having been generated from a small household travel survey sample
    that lacked any departs for some time periods.)

    Rescheduling the trips that fail (along with their inbound or outbound leg-mates) can
    sometimes fix this problem, if it was caused by an earlier trip's depart choice blocking
    a subsequent trip's ability to schedule a depart within the resulting window. But it can
    also happen if a tour is very short (e.g. one time period) and the prob spec having a zero
    probability for that tour hour.

    Therefore we need to handle trips that could not be scheduled. There are two ways
    (at least) to solve this problem:

    1) choose_most_initial
    simply assign a depart time to the trip, even if it has a zero probability. It makes
    most sense, in this case, to assign the 'most initial' depart time, so that subsequent
    trips are minimally impacted. This can be done in the final iteration, thus affecting
    only the trips that could no be scheduled by the standard approach

    2) drop_and_cleanup
    drop trips that could no be scheduled, and adjust their leg mates, as is done for failed
    trips in trip_destination.

    Which option is applied is determined by the FAILFIX model setting

    Parameters
    ----------
    trips : orca/pipeline table wrapper
        trips table (to_frame() is called on it below)
    tours : orca/pipeline table wrapper
        tours table, used for tour hour limits and for tour-based chunking
    chunk_size : int
        passed through to the adaptive chunker and run_trip_scheduling
    trace_hh_id : int or None
        household id to trace, passed through to run_trip_scheduling
    """
    trace_label = "trip_scheduling"
    model_settings_file_name = 'trip_scheduling.yaml'
    model_settings = config.read_model_settings(model_settings_file_name)

    trips_df = trips.to_frame()
    tours = tours.to_frame()

    # add columns 'tour_hour', 'earliest', 'latest' to trips
    set_tour_hour(trips_df, tours)

    # trip_scheduling is a probabilistic model ane we don't support estimation,
    # but we do need to override choices in estimation mode
    estimator = estimation.manager.begin_estimation('trip_scheduling')
    if estimator:
        estimator.write_spec(model_settings, tag='PROBS_SPEC')
        estimator.write_model_settings(model_settings, model_settings_file_name)
        chooser_cols_for_estimation = [
            'person_id', 'household_id', 'tour_id', 'trip_num', 'trip_count',
            'primary_purpose', 'outbound', 'earliest', 'latest', 'tour_hour',
        ]
        estimator.write_choosers(trips_df[chooser_cols_for_estimation])

    # probability lookup table (not a utility spec); '#' lines are comments
    probs_spec = pd.read_csv(
        config.config_file_path('trip_scheduling_probs.csv'), comment='#')
    # FIXME for now, not really doing estimation for probabilistic model - just overwriting choices
    # besides, it isn't clear that named coefficients would be helpful if we had some form of estimation
    # coefficients_df = simulate.read_model_coefficients(model_settings)
    # probs_spec = map_coefficients(probs_spec, coefficients_df)

    # add tour-based chunk_id so we can chunk all trips in tour together
    trips_df['chunk_id'] = reindex(
        pd.Series(list(range(len(tours))), tours.index), trips_df.tour_id)

    assert 'DEPART_ALT_BASE' in model_settings
    failfix = model_settings.get(FAILFIX, FAILFIX_DEFAULT)

    max_iterations = model_settings.get('MAX_ITERATIONS', 1)
    assert max_iterations > 0

    choices_list = []

    for chunk_i, trips_chunk, chunk_trace_label \
            in chunk.adaptive_chunked_choosers_by_chunk_id(trips_df, chunk_size, trace_label, trace_label):

        i = 0
        while (i < max_iterations) and not trips_chunk.empty:

            # only chunk log first iteration since memory use declines with each iteration
            with chunk.chunk_log(trace_label) if i == 0 else chunk.chunk_log_skip():

                i += 1
                is_last_iteration = (i == max_iterations)

                trace_label_i = tracing.extend_trace_label(trace_label, "i%s" % i)
                logger.info("%s scheduling %s trips within chunk %s",
                            trace_label_i, trips_chunk.shape[0], chunk_i)

                choices = \
                    run_trip_scheduling(
                        trips_chunk,
                        tours,
                        probs_spec,
                        model_settings,
                        estimator=estimator,
                        is_last_iteration=is_last_iteration,
                        trace_hh_id=trace_hh_id,
                        chunk_size=chunk_size,
                        chunk_tag=trace_label,
                        trace_label=trace_label_i)

                # boolean series of trips whose individual trip scheduling failed
                failed = choices.reindex(trips_chunk.index).isnull()
                logger.info("%s %s failed", trace_label_i, failed.sum())

                if not is_last_iteration:
                    # boolean series of trips whose leg scheduling failed
                    failed_cohorts = failed_trip_cohorts(trips_chunk, failed)
                    # retry only the failed cohorts; keep choices for the rest
                    trips_chunk = trips_chunk[failed_cohorts]
                    choices = choices[~failed_cohorts]

                choices_list.append(choices)

    # re-fetch full trips table (trips_df was chunked/consumed above)
    trips_df = trips.to_frame()

    choices = pd.concat(choices_list)
    choices = choices.reindex(trips_df.index)

    if estimator:
        estimator.write_choices(choices)
        choices = estimator.get_survey_values(choices, 'trips', 'depart')  # override choices
        estimator.write_override_choices(choices)
        estimator.end_estimation()
        # survey-override choices must be fully populated
        assert not choices.isnull().any()

    if choices.isnull().any():
        logger.warning("%s of %s trips could not be scheduled after %s iterations" %
                       (choices.isnull().sum(), trips_df.shape[0], i))

        if failfix != FAILFIX_DROP_AND_CLEANUP:
            raise RuntimeError("%s setting '%s' not enabled in settings" %
                               (FAILFIX, FAILFIX_DROP_AND_CLEANUP))

        # drop failed trips (and fix up their leg mates) rather than leaving null departs
        trips_df['failed'] = choices.isnull()
        trips_df = cleanup_failed_trips(trips_df)
        choices = choices.reindex(trips_df.index)

    trips_df['depart'] = choices

    assert not trips_df.depart.isnull().any()

    pipeline.replace_table("trips", trips_df)
def dest_density_index(tours, land_use):
    """Return the land_use density_index value looked up for each tour's destination zone."""
    destination_zones = tours.destination
    return reindex(land_use.density_index, destination_zones)
def workplace_location_logsums(persons_merged, land_use, skim_dict, skim_stack,
                               workplace_location_sample,
                               configs_dir, chunk_size, trace_hh_id):
    """
    add logsum column to existing workplace_location_sample table

    logsum is calculated by running the mode_choice model for each sample
    (person, dest_taz) pair in workplace_location_sample, and computing the
    logsum of all the utilities

    <added>
    PERID, dest_TAZ, rand, pick_count, logsum
    23750, 14, 0.565502716034, 4 1.85659498857
    23750, 16, 0.711135838871, 6 1.92315598631
    ...
    23751, 12, 0.408038878552, 1 2.40612135416
    23751, 14, 0.972732479292, 2 1.44009018355
    """

    trace_label = 'workplace_location_logsums'

    # mode choice logsum spec for the 'work' tour purpose
    logsums_spec = mode_choice_logsums_spec(configs_dir, 'work')

    workplace_location_settings = config.read_model_settings(configs_dir, 'workplace_location.yaml')

    # column in the sample table holding the sampled destination zone
    alt_col_name = workplace_location_settings["ALT_COL_NAME"]

    # FIXME - just using settings from tour_mode_choice
    logsum_settings = config.read_model_settings(configs_dir, 'tour_mode_choice.yaml')

    persons_merged = persons_merged.to_frame()
    workplace_location_sample = workplace_location_sample.to_frame()

    logger.info("Running workplace_location_sample with %s rows" % len(workplace_location_sample))

    # FIXME - MEMORY HACK - only include columns actually used in spec
    chooser_columns = workplace_location_settings['LOGSUM_CHOOSER_COLUMNS']
    persons_merged = persons_merged[chooser_columns]

    # left join keeps one chooser row per sample row (sample index may have duplicates)
    choosers = pd.merge(workplace_location_sample,
                        persons_merged,
                        left_index=True,
                        right_index=True,
                        how="left")

    # fixed assumed in/out time periods for the logsum mode choice
    choosers['in_period'] = time_period_label(workplace_location_settings['IN_PERIOD'])
    choosers['out_period'] = time_period_label(workplace_location_settings['OUT_PERIOD'])

    # FIXME - should do this in expression file?
    choosers['dest_topology'] = reindex(land_use.TOPOLOGY, choosers[alt_col_name])
    choosers['dest_density_index'] = reindex(land_use.density_index, choosers[alt_col_name])

    tracing.dump_df(DUMP, persons_merged, trace_label, 'persons_merged')
    tracing.dump_df(DUMP, choosers, trace_label, 'choosers')

    logsums = compute_logsums(choosers,
                              logsums_spec, logsum_settings,
                              skim_dict, skim_stack,
                              alt_col_name, chunk_size,
                              trace_hh_id, trace_label)

    # "add_column series should have an index matching the table to which it is being added"
    # when the index has duplicates, however, in the special case that the series index exactly
    # matches the table index, then the series value order is preserved
    # logsums now does, since workplace_location_sample was on left side of merge de-dup merge
    orca.add_column("workplace_location_sample", "mode_choice_logsum", logsums)
def mandatory_tour_scheduling(tours, persons_merged, tdd_alts, chunk_size, trace_hh_id):
    """
    This model predicts the departure time and duration of each activity for mandatory tours

    Reads mandatory_tour_scheduling.yaml (plus its LOGSUM_SETTINGS file), runs
    vectorized tour scheduling per tour segment, optionally overrides choices in
    estimation mode, and writes the 'tours' table and timetable back to the pipeline.
    """
    trace_label = 'mandatory_tour_scheduling'
    model_settings_file_name = 'mandatory_tour_scheduling.yaml'
    estimators = {}

    model_settings = config.read_model_settings(model_settings_file_name)
    logsum_settings = config.read_model_settings(model_settings['LOGSUM_SETTINGS'])

    tours = tours.to_frame()
    mandatory_tours = tours[tours.tour_category == 'mandatory']

    # - if no mandatory_tours
    if mandatory_tours.shape[0] == 0:
        tracing.no_results(trace_label)
        return

    persons_merged = persons_merged.to_frame()

    # - filter chooser columns for both logsums and simulate
    logsum_columns = logsum_settings.get('LOGSUM_CHOOSER_COLUMNS', [])
    model_columns = model_settings.get('SIMULATE_CHOOSER_COLUMNS', [])
    # union of the two column lists, preserving order, without duplicates
    chooser_columns = logsum_columns + [c for c in model_columns if c not in logsum_columns]
    persons_merged = expressions.filter_chooser_columns(persons_merged, chooser_columns)

    # - add tour segmentation column
    # mtctm1 segments mandatory_scheduling spec by tour_type
    # (i.e. there are different specs for work and school tour_types)
    # mtctm1 logsum coefficients are segmented by primary_purpose
    # (i.e. there are different locsum coefficents for work, school, univ primary_purposes
    # for simplicity managing these different segmentation schemes,
    # we conflate them by segmenting the skims to align with primary_purpose
    tour_segment_col = 'mandatory_tour_seg'
    assert tour_segment_col not in mandatory_tours
    # school tours made by university students are relabeled 'univ'
    is_university_tour = \
        (mandatory_tours.tour_type == 'school') & \
        reindex(persons_merged.is_university, mandatory_tours.person_id)
    mandatory_tours[tour_segment_col] = \
        mandatory_tours.tour_type.where(~is_university_tour, 'univ')

    # load specs
    spec_segment_settings = model_settings.get('SPEC_SEGMENTS', {})
    specs = {}
    estimators = {}
    for spec_segment_name, spec_settings in spec_segment_settings.items():

        # estimator for this tour_segment
        estimator = estimation.manager.begin_estimation(
            model_name='mandatory_tour_scheduling_%s' % spec_segment_name,
            bundle_name='mandatory_tour_scheduling')

        spec_file_name = spec_settings['SPEC']
        model_spec = simulate.read_model_spec(file_name=spec_file_name)
        coefficients_df = simulate.read_model_coefficients(spec_segment_settings[spec_segment_name])
        specs[spec_segment_name] = simulate.eval_coefficients(model_spec, coefficients_df, estimator)

        if estimator:
            estimators[spec_segment_name] = estimator  # add to local list
            estimator.write_model_settings(model_settings, model_settings_file_name)
            estimator.write_spec(spec_settings)
            estimator.write_coefficients(coefficients_df)

    # - spec dict segmented by primary_purpose
    tour_segment_settings = model_settings.get('TOUR_SPEC_SEGMENTS', {})
    tour_segments = {}
    for tour_segment_name, spec_segment_name in tour_segment_settings.items():
        tour_segments[tour_segment_name] = {}
        tour_segments[tour_segment_name]['spec_segment_name'] = spec_segment_name
        tour_segments[tour_segment_name]['spec'] = specs[spec_segment_name]
        tour_segments[tour_segment_name]['estimator'] = estimators.get(spec_segment_name)

    timetable = inject.get_injectable("timetable")

    # open a timetable transaction so estimation overrides can roll it back below
    if estimators:
        timetable.begin_transaction(list(estimators.values()))

    logger.info("Running mandatory_tour_scheduling with %d tours", len(tours))
    choices = vts.vectorize_tour_scheduling(
        mandatory_tours, persons_merged,
        tdd_alts, timetable,
        tour_segments=tour_segments, tour_segment_col=tour_segment_col,
        model_settings=model_settings,
        chunk_size=chunk_size,
        trace_label=trace_label)

    if estimators:
        # overrride choices for all estimators
        choices_list = []
        for spec_segment_name, estimator in estimators.items():
            model_choices = choices[(mandatory_tours.tour_type == spec_segment_name)]

            # FIXME vectorize_tour_scheduling calls used to write_choices but perhaps shouldn't
            estimator.write_choices(model_choices)
            override_choices = estimator.get_survey_values(model_choices, 'tours', 'tdd')
            estimator.write_override_choices(override_choices)

            choices_list.append(override_choices)
            estimator.end_estimation()
        choices = pd.concat(choices_list)

        # update timetable to reflect the override choices (assign tours in tour_num order)
        timetable.rollback()
        for tour_num, nth_tours in tours.groupby('tour_num', sort=True):
            timetable.assign(window_row_ids=nth_tours['person_id'],
                             tdds=choices.reindex(nth_tours.index))

    # choices are tdd alternative ids
    # we want to add start, end, and duration columns to tours, which we have in tdd_alts table
    choices = pd.merge(choices.to_frame('tdd'), tdd_alts, left_on=['tdd'], right_index=True, how='left')

    assign_in_place(tours, choices)
    pipeline.replace_table("tours", tours)

    timetable.replace_table()

    # updated df for tracing
    mandatory_tours = tours[tours.tour_category == 'mandatory']

    tracing.dump_df(DUMP,
                    tt.tour_map(persons_merged, mandatory_tours, tdd_alts),
                    trace_label, 'tour_map')

    if trace_hh_id:
        tracing.trace_df(mandatory_tours,
                         label="mandatory_tour_scheduling",
                         slicer='person_id',
                         index_label='tour',
                         columns=None,
                         warn_if_empty=True)
def home_is_urban(households, land_use, settings):
    """Boolean series: True where a household's home zone area_type is below the urban threshold."""
    home_area_type = reindex(land_use.area_type, households.home_taz)
    urban_threshold = settings['urban_threshold']
    return home_area_type < urban_threshold
def school_location_logsums(persons_merged, land_use, skim_dict, skim_stack,
                            school_location_sample,
                            configs_dir, chunk_size, trace_hh_id):
    """
    add logsum column to existing school_location_sample table

    logsum is calculated by running the mode_choice model for each sample
    (person, dest_taz) pair in school_location_sample, and computing the
    logsum of all the utilities

    +-------+--------------+----------------+------------+----------------+
    | PERID | dest_TAZ     | rand           | pick_count | logsum (added) |
    +=======+==============+================+============+================+
    | 23750 | 14           | 0.565502716034 | 4          | 1.85659498857  |
    +-------+--------------+----------------+------------+----------------+
    + 23750 | 16           | 0.711135838871 | 6          | 1.92315598631  |
    +-------+--------------+----------------+------------+----------------+
    + ...   |              |                |            |                |
    +-------+--------------+----------------+------------+----------------+
    | 23751 | 12           | 0.408038878552 | 1          | 2.40612135416  |
    +-------+--------------+----------------+------------+----------------+
    | 23751 | 14           | 0.972732479292 | 2          | 1.44009018355  |
    +-------+--------------+----------------+------------+----------------+
    """

    trace_label = 'school_location_logsums'

    school_location_settings = config.read_model_settings(configs_dir, 'school_location.yaml')

    # column in the sample table holding the sampled destination zone
    alt_col_name = school_location_settings["ALT_COL_NAME"]
    chooser_col_name = 'TAZ'

    # FIXME - just using settings from tour_mode_choice
    logsum_settings = config.read_model_settings(configs_dir, 'tour_mode_choice.yaml')

    persons_merged = persons_merged.to_frame()
    school_location_sample = school_location_sample.to_frame()

    logger.info("Running school_location_sample with %s rows" % len(school_location_sample))

    # FIXME - MEMORY HACK - only include columns actually used in spec
    chooser_columns = school_location_settings['LOGSUM_CHOOSER_COLUMNS']
    persons_merged = persons_merged[chooser_columns]

    tracing.dump_df(DUMP, persons_merged, trace_label, 'persons_merged')

    # compute logsums per school_type segment, each with its own logsum spec
    logsums_list = []
    for school_type in ['university', 'highschool', 'gradeschool']:

        logsums_spec = mode_choice_logsums_spec(configs_dir, school_type)

        choosers = school_location_sample[school_location_sample['school_type'] == school_type]

        # left join keeps one chooser row per sample row (sample index may have duplicates)
        choosers = pd.merge(choosers,
                            persons_merged,
                            left_index=True,
                            right_index=True,
                            how="left")

        # fixed assumed in/out time periods for the logsum mode choice
        choosers['in_period'] = skim_time_period_label(school_location_settings['IN_PERIOD'])
        choosers['out_period'] = skim_time_period_label(school_location_settings['OUT_PERIOD'])

        # FIXME - should do this in expression file?
        choosers['dest_topology'] = reindex(land_use.TOPOLOGY, choosers[alt_col_name])
        choosers['dest_density_index'] = reindex(land_use.density_index, choosers[alt_col_name])

        tracing.dump_df(DUMP, choosers,
                        tracing.extend_trace_label(trace_label, school_type),
                        'choosers')

        logsums = compute_logsums(
            choosers, logsums_spec, logsum_settings,
            skim_dict, skim_stack,
            chooser_col_name, alt_col_name, chunk_size, trace_hh_id,
            tracing.extend_trace_label(trace_label, school_type))

        logsums_list.append(logsums)

    logsums = pd.concat(logsums_list)

    # add_column series should have an index matching the table to which it is being added
    # logsums does, since school_location_sample was on left side of merge creating choosers
    inject.add_column("school_location_sample", "mode_choice_logsum", logsums)
def home_is_rural(households, land_use, settings):
    """Boolean series: True where a household's home zone area_type exceeds the rural threshold."""
    home_area_type = reindex(land_use.area_type, households.home_taz)
    rural_threshold = settings['rural_threshold']
    return home_area_type > rural_threshold