def choose_intermediate_trip_purpose(trips, probs_spec, trace_hh_id, trace_label):
    """
    Choose a purpose for each intermediate trip based on probs_spec.

    probs_spec assigns relative weights to the possible purpose choices. The
    weights are normalized here so that each row sums to 1 before choosing.

    Parameters
    ----------
    trips : pandas.DataFrame
        expected to be indexed by trip_id, with the join columns
        ('primary_purpose', 'outbound', 'person_type') and a 'start' column
    probs_spec : pandas.DataFrame
        the join columns, 'depart_range_start', 'depart_range_end', and one
        relative-weight column per purpose (every other column is treated as a
        purpose). The caller's frame is left unmodified.
    trace_hh_id
        tracing is skipped when this is falsy
    trace_label : str

    Returns
    -------
    purpose : pandas.Series of purpose (str) indexed by trip_id
    """

    probs_join_cols = ['primary_purpose', 'outbound', 'person_type']
    non_purpose_cols = probs_join_cols + ['depart_range_start', 'depart_range_end']

    purpose_cols = [c for c in probs_spec.columns if c not in non_purpose_cols]

    num_trips = len(trips.index)
    have_trace_targets = trace_hh_id and tracing.has_trace_targets(trips)

    # probs should sum to 1 across rows
    # (normalize a copy so the caller's probs_spec is not changed as a side effect)
    probs_spec = probs_spec.copy()
    sum_probs = probs_spec[purpose_cols].sum(axis=1)
    probs_spec.loc[:, purpose_cols] = probs_spec.loc[:, purpose_cols].div(sum_probs, axis=0)

    # left join trips to probs (there may be multiple rows per trip for multiple depart ranges)
    choosers = pd.merge(trips.reset_index(), probs_spec, on=probs_join_cols,
                        how='left').set_index('trip_id')

    chunk.log_df(trace_label, 'choosers', choosers)

    # select the matching depart range (this should result in exactly one chooser row per trip)
    in_depart_range = \
        (choosers.start >= choosers['depart_range_start']) & \
        (choosers.start <= choosers['depart_range_end'])
    choosers = choosers[in_depart_range]

    # choosers should now match trips row for row
    assert choosers.index.is_unique
    assert len(choosers.index) == num_trips

    choices, rands = logit.make_choices(
        choosers[purpose_cols],
        trace_label=trace_label, trace_choosers=choosers)

    if have_trace_targets:
        tracing.trace_df(choices, '%s.choices' % trace_label, columns=[None, 'trip_purpose'])
        tracing.trace_df(rands, '%s.rands' % trace_label, columns=[None, 'rand'])

    # make_choices returns the 0-based position of the chosen column; map it to the purpose name
    choices = choices.map(pd.Series(purpose_cols))
    return choices
def extend_tour_counts(persons, tour_counts, alternatives, trace_hh_id, trace_label):
    """
    Randomly add extra tours to tour_counts using the extension probability table.

    Counts are only extended for persons whose original total tour count is
    between 1 and 4, and a tour type is only extended for persons whose count
    of that type is already at the maximum offered by the alternatives
    (e.g. 2 for escort, 1 otherwise) - so escort might be increased to 3 or 4
    and other tour types might be increased to 2 or 3.

    Parameters
    ----------
    persons : pandas.DataFrame
        supplies the join columns; shares its index name with tour_counts
    tour_counts : pandas.DataFrame
        one row per person, one column per tour_type (updated in place), e.g.

                     escort  shopping  othmaint  othdiscr    eatout    social
        parent_id
        2588676           2         0         0         1         1         0
        2588677           0         1         0         1         0         0
    alternatives : pandas.DataFrame
        alternatives from nmtv interaction_simulate; only needed to know the
        max possible frequency for each tour type
    trace_hh_id
    trace_label : str

    Returns
    -------
    tour_counts : pandas.DataFrame
        the same tour_counts frame, with extended counts
    """

    assert tour_counts.index.name == persons.index.name

    PROBABILITY_COLUMNS = ['0_tours', '1_tours', '2_tours']
    JOIN_COLUMNS = ['ptype', 'has_mandatory_tour', 'has_joint_tour']
    TOUR_TYPE_COL = 'nonmandatory_tour_type'

    probs_spec = extension_probs()
    persons = persons[JOIN_COLUMNS]

    # only persons starting with 1 - 4 non_mandatory tours are candidates for extension
    total_tours = tour_counts.sum(axis=1)
    eligible = total_tours.between(1, 4)
    if not eligible.any():
        logger.info("extend_tour_counts - no persons eligible for tour_count extension")
        return tour_counts

    have_trace_targets = trace_hh_id and tracing.has_trace_targets(eligible)

    # probs_spec nonmandatory_tour_type column is 1-based, hence start=1
    for tour_type_num, tour_type in enumerate(alternatives.columns, start=1):

        type_trace_label = tracing.extend_trace_label(trace_label, tour_type)

        # - a tour type is only extended when its frequency is the max possible for that type
        max_frequency = alternatives[tour_type].max()
        at_max = eligible & (tour_counts[tour_type] == max_frequency)
        maxed_idx = tour_counts.index[at_max]

        if maxed_idx.empty:
            continue

        # - look up the extension probs for this tour_type
        type_probs = probs_spec[probs_spec[TOUR_TYPE_COL] == tour_type_num]
        choosers = pd.merge(persons.loc[maxed_idx], type_probs, on=JOIN_COLUMNS, how='left')
        choosers = choosers.set_index(maxed_idx)
        assert choosers.index.name == tour_counts.index.name

        # - random choice of extension magnitude based on relative probs
        choices, rands = logit.make_choices(
            choosers[PROBABILITY_COLUMNS],
            trace_label=type_trace_label,
            trace_choosers=choosers)

        # - the 0-based alternative chosen is itself the number of tours to add
        if choices.any():
            tour_counts.loc[choices.index, tour_type] += choices

        if have_trace_targets:
            choices_label = tracing.extend_trace_label(type_trace_label, 'choices')
            tracing.trace_df(choices, choices_label, columns=[None, 'choice'])
            rands_label = tracing.extend_trace_label(type_trace_label, 'rands')
            tracing.trace_df(rands, rands_label, columns=[None, 'rand'])

    return tour_counts
def extra_hh_member_choices(persons, cdap_fixed_relative_proportions, locals_d,
                            trace_hh_id, trace_label):
    """
    Choose activities for the 'extra' household members that cdap did not handle.

    Following the CTRAMP HouseholdCoordinatedDailyActivityPatternModel, "a separate,
    simple cross-sectional distribution is looked up for the remaining household members"

    The cdap_fixed_relative_proportions spec is treated like an activitysim logit
    utility spec, EXCEPT that the computed values are relative proportions rather
    than utilities (i.e. they are normalized to probabilities summing to 1.0
    without being exponentiated first).

    Parameters
    ----------
    persons : pandas.DataFrame
        Table of persons data indexed on _persons_index_
        We expect, at least, columns [_hh_id_, _ptype_]
        (this function itself reads the 'cdap_rank' column)
    cdap_fixed_relative_proportions
        spec giving the relative proportions of each activity (M, N, H) to use
        when choosing activities for household members not handled by CDAP
    locals_d : Dict
        local variables that eval_variables adds to the environment when
        evaluating an expression that begins with @
    trace_hh_id
        the proportions, probs, choices and rands are traced when this is truthy
    trace_label : str

    Returns
    -------
    choices : pandas.Series
        alternative chosen for each extra member, indexed by _persons_index_
        (an empty float64 Series when there are no extra members)
    """

    trace_label = tracing.extend_trace_label(trace_label, 'extra_hh_member_choices')

    # extra household members are those ranked beyond MAX_HHSIZE
    is_extra_member = persons['cdap_rank'] > MAX_HHSIZE
    choosers = persons[is_extra_member]
    if not len(choosers.index):
        return pd.Series(dtype='float64')

    # evaluate the spec expressions for each chooser
    values = simulate.eval_variables(cdap_fixed_relative_proportions.index, choosers, locals_d)

    # the spec yields relative proportions by ptype (not utilities) ...
    proportions = values.dot(cdap_fixed_relative_proportions)

    # ... so probabilities are just the proportions scaled to sum to 1 across each row
    row_totals = proportions.sum(axis=1)
    probs = proportions.div(row_totals, axis=0)

    # idx_choices holds, for each person, the 0-based position of the chosen probs column
    idx_choices, rands = logit.make_choices(probs, trace_label=trace_label)

    # translate column positions into activity names
    chosen_names = probs.columns[idx_choices].values
    choices = pd.Series(chosen_names, index=probs.index)

    if trace_hh_id:
        per_person_traces = (
            (proportions, '%s.extra_hh_member_choices_proportions'),
            (probs, '%s.extra_hh_member_choices_probs'),
            (choices, '%s.extra_hh_member_choices_choices'),
        )
        for traced, label_template in per_person_traces:
            tracing.trace_df(traced, label_template % trace_label,
                             column_labels=['expression', 'person'])

        tracing.trace_df(rands, '%s.extra_hh_member_choices_rands' % trace_label,
                         columns=[None, 'rand'])

    return choices
def household_activity_choices(indiv_utils, interaction_coefficients, hhsize,
                               trace_hh_id=None, trace_label=None):
    """
    Calculate household utilities for each activity pattern alternative for households
    of hhsize, and choose a pattern for each household.

    The resulting activity pattern for each household will be coded as a string of
    activity codes, e.g. 'MNHH' for a 4 person household with activities
    Mandatory, NonMandatory, Home, Home

    Parameters
    ----------
    indiv_utils : pandas.DataFrame
        CDAP utilities for each individual, ignoring interactions
        indiv_utils has index of _persons_index_ and a column for each alternative
        i.e. three columns 'M' (Mandatory), 'N' (NonMandatory), 'H' (Home)
    interaction_coefficients : pandas.DataFrame
        Rules and coefficients for generating interaction specs for different household sizes
    hhsize : int
        the size of household for which the activity pattern should be calculated
        (1..MAX_HHSIZE)
    trace_hh_id
        choosers (hhsize > 1 only), utils, probs, choices and rands are traced when truthy
    trace_label : str

    Returns
    -------
    choices : pandas.Series
        the chosen cdap activity pattern for each household represented as a string
        (e.g. 'MNH') with same index (_hh_index_) as utils
        (an empty float64 Series when there are no households of this size)
    """

    if hhsize == 1:
        # for 1 person households, there are no interactions to account for
        # and the household utils are the same as the individual utils
        choosers = None
        # extract the individual utilities for individuals from hhsize 1 households
        utils = indiv_utils.loc[indiv_utils[_hh_size_] == 1, [_hh_id_, 'M', 'N', 'H']]
        # index on household_id, not person_id
        set_hh_index(utils)
    else:
        choosers = hh_choosers(indiv_utils, hhsize=hhsize)

        spec = build_cdap_spec(interaction_coefficients, hhsize,
                               trace_spec=(trace_hh_id in choosers.index),
                               trace_label=trace_label)

        utils = simulate.eval_utilities(spec, choosers, trace_label=trace_label)

    if len(utils.index) == 0:
        return pd.Series(dtype='float64')

    probs = logit.utils_to_probs(utils, trace_label=trace_label)

    # select an activity pattern alternative for each household based on probability
    # result is a series indexed on _hh_index_ with the (0 based) index of the column from probs
    idx_choices, rands = logit.make_choices(probs, trace_label=trace_label)

    # convert choice expressed as index into alternative name from util column label
    choices = pd.Series(utils.columns[idx_choices].values, index=utils.index)

    if trace_hh_id:

        # choosers only exist when interactions were evaluated (hhsize > 1)
        if hhsize > 1:
            tracing.trace_df(choosers, '%s.hhsize%d_choosers' % (trace_label, hhsize),
                             column_labels=['expression', 'person'])

        tracing.trace_df(utils, '%s.hhsize%d_utils' % (trace_label, hhsize),
                         column_labels=['expression', 'household'])
        tracing.trace_df(probs, '%s.hhsize%d_probs' % (trace_label, hhsize),
                         column_labels=['expression', 'household'])
        tracing.trace_df(choices, '%s.hhsize%d_activity_choices' % (trace_label, hhsize),
                         column_labels=['expression', 'household'])
        tracing.trace_df(rands, '%s.hhsize%d_rands' % (trace_label, hhsize),
                         columns=[None, 'rand'])

    return choices
def participants_chooser(probs, choosers, spec, trace_label):
    """
    custom alternative to logit.make_choices for simulate.simple_simulate

    Choosing participants for mixed tours is trickier than adult or child tours
    because we need at least one adult and one child participant in a mixed tour.
    We call logit.make_choices and then check to see if the tour satisfies this
    requirement, and rechoose for any that fail until all are satisfied.

    In principle, this should always occur eventually, but we fail after MAX_ITERATIONS,
    just in case there is some failure in program logic (haven't seen this occur.)

    Parameters
    ----------
    probs : pandas.DataFrame
        Rows for choosers and columns for the alternatives from which they
        are choosing. Values are expected to be valid probabilities across
        each row, e.g. they should sum to 1.
    choosers : pandas.DataFrame
        simple_simulate choosers df (must have the same index as probs)
    spec : pandas.DataFrame
        simple_simulate spec df
        We only need spec so we can know the column index of the 'participate' alternative
        indicating that the participant has been chosen to participate in the tour
    trace_label : str

    Returns - same as logit.make_choices
    -------
    choices, rands
        choices, rands as returned by logit.make_choices (in same order as probs)

    Raises
    ------
    AssertionError
        if some tours are still unsatisfied after MAX_ITERATIONS rounds of choosing
    """

    assert probs.index.equals(choosers.index)

    # choice is boolean (participate or not)
    model_settings = config.read_model_settings('joint_tour_participation.yaml')

    choice_col = model_settings.get('participation_choice', 'participate')
    assert choice_col in spec.columns, \
        "couldn't find participation choice column '%s' in spec" % choice_col
    PARTICIPATE_CHOICE = spec.columns.get_loc(choice_col)
    MAX_ITERATIONS = model_settings.get('max_participation_choice_iterations', 5000)

    trace_label = tracing.extend_trace_label(trace_label, 'participants_chooser')

    candidates = choosers.copy()
    choices_list = []
    rands_list = []

    num_tours_remaining = len(candidates.tour_id.unique())
    logger.info('%s %s joint tours to satisfy.', trace_label, num_tours_remaining,)

    iteration = 0
    while candidates.shape[0] > 0:

        iteration += 1

        if iteration > MAX_ITERATIONS:
            logger.warning('%s max iterations exceeded (%s).', trace_label, MAX_ITERATIONS)
            diagnostic_cols = ['tour_id', 'household_id', 'composition', 'adult']
            unsatisfied_candidates = candidates[diagnostic_cols].join(probs)
            tracing.write_csv(unsatisfied_candidates,
                              file_name='%s.UNSATISFIED' % trace_label, transpose=False)
            print(unsatisfied_candidates.head(20))
            # raised explicitly (rather than `assert False`) so the loop still
            # terminates when python runs with -O and assert statements are stripped
            raise AssertionError(
                '%s max iterations exceeded (%s).' % (trace_label, MAX_ITERATIONS))

        choices, rands = logit.make_choices(probs, trace_label=trace_label,
                                            trace_choosers=choosers)
        participate = (choices == PARTICIPATE_CHOICE)

        # satisfaction indexed by tour_id
        tour_satisfaction = get_tour_satisfaction(candidates, participate)
        num_tours_satisfied_this_iter = tour_satisfaction.sum()

        if num_tours_satisfied_this_iter > 0:

            num_tours_remaining -= num_tours_satisfied_this_iter

            # broadcast per-tour satisfaction onto the candidate rows of each tour
            satisfied = reindex(tour_satisfaction, candidates.tour_id)

            choices_list.append(choices[satisfied])
            rands_list.append(rands[satisfied])

            # remove candidates of satisfied tours
            probs = probs[~satisfied]
            candidates = candidates[~satisfied]

        logger.info('%s iteration %s : %s joint tours satisfied %s remaining',
                    trace_label, iteration, num_tours_satisfied_this_iter, num_tours_remaining)

    # reindex choices and rands to match the order of probs and choosers
    choices = pd.concat(choices_list).reindex(choosers.index)
    rands = pd.concat(rands_list).reindex(choosers.index)

    assert choices.index.equals(choosers.index)
    assert rands.index.equals(choosers.index)

    logger.info('%s %s iterations to satisfy all joint tours.', trace_label, iteration,)

    return choices, rands
def extra_hh_member_choices(persons, cdap_fixed_relative_proportions, locals_d,
                            trace_hh_id, trace_label):
    """
    Generate the activity choices for the 'extra' household members who weren't handled by cdap

    Following the CTRAMP HouseholdCoordinatedDailyActivityPatternModel, "a separate,
    simple cross-sectional distribution is looked up for the remaining household members"

    The cdap_fixed_relative_proportions spec is handled like an activitysim logit utility spec,
    EXCEPT that the values computed are relative proportions, not utilities
    (i.e. values are not exponentiated before being normalized to probabilities summing to 1.0)

    Parameters
    ----------
    persons : pandas.DataFrame
        Table of persons data indexed on _persons_index_
        We expect, at least, columns [_hh_id_, _ptype_]
        (this function itself reads the 'cdap_rank' column)
    cdap_fixed_relative_proportions
        spec to compute/specify the relative proportions of each activity (M, N, H)
        that should be used to choose activities for additional household members
        not handled by CDAP.
    locals_d : Dict
        dictionary of local variables that eval_variables adds to the environment
        for an evaluation of an expression that begins with @
    trace_hh_id
        the proportions, probs, choices and rands are traced when this is truthy
    trace_label : str

    Returns
    -------
    choices : pandas.Series
        list of alternatives chosen for all extra members, indexed by _persons_index_
        (an empty float64 Series when there are no extra members)
    """

    trace_label = tracing.extend_trace_label(trace_label, 'extra_hh_member_choices')

    # extra household members have cdap_rank > MAX_HHSIZE
    choosers = persons[persons['cdap_rank'] > MAX_HHSIZE]

    if len(choosers.index) == 0:
        # explicit dtype: a bare pd.Series() warns on pandas 1.x and its default
        # dtype differs between pandas versions
        return pd.Series(dtype='float64')

    # eval the expression file
    values = simulate.eval_variables(cdap_fixed_relative_proportions.index, choosers, locals_d)

    # cdap_fixed_relative_proportions computes relative proportions by ptype, not utilities
    proportions = values.dot(cdap_fixed_relative_proportions)

    # convert relative proportions to probability
    probs = proportions.div(proportions.sum(axis=1), axis=0)

    # select an activity pattern alternative for each person based on probability
    # idx_choices is a series (indexed on _persons_index_ ) with the chosen alternative
    # represented as the integer (0 based) index of the chosen column from probs
    idx_choices, rands = logit.make_choices(probs, trace_label=trace_label)

    # convert choice from column index to activity name
    choices = pd.Series(probs.columns[idx_choices].values, index=probs.index)

    if trace_hh_id:
        tracing.trace_df(proportions, '%s.extra_hh_member_choices_proportions' % trace_label,
                         column_labels=['expression', 'person'])
        tracing.trace_df(probs, '%s.extra_hh_member_choices_probs' % trace_label,
                         column_labels=['expression', 'person'])
        tracing.trace_df(choices, '%s.extra_hh_member_choices_choices' % trace_label,
                         column_labels=['expression', 'person'])
        tracing.trace_df(rands, '%s.extra_hh_member_choices_rands' % trace_label,
                         columns=[None, 'rand'])

    return choices
def household_activity_choices(indiv_utils, interaction_coefficients, hhsize,
                               trace_hh_id=None, trace_label=None):
    """
    Calculate household utilities for each activity pattern alternative for households
    of hhsize, and choose a pattern for each household.

    The resulting activity pattern for each household will be coded as a string of
    activity codes, e.g. 'MNHH' for a 4 person household with activities
    Mandatory, NonMandatory, Home, Home

    Parameters
    ----------
    indiv_utils : pandas.DataFrame
        CDAP utilities for each individual, ignoring interactions
        indiv_utils has index of _persons_index_ and a column for each alternative
        i.e. three columns 'M' (Mandatory), 'N' (NonMandatory), 'H' (Home)
    interaction_coefficients : pandas.DataFrame
        Rules and coefficients for generating interaction specs for different household sizes
    hhsize : int
        the size of household for which the activity pattern should be calculated
        (1..MAX_HHSIZE)
    trace_hh_id
        choosers (hhsize > 1 only), utils, probs, choices and rands are traced when truthy
    trace_label : str

    Returns
    -------
    choices : pandas.Series
        the chosen cdap activity pattern for each household represented as a string
        (e.g. 'MNH') with same index (_hh_index_) as utils
        (an empty float64 Series when there are no households of this size)
    """

    if hhsize == 1:
        # for 1 person households, there are no interactions to account for
        # and the household utils are the same as the individual utils
        choosers = None
        # extract the individual utilities for individuals from hhsize 1 households
        utils = indiv_utils.loc[indiv_utils[_hh_size_] == 1, [_hh_id_, 'M', 'N', 'H']]
        # index on household_id, not person_id
        set_hh_index(utils)
    else:
        choosers = hh_choosers(indiv_utils, hhsize=hhsize)

        spec = build_cdap_spec(interaction_coefficients, hhsize,
                               trace_spec=(trace_hh_id in choosers.index),
                               trace_label=trace_label)

        utils = simulate.eval_utilities(spec, choosers, trace_label=trace_label)

    if len(utils.index) == 0:
        # explicit dtype: a bare pd.Series() warns on pandas 1.x and its default
        # dtype differs between pandas versions
        return pd.Series(dtype='float64')

    probs = logit.utils_to_probs(utils, trace_label=trace_label)

    # select an activity pattern alternative for each household based on probability
    # result is a series indexed on _hh_index_ with the (0 based) index of the column from probs
    idx_choices, rands = logit.make_choices(probs, trace_label=trace_label)

    # convert choice expressed as index into alternative name from util column label
    choices = pd.Series(utils.columns[idx_choices].values, index=utils.index)

    if trace_hh_id:

        # choosers only exist when interactions were evaluated (hhsize > 1)
        if hhsize > 1:
            tracing.trace_df(choosers, '%s.hhsize%d_choosers' % (trace_label, hhsize),
                             column_labels=['expression', 'person'])

        tracing.trace_df(utils, '%s.hhsize%d_utils' % (trace_label, hhsize),
                         column_labels=['expression', 'household'])
        tracing.trace_df(probs, '%s.hhsize%d_probs' % (trace_label, hhsize),
                         column_labels=['expression', 'household'])
        tracing.trace_df(choices, '%s.hhsize%d_activity_choices' % (trace_label, hhsize),
                         column_labels=['expression', 'household'])
        tracing.trace_df(rands, '%s.hhsize%d_rands' % (trace_label, hhsize),
                         columns=[None, 'rand'])

    return choices
def choose_intermediate_trip_purpose(trips, probs_spec, estimator, probs_join_cols,
                                     use_depart_time, trace_hh_id, trace_label):
    """
    Choose a purpose for each intermediate trip based on probs_spec.

    probs_spec assigns relative weights to the possible purpose choices. The
    weights are normalized here so that each row sums to 1 before choosing.

    Parameters
    ----------
    trips : pandas.DataFrame
        expected to be indexed by trip_id, with the probs_join_cols columns
        (plus 'start', and 'person_id' for diagnostics, when use_depart_time is set)
    probs_spec : pandas.DataFrame
        the join columns, optionally 'depart_range_start' and 'depart_range_end',
        and one relative-weight column per purpose (every other column is treated
        as a purpose). The caller's frame is left unmodified.
    estimator
        when truthy, the matched probs rows are written to its 'probs' table
    probs_join_cols : list of str
        columns on which trips are joined to probs_spec
    use_depart_time : bool
        when truthy, probs_spec rows are also matched on the depart range
        containing the trip's start
    trace_hh_id
        tracing is skipped when this is falsy
    trace_label : str

    Returns
    -------
    purpose : pandas.Series of purpose (str) indexed by trip_id

    Raises
    ------
    RuntimeError
        if use_depart_time is set and some trips cannot be matched to a probs_spec row
    """

    non_purpose_cols = probs_join_cols.copy()
    if use_depart_time:
        non_purpose_cols += ['depart_range_start', 'depart_range_end']

    purpose_cols = [c for c in probs_spec.columns if c not in non_purpose_cols]

    num_trips = len(trips.index)
    have_trace_targets = trace_hh_id and tracing.has_trace_targets(trips)

    # probs should sum to 1 across rows
    # (normalize a copy so the caller's probs_spec is not changed as a side effect)
    probs_spec = probs_spec.copy()
    sum_probs = probs_spec[purpose_cols].sum(axis=1)
    probs_spec.loc[:, purpose_cols] = probs_spec.loc[:, purpose_cols].div(sum_probs, axis=0)

    # left join trips to probs (there may be multiple rows per trip for multiple depart ranges)
    choosers = pd.merge(trips.reset_index(), probs_spec, on=probs_join_cols,
                        how='left').set_index('trip_id')
    chunk.log_df(trace_label, 'choosers', choosers)

    if use_depart_time:

        # flag the chooser rows whose depart range contains the trip start
        # (this should leave exactly one chooser row per trip)
        chooser_probs = \
            (choosers.start >= choosers['depart_range_start']) & \
            (choosers.start <= choosers['depart_range_end'])

        # if we failed to match a row in probs_spec
        if chooser_probs.sum() < num_trips:

            # this can happen if the spec doesn't have probs for a trip's probs_join_cols
            missing_trip_ids = \
                trips.index[~trips.index.isin(choosers.index[chooser_probs])].values
            unmatched_choosers = choosers[choosers.index.isin(missing_trip_ids)]
            unmatched_choosers = unmatched_choosers[['person_id', 'start'] + non_purpose_cols]

            # join to persons for better diagnostics
            persons = inject.get_table('persons').to_frame()
            persons_cols = ['age', 'is_worker', 'is_student', 'is_gradeschool',
                            'is_highschool', 'is_university']
            available_persons_cols = [col for col in persons_cols if col in persons.columns]
            unmatched_choosers = pd.merge(unmatched_choosers, persons[available_persons_cols],
                                          left_on='person_id', right_index=True, how='left')

            file_name = '%s.UNMATCHED_PROBS' % trace_label
            logger.error(
                "%s %s of %s intermediate trips could not be matched to probs "
                "based on join columns %s" %
                (trace_label, len(unmatched_choosers), len(choosers), probs_join_cols))
            logger.info("Writing %s unmatched choosers to %s" %
                        (len(unmatched_choosers), file_name,))
            tracing.write_csv(unmatched_choosers, file_name=file_name, transpose=False)
            raise RuntimeError(
                "Some trips could not be matched to probs based on join columns %s."
                % probs_join_cols)

        # keep only the row for the matching depart range
        choosers = choosers[chooser_probs]

    # choosers should now match trips row for row
    assert choosers.index.identical(trips.index)

    if estimator:
        probs_cols = list(probs_spec.columns)
        # logged at debug level rather than printed, so library code does not write to stdout
        logger.debug("%s estimation probs:\n%s", trace_label, choosers[probs_cols])
        estimator.write_table(choosers[probs_cols], 'probs', append=True)

    choices, rands = logit.make_choices(
        choosers[purpose_cols],
        trace_label=trace_label, trace_choosers=choosers)

    if have_trace_targets:
        tracing.trace_df(choices, '%s.choices' % trace_label, columns=[None, 'trip_purpose'])
        tracing.trace_df(rands, '%s.rands' % trace_label, columns=[None, 'rand'])

    # make_choices returns the 0-based position of the chosen column; map it to the purpose name
    choices = choices.map(pd.Series(purpose_cols))
    return choices
def schedule_nth_trips(trips, probs_spec, model_settings, first_trip_in_leg,
                       report_failed_trips, trace_hh_id, trace_label):
    """
    Choose a depart time period for each trip from the probs_spec lookup table.

    We join each trip with the appropriate row in probs_spec by joining on
    PROBS_JOIN_COLUMNS, which should exist in both the trips and probs_spec dataframes.

    Parameters
    ----------
    trips : pd.DataFrame
        expected to be indexed by trip_id, with the PROBS_JOIN_COLUMNS columns
        and 'earliest' and 'latest' columns
    probs_spec : pd.DataFrame
        Dataframe of probs for choice of depart times and join columns to match them with trips.
        Depart column names are irrelevant. Instead, they are position dependent:
        the time period a column represents is its index + DEPART_ALT_BASE
    model_settings : dict
        'DEPART_ALT_BASE' is the int added to a probs column index to get the time period
        it represents, e.g. 5 means the first column (column 0) represents 5 am.
        Also passed through to clip_probs.
    first_trip_in_leg : bool
        when truthy, the clipped probs are rescaled to sum to 1 (unless all zero)
    report_failed_trips : bool
        when truthy, failed choosers are reported via report_bad_choices
    trace_hh_id
        tracing is skipped when this is falsy
    trace_label : str

    Returns
    -------
    choices : pd.Series
        time periods depart choices, one per trip (except for trips with zero probs)
    """

    depart_alt_base = model_settings.get('DEPART_ALT_BASE')

    probs_cols = [c for c in probs_spec.columns if c not in PROBS_JOIN_COLUMNS]

    # left join trips to probs (there may be multiple rows per trip for multiple depart ranges)
    choosers = pd.merge(trips.reset_index(), probs_spec, on=PROBS_JOIN_COLUMNS,
                        how='left').set_index('trip_id')
    chunk.log_df(trace_label, "choosers", choosers)

    # trips does not change below, so the trace-target check is only needed once
    have_trace_targets = trace_hh_id and tracing.has_trace_targets(trips)

    if have_trace_targets:
        tracing.trace_df(choosers, '%s.choosers' % trace_label)

    # choosers should now match trips row for row
    assert choosers.index.is_unique
    assert len(choosers.index) == len(trips.index)

    # zero out probs outside earliest-latest window
    chooser_probs = clip_probs(trips, choosers[probs_cols], model_settings)
    chunk.log_df(trace_label, "chooser_probs", chooser_probs)

    if first_trip_in_leg:
        # probs should sum to 1 unless all zero
        chooser_probs = chooser_probs.div(chooser_probs.sum(axis=1), axis=0).fillna(0)

    # probs should sum to 1 with residual probs resulting in choice of 'fail'
    chooser_probs['fail'] = 1 - chooser_probs.sum(axis=1).clip(0, 1)
    chunk.log_df(trace_label, "chooser_probs", chooser_probs)

    if have_trace_targets:
        tracing.trace_df(chooser_probs, '%s.chooser_probs' % trace_label)

    choices, rands = logit.make_choices(chooser_probs, trace_label=trace_label,
                                        trace_choosers=choosers)

    chunk.log_df(trace_label, "choices", choices)
    chunk.log_df(trace_label, "rands", rands)

    if have_trace_targets:
        tracing.trace_df(choices, '%s.choices' % trace_label, columns=[None, 'depart'])
        tracing.trace_df(rands, '%s.rands' % trace_label, columns=[None, 'rand'])

    # convert alt choice index to depart time (setting failed choices to -1)
    failed = (choices == chooser_probs.columns.get_loc('fail'))
    choices = (choices + depart_alt_base).where(~failed, -1)

    chunk.log_df(trace_label, "failed", failed)

    # report failed trips while we have the best diagnostic info
    if report_failed_trips and failed.any():
        report_bad_choices(
            bad_row_map=failed,
            df=choosers,
            filename='failed_choosers',
            trace_label=trace_label,
            trace_choosers=None)

    # trace before removing failures
    # (choices now hold depart times rather than alternative indexes)
    if have_trace_targets:
        tracing.trace_df(choices, '%s.choices' % trace_label, columns=[None, 'depart'])
        tracing.trace_df(rands, '%s.rands' % trace_label, columns=[None, 'rand'])

    # remove any failed choices
    if failed.any():
        choices = choices[~failed]

    assert (choices >= trips.earliest[~failed]).all()
    assert (choices <= trips.latest[~failed]).all()

    return choices
def build_virtual_path(self, recipe, path_type, orig, dest, tod, demographic_segment,
                       want_choices, trace_label,
                       filter_targets=None, trace=False, override_choices=None):
    """
    Build the best transit paths between orig and dest mazs and summarize them.

    Access, egress and tap-to-tap legs are computed separately, combined by
    self.best_paths, and then reduced to either logsums (optionally with a chosen
    path) when the recipe's units are 'utility', or to a single value per
    O-D pair when the units are 'time'.

    Parameters
    ----------
    recipe : str
        selects the TVPB_SETTINGS.{recipe} settings and (via units_for_recipe) the units
    path_type : str
        selects the TVPB_SETTINGS.{recipe}.path_types.{path_type} settings
    orig, dest
        origin and destination mazs; their .index and .values are used, so
        presumably pandas Series sharing an index - TODO confirm against callers
    tod : str or per-row series
        time of day, either a scalar str or one value per orig row
    demographic_segment : per-row series or None
    want_choices : bool
        also choose a path for each row (only supported when units is 'utility')
    trace_label : str
    filter_targets : optional
        mask used to slice orig, dest, and the per-row tod, demographic_segment
        and override_choices; required when trace is set
    trace : bool
        tracing is a separate, second call that operates ONLY on filter_targets
    override_choices : optional
        choices from the previous (untraced) call; required with filter_targets
        when want_choices is set, so that choices are not re-drawn

    Returns
    -------
    results : pandas.DataFrame or pandas.Series
        units 'utility': DataFrame indexed like orig with a 'logsum' column, plus
        'btap', 'atap', 'path_set' and 'path_num' columns when want_choices is set
        units 'time': Series with one value per orig row, zero-filled where no
        best path exists
    """

    trace_label = tracing.extend_trace_label(trace_label, 'build_virtual_path')

    # Tracing is implemented as a separate, second call that operates ONLY on filter_targets
    assert not (trace and filter_targets is None)
    if filter_targets is not None:

        assert filter_targets.any()

        # slice orig and dest
        orig = orig[filter_targets]
        dest = dest[filter_targets]
        assert len(orig) > 0
        assert len(dest) > 0

        # slice tod and demographic_segment if not scalar
        if not isinstance(tod, str):
            tod = tod[filter_targets]
        if demographic_segment is not None:
            demographic_segment = demographic_segment[filter_targets]
            assert len(demographic_segment) > 0

        # slice choices
        # (requires actual choices from the previous call lest rands change on second call)
        assert want_choices == (override_choices is not None)
        if want_choices:
            override_choices = override_choices[filter_targets]

    units = self.units_for_recipe(recipe)
    assert units == 'utility' or not want_choices, \
        "'want_choices' only supported if units is utility"

    access_mode = self.network_los.setting(
        f'TVPB_SETTINGS.{recipe}.path_types.{path_type}.access')
    egress_mode = self.network_los.setting(
        f'TVPB_SETTINGS.{recipe}.path_types.{path_type}.egress')
    path_types_settings = self.network_los.setting(
        f'TVPB_SETTINGS.{recipe}.path_types.{path_type}')
    attributes_as_columns = \
        self.network_los.setting(
            f'TVPB_SETTINGS.{recipe}.tap_tap_settings.attributes_as_columns', [])

    path_info = {'path_type': path_type, 'access_mode': access_mode, 'egress_mode': egress_mode}

    # maz od pairs requested
    with memo("#TVPB build_virtual_path maz_od_df"):
        maz_od_df = pd.DataFrame({
            'idx': orig.index.values,
            'omaz': orig.values,
            'dmaz': dest.values,
            'seq': range(len(orig))
        })
        chunk.log_df(trace_label, "maz_od_df", maz_od_df)
        self.trace_maz_tap(maz_od_df, access_mode, egress_mode)

    # for location choice, there will be multiple alt dest rows per chooser
    # and duplicate orig.index values
    # but tod and demographic_segment should be the same for all chooser rows
    # (unique orig index values)
    # knowing this allows us to eliminate redundant computations
    # (e.g. utilities of maz_tap pairs)
    duplicated = orig.index.duplicated(keep='first')
    chooser_attributes = pd.DataFrame(index=orig.index[~duplicated])
    if not isinstance(tod, str):
        chooser_attributes['tod'] = tod.loc[~duplicated]
    elif 'tod' in attributes_as_columns:
        chooser_attributes['tod'] = tod
    else:
        # scalar tod not wanted as a column is passed to expressions via path_info
        path_info['tod'] = tod
    if demographic_segment is not None:
        chooser_attributes['demographic_segment'] = demographic_segment.loc[~duplicated]

    with memo("#TVPB build_virtual_path access_df"):
        access_df = self.compute_maz_tap_utilities(
            recipe,
            maz_od_df,
            chooser_attributes,
            leg='access',
            mode=access_mode,
            trace_label=trace_label, trace=trace)
    chunk.log_df(trace_label, "access_df", access_df)

    with memo("#TVPB build_virtual_path egress_df"):
        egress_df = self.compute_maz_tap_utilities(
            recipe,
            maz_od_df,
            chooser_attributes,
            leg='egress',
            mode=egress_mode,
            trace_label=trace_label, trace=trace)
    chunk.log_df(trace_label, "egress_df", egress_df)

    # path_info for use by expressions (e.g. penalty for drive access if no parking at access tap)
    with memo("#TVPB build_virtual_path compute_tap_tap"):
        transit_df = self.compute_tap_tap(
            recipe,
            maz_od_df,
            access_df,
            egress_df,
            chooser_attributes,
            path_info=path_info,
            trace_label=trace_label, trace=trace)
    chunk.log_df(trace_label, "transit_df", transit_df)

    with memo("#TVPB build_virtual_path best_paths"):
        path_df = self.best_paths(
            recipe, path_type,
            maz_od_df, access_df, egress_df, transit_df,
            trace_label, trace)
    chunk.log_df(trace_label, "path_df", path_df)

    # now that we have created path_df, we are done with the dataframes for the separate legs
    del access_df
    chunk.log_df(trace_label, "access_df", None)
    del egress_df
    chunk.log_df(trace_label, "egress_df", None)
    del transit_df
    chunk.log_df(trace_label, "transit_df", None)

    if units == 'utility':

        # logsums
        with memo("#TVPB build_virtual_path logsums"):
            # one row per seq with utilities in columns
            # path_num 0-based to align with logit.make_choices 0-based choice indexes
            path_df['path_num'] = path_df.groupby('seq').cumcount()
            chunk.log_df(trace_label, "path_df", path_df)

            utilities_df = \
                path_df[['seq', 'path_num', units]].set_index(['seq', 'path_num']).unstack()
            utilities_df.columns = utilities_df.columns.droplevel()  # for legibility

            # add rows missing because no access or egress availability
            utilities_df = pd.concat([pd.DataFrame(index=maz_od_df.seq), utilities_df], axis=1)
            # set utilities for missing paths to UNAVAILABLE
            utilities_df = utilities_df.fillna(UNAVAILABLE)

            chunk.log_df(trace_label, "utilities_df", utilities_df)

            with warnings.catch_warnings(record=True) as w:
                # Cause all warnings to always be triggered.
                # most likely "divide by zero encountered in log"
                # caused by all transit sets non-viable
                warnings.simplefilter("always")

                paths_nest_nesting_coefficient = \
                    path_types_settings.get('paths_nest_nesting_coefficient', 1)
                exp_utilities = np.exp(utilities_df.values / paths_nest_nesting_coefficient)
                logsums = np.maximum(np.log(np.nansum(exp_utilities, axis=1)), UNAVAILABLE)

                if len(w) > 0:
                    for wrn in w:
                        logger.warning(f"{trace_label} - {type(wrn).__name__} ({wrn.message})")

                    # developer switch: set to True to dump the offending rows and abort
                    DUMP = False
                    if DUMP:
                        zero_utilities_df = utilities_df[
                            np.nansum(np.exp(utilities_df.values), axis=1) == 0]
                        zero_utilities_df.to_csv(
                            config.output_file_path('warning_utilities_df.csv'), index=True)
                        # deliberately abort so the dumped utilities can be inspected
                        raise RuntimeError(
                            f"{trace_label} - dumped zero utilities to warning_utilities_df.csv")

        if want_choices:

            # orig index to identify appropriate random number channel to use making choices
            utilities_df.index = orig.index

            with memo("#TVPB build_virtual_path make_choices"):

                probs = logit.utils_to_probs(utilities_df, allow_zero_probs=True,
                                             trace_label=trace_label)
                chunk.log_df(trace_label, "probs", probs)

                if trace:
                    # reuse the choices of the previous call rather than drawing new ones
                    choices = override_choices

                    utilities_df['choices'] = choices
                    self.trace_df(utilities_df, trace_label, 'utilities_df')

                    probs['choices'] = choices
                    self.trace_df(probs, trace_label, 'probs')
                else:

                    choices, rands = logit.make_choices(probs, allow_bad_probs=True,
                                                        trace_label=trace_label)

                    chunk.log_df(trace_label, "rands", rands)
                    del rands
                    chunk.log_df(trace_label, "rands", None)

                del probs
                chunk.log_df(trace_label, "probs", None)

            # we need to get path_set, btap, atap from path_df row with same seq and path_num
            # drop seq join column, but keep path_num of choice to override_choices when tracing
            columns_to_cache = ['btap', 'atap', 'path_set', 'path_num']
            logsum_df = \
                pd.merge(pd.DataFrame({'seq': range(len(orig)), 'path_num': choices.values}),
                         path_df[['seq'] + columns_to_cache],
                         on=['seq', 'path_num'], how='left')\
                .drop(columns=['seq'])\
                .set_index(orig.index)

            logsum_df['logsum'] = logsums

        else:

            assert len(logsums) == len(orig)
            logsum_df = pd.DataFrame({'logsum': logsums}, index=orig.index)

        chunk.log_df(trace_label, "logsum_df", logsum_df)

        del utilities_df
        chunk.log_df(trace_label, "utilities_df", None)

        if trace:
            self.trace_df(logsum_df, trace_label, 'logsum_df')

        chunk.log_df(trace_label, "logsum_df", logsum_df)
        results = logsum_df

    else:
        assert units == 'time'

        # return a series
        results = pd.Series(path_df[units].values, index=path_df['idx'])

        # zero-fill rows for O-D pairs where no best path exists
        # because there was no tap-tap transit availability
        results = reindex(results, maz_od_df.idx).fillna(0.0)

        chunk.log_df(trace_label, "results", results)

    assert len(results) == len(orig)

    del path_df
    chunk.log_df(trace_label, "path_df", None)

    return results
def choose_tour_leg_pattern(trip_segment,
                            patterns, spec,
                            trace_label='trace_label'):
    """
    Draw one pattern for every tour leg from a logit over summed pattern utilities.

    Alternatives are generated from trip_segment, joined back to their choosers and
    scored with spec. The scores are merged with patterns, summed to a single utility
    per (TOUR_ID, OUTBOUND, PATTERN_ID), padded to a rectangular array, converted to
    probabilities, and one alternative is then chosen per tour leg.

    Parameters
    ----------
    trip_segment: pandas.DataFrame
        choosers; the generated alternatives must share its index name
    patterns: pandas.DataFrame
        candidate patterns, merged with the utilities on TRIP_ID and STOP_TIME_DURATION
    spec: pandas.DataFrame
        utility spec; more than one column is rejected
    trace_label: str

    Returns
    -------
    choices: pandas.Series
        PATTERN_ID of the chosen alternative, indexed by TOUR_LEG_ID

    Raises
    ------
    RuntimeError
        if spec has more than one column
    """

    def _label(suffix):
        # trace label for one of the intermediate tables of this function
        return tracing.extend_trace_label(trace_label, suffix)

    # utility given to padding alternatives so that they are never chosen
    DUMMY_UTILITY = -999

    alternatives = generate_alternatives(trip_segment, STOP_TIME_DURATION).sort_index()
    tracing_on = tracing.has_trace_targets(trip_segment)

    if tracing_on:
        tracing.trace_df(trip_segment, _label('choosers'))
        tracing.trace_df(alternatives, _label('alternatives'), transpose=False)

    if len(spec.columns) > 1:
        raise RuntimeError('spec must have only one column')

    # - join choosers and alts
    # the alternatives already carry the chooser index (with repeated index values), so a
    # left join onto the choosers replaces the chooser x alternative cross join that
    # vanilla interaction_simulate would build
    assert alternatives.index.name == trip_segment.index.name

    interaction_df = alternatives.join(trip_segment, how='left', rsuffix='_chooser')
    chunk.log_df(trace_label, 'interaction_df', interaction_df)

    trace_rows = trace_ids = None
    if tracing_on:
        trace_rows, trace_ids = tracing.interaction_trace_rows(interaction_df, trip_segment)
        tracing.trace_df(interaction_df, _label('interaction_df'), transpose=False)

    alt_utilities, trace_eval_results = interaction_simulate.eval_interaction_utilities(
        spec, interaction_df, None, trace_label, trace_rows, estimator=None)

    alt_utilities = pd.concat([interaction_df[STOP_TIME_DURATION], alt_utilities], axis=1)
    chunk.log_df(trace_label, 'interaction_utilities', alt_utilities)

    # attach the pattern rows that belong to the trips we have utilities for
    alt_utilities = pd.merge(
        alt_utilities.reset_index(),
        patterns[patterns[TRIP_ID].isin(alt_utilities.index)],
        on=[TRIP_ID, STOP_TIME_DURATION],
        how='left')

    if tracing_on:
        tracing.trace_interaction_eval_results(trace_eval_results, trace_ids, _label('eval'))
        tracing.trace_df(alt_utilities, _label('interaction_utilities'), transpose=False)

    del interaction_df
    chunk.log_df(trace_label, 'interaction_df', None)

    # one summed utility per tour leg and pattern
    alt_utilities = alt_utilities.groupby(
        [TOUR_ID, OUTBOUND, PATTERN_ID], as_index=False)[['utility']].sum()
    alt_utilities[TOUR_LEG_ID] = alt_utilities.apply(generate_tour_leg_id, axis=1)

    tour_choosers = alt_utilities.set_index(TOUR_LEG_ID)
    alt_utilities = tour_choosers[['utility']].copy()

    # the utilities are a ragged list (a different number of alternatives per chooser);
    # pad every chooser's run with dummy utilities to get a rectangular
    # chooser x alternative array

    # number of alternatives for each chooser
    alts_per_chooser = alt_utilities.groupby(alt_utilities.index).size().values
    chunk.log_df(trace_label, 'sample_counts', alts_per_chooser)

    # widest alternative set of any chooser
    max_alts = alts_per_chooser.max()

    # row offsets just past the end, and at the start, of each chooser's run of rows
    end_offsets = alts_per_chooser.cumsum()
    start_offsets = np.insert(end_offsets[:-1], 0, 0)

    # repeat each end offset once per missing alternative so that the dummy utilities
    # are inserted at the END of the corresponding chooser's run
    pad_positions = np.repeat(end_offsets, max_alts - alts_per_chooser)
    del alts_per_chooser
    chunk.log_df(trace_label, 'sample_counts', None)

    padded = np.insert(alt_utilities['utility'].values, pad_positions, DUMMY_UTILITY)
    del pad_positions, alt_utilities
    chunk.log_df(trace_label, 'interaction_utilities', None)

    # one row per chooser, one column per alternative
    padded = padded.reshape(-1, max_alts)
    chunk.log_df(trace_label, 'padded_utilities', padded)

    leg_utilities = pd.DataFrame(padded, index=tour_choosers.index.unique())
    chunk.log_df(trace_label, 'utilities_df', leg_utilities)

    del padded
    chunk.log_df(trace_label, 'padded_utilities', None)

    if tracing_on:
        tracing.trace_df(leg_utilities, _label('utilities'),
                         column_labels=['alternative', 'utility'])

    # exponentiate and normalize; same shape as the utilities
    leg_probs = logit.utils_to_probs(
        leg_utilities, trace_label=trace_label, trace_choosers=trip_segment)
    chunk.log_df(trace_label, 'probs', leg_probs)

    del leg_utilities
    chunk.log_df(trace_label, 'utilities_df', None)

    if tracing_on:
        tracing.trace_df(leg_probs, _label('probs'),
                         column_labels=['alternative', 'probability'])

    # picks holds, per chooser, the column position of the chosen alternative in leg_probs
    picks, rands = logit.make_choices(
        leg_probs, trace_label=trace_label, trace_choosers=trip_segment)
    chunk.log_df(trace_label, 'positions', picks)
    chunk.log_df(trace_label, 'rands', rands)

    del leg_probs
    chunk.log_df(trace_label, 'probs', None)

    # shouldn't have chosen any of the dummy pad utilities
    assert picks.max() < max_alts

    # a pick is an offset within the chooser's own run of rows; adding the run's start
    # offset gives the absolute row of the chosen alternative in tour_choosers
    choices = tour_choosers[PATTERN_ID].take(picks + start_offsets)
    chunk.log_df(trace_label, 'choices', choices)

    if tracing_on:
        tracing.trace_df(choices, _label('choices'), columns=[None, PATTERN_ID])
        tracing.trace_df(rands, _label('rands'), columns=[None, 'rand'])

    return choices
def extend_tour_counts(persons, tour_counts, alternatives, trace_hh_id, trace_label):
    """
    Randomly extend the tour counts of persons who already have the maximum
    frequency of a tour type.

    A person is only eligible if their original tour counts sum to between 1 and 4,
    and a tour type is only extended for persons whose count for that type equals the
    largest frequency available for it in alternatives (e.g. 2 for escort, 1 otherwise).
    The size of the extension (0, 1 or 2 additional tours) is drawn from the table
    returned by extension_probs(), matched on ptype, has_mandatory_tour and
    has_joint_tour.

    Parameters
    ----------
    persons: pandas dataframe
        (need this for join columns)
    tour_counts: pandas dataframe
        one row per person, one column per tour_type, e.g.

                   escort  shopping  othmaint  othdiscr    eatout    social
        parent_id
        2588676         2         0         0         1         1         0
        2588677         0         1         0         1         0         0
    alternatives
        alternatives from nmtv interaction_simulate
        only needed to know the max possible frequency for a tour type
    trace_hh_id
    trace_label

    Returns
    -------
    tour_counts: pandas dataframe
        the dataframe that was passed in, with the extensions added in place
    """
    assert tour_counts.index.name == persons.index.name

    prob_cols = ['0_tours', '1_tours', '2_tours']
    join_cols = ['ptype', 'has_mandatory_tour', 'has_joint_tour']
    tour_type_col = 'nonmandatory_tour_type'

    probs_spec = extension_probs()
    persons = persons[join_cols]

    # only persons starting out with 1 - 4 non_mandatory tours can be extended
    eligible = tour_counts.sum(axis=1).between(1, 4)
    if not eligible.any():
        return tour_counts

    tracing_on = trace_hh_id and tracing.has_trace_targets(eligible)

    # probs_spec nonmandatory_tour_type column is 1-based, hence start=1
    for tour_type_num, tour_type in enumerate(alternatives.columns, start=1):

        type_trace_label = tracing.extend_trace_label(trace_label, tour_type)

        # - a tour type is only extended where it is already at its max possible frequency
        at_max = eligible & (tour_counts[tour_type] == alternatives[tour_type].max())
        maxed_idx = tour_counts.index[at_max]
        if not len(maxed_idx):
            continue

        # - look up the extension probs of this tour_type for each maxed person
        maxed_persons = persons.loc[maxed_idx]
        type_probs = probs_spec[probs_spec[tour_type_col] == tour_type_num]
        choosers = pd.merge(maxed_persons, type_probs, on=join_cols, how='left')
        choosers = choosers.set_index(maxed_idx)
        assert choosers.index.name == tour_counts.index.name

        # - random choice of extension magnitude based on relative probs
        choices, rands = logit.make_choices(
            choosers[prob_cols],
            trace_label=type_trace_label,
            trace_choosers=choosers)

        # - the 0-based position of the chosen prob column is the number of tours to add
        if choices.any():
            tour_counts.loc[choices.index, tour_type] += choices

        if tracing_on:
            tracing.trace_df(
                choices,
                tracing.extend_trace_label(type_trace_label, 'choices'),
                columns=[None, 'choice'])
            tracing.trace_df(
                rands,
                tracing.extend_trace_label(type_trace_label, 'rands'),
                columns=[None, 'rand'])

    return tour_counts
def make_scheduling_choices(
        choosers_df, scheduling_mode,
        probs_spec, probs_join_cols,
        depart_alt_base,
        first_trip_in_leg,
        report_failed_trips, trace_hh_id, trace_label,
        trace_choice_col_name='depart',
        clip_earliest_latest=True):
    """
    Make a scheduling choice for each chooser from a probability lookup table.

    We join each chooser with the appropriate row in probs_spec by joining on
    probs_join_cols, which should exist in both choosers_df and probs_spec.

    Parameters
    ----------
    choosers_df: pd.DataFrame
        one row per chooser; its (named) index is restored on the joined table.
        If it has both 'earliest' and 'latest' columns, the surviving choices are
        asserted to lie within them.
    scheduling_mode: str
        Either 'departure' or 'stop_duration' depending on whether the probability
        lookup table is keyed on departure period or stop duration.
    probs_spec: pd.DataFrame
        Dataframe of probs for choice of depart times and join columns to match them
        with choosers. Depart column names are irrelevant. Instead, they are position
        dependent, time period choice is their index + depart_alt_base
    probs_join_cols: list of str
        columns on which choosers_df is left-joined to probs_spec
    depart_alt_base: int
        int to add to probs column index to get time period it represents.
        e.g. depart_alt_base = 5 means first column (column 0) represents 5 am
    first_trip_in_leg
        passed through to _preprocess_scheduling_probs
    report_failed_trips : bool
        if true, the choosers of failed choices are reported via _report_bad_choices
    trace_hh_id
    trace_label
    trace_choice_col_name: str
        column name given to the choices when they are traced
    clip_earliest_latest: bool
        passed through to _preprocess_scheduling_probs

    Returns
    -------
    choices: pd.Series
        time period choices, one per chooser (failed choices are removed)
    """

    choosers = pd.merge(choosers_df.reset_index(), probs_spec, on=probs_join_cols,
                        how='left').set_index(choosers_df.index.name)
    chunk.log_df(trace_label, "choosers", choosers)

    # the trace check depends only on the arguments, so evaluate it once and reuse it
    have_trace_targets = trace_hh_id and tracing.has_trace_targets(choosers_df)

    if have_trace_targets:
        tracing.trace_df(choosers, '%s.choosers' % trace_label)

    # different pre-processing is required based on the scheduling mode
    chooser_probs = _preprocess_scheduling_probs(
        scheduling_mode, choosers_df, choosers, probs_spec,
        probs_join_cols, clip_earliest_latest, depart_alt_base, first_trip_in_leg)

    chunk.log_df(trace_label, "chooser_probs", chooser_probs)

    if have_trace_targets:
        tracing.trace_df(chooser_probs, '%s.chooser_probs' % trace_label)

    raw_choices, rands = logit.make_choices(
        chooser_probs, trace_label=trace_label, trace_choosers=choosers)

    chunk.log_df(trace_label, "choices", raw_choices)
    chunk.log_df(trace_label, "rands", rands)

    if have_trace_targets:
        tracing.trace_df(raw_choices, '%s.choices' % trace_label,
                         columns=[None, trace_choice_col_name])
        tracing.trace_df(rands, '%s.rands' % trace_label, columns=[None, 'rand'])

    # different post-processing is required based on the scheduling mode
    choices, failed = _postprocess_scheduling_choices(
        scheduling_mode, depart_alt_base, raw_choices, chooser_probs.columns, choosers_df)

    chunk.log_df(trace_label, "failed", failed)

    # report failed trips while we have the best diagnostic info
    if report_failed_trips and failed.any():
        _report_bad_choices(
            bad_row_map=failed,
            df=choosers,
            filename='failed_choosers',
            trace_label=trace_label,
            trace_choosers=None)

    # trace before removing failures
    # NOTE(review): this reuses the '.choices' and '.rands' labels of the raw-choice
    # trace above - confirm that tracing twice under the same labels is intended
    if have_trace_targets:
        tracing.trace_df(choices, '%s.choices' % trace_label,
                         columns=[None, trace_choice_col_name])
        tracing.trace_df(rands, '%s.rands' % trace_label, columns=[None, 'rand'])

    # remove any failed choices
    if failed.any():
        choices = choices[~failed]

    # sanity check: surviving choices must lie within each chooser's time window
    if all(check_col in choosers_df.columns for check_col in ('earliest', 'latest')):
        assert (choices >= choosers_df.earliest[~failed]).all()
        assert (choices <= choosers_df.latest[~failed]).all()

    return choices