def choose_intermediate_trip_purpose(trips, probs_spec, trace_hh_id, trace_label):
    """
    Choose a purpose for intermediate trips based on probs_spec,
    which assigns relative weights (summing to 1) to the possible purpose choices.

    Parameters
    ----------
    trips : pandas.DataFrame
        indexed by trip_id; must have columns 'start' plus the probs join columns
        ('primary_purpose', 'outbound', 'person_type')
    probs_spec : pandas.DataFrame
        one row per (join-cols, depart-range) combination; remaining columns are
        candidate purposes with relative weights.
        NOTE(review): the purpose columns are normalized IN PLACE below, so the
        caller's probs_spec is mutated — confirm no caller depends on raw weights.
    trace_hh_id : household id (or falsy) enabling tracing
    trace_label : str

    Returns
    -------
    purpose: pandas.Series of purpose (str) indexed by trip_id
    """

    probs_join_cols = ['primary_purpose', 'outbound', 'person_type']
    non_purpose_cols = probs_join_cols + ['depart_range_start', 'depart_range_end']
    # every spec column that is not a join/depart column is a candidate purpose
    purpose_cols = [c for c in probs_spec.columns if c not in non_purpose_cols]

    num_trips = len(trips.index)
    have_trace_targets = trace_hh_id and tracing.has_trace_targets(trips)

    # probs should sum to 1 across rows (normalize row-wise)
    sum_probs = probs_spec[purpose_cols].sum(axis=1)
    probs_spec.loc[:, purpose_cols] = probs_spec.loc[:, purpose_cols].div(sum_probs, axis=0)

    # left join trips to probs (there may be multiple rows per trip for multiple depart ranges)
    choosers = pd.merge(trips.reset_index(), probs_spec,
                        on=probs_join_cols, how='left').set_index('trip_id')
    chunk.log_df(trace_label, 'choosers', choosers)

    # select the matching depart range (this should result in exactly one chooser row per trip)
    choosers = choosers[(choosers.start >= choosers['depart_range_start']) & (
        choosers.start <= choosers['depart_range_end'])]

    # choosers should now match trips row for row
    assert choosers.index.is_unique
    assert len(choosers.index) == num_trips

    # monte-carlo choice over the normalized purpose weights
    choices, rands = logit.make_choices(
        choosers[purpose_cols],
        trace_label=trace_label, trace_choosers=choosers)

    if have_trace_targets:
        tracing.trace_df(choices, '%s.choices' % trace_label, columns=[None, 'trip_purpose'])
        tracing.trace_df(rands, '%s.rands' % trace_label, columns=[None, 'rand'])

    # map positional choice (column offset) back to the purpose column name
    choices = choices.map(pd.Series(purpose_cols))

    return choices
def extend_tour_counts(persons, tour_counts, alternatives, trace_hh_id, trace_label):
    """
    extend tour counts based on a probability table

    counts can only be extended if original count is between 1 and 4
    and tours can only be extended if their count is at the max possible
    (e.g. 2 for escort, 1 otherwise) so escort might be increased to 3 or 4
    and other tour types might be increased to 2 or 3

    Parameters
    ----------
    persons: pandas dataframe
        (need this for join columns)
    tour_counts: pandas dataframe
        one row per person, once column per tour_type
        NOTE(review): this frame is updated IN PLACE (via .loc +=) as well as
        returned — confirm callers do not rely on the original being unchanged.
    alternatives
        alternatives from nmtv interaction_simulate
        only need this to know max possible frequency for a tour type
    trace_hh_id
    trace_label

    Returns
    -------
    extended tour_counts

    tour_counts looks like this:
               escort  shopping  othmaint  othdiscr  eatout  social
    parent_id
    2588676         2         0         0         1       1       0
    2588677         0         1         0         1       0       0
    """

    assert tour_counts.index.name == persons.index.name

    PROBABILITY_COLUMNS = ['0_tours', '1_tours', '2_tours']
    JOIN_COLUMNS = ['ptype', 'has_mandatory_tour', 'has_joint_tour']
    TOUR_TYPE_COL = 'nonmandatory_tour_type'

    probs_spec = extension_probs()
    persons = persons[JOIN_COLUMNS]

    # only extend if there are 1 - 4 non_mandatory tours to start with
    # (NB: this local shadows the function name for the rest of the body)
    extend_tour_counts = tour_counts.sum(axis=1).between(1, 4)
    if not extend_tour_counts.any():
        logger.info(
            "extend_tour_counts - no persons eligible for tour_count extension"
        )
        return tour_counts

    have_trace_targets = trace_hh_id and tracing.has_trace_targets(
        extend_tour_counts)

    for i, tour_type in enumerate(alternatives.columns):

        i_tour_type = i + 1  # (probs_spec nonmandatory_tour_type column is 1-based)
        tour_type_trace_label = tracing.extend_trace_label(
            trace_label, tour_type)

        # - only extend tour if frequency is max possible frequency for this tour type
        tour_type_is_maxed = \
            extend_tour_counts & (tour_counts[tour_type] == alternatives[tour_type].max())
        maxed_tour_count_idx = tour_counts.index[tour_type_is_maxed]

        if len(maxed_tour_count_idx) == 0:
            continue

        # - get extension probs for tour_type
        choosers = pd.merge(
            persons.loc[maxed_tour_count_idx],
            probs_spec[probs_spec[TOUR_TYPE_COL] == i_tour_type],
            on=JOIN_COLUMNS,
            how='left').set_index(maxed_tour_count_idx)
        assert choosers.index.name == tour_counts.index.name

        # - random choice of extension magnitude based on relative probs
        choices, rands = logit.make_choices(choosers[PROBABILITY_COLUMNS],
                                            trace_label=tour_type_trace_label,
                                            trace_choosers=choosers)

        # - extend tour_count (0-based prob alternative choice equals magnitude of extension)
        if choices.any():
            tour_counts.loc[choices.index, tour_type] += choices

        if have_trace_targets:
            tracing.trace_df(choices,
                             tracing.extend_trace_label(
                                 tour_type_trace_label, 'choices'),
                             columns=[None, 'choice'])
            tracing.trace_df(rands,
                             tracing.extend_trace_label(
                                 tour_type_trace_label, 'rands'),
                             columns=[None, 'rand'])

    return tour_counts
def choose_MAZ_for_TAZ(taz_sample, MAZ_size_terms, trips, network_los, alt_dest_col_name, trace_label):
    """
    Convert taz_sample table with TAZ zone sample choices to a table with a MAZ zone chosen for each TAZ
    choose MAZ probabilistically (proportionally by size_term) from set of MAZ zones in parent TAZ

    Parameters
    ----------
    taz_sample: dataframe with duplicated index <chooser_id_col> and columns: <alt_dest_col_name>, prob, pick_count
        NOTE(review): the <alt_dest_col_name> column is renamed IN PLACE to
        '<alt_dest_col_name>_TAZ' below — caller's frame is mutated.
    MAZ_size_terms: dataframe with duplicated index <chooser_id_col> and columns: zone_id, dest_TAZ, size_term
    trips: dataframe with a 'purpose' column, indexed by trip_id (used to pick size term by purpose)
    network_los: object exposing maz_taz_df with 'MAZ' and 'TAZ' columns (MAZ -> parent TAZ crosswalk)
    alt_dest_col_name: str, destination column name used by interaction_sample expressions
    trace_label: str

    Returns
    -------
    dataframe with with duplicated index <chooser_id_col> and columns: <alt_dest_col_name>, prob, pick_count
    """

    if len(taz_sample) == 0:
        # it can happen that all trips have no viable destinations (and so are dropped from the sample)
        # in which case we can just return the empty taz_sample, since it has the same columns
        return taz_sample.copy()

    # we had to use alt_dest_col_name as specified in model_settings for interaction_sample
    # because expressions reference it to look up size_terms by trip purpose
    DEST_MAZ = alt_dest_col_name
    DEST_TAZ = f"{alt_dest_col_name}_TAZ"

    taz_sample.rename(columns={alt_dest_col_name: DEST_TAZ}, inplace=True)

    trace_hh_id = inject.get_injectable("trace_hh_id", None)
    have_trace_targets = trace_hh_id and tracing.has_trace_targets(taz_sample)
    if have_trace_targets:
        trace_label = tracing.extend_trace_label(trace_label, 'choose_MAZ_for_TAZ')

        # write taz choices, pick_counts, probs
        trace_targets = tracing.trace_targets(taz_sample)
        tracing.trace_df(taz_sample[trace_targets],
                         label=tracing.extend_trace_label(trace_label, 'taz_sample'),
                         transpose=False)

    # print(f"taz_sample\n{taz_sample}")
    #            alt_dest_TAZ      prob  pick_count
    # trip_id
    # 4343721              12  0.000054           1
    # 4343721              20  0.001864           2

    # re-dupe taz_sample rows by pick_count so there is one row per (chooser, sampled TAZ) draw
    taz_choices = taz_sample[[DEST_TAZ, 'prob']].reset_index(drop=False)
    taz_choices = taz_choices.reindex(
        taz_choices.index.repeat(taz_sample.pick_count)).reset_index(drop=True)
    taz_choices = taz_choices.rename(columns={'prob': 'TAZ_prob'})

    # print(f"taz_choices\n{taz_choices}")
    #     trip_id  alt_dest_TAZ      prob
    # 0   4343721            12  0.000054
    # 1   4343721            20  0.001864
    # 2   4343721            20  0.001864

    # print(f"MAZ_size_terms\n{MAZ_size_terms.df}")
    #           work  escort  shopping  eatout  othmaint  social  othdiscr  univ
    # alt_dest
    # 2         31.0   9.930     0.042   0.258     0.560   0.520    10.856  0.042
    # 3          0.0   3.277     0.029   0.000     0.029   0.029     7.308  0.029
    # 4          0.0   1.879     0.023   0.000     0.023   0.023     5.796  0.023

    # just to make it clear we are siloing choices by chooser_id
    chooser_id_col = taz_sample.index.name  # should be canonical chooser index name (e.g. 'trip_id')

    # for random_for_df, we need df with de-duplicated chooser canonical index
    chooser_df = pd.DataFrame(
        index=taz_sample.index[~taz_sample.index.duplicated()])
    num_choosers = len(chooser_df)
    assert chooser_df.index.name == chooser_id_col

    # to make choices, <taz_sample_size> rands for each chooser (one rand for each sampled TAZ)
    # taz_sample_size will be model_settings['SAMPLE_SIZE'] samples, except if we are estimating
    taz_sample_size = taz_choices.groupby(
        chooser_id_col)[DEST_TAZ].count().max()

    # taz_choices index values should be contiguous
    assert (taz_choices[chooser_id_col] == np.repeat(chooser_df.index,
                                                     taz_sample_size)).all()

    # we need to choose a MAZ for each DEST_TAZ choice
    # probability of choosing MAZ based on MAZ size_term fraction of TAZ total
    # there will be a different set (and number) of candidate MAZs for each TAZ
    # (preserve index, which will have duplicates as result of join)
    maz_taz = network_los.maz_taz_df[['MAZ', 'TAZ']].rename(columns={
        'TAZ': DEST_TAZ,
        'MAZ': DEST_MAZ
    })
    maz_sizes = pd.merge(taz_choices[[chooser_id_col, DEST_TAZ]].reset_index(),
                         maz_taz,
                         how='left', on=DEST_TAZ).set_index('index')

    purpose = maz_sizes['trip_id'].map(
        trips.purpose)  # size term varies by purpose
    maz_sizes['size_term'] = MAZ_size_terms.get(maz_sizes[DEST_MAZ], purpose)

    # print(f"maz_sizes\n{maz_sizes}")
    #        trip_id  alt_dest_TAZ  alt_dest  size_term
    # index
    # 0      4343721            12      3445      0.019
    # 0      4343721            12     11583      0.017
    # 0      4343721            12     21142      0.020

    if have_trace_targets:
        # write maz_sizes: maz_sizes[index,trip_id,dest_TAZ,zone_id,size_term]
        maz_sizes_trace_targets = tracing.trace_targets(maz_sizes, slicer='trip_id')
        trace_maz_sizes = maz_sizes[maz_sizes_trace_targets]
        tracing.trace_df(trace_maz_sizes,
                         label=tracing.extend_trace_label(trace_label, 'maz_sizes'),
                         transpose=False)

    # number of DEST_TAZ candidates per chooser
    maz_counts = maz_sizes.groupby(maz_sizes.index).size().values

    # max number of MAZs for any TAZ
    max_maz_count = maz_counts.max()

    # offsets of the first and last rows of each chooser in sparse interaction_utilities
    last_row_offsets = maz_counts.cumsum()
    first_row_offsets = np.insert(last_row_offsets[:-1], 0, 0)

    # repeat the row offsets once for each dummy utility to insert
    # (we want to insert dummy utilities at the END of the list of alternative utilities)
    # inserts is a list of the indices at which we want to do the insertions
    inserts = np.repeat(last_row_offsets, max_maz_count - maz_counts)

    # insert zero filler to pad each alternative set to same size
    padded_maz_sizes = np.insert(maz_sizes.size_term.values, inserts, 0.0)
    padded_maz_sizes = padded_maz_sizes.reshape(-1, max_maz_count)

    # prob array with one row TAZ_choice, one column per alternative
    row_sums = padded_maz_sizes.sum(axis=1)
    maz_probs = np.divide(padded_maz_sizes, row_sums.reshape(-1, 1))
    assert maz_probs.shape == (num_choosers * taz_sample_size, max_maz_count)

    # one column of rands, one row per (chooser, sampled TAZ) draw
    rands = pipeline.get_rn_generator().random_for_df(
        chooser_df, n=taz_sample_size).reshape(-1, 1)
    assert len(rands) == num_choosers * taz_sample_size
    assert len(rands) == maz_probs.shape[0]

    # make choices
    # positions is array with the chosen alternative represented as a column index in probs
    # which is an integer between zero and max_maz_count
    positions = np.argmax((maz_probs.cumsum(axis=1) - rands) > 0.0, axis=1)

    # shouldn't have chosen any of the dummy pad positions
    assert (positions < maz_counts).all()

    taz_choices[DEST_MAZ] = maz_sizes[DEST_MAZ].take(positions + first_row_offsets)
    taz_choices['MAZ_prob'] = maz_probs[np.arange(maz_probs.shape[0]), positions]
    taz_choices['prob'] = taz_choices['TAZ_prob'] * taz_choices['MAZ_prob']

    if have_trace_targets:

        taz_choices_trace_targets = tracing.trace_targets(taz_choices, slicer='trip_id')
        trace_taz_choices_df = taz_choices[taz_choices_trace_targets]
        tracing.trace_df(trace_taz_choices_df,
                         label=tracing.extend_trace_label(trace_label, 'taz_choices'),
                         transpose=False)

        lhs_df = trace_taz_choices_df[['trip_id', DEST_TAZ]]
        alt_dest_columns = [f'dest_maz_{c}' for c in range(max_maz_count)]

        # following the same logic as the full code, but for trace cutout
        trace_maz_counts = maz_counts[taz_choices_trace_targets]
        trace_last_row_offsets = maz_counts[taz_choices_trace_targets].cumsum()
        trace_inserts = np.repeat(trace_last_row_offsets,
                                  max_maz_count - trace_maz_counts)

        # trace dest_maz_alts
        padded_maz_sizes = np.insert(trace_maz_sizes[DEST_MAZ].values,
                                     trace_inserts, 0.0).reshape(-1, max_maz_count)
        df = pd.DataFrame(data=padded_maz_sizes,
                          columns=alt_dest_columns,
                          index=trace_taz_choices_df.index)
        df = pd.concat([lhs_df, df], axis=1)
        tracing.trace_df(df,
                         label=tracing.extend_trace_label(trace_label, 'dest_maz_alts'),
                         transpose=False)

        # trace dest_maz_size_terms
        padded_maz_sizes = np.insert(trace_maz_sizes['size_term'].values,
                                     trace_inserts, 0.0).reshape(-1, max_maz_count)
        df = pd.DataFrame(data=padded_maz_sizes,
                          columns=alt_dest_columns,
                          index=trace_taz_choices_df.index)
        df = pd.concat([lhs_df, df], axis=1)
        tracing.trace_df(df,
                         label=tracing.extend_trace_label(trace_label, 'dest_maz_size_terms'),
                         transpose=False)

        # trace dest_maz_probs
        df = pd.DataFrame(data=maz_probs[taz_choices_trace_targets],
                          columns=alt_dest_columns,
                          index=trace_taz_choices_df.index)
        df = pd.concat([lhs_df, df], axis=1)
        df['rand'] = rands[taz_choices_trace_targets]
        tracing.trace_df(df,
                         label=tracing.extend_trace_label(trace_label, 'dest_maz_probs'),
                         transpose=False)

    # collapse duplicate (chooser, MAZ) draws into prob + pick_count
    taz_choices = taz_choices.drop(columns=['TAZ_prob', 'MAZ_prob'])
    taz_choices = taz_choices.groupby([chooser_id_col, DEST_MAZ
                                       ]).agg(prob=('prob', 'max'),
                                              pick_count=('prob', 'count'))

    taz_choices.reset_index(level=DEST_MAZ, inplace=True)

    return taz_choices
def choose_intermediate_trip_purpose(trips, probs_spec, estimator, probs_join_cols, use_depart_time, trace_hh_id, trace_label):
    """
    Choose a purpose for intermediate trips based on probs_spec,
    which assigns relative weights (summing to 1) to the possible purpose choices.

    Parameters
    ----------
    trips : pandas.DataFrame
        indexed by trip_id; must have 'person_id', 'start' and the probs_join_cols
    probs_spec : pandas.DataFrame
        one row per join-cols (and optionally depart-range) combination; remaining
        columns are candidate purposes with relative weights.
        NOTE(review): the purpose columns are normalized IN PLACE below, so the
        caller's probs_spec is mutated — confirm no caller depends on raw weights.
    estimator : estimation object or None
        if not None, the chooser probs are written out via estimator.write_table
    probs_join_cols : list of str
        columns used to join trips to probs_spec
    use_depart_time : bool
        if True, probs_spec rows are additionally matched on trip depart range
    trace_hh_id : household id (or falsy) enabling tracing
    trace_label : str

    Returns
    -------
    purpose: pandas.Series of purpose (str) indexed by trip_id

    Raises
    ------
    RuntimeError
        if use_depart_time and some trips cannot be matched to a probs_spec row
        (unmatched choosers are written to a csv for diagnosis first)
    """
    non_purpose_cols = probs_join_cols.copy()
    if use_depart_time:
        non_purpose_cols += ['depart_range_start', 'depart_range_end']
    # every spec column that is not a join/depart column is a candidate purpose
    purpose_cols = [c for c in probs_spec.columns if c not in non_purpose_cols]

    num_trips = len(trips.index)
    have_trace_targets = trace_hh_id and tracing.has_trace_targets(trips)

    # probs should sum to 1 across rows
    sum_probs = probs_spec[purpose_cols].sum(axis=1)
    probs_spec.loc[:, purpose_cols] = probs_spec.loc[:, purpose_cols].div(sum_probs, axis=0)

    # left join trips to probs (there may be multiple rows per trip for multiple depart ranges)
    choosers = pd.merge(trips.reset_index(), probs_spec,
                        on=probs_join_cols, how='left').set_index('trip_id')
    chunk.log_df(trace_label, 'choosers', choosers)

    if use_depart_time:

        # select the matching depart range (this should result in exactly one chooser row per trip)
        chooser_probs = \
            (choosers.start >= choosers['depart_range_start']) & (choosers.start <= choosers['depart_range_end'])

        # if we failed to match a row in probs_spec
        if chooser_probs.sum() < num_trips:

            # this can happen if the spec doesn't have probs for the trips matching a trip's probs_join_cols
            missing_trip_ids = trips.index[
                ~trips.index.isin(choosers.index[chooser_probs])].values
            unmatched_choosers = choosers[choosers.index.isin(missing_trip_ids)]
            unmatched_choosers = unmatched_choosers[['person_id', 'start'] + non_purpose_cols]

            # join to persons for better diagnostics
            persons = inject.get_table('persons').to_frame()
            persons_cols = [
                'age', 'is_worker', 'is_student', 'is_gradeschool',
                'is_highschool', 'is_university'
            ]
            unmatched_choosers = pd.merge(
                unmatched_choosers,
                persons[[col for col in persons_cols if col in persons.columns]],
                left_on='person_id', right_index=True, how='left')

            file_name = '%s.UNMATCHED_PROBS' % trace_label
            logger.error(
                "%s %s of %s intermediate trips could not be matched to probs based on join columns %s" %
                (trace_label, len(unmatched_choosers), len(choosers), probs_join_cols))
            logger.info("Writing %s unmatched choosers to %s" % (
                len(unmatched_choosers),
                file_name,
            ))
            tracing.write_csv(unmatched_choosers, file_name=file_name, transpose=False)
            raise RuntimeError(
                "Some trips could not be matched to probs based on join columns %s." % probs_join_cols)

        # select the matching depart range (this should result in exactly one chooser row per trip)
        choosers = choosers[chooser_probs]

    # choosers should now match trips row for row
    assert choosers.index.identical(trips.index)

    if estimator:
        # FIX: removed stray debug print(choosers[probs_cols]) left in this branch
        probs_cols = list(probs_spec.columns)
        estimator.write_table(choosers[probs_cols], 'probs', append=True)

    # monte-carlo choice over the normalized purpose weights
    choices, rands = logit.make_choices(choosers[purpose_cols],
                                        trace_label=trace_label,
                                        trace_choosers=choosers)

    if have_trace_targets:
        tracing.trace_df(choices, '%s.choices' % trace_label, columns=[None, 'trip_purpose'])
        tracing.trace_df(rands, '%s.rands' % trace_label, columns=[None, 'rand'])

    # map positional choice (column offset) back to the purpose column name
    choices = choices.map(pd.Series(purpose_cols))

    return choices
def choose_MAZ_for_TAZ(taz_sample, MAZ_size_terms, trace_label):
    """
    Convert taz_sample table with TAZ zone sample choices to a table with a MAZ zone chosen for each TAZ
    choose MAZ probabilistically (proportionally by size_term) from set of MAZ zones in parent TAZ

    Parameters
    ----------
    taz_sample: dataframe with duplicated index <chooser_id_col> and columns: <DEST_TAZ>, prob, pick_count
    MAZ_size_terms: dataframe with duplicated index <chooser_id_col> and columns: zone_id, dest_TAZ, size_term

    Returns
    -------
    dataframe with with duplicated index <chooser_id_col> and columns: <DEST_MAZ>, prob, pick_count

    Notes
    -----
    DEST_TAZ and DEST_MAZ are referenced but not defined in this function —
    presumably module-level column-name constants; verify against the module header.
    """

    # print(f"taz_sample\n{taz_sample}")
    #           dest_TAZ      prob  pick_count  person_id
    # tour_id
    # 542963          18  0.004778           1      13243
    # 542963          53  0.004224           2      13243
    # 542963          59  0.008628           1      13243

    trace_hh_id = inject.get_injectable("trace_hh_id", None)
    have_trace_targets = trace_hh_id and tracing.has_trace_targets(taz_sample)
    if have_trace_targets:
        trace_label = tracing.extend_trace_label(trace_label, 'choose_MAZ_for_TAZ')

        CHOOSER_ID = taz_sample.index.name  # zone_id for tours, but person_id for location choice
        assert CHOOSER_ID is not None

        # write taz choices, pick_counts, probs
        trace_targets = tracing.trace_targets(taz_sample)
        tracing.trace_df(taz_sample[trace_targets],
                         label=tracing.extend_trace_label(trace_label, 'taz_sample'),
                         transpose=False)

    # redupe taz_sample[[DEST_TAZ, 'prob']] using pick_count to repeat rows
    taz_choices = taz_sample[[DEST_TAZ, 'prob']].reset_index(drop=False)
    taz_choices = taz_choices.reindex(
        taz_choices.index.repeat(taz_sample.pick_count)).reset_index(drop=True)
    taz_choices = taz_choices.rename(columns={'prob': 'TAZ_prob'})

    # print(f"taz_choices\n{taz_choices}")
    #     tour_id  dest_TAZ  TAZ_prob
    # 0    542963        18  0.004778
    # 1    542963        53  0.004224
    # 2    542963        53  0.004224
    # 3    542963        59  0.008628

    # print(f"MAZ_size_terms\n{MAZ_size_terms}")
    #    zone_id  dest_TAZ  size_term
    # 0     6097         2      7.420
    # 1    16421         2      9.646
    # 2    24251         2     10.904

    # just to make it clear we are siloing choices by chooser_id
    chooser_id_col = taz_sample.index.name  # should be canonical chooser index name (e.g. 'person_id')

    # for random_for_df, we need df with de-duplicated chooser canonical index
    chooser_df = pd.DataFrame(
        index=taz_sample.index[~taz_sample.index.duplicated()])
    num_choosers = len(chooser_df)
    assert chooser_df.index.name == chooser_id_col

    # to make choices, <taz_sample_size> rands for each chooser (one rand for each sampled TAZ)
    # taz_sample_size will be model_settings['SAMPLE_SIZE'] samples, except if we are estimating
    taz_sample_size = taz_choices.groupby(
        chooser_id_col)[DEST_TAZ].count().max()

    # taz_choices index values should be contiguous
    assert (taz_choices[chooser_id_col] == np.repeat(chooser_df.index,
                                                     taz_sample_size)).all()

    # we need to choose a MAZ for each DEST_TAZ choice
    # probability of choosing MAZ based on MAZ size_term fraction of TAZ total
    # there will be a different set (and number) of candidate MAZs for each TAZ
    # (preserve index, which will have duplicates as result of join)
    # maz_sizes.index is the integer offset into taz_choices of the taz for which the maz_size row is a candidate)
    maz_sizes = pd.merge(taz_choices[[chooser_id_col, DEST_TAZ]].reset_index(),
                         MAZ_size_terms,
                         how='left', on=DEST_TAZ).set_index('index')

    #        tour_id  dest_TAZ  zone_id  size_term
    # index
    # 0       542963        18      498     12.130
    # 0       542963        18     7696     18.550
    # 0       542963        18    15431      8.678
    # 0       542963        18    21429     29.938
    # 1       542963        53    17563     34.252

    if have_trace_targets:
        # write maz_sizes: maz_sizes[index,tour_id,dest_TAZ,zone_id,size_term]
        maz_sizes_trace_targets = tracing.trace_targets(maz_sizes, slicer=CHOOSER_ID)
        trace_maz_sizes = maz_sizes[maz_sizes_trace_targets]
        tracing.trace_df(trace_maz_sizes,
                         label=tracing.extend_trace_label(trace_label, 'maz_sizes'),
                         transpose=False)

    # number of DEST_TAZ candidates per chooser
    maz_counts = maz_sizes.groupby(maz_sizes.index).size().values

    # max number of MAZs for any TAZ
    max_maz_count = maz_counts.max()

    # offsets of the first and last rows of each chooser in sparse interaction_utilities
    last_row_offsets = maz_counts.cumsum()
    first_row_offsets = np.insert(last_row_offsets[:-1], 0, 0)

    # repeat the row offsets once for each dummy utility to insert
    # (we want to insert dummy utilities at the END of the list of alternative utilities)
    # inserts is a list of the indices at which we want to do the insertions
    inserts = np.repeat(last_row_offsets, max_maz_count - maz_counts)

    # insert zero filler to pad each alternative set to same size
    padded_maz_sizes = np.insert(maz_sizes.size_term.values, inserts,
                                 0.0).reshape(-1, max_maz_count)

    # prob array with one row TAZ_choice, one column per alternative
    row_sums = padded_maz_sizes.sum(axis=1)
    maz_probs = np.divide(padded_maz_sizes, row_sums.reshape(-1, 1))
    assert maz_probs.shape == (num_choosers * taz_sample_size, max_maz_count)

    # one column of rands, one row per (chooser, sampled TAZ) draw
    rands = pipeline.get_rn_generator().random_for_df(chooser_df,
                                                      n=taz_sample_size)
    rands = rands.reshape(-1, 1)
    assert len(rands) == num_choosers * taz_sample_size
    assert len(rands) == maz_probs.shape[0]

    # make choices
    # positions is array with the chosen alternative represented as a column index in probs
    # which is an integer between zero and max_maz_count
    positions = np.argmax((maz_probs.cumsum(axis=1) - rands) > 0.0, axis=1)

    # shouldn't have chosen any of the dummy pad positions
    assert (positions < maz_counts).all()

    taz_choices[DEST_MAZ] = maz_sizes['zone_id'].take(positions + first_row_offsets)
    taz_choices['MAZ_prob'] = maz_probs[np.arange(maz_probs.shape[0]), positions]
    taz_choices['prob'] = taz_choices['TAZ_prob'] * taz_choices['MAZ_prob']

    if have_trace_targets:

        taz_choices_trace_targets = tracing.trace_targets(taz_choices, slicer=CHOOSER_ID)
        trace_taz_choices_df = taz_choices[taz_choices_trace_targets]
        tracing.trace_df(trace_taz_choices_df,
                         label=tracing.extend_trace_label(trace_label, 'taz_choices'),
                         transpose=False)

        lhs_df = trace_taz_choices_df[[CHOOSER_ID, DEST_TAZ]]
        alt_dest_columns = [f'dest_maz_{c}' for c in range(max_maz_count)]

        # following the same logic as the full code, but for trace cutout
        trace_maz_counts = maz_counts[taz_choices_trace_targets]
        trace_last_row_offsets = maz_counts[taz_choices_trace_targets].cumsum()
        trace_inserts = np.repeat(trace_last_row_offsets,
                                  max_maz_count - trace_maz_counts)

        # trace dest_maz_alts
        # NOTE(review): this pads trace_maz_sizes[CHOOSER_ID] (chooser ids) into
        # columns labeled dest_maz_* — expected the MAZ zone_id here, as in the
        # size_term trace below; verify against the intended trace output.
        padded_maz_sizes = np.insert(trace_maz_sizes[CHOOSER_ID].values,
                                     trace_inserts, 0.0).reshape(-1, max_maz_count)
        df = pd.DataFrame(data=padded_maz_sizes,
                          columns=alt_dest_columns,
                          index=trace_taz_choices_df.index)
        df = pd.concat([lhs_df, df], axis=1)
        tracing.trace_df(df,
                         label=tracing.extend_trace_label(trace_label, 'dest_maz_alts'),
                         transpose=False)

        # trace dest_maz_size_terms
        padded_maz_sizes = np.insert(trace_maz_sizes['size_term'].values,
                                     trace_inserts, 0.0).reshape(-1, max_maz_count)
        df = pd.DataFrame(data=padded_maz_sizes,
                          columns=alt_dest_columns,
                          index=trace_taz_choices_df.index)
        df = pd.concat([lhs_df, df], axis=1)
        tracing.trace_df(df,
                         label=tracing.extend_trace_label(trace_label, 'dest_maz_size_terms'),
                         transpose=False)

        # trace dest_maz_probs
        df = pd.DataFrame(data=maz_probs[taz_choices_trace_targets],
                          columns=alt_dest_columns,
                          index=trace_taz_choices_df.index)
        df = pd.concat([lhs_df, df], axis=1)
        df['rand'] = rands[taz_choices_trace_targets]
        tracing.trace_df(df,
                         label=tracing.extend_trace_label(trace_label, 'dest_maz_probs'),
                         transpose=False)

    # collapse duplicate (chooser, MAZ) draws into prob + pick_count
    taz_choices = taz_choices.drop(columns=['TAZ_prob', 'MAZ_prob'])
    taz_choices = taz_choices.groupby([chooser_id_col, DEST_MAZ
                                       ]).agg(prob=('prob', 'max'),
                                              pick_count=('prob', 'count'))

    taz_choices.reset_index(level=DEST_MAZ, inplace=True)

    return taz_choices
def schedule_nth_trips(trips, probs_spec, model_settings, first_trip_in_leg, report_failed_trips, trace_hh_id, trace_label):
    """
    We join each trip with the appropriate row in probs_spec by joining on probs_join_cols,
    which should exist in both trips, probs_spec dataframe.

    Parameters
    ----------
    trips: pd.DataFrame
    probs_spec: pd.DataFrame
        Dataframe of probs for choice of depart times and join columns to match them with trips.
        Depart columns names are irrelevant. Instead, they are position dependent,
        time period choice is their index + depart_alt_base
    depart_alt_base: int
        int to add to probs column index to get time period it represents.
        e.g. depart_alt_base = 5 means first column (column 0) represents 5 am
    report_failed_trips : bool
    trace_hh_id
    trace_label

    Returns
    -------
    choices: pd.Series
        time periods depart choices, one per trip (except for trips with zero probs)
        failed choices are removed; -1 never appears in the returned series
    """

    depart_alt_base = model_settings.get('DEPART_ALT_BASE')

    # every spec column that is not a join column is a depart-period prob
    probs_cols = [c for c in probs_spec.columns if c not in PROBS_JOIN_COLUMNS]

    # left join trips to probs (there may be multiple rows per trip for multiple depart ranges)
    choosers = pd.merge(trips.reset_index(), probs_spec, on=PROBS_JOIN_COLUMNS,
                        how='left').set_index('trip_id')
    chunk.log_df(trace_label, "choosers", choosers)

    if trace_hh_id and tracing.has_trace_targets(trips):
        tracing.trace_df(choosers, '%s.choosers' % trace_label)

    # choosers should now match trips row for row
    assert choosers.index.is_unique
    assert len(choosers.index) == len(trips.index)

    # zero out probs outside earliest-latest window
    chooser_probs = clip_probs(trips, choosers[probs_cols], model_settings)

    chunk.log_df(trace_label, "chooser_probs", chooser_probs)

    if first_trip_in_leg:
        # probs should sum to 1 unless all zero
        chooser_probs = chooser_probs.div(chooser_probs.sum(axis=1),
                                          axis=0).fillna(0)

    # probs should sum to 1 with residual probs resulting in choice of 'fail'
    chooser_probs['fail'] = 1 - chooser_probs.sum(axis=1).clip(0, 1)

    chunk.log_df(trace_label, "chooser_probs", chooser_probs)

    if trace_hh_id and tracing.has_trace_targets(trips):
        tracing.trace_df(chooser_probs, '%s.chooser_probs' % trace_label)

    choices, rands = logit.make_choices(chooser_probs,
                                        trace_label=trace_label,
                                        trace_choosers=choosers)

    chunk.log_df(trace_label, "choices", choices)
    chunk.log_df(trace_label, "rands", rands)

    if trace_hh_id and tracing.has_trace_targets(trips):
        tracing.trace_df(choices, '%s.choices' % trace_label, columns=[None, 'depart'])
        tracing.trace_df(rands, '%s.rands' % trace_label, columns=[None, 'rand'])

    # convert alt choice index to depart time (setting failed choices to -1)
    failed = (choices == chooser_probs.columns.get_loc('fail'))
    choices = (choices + depart_alt_base).where(~failed, -1)

    chunk.log_df(trace_label, "failed", failed)

    # report failed trips while we have the best diagnostic info
    if report_failed_trips and failed.any():
        report_bad_choices(bad_row_map=failed,
                           df=choosers,
                           filename='failed_choosers',
                           trace_label=trace_label,
                           trace_choosers=None)

    # trace before removing failures
    if trace_hh_id and tracing.has_trace_targets(trips):
        tracing.trace_df(choices, '%s.choices' % trace_label, columns=[None, 'depart'])
        tracing.trace_df(rands, '%s.rands' % trace_label, columns=[None, 'rand'])

    # remove any failed choices
    if failed.any():
        choices = choices[~failed]

    # surviving choices must fall inside each trip's earliest-latest window
    assert (choices >= trips.earliest[~failed]).all()
    assert (choices <= trips.latest[~failed]).all()

    return choices
def choose_tour_leg_pattern(trip_segment, patterns, spec, trace_label='trace_label'):
    """
    Choose a stop-time-duration pattern for each tour leg in trip_segment.

    Evaluates interaction utilities of each trip against its sampled
    stop-time-duration alternatives, aggregates trip utilities up to
    (tour, outbound, pattern) level, pads the sparse per-leg alternative sets
    to a rectangular array, converts to probabilities, and makes a
    monte-carlo choice of pattern per tour leg.

    Parameters
    ----------
    trip_segment : pd.DataFrame
        choosers, indexed consistently with the alternatives generated by
        generate_alternatives
    patterns : pd.DataFrame
        candidate patterns keyed by TRIP_ID and STOP_TIME_DURATION
    spec : pd.DataFrame
        single-column utility spec for eval_interaction_utilities
    trace_label : str

    Returns
    -------
    choices : pd.Series
        chosen PATTERN_ID, indexed by TOUR_LEG_ID

    Raises
    ------
    RuntimeError
        if spec has more than one column
    """
    alternatives = generate_alternatives(trip_segment, STOP_TIME_DURATION).sort_index()
    have_trace_targets = tracing.has_trace_targets(trip_segment)

    if have_trace_targets:
        tracing.trace_df(trip_segment, tracing.extend_trace_label(trace_label, 'choosers'))
        tracing.trace_df(alternatives,
                         tracing.extend_trace_label(trace_label, 'alternatives'),
                         transpose=False)

    if len(spec.columns) > 1:
        raise RuntimeError('spec must have only one column')

    # - join choosers and alts
    # in vanilla interaction_simulate interaction_df is cross join of choosers and alternatives
    # interaction_df = logit.interaction_dataset(choosers, alternatives, sample_size)
    # here, alternatives is sparsely repeated once for each (non-dup) sample
    # we expect alternatives to have same index of choosers (but with duplicate index values)
    # so we just need to left join alternatives with choosers
    assert alternatives.index.name == trip_segment.index.name

    interaction_df = alternatives.join(trip_segment, how='left', rsuffix='_chooser')

    chunk.log_df(trace_label, 'interaction_df', interaction_df)

    if have_trace_targets:
        trace_rows, trace_ids = tracing.interaction_trace_rows(interaction_df, trip_segment)

        tracing.trace_df(interaction_df,
                         tracing.extend_trace_label(trace_label, 'interaction_df'),
                         transpose=False)
    else:
        trace_rows = trace_ids = None

    interaction_utilities, trace_eval_results \
        = interaction_simulate.eval_interaction_utilities(spec, interaction_df, None, trace_label, trace_rows, estimator=None)

    # keep STOP_TIME_DURATION alongside the utility so we can join to patterns
    interaction_utilities = pd.concat([interaction_df[STOP_TIME_DURATION], interaction_utilities], axis=1)
    chunk.log_df(trace_label, 'interaction_utilities', interaction_utilities)

    interaction_utilities = pd.merge(interaction_utilities.reset_index(),
                                     patterns[patterns[TRIP_ID].isin(interaction_utilities.index)],
                                     on=[TRIP_ID, STOP_TIME_DURATION], how='left')

    if have_trace_targets:
        tracing.trace_interaction_eval_results(trace_eval_results, trace_ids,
                                               tracing.extend_trace_label(trace_label, 'eval'))

        tracing.trace_df(interaction_utilities,
                         tracing.extend_trace_label(trace_label, 'interaction_utilities'),
                         transpose=False)

    del interaction_df
    chunk.log_df(trace_label, 'interaction_df', None)

    # sum trip-level utilities up to one utility per (tour, direction, pattern)
    interaction_utilities = interaction_utilities.groupby([TOUR_ID, OUTBOUND, PATTERN_ID],
                                                          as_index=False)[['utility']].sum()

    interaction_utilities[TOUR_LEG_ID] = \
        interaction_utilities.apply(generate_tour_leg_id, axis=1)

    tour_choosers = interaction_utilities.set_index(TOUR_LEG_ID)
    interaction_utilities = tour_choosers[['utility']].copy()

    # reshape utilities (one utility column and one row per row in model_design)
    # to a dataframe with one row per chooser and one column per alternative
    # interaction_utilities is sparse because duplicate sampled alternatives were dropped
    # so we need to pad with dummy utilities so low that they are never chosen

    # number of samples per chooser
    sample_counts = interaction_utilities.groupby(interaction_utilities.index).size().values
    chunk.log_df(trace_label, 'sample_counts', sample_counts)

    # max number of alternatvies for any chooser
    max_sample_count = sample_counts.max()

    # offsets of the first and last rows of each chooser in sparse interaction_utilities
    last_row_offsets = sample_counts.cumsum()
    first_row_offsets = np.insert(last_row_offsets[:-1], 0, 0)

    # repeat the row offsets once for each dummy utility to insert
    # (we want to insert dummy utilities at the END of the list of alternative utilities)
    # inserts is a list of the indices at which we want to do the insertions
    inserts = np.repeat(last_row_offsets, max_sample_count - sample_counts)

    del sample_counts
    chunk.log_df(trace_label, 'sample_counts', None)

    # insert the zero-prob utilities to pad each alternative set to same size
    padded_utilities = np.insert(interaction_utilities.utility.values, inserts, -999)
    del inserts

    del interaction_utilities
    chunk.log_df(trace_label, 'interaction_utilities', None)

    # reshape to array with one row per chooser, one column per alternative
    padded_utilities = padded_utilities.reshape(-1, max_sample_count)
    chunk.log_df(trace_label, 'padded_utilities', padded_utilities)

    # convert to a dataframe with one row per chooser and one column per alternative
    utilities_df = pd.DataFrame(
        padded_utilities,
        index=tour_choosers.index.unique())
    chunk.log_df(trace_label, 'utilities_df', utilities_df)

    del padded_utilities
    chunk.log_df(trace_label, 'padded_utilities', None)

    if have_trace_targets:
        tracing.trace_df(utilities_df,
                         tracing.extend_trace_label(trace_label, 'utilities'),
                         column_labels=['alternative', 'utility'])

    # convert to probabilities (utilities exponentiated and normalized to probs)
    # probs is same shape as utilities, one row per chooser and one column for alternative
    probs = logit.utils_to_probs(utilities_df, trace_label=trace_label,
                                 trace_choosers=trip_segment)
    chunk.log_df(trace_label, 'probs', probs)

    del utilities_df
    chunk.log_df(trace_label, 'utilities_df', None)

    if have_trace_targets:
        tracing.trace_df(probs,
                         tracing.extend_trace_label(trace_label, 'probs'),
                         column_labels=['alternative', 'probability'])

    # make choices
    # positions is series with the chosen alternative represented as a column index in probs
    # which is an integer between zero and num alternatives in the alternative sample
    positions, rands = \
        logit.make_choices(probs, trace_label=trace_label, trace_choosers=trip_segment)

    chunk.log_df(trace_label, 'positions', positions)
    chunk.log_df(trace_label, 'rands', rands)

    del probs
    chunk.log_df(trace_label, 'probs', None)

    # shouldn't have chosen any of the dummy pad utilities
    assert positions.max() < max_sample_count

    # need to get from an integer offset into the alternative sample to the alternative index
    # that is, we want the index value of the row that is offset by <position> rows into the
    # tranche of this choosers alternatives created by cross join of alternatives and choosers
    # resulting pandas Int64Index has one element per chooser row and is in same order as choosers
    choices = tour_choosers[PATTERN_ID].take(positions + first_row_offsets)

    chunk.log_df(trace_label, 'choices', choices)

    if have_trace_targets:
        tracing.trace_df(choices,
                         tracing.extend_trace_label(trace_label, 'choices'),
                         columns=[None, PATTERN_ID])
        tracing.trace_df(rands,
                         tracing.extend_trace_label(trace_label, 'rands'),
                         columns=[None, 'rand'])

    return choices
def extend_tour_counts(persons, tour_counts, alternatives, trace_hh_id, trace_label):
    """
    Extend tour counts based on a probability table.

    Counts can only be extended if the original per-person total is between 1
    and 4, and a tour type can only be extended if its count is at the max
    possible frequency for that type (e.g. 2 for escort, 1 otherwise), so
    escort might be increased to 3 or 4 and other tour types to 2 or 3.

    Note: tour_counts is updated in place (and also returned).

    Parameters
    ----------
    persons : pandas.DataFrame
        only needed for the probs_spec join columns (ptype, has_mandatory_tour,
        has_joint_tour)
    tour_counts : pandas.DataFrame
        one row per person, one column per tour_type; indexed like persons
    alternatives : pandas.DataFrame
        alternatives from the nmtf interaction_simulate; only needed to know
        the max possible frequency for each tour type
    trace_hh_id
        household id to trace, or falsy to disable tracing
    trace_label : str

    Returns
    -------
    tour_counts : pandas.DataFrame
        the (possibly extended) tour_counts, e.g.:

                   escort  shopping  othmaint  othdiscr  eatout  social
        parent_id
        2588676         2         0         0         1       1       0
        2588677         0         1         0         1       0       0
    """

    assert tour_counts.index.name == persons.index.name

    PROBABILITY_COLUMNS = ['0_tours', '1_tours', '2_tours']
    JOIN_COLUMNS = ['ptype', 'has_mandatory_tour', 'has_joint_tour']
    TOUR_TYPE_COL = 'nonmandatory_tour_type'

    probs_spec = extension_probs()
    persons = persons[JOIN_COLUMNS]

    # only extend if there are 1 - 4 non_mandatory tours to start with
    # (renamed from 'extend_tour_counts' which shadowed this function's name)
    eligible_to_extend = tour_counts.sum(axis=1).between(1, 4)
    if not eligible_to_extend.any():
        logger.info("extend_tour_counts - no persons eligible for tour_count extension")
        return tour_counts

    have_trace_targets = trace_hh_id and tracing.has_trace_targets(eligible_to_extend)

    for i, tour_type in enumerate(alternatives.columns):

        # probs_spec nonmandatory_tour_type column is 1-based
        i_tour_type = i + 1

        tour_type_trace_label = tracing.extend_trace_label(trace_label, tour_type)

        # - only extend tour if frequency is max possible frequency for this tour type
        tour_type_is_maxed = \
            eligible_to_extend & (tour_counts[tour_type] == alternatives[tour_type].max())
        maxed_tour_count_idx = tour_counts.index[tour_type_is_maxed]

        if len(maxed_tour_count_idx) == 0:
            continue

        # - get extension probs for this tour_type, one row per maxed chooser
        choosers = pd.merge(
            persons.loc[maxed_tour_count_idx],
            probs_spec[probs_spec[TOUR_TYPE_COL] == i_tour_type],
            on=JOIN_COLUMNS,
            how='left'
        ).set_index(maxed_tour_count_idx)

        assert choosers.index.name == tour_counts.index.name

        # - random choice of extension magnitude based on relative probs
        choices, rands = logit.make_choices(
            choosers[PROBABILITY_COLUMNS],
            trace_label=tour_type_trace_label,
            trace_choosers=choosers)

        # - extend tour_count (0-based prob alternative choice equals magnitude of extension)
        if choices.any():
            tour_counts.loc[choices.index, tour_type] += choices

        if have_trace_targets:
            tracing.trace_df(choices,
                             tracing.extend_trace_label(tour_type_trace_label, 'choices'),
                             columns=[None, 'choice'])
            tracing.trace_df(rands,
                             tracing.extend_trace_label(tour_type_trace_label, 'rands'),
                             columns=[None, 'rand'])

    return tour_counts
def make_scheduling_choices(
        choosers_df, scheduling_mode,
        probs_spec, probs_join_cols,
        depart_alt_base,
        first_trip_in_leg,
        report_failed_trips, trace_hh_id, trace_label,
        trace_choice_col_name='depart',
        clip_earliest_latest=True):
    """
    Make a probabilistic time-period choice for each chooser row.

    Each chooser is joined with the appropriate row of probs_spec on
    probs_join_cols (which must exist in both choosers_df and probs_spec),
    the probs are pre-processed per scheduling_mode, and a weighted random
    choice is made from the resulting probability columns.

    Parameters
    ----------
    choosers_df : pd.DataFrame
        choosers (e.g. trips) with a unique index and the probs_join_cols;
        may also carry 'earliest'/'latest' bound columns used by the final
        sanity assertions when present
    scheduling_mode : str
        Either 'departure' or 'stop_duration', depending on whether the
        probability lookup table is keyed on departure period or stop
        duration (dispatches the pre/post-processing helpers)
    probs_spec : pd.DataFrame
        Dataframe of probs for choice of depart times and join columns to
        match them with choosers. Prob column names are irrelevant; they are
        position-dependent - the time period a column represents is its
        column index + depart_alt_base
    probs_join_cols : list of str
        columns on which choosers_df is left-joined to probs_spec
    depart_alt_base : int
        int to add to a probs column index to get the time period it
        represents, e.g. depart_alt_base = 5 means the first column
        (column 0) represents 5 am
    first_trip_in_leg : bool
        passed through to the scheduling-mode pre-processing helper
    report_failed_trips : bool
        whether to dump failed choosers via _report_bad_choices
    trace_hh_id
        household id to trace, or falsy to disable tracing
    trace_label : str
    trace_choice_col_name : str, default 'depart'
        column label used when tracing the choices series
    clip_earliest_latest : bool, default True
        passed through to the scheduling-mode pre-processing helper

    Returns
    -------
    choices : pd.Series
        time period depart choices, one per chooser row, except rows whose
        post-processed choice failed (e.g. zero probs), which are dropped
    """

    # left join choosers to probs_spec (may produce multiple rows per chooser;
    # the mode-specific pre-processing below reduces to one probs row each)
    choosers = pd.merge(choosers_df.reset_index(), probs_spec, on=probs_join_cols,
                        how='left').set_index(choosers_df.index.name)
    chunk.log_df(trace_label, "choosers", choosers)

    if trace_hh_id and tracing.has_trace_targets(choosers_df):
        tracing.trace_df(choosers, '%s.choosers' % trace_label)

    # different pre-processing is required based on the scheduling mode
    chooser_probs = _preprocess_scheduling_probs(
        scheduling_mode, choosers_df, choosers, probs_spec, probs_join_cols,
        clip_earliest_latest, depart_alt_base, first_trip_in_leg)

    chunk.log_df(trace_label, "chooser_probs", chooser_probs)

    if trace_hh_id and tracing.has_trace_targets(choosers_df):
        tracing.trace_df(chooser_probs, '%s.chooser_probs' % trace_label)

    # weighted random choice: raw_choices are column positions into chooser_probs
    raw_choices, rands = logit.make_choices(chooser_probs,
                                            trace_label=trace_label,
                                            trace_choosers=choosers)

    chunk.log_df(trace_label, "choices", raw_choices)
    chunk.log_df(trace_label, "rands", rands)

    if trace_hh_id and tracing.has_trace_targets(choosers_df):
        tracing.trace_df(raw_choices, '%s.choices' % trace_label,
                         columns=[None, trace_choice_col_name])
        tracing.trace_df(rands, '%s.rands' % trace_label,
                         columns=[None, 'rand'])

    # different post-processing is required based on the scheduling mode
    # (maps raw column positions to time periods and flags failed choices)
    choices, failed = _postprocess_scheduling_choices(
        scheduling_mode, depart_alt_base, raw_choices,
        chooser_probs.columns, choosers_df)

    chunk.log_df(trace_label, "failed", failed)

    # report failed trips while we have the best diagnostic info
    if report_failed_trips and failed.any():
        _report_bad_choices(
            bad_row_map=failed,
            df=choosers,
            filename='failed_choosers',
            trace_label=trace_label,
            trace_choosers=None)

    # trace before removing failures
    if trace_hh_id and tracing.has_trace_targets(choosers_df):
        tracing.trace_df(choices, '%s.choices' % trace_label,
                         columns=[None, trace_choice_col_name])
        tracing.trace_df(rands, '%s.rands' % trace_label,
                         columns=[None, 'rand'])

    # remove any failed choices; when bound columns are present, sanity-check
    # that surviving choices respect each chooser's earliest/latest window
    if failed.any():
        choices = choices[~failed]
        if all([check_col in choosers_df.columns for check_col in ['earliest', 'latest']]):
            assert (choices >= choosers_df.earliest[~failed]).all()
            assert (choices <= choosers_df.latest[~failed]).all()

    return choices