def get_tvpb_logsum(self, path_type, orig, dest, tod, demographic_segment, want_choices, trace_label=None):

    # assume they have given us a more specific name (since there may be more than one active wrapper)
    trace_label = trace_label or 'get_tvpb_logsum'
    trace_label = tracing.extend_trace_label(trace_label, path_type)

    recipe = 'tour_mode_choice'

    with chunk.chunk_log(trace_label):

        logsum_df = \
            self.build_virtual_path(recipe, path_type, orig, dest, tod, demographic_segment,
                                    want_choices=want_choices, trace_label=trace_label)

        trace_hh_id = inject.get_injectable("trace_hh_id", None)
        if trace_hh_id:
            filter_targets = tracing.trace_targets(orig)
            # choices from preceding run (because random numbers)
            override_choices = logsum_df['path_num'] if want_choices else None

            if filter_targets.any():
                self.build_virtual_path(recipe, path_type, orig, dest, tod, demographic_segment,
                                        want_choices=want_choices, override_choices=override_choices,
                                        trace_label=trace_label, filter_targets=filter_targets, trace=True)

    return logsum_df
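# --- illustrative example -------------------------------------------------
# A minimal, self-contained sketch (plain numpy, not the ActivitySim API) of
# the replay pattern used above: path choice consumes random numbers, so the
# traced second pass must be handed override_choices from the first pass
# rather than drawing fresh randoms, or the traced rows would disagree with
# the returned logsums.
import numpy as np

def _choose_path(probs, rands, override=None):
    # pick the first column whose cumulative probability exceeds the rand,
    # unless we are replaying choices from a previous pass
    if override is not None:
        return override
    return np.argmax(probs.cumsum(axis=1) > rands.reshape(-1, 1), axis=1)

_probs = np.array([[0.2, 0.8],
                   [0.7, 0.3]])
_first = _choose_path(_probs, np.random.random(2))                    # first pass draws randoms
_traced = _choose_path(_probs, np.random.random(2), override=_first)  # traced pass replays them
assert (_traced == _first).all()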
def compute_columns(df, model_settings, configs_dir, trace_label=None):
    """
    Evaluate expressions_spec in context of df, with optional additional pipeline tables in locals

    Parameters
    ----------
    df : pandas DataFrame
        or if None, expect name of pipeline table to be specified by DF in model_settings
    model_settings : dict or str
        dict with keys:
            DF - df_alias and (additionally, if df is None) name of pipeline table to load as df
            SPEC - name of expressions file (csv suffix optional) if different from model_settings
            TABLES - list of pipeline tables to load and make available as (read only) locals
        str: name of yaml file in configs_dir to load dict from
    configs_dir
    trace_label

    Returns
    -------
    results: pandas.DataFrame
        one column for each expression (except temps with ALL_CAP target names)
        same index as df
    """

    if isinstance(model_settings, str):
        model_settings_name = model_settings
        model_settings = config.read_model_settings(configs_dir, '%s.yaml' % model_settings)
        assert model_settings, "Found no model settings for %s" % model_settings_name
    else:
        model_settings_name = 'dict'

    assert 'DF' in model_settings, \
        "Expected to find 'DF' in %s" % model_settings_name

    df_name = model_settings.get('DF')
    helper_table_names = model_settings.get('TABLES', [])
    expressions_spec_name = model_settings.get('SPEC', model_settings_name)

    assert expressions_spec_name is not None, \
        "Expected to find 'SPEC' in %s" % model_settings_name

    if trace_label is None:
        trace_label = expressions_spec_name

    if not expressions_spec_name.endswith(".csv"):
        expressions_spec_name = '%s.csv' % expressions_spec_name
    expressions_spec = assign.read_assignment_spec(os.path.join(configs_dir, expressions_spec_name))

    tables = {t: inject.get_table(t).to_frame() for t in helper_table_names}

    # if df was passed in, df might be a slice, or any other table, but DF is its local alias
    assert df_name not in tables, "Did not expect to find df '%s' in TABLES" % df_name
    tables[df_name] = df

    locals_dict = local_utilities()
    locals_dict.update(tables)

    results, trace_results, trace_assigned_locals \
        = assign.assign_variables(expressions_spec,
                                  df,
                                  locals_dict,
                                  trace_rows=tracing.trace_targets(df))

    if trace_results is not None:
        tracing.trace_df(trace_results,
                         label=trace_label,
                         slicer='NONE',
                         warn_if_empty=True)

    if trace_assigned_locals:
        tracing.write_csv(trace_assigned_locals, file_name="%s_locals" % trace_label)

    return results
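# --- illustrative example -------------------------------------------------
# A sketch of the model_settings shape compute_columns expects; the table and
# spec names here are hypothetical, and running the call requires an active
# ActivitySim pipeline, so treat this as documentation rather than a test.
example_settings = {
    'DF': 'persons',              # local alias for df inside the expression file
    'SPEC': 'annotate_persons',   # read as annotate_persons.csv from configs_dir
    'TABLES': ['households'],     # pipeline tables exposed as (read only) locals
}
# annotate_persons.csv would hold assignment expressions, one target per row,
# along the lines of (column layout as consumed by assign.read_assignment_spec):
#   Description,Target,Expression
#   person is an adult,adult,df.age >= 18
# results = compute_columns(persons_df, example_settings, configs_dir)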
def choose_MAZ_for_TAZ(taz_sample, MAZ_size_terms, trips, network_los, alt_dest_col_name, trace_label):
    """
    Convert taz_sample table with TAZ zone sample choices to a table with a MAZ zone chosen for each TAZ
    choose MAZ probabilistically (proportionally by size_term) from set of MAZ zones in parent TAZ

    Parameters
    ----------
    taz_sample: dataframe with duplicated index <chooser_id_col> and columns: <alt_dest_col_name>, prob, pick_count
    MAZ_size_terms: dataframe with duplicated index <chooser_id_col> and columns: zone_id, dest_TAZ, size_term

    Returns
    -------
    dataframe with duplicated index <chooser_id_col> and columns: <alt_dest_col_name>, prob, pick_count
    """

    if len(taz_sample) == 0:
        # it can happen that all trips have no viable destinations (and so are dropped from the sample)
        # in which case we can just return the empty taz_sample, since it has the same columns
        return taz_sample.copy()

    # we had to use alt_dest_col_name as specified in model_settings for interaction_sample
    # because expressions reference it to look up size_terms by trip purpose
    DEST_MAZ = alt_dest_col_name
    DEST_TAZ = f"{alt_dest_col_name}_TAZ"

    taz_sample.rename(columns={alt_dest_col_name: DEST_TAZ}, inplace=True)

    trace_hh_id = inject.get_injectable("trace_hh_id", None)
    have_trace_targets = trace_hh_id and tracing.has_trace_targets(taz_sample)
    if have_trace_targets:
        trace_label = tracing.extend_trace_label(trace_label, 'choose_MAZ_for_TAZ')

        # write taz choices, pick_counts, probs
        trace_targets = tracing.trace_targets(taz_sample)
        tracing.trace_df(taz_sample[trace_targets],
                         label=tracing.extend_trace_label(trace_label, 'taz_sample'),
                         transpose=False)

    # print(f"taz_sample\n{taz_sample}")
    #          alt_dest_TAZ      prob  pick_count
    # trip_id
    # 4343721            12  0.000054           1
    # 4343721            20  0.001864           2

    taz_choices = taz_sample[[DEST_TAZ, 'prob']].reset_index(drop=False)
    taz_choices = taz_choices.reindex(taz_choices.index.repeat(taz_sample.pick_count)).reset_index(drop=True)
    taz_choices = taz_choices.rename(columns={'prob': 'TAZ_prob'})

    # print(f"taz_choices\n{taz_choices}")
    #     trip_id  alt_dest_TAZ      prob
    # 0   4343721            12  0.000054
    # 1   4343721            20  0.001864
    # 2   4343721            20  0.001864

    # print(f"MAZ_size_terms\n{MAZ_size_terms.df}")
    #           work  escort  shopping  eatout  othmaint  social  othdiscr   univ
    # alt_dest
    # 2         31.0   9.930     0.042   0.258     0.560   0.520    10.856  0.042
    # 3          0.0   3.277     0.029   0.000     0.029   0.029     7.308  0.029
    # 4          0.0   1.879     0.023   0.000     0.023   0.023     5.796  0.023

    # just to make it clear we are siloing choices by chooser_id
    chooser_id_col = taz_sample.index.name  # should be canonical chooser index name (e.g. 'trip_id')

    # for random_for_df, we need df with de-duplicated chooser canonical index
    chooser_df = pd.DataFrame(index=taz_sample.index[~taz_sample.index.duplicated()])
    num_choosers = len(chooser_df)
    assert chooser_df.index.name == chooser_id_col

    # to make choices, <taz_sample_size> rands for each chooser (one rand for each sampled TAZ)
    # taz_sample_size will be model_settings['SAMPLE_SIZE'] samples, except if we are estimating
    taz_sample_size = taz_choices.groupby(chooser_id_col)[DEST_TAZ].count().max()

    # taz_choices index values should be contiguous
    assert (taz_choices[chooser_id_col] == np.repeat(chooser_df.index, taz_sample_size)).all()

    # we need to choose a MAZ for each DEST_TAZ choice
    # probability of choosing MAZ based on MAZ size_term fraction of TAZ total
    # there will be a different set (and number) of candidate MAZs for each TAZ
    # (preserve index, which will have duplicates as result of join)
    maz_taz = network_los.maz_taz_df[['MAZ', 'TAZ']].rename(columns={'TAZ': DEST_TAZ, 'MAZ': DEST_MAZ})
    maz_sizes = pd.merge(taz_choices[[chooser_id_col, DEST_TAZ]].reset_index(),
                         maz_taz,
                         how='left', on=DEST_TAZ).set_index('index')

    purpose = maz_sizes['trip_id'].map(trips.purpose)  # size term varies by purpose
    maz_sizes['size_term'] = MAZ_size_terms.get(maz_sizes[DEST_MAZ], purpose)

    # print(f"maz_sizes\n{maz_sizes}")
    #        trip_id  alt_dest_TAZ  alt_dest  size_term
    # index
    # 0      4343721            12      3445      0.019
    # 0      4343721            12     11583      0.017
    # 0      4343721            12     21142      0.020

    if have_trace_targets:
        # write maz_sizes: maz_sizes[index,trip_id,dest_TAZ,zone_id,size_term]
        maz_sizes_trace_targets = tracing.trace_targets(maz_sizes, slicer='trip_id')
        trace_maz_sizes = maz_sizes[maz_sizes_trace_targets]
        tracing.trace_df(trace_maz_sizes,
                         label=tracing.extend_trace_label(trace_label, 'maz_sizes'),
                         transpose=False)

    # number of DEST_TAZ candidates per chooser
    maz_counts = maz_sizes.groupby(maz_sizes.index).size().values

    # max number of MAZs for any TAZ
    max_maz_count = maz_counts.max()

    # offsets of the first and last rows of each chooser in sparse interaction_utilities
    last_row_offsets = maz_counts.cumsum()
    first_row_offsets = np.insert(last_row_offsets[:-1], 0, 0)

    # repeat the row offsets once for each dummy utility to insert
    # (we want to insert dummy utilities at the END of the list of alternative utilities)
    # inserts is a list of the indices at which we want to do the insertions
    inserts = np.repeat(last_row_offsets, max_maz_count - maz_counts)

    # insert zero filler to pad each alternative set to the same size
    padded_maz_sizes = np.insert(maz_sizes.size_term.values, inserts, 0.0).reshape(-1, max_maz_count)

    # prob array with one row per TAZ choice, one column per alternative
    row_sums = padded_maz_sizes.sum(axis=1)
    maz_probs = np.divide(padded_maz_sizes, row_sums.reshape(-1, 1))
    assert maz_probs.shape == (num_choosers * taz_sample_size, max_maz_count)

    rands = pipeline.get_rn_generator().random_for_df(chooser_df, n=taz_sample_size).reshape(-1, 1)
    assert len(rands) == num_choosers * taz_sample_size
    assert len(rands) == maz_probs.shape[0]

    # make choices
    # positions is array with the chosen alternative represented as a column index in probs
    # which is an integer between zero and max_maz_count
    positions = np.argmax((maz_probs.cumsum(axis=1) - rands) > 0.0, axis=1)

    # shouldn't have chosen any of the dummy pad positions
    assert (positions < maz_counts).all()

    taz_choices[DEST_MAZ] = maz_sizes[DEST_MAZ].take(positions + first_row_offsets)
    taz_choices['MAZ_prob'] = maz_probs[np.arange(maz_probs.shape[0]), positions]
    taz_choices['prob'] = taz_choices['TAZ_prob'] * taz_choices['MAZ_prob']

    if have_trace_targets:

        taz_choices_trace_targets = tracing.trace_targets(taz_choices, slicer='trip_id')
        trace_taz_choices_df = taz_choices[taz_choices_trace_targets]
        tracing.trace_df(trace_taz_choices_df,
                         label=tracing.extend_trace_label(trace_label, 'taz_choices'),
                         transpose=False)

        lhs_df = trace_taz_choices_df[['trip_id', DEST_TAZ]]
        alt_dest_columns = [f'dest_maz_{c}' for c in range(max_maz_count)]

        # following the same logic as the full code, but for the trace cutout
        trace_maz_counts = maz_counts[taz_choices_trace_targets]
        trace_last_row_offsets = trace_maz_counts.cumsum()
        trace_inserts = np.repeat(trace_last_row_offsets, max_maz_count - trace_maz_counts)

        # trace dest_maz_alts
        padded_maz_sizes = np.insert(trace_maz_sizes[DEST_MAZ].values, trace_inserts,
                                     0.0).reshape(-1, max_maz_count)
        df = pd.DataFrame(data=padded_maz_sizes, columns=alt_dest_columns, index=trace_taz_choices_df.index)
        df = pd.concat([lhs_df, df], axis=1)
        tracing.trace_df(df,
                         label=tracing.extend_trace_label(trace_label, 'dest_maz_alts'),
                         transpose=False)

        # trace dest_maz_size_terms
        padded_maz_sizes = np.insert(trace_maz_sizes['size_term'].values, trace_inserts,
                                     0.0).reshape(-1, max_maz_count)
        df = pd.DataFrame(data=padded_maz_sizes, columns=alt_dest_columns, index=trace_taz_choices_df.index)
        df = pd.concat([lhs_df, df], axis=1)
        tracing.trace_df(df,
                         label=tracing.extend_trace_label(trace_label, 'dest_maz_size_terms'),
                         transpose=False)

        # trace dest_maz_probs
        df = pd.DataFrame(data=maz_probs[taz_choices_trace_targets],
                          columns=alt_dest_columns,
                          index=trace_taz_choices_df.index)
        df = pd.concat([lhs_df, df], axis=1)
        df['rand'] = rands[taz_choices_trace_targets]
        tracing.trace_df(df,
                         label=tracing.extend_trace_label(trace_label, 'dest_maz_probs'),
                         transpose=False)

    taz_choices = taz_choices.drop(columns=['TAZ_prob', 'MAZ_prob'])
    taz_choices = taz_choices.groupby([chooser_id_col, DEST_MAZ]).agg(prob=('prob', 'max'),
                                                                      pick_count=('prob', 'count'))
    taz_choices.reset_index(level=DEST_MAZ, inplace=True)

    return taz_choices
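# --- illustrative example -------------------------------------------------
# Self-contained numpy sketch of the pad/normalize/argmax trick above: each
# chooser row has a different number of candidate MAZs, so size terms are
# zero-padded at the END of each row to a common width, normalized into
# probabilities, and one uniform rand per row selects the first column whose
# cumulative probability exceeds it. The zero pads can never be selected.
import numpy as np

_sizes = np.array([5.0, 3.0, 2.0,   # chooser row 0: three candidate MAZs
                   4.0, 6.0])       # chooser row 1: two candidate MAZs
_counts = np.array([3, 2])
_max_count = _counts.max()

_last_row_offsets = _counts.cumsum()
_inserts = np.repeat(_last_row_offsets, _max_count - _counts)  # pad positions
_padded = np.insert(_sizes, _inserts, 0.0).reshape(-1, _max_count)

_probs = _padded / _padded.sum(axis=1, keepdims=True)  # rows: [.5,.3,.2], [.4,.6,0]
_rands = np.array([[0.55], [0.30]])                    # one rand per row
_positions = np.argmax((_probs.cumsum(axis=1) - _rands) > 0.0, axis=1)

assert (_positions < _counts).all()  # the zero pads were never chosen
# _positions == [1, 0]: 0.55 lands in [0.5, 0.8) -> column 1; 0.30 < 0.4 -> column 0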
def choose_MAZ_for_TAZ(taz_sample, MAZ_size_terms, trace_label):
    """
    Convert taz_sample table with TAZ zone sample choices to a table with a MAZ zone chosen for each TAZ
    choose MAZ probabilistically (proportionally by size_term) from set of MAZ zones in parent TAZ

    Parameters
    ----------
    taz_sample: dataframe with duplicated index <chooser_id_col> and columns: <DEST_TAZ>, prob, pick_count
    MAZ_size_terms: dataframe with duplicated index <chooser_id_col> and columns: zone_id, dest_TAZ, size_term

    Returns
    -------
    dataframe with duplicated index <chooser_id_col> and columns: <DEST_MAZ>, prob, pick_count
    """

    # DEST_TAZ and DEST_MAZ are module-level column-name constants (e.g. 'dest_TAZ', 'dest_MAZ')

    # print(f"taz_sample\n{taz_sample}")
    #          dest_TAZ      prob  pick_count  person_id
    # tour_id
    # 542963         18  0.004778           1      13243
    # 542963         53  0.004224           2      13243
    # 542963         59  0.008628           1      13243

    trace_hh_id = inject.get_injectable("trace_hh_id", None)
    have_trace_targets = trace_hh_id and tracing.has_trace_targets(taz_sample)

    if have_trace_targets:
        trace_label = tracing.extend_trace_label(trace_label, 'choose_MAZ_for_TAZ')

        CHOOSER_ID = taz_sample.index.name  # tour_id for tours, but person_id for location choice
        assert CHOOSER_ID is not None

        # write taz choices, pick_counts, probs
        trace_targets = tracing.trace_targets(taz_sample)
        tracing.trace_df(taz_sample[trace_targets],
                         label=tracing.extend_trace_label(trace_label, 'taz_sample'),
                         transpose=False)

    # redupe taz_sample[[DEST_TAZ, 'prob']] using pick_count to repeat rows
    taz_choices = taz_sample[[DEST_TAZ, 'prob']].reset_index(drop=False)
    taz_choices = taz_choices.reindex(taz_choices.index.repeat(taz_sample.pick_count)).reset_index(drop=True)
    taz_choices = taz_choices.rename(columns={'prob': 'TAZ_prob'})

    # print(f"taz_choices\n{taz_choices}")
    #     tour_id  dest_TAZ  TAZ_prob
    # 0    542963        18  0.004778
    # 1    542963        53  0.004224
    # 2    542963        53  0.004224
    # 3    542963        59  0.008628

    # print(f"MAZ_size_terms\n{MAZ_size_terms}")
    #     zone_id  dest_TAZ  size_term
    # 0      6097         2      7.420
    # 1     16421         2      9.646
    # 2     24251         2     10.904

    # just to make it clear we are siloing choices by chooser_id
    chooser_id_col = taz_sample.index.name  # should be canonical chooser index name (e.g. 'person_id')

    # for random_for_df, we need df with de-duplicated chooser canonical index
    chooser_df = pd.DataFrame(index=taz_sample.index[~taz_sample.index.duplicated()])
    num_choosers = len(chooser_df)
    assert chooser_df.index.name == chooser_id_col

    # to make choices, <taz_sample_size> rands for each chooser (one rand for each sampled TAZ)
    # taz_sample_size will be model_settings['SAMPLE_SIZE'] samples, except if we are estimating
    taz_sample_size = taz_choices.groupby(chooser_id_col)[DEST_TAZ].count().max()

    # taz_choices index values should be contiguous
    assert (taz_choices[chooser_id_col] == np.repeat(chooser_df.index, taz_sample_size)).all()

    # we need to choose a MAZ for each DEST_TAZ choice
    # probability of choosing MAZ based on MAZ size_term fraction of TAZ total
    # there will be a different set (and number) of candidate MAZs for each TAZ
    # (preserve index, which will have duplicates as result of join)
    # maz_sizes.index is the integer offset into taz_choices of the taz for which the maz_size row is a candidate
    maz_sizes = pd.merge(taz_choices[[chooser_id_col, DEST_TAZ]].reset_index(),
                         MAZ_size_terms,
                         how='left', on=DEST_TAZ).set_index('index')

    #        tour_id  dest_TAZ  zone_id  size_term
    # index
    # 0       542963        18      498     12.130
    # 0       542963        18     7696     18.550
    # 0       542963        18    15431      8.678
    # 0       542963        18    21429     29.938
    # 1       542963        53    17563     34.252

    if have_trace_targets:
        # write maz_sizes: maz_sizes[index,tour_id,dest_TAZ,zone_id,size_term]
        maz_sizes_trace_targets = tracing.trace_targets(maz_sizes, slicer=CHOOSER_ID)
        trace_maz_sizes = maz_sizes[maz_sizes_trace_targets]
        tracing.trace_df(trace_maz_sizes,
                         label=tracing.extend_trace_label(trace_label, 'maz_sizes'),
                         transpose=False)

    # number of DEST_TAZ candidates per chooser
    maz_counts = maz_sizes.groupby(maz_sizes.index).size().values

    # max number of MAZs for any TAZ
    max_maz_count = maz_counts.max()

    # offsets of the first and last rows of each chooser in sparse interaction_utilities
    last_row_offsets = maz_counts.cumsum()
    first_row_offsets = np.insert(last_row_offsets[:-1], 0, 0)

    # repeat the row offsets once for each dummy utility to insert
    # (we want to insert dummy utilities at the END of the list of alternative utilities)
    # inserts is a list of the indices at which we want to do the insertions
    inserts = np.repeat(last_row_offsets, max_maz_count - maz_counts)

    # insert zero filler to pad each alternative set to the same size
    padded_maz_sizes = np.insert(maz_sizes.size_term.values, inserts, 0.0).reshape(-1, max_maz_count)

    # prob array with one row per TAZ choice, one column per alternative
    row_sums = padded_maz_sizes.sum(axis=1)
    maz_probs = np.divide(padded_maz_sizes, row_sums.reshape(-1, 1))
    assert maz_probs.shape == (num_choosers * taz_sample_size, max_maz_count)

    rands = pipeline.get_rn_generator().random_for_df(chooser_df, n=taz_sample_size)
    rands = rands.reshape(-1, 1)
    assert len(rands) == num_choosers * taz_sample_size
    assert len(rands) == maz_probs.shape[0]

    # make choices
    # positions is array with the chosen alternative represented as a column index in probs
    # which is an integer between zero and max_maz_count
    positions = np.argmax((maz_probs.cumsum(axis=1) - rands) > 0.0, axis=1)

    # shouldn't have chosen any of the dummy pad positions
    assert (positions < maz_counts).all()

    taz_choices[DEST_MAZ] = maz_sizes['zone_id'].take(positions + first_row_offsets)
    taz_choices['MAZ_prob'] = maz_probs[np.arange(maz_probs.shape[0]), positions]
    taz_choices['prob'] = taz_choices['TAZ_prob'] * taz_choices['MAZ_prob']

    if have_trace_targets:

        taz_choices_trace_targets = tracing.trace_targets(taz_choices, slicer=CHOOSER_ID)
        trace_taz_choices_df = taz_choices[taz_choices_trace_targets]
        tracing.trace_df(trace_taz_choices_df,
                         label=tracing.extend_trace_label(trace_label, 'taz_choices'),
                         transpose=False)

        lhs_df = trace_taz_choices_df[[CHOOSER_ID, DEST_TAZ]]
        alt_dest_columns = [f'dest_maz_{c}' for c in range(max_maz_count)]

        # following the same logic as the full code, but for the trace cutout
        trace_maz_counts = maz_counts[taz_choices_trace_targets]
        trace_last_row_offsets = trace_maz_counts.cumsum()
        trace_inserts = np.repeat(trace_last_row_offsets, max_maz_count - trace_maz_counts)

        # trace dest_maz_alts (the candidate MAZ ids, i.e. zone_id, padded out to max_maz_count per row)
        padded_maz_sizes = np.insert(trace_maz_sizes['zone_id'].values, trace_inserts,
                                     0.0).reshape(-1, max_maz_count)
        df = pd.DataFrame(data=padded_maz_sizes, columns=alt_dest_columns, index=trace_taz_choices_df.index)
        df = pd.concat([lhs_df, df], axis=1)
        tracing.trace_df(df,
                         label=tracing.extend_trace_label(trace_label, 'dest_maz_alts'),
                         transpose=False)

        # trace dest_maz_size_terms
        padded_maz_sizes = np.insert(trace_maz_sizes['size_term'].values, trace_inserts,
                                     0.0).reshape(-1, max_maz_count)
        df = pd.DataFrame(data=padded_maz_sizes, columns=alt_dest_columns, index=trace_taz_choices_df.index)
        df = pd.concat([lhs_df, df], axis=1)
        tracing.trace_df(df,
                         label=tracing.extend_trace_label(trace_label, 'dest_maz_size_terms'),
                         transpose=False)

        # trace dest_maz_probs
        df = pd.DataFrame(data=maz_probs[taz_choices_trace_targets],
                          columns=alt_dest_columns,
                          index=trace_taz_choices_df.index)
        df = pd.concat([lhs_df, df], axis=1)
        df['rand'] = rands[taz_choices_trace_targets]
        tracing.trace_df(df,
                         label=tracing.extend_trace_label(trace_label, 'dest_maz_probs'),
                         transpose=False)

    taz_choices = taz_choices.drop(columns=['TAZ_prob', 'MAZ_prob'])
    taz_choices = taz_choices.groupby([chooser_id_col, DEST_MAZ]).agg(prob=('prob', 'max'),
                                                                      pick_count=('prob', 'count'))
    taz_choices.reset_index(level=DEST_MAZ, inplace=True)

    return taz_choices
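# --- illustrative example -------------------------------------------------
# Self-contained pandas sketch of the pick_count round trip shared by both
# choose_MAZ_for_TAZ variants: sampled TAZ rows are re-duplicated pick_count
# times before the MAZ draw, and the final groupby collapses repeated
# (chooser, MAZ) picks back into a pick_count.
import pandas as pd

_taz_sample = pd.DataFrame(
    {'dest_TAZ': [18, 53], 'prob': [0.004778, 0.004224], 'pick_count': [1, 2]},
    index=pd.Index([542963, 542963], name='tour_id'))

_taz_choices = _taz_sample[['dest_TAZ', 'prob']].reset_index(drop=False)
_taz_choices = _taz_choices.reindex(
    _taz_choices.index.repeat(_taz_sample.pick_count)).reset_index(drop=True)
# three rows now: TAZ 18 once, TAZ 53 twice

# suppose both TAZ 53 draws landed on the same MAZ
_taz_choices['dest_MAZ'] = [498, 17563, 17563]
_collapsed = _taz_choices.groupby(['tour_id', 'dest_MAZ']).agg(
    prob=('prob', 'max'), pick_count=('prob', 'count'))
# MAZ 17563 comes back with pick_count == 2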
def compute_columns(df, model_settings, locals_dict={}, trace_label=None):
    """
    Evaluate expressions_spec in context of df, with optional additional pipeline tables in locals

    Parameters
    ----------
    df : pandas DataFrame
        or if None, expect name of pipeline table to be specified by DF in model_settings
    model_settings : dict or str
        dict with keys:
            DF - df_alias and (additionally, if df is None) name of pipeline table to load as df
            SPEC - name of expressions file (csv suffix optional) if different from model_settings
            TABLES - list of pipeline tables to load and make available as (read only) locals
        str: name of yaml file in configs_dir to load dict from
    locals_dict : dict
        dict of locals (e.g. utility functions) to add to the execution environment
    trace_label

    Returns
    -------
    results: pandas.DataFrame
        one column for each expression (except temps with ALL_CAP target names)
        same index as df
    """

    if isinstance(model_settings, str):
        model_settings_name = model_settings
        model_settings = config.read_model_settings('%s.yaml' % model_settings)
        assert model_settings, "Found no model settings for %s" % model_settings_name
    else:
        model_settings_name = 'dict'
        assert isinstance(model_settings, dict)

    assert 'DF' in model_settings, \
        "Expected to find 'DF' in %s" % model_settings_name

    df_name = model_settings.get('DF')
    helper_table_names = model_settings.get('TABLES', [])
    expressions_spec_name = model_settings.get('SPEC', None)

    assert expressions_spec_name is not None, \
        "Expected to find 'SPEC' in %s" % model_settings_name

    trace_label = tracing.extend_trace_label(trace_label or '', expressions_spec_name)

    if not expressions_spec_name.endswith(".csv"):
        expressions_spec_name = '%s.csv' % expressions_spec_name
    logger.debug(f"{trace_label} compute_columns using expression spec file {expressions_spec_name}")
    expressions_spec = assign.read_assignment_spec(config.config_file_path(expressions_spec_name))

    assert expressions_spec.shape[0] > 0, \
        "Expected to find some assignment expressions in %s" % expressions_spec_name

    tables = {t: inject.get_table(t).to_frame() for t in helper_table_names}

    # if df was passed in, df might be a slice, or any other table, but DF is its local alias
    assert df_name not in tables, "Did not expect to find df '%s' in TABLES" % df_name
    tables[df_name] = df

    # be nice and also give it to them as df?
    tables['df'] = df

    _locals_dict = assign.local_utilities()
    _locals_dict.update(locals_dict)
    _locals_dict.update(tables)

    # FIXME a number of asim model preprocessors want skim_dict - should they request it in model_settings.TABLES?
    _locals_dict.update({
        # 'los': inject.get_injectable('network_los', None),
        'skim_dict': inject.get_injectable('skim_dict', None),
    })

    results, trace_results, trace_assigned_locals \
        = assign.assign_variables(expressions_spec,
                                  df,
                                  _locals_dict,
                                  trace_rows=tracing.trace_targets(df))

    if trace_results is not None:
        tracing.trace_df(trace_results, label=trace_label, slicer='NONE')

    if trace_assigned_locals:
        tracing.write_csv(trace_assigned_locals, file_name="%s_locals" % trace_label)

    return results
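# --- illustrative example -------------------------------------------------
# A hedged sketch of the newer signature: extra helpers can be injected via
# locals_dict for use inside the expression file. The spec name, chooser table,
# and helper names below are hypothetical, and the call needs a running
# ActivitySim pipeline, so it is left commented out.
import numpy as np

extra_locals = {'max_walk_dist': 1.5, 'log1p': np.log1p}
# results = compute_columns(df=tours_df,
#                           model_settings='annotate_tours',   # reads annotate_tours.yaml
#                           locals_dict=extra_locals,
#                           trace_label='annotate_tours')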