def interaction_simulate(
        choosers, alternatives, spec,
        skims=None, locals_d=None, sample_size=None,
        chunk_size=0, trace_label=None, trace_choice_name=None):
    """
    Run a simulation in the situation in which alternatives must
    be merged with choosers because there are interaction terms or
    because alternatives are being sampled.

    Optionally (if chunk_size > 0) iterates over choosers in chunk_size chunks.

    Parameters
    ----------
    choosers : pandas.DataFrame
        DataFrame of choosers
    alternatives : pandas.DataFrame
        DataFrame of alternatives - will be merged with choosers, currently
        without sampling
    spec : pandas.DataFrame
        A Pandas DataFrame that gives the specification of the variables to
        compute and the coefficients for each variable.
        Variable specifications must be in the table index and the
        table should have only one column of coefficients.
    skims : Skims object
        The skims object is used to contain multiple matrices of
        origin-destination impedances.  Make sure to also add it to the
        locals_d below in order to access it in expressions.  The *only* job
        of this method in regards to skims is to call set_df with the
        dataframe that comes back from interacting choosers with
        alternatives.  See the skims module for more documentation on how
        the skims object is intended to be used.
    locals_d : Dict
        This is a dictionary of local variables that will be the environment
        for an evaluation of an expression that begins with @
    sample_size : int, optional
        Sample alternatives with sample of given size.  By default is None,
        which does not sample alternatives.
    chunk_size : int
        if chunk_size > 0 iterates over choosers in chunk_size chunks
    trace_label: str
        This is the label to be used for trace log file entries and dump file names
        when household tracing enabled. No tracing occurs if label is empty or None.
    trace_choice_name: str
        This is the column label to be used in trace file csv dump of choices

    Returns
    -------
    ret : pandas.Series
        A series where index should match the index of the choosers DataFrame
        and values will match the index of the alternatives DataFrame -
        choices are simulated in the standard Monte Carlo fashion
    """

    # FIXME - chunk size should take number of chooser and alternative columns into account
    # FIXME - that is, chunk size should represent memory footprint (rows X columns) not just rows

    chunk_size = int(chunk_size)
    num_choosers = len(choosers.index)

    # no chunking requested, or a single chunk would already cover every chooser
    if (chunk_size == 0) or (chunk_size >= num_choosers):
        return _interaction_simulate(choosers, alternatives, spec,
                                     skims, locals_d, sample_size,
                                     trace_label, trace_choice_name)

    logger.info("interaction_simulate chunk_size %s num_choosers %s",
                chunk_size, num_choosers)

    choices_list = []
    # simulate each chunk of choosers separately, giving each chunk its own trace label
    for i, chooser_chunk in chunked_choosers(choosers, chunk_size):

        logger.info("Running chunk %s of size %d", i, len(chooser_chunk))

        choices = _interaction_simulate(
            chooser_chunk, alternatives, spec,
            skims, locals_d, sample_size,
            tracing.extend_trace_label(trace_label, 'chunk_%s' % i),
            trace_choice_name)

        choices_list.append(choices)

    # FIXME: this will require 2X RAM
    # if necessary, could append to hdf5 store on disk:
    # http://pandas.pydata.org/pandas-docs/stable/io.html#id2
    choices = pd.concat(choices_list)

    # every chooser must have received exactly one choice
    # (the original parenthesisation measured the length of a boolean array instead of
    # comparing the two lengths, so the check could never fail for non-empty results)
    assert len(choices.index) == len(choosers.index)

    return choices
def compute_logsums(choosers, logsum_spec, logsum_settings,
                    skim_dict, skim_stack, alt_col_name,
                    chunk_size, trace_hh_id, trace_label):
    """
    Compute logsums for choosers by delegating to
    asim_simulate.simple_simulate_logsums with three skim wrappers exposed
    to the spec expressions as odt_skims, dot_skims and od_skims.

    Parameters
    ----------
    choosers
        table of choosers; logsums are returned on its index
    logsum_spec
        spec passed through to simple_simulate_logsums
    logsum_settings
        settings from which the nest spec and model constants are read
    skim_dict
        wrapped on ('TAZ', alt_col_name) to provide od_skims
    skim_stack
        wrapped twice (out_period and in_period) to provide odt_skims and dot_skims
    alt_col_name
        name of the chooser column holding the alternative used as the skim key
    chunk_size
        passed through to simple_simulate_logsums
    trace_hh_id
        if truthy, the logsum spec is dumped to the trace output
    trace_label
        base trace label, extended with 'compute_logsums'

    Returns
    -------
    logsums: pandas series
        computed logsums with same index as choosers
    """

    trace_label = tracing.extend_trace_label(trace_label, 'compute_logsums')

    nest_spec = get_logit_model_settings(logsum_settings)
    constants = get_model_constants(logsum_settings)

    print("Running compute_logsums with %d choosers" % len(choosers.index))

    if trace_hh_id:
        spec_label = tracing.extend_trace_label(trace_label, 'spec')
        tracing.trace_df(logsum_spec, spec_label, slicer='NONE', transpose=False)

    # setup skim keys
    odt_wrapper = skim_stack.wrap(left_key='TAZ', right_key=alt_col_name,
                                  skim_key="out_period")
    dot_wrapper = skim_stack.wrap(left_key=alt_col_name, right_key='TAZ',
                                  skim_key="in_period")
    od_wrapper = skim_dict.wrap('TAZ', alt_col_name)

    # names under which the wrappers are visible to spec expressions
    locals_d = {
        "odt_skims": odt_wrapper,
        "dot_skims": dot_wrapper,
        "od_skims": od_wrapper,
    }
    if constants is not None:
        locals_d.update(constants)

    return asim_simulate.simple_simulate_logsums(
        choosers,
        logsum_spec,
        nest_spec,
        skims=[odt_wrapper, dot_wrapper, od_wrapper],
        locals_d=locals_d,
        chunk_size=chunk_size,
        trace_label=trace_label)
def eval_nl(choosers, spec, nest_spec, locals_d=None, trace_label=None, trace_choice_name=None):
    """
    Simulate nested-logit choices for a model whose spec has no
    alternative-specific data (no interaction with alternative properties
    and no sampling of alternatives).

    Parameters
    ----------
    choosers : pandas.DataFrame
    spec : pandas.DataFrame
        A table of variable specifications and coefficient values.
        Variable expressions should be in the table index and the table
        should have a column for each alternative.
    nest_spec:
        dictionary specifying nesting structure and nesting coefficients
        (from the model spec yaml file)
    locals_d : Dict
        This is a dictionary of local variables that will be the environment
        for an evaluation of an expression that begins with @
    trace_label: str
        This is the label to be used for trace log file entries and dump file names
        when household tracing enabled. No tracing occurs if label is empty or None.
    trace_choice_name: str
        This is the column label to be used in trace file csv dump of choices

    Returns
    -------
    choices : pandas.Series
        Index will be that of `choosers`, values will match the columns
        of `spec`.
    """

    trace_label = tracing.extend_trace_label(trace_label, 'nl')
    variability_check_enabled = tracing.check_for_variability()

    # one column per spec expression (column names match spec index values)
    design = eval_variables(spec.index, choosers, locals_d)

    if variability_check_enabled:
        _check_for_variability(design, trace_label)

    # raw utilities of all the leaves: the matrix product sums the partial utilities of
    # each spec row, giving one row per chooser and one column per alternative
    # (DataFrame.dot relies on design column names matching spec index values)
    leaf_utilities = design.dot(spec)

    # exponentiated utilities of leaves and nests
    exp_utilities = compute_nested_exp_utilities(leaf_utilities, nest_spec)

    # probabilities of alternatives relative to siblings sharing the same nest
    sibling_probs = compute_nested_probabilities(exp_utilities, nest_spec,
                                                 trace_label=trace_label)

    # global (flattened) leaf probabilities based on relative nest coefficients
    leaf_probs = compute_base_probabilities(sibling_probs, nest_spec)

    # leaf probabilities could all be zero since all-zero nest probabilities are allowed;
    # check here to print a clear message (make_choices raises if probs don't sum to 1)
    BAD_PROB_THRESHOLD = 0.001
    ones = np.ones(len(leaf_probs.index))
    row_sums = leaf_probs.sum(axis=1)
    no_choices = row_sums.sub(ones).abs() > BAD_PROB_THRESHOLD * ones

    if no_choices.any():
        report_bad_choices(no_choices, leaf_probs,
                           tracing.extend_trace_label(trace_label, 'eval_nl'),
                           tag='bad_probs',
                           msg="base_probabilities all zero")

    choices = make_choices(leaf_probs, trace_label, trace_choosers=choosers)

    if trace_label:
        tracing.trace_df(choosers, '%s.choosers' % trace_label)

        # the four per-alternative tables share the same trace layout
        per_alternative_tables = (
            (leaf_utilities, 'raw_utilities', 'utility'),
            (exp_utilities, 'nested_exp_utilities', 'utility'),
            (sibling_probs, 'nested_probabilities', 'probability'),
            (leaf_probs, 'base_probabilities', 'probability'),
        )
        for table, suffix, value_label in per_alternative_tables:
            tracing.trace_df(table, '%s.%s' % (trace_label, suffix),
                             column_labels=['alternative', value_label])

        tracing.trace_df(choices, '%s.choices' % trace_label,
                         columns=[None, trace_choice_name])
        tracing.trace_df(design, '%s.model_design' % trace_label,
                         column_labels=['expression', None])

    return choices
def _interaction_simulate(
        choosers, alternatives, spec,
        skims=None, locals_d=None, sample_size=None,
        trace_label=None, trace_choice_name=None):
    """
    Run a MNL simulation in the situation in which alternatives must
    be merged with choosers because there are interaction terms or
    because alternatives are being sampled.

    Parameters are same as for public function interaction_simulate

    NOTE: when skims are supplied, the index of `alternatives` is copied into a
    column of the caller's `alternatives` DataFrame (in place).

    spec : dataframe
        one row per spec expression and one col with utility coefficient

    interaction_df : dataframe
        cross join (cartesian product) of choosers with alternatives
        combines columns of choosers and alternatives
        len(df) == len(choosers) * len(alternatives)
        index values (non-unique) are index values from alternatives df

    interaction_utilities : dataframe
        the utility of each alternative is sum of the partial utilities determined by the
        various spec expressions and their corresponding coefficients
        yielding a dataframe with len(interaction_df) rows and one utility column
        having the same index as interaction_df (non-unique values from alternatives df)

    utilities : dataframe
        dot product of model_design.dot(spec)
        yields utility value for element in the cross product of choosers and alternatives
        this is then reshaped as a dataframe with one row per chooser and one column per alternative

    probs : dataframe
        utilities exponentiated and converted to probabilities
        same shape as utilities, one row per chooser and one column for alternative

    positions : series
        choices among alternatives with the chosen alternative represented
        as the integer index of the selected alternative column in probs

    choices : series
        series with the alternative chosen for each chooser
        the index is same as choosers
        and the series value is the alternative df index of chosen alternative

    Returns
    -------
    ret : pandas.Series
        A series where index should match the index of the choosers DataFrame
        and values will match the index of the alternatives DataFrame -
        choices are simulated in the standard Monte Carlo fashion

    Raises
    ------
    RuntimeError
        if spec has more than one column
    """

    trace_label = tracing.extend_trace_label(trace_label, 'interaction_simulate')
    have_trace_targets = trace_label and tracing.has_trace_targets(choosers)

    if have_trace_targets:
        tracing.trace_df(choosers, tracing.extend_trace_label(trace_label, 'choosers'))
        tracing.trace_df(alternatives, tracing.extend_trace_label(trace_label, 'alternatives'),
                         slicer='NONE', transpose=False)

    if len(spec.columns) > 1:
        raise RuntimeError('spec must have only one column')

    # a falsy sample_size (None or 0) means "use every alternative"
    sample_size = sample_size or len(alternatives)

    if sample_size > len(alternatives):
        # Logger.warning replaces the deprecated Logger.warn alias
        logger.warning("clipping sample size %s to len(alternatives) %s",
                       sample_size, len(alternatives))
        sample_size = min(sample_size, len(alternatives))

    # if using skims, copy index into the dataframe, so it will be
    # available as the "destination" for the skims dereference below
    if skims:
        alternatives[alternatives.index.name] = alternatives.index

    # cross join choosers and alternatives (cartesian product)
    # for every chooser, there will be a row for each alternative
    # index values (non-unique) are from alternatives df
    interaction_df = interaction_dataset(choosers, alternatives, sample_size)

    if skims:
        add_skims(interaction_df, skims)

    # evaluate expressions from the spec multiply by coefficients and sum
    # spec is df with one row per spec expression and one col with utility coefficient
    # column names of model_design match spec index values
    # utilities has utility value for element in the cross product of choosers and alternatives
    # interaction_utilities is a df with one utility column and one row per row in model_design
    if have_trace_targets:
        trace_rows, trace_ids = tracing.interaction_trace_rows(interaction_df, choosers)

        tracing.trace_df(interaction_df[trace_rows],
                         tracing.extend_trace_label(trace_label, 'interaction_df'),
                         slicer='NONE', transpose=False)
    else:
        trace_rows = trace_ids = None

    interaction_utilities, trace_eval_results \
        = eval_interaction_utilities(spec, interaction_df, locals_d, trace_label, trace_rows)

    if have_trace_targets:
        tracing.trace_interaction_eval_results(
            trace_eval_results, trace_ids,
            tracing.extend_trace_label(trace_label, 'eval'))

        tracing.trace_df(interaction_utilities[trace_rows],
                         tracing.extend_trace_label(trace_label, 'interaction_utilities'),
                         slicer='NONE', transpose=False)

    # reshape utilities (one utility column and one row per row in model_design)
    # to a dataframe with one row per chooser and one column per alternative
    # (.values replaces DataFrame.as_matrix(), which was removed in pandas 1.0)
    utilities = pd.DataFrame(
        interaction_utilities.values.reshape(len(choosers), sample_size),
        index=choosers.index)

    if have_trace_targets:
        tracing.trace_df(utilities, tracing.extend_trace_label(trace_label, 'utilities'),
                         column_labels=['alternative', 'utility'])

    # convert to probabilities (utilities exponentiated and normalized to probs)
    # probs is same shape as utilities, one row per chooser and one column for alternative
    probs = utils_to_probs(utilities, trace_label=trace_label, trace_choosers=choosers)

    if have_trace_targets:
        tracing.trace_df(probs, tracing.extend_trace_label(trace_label, 'probs'),
                         column_labels=['alternative', 'probability'])

    # make choices
    # positions is series with the chosen alternative represented as a column index in probs
    # which is an integer between zero and num alternatives in the alternative sample
    positions = make_choices(probs, trace_label=trace_label, trace_choosers=choosers)

    # need to get from an integer offset into the alternative sample to the alternative index
    # that is, we want the index value of the row that is offset by `position` rows into the
    # tranche of this chooser's alternatives created by cross join of alternatives and choosers

    # offsets is the offset into model_design df of first row of chooser alternatives
    offsets = np.arange(len(positions)) * sample_size
    # resulting pandas Int64Index has one element per chooser row and is in same order as choosers
    choices = interaction_utilities.index.take(positions + offsets)

    # create a series with index from choosers and the index of the chosen alternative
    choices = pd.Series(choices, index=choosers.index)

    if have_trace_targets:
        tracing.trace_df(choices, tracing.extend_trace_label(trace_label, 'choices'),
                         columns=[None, trace_choice_name])

    return choices
def utils_to_probs(utils, trace_label=None,
                   exponentiated=False, allow_zero_probs=False, trace_choosers=None):
    """
    Convert a table of utilities to probabilities.

    Parameters
    ----------
    utils : pandas.DataFrame
        Rows should be choosers and columns should be alternatives.

    trace_label : str
        label for tracing bad utility or probability values

    exponentiated : bool
        True if utilities have already been exponentiated

    allow_zero_probs : bool
        if True, rows in which all utility alts are EXP_UTIL_MIN will result
        in rows in probs having all zero probability (and not summing to 1.0).
        This is for the benefit of calculating probabilities of nested logit nests.

    trace_choosers : pandas.dataframe
        the choosers df (for interaction_simulate) to facilitate the reporting of hh_id
        by report_bad_choices because it can't deduce hh_id from the interaction_dataset
        which is indexed on index values from alternatives df

    Returns
    -------
    probs : pandas.DataFrame
        Will have the same index and columns as `utils`.

    """
    trace_label = tracing.extend_trace_label(trace_label, 'utils_to_probs')

    # .values replaces DataFrame.as_matrix() (removed in pandas 1.0); astype returns a
    # new array, so the in-place operations below never modify the caller's DataFrame
    utils_arr = utils.values.astype('float')

    if not exponentiated:
        utils_arr = np.exp(utils_arr)

    EXP_UTIL_MIN = 1e-300
    EXP_UTIL_MAX = np.inf

    np.clip(utils_arr, EXP_UTIL_MIN, EXP_UTIL_MAX, out=utils_arr)

    # FIXME
    # values that were clipped up to the floor are treated as exactly zero
    utils_arr = np.where(utils_arr == EXP_UTIL_MIN, 0.0, utils_arr)

    arr_sum = utils_arr.sum(axis=1)

    zero_probs = (arr_sum == 0.0)
    if zero_probs.any() and not allow_zero_probs:
        report_bad_choices(zero_probs, utils,
                           tracing.extend_trace_label(trace_label, 'zero_prob_utils'),
                           msg="all probabilities are zero",
                           trace_choosers=trace_choosers)

    inf_utils = np.isinf(arr_sum)
    if inf_utils.any():
        report_bad_choices(inf_utils, utils,
                           tracing.extend_trace_label(trace_label, 'inf_exp_utils'),
                           msg="infinite exponentiated utilities",
                           trace_choosers=trace_choosers)

    # if allow_zero_probs, all-zero rows divide 0.0 by 0.0; the resulting NaNs are
    # replaced just below, so silence the expected floating point warnings here
    with np.errstate(divide='ignore', invalid='ignore'):
        np.divide(utils_arr, arr_sum.reshape(len(utils_arr), 1), out=utils_arr)

    PROB_MIN = 0.0
    PROB_MAX = 1.0

    # if allow_zero_probs, this will cause EXP_UTIL_MIN util rows to have all zero probabilities
    utils_arr[np.isnan(utils_arr)] = PROB_MIN

    np.clip(utils_arr, PROB_MIN, PROB_MAX, out=utils_arr)

    probs = pd.DataFrame(utils_arr, columns=utils.columns, index=utils.index)

    return probs
def eval_mnl(choosers, spec, locals_d=None, trace_label=None, trace_choice_name=None):
    """
    Simulate multinomial-logit choices for a model whose spec has no
    alternative-specific data (no interaction with alternative properties
    and no sampling of alternatives).

    Each spec row contributes a partial utility to every alternative: the
    row's expression (often a boolean 0-1 trigger) is evaluated per chooser
    and weighted by that row's coefficient for each alternative. Matrix
    multiplication of the evaluated expressions with the coefficient columns
    gives one row per chooser and one column per alternative.

    Parameters
    ----------
    choosers : pandas.DataFrame
    spec : pandas.DataFrame
        A table of variable specifications and coefficient values.
        Variable expressions should be in the table index and the table
        should have a column for each alternative.
    locals_d : Dict
        This is a dictionary of local variables that will be the environment
        for an evaluation of an expression that begins with @
    trace_label: str
        This is the label to be used for trace log file entries and dump file names
        when household tracing enabled. No tracing occurs if label is empty or None.
    trace_choice_name: str
        This is the column label to be used in trace file csv dump of choices

    Returns
    -------
    choices : pandas.Series
        Index will be that of `choosers`, values will match the columns
        of `spec`.
    """

    trace_label = tracing.extend_trace_label(trace_label, 'mnl')
    variability_check_enabled = tracing.check_for_variability()

    design = eval_variables(spec.index, choosers, locals_d)
    if variability_check_enabled:
        _check_for_variability(design, trace_label)

    # DataFrame.dot relies on design column names matching spec index values;
    # the product sums the partial utilities of every spec row per alternative
    alt_utilities = design.dot(spec)

    alt_probs = utils_to_probs(alt_utilities, trace_label=trace_label,
                               trace_choosers=choosers)
    choices = make_choices(alt_probs, trace_label=trace_label, trace_choosers=choosers)

    if trace_label:
        # (table, trace file suffix, extra trace_df keyword arguments), dumped in this order
        trace_plan = (
            (choosers, 'choosers', {}),
            (alt_utilities, 'utilities', {'column_labels': ['alternative', 'utility']}),
            (alt_probs, 'probs', {'column_labels': ['alternative', 'probability']}),
            (choices, 'choices', {'columns': [None, trace_choice_name]}),
            (design, 'model_design', {'column_labels': ['expression', None]}),
        )
        for table, suffix, options in trace_plan:
            tracing.trace_df(table, '%s.%s' % (trace_label, suffix), **options)

    return choices
def _simple_simulate(choosers, spec, nest_spec, skims=None, locals_d=None,
                     trace_label=None, trace_choice_name=None):
    """
    Dispatch to an MNL or NL simulation for a model whose spec has no
    alternative-specific data (no interaction with alternative properties
    and no sampling of alternatives).

    Parameters
    ----------
    choosers : pandas.DataFrame
    spec : pandas.DataFrame
        A table of variable specifications and coefficient values.
        Variable expressions should be in the table index and the table
        should have a column for each alternative.
    nest_spec:
        for nested logit (nl): dictionary specifying nesting structure and nesting coefficients
        for multinomial logit (mnl): None
    skims : Skims object
        The skims object is used to contain multiple matrices of
        origin-destination impedances.  Make sure to also add it to the
        locals_d below in order to access it in expressions.  When truthy it
        is handed, together with choosers, to add_skims before simulation.
        See the skims module for more documentation on how the skims object
        is intended to be used.
    locals_d : Dict
        This is a dictionary of local variables that will be the environment
        for an evaluation of an expression that begins with @
    trace_label: str
        This is the label to be used for trace log file entries and dump file names
        when household tracing enabled. No tracing occurs if label is empty or None.
    trace_choice_name: str
        This is the column label to be used in trace file csv dump of choices

    Returns
    -------
    choices : pandas.Series
        Index will be that of `choosers`, values will match the columns
        of `spec`.
    """
    if skims:
        add_skims(choosers, skims)

    trace_label = tracing.extend_trace_label(trace_label, 'simple_simulate')

    # a nest spec selects nested logit; its absence selects multinomial logit
    if nest_spec is not None:
        return eval_nl(choosers, spec, nest_spec, locals_d,
                       trace_label=trace_label, trace_choice_name=trace_choice_name)

    return eval_mnl(choosers, spec, locals_d,
                    trace_label=trace_label, trace_choice_name=trace_choice_name)
def eval_nl(choosers, spec, nest_spec, locals_d, trace_label=None, trace_choice_name=None):
    """
    Simulate nested-logit choices for a model whose spec has no
    alternative-specific data (no interaction with alternative properties
    and no sampling of alternatives), reporting elapsed time after each step.

    Parameters
    ----------
    choosers : pandas.DataFrame
    spec : pandas.DataFrame
        A table of variable specifications and coefficient values.
        Variable expressions should be in the table index and the table
        should have a column for each alternative.
    nest_spec:
        dictionary specifying nesting structure and nesting coefficients
        (from the model spec yaml file)
    locals_d : Dict or None
        This is a dictionary of local variables that will be the environment
        for an evaluation of an expression that begins with @
    trace_label: str
        This is the label to be used for trace log file entries and dump file names
        when household tracing enabled. No tracing occurs if label is empty or None.
    trace_choice_name: str
        This is the column label to be used in trace file csv dump of choices

    Returns
    -------
    choices : pandas.Series
        Index will be that of `choosers`, values will match the columns
        of `spec`.
    """

    trace_label = tracing.extend_trace_label(trace_label, 'nl')
    variability_check_enabled = tracing.check_for_variability()

    tick = tracing.print_elapsed_time()

    # one column per spec expression (column names match spec index values)
    expr_values = eval_variables(spec.index, choosers, locals_d)
    tick = tracing.print_elapsed_time("eval_variables", tick, debug=True)

    if variability_check_enabled:
        _check_for_variability(expr_values, trace_label)
    tick = tracing.print_elapsed_time("_check_for_variability", tick, debug=True)

    # raw utilities of all the leaves
    leaf_utilities = compute_utilities(expr_values, spec)
    tick = tracing.print_elapsed_time("expression_values.dot", tick, debug=True)

    # exponentiated utilities of leaves and nests
    exp_utilities = compute_nested_exp_utilities(leaf_utilities, nest_spec)
    tick = tracing.print_elapsed_time("compute_nested_exp_utilities", tick, debug=True)

    # probabilities of alternatives relative to siblings sharing the same nest
    sibling_probs = compute_nested_probabilities(exp_utilities, nest_spec,
                                                 trace_label=trace_label)
    tick = tracing.print_elapsed_time("compute_nested_probabilities", tick, debug=True)

    # global (flattened) leaf probabilities based on relative nest coefficients
    leaf_probs = compute_base_probabilities(sibling_probs, nest_spec)
    tick = tracing.print_elapsed_time("compute_base_probabilities", tick, debug=True)

    # leaf probabilities could all be zero since all-zero nest probabilities are allowed;
    # check here to print a clear message (make_choices raises if probs don't sum to 1)
    BAD_PROB_THRESHOLD = 0.001
    ones = np.ones(len(leaf_probs.index))
    row_sums = leaf_probs.sum(axis=1)
    no_choices = row_sums.sub(ones).abs() > BAD_PROB_THRESHOLD * ones

    if no_choices.any():
        logit.report_bad_choices(no_choices, leaf_probs,
                                 tracing.extend_trace_label(trace_label, 'eval_nl'),
                                 tag='bad_probs',
                                 msg="base_probabilities all zero")
    tick = tracing.print_elapsed_time("report_bad_choices", tick, debug=True)

    choices, rands = logit.make_choices(leaf_probs, trace_label, trace_choosers=choosers)
    tick = tracing.print_elapsed_time("logit.make_choices", tick, debug=True)

    if trace_label:

        def dump(table, suffix, **options):
            # write one table to the trace output under "<trace_label>.<suffix>"
            tracing.trace_df(table, '%s.%s' % (trace_label, suffix), **options)

        dump(choosers, 'choosers')
        dump(leaf_utilities, 'raw_utilities', column_labels=['alternative', 'utility'])
        dump(exp_utilities, 'nested_exp_utilities', column_labels=['alternative', 'utility'])
        dump(sibling_probs, 'nested_probabilities',
             column_labels=['alternative', 'probability'])
        dump(leaf_probs, 'base_probabilities', column_labels=['alternative', 'probability'])
        dump(choices, 'choices', columns=[None, trace_choice_name])
        dump(rands, 'rands', columns=[None, 'rand'])
        dump(expr_values, 'expression_values', column_labels=['expression', None])

    return choices
def interaction_sample(
        choosers, alternatives, spec, sample_size,
        alt_col_name=None,
        skims=None, locals_d=None, chunk_size=0,
        trace_label=None):
    """
    Run a simulation in the situation in which alternatives must
    be merged with choosers because there are interaction terms or
    because alternatives are being sampled.

    Choosers are processed in chunks whose row count is derived from chunk_size
    by asim_utils.num_chunk_rows_for_chunk_size.

    Parameters
    ----------
    choosers : pandas.DataFrame
        DataFrame of choosers
    alternatives : pandas.DataFrame
        DataFrame of alternatives - will be merged with choosers and sampled
    spec : pandas.DataFrame
        A Pandas DataFrame that gives the specification of the variables to
        compute and the coefficients for each variable.
        Variable specifications must be in the table index and the
        table should have only one column of coefficients.
    sample_size : int
        Number of alternatives to sample for each chooser. Must be greater
        than zero; it is clipped to the number of alternatives.
    alt_col_name: str or None
        name to give the sampled_alternative column
    skims : Skims object
        The skims object is used to contain multiple matrices of
        origin-destination impedances.  Make sure to also add it to the
        locals_d below in order to access it in expressions.  The *only* job
        of this method in regards to skims is to call set_df with the
        dataframe that comes back from interacting choosers with
        alternatives.  See the skims module for more documentation on how
        the skims object is intended to be used.
    locals_d : Dict
        This is a dictionary of local variables that will be the environment
        for an evaluation of an expression that begins with @
    chunk_size : int
        used to determine how many chooser rows are processed per chunk
    trace_label: str
        This is the label to be used for trace log file entries and dump file names
        when household tracing enabled. No tracing occurs if label is empty or None.

    Returns
    -------
    choices :
        the result of _interaction_sample for each chunk of choosers, concatenated
        when there is more than one chunk (documented by _interaction_sample as a
        DataFrame indexed like choosers with columns alt_col_name, prob, rand,
        pick_count)

    Raises
    ------
    RuntimeError
        if chunking produced no chunks at all, so there is nothing to return
    """

    assert sample_size > 0
    sample_size = min(sample_size, len(alternatives.index))

    rows_per_chunk = asim_utils.num_chunk_rows_for_chunk_size(
        chunk_size, choosers, alternatives)

    # label corrected: this message previously claimed to come from interaction_simulate
    print("interaction_sample chunk_size %s num_choosers %s"
          % (chunk_size, len(choosers.index)))

    result_list = []
    for i, num_chunks, chooser_chunk in asim_utils.chunked_choosers(
            choosers, rows_per_chunk):

        print("Running chunk %s of %s size %d" % (i, num_chunks, len(chooser_chunk)))

        chunk_choices = _interaction_sample(
            chooser_chunk, alternatives, spec, sample_size, alt_col_name,
            skims, locals_d,
            tracing.extend_trace_label(trace_label, 'chunk_%s' % i))

        result_list.append(chunk_choices)

    # fail with a clear message rather than an unbound-variable error further down
    if not result_list:
        raise RuntimeError("interaction_sample: no chooser chunks were processed")

    if len(result_list) > 1:
        # FIXME: this will require 2X RAM
        # if necessary, could append to hdf5 store on disk:
        # http://pandas.pydata.org/pandas-docs/stable/io.html#id2
        choices = pd.concat(result_list)
    else:
        # single chunk: return it as-is and avoid the copy made by concat
        choices = result_list[0]

    # every chooser must appear in the result
    assert len(choosers.index) == len(np.unique(choices.index.values))

    return choices
def _interaction_sample(choosers, alternatives, spec, sample_size,
                        alt_col_name, skims=None, locals_d=None,
                        trace_label=None):
    """
    Run a MNL simulation in the situation in which alternatives must
    be merged with choosers because there are interaction terms or
    because alternatives are being sampled.

    Parameters are same as for public function interaction_sample

    spec : dataframe
        one row per spec expression and one col with utility coefficient

    interaction_df : dataframe
        cross join (cartesian product) of choosers with alternatives
        combines columns of choosers and alternatives
        len(df) == len(choosers) * len(alternatives)
        index values (non-unique) are index values from alternatives df

    interaction_utilities : dataframe
        the utility of each alternative is sum of the partial utilities
        determined by the various spec expressions and their corresponding
        coefficients yielding a dataframe with len(interaction_df) rows and
        one utility column having the same index as interaction_df
        (non-unique values from alternatives df)

    utilities : dataframe
        dot product of model_design.dot(spec)
        yields utility value for element in the cross product of choosers
        and alternatives this is then reshaped as a dataframe with one row
        per chooser and one column per alternative

    probs : dataframe
        utilities exponentiated and converted to probabilities
        same shape as utilities, one row per chooser and one column per
        alternative

    positions : series
        choices among alternatives with the chosen alternative represented
        as the integer index of the selected alternative column in probs

    choices : series
        series with the alternative chosen for each chooser
        the index is same as choosers
        and the series value is the alternative df index of chosen alternative

    Returns
    -------
    choices_df : pandas.DataFrame
        A DataFrame where index should match the index of the choosers
        DataFrame and columns alt_col_name, prob, rand, pick_count

        prob: float
            the probability of the chosen alternative
        rand: float
            the rand that did the choosing
        pick_count : int
            number of duplicate picks for chooser, alt

    Raises
    ------
    RuntimeError
        if spec has more than one column
    """

    trace_label = tracing.extend_trace_label(trace_label, 'interaction_simulate')
    have_trace_targets = trace_label and tracing.has_trace_targets(choosers)

    if alt_col_name is None:
        alt_col_name = 'alt_%s' % alternatives.index.name

    if have_trace_targets:
        tracing.trace_df(choosers,
                         tracing.extend_trace_label(trace_label, 'choosers'))
        tracing.trace_df(alternatives,
                         tracing.extend_trace_label(trace_label, 'alternatives'),
                         slicer='NONE', transpose=False)

    if len(spec.columns) > 1:
        raise RuntimeError('spec must have only one column')

    alternative_count = len(alternatives)
    # print("_interaction_sample alternative_count %s" % alternative_count)

    # if using skims, copy index into the dataframe, so it will be
    # available as the "destination" for the skims dereference below
    # (note: this adds a column to the caller's alternatives dataframe)
    if skims:
        alternatives[alternatives.index.name] = alternatives.index

    # cross join choosers and alternatives (cartesian product)
    # for every chooser, there will be a row for each alternative
    # index values (non-unique) are from alternatives df
    interaction_df = logit.interaction_dataset(choosers, alternatives,
                                               alternative_count)

    # compare by multiplication rather than division: exact under both integer
    # and true division, and no ZeroDivisionError when there are no choosers
    assert len(interaction_df.index) == alternative_count * len(choosers.index)

    if skims:
        asim_utils.add_skims(interaction_df, skims)

    # evaluate expressions from the spec multiply by coefficients and sum
    # spec is df with one row per spec expression and one col
    # with utility coefficient column names of interaction_df match spec
    # index values utilities has utility value for element in the
    # cross product of choosers and alternatives interaction_utilities is
    # a df with one utility column and one row per row in interaction_df
    if have_trace_targets:
        trace_rows, trace_ids \
            = tracing.interaction_trace_rows(
                interaction_df, choosers, alternative_count)

        tracing.trace_df(interaction_df[trace_rows],
                         tracing.extend_trace_label(trace_label, 'interaction_df'),
                         slicer='NONE', transpose=False)
    else:
        trace_rows = trace_ids = None

    interaction_utilities, trace_eval_results \
        = eval_interaction_utilities(
            spec, interaction_df, locals_d, trace_label, trace_rows)

    if have_trace_targets:
        tracing.trace_interaction_eval_results(
            trace_eval_results, trace_ids,
            tracing.extend_trace_label(trace_label, 'eval'))

        tracing.trace_df(interaction_utilities[trace_rows],
                         tracing.extend_trace_label(trace_label,
                                                    'interaction_utilities'),
                         slicer='NONE', transpose=False)

    tracing.dump_df(DUMP, interaction_utilities, trace_label,
                    'interaction_utilities')

    # FIXME - do this in numpy, not pandas?
    # reshape utilities (one utility column and one row per
    # row in interaction_utilities) to a dataframe with one
    # row per chooser and one column per alternative
    # (.values replaces DataFrame.as_matrix(), which newer pandas removed)
    utilities = pd.DataFrame(
        interaction_utilities.values.reshape(len(choosers), alternative_count),
        index=choosers.index)

    if have_trace_targets:
        tracing.trace_df(utilities,
                         tracing.extend_trace_label(trace_label, 'utilities'),
                         column_labels=['alternative', 'utility'])

    tracing.dump_df(DUMP, utilities, trace_label, 'utilities')

    # FIXME - do this in numpy, not pandas?
    # convert to probabilities (utilities exponentiated
    # and normalized to probs) probs is same shape as utilities,
    # one row per chooser and one column for alternative
    probs = logit.utils_to_probs(utilities, trace_label=trace_label,
                                 trace_choosers=choosers)

    if have_trace_targets:
        tracing.trace_df(probs,
                         tracing.extend_trace_label(trace_label, 'probs'),
                         column_labels=['alternative', 'probability'])

    choices_df = make_sample_choices(
        choosers, probs, interaction_utilities,
        sample_size, alternative_count, alt_col_name, trace_label)

    # make_sample_choices should return choosers index as choices_df column
    assert choosers.index.name in choices_df.columns

    # pick_count and pick_dup
    # pick_count is number of duplicate picks
    # pick_dup flag is True for all but first of duplicates
    pick_group = choices_df.groupby([choosers.index.name, alt_col_name])

    # number each item in each group from 0 to the length of that group - 1.
    choices_df['pick_count'] = pick_group.cumcount(ascending=True)
    # flag duplicate rows after first
    choices_df['pick_dup'] = choices_df['pick_count'] > 0
    # add reverse cumcount to get total pick_count
    # (conveniently faster than groupby.count + merge)
    # forward position i plus reverse position (n - 1 - i) plus 1 == n
    choices_df['pick_count'] += pick_group.cumcount(ascending=False) + 1

    # drop the duplicates
    choices_df = choices_df[~choices_df['pick_dup']]
    del choices_df['pick_dup']

    # set index after groupby so we can trace on it
    choices_df.set_index(choosers.index.name, inplace=True)

    tracing.dump_df(DUMP, choices_df, trace_label, 'choices_df')

    if have_trace_targets:
        tracing.trace_df(choices_df,
                         tracing.extend_trace_label(trace_label,
                                                    'sampled_alternatives'),
                         transpose=False,
                         column_labels=['sample_alt', 'alternative'])

    return choices_df
def make_sample_choices(choosers, probs, interaction_utilities,
                        sample_size, alternative_count, alt_col_name,
                        trace_label):
    """
    Draw sample_size alternatives for each chooser according to probs.

    Parameters
    ----------
    choosers : pandas.DataFrame
        choosers being sampled for; its length, index values and index name
        are used here, and it is passed on for bad-probability reporting
    probs : pandas DataFrame
        one row per chooser and one column per alternative
    interaction_utilities : pandas.DataFrame
        dataframe with len(choosers) * alternative_count rows and
        one utility column; its index supplies the alternative value
        recorded for each pick
    sample_size : int
        number of samples/choices to make
    alternative_count : int
        number of alternatives (columns of probs)
    alt_col_name : str
        name of the column that receives the chosen alternative
    trace_label : str
        label extended with 'bad_probs' when reporting bad probabilities

    Returns
    -------
    choices_df : pandas.DataFrame
        sample_size rows per chooser (duplicates not removed) with columns
        alt_col_name (chosen alternative), rand (the random number that made
        the pick), prob (probability of the chosen alternative) and
        choosers.index.name (the chooser's index value)
    """

    assert isinstance(probs, pd.DataFrame)
    assert probs.shape == (len(choosers), alternative_count)

    assert isinstance(interaction_utilities, pd.DataFrame)
    assert interaction_utilities.shape == (len(choosers) * alternative_count, 1)

    t0 = tracing.print_elapsed_time()

    # probs should sum to 1 across each row
    BAD_PROB_THRESHOLD = 0.001
    bad_probs = (probs.sum(axis=1) - 1.0).abs() > BAD_PROB_THRESHOLD

    if bad_probs.any():
        logit.report_bad_choices(
            bad_probs, probs,
            tracing.extend_trace_label(trace_label, 'bad_probs'),
            msg="probabilities do not add up to 1",
            trace_choosers=choosers)

    t0 = tracing.print_elapsed_time("make_choices bad_probs", t0, debug=True)

    # (.values replaces DataFrame.as_matrix(), which newer pandas removed)
    probs_arr = probs.values
    cum_probs_arr = probs_arr.cumsum(axis=1)
    t0 = tracing.print_elapsed_time("make_choices cum_probs_arr", t0, debug=True)

    # alt probs in convenient layout to return prob of chose alternative
    # (same layout as cum_probs_arr and interaction_utilities)
    alt_probs_array = probs_arr.flatten()

    # get sample_size rands for each chooser
    # transform as we iterate over alternatives
    # reshape so rands[i] is in broadcastable (2-D) shape for cum_probs_arr
    # i.e rands[i] is a 2-D array of one alt choice rand for each chooser
    rands = asim_utils.get_rn_generator().random_for_df(probs, n=sample_size)
    rands = rands.T.reshape(sample_size, -1, 1)
    t0 = tracing.print_elapsed_time("make_choices random_for_df", t0, debug=True)

    # the alternative value chosen
    choices_array = np.empty([sample_size, len(choosers)], dtype=int)

    # the probability of the chosen alternative
    choice_probs_array = np.empty([sample_size, len(choosers)])

    # offsets is the offset into interaction_utilities of the first row
    # of each chooser's tranche of alternatives (created by the cross join
    # of alternatives and choosers); it is the same for every sample
    offsets = np.arange(len(choosers)) * alternative_count

    # FIXME - do this all at once rather than iterate?
    for i in range(sample_size):

        # rands for this sample in broadcastable shape
        r = rands[i]

        # position of first occurrence of positive value, i.e. the column
        # index in probs of the chosen alternative for each chooser
        # NOTE(review): argmax yields 0 when no cumulative prob exceeds r
        # (e.g. a row summing to slightly less than r through rounding) -
        # confirm that picking the first alternative is acceptable then
        positions = np.argmax(cum_probs_arr > r, axis=1)

        # convert the per-chooser column offset into a row number in the
        # flattened chooser x alternative layout
        rows = positions + offsets

        # one element per chooser, in same order as choosers
        choices_array[i] = interaction_utilities.index.take(rows)
        choice_probs_array[i] = np.take(alt_probs_array, rows)

    # explode to one row per chooser.index, alt_TAZ
    # order='F' makes the sample index vary fastest, so each chooser's
    # sample_size picks are contiguous and line up with np.repeat below
    choices_df = pd.DataFrame({
        alt_col_name: choices_array.flatten(order='F'),
        'rand': rands.flatten(order='F'),
        'prob': choice_probs_array.flatten(order='F'),
        choosers.index.name: np.repeat(np.asanyarray(choosers.index), sample_size)
    })

    return choices_df