def eval_nl_logsums(choosers, spec, nest_spec, locals_d, trace_label=None):
    """
    Nested-logit counterpart of eval_nl that returns logsums rather than
    making choices.

    Returns
    -------
    logsums : pandas.Series
        Indexed like `choosers`; each value is the nest logsum derived
        from the spec column values.
    """

    trace_label = tracing.extend_trace_label(trace_label, 'nl_logsums')
    variability_check = tracing.check_for_variability()

    timer = tracing.print_elapsed_time()

    # one column per spec expression (column names match spec index values)
    expr_values = eval_variables(spec.index, choosers, locals_d)
    timer = tracing.print_elapsed_time("eval_variables", timer, debug=True)

    if variability_check:
        _check_for_variability(expr_values, trace_label)
    timer = tracing.print_elapsed_time("_check_for_variability", timer, debug=True)

    # raw utilities of all the leaves
    leaf_utilities = compute_utilities(expr_values, spec)
    timer = tracing.print_elapsed_time("expression_values.dot", timer, debug=True)

    # exponentiated utilities of leaves and nests
    nested_exp_utilities = compute_nested_exp_utilities(leaf_utilities, nest_spec)
    timer = tracing.print_elapsed_time("compute_nested_exp_utilities", timer, debug=True)

    # the logsum is the log of nested_exp_utilities.root, re-indexed like choosers
    logsums = pd.Series(np.log(nested_exp_utilities.root), index=choosers.index)
    tracing.print_elapsed_time("logsums", timer, debug=True)

    if not trace_label:
        return logsums

    # add logsum to nested_exp_utilities for tracing
    nested_exp_utilities['logsum'] = logsums

    tracing.trace_df(choosers, '%s.choosers' % trace_label)

    labelled_traces = (
        (leaf_utilities, '%s.raw_utilities', 'utility'),
        (nested_exp_utilities, '%s.nested_exp_utilities', 'utility'),
        (logsums, '%s.logsums', 'logsum'),
    )
    for traced, name_format, value_label in labelled_traces:
        tracing.trace_df(traced, name_format % trace_label,
                         column_labels=['alternative', value_label])

    return logsums
def eval_mnl_logsums(choosers, spec, locals_d, trace_label=None):
    """
    Like eval_mnl, except return logsums instead of making choices.

    Parameters
    ----------
    choosers : pandas.DataFrame
    spec : pandas.DataFrame
        A table of variable specifications and coefficient values.
        Variable expressions should be in the table index and the table
        should have a column for each alternative.
    locals_d : Dict or None
        Passed through to eval_variables as the environment for
        expressions in spec.
    trace_label : str
        Label used for trace log file entries and dump file names.

    Returns
    -------
    logsums : pandas.Series
        Index will be that of `choosers`, values will be
        logsum across spec column values
    """

    trace_label = tracing.extend_trace_label(trace_label, 'mnl')
    check_for_variability = tracing.check_for_variability()

    # use the module logger rather than writing to stdout
    logger.debug("running eval_mnl_logsums")

    expression_values = eval_variables(spec.index, choosers, locals_d)

    if check_for_variability:
        _check_for_variability(expression_values, trace_label)

    # utility values
    utilities = compute_utilities(expression_values, spec)

    # logsum is log of exponentiated utilities summed across
    # columns of each chooser row
    # (.values instead of as_matrix(), which was removed in pandas 1.0)
    utils_arr = utilities.values.astype('float')
    logsums = np.log(np.exp(utils_arr).sum(axis=1))
    logsums = pd.Series(logsums, index=choosers.index)

    if trace_label:
        # add logsum to utilities for tracing
        utilities['logsum'] = logsums

        tracing.trace_df(choosers, '%s.choosers' % trace_label)
        tracing.trace_df(utilities, '%s.utilities' % trace_label,
                         column_labels=['alternative', 'utility'])
        tracing.trace_df(logsums, '%s.logsums' % trace_label,
                         column_labels=['alternative', 'logsum'])
        tracing.trace_df(expression_values, '%s.expression_values' % trace_label,
                         column_labels=['expression', None])

    return logsums
def eval_interaction_utilities(spec, df, locals_d, trace_label, trace_rows):
    """
    Compute the utilities for a single-alternative spec evaluated in the context of df

    We could compute the utilities for interaction datasets just as we do for
    simple_simulate specs with multiple alternative columns by calling
    eval_variables and then computing the utilities by matrix-multiplication of
    eval results with the utility coefficients in the spec alternative columns.

    But interaction simulate computes the utilities of each alternative in the
    context of a separate row in interaction dataset df, and so there is only
    one alternative in spec. This turns out to be quite a bit faster (in this
    special case) than the pandas dot function.

    For efficiency, we combine eval_variables and multiplication of
    coefficients into a single step, so we don't have to create a separate
    column for each partial utility. Instead, we simply multiply the eval
    result by a single alternative coefficient and sum the partial utilities.

    Parameters
    ----------
    spec : dataframe
        one row per spec expression and one col with utility coefficient
    df : dataframe
        cross join (cartesian product) of choosers with alternatives
        combines columns of choosers and alternatives
        len(df) == len(choosers) * len(alternatives)
        index values (non-unique) are index values from alternatives df
    locals_d : Dict or None
        Environment for expressions that begin with @. It is copied, never
        modified, and this function's own arguments (spec, df, ...) are
        added to the copy.
    trace_label : str
        Prefix used in the log messages emitted here.
    trace_rows : numpy.ndarray or None
        Selector applied to both `df` and the per-expression results to pick
        the rows to trace (presumably a boolean mask over the rows of df -
        confirm against callers). No trace frame is built when it is None
        or has no truthy element.

    Returns
    -------
    utilities : pandas.DataFrame
        Will have the index of `df` and a single column of utilities, each
        being the sum of the partial utilities determined by the spec
        expressions and their corresponding coefficients
    trace_eval_results : pandas.DataFrame or None
        When tracing, the traced rows of `df` followed by one column per
        expression value, one per partial utility, and a 'total utility'
        column; otherwise None.
    """

    assert len(spec.columns) == 1

    # avoid altering caller's passed-in locals_d parameter (they may be looping)
    locals_d = locals_d.copy() if locals_d is not None else {}
    # expose this function's arguments to '@' expressions; this has to run
    # before any further local name is bound so only the arguments are added
    locals_d.update(locals())

    def to_series(x):
        # broadcast a scalar expression result over the rows of df
        if np.isscalar(x):
            return pd.Series([x] * len(df), index=df.index)
        return x

    if trace_rows is not None and trace_rows.any():
        # trace_rows is used to slice ndarrays as well as series
        assert isinstance(trace_rows, np.ndarray)
        trace_eval_results = []
    else:
        trace_eval_results = None

    check_for_variability = tracing.check_for_variability()

    # need to be able to identify which variables causes an error, which keeps
    # this from being expressed more parsimoniously

    utilities = pd.DataFrame({'utility': 0.0}, index=df.index)
    no_variability = has_missing_vals = 0

    for expr, coefficient in zip(spec.index, spec.iloc[:, 0]):
        try:
            if expr.startswith('@'):
                # NOTE: eval() executes arbitrary python taken from the spec;
                # spec files must come from a trusted source
                v = to_series(eval(expr[1:], globals(), locals_d))
            else:
                v = df.eval(expr)

            if check_for_variability and v.std() == 0:
                logger.info("%s: no variability (%s) in: %s" % (trace_label, v.iloc[0], expr))
                no_variability += 1

            # FIXME - how likely is this to happen? Not sure it is really a problem?
            if check_for_variability and np.count_nonzero(v.isnull().values) > 0:
                logger.info("%s: missing values in: %s" % (trace_label, expr))
                has_missing_vals += 1

            utilities.utility += (v * coefficient).astype('float')

            if trace_eval_results is not None:
                trace_eval_results.append((expr, v[trace_rows]))
                trace_eval_results.append(
                    ('partial utility (coefficient = %s)' % coefficient,
                     v[trace_rows] * coefficient))

        except Exception:
            logger.exception("Variable evaluation failed for: %s" % str(expr))
            # bare raise re-raises the active exception with its traceback intact
            raise

    if no_variability > 0:
        logger.warning("%s: %s columns have no variability" % (trace_label, no_variability))

    if has_missing_vals > 0:
        logger.warning("%s: %s columns have missing values" % (trace_label, has_missing_vals))

    if trace_eval_results is not None:

        trace_eval_results.append(('total utility', utilities.utility[trace_rows]))

        # DataFrame.from_items was removed in pandas 1.0. Key the columns by
        # position and assign the labels afterwards, so that repeated labels
        # (e.g. two expressions sharing a coefficient) are all kept.
        labels, columns = zip(*trace_eval_results)
        trace_eval_results = pd.DataFrame(dict(enumerate(columns)))
        trace_eval_results.columns = list(labels)
        trace_eval_results.index = df[trace_rows].index

        # add df columns to trace_results
        trace_eval_results = pd.concat([df[trace_rows], trace_eval_results], axis=1)

    return utilities, trace_eval_results
def eval_nl(choosers, spec, nest_spec, locals_d=None, trace_label=None, trace_choice_name=None):
    """
    Simulate nested-logit choices for a model spec that has no
    alternative-specific data, i.e. no interactions with alternative
    properties and no need to sample from alternatives.

    Parameters
    ----------
    choosers : pandas.DataFrame
    spec : pandas.DataFrame
        Table of variable specifications and coefficient values, with the
        variable expressions in the index and one column per alternative.
    nest_spec:
        dictionary specifying nesting structure and nesting coefficients
        (from the model spec yaml file)
    locals_d : Dict
        Dictionary of local variables forming the environment in which an
        expression that begins with @ is evaluated.
    trace_label: str
        Label used for trace log file entries and dump file names
        when household tracing enabled. No tracing occurs if label is empty or None.
    trace_choice_name: str
        Column label used in the trace file csv dump of choices.

    Returns
    -------
    choices : pandas.Series
        Indexed like `choosers`; values match the columns of `spec`.
    """
    # NOTE(review): another `def eval_nl` appears further down in this source;
    # if both live in the same module the later one shadows this one - confirm.

    trace_label = tracing.extend_trace_label(trace_label, 'nl')
    variability_check = tracing.check_for_variability()

    # column names of model_design match spec index values
    model_design = eval_variables(spec.index, choosers, locals_d)

    if variability_check:
        _check_for_variability(model_design, trace_label)

    # raw utilities of all the leaves:
    # the matrix product of the spec expression evals with the alternatives'
    # utility coefficients sums the partial utilities (one per spec row),
    # giving one row per chooser and one column per alternative.
    # pandas dot relies on model_design column names matching spec index values
    raw_utilities = model_design.dot(spec)

    # exponentiated utilities of leaves and nests
    nested_exp_utilities = compute_nested_exp_utilities(raw_utilities, nest_spec)

    # probabilities of alternatives relative to siblings sharing the same nest
    nested_probabilities = compute_nested_probabilities(
        nested_exp_utilities, nest_spec, trace_label=trace_label)

    # global (flattened) leaf probabilities based on relative nest coefficients
    base_probabilities = compute_base_probabilities(nested_probabilities, nest_spec)

    # base_probabilities could all be zero since all probs for nests were
    # allowed to be zero; report it clearly here, although make_choices will
    # raise an error anyway if probs don't sum to 1
    BAD_PROB_THRESHOLD = 0.001
    row_count = len(base_probabilities.index)
    deviation_from_one = base_probabilities.sum(axis=1).sub(np.ones(row_count)).abs()
    no_choices = deviation_from_one > BAD_PROB_THRESHOLD * np.ones(row_count)

    if no_choices.any():
        report_bad_choices(no_choices, base_probabilities,
                           tracing.extend_trace_label(trace_label, 'eval_nl'),
                           tag='bad_probs',
                           msg="base_probabilities all zero")

    choices = make_choices(base_probabilities, trace_label, trace_choosers=choosers)

    if not trace_label:
        return choices

    tracing.trace_df(choosers, '%s.choosers' % trace_label)

    labelled_traces = (
        (raw_utilities, '%s.raw_utilities', 'utility'),
        (nested_exp_utilities, '%s.nested_exp_utilities', 'utility'),
        (nested_probabilities, '%s.nested_probabilities', 'probability'),
        (base_probabilities, '%s.base_probabilities', 'probability'),
    )
    for traced, name_format, value_label in labelled_traces:
        tracing.trace_df(traced, name_format % trace_label,
                         column_labels=['alternative', value_label])

    tracing.trace_df(choices, '%s.choices' % trace_label,
                     columns=[None, trace_choice_name])
    tracing.trace_df(model_design, '%s.model_design' % trace_label,
                     column_labels=['expression', None])

    # for software development debugging, whole frames were previously dumped
    # by calling tracing.trace_df(..., slicer='NONE', transpose=False)

    return choices
def eval_mnl(choosers, spec, locals_d=None, trace_label=None, trace_choice_name=None):
    """
    Simulate choices for a model spec that has no alternative-specific data,
    i.e. no interactions with alternative properties and no need to sample
    from alternatives.

    Every spec row contributes a partial utility to each alternative: it
    pairs a spec expression (often a boolean 0-1 trigger) with a column of
    utility coefficients per alternative.

    The utility of each alternative is the matrix product of the expression
    results with the coefficients in the spec alternative columns, giving
    one row per chooser and one column per alternative.

    Parameters
    ----------
    choosers : pandas.DataFrame
    spec : pandas.DataFrame
        Table of variable specifications and coefficient values, with the
        variable expressions in the index and one column per alternative.
    locals_d : Dict
        Dictionary of local variables forming the environment in which an
        expression that begins with @ is evaluated.
    trace_label: str
        Label used for trace log file entries and dump file names
        when household tracing enabled. No tracing occurs if label is empty or None.
    trace_choice_name: str
        Column label used in the trace file csv dump of choices.

    Returns
    -------
    choices : pandas.Series
        Indexed like `choosers`; values match the columns of `spec`.
    """

    trace_label = tracing.extend_trace_label(trace_label, 'mnl')
    variability_check = tracing.check_for_variability()

    model_design = eval_variables(spec.index, choosers, locals_d)

    if variability_check:
        _check_for_variability(model_design, trace_label)

    # summing the partial utilities of each spec row via a matrix product;
    # pandas dot relies on model_design column names matching spec index values
    utilities = model_design.dot(spec)

    probs = utils_to_probs(utilities, trace_label=trace_label, trace_choosers=choosers)
    choices = make_choices(probs, trace_label=trace_label, trace_choosers=choosers)

    if not trace_label:
        return choices

    tracing.trace_df(choosers, '%s.choosers' % trace_label)

    labelled_traces = (
        (utilities, '%s.utilities', 'utility'),
        (probs, '%s.probs', 'probability'),
    )
    for traced, name_format, value_label in labelled_traces:
        tracing.trace_df(traced, name_format % trace_label,
                         column_labels=['alternative', value_label])

    tracing.trace_df(choices, '%s.choices' % trace_label,
                     columns=[None, trace_choice_name])
    tracing.trace_df(model_design, '%s.model_design' % trace_label,
                     column_labels=['expression', None])

    return choices
def eval_nl(choosers, spec, nest_spec, locals_d, trace_label=None, trace_choice_name=None):
    """
    Simulate nested-logit choices for a model spec that has no
    alternative-specific data, i.e. no interactions with alternative
    properties and no need to sample from alternatives.

    Parameters
    ----------
    choosers : pandas.DataFrame
    spec : pandas.DataFrame
        Table of variable specifications and coefficient values, with the
        variable expressions in the index and one column per alternative.
    nest_spec:
        dictionary specifying nesting structure and nesting coefficients
        (from the model spec yaml file)
    locals_d : Dict or None
        Dictionary of local variables forming the environment in which an
        expression that begins with @ is evaluated.
    trace_label: str
        Label used for trace log file entries and dump file names
        when household tracing enabled. No tracing occurs if label is empty or None.
    trace_choice_name: str
        Column label used in the trace file csv dump of choices.

    Returns
    -------
    choices : pandas.Series
        Indexed like `choosers`; values match the columns of `spec`.
    """

    trace_label = tracing.extend_trace_label(trace_label, 'nl')
    variability_check = tracing.check_for_variability()

    timer = tracing.print_elapsed_time()

    # column names of expr_values match spec index values
    expr_values = eval_variables(spec.index, choosers, locals_d)
    timer = tracing.print_elapsed_time("eval_variables", timer, debug=True)

    if variability_check:
        _check_for_variability(expr_values, trace_label)
    timer = tracing.print_elapsed_time("_check_for_variability", timer, debug=True)

    # raw utilities of all the leaves
    raw_utilities = compute_utilities(expr_values, spec)
    timer = tracing.print_elapsed_time("expression_values.dot", timer, debug=True)

    # exponentiated utilities of leaves and nests
    nested_exp_utilities = compute_nested_exp_utilities(raw_utilities, nest_spec)
    timer = tracing.print_elapsed_time("compute_nested_exp_utilities", timer, debug=True)

    # probabilities of alternatives relative to siblings sharing the same nest
    nested_probabilities = compute_nested_probabilities(
        nested_exp_utilities, nest_spec, trace_label=trace_label)
    timer = tracing.print_elapsed_time("compute_nested_probabilities", timer, debug=True)

    # global (flattened) leaf probabilities based on relative nest coefficients
    base_probabilities = compute_base_probabilities(nested_probabilities, nest_spec)
    timer = tracing.print_elapsed_time("compute_base_probabilities", timer, debug=True)

    # base_probabilities could all be zero since all probs for nests were
    # allowed to be zero; report it clearly here, although make_choices will
    # raise an error anyway if probs don't sum to 1
    BAD_PROB_THRESHOLD = 0.001
    row_count = len(base_probabilities.index)
    deviation_from_one = base_probabilities.sum(axis=1).sub(np.ones(row_count)).abs()
    no_choices = deviation_from_one > BAD_PROB_THRESHOLD * np.ones(row_count)

    if no_choices.any():
        logit.report_bad_choices(
            no_choices, base_probabilities,
            tracing.extend_trace_label(trace_label, 'eval_nl'),
            tag='bad_probs',
            msg="base_probabilities all zero")
    timer = tracing.print_elapsed_time("report_bad_choices", timer, debug=True)

    choices, rands = logit.make_choices(base_probabilities, trace_label,
                                        trace_choosers=choosers)
    tracing.print_elapsed_time("logit.make_choices", timer, debug=True)

    if not trace_label:
        return choices

    tracing.trace_df(choosers, '%s.choosers' % trace_label)

    labelled_traces = (
        (raw_utilities, '%s.raw_utilities', 'utility'),
        (nested_exp_utilities, '%s.nested_exp_utilities', 'utility'),
        (nested_probabilities, '%s.nested_probabilities', 'probability'),
        (base_probabilities, '%s.base_probabilities', 'probability'),
    )
    for traced, name_format, value_label in labelled_traces:
        tracing.trace_df(traced, name_format % trace_label,
                         column_labels=['alternative', value_label])

    tracing.trace_df(choices, '%s.choices' % trace_label,
                     columns=[None, trace_choice_name])
    tracing.trace_df(rands, '%s.rands' % trace_label,
                     columns=[None, 'rand'])
    tracing.trace_df(expr_values, '%s.expression_values' % trace_label,
                     column_labels=['expression', None])

    return choices