Exemple #1
0
def interaction_simulate(choosers,
                         alternatives,
                         spec,
                         skims=None,
                         locals_d=None,
                         sample_size=None,
                         chunk_size=0,
                         trace_label=None,
                         trace_choice_name=None):
    """
    Run a simulation in the situation in which alternatives must
    be merged with choosers because there are interaction terms or
    because alternatives are being sampled.

    Optionally (if chunk_size > 0) iterates over choosers in chunk_size
    chunks, running _interaction_simulate on each chunk and concatenating
    the per-chunk results.

    Parameters
    ----------
    choosers : pandas.DataFrame
        DataFrame of choosers
    alternatives : pandas.DataFrame
        DataFrame of alternatives - will be merged with choosers
    spec : pandas.DataFrame
        A Pandas DataFrame that gives the specification of the variables to
        compute and the coefficients for each variable.
        Variable specifications must be in the table index and the
        table should have only one column of coefficients.
    skims : Skims object
        The skims object is used to contain multiple matrices of
        origin-destination impedances.  Make sure to also add it to the
        locals_d below in order to access it in expressions.  It is passed
        through unchanged to _interaction_simulate.
    locals_d : Dict
        This is a dictionary of local variables that will be the environment
        for an evaluation of an expression that begins with @
    sample_size : int, optional
        Sample alternatives with sample of given size.  By default is None,
        which does not sample alternatives.
    chunk_size : int
        if chunk_size > 0 iterates over choosers in chunk_size chunks.
        A value of 0, or a value at least as large as the number of
        choosers, processes all choosers in a single call.
    trace_label: str
        This is the label to be used for trace log file entries and dump
        file names when household tracing enabled. No tracing occurs if
        label is empty or None.  When chunking, each chunk's label is
        extended with 'chunk_<i>'.
    trace_choice_name: str
        This is the column label to be used in trace file csv dump of choices

    Returns
    -------
    ret : pandas.Series
        A series where index should match the index of the choosers DataFrame
        and values will match the index of the alternatives DataFrame -
        choices are simulated in the standard Monte Carlo fashion
    """

    # FIXME - chunk size should take number of chooser and alternative columns into account
    # FIXME - that is, chunk size should represent memory footprint (rows X columns) not just rows

    chunk_size = int(chunk_size)
    num_choosers = len(choosers.index)

    # no chunking requested, or a single chunk would cover every chooser
    if (chunk_size == 0) or (chunk_size >= num_choosers):
        return _interaction_simulate(choosers, alternatives, spec, skims,
                                     locals_d, sample_size, trace_label,
                                     trace_choice_name)

    logger.info("interaction_simulate chunk_size %s num_choosers %s",
                chunk_size, num_choosers)

    choices_list = []
    # simulate each chunk of choosers independently against the full set of alternatives
    for i, chooser_chunk in chunked_choosers(choosers, chunk_size):

        logger.info("Running chunk %s of size %d", i, len(chooser_chunk))

        chunk_choices = _interaction_simulate(
            chooser_chunk, alternatives, spec, skims, locals_d, sample_size,
            tracing.extend_trace_label(trace_label, 'chunk_%s' % i),
            trace_choice_name)

        choices_list.append(chunk_choices)

    # FIXME: this will require 2X RAM
    # if necessary, could append to hdf5 store on disk:
    # http://pandas.pydata.org/pandas-docs/stable/io.html#id2
    choices = pd.concat(choices_list)

    # sanity check: exactly one choice per chooser
    # (the comparison must be made on the two lengths, not inside len())
    assert len(choices.index) == len(choosers.index)

    return choices
Exemple #2
0
def compute_logsums(choosers, logsum_spec, logsum_settings, skim_dict,
                    skim_stack, alt_col_name, chunk_size, trace_hh_id,
                    trace_label):
    """
    Compute logsums for choosers by delegating to
    asim_simulate.simple_simulate_logsums with skim wrappers keyed on
    'TAZ' and alt_col_name.

    Parameters
    ----------
    choosers
        table of choosers (its index length is reported on stdout)
    logsum_spec
        spec handed to simple_simulate_logsums (traced when trace_hh_id is set)
    logsum_settings
        settings from which the nest spec and model constants are read
    skim_dict
        wrapped as the 'od_skims' expression local ('TAZ' -> alt_col_name)
    skim_stack
        wrapped twice: 'odt_skims' ('TAZ' -> alt_col_name, "out_period")
        and 'dot_skims' (alt_col_name -> 'TAZ', "in_period")
    alt_col_name
        name of the chooser column used as the other end of the skim lookups
    chunk_size
        passed through to simple_simulate_logsums
    trace_hh_id
        if truthy, the logsum spec is dumped via tracing.trace_df
    trace_label
        base trace label; extended with 'compute_logsums'

    Returns
    -------
    logsums: pandas series
        computed logsums with same index as choosers
    """

    trace_label = tracing.extend_trace_label(trace_label, 'compute_logsums')

    nest_spec = get_logit_model_settings(logsum_settings)
    constants = get_model_constants(logsum_settings)

    print("Running compute_logsums with %d choosers" % len(choosers.index))

    if trace_hh_id:
        spec_label = tracing.extend_trace_label(trace_label, 'spec')
        tracing.trace_df(logsum_spec, spec_label,
                         slicer='NONE', transpose=False)

    # skim wrappers: outbound (origin TAZ to alternative), inbound
    # (alternative back to TAZ), and a plain origin-destination wrapper
    outbound_skims = skim_stack.wrap(left_key='TAZ',
                                     right_key=alt_col_name,
                                     skim_key="out_period")
    inbound_skims = skim_stack.wrap(left_key=alt_col_name,
                                    right_key='TAZ',
                                    skim_key="in_period")
    od_skims = skim_dict.wrap('TAZ', alt_col_name)

    # names under which the wrappers are visible to spec expressions
    locals_d = dict(odt_skims=outbound_skims,
                    dot_skims=inbound_skims,
                    od_skims=od_skims)
    if constants is not None:
        locals_d.update(constants)

    return asim_simulate.simple_simulate_logsums(
        choosers,
        logsum_spec,
        nest_spec,
        skims=[outbound_skims, inbound_skims, od_skims],
        locals_d=locals_d,
        chunk_size=chunk_size,
        trace_label=trace_label)
Exemple #3
0
def eval_nl(choosers,
            spec,
            nest_spec,
            locals_d=None,
            trace_label=None,
            trace_choice_name=None):
    """
    Simulate nested-logit choices for a spec that has no alternative-specific
    data (no interactions with alternative properties, no sampling of
    alternatives).

    Parameters
    ----------
    choosers : pandas.DataFrame
    spec : pandas.DataFrame
        A table of variable specifications and coefficient values.
        Variable expressions should be in the table index and the table
        should have a column for each alternative.
    nest_spec:
        dictionary specifying nesting structure and nesting coefficients
        (from the model spec yaml file)
    locals_d : Dict
        This is a dictionary of local variables that will be the environment
        for an evaluation of an expression that begins with @
    trace_label: str
        Label used for trace log file entries and dump file names
        when household tracing is enabled.
    trace_choice_name: str
        Column label to be used in the trace file csv dump of choices

    Returns
    -------
    choices : pandas.Series
        Index will be that of `choosers`, values will match the columns
        of `spec`.
    """

    trace_label = tracing.extend_trace_label(trace_label, 'nl')
    check_for_variability = tracing.check_for_variability()

    # one column per spec expression (column names match spec index values)
    model_design = eval_variables(spec.index, choosers, locals_d)

    if check_for_variability:
        _check_for_variability(model_design, trace_label)

    # raw utilities of all the leaves:
    # the matrix product sums, per alternative, the partial utilities
    # contributed by each spec row, giving one row per chooser and one
    # column per alternative (relies on model_design columns matching
    # the spec index)
    raw_utilities = model_design.dot(spec)

    # exponentiated utilities of leaves and nests
    nested_exp_utilities = \
        compute_nested_exp_utilities(raw_utilities, nest_spec)

    # probabilities of alternatives relative to siblings sharing the same nest
    nested_probabilities = \
        compute_nested_probabilities(nested_exp_utilities, nest_spec,
                                     trace_label=trace_label)

    # global (flattened) leaf probabilities based on relative nest coefficients
    base_probabilities = \
        compute_base_probabilities(nested_probabilities, nest_spec)

    # base_probabilities could all be zero since all-zero probs are allowed
    # for nests; flag rows whose probabilities do not sum to (about) 1 so a
    # clear message is reported here (per the original note, make_choices
    # is expected to raise as well when probs don't sum to 1)
    BAD_PROB_THRESHOLD = 0.001
    row_sum_error = (base_probabilities.sum(axis=1) - 1.0).abs()
    no_choices = row_sum_error > BAD_PROB_THRESHOLD

    if no_choices.any():
        report_bad_choices(no_choices,
                           base_probabilities,
                           tracing.extend_trace_label(trace_label, 'eval_nl'),
                           tag='bad_probs',
                           msg="base_probabilities all zero")

    choices = make_choices(base_probabilities,
                           trace_label,
                           trace_choosers=choosers)

    if trace_label:
        tracing.trace_df(choosers, '%s.choosers' % trace_label)

        # (table, file suffix, label for the value column)
        per_alternative_tables = (
            (raw_utilities, 'raw_utilities', 'utility'),
            (nested_exp_utilities, 'nested_exp_utilities', 'utility'),
            (nested_probabilities, 'nested_probabilities', 'probability'),
            (base_probabilities, 'base_probabilities', 'probability'),
        )
        for table, suffix, value_label in per_alternative_tables:
            tracing.trace_df(table,
                             '%s.%s' % (trace_label, suffix),
                             column_labels=['alternative', value_label])

        tracing.trace_df(choices,
                         '%s.choices' % trace_label,
                         columns=[None, trace_choice_name])
        tracing.trace_df(model_design,
                         '%s.model_design' % trace_label,
                         column_labels=['expression', None])

        # for software development debugging, any of the tables above can be
        # dumped whole by calling tracing.trace_df(table, label,
        # slicer='NONE', transpose=False)

    return choices
Exemple #4
0
def _interaction_simulate(choosers,
                          alternatives,
                          spec,
                          skims=None,
                          locals_d=None,
                          sample_size=None,
                          trace_label=None,
                          trace_choice_name=None):
    """
    Run a MNL simulation in the situation in which alternatives must
    be merged with choosers because there are interaction terms or
    because alternatives are being sampled.

    Parameters are same as for public function interaction_simulate

    NOTE: when skims is truthy, this function adds a column to the caller's
    `alternatives` DataFrame (named after its index and holding the index
    values) - the frame is modified in place, not copied.

    Intermediate values computed along the way:

    spec : dataframe
        one row per spec expression and one col with utility coefficient

    interaction_df : dataframe
        cross join (cartesian product) of choosers with alternatives
        combines columns of choosers and alternatives
        len(df) == len(choosers) * len(alternatives)
        index values (non-unique) are index values from alternatives df

    interaction_utilities : dataframe
        the utility of each alternative is sum of the partial utilities determined by the
        various spec expressions and their corresponding coefficients
        yielding a dataframe  with len(interaction_df) rows and one utility column
        having the same index as interaction_df (non-unique values from alternatives df)

    utilities : dataframe
        interaction_utilities reshaped as a dataframe with one row per chooser
        and one column per (sampled) alternative

    probs : dataframe
        utilities exponentiated and converted to probabilities
        same shape as utilities, one row per chooser and one column for alternative

    positions : series
        choices among alternatives with the chosen alternative represented
        as the integer index of the selected alternative column in probs

    choices : series
        series with the alternative chosen for each chooser
        the index is same as choosers
        and the series value is the alternative df index of chosen alternative

    Returns
    -------
    ret : pandas.Series
        A series where index should match the index of the choosers DataFrame
        and values will match the index of the alternatives DataFrame -
        choices are simulated in the standard Monte Carlo fashion

    Raises
    ------
    RuntimeError
        if spec has more than one column
    """

    trace_label = tracing.extend_trace_label(trace_label,
                                             'interaction_simulate')
    have_trace_targets = trace_label and tracing.has_trace_targets(choosers)

    if have_trace_targets:
        tracing.trace_df(choosers,
                         tracing.extend_trace_label(trace_label, 'choosers'))
        tracing.trace_df(alternatives,
                         tracing.extend_trace_label(trace_label,
                                                    'alternatives'),
                         slicer='NONE',
                         transpose=False)

    if len(spec.columns) > 1:
        raise RuntimeError('spec must have only one column')

    # no (or zero) sample_size means use every alternative
    sample_size = sample_size or len(alternatives)

    if sample_size > len(alternatives):
        # Logger.warn is a deprecated alias; use Logger.warning
        logger.warning("clipping sample size %s to len(alternatives) %s" %
                       (sample_size, len(alternatives)))
        sample_size = len(alternatives)

    # if using skims, copy index into the dataframe, so it will be
    # available as the "destination" for the skims dereference below
    # (this writes a new column into the caller's alternatives frame)
    if skims:
        alternatives[alternatives.index.name] = alternatives.index

    # cross join choosers and alternatives (cartesian product)
    # for every chooser, there will be a row for each alternative
    # index values (non-unique) are from alternatives df
    interaction_df = interaction_dataset(choosers, alternatives, sample_size)

    if skims:
        add_skims(interaction_df, skims)

    # evaluate expressions from the spec multiply by coefficients and sum
    # spec is df with one row per spec expression and one col with utility coefficient
    # interaction_utilities is a df with one utility column and one row per
    # row of interaction_df
    if have_trace_targets:
        trace_rows, trace_ids = tracing.interaction_trace_rows(
            interaction_df, choosers)

        tracing.trace_df(interaction_df[trace_rows],
                         tracing.extend_trace_label(trace_label,
                                                    'interaction_df'),
                         slicer='NONE',
                         transpose=False)
    else:
        trace_rows = trace_ids = None

    interaction_utilities, trace_eval_results \
        = eval_interaction_utilities(spec, interaction_df, locals_d, trace_label, trace_rows)

    if have_trace_targets:
        tracing.trace_interaction_eval_results(
            trace_eval_results, trace_ids,
            tracing.extend_trace_label(trace_label, 'eval'))

        tracing.trace_df(interaction_utilities[trace_rows],
                         tracing.extend_trace_label(trace_label,
                                                    'interaction_utilities'),
                         slicer='NONE',
                         transpose=False)

    # reshape utilities (one utility column and one row per chooser-alternative pair)
    # to a dataframe with one row per chooser and one column per alternative
    # (.values replaces DataFrame.as_matrix(), which was removed in pandas 1.0)
    utilities = pd.DataFrame(
        interaction_utilities.values.reshape(len(choosers), sample_size),
        index=choosers.index)

    if have_trace_targets:
        tracing.trace_df(utilities,
                         tracing.extend_trace_label(trace_label, 'utilities'),
                         column_labels=['alternative', 'utility'])

    # convert to probabilities (utilities exponentiated and normalized to probs)
    # probs is same shape as utilities, one row per chooser and one column for alternative
    probs = utils_to_probs(utilities,
                           trace_label=trace_label,
                           trace_choosers=choosers)

    if have_trace_targets:
        tracing.trace_df(probs,
                         tracing.extend_trace_label(trace_label, 'probs'),
                         column_labels=['alternative', 'probability'])

    # make choices
    # positions is series with the chosen alternative represented as a column index in probs
    # which is an integer between zero and num alternatives in the alternative sample
    positions = make_choices(probs,
                             trace_label=trace_label,
                             trace_choosers=choosers)

    # need to get from an integer offset into the alternative sample to the alternative index
    # that is, we want the index value of the row that is offset by <position> rows into the
    # tranche of this choosers alternatives created by cross join of alternatives and choosers

    # offsets is the offset into interaction_utilities of first row of each chooser's alternatives
    offsets = np.arange(len(positions)) * sample_size
    # resulting pandas index has one element per chooser row and is in same order as choosers
    choices = interaction_utilities.index.take(positions + offsets)

    # create a series with index from choosers and the index of the chosen alternative
    choices = pd.Series(choices, index=choosers.index)

    if have_trace_targets:
        tracing.trace_df(choices,
                         tracing.extend_trace_label(trace_label, 'choices'),
                         columns=[None, trace_choice_name])

    return choices
Exemple #5
0
def utils_to_probs(utils, trace_label=None, exponentiated=False, allow_zero_probs=False,
                   trace_choosers=None):
    """
    Convert a table of utilities to probabilities.

    Parameters
    ----------
    utils : pandas.DataFrame
        Rows should be choosers and columns should be alternatives.
        The frame itself is not modified; a float copy is worked on.

    trace_label : str
        label for tracing bad utility or probability values

    exponentiated : bool
        True if utilities have already been exponentiated

    allow_zero_probs : bool
        if True, rows in which all utility alts are EXP_UTIL_MIN will result
        in rows in probs having all zero probability (and not sum to 1.0).
        This is for the benefit of calculating probabilities of nested logit nests

    trace_choosers : pandas.dataframe
        the choosers df (for interaction_simulate) to facilitate the reporting of hh_id
        by report_bad_choices because it can't deduce hh_id from the interaction_dataset
        which is indexed on index values from alternatives df

    Returns
    -------
    probs : pandas.DataFrame
        Will have the same index and columns as `utils`.

    """
    trace_label = tracing.extend_trace_label(trace_label, 'utils_to_probs')

    # .values replaces DataFrame.as_matrix(), which was removed in pandas 1.0;
    # astype returns a copy, so the in-place operations below never touch utils
    utils_arr = utils.values.astype('float')
    if not exponentiated:
        utils_arr = np.exp(utils_arr)

    EXP_UTIL_MIN = 1e-300
    EXP_UTIL_MAX = np.inf
    np.clip(utils_arr, EXP_UTIL_MIN, EXP_UTIL_MAX, out=utils_arr)

    # FIXME
    # values that were clipped up to (or equal) EXP_UTIL_MIN are treated as exactly zero
    utils_arr = np.where(utils_arr == EXP_UTIL_MIN, 0.0, utils_arr)

    arr_sum = utils_arr.sum(axis=1)

    # rows in which every alternative has zero exponentiated utility
    zero_probs = (arr_sum == 0.0)
    if zero_probs.any() and not allow_zero_probs:

        report_bad_choices(zero_probs, utils,
                           tracing.extend_trace_label(trace_label, 'zero_prob_utils'),
                           msg="all probabilities are zero",
                           trace_choosers=trace_choosers)

    # rows in which at least one exponentiated utility overflowed to infinity
    inf_utils = np.isinf(arr_sum)
    if inf_utils.any():
        report_bad_choices(inf_utils, utils,
                           tracing.extend_trace_label(trace_label, 'inf_exp_utils'),
                           msg="infinite exponentiated utilities",
                           trace_choosers=trace_choosers)

    # normalize each row by its sum
    # if allow_zero_probs, this may cause a RuntimeWarning: invalid value encountered in divide
    np.divide(utils_arr, arr_sum.reshape(len(utils_arr), 1), out=utils_arr)

    PROB_MIN = 0.0
    PROB_MAX = 1.0

    # if allow_zero_probs, this will cause EXP_UTIL_MIN util rows to have all zero probabilities
    # (0/0 above yields nan, which is mapped to zero probability here)
    utils_arr[np.isnan(utils_arr)] = PROB_MIN

    np.clip(utils_arr, PROB_MIN, PROB_MAX, out=utils_arr)

    probs = pd.DataFrame(utils_arr, columns=utils.columns, index=utils.index)

    return probs
Exemple #6
0
def eval_mnl(choosers,
             spec,
             locals_d=None,
             trace_label=None,
             trace_choice_name=None):
    """
    Simulate multinomial-logit choices for a spec that has no
    alternative-specific data (no interactions with alternative
    properties, no sampling of alternatives).

    Every spec row supplies an expression (often a boolean 0-1 trigger)
    plus one utility coefficient per alternative.  The evaluated
    expressions are matrix-multiplied with the coefficient columns, giving
    the total utility of each alternative: one row per chooser and one
    column per alternative.

    Parameters
    ----------
    choosers : pandas.DataFrame
    spec : pandas.DataFrame
        A table of variable specifications and coefficient values.
        Variable expressions should be in the table index and the table
        should have a column for each alternative.
    locals_d : Dict
        This is a dictionary of local variables that will be the environment
        for an evaluation of an expression that begins with @
    trace_label: str
        Label used for trace log file entries and dump file names
        when household tracing is enabled.
    trace_choice_name: str
        Column label to be used in the trace file csv dump of choices

    Returns
    -------
    choices : pandas.Series
        Index will be that of `choosers`, values will match the columns
        of `spec`.
    """

    trace_label = tracing.extend_trace_label(trace_label, 'mnl')
    check_for_variability = tracing.check_for_variability()

    # evaluated spec expressions, one column per spec row
    model_design = eval_variables(spec.index, choosers, locals_d)

    if check_for_variability:
        _check_for_variability(model_design, trace_label)

    # sum of partial utilities per alternative; pandas dot relies on the
    # column names of model_design matching the spec index values
    utilities = model_design.dot(spec)

    # arguments shared by the probability and choice steps
    reporting = dict(trace_label=trace_label, trace_choosers=choosers)
    probs = utils_to_probs(utilities, **reporting)
    choices = make_choices(probs, **reporting)

    if trace_label:
        tracing.trace_df(choosers, '%s.choosers' % trace_label)

        # (table, file suffix, label for the value column)
        for table, suffix, value_label in ((utilities, 'utilities', 'utility'),
                                           (probs, 'probs', 'probability')):
            tracing.trace_df(table,
                             '%s.%s' % (trace_label, suffix),
                             column_labels=['alternative', value_label])

        tracing.trace_df(choices,
                         '%s.choices' % trace_label,
                         columns=[None, trace_choice_name])
        tracing.trace_df(model_design,
                         '%s.model_design' % trace_label,
                         column_labels=['expression', None])

    return choices
Exemple #7
0
def _simple_simulate(choosers,
                     spec,
                     nest_spec,
                     skims=None,
                     locals_d=None,
                     trace_label=None,
                     trace_choice_name=None):
    """
    Dispatch to eval_mnl or eval_nl for a model whose spec does not
    involve alternative specific data (no interactions with alternative
    properties and no need to sample from alternatives).

    Parameters
    ----------
    choosers : pandas.DataFrame
    spec : pandas.DataFrame
        A table of variable specifications and coefficient values.
        Variable expressions should be in the table index and the table
        should have a column for each alternative.
    nest_spec:
        for nested logit (nl): dictionary specifying nesting structure
        and nesting coefficients
        for multinomial logit (mnl): None
    skims : Skims object
        The skims object is used to contain multiple matrices of
        origin-destination impedances.  Make sure to also add it to the
        locals_d below in order to access it in expressions.  When truthy,
        it is handed to add_skims together with choosers before any
        expression is evaluated.
    locals_d : Dict
        This is a dictionary of local variables that will be the environment
        for an evaluation of an expression that begins with @
    trace_label: str
        Label used for trace log file entries and dump file names when
        household tracing is enabled; it is extended with 'simple_simulate'.
    trace_choice_name: str
        Column label to be used in the trace file csv dump of choices

    Returns
    -------
    choices : pandas.Series
        Index will be that of `choosers`, values will match the columns
        of `spec`.
    """
    if skims:
        add_skims(choosers, skims)

    # keyword arguments common to both model types
    trace_kwargs = dict(
        trace_label=tracing.extend_trace_label(trace_label, 'simple_simulate'),
        trace_choice_name=trace_choice_name)

    # no nesting structure means plain multinomial logit
    if nest_spec is None:
        return eval_mnl(choosers, spec, locals_d, **trace_kwargs)

    return eval_nl(choosers, spec, nest_spec, locals_d, **trace_kwargs)
Exemple #8
0
def eval_nl(choosers,
            spec,
            nest_spec,
            locals_d,
            trace_label=None,
            trace_choice_name=None):
    """
    Simulate nested-logit choices for a spec that has no
    alternative-specific data (no interactions with alternative
    properties, no sampling of alternatives), reporting elapsed time
    after each stage.

    Parameters
    ----------
    choosers : pandas.DataFrame
    spec : pandas.DataFrame
        A table of variable specifications and coefficient values.
        Variable expressions should be in the table index and the table
        should have a column for each alternative.
    nest_spec:
        dictionary specifying nesting structure and nesting coefficients
        (from the model spec yaml file)
    locals_d : Dict or None
        This is a dictionary of local variables that will be the environment
        for an evaluation of an expression that begins with @
    trace_label: str
        Label used for trace log file entries and dump file names
        when household tracing is enabled.
    trace_choice_name: str
        Column label to be used in the trace file csv dump of choices

    Returns
    -------
    choices : pandas.Series
        Index will be that of `choosers`, values will match the columns
        of `spec`.
    """

    trace_label = tracing.extend_trace_label(trace_label, 'nl')
    check_for_variability = tracing.check_for_variability()

    # timestamp threaded through print_elapsed_time after every stage
    timer = tracing.print_elapsed_time()

    # column names of expression_values match spec index values
    expression_values = eval_variables(spec.index, choosers, locals_d)
    timer = tracing.print_elapsed_time("eval_variables", timer, debug=True)

    if check_for_variability:
        _check_for_variability(expression_values, trace_label)
    timer = tracing.print_elapsed_time("_check_for_variability", timer,
                                       debug=True)

    # raw utilities of all the leaves
    raw_utilities = compute_utilities(expression_values, spec)
    timer = tracing.print_elapsed_time("expression_values.dot", timer,
                                       debug=True)

    # exponentiated utilities of leaves and nests
    nested_exp_utilities = \
        compute_nested_exp_utilities(raw_utilities, nest_spec)
    timer = tracing.print_elapsed_time("compute_nested_exp_utilities", timer,
                                       debug=True)

    # probabilities of alternatives relative to siblings sharing the same nest
    nested_probabilities = \
        compute_nested_probabilities(nested_exp_utilities, nest_spec,
                                     trace_label=trace_label)
    timer = tracing.print_elapsed_time("compute_nested_probabilities", timer,
                                       debug=True)

    # global (flattened) leaf probabilities based on relative nest coefficients
    base_probabilities = \
        compute_base_probabilities(nested_probabilities, nest_spec)
    timer = tracing.print_elapsed_time("compute_base_probabilities", timer,
                                       debug=True)

    # base_probabilities could all be zero since all-zero probs are allowed
    # for nests; flag rows whose probabilities do not sum to (about) 1 so a
    # clear message is reported here (per the original note, make_choices
    # is expected to raise as well when probs don't sum to 1)
    BAD_PROB_THRESHOLD = 0.001
    row_sum_error = (base_probabilities.sum(axis=1) - 1.0).abs()
    no_choices = row_sum_error > BAD_PROB_THRESHOLD

    if no_choices.any():
        logit.report_bad_choices(
            no_choices,
            base_probabilities,
            tracing.extend_trace_label(trace_label, 'eval_nl'),
            tag='bad_probs',
            msg="base_probabilities all zero")

    timer = tracing.print_elapsed_time("report_bad_choices", timer,
                                       debug=True)

    choices, rands = logit.make_choices(base_probabilities,
                                        trace_label,
                                        trace_choosers=choosers)
    timer = tracing.print_elapsed_time("logit.make_choices", timer,
                                       debug=True)

    if trace_label:
        tracing.trace_df(choosers, '%s.choosers' % trace_label)

        # (table, file suffix, label for the value column)
        per_alternative_tables = (
            (raw_utilities, 'raw_utilities', 'utility'),
            (nested_exp_utilities, 'nested_exp_utilities', 'utility'),
            (nested_probabilities, 'nested_probabilities', 'probability'),
            (base_probabilities, 'base_probabilities', 'probability'),
        )
        for table, suffix, value_label in per_alternative_tables:
            tracing.trace_df(table,
                             '%s.%s' % (trace_label, suffix),
                             column_labels=['alternative', value_label])

        tracing.trace_df(choices,
                         '%s.choices' % trace_label,
                         columns=[None, trace_choice_name])
        tracing.trace_df(rands,
                         '%s.rands' % trace_label,
                         columns=[None, 'rand'])
        tracing.trace_df(expression_values,
                         '%s.expression_values' % trace_label,
                         column_labels=['expression', None])

    return choices
Exemple #9
0
def interaction_sample(choosers,
                       alternatives,
                       spec,
                       sample_size,
                       alt_col_name=None,
                       skims=None,
                       locals_d=None,
                       chunk_size=0,
                       trace_label=None):
    """
    Sample alternatives for each chooser, processing choosers in chunks.

    Each chunk of choosers is handed to _interaction_sample (which merges
    the chunk with alternatives, evaluates the spec and draws the samples)
    and the per-chunk results are concatenated.

    Parameters
    ----------
    choosers : pandas.DataFrame
        DataFrame of choosers
    alternatives : pandas.DataFrame
        DataFrame of alternatives - will be merged with choosers and sampled
    spec : pandas.DataFrame
        A Pandas DataFrame that gives the specification of the variables to
        compute and the coefficients for each variable.
        Variable specifications must be in the table index and the
        table should have only one column of coefficients.
    sample_size : int
        Number of alternatives to sample for each chooser. Must be > 0;
        it is capped at the number of alternatives.
    alt_col_name: str or None
        name to give the sampled_alternative column
    skims : Skims object
        The skims object is used to contain multiple matrices of
        origin-destination impedances.  Make sure to also add it to the
        locals_d below in order to access it in expressions.  The *only* job
        of this method in regards to skims is to call set_df with the
        dataframe that comes back from interacting choosers with
        alternatives.  See the skims module for more documentation on how
        the skims object is intended to be used.
    locals_d : Dict
        This is a dictionary of local variables that will be the environment
        for an evaluation of an expression that begins with @
    chunk_size : int
        if chunk_size > 0 iterates over choosers in chunk_size chunks
    trace_label: str
        This is the label to be used  for trace log file
        entries and dump file names when household tracing enabled.
        No tracing occurs if label is empty or None.

    Returns
    -------
    choices : pandas.DataFrame
        The concatenated per-chunk results of _interaction_sample: indexed
        by chooser, with one row per distinct (chooser, sampled alternative)
        pair (see _interaction_sample for the columns).

    Raises
    ------
    RuntimeError
        If the chunk iterator yields no chunks, so there is nothing to
        return.
    """

    assert sample_size > 0
    # cannot usefully sample more alternatives than exist
    sample_size = min(sample_size, len(alternatives.index))

    rows_per_chunk = asim_utils.num_chunk_rows_for_chunk_size(
        chunk_size, choosers, alternatives)

    print("interaction_simulate chunk_size %s num_choosers %s" %
          (chunk_size, len(choosers.index)))

    result_list = []
    for i, num_chunks, chooser_chunk in asim_utils.chunked_choosers(
            choosers, rows_per_chunk):

        print("Running chunk %s of %s size %d" %
              (i, num_chunks, len(chooser_chunk)))

        chunk_choices = _interaction_sample(
            chooser_chunk, alternatives, spec, sample_size, alt_col_name,
            skims, locals_d,
            tracing.extend_trace_label(trace_label, 'chunk_%s' % i))

        result_list.append(chunk_choices)

    # fail with a clear message rather than an unbound-name error when
    # no chunk was processed
    if not result_list:
        raise RuntimeError(
            "interaction_sample: no chooser chunks were processed")

    # FIXME: this will require 2X RAM
    # if necessary, could append to hdf5 store on disk:
    # http://pandas.pydata.org/pandas-docs/stable/io.html#id2
    if len(result_list) > 1:
        choices = pd.concat(result_list)
    else:
        # single chunk: return it as-is, no copy needed
        choices = result_list[0]

    # every chooser must be represented in the result
    assert len(choosers.index) == len(np.unique(choices.index.values))

    return choices
Exemple #10
0
def _interaction_sample(choosers,
                        alternatives,
                        spec,
                        sample_size,
                        alt_col_name,
                        skims=None,
                        locals_d=None,
                        trace_label=None):
    """
    Sample alternatives for one chunk of choosers.

    Cross joins choosers with alternatives, evaluates the spec to get a
    utility per (chooser, alternative), converts utilities to
    probabilities and draws sample_size alternatives per chooser via
    make_sample_choices. Duplicate picks of the same alternative by the
    same chooser are collapsed into a single row with a pick_count.

    Parameters are the same as for the public function interaction_sample
    (without chunk_size).

    NOTE: when skims is truthy, a column named after the alternatives
    index is added to the caller's ``alternatives`` DataFrame in place.

    Intermediate values
    -------------------
    spec : dataframe
        one row per spec expression and one col with utility coefficient

    interaction_df : dataframe
        cross join (cartesian product) of choosers with alternatives
        combines columns of choosers and alternatives
        len(df) == len(choosers) * len(alternatives)
        index values (non-unique) are index values from alternatives df

    interaction_utilities : dataframe
        the utility of each alternative is sum of the partial
        utilities determined by the various spec expressions and
        their corresponding coefficients yielding a dataframe
        with len(interaction_df) rows and one utility column
        having the same index as interaction_df (non-unique values
        from alternatives df)

    utilities : dataframe
        interaction_utilities reshaped as a dataframe
        with one row per chooser and one column per alternative

    probs : dataframe
        utilities exponentiated and converted to probabilities
        same shape as utilities, one row per chooser and one column
        per alternative

    Returns
    -------
    choices_df : pandas.DataFrame

        A DataFrame where index should match the index of the choosers
        DataFrame and columns alt_col_name, prob, rand, pick_count

        prob: float
            the probability of the chosen alternative
        rand: float
            the rand that did the choosing
        pick_count : int
            number of duplicate picks for chooser, alt

    Raises
    ------
    RuntimeError
        If spec has more than one column.
    """

    trace_label = tracing.extend_trace_label(trace_label,
                                             'interaction_simulate')
    have_trace_targets = trace_label and tracing.has_trace_targets(choosers)

    if alt_col_name is None:
        alt_col_name = 'alt_%s' % alternatives.index.name

    if have_trace_targets:
        tracing.trace_df(choosers,
                         tracing.extend_trace_label(trace_label, 'choosers'))
        tracing.trace_df(alternatives,
                         tracing.extend_trace_label(trace_label,
                                                    'alternatives'),
                         slicer='NONE',
                         transpose=False)

    if len(spec.columns) > 1:
        raise RuntimeError('spec must have only one column')

    alternative_count = len(alternatives)
    # print("_interaction_sample alternative_count %s" % alternative_count)

    # if using skims, copy index into the dataframe, so it will be
    # available as the "destination" for the skims dereference below
    # (this modifies the caller's alternatives dataframe)
    if skims:
        alternatives[alternatives.index.name] = alternatives.index

    # cross join choosers and alternatives (cartesian product)
    # for every chooser, there will be a row for each alternative
    # index values (non-unique) are from alternatives df
    interaction_df = logit.interaction_dataset(choosers, alternatives,
                                               alternative_count)

    assert alternative_count == len(interaction_df.index) / len(choosers.index)

    if skims:
        asim_utils.add_skims(interaction_df, skims)

    # evaluate expressions from the spec multiply by coefficients and sum
    # spec is df with one row per spec expression and one col
    # with utility coefficient column names of interaction_df match spec
    # index values utilities has utility value for element in the
    # cross product of choosers and alternatives interaction_utilities is
    # a df with one utility column and one row per row in interaction_df
    if have_trace_targets:
        trace_rows, trace_ids \
            = tracing.interaction_trace_rows(
                interaction_df, choosers, alternative_count)

        tracing.trace_df(interaction_df[trace_rows],
                         tracing.extend_trace_label(trace_label,
                                                    'interaction_df'),
                         slicer='NONE',
                         transpose=False)
    else:
        trace_rows = trace_ids = None

    interaction_utilities, trace_eval_results \
        = eval_interaction_utilities(
            spec, interaction_df, locals_d, trace_label, trace_rows)

    if have_trace_targets:
        tracing.trace_interaction_eval_results(
            trace_eval_results, trace_ids,
            tracing.extend_trace_label(trace_label, 'eval'))

        tracing.trace_df(interaction_utilities[trace_rows],
                         tracing.extend_trace_label(trace_label,
                                                    'interaction_utilities'),
                         slicer='NONE',
                         transpose=False)

    tracing.dump_df(DUMP, interaction_utilities, trace_label,
                    'interaction_utilities')

    # FIXME - do this in numpy, not pandas?
    # reshape utilities (one utility column and one row per
    # row in interaction_utilities) to a dataframe with one
    # row per chooser and one column per alternative
    # (.values rather than as_matrix(), which newer pandas no longer has)
    utilities = pd.DataFrame(
        interaction_utilities.values.reshape(len(choosers),
                                             alternative_count),
        index=choosers.index)

    if have_trace_targets:
        tracing.trace_df(utilities,
                         tracing.extend_trace_label(trace_label, 'utilities'),
                         column_labels=['alternative', 'utility'])

    tracing.dump_df(DUMP, utilities, trace_label, 'utilities')

    # FIXME - do this in numpy, not pandas?
    # convert to probabilities (utilities exponentiated
    # and normalized to probs) probs is same shape as utilities,
    # one row per chooser and one column for alternative
    probs = logit.utils_to_probs(utilities,
                                 trace_label=trace_label,
                                 trace_choosers=choosers)

    if have_trace_targets:
        tracing.trace_df(probs,
                         tracing.extend_trace_label(trace_label, 'probs'),
                         column_labels=['alternative', 'probability'])

    choices_df = make_sample_choices(choosers, probs, interaction_utilities,
                                     sample_size, alternative_count,
                                     alt_col_name, trace_label)

    # make_sample_choices should return choosers index as choices_df column
    assert choosers.index.name in choices_df.columns

    # pick_count and pick_dup
    # pick_count is number of duplicate picks
    # pick_dup flag is True for all but first of duplicates
    pick_group = choices_df.groupby([choosers.index.name, alt_col_name])

    # number each item in each group from 0 to the length of that group - 1.
    choices_df['pick_count'] = pick_group.cumcount(ascending=True)
    # flag duplicate rows after first
    choices_df['pick_dup'] = choices_df['pick_count'] > 0
    # add reverse cumcount to get total pick_count
    # (conveniently faster than groupby.count + merge)
    # forward position + reverse position + 1 == size of the group
    choices_df['pick_count'] += pick_group.cumcount(ascending=False) + 1

    # drop the duplicates
    choices_df = choices_df[~choices_df['pick_dup']]
    del choices_df['pick_dup']

    # set index after groupby so we can trace on it
    choices_df.set_index(choosers.index.name, inplace=True)

    tracing.dump_df(DUMP, choices_df, trace_label, 'choices_df')

    if have_trace_targets:
        tracing.trace_df(choices_df,
                         tracing.extend_trace_label(trace_label,
                                                    'sampled_alternatives'),
                         transpose=False,
                         column_labels=['sample_alt', 'alternative'])

    return choices_df
Exemple #11
0
def make_sample_choices(choosers, probs, interaction_utilities, sample_size,
                        alternative_count, alt_col_name, trace_label):
    """
    Draw sample_size alternatives for every chooser from probs.

    For each of the sample_size draws, a random number per chooser is
    compared against that chooser's cumulative probabilities; the first
    alternative whose cumulative probability exceeds the random number is
    the pick.

    Parameters
    ----------
    choosers : pandas.DataFrame
        choosers; its index (and index name) label the result rows
    probs : pandas DataFrame
        one row per chooser and one column per alternative
    interaction_utilities : pandas.DataFrame
        dataframe  with len(interaction_df) rows and one utility column;
        only its index is used here, to map a (chooser, alternative
        position) pair back to the alternative's index value
    sample_size : int
        number of samples/choices to make
    alternative_count : int
        number of alternatives (columns of probs)
    alt_col_name : str
        name of the result column holding the sampled alternative
    trace_label : str
        label used when reporting bad probabilities

    Returns
    -------
    choices_df : pandas.DataFrame
        sample_size rows per chooser (rows for a chooser are adjacent,
        choosers in their original order) with columns:
        alt_col_name (index value of the picked alternative),
        'rand' (the random number that made the pick),
        'prob' (probability of the picked alternative) and
        choosers.index.name (the chooser's index value).
        Duplicate picks are NOT collapsed here.
    """

    assert isinstance(probs, pd.DataFrame)
    assert probs.shape == (len(choosers), alternative_count)

    assert isinstance(interaction_utilities, pd.DataFrame)
    assert interaction_utilities.shape == (len(choosers) * alternative_count,
                                           1)

    t0 = tracing.print_elapsed_time()

    # probs should sum to 1 across each row
    BAD_PROB_THRESHOLD = 0.001
    bad_probs = (probs.sum(axis=1) - 1.0).abs() > BAD_PROB_THRESHOLD

    if bad_probs.any():
        # report_bad_choices is a function of the logit module
        logit.report_bad_choices(
            bad_probs,
            probs,
            tracing.extend_trace_label(trace_label, 'bad_probs'),
            msg="probabilities do not add up to 1",
            trace_choosers=choosers)

    t0 = tracing.print_elapsed_time("make_choices bad_probs", t0, debug=True)

    # .values rather than as_matrix(), which newer pandas no longer has
    probs_arr = probs.values
    cum_probs_arr = probs_arr.cumsum(axis=1)
    t0 = tracing.print_elapsed_time("make_choices cum_probs_arr",
                                    t0,
                                    debug=True)

    # alt probs in convenient layout to return prob of chose alternative
    # (same layout as cum_probs_arr and interaction_utilities)
    alt_probs_array = probs_arr.flatten()

    # get sample_size rands for each chooser
    # transform as we iterate over alternatives
    # reshape so rands[i] is in broadcastable (2-D) shape for cum_probs_arr
    # i.e rands[i] is a 2-D array of one alt choice rand for each chooser
    # (assumes random_for_df returns an array with one row per chooser and
    # n columns - TODO confirm against the rn generator)
    rands = asim_utils.get_rn_generator().random_for_df(probs, n=sample_size)
    rands = rands.T.reshape(sample_size, -1, 1)
    t0 = tracing.print_elapsed_time("make_choices random_for_df",
                                    t0,
                                    debug=True)

    # the alternative value chosen
    choices_array = np.empty([sample_size, len(choosers)], dtype=int)

    # the probability of the chosen alternative
    choice_probs_array = np.empty([sample_size, len(choosers)])

    # offsets is the offset into the chooser x alternative cross join of
    # the first row of each chooser's alternatives (same for every draw)
    offsets = np.arange(len(probs.index)) * alternative_count

    # FIXME - do this all at once rather than iterate?
    for i in range(sample_size):

        # rands for this draw in broadcastable shape
        r = rands[i]

        # position (column offset in probs) of the first alternative whose
        # cumulative probability exceeds the rand
        # NOTE: argmax of an all-False row is 0, so a rand not exceeded by
        # any cumulative probability selects the first alternative
        positions = np.argmax(cum_probs_arr > r, axis=1)

        # need to get from an integer offset into the alternative
        # sample to the alternative index that is, we want the index
        # value of the row that is offset by <position> rows into the
        # tranche of this choosers alternatives created by cross join
        # of alternatives and choosers
        flat_positions = positions + offsets

        # one element per chooser, in same order as choosers
        choices_array[i] = interaction_utilities.index.take(flat_positions)

        choice_probs_array[i] = np.take(alt_probs_array, flat_positions)

    # explode to one row per chooser.index, alt_TAZ
    # (column-major flatten puts all draws for a chooser together, matching
    # the repeated chooser index)
    choices_df = pd.DataFrame({
        alt_col_name:
        choices_array.flatten(order='F'),
        'rand':
        rands.flatten(order='F'),
        'prob':
        choice_probs_array.flatten(order='F'),
        choosers.index.name:
        np.repeat(np.asanyarray(choosers.index), sample_size)
    })

    return choices_df