Example #1
def create_mandatory_tours():

    # FIXME - move this to body?

    persons = inject.get_table('persons')
    configs_dir = inject.get_injectable('configs_dir')

    persons = persons.to_frame(columns=[
        "mandatory_tour_frequency", "is_worker", "school_taz", "workplace_taz"
    ])
    persons = persons[~persons.mandatory_tour_frequency.isnull()]

    tour_frequency_alternatives = inject.get_injectable(
        'mandatory_tour_frequency_alternatives')

    tours = process_mandatory_tours(persons, tour_frequency_alternatives)

    expressions.assign_columns(df=tours,
                               model_settings='annotate_tours_with_dest',
                               configs_dir=configs_dir,
                               trace_label='create_mandatory_tours')

    pipeline.extend_table("tours", tours)
    tracing.register_traceable_table('tours', tours)
    pipeline.get_rn_generator().add_channel(tours, 'tours')
Example #2
def create_simple_trips(tours, households, persons, trace_hh_id):
    """
    Create a simple trip table
    """

    logger.info("Running simple trips table creation with %d tours" %
                len(tours.index))

    tours_df = tours.to_frame()

    # we now have a tour_id column
    tours_df.reset_index(inplace=True)

    tours_df['household_id'] = reindex(persons.household_id,
                                       tours_df.person_id)
    tours_df['TAZ'] = reindex(households.TAZ, tours_df.household_id)

    # create inbound and outbound records
    trips = pd.concat([tours_df, tours_df], ignore_index=True)

    # first half are outbound, second half are inbound
    trips['INBOUND'] = np.repeat([False, True], len(trips.index) // 2)

    # trip_num for outbound trips = 1, inbound trips = 2
    trips['trip_num'] = np.repeat([1, 2], len(trips.index) // 2)

    # set key fields from tour fields: 'TAZ','destination','start','end'
    trips['OTAZ'] = trips.TAZ
    trips.loc[trips.INBOUND, 'OTAZ'] = trips.destination[trips.INBOUND]

    trips['DTAZ'] = trips.destination
    trips.loc[trips.INBOUND, 'DTAZ'] = trips.TAZ[trips.INBOUND]

    trips['start_trip'] = trips.start
    trips.loc[trips.INBOUND, 'start_trip'] = trips.end[trips.INBOUND]

    trips['end_trip'] = trips.end
    trips.loc[trips.INBOUND, 'end_trip'] = trips.start[trips.INBOUND]

    # create a stable (predictable) index based on tour_id and trip_num
    possible_trips_count = 2
    trips['trip_id'] = (trips.tour_id *
                        possible_trips_count) + (trips.trip_num - 1)
    trips.set_index('trip_id', inplace=True, verify_integrity=True)

    trip_columns = [
        'tour_id', 'INBOUND', 'trip_num', 'OTAZ', 'DTAZ', 'start_trip',
        'end_trip'
    ]
    trips = trips[trip_columns]

    orca.add_table("trips", trips)

    tracing.register_traceable_table('trips', trips)
    pipeline.get_rn_generator().add_channel(trips, 'trips')

    if trace_hh_id:
        tracing.trace_df(trips, label="trips", warn_if_empty=True)
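A note on the trip_id scheme above: because each tour spawns at most possible_trips_count trips, tour_id * possible_trips_count reserves a contiguous block of ids per tour and trip_num - 1 indexes into that block, so re-running the model reproduces the same index. A minimal standalone sketch (the toy tour_ids are assumptions):

possible_trips_count = 2
for tour_id in [100, 101]:                     # hypothetical tour_ids
    for trip_num in (1, 2):                    # outbound, inbound
        trip_id = tour_id * possible_trips_count + (trip_num - 1)
        print(tour_id, trip_num, trip_id)      # 100 -> 200, 201; 101 -> 202, 203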
Example #3
def create_households(trace_hh_id):

    df = pd.DataFrame({'HHID': [1, 2, 3], 'TAZ': [100, 100, 101]})
    inject.add_table('households', df)

    pipeline.get_rn_generator().add_channel(df, 'households')

    if trace_hh_id:
        tracing.register_traceable_table('households', df)
Example #4
def trip_departure_choice(
        trips,
        trips_merged,
        skim_dict,
        chunk_size,
        trace_hh_id):

    trace_label = 'trip_departure_choice'
    model_settings = config.read_model_settings('trip_departure_choice.yaml')

    spec = simulate.read_model_spec(file_name=model_settings['SPECIFICATION'])

    trips_merged_df = trips_merged.to_frame()
    # add tour-based chunk_id so we can chunk all trips in tour together
    tour_ids = trips_merged[TOUR_ID].unique()
    trips_merged_df['chunk_id'] = reindex(pd.Series(list(range(len(tour_ids))), tour_ids), trips_merged_df.tour_id)

    max_tour_id = trips_merged[TOUR_ID].max()

    trip_departure_choice.MAX_TOUR_ID = int(np.power(10, np.ceil(np.log10(max_tour_id))))
    locals_d = config.get_model_constants(model_settings).copy()

    preprocessor_settings = model_settings.get('PREPROCESSOR', None)
    tour_legs = get_tour_legs(trips_merged_df)
    pipeline.get_rn_generator().add_channel('tour_legs', tour_legs)

    if preprocessor_settings:
        od_skim = skim_dict.wrap('origin', 'destination')
        do_skim = skim_dict.wrap('destination', 'origin')

        skims = [od_skim, do_skim]

        simulate.set_skim_wrapper_targets(trips_merged_df, skims)

        locals_d.update({
            "od_skims": od_skim,
            "do_skims": do_skim,
        })

        expressions.assign_columns(
            df=trips_merged_df,
            model_settings=preprocessor_settings,
            locals_dict=locals_d,
            trace_label=trace_label)

    choices = apply_stage_two_model(spec, trips_merged_df, chunk_size, trace_label)

    trips_df = trips.to_frame()
    trip_length = len(trips_df)
    trips_df = pd.concat([trips_df, choices], axis=1)
    assert len(trips_df) == trip_length
    assert trips_df[trips_df['depart'].isnull()].empty

    pipeline.replace_table("trips", trips_df)
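A quick sketch of the tour-based chunk_id technique used above: each unique tour_id is mapped to a sequential integer, which every trip in that tour then looks up, so a chunk boundary can never split a tour. Plain pandas .map stands in here for ActivitySim's reindex helper, and the toy ids are assumptions:

import pandas as pd

trips = pd.DataFrame({'tour_id': [7, 7, 9, 9, 9]}, index=[1, 2, 3, 4, 5])
tour_ids = trips['tour_id'].unique()
chunk_of_tour = pd.Series(range(len(tour_ids)), index=tour_ids)  # 7 -> 0, 9 -> 1
trips['chunk_id'] = trips['tour_id'].map(chunk_of_tour)
print(trips['chunk_id'].tolist())                                # [0, 0, 1, 1, 1]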
Example #5
def create_households(trace_hh_id):

    df = pd.DataFrame({
        'household_id': [1, 2, 3],
        'home_zone_id': [100, 100, 101]
    })
    inject.add_table('households', df)

    pipeline.get_rn_generator().add_channel('households', df)

    tracing.register_traceable_table('households', df)
Example #6
def create_mandatory_tours_table():

    persons = orca.get_table('persons')

    persons = persons.to_frame(columns=["mandatory_tour_frequency",
                                        "is_worker", "school_taz", "workplace_taz"])
    persons = persons[~persons.mandatory_tour_frequency.isnull()]
    df = process_mandatory_tours(persons)

    orca.add_table("mandatory_tours", df)
    tracing.register_traceable_table('mandatory_tours', df)
    pipeline.get_rn_generator().add_channel(df, 'tours')
Example #7
def create_non_mandatory_tours_table():

    persons = orca.get_table('persons')
    non_mandatory_tour_frequency_alts = orca.get_table('non_mandatory_tour_frequency_alts')

    df = process_non_mandatory_tours(
        persons.non_mandatory_tour_frequency.dropna(),
        non_mandatory_tour_frequency_alts.local
    )

    orca.add_table("non_mandatory_tours", df)
    tracing.register_traceable_table('non_mandatory_tours', df)
    pipeline.get_rn_generator().add_channel(df, 'tours')
Example #8
def initialize_tours(network_los, households, persons, trace_hh_id):

    trace_label = 'initialize_tours'

    tours = read_input_table("tours")

    # FIXME can't use households_sliced injectable as flag like persons table does in case of resume_after.
    # FIXME could just always slice...
    slice_happened = inject.get_injectable('households_sample_size', 0) > 0
    if slice_happened:
        logger.info("slicing tours %s" % (tours.shape,))
        # keep only tours whose persons are in the sampled households
        tours = tours[tours.person_id.isin(persons.index)]

    # annotate before patching tour_id to allow addition of REQUIRED_TOUR_COLUMNS defined above
    model_settings = config.read_model_settings('initialize_tours.yaml', mandatory=True)
    expressions.assign_columns(
        df=tours,
        model_settings=model_settings.get('annotate_tours'),
        trace_label=tracing.extend_trace_label(trace_label, 'annotate_tours'))

    if not model_settings.get('skip_patch_tour_ids', False):
        tours = patch_tour_ids(tours)
    assert tours.index.name == 'tour_id'

    # replace table function with dataframe
    inject.add_table('tours', tours)

    pipeline.get_rn_generator().add_channel('tours', tours)

    tracing.register_traceable_table('tours', tours)

    logger.debug(f"{len(tours.household_id.unique())} unique household_ids in tours")
    logger.debug(f"{len(households.index.unique())} unique household_ids in households")
    assert not tours.index.duplicated().any()

    tours_without_persons = ~tours.person_id.isin(persons.index)
    if tours_without_persons.any():
        logger.error(f"{tours_without_persons.sum()} tours out of {len(persons)} without persons\n"
                     f"{pd.Series({'person_id': tours_without_persons.index.values})}")
        raise RuntimeError(f"{tours_without_persons.sum()} tours with bad person_id")

    if trace_hh_id:
        tracing.trace_df(tours,
                         label='initialize_tours',
                         warn_if_empty=True)
Example #9
def test_rng_access():

    configs_dir = os.path.join(os.path.dirname(__file__), 'configs')
    orca.add_injectable("configs_dir", configs_dir)

    output_dir = os.path.join(os.path.dirname(__file__), 'output')
    orca.add_injectable("output_dir", output_dir)

    data_dir = os.path.join(os.path.dirname(__file__), 'data')
    orca.add_injectable("data_dir", data_dir)

    inject_settings(configs_dir, households_sample_size=HOUSEHOLDS_SAMPLE_SIZE)

    orca.clear_cache()

    pipeline.set_rn_generator_base_seed(0)

    pipeline.open_pipeline()

    with pytest.raises(RuntimeError) as excinfo:
        pipeline.set_rn_generator_base_seed(0)
    assert "call set_rn_generator_base_seed before the first step" in str(
        excinfo.value)

    rng = pipeline.get_rn_generator()

    pipeline.close_pipeline()
    orca.clear_cache()
Example #10
def persons(households, trace_hh_id):

    df = read_raw_persons(households)

    logger.info("loaded persons %s" % (df.shape, ))

    # replace table function with dataframe
    inject.add_table('persons', df)

    pipeline.get_rn_generator().add_channel('persons', df)

    if trace_hh_id:
        tracing.register_traceable_table('persons', df)
        tracing.trace_df(df, "raw.persons", warn_if_empty=True)

    return df
Example #11
def persons(households, trace_hh_id):

    df = read_raw_persons(households)

    logger.info("loaded persons %s" % (df.shape,))

    df.index.name = 'person_id'

    # replace table function with dataframe
    inject.add_table('persons', df)

    pipeline.get_rn_generator().add_channel('persons', df)

    if trace_hh_id:
        tracing.register_traceable_table('persons', df)
        tracing.trace_df(df, "raw.persons", warn_if_empty=True)

    return df
Example #12
def create_non_mandatory_tours():
    """
    We have now generated non-mandatory tours, but they are attributes of the person table.
    Now we create a "tours" table which has one row per tour that has been generated
    (and the person id it is associated with).
    """

    persons = inject.get_table('persons')
    alts = inject.get_injectable('non_mandatory_tour_frequency_alts')

    df = process_non_mandatory_tours(
        persons.non_mandatory_tour_frequency.dropna(),
        alts
    )

    pipeline.extend_table("tours", df)
    tracing.register_traceable_table('tours', df)
    pipeline.get_rn_generator().add_channel(df, 'tours')
Example #13
def persons(households, trace_hh_id):

    df = read_raw_persons(households)

    logger.info("loaded persons %s" % (df.shape, ))

    # replace table function with dataframe
    inject.add_table('persons', df)

    pipeline.get_rn_generator().add_channel('persons', df)

    tracing.register_traceable_table('persons', df)
    if trace_hh_id:
        tracing.trace_df(df, "raw.persons", warn_if_empty=True)

    logger.debug(
        f"{len(df.household_id.unique())} unique household_ids in persons")
    logger.debug(
        f"{len(households.index.unique())} unique household_ids in households")
    assert not households.index.duplicated().any()
    assert not df.index.duplicated().any()

    persons_without_households = ~df.household_id.isin(households.index)
    if persons_without_households.any():
        logger.error(
            f"{persons_without_households.sum()} persons out of {len(df)} without households\n"
            f"{pd.Series({'person_id': df.index[persons_without_households].values})}"
        )
        raise RuntimeError(
            f"{persons_without_households.sum()} persons with bad household_id"
        )

    households_without_persons = df.groupby('household_id').size().reindex(
        households.index).isnull()
    if households_without_persons.any():
        logger.error(
            f"{households_without_persons.sum()} households out of {len(households.index)} without persons\n"
            f"{pd.Series({'household_id': households.index[households_without_persons].values})}"
        )
        raise RuntimeError(
            f"{households_without_persons.sum()} households with no persons")

    return df
Example #14
def persons(store, households_sample_size, households, trace_hh_id):

    df = store["persons"]

    if households_sample_size > 0:
        # keep all persons in the sampled households
        df = df[df.household_id.isin(households.index)]

    logger.info("loaded persons %s" % (df.shape, ))

    # replace table function with dataframe
    orca.add_table('persons', df)

    pipeline.get_rn_generator().add_channel(df, 'persons')

    if trace_hh_id:
        tracing.register_traceable_table('persons', df)
        tracing.trace_df(df, "persons", warn_if_empty=True)

    return df
Example #15
def households(store, households_sample_size, trace_hh_id):

    df_full = store["households"]

    # if we are tracing hh exclusively
    if trace_hh_id and households_sample_size == 1:

        # df contains only trace_hh (or empty if not in full store)
        df = tracing.slice_ids(df_full, trace_hh_id)

    # if we need sample a subset of full store
    elif households_sample_size > 0 and df_full.shape[0] > households_sample_size:

        logger.info("sampling %s of %s households" % (households_sample_size, df_full.shape[0]))

        # take the requested random sample
        df = asim.random_rows(df_full, households_sample_size)

        # if tracing and we missed trace_hh in sample, but it is in full store
        if trace_hh_id and trace_hh_id not in df.index and trace_hh_id in df_full.index:
                # replace first hh in sample with trace_hh
                logger.debug("replacing household %s with %s in household sample" %
                             (df.index[0], trace_hh_id))
                df_hh = tracing.slice_ids(df_full, trace_hh_id)
                df = pd.concat([df_hh, df[1:]])

    else:
        df = df_full

    logger.info("loaded households %s" % (df.shape,))

    # replace table function with dataframe
    inject.add_table('households', df)

    pipeline.get_rn_generator().add_channel(df, 'households')

    if trace_hh_id:
        tracing.register_traceable_table('households', df)
        tracing.trace_df(df, "households", warn_if_empty=True)

    return df
Example #16
def get_trip_mc_logsums_for_all_modes(tours, segment_column_name,
                                      model_settings, trace_label):
    """Creates pseudo-trips from tours and runs trip mode choice to get logsums

    Parameters
    ----------
    tours : pandas.DataFrame
    segment_column_name : str
        column in tours table used for segmenting model spec
    model_settings : dict
    trace_label : str

    Returns
    -------
    tours : pd.DataFrame
        Adds two * n_modes logsum columns to each tour row, e.g. "logsum_DRIVE_outbound"
    """

    # create pseudo-trips from tours for all tour modes
    logsum_trips = create_logsum_trips(tours, segment_column_name,
                                       model_settings, trace_label)

    # temporarily register trips in the pipeline
    pipeline.replace_table('trips', logsum_trips)
    tracing.register_traceable_table('trips', logsum_trips)
    pipeline.get_rn_generator().add_channel('trips', logsum_trips)

    # run trip mode choice on pseudo-trips. use orca instead of pipeline to
    # execute the step because pipeline can only handle one open step at a time
    orca.run(['trip_mode_choice'])

    # add trip mode choice logsums as new cols in tours
    tours = append_tour_leg_trip_mode_choice_logsums(tours)

    # de-register logsum trips table
    pipeline.get_rn_generator().drop_channel('trips')
    tracing.deregister_traceable_table('trips')

    return tours
Example #17
def create_non_mandatory_tours(trace_hh_id):
    """
    We have now generated non-mandatory tours, but they are attributes of the person table.
    Now we create a "tours" table which has one row per tour that has been generated
    (and the person id it is associated with).
    """

    persons = inject.get_table('persons')
    alts = inject.get_injectable('non_mandatory_tour_frequency_alts')

    non_mandatory_tours = process_non_mandatory_tours(
        persons.non_mandatory_tour_frequency.dropna(),
        alts
    )

    tours = pipeline.extend_table("tours", non_mandatory_tours)
    tracing.register_traceable_table('tours', tours)
    pipeline.get_rn_generator().add_channel(non_mandatory_tours, 'tours')

    if trace_hh_id:
        tracing.trace_df(non_mandatory_tours,
                         label="non_mandatory_tour_frequency.non_mandatory_tours",
                         warn_if_empty=True)
Example #18
def test_rng_access():

    setup_dirs()

    inject.add_injectable('rng_base_seed', 0)

    pipeline.open_pipeline()

    rng = pipeline.get_rn_generator()

    assert isinstance(rng, random.Random)

    pipeline.close_pipeline()
    inject.clear_cache()
Example #19
def test_rng_access():

    configs_dir = os.path.join(os.path.dirname(__file__), 'configs')

    setup_dirs(configs_dir)

    inject.add_injectable('rng_base_seed', 0)

    pipeline.open_pipeline()

    rng = pipeline.get_rn_generator()

    assert isinstance(rng, random.Random)

    pipeline.close_pipeline()
    inject.clear_cache()
Example #20
def mandatory_tour_frequency(persons_merged,
                             mandatory_tour_frequency_spec,
                             mandatory_tour_frequency_settings,
                             trace_hh_id):
    """
    This model predicts the frequency of making mandatory trips (see the
    alternatives above) - these trips include work and school in some combination.
    """

    choosers = persons_merged.to_frame()
    # filter based on results of CDAP
    choosers = choosers[choosers.cdap_activity == 'M']
    logger.info("Running mandatory_tour_frequency with %d persons" % len(choosers))

    nest_spec = config.get_logit_model_settings(mandatory_tour_frequency_settings)
    constants = config.get_model_constants(mandatory_tour_frequency_settings)

    choices = asim.simple_simulate(
        choosers,
        spec=mandatory_tour_frequency_spec,
        nest_spec=nest_spec,
        locals_d=constants,
        trace_label=trace_hh_id and 'mandatory_tour_frequency',
        trace_choice_name='mandatory_tour_frequency')

    # convert indexes to alternative names
    choices = pd.Series(
        mandatory_tour_frequency_spec.columns[choices.values],
        index=choices.index).reindex(persons_merged.local.index)

    tracing.print_summary('mandatory_tour_frequency', choices, value_counts=True)

    orca.add_column("persons", "mandatory_tour_frequency", choices)
    pipeline.add_dependent_columns("persons", "persons_mtf")

    create_mandatory_tours_table()

    # FIXME - test prng repeatability
    r = pipeline.get_rn_generator().random_for_df(choices)
    orca.add_column("persons", "mtf_rand", [item for sublist in r for item in sublist])

    if trace_hh_id:
        trace_columns = ['mandatory_tour_frequency']
        tracing.trace_df(orca.get_table('persons_merged').to_frame(),
                         label="mandatory_tour_frequency",
                         columns=trace_columns,
                         warn_if_empty=True)
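The "convert indexes to alternative names" step above relies on positional indexing: simple_simulate returns the integer position of the chosen alternative, and the spec's columns carry the alternative names. A minimal sketch with hypothetical alternative names and person ids:

import pandas as pd

spec_columns = pd.Index(['work1', 'school1', 'work_and_school'])
choices = pd.Series([0, 2, 1], index=[101, 102, 103])  # positions by person_id
names = pd.Series(spec_columns[choices.values], index=choices.index)
print(names.tolist())  # ['work1', 'work_and_school', 'school1']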
Example #21
def local_utilities():
    """
    Dict of useful modules and functions to provide as locals for use in eval of expressions

    Returns
    -------
    utility_dict : dict
        name, entity pairs of locals
    """

    utility_dict = {
        'pd': pd,
        'np': np,
        'reindex': util.reindex,
        'setting': config.setting,
        'other_than': util.other_than,
        'rng': pipeline.get_rn_generator(),
    }

    return utility_dict
Example #23
def local_utilities():
    """
    Dict of useful modules and functions to provide as locals for use in eval of expressions

    Returns
    -------
    utility_dict : dict
        name, entity pairs of locals
    """

    utility_dict = {
        'pd': pd,
        'np': np,
        'reindex': util.reindex,
        'reindex_i': util.reindex_i,
        'setting': config.setting,
        'other_than': util.other_than,
        'skim_time_period_label': expressions.skim_time_period_label,
        'rng': pipeline.get_rn_generator(),
    }

    utility_dict.update(config.get_global_constants())

    return utility_dict
Example #24
def households(households_sample_size, override_hh_ids, trace_hh_id):

    df_full = read_input_table("households")
    households_sliced = False

    logger.info("full household list contains %s households" % df_full.shape[0])

    # only using households listed in override_hh_ids
    if override_hh_ids is not None:

        # trace_hh_id will not be used if it is not in the list of override_hh_ids
        logger.info("override household list contains %s households" % len(override_hh_ids))

        df = df_full[df_full.index.isin(override_hh_ids)]
        households_sliced = True

        if df.shape[0] < len(override_hh_ids):
            logger.info("found %s of %s households in override household list" %
                        (df.shape[0], len(override_hh_ids)))

        if df.shape[0] == 0:
            raise RuntimeError('No override households found in store')

    # if we are tracing hh exclusively
    elif trace_hh_id and households_sample_size == 1:

        # df contains only trace_hh (or empty if not in full store)
        df = tracing.slice_ids(df_full, trace_hh_id)
        households_sliced = True

    # if we need a subset of full store
    elif households_sample_size > 0 and df_full.shape[0] > households_sample_size:

        logger.info("sampling %s of %s households" % (households_sample_size, df_full.shape[0]))

        """
        Because random seed is set differently for each step, sampling of households using
        Random.global_rng would sample differently depending upon which step it was called from.
        We use a one-off rng seeded with the pseudo step name 'sample_households' to provide
        repeatable sampling no matter when the table is loaded.

        Note that the external_rng is also seeded with base_seed so the sample will (rightly) change
        if the pipeline rng's base_seed is changed
        """

        prng = pipeline.get_rn_generator().get_external_rng('sample_households')
        df = df_full.take(prng.choice(len(df_full), size=households_sample_size, replace=False))
        households_sliced = True

        # if tracing and we missed trace_hh in sample, but it is in full store
        if trace_hh_id and trace_hh_id not in df.index and trace_hh_id in df_full.index:
            # replace first hh in sample with trace_hh
            logger.debug("replacing household %s with %s in household sample" %
                         (df.index[0], trace_hh_id))
            df_hh = df_full.loc[[trace_hh_id]]
            df = pd.concat([df_hh, df[1:]])

    else:
        df = df_full

    # make the households_sliced flag available to downstream tables (e.g. persons)
    inject.add_injectable('households_sliced', households_sliced)

    logger.info("loaded households %s" % (df.shape,))

    df.index.name = 'household_id'

    # FIXME - pathological knowledge of name of chunk_id column used by chunked_choosers_by_chunk_id
    assert 'chunk_id' not in df.columns
    df['chunk_id'] = pd.Series(list(range(len(df))), df.index)

    # replace table function with dataframe
    inject.add_table('households', df)

    pipeline.get_rn_generator().add_channel('households', df)

    if trace_hh_id:
        tracing.register_traceable_table('households', df)
        tracing.trace_df(df, "raw.households", warn_if_empty=True)

    return df
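The comment block above explains why sampling uses a one-off rng keyed to the pseudo step name 'sample_households'. As an illustration only (not ActivitySim's implementation), a repeatable per-name rng can be derived from the base seed and the step name with a stable hash:

import hashlib
import numpy as np

def external_rng(base_seed, step_name):
    # hashlib is stable across runs, unlike Python's salted hash()
    digest = hashlib.md5(f"{base_seed}:{step_name}".encode()).hexdigest()
    return np.random.RandomState(int(digest[:8], 16))

prng = external_rng(0, 'sample_households')
sample_rows = prng.choice(1000, size=10, replace=False)  # same sample every run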
Example #25
def mandatory_tour_frequency(persons_merged, chunk_size, trace_hh_id):
    """
    This model predicts the frequency of making mandatory trips (see the
    alternatives above) - these trips include work and school in some combination.
    """
    trace_label = 'mandatory_tour_frequency'

    model_settings = config.read_model_settings(
        'mandatory_tour_frequency.yaml')
    model_spec = simulate.read_model_spec(
        file_name='mandatory_tour_frequency.csv')
    alternatives = simulate.read_model_alts(
        config.config_file_path('mandatory_tour_frequency_alternatives.csv'),
        set_index='alt')

    choosers = persons_merged.to_frame()
    # filter based on results of CDAP
    choosers = choosers[choosers.cdap_activity == 'M']
    logger.info("Running mandatory_tour_frequency with %d persons",
                len(choosers))

    # - if no mandatory tours
    if choosers.shape[0] == 0:
        add_null_results(trace_label, model_settings)
        return

    # - preprocessor
    preprocessor_settings = model_settings.get('preprocessor', None)
    if preprocessor_settings:

        locals_dict = {}

        expressions.assign_columns(df=choosers,
                                   model_settings=preprocessor_settings,
                                   locals_dict=locals_dict,
                                   trace_label=trace_label)

    nest_spec = config.get_logit_model_settings(model_settings)
    constants = config.get_model_constants(model_settings)

    choices = simulate.simple_simulate(
        choosers=choosers,
        spec=model_spec,
        nest_spec=nest_spec,
        locals_d=constants,
        chunk_size=chunk_size,
        trace_label=trace_label,
        trace_choice_name='mandatory_tour_frequency')

    # convert indexes to alternative names
    choices = pd.Series(model_spec.columns[choices.values],
                        index=choices.index).reindex(
                            persons_merged.local.index)

    # - create mandatory tours
    """
    This reprocesses the choice of index of the mandatory tour frequency
    alternatives into an actual dataframe of tours.  Ending format is
    the same as got non_mandatory_tours except trip types are "work" and "school"
    """
    choosers['mandatory_tour_frequency'] = choices
    mandatory_tours = process_mandatory_tours(
        persons=choosers, mandatory_tour_frequency_alts=alternatives)

    tours = pipeline.extend_table("tours", mandatory_tours)
    tracing.register_traceable_table('tours', mandatory_tours)
    pipeline.get_rn_generator().add_channel('tours', mandatory_tours)

    # - annotate persons
    persons = inject.get_table('persons').to_frame()

    # need to reindex as we only handled persons with cdap_activity == 'M'
    persons['mandatory_tour_frequency'] = choices.reindex(
        persons.index).fillna('').astype(str)

    expressions.assign_columns(
        df=persons,
        model_settings=model_settings.get('annotate_persons'),
        trace_label=tracing.extend_trace_label(trace_label,
                                               'annotate_persons'))

    pipeline.replace_table("persons", persons)

    tracing.print_summary('mandatory_tour_frequency',
                          persons.mandatory_tour_frequency,
                          value_counts=True)

    if trace_hh_id:
        tracing.trace_df(mandatory_tours,
                         label="mandatory_tour_frequency.mandatory_tours",
                         warn_if_empty=True)

        tracing.trace_df(persons,
                         label="mandatory_tour_frequency.persons",
                         warn_if_empty=True)
Example #26
def joint_tour_participation(
        tours, persons_merged,
        chunk_size,
        trace_hh_id):
    """
    Predicts whether each eligible person participates in each joint tour.
    """
    trace_label = 'joint_tour_participation'
    model_settings_file_name = 'joint_tour_participation.yaml'
    model_settings = config.read_model_settings(model_settings_file_name)

    tours = tours.to_frame()
    joint_tours = tours[tours.tour_category == 'joint']

    # - if no joint tours
    if joint_tours.shape[0] == 0:
        add_null_results(model_settings, trace_label)
        return

    persons_merged = persons_merged.to_frame()

    # - create joint_tour_participation_candidates table
    candidates = joint_tour_participation_candidates(joint_tours, persons_merged)
    tracing.register_traceable_table('joint_tour_participants', candidates)
    pipeline.get_rn_generator().add_channel('joint_tour_participants', candidates)

    logger.info("Running joint_tours_participation with %d potential participants (candidates)" %
                candidates.shape[0])

    # - preprocessor
    preprocessor_settings = model_settings.get('preprocessor', None)
    if preprocessor_settings:

        locals_dict = {
            'person_time_window_overlap': person_time_window_overlap,
            'persons': persons_merged
        }

        expressions.assign_columns(
            df=candidates,
            model_settings=preprocessor_settings,
            locals_dict=locals_dict,
            trace_label=trace_label)

    # - simple_simulate

    estimator = estimation.manager.begin_estimation('joint_tour_participation')

    model_spec = simulate.read_model_spec(file_name=model_settings['SPEC'])
    coefficients_df = simulate.read_model_coefficients(model_settings)
    model_spec = simulate.eval_coefficients(model_spec, coefficients_df, estimator)

    nest_spec = config.get_logit_model_settings(model_settings)
    constants = config.get_model_constants(model_settings)

    if estimator:
        estimator.write_model_settings(model_settings, model_settings_file_name)
        estimator.write_spec(model_settings)
        estimator.write_coefficients(coefficients_df, model_settings)
        estimator.write_choosers(candidates)

    # add tour-based chunk_id so we can chunk all trips in tour together
    assert 'chunk_id' not in candidates.columns
    unique_household_ids = candidates.household_id.unique()
    household_chunk_ids = pd.Series(range(len(unique_household_ids)), index=unique_household_ids)
    candidates['chunk_id'] = reindex(household_chunk_ids, candidates.household_id)

    choices = simulate.simple_simulate_by_chunk_id(
        choosers=candidates,
        spec=model_spec,
        nest_spec=nest_spec,
        locals_d=constants,
        chunk_size=chunk_size,
        trace_label=trace_label,
        trace_choice_name='participation',
        custom_chooser=participants_chooser,
        estimator=estimator)

    # choice is boolean (participate or not)
    choice_col = model_settings.get('participation_choice', 'participate')
    assert choice_col in model_spec.columns, \
        "couldn't find participation choice column '%s' in spec" % choice_col
    PARTICIPATE_CHOICE = model_spec.columns.get_loc(choice_col)

    participate = (choices == PARTICIPATE_CHOICE)

    if estimator:
        estimator.write_choices(choices)

        # we override the 'participate' boolean series, instead of raw alternative index in 'choices' series
        # its value depends on whether the candidate's 'participant_id' is in the joint_tour_participant index
        survey_participants_df = estimator.get_survey_table('joint_tour_participants')
        participate = pd.Series(choices.index.isin(survey_participants_df.index.values), index=choices.index)

        # but estimation software wants to know the choices value (alternative index)
        choices = participate.replace({True: PARTICIPATE_CHOICE, False: 1-PARTICIPATE_CHOICE})
        # estimator.write_override_choices(participate)  # write choices as boolean participate
        estimator.write_override_choices(choices)  # write choices as int alt indexes

        estimator.end_estimation()

    # satisfaction indexed by tour_id
    tour_satisfaction = get_tour_satisfaction(candidates, participate)

    assert tour_satisfaction.all()

    candidates['satisfied'] = reindex(tour_satisfaction, candidates.tour_id)

    PARTICIPANT_COLS = ['tour_id', 'household_id', 'person_id']
    participants = candidates[participate][PARTICIPANT_COLS].copy()

    # assign participant_num
    # FIXME do we want something smarter than the participant with the lowest person_id?
    participants['participant_num'] = \
        participants.sort_values(by=['tour_id', 'person_id']).\
        groupby('tour_id').cumcount() + 1

    pipeline.replace_table("joint_tour_participants", participants)

    # drop channel as we aren't using it any more (and it has candidates that weren't chosen)
    pipeline.get_rn_generator().drop_channel('joint_tour_participants')

    # - assign joint tour 'point person' (participant_num == 1)
    point_persons = participants[participants.participant_num == 1]
    joint_tours['person_id'] = point_persons.set_index('tour_id').person_id

    # update number_of_participants which was initialized to 1
    joint_tours['number_of_participants'] = participants.groupby('tour_id').size()

    assign_in_place(tours, joint_tours[['person_id', 'number_of_participants']])

    pipeline.replace_table("tours", tours)

    # - run annotations
    annotate_jtp(model_settings, trace_label)

    if trace_hh_id:
        tracing.trace_df(participants,
                         label="joint_tour_participation.participants")

        tracing.trace_df(joint_tours,
                         label="joint_tour_participation.joint_tours")
Example #27
def assign_cdap_rank(persons, trace_hh_id=None, trace_label=None):
    """
    Assign an integer index, cdap_rank, to each household member. (Starting with 1, not 0)

    Modifies persons df in place

    The cdap_rank order is important, because cdap only assigns activities to the first
    MAX_HHSIZE persons in each household.

    This will preferentially be two working adults and the three youngest children.

    Rank is assigned starting at 1. This necessitates some care when indexing, but is preferred as
    it follows the convention of 1-based pnums in expression files.

    According to the documentation of reOrderPersonsForCdap in mtctm2.abm.ctramp
    HouseholdCoordinatedDailyActivityPatternModel:

    "Method reorders the persons in the household for use with the CDAP model,
    which only explicitly models the interaction of five persons in a HH. Priority
    in the reordering is first given to full time workers (up to two), then to
    part time workers (up to two workers, of any type), then to children (youngest
    to oldest, up to three). If the method is called for a household with less
    than 5 people, the cdapPersonArray is the same as the person array."

    We diverge from the above description in that a cdap_rank is assigned to all persons,
    including 'extra' household members, whose activity is assigned subsequently.
    The pair _hh_id_, cdap_rank will uniquely identify each household member.

    Parameters
    ----------
    persons : pandas.DataFrame
        Table of persons data. Must contain columns _hh_size_, _hh_id_, _ptype_, _age_

    Returns
    -------
    cdap_rank : pandas.Series
        integer cdap_rank of every person, indexed on _persons_index_
    """

    # transient categories used to categorize persons in cdap_rank before assigning final rank
    RANK_WORKER = 1
    RANK_CHILD = 2
    RANK_BACKFILL = 3
    RANK_UNASSIGNED = 9
    persons['cdap_rank'] = RANK_UNASSIGNED

    # choose up to 2 workers, preferring full over part, older over younger
    workers = \
        persons.loc[persons[_ptype_].isin(WORKER_PTYPES), [_hh_id_, _ptype_]]\
        .sort_values(by=[_hh_id_, _ptype_], ascending=[True, True])\
        .groupby(_hh_id_).head(2)
    # tag the selected workers
    persons.loc[workers.index, 'cdap_rank'] = RANK_WORKER
    del workers

    # choose up to 3, preferring youngest
    children = \
        persons.loc[persons[_ptype_].isin(CHILD_PTYPES), [_hh_id_, _ptype_, _age_]]\
        .sort_values(by=[_hh_id_, _ptype_], ascending=[True, True])\
        .groupby(_hh_id_).head(3)
    # tag the selected children
    persons.loc[children.index, 'cdap_rank'] = RANK_CHILD
    del children

    # choose up to MAX_HHSIZE, preferring anyone already chosen
    # others = \
    #     persons[[_hh_id_, 'cdap_rank']]\
    #     .sort_values(by=[_hh_id_, 'cdap_rank'], ascending=[True, True])\
    #     .groupby(_hh_id_).head(MAX_HHSIZE)

    # choose up to MAX_HHSIZE, choosing randomly
    others = persons[[_hh_id_, 'cdap_rank']].copy()
    others['random_order'] = pipeline.get_rn_generator().random_for_df(persons)
    others = \
        others\
        .sort_values(by=[_hh_id_, 'random_order'], ascending=[True, True])\
        .groupby(_hh_id_).head(MAX_HHSIZE)

    # tag the backfilled persons
    persons.loc[others[others.cdap_rank == RANK_UNASSIGNED].index, 'cdap_rank'] \
        = RANK_BACKFILL
    del others

    # assign person number in cdapPersonArray preference order
    # i.e. convert cdap_rank from category to index in order of category rank within household
    # groupby rank() is slow, so we compute rank artisanally
    # save time by sorting only the columns we need (persons is big, and sort moves data)
    p = persons[[_hh_id_, 'cdap_rank', _age_]]\
        .sort_values(by=[_hh_id_, 'cdap_rank', _age_], ascending=[True, True, True])
    rank = p.groupby(_hh_id_).size().map(range)
    rank = [item+1 for sublist in rank for item in sublist]
    p['cdap_rank'] = rank
    persons['cdap_rank'] = p['cdap_rank']  # assignment aligns on index values

    # if DUMP:
    #     tracing.trace_df(persons, '%s.DUMP.cdap_person_array' % trace_label,
    #                      transpose=False, slicer='NONE')

    if trace_hh_id:
        tracing.trace_df(persons, '%s.cdap_rank' % trace_label)

    return persons['cdap_rank']
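The "artisanal" rank computation above works because the frame is pre-sorted by (hh_id, cdap_rank, age): per-household group sizes then expand into contiguous 1-based ranks that line up with the sorted row order. A minimal sketch with a hypothetical toy frame:

import pandas as pd

p = pd.DataFrame({'hh_id': [1, 1, 1, 2, 2]}, index=[10, 11, 12, 20, 21])
sizes = p.groupby('hh_id').size()          # hh_id 1 -> 3, hh_id 2 -> 2
p['cdap_rank'] = [i + 1 for n in sizes for i in range(n)]
print(p['cdap_rank'].tolist())             # [1, 2, 3, 1, 2]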
Example #28
def joint_tour_participation(
        tours, persons_merged,
        chunk_size,
        trace_hh_id):
    """
    Predicts whether each eligible person participates in each joint tour.
    """
    trace_label = 'joint_tour_participation'
    model_settings = config.read_model_settings('joint_tour_participation.yaml')
    model_spec = simulate.read_model_spec(file_name='joint_tour_participation.csv')

    tours = tours.to_frame()
    joint_tours = tours[tours.tour_category == 'joint']

    # - if no joint tours
    if joint_tours.shape[0] == 0:
        add_null_results(model_settings, trace_label)
        return

    persons_merged = persons_merged.to_frame()

    # - create joint_tour_participation_candidates table
    candidates = joint_tour_participation_candidates(joint_tours, persons_merged)
    tracing.register_traceable_table('joint_tour_participants', candidates)
    pipeline.get_rn_generator().add_channel('joint_tour_participants', candidates)

    logger.info("Running joint_tours_participation with %d potential participants (candidates)" %
                candidates.shape[0])

    # - preprocessor
    preprocessor_settings = model_settings.get('preprocessor', None)
    if preprocessor_settings:

        locals_dict = {
            'person_time_window_overlap': person_time_window_overlap,
            'persons': persons_merged
        }

        expressions.assign_columns(
            df=candidates,
            model_settings=preprocessor_settings,
            locals_dict=locals_dict,
            trace_label=trace_label)

    # - simple_simulate

    nest_spec = config.get_logit_model_settings(model_settings)
    constants = config.get_model_constants(model_settings)

    choices = simulate.simple_simulate(
        choosers=candidates,
        spec=model_spec,
        nest_spec=nest_spec,
        locals_d=constants,
        chunk_size=chunk_size,
        trace_label=trace_label,
        trace_choice_name='participation',
        custom_chooser=participants_chooser)

    # choice is boolean (participate or not)
    choice_col = model_settings.get('participation_choice', 'participate')
    assert choice_col in model_spec.columns, \
        "couldn't find participation choice column '%s' in spec" % choice_col
    PARTICIPATE_CHOICE = model_spec.columns.get_loc(choice_col)

    participate = (choices == PARTICIPATE_CHOICE)

    # satisfaction indexed by tour_id
    tour_satisfaction = get_tour_satisfaction(candidates, participate)

    assert tour_satisfaction.all()

    candidates['satisfied'] = reindex(tour_satisfaction, candidates.tour_id)

    PARTICIPANT_COLS = ['tour_id', 'household_id', 'person_id']
    participants = candidates[participate][PARTICIPANT_COLS].copy()

    # assign participant_num
    # FIXME do we want something smarter than the participant with the lowest person_id?
    participants['participant_num'] = \
        participants.sort_values(by=['tour_id', 'person_id']).\
        groupby('tour_id').cumcount() + 1

    pipeline.replace_table("joint_tour_participants", participants)

    # drop channel as we aren't using it any more (and it has candidates that weren't chosen)
    pipeline.get_rn_generator().drop_channel('joint_tour_participants')

    # - assign joint tour 'point person' (participant_num == 1)
    point_persons = participants[participants.participant_num == 1]
    joint_tours['person_id'] = point_persons.set_index('tour_id').person_id

    # update number_of_participants which was initialized to 1
    joint_tours['number_of_participants'] = participants.groupby('tour_id').size()

    assign_in_place(tours, joint_tours[['person_id', 'number_of_participants']])

    pipeline.replace_table("tours", tours)

    # - run annotations
    annotate_jtp(model_settings, trace_label)

    if trace_hh_id:
        tracing.trace_df(participants,
                         label="joint_tour_participation.participants")

        tracing.trace_df(joint_tours,
                         label="joint_tour_participation.joint_tours")
Example #29
def choose_MAZ_for_TAZ(taz_sample, MAZ_size_terms, trips, network_los,
                       alt_dest_col_name, trace_label):
    """
    Convert taz_sample table with TAZ zone sample choices to a table with a MAZ zone chosen for each TAZ.
    Choose the MAZ probabilistically (proportionally by size_term) from the set of MAZ zones in the parent TAZ.

    Parameters
    ----------
    taz_sample: dataframe with duplicated index <chooser_id_col> and columns: <alt_dest_col_name>, prob, pick_count
    MAZ_size_terms: dataframe with duplicated index <chooser_id_col> and columns: zone_id, dest_TAZ, size_term

    Returns
    -------
    dataframe with duplicated index <chooser_id_col> and columns: <alt_dest_col_name>, prob, pick_count
    """

    if len(taz_sample) == 0:
        # it can happen that all trips have no viable destinations (and so are dropped from the sample)
        # in which case we can just return the empty taz_sample, since it has the same columns
        return taz_sample.copy()

    # we had to use alt_dest_col_name as specified in model_settings for interaction_sample
    # because expressions reference it to look up size_terms by trip purpose
    DEST_MAZ = alt_dest_col_name
    DEST_TAZ = f"{alt_dest_col_name}_TAZ"

    taz_sample.rename(columns={alt_dest_col_name: DEST_TAZ}, inplace=True)

    trace_hh_id = inject.get_injectable("trace_hh_id", None)
    have_trace_targets = trace_hh_id and tracing.has_trace_targets(taz_sample)
    if have_trace_targets:
        trace_label = tracing.extend_trace_label(trace_label,
                                                 'choose_MAZ_for_TAZ')

        # write taz choices, pick_counts, probs
        trace_targets = tracing.trace_targets(taz_sample)
        tracing.trace_df(taz_sample[trace_targets],
                         label=tracing.extend_trace_label(
                             trace_label, 'taz_sample'),
                         transpose=False)

    # print(f"taz_sample\n{taz_sample}")
    #            alt_dest_TAZ      prob  pick_count
    # trip_id
    # 4343721              12  0.000054           1
    # 4343721              20  0.001864           2

    taz_choices = taz_sample[[DEST_TAZ, 'prob']].reset_index(drop=False)
    taz_choices = taz_choices.reindex(
        taz_choices.index.repeat(taz_sample.pick_count)).reset_index(drop=True)
    taz_choices = taz_choices.rename(columns={'prob': 'TAZ_prob'})

    # print(f"taz_choices\n{taz_choices}")
    #         trip_id  alt_dest_TAZ      prob
    # 0       4343721            12  0.000054
    # 1       4343721            20  0.001864
    # 2       4343721            20  0.001864

    # print(f"MAZ_size_terms\n{MAZ_size_terms.df}")
    #           work  escort  shopping  eatout  othmaint  social  othdiscr   univ
    # alt_dest
    # 2         31.0   9.930     0.042   0.258     0.560   0.520    10.856  0.042
    # 3          0.0   3.277     0.029   0.000     0.029   0.029     7.308  0.029
    # 4          0.0   1.879     0.023   0.000     0.023   0.023     5.796  0.023

    # just to make it clear we are siloing choices by chooser_id
    chooser_id_col = taz_sample.index.name  # should be canonical chooser index name (e.g. 'trip_id')

    # for random_for_df, we need df with de-duplicated chooser canonical index
    chooser_df = pd.DataFrame(
        index=taz_sample.index[~taz_sample.index.duplicated()])
    num_choosers = len(chooser_df)
    assert chooser_df.index.name == chooser_id_col

    # to make choices, <taz_sample_size> rands for each chooser (one rand for each sampled TAZ)
    # taz_sample_size will be model_settings['SAMPLE_SIZE'] samples, except if we are estimating
    taz_sample_size = taz_choices.groupby(
        chooser_id_col)[DEST_TAZ].count().max()

    # taz_choices index values should be contiguous
    assert (taz_choices[chooser_id_col] == np.repeat(chooser_df.index,
                                                     taz_sample_size)).all()

    # we need to choose a MAZ for each DEST_TAZ choice
    # probability of choosing MAZ based on MAZ size_term fraction of TAZ total
    # there will be a different set (and number) of candidate MAZs for each TAZ
    # (preserve index, which will have duplicates as result of join)

    maz_taz = network_los.maz_taz_df[['MAZ', 'TAZ']].rename(columns={
        'TAZ': DEST_TAZ,
        'MAZ': DEST_MAZ
    })
    maz_sizes = pd.merge(taz_choices[[chooser_id_col, DEST_TAZ]].reset_index(),
                         maz_taz,
                         how='left',
                         on=DEST_TAZ).set_index('index')

    purpose = maz_sizes['trip_id'].map(
        trips.purpose)  # size term varies by purpose
    maz_sizes['size_term'] = MAZ_size_terms.get(maz_sizes[DEST_MAZ], purpose)

    # print(f"maz_sizes\n{maz_sizes}")
    #          trip_id  alt_dest_TAZ  alt_dest  size_term
    # index
    # 0        4343721            12      3445      0.019
    # 0        4343721            12     11583      0.017
    # 0        4343721            12     21142      0.020

    if have_trace_targets:
        # write maz_sizes: maz_sizes[index,trip_id,dest_TAZ,zone_id,size_term]
        maz_sizes_trace_targets = tracing.trace_targets(maz_sizes,
                                                        slicer='trip_id')
        trace_maz_sizes = maz_sizes[maz_sizes_trace_targets]
        tracing.trace_df(trace_maz_sizes,
                         label=tracing.extend_trace_label(
                             trace_label, 'maz_sizes'),
                         transpose=False)

    # number of DEST_TAZ candidates per chooser
    maz_counts = maz_sizes.groupby(maz_sizes.index).size().values
    # print(maz_counts)

    # max number of MAZs for any TAZ
    max_maz_count = maz_counts.max()
    # print(f"max_maz_count {max_maz_count}")

    # offsets of the first and last rows of each chooser in sparse interaction_utilities
    last_row_offsets = maz_counts.cumsum()
    first_row_offsets = np.insert(last_row_offsets[:-1], 0, 0)

    # repeat the row offsets once for each dummy utility to insert
    # (we want to insert dummy utilities at the END of the list of alternative utilities)
    # inserts is a list of the indices at which we want to do the insertions
    inserts = np.repeat(last_row_offsets, max_maz_count - maz_counts)

    # insert zero filler to pad each alternative set to same size
    padded_maz_sizes = np.insert(maz_sizes.size_term.values, inserts, 0.0)
    padded_maz_sizes = padded_maz_sizes.reshape(-1, max_maz_count)

    # prob array with one row TAZ_choice, one column per alternative
    row_sums = padded_maz_sizes.sum(axis=1)
    maz_probs = np.divide(padded_maz_sizes, row_sums.reshape(-1, 1))
    assert maz_probs.shape == (num_choosers * taz_sample_size, max_maz_count)

    rands = pipeline.get_rn_generator().random_for_df(
        chooser_df, n=taz_sample_size).reshape(-1, 1)
    assert len(rands) == num_choosers * taz_sample_size
    assert len(rands) == maz_probs.shape[0]

    # make choices
    # positions is array with the chosen alternative represented as a column index in probs
    # which is an integer between zero and max_maz_count
    positions = np.argmax((maz_probs.cumsum(axis=1) - rands) > 0.0, axis=1)

    # shouldn't have chosen any of the dummy pad positions
    assert (positions < maz_counts).all()

    taz_choices[DEST_MAZ] = maz_sizes[DEST_MAZ].take(positions +
                                                     first_row_offsets)
    taz_choices['MAZ_prob'] = maz_probs[np.arange(maz_probs.shape[0]),
                                        positions]
    taz_choices['prob'] = taz_choices['TAZ_prob'] * taz_choices['MAZ_prob']

    if have_trace_targets:

        taz_choices_trace_targets = tracing.trace_targets(taz_choices,
                                                          slicer='trip_id')
        trace_taz_choices_df = taz_choices[taz_choices_trace_targets]
        tracing.trace_df(trace_taz_choices_df,
                         label=tracing.extend_trace_label(
                             trace_label, 'taz_choices'),
                         transpose=False)

        lhs_df = trace_taz_choices_df[['trip_id', DEST_TAZ]]
        alt_dest_columns = [f'dest_maz_{c}' for c in range(max_maz_count)]

        # following the same logic as the full code, but for trace cutout
        trace_maz_counts = maz_counts[taz_choices_trace_targets]
        trace_last_row_offsets = trace_maz_counts.cumsum()
        trace_inserts = np.repeat(trace_last_row_offsets,
                                  max_maz_count - trace_maz_counts)

        # trace dest_maz_alts
        padded_maz_sizes = np.insert(trace_maz_sizes[DEST_MAZ].values,
                                     trace_inserts,
                                     0.0).reshape(-1, max_maz_count)
        df = pd.DataFrame(data=padded_maz_sizes,
                          columns=alt_dest_columns,
                          index=trace_taz_choices_df.index)
        df = pd.concat([lhs_df, df], axis=1)
        tracing.trace_df(df,
                         label=tracing.extend_trace_label(
                             trace_label, 'dest_maz_alts'),
                         transpose=False)

        # trace dest_maz_size_terms
        padded_maz_sizes = np.insert(trace_maz_sizes['size_term'].values,
                                     trace_inserts,
                                     0.0).reshape(-1, max_maz_count)
        df = pd.DataFrame(data=padded_maz_sizes,
                          columns=alt_dest_columns,
                          index=trace_taz_choices_df.index)
        df = pd.concat([lhs_df, df], axis=1)
        tracing.trace_df(df,
                         label=tracing.extend_trace_label(
                             trace_label, 'dest_maz_size_terms'),
                         transpose=False)

        # trace dest_maz_probs
        df = pd.DataFrame(data=maz_probs[taz_choices_trace_targets],
                          columns=alt_dest_columns,
                          index=trace_taz_choices_df.index)
        df = pd.concat([lhs_df, df], axis=1)
        df['rand'] = rands[taz_choices_trace_targets]
        tracing.trace_df(df,
                         label=tracing.extend_trace_label(
                             trace_label, 'dest_maz_probs'),
                         transpose=False)

    taz_choices = taz_choices.drop(columns=['TAZ_prob', 'MAZ_prob'])
    taz_choices = taz_choices.groupby([chooser_id_col, DEST_MAZ
                                       ]).agg(prob=('prob', 'max'),
                                              pick_count=('prob', 'count'))

    taz_choices.reset_index(level=DEST_MAZ, inplace=True)

    return taz_choices
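The zero-padding trick in choose_MAZ_for_TAZ deserves a standalone illustration: ragged per-chooser alternative sets are padded with zero-probability dummies so that a single (n_choosers, max_alts) array can be sampled in one vectorized pass. A sketch with toy size terms (all values are assumptions):

import numpy as np

sizes = np.array([0.5, 0.5, 1.0, 2.0, 2.0])    # 2 alts for chooser 0, 3 for chooser 1
counts = np.array([2, 3])
max_count = counts.max()

last_row_offsets = counts.cumsum()
inserts = np.repeat(last_row_offsets, max_count - counts)  # pad at the end of each set
padded = np.insert(sizes, inserts, 0.0).reshape(-1, max_count)

probs = padded / padded.sum(axis=1, keepdims=True)
rands = np.array([[0.6], [0.9]])               # one rand per chooser
positions = np.argmax(probs.cumsum(axis=1) - rands > 0.0, axis=1)
print(positions)                               # [1 2]; zero-prob dummies are never chosen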
Example #30
def stop_frequency(tours, tours_merged, stop_frequency_alts, skim_dict,
                   chunk_size, trace_hh_id):
    """
    stop frequency model

    For each tour, choose a number of intermediate inbound stops and outbound stops.
    Create a trip table with inbound and outbound trips.

    Thus, a tour with stop_frequency '2out_0in' will have two outbound and zero inbound stops,
    and four corresponding trips: three outbound, and one inbound.

    Adds a stop_frequency str column to the tours table, and creates a trips
    table with columns:

    ::

        - person_id
        - household_id
        - tour_id
        - primary_purpose
        - atwork
        - trip_num
        - outbound
        - trip_count

    """

    trace_label = 'stop_frequency'
    model_settings = config.read_model_settings('stop_frequency.yaml')

    tours = tours.to_frame()
    tours_merged = tours_merged.to_frame()

    assert not tours_merged.household_id.isnull().any()

    assert not (tours_merged.origin == -1).any()
    assert not (tours_merged.destination == -1).any()

    nest_spec = config.get_logit_model_settings(model_settings)
    constants = config.get_model_constants(model_settings)

    # - run preprocessor to annotate tours_merged
    preprocessor_settings = model_settings.get('preprocessor', None)
    if preprocessor_settings:

        # hack: preprocessor adds origin column in place if it does not exist already
        od_skim_stack_wrapper = skim_dict.wrap('origin', 'destination')
        skims = [od_skim_stack_wrapper]

        locals_dict = {"od_skims": od_skim_stack_wrapper}
        if constants is not None:
            locals_dict.update(constants)

        simulate.set_skim_wrapper_targets(tours_merged, skims)

        # this should be pre-slice as some expressions may count tours by type
        annotations = expressions.compute_columns(
            df=tours_merged,
            model_settings=preprocessor_settings,
            locals_dict=locals_dict,
            trace_label=trace_label)

        assign_in_place(tours_merged, annotations)

    tracing.print_summary('stop_frequency segments',
                          tours_merged.primary_purpose,
                          value_counts=True)

    choices_list = []
    for segment_type, choosers in tours_merged.groupby('primary_purpose'):

        logging.info("%s running segment %s with %s chooser rows" %
                     (trace_label, segment_type, choosers.shape[0]))

        spec = simulate.read_model_spec(file_name='stop_frequency_%s.csv' %
                                        segment_type)

        assert spec is not None, "spec for segment_type %s not found" % segment_type

        choices = simulate.simple_simulate(
            choosers=choosers,
            spec=spec,
            nest_spec=nest_spec,
            locals_d=constants,
            chunk_size=chunk_size,
            trace_label=tracing.extend_trace_label(trace_label, segment_type),
            trace_choice_name='stops')

        # convert indexes to alternative names
        choices = pd.Series(spec.columns[choices.values], index=choices.index)

        choices_list.append(choices)

    choices = pd.concat(choices_list)

    tracing.print_summary('stop_frequency', choices, value_counts=True)

    # add stop_frequency choices to tours table
    assign_in_place(tours, choices.to_frame('stop_frequency'))

    if 'primary_purpose' not in tours.columns:
        assign_in_place(tours, tours_merged[['primary_purpose']])

    pipeline.replace_table("tours", tours)

    # create trips table
    trips = process_trips(tours, stop_frequency_alts)
    trips = pipeline.extend_table("trips", trips)
    tracing.register_traceable_table('trips', trips)
    pipeline.get_rn_generator().add_channel('trips', trips)

    if trace_hh_id:
        tracing.trace_df(tours,
                         label="stop_frequency.tours",
                         slicer='person_id',
                         columns=None)

        tracing.trace_df(trips,
                         label="stop_frequency.trips",
                         slicer='person_id',
                         columns=None)

        # annotations only exist if the preprocessor ran
        if preprocessor_settings:
            tracing.trace_df(annotations,
                             label="stop_frequency.annotations",
                             columns=None)

        tracing.trace_df(tours_merged,
                         label="stop_frequency.tours_merged",
                         slicer='person_id',
                         columns=None)
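
# --- Illustration (not part of the original example) ---
# A minimal, self-contained sketch of how a stop_frequency alternative such
# as '2out_0in' expands into trip rows, as the docstring above describes:
# n intermediate stops in a direction yield n + 1 trips in that direction.
# The helper names (parse_stop_frequency, make_trips) are hypothetical, not
# ActivitySim's actual process_trips implementation.
import re

import pandas as pd


def parse_stop_frequency(alt):
    # '2out_0in' -> (2, 0)
    out_stops, in_stops = re.match(r'(\d+)out_(\d+)in', alt).groups()
    return int(out_stops), int(in_stops)


def make_trips(tour_id, alt):
    out_stops, in_stops = parse_stop_frequency(alt)
    rows = []
    for outbound, n_stops in ((True, out_stops), (False, in_stops)):
        trip_count = n_stops + 1  # n intermediate stops -> n + 1 trips
        for trip_num in range(1, trip_count + 1):
            rows.append(dict(tour_id=tour_id, outbound=outbound,
                             trip_num=trip_num, trip_count=trip_count))
    return pd.DataFrame(rows)


# '2out_0in' -> three outbound trips plus one inbound trip = four trips
print(make_trips(tour_id=1, alt='2out_0in'))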
Beispiel #32
0
def assign_cdap_rank(persons, trace_hh_id=None, trace_label=None):
    """
    Assign an integer index, cdap_rank, to each household member. (Starting with 1, not 0)

    Modifies persons df in place

    The cdap_rank order is important, because cdap only assigns activities to the first
    MAX_HHSIZE persons in each household.

    This will preferentially be two working adults and the three youngest children.

    Rank is assigned starting at 1. This necessitates some care when indexing, but is preferred as
    it follows the convention of 1-based pnums in expression files.

    According to the documentation of reOrderPersonsForCdap in mtctm2.abm.ctramp
    HouseholdCoordinatedDailyActivityPatternModel:

    "Method reorders the persons in the household for use with the CDAP model,
    which only explicitly models the interaction of five persons in a HH. Priority
    in the reordering is first given to full time workers (up to two), then to
    part time workers (up to two workers, of any type), then to children (youngest
    to oldest, up to three). If the method is called for a household with less
    than 5 people, the cdapPersonArray is the same as the person array."

    We diverge from the above description in that a cdap_rank is assigned to all persons,
    including 'extra' household members, whose activity is assigned subsequently.
    The pair _hh_id_, cdap_rank will uniquely identify each household member.

    Parameters
    ----------
    persons : pandas.DataFrame
        Table of persons data. Must contain columns _hh_size_, _hh_id_, _ptype_, _age_

    Returns
    -------
    cdap_rank : pandas.Series
        integer cdap_rank of every person, indexed on _persons_index_
    """

    # transient categories used to categorize persons in cdap_rank before assigning final rank
    RANK_WORKER = 1
    RANK_CHILD = 2
    RANK_BACKFILL = 3
    RANK_UNASSIGNED = 9
    persons['cdap_rank'] = RANK_UNASSIGNED

    # choose up to 2 workers, preferring full-time over part-time (lower ptype sorts first)
    workers = \
        persons.loc[persons[_ptype_].isin(WORKER_PTYPES), [_hh_id_, _ptype_]]\
        .sort_values(by=[_hh_id_, _ptype_], ascending=[True, True])\
        .groupby(_hh_id_).head(2)
    # tag the selected workers
    persons.loc[workers.index, 'cdap_rank'] = RANK_WORKER
    del workers

    # choose up to 3 children, preferring youngest
    children = \
        persons.loc[persons[_ptype_].isin(CHILD_PTYPES), [_hh_id_, _ptype_, _age_]]\
        .sort_values(by=[_hh_id_, _age_], ascending=[True, True])\
        .groupby(_hh_id_).head(3)
    # tag the selected children
    persons.loc[children.index, 'cdap_rank'] = RANK_CHILD
    del children

    # choose up to MAX_HHSIZE, preferring anyone already chosen
    # others = \
    #     persons[[_hh_id_, 'cdap_rank']]\
    #     .sort_values(by=[_hh_id_, 'cdap_rank'], ascending=[True, True])\
    #     .groupby(_hh_id_).head(MAX_HHSIZE)

    # choose up to MAX_HHSIZE, choosing randomly
    others = persons[[_hh_id_, 'cdap_rank']].copy()
    others['random_order'] = pipeline.get_rn_generator().random_for_df(persons)
    others = \
        others\
        .sort_values(by=[_hh_id_, 'random_order'], ascending=[True, True])\
        .groupby(_hh_id_).head(MAX_HHSIZE)

    # tag the backfilled persons
    persons.loc[others[others.cdap_rank == RANK_UNASSIGNED].index, 'cdap_rank'] \
        = RANK_BACKFILL
    del others

    # assign person number in cdapPersonArray preference order
    # i.e. convert cdap_rank from category to index in order of category rank within household
    # groupby rank() is slow, so we compute rank artisanally
    # save time by sorting only the columns we need (persons is big, and sort moves data)
    p = persons[[_hh_id_, 'cdap_rank', _age_]]\
        .sort_values(by=[_hh_id_, 'cdap_rank', _age_], ascending=[True, True, True])
    rank = p.groupby(_hh_id_).size().map(range)
    rank = [item+1 for sublist in rank for item in sublist]
    p['cdap_rank'] = rank
    persons['cdap_rank'] = p['cdap_rank']  # assignment aligns on index values

    # if DUMP:
    #     tracing.trace_df(persons, '%s.DUMP.cdap_person_array' % trace_label,
    #                      transpose=False, slicer='NONE')

    if trace_hh_id:
        tracing.trace_df(persons, '%s.cdap_rank' % trace_label)

    return persons['cdap_rank']
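
# --- Illustration (not part of the original example) ---
# A toy demonstration of the 'artisanal' ranking used above: rather than the
# slow groupby().rank(), sort by (household, category, age) and then number
# rows 1..n within each household from the group sizes. Column names here
# are assumptions standing in for _hh_id_, 'cdap_rank', and _age_.
import pandas as pd

persons = pd.DataFrame(
    {'hh_id': [1, 1, 1, 2, 2],
     'cdap_rank': [2, 1, 1, 9, 1],   # transient category, not final rank
     'age': [8, 40, 35, 20, 50]},
    index=[10, 11, 12, 13, 14])      # person ids

p = persons.sort_values(by=['hh_id', 'cdap_rank', 'age'])
sizes = p.groupby('hh_id').size()            # rows per household, in order
rank = [i + 1 for size in sizes for i in range(size)]
p['cdap_rank'] = rank

# assignment aligns on the person index, so original row order is restored
persons['cdap_rank'] = p['cdap_rank']
print(persons)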
Beispiel #33
0
def non_mandatory_tour_frequency(persons, persons_merged,
                                 chunk_size,
                                 trace_hh_id):
    """
    This model predicts the frequency of making non-mandatory tours
    (alternatives for this model come from a separate csv file which is
    configured by the user) - these tours include escort, shopping, othmaint,
    othdiscr, eatout, and social tours in various combinations.
    """

    trace_label = 'non_mandatory_tour_frequency'
    model_settings = config.read_model_settings('non_mandatory_tour_frequency.yaml')
    model_spec = simulate.read_model_spec(file_name='non_mandatory_tour_frequency.csv')

    alternatives = simulate.read_model_alts(
        config.config_file_path('non_mandatory_tour_frequency_alternatives.csv'),
        set_index=None)

    choosers = persons_merged.to_frame()

    # FIXME kind of tacky both that we know to add this here and del it below
    # 'tot_tours' is used in model_spec expressions
    alternatives['tot_tours'] = alternatives.sum(axis=1)

    # - preprocessor
    preprocessor_settings = model_settings.get('preprocessor', None)
    if preprocessor_settings:

        locals_dict = {
            'person_max_window': person_max_window
        }

        expressions.assign_columns(
            df=choosers,
            model_settings=preprocessor_settings,
            locals_dict=locals_dict,
            trace_label=trace_label)

    # filter based on results of CDAP
    choosers = choosers[choosers.cdap_activity.isin(['M', 'N'])]

    logger.info("Running non_mandatory_tour_frequency with %d persons", len(choosers))

    constants = config.get_model_constants(model_settings)

    choices_list = []
    # segment by person type and pick the right spec for each person type
    for ptype, segment in choosers.groupby('ptype'):

        name = PTYPE_NAME[ptype]

        # pick the spec column for the segment
        spec = model_spec[[name]]

        # drop any zero-valued rows
        spec = spec[spec[name] != 0]

        logger.info("Running segment '%s' of size %d", name, len(segment))

        choices = interaction_simulate(
            segment,
            alternatives,
            spec=spec,
            locals_d=constants,
            chunk_size=chunk_size,
            trace_label='non_mandatory_tour_frequency.%s' % name,
            trace_choice_name='non_mandatory_tour_frequency')

        choices_list.append(choices)

        # FIXME - force garbage collection?
        # force_garbage_collect()

    choices = pd.concat(choices_list)

    del alternatives['tot_tours']  # del tot_tours column we added above

    # - add non_mandatory_tour_frequency column to persons
    persons = persons.to_frame()
    # need to reindex as we only handled persons with cdap_activity in ['M', 'N']
    # (we expect there to be an alt with no tours - which we can use to backfill non-travelers)
    no_tours_alt = alternatives[alternatives.sum(axis=1) == 0].index[0]
    persons['non_mandatory_tour_frequency'] = \
        choices.reindex(persons.index).fillna(no_tours_alt).astype(np.int8)

    """
    We have now generated non-mandatory tours, but they are attributes of the person table
    Now we create a "tours" table which has one row per tour that has been generated
    (and the person id it is associated with)
    """

    # - get counts of each of the alternatives (so we can extend)
    # (choices is just the index values for the chosen alts)
    """
               escort  shopping  othmaint  othdiscr    eatout    social
    parent_id
    2588676         2         0         0         1         1         0
    2588677         0         1         0         1         0         0
    """
    tour_counts = alternatives.loc[choices]
    tour_counts.index = choices.index  # assign person ids to the index

    prev_tour_count = tour_counts.sum().sum()

    # - extend_tour_counts
    tour_counts = extend_tour_counts(choosers, tour_counts, alternatives,
                                     trace_hh_id,
                                     tracing.extend_trace_label(trace_label, 'extend_tour_counts'))

    extended_tour_count = tour_counts.sum().sum()

    logging.info("extend_tour_counts increased nmtf tour count by %s from %s to %s" %
                 (extended_tour_count - prev_tour_count, prev_tour_count, extended_tour_count))

    # - create the non_mandatory tours
    non_mandatory_tours = process_non_mandatory_tours(persons, tour_counts)
    assert len(non_mandatory_tours) == extended_tour_count

    pipeline.extend_table("tours", non_mandatory_tours)

    tracing.register_traceable_table('tours', non_mandatory_tours)
    pipeline.get_rn_generator().add_channel('tours', non_mandatory_tours)

    expressions.assign_columns(
        df=persons,
        model_settings=model_settings.get('annotate_persons'),
        trace_label=trace_label)

    pipeline.replace_table("persons", persons)

    tracing.print_summary('non_mandatory_tour_frequency',
                          persons.non_mandatory_tour_frequency, value_counts=True)

    if trace_hh_id:
        tracing.trace_df(non_mandatory_tours,
                         label="non_mandatory_tour_frequency.non_mandatory_tours",
                         warn_if_empty=True)

        tracing.trace_df(choosers,
                         label="non_mandatory_tour_frequency.choosers",
                         warn_if_empty=True)

        tracing.trace_df(persons,
                         label="non_mandatory_tour_frequency.annotated_persons",
                         warn_if_empty=True)
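
# --- Illustration (not part of the original example) ---
# A hedged sketch of the backfill step above: persons who were not choosers
# (cdap_activity not in ['M', 'N']) get the index of the zero-tour
# alternative via reindex + fillna. Toy data; the column names are
# assumptions.
import numpy as np
import pandas as pd

alternatives = pd.DataFrame({'escort': [0, 2, 0],
                             'shopping': [0, 0, 1]})   # alt 0 has no tours

# chosen alternative index for the two persons who were modeled
choices = pd.Series([1, 2], index=[101, 103])
persons_index = pd.Index([100, 101, 102, 103], name='person_id')

# select the alternative whose row sums to zero tours
no_tours_alt = alternatives[alternatives.sum(axis=1) == 0].index[0]

nmtf = choices.reindex(persons_index).fillna(no_tours_alt).astype(np.int8)
print(nmtf)  # persons 100 and 102 are backfilled with alt 0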
Beispiel #34
0
def non_mandatory_tour_frequency(persons, persons_merged, chunk_size,
                                 trace_hh_id):
    """
    This model predicts the frequency of making non-mandatory tours
    (alternatives for this model come from a separate csv file which is
    configured by the user) - these tours include escort, shopping, othmaint,
    othdiscr, eatout, and social tours in various combinations.
    """

    trace_label = 'non_mandatory_tour_frequency'
    model_settings_file_name = 'non_mandatory_tour_frequency.yaml'

    model_settings = config.read_model_settings(model_settings_file_name)

    # FIXME kind of tacky both that we know to add this here and del it below
    # 'tot_tours' is used in model_spec expressions
    alternatives = simulate.read_model_alts(
        'non_mandatory_tour_frequency_alternatives.csv', set_index=None)
    alternatives['tot_tours'] = alternatives.sum(axis=1)

    # filter based on results of CDAP
    choosers = persons_merged.to_frame()
    choosers = choosers[choosers.cdap_activity.isin(['M', 'N'])]

    # - preprocessor
    preprocessor_settings = model_settings.get('preprocessor', None)
    if preprocessor_settings:

        locals_dict = {'person_max_window': person_max_window}

        expressions.assign_columns(df=choosers,
                                   model_settings=preprocessor_settings,
                                   locals_dict=locals_dict,
                                   trace_label=trace_label)

    logger.info("Running non_mandatory_tour_frequency with %d persons",
                len(choosers))

    constants = config.get_model_constants(model_settings)

    model_spec = simulate.read_model_spec(file_name=model_settings['SPEC'])
    spec_segments = model_settings.get('SPEC_SEGMENTS', {})

    # segment by person type and pick the right spec for each person type
    choices_list = []
    for segment_settings in spec_segments:

        segment_name = segment_settings['NAME']
        ptype = segment_settings['PTYPE']

        # pick the spec column for the segment
        segment_spec = model_spec[[segment_name]]

        chooser_segment = choosers[choosers.ptype == ptype]

        logger.info("Running segment '%s' of size %d", segment_name,
                    len(chooser_segment))

        if len(chooser_segment) == 0:
            # skip empty segments
            continue

        estimator = \
            estimation.manager.begin_estimation(model_name=segment_name, bundle_name='non_mandatory_tour_frequency')

        coefficients_df = simulate.read_model_coefficients(segment_settings)
        segment_spec = simulate.eval_coefficients(segment_spec,
                                                  coefficients_df, estimator)

        if estimator:
            estimator.write_spec(model_settings, bundle_directory=True)
            estimator.write_model_settings(model_settings,
                                           model_settings_file_name,
                                           bundle_directory=True)
            # preserving coefficients file name makes bringing back updated coefficients more straightforward
            estimator.write_coefficients(coefficients_df, segment_settings)
            estimator.write_choosers(chooser_segment)
            estimator.write_alternatives(alternatives, bundle_directory=True)

            # FIXME #interaction_simulate_estimation_requires_chooser_id_in_df_column
            #  should we do it here or have interaction_simulate do it?
            # chooser index must be duplicated in column or it will be omitted from interaction_dataset
            # estimation requires that chooser_id is either in index or a column of interaction_dataset
            # so it can be reformatted (melted) and indexed by chooser_id and alt_id
            assert chooser_segment.index.name == 'person_id'
            assert 'person_id' not in chooser_segment.columns
            chooser_segment['person_id'] = chooser_segment.index

            # FIXME set_alt_id - do we need this for interaction_simulate estimation bundle tables?
            estimator.set_alt_id('alt_id')

            estimator.set_chooser_id(chooser_segment.index.name)

        choices = interaction_simulate(
            chooser_segment,
            alternatives,
            spec=segment_spec,
            locals_d=constants,
            chunk_size=chunk_size,
            trace_label='non_mandatory_tour_frequency.%s' % segment_name,
            trace_choice_name='non_mandatory_tour_frequency',
            estimator=estimator)

        if estimator:
            estimator.write_choices(choices)
            choices = estimator.get_survey_values(
                choices, 'persons', 'non_mandatory_tour_frequency')
            estimator.write_override_choices(choices)
            estimator.end_estimation()

        choices_list.append(choices)

        # FIXME - force garbage collection?
        force_garbage_collect()

    del alternatives['tot_tours']  # del tot_tours column we added above

    # The choice value 'non_mandatory_tour_frequency' assigned by interaction_simulate
    # is the index value of the chosen alternative in the alternatives table.
    choices = pd.concat(choices_list).sort_index()

    # add non_mandatory_tour_frequency column to persons
    persons = persons.to_frame()
    # we expect there to be an alt with no tours - which we can use to backfill non-travelers
    no_tours_alt = alternatives[alternatives.sum(axis=1) == 0].index[0]
    # need to reindex as we only handled persons with cdap_activity in ['M', 'N']
    persons['non_mandatory_tour_frequency'] = \
        choices.reindex(persons.index).fillna(no_tours_alt).astype(np.int8)
    """
    We have now generated non-mandatory tour frequencies, but they are attributes of the person table
    Now we create a "tours" table which has one row per tour that has been generated
    (and the person id it is associated with)

    But before we do that, we run an additional probabilistic step to extend/increase tour counts
    beyond the strict limits of the tour_frequency alternatives chosen above (which are currently limited
    to at most 2 escort tours and 1 each of shopping, othmaint, othdiscr, eatout, and social tours)

    The choice value 'non_mandatory_tour_frequency' assigned by interaction_simulate is simply the
    index value of the chosen alternative in the alternatives table.

    get counts of each of the tour type alternatives (so we can extend)
               escort  shopping  othmaint  othdiscr    eatout    social
    parent_id
    2588676         2         0         0         1         1         0
    2588677         0         1         0         1         0         0
    """

    # counts of each of the tour type alternatives (so we can extend)
    modeled_tour_counts = alternatives.loc[choices]
    modeled_tour_counts.index = choices.index  # assign person ids to the index

    # - extend_tour_counts - probabilistic
    extended_tour_counts = \
        extend_tour_counts(choosers, modeled_tour_counts.copy(), alternatives,
                           trace_hh_id, tracing.extend_trace_label(trace_label, 'extend_tour_counts'))

    num_modeled_tours = modeled_tour_counts.sum().sum()
    num_extended_tours = extended_tour_counts.sum().sum()
    logger.info("extend_tour_counts increased tour count by %s from %s to %s" %
                (num_extended_tours - num_modeled_tours, num_modeled_tours,
                 num_extended_tours))
    """
    create the non_mandatory tours based on extended_tour_counts
    """
    if estimator:
        override_tour_counts = \
            estimation.manager.get_survey_values(extended_tour_counts,
                                                 table_name='persons',
                                                 column_names=['_%s' % c for c in extended_tour_counts.columns])
        override_tour_counts = \
            override_tour_counts.rename(columns={('_%s' % c): c for c in extended_tour_counts.columns})
        logger.info(
            "estimation get_survey_values override_tour_counts %s changed cells"
            % (override_tour_counts != extended_tour_counts).sum().sum())
        extended_tour_counts = override_tour_counts
    """
    create the non_mandatory tours based on extended_tour_counts
    """
    non_mandatory_tours = process_non_mandatory_tours(persons,
                                                      extended_tour_counts)
    assert len(non_mandatory_tours) == extended_tour_counts.sum().sum()

    if estimator:

        # make sure they created the right tours
        survey_tours = estimation.manager.get_survey_table(
            'tours').sort_index()
        non_mandatory_survey_tours = survey_tours[survey_tours.tour_category ==
                                                  'non_mandatory']
        assert len(non_mandatory_survey_tours) == len(non_mandatory_tours)
        assert non_mandatory_survey_tours.index.equals(
            non_mandatory_tours.sort_index().index)

        # make sure they created tours with the expected tour_ids
        columns = ['person_id', 'household_id', 'tour_type', 'tour_category']
        survey_tours = \
            estimation.manager.get_survey_values(non_mandatory_tours,
                                                 table_name='tours',
                                                 column_names=columns)

        tours_differ = (non_mandatory_tours[columns] !=
                        survey_tours[columns]).any(axis=1)

        if tours_differ.any():
            print("tours_differ\n%s" % tours_differ)
            print("%s of %s tours differ" %
                  (tours_differ.sum(), len(tours_differ)))
            print("differing survey_tours\n%s" % survey_tours[tours_differ])
            print("differing modeled_tours\n%s" %
                  non_mandatory_tours[columns][tours_differ])

        assert (not tours_differ.any())

    pipeline.extend_table("tours", non_mandatory_tours)

    tracing.register_traceable_table('tours', non_mandatory_tours)
    pipeline.get_rn_generator().add_channel('tours', non_mandatory_tours)

    expressions.assign_columns(
        df=persons,
        model_settings=model_settings.get('annotate_persons'),
        trace_label=trace_label)

    pipeline.replace_table("persons", persons)

    tracing.print_summary('non_mandatory_tour_frequency',
                          persons.non_mandatory_tour_frequency,
                          value_counts=True)

    if trace_hh_id:
        tracing.trace_df(
            non_mandatory_tours,
            label="non_mandatory_tour_frequency.non_mandatory_tours",
            warn_if_empty=True)

        tracing.trace_df(choosers,
                         label="non_mandatory_tour_frequency.choosers",
                         warn_if_empty=True)

        tracing.trace_df(
            persons,
            label="non_mandatory_tour_frequency.annotated_persons",
            warn_if_empty=True)
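
# --- Illustration (not part of the original example) ---
# A small sketch of how the chosen alternative index decodes into per-person
# tour counts, as the docstring table above shows: alternatives.loc[choices]
# replicates the chosen alternative row once per chooser, and re-assigning
# the index attaches person ids. Toy data; column names are assumptions.
import pandas as pd

alternatives = pd.DataFrame({'escort': [0, 2, 0],
                             'shopping': [0, 0, 1],
                             'eatout': [0, 1, 0]})
choices = pd.Series([1, 2, 1], index=[2588676, 2588677, 2588678])

tour_counts = alternatives.loc[choices]
tour_counts.index = choices.index  # assign person ids to the index
print(tour_counts)
print("total tours:", tour_counts.sum().sum())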
Beispiel #35
0
def choose_MAZ_for_TAZ(taz_sample, MAZ_size_terms, trace_label):
    """
    Convert taz_sample table with TAZ zone sample choices to a table with a MAZ zone chosen for each TAZ.
    The MAZ is chosen probabilistically (proportionally by size_term) from the set of MAZ zones in the parent TAZ.

    Parameters
    ----------
    taz_sample: dataframe with duplicated index <chooser_id_col> and columns: <DEST_TAZ>, prob, pick_count
    MAZ_size_terms: dataframe with columns: zone_id, dest_TAZ, size_term

    Returns
    -------
    dataframe with duplicated index <chooser_id_col> and columns: <DEST_MAZ>, prob, pick_count
    """

    # print(f"taz_sample\n{taz_sample}")
    #           dest_TAZ      prob  pick_count  person_id
    # tour_id
    # 542963          18  0.004778           1      13243
    # 542963          53  0.004224           2      13243
    # 542963          59  0.008628           1      13243

    trace_hh_id = inject.get_injectable("trace_hh_id", None)
    have_trace_targets = trace_hh_id and tracing.has_trace_targets(taz_sample)
    if have_trace_targets:
        trace_label = tracing.extend_trace_label(trace_label,
                                                 'choose_MAZ_for_TAZ')

        CHOOSER_ID = taz_sample.index.name  # tour_id for tours, but person_id for location choice
        assert CHOOSER_ID is not None

        # write taz choices, pick_counts, probs
        trace_targets = tracing.trace_targets(taz_sample)
        tracing.trace_df(taz_sample[trace_targets],
                         label=tracing.extend_trace_label(
                             trace_label, 'taz_sample'),
                         transpose=False)

    # redupe taz_sample[[DEST_TAZ, 'prob']] using pick_count to repeat rows
    taz_choices = taz_sample[[DEST_TAZ, 'prob']].reset_index(drop=False)
    taz_choices = taz_choices.reindex(
        taz_choices.index.repeat(taz_sample.pick_count)).reset_index(drop=True)
    taz_choices = taz_choices.rename(columns={'prob': 'TAZ_prob'})

    # print(f"taz_choices\n{taz_choices}")
    #        tour_id  dest_TAZ  TAZ_prob
    # 0       542963        18  0.004778
    # 1       542963        53  0.004224
    # 2       542963        53  0.004224
    # 3       542963        59  0.008628

    # print(f"MAZ_size_terms\n{MAZ_size_terms}")
    #       zone_id  dest_TAZ  size_term
    # 0        6097         2      7.420
    # 1       16421         2      9.646
    # 2       24251         2     10.904

    # just to make it clear we are siloing choices by chooser_id
    chooser_id_col = taz_sample.index.name  # should be canonical chooser index name (e.g. 'person_id')

    # for random_for_df, we need df with de-duplicated chooser canonical index
    chooser_df = pd.DataFrame(
        index=taz_sample.index[~taz_sample.index.duplicated()])
    num_choosers = len(chooser_df)
    assert chooser_df.index.name == chooser_id_col

    # to make choices, <taz_sample_size> rands for each chooser (one rand for each sampled TAZ)
    # taz_sample_size will be model_settings['SAMPLE_SIZE'] samples, except if we are estimating
    taz_sample_size = taz_choices.groupby(
        chooser_id_col)[DEST_TAZ].count().max()

    # taz_choices index values should be contiguous
    assert (taz_choices[chooser_id_col] == np.repeat(chooser_df.index,
                                                     taz_sample_size)).all()

    # we need to choose a MAZ for each DEST_TAZ choice
    # probability of choosing MAZ based on MAZ size_term fraction of TAZ total
    # there will be a different set (and number) of candidate MAZs for each TAZ
    # (preserve index, which will have duplicates as result of join)
    # maz_sizes.index is the integer offset into taz_choices of the taz for which the maz_size row is a candidate)
    maz_sizes = pd.merge(taz_choices[[chooser_id_col, DEST_TAZ]].reset_index(),
                         MAZ_size_terms,
                         how='left',
                         on=DEST_TAZ).set_index('index')

    #         tour_id  dest_TAZ  zone_id  size_term
    # index
    # 0        542963        18      498     12.130
    # 0        542963        18     7696     18.550
    # 0        542963        18    15431      8.678
    # 0        542963        18    21429     29.938
    # 1        542963        53    17563     34.252

    if have_trace_targets:
        # write maz_sizes: maz_sizes[index,tour_id,dest_TAZ,zone_id,size_term]

        maz_sizes_trace_targets = tracing.trace_targets(maz_sizes,
                                                        slicer=CHOOSER_ID)
        trace_maz_sizes = maz_sizes[maz_sizes_trace_targets]
        tracing.trace_df(trace_maz_sizes,
                         label=tracing.extend_trace_label(
                             trace_label, 'maz_sizes'),
                         transpose=False)

    # number of DEST_TAZ candidates per chooser
    maz_counts = maz_sizes.groupby(maz_sizes.index).size().values

    # max number of MAZs for any TAZ
    max_maz_count = maz_counts.max()

    # offsets of the first and last rows of each chooser in sparse interaction_utilities
    last_row_offsets = maz_counts.cumsum()
    first_row_offsets = np.insert(last_row_offsets[:-1], 0, 0)

    # repeat the row offsets once for each dummy utility to insert
    # (we want to insert dummy utilities at the END of the list of alternative utilities)
    # inserts is a list of the indices at which we want to do the insertions
    inserts = np.repeat(last_row_offsets, max_maz_count - maz_counts)

    # insert zero filler to pad each alternative set to same size
    padded_maz_sizes = np.insert(maz_sizes.size_term.values, inserts,
                                 0.0).reshape(-1, max_maz_count)

    # prob array with one row TAZ_choice, one column per alternative
    row_sums = padded_maz_sizes.sum(axis=1)
    maz_probs = np.divide(padded_maz_sizes, row_sums.reshape(-1, 1))
    assert maz_probs.shape == (num_choosers * taz_sample_size, max_maz_count)

    rands = pipeline.get_rn_generator().random_for_df(chooser_df,
                                                      n=taz_sample_size)
    rands = rands.reshape(-1, 1)
    assert len(rands) == num_choosers * taz_sample_size
    assert len(rands) == maz_probs.shape[0]

    # make choices
    # positions is array with the chosen alternative represented as a column index in probs
    # which is an integer between zero and max_maz_count
    positions = np.argmax((maz_probs.cumsum(axis=1) - rands) > 0.0, axis=1)

    # shouldn't have chosen any of the dummy pad positions
    assert (positions < maz_counts).all()

    taz_choices[DEST_MAZ] = maz_sizes['zone_id'].take(positions +
                                                      first_row_offsets)
    taz_choices['MAZ_prob'] = maz_probs[np.arange(maz_probs.shape[0]),
                                        positions]
    taz_choices['prob'] = taz_choices['TAZ_prob'] * taz_choices['MAZ_prob']

    if have_trace_targets:

        taz_choices_trace_targets = tracing.trace_targets(taz_choices,
                                                          slicer=CHOOSER_ID)
        trace_taz_choices_df = taz_choices[taz_choices_trace_targets]
        tracing.trace_df(trace_taz_choices_df,
                         label=tracing.extend_trace_label(
                             trace_label, 'taz_choices'),
                         transpose=False)

        lhs_df = trace_taz_choices_df[[CHOOSER_ID, DEST_TAZ]]
        alt_dest_columns = [f'dest_maz_{c}' for c in range(max_maz_count)]

        # following the same logic as the full code, but for trace cutout
        trace_maz_counts = maz_counts[taz_choices_trace_targets]
        trace_last_row_offsets = trace_maz_counts.cumsum()
        trace_inserts = np.repeat(trace_last_row_offsets,
                                  max_maz_count - trace_maz_counts)

        # trace dest_maz_alts (candidate zone_ids, padded to a rectangle)
        padded_maz_sizes = np.insert(trace_maz_sizes['zone_id'].values,
                                     trace_inserts,
                                     0.0).reshape(-1, max_maz_count)
        df = pd.DataFrame(data=padded_maz_sizes,
                          columns=alt_dest_columns,
                          index=trace_taz_choices_df.index)
        df = pd.concat([lhs_df, df], axis=1)
        tracing.trace_df(df,
                         label=tracing.extend_trace_label(
                             trace_label, 'dest_maz_alts'),
                         transpose=False)

        # trace dest_maz_size_terms
        padded_maz_sizes = np.insert(trace_maz_sizes['size_term'].values,
                                     trace_inserts,
                                     0.0).reshape(-1, max_maz_count)
        df = pd.DataFrame(data=padded_maz_sizes,
                          columns=alt_dest_columns,
                          index=trace_taz_choices_df.index)
        df = pd.concat([lhs_df, df], axis=1)
        tracing.trace_df(df,
                         label=tracing.extend_trace_label(
                             trace_label, 'dest_maz_size_terms'),
                         transpose=False)

        # trace dest_maz_probs
        df = pd.DataFrame(data=maz_probs[taz_choices_trace_targets],
                          columns=alt_dest_columns,
                          index=trace_taz_choices_df.index)
        df = pd.concat([lhs_df, df], axis=1)
        df['rand'] = rands[taz_choices_trace_targets]
        tracing.trace_df(df,
                         label=tracing.extend_trace_label(
                             trace_label, 'dest_maz_probs'),
                         transpose=False)

    taz_choices = taz_choices.drop(columns=['TAZ_prob', 'MAZ_prob'])
    taz_choices = taz_choices.groupby([chooser_id_col, DEST_MAZ
                                       ]).agg(prob=('prob', 'max'),
                                              pick_count=('prob', 'count'))

    taz_choices.reset_index(level=DEST_MAZ, inplace=True)

    return taz_choices
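
# --- Illustration (not part of the original example) ---
# A standalone sketch of the vectorized sampling trick above: ragged
# candidate sets (a different MAZ count per sampled TAZ) are zero-padded to
# a rectangle with np.insert, normalized to probabilities, and sampled with
# argmax(cumsum(probs) > rand) per row. Toy size terms; no ActivitySim
# dependencies.
import numpy as np

sizes = np.array([7.4, 9.6, 10.9, 34.2, 12.1])  # candidate MAZ size terms
counts = np.array([3, 1, 1])                    # candidates per chooser row
max_count = counts.max()

last_row_offsets = counts.cumsum()
# pad at the END of each candidate set so pads can never be chosen
inserts = np.repeat(last_row_offsets, max_count - counts)
padded = np.insert(sizes, inserts, 0.0).reshape(-1, max_count)

probs = padded / padded.sum(axis=1, keepdims=True)

rng = np.random.default_rng(0)
rands = rng.random((len(counts), 1))

# first column where the cumulative probability exceeds the random draw
positions = np.argmax(probs.cumsum(axis=1) > rands, axis=1)
assert (positions < counts).all()  # zero-probability pads never win
print(positions)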
Beispiel #36
0
def joint_tour_frequency(households, persons, chunk_size, trace_hh_id):
    """
    This model predicts the frequency of making fully joint tours (see the
    alternatives above).
    """
    trace_label = 'joint_tour_frequency'
    model_settings_file_name = 'joint_tour_frequency.yaml'

    estimator = estimation.manager.begin_estimation('joint_tour_frequency')

    model_settings = config.read_model_settings(model_settings_file_name)

    alternatives = simulate.read_model_alts(
        'joint_tour_frequency_alternatives.csv', set_index='alt')

    # - only interested in households with more than one cdap travel_active person and
    # - at least one non-preschooler
    households = households.to_frame()
    multi_person_households = households[
        households.participates_in_jtf_model].copy()

    # - only interested in persons in multi_person_households
    # FIXME - gratuitous pathological efficiency move, just let yaml specify persons?
    persons = persons.to_frame()
    persons = persons[persons.household_id.isin(multi_person_households.index)]

    logger.info(
        "Running joint_tour_frequency with %d multi-person households" %
        multi_person_households.shape[0])

    # - preprocessor
    preprocessor_settings = model_settings.get('preprocessor', None)
    if preprocessor_settings:

        locals_dict = {
            'persons': persons,
            'hh_time_window_overlap': hh_time_window_overlap
        }

        expressions.assign_columns(df=multi_person_households,
                                   model_settings=preprocessor_settings,
                                   locals_dict=locals_dict,
                                   trace_label=trace_label)

    model_spec = simulate.read_model_spec(file_name=model_settings['SPEC'])
    coefficients_df = simulate.read_model_coefficients(model_settings)
    model_spec = simulate.eval_coefficients(model_spec, coefficients_df,
                                            estimator)

    nest_spec = config.get_logit_model_settings(model_settings)
    constants = config.get_model_constants(model_settings)

    if estimator:
        estimator.write_spec(model_settings)
        estimator.write_model_settings(model_settings,
                                       model_settings_file_name)
        estimator.write_coefficients(coefficients_df, model_settings)
        estimator.write_choosers(multi_person_households)

    choices = simulate.simple_simulate(
        choosers=multi_person_households,
        spec=model_spec,
        nest_spec=nest_spec,
        locals_d=constants,
        chunk_size=chunk_size,
        trace_label=trace_label,
        trace_choice_name='joint_tour_frequency',
        estimator=estimator)

    # convert indexes to alternative names
    choices = pd.Series(model_spec.columns[choices.values],
                        index=choices.index)

    if estimator:
        estimator.write_choices(choices)
        choices = estimator.get_survey_values(choices, 'households',
                                              'joint_tour_frequency')
        estimator.write_override_choices(choices)
        estimator.end_estimation()

    # - create joint_tours based on joint_tour_frequency choices

    # - we need a person_id in order to generate the tour index (and for register_traceable_table)
    # - but we don't know the tour participants yet
    # - so we arbitrarily choose the first person in the household
    # - to be point person for the purpose of generating an index and setting origin
    temp_point_persons = persons.loc[persons.PNUM == 1]
    temp_point_persons['person_id'] = temp_point_persons.index
    temp_point_persons = temp_point_persons.set_index('household_id')
    temp_point_persons = temp_point_persons[['person_id', 'home_zone_id']]

    joint_tours = \
        process_joint_tours(choices, alternatives, temp_point_persons)

    tours = pipeline.extend_table("tours", joint_tours)

    tracing.register_traceable_table('tours', joint_tours)
    pipeline.get_rn_generator().add_channel('tours', joint_tours)

    # - annotate households

    # we expect there to be an alt with no tours - which we can use to backfill non-travelers
    no_tours_alt = alternatives[alternatives.sum(axis=1) == 0].index[0]
    households['joint_tour_frequency'] = choices.reindex(
        households.index).fillna(no_tours_alt).astype(str)

    households['num_hh_joint_tours'] = joint_tours.groupby('household_id').size().\
        reindex(households.index).fillna(0).astype(np.int8)

    pipeline.replace_table("households", households)

    tracing.print_summary('joint_tour_frequency',
                          households.joint_tour_frequency,
                          value_counts=True)

    if trace_hh_id:
        tracing.trace_df(households, label="joint_tour_frequency.households")

        tracing.trace_df(joint_tours,
                         label="joint_tour_frequency.joint_tours",
                         slicer='household_id')

    if estimator:
        survey_tours = estimation.manager.get_survey_table('tours')
        survey_tours = survey_tours[survey_tours.tour_category == 'joint']

        print(f"len(survey_tours) {len(survey_tours)}")
        print(f"len(joint_tours) {len(joint_tours)}")

        different = False
        survey_tours_not_in_tours = survey_tours[~survey_tours.index.
                                                 isin(joint_tours.index)]
        if len(survey_tours_not_in_tours) > 0:
            print(f"survey_tours_not_in_tours\n{survey_tours_not_in_tours}")
            different = True
        tours_not_in_survey_tours = joint_tours[~joint_tours.index.
                                                isin(survey_tours.index)]
        if len(tours_not_in_survey_tours) > 0:
            print(f"tours_not_in_survey_tours\n{tours_not_in_survey_tours}")
            different = True
        assert not different
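
# --- Illustration (not part of the original example) ---
# A brief sketch of the index-to-name conversion used after simple_simulate
# above: choices come back as column positions into the spec, and
# spec.columns[choices.values] maps each position to its alternative name.
# The spec columns here are made up for the demonstration.
import pandas as pd

spec_columns = pd.Index(['0_tours', '1_shop', '1_eat'])
choices = pd.Series([0, 2, 1], index=[1, 2, 3])  # household ids

named = pd.Series(spec_columns[choices.values], index=choices.index)
print(named)  # e.g. household 2 chose '1_eat'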
Beispiel #37
0
def mandatory_tour_frequency(persons_merged,
                             chunk_size,
                             trace_hh_id):
    """
    This model predicts the frequency of making mandatory tours (see the
    alternatives above) - these tours include work and school in some combination.
    """
    trace_label = 'mandatory_tour_frequency'

    model_settings = config.read_model_settings('mandatory_tour_frequency.yaml')
    model_spec = simulate.read_model_spec(file_name='mandatory_tour_frequency.csv')
    alternatives = simulate.read_model_alts(
        config.config_file_path('mandatory_tour_frequency_alternatives.csv'), set_index='alt')

    choosers = persons_merged.to_frame()
    # filter based on results of CDAP
    choosers = choosers[choosers.cdap_activity == 'M']
    logger.info("Running mandatory_tour_frequency with %d persons", len(choosers))

    # - if no mandatory tours
    if choosers.shape[0] == 0:
        add_null_results(trace_label, model_settings)
        return

    # - preprocessor
    preprocessor_settings = model_settings.get('preprocessor', None)
    if preprocessor_settings:

        locals_dict = {}

        expressions.assign_columns(
            df=choosers,
            model_settings=preprocessor_settings,
            locals_dict=locals_dict,
            trace_label=trace_label)

    nest_spec = config.get_logit_model_settings(model_settings)
    constants = config.get_model_constants(model_settings)

    choices = simulate.simple_simulate(
        choosers=choosers,
        spec=model_spec,
        nest_spec=nest_spec,
        locals_d=constants,
        chunk_size=chunk_size,
        trace_label=trace_label,
        trace_choice_name='mandatory_tour_frequency')

    # convert indexes to alternative names
    choices = pd.Series(
        model_spec.columns[choices.values],
        index=choices.index).reindex(persons_merged.local.index)

    # - create mandatory tours
    """
    This reprocesses the chosen index of the mandatory tour frequency
    alternatives into an actual dataframe of tours.  The ending format is
    the same as for non_mandatory_tours, except tour types are "work" and "school"
    """
    choosers['mandatory_tour_frequency'] = choices
    mandatory_tours = process_mandatory_tours(
        persons=choosers,
        mandatory_tour_frequency_alts=alternatives
    )

    tours = pipeline.extend_table("tours", mandatory_tours)
    tracing.register_traceable_table('tours', mandatory_tours)
    pipeline.get_rn_generator().add_channel('tours', mandatory_tours)

    # - annotate persons
    persons = inject.get_table('persons').to_frame()

    # need to reindex as we only handled persons with cdap_activity == 'M'
    persons['mandatory_tour_frequency'] = choices.reindex(persons.index).fillna('').astype(str)

    expressions.assign_columns(
        df=persons,
        model_settings=model_settings.get('annotate_persons'),
        trace_label=tracing.extend_trace_label(trace_label, 'annotate_persons'))

    pipeline.replace_table("persons", persons)

    tracing.print_summary('mandatory_tour_frequency', persons.mandatory_tour_frequency,
                          value_counts=True)

    if trace_hh_id:
        tracing.trace_df(mandatory_tours,
                         label="mandatory_tour_frequency.mandatory_tours",
                         warn_if_empty=True)

        tracing.trace_df(persons,
                         label="mandatory_tour_frequency.persons",
                         warn_if_empty=True)
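
# --- Illustration (not part of the original example) ---
# A hedged sketch (hypothetical alternatives table, not ActivitySim's actual
# process_mandatory_tours) of expanding mandatory_tour_frequency choices
# into one row per tour with np.repeat over per-person tour counts.
import numpy as np
import pandas as pd

alts = pd.DataFrame({'work': [1, 2, 1, 0], 'school': [0, 0, 1, 1]},
                    index=['work1', 'work2', 'work_and_school', 'school1'])

persons = pd.DataFrame(
    {'mandatory_tour_frequency': ['work1', 'work_and_school']},
    index=pd.Index([7, 8], name='person_id'))

counts = alts.loc[persons.mandatory_tour_frequency]
counts.index = persons.index

tours = pd.concat(
    [pd.DataFrame({'person_id': np.repeat(counts.index.values,
                                          counts[tour_type].values),
                   'tour_type': tour_type})
     for tour_type in alts.columns],
    ignore_index=True)
print(tours)  # person 8 gets one work tour and one school tour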