Example #1
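# NOTE: these excerpts are taken from a larger ActivitySim source module and
# assume module-level imports that are not shown here -- e.g. numpy as np,
# pandas as pd, a configured `logger`, the activitysim.core modules `chunk`,
# `config`, `expressions`, `mem` and `tracing`, and local helpers such as
# _schedule_tours, calc_rows_per_chunk and choose_intermediate_trip_purpose.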
def schedule_tours(
        tours, persons_merged, alts, spec, constants, timetable,
        previous_tour, window_id_col, chunk_size, tour_trace_label):
    """
    chunking wrapper for _schedule_tours

    While interaction_sample_simulate provides chunking support, the merged tours/persons
    dataframe and the tdd_interaction_dataset are very large, so we want to create them
    inside the chunking loop to minimize the memory footprint. We therefore implement the
    chunking loop here and pass a chunk_size of 0 to interaction_sample_simulate to
    disable its own chunking support.

    """
    # return _schedule_tours(tours, persons_merged, alts, spec, constants, timetable,
    #                        previous_tour, window_id_col, chunk_size, tour_trace_label)

    logger.info("%s schedule_tours running %d tour choices" % (tour_trace_label, len(tours)))

    # persons_merged columns plus 2 previous tour columns
    extra_chooser_columns = persons_merged.shape[1] + 2

    rows_per_chunk = \
        calc_rows_per_chunk(chunk_size, tours, persons_merged, alts, trace_label=tour_trace_label)

    logger.info("chunk_size %s rows_per_chunk %s" % (chunk_size, rows_per_chunk))

    result_list = []
    for i, num_chunks, chooser_chunk \
            in chunk.chunked_choosers(tours, rows_per_chunk):

        logger.info("Running chunk %s of %s size %d" % (i, num_chunks, len(chooser_chunk)))

        chunk_trace_label = tracing.extend_trace_label(tour_trace_label, 'chunk_%s' % i) \
            if num_chunks > 1 else tour_trace_label

        choices = _schedule_tours(chooser_chunk, persons_merged,
                                  alts, spec, constants,
                                  timetable,
                                  previous_tour, window_id_col,
                                  tour_trace_label=chunk_trace_label)

        result_list.append(choices)

        force_garbage_collect()

    # FIXME: this will require 2X RAM
    # if necessary, could append to hdf5 store on disk:
    # http://pandas.pydata.org/pandas-docs/stable/io.html#id2
    if len(result_list) > 1:
        choices = pd.concat(result_list)

    assert len(choices.index) == len(tours.index)

    return choices
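The loop above depends only on chunk.chunked_choosers yielding (chunk index, total chunks, chooser slice). A minimal sketch of that contract with a hypothetical stand-in generator (the real activitysim.core.chunk implementation is more elaborate):

import math

import pandas as pd


def chunked_choosers_sketch(choosers, rows_per_chunk):
    # hypothetical stand-in: yields (1-based chunk index, total number of
    # chunks, positional slice of the choosers dataframe)
    num_rows = len(choosers)
    rows_per_chunk = rows_per_chunk or max(num_rows, 1)  # 0 means no chunking
    num_chunks = math.ceil(num_rows / rows_per_chunk)
    for i in range(num_chunks):
        yield i + 1, num_chunks, choosers.iloc[i * rows_per_chunk:(i + 1) * rows_per_chunk]


tours = pd.DataFrame({'person_id': range(10)}, index=pd.RangeIndex(10, name='tour_id'))

result_list = []
for i, num_chunks, chooser_chunk in chunked_choosers_sketch(tours, rows_per_chunk=4):
    # stand-in for _schedule_tours: one choice per chooser, index preserved
    result_list.append(pd.Series(0, index=chooser_chunk.index))

choices = pd.concat(result_list) if len(result_list) > 1 else result_list[0]
assert len(choices.index) == len(tours.index)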
Example #2
def schedule_tours(
        tours, persons_merged, alts,
        spec, logsum_tour_purpose,
        model_settings,
        timetable, timetable_window_id_col,
        previous_tour, tour_owner_id_col,
        chunk_size, tour_trace_label):
    """
    chunking wrapper for _schedule_tours

    While interaction_sample_simulate provides chunking support, the merged tours/persons
    dataframe and the tdd_interaction_dataset are very large, so we want to create them
    inside the chunking loop to minimize the memory footprint. We therefore implement the
    chunking loop here and pass a chunk_size of 0 to interaction_sample_simulate to
    disable its own chunking support.

    """

    if not tours.index.is_monotonic_increasing:
        logger.info("schedule_tours %s tours not monotonic_increasing - sorting df")
        tours = tours.sort_index()

    logger.info("%s schedule_tours running %d tour choices" % (tour_trace_label, len(tours)))

    # no more than one tour per timetable_window per call
    if timetable_window_id_col is None:
        assert not tours.index.duplicated().any()
    else:
        assert not tours[timetable_window_id_col].duplicated().any()

    rows_per_chunk, effective_chunk_size = \
        calc_rows_per_chunk(chunk_size, tours, persons_merged, alts, trace_label=tour_trace_label)

    result_list = []
    for i, num_chunks, chooser_chunk \
            in chunk.chunked_choosers(tours, rows_per_chunk):

        logger.info("Running chunk %s of %s size %d" % (i, num_chunks, len(chooser_chunk)))

        chunk_trace_label = tracing.extend_trace_label(tour_trace_label, 'chunk_%s' % i) \
            if num_chunks > 1 else tour_trace_label

        chunk.log_open(chunk_trace_label, chunk_size, effective_chunk_size)
        choices = _schedule_tours(chooser_chunk, persons_merged,
                                  alts, spec, logsum_tour_purpose,
                                  model_settings,
                                  timetable, timetable_window_id_col,
                                  previous_tour, tour_owner_id_col,
                                  tour_trace_label=chunk_trace_label)

        chunk.log_close(chunk_trace_label)

        result_list.append(choices)

        mem.force_garbage_collect()

    # FIXME: this will require 2X RAM
    # if necessary, could append to hdf5 store on disk:
    # http://pandas.pydata.org/pandas-docs/stable/io.html#id2
    if len(result_list) > 1:
        choices = pd.concat(result_list)

    assert len(choices.index) == len(tours.index)

    return choices
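Relative to Example #1, this version first forces the tours index to be monotonically increasing. A tiny pandas illustration of that guard, with invented tour ids:

import pandas as pd

tours = pd.DataFrame({'person_id': [3, 1, 2]},
                     index=pd.Index([30, 10, 20], name='tour_id'))

# chunked_choosers slices rows positionally; sorting first is one simple way
# to keep each chunk's ids contiguous and the concatenated result ordered
# like the input (assumption -- the excerpt does not state the motivation)
if not tours.index.is_monotonic_increasing:
    tours = tours.sort_index()

assert list(tours.index) == [10, 20, 30]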
Example #3
def schedule_tours(tours, persons_merged, alts, spec, logsum_tour_purpose,
                   model_settings, timetable, timetable_window_id_col,
                   previous_tour, tour_owner_id_col, estimator, chunk_size,
                   tour_trace_label):
    """
    chunking wrapper for _schedule_tours

    While interaction_sample_simulate provides chunking support, the merged tours/persons
    dataframe and the tdd_interaction_dataset are very large, so we want to create them
    inside the chunking loop to minimize the memory footprint. We therefore implement the
    chunking loop here and pass a chunk_size of 0 to interaction_sample_simulate to
    disable its own chunking support.

    """

    if not tours.index.is_monotonic_increasing:
        logger.info(
            "schedule_tours %s tours not monotonic_increasing - sorting df" %
            tour_trace_label)
        tours = tours.sort_index()

    logger.info("%s schedule_tours running %d tour choices" %
                (tour_trace_label, len(tours)))

    # no more than one tour per timetable_window per call
    if timetable_window_id_col is None:
        assert not tours.index.duplicated().any()
    else:
        assert not tours[timetable_window_id_col].duplicated().any()

    rows_per_chunk, effective_chunk_size = \
        calc_rows_per_chunk(chunk_size, tours, persons_merged, alts,
                            model_settings=model_settings, trace_label=tour_trace_label)

    result_list = []
    for i, num_chunks, chooser_chunk \
            in chunk.chunked_choosers(tours, rows_per_chunk):

        logger.info("Running chunk %s of %s size %d" %
                    (i, num_chunks, len(chooser_chunk)))

        chunk_trace_label = tracing.extend_trace_label(tour_trace_label, 'chunk_%s' % i) \
            if num_chunks > 1 else tour_trace_label

        chunk.log_open(chunk_trace_label, chunk_size, effective_chunk_size)
        choices = _schedule_tours(chooser_chunk,
                                  persons_merged,
                                  alts,
                                  spec,
                                  logsum_tour_purpose,
                                  model_settings,
                                  timetable,
                                  timetable_window_id_col,
                                  previous_tour,
                                  tour_owner_id_col,
                                  estimator,
                                  tour_trace_label=chunk_trace_label)

        chunk.log_close(chunk_trace_label)

        result_list.append(choices)

        mem.force_garbage_collect()

    # FIXME: this will require 2X RAM
    # if necessary, could append to hdf5 store on disk:
    # http://pandas.pydata.org/pandas-docs/stable/io.html#id2
    if len(result_list) > 1:
        choices = pd.concat(result_list)

    assert len(choices.index) == len(tours.index)

    return choices
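Each chunk in this version is bracketed by chunk.log_open / chunk.log_close. If that pairing ever needed to survive an exception inside _schedule_tours, a hypothetical wrapper (not part of ActivitySim) could express the same bracketing with contextlib:

from contextlib import contextmanager

from activitysim.core import chunk


@contextmanager
def chunk_log(trace_label, chunk_size, effective_chunk_size):
    # hypothetical convenience wrapper around the log_open/log_close calls
    # used above; guarantees log_close runs even if the body raises
    chunk.log_open(trace_label, chunk_size, effective_chunk_size)
    try:
        yield
    finally:
        chunk.log_close(trace_label)


# usage sketch:
# with chunk_log(chunk_trace_label, chunk_size, effective_chunk_size):
#     choices = _schedule_tours(...)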
Example #4
def run_trip_purpose(
        trips_df,
        chunk_size,
        trace_hh_id,
        trace_label):
    """
    trip purpose - main functionality separated from model step so it can be called iteratively

    Each intermediate stop on a tour (i.e. each trip other than the last trip outbound or
    inbound) is assigned a purpose based on an observed frequency distribution.

    The distribution is segmented by tour purpose, tour direction, person type
    and, optionally, trip depart time.

    Returns
    -------
    purpose: pandas.Series of purpose (str) indexed by trip_id
    """

    model_settings = config.read_model_settings('trip_purpose.yaml')
    probs_spec = trip_purpose_probs()

    result_list = []

    # - last trip of outbound tour gets primary_purpose
    last_trip = (trips_df.trip_num == trips_df.trip_count)
    purpose = trips_df.primary_purpose[last_trip & trips_df.outbound]
    result_list.append(purpose)
    logger.info("assign purpose to %s last outbound trips", purpose.shape[0])

    # - last trip of inbound tour gets home (or work for atwork subtours)
    purpose = trips_df.primary_purpose[last_trip & ~trips_df.outbound]
    purpose = pd.Series(np.where(purpose == 'atwork', 'Work', 'Home'), index=purpose.index)
    result_list.append(purpose)
    logger.info("assign purpose to %s last inbound trips", purpose.shape[0])

    # - intermediate stops (non-last trips) purpose assigned by probability table
    trips_df = trips_df[~last_trip]
    logger.info("assign purpose to %s intermediate trips", trips_df.shape[0])

    preprocessor_settings = model_settings.get('preprocessor', None)
    if preprocessor_settings:
        locals_dict = config.get_model_constants(model_settings)
        expressions.assign_columns(
            df=trips_df,
            model_settings=preprocessor_settings,
            locals_dict=locals_dict,
            trace_label=trace_label)

    rows_per_chunk, effective_chunk_size = \
        trip_purpose_rpc(chunk_size, trips_df, probs_spec, trace_label=trace_label)

    for i, num_chunks, trips_chunk in chunk.chunked_choosers(trips_df, rows_per_chunk):

        logger.info("Running chunk %s of %s size %d", i, num_chunks, len(trips_chunk))

        chunk_trace_label = tracing.extend_trace_label(trace_label, 'chunk_%s' % i) \
            if num_chunks > 1 else trace_label

        chunk.log_open(chunk_trace_label, chunk_size, effective_chunk_size)

        choices = choose_intermediate_trip_purpose(
            trips_chunk,
            probs_spec,
            trace_hh_id,
            trace_label=chunk_trace_label)

        chunk.log_close(chunk_trace_label)

        result_list.append(choices)

    if len(result_list) > 1:
        choices = pd.concat(result_list)

    return choices
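To make the mask logic in run_trip_purpose concrete, here is a small synthetic trips table (values invented for illustration) pushed through the same last-trip selection:

import numpy as np
import pandas as pd

trips_df = pd.DataFrame({
    'trip_num':        [1, 2, 1, 1, 2],
    'trip_count':      [2, 2, 1, 2, 2],
    'outbound':        [True, True, True, False, False],
    'primary_purpose': ['work', 'work', 'atwork', 'work', 'work'],
}, index=pd.Index([101, 102, 103, 104, 105], name='trip_id'))

# a trip is the last of its leg when trip_num reaches trip_count
last_trip = (trips_df.trip_num == trips_df.trip_count)

# last outbound trip keeps the tour's primary purpose (trips 102 and 103)
outbound_purpose = trips_df.primary_purpose[last_trip & trips_df.outbound]

# last inbound trip goes Home, or Work for atwork subtours (trip 105 -> Home)
inbound = trips_df.primary_purpose[last_trip & ~trips_df.outbound]
inbound_purpose = pd.Series(np.where(inbound == 'atwork', 'Work', 'Home'),
                            index=inbound.index)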