def schedule_tours(
        tours, persons_merged, alts, spec, constants,
        timetable, previous_tour, window_id_col,
        chunk_size, tour_trace_label):
    """
    chunking wrapper for _schedule_tours

    While interaction_sample_simulate provides chunking support, the merged tours,
    persons dataframe and the tdd_interaction_dataset are very big, so we want to
    create them inside the chunking loop to minimize memory footprint. So the
    chunking loop is implemented here: chunk_size is only used to size the chunks
    and is not forwarded to _schedule_tours.

    Parameters
    ----------
    tours :
        choosers to schedule; split into chunks by chunk.chunked_choosers
        (presumably a pandas.DataFrame - it must support len() and .index)
    persons_merged, alts, spec, constants, timetable, previous_tour, window_id_col :
        passed through unchanged to _schedule_tours for every chunk
        (tours, persons_merged and alts are also given to calc_rows_per_chunk)
    chunk_size :
        passed to calc_rows_per_chunk to determine rows_per_chunk
    tour_trace_label :
        label used in log messages; extended with 'chunk_<i>' when there is
        more than one chunk

    Returns
    -------
    choices :
        the result of _schedule_tours; when more than one chunk was run, the
        per-chunk results concatenated in chunk order with pd.concat

    Raises
    ------
    AssertionError
        if the number of choices differs from the number of tours
    """

    logger.info("%s schedule_tours running %d tour choices", tour_trace_label, len(tours))

    rows_per_chunk = \
        calc_rows_per_chunk(chunk_size, tours, persons_merged, alts,
                            trace_label=tour_trace_label)

    logger.info("chunk_size %s rows_per_chunk %s", chunk_size, rows_per_chunk)

    result_list = []
    for i, num_chunks, chooser_chunk \
            in chunk.chunked_choosers(tours, rows_per_chunk):

        logger.info("Running chunk %s of %s size %d", i, num_chunks, len(chooser_chunk))

        # only decorate the trace label when the work is actually split up
        chunk_trace_label = tracing.extend_trace_label(tour_trace_label, 'chunk_%s' % i) \
            if num_chunks > 1 else tour_trace_label

        choices = _schedule_tours(chooser_chunk, persons_merged,
                                  alts, spec, constants,
                                  timetable, previous_tour, window_id_col,
                                  tour_trace_label=chunk_trace_label)

        result_list.append(choices)

        # release the per-chunk intermediates before building the next chunk
        force_garbage_collect()

    # FIXME: this will require 2X RAM
    # if necessary, could append to hdf5 store on disk:
    # http://pandas.pydata.org/pandas-docs/stable/io.html#id2
    # NOTE(review): if chunked_choosers yields nothing, choices is never bound
    # and the code below fails - confirm callers never pass empty tours
    if len(result_list) > 1:
        choices = pd.concat(result_list)

    # every tour must have received exactly one choice
    # (the previous form, len(choices.index == len(tours.index)), only verified
    # that choices was non-empty)
    assert len(choices.index) == len(tours.index), \
        "%s: expected %s choices but got %s" % (
            tour_trace_label, len(tours.index), len(choices.index))

    return choices
def schedule_tours(
        tours, persons_merged, alts,
        spec, logsum_tour_purpose,
        model_settings,
        timetable, timetable_window_id_col,
        previous_tour, tour_owner_id_col,
        chunk_size, tour_trace_label):
    """
    chunking wrapper for _schedule_tours

    While interaction_sample_simulate provides chunking support, the merged tours,
    persons dataframe and the tdd_interaction_dataset are very big, so we want to
    create them inside the chunking loop to minimize memory footprint. So the
    chunking loop is implemented here: chunk_size is only used to size the chunks
    (and for chunk logging) and is not forwarded to _schedule_tours.

    Parameters
    ----------
    tours :
        choosers to schedule (presumably a pandas.DataFrame - it is indexed by
        column name and sorted with sort_index). Sorted by index first if the
        index is not monotonic increasing; the caller's object is not modified.
    persons_merged, alts, spec, logsum_tour_purpose, model_settings, timetable,
    previous_tour, tour_owner_id_col :
        passed through unchanged to _schedule_tours for every chunk
        (tours, persons_merged and alts are also given to calc_rows_per_chunk)
    timetable_window_id_col :
        name of the tours column identifying the timetable window, or None to
        use the tours index; values must be unique within this call
    chunk_size :
        passed to calc_rows_per_chunk and chunk.log_open
    tour_trace_label :
        label used in log messages; extended with 'chunk_<i>' when there is
        more than one chunk

    Returns
    -------
    choices :
        the result of _schedule_tours; when more than one chunk was run, the
        per-chunk results concatenated in chunk order with pd.concat

    Raises
    ------
    AssertionError
        if a timetable window appears more than once in tours, or if the
        number of choices differs from the number of tours
    """

    if not tours.index.is_monotonic_increasing:
        # the message has a %s placeholder, so the label has to be supplied
        # (it used to be logged as a literal '%s')
        logger.info("schedule_tours %s tours not monotonic_increasing - sorting df",
                    tour_trace_label)
        tours = tours.sort_index()

    logger.info("%s schedule_tours running %d tour choices", tour_trace_label, len(tours))

    # no more than one tour per timetable_window per call
    if timetable_window_id_col is None:
        assert not tours.index.duplicated().any()
    else:
        assert not tours[timetable_window_id_col].duplicated().any()

    rows_per_chunk, effective_chunk_size = \
        calc_rows_per_chunk(chunk_size, tours, persons_merged, alts,
                            trace_label=tour_trace_label)

    result_list = []
    for i, num_chunks, chooser_chunk \
            in chunk.chunked_choosers(tours, rows_per_chunk):

        logger.info("Running chunk %s of %s size %d", i, num_chunks, len(chooser_chunk))

        # only decorate the trace label when the work is actually split up
        chunk_trace_label = tracing.extend_trace_label(tour_trace_label, 'chunk_%s' % i) \
            if num_chunks > 1 else tour_trace_label

        chunk.log_open(chunk_trace_label, chunk_size, effective_chunk_size)
        choices = _schedule_tours(chooser_chunk, persons_merged,
                                  alts, spec, logsum_tour_purpose,
                                  model_settings,
                                  timetable, timetable_window_id_col,
                                  previous_tour, tour_owner_id_col,
                                  tour_trace_label=chunk_trace_label)
        chunk.log_close(chunk_trace_label)

        result_list.append(choices)

        # release the per-chunk intermediates before building the next chunk
        mem.force_garbage_collect()

    # FIXME: this will require 2X RAM
    # if necessary, could append to hdf5 store on disk:
    # http://pandas.pydata.org/pandas-docs/stable/io.html#id2
    # NOTE(review): if chunked_choosers yields nothing, choices is never bound
    # and the code below fails - confirm callers never pass empty tours
    if len(result_list) > 1:
        choices = pd.concat(result_list)

    # every tour must have received exactly one choice
    # (the previous form, len(choices.index == len(tours.index)), only verified
    # that choices was non-empty)
    assert len(choices.index) == len(tours.index), \
        "%s: expected %s choices but got %s" % (
            tour_trace_label, len(tours.index), len(choices.index))

    return choices
def schedule_tours(tours, persons_merged, alts,
                   spec, logsum_tour_purpose,
                   model_settings,
                   timetable, timetable_window_id_col,
                   previous_tour, tour_owner_id_col,
                   estimator,
                   chunk_size, tour_trace_label):
    """
    chunking wrapper for _schedule_tours

    While interaction_sample_simulate provides chunking support, the merged tours,
    persons dataframe and the tdd_interaction_dataset are very big, so we want to
    create them inside the chunking loop to minimize memory footprint. So the
    chunking loop is implemented here: chunk_size is only used to size the chunks
    (and for chunk logging) and is not forwarded to _schedule_tours.

    Parameters
    ----------
    tours :
        choosers to schedule (presumably a pandas.DataFrame - it is indexed by
        column name and sorted with sort_index). Sorted by index first if the
        index is not monotonic increasing; the caller's object is not modified.
    persons_merged, alts, spec, logsum_tour_purpose, timetable, previous_tour,
    tour_owner_id_col, estimator :
        passed through unchanged to _schedule_tours for every chunk
        (tours, persons_merged and alts are also given to calc_rows_per_chunk)
    model_settings :
        passed to both calc_rows_per_chunk and _schedule_tours
    timetable_window_id_col :
        name of the tours column identifying the timetable window, or None to
        use the tours index; values must be unique within this call
    chunk_size :
        passed to calc_rows_per_chunk and chunk.log_open
    tour_trace_label :
        label used in log messages; extended with 'chunk_<i>' when there is
        more than one chunk

    Returns
    -------
    choices :
        the result of _schedule_tours; when more than one chunk was run, the
        per-chunk results concatenated in chunk order with pd.concat

    Raises
    ------
    AssertionError
        if a timetable window appears more than once in tours, or if the
        number of choices differs from the number of tours
    """

    if not tours.index.is_monotonic_increasing:
        # the message has a %s placeholder, so the label has to be supplied
        # (it used to be logged as a literal '%s')
        logger.info("schedule_tours %s tours not monotonic_increasing - sorting df",
                    tour_trace_label)
        tours = tours.sort_index()

    logger.info("%s schedule_tours running %d tour choices", tour_trace_label, len(tours))

    # no more than one tour per timetable_window per call
    if timetable_window_id_col is None:
        assert not tours.index.duplicated().any()
    else:
        assert not tours[timetable_window_id_col].duplicated().any()

    rows_per_chunk, effective_chunk_size = \
        calc_rows_per_chunk(chunk_size, tours, persons_merged, alts,
                            model_settings=model_settings,
                            trace_label=tour_trace_label)

    result_list = []
    for i, num_chunks, chooser_chunk \
            in chunk.chunked_choosers(tours, rows_per_chunk):

        logger.info("Running chunk %s of %s size %d", i, num_chunks, len(chooser_chunk))

        # only decorate the trace label when the work is actually split up
        chunk_trace_label = tracing.extend_trace_label(tour_trace_label, 'chunk_%s' % i) \
            if num_chunks > 1 else tour_trace_label

        chunk.log_open(chunk_trace_label, chunk_size, effective_chunk_size)
        choices = _schedule_tours(chooser_chunk, persons_merged,
                                  alts, spec, logsum_tour_purpose,
                                  model_settings,
                                  timetable, timetable_window_id_col,
                                  previous_tour, tour_owner_id_col,
                                  estimator,
                                  tour_trace_label=chunk_trace_label)
        chunk.log_close(chunk_trace_label)

        result_list.append(choices)

        # release the per-chunk intermediates before building the next chunk
        mem.force_garbage_collect()

    # FIXME: this will require 2X RAM
    # if necessary, could append to hdf5 store on disk:
    # http://pandas.pydata.org/pandas-docs/stable/io.html#id2
    # NOTE(review): if chunked_choosers yields nothing, choices is never bound
    # and the code below fails - confirm callers never pass empty tours
    if len(result_list) > 1:
        choices = pd.concat(result_list)

    # every tour must have received exactly one choice
    # (the previous form, len(choices.index == len(tours.index)), only verified
    # that choices was non-empty)
    assert len(choices.index) == len(tours.index), \
        "%s: expected %s choices but got %s" % (
            tour_trace_label, len(tours.index), len(choices.index))

    return choices
def run_trip_purpose(
        trips_df,
        chunk_size,
        trace_hh_id,
        trace_label):
    """
    trip purpose - core logic, kept apart from the model step so that it can be
    called iteratively

    The last trip of each outbound leg takes the tour's primary_purpose. The last
    trip of each inbound leg gets 'Work' when the primary_purpose is 'atwork' and
    'Home' otherwise. All remaining (intermediate stop) trips are handed, chunk by
    chunk, to choose_intermediate_trip_purpose together with the probability spec
    returned by trip_purpose_probs().

    If 'trip_purpose.yaml' has a 'preprocessor' entry, expressions.assign_columns
    is run on the intermediate trips before they are chunked.

    Returns
    -------
    purpose: pandas.Series of purpose (str) indexed by trip_id
        concatenation of the outbound, inbound and intermediate-stop purposes,
        in that order
    """

    model_settings = config.read_model_settings('trip_purpose.yaml')
    probs_spec = trip_purpose_probs()

    is_last_trip = (trips_df.trip_num == trips_df.trip_count)

    # - last trip of outbound tour gets primary_purpose
    outbound_purpose = trips_df.primary_purpose[is_last_trip & trips_df.outbound]
    logger.info("assign purpose to %s last outbound trips", outbound_purpose.shape[0])

    # - last trip of inbound tour gets home (or work for atwork subtours)
    inbound_primary = trips_df.primary_purpose[is_last_trip & ~trips_df.outbound]
    inbound_labels = np.where(inbound_primary == 'atwork', 'Work', 'Home')
    inbound_purpose = pd.Series(inbound_labels, index=inbound_primary.index)
    logger.info("assign purpose to %s last inbound trips", inbound_purpose.shape[0])

    pieces = [outbound_purpose, inbound_purpose]

    # - intermediate stops (non-last trips) purpose assigned by probability table
    intermediate_trips = trips_df[~is_last_trip]
    logger.info("assign purpose to %s intermediate trips", intermediate_trips.shape[0])

    preprocessor_settings = model_settings.get('preprocessor', None)
    if preprocessor_settings:
        expressions.assign_columns(
            df=intermediate_trips,
            model_settings=preprocessor_settings,
            locals_dict=config.get_model_constants(model_settings),
            trace_label=trace_label)

    rows_per_chunk, effective_chunk_size = trip_purpose_rpc(
        chunk_size, intermediate_trips, probs_spec, trace_label=trace_label)

    for chunk_num, chunk_count, trips_chunk in \
            chunk.chunked_choosers(intermediate_trips, rows_per_chunk):

        logger.info("Running chunk %s of %s size %d", chunk_num, chunk_count, len(trips_chunk))

        # the trace label is only decorated when there is more than one chunk
        if chunk_count > 1:
            chunk_trace_label = tracing.extend_trace_label(trace_label, 'chunk_%s' % chunk_num)
        else:
            chunk_trace_label = trace_label

        chunk.log_open(chunk_trace_label, chunk_size, effective_chunk_size)
        chunk_purpose = choose_intermediate_trip_purpose(
            trips_chunk,
            probs_spec,
            trace_hh_id,
            trace_label=chunk_trace_label)
        chunk.log_close(chunk_trace_label)

        pieces.append(chunk_purpose)

    # pieces always holds at least the outbound and inbound series,
    # so there is always something to concatenate
    return pd.concat(pieces)