def run_trip_scheduling(
        trips,
        tours,
        probs_spec,
        model_settings,
        last_iteration,
        chunk_size,
        trace_hh_id,
        trace_label):
    """
    Assign depart times to trips, chunk by chunk, scheduling the outbound
    and inbound legs of each chunk separately.

    Parameters
    ----------
    trips : pandas.DataFrame
        trips table (must have outbound, trip_num, trip_count columns)
    tours : pandas.DataFrame
        tours table used by set_tour_hour to tag trips with their tour hour
    probs_spec : pandas.DataFrame
        depart-time probability spec
    model_settings : dict
    last_iteration : bool
        passed through to schedule_trips_in_leg (controls failure reporting)
    chunk_size : int
        chunk size, or 0 for no chunking
    trace_hh_id : int or None
    trace_label : str

    Returns
    -------
    choices : pandas.Series
        depart hour for each trip, indexed by trip_id
    """

    # tag each trip with its parent tour's depart hour
    set_tour_hour(trips, tours)

    rows_per_chunk, effective_chunk_size = \
        trip_scheduling_rpc(chunk_size, trips, probs_spec, trace_label)

    result_list = []
    for i, num_chunks, trips_chunk in \
            chunk.chunked_choosers_by_chunk_id(trips, rows_per_chunk):

        if num_chunks > 1:
            chunk_trace_label = tracing.extend_trace_label(trace_label, 'chunk_%s' % i)
            logger.info("%s of %s size %d" % (chunk_trace_label, num_chunks, len(trips_chunk)))
        else:
            chunk_trace_label = trace_label

        # refactor: the original duplicated this block verbatim for the
        # outbound and inbound legs; schedule both legs through one loop
        for outbound, leg_name in ((True, 'outbound'), (False, 'inbound')):

            leg_mask = trips_chunk.outbound if outbound else ~trips_chunk.outbound

            leg_trace_label = tracing.extend_trace_label(chunk_trace_label, leg_name)
            chunk.log_open(leg_trace_label, chunk_size, effective_chunk_size)
            choices = \
                schedule_trips_in_leg(
                    outbound=outbound,
                    trips=trips_chunk[leg_mask],
                    probs_spec=probs_spec,
                    model_settings=model_settings,
                    last_iteration=last_iteration,
                    trace_hh_id=trace_hh_id,
                    trace_label=leg_trace_label)
            result_list.append(choices)
            chunk.log_close(leg_trace_label)

    choices = pd.concat(result_list)

    return choices
def run_cdap(
        persons,
        cdap_indiv_spec,
        cdap_interaction_coefficients,
        cdap_fixed_relative_proportions,
        locals_d,
        chunk_size=0,
        trace_hh_id=None,
        trace_label=None):
    """
    Choose an individual activity pattern ('M', 'N', or 'H') for each person.

    Persons are processed in chunks that keep whole households together
    (chunked_choosers_by_chunk_id); the per-chunk work is delegated to
    _run_cdap and the chunk results are concatenated.

    Parameters
    ----------
    persons : pandas.DataFrame
        persons table; must contain at least a household ID, household size,
        person type category, and age, plus any columns used in cdap_indiv_spec
    cdap_indiv_spec : pandas.DataFrame
        CDAP spec for individuals, ignoring any interactions
    cdap_interaction_coefficients : pandas.DataFrame
        rules and coefficients used to build interaction specs for the
        different household sizes
    cdap_fixed_relative_proportions : pandas.DataFrame
        relative proportions of each activity (M, N, H) used to choose
        activities for additional household members not handled by CDAP
    locals_d : dict
        local variables available when evaluating @-expressions in either
        cdap_indiv_spec or cdap_fixed_relative_proportions
    chunk_size : int
        chunk size, or 0 for no chunking
    trace_hh_id : int or None
        household id to trace, or None for no household tracing
    trace_label : str or None
        label for tracing, or None for no tracing

    Returns
    -------
    pandas.Series
        cdap_activity ('M', 'N', or 'H') indexed on the persons index
    """

    trace_label = tracing.extend_trace_label(trace_label, 'cdap')

    rows_per_chunk, effective_chunk_size = \
        calc_rows_per_chunk(chunk_size, persons, trace_label=trace_label)

    # segment by person type and pick the right spec for each person type
    result_list = []
    for chunk_id, n_chunks, persons_in_chunk in \
            chunk.chunked_choosers_by_chunk_id(persons, rows_per_chunk):

        logger.info("Running chunk %s of %s with %d persons" %
                    (chunk_id, n_chunks, len(persons_in_chunk)))

        chunk_trace_label = tracing.extend_trace_label(trace_label, 'chunk_%s' % chunk_id)

        chunk.log_open(chunk_trace_label, chunk_size, effective_chunk_size)

        cdap_results = _run_cdap(persons_in_chunk,
                                 cdap_indiv_spec,
                                 cdap_interaction_coefficients,
                                 cdap_fixed_relative_proportions,
                                 locals_d,
                                 trace_hh_id,
                                 chunk_trace_label)

        chunk.log_close(chunk_trace_label)

        result_list.append(cdap_results)

    # FIXME: this will require 2X RAM
    # if necessary, could append to hdf5 store on disk:
    # http://pandas.pydata.org/pandas-docs/stable/io.html#id2
    if len(result_list) > 1:
        cdap_results = pd.concat(result_list)

    if trace_hh_id:
        tracing.trace_df(cdap_results,
                         label="cdap",
                         columns=['cdap_rank', 'cdap_activity'],
                         warn_if_empty=True)

    # return choices column as series
    return cdap_results['cdap_activity']
def schedule_tours(
        tours, persons_merged, alts,
        spec, logsum_tour_purpose,
        model_settings,
        timetable, timetable_window_id_col,
        previous_tour, tour_owner_id_col,
        chunk_size, tour_trace_label):
    """
    chunking wrapper for _schedule_tours

    While interaction_sample_simulate provides chunking support, the merged tours, persons
    dataframe and the tdd_interaction_dataset are very big, so we want to create them inside
    the chunking loop to minimize memory footprint. So we implement the chunking loop here,
    and pass a chunk_size of 0 to interaction_sample_simulate to disable its chunking support.

    Returns
    -------
    choices : pandas.Series
        one tdd alt choice per tour, indexed like tours
    """

    if not tours.index.is_monotonic_increasing:
        # bug fix: format string had a %s placeholder but no argument was supplied
        logger.info("schedule_tours %s tours not monotonic_increasing - sorting df" %
                    tour_trace_label)
        tours = tours.sort_index()

    logger.info("%s schedule_tours running %d tour choices" % (tour_trace_label, len(tours)))

    # no more than one tour per timetable_window per call
    if timetable_window_id_col is None:
        assert not tours.index.duplicated().any()
    else:
        assert not tours[timetable_window_id_col].duplicated().any()

    rows_per_chunk, effective_chunk_size = \
        calc_rows_per_chunk(chunk_size, tours, persons_merged, alts,
                            trace_label=tour_trace_label)

    result_list = []
    for i, num_chunks, chooser_chunk \
            in chunk.chunked_choosers(tours, rows_per_chunk):

        logger.info("Running chunk %s of %s size %d" % (i, num_chunks, len(chooser_chunk)))

        chunk_trace_label = tracing.extend_trace_label(tour_trace_label, 'chunk_%s' % i) \
            if num_chunks > 1 else tour_trace_label

        chunk.log_open(chunk_trace_label, chunk_size, effective_chunk_size)
        choices = _schedule_tours(chooser_chunk, persons_merged,
                                  alts, spec, logsum_tour_purpose,
                                  model_settings,
                                  timetable, timetable_window_id_col,
                                  previous_tour, tour_owner_id_col,
                                  tour_trace_label=chunk_trace_label)
        chunk.log_close(chunk_trace_label)

        result_list.append(choices)

        mem.force_garbage_collect()

    # FIXME: this will require 2X RAM
    # if necessary, could append to hdf5 store on disk:
    # http://pandas.pydata.org/pandas-docs/stable/io.html#id2
    if len(result_list) > 1:
        choices = pd.concat(result_list)

    # bug fix: was `assert len(choices.index == len(tours.index))`, which
    # takes the len of a boolean array and so always passes when non-empty;
    # compare the two lengths instead
    assert len(choices.index) == len(tours.index)

    return choices
def run_cdap(
        persons,
        cdap_indiv_spec,
        cdap_interaction_coefficients,
        cdap_fixed_relative_proportions,
        locals_d,
        chunk_size=0,
        trace_hh_id=None,
        trace_label=None):
    """
    Choose individual activity patterns for persons.

    Runs _run_cdap over household-preserving chunks of the persons table and
    concatenates the per-chunk results.

    Parameters
    ----------
    persons : pandas.DataFrame
        persons table; must contain at least a household ID, household size,
        person type category, and age, plus any columns used in cdap_indiv_spec
    cdap_indiv_spec : pandas.DataFrame
        CDAP spec for individuals, ignoring any interactions
    cdap_interaction_coefficients : pandas.DataFrame
        rules and coefficients used to build interaction specs for the
        different household sizes
    cdap_fixed_relative_proportions : pandas.DataFrame
        relative proportions of each activity (M, N, H) used to choose
        activities for additional household members not handled by CDAP
    locals_d : dict
        local variables available when evaluating @-expressions in either
        cdap_indiv_spec or cdap_fixed_relative_proportions
    chunk_size : int
        chunk size, or 0 for no chunking
    trace_hh_id : int or None
        household id to trace, or None for no household tracing
    trace_label : str or None
        label for tracing, or None for no tracing

    Returns
    -------
    choices : pandas.DataFrame
        indexed on the persons index, with two columns:

        cdap_activity : str
            activity for that person expressed as 'M', 'N', 'H'
        cdap_rank : int
            activities for persons with cdap_rank <= MAX_HHSIZE are
            determined by cdap; 'extra' household members' activities are
            assigned by cdap_fixed_relative_proportions
    """

    trace_label = tracing.extend_trace_label(trace_label, 'cdap')

    rows_per_chunk, effective_chunk_size = \
        calc_rows_per_chunk(chunk_size, persons, trace_label=trace_label)

    # segment by person type and pick the right spec for each person type
    chunk_result_list = []
    for chunk_num, total_chunks, persons_chunk in \
            chunk.chunked_choosers_by_chunk_id(persons, rows_per_chunk):

        logger.info("Running chunk %s of %s with %d persons" %
                    (chunk_num, total_chunks, len(persons_chunk)))

        chunk_trace_label = \
            tracing.extend_trace_label(trace_label, 'chunk_%s' % chunk_num)

        chunk.log_open(chunk_trace_label, chunk_size, effective_chunk_size)
        choices = _run_cdap(persons_chunk,
                            cdap_indiv_spec,
                            cdap_interaction_coefficients,
                            cdap_fixed_relative_proportions,
                            locals_d,
                            trace_hh_id,
                            chunk_trace_label)
        chunk.log_close(chunk_trace_label)

        chunk_result_list.append(choices)

    # FIXME: this will require 2X RAM
    # if necessary, could append to hdf5 store on disk:
    # http://pandas.pydata.org/pandas-docs/stable/io.html#id2
    if len(chunk_result_list) > 1:
        choices = pd.concat(chunk_result_list)

    return choices
def schedule_tours(
        tours, persons_merged, alts,
        spec, logsum_tour_purpose,
        model_settings,
        timetable, timetable_window_id_col,
        previous_tour, tour_owner_id_col,
        estimator,
        chunk_size, tour_trace_label):
    """
    chunking wrapper for _schedule_tours

    While interaction_sample_simulate provides chunking support, the merged tours, persons
    dataframe and the tdd_interaction_dataset are very big, so we want to create them inside
    the chunking loop to minimize memory footprint. So we implement the chunking loop here,
    and pass a chunk_size of 0 to interaction_sample_simulate to disable its chunking support.

    Returns
    -------
    choices : pandas.Series
        one tdd alt choice per tour, indexed like tours
    """

    if not tours.index.is_monotonic_increasing:
        # bug fix: format string had a %s placeholder but no argument was supplied
        logger.info(
            "schedule_tours %s tours not monotonic_increasing - sorting df" %
            tour_trace_label)
        tours = tours.sort_index()

    logger.info("%s schedule_tours running %d tour choices" % (tour_trace_label, len(tours)))

    # no more than one tour per timetable_window per call
    if timetable_window_id_col is None:
        assert not tours.index.duplicated().any()
    else:
        assert not tours[timetable_window_id_col].duplicated().any()

    rows_per_chunk, effective_chunk_size = \
        calc_rows_per_chunk(chunk_size, tours, persons_merged, alts,
                            model_settings=model_settings,
                            trace_label=tour_trace_label)

    result_list = []
    for i, num_chunks, chooser_chunk \
            in chunk.chunked_choosers(tours, rows_per_chunk):

        logger.info("Running chunk %s of %s size %d" % (i, num_chunks, len(chooser_chunk)))

        chunk_trace_label = tracing.extend_trace_label(tour_trace_label, 'chunk_%s' % i) \
            if num_chunks > 1 else tour_trace_label

        chunk.log_open(chunk_trace_label, chunk_size, effective_chunk_size)
        choices = _schedule_tours(chooser_chunk, persons_merged,
                                  alts, spec, logsum_tour_purpose,
                                  model_settings,
                                  timetable, timetable_window_id_col,
                                  previous_tour, tour_owner_id_col,
                                  estimator,
                                  tour_trace_label=chunk_trace_label)
        chunk.log_close(chunk_trace_label)

        result_list.append(choices)

        mem.force_garbage_collect()

    # FIXME: this will require 2X RAM
    # if necessary, could append to hdf5 store on disk:
    # http://pandas.pydata.org/pandas-docs/stable/io.html#id2
    if len(result_list) > 1:
        choices = pd.concat(result_list)

    # bug fix: was `assert len(choices.index == len(tours.index))`, which
    # takes the len of a boolean array and so always passes when non-empty;
    # compare the two lengths instead
    assert len(choices.index) == len(tours.index)

    return choices
def eval_and_sum(assignment_expressions, df, locals_dict,
                 group_by_column_names=None,
                 df_alias=None,
                 chunk_size=0,
                 trace_rows=None):
    """
    Evaluate assignment_expressions against df, and sum the results
    (sum by group if list of group_by_column_names is specified.
    e.g. group by coc column names and return sums grouped by community of concern.)

    Processing is chunked: each chunk is evaluated with assign.assign_variables
    and summed immediately, so only the per-chunk summaries (not the full
    assigned results) are held across chunks.

    Parameters
    ----------
    assignment_expressions
    df
    locals_dict
    group_by_column_names : array of str
        list of names of the columns to group by (e.g. coc_column_names of trip_coc_end)
    df_alias : str
        assign_variables df_alias (name of df in assignment_expressions)
    chunk_size : int
        chunk size, or 0 for no chunking
    trace_rows : array of bool
        array indicating which rows in df are to be traced

    Returns
    -------
    summary : pandas.DataFrame
        summed expression results, grouped by group_by_column_names if given
    trace_results : pandas.DataFrame or None
        traced per-row results (indexed like the traced rows of df), or None
    trace_assigned_locals : dict
        merged assigned-locals traces accumulated across chunks
    """

    if group_by_column_names is None:
        group_by_column_names = []

    rows_per_chunk, effective_chunk_size = \
        calc_rows_per_chunk(chunk_size, df, assignment_expressions,
                            extra_columns=len(group_by_column_names),
                            trace_label='eval_and_sum')

    logger.info("eval_and_sum chunk_size %s rows_per_chunk %s df rows %s" %
                (effective_chunk_size, rows_per_chunk, df.shape[0]))

    summary = None
    result_list = []        # per-chunk summary frames
    trace_results = []      # per-chunk traced rows (only chunks containing traced rows)
    trace_assigned_locals = {}

    for i, num_chunks, df_chunk, trace_rows_chunk in chunked_df(
            df, rows_per_chunk, trace_rows):

        logger.info("eval_and_sum chunk %s of %s" % (i, num_chunks))

        logger.debug("eval_and_sum chunk %s assign variables" % (i, ))
        assigned_chunk, trace_chunk, trace_assigned_locals_chunk = \
            assign.assign_variables(assignment_expressions,
                                    df_chunk,
                                    locals_dict=locals_dict,
                                    df_alias=df_alias,
                                    trace_rows=trace_rows_chunk)

        # sum this chunk
        logger.debug("eval_and_sum chunk %s sum" % (i, ))
        if group_by_column_names:
            # concat in the group_by columns
            for c in group_by_column_names:
                assigned_chunk[c] = df_chunk[c]

            # sum this chunk
            summary = assigned_chunk.groupby(group_by_column_names).sum()
        else:
            # no grouping: one-row frame of column sums (transposed Series)
            summary = assigned_chunk.sum().to_frame().T

        result_list.append(summary)

        if trace_chunk is not None:
            trace_results.append(trace_chunk)

        if trace_assigned_locals_chunk is not None:
            trace_assigned_locals.update(trace_assigned_locals_chunk)

        # note: chunk size will log low if there are more spec temp vars than extra_columns
        # chunk logging happens after the work so both df_chunk and
        # assigned_chunk can be measured
        trace_label = 'eval_and_sum chunk_%s' % i
        chunk.log_open(trace_label, chunk_size, effective_chunk_size)
        chunk.log_df(trace_label, 'df_chunk', df_chunk)
        chunk.log_df(trace_label, 'assigned_chunk', assigned_chunk)
        chunk.log_close(trace_label)

    assert result_list

    # squash multiple chunk summaries
    if len(result_list) > 1:
        logger.debug("eval_and_sum squash chunk summaries")

        summary = pd.concat(result_list)

        if group_by_column_names:
            # re-group because the same group keys can appear in several chunks
            summary.reset_index(inplace=True)
            summary = summary.groupby(group_by_column_names).sum()
        else:
            summary = summary.sum().to_frame().T

    if trace_results:
        trace_results = pd.concat(trace_results)
        # trace_rows index values should match index of original df
        trace_results.index = df[trace_rows].index
    else:
        trace_results = None

    return summary, trace_results, trace_assigned_locals
def physical_activity_processor(
        trips_with_demographics,
        persons_merged,
        physical_activity_trip_spec,
        physical_activity_person_spec,
        physical_activity_settings,
        coc_column_names,
        settings,
        chunk_size,
        trace_hh_id):
    """
    Compute physical benefits

    Physical activity benefits generally accrue if the net physical activity for an individual
    exceeds a certain threshold. We calculate individual physical activity based on trips,
    so we need to compute trip activity and then sum up to the person level to calculate
    benefits. We chunk trips by household id to ensure that all of a persons trips are in the
    same chunk.
    """

    trips_df = trips_with_demographics.to_frame()
    persons_df = persons_merged.to_frame()
    trace_label = 'physical_activity'

    logger.info("Running physical_activity_processor with %d trips for %d persons " %
                (len(trips_df), len(persons_df)))

    locals_dict = config.get_model_constants(physical_activity_settings)
    locals_dict.update(config.setting('globals'))

    trip_trace_rows = trace_hh_id and trips_df.household_id == trace_hh_id

    rows_per_chunk, effective_chunk_size = \
        physical_activity_rpc(chunk_size, trips_df, persons_df,
                              physical_activity_trip_spec, trace_label)

    logger.info("physical_activity_processor chunk_size %s rows_per_chunk %s" %
                (chunk_size, rows_per_chunk))

    coc_summary = None
    result_list = []

    # iterate over trips df chunked by hh_id
    for i, num_chunks, trips_chunk, trace_rows_chunk \
            in bca.chunked_df_by_chunk_id(trips_df, trip_trace_rows, rows_per_chunk):

        logger.info("%s chunk %s of %s" % (trace_label, i, num_chunks))

        trip_activity, trip_trace_results, trip_trace_assigned_locals = \
            assign.assign_variables(physical_activity_trip_spec,
                                    trips_chunk,
                                    locals_dict=locals_dict,
                                    df_alias='trips',
                                    trace_rows=trace_rows_chunk)

        # since tracing is at household level, trace_results will occur in only one chunk
        # we can just write them out when we see them without need to accumulate across chunks
        if trip_trace_results is not None:
            tracing.write_csv(trip_trace_results,
                              file_name="physical_activity_trips",
                              index_label='trip_id',
                              column_labels=['label', 'trip'])

        if trip_trace_assigned_locals:
            tracing.write_csv(trip_trace_assigned_locals,
                              file_name="physical_activity_trips_locals")

        # sum trip activity for each unique person
        trip_activity = trip_activity.groupby(trips_chunk.person_id).sum()

        # merge in persons columns for this chunk
        persons_chunk = pd.merge(trip_activity, persons_df,
                                 left_index=True, right_index=True)

        # trace rows array for this chunk
        person_trace_rows = trace_hh_id and persons_chunk['household_id'] == trace_hh_id

        person_activity, person_trace_results, person_trace_assigned_locals = \
            assign.assign_variables(physical_activity_person_spec,
                                    persons_chunk,
                                    locals_dict=locals_dict,
                                    df_alias='persons',
                                    trace_rows=person_trace_rows)

        # since tracing is at household level, trace_results will occur in only one chunk
        # we can just write them out when we see them without need to accumulate across chunks
        if person_trace_results is not None:
            tracing.write_csv(person_trace_results,
                              file_name="physical_activity_persons",
                              index_label='persons_merged_table_index',
                              column_labels=['label', 'person'])

        if person_trace_assigned_locals:
            tracing.write_csv(person_trace_assigned_locals,
                              file_name="physical_activity_persons_locals")

        # concat in the coc columns and summarize the chunk by coc
        person_activity = pd.concat([persons_chunk[coc_column_names], person_activity], axis=1)
        coc_summary = person_activity.groupby(coc_column_names).sum()

        result_list.append(coc_summary)

        # bug fix: was the literal string 'trace_label chunk_%s' % i, which
        # produced labels like "trace_label chunk_0"; interpolate the
        # trace_label variable instead
        chunk_trace_label = '%s chunk_%s' % (trace_label, i)
        chunk.log_open(chunk_trace_label, chunk_size, effective_chunk_size)
        chunk.log_df(chunk_trace_label, 'trips_chunk', trips_chunk)
        chunk.log_df(chunk_trace_label, 'persons_chunk', persons_chunk)
        chunk.log_close(chunk_trace_label)

    if len(result_list) > 1:

        # (if there was only one chunk, then concat is redundant)
        coc_summary = pd.concat(result_list)

        # squash the accumulated chunk summaries by reapplying group and sum
        coc_summary.reset_index(inplace=True)
        coc_summary = coc_summary.groupby(coc_column_names).sum()

    result_prefix = 'PA_'
    add_result_columns("coc_results", coc_summary, result_prefix)
    add_summary_results(coc_summary, prefix=result_prefix, spec=physical_activity_person_spec)
def run_trip_purpose(
        trips_df,
        chunk_size,
        trace_hh_id,
        trace_label):
    """
    Assign a purpose to every trip.

    Main functionality is separated from the model step so it can be called
    iteratively. Primary (last) trips of each leg get their purpose directly:
    the last outbound trip inherits the tour's primary_purpose, and the last
    inbound trip gets 'Home' (or 'Work' for atwork subtours). Each
    intermediate stop is assigned a purpose drawn from an observed frequency
    distribution segmented by tour purpose, tour direction, person type, and,
    optionally, trip depart time.

    Returns
    -------
    purpose: pandas.Series of purpose (str) indexed by trip_id
    """

    model_settings = config.read_model_settings('trip_purpose.yaml')
    probs_spec = trip_purpose_probs()

    purposes_list = []

    # - last trip of outbound tour gets primary_purpose
    last_trip = (trips_df.trip_num == trips_df.trip_count)
    trip_purpose = trips_df.primary_purpose[last_trip & trips_df.outbound]
    purposes_list.append(trip_purpose)
    logger.info("assign purpose to %s last outbound trips", trip_purpose.shape[0])

    # - last trip of inbound tour gets home (or work for atwork subtours)
    trip_purpose = trips_df.primary_purpose[last_trip & ~trips_df.outbound]
    trip_purpose = pd.Series(np.where(trip_purpose == 'atwork', 'Work', 'Home'),
                             index=trip_purpose.index)
    purposes_list.append(trip_purpose)
    logger.info("assign purpose to %s last inbound trips", trip_purpose.shape[0])

    # - intermediate stops (non-last trips) purpose assigned by probability table
    trips_df = trips_df[~last_trip]
    logger.info("assign purpose to %s intermediate trips", trips_df.shape[0])

    preprocessor_settings = model_settings.get('preprocessor', None)
    if preprocessor_settings:
        locals_dict = config.get_model_constants(model_settings)
        expressions.assign_columns(
            df=trips_df,
            model_settings=preprocessor_settings,
            locals_dict=locals_dict,
            trace_label=trace_label)

    rows_per_chunk, effective_chunk_size = \
        trip_purpose_rpc(chunk_size, trips_df, probs_spec, trace_label=trace_label)

    for chunk_num, n_chunks, trips_chunk in \
            chunk.chunked_choosers(trips_df, rows_per_chunk):

        logger.info("Running chunk %s of %s size %d",
                    chunk_num, n_chunks, len(trips_chunk))

        if n_chunks > 1:
            chunk_trace_label = \
                tracing.extend_trace_label(trace_label, 'chunk_%s' % chunk_num)
        else:
            chunk_trace_label = trace_label

        chunk.log_open(chunk_trace_label, chunk_size, effective_chunk_size)

        chunk_choices = choose_intermediate_trip_purpose(
            trips_chunk,
            probs_spec,
            trace_hh_id,
            trace_label=chunk_trace_label)

        chunk.log_close(chunk_trace_label)

        purposes_list.append(chunk_choices)

    # purposes_list already holds the two primary-trip series, so the
    # concat branch is always taken
    if len(purposes_list) > 1:
        choices = pd.concat(purposes_list)

    return choices
def schedule_trips_in_leg(
        outbound,
        trips,
        probs_spec,
        model_settings,
        last_iteration,
        trace_hh_id,
        trace_label):
    """
    Assign depart times to all trips in one leg (outbound or inbound) of a set of tours.

    Trips are scheduled sequentially in travel order (ascending trip_num for
    outbound legs, descending for inbound), one trip_num rank at a time, so
    that each trip's chosen depart can constrain the allowed depart range of
    the following trip in the leg.

    Parameters
    ----------
    outbound : bool
        True to schedule the outbound leg, False for inbound;
        all rows of trips must match this direction
    trips : pandas.DataFrame
        trips for this leg (tour_hour, trip_num, trip_count,
        primary_purpose, earliest, latest columns are used here)
    probs_spec
        depart probability spec passed through to schedule_nth_trips
    model_settings : dict
        read here for the FAILFIX strategy
    last_iteration : bool
        if True, failed choices are reported and the FAILFIX strategy applied
    trace_hh_id
    trace_label

    Returns
    -------
    choices: pd.Series
        depart choice for trips, indexed by trip_id
    """

    failfix = model_settings.get(FAILFIX, FAILFIX_DEFAULT)

    # logger.debug("%s scheduling %s trips" % (trace_label, trips.shape[0]))

    assert (trips.outbound == outbound).all()

    # initial trip of leg and all atwork trips get tour_hour
    is_initial = (trips.trip_num == 1) if outbound else (trips.trip_num == trips.trip_count)
    no_scheduling = is_initial | (trips.primary_purpose == 'atwork')
    choices = trips.tour_hour[no_scheduling]

    if no_scheduling.all():
        return choices

    result_list = []
    result_list.append(choices)
    trips = trips[~no_scheduling]

    # add next_trip_id temp column (temp as trips is now a copy, as result of slicing)
    # np.roll pairs each trip with the id of the next trip in travel order
    # (roll direction depends on leg direction)
    trips = trips.sort_index()
    trips['next_trip_id'] = np.roll(trips.index, -1 if outbound else 1)
    is_final = (trips.trip_num == trips.trip_count) if outbound else (trips.trip_num == 1)
    # NOTE(review): .where(is_final, NO_TRIP_ID) keeps the rolled id only on
    # final trips and blanks it elsewhere — this looks inverted relative to
    # the "adjust next trip" intent below; confirm the intended sentinel
    # direction against np.roll's wrap-around behavior
    trips.next_trip_id = trips.next_trip_id.where(is_final, NO_TRIP_ID)

    # iterate over outbound trips in ascending trip_num order, skipping the initial trip
    # iterate over inbound trips in descending trip_num order, skipping the final trip
    first_trip_in_leg = True
    for i in range(trips.trip_num.min(), trips.trip_num.max() + 1):

        if outbound:
            nth_trips = trips[trips.trip_num == i]
        else:
            nth_trips = trips[trips.trip_num == trips.trip_count - i]

        nth_trace_label = tracing.extend_trace_label(trace_label, 'num_%s' % i)

        # chunk logging only (no chunking here) so memory use is still recorded
        chunk.log_open(nth_trace_label, chunk_size=0, effective_chunk_size=0)

        choices = schedule_nth_trips(
            nth_trips,
            probs_spec,
            model_settings,
            first_trip_in_leg=first_trip_in_leg,
            report_failed_trips=last_iteration,
            trace_hh_id=trace_hh_id,
            trace_label=nth_trace_label)

        chunk.log_close(nth_trace_label)

        # if outbound, this trip's depart constrains next trip's earliest depart option
        # if inbound, we are handling in reverse order, so it constrains latest depart instead
        ADJUST_NEXT_DEPART_COL = 'earliest' if outbound else 'latest'

        # most initial departure (when no choice was made because all probs were zero)
        if last_iteration and (failfix == FAILFIX_CHOOSE_MOST_INITIAL):
            choices = choices.reindex(nth_trips.index)
            logger.warning("%s coercing %s depart choices to most initial" %
                           (nth_trace_label, choices.isna().sum()))
            # fall back to the boundary of the currently-allowed depart range
            choices = choices.fillna(trips[ADJUST_NEXT_DEPART_COL])

        # adjust allowed depart range of next trip
        has_next_trip = (nth_trips.next_trip_id != NO_TRIP_ID)
        if has_next_trip.any():

            next_trip_ids = nth_trips.next_trip_id[has_next_trip]

            # patch choice any trips with next_trips that weren't scheduled
            trips.loc[next_trip_ids, ADJUST_NEXT_DEPART_COL] = \
                choices.reindex(next_trip_ids.index).fillna(trips[ADJUST_NEXT_DEPART_COL]).values

        result_list.append(choices)

        first_trip_in_leg = False

    if len(result_list) > 1:
        choices = pd.concat(result_list)

    return choices