def trip_schedule_calc_row_size(choosers, trace_label):
    """
    rows_per_chunk calculator for trip_scheduler
    """

    sizer = chunk.RowSizeEstimator(trace_label)

    chooser_row_size = len(choosers.columns)
    spec_columns = 3

    sizer.add_elements(chooser_row_size + spec_columns, 'choosers')

    row_size = sizer.get_hwm()
    return row_size
def trip_purpose_calc_row_size(choosers, spec, trace_label):
    """
    rows_per_chunk calculator for trip_purpose
    """

    sizer = chunk.RowSizeEstimator(trace_label)

    chooser_row_size = len(choosers.columns)
    spec_columns = spec.shape[1] - len(PROBS_JOIN_COLUMNS)

    sizer.add_elements(chooser_row_size + spec_columns, 'choosers')

    row_size = sizer.get_hwm()
    return row_size
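# The calculators above (and those below) all lean on chunk.RowSizeEstimator.
# A minimal sketch of its interface as used in this module, assuming that
# add_elements/drop_elements track named per-row element counts and that get_hwm
# returns the high-water mark of the running total; the real implementation lives
# in activitysim.core.chunk and may differ in detail.
class _SketchRowSizeEstimator:
    """Illustrative stand-in for chunk.RowSizeEstimator (not the real implementation)."""

    def __init__(self, trace_label):
        self.trace_label = trace_label
        self.elements = {}
        self.hwm = 0

    def add_elements(self, n, tag):
        # register n per-chooser-row elements under tag and update the high-water mark
        self.elements[tag] = n
        self.hwm = max(self.hwm, sum(self.elements.values()))

    def drop_elements(self, tag):
        # once an intermediate result is released, its elements stop counting toward
        # the running total, but the high-water mark already reached is retained
        self.elements[tag] = 0

    def get_hwm(self):
        # worst-case number of simultaneously-live elements per chooser row
        return self.hwm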
def trip_scheduling_calc_row_size(trips, spec, trace_label):

    sizer = chunk.RowSizeEstimator(trace_label)

    # NOTE: we chunk by chunk_id,
    # so scale row_size by the average number of chooser rows per chunk_id
    num_choosers = trips['chunk_id'].max() + 1
    rows_per_chunk_id = len(trips) / num_choosers

    # only non-initial trips require scheduling; the segment handling the first
    # such trip in the tour will use the most space
    outbound_chooser = (trips.trip_num == 2) & trips.outbound & \
        (trips.primary_purpose != 'atwork')
    inbound_chooser = (trips.trip_num == trips.trip_count - 1) & ~trips.outbound & \
        (trips.primary_purpose != 'atwork')

    # furthermore, inbound and outbound trips are scheduled independently
    if outbound_chooser.sum() > inbound_chooser.sum():
        is_chooser = outbound_chooser
        logger.debug(
            f"{trace_label} {is_chooser.sum()} outbound_choosers of {len(trips)} require scheduling")
    else:
        is_chooser = inbound_chooser
        logger.debug(
            f"{trace_label} {is_chooser.sum()} inbound_choosers of {len(trips)} require scheduling")

    chooser_fraction = is_chooser.sum() / len(trips)
    logger.debug(f"{trace_label} chooser_fraction {chooser_fraction * 100}%")

    chooser_row_size = len(trips.columns) + len(spec.columns) - len(PROBS_JOIN_COLUMNS)
    sizer.add_elements(chooser_fraction * chooser_row_size, 'choosers')

    # might be clipped to fewer rows, but this is the worst case
    chooser_probs_row_size = len(spec.columns) - len(PROBS_JOIN_COLUMNS)
    sizer.add_elements(chooser_fraction * chooser_probs_row_size, 'chooser_probs')

    sizer.add_elements(chooser_fraction, 'choices')
    sizer.add_elements(chooser_fraction, 'rands')
    sizer.add_elements(chooser_fraction, 'failed')

    row_size = sizer.get_hwm()
    row_size = row_size * rows_per_chunk_id

    return row_size
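# Worked toy example (hypothetical numbers) of the two-stage scaling above: the
# per-trip high-water mark is discounted by chooser_fraction, then scaled back up
# by the average number of trips per chunk_id, since chunking is done by chunk_id.
# A simplified sketch (chooser_probs omitted for brevity):
def _sketch_trip_scheduling_scaling():
    num_trips, num_chunk_ids = 1000, 250
    rows_per_chunk_id = num_trips / num_chunk_ids  # 4.0 trips per chunk_id
    chooser_fraction = 200 / num_trips             # 0.2 if 200 trips need scheduling
    chooser_row_size = 50                          # hypothetical chooser + spec columns
    # choosers plus one element each for choices, rands, and failed
    per_trip_hwm = chooser_fraction * chooser_row_size + 3 * chooser_fraction
    return per_trip_hwm * rows_per_chunk_id        # -> 42.4 elements per chunk_id row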
def accessibility_calc_row_size(accessibility_df, land_use_df, assignment_spec, network_los, trace_label):
    """
    rows_per_chunk calculator for accessibility
    """

    sizer = chunk.RowSizeEstimator(trace_label)

    # if zone_system is THREE_ZONE there are tap skims, and we would want to estimate
    # the per-row overhead of the tvpb skims; for now that estimation is disabled
    # (check this first to facilitate tracing of the row-size estimation below)
    if network_los.zone_system == los.THREE_ZONE:
        # DISABLE_TVPB_OVERHEAD
        logger.debug("disable calc_row_size for THREE_ZONE with tap skims")
        return 0

    land_use_rows = len(land_use_df.index)
    land_use_columns = len(land_use_df.columns)
    od_columns = 2

    # assignment spec has one row per value to assign;
    # count the number of unique persistent assign_variables targets simultaneously
    # resident during spec eval (since the dict overwrites recurring targets,
    # only unique targets count)
    def is_persistent(target):
        return not (assign.is_throwaway(target) or assign.is_temp_scalar(target))

    num_spec_values = len(
        [target for target in assignment_spec.target.unique() if is_persistent(target)])

    sizer.add_elements(land_use_rows * od_columns, 'od_df')

    # each od_df row joins to all land_use zones
    sizer.add_elements(land_use_rows * land_use_columns, 'land_use_choosers')

    # and then we assign_variables to the joined land_use from assignment_spec
    sizer.add_elements(land_use_rows * num_spec_values, 'spec_values')

    row_size = sizer.get_hwm()
    return row_size
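# Illustrative sketch of the persistent-target count above, using a hypothetical
# stand-in predicate for assign.is_throwaway / assign.is_temp_scalar (the example
# data and the '_' convention here are assumptions, not the real predicates):
def _sketch_count_persistent_targets():
    import pandas as pd

    assignment_spec = pd.DataFrame(
        {'target': ['auto_time', '_scratch', 'auto_time', 'transit_time']})

    def is_persistent(target):
        return not target.startswith('_')  # hypothetical stand-in predicate

    # assignment results are stored dict-like, so recurring targets overwrite earlier
    # values and only unique persistent names occupy memory simultaneously
    return len([t for t in assignment_spec.target.unique() if is_persistent(t)])  # -> 2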
def initialize_tvpb_calc_row_size(choosers, network_los, trace_label):
    """
    rows_per_chunk calculator for initialize_tvpb
    """

    sizer = chunk.RowSizeEstimator(trace_label)

    model_settings = \
        network_los.setting('TVPB_SETTINGS.tour_mode_choice.tap_tap_settings')
    attributes_as_columns = \
        network_los.setting('TVPB_SETTINGS.tour_mode_choice.tap_tap_settings.attributes_as_columns', [])

    # one element per chooser column
    sizer.add_elements(len(choosers.columns), 'choosers')

    # one element per attribute carried over as a chooser column
    sizer.add_elements(len(attributes_as_columns), 'attributes_as_columns')

    preprocessor_settings = model_settings.get('PREPROCESSOR')
    if preprocessor_settings:

        preprocessor_spec_name = preprocessor_settings.get('SPEC', None)

        if not preprocessor_spec_name.endswith(".csv"):
            preprocessor_spec_name = f'{preprocessor_spec_name}.csv'

        expressions_spec = assign.read_assignment_spec(
            config.config_file_path(preprocessor_spec_name))

        sizer.add_elements(expressions_spec.shape[0], 'preprocessor')

    spec = simulate.read_model_spec(file_name=model_settings['SPEC'])

    # expression_values for each spec row
    sizer.add_elements(spec.shape[0], 'expression_values')

    # utilities for each spec column
    sizer.add_elements(spec.shape[1], 'utilities')

    row_size = sizer.get_hwm()
    return row_size
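# A minimal sketch of the dotted-key lookup assumed above, where network_los.setting
# appears to walk a nested dict of YAML settings and fall back to a default when a
# key is absent (illustrative only; the real behavior belongs to the network_los
# object and may differ):
def _sketch_nested_setting(settings, dotted_key, default=None):
    node = settings
    for key in dotted_key.split('.'):
        if not isinstance(node, dict) or key not in node:
            return default
        node = node[key]
    return node

# e.g. _sketch_nested_setting({'a': {'b': 1}}, 'a.b') -> 1
#      _sketch_nested_setting({}, 'a.b', []) -> []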
def tour_scheduling_calc_row_size(tours, persons_merged, alternatives, skims, spec, model_settings, trace_label):

    # this will not be consistent across mandatory tours (highest), non_mandatory tours,
    # and atwork subtours (lowest)
    TIMETABLE_AVAILABILITY_REDUCTION_FACTOR = 1

    # this appears to be more stable
    LOGSUM_DUPLICATE_REDUCTION_FACTOR = 0.5

    sizer = chunk.RowSizeEstimator(trace_label)

    # chooser is tours merged with persons_merged
    chooser_row_size = len(tours.columns) + len(persons_merged.columns)

    # e.g. start, end, duration, <chooser_column>
    alt_row_size = alternatives.shape[1] + 1

    # non-available alternatives will be sliced out, so this is an over-estimate;
    # for atwork subtours this may be a gross over-estimate,
    # but that is presumably ok since we are adaptive
    sample_size = len(alternatives) * TIMETABLE_AVAILABILITY_REDUCTION_FACTOR

    sizer.add_elements(chooser_row_size, 'tours')  # tours merged with persons

    # alt_tdd tdd_interaction_dataset is a cross join of choosers with alternatives
    sizer.add_elements((chooser_row_size + alt_row_size) * sample_size, 'interaction_df')

    # eval_interaction_utilities is parsimonious and doesn't create a separate column
    # for each partial utility
    sizer.add_elements(sample_size, 'interaction_utilities')  # <- this is probably always the HWM

    sizer.drop_elements('interaction_df')
    sizer.drop_elements('interaction_utilities')

    sizer.add_elements(alt_row_size, 'utilities_df')
    sizer.add_elements(alt_row_size, 'probs')

    if 'LOGSUM_SETTINGS' in model_settings:

        logsum_settings = config.read_model_settings(model_settings['LOGSUM_SETTINGS'])
        logsum_spec = simulate.read_model_spec(file_name=logsum_settings['SPEC'])
        logsum_nest_spec = config.get_logit_model_settings(logsum_settings)

        if logsum_nest_spec is None:
            # expression_values for each spec row
            # utilities and probs for each alt
            logsum_columns = logsum_spec.shape[0] + (2 * logsum_spec.shape[1])
        else:
            # expression_values for each spec row
            # raw_utilities and base_probabilities for each alt
            # nested_exp_utilities and nested_probabilities for each nest
            # less 1 because nested_probabilities lacks the root
            nest_count = logit.count_nests(logsum_nest_spec)
            logsum_columns = logsum_spec.shape[0] + (2 * logsum_spec.shape[1]) + (2 * nest_count) - 1

        if USE_BRUTE_FORCE_TO_COMPUTE_LOGSUMS:
            sizer.add_elements(logsum_columns * sample_size, 'logsum_columns')
        else:
            # if USE_BRUTE_FORCE_TO_COMPUTE_LOGSUMS is false, compute_logsums prunes alt_tdd
            # to only compute logsums for unique (tour_id, out_period, in_period, duration)
            # in alt_tdd, which cuts the number of alts by roughly 50%
            # (44% for the 100 hh mtctm1 test dataset)
            # grep the log for USE_BRUTE_FORCE_TO_COMPUTE_LOGSUMS to check actual % savings
            sizer.add_elements(
                logsum_columns * sample_size * LOGSUM_DUPLICATE_REDUCTION_FACTOR, 'logsum_columns')

    row_size = sizer.get_hwm()

    if simulate.tvpb_skims(skims):
        # DISABLE_TVPB_OVERHEAD
        logger.debug("disable calc_row_size for THREE_ZONE with tap skims")
        return 0

    return row_size
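# Toy arithmetic (hypothetical sizes) restating the nested logsum_columns estimate
# above: a logsum spec with 20 expression rows and 4 alternative columns plus a
# 3-nest tree yields 20 + 2*4 + 2*3 - 1 = 33 columns per sampled alternative row.
def _sketch_nested_logsum_columns(spec_rows=20, alt_columns=4, nest_count=3):
    # expression_values per spec row, raw utilities and base probabilities per alt,
    # nested exp_utilities and nested probabilities per nest, less one for the root
    return spec_rows + (2 * alt_columns) + (2 * nest_count) - 1  # -> 33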