def create_mandatory_tours():

    # FIXME - move this to body?

    persons = inject.get_table('persons')
    configs_dir = inject.get_injectable('configs_dir')

    persons = persons.to_frame(columns=[
        "mandatory_tour_frequency",
        "is_worker",
        "school_taz",
        "workplace_taz"])
    persons = persons[~persons.mandatory_tour_frequency.isnull()]

    tour_frequency_alternatives = inject.get_injectable(
        'mandatory_tour_frequency_alternatives')

    tours = process_mandatory_tours(persons, tour_frequency_alternatives)

    expressions.assign_columns(
        df=tours,
        model_settings='annotate_tours_with_dest',
        configs_dir=configs_dir,
        trace_label='create_mandatory_tours')

    pipeline.extend_table("tours", tours)
    tracing.register_traceable_table('tours', tours)
    pipeline.get_rn_generator().add_channel(tours, 'tours')

def get_trace_target(df, slicer):
    """
    get target ids and column or index to identify target trace rows in df

    Parameters
    ----------
    df: pandas.DataFrame
        dataframe to slice
    slicer: str
        name of column or index to use for slicing

    Returns
    -------
    (target, column) tuple
        target : int or list of ints
            id or ids that identify tracer target rows
        column : str
            name of column to search for targets or None to search index
    """

    target_ids = None  # id or ids to slice by (e.g. hh_id or person_ids or tour_ids)
    column = None  # column name to slice on or None to slice on index

    # special do-not-slice code for dumping entire df
    if slicer == 'NONE':
        return target_ids, column

    if slicer is None:
        slicer = df.index.name

    if isinstance(df, pd.DataFrame):
        # always slice by household id if we can
        if 'household_id' in df.columns:
            slicer = 'household_id'
        if slicer in df.columns:
            column = slicer

    if column is None and df.index.name != slicer:
        raise RuntimeError("bad slicer '%s' for df with index '%s'" % (slicer, df.index.name))

    traceable_table_indexes = inject.get_injectable('traceable_table_indexes', {})
    traceable_table_ids = inject.get_injectable('traceable_table_ids', {})

    if df.empty:
        target_ids = None
    elif slicer in traceable_table_indexes:
        # maps 'person_id' to 'persons', etc
        table_name = traceable_table_indexes[slicer]
        target_ids = traceable_table_ids.get(table_name, [])
    elif slicer == 'TAZ':
        target_ids = inject.get_injectable('trace_od', [])

    return target_ids, column

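# Usage sketch for get_trace_target() above, with hypothetical ids and injectable contents
# (not part of the codebase): when the sliced frame carries a household_id column, it takes
# precedence over the requested slicer, and the registered household trace ids are returned.
inject.add_injectable('traceable_table_indexes', {'household_id': 'households'})
inject.add_injectable('traceable_table_ids', {'households': [10, 11]})

df = pd.DataFrame({'household_id': [10, 12]},
                  index=pd.Index([1, 2], name='person_id'))

target_ids, column = get_trace_target(df, slicer='person_id')
assert (target_ids, column) == ([10, 11], 'household_id')
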
def track_skim_usage(output_dir):
    """
    write statistics on skim usage (diagnostic to detect loading of un-needed skims)

    FIXME - have not yet implemented a facility to avoid loading of unused skims

    Parameters
    ----------
    output_dir: str
    """
    pd.options.display.max_columns = 500
    pd.options.display.max_rows = 100

    checkpoints = pipeline.get_checkpoints()
    tables = OrderedDict()

    skim_dict = inject.get_injectable('skim_dict')
    skim_stack = inject.get_injectable('skim_stack', None)

    mode = 'wb' if sys.version_info < (3,) else 'w'
    with open(config.output_file_path('skim_usage.txt'), mode) as output_file:

        print("\n### skim_dict usage", file=output_file)
        for key in skim_dict.usage:
            print(key, file=output_file)

        if skim_stack is None:

            unused_keys = {k for k in skim_dict.skim_info['omx_keys']} - \
                          {k for k in skim_dict.usage}

            print("\n### unused skim keys", file=output_file)
            for key in unused_keys:
                print(key, file=output_file)

        else:

            print("\n### skim_stack usage", file=output_file)
            for key in skim_stack.usage:
                print(key, file=output_file)

            unused = {k for k in skim_dict.skim_info['omx_keys'] if not isinstance(k, tuple)} - \
                     {k for k in skim_dict.usage if not isinstance(k, tuple)}

            print("\n### unused skim str keys", file=output_file)
            for key in unused:
                print(key, file=output_file)

            unused = {k[0] for k in skim_dict.skim_info['omx_keys'] if isinstance(k, tuple)} - \
                     {k[0] for k in skim_dict.usage if isinstance(k, tuple)} - \
                     {k for k in skim_stack.usage}

            print("\n### unused skim dim3 keys", file=output_file)
            for key in unused:
                print(key, file=output_file)

def config_logger(custom_config_file=None, basic=False):
    """
    Configure logger

    if log_config_file is not supplied then look for conf file in configs_dir

    if not found use basicConfig

    Parameters
    ----------
    custom_config_file: str
        custom config filename
    basic: boolean
        basic setup

    Returns
    -------
    Nothing
    """

    log_config_file = None

    if custom_config_file and os.path.isfile(custom_config_file):
        log_config_file = custom_config_file
    elif not basic:
        # look for conf file in configs_dir
        configs_dir = inject.get_injectable('configs_dir')
        default_config_file = os.path.join(configs_dir, LOGGING_CONF_FILE_NAME)
        if os.path.isfile(default_config_file):
            log_config_file = default_config_file

    if log_config_file:
        with open(log_config_file) as f:
            config_dict = yaml.load(f, Loader=yaml.SafeLoader)
            config_dict = config_dict['logging']
            config_dict.setdefault('version', 1)
            logging.config.dictConfig(config_dict)
    else:
        logging.basicConfig(level=logging.INFO, stream=sys.stdout)

    logger = logging.getLogger(ASIM_LOGGER)

    if custom_config_file and not os.path.isfile(custom_config_file):
        logger.error("#\n#\n#\nconfig_logger could not find conf file '%s'" % custom_config_file)

    if log_config_file:
        logger.info("Read logging configuration from: %s" % log_config_file)
    else:
        print("Configured logging using basicConfig")
        logger.info("Configured logging using basicConfig")

    output_dir = inject.get_injectable('output_dir')
    logger.debug("Deleting files in output_dir %s" % output_dir)
    delete_csv_files(output_dir)

def initialize_tours(network_los, households, persons, trace_hh_id):

    trace_label = 'initialize_tours'

    tours = read_input_table("tours")

    # FIXME can't use households_sliced injectable as flag like persons table does in case of resume_after.
    # FIXME could just always slice...
    slice_happened = inject.get_injectable('households_sample_size', 0) > 0
    if slice_happened:
        logger.info("slicing tours %s" % (tours.shape,))
        # keep all persons in the sampled households
        tours = tours[tours.person_id.isin(persons.index)]

    # annotate before patching tour_id to allow addition of REQUIRED_TOUR_COLUMNS defined above
    model_settings = config.read_model_settings('initialize_tours.yaml', mandatory=True)
    expressions.assign_columns(
        df=tours,
        model_settings=model_settings.get('annotate_tours'),
        trace_label=tracing.extend_trace_label(trace_label, 'annotate_tours'))

    skip_patch_tour_ids = model_settings.get('skip_patch_tour_ids', False)
    if not skip_patch_tour_ids:
        tours = patch_tour_ids(tours)
    assert tours.index.name == 'tour_id'

    # replace table function with dataframe
    inject.add_table('tours', tours)

    pipeline.get_rn_generator().add_channel('tours', tours)

    tracing.register_traceable_table('tours', tours)

    logger.debug(f"{len(tours.household_id.unique())} unique household_ids in tours")
    logger.debug(f"{len(households.index.unique())} unique household_ids in households")
    assert not tours.index.duplicated().any()

    tours_without_persons = ~tours.person_id.isin(persons.index)
    if tours_without_persons.any():
        logger.error(f"{tours_without_persons.sum()} tours out of {len(persons)} without persons\n"
                     f"{pd.Series({'person_id': tours_without_persons.index.values})}")
        raise RuntimeError(f"{tours_without_persons.sum()} tours with bad person_id")

    if trace_hh_id:
        tracing.trace_df(tours, label='initialize_tours', warn_if_empty=True)

def test_misc():

    inject.clear_cache()

    with pytest.raises(RuntimeError) as excinfo:
        inject.get_injectable("configs_dir")
    assert "directory does not exist" in str(excinfo.value)

    with pytest.raises(RuntimeError) as excinfo:
        inject.get_injectable("data_dir")
    assert "directory does not exist" in str(excinfo.value)

    with pytest.raises(RuntimeError) as excinfo:
        inject.get_injectable("output_dir")
    assert "directory does not exist" in str(excinfo.value)

    configs_dir = os.path.join(os.path.dirname(__file__), 'configs_test_misc')
    inject.add_injectable("configs_dir", configs_dir)

    settings = inject.get_injectable("settings")
    assert isinstance(settings, dict)

    data_dir = os.path.join(os.path.dirname(__file__), 'data')
    inject.add_injectable("data_dir", data_dir)

    # default values if not specified in settings
    assert inject.get_injectable("chunk_size") == 0

def preload_injectables():
    """
    preload bulky injectables up front - stuff that isn't inserted into the pipeline
    """

    logger.info("preload_injectables")

    t0 = tracing.print_elapsed_time()

    if inject.get_injectable('skim_dict', None) is not None:
        t0 = tracing.print_elapsed_time("preload skim_dict", t0, debug=True)

    if inject.get_injectable('skim_stack', None) is not None:
        t0 = tracing.print_elapsed_time("preload skim_stack", t0, debug=True)

def wrap_skims(model_settings):
    """
    wrap skims of trip destination using origin, dest column names from model settings.
    Several of these are used by destination_sample, compute_logsums, and destination_simulate
    so we create them all here with canonical names.

    Note that compute_logsums aliases their names so it can use the same equations to compute
    logsums from origin to alt_dest, and from alt_dest to primary destination

    odt_skims - SkimStackWrapper: trip origin, trip alt_dest, time_of_day
    dot_skims - SkimStackWrapper: trip alt_dest, trip origin, time_of_day
    dpt_skims - SkimStackWrapper: trip alt_dest, trip primary_dest, time_of_day
    pdt_skims - SkimStackWrapper: trip primary_dest, trip alt_dest, time_of_day
    od_skims - SkimDictWrapper: trip origin, trip alt_dest
    dp_skims - SkimDictWrapper: trip alt_dest, trip primary_dest

    Parameters
    ----------
    model_settings

    Returns
    -------
    dict containing skims, keyed by canonical names relative to tour orientation
    """

    skim_dict = inject.get_injectable('skim_dict')
    skim_stack = inject.get_injectable('skim_stack')

    o = model_settings['TRIP_ORIGIN']
    d = model_settings['ALT_DEST_COL_NAME']
    p = model_settings['PRIMARY_DEST']

    skims = {
        "odt_skims": skim_stack.wrap(left_key=o, right_key=d, skim_key='trip_period'),
        "dot_skims": skim_stack.wrap(left_key=d, right_key=o, skim_key='trip_period'),
        "dpt_skims": skim_stack.wrap(left_key=d, right_key=p, skim_key='trip_period'),
        "pdt_skims": skim_stack.wrap(left_key=p, right_key=d, skim_key='trip_period'),
        "od_skims": skim_dict.wrap(o, d),
        "dp_skims": skim_dict.wrap(d, p),
    }

    return skims

def base_settings_file_path(file_name):
    """
    FIXME - should be in configs

    Parameters
    ----------
    file_name

    Returns
    -------
    path to base settings file (raises RuntimeError if not found)
    """

    if not file_name.lower().endswith('.yaml'):
        file_name = '%s.yaml' % (file_name, )

    configs_dir = inject.get_injectable('configs_dir')
    configs_dir = [configs_dir] if isinstance(configs_dir, str) else configs_dir

    for dir in configs_dir:
        file_path = os.path.join(dir, file_name)
        if os.path.exists(file_path):
            return file_path

    raise RuntimeError("base_settings_file %s not found" % file_name)

def log_file_path(file_name):

    output_dir = inject.get_injectable('output_dir')

    # - check for optional log subfolder
    if os.path.exists(os.path.join(output_dir, 'log')):
        output_dir = os.path.join(output_dir, 'log')

    # - check for optional process name prefix
    prefix = inject.get_injectable('log_file_prefix', None)
    if prefix:
        file_name = "%s-%s" % (prefix, file_name)

    file_path = os.path.join(output_dir, file_name)

    return file_path

def get_tvpb_best_transit_time(self, orig, dest, tod):

    # FIXME lots of pathological knowledge here as we are only called by accessibility directly from expressions

    trace_label = tracing.extend_trace_label('accessibility.tvpb_best_time', tod)
    recipe = 'accessibility'
    path_type = 'WTW'

    with chunk.chunk_log(trace_label):
        result = \
            self.build_virtual_path(recipe, path_type, orig, dest, tod,
                                    demographic_segment=None,
                                    want_choices=False,
                                    trace_label=trace_label)

        trace_od = inject.get_injectable("trace_od", None)
        if trace_od:
            filter_targets = (orig == trace_od[0]) & (dest == trace_od[1])
            if filter_targets.any():
                self.build_virtual_path(recipe, path_type, orig, dest, tod,
                                        demographic_segment=None,
                                        want_choices=False,
                                        trace_label=trace_label,
                                        filter_targets=filter_targets, trace=True)

    return result

def track_skim_usage(output_dir):
    """
    write statistics on skim usage (diagnostic to detect loading of un-needed skims)

    FIXME - have not yet implemented a facility to avoid loading of unused skims

    FIXME - if resume_after, this will only reflect skims used after resume

    Parameters
    ----------
    output_dir: str
    """
    pd.options.display.max_columns = 500
    pd.options.display.max_rows = 100

    skim_dict = inject.get_injectable('skim_dict')

    mode = 'wb' if sys.version_info < (3, ) else 'w'
    with open(config.output_file_path('skim_usage.txt'), mode) as output_file:

        print("\n### skim_dict usage", file=output_file)
        for key in skim_dict.get_skim_usage():
            print(key, file=output_file)

        unused = set(k for k in skim_dict.skim_info.base_keys) - \
            set(k for k in skim_dict.get_skim_usage())

        print("\n### unused skim keys", file=output_file)
        for key in unused:
            print(key, file=output_file)

def test_full_run1():

    _MODELS = [
        'input_pre_processor',
        'setup_data_structures',
        'initial_seed_balancing',
        'meta_control_factoring',
        'final_seed_balancing',
        'integerize_final_seed_weights',
        'sub_balancing.geography=TRACT',
        'sub_balancing.geography=TAZ',
        'expand_households',
        'summarize',
        'write_tables',
        'write_synthetic_population',
    ]

    pipeline.run(models=_MODELS, resume_after=None)

    expanded_household_ids = pipeline.get_table('expanded_household_ids')
    assert isinstance(expanded_household_ids, pd.DataFrame)
    taz_hh_counts = expanded_household_ids.groupby('TAZ').size()
    assert len(taz_hh_counts) == TAZ_COUNT
    assert taz_hh_counts.loc[100] == TAZ_100_HH_COUNT

    # output_tables action: skip
    output_dir = inject.get_injectable('output_dir')
    assert not os.path.exists(os.path.join(output_dir, 'households.csv'))
    assert os.path.exists(os.path.join(output_dir, 'summary_DISTRICT_1.csv'))

    # tables will no longer be available after pipeline is closed
    pipeline.close_pipeline()

    inject.clear_cache()

def skim_dict(data_dir, settings):

    omx_file_path = config.data_file_path(settings["skims_file"])
    tags_to_load = settings['skim_time_periods']['labels']

    logger.info("loading skim_dict from %s" % (omx_file_path, ))

    # select the skims to load
    skim_info = get_skim_info(omx_file_path, tags_to_load)

    logger.debug("omx_shape %s skim_dtype %s" % (skim_info['omx_shape'], skim_info['dtype']))

    skim_buffers = inject.get_injectable('data_buffers', None)
    if skim_buffers:
        logger.info('Using existing skim_buffers for skims')
    else:
        skim_buffers = buffers_for_skims(skim_info, shared=False)
        load_skims(omx_file_path, skim_info, skim_buffers)

    skim_data = skim_data_from_buffers(skim_buffers, skim_info)

    block_names = list(skim_info['blocks'].keys())
    for i in range(len(skim_data)):
        block_name = block_names[i]
        block_data = skim_data[i]
        logger.info("block_name %s bytes %s (%s)" %
                    (block_name, block_data.nbytes, util.GB(block_data.nbytes)))

    # create skim dict
    skim_dict = skim.SkimDict(skim_data, skim_info)
    skim_dict.offset_mapper.set_offset_int(-1)

    return skim_dict

def person_max_window(persons):

    timetable = inject.get_injectable("timetable")

    # ndarray with one row per person and one column per time period
    # array value of 1 where free periods and 0 elsewhere
    s = pd.Series(persons.index.values, index=persons.index)
    available = timetable.individually_available(s)

    row_ids, start_pos, run_length, run_val = rle(available)

    # rle returns all runs, but we only care about runs of available (run_val == 1)
    target_rows = np.where(run_val == 1)
    row_ids = row_ids[target_rows]
    run_length = run_length[target_rows]

    df = pd.DataFrame({'row_ids': row_ids, 'run_length': run_length})

    # groupby index of row_ids matches the numpy row indexes of the availability ndarray,
    # but there may be missing values for persons with no available periods
    max_overlap = df.groupby('row_ids').run_length.max()

    # fill in any missing values to align with input arrays
    input_row_ids = np.arange(persons.shape[0])
    max_window = max_overlap.reindex(input_row_ids).fillna(0)

    # FIXME should we return series or ndarray?
    max_window.index = persons.index

    return max_window

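# Illustrative sketch (not part of the codebase) of the run-length-encoding step used by
# person_max_window() and p2p_time_window_overlap(). rle_sketch is a hypothetical stand-in,
# written only to show the (row_ids, start_pos, run_length, run_val) contract unpacked above
# and how the groupby-max recovers the longest available run per row.
import numpy as np
import pandas as pd


def rle_sketch(a):
    """return (row_ids, start_pos, run_length, run_val) for runs in each row of 2-D array a"""
    n_rows, n_cols = a.shape
    runs_row, runs_start, runs_len, runs_val = [], [], [], []
    for r in range(n_rows):
        row = a[r]
        # positions where the value changes, bracketed by the row ends
        change = np.flatnonzero(np.diff(row)) + 1
        starts = np.concatenate(([0], change))
        ends = np.concatenate((change, [n_cols]))
        runs_row.extend([r] * len(starts))
        runs_start.extend(starts)
        runs_len.extend(ends - starts)
        runs_val.extend(row[starts])
    return (np.array(runs_row), np.array(runs_start),
            np.array(runs_len), np.array(runs_val))


available = np.array([[0, 1, 1, 0, 1],
                      [1, 1, 1, 1, 0]])

row_ids, start_pos, run_length, run_val = rle_sketch(available)

# keep only runs of available periods, then take the longest run per row
target = run_val == 1
max_window = pd.DataFrame({'row_ids': row_ids[target],
                           'run_length': run_length[target]}) \
    .groupby('row_ids').run_length.max()

assert max_window.to_dict() == {0: 2, 1: 4}
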
def test_create_input_store(seed_households, data_dir):

    settings_yaml = """
        create_input_store: True
        input_table_list:
          - tablename: households
            h5_tablename: seed_households
            filename: households.csv
            index_col: household_id
            rename_columns:
              HHID: household_id
    """

    settings = yaml.load(settings_yaml, Loader=yaml.SafeLoader)
    inject.add_injectable('settings', settings)

    hh_file = os.path.join(data_dir, 'households.csv')
    seed_households.to_csv(hh_file, index=False)
    assert os.path.isfile(hh_file)

    df = input.read_input_table('households')
    assert df.index.name == 'household_id'

    output_store = os.path.join(inject.get_injectable('output_dir'), 'input_data.h5')
    assert os.path.exists(output_store)

    store_df = pd.read_hdf(output_store, 'seed_households')
    assert store_df.equals(seed_households)

def setting(key, default=None):

    settings = inject.get_injectable('settings')

    # explicit setting in settings file takes precedence
    s = settings.get(key, None)

    # if no setting, try injectable
    if s is None:
        s = inject.get_injectable(key, None)

    # otherwise fall back to supplied default
    if s is None:
        s = default

    return s

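# Minimal usage sketch for setting() above, with hypothetical values: an explicit entry in
# the settings file wins, then a same-named injectable, then the supplied default.
inject.add_injectable('settings', {'chunk_size': 100})
inject.add_injectable('households_sample_size', 50)

assert setting('chunk_size') == 100                 # from settings file
assert setting('households_sample_size') == 50      # falls back to injectable
assert setting('resume_after', default='_') == '_'  # falls back to supplied default
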
def test_mp_run():

    mp_configs_dir = os.path.join(os.path.dirname(__file__), 'configs_mp')
    configs_dir = os.path.join(os.path.dirname(__file__), 'configs')
    inject.add_injectable('configs_dir', [mp_configs_dir, configs_dir])

    output_dir = os.path.join(os.path.dirname(__file__), 'output')
    inject.add_injectable("output_dir", output_dir)

    data_dir = os.path.join(os.path.dirname(__file__), 'data')
    inject.add_injectable("data_dir", data_dir)

    tracing.config_logger()

    run_list = mp_tasks.get_run_list()
    mp_tasks.print_run_list(run_list)

    # do this after config.handle_standard_args, as command line args may override injectables
    injectables = ['data_dir', 'configs_dir', 'output_dir']
    injectables = {k: inject.get_injectable(k) for k in injectables}

    # pipeline.run(models=run_list['models'], resume_after=run_list['resume_after'])
    mp_tasks.run_multiprocess(run_list, injectables)

    pipeline.open_pipeline('_')
    regress_mini_auto()
    pipeline.close_pipeline()

def __init__(self, size_term_selector):

    # do this once so that size_terms can be requested for various segments (tour_type or purpose)
    land_use = inject.get_table('land_use')
    size_terms = inject.get_injectable('size_terms')
    self.destination_size_terms = \
        tour_destination_size_terms(land_use, size_terms, size_term_selector)

def get_tvpb_logsum(self, path_type, orig, dest, tod, demographic_segment,
                    want_choices, trace_label=None):

    # assume they have given us a more specific name (since there may be more than one active wrapper)
    trace_label = trace_label or 'get_tvpb_logsum'
    trace_label = tracing.extend_trace_label(trace_label, path_type)

    recipe = 'tour_mode_choice'

    with chunk.chunk_log(trace_label):
        logsum_df = \
            self.build_virtual_path(recipe, path_type, orig, dest, tod, demographic_segment,
                                    want_choices=want_choices,
                                    trace_label=trace_label)

        trace_hh_id = inject.get_injectable("trace_hh_id", None)
        if trace_hh_id:
            filter_targets = tracing.trace_targets(orig)
            # choices from preceding run (because random numbers)
            override_choices = logsum_df['path_num'] if want_choices else None
            if filter_targets.any():
                self.build_virtual_path(recipe, path_type, orig, dest, tod, demographic_segment,
                                        want_choices=want_choices,
                                        override_choices=override_choices,
                                        trace_label=trace_label,
                                        filter_targets=filter_targets, trace=True)

    return logsum_df

def preload_injectables():
    """
    preload bulky injectables up front - stuff that isn't inserted into the pipeline
    """

    logger.info("preload_injectables")

    inject.add_step('track_skim_usage', track_skim_usage)
    inject.add_step('write_data_dictionary', write_data_dictionary)
    inject.add_step('write_tables', write_tables)

    table_list = config.setting('input_table_list')

    # default ActivitySim table names and indices
    if table_list is None:
        logger.warning("No 'input_table_list' found in settings. This will be a "
                       "required setting in upcoming versions of ActivitySim.")

        new_settings = inject.get_injectable('settings')
        new_settings['input_table_list'] = DEFAULT_TABLE_LIST
        inject.add_injectable('settings', new_settings)

    t0 = tracing.print_elapsed_time()

    # FIXME - still want to do this?
    # if inject.get_injectable('skim_dict', None) is not None:
    #     t0 = tracing.print_elapsed_time("preload skim_dict", t0, debug=True)
    #
    # if inject.get_injectable('skim_stack', None) is not None:
    #     t0 = tracing.print_elapsed_time("preload skim_stack", t0, debug=True)

    return True

def shared_memory_size(data_buffers=None):
    """
    return total size of the multiprocessing shared memory block in data_buffers

    Returns
    -------
    shared_size : int
        total size in bytes of the arrays backed by the shared data buffers
    """

    shared_size = 0

    if data_buffers is None:
        data_buffers = inject.get_injectable('data_buffers', {})

    for k, data_buffer in data_buffers.items():
        try:
            obj = data_buffer.get_obj()
        except Exception:
            obj = data_buffer
        data = np.ctypeslib.as_array(obj)
        data_size = data.nbytes
        shared_size += data_size

    return shared_size

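# Minimal sketch of what shared_memory_size() measures, using a hypothetical
# multiprocessing.RawArray buffer of the kind stored in the 'data_buffers' injectable.
# (RawArray has no get_obj(), so the except branch above handles it directly.)
import multiprocessing

import numpy as np

buf = multiprocessing.RawArray('d', 1000)  # 1000 float64 slots
data = np.ctypeslib.as_array(buf)
assert data.nbytes == 8000

assert shared_memory_size({'skims': buf}) == 8000
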
def register_traceable_table(table_name, df):
    """
    Register traceable table

    Parameters
    ----------
    table_name: str
        name of table to register ('households', 'persons', 'trips', or 'tours')
    df: pandas.DataFrame
        traced dataframe

    Returns
    -------
    Nothing
    """

    trace_hh_id = inject.get_injectable("trace_hh_id", None)

    if trace_hh_id is None:
        return

    if table_name == 'households':
        register_households(df, trace_hh_id)
    elif table_name == 'persons':
        register_persons(df, trace_hh_id)
    elif table_name == 'trips':
        register_trips(df, trace_hh_id)
    elif table_name == 'tours':
        register_tours(df, trace_hh_id)

def get_skim_data(self, skim_tag, skim_info):
    """
    Read skim data from backing store and return it as a 3D ndarray quack-alike SkimData object

    Parameters
    ----------
    skim_tag: str
    skim_info: dict
        skim info for the skims to load

    Returns
    -------
    SkimData
    """

    data_buffers = inject.get_injectable('data_buffers', None)
    if data_buffers:
        # we assume any existing skim buffers will already have skim data loaded into them
        logger.info(f"get_skim_data {skim_tag} using existing shared skim_buffers for skims")
        skim_buffer = data_buffers[skim_tag]
    else:
        skim_buffer = self.allocate_skim_buffer(skim_info, shared=False)
        self.load_skims_to_buffer(skim_info, skim_buffer)

    skim_data = SkimData(self._skim_data_from_buffer(skim_info, skim_buffer))

    logger.info(f"get_skim_data {skim_tag} {type(skim_data).__name__} shape {skim_data.shape}")

    return skim_data

def preload_injectables():
    """
    called after pipeline is opened
    """

    # could simply list injectables as arguments, but this way we can report timing...
    logger.info("preload_injectables")

    t0 = tracing.print_elapsed_time()

    if inject.get_injectable('skim_dict', None) is not None:
        t0 = tracing.print_elapsed_time("preload skim_dict")

    if inject.get_injectable('skim_stack', None) is not None:
        t0 = tracing.print_elapsed_time("preload skim_stack")

def atwork_subtour_destination_sample(tours,
                                      persons_merged,
                                      atwork_subtour_destination_sample_spec,
                                      skim_dict,
                                      destination_size_terms,
                                      chunk_size,
                                      trace_hh_id):

    trace_label = 'atwork_subtour_location_sample'
    model_settings = inject.get_injectable('atwork_subtour_destination_settings')

    persons_merged = persons_merged.to_frame()

    tours = tours.to_frame()
    tours = tours[tours.tour_category == 'subtour']

    # merge persons into tours
    choosers = pd.merge(tours, persons_merged, left_on='person_id', right_index=True)

    alternatives = destination_size_terms.to_frame()

    constants = config.get_model_constants(model_settings)

    sample_size = model_settings["SAMPLE_SIZE"]
    alt_col_name = model_settings["ALT_COL_NAME"]
    chooser_col_name = 'workplace_taz'

    logger.info("Running atwork_subtour_location_sample with %d persons" % len(choosers))

    # create wrapper with keys for this lookup - in this case there is a workplace_taz
    # in the choosers and a TAZ in the alternatives which get merged during interaction
    # the skims will be available under the name "skims" for any @ expressions
    skims = skim_dict.wrap(chooser_col_name, 'TAZ')

    locals_d = {
        'skims': skims
    }
    if constants is not None:
        locals_d.update(constants)

    # FIXME - MEMORY HACK - only include columns actually used in spec
    chooser_columns = model_settings['SIMULATE_CHOOSER_COLUMNS']
    choosers = choosers[chooser_columns]

    choices = interaction_sample(
        choosers,
        alternatives,
        sample_size=sample_size,
        alt_col_name=alt_col_name,
        spec=atwork_subtour_destination_sample_spec,
        skims=skims,
        locals_d=locals_d,
        chunk_size=chunk_size,
        trace_label=trace_label)

    choices['person_id'] = choosers.person_id
    choices['workplace_taz'] = choosers.workplace_taz

    inject.add_table('atwork_subtour_destination_sample', choices)

def skims_for_logsums(tour_purpose, model_settings, trace_label):

    assert 'LOGSUM_SETTINGS' in model_settings

    network_los = inject.get_injectable('network_los')

    skim_dict = network_los.get_default_skim_dict()

    orig_col_name = 'home_zone_id'
    dest_col_name = model_settings.get('DESTINATION_FOR_TOUR_PURPOSE').get(tour_purpose)

    odt_skim_stack_wrapper = skim_dict.wrap_3d(orig_key=orig_col_name, dest_key=dest_col_name,
                                               dim3_key='out_period')
    dot_skim_stack_wrapper = skim_dict.wrap_3d(orig_key=dest_col_name, dest_key=orig_col_name,
                                               dim3_key='in_period')
    odr_skim_stack_wrapper = skim_dict.wrap_3d(orig_key=orig_col_name, dest_key=dest_col_name,
                                               dim3_key='in_period')
    dor_skim_stack_wrapper = skim_dict.wrap_3d(orig_key=dest_col_name, dest_key=orig_col_name,
                                               dim3_key='out_period')
    od_skim_stack_wrapper = skim_dict.wrap(orig_col_name, dest_col_name)

    skims = {
        "odt_skims": odt_skim_stack_wrapper,
        "dot_skims": dot_skim_stack_wrapper,
        "odr_skims": odr_skim_stack_wrapper,
        "dor_skims": dor_skim_stack_wrapper,
        "od_skims": od_skim_stack_wrapper,
        'orig_col_name': orig_col_name,
        'dest_col_name': dest_col_name,
    }

    if network_los.zone_system == los.THREE_ZONE:
        # fixme - is this a lightweight object?
        tvpb = network_los.tvpb

        tvpb_logsum_odt = tvpb.wrap_logsum(orig_key=orig_col_name, dest_key=dest_col_name,
                                           tod_key='out_period', segment_key='demographic_segment',
                                           trace_label=trace_label, tag='tvpb_logsum_odt')
        tvpb_logsum_dot = tvpb.wrap_logsum(orig_key=dest_col_name, dest_key=orig_col_name,
                                           tod_key='in_period', segment_key='demographic_segment',
                                           trace_label=trace_label, tag='tvpb_logsum_dot')

        skims.update({
            'tvpb_logsum_odt': tvpb_logsum_odt,
            'tvpb_logsum_dot': tvpb_logsum_dot
        })

    return skims

def initialize_traceable_tables():

    traceable_table_ids = inject.get_injectable('traceable_table_ids', {})
    if len(traceable_table_ids) > 0:
        logger.debug(f"initialize_traceable_tables resetting table_ids for "
                     f"{list(traceable_table_ids.keys())}")
    inject.add_injectable('traceable_table_ids', {})

def test_missing_table_list(data_dir):

    settings = inject.get_injectable('settings')
    assert isinstance(settings, dict)

    with pytest.raises(AssertionError) as excinfo:
        input.read_input_table('households')
    assert 'no input_table_list found' in str(excinfo.value)

def test_vts():

    inject.add_injectable("settings", {})

    # note: need 0 duration tour on one end of day to guarantee at least one available tour
    alts = pd.DataFrame({"start": [1, 1, 2, 3], "end": [1, 4, 5, 6]})
    alts['duration'] = alts.end - alts.start
    inject.add_injectable("tdd_alts", alts)

    current_tour_person_ids = pd.Series(['b', 'c'], index=['d', 'e'])

    previous_tour_by_personid = pd.Series([2, 2, 1], index=['a', 'b', 'c'])

    prev_tour_attrs = get_previous_tour_by_tourid(current_tour_person_ids,
                                                  previous_tour_by_personid,
                                                  alts)

    pdt.assert_series_equal(
        prev_tour_attrs.start_previous,
        pd.Series([2, 1], index=['d', 'e'], name='start_previous'))

    pdt.assert_series_equal(
        prev_tour_attrs.end_previous,
        pd.Series([5, 4], index=['d', 'e'], name='end_previous'))

    tours = pd.DataFrame({
        "person_id": [1, 1, 2, 3, 3],
        "tour_num": [1, 2, 1, 1, 2],
        "tour_type": ['x', 'x', 'x', 'x', 'x']
    })

    persons = pd.DataFrame({"income": [20, 30, 25]}, index=[1, 2, 3])
    inject.add_table('persons', persons)

    spec = pd.DataFrame({"Coefficient": [1.2]}, index=["income"])
    spec.index.name = "Expression"

    inject.add_injectable("check_for_variability", True)

    timetable = inject.get_injectable("timetable")

    tdd_choices = vectorize_tour_scheduling(
        tours, persons, alts, timetable,
        tour_segments={'spec': spec},
        tour_segment_col=None,
        model_settings={},
        chunk_size=0,
        trace_label='test_vts')

    # FIXME - dead reckoning regression
    # there's no real logic here - this is just what came out of the monte carlo
    # note that the result comes out ordered by the nth trips and not ordered
    # by the trip index. shrug?
    expected = [2, 2, 2, 0, 0]
    assert (tdd_choices.values == expected).all()

def load_shadow_price_calculator(model_settings):
    """
    Initialize ShadowPriceCalculator for model_selector (e.g. school or workplace)

    If multiprocessing, get the shared_data buffer to coordinate global_desired_size
    calculation across sub-processes

    Parameters
    ----------
    model_settings : dict

    Returns
    -------
    spc : ShadowPriceCalculator
    """

    num_processes = inject.get_injectable('num_processes', 1)

    model_selector = model_settings['MODEL_SELECTOR']

    # - get shared_data from data_buffers (if multiprocessing)
    data_buffers = inject.get_injectable('data_buffers', None)
    if data_buffers is not None:
        logger.info('Using existing data_buffers for shadow_price')

        # - shadow_pricing_info
        shadow_pricing_info = inject.get_injectable('shadow_pricing_info', None)
        if shadow_pricing_info is None:
            shadow_pricing_info = get_shadow_pricing_info()
            inject.add_injectable('shadow_pricing_info', shadow_pricing_info)

        # - extract data buffer and reshape as numpy array
        data, lock = \
            shadow_price_data_from_buffers(data_buffers, shadow_pricing_info, model_selector)
    else:
        assert num_processes == 1
        data = None  # ShadowPriceCalculator will allocate its own data
        lock = None

    # - ShadowPriceCalculator
    spc = ShadowPriceCalculator(
        model_settings,
        num_processes, data, lock)

    return spc

def read_raw_persons(households):

    df = read_input_table("persons")

    if inject.get_injectable('households_sliced', False):
        # keep all persons in the sampled households
        df = df[df.household_id.isin(households.index)]

    return df

def build_output_file_path(file_name, use_prefix=None):

    output_dir = inject.get_injectable('output_dir')

    if use_prefix:
        file_name = "%s-%s" % (use_prefix, file_name)

    file_path = os.path.join(output_dir, file_name)

    return file_path

def wrap_skims(model_settings):
    """
    wrap skims of trip destination using origin, dest column names from model settings.
    Several of these are used by destination_sample, compute_logsums, and destination_simulate
    so we create them all here with canonical names.

    Note that compute_logsums aliases their names so it can use the same equations to compute
    logsums from origin to alt_dest, and from alt_dest to primary destination

    odt_skims - SkimStackWrapper: trip origin, trip alt_dest, time_of_day
    dot_skims - SkimStackWrapper: trip alt_dest, trip origin, time_of_day
    dpt_skims - SkimStackWrapper: trip alt_dest, trip primary_dest, time_of_day
    pdt_skims - SkimStackWrapper: trip primary_dest, trip alt_dest, time_of_day
    od_skims - SkimDictWrapper: trip origin, trip alt_dest
    dp_skims - SkimDictWrapper: trip alt_dest, trip primary_dest

    Parameters
    ----------
    model_settings

    Returns
    -------
    dict containing skims, keyed by canonical names relative to tour orientation
    """

    skim_dict = inject.get_injectable('skim_dict')
    skim_stack = inject.get_injectable('skim_stack')

    o = model_settings['TRIP_ORIGIN']
    d = model_settings['ALT_DEST']
    p = model_settings['PRIMARY_DEST']

    skims = {
        "odt_skims": skim_stack.wrap(left_key=o, right_key=d, skim_key='trip_period'),
        "dot_skims": skim_stack.wrap(left_key=d, right_key=o, skim_key='trip_period'),
        "dpt_skims": skim_stack.wrap(left_key=d, right_key=p, skim_key='trip_period'),
        "pdt_skims": skim_stack.wrap(left_key=p, right_key=d, skim_key='trip_period'),
        "od_skims": skim_dict.wrap(o, d),
        "dp_skims": skim_dict.wrap(d, p),
    }

    return skims

def trace_file_path(file_name):

    output_dir = inject.get_injectable('output_dir')

    # - check for optional trace subfolder
    if os.path.exists(os.path.join(output_dir, 'trace')):
        output_dir = os.path.join(output_dir, 'trace')
    else:
        file_name = "trace.%s" % (file_name,)

    file_path = os.path.join(output_dir, file_name)

    return file_path

def test_mini_pipeline_run2():

    # the important thing here is that we should get
    # exactly the same results as for test_mini_pipeline_run
    # when we restart pipeline

    configs_dir = os.path.join(os.path.dirname(__file__), 'configs')
    setup_dirs(configs_dir)
    inject_settings(configs_dir, households_sample_size=HOUSEHOLDS_SAMPLE_SIZE)

    # should be able to get this BEFORE pipeline is opened
    checkpoints_df = pipeline.get_checkpoints()
    prev_checkpoint_count = len(checkpoints_df.index)

    # print "checkpoints_df\n", checkpoints_df[['checkpoint_name']]
    assert prev_checkpoint_count == 8

    pipeline.open_pipeline('auto_ownership_simulate')

    regress_mini_auto()

    # try to run a model already in pipeline
    with pytest.raises(RuntimeError) as excinfo:
        pipeline.run_model('auto_ownership_simulate')
    assert "run model 'auto_ownership_simulate' more than once" in str(excinfo.value)

    # and these new ones
    pipeline.run_model('cdap_simulate')
    pipeline.run_model('mandatory_tour_frequency')

    regress_mini_mtf()

    # should be able to get this before pipeline is closed (from existing open store)
    checkpoints_df = pipeline.get_checkpoints()
    assert len(checkpoints_df.index) == prev_checkpoint_count

    # - write list of override_hh_ids to override_hh_ids.csv in data for use in next test
    num_hh_ids = 10
    hh_ids = pipeline.get_table("households").head(num_hh_ids).index.values
    hh_ids = pd.DataFrame({'household_id': hh_ids})

    data_dir = inject.get_injectable('data_dir')
    hh_ids.to_csv(os.path.join(data_dir, 'override_hh_ids.csv'), index=False, header=True)

    pipeline.close_pipeline()
    inject.clear_cache()
    close_handlers()

def log_settings(injectables):

    settings = [
        'households_sample_size',
        'chunk_size',
        'multiprocess',
        'num_processes',
        'resume_after',
    ]

    for k in settings:
        logger.info("setting %s: %s" % (k, config.setting(k)))

    for k in injectables:
        logger.info("injectable %s: %s" % (k, inject.get_injectable(k)))

def delete_output_files(file_type, ignore=None, subdir=None):
    """
    Delete files of specified type in output directory

    Parameters
    ----------
    file_type: str
        file extension (e.g. 'csv') of files to delete
    ignore: list of str, optional
        file paths to leave untouched
    subdir: str, optional
        single subdirectory to search, instead of the defaults ('', 'log', 'trace')

    Returns
    -------
    Nothing
    """

    output_dir = inject.get_injectable('output_dir')

    directories = [subdir] if subdir else ['', 'log', 'trace']

    for subdir in directories:

        dir = os.path.join(output_dir, subdir) if subdir else output_dir

        if not os.path.exists(dir):
            continue

        if ignore:
            ignore = [os.path.realpath(p) for p in ignore]

        # logger.debug("Deleting %s files in output dir %s" % (file_type, dir))

        for the_file in os.listdir(dir):
            if the_file.endswith(file_type):
                file_path = os.path.join(dir, the_file)

                if ignore and os.path.realpath(file_path) in ignore:
                    logger.debug("delete_output_files ignoring %s" % file_path)
                    continue

                try:
                    if os.path.isfile(file_path):
                        os.unlink(file_path)
                except Exception as e:
                    print(e)

def p2p_time_window_overlap(p1_ids, p2_ids):
    """
    compute the maximum overlap of available time windows for pairs of persons

    Parameters
    ----------
    p1_ids : pandas.Series
        series of person ids (paired elementwise with p2_ids)
    p2_ids : pandas.Series
        series of person ids (same index as p1_ids)

    Returns
    -------
    max_overlap : pandas.Series
        length of the longest jointly-available run of time periods for each pair,
        indexed like p1_ids
    """

    timetable = inject.get_injectable("timetable")

    assert len(p1_ids) == len(p2_ids)
    # if series, ought to have same index
    assert (p1_ids.index == p2_ids.index).all()

    # ndarray with one row per p2p and one column per time period
    # array value of 1 where overlapping free periods and 0 elsewhere
    available = timetable.pairwise_available(p1_ids, p2_ids)

    row_ids, start_pos, run_length, run_val = rle(available)

    # rle returns all runs, but we only care about runs of available (run_val == 1)
    target_rows = np.where(run_val == 1)
    row_ids = row_ids[target_rows]
    run_length = run_length[target_rows]

    df = pd.DataFrame({'row_ids': row_ids, 'run_length': run_length})

    # groupby index of row_ids matches the numpy row indexes of timetable.pairwise_available
    # ndarray, but there may be missing values for any no-overlap person pairs
    max_overlap = df.groupby('row_ids').run_length.max()

    # fill in any missing values to align with input arrays
    input_row_ids = np.arange(len(p1_ids))
    max_overlap = max_overlap.reindex(input_row_ids).fillna(0)

    # FIXME should we return series or ndarray?
    max_overlap.index = p1_ids.index

    return max_overlap

def get_shadow_pricing_info():
    """
    return dict with info about dtype and shapes of desired and modeled size tables

    block shape is (num_zones, num_segments + 1)

    Returns
    -------
    shadow_pricing_info: dict
        dtype: <sp_dtype>,
        block_shapes: dict {<model_selector>: <block_shape>}
    """

    land_use = inject.get_table('land_use')
    size_terms = inject.get_injectable('size_terms')

    shadow_settings = config.read_model_settings('shadow_pricing.yaml')

    # shadow_pricing_models is dict of {<model_selector>: <model_name>}
    shadow_pricing_models = shadow_settings['shadow_pricing_models']

    blocks = OrderedDict()
    for model_selector in shadow_pricing_models:

        sp_rows = len(land_use)
        sp_cols = len(size_terms[size_terms.model_selector == model_selector])

        # extra tally column for TALLY_CHECKIN and TALLY_CHECKOUT semaphores
        blocks[block_name(model_selector)] = (sp_rows, sp_cols + 1)

    sp_dtype = np.int64

    shadow_pricing_info = {
        'dtype': sp_dtype,
        'block_shapes': blocks,
    }

    for k in shadow_pricing_info:
        logger.debug("shadow_pricing_info %s: %s" % (k, shadow_pricing_info.get(k)))

    return shadow_pricing_info

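# Hypothetical shape of the returned shadow_pricing_info, for a 25-zone land_use table
# with a 3-segment 'workplace' model and a 2-segment 'school' model (the block keys come
# from block_name(model_selector); the extra column holds the TALLY_CHECKIN/TALLY_CHECKOUT
# semaphores noted above):
#
#   {'dtype': numpy.int64,
#    'block_shapes': OrderedDict([(block_name('workplace'), (25, 4)),
#                                 (block_name('school'), (25, 3))])}
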
def get_cached_spec(hhsize):

    spec_name = cached_spec_name(hhsize)

    spec = inject.get_injectable(spec_name, None)
    if spec is not None:
        logger.info("build_cdap_spec returning cached injectable spec %s", spec_name)
        return spec

    # this is problematic for multiprocessing and since we delete csv files in output_dir
    # at the start of every run, doesn't provide any benefit in single-processing as the
    # cached spec will be available as an injectable to subsequent chunks

    # # try data dir
    # if os.path.exists(config.output_file_path(spec_name)):
    #     spec_path = config.output_file_path(spec_name)
    #     logger.info("build_cdap_spec reading cached spec %s from %s", spec_name, spec_path)
    #     return pd.read_csv(spec_path, index_col='Expression')

    return None

def cascading_input_file_path(file_name, dir_list_injectable_name, mandatory=True):

    dir_list = inject.get_injectable(dir_list_injectable_name)

    if isinstance(dir_list, str):
        dir_list = [dir_list]

    assert isinstance(dir_list, list)

    file_path = None
    for dir in dir_list:
        p = os.path.join(dir, file_name)
        if os.path.isfile(p):
            file_path = p
            break

    if mandatory and not file_path:
        raise RuntimeError("file_path %s: file '%s' not in %s" %
                           (dir_list_injectable_name, file_name, dir_list))

    return file_path

def read_settings_file(file_name, mandatory=True):

    def backfill_settings(settings, backfill):
        new_settings = backfill.copy()
        new_settings.update(settings)
        return new_settings

    configs_dir = inject.get_injectable('configs_dir')

    if isinstance(configs_dir, str):
        configs_dir = [configs_dir]
    assert isinstance(configs_dir, list)

    settings = {}
    for dir in configs_dir:
        file_path = os.path.join(dir, file_name)
        if os.path.exists(file_path):
            if settings:
                logger.debug("read settings for %s from %s" % (file_name, file_path))

            with open(file_path) as f:
                s = yaml.load(f, Loader=yaml.SafeLoader)
            settings = backfill_settings(settings, s)

            if s.get('inherit_settings', False):
                logger.debug("inherit_settings flag set for %s in %s" % (file_name, file_path))
                continue
            else:
                break

    if mandatory and not settings:
        raise RuntimeError("read_settings_file: no settings for '%s' in %s" %
                           (file_name, configs_dir))

    return settings

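# Usage sketch for read_settings_file() with a cascading two-directory configs_dir
# (hypothetical directories and file contents): settings found earlier in the list take
# precedence, and the search only continues past the first hit when that file sets
# inherit_settings: True.
#
#   configs_mp/settings.yaml:    inherit_settings: True
#                                multiprocess: True
#   configs/settings.yaml:       multiprocess: False
#                                chunk_size: 0
#
inject.add_injectable('configs_dir', ['configs_mp', 'configs'])
settings = read_settings_file('settings.yaml')
# -> {'inherit_settings': True, 'multiprocess': True, 'chunk_size': 0}
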
def interaction_trace_rows(interaction_df, choosers, sample_size=None):
    """
    Trace model design for interaction_simulate

    Parameters
    ----------
    interaction_df: pandas.DataFrame
        traced model_design dataframe
    choosers: pandas.DataFrame
        interaction_simulate choosers
        (needed to filter the model_design dataframe by traced hh or person id)
    sample_size: int or None
        int for constant sample size, or None if choosers have different numbers of alternatives

    Returns
    -------
    trace_rows : numpy.ndarray
        array of booleans to flag which rows in interaction_df to trace
    trace_ids : tuple (str, numpy.ndarray)
        column name and array of trace_ids mapping trace_rows to their target_id
        for use by trace_interaction_eval_results which needs to know target_id
        so it can create separate tables for each distinct target for readability
    """

    # slicer column name and id targets to use for chooser id added to model_design dataframe
    # currently we only ever slice by person_id, but that could change, so we check here...

    traceable_table_ids = inject.get_injectable('traceable_table_ids', {})

    if choosers.index.name == 'person_id' and 'persons' in traceable_table_ids:
        slicer_column_name = choosers.index.name
        targets = traceable_table_ids['persons']
    elif 'household_id' in choosers.columns and 'households' in traceable_table_ids:
        slicer_column_name = 'household_id'
        targets = traceable_table_ids['households']
    elif 'person_id' in choosers.columns and 'persons' in traceable_table_ids:
        slicer_column_name = 'person_id'
        targets = traceable_table_ids['persons']
    else:
        print(choosers.columns)
        raise RuntimeError("interaction_trace_rows don't know how to slice index '%s'" %
                           choosers.index.name)

    if sample_size is None:
        # if sample size not constant, we count on either
        # slicer column being in interaction_df
        # or index of interaction_df being same as choosers
        if slicer_column_name in interaction_df.columns:
            trace_rows = np.in1d(interaction_df[slicer_column_name], targets)
            trace_ids = interaction_df.loc[trace_rows, slicer_column_name].values
        else:
            assert interaction_df.index.name == choosers.index.name
            trace_rows = np.in1d(interaction_df.index, targets)
            trace_ids = interaction_df[trace_rows].index.values
    else:
        if slicer_column_name == choosers.index.name:
            trace_rows = np.in1d(choosers.index, targets)
            trace_ids = np.asanyarray(choosers[trace_rows].index)
        elif slicer_column_name == 'person_id':
            trace_rows = np.in1d(choosers['person_id'], targets)
            trace_ids = np.asanyarray(choosers[trace_rows].person_id)
        elif slicer_column_name == 'household_id':
            trace_rows = np.in1d(choosers['household_id'], targets)
            trace_ids = np.asanyarray(choosers[trace_rows].household_id)
        else:
            assert False

        # simply repeat if sample size is constant across choosers
        assert sample_size == len(interaction_df.index) / len(choosers.index)
        trace_rows = np.repeat(trace_rows, sample_size)
        trace_ids = np.repeat(trace_ids, sample_size)

    assert type(trace_rows) == np.ndarray
    assert type(trace_ids) == np.ndarray

    trace_ids = (slicer_column_name, trace_ids)

    return trace_rows, trace_ids

def vectorize_joint_tour_scheduling(
        joint_tours, joint_tour_participants,
        persons_merged,
        alts, spec,
        model_settings,
        chunk_size=0, trace_label=None):
    """
    Like vectorize_tour_scheduling but specifically for joint tours

    joint tours have a few peculiarities necessitating separate treatment:

    Timetable has to be initialized to set all timeperiods...

    Parameters
    ----------
    joint_tours : DataFrame
        DataFrame of joint tours containing tour attributes, as well as a household_id
        column to define the nth tour for each household.
    joint_tour_participants : DataFrame
        DataFrame of persons participating in the joint tours
    persons_merged : DataFrame
        DataFrame of persons containing attributes referenced by expressions in spec
    alts : DataFrame
        DataFrame of alternatives which represent time slots.  Will be passed to
        interaction_simulate in batches for each nth tour.
    spec : DataFrame
        The spec which will be passed to interaction_simulate.
        (or dict of specs keyed on tour_type if tour_types is not None)
    model_settings : dict

    Returns
    -------
    choices : Series
        A Series of choices where the index is the index of the joint_tours
        DataFrame and the values are the index of the alts DataFrame.
    persons_timetable : TimeTable
        timetable updated with joint tours (caller should replace_table for it to persist)
    """
    trace_label = tracing.extend_trace_label(trace_label, 'vectorize_joint_tour_scheduling')

    assert len(joint_tours.index) > 0
    assert 'tour_num' in joint_tours.columns
    assert 'tour_type' in joint_tours.columns

    timetable_window_id_col = None
    tour_owner_id_col = 'household_id'
    segment = None

    persons_timetable = inject.get_injectable("timetable")

    choice_list = []

    # keep a series of the most recent tours for each person
    # initialize with first trip from alts
    previous_tour_by_householdid = pd.Series(alts.index[0], index=joint_tours.household_id.unique())

    # tours must be scheduled in increasing trip_num order
    # second trip of type must be in group immediately following first
    # this ought to have been ensured when tours are created (tour_frequency.process_tours)

    # print "participant windows before scheduling\n", \
    #     persons_timetable.slice_windows_by_row_id(joint_tour_participants.person_id)

    for tour_num, nth_tours in joint_tours.groupby('tour_num', sort=True):

        tour_trace_label = tracing.extend_trace_label(trace_label, 'tour_%s' % (tour_num,))

        # no more than one tour per household per call to schedule_tours
        assert not nth_tours.household_id.duplicated().any()

        nth_participants = \
            joint_tour_participants[joint_tour_participants.tour_id.isin(nth_tours.index)]

        timetable = build_joint_tour_timetables(
            nth_tours, nth_participants,
            persons_timetable, alts)

        choices = \
            schedule_tours(nth_tours,
                           persons_merged, alts,
                           spec, segment,
                           model_settings,
                           timetable, timetable_window_id_col,
                           previous_tour_by_householdid, tour_owner_id_col,
                           chunk_size, tour_trace_label)

        # - update timetables of all joint tour participants
        persons_timetable.assign(
            nth_participants.person_id,
            reindex(choices, nth_participants.tour_id))

        choice_list.append(choices)

    choices = pd.concat(choice_list)

    # add the start, end, and duration from tdd_alts
    # assert (alts.index == list(range(alts.shape[0]))).all()
    tdd = pd.DataFrame(data=alts.values[choices.values],
                       columns=alts.columns,
                       index=choices.index)

    # tdd = alts.loc[choices]
    # tdd.index = choices.index

    # include the index of the choice in the tdd alts table
    tdd['tdd'] = choices

    # print "participant windows after scheduling\n", \
    #     persons_timetable.slice_windows_by_row_id(joint_tour_participants.person_id)

    return tdd, persons_timetable

def vectorize_tour_scheduling(tours, persons_merged, alts, spec, segment_col,
                              model_settings,
                              chunk_size=0, trace_label=None):
    """
    The purpose of this method is fairly straightforward - it takes tours
    and schedules them into time slots.  Alternatives should be specified so
    as to define those time slots (usually with start and end times).

    schedule_tours adds variables that can be used in the spec which have
    to do with the previous tours per person.  Every column in the
    alternatives table is appended with the suffix "_previous" and made
    available.  So if your alternatives table has columns for start and end,
    then start_previous and end_previous will be set to the start and end of
    the most recent tour for a person.  The first time through,
    start_previous and end_previous are undefined, so make sure to protect
    with a tour_num >= 2 in the variable computation.

    Parameters
    ----------
    tours : DataFrame
        DataFrame of tours containing tour attributes, as well as a person_id
        column to define the nth tour for each person.
    persons_merged : DataFrame
        DataFrame of persons containing attributes referenced by expressions in spec
    alts : DataFrame
        DataFrame of alternatives which represent time slots.  Will be passed to
        interaction_simulate in batches for each nth tour.
    spec : DataFrame
        The spec which will be passed to interaction_simulate.
        (or dict of specs keyed on tour_type if tour_types is not None)
    model_settings : dict

    Returns
    -------
    choices : Series
        A Series of choices where the index is the index of the tours
        DataFrame and the values are the index of the alts DataFrame.
    timetable : TimeTable
        persons timetable updated with tours (caller should replace_table for it to persist)
    """

    trace_label = tracing.extend_trace_label(trace_label, 'vectorize_tour_scheduling')

    assert len(tours.index) > 0
    assert 'tour_num' in tours.columns
    assert 'tour_type' in tours.columns

    # tours must be scheduled in increasing trip_num order
    # second trip of type must be in group immediately following first
    # this ought to have been ensured when tours are created (tour_frequency.process_tours)

    timetable = inject.get_injectable("timetable")
    choice_list = []

    # keep a series of the most recent tours for each person
    # initialize with first trip from alts
    previous_tour_by_personid = pd.Series(alts.index[0], index=tours.person_id.unique())

    timetable_window_id_col = 'person_id'
    tour_owner_id_col = 'person_id'

    # no more than one tour per person per call to schedule_tours
    # tours must be scheduled in increasing trip_num order
    # second trip of type must be in group immediately following first
    # segregate scheduling by tour_type if multiple specs passed in dict keyed by tour_type

    for tour_num, nth_tours in tours.groupby('tour_num', sort=True):

        tour_trace_label = tracing.extend_trace_label(trace_label, 'tour_%s' % (tour_num,))

        if isinstance(spec, dict):

            assert segment_col is not None

            for spec_segment in spec:

                segment_trace_label = tracing.extend_trace_label(tour_trace_label, spec_segment)

                in_segment = nth_tours[segment_col] == spec_segment

                if not in_segment.any():
                    logger.info("skipping empty segment %s" % spec_segment)
                    continue

                # assume segmentation of spec and logsum coefficients are aligned
                logsum_tour_purpose = spec_segment

                choices = \
                    schedule_tours(nth_tours[in_segment],
                                   persons_merged, alts,
                                   spec[spec_segment], logsum_tour_purpose,
                                   model_settings,
                                   timetable, timetable_window_id_col,
                                   previous_tour_by_personid, tour_owner_id_col,
                                   chunk_size,
                                   segment_trace_label)

                choice_list.append(choices)

        else:
            # unsegmented spec dict indicates no logsums
            # caller could use a single-element spec dict if logsum support is desired,
            # but this case is not required for mtctm1

            assert segment_col is None
            logsum_segment = None

            choices = \
                schedule_tours(nth_tours,
                               persons_merged, alts,
                               spec, logsum_segment,
                               model_settings,
                               timetable, timetable_window_id_col,
                               previous_tour_by_personid, tour_owner_id_col,
                               chunk_size,
                               tour_trace_label)

            choice_list.append(choices)

    choices = pd.concat(choice_list)

    # add the start, end, and duration from tdd_alts
    # use np instead of (slower) loc[] since alts has rangeindex
    tdd = pd.DataFrame(data=alts.values[choices.values],
                       columns=alts.columns,
                       index=choices.index)

    # tdd = alts.loc[choices]
    # tdd.index = choices.index

    # include the index of the choice in the tdd alts table
    tdd['tdd'] = choices

    return tdd, timetable

def _compute_logsums(alt_tdd, tours_merged, tour_purpose, model_settings, trace_label):
    """
    compute logsums for tours using skims for alt_tdd out_period and in_period
    """

    trace_label = tracing.extend_trace_label(trace_label, 'logsums')

    logsum_settings = config.read_model_settings(model_settings['LOGSUM_SETTINGS'])

    choosers = alt_tdd.join(tours_merged, how='left', rsuffix='_chooser')
    logger.info("%s compute_logsums for %d choosers %d alts" %
                (trace_label, choosers.shape[0], alt_tdd.shape[0]))

    # - setup skims

    skim_dict = inject.get_injectable('skim_dict')
    skim_stack = inject.get_injectable('skim_stack')

    orig_col_name = 'TAZ'
    dest_col_name = model_settings.get('DESTINATION_FOR_TOUR_PURPOSE').get(tour_purpose)

    odt_skim_stack_wrapper = skim_stack.wrap(left_key=orig_col_name, right_key=dest_col_name,
                                             skim_key='out_period')
    dot_skim_stack_wrapper = skim_stack.wrap(left_key=dest_col_name, right_key=orig_col_name,
                                             skim_key='in_period')
    od_skim_stack_wrapper = skim_dict.wrap(orig_col_name, dest_col_name)

    skims = {
        "odt_skims": odt_skim_stack_wrapper,
        "dot_skims": dot_skim_stack_wrapper,
        "od_skims": od_skim_stack_wrapper,
        'orig_col_name': orig_col_name,
        'dest_col_name': dest_col_name,
    }

    # - locals_dict
    constants = config.get_model_constants(logsum_settings)

    omnibus_coefficient_spec = get_coeffecients_spec(logsum_settings)
    coefficient_spec = omnibus_coefficient_spec[tour_purpose]
    coefficients = assign.evaluate_constants(coefficient_spec, constants=constants)

    locals_dict = {}
    locals_dict.update(coefficients)
    locals_dict.update(constants)
    locals_dict.update(skims)

    # - run preprocessor to annotate choosers
    # allow specification of alternate preprocessor for nontour choosers
    preprocessor = model_settings.get('LOGSUM_PREPROCESSOR', 'preprocessor')
    preprocessor_settings = logsum_settings[preprocessor]

    if preprocessor_settings:

        simulate.set_skim_wrapper_targets(choosers, skims)

        expressions.assign_columns(
            df=choosers,
            model_settings=preprocessor_settings,
            locals_dict=locals_dict,
            trace_label=trace_label)

    # - compute logsums
    logsum_spec = get_logsum_spec(logsum_settings)
    nest_spec = config.get_logit_model_settings(logsum_settings)

    logsums = simulate.simple_simulate_logsums(
        choosers,
        logsum_spec,
        nest_spec,
        skims=skims,
        locals_d=locals_dict,
        chunk_size=0,
        trace_label=trace_label)

    return logsums

def run_trip_destination(
        trips,
        tours_merged,
        chunk_size, trace_hh_id,
        trace_label):
    """
    trip destination - main functionality separated from model step so it can be called iteratively

    Run the trip_destination model, assigning destinations for each (intermediate) trip
    (last trips already have a destination - either the tour primary destination or Home)

    Set trip destination and origin columns, and a boolean failed flag for any failed trips
    (destination for flagged failed trips will be set to -1)

    Parameters
    ----------
    trips
    tours_merged
    chunk_size
    trace_hh_id
    trace_label

    Returns
    -------
    """

    model_settings = config.read_model_settings('trip_destination.yaml')
    preprocessor_settings = model_settings.get('preprocessor', None)
    logsum_settings = config.read_model_settings(model_settings['LOGSUM_SETTINGS'])

    land_use = inject.get_table('land_use')
    size_terms = inject.get_injectable('size_terms')

    # - initialize trip origin and destination to those of half-tour
    # (we will sequentially adjust intermediate trips origin and destination as we choose them)
    tour_destination = reindex(tours_merged.destination, trips.tour_id).astype(int)
    tour_origin = reindex(tours_merged.origin, trips.tour_id).astype(int)
    trips['destination'] = np.where(trips.outbound, tour_destination, tour_origin)
    trips['origin'] = np.where(trips.outbound, tour_origin, tour_destination)
    trips['failed'] = False

    trips = trips.sort_index()
    trips['next_trip_id'] = np.roll(trips.index, -1)
    trips.next_trip_id = trips.next_trip_id.where(trips.trip_num < trips.trip_count, 0)

    # - filter tours_merged (AFTER copying destination and origin columns to trips)
    # tours_merged is used for logsums, we filter it here upfront to save space and time
    tours_merged_cols = logsum_settings['TOURS_MERGED_CHOOSER_COLUMNS']
    if 'REDUNDANT_TOURS_MERGED_CHOOSER_COLUMNS' in model_settings:
        redundant_cols = model_settings['REDUNDANT_TOURS_MERGED_CHOOSER_COLUMNS']
        tours_merged_cols = [c for c in tours_merged_cols if c not in redundant_cols]
    tours_merged = tours_merged[tours_merged_cols]

    # - skims
    skims = wrap_skims(model_settings)

    # - size_terms and alternatives
    alternatives = tour_destination_size_terms(land_use, size_terms, 'trip')

    # DataFrameMatrix allows us to treat the dataframe as a virtual 2-D array, indexed by TAZ, purpose
    # e.g. size_terms.get(df.dest_taz, df.purpose)
    # returns a series of size_terms for each chooser's dest_taz and purpose with chooser index
    size_term_matrix = DataFrameMatrix(alternatives)

    # don't need size terms in alternatives, just TAZ index
    alternatives = alternatives.drop(alternatives.columns, axis=1)
    alternatives.index.name = model_settings['ALT_DEST']

    # - process intermediate trips in ascending trip_num order
    intermediate = trips.trip_num < trips.trip_count
    if intermediate.any():

        first_trip_num = trips[intermediate].trip_num.min()
        last_trip_num = trips[intermediate].trip_num.max()

        # iterate over trips in ascending trip_num order
        for trip_num in range(first_trip_num, last_trip_num + 1):

            nth_trips = trips[intermediate & (trips.trip_num == trip_num)]
            nth_trace_label = tracing.extend_trace_label(trace_label, 'trip_num_%s' % trip_num)

            # - annotate nth_trips
            if preprocessor_settings:
                expressions.assign_columns(
                    df=nth_trips,
                    model_settings=preprocessor_settings,
                    locals_dict=config.get_model_constants(model_settings),
                    trace_label=nth_trace_label)

            logger.info("Running %s with %d trips", nth_trace_label, nth_trips.shape[0])

            # - choose destination for nth_trips, segmented by primary_purpose
            choices_list = []
            for primary_purpose, trips_segment in nth_trips.groupby('primary_purpose'):
                choices = choose_trip_destination(
                    primary_purpose,
                    trips_segment,
                    alternatives,
                    tours_merged,
                    model_settings,
                    size_term_matrix, skims,
                    chunk_size, trace_hh_id,
                    trace_label=tracing.extend_trace_label(nth_trace_label, primary_purpose))

                choices_list.append(choices)

            destinations = pd.concat(choices_list)

            failed_trip_ids = nth_trips.index.difference(destinations.index)
            if failed_trip_ids.any():
                logger.warning("%s sidelining %s trips without viable destination alternatives" %
                               (nth_trace_label, failed_trip_ids.shape[0]))
                next_trip_ids = nth_trips.next_trip_id.reindex(failed_trip_ids)
                trips.loc[failed_trip_ids, 'failed'] = True
                trips.loc[failed_trip_ids, 'destination'] = -1
                trips.loc[next_trip_ids, 'origin'] = trips.loc[failed_trip_ids].origin.values

            # - assign choices to these trips destinations and to next trips origin
            assign_in_place(trips, destinations.to_frame('destination'))
            destinations.index = nth_trips.next_trip_id.reindex(destinations.index)
            assign_in_place(trips, destinations.to_frame('origin'))

    del trips['next_trip_id']

    return trips

def override_setting(key, value):

    new_settings = inject.get_injectable('settings')
    new_settings[key] = value
    inject.add_injectable('settings', new_settings)

def setting(key, default=None):
    return inject.get_injectable('settings').get(key, default)

def add_size_tables():
    """
    inject tour_destination_size_terms tables for each model_selector (e.g. school, workplace)

    Size tables are pandas dataframes with location counts for model_selector by zone and segment
    tour_destination_size_terms

    if using shadow pricing, we scale size_table counts to sample population
    (in which case, they have to be created while single-process)

    Scaling is problematic as it breaks household result replicability across sample sizes

    It also changes the magnitude of the size terms so if they are used as utilities in
    expression files, their importance will diminish relative to other utilities as the sample
    size decreases.

    Scaling makes most sense for a full sample in conjunction with shadow pricing, where
    shadow prices can be adjusted iteratively to bring modelled counts into line with
    desired (size table) counts.
    """

    use_shadow_pricing = bool(config.setting('use_shadow_pricing'))

    shadow_settings = config.read_model_settings('shadow_pricing.yaml')
    shadow_pricing_models = shadow_settings['shadow_pricing_models']

    # probably ought not scale if not shadow_pricing (breaks partial sample replicability)
    # but this allows compatibility with existing CTRAMP behavior...
    scale_size_table = shadow_settings.get('SCALE_SIZE_TABLE', False)

    if shadow_pricing_models is None:
        logger.warning('shadow_pricing_models list not found in shadow_pricing settings')
        return

    # shadow_pricing_models is dict of {<model_selector>: <model_name>}
    # since these are scaled to model size, they have to be created while single-process

    for model_selector, model_name in iteritems(shadow_pricing_models):

        model_settings = config.read_model_settings(model_name)

        assert model_selector == model_settings['MODEL_SELECTOR']

        segment_ids = model_settings['SEGMENT_IDS']
        chooser_table_name = model_settings['CHOOSER_TABLE_NAME']
        chooser_segment_column = model_settings['CHOOSER_SEGMENT_COLUMN_NAME']

        choosers_df = inject.get_table(chooser_table_name).to_frame()
        if 'CHOOSER_FILTER_COLUMN_NAME' in model_settings:
            choosers_df = \
                choosers_df[choosers_df[model_settings['CHOOSER_FILTER_COLUMN_NAME']] != 0]

        # - raw_desired_size
        land_use = inject.get_table('land_use')
        size_terms = inject.get_injectable('size_terms')
        raw_size = tour_destination_size_terms(land_use, size_terms, model_selector)
        assert set(raw_size.columns) == set(segment_ids.keys())

        if use_shadow_pricing or scale_size_table:

            inject.add_table('raw_' + size_table_name(model_selector), raw_size)

            # - scale size_table counts to sample population
            # scaled_size = zone_size * (total_segment_modeled / total_segment_desired)

            # segment scale factor (modeled / desired) keyed by segment_name
            segment_scale_factors = {}
            for c in raw_size:
                # number of zone demographics desired destination choices
                segment_desired_size = raw_size[c].astype(np.float64).sum()

                # number of synthetic population choosers in segment
                segment_chooser_count = \
                    (choosers_df[chooser_segment_column] == segment_ids[c]).sum()

                segment_scale_factors[c] = \
                    segment_chooser_count / np.maximum(segment_desired_size, 1)

                logger.info("add_desired_size_tables %s segment %s "
                            "desired %s modeled %s scale_factor %s" %
                            (chooser_table_name, c,
                             segment_desired_size,
                             segment_chooser_count,
                             segment_scale_factors[c]))

            # FIXME - should we be rounding?
            scaled_size = (raw_size * segment_scale_factors).round()
        else:
            scaled_size = raw_size

        inject.add_table(size_table_name(model_selector), scaled_size)

def pipeline_file_path(file_name):

    prefix = inject.get_injectable('pipeline_file_prefix', None)
    return build_output_file_path(file_name, use_prefix=prefix)