def accessibility(land_use):
    """
    If 'accessibility' is in the input_tables list, then read it in;
    otherwise create a skeleton table with the same index as land_use.

    This allows loading of a pre-computed accessibility table, which is particularly useful
    for single-process small household sample runs when there are many zones in land_use.

    The skeleton table is only required if multiprocessing wants to slice accessibility;
    otherwise it will simply be replaced when the accessibility model is run.
    """

    accessibility_df = read_input_table("accessibility", required=False)

    if accessibility_df is None:
        accessibility_df = pd.DataFrame(index=land_use.index)
        logger.info("created placeholder accessibility table %s" % (accessibility_df.shape, ))
    else:
        assert accessibility_df.sort_index().index.equals(land_use.to_frame().sort_index().index), \
            "loaded accessibility table index does not match index of land_use table"
        logger.info("loaded accessibility table %s" % (accessibility_df.shape, ))

    # replace table function with dataframe
    inject.add_table('accessibility', accessibility_df)

    return accessibility_df
def test_vts():

    inject.add_injectable("settings", {})

    # note: need 0 duration tour on one end of day to guarantee at least one available tour
    alts = pd.DataFrame({
        "start": [1, 1, 2, 3],
        "end": [1, 4, 5, 6]
    })
    alts['duration'] = alts.end - alts.start

    inject.add_injectable("tdd_alts", alts)

    current_tour_person_ids = pd.Series(['b', 'c'], index=['d', 'e'])

    previous_tour_by_personid = pd.Series([2, 2, 1], index=['a', 'b', 'c'])

    prev_tour_attrs = get_previous_tour_by_tourid(current_tour_person_ids,
                                                  previous_tour_by_personid,
                                                  alts)

    pdt.assert_series_equal(
        prev_tour_attrs.start_previous,
        pd.Series([2, 1], index=['d', 'e'], name='start_previous'))

    pdt.assert_series_equal(
        prev_tour_attrs.end_previous,
        pd.Series([5, 4], index=['d', 'e'], name='end_previous'))

    tours = pd.DataFrame({
        "person_id": [1, 1, 2, 3, 3],
        "tour_num": [1, 2, 1, 1, 2],
        "tour_type": ['x', 'x', 'x', 'x', 'x']
    })

    persons = pd.DataFrame({
        "income": [20, 30, 25]
    }, index=[1, 2, 3])

    inject.add_table('persons', persons)

    spec = pd.DataFrame({"Coefficient": [1.2]}, index=["income"])
    spec.index.name = "Expression"

    segment_col = None  # no segmentation of model_spec

    inject.add_injectable("check_for_variability", True)

    tdd_choices, timetable = vectorize_tour_scheduling(
        tours, persons, alts, spec, segment_col,
        model_settings={},
        chunk_size=0, trace_label='test_vts')

    # FIXME - dead reckoning regression
    # there's no real logic here - this is just what came out of the monte carlo
    # note that the result comes out ordered by the nth trips and not ordered
    # by the trip index. shrug?
    expected = [2, 2, 2, 0, 0]
    assert (tdd_choices.tdd.values == expected).all()
def step2():

    table_name = inject.get_step_arg('table_name')
    assert table_name is not None

    table2 = pd.DataFrame({'column1': [10, 20, 30]})
    inject.add_table(table_name, table2)
def person_windows(persons, tdd_alts):

    df = tt.create_timetable_windows(persons, tdd_alts)

    inject.add_table('person_windows', df)

    return df
def atwork_subtour_destination_sample(tours,
                                      persons_merged,
                                      atwork_subtour_destination_sample_spec,
                                      skim_dict,
                                      destination_size_terms,
                                      chunk_size,
                                      trace_hh_id):

    trace_label = 'atwork_subtour_location_sample'
    model_settings = inject.get_injectable('atwork_subtour_destination_settings')

    persons_merged = persons_merged.to_frame()

    tours = tours.to_frame()
    tours = tours[tours.tour_category == 'subtour']

    # merge persons into tours
    choosers = pd.merge(tours, persons_merged, left_on='person_id', right_index=True)

    alternatives = destination_size_terms.to_frame()

    constants = config.get_model_constants(model_settings)

    sample_size = model_settings["SAMPLE_SIZE"]
    alt_col_name = model_settings["ALT_COL_NAME"]
    chooser_col_name = 'workplace_taz'

    logger.info("Running atwork_subtour_location_sample with %d persons" % len(choosers))

    # create wrapper with keys for this lookup - in this case there is a workplace_taz
    # in the choosers and a TAZ in the alternatives which get merged during interaction
    # the skims will be available under the name "skims" for any @ expressions
    skims = skim_dict.wrap(chooser_col_name, 'TAZ')

    locals_d = {
        'skims': skims
    }
    if constants is not None:
        locals_d.update(constants)

    # FIXME - MEMORY HACK - only include columns actually used in spec
    chooser_columns = model_settings['SIMULATE_CHOOSER_COLUMNS']
    choosers = choosers[chooser_columns]

    choices = interaction_sample(
        choosers,
        alternatives,
        sample_size=sample_size,
        alt_col_name=alt_col_name,
        spec=atwork_subtour_destination_sample_spec,
        skims=skims,
        locals_d=locals_d,
        chunk_size=chunk_size,
        trace_label=trace_label)

    choices['person_id'] = choosers.person_id
    choices['workplace_taz'] = choosers.workplace_taz

    inject.add_table('atwork_subtour_destination_sample', choices)
def land_use(store):

    df = store["land_use/taz_data"]

    logger.info("loaded land_use %s" % (df.shape, ))

    # replace table function with dataframe
    inject.add_table('land_use', df)

    return df
def create_households(trace_hh_id):

    df = pd.DataFrame({
        'household_id': [1, 2, 3],
        'home_zone_id': [100, 100, 101]
    })
    inject.add_table('households', df)

    pipeline.get_rn_generator().add_channel('households', df)

    tracing.register_traceable_table('households', df)
def land_use():

    df = read_input_table("land_use_taz")

    logger.info("loaded land_use %s" % (df.shape,))

    df.index.name = 'TAZ'

    # replace table function with dataframe
    inject.add_table('land_use', df)

    return df
def input_pre_processor():
    """
    Read input text files and save them as pipeline tables for use in subsequent steps.

    The files to read are specified by table_list, an array of dicts that specify the
    input file name and the name of the pipeline table, along with keys that allow the
    specification of pre-processing steps.

    By default, reads table_list from 'input_table_list' in settings.yaml, unless an
    alternate table_list name is specified as a model step argument 'table_list'.
    (This allows alternate/additional input files to be read for repop.)

    In the case of repop, this step is being run after an initial run has completed,
    in which case the input_table_list may specify replacement tables.
    (e.g. lowest geography controls that will replace the previous low controls dataframe.)

    See input_table_list in settings.yaml in the example folder for a working example.

    +--------------+--------------------------------------------------------+
    | key          | description                                            |
    +==============+========================================================+
    | tablename    | name of pipeline table in which to store dataframe    |
    +--------------+--------------------------------------------------------+
    | filename     | name of csv file to read (in data_dir)                 |
    +--------------+--------------------------------------------------------+
    | column_map   | list of input columns to rename from_name: to_name    |
    +--------------+--------------------------------------------------------+
    | index_col    | name of column to set as dataframe index column       |
    +--------------+--------------------------------------------------------+
    | drop_columns | list of column names of columns to drop               |
    +--------------+--------------------------------------------------------+
    """

    # alternate table list name may have been provided as a model argument
    table_list_name = inject.get_step_arg('table_list', default='input_table_list')
    table_list = config.setting(table_list_name)

    assert table_list is not None, "no table list '%s' found in settings." % table_list_name

    logger.info('Using table list: %s' % table_list)

    for table_info in table_list:
        tablename = table_info.get('tablename')
        df = input.read_from_table_info(table_info)

        logger.info('registering table %s' % tablename)

        # add (or replace) pipeline table
        repop = inject.get_step_arg('repop', default=False)
        inject.add_table(tablename, df, replace=repop)
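# Illustrative sketch only (not a working configuration): one plausible entry from the
# 'input_table_list' setting read by input_pre_processor, expressed as the equivalent
# Python dict. The file name and column names are hypothetical; see settings.yaml in the
# example folder for real entries.
example_table_info = {
    'tablename': 'households',               # pipeline table in which to store the dataframe
    'filename': 'households.csv',            # csv file to read (in data_dir)
    'index_col': 'household_id',             # column to set as dataframe index
    'column_map': {'HHID': 'household_id'},  # rename input columns from_name: to_name
    'drop_columns': ['unused_column'],       # columns to drop after reading
}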
def create_controls_table(settings, configs_dir):

    expression_file_path = os.path.join(configs_dir, settings['controls_expression_file'])
    spec = read_spec(expression_file_path)

    df_list = []
    for county in settings['counties']:
        df = get_acs_data(county, spec, settings)
        df_list.append(df)

    acs_table = pd.concat(df_list)
    acs_table.reset_index(inplace=True)
    inject.add_table('all_acs', acs_table)

    controls_table = create_controls(spec)
    inject.add_table('combined_acs', controls_table)

    print('done')
def zone_data():
    """
    Pipeline table containing zone info. Specify with 'input_table_list' in settings.yaml.
    Must contain columns for at least zone id, latitude, and longitude.
    """
    df = read_input_table('zone_data')

    logger.info('loaded zone data %s' % (df.shape,))

    # replace table function with dataframe
    inject.add_table('zone_data', df)

    return df
def persons(households, trace_hh_id):

    df = read_raw_persons(households)

    logger.info("loaded persons %s" % (df.shape, ))

    # replace table function with dataframe
    inject.add_table('persons', df)

    pipeline.get_rn_generator().add_channel('persons', df)

    if trace_hh_id:
        tracing.register_traceable_table('persons', df)
        tracing.trace_df(df, "raw.persons", warn_if_empty=True)

    return df
def persons(households, trace_hh_id):

    df = read_raw_persons(households)

    logger.info("loaded persons %s" % (df.shape,))

    df.index.name = 'person_id'

    # replace table function with dataframe
    inject.add_table('persons', df)

    pipeline.get_rn_generator().add_channel('persons', df)

    if trace_hh_id:
        tracing.register_traceable_table('persons', df)
        tracing.trace_df(df, "raw.persons", warn_if_empty=True)

    return df
def persons(households, trace_hh_id):

    df = read_raw_persons(households)

    logger.info("loaded persons %s" % (df.shape, ))

    # replace table function with dataframe
    inject.add_table('persons', df)

    pipeline.get_rn_generator().add_channel('persons', df)

    tracing.register_traceable_table('persons', df)
    if trace_hh_id:
        tracing.trace_df(df, "raw.persons", warn_if_empty=True)

    logger.debug(
        f"{len(df.household_id.unique())} unique household_ids in persons")
    logger.debug(
        f"{len(households.index.unique())} unique household_ids in households")
    assert not households.index.duplicated().any()
    assert not df.index.duplicated().any()

    persons_without_households = ~df.household_id.isin(households.index)
    if persons_without_households.any():
        logger.error(
            f"{persons_without_households.sum()} persons out of {len(df)} without households\n"
            f"{pd.Series({'person_id': df.index[persons_without_households].values})}"
        )
        raise RuntimeError(
            f"{persons_without_households.sum()} persons with bad household_id"
        )

    households_without_persons = df.groupby('household_id').size().reindex(
        households.index).isnull()
    if households_without_persons.any():
        logger.error(
            f"{households_without_persons.sum()} households out of {len(households.index)} without persons\n"
            f"{pd.Series({'household_id': households.index[households_without_persons].values})}"
        )
        raise RuntimeError(
            f"{households_without_persons.sum()} households with no persons")

    return df
def create_sample_data():

    FIPS_NH = 33
    FIPS_VT = 50
    FIPS_ME = 23
    STATE_FIPS_SAMPLE = [FIPS_ME, FIPS_NH, FIPS_VT]

    data_dir = setting('data_dir', inject.get_injectable('data_dir'))
    input_tables = setting('input_tables')

    # - corresp_taz_fips
    table_info = input_tables['corresp_taz_fips']
    data_file_name = table_info['filename']
    data_file_path = os.path.join(data_dir, data_file_name)
    FIPS_NUMA = pd.read_csv(data_file_path, comment='#')

    # slice by state FIPS
    FIPS_NUMA = FIPS_NUMA[FIPS_NUMA['StateFIPS'].isin(STATE_FIPS_SAMPLE)]

    inject.add_table(os.path.splitext(data_file_name)[0], FIPS_NUMA)

    # - corresp_taz_faf4
    table_info = input_tables['corresp_taz_faf4']
    data_file_name = table_info['filename']
    data_file_path = os.path.join(data_dir, data_file_name)
    FAF_NUMA = pd.read_csv(data_file_path, comment='#')

    # slice by taz list from FIPS_NUMA
    FAF_NUMA = FAF_NUMA[FAF_NUMA['TAZ'].isin(FIPS_NUMA.TAZ)]

    inject.add_table(os.path.splitext(data_file_name)[0], FAF_NUMA)

    # - cbp
    table_info = input_tables['cbp']
    data_file_name = table_info['filename']
    data_file_path = os.path.join(data_dir, data_file_name)
    cbp = pd.read_csv(data_file_path, comment='#')

    # slice by state FIPS
    cbp = cbp[cbp['fipstate'].isin(STATE_FIPS_SAMPLE)]

    inject.add_table(os.path.splitext(data_file_name)[0], cbp)
def iterate_location_choice(
        model_settings,
        persons_merged, persons, households,
        skim_dict, skim_stack,
        chunk_size, trace_hh_id, locutor,
        trace_label):
    """
    iterate run_location_choice updating shadow pricing until convergence criteria satisfied
    or max_iterations reached.

    (If use_shadow_pricing not enabled, then just iterate once)

    Parameters
    ----------
    model_settings : dict
    persons_merged : injected table
    persons : injected table
    households : injected table
    skim_dict : skim.SkimDict
    skim_stack : skim.SkimStack
    chunk_size : int
    trace_hh_id : int
    locutor : bool
        whether this process is the privileged logger of shadow_pricing when multiprocessing
    trace_label : str

    Returns
    -------
    adds choice column model_settings['DEST_CHOICE_COLUMN_NAME'] and annotations to persons table
    """

    # column containing segment id
    chooser_segment_column = model_settings['CHOOSER_SEGMENT_COLUMN_NAME']

    # boolean to filter out persons not needing location modeling (e.g. is_worker, is_student)
    chooser_filter_column = model_settings['CHOOSER_FILTER_COLUMN_NAME']

    persons_merged_df = persons_merged.to_frame()

    persons_merged_df = persons_merged_df[persons_merged_df[chooser_filter_column]]

    spc = shadow_pricing.load_shadow_price_calculator(model_settings)
    max_iterations = spc.max_iterations

    logging.debug("%s max_iterations: %s" % (trace_label, max_iterations))

    choices = None
    for iteration in range(1, max_iterations + 1):

        if spc.use_shadow_pricing and iteration > 1:
            spc.update_shadow_prices()

        choices = run_location_choice(
            persons_merged_df,
            skim_dict, skim_stack,
            spc,
            model_settings,
            chunk_size, trace_hh_id,
            trace_label=tracing.extend_trace_label(trace_label, 'i%s' % iteration))

        choices_df = choices.to_frame('dest_choice')
        choices_df['segment_id'] = \
            persons_merged_df[chooser_segment_column].reindex(choices_df.index)

        spc.set_choices(choices_df)

        if locutor:
            spc.write_trace_files(iteration)

        if spc.use_shadow_pricing and spc.check_fit(iteration):
            logging.info("%s converged after iteration %s" % (trace_label, iteration,))
            break

    # - shadow price table
    if locutor:
        if spc.use_shadow_pricing and 'SHADOW_PRICE_TABLE' in model_settings:
            inject.add_table(model_settings['SHADOW_PRICE_TABLE'], spc.shadow_prices)
        if 'MODELED_SIZE_TABLE' in model_settings:
            inject.add_table(model_settings['MODELED_SIZE_TABLE'], spc.modeled_size)

    dest_choice_column_name = model_settings['DEST_CHOICE_COLUMN_NAME']
    tracing.print_summary(dest_choice_column_name, choices, value_counts=True)

    persons_df = persons.to_frame()

    # We only chose school locations for the subset of persons who go to school
    # so we backfill the empty choices with -1 to code as no school location
    NO_DEST_TAZ = -1
    persons_df[dest_choice_column_name] = \
        choices.reindex(persons_df.index).fillna(NO_DEST_TAZ).astype(int)

    # - annotate persons table
    if 'annotate_persons' in model_settings:
        expressions.assign_columns(
            df=persons_df,
            model_settings=model_settings.get('annotate_persons'),
            trace_label=tracing.extend_trace_label(trace_label, 'annotate_persons'))

        pipeline.replace_table("persons", persons_df)

        if trace_hh_id:
            tracing.trace_df(persons_df,
                             label=trace_label,
                             warn_if_empty=True)

    # - annotate households table
    if 'annotate_households' in model_settings:

        households_df = households.to_frame()
        expressions.assign_columns(
            df=households_df,
            model_settings=model_settings.get('annotate_households'),
            trace_label=tracing.extend_trace_label(trace_label, 'annotate_households'))
        pipeline.replace_table("households", households_df)

        if trace_hh_id:
            tracing.trace_df(households_df,
                             label=trace_label,
                             warn_if_empty=True)

    return persons_df
def households(households_sample_size, override_hh_ids, trace_hh_id):

    df_full = read_input_table("households")
    households_sliced = False

    logger.info("full household list contains %s households" % df_full.shape[0])

    # only using households listed in override_hh_ids
    if override_hh_ids is not None:

        # trace_hh_id will not be used if it is not in the list of override_hh_ids
        logger.info("override household list contains %s households" % len(override_hh_ids))

        df = df_full[df_full.index.isin(override_hh_ids)]
        households_sliced = True

        if df.shape[0] < len(override_hh_ids):
            logger.info("found %s of %s households in override household list" %
                        (df.shape[0], len(override_hh_ids)))

        if df.shape[0] == 0:
            raise RuntimeError('No override households found in store')

    # if we are tracing hh exclusively
    elif trace_hh_id and households_sample_size == 1:

        # df contains only trace_hh (or empty if not in full store)
        df = tracing.slice_ids(df_full, trace_hh_id)
        households_sliced = True

    # if we need a subset of full store
    elif households_sample_size > 0 and df_full.shape[0] > households_sample_size:

        logger.info("sampling %s of %s households" %
                    (households_sample_size, df_full.shape[0]))

        """
        Because random seed is set differently for each step, sampling of households using
        Random.global_rng would sample differently depending upon which step it was called from.
        We use a one-off rng seeded with the pseudo step name 'sample_households' to provide
        repeatable sampling no matter when the table is loaded.

        Note that the external_rng is also seeded with base_seed so the sample will
        (rightly) change if the pipeline rng's base_seed is changed
        """
        prng = pipeline.get_rn_generator().get_external_rng('sample_households')
        df = df_full.take(prng.choice(len(df_full),
                                      size=households_sample_size,
                                      replace=False))
        households_sliced = True

        # if tracing and we missed trace_hh in sample, but it is in full store
        if trace_hh_id and trace_hh_id not in df.index and trace_hh_id in df_full.index:
            # replace first hh in sample with trace_hh
            logger.debug("replacing household %s with %s in household sample" %
                         (df.index[0], trace_hh_id))
            df_hh = df_full.loc[[trace_hh_id]]
            df = pd.concat([df_hh, df[1:]])

    else:
        df = df_full

    inject.add_injectable('households_sliced', households_sliced)

    logger.info("loaded households %s" % (df.shape,))

    df.index.name = 'household_id'

    # FIXME - pathological knowledge of name of chunk_id column used by chunked_choosers_by_chunk_id
    assert 'chunk_id' not in df.columns
    df['chunk_id'] = pd.Series(list(range(len(df))), df.index)

    # replace table function with dataframe
    inject.add_table('households', df)

    pipeline.get_rn_generator().add_channel('households', df)

    if trace_hh_id:
        tracing.register_traceable_table('households', df)
        tracing.trace_df(df, "raw.households", warn_if_empty=True)

    return df
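# A minimal sketch (illustrative only, not ActivitySim's actual rng implementation) of the
# idea behind the one-off 'sample_households' rng described in households() above: derive a
# repeatable generator from the pipeline base_seed plus a fixed pseudo step name, so the
# household sample does not depend on which model step happens to load the table.
# All names below are hypothetical.
import hashlib

import numpy as np


def example_external_rng(base_seed, pseudo_step_name='sample_households'):
    # hash the pseudo step name into a stable offset so each named one-off rng is distinct
    offset = int(hashlib.md5(pseudo_step_name.encode('utf8')).hexdigest(), 16) % (2 ** 31)
    return np.random.RandomState((int(base_seed) + offset) % (2 ** 31))


def example_sample_households(df_full, sample_size, base_seed=0):
    # repeatable sample of household rows, independent of the calling step's seed state
    prng = example_external_rng(base_seed)
    return df_full.take(prng.choice(len(df_full), size=sample_size, replace=False))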
def add_size_tables():
    """
    inject tour_destination_size_terms tables for each model_selector (e.g. school, workplace)

    Size tables are pandas dataframes of location counts for the model_selector by zone
    and segment (tour_destination_size_terms).

    If using shadow pricing, we scale size_table counts to the sample population
    (in which case, they have to be created while single-process).

    Scaling is problematic as it breaks household result replicability across sample sizes.
    It also changes the magnitude of the size terms, so if they are used as utilities in
    expression files, their importance will diminish relative to other utilities as the
    sample size decreases.

    Scaling makes most sense for a full sample in conjunction with shadow pricing, where
    shadow prices can be adjusted iteratively to bring modelled counts into line with
    desired (size table) counts.
    """

    use_shadow_pricing = bool(config.setting('use_shadow_pricing'))

    shadow_settings = config.read_model_settings('shadow_pricing.yaml')
    shadow_pricing_models = shadow_settings['shadow_pricing_models']

    # probably ought not scale if not shadow_pricing (breaks partial sample replicability)
    # but this allows compatibility with existing CTRAMP behavior...
    scale_size_table = shadow_settings.get('SCALE_SIZE_TABLE', False)

    if shadow_pricing_models is None:
        logger.warning('shadow_pricing_models list not found in shadow_pricing settings')
        return

    # shadow_pricing_models is dict of {<model_selector>: <model_name>}
    # since these are scaled to model size, they have to be created while single-process

    for model_selector, model_name in iteritems(shadow_pricing_models):

        model_settings = config.read_model_settings(model_name)

        assert model_selector == model_settings['MODEL_SELECTOR']

        segment_ids = model_settings['SEGMENT_IDS']
        chooser_table_name = model_settings['CHOOSER_TABLE_NAME']
        chooser_segment_column = model_settings['CHOOSER_SEGMENT_COLUMN_NAME']

        choosers_df = inject.get_table(chooser_table_name).to_frame()
        if 'CHOOSER_FILTER_COLUMN_NAME' in model_settings:
            choosers_df = \
                choosers_df[choosers_df[model_settings['CHOOSER_FILTER_COLUMN_NAME']] != 0]

        # - raw_desired_size
        land_use = inject.get_table('land_use')
        size_terms = inject.get_injectable('size_terms')
        raw_size = tour_destination_size_terms(land_use, size_terms, model_selector)
        assert set(raw_size.columns) == set(segment_ids.keys())

        if use_shadow_pricing or scale_size_table:

            inject.add_table('raw_' + size_table_name(model_selector), raw_size)

            # - scale size_table counts to sample population
            # scaled_size = zone_size * (total_segment_modeled / total_segment_desired)

            # segment scale factor (modeled / desired) keyed by segment_name
            segment_scale_factors = {}
            for c in raw_size:
                # number of zone demographics desired destination choices
                segment_desired_size = raw_size[c].astype(np.float64).sum()

                # number of synthetic population choosers in segment
                segment_chooser_count = \
                    (choosers_df[chooser_segment_column] == segment_ids[c]).sum()

                segment_scale_factors[c] = \
                    segment_chooser_count / np.maximum(segment_desired_size, 1)

                logger.info("add_desired_size_tables %s segment %s "
                            "desired %s modeled %s scale_factor %s" %
                            (chooser_table_name, c,
                             segment_desired_size,
                             segment_chooser_count,
                             segment_scale_factors[c]))

            # FIXME - should we be rounding?
            scaled_size = (raw_size * segment_scale_factors).round()
        else:
            scaled_size = raw_size

        inject.add_table(size_table_name(model_selector), scaled_size)
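# A minimal worked example (illustrative only) of the scaling rule used in add_size_tables:
# scaled_size = zone_size * (total_segment_modeled / total_segment_desired).
# The zone ids, segment name, and counts below are made up.
import numpy as np
import pandas as pd

example_raw_size = pd.DataFrame({'full_time': [100.0, 300.0, 600.0]}, index=[1, 2, 3])  # desired total 1000
example_chooser_count = 250  # synthetic population choosers in the hypothetical 'full_time' segment

example_scale_factor = example_chooser_count / np.maximum(example_raw_size['full_time'].sum(), 1)  # 0.25
example_scaled_size = (example_raw_size * pd.Series({'full_time': example_scale_factor})).round()
# example_scaled_size['full_time'] -> [25.0, 75.0, 150.0], which sums to the 250 modeled choosers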