def regress():
    """Sanity-check the persons, tours and trips tables of the current pipeline.

    Prints two person columns for the household identified by the module-level
    ``HH_ID`` and asserts that the tours and trips tables are non-empty and
    have no nulls in their key columns.
    """
    all_persons = pipeline.get_table('persons')
    hh_persons = all_persons[all_persons.household_id == HH_ID]
    print("persons_df\n", hh_persons[['value_of_time', 'distance_to_work']])
    # Output recorded in the original source:
    #   persons_df
    #              value_of_time  distance_to_work
    #   person_id
    #   3249922        23.349532              0.62
    #   3249923        23.349532              0.62

    tours = pipeline.get_table('tours')
    regress_tour_modes(tours)

    n_tours = tours.shape[0]
    assert n_tours > 0
    assert not tours.tour_mode.isnull().any()

    trips = pipeline.get_table('trips')
    n_trips = trips.shape[0]
    assert n_trips > 0
    for column in ('purpose', 'depart', 'trip_mode'):
        assert not trips[column].isnull().any()

    # every tour should yield at least two trips
    assert n_trips >= 2*n_tours
def regress():
    """Regression checks on persons, tours, trips and the written trip matrices.

    Asserts that:

    * tours and trips are non-empty and have no nulls in their key columns,
    * ``destination_logsum`` is null exactly for mandatory tours,
    * ``mode_choice_logsum`` is populated for every tour and trip,
    * the ``trips_md.omx`` output file exists, is 25 x 25, and contains a
      float64 ``WALK_MD`` matrix.

    The OMX file is closed even when one of the matrix assertions fails.
    """
    persons_df = pipeline.get_table('persons')
    persons_df = persons_df[persons_df.household_id == HH_ID]
    print("persons_df\n%s" % persons_df[['value_of_time', 'distance_to_work']])
    # Output recorded in the original source:
    #   persons_df
    #              value_of_time  distance_to_work
    #   person_id
    #   3249922        23.349532              0.62
    #   3249923        23.349532              0.62

    tours_df = pipeline.get_table('tours')
    regress_tour_modes(tours_df)

    assert tours_df.shape[0] > 0
    assert not tours_df.tour_mode.isnull().any()

    # optional logsum column was added to all tours except mandatory
    assert 'destination_logsum' in tours_df
    # rows where "logsum is missing" disagrees with "tour is mandatory"
    mismatched = tours_df.destination_logsum.isnull() != (tours_df.tour_category == 'mandatory')
    if mismatched.any():
        # show the offending rows before the assertion below fails
        print(tours_df[mismatched])
    assert not mismatched.any()

    # mode choice logsum calculated for all tours
    assert 'mode_choice_logsum' in tours_df
    assert not tours_df.mode_choice_logsum.isnull().any()

    trips_df = pipeline.get_table('trips')
    assert trips_df.shape[0] > 0
    assert not trips_df.purpose.isnull().any()
    assert not trips_df.depart.isnull().any()
    assert not trips_df.trip_mode.isnull().any()

    # mode_choice_logsum calculated for all trips
    assert not trips_df.mode_choice_logsum.isnull().any()

    # should be at least two trips per tour
    assert trips_df.shape[0] >= 2 * tours_df.shape[0]

    # write_trip_matrices
    trip_matrices_file = config.output_file_path('trips_md.omx')
    assert os.path.exists(trip_matrices_file)
    trip_matrices = omx.open_file(trip_matrices_file)
    try:
        assert trip_matrices.shape() == (25, 25)
        assert 'WALK_MD' in trip_matrices.list_matrices()
        walk_trips = np.array(trip_matrices['WALK_MD'])
        assert walk_trips.dtype == np.dtype('float64')
    finally:
        # release the file handle even if an assertion above failed
        trip_matrices.close()
def test_pipeline_checkpoint_drop():
    """Check table retrieval after steps that skip checkpointing or drop tables.

    Runs a short model list, then verifies that:

    * ``table1`` is retrievable,
    * ``table2`` raises RuntimeError mentioning "never checkpointed",
    * ``table3`` raises RuntimeError mentioning "was dropped" at the current
      checkpoint but is still retrievable from checkpoint ``step3``.
    """
    setup()

    _MODELS = [
        'step1',
        '_step2',
        '_step_add_col.table_name=table2;column_name=c2',
        '_step_forget_tab.table_name=table2',
        'step3',
        'step_forget_tab.table_name=table3',
    ]
    pipeline.run(models=_MODELS, resume_after=None)

    checkpoints = pipeline.get_checkpoints()
    # %-formatting keeps the output identical whether print is a statement or a function
    print("checkpoints\n%s" % checkpoints)

    pipeline.get_table("table1")

    with pytest.raises(RuntimeError) as excinfo:
        pipeline.get_table("table2")
    assert "never checkpointed" in str(excinfo.value)

    # can't get a dropped table from current checkpoint
    with pytest.raises(RuntimeError) as excinfo:
        pipeline.get_table("table3")
    assert "was dropped" in str(excinfo.value)

    # ensure that we can still get table3 from a checkpoint at which it existed
    pipeline.get_table("table3", checkpoint_name="step3")

    pipeline.close_pipeline()
    close_handlers()
def full_run(configs_dir, data_dir,
             resume_after=None, chunk_size=0,
             households_sample_size=HOUSEHOLDS_SAMPLE_SIZE,
             trace_hh_id=None, trace_od=None, check_for_variability=None,
             two_zone=True):
    """Run every model listed in the injected settings and return the tour count.

    The directories are configured via ``setup_dirs`` and the keyword
    arguments are forwarded to ``inject_settings`` as setting overrides.

    Returns
    -------
    int
        Number of rows in the resulting ``tours`` pipeline table.
    """
    setup_dirs(configs_dir, data_dir)

    overrides = dict(
        two_zone=two_zone,
        households_sample_size=households_sample_size,
        chunk_size=chunk_size,
        trace_hh_id=trace_hh_id,
        trace_od=trace_od,
        check_for_variability=check_for_variability,
        # shadow pricing breaks replicability when sample_size varies
        use_shadow_pricing=False,
    )
    settings = inject_settings(**overrides)

    pipeline.run(models=settings['models'], resume_after=resume_after)

    return len(pipeline.get_table('tours').index)
def write_summaries(output_dir):
    """Write value-count summaries of selected table columns to ``summaries.txt``.

    Reads the ``output_summaries`` setting, a mapping of pipeline table name
    to a list of column names. For every listed column, a header line (table,
    column, dtype, row count, number of empty/null values) and the 100 most
    frequent values are written to the summary file in the output directory.

    Parameters
    ----------
    output_dir : str
        Not used by this function; the output location is resolved through
        ``config.output_file_path``.
    """
    summary_settings_name = 'output_summaries'
    summary_file_name = 'summaries.txt'
    top_n = 100  # number of most frequent values reported per column

    summary_settings = setting(summary_settings_name)

    if summary_settings is None:
        # the original message was missing its f-prefix and logged the
        # placeholder text literally
        logger.info("No %s specified in settings file. Nothing to write.",
                    summary_settings_name)
        return

    # this function uses f-strings, so it can only run on Python 3: text mode is always right
    with open(config.output_file_path(summary_file_name), 'w') as output_file:

        for table_name, column_names in summary_settings.items():

            df = pipeline.get_table(table_name)

            for c in column_names:
                # empty strings and nulls both count as "empty"
                empty = (df[c] == '') | df[c].isnull()

                print(
                    f"\n### {table_name}.{c} type: {df.dtypes[c]} rows: {len(df)} ({empty.sum()} empty)\n\n",
                    file=output_file)
                print(df[c].value_counts().nlargest(top_n), file=output_file)
def regress_mini_mtf():
    """Compare mandatory_tour_frequency choices for three persons to stored values.

    The expected values are for pure regression; their appropriateness has
    not been checked.
    """
    expected_choice = pd.Series(
        ['work1', 'work_and_school', 'school1'],
        index=pd.Index([2566698, 2877284, 2877287], name='person_id'),
        name='mandatory_tour_frequency')

    persons = pipeline.get_table("persons").sort_index()
    observed = persons.mandatory_tour_frequency

    # empty string marks a null choice: drop those
    observed = observed[observed != '']

    # show a few rows from the middle of the table (hh_id is ordered by hh size)
    midpoint = len(observed) // 2
    print("mtf_choice\n", observed.head(midpoint).tail(5))
    # Output recorded in the original source:
    #   mtf_choice
    #   person_id
    #   2458502            school1
    #   2458503            school1
    #   2566698              work1
    #   2877284    work_and_school
    #   2877287            school1
    #   Name: mandatory_tour_frequency, dtype: object

    person_ids = list(expected_choice.index)
    pdt.assert_series_equal(observed.reindex(person_ids), expected_choice)
def test_full_run2_repop_replace():
    """Run the repop models, resuming after 'summarize', and check TAZ counts.

    Note: tests are run in alphabetical order. This test expects to find the
    pipeline h5 file from test_full_run1 in the output folder.
    """
    repop_models = [
        'input_pre_processor.table_list=repop_input_table_list;repop',
        'repop_setup_data_structures',
        'initial_seed_balancing.final=true;repop',
        'integerize_final_seed_weights.repop',
        'repop_balancing',
        'expand_households.repop;replace',
        'write_synthetic_population.repop',
        'write_tables.repop',
    ]
    pipeline.run(models=repop_models, resume_after='summarize')

    hh_ids = pipeline.get_table('expanded_household_ids')
    assert isinstance(hh_ids, pd.DataFrame)

    hh_per_taz = hh_ids.groupby('TAZ').size()
    assert len(hh_per_taz) == TAZ_COUNT
    assert hh_per_taz.loc[100] == TAZ_100_HH_REPOP_COUNT

    # once the pipeline is closed its tables can no longer be fetched
    pipeline.close_pipeline()
    inject.clear_cache()
def write_trip_matrices(trips, skim_dict, skim_stack):
    """
    Write trip matrices step.

    Annotates a local copy of the trips table, sums it by origin/destination
    pair and hands the aggregate to ``write_matrices``. When the
    SAVE_TRIPS_TABLE model setting is truthy, the annotated trips table
    replaces the pipeline's 'trips' table.
    """
    settings = config.read_model_settings('write_trip_matrices.yaml')
    annotated_trips = annotate_trips(trips, skim_dict, skim_stack, settings)

    save_trips = bool(settings.get('SAVE_TRIPS_TABLE'))
    if save_trips:
        pipeline.replace_table('trips', annotated_trips)

    logger.info('Aggregating trips...')
    od_totals = annotated_trips.groupby(['origin', 'destination'], sort=False).sum()
    logger.info('Finished.')

    origins = od_totals.index.get_level_values('origin')
    destinations = od_totals.index.get_level_values('destination')

    zones = pipeline.get_table('land_use').index
    # every origin and destination must be a known land_use zone
    for od_values in (origins, destinations):
        assert all(zone in zones for zone in od_values)

    # Index.reindex returns (new_index, indexer); only the indexer is needed
    orig_index = zones.reindex(origins)[1]
    dest_index = zones.reindex(destinations)[1]

    write_matrices(od_totals, zones, orig_index, dest_index, settings)
def regress_mini_auto():
    """Compare auto_ownership for four households to stored regression values.

    The households are among the middle households in the households table;
    results should match the run_mp (multiprocessing) test case.
    """
    expected_choice = pd.Series(
        [1, 1, 1, 0],
        index=pd.Index([932147, 982875, 983048, 1024353], name="household_id"),
        name='auto_ownership')

    households = pipeline.get_table("households").sort_index()
    observed = households.auto_ownership

    # show a few rows from the middle of the table (hh_id is ordered by hh size)
    midpoint = HOUSEHOLDS_SAMPLE_SIZE // 2
    print("auto_choice\n", observed.head(midpoint).tail(4))
    # Output recorded in the original source:
    #   auto_choice
    #   household_id
    #   932147     1
    #   982875     1
    #   983048     1
    #   1024353    0
    #   Name: auto_ownership, dtype: int64

    observed = observed.reindex(list(expected_choice.index))
    pdt.assert_series_equal(observed, expected_choice)
def full_run(resume_after=None, chunk_size=0,
             households_sample_size=HOUSEHOLDS_SAMPLE_SIZE,
             trace_hh_id=None, trace_od=None, check_for_variability=None):
    """Run every model listed in the example settings and return the tour count.

    Uses the ``example/configs`` directory three levels above this file.

    Returns
    -------
    int
        Number of rows in the resulting ``tours`` pipeline table.
    """
    here = os.path.dirname(__file__)
    configs_dir = os.path.join(here, '..', '..', '..', 'example', 'configs')

    setup_dirs(configs_dir)

    overrides = dict(
        households_sample_size=households_sample_size,
        chunk_size=chunk_size,
        trace_hh_id=trace_hh_id,
        trace_od=trace_od,
        check_for_variability=check_for_variability,
        # shadow pricing breaks replicability when sample_size varies
        use_shadow_pricing=False,
    )
    settings = inject_settings(configs_dir, **overrides)

    pipeline.run(models=settings['models'], resume_after=resume_after)

    return len(pipeline.get_table('tours').index)
def test_full_run1():
    """Run the full model list from scratch and check TAZ counts and output files."""
    model_steps = [
        'input_pre_processor',
        'setup_data_structures',
        'initial_seed_balancing',
        'meta_control_factoring',
        'final_seed_balancing',
        'integerize_final_seed_weights',
        'sub_balancing.geography=TRACT',
        'sub_balancing.geography=TAZ',
        'expand_households',
        'summarize',
        'write_tables',
        'write_synthetic_population',
    ]
    pipeline.run(models=model_steps, resume_after=None)

    hh_ids = pipeline.get_table('expanded_household_ids')
    assert isinstance(hh_ids, pd.DataFrame)

    hh_per_taz = hh_ids.groupby('TAZ').size()
    assert len(hh_per_taz) == TAZ_COUNT
    assert hh_per_taz.loc[100] == TAZ_100_HH_COUNT

    # output_tables action: skip
    output_dir = inject.get_injectable('output_dir')
    households_csv = os.path.join(output_dir, 'households.csv')
    district_summary_csv = os.path.join(output_dir, 'summary_DISTRICT_1.csv')
    assert not os.path.exists(households_csv)
    assert os.path.exists(district_summary_csv)

    # once the pipeline is closed its tables can no longer be fetched
    pipeline.close_pipeline()
    inject.clear_cache()
def test_full_run2():
    """Run the full model list against configs2/data2 and check its outputs."""
    here = os.path.dirname(__file__)

    configs_dir = os.path.join(here, 'configs2')
    orca.add_injectable("configs_dir", configs_dir)

    data_dir = os.path.join(here, 'data2')
    orca.add_injectable("data_dir", data_dir)

    output_dir = os.path.join(here, 'output')
    orca.add_injectable("output_dir", output_dir)

    orca.clear_cache()

    tracing.config_logger()

    model_steps = [
        'input_pre_processor',
        'setup_data_structures',
        'initial_seed_balancing',
        'meta_control_factoring',
        'final_seed_balancing',
        'integerize_final_seed_weights',
        'sub_balancing.geography = DISTRICT',
        'sub_balancing.geography = TRACT',
        'sub_balancing.geography=TAZ',
        'expand_households',
        'summarize',
        'write_results'
    ]
    pipeline.run(models=model_steps, resume_after=None)

    hh_ids = pipeline.get_table('expanded_household_ids')
    assert isinstance(hh_ids, pd.DataFrame)

    district_summary_csv = os.path.join(output_dir, 'summary_DISTRICT.csv')
    assert os.path.exists(district_summary_csv)

    # once the pipeline is closed its tables can no longer be fetched
    pipeline.close_pipeline()

    orca.clear_cache()
def regress_mini_auto():
    """Compare auto_ownership for four households to stored regression values.

    The households are among the middle households in the households table;
    results should match the run_mp (multiprocessing) test case. The dtype is
    not compared.
    """
    expected_choice = pd.Series(
        [1, 1, 0, 0],
        index=pd.Index([1099626, 1173905, 1196298, 1286259], name="household_id"),
        name='auto_ownership')

    households = pipeline.get_table("households").sort_index()
    observed = households.auto_ownership

    # show a few rows from the middle of the table (hh_id is ordered by hh size)
    midpoint = HOUSEHOLDS_SAMPLE_SIZE // 2
    print("auto_choice\n%s" % observed.head(midpoint).tail(4))
    # Output recorded in the original source:
    #   auto_choice
    #   household_id
    #   1099626    1
    #   1173905    1
    #   1196298    0
    #   1286259    0
    #   Name: auto_ownership, dtype: int64

    observed = observed.reindex(list(expected_choice.index))
    pdt.assert_series_equal(observed, expected_choice, check_dtype=False)
def regress_mini_auto():
    """Compare auto_ownership for four households to stored regression values.

    The households are among the middle households in the households table;
    results should match the test_pipeline (single-threaded) tests.
    """
    expected_choice = pd.Series(
        [1, 1, 1, 0],
        index=pd.Index([932147, 982875, 983048, 1024353], name="household_id"),
        name='auto_ownership')

    households = pipeline.get_table("households").sort_index()
    observed = households.auto_ownership

    # show a few rows from the middle of the table (hh_id is ordered by hh size)
    midpoint = HOUSEHOLDS_SAMPLE_SIZE // 2
    print("auto_choice\n", observed.head(midpoint).tail(4))
    # Output recorded in the original source:
    #   auto_choice
    #   household_id
    #   932147     1
    #   982875     1
    #   983048     1
    #   1024353    0
    #   Name: auto_ownership, dtype: int64

    observed = observed.reindex(list(expected_choice.index))
    pdt.assert_series_equal(observed, expected_choice)
def regress_mini_mtf():
    """Compare mandatory_tour_frequency choices for three persons to stored values.

    The expected values are for pure regression; their appropriateness has
    not been checked. The dtype is not compared.
    """
    expected_choice = pd.Series(
        ['school1', 'school1', 'work1'],
        index=pd.Index([2566701, 2566702, 3061895], name='person_id'),
        name='mandatory_tour_frequency')

    persons = pipeline.get_table("persons").sort_index()
    observed = persons.mandatory_tour_frequency

    # empty string marks a null choice: drop those
    observed = observed[observed != '']

    # show a few rows from the middle of the table (hh_id is ordered by hh size)
    midpoint = len(observed) // 2
    print("mtf_choice\n%s" % observed.head(midpoint).tail(3))
    # Output recorded in the original source:
    #   mtf_choice
    #   person_id
    #   2566701    school1
    #   2566702    school1
    #   3061895      work1
    #   Name: mandatory_tour_frequency, dtype: object

    person_ids = list(expected_choice.index)
    pdt.assert_series_equal(observed.reindex(person_ids), expected_choice,
                            check_dtype=False)
def full_run(resume_after=None, chunk_size=0,
             households_sample_size=HOUSEHOLDS_SAMPLE_SIZE,
             trace_hh_id=None, trace_od=None, check_for_variability=None):
    """Run every model listed in the injected settings and return the tour count.

    Returns
    -------
    int
        Number of rows in the resulting ``tours`` pipeline table.
    """
    setup_dirs()

    overrides = dict(
        households_sample_size=households_sample_size,
        chunk_size=chunk_size,
        trace_hh_id=trace_hh_id,
        trace_od=trace_od,
        # FIXME should enable testing_fail_trip_destination?
        testing_fail_trip_destination=False,
        check_for_variability=check_for_variability,
        want_dest_choice_sample_tables=False,
        # shadow pricing breaks replicability when sample_size varies
        use_shadow_pricing=False,
    )
    settings = inject_settings(**overrides)

    pipeline.run(models=settings['models'], resume_after=resume_after)

    return len(pipeline.get_table('tours').index)
def step_forget_tab():
    """Pipeline step: drop the table named by the ``table_name`` step argument.

    The table is fetched first and the result discarded; the fetch is kept
    because ``pipeline.get_table`` raises for a table the pipeline does not
    hold, so a bad name fails here rather than inside ``drop_table``.
    """
    table_name = inject.get_step_arg('table_name')
    assert table_name is not None

    # result intentionally unused: the call only confirms the table is retrievable
    pipeline.get_table(table_name)

    pipeline.drop_table(table_name)
def test_mini_pipeline_run():
    """Run a short model list, regress the results and probe get_table errors."""
    setup_dirs()

    inject_settings(households_sample_size=HOUSEHOLDS_SAMPLE_SIZE,
                    write_skim_cache=True)

    initial_models = [
        'initialize_landuse',
        'compute_accessibility',
        'initialize_households',
        'school_location',
        'workplace_location',
        'auto_ownership_simulate'
    ]
    pipeline.run(models=initial_models, resume_after=None)

    regress_mini_auto()

    for model_name in ('cdap_simulate', 'mandatory_tour_frequency'):
        pipeline.run_model(model_name)

    regress_mini_mtf()
    regress_mini_location_choice_logsums()

    # asking for a table that does not exist must fail
    with pytest.raises(RuntimeError) as excinfo:
        pipeline.get_table("bogus")
    assert "never checkpointed" in str(excinfo.value)

    # asking for a real table at a checkpoint that does not exist must fail
    with pytest.raises(RuntimeError) as excinfo:
        pipeline.get_table("households", checkpoint_name="bogus")
    assert "not in checkpoints" in str(excinfo.value)

    # the optional workplace_location_sample table should have been created
    sample_df = pipeline.get_table("workplace_location_sample")
    assert 'mode_choice_logsum' in sample_df

    pipeline.close_pipeline()
    inject.clear_cache()
    close_handlers()
def regress_mini_location_choice_logsums():
    """Check which location-choice logsum columns exist on the persons table."""
    persons_df = pipeline.get_table("persons")

    # DEST_CHOICE_LOGSUM_COLUMN_NAME is specified in school_location.yaml,
    # so the column must exist and hold at least one non-null value
    assert 'school_location_logsum' in persons_df
    school_logsum_missing = persons_df.school_location_logsum.isnull()
    assert not school_logsum_missing.all()

    # DEST_CHOICE_LOGSUM_COLUMN_NAME is NOT specified in workplace_location.yaml
    assert 'workplace_location_logsum' not in persons_df
def test_mini_pipeline_run():
    """Run a short model list, regress the results and probe get_table errors."""
    here = os.path.dirname(__file__)
    configs_dir = os.path.join(here, 'configs')

    setup_dirs(configs_dir)

    inject_settings(configs_dir,
                    households_sample_size=HOUSEHOLDS_SAMPLE_SIZE,
                    # use_shadow_pricing=True
                    )

    initial_models = [
        'initialize_landuse',
        'compute_accessibility',
        'initialize_households',
        'school_location',
        'workplace_location',
        'auto_ownership_simulate'
    ]
    pipeline.run(models=initial_models, resume_after=None)

    regress_mini_auto()

    for model_name in ('cdap_simulate', 'mandatory_tour_frequency'):
        pipeline.run_model(model_name)

    regress_mini_mtf()

    # asking for a table that does not exist must fail
    with pytest.raises(RuntimeError) as excinfo:
        pipeline.get_table("bogus")
    assert "never checkpointed" in str(excinfo.value)

    # asking for a real table at a checkpoint that does not exist must fail
    with pytest.raises(RuntimeError) as excinfo:
        pipeline.get_table("households", checkpoint_name="bogus")
    assert "not in checkpoints" in str(excinfo.value)

    pipeline.close_pipeline()
    inject.clear_cache()
    close_handlers()
def initialize_landuse():
    """Pipeline step: annotate the land-use tables and fetch accessibility.

    Reads the mandatory ``initialize_landuse.yaml`` model settings and passes
    them to ``annotate_tables``. The accessibility table is then requested
    from the pipeline and the returned frame is discarded.
    """
    trace_label = 'initialize_landuse'

    model_settings = config.read_model_settings('initialize_landuse.yaml', mandatory=True)

    annotate_tables(model_settings, trace_label)

    # instantiate accessibility (must be checkpointed to be used to slice accessibility)
    # result intentionally unused: only the act of fetching the table matters here
    pipeline.get_table('accessibility')
def regress():
    """Check expanded household counts per TAZ and the expected output files."""
    hh_ids = pipeline.get_table("expanded_household_ids")
    assert isinstance(hh_ids, pd.DataFrame)

    hh_per_taz = hh_ids.groupby("TAZ").size()
    assert len(hh_per_taz) == TAZ_COUNT
    assert hh_per_taz.loc[100] == TAZ_100_HH_COUNT

    # output_tables action: skip
    output_dir = inject.get_injectable("output_dir")
    households_csv = os.path.join(output_dir, "households.csv")
    district_summary_csv = os.path.join(output_dir, "summary_DISTRICT_1.csv")
    assert not os.path.exists(households_csv)
    assert os.path.exists(district_summary_csv)
def initialize_landuse():
    """Pipeline step: annotate land-use tables and create an empty accessibility table.

    The accessibility table has no columns and shares the index of the
    ``land_use`` table.
    """
    trace_label = 'initialize_landuse'
    settings = config.read_model_settings('initialize_landuse.yaml', mandatory=True)
    annotate_tables(settings, trace_label)

    # create accessibility (only required if multiprocessing wants to slice accessibility)
    zone_index = pipeline.get_table('land_use').index
    pipeline.replace_table("accessibility", pd.DataFrame(index=zone_index))
def test_mini_pipeline_run2():
    """Reopen the pipeline and check the results match test_mini_pipeline_run.

    The important thing here is that we should get exactly the same results
    as for test_mini_pipeline_run when we restart the pipeline.
    """
    here = os.path.dirname(__file__)
    configs_dir = os.path.join(here, 'configs')

    setup_dirs(configs_dir)

    inject_settings(configs_dir, households_sample_size=HOUSEHOLDS_SAMPLE_SIZE)

    # checkpoints must be readable BEFORE the pipeline is opened
    prev_checkpoint_count = len(pipeline.get_checkpoints().index)
    assert prev_checkpoint_count == 8

    pipeline.open_pipeline('auto_ownership_simulate')

    regress_mini_auto()

    # re-running a model that is already in the pipeline must fail
    with pytest.raises(RuntimeError) as excinfo:
        pipeline.run_model('auto_ownership_simulate')
    assert "run model 'auto_ownership_simulate' more than once" in str(
        excinfo.value)

    # and these new ones
    for model_name in ('cdap_simulate', 'mandatory_tour_frequency'):
        pipeline.run_model(model_name)

    regress_mini_mtf()

    # checkpoints must be readable before the pipeline is closed (from existing open store)
    assert len(pipeline.get_checkpoints().index) == prev_checkpoint_count

    # - write list of override_hh_ids to override_hh_ids.csv in data for use in next test
    num_hh_ids = 10
    first_hh_ids = pipeline.get_table("households").head(num_hh_ids).index.values
    override_df = pd.DataFrame({'household_id': first_hh_ids})

    data_dir = inject.get_injectable('data_dir')
    override_path = os.path.join(data_dir, 'override_hh_ids.csv')
    override_df.to_csv(override_path, index=False, header=True)

    pipeline.close_pipeline()
    inject.clear_cache()
    close_handlers()
def full_run(resume_after=None, chunk_size=0,
             households_sample_size=HOUSEHOLDS_SAMPLE_SIZE,
             trace_hh_id=None, trace_od=None, check_for_variability=None):
    """Run the hard-coded model list and return the number of tours produced.

    Registers the configs, data and output directories as orca injectables,
    applies the setting overrides, runs the models, then closes the pipeline.

    Returns
    -------
    int
        Number of rows in the resulting ``tours`` pipeline table.
    """
    here = os.path.dirname(__file__)

    configs_dir = os.path.join(here, '..', '..', '..', 'example', 'configs')
    orca.add_injectable("configs_dir", configs_dir)

    data_dir = os.path.join(here, 'data')
    orca.add_injectable("data_dir", data_dir)

    output_dir = os.path.join(here, 'output')
    orca.add_injectable("output_dir", output_dir)

    inject_settings(configs_dir,
                    households_sample_size=households_sample_size,
                    chunk_size=chunk_size,
                    trace_hh_id=trace_hh_id,
                    trace_od=trace_od,
                    check_for_variability=check_for_variability)

    orca.clear_cache()

    tracing.config_logger()

    # assert orca.get_injectable("chunk_size") == chunk_size

    model_steps = [
        'compute_accessibility',
        'school_location_sample',
        'school_location_logsums',
        'school_location_simulate',
        'workplace_location_sample',
        'workplace_location_logsums',
        'workplace_location_simulate',
        'auto_ownership_simulate',
        'cdap_simulate',
        'mandatory_tour_frequency',
        'mandatory_scheduling',
        'non_mandatory_tour_frequency',
        'destination_choice',
        'non_mandatory_scheduling',
        'tour_mode_choice_simulate',
        'create_simple_trips',
        'trip_mode_choice_simulate'
    ]
    pipeline.run(models=model_steps, resume_after=resume_after)

    # count tours before the pipeline is closed
    tour_count = len(pipeline.get_table('tours').index)

    pipeline.close()

    orca.clear_cache()

    return tour_count
def test_mini_pipeline_run2():
    """Reopen the pipeline and check the results match test_mini_pipeline_run.

    The important thing here is that we should get exactly the same results
    as for test_mini_pipeline_run when we restart the pipeline.
    """
    here = os.path.dirname(__file__)
    configs_dir = os.path.join(here, 'configs')

    setup_dirs(configs_dir)

    inject_settings(configs_dir, households_sample_size=HOUSEHOLDS_SAMPLE_SIZE)

    # checkpoints must be readable BEFORE the pipeline is opened
    prev_checkpoint_count = len(pipeline.get_checkpoints().index)
    assert prev_checkpoint_count == 8

    pipeline.open_pipeline('auto_ownership_simulate')

    regress_mini_auto()

    # re-running a model that is already in the pipeline must fail
    with pytest.raises(RuntimeError) as excinfo:
        pipeline.run_model('auto_ownership_simulate')
    assert "run model 'auto_ownership_simulate' more than once" in str(excinfo.value)

    # and these new ones
    for model_name in ('cdap_simulate', 'mandatory_tour_frequency'):
        pipeline.run_model(model_name)

    regress_mini_mtf()

    # checkpoints must be readable before the pipeline is closed (from existing open store)
    assert len(pipeline.get_checkpoints().index) == prev_checkpoint_count

    # - write list of override_hh_ids to override_hh_ids.csv in data for use in next test
    num_hh_ids = 10
    first_hh_ids = pipeline.get_table("households").head(num_hh_ids).index.values
    override_df = pd.DataFrame({'household_id': first_hh_ids})

    data_dir = inject.get_injectable('data_dir')
    override_path = os.path.join(data_dir, 'override_hh_ids.csv')
    override_df.to_csv(override_path, index=False, header=True)

    pipeline.close_pipeline()
    inject.clear_cache()
    close_handlers()
def get_trips_df(model_settings):
    """Return the trips used by the balancing step.

    If ``model_settings`` has a truthy ``input_table`` entry, that file is
    read as CSV from the data directory (header on the first row, ``#``
    starts a comment). Otherwise the pipeline 'trips' table is returned with
    its index reset.
    """
    csv_name = model_settings.get('input_table', None)

    if csv_name:
        logger.info('using %s for balancing step' % csv_name)
        csv_path = config.data_file_path(csv_name, mandatory=True)
        return pd.read_csv(csv_path, header=0, comment='#')

    logger.info("using 'trips' pipeline table for balancing step")
    return pipeline.get_table('trips').reset_index()
def test_pipeline_run():
    """Run four registered steps and exercise get_table success and error paths."""
    for step_name, step_func in (
            ('step1', steps.step1),
            ('step2', steps.step2),
            ('step3', steps.step3),
            ('step_add_col', steps.step_add_col)):
        inject.add_step(step_name, step_func)

    inject.dump_state()

    model_steps = [
        'step1',
        'step2',
        'step3',
        'step_add_col.table_name=table2;column_name=c2'
    ]
    pipeline.run(models=model_steps, resume_after=None)

    checkpoints = pipeline.get_checkpoints()
    print("checkpoints\n", checkpoints)

    # the column added by step_add_col must be reachable on table2
    added_column = pipeline.get_table("table2").c2

    # get table as of a named checkpoint
    pipeline.get_table("table1", checkpoint_name="step3")

    # a table requested at a checkpoint that predates it must fail
    with pytest.raises(RuntimeError) as excinfo:
        pipeline.get_table("table2", checkpoint_name="step1")
    assert "not in checkpoint 'step1'" in str(excinfo.value)

    # asking for a table that does not exist must fail
    with pytest.raises(RuntimeError) as excinfo:
        pipeline.get_table("bogus")
    assert "never checkpointed" in str(excinfo.value)

    # asking for a real table at a checkpoint that does not exist must fail
    with pytest.raises(RuntimeError) as excinfo:
        pipeline.get_table("table1", checkpoint_name="bogus")
    assert "not in checkpoints" in str(excinfo.value)

    pipeline.close_pipeline()

    close_handlers()
def initialize_landuse():
    """Pipeline step: annotate land-use tables and create an empty accessibility table.

    The accessibility table has no columns and shares the index of the
    ``land_use`` table.
    """
    trace_label = 'initialize_landuse'
    settings = config.read_model_settings('initialize_landuse.yaml', mandatory=True)
    annotate_tables(settings, trace_label)

    # build an empty accessibility frame on the land_use index
    zone_index = pipeline.get_table('land_use').index
    empty_accessibility = pd.DataFrame(index=zone_index)

    # - write table to pipeline
    pipeline.replace_table("accessibility", empty_accessibility)
def step_add_col():
    """Pipeline step: add a new column to a pipeline table and store the result.

    The table and column names come from the ``table_name`` and
    ``column_name`` step arguments. The new column holds the table index plus
    1000 times the number of columns the table had before the addition.
    """
    table_name = inject.get_step_arg('table_name')
    assert table_name is not None

    col_name = inject.get_step_arg('column_name')
    assert col_name is not None

    df = pipeline.get_table(table_name)
    assert col_name not in df.columns

    # the column count is taken before the new column exists
    offset = 1000 * len(df.columns)
    df[col_name] = df.index + offset

    pipeline.replace_table(table_name, df)
def annotate_trips(trips, network_los, model_settings):
    """
    Add columns to a local copy of the trips table.

    The annotation expressions get the origin/destination skim wrappers and
    everything defined in the model settings CONSTANTS. Pipeline tables can
    also be accessed by listing them under TABLES in the preprocessor
    settings.

    If HH_EXPANSION_WEIGHT_COL is set in the model settings and that column
    is not already on the trips table, it is mapped in from the households
    pipeline table by household_id.

    Returns the annotated trips DataFrame.
    """
    trace_label = 'trip_matrices'
    annotated = trips.to_frame()

    default_skims = network_los.get_default_skim_dict()

    # setup skim keys
    if 'trip_period' not in annotated:
        annotated['trip_period'] = network_los.skim_time_period_label(annotated.depart)

    skims = {
        'od_skims': default_skims.wrap('origin', 'destination'),
        "odt_skims": default_skims.wrap_3d(orig_key='origin',
                                           dest_key='destination',
                                           dim3_key='trip_period'),
    }

    locals_dict = {}
    model_constants = config.get_model_constants(model_settings)
    if model_constants is not None:
        locals_dict.update(model_constants)

    expressions.annotate_preprocessors(
        annotated, locals_dict, skims, model_settings, trace_label)

    # Data will be expanded by an expansion weight column from
    # the households pipeline table, if specified in the model settings.
    weight_col = model_settings.get('HH_EXPANSION_WEIGHT_COL')
    needs_weights = weight_col and weight_col not in annotated
    if needs_weights:
        logger.info("adding '%s' from households to trips table" % weight_col)
        hh_weights = pipeline.get_table('households')[weight_col]
        annotated[weight_col] = annotated.household_id.map(hh_weights)

    return annotated
def regress_3_zone():
    """Check that tap columns are non-zero for transit tours and zero otherwise.

    Requires at least one WALK_TRANSIT tour. Prints the offending rows and
    fails if a non-transit tour carries a non-zero tap value.
    """
    tours = pipeline.get_table('tours')
    assert len(tours[tours.tour_mode == 'WALK_TRANSIT']) > 0

    tap_columns = ['od_atap', 'od_btap', 'do_atap', 'do_btap']
    is_transit = tours.tour_mode.isin(['WALK_TRANSIT', 'DRIVE_TRANSIT'])

    # should cache atap and btap for transit modes only
    for col in tap_columns:
        # tour_mode_choice sets non-transit taps to 0
        assert not (tours[is_transit][col] == 0).any()

        offenders = ~is_transit & (tours[col] != 0)
        if offenders.any():
            report_columns = ['tour_type', 'tour_mode',
                              'od_atap', 'od_btap', 'do_atap', 'do_btap']
            print(tours[offenders][report_columns])
            assert False
def test_load_cached_accessibility():
    """Load accessibility from a cached CSV instead of running compute_accessibility."""
    inject.clear_cache()
    inject.reinject_decorated_tables()

    local_data_dir = os.path.join(os.path.dirname(__file__), 'data')
    setup_dirs(data_dir=[local_data_dir, example_path('data')])

    #
    # add OPTIONAL cached table accessibility to input_table_list
    # activitysim.abm.tables.land_use.accessibility() will load this table if listed here
    # presumably independently calculated outside activitysim or a cached copy created
    # during a previous run
    #
    settings = config.read_settings_file('settings.yaml', mandatory=True)
    input_table_list = settings.get('input_table_list')
    cached_accessibility_entry = {
        'tablename': 'accessibility',
        'filename': 'cached_accessibility.csv',
        'index_col': 'zone_id'
    }
    input_table_list.append(cached_accessibility_entry)

    inject_settings(households_sample_size=HOUSEHOLDS_SAMPLE_SIZE,
                    input_table_list=input_table_list)

    model_steps = [
        'initialize_landuse',
        # 'compute_accessibility',  # we load accessibility table ordinarily created by compute_accessibility
        'initialize_households',
    ]
    pipeline.run(models=model_steps, resume_after=None)

    accessibility = pipeline.get_table("accessibility")
    assert 'auPkRetail' in accessibility

    pipeline.close_pipeline()
    inject.clear_cache()
    close_handlers()
def write_tables(output_dir):
    """
    Write pipeline tables as csv files (in output directory) as specified
    by output_tables list in settings file.

    'output_tables' can specify either a list of output tables to include or to skip
    if no output_tables list is specified, then no checkpointed tables will be written

    To write all output tables EXCEPT the households and persons tables:

    ::

      output_tables:
        action: skip
        tables:
          - households
          - persons

    To write ONLY the households table:

    ::

      output_tables:
        action: include
        tables:
          - households

    File names are ``<prefix><table_name>.csv`` where ``prefix`` is the
    optional 'prefix' entry of the output_tables setting ('final_' if absent).
    The pseudo-table 'checkpoints' writes the pipeline checkpoints table.

    Parameters
    ----------
    output_dir: str
        Not used by this function; file paths are resolved through
        ``config.output_file_path``.

    Raises
    ------
    RuntimeError
        If the 'action' entry is neither 'include' nor 'skip'.
    """
    output_tables_settings_name = 'output_tables'

    output_tables_settings = setting(output_tables_settings_name)

    if output_tables_settings is None:
        logger.info("No output_tables specified in settings file. Nothing to write.")
        return

    action = output_tables_settings.get('action')
    # a missing 'tables' entry means an empty list, not a crash on None
    tables = output_tables_settings.get('tables') or []
    prefix = output_tables_settings.get('prefix', 'final_')

    if action not in ['include', 'skip']:
        # raising a bare string is itself a TypeError on Python 3
        raise RuntimeError(
            "expected %s action '%s' to be either 'include' or 'skip'" %
            (output_tables_settings_name, action))

    checkpointed_tables = pipeline.checkpointed_tables()
    if action == 'include':
        output_tables_list = tables
    else:
        # action == 'skip': everything checkpointed except the listed tables
        output_tables_list = [t for t in checkpointed_tables if t not in tables]

    for table_name in output_tables_list:

        if table_name == 'checkpoints':
            df = pipeline.get_checkpoints()
        else:
            if table_name not in checkpointed_tables:
                logger.warning("Skipping '%s': Table not found." % table_name)
                continue
            df = pipeline.get_table(table_name)

        file_name = "%s%s.csv" % (prefix, table_name)
        file_path = config.output_file_path(file_name)

        # include the index if it has a name or is a MultiIndex
        # (pd.MultiIndex is the public name; pd.core.index is gone in current pandas)
        write_index = df.index.name is not None or isinstance(df.index, pd.MultiIndex)

        df.to_csv(file_path, index=write_index)