def full_run(resume_after=None, chunk_size=0,
             households_sample_size=HOUSEHOLDS_SAMPLE_SIZE,
             trace_hh_id=None, trace_od=None, check_for_variability=None):
    """Run the complete model list through the pipeline and count the tours.

    Registers the configs, data and output directories as orca injectables,
    injects the settings, runs every model listed below with
    ``pipeline.run`` and reports the size of the resulting ``tours`` table.

    Parameters
    ----------
    resume_after
        Passed straight through to ``pipeline.run``.
    chunk_size, households_sample_size, trace_hh_id, trace_od,
    check_for_variability
        Passed straight through to ``inject_settings``.

    Returns
    -------
    int
        Number of entries in the index of the ``tours`` table.
    """
    here = os.path.dirname(__file__)

    # configs come from the 'example' directory three levels up;
    # data and output live next to this file
    configs_dir = os.path.join(here, '..', '..', '..', 'example', 'configs')
    injectables = [
        ("configs_dir", configs_dir),
        ("data_dir", os.path.join(here, 'data')),
        ("output_dir", os.path.join(here, 'output')),
    ]
    for injectable_name, injectable_path in injectables:
        orca.add_injectable(injectable_name, injectable_path)

    inject_settings(
        configs_dir,
        households_sample_size=households_sample_size,
        chunk_size=chunk_size,
        trace_hh_id=trace_hh_id,
        trace_od=trace_od,
        check_for_variability=check_for_variability,
    )

    orca.clear_cache()

    tracing.config_logger()

    # assert orca.get_injectable("chunk_size") == chunk_size

    model_names = [
        'compute_accessibility',
        'school_location_sample',
        'school_location_logsums',
        'school_location_simulate',
        'workplace_location_sample',
        'workplace_location_logsums',
        'workplace_location_simulate',
        'auto_ownership_simulate',
        'cdap_simulate',
        'mandatory_tour_frequency',
        'mandatory_scheduling',
        'non_mandatory_tour_frequency',
        'destination_choice',
        'non_mandatory_scheduling',
        'tour_mode_choice_simulate',
        'create_simple_trips',
        'trip_mode_choice_simulate',
    ]
    pipeline.run(models=model_names, resume_after=resume_after)

    # take the count while the pipeline is still open
    tour_count = len(pipeline.get_table('tours').index)

    pipeline.close()
    orca.clear_cache()

    return tour_count
def test_mini_pipeline_run():
    """Run a short model sequence and regression-check selected choices.

    Runs four models through ``pipeline.run``, checks the ``auto_ownership``
    values of three households, then runs ``cdap_simulate`` and
    ``mandatory_tour_frequency`` one at a time and checks the
    ``mandatory_tour_frequency`` values of three persons. Finally verifies
    that ``pipeline.get_table`` raises ``RuntimeError`` for an unknown table
    name and for an unknown checkpoint name.
    """
    configs_dir = os.path.join(os.path.dirname(__file__), 'configs')
    orca.add_injectable("configs_dir", configs_dir)

    output_dir = os.path.join(os.path.dirname(__file__), 'output')
    orca.add_injectable("output_dir", output_dir)

    data_dir = os.path.join(os.path.dirname(__file__), 'data')
    orca.add_injectable("data_dir", data_dir)

    inject_settings(configs_dir, households_sample_size=HOUSEHOLDS_SAMPLE_SIZE)

    orca.clear_cache()

    # assert len(orca.get_table("households").index) == HOUSEHOLDS_SAMPLE_SIZE

    _MODELS = [
        'compute_accessibility',
        'school_location_simulate',
        'workplace_location_simulate',
        'auto_ownership_simulate'
    ]

    pipeline.run(models=_MODELS, resume_after=None)

    auto_choice = pipeline.get_table("households").auto_ownership

    # regression test: these are the first 3 households in households table
    hh_ids = [26960, 857296, 93428]
    choices = [0, 1, 0]
    expected_choice = pd.Series(choices, index=pd.Index(hh_ids, name="HHID"),
                                name='auto_ownership')

    # single-argument print: same output as the old print statement on
    # Python 2 and also valid syntax on Python 3
    print("auto_choice\n%s" % (auto_choice.head(10),))
    pdt.assert_series_equal(auto_choice[hh_ids], expected_choice)

    pipeline.run_model('cdap_simulate')
    pipeline.run_model('mandatory_tour_frequency')

    mtf_choice = pipeline.get_table("persons").mandatory_tour_frequency

    per_ids = [92363, 92681, 93428]
    choices = ['work1', 'school1', 'school2']
    expected_choice = pd.Series(choices, index=pd.Index(per_ids, name='PERID'),
                                name='mandatory_tour_frequency')

    print("mtf_choice\n%s" % (mtf_choice.head(20),))
    pdt.assert_series_equal(mtf_choice[per_ids], expected_choice)

    # try to get a non-existent table
    with pytest.raises(RuntimeError) as excinfo:
        pipeline.get_table("bogus")
    # checked after the with-block: code following the raising call inside
    # the block would never execute
    assert "not in checkpointed tables" in str(excinfo.value)

    # try to get an existing table from a non-existent checkpoint
    with pytest.raises(RuntimeError) as excinfo:
        pipeline.get_table("households", checkpoint_name="bogus")
    assert "not in checkpoints" in str(excinfo.value)

    pipeline.close()

    orca.clear_cache()
def test_mini_pipeline_run2():
    """Restart the pipeline and expect the same results as the first run.

    The important thing here is that we should get exactly the same results
    as for test_mini_pipeline_run when we restart the pipeline. The test
    also checks that ``pipeline.get_checkpoints`` can be called before the
    pipeline is opened, while it is open, and after it is closed, and that
    running a model already in the pipeline raises ``RuntimeError``.

    NOTE(review): the expected checkpoint count of 7 presumably relies on
    checkpoints written to the same output directory by an earlier run;
    confirm the required test ordering.
    """
    configs_dir = os.path.join(os.path.dirname(__file__), 'configs')
    orca.add_injectable("configs_dir", configs_dir)

    output_dir = os.path.join(os.path.dirname(__file__), 'output')
    orca.add_injectable("output_dir", output_dir)

    data_dir = os.path.join(os.path.dirname(__file__), 'data')
    orca.add_injectable("data_dir", data_dir)

    inject_settings(configs_dir, households_sample_size=HOUSEHOLDS_SAMPLE_SIZE)

    orca.clear_cache()

    # should be able to get this BEFORE pipeline is opened
    checkpoints_df = pipeline.get_checkpoints()
    prev_checkpoint_count = len(checkpoints_df.index)

    assert prev_checkpoint_count == 7

    pipeline.start_pipeline('auto_ownership_simulate')

    auto_choice = pipeline.get_table("households").auto_ownership

    # regression test: these are the 2nd-4th households in households table
    hh_ids = [26960, 857296, 93428]
    choices = [0, 1, 0]
    expected_auto_choice = pd.Series(choices, index=pd.Index(hh_ids, name="HHID"),
                                     name='auto_ownership')

    # single-argument print: same output as the old print statement on
    # Python 2 and also valid syntax on Python 3
    print("auto_choice\n%s" % (auto_choice.head(4),))
    pdt.assert_series_equal(auto_choice[hh_ids], expected_auto_choice)

    # try to run a model already in pipeline
    with pytest.raises(RuntimeError) as excinfo:
        pipeline.run_model('auto_ownership_simulate')
    assert "run model 'auto_ownership_simulate' more than once" in str(excinfo.value)

    # and these new ones
    pipeline.run_model('cdap_simulate')
    pipeline.run_model('mandatory_tour_frequency')

    mtf_choice = pipeline.get_table("persons").mandatory_tour_frequency

    per_ids = [92363, 92681, 93428]
    choices = ['work1', 'school1', 'school2']
    expected_choice = pd.Series(choices, index=pd.Index(per_ids, name='PERID'),
                                name='mandatory_tour_frequency')

    print("mtf_choice\n%s" % (mtf_choice.head(20),))
    pdt.assert_series_equal(mtf_choice[per_ids], expected_choice)

    # should be able to get this before pipeline is closed (from existing open store)
    assert orca.get_injectable('pipeline_store') is not None
    checkpoints_df = pipeline.get_checkpoints()
    assert len(checkpoints_df.index) == prev_checkpoint_count

    pipeline.close()

    # should also be able to get this after pipeline is closed (open and close)
    assert orca.get_injectable('pipeline_store') is None
    checkpoints_df = pipeline.get_checkpoints()
    assert len(checkpoints_df.index) == prev_checkpoint_count

    orca.clear_cache()
# Inspect a few pipeline tables, dump every checkpointed table and the
# checkpoint list to CSV files in the output directory, then close the pipeline.
# The print calls use a single pre-formatted argument so the output is the same
# as the former Python 2 print statements while also being valid on Python 3.

# retrieve the state of a checkpointed table after a specific model was run
df = pipeline.get_table(table_name="persons", checkpoint_name="school_location_simulate")
print("\npersons table columns after school_location_simulate: %s" % (df.columns.values,))

# get_table without checkpoint_name returns the latest version of the table
df = pipeline.get_table("tours")
print("\ntour_type value counts\n%s" % (df.tour_type.value_counts(),))

# get_table for a computed (non-checkpointed, internal, orca) table
# returns the most recent value of that table
df = pipeline.get_table("persons_merged")
df = df[['household_id', 'age', 'auPkTotal', 'roundtrip_auto_time_to_work']]
print("\npersons_merged selected columns\n%s" % (df.head(20),))

# write final versions of all checkpointed dataframes to CSV files to review results
for table_name in pipeline.checkpointed_tables():
    file_name = "final_%s_table.csv" % table_name
    file_path = os.path.join(orca.get_injectable("output_dir"), file_name)
    pipeline.get_table(table_name).to_csv(file_path)

# tables will no longer be available after pipeline is closed
pipeline.close()

# write checkpoints (this can be called whether or not pipeline is open)
file_path = os.path.join(orca.get_injectable("output_dir"), "checkpoints.csv")
pipeline.get_checkpoints().to_csv(file_path)

t0 = print_elapsed_time("all models", t0)