def test_mini_pipeline_run2():
    """Restart the pipeline from a checkpoint and verify identical results.

    The important thing here is that we should get exactly the same results
    as for test_mini_pipeline_run when we restart the pipeline from the
    checkpoint store it wrote.
    """

    configs_dir = os.path.join(os.path.dirname(__file__), 'configs')
    setup_dirs(configs_dir)
    inject_settings(configs_dir, households_sample_size=HOUSEHOLDS_SAMPLE_SIZE)

    # should be able to get this BEFORE pipeline is opened
    checkpoints_df = pipeline.get_checkpoints()
    prev_checkpoint_count = len(checkpoints_df.index)

    # print "checkpoints_df\n", checkpoints_df[['checkpoint_name']]
    assert prev_checkpoint_count == 8

    # resume at the auto_ownership_simulate checkpoint written by the earlier run
    pipeline.open_pipeline('auto_ownership_simulate')

    regress_mini_auto()

    # try to run a model already in pipeline
    with pytest.raises(RuntimeError) as excinfo:
        pipeline.run_model('auto_ownership_simulate')
    assert "run model 'auto_ownership_simulate' more than once" in str(
        excinfo.value)

    # and these new ones
    pipeline.run_model('cdap_simulate')
    pipeline.run_model('mandatory_tour_frequency')

    regress_mini_mtf()

    # should be able to get this before pipeline is closed (from existing open store)
    checkpoints_df = pipeline.get_checkpoints()
    assert len(checkpoints_df.index) == prev_checkpoint_count

    # - write list of override_hh_ids to override_hh_ids.csv in data for use in next test
    num_hh_ids = 10
    hh_ids = pipeline.get_table("households").head(num_hh_ids).index.values
    hh_ids = pd.DataFrame({'household_id': hh_ids})

    data_dir = inject.get_injectable('data_dir')
    hh_ids.to_csv(os.path.join(data_dir, 'override_hh_ids.csv'), index=False, header=True)

    pipeline.close_pipeline()
    inject.clear_cache()
    close_handlers()
def test_mini_pipeline_run2():
    """Restarting the pipeline must reproduce the results of test_mini_pipeline_run."""

    configs_dir = os.path.join(os.path.dirname(__file__), 'configs')
    setup_dirs(configs_dir)
    inject_settings(configs_dir, households_sample_size=HOUSEHOLDS_SAMPLE_SIZE)

    # the checkpoint inventory must be readable BEFORE the pipeline is opened
    expected_checkpoints = len(pipeline.get_checkpoints().index)
    assert expected_checkpoints == 8

    # resume from the auto_ownership_simulate checkpoint and regress its results
    pipeline.open_pipeline('auto_ownership_simulate')
    regress_mini_auto()

    # re-running a model that is already in the pipeline is an error
    with pytest.raises(RuntimeError) as excinfo:
        pipeline.run_model('auto_ownership_simulate')
    assert "run model 'auto_ownership_simulate' more than once" in str(excinfo.value)

    # run the models that follow the resume point, then regress
    for model_name in ('cdap_simulate', 'mandatory_tour_frequency'):
        pipeline.run_model(model_name)
    regress_mini_mtf()

    # inventory is also readable from the currently-open store
    assert len(pipeline.get_checkpoints().index) == expected_checkpoints

    # save the first few household ids to override_hh_ids.csv for the next test
    first_hh_ids = pipeline.get_table("households").head(10).index.values
    override_df = pd.DataFrame({'household_id': first_hh_ids})
    override_path = os.path.join(inject.get_injectable('data_dir'), 'override_hh_ids.csv')
    override_df.to_csv(override_path, index=False, header=True)

    pipeline.close_pipeline()
    inject.clear_cache()
    close_handlers()
def test_pipeline_checkpoint_drop():
    """Verify get_table semantics for tables that were dropped or never checkpointed.

    Runs a model list that creates table2 in unsaved (underscore-prefixed) steps
    and drops table3 after checkpoint 'step3', then checks which checkpoints can
    still supply each table.
    """

    setup()

    _MODELS = [
        'step1',
        '_step2',
        '_step_add_col.table_name=table2;column_name=c2',
        '_step_forget_tab.table_name=table2',
        'step3',
        'step_forget_tab.table_name=table3',
    ]
    pipeline.run(models=_MODELS, resume_after=None)

    checkpoints = pipeline.get_checkpoints()
    # fixed: python2 print statement is a SyntaxError under python3
    print("checkpoints\n", checkpoints)

    pipeline.get_table("table1")

    # table2 only ever existed in unsaved steps, so it was never checkpointed
    with pytest.raises(RuntimeError) as excinfo:
        pipeline.get_table("table2")
    assert "never checkpointed" in str(excinfo.value)

    # can't get a dropped table from current checkpoint
    with pytest.raises(RuntimeError) as excinfo:
        pipeline.get_table("table3")
    assert "was dropped" in str(excinfo.value)

    # ensure that we can still get table3 from a checkpoint at which it existed
    pipeline.get_table("table3", checkpoint_name="step3")

    pipeline.close_pipeline()
    close_handlers()
def track_skim_usage(output_dir):
    """
    write statistics on skim usage (diagnostic to detect loading of un-needed skims)

    FIXME - have not yet implemented a facility to avoid loading of unused skims

    Writes a 'skim_usage.txt' report listing the skim keys that were accessed,
    and the omx keys that were loaded but never used.

    Parameters
    ----------
    output_dir: str
    """

    # widen pandas display so any table dumps are not truncated
    pd.options.display.max_columns = 500
    pd.options.display.max_rows = 100

    checkpoints = pipeline.get_checkpoints()
    tables = OrderedDict()

    skim_dict = inject.get_injectable('skim_dict')
    # skim_stack is optional -- None when the model has no 3d (dim3) skims
    skim_stack = inject.get_injectable('skim_stack', None)

    # python2 requires binary mode for this write; python3 requires text mode
    mode = 'wb' if sys.version_info < (3, ) else 'w'
    with open(config.output_file_path('skim_usage.txt'), mode) as output_file:

        print("\n### skim_dict usage", file=output_file)
        for key in skim_dict.usage:
            print(key, file=output_file)

        if skim_stack is None:

            # no skim_stack: any omx key never seen in skim_dict.usage is unused
            unused_keys = {k for k in skim_dict.skim_info['omx_keys']} - \
                          {k for k in skim_dict.usage}

            print("\n### unused skim keys", file=output_file)
            for key in unused_keys:
                print(key, file=output_file)

        else:

            print("\n### skim_stack usage", file=output_file)
            for key in skim_stack.usage:
                print(key, file=output_file)

            # plain (non-tuple) omx keys never used directly
            unused = {k for k in skim_dict.skim_info['omx_keys'] if not isinstance(k, tuple)} - \
                     {k for k in skim_dict.usage if not isinstance(k, tuple)}

            print("\n### unused skim str keys", file=output_file)
            for key in unused:
                print(key, file=output_file)

            # tuple-keyed (dim3) omx keys used neither via skim_dict nor skim_stack
            unused = {k[0] for k in skim_dict.skim_info['omx_keys'] if isinstance(k, tuple)} - \
                     {k[0] for k in skim_dict.usage if isinstance(k, tuple)} - \
                     {k for k in skim_stack.usage}

            print("\n### unused skim dim3 keys", file=output_file)
            for key in unused:
                print(key, file=output_file)
def track_skim_usage(output_dir):
    """
    write statistics on skim usage (diagnostic to detect loading of un-needed skims)

    FIXME - have not yet implemented a facility to avoid loading of unused skims

    Parameters
    ----------
    output_dir: str
    """

    pd.options.display.max_columns = 500
    pd.options.display.max_rows = 100

    checkpoints = pipeline.get_checkpoints()
    tables = OrderedDict()

    skim_dict = inject.get_injectable('skim_dict')
    skim_stack = inject.get_injectable('skim_stack', None)

    # binary mode on python2, text mode on python3
    file_mode = 'w' if sys.version_info >= (3,) else 'wb'

    with open(config.output_file_path('skim_usage.txt'), file_mode) as report:

        def emit(line):
            # write one line of the usage report
            print(line, file=report)

        emit("\n### skim_dict usage")
        for used_key in skim_dict.usage:
            emit(used_key)

        if skim_stack is None:

            # without a skim_stack, every omx key not in skim_dict.usage is unused
            never_used = set(skim_dict.skim_info['omx_keys']) - set(skim_dict.usage)

            emit("\n### unused skim keys")
            for idle_key in never_used:
                emit(idle_key)

        else:

            emit("\n### skim_stack usage")
            for used_key in skim_stack.usage:
                emit(used_key)

            omx_keys = skim_dict.skim_info['omx_keys']

            # plain string keys loaded but never accessed
            idle_str_keys = \
                {k for k in omx_keys if not isinstance(k, tuple)} - \
                {k for k in skim_dict.usage if not isinstance(k, tuple)}

            emit("\n### unused skim str keys")
            for idle_key in idle_str_keys:
                emit(idle_key)

            # tuple (dim3) keys used neither via skim_dict nor skim_stack
            idle_dim3_keys = \
                {k[0] for k in omx_keys if isinstance(k, tuple)} - \
                {k[0] for k in skim_dict.usage if isinstance(k, tuple)} - \
                set(skim_stack.usage)

            emit("\n### unused skim dim3 keys")
            for idle_key in idle_dim3_keys:
                emit(idle_key)
def test_pipeline_run():
    """Run a short model sequence and exercise get_table checkpoint semantics."""

    for step_name, step_func in [('step1', steps.step1),
                                 ('step2', steps.step2),
                                 ('step3', steps.step3),
                                 ('step_add_col', steps.step_add_col)]:
        inject.add_step(step_name, step_func)

    inject.dump_state()

    _MODELS = [
        'step1',
        'step2',
        'step3',
        'step_add_col.table_name=table2;column_name=c2'
    ]

    pipeline.run(models=_MODELS, resume_after=None)

    checkpoints = pipeline.get_checkpoints()
    print("checkpoints\n", checkpoints)

    c2 = pipeline.get_table("table2").c2

    # fetch a table as of an earlier checkpoint
    pipeline.get_table("table1", checkpoint_name="step3")

    # a table requested at a checkpoint that precedes its creation must fail
    with pytest.raises(RuntimeError) as excinfo:
        pipeline.get_table("table2", checkpoint_name="step1")
    assert "not in checkpoint 'step1'" in str(excinfo.value)

    # a table that was never checkpointed must fail
    with pytest.raises(RuntimeError) as excinfo:
        pipeline.get_table("bogus")
    assert "never checkpointed" in str(excinfo.value)

    # a real table at a nonexistent checkpoint must fail
    with pytest.raises(RuntimeError) as excinfo:
        pipeline.get_table("table1", checkpoint_name="bogus")
    assert "not in checkpoints" in str(excinfo.value)

    pipeline.close_pipeline()
    close_handlers()
def write_data_dictionary(output_dir):
    """
    Write table schema for all tables

    model settings
        txt_format: output text file name (default data_dict.txt) or empty to suppress txt output
        csv_format: output csv file name (default data_dict.csv) or empty to suppress csv output

        schema_tables: list of tables to include in output (defaults to all checkpointed tables)

    for each table, write column names, dtype, and checkpoint added)

    text format writes individual table schemas to a single text file
    csv format writes all tables together with an additional table_name column

    Parameters
    ----------
    output_dir: str
    """

    model_settings = config.read_model_settings('write_data_dictionary')
    txt_format = model_settings.get('txt_format', 'data_dict.txt')
    csv_format = model_settings.get('csv_format', 'data_dict.csv')

    # nothing to do if both output formats are suppressed
    if not (csv_format or txt_format):
        logger.warning(
            f"write_data_dictionary step invoked but neither 'txt_format' nor 'csv_format' specified"
        )
        return

    table_names = pipeline.checkpointed_tables()

    # use table_names list from model_settings, if provided
    schema_tables = model_settings.get('tables', None)
    if schema_tables:
        table_names = [c for c in schema_tables if c in table_names]

    # initialize schema as dict of dataframe[table_name, column_name, dtype, checkpoint]
    schema = dict()
    final_shapes = dict()
    for table_name in table_names:
        df = pipeline.get_table(table_name)

        final_shapes[table_name] = df.shape

        # surface a named index as a column so it appears in the schema
        if df.index.name and df.index.name not in df.columns:
            df = df.reset_index()
        info = df.dtypes.astype(str).to_frame('dtype').reset_index().rename(
            columns={'index': 'column_name'})
        # checkpoint column is filled in by the annotation pass below
        info['checkpoint'] = ''

        info.insert(loc=0, column='table_name', value=table_name)
        schema[table_name] = info

    # annotate schema.info with name of checkpoint columns were first seen
    for _, row in pipeline.get_checkpoints().iterrows():

        checkpoint_name = row[pipeline.CHECKPOINT_NAME]

        for table_name in table_names:

            # no change to table in this checkpoint
            if row[table_name] != checkpoint_name:
                continue

            # get the checkpointed version of the table
            df = pipeline.get_table(table_name, checkpoint_name)

            if df.index.name and df.index.name not in df.columns:
                df = df.reset_index()

            info = schema.get(table_name, None)

            # tag any new columns with checkpoint name
            prev_columns = info[info.checkpoint != ''].column_name.values
            new_cols = [c for c in df.columns.values if c not in prev_columns]
            is_new_column_this_checkpoont = info.column_name.isin(new_cols)
            info.checkpoint = np.where(is_new_column_this_checkpoont, checkpoint_name, info.checkpoint)
            schema[table_name] = info

    # omnibus schema with one row per (table, column)
    schema_df = pd.concat(schema.values())

    if csv_format:
        schema_df.to_csv(config.output_file_path(csv_format), header=True, index=False)

    if txt_format:
        with open(config.output_file_path(txt_format), 'w') as output_file:

            # get max schema column widths from omnibus table
            col_width = {c: schema_df[c].str.len().max() + 2 for c in schema_df}

            for table_name in table_names:
                info = schema.get(table_name, None)

                columns_to_print = ['column_name', 'dtype', 'checkpoint']
                info = info[columns_to_print].copy()

                # normalize schema columns widths across all table schemas for unified output formatting
                for c in info:
                    info[c] = info[c].str.pad(col_width[c], side='right')
                info.columns = [c.ljust(col_width[c]) for c in info.columns]
                info = info.to_string(index=False)

                print(f"###\n### {table_name} {final_shapes[table_name]}\n###\n", file=output_file)
                print(f"{info}\n", file=output_file)
def write_tables(output_dir):
    """
    Write pipeline tables as csv files (in output directory) as specified by output_tables list
    in settings file.

    'output_tables' can specify either a list of output tables to include or to skip
    if no output_tables list is specified, then no checkpointed tables will be written

    To write all output tables EXCEPT the households and persons tables:

    ::

      output_tables:
        action: skip
        tables:
          - households
          - persons

    To write ONLY the households table:

    ::

      output_tables:
        action: include
        tables:
           - households

    To write tables into a single HDF5 store instead of individual CSVs, use the h5_store flag:

    ::

      output_tables:
        h5_store: True
        action: include
        tables:
           - households

    Parameters
    ----------
    output_dir: str

    Raises
    ------
    RuntimeError
        if 'action' is neither 'include' nor 'skip'
    """

    output_tables_settings_name = 'output_tables'

    output_tables_settings = setting(output_tables_settings_name)

    if output_tables_settings is None:
        logger.info("No output_tables specified in settings file. Nothing to write.")
        return

    action = output_tables_settings.get('action')
    tables = output_tables_settings.get('tables')
    prefix = output_tables_settings.get('prefix', 'final_')
    h5_store = output_tables_settings.get('h5_store', False)
    sort = output_tables_settings.get('sort', False)

    checkpointed_tables = pipeline.checkpointed_tables()
    if action == 'include':
        output_tables_list = tables
    elif action == 'skip':
        output_tables_list = [t for t in checkpointed_tables if t not in tables]
    else:
        # fixed: raising a bare string is a TypeError in python3 -- raise a real exception
        raise RuntimeError("expected %s action '%s' to be either 'include' or 'skip'" %
                           (output_tables_settings_name, action))

    for table_name in output_tables_list:

        if table_name == 'checkpoints':
            df = pipeline.get_checkpoints()
        else:
            if table_name not in checkpointed_tables:
                logger.warning("Skipping '%s': Table not found." % table_name)
                continue
            df = pipeline.get_table(table_name)

            if sort:
                traceable_table_indexes = inject.get_injectable('traceable_table_indexes', {})

                if df.index.name in traceable_table_indexes:
                    df = df.sort_index()
                    logger.debug(
                        f"write_tables sorting {table_name} on index {df.index.name}")
                else:
                    # find all registered columns we can use to sort this table
                    # (they are ordered appropriately in traceable_table_indexes)
                    sort_columns = [c for c in traceable_table_indexes if c in df.columns]
                    if len(sort_columns) > 0:
                        df = df.sort_values(by=sort_columns)
                        logger.debug(
                            f"write_tables sorting {table_name} on columns {sort_columns}")
                    else:
                        logger.debug(
                            f"write_tables sorting {table_name} on unrecognized index {df.index.name}")
                        df = df.sort_index()

        if h5_store:
            # append every table to a single hdf5 store
            file_path = config.output_file_path('%soutput_tables.h5' % prefix)
            df.to_hdf(file_path, key=table_name, mode='a', format='fixed')
        else:
            file_name = "%s%s.csv" % (prefix, table_name)
            file_path = config.output_file_path(file_name)

            # include the index if it has a name or is a MultiIndex
            write_index = df.index.name is not None or isinstance(df.index, pd.MultiIndex)
            df.to_csv(file_path, index=write_index)
def test_mini_pipeline_run2():
    """Restart the pipeline from a checkpoint and verify identical results.

    The important thing here is that we should get exactly the same results
    as for test_mini_pipeline_run when we restart pipeline.
    """

    configs_dir = os.path.join(os.path.dirname(__file__), 'configs')
    orca.add_injectable("configs_dir", configs_dir)

    output_dir = os.path.join(os.path.dirname(__file__), 'output')
    orca.add_injectable("output_dir", output_dir)

    data_dir = os.path.join(os.path.dirname(__file__), 'data')
    orca.add_injectable("data_dir", data_dir)

    inject_settings(configs_dir, households_sample_size=HOUSEHOLDS_SAMPLE_SIZE)

    orca.clear_cache()

    # should be able to get this BEFORE pipeline is opened
    checkpoints_df = pipeline.get_checkpoints()
    prev_checkpoint_count = len(checkpoints_df.index)

    # print "checkpoints_df\n", checkpoints_df[['checkpoint_name']]
    assert prev_checkpoint_count == 11

    pipeline.open_pipeline('auto_ownership_simulate')

    auto_choice = pipeline.get_table("households").auto_ownership

    # regression test: these are the same as in test_mini_pipeline_run1
    hh_ids = [464138, 1918238, 2201602]
    choices = [0, 1, 2]
    expected_choice = pd.Series(choices, index=pd.Index(hh_ids, name="HHID"),
                                name='auto_ownership')

    # fixed: python2 print statement is a SyntaxError under python3
    print("auto_choice\n", auto_choice.head(4))
    pdt.assert_series_equal(auto_choice[hh_ids], expected_choice)

    # try to run a model already in pipeline
    with pytest.raises(RuntimeError) as excinfo:
        pipeline.run_model('auto_ownership_simulate')
    assert "run model 'auto_ownership_simulate' more than once" in str(
        excinfo.value)

    # and these new ones
    pipeline.run_model('cdap_simulate')
    pipeline.run_model('mandatory_tour_frequency')

    mtf_choice = pipeline.get_table("persons").mandatory_tour_frequency

    per_ids = [24375, 92744, 172491]
    choices = ['school2', 'work_and_school', 'work1']
    expected_choice = pd.Series(choices, index=pd.Index(per_ids, name='PERID'),
                                name='mandatory_tour_frequency')

    print("mtf_choice\n", mtf_choice.head(20))
    pdt.assert_series_equal(mtf_choice[per_ids], expected_choice)

    # should be able to get this before pipeline is closed (from existing open store)
    checkpoints_df = pipeline.get_checkpoints()
    assert len(checkpoints_df.index) == prev_checkpoint_count

    pipeline.close_pipeline()
    orca.clear_cache()
def write_tables(output_dir):
    """
    Write pipeline tables as csv files (in output directory) as specified by output_tables list
    in settings file.

    'output_tables' can specify either a list of output tables to include or to skip
    if no output_tables list is specified, then no checkpointed tables will be written

    To write all output tables EXCEPT the households and persons tables:

    ::

      output_tables:
        action: skip
        tables:
          - households
          - persons

    To write ONLY the households table:

    ::

      output_tables:
        action: include
        tables:
           - households

    To write tables into a single HDF5 store instead of individual CSVs, use the h5_store flag:

    ::

      output_tables:
        h5_store: True
        action: include
        tables:
           - households

    Parameters
    ----------
    output_dir: str

    Raises
    ------
    RuntimeError
        if 'action' is neither 'include' nor 'skip'
    """

    output_tables_settings_name = 'output_tables'

    output_tables_settings = setting(output_tables_settings_name)

    if output_tables_settings is None:
        logger.info("No output_tables specified in settings file. Nothing to write.")
        return

    action = output_tables_settings.get('action')
    tables = output_tables_settings.get('tables')
    prefix = output_tables_settings.get('prefix', 'final_')
    h5_store = output_tables_settings.get('h5_store', False)

    if action not in ['include', 'skip']:
        # fixed: raising a bare string is a TypeError in python3 -- raise a real exception
        raise RuntimeError("expected %s action '%s' to be either 'include' or 'skip'" %
                           (output_tables_settings_name, action))

    checkpointed_tables = pipeline.checkpointed_tables()
    if action == 'include':
        output_tables_list = tables
    elif action == 'skip':
        output_tables_list = [t for t in checkpointed_tables if t not in tables]

    for table_name in output_tables_list:

        if table_name == 'checkpoints':
            df = pipeline.get_checkpoints()
        else:
            if table_name not in checkpointed_tables:
                logger.warning("Skipping '%s': Table not found." % table_name)
                continue
            df = pipeline.get_table(table_name)

        if h5_store:
            # append every table to a single hdf5 store
            file_path = config.output_file_path('%soutput_tables.h5' % prefix)
            df.to_hdf(file_path, key=table_name, mode='a', format='fixed')
        else:
            file_name = "%s%s.csv" % (prefix, table_name)
            file_path = config.output_file_path(file_name)

            # include the index if it has a name or is a MultiIndex
            write_index = df.index.name is not None or isinstance(df.index, pd.MultiIndex)
            df.to_csv(file_path, index=write_index)
# run the configured model list, optionally resuming from a checkpoint,
# then dump all checkpointed tables and the checkpoint log to CSV
MODELS = setting('models')

# If you provide a resume_after argument to pipeline.run
# the pipeline manager will attempt to load checkpointed tables from the checkpoint store
# and resume pipeline processing on the next submodel step after the specified checkpoint
resume_after = setting('resume_after', None)

if resume_after:
    # fixed: python2 print statement is a SyntaxError under python3
    print("resume_after", resume_after)

pipeline.run(models=MODELS, resume_after=resume_after)

print("\n#### run completed")

# write final versions of all checkpointed dataframes to CSV files to review results
for table_name in pipeline.checkpointed_tables():
    file_name = "final_%s_table.csv" % table_name
    file_path = os.path.join(orca.get_injectable("output_dir"), file_name)
    pipeline.get_table(table_name).to_csv(file_path)

# write checkpoints
file_path = os.path.join(orca.get_injectable("output_dir"), "checkpoints.csv")
pipeline.get_checkpoints().to_csv(file_path)

# tables will no longer be available after pipeline is closed
pipeline.close_pipeline()

t0 = print_elapsed_time("all models", t0)
def test_mini_pipeline_run2():
    """Restart the pipeline from a checkpoint and verify identical results.

    The important thing here is that we should get exactly the same results
    as for test_mini_pipeline_run when we restart pipeline.
    """

    configs_dir = os.path.join(os.path.dirname(__file__), 'configs')
    orca.add_injectable("configs_dir", configs_dir)

    output_dir = os.path.join(os.path.dirname(__file__), 'output')
    orca.add_injectable("output_dir", output_dir)

    data_dir = os.path.join(os.path.dirname(__file__), 'data')
    orca.add_injectable("data_dir", data_dir)

    inject_settings(configs_dir, households_sample_size=HOUSEHOLDS_SAMPLE_SIZE)

    orca.clear_cache()

    # should be able to get this BEFORE pipeline is opened
    checkpoints_df = pipeline.get_checkpoints()
    prev_checkpoint_count = len(checkpoints_df.index)
    assert prev_checkpoint_count == 7

    pipeline.start_pipeline('auto_ownership_simulate')

    auto_choice = pipeline.get_table("households").auto_ownership

    # regression test: these are the 2nd-4th households in households table
    hh_ids = [26960, 857296, 93428]
    choices = [0, 1, 0]
    expected_auto_choice = pd.Series(choices, index=pd.Index(hh_ids, name="HHID"),
                                     name='auto_ownership')

    # fixed: python2 print statement is a SyntaxError under python3
    print("auto_choice\n", auto_choice.head(4))
    pdt.assert_series_equal(auto_choice[hh_ids], expected_auto_choice)

    # try to run a model already in pipeline
    with pytest.raises(RuntimeError) as excinfo:
        pipeline.run_model('auto_ownership_simulate')
    assert "run model 'auto_ownership_simulate' more than once" in str(
        excinfo.value)

    # and these new ones
    pipeline.run_model('cdap_simulate')
    pipeline.run_model('mandatory_tour_frequency')

    mtf_choice = pipeline.get_table("persons").mandatory_tour_frequency

    per_ids = [92363, 92681, 93428]
    choices = ['work1', 'school1', 'school2']
    expected_choice = pd.Series(choices, index=pd.Index(per_ids, name='PERID'),
                                name='mandatory_tour_frequency')

    print("mtf_choice\n", mtf_choice.head(20))
    pdt.assert_series_equal(mtf_choice[per_ids], expected_choice)

    # should be able to get this before pipeline is closed (from existing open store)
    assert orca.get_injectable('pipeline_store') is not None
    checkpoints_df = pipeline.get_checkpoints()
    assert len(checkpoints_df.index) == prev_checkpoint_count

    pipeline.close()

    # should also be able to get this after pipeline is closed (open and close)
    assert orca.get_injectable('pipeline_store') is None
    checkpoints_df = pipeline.get_checkpoints()
    assert len(checkpoints_df.index) == prev_checkpoint_count

    orca.clear_cache()
def write_tables(output_dir):
    """
    Write pipeline tables as csv files (in output directory) as specified by output_tables list
    in settings file.

    'output_tables' can specify either a list of output tables to include or to skip
    if no output_tables list is specified, then no checkpointed tables will be written

    To write all output tables EXCEPT the households and persons tables:

    ::

      output_tables:
        action: skip
        tables:
          - households
          - persons

    To write ONLY the households table:

    ::

      output_tables:
        action: include
        tables:
           - households

    Parameters
    ----------
    output_dir: str

    Raises
    ------
    RuntimeError
        if 'action' is neither 'include' nor 'skip'
    """

    output_tables_settings_name = 'output_tables'

    output_tables_settings = setting(output_tables_settings_name)

    if output_tables_settings is None:
        logger.info("No output_tables specified in settings file. Nothing to write.")
        return

    action = output_tables_settings.get('action')
    tables = output_tables_settings.get('tables')
    prefix = output_tables_settings.get('prefix', 'final_')

    if action not in ['include', 'skip']:
        # fixed: raising a bare string is a TypeError in python3 -- raise a real exception
        raise RuntimeError("expected %s action '%s' to be either 'include' or 'skip'" %
                           (output_tables_settings_name, action))

    checkpointed_tables = pipeline.checkpointed_tables()
    if action == 'include':
        output_tables_list = tables
    elif action == 'skip':
        output_tables_list = [t for t in checkpointed_tables if t not in tables]

    for table_name in output_tables_list:

        if table_name == 'checkpoints':
            df = pipeline.get_checkpoints()
        else:
            if table_name not in checkpointed_tables:
                logger.warning("Skipping '%s': Table not found." % table_name)
                continue
            df = pipeline.get_table(table_name)

        file_name = "%s%s.csv" % (prefix, table_name)
        file_path = config.output_file_path(file_name)

        # include the index if it has a name or is a MultiIndex
        # fixed: pd.core.index.MultiIndex is a private path removed in modern pandas
        write_index = df.index.name is not None or isinstance(df.index, pd.MultiIndex)
        df.to_csv(file_path, index=write_index)
def write_tables(output_dir):
    """
    Write pipeline tables as csv files (in output directory) as specified by output_tables list
    in settings file.

    'output_tables' can specify either a list of output tables to include or to skip
    if no output_tables list is specified, then no checkpointed tables will be written

    To write all output tables EXCEPT the households and persons tables:

    ::

      output_tables:
        action: skip
        tables:
          - households
          - persons

    To write ONLY the households table:

    ::

      output_tables:
        action: include
        tables:
           - households

    Parameters
    ----------
    output_dir: str

    Raises
    ------
    RuntimeError
        if 'action' is neither 'include' nor 'skip'
    """

    output_tables_settings_name = 'output_tables'

    output_tables_settings = setting(output_tables_settings_name)

    output_tables_list = pipeline.checkpointed_tables()

    if output_tables_settings is None:
        logger.info("No output_tables specified in settings file. Nothing to write.")
        return

    action = output_tables_settings.get('action')
    tables = output_tables_settings.get('tables')
    prefix = output_tables_settings.get('prefix', 'final_')

    if action not in ['include', 'skip']:
        # fixed: raising a bare string is a TypeError in python3 -- raise a real exception
        raise RuntimeError("expected %s action '%s' to be either 'include' or 'skip'" %
                           (output_tables_settings_name, action))

    if action == 'include':
        output_tables_list = tables
    elif action == 'skip':
        output_tables_list = [t for t in output_tables_list if t not in tables]

    # should provide option to also write checkpoints?
    # output_tables_list.append("checkpoints.csv")

    for table_name in output_tables_list:
        table = inject.get_table(table_name, None)
        if table is None:
            # fixed: logger.warn is deprecated -- use logger.warning
            logger.warning("Skipping '%s': Table not found." % table_name)
            continue
        df = table.to_frame()
        file_name = "%s%s.csv" % (prefix, table_name)
        logger.info("writing output file %s" % file_name)
        file_path = os.path.join(output_dir, file_name)
        # include the index only if it has a name
        write_index = df.index.name is not None
        df.to_csv(file_path, index=write_index)

    # write checkpoints when they were explicitly included
    # (or when skipping and 'checkpoints' was not in the skip list)
    if (action == 'include') == ('checkpoints' in tables):
        # write checkpoints
        file_name = "%s%s.csv" % (prefix, 'checkpoints')
        pipeline.get_checkpoints().to_csv(os.path.join(output_dir, file_name))