Ejemplo n.º 1
0
def test_mini_pipeline_run2():
    """Restart the mini pipeline and extend it with two more models.

    The important thing is that restarting the pipeline gives exactly the
    same results as test_mini_pipeline_run. The test also writes a list of
    household ids to override_hh_ids.csv for use in the next test.
    """
    here = os.path.dirname(__file__)
    configs_dir = os.path.join(here, 'configs')

    setup_dirs(configs_dir)
    inject_settings(configs_dir, households_sample_size=HOUSEHOLDS_SAMPLE_SIZE)

    # the checkpoint list must be readable BEFORE the pipeline is opened
    prev_checkpoint_count = len(pipeline.get_checkpoints().index)
    assert prev_checkpoint_count == 8

    pipeline.open_pipeline('auto_ownership_simulate')

    regress_mini_auto()

    # running a model that is already in the pipeline must be rejected
    with pytest.raises(RuntimeError) as excinfo:
        pipeline.run_model('auto_ownership_simulate')
    error_text = str(excinfo.value)
    assert "run model 'auto_ownership_simulate' more than once" in error_text

    # models that are new to the pipeline are run one after the other
    for model_name in ('cdap_simulate', 'mandatory_tour_frequency'):
        pipeline.run_model(model_name)

    regress_mini_mtf()

    # the checkpoint list must also be readable before the pipeline is closed
    # (from the existing open store)
    assert len(pipeline.get_checkpoints().index) == prev_checkpoint_count

    # write the first num_hh_ids household ids to override_hh_ids.csv in the
    # data directory for use in the next test
    num_hh_ids = 10
    households = pipeline.get_table("households")
    override_hh_ids = pd.DataFrame(
        {'household_id': households.head(num_hh_ids).index.values})

    csv_path = os.path.join(inject.get_injectable('data_dir'),
                            'override_hh_ids.csv')
    override_hh_ids.to_csv(csv_path, index=False, header=True)

    pipeline.close_pipeline()
    inject.clear_cache()
    close_handlers()
Ejemplo n.º 2
0
def test_mini_pipeline_run2():
    # Restarting the pipeline must give exactly the same results as
    # test_mini_pipeline_run.

    test_dir = os.path.dirname(__file__)
    configs_dir = os.path.join(test_dir, 'configs')
    setup_dirs(configs_dir)
    inject_settings(configs_dir, households_sample_size=HOUSEHOLDS_SAMPLE_SIZE)

    # checkpoints have to be available while the pipeline is not yet open
    checkpoints_before = pipeline.get_checkpoints()
    prev_checkpoint_count = len(checkpoints_before.index)
    assert prev_checkpoint_count == 8

    pipeline.open_pipeline('auto_ownership_simulate')
    regress_mini_auto()

    # re-running a model that is already in the pipeline is an error
    with pytest.raises(RuntimeError) as excinfo:
        pipeline.run_model('auto_ownership_simulate')
    message = str(excinfo.value)
    assert "run model 'auto_ownership_simulate' more than once" in message

    # two models that have not been run yet
    pipeline.run_model('cdap_simulate')
    pipeline.run_model('mandatory_tour_frequency')
    regress_mini_mtf()

    # checkpoints have to be available from the still-open store as well
    checkpoints_after = pipeline.get_checkpoints()
    assert len(checkpoints_after.index) == prev_checkpoint_count

    # save a list of override household ids as override_hh_ids.csv in the
    # data directory; the next test uses it
    num_hh_ids = 10
    first_households = pipeline.get_table("households").head(num_hh_ids)
    hh_id_frame = pd.DataFrame({'household_id': first_households.index.values})

    data_dir = inject.get_injectable('data_dir')
    override_path = os.path.join(data_dir, 'override_hh_ids.csv')
    hh_id_frame.to_csv(override_path, index=False, header=True)

    pipeline.close_pipeline()
    inject.clear_cache()
    close_handlers()
Ejemplo n.º 3
0
def test_pipeline_checkpoint_drop():
    """Check pipeline.get_table behaviour for tables that were never
    checkpointed and for tables that were dropped.

    Expected outcome after running the model list below:
      - table1 can be read from the current checkpoint
      - table2 raises RuntimeError containing "never checkpointed"
      - table3 raises RuntimeError containing "was dropped", but can still be
        read from the earlier checkpoint "step3"
    """

    setup()

    # NOTE(review): the leading underscore on a step name presumably tells the
    # pipeline not to checkpoint that step - confirm against pipeline.run
    _MODELS = [
        'step1',
        '_step2',
        '_step_add_col.table_name=table2;column_name=c2',
        '_step_forget_tab.table_name=table2',
        'step3',
        'step_forget_tab.table_name=table3',
    ]
    pipeline.run(models=_MODELS, resume_after=None)

    checkpoints = pipeline.get_checkpoints()
    # print() with a single pre-formatted argument is valid (and produces the
    # same text) under both Python 2 and Python 3
    print("checkpoints\n{}".format(checkpoints))

    pipeline.get_table("table1")

    with pytest.raises(RuntimeError) as excinfo:
        pipeline.get_table("table2")
    assert "never checkpointed" in str(excinfo.value)

    # can't get a dropped table from current checkpoint
    with pytest.raises(RuntimeError) as excinfo:
        pipeline.get_table("table3")
    assert "was dropped" in str(excinfo.value)

    # ensure that we can still get table3 from a checkpoint at which it existed
    pipeline.get_table("table3", checkpoint_name="step3")

    pipeline.close_pipeline()
    close_handlers()
Ejemplo n.º 4
0
def track_skim_usage(output_dir):
    """
    write statistics on skim usage (diagnostic to detect loading of un-needed skims)

    The report is written to 'skim_usage.txt' (path resolved by
    config.output_file_path) and contains:

      - every key in skim_dict.usage
      - if there is no 'skim_stack' injectable: every omx key that does not
        appear in skim_dict.usage
      - otherwise: every key in skim_stack.usage, the unused non-tuple omx
        keys, and the unused tuple ("dim3") omx keys, identified by their
        first element

    FIXME - have not yet implemented a facility to avoid loading of unused skims

    Parameters
    ----------
    output_dir: str
        not referenced by this function

    """
    pd.options.display.max_columns = 500
    pd.options.display.max_rows = 100

    skim_dict = inject.get_injectable('skim_dict')
    skim_stack = inject.get_injectable('skim_stack', None)

    mode = 'wb' if sys.version_info < (3, ) else 'w'
    with open(config.output_file_path('skim_usage.txt'), mode) as output_file:

        def write_keys(heading, keys):
            # one "### heading" line followed by one key per line
            print("\n### " + heading, file=output_file)
            for key in keys:
                print(key, file=output_file)

        write_keys("skim_dict usage", skim_dict.usage)

        omx_keys = skim_dict.skim_info['omx_keys']

        if skim_stack is None:

            write_keys("unused skim keys", set(omx_keys) - set(skim_dict.usage))

        else:

            write_keys("skim_stack usage", skim_stack.usage)

            unused_str_keys = \
                {k for k in omx_keys if not isinstance(k, tuple)} - \
                {k for k in skim_dict.usage if not isinstance(k, tuple)}
            write_keys("unused skim str keys", unused_str_keys)

            # computed unconditionally: this must not depend on whether any
            # unused str keys were found above
            unused_dim3_keys = \
                {k[0] for k in omx_keys if isinstance(k, tuple)} - \
                {k[0] for k in skim_dict.usage if isinstance(k, tuple)} - \
                set(skim_stack.usage)
            write_keys("unused skim dim3 keys", unused_dim3_keys)
Ejemplo n.º 5
0
def track_skim_usage(output_dir):
    """
    write statistics on skim usage (diagnostic to detect loading of un-needed skims)

    Output goes to 'skim_usage.txt' (path resolved by config.output_file_path).
    It lists the keys in skim_dict.usage, and then either

      - the omx keys missing from skim_dict.usage (no 'skim_stack' injectable), or
      - the keys in skim_stack.usage, the unused non-tuple omx keys and the
        unused tuple ("dim3") omx keys, the latter reported by first element

    FIXME - have not yet implemented a facility to avoid loading of unused skims

    Parameters
    ----------
    output_dir: str
        not referenced by this function

    """
    pd.options.display.max_columns = 500
    pd.options.display.max_rows = 100

    skim_dict = inject.get_injectable('skim_dict')
    skim_stack = inject.get_injectable('skim_stack', None)

    mode = 'wb' if sys.version_info < (3,) else 'w'
    with open(config.output_file_path('skim_usage.txt'), mode) as output_file:

        def emit_section(title, keys):
            # section header line, then each key on its own line
            print("\n### " + title, file=output_file)
            for key in keys:
                print(key, file=output_file)

        emit_section("skim_dict usage", skim_dict.usage)

        omx_keys = skim_dict.skim_info['omx_keys']

        if skim_stack is None:
            unused_keys = set(omx_keys) - set(skim_dict.usage)
            emit_section("unused skim keys", unused_keys)
        else:
            emit_section("skim_stack usage", skim_stack.usage)

            used_str = {k for k in skim_dict.usage if not isinstance(k, tuple)}
            unused_str = {k for k in omx_keys if not isinstance(k, tuple)} - used_str
            emit_section("unused skim str keys", unused_str)

            # the dim3 set is built outside of any loop so that it is reported
            # even when there are no unused str keys
            used_dim3 = {k[0] for k in skim_dict.usage if isinstance(k, tuple)}
            unused_dim3 = {k[0] for k in omx_keys if isinstance(k, tuple)} - \
                used_dim3 - set(skim_stack.usage)
            emit_section("unused skim dim3 keys", unused_dim3)
Ejemplo n.º 6
0
def test_pipeline_run():
    """Run a four-step pipeline and exercise pipeline.get_table lookups,
    including the error messages for invalid table/checkpoint combinations.
    """

    # register each step under the name of the function in the steps module
    for step_name in ('step1', 'step2', 'step3', 'step_add_col'):
        inject.add_step(step_name, getattr(steps, step_name))
    inject.dump_state()

    _MODELS = ['step1', 'step2', 'step3']
    _MODELS.append('step_add_col.table_name=table2;column_name=c2')

    pipeline.run(models=_MODELS, resume_after=None)

    checkpoints = pipeline.get_checkpoints()
    print("checkpoints\n", checkpoints)

    # attribute access fails if table2 has no column c2
    table2 = pipeline.get_table("table2")
    c2 = table2.c2

    # get table from an explicitly named checkpoint
    pipeline.get_table("table1", checkpoint_name="step3")

    # each entry: positional args, keyword args, expected error message fragment
    failing_lookups = [
        # a table requested from a step before it was checkpointed
        (("table2",), {"checkpoint_name": "step1"}, "not in checkpoint 'step1'"),
        # a non-existent table
        (("bogus",), {}, "never checkpointed"),
        # an existing table from a non-existent checkpoint
        (("table1",), {"checkpoint_name": "bogus"}, "not in checkpoints"),
    ]
    for args, kwargs, expected_fragment in failing_lookups:
        with pytest.raises(RuntimeError) as excinfo:
            pipeline.get_table(*args, **kwargs)
        assert expected_fragment in str(excinfo.value)

    pipeline.close_pipeline()

    close_handlers()
Ejemplo n.º 7
0
def write_data_dictionary(output_dir):
    """
    Write table schema for all tables

    model settings
        txt_format: output text file name (default data_dict.txt) or empty to suppress txt output
        csv_format: output csv file name (default data_dict.csv) or empty to suppress csv output

        tables: list of tables to include in output (defaults to all checkpointed tables)

    for each table, write column names, dtype, and checkpoint added

    text format writes individual table schemas to a single text file
    csv format writes all tables together with an additional table_name column

    If neither output format is specified, or there are no tables to describe,
    a warning is logged and nothing is written.

    Parameters
    ----------
    output_dir: str
        not referenced by this function; output paths are resolved by
        config.output_file_path

    """

    model_settings = config.read_model_settings('write_data_dictionary')
    txt_format = model_settings.get('txt_format', 'data_dict.txt')
    csv_format = model_settings.get('csv_format', 'data_dict.csv')

    if not (csv_format or txt_format):
        logger.warning(
            "write_data_dictionary step invoked but neither 'txt_format' nor 'csv_format' specified"
        )
        return

    table_names = pipeline.checkpointed_tables()

    # use table_names list from model_settings, if provided
    # (requested tables that were never checkpointed are silently dropped)
    schema_tables = model_settings.get('tables', None)
    if schema_tables:
        table_names = [c for c in schema_tables if c in table_names]

    # pd.concat below raises ValueError on an empty collection
    if len(table_names) == 0:
        logger.warning(
            "write_data_dictionary step invoked but there are no tables to describe")
        return

    # initialize schema as dict of dataframe[table_name, column_name, dtype, checkpoint]
    schema = dict()
    final_shapes = dict()
    for table_name in table_names:
        df = pipeline.get_table(table_name)

        # shape is recorded before the index is (possibly) turned into a column
        final_shapes[table_name] = df.shape

        # report a named index as if it were a regular column
        if df.index.name and df.index.name not in df.columns:
            df = df.reset_index()
        info = df.dtypes.astype(str).to_frame('dtype').reset_index().rename(
            columns={'index': 'column_name'})
        info['checkpoint'] = ''

        info.insert(loc=0, column='table_name', value=table_name)
        schema[table_name] = info

    # annotate schema.info with name of checkpoint columns were first seen
    for _, row in pipeline.get_checkpoints().iterrows():

        checkpoint_name = row[pipeline.CHECKPOINT_NAME]

        for table_name in table_names:

            # no change to table in this checkpoint
            if row[table_name] != checkpoint_name:
                continue

            # get the checkpointed version of the table
            df = pipeline.get_table(table_name, checkpoint_name)

            if df.index.name and df.index.name not in df.columns:
                df = df.reset_index()

            info = schema.get(table_name, None)

            # tag any new columns with checkpoint name
            # (columns already tagged by an earlier checkpoint keep their tag)
            prev_columns = info[info.checkpoint != ''].column_name.values
            new_cols = [c for c in df.columns.values if c not in prev_columns]
            is_new_column_this_checkpoint = info.column_name.isin(new_cols)
            info.checkpoint = np.where(is_new_column_this_checkpoint,
                                       checkpoint_name, info.checkpoint)

            schema[table_name] = info

    schema_df = pd.concat(schema.values())

    if csv_format:
        schema_df.to_csv(config.output_file_path(csv_format),
                         header=True,
                         index=False)

    if txt_format:
        with open(config.output_file_path(txt_format), 'w') as output_file:

            # get max schema column widths from omnibus table
            col_width = {
                c: schema_df[c].str.len().max() + 2
                for c in schema_df
            }

            columns_to_print = ['column_name', 'dtype', 'checkpoint']

            for table_name in table_names:
                info = schema.get(table_name, None)
                info = info[columns_to_print].copy()

                # normalize schema columns widths across all table schemas for unified output formatting
                for c in info:
                    info[c] = info[c].str.pad(col_width[c], side='right')
                info.columns = [c.ljust(col_width[c]) for c in info.columns]

                info_text = info.to_string(index=False)

                print(
                    f"###\n### {table_name} {final_shapes[table_name]}\n###\n",
                    file=output_file)
                print(f"{info_text}\n", file=output_file)
Ejemplo n.º 8
0
def write_tables(output_dir):
    """
    Write pipeline tables as csv files (in output directory) as specified by output_tables list
    in settings file.

    'output_tables' can specify either a list of output tables to include or to skip
    if no output_tables list is specified, then no checkpointed tables will be written

    To write all output tables EXCEPT the households and persons tables:

    ::

      output_tables:
        action: skip
        tables:
          - households
          - persons

    To write ONLY the households table:

    ::

      output_tables:
        action: include
        tables:
           - households

    To write tables into a single HDF5 store instead of individual CSVs, use the h5_store flag:

    ::

      output_tables:
        h5_store: True
        action: include
        tables:
           - households

    Other optional keys read from 'output_tables':

      prefix: file name prefix (default 'final_')
      sort: if true, sort each table before writing - by index when the index
        name is in the 'traceable_table_indexes' injectable, otherwise by the
        registered names that are columns of the table, otherwise by index.
        The 'checkpoints' table is never sorted.

    Parameters
    ----------
    output_dir: str
        not referenced by this function; output paths are resolved by
        config.output_file_path

    Raises
    ------
    RuntimeError
        if 'action' is neither 'include' nor 'skip'

    """

    output_tables_settings_name = 'output_tables'

    output_tables_settings = setting(output_tables_settings_name)

    if output_tables_settings is None:
        logger.info(
            "No output_tables specified in settings file. Nothing to write.")
        return

    action = output_tables_settings.get('action')
    # a missing (or empty) 'tables' entry is treated as an empty list
    tables = output_tables_settings.get('tables') or []
    prefix = output_tables_settings.get('prefix', 'final_')
    h5_store = output_tables_settings.get('h5_store', False)
    sort = output_tables_settings.get('sort', False)

    checkpointed_tables = pipeline.checkpointed_tables()
    if action == 'include':
        output_tables_list = tables
    elif action == 'skip':
        output_tables_list = [
            t for t in checkpointed_tables if t not in tables
        ]
    else:
        # raising a bare string is itself a TypeError; raise a real exception
        raise RuntimeError(
            "expected %s action '%s' to be either 'include' or 'skip'" %
            (output_tables_settings_name, action))

    for table_name in output_tables_list:

        if table_name == 'checkpoints':
            df = pipeline.get_checkpoints()
        else:
            if table_name not in checkpointed_tables:
                logger.warning("Skipping '%s': Table not found." % table_name)
                continue
            df = pipeline.get_table(table_name)

            if sort:
                traceable_table_indexes = inject.get_injectable(
                    'traceable_table_indexes', {})

                if df.index.name in traceable_table_indexes:
                    df = df.sort_index()
                    logger.debug(
                        f"write_tables sorting {table_name} on index {df.index.name}"
                    )
                else:
                    # find all registered columns we can use to sort this table
                    # (they are ordered appropriately in traceable_table_indexes)
                    sort_columns = [
                        c for c in traceable_table_indexes if c in df.columns
                    ]
                    if len(sort_columns) > 0:
                        df = df.sort_values(by=sort_columns)
                        logger.debug(
                            f"write_tables sorting {table_name} on columns {sort_columns}"
                        )
                    else:
                        logger.debug(
                            f"write_tables sorting {table_name} on unrecognized index {df.index.name}"
                        )
                        df = df.sort_index()

        if h5_store:
            # all tables are appended to one store, keyed by table name
            file_path = config.output_file_path('%soutput_tables.h5' % prefix)
            df.to_hdf(file_path, key=table_name, mode='a', format='fixed')
        else:
            file_name = "%s%s.csv" % (prefix, table_name)
            file_path = config.output_file_path(file_name)

            # include the index if it has a name or is a MultiIndex
            write_index = df.index.name is not None or isinstance(
                df.index, pd.MultiIndex)

            df.to_csv(file_path, index=write_index)
Ejemplo n.º 9
0
def test_mini_pipeline_run2():
    """Restart the mini pipeline at 'auto_ownership_simulate' and check that
    the regression values match, then run two further models.
    """

    # the important thing here is that we should get
    # exactly the same results as for test_mini_pipeline_run
    # when we restart pipeline

    configs_dir = os.path.join(os.path.dirname(__file__), 'configs')
    orca.add_injectable("configs_dir", configs_dir)

    output_dir = os.path.join(os.path.dirname(__file__), 'output')
    orca.add_injectable("output_dir", output_dir)

    data_dir = os.path.join(os.path.dirname(__file__), 'data')
    orca.add_injectable("data_dir", data_dir)

    inject_settings(configs_dir, households_sample_size=HOUSEHOLDS_SAMPLE_SIZE)

    orca.clear_cache()

    # should be able to get this BEFORE pipeline is opened
    checkpoints_df = pipeline.get_checkpoints()
    prev_checkpoint_count = len(checkpoints_df.index)
    assert prev_checkpoint_count == 11

    pipeline.open_pipeline('auto_ownership_simulate')

    auto_choice = pipeline.get_table("households").auto_ownership

    # regression test: these are the same as in test_mini_pipeline_run1
    hh_ids = [464138, 1918238, 2201602]
    choices = [0, 1, 2]
    expected_choice = pd.Series(choices,
                                index=pd.Index(hh_ids, name="HHID"),
                                name='auto_ownership')

    # print() with a single pre-formatted argument is valid (and produces the
    # same text) under both Python 2 and Python 3
    print("auto_choice\n{}".format(auto_choice.head(4)))
    pdt.assert_series_equal(auto_choice[hh_ids], expected_choice)

    # try to run a model already in pipeline
    with pytest.raises(RuntimeError) as excinfo:
        pipeline.run_model('auto_ownership_simulate')
    assert "run model 'auto_ownership_simulate' more than once" in str(
        excinfo.value)

    # and these new ones
    pipeline.run_model('cdap_simulate')
    pipeline.run_model('mandatory_tour_frequency')

    mtf_choice = pipeline.get_table("persons").mandatory_tour_frequency

    per_ids = [24375, 92744, 172491]
    choices = ['school2', 'work_and_school', 'work1']
    expected_choice = pd.Series(choices,
                                index=pd.Index(per_ids, name='PERID'),
                                name='mandatory_tour_frequency')

    print("mtf_choice\n{}".format(mtf_choice.head(20)))
    pdt.assert_series_equal(mtf_choice[per_ids], expected_choice)

    # should be able to get this before pipeline is closed (from existing open store)
    checkpoints_df = pipeline.get_checkpoints()
    assert len(checkpoints_df.index) == prev_checkpoint_count

    pipeline.close_pipeline()
    orca.clear_cache()
Ejemplo n.º 10
0
def write_tables(output_dir):
    """
    Write pipeline tables as csv files (in output directory) as specified by output_tables list
    in settings file.

    'output_tables' can specify either a list of output tables to include or to skip
    if no output_tables list is specified, then no checkpointed tables will be written

    To write all output tables EXCEPT the households and persons tables:

    ::

      output_tables:
        action: skip
        tables:
          - households
          - persons

    To write ONLY the households table:

    ::

      output_tables:
        action: include
        tables:
           - households

    To write tables into a single HDF5 store instead of individual CSVs, use the h5_store flag:

    ::

      output_tables:
        h5_store: True
        action: include
        tables:
           - households

    An optional 'prefix' key sets the output file name prefix (default 'final_').

    Parameters
    ----------
    output_dir: str
        not referenced by this function; output paths are resolved by
        config.output_file_path

    Raises
    ------
    RuntimeError
        if 'action' is neither 'include' nor 'skip'

    """

    output_tables_settings_name = 'output_tables'

    output_tables_settings = setting(output_tables_settings_name)

    if output_tables_settings is None:
        logger.info(
            "No output_tables specified in settings file. Nothing to write.")
        return

    action = output_tables_settings.get('action')
    # a missing (or empty) 'tables' entry is treated as an empty list
    tables = output_tables_settings.get('tables') or []
    prefix = output_tables_settings.get('prefix', 'final_')
    h5_store = output_tables_settings.get('h5_store', False)

    if action not in ['include', 'skip']:
        # raising a bare string is itself a TypeError; raise a real exception
        raise RuntimeError(
            "expected %s action '%s' to be either 'include' or 'skip'" %
            (output_tables_settings_name, action))

    checkpointed_tables = pipeline.checkpointed_tables()
    if action == 'include':
        output_tables_list = tables
    else:
        # action == 'skip'
        output_tables_list = [
            t for t in checkpointed_tables if t not in tables
        ]

    for table_name in output_tables_list:

        if table_name == 'checkpoints':
            df = pipeline.get_checkpoints()
        else:
            if table_name not in checkpointed_tables:
                logger.warning("Skipping '%s': Table not found." % table_name)
                continue
            df = pipeline.get_table(table_name)

        if h5_store:
            # all tables are appended to one store, keyed by table name
            file_path = config.output_file_path('%soutput_tables.h5' % prefix)
            df.to_hdf(file_path, key=table_name, mode='a', format='fixed')
        else:
            file_name = "%s%s.csv" % (prefix, table_name)
            file_path = config.output_file_path(file_name)

            # include the index if it has a name or is a MultiIndex
            write_index = df.index.name is not None or isinstance(
                df.index, pd.MultiIndex)

            df.to_csv(file_path, index=write_index)
Ejemplo n.º 11
0
# list of model steps to run, read from the 'models' setting
MODELS = setting('models')


# If you provide a resume_after argument to pipeline.run
# the pipeline manager will attempt to load checkpointed tables from the checkpoint store
# and resume pipeline processing on the next submodel step after the specified checkpoint
resume_after = setting('resume_after', None)

if resume_after:
    # print() with a single pre-formatted argument is valid (and produces the
    # same text) under both Python 2 and Python 3
    print("resume_after {}".format(resume_after))

pipeline.run(models=MODELS, resume_after=resume_after)

print("\n#### run completed")

# write final versions of all checkpointed dataframes to CSV files to review results
for table_name in pipeline.checkpointed_tables():
    file_name = "final_%s_table.csv" % table_name
    file_path = os.path.join(orca.get_injectable("output_dir"), file_name)
    pipeline.get_table(table_name).to_csv(file_path)

# write checkpoints
file_path = os.path.join(orca.get_injectable("output_dir"), "checkpoints.csv")
pipeline.get_checkpoints().to_csv(file_path)

# tables will no longer be available after pipeline is closed
pipeline.close_pipeline()

# NOTE(review): t0 is not assigned in this snippet - presumably a start
# timestamp set earlier in the script; confirm
t0 = print_elapsed_time("all models", t0)
Ejemplo n.º 12
0
def test_mini_pipeline_run2():
    """Restart the mini pipeline at 'auto_ownership_simulate', check the
    regression values, run two further models, and check that the checkpoint
    list is readable before opening, while open, and after closing.
    """

    # the important thing here is that we should get
    # exactly the same results as for test_mini_pipeline_run
    # when we restart pipeline

    configs_dir = os.path.join(os.path.dirname(__file__), 'configs')
    orca.add_injectable("configs_dir", configs_dir)

    output_dir = os.path.join(os.path.dirname(__file__), 'output')
    orca.add_injectable("output_dir", output_dir)

    data_dir = os.path.join(os.path.dirname(__file__), 'data')
    orca.add_injectable("data_dir", data_dir)

    inject_settings(configs_dir, households_sample_size=HOUSEHOLDS_SAMPLE_SIZE)

    orca.clear_cache()

    # should be able to get this BEFORE pipeline is opened
    checkpoints_df = pipeline.get_checkpoints()
    prev_checkpoint_count = len(checkpoints_df.index)
    assert prev_checkpoint_count == 7

    pipeline.start_pipeline('auto_ownership_simulate')

    auto_choice = pipeline.get_table("households").auto_ownership

    # regression test: these are the 2nd-4th households in households table
    hh_ids = [26960, 857296, 93428]
    choices = [0, 1, 0]
    expected_auto_choice = pd.Series(choices,
                                     index=pd.Index(hh_ids, name="HHID"),
                                     name='auto_ownership')

    # print() with a single pre-formatted argument is valid (and produces the
    # same text) under both Python 2 and Python 3
    print("auto_choice\n{}".format(auto_choice.head(4)))
    pdt.assert_series_equal(auto_choice[hh_ids], expected_auto_choice)

    # try to run a model already in pipeline
    with pytest.raises(RuntimeError) as excinfo:
        pipeline.run_model('auto_ownership_simulate')
    assert "run model 'auto_ownership_simulate' more than once" in str(
        excinfo.value)

    # and these new ones
    pipeline.run_model('cdap_simulate')
    pipeline.run_model('mandatory_tour_frequency')

    mtf_choice = pipeline.get_table("persons").mandatory_tour_frequency

    per_ids = [92363, 92681, 93428]

    choices = ['work1', 'school1', 'school2']
    expected_choice = pd.Series(choices,
                                index=pd.Index(per_ids, name='PERID'),
                                name='mandatory_tour_frequency')

    print("mtf_choice\n{}".format(mtf_choice.head(20)))
    pdt.assert_series_equal(mtf_choice[per_ids], expected_choice)

    # should be able to get this before pipeline is closed (from existing open store)
    assert orca.get_injectable('pipeline_store') is not None
    checkpoints_df = pipeline.get_checkpoints()
    assert len(checkpoints_df.index) == prev_checkpoint_count

    pipeline.close()

    # should also be able to get this after pipeline is closed (open and close)
    assert orca.get_injectable('pipeline_store') is None
    checkpoints_df = pipeline.get_checkpoints()
    assert len(checkpoints_df.index) == prev_checkpoint_count

    orca.clear_cache()
Ejemplo n.º 13
0
def write_tables(output_dir):
    """
    Write pipeline tables as csv files (in output directory) as specified by output_tables list
    in settings file.

    'output_tables' can specify either a list of output tables to include or to skip
    if no output_tables list is specified, then no checkpointed tables will be written

    To write all output tables EXCEPT the households and persons tables:

    ::

      output_tables:
        action: skip
        tables:
          - households
          - persons

    To write ONLY the households table:

    ::

      output_tables:
        action: include
        tables:
           - households

    An optional 'prefix' key sets the output file name prefix (default 'final_').

    Parameters
    ----------
    output_dir: str
        not referenced by this function; output paths are resolved by
        config.output_file_path

    Raises
    ------
    RuntimeError
        if 'action' is neither 'include' nor 'skip'

    """

    output_tables_settings_name = 'output_tables'

    output_tables_settings = setting(output_tables_settings_name)

    if output_tables_settings is None:
        logger.info("No output_tables specified in settings file. Nothing to write.")
        return

    action = output_tables_settings.get('action')
    # a missing (or empty) 'tables' entry is treated as an empty list
    tables = output_tables_settings.get('tables') or []
    prefix = output_tables_settings.get('prefix', 'final_')

    if action not in ['include', 'skip']:
        # raising a bare string is itself a TypeError; raise a real exception
        raise RuntimeError(
            "expected %s action '%s' to be either 'include' or 'skip'" %
            (output_tables_settings_name, action))

    checkpointed_tables = pipeline.checkpointed_tables()
    if action == 'include':
        output_tables_list = tables
    else:
        # action == 'skip'
        output_tables_list = [t for t in checkpointed_tables if t not in tables]

    for table_name in output_tables_list:

        if table_name == 'checkpoints':
            df = pipeline.get_checkpoints()
        else:
            if table_name not in checkpointed_tables:
                logger.warning("Skipping '%s': Table not found." % table_name)
                continue
            df = pipeline.get_table(table_name)

        file_name = "%s%s.csv" % (prefix, table_name)
        file_path = config.output_file_path(file_name)

        # include the index if it has a name or is a MultiIndex
        # (pd.MultiIndex is the public name; pd.core.index is a private path)
        write_index = df.index.name is not None or isinstance(df.index, pd.MultiIndex)

        df.to_csv(file_path, index=write_index)
Ejemplo n.º 14
0
def write_tables(output_dir):
    """
    Write pipeline tables as csv files (in output directory) as specified by output_tables list
    in settings file.

    'output_tables' can specify either a list of output tables to include or to skip
    if no output_tables list is specified, then no checkpointed tables will be written

    To write all output tables EXCEPT the households and persons tables:

    ::

      output_tables:
        action: skip
        tables:
          - households
          - persons

    To write ONLY the households table:

    ::

      output_tables:
        action: include
        tables:
           - households

    An optional 'prefix' key sets the output file name prefix (default 'final_').

    The checkpoints table is written (from pipeline.get_checkpoints) when the
    action is 'include' and 'checkpoints' is listed, or when the action is
    'skip' and 'checkpoints' is not listed.

    Parameters
    ----------
    output_dir: str
        directory the csv files are written to

    Raises
    ------
    RuntimeError
        if 'action' is neither 'include' nor 'skip'

    """

    output_tables_settings_name = 'output_tables'

    output_tables_settings = setting(output_tables_settings_name)

    if output_tables_settings is None:
        logger.info(
            "No output_tables specified in settings file. Nothing to write.")
        return

    action = output_tables_settings.get('action')
    # a missing (or empty) 'tables' entry is treated as an empty list
    tables = output_tables_settings.get('tables') or []
    prefix = output_tables_settings.get('prefix', 'final_')

    if action not in ['include', 'skip']:
        # raising a bare string is itself a TypeError; raise a real exception
        raise RuntimeError(
            "expected %s action '%s' to be either 'include' or 'skip'" %
            (output_tables_settings_name, action))

    if action == 'include':
        output_tables_list = tables
    else:
        # action == 'skip'
        output_tables_list = [
            t for t in pipeline.checkpointed_tables() if t not in tables
        ]

    for table_name in output_tables_list:

        if table_name == 'checkpoints':
            # written from pipeline.get_checkpoints() after this loop (to the
            # same file name), so looking it up here would only produce a
            # spurious "Table not found" warning or a file that is overwritten
            continue

        table = inject.get_table(table_name, None)

        if table is None:
            logger.warning("Skipping '%s': Table not found." % table_name)
            continue

        df = table.to_frame()
        file_name = "%s%s.csv" % (prefix, table_name)
        logger.info("writing output file %s" % file_name)
        file_path = os.path.join(output_dir, file_name)
        # include the index only if it has a name
        write_index = df.index.name is not None
        df.to_csv(file_path, index=write_index)

    # True == True (included and listed) or False == False (skip and not listed)
    if (action == 'include') == ('checkpoints' in tables):
        # write checkpoints
        file_name = "%s%s.csv" % (prefix, 'checkpoints')
        pipeline.get_checkpoints().to_csv(os.path.join(output_dir, file_name))