Example No. 1
def write_coefficient_template(model_settings):
    coefficients = simulate.read_model_coefficients(model_settings)

    coefficients = coefficients.transpose()
    coefficients.columns.name = None

    template = coefficients.copy()

    coef_names = []
    coef_values = []

    for c in coefficients.columns:

        values = coefficients[c]
        unique_values = values.unique()

        for uv in unique_values:

            if len(unique_values) == 1:
                uv_coef_name = c + '_all'
            else:
                uv_coef_name = c + '_' + '_'.join(
                    values[values == uv].index.values)

            coef_names.append(uv_coef_name)
            coef_values.append(uv)

            template[c] = template[c].where(values != uv, uv_coef_name)

    refactored_coefficients = pd.DataFrame({
        'coefficient_name': coef_names,
        'value': coef_values
    })
    refactored_coefficients.value = refactored_coefficients.value.astype(
        np.float32)
    print(refactored_coefficients)

    template = template.transpose()
    template.to_csv(
        config.output_file_path('tour_mode_choice_coefficients_template.csv'),
        mode='w',
        index=True,
        header=True)

    refactored_coefficients.to_csv(config.output_file_path(
        'tour_mode_choice_refactored_coefficients.csv'),
                                   mode='w',
                                   index=False,
                                   header=True)
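
The loop above collapses coefficients that share the same value across all segments into a single '<name>_all' entry, and names the rest by joining the labels of the segments that share each value. A minimal standalone sketch of that naming rule, using made-up segment and coefficient names:

import pandas as pd

# coefficients transposed so that segments are rows and coefficient names are columns
coefficients = pd.DataFrame({'walk_asc': [-2.0, -2.0, -1.5]},
                            index=['work', 'school', 'othmaint'])

values = coefficients['walk_asc']
unique_values = values.unique()
for uv in unique_values:
    if len(unique_values) == 1:
        name = 'walk_asc_all'   # same value for every segment
    else:
        name = 'walk_asc_' + '_'.join(values[values == uv].index.values)
    print(name, uv)
# walk_asc_work_school -2.0
# walk_asc_othmaint -1.5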
Example No. 2
def build_network(settings):
    """
    Build a Pandana network from CSV files
    """

    logger.info('building pandana network')
    network_settings_file = settings.get('network_settings_file')
    if not network_settings_file:
        logger.error("Please specify 'network_settings_file' in settings")
        return

    network_settings = config.read_model_settings(network_settings_file)
    logger.debug('using settings %s' % network_settings)

    nodes = pd.read_csv(config.data_file_path(network_settings['nodes']))
    links = pd.read_csv(config.data_file_path(network_settings['links']))

    nodes.index = nodes[network_settings['nodes-id']]

    network = pdna.Network(nodes[network_settings['nodes-x']],
                           nodes[network_settings['nodes-y']],
                           links[network_settings['links-a']],
                           links[network_settings['links-b']],
                           links[[network_settings['links-impedance']]],
                           twoway=network_settings['twoway'])

    network.save_hdf5(config.output_file_path('pandana_network.h5'))

    return network
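
For reference, the keys read from network_settings above imply a settings file shaped roughly like the following (shown here as the equivalent Python dict; only the keys come from the code, the file and column names are illustrative assumptions):

# hypothetical contents of network_settings_file, as read_model_settings would return them
network_settings = {
    'nodes': 'nodes.csv',            # CSV of network nodes (illustrative file name)
    'links': 'links.csv',            # CSV of network links (illustrative file name)
    'nodes-id': 'node_id',           # column holding the node identifier
    'nodes-x': 'x',                  # node x / longitude column
    'nodes-y': 'y',                  # node y / latitude column
    'links-a': 'from_node',          # link start-node column
    'links-b': 'to_node',            # link end-node column
    'links-impedance': 'distance',   # impedance column passed to pdna.Network
    'twoway': True,                  # whether links are bidirectional
}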
Example No. 3
def track_skim_usage(output_dir):
    """
    write statistics on skim usage (diagnostic to detect loading of un-needed skims)

    FIXME - have not yet implemented a facility to avoid loading of unused skims

    FIXME - if resume_after, this will only reflect skims used after resume

    Parameters
    ----------
    output_dir: str

    """
    pd.options.display.max_columns = 500
    pd.options.display.max_rows = 100

    skim_dict = inject.get_injectable('skim_dict')

    mode = 'wb' if sys.version_info < (3, ) else 'w'
    with open(config.output_file_path('skim_usage.txt'), mode) as output_file:

        print("\n### skim_dict usage", file=output_file)
        for key in skim_dict.get_skim_usage():
            print(key, file=output_file)

        unused = set(k for k in skim_dict.skim_info.base_keys) - set(
            k for k in skim_dict.get_skim_usage())

        print("\n### unused skim keys", file=output_file)
        for key in unused:
            print(key, file=output_file)
Example No. 4
def write_summaries(output_dir):

    summary_settings_name = 'output_summaries'
    summary_file_name = 'summaries.txt'

    summary_settings = setting(summary_settings_name)

    if summary_settings is None:
        logger.info(
            "No {summary_settings_name} specified in settings file. Nothing to write."
        )
        return

    summary_dict = summary_settings

    mode = 'wb' if sys.version_info < (3, ) else 'w'
    with open(config.output_file_path(summary_file_name), mode) as output_file:

        for table_name, column_names in summary_dict.items():

            df = pipeline.get_table(table_name)

            for c in column_names:
                n = 100
                empty = (df[c] == '') | df[c].isnull()

                print(
                    f"\n### {table_name}.{c} type: {df.dtypes[c]} rows: {len(df)} ({empty.sum()} empty)\n\n",
                    file=output_file)
                print(df[c].value_counts().nlargest(n), file=output_file)
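
Because the loop unpacks summary_dict.items() as (table_name, column_names), the output_summaries setting is evidently a mapping from pipeline table name to the list of columns to summarize. A hedged illustration with assumed table and column names:

# hypothetical 'output_summaries' setting, shown as a Python dict
output_summaries = {
    'households': ['income_segment', 'auto_ownership'],
    'persons': ['ptype', 'cdap_activity'],
}

for table_name, column_names in output_summaries.items():
    print(table_name, column_names)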
Example No. 5
def previous_write_data_dictionary(output_dir):
    """
    Write table_name, number of rows, columns, and bytes for each checkpointed table

    Parameters
    ----------
    output_dir: str

    """

    model_settings = config.read_model_settings('write_data_dictionary')
    txt_format = model_settings.get('txt_format', 'data_dict.txt')
    csv_format = model_settings.get('csv_format', 'data_dict.csv')

    if txt_format:

        output_file_path = config.output_file_path(txt_format)

        pd.options.display.max_columns = 500
        pd.options.display.max_rows = 100

        output_tables = pipeline.checkpointed_tables()

        # write data dictionary for all checkpointed_tables

        with open(output_file_path, 'w') as output_file:
            for table_name in output_tables:
                df = inject.get_table(table_name, None).to_frame()

                print("\n### %s %s" % (table_name, df.shape), file=output_file)
                print('index:',
                      df.index.name,
                      df.index.dtype,
                      file=output_file)
                print(df.dtypes, file=output_file)
Example No. 6
def test_full_run2():

    configs_dir = os.path.join(os.path.dirname(__file__), 'configs2')
    inject.add_injectable("configs_dir", configs_dir)

    data_dir = os.path.join(os.path.dirname(__file__), 'data2')
    inject.add_injectable("data_dir", data_dir)

    output_dir = os.path.join(os.path.dirname(__file__), 'output')
    inject.add_injectable("output_dir", output_dir)

    inject.clear_cache()

    tracing.config_logger()

    tracing.delete_output_files('csv')
    tracing.delete_output_files('txt')
    tracing.delete_output_files('yaml')

    _MODELS = [
        'input_pre_processor',
        'setup_data_structures',
        'initial_seed_balancing',
        'meta_control_factoring',
        'final_seed_balancing',
        'integerize_final_seed_weights',
        'sub_balancing.geography=DISTRICT',
        'sub_balancing.geography=TRACT',
        'sub_balancing.geography=TAZ',
        'expand_households',
        'summarize',
        'write_tables'
    ]

    pipeline.run(models=_MODELS, resume_after=None)

    assert isinstance(pipeline.get_table('expanded_household_ids'), pd.DataFrame)

    # output tables list action: include
    assert os.path.exists(config.output_file_path('expanded_household_ids.csv'))
    assert os.path.exists(config.output_file_path('summary_DISTRICT.csv'))
    assert not os.path.exists(config.output_file_path('summary_TAZ.csv'))

    # tables will no longer be available after pipeline is closed
    pipeline.close_pipeline()

    inject.clear_cache()
Example No. 7
def regress():

    persons_df = pipeline.get_table('persons')
    persons_df = persons_df[persons_df.household_id == HH_ID]
    print("persons_df\n%s" % persons_df[['value_of_time', 'distance_to_work']])
    """
    persons_df
     person_id  value_of_time  distance_to_work
    person_id
    3249922        23.349532              0.62
    3249923        23.349532              0.62
    """

    tours_df = pipeline.get_table('tours')

    regress_tour_modes(tours_df)

    assert tours_df.shape[0] > 0
    assert not tours_df.tour_mode.isnull().any()

    # optional logsum column was added to all tours except mandatory
    assert 'destination_logsum' in tours_df
    if (tours_df.destination_logsum.isnull() !=
        (tours_df.tour_category == 'mandatory')).any():
        print(tours_df[(tours_df.destination_logsum.isnull() !=
                        (tours_df.tour_category == 'mandatory'))])
    assert (tours_df.destination_logsum.isnull() == (
        tours_df.tour_category == 'mandatory')).all()

    # mode choice logsum calculated for all tours
    assert 'mode_choice_logsum' in tours_df
    assert not tours_df.mode_choice_logsum.isnull().any()

    trips_df = pipeline.get_table('trips')
    assert trips_df.shape[0] > 0
    assert not trips_df.purpose.isnull().any()
    assert not trips_df.depart.isnull().any()
    assert not trips_df.trip_mode.isnull().any()

    # mode_choice_logsum calculated for all trips
    assert not trips_df.mode_choice_logsum.isnull().any()

    # should be at least two trips per tour
    assert trips_df.shape[0] >= 2 * tours_df.shape[0]

    # write_trip_matrices
    trip_matrices_file = config.output_file_path('trips_md.omx')
    assert os.path.exists(trip_matrices_file)
    trip_matrices = omx.open_file(trip_matrices_file)
    assert trip_matrices.shape() == (25, 25)

    assert 'WALK_MD' in trip_matrices.list_matrices()
    walk_trips = np.array(trip_matrices['WALK_MD'])
    assert walk_trips.dtype == np.dtype('float64')

    trip_matrices.close()
Example No. 8
def track_skim_usage(output_dir):
    """
    write statistics on skim usage (diagnostic to detect loading of un-needed skims)

    FIXME - have not yet implemented a facility to avoid loading of unused skims

    Parameters
    ----------
    output_dir: str

    """
    pd.options.display.max_columns = 500
    pd.options.display.max_rows = 100

    checkpoints = pipeline.get_checkpoints()
    tables = OrderedDict()

    skim_dict = inject.get_injectable('skim_dict')
    skim_stack = inject.get_injectable('skim_stack', None)

    mode = 'wb' if sys.version_info < (3, ) else 'w'
    with open(config.output_file_path('skim_usage.txt'), mode) as output_file:

        print("\n### skim_dict usage", file=output_file)
        for key in skim_dict.usage:
            print(key, file=output_file)

        if skim_stack is None:

            unused_keys = {k for k in skim_dict.skim_info['omx_keys']} - \
                          {k for k in skim_dict.usage}

            print("\n### unused skim keys", file=output_file)
            for key in unused_keys:
                print(key, file=output_file)

        else:

            print("\n### skim_stack usage", file=output_file)
            for key in skim_stack.usage:
                print(key, file=output_file)

            unused = {k for k in skim_dict.skim_info['omx_keys'] if not isinstance(k, tuple)} - \
                     {k for k in skim_dict.usage if not isinstance(k, tuple)}
            print("\n### unused skim str keys", file=output_file)
            for key in unused:
                print(key, file=output_file)

            unused = {k[0] for k in skim_dict.skim_info['omx_keys'] if isinstance(k, tuple)} - \
                     {k[0] for k in skim_dict.usage if isinstance(k, tuple)} - \
                     {k for k in skim_stack.usage}
            print("\n### unused skim dim3 keys", file=output_file)
            for key in unused:
                print(key, file=output_file)
Example No. 9
def track_skim_usage(output_dir):
    """
    write statistics on skim usage (diagnostic to detect loading of un-needed skims)

    FIXME - have not yet implemented a facility to avoid loading of unused skims

    Parameters
    ----------
    output_dir: str

    """
    pd.options.display.max_columns = 500
    pd.options.display.max_rows = 100

    checkpoints = pipeline.get_checkpoints()
    tables = OrderedDict()

    skim_dict = inject.get_injectable('skim_dict')
    skim_stack = inject.get_injectable('skim_stack', None)

    mode = 'wb' if sys.version_info < (3,) else 'w'
    with open(config.output_file_path('skim_usage.txt'), mode) as output_file:

        print("\n### skim_dict usage", file=output_file)
        for key in skim_dict.usage:
            print(key, file=output_file)

        if skim_stack is None:

            unused_keys = {k for k in skim_dict.skim_info['omx_keys']} - \
                          {k for k in skim_dict.usage}

            print("\n### unused skim keys", file=output_file)
            for key in unused_keys:
                print(key, file=output_file)

        else:

            print("\n### skim_stack usage", file=output_file)
            for key in skim_stack.usage:
                print(key, file=output_file)

            unused = {k for k in skim_dict.skim_info['omx_keys'] if not isinstance(k, tuple)} - \
                     {k for k in skim_dict.usage if not isinstance(k, tuple)}
            print("\n### unused skim str keys", file=output_file)
            for key in unused:
                print(key, file=output_file)

            unused = {k[0] for k in skim_dict.skim_info['omx_keys'] if isinstance(k, tuple)} - \
                     {k[0] for k in skim_dict.usage if isinstance(k, tuple)} - \
                     {k for k in skim_stack.usage}
            print("\n### unused skim dim3 keys", file=output_file)
            for key in unused:
                print(key, file=output_file)
Example No. 10
    def data_directory(self):

        # shouldn't be asking for this if not estimating
        assert self.estimating
        assert self.settings_name is not None

        parent_dir = config.output_file_path('estimation_data_bundle')

        if self.settings_name != self.model_name:
            parent_dir = os.path.join(parent_dir, self.settings_name)

        return os.path.join(parent_dir, self.model_name)
Example No. 11
def write_matrices(aggregate_trips, zone_index, orig_index, dest_index,
                   model_settings):
    """
    Write aggregated trips to OMX format.

    The MATRICES setting lists the new OMX files to write.
    Each file can contain any number of 'tables', each specified by a
    table key ('name') and a trips table column ('data_field') to use
    for aggregated counts.

    Any data type may be used for columns added in the annotation phase,
    but the table 'data_field's must be summable types: ints, floats, bools.
    """

    matrix_settings = model_settings.get('MATRICES')

    if not matrix_settings:
        logger.error('Missing MATRICES setting in write_trip_matrices.yaml')
        return

    for matrix in matrix_settings:
        filename = matrix.get('file_name')
        filepath = config.output_file_path(filename)
        logger.info('opening %s' % filepath)
        file = omx.open_file(filepath, 'w')  # possibly overwrite existing file
        table_settings = matrix.get('tables')

        for table in table_settings:
            table_name = table.get('name')
            col = table.get('data_field')

            if col not in aggregate_trips:
                logger.error(
                    f'missing {col} column in aggregate_trips DataFrame')
                return

            hh_weight_col = model_settings.get('HH_EXPANSION_WEIGHT_COL')
            if hh_weight_col:
                aggregate_trips[col] = aggregate_trips[col] / aggregate_trips[
                    hh_weight_col]

            data = np.zeros((len(zone_index), len(zone_index)))
            data[orig_index, dest_index] = aggregate_trips[col]
            logger.info('writing %s' % table_name)
            file[table_name] = data  # write to file

        # include the index-to-zone map in the file
        logger.info('adding %s mapping for %s zones to %s' %
                    (zone_index.name, zone_index.size, filename))
        file.create_mapping(zone_index.name, zone_index.to_numpy())

        logger.info('closing %s' % filepath)
        file.close()
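
As the docstring notes, MATRICES lists the OMX files to write, each with a list of tables keyed by 'name' and a 'data_field' column to aggregate. A sketch of what such a setting might look like, expressed as Python data with illustrative table and column names:

# hypothetical MATRICES entry from write_trip_matrices.yaml, shown as Python data
model_settings = {
    'MATRICES': [
        {
            'file_name': 'trips_md.omx',
            'tables': [
                {'name': 'WALK_MD', 'data_field': 'walk_trips'},
                {'name': 'DRIVE_MD', 'data_field': 'drive_trips'},
            ],
        },
    ],
}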
Example No. 12
def get_cached_spec(hhsize):

    spec_name = cached_spec_name(hhsize)

    spec = inject.get_injectable(spec_name, None)
    if spec is not None:
        logger.info("build_cdap_spec returning cached injectable spec %s",
                    spec_name)
        return spec

    # # try configs dir
    # spec_path = config.config_file_path(spec_name, mandatory=False)
    # if spec_path:
    #     logger.info("build_cdap_spec reading cached spec %s from %s", spec_name, spec_path)
    #     return pd.read_csv(spec_path, index_col='Expression')

    # try data dir
    if os.path.exists(config.output_file_path(spec_name)):
        spec_path = config.output_file_path(spec_name)
        logger.info("build_cdap_spec reading cached spec %s from %s",
                    spec_name, spec_path)
        return pd.read_csv(spec_path, index_col='Expression')

    return None
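
The lookup order here is: injectable cache first, then a CSV previously written to the output directory, then None. Stripped of the injectable machinery, the same two-level pattern looks roughly like this (names are illustrative):

import os
import pandas as pd

def get_cached(name, cache, output_dir):
    # 1. in-memory cache (a plain dict standing in for an injectable)
    if name in cache:
        return cache[name]
    # 2. CSV written to the output directory by an earlier run
    path = os.path.join(output_dir, name)
    if os.path.exists(path):
        return pd.read_csv(path, index_col='Expression')
    # 3. nothing cached
    return None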
Example No. 13
def preload_injectables():
    """
    preload bulky injectables up front - stuff that isn't inserted into the pipeline
    """

    logger.info("preload_injectables")

    inject.add_step('track_skim_usage', track_skim_usage)
    inject.add_step('write_data_dictionary', write_data_dictionary)
    inject.add_step('write_tables', write_tables)

    table_list = config.setting('input_table_list')

    # default ActivitySim table names and indices
    if table_list is None:
        logger.warning(
            "No 'input_table_list' found in settings. This will be a "
            "required setting in upcoming versions of ActivitySim.")

        new_settings = inject.get_injectable('settings')
        new_settings['input_table_list'] = DEFAULT_TABLE_LIST
        inject.add_injectable('settings', new_settings)

    # FIXME undocumented feature
    if config.setting('write_raw_tables'):

        # write raw input tables as csv (before annotation)
        csv_dir = config.output_file_path('raw_tables')
        if not os.path.exists(csv_dir):
            os.makedirs(csv_dir)  # make directory if needed

        table_names = [t['tablename'] for t in table_list]
        for t in table_names:
            df = inject.get_table(t).to_frame()
            if t == 'households':
                df.drop(columns='chunk_id', inplace=True)
            df.to_csv(os.path.join(csv_dir, '%s.csv' % t), index=True)

    t0 = tracing.print_elapsed_time()

    # FIXME - still want to do this?
    # if inject.get_injectable('skim_dict', None) is not None:
    #     t0 = tracing.print_elapsed_time("preload skim_dict", t0, debug=True)
    #
    # if inject.get_injectable('skim_stack', None) is not None:
    #     t0 = tracing.print_elapsed_time("preload skim_stack", t0, debug=True)

    return True
Example No. 14
    def output_directory(self, bundle_directory=False):

        # shouldn't be asking for this if not estimating
        assert self.estimating
        assert self.model_name is not None

        dir = os.path.join(config.output_file_path('estimation_data_bundle'),
                           self.bundle_name)

        if bundle_directory:
            # shouldn't be asking - probably confused
            assert self.bundle_name != self.model_name

        if self.bundle_name != self.model_name and not bundle_directory:
            dir = os.path.join(dir, self.model_name)

        return dir
Example No. 15
def get_trace_csv(file_name):

    file_name = config.output_file_path(file_name)
    df = pd.read_csv(file_name)

    #        label    value_1    value_2    value_3    value_4
    # 0    tour_id        38         201         39         40
    # 1       mode  DRIVE_LOC  DRIVE_COM  DRIVE_LOC  DRIVE_LOC
    # 2  person_id    1888694    1888695    1888695    1888696
    # 3  tour_type       work   othmaint       work     school
    # 4   tour_num          1          1          1          1

    # transpose df and rename columns
    labels = df.label.values
    df = df.transpose()[1:]
    df.columns = labels

    return df
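
The transpose turns the trace file's 'label' column into the column names of the result, so each traced value column becomes a row keyed by those labels. Rebuilding the commented layout with a tiny DataFrame shows the effect:

import pandas as pd

df = pd.DataFrame({
    'label': ['tour_id', 'mode', 'person_id'],
    'value_1': [38, 'DRIVE_LOC', 1888694],
    'value_2': [201, 'DRIVE_COM', 1888695],
})

labels = df.label.values
df = df.transpose()[1:]   # drop the 'label' row, keep the value_* rows
df.columns = labels
print(df)
#         tour_id       mode person_id
# value_1      38  DRIVE_LOC   1888694
# value_2     201  DRIVE_COM   1888695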
Example No. 16
def read_network_file(settings):
    """
    Read network from saved HDF5 file
    """

    network_fname = settings.get('saved_network')
    if not network_fname:
        logger.error("Please specify 'saved_network' file in settings")
        return

    network_fpath = config.data_file_path(network_fname, mandatory=False) or \
        config.output_file_path(network_fname)

    if not os.path.exists(network_fpath):
        logger.error('No network file %s found' % network_fname)
        return

    logger.info('Reading network from %s' % network_fpath)
    network = pdna.Network.from_hdf5(network_fpath)

    return network
Example No. 17
def get_osm_network(zone_data, settings):
    """
    Retrieve Pandana network from Open Street Maps
    """

    logger.info('getting osm network')
    zones_df = zone_data.to_frame()

    miles = settings.get('distance_units') == 'miles'
    # distance to degrees: 1 degree of latitude ~= 69 miles ~= 111 km (111000 m), so e.g. 3 mi ~= 0.043 degrees
    conversion = 69 if miles else 111 * 1000
    buffer = settings.get('max_dist') / conversion
    xmin = min(zones_df[settings['zones_lon']]) - buffer
    xmax = max(zones_df[settings['zones_lon']]) + buffer
    ymin = min(zones_df[settings['zones_lat']]) - buffer
    ymax = max(zones_df[settings['zones_lat']]) + buffer
    logger.debug('bounding box: %s, %s, %s, %s' %
                 (str(ymin), str(xmin), str(ymax), str(xmax)))

    # default type=walk, which excludes freeways
    nodes, edges = osm.network_from_bbox(lat_min=ymin,
                                         lng_min=xmin,
                                         lat_max=ymax,
                                         lng_max=xmax,
                                         two_way=True,
                                         network_type='walk')

    if miles:
        logger.info('converting network distance units to miles...')
        edges[['distance']] = edges[['distance']] / 1609.34

    network = pdna.Network(nodes['x'], nodes['y'], edges['from'], edges['to'],
                           edges[['distance']])

    print(edges.head())
    print(edges[['distance']])
    network.save_hdf5(config.output_file_path('pandana_network.h5'))

    return network
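
The buffer arithmetic converts max_dist into approximate degrees, using roughly 69 miles (or 111 km) per degree of latitude. A quick numeric check of that conversion (not project code):

# 3 miles / 69 ~= 0.043 degrees, matching the comment above
print(round(3.0 / 69, 3))               # 0.043
# and in meters: 2000 m / (111 * 1000) ~= 0.018 degrees
print(round(2000.0 / (111 * 1000), 3))  # 0.018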
Example No. 18
def write_data_dictionary(output_dir):
    """
    Write table_name, number of rows, columns, and bytes for each checkpointed table

    Parameters
    ----------
    output_dir: str

    """
    pd.options.display.max_columns = 500
    pd.options.display.max_rows = 100

    output_tables = pipeline.checkpointed_tables()

    # write data dictionary for all checkpointed_tables

    mode = 'wb' if sys.version_info < (3, ) else 'w'
    with open(config.output_file_path('data_dict.txt'), mode) as output_file:
        for table_name in output_tables:
            df = inject.get_table(table_name, None).to_frame()

            print("\n### %s %s" % (table_name, df.shape), file=output_file)
            print('index:', df.index.name, df.index.dtype, file=output_file)
            print(df.dtypes, file=output_file)
Example No. 19
def write_data_dictionary(output_dir):
    """
    Write table_name, number of rows, columns, and bytes for each checkpointed table

    Parameters
    ----------
    output_dir: str

    """
    pd.options.display.max_columns = 500
    pd.options.display.max_rows = 100

    output_tables = pipeline.checkpointed_tables()

    # write data dictionary for all checkpointed_tables

    mode = 'wb' if sys.version_info < (3,) else 'w'
    with open(config.output_file_path('data_dict.txt'), mode) as output_file:
        for table_name in output_tables:
            df = inject.get_table(table_name, None).to_frame()

            print("\n### %s %s" % (table_name, df.shape), file=output_file)
            print('index:', df.index.name, df.index.dtype, file=output_file)
            print(df.dtypes, file=output_file)
Example No. 20
def read_from_table_info(table_info):
    """
    Read input text files and return cleaned up DataFrame.

    table_info is a dictionary that specifies the following input params.

    See input_table_list in settings.yaml in the example folder for a working example

    +--------------+----------------------------------------------------------+
    | key          | description                                              |
    +==============+==========================================================+
    | tablename    | name of pipeline table in which to store dataframe       |
    +--------------+----------------------------------------------------------+
    | filename     | name of csv file to read (in data_dir)                   |
    +--------------+----------------------------------------------------------+
    | column_map   | list of input columns to rename from_name: to_name       |
    +--------------+----------------------------------------------------------+
    | index_col    | name of column to set as dataframe index column          |
    +--------------+----------------------------------------------------------+
    | drop_columns | list of column names of columns to drop                  |
    +--------------+----------------------------------------------------------+
    | h5_tablename | name of target table in HDF5 file                        |
    +--------------+----------------------------------------------------------+

    """
    input_store = config.setting('input_store', None)
    create_input_store = config.setting('create_input_store', default=False)

    tablename = table_info.get('tablename')
    data_filename = table_info.get('filename', input_store)
    h5_tablename = table_info.get('h5_tablename') or tablename
    drop_columns = table_info.get('drop_columns', None)
    column_map = table_info.get('column_map', None)
    keep_columns = table_info.get('keep_columns', None)
    rename_columns = table_info.get('rename_columns', None)
    index_col = table_info.get('index_col', None)

    assert tablename is not None, 'no tablename provided'
    assert data_filename is not None, 'no input file provided'

    data_file_path = config.data_file_path(data_filename)

    df = _read_input_file(data_file_path, h5_tablename=h5_tablename)

    logger.debug('raw %s table columns: %s' % (tablename, df.columns.values))
    logger.debug('raw %s table size: %s' % (tablename, util.df_size(df)))

    if create_input_store:
        h5_filepath = config.output_file_path('input_data.h5')
        logger.info('writing %s to %s' % (h5_tablename, h5_filepath))
        df.to_hdf(h5_filepath, key=h5_tablename, mode='a')

        csv_dir = config.output_file_path('input_data')
        if not os.path.exists(csv_dir):
            os.makedirs(csv_dir)  # make directory if needed
        df.to_csv(os.path.join(csv_dir, '%s.csv' % tablename), index=False)

    if drop_columns:
        logger.debug("dropping columns: %s" % drop_columns)
        df.drop(columns=drop_columns, inplace=True, errors='ignore')

    if column_map:
        warnings.warn(
            "table_inf option 'column_map' renamed 'rename_columns'"
            "Support for 'column_map' will be removed in future versions.",
            FutureWarning)
        logger.debug("renaming columns: %s" % column_map)
        df.rename(columns=column_map, inplace=True)

    # rename columns first, so keep_columns can be a stable list of expected/required columns
    if rename_columns:
        logger.info("renaming columns: %s" % rename_columns)
        df.rename(columns=rename_columns, inplace=True)

    # set index
    if index_col is not None:
        if index_col in df.columns:
            assert not df.duplicated(index_col).any()
            df.set_index(index_col, inplace=True)
        else:
            df.index.names = [index_col]

    logger.info("keeping columns: %s" % keep_columns)
    if keep_columns:
        logger.info("keeping columns: %s" % keep_columns)
        df = df[keep_columns]

    logger.debug('%s table columns: %s' % (tablename, df.columns.values))
    logger.debug('%s table size: %s' % (tablename, util.df_size(df)))
    logger.info('%s index name: %s' % (tablename, df.index.name))

    return df
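
Pulling the documented keys together, a table_info entry consumed by this function might look like the following; the keys are the ones read above, while the table, file, and column names are illustrative assumptions:

# hypothetical entry from input_table_list in settings.yaml, shown as a Python dict
table_info = {
    'tablename': 'households',
    'filename': 'households.csv',
    'index_col': 'household_id',
    'rename_columns': {'HHID': 'household_id'},   # preferred over the deprecated 'column_map'
    'keep_columns': ['income', 'hhsize'],         # applied after renaming and index assignment
    'drop_columns': ['unused_col'],
}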
Example No. 21
def cdap_simulate(persons_merged, persons, households, chunk_size,
                  trace_hh_id):
    """
    CDAP stands for Coordinated Daily Activity Pattern, which is a choice of
    high-level activity pattern for each person, in a coordinated way with other
    members of a person's household.

    Because Python requires vectorization of computation, there are some specialized
    routines in the cdap directory of activitysim for this purpose.  This module
    simply applies those utilities using the simulation framework.
    """

    trace_label = 'cdap'
    model_settings = config.read_model_settings('cdap.yaml')

    cdap_indiv_spec = simulate.read_model_spec(
        file_name=model_settings['INDIV_AND_HHSIZE1_SPEC'])

    # Rules and coefficients for generating interaction specs for different household sizes
    cdap_interaction_coefficients = \
        pd.read_csv(config.config_file_path('cdap_interaction_coefficients.csv'), comment='#')
    """
    spec to compute/specify the relative proportions of each activity (M, N, H)
    that should be used to choose activities for additional household members not handled by CDAP
    This spec is handled much like an activitysim logit utility spec,
    EXCEPT that the values computed are relative proportions, not utilities
    (i.e. values are not exponentiated before being normalized to probabilities summing to 1.0)
    """
    cdap_fixed_relative_proportions = \
        simulate.read_model_spec(file_name=model_settings['FIXED_RELATIVE_PROPORTIONS_SPEC'])

    persons_merged = persons_merged.to_frame()

    constants = config.get_model_constants(model_settings)

    cdap_interaction_coefficients = \
        cdap.preprocess_interaction_coefficients(cdap_interaction_coefficients)

    # specs are built just-in-time on demand and cached as injectables
    # prebuilding here allows us to write them to the output directory
    # (also when multiprocessing locutor might not see all household sizes)
    logger.info("Pre-building cdap specs")
    for hhsize in range(2, cdap.MAX_HHSIZE + 1):
        spec = cdap.build_cdap_spec(cdap_interaction_coefficients,
                                    hhsize,
                                    cache=True)
        if inject.get_injectable('locutor', False):
            spec.to_csv(config.output_file_path('cdap_spec_%s.csv' % hhsize),
                        index=True)

    estimator = estimation.manager.begin_estimation('cdap')
    if estimator:
        estimator.write_model_settings(model_settings, 'cdap.yaml')
        estimator.write_spec(model_settings, tag='INDIV_AND_HHSIZE1_SPEC')
        estimator.write_spec(model_settings=model_settings,
                             tag='FIXED_RELATIVE_PROPORTIONS_SPEC')
        estimator.write_table(cdap_interaction_coefficients,
                              'interaction_coefficients',
                              index=False,
                              append=False)
        estimator.write_choosers(persons_merged)
        for hhsize in range(2, cdap.MAX_HHSIZE + 1):
            spec = cdap.get_cached_spec(hhsize)
            estimator.write_table(spec, 'spec_%s' % hhsize, append=False)

    logger.info("Running cdap_simulate with %d persons",
                len(persons_merged.index))

    choices = cdap.run_cdap(
        persons=persons_merged,
        cdap_indiv_spec=cdap_indiv_spec,
        cdap_interaction_coefficients=cdap_interaction_coefficients,
        cdap_fixed_relative_proportions=cdap_fixed_relative_proportions,
        locals_d=constants,
        chunk_size=chunk_size,
        trace_hh_id=trace_hh_id,
        trace_label=trace_label)

    if estimator:
        estimator.write_choices(choices)
        choices = estimator.get_survey_values(choices, 'persons',
                                              'cdap_activity')
        estimator.write_override_choices(choices)
        estimator.end_estimation()

    # - assign results to persons table and annotate
    persons = persons.to_frame()

    choices = choices.reindex(persons.index)
    persons['cdap_activity'] = choices

    expressions.assign_columns(
        df=persons,
        model_settings=model_settings.get('annotate_persons'),
        trace_label=tracing.extend_trace_label(trace_label,
                                               'annotate_persons'))

    pipeline.replace_table("persons", persons)

    # - annotate households table
    households = households.to_frame()
    expressions.assign_columns(
        df=households,
        model_settings=model_settings.get('annotate_households'),
        trace_label=tracing.extend_trace_label(trace_label,
                                               'annotate_households'))
    pipeline.replace_table("households", households)

    tracing.print_summary('cdap_activity',
                          persons.cdap_activity,
                          value_counts=True)
    logger.info(
        "cdap crosstabs:\n%s" %
        pd.crosstab(persons.ptype, persons.cdap_activity, margins=True))
Example No. 22
def cache_spec(hhsize, spec):
    spec_name = cached_spec_name(hhsize)
    # cache as injectable
    inject.add_injectable(spec_name, spec)
    # cache as csv in output_dir
    spec.to_csv(config.output_file_path(spec_name), index=True)
Example No. 23
def cdap_simulate(persons_merged, persons, households, cdap_indiv_spec,
                  cdap_interaction_coefficients,
                  cdap_fixed_relative_proportions, chunk_size, trace_hh_id):
    """
    CDAP stands for Coordinated Daily Activity Pattern, which is a choice of
    high-level activity pattern for each person, in a coordinated way with other
    members of a person's household.

    Because Python requires vectorization of computation, there are some specialized
    routines in the cdap directory of activitysim for this purpose.  This module
    simply applies those utilities using the simulation framework.
    """

    trace_label = 'cdap'
    model_settings = config.read_model_settings('cdap.yaml')

    persons_merged = persons_merged.to_frame()

    constants = config.get_model_constants(model_settings)

    cdap_interaction_coefficients = \
        cdap.preprocess_interaction_coefficients(cdap_interaction_coefficients)

    # specs are built just-in-time on demand and cached as injectables
    # prebuilding here allows us to write them to the output directory
    # (also when multiprocessing locutor might not see all household sizes)
    logger.info("Pre-building cdap specs")
    for hhsize in range(2, cdap.MAX_HHSIZE + 1):
        spec = cdap.build_cdap_spec(cdap_interaction_coefficients,
                                    hhsize,
                                    cache=True)
        if inject.get_injectable('locutor', False):
            spec.to_csv(config.output_file_path('cdap_spec_%s.csv' % hhsize),
                        index=True)

    logger.info("Running cdap_simulate with %d persons",
                len(persons_merged.index))

    choices = cdap.run_cdap(
        persons=persons_merged,
        cdap_indiv_spec=cdap_indiv_spec,
        cdap_interaction_coefficients=cdap_interaction_coefficients,
        cdap_fixed_relative_proportions=cdap_fixed_relative_proportions,
        locals_d=constants,
        chunk_size=chunk_size,
        trace_hh_id=trace_hh_id,
        trace_label=trace_label)

    # - assign results to persons table and annotate
    persons = persons.to_frame()

    choices = choices.reindex(persons.index)
    persons['cdap_activity'] = choices.cdap_activity
    persons['cdap_rank'] = choices.cdap_rank

    expressions.assign_columns(
        df=persons,
        model_settings=model_settings.get('annotate_persons'),
        trace_label=tracing.extend_trace_label(trace_label,
                                               'annotate_persons'))

    pipeline.replace_table("persons", persons)

    # - annotate households table
    households = households.to_frame()
    expressions.assign_columns(
        df=households,
        model_settings=model_settings.get('annotate_households'),
        trace_label=tracing.extend_trace_label(trace_label,
                                               'annotate_households'))
    pipeline.replace_table("households", households)

    tracing.print_summary('cdap_activity',
                          persons.cdap_activity,
                          value_counts=True)
    logger.info(
        "cdap crosstabs:\n%s" %
        pd.crosstab(persons.ptype, persons.cdap_activity, margins=True))

    if trace_hh_id:

        tracing.trace_df(inject.get_table('persons_merged').to_frame(),
                         label="cdap",
                         columns=['ptype', 'cdap_rank', 'cdap_activity'],
                         warn_if_empty=True)
Example No. 24
def read_from_table_info(table_info):
    """
    Read input text files and return cleaned up DataFrame.

    table_info is a dictionary that specifies the following input params.

    See input_table_list in settings.yaml in the example folder for a working example

    +--------------+----------------------------------------------------------+
    | key          | description                                              |
    +==============+==========================================================+
    | tablename    | name of pipeline table in which to store dataframe       |
    +--------------+----------------------------------------------------------+
    | filename     | name of csv file to read (in data_dir)                   |
    +--------------+----------------------------------------------------------+
    | column_map   | list of input columns to rename from_name: to_name       |
    +--------------+----------------------------------------------------------+
    | index_col    | name of column to set as dataframe index column          |
    +--------------+----------------------------------------------------------+
    | drop_columns | list of column names of columns to drop                  |
    +--------------+----------------------------------------------------------+
    | h5_tablename | name of target table in HDF5 file                        |
    +--------------+----------------------------------------------------------+

    """
    input_store = config.setting('input_store', None)
    create_input_store = config.setting('create_input_store', default=False)

    tablename = table_info.get('tablename')
    data_filename = table_info.get('filename', input_store)
    h5_tablename = table_info.get('h5_tablename') or tablename
    drop_columns = table_info.get('drop_columns', None)
    column_map = table_info.get('column_map', None)
    index_col = table_info.get('index_col', None)

    assert tablename is not None, 'no tablename provided'
    assert data_filename is not None, 'no input file provided'

    data_file_path = config.data_file_path(data_filename)

    df = _read_input_file(data_file_path, h5_tablename=h5_tablename)

    logger.info('%s table columns: %s' % (tablename, df.columns.values))
    logger.info('%s table size: %s' % (tablename, util.df_size(df)))

    if create_input_store:
        h5_filepath = config.output_file_path('input_data.h5')
        logger.info('writing %s to %s' % (h5_tablename, h5_filepath))
        df.to_hdf(h5_filepath, key=h5_tablename, mode='a')

    if drop_columns:
        for c in drop_columns:
            logger.info("dropping column '%s'" % c)
            del df[c]

    if column_map:
        df.rename(columns=column_map, inplace=True)

    # set index
    if index_col is not None:
        if index_col in df.columns:
            assert not df.duplicated(index_col).any()
            df.set_index(index_col, inplace=True)
        else:
            df.index.names = [index_col]

    logger.info('%s index name: %s' % (tablename, df.index.name))

    return df
Example No. 25
def read_from_table_info(table_info):
    """
    Read input text files and return cleaned up DataFrame.

    table_info is a dictionary that specifies the following input params.

    See input_table_list in settings.yaml in the example folder for a working example

    +--------------+----------------------------------------------------------+
    | key          | description                                              |
    +==============+==========================================================+
    | tablename    | name of pipeline table in which to store dataframe       |
    +--------------+----------------------------------------------------------+
    | filename     | name of csv file to read (in data_dir)                   |
    +--------------+----------------------------------------------------------+
    | column_map   | list of input columns to rename from_name: to_name       |
    +--------------+----------------------------------------------------------+
    | index_col    | name of column to set as dataframe index column          |
    +--------------+----------------------------------------------------------+
    | drop_columns | list of column names of columns to drop                  |
    +--------------+----------------------------------------------------------+
    | h5_tablename | name of target table in HDF5 file                        |
    +--------------+----------------------------------------------------------+

    """
    input_store = config.setting('input_store', None)
    create_input_store = config.setting('create_input_store', default=False)

    tablename = table_info.get('tablename')
    data_filename = table_info.get('filename', input_store)
    h5_tablename = table_info.get('h5_tablename') or tablename
    drop_columns = table_info.get('drop_columns', None)
    column_map = table_info.get('column_map', None)
    keep_columns = table_info.get('keep_columns', None)
    rename_columns = table_info.get('rename_columns', None)
    csv_dtypes = table_info.get('dtypes', {})

    # don't require a redundant index_col directive for canonical tables
    # but allow explicit disabling of assignment of index col for canonical tables, in which case, presumably,
    # the canonical index will be assigned in a subsequent initialization step (e.g. initialize_tours)
    canonical_index_col = canonical_table_index_name(tablename)

    # if there is an explicit index_col entry in table_info
    if 'index_col' in table_info:
        # honor explicit index_col unless it conflicts with canonical name

        index_col = table_info['index_col']

        if canonical_index_col:
            if index_col:
                # if there is a non-empty index_col directive, it should be for canonical_table_index_name
                assert index_col == canonical_index_col, \
                    f"{tablename} index_col {table_info.get('index_col')} should be {index_col}"
            else:
                logger.info(f"Not assigning canonical index_col {tablename}.{canonical_index_col} "
                            f"because settings file index_col directive is explicitly None.")

        #  if there is an index_col directive for a canonical table, it should be for canonical_table_index_name

    else:
        # otherwise default is to use canonical index name for known tables, and no index for unknown tables
        index_col = canonical_index_col

    assert tablename is not None, 'no tablename provided'
    assert data_filename is not None, 'no input file provided'

    data_file_path = config.data_file_path(data_filename)

    df = _read_input_file(data_file_path, h5_tablename=h5_tablename, csv_dtypes=csv_dtypes)

    # logger.debug('raw %s table columns: %s' % (tablename, df.columns.values))
    logger.debug('raw %s table size: %s' % (tablename, util.df_size(df)))

    if create_input_store:
        h5_filepath = config.output_file_path('input_data.h5')
        logger.info('writing %s to %s' % (h5_tablename, h5_filepath))
        df.to_hdf(h5_filepath, key=h5_tablename, mode='a')

        csv_dir = config.output_file_path('input_data')
        if not os.path.exists(csv_dir):
            os.makedirs(csv_dir)  # make directory if needed
        df.to_csv(os.path.join(csv_dir, '%s.csv' % tablename), index=False)

    if drop_columns:
        logger.debug("dropping columns: %s" % drop_columns)
        df.drop(columns=drop_columns, inplace=True, errors='ignore')

    if column_map:
        warnings.warn("table_inf option 'column_map' renamed 'rename_columns'"
                      "Support for 'column_map' will be removed in future versions.",
                      FutureWarning)
        logger.debug("renaming columns: %s" % column_map)
        df.rename(columns=column_map, inplace=True)

    # rename columns first, so keep_columns can be a stable list of expected/required columns
    if rename_columns:
        logger.debug("renaming columns: %s" % rename_columns)
        df.rename(columns=rename_columns, inplace=True)

    # set index
    if index_col is not None:
        if index_col in df.columns:
            assert not df.duplicated(index_col).any()
            if canonical_index_col:
                # we expect canonical indexes to be integer-valued
                assert (df[index_col] == df[index_col].astype(int)).all(), \
                    f"Index col '{index_col}' has non-integer values"
                df[index_col] = df[index_col].astype(int)
            df.set_index(index_col, inplace=True)
        else:
            # FIXME not sure we want to do this. More likely they omitted index col than that they want to name it?
            # df.index.names = [index_col]
            logger.error(f"index_col '{index_col}' specified in configs but not in {tablename} table!")
            logger.error(f"{tablename} columns are: {list(df.columns)}")
            raise RuntimeError(f"index_col '{index_col}' not in {tablename} table!")

    if keep_columns:
        logger.debug("keeping columns: %s" % keep_columns)
        if not set(keep_columns).issubset(set(df.columns)):
            logger.error(f"Required columns missing from {tablename} table: "
                         f"{list(set(keep_columns).difference(set(df.columns)))}")
            logger.error(f"{tablename} table has columns: {list(df.columns)}")
            raise RuntimeError(f"Required columns missing from {tablename} table")

        df = df[keep_columns]

    if df.columns.duplicated().any():
        duplicate_column_names = df.columns[df.columns.duplicated(keep=False)].unique().to_list()
        assert not df.columns.duplicated().any(), f"duplicate column names in {tablename}: {duplicate_column_names}"

    logger.debug('%s table columns: %s' % (tablename, df.columns.values))
    logger.debug('%s table size: %s' % (tablename, util.df_size(df)))
    logger.debug('%s index name: %s' % (tablename, df.index.name))

    return df
Example No. 26
    def build_virtual_path(self,
                           recipe,
                           path_type,
                           orig,
                           dest,
                           tod,
                           demographic_segment,
                           want_choices,
                           trace_label,
                           filter_targets=None,
                           trace=False,
                           override_choices=None):

        trace_label = tracing.extend_trace_label(trace_label,
                                                 'build_virtual_path')

        # Tracing is implemented as a separate, second call that operates ONLY on filter_targets
        assert not (trace and filter_targets is None)
        if filter_targets is not None:
            assert filter_targets.any()

            # slice orig and dest
            orig = orig[filter_targets]
            dest = dest[filter_targets]
            assert len(orig) > 0
            assert len(dest) > 0

            # slice tod and demographic_segment if not scalar
            if not isinstance(tod, str):
                tod = tod[filter_targets]
            if demographic_segment is not None:
                demographic_segment = demographic_segment[filter_targets]
                assert len(demographic_segment) > 0

            # slice choices
            # (requires actual choices from the previous call lest rands change on second call)
            assert want_choices == (override_choices is not None)
            if want_choices:
                override_choices = override_choices[filter_targets]

        units = self.units_for_recipe(recipe)
        assert units == 'utility' or not want_choices, "'want_choices' only supported if units is utility"

        access_mode = self.network_los.setting(
            f'TVPB_SETTINGS.{recipe}.path_types.{path_type}.access')
        egress_mode = self.network_los.setting(
            f'TVPB_SETTINGS.{recipe}.path_types.{path_type}.egress')
        path_types_settings = self.network_los.setting(
            f'TVPB_SETTINGS.{recipe}.path_types.{path_type}')
        attributes_as_columns = \
            self.network_los.setting(f'TVPB_SETTINGS.{recipe}.tap_tap_settings.attributes_as_columns', [])

        path_info = {
            'path_type': path_type,
            'access_mode': access_mode,
            'egress_mode': egress_mode
        }

        # maz od pairs requested
        with memo("#TVPB build_virtual_path maz_od_df"):
            maz_od_df = pd.DataFrame({
                'idx': orig.index.values,
                'omaz': orig.values,
                'dmaz': dest.values,
                'seq': range(len(orig))
            })
            chunk.log_df(trace_label, "maz_od_df", maz_od_df)
            self.trace_maz_tap(maz_od_df, access_mode, egress_mode)

        # for location choice, there will be multiple alt dest rows per chooser and duplicate orig.index values
        # but tod and demographic_segment should be the same for all chooser rows (unique orig index values)
        # knowing this allows us to eliminate redundant computations (e.g. utilities of maz_tap pairs)
        duplicated = orig.index.duplicated(keep='first')
        chooser_attributes = pd.DataFrame(index=orig.index[~duplicated])
        if not isinstance(tod, str):
            chooser_attributes['tod'] = tod.loc[~duplicated]
        elif 'tod' in attributes_as_columns:
            chooser_attributes['tod'] = tod
        else:
            path_info['tod'] = tod
        if demographic_segment is not None:
            chooser_attributes[
                'demographic_segment'] = demographic_segment.loc[~duplicated]

        with memo("#TVPB build_virtual_path access_df"):
            access_df = self.compute_maz_tap_utilities(recipe,
                                                       maz_od_df,
                                                       chooser_attributes,
                                                       leg='access',
                                                       mode=access_mode,
                                                       trace_label=trace_label,
                                                       trace=trace)
        chunk.log_df(trace_label, "access_df", access_df)

        with memo("#TVPB build_virtual_path egress_df"):
            egress_df = self.compute_maz_tap_utilities(recipe,
                                                       maz_od_df,
                                                       chooser_attributes,
                                                       leg='egress',
                                                       mode=egress_mode,
                                                       trace_label=trace_label,
                                                       trace=trace)
        chunk.log_df(trace_label, "egress_df", egress_df)

        # path_info for use by expressions (e.g. penalty for drive access if no parking at access tap)
        with memo("#TVPB build_virtual_path compute_tap_tap"):
            transit_df = self.compute_tap_tap(recipe,
                                              maz_od_df,
                                              access_df,
                                              egress_df,
                                              chooser_attributes,
                                              path_info=path_info,
                                              trace_label=trace_label,
                                              trace=trace)
        chunk.log_df(trace_label, "transit_df", transit_df)

        with memo("#TVPB build_virtual_path best_paths"):
            path_df = self.best_paths(recipe, path_type, maz_od_df, access_df,
                                      egress_df, transit_df, trace_label,
                                      trace)
        chunk.log_df(trace_label, "path_df", path_df)

        # now that we have created path_df, we are done with the dataframes for the separate legs
        del access_df
        chunk.log_df(trace_label, "access_df", None)
        del egress_df
        chunk.log_df(trace_label, "egress_df", None)
        del transit_df
        chunk.log_df(trace_label, "transit_df", None)

        if units == 'utility':

            # logsums
            with memo("#TVPB build_virtual_path logsums"):
                # one row per seq with utilities in columns
                # path_num 0-based to align with logit.make_choices 0-based choice indexes
                path_df['path_num'] = path_df.groupby('seq').cumcount()
                chunk.log_df(trace_label, "path_df", path_df)

                utilities_df = path_df[['seq', 'path_num', units]] \
                    .set_index(['seq', 'path_num']).unstack()
                utilities_df.columns = utilities_df.columns.droplevel()  # for legibility

                # add rows missing because no access or egress availability
                utilities_df = pd.concat(
                    [pd.DataFrame(index=maz_od_df.seq), utilities_df], axis=1)
                utilities_df = utilities_df.fillna(
                    UNAVAILABLE
                )  # set utilities for missing paths to UNAVAILABLE

                chunk.log_df(trace_label, "utilities_df", utilities_df)

                with warnings.catch_warnings(record=True) as w:
                    # Cause all warnings to always be triggered.
                    # most likely "divide by zero encountered in log" caused by all transit sets non-viable
                    warnings.simplefilter("always")

                    paths_nest_nesting_coefficient = path_types_settings.get(
                        'paths_nest_nesting_coefficient', 1)
                    exp_utilities = np.exp(utilities_df.values /
                                           paths_nest_nesting_coefficient)
                    logsums = np.maximum(
                        np.log(np.nansum(exp_utilities, axis=1)), UNAVAILABLE)

                    if len(w) > 0:
                        for wrn in w:
                            logger.warning(
                                f"{trace_label} - {type(wrn).__name__} ({wrn.message})"
                            )

                        DUMP = False
                        if DUMP:
                            zero_utilities_df = utilities_df[np.nansum(
                                np.exp(utilities_df.values), axis=1) == 0]
                            zero_utilities_df.to_csv(config.output_file_path(
                                'warning_utilities_df.csv'),
                                                     index=True)
                            bug  # undefined name: deliberately raises NameError to halt here when DUMP is enabled

            if want_choices:

                # orig index to identify appropriate random number channel to use making choices
                utilities_df.index = orig.index

                with memo("#TVPB build_virtual_path make_choices"):

                    probs = logit.utils_to_probs(utilities_df,
                                                 allow_zero_probs=True,
                                                 trace_label=trace_label)
                    chunk.log_df(trace_label, "probs", probs)

                    if trace:
                        choices = override_choices

                        utilities_df['choices'] = choices
                        self.trace_df(utilities_df, trace_label,
                                      'utilities_df')

                        probs['choices'] = choices
                        self.trace_df(probs, trace_label, 'probs')
                    else:

                        choices, rands = logit.make_choices(
                            probs,
                            allow_bad_probs=True,
                            trace_label=trace_label)

                        chunk.log_df(trace_label, "rands", rands)
                        del rands
                        chunk.log_df(trace_label, "rands", None)

                    del probs
                    chunk.log_df(trace_label, "probs", None)

                # get path_set, btap, atap from the path_df row with the same seq and path_num
                # drop the seq join column, but keep the path_num of the chosen path for use
                # with override_choices when tracing
                columns_to_cache = ['btap', 'atap', 'path_set', 'path_num']
                logsum_df = \
                    pd.merge(pd.DataFrame({'seq': range(len(orig)), 'path_num': choices.values}),
                             path_df[['seq'] + columns_to_cache],
                             on=['seq', 'path_num'], how='left')\
                    .drop(columns=['seq'])\
                    .set_index(orig.index)

                logsum_df['logsum'] = logsums

            else:

                assert len(logsums) == len(orig)
                logsum_df = pd.DataFrame({'logsum': logsums}, index=orig.index)

            chunk.log_df(trace_label, "logsum_df", logsum_df)

            del utilities_df
            chunk.log_df(trace_label, "utilities_df", None)

            if trace:
                self.trace_df(logsum_df, trace_label, 'logsum_df')

            chunk.log_df(trace_label, "logsum_df", logsum_df)
            results = logsum_df

        else:
            assert units == 'time'

            # return a series
            results = pd.Series(path_df[units].values, index=path_df['idx'])

            # zero-fill rows for O-D pairs where no best path exists because there was no tap-tap transit availability
            results = reindex(results, maz_od_df.idx).fillna(0.0)

            chunk.log_df(trace_label, "results", results)

        assert len(results) == len(orig)

        del path_df
        chunk.log_df(trace_label, "path_df", None)

        # diagnostic
        # maz_od_df['DIST'] = self.network_los.get_default_skim_dict().get('DIST').get(maz_od_df.omaz, maz_od_df.dmaz)
        # maz_od_df[units] = results.logsum if units == 'utility' else results.values
        # print(f"maz_od_df\n{maz_od_df}")

        return results
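The logsum computed above is a nested aggregation of the per-path utilities: utilities are scaled by the nesting coefficient, exponentiated, summed across paths, and the log of that sum is clamped at the UNAVAILABLE sentinel. The following minimal sketch reproduces just that arithmetic on an invented utilities matrix; the UNAVAILABLE value and nesting coefficient here are illustrative assumptions, not values taken from any model settings.

import numpy as np
import pandas as pd

UNAVAILABLE = -999.0          # assumed sentinel for unavailable alternatives
nesting_coefficient = 0.72    # assumed paths_nest_nesting_coefficient

# rows are O-D pairs, columns are alternative paths; NaN means no such path was built
utilities = pd.DataFrame(
    [[-2.5, -3.1, np.nan],
     [np.nan, np.nan, np.nan],   # no viable path at all for this O-D pair
     [-1.2, np.nan, -4.0]],
    columns=['path_0', 'path_1', 'path_2'])

exp_utilities = np.exp(utilities.values / nesting_coefficient)

# nansum treats missing paths as zero; log(0) is -inf, which the clamp then absorbs
with np.errstate(divide='ignore'):
    logsums = np.maximum(np.log(np.nansum(exp_utilities, axis=1)), UNAVAILABLE)

print(logsums)   # the all-NaN row collapses to UNAVAILABLE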
Esempio n. 27
0
def write_tables(output_dir):
    """
    Write pipeline tables as csv files (in output directory) as specified by output_tables list
    in settings file.

    'output_tables' can specify either a list of output tables to include or to skip
    if no output_tables list is specified, then no checkpointed tables will be written

    To write all output tables EXCEPT the households and persons tables:

    ::

      output_tables:
        action: skip
        tables:
          - households
          - persons

    To write ONLY the households table:

    ::

      output_tables:
        action: include
        tables:
           - households

    To write tables into a single HDF5 store instead of individual CSVs, use the h5_store flag:

    ::

      output_tables:
        h5_store: True
        action: include
        tables:
           - households

    Parameters
    ----------
    output_dir: str

    """

    output_tables_settings_name = 'output_tables'

    output_tables_settings = setting(output_tables_settings_name)

    if output_tables_settings is None:
        logger.info(
            "No output_tables specified in settings file. Nothing to write.")
        return

    action = output_tables_settings.get('action')
    tables = output_tables_settings.get('tables')
    prefix = output_tables_settings.get('prefix', 'final_')
    h5_store = output_tables_settings.get('h5_store', False)
    sort = output_tables_settings.get('sort', False)

    checkpointed_tables = pipeline.checkpointed_tables()
    if action == 'include':
        output_tables_list = tables
    elif action == 'skip':
        output_tables_list = [
            t for t in checkpointed_tables if t not in tables
        ]
    else:
        raise "expected %s action '%s' to be either 'include' or 'skip'" % \
              (output_tables_settings_name, action)

    for table_name in output_tables_list:

        if table_name == 'checkpoints':
            df = pipeline.get_checkpoints()
        else:
            if table_name not in checkpointed_tables:
                logger.warning("Skipping '%s': Table not found." % table_name)
                continue
            df = pipeline.get_table(table_name)

            if sort:
                traceable_table_indexes = inject.get_injectable(
                    'traceable_table_indexes', {})

                if df.index.name in traceable_table_indexes:
                    df = df.sort_index()
                    logger.debug(
                        f"write_tables sorting {table_name} on index {df.index.name}"
                    )
                else:
                    # find all registered columns we can use to sort this table
                    # (they are ordered appropriately in traceable_table_indexes)
                    sort_columns = [
                        c for c in traceable_table_indexes if c in df.columns
                    ]
                    if len(sort_columns) > 0:
                        df = df.sort_values(by=sort_columns)
                        logger.debug(
                            f"write_tables sorting {table_name} on columns {sort_columns}"
                        )
                    else:
                        logger.debug(
                            f"write_tables sorting {table_name} on unrecognized index {df.index.name}"
                        )
                        df = df.sort_index()

        if h5_store:
            file_path = config.output_file_path('%soutput_tables.h5' % prefix)
            df.to_hdf(file_path, key=table_name, mode='a', format='fixed')
        else:
            file_name = "%s%s.csv" % (prefix, table_name)
            file_path = config.output_file_path(file_name)

            # include the index if it has a name or is a MultiIndex
            write_index = df.index.name is not None or isinstance(
                df.index, pd.MultiIndex)

            df.to_csv(file_path, index=write_index)
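The include/skip branching above reduces to a small pure function over the configured table list and the set of checkpointed tables. A standalone sketch of that resolution, with made-up table names:

def resolve_output_tables(action, tables, checkpointed_tables):
    # mirrors the include/skip logic in write_tables above
    if action == 'include':
        return list(tables)
    if action == 'skip':
        return [t for t in checkpointed_tables if t not in tables]
    raise RuntimeError("expected action '%s' to be either 'include' or 'skip'" % action)

checkpointed = ['households', 'persons', 'tours', 'trips']
print(resolve_output_tables('skip', ['households', 'persons'], checkpointed))
# ['tours', 'trips']
print(resolve_output_tables('include', ['households'], checkpointed))
# ['households']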
Esempio n. 28
0
def write_tables(output_dir):
    """
    Write pipeline tables as csv files (in output directory) as specified by output_tables list
    in settings file.

    'output_tables' can specify either a list of output tables to include or to skip
    if no output_tables list is specified, then no checkpointed tables will be written

    To write all output tables EXCEPT the households and persons tables:

    ::

      output_tables:
        action: skip
        tables:
          - households
          - persons

    To write ONLY the households table:

    ::

      output_tables:
        action: include
        tables:
           - households

    To write tables into a single HDF5 store instead of individual CSVs, use the h5_store flag:

    ::

      output_tables:
        h5_store: True
        action: include
        tables:
           - households

    Parameters
    ----------
    output_dir: str

    """

    output_tables_settings_name = 'output_tables'

    output_tables_settings = setting(output_tables_settings_name)

    if output_tables_settings is None:
        logger.info(
            "No output_tables specified in settings file. Nothing to write.")
        return

    action = output_tables_settings.get('action')
    tables = output_tables_settings.get('tables')
    prefix = output_tables_settings.get('prefix', 'final_')
    h5_store = output_tables_settings.get('h5_store', False)

    if action not in ['include', 'skip']:
        raise "expected %s action '%s' to be either 'include' or 'skip'" % \
              (output_tables_settings_name, action)

    checkpointed_tables = pipeline.checkpointed_tables()
    if action == 'include':
        output_tables_list = tables
    elif action == 'skip':
        output_tables_list = [
            t for t in checkpointed_tables if t not in tables
        ]

    for table_name in output_tables_list:

        if table_name == 'checkpoints':
            df = pipeline.get_checkpoints()
        else:
            if table_name not in checkpointed_tables:
                logger.warning("Skipping '%s': Table not found." % table_name)
                continue
            df = pipeline.get_table(table_name)

        if h5_store:
            file_path = config.output_file_path('%soutput_tables.h5' % prefix)
            df.to_hdf(file_path, key=table_name, mode='a', format='fixed')
        else:
            file_name = "%s%s.csv" % (prefix, table_name)
            file_path = config.output_file_path(file_name)

            # include the index if it has a name or is a MultiIndex
            write_index = df.index.name is not None or isinstance(
                df.index, pd.MultiIndex)

            df.to_csv(file_path, index=write_index)
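When h5_store is set, every table lands in a single HDF5 file keyed by table name rather than in separate CSVs. A hedged sketch of reading such a store back; the file name assumes the default 'final_' prefix and an 'output' directory, and pandas needs the pytables dependency for HDF5 access:

import pandas as pd

with pd.HDFStore('output/final_output_tables.h5', mode='r') as store:
    print(store.keys())               # e.g. ['/households', '/trips']
    households = store['households']  # equivalent to pd.read_hdf(..., key='households')
    print(households.shape)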
Esempio n. 29
0
def write_data_dictionary(output_dir):
    """
    Write table schema for all tables

    model settings
        txt_format: output text file name (default data_dict.txt) or empty to suppress txt output
        csv_format: output csv file name (default data_dict.csv) or empty to suppress csv output

        schema_tables: list of tables to include in output (defaults to all checkpointed tables)

    for each table, write column names, dtype, and the checkpoint at which each column was added

    text format writes individual table schemas to a single text file
    csv format writes all tables together with an additional table_name column

    Parameters
    ----------
    output_dir: str

    """

    model_settings = config.read_model_settings('write_data_dictionary')
    txt_format = model_settings.get('txt_format', 'data_dict.txt')
    csv_format = model_settings.get('csv_format', 'data_dict.csv')

    if not (csv_format or txt_format):
        logger.warning(
            f"write_data_dictionary step invoked but neither 'txt_format' nor 'csv_format' specified"
        )
        return

    table_names = pipeline.checkpointed_tables()

    # use table_names list from model_settings, if provided
    schema_tables = model_settings.get('tables', None)
    if schema_tables:
        table_names = [c for c in schema_tables if c in table_names]

    # initialize schema as dict of dataframe[table_name, column_name, dtype, checkpoint]
    schema = dict()
    final_shapes = dict()
    for table_name in table_names:
        df = pipeline.get_table(table_name)

        final_shapes[table_name] = df.shape

        if df.index.name and df.index.name not in df.columns:
            df = df.reset_index()
        info = df.dtypes.astype(str).to_frame('dtype').reset_index().rename(
            columns={'index': 'column_name'})
        info['checkpoint'] = ''

        info.insert(loc=0, column='table_name', value=table_name)
        schema[table_name] = info

    # annotate schema info with the name of the checkpoint at which each column was first seen
    for _, row in pipeline.get_checkpoints().iterrows():

        checkpoint_name = row[pipeline.CHECKPOINT_NAME]

        for table_name in table_names:

            # no change to table in this checkpoint
            if row[table_name] != checkpoint_name:
                continue

            # get the checkpointed version of the table
            df = pipeline.get_table(table_name, checkpoint_name)

            if df.index.name and df.index.name not in df.columns:
                df = df.reset_index()

            info = schema.get(table_name, None)

            # tag any new columns with checkpoint name
            prev_columns = info[info.checkpoint != ''].column_name.values
            new_cols = [c for c in df.columns.values if c not in prev_columns]
            is_new_column_this_checkpoint = info.column_name.isin(new_cols)
            info.checkpoint = np.where(is_new_column_this_checkpoint,
                                       checkpoint_name, info.checkpoint)

            schema[table_name] = info

    schema_df = pd.concat(schema.values())

    if csv_format:
        schema_df.to_csv(config.output_file_path(csv_format),
                         header=True,
                         index=False)

    if txt_format:
        with open(config.output_file_path(txt_format), 'w') as output_file:

            # get max schema column widths from omnibus table
            col_width = {
                c: schema_df[c].str.len().max() + 2
                for c in schema_df
            }

            for table_name in table_names:
                info = schema.get(table_name, None)

                columns_to_print = ['column_name', 'dtype', 'checkpoint']
                info = info[columns_to_print].copy()

                # normalize schema columns widths across all table schemas for unified output formatting
                for c in info:
                    info[c] = info[c].str.pad(col_width[c], side='right')
                info.columns = [c.ljust(col_width[c]) for c in info.columns]

                info = info.to_string(index=False)

                print(
                    f"###\n### {table_name} {final_shapes[table_name]}\n###\n",
                    file=output_file)
                print(f"{info}\n", file=output_file)
Esempio n. 30
0
def write_tables(output_dir):
    """
    Write pipeline tables as csv files (in output directory) as specified by output_tables list
    in settings file.

    'output_tables' can specify either a list of output tables to include or to skip
    if no output_tables list is specified, then no checkpointed tables will be written

    To write all output tables EXCEPT the households and persons tables:

    ::

      output_tables:
        action: skip
        tables:
          - households
          - persons

    To write ONLY the households table:

    ::

      output_tables:
        action: include
        tables:
           - households

    Parameters
    ----------
    output_dir: str

    """

    output_tables_settings_name = 'output_tables'

    output_tables_settings = setting(output_tables_settings_name)

    if output_tables_settings is None:
        logger.info("No output_tables specified in settings file. Nothing to write.")
        return

    action = output_tables_settings.get('action')
    tables = output_tables_settings.get('tables')
    prefix = output_tables_settings.get('prefix', 'final_')

    if action not in ['include', 'skip']:
        raise "expected %s action '%s' to be either 'include' or 'skip'" % \
              (output_tables_settings_name, action)

    checkpointed_tables = pipeline.checkpointed_tables()
    if action == 'include':
        output_tables_list = tables
    elif action == 'skip':
        output_tables_list = [t for t in checkpointed_tables if t not in tables]

    for table_name in output_tables_list:

        if table_name == 'checkpoints':
            df = pipeline.get_checkpoints()
        else:
            if table_name not in checkpointed_tables:
                logger.warning("Skipping '%s': Table not found." % table_name)
                continue
            df = pipeline.get_table(table_name)

        file_name = "%s%s.csv" % (prefix, table_name)
        file_path = config.output_file_path(file_name)

        # include the index if it has a name or is a MultiIndex
        write_index = df.index.name is not None or isinstance(df.index, pd.MultiIndex)

        df.to_csv(file_path, index=write_index)
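The write_index rule above keeps the index in the CSV only when it carries information, i.e. when it is named or a MultiIndex. A tiny illustration with invented frames:

import pandas as pd

unnamed = pd.DataFrame({'a': [1, 2]})
named = pd.DataFrame({'a': [1, 2]}, index=pd.Index([10, 11], name='household_id'))

for df in (unnamed, named):
    write_index = df.index.name is not None or isinstance(df.index, pd.MultiIndex)
    print(df.index.name, write_index)
# None False
# household_id True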
Esempio n. 31
0
def cdap_simulate(persons_merged, persons, households,
                  cdap_indiv_spec,
                  cdap_interaction_coefficients,
                  cdap_fixed_relative_proportions,
                  chunk_size, trace_hh_id):
    """
    CDAP stands for Coordinated Daily Activity Pattern, which is a choice of
    high-level activity pattern for each person, in a coordinated way with other
    members of a person's household.

    Because the computation must be vectorized to perform well in Python, there are
    specialized routines in the cdap directory of activitysim for this purpose.  This
    module simply applies those utilities using the simulation framework.
    """

    trace_label = 'cdap'
    model_settings = config.read_model_settings('cdap.yaml')

    persons_merged = persons_merged.to_frame()

    constants = config.get_model_constants(model_settings)

    cdap_interaction_coefficients = \
        cdap.preprocess_interaction_coefficients(cdap_interaction_coefficients)

    # specs are built just-in-time on demand and cached as injectables
    # prebuilding them here allows us to write them to the output directory
    # (also, when multiprocessing, the locutor process might not see all household sizes)
    logger.info("Pre-building cdap specs")
    for hhsize in range(2, cdap.MAX_HHSIZE + 1):
        spec = cdap.build_cdap_spec(cdap_interaction_coefficients, hhsize, cache=True)
        if inject.get_injectable('locutor', False):
            spec.to_csv(config.output_file_path('cdap_spec_%s.csv' % hhsize), index=True)

    logger.info("Running cdap_simulate with %d persons", len(persons_merged.index))

    choices = cdap.run_cdap(
        persons=persons_merged,
        cdap_indiv_spec=cdap_indiv_spec,
        cdap_interaction_coefficients=cdap_interaction_coefficients,
        cdap_fixed_relative_proportions=cdap_fixed_relative_proportions,
        locals_d=constants,
        chunk_size=chunk_size,
        trace_hh_id=trace_hh_id,
        trace_label=trace_label)

    # - assign results to persons table and annotate
    persons = persons.to_frame()

    choices = choices.reindex(persons.index)
    persons['cdap_activity'] = choices.cdap_activity
    persons['cdap_rank'] = choices.cdap_rank

    expressions.assign_columns(
        df=persons,
        model_settings=model_settings.get('annotate_persons'),
        trace_label=tracing.extend_trace_label(trace_label, 'annotate_persons'))

    pipeline.replace_table("persons", persons)

    # - annotate households table
    households = households.to_frame()
    expressions.assign_columns(
        df=households,
        model_settings=model_settings.get('annotate_households'),
        trace_label=tracing.extend_trace_label(trace_label, 'annotate_households'))
    pipeline.replace_table("households", households)

    tracing.print_summary('cdap_activity', persons.cdap_activity, value_counts=True)
    logger.info("cdap crosstabs:\n%s" %
                pd.crosstab(persons.ptype, persons.cdap_activity, margins=True))

    if trace_hh_id:

        tracing.trace_df(inject.get_table('persons_merged').to_frame(),
                         label="cdap",
                         columns=['ptype', 'cdap_rank', 'cdap_activity'],
                         warn_if_empty=True)
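The reindex-and-assign step above is what carries the simulated choices back onto the full persons table; any person missing from the choices frame simply receives NaN. A tiny standalone version with invented data:

import pandas as pd

persons = pd.DataFrame({'ptype': [1, 2, 4]},
                       index=pd.Index([10, 11, 12], name='person_id'))
choices = pd.DataFrame({'cdap_activity': ['M', 'N'], 'cdap_rank': [1, 2]},
                       index=pd.Index([10, 12], name='person_id'))

choices = choices.reindex(persons.index)   # person 11 gets NaN
persons['cdap_activity'] = choices.cdap_activity
persons['cdap_rank'] = choices.cdap_rank

print(persons.cdap_activity.value_counts(dropna=False))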
Esempio n. 32
0
def cached_spec_path(spec_name):
    return config.output_file_path(spec_name)