Example #1
def build_network(settings):
    """
    Build a Pandana network from CSV files
    """

    logger.info('building pandana network')
    network_settings_file = settings['network_settings_file']
    if not network_settings_file:
        logger.error("Please specify 'network_settings_file' in settings")
        return

    network_settings = config.read_model_settings(network_settings_file)
    logger.debug('using settings %s' % network_settings)

    nodes = pd.read_csv(config.data_file_path(network_settings['nodes']))
    links = pd.read_csv(config.data_file_path(network_settings['links']))

    nodes.index = nodes[network_settings['nodes-id']]

    network = pdna.Network(nodes[network_settings['nodes-x']],
                           nodes[network_settings['nodes-y']],
                           links[network_settings['links-a']],
                           links[network_settings['links-b']],
                           links[[network_settings['links-impedance']]],
                           twoway=network_settings['twoway'])

    network.save_hdf5(config.output_file_path('pandana_network.h5'))

    return network
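The constructor arguments above are all looked up by name in the network settings file; as a rough orientation, the mapping returned by config.read_model_settings() would need a shape along these lines (key names taken from the code above, file and column names invented):

# Hypothetical network_settings mapping for build_network() above; only the
# keys the function reads are shown, and the values are made up.
network_settings = {
    'nodes': 'nodes.csv',           # node table in data_dir
    'links': 'links.csv',           # link table in data_dir
    'nodes-id': 'node_id',          # column used as the node index
    'nodes-x': 'x',                 # node x coordinate column
    'nodes-y': 'y',                 # node y coordinate column
    'links-a': 'from_node',         # link from-node column
    'links-b': 'to_node',           # link to-node column
    'links-impedance': 'distance',  # impedance column (passed as a one-column DataFrame)
    'twoway': True,                 # whether links can be traversed in both directions
}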
Example #2
    def read_saved_shadow_prices(self, model_settings):
        """
        Read saved shadow_prices from csv file in data_dir (so-called warm start)
        returns None if no saved shadow price file name specified or named file not found

        Parameters
        ----------
        model_settings : dict

        Returns
        -------
        shadow_prices : pandas.DataFrame or None
        """

        shadow_prices = None

        # - load saved shadow_prices
        saved_shadow_price_file_name = model_settings.get('SAVED_SHADOW_PRICE_TABLE_NAME')
        if saved_shadow_price_file_name:
            # FIXME - where should we look for this file?
            file_path = config.data_file_path(saved_shadow_price_file_name, mandatory=False)
            if file_path:
                shadow_prices = pd.read_csv(file_path, index_col=0)
                self.saved_shadow_price_file_path = file_path  # informational
                logging.info("loaded saved_shadow_prices from %s" % file_path)
            else:
                logging.warning("Could not find saved_shadow_prices file %s" % file_path)

        return shadow_prices
Example #3
    def read_saved_shadow_prices(self, model_settings):
        """
        Read saved shadow_prices from csv file in data_dir (so-called warm start)
        returns None if no saved shadow price file name specified or named file not found

        Parameters
        ----------
        model_settings : dict

        Returns
        -------
        shadow_prices : pandas.DataFrame or None
        """

        shadow_prices = None

        # - load saved shadow_prices
        saved_shadow_price_file_name = model_settings.get('SAVED_SHADOW_PRICE_TABLE_NAME')
        if saved_shadow_price_file_name:
            # FIXME - where should we look for this file?
            file_path = config.data_file_path(saved_shadow_price_file_name, mandatory=False)
            if file_path:
                shadow_prices = pd.read_csv(file_path, index_col=0)
                self.saved_shadow_price_file_path = file_path  # informational
                logger.info("loaded saved_shadow_prices from %s" % file_path)
            else:
                logger.warning("Could not find saved_shadow_prices file %s" % file_path)

        return shadow_prices
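Both variants above consult a single setting; a minimal sketch of the relevant model_settings fragment (the file name is invented, and the file is expected to live in data_dir with its first column serving as the index):

# Hypothetical warm-start fragment of model_settings for read_saved_shadow_prices();
# the table name is made up.
model_settings = {'SAVED_SHADOW_PRICE_TABLE_NAME': 'shadow_prices.csv'}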
Example #4
def override_hh_ids(settings):

    hh_ids_filename = settings.get('hh_ids', None)
    if hh_ids_filename is None:
        return None

    file_path = config.data_file_path(hh_ids_filename, mandatory=False)
    if not file_path:
        logger.error("hh_ids file name '%s' specified in settings not found" %
                     hh_ids_filename)
        return None

    df = pd.read_csv(file_path, comment='#')

    if 'household_id' not in df.columns:
        logger.error("No 'household_id' column in hh_ids file %s" %
                     hh_ids_filename)
        return None

    household_ids = df.household_id.astype(int).unique()

    if len(household_ids) == 0:
        logger.error("No households in hh_ids file %s" % hh_ids_filename)
        return None

    logger.info("Using hh_ids list with %s households from file %s" %
                (len(household_ids), hh_ids_filename))

    return household_ids
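The override file itself only needs a household_id column (and may contain '#' comments, which pd.read_csv skips); a small sketch of how such a file could be produced, mirroring what Example #11 below does with the pipeline households table (the ids here are invented):

# Write a hypothetical override_hh_ids.csv that override_hh_ids() would accept.
import pandas as pd

pd.DataFrame({'household_id': [1690841, 1690861, 982875]}).to_csv(
    'override_hh_ids.csv', index=False, header=True)

# settings would then reference it via the 'hh_ids' key, e.g. {'hh_ids': 'override_hh_ids.csv'}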
Example #5
def read_pois_table(buffer_zones_settings, network, constants):
    poi_fname = config.data_file_path(buffer_zones_settings['pois'])
    poi_df = pd.read_csv(poi_fname, index_col=False)
    poi_df['net_node_id'] = network.get_node_ids(
        poi_df[constants['pois-x']].values, poi_df[constants['pois-y']].values)

    return poi_df
Example #6
def skim_dict(data_dir, settings):

    omx_file_path = config.data_file_path(settings["skims_file"])
    tags_to_load = settings['skim_time_periods']['labels']

    logger.info("loading skim_dict from %s" % (omx_file_path, ))

    # select the skims to load
    skim_info = get_skim_info(omx_file_path, tags_to_load)

    logger.debug("omx_shape %s skim_dtype %s" % (skim_info['omx_shape'], skim_info['dtype']))

    skim_buffers = inject.get_injectable('data_buffers', None)
    if skim_buffers:
        logger.info('Using existing skim_buffers for skims')
    else:
        skim_buffers = buffers_for_skims(skim_info, shared=False)
        load_skims(omx_file_path, skim_info, skim_buffers)

    skim_data = skim_data_from_buffers(skim_buffers, skim_info)

    block_names = list(skim_info['blocks'].keys())
    for i in range(len(skim_data)):
        block_name = block_names[i]
        block_data = skim_data[i]
        logger.info("block_name %s bytes %s (%s)" %
                    (block_name, block_data.nbytes, util.GB(block_data.nbytes)))

    # create skim dict
    skim_dict = skim.SkimDict(skim_data, skim_info)
    skim_dict.offset_mapper.set_offset_int(-1)

    return skim_dict
Example #7
    def initialize_settings(self):

        assert not self.settings_initialized
        settings = config.read_model_settings(ESTIMATION_SETTINGS_FILE_NAME)
        self.enabled = settings.get('enable', True)
        self.bundles = settings.get('bundles', [])
        self.model_estimation_table_types = settings.get(
            'model_estimation_table_types', {})
        self.estimation_table_recipes = settings.get(
            'estimation_table_recipes', {})

        if self.enabled:
            self.survey_tables = settings.get('survey_tables', {})
            for table_name, table_info in self.survey_tables.items():
                assert 'file_name' in table_info, \
                    "No file name specified for survey_table '%s' in %s" % (table_name, ESTIMATION_SETTINGS_FILE_NAME)
                file_path = config.data_file_path(table_info['file_name'],
                                                  mandatory=True)
                assert os.path.exists(file_path), \
                    "File for survey table '%s' not found: %s" % (table_name, file_path)
                df = pd.read_csv(file_path)
                index_col = table_info.get('index_col')
                if index_col is not None:
                    assert index_col in df.columns, \
                        "Index col '%s' not in survey_table '%s' in file: %s % (index_col, table_name, file_path)"
                    df.set_index(index_col, inplace=True)

                # add the table df to survey_tables
                table_info['df'] = df

        self.settings_initialized = True
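For orientation, the estimation settings file read at the top of initialize_settings() would resolve to roughly the following mapping; this is a hedged sketch in which the key names come from the code above and the file and table names are invented:

# Hypothetical result of config.read_model_settings(ESTIMATION_SETTINGS_FILE_NAME);
# only keys referenced in initialize_settings() are shown, values are made up.
settings = {
    'enable': True,
    'bundles': ['auto_ownership'],
    'model_estimation_table_types': {},
    'estimation_table_recipes': {},
    'survey_tables': {
        'households': {
            'file_name': 'survey_households.csv',  # must exist in data_dir
            'index_col': 'household_id',           # optional; must be a column of the csv
        },
    },
}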
Example #8
def override_hh_ids(settings):

    hh_ids_filename = settings.get('hh_ids', None)
    if hh_ids_filename is None:
        return None

    file_path = config.data_file_path(hh_ids_filename, mandatory=False)
    if not file_path:
        logger.error("hh_ids file name '%s' specified in settings not found" % hh_ids_filename)
        return None

    df = pd.read_csv(file_path, comment='#')

    if 'household_id' not in df.columns:
        logger.error("No 'household_id' column in hh_ids file %s" % hh_ids_filename)
        return None

    household_ids = df.household_id.astype(int).unique()

    if len(household_ids) == 0:
        logger.error("No households in hh_ids file %s" % hh_ids_filename)
        return None

    logger.info("Using hh_ids list with %s households from file %s" %
                (len(household_ids), hh_ids_filename))

    return household_ids
Example #9
def skim_dict(data_dir, settings):

    omx_file_path = config.data_file_path(settings["skims_file"])
    tags_to_load = settings['skim_time_periods']['labels']

    logger.info("loading skim_dict from %s" % (omx_file_path, ))

    # select the skims to load
    skim_info = get_skim_info(omx_file_path, tags_to_load)

    logger.debug("omx_shape %s skim_dtype %s" %
                 (skim_info['omx_shape'], skim_info['dtype']))

    skim_buffers = inject.get_injectable('data_buffers', None)
    if skim_buffers:
        logger.info('Using existing skim_buffers for skims')
    else:
        skim_buffers = buffers_for_skims(skim_info, shared=False)
        load_skims(omx_file_path, skim_info, skim_buffers)

    skim_data = skim_data_from_buffers(skim_buffers, skim_info)

    block_names = list(skim_info['blocks'].keys())
    for i in range(len(skim_data)):
        block_name = block_names[i]
        block_data = skim_data[i]
        logger.info(
            "block_name %s bytes %s (%s)" %
            (block_name, block_data.nbytes, util.GB(block_data.nbytes)))

    # create skim dict
    skim_dict = skim.SkimDict(skim_data, skim_info)
    skim_dict.offset_mapper.set_offset_int(-1)

    return skim_dict
Example #10
def tap_skim_dict(data_dir, settings):

    logger.info("loading tap_skim_dict")

    cache_skim_key_values = settings['skim_time_periods']['labels']
    skim_dict = askim.SkimDict()

    for skims_file in settings["tap_skims_files"]:
        skims_file_path = config.data_file_path(skims_file)
        with omx.open_file(skims_file_path) as omx_file:
            add_to_skim_dict(skim_dict, omx_file, cache_skim_key_values)

    return skim_dict
Example #11
def test_mini_pipeline_run2():

    # the important thing here is that we should get
    # exactly the same results as for test_mini_pipeline_run
    # when we restart pipeline

    setup_dirs()

    inject_settings(households_sample_size=HOUSEHOLDS_SAMPLE_SIZE,
                    read_skim_cache=True)

    # should be able to get this BEFORE pipeline is opened
    checkpoints_df = pipeline.get_checkpoints()
    prev_checkpoint_count = len(checkpoints_df.index)

    # print "checkpoints_df\n%s" % checkpoints_df[['checkpoint_name']]
    assert prev_checkpoint_count == 9

    pipeline.open_pipeline('auto_ownership_simulate')

    regress_mini_auto()

    # try to run a model already in pipeline
    with pytest.raises(RuntimeError) as excinfo:
        pipeline.run_model('auto_ownership_simulate')
    assert "run model 'auto_ownership_simulate' more than once" in str(
        excinfo.value)

    # and these new ones
    pipeline.run_model('cdap_simulate')
    pipeline.run_model('mandatory_tour_frequency')

    regress_mini_mtf()

    # should be able to get this before pipeline is closed (from existing open store)
    checkpoints_df = pipeline.get_checkpoints()
    assert len(checkpoints_df.index) == prev_checkpoint_count

    # - write list of override_hh_ids to override_hh_ids.csv in data for use in next test
    num_hh_ids = 10
    hh_ids = pipeline.get_table("households").head(num_hh_ids).index.values
    hh_ids = pd.DataFrame({'household_id': hh_ids})

    hh_ids_path = config.data_file_path('override_hh_ids.csv')
    hh_ids.to_csv(hh_ids_path, index=False, header=True)

    pipeline.close_pipeline()
    inject.clear_cache()
    close_handlers()
Example #12
def get_trips_df(model_settings):
    """Default to pipeline trips table unless
    user provides a CSV
    """
    filename = model_settings.get('input_table', None)

    if not filename:
        logger.info("using 'trips' pipeline table for balancing step")
        trips_df = pipeline.get_table('trips')
        return trips_df.reset_index()

    logger.info('using %s for balancing step' % filename)
    fpath = config.data_file_path(filename, mandatory=True)

    return pd.read_csv(fpath, header=0, comment='#')
Example #13
def read_zone_indexed_csv_file(file_name):
    logger.info('reading file \'%s\'' % file_name)

    fpath = config.data_file_path(file_name, mandatory=True)
    zone_df = pd.read_csv(fpath, header=0, comment='#')

    if ZONE_LABEL in zone_df.columns:
        zone_index = ZONE_LABEL  # str
    else:
        # use row numbers for zone ids. convert to 1-based zone ids simply by adding 1
        zone_index = zone_df.index + 1  # Index

    zone_df.set_index(zone_index, drop=True, inplace=True)
    zone_df.index.name = ZONE_LABEL

    return zone_df
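The fallback branch above promotes row numbers to 1-based zone ids; a tiny standalone illustration (assuming ZONE_LABEL == 'zone_id', which is defined elsewhere in the module, and using made-up data):

# Minimal sketch of the row-number fallback in read_zone_indexed_csv_file().
import pandas as pd

ZONE_LABEL = 'zone_id'  # assumption: actual constant defined elsewhere
zone_df = pd.DataFrame({'households': [120, 85, 230]})  # no zone_id column in the file

zone_index = zone_df.index + 1  # 1-based zone ids
zone_df.set_index(zone_index, drop=True, inplace=True)
zone_df.index.name = ZONE_LABEL

print(zone_df.index.tolist())  # [1, 2, 3]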
Example #14
def read_input_table(table_name):

    filename = setting('input_store', None)

    if not filename:
        logger.error("input store file name not specified in settings")
        raise RuntimeError("store file name not specified in settings")

    input_store_path = config.data_file_path(filename)

    if not os.path.exists(input_store_path):
        logger.error("store file not found: %s" % input_store_path)
        raise RuntimeError("store file not found: %s" % input_store_path)

    df = pd.read_hdf(input_store_path, table_name)

    return df
Example #15
    def load_skim_info(self):
        """
        read skim info from omx files into SkimInfo, and store in self.skims_info dict keyed by skim_tag

        ONE_ZONE and TWO_ZONE systems have only TAZ skims
        THREE_ZONE systems have both TAZ and TAP skims
        """
        assert self.skim_dict_factory is not None
        # load taz skim_info
        self.skims_info['taz'] = self.skim_dict_factory.load_skim_info('taz')

        if self.zone_system == THREE_ZONE:
            # load tap skim_info
            self.skims_info['tap'] = self.skim_dict_factory.load_skim_info('tap')

        if self.zone_system == THREE_ZONE:
            # load this here rather than in load_data as it is required during multiprocessing to size TVPBCache
            self.tap_df = pd.read_csv(config.data_file_path(self.setting('tap'), mandatory=True))
            self.tvpb = pathbuilder.TransitVirtualPathBuilder(self)  # dependent on self.tap_df
Example #16
def test_mini_pipeline_run3():

    # test that hh_ids setting overrides household sampling

    setup_dirs()
    inject_settings(hh_ids='override_hh_ids.csv')

    households = inject.get_table('households').to_frame()

    override_hh_ids = pd.read_csv(config.data_file_path('override_hh_ids.csv'))

    print("\noverride_hh_ids\n%s" % override_hh_ids)

    print("\nhouseholds\n%s" % households.index)

    assert households.shape[0] == override_hh_ids.shape[0]
    assert households.index.isin(override_hh_ids.household_id).all()

    inject.clear_cache()
    close_handlers()
Example #17
    def _read_skims_from_omx(self, skim_info, skim_data):
        """
        read skims from omx file into skim_data
        """

        skim_tag = skim_info.skim_tag
        omx_keys = skim_info.omx_keys
        omx_manifest = skim_info.omx_manifest  # dict mapping { omx_key: omx_file_name }

        for omx_file_name in skim_info.omx_file_names:

            omx_file_path = config.data_file_path(omx_file_name)
            num_skims_loaded = 0

            logger.info(f"_read_skims_from_omx {omx_file_path}")

            # read skims into skim_data
            with omx.open_file(omx_file_path) as omx_file:
                for skim_key, omx_key in omx_keys.items():

                    if omx_manifest[omx_key] == omx_file_name:

                        offset = skim_info.block_offsets[skim_key]
                        logger.debug(
                            f"_read_skims_from_omx file {omx_file_name} omx_key {omx_key} "
                            f"skim_key {skim_key} to offset {offset}")

                        if skim_dictionary.ROW_MAJOR_LAYOUT:
                            a = skim_data[offset, :, :]
                        else:
                            a = skim_data[:, :, offset]

                        # this will trigger omx readslice to read and copy data to skim_data's buffer
                        omx_data = omx_file[omx_key]
                        a[:] = omx_data[:]

                        num_skims_loaded += 1

            logger.info(
                f"_read_skims_from_omx loaded {num_skims_loaded} skims from {omx_file_name}"
            )
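The ROW_MAJOR_LAYOUT branch above only changes which axis indexes the individual skim; a small sketch with an invented shape, showing that both slices address the same matrix:

# Standalone sketch of the two skim_data layouts sliced in _read_skims_from_omx();
# shape values are invented.
import numpy as np

num_skims, num_zones = 3, 5
row_major_data = np.zeros((num_skims, num_zones, num_zones))  # ROW_MAJOR_LAYOUT
col_major_data = np.zeros((num_zones, num_zones, num_skims))  # alternative layout

offset = 1
a = row_major_data[offset, :, :]  # contiguous 2-D view of one skim
b = col_major_data[:, :, offset]  # strided 2-D view of the same skim
assert a.shape == b.shape == (num_zones, num_zones)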
Example #18
def test_mini_pipeline_run3():

    # test that hh_ids setting overrides household sampling

    configs_dir = os.path.join(os.path.dirname(__file__), 'configs')
    setup_dirs(configs_dir)
    inject_settings(configs_dir, hh_ids='override_hh_ids.csv')

    households = inject.get_table('households').to_frame()

    override_hh_ids = pd.read_csv(config.data_file_path('override_hh_ids.csv'))

    print("\noverride_hh_ids\n", override_hh_ids)

    print("\nhouseholds\n", households.index)

    assert households.shape[0] == override_hh_ids.shape[0]
    assert households.index.isin(override_hh_ids.household_id).all()

    inject.clear_cache()
    close_handlers()
Example #19
def read_network_file(settings):
    """
    Read network from saved HDF5 file
    """

    network_fname = settings['saved_network']
    if not network_fname:
        logger.error("Please specify 'saved_network' file in settings")
        return

    network_fpath = config.data_file_path(network_fname, mandatory=False) or \
        config.output_file_path(network_fname)

    if not os.path.exists(network_fpath):
        logger.error('No network file %s found' % network_fname)
        return

    logger.info('Reading network from %s' % network_fpath)
    network = pdna.Network.from_hdf5(network_fpath)

    return network
Example #20
def read_from_table_info(table_info):
    """
    Read input text files and return cleaned up DataFrame.

    table_info is a dictionary that specifies the following input params.

    See input_table_list in settings.yaml in the example folder for a working example

    +--------------+----------------------------------------------------------+
    | key          | description                                              |
    +==============+==========================================================+
    | tablename    | name of pipeline table in which to store dataframe       |
    +--------------+----------------------------------------------------------+
    | filename     | name of csv file to read (in data_dir)                   |
    +--------------+----------------------------------------------------------+
    | column_map   | list of input columns to rename from_name: to_name       |
    +--------------+----------------------------------------------------------+
    | index_col    | name of column to set as dataframe index column          |
    +--------------+----------------------------------------------------------+
    | drop_columns | list of column names of columns to drop                  |
    +--------------+----------------------------------------------------------+
    | h5_tablename | name of target table in HDF5 file                        |
    +--------------+----------------------------------------------------------+

    """
    input_store = config.setting('input_store', None)
    create_input_store = config.setting('create_input_store', default=False)

    tablename = table_info.get('tablename')
    data_filename = table_info.get('filename', input_store)
    h5_tablename = table_info.get('h5_tablename') or tablename
    drop_columns = table_info.get('drop_columns', None)
    column_map = table_info.get('column_map', None)
    keep_columns = table_info.get('keep_columns', None)
    rename_columns = table_info.get('rename_columns', None)
    csv_dtypes = table_info.get('dtypes', {})

    # don't require a redundant index_col directive for canonical tables
    # but allow explicit disabling of assignment of index col for canonical tables, in which case, presumably,
    # the canonical index will be assigned in a subsequent initialization step (e.g. initialize_tours)
    canonical_index_col = canonical_table_index_name(tablename)

    # if there is an explicit index_col entry in table_info
    if 'index_col' in table_info:
        # honor explicit index_col unless it conflicts with canonical name

        index_col = table_info['index_col']

        if canonical_index_col:
            if index_col:
                # if there is a non-empty index_col directive, it should be for canonical_table_index_name
                assert index_col == canonical_index_col, \
                    f"{tablename} index_col {table_info.get('index_col')} should be {index_col}"
            else:
                logger.info(f"Not assigning canonical index_col {tablename}.{canonical_index_col} "
                            f"because settings file index_col directive is explicitly None.")

        #  if there is an index_col directive for a canonical table, it should be for canonical_table_index_name

    else:
        # otherwise default is to use canonical index name for known tables, and no index for unknown tables
        index_col = canonical_index_col

    assert tablename is not None, 'no tablename provided'
    assert data_filename is not None, 'no input file provided'

    data_file_path = config.data_file_path(data_filename)

    df = _read_input_file(data_file_path, h5_tablename=h5_tablename, csv_dtypes=csv_dtypes)

    # logger.debug('raw %s table columns: %s' % (tablename, df.columns.values))
    logger.debug('raw %s table size: %s' % (tablename, util.df_size(df)))

    if create_input_store:
        h5_filepath = config.output_file_path('input_data.h5')
        logger.info('writing %s to %s' % (h5_tablename, h5_filepath))
        df.to_hdf(h5_filepath, key=h5_tablename, mode='a')

        csv_dir = config.output_file_path('input_data')
        if not os.path.exists(csv_dir):
            os.makedirs(csv_dir)  # make directory if needed
        df.to_csv(os.path.join(csv_dir, '%s.csv' % tablename), index=False)

    if drop_columns:
        logger.debug("dropping columns: %s" % drop_columns)
        df.drop(columns=drop_columns, inplace=True, errors='ignore')

    if column_map:
        warnings.warn("table_inf option 'column_map' renamed 'rename_columns'"
                      "Support for 'column_map' will be removed in future versions.",
                      FutureWarning)
        logger.debug("renaming columns: %s" % column_map)
        df.rename(columns=column_map, inplace=True)

    # rename columns first, so keep_columns can be a stable list of expected/required columns
    if rename_columns:
        logger.debug("renaming columns: %s" % rename_columns)
        df.rename(columns=rename_columns, inplace=True)

    # set index
    if index_col is not None:
        if index_col in df.columns:
            assert not df.duplicated(index_col).any()
            if canonical_index_col:
                # we expect canonical indexes to be integer-valued
                assert (df[index_col] == df[index_col].astype(int)).all(), \
                    f"Index col '{index_col}' has non-integer values"
                df[index_col] = df[index_col].astype(int)
            df.set_index(index_col, inplace=True)
        else:
            # FIXME not sure we want to do this. More likely they omitted index col than that they want to name it?
            # df.index.names = [index_col]
            logger.error(f"index_col '{index_col}' specified in configs but not in {tablename} table!")
            logger.error(f"{tablename} columns are: {list(df.columns)}")
            raise RuntimeError(f"index_col '{index_col}' not in {tablename} table!")

    if keep_columns:
        logger.debug("keeping columns: %s" % keep_columns)
        if not set(keep_columns).issubset(set(df.columns)):
            logger.error(f"Required columns missing from {tablename} table: "
                         f"{list(set(keep_columns).difference(set(df.columns)))}")
            logger.error(f"{tablename} table has columns: {list(df.columns)}")
            raise RuntimeError(f"Required columns missing from {tablename} table")

        df = df[keep_columns]

    if df.columns.duplicated().any():
        duplicate_column_names = df.columns[df.columns.duplicated(keep=False)].unique().to_list()
        assert not df.columns.duplicated().any(), f"duplicate column names in {tablename}: {duplicate_column_names}"

    logger.debug('%s table columns: %s' % (tablename, df.columns.values))
    logger.debug('%s table size: %s' % (tablename, util.df_size(df)))
    logger.debug('%s index name: %s' % (tablename, df.index.name))

    return df
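The docstring above points to input_table_list in settings.yaml for a working example; as a rough sketch, one entry of that list, i.e. one table_info dict handed to read_from_table_info(), could look like the following (key names are those the function reads, file and column names are invented):

# Hypothetical table_info dict for read_from_table_info(); values are made up.
table_info = {
    'tablename': 'households',            # pipeline table name
    'filename': 'households.csv',         # csv (or h5) file in data_dir
    'index_col': 'household_id',          # must match the canonical index name, if any
    'rename_columns': {'HHID': 'household_id'},
    'keep_columns': ['home_zone_id', 'income'],  # checked after rename and set_index
    'h5_tablename': 'households',         # target table name when writing input_data.h5
    'dtypes': {'HHID': 'int64'},
}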
Example #21
    def load_skim_info(self, skim_tag):
        """
        Read omx files for skim <skim_tag> (e.g. 'TAZ') and build skim_info dict

        Parameters
        ----------
        skim_tag: str

        """

        self.omx_file_names = self.network_los.omx_file_names(skim_tag)

        # ignore any 3D skims not in skim_time_periods
        # specifically, load all skims except those with key2 not in dim3_tags_to_load
        skim_time_periods = self.network_los.skim_time_periods
        dim3_tags_to_load = skim_time_periods and skim_time_periods['labels']

        self.omx_manifest = {}  # dict mapping { omx_key: omx_file_name }

        for omx_file_name in self.omx_file_names:

            omx_file_path = config.data_file_path(omx_file_name)

            # logger.debug(f"load_skim_info {skim_tag} reading {omx_file_path}")

            with omx.open_file(omx_file_path) as omx_file:

                # fixme call to omx_file.shape() failing in windows p3.5
                if self.omx_shape is None:
                    self.omx_shape = tuple(
                        int(i) for i in
                        omx_file.shape())  # sometimes omx shape are floats!
                else:
                    assert (self.omx_shape == tuple(
                        int(i) for i in omx_file.shape()))

                for skim_name in omx_file.listMatrices():
                    assert skim_name not in self.omx_manifest, \
                        f"duplicate skim '{skim_name}' found in {self.omx_manifest[skim_name]} and {omx_file}"
                    self.omx_manifest[skim_name] = omx_file_name

                for m in omx_file.listMappings():
                    if self.offset_map is None:
                        self.offset_map_name = m
                        self.offset_map = omx_file.mapentries(
                            self.offset_map_name)
                        assert len(self.offset_map) == self.omx_shape[0]
                    else:
                        # don't really expect more than one, but ok if they are all the same
                        if not (self.offset_map == omx_file.mapentries(m)):
                            raise RuntimeError(
                                f"Multiple mappings in omx file: {self.offset_map_name} != {m}"
                            )

        # - omx_keys dict maps skim key to omx_key
        # DISTWALK: DISTWALK
        # ('DRV_COM_WLK_BOARDS', 'AM'): DRV_COM_WLK_BOARDS__AM, ...
        self.omx_keys = dict()
        for skim_name in self.omx_manifest.keys():
            key1, sep, key2 = skim_name.partition('__')

            # - ignore composite tags not in dim3_tags_to_load
            if dim3_tags_to_load and sep and key2 not in dim3_tags_to_load:
                continue

            skim_key = (key1, key2) if sep else key1

            self.omx_keys[skim_key] = skim_name

        self.num_skims = len(self.omx_keys)

        # - key1_subkeys dict maps key1 to dict of subkeys with that key1
        # DIST: {'DIST': 0}
        # DRV_COM_WLK_BOARDS: {'MD': 1, 'AM': 0, 'PM': 2}, ...
        key1_subkeys = dict()
        for skim_key, omx_key in self.omx_keys.items():
            if isinstance(skim_key, tuple):
                key1, key2 = skim_key
            else:
                key1 = key2 = skim_key
            key2_dict = key1_subkeys.setdefault(key1, {})
            key2_dict[key2] = len(key2_dict)

        key1_block_offsets = dict()
        offset = 0
        for key1, v in key1_subkeys.items():
            num_subkeys = len(v)
            key1_block_offsets[key1] = offset
            offset += num_subkeys

        # - block_offsets dict maps skim_key to offset of omx matrix
        # DIST: 0,
        # ('DRV_COM_WLK_BOARDS', 'AM'): 3,
        # ('DRV_COM_WLK_BOARDS', 'MD') 4, ...
        self.block_offsets = dict()
        for skim_key in self.omx_keys:

            if isinstance(skim_key, tuple):
                key1, key2 = skim_key
            else:
                key1 = key2 = skim_key

            key1_offset = key1_block_offsets[key1]
            key2_relative_offset = key1_subkeys.get(key1).get(key2)
            self.block_offsets[skim_key] = key1_offset + key2_relative_offset

        if skim_dictionary.ROW_MAJOR_LAYOUT:
            self.skim_data_shape = (self.num_skims, self.omx_shape[0],
                                    self.omx_shape[1])
        else:
            self.skim_data_shape = self.omx_shape + (self.num_skims, )

        # list of base keys (key1 values, in block-offset order)
        self.base_keys = tuple(k for k in key1_block_offsets.keys())
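The key1/key2 bookkeeping above can be exercised in isolation; a standalone sketch with invented skim names (real names come from omx_file.listMatrices()) showing how partition('__') yields omx_keys and how block offsets are assigned:

# Sketch of the omx_keys / block_offsets construction in load_skim_info(),
# using made-up skim names.
omx_names = ['DIST', 'SOV_TIME__AM', 'SOV_TIME__MD', 'SOV_TIME__PM']
dim3_tags_to_load = ['AM', 'MD', 'PM']

omx_keys = {}
for skim_name in omx_names:
    key1, sep, key2 = skim_name.partition('__')
    if dim3_tags_to_load and sep and key2 not in dim3_tags_to_load:
        continue
    omx_keys[(key1, key2) if sep else key1] = skim_name
# omx_keys: {'DIST': 'DIST', ('SOV_TIME', 'AM'): 'SOV_TIME__AM', ...}

key1_subkeys = {}
for skim_key in omx_keys:
    key1, key2 = skim_key if isinstance(skim_key, tuple) else (skim_key, skim_key)
    key2_dict = key1_subkeys.setdefault(key1, {})
    key2_dict[key2] = len(key2_dict)

key1_block_offsets, offset = {}, 0
for key1, subkeys in key1_subkeys.items():
    key1_block_offsets[key1] = offset
    offset += len(subkeys)

block_offsets = {}
for skim_key in omx_keys:
    key1, key2 = skim_key if isinstance(skim_key, tuple) else (skim_key, skim_key)
    block_offsets[skim_key] = key1_block_offsets[key1] + key1_subkeys[key1][key2]
# block_offsets: {'DIST': 0, ('SOV_TIME', 'AM'): 1, ('SOV_TIME', 'MD'): 2, ('SOV_TIME', 'PM'): 3}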
Example #22
def read_from_table_info(table_info):
    """
    Read input text files and return cleaned up DataFrame.

    table_info is a dictionary that specifies the following input params.

    See input_table_list in settings.yaml in the example folder for a working example

    +--------------+----------------------------------------------------------+
    | key          | description                                              |
    +==============+==========================================================+
    | tablename    | name of pipeline table in which to store dataframe       |
    +--------------+----------------------------------------------------------+
    | filename     | name of csv file to read (in data_dir)                   |
    +--------------+----------------------------------------------------------+
    | column_map   | list of input columns to rename from_name: to_name       |
    +--------------+----------------------------------------------------------+
    | index_col    | name of column to set as dataframe index column          |
    +--------------+----------------------------------------------------------+
    | drop_columns | list of column names of columns to drop                  |
    +--------------+----------------------------------------------------------+
    | h5_tablename | name of target table in HDF5 file                        |
    +--------------+----------------------------------------------------------+

    """
    input_store = config.setting('input_store', None)
    create_input_store = config.setting('create_input_store', default=False)

    tablename = table_info.get('tablename')
    data_filename = table_info.get('filename', input_store)
    h5_tablename = table_info.get('h5_tablename') or tablename
    drop_columns = table_info.get('drop_columns', None)
    column_map = table_info.get('column_map', None)
    keep_columns = table_info.get('keep_columns', None)
    rename_columns = table_info.get('rename_columns', None)
    index_col = table_info.get('index_col', None)

    assert tablename is not None, 'no tablename provided'
    assert data_filename is not None, 'no input file provided'

    data_file_path = config.data_file_path(data_filename)

    df = _read_input_file(data_file_path, h5_tablename=h5_tablename)

    logger.debug('raw %s table columns: %s' % (tablename, df.columns.values))
    logger.debug('raw %s table size: %s' % (tablename, util.df_size(df)))

    if create_input_store:
        h5_filepath = config.output_file_path('input_data.h5')
        logger.info('writing %s to %s' % (h5_tablename, h5_filepath))
        df.to_hdf(h5_filepath, key=h5_tablename, mode='a')

        csv_dir = config.output_file_path('input_data')
        if not os.path.exists(csv_dir):
            os.makedirs(csv_dir)  # make directory if needed
        df.to_csv(os.path.join(csv_dir, '%s.csv' % tablename), index=False)

    if drop_columns:
        logger.debug("dropping columns: %s" % drop_columns)
        df.drop(columns=drop_columns, inplace=True, errors='ignore')

    if column_map:
        warnings.warn(
            "table_inf option 'column_map' renamed 'rename_columns'"
            "Support for 'column_map' will be removed in future versions.",
            FutureWarning)
        logger.debug("renaming columns: %s" % column_map)
        df.rename(columns=column_map, inplace=True)

    # rename columns first, so keep_columns can be a stable list of expected/required columns
    if rename_columns:
        logger.info("renaming columns: %s" % rename_columns)
        df.rename(columns=rename_columns, inplace=True)

    # set index
    if index_col is not None:
        if index_col in df.columns:
            assert not df.duplicated(index_col).any()
            df.set_index(index_col, inplace=True)
        else:
            df.index.names = [index_col]

    logger.info("keeping columns: %s" % keep_columns)
    if keep_columns:
        logger.info("keeping columns: %s" % keep_columns)
        df = df[keep_columns]

    logger.debug('%s table columns: %s' % (tablename, df.columns.values))
    logger.debug('%s table size: %s' % (tablename, util.df_size(df)))
    logger.info('%s index name: %s' % (tablename, df.index.name))

    return df
Example #23
    def load_data(self):
        """
        Load tables and skims from files specified in network_los settings
        """

        # load maz tables
        if self.zone_system in [TWO_ZONE, THREE_ZONE]:

            # maz
            file_name = self.setting('maz')
            self.maz_taz_df = pd.read_csv(config.data_file_path(file_name, mandatory=True))
            self.maz_taz_df = self.maz_taz_df[['MAZ', 'TAZ']].sort_values(by='MAZ')  # only fields we need

            self.maz_ceiling = self.maz_taz_df.MAZ.max() + 1

            # maz_to_maz_df
            maz_to_maz_tables = self.setting('maz_to_maz.tables')
            maz_to_maz_tables = [maz_to_maz_tables] if isinstance(maz_to_maz_tables, str) else maz_to_maz_tables
            for file_name in maz_to_maz_tables:

                df = pd.read_csv(config.data_file_path(file_name, mandatory=True))

                df['i'] = df.OMAZ * self.maz_ceiling + df.DMAZ
                df.set_index('i', drop=True, inplace=True, verify_integrity=True)
                logger.debug(f"loading maz_to_maz table {file_name} with {len(df)} rows")

                # FIXME - don't really need these columns, but if we do want them,
                #  we would need to merge them in since files may have different numbers of rows
                df.drop(columns=['OMAZ', 'DMAZ'], inplace=True)

                # besides, we only want data columns so we can coerce to same type as skims
                df = df.astype(np.dtype(self.skim_dtype_name))

                if self.maz_to_maz_df is None:
                    self.maz_to_maz_df = df
                else:
                    self.maz_to_maz_df = pd.concat([self.maz_to_maz_df, df], axis=1)

        # load tap tables
        if self.zone_system == THREE_ZONE:

            # tap_df should already have been loaded by load_skim_info because,
            # during multiprocessing, it is required by TapTapUidCalculator to size TVPBCache
            # self.tap_df = pd.read_csv(config.data_file_path(self.setting('tap'), mandatory=True))
            assert self.tap_df is not None

            # maz_to_tap_dfs - different sized sparse arrays with different columns, so we keep them separate
            for mode, maz_to_tap_settings in self.setting('maz_to_tap').items():

                assert 'table' in maz_to_tap_settings, \
                    f"Expected setting maz_to_tap.{mode}.table not found in in {LOS_SETTINGS_FILE_NAME}"

                file_name = maz_to_tap_settings['table']
                df = pd.read_csv(config.data_file_path(file_name, mandatory=True))

                # trim tap set
                # if provided, use tap_line_distance_col together with tap_lines table to trim the near tap set
                # to only include the nearest tap to origin when more than one tap serves the same line
                distance_col = maz_to_tap_settings.get('tap_line_distance_col')
                if distance_col:

                    if self.tap_lines_df is None:
                        # load tap_lines on demand (required if they specify tap_line_distance_col)
                        tap_lines_file_name = self.setting('tap_lines')
                        self.tap_lines_df = pd.read_csv(config.data_file_path(tap_lines_file_name, mandatory=True))

                        # csv file has one row per TAP with space-delimited list of lines served by that TAP
                        #  TAP                                      LINES
                        # 6020  GG_024b_SB GG_068_RT GG_228_WB GG_023X_RT
                        # stack to create dataframe with one column 'line' indexed by TAP with one row per line served
                        #  TAP        line
                        # 6020  GG_024b_SB
                        # 6020   GG_068_RT
                        # 6020   GG_228_WB
                        self.tap_lines_df = \
                            self.tap_lines_df.set_index('TAP').LINES.str.split(expand=True)\
                                .stack().droplevel(1).to_frame('line')

                    old_len = len(df)

                    # NOTE - merge will remove unused taps (not appearing in tap_lines)
                    df = pd.merge(df, self.tap_lines_df, left_on='TAP', right_index=True)

                    # find nearest TAP to MAZ that serves line
                    df = df.sort_values(by=distance_col).drop_duplicates(subset=['MAZ', 'line'])

                    # we don't need to remember which lines are served by which TAPs
                    df = df.drop(columns='line').drop_duplicates(subset=['MAZ', 'TAP']).sort_values(['MAZ', 'TAP'])

                    logger.debug(f"trimmed maz_to_tap table {file_name} from {old_len} to {len(df)} rows")
                    logger.debug(f"maz_to_tap table {file_name} max {distance_col} {df[distance_col].max()}")

                    max_dist = maz_to_tap_settings.get('max_dist', None)
                    if max_dist:
                        old_len = len(df)
                        df = df[df[distance_col] <= max_dist]
                        logger.debug(f"trimmed maz_to_tap table {file_name} from {old_len} to {len(df)} rows "
                                     f"based on max_dist {max_dist}")

                    if TRACE_TRIMMED_MAZ_TO_TAP_TABLES:
                        tracing.write_csv(df, file_name=f"trimmed_{maz_to_tap_settings['table']}", transpose=False)

                df.set_index(['MAZ', 'TAP'], drop=True, inplace=True, verify_integrity=True)
                logger.debug(f"loaded maz_to_tap table {file_name} with {len(df)} rows")

                assert mode not in self.maz_to_tap_dfs
                self.maz_to_tap_dfs[mode] = df

        mem.trace_memory_info('#MEM network_los.load_data before create_skim_dicts')

        # create taz skim dict
        assert 'taz' not in self.skim_dicts
        self.skim_dicts['taz'] = self.create_skim_dict('taz')
        # make sure skim has all taz_ids
        # FIXME - weird that there is no list of tazs?

        # create MazSkimDict facade
        if self.zone_system in [TWO_ZONE, THREE_ZONE]:
            # create MazSkimDict facade skim_dict
            # (must have already loaded dependencies: taz skim_dict, maz_to_maz_df, and maz_taz_df)
            assert 'maz' not in self.skim_dicts
            self.skim_dicts['maz'] = self.create_skim_dict('maz')
            # make sure skim has all maz_ids
            assert set(self.maz_taz_df['MAZ'].values).issubset(set(self.skim_dicts['maz'].zone_ids))

        # create tap skim dict
        if self.zone_system == THREE_ZONE:
            assert 'tap' not in self.skim_dicts
            self.skim_dicts['tap'] = self.create_skim_dict('tap')
            # make sure skim has all tap_ids
            assert set(self.tap_df['TAP'].values).issubset(set(self.skim_dicts['tap'].zone_ids))

        mem.trace_memory_info("network_los.load_data after create_skim_dicts")
Example #24
def read_from_table_info(table_info):
    """
    Read input text files and return cleaned up DataFrame.

    table_info is a dictionary that specifies the following input params.

    See input_table_list in settings.yaml in the example folder for a working example

    +--------------+----------------------------------------------------------+
    | key          | description                                              |
    +==============+==========================================================+
    | tablename    | name of pipeline table in which to store dataframe       |
    +--------------+----------------------------------------------------------+
    | filename     | name of csv file to read (in data_dir)                   |
    +--------------+----------------------------------------------------------+
    | column_map   | list of input columns to rename from_name: to_name       |
    +--------------+----------------------------------------------------------+
    | index_col    | name of column to set as dataframe index column          |
    +--------------+----------------------------------------------------------+
    | drop_columns | list of column names of columns to drop                  |
    +--------------+----------------------------------------------------------+
    | h5_tablename | name of target table in HDF5 file                        |
    +--------------+----------------------------------------------------------+

    """
    input_store = config.setting('input_store', None)
    create_input_store = config.setting('create_input_store', default=False)

    tablename = table_info.get('tablename')
    data_filename = table_info.get('filename', input_store)
    h5_tablename = table_info.get('h5_tablename') or tablename
    drop_columns = table_info.get('drop_columns', None)
    column_map = table_info.get('column_map', None)
    index_col = table_info.get('index_col', None)

    assert tablename is not None, 'no tablename provided'
    assert data_filename is not None, 'no input file provided'

    data_file_path = config.data_file_path(data_filename)

    df = _read_input_file(data_file_path, h5_tablename=h5_tablename)

    logger.info('%s table columns: %s' % (tablename, df.columns.values))
    logger.info('%s table size: %s' % (tablename, util.df_size(df)))

    if create_input_store:
        h5_filepath = config.output_file_path('input_data.h5')
        logger.info('writing %s to %s' % (h5_tablename, h5_filepath))
        df.to_hdf(h5_filepath, key=h5_tablename, mode='a')

    if drop_columns:
        for c in drop_columns:
            logger.info("dropping column '%s'" % c)
            del df[c]

    if column_map:
        df.rename(columns=column_map, inplace=True)

    # set index
    if index_col is not None:
        if index_col in df.columns:
            assert not df.duplicated(index_col).any()
            df.set_index(index_col, inplace=True)
        else:
            df.index.names = [index_col]

    logger.info('%s index name: %s' % (tablename, df.index.name))

    return df