Example No. 1
def _destination_sample(primary_purpose, trips, alternatives, model_settings,
                        size_term_matrix, skims, alt_dest_col_name, estimator,
                        chunk_size, chunk_tag, trace_label):
    """

    Note: trips with no viable destination receive no sample rows
    (because we call interaction_sample with allow_zero_probs=True).
    All other trips will have one or more rows, with pick_count values summing to sample_size.

    Returns
    -------
    choices : pandas.DataFrame

               alt_dest      prob  pick_count
    trip_id
    102829169      2898  0.002333           1
    102829169      2901  0.004976           1
    102829169      3193  0.002628           1
    """

    spec = simulate.spec_for_segment(model_settings,
                                     spec_id='DESTINATION_SAMPLE_SPEC',
                                     segment_name=primary_purpose,
                                     estimator=estimator)

    sample_size = model_settings['SAMPLE_SIZE']
    if config.setting('disable_destination_sampling',
                      False) or (estimator
                                 and estimator.want_unsampled_alternatives):
        # FIXME interaction_sample will return unsampled complete alternatives with probs and pick_count
        logger.info(
            "Estimation mode for %s using unsampled alternatives short_circuit_choices"
            % (trace_label, ))
        sample_size = 0

    locals_dict = config.get_model_constants(model_settings).copy()

    # size_terms of destination zones are purpose-specific, and trips have various purposes
    # so the relevant size_term for each interaction_sample row
    # cannot be determined until after choosers are joined with alternatives
    # (unless we iterate over trip.purpose - which we could, though we are already iterating over trip_num)
    # so, instead, expressions determine row-specific size_term by a call to: size_terms.get(df.alt_dest, df.purpose)
    locals_dict.update({'size_terms': size_term_matrix})
    locals_dict.update(skims)

    log_alt_losers = config.setting('log_alt_losers', False)

    choices = interaction_sample(choosers=trips,
                                 alternatives=alternatives,
                                 sample_size=sample_size,
                                 alt_col_name=alt_dest_col_name,
                                 log_alt_losers=log_alt_losers,
                                 allow_zero_probs=True,
                                 spec=spec,
                                 skims=skims,
                                 locals_d=locals_dict,
                                 chunk_size=chunk_size,
                                 chunk_tag=chunk_tag,
                                 trace_label=trace_label)

    return choices
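
The inline comment above notes that each interaction row's size term is resolved by an
expression like size_terms.get(df.alt_dest, df.purpose) once choosers are joined with
alternatives. A minimal sketch of that kind of purpose-by-zone lookup, using a plain pandas
DataFrame as a stand-in for the real size_term_matrix object (illustrative only, not the
actual ActivitySim API):

import pandas as pd

# hypothetical size term table: rows are destination zones, columns are trip purposes
size_terms_df = pd.DataFrame(
    {'work': [120.0, 45.0], 'shopping': [300.0, 80.0]},
    index=pd.Index([2898, 2901], name='zone_id'))

def get_size_terms(alt_dest, purpose):
    # one size term per interaction row: pick the row by zone and the column by purpose
    return size_terms_df.to_numpy()[
        size_terms_df.index.get_indexer(alt_dest),
        size_terms_df.columns.get_indexer(purpose)]

df = pd.DataFrame({'alt_dest': [2898, 2901, 2898],
                   'purpose': ['work', 'work', 'shopping']})
df['size_term'] = get_size_terms(df.alt_dest, df.purpose)  # 120.0, 45.0, 300.0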
Example No. 2
def add_geography_columns(incidence_table, households_df, crosswalk_df):
    """
    Add seed and meta geography columns to incidence_table

    Parameters
    ----------
    incidence_table : pandas.DataFrame
    households_df : pandas.DataFrame
    crosswalk_df : pandas.DataFrame

    Returns
    -------
    incidence_table : pandas.DataFrame
        with seed and meta geography columns added
    """

    geographies = setting('geographies')
    meta_geography = geographies[0]
    seed_geography = setting('seed_geography')

    # add seed_geography col to incidence table
    incidence_table[seed_geography] = households_df[seed_geography]

    # add meta column to incidence table
    seed_to_meta = \
        crosswalk_df[[seed_geography, meta_geography]] \
        .groupby(seed_geography, as_index=True).min()[meta_geography]
    incidence_table[meta_geography] = incidence_table[seed_geography].map(
        seed_to_meta)

    return incidence_table
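
A small worked example of the seed-to-meta mapping above, with toy crosswalk and household
data (the PUMA/REGION column names are illustrative only):

import pandas as pd

# toy crosswalk: each seed zone (PUMA) belongs to exactly one meta zone (REGION)
crosswalk_df = pd.DataFrame({'PUMA': [100, 100, 200], 'REGION': [1, 1, 2]})
households_df = pd.DataFrame({'PUMA': [100, 200, 200]},
                             index=pd.Index([11, 12, 13], name='hh_id'))
incidence_table = pd.DataFrame(index=households_df.index)

incidence_table['PUMA'] = households_df['PUMA']
seed_to_meta = crosswalk_df[['PUMA', 'REGION']].groupby('PUMA', as_index=True).min()['REGION']
incidence_table['REGION'] = incidence_table['PUMA'].map(seed_to_meta)
# hh 11 -> REGION 1, hh 12 and 13 -> REGION 2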
Example No. 3
def _location_sample(segment_name, choosers, alternatives, skims, estimator,
                     model_settings, alt_dest_col_name, chunk_size, chunk_tag,
                     trace_label):
    """
    select a sample of alternative locations.

    Logsum calculations are expensive, so we build a table of persons * all zones
    and then select a sample subset of potential locations

    The sample subset is generated by making multiple choices (<sample_size> number of choices),
    which results in a sample containing up to <sample_size> choices for each chooser (e.g. person)
    and a pick_count indicating how many times that choice was selected for that chooser.

    person_id,  dest_zone_id, rand,            pick_count
    23750,      14,           0.565502716034,  4
    23750,      16,           0.711135838871,  6
    ...
    23751,      12,           0.408038878552,  1
    23751,      14,           0.972732479292,  2
    """
    assert not choosers.empty

    logger.info("Running %s with %d persons" %
                (trace_label, len(choosers.index)))

    sample_size = model_settings["SAMPLE_SIZE"]
    if config.setting('disable_destination_sampling',
                      False) or (estimator
                                 and estimator.want_unsampled_alternatives):
        # FIXME interaction_sample will return unsampled complete alternatives with probs and pick_count
        logger.info(
            "Estimation mode for %s using unsampled alternatives short_circuit_choices"
            % (trace_label, ))
        sample_size = 0

    locals_d = {'skims': skims, 'segment_size': segment_name}
    constants = config.get_model_constants(model_settings)
    locals_d.update(constants)

    spec = simulate.spec_for_segment(model_settings,
                                     spec_id='SAMPLE_SPEC',
                                     segment_name=segment_name,
                                     estimator=estimator)

    # here since presumably we want this when called for either sample or presample
    log_alt_losers = config.setting('log_alt_losers', False)

    choices = interaction_sample(choosers,
                                 alternatives,
                                 spec=spec,
                                 sample_size=sample_size,
                                 alt_col_name=alt_dest_col_name,
                                 log_alt_losers=log_alt_losers,
                                 skims=skims,
                                 locals_d=locals_d,
                                 chunk_size=chunk_size,
                                 chunk_tag=chunk_tag,
                                 trace_label=trace_label)

    return choices
Example No. 4
    def load_settings(self):
        """
        Read setting file and initialize object variables (see class docstring for list of object variables)
        """

        try:
            self.los_settings = config.read_settings_file(self.los_settings_file_name, mandatory=True)
        except config.SettingsFileNotFound as e:

            print(f"los_settings_file_name {self.los_settings_file_name} not found - trying global settings")
            print(f"skims_file: {config.setting('skims_file')}")
            print(f"skim_time_periods: {config.setting('skim_time_periods')}")
            print(f"source_file_paths: {config.setting('source_file_paths')}")
            print(f"inject.get_injectable('configs_dir') {inject.get_injectable('configs_dir')}")

            # look for legacy 'skims_file' setting in global settings file
            if config.setting('skims_file'):

                warnings.warn("Support for 'skims_file' setting in global settings file will be removed."
                              "Use 'taz_skims' in network_los.yaml config file instead.", FutureWarning)

                # in which case, we also expect to find skim_time_periods in settings file
                skim_time_periods = config.setting('skim_time_periods')
                assert skim_time_periods is not None, "'skim_time_periods' setting not found."
                warnings.warn("Support for 'skim_time_periods' setting in global settings file will be removed."
                              "Put 'skim_time_periods' in network_los.yaml config file instead.", FutureWarning)

                self.los_settings = {
                    'taz_skims': config.setting('skims_file'),
                    'zone_system': ONE_ZONE,
                    'skim_time_periods': skim_time_periods
                }

            else:
                raise e

        # validate skim_time_periods
        self.skim_time_periods = self.setting('skim_time_periods')
        if 'hours' in self.skim_time_periods:
            self.skim_time_periods['periods'] = self.skim_time_periods.pop('hours')
            warnings.warn('support for `skim_time_periods` key `hours` will be removed in '
                          'future versions. Use `periods` instead.',
                          FutureWarning)
        assert 'periods' in self.skim_time_periods, "'periods' key not found in network_los.skim_time_periods"
        assert 'labels' in self.skim_time_periods, "'labels' key not found in network_los.skim_time_periods"

        self.zone_system = self.setting('zone_system')
        assert self.zone_system in [ONE_ZONE, TWO_ZONE, THREE_ZONE], \
            f"Network_LOS: unrecognized zone_system: {self.zone_system}"

        if self.zone_system in [TWO_ZONE, THREE_ZONE]:
            # maz_to_maz_settings
            self.max_blend_distance = self.setting('maz_to_maz.max_blend_distance', default={})
            if isinstance(self.max_blend_distance, int):
                self.max_blend_distance = {'DEFAULT': self.max_blend_distance}
            self.blend_distance_skim_name = self.setting('maz_to_maz.blend_distance_skim_name', default=None)

        # validate skim_time_periods
        self.skim_time_periods = self.setting('skim_time_periods')
        assert {'periods', 'labels'}.issubset(set(self.skim_time_periods.keys()))
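
For reference, a minimal sketch of the parsed settings structure this method validates
(placeholder values; only the keys asserted above are assumed, real configs carry more):

# what a parsed network_los.yaml (self.los_settings) might contain, per the checks above
los_settings = {
    'zone_system': 1,          # assumed ONE_ZONE; TWO_ZONE/THREE_ZONE also read maz_to_maz settings
    'taz_skims': 'skims.omx',
    'skim_time_periods': {
        'periods': [0, 6, 11, 16, 20, 24],         # break points (the legacy key 'hours' is renamed)
        'labels': ['EA', 'AM', 'MD', 'PM', 'EV'],  # one label per period interval
    },
}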
Example No. 5
def add_geography_columns(incidence_table, households_df, crosswalk_df):
    """
    Add seed and meta geography columns to incidence_table

    Parameters
    ----------
    incidence_table : pandas.DataFrame
    households_df : pandas.DataFrame
    crosswalk_df : pandas.DataFrame

    Returns
    -------
    incidence_table : pandas.DataFrame
        with seed and meta geography columns added
    """

    geographies = setting('geographies')
    meta_geography = geographies[0]
    seed_geography = setting('seed_geography')

    # add seed_geography col to incidence table
    incidence_table[seed_geography] = households_df[seed_geography]

    # add meta column to incidence table (unless it's already there)
    if seed_geography != meta_geography:
        tmp = crosswalk_df[list({seed_geography, meta_geography})]
        seed_to_meta = tmp.groupby(seed_geography,
                                   as_index=True).min()[meta_geography]
        incidence_table[meta_geography] = incidence_table[seed_geography].map(
            seed_to_meta)

    return incidence_table
Example No. 6
def run(args):
    """
    Run bca4abm. Specify a project folder using the '--working_dir' option,
    or point to the config, data, and output folders directly with
    '--config', '--data', and '--output'.

    """

    if args.working_dir and os.path.exists(args.working_dir):
        os.chdir(args.working_dir)

    if args.config:
        inject.add_injectable('configs_dir', args.config)

    if args.data:
        inject.add_injectable('data_dir', args.data)

    if args.output:
        inject.add_injectable('output_dir', args.output)

    for injectable in ['configs_dir', 'data_dir', 'output_dir']:
        try:
            dir_path = inject.get_injectable(injectable)
        except RuntimeError:
            sys.exit('Error: please specify either a --working_dir '
                     "containing 'configs', 'data', and 'output' folders "
                     'or all three of --config, --data, and --output')
        if not os.path.exists(dir_path):
            sys.exit("Could not find %s '%s'" % (injectable, os.path.abspath(dir_path)))

    if args.pipeline:
        inject.add_injectable('pipeline_file_name', args.pipeline)

    if args.resume:
        override_setting('resume_after', args.resume)

    tracing.config_logger()
    tracing.delete_csv_files()  # only modifies output_dir
    warnings.simplefilter('always')
    logging.captureWarnings(capture=True)

    t0 = tracing.print_elapsed_time()

    # If you provide a resume_after argument to pipeline.run
    # the pipeline manager will attempt to load checkpointed tables from the checkpoint store
    # and resume pipeline processing on the next submodel step after the specified checkpoint
    resume_after = setting('resume_after', None)

    if resume_after:
        print('resume_after: %s' % resume_after)

    pipeline.run(models=setting('models'), resume_after=resume_after)

    # tables will no longer be available after pipeline is closed
    pipeline.close_pipeline()

    t0 = tracing.print_elapsed_time('all models', t0)
Example No. 7
def meta_summary(incidence_df, control_spec, top_geography, top_id,
                 sub_geographies, hh_id_col):

    if setting('NO_INTEGERIZATION_EVER', False):
        seed_weight_cols = ['preliminary_balanced_weight', 'balanced_weight']
        sub_weight_cols = ['balanced_weight']
    else:
        seed_weight_cols = [
            'preliminary_balanced_weight', 'balanced_weight', 'integer_weight'
        ]
        sub_weight_cols = ['balanced_weight', 'integer_weight']

    incidence_df = incidence_df[incidence_df[top_geography] == top_id]

    control_cols = control_spec.target.values

    controls_df = get_control_table(top_geography)

    # controls for this geography as series
    controls = controls_df[control_cols].loc[top_id]

    incidence = incidence_df[control_cols]

    summary = pd.DataFrame(index=control_cols)

    summary.index.name = 'control_name'

    summary['control_value'] = controls

    seed_geography = setting('seed_geography')
    seed_weights_df = get_weight_table(seed_geography)

    for c in seed_weight_cols:
        if c in seed_weights_df:
            summary_col_name = '%s_%s' % (top_geography, c)
            summary[summary_col_name] = \
                incidence.multiply(seed_weights_df[c], axis="index").sum(axis=0)

    for g in sub_geographies:

        sub_weights = get_weight_table(g)

        if sub_weights is None:
            continue

        sub_weights = sub_weights[sub_weights[top_geography] == top_id]

        sub_weights = sub_weights[[hh_id_col] +
                                  sub_weight_cols].groupby(hh_id_col).sum()

        for c in sub_weight_cols:
            summary['%s_%s' % (g, c)] = \
                incidence.multiply(sub_weights[c], axis="index").sum(axis=0)

    return summary
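
The weighted-sum idiom used twice above, incidence.multiply(weights, axis="index").sum(axis=0),
collapses per-household incidence counts into control totals. A toy illustration:

import pandas as pd

# incidence: one row per household, one column per control
incidence = pd.DataFrame({'num_hh': [1, 1, 1], 'num_workers': [2, 0, 1]},
                         index=[11, 12, 13])
weights = pd.Series([10.0, 5.0, 2.0], index=[11, 12, 13])

totals = incidence.multiply(weights, axis="index").sum(axis=0)
# num_hh         17.0   (10 + 5 + 2)
# num_workers    22.0   (2*10 + 0*5 + 1*2)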
Example No. 8
def load_tables(table_list_name, data_dir=None):

    table_list = setting(table_list_name)
    if table_list is None:
        raise "I expected to find table list '%s' with table_info in settings." % table_list_name

    if data_dir is None:
        data_dir = setting('data_dir', inject.get_injectable('data_dir'))

    for table_name, table_info in table_list.items():

        df = read_table(table_name, table_info, data_dir)
        inject.add_table(table_name, df)
Example No. 9
def run(args):
    """
    Run the models. Specify a project folder using the '--working_dir' option,
    or point to the config, data, and output folders directly with
    '--config', '--data', and '--output'. Both '--config' and '--data' can be
    specified multiple times. Directories listed first take precedence.

    """

    from activitysim import abm  # register injectables

    tracing.config_logger(basic=True)
    handle_standard_args(args)  # possibly update injectables
    tracing.config_logger(basic=False)  # update using possibly new logging configs
    config.filter_warnings()
    logging.captureWarnings(capture=True)

    log_settings()

    t0 = tracing.print_elapsed_time()

    # If you provide a resume_after argument to pipeline.run
    # the pipeline manager will attempt to load checkpointed tables from the checkpoint store
    # and resume pipeline processing on the next submodel step after the specified checkpoint
    resume_after = config.setting('resume_after', None)

    # cleanup if not resuming
    if not resume_after:
        cleanup_output_files()
    elif config.setting('cleanup_trace_files_on_resume', False):
        tracing.delete_trace_files()

    if config.setting('multiprocess', False):
        logger.info('run multiprocess simulation')

        from activitysim.core import mp_tasks
        run_list = mp_tasks.get_run_list()
        injectables = {k: inject.get_injectable(k) for k in INJECTABLES}
        mp_tasks.run_multiprocess(run_list, injectables)
    else:
        logger.info('run single process simulation')

        pipeline.run(models=config.setting('models'),
                     resume_after=resume_after)
        pipeline.close_pipeline()
        chunk.log_write_hwm()

    tracing.print_elapsed_time('all models', t0)
Example No. 10
def preload_injectables():
    """
    preload bulky injectables up front - stuff that isn't inserted into the pipeline
    """

    logger.info("preload_injectables")

    inject.add_step('track_skim_usage', track_skim_usage)
    inject.add_step('write_data_dictionary', write_data_dictionary)
    inject.add_step('write_tables', write_tables)

    table_list = config.setting('input_table_list')

    # default ActivitySim table names and indices
    if table_list is None:
        logger.warning(
            "No 'input_table_list' found in settings. This will be a "
            "required setting in upcoming versions of ActivitySim.")

        new_settings = inject.get_injectable('settings')
        new_settings['input_table_list'] = DEFAULT_TABLE_LIST
        inject.add_injectable('settings', new_settings)

        # keep local table_list in sync so the raw-table export below has a list to iterate
        table_list = DEFAULT_TABLE_LIST

    # FIXME undocumented feature
    if config.setting('write_raw_tables'):

        # write raw input tables as csv (before annotation)
        csv_dir = config.output_file_path('raw_tables')
        if not os.path.exists(csv_dir):
            os.makedirs(csv_dir)  # make directory if needed

        table_names = [t['tablename'] for t in table_list]
        for t in table_names:
            df = inject.get_table(t).to_frame()
            if t == 'households':
                df.drop(columns='chunk_id', inplace=True)
            df.to_csv(os.path.join(csv_dir, '%s.csv' % t), index=True)

    t0 = tracing.print_elapsed_time()

    # FIXME - still want to do this?
    # if inject.get_injectable('skim_dict', None) is not None:
    #     t0 = tracing.print_elapsed_time("preload skim_dict", t0, debug=True)
    #
    # if inject.get_injectable('skim_stack', None) is not None:
    #     t0 = tracing.print_elapsed_time("preload skim_stack", t0, debug=True)

    return True
Example No. 11
def preload_injectables():
    """
    preload bulky injectables up front - stuff that isn't inserted into the pipeline
    """

    logger.info("preload_injectables")

    inject.add_step('track_skim_usage', track_skim_usage)
    inject.add_step('write_data_dictionary', write_data_dictionary)
    inject.add_step('write_tables', write_tables)

    table_list = config.setting('input_table_list')

    # default ActivitySim table names and indices
    if table_list is None:
        logger.warn("No 'input_table_list' found in settings. This will be a "
                    "required setting in upcoming versions of ActivitySim.")

        new_settings = inject.get_injectable('settings')
        new_settings['input_table_list'] = DEFAULT_TABLE_LIST
        inject.add_injectable('settings', new_settings)

    t0 = tracing.print_elapsed_time()

    # FIXME - still want to do this?
    # if inject.get_injectable('skim_dict', None) is not None:
    #     t0 = tracing.print_elapsed_time("preload skim_dict", t0, debug=True)
    #
    # if inject.get_injectable('skim_stack', None) is not None:
    #     t0 = tracing.print_elapsed_time("preload skim_stack", t0, debug=True)

    return True
Example No. 12
def read_input_table(tablename):
    """Reads input table name and returns cleaned DataFrame.

    Uses settings found in input_table_list in settings.yaml

    Parameters
    ----------
    tablename : string

    Returns
    -------
    pandas DataFrame
    """
    table_list = config.setting('input_table_list')
    assert table_list is not None, 'no input_table_list found in settings'

    table_info = None
    for info in table_list:
        if info['tablename'] == tablename:
            table_info = info

    assert table_info is not None, \
        'could not find info for tablename %s in settings.yaml' % tablename

    return read_from_table_info(table_info)
Example No. 13
def read_input_table(tablename, required=True):
    """Reads input table name and returns cleaned DataFrame.

    Uses settings found in input_table_list in global settings file

    Parameters
    ----------
    tablename : string
    required : bool, default True

    Returns
    -------
    pandas DataFrame
    """
    table_list = config.setting('input_table_list')
    assert table_list is not None, 'no input_table_list found in settings'

    table_info = None
    for info in table_list:
        if info['tablename'] == tablename:
            table_info = info

    if table_info is not None:
        df = read_from_table_info(table_info)
    else:
        if required:
            raise RuntimeError(f"could not find info for for tablename {tablename} in settings file")
        df = None

    return df
Example No. 14
    def load_shared_data(self, shared_data_buffers):
        """
        Load omx skim data into shared_data buffers
        Only called when multiprocessing - BEFORE any models are run or any call to load_data()

        Parameters
        ----------
        shared_data_buffers: dict of multiprocessing.RawArray keyed by skim_tag
        """

        assert self.multiprocess()
        # assert self.skim_dict_factory.supports_shared_data_for_multiprocessing

        if self.skim_dict_factory.supports_shared_data_for_multiprocessing:
            for skim_tag in self.skims_info.keys():
                assert skim_tag in shared_data_buffers, f"load_shared_data expected allocated shared_data_buffers"
                self.skim_dict_factory.load_skims_to_buffer(self.skims_info[skim_tag], shared_data_buffers[skim_tag])

        if self.zone_system == THREE_ZONE:
            assert self.tvpb is not None

            if self.rebuild_tvpb_cache and not config.setting('resume_after', None):
                # delete old cache at start of new run so that stale cache is not loaded by load_data_to_buffer
                # when singleprocess, this call is made (later in program flow) in the initialize_los step
                self.tvpb.tap_cache.cleanup()

            self.tvpb.tap_cache.load_data_to_buffer(shared_data_buffers[self.tvpb.tap_cache.cache_tag])
Example No. 15
def write_summaries(output_dir):

    summary_settings_name = 'output_summaries'
    summary_file_name = 'summaries.txt'

    summary_settings = setting(summary_settings_name)

    if summary_settings is None:
        logger.info(
            f"No {summary_settings_name} specified in settings file. Nothing to write."
        )
        return

    summary_dict = summary_settings

    mode = 'wb' if sys.version_info < (3, ) else 'w'
    with open(config.output_file_path(summary_file_name), mode) as output_file:

        for table_name, column_names in summary_dict.items():

            df = pipeline.get_table(table_name)

            for c in column_names:
                n = 100
                empty = (df[c] == '') | df[c].isnull()

                print(
                    f"\n### {table_name}.{c} type: {df.dtypes[c]} rows: {len(df)} ({empty.sum()} empty)\n\n",
                    file=output_file)
                print(df[c].value_counts().nlargest(n), file=output_file)
Example No. 16
def write_skim_cache(skim_info, skim_data):
    """
        write skim data from skim_data to canonically named cache file(s) in output directory
    """

    skim_cache_dir = config.setting('skim_cache_dir', default_skim_cache_dir())
    logger.info(
        f"load_skims writing skims data to cache directory {skim_cache_dir}")

    omx_name = skim_info['omx_name']
    dtype = np.dtype(skim_info['dtype'])

    blocks = skim_info['blocks']
    block = 0
    for block_name, block_size in blocks.items():
        skim_cache_file_name = build_skim_cache_file_name(omx_name, block)
        skim_cache_path = os.path.join(skim_cache_dir, skim_cache_file_name)

        block_data = skim_data[block]

        logger.info(
            f"load_skims writing block_name {block_name} {block_data.shape} to {skim_cache_file_name}"
        )

        data = np.memmap(skim_cache_path,
                         shape=block_data.shape,
                         dtype=dtype,
                         mode='w+')
        data[::] = block_data

        block += 1
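
Presumably the companion read path maps the same files back in read-only with a matching
shape and dtype; a minimal sketch (reusing the writer's naming helper, other names assumed):

import os
import numpy as np

def read_skim_cache_block(skim_cache_dir, omx_name, block, shape, dtype):
    # map the cached block back into memory without copying it
    skim_cache_file_name = build_skim_cache_file_name(omx_name, block)  # same helper as the writer
    skim_cache_path = os.path.join(skim_cache_dir, skim_cache_file_name)
    return np.memmap(skim_cache_path, shape=shape, dtype=dtype, mode='r')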
Example No. 17
def scenario_dir():

    scenarios_dir = setting('scenarios_dir')
    assert scenarios_dir is not None, "scenarios_dir not defined in settings file"

    if not os.path.exists(scenarios_dir):
        raise RuntimeError("scenarios_dir not found: %s" % scenarios_dir)

    scenario_name = setting('scenario_name')
    assert scenario_name is not None, "scenario_name not defined in settings file"

    scenario_dir_path = os.path.join(scenarios_dir, scenario_name)
    assert os.path.exists(
        scenario_dir_path), "scenario_dir not found: %s" % scenario_dir_path

    return scenario_dir_path
Example No. 18
def scenarios_dir():

    scenarios_dir = setting('scenarios_dir', 'scenarios')

    if not os.path.exists(scenarios_dir):
        raise RuntimeError("scenarios_dir: directory does not exist")
    return scenarios_dir
Example No. 19
def _destination_sample(spec_segment_name, choosers, destination_size_terms,
                        skims, estimator, model_settings, alt_dest_col_name,
                        chunk_size, chunk_tag, trace_label):

    model_spec = simulate.spec_for_segment(model_settings,
                                           spec_id='SAMPLE_SPEC',
                                           segment_name=spec_segment_name,
                                           estimator=estimator)

    logger.info("running %s with %d tours", trace_label, len(choosers))

    sample_size = model_settings['SAMPLE_SIZE']
    if config.setting('disable_destination_sampling',
                      False) or (estimator
                                 and estimator.want_unsampled_alternatives):
        # FIXME interaction_sample will return unsampled complete alternatives with probs and pick_count
        logger.info(
            "Estimation mode for %s using unsampled alternatives short_circuit_choices"
            % (trace_label, ))
        sample_size = 0

    locals_d = {'skims': skims}
    constants = config.get_model_constants(model_settings)
    if constants is not None:
        locals_d.update(constants)

    log_alt_losers = config.setting('log_alt_losers', False)

    choices = interaction_sample(choosers,
                                 alternatives=destination_size_terms,
                                 sample_size=sample_size,
                                 alt_col_name=alt_dest_col_name,
                                 log_alt_losers=log_alt_losers,
                                 spec=model_spec,
                                 skims=skims,
                                 locals_d=locals_d,
                                 chunk_size=chunk_size,
                                 chunk_tag=chunk_tag,
                                 trace_label=trace_label)

    # remember person_id in chosen alts so we can merge with persons in subsequent steps
    # (broadcasts person_id onto all alternatives sharing the same tour_id index value)
    choices['person_id'] = choosers.person_id

    return choices
Example No. 20
    def multiprocess(self):
        """
        return True if this is a multiprocessing run (even if it is a main or single-process subprocess)

        Returns
        -------
            bool
        """
        is_multiprocess = config.setting('multiprocess', False)
        return is_multiprocess
Example No. 21
def multi_integerize(incidence_df, sub_zone_weights, sub_controls_df,
                     control_spec, total_hh_control_col, parent_geography,
                     parent_id, sub_geography, sub_control_zones):
    """

    Parameters
    ----------
    incidence_df : pandas.DataFrame
        full incidence_df for all hh samples in seed zone
    sub_zone_weights : pandas.DataFrame
        balanced subzone household sample weights to integerize
    sub_controls_df : pandas.DataFrame
        sub_geography controls (one row per zone indexed by sub_zone id)
    control_spec : pandas.DataFrame
        full control spec with columns 'target', 'seed_table', 'importance', ...
    total_hh_control_col : str
        name of total_hh column (so we can preferentially match this control)
    parent_geography : str
        parent geography zone name
    parent_id : int
        parent geography zone id
    sub_geography : str
        subzone geography name (e.g. 'TAZ')
    sub_control_zones : pandas.Series
        index is zone id and value is zone label (e.g. TAZ_101)
        for use in sub_controls_df column names

    Returns
    -------
    integer_weights_df : pandas.DataFrame
        canonical form weight table, with columns for 'balanced_weight', 'integer_weight'
        plus columns for household id, parent and sub_geography zone ids
    """

    trace_label = "%s_%s" % (parent_geography, parent_id)

    if setting('NO_INTEGERIZATION_EVER', False):
        integerizer = do_no_integerizing
    elif use_simul_integerizer():
        integerizer = do_simul_integerizing
    else:
        integerizer = do_sequential_integerizing

    integer_weights_df = integerizer(
        trace_label=trace_label,
        incidence_df=incidence_df,
        sub_weights=sub_zone_weights,
        sub_controls_df=sub_controls_df,
        control_spec=control_spec,
        total_hh_control_col=total_hh_control_col,
        sub_geography=sub_geography,
        sub_control_zones=sub_control_zones,
    )

    return integer_weights_df
Example No. 22
def run_destination_sample(spec_segment_name, tours, persons_merged,
                           model_settings, network_los, destination_size_terms,
                           estimator, chunk_size, trace_label):

    # FIXME - MEMORY HACK - only include columns actually used in spec (omit them pre-merge)
    chooser_columns = model_settings['SIMULATE_CHOOSER_COLUMNS']
    persons_merged = persons_merged[[
        c for c in persons_merged.columns if c in chooser_columns
    ]]
    tours = tours[[
        c for c in tours.columns if c in chooser_columns or c == 'person_id'
    ]]
    choosers = pd.merge(tours,
                        persons_merged,
                        left_on='person_id',
                        right_index=True,
                        how='left')

    # interaction_sample requires that choosers.index.is_monotonic_increasing
    if not choosers.index.is_monotonic_increasing:
        logger.debug(
            f"run_destination_sample {trace_label} sorting choosers because not monotonic_increasing"
        )
        choosers = choosers.sort_index()

    # by default, enable presampling for multizone systems, unless they disable it in settings file
    pre_sample_taz = not (network_los.zone_system == los.ONE_ZONE)
    if pre_sample_taz and not config.setting('want_dest_choice_presampling',
                                             True):
        pre_sample_taz = False
        logger.info(f"Disabled destination zone presampling for {trace_label} "
                    f"because 'want_dest_choice_presampling' setting is False")

    if pre_sample_taz:

        logger.info("Running %s destination_presample with %d tours" %
                    (trace_label, len(tours)))

        choices = destination_presample(spec_segment_name, choosers,
                                        model_settings, network_los,
                                        destination_size_terms, estimator,
                                        chunk_size, trace_label)

    else:
        choices = destination_sample(spec_segment_name, choosers,
                                     model_settings, network_los,
                                     destination_size_terms, estimator,
                                     chunk_size, trace_label)

    # remember person_id in chosen alts so we can merge with persons in subsequent steps
    # (broadcasts person_id onto all alternatives sharing the same tour_id index value)
    choices['person_id'] = tours.person_id

    return choices
Example No. 23
def load_skims(omx_file_path, skim_info, skim_buffers):

    read_cache = config.setting('read_skim_cache')
    write_cache = config.setting('write_skim_cache')
    assert not (read_cache and write_cache), \
        "read_skim_cache and write_skim_cache are both True in settings file. I am assuming this is a mistake"

    skim_data = skim_data_from_buffers(skim_buffers, skim_info)

    t0 = tracing.print_elapsed_time()

    if read_cache:
        read_skim_cache(skim_info, skim_data)
        t0 = tracing.print_elapsed_time("read_skim_cache", t0)
    else:
        read_skims_from_omx(skim_info, skim_data, omx_file_path)
        t0 = tracing.print_elapsed_time("read_skims_from_omx", t0)

    if write_cache:
        write_skim_cache(skim_info, skim_data)
        t0 = tracing.print_elapsed_time("write_skim_cache", t0)
Example No. 24
def log_settings():

    settings = [
        'households_sample_size',
        'chunk_size',
        'multiprocess',
        'num_processes',
        'resume_after',
    ]

    for k in settings:
        logger.info('setting %s: %s' % (k, config.setting(k)))
Example No. 25
def input_pre_processor():

    # - load generic data
    data_dir = setting('data_dir', inject.get_injectable('data_dir'))
    load_tables('input_tables', data_dir)

    # - load scenario input data
    scenario_input_dir = os.path.join(scenario_dir(), 'inputs')
    load_tables('scenario_input_tables', scenario_input_dir)

    for table_name in pipeline.orca_dataframe_tables():
        df = inject.get_table(table_name, None).to_frame()
Example No. 26
def merge_seed_data(expanded_household_ids, seed_data_df, seed_columns, trace_label):

    seed_geography = setting('seed_geography')
    hh_col = setting('household_id_col')

    df_columns = seed_data_df.columns.values

    # warn of any columns that aren't in seed_data_df
    for c in seed_columns:
        if c not in df_columns and c != hh_col:
            logger.warning("column '%s' not in %s" % (c, trace_label))

    # remove any columns that aren't in seed_data_df
    df_columns = [c for c in seed_columns if c in df_columns]

    # seed_geography column in seed_data_df is redundant (already in expanded_household_ids table)
    if seed_geography in df_columns:
        df_columns.remove(seed_geography)

    # join to seed_data on either index or hh_col (for persons)
    right_index = (seed_data_df.index.name == hh_col)
    right_on = hh_col if hh_col in seed_data_df.columns and not right_index else None
    assert right_index or right_on

    if right_on and hh_col not in df_columns:
        df_columns.append(hh_col)

    merged_df = pd.merge(
        how="left",
        left=expanded_household_ids,
        right=seed_data_df[df_columns],
        left_on=hh_col,
        right_index=right_index,
        right_on=right_on
    )

    if hh_col not in seed_columns:
        del merged_df[hh_col]

    return merged_df
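
A toy illustration of the left merge above, joining expanded household ids to seed household
attributes on the household id (here via right_index, since the seed households table is
indexed by hh_id; column names are illustrative):

import pandas as pd

seed_households_df = pd.DataFrame({'income': [50000, 72000]},
                                  index=pd.Index([11, 12], name='hh_id'))
expanded_household_ids = pd.DataFrame({'hh_id': [11, 11, 12], 'PUMA': [100, 100, 100]})

merged_df = pd.merge(how="left",
                     left=expanded_household_ids,
                     right=seed_households_df[['income']],
                     left_on='hh_id',
                     right_index=True)
# every expanded copy of household 11 picks up income 50000; household 12 picks up 72000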
Example No. 27
def input_pre_processor():
    """
    Read input text files and save them as pipeline tables for use in subsequent steps.

    The files to read are specified by table_list, an array of dicts that give the
    input file name and the name of the pipeline table, along with keys that allow the
    specification of pre-processing steps.

    By default, reads table_list from 'input_table_list' in settings.yaml,
    unless an alternate table_list name is specified as a model step argument 'table_list'.
    (This allows alternate/additional input files to be read for repop)

    In the case of repop, this step is being run after an initial run has completed,
    in which case the input_table_list may specify replacement tables.
    (e.g. lowest geography controls that will replace the previous low controls dataframe.)

    See input_table_list in settings.yaml in the example folder for a working example

    +--------------+----------------------------------------------------------+
    | key          | description                                              |
    +==============+==========================================================+
    | tablename    | name of pipeline table in which to store dataframe       |
    +--------------+----------------------------------------------------------+
    | filename     | name of csv file to read (in data_dir)                   |
    +--------------+----------------------------------------------------------+
    | column_map   | list of input columns to rename from_name: to_name       |
    +--------------+----------------------------------------------------------+
    | index_col    | name of column to set as dataframe index column          |
    +--------------+----------------------------------------------------------+
    | drop_columns | list of column names of columns to drop                  |
    +--------------+----------------------------------------------------------+

    """

    # alternate table list name may have been provided as a model argument
    table_list_name = inject.get_step_arg('table_list',
                                          default='input_table_list')
    table_list = config.setting(table_list_name)

    assert table_list is not None, "no table list '%s' found in settings." % table_list_name

    logger.info('Using table list: %s' % table_list)

    for table_info in table_list:

        tablename = table_info.get('tablename')
        df = input.read_from_table_info(table_info)
        logger.info('registering table %s' % tablename)

        # add (or replace) pipeline table
        repop = inject.get_step_arg('repop', default=False)
        inject.add_table(tablename, df, replace=repop)
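
For reference, a sketch of what one table_list entry might look like once loaded from the
settings file, using the keys documented in the docstring above (file and column names are
placeholders):

# illustrative input_table_list as parsed from the settings file
input_table_list = [
    {
        'tablename': 'households',
        'filename': 'seed_households.csv',
        'index_col': 'hh_id',
        'column_map': {'hhnum': 'hh_id', 'hhincome': 'income'},
        'drop_columns': ['filler_col'],
    },
]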
Example No. 28
def person_trips_processor(trips_with_demographics, person_trips_spec,
                           person_trips_settings, coc_column_names, settings,
                           chunk_size, trace_hh_id):
    """
    Compute disaggregate trips benefits
    """

    trips_df = trips_with_demographics.to_frame()

    logger.info(
        "Running person_trips_processor with %d trips (chunk size = %s)" %
        (len(trips_with_demographics), chunk_size))

    # eval person_trips_spec in context of trips_with_demographics
    locals_dict = config.get_model_constants(person_trips_settings)
    locals_dict.update(config.setting('globals'))

    locals_dict['trips'] = trips_df

    trace_rows = trace_hh_id and trips_df['household_id'] == trace_hh_id

    coc_summary, trace_results, trace_assigned_locals = \
        bca.eval_and_sum(assignment_expressions=person_trips_spec,
                         df=trips_df,
                         locals_dict=locals_dict,
                         df_alias='trips',
                         group_by_column_names=coc_column_names,
                         chunk_size=chunk_size,
                         trace_rows=trace_rows)

    result_prefix = 'PT_'
    add_result_columns("coc_results", coc_summary, result_prefix)
    add_summary_results(coc_summary,
                        prefix=result_prefix,
                        spec=person_trips_spec)

    if trace_hh_id:

        if trace_results is not None:

            # FIXME - moved this into assign_variables
            # add trips_df columns to trace_results
            # trace_results = pd.concat([trips_df[trace_rows], trace_results], axis=1)

            tracing.write_csv(trace_results,
                              file_name="person_trips",
                              index_label='trip_id',
                              column_labels=['label', 'trip'])

        if trace_assigned_locals:
            tracing.write_csv(trace_assigned_locals,
                              file_name="person_trips_locals")
Example No. 29
def aggregate_zone_processor(zones, trace_od):
    """
    zones: orca table

    zone data for the base and build scenario data files, combined into a single dataframe
    with column names prefixed with base_ or build_, indexed by ZONE
    """

    trace_label = 'aggregate_zone'
    model_settings = config.read_model_settings('aggregate_zone.yaml')
    spec_file_name = model_settings.get('spec_file_name', 'aggregate_zone.csv')
    aggregate_zone_spec = bca.read_assignment_spec(spec_file_name)

    zones_df = zones.to_frame()

    logger.info("Running aggregate_zone_processor with %d zones" %
                (len(zones_df.index), ))

    if trace_od:
        trace_orig, trace_dest = trace_od
        trace_od_rows = (zones_df.index == trace_orig) | (zones_df.index
                                                          == trace_dest)
    else:
        trace_od_rows = None

    # locals whose values will be accessible to the execution context
    # when the expressions in spec are applied to choosers
    locals_dict = config.get_model_constants(model_settings)
    locals_dict.update(config.setting('globals'))

    # eval_variables evaluates each of the expressions in spec
    # in the context of each row in of the choosers dataframe
    results, trace_results, trace_assigned_locals = \
        assign.assign_variables(aggregate_zone_spec,
                                zones_df,
                                locals_dict,
                                df_alias='zones',
                                trace_rows=trace_od_rows)

    pipeline.replace_table('aggregate_zone_summary', results)

    if trace_results is not None:

        tracing.write_csv(trace_results,
                          file_name="aggregate_zone",
                          index_label='zone',
                          column_labels=['label', 'zone'])

        if trace_assigned_locals:
            tracing.write_csv(trace_assigned_locals,
                              file_name="aggregate_zone_locals")
Example No. 30
def trip_destination_sample(primary_purpose, trips, alternatives,
                            model_settings, size_term_matrix, skim_hotel,
                            estimator, chunk_size, trace_hh_id, trace_label):
    """

    Returns
    -------
    destination_sample : pandas.DataFrame
        choices_df from interaction_sample with (up to) sample_size alts for each chooser row
        index (non unique) is trip_id from trips (duplicated for each alt)
        and columns dest_zone_id, prob, and pick_count

        dest_zone_id: int
            alt identifier from alternatives[<alt_col_name>]
        prob: float
            the probability of the chosen alternative
        pick_count : int
            number of duplicate picks for chooser, alt
    """
    trace_label = tracing.extend_trace_label(trace_label,
                                             'trip_destination_sample')

    assert len(trips) > 0
    assert len(alternatives) > 0

    # by default, enable presampling for multizone systems, unless they disable it in settings file
    network_los = inject.get_injectable('network_los')
    pre_sample_taz = not (network_los.zone_system == los.ONE_ZONE)
    if pre_sample_taz and not config.setting('want_dest_choice_presampling',
                                             True):
        pre_sample_taz = False
        logger.info(f"Disabled destination zone presampling for {trace_label} "
                    f"because 'want_dest_choice_presampling' setting is False")

    if pre_sample_taz:

        logger.info("Running %s trip_destination_presample with %d trips" %
                    (trace_label, len(trips)))

        choices = destination_presample(primary_purpose, trips, alternatives,
                                        model_settings, size_term_matrix,
                                        skim_hotel, network_los, estimator,
                                        chunk_size, trace_hh_id, trace_label)

    else:
        choices = destination_sample(primary_purpose, trips, alternatives,
                                     model_settings, size_term_matrix,
                                     skim_hotel, estimator, chunk_size,
                                     trace_label)

    return choices
Example No. 31
def run():
    config.handle_standard_args()

    # specify None for a pseudo random base seed
    # inject.add_injectable('rng_base_seed', 0)

    tracing.config_logger()
    config.filter_warnings()

    tracing.delete_csv_files()

    # If you provide a resume_after argument to pipeline.run
    # the pipeline manager will attempt to load checkpointed tables from the checkpoint store
    # and resume pipeline processing on the next submodel step after the specified checkpoint
    resume_after = setting('resume_after', None)

    if resume_after:
        print("resume_after", resume_after)

    pipeline.run(models=setting('models'), resume_after=resume_after)

    # tables will no longer be available after pipeline is closed
    pipeline.close_pipeline()
Example No. 32
def log_settings(injectables):

    settings = [
        'households_sample_size',
        'chunk_size',
        'multiprocess',
        'num_processes',
        'resume_after',
    ]

    for k in settings:
        logger.info("setting %s: %s" % (k, config.setting(k)))

    for k in injectables:
        logger.info("injectable %s: %s" % (k, inject.get_injectable(k)))
Example No. 33
def read_input_table(table_name):

    filename = setting('input_store', None)

    if not filename:
        logger.error("input store file name not specified in settings")
        raise RuntimeError("store file name not specified in settings")

    input_store_path = config.data_file_path(filename)

    if not os.path.exists(input_store_path):
        logger.error("store file not found: %s" % input_store_path)
        raise RuntimeError("store file not found: %s" % input_store_path)

    df = pd.read_hdf(input_store_path, table_name)

    return df
Example No. 34
def write_tables(output_dir):
    """
    Write pipeline tables as csv files (in output directory) as specified by output_tables list
    in settings file.

    'output_tables' can specify either a list of output tables to include or a list to skip.
    If no output_tables list is specified, then no checkpointed tables will be written.

    To write all output tables EXCEPT the households and persons tables:

    ::

      output_tables:
        action: skip
        tables:
          - households
          - persons

    To write ONLY the households table:

    ::

      output_tables:
        action: include
        tables:
           - households

    Parameters
    ----------
    output_dir: str

    """

    output_tables_settings_name = 'output_tables'

    output_tables_settings = setting(output_tables_settings_name)

    if output_tables_settings is None:
        logger.info("No output_tables specified in settings file. Nothing to write.")
        return

    action = output_tables_settings.get('action')
    tables = output_tables_settings.get('tables')
    prefix = output_tables_settings.get('prefix', 'final_')

    if action not in ['include', 'skip']:
        raise "expected %s action '%s' to be either 'include' or 'skip'" % \
              (output_tables_settings_name, action)

    checkpointed_tables = pipeline.checkpointed_tables()
    if action == 'include':
        output_tables_list = tables
    elif action == 'skip':
        output_tables_list = [t for t in checkpointed_tables if t not in tables]

    for table_name in output_tables_list:

        if table_name == 'checkpoints':
            df = pipeline.get_checkpoints()
        else:
            if table_name not in checkpointed_tables:
                logger.warning("Skipping '%s': Table not found." % table_name)
                continue
            df = pipeline.get_table(table_name)

        file_name = "%s%s.csv" % (prefix, table_name)
        file_path = config.output_file_path(file_name)

        # include the index if it has a name or is a MultiIndex
        write_index = df.index.name is not None or isinstance(df.index, pd.MultiIndex)

        df.to_csv(file_path, index=write_index)
Example No. 35
def add_size_tables():
    """
    inject tour_destination_size_terms tables for each model_selector (e.g. school, workplace)

    Size tables are pandas dataframes with location counts for model_selector by zone and segment
    (tour_destination_size_terms).

    if using shadow pricing, we scale size_table counts to sample population
    (in which case, they have to be created while single-process)

    Scaling is problematic as it breaks household result replicability across sample sizes
    It also changes the magnitude of the size terms so if they are used as utilities in
    expression files, their importance will diminish relative to other utilities as the sample
    size decreases.

    Scaling makes most sense for a full sample in conjunction with shadow pricing, where
    shadow prices can be adjusted iteratively to bring modelled counts into line with desired
    (size table) counts.
    """

    use_shadow_pricing = bool(config.setting('use_shadow_pricing'))

    shadow_settings = config.read_model_settings('shadow_pricing.yaml')
    shadow_pricing_models = shadow_settings['shadow_pricing_models']

    # probably ought not scale if not shadow_pricing (breaks partial sample replicability)
    # but this allows compatibility with existing CTRAMP behavior...
    scale_size_table = shadow_settings.get('SCALE_SIZE_TABLE', False)

    if shadow_pricing_models is None:
        logger.warning('shadow_pricing_models list not found in shadow_pricing settings')
        return

    # shadow_pricing_models is dict of {<model_selector>: <model_name>}
    # since these are scaled to model size, they have to be created while single-process

    for model_selector, model_name in iteritems(shadow_pricing_models):

        model_settings = config.read_model_settings(model_name)

        assert model_selector == model_settings['MODEL_SELECTOR']

        segment_ids = model_settings['SEGMENT_IDS']
        chooser_table_name = model_settings['CHOOSER_TABLE_NAME']
        chooser_segment_column = model_settings['CHOOSER_SEGMENT_COLUMN_NAME']

        choosers_df = inject.get_table(chooser_table_name).to_frame()
        if 'CHOOSER_FILTER_COLUMN_NAME' in model_settings:
            choosers_df = \
                choosers_df[choosers_df[model_settings['CHOOSER_FILTER_COLUMN_NAME']] != 0]

        # - raw_desired_size
        land_use = inject.get_table('land_use')
        size_terms = inject.get_injectable('size_terms')
        raw_size = tour_destination_size_terms(land_use, size_terms, model_selector)
        assert set(raw_size.columns) == set(segment_ids.keys())

        if use_shadow_pricing or scale_size_table:

            inject.add_table('raw_' + size_table_name(model_selector), raw_size)

            # - scale size_table counts to sample population
            # scaled_size = zone_size * (total_segment_modeled / total_segment_desired)

            # segment scale factor (modeled / desired) keyed by segment_name
            segment_scale_factors = {}
            for c in raw_size:
                # number of zone demographics desired destination choices
                segment_desired_size = raw_size[c].astype(np.float64).sum()

                # number of synthetic population choosers in segment
                segment_chooser_count = \
                    (choosers_df[chooser_segment_column] == segment_ids[c]).sum()

                segment_scale_factors[c] = \
                    segment_chooser_count / np.maximum(segment_desired_size, 1)

                logger.info("add_desired_size_tables %s segment %s "
                            "desired %s modeled %s scale_factor %s" %
                            (chooser_table_name, c,
                             segment_desired_size,
                             segment_chooser_count,
                             segment_scale_factors[c]))

            # FIXME - should we be rounding?
            scaled_size = (raw_size * segment_scale_factors).round()
        else:
            scaled_size = raw_size

        inject.add_table(size_table_name(model_selector), scaled_size)
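
A numeric sketch of the scaling above (toy numbers): if a segment has 1,000 synthetic
choosers but its raw size terms sum to 4,000, every zone's size term for that segment is
scaled by 0.25 and rounded:

import numpy as np
import pandas as pd

# raw size terms by zone for a single illustrative segment
raw_size = pd.DataFrame({'highinc': [1000.0, 3000.0]}, index=[1, 2])

segment_desired_size = raw_size['highinc'].astype(np.float64).sum()         # 4000.0
segment_chooser_count = 1000                                                 # choosers in this segment
scale_factor = segment_chooser_count / np.maximum(segment_desired_size, 1)   # 0.25

scaled_size = (raw_size * pd.Series({'highinc': scale_factor})).round()
# highinc: zone 1 -> 250.0, zone 2 -> 750.0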
Example No. 36
    def __init__(self, model_settings, num_processes, shared_data=None, shared_data_lock=None):
        """

        Presence of shared_data is used as a flag for multiprocessing
        If we are multiprocessing, shared_data should be a multiprocessing.RawArray buffer
        to aggregate modeled_size across all sub-processes, and shared_data_lock should be
        a multiprocessing.Lock object to coordinate access to that buffer.

        Optionally load saved shadow_prices from data_dir if config setting use_shadow_pricing
        and shadow_setting LOAD_SAVED_SHADOW_PRICES are both True

        Parameters
        ----------
        model_settings : dict
        shared_data : multiprocessing.RawArray or None (if single process)
        shared_data_lock : multiprocessing.Lock or None (if single process)
        """

        self.num_processes = num_processes
        self.use_shadow_pricing = bool(config.setting('use_shadow_pricing'))
        self.saved_shadow_price_file_path = None  # set by read_saved_shadow_prices if loaded

        self.model_selector = model_settings['MODEL_SELECTOR']

        full_model_run = config.setting('households_sample_size') == 0
        if self.use_shadow_pricing and not full_model_run:
            logging.warning("deprecated combination of use_shadow_pricing and not full_model_run")

        self.segment_ids = model_settings['SEGMENT_IDS']

        # - modeled_size (set by call to set_choices/synchronize_choices)
        self.modeled_size = None

        if self.use_shadow_pricing:
            self.shadow_settings = config.read_model_settings('shadow_pricing.yaml')

            for k in self.shadow_settings:
                logger.debug("shadow_settings %s: %s" % (k, self.shadow_settings.get(k)))

        # - destination_size_table (desired_size)
        self.desired_size = inject.get_table(size_table_name(self.model_selector)).to_frame()

        # - shared_data
        if shared_data is not None:
            assert shared_data.shape[0] == self.desired_size.shape[0]
            assert shared_data.shape[1] == self.desired_size.shape[1] + 1  # tally column
            assert shared_data_lock is not None
        self.shared_data = shared_data
        self.shared_data_lock = shared_data_lock

        # - load saved shadow_prices (if available) and set max_iterations accordingly
        if self.use_shadow_pricing:
            self.shadow_prices = None
            self.shadow_price_method = self.shadow_settings['SHADOW_PRICE_METHOD']
            assert self.shadow_price_method in ['daysim', 'ctramp']

            if self.shadow_settings['LOAD_SAVED_SHADOW_PRICES']:
                # read_saved_shadow_prices logs error and returns None if file not found
                self.shadow_prices = self.read_saved_shadow_prices(model_settings)

            if self.shadow_prices is None:
                self.max_iterations = self.shadow_settings.get('MAX_ITERATIONS', 5)
            else:
                self.max_iterations = self.shadow_settings.get('MAX_ITERATIONS_SAVED', 1)

            # initial_shadow_price if we did not load
            if self.shadow_prices is None:
                # initial value depends on method
                initial_shadow_price = 1.0 if self.shadow_price_method == 'ctramp' else 0.0
                self.shadow_prices = \
                    pd.DataFrame(data=initial_shadow_price,
                                 columns=self.desired_size.columns,
                                 index=self.desired_size.index)
        else:
            self.max_iterations = 1

        self.num_fail = pd.DataFrame(index=self.desired_size.columns)
        self.max_abs_diff = pd.DataFrame(index=self.desired_size.columns)
        self.max_rel_diff = pd.DataFrame(index=self.desired_size.columns)
Example No. 37
    # inject.add_injectable('data_dir', '/Users/jeff.doyle/work/activitysim-data/mtc_tm1/data')
    inject.add_injectable('data_dir', ['ancillary_data', data_dir])
    # inject.add_injectable('data_dir', ['ancillary_data', '../activitysim/abm/test/data'])
    inject.add_injectable('configs_dir', ['configs', '../example/configs'])

    injectables = config.handle_standard_args()

    tracing.config_logger()
    config.filter_warnings()

    log_settings(injectables)

    t0 = tracing.print_elapsed_time()

    # cleanup if not resuming
    if not config.setting('resume_after', False):
        cleanup_output_files()

    run_list = mp_tasks.get_run_list()

    if run_list['multiprocess']:
        # do this after config.handle_standard_args, as command line args may override injectables
        injectables = list(set(injectables) | set(['data_dir', 'configs_dir', 'output_dir']))
        injectables = {k: inject.get_injectable(k) for k in injectables}
    else:
        injectables = None

    run(run_list, injectables)

    # pipeline.open_pipeline('_')
    #