Example #1
def generate_list_of_sources_in_fbs_method(methodname):
    """
    Determine what FlowByActivities are used to generate a FlowBySector
    :param methodname: string, FlowBySector method
    :return: list, pairs of FlowByActivity source names and years
    """
    sources = []
    # load the fbs method yaml
    fbs_yaml = load_yaml_dict(methodname, flowbytype='FBS')

    # create list of data and allocation data sets
    fbs = fbs_yaml['source_names']
    for fbs_k, fbs_v in fbs.items():
        try:
            sources.append([fbs_k, fbs_v['year']])
        except KeyError:
            log.info(
                'Could not append %s to datasource '
                'list because missing year', fbs_k)
            continue
        activities = fbs_v['activity_sets']
        for aset, attr in activities.items():
            if attr['allocation_source'] != 'None':
                sources.append([
                    attr['allocation_source'], attr['allocation_source_year']
                ])
            if 'helper_source' in attr:
                sources.append(
                    [attr['helper_source'], attr['helper_source_year']])
            if 'literature_sources' in attr:
                for source, date in attr['literature_sources'].items():
                    sources.append([source, date])
    # load any additional fbas that are called in a fbs method within fxns
    try:
        fbas = load_fbs_methods_additional_fbas_config()[methodname]
        for s, acts_info in fbas.items():
            for acts, fxn_info in acts_info.items():
                for fxn, fba_info in fxn_info.items():
                    for fba, y in fba_info.items():
                        fxn_config = \
                            load_functions_loading_fbas_config()[fxn][fba]
                        sources.append([fxn_config['source'], y])
    except KeyError:
        # if there are no additional fbas, log and continue
        log.info(
            'There are no additional Flow-By-Activities '
            'used in generating %s', methodname)

    return sources
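
A minimal usage sketch (hedged: the FBS method name below is purely
illustrative, and the function is assumed to be importable from the module
that defines it):

sources = generate_list_of_sources_in_fbs_method('Water_national_2015_m1')
for source_name, source_year in sources:
    print(source_name, source_year)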
Example #2
def load_source_dict(sourcename):
    """
    Load the yaml method file for a flowbyactivity dataset
    or for a value from the literature
    :param sourcename: string, FBA source name or value from the lit name
    :return: dictionary, the method file
    """

    try:
        # check if citation info is for values in the literature
        config_load = load_values_from_literature_citations_config()
        config = config_load[sourcename]
    except KeyError:
        # else check if file exists, then try loading
        # citation information from source yaml
        sourcename = get_flowsa_base_name(sourceconfigpath, sourcename, "yaml")
        config = load_yaml_dict(sourcename, flowbytype='FBA')

    return config
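
A brief usage sketch; the source name is illustrative and assumes a matching
FBA yaml or values-from-literature citation entry exists locally:

config = load_source_dict('USGS_NWIS_WU')  # illustrative source name
print(config.get('years'))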
Example #3
def seeAvailableFlowByModels(flowbytype):
    """
    Return available Flow-By-Activity or Flow-By-Sector models
    :param flowbytype: 'FBA' or 'FBS'
    :return: prints available models to the console and returns them as a
        dict (FBA: years) or list (FBS method names)
    """

    # select the directory path dependent on FBA or FBS
    if flowbytype == 'FBA':
        fb_directory = sourceconfigpath
    else:
        fb_directory = flowbysectormethodpath

    # dictionary of FBA source names and their available years
    fb_dict = {}
    # list of FBS method names
    fb_df = []
    # run through all files and append
    for file in os.listdir(fb_directory):
        if file.endswith(".yaml"):
            # drop file extension
            f = os.path.splitext(file)[0]
            if flowbytype == 'FBA':
                s = load_yaml_dict(f, 'FBA')
                try:
                    years = s['years']
                except KeyError:
                    years = 'YAML missing information on years'
                fb_dict.update({f: years})
            # else if FBS
            else:
                fb_df.append(f)

    # determine format of data to print
    if flowbytype == 'FBA':
        data_print = fb_dict
    else:
        data_print = fb_df

    # print data in human-readable format
    pprint.pprint(data_print, width=79, compact=True)
    return data_print
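
A short usage sketch covering both branches; the output depends entirely on
the yaml files found in the local configuration directories:

seeAvailableFlowByModels('FBA')                 # dict of {FBA source: years}
fbs_methods = seeAvailableFlowByModels('FBS')   # list of FBS method names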
Example #4
def main(**kwargs):
    """
    Creates a flowbysector dataset
    :param kwargs: dictionary of arguments; 'method' is the name of the
        flowbysector method yaml, and 'download_FBAs_if_missing' optionally
        indicates whether to download any missing FlowByActivity parquets
    :return: parquet, FBS saved to local folder
    """
    if len(kwargs) == 0:
        kwargs = parse_args()

    # assign arguments
    method_name = kwargs['method']
    download_FBA_if_missing = kwargs.get('download_FBAs_if_missing')
    vLog.info("Initiating flowbysector creation for %s", method_name)
    # load the FBS method yaml
    method = load_yaml_dict(method_name, flowbytype='FBS')
    # create dictionary of data and allocation datasets
    fb = method['source_names']
    # Create empty list for storing fbs files
    fbs_list = []
    for k, v in fb.items():
        # pull fba data for allocation
        flows = load_source_dataframe(k, v, download_FBA_if_missing)

        if v['data_format'] == 'FBA':
            # ensure correct datatypes and that all fields exist
            flows = clean_df(flows,
                             flow_by_activity_fields,
                             fba_fill_na_dict,
                             drop_description=False)

            # clean up fba before mapping, if specified in yaml
            if "clean_fba_before_mapping_df_fxn" in v:
                vLog.info("Cleaning up %s FlowByActivity", k)
                flows = dynamically_import_fxn(
                    k, v["clean_fba_before_mapping_df_fxn"])(flows)

            # map flows to federal flow list or material flow list
            flows_mapped, mapping_files = \
                map_fbs_flows(flows, k, v, keep_fba_columns=True)

            # clean up fba, if specified in yaml
            if "clean_fba_df_fxn" in v:
                vLog.info("Cleaning up %s FlowByActivity", k)
                flows_mapped = dynamically_import_fxn(
                    k, v["clean_fba_df_fxn"])(flows_mapped)

            # if activity_sets are specified in a file, call them here
            if 'activity_set_file' in v:
                aset_names = pd.read_csv(flowbysectoractivitysetspath +
                                         v['activity_set_file'],
                                         dtype=str)
            else:
                aset_names = None

            # master list of activity names read in from data source
            ml_act = []
            # create dictionary of allocation datasets for different activities
            activities = v['activity_sets']
            # subset activity data and allocate to sector
            for aset, attr in activities.items():
                # subset by named activities
                if 'activity_set_file' in v:
                    names = \
                        aset_names[aset_names['activity_set'] == aset]['name']
                else:
                    names = attr['names']

                # to avoid double counting data from the same source when
                # values appear in both the APB and ACB columns, remove any
                # activity that has already been read in and allocated from
                # the mapped flows, regardless of which activity set the
                # data was read in with
                flows_mapped = flows_mapped[~(
                    (flows_mapped[fba_activity_fields[0]].isin(ml_act)) |
                    (flows_mapped[fba_activity_fields[1]].isin(ml_act))
                )].reset_index(drop=True)
                ml_act.extend(names)

                vLog.info("Preparing to handle %s in %s", aset, k)
                # subset fba data by activity
                flows_subset = flows_mapped[
                    (flows_mapped[fba_activity_fields[0]].isin(names)) |
                    (flows_mapped[fba_activity_fields[1]].isin(names)
                     )].reset_index(drop=True)

                # subset by flowname if exists
                if 'source_flows' in attr:
                    flows_subset = flows_subset[flows_subset['FlowName'].isin(
                        attr['source_flows'])]
                if len(flows_subset) == 0:
                    log.warning(f"no data found for flows in {aset}")
                    continue
                if len(flows_subset[flows_subset['FlowAmount'] != 0]) == 0:
                    log.warning(f"all flow data for {aset} is 0")
                    continue
                # if activities are sector-like, check sectors are valid
                if check_activities_sector_like(k):
                    flows_subset2 = replace_naics_w_naics_from_another_year(
                        flows_subset, method['target_sector_source'])

                    # check impact on df FlowAmounts
                    vLog.info(
                        'Calculate FlowAmount difference caused by '
                        'replacing NAICS Codes with %s, saving '
                        'difference in Validation log',
                        method['target_sector_source'],
                    )
                    calculate_flowamount_diff_between_dfs(
                        flows_subset, flows_subset2)
                else:
                    flows_subset2 = flows_subset.copy()

                # extract relevant geoscale data or aggregate existing data
                flows_subset_geo = subset_df_by_geoscale(
                    flows_subset2, v['geoscale_to_use'],
                    attr['allocation_from_scale'])
                # if loading data subnational geoscale, check for data loss
                if attr['allocation_from_scale'] != 'national':
                    compare_geographic_totals(flows_subset_geo, flows_mapped,
                                              k, attr, aset, names)

                # Add sectors to df activity, depending on level
                # of specified sector aggregation
                log.info("Adding sectors to %s", k)
                flows_subset_wsec = add_sectors_to_flowbyactivity(
                    flows_subset_geo,
                    sectorsourcename=method['target_sector_source'],
                    allocationmethod=attr['allocation_method'])
                # clean up fba with sectors, if specified in yaml
                if "clean_fba_w_sec_df_fxn" in v:
                    vLog.info("Cleaning up %s FlowByActivity with sectors", k)
                    flows_subset_wsec = dynamically_import_fxn(
                        k, v["clean_fba_w_sec_df_fxn"])(flows_subset_wsec,
                                                        attr=attr,
                                                        method=method)

                # rename SourceName to MetaSources and drop columns
                flows_mapped_wsec = flows_subset_wsec.\
                    rename(columns={'SourceName': 'MetaSources'}).\
                    drop(columns=['FlowName', 'Compartment'])

                # if allocation method is "direct", then no need
                # to create alloc ratios, else need to use allocation
                # dataframe to create sector allocation ratios
                if attr['allocation_method'] == 'direct':
                    fbs = direct_allocation_method(flows_mapped_wsec, k, names,
                                                   method)
                # if allocation method for an activity set requires a specific
                # function due to the complicated nature
                # of the allocation, call on function here
                elif attr['allocation_method'] == 'allocation_function':
                    fbs = function_allocation_method(flows_mapped_wsec, k,
                                                     names, attr, fbs_list)
                else:
                    fbs = dataset_allocation_method(flows_mapped_wsec, attr,
                                                    names, method, k, v, aset,
                                                    aset_names,
                                                    download_FBA_if_missing)

                # drop rows where flowamount = 0
                # (although this includes dropping suppressed data)
                fbs = fbs[fbs['FlowAmount'] != 0].reset_index(drop=True)

                # define grouping columns dependent on sectors
                # being activity-like or not
                if check_activities_sector_like(k) is False:
                    groupingcols = fbs_grouping_fields_w_activities
                    groupingdict = flow_by_sector_fields_w_activity
                else:
                    groupingcols = fbs_default_grouping_fields
                    groupingdict = flow_by_sector_fields

                # clean df
                fbs = clean_df(fbs, groupingdict, fbs_fill_na_dict)

                # aggregate df geographically, if necessary
                log.info("Aggregating flowbysector to %s level",
                         method['target_geoscale'])
                # determine from scale
                if fips_number_key[v['geoscale_to_use']] <\
                        fips_number_key[attr['allocation_from_scale']]:
                    from_scale = v['geoscale_to_use']
                else:
                    from_scale = attr['allocation_from_scale']

                fbs_geo_agg = agg_by_geoscale(fbs, from_scale,
                                              method['target_geoscale'],
                                              groupingcols)

                # aggregate data to every sector level
                log.info("Aggregating flowbysector to all sector levels")
                fbs_sec_agg = sector_aggregation(fbs_geo_agg, groupingcols)
                # add missing naics5/6 when only one naics5/6
                # associated with a naics4
                fbs_agg = sector_disaggregation(fbs_sec_agg)

                # check if any sector information is lost before reaching
                # the target sector length, if so,
                # allocate values equally to disaggregated sectors
                vLog.info(
                    'Searching for and allocating FlowAmounts for any parent '
                    'NAICS that were dropped in the subset to '
                    '%s child NAICS', method['target_sector_level'])
                fbs_agg_2 = equally_allocate_parent_to_child_naics(
                    fbs_agg, method['target_sector_level'])

                # compare flowbysector with flowbyactivity
                compare_activity_to_sector_flowamounts(flows_mapped_wsec,
                                                       fbs_agg_2, aset, k,
                                                       method)

                # return sector level specified in method yaml
                # load the crosswalk linking sector lengths
                sector_list = get_sector_list(method['target_sector_level'])

                # subset df, necessary because not all of the sectors are
                # NAICS and can get duplicate rows
                fbs_1 = fbs_agg_2.loc[
                    (fbs_agg_2[fbs_activity_fields[0]].isin(sector_list)) &
                    (fbs_agg_2[fbs_activity_fields[1]].isin(sector_list))].\
                    reset_index(drop=True)
                fbs_2 = fbs_agg_2.loc[
                    (fbs_agg_2[fbs_activity_fields[0]].isin(sector_list)) &
                    (fbs_agg_2[fbs_activity_fields[1]].isnull())].\
                    reset_index(drop=True)
                fbs_3 = fbs_agg_2.loc[
                    (fbs_agg_2[fbs_activity_fields[0]].isnull()) &
                    (fbs_agg_2[fbs_activity_fields[1]].isin(sector_list))].\
                    reset_index(drop=True)
                fbs_sector_subset = pd.concat([fbs_1, fbs_2, fbs_3])

                # drop activity columns
                fbs_sector_subset = fbs_sector_subset.drop(
                    ['ActivityProducedBy', 'ActivityConsumedBy'],
                    axis=1,
                    errors='ignore')

                # save comparison of FBA total to FBS total for an activity set
                compare_fba_geo_subset_and_fbs_output_totals(
                    flows_subset_geo, fbs_sector_subset, aset, k, v, attr,
                    method)

                log.info("Completed flowbysector for %s", aset)
                fbs_list.append(fbs_sector_subset)
        else:
            if 'clean_fbs_df_fxn' in v:
                flows = dynamically_import_fxn(v["clean_fbs_df_fxn_source"],
                                               v["clean_fbs_df_fxn"])(flows)
            flows = update_geoscale(flows, method['target_geoscale'])
            # if the loaded flow df is already in FBS format,
            # append directly to list of FBS
            log.info("Append %s to FBS list", k)
            # ensure correct field datatypes and add any missing fields
            flows = clean_df(flows, flow_by_sector_fields, fbs_fill_na_dict)
            fbs_list.append(flows)
    # create single df of all activities
    log.info("Concat data for all activities")
    fbss = pd.concat(fbs_list, ignore_index=True, sort=False)
    log.info("Clean final dataframe")
    # add missing fields, ensure correct data type,
    # add missing columns, reorder columns
    fbss = clean_df(fbss, flow_by_sector_fields, fbs_fill_na_dict)
    # prior to aggregating, replace MetaSources string with all sources
    # that share context/flowable/sector values
    fbss = harmonize_FBS_columns(fbss)
    # aggregate df as activities might have data for
    # the same specified sector length
    fbss = aggregator(fbss, fbs_default_grouping_fields)
    # sort df
    log.info("Sort and store dataframe")
    # ensure correct data types/order of columns
    fbss = clean_df(fbss, flow_by_sector_fields, fbs_fill_na_dict)
    fbss = fbss.sort_values(
        ['SectorProducedBy', 'SectorConsumedBy', 'Flowable',
         'Context']).reset_index(drop=True)
    # check for negative flow amounts
    check_for_negative_flowamounts(fbss)
    # tmp reset data quality scores
    fbss = reset_fbs_dq_scores(fbss)
    # save parquet file
    meta = set_fb_meta(method_name, "FlowBySector")
    write_df_to_file(fbss, paths, meta)
    write_metadata(method_name, method, meta, "FlowBySector")
    # rename the log file saved to local directory
    rename_log_file(method_name, meta)
    log.info(
        'See the Validation log for detailed assessment of '
        'model results in %s', logoutputpath)
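
A hedged invocation sketch; the keyword names follow the kwargs read at the
top of the function, and the FBS method name is illustrative:

main(method='Water_national_2015_m1',    # illustrative FBS method yaml name
     download_FBAs_if_missing=True)      # fetch any missing FBA parquets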
Example #5
def main(**kwargs):
    """
    Generate FBA parquet(s)
    :param kwargs: 'source' and 'year'
    :return: parquet saved to local directory
    """
    # assign arguments
    if len(kwargs) == 0:
        kwargs = parse_args()

    source = kwargs['source']
    year = kwargs['year']

    # load yaml parameters (common.py fxn); if the config file is not
    # found, drop any extension from the FBA filename and retry
    try:
        config = load_yaml_dict(source, flowbytype='FBA')
    except UnboundLocalError:
        log.info(f'Could not find Flow-By-Activity config file for {source}')
        source = get_flowsa_base_name(sourceconfigpath, source, "yaml")
        log.info(f'Generating FBA for {source}')
        config = load_yaml_dict(source, flowbytype='FBA')

    log.info("Creating dataframe list")
    # year input can either be a range of years (e.g. 2007-2009)
    # or a single year
    if '-' in str(year):
        years = str(year).split('-')
        year_iter = list(range(int(years[0]), int(years[1]) + 1))
    else:
        # else only a single year is defined; create a list of one
        year_iter = [year]

    # check that year(s) are listed in the method yaml, return warning if not
    years_list = list(set(list(map(int, year_iter))
                          ).difference(config['years']))
    if len(years_list) != 0:
        log.warning(f'Years not listed in FBA method yaml: {years_list}, '
                    f'data might not exist')

    for p_year in year_iter:
        year = str(p_year)
        # replace parts of urls with specific instructions from source.py
        urls = assemble_urls_for_query(source=source, year=year, config=config)
        # create a list with data from all source urls
        df_list = call_urls(url_list=urls,
                            source=source, year=year, config=config)
        # concat the dataframes and parse data with specific
        # instructions from source.py
        log.info("Concat dataframe list and parse data")
        dfs = parse_data(df_list=df_list,
                         source=source, year=year, config=config)
        if isinstance(dfs, list):
            for frame in dfs:
                if not len(frame.index) == 0:
                    try:
                        source_names = frame['SourceName']
                        source_name = source_names.iloc[0]
                    except KeyError:
                        source_name = source
                    process_data_frame(df=frame,
                                       source=source_name, year=year,
                                       config=config)
        else:
            process_data_frame(df=dfs, source=source, year=year, config=config)
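
A hedged usage sketch; the source name is illustrative, and year may be a
single year or a hyphenated range, as handled above:

main(source='USGS_NWIS_WU', year='2010-2015')  # range expands to 2010..2015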
Example #6
def add_sectors_to_flowbyactivity(flowbyactivity_df,
                                  sectorsourcename=SECTOR_SOURCE_NAME,
                                  **kwargs):
    """
    Add Sectors from the Activity fields and mapped them to Sector
    from the crosswalk. No allocation is performed.
    :param flowbyactivity_df: A standard flowbyactivity data frame
    :param sectorsourcename: A sector source name, using package default
    :param kwargs: option to include the parameter 'allocationmethod',
    which modifies function behavoir if = 'direct'
    :return: a df with activity fields mapped to 'sectors'
    """
    # First check if source activities are NAICS like -
    # if so make it into a mapping file
    s = pd.unique(flowbyactivity_df['SourceName'])[0]
    # load catalog info for source, first check for sourcename used
    # in source catalog
    ts = return_true_source_catalog_name(s)
    src_info = load_yaml_dict('source_catalog')[ts]
    # read the pre-determined level of sector aggregation of
    # each crosswalk from the source catalog
    levelofSectoragg = src_info['sector_aggregation_level']
    # overwrite levelofSectoragg if the FBS activity set uses 'direct'
    # allocation, or if a sector level is specified in the fxn call
    if kwargs != {}:
        if 'allocationmethod' in kwargs:
            if kwargs['allocationmethod'] == 'direct':
                levelofSectoragg = 'disaggregated'
        if 'overwrite_sectorlevel' in kwargs:
            levelofSectoragg = kwargs['overwrite_sectorlevel']
    # if data are provided in NAICS format, use the mastercrosswalk
    if src_info['sector-like_activities']:
        cw = load_crosswalk('sector_timeseries')
        sectors = cw.loc[:, [SECTOR_SOURCE_NAME]]
        # Create mapping df that's just the sectors at first
        mapping = sectors.drop_duplicates()
        # Add the sectors as activities so the mapping is an identity
        mapping = mapping.assign(Activity=sectors[SECTOR_SOURCE_NAME])
        mapping = mapping.rename(columns={SECTOR_SOURCE_NAME: "Sector"})
        # add the columns needed to run expand_naics_list;
        # they are missing when sector-like_activities = True
        mapping['ActivitySourceName'] = s
        # tmp assignment
        mapping['SectorType'] = None
        # Include all digits of naics in mapping, if levelofSectoragg
        # is specified as "aggregated"
        if levelofSectoragg == 'aggregated':
            mapping = expand_naics_list(mapping, sectorsourcename)
    else:
        # if source data activities are text strings, or sector-like
        # activities should be modified, call on the manually
        # created source crosswalks
        mapping = get_activitytosector_mapping(s)
        # filter by SectorSourceName of interest
        mapping = mapping[mapping['SectorSourceName'] == sectorsourcename]
        # drop SectorSourceName
        mapping = mapping.drop(columns=['SectorSourceName'])
        # Include all digits of naics in mapping, if levelofSectoragg
        # is specified as "aggregated"
        if levelofSectoragg == 'aggregated':
            mapping = expand_naics_list(mapping, sectorsourcename)
    # Merge the mapping into the flowbyactivity df for each activity field
    flowbyactivity_wsector_df = flowbyactivity_df.copy(deep=True)
    for k, v in activity_fields.items():
        sector_direction = k
        flowbyactivity_field = v[0]["flowbyactivity"]
        flowbysector_field = v[1]["flowbysector"]
        sector_type_field = sector_direction + 'SectorType'
        mappings_df_tmp = mapping.rename(
            columns={
                'Activity': flowbyactivity_field,
                'Sector': flowbysector_field,
                'SectorType': sector_type_field
            })
        # column doesn't exist for sector-like activities,
        # so ignore if error occurs
        mappings_df_tmp = mappings_df_tmp.drop(columns=['ActivitySourceName'],
                                               errors='ignore')
        # Merge them in. Critical this is a left merge to
        # preserve all unmapped rows
        flowbyactivity_wsector_df = pd.merge(flowbyactivity_wsector_df,
                                             mappings_df_tmp,
                                             how='left',
                                             on=flowbyactivity_field)
    for c in [
            'SectorProducedBy', 'ProducedBySectorType', 'SectorConsumedBy',
            'ConsumedBySectorType'
    ]:
        flowbyactivity_wsector_df[c] = \
            flowbyactivity_wsector_df[c].replace({np.nan: None})
    # add sector source name
    flowbyactivity_wsector_df = \
        flowbyactivity_wsector_df.assign(SectorSourceName=sectorsourcename)

    # if activities are sector-like check that the sectors are in the crosswalk
    if src_info['sector-like_activities']:
        flowbyactivity_wsector_df =\
            replace_naics_w_naics_from_another_year(flowbyactivity_wsector_df,
                                                    sectorsourcename)

    return flowbyactivity_wsector_df
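
A minimal usage sketch, assuming fba_df is a FlowByActivity dataframe that
has already been cleaned with clean_df:

fba_wsec = add_sectors_to_flowbyactivity(fba_df, allocationmethod='direct')
print(fba_wsec[['SectorProducedBy', 'SectorConsumedBy']].head())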