Example #1
def getFlowByActivity(flowclass,
                      years,
                      datasource,
                      geographic_level='all',
                      file_location='local'):
    """
    Retrieves stored data in the FlowByActivity format
    :param flowclass: list, a list of 'Class' values of the flow. Required. E.g. ['Water'] or
     ['Land', 'Other']
    :param years: list, a list of years, e.g. [2015] or [2010, 2011, 2012]
    :param datasource: str, the code of the datasource.
    :param geographic_level: 'all', 'national', 'state', 'county'. Default is 'all'
    :param file_location: 'local' or 'remote'. Default is 'local'
    :return: a pandas DataFrame in FlowByActivity format
    """
    fbas = pd.DataFrame()
    for y in years:
        # definitions
        fba_file = datasource + "_" + str(y) + ".parquet"
        local_file_path = fbaoutputpath + fba_file
        remote_file_path = 'https://edap-ord-data-commons.s3.amazonaws.com/flowsa/FlowByActivity/' + fba_file
        # load data
        if file_location == 'local':
            fba = load_file(fba_file, local_file_path, remote_file_path)
        else:
            log.info('Loading ' + datasource + ' from remote server')
            fba = pd.read_parquet(remote_file_path)
        fba = fba[fba['Class'].isin(flowclass)]
        # if geographic level specified, only load rows in geo level
        if geographic_level != 'all':
            fba = filter_by_geoscale(fba, geographic_level)
        # concat dfs
        fbas = pd.concat([fbas, fba], sort=False)
    return fbas
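
A minimal usage sketch for this list-based signature; the datasource code below is illustrative, and the call assumes this variant is exposed at the package top level:

import flowsa

# pull 2015 water withdrawal data at the state level ('USGS_NWIS_WU' is
# an example datasource code; substitute one available in your install)
water = flowsa.getFlowByActivity(flowclass=['Water'],
                                 years=[2015],
                                 datasource='USGS_NWIS_WU',
                                 geographic_level='state')
print(water.head())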
Example #2
def convert_blackhurst_data_to_gal_per_employee(df_wsec, attr, method):
    """
    Converts Blackhurst water data to gallons per employee
    using 2002 BLS QCEW employment data
    :param df_wsec: df, water use data with sector columns
    :param attr: dictionary, attributes of the activity set
    :param method: dictionary, FBS method yaml parameters
    :return: df, water use data in gal/employee
    """

    import pandas as pd
    import flowsa
    from flowsa.common import flow_by_activity_fields  # assumed import location for the FBA schema fields
    from flowsa.mapping import add_sectors_to_flowbyactivity
    from flowsa.flowbyfunctions import clean_df, fba_fill_na_dict, agg_by_geoscale, fba_default_grouping_fields, \
        sector_ratios, proportional_allocation_by_location_and_sector, filter_by_geoscale
    from flowsa.BLS_QCEW import clean_bls_qcew_fba

    bls = flowsa.getFlowByActivity(flowclass=['Employment'],
                                   datasource='BLS_QCEW',
                                   years=[2002])
    # clean df
    bls = clean_df(bls, flow_by_activity_fields, fba_fill_na_dict)
    bls = clean_bls_qcew_fba(bls, attr)

    # bls_agg = agg_by_geoscale(bls, 'state', 'national', fba_default_grouping_fields)
    bls_agg = filter_by_geoscale(bls, 'national')

    # assign naics to allocation dataset
    bls_wsec = add_sectors_to_flowbyactivity(
        bls_agg, sectorsourcename=method['target_sector_source'])
    # drop rows where sector = None (does not occur with mining)
    bls_wsec = bls_wsec[~bls_wsec['SectorProducedBy'].isnull()]
    bls_wsec = bls_wsec.rename(columns={'SectorProducedBy': 'Sector'})

    # create a list of sectors in the flow allocation df, then drop any rows
    # in the flow df that aren't in that list
    sector_list = df_wsec['Sector'].unique().tolist()
    # subset fba allocation table to the values in the activity list, based on overlapping sectors
    bls_wsec = bls_wsec.loc[bls_wsec['Sector'].isin(sector_list)]
    # calculate proportional ratios
    bls_wsec = proportional_allocation_by_location_and_sector(
        bls_wsec, 'Sector')  #, 'agg')
    bls_wsec = bls_wsec.rename(columns={
        'FlowAmountRatio': 'EmployeeRatio',
        'FlowAmount': 'Employees'
    })

    # merge the two dfs
    df = pd.merge(df_wsec,
                  bls_wsec[['Sector', 'EmployeeRatio', 'Employees']],
                  how='left',
                  left_on='Sector',
                  right_on='Sector')
    df['EmployeeRatio'] = df['EmployeeRatio'].fillna(0)
    # calculate gal/employee in 2002
    df.loc[:, 'FlowAmount'] = (df['FlowAmount'] *
                               df['EmployeeRatio']) / df['Employees']
    df.loc[:, 'Unit'] = 'gal/employee'

    # drop cols
    df = df.drop(columns=['Employees', 'EmployeeRatio'])

    return df
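
The heart of this conversion is the ratio arithmetic near the end: each sector's water flow is scaled by its employment share, then divided by its employee count to yield gallons per employee. A self-contained toy illustration with made-up numbers:

import pandas as pd

# stand-ins for the merged water/employment frame; all values are illustrative
df = pd.DataFrame({'Sector': ['21', '22'],
                   'FlowAmount': [1000.0, 500.0],   # gallons of water
                   'EmployeeRatio': [0.6, 0.4],     # share of employment
                   'Employees': [300.0, 200.0]})
df['FlowAmount'] = (df['FlowAmount'] * df['EmployeeRatio']) / df['Employees']
df['Unit'] = 'gal/employee'
print(df[['Sector', 'FlowAmount', 'Unit']])  # 2.0 and 1.0 gal/employee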
Example #3
def getFlowByActivity(datasource,
                      year,
                      flowclass=None,
                      geographic_level=None,
                      download_if_missing=DEFAULT_DOWNLOAD_IF_MISSING):
    """
    Retrieves stored data in the FlowByActivity format
    :param datasource: str, the code of the datasource.
    :param year: int, a year, e.g. 2012
    :param flowclass: str, a 'Class' of the flow. Optional. E.g. 'Water'
    :param geographic_level: str, a geographic level of the data.
                             Optional. E.g. 'national', 'state', 'county'.
    :param download_if_missing: bool, if True will attempt to load from remote server
        prior to generating if file not found locally
    :return: a pandas DataFrame in FlowByActivity format
    """
    from esupy.processed_data_mgmt import download_from_remote
    # Set fba metadata
    name = flowsa.flowbyactivity.set_fba_name(datasource, year)
    fba_meta = set_fb_meta(name, "FlowByActivity")

    # Try to load a local version of fba; generate and load if missing
    fba = load_preprocessed_output(fba_meta, paths)
    # Remote download
    if fba is None and download_if_missing:
        log.info('%s %s not found in %s, downloading from remote source',
                 datasource, str(year), fbaoutputpath)
        download_from_remote(fba_meta, paths)
        fba = load_preprocessed_output(fba_meta, paths)

    if fba is None:
        log.info('%s %s not found in %s, running functions to generate FBA',
                 datasource, str(year), fbaoutputpath)
        # Generate the fba
        flowsa.flowbyactivity.main(year=year, source=datasource)
        # Now load the fba
        fba = load_preprocessed_output(fba_meta, paths)
        if fba is None:
            log.error('getFlowByActivity failed, FBA not found')
        else:
            log.info('Loaded %s %s from %s', datasource, str(year),
                     fbaoutputpath)
    else:
        log.info('Loaded %s %s from %s', datasource, str(year), fbaoutputpath)

    # Address optional parameters
    if flowclass is not None:
        fba = fba[fba['Class'] == flowclass]
    # if geographic level specified, only load rows in geo level
    if geographic_level is not None:
        fba = filter_by_geoscale(fba, geographic_level)
    return fba
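
A sketch of calling this newer single-year signature, assuming a 2012 'USGS_NWIS_WU' FlowByActivity can be found locally, downloaded from the remote server, or generated:

import flowsa

fba = flowsa.getFlowByActivity(datasource='USGS_NWIS_WU',
                               year=2012,
                               flowclass='Water',
                               geographic_level='state',
                               download_if_missing=True)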
Example #4
def getFlowByActivity(flowclass, years, datasource, geographic_level='all'):
    """
    Retrieves stored data in the FlowByActivity format
    :param flowclass: list, a list of 'Class' values of the flow. Required. E.g. ['Water'] or
     ['Land', 'Other']
    :param years: list, a list of years, e.g. [2015] or [2010, 2011, 2012]
    :param datasource: str, the code of the datasource.
    :param geographic_level: default 'all', which loads all geographic scales in the FlowByActivity;
    can specify 'national', 'state', or 'county'
    :return: a pandas DataFrame in FlowByActivity format
    """

    fbas = pd.DataFrame()
    for y in years:
        # first try reading parquet from your local repo
        try:
            log.info('Loading ' + datasource + ' ' + str(y) +
                     ' parquet from local repository')
            fba = pd.read_parquet(fbaoutputpath + datasource + "_" + str(y) +
                                  ".parquet")
            fba = fba[fba['Class'].isin(flowclass)]
            fbas = pd.concat([fbas, fba], sort=False)
        except (OSError, FileNotFoundError):
            # if parquet does not exist in local repo, read file from Data Commons
            try:
                log.info(
                    datasource +
                    ' parquet not found in local repo, loading from Data Commons'
                )
                fba = pd.read_parquet(
                    'https://edap-ord-data-commons.s3.amazonaws.com/flowsa/FlowByActivity/'
                    + datasource + "_" + str(y) + '.parquet')
                fba = fba[fba['Class'].isin(flowclass)]
                fbas = pd.concat([fbas, fba], sort=False)
            except FileNotFoundError:
                log.error("No parquet file found for datasource " +
                          datasource + "and year " + str(y) +
                          " in flowsa or Data Commons")

    # if geographic level specified, only load rows in geo level
    if geographic_level != 'all':
        fbas = filter_by_geoscale(fbas, geographic_level)

    return fbas
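
The local-then-remote fallback above can be factored into a small generic helper; a sketch of the pattern only, not a flowsa API:

import pandas as pd

def read_parquet_with_fallback(local_path, remote_url):
    # try the local parquet first, then fall back to the remote copy
    try:
        return pd.read_parquet(local_path)
    except (OSError, FileNotFoundError):
        return pd.read_parquet(remote_url)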
Example #5
def main(method_name):
    """
    Creates a flowbysector dataset
    :param method_name: Name of method corresponding to flowbysector method yaml name
    :return: flowbysector
    """

    log.info("Initiating flowbysector creation for " + method_name)
    # call on method
    method = load_method(method_name)
    # create dictionary of data and allocation datasets
    fb = method['source_names']
    # Create empty list for storing fbs files
    fbs_list = []
    for k, v in fb.items():
        # pull fba data for allocation
        flows = load_source_dataframe(k, v)

        if v['data_format'] == 'FBA':
            # clean up fba, if specified in yaml
            if v["clean_fba_df_fxn"] != 'None':
                log.info("Cleaning up " + k + " FlowByActivity")
                flows = getattr(sys.modules[__name__],
                                v["clean_fba_df_fxn"])(flows)

            flows = clean_df(flows, flow_by_activity_fields, fba_fill_na_dict)

            # if activity_sets are specified in a file, call them here
            if 'activity_set_file' in v:
                aset_names = pd.read_csv(flowbysectoractivitysetspath +
                                         v['activity_set_file'],
                                         dtype=str)

            # create dictionary of allocation datasets for different activities
            activities = v['activity_sets']
            # subset activity data and allocate to sector
            for aset, attr in activities.items():
                # subset by named activities
                if 'activity_set_file' in v:
                    names = aset_names[aset_names['activity_set'] ==
                                       aset]['name']
                else:
                    names = attr['names']

                log.info("Preparing to handle subset of flownames " +
                         ', '.join(map(str, names)) + " in " + k)
                # subset fba data by activity
                flows_subset = flows[
                    (flows[fba_activity_fields[0]].isin(names)) |
                    (flows[fba_activity_fields[1]].isin(names))].reset_index(
                        drop=True)

                # extract relevant geoscale data or aggregate existing data
                log.info("Subsetting/aggregating dataframe to " +
                         attr['allocation_from_scale'] + " geoscale")
                flows_subset_geo = subset_df_by_geoscale(
                    flows_subset, v['geoscale_to_use'],
                    attr['allocation_from_scale'])

                # Add sectors to df activity, depending on level of specified sector aggregation
                log.info("Adding sectors to " + k)
                flow_subset_wsec = add_sectors_to_flowbyactivity(
                    flows_subset_geo,
                    sectorsourcename=method['target_sector_source'])
                # clean up fba with sectors, if specified in yaml
                if v["clean_fba_w_sec_df_fxn"] != 'None':
                    log.info("Cleaning up " + k +
                             " FlowByActivity with sectors")
                    flow_subset_wsec = getattr(sys.modules[__name__],
                                               v["clean_fba_w_sec_df_fxn"])(
                                                   flow_subset_wsec, attr)

                # map df to elementary flows
                log.info("Mapping flows in " + k +
                         ' to federal elementary flow list')
                if 'fedefl_mapping' in v:
                    mapping_files = v['fedefl_mapping']
                else:
                    mapping_files = k

                flow_subset_mapped = map_elementary_flows(
                    flow_subset_wsec, mapping_files)

                # if allocation method is "direct", then no need to create alloc ratios, else need to use allocation
                # dataframe to create sector allocation ratios
                if attr['allocation_method'] == 'direct':
                    log.info('Directly assigning ' +
                             ', '.join(map(str, names)) + ' to sectors')
                    fbs = flow_subset_mapped.copy()

                else:
                    # determine appropriate allocation dataset
                    log.info("Loading allocation flowbyactivity " +
                             attr['allocation_source'] + " for year " +
                             str(attr['allocation_source_year']))
                    fba_allocation = flowsa.getFlowByActivity(
                        flowclass=[attr['allocation_source_class']],
                        datasource=attr['allocation_source'],
                        years=[attr['allocation_source_year']
                               ]).reset_index(drop=True)

                    fba_allocation = clean_df(fba_allocation,
                                              flow_by_activity_fields,
                                              fba_fill_na_dict)

                    # subset based on yaml settings
                    if attr['allocation_flow'] != 'None':
                        fba_allocation = fba_allocation.loc[
                            fba_allocation['FlowName'].isin(
                                attr['allocation_flow'])]
                    if attr['allocation_compartment'] != 'None':
                        fba_allocation = fba_allocation.loc[
                            fba_allocation['Compartment'].isin(
                                attr['allocation_compartment'])]

                    # cleanup the fba allocation df, if necessary
                    if 'clean_allocation_fba' in attr:
                        log.info("Cleaning " + attr['allocation_source'])
                        fba_allocation = getattr(sys.modules[__name__],
                                                 attr["clean_allocation_fba"])(
                                                     fba_allocation, attr)
                    # reset index
                    fba_allocation = fba_allocation.reset_index(drop=True)

                    # check if allocation data exists at specified geoscale to use
                    log.info("Checking if allocation data exists at the " +
                             attr['allocation_from_scale'] + " level")
                    check_if_data_exists_at_geoscale(
                        fba_allocation, attr['allocation_from_scale'])

                    # aggregate geographically to the scale of the flowbyactivity source, if necessary
                    from_scale = attr['allocation_from_scale']
                    to_scale = v['geoscale_to_use']
                    # if allocation df is less aggregated than FBA df, aggregate allocation df to target scale
                    if fips_number_key[from_scale] > fips_number_key[to_scale]:
                        fba_allocation = agg_by_geoscale(
                            fba_allocation, from_scale, to_scale,
                            fba_default_grouping_fields)
                    # else, if fba is more aggregated than allocation table, use fba as both to and from scale
                    else:
                        fba_allocation = filter_by_geoscale(
                            fba_allocation, from_scale)

                    # assign sector to allocation dataset
                    # todo: add sectorsourcename col value
                    log.info("Adding sectors to " + attr['allocation_source'])
                    fba_allocation_wsec = add_sectors_to_flowbyactivity(
                        fba_allocation,
                        sectorsourcename=method['target_sector_source'])

                    # generalize activity field names to enable link to main fba source
                    log.info("Generalizing activity columns in subset of " +
                             attr['allocation_source'])
                    fba_allocation_wsec = generalize_activity_field_names(
                        fba_allocation_wsec)

                    # call on fxn to further clean up/disaggregate the fba allocation data, if exists
                    if 'clean_allocation_fba_w_sec' in attr:
                        log.info("Further disaggregating sectors in " +
                                 attr['allocation_source'])
                        fba_allocation_wsec = getattr(
                            sys.modules[__name__],
                            attr["clean_allocation_fba_w_sec"])(
                                fba_allocation_wsec, attr, method)

                    # subset fba datasets to only keep the sectors associated with activity subset
                    log.info("Subsetting " + attr['allocation_source'] +
                             " for sectors in " + k)
                    fba_allocation_subset = get_fba_allocation_subset(
                        fba_allocation_wsec, k, names)

                    # drop columns
                    fba_allocation_subset = fba_allocation_subset.drop(
                        columns=['Activity'])

                    # if there is an allocation helper dataset, modify allocation df
                    if attr['allocation_helper'] == 'yes':
                        log.info(
                            "Using the specified allocation helper for subset of "
                            + attr['allocation_source'])
                        fba_allocation_subset = allocation_helper(
                            fba_allocation_subset, method, attr, v)

                    # create flow allocation ratios for each activity
                    flow_alloc_list = []
                    for n in names:
                        log.info("Creating allocation ratios for " + n)
                        fba_allocation_subset_2 = get_fba_allocation_subset(
                            fba_allocation_subset, k, [n])
                        if len(fba_allocation_subset_2) == 0:
                            log.info("No data found to allocate " + n)
                        else:
                            flow_alloc = allocate_by_sector(
                                fba_allocation_subset_2,
                                attr['allocation_method'])
                            flow_alloc = flow_alloc.assign(FBA_Activity=n)
                            flow_alloc_list.append(flow_alloc)
                    flow_allocation = pd.concat(flow_alloc_list)

                    # check for issues with allocation ratios
                    check_allocation_ratios(flow_allocation, aset, k)

                    # create a list of sectors in the flow allocation df, then drop any rows
                    # in the flow df that aren't in that list
                    sector_list = flow_allocation['Sector'].unique().tolist()

                    # subset fba allocation table to the values in the activity list, based on overlapping sectors
                    flow_subset_mapped = flow_subset_mapped.loc[
                        (flow_subset_mapped[fbs_activity_fields[0]].
                         isin(sector_list)) |
                        (flow_subset_mapped[fbs_activity_fields[1]].
                         isin(sector_list))]

                    # check if fba and allocation dfs have the same LocationSystem
                    log.info(
                        "Checking if flowbyactivity and allocation dataframes use the same location systems"
                    )
                    check_if_location_systems_match(flow_subset_mapped,
                                                    flow_allocation)

                    # merge fba df w/flow allocation dataset
                    log.info("Merge " + k + " and subset of " +
                             attr['allocation_source'])
                    fbs = flow_subset_mapped.merge(
                        flow_allocation[[
                            'Location', 'Sector', 'FlowAmountRatio',
                            'FBA_Activity'
                        ]],
                        left_on=[
                            'Location', 'SectorProducedBy',
                            'ActivityProducedBy'
                        ],
                        right_on=['Location', 'Sector', 'FBA_Activity'],
                        how='left')

                    fbs = fbs.merge(
                        flow_allocation[[
                            'Location', 'Sector', 'FlowAmountRatio',
                            'FBA_Activity'
                        ]],
                        left_on=[
                            'Location', 'SectorConsumedBy',
                            'ActivityConsumedBy'
                        ],
                        right_on=['Location', 'Sector', 'FBA_Activity'],
                        how='left')

                    # merge the flowamount columns
                    fbs.loc[:, 'FlowAmountRatio'] = fbs[
                        'FlowAmountRatio_x'].fillna(fbs['FlowAmountRatio_y'])
                    # fill null rows with 0 because no allocation info
                    fbs['FlowAmountRatio'] = fbs['FlowAmountRatio'].fillna(0)

                    # check if fba and alloc dfs have data for the same geoscales - comment back in after addressing the 'todo'
                    # log.info("Checking if flowbyactivity and allocation dataframes have data at the same locations")
                    # check_if_data_exists_for_same_geoscales(fbs, k, attr['names'])

                    # drop rows where there is no allocation data
                    fbs = fbs.dropna(subset=['Sector_x', 'Sector_y'],
                                     how='all').reset_index()

                    # calculate flow amounts for each sector
                    log.info("Calculating new flow amounts using flow ratios")
                    fbs.loc[:, 'FlowAmount'] = fbs['FlowAmount'] * fbs[
                        'FlowAmountRatio']

                    # drop columns
                    log.info("Cleaning up new flow by sector")
                    fbs = fbs.drop(columns=[
                        'Sector_x', 'FlowAmountRatio_x', 'Sector_y',
                        'FlowAmountRatio_y', 'FlowAmountRatio',
                        'FBA_Activity_x', 'FBA_Activity_y'
                    ])

                # drop rows where flowamount = 0 (although this includes dropping suppressed data)
                fbs = fbs[fbs['FlowAmount'] != 0].reset_index(drop=True)

                # clean df
                fbs = clean_df(fbs, flow_by_sector_fields_w_activity,
                               fbs_fill_na_dict)

                # aggregate df geographically, if necessary
                log.info("Aggregating flowbysector to " +
                         method['target_geoscale'] + " level")
                if fips_number_key[v['geoscale_to_use']] < fips_number_key[
                        attr['allocation_from_scale']]:
                    from_scale = v['geoscale_to_use']
                else:
                    from_scale = attr['allocation_from_scale']

                to_scale = method['target_geoscale']

                fbs_geo_agg = agg_by_geoscale(
                    fbs, from_scale, to_scale,
                    fbs_grouping_fields_w_activities)

                # aggregate data to every sector level
                log.info("Aggregating flowbysector to all sector levels")
                fbs_sec_agg = sector_aggregation(
                    fbs_geo_agg, fbs_grouping_fields_w_activities)
                # add missing naics5/6 when only one naics5/6 associated with a naics4
                fbs_agg = sector_disaggregation(
                    fbs_sec_agg, flow_by_sector_fields_w_activity)

                # compare flowbysector with flowbyactivity
                check_for_differences_between_fba_load_and_fbs_output(
                    flow_subset_mapped, fbs_agg, aset, k)

                # return sector level specified in method yaml
                # load the crosswalk linking sector lengths
                sector_list = get_sector_list(method['target_sector_level'])

                # subset df; necessary because not all sectors are NAICS, which can create duplicate rows
                fbs_1 = fbs_agg.loc[
                    (fbs_agg[fbs_activity_fields[0]].isin(sector_list))
                    & (fbs_agg[fbs_activity_fields[1]].isin(sector_list)
                       )].reset_index(drop=True)
                fbs_2 = fbs_agg.loc[
                    (fbs_agg[fbs_activity_fields[0]].isin(sector_list))
                    & (fbs_agg[fbs_activity_fields[1]].isnull())].reset_index(
                        drop=True)
                fbs_3 = fbs_agg.loc[(fbs_agg[fbs_activity_fields[0]].isnull())
                                    & (fbs_agg[fbs_activity_fields[1]].isin(
                                        sector_list))].reset_index(drop=True)
                fbs_sector_subset = pd.concat([fbs_1, fbs_2, fbs_3])

                # check if losing data by subsetting at specified sector length
                log.info('Checking if losing data by subsetting dataframe')
                fbs_sector_subset_2 = check_if_losing_sector_data(
                    fbs_agg, fbs_sector_subset, method['target_sector_level'])

                # set source name
                fbs_sector_subset_2.loc[:, 'SectorSourceName'] = method[
                    'target_sector_source']

                # drop activity columns
                del fbs_sector_subset_2[
                    'ActivityProducedBy'], fbs_sector_subset_2[
                        'ActivityConsumedBy']

                log.info(
                    "Completed flowbysector for activity subset with flows " +
                    ', '.join(map(str, names)))
                fbs_list.append(fbs_sector_subset_2)
        else:
            # if the loaded flow df is already in FBS format, append directly to the list of FBS
            log.info("Append " + k + " to FBS list")
            fbs_list.append(flows)
    # create single df of all activities
    log.info("Concat data for all activities")
    fbss = pd.concat(fbs_list, ignore_index=True, sort=False)
    log.info("Clean final dataframe")
    # aggregate df as activities might have data for the same specified sector length
    fbss = clean_df(fbss, flow_by_sector_fields, fbs_fill_na_dict)
    fbss = aggregator(fbss, fbs_default_grouping_fields)
    # sort df
    log.info("Sort and store dataframe")
    # add missing fields, ensure correct data type, reorder columns
    fbss = fbss.sort_values(
        ['SectorProducedBy', 'SectorConsumedBy', 'Flowable',
         'Context']).reset_index(drop=True)
    # save parquet file
    store_flowbysector(fbss, method_name)
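
A minimal way to drive this pipeline; the method name below is hypothetical and must match a flowbysector method yaml available to the package:

if __name__ == '__main__':
    # 'Water_national_2015_m1' stands in for any local method yaml name
    main(method_name='Water_national_2015_m1')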
Example #6
def main(method_name):
    """
    Creates a flowbysector dataset
    :param method_name: Name of method corresponding to flowbysector method yaml name
    :return: flowbysector
    """

    log.info("Initiating flowbysector creation for " + method_name)
    # call on method
    method = load_method(method_name)
    # create dictionary of water data and allocation datasets
    fbas = method['flowbyactivity_sources']
    # Create empty list for storing fbs files
    fbss = []
    for k, v in fbas.items():
        # pull water data for allocation
        log.info("Retrieving flowbyactivity for datasource " + k + " in year " + str(v['year']))
        flows = flowsa.getFlowByActivity(flowclass=[v['class']],
                                         years=[v['year']],
                                         datasource=k)

        # if necessary, standardize names in data set
        if v['activity_name_standardization_fxn'] != 'None':
            log.info("Standardizing activity names in " + k)
            flows = getattr(sys.modules[__name__], v['activity_name_standardization_fxn'])(flows)

        # drop description field
        flows = flows.drop(columns='Description')
        # fill null values
        flows = flows.fillna(value=fba_fill_na_dict)
        # map df to elementary flows - commented out until mapping complete
        # log.info("Mapping flows in " + k + ' to federal elementary flow list')
        # flows_mapped = map_elementary_flows(flows, k)
        # convert unit todo: think about unit conversion here
        log.info("Converting units in " + k)
        flows = convert_unit(flows)

        # create dictionary of allocation datasets for different activities
        activities = v['activity_sets']
        for aset, attr in activities.items():
            # subset by named activities
            names = [attr['names']]
            log.info("Preparing to handle subset of flownames " + ', '.join(map(str, names)) + " in " + k)
            # subset usgs data by activity
            flow_subset = flows[(flows[fba_activity_fields[0]].isin(names)) |
                                (flows[fba_activity_fields[1]].isin(names))]

            # Reset index values after subset
            flow_subset = flow_subset.reset_index(drop=True)

            # check if flowbyactivity data exists at specified geoscale to use
            log.info("Checking if flowbyactivity data exists for " + ', '.join(map(str, names)) + " at the " +
                     v['geoscale_to_use'] + ' level')
            geocheck = check_if_data_exists_at_geoscale(flow_subset, names, v['geoscale_to_use'])
            # aggregate geographically to the scale of the allocation dataset
            if geocheck == "Yes":
                activity_from_scale = v['geoscale_to_use']
            else:
                # if activity does not exist at specified geoscale, issue warning and use data at less aggregated
                # geoscale, and sum to specified geoscale
                log.info("Checking if flowbyactivity data exists for " + ', '.join(map(str, names)) + " at a less aggregated level")
                new_geoscale_to_use = check_if_data_exists_at_less_aggregated_geoscale(flow_subset, names,
                                                                                        v['geoscale_to_use'])
                activity_from_scale = new_geoscale_to_use

            activity_to_scale = attr['allocation_from_scale']
            # if usgs is less aggregated than allocation df, aggregate usgs activity to target scale
            if fips_number_key[activity_from_scale] > fips_number_key[activity_to_scale]:
                log.info("Aggregating subset from " + activity_from_scale + " to " + activity_to_scale)
                flow_subset = agg_by_geoscale(flow_subset, activity_from_scale, activity_to_scale, fba_default_grouping_fields, names)
            # else, aggregate to geoscale want to use
            elif fips_number_key[activity_from_scale] > fips_number_key[v['geoscale_to_use']]:
                log.info("Aggregating subset from " + activity_from_scale + " to " + v['geoscale_to_use'])
                flow_subset = agg_by_geoscale(flow_subset, activity_from_scale, v['geoscale_to_use'], fba_default_grouping_fields, names)
            # else, if usgs is more aggregated than allocation table, filter relevant rows
            else:
                log.info("Filtering out " + activity_from_scale + " data")
                flow_subset = filter_by_geoscale(flow_subset, activity_from_scale, names)

            # location column pad zeros if necessary
            flow_subset['Location'] = flow_subset['Location'].apply(lambda x: x.ljust(3 + len(x), '0') if len(x) < 5
                                                                    else x
                                                                    )

            # Add sectors to usgs activity, creating two versions of the flow subset
            # the first version "flow_subset" is the most disaggregated version of the Sectors (NAICS)
            # the second version, "flow_subset_agg" includes only the most aggregated level of sectors
            log.info("Adding sectors to " + k + " for " + ', '.join(map(str, names)))
            flow_subset_wsec = add_sectors_to_flowbyactivity(flow_subset,
                                                             sectorsourcename=method['target_sector_source'])
            flow_subset_wsec_agg = add_sectors_to_flowbyactivity(flow_subset,
                                                                 sectorsourcename=method['target_sector_source'],
                                                                 levelofSectoragg='agg')

            # if allocation method is "direct", then no need to create alloc ratios, else need to use allocation
            # dataframe to create sector allocation ratios
            if attr['allocation_method'] == 'direct':
                fbs = flow_subset_wsec_agg.copy()
            else:
                # determine appropriate allocation dataset
                log.info("Loading allocation flowbyactivity " + attr['allocation_source'] + " for year " + str(attr['allocation_source_year']))
                fba_allocation = flowsa.getFlowByActivity(flowclass=[attr['allocation_source_class']],
                                                          datasource=attr['allocation_source'],
                                                          years=[attr['allocation_source_year']]).reset_index(drop=True)

                # fill null values
                fba_allocation = fba_allocation.fillna(value=fba_fill_na_dict)
                # convert unit
                fba_allocation = convert_unit(fba_allocation)

                # subset based on yaml settings
                if attr['allocation_flow'] != 'None':
                    fba_allocation = fba_allocation.loc[fba_allocation['FlowName'].isin(attr['allocation_flow'])]
                if attr['allocation_compartment'] != 'None':
                    fba_allocation = fba_allocation.loc[
                        fba_allocation['Compartment'].isin(attr['allocation_compartment'])]
                # reset index
                fba_allocation = fba_allocation.reset_index(drop=True)

                # check if allocation data exists at specified geoscale to use
                log.info("Checking if" + " allocation data exists for " + ', '.join(map(str, names)) +
                         " at the " + attr['allocation_from_scale'] + " level")
                check_if_data_exists_at_geoscale(fba_allocation, names, attr['allocation_from_scale'])

                # aggregate geographically to the scale of the flowbyactivity source, if necessary
                from_scale = attr['allocation_from_scale']
                to_scale = v['geoscale_to_use']
                # if allocation df is less aggregated than FBA df, aggregate allocation df to target scale
                if fips_number_key[from_scale] > fips_number_key[to_scale]:
                    fba_allocation = agg_by_geoscale(fba_allocation, from_scale, to_scale, fba_default_grouping_fields, names)
                # else, if usgs is more aggregated than allocation table, use usgs as both to and from scale
                else:
                    fba_allocation = filter_by_geoscale(fba_allocation, from_scale, names)

                # assign sector to allocation dataset
                log.info("Adding sectors to " + attr['allocation_source'])
                fba_allocation = add_sectors_to_flowbyactivity(fba_allocation,
                                                               sectorsourcename=method['target_sector_source'],
                                                               levelofSectoragg=attr[
                                                                   'allocation_sector_aggregation'])
                # subset fba datasets to only keep the naics associated with usgs activity subset
                log.info("Subsetting " + attr['allocation_source'] + " for sectors in " + k)
                fba_allocation_subset = get_fba_allocation_subset(fba_allocation, k, names)
                # Reset index values after subset
                fba_allocation_subset = fba_allocation_subset.reset_index(drop=True)
                # generalize activity field names to enable link to water withdrawal table
                log.info("Generalizing activity names in subset of " + attr['allocation_source'])
                fba_allocation_subset = generalize_activity_field_names(fba_allocation_subset)
                # drop columns
                fba_allocation_subset = fba_allocation_subset.drop(columns=['Activity'])

                # if there is an allocation helper dataset, modify allocation df
                if attr['allocation_helper'] == 'yes':
                    log.info("Using the specified allocation help for subset of " + attr['allocation_source'])
                    fba_allocation_subset = allocation_helper(fba_allocation_subset, method, attr)

                # create flow allocation ratios
                log.info("Creating allocation ratios for " + attr['allocation_source'])
                flow_allocation = allocate_by_sector(fba_allocation_subset, attr['allocation_method'])

                # create a list of sectors in the flow allocation df, then drop any rows
                # in the flow df that aren't in that list
                sector_list = flow_allocation['Sector'].unique().tolist()

                # subset fba allocation table to the values in the activity list, based on overlapping sectors
                flow_subset_wsec = flow_subset_wsec.loc[
                    (flow_subset_wsec[fbs_activity_fields[0]].isin(sector_list)) |
                    (flow_subset_wsec[fbs_activity_fields[1]].isin(sector_list))]

                # check if fba and allocation dfs have the same LocationSystem
                log.info("Checking if flowbyactivity and allocation dataframes use the same location systems")
                check_if_location_systems_match(flow_subset_wsec, flow_allocation)

                # merge water withdrawal df w/flow allocation dataset
                log.info("Merge " + k + " and subset of " + attr['allocation_source'])
                fbs = flow_subset_wsec.merge(
                    flow_allocation[['Location', 'LocationSystem', 'Sector', 'FlowAmountRatio']],
                    left_on=['Location', 'LocationSystem', 'SectorProducedBy'],
                    right_on=['Location', 'LocationSystem', 'Sector'], how='left')

                fbs = fbs.merge(
                    flow_allocation[['Location', 'LocationSystem', 'Sector', 'FlowAmountRatio']],
                    left_on=['Location', 'LocationSystem', 'SectorConsumedBy'],
                    right_on=['Location', 'LocationSystem', 'Sector'], how='left')

                # drop rows where both sector produced/consumed by in the flow allocation df are null
                fbs = fbs.dropna(subset=['Sector_x', 'Sector_y'], how='all').reset_index()

                # merge the flowamount columns
                fbs['FlowAmountRatio'] = fbs['FlowAmountRatio_x'].fillna(fbs['FlowAmountRatio_y'])
                fbs['FlowAmountRatio'] = fbs['FlowAmountRatio'].fillna(0)

                # calculate flow amounts for each sector
                log.info("Calculating new flow amounts using flow ratios")
                fbs['FlowAmount'] = fbs['FlowAmount'] * fbs['FlowAmountRatio']

                # drop columns
                log.info("Cleaning up new flow by sector")
                fbs = fbs.drop(columns=['Sector_x', 'FlowAmountRatio_x', 'Sector_y', 'FlowAmountRatio_y',
                                        'FlowAmountRatio', 'ActivityProducedBy', 'ActivityConsumedBy'])

            # rename flow name to flowable
            fbs = fbs.rename(columns={"FlowName": 'Flowable',
                                      "Compartment": "Context"
                                      })

            # drop rows where flowamount = 0 (although this includes dropping suppressed data)
            fbs = fbs[fbs['FlowAmount'] != 0].reset_index(drop=True)
            # add missing data columns
            fbs = add_missing_flow_by_fields(fbs, flow_by_sector_fields)
            # fill null values
            fbs = fbs.fillna(value=fbs_fill_na_dict)

            # aggregate df geographically, if necessary
            log.info("Aggregating flowbysector to " + method['target_geoscale'] + " level")
            if fips_number_key[v['geoscale_to_use']] < fips_number_key[attr['allocation_from_scale']]:
                from_scale = v['geoscale_to_use']
            else:
                from_scale = attr['allocation_from_scale']

            to_scale = method['target_geoscale']

            fbs = agg_by_geoscale(fbs, from_scale, to_scale, fbs_default_grouping_fields, names)

            # aggregate data to every sector level
            log.info("Aggregating flowbysector to " + method['target_sector_level'])
            fbs = sector_aggregation(fbs, fbs_default_grouping_fields)

            # test agg by sector
            sector_agg_comparison = sector_flow_comparision(fbs)

            # return sector level specified in method yaml
            # load the crosswalk linking sector lengths
            cw = load_sector_length_crosswalk()
            sector_list = cw[method['target_sector_level']].unique().tolist()

            # add any non-NAICS sectors used with NAICS
            household = load_household_sector_codes()
            household = household.loc[household['NAICS_Level_to_Use_For'] == method['target_sector_level']]
            # add household sector to sector list
            sector_list.extend(household['Code'].tolist())
            # subset df
            fbs = fbs.loc[(fbs[fbs_activity_fields[0]].isin(sector_list)) |
                          (fbs[fbs_activity_fields[1]].isin(sector_list))].reset_index(drop=True)

            # add any missing columns of data and cast to appropriate data type
            fbs = add_missing_flow_by_fields(fbs, flow_by_sector_fields)

            log.info("Completed flowbysector for activity subset with flows " + ', '.join(map(str, names)))
            fbss.append(fbs)
    # create single df of all activities
    fbss = pd.concat(fbss, ignore_index=True, sort=False)
    # aggregate df as activities might have data for the same specified sector length
    fbss = aggregator(fbss, fbs_default_grouping_fields)
    # sort df
    fbss = fbss.sort_values(
        ['SectorProducedBy', 'SectorConsumedBy', 'Flowable', 'Context']).reset_index(drop=True)
    # save parquet file
    store_flowbysector(fbss, method_name)
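
The pair of merges inside the function above produces FlowAmountRatio_x (matched on SectorProducedBy) and FlowAmountRatio_y (matched on SectorConsumedBy), which are coalesced and zero-filled before the flow amounts are scaled. A toy illustration of that coalescing step:

import pandas as pd

fbs = pd.DataFrame({'FlowAmountRatio_x': [0.7, None, None],
                    'FlowAmountRatio_y': [None, 0.3, None]})
# prefer the produced-by match, fall back to the consumed-by match, then 0
fbs['FlowAmountRatio'] = fbs['FlowAmountRatio_x'].fillna(
    fbs['FlowAmountRatio_y']).fillna(0)
print(fbs['FlowAmountRatio'].tolist())  # [0.7, 0.3, 0.0]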
Example #7
def geoscale_flow_comparison(flowclass,
                             years,
                             datasource,
                             activitynames=['all'],
                             to_scale='national'):
    """ Aggregates county data to state and national, and state data to national level, allowing for comparisons
        in flow totals for a given flowclass and industry. First assigns all flownames to NAICS and standardizes units.

        Assigned to NAICS rather than using FlowNames for aggregation to negate any changes in flownames across
        time/geoscale
    """

    # load parquet file checking aggregation
    flows = flowsa.getFlowByActivity(flowclass=flowclass,
                                     years=years,
                                     datasource=datasource)
    # fill null values
    flows = flows.fillna(value=fba_fill_na_dict)
    # convert units
    flows = convert_unit(flows)

    # if activitynames is set to the default, compare aggregation for all activities;
    # otherwise subset to the specified activities
    if activitynames == ['all']:
        flow_subset = flows.copy()
    else:
        flow_subset = flows[
            (flows[fba_activity_fields[0]].isin(activitynames)) |
            (flows[fba_activity_fields[1]].isin(activitynames))]

    # Reset index values after subset
    flow_subset = flow_subset.reset_index()

    # pull naics crosswalk (all rows share a single SourceName)
    mapping = get_activitytosector_mapping(flow_subset['SourceName'].unique()[0])

    # assign naics to activities
    # usgs datasource is not easily assigned to naics for checking totals, so instead standardize activity names
    if datasource == 'USGS_NWIS_WU':
        flow_subset = standardize_usgs_nwis_names(flow_subset)
    else:
        flow_subset = pd.merge(flow_subset,
                               mapping[['Activity', 'Sector']],
                               left_on='ActivityProducedBy',
                               right_on='Activity',
                               how='left').rename(
                                   {'Sector': 'SectorProducedBy'}, axis=1)
        flow_subset = pd.merge(flow_subset,
                               mapping[['Activity', 'Sector']],
                               left_on='ActivityConsumedBy',
                               right_on='Activity',
                               how='left').rename(
                                   {'Sector': 'SectorConsumedBy'}, axis=1)
    flow_subset = flow_subset.drop(columns=[
        'ActivityProducedBy', 'ActivityConsumedBy', 'Activity_x', 'Activity_y',
        'Description'
    ],
                                   errors='ignore')
    flow_subset['SectorProducedBy'] = flow_subset['SectorProducedBy'].replace(
        {np.nan: None}).astype(str)
    flow_subset['SectorConsumedBy'] = flow_subset['SectorConsumedBy'].replace(
        {np.nan: None}).astype(str)

    # create list of geoscales for aggregation
    if to_scale == 'national':
        geoscales = ['national', 'state', 'county']
    elif to_scale == 'state':
        geoscales = ['state', 'county']

    # create empty df list
    flow_dfs = []
    for i in geoscales:
        try:
            # filter by geoscale
            fba_from_scale = filter_by_geoscale(flow_subset, i)

            # remove/add column names as a column
            group_cols = fba_default_grouping_fields.copy()
            for j in ['Location', 'ActivityProducedBy', 'ActivityConsumedBy']:
                group_cols.remove(j)
            for j in ['SectorProducedBy', 'SectorConsumedBy']:
                group_cols.append(j)

            # county sums to state and national, state sums to national
            if to_scale == 'state':
                fba_from_scale['Location'] = fba_from_scale['Location'].apply(
                    lambda x: str(x[0:2]))
            elif to_scale == 'national':
                fba_from_scale['Location'] = US_FIPS

            # aggregate
            fba_agg = aggregator(fba_from_scale, group_cols)

            # rename flowamount column, based on geoscale
            fba_agg = fba_agg.rename(columns={"FlowAmount": "FlowAmount_" + i})

            # drop fields irrelevant to the aggregated flow comparison
            drop_fields = ['MeasureofSpread', 'Spread', 'DistributionType',
                           'DataReliability', 'DataCollection']
            fba_agg = fba_agg.drop(columns=drop_fields)

            # reset index
            fba_agg = fba_agg.reset_index(drop=True)

            flow_dfs.append(fba_agg)
        except Exception:
            # data not available at this geoscale; skip it
            pass

    # merge list of dfs by column
    flow_comparison = reduce(
        lambda left, right: pd.merge(
            left,
            right,
            on=[
                'Class', 'SourceName', 'FlowName', 'Unit', 'SectorProducedBy',
                'SectorConsumedBy', 'Compartment', 'Location',
                'LocationSystem', 'Year'
            ],
            how='outer'), flow_dfs)

    # sort df
    flow_comparison = flow_comparison.sort_values([
        'Year', 'Location', 'SectorProducedBy', 'SectorConsumedBy', 'FlowName',
        'Compartment'
    ])

    return flow_comparison
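
An illustrative call comparing county, state, and national totals for 2015 water data; it assumes the 'USGS_NWIS_WU' parquet for that year is available:

comparison = geoscale_flow_comparison(flowclass=['Water'],
                                      years=[2015],
                                      datasource='USGS_NWIS_WU')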
Example #8
def convert_blackhurst_data_to_gal_per_employee(df_wsec, attr, method):
    """
    Converts Blackhurst water data to gallons per employee
    using 2002 BLS QCEW employment data
    :param df_wsec: df, water use data with sector columns
    :param attr: dictionary, attributes of the activity set
    :param method: dictionary, FBS method yaml parameters
    :return: df, water use data in gal/employee
    """

    import pandas as pd
    import flowsa
    from flowsa.common import flow_by_activity_fields  # assumed import location for the FBA schema fields
    from flowsa.mapping import add_sectors_to_flowbyactivity
    from flowsa.flowbyfunctions import clean_df, fba_fill_na_dict, proportional_allocation_by_location_and_activity, \
        filter_by_geoscale, harmonize_units
    from flowsa.BLS_QCEW import clean_bls_qcew_fba

    bls = flowsa.getFlowByActivity(flowclass=['Employment'],
                                   datasource='BLS_QCEW',
                                   years=[2002])

    bls = filter_by_geoscale(bls, 'national')

    # clean df
    bls = clean_df(bls, flow_by_activity_fields, fba_fill_na_dict)
    bls = harmonize_units(bls)
    bls = clean_bls_qcew_fba(bls, attr=attr)

    # assign naics to allocation dataset
    bls_wsec = add_sectors_to_flowbyactivity(
        bls, sectorsourcename=method['target_sector_source'])
    # drop rows where sector = None (does not occur with mining)
    bls_wsec = bls_wsec[~bls_wsec['SectorProducedBy'].isnull()]
    bls_wsec = bls_wsec.rename(columns={
        'SectorProducedBy': 'Sector',
        'FlowAmount': 'HelperFlow'
    })

    # merge the two dfs
    df = pd.merge(df_wsec,
                  bls_wsec[['Location', 'Sector', 'HelperFlow']],
                  how='left',
                  left_on=['Location', 'SectorConsumedBy'],
                  right_on=['Location', 'Sector'])
    # drop any rows where sector is None
    df = df[~df['Sector'].isnull()]
    # fill helperflow values with 0
    df['HelperFlow'] = df['HelperFlow'].fillna(0)

    # calculate proportional ratios
    df_wratio = proportional_allocation_by_location_and_activity(df, 'Sector')

    df_wratio = df_wratio.rename(columns={
        'FlowAmountRatio': 'EmployeeRatio',
        'HelperFlow': 'Employees'
    })

    # drop rows where helperflow = 0
    df_wratio = df_wratio[df_wratio['Employees'] != 0]

    # calculate gal/employee in 2002
    df_wratio.loc[:, 'FlowAmount'] = (
        df_wratio['FlowAmount'] *
        df_wratio['EmployeeRatio']) / df_wratio['Employees']
    df_wratio.loc[:, 'Unit'] = 'gal/employee'

    # drop cols
    df_wratio = df_wratio.drop(
        columns=['Sector', 'Employees', 'EmployeeRatio'])

    return df_wratio
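
proportional_allocation_by_location_and_activity is assumed here to compute each sector's share of HelperFlow within a location grouping; a toy version of that ratio for intuition, not the flowsa implementation:

import pandas as pd

df = pd.DataFrame({'Location': ['00000'] * 3,
                   'Sector': ['21', '22', '23'],
                   'HelperFlow': [60.0, 30.0, 10.0]})
# each sector's share of employment within its location
df['FlowAmountRatio'] = (df['HelperFlow'] /
                         df.groupby('Location')['HelperFlow'].transform('sum'))
print(df['FlowAmountRatio'].tolist())  # [0.6, 0.3, 0.1]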