Example 1
def load_source_dataframe(k, v):
    """
    Load the source dataframe. Data can be a FlowbyActivity or FlowBySector parquet stored in flowsa, or a FlowBySector
    formatted dataframe from another package.
    :param k: The datasource name
    :param v: The datasource parameters
    :return:
    """
    if v['data_format'] == 'FBA':
        # if yaml specifies a geoscale to load, use parameter to filter dataframe
        if 'source_fba_load_scale' in v:
            geo_level = v['source_fba_load_scale']
        else:
            geo_level = 'all'
        log.info("Retrieving flowbyactivity for datasource " + k +
                 " in year " + str(v['year']))
        flows_df = flowsa.getFlowByActivity(flowclass=[v['class']],
                                            years=[v['year']],
                                            datasource=k,
                                            geographic_level=geo_level)
    elif v['data_format'] == 'FBS':
        log.info("Retrieving flowbysector for datasource " + k)
        flows_df = flowsa.getFlowBySector(k)
    elif v['data_format'] == 'FBS_outside_flowsa':
        log.info("Retrieving flowbysector for datasource " + k)
        flows_df = getattr(sys.modules[__name__],
                           v["FBS_datapull_fxn"])(*v['parameters'])
    else:
        log.error("Data format not specified in method file for datasource " +
                  k)

    return flows_df
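
A minimal usage sketch, not part of the source: `k` and `v` mirror one entry under `source_names` in an FBS method yaml, the key names match those the function reads, and the datasource and year here are illustrative.

k = 'USGS_NWIS_WU'  # illustrative datasource
v = {'data_format': 'FBA', 'class': 'Water', 'year': 2015}  # illustrative values
flows_df = load_source_dataframe(k, v)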
Example 2
def load_source_dataframe(k, v):
    """
    Load the source dataframe. Data can be a FlowbyActivity or
    FlowBySector parquet stored in flowsa, or a FlowBySector
    formatted dataframe from another package.
    :param k: str, The datasource name
    :param v: dictionary, The datasource parameters
    :return: df of identified parquet
    """
    if v['data_format'] == 'FBA':
        # if yaml specifies a geoscale to load, use parameter to filter dataframe
        if 'source_fba_load_scale' in v:
            geo_level = v['source_fba_load_scale']
        else:
            geo_level = None
        vLog.info("Retrieving flowbyactivity for datasource %s in year %s", k,
                  str(v['year']))
        flows_df = flowsa.getFlowByActivity(datasource=k,
                                            year=v['year'],
                                            flowclass=v['class'],
                                            geographic_level=geo_level)
    elif v['data_format'] == 'FBS':
        vLog.info("Retrieving flowbysector for datasource %s", k)
        flows_df = flowsa.getFlowBySector(k)
    elif v['data_format'] == 'FBS_outside_flowsa':
        vLog.info("Retrieving flowbysector for datasource %s", k)
        flows_df = dynamically_import_fxn(k, v["FBS_datapull_fxn"])(v)
    else:
        vLog.error(
            "Data format not specified in method file for datasource %s", k)

    return flows_df
Example 3
def load_fba_w_standardized_units(datasource, year, **kwargs):
    """
    Standardize how a FBA is loaded for allocation purposes when generating a FBS.
    Important to immediately convert the df units to standardized units.
    :param datasource: string, FBA source name
    :param year: int, year of data
    :param kwargs: optional parameters include flowclass and
           geographic_level
    :return: fba df with standardized units
    """
    # determine if any additional parameters are required to load a
    # Flow-By-Activity; add parameters to the dictionary if they exist
    # in the method yaml
    fba_dict = {}
    if 'flowclass' in kwargs:
        fba_dict['flowclass'] = kwargs['flowclass']
    if 'geographic_level' in kwargs:
        fba_dict['geographic_level'] = kwargs['geographic_level']
    # load the allocation FBA
    fba = flowsa.getFlowByActivity(datasource, year, **fba_dict).reset_index(drop=True)
    # ensure df loaded correctly/has correct dtypes
    fba = clean_df(fba, flow_by_activity_fields, fba_fill_na_dict)
    # convert to standardized units
    fba = standardize_units(fba)

    return fba
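
A hedged usage sketch: load the BLS QCEW employment FBA (datasource, class, and year borrowed from Example 7) with standardized units; `flowclass` is one of the optional kwargs the function checks.

emp_fba = load_fba_w_standardized_units('BLS_QCEW', 2002, flowclass='Employment')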
Example 4
def convert_blackhurst_data_to_gal_per_year(df, attr):
    """
    Convert Blackhurst water data to gal/year by multiplying the water
    coefficients by the 2002 BEA Make Table dollar flows.
    :param df: df, Blackhurst-based water data with sectors
    :param attr: dictionary, attribute data from method yaml for activity set
    :return: df with FlowAmount in gal
    """

    import flowsa
    from flowsa.mapping import add_sectors_to_flowbyactivity
    from flowsa.flowbyfunctions import clean_df, flow_by_activity_fields, \
        fba_fill_na_dict

    # load the bea make table
    bmt = flowsa.getFlowByActivity(flowclass=['Money'],
                                   datasource='BEA_Make_Table',
                                   years=[2002])
    # clean df
    bmt = clean_df(bmt, flow_by_activity_fields, fba_fill_na_dict)
    # drop rows with flowamount = 0
    bmt = bmt[bmt['FlowAmount'] != 0]

    bh_df_revised = pd.merge(
        df,
        bmt[['FlowAmount', 'ActivityProducedBy', 'Location']],
        left_on=['ActivityConsumedBy', 'Location'],
        right_on=['ActivityProducedBy', 'Location'])

    bh_df_revised.loc[:, 'FlowAmount'] = ((bh_df_revised['FlowAmount_x']) *
                                          (bh_df_revised['FlowAmount_y']))
    bh_df_revised.loc[:, 'Unit'] = 'gal'
    # drop columns
    bh_df_revised = bh_df_revised.drop(
        columns=["FlowAmount_x", "FlowAmount_y", 'ActivityProducedBy_y'])
    bh_df_revised = bh_df_revised.rename(
        columns={"ActivityProducedBy_x": "ActivityProducedBy"})

    return bh_df_revised
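
The merge above reduces to a unit conversion: a Blackhurst water coefficient in gal/$ (FlowAmount_x) multiplied by the matching BEA Make Table dollar flow (FlowAmount_y) yields gallons. A toy check with invented numbers:

gal_per_dollar = 2.5       # invented Blackhurst coefficient, gal/$
bea_dollars = 1_000_000    # invented BEA Make Table flow, $
print(gal_per_dollar * bea_dollars)  # 2500000.0 gal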
Example 5
def disaggregate_pastureland(fba_w_sector, attr):
    """
    The USDA CoA Cropland irrigated pastureland data only links to the 3 digit NAICS '112'. This function uses state
    level CoA 'Land in Farms' to allocate the county level acreage data to 6 digit NAICS.
    :param fba_w_sector: The CoA Cropland dataframe after linked to sectors
    :return: The CoA cropland dataframe with disaggregated pastureland data
    """

    import flowsa
    from flowsa.flowbyfunctions import allocate_by_sector, clean_df, flow_by_activity_fields, \
        fba_fill_na_dict

    # subset the coa data so only pastureland
    p = fba_w_sector.loc[fba_w_sector['Sector'] == '112']
    # add temp loc column for state fips
    p.loc[:, 'Location_tmp'] = p['Location'].apply(lambda x: str(x[0:2]))

    # load usda coa cropland naics
    df_f = flowsa.getFlowByActivity(flowclass=['Land'],
                                    years=[attr['allocation_source_year']],
                                    datasource='USDA_CoA_Cropland_NAICS')
    df_f = clean_df(df_f, flow_by_activity_fields, fba_fill_na_dict)
    # subset to land in farms data
    df_f = df_f[df_f['FlowName'] == 'FARM OPERATIONS']
    # subset to rows related to pastureland
    df_f = df_f.loc[df_f['ActivityConsumedBy'].apply(lambda x: str(x[0:3])) ==
                    '112']
    # drop rows with "&'
    df_f = df_f[~df_f['ActivityConsumedBy'].str.contains('&')]
    # create sector column
    df_f.loc[:, 'Sector'] = df_f['ActivityConsumedBy']
    # create proportional ratios
    df_f = allocate_by_sector(df_f, 'proportional')
    # drop naics = '11'
    df_f = df_f[df_f['Sector'] != '11']
    # drop 000 in location
    df_f.loc[:, 'Location'] = df_f['Location'].apply(lambda x: str(x[0:2]))

    # merge the coa pastureland data with land in farm data
    df = p.merge(df_f[['Sector', 'Location', 'FlowAmountRatio']],
                 how='left',
                 left_on="Location_tmp",
                 right_on="Location")
    # multiply the flowamount by the flowratio
    df.loc[:, 'FlowAmount'] = df['FlowAmount'] * df['FlowAmountRatio']
    # drop columns and rename
    df = df.drop(
        columns=['Location_tmp', 'Sector_x', 'Location_y', 'FlowAmountRatio'])
    df = df.rename(columns={"Sector_y": "Sector", "Location_x": 'Location'})

    # drop rows where sector = 112 and then concat with original fba_w_sector
    fba_w_sector = fba_w_sector[fba_w_sector['Sector'].apply(
        lambda x: str(x[0:3])) != '112'].reset_index(drop=True)
    fba_w_sector = pd.concat([fba_w_sector, df],
                             sort=False).reset_index(drop=True)

    return fba_w_sector
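
For intuition, the disaggregation reduces to multiplying a county's NAICS 112 acreage by each state-level sector ratio. A toy sketch with invented numbers:

import pandas as pd

county = pd.DataFrame({'Location_tmp': ['06'], 'FlowAmount': [100.0]})  # invented acreage
ratios = pd.DataFrame({'Location': ['06'], 'Sector': ['112111'],
                       'FlowAmountRatio': [0.25]})                      # invented ratio
merged = county.merge(ratios, left_on='Location_tmp', right_on='Location')
merged['FlowAmount'] = merged['FlowAmount'] * merged['FlowAmountRatio']
print(merged[['Sector', 'FlowAmount']])  # 112111 -> 25.0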
Example 6
def allocation_helper(df_w_sector, method, attr):
    """
    Used when two df required to create allocation ratio
    :param df_w_sector:
    :param method: currently written for 'multiplication'
    :param attr:
    :return:
    """
    helper_allocation = flowsa.getFlowByActivity(
        flowclass=[attr['helper_source_class']],
        datasource=attr['helper_source'],
        years=[attr['helper_source_year']])
    # fill null values
    helper_allocation = helper_allocation.fillna(value=fba_fill_na_dict)
    # convert unit
    helper_allocation = convert_unit(helper_allocation)

    # assign naics to allocation dataset
    helper_allocation = add_sectors_to_flowbyactivity(
        helper_allocation,
        sectorsourcename=method['target_sector_source'],
        levelofSectoragg=attr['helper_sector_aggregation'])
    # generalize activity field names to enable link to water withdrawal table
    helper_allocation = generalize_activity_field_names(helper_allocation)
    # drop columns
    helper_allocation = helper_allocation.drop(
        columns=['Activity', 'Description', 'Min', 'Max'])
    # rename column
    helper_allocation = helper_allocation.rename(
        columns={"FlowAmount": 'HelperFlow'})

    # merge allocation df with helper df based on sectors, depending on geo scales of dfs
    if attr['helper_from_scale'] == 'national':
        modified_fba_allocation = df_w_sector.merge(
            helper_allocation[['Sector', 'HelperFlow']], how='left')
    if (attr['helper_from_scale']
            == 'state') and (attr['allocation_from_scale'] == 'county'):
        helper_allocation['Location_tmp'] = helper_allocation[
            'Location'].apply(lambda x: str(x[0:2]))
        df_w_sector['Location_tmp'] = df_w_sector['Location'].apply(
            lambda x: str(x[0:2]))
        modified_fba_allocation = df_w_sector.merge(
            helper_allocation[['Sector', 'Location_tmp', 'HelperFlow']],
            how='left')
        modified_fba_allocation = modified_fba_allocation.drop(
            columns=['Location_tmp'])

    # modify flow amounts using helper data
    if attr['helper_method'] == 'multiplication':
        modified_fba_allocation['FlowAmount'] = modified_fba_allocation[
            'FlowAmount'] * modified_fba_allocation['HelperFlow']
    # drop columns
    modified_fba_allocation = modified_fba_allocation.drop(
        columns="HelperFlow")

    return modified_fba_allocation
Example 7
def convert_blackhurst_data_to_gal_per_employee(df_wsec, attr, method):
    """
    Convert Blackhurst water data to gal/employee using 2002 BLS QCEW
    employment data.
    :param df_wsec: df, Blackhurst-based water data with sectors
    :param attr: dictionary, attribute data from method yaml for activity set
    :param method: dictionary, the FBS method yaml
    :return: df with FlowAmount in gal/employee
    """

    import flowsa
    from flowsa.mapping import add_sectors_to_flowbyactivity
    from flowsa.flowbyfunctions import clean_df, flow_by_activity_fields, \
        fba_fill_na_dict, agg_by_geoscale, fba_default_grouping_fields, \
        sector_ratios, proportional_allocation_by_location_and_sector, \
        filter_by_geoscale
    from flowsa.BLS_QCEW import clean_bls_qcew_fba

    bls = flowsa.getFlowByActivity(flowclass=['Employment'],
                                   datasource='BLS_QCEW',
                                   years=[2002])
    # clean df
    bls = clean_df(bls, flow_by_activity_fields, fba_fill_na_dict)
    bls = clean_bls_qcew_fba(bls, attr)

    # bls_agg = agg_by_geoscale(bls, 'state', 'national', fba_default_grouping_fields)
    bls_agg = filter_by_geoscale(bls, 'national')

    # assign naics to allocation dataset
    bls_wsec = add_sectors_to_flowbyactivity(
        bls_agg, sectorsourcename=method['target_sector_source'])
    # drop rows where sector = None ( does not occur with mining)
    bls_wsec = bls_wsec[~bls_wsec['SectorProducedBy'].isnull()]
    bls_wsec = bls_wsec.rename(columns={'SectorProducedBy': 'Sector'})

    # create list of sectors in the flow allocation df; drop any rows in
    # the flow df that aren't in the list
    sector_list = df_wsec['Sector'].unique().tolist()
    # subset fba allocation table to the values in the activity list, based on overlapping sectors
    bls_wsec = bls_wsec.loc[bls_wsec['Sector'].isin(sector_list)]
    # calculate proportional ratios
    bls_wsec = proportional_allocation_by_location_and_sector(
        bls_wsec, 'Sector')
    bls_wsec = bls_wsec.rename(columns={
        'FlowAmountRatio': 'EmployeeRatio',
        'FlowAmount': 'Employees'
    })

    # merge the two dfs
    df = pd.merge(df_wsec,
                  bls_wsec[['Sector', 'EmployeeRatio', 'Employees']],
                  how='left',
                  left_on='Sector',
                  right_on='Sector')
    df['EmployeeRatio'] = df['EmployeeRatio'].fillna(0)
    # calculate gal/employee in 2002
    df.loc[:, 'FlowAmount'] = (df['FlowAmount'] *
                               df['EmployeeRatio']) / df['Employees']
    df.loc[:, 'Unit'] = 'gal/employee'

    # drop cols
    df = df.drop(columns=['Employees', 'EmployeeRatio'])

    return df
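
The final calculation assigns each sector its employment share of the water flow, then divides by that sector's employment. A toy check with invented numbers:

flow_gal = 1_000_000   # invented water flow, gal
employee_ratio = 0.1   # invented sector share of employment
employees = 500        # invented sector employment
print((flow_gal * employee_ratio) / employees)  # 200.0 gal/employee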
Example 8
def load_fba_w_standardized_units(datasource, year, **kwargs):
    """
    Standardize how a FBA is loaded for allocation purposes when
    generating a FBS. Important to immediately convert the df units to
    standardized units.
    :param datasource: string, FBA source name
    :param year: int, year of data
    :param kwargs: optional parameters include flowclass, geographic_level,
           download_FBA_if_missing, allocation_map_to_flow_list
    :return: fba df with standardized units
    """

    from flowsa.sectormapping import map_fbs_flows

    # determine if any additional parameters are required to load a
    # Flow-By-Activity; add parameters to the dictionary if they exist
    # in the method yaml
    fba_dict = {}
    if 'flowclass' in kwargs:
        fba_dict['flowclass'] = kwargs['flowclass']
    if 'geographic_level' in kwargs:
        fba_dict['geographic_level'] = kwargs['geographic_level']
    if 'download_FBA_if_missing' in kwargs:
        fba_dict['download_FBA_if_missing'] = kwargs['download_FBA_if_missing']
    # load the allocation FBA
    fba = flowsa.getFlowByActivity(datasource, year,
                                   **fba_dict).reset_index(drop=True)
    # convert to standardized units either by mapping to federal
    # flow list/material flow list or by using function. Mapping will add
    # context and flowable columns
    if 'allocation_map_to_flow_list' in kwargs:
        if kwargs['allocation_map_to_flow_list']:
            # ensure df loaded correctly/has correct dtypes
            fba = clean_df(fba,
                           flow_by_activity_fields,
                           fba_fill_na_dict,
                           drop_description=False)
            fba, mapping_files = map_fbs_flows(fba,
                                               datasource,
                                               kwargs,
                                               keep_fba_columns=True,
                                               keep_unmapped_rows=True)
        else:
            # ensure df loaded correctly/has correct dtypes
            fba = clean_df(fba, flow_by_activity_fields, fba_fill_na_dict)
            fba = standardize_units(fba)
    else:
        # ensure df loaded correctly/has correct dtypes
        fba = clean_df(fba, flow_by_activity_fields, fba_fill_na_dict)
        fba = standardize_units(fba)

    return fba
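
A hedged usage sketch; the kwarg names match those the function checks, and the datasource/year (USDA_CoA_Cropland, 2017) are borrowed from Example 16.

fba = load_fba_w_standardized_units('USDA_CoA_Cropland', 2017,
                                    flowclass='Land',
                                    download_FBA_if_missing=True)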
Example 9
def assign_nonpoint_dqi(args):
    '''
    Compares facility coverage data between NEI point and Census to estimate
    facility coverage in NEI nonpoint
    '''
    import stewi
    import flowsa
    nei_facility_list = stewi.getInventoryFacilities('NEI', args['year'])
    nei_count = nei_facility_list.groupby('NAICS')['FacilityID'].count()
    census = flowsa.getFlowByActivity(flowclass=['Other'],
                                      years=[args['year']],
                                      datasource="Census_CBP")
    census = census[census['FlowName'] == 'Number of establishments']
    census_count = census.groupby('ActivityProducedBy')['FlowAmount'].sum()
    # hedged completion (not in the original excerpt): estimate nonpoint
    # facility coverage as the ratio of NEI point facilities to Census
    # establishments per NAICS
    coverage = (nei_count / census_count).fillna(0)
    return coverage
Example 10
def scale_blackhurst_results_to_usgs_values(df_to_scale, attr):
    """
    Scale the initial estimates for Blackhurst-based mining estimates to USGS values. Oil-based sectors are allocated
    a larger percentage of the difference between initial water withdrawal estimates and published USGS values.

    This method is based off the Water Satellite Table created by Yang and Ingwersen, 2017
    :param df_to_scale:
    :param attr:
    :return:
    """

    import flowsa
    from flowsa.flowbyfunctions import harmonize_units

    # determine national level published withdrawal data for usgs mining in FBS method year
    pv_load = flowsa.getFlowByActivity(flowclass=['Water'],
                                       years=[str(attr['helper_source_year'])],
                                       datasource="USGS_NWIS_WU")
    pv_load = harmonize_units(pv_load)
    pv_sub = pv_load[(pv_load['Location'] == str(US_FIPS)) & (
        pv_load['ActivityConsumedBy'] == 'Mining')].reset_index(drop=True)
    pv = pv_sub['FlowAmount'].loc[
        0] * 1000000  # usgs unit is Mgal, blackhurst unit is gal

    # sum quantity of water withdrawals already allocated to sectors
    av = df_to_scale['FlowAmount'].sum()

    # calculate the difference between published value and allocated value
    vd = pv - av

    # subset df to scale into oil and non-oil sectors
    df_to_scale['sector_label'] = np.where(
        df_to_scale['SectorConsumedBy'].apply(lambda x: x[0:5] == '21111'),
        'oil', 'nonoil')
    df_to_scale['ratio'] = np.where(df_to_scale['sector_label'] == 'oil',
                                    2 / 3, 1 / 3)
    df_to_scale['label_sum'] = df_to_scale.groupby(
        ['Location', 'sector_label'])['FlowAmount'].transform('sum')
    df_to_scale.loc[:, 'value_difference'] = vd.astype(float)

    # calculate revised water withdrawal allocation
    df_scaled = df_to_scale.copy()
    df_scaled.loc[:, 'FlowAmount'] = df_scaled['FlowAmount'] + \
                                     (df_scaled['FlowAmount'] / df_scaled['label_sum']) * \
                                     (df_scaled['ratio'] * df_scaled['value_difference'])
    df_scaled = df_scaled.drop(
        columns=['sector_label', 'ratio', 'label_sum', 'value_difference'])

    return df_scaled
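
The rescaling applies new = old + (old / label_sum) * (ratio * (pv - av)): each row absorbs its group's share of the gap between the published USGS value and the already-allocated total, weighted 2/3 for oil and 1/3 for non-oil sectors. A toy check with invented numbers:

old, label_sum, ratio, vd = 40.0, 100.0, 2 / 3, 30.0  # invented values
print(old + (old / label_sum) * (ratio * vd))         # 48.0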
Example 11
def get_fba_subset(name, year, flowclass):
    # 'subset_activities' and 'asets_source' are module-level settings
    # defined elsewhere in the script
    test_fba = flowsa.getFlowByActivity(flowclass=[flowclass],
                                        years=[year], datasource=name)

    if subset_activities:
        aset_names = pd.read_csv(flowbysectoractivitysetspath + asets_source,
                                 dtype=str)
        asets = [
            #'activity_set_1', 
            #'activity_set_2', 
            #'activity_set_3',
            #'activity_set_4', 
            'activity_set_5', 
            'activity_set_6', 
            ]
        activities = aset_names[aset_names['activity_set'].isin(asets)]['name']
        test_fba = test_fba[test_fba['ActivityProducedBy'].isin(activities)]
    return test_fba
Example 12
def unique_activity_names(flowclass, years, datasource):
    """read in the ers parquet files, select the unique activity names, return df with one column """
    # create single df representing all selected years
    df = flowsa.getFlowByActivity(flowclass, years, datasource)

    column_activities = df[["ActivityConsumedBy",
                            "ActivityProducedBy"]].values.ravel()
    unique_activities = pd.unique(column_activities)
    df_unique = unique_activities.reshape((-1, 1))
    df_unique = pd.DataFrame({'Activity': df_unique[:, 0]})
    df_unique = df_unique.loc[df_unique['Activity'].notnull()]
    df_unique = df_unique.loc[df_unique['Activity'] != 'None']
    df_unique = df_unique.assign(ActivitySourceName=datasource)

    # sort df
    df_unique = df_unique.sort_values(['Activity']).reset_index(drop=True)

    return df_unique
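
A hedged example call; the positional arguments follow the signature above, and the datasource/year are illustrative (USDA_CoA_Cropland_NAICS also appears in Example 5).

df_unique = unique_activity_names(['Land'], [2012], 'USDA_CoA_Cropland_NAICS')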
Example 13
def load_source_dataframe(k, v):
    """
    Load the source dataframe. Data can be a FlowbyActivity or FlowBySector parquet stored in flowsa, or a FlowBySector
    formatted dataframe from another package.
    :param k: The datasource name
    :param v: The datasource parameters
    :return:
    """
    if v['data_format'] == 'FBA':
        log.info("Retrieving flowbyactivity for datasource " + k + " in year " + str(v['year']))
        flows_df = flowsa.getFlowByActivity(flowclass=[v['class']], years=[v['year']], datasource=k)
    elif v['data_format'] == 'FBS':
        log.info("Retrieving flowbysector for datasource " + k)
        flows_df = flowsa.getFlowBySector(k)
    elif v['data_format'] == 'FBS_outside_flowsa':
        log.info("Retrieving flowbysector for datasource " + k)
        flows_df = getattr(sys.modules[__name__], v["FBS_datapull_fxn"])(v['parameters'])
    else:
        log.error("No parquet file found for datasource " + k)

    return flows_df
Example 14
def load_source_dataframe(sourcename, source_dict, download_FBA_if_missing):
    """
    Load the source dataframe. Data can be a FlowbyActivity or
    FlowBySector parquet stored in flowsa, or a FlowBySector
    formatted dataframe from another package.
    :param sourcename: str, The datasource name
    :param source_dict: dictionary, The datasource parameters
    :param download_FBA_if_missing: Bool, if True will download FBAs from
       Data Commons. Default is False.
    :return: df of identified parquet
    """
    if source_dict['data_format'] == 'FBA':
        # if yaml specifies a geoscale to load, use parameter
        # to filter dataframe
        if 'source_fba_load_scale' in source_dict:
            geo_level = source_dict['source_fba_load_scale']
        else:
            geo_level = None
        vLog.info("Retrieving Flow-By-Activity for datasource %s in year %s",
                  sourcename, str(source_dict['year']))
        flows_df = flowsa.getFlowByActivity(
            datasource=sourcename,
            year=source_dict['year'],
            flowclass=source_dict['class'],
            geographic_level=geo_level,
            download_FBA_if_missing=download_FBA_if_missing)
    elif source_dict['data_format'] == 'FBS':
        vLog.info("Retrieving flowbysector for datasource %s", sourcename)
        flows_df = flowsa.getFlowBySector(sourcename)
    elif source_dict['data_format'] == 'FBS_outside_flowsa':
        vLog.info("Retrieving flowbysector for datasource %s", sourcename)
        flows_df = dynamically_import_fxn(
            sourcename, source_dict["FBS_datapull_fxn"])(source_dict)
    else:
        vLog.error(
            "Data format not specified in method "
            "file for datasource %s", sourcename)

    return flows_df
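
A minimal usage sketch, not part of the source: `source_dict` mirrors one entry under `source_names` in an FBS method yaml; the key names match those the function reads, and the values are illustrative.

source_dict = {'data_format': 'FBA', 'class': 'Water', 'year': 2015}  # illustrative
flows_df = load_source_dataframe('USGS_NWIS_WU', source_dict,
                                 download_FBA_if_missing=False)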
Example 15
import flowsa
from flowsa.common import flowbysectoractivitysetspath


# as_year = '2010'
as_year = '2014'

if __name__ == '__main__':

    # define mecs land fba parameters
    land_flowclass = ['Land']
    land_years = [as_year]
    datasource = 'EIA_MECS_Land'

    # load the EIA MECS land FBA
    df_import = flowsa.getFlowByActivity(land_flowclass, land_years, datasource)

    # drop unused columns
    df = df_import[['ActivityConsumedBy']].drop_duplicates()
    df = df[~df['ActivityConsumedBy'].str.contains('-')]

    # rename columns
    df = df.rename(columns={"ActivityConsumedBy": "name"})

    # assign column values
    df = df.assign(activity_set='activity_set_1')
    df = df.assign(note='')

    # reorder dataframe
    df = df[['activity_set', 'name', 'note']]
    df = df.sort_values(['activity_set', 'name']).reset_index(drop=True)
Example 16
"""
See source_catalog.yaml for available FlowByActivity datasets and
available parameters for getFlowByActivity()

Examples of use of flowsa. Read parquet files as dataframes.
    :param flowclass: list, a list of 'Class' of the flow (required),
        e.g. ['Water'] or ['Land', 'Other']
    :param years: list, a list of years, e.g. [2015] or [2010, 2011, 2012]
    :param datasource: str, the code of the datasource
    :param geographic_level: str, default 'all', which loads all geographic
        scales in the FlowByActivity; can specify 'national', 'state',
        'county'
    :return: a pandas DataFrame in FlowByActivity format
"""

import flowsa
from flowsa.common import fbaoutputpath

# single flowclass, year, datasource, geographic_level default = 'all', file_location default = 'local'
usda_cropland_fba_2017 = flowsa.getFlowByActivity(
    flowclass=['Land'], years=[2017], datasource="USDA_CoA_Cropland")
# load file from remote server instead of local directory
usda_cropland_fba_2017_remote = flowsa.getFlowByActivity(
    flowclass=['Land'],
    years=[2017],
    datasource="USDA_CoA_Cropland",
    file_location='remote')

# multiple flowclass
usda_iwms_fba_2013 = flowsa.getFlowByActivity(flowclass=['Land', 'Water'],
                                              years=[2013],
                                              datasource="USDA_IWMS")

# only load state level data and save as csv
# set parameters
fc = ['Water']
Example 17
    :param year: int, a year, e.g. 2012
    :param flowclass: str, a 'Class' of the flow. Optional. E.g. 'Water'
    :param geographic_level: str, a geographic level of the data.
    Optional. E.g. 'national', 'state', 'county'.
    :return: a pandas DataFrame in FlowByActivity format

"""

import flowsa
from flowsa.common import fbaoutputpath

# see available FBA models
flowsa.seeAvailableFlowByModels('FBA')

# Load all information for USDA Cropland
usda_cropland_fba_2017 = flowsa.getFlowByActivity(datasource="USDA_CoA_Cropland", year=2017)

# only load state level data and save as csv
# set parameters
ds = "USGS_NWIS_WU"
year_fba = 2015
fc = 'Water'
geo_level_fba = 'state'
# load FBA
usgs_water_fba_2015 =\
    flowsa.getFlowByActivity(datasource=ds, year=year_fba, flowclass=fc,
                             geographic_level=geo_level_fba).reset_index(drop=True)
# save output to csv
usgs_water_fba_2015.Location =\
    usgs_water_fba_2015.Location.apply('="{}"'.format)  # maintain leading 0s in location col
usgs_water_fba_2015.to_csv(fbaoutputpath + ds + "_" + str(year_fba) + ".csv", index=False)
Example 18
def main(method_name):
    """
    Creates a flowbysector dataset
    :param method_name: Name of method corresponding to flowbysector method yaml name
    :return: flowbysector
    """

    log.info("Initiating flowbysector creation for " + method_name)
    # call on method
    method = load_method(method_name)
    # create dictionary of data and allocation datasets
    fb = method['source_names']
    # Create empty list for storing fbs files
    fbs_list = []
    for k, v in fb.items():
        # pull fba data for allocation
        flows = load_source_dataframe(k, v)

        if v['data_format'] == 'FBA':
            # ensure correct datatypes and that all fields exist
            flows = clean_df(flows,
                             flow_by_activity_fields,
                             fba_fill_na_dict,
                             drop_description=False)

            # clean up fba, if specified in yaml
            if v["clean_fba_df_fxn"] != 'None':
                log.info("Cleaning up " + k + " FlowByActivity")
                flows = getattr(sys.modules[__name__],
                                v["clean_fba_df_fxn"])(flows)

            # if activity_sets are specified in a file, call them here
            if 'activity_set_file' in v:
                aset_names = pd.read_csv(flowbysectoractivitysetspath +
                                         v['activity_set_file'],
                                         dtype=str)

            # create dictionary of allocation datasets for different activities
            activities = v['activity_sets']
            # subset activity data and allocate to sector
            for aset, attr in activities.items():
                # subset by named activities
                if 'activity_set_file' in v:
                    names = aset_names[aset_names['activity_set'] ==
                                       aset]['name']
                else:
                    names = attr['names']

                log.info("Preparing to handle subset of flownames " +
                         ', '.join(map(str, names)) + " in " + k)
                # subset fba data by activity
                flows_subset = flows[
                    (flows[fba_activity_fields[0]].isin(names)) |
                    (flows[fba_activity_fields[1]].isin(names))].reset_index(
                        drop=True)

                # extract relevant geoscale data or aggregate existing data
                log.info("Subsetting/aggregating dataframe to " +
                         attr['allocation_from_scale'] + " geoscale")
                flows_subset_geo = subset_df_by_geoscale(
                    flows_subset, v['geoscale_to_use'],
                    attr['allocation_from_scale'])

                # Add sectors to df activity, depending on level of specified sector aggregation
                log.info("Adding sectors to " + k)
                flow_subset_wsec = add_sectors_to_flowbyactivity(
                    flows_subset_geo,
                    sectorsourcename=method['target_sector_source'],
                    allocationmethod=attr['allocation_method'])
                # clean up fba with sectors, if specified in yaml
                if v["clean_fba_w_sec_df_fxn"] != 'None':
                    log.info("Cleaning up " + k +
                             " FlowByActivity with sectors")
                    flow_subset_wsec = getattr(sys.modules[__name__],
                                               v["clean_fba_w_sec_df_fxn"])(
                                                   flow_subset_wsec, attr=attr)

                # map df to elementary flows
                log.info("Mapping flows in " + k +
                         ' to federal elementary flow list')
                if 'fedefl_mapping' in v:
                    mapping_files = v['fedefl_mapping']
                else:
                    mapping_files = k

                flow_subset_mapped = map_elementary_flows(
                    flow_subset_wsec, mapping_files)

                # clean up mapped fba with sectors, if specified in yaml
                if "clean_mapped_fba_w_sec_df_fxn" in v:
                    log.info("Cleaning up " + k +
                             " FlowByActivity with sectors")
                    flow_subset_mapped = getattr(
                        sys.modules[__name__],
                        v["clean_mapped_fba_w_sec_df_fxn"])(flow_subset_mapped,
                                                            attr, method)

                # if allocation method is "direct", then no need to create alloc ratios, else need to use allocation
                # dataframe to create sector allocation ratios
                if attr['allocation_method'] == 'direct':
                    log.info('Directly assigning ' +
                             ', '.join(map(str, names)) + ' to sectors')
                    fbs = flow_subset_mapped.copy()
                    # for each activity, if activities are not sector like, check that there is no data loss
                    if load_source_catalog(
                    )[k]['sector-like_activities'] is False:
                        activity_list = []
                        for n in names:
                            log.info('Checking for ' + n + ' at ' +
                                     method['target_sector_level'])
                            fbs_subset = fbs[(
                                (fbs[fba_activity_fields[0]] == n) &
                                (fbs[fba_activity_fields[1]] == n)) |
                                             (fbs[fba_activity_fields[0]] == n)
                                             |
                                             (fbs[fba_activity_fields[1]] == n
                                              )].reset_index(drop=True)
                            fbs_subset = check_if_losing_sector_data(
                                fbs_subset, method['target_sector_level'])
                            activity_list.append(fbs_subset)
                        fbs = pd.concat(activity_list, ignore_index=True)

                # if allocation method for an activity set requires a specific function due to the complicated nature
                # of the allocation, call on function here
                elif attr['allocation_method'] == 'allocation_function':
                    log.info(
                        'Calling on function specified in method yaml to allocate '
                        + ', '.join(map(str, names)) + ' to sectors')
                    fbs = getattr(sys.modules[__name__],
                                  attr['allocation_source'])(
                                      flow_subset_mapped, attr, fbs_list)

                else:
                    # determine appropriate allocation dataset
                    log.info("Loading allocation flowbyactivity " +
                             attr['allocation_source'] + " for year " +
                             str(attr['allocation_source_year']))
                    fba_allocation = flowsa.getFlowByActivity(
                        flowclass=[attr['allocation_source_class']],
                        datasource=attr['allocation_source'],
                        years=[attr['allocation_source_year']
                               ]).reset_index(drop=True)

                    # clean df and harmonize units
                    fba_allocation = clean_df(fba_allocation,
                                              flow_by_activity_fields,
                                              fba_fill_na_dict)
                    fba_allocation = harmonize_units(fba_allocation)

                    # check if allocation data exists at specified geoscale to use
                    log.info("Checking if allocation data exists at the " +
                             attr['allocation_from_scale'] + " level")
                    check_if_data_exists_at_geoscale(
                        fba_allocation, attr['allocation_from_scale'])

                    # aggregate geographically to the scale of the flowbyactivty source, if necessary
                    fba_allocation = subset_df_by_geoscale(
                        fba_allocation, attr['allocation_from_scale'],
                        v['geoscale_to_use'])

                    # subset based on yaml settings
                    if attr['allocation_flow'] != 'None':
                        fba_allocation = fba_allocation.loc[
                            fba_allocation['FlowName'].isin(
                                attr['allocation_flow'])]
                    if attr['allocation_compartment'] != 'None':
                        fba_allocation = fba_allocation.loc[
                            fba_allocation['Compartment'].isin(
                                attr['allocation_compartment'])]

                    # cleanup the fba allocation df, if necessary
                    if 'clean_allocation_fba' in attr:
                        log.info("Cleaning " + attr['allocation_source'])
                        fba_allocation = getattr(sys.modules[__name__],
                                                 attr["clean_allocation_fba"])(
                                                     fba_allocation, attr=attr)
                    # reset index
                    fba_allocation = fba_allocation.reset_index(drop=True)

                    # assign sector to allocation dataset
                    log.info("Adding sectors to " + attr['allocation_source'])
                    fba_allocation_wsec = add_sectors_to_flowbyactivity(
                        fba_allocation,
                        sectorsourcename=method['target_sector_source'])

                    # call on fxn to further clean up/disaggregate the fba allocation data, if exists
                    if 'clean_allocation_fba_w_sec' in attr:
                        log.info("Further disaggregating sectors in " +
                                 attr['allocation_source'])
                        fba_allocation_wsec = getattr(
                            sys.modules[__name__],
                            attr["clean_allocation_fba_w_sec"])(
                                fba_allocation_wsec, attr=attr, method=method)

                    # subset fba datasets to only keep the sectors associated with activity subset
                    log.info("Subsetting " + attr['allocation_source'] +
                             " for sectors in " + k)
                    fba_allocation_subset = get_fba_allocation_subset(
                        fba_allocation_wsec,
                        k,
                        names,
                        flowSubsetMapped=flow_subset_mapped,
                        allocMethod=attr['allocation_method'])

                    # if there is an allocation helper dataset, modify allocation df
                    if attr['allocation_helper'] == 'yes':
                        log.info(
                            "Using the specified allocation helper for subset of "
                            + attr['allocation_source'])
                        fba_allocation_subset = allocation_helper(
                            fba_allocation_subset, attr, method, v)

                    # create flow allocation ratios for each activity
                    # if load_source_catalog()[k]['sector-like_activities']
                    flow_alloc_list = []
                    group_cols = fba_mapped_default_grouping_fields
                    group_cols = [
                        e for e in group_cols
                        if e not in ('ActivityProducedBy',
                                     'ActivityConsumedBy')
                    ]
                    for n in names:
                        log.info("Creating allocation ratios for " + n)
                        fba_allocation_subset_2 = get_fba_allocation_subset(
                            fba_allocation_subset,
                            k, [n],
                            flowSubsetMapped=flow_subset_mapped,
                            allocMethod=attr['allocation_method'])
                        if len(fba_allocation_subset_2) == 0:
                            log.info("No data found to allocate " + n)
                        else:
                            flow_alloc = allocate_by_sector(
                                fba_allocation_subset_2,
                                k,
                                attr['allocation_source'],
                                attr['allocation_method'],
                                group_cols,
                                flowSubsetMapped=flow_subset_mapped)
                            flow_alloc = flow_alloc.assign(FBA_Activity=n)
                            flow_alloc_list.append(flow_alloc)
                    flow_allocation = pd.concat(flow_alloc_list,
                                                ignore_index=True)

                    # generalize activity field names to enable link to main fba source
                    log.info("Generalizing activity columns in subset of " +
                             attr['allocation_source'])
                    flow_allocation = collapse_activity_fields(flow_allocation)

                    # check for issues with allocation ratios
                    check_allocation_ratios(flow_allocation, aset, k,
                                            method_name)

                    # create list of sectors in the flow allocation df;
                    # drop any rows of data in the flow df that aren't in
                    # the list
                    sector_list = flow_allocation['Sector'].unique().tolist()

                    # subset fba allocation table to the values in the activity list, based on overlapping sectors
                    flow_subset_mapped = flow_subset_mapped.loc[
                        (flow_subset_mapped[fbs_activity_fields[0]].
                         isin(sector_list)) |
                        (flow_subset_mapped[fbs_activity_fields[1]].
                         isin(sector_list))]

                    # check if fba and allocation dfs have the same LocationSystem
                    log.info(
                        "Checking if flowbyactivity and allocation dataframes use the same location systems"
                    )
                    check_if_location_systems_match(flow_subset_mapped,
                                                    flow_allocation)

                    # merge fba df w/flow allocation dataset
                    log.info("Merge " + k + " and subset of " +
                             attr['allocation_source'])
                    fbs = flow_subset_mapped.merge(
                        flow_allocation[[
                            'Location', 'Sector', 'FlowAmountRatio',
                            'FBA_Activity'
                        ]],
                        left_on=[
                            'Location', 'SectorProducedBy',
                            'ActivityProducedBy'
                        ],
                        right_on=['Location', 'Sector', 'FBA_Activity'],
                        how='left')

                    fbs = fbs.merge(
                        flow_allocation[[
                            'Location', 'Sector', 'FlowAmountRatio',
                            'FBA_Activity'
                        ]],
                        left_on=[
                            'Location', 'SectorConsumedBy',
                            'ActivityConsumedBy'
                        ],
                        right_on=['Location', 'Sector', 'FBA_Activity'],
                        how='left')

                    # merge the flowamount columns
                    fbs.loc[:, 'FlowAmountRatio'] = fbs[
                        'FlowAmountRatio_x'].fillna(fbs['FlowAmountRatio_y'])
                    # fill null rows with 0 because no allocation info
                    fbs['FlowAmountRatio'] = fbs['FlowAmountRatio'].fillna(0)

                    # check if fba and alloc dfs have data for same geoscales - comment back in after address the 'todo'
                    # log.info("Checking if flowbyactivity and allocation dataframes have data at the same locations")
                    # check_if_data_exists_for_same_geoscales(fbs, k, attr['names'])

                    # drop rows where there is no allocation data
                    fbs = fbs.dropna(subset=['Sector_x', 'Sector_y'],
                                     how='all').reset_index()

                    # calculate flow amounts for each sector
                    log.info("Calculating new flow amounts using flow ratios")
                    fbs.loc[:, 'FlowAmount'] = fbs['FlowAmount'] * fbs[
                        'FlowAmountRatio']

                    # drop columns
                    log.info("Cleaning up new flow by sector")
                    fbs = fbs.drop(columns=[
                        'Sector_x', 'FlowAmountRatio_x', 'Sector_y',
                        'FlowAmountRatio_y', 'FlowAmountRatio',
                        'FBA_Activity_x', 'FBA_Activity_y'
                    ])

                # drop rows where flowamount = 0 (although this includes dropping suppressed data)
                fbs = fbs[fbs['FlowAmount'] != 0].reset_index(drop=True)

                # define grouping columns dependent on sectors being activity-like or not
                if load_source_catalog()[k]['sector-like_activities'] is False:
                    groupingcols = fbs_grouping_fields_w_activities
                    groupingdict = flow_by_sector_fields_w_activity
                else:
                    groupingcols = fbs_default_grouping_fields
                    groupingdict = flow_by_sector_fields

                # clean df
                fbs = clean_df(fbs, groupingdict, fbs_fill_na_dict)

                # aggregate df geographically, if necessary
                # todo: replace with fxn return_from_scale
                log.info("Aggregating flowbysector to " +
                         method['target_geoscale'] + " level")
                if fips_number_key[v['geoscale_to_use']] < fips_number_key[
                        attr['allocation_from_scale']]:
                    from_scale = v['geoscale_to_use']
                else:
                    from_scale = attr['allocation_from_scale']

                to_scale = method['target_geoscale']

                fbs_geo_agg = agg_by_geoscale(fbs, from_scale, to_scale,
                                              groupingcols)

                # aggregate data to every sector level
                log.info("Aggregating flowbysector to all sector levels")
                fbs_sec_agg = sector_aggregation(fbs_geo_agg, groupingcols)
                # add missing naics5/6 when only one naics5/6 associated with a naics4
                fbs_agg = sector_disaggregation(fbs_sec_agg, groupingdict)

                # check if any sector information is lost before reaching the target sector length, if so,
                # allocate values equally to disaggregated sectors
                log.info('Checking for data at ' +
                         method['target_sector_level'])
                fbs_agg_2 = check_if_losing_sector_data(
                    fbs_agg, method['target_sector_level'])

                # compare flowbysector with flowbyactivity
                # todo: modify fxn to work if activities are sector like in df being allocated
                if load_source_catalog()[k]['sector-like_activities'] is False:
                    check_for_differences_between_fba_load_and_fbs_output(
                        flow_subset_mapped, fbs_agg_2, aset, k, method_name)

                # return sector level specified in method yaml
                # load the crosswalk linking sector lengths
                sector_list = get_sector_list(method['target_sector_level'])

                # subset df, necessary because not all of the sectors are NAICS and can get duplicate rows
                fbs_1 = fbs_agg_2.loc[
                    (fbs_agg_2[fbs_activity_fields[0]].isin(sector_list))
                    & (fbs_agg_2[fbs_activity_fields[1]].isin(sector_list)
                       )].reset_index(drop=True)
                fbs_2 = fbs_agg_2.loc[
                    (fbs_agg_2[fbs_activity_fields[0]].isin(sector_list)) &
                    (fbs_agg_2[fbs_activity_fields[1]].isnull())].reset_index(
                        drop=True)
                fbs_3 = fbs_agg_2.loc[
                    (fbs_agg_2[fbs_activity_fields[0]].isnull())
                    & (fbs_agg_2[fbs_activity_fields[1]].isin(sector_list)
                       )].reset_index(drop=True)
                fbs_sector_subset = pd.concat([fbs_1, fbs_2, fbs_3])

                # drop activity columns
                fbs_sector_subset = fbs_sector_subset.drop(
                    ['ActivityProducedBy', 'ActivityConsumedBy'],
                    axis=1,
                    errors='ignore')

                # save comparison of FBA total to FBS total for an activity set
                compare_fba_load_and_fbs_output_totals(flows_subset_geo,
                                                       fbs_sector_subset, aset,
                                                       k, method_name, attr,
                                                       method, mapping_files)

                log.info(
                    "Completed flowbysector for activity subset with flows " +
                    ', '.join(map(str, names)))
                fbs_list.append(fbs_sector_subset)
        else:
            # if the loaded flow df is already in FBS format, append directly to list of FBS
            log.info("Append " + k + " to FBS list")
            # ensure correct field datatypes and add any missing fields
            flows = clean_df(flows, flow_by_sector_fields, fbs_fill_na_dict)
            fbs_list.append(flows)
    # create single df of all activities
    log.info("Concat data for all activities")
    fbss = pd.concat(fbs_list, ignore_index=True, sort=False)
    log.info("Clean final dataframe")
    # aggregate df as activities might have data for the same specified sector length
    fbss = clean_df(fbss, flow_by_sector_fields, fbs_fill_na_dict)
    fbss = aggregator(fbss, fbs_default_grouping_fields)
    # sort df
    log.info("Sort and store dataframe")
    # add missing fields, ensure correct data type, reorder columns
    fbss = fbss.sort_values(
        ['SectorProducedBy', 'SectorConsumedBy', 'Flowable',
         'Context']).reset_index(drop=True)
    # save parquet file
    store_flowbysector(fbss, method_name)
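
main() is driven entirely by an FBS method yaml; a typical invocation simply names the method file. The method name below is illustrative.

if __name__ == '__main__':
    main('Water_national_2015_m1')  # illustrative method name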
Example 19
def disaggregate_cropland(fba_w_sector, attr):
    """
    In the event there are 4 (or 5) digit naics for cropland at the county level, use state level harvested cropland to
    create ratios
    :param fba_w_sector:
    :param attr:
    :return:
    """

    import flowsa
    from flowsa.flowbyfunctions import generalize_activity_field_names, sector_aggregation,\
        fbs_default_grouping_fields, clean_df, fba_fill_na_dict, add_missing_flow_by_fields
    from flowsa.mapping import add_sectors_to_flowbyactivity

    # drop pastureland data
    crop = fba_w_sector.loc[fba_w_sector['Sector'].apply(lambda x: str(x[0:3]))
                            != '112'].reset_index(drop=True)
    # drop sectors < 4 digits
    crop = crop[crop['Sector'].apply(lambda x: len(x) > 3)].reset_index(
        drop=True)
    # create tmp location
    crop.loc[:, 'Location_tmp'] = crop['Location'].apply(lambda x: str(x[0:2]))

    # load the relevant state level harvested cropland by naics
    naics_load = flowsa.getFlowByActivity(
        flowclass=['Land'],
        years=[attr['allocation_source_year']],
        datasource="USDA_CoA_Cropland_NAICS").reset_index(drop=True)
    # clean df
    naics = clean_df(naics_load, flow_by_activity_fields, fba_fill_na_dict)
    # subset the harvested cropland by naics
    naics = naics[naics['FlowName'] ==
                  'AG LAND, CROPLAND, HARVESTED'].reset_index(drop=True)
    # add sectors
    naics = add_sectors_to_flowbyactivity(naics,
                                          sectorsourcename='NAICS_2012_Code',
                                          levelofSectoragg='agg')
    # add missing fbs fields
    naics = add_missing_flow_by_fields(naics, flow_by_sector_fields)

    # aggregate sectors to create any missing naics levels
    naics = sector_aggregation(naics, fbs_default_grouping_fields)
    # add missing naics5/6 when only one naics5/6 associated with a naics4
    naics = sector_disaggregation(naics)
    # drop rows where sector consumed by is none and FlowAmount 0
    naics = naics[naics['SectorConsumedBy'].notnull()]
    naics = naics.loc[naics['FlowAmount'] != 0]
    # create ratios
    naics = sector_ratios(naics)
    # drop sectors < 4 digits
    #naics = naics[naics['SectorConsumedBy'].apply(lambda x: len(x) > 3)].reset_index(drop=True)
    # create temporary sector column to match the two dfs on
    naics.loc[:,
              'Location_tmp'] = naics['Location'].apply(lambda x: str(x[0:2]))

    # loop through naics lengths to determine which 4- and 5-digit naics to disaggregate
    for i in range(4, 6):
        # subset df to sectors with length = i and length = i + 1
        crop_subset = crop.loc[crop['Sector'].apply(
            lambda x: i + 1 >= len(x) >= i)]
        crop_subset.loc[:, 'Sector_tmp'] = crop_subset['Sector'].apply(
            lambda x: x[0:i])
        # if duplicates drop all rows
        df = crop_subset.drop_duplicates(subset=['Location', 'Sector_tmp'],
                                         keep=False).reset_index(drop=True)
        # drop sector temp column
        df = df.drop(columns=["Sector_tmp"])
        # subset df to keep the sectors of length i
        df_subset = df.loc[df['Sector'].apply(lambda x: len(x) == i)]
        # subset the naics df where naics length is i + 1
        naics_subset = naics.loc[naics['SectorConsumedBy'].apply(
            lambda x: len(x) == i + 1)].reset_index(drop=True)
        naics_subset.loc[:, 'Sector_tmp'] = naics_subset[
            'SectorConsumedBy'].apply(lambda x: x[0:i])
        # merge the two df based on locations
        df_subset = pd.merge(df_subset,
                             naics_subset[[
                                 'SectorConsumedBy', 'FlowAmountRatio',
                                 'Sector_tmp', 'Location_tmp'
                             ]],
                             how='left',
                             left_on=['Sector', 'Location_tmp'],
                             right_on=['Sector_tmp', 'Location_tmp'])
        # create flow amounts for the new NAICS based on the flow ratio
        df_subset.loc[:, 'FlowAmount'] = df_subset['FlowAmount'] * df_subset[
            'FlowAmountRatio']
        # drop rows of 0 and na
        df_subset = df_subset[df_subset['FlowAmount'] != 0]
        df_subset = df_subset[~df_subset['FlowAmount'].isna()].reset_index(
            drop=True)
        # drop columns
        df_subset = df_subset.drop(
            columns=['Sector', 'FlowAmountRatio', 'Sector_tmp'])
        # rename columns
        df_subset = df_subset.rename(columns={"SectorConsumedBy": "Sector"})
        # add new rows of data to crop df
        crop = pd.concat([crop, df_subset], sort=True).reset_index(drop=True)

    # clean up df
    crop = crop.drop(columns=['Location_tmp'])

    # pasture data
    pasture = fba_w_sector.loc[fba_w_sector['Sector'].apply(
        lambda x: str(x[0:3])) == '112'].reset_index(drop=True)

    # concat crop and pasture
    fba_w_sector = pd.concat([pasture, crop], sort=True).reset_index(drop=True)

    return fba_w_sector
Example 20
def geoscale_flow_comparison(flowclass,
                             years,
                             datasource,
                             activitynames=['all'],
                             to_scale='national'):
    """ Aggregates county data to state and national, and state data to national level, allowing for comparisons
        in flow totals for a given flowclass and industry. First assigns all flownames to NAICS and standardizes units.

        Assigned to NAICS rather than using FlowNames for aggregation to negate any changes in flownames across
        time/geoscale
    """

    # load parquet file checking aggregation
    flows = flowsa.getFlowByActivity(flowclass=flowclass,
                                     years=years,
                                     datasource=datasource)
    # fill null values
    flows = flows.fillna(value=fba_fill_na_dict)
    # convert units
    flows = convert_unit(flows)

    # if activitynames is set to the default, compare aggregation for all
    # activities; otherwise subset to the specified activities
    if activitynames == ['all']:
        flow_subset = flows.copy()
    else:
        flow_subset = flows[
            (flows[fba_activity_fields[0]].isin(activitynames)) |
            (flows[fba_activity_fields[1]].isin(activitynames))]

    # Reset index values after subset
    flow_subset = flow_subset.reset_index()

    # pull naics crosswalk
    mapping = get_activitytosector_mapping(flow_subset['SourceName'].all())

    # assign naics to activities
    # usgs datasource is not easily assigned to naics for checking totals, so instead standardize activity names
    if datasource == 'USGS_NWIS_WU':
        flow_subset = standardize_usgs_nwis_names(flow_subset)
    else:
        flow_subset = pd.merge(flow_subset,
                               mapping[['Activity', 'Sector']],
                               left_on='ActivityProducedBy',
                               right_on='Activity',
                               how='left').rename(
                                   {'Sector': 'SectorProducedBy'}, axis=1)
        flow_subset = pd.merge(flow_subset,
                               mapping[['Activity', 'Sector']],
                               left_on='ActivityConsumedBy',
                               right_on='Activity',
                               how='left').rename(
                                   {'Sector': 'SectorConsumedBy'}, axis=1)
    flow_subset = flow_subset.drop(columns=[
        'ActivityProducedBy', 'ActivityConsumedBy', 'Activity_x', 'Activity_y',
        'Description'
    ],
                                   errors='ignore')
    flow_subset['SectorProducedBy'] = flow_subset['SectorProducedBy'].replace({
        np.nan:
        None
    }).astype(str)
    flow_subset['SectorConsumedBy'] = flow_subset['SectorConsumedBy'].replace({
        np.nan:
        None
    }).astype(str)

    # create list of geoscales for aggregation
    if to_scale == 'national':
        geoscales = ['national', 'state', 'county']
    elif to_scale == 'state':
        geoscales = ['state', 'county']

    # create empty df list
    flow_dfs = []
    for i in geoscales:
        try:
            # filter by geoscale
            fba_from_scale = filter_by_geoscale(flow_subset, i)

            # adjust grouping columns: drop the location/activity fields and
            # add the sector fields
            group_cols = fba_default_grouping_fields.copy()
            for j in ['Location', 'ActivityProducedBy', 'ActivityConsumedBy']:
                group_cols.remove(j)
            for j in ['SectorProducedBy', 'SectorConsumedBy']:
                group_cols.append(j)

            # county sums to state and national, state sums to national
            if to_scale == 'state':
                fba_from_scale['Location'] = fba_from_scale['Location'].apply(
                    lambda x: str(x[0:2]))
            elif to_scale == 'national':
                fba_from_scale['Location'] = US_FIPS

            # aggregate
            fba_agg = aggregator(fba_from_scale, group_cols)

            # rename flowamount column, based on geoscale
            fba_agg = fba_agg.rename(columns={"FlowAmount": "FlowAmount_" + i})

            # drop fields irrelevant to the aggregated flow comparison
            drop_fields = ['MeasureofSpread', 'Spread', 'DistributionType',
                           'DataReliability', 'DataCollection']
            fba_agg = fba_agg.drop(columns=drop_fields, errors='ignore')

            # reset index
            fba_agg = fba_agg.reset_index(drop=True)

            flow_dfs.append(fba_agg)
        except Exception:
            # skip geoscales for which filtering/aggregation fails
            pass

    # merge list of dfs by column
    flow_comparison = reduce(
        lambda left, right: pd.merge(
            left,
            right,
            on=[
                'Class', 'SourceName', 'FlowName', 'Unit', 'SectorProducedBy',
                'SectorConsumedBy', 'Compartment', 'Location',
                'LocationSystem', 'Year'
            ],
            how='outer'), flow_dfs)

    # sort df
    flow_comparison = flow_comparison.sort_values([
        'Year', 'Location', 'SectorProducedBy', 'SectorConsumedBy', 'FlowName',
        'Compartment'
    ])

    return flow_comparison
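
# A hedged usage sketch for the comparison function above; the flowclass,
# year, and datasource values mirror those used elsewhere in this document:
comparison = geoscale_flow_comparison(flowclass=['Water'],
                                      years=[2015],
                                      datasource='USGS_NWIS_WU',
                                      to_scale='national')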
Example no. 21
def convert_statcan_data_to_US_water_use(df, attr):
    """
    Use Canadian GDP data to convert 3 digit canadian water use to us water
    use:
    - canadian gdp
    - us gdp
    :return:
    """
    import flowsa
    from flowsa.values_from_literature import get_Canadian_to_USD_exchange_rate
    from flowsa.flowbyfunctions import assign_fips_location_system, aggregator, fba_default_grouping_fields
    from flowsa.common import US_FIPS, load_bea_crosswalk

    # load Canadian GDP data
    gdp = flowsa.getFlowByActivity(flowclass=['Money'],
                                   datasource='StatCan_GDP',
                                   years=[attr['allocation_source_year']])
    # drop 31-33
    gdp = gdp[gdp['ActivityProducedBy'] != '31-33']
    gdp = gdp.rename(columns={"FlowAmount": "CanDollar"})

    # merge df
    df_m = pd.merge(df,
                    gdp[['CanDollar', 'ActivityProducedBy']],
                    how='left',
                    left_on='ActivityConsumedBy',
                    right_on='ActivityProducedBy')
    df_m['CanDollar'] = df_m['CanDollar'].fillna(0)
    df_m = df_m.drop(columns=["ActivityProducedBy_y"])
    df_m = df_m.rename(columns={"ActivityProducedBy_x": "ActivityProducedBy"})
    df_m = df_m[df_m['CanDollar'] != 0]

    exchange_rate = get_Canadian_to_USD_exchange_rate(
        str(attr['allocation_source_year']))
    exchange_rate = float(exchange_rate)
    # convert to mgal/USD
    df_m.loc[:, 'FlowAmount'] = df_m['FlowAmount'] / (df_m['CanDollar'] /
                                                      exchange_rate)
    df_m.loc[:, 'Unit'] = 'Mgal/USD'

    df_m = df_m.drop(columns=["CanDollar"])

    # convert Location to US
    df_m.loc[:, 'Location'] = US_FIPS
    df_m = assign_fips_location_system(df_m,
                                       str(attr['allocation_source_year']))

    # load US GDP data
    us_gdp_load = flowsa.getFlowByActivity(
        flowclass=['Money'],
        datasource='BEA_GDP_GrossOutput_IO',
        years=[attr['allocation_source_year']])
    # load bea crosswalk
    cw_load = load_bea_crosswalk()
    cw = cw_load[['BEA_2012_Detail_Code', 'NAICS_2012_Code']].drop_duplicates()
    cw = cw[cw['NAICS_2012_Code'].apply(
        lambda x: len(str(x)) == 3)].drop_duplicates().reset_index(drop=True)

    # merge
    us_gdp = pd.merge(us_gdp_load,
                      cw,
                      how='left',
                      left_on='ActivityProducedBy',
                      right_on='BEA_2012_Detail_Code')
    us_gdp = us_gdp.drop(
        columns=['ActivityProducedBy', 'BEA_2012_Detail_Code'])
    # rename columns
    us_gdp = us_gdp.rename(columns={'NAICS_2012_Code': 'ActivityProducedBy'})
    # agg by naics
    us_gdp = aggregator(us_gdp, fba_default_grouping_fields)
    us_gdp = us_gdp.rename(columns={'FlowAmount': 'us_gdp'})

    # determine annual us water use
    df_m2 = pd.merge(df_m,
                     us_gdp[['ActivityProducedBy', 'us_gdp']],
                     how='left',
                     left_on='ActivityConsumedBy',
                     right_on='ActivityProducedBy')

    df_m2.loc[:, 'FlowAmount'] = df_m2['FlowAmount'] * (df_m2['us_gdp'])
    df_m2.loc[:, 'Unit'] = 'Mgal'
    df_m2 = df_m2.rename(
        columns={'ActivityProducedBy_x': 'ActivityProducedBy'})
    df_m2 = df_m2.drop(columns=['ActivityProducedBy_y', 'us_gdp'])

    return df_m2
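
# A minimal call sketch, assuming `statcan_water_df` is a StatCan water-use
# FBA already loaded (hypothetical variable) and that attr only needs the
# allocation year key read inside the function:
attr = {'allocation_source_year': 2015}  # hypothetical year
df_us = convert_statcan_data_to_US_water_use(statcan_water_df, attr)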
Example no. 22
def allocation_helper(df_w_sector, method, attr):
    """
    Used when two df required to create allocation ratio
    :param df_w_sector:
    :param method: currently written for 'multiplication'
    :param attr:
    :return:
    """

    from flowsa.mapping import add_sectors_to_flowbyactivity

    helper_allocation = flowsa.getFlowByActivity(flowclass=[attr['helper_source_class']],
                                                 datasource=attr['helper_source'],
                                                 years=[attr['helper_source_year']])
    # clean df
    helper_allocation = clean_df(helper_allocation, flow_by_activity_fields, fba_fill_na_dict)
    # drop rows with flowamount = 0
    helper_allocation = helper_allocation[helper_allocation['FlowAmount'] != 0]

    # assign naics to allocation dataset
    helper_allocation = add_sectors_to_flowbyactivity(helper_allocation,
                                                      sectorsourcename=method[
                                                          'target_sector_source'],
                                                      levelofSectoragg=attr[
                                                          'helper_sector_aggregation'])
    # generalize activity field names to enable link to water withdrawal table
    helper_allocation = generalize_activity_field_names(helper_allocation)
    # drop columns
    helper_allocation = helper_allocation.drop(columns=['Activity', 'Min', 'Max'])
    # rename column
    helper_allocation = helper_allocation.rename(columns={"FlowAmount": 'HelperFlow'})

    # merge allocation df with helper df based on sectors, depending on geo scales of dfs
    if attr['helper_from_scale'] == 'national':
        modified_fba_allocation = df_w_sector.merge(
            helper_allocation[['Sector', 'HelperFlow']], how='left')
    elif (attr['helper_from_scale'] == 'state') and (attr['allocation_from_scale'] == 'state'):
        modified_fba_allocation = df_w_sector.merge(
            helper_allocation[['Sector', 'Location', 'HelperFlow']], how='left')
    elif (attr['helper_from_scale'] == 'state') and (attr['allocation_from_scale'] == 'county'):
        helper_allocation.loc[:, 'Location_tmp'] = helper_allocation['Location'].apply(
            lambda x: str(x[0:2]))
        df_w_sector.loc[:, 'Location_tmp'] = df_w_sector['Location'].apply(
            lambda x: str(x[0:2]))
        modified_fba_allocation = df_w_sector.merge(
            helper_allocation[['Sector', 'Location_tmp', 'HelperFlow']],
            how='left')
        modified_fba_allocation = modified_fba_allocation.drop(columns=['Location_tmp'])

    # todo: modify so if missing data, replaced with value from one geoscale up instead of national
    # if missing values (na or 0), replace with national level values
    replacement_values = helper_allocation[helper_allocation['Location'] == US_FIPS].reset_index(
        drop=True)
    replacement_values = replacement_values.rename(columns={"HelperFlow": 'ReplacementValue'})
    modified_fba_allocation = modified_fba_allocation.merge(
        replacement_values[['Sector', 'ReplacementValue']], how='left')
    modified_fba_allocation.loc[:, 'HelperFlow'] = modified_fba_allocation['HelperFlow'].fillna(
        modified_fba_allocation['ReplacementValue'])
    modified_fba_allocation.loc[:, 'HelperFlow'] = np.where(modified_fba_allocation['HelperFlow'] == 0,
                                                            modified_fba_allocation['ReplacementValue'],
                                                            modified_fba_allocation['HelperFlow'])
    # modify flow amounts using helper data
    if attr['helper_method'] == 'multiplication':
        # replace missing helper flow values with 0 so the multiplied flow
        # amount is not incorrectly associated with the new unit
        modified_fba_allocation['HelperFlow'] = \
            modified_fba_allocation['HelperFlow'].fillna(value=0)
        modified_fba_allocation.loc[:, 'FlowAmount'] = \
            modified_fba_allocation['FlowAmount'] * modified_fba_allocation['HelperFlow']
    # drop columns
    modified_fba_allocation = modified_fba_allocation.drop(
        columns=["HelperFlow", 'ReplacementValue'])

    # drop rows of 0 to speed up allocation
    modified_fba_allocation = modified_fba_allocation[
        modified_fba_allocation['FlowAmount'] != 0].reset_index(drop=True)

    # todo: modify the unit

    return modified_fba_allocation
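
# The 'multiplication' branch above turns intensity-style flows into totals
# (e.g. Mgal/employee times employees). A toy illustration of the
# fill-then-multiply step (values invented):
import pandas as pd

df = pd.DataFrame({'FlowAmount': [2.0, 3.0],     # e.g. Mgal/employee
                   'HelperFlow': [10.0, None]})  # e.g. employees
# missing helper values become 0 so the product is 0, not NaN
df['HelperFlow'] = df['HelperFlow'].fillna(value=0)
df['FlowAmount'] = df['FlowAmount'] * df['HelperFlow']  # -> [20.0, 0.0]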
Example no. 23
"""
Examples of use of flowsa: read parquet files as dataframes, using the
available parameters for getFlowByActivity().
    :param datasource: str, the code of the datasource.
    :param year: int, a year, e.g. 2012
    :param flowclass: str, a 'Class' of the flow. Optional. E.g. 'Water'
    :param geographic_level: str, a geographic level of the data.
    Optional. E.g. 'national', 'state', 'county'.
    :return: a pandas DataFrame in FlowByActivity format

"""

import flowsa
from flowsa.settings import fbaoutputpath

# see all datasources and years available in flowsa
flowsa.seeAvailableFlowByModels('FBA')

# Load all information for EIA MECS Land
fba_mecs = flowsa.getFlowByActivity(datasource="EIA_MECS_Land", year=2014)

# only load state level water data and save as csv
fba_usgs = flowsa.getFlowByActivity(
    datasource="USGS_NWIS_WU",
    year=2015,
    flowclass='Water',
    geographic_level='state').reset_index(drop=True)

# save output to csv, maintain leading 0s in location col
fba_usgs.Location = fba_usgs.Location.apply('="{}"'.format)
fba_usgs.to_csv(f"{fbaoutputpath}USGS_NWIS_WU_2015.csv", index=False)
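
# The format trick above wraps each FIPS code in an Excel formula so
# spreadsheet software keeps the leading zeros; a one-line demonstration:
'="{}"'.format('01001')  # -> '="01001"', displayed as 01001 in Excel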
Example no. 24
def main(method_name):
    """
    Creates a flowbysector dataset
    :param method_name: Name of method corresponding to flowbysector method yaml name
    :return: flowbysector
    """

    log.info("Initiating flowbysector creation for " + method_name)
    # load the flowbysector method yaml
    method = load_method(method_name)
    # create dictionary of water data and allocation datasets
    fbas = method['flowbyactivity_sources']
    # Create empty list for storing fbs files
    fbss = []
    for k, v in fbas.items():
        # pull water data for allocation
        log.info("Retrieving flowbyactivity for datasource " + k + " in year " + str(v['year']))
        flows = flowsa.getFlowByActivity(flowclass=[v['class']],
                                         years=[v['year']],
                                         datasource=k)

        # if necessary, standardize names in data set
        if v['activity_name_standardization_fxn'] != 'None':
            log.info("Standardizing activity names in " + k)
            flows = getattr(sys.modules[__name__], v['activity_name_standardization_fxn'])(flows)

        # drop description field
        flows = flows.drop(columns='Description')
        # fill null values
        flows = flows.fillna(value=fba_fill_na_dict)
        # map df to elementary flows - commented out until mapping complete
        # log.info("Mapping flows in " + k + ' to federal elementary flow list')
        # flows_mapped = map_elementary_flows(flows, k)
        # convert units; todo: think about unit conversion here
        log.info("Converting units in " + k)
        flows = convert_unit(flows)

        # create dictionary of allocation datasets for different activities
        activities = v['activity_sets']
        for aset, attr in activities.items():
            # subset by named activities
            names = [attr['names']]
            log.info("Preparing to handle subset of flownames " + ', '.join(map(str, names)) + " in " + k)
            # subset usgs data by activity
            flow_subset = flows[(flows[fba_activity_fields[0]].isin(names)) |
                                (flows[fba_activity_fields[1]].isin(names))]

            # Reset index values after subset
            flow_subset = flow_subset.reset_index(drop=True)

            # check if flowbyactivity data exists at specified geoscale to use
            log.info("Checking if flowbyactivity data exists for " + ', '.join(map(str, names)) + " at the " +
                     v['geoscale_to_use'] + ' level')
            geocheck = check_if_data_exists_at_geoscale(flow_subset, names, v['geoscale_to_use'])
            # aggregate geographically to the scale of the allocation dataset
            if geocheck == "Yes":
                activity_from_scale = v['geoscale_to_use']
            else:
                # if activity does not exist at specified geoscale, issue warning and use data at less aggregated
                # geoscale, and sum to specified geoscale
                log.info("Checking if flowbyactivity data exists for " + ', '.join(map(str, names)) + " at a less aggregated level")
                new_geoscale_to_use = check_if_data_exists_at_less_aggregated_geoscale(flow_subset, names,
                                                                                        v['geoscale_to_use'])
                activity_from_scale = new_geoscale_to_use

            activity_to_scale = attr['allocation_from_scale']
            # if usgs is less aggregated than allocation df, aggregate usgs activity to target scale
            if fips_number_key[activity_from_scale] > fips_number_key[activity_to_scale]:
                log.info("Aggregating subset from " + activity_from_scale + " to " + activity_to_scale)
                flow_subset = agg_by_geoscale(flow_subset, activity_from_scale, activity_to_scale, fba_default_grouping_fields, names)
            # else, aggregate to the geoscale we want to use
            elif fips_number_key[activity_from_scale] > fips_number_key[v['geoscale_to_use']]:
                log.info("Aggregating subset from " + activity_from_scale + " to " + v['geoscale_to_use'])
                flow_subset = agg_by_geoscale(flow_subset, activity_from_scale, v['geoscale_to_use'], fba_default_grouping_fields, names)
            # else, if usgs is more aggregated than allocation table, filter relevant rows
            else:
                log.info("Filtering out " + activity_from_scale + " data")
                flow_subset = filter_by_geoscale(flow_subset, activity_from_scale, names)

            # pad Location with zeros to create a 5-digit FIPS, if necessary
            flow_subset['Location'] = flow_subset['Location'].apply(
                lambda x: x.ljust(3 + len(x), '0') if len(x) < 5 else x)

            # Add sectors to usgs activity, creating two versions of the flow subset:
            # "flow_subset_wsec" is the most disaggregated version of the sectors (NAICS);
            # "flow_subset_wsec_agg" includes only the most aggregated level of sectors
            log.info("Adding sectors to " + k + " for " + ', '.join(map(str, names)))
            flow_subset_wsec = add_sectors_to_flowbyactivity(flow_subset,
                                                             sectorsourcename=method['target_sector_source'])
            flow_subset_wsec_agg = add_sectors_to_flowbyactivity(flow_subset,
                                                                 sectorsourcename=method['target_sector_source'],
                                                                 levelofSectoragg='agg')

            # if allocation method is "direct", then no need to create alloc ratios, else need to use allocation
            # dataframe to create sector allocation ratios
            if attr['allocation_method'] == 'direct':
                fbs = flow_subset_wsec_agg.copy()
            else:
                # determine appropriate allocation dataset
                log.info("Loading allocation flowbyactivity " + attr['allocation_source'] + " for year " + str(attr['allocation_source_year']))
                fba_allocation = flowsa.getFlowByActivity(flowclass=[attr['allocation_source_class']],
                                                          datasource=attr['allocation_source'],
                                                          years=[attr['allocation_source_year']]).reset_index(drop=True)

                # fill null values
                fba_allocation = fba_allocation.fillna(value=fba_fill_na_dict)
                # convert unit
                fba_allocation = convert_unit(fba_allocation)

                # subset based on yaml settings
                if attr['allocation_flow'] != 'None':
                    fba_allocation = fba_allocation.loc[fba_allocation['FlowName'].isin(attr['allocation_flow'])]
                if attr['allocation_compartment'] != 'None':
                    fba_allocation = fba_allocation.loc[
                        fba_allocation['Compartment'].isin(attr['allocation_compartment'])]
                # reset index
                fba_allocation = fba_allocation.reset_index(drop=True)

                # check if allocation data exists at specified geoscale to use
                log.info("Checking if" + " allocation data exists for " + ', '.join(map(str, names)) +
                         " at the " + attr['allocation_from_scale'] + " level")
                check_if_data_exists_at_geoscale(fba_allocation, names, attr['allocation_from_scale'])

                # aggregate geographically to the scale of the flowbyactivty source, if necessary
                from_scale = attr['allocation_from_scale']
                to_scale = v['geoscale_to_use']
                # if allocation df is less aggregated than FBA df, aggregate allocation df to target scale
                if fips_number_key[from_scale] > fips_number_key[to_scale]:
                    fba_allocation = agg_by_geoscale(fba_allocation, from_scale, to_scale, fba_default_grouping_fields, names)
                # else, if usgs is more aggregated than allocation table, use usgs as both to and from scale
                else:
                    fba_allocation = filter_by_geoscale(fba_allocation, from_scale, names)

                # assign sector to allocation dataset
                log.info("Adding sectors to " + attr['allocation_source'])
                fba_allocation = add_sectors_to_flowbyactivity(fba_allocation,
                                                               sectorsourcename=method['target_sector_source'],
                                                               levelofSectoragg=attr[
                                                                   'allocation_sector_aggregation'])
                # subset fba datasets to only keep the naics associated with usgs activity subset
                log.info("Subsetting " + attr['allocation_source'] + " for sectors in " + k)
                fba_allocation_subset = get_fba_allocation_subset(fba_allocation, k, names)
                # Reset index values after subset
                fba_allocation_subset = fba_allocation_subset.reset_index(drop=True)
                # generalize activity field names to enable link to water withdrawal table
                log.info("Generalizing activity names in subset of " + attr['allocation_source'])
                fba_allocation_subset = generalize_activity_field_names(fba_allocation_subset)
                # drop columns
                fba_allocation_subset = fba_allocation_subset.drop(columns=['Activity'])

                # if there is an allocation helper dataset, modify allocation df
                if attr['allocation_helper'] == 'yes':
                    log.info("Using the specified allocation help for subset of " + attr['allocation_source'])
                    fba_allocation_subset = allocation_helper(fba_allocation_subset, method, attr)

                # create flow allocation ratios
                log.info("Creating allocation ratios for " + attr['allocation_source'])
                flow_allocation = allocate_by_sector(fba_allocation_subset, attr['allocation_method'])

                # create list of sectors in the flow allocation df, drop any rows
                # of data in the flow df that aren't in list
                sector_list = flow_allocation['Sector'].unique().tolist()

                # subset fba allocation table to the values in the activity list, based on overlapping sectors
                flow_subset_wsec = flow_subset_wsec.loc[
                    (flow_subset_wsec[fbs_activity_fields[0]].isin(sector_list)) |
                    (flow_subset_wsec[fbs_activity_fields[1]].isin(sector_list))]

                # check if fba and allocation dfs have the same LocationSystem
                log.info("Checking if flowbyactivity and allocation dataframes use the same location systems")
                check_if_location_systems_match(flow_subset_wsec, flow_allocation)

                # merge water withdrawal df w/flow allocation dataset
                log.info("Merge " + k + " and subset of " + attr['allocation_source'])
                fbs = flow_subset_wsec.merge(
                    flow_allocation[['Location', 'LocationSystem', 'Sector', 'FlowAmountRatio']],
                    left_on=['Location', 'LocationSystem', 'SectorProducedBy'],
                    right_on=['Location', 'LocationSystem', 'Sector'], how='left')

                fbs = fbs.merge(
                    flow_allocation[['Location', 'LocationSystem', 'Sector', 'FlowAmountRatio']],
                    left_on=['Location', 'LocationSystem', 'SectorConsumedBy'],
                    right_on=['Location', 'LocationSystem', 'Sector'], how='left')

                # drop rows where both sector produced/consumed by from the flow allocation df are null
                fbs = fbs.dropna(subset=['Sector_x', 'Sector_y'], how='all').reset_index(drop=True)

                # merge the flowamount columns
                fbs['FlowAmountRatio'] = fbs['FlowAmountRatio_x'].fillna(fbs['FlowAmountRatio_y'])
                fbs['FlowAmountRatio'] = fbs['FlowAmountRatio'].fillna(0)

                # calculate flow amounts for each sector
                log.info("Calculating new flow amounts using flow ratios")
                fbs['FlowAmount'] = fbs['FlowAmount'] * fbs['FlowAmountRatio']

                # drop columns
                log.info("Cleaning up new flow by sector")
                fbs = fbs.drop(columns=['Sector_x', 'FlowAmountRatio_x', 'Sector_y', 'FlowAmountRatio_y',
                                        'FlowAmountRatio', 'ActivityProducedBy', 'ActivityConsumedBy'])

            # rename flow name to flowable
            fbs = fbs.rename(columns={"FlowName": 'Flowable',
                                      "Compartment": "Context"
                                      })

            # drop rows where flowamount = 0 (although this includes dropping suppressed data)
            fbs = fbs[fbs['FlowAmount'] != 0].reset_index(drop=True)
            # add missing data columns
            fbs = add_missing_flow_by_fields(fbs, flow_by_sector_fields)
            # fill null values
            fbs = fbs.fillna(value=fbs_fill_na_dict)

            # aggregate df geographically, if necessary
            log.info("Aggregating flowbysector to " + method['target_geoscale'] + " level")
            if fips_number_key[v['geoscale_to_use']] < fips_number_key[attr['allocation_from_scale']]:
                from_scale = v['geoscale_to_use']
            else:
                from_scale = attr['allocation_from_scale']

            to_scale = method['target_geoscale']

            fbs = agg_by_geoscale(fbs, from_scale, to_scale, fbs_default_grouping_fields, names)

            # aggregate data to every sector level
            log.info("Aggregating flowbysector to " + method['target_sector_level'])
            fbs = sector_aggregation(fbs, fbs_default_grouping_fields)

            # test agg by sector
            sector_agg_comparison = sector_flow_comparision(fbs)

            # return sector level specified in method yaml
            # load the crosswalk linking sector lengths
            cw = load_sector_length_crosswalk()
            sector_list = cw[method['target_sector_level']].unique().tolist()

            # add any non-NAICS sectors used with NAICS
            household = load_household_sector_codes()
            household = household.loc[household['NAICS_Level_to_Use_For'] == method['target_sector_level']]
            # add household sector to sector list
            sector_list.extend(household['Code'].tolist())
            # subset df
            fbs = fbs.loc[(fbs[fbs_activity_fields[0]].isin(sector_list)) |
                          (fbs[fbs_activity_fields[1]].isin(sector_list))].reset_index(drop=True)

            # add any missing columns of data and cast to appropriate data type
            fbs = add_missing_flow_by_fields(fbs, flow_by_sector_fields)

            log.info("Completed flowbysector for activity subset with flows " + ', '.join(map(str, names)))
            fbss.append(fbs)
    # create single df of all activities
    fbss = pd.concat(fbss, ignore_index=True, sort=False)
    # aggregate df as activities might have data for the same specified sector length
    fbss = aggregator(fbss, fbs_default_grouping_fields)
    # sort df
    fbss = fbss.sort_values(
        ['SectorProducedBy', 'SectorConsumedBy', 'Flowable', 'Context']).reset_index(drop=True)
    # save parquet file
    store_flowbysector(fbss, method_name)
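
# A hedged invocation sketch for the pipeline above; the method yaml name
# below is hypothetical, not one confirmed by this document:
if __name__ == '__main__':
    main(method_name='Water_national_2015_m1')  # hypothetical method yaml name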
Example no. 25
def disaggregate_pastureland(fba_w_sector, attr, method, years_list, sector_column):
    """
    The USDA CoA Cropland irrigated pastureland data only links to the 3 digit NAICS '112'. This function uses state
    level CoA 'Land in Farms' to allocate the county level acreage data to 6 digit NAICS.
    :param fba_w_sector: The CoA Cropland dataframe after linked to sectors
    :param attr:
    :param years_list:
    :param sector_column: The sector column on which to make df modifications (SectorProducedBy or SectorConsumedBy)
    :return: The CoA cropland dataframe with disaggregated pastureland data
    """

    import flowsa
    from flowsa.flowbyfunctions import allocate_by_sector, clean_df, flow_by_activity_fields, \
        fba_fill_na_dict, replace_strings_with_NoneType, replace_NoneType_with_empty_cells, \
        fba_mapped_default_grouping_fields, harmonize_units
    from flowsa.mapping import add_sectors_to_flowbyactivity

    # tmp drop NoneTypes
    fba_w_sector = replace_NoneType_with_empty_cells(fba_w_sector)

    # subset the coa data so only pastureland
    p = fba_w_sector.loc[fba_w_sector[sector_column].apply(lambda x: x[0:3]) == '112'].reset_index(drop=True)
    if len(p) != 0:
        # add temp loc column for state fips
        p = p.assign(Location_tmp=p['Location'].apply(lambda x: x[0:2]))
        df_sourcename = pd.unique(p['SourceName'])[0]

        # load usda coa cropland naics
        df_class = ['Land']
        df_years = years_list
        df_allocation = 'USDA_CoA_Cropland_NAICS'
        df_f = flowsa.getFlowByActivity(flowclass=df_class, years=df_years, datasource=df_allocation)
        df_f = clean_df(df_f, flow_by_activity_fields, fba_fill_na_dict)
        df_f = harmonize_units(df_f)
        # subset to land in farms data
        df_f = df_f[df_f['FlowName'] == 'FARM OPERATIONS']
        # subset to rows related to pastureland
        df_f = df_f.loc[df_f['ActivityConsumedBy'].apply(lambda x: x[0:3]) == '112']
        # drop rows with '&'
        df_f = df_f[~df_f['ActivityConsumedBy'].str.contains('&')]
        # create sector columns
        df_f = add_sectors_to_flowbyactivity(df_f, sectorsourcename=method['target_sector_source'])
        # create proportional ratios
        group_cols = fba_mapped_default_grouping_fields
        group_cols = [e for e in group_cols if
                      e not in ('ActivityProducedBy', 'ActivityConsumedBy')]
        df_f = allocate_by_sector(df_f, df_sourcename, df_allocation, 'proportional', group_cols)
        # tmp drop NoneTypes
        df_f = replace_NoneType_with_empty_cells(df_f)
        # drop naics = '11'
        df_f = df_f[df_f[sector_column] != '11']
        # drop 000 in location
        df_f = df_f.assign(Location=df_f['Location'].apply(lambda x: x[0:2]))

        # merge the coa pastureland data with land in farm data
        df = p.merge(df_f[[sector_column, 'Location', 'FlowAmountRatio']], how='left',
                     left_on="Location_tmp", right_on="Location")
        # multiply the flowamount by the flowratio
        df.loc[:, 'FlowAmount'] = df['FlowAmount'] * df['FlowAmountRatio']
        # drop columns and rename
        df = df.drop(columns=['Location_tmp', sector_column + '_x', 'Location_y', 'FlowAmountRatio'])
        df = df.rename(columns={sector_column + '_y': sector_column,
                                "Location_x": 'Location'})

        # drop rows where sector = 112 and then concat with original fba_w_sector
        fba_w_sector = fba_w_sector[fba_w_sector[sector_column].apply(lambda x: x[0:3]) != '112'].reset_index(drop=True)
        fba_w_sector = pd.concat([fba_w_sector, df], sort=True).reset_index(drop=True)

        # fill empty cells with NoneType
        fba_w_sector = replace_strings_with_NoneType(fba_w_sector)

    return fba_w_sector
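
# A hedged call sketch. The method key mirrors the 'target_sector_source'
# lookup inside the function; the attr, year, and sector-source values are
# assumptions (attr keys are not read directly in the body shown above):
fba_disagg = disaggregate_pastureland(
    fba_w_sector,
    attr={},  # assumed: no attr keys needed here
    method={'target_sector_source': 'NAICS_2012_Code'},  # assumed sector source
    years_list=[2017],  # hypothetical CoA year
    sector_column='SectorConsumedBy')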
Example no. 26
def allocation_helper(df_w_sector, method, attr, v):
    """
    Used when two df required to create allocation ratio
    :param df_w_sector:
    :param method: currently written for 'multiplication' and 'proportional'
    :param attr:
    :return:
    """

    from flowsa.Blackhurst_IO import scale_blackhurst_results_to_usgs_values
    from flowsa.BLS_QCEW import clean_bls_qcew_fba, bls_clean_allocation_fba_w_sec
    from flowsa.mapping import add_sectors_to_flowbyactivity

    helper_allocation = flowsa.getFlowByActivity(flowclass=[attr['helper_source_class']],
                                                 datasource=attr['helper_source'],
                                                 years=[attr['helper_source_year']])
    if 'clean_helper_fba' in attr:
        log.info("Cleaning " + attr['helper_source'] + ' FBA')
        # tmp hard coded - need to generalize
        if attr['helper_source'] == 'BLS_QCEW':
            helper_allocation = clean_bls_qcew_fba(helper_allocation, attr)
            # helper_allocation = getattr(sys.modules[__name__], attr["clean_helper_fba"])(helper_allocation, attr)
    # clean df
    helper_allocation = clean_df(helper_allocation, flow_by_activity_fields, fba_fill_na_dict)
    # drop rows with flowamount = 0
    helper_allocation = helper_allocation[helper_allocation['FlowAmount'] != 0]

    # aggregate or filter the helper data, as necessary
    # determine the scale to aggregate to
    to_scale = min(fips_number_key[attr['allocation_from_scale']],
                   fips_number_key[v['geoscale_to_use']])
    if fips_number_key[attr['helper_from_scale']] > to_scale:
        helper_allocation = agg_by_geoscale(
            helper_allocation, attr['helper_from_scale'],
            list(fips_number_key.keys())[list(fips_number_key.values()).index(to_scale)],
            fba_default_grouping_fields)
    else:
        helper_allocation = filter_by_geoscale(helper_allocation, attr['helper_from_scale'])

    # assign naics to allocation dataset
    helper_allocation = add_sectors_to_flowbyactivity(helper_allocation,
                                                      sectorsourcename=method['target_sector_source'])

    # generalize activity field names to enable link to water withdrawal table
    helper_allocation = generalize_activity_field_names(helper_allocation)
    # clean up helper fba with sec
    if 'clean_helper_fba_wsec' in attr:
        log.info("Cleaning " + attr['helper_source'] + ' FBA with sectors')
        # tmp hard coded - need to generalize
        if attr['helper_source'] == 'BLS_QCEW':
            helper_allocation = bls_clean_allocation_fba_w_sec(helper_allocation, attr, method)
            # helper_allocation = getattr(sys.modules[__name__], attr["clean_helper_fba_wsec"])(helper_allocation, attr, method)
    # drop columns
    helper_allocation = helper_allocation.drop(columns=['Activity', 'Min', 'Max'])

    if attr['helper_method'] == 'proportional':
        # if calculating proportions, first subset the helper allocation df to
        # only contain relevant sectors: create a list of sectors in the flow
        # df and drop any helper rows whose sectors aren't in that list
        sector_list = df_w_sector['Sector'].unique().tolist()
        # subset fba allocation table to the values in the activity list, based on overlapping sectors
        helper_allocation = helper_allocation.loc[helper_allocation['Sector'].isin(sector_list)]
        # calculate proportional ratios
        helper_allocation = proportional_allocation_by_location_and_sector(helper_allocation, 'Sector')

    # rename column
    helper_allocation = helper_allocation.rename(columns={"FlowAmount": 'HelperFlow'})
    merge_columns = [e for e in ['Location', 'Sector', 'HelperFlow', 'FlowAmountRatio']
                     if e in helper_allocation.columns.values.tolist()]

    # merge allocation df with helper df based on sectors, depending on geo scales of dfs
    if (attr['helper_from_scale'] == 'state') and (attr['allocation_from_scale'] == 'county'):
        helper_allocation.loc[:, 'Location_tmp'] = helper_allocation['Location'].apply(lambda x: x[0:2])
        df_w_sector.loc[:, 'Location_tmp'] = df_w_sector['Location'].apply(lambda x: x[0:2])
        merge_columns.append('Location_tmp')
        modified_fba_allocation = df_w_sector.merge(helper_allocation[merge_columns], how='left')
        modified_fba_allocation = modified_fba_allocation.drop(columns=['Location_tmp'])
    else:
        modified_fba_allocation = df_w_sector.merge(helper_allocation[merge_columns], how='left')

    # modify flow amounts using helper data
    if 'multiplication' in attr['helper_method']:
        # todo: modify so if missing data, replaced with value from one geoscale up instead of national
        # todo: modify year after merge if necessary
        # if missing values (na or 0), replace with national level values
        replacement_values = helper_allocation[helper_allocation['Location'] == US_FIPS].reset_index(
            drop=True)
        replacement_values = replacement_values.rename(columns={"HelperFlow": 'ReplacementValue'})
        modified_fba_allocation = modified_fba_allocation.merge(
            replacement_values[['Sector', 'ReplacementValue']], how='left')
        modified_fba_allocation.loc[:, 'HelperFlow'] = modified_fba_allocation['HelperFlow'].fillna(
            modified_fba_allocation['ReplacementValue'])
        modified_fba_allocation.loc[:, 'HelperFlow'] = np.where(modified_fba_allocation['HelperFlow'] == 0,
                                                                modified_fba_allocation['ReplacementValue'],
                                                                modified_fba_allocation['HelperFlow'])

        # replace missing helper flow values with 0 so the multiplied flow
        # amount is not incorrectly associated with the new unit
        modified_fba_allocation['HelperFlow'] = \
            modified_fba_allocation['HelperFlow'].fillna(value=0)
        modified_fba_allocation.loc[:, 'FlowAmount'] = \
            modified_fba_allocation['FlowAmount'] * modified_fba_allocation['HelperFlow']
        # drop columns
        modified_fba_allocation = modified_fba_allocation.drop(columns=["HelperFlow", 'ReplacementValue'])

    elif attr['helper_method'] == 'proportional':
        modified_fba_allocation['FlowAmountRatio'] = modified_fba_allocation['FlowAmountRatio'].fillna(0)
        modified_fba_allocation.loc[:, 'FlowAmount'] = modified_fba_allocation['FlowAmount'] * \
                                                       modified_fba_allocation['FlowAmountRatio']
        modified_fba_allocation = modified_fba_allocation.drop(columns=["HelperFlow", 'FlowAmountRatio'])

    # drop rows of 0
    modified_fba_allocation = modified_fba_allocation[modified_fba_allocation['FlowAmount'] != 0].reset_index(drop=True)

    # todo: change units
    modified_fba_allocation.loc[modified_fba_allocation['Unit'] == 'gal/employee', 'Unit'] = 'gal'

    # option to scale up fba values
    if 'scaled' in attr['helper_method']:
        log.info("Scaling " + attr['helper_source'] + ' to FBA values')
        # tmp hard coded - need to generalize
        if attr['helper_source'] == 'BLS_QCEW':
            modified_fba_allocation = scale_blackhurst_results_to_usgs_values(modified_fba_allocation, attr)
            # modified_fba_allocation = getattr(sys.modules[__name__], attr["scale_helper_results"])(modified_fba_allocation, attr)

    return modified_fba_allocation
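
# Because the helper_method checks above use substring membership
# ('multiplication' in ..., 'scaled' in ...), a single combined string can
# drive both the multiplication and the scaling branches. A hedged attr
# sketch (all values are assumptions, not confirmed method yaml contents):
attr = {'helper_method': 'multiplication scaled',  # hypothetical combined value
        'helper_source': 'BLS_QCEW',
        'helper_source_class': 'Employment',       # assumed class
        'helper_source_year': 2015,                # hypothetical year
        'helper_from_scale': 'state',
        'allocation_from_scale': 'county'}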
Example no. 27
def main(method_name):
    """
    Creates a flowbysector dataset
    :param method_name: Name of method corresponding to flowbysector method yaml name
    :return: flowbysector
    """

    log.info("Initiating flowbysector creation for " + method_name)
    # load the flowbysector method yaml
    method = load_method(method_name)
    # create dictionary of data and allocation datasets
    fb = method['source_names']
    # Create empty list for storing fbs files
    fbss = []
    for k, v in fb.items():
        # pull fba data for allocation
        flows = load_source_dataframe(k, v)

        if v['data_format'] == 'FBA':
            # clean up fba, if specified in yaml
            if v["clean_fba_df_fxn"] != 'None':
                log.info("Cleaning up " + k + " FlowByActivity")
                flows = getattr(sys.modules[__name__], v["clean_fba_df_fxn"])(flows)

            flows = clean_df(flows, flow_by_activity_fields, fba_fill_na_dict)

            # create dictionary of allocation datasets for different activities
            activities = v['activity_sets']
            # subset activity data and allocate to sector
            for aset, attr in activities.items():
                # subset by named activities
                names = attr['names']
                log.info("Preparing to handle subset of flownames " + ', '.join(map(str, names)) + " in " + k)

                # check if flowbyactivity data exists at specified geoscale to use
                flow_subset_list = []
                for n in names:
                    # subset usgs data by activity
                    flow_subset = flows[(flows[fba_activity_fields[0]] == n) |
                                        (flows[fba_activity_fields[1]] == n)].reset_index(drop=True)
                    log.info("Checking if flowbyactivity data exists for " + n + " at the " +
                             v['geoscale_to_use'] + ' level')
                    geocheck = check_if_data_exists_at_geoscale(flow_subset, v['geoscale_to_use'], activitynames=n)
                    # aggregate geographically to the scale of the allocation dataset
                    if geocheck == "Yes":
                        activity_from_scale = v['geoscale_to_use']
                    else:
                        # if activity does not exist at specified geoscale, issue warning and use data at less aggregated
                        # geoscale, and sum to specified geoscale
                        log.info("Checking if flowbyactivity data exists for " + n + " at a less aggregated level")
                        activity_from_scale = check_if_data_exists_at_less_aggregated_geoscale(flow_subset,
                                                                                               v['geoscale_to_use'], n)

                    activity_to_scale = attr['allocation_from_scale']
                    # if df is less aggregated than allocation df, aggregate usgs activity to allocation geoscale
                    if fips_number_key[activity_from_scale] > fips_number_key[activity_to_scale]:
                        log.info("Aggregating subset from " + activity_from_scale + " to " + activity_to_scale)
                        flow_subset = agg_by_geoscale(flow_subset, activity_from_scale, activity_to_scale,
                                                      fba_default_grouping_fields, n)
                    # else, aggregate to the geoscale we want to use
                    elif fips_number_key[activity_from_scale] > fips_number_key[v['geoscale_to_use']]:
                        log.info("Aggregating subset from " + activity_from_scale + " to " + v['geoscale_to_use'])
                        flow_subset = agg_by_geoscale(flow_subset, activity_from_scale, v['geoscale_to_use'],
                                                      fba_default_grouping_fields, n)
                    # else, if usgs is more aggregated than allocation table, filter relevant rows
                    else:
                        log.info("Subsetting " + activity_from_scale + " data")
                        flow_subset = filter_by_geoscale(flow_subset, activity_from_scale, n)

                    # Add sectors to df activity, depending on level of specified sector aggregation
                    log.info("Adding sectors to " + k + " for " + n)
                    flow_subset_wsec = add_sectors_to_flowbyactivity(flow_subset,
                                                                     sectorsourcename=method['target_sector_source'],
                                                                     levelofSectoragg=attr['activity_sector_aggregation'])
                    flow_subset_list.append(flow_subset_wsec)
                flow_subset_wsec = pd.concat(flow_subset_list, sort=False).reset_index(drop=True)

                # clean up fba with sectors, if specified in yaml
                if v["clean_fba_w_sec_df_fxn"] != 'None':
                    log.info("Cleaning up " + k + " FlowByActivity with sectors")
                    flow_subset_wsec = getattr(sys.modules[__name__], v["clean_fba_w_sec_df_fxn"])(flow_subset_wsec, attr)

                # map df to the federal elementary flow list
                log.info("Mapping flows in " + k + ' to federal elementary flow list')
                flow_subset_wsec = map_elementary_flows(flow_subset_wsec, k)

                # if allocation method is "direct", then no need to create alloc ratios, else need to use allocation
                # dataframe to create sector allocation ratios
                if attr['allocation_method'] == 'direct':
                    log.info('Directly assigning ' + ', '.join(map(str, names)) + ' to sectors')
                    fbs = flow_subset_wsec.copy()

                else:
                    # determine appropriate allocation dataset
                    log.info("Loading allocation flowbyactivity " + attr['allocation_source'] + " for year " +
                             str(attr['allocation_source_year']))
                    fba_allocation = flowsa.getFlowByActivity(flowclass=[attr['allocation_source_class']],
                                                              datasource=attr['allocation_source'],
                                                              years=[attr['allocation_source_year']]).reset_index(drop=True)

                    fba_allocation = clean_df(fba_allocation, flow_by_activity_fields, fba_fill_na_dict)

                    # subset based on yaml settings
                    if attr['allocation_flow'] != 'None':
                        fba_allocation = fba_allocation.loc[fba_allocation['FlowName'].isin(attr['allocation_flow'])]
                    if attr['allocation_compartment'] != 'None':
                        fba_allocation = fba_allocation.loc[
                            fba_allocation['Compartment'].isin(attr['allocation_compartment'])]
                    # cleanup the fba allocation df, if necessary
                    if 'clean_allocation_fba' in attr:
                        log.info("Cleaning " + attr['allocation_source'])
                        fba_allocation = getattr(sys.modules[__name__],
                                                 attr["clean_allocation_fba"])(fba_allocation)
                    # reset index
                    fba_allocation = fba_allocation.reset_index(drop=True)

                    # check if allocation data exists at specified geoscale to use
                    log.info("Checking if allocation data exists at the " + attr['allocation_from_scale'] + " level")
                    check_if_data_exists_at_geoscale(fba_allocation, attr['allocation_from_scale'])

                    # aggregate geographically to the scale of the flowbyactivty source, if necessary
                    from_scale = attr['allocation_from_scale']
                    to_scale = v['geoscale_to_use']
                    # if allocation df is less aggregated than FBA df, aggregate allocation df to target scale
                    if fips_number_key[from_scale] > fips_number_key[to_scale]:
                        fba_allocation = agg_by_geoscale(fba_allocation, from_scale, to_scale,
                                                         fba_default_grouping_fields, names)
                    # else, if usgs is more aggregated than allocation table, use usgs as both to and from scale
                    else:
                        fba_allocation = filter_by_geoscale(fba_allocation, from_scale, names)

                    # assign sector to allocation dataset
                    log.info("Adding sectors to " + attr['allocation_source'])
                    fba_allocation = add_sectors_to_flowbyactivity(fba_allocation,
                                                                   sectorsourcename=method['target_sector_source'],
                                                                   levelofSectoragg=attr['allocation_sector_aggregation'])

                    # subset fba datasets to only keep the sectors associated with activity subset
                    log.info("Subsetting " + attr['allocation_source'] + " for sectors in " + k)
                    fba_allocation_subset = get_fba_allocation_subset(fba_allocation, k, names)

                    # generalize activity field names to enable link to main fba source
                    log.info("Generalizing activity columns in subset of " + attr['allocation_source'])
                    fba_allocation_subset = generalize_activity_field_names(fba_allocation_subset)
                    # drop columns
                    fba_allocation_subset = fba_allocation_subset.drop(columns=['Activity'])

                    # call on fxn to further disaggregate the fba allocation data, if exists
                    if 'allocation_disaggregation_fxn' in attr:
                        log.info("Futher disaggregating sectors in " + attr['allocation_source'])
                        fba_allocation_subset = getattr(sys.modules[__name__],
                                                        attr["allocation_disaggregation_fxn"])(fba_allocation_subset, attr)

                    # if there is an allocation helper dataset, modify allocation df
                    if attr['allocation_helper'] == 'yes':
                        log.info("Using the specified allocation help for subset of " + attr['allocation_source'])
                        fba_allocation_subset = allocation_helper(fba_allocation_subset, method, attr)

                    # create flow allocation ratios
                    log.info("Creating allocation ratios for " + attr['allocation_source'])
                    flow_allocation = allocate_by_sector(fba_allocation_subset, attr['allocation_method'])

                    # create list of sectors in the flow allocation df, drop any rows
                    # of data in the flow df that aren't in list
                    sector_list = flow_allocation['Sector'].unique().tolist()

                    # subset fba allocation table to the values in the activity list, based on overlapping sectors
                    flow_subset_wsec = flow_subset_wsec.loc[
                        (flow_subset_wsec[fbs_activity_fields[0]].isin(sector_list)) |
                        (flow_subset_wsec[fbs_activity_fields[1]].isin(sector_list))]

                    # check if fba and allocation dfs have the same LocationSystem
                    log.info("Checking if flowbyactivity and allocation dataframes use the same location systems")
                    check_if_location_systems_match(flow_subset_wsec, flow_allocation)

                    # merge fba df w/flow allocation dataset
                    log.info("Merge " + k + " and subset of " + attr['allocation_source'])
                    fbs = flow_subset_wsec.merge(
                        flow_allocation[['Location', 'Sector', 'FlowAmountRatio']],
                        left_on=['Location', 'SectorProducedBy'],
                        right_on=['Location', 'Sector'], how='left')

                    fbs = fbs.merge(
                        flow_allocation[['Location', 'Sector', 'FlowAmountRatio']],
                        left_on=['Location', 'SectorConsumedBy'],
                        right_on=['Location', 'Sector'], how='left')

                    # merge the flowamount columns
                    fbs.loc[:, 'FlowAmountRatio'] = fbs['FlowAmountRatio_x'].fillna(fbs['FlowAmountRatio_y'])

                    # check if fba and alloc dfs have data for same geoscales - comment back in after address the 'todo'
                    # log.info("Checking if flowbyactivity and allocation dataframes have data at the same locations")
                    # check_if_data_exists_for_same_geoscales(fbs, k, attr['names'])

                    # drop rows where there is no allocation data
                    fbs = fbs.dropna(subset=['Sector_x', 'Sector_y'], how='all').reset_index(drop=True)

                    # calculate flow amounts for each sector
                    log.info("Calculating new flow amounts using flow ratios")
                    fbs.loc[:, 'FlowAmount'] = fbs['FlowAmount'] * fbs['FlowAmountRatio']

                    # drop columns
                    log.info("Cleaning up new flow by sector")
                    fbs = fbs.drop(columns=['Sector_x', 'FlowAmountRatio_x', 'Sector_y', 'FlowAmountRatio_y',
                                            'FlowAmountRatio', 'ActivityProducedBy', 'ActivityConsumedBy'])

                # drop rows where flowamount = 0 (although this includes dropping suppressed data)
                fbs = fbs[fbs['FlowAmount'] != 0].reset_index(drop=True)

                # clean df
                fbs = clean_df(fbs, flow_by_sector_fields, fbs_fill_na_dict)

                # aggregate df geographically, if necessary
                log.info("Aggregating flowbysector to " + method['target_geoscale'] + " level")
                if fips_number_key[v['geoscale_to_use']] < fips_number_key[attr['allocation_from_scale']]:
                    from_scale = v['geoscale_to_use']
                else:
                    from_scale = attr['allocation_from_scale']

                to_scale = method['target_geoscale']

                fbs = agg_by_geoscale(fbs, from_scale, to_scale, fbs_default_grouping_fields, names)

                # aggregate data to every sector level
                log.info("Aggregating flowbysector to all sector levels")
                fbs = sector_aggregation(fbs, fbs_default_grouping_fields)
                # add missing naics5/6 when only one naics5/6 associated with a naics4
                fbs = sector_disaggregation(fbs)

                # test agg by sector
                # sector_agg_comparison = sector_flow_comparision(fbs)

                # return sector level specified in method yaml
                # load the crosswalk linking sector lengths
                sector_list = get_sector_list(method['target_sector_level'])
                # add any non-NAICS sectors used with NAICS
                sector_list = add_non_naics_sectors(sector_list, method['target_sector_level'])

                # subset df, necessary because not all of the sectors are NAICS and can get duplicate rows
                fbs_1 = fbs.loc[(fbs[fbs_activity_fields[0]].isin(sector_list)) &
                                (fbs[fbs_activity_fields[1]].isin(sector_list))].reset_index(drop=True)
                fbs_2 = fbs.loc[(fbs[fbs_activity_fields[0]].isin(sector_list)) |
                                (fbs[fbs_activity_fields[1]].isin(sector_list))].reset_index(drop=True)
                fbs_sector_subset = pd.concat([fbs_1, fbs_2], sort=False)

                # set source name
                fbs_sector_subset.loc[:, 'SectorSourceName'] = method['target_sector_source']

                log.info("Completed flowbysector for activity subset with flows " + ', '.join(map(str, names)))
                fbss.append(fbs_sector_subset)
        else:
            # if the loaded flow df is already in FBS format, append directly to list of FBS
            log.info("Append " + k + " to FBS list")
            fbss.append(flows)
    # create single df of all activities
    log.info("Concat data for all activities")
    fbss = pd.concat(fbss, ignore_index=True, sort=False)
    log.info("Clean final dataframe")
    # aggregate df as activities might have data for the same specified sector length
    fbss = aggregator(fbss, fbs_default_grouping_fields)
    # sort df
    log.info("Sort and store dataframe")
    fbss = fbss.replace({'nan': None})
    # add missing fields, ensure correct data type, reorder columns
    fbss = clean_df(fbss, flow_by_sector_fields, fbs_fill_na_dict)
    fbss = fbss.sort_values(
        ['SectorProducedBy', 'SectorConsumedBy', 'Flowable', 'Context']).reset_index(drop=True)
    # save parquet file
    store_flowbysector(fbss, method_name)
Example no. 28
def disaggregate_cropland(fba_w_sector, attr, method, years_list, sector_column):
    """
    In the event there are 4 (or 5) digit naics for cropland at the county level, use state level harvested cropland to
    create ratios
    :param fba_w_sector:
    :param attr:
    :param years_list:
    :param sector_column: The sector column on which to make df modifications (SectorProducedBy or SectorConsumedBy)
    :param attr:
    :return:
    """

    import flowsa
    from flowsa.flowbyfunctions import sector_aggregation,\
        fbs_default_grouping_fields, clean_df, fba_fill_na_dict, fbs_fill_na_dict, add_missing_flow_by_fields,\
        sector_disaggregation, sector_ratios, replace_strings_with_NoneType, replace_NoneType_with_empty_cells,\
        harmonize_units
    from flowsa.mapping import add_sectors_to_flowbyactivity

    # tmp drop NoneTypes
    fba_w_sector = replace_NoneType_with_empty_cells(fba_w_sector)

    # drop pastureland data
    crop = fba_w_sector.loc[fba_w_sector[sector_column].apply(lambda x: x[0:3]) != '112'].reset_index(drop=True)
    # drop sectors < 4 digits
    crop = crop[crop[sector_column].apply(lambda x: len(x) > 3)].reset_index(drop=True)
    # create tmp location
    crop = crop.assign(Location_tmp=crop['Location'].apply(lambda x: x[0:2]))

    # load the relevant state level harvested cropland by naics
    naics_load = flowsa.getFlowByActivity(flowclass=['Land'],
                                          years=years_list,
                                          datasource="USDA_CoA_Cropland_NAICS").reset_index(drop=True)
    # clean df
    naics = clean_df(naics_load, flow_by_activity_fields, fba_fill_na_dict)
    naics = harmonize_units(naics)
    # subset the harvested cropland by naics
    naics = naics[naics['FlowName'] == 'AG LAND, CROPLAND, HARVESTED'].reset_index(drop=True)
    # drop the activities that include '&'
    naics = naics[~naics['ActivityConsumedBy'].str.contains('&')].reset_index(drop=True)
    # add sectors
    naics = add_sectors_to_flowbyactivity(naics, sectorsourcename=method['target_sector_source'])
    # add missing fbs fields
    naics = clean_df(naics, flow_by_sector_fields, fbs_fill_na_dict)

    # aggregate sectors to create any missing naics levels
    group_cols = fbs_default_grouping_fields
    naics2 = sector_aggregation(naics, group_cols)
    # add missing naics5/6 when only one naics5/6 associated with a naics4
    naics3 = sector_disaggregation(naics2, group_cols)
    # drop rows where FlowAmount is 0
    naics3 = naics3.loc[naics3['FlowAmount'] != 0]
    # calculate each sector's share of its parent sector's flow within a location
    naics4 = sector_ratios(naics3, sector_column)
    # create temporary sector column to match the two dfs on
    naics4 = naics4.assign(Location_tmp=naics4['Location'].apply(lambda x: x[0:2]))
    # tmp drop Nonetypes
    naics4 = replace_NoneType_with_empty_cells(naics4)

    # loop through naics lengths (i = 4, 5) to disaggregate naics4 to naics5 and naics5 to naics6
    for i in range(4, 6):
        # subset df to sectors with length = i and length = i + 1
        crop_subset = crop.loc[crop[sector_column].apply(lambda x: i+1 >= len(x) >= i)]
        crop_subset = crop_subset.assign(Sector_tmp=crop_subset[sector_column].apply(lambda x: x[0:i]))
        # where a location/parent-sector pair is duplicated, drop all of its rows,
        # so only parents without existing child sectors are disaggregated
        df = crop_subset.drop_duplicates(subset=['Location', 'Sector_tmp'], keep=False).reset_index(drop=True)
        # drop sector temp column
        df = df.drop(columns=["Sector_tmp"])
        # subset df to keep the sectors of length i
        df_subset = df.loc[df[sector_column].apply(lambda x: len(x) == i)]
        # subset the naics df where naics length is i + 1
        naics_subset = naics4.loc[naics4[sector_column].apply(lambda x: len(x) == i+1)].reset_index(drop=True)
        naics_subset = naics_subset.assign(Sector_tmp=naics_subset[sector_column].apply(lambda x: x[0:i]))
        # merge the two df based on locations
        df_subset = pd.merge(df_subset,
                             naics_subset[[sector_column, 'FlowAmountRatio',
                                           'Sector_tmp', 'Location_tmp']],
                             how='left', left_on=[sector_column, 'Location_tmp'],
                             right_on=['Sector_tmp', 'Location_tmp'])
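        # after the merge, each parent sector of length i carries one row per
        # child sector of length i+1 from the state-level naics df, along with
        # the child's state-level share (FlowAmountRatio) of the parent flow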
        # create flow amounts for the new NAICS based on the flow ratio
        df_subset.loc[:, 'FlowAmount'] = df_subset['FlowAmount'] * df_subset['FlowAmountRatio']
        # drop rows of 0 and na
        df_subset = df_subset[df_subset['FlowAmount'] != 0]
        df_subset = df_subset[~df_subset['FlowAmount'].isna()].reset_index(drop=True)
        # drop columns
        df_subset = df_subset.drop(columns=[sector_column + '_x', 'FlowAmountRatio', 'Sector_tmp'])
        # rename columns
        df_subset = df_subset.rename(columns={sector_column + '_y': sector_column})
        # tmp drop Nonetypes
        df_subset = replace_NoneType_with_empty_cells(df_subset)
        # add new rows of data to crop df
        crop = pd.concat([crop, df_subset], sort=True).reset_index(drop=True)

    # clean up df
    crop = crop.drop(columns=['Location_tmp'])

    # pasture data
    pasture = fba_w_sector.loc[fba_w_sector[sector_column].apply(lambda x: x[0:3]) == '112'].reset_index(drop=True)
    # concat crop and pasture
    fba_w_sector = pd.concat([pasture, crop], sort=True).reset_index(drop=True)

    # fill empty cells with NoneType
    fba_w_sector = replace_strings_with_NoneType(fba_w_sector)

    return fba_w_sector
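
The state-ratio disaggregation above is easier to follow on toy numbers. Below is a minimal, self-contained sketch, independent of flowsa, in which all column names and values are illustrative; it applies state-level child-sector shares to a county-level parent total:

import pandas as pd

# county-level parent sector (naics4) lacking naics5 detail
county = pd.DataFrame({'Location': ['01001'],
                       'ParentSector': ['1111'],
                       'FlowAmount': [100.0]})

# state-level share of each naics5 child within its naics4 parent
ratios = pd.DataFrame({'State': ['01', '01'],
                       'ParentSector': ['1111', '1111'],
                       'Sector': ['11111', '11112'],
                       'FlowAmountRatio': [0.75, 0.25]})

# the state is the first two digits of the county FIPS
county = county.assign(State=county['Location'].str[0:2])

# attach each child sector and its state-level share to the county parent row
merged = county.merge(ratios, on=['ParentSector', 'State'])

# scale the county total by each child's share
merged['FlowAmount'] = merged['FlowAmount'] * merged['FlowAmountRatio']
print(merged[['Location', 'Sector', 'FlowAmount']])
#   Location Sector  FlowAmount
# 0    01001  11111        75.0
# 1    01001  11112        25.0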
"""
Write the csv called on in flowbysectormethods yaml files for land use related EIA CBECS
"""

import flowsa
from flowsa.common import flowbysectoractivitysetspath

as_year = '2012'

if __name__ == '__main__':

    # define cbecs land fba parameters
    datasource = 'EIA_CBECS_Land'

    # pull the EIA CBECS Land FBA
    df_import = flowsa.getFlowByActivity(datasource, as_year)

    # drop unused columns
    df = df_import[['ActivityConsumedBy']].drop_duplicates()
    # drop 'All buildings' and the aggregate categories 'Mercantile' and
    # 'Health care' to avoid double counting
    df = df[~df['ActivityConsumedBy'].
            isin(['All buildings', 'Mercantile', 'Health care'])]

    # rename columns
    df = df.rename(columns={"ActivityConsumedBy": "name"})

    # assign column values
    df = df.assign(activity_set='activity_set_1')
    df = df.assign(note='')

    # reorder dataframe columns
    df = df[['activity_set', 'name', 'note']]

    # write the activity set csv; the source snippet is truncated here, so the
    # output file name below is a presumed completion following the
    # datasource/year convention used for flowsa activity set files
    df.to_csv(flowbysectoractivitysetspath + datasource + '_' + as_year + '.csv',
              index=False)
Example n. 30
# __init__.py (flowsa)
#!/usr/bin/env python3
# coding=utf-8
# [email protected]
"""
Examples of use of flowsa. Read parquet files as dataframes.
    :param flowclass: list, a list of 'Class' of the flow. required. E.g. ['Water'] or ['Land', 'Other']
    :param years: list, a list of years, e.g. [2015] or [2010, 2011, 2012]
    :param datasource: str, the code of the datasource.
    :return: a pandas DataFrame in FlowByActivity format
"""
import flowsa

# "employment" based datasets
employ_bls_flowsbyactivity_2012 = flowsa.getFlowByActivity(flowclass=['Employment'], years=[2012],
                                                           datasource="BLS_QCEW")
employ_bls_flowsbyactivity_2015 = flowsa.getFlowByActivity(flowclass=['Employment', 'Money'], years=[2015],
                                                           datasource="BLS_QCEW")
employ_cbp_flowsbyactivity_2012 = flowsa.getFlowByActivity(flowclass=['Employment', 'Other'], years=[2012],
                                                           datasource="Census_CBP")


# "land" based datasets
cropland_flowsbyactivity_2017 = flowsa.getFlowByActivity(flowclass=['Land'], years=[2017],
                                                         datasource="USDA_CoA_Cropland")

# "money" based datasets
fisheries_noaa_flowsbyactivity = flowsa.getFlowByActivity(flowclass=['Money'], years=["2012-2018"],
                                                          datasource="NOAA_FisheryLandings")