Example #1
def usgs_fba_data_cleanup(df):
    """
    Clean up the dataframe to prepare for flowbysector. Used in flowbysector.py
    :param df: df, FBA format
    :return: df, modified FBA
    """

    # drop rows of commercial data (because it only exists for 3 states)
    # and causes issues because it is linked with public supply
    # also drop closed-loop or once-through cooling (thermoelectric power)
    # to avoid double counting
    vLogDetailed.info('Removing all rows for Commercial Data because it does '
                      'not exist for all states and causes issues because it '
                      'is linked with Public Supply deliveries.')
    dfa = df[~df['Description'].str.lower().str.
             contains('commercial|closed-loop cooling|once-through')]
    calculate_flowamount_diff_between_dfs(df, dfa)
    # calculate NET PUBLIC SUPPLY by subtracting out deliveries to domestic
    vLogDetailed.info('Modify the public supply values to generate '
                      'NET public supply by subtracting out deliveries '
                      'to domestic')
    dfb = calculate_net_public_supply(dfa)

    # check that golf + crop = total irrigation, if not,
    # assign all of total irrigation to crop
    vLogDetailed.info('If states do not distinguish between golf and crop '
                      'irrigation as a subset of total irrigation, assign '
                      'all of total irrigation to crop')
    dfc = check_golf_and_crop_irrigation_totals(dfb)

    # subset the national level data
    df1 = dfc[dfc['Location'] == US_FIPS]

    # drop flowname = 'total' rows when possible to prevent double counting
    # subset data where flowname = total and where it does not
    vLogDetailed.info('Drop rows where the FlowName is total to prevent '
                      'double counting at the state and county levels. '
                      'Retain rows at the national level.')
    df2 = dfc[dfc['FlowName'] == 'total']
    # set conditions for data to keep when flowname = 'total'
    c1 = df2['Location'] != US_FIPS
    c2 = (~df2['ActivityProducedBy'].isnull()) & \
         (~df2['ActivityConsumedBy'].isnull())
    # subset data
    df2 = df2[c1 & c2].reset_index(drop=True)

    # second subset doesn't have total flowname or total compartment
    df3 = dfc[dfc['FlowName'] != 'total']
    df3 = df3[df3['Compartment'] != 'total']
    df3 = df3[df3['Location'] != US_FIPS]

    # concat the three dfs
    dfd = pd.concat([df1, df2, df3], ignore_index=True, sort=False)

    # In 2015, there is data for consumptive water use for thermoelectric
    # and crop; drop it because consumptive water loss is not calculated
    # for all water categories
    dfd = dfd[dfd['Compartment'] != 'air'].reset_index(drop=True)

    return dfd
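The keep/drop logic for 'total' rows above boils down to: always keep national rows, keep sub-national 'total' rows only when both activity columns are populated, and otherwise keep only non-total rows. A minimal, standalone sketch of that boolean-mask pattern on a toy dataframe (column names mirror the FBA format; US_FIPS is assumed to be '00000', as flowsa defines it):

import pandas as pd

US_FIPS = '00000'  # assumed national FIPS code, as defined in flowsa

toy = pd.DataFrame({
    'FlowName': ['total', 'total', 'fresh'],
    'Location': ['00000', '06000', '06000'],
    'ActivityProducedBy': [None, 'Public Supply', None],
    'ActivityConsumedBy': [None, 'Domestic', 'Irrigation Crop'],
    'FlowAmount': [100.0, 40.0, 25.0],
})

# national rows are always retained
national = toy[toy['Location'] == US_FIPS]
# sub-national 'total' rows are kept only when both activity columns are populated
totals = toy[toy['FlowName'] == 'total']
keep = ((totals['Location'] != US_FIPS) &
        totals['ActivityProducedBy'].notnull() &
        totals['ActivityConsumedBy'].notnull())
totals = totals[keep].reset_index(drop=True)
# non-total rows below the national level
detail = toy[(toy['FlowName'] != 'total') & (toy['Location'] != US_FIPS)]

cleaned = pd.concat([national, totals, detail], ignore_index=True, sort=False)
print(cleaned)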
Example #2
def cbecs_land_fba_cleanup(fba_load):
    """
    Clean up the land fba for use in allocation
    :param fba_load: df, eia cbecs land flowbyactivity format
    :return: df, flowbyactivity with modified values
    """

    # estimate floor space using number of floors
    fba = calculate_floorspace_based_on_number_of_floors(fba_load)

    # calculate the land area in addition to building footprint
    fba1 = calculate_total_facility_land_area(fba)

    # drop activities of 'all buildings' to avoid double counting
    fba2 = fba1[fba1['ActivityConsumedBy'] != 'All buildings'].reset_index(
        drop=True)
    vLogDetailed.info('Drop the principal building activity "All buildings" '
                      'to avoid double counting')
    calculate_flowamount_diff_between_dfs(fba1, fba2)

    return fba2
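The essential pattern here is dropping the aggregate 'All buildings' rows so they are not counted alongside their component building types, then logging the change in total flow. A minimal sketch with a simple printed total standing in for flowsa's calculate_flowamount_diff_between_dfs:

import pandas as pd

fba1 = pd.DataFrame({
    'ActivityConsumedBy': ['All buildings', 'Office', 'Warehouse'],
    'FlowAmount': [100.0, 60.0, 40.0],
})

# drop the aggregate category so its components are not double counted
fba2 = fba1[fba1['ActivityConsumedBy'] != 'All buildings'].reset_index(drop=True)

# report the change in total flow, standing in for the validation-log comparison
print(f"total before: {fba1['FlowAmount'].sum()}, after: {fba2['FlowAmount'].sum()}")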
Example #3
def calculate_floorspace_based_on_number_of_floors(fba_load):
    """
    Estimate total floorspace for each building type based on data
    on the number of floors for each building type.
    Assumptions (Taken from Yang's static satellite tables):
    1. When floor range is 4-9, assume 6 stories
    2. When floor range is 10 or more, assume 15 stories
    :param fba_load: df, eia cbecs land flowbyactivity
    :return: df, eia cbecs land fba with estimated total floorspace
    """

    # disaggregate mercantile to malls and non-malls
    fba = disaggregate_eia_cbecs_mercentile(fba_load)
    vLogDetailed.info('Calculate floorspace for mall and nonmall buildings '
                      'with different number of floors. Once calculated, '
                      'drop mercantile data from dataframe to avoid double '
                      'counting.')
    calculate_flowamount_diff_between_dfs(fba_load, fba)

    # disaggregate other and vacant
    fba2 = disaggregate_eia_cbecs_vacant_and_other(fba)
    vLogDetailed.info('Due to data suppression for floorspace by building '
                      'number of floors, some data is lost when dropping '
                      'floorspace for all buildings within a principal '
                      'building activity. To avoid this data loss, all '
                      'remaining floorspace for "All buildings" by number of '
                      'floors is allocated to "Vacant" and "Other" principal '
                      'building activities, as these activities are allocated '
                      'to all commercial building sectors. This assumption '
                      'results in a total floorspace increase for "Vacant" '
                      'and "Other" activities.')
    calculate_flowamount_diff_between_dfs(fba, fba2)

    # drop data for 'all buildings'
    fba3 = fba2[fba2['Description'] != 'All buildings']
    # add column 'DivisionFactor' based on description
    fba3 = fba3.assign(DivisionFactor=fba3['Description'].apply(lambda x: (
        1 if 'One' in x else (2 if 'Two' in x else (3 if 'Three' in x else (
            6 if 'Four' in x else (15 if 'Ten' in x else "")))))))
    # modify flowamounts to represent building footprint rather than
    # total floorspace
    fba3['FlowAmount'] = fba3['FlowAmount'] / fba3['DivisionFactor']
    # sum values to a single flowamount for each building type
    vLogDetailed.info('Drop flows for "All Buildings" to avoid double '
                      'counting, as floorspace is maintained by building '
                      'based on number of floors. Also divide total '
                      'floorspace by number of floors to calculate a '
                      'building footprint. Calculations result in a reduced '
                      'FlowAmount for all categories.')
    calculate_flowamount_diff_between_dfs(fba2, fba3)
    # rename the FlowNames and sum to a total floorspace, rather than
    # keeping multiple rows based on number of floors
    fba3 = fba3.assign(
        FlowName=fba3['FlowName'].apply(lambda x: ','.join(x.split(',')[:-1])))
    # modify the description
    fba3 = fba3.assign(Description='Building Footprint')
    groupbycols = fba_mapped_default_grouping_fields
    fba4 = aggregator(fba3, groupbycols)

    return fba4
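The nested lambda above encodes the story assumptions from the docstring (one, two, and three floors taken literally; six stories assumed for the 4-9 range; fifteen for 10 or more). A standalone sketch of the same footprint calculation using an explicit keyword-to-stories mapping; the descriptions below are toy strings, not the actual CBECS categories:

import pandas as pd

# assumed stories per floor-range keyword, per the docstring assumptions
ASSUMED_STORIES = {'One': 1, 'Two': 2, 'Three': 3, 'Four': 6, 'Ten': 15}

def assumed_stories(description):
    """Return the assumed number of stories for a floor-range description."""
    for keyword, stories in ASSUMED_STORIES.items():
        if keyword in description:
            return stories
    return None  # unmatched descriptions are left unset

df = pd.DataFrame({
    'Description': ['One floor', 'Four to nine floors', 'Ten or more floors'],
    'FlowAmount': [1000.0, 6000.0, 15000.0],  # total floorspace
})
df['DivisionFactor'] = df['Description'].apply(assumed_stories)
# building footprint = total floorspace / assumed number of stories
df['FlowAmount'] = df['FlowAmount'] / df['DivisionFactor']
print(df)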
Example #4
def main(**kwargs):
    """
    Creates a flowbysector dataset
    :param kwargs: dictionary of arguments; 'method' is the name of the
        flowbysector method yaml, and 'download_FBAs_if_missing' optionally
        specifies whether to download any missing FlowByActivity datasets
    :return: parquet, FBS saved to local folder
    """
    if len(kwargs) == 0:
        kwargs = parse_args()

    method_name = kwargs['method']
    download_FBA_if_missing = kwargs.get('download_FBAs_if_missing')
    # assign arguments
    vLog.info("Initiating flowbysector creation for %s", method_name)
    # call on method
    method = load_yaml_dict(method_name, flowbytype='FBS')
    # create dictionary of data and allocation datasets
    fb = method['source_names']
    # Create empty list for storing fbs files
    fbs_list = []
    for k, v in fb.items():
        # pull fba data for allocation
        flows = load_source_dataframe(k, v, download_FBA_if_missing)

        if v['data_format'] == 'FBA':
            # ensure correct datatypes and that all fields exist
            flows = clean_df(flows,
                             flow_by_activity_fields,
                             fba_fill_na_dict,
                             drop_description=False)

            # clean up fba before mapping, if specified in yaml
            if "clean_fba_before_mapping_df_fxn" in v:
                vLog.info("Cleaning up %s FlowByActivity", k)
                flows = dynamically_import_fxn(
                    k, v["clean_fba_before_mapping_df_fxn"])(flows)

            # map flows to federal flow list or material flow list
            flows_mapped, mapping_files = \
                map_fbs_flows(flows, k, v, keep_fba_columns=True)

            # clean up fba, if specified in yaml
            if "clean_fba_df_fxn" in v:
                vLog.info("Cleaning up %s FlowByActivity", k)
                flows_mapped = dynamically_import_fxn(
                    k, v["clean_fba_df_fxn"])(flows_mapped)

            # if activity_sets are specified in a file, call them here
            if 'activity_set_file' in v:
                aset_names = pd.read_csv(flowbysectoractivitysetspath +
                                         v['activity_set_file'],
                                         dtype=str)
            else:
                aset_names = None

            # master list of activity names read in from data source
            ml_act = []
            # create dictionary of allocation datasets for different activities
            activities = v['activity_sets']
            # subset activity data and allocate to sector
            for aset, attr in activities.items():
                # subset by named activities
                if 'activity_set_file' in v:
                    names = \
                        aset_names[aset_names['activity_set'] == aset]['name']
                else:
                    names = attr['names']

                # to avoid double counting data from the same source, in
                # the event there are values in both the APB and ACB
                # columns, if an activity has already been read in and
                # allocated, remove that activity from the mapped flows
                # regardless of what activity set the data was read in
                flows_mapped = flows_mapped[~(
                    (flows_mapped[fba_activity_fields[0]].isin(ml_act)) |
                    (flows_mapped[fba_activity_fields[1]].isin(ml_act))
                )].reset_index(drop=True)
                ml_act.extend(names)

                vLog.info("Preparing to handle %s in %s", aset, k)
                # subset fba data by activity
                flows_subset = flows_mapped[
                    (flows_mapped[fba_activity_fields[0]].isin(names)) |
                    (flows_mapped[fba_activity_fields[1]].isin(names)
                     )].reset_index(drop=True)

                # subset by flowname if exists
                if 'source_flows' in attr:
                    flows_subset = flows_subset[flows_subset['FlowName'].isin(
                        attr['source_flows'])]
                if len(flows_subset) == 0:
                    log.warning(f"no data found for flows in {aset}")
                    continue
                if len(flows_subset[flows_subset['FlowAmount'] != 0]) == 0:
                    log.warning(f"all flow data for {aset} is 0")
                    continue
                # if activities are sector-like, check sectors are valid
                if check_activities_sector_like(k):
                    flows_subset2 = replace_naics_w_naics_from_another_year(
                        flows_subset, method['target_sector_source'])

                    # check impact on df FlowAmounts
                    vLog.info(
                        'Calculate FlowAmount difference caused by '
                        'replacing NAICS Codes with %s, saving '
                        'difference in Validation log',
                        method['target_sector_source'],
                    )
                    calculate_flowamount_diff_between_dfs(
                        flows_subset, flows_subset2)
                else:
                    flows_subset2 = flows_subset.copy()

                # extract relevant geoscale data or aggregate existing data
                flows_subset_geo = subset_df_by_geoscale(
                    flows_subset2, v['geoscale_to_use'],
                    attr['allocation_from_scale'])
                # if loading data subnational geoscale, check for data loss
                if attr['allocation_from_scale'] != 'national':
                    compare_geographic_totals(flows_subset_geo, flows_mapped,
                                              k, attr, aset, names)

                # Add sectors to df activity, depending on level
                # of specified sector aggregation
                log.info("Adding sectors to %s", k)
                flows_subset_wsec = add_sectors_to_flowbyactivity(
                    flows_subset_geo,
                    sectorsourcename=method['target_sector_source'],
                    allocationmethod=attr['allocation_method'])
                # clean up fba with sectors, if specified in yaml
                if "clean_fba_w_sec_df_fxn" in v:
                    vLog.info("Cleaning up %s FlowByActivity with sectors", k)
                    flows_subset_wsec = dynamically_import_fxn(
                        k, v["clean_fba_w_sec_df_fxn"])(flows_subset_wsec,
                                                        attr=attr,
                                                        method=method)

                # rename SourceName to MetaSources and drop columns
                flows_mapped_wsec = flows_subset_wsec.\
                    rename(columns={'SourceName': 'MetaSources'}).\
                    drop(columns=['FlowName', 'Compartment'])

                # if allocation method is "direct", then no need
                # to create alloc ratios, else need to use allocation
                # dataframe to create sector allocation ratios
                if attr['allocation_method'] == 'direct':
                    fbs = direct_allocation_method(flows_mapped_wsec, k, names,
                                                   method)
                # if allocation method for an activity set requires a specific
                # function due to the complicated nature
                # of the allocation, call on function here
                elif attr['allocation_method'] == 'allocation_function':
                    fbs = function_allocation_method(flows_mapped_wsec, k,
                                                     names, attr, fbs_list)
                else:
                    fbs = dataset_allocation_method(flows_mapped_wsec, attr,
                                                    names, method, k, v, aset,
                                                    aset_names,
                                                    download_FBA_if_missing)

                # drop rows where flowamount = 0
                # (although this includes dropping suppressed data)
                fbs = fbs[fbs['FlowAmount'] != 0].reset_index(drop=True)

                # define grouping columns dependent on sectors
                # being activity-like or not
                if check_activities_sector_like(k) is False:
                    groupingcols = fbs_grouping_fields_w_activities
                    groupingdict = flow_by_sector_fields_w_activity
                else:
                    groupingcols = fbs_default_grouping_fields
                    groupingdict = flow_by_sector_fields

                # clean df
                fbs = clean_df(fbs, groupingdict, fbs_fill_na_dict)

                # aggregate df geographically, if necessary
                log.info("Aggregating flowbysector to %s level",
                         method['target_geoscale'])
                # determine from scale
                if fips_number_key[v['geoscale_to_use']] <\
                        fips_number_key[attr['allocation_from_scale']]:
                    from_scale = v['geoscale_to_use']
                else:
                    from_scale = attr['allocation_from_scale']

                fbs_geo_agg = agg_by_geoscale(fbs, from_scale,
                                              method['target_geoscale'],
                                              groupingcols)

                # aggregate data to every sector level
                log.info("Aggregating flowbysector to all sector levels")
                fbs_sec_agg = sector_aggregation(fbs_geo_agg, groupingcols)
                # add missing naics5/6 when only one naics5/6
                # associated with a naics4
                fbs_agg = sector_disaggregation(fbs_sec_agg)

                # check if any sector information is lost before reaching
                # the target sector length, if so,
                # allocate values equally to disaggregated sectors
                vLog.info(
                    'Searching for and allocating FlowAmounts for any parent '
                    'NAICS that were dropped in the subset to '
                    '%s child NAICS', method['target_sector_level'])
                fbs_agg_2 = equally_allocate_parent_to_child_naics(
                    fbs_agg, method['target_sector_level'])
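                # e.g., if a parent NAICS with FlowAmount 90 was dropped
                # before reaching three target-length child NAICS, each
                # child would receive 90 / 3 = 30 (hypothetical values)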

                # compare flowbysector with flowbyactivity
                compare_activity_to_sector_flowamounts(flows_mapped_wsec,
                                                       fbs_agg_2, aset, k,
                                                       method)

                # return sector level specified in method yaml
                # load the crosswalk linking sector lengths
                sector_list = get_sector_list(method['target_sector_level'])

                # subset df, necessary because not all of the sectors are
                # NAICS and can get duplicate rows
                fbs_1 = fbs_agg_2.loc[
                    (fbs_agg_2[fbs_activity_fields[0]].isin(sector_list)) &
                    (fbs_agg_2[fbs_activity_fields[1]].isin(sector_list))].\
                    reset_index(drop=True)
                fbs_2 = fbs_agg_2.loc[
                    (fbs_agg_2[fbs_activity_fields[0]].isin(sector_list)) &
                    (fbs_agg_2[fbs_activity_fields[1]].isnull())].\
                    reset_index(drop=True)
                fbs_3 = fbs_agg_2.loc[
                    (fbs_agg_2[fbs_activity_fields[0]].isnull()) &
                    (fbs_agg_2[fbs_activity_fields[1]].isin(sector_list))].\
                    reset_index(drop=True)
                fbs_sector_subset = pd.concat([fbs_1, fbs_2, fbs_3])

                # drop activity columns
                fbs_sector_subset = fbs_sector_subset.drop(
                    ['ActivityProducedBy', 'ActivityConsumedBy'],
                    axis=1,
                    errors='ignore')

                # save comparison of FBA total to FBS total for an activity set
                compare_fba_geo_subset_and_fbs_output_totals(
                    flows_subset_geo, fbs_sector_subset, aset, k, v, attr,
                    method)

                log.info("Completed flowbysector for %s", aset)
                fbs_list.append(fbs_sector_subset)
        else:
            if 'clean_fbs_df_fxn' in v:
                flows = dynamically_import_fxn(v["clean_fbs_df_fxn_source"],
                                               v["clean_fbs_df_fxn"])(flows)
            flows = update_geoscale(flows, method['target_geoscale'])
            # if the loaded flow df is already in FBS format,
            # append it directly to the list of FBS
            log.info("Append %s to FBS list", k)
            # ensure correct field datatypes and add any missing fields
            flows = clean_df(flows, flow_by_sector_fields, fbs_fill_na_dict)
            fbs_list.append(flows)
    # create single df of all activities
    log.info("Concat data for all activities")
    fbss = pd.concat(fbs_list, ignore_index=True, sort=False)
    log.info("Clean final dataframe")
    # add missing fields, ensure correct data type,
    # add missing columns, reorder columns
    fbss = clean_df(fbss, flow_by_sector_fields, fbs_fill_na_dict)
    # prior to aggregating, replace MetaSources string with all sources
    # that share context/flowable/sector values
    fbss = harmonize_FBS_columns(fbss)
    # aggregate df as activities might have data for
    # the same specified sector length
    fbss = aggregator(fbss, fbs_default_grouping_fields)
    # sort df
    log.info("Sort and store dataframe")
    # ensure correct data types/order of columns
    fbss = clean_df(fbss, flow_by_sector_fields, fbs_fill_na_dict)
    fbss = fbss.sort_values(
        ['SectorProducedBy', 'SectorConsumedBy', 'Flowable',
         'Context']).reset_index(drop=True)
    # check for negative flow amounts
    check_for_negative_flowamounts(fbss)
    # tmp reset data quality scores
    fbss = reset_fbs_dq_scores(fbss)
    # save parquet file
    meta = set_fb_meta(method_name, "FlowBySector")
    write_df_to_file(fbss, paths, meta)
    write_metadata(method_name, method, meta, "FlowBySector")
    # rename the log file saved to local directory
    rename_log_file(method_name, meta)
    log.info(
        'See the Validation log for detailed assessment of '
        'model results in %s', logoutputpath)
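As a usage sketch, the entry point above can also be called directly with keyword arguments instead of relying on parse_args(); the method name below is illustrative and may not match the yaml files available locally:

# hypothetical invocation with an illustrative flowbysector method name
main(method='Water_national_2015_m1', download_FBAs_if_missing=True)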
Example #5
def calculate_net_public_supply(df_load):
    """
    USGS provides info on the quantity of public supply withdrawals that
    are delivered to domestic use. The USGS PS withdrawals are not necessarily
    greater than or equal to the domestic deliveries because water can be
    withdrawn in one county and delivered in another (water can also cross
    state lines). Therefore, negative net public supply values can occur,
    and PS water should only be used at the national level.

    Domestic deliveries are subtracted from public supply. An assumption is
    made that PS deliveries to domestic are fresh water. The national level
    data can then be allocated to end users using the BEA Use tables.
    :param df_load: USGS df
    :return: df with net public supply values
    """

    # subset into 2 dfs, one that contains PS data and one that does not
    df1 = df_load[(df_load[fba_activity_fields[0]] == 'Public Supply') |
                  (df_load[fba_activity_fields[1]] == 'Public Supply')]
    df2 = df_load[(df_load[fba_activity_fields[0]] != 'Public Supply')
                  & (df_load[fba_activity_fields[1]] != 'Public Supply')]

    # drop all deliveries to thermo and industrial
    # (not enough states report the data to make usable)
    df1_sub = df1[~df1[fba_activity_fields[1]].isin([
        'Industrial', 'Thermoelectric Power',
        'Thermoelectric Power Closed-loop cooling',
        'Thermoelectric Power Once-through cooling'
    ])]
    # drop duplicate info of "Public Supply deliveries to"
    df1_sub = df1_sub.loc[~df1_sub['Description'].str.
                          contains("Public Supply total deliveries")]
    df1_sub = df1_sub.loc[~df1_sub['Description'].str.
                          contains("deliveries from public supply")]

    # calculate data drop
    vLogDetailed.info('Dropping rows that contain "deliveries from public '
                      'supply" to avoid double counting with rows of "Public '
                      'Supply deliveries to"')
    calculate_flowamount_diff_between_dfs(df1, df1_sub)

    # drop county level values because county data cannot be used
    vLogDetailed.info('Dropping county level public supply withdrawals '
                      'because will end up with negative values due to '
                      'instances of water deliveries coming from surrounding '
                      'counties')
    df1_sub = df1_sub[df1_sub['Location'].apply(
        lambda x: x[2:6] == '000')].reset_index(drop=True)

    # df of ps delivered and ps withdrawn and us total
    df_d = df1_sub[df1_sub[fba_activity_fields[0]] == 'Public Supply']
    df_w = df1_sub[df1_sub[fba_activity_fields[1]] == 'Public Supply']
    df_us = df1_sub[df1_sub['Location'] == '00000']
    # split withdrawals further into fresh water (assumption: domestic
    # deliveries are freshwater); assume water withdrawals are taken
    # equally from ground and surface
    df_w1 = df_w[(df_w['FlowName'] == 'fresh')
                 & (df_w['Compartment'] != 'total')]
    df_w2 = df_w[(df_w['FlowName'] == 'fresh')
                 & (df_w['Compartment'] == 'total')]
    # compare units
    compare_df_units(df_w1, df_w2)
    df_wm = pd.merge(df_w1,
                     df_w2[['FlowAmount', 'Location', 'Unit']],
                     how='left',
                     left_on=['Location', 'Unit'],
                     right_on=['Location', 'Unit'])
    df_wm = df_wm.rename(columns={
        "FlowAmount_x": "FlowAmount",
        "FlowAmount_y": "FlowTotal"
    })
    # compare units
    compare_df_units(df_wm, df_d)
    # merge the deliveries to domestic
    df_w_modified = pd.merge(df_wm,
                             df_d[['FlowAmount', 'Location']],
                             how='left',
                             left_on='Location',
                             right_on='Location')
    df_w_modified = df_w_modified.rename(columns={
        "FlowAmount_x": "FlowAmount",
        "FlowAmount_y": "DomesticDeliveries"
    })

    # create flowratio for ground/surface
    df_w_modified.loc[:, 'FlowRatio'] = \
        df_w_modified['FlowAmount'] / df_w_modified['FlowTotal']
    # calculate new, net total public supply withdrawals
    # will end up with negative values due to instances of water
    # deliveries coming from surrounding counties
    df_w_modified.loc[:, 'FlowAmount'] = \
        df_w_modified['FlowAmount'] - (df_w_modified['FlowRatio'] *
                                       df_w_modified['DomesticDeliveries'])

    net_ps = df_w_modified.drop(columns=["FlowTotal", "DomesticDeliveries"])

    # compare units
    compare_df_units(df_d, net_ps)
    # because assuming domestic is all fresh, drop
    # flowname/flowable/Compartment/context
    # and instead use those column data from the net_ps df
    df_d_modified = df_d.drop(
        columns=['FlowName', 'Flowable', 'Compartment', 'Context', 'FlowUUID'])
    # Also allocate to ground/surface from state ratios
    df_d_modified = pd.merge(df_d_modified,
                             net_ps[[
                                 'FlowName', 'Flowable', 'Compartment',
                                 'Context', 'FlowUUID', 'Location', 'FlowRatio'
                             ]],
                             how='left',
                             left_on='Location',
                             right_on='Location')
    df_d_modified.loc[:, 'FlowAmount'] = \
        df_d_modified['FlowAmount'] * df_d_modified['FlowRatio']
    df_d_modified = df_d_modified.drop(columns=["FlowRatio"])

    net_ps = net_ps.drop(columns=["FlowRatio"])

    # concat dfs back (non-public supply, public supply
    # deliveries, net ps withdrawals)
    modified_ps = pd.concat([df2, df_d_modified, net_ps, df_us],
                            ignore_index=True)

    return modified_ps
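The net calculation above reduces to allocating domestic deliveries across ground and surface withdrawals by each compartment's share of total fresh withdrawals, then subtracting. A toy numeric sketch of that arithmetic with made-up values:

import pandas as pd

# fresh public supply withdrawals by compartment for one state (toy values)
withdrawals = pd.DataFrame({
    'Compartment': ['ground', 'surface'],
    'FlowAmount': [30.0, 70.0],
})
fresh_total = withdrawals['FlowAmount'].sum()   # 100.0
domestic_deliveries = 40.0                      # PS deliveries to domestic use

# each compartment's share of total fresh withdrawals
withdrawals['FlowRatio'] = withdrawals['FlowAmount'] / fresh_total
# net public supply = withdrawal minus its share of domestic deliveries;
# negative values are possible when deliveries exceed in-county withdrawals
withdrawals['NetFlowAmount'] = (withdrawals['FlowAmount'] -
                                withdrawals['FlowRatio'] * domestic_deliveries)
print(withdrawals)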