Example #1
def validate_state_totals(df, year):
    """Generate validation by state, sums across species.

    Details on results by state can be found in the search results help website
    https://echo.epa.gov/help/loading-tool/water-pollution-search/search-results-help-dmr
    """
    filepath = DATA_PATH.joinpath(f"DMR_{year}_StateTotals.csv")
    if not filepath.is_file():
        download_state_totals_validation(year)
    log.info('validating against state totals')
    reference_df = pd.read_csv(filepath)
    reference_df['FlowAmount'] = 0.0
    reference_df = unit_convert(reference_df, 'FlowAmount',
                                'Unit', 'lb', lb_kg, 'Amount')
    reference_df = reference_df[['FlowName', 'State', 'FlowAmount']]

    # to match the state totals, compare only facilities with PermitTypeCode 'NPD' and exclude certain flows
    flow_exclude = pd.read_csv(DMR_DATA_PATH.joinpath('DMR_state_filter_list.csv'))
    state_flow_exclude_list = flow_exclude['POLLUTANT_DESC'].to_list()

    dmr_by_state = df[~df['FlowName'].isin(state_flow_exclude_list)]
    dmr_by_state = dmr_by_state[dmr_by_state['PermitTypeCode'] == 'NPD']

    dmr_by_state = (dmr_by_state[['State', 'FlowAmount']]
                    .groupby('State').sum().reset_index())
    dmr_by_state['FlowName'] = 'All'
    validation_df = validate_inventory(dmr_by_state, reference_df,
                                       group_by="state")
    write_validation_result('DMR', year, validation_df)
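Each of these examples relies on a unit_convert helper imported from the package's utility module. The following is only a minimal sketch of what the call sites above suggest it does, not the package's actual implementation:

def unit_convert(df, target_col, unit_col, unit, factor, source_col):
    # For rows reported in the matching unit, write factor * source column
    # into the target column; rows in other units are left untouched.
    match = df[unit_col] == unit
    df.loc[match, target_col] = factor * df.loc[match, source_col]
    return df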
Example #2
def validate_national_totals(inv, TRIyear):
    """Validate TRI inventory against national totals for the given year."""
    log.info('validating data against national totals')
    filename = DATA_PATH.joinpath(f'TRI_{TRIyear}_NationalTotals.csv')
    if filename.is_file():
        tri_national_totals = pd.read_csv(filename,
                                          header=0,
                                          dtype={"FlowAmount": float})
        tri_national_totals['FlowAmount_kg'] = 0
        tri_national_totals = unit_convert(tri_national_totals,
                                           'FlowAmount_kg', 'Unit', 'Pounds',
                                           lb_kg, 'FlowAmount')
        # drop old amount and units
        tri_national_totals.drop('FlowAmount', axis=1, inplace=True)
        tri_national_totals.drop('Unit', axis=1, inplace=True)
        # Rename cols to match reference format
        tri_national_totals.rename(columns={'FlowAmount_kg': 'FlowAmount'},
                                   inplace=True)
        inv = map_to_fedefl(inv)
        if inv is not None:
            validation_result = validate_inventory(inv,
                                                   tri_national_totals,
                                                   group_by='flow',
                                                   tolerance=5.0)
            write_validation_result('TRI', TRIyear, validation_result)
    else:
        log.warning(f'validation file for TRI_{TRIyear} does not exist. '
                    'Please run option B')
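validate_inventory itself comes from the package's validation module; here it takes the inventory, a reference DataFrame, a group_by key, and in some cases a tolerance in percent. The stand-in below is a hypothetical illustration of the kind of grouped comparison it presumably performs; the function name, columns, and output format are assumptions, not the library's API:

import pandas as pd

def compare_totals(inventory, reference, key='FlowName'):
    # Sum both frames over the grouping key and report the relative deviation
    inv = inventory.groupby(key)['FlowAmount'].sum().rename('Inventory')
    ref = reference.groupby(key)['FlowAmount'].sum().rename('Reference')
    result = pd.concat([inv, ref], axis=1).reset_index()
    result['Pct_Difference'] = ((result['Inventory'] - result['Reference'])
                                / result['Reference'] * 100)
    return result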
Example #3
def validate_eGRID(year, flowbyfac):
    """Validate eGRID flowbyfacility data against national totals."""
    validation_file = DATA_PATH.joinpath(f"eGRID_{year}_NationalTotals.csv")
    if not validation_file.is_file():
        generate_national_totals(year)
    log.info('validating data against national totals')
    egrid_national_totals = pd.read_csv(validation_file, header=0,
                                        dtype={"FlowAmount": float})
    egrid_national_totals = unit_convert(
        egrid_national_totals, 'FlowAmount', 'Unit', 'lbs',
        lb_kg, 'FlowAmount')
    egrid_national_totals = unit_convert(
        egrid_national_totals, 'FlowAmount', 'Unit', 'tons',
        USton_kg, 'FlowAmount')
    egrid_national_totals = unit_convert(
        egrid_national_totals, 'FlowAmount', 'Unit', 'MMBtu',
        MMBtu_MJ, 'FlowAmount')
    egrid_national_totals = unit_convert(
        egrid_national_totals, 'FlowAmount', 'Unit', 'MWh',
        MWh_MJ, 'FlowAmount')
    # drop old unit
    egrid_national_totals.drop('Unit', axis=1, inplace=True)
    validation_result = validate_inventory(flowbyfac, egrid_national_totals,
                                           group_by='flow', tolerance=5.0)
    write_validation_result('eGRID', year, validation_result)
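The conversion factors used across these examples (lb_kg, USton_kg, MMBtu_MJ, MWh_MJ) are presumably defined once in the package's globals; the standard values they would correspond to are:

lb_kg = 0.45359237      # pounds to kilograms
USton_kg = 907.18474    # US short tons to kilograms
MMBtu_MJ = 1055.05585   # million Btu to megajoules
MWh_MJ = 3600.0         # megawatt-hours to megajoules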
Example #4
def validate_national_totals(nei_flowbyfacility, year):
    """Validate against national flow totals."""
    log.info('validating flow by facility against national totals')
    if not DATA_PATH.joinpath(f'NEI_{year}_NationalTotals.csv').is_file():
        generate_national_totals(year)
    else:
        log.info('using already processed national totals validation file')
    nei_national_totals = pd.read_csv(
        DATA_PATH.joinpath(f'NEI_{year}_NationalTotals.csv'),
        header=0,
        dtype={"FlowAmount[kg]": float})
    nei_national_totals.rename(columns={'FlowAmount[kg]': 'FlowAmount'},
                               inplace=True)
    validation_result = validate_inventory(nei_flowbyfacility,
                                           nei_national_totals,
                                           group_by='flow',
                                           tolerance=5.0)
    write_validation_result('NEI', year, validation_result)
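A call might look like the following, with an invented flow-by-facility frame standing in for the output of the NEI processing step (amounts already in kg):

import pandas as pd

nei_fbf = pd.DataFrame({
    'FacilityID': ['1019911', '1019912'],    # hypothetical facility IDs
    'FlowName': ['Ammonia', 'Sulfur dioxide'],
    'FlowAmount': [1.5e6, 2.3e7],            # kg
})
validate_national_totals(nei_fbf, 2017)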
Example #5
def validate_national_totals_by_subpart(tab_df, year):
    """Validate GHGRP flows by subpart against national totals."""
    log.info('validating flowbyfacility against national totals')
    # apply CO2e factors for some flows
    mask = (tab_df['AmountCO2e'].isna() & tab_df['FlowID'].isin(flows_CO2e))
    tab_df.loc[mask,
               'Flow Description'] = 'Fluorinated GHG Emissions (mt CO2e)'
    subpart_L_GWPs = load_subpart_l_gwp()
    subpart_L_GWPs.rename(columns={'Flow Name': 'FlowName'}, inplace=True)
    tab_df = tab_df.merge(subpart_L_GWPs,
                          how='left',
                          on=['FlowName', 'Flow Description'])
    tab_df['CO2e_factor'] = tab_df['CO2e_factor'].fillna(1)
    tab_df.loc[mask,
               'AmountCO2e'] = tab_df['FlowAmount'] * tab_df['CO2e_factor']

    # for subset of flows, use CO2e for validation
    mask = tab_df['FlowID'].isin(flows_CO2e)
    tab_df.loc[mask, 'FlowAmount'] = tab_df['AmountCO2e']

    # parse tabulated data
    tab_df.drop(columns=['FacilityID', 'DataReliability', 'FlowName'],
                inplace=True)
    tab_df.rename(columns={'Process': 'SubpartName',
                           'FlowID': 'FlowName'},
                  inplace=True)

    # import and parse reference data
    ref_df = pd.read_csv(
        DATA_PATH.joinpath(f'GHGRP_{year}_NationalTotals.csv'))
    ref_df.drop(columns=['FlowName'], inplace=True)
    ref_df.rename(columns={'SUBPART_NAME': 'SubpartName',
                           'FlowCode': 'FlowName'},
                  inplace=True)

    validation_result = validate_inventory(tab_df, ref_df, group_by='subpart')
    # Update flow names to indicate which are in CO2e
    validation_result.loc[
        validation_result['FlowName'].isin(flows_CO2e),
        'FlowName'] = validation_result['FlowName'] + ' (CO2e)'
    write_validation_result('GHGRP', year, validation_result)
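A tiny worked illustration of the CO2e step above, with invented numbers: a fluorinated GHG reported by mass is scaled by its GWP factor so it can be checked against reference totals stated in CO2e.

flow_amount = 2000.0           # hypothetical fluorinated GHG emission, kg
co2e_factor = 12400.0          # hypothetical 100-year GWP for that compound
amount_co2e = flow_amount * co2e_factor   # 24,800,000 kg CO2e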
Example #6
def validate_state_totals(report_year, flowbyfacility):
    """Validate RCRAInfo flow-by-facility data against state totals."""
    log.info('validating data against state totals')
    file_path = DATA_PATH.joinpath(f'RCRAInfo_{report_year}_StateTotals.csv')
    if file_path.is_file():
        totals = pd.read_csv(file_path, dtype={"FlowAmount_kg": float})
        # Rename cols to match reference format
        totals.rename(columns={'FlowAmount_kg': 'FlowAmount'}, inplace=True)
        # Validate waste generated against state totals, include only NBR data
        flowbyfacility['State'] = flowbyfacility['FacilityID'].str[0:2]
        flowbyfacility = apply_filters_to_inventory(
            flowbyfacility, 'RCRAInfo', report_year,
            ['National_Biennial_Report', 'imported_wastes', 'US_States_only'])
        validation_df = validate_inventory(flowbyfacility,
                                           totals,
                                           group_by='state')
        write_validation_result('RCRAInfo', report_year, validation_df)
    else:
        log.warning(
            f'validation file for RCRAInfo_{report_year} does not exist.')
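The state assignment above relies on RCRAInfo handler IDs beginning with the two-letter state abbreviation; a small illustration with invented IDs:

import pandas as pd

fbf = pd.DataFrame({'FacilityID': ['CAD000000001', 'TXD000000002'],
                    'FlowAmount': [10.0, 20.0]})
fbf['State'] = fbf['FacilityID'].str[0:2]
print(fbf['State'].tolist())   # ['CA', 'TX']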