Example No. 1
def getInventory(inventory_acronym,
                 year,
                 stewiformat='flowbyfacility',
                 filters=None,
                 filter_for_LCI=False,
                 US_States_Only=False,
                 download_if_missing=False):
    """Return or generate an inventory in a standard output format.

    :param inventory_acronym: like 'TRI'
    :param year: year as number like 2010
    :param stewiformat: str e.g. 'flowbyfacility' or 'flow'
    :param filters: a list of named filters to apply to the inventory
    :param filter_for_LCI: whether to filter the inventory for life
        cycle inventory creation; DEPRECATED in favor of 'filters'
    :param US_States_Only: include only US states; DEPRECATED in
        favor of 'filters'
    :param download_if_missing: bool, if True will attempt to load from
        remote server prior to generating if file not found locally
    :return: dataframe with standard fields depending on output format
    """
    f = ensure_format(stewiformat)
    inventory = read_inventory(inventory_acronym, year, f, download_if_missing)
    if not filters:
        filters = []
    if f.value > 2:  # exclude FLOW and FACILITY
        # For backwards compatibility, maintain these optional parameters in getInventory
        if filter_for_LCI:
            log.warning(
                '"filter_for_LCI" parameter is deprecated and will be '
                'removed from getInventory in a future release.\n'
                'Add "filter_for_LCI" to filters instead.')
            if 'filter_for_LCI' not in filters:
                filters.append('filter_for_LCI')
        if US_States_Only:
            log.warning(
                '"US_States_Only" parameter is deprecated and will be '
                'removed from getInventory in a future release.\n'
                'Add "US_States_only" to filters instead.')
            if 'US_States_only' not in filters:
                filters.append('US_States_only')

        inventory = apply_filters_to_inventory(inventory, inventory_acronym,
                                               year, filters,
                                               download_if_missing)
        # After filtering, it may be necessary to re-aggregate the inventory
        inventory = aggregate(inventory)

    inventory = add_missing_fields(inventory,
                                   inventory_acronym,
                                   f,
                                   maintain_columns=False)

    return inventory
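
A minimal usage sketch (assuming this function is imported from the stewi package's top-level API; the acronym, year, and filter values are illustrative):

# Retrieve the 2019 TRI flow-by-facility inventory, pulling from the
# remote server if it is not cached locally.
import stewi

tri = stewi.getInventory('TRI', 2019,
                         stewiformat='flowbyfacility',
                         filters=['filter_for_LCI'],
                         download_if_missing=True)
print(tri.head())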
Example No. 2
def main(**kwargs):
    parser = argparse.ArgumentParser(argument_default=argparse.SUPPRESS)

    parser.add_argument('Option',
                        help='What do you want to do:\
                        [A] Download DMR files from web\
                        [B] Generate StEWI inventory outputs and\
                            validate to state totals\
                        [C] Download state totals',
                        type=str)

    parser.add_argument('-Y', '--Year', nargs='+',
                        help='What DMR year(s) you want to retrieve',
                        type=str)

    if len(kwargs) == 0:
        kwargs = vars(parser.parse_args())

    for year in kwargs['Year']:

        if kwargs['Option'] == 'A':
            log.info(f"Querying for {year}")

            # Two-digit SIC codes from the advanced search drop-down, stripped and formatted as a list
            sic2 = list(pd.read_csv(DMR_DATA_PATH.joinpath('2_digit_SIC.csv'),
                        dtype={'SIC2': str})['SIC2'])
            # Query by state, then by SIC-state where necessary
            result_dict = query_dmr(year=year)
            log.debug('possible errors: ' + ', '.join(
                [s for s in result_dict.keys()
                 if result_dict[s] != 'success']))
            state_max_error_list = [s for s in result_dict.keys()
                                    if result_dict[s] == 'max_error']
            state_no_data_list = [s for s in result_dict.keys()
                                  if result_dict[s] == 'no_data']
            if (len(state_max_error_list) == 0) and (len(state_no_data_list) == 0):
                log.info('all states successfully downloaded')
            else:
                if (len(state_max_error_list) > 0):
                    log.error(f"Max error: {' '.join(state_max_error_list)}")
                if (len(state_no_data_list) > 0):
                    log.error(f"No data error: {' '.join(state_no_data_list)}")
                log.info('Breaking up queries further by SIC')
                result_dict = query_dmr(year=year, sic_list=sic2,
                                        state_list=state_max_error_list)
                sic_state_max_error_list = [s for s in result_dict.keys()
                                            if result_dict[s] == 'max_error']
                if len(sic_state_max_error_list) > 0:
                    log.error(f"Max error: {' '.join(sic_state_max_error_list)}")

            log.info(f"Querying nutrients for {year}")
            # Query aggregated nutrients data
            for nutrient in ['N', 'P']:
                result_dict = query_dmr(year=year, nutrient=nutrient)
                log.debug('possible errors: ' + ', '.join(
                    [s for s in result_dict.keys()
                     if result_dict[s] != 'success']))
                state_max_error_list = [s for s in result_dict.keys()
                                        if result_dict[s] == 'max_error']
                state_no_data_list = [s for s in result_dict.keys()
                                      if result_dict[s] == 'no_data']
                if (len(state_max_error_list) == 0) and (len(state_no_data_list) == 0):
                    log.info(f'all states successfully downloaded for {nutrient}')
                else:
                    result_dict = query_dmr(year=year, sic_list=sic2,
                                            state_list=state_max_error_list,
                                            nutrient=nutrient)
            # write metadata
            generate_metadata(year, datatype='source')

        if kwargs['Option'] == 'B':
            log.info(f'generating inventories for DMR {year}')
            state_df = combine_DMR_inventory(year)
            state_df = filter_states(standardize_df(state_df))

            # Validation against state totals is done prior to combining
            # with aggregated nutrients
            validate_state_totals(state_df, year)

            P_df = combine_DMR_inventory(year, nutrient='P')
            N_df = combine_DMR_inventory(year, nutrient='N')

            nut_drop_list = read_pollutant_parameter_list()
            nut_drop_list = nut_drop_list[(nut_drop_list['NITROGEN'] == 'Y') |
                                          (nut_drop_list['PHOSPHORUS'] == 'Y')]
            nut_drop_list = list(set(nut_drop_list['FlowName']))

            # Consolidate N and P based flows to reflect nutrient aggregation
            P_df = consolidate_nutrients(P_df, nut_drop_list, 'P')
            N_df = consolidate_nutrients(N_df, nut_drop_list, 'N')

            nutrient_agg_df = pd.concat([P_df, N_df])
            nutrient_agg_df = filter_states(standardize_df(nutrient_agg_df))

            # Filter out nitrogen and phosphorus flows before combining
            # with aggregated nutrients
            dmr_nut_filtered = state_df[~state_df['FlowName'].isin(nut_drop_list)]
            dmr_df = pd.concat([dmr_nut_filtered,
                                nutrient_agg_df]).reset_index(drop=True)

            # PermitTypeCode is needed for state validation but is not maintained in the output
            dmr_df = dmr_df.drop(columns=['PermitTypeCode'])

            # generate output for facility
            facility_columns = ['FacilityID', 'FacilityName', 'City',
                                'State', 'Zip', 'Latitude', 'Longitude',
                                'County', 'NAICS', 'SIC'] # 'Address' not in DMR
            dmr_facility = dmr_df[facility_columns].drop_duplicates()
            store_inventory(dmr_facility, 'DMR_' + year, 'facility')

            # generate output for flow
            flow_columns = ['FlowID', 'FlowName']
            dmr_flow = dmr_df[flow_columns].drop_duplicates()
            dmr_flow.sort_values(by=['FlowName'], inplace=True)
            dmr_flow['Compartment'] = 'water'
            dmr_flow['Unit'] = 'kg'
            store_inventory(dmr_flow, 'DMR_' + year, 'flow')

            # generate output for flowbyfacility
            fbf_columns = ['FlowName', 'FlowAmount', 'FacilityID',
                           'DataReliability']
            dmr_fbf = dmr_df[fbf_columns].reset_index(drop=True)
            dmr_fbf = aggregate(dmr_fbf, ['FacilityID', 'FlowName'])
            dmr_fbf['Compartment'] = 'water'
            dmr_fbf['Unit'] = 'kg'
            store_inventory(dmr_fbf, 'DMR_' + year, 'flowbyfacility')

            # write metadata
            generate_metadata(year, datatype='inventory')

        if kwargs['Option'] == 'C':
            download_state_totals_validation(year)
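
A minimal invocation sketch (main() above accepts keyword arguments directly, bypassing argparse; the year is illustrative):

# Download DMR files, then build and validate the StEWI outputs.
main(Option='A', Year=['2019'])
main(Option='B', Year=['2019'])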
Example No. 3
def Generate_TRI_files_csv(TRIyear, Files):
    """Generate TRI inventories from downloaded files."""
    tri_required_fields = imp_fields(
        TRI_DATA_PATH.joinpath('TRI_required_fields.txt'))
    keys = imp_fields(TRI_DATA_PATH.joinpath('TRI_keys.txt'))
    values = list()
    for p in range(len(keys)):
        start = 13 + 2 * p
        end = start + 1
        values.append(concat_req_field(tri_required_fields[start:end + 1]))
    # Create a dictionary that has the import fields for each release
    # type to use in the import process
    import_dict = dict_create(keys, values)
    # Build the TRI DataFrame
    tri = import_TRI_by_release_type(import_dict, TRIyear)
    # Drop rows missing FlowAmount; zero amounts are removed after numeric coercion below
    tri = tri.dropna(subset=['FlowAmount'])
    tri = strip_coln_white_space(tri, 'Basis of Estimate')
    # Convert FlowAmount to float, coercing unparseable values to NaN
    if tri['FlowAmount'].values.dtype != 'float64':
        tri['FlowAmount'] = pd.to_numeric(tri['FlowAmount'], errors='coerce')
    tri = tri[tri['FlowAmount'] != 0]
    # Import reliability scores for TRI
    tri_reliability_table = get_reliability_table_for_source('TRI')
    tri = pd.merge(tri,
                   tri_reliability_table,
                   left_on='Basis of Estimate',
                   right_on='Code',
                   how='left')
    tri['DQI Reliability Score'] = tri['DQI Reliability Score'].fillna(value=5)
    tri.drop(['Basis of Estimate', 'Code'], axis=1, inplace=True)
    # Replace source info with Context
    source_to_context = pd.read_csv(
        TRI_DATA_PATH.joinpath('TRI_ReleaseType_to_Compartment.csv'))
    tri = pd.merge(tri, source_to_context, how='left')
    # Convert units to ref mass unit of kg
    tri['Amount_kg'] = 0.0
    tri = unit_convert(tri, 'Amount_kg', 'Unit', 'Pounds', lb_kg, 'FlowAmount')
    tri = unit_convert(tri, 'Amount_kg', 'Unit', 'Grams', g_kg, 'FlowAmount')
    tri.drop(columns=['FlowAmount', 'Unit'], inplace=True)
    # Rename cols to match reference format
    tri.rename(columns={
        'Amount_kg': 'FlowAmount',
        'DQI Reliability Score': 'DataReliability'
    },
               inplace=True)
    tri.drop(columns=['ReleaseType'], inplace=True)
    grouping_vars = ['FacilityID', 'FlowName', 'CAS', 'Compartment']
    tri = aggregate(tri, grouping_vars)

    validate_national_totals(tri, TRIyear)

    # FLOWS
    flowsdf = tri[['FlowName', 'CAS',
                   'Compartment']].drop_duplicates().reset_index(drop=True)
    flowsdf.loc[:, 'FlowID'] = flowsdf['CAS']
    store_inventory(flowsdf, 'TRI_' + TRIyear, 'flow')

    # FLOW BY FACILITY
    tri.drop(columns=['CAS'], inplace=True)
    store_inventory(tri, 'TRI_' + TRIyear, 'flowbyfacility')

    # FACILITY
    # Import and handle TRI facility data
    import_facility = tri_required_fields[0:10]
    tri_facility = pd.read_csv(OUTPUT_PATH.joinpath(f'US_1a_{TRIyear}.csv'),
                               usecols=import_facility,
                               low_memory=False)
    tri_facility = tri_facility.drop_duplicates(ignore_index=True)
    # rename columns
    TRI_facility_name_crosswalk = {
        'TRIFID': 'FacilityID',
        'FACILITY NAME': 'FacilityName',
        'FACILITY STREET': 'Address',
        'FACILITY CITY': 'City',
        'FACILITY COUNTY': 'County',
        'FACILITY STATE': 'State',
        'FACILITY ZIP CODE': 'Zip',
        'PRIMARY NAICS CODE': 'NAICS',
        'LATITUDE': 'Latitude',
        'LONGITUDE': 'Longitude',
    }
    tri_facility.rename(columns=TRI_facility_name_crosswalk, inplace=True)
    store_inventory(tri_facility, 'TRI_' + TRIyear, 'facility')
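
A minimal usage sketch (the year is illustrative; note the Files parameter is not referenced in the body above, so an empty list suffices here):

# Build the 2019 TRI flow, flowbyfacility, and facility outputs from
# previously downloaded TRI basic data files.
Generate_TRI_files_csv('2019', [])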
Example No. 4
def generate_eGRID_files(year):
    """Parse a local eGRID file to generate StEWI output files.

    :param year: str, Year of eGRID dataset
    """
    log.info(f'generating eGRID files for {year}')
    log.info('importing plant level emissions data')
    egrid = parse_eGRID(year, 'PLNT', 'eGRID_required_fields.csv')

    flowbyfac_fields = filter_fields('eGRID_required_fields.csv', 'flowbyfac_fields')

    flowbyfac_prelim = egrid[flowbyfac_fields]
    conversion = []
    conversion.append(flowbyfac_prelim[['FacilityID', 'Plant primary fuel']])
    conversion.append(egrid_unit_convert(
        flowbyfac_prelim[['Nitrogen oxides', 'Sulfur dioxide', 'Carbon dioxide']], USton_kg))
    conversion.append(egrid_unit_convert(
        flowbyfac_prelim[['Methane', 'Nitrous oxide']], lb_kg))
    conversion.append(egrid_unit_convert(
        flowbyfac_prelim[['Heat', 'Steam']], MMBtu_MJ))
    conversion.append(egrid_unit_convert(flowbyfac_prelim[['Electricity']], MWh_MJ))
    flowbyfac_stacked = pd.concat(conversion, axis=1)
    # Create flowbyfac
    flowbyfac = pd.melt(flowbyfac_stacked,
                        id_vars=['FacilityID', 'Plant primary fuel'],
                        value_vars=list(flowbyfac_stacked.columns[2:]),
                        var_name='FlowName', value_name='FlowAmount')

    flowbyfac = flowbyfac.dropna(subset=['FlowAmount'])
    flowbyfac['FlowAmount'] = pd.to_numeric(flowbyfac['FlowAmount'])
    flowbyfac = flowbyfac.sort_values(by=['FacilityID'], axis=0,
                                      ascending=True, inplace=False,
                                      kind='quicksort', na_position='last')

    # Read in unit sheet to get comment fields related to source of heat, NOx,
    # SO2, and CO2 emission estimates for calculating data quality information
    log.info('importing unit level data to assess data quality')
    unit_egrid = parse_eGRID(year, 'UNT', 'eGRID_unit_level_required_fields.csv')

    rel_score_cols = filter_fields('eGRID_unit_level_required_fields.csv',
                                   'reliability_flows')

    flows_used_for_weighting = filter_fields('eGRID_unit_level_required_fields.csv',
                                             'weighting_flows')

    unit_emissions_with_rel_scores = ['Heat', 'Nitrogen oxides',
                                      'Sulfur dioxide', 'Carbon dioxide']

    unit_egrid.update(unit_egrid[rel_score_cols].fillna(''))
    unit_egrid.update(unit_egrid[flows_used_for_weighting].fillna(0))
    # Generate combined columns as lists before exploding lists into multiple rows
    unit_egrid['FlowName'] = unit_egrid.apply(lambda _: unit_emissions_with_rel_scores, axis=1)
    unit_egrid['ReliabilitySource'] = unit_egrid[rel_score_cols].values.tolist()
    unit_egrid['FlowAmount'] = unit_egrid[flows_used_for_weighting].values.tolist()
    unit_egrid = unit_egrid.drop(columns=rel_score_cols + flows_used_for_weighting)
    unit_egrid = unit_egrid.set_index(list(unit_egrid.columns
                                           .difference(['FlowName',
                                                        'ReliabilitySource',
                                                        'FlowAmount']))
                                      ).apply(pd.Series.explode).reset_index()

    dq_mapping = pd.read_csv(eGRID_DATA_DIR
                             .joinpath('eGRID_unit_level_reliability_scores.csv'))
    unit_egrid = unit_egrid.merge(dq_mapping, how='left')

    # Aggregate data reliability scores by facility and flow
    rel_scores_by_facility = aggregate(unit_egrid, grouping_vars=['FacilityID', 'FlowName'])
    rel_scores_by_facility = rel_scores_by_facility.drop(columns=['FlowAmount'])

    # Merge in Heat/NOx/SO2/CO2 reliability scores calculated from the unit sheet
    flowbyfac = flowbyfac.merge(rel_scores_by_facility,
                                on=['FacilityID', 'FlowName'], how='left')
    # Assign electricity a reliability score of 1
    flowbyfac.loc[flowbyfac['FlowName'] == 'Electricity', 'DataReliability'] = 1
    flowbyfac['DataReliability'] = flowbyfac['DataReliability'].fillna(5)

    # Methane and nitrous oxide reliability scores
    # Assign 3 where measurements are taken (selected primary fuel types);
    # all other fuel types are changed to 2
    is_ch4_n2o = ((flowbyfac['FlowName'] == 'Methane') |
                  (flowbyfac['FlowName'] == 'Nitrous oxide'))
    flowbyfac.loc[is_ch4_n2o, 'DataReliability'] = 3
    flowbyfac.loc[is_ch4_n2o &
                  ~flowbyfac['Plant primary fuel'].isin(
                      ['PG', 'RC', 'WC', 'SLW']),
                  'DataReliability'] = 2

    # Import flow compartments
    flow_compartments = pd.read_csv(eGRID_DATA_DIR
                                    .joinpath('eGRID_flow_compartments.csv'),
                                    header=0)
    flowbyfac = pd.merge(flowbyfac, flow_compartments, on='FlowName', how='left')

    # Drop unneeded columns
    flowbyfac = flowbyfac.drop(columns=['Plant primary fuel', 'OriginalName'])

    # Write flowbyfacility file to output
    store_inventory(flowbyfac, 'eGRID_' + year, 'flowbyfacility')

    # Creation of the facility file
    # Need to change column names manually
    egrid_fields = filter_fields('eGRID_required_fields.csv', 'facility_fields')
    egrid_fac_fields = [c for c in egrid if c in (egrid_fields +
                                                  StewiFormat.FACILITY.fields())]

    facility = egrid[egrid_fac_fields].reset_index(drop=True)

    # Starting in 2018, resource mix is reported as a fraction;
    # multiply by 100 for consistency with earlier years' percentages
    if int(year) >= 2018:
        facility.loc[:, facility.columns.str.contains('resource mix')] *= 100

    log.debug(len(facility))
    #2019: 11865
    #2018: 10964
    #2016: 9709
    #2014: 8503
    store_inventory(facility, 'eGRID_' + year, 'facility')

    # Write flows file
    flows = flowbyfac[['FlowName', 'Compartment', 'Unit']]
    flows = flows.drop_duplicates()
    flows = flows.sort_values(by='FlowName', axis=0)
    store_inventory(flows, 'eGRID_' + year, 'flow')

    validate_eGRID(year, flowbyfac)
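
The list-column reshaping applied to unit_egrid above can be seen in isolation in this self-contained sketch (column names are illustrative):

import pandas as pd

df = pd.DataFrame({'FacilityID': [1, 2],
                   'Heat': [10.0, 20.0],
                   'Carbon dioxide': [5.0, 6.0]})
# Pack the parallel wide columns into equal-length list columns...
df['FlowName'] = df.apply(lambda _: ['Heat', 'Carbon dioxide'], axis=1)
df['FlowAmount'] = df[['Heat', 'Carbon dioxide']].values.tolist()
df = df.drop(columns=['Heat', 'Carbon dioxide'])
# ...then explode them together: one row per facility/flow pair
long_df = (df.set_index('FacilityID')
             .apply(pd.Series.explode)
             .reset_index())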
Example No. 5
def Generate_RCRAInfo_files_csv(report_year):
    """Generate stewi inventory files from downloaded data files."""
    log.info(f'generating inventory files for {report_year}')
    filepath = DIR_RCRA_BY_YEAR.joinpath(
        f'br_reporting_{str(report_year)}.csv')
    # Get columns to keep
    fieldstokeep = pd.read_csv(
        RCRA_DATA_PATH.joinpath('RCRA_required_fields.txt'), header=None)
    # on_bad_lines requires pandas >= 1.3
    df = pd.read_csv(filepath,
                     header=0,
                     usecols=list(fieldstokeep[0]),
                     low_memory=False,
                     on_bad_lines='skip',
                     encoding='ISO-8859-1')

    log.info(f'completed reading {filepath}')
    # Check waste generation data health: keep only records with numeric tonnage
    df = df[pd.to_numeric(df['Generation Tons'], errors='coerce').notnull()]
    df['Generation Tons'] = df['Generation Tons'].astype(float)
    log.debug(f'number of records: {len(df)}')
    # Reassign the NAICS to a string
    df['NAICS'] = df['Primary NAICS'].astype('str')
    df.drop(columns=['Primary NAICS'], inplace=True)
    # Create field for DQI Reliability Score with fixed value from CSV
    rcrainfo_reliability_table = get_reliability_table_for_source('RCRAInfo')
    df['DataReliability'] = float(
        rcrainfo_reliability_table['DQI Reliability Score'])
    # Create a new field to put converted amount in
    df['Amount_kg'] = 0.0
    # Convert amounts from tons. Note this could be replaced with a conversion utility
    df['Amount_kg'] = USton_kg * df['Generation Tons']
    # Read in waste descriptions
    linewidthsdf = pd.read_csv(
        RCRA_DATA_PATH.joinpath('RCRAInfo_LU_WasteCode_LineComponents.csv'))
    names = linewidthsdf['Data Element Name']
    try:
        wastecodesfile = [
            file for file in OUTPUT_PATH.glob('*lu_waste_code*.csv')
        ][0]
    except IndexError:
        log.exception('waste codes file missing, download and unzip waste code'
                      f' file to {OUTPUT_PATH}')
        raise
    waste_codes = pd.read_csv(wastecodesfile, header=0, names=names)
    # Remove rows where any of these fields are NA (the description is missing)
    waste_codes = waste_codes[[
        'Waste Code', 'Code Type', 'Waste Code Description'
    ]].dropna()
    waste_codes['Waste Code Description'] = waste_codes[
        'Waste Code Description'].apply(waste_description_cleaner)
    waste_codes = waste_codes.drop_duplicates(ignore_index=True)
    waste_codes = waste_codes[~(
        (waste_codes['Waste Code'].duplicated(False)) &
        ((waste_codes['Waste Code Description'].isna()) |
         (waste_codes['Waste Code Description'] == 'Unknown')))]
    waste_codes.rename(columns={
        'Waste Code': 'Waste Code Group',
        'Code Type': 'Waste Code Type'
    },
                       inplace=True)
    df = df.merge(waste_codes, on='Waste Code Group', how='left')

    # Replace form code with the code name
    form_code_name_file = RCRA_DATA_PATH.joinpath('RCRA_LU_FORM_CODE.csv')
    form_code_name_df = pd.read_csv(form_code_name_file,
                                    header=0,
                                    usecols=['FORM_CODE', 'FORM_CODE_NAME'])
    form_code_name_df.rename(columns={'FORM_CODE': 'Form Code'}, inplace=True)
    df = df.merge(form_code_name_df, on='Form Code', how='left')

    df['FlowName'] = df['Waste Code Description']

    # If there is no useful waste code, fill FlowName with the Form Code Name
    # Where FlowName is NA, mark the source as Form Code; otherwise Waste Code
    df.loc[df['FlowName'].isnull(), 'FlowNameSource'] = 'Form Code'
    df.loc[df['FlowNameSource'].isnull(), 'FlowNameSource'] = 'Waste Code'
    # Set FlowIDs to the appropriate code
    df.loc[df['FlowName'].isnull(), 'FlowID'] = df['Form Code']
    df.loc[df['FlowID'].isnull(), 'FlowID'] = df['Waste Code Group']
    df['FlowName'] = df['FlowName'].fillna(df['FORM_CODE_NAME'])
    df = df.dropna(subset=['FlowID']).reset_index(drop=True)
    drop_fields = [
        'Generation Tons', 'Management Method', 'Waste Description',
        'Waste Code Description', 'FORM_CODE_NAME'
    ]
    df.drop(columns=drop_fields, inplace=True)
    # Rename cols used by multiple tables
    df.rename(columns={
        'Handler ID': 'FacilityID',
        'Amount_kg': 'FlowAmount'
    },
              inplace=True)

    # Prepare flows file
    flows = df[['FlowName', 'FlowID', 'FlowNameSource']]
    flows = flows.drop_duplicates(ignore_index=True)
    # Sort them by the flow names
    flows.sort_values(by='FlowName', axis=0, inplace=True)
    store_inventory(flows, 'RCRAInfo_' + report_year, 'flow')

    # Prepare facilities file
    facilities = df[[
        'FacilityID', 'Handler Name', 'Location Street Number',
        'Location Street 1', 'Location Street 2', 'Location City',
        'Location State', 'Location Zip', 'County Name', 'NAICS',
        'Generator ID Included in NBR'
    ]].reset_index(drop=True)
    facilities.drop_duplicates(inplace=True, ignore_index=True)
    facilities['Address'] = facilities[[
        'Location Street Number', 'Location Street 1', 'Location Street 2'
    ]].apply(lambda x: ' '.join(x.dropna()), axis=1)
    facilities.drop(columns=[
        'Location Street Number', 'Location Street 1', 'Location Street 2'
    ],
                    inplace=True)
    facilities.rename(columns={
        'Handler Name': 'FacilityName',
        'Location City': 'City',
        'Location State': 'State',
        'Location Zip': 'Zip',
        'County Name': 'County'
    },
                      inplace=True)
    store_inventory(facilities, 'RCRAInfo_' + report_year, 'facility')
    # Prepare flow by facility
    flowbyfacility = aggregate(df, [
        'FacilityID', 'FlowName', 'Source Code',
        'Generator Waste Stream Included in NBR'
    ])
    store_inventory(flowbyfacility, 'RCRAInfo_' + report_year,
                    'flowbyfacility')

    validate_state_totals(report_year, flowbyfacility)

    # Record metadata
    generate_metadata(report_year, filepath, datatype='inventory')
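
A minimal usage sketch (assuming br_reporting_2019.csv has already been downloaded to DIR_RCRA_BY_YEAR and the waste code lookup file unzipped to OUTPUT_PATH; the year is illustrative):

# Generate the 2019 RCRAInfo flow, facility, and flowbyfacility outputs.
Generate_RCRAInfo_files_csv('2019')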
Example No. 6
def main(**kwargs):

    parser = argparse.ArgumentParser(argument_default=argparse.SUPPRESS)

    parser.add_argument('Option',
                        help='What do you want to do:\
                        [A] Download and save GHGRP data\
                        [B] Generate inventory files for StEWI and validate\
                        [C] Download national totals data for validation',
                        type=str)

    parser.add_argument('-Y',
                        '--Year',
                        nargs='+',
                        help='What GHGRP year do you want to retrieve',
                        type=str)

    if len(kwargs) == 0:
        kwargs = vars(parser.parse_args())

    for year in kwargs['Year']:
        pickle_file = OUTPUT_PATH.joinpath(f'GHGRP_{year}.pk')
        if kwargs['Option'] == 'A':

            m = MetaGHGRP()
            download_excel_tables(m)

            # download subpart emissions tables for report year and save locally
            # parse subpart emissions data to match standardized EPA format
            ghgrp1 = download_and_parse_subpart_tables(year, m)

            # parse emissions data for subparts E, BB, CC, LL (S already accounted for)
            ghgrp2 = parse_additional_suparts_data(
                esbb_subparts_path, 'esbb_subparts_columns.csv', year)

            # parse emissions data for subpart O
            ghgrp3 = parse_subpart_O(year)

            # parse emissions data for subpart L
            ghgrp4 = parse_subpart_L(year)

            # concatenate ghgrp1, ghgrp2, ghgrp3, and ghgrp4
            ghgrp = pd.concat([ghgrp1, ghgrp2, ghgrp3,
                               ghgrp4]).reset_index(drop=True)

            # map flow descriptions to standard gas names from GHGRP
            ghg_mapping = pd.read_csv(
                GHGRP_DATA_PATH.joinpath('ghg_mapping.csv'),
                usecols=['Flow Description', 'FlowName', 'GAS_CODE'])
            ghgrp = pd.merge(ghgrp,
                             ghg_mapping,
                             on='Flow Description',
                             how='left')
            missing = ghgrp[ghgrp['FlowName'].isna()]
            if len(missing) > 0:
                log.warning('some flows are unmapped')
            ghgrp.drop('Flow Description', axis=1, inplace=True)

            # rename certain columns for consistency
            ghgrp.rename(columns={
                'FACILITY_ID': 'FacilityID',
                'NAICS_CODE': 'NAICS',
                'GAS_CODE': 'FlowCode'
            },
                         inplace=True)

            # pickle data and save to network
            log.info(f'saving processed GHGRP data to {pickle_file}')
            ghgrp.to_pickle(pickle_file)

            generate_metadata(year, m, datatype='source')

        if kwargs['Option'] == 'B':
            log.info(f'extracting data from {pickle_file}')
            ghgrp = pd.read_pickle(pickle_file)

            # import data reliability scores
            ghgrp_reliability_table = get_reliability_table_for_source(
                'GHGRPa')

            # add reliability scores
            ghgrp = pd.merge(ghgrp,
                             ghgrp_reliability_table,
                             left_on='METHOD',
                             right_on='Code',
                             how='left')

            # fill NAs with 5 for DQI reliability score
            ghgrp['DQI Reliability Score'] = ghgrp[
                'DQI Reliability Score'].fillna(value=5)

            # convert metric tons to kilograms
            ghgrp['FlowAmount'] = 1000 * ghgrp['FlowAmount'].astype('float')

            # rename reliability score column for consistency
            ghgrp.rename(columns={
                'DQI Reliability Score': 'DataReliability',
                'SUBPART_NAME': 'Process',
                'FlowCode': 'FlowID'
            },
                         inplace=True)
            ghgrp['ProcessType'] = 'Subpart'

            log.info('generating flowbysubpart output')

            # generate flowbysubpart
            ghgrp_fbs = ghgrp[StewiFormat.FLOWBYPROCESS.subset_fields(
                ghgrp)].reset_index(drop=True)
            ghgrp_fbs = aggregate(
                ghgrp_fbs,
                ['FacilityID', 'FlowName', 'Process', 'ProcessType'])
            store_inventory(ghgrp_fbs, 'GHGRP_' + year, 'flowbyprocess')

            log.info('generating flowbyfacility output')
            ghgrp_fbf = ghgrp[StewiFormat.FLOWBYFACILITY.subset_fields(
                ghgrp)].reset_index(drop=True)

            # aggregate instances of more than one flow for same facility and flow type
            ghgrp_fbf = aggregate(ghgrp_fbf, ['FacilityID', 'FlowName'])
            store_inventory(ghgrp_fbf, 'GHGRP_' + year, 'flowbyfacility')

            log.info('generating flows output')
            flow_columns = ['FlowName', 'FlowID']
            ghgrp_flow = ghgrp[flow_columns].drop_duplicates()
            ghgrp_flow.dropna(subset=['FlowName'], inplace=True)
            ghgrp_flow.sort_values(by=['FlowID', 'FlowName'], inplace=True)
            ghgrp_flow['Compartment'] = 'air'
            ghgrp_flow['Unit'] = 'kg'
            store_inventory(ghgrp_flow, 'GHGRP_' + year, 'flow')

            log.info('generating facilities output')
            facilities_df = get_facilities(
                data_summaries_path.joinpath(f'ghgp_data_{year}.xlsx'))

            # add facility information based on facility ID
            ghgrp = ghgrp.merge(facilities_df, on='FacilityID', how='left')

            # generate facilities output and save to network
            ghgrp_facility = ghgrp[StewiFormat.FACILITY.subset_fields(
                ghgrp)].drop_duplicates()
            ghgrp_facility.dropna(subset=['FacilityName'], inplace=True)
            # ensure NAICS does not have trailing decimal/zero
            ghgrp_facility['NAICS'] = ghgrp_facility['NAICS'].fillna(0)
            ghgrp_facility['NAICS'] = ghgrp_facility['NAICS'].astype(
                int).astype(str)
            ghgrp_facility.loc[ghgrp_facility['NAICS'] == '0', 'NAICS'] = None
            ghgrp_facility.sort_values(by=['FacilityID'], inplace=True)
            store_inventory(ghgrp_facility, 'GHGRP_' + year, 'facility')

            validate_national_totals_by_subpart(ghgrp, year)

            # Record metadata compiled from all GHGRP files and tables
            generate_metadata(year, m=None, datatype='inventory')

        elif kwargs['Option'] == 'C':
            log.info('generating national totals for validation')
            validation_table = 'V_GHG_EMITTER_SUBPART'
            generate_national_totals_validation(validation_table, year)
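
A minimal invocation sketch (keyword arguments bypass argparse, as in the other examples; the year is illustrative):

main(Option='A', Year=['2019'])  # download and pickle the parsed GHGRP data
main(Option='B', Year=['2019'])  # generate and validate the StEWI outputs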
Example No. 7
def main(**kwargs):

    parser = argparse.ArgumentParser(argument_default=argparse.SUPPRESS)

    parser.add_argument('Option',
                        help='What do you want to do:\
                        [A] Download NEI data and \
                            generate StEWI inventory outputs and validate \
                            to national totals\
                        [B] Download national totals',
                        type=str)

    parser.add_argument('-Y',
                        '--Year',
                        nargs='+',
                        help='What NEI year(s) you want to retrieve',
                        type=str)

    if len(kwargs) == 0:
        kwargs = vars(parser.parse_args())

    for year in kwargs['Year']:
        if kwargs['Option'] == 'A':

            nei_point = standardize_output(year)

            log.info('generating flow by facility output')
            nei_flowbyfacility = aggregate(nei_point,
                                           ['FacilityID', 'FlowName'])
            store_inventory(nei_flowbyfacility, 'NEI_' + year,
                            'flowbyfacility')
            log.debug(len(nei_flowbyfacility))
            #2017: 2184786
            #2016: 1965918
            #2014: 2057249
            #2011: 1840866

            log.info('generating flow by SCC output')
            nei_flowbyprocess = aggregate(
                nei_point, ['FacilityID', 'FlowName', 'Process'])
            nei_flowbyprocess['ProcessType'] = 'SCC'
            store_inventory(nei_flowbyprocess, 'NEI_' + year, 'flowbyprocess')
            log.debug(len(nei_flowbyprocess))
            #2017: 4055707

            log.info('generating flows output')
            nei_flows = nei_point[['FlowName', 'FlowID', 'Compartment']]
            nei_flows = nei_flows.drop_duplicates()
            nei_flows['Unit'] = 'kg'
            nei_flows = nei_flows.sort_values(by='FlowName', axis=0)
            store_inventory(nei_flows, 'NEI_' + year, 'flow')
            log.debug(len(nei_flows))
            #2017: 293
            #2016: 282
            #2014: 279
            #2011: 277

            log.info('generating facility output')
            facility = nei_point[[
                'FacilityID', 'FacilityName', 'Address', 'City', 'State',
                'Zip', 'Latitude', 'Longitude', 'NAICS', 'County'
            ]]
            facility = facility.drop_duplicates('FacilityID')
            facility = facility.astype({'Zip': 'str'})
            store_inventory(facility, 'NEI_' + year, 'facility')
            log.debug(len(facility))
            #2017: 87162
            #2016: 85802
            #2014: 85125
            #2011: 95565

            generate_metadata(year, datatype='inventory')

            if year in ['2011', '2014', '2017']:
                validate_national_totals(nei_flowbyfacility, year)
            else:
                log.info('no validation performed')

        elif kwargs['Option'] == 'B':
            if year in ['2011', '2014', '2017']:
                generate_national_totals(year)
            else:
                log.info(f'national totals do not exist for year {year}')
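
A minimal invocation sketch (the years are illustrative; national totals must exist for validation to run):

main(Option='B', Year=['2017'])          # download national totals first
main(Option='A', Year=['2017', '2014'])  # build outputs; 2017 and 2014 validate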