Example #1
def combine_DMR_inventory(year, nutrient=''):
    """Loop through pickled data and combines into a dataframe."""
    path = OUTPUT_PATH.joinpath(year)
    if not path.is_dir():
        raise stewi.exceptions.DataNotFoundError
    output_df = pd.DataFrame()
    filestub = ''
    if nutrient:
        filestub = nutrient + '_'
        log.info(f'reading stored DMR queries by state for {nutrient}...')
    else:
        log.info('reading stored DMR queries by state...')
    for state in STATES:
        log.debug(f'accessing data for {state}')
        filepath = path.joinpath(f'{filestub}state_{state}.pickle')
        result = unpickle(filepath)
        if result is None:
            log.warning(f'No data found for {state}. Retrying query...')
            if (query_dmr(year=year, sic_list=None,
                         state_list=[state],
                         nutrient=nutrient).get(state) == 'success'):
                result = unpickle(filepath)
        if result is not None:
            output_df = pd.concat([output_df, result], ignore_index=True)
    return output_df
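A minimal usage sketch (hypothetical call site); it assumes the surrounding DMR module provides OUTPUT_PATH, STATES, log, unpickle, and query_dmr, and that per-state query results for the year have already been pickled:

# Hypothetical usage; relies on the module-level constants and helpers above.
year = '2019'
all_params_df = combine_DMR_inventory(year)              # all parameters
nitrogen_df = combine_DMR_inventory(year, nutrient='N')  # nutrient-aggregated queries
print(f'{len(all_params_df)} combined records for {year}')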
Example #2
def check_for_file(filepath: Path, state) -> bool:
    if filepath.is_file():
        log.debug(f'file already exists for {state}, skipping')
        return True
    else:
        log.info(f'executing query for {state}')
        return False
Example #3
def download_chunks(table,
                    table_count,
                    m,
                    row_start=0,
                    report_year='',
                    filepath=''):
    """Download data from envirofacts in chunks."""
    # Generate URL for each 5,000 row grouping and add to DataFrame
    output_table = pd.DataFrame()
    while row_start <= table_count:
        row_end = row_start + 4999
        table_url = generate_url(table=table,
                                 report_year=report_year,
                                 row_start=row_start,
                                 row_end=row_end,
                                 output_ext='csv')
        log.debug(f'url: {table_url}')
        table_temp, temp_time = import_table(table_url, get_time=True)
        output_table = pd.concat([output_table, table_temp])
        row_start += 5000
    m.add(time=temp_time,
          url=generate_url(table,
                           report_year=report_year,
                           row_start='',
                           output_ext='csv'),
          filetype='Database',
          filename=filepath)
    if filepath:
        output_table.to_csv(filepath, index=False)
    return output_table
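The loop above walks the table in 5,000-row windows (rows 0-4999, 5000-9999, ...) until row_start passes table_count. A self-contained sketch of just that window arithmetic, with no Envirofacts calls:

# Illustration of the chunk windows requested above.
table_count = 12345
row_start = 0
while row_start <= table_count:
    row_end = row_start + 4999
    print(f'requesting rows {row_start}-{row_end}')  # 0-4999, 5000-9999, 10000-14999
    row_start += 5000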
Example #4
def map_to_fedefl(df):
    try:
        import fedelemflowlist
    except ImportError:
        log.warning('requires installation of fedelemflowlist, data will not '
                    'validate correctly')
        return None
    tri = fedelemflowlist.get_flowmapping('TRI')
    tri = tri[['SourceFlowName', 'TargetFlowName']].drop_duplicates()
    mapped_df = df.merge(tri,
                         how='left',
                         left_on='FlowName',
                         right_on='SourceFlowName')
    if mapped_df['FlowAmount'].sum() != df['FlowAmount'].sum():
        log.warning('Error on mapping, data loss')
    # validation throws errors when mixture and trade secret chemicals are
    # maintained so drop them while they remain unmapped
    criteria = (mapped_df['TargetFlowName'].isna() &
                (mapped_df['FlowName'].str.lower().str.contains('trade secret')
                 | mapped_df['FlowName'].str.lower().str.contains('mixture')))
    mapped_df = mapped_df[~criteria].reset_index(drop=True)
    missing_flows = mapped_df[mapped_df['TargetFlowName'].isna()]['FlowName']
    missing_flows = missing_flows.drop_duplicates().sort_values()
    if len(missing_flows) > 0:
        log.debug('flows from reference df missing in mapping file')
    mapped_df.loc[~mapped_df['TargetFlowName'].isna(),
                  'FlowName'] = mapped_df['TargetFlowName']
    mapped_df = mapped_df.drop(columns=['SourceFlowName', 'TargetFlowName'])
    return mapped_df
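A hedged usage sketch for map_to_fedefl; it assumes fedelemflowlist is installed, and the flow names below are illustrative rather than verified entries in its TRI mapping:

# Illustrative only; flow names are assumptions, not verified TRI source names.
import pandas as pd
toy = pd.DataFrame({'FlowName': ['Ammonia', 'Mixture of unknown solvents'],
                    'FlowAmount': [10.0, 5.0]})
mapped = map_to_fedefl(toy)
if mapped is not None:
    print(mapped[['FlowName', 'FlowAmount']])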
Example #5
def addChemicalMatches(inventories_df):
    """Add data for chemical matches to inventory or combined inventory df."""
    inventory_list = list(inventories_df['Source'].unique())
    chemicalmatches = chemicalmatcher.get_matches_for_StEWI(
        inventory_list=inventory_list)
    chemicalmatches = chemicalmatches[
        chemicalmatches['Source'].isin(inventory_list)]
    chemicalmatches = chemicalmatches.drop(columns=['FlowID'])
    chemicalmatches = chemicalmatches.drop_duplicates(subset=['FlowName',
                                                              'Source'])
    inventories = pd.merge(inventories_df,
                           chemicalmatches,
                           on=['FlowName', 'Source'],
                           how='left')
    # Compare unmatched flows to flows_missing_SRS_ list to ensure none missing
    missing_flows = inventories.loc[
        inventories['SRS_ID'].isna()][['FlowName', 'Source']].drop_duplicates()
    cm_missing = chemicalmatcher.read_cm_file('missing')
    missing_flows = missing_flows.assign(
        missing=~missing_flows['FlowName'].isin(cm_missing['FlowName']))
    if sum(missing_flows.missing) > 0:
        log.warning('New unknown flows identified, run chemicalmatcher')
        log.debug(missing_flows[missing_flows['missing']].to_string())

    return inventories
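A minimal sketch of calling addChemicalMatches on a combined inventory frame; 'FlowName' and 'Source' columns are required, the chemicalmatcher module used above must be importable, and the flow names are illustrative:

# Hypothetical input; real combined inventories carry additional columns.
import pandas as pd
combined = pd.DataFrame({'FlowName': ['Ammonia', 'Lead'],
                         'Source': ['NEI', 'TRI'],
                         'FlowAmount': [1.0, 2.0]})
combined_with_srs = addChemicalMatches(combined)  # adds SRS_ID where matched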
Example #6
def standardize_output(year, source='Point'):
    """Read and parses NEI data.

    :param year : str, Year of NEI dataset
    :returns nei: DataFrame of parsed NEI data.
    """
    nei = pd.DataFrame()
    # read in nei files and concatenate all nei files into one dataframe
    nei_file_path = _config[year]['file_name']
    for file in nei_file_path:
        filename = OUTPUT_PATH.joinpath(file)
        if not filename.is_file():
            log.info(f'{file} not found in {OUTPUT_PATH}, '
                     'downloading source data')
            # download source file and metadata
            file_meta = set_stewi_meta(strip_file_extension(file))
            file_meta.category = EXT_DIR
            file_meta.tool = file_meta.tool.lower()
            download_from_remote(file_meta, paths)
        # concatenate this file with any previously read files
        log.info(f'reading NEI data from {filename}')
        nei = pd.concat([nei, read_data(year, filename)])
        log.debug(f'{str(len(nei))} records')
    # convert TON to KG
    nei['FlowAmount'] = nei['FlowAmount'] * USton_kg

    log.info('adding Data Quality information')
    if source == 'Point':
        nei_reliability_table = get_reliability_table_for_source('NEI')
        nei_reliability_table['Code'] = nei_reliability_table['Code'].astype(
            float)
        nei['ReliabilityScore'] = nei['ReliabilityScore'].astype(float)
        nei = nei.merge(nei_reliability_table,
                        left_on='ReliabilityScore',
                        right_on='Code',
                        how='left')
        nei['DataReliability'] = nei['DQI Reliability Score']
        # drop Code and DQI Reliability Score columns
        nei = nei.drop(
            columns=['Code', 'DQI Reliability Score', 'ReliabilityScore'])

        nei['Compartment'] = 'air'
        """
        # Modify compartment based on stack height (ft)
        nei.loc[nei['StackHeight'] < 32, 'Compartment'] = 'air/ground'
        nei.loc[(nei['StackHeight'] >= 32) & (nei['StackHeight'] < 164),
                'Compartment'] = 'air/low'
        nei.loc[(nei['StackHeight'] >= 164) & (nei['StackHeight'] < 492),
                'Compartment'] = 'air/high'
        nei.loc[nei['StackHeight'] >= 492, 'Compartment'] = 'air/very high'
        """
    else:
        nei['DataReliability'] = 3
    # add Source column
    nei['Source'] = source
    nei.reset_index(drop=True, inplace=True)
    return nei
Example #7
def getInventoriesforFacilityMatches(inventory_dict, facilitymatches,
                                     filter_for_LCI, base_inventory=None):
    """Retrieve stored flowbyfacility datasets based on passed dictionary.

    Filters them if necessary. Returns only those facilities with an FRS_ID
    except for those in the base_inventory where all are returned.
    :param inventory_dict:
    :param facilitymatches: dataframe matching FacilityMatches format
    :param filter_for_LCI:
    :param base_inventory:
    """
    if base_inventory is not None:
        # Identify the FRS in the base inventory and keep only those
        # base_inventory_FRS = facilitymatches[
        #     facilitymatches['Source'] == base_inventory]
        base_FRS_list = list(pd.unique(facilitymatches[
            facilitymatches['Source'] == base_inventory]['FRS_ID']))

    columns_to_keep = StewiFormat.FLOWBYFACILITY.fields() + ['Source',
                                                             'Year', 'FRS_ID']
    inventories = pd.DataFrame()
    filters = None
    if filter_for_LCI:
        filters = ['filter_for_LCI']
    for k in inventory_dict.keys():
        inventory = stewi.getInventory(k, inventory_dict[k],
                                       'flowbyfacility',
                                       filters)
        if inventory is None:
            continue
        inventory["Source"] = k
        # Merge in FRS_ID, ensure only single FRS added per facility ID, keeping
        # first listed
        facmatches = facilitymatches[facilitymatches['Source'] == k]
        facmatches = facmatches.drop_duplicates(subset=['FacilityID', 'Source'],
                                                keep='first')
        inventory = pd.merge(inventory,
                             facmatches,
                             on=['FacilityID', 'Source'], how='left')
        if inventory['FRS_ID'].isna().sum() > 0:
            log.debug('Some facilities missing FRS_ID')

        # If this isn't the base inventory, filter records for facilities not
        # found in the base inventory
        if base_inventory is not None and k != base_inventory:
            inventory = inventory[inventory['FRS_ID'].isin(
                base_FRS_list)]

        # Add metadata
        inventory["Year"] = inventory_dict[k]
        cols_to_keep = [c for c in columns_to_keep if c in inventory]
        inventory = inventory[cols_to_keep]
        inventories = pd.concat([inventories, inventory], ignore_index=True)

    return inventories
Example #8
def download_data(url_params, filepath: Path, sic_list) -> str:
    df = pd.DataFrame()
    if sic_list:
        skip_errors = True
    else:
        skip_errors = False
        sic_list = ['']
    for sic in sic_list:
        url_params['p_sic2'] = sic
        counter = 1
        pages = 1
        while counter <= pages:
            url_params['pageno'] = counter
            url = generate_url(url_params)
            log.debug(url)
            for attempt in range(3):
                try:
                    r = requests.get(url)
                    r.raise_for_status()
                    result = pd.DataFrame(r.json())
                    break
                except requests.exceptions.HTTPError as err:
                    log.info(err)
                    time.sleep(20)
            else:
                log.warning("exceeded max attempts")
                return 'other_error'
            if 'Error' in result.index:
                if skip_errors:
                    log.debug(f"error in sic_{sic}")
                    break
                elif result['Results'].astype(str).str.contains('Maximum').any():
                    return 'max_error'
                else:
                    return 'other_error'
            elif 'NoDataMsg' in result.index:
                if skip_errors:
                    log.debug(f"no data in sic_{sic}")
                    break
                else:
                    return 'no_data'
            else:
                df = pd.concat([df, pd.DataFrame(result['Results']['Results'])],
                               ignore_index=True)
                # set page count
                pages = int(result['Results']['PageCount'])
                counter += 1
    log.debug(f"saving to {filepath}")
    pd.to_pickle(df, filepath)
    return 'success'
Example #9
def get_SRSInfo_for_program_list(inventory):
    # See all lists
    # https://cdxnodengn.epa.gov/cdx-srs-rest/reference/substance_lists
    # Base URL for queries
    substancesbylistname = 'substances/list_acronym/'
    srs_flow_df = pd.DataFrame()
    for listname in inventory_to_SRSlist_acronymns[inventory]:
        log.debug('Getting %s', listname)
        lists_of_interest = obtain_list_names(listname)
        url = base + substancesbylistname + urllib.parse.quote(listname)
        flow_info = query_SRS_for_program_list(url, inventory,
                                               lists_of_interest)
        if len(flow_info) == 0:
            log.info(f'No flows found for {listname}')
        srs_flow_df = pd.concat([srs_flow_df, flow_info])
    srs_flow_df.drop_duplicates(inplace=True)
    if (inventory == 'TRI'):
        srs_flow_df['PGM_ID'] = srs_flow_df['PGM_ID'].apply(
            lambda x: str(x).lstrip('0'))
    srs_flow_df.sort_values(by='PGM_ID', inplace=True)
    return srs_flow_df
Example #10
def main(**kwargs):
    parser = argparse.ArgumentParser(argument_default=argparse.SUPPRESS)

    parser.add_argument('Option',
                        help='What do you want to do:\
                        [A] Download DMR files from web\
                        [B] Generate StEWI inventory outputs and\
                            validate to state totals\
                        [C] Download state totals',
                        type=str)

    parser.add_argument('-Y', '--Year', nargs='+',
                        help='What DMR year(s) you want to retrieve',
                        type=str)

    if len(kwargs) == 0:
        kwargs = vars(parser.parse_args())

    for year in kwargs['Year']:

        if kwargs['Option'] == 'A':
            log.info(f"Querying for {year}")

            # Two-digit SIC codes from the advanced search drop-down,
            # stripped and formatted as a list
            sic2 = list(pd.read_csv(DMR_DATA_PATH.joinpath('2_digit_SIC.csv'),
                        dtype={'SIC2': str})['SIC2'])
            # Query by state, then by SIC-state where necessary
            result_dict = query_dmr(year=year)
            log.debug('possible errors: ' + ', '.join(
                [s for s in result_dict.keys()
                 if result_dict[s] != 'success']))
            state_max_error_list = [s for s in result_dict.keys()
                                    if result_dict[s] == 'max_error']
            state_no_data_list = [s for s in result_dict.keys()
                                  if result_dict[s] == 'no_data']
            if (len(state_max_error_list) == 0) and (len(state_no_data_list) == 0):
                log.info('all states successfully downloaded')
            else:
                if (len(state_max_error_list) > 0):
                    log.error(f"Max error: {' '.join(state_max_error_list)}")
                if (len(state_no_data_list) > 0):
                    log.error(f"No data error: {' '.join(state_no_data_list)}")
                log.info('Breaking up queries further by SIC')
                result_dict = query_dmr(year=year, sic_list=sic2,
                                        state_list=state_max_error_list)
                sic_state_max_error_list = [s for s in result_dict.keys()
                                            if result_dict[s] == 'max_error']
                if len(sic_state_max_error_list) > 0:
                    log.error(f"Max error: {' '.join(sic_state_max_error_list)}")

            log.info(f"Querying nutrients for {year}")
            # Query aggregated nutrients data
            for nutrient in ['N', 'P']:
                result_dict = query_dmr(year=year, nutrient=nutrient)
                log.debug('possible errors: ' + ', '.join(
                    [s for s in result_dict.keys()
                     if result_dict[s] != 'success']))
                state_max_error_list = [s for s in result_dict.keys()
                                        if result_dict[s] == 'max_error']
                state_no_data_list = [s for s in result_dict.keys()
                                      if result_dict[s] == 'no_data']
                if (len(state_max_error_list) == 0) and (len(state_no_data_list) == 0):
                    log.info(f'all states successfully downloaded for {nutrient}')
                else:
                    result_dict = query_dmr(year=year, sic_list=sic2,
                                            state_list=state_max_error_list,
                                            nutrient=nutrient)
            # write metadata
            generate_metadata(year, datatype='source')

        if kwargs['Option'] == 'B':
            log.info(f'generating inventories for DMR {year}')
            state_df = combine_DMR_inventory(year)
            state_df = filter_states(standardize_df(state_df))

            # Validation against state totals is done prior to combining
            # with aggregated nutrients
            validate_state_totals(state_df, year)

            P_df = combine_DMR_inventory(year, nutrient='P')
            N_df = combine_DMR_inventory(year, nutrient='N')

            nut_drop_list = read_pollutant_parameter_list()
            nut_drop_list = nut_drop_list[(nut_drop_list['NITROGEN'] == 'Y') |
                                          (nut_drop_list['PHOSPHORUS'] == 'Y')]
            nut_drop_list = list(set(nut_drop_list['FlowName']))

            # Consolidate N and P based flows to reflect nutrient aggregation
            P_df = consolidate_nutrients(P_df, nut_drop_list, 'P')
            N_df = consolidate_nutrients(N_df, nut_drop_list, 'N')

            nutrient_agg_df = pd.concat([P_df, N_df])
            nutrient_agg_df = filter_states(standardize_df(nutrient_agg_df))

            # Filter out nitrogen and phosphorus flows before combining
            # with aggregated nutrients
            dmr_nut_filtered = state_df[~state_df['FlowName'].isin(nut_drop_list)]
            dmr_df = pd.concat([dmr_nut_filtered,
                                nutrient_agg_df]).reset_index(drop=True)

            # PermitTypeCode needed for state validation but not maintained
            dmr_df = dmr_df.drop(columns=['PermitTypeCode'])

            # generate output for facility
            facility_columns = ['FacilityID', 'FacilityName', 'City',
                                'State', 'Zip', 'Latitude', 'Longitude',
                                'County', 'NAICS', 'SIC'] # 'Address' not in DMR
            dmr_facility = dmr_df[facility_columns].drop_duplicates()
            store_inventory(dmr_facility, 'DMR_' + year, 'facility')

            # generate output for flow
            flow_columns = ['FlowID', 'FlowName']
            dmr_flow = dmr_df[flow_columns].drop_duplicates()
            dmr_flow.sort_values(by=['FlowName'], inplace=True)
            dmr_flow['Compartment'] = 'water'
            dmr_flow['Unit'] = 'kg'
            store_inventory(dmr_flow, 'DMR_' + year, 'flow')

            # generate output for flowbyfacility
            fbf_columns = ['FlowName', 'FlowAmount', 'FacilityID',
                           'DataReliability']
            dmr_fbf = dmr_df[fbf_columns].reset_index(drop=True)
            dmr_fbf = aggregate(dmr_fbf, ['FacilityID', 'FlowName'])
            dmr_fbf['Compartment'] = 'water'
            dmr_fbf['Unit'] = 'kg'
            store_inventory(dmr_fbf, 'DMR_' + year, 'flowbyfacility')

            # write metadata
            generate_metadata(year, datatype='inventory')

        if kwargs['Option'] == 'C':
            download_state_totals_validation(year)
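A hedged invocation sketch for this main(); the script name is hypothetical, but the positional Option and -Y/--Year arguments follow the parser defined above:

# Hypothetical command lines, assuming this main() is the DMR script entry point:
#   python dmr.py A -Y 2019 2020   # download DMR files from the web
#   python dmr.py B -Y 2019        # generate StEWI outputs and validate
# Equivalent direct call, bypassing argparse:
main(Option='B', Year=['2019'])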
Example #11
def generate_eGRID_files(year):
    """Parse a local eGRID file to generate StEWI output files.

    :param year: str, Year of eGRID dataset
    """
    log.info(f'generating eGRID files for {year}')
    log.info('importing plant level emissions data')
    egrid = parse_eGRID(year, 'PLNT', 'eGRID_required_fields.csv')

    flowbyfac_fields = filter_fields('eGRID_required_fields.csv', 'flowbyfac_fields')

    flowbyfac_prelim = egrid[flowbyfac_fields]
    conversion = []
    conversion.append(flowbyfac_prelim[['FacilityID', 'Plant primary fuel']])
    conversion.append(egrid_unit_convert(
        flowbyfac_prelim[['Nitrogen oxides', 'Sulfur dioxide', 'Carbon dioxide']], USton_kg))
    conversion.append(egrid_unit_convert(
        flowbyfac_prelim[['Methane', 'Nitrous oxide']], lb_kg))
    conversion.append(egrid_unit_convert(
        flowbyfac_prelim[['Heat', 'Steam']], MMBtu_MJ))
    conversion.append(egrid_unit_convert(flowbyfac_prelim[['Electricity']], MWh_MJ))
    flowbyfac_stacked = pd.concat(conversion, axis=1)
    # Create flowbyfac
    flowbyfac = pd.melt(flowbyfac_stacked,
                        id_vars=['FacilityID', 'Plant primary fuel'],
                        value_vars=list(flowbyfac_stacked.columns[2:]),
                        var_name='FlowName', value_name='FlowAmount')

    flowbyfac = flowbyfac.dropna(subset=['FlowAmount'])
    flowbyfac['FlowAmount'] = pd.to_numeric(flowbyfac['FlowAmount'])
    flowbyfac = flowbyfac.sort_values(by=['FacilityID'], axis=0,
                                      ascending=True, inplace=False,
                                      kind='quicksort', na_position='last')

    # Read in unit sheet to get comment fields related to source of heat, NOx,
    # SO2, and CO2 emission estimates for calculating data quality information
    log.info('importing unit level data to assess data quality')
    unit_egrid = parse_eGRID(year, 'UNT', 'eGRID_unit_level_required_fields.csv')

    rel_score_cols = filter_fields('eGRID_unit_level_required_fields.csv',
                                   'reliability_flows')

    flows_used_for_weighting = filter_fields('eGRID_unit_level_required_fields.csv',
                                             'weighting_flows')

    unit_emissions_with_rel_scores = ['Heat', 'Nitrogen oxides',
                                      'Sulfur dioxide', 'Carbon dioxide']

    unit_egrid.update(unit_egrid[rel_score_cols].fillna(''))
    unit_egrid.update(unit_egrid[flows_used_for_weighting].fillna(0))
    # Generate combined columns as lists before exploding lists into multiple rows
    unit_egrid['FlowName'] = unit_egrid.apply(lambda _: unit_emissions_with_rel_scores, axis=1)
    unit_egrid['ReliabilitySource'] = unit_egrid[rel_score_cols].values.tolist()
    unit_egrid['FlowAmount'] = unit_egrid[flows_used_for_weighting].values.tolist()
    unit_egrid = unit_egrid.drop(columns=rel_score_cols + flows_used_for_weighting)
    unit_egrid = unit_egrid.set_index(list(unit_egrid.columns
                                           .difference(['FlowName',
                                                        'ReliabilitySource',
                                                        'FlowAmount']))
                                      ).apply(pd.Series.explode).reset_index()
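    # At this point each unit row has been expanded into four rows, one per flow
    # in unit_emissions_with_rel_scores, pairing that flow's reliability-source
    # column and its amount column from the original wide-format row.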

    dq_mapping = pd.read_csv(eGRID_DATA_DIR
                             .joinpath('eGRID_unit_level_reliability_scores.csv'))
    unit_egrid = unit_egrid.merge(dq_mapping, how='left')

    # Aggregate data reliability scores by facility and flow
    rel_scores_by_facility = aggregate(unit_egrid, grouping_vars=['FacilityID', 'FlowName'])
    rel_scores_by_facility = rel_scores_by_facility.drop(columns=['FlowAmount'])

    # Merge in heat_SO2_CO2_NOx reliability scores calculated from unit sheet
    flowbyfac = flowbyfac.merge(rel_scores_by_facility,
                                on=['FacilityID', 'FlowName'], how='left')
    # Assign electricity a reliability score of 1
    flowbyfac.loc[flowbyfac['FlowName'] == 'Electricity', 'DataReliability'] = 1
    flowbyfac['DataReliability'] = flowbyfac['DataReliability'].fillna(5)

    # Methane and nitrous oxide reliability scores
    # Assign 3 to all facilities except for certain fuel types where
    # measurements are taken
    flowbyfac.loc[(flowbyfac['FlowName'] == 'Methane') |
                  (flowbyfac['FlowName'] == 'Nitrous oxide'),
                  'DataReliability'] = 3
    # For all but the selected fuel types, change it to 2
    flowbyfac.loc[((flowbyfac['FlowName'] == 'Methane') |
                   (flowbyfac['FlowName'] == 'Nitrous oxide')) &
                  (~flowbyfac['Plant primary fuel'].isin(
                      ['PG', 'RC', 'WC', 'SLW'])),
                  'DataReliability'] = 2

    # Import flow compartments
    flow_compartments = pd.read_csv(eGRID_DATA_DIR
                                    .joinpath('eGRID_flow_compartments.csv'),
                                    header=0)
    flowbyfac = pd.merge(flowbyfac, flow_compartments, on='FlowName', how='left')

    # Drop unneeded columns
    flowbyfac = flowbyfac.drop(columns=['Plant primary fuel', 'OriginalName'])

    # Write flowbyfacility file to output
    store_inventory(flowbyfac, 'eGRID_' + year, 'flowbyfacility')

    # Creation of the facility file
    # Need to change column names manually
    egrid_fields = filter_fields('eGRID_required_fields.csv', 'facility_fields')
    egrid_fac_fields = [c for c in egrid if c in (egrid_fields +
                                                  StewiFormat.FACILITY.fields())]

    facility = egrid[egrid_fac_fields].reset_index(drop=True)

    # Data starting in 2018 for resource mix is listed as percentage.
    # For consistency multiply by 100
    if int(year) >= 2018:
        facility.loc[:, facility.columns.str.contains('resource mix')] *= 100

    log.debug(len(facility))
    #2019: 11865
    #2018: 10964
    #2016: 9709
    #2014: 8503
    store_inventory(facility, 'eGRID_' + year, 'facility')

    # Write flows file
    flows = flowbyfac[['FlowName', 'Compartment', 'Unit']]
    flows = flows.drop_duplicates()
    flows = flows.sort_values(by='FlowName', axis=0)
    store_inventory(flows, 'eGRID_' + year, 'flow')

    validate_eGRID(year, flowbyfac)
Example #12
def Generate_RCRAInfo_files_csv(report_year):
    """Generate stewi inventory files from downloaded data files."""
    log.info(f'generating inventory files for {report_year}')
    filepath = DIR_RCRA_BY_YEAR.joinpath(
        f'br_reporting_{str(report_year)}.csv')
    # Get columns to keep
    fieldstokeep = pd.read_csv(
        RCRA_DATA_PATH.joinpath('RCRA_required_fields.txt'), header=None)
    # on_bad_lines requires pandas >= 1.3
    df = pd.read_csv(filepath,
                     header=0,
                     usecols=list(fieldstokeep[0]),
                     low_memory=False,
                     on_bad_lines='skip',
                     encoding='ISO-8859-1')

    log.info(f'completed reading {filepath}')
    # Check waste generation data quality: keep only numeric 'Generation Tons'
    df = df[pd.to_numeric(df['Generation Tons'], errors='coerce').notnull()]
    df['Generation Tons'] = df['Generation Tons'].astype(float)
    log.debug(f'number of records: {len(df)}')
    # Reassign the NAICS to a string
    df['NAICS'] = df['Primary NAICS'].astype('str')
    df.drop(columns=['Primary NAICS'], inplace=True)
    # Create field for DQI Reliability Score with fixed value from CSV
    rcrainfo_reliability_table = get_reliability_table_for_source('RCRAInfo')
    df['DataReliability'] = float(
        rcrainfo_reliability_table['DQI Reliability Score'])
    # Convert amounts from tons to kg. Note this could be replaced with a
    # conversion utility
    df['Amount_kg'] = USton_kg * df['Generation Tons']
    # Read in waste descriptions
    linewidthsdf = pd.read_csv(
        RCRA_DATA_PATH.joinpath('RCRAInfo_LU_WasteCode_LineComponents.csv'))
    names = linewidthsdf['Data Element Name']
    try:
        wastecodesfile = [
            file for file in OUTPUT_PATH.glob('*lu_waste_code*.csv')
        ][0]
    except IndexError:
        log.exception('waste codes file missing, download and unzip waste code'
                      f' file to {OUTPUT_PATH}')
        raise
    waste_codes = pd.read_csv(wastecodesfile, header=0, names=names)
    # Remove rows where any of these fields are NA (description is missing)
    waste_codes = waste_codes[[
        'Waste Code', 'Code Type', 'Waste Code Description'
    ]].dropna()
    waste_codes['Waste Code Description'] = waste_codes[
        'Waste Code Description'].apply(waste_description_cleaner)
    waste_codes = waste_codes.drop_duplicates(ignore_index=True)
    waste_codes = waste_codes[~(
        (waste_codes['Waste Code'].duplicated(False)) &
        ((waste_codes['Waste Code Description'].isna()) |
         (waste_codes['Waste Code Description'] == 'Unknown')))]
    waste_codes.rename(columns={
        'Waste Code': 'Waste Code Group',
        'Code Type': 'Waste Code Type'
    },
                       inplace=True)
    df = df.merge(waste_codes, on='Waste Code Group', how='left')

    # Replace form code with the code name
    form_code_name_file = RCRA_DATA_PATH.joinpath('RCRA_LU_FORM_CODE.csv')
    form_code_name_df = pd.read_csv(form_code_name_file,
                                    header=0,
                                    usecols=['FORM_CODE', 'FORM_CODE_NAME'])
    form_code_name_df.rename(columns={'FORM_CODE': 'Form Code'}, inplace=True)
    df = df.merge(form_code_name_df, on='Form Code', how='left')

    df['FlowName'] = df['Waste Code Description']

    # If there is no useful waste code, fill FlowName with the form code name.
    # Flag rows where FlowName is NA so their FlowNameSource is 'Form Code'.
    df.loc[df['FlowName'].isnull(), 'FlowNameSource'] = 'Form Code'
    df.loc[df['FlowNameSource'].isnull(), 'FlowNameSource'] = 'Waste Code'
    # Set FlowIDs to the appropriate code
    df.loc[df['FlowName'].isnull(), 'FlowID'] = df['Form Code']
    df.loc[df['FlowID'].isnull(), 'FlowID'] = df['Waste Code Group']
    df['FlowName'].fillna(df['FORM_CODE_NAME'], inplace=True)
    df = df.dropna(subset=['FlowID']).reset_index(drop=True)
    drop_fields = [
        'Generation Tons', 'Management Method', 'Waste Description',
        'Waste Code Description', 'FORM_CODE_NAME'
    ]
    df.drop(columns=drop_fields, inplace=True)
    # Rename cols used by multiple tables
    df.rename(columns={
        'Handler ID': 'FacilityID',
        'Amount_kg': 'FlowAmount'
    },
              inplace=True)

    # Prepare flows file
    flows = df[['FlowName', 'FlowID', 'FlowNameSource']]
    flows = flows.drop_duplicates(ignore_index=True)
    # Sort them by the flow names
    flows.sort_values(by='FlowName', axis=0, inplace=True)
    store_inventory(flows, 'RCRAInfo_' + report_year, 'flow')

    # Prepare facilities file
    facilities = df[[
        'FacilityID', 'Handler Name', 'Location Street Number',
        'Location Street 1', 'Location Street 2', 'Location City',
        'Location State', 'Location Zip', 'County Name', 'NAICS',
        'Generator ID Included in NBR'
    ]].reset_index(drop=True)
    facilities.drop_duplicates(inplace=True, ignore_index=True)
    facilities['Address'] = facilities[[
        'Location Street Number', 'Location Street 1', 'Location Street 2'
    ]].apply(lambda x: ' '.join(x.dropna()), axis=1)
    facilities.drop(columns=[
        'Location Street Number', 'Location Street 1', 'Location Street 2'
    ],
                    inplace=True)
    facilities.rename(columns={
        'Primary NAICS': 'NAICS',
        'Handler Name': 'FacilityName',
        'Location City': 'City',
        'Location State': 'State',
        'Location Zip': 'Zip',
        'County Name': 'County'
    },
                      inplace=True)
    store_inventory(facilities, 'RCRAInfo_' + report_year, 'facility')
    # Prepare flow by facility
    flowbyfacility = aggregate(df, [
        'FacilityID', 'FlowName', 'Source Code',
        'Generator Waste Stream Included in NBR'
    ])
    store_inventory(flowbyfacility, 'RCRAInfo_' + report_year,
                    'flowbyfacility')

    validate_state_totals(report_year, flowbyfacility)

    # Record metadata
    generate_metadata(report_year, filepath, datatype='inventory')
Example #13
def main(**kwargs):

    parser = argparse.ArgumentParser(argument_default=argparse.SUPPRESS)

    parser.add_argument('Option',
                        help='What do you want to do:\
                        [A] Download NEI data and \
                            generate StEWI inventory outputs and validate \
                            to national totals\
                        [B] Download national totals',
                        type=str)

    parser.add_argument('-Y',
                        '--Year',
                        nargs='+',
                        help='What NEI year(s) you want to retrieve',
                        type=str)

    if len(kwargs) == 0:
        kwargs = vars(parser.parse_args())

    for year in kwargs['Year']:
        if kwargs['Option'] == 'A':

            nei_point = standardize_output(year)

            log.info('generating flow by facility output')
            nei_flowbyfacility = aggregate(nei_point,
                                           ['FacilityID', 'FlowName'])
            store_inventory(nei_flowbyfacility, 'NEI_' + year,
                            'flowbyfacility')
            log.debug(len(nei_flowbyfacility))
            #2017: 2184786
            #2016: 1965918
            #2014: 2057249
            #2011: 1840866

            log.info('generating flow by SCC output')
            nei_flowbyprocess = aggregate(
                nei_point, ['FacilityID', 'FlowName', 'Process'])
            nei_flowbyprocess['ProcessType'] = 'SCC'
            store_inventory(nei_flowbyprocess, 'NEI_' + year, 'flowbyprocess')
            log.debug(len(nei_flowbyprocess))
            #2017: 4055707

            log.info('generating flows output')
            nei_flows = nei_point[['FlowName', 'FlowID', 'Compartment']]
            nei_flows = nei_flows.drop_duplicates()
            nei_flows['Unit'] = 'kg'
            nei_flows = nei_flows.sort_values(by='FlowName', axis=0)
            store_inventory(nei_flows, 'NEI_' + year, 'flow')
            log.debug(len(nei_flows))
            #2017: 293
            #2016: 282
            #2014: 279
            #2011: 277

            log.info('generating facility output')
            facility = nei_point[[
                'FacilityID', 'FacilityName', 'Address', 'City', 'State',
                'Zip', 'Latitude', 'Longitude', 'NAICS', 'County'
            ]]
            facility = facility.drop_duplicates('FacilityID')
            facility = facility.astype({'Zip': 'str'})
            store_inventory(facility, 'NEI_' + year, 'facility')
            log.debug(len(facility))
            #2017: 87162
            #2016: 85802
            #2014: 85125
            #2011: 95565

            generate_metadata(year, datatype='inventory')

            if year in ['2011', '2014', '2017']:
                validate_national_totals(nei_flowbyfacility, year)
            else:
                log.info('no validation performed')

        elif kwargs['Option'] == 'B':
            if year in ['2011', '2014', '2017']:
                generate_national_totals(year)
            else:
                log.info(f'national totals do not exist for year {year}')