def combine_DMR_inventory(year, nutrient=''):
    """Loop through pickled data and combine into a dataframe."""
    path = OUTPUT_PATH.joinpath(year)
    if not path.is_dir():
        raise stewi.exceptions.DataNotFoundError
    output_df = pd.DataFrame()
    filestub = ''
    if nutrient:
        filestub = nutrient + '_'
        log.info(f'reading stored DMR queries by state for {nutrient}...')
    else:
        log.info('reading stored DMR queries by state...')
    for state in STATES:
        log.debug(f'accessing data for {state}')
        filepath = path.joinpath(f'{filestub}state_{state}.pickle')
        result = unpickle(filepath)
        if result is None:
            log.warning(f'No data found for {state}. Retrying query...')
            if (query_dmr(year=year, sic_list=None,
                          state_list=[state],
                          nutrient=nutrient).get(state) == 'success'):
                result = unpickle(filepath)
        if result is not None:
            output_df = pd.concat([output_df, result], ignore_index=True)
    return output_df
def check_for_file(filepath: Path, state) -> bool:
    """Return True if filepath already exists, otherwise log and return False."""
    if filepath.is_file():
        log.debug(f'file already exists for {state}, skipping')
        return True
    else:
        log.info(f'executing query for {state}')
        return False
def download_chunks(table, table_count, m, row_start=0, report_year='',
                    filepath=''):
    """Download data from envirofacts in chunks."""
    # Generate URL for each 5,000 row grouping and add to DataFrame
    output_table = pd.DataFrame()
    while row_start <= table_count:
        row_end = row_start + 4999
        table_url = generate_url(table=table, report_year=report_year,
                                 row_start=row_start, row_end=row_end,
                                 output_ext='csv')
        log.debug(f'url: {table_url}')
        table_temp, temp_time = import_table(table_url, get_time=True)
        output_table = pd.concat([output_table, table_temp])
        row_start += 5000
    m.add(time=temp_time,
          url=generate_url(table, report_year=report_year,
                           row_start='', output_ext='csv'),
          filetype='Database', filename=filepath)
    if filepath:
        output_table.to_csv(filepath, index=False)
    return output_table
def map_to_fedefl(df):
    """Map flow names to the federal elementary flow list using the TRI mapping."""
    try:
        import fedelemflowlist
    except ImportError:
        log.warning('requires installation of fedelemflowlist, data will not '
                    'validate correctly')
        return None
    tri = fedelemflowlist.get_flowmapping('TRI')
    tri = tri[['SourceFlowName', 'TargetFlowName']].drop_duplicates()
    mapped_df = df.merge(tri, how='left', left_on='FlowName',
                         right_on='SourceFlowName')
    if mapped_df['FlowAmount'].sum() != df['FlowAmount'].sum():
        log.warning('Error on mapping, data loss')
    # validation throws errors when mixture and trade secret chemicals are
    # maintained so drop them while they remain unmapped
    criteria = (mapped_df['TargetFlowName'].isna() &
                (mapped_df['FlowName'].str.lower().str.contains('trade secret') |
                 mapped_df['FlowName'].str.lower().str.contains('mixture')))
    mapped_df = mapped_df[~criteria].reset_index(drop=True)
    missing_flows = mapped_df[mapped_df['TargetFlowName'].isna()]['FlowName']
    missing_flows = missing_flows.drop_duplicates().sort_values()
    if len(missing_flows) > 0:
        log.debug('flows from reference df missing in mapping file')
    mapped_df.loc[~mapped_df['TargetFlowName'].isna(),
                  'FlowName'] = mapped_df['TargetFlowName']
    mapped_df = mapped_df.drop(columns=['SourceFlowName', 'TargetFlowName'])
    return mapped_df
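# Illustrative usage sketch for map_to_fedefl (not executed as part of this
# module). It assumes fedelemflowlist is installed and that the toy flow name
# has a TRI mapping:
#   import pandas as pd
#   df = pd.DataFrame({'FlowName': ['Ammonia'], 'FlowAmount': [1.0]})
#   mapped = map_to_fedefl(df)
#   # where a match exists, 'FlowName' now holds the federal flow list name;
#   # unmapped mixture/trade secret flows are dropped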
def addChemicalMatches(inventories_df):
    """Add data for chemical matches to inventory or combined inventory df."""
    inventory_list = list(inventories_df['Source'].unique())
    chemicalmatches = chemicalmatcher.get_matches_for_StEWI(
        inventory_list=inventory_list)
    chemicalmatches = chemicalmatches[
        chemicalmatches['Source'].isin(inventory_list)]
    chemicalmatches = chemicalmatches.drop(columns=['FlowID'])
    chemicalmatches = chemicalmatches.drop_duplicates(subset=['FlowName',
                                                              'Source'])
    inventories = pd.merge(inventories_df, chemicalmatches,
                           on=['FlowName', 'Source'], how='left')
    # Compare unmatched flows to flows_missing_SRS_ list to ensure none missing
    missing_flows = inventories.loc[
        inventories['SRS_ID'].isna()][['FlowName', 'Source']].drop_duplicates()
    cm_missing = chemicalmatcher.read_cm_file('missing')
    missing_flows = missing_flows.assign(
        missing=~missing_flows['FlowName'].isin(cm_missing['FlowName']))
    if sum(missing_flows.missing) > 0:
        log.warning('New unknown flows identified, run chemicalmatcher')
        log.debug(missing_flows[missing_flows['missing']].to_string())
    return inventories
def standardize_output(year, source='Point'):
    """Read and parse NEI data.

    :param year: str, Year of NEI dataset
    :returns nei: DataFrame of parsed NEI data.
    """
    nei = pd.DataFrame()
    # read in nei files and concatenate all nei files into one dataframe
    nei_file_path = _config[year]['file_name']
    for file in nei_file_path:
        filename = OUTPUT_PATH.joinpath(file)
        if not filename.is_file():
            log.info(f'{file} not found in {OUTPUT_PATH}, '
                     'downloading source data')
            # download source file and metadata
            file_meta = set_stewi_meta(strip_file_extension(file))
            file_meta.category = EXT_DIR
            file_meta.tool = file_meta.tool.lower()
            download_from_remote(file_meta, paths)
        # concatenate all other files
        log.info(f'reading NEI data from {filename}')
        nei = pd.concat([nei, read_data(year, filename)])
        log.debug(f'{str(len(nei))} records')
    # convert TON to KG
    nei['FlowAmount'] = nei['FlowAmount'] * USton_kg

    log.info('adding Data Quality information')
    if source == 'Point':
        nei_reliability_table = get_reliability_table_for_source('NEI')
        nei_reliability_table['Code'] = nei_reliability_table['Code'].astype(float)
        nei['ReliabilityScore'] = nei['ReliabilityScore'].astype(float)
        nei = nei.merge(nei_reliability_table, left_on='ReliabilityScore',
                        right_on='Code', how='left')
        nei['DataReliability'] = nei['DQI Reliability Score']
        # drop Code and DQI Reliability Score columns
        nei = nei.drop(
            columns=['Code', 'DQI Reliability Score', 'ReliabilityScore'])

        nei['Compartment'] = 'air'
        """
        # Modify compartment based on stack height (ft)
        nei.loc[nei['StackHeight'] < 32, 'Compartment'] = 'air/ground'
        nei.loc[(nei['StackHeight'] >= 32) & (nei['StackHeight'] < 164),
                'Compartment'] = 'air/low'
        nei.loc[(nei['StackHeight'] >= 164) & (nei['StackHeight'] < 492),
                'Compartment'] = 'air/high'
        nei.loc[nei['StackHeight'] >= 492, 'Compartment'] = 'air/very high'
        """
    else:
        nei['DataReliability'] = 3
    # add Source column
    nei['Source'] = source
    nei.reset_index(drop=True, inplace=True)
    return nei
def getInventoriesforFacilityMatches(inventory_dict, facilitymatches,
                                     filter_for_LCI, base_inventory=None):
    """Retrieve stored flowbyfacility datasets based on passed dictionary.

    Filters them if necessary. Returns only those facilities with an FRS_ID,
    except for those in the base_inventory, where all are returned.
    :param inventory_dict:
    :param facilitymatches: dataframe matching FacilityMatches format
    :param filter_for_LCI:
    :param base_inventory:
    """
    if base_inventory is not None:
        # Identify the FRS in the base inventory and keep only those
        # base_inventory_FRS = facilitymatches[
        #     facilitymatches['Source'] == base_inventory]
        base_FRS_list = list(pd.unique(facilitymatches[
            facilitymatches['Source'] == base_inventory]['FRS_ID']))

    columns_to_keep = StewiFormat.FLOWBYFACILITY.fields() + ['Source', 'Year',
                                                             'FRS_ID']
    inventories = pd.DataFrame()
    filters = None
    if filter_for_LCI:
        filters = ['filter_for_LCI']
    for k in inventory_dict.keys():
        inventory = stewi.getInventory(k, inventory_dict[k],
                                       'flowbyfacility', filters)
        if inventory is None:
            continue
        inventory["Source"] = k
        # Merge in FRS_ID, ensure only single FRS added per facility ID,
        # keeping first listed
        facmatches = facilitymatches[facilitymatches['Source'] == k]
        facmatches = facmatches.drop_duplicates(subset=['FacilityID', 'Source'],
                                                keep='first')
        inventory = pd.merge(inventory, facmatches,
                             on=['FacilityID', 'Source'], how='left')
        if inventory['FRS_ID'].isna().sum() > 0:
            log.debug('Some facilities missing FRS_ID')
        # If this isn't the base inventory, filter records for facilities not
        # found in the base inventory
        if k != base_inventory and base_inventory is not None:
            inventory = inventory[inventory['FRS_ID'].isin(base_FRS_list)]
        # Add metadata
        inventory["Year"] = inventory_dict[k]
        cols_to_keep = [c for c in columns_to_keep if c in inventory]
        inventory = inventory[cols_to_keep]
        inventories = pd.concat([inventories, inventory], ignore_index=True)
    return inventories
def download_data(url_params, filepath: Path, sic_list) -> str:
    """Download DMR data defined by url_params, pickle it to filepath, and
    return a status string."""
    df = pd.DataFrame()
    if sic_list:
        skip_errors = True
    else:
        skip_errors = False
        sic_list = ['']
    for sic in sic_list:
        url_params['p_sic2'] = sic
        counter = 1
        pages = 1
        while counter <= pages:
            url_params['pageno'] = counter
            url = generate_url(url_params)
            log.debug(url)
            for attempt in range(3):
                try:
                    r = requests.get(url)
                    r.raise_for_status()
                    result = pd.DataFrame(r.json())
                    break
                except requests.exceptions.HTTPError as err:
                    log.info(err)
                    time.sleep(20)
            else:
                log.warning("exceeded max attempts")
                return 'other_error'
            if 'Error' in result.index:
                if skip_errors:
                    log.debug(f"error in sic_{sic}")
                    break
                elif result['Results'].astype(str).str.contains('Maximum').any():
                    return 'max_error'
                else:
                    return 'other_error'
            elif 'NoDataMsg' in result.index:
                if skip_errors:
                    log.debug(f"no data in sic_{sic}")
                    break
                else:
                    return 'no_data'
            else:
                df = pd.concat([df, pd.DataFrame(result['Results']['Results'])],
                               ignore_index=True)
                # set page count
                pages = int(result['Results']['PageCount'])
                counter += 1
    log.debug(f"saving to {filepath}")
    pd.to_pickle(df, filepath)
    return 'success'
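# The request retry above uses Python's for/else idiom: the else branch runs
# only if the loop completes without a break (i.e., all attempts failed).
# Minimal sketch of the idiom (illustrative only; fetch() is a hypothetical
# helper, not part of this module):
#   for attempt in range(3):
#       try:
#           data = fetch()
#           break
#       except OSError:
#           time.sleep(20)
#   else:
#       raise RuntimeError('exceeded max attempts')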
def get_SRSInfo_for_program_list(inventory):
    """Query SRS for substances on the program lists associated with an inventory."""
    # See all lists
    # https://cdxnodengn.epa.gov/cdx-srs-rest/reference/substance_lists
    # Base URL for queries
    substancesbylistname = 'substances/list_acronym/'
    srs_flow_df = pd.DataFrame()
    for listname in inventory_to_SRSlist_acronymns[inventory]:
        log.debug('Getting %s', listname)
        lists_of_interest = obtain_list_names(listname)
        url = base + substancesbylistname + urllib.parse.quote(listname)
        flow_info = query_SRS_for_program_list(url, inventory,
                                               lists_of_interest)
        if len(flow_info) == 0:
            log.info(f'No flows found for {listname}')
        srs_flow_df = pd.concat([srs_flow_df, flow_info])
    srs_flow_df.drop_duplicates(inplace=True)
    if inventory == 'TRI':
        srs_flow_df['PGM_ID'] = srs_flow_df['PGM_ID'].apply(
            lambda x: str(x).lstrip('0'))
    srs_flow_df.sort_values(by='PGM_ID', inplace=True)
    return srs_flow_df
def main(**kwargs):
    parser = argparse.ArgumentParser(argument_default=argparse.SUPPRESS)
    parser.add_argument('Option',
                        help='What do you want to do:\
                        [A] Download DMR files from web\
                        [B] Generate StEWI inventory outputs and\
                        validate to state totals\
                        [C] Download state totals',
                        type=str)
    parser.add_argument('-Y', '--Year', nargs='+',
                        help='What DMR year(s) you want to retrieve',
                        type=str)
    if len(kwargs) == 0:
        kwargs = vars(parser.parse_args())

    for year in kwargs['Year']:
        if kwargs['Option'] == 'A':
            log.info(f"Querying for {year}")
            # two-digit SIC codes from advanced search drop down,
            # stripped and formatted as a list
            sic2 = list(pd.read_csv(DMR_DATA_PATH.joinpath('2_digit_SIC.csv'),
                                    dtype={'SIC2': str})['SIC2'])
            # Query by state, then by SIC-state where necessary
            result_dict = query_dmr(year=year)
            log.debug('possible errors: ' + ', '.join(
                [s for s in result_dict.keys() if result_dict[s] != 'success']))
            state_max_error_list = [s for s in result_dict.keys()
                                    if result_dict[s] == 'max_error']
            state_no_data_list = [s for s in result_dict.keys()
                                  if result_dict[s] == 'no_data']
            if (len(state_max_error_list) == 0) and (len(state_no_data_list) == 0):
                log.info('all states successfully downloaded')
            else:
                if len(state_max_error_list) > 0:
                    log.error(f"Max error: {' '.join(state_max_error_list)}")
                if len(state_no_data_list) > 0:
                    log.error(f"No data error: {' '.join(state_no_data_list)}")
                log.info('Breaking up queries further by SIC')
                result_dict = query_dmr(year=year, sic_list=sic2,
                                        state_list=state_max_error_list)
                sic_state_max_error_list = [s for s in result_dict.keys()
                                            if result_dict[s] == 'max_error']
                if len(sic_state_max_error_list) > 0:
                    log.error(f"Max error: {' '.join(sic_state_max_error_list)}")

            log.info(f"Querying nutrients for {year}")
            # Query aggregated nutrients data
            for nutrient in ['N', 'P']:
                result_dict = query_dmr(year=year, nutrient=nutrient)
                log.debug('possible errors: ' + ', '.join(
                    [s for s in result_dict.keys()
                     if result_dict[s] != 'success']))
                state_max_error_list = [s for s in result_dict.keys()
                                        if result_dict[s] == 'max_error']
                state_no_data_list = [s for s in result_dict.keys()
                                      if result_dict[s] == 'no_data']
                if (len(state_max_error_list) == 0) and (len(state_no_data_list) == 0):
                    log.info(f'all states successfully downloaded for {nutrient}')
                else:
                    result_dict = query_dmr(year=year, sic_list=sic2,
                                            state_list=state_max_error_list,
                                            nutrient=nutrient)
            # write metadata
            generate_metadata(year, datatype='source')

        if kwargs['Option'] == 'B':
            log.info(f'generating inventories for DMR {year}')
            state_df = combine_DMR_inventory(year)
            state_df = filter_states(standardize_df(state_df))

            # Validation against state totals is done prior to combining
            # with aggregated nutrients
            validate_state_totals(state_df, year)

            P_df = combine_DMR_inventory(year, nutrient='P')
            N_df = combine_DMR_inventory(year, nutrient='N')

            nut_drop_list = read_pollutant_parameter_list()
            nut_drop_list = nut_drop_list[(nut_drop_list['NITROGEN'] == 'Y') |
                                          (nut_drop_list['PHOSPHORUS'] == 'Y')]
            nut_drop_list = list(set(nut_drop_list['FlowName']))

            # Consolidate N and P based flows to reflect nutrient aggregation
            P_df = consolidate_nutrients(P_df, nut_drop_list, 'P')
            N_df = consolidate_nutrients(N_df, nut_drop_list, 'N')

            nutrient_agg_df = pd.concat([P_df, N_df])
            nutrient_agg_df = filter_states(standardize_df(nutrient_agg_df))

            # Filter out nitrogen and phosphorus flows before combining
            # with aggregated nutrients
            dmr_nut_filtered = state_df[~state_df['FlowName'].isin(nut_drop_list)]
            dmr_df = pd.concat([dmr_nut_filtered,
                                nutrient_agg_df]).reset_index(drop=True)

            # PermitTypeCode needed for state validation but not maintained
            dmr_df = dmr_df.drop(columns=['PermitTypeCode'])

            # generate output for facility
            facility_columns = ['FacilityID', 'FacilityName', 'City', 'State',
                                'Zip', 'Latitude', 'Longitude', 'County',
                                'NAICS', 'SIC']  # 'Address' not in DMR
            dmr_facility = dmr_df[facility_columns].drop_duplicates()
            store_inventory(dmr_facility, 'DMR_' + year, 'facility')

            # generate output for flow
            flow_columns = ['FlowID', 'FlowName']
            dmr_flow = dmr_df[flow_columns].drop_duplicates()
            dmr_flow.sort_values(by=['FlowName'], inplace=True)
            dmr_flow['Compartment'] = 'water'
            dmr_flow['Unit'] = 'kg'
            store_inventory(dmr_flow, 'DMR_' + year, 'flow')

            # generate output for flowbyfacility
            fbf_columns = ['FlowName', 'FlowAmount', 'FacilityID',
                           'DataReliability']
            dmr_fbf = dmr_df[fbf_columns].reset_index(drop=True)
            dmr_fbf = aggregate(dmr_fbf, ['FacilityID', 'FlowName'])
            dmr_fbf['Compartment'] = 'water'
            dmr_fbf['Unit'] = 'kg'
            store_inventory(dmr_fbf, 'DMR_' + year, 'flowbyfacility')

            # write metadata
            generate_metadata(year, datatype='inventory')

        if kwargs['Option'] == 'C':
            download_state_totals_validation(year)
def generate_eGRID_files(year):
    """Parse a local eGRID file to generate StEWI output files.

    :param year: str, Year of eGRID dataset
    """
    log.info(f'generating eGRID files for {year}')
    log.info('importing plant level emissions data')
    egrid = parse_eGRID(year, 'PLNT', 'eGRID_required_fields.csv')

    flowbyfac_fields = filter_fields('eGRID_required_fields.csv',
                                     'flowbyfac_fields')

    flowbyfac_prelim = egrid[flowbyfac_fields]
    conversion = []
    conversion.append(flowbyfac_prelim[['FacilityID', 'Plant primary fuel']])
    conversion.append(egrid_unit_convert(
        flowbyfac_prelim[['Nitrogen oxides', 'Sulfur dioxide',
                          'Carbon dioxide']], USton_kg))
    conversion.append(egrid_unit_convert(
        flowbyfac_prelim[['Methane', 'Nitrous oxide']], lb_kg))
    conversion.append(egrid_unit_convert(
        flowbyfac_prelim[['Heat', 'Steam']], MMBtu_MJ))
    conversion.append(egrid_unit_convert(
        flowbyfac_prelim[['Electricity']], MWh_MJ))
    flowbyfac_stacked = pd.concat(conversion, axis=1)
    # Create flowbyfac
    flowbyfac = pd.melt(flowbyfac_stacked,
                        id_vars=['FacilityID', 'Plant primary fuel'],
                        value_vars=list(flowbyfac_stacked.columns[2:]),
                        var_name='FlowName', value_name='FlowAmount')

    flowbyfac = flowbyfac.dropna(subset=['FlowAmount'])
    flowbyfac['FlowAmount'] = pd.to_numeric(flowbyfac['FlowAmount'])
    flowbyfac = flowbyfac.sort_values(by=['FacilityID'], axis=0,
                                      ascending=True, inplace=False,
                                      kind='quicksort', na_position='last')

    # Read in unit sheet to get comment fields related to source of heat, NOx,
    # SO2, and CO2 emission estimates for calculating data quality information
    log.info('importing unit level data to assess data quality')
    unit_egrid = parse_eGRID(year, 'UNT',
                             'eGRID_unit_level_required_fields.csv')

    rel_score_cols = filter_fields('eGRID_unit_level_required_fields.csv',
                                   'reliability_flows')

    flows_used_for_weighting = filter_fields(
        'eGRID_unit_level_required_fields.csv', 'weighting_flows')

    unit_emissions_with_rel_scores = ['Heat', 'Nitrogen oxides',
                                      'Sulfur dioxide', 'Carbon dioxide']

    unit_egrid.update(unit_egrid[rel_score_cols].fillna(''))
    unit_egrid.update(unit_egrid[flows_used_for_weighting].fillna(0))
    # Generate combined columns as lists before exploding lists into multiple rows
    unit_egrid['FlowName'] = unit_egrid.apply(
        lambda _: unit_emissions_with_rel_scores, axis=1)
    unit_egrid['ReliabilitySource'] = unit_egrid[rel_score_cols].values.tolist()
    unit_egrid['FlowAmount'] = unit_egrid[flows_used_for_weighting].values.tolist()
    unit_egrid = unit_egrid.drop(columns=rel_score_cols +
                                 flows_used_for_weighting)
    unit_egrid = unit_egrid.set_index(
        list(unit_egrid.columns.difference(['FlowName', 'ReliabilitySource',
                                            'FlowAmount']))
        ).apply(pd.Series.explode).reset_index()

    dq_mapping = pd.read_csv(
        eGRID_DATA_DIR.joinpath('eGRID_unit_level_reliability_scores.csv'))
    unit_egrid = unit_egrid.merge(dq_mapping, how='left')

    # Aggregate data reliability scores by facility and flow
    rel_scores_by_facility = aggregate(unit_egrid,
                                       grouping_vars=['FacilityID', 'FlowName'])
    rel_scores_by_facility = rel_scores_by_facility.drop(columns=['FlowAmount'])

    # Merge in heat_SO2_CO2_NOx reliability scores calculated from unit sheet
    flowbyfac = flowbyfac.merge(rel_scores_by_facility,
                                on=['FacilityID', 'FlowName'], how='left')
    # Assign electricity a reliability score of 1
    flowbyfac.loc[flowbyfac['FlowName'] == 'Electricity', 'DataReliability'] = 1
    flowbyfac['DataReliability'] = flowbyfac['DataReliability'].fillna(5)

    # Methane and nitrous oxide reliability scores:
    # assign 3 to all facilities except for certain fuel types where
    # measurements are taken
    flowbyfac.loc[(flowbyfac['FlowName'] == 'Methane') |
                  (flowbyfac['FlowName'] == 'Nitrous oxide'),
                  'DataReliability'] = 3
    # For all but the selected fuel types, change it to 2
    flowbyfac.loc[((flowbyfac['FlowName'] == 'Methane') |
                   (flowbyfac['FlowName'] == 'Nitrous oxide')) &
                  (~flowbyfac['Plant primary fuel'].isin(['PG', 'RC',
                                                          'WC', 'SLW'])),
                  'DataReliability'] = 2

    # Import flow compartments
    flow_compartments = pd.read_csv(
        eGRID_DATA_DIR.joinpath('eGRID_flow_compartments.csv'), header=0)
    flowbyfac = pd.merge(flowbyfac, flow_compartments, on='FlowName',
                         how='left')

    # Drop unneeded columns
    flowbyfac = flowbyfac.drop(columns=['Plant primary fuel', 'OriginalName'])

    # Write flowbyfacility file to output
    store_inventory(flowbyfac, 'eGRID_' + year, 'flowbyfacility')

    # Creation of the facility file
    # Need to change column names manually
    egrid_fields = filter_fields('eGRID_required_fields.csv', 'facility_fields')
    egrid_fac_fields = [c for c in egrid if c in
                        (egrid_fields + StewiFormat.FACILITY.fields())]

    facility = egrid[egrid_fac_fields].reset_index(drop=True)

    # Resource mix data starting in 2018 is provided as a fraction rather than
    # a percentage. For consistency multiply by 100
    if int(year) >= 2018:
        facility.loc[:, facility.columns.str.contains('resource mix')] *= 100

    log.debug(len(facility))
    # 2019: 11865
    # 2018: 10964
    # 2016: 9709
    # 2014: 8503
    store_inventory(facility, 'eGRID_' + year, 'facility')

    # Write flows file
    flows = flowbyfac[['FlowName', 'Compartment', 'Unit']]
    flows = flows.drop_duplicates()
    flows = flows.sort_values(by='FlowName', axis=0)
    store_inventory(flows, 'eGRID_' + year, 'flow')

    validate_eGRID(year, flowbyfac)
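# The unit-level reshaping above builds list-valued columns and then uses
# pandas' Series.explode (applied column-wise) to expand them into one row per
# flow. Minimal sketch of that pattern (illustrative only; toy column names,
# not part of the eGRID processing):
#   import pandas as pd
#   df = pd.DataFrame({'unit': ['A', 'B']})
#   df['FlowName'] = [['NOx', 'SO2'], ['NOx', 'SO2']]
#   df['FlowAmount'] = [[1, 2], [3, 4]]
#   long = df.set_index('unit').apply(pd.Series.explode).reset_index()
#   # -> four rows: (A, NOx, 1), (A, SO2, 2), (B, NOx, 3), (B, SO2, 4)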
def Generate_RCRAInfo_files_csv(report_year):
    """Generate stewi inventory files from downloaded data files."""
    log.info(f'generating inventory files for {report_year}')
    filepath = DIR_RCRA_BY_YEAR.joinpath(
        f'br_reporting_{str(report_year)}.csv')
    # Get columns to keep
    fieldstokeep = pd.read_csv(
        RCRA_DATA_PATH.joinpath('RCRA_required_fields.txt'), header=None)
    # on_bad_lines requires pandas >= 1.3
    df = pd.read_csv(filepath, header=0, usecols=list(fieldstokeep[0]),
                     low_memory=False, on_bad_lines='skip',
                     encoding='ISO-8859-1')
    log.info(f'completed reading {filepath}')
    # Checking the Waste Generation Data Health
    df = df[pd.to_numeric(df['Generation Tons'], errors='coerce').notnull()]
    df['Generation Tons'] = df['Generation Tons'].astype(float)
    log.debug(f'number of records: {len(df)}')
    # Reassign the NAICS to a string
    df['NAICS'] = df['Primary NAICS'].astype('str')
    df.drop(columns=['Primary NAICS'], inplace=True)
    # Create field for DQI Reliability Score with fixed value from CSV
    rcrainfo_reliability_table = get_reliability_table_for_source('RCRAInfo')
    df['DataReliability'] = float(
        rcrainfo_reliability_table['DQI Reliability Score'])
    # Create a new field to put converted amount in
    df['Amount_kg'] = 0.0
    # Convert amounts from tons. Note this could be replaced with a
    # conversion utility
    df['Amount_kg'] = USton_kg * df['Generation Tons']
    # Read in waste descriptions
    linewidthsdf = pd.read_csv(
        RCRA_DATA_PATH.joinpath('RCRAInfo_LU_WasteCode_LineComponents.csv'))
    names = linewidthsdf['Data Element Name']
    try:
        wastecodesfile = [
            file for file in OUTPUT_PATH.glob('*lu_waste_code*.csv')
        ][0]
    except IndexError:
        log.exception('waste codes file missing, download and unzip waste code'
                      f' file to {OUTPUT_PATH}')
        raise
    waste_codes = pd.read_csv(wastecodesfile, header=0, names=names)
    # Remove rows where any of these fields are NA (description is missing)
    waste_codes = waste_codes[[
        'Waste Code', 'Code Type', 'Waste Code Description'
    ]].dropna()
    waste_codes['Waste Code Description'] = waste_codes[
        'Waste Code Description'].apply(waste_description_cleaner)
    waste_codes = waste_codes.drop_duplicates(ignore_index=True)
    waste_codes = waste_codes[~(
        (waste_codes['Waste Code'].duplicated(False)) &
        ((waste_codes['Waste Code Description'].isna()) |
         (waste_codes['Waste Code Description'] == 'Unknown')))]
    waste_codes.rename(columns={
        'Waste Code': 'Waste Code Group',
        'Code Type': 'Waste Code Type'
    }, inplace=True)
    df = df.merge(waste_codes, on='Waste Code Group', how='left')
    # Replace form code with the code name
    form_code_name_file = RCRA_DATA_PATH.joinpath('RCRA_LU_FORM_CODE.csv')
    form_code_name_df = pd.read_csv(form_code_name_file, header=0,
                                    usecols=['FORM_CODE', 'FORM_CODE_NAME'])
    form_code_name_df.rename(columns={'FORM_CODE': 'Form Code'}, inplace=True)
    df = df.merge(form_code_name_df, on='Form Code', how='left')
    df['FlowName'] = df['Waste Code Description']
    # If there is no useful waste code, fill it with the Form Code Name.
    # Find the NAs in FlowName and set their source to Form Code
    df.loc[df['FlowName'].isnull(), 'FlowNameSource'] = 'Form Code'
    df.loc[df['FlowNameSource'].isnull(), 'FlowNameSource'] = 'Waste Code'
    # Set FlowIDs to the appropriate code
    df.loc[df['FlowName'].isnull(), 'FlowID'] = df['Form Code']
    df.loc[df['FlowID'].isnull(), 'FlowID'] = df['Waste Code Group']
    df['FlowName'].fillna(df['FORM_CODE_NAME'], inplace=True)
    df = df.dropna(subset=['FlowID']).reset_index(drop=True)
    drop_fields = [
        'Generation Tons', 'Management Method', 'Waste Description',
        'Waste Code Description', 'FORM_CODE_NAME'
    ]
    df.drop(columns=drop_fields, inplace=True)
    # Rename cols used by multiple tables
    df.rename(columns={
        'Handler ID': 'FacilityID',
        'Amount_kg': 'FlowAmount'
    }, inplace=True)
    # Prepare flows file
    flows = df[['FlowName', 'FlowID', 'FlowNameSource']]
    flows = flows.drop_duplicates(ignore_index=True)
    # Sort them by the flow names
    flows.sort_values(by='FlowName', axis=0, inplace=True)
    store_inventory(flows, 'RCRAInfo_' + report_year, 'flow')
    # Prepare facilities file
    facilities = df[[
        'FacilityID', 'Handler Name', 'Location Street Number',
        'Location Street 1', 'Location Street 2', 'Location City',
        'Location State', 'Location Zip', 'County Name', 'NAICS',
        'Generator ID Included in NBR'
    ]].reset_index(drop=True)
    facilities.drop_duplicates(inplace=True, ignore_index=True)
    facilities['Address'] = facilities[[
        'Location Street Number', 'Location Street 1', 'Location Street 2'
    ]].apply(lambda x: ' '.join(x.dropna()), axis=1)
    facilities.drop(columns=[
        'Location Street Number', 'Location Street 1', 'Location Street 2'
    ], inplace=True)
    facilities.rename(columns={
        'Handler Name': 'FacilityName',
        'Location City': 'City',
        'Location State': 'State',
        'Location Zip': 'Zip',
        'County Name': 'County'
    }, inplace=True)
    store_inventory(facilities, 'RCRAInfo_' + report_year, 'facility')
    # Prepare flow by facility
    flowbyfacility = aggregate(df, [
        'FacilityID', 'FlowName', 'Source Code',
        'Generator Waste Stream Included in NBR'
    ])
    store_inventory(flowbyfacility, 'RCRAInfo_' + report_year,
                    'flowbyfacility')
    validate_state_totals(report_year, flowbyfacility)
    # Record metadata
    generate_metadata(report_year, filepath, datatype='inventory')
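# The Address field above is built by joining the street-number and street
# columns row by row, skipping missing values. Minimal sketch of that pattern
# (illustrative only; toy column names):
#   import pandas as pd
#   parts = pd.DataFrame({'no': ['12', None],
#                         'st1': ['Main St', 'Oak Ave'],
#                         'st2': [None, 'Suite 2']})
#   parts.apply(lambda x: ' '.join(x.dropna()), axis=1)
#   # -> '12 Main St', 'Oak Ave Suite 2'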
def main(**kwargs):
    parser = argparse.ArgumentParser(argument_default=argparse.SUPPRESS)
    parser.add_argument('Option',
                        help='What do you want to do:\
                        [A] Download NEI data and \
                        generate StEWI inventory outputs and validate \
                        to national totals\
                        [B] Download national totals',
                        type=str)
    parser.add_argument('-Y', '--Year', nargs='+',
                        help='What NEI year(s) you want to retrieve',
                        type=str)
    if len(kwargs) == 0:
        kwargs = vars(parser.parse_args())

    for year in kwargs['Year']:
        if kwargs['Option'] == 'A':
            nei_point = standardize_output(year)

            log.info('generating flow by facility output')
            nei_flowbyfacility = aggregate(nei_point, ['FacilityID',
                                                       'FlowName'])
            store_inventory(nei_flowbyfacility, 'NEI_' + year, 'flowbyfacility')
            log.debug(len(nei_flowbyfacility))
            # 2017: 2184786
            # 2016: 1965918
            # 2014: 2057249
            # 2011: 1840866

            log.info('generating flow by SCC output')
            nei_flowbyprocess = aggregate(nei_point, ['FacilityID',
                                                      'FlowName', 'Process'])
            nei_flowbyprocess['ProcessType'] = 'SCC'
            store_inventory(nei_flowbyprocess, 'NEI_' + year, 'flowbyprocess')
            log.debug(len(nei_flowbyprocess))
            # 2017: 4055707

            log.info('generating flows output')
            nei_flows = nei_point[['FlowName', 'FlowID', 'Compartment']]
            nei_flows = nei_flows.drop_duplicates()
            nei_flows['Unit'] = 'kg'
            nei_flows = nei_flows.sort_values(by='FlowName', axis=0)
            store_inventory(nei_flows, 'NEI_' + year, 'flow')
            log.debug(len(nei_flows))
            # 2017: 293
            # 2016: 282
            # 2014: 279
            # 2011: 277

            log.info('generating facility output')
            facility = nei_point[['FacilityID', 'FacilityName', 'Address',
                                  'City', 'State', 'Zip', 'Latitude',
                                  'Longitude', 'NAICS', 'County']]
            facility = facility.drop_duplicates('FacilityID')
            facility = facility.astype({'Zip': 'str'})
            store_inventory(facility, 'NEI_' + year, 'facility')
            log.debug(len(facility))
            # 2017: 87162
            # 2016: 85802
            # 2014: 85125
            # 2011: 95565

            generate_metadata(year, datatype='inventory')

            if year in ['2011', '2014', '2017']:
                validate_national_totals(nei_flowbyfacility, year)
            else:
                log.info('no validation performed')

        elif kwargs['Option'] == 'B':
            if year in ['2011', '2014', '2017']:
                generate_national_totals(year)
            else:
                log.info(f'national totals do not exist for year {year}')