def test_stewi_config():
    from stewi.globals import config, url_is_alive
    _config = config()['databases']
    url_list = []
    # RCRAInfo, TRI, DMR
    for inv in ['RCRAInfo', 'TRI', 'DMR']:
        url_list.append(_config[inv]['url'])
    # eGRID
    for k, v in _config['eGRID'].items():
        if isinstance(v, dict) and 'download_url' in v:
            url_list.append(v['download_url'])
    # GHGRP
    ghgrp = _config['GHGRP']
    url_list.extend([
        ghgrp['url'] + u for u in [
            ghgrp['lo_subparts_url'],
            ghgrp['esbb_subparts_url'],
            ghgrp['data_summaries_url']
        ]
    ])
    url_check = {}
    for url in url_list:
        if url not in url_check:
            url_check[url] = url_is_alive(url)
    error_list = [k for k, v in url_check.items() if not v]
    s = '\n'.join(error_list)
    assert all(url_check.values()), f"URL check failed for:\n{s}"
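# For reference, a minimal liveness check in the spirit of
# stewi.globals.url_is_alive used above; this sketch is an assumption,
# not the actual stewi implementation.
def _example_url_is_alive(url):
    import urllib.request
    try:
        # a HEAD request is enough to confirm the resource responds
        request = urllib.request.Request(url, method='HEAD')
        return urllib.request.urlopen(request).getcode() < 400
    except Exception:
        return False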
def test_generate_inventories(year):
    for inventory in config()['databases']:
        if SKIP_BROWSER_DOWNLOAD and inventory in requires_browser_download:
            continue
        try:
            generate_inventory(inventory, year)
        except InventoryNotAvailableError as err:
            print(err)
            continue
def test_all_inventory_generation():
    error_list = []
    for inventory in config()['databases']:
        if SKIP_BROWSER_DOWNLOAD and inventory in requires_browser_download:
            continue
        df = stewi.getInventory(inventory, year)
        error = df is None or len(df) == 0
        if error:
            error_list.append(inventory)
    assert len(error_list) == 0, \
        f"Generation of {','.join(error_list)} unsuccessful"
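# Illustrative usage only (not exercised by the tests): getInventory returns a
# flow-by-facility DataFrame for a single inventory and year; 'TRI' and 2019
# below are arbitrary example values.
def _example_get_inventory():
    df = stewi.getInventory('TRI', 2019)
    return df[['FacilityID', 'FlowName', 'FlowAmount']].head()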
import argparse
import urllib
import time
from pathlib import Path

import pandas as pd

from stewi.globals import unit_convert,\
    DATA_PATH, lb_kg, write_metadata, get_reliability_table_for_source,\
    log, compile_source_metadata, config, store_inventory, set_stewi_meta,\
    paths, read_source_metadata, aggregate
from stewi.validate import update_validationsets_sources, validate_inventory,\
    write_validation_result
from stewi.filter import filter_states, filter_config
import stewi.exceptions

_config = config()['databases']['DMR']
DMR_DATA_PATH = DATA_PATH / 'DMR'
EXT_DIR = 'DMR Data Files'
OUTPUT_PATH = Path(paths.local_path).joinpath(EXT_DIR)

states_df = pd.read_csv(DATA_PATH.joinpath('state_codes.csv'))
STATES = list(states_df['states']) + list(states_df['dc']) +\
    list(states_df['territories'])
STATES = tuple(x for x in STATES if str(x) != 'nan')

# Values used for StEWI query
PARAM_GROUP = True
DETECTION = 'HALF'
ESTIMATION = True
import io
import argparse
import re
from pathlib import Path

import requests
from bs4 import BeautifulSoup

from stewi.globals import unit_convert, DATA_PATH, set_stewi_meta,\
    get_reliability_table_for_source, write_metadata, url_is_alive,\
    lb_kg, g_kg, config, store_inventory, log, paths, compile_source_metadata,\
    read_source_metadata, aggregate
from stewi.validate import update_validationsets_sources, validate_inventory,\
    write_validation_result
import stewi.exceptions

EXT_DIR = 'TRI Data Files'
OUTPUT_PATH = Path(paths.local_path).joinpath(EXT_DIR)
_config = config()['databases']['TRI']
TRI_DATA_PATH = DATA_PATH / 'TRI'


def visit(url):
    """Return the parsed HTML of a page as a BeautifulSoup object."""
    html = requests.get(url).text
    soup = BeautifulSoup(html, 'html.parser')
    return soup


def link_zip(url, queries, year):
    """Return the zip-file link reported for the given TRI year."""
    soup = visit(url)
    TRI_zip_options = {}
    for link in soup.find_all(queries['TRI_year_reported']):
        TRI_zip_options[link.text] = link.get(queries['TRI_zip'])
    return TRI_zip_options[year]
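# Usage sketch: resolve the zip-file URL for one reporting year from the TRI
# page and query strings in config.yaml; the year string below is an example.
def _example_link_zip():
    return link_zip(_config['url'], _config['queries'], '2019')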
# filter.py (stewi)
# !/usr/bin/env python3
# coding=utf-8
"""
Functions to support filtering of processed inventories
"""

import pandas as pd

from stewi.globals import DATA_PATH, config, read_inventory, log
from stewi.formats import StewiFormat

filter_config = config(file='filter.yaml')


def apply_filters_to_inventory(inventory, inventory_acronym, year, filters,
                               download_if_missing=False):
    """Apply one or more filters from a passed list to an inventory dataframe.

    :param inventory: df of stewi inventory of type flowbyfacility or
        flowbyprocess
    :param inventory_acronym: str of inventory e.g. 'NEI'
    :param year: year as number like 2010
    :param filters: a list of named filters to apply to inventory
    :param download_if_missing: bool, if True will attempt to load from
        remote server prior to generating if file not found locally
    :return: DataFrame of filtered inventory
    """
    if 'filter_for_LCI' in filters:
        for name in filter_config['filter_for_LCI']['filters']:
            if name not in filters:
                filters.append(name)
    compare_to_available_filters(filters)
import zipfile
import io
from pathlib import Path

import pandas as pd

from esupy.remote import make_url_request
from stewi.globals import DATA_PATH, write_metadata,\
    unit_convert, log, MMBtu_MJ, MWh_MJ, config, USton_kg, lb_kg,\
    compile_source_metadata, remove_line_breaks, paths, store_inventory,\
    read_source_metadata, set_stewi_meta, aggregate
from stewi.validate import update_validationsets_sources, validate_inventory,\
    write_validation_result
from stewi.formats import StewiFormat
import stewi.exceptions

_config = config()['databases']['eGRID']

# set filepath
EXT_DIR = 'eGRID Data Files'
OUTPUT_PATH = Path(paths.local_path).joinpath(EXT_DIR)
eGRID_DATA_DIR = DATA_PATH / 'eGRID'


def imp_fields(filename, year):
    """Import list of fields from egrid that are desired for LCI.

    :param filename: str name of csv file
    :param year: str year of egrid inventory
    :return: a list of source fields and a dictionary to stewi fields
    """
    egrid_req_fields_df = pd.read_csv(eGRID_DATA_DIR.joinpath(filename),
def Generate_TRI_files_csv(TRIyear, Files):
    _config = config()['databases']['TRI']
    tri_url = _config['url']
    link_zip_TRI = link_zip(tri_url, _config['queries'], TRIyear)
    regex = re.compile(
        r'https://www3.epa.gov/tri/current/US_\d{4}_?(\d*)\.zip')
    tri_version = re.search(regex, link_zip_TRI).group(1)
    if not tri_version:
        tri_version = 'last'
    tri_required_fields = imp_fields(data_dir + 'TRI_required_fields.txt')
    keys = imp_fields(data_dir + 'TRI_keys.txt')  # the same function can be used
    import_facility = tri_required_fields[0:10]
    values = list()
    for p in range(len(keys)):
        start = 13 + 2 * p
        end = start + 1
        values.append(concat_req_field(tri_required_fields[start:end + 1]))
    # Create a dictionary that has the import fields for each release type
    # to use in the import process
    import_dict = dict_create(keys, values)
    # Build the TRI DataFrame
    tri = import_TRI_by_release_type(import_dict, TRIyear)
    # drop NA for Amount, but leave in zeros
    tri = tri.dropna(subset=['FlowAmount'])
    tri = strip_coln_white_space(tri, 'Basis of Estimate')
    # Convert to float if there are errors - be careful with this line
    if tri['FlowAmount'].values.dtype != 'float64':
        tri['FlowAmount'] = pd.to_numeric(tri['FlowAmount'], errors='coerce')
    # Drop 0 for FlowAmount
    tri = tri[tri['FlowAmount'] != 0]
    # Import reliability scores for TRI
    tri_reliability_table = reliability_table[
        reliability_table['Source'] == 'TRI'].copy()
    tri_reliability_table.drop('Source', axis=1, inplace=True)
    # Merge with reliability table to get reliability scores
    tri = pd.merge(tri, tri_reliability_table, left_on='Basis of Estimate',
                   right_on='Code', how='left')
    # Fill NAs with 5 for DQI reliability score
    tri['DQI Reliability Score'] = tri['DQI Reliability Score'].fillna(value=5)
    # Drop unneeded columns
    tri.drop('Basis of Estimate', axis=1, inplace=True)
    tri.drop('Code', axis=1, inplace=True)
    # Replace source info with Context
    source_cnxt = data_dir + 'TRI_ReleaseType_to_Compartment.csv'
    source_to_context = pd.read_csv(source_cnxt)
    tri = pd.merge(tri, source_to_context, how='left')
    # Convert units to ref mass unit of kg
    # Create a new field to put converted amount in
    tri['Amount_kg'] = 0.0
    tri = unit_convert(tri, 'Amount_kg', 'Unit', 'Pounds', lb_kg, 'FlowAmount')
    tri = unit_convert(tri, 'Amount_kg', 'Unit', 'Grams', g_kg, 'FlowAmount')
    # drop old amount and units
    tri.drop('FlowAmount', axis=1, inplace=True)
    tri.drop('Unit', axis=1, inplace=True)
    # Rename cols to match reference format
    tri.rename(columns={'Amount_kg': 'FlowAmount'}, inplace=True)
    tri.rename(columns={'DQI Reliability Score': 'ReliabilityScore'},
               inplace=True)
    # Drop release type
    tri.drop('ReleaseType', axis=1, inplace=True)
    # Group by facility, flow and compartment to aggregate different
    # release types
    grouping_vars = ['FacilityID', 'FlowName', 'CAS', 'Compartment']
    # Create a specialized weighted mean function to use for aggregation
    # of reliability (a sketch of such a helper appears after this function)
    wm = lambda x: weight_mean(x, tri.loc[x.index, "FlowAmount"])
    # Groupby and aggregate with your dictionary:
    tri = tri.groupby(grouping_vars).agg({
        'FlowAmount': 'sum',
        'ReliabilityScore': wm
    })
    tri = tri.reset_index()
    # VALIDATE
    tri_national_totals = pd.read_csv(
        data_dir + 'TRI_' + TRIyear + '_NationalTotals.csv',
        header=0, dtype={"FlowAmount": float})
    tri_national_totals['FlowAmount_kg'] = 0
    tri_national_totals = unit_convert(tri_national_totals, 'FlowAmount_kg',
                                       'Unit', 'Pounds', 0.4535924,
                                       'FlowAmount')
    # drop old amount and units
    tri_national_totals.drop('FlowAmount', axis=1, inplace=True)
    tri_national_totals.drop('Unit', axis=1, inplace=True)
    # Rename cols to match reference format
    tri_national_totals.rename(columns={'FlowAmount_kg': 'FlowAmount'},
                               inplace=True)
    validation_result = validate_inventory(tri, tri_national_totals,
                                           group_by='flow', tolerance=5.0)
    write_validation_result('TRI', TRIyear, validation_result)
    # FLOWS
    flows = tri.groupby(['FlowName', 'CAS',
                         'Compartment']).count().reset_index()
    # stack by compartment
    flowsdf = flows[['FlowName', 'CAS', 'Compartment']].copy()
    flowsdf['FlowID'] = flowsdf['CAS']
    # export chemicals
    # !!!Still needs CAS number and FlowID
    flowsdf.to_csv(output_dir + 'flow/' + 'TRI_' + TRIyear + '.csv',
                   index=False)
    # FLOW BY FACILITY
    # drop CAS
    tri.drop(columns=['CAS'], inplace=True)
    tri_file_name = 'TRI_' + TRIyear + '.csv'
    tri.to_csv(output_dir + 'flowbyfacility/' + tri_file_name, index=False)
    # FACILITY
    # Import and handle TRI facility data
    tri_facility = pd.read_csv(set_dir(data_dir + '../../../') + 'TRI/US_1a_' +
                               TRIyear + '.txt',
                               sep='\t', header=0, usecols=import_facility,
                               error_bad_lines=False, low_memory=False)
    # get unique facilities
    tri_facility_unique_ids = pd.unique(tri_facility['TRIFID'])
    tri_facility_unique_rows = tri_facility.drop_duplicates()
    # Use group by to eliminate additional ID duplicates
    # tri_facility_unique_rows_agg = tri_facility_unique_rows.groupby(['TRIFID'])
    # tri_facility_final = tri_facility_unique_rows_agg.aggregate()
    tri_facility_final = tri_facility_unique_rows
    # rename columns
    TRI_facility_name_crosswalk = {
        'TRIFID': 'FacilityID',
        'FACILITY NAME': 'FacilityName',
        'FACILITY STREET': 'Address',
        'FACILITY CITY': 'City',
        'FACILITY COUNTY': 'County',
        'FACILITY STATE': 'State',
        'FACILITY ZIP CODE': 'Zip',
        'PRIMARY NAICS CODE': 'NAICS',
        'LATITUDE': 'Latitude',
        'LONGITUDE': 'Longitude'
    }
    tri_facility_final.rename(columns=TRI_facility_name_crosswalk,
                              inplace=True)
    tri_facility_final.to_csv(output_dir + 'facility/' + 'TRI_' + TRIyear +
                              '.csv', index=False)
    # Record TRI metadata
    external_dir = set_dir(data_dir + '../../../')
    for file in Files:
        tri_csv = external_dir + 'TRI/US_' + file + '_' + TRIyear + '.txt'
        try:
            retrieval_time = os.path.getctime(tri_csv)
        except OSError:
            retrieval_time = time.time()
        tri_metadata['SourceAquisitionTime'] = time.ctime(retrieval_time)
        tri_metadata['SourceFileName'] = get_relpath(tri_csv)
        tri_metadata['SourceURL'] = tri_url
        tri_metadata['SourceVersion'] = tri_version
    write_metadata('TRI', TRIyear, tri_metadata)
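# weight_mean is not defined in this excerpt; a flow-amount-weighted mean
# consistent with how it is called above might look like the sketch below
# (an assumption, not the original stewi helper).
def _example_weight_mean(values, weights):
    import numpy as np
    if weights.sum() > 0:
        # average reliability scores weighted by the associated flow amounts
        return np.average(values, weights=weights)
    # fall back to an unweighted mean when all weights are zero
    return values.mean()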
    log, store_inventory, compile_source_metadata, read_source_metadata,\
    aggregate, set_stewi_meta
from stewi.validate import update_validationsets_sources, validate_inventory,\
    write_validation_result
from stewi.filter import apply_filters_to_inventory
import stewi.exceptions

try:
    from selenium import webdriver
    from webdriver_manager.chrome import ChromeDriverManager
except ImportError:
    log.error('Must install selenium and webdriver_manager for RCRAInfo. '
              'See install instructions for optional package '
              'installation or install them independently and retry.')

_config = config()['databases']['RCRAInfo']
EXT_DIR = 'RCRAInfo Data Files'
OUTPUT_PATH = Path(paths.local_path).joinpath(EXT_DIR)
RCRA_DATA_PATH = DATA_PATH / 'RCRAInfo'
DIR_RCRA_BY_YEAR = OUTPUT_PATH.joinpath('RCRAInfo_by_year')


def waste_description_cleaner(x):
    """Set BR-conversion placeholder descriptions to None."""
    if ('from br conversion' in x) or (x == 'From 1989 BR data'):
        x = None
    return x


def extracting_files(path_unzip, name):
    """Extract a downloaded zip archive into the same directory."""
    with zipfile.ZipFile(path_unzip.joinpath(name + '.zip')) as z:
        z.extractall(path_unzip)
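# Usage sketch: unzip a previously downloaded RCRAInfo archive in place; the
# archive base name below is an assumption about the downloaded file naming.
def _example_extract_archive():
    extracting_files(OUTPUT_PATH, 'BR_REPORTING_2019')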
import pandas as pd
import requests
from xml.dom import minidom
import time
import argparse
import warnings
from pathlib import Path

from stewi.globals import download_table, write_metadata, import_table, \
    DATA_PATH, get_reliability_table_for_source, set_stewi_meta, config,\
    store_inventory, paths, log, \
    compile_source_metadata, read_source_metadata, aggregate
from stewi.validate import update_validationsets_sources, validate_inventory,\
    write_validation_result
from stewi.formats import StewiFormat

_config = config()['databases']['GHGRP']
GHGRP_DATA_PATH = DATA_PATH / 'GHGRP'
EXT_DIR = 'GHGRP Data Files'
OUTPUT_PATH = Path(paths.local_path).joinpath(EXT_DIR)

# Flow codes that are reported in validation in CO2e
flows_CO2e = ['PFC', 'HFC', 'Other', 'Very_Short', 'HFE', 'Other_Full']

# define GWPs
# (these values are from IPCC's AR4, which is consistent with GHGRP
# methodology)
CH4GWP = 25
N2OGWP = 298
HFC23GWP = 14800

# define column groupings
ghgrp_cols = pd.read_csv(GHGRP_DATA_PATH.joinpath('ghgrp_columns.csv'))
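# The GWP factors above convert reported gas mass to CO2e for validation,
# e.g. 1 kg CH4 corresponds to 25 kg CO2e under AR4. A minimal sketch of that
# arithmetic; the column names here are placeholders, not the GHGRP table
# fields.
def _example_to_co2e(df):
    factors = {'CH4': CH4GWP, 'N2O': N2OGWP, 'HFC-23': HFC23GWP}
    # flows without a listed GWP (e.g. CO2 itself) keep a factor of 1
    df['FlowAmount_CO2e'] = df['FlowAmount'] * df['FlowName'].map(factors).fillna(1)
    return df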
import facilitymatcher.WriteFRSNAICSforStEWI as write_naics
from esupy.processed_data_mgmt import Paths, load_preprocessed_output,\
    write_df_to_file, write_metadata_to_file, read_source_metadata,\
    download_from_remote
from esupy.util import strip_file_extension

MODULEPATH = Path(__file__).resolve().parent
DATA_PATH = MODULEPATH / 'data'

paths = Paths()
paths.local_path = os.path.realpath(paths.local_path + "/facilitymatcher")
output_dir = paths.local_path
ext_folder = 'FRS Data Files'
FRSpath = paths.local_path + '/' + ext_folder

FRS_config = config(config_path=MODULEPATH)['databases']['FRS']

inventory_to_FRS_pgm_acronymn = FRS_config['program_dictionary']
stewi_inventories = list(inventory_to_FRS_pgm_acronymn.keys())


def set_facilitymatcher_meta(file_name, category):
    """Create a class of esupy FileMeta."""
    facilitymatcher_meta = set_stewi_meta(file_name, category)
    facilitymatcher_meta.tool = "facilitymatcher"
    return facilitymatcher_meta


def download_extract_FRS_combined_national(file=None):
    """Download and extract file from source to local directory."""
    url = FRS_config['url']
def test_chemical_matches():
    assert chemicalmatcher.get_matches_for_StEWI(
        config()['databases'].keys()) is not None
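# Illustrative only (not run as a test): the matcher can also be run for a
# subset of inventories; the acronyms below are example values.
def _example_chemical_matches_subset():
    return chemicalmatcher.get_matches_for_StEWI(['TRI', 'NEI'])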
import numpy as np
import pandas as pd
import argparse
import requests
import zipfile
import io
from pathlib import Path

from esupy.processed_data_mgmt import download_from_remote
from esupy.util import strip_file_extension
from stewi.globals import DATA_PATH, write_metadata, USton_kg, lb_kg,\
    log, store_inventory, config, read_source_metadata,\
    paths, aggregate, get_reliability_table_for_source, set_stewi_meta
from stewi.validate import update_validationsets_sources, validate_inventory,\
    write_validation_result

_config = config()['databases']['NEI']
EXT_DIR = 'NEI Data Files'
OUTPUT_PATH = Path(paths.local_path).joinpath(EXT_DIR)
NEI_DATA_PATH = DATA_PATH / 'NEI'


def read_data(year, file):
    """Read NEI data and return a dataframe based on identified columns.

    :param year : str, Year of NEI dataset for identifying field names
    :param file : str, File path containing NEI data (parquet).
    :returns df : DataFrame of NEI data from a single file
        with standardized column names.
    """
    nei_required_fields = pd.read_table(
        NEI_DATA_PATH.joinpath('NEI_required_fields.csv'), sep=',')