def write_validation_result(inventory_acronym, year, validation_df):
    """Write the validation result and associated metadata to local dir.

    :param inventory_acronym: str for inventory e.g. 'TRI'
    :param year: str for year e.g. '2016'
    :param validation_df: df returned from validate_inventory function
    """
    directory = Path(paths.local_path).joinpath('validation')
    directory.mkdir(parents=True, exist_ok=True)
    log.info(f'writing validation result to {directory}')
    validation_df.to_csv(directory.joinpath(f"{inventory_acronym}_{year}.csv"),
                         index=False)
    # Get metadata on validation dataset
    validation_set_info_table = read_ValidationSets_Sources()
    # Get record for year and source
    validation_set_info = validation_set_info_table[
        (validation_set_info_table['Inventory'] == inventory_acronym) &
        (validation_set_info_table['Year'] == year)]
    if len(validation_set_info) != 1:
        log.error('no validation metadata found')
        return
    # Convert to Series
    validation_set_info = validation_set_info.iloc[0, ]
    # Use the same format as inventory metadata to describe the validation set data
    validation_metadata = dict(source_metadata)
    validation_metadata['SourceFileName'] = validation_set_info['Name']
    validation_metadata['SourceVersion'] = validation_set_info['Version']
    validation_metadata['SourceURL'] = validation_set_info['URL']
    validation_metadata['SourceAcquisitionTime'] = validation_set_info['Date Acquired']
    validation_metadata['Criteria'] = validation_set_info['Criteria']
    # Write metadata to file
    write_metadata(inventory_acronym + '_' + year, validation_metadata,
                   datatype="validation")
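# Hedged usage sketch (not part of the source): how an inventory module would
# typically pair validate_inventory() with write_validation_result(). The toy
# DataFrames and the 'TRI'/'2016' labels are assumptions for illustration only;
# real callers pass the processed inventory and a national-totals table.
def _example_validation_workflow():
    import pandas as pd
    inventory_df = pd.DataFrame({'FlowName': ['Lead'], 'Compartment': ['air'],
                                 'FlowAmount': [100.0]})
    reference_df = pd.DataFrame({'FlowName': ['Lead'], 'Compartment': ['air'],
                                 'FlowAmount': [102.0]})
    # Compare inventory totals against the reference totals by flow
    result = validate_inventory(inventory_df, reference_df,
                                group_by='flow', tolerance=5.0)
    # Persist the comparison and its metadata alongside the inventory outputs
    write_validation_result('TRI', '2016', result)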
def generate_metadata(year, files, datatype='inventory'):
    """Get metadata and write to .json."""
    if datatype == 'source':
        source_path = [str(OUTPUT_PATH.joinpath(f'US_{p}_{year}.csv'))
                       for p in files]
        source_meta = compile_source_metadata(source_path, _config, year)
        source_meta['SourceType'] = 'Zip file'
        tri_url = _config['url']
        link_zip_TRI = link_zip(tri_url, _config['queries'], year)
        regex = r'https.*/(.*(?=/\w*.zip))'
        # tri_version = link_zip_TRI.split('/')[-2]
        try:
            tri_version = re.search(regex, link_zip_TRI).group(1)
        except AttributeError:  # no match found from regex
            tri_version = 'last'
        source_meta['SourceVersion'] = tri_version
        write_metadata('TRI_' + year, source_meta,
                       category=EXT_DIR, datatype='source')
    else:
        source_meta = read_source_metadata(paths,
                                           set_stewi_meta('TRI_' + year, EXT_DIR),
                                           force_JSON=True)['tool_meta']
        write_metadata('TRI_' + year, source_meta, datatype=datatype)
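# Hedged sketch (not in the source): how the version-extraction regex above
# behaves on a representative TRI zip URL. The URL is made up for illustration;
# the comment in the function suggests the version is the second-to-last path
# component, which is what the lookahead captures.
def _example_tri_version_regex():
    import re
    url = 'https://www3.epa.gov/tri/current/US_2016_v15/US_1a_2016.zip'  # hypothetical
    match = re.search(r'https.*/(.*(?=/\w*.zip))', url)
    return match.group(1) if match else 'last'  # -> 'US_2016_v15'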
def generate_metadata(year, datatype='inventory'):
    """Get metadata and write to .json."""
    nei_file_path = _config[year]['file_name']
    if datatype == 'inventory':
        source_meta = []
        for file in nei_file_path:
            meta = set_stewi_meta(strip_file_extension(file), EXT_DIR)
            source_meta.append(read_source_metadata(paths, meta, force_JSON=True))
        write_metadata('NEI_' + year, source_meta, datatype=datatype)
def generate_metadata(year, datatype='inventory'):
    """Generate metadata and write to json for datatypes 'inventory' or 'source'."""
    if datatype == 'source':
        source_path = str(OUTPUT_PATH.joinpath(_config[year]['file_name']))
        source_meta = compile_source_metadata(source_path, _config, year)
        write_metadata('eGRID_' + year, source_meta,
                       category=EXT_DIR, datatype='source')
    else:
        source_meta = read_source_metadata(paths,
                                           set_stewi_meta('eGRID_' + year, EXT_DIR),
                                           force_JSON=True)['tool_meta']
        write_metadata('eGRID_' + year, source_meta, datatype=datatype)
def generate_metadata(year, datatype='inventory'):
    """Generate metadata and write to json for datatypes 'inventory' or 'source'."""
    if datatype == 'source':
        source_path = str(OUTPUT_PATH.joinpath(year))
        source_meta = compile_source_metadata(source_path, _config, year)
        source_meta['SourceType'] = 'Web Service'
        write_metadata(f"DMR_{year}", source_meta,
                       category=EXT_DIR, datatype='source')
    else:
        source_meta = read_source_metadata(paths,
                                           set_stewi_meta(f"DMR_{year}", EXT_DIR),
                                           force_JSON=True)['tool_meta']
        write_metadata(f"DMR_{year}", source_meta, datatype=datatype)
def generate_metadata(year, files, datatype='inventory'):
    """Get metadata and write to .json."""
    if datatype == 'source':
        source_path = [str(p) for p in files]
        source_meta = compile_source_metadata(source_path, _config, year)
        source_meta['SourceType'] = 'Zip file'
        source_meta['SourceURL'] = _config['url']
        write_metadata('RCRAInfo_' + str(year), source_meta,
                       category=EXT_DIR, datatype='source')
    else:
        source_meta = read_source_metadata(paths,
                                           set_stewi_meta('RCRAInfo_' + year, EXT_DIR),
                                           force_JSON=True)['tool_meta']
        write_metadata('RCRAInfo_' + year, source_meta, datatype=datatype)
def generate_metadata(year, m, datatype='inventory'):
    """Get metadata and write to .json."""
    if datatype == 'source':
        source_path = m.filename
        source_meta = compile_source_metadata(source_path, _config, year)
        source_meta['SourceType'] = m.filetype
        source_meta['SourceURL'] = m.url
        source_meta['SourceAcquisitionTime'] = m.time
        write_metadata('GHGRP_' + year, source_meta,
                       category=EXT_DIR, datatype='source')
    else:
        source_meta = read_source_metadata(paths,
                                           set_stewi_meta('GHGRP_' + year, EXT_DIR),
                                           force_JSON=True)['tool_meta']
        write_metadata('GHGRP_' + year, source_meta, datatype=datatype)
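# Hedged usage sketch (not in the source): the generate_metadata() variants
# above share one pattern — record source metadata once after download, then
# read it back and re-write it as inventory metadata after processing. The
# argument values below ('2016', the file list) are assumptions, using the
# TRI-style (year, files, datatype) signature for illustration.
def _example_metadata_passes():
    year = '2016'
    files = ['1a', '3a']  # hypothetical TRI file identifiers
    # Pass 1: after downloading the raw data, capture source metadata
    generate_metadata(year, files, datatype='source')
    # Pass 2: after writing the processed inventory, reuse the stored source
    # metadata to describe the inventory output
    generate_metadata(year, files, datatype='inventory')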
def Generate_TRI_files_csv(TRIyear, Files):
    _config = config()['databases']['TRI']
    tri_url = _config['url']
    link_zip_TRI = link_zip(tri_url, _config['queries'], TRIyear)
    regex = re.compile(r'https://www3.epa.gov/tri/current/US_\d{4}_?(\d*)\.zip')
    tri_version = re.search(regex, link_zip_TRI).group(1)
    if not tri_version:
        tri_version = 'last'
    tri_required_fields = imp_fields(data_dir + 'TRI_required_fields.txt')
    keys = imp_fields(data_dir + 'TRI_keys.txt')  # the same function can be used
    import_facility = tri_required_fields[0:10]
    values = list()
    for p in range(len(keys)):
        start = 13 + 2 * p
        end = start + 1
        values.append(concat_req_field(tri_required_fields[start:end + 1]))
    # Create a dictionary that has the import fields for each release type
    # to use in the import process
    import_dict = dict_create(keys, values)
    # Build the TRI DataFrame
    tri = import_TRI_by_release_type(import_dict, TRIyear)
    # drop NA for Amount, but leave in zeros
    tri = tri.dropna(subset=['FlowAmount'])
    tri = strip_coln_white_space(tri, 'Basis of Estimate')
    # Convert to float if there are errors - be careful with this line
    if tri['FlowAmount'].values.dtype != 'float64':
        tri['FlowAmount'] = pd.to_numeric(tri['FlowAmount'], errors='coerce')
    # Drop 0 for FlowAmount
    tri = tri[tri['FlowAmount'] != 0]
    # Import reliability scores for TRI
    tri_reliability_table = reliability_table[reliability_table['Source'] == 'TRI']
    tri_reliability_table.drop('Source', axis=1, inplace=True)
    # Merge with reliability table to get scores
    tri = pd.merge(tri, tri_reliability_table, left_on='Basis of Estimate',
                   right_on='Code', how='left')
    # Fill NAs with 5 for DQI reliability score
    tri['DQI Reliability Score'] = tri['DQI Reliability Score'].fillna(value=5)
    # Drop unneeded columns
    tri.drop('Basis of Estimate', axis=1, inplace=True)
    tri.drop('Code', axis=1, inplace=True)
    # Replace source info with Context
    source_cnxt = data_dir + 'TRI_ReleaseType_to_Compartment.csv'
    source_to_context = pd.read_csv(source_cnxt)
    tri = pd.merge(tri, source_to_context, how='left')
    # Convert units to ref mass unit of kg
    # Create a new field to put converted amount in
    tri['Amount_kg'] = 0.0
    tri = unit_convert(tri, 'Amount_kg', 'Unit', 'Pounds', lb_kg, 'FlowAmount')
    tri = unit_convert(tri, 'Amount_kg', 'Unit', 'Grams', g_kg, 'FlowAmount')
    # drop old amount and units
    tri.drop('FlowAmount', axis=1, inplace=True)
    tri.drop('Unit', axis=1, inplace=True)
    # Rename cols to match reference format
    tri.rename(columns={'Amount_kg': 'FlowAmount'}, inplace=True)
    tri.rename(columns={'DQI Reliability Score': 'ReliabilityScore'}, inplace=True)
    # Drop release type
    tri.drop('ReleaseType', axis=1, inplace=True)
    # Group by facility, flow and compartment to aggregate different release types
    grouping_vars = ['FacilityID', 'FlowName', 'CAS', 'Compartment']
    # Create a specialized weighted mean function to use for aggregation of reliability
    wm = lambda x: weight_mean(x, tri.loc[x.index, "FlowAmount"])
    # Groupby and aggregate with your dictionary:
    tri = tri.groupby(grouping_vars).agg({'FlowAmount': 'sum',
                                          'ReliabilityScore': wm})
    tri = tri.reset_index()
    # VALIDATE
    tri_national_totals = pd.read_csv(data_dir + 'TRI_' + TRIyear +
                                      '_NationalTotals.csv',
                                      header=0, dtype={"FlowAmount": float})
    tri_national_totals['FlowAmount_kg'] = 0
    tri_national_totals = unit_convert(tri_national_totals, 'FlowAmount_kg',
                                       'Unit', 'Pounds', 0.4535924, 'FlowAmount')
    # drop old amount and units
    tri_national_totals.drop('FlowAmount', axis=1, inplace=True)
    tri_national_totals.drop('Unit', axis=1, inplace=True)
    # Rename cols to match reference format
    tri_national_totals.rename(columns={'FlowAmount_kg': 'FlowAmount'}, inplace=True)
    validation_result = validate_inventory(tri, tri_national_totals,
                                           group_by='flow', tolerance=5.0)
    write_validation_result('TRI', TRIyear, validation_result)
    # FLOWS
    flows = tri.groupby(['FlowName', 'CAS', 'Compartment']).count().reset_index()
    # stack by compartment
    flowsdf = flows[['FlowName', 'CAS', 'Compartment']]
    flowsdf['FlowID'] = flowsdf['CAS']
    # export chemicals
    # !!!Still needs CAS number and FlowID
    flowsdf.to_csv(output_dir + 'flow/' + 'TRI_' + TRIyear + '.csv', index=False)
    # FLOW BY FACILITY
    # drop CAS
    tri.drop(columns=['CAS'], inplace=True)
    tri_file_name = 'TRI_' + TRIyear + '.csv'
    tri.to_csv(output_dir + 'flowbyfacility/' + tri_file_name, index=False)
    # FACILITY
    # Import and handle TRI facility data
    tri_facility = pd.read_csv(set_dir(data_dir + '../../../') + 'TRI/US_1a_' +
                               TRIyear + '.txt',
                               sep='\t', header=0, usecols=import_facility,
                               error_bad_lines=False, low_memory=False)
    # get unique facilities
    tri_facility_unique_ids = pd.unique(tri_facility['TRIFID'])
    tri_facility_unique_rows = tri_facility.drop_duplicates()
    # Use group by to eliminate additional ID duplicates
    # tri_facility_unique_rows_agg = tri_facility_unique_rows.groupby(['TRIFID'])
    # tri_facility_final = tri_facility_unique_rows_agg.aggregate()
    tri_facility_final = tri_facility_unique_rows
    # rename columns
    TRI_facility_name_crosswalk = {
        'TRIFID': 'FacilityID',
        'FACILITY NAME': 'FacilityName',
        'FACILITY STREET': 'Address',
        'FACILITY CITY': 'City',
        'FACILITY COUNTY': 'County',
        'FACILITY STATE': 'State',
        'FACILITY ZIP CODE': 'Zip',
        'PRIMARY NAICS CODE': 'NAICS',
        'LATITUDE': 'Latitude',
        'LONGITUDE': 'Longitude',
    }
    tri_facility_final.rename(columns=TRI_facility_name_crosswalk, inplace=True)
    tri_facility_final.to_csv(output_dir + 'facility/' + 'TRI_' + TRIyear + '.csv',
                              index=False)
    # Record TRI metadata
    external_dir = set_dir(data_dir + '../../../')
    for file in Files:
        tri_csv = external_dir + 'TRI/US_' + file + '_' + TRIyear + '.txt'
        try:
            retrieval_time = os.path.getctime(tri_csv)
        except:
            retrieval_time = time.time()
        tri_metadata['SourceAquisitionTime'] = time.ctime(retrieval_time)
        tri_metadata['SourceFileName'] = get_relpath(tri_csv)
        tri_metadata['SourceURL'] = tri_url
        tri_metadata['SourceVersion'] = tri_version
        write_metadata('TRI', TRIyear, tri_metadata)
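# Hedged sketch (not in the source): a standalone illustration of the
# amount-weighted reliability aggregation used in Generate_TRI_files_csv above,
# with numpy.average standing in for the package's weight_mean() helper. The
# data values are made up.
def _example_weighted_reliability():
    import numpy as np
    import pandas as pd
    df = pd.DataFrame({'FacilityID': ['F1', 'F1'],
                       'FlowName': ['Lead', 'Lead'],
                       'Compartment': ['air', 'air'],
                       'FlowAmount': [90.0, 10.0],
                       'ReliabilityScore': [1, 5]})
    # Weight each record's reliability score by its flow amount so that
    # larger releases dominate the aggregated score
    wm = lambda x: np.average(x, weights=df.loc[x.index, 'FlowAmount'])
    agg = (df.groupby(['FacilityID', 'FlowName', 'Compartment'])
             .agg({'FlowAmount': 'sum', 'ReliabilityScore': wm})
             .reset_index())
    return agg  # ReliabilityScore for F1/Lead/air is 1.4 here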
def Generate_RCRAInfo_files_csv(report_year, RCRAInfopath, RCRAfInfoflatfileURL):
    RCRAInfoBRtextfile = (RCRAInfopath + 'RCRAInfo_by_year/br_reporting_' +
                          report_year + '.txt')
    # Get file column widths
    linewidthsdf = pd.read_csv(data_dir + 'RCRA_FlatFile_LineComponents.csv')
    BRwidths = linewidthsdf['Size']
    # Metadata
    BR_meta = globals.inventory_metadata
    # Get columns to keep
    RCRAfieldstokeepdf = pd.read_csv(data_dir + 'RCRA_required_fields.txt',
                                     header=None)
    RCRAfieldstokeep = list(RCRAfieldstokeepdf[0])
    # Get total row count of the file
    with open(RCRAInfoBRtextfile, 'rb') as rcrafile:
        row_count = sum([1 for row in rcrafile]) - 1
    BR = pd.read_csv(RCRAInfoBRtextfile, header=0, usecols=RCRAfieldstokeep,
                     sep='\t', low_memory=False, error_bad_lines=False,
                     encoding='ISO-8859-1')
    # Checking the Waste Generation Data Health
    BR = BR[pd.to_numeric(BR['Generation Tons'], errors='coerce').notnull()]
    BR['Generation Tons'] = BR['Generation Tons'].astype(float)
    print(BR.head())
    # Pickle as a backup
    # BR.to_pickle('work/BR_' + report_year + '.pk')
    # Read in to start from a pickle
    # BR = pd.read_pickle('work/BR_' + report_year + '.pk')
    print(len(BR))
    # 2001:838497  2003:770727  2005:697706  2007:765764  2009:919906
    # 2011:1590067 2013:1581899 2015:2053108 2017:1446613
    # Validate correct import - number of states should be 50+
    # (includes PR and territories)
    states = BR['State'].unique()
    print(len(states))
    # 2001:46 2003:46 2005:46 2007:46 2009:46 2011:56 2013:56 2015:57 2017:45
    # Filtering to remove double counting and non-BR waste records
    # Do not double count generation from sources that receive it only
    # Check sum of tons and number of records after each filter step
    # See EPA 2013. Biennial Report Analytical Methodologies: Data Selection
    # Logic and Assumptions used to Analyze the Biennial Report.
    # Office of Resource Conservation and Recovery
    # Drop lines with source code G61
    BR = BR[BR['Source Code'] != 'G61']
    print(len(BR))
    # 2001:798905  2003:722958  2005:650413  2007:722383  2009:879845
    # 2011:1496275 2013:1492245 2015:1959883 2017:1375562
    # Only include wastes that are included in the National Biennial Report
    BR = BR[BR['Generator ID Included in NBR'] == 'Y']
    print(len(BR))
    # 2001:734349  2003:629802  2005:482345  2007:598748  2009:704233
    # 2011:1284796 2013:1283457 2015:1759711 2017:1269987
    BR = BR[BR['Generator Waste Stream Included in NBR'] == 'Y']
    print(len(BR))
    # 2001:172539 2003:167488 2005:152036 2007:151729 2009:142918
    # 2011:209342 2013:256978 2015:288980 2017:202842
    # Remove imported wastes, source codes G63-G75
    ImportSourceCodes = pd.read_csv(data_dir + 'RCRAImportSourceCodes.txt',
                                    header=None)
    ImportSourceCodes = ImportSourceCodes[0].tolist()
    SourceCodesPresent = BR['Source Code'].unique().tolist()
    SourceCodestoKeep = []
    for item in SourceCodesPresent:
        if item not in ImportSourceCodes:
            # print(item)
            SourceCodestoKeep.append(item)
    BR = BR[BR['Source Code'].isin(SourceCodestoKeep)]
    print(len(BR))
    # 2001:172539 2003:167264 2005:151638 2007:151695 2009:142825
    # 2011:209306 2013:256844 2015:286813 2017:202513
    # Reassign the NAICS to a string
    BR['NAICS'] = BR['Primary NAICS'].astype('str')
    BR.drop('Primary NAICS', axis=1, inplace=True)
    # Create field for DQI Reliability Score with fixed value from CSV
    # Currently generating a warning
    reliability_table = globals.reliability_table
    rcrainfo_reliability_table = reliability_table[reliability_table['Source'] == 'RCRAInfo']
    rcrainfo_reliability_table.drop('Source', axis=1, inplace=True)
    BR['ReliabilityScore'] = float(rcrainfo_reliability_table['DQI Reliability Score'])
    # Create a new field to put converted amount in
    BR['Amount_kg'] = 0.0
    # Convert amounts from tons.
    # Note this could be replaced with a conversion utility
    BR['Amount_kg'] = USton_kg * BR['Generation Tons']
    # Read in waste descriptions
    linewidthsdf = pd.read_csv(data_dir + 'RCRAInfo_LU_WasteCode_LineComponents.csv')
    widths = linewidthsdf['Size']
    names = linewidthsdf['Data Element Name']
    File_lu = [file for file in os.listdir(RCRAInfopath)
               if 'lu_waste_code' in file.lower()][0]
    wastecodesfile = RCRAInfopath + File_lu
    WasteCodesTest = pd.read_fwf(wastecodesfile, widths=widths, header=None,
                                 names=names, nrows=10)
    WasteCodes = pd.read_fwf(wastecodesfile, widths=widths, header=None,
                             names=names)
    WasteCodes = WasteCodes[['Waste Code', 'Code Type', 'Waste Code Description']]
    # Remove rows where any field is na (description is missing)
    WasteCodes.dropna(inplace=True)
    # Bring in form codes
    # Replace form code with the code name
    form_code_name_file = data_dir + 'RCRA_LU_FORM_CODE.csv'
    form_code_table_cols_needed = ['FORM_CODE', 'FORM_CODE_NAME']
    form_code_name_df = pd.read_csv(form_code_name_file, header=0,
                                    usecols=form_code_table_cols_needed)
    # Merge waste codes with BR records
    BR = pd.merge(BR, WasteCodes, left_on='Waste Code Group',
                  right_on='Waste Code', how='left')
    # Rename code type to make it clear
    BR.rename(columns={'Code Type': 'Waste Code Type'}, inplace=True)
    # Merge form codes with BR
    BR = pd.merge(BR, form_code_name_df, left_on='Form Code',
                  right_on='FORM_CODE', how='left')
    # Drop duplicates from merge
    BR.drop(columns=['FORM_CODE', 'Waste Code Group'], inplace=True)
    # Set flow name to Waste Code Description
    BR['FlowName'] = BR['Waste Code Description']
    # BR['FlowNameSource'] = 'Waste Code Description'
    # If a useful Waste Code Description is present, use it
    BR['FlowName'] = BR['FlowName'].apply(waste_description_cleaner)
    # Check unique flow names
    pd.unique(BR['FlowName'])
    # If there is no useful waste code, fill it with the Form Code Name
    # Find the NAs in FlowName and then give that source of Form Code
    BR.loc[BR['FlowName'].isnull(), 'FlowNameSource'] = 'Form Code'
    # Now for those source name rows that are blank, tell it it's a waste code
    BR.loc[BR['FlowNameSource'].isnull(), 'FlowNameSource'] = 'Waste Code'
    # Set FlowIDs to the appropriate code
    BR.loc[BR['FlowName'].isnull(), 'FlowID'] = BR['Form Code']
    BR.loc[BR['FlowID'].isnull(), 'FlowID'] = BR['Waste Code']
    # Now finally fill names that are blank with the form code name
    BR['FlowName'].fillna(BR['FORM_CODE_NAME'], inplace=True)
    # Drop unneeded fields
    BR.drop('Generation Tons', axis=1, inplace=True)
    BR.drop('Generator ID Included in NBR', axis=1, inplace=True)
    BR.drop('Generator Waste Stream Included in NBR', axis=1, inplace=True)
    BR.drop('Source Code', axis=1, inplace=True)
    BR.drop('Management Method', axis=1, inplace=True)
    BR.drop('Waste Description', axis=1, inplace=True)
    BR.drop('Waste Code Description', axis=1, inplace=True)
    BR.drop('FORM_CODE_NAME', axis=1, inplace=True)
    # Rename cols used by multiple tables
    BR.rename(columns={'Handler ID': 'FacilityID'}, inplace=True)
    # rename new name
    BR.rename(columns={'Amount_kg': 'FlowAmount'}, inplace=True)
    # Prepare flows file
    flows = BR[['FlowName', 'FlowID', 'FlowNameSource']]
    # Drop duplicates
    flows = flows.drop_duplicates()
    flows['Compartment'] = 'Waste'
    flows['Unit'] = 'kg'
    # Sort them by the flow names
    flows.sort_values(by='FlowName', axis=0, inplace=True)
    # Export them
    flows.to_csv(output_dir + 'flow/RCRAInfo_' + report_year + '.csv', index=False)
    # Prepare facilities file
    facilities = BR[['FacilityID', 'Handler Name', 'Location Street Number',
                     'Location Street 1', 'Location Street 2', 'Location City',
                     'Location State', 'Location Zip', 'County Name', 'NAICS']]
    # Drop duplicates
    facilities.drop_duplicates(inplace=True)
    facilities['Location Street Number'] = facilities['Location Street Number'].apply(str)
    facilities['Location Street Number'].fillna('', inplace=True)
    facilities['Address'] = (facilities['Location Street Number'] + ' ' +
                             facilities['Location Street 1'] + ' ' +
                             facilities['Location Street 2'])
    facilities.drop(columns=['Location Street Number', 'Location Street 1',
                             'Location Street 2'], inplace=True)
    facilities.rename(columns={'Primary NAICS': 'NAICS',
                               'Handler Name': 'FacilityName',
                               'Location City': 'City',
                               'Location State': 'State',
                               'Location Zip': 'Zip',
                               'County Name': 'County'},
                      inplace=True)
    facilities.to_csv(output_dir + 'facility/RCRAInfo_' + report_year + '.csv',
                      index=False)
    # Prepare flow by facility
    flowbyfacility = BR.groupby(['FacilityID', 'ReliabilityScore',
                                 'FlowName'])['FlowAmount'].sum().reset_index()
    # VALIDATION
    BR_national_total = pd.read_csv(data_dir + 'RCRAInfo_' + report_year +
                                    '_NationalTotals.csv',
                                    header=0, dtype={"FlowAmount": float})
    BR_national_total['FlowAmount_kg'] = 0
    BR_national_total = unit_convert(BR_national_total, 'FlowAmount_kg', 'Unit',
                                     'Tons', 907.18474, 'FlowAmount')
    BR_national_total.drop('FlowAmount', axis=1, inplace=True)
    BR_national_total.drop('Unit', axis=1, inplace=True)
    # Rename cols to match reference format
    BR_national_total.rename(columns={'FlowAmount_kg': 'FlowAmount'}, inplace=True)
    # Validate total waste generated against national totals
    sum_of_flowbyfacility = flowbyfacility['FlowAmount'].sum()
    sum_of_flowbyfacility_df = pd.DataFrame({'FlowAmount': [sum_of_flowbyfacility],
                                             'FlowName': 'ALL',
                                             'Compartment': 'waste'})
    validation_df = validate_inventory(sum_of_flowbyfacility_df, BR_national_total,
                                       group_by='flow')
    write_validation_result('RCRAInfo', report_year, validation_df)
    # Export to csv
    flowbyfacility.to_csv(output_dir + 'flowbyfacility/RCRAInfo_' + report_year +
                          '.csv', index=False)
    # Record metadata
    try:
        retrieval_time = os.path.getctime(RCRAInfoBRtextfile)
    except:
        retrieval_time = time.time()
    BR_meta['SourceAquisitionTime'] = time.ctime(retrieval_time)
    BR_meta['SourceFileName'] = RCRAInfoBRtextfile
    BR_meta['SourceURL'] = RCRAfInfoflatfileURL
    write_metadata('RCRAInfo', report_year, BR_meta)
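# Hedged sketch (not in the source): a standalone illustration of the
# unit_convert() pattern used throughout these functions to normalize amounts
# to kg. The convert() helper here is a stand-in written for this example; the
# package's own unit_convert() is what the functions above call, and the toy
# data values are made up.
def _example_unit_convert():
    import pandas as pd
    USton_kg = 907.18474  # short (US) ton to kilograms
    lb_kg = 0.4535924     # pound to kilograms

    def convert(df, new_col, unit_col, unit, factor, amount_col):
        # For rows reported in `unit`, fill `new_col` with factor * amount
        df.loc[df[unit_col] == unit, new_col] = factor * df[amount_col]
        return df

    totals = pd.DataFrame({'FlowName': ['ALL', 'Lead'],
                           'FlowAmount': [100.0, 2000.0],
                           'Unit': ['Tons', 'Pounds']})
    totals['FlowAmount_kg'] = 0.0
    totals = convert(totals, 'FlowAmount_kg', 'Unit', 'Tons', USton_kg, 'FlowAmount')
    totals = convert(totals, 'FlowAmount_kg', 'Unit', 'Pounds', lb_kg, 'FlowAmount')
    return totals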
    nei_retrieval_time = time.ctime(os.path.getctime(point_1_path))
    if nei_retrieval_time is not None:
        NEI_meta['SourceAquisitionTime'] = nei_retrieval_time
    NEI_meta['SourceFileName'] = get_relpath(point_1_path)
    NEI_meta['SourceURL'] = 'http://eis.epa.gov'
    # extract version from filepath using regex
    import re
    pattern = 'V[0-9]'
    version = re.search(pattern, point_1_path, flags=re.IGNORECASE)
    if version is not None:
        NEI_meta['SourceVersion'] = version.group(0)
    # Write metadata to json
    write_metadata('NEI', report_year, NEI_meta)
    # VALIDATE
    nei_national_totals = pd.read_csv(data_dir + 'NEI_' + report_year +
                                      '_NationalTotals.csv',
                                      header=0, dtype={"FlowAmount": float})
    nei_national_totals['FlowAmount_kg'] = 0
    nei_national_totals = unit_convert(nei_national_totals, 'FlowAmount_kg',
                                       'Unit', 'LB', lb_kg, 'FlowAmount')
    nei_national_totals = unit_convert(nei_national_totals, 'FlowAmount_kg',
                                       'Unit', 'TON', USton_kg, 'FlowAmount')
    # drop old amount and units
    nei_national_totals.drop('FlowAmount', axis=1, inplace=True)
    nei_national_totals.drop('Unit', axis=1, inplace=True)
    # Rename cols to match reference format