Example No. 1
def validate_eGRID(year, flowbyfac):
    """Validate eGRID flowbyfacility data against national totals."""
    validation_file = DATA_PATH.joinpath(f"eGRID_{year}_NationalTotals.csv")
    if not validation_file.is_file():
        generate_national_totals(year)
    log.info('validating data against national totals')
    egrid_national_totals = pd.read_csv(validation_file, header=0,
                                        dtype={"FlowAmount": float})
    egrid_national_totals = unit_convert(
        egrid_national_totals, 'FlowAmount', 'Unit', 'lbs',
        lb_kg, 'FlowAmount')
    egrid_national_totals = unit_convert(
        egrid_national_totals, 'FlowAmount', 'Unit', 'tons',
        USton_kg, 'FlowAmount')
    egrid_national_totals = unit_convert(
        egrid_national_totals, 'FlowAmount', 'Unit', 'MMBtu',
        MMBtu_MJ, 'FlowAmount')
    egrid_national_totals = unit_convert(
        egrid_national_totals, 'FlowAmount', 'Unit', 'MWh',
        MWh_MJ, 'FlowAmount')
    # drop old unit
    egrid_national_totals.drop('Unit', axis=1, inplace=True)
    validation_result = validate_inventory(flowbyfac, egrid_national_totals,
                                           group_by='flow', tolerance=5.0)
    write_validation_result('eGRID', year, validation_result)
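
All of these examples funnel unit handling through a unit_convert helper that is not shown on this page. Judging from the calls above, it selects the rows whose unit column matches the given unit and writes the source amount times the conversion factor into the target column. The following is a minimal sketch of that assumed behavior, not the package's actual implementation; the constant lb_kg is an illustrative value.

import pandas as pd

lb_kg = 0.4535924  # assumed pounds-to-kilograms factor, for illustration

def unit_convert(df, coln1, coln2, unit, conversion_factor, coln3):
    # Where df[coln2] equals `unit`, write conversion_factor * df[coln3]
    # into df[coln1]; rows with other units are left untouched.
    df.loc[df[coln2] == unit, coln1] = conversion_factor * df[coln3]
    return df

# Usage mirroring Example No. 1: convert the pound rows of a totals table
totals = pd.DataFrame({'FlowAmount': [100.0, 50.0], 'Unit': ['lbs', 'MMBtu']})
totals = unit_convert(totals, 'FlowAmount', 'Unit', 'lbs', lb_kg, 'FlowAmount')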
Example No. 2
def validate_state_totals(df, year):
    """Generate validation by state, sums across species.

    Details on results by state can be found in the search results help website
    https://echo.epa.gov/help/loading-tool/water-pollution-search/search-results-help-dmr
    """
    filepath = DATA_PATH.joinpath(f"DMR_{year}_StateTotals.csv")
    if not filepath.is_file():
        download_state_totals_validation(year)
    log.info('validating against state totals')
    reference_df = pd.read_csv(filepath)
    reference_df['FlowAmount'] = 0.0
    reference_df = unit_convert(reference_df, 'FlowAmount',
                                'Unit', 'lb', lb_kg, 'Amount')
    reference_df = reference_df[['FlowName', 'State', 'FlowAmount']]

    # to match the state totals, only compare NPD facilities, and remove some flows
    flow_exclude = pd.read_csv(DMR_DATA_PATH.joinpath('DMR_state_filter_list.csv'))
    state_flow_exclude_list = flow_exclude['POLLUTANT_DESC'].to_list()

    dmr_by_state = df[~df['FlowName'].isin(state_flow_exclude_list)]
    dmr_by_state = dmr_by_state[dmr_by_state['PermitTypeCode'] == 'NPD']

    dmr_by_state = dmr_by_state[['State', 'FlowAmount']]
    dmr_by_state = dmr_by_state.groupby('State').sum().reset_index()
    dmr_by_state['FlowName'] = 'All'
    validation_df = validate_inventory(dmr_by_state, reference_df,
                                       group_by="state")
    write_validation_result('DMR', year, validation_df)
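
validate_inventory also comes from the surrounding package. Conceptually, the validation step sums the generated inventory over the chosen grouping, joins it to the reference totals, and computes the percent difference against the tolerance. A rough sketch under those assumptions (column names follow the examples; the real function's output columns may differ):

import pandas as pd

def validate_inventory(inventory_df, reference_df, group_by='flow', tolerance=5.0):
    # Sum the inventory by flow or by state and compare against reference totals.
    key = 'FlowName' if group_by == 'flow' else 'State'
    inv = inventory_df.groupby(key, as_index=False)['FlowAmount'].sum()
    result = inv.merge(reference_df[[key, 'FlowAmount']], on=key, how='outer',
                       suffixes=('_inventory', '_reference'))
    result['PercentDifference'] = (
        100 * (result['FlowAmount_inventory'] - result['FlowAmount_reference'])
        / result['FlowAmount_reference'])
    result['WithinTolerance'] = result['PercentDifference'].abs() <= tolerance
    return result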
Example No. 3
def validate_national_totals(inv, TRIyear):
    log.info('validating data against national totals')
    filename = DATA_PATH.joinpath(f'TRI_{TRIyear}_NationalTotals.csv')
    if filename.is_file():
        tri_national_totals = pd.read_csv(filename,
                                          header=0,
                                          dtype={"FlowAmount": float})
        tri_national_totals['FlowAmount_kg'] = 0
        tri_national_totals = unit_convert(tri_national_totals,
                                           'FlowAmount_kg', 'Unit', 'Pounds',
                                           lb_kg, 'FlowAmount')
        # drop old amount and units
        tri_national_totals.drop('FlowAmount', axis=1, inplace=True)
        tri_national_totals.drop('Unit', axis=1, inplace=True)
        # Rename cols to match reference format
        tri_national_totals.rename(columns={'FlowAmount_kg': 'FlowAmount'},
                                   inplace=True)
        inv = map_to_fedefl(inv)
        if inv is not None:
            validation_result = validate_inventory(inv,
                                                   tri_national_totals,
                                                   group_by='flow',
                                                   tolerance=5.0)
            write_validation_result('TRI', TRIyear, validation_result)
    else:
        log.warning(f'validation file for TRI_{TRIyear} does not exist. '
                    'Please run option B')
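
Each example closes with write_validation_result, which persists the comparison for later review. A minimal stand-in, assuming the result DataFrame is simply written to a CSV named after the inventory acronym and year (the path is illustrative, not the package's real output location):

from pathlib import Path

def write_validation_result(inventory_acronym, year, validation_df):
    # Illustrative output location only; the real package defines its own paths.
    out_dir = Path('validation')
    out_dir.mkdir(parents=True, exist_ok=True)
    validation_df.to_csv(out_dir / f'{inventory_acronym}_{year}.csv', index=False)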
Example No. 4
def Generate_TRI_files_csv(TRIyear, Files):
    """Generate TRI inventories from downloaded files."""
    tri_required_fields = imp_fields(
        TRI_DATA_PATH.joinpath('TRI_required_fields.txt'))
    keys = imp_fields(TRI_DATA_PATH.joinpath('TRI_keys.txt'))
    values = list()
    for p in range(len(keys)):
        start = 13 + 2 * p
        end = start + 1
        values.append(concat_req_field(tri_required_fields[start:end + 1]))
    # Create a dictionary that has the import fields for each release
    # type to use in the import process
    import_dict = dict_create(keys, values)
    # Build the TRI DataFrame
    tri = import_TRI_by_release_type(import_dict, TRIyear)
    # drop NA for Amount, but leave in zeros
    tri = tri.dropna(subset=['FlowAmount'])
    tri = strip_coln_white_space(tri, 'Basis of Estimate')
    # Coerce FlowAmount to numeric; unparseable values become NaN
    if tri['FlowAmount'].values.dtype != 'float64':
        tri['FlowAmount'] = pd.to_numeric(tri['FlowAmount'], errors='coerce')
    tri = tri[tri['FlowAmount'] != 0]
    # Import reliability scores for TRI
    tri_reliability_table = get_reliability_table_for_source('TRI')
    tri = pd.merge(tri,
                   tri_reliability_table,
                   left_on='Basis of Estimate',
                   right_on='Code',
                   how='left')
    tri['DQI Reliability Score'] = tri['DQI Reliability Score'].fillna(value=5)
    tri.drop(['Basis of Estimate', 'Code'], axis=1, inplace=True)
    # Replace source info with Context
    source_to_context = pd.read_csv(
        TRI_DATA_PATH.joinpath('TRI_ReleaseType_to_Compartment.csv'))
    tri = pd.merge(tri, source_to_context, how='left')
    # Convert units to ref mass unit of kg
    tri['Amount_kg'] = 0.0
    tri = unit_convert(tri, 'Amount_kg', 'Unit', 'Pounds', lb_kg, 'FlowAmount')
    tri = unit_convert(tri, 'Amount_kg', 'Unit', 'Grams', g_kg, 'FlowAmount')
    tri.drop(columns=['FlowAmount', 'Unit'], inplace=True)
    # Rename cols to match reference format
    tri.rename(columns={
        'Amount_kg': 'FlowAmount',
        'DQI Reliability Score': 'DataReliability'
    },
               inplace=True)
    tri.drop(columns=['ReleaseType'], inplace=True)
    grouping_vars = ['FacilityID', 'FlowName', 'CAS', 'Compartment']
    tri = aggregate(tri, grouping_vars)

    validate_national_totals(tri, TRIyear)

    # FLOWS
    flowsdf = tri[['FlowName', 'CAS',
                   'Compartment']].drop_duplicates().reset_index(drop=True)
    flowsdf.loc[:, 'FlowID'] = flowsdf['CAS']
    store_inventory(flowsdf, 'TRI_' + TRIyear, 'flow')

    # FLOW BY FACILITY
    tri.drop(columns=['CAS'], inplace=True)
    store_inventory(tri, 'TRI_' + TRIyear, 'flowbyfacility')

    # FACILITY
    # Import and handle TRI facility data
    import_facility = tri_required_fields[0:10]
    tri_facility = pd.read_csv(OUTPUT_PATH.joinpath(f'US_1a_{TRIyear}.csv'),
                               usecols=import_facility,
                               low_memory=False)
    tri_facility = tri_facility.drop_duplicates(ignore_index=True)
    # rename columns
    TRI_facility_name_crosswalk = {
        'TRIFID': 'FacilityID',
        'FACILITY NAME': 'FacilityName',
        'FACILITY STREET': 'Address',
        'FACILITY CITY': 'City',
        'FACILITY COUNTY': 'County',
        'FACILITY STATE': 'State',
        'FACILITY ZIP CODE': 'Zip',
        'PRIMARY NAICS CODE': 'NAICS',
        'LATITUDE': 'Latitude',
        'LONGITUDE': 'Longitude',
    }
    tri_facility.rename(columns=TRI_facility_name_crosswalk, inplace=True)
    store_inventory(tri_facility, 'TRI_' + TRIyear, 'facility')
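
Example No. 4 hands the per-facility aggregation to an aggregate helper, whereas the older code in Example No. 5 below performs the same step inline: flow amounts are summed and the reliability score is averaged, weighted by flow amount. A sketch consistent with that inline logic, using the column names from Example No. 4 (the real helper may differ):

import numpy as np

def aggregate(df, grouping_vars):
    # Sum FlowAmount within each group and take the flow-amount-weighted
    # mean of DataReliability, falling back to a plain mean if weights sum to 0.
    def weighted_mean(x):
        weights = df.loc[x.index, 'FlowAmount']
        if weights.sum() == 0:
            return x.mean()
        return np.average(x, weights=weights)

    grouped = df.groupby(grouping_vars).agg(
        {'FlowAmount': 'sum', 'DataReliability': weighted_mean})
    return grouped.reset_index()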
Example No. 5
def Generate_TRI_files_csv(TRIyear, Files):
    _config = config()['databases']['TRI']
    tri_url = _config['url']
    link_zip_TRI = link_zip(tri_url, _config['queries'], TRIyear)
    regex = re.compile(
        r'https://www3.epa.gov/tri/current/US_\d{4}_?(\d*)\.zip')
    tri_version = re.search(regex, link_zip_TRI).group(1)
    if not tri_version:
        tri_version = 'last'
    tri_required_fields = imp_fields(data_dir + 'TRI_required_fields.txt')
    keys = imp_fields(data_dir + 'TRI_keys.txt')  # same parser as above
    import_facility = tri_required_fields[0:10]
    values = list()
    for p in range(len(keys)):
        start = 13 + 2 * p
        end = start + 1
        values.append(concat_req_field(tri_required_fields[start:end + 1]))
    # Create a dictionary that has the import fields for each release type to use in the import process
    import_dict = dict_create(keys, values)
    # Build the TRI DataFrame
    tri = import_TRI_by_release_type(import_dict, TRIyear)
    # drop NA for Amount, but leave in zeros
    tri = tri.dropna(subset=['FlowAmount'])
    tri = strip_coln_white_space(tri, 'Basis of Estimate')
    #Coerce FlowAmount to numeric; unparseable values become NaN
    if tri['FlowAmount'].values.dtype != 'float64':
        tri['FlowAmount'] = pd.to_numeric(tri['FlowAmount'], errors='coerce')
    #Drop rows where FlowAmount is 0
    tri = tri[tri['FlowAmount'] != 0]
    # Import reliability scores for TRI
    tri_reliability_table = reliability_table[reliability_table['Source'] ==
                                              'TRI']
    tri_reliability_table.drop('Source', axis=1, inplace=True)
    #Merge with the reliability table to assign DQI reliability scores
    tri = pd.merge(tri,
                   tri_reliability_table,
                   left_on='Basis of Estimate',
                   right_on='Code',
                   how='left')
    # Fill NAs with 5 for DQI reliability score
    tri['DQI Reliability Score'] = tri['DQI Reliability Score'].fillna(value=5)
    # Drop unneeded columns
    tri.drop('Basis of Estimate', axis=1, inplace=True)
    tri.drop('Code', axis=1, inplace=True)
    # Replace source info with Context
    source_cnxt = data_dir + 'TRI_ReleaseType_to_Compartment.csv'
    source_to_context = pd.read_csv(source_cnxt)
    tri = pd.merge(tri, source_to_context, how='left')
    # Convert units to ref mass unit of kg
    # Create a new field to put converted amount in
    tri['Amount_kg'] = 0.0
    tri = unit_convert(tri, 'Amount_kg', 'Unit', 'Pounds', lb_kg, 'FlowAmount')
    tri = unit_convert(tri, 'Amount_kg', 'Unit', 'Grams', g_kg, 'FlowAmount')
    # drop old amount and units
    tri.drop('FlowAmount', axis=1, inplace=True)
    tri.drop('Unit', axis=1, inplace=True)
    # Rename cols to match reference format
    tri.rename(columns={'Amount_kg': 'FlowAmount'}, inplace=True)
    tri.rename(columns={'DQI Reliability Score': 'ReliabilityScore'},
               inplace=True)
    #Drop release type
    tri.drop('ReleaseType', axis=1, inplace=True)
    #Group by facility, flow and compartment to aggregate different release types
    grouping_vars = ['FacilityID', 'FlowName', 'CAS', 'Compartment']
    # Create a specialized weighted mean function to use for aggregation of reliability
    wm = lambda x: weight_mean(x, tri.loc[x.index, "FlowAmount"])
    # Groupby and aggregate with your dictionary:
    tri = tri.groupby(grouping_vars).agg({
        'FlowAmount': 'sum',
        'ReliabilityScore': wm
    })
    tri = tri.reset_index()

    #VALIDATE
    tri_national_totals = pd.read_csv(data_dir + 'TRI_' + TRIyear +
                                      '_NationalTotals.csv',
                                      header=0,
                                      dtype={"FlowAmount": np.float})
    tri_national_totals['FlowAmount_kg'] = 0
    tri_national_totals = unit_convert(tri_national_totals, 'FlowAmount_kg',
                                       'Unit', 'Pounds', 0.4535924,
                                       'FlowAmount')
    # drop old amount and units
    tri_national_totals.drop('FlowAmount', axis=1, inplace=True)
    tri_national_totals.drop('Unit', axis=1, inplace=True)
    # Rename cols to match reference format
    tri_national_totals.rename(columns={'FlowAmount_kg': 'FlowAmount'},
                               inplace=True)
    validation_result = validate_inventory(tri,
                                           tri_national_totals,
                                           group_by='flow',
                                           tolerance=5.0)
    write_validation_result('TRI', TRIyear, validation_result)
    #FLOWS
    flows = tri.groupby(['FlowName', 'CAS',
                         'Compartment']).count().reset_index()
    #stack by compartment
    flowsdf = flows[['FlowName', 'CAS', 'Compartment']]
    flowsdf['FlowID'] = flowsdf['CAS']
    #export chemicals
    #!!!Still needs CAS number and FlowID
    flowsdf.to_csv(output_dir + 'flow/' + 'TRI_' + TRIyear + '.csv',
                   index=False)
    #FLOW BY FACILITY
    #drop CAS
    tri.drop(columns=['CAS'], inplace=True)
    tri_file_name = 'TRI_' + TRIyear + '.csv'
    tri.to_csv(output_dir + 'flowbyfacility/' + tri_file_name, index=False)
    #FACILITY
    ##Import and handle TRI facility data
    tri_facility = pd.read_csv(set_dir(data_dir + '../../../') + 'TRI/US_1a_' +
                               TRIyear + '.txt',
                               sep='\t',
                               header=0,
                               usecols=import_facility,
                               error_bad_lines=False,
                               low_memory=False)
    #get unique facilities
    tri_facility_unique_ids = pd.unique(tri_facility['TRIFID'])
    tri_facility_unique_rows = tri_facility.drop_duplicates()
    #Use group by to eliminate additional ID duplicates
    #tri_facility_unique_rows_agg = tri_facility_unique_rows.groupby(['TRIFID'])
    #tri_facility_final = tri_facility_unique_rows_agg.aggregate()
    tri_facility_final = tri_facility_unique_rows
    #rename columns
    TRI_facility_name_crosswalk = {
        'TRIFID': 'FacilityID',
        'FACILITY NAME': 'FacilityName',
        'FACILITY STREET': 'Address',
        'FACILITY CITY': 'City',
        'FACILITY COUNTY': 'County',
        'FACILITY STATE': 'State',
        'FACILITY ZIP CODE': 'Zip',
        'PRIMARY NAICS CODE': 'NAICS',
        'LATITUDE': 'Latitude',
        'LONGITUDE': 'Longitude'
    }
    tri_facility_final.rename(columns=TRI_facility_name_crosswalk,
                              inplace=True)
    tri_facility_final.to_csv(output_dir + 'facility/' + 'TRI_' + TRIyear +
                              '.csv',
                              index=False)
    # Record TRI metadata
    external_dir = set_dir(data_dir + '../../../')
    for file in Files:
        tri_csv = external_dir + 'TRI/US_' + file + '_' + TRIyear + '.txt'
        try:
            retrieval_time = os.path.getctime(tri_csv)
        except:
            retrieval_time = time.time()
        tri_metadata['SourceAquisitionTime'] = time.ctime(retrieval_time)
        tri_metadata['SourceFileName'] = get_relpath(tri_csv)
        tri_metadata['SourceURL'] = tri_url
        tri_metadata['SourceVersion'] = tri_version
        write_metadata('TRI', TRIyear, tri_metadata)
Example No. 6
def Generate_RCRAInfo_files_csv(report_year, RCRAInfopath,
                                RCRAfInfoflatfileURL):
    RCRAInfoBRtextfile = RCRAInfopath + 'RCRAInfo_by_year/br_reporting_' + report_year + '.txt'
    #Get file columns widths
    linewidthsdf = pd.read_csv(data_dir + 'RCRA_FlatFile_LineComponents.csv')
    BRwidths = linewidthsdf['Size']
    #Metadata
    BR_meta = globals.inventory_metadata
    #Get columns to keep
    RCRAfieldstokeepdf = pd.read_csv(data_dir + 'RCRA_required_fields.txt',
                                     header=None)
    RCRAfieldstokeep = list(RCRAfieldstokeepdf[0])
    #Get total row count of the file
    with open(RCRAInfoBRtextfile, 'rb') as rcrafile:
        row_count = sum([1 for row in rcrafile]) - 1
    BR = pd.read_csv(RCRAInfoBRtextfile,
                     header=0,
                     usecols=RCRAfieldstokeep,
                     sep='\t',
                     low_memory=False,
                     error_bad_lines=False,
                     encoding='ISO-8859-1')
    # Checking the Waste Generation Data Health
    BR = BR[pd.to_numeric(BR['Generation Tons'], errors='coerce').notnull()]
    BR['Generation Tons'] = BR['Generation Tons'].astype(float)
    print(BR.head())
    #Pickle as a backup
    # BR.to_pickle('work/BR_'+ report_year + '.pk')
    #Read in to start from a pickle
    # BR = pd.read_pickle('work/BR_'+report_year+'.pk')
    print(len(BR))
    #2001:838497
    #2003:770727
    #2005:697706
    #2007:765764
    #2009:919906
    #2011:1590067
    #2013:1581899
    #2015:2053108
    #2017:1446613
    #Validate correct import - number of states should be 50+ (includes PR and territories)
    states = BR['State'].unique()
    print(len(states))
    #2001: 46
    #2003: 46
    #2005: 46
    #2007: 46
    #2009: 46
    #2011: 56
    #2013: 56
    #2015: 57
    #2017: 45
    #Filtering to remove double counting and non BR waste records
    #Do not double count generation from sources that receive it only
    #Check sum of tons and number of records after each filter step
    #See EPA 2013. Biennial Report Analytical Methodologies: Data Selection
    #Logic and Assumptions used to Analyze the Biennial Report. Office of Resource Conservation and Recovery
    #Drop lines with source code G61
    BR = BR[BR['Source Code'] != 'G61']
    print(len(BR))
    #2001:798905
    #2003:722958
    #2005:650413
    #2007:722383
    #2009:879845
    #2011:1496275
    #2013:1492245
    #2015:1959883
    #2017:1375562
    #Only include wastes that are included in the National Biennial Report
    BR = BR[BR['Generator ID Included in NBR'] == 'Y']
    print(len(BR))
    #2001:734349
    #2003:629802
    #2005:482345
    #2007:598748
    #2009:704233
    #2011:1284796
    #2013:1283457
    #2015:1759711
    #2017:1269987
    BR = BR[BR['Generator Waste Stream Included in NBR'] == 'Y']
    print(len(BR))
    #2001:172539
    #2003:167488
    #2005:152036
    #2007:151729
    #2009:142918
    #2011:209342
    #2013:256978
    #2015:288980
    #2017:202842
    #Remove imported wastes, source codes G63-G75
    ImportSourceCodes = pd.read_csv(data_dir + 'RCRAImportSourceCodes.txt',
                                    header=None)
    ImportSourceCodes = ImportSourceCodes[0].tolist()
    SourceCodesPresent = BR['Source Code'].unique().tolist()
    SourceCodestoKeep = []
    for item in SourceCodesPresent:
        if item not in ImportSourceCodes:
            #print(item)
            SourceCodestoKeep.append(item)
    BR = BR[BR['Source Code'].isin(SourceCodestoKeep)]
    print(len(BR))
    #2001:172539
    #2003:167264
    #2005:151638
    #2007:151695
    #2009:142825
    #2011:209306
    #2013:256844
    #2015:286813
    #2017:202513
    #Reassign the NAICS to a string
    BR['NAICS'] = BR['Primary NAICS'].astype('str')
    BR.drop('Primary NAICS', axis=1, inplace=True)
    #Create field for DQI Reliability Score with fixed value from CSV
    #Currently generating a warning
    reliability_table = globals.reliability_table
    rcrainfo_reliability_table = reliability_table[reliability_table['Source']
                                                   == 'RCRAInfo']
    rcrainfo_reliability_table.drop('Source', axis=1, inplace=True)
    BR['ReliabilityScore'] = float(
        rcrainfo_reliability_table['DQI Reliability Score'])
    #Create a new field to put converted amount in
    BR['Amount_kg'] = 0.0
    #Convert amounts from tons. Note this could be replaced with a conversion utility
    BR['Amount_kg'] = USton_kg * BR['Generation Tons']
    ##Read in waste descriptions
    linewidthsdf = pd.read_csv(data_dir +
                               'RCRAInfo_LU_WasteCode_LineComponents.csv')
    widths = linewidthsdf['Size']
    names = linewidthsdf['Data Element Name']
    File_lu = [
        file for file in os.listdir(RCRAInfopath)
        if 'lu_waste_code' in file.lower()
    ][0]
    wastecodesfile = RCRAInfopath + File_lu
    WasteCodesTest = pd.read_fwf(wastecodesfile,
                                 widths=widths,
                                 header=None,
                                 names=names,
                                 nrows=10)
    WasteCodes = pd.read_fwf(wastecodesfile,
                             widths=widths,
                             header=None,
                             names=names)
    WasteCodes = WasteCodes[[
        'Waste Code', 'Code Type', 'Waste Code Description'
    ]]
    #Remove rows where any field is NA (the description is missing)
    WasteCodes.dropna(inplace=True)
    #Bring in form codes
    #Replace form code with the code name
    form_code_name_file = data_dir + 'RCRA_LU_FORM_CODE.csv'
    form_code_table_cols_needed = ['FORM_CODE', 'FORM_CODE_NAME']
    form_code_name_df = pd.read_csv(form_code_name_file,
                                    header=0,
                                    usecols=form_code_table_cols_needed)
    #Merge waste codes with BR records
    BR = pd.merge(BR,
                  WasteCodes,
                  left_on='Waste Code Group',
                  right_on='Waste Code',
                  how='left')
    #Rename code type to make it clear
    BR.rename(columns={'Code Type': 'Waste Code Type'}, inplace=True)
    #Merge form codes with BR
    BR = pd.merge(BR,
                  form_code_name_df,
                  left_on='Form Code',
                  right_on='FORM_CODE',
                  how='left')
    #Drop duplicates from merge
    BR.drop(columns=['FORM_CODE', 'Waste Code Group'], inplace=True)
    #Set flow name to Waste Code Description
    BR['FlowName'] = BR['Waste Code Description']
    #BR['FlowNameSource'] = 'Waste Code Description'
    #If a useful Waste Code Description is present, use it
    BR['FlowName'] = BR['FlowName'].apply(waste_description_cleaner)
    #Check unique flow names
    pd.unique(BR['FlowName'])
    #If there is no useful waste code description, fill it with the Form Code Name
    #Find the NAs in FlowName and set their FlowNameSource to Form Code
    BR.loc[BR['FlowName'].isnull(), 'FlowNameSource'] = 'Form Code'
    #For rows where the source is still blank, mark it as Waste Code
    BR.loc[BR['FlowNameSource'].isnull(), 'FlowNameSource'] = 'Waste Code'
    #Set FlowIDs to the appropriate code
    BR.loc[BR['FlowName'].isnull(), 'FlowID'] = BR['Form Code']
    BR.loc[BR['FlowID'].isnull(), 'FlowID'] = BR['Waste Code']
    #Now finally fill names that are blank with the form code name
    BR['FlowName'].fillna(BR['FORM_CODE_NAME'], inplace=True)
    #Drop unneeded fields
    BR.drop('Generation Tons', axis=1, inplace=True)
    BR.drop('Generator ID Included in NBR', axis=1, inplace=True)
    BR.drop('Generator Waste Stream Included in NBR', axis=1, inplace=True)
    BR.drop('Source Code', axis=1, inplace=True)
    BR.drop('Management Method', axis=1, inplace=True)
    BR.drop('Waste Description', axis=1, inplace=True)
    BR.drop('Waste Code Description', axis=1, inplace=True)
    BR.drop('FORM_CODE_NAME', axis=1, inplace=True)
    #Rename cols used by multiple tables
    BR.rename(columns={'Handler ID': 'FacilityID'}, inplace=True)
    #Rename the converted amount column
    BR.rename(columns={'Amount_kg': 'FlowAmount'}, inplace=True)
    #Prepare flows file
    flows = BR[['FlowName', 'FlowID', 'FlowNameSource']]
    #Drop duplicates
    flows = flows.drop_duplicates()
    flows['Compartment'] = 'Waste'
    flows['Unit'] = 'kg'
    #Sort them by the flow names
    flows.sort_values(by='FlowName', axis=0, inplace=True)
    #Export them
    flows.to_csv(output_dir + 'flow/RCRAInfo_' + report_year + '.csv',
                 index=False)
    #Prepare facilities file
    facilities = BR[[
        'FacilityID', 'Handler Name', 'Location Street Number',
        'Location Street 1', 'Location Street 2', 'Location City',
        'Location State', 'Location Zip', 'County Name', 'NAICS'
    ]]
    #Drop duplicates
    facilities.drop_duplicates(inplace=True)
    #Fill missing street numbers before casting to str, otherwise they become 'nan'
    facilities['Location Street Number'] = facilities[
        'Location Street Number'].fillna('').apply(str)
    facilities[
        'Address'] = facilities['Location Street Number'] + ' ' + facilities[
            'Location Street 1'] + ' ' + facilities['Location Street 2']
    facilities.drop(columns=[
        'Location Street Number', 'Location Street 1', 'Location Street 2'
    ],
                    inplace=True)
    facilities.rename(columns={
        'Primary NAICS': 'NAICS',
        'Handler Name': 'FacilityName',
        'Location City': 'City',
        'Location State': 'State',
        'Location Zip': 'Zip',
        'County Name': 'County'
    },
                      inplace=True)
    facilities.to_csv(output_dir + 'facility/RCRAInfo_' + report_year + '.csv',
                      index=False)
    #Prepare flow by facility
    flowbyfacility = BR.groupby(['FacilityID', 'ReliabilityScore', 'FlowName'
                                 ])['FlowAmount'].sum().reset_index()
    ##VALIDATION
    BR_national_total = pd.read_csv(data_dir + 'RCRAInfo_' + report_year +
                                    '_NationalTotals.csv',
                                    header=0,
                                    dtype={"FlowAmount": np.float})
    BR_national_total['FlowAmount_kg'] = 0
    BR_national_total = unit_convert(BR_national_total, 'FlowAmount_kg',
                                     'Unit', 'Tons', 907.18474, 'FlowAmount')
    BR_national_total.drop('FlowAmount', axis=1, inplace=True)
    BR_national_total.drop('Unit', axis=1, inplace=True)
    # Rename cols to match reference format
    BR_national_total.rename(columns={'FlowAmount_kg': 'FlowAmount'},
                             inplace=True)
    #Validate total waste generated against national totals
    sum_of_flowbyfacility = flowbyfacility['FlowAmount'].sum()
    sum_of_flowbyfacility_df = pd.DataFrame({
        'FlowAmount': [sum_of_flowbyfacility],
        'FlowName': 'ALL',
        'Compartment': 'waste'
    })
    validation_df = validate_inventory(sum_of_flowbyfacility_df,
                                       BR_national_total,
                                       group_by='flow')
    write_validation_result('RCRAInfo', report_year, validation_df)
    #Export to csv
    flowbyfacility.to_csv(output_dir + 'flowbyfacility/RCRAInfo_' +
                          report_year + '.csv',
                          index=False)
    #Record metadata
    try:
        retrieval_time = os.path.getctime(RCRAInfoBRtextfile)
    except:
        retrieval_time = time.time()
    BR_meta['SourceAquisitionTime'] = time.ctime(retrieval_time)
    BR_meta['SourceFileName'] = RCRAInfoBRtextfile
    BR_meta['SourceURL'] = RCRAfInfoflatfileURL
    write_metadata('RCRAInfo', report_year, BR_meta)
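
The waste_description_cleaner applied to FlowName above is not shown either. Per the surrounding comments, its job is to null out waste code descriptions that are not useful so that the later fillna() can substitute the form code name. A purely hypothetical stand-in; the placeholder strings are illustrative and not taken from the real function:

import pandas as pd

def waste_description_cleaner(description):
    # Hypothetical: treat generic placeholder descriptions as missing
    # so they can be replaced by FORM_CODE_NAME later on.
    if pd.isna(description) or not str(description).strip():
        return None
    if str(description).strip().upper() in {'NOT OTHERWISE SPECIFIED', 'UNKNOWN'}:
        return None
    return description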
Example No. 7
import re
pattern = 'V[0-9]'
version = re.search(pattern, point_1_path, flags=re.IGNORECASE)
if version is not None:
    NEI_meta['SourceVersion'] = version.group(0)

#Write metadata to json
write_metadata('NEI', report_year, NEI_meta)

#VALIDATE
nei_national_totals = pd.read_csv(data_dir + 'NEI_' + report_year +
                                  '_NationalTotals.csv',
                                  header=0,
                                  dtype={"FlowAmount": np.float})
nei_national_totals['FlowAmount_kg'] = 0
nei_national_totals = unit_convert(nei_national_totals, 'FlowAmount_kg',
                                   'Unit', 'LB', lb_kg, 'FlowAmount')
nei_national_totals = unit_convert(nei_national_totals, 'FlowAmount_kg',
                                   'Unit', 'TON', USton_kg, 'FlowAmount')
# drop old amount and units
nei_national_totals.drop('FlowAmount', axis=1, inplace=True)
nei_national_totals.drop('Unit', axis=1, inplace=True)
# Rename cols to match reference format
nei_national_totals.rename(columns={'FlowAmount_kg': 'FlowAmount'},
                           inplace=True)
validation_result = validate_inventory(nei_flowbyfacility,
                                       nei_national_totals,
                                       group_by='flow',
                                       tolerance=5.0)
write_validation_result('NEI', report_year, validation_result)