def write_validation_result(inventory_acronym, year, validation_df):
    """Write the validation result and associated metadata to local dir.

    :param inventory_acronym: str for inventory e.g. 'TRI'
    :param year: str for year e.g. '2016'
    :param validation_df: df returned from validate_inventory function
    """
    directory = Path(paths.local_path).joinpath('validation')
    directory.mkdir(parents=True, exist_ok=True)
    log.info(f'writing validation result to {directory}')
    validation_df.to_csv(directory.joinpath(f"{inventory_acronym}_{year}.csv"),
                         index=False)
    # Get metadata on validation dataset
    validation_set_info_table = read_ValidationSets_Sources()
    # Get record for year and source
    validation_set_info = validation_set_info_table[
        (validation_set_info_table['Inventory'] == inventory_acronym) &
        (validation_set_info_table['Year'] == year)]
    if len(validation_set_info) != 1:
        log.error('no validation metadata found')
        return
    # Convert to Series
    validation_set_info = validation_set_info.iloc[0]
    # Use the same format as inventory metadata to describe the validation set data
    validation_metadata = dict(source_metadata)
    validation_metadata['SourceFileName'] = validation_set_info['Name']
    validation_metadata['SourceVersion'] = validation_set_info['Version']
    validation_metadata['SourceURL'] = validation_set_info['URL']
    validation_metadata['SourceAcquisitionTime'] = validation_set_info['Date Acquired']
    validation_metadata['Criteria'] = validation_set_info['Criteria']
    # Write metadata to file
    write_metadata(inventory_acronym + '_' + year, validation_metadata,
                   datatype="validation")
Example #2
def generate_metadata(year, files, datatype='inventory'):
    """Get metadata and writes to .json."""
    if datatype == 'source':
        source_path = [
            str(OUTPUT_PATH.joinpath(f'US_{p}_{year}.csv')) for p in files
        ]
        source_meta = compile_source_metadata(source_path, _config, year)
        source_meta['SourceType'] = 'Zip file'
        tri_url = _config['url']
        link_zip_TRI = link_zip(tri_url, _config['queries'], year)
        regex = r'https.*/(.*(?=/\w*\.zip))'
        # tri_version = link_zip_TRI.split('/')[-2]
        try:
            tri_version = re.search(regex, link_zip_TRI).group(1)
        except AttributeError:  # no match found from regex
            tri_version = 'last'
        source_meta['SourceVersion'] = tri_version
        write_metadata('TRI_' + year,
                       source_meta,
                       category=EXT_DIR,
                       datatype='source')
    else:
        source_meta = read_source_metadata(paths,
                                           set_stewi_meta(
                                               'TRI_' + year, EXT_DIR),
                                           force_JSON=True)['tool_meta']
        write_metadata('TRI_' + year, source_meta, datatype=datatype)
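A quick standalone check of the version-extraction regex used above; the zip URL is a made-up example, not a real TRI link:

import re

regex = r'https.*/(.*(?=/\w*\.zip))'
link_zip_TRI = 'https://www3.epa.gov/tri/current/US_2016_v15/US_2016_v15.zip'  # hypothetical
try:
    tri_version = re.search(regex, link_zip_TRI).group(1)  # -> 'US_2016_v15'
except AttributeError:  # no match found from regex
    tri_version = 'last'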
Example #3
def generate_metadata(year, datatype='inventory'):
    """Get metadata and writes to .json."""
    nei_file_path = _config[year]['file_name']
    if datatype == 'inventory':
        source_meta = []
        for file in nei_file_path:
            meta = set_stewi_meta(strip_file_extension(file), EXT_DIR)
            source_meta.append(
                read_source_metadata(paths, meta, force_JSON=True))
        write_metadata('NEI_' + year, source_meta, datatype=datatype)
def generate_metadata(year, datatype='inventory'):
    """Generate metadata and writes to json for datatypes 'inventory' or 'source'."""
    if datatype == 'source':
        source_path = str(OUTPUT_PATH.joinpath(_config[year]['file_name']))
        source_meta = compile_source_metadata(source_path, _config, year)
        write_metadata('eGRID_' + year, source_meta, category=EXT_DIR,
                       datatype='source')
    else:
        source_meta = read_source_metadata(paths, set_stewi_meta('eGRID_' + year,
                                           EXT_DIR),
                                           force_JSON=True)['tool_meta']
        write_metadata('eGRID_' + year, source_meta, datatype=datatype)
Example #5
def generate_metadata(year, datatype='inventory'):
    """Generate metadata and write to json for datatypes 'inventory' or 'source'."""
    if datatype == 'source':
        source_path = str(OUTPUT_PATH.joinpath(year))
        source_meta = compile_source_metadata(source_path, _config, year)
        source_meta['SourceType'] = 'Web Service'
        write_metadata(f"DMR_{year}", source_meta, category=EXT_DIR,
                       datatype='source')
    else:
        source_meta = read_source_metadata(paths, set_stewi_meta(f"DMR_{year}",
                                           EXT_DIR),
                                           force_JSON=True)['tool_meta']
        write_metadata(f"DMR_{year}", source_meta, datatype=datatype)
def generate_metadata(year, files, datatype='inventory'):
    """Get metadata and writes to .json."""
    if datatype == 'source':
        source_path = [str(p) for p in files]
        source_meta = compile_source_metadata(source_path, _config, year)
        source_meta['SourceType'] = 'Zip file'
        source_meta['SourceURL'] = _config['url']
        write_metadata('RCRAInfo_' + str(year),
                       source_meta,
                       category=EXT_DIR,
                       datatype='source')
    else:
        source_meta = read_source_metadata(paths,
                                           set_stewi_meta(
                                               'RCRAInfo_' + str(year), EXT_DIR),
                                           force_JSON=True)['tool_meta']
        write_metadata('RCRAInfo_' + str(year), source_meta, datatype=datatype)
Example #7
def generate_metadata(year, m, datatype='inventory'):
    """Get metadata and writes to .json."""
    if datatype == 'source':
        source_path = m.filename
        source_meta = compile_source_metadata(source_path, _config, year)
        source_meta['SourceType'] = m.filetype
        source_meta['SourceURL'] = m.url
        source_meta['SourceAcquisitionTime'] = m.time
        write_metadata('GHGRP_' + year,
                       source_meta,
                       category=EXT_DIR,
                       datatype='source')
    else:
        source_meta = read_source_metadata(paths,
                                           set_stewi_meta(
                                               'GHGRP_' + year, EXT_DIR),
                                           force_JSON=True)['tool_meta']
        write_metadata('GHGRP_' + year, source_meta, datatype=datatype)
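All of the generate_metadata variants above follow the same two-pass pattern: record the 'source' metadata right after download, then re-read it when the 'inventory' output is written. An illustrative call sequence for the GHGRP variant, where '2019' and m are placeholders for a real reporting year and download-metadata object:

generate_metadata('2019', m, datatype='source')     # record file name, URL, acquisition time under EXT_DIR
generate_metadata('2019', m, datatype='inventory')  # copy that source metadata into the inventory metadata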
def Generate_TRI_files_csv(TRIyear, Files):
    _config = config()['databases']['TRI']
    tri_url = _config['url']
    link_zip_TRI = link_zip(tri_url, _config['queries'], TRIyear)
    regex = re.compile(
        r'https://www3.epa.gov/tri/current/US_\d{4}_?(\d*)\.zip')
    tri_version = re.search(regex, link_zip_TRI).group(1)
    if not tri_version:
        tri_version = 'last'
    tri_required_fields = imp_fields(data_dir + 'TRI_required_fields.txt')
    keys = imp_fields(data_dir +
                      'TRI_keys.txt')  # the same function can be used
    import_facility = tri_required_fields[0:10]
    values = list()
    for p in range(len(keys)):
        start = 13 + 2 * p
        end = start + 1
        values.append(concat_req_field(tri_required_fields[start:end + 1]))
    # Create a dictionary mapping each release type to its import fields for use in the import process
    import_dict = dict_create(keys, values)
    # Build the TRI DataFrame
    tri = import_TRI_by_release_type(import_dict, TRIyear)
    # drop NA for Amount, but leave in zeros
    tri = tri.dropna(subset=['FlowAmount'])
    tri = strip_coln_white_space(tri, 'Basis of Estimate')
    # Convert to float if needed, coercing any parse errors to NaN - be careful with this line
    if tri['FlowAmount'].values.dtype != 'float64':
        tri['FlowAmount'] = pd.to_numeric(tri['FlowAmount'], errors='coerce')
    #Drop 0 for FlowAmount
    tri = tri[tri['FlowAmount'] != 0]
    # Import reliability scores for TRI
    # Copy the slice to avoid a pandas SettingWithCopyWarning
    tri_reliability_table = reliability_table[
        reliability_table['Source'] == 'TRI'].copy()
    tri_reliability_table.drop('Source', axis=1, inplace=True)
    #Merge with reliability table to get the DQI Reliability Score
    tri = pd.merge(tri,
                   tri_reliability_table,
                   left_on='Basis of Estimate',
                   right_on='Code',
                   how='left')
    # Fill NAs with 5 for DQI reliability score
    tri['DQI Reliability Score'] = tri['DQI Reliability Score'].fillna(value=5)
    # Drop unneeded columns
    tri.drop('Basis of Estimate', axis=1, inplace=True)
    tri.drop('Code', axis=1, inplace=True)
    # Replace source info with Context
    source_cnxt = data_dir + 'TRI_ReleaseType_to_Compartment.csv'
    source_to_context = pd.read_csv(source_cnxt)
    tri = pd.merge(tri, source_to_context, how='left')
    # Convert units to ref mass unit of kg
    # Create a new field to put converted amount in
    tri['Amount_kg'] = 0.0
    tri = unit_convert(tri, 'Amount_kg', 'Unit', 'Pounds', lb_kg, 'FlowAmount')
    tri = unit_convert(tri, 'Amount_kg', 'Unit', 'Grams', g_kg, 'FlowAmount')
    # drop old amount and units
    tri.drop('FlowAmount', axis=1, inplace=True)
    tri.drop('Unit', axis=1, inplace=True)
    # Rename cols to match reference format
    tri.rename(columns={'Amount_kg': 'FlowAmount'}, inplace=True)
    tri.rename(columns={'DQI Reliability Score': 'ReliabilityScore'},
               inplace=True)
    #Drop release type
    tri.drop('ReleaseType', axis=1, inplace=True)
    #Group by facility, flow and compartment to aggregate different release types
    grouping_vars = ['FacilityID', 'FlowName', 'CAS', 'Compartment']
    # Create a specialized weighted mean function to use for aggregation of reliability
    wm = lambda x: weight_mean(x, tri.loc[x.index, "FlowAmount"])
    # Groupby and aggregate with your dictionary:
    tri = tri.groupby(grouping_vars).agg({
        'FlowAmount': 'sum',
        'ReliabilityScore': wm
    })
    tri = tri.reset_index()

    #VALIDATE
    tri_national_totals = pd.read_csv(data_dir + 'TRI_' + TRIyear +
                                      '_NationalTotals.csv',
                                      header=0,
                                      dtype={"FlowAmount": float})
    tri_national_totals['FlowAmount_kg'] = 0
    tri_national_totals = unit_convert(tri_national_totals, 'FlowAmount_kg',
                                       'Unit', 'Pounds', lb_kg,
                                       'FlowAmount')
    # drop old amount and units
    tri_national_totals.drop('FlowAmount', axis=1, inplace=True)
    tri_national_totals.drop('Unit', axis=1, inplace=True)
    # Rename cols to match reference format
    tri_national_totals.rename(columns={'FlowAmount_kg': 'FlowAmount'},
                               inplace=True)
    validation_result = validate_inventory(tri,
                                           tri_national_totals,
                                           group_by='flow',
                                           tolerance=5.0)
    write_validation_result('TRI', TRIyear, validation_result)
    #FLOWS
    flows = tri.groupby(['FlowName', 'CAS',
                         'Compartment']).count().reset_index()
    #stack by compartment
    flowsdf = flows[['FlowName', 'CAS', 'Compartment']].copy()
    flowsdf['FlowID'] = flowsdf['CAS']
    #export chemicals
    #!!!Still needs CAS number and FlowID
    flowsdf.to_csv(output_dir + 'flow/' + 'TRI_' + TRIyear + '.csv',
                   index=False)
    #FLOW BY FACILITY
    #drop CAS
    tri.drop(columns=['CAS'], inplace=True)
    tri_file_name = 'TRI_' + TRIyear + '.csv'
    tri.to_csv(output_dir + 'flowbyfacility/' + tri_file_name, index=False)
    #FACILITY
    ##Import and handle TRI facility data
    tri_facility = pd.read_csv(set_dir(data_dir + '../../../') + 'TRI/US_1a_' +
                               TRIyear + '.txt',
                               sep='\t',
                               header=0,
                               usecols=import_facility,
                               error_bad_lines=False,
                               low_memory=False)
    #get unique facilities
    tri_facility_unique_ids = pd.unique(tri_facility['TRIFID'])
    tri_facility_unique_rows = tri_facility.drop_duplicates()
    #Use group by to eliminate additional ID duplicates
    #tri_facility_unique_rows_agg = tri_facility_unique_rows.groupby(['TRIFID'])
    #tri_facility_final = tri_facility_unique_rows_agg.aggregate()
    tri_facility_final = tri_facility_unique_rows
    #rename columns
    TRI_facility_name_crosswalk = {
        'TRIFID': 'FacilityID',
        'FACILITY NAME': 'FacilityName',
        'FACILITY STREET': 'Address',
        'FACILITY CITY': 'City',
        'FACILITY COUNTY': 'County',
        'FACILITY STATE': 'State',
        'FACILITY ZIP CODE': 'Zip',
        'PRIMARY NAICS CODE': 'NAICS',
        'LATITUDE': 'Latitude',
        'LONGITUDE': 'Longitude'
    }
    tri_facility_final.rename(columns=TRI_facility_name_crosswalk,
                              inplace=True)
    tri_facility_final.to_csv(output_dir + 'facility/' + 'TRI_' + TRIyear +
                              '.csv',
                              index=False)
    # Record TRI metadata
    external_dir = set_dir(data_dir + '../../../')
    for file in Files:
        tri_csv = external_dir + 'TRI/US_' + file + '_' + TRIyear + '.txt'
        try:
            retrieval_time = os.path.getctime(tri_csv)
        except OSError:
            retrieval_time = time.time()
        tri_metadata['SourceAquisitionTime'] = time.ctime(retrieval_time)
        tri_metadata['SourceFileName'] = get_relpath(tri_csv)
        tri_metadata['SourceURL'] = tri_url
        tri_metadata['SourceVersion'] = tri_version
        write_metadata('TRI', TRIyear, tri_metadata)
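A self-contained sketch of the flow-weighted reliability aggregation performed above, assuming weight_mean behaves like numpy.average with weights; the toy data is illustrative:

import numpy as np
import pandas as pd

toy = pd.DataFrame({'FacilityID': ['F1', 'F1', 'F2'],
                    'FlowName': ['Lead', 'Lead', 'Lead'],
                    'Compartment': ['air', 'air', 'water'],
                    'FlowAmount': [10.0, 30.0, 5.0],
                    'ReliabilityScore': [1.0, 3.0, 5.0]})
# weight each group's reliability scores by the corresponding flow amounts
wm = lambda x: np.average(x, weights=toy.loc[x.index, 'FlowAmount'])
agg = (toy.groupby(['FacilityID', 'FlowName', 'Compartment'])
          .agg({'FlowAmount': 'sum', 'ReliabilityScore': wm})
          .reset_index())
# F1/Lead/air -> FlowAmount 40.0, ReliabilityScore (1*10 + 3*30)/40 = 2.5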
def Generate_RCRAInfo_files_csv(report_year, RCRAInfopath,
                                RCRAfInfoflatfileURL):
    RCRAInfoBRtextfile = RCRAInfopath + 'RCRAInfo_by_year/br_reporting_' + report_year + '.txt'
    #Get file columns widths
    linewidthsdf = pd.read_csv(data_dir + 'RCRA_FlatFile_LineComponents.csv')
    BRwidths = linewidthsdf['Size']
    #Metadata
    BR_meta = globals.inventory_metadata
    #Get columns to keep
    RCRAfieldstokeepdf = pd.read_csv(data_dir + 'RCRA_required_fields.txt',
                                     header=None)
    RCRAfieldstokeep = list(RCRAfieldstokeepdf[0])
    #Get total row count of the file
    with open(RCRAInfoBRtextfile, 'rb') as rcrafile:
        row_count = sum([1 for row in rcrafile]) - 1
    BR = pd.read_csv(RCRAInfoBRtextfile,
                     header=0,
                     usecols=RCRAfieldstokeep,
                     sep='\t',
                     low_memory=False,
                     error_bad_lines=False,
                     encoding='ISO-8859-1')
    # Checking the Waste Generation Data Health
    BR = BR[pd.to_numeric(BR['Generation Tons'], errors='coerce').notnull()]
    BR['Generation Tons'] = BR['Generation Tons'].astype(float)
    print(BR.head())
    #Pickle as a backup
    # BR.to_pickle('work/BR_'+ report_year + '.pk')
    #Read in to start from a pickle
    # BR = pd.read_pickle('work/BR_'+report_year+'.pk')
    print(len(BR))
    #2001:838497
    #2003:770727
    #2005:697706
    #2007:765764
    #2009:919906
    #2011:1590067
    #2013:1581899
    #2015:2053108
    #2017:1446613
    #Validate correct import - number of states should be 50+ (includes PR and territories)
    states = BR['State'].unique()
    print(len(states))
    #2001: 46
    #2003: 46
    #2005: 46
    #2007: 46
    #2009: 46
    #2011: 56
    #2013: 56
    #2015: 57
    #2017: 45
    #Filtering to remove double counting and non BR waste records
    #Do not double count generation from sources that receive it only
    #Check sum of tons and number of records after each filter step
    #See EPA 2013. Biennial Report Analytical Methodologies: Data Selection
    #Logic and Assumptions used to Analyze the Biennial Report. Office of Resource Conservation and Recovery
    #Drop lines with source code G61
    BR = BR[BR['Source Code'] != 'G61']
    print(len(BR))
    #2001:798905
    #2003:722958
    #2005:650413
    #2007:722383
    #2009:879845
    #2011:1496275
    #2013:1492245
    #2015:1959883
    #2017:1375562
    #Only include wastes that are included in the National Biennial Report
    BR = BR[BR['Generator ID Included in NBR'] == 'Y']
    print(len(BR))
    #2001:734349
    #2003:629802
    #2005:482345
    #2007:598748
    #2009:704233
    #2011:1284796
    #2013:1283457
    #2015:1759711
    #2017:1269987
    BR = BR[BR['Generator Waste Stream Included in NBR'] == 'Y']
    print(len(BR))
    #2001:172539
    #2003:167488
    #2005:152036
    #2007:151729
    #2009:142918
    #2011:209342
    #2013:256978
    #2015:288980
    #2017:202842
    #Remove imported wastes, source codes G63-G75
    ImportSourceCodes = pd.read_csv(data_dir + 'RCRAImportSourceCodes.txt',
                                    header=None)
    ImportSourceCodes = ImportSourceCodes[0].tolist()
    SourceCodesPresent = BR['Source Code'].unique().tolist()
    SourceCodestoKeep = []
    for item in SourceCodesPresent:
        if item not in ImportSourceCodes:
            #print(item)
            SourceCodestoKeep.append(item)
    BR = BR[BR['Source Code'].isin(SourceCodestoKeep)]
    print(len(BR))
    #2001:172539
    #2003:167264
    #2005:151638
    #2007:151695
    #2009:142825
    #2011:209306
    #2013:256844
    #2015:286813
    #2017:202513
    #Reassign the NAICS to a string
    BR['NAICS'] = BR['Primary NAICS'].astype('str')
    BR.drop('Primary NAICS', axis=1, inplace=True)
    #Create field for DQI Reliability Score with fixed value from CSV
    # Copy the slice to avoid a pandas SettingWithCopyWarning
    reliability_table = globals.reliability_table
    rcrainfo_reliability_table = reliability_table[
        reliability_table['Source'] == 'RCRAInfo'].copy()
    rcrainfo_reliability_table.drop('Source', axis=1, inplace=True)
    BR['ReliabilityScore'] = float(
        rcrainfo_reliability_table['DQI Reliability Score'].iloc[0])
    #Create a new field to put converted amount in
    BR['Amount_kg'] = 0.0
    #Convert amounts from tons. Note this could be replaced with a conversion utility
    BR['Amount_kg'] = USton_kg * BR['Generation Tons']
    ##Read in waste descriptions
    linewidthsdf = pd.read_csv(data_dir +
                               'RCRAInfo_LU_WasteCode_LineComponents.csv')
    widths = linewidthsdf['Size']
    names = linewidthsdf['Data Element Name']
    File_lu = [
        file for file in os.listdir(RCRAInfopath)
        if 'lu_waste_code' in file.lower()
    ][0]
    wastecodesfile = RCRAInfopath + File_lu
    WasteCodesTest = pd.read_fwf(wastecodesfile,
                                 widths=widths,
                                 header=None,
                                 names=names,
                                 nrows=10)
    WasteCodes = pd.read_fwf(wastecodesfile,
                             widths=widths,
                             header=None,
                             names=names)
    WasteCodes = WasteCodes[[
        'Waste Code', 'Code Type', 'Waste Code Description'
    ]]
    # Remove rows where any field is NA (i.e., the waste description is missing)
    WasteCodes.dropna(inplace=True)
    #Bring in form codes
    #Replace form code with the code name
    form_code_name_file = data_dir + 'RCRA_LU_FORM_CODE.csv'
    form_code_table_cols_needed = ['FORM_CODE', 'FORM_CODE_NAME']
    form_code_name_df = pd.read_csv(form_code_name_file,
                                    header=0,
                                    usecols=form_code_table_cols_needed)
    #Merge waste codes with BR records
    BR = pd.merge(BR,
                  WasteCodes,
                  left_on='Waste Code Group',
                  right_on='Waste Code',
                  how='left')
    #Rename code type to make it clear
    BR.rename(columns={'Code Type': 'Waste Code Type'}, inplace=True)
    #Merge form codes with BR
    BR = pd.merge(BR,
                  form_code_name_df,
                  left_on='Form Code',
                  right_on='FORM_CODE',
                  how='left')
    #Drop duplicates from merge
    BR.drop(columns=['FORM_CODE', 'Waste Code Group'], inplace=True)
    #Set flow name to Waste Code Description
    BR['FlowName'] = BR['Waste Code Description']
    #BR['FlowNameSource'] = 'Waste Code Description'
    #If a useful Waste Code Description is present, use it
    BR['FlowName'] = BR['FlowName'].apply(waste_description_cleaner)
    #Check unique flow names
    pd.unique(BR['FlowName'])
    #If there is no useful waste code, fill it with the Form Code Name
    #Find the NAs in FlowName and set their FlowNameSource to 'Form Code'
    BR.loc[BR['FlowName'].isnull(), 'FlowNameSource'] = 'Form Code'
    #For the remaining rows, the source is the waste code itself
    BR.loc[BR['FlowNameSource'].isnull(), 'FlowNameSource'] = 'Waste Code'
    #Set FlowIDs to the appropriate code
    BR.loc[BR['FlowName'].isnull(), 'FlowID'] = BR['Form Code']
    BR.loc[BR['FlowID'].isnull(), 'FlowID'] = BR['Waste Code']
    #Now finally fill names that are blank with the form code name
    BR['FlowName'].fillna(BR['FORM_CODE_NAME'], inplace=True)
    #Drop unneeded fields
    BR.drop('Generation Tons', axis=1, inplace=True)
    BR.drop('Generator ID Included in NBR', axis=1, inplace=True)
    BR.drop('Generator Waste Stream Included in NBR', axis=1, inplace=True)
    BR.drop('Source Code', axis=1, inplace=True)
    BR.drop('Management Method', axis=1, inplace=True)
    BR.drop('Waste Description', axis=1, inplace=True)
    BR.drop('Waste Code Description', axis=1, inplace=True)
    BR.drop('FORM_CODE_NAME', axis=1, inplace=True)
    #Rename cols used by multiple tables
    BR.rename(columns={'Handler ID': 'FacilityID'}, inplace=True)
    #rename new name
    BR.rename(columns={'Amount_kg': 'FlowAmount'}, inplace=True)
    #Prepare flows file
    flows = BR[['FlowName', 'FlowID', 'FlowNameSource']]
    #Drop duplicates
    flows = flows.drop_duplicates()
    flows['Compartment'] = 'Waste'
    flows['Unit'] = 'kg'
    #Sort them by the flow names
    flows.sort_values(by='FlowName', axis=0, inplace=True)
    #Export them
    flows.to_csv(output_dir + 'flow/RCRAInfo_' + report_year + '.csv',
                 index=False)
    #Prepare facilities file
    facilities = BR[[
        'FacilityID', 'Handler Name', 'Location Street Number',
        'Location Street 1', 'Location Street 2', 'Location City',
        'Location State', 'Location Zip', 'County Name', 'NAICS'
    ]].copy()
    #Drop duplicates
    facilities.drop_duplicates(inplace=True)
    facilities['Location Street Number'] = facilities[
        'Location Street Number'].fillna('').apply(str)
    facilities[
        'Address'] = facilities['Location Street Number'] + ' ' + facilities[
            'Location Street 1'] + ' ' + facilities['Location Street 2']
    facilities.drop(columns=[
        'Location Street Number', 'Location Street 1', 'Location Street 2'
    ],
                    inplace=True)
    facilities.rename(columns={
        'Primary NAICS': 'NAICS',
        'Handler Name': 'FacilityName',
        'Location City': 'City',
        'Location State': 'State',
        'Location Zip': 'Zip',
        'County Name': 'County'
    },
                      inplace=True)
    facilities.to_csv(output_dir + 'facility/RCRAInfo_' + report_year + '.csv',
                      index=False)
    #Prepare flow by facility
    flowbyfacility = BR.groupby(['FacilityID', 'ReliabilityScore', 'FlowName'
                                 ])['FlowAmount'].sum().reset_index()
    ##VALIDATION
    BR_national_total = pd.read_csv(data_dir + 'RCRAInfo_' + report_year +
                                    '_NationalTotals.csv',
                                    header=0,
                                    dtype={"FlowAmount": float})
    BR_national_total['FlowAmount_kg'] = 0
    BR_national_total = unit_convert(BR_national_total, 'FlowAmount_kg',
                                     'Unit', 'Tons', USton_kg, 'FlowAmount')
    BR_national_total.drop('FlowAmount', axis=1, inplace=True)
    BR_national_total.drop('Unit', axis=1, inplace=True)
    # Rename cols to match reference format
    BR_national_total.rename(columns={'FlowAmount_kg': 'FlowAmount'},
                             inplace=True)
    #Validate total waste generated against national totals
    sum_of_flowbyfacility = flowbyfacility['FlowAmount'].sum()
    sum_of_flowbyfacility_df = pd.DataFrame({
        'FlowAmount': [sum_of_flowbyfacility],
        'FlowName': 'ALL',
        'Compartment': 'waste'
    })
    validation_df = validate_inventory(sum_of_flowbyfacility_df,
                                       BR_national_total,
                                       group_by='flow')
    write_validation_result('RCRAInfo', report_year, validation_df)
    #Export to csv
    flowbyfacility.to_csv(output_dir + 'flowbyfacility/RCRAInfo_' +
                          report_year + '.csv',
                          index=False)
    #Record metadata
    try:
        retrieval_time = os.path.getctime(RCRAInfoBRtextfile)
    except OSError:
        retrieval_time = time.time()
    BR_meta['SourceAquisitionTime'] = time.ctime(retrieval_time)
    BR_meta['SourceFileName'] = RCRAInfoBRtextfile
    BR_meta['SourceURL'] = RCRAfInfoflatfileURL
    write_metadata('RCRAInfo', report_year, BR_meta)
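Both functions above rely on unit_convert for mass conversions. Assuming it simply scales the amount column by the factor for rows whose unit column matches, an equivalent plain-pandas sketch (with illustrative values) is:

import pandas as pd

lb_kg = 0.4535924
df = pd.DataFrame({'FlowAmount': [100.0, 250.0], 'Unit': ['Pounds', 'Grams']})
df['Amount_kg'] = 0.0
# scale only the rows reported in Pounds, mirroring unit_convert(df, 'Amount_kg', 'Unit', 'Pounds', lb_kg, 'FlowAmount')
mask = df['Unit'] == 'Pounds'
df.loc[mask, 'Amount_kg'] = df.loc[mask, 'FlowAmount'] * lb_kg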
nei_retrieval_time = time.ctime(os.path.getctime(point_1_path))

if nei_retrieval_time is not None:
    NEI_meta['SourceAquisitionTime'] = nei_retrieval_time
NEI_meta['SourceFileName'] = get_relpath(point_1_path)
NEI_meta['SourceURL'] = 'http://eis.epa.gov'

#extract version from filepath using regex
import re
pattern = 'V[0-9]'
version = re.search(pattern, point_1_path, flags=re.IGNORECASE)
if version is not None:
    NEI_meta['SourceVersion'] = version.group(0)

#Write metadata to json
write_metadata('NEI', report_year, NEI_meta)

#VALIDATE
nei_national_totals = pd.read_csv(data_dir + 'NEI_' + report_year +
                                  '_NationalTotals.csv',
                                  header=0,
                                  dtype={"FlowAmount": float})
nei_national_totals['FlowAmount_kg'] = 0
nei_national_totals = unit_convert(nei_national_totals, 'FlowAmount_kg',
                                   'Unit', 'LB', lb_kg, 'FlowAmount')
nei_national_totals = unit_convert(nei_national_totals, 'FlowAmount_kg',
                                   'Unit', 'TON', USton_kg, 'FlowAmount')
# drop old amount and units
nei_national_totals.drop('FlowAmount', axis=1, inplace=True)
nei_national_totals.drop('Unit', axis=1, inplace=True)
# Rename cols to match reference format