Example #1
import io
import zipfile

import pandas as pd
import requests

from stewi.globals import data_dir, set_dir


def extacting_TRI_data_files(link_zip, files, year):
    external_dir = set_dir(data_dir + '../../../')
    r_file = requests.get(link_zip)
    for file in files:
        df_columns = pd.read_csv(data_dir + 'TRI_File_' + file +
                                 '_columns.txt',
                                 header=0)
        columns = list(df_columns['Names'])
        n_columns = len(columns)
        with zipfile.ZipFile(io.BytesIO(r_file.content)) as z:
            z.extract('US_' + file + '_' + year + '.txt', external_dir + 'TRI')
        df = pd.read_csv(
            external_dir + 'TRI/US_' + file + '_' + year + '.txt',
            header=None,
            encoding='ISO-8859-1',
            error_bad_lines=False,
            sep='\t',
            low_memory=False,
            skiprows=[0],
            lineterminator='\n',
            usecols=range(n_columns))  # avoiding \r\n created in Windows OS
        df.columns = columns
        df.to_csv(external_dir + 'TRI/US_' + file + '_' + year + '.txt',
                  sep='\t',
                  index=False)
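A minimal calling sketch for the function above, assuming data_dir and set_dir come from stewi.globals as in Example #6; the ZIP URL is a placeholder shaped like the pattern matched in Example #4, not a verified download link.

# Hypothetical call: download the 2016 TRI ZIP once and extract/clean the
# basic data files '1a' and '3a' used by the later examples.
example_link_zip = 'https://www3.epa.gov/tri/current/US_2016.zip'  # placeholder URL
extacting_TRI_data_files(example_link_zip, ['1a', '3a'], '2016')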
Example #2
import os
import re

import pandas as pd

from stewi.globals import data_dir, set_dir


def organizing_files_by_year(Tables, Path, Years_saved):
    for Table in Tables:
        # Get file columns widths
        dir_RCRA_by_year = set_dir(Path + 'RCRAInfo_by_year/')
        linewidthsdf = pd.read_csv(data_dir +
                                   'RCRA_FlatFile_LineComponents.csv')
        BRwidths = linewidthsdf['Size'].astype(int).tolist()
        BRnames = linewidthsdf['Data Element Name'].tolist()
        Files = [
            file for file in os.listdir(Path)
            if ((file.startswith(Table)) & file.endswith('.txt'))
        ]
        Files.sort()
        for File in Files:
            df = pd.read_fwf(Path + File,
                             widths=BRwidths,
                             header=None,
                             names=BRnames,
                             encoding='utf-8')
            df = df[df['Report Cycle'].apply(lambda x: str(x).isnumeric())]
            df['Report Cycle'] = df['Report Cycle'].astype(int)
            df = df.sort_values(by=['Report Cycle'])
            df = df[~df['Report Cycle'].isin(Years_saved)]
            Years = list(df['Report Cycle'].unique())
            for Year in Years:
                if re.match(r'\d{4}', str(int(Year))):
                    df_year = df[df['Report Cycle'] == Year]
                    Path_directory = dir_RCRA_by_year + 'br_reporting_' + str(
                        int(Year)) + '.txt'
                    condition = True
                    while condition:
                        try:
                            if os.path.exists(Path_directory):
                                with open(Path_directory, 'a') as f:
                                    df_year.to_csv(f,
                                                   header=False,
                                                   sep='\t',
                                                   index=False)
                            else:
                                df_year.to_csv(Path_directory,
                                               sep='\t',
                                               index=False)
                            condition = False
                        except UnicodeEncodeError:
                            for column in df_year:
                                if df_year[column].dtype == object:
                                    df_year[column] = df_year[column].map(
                                        lambda x: x.replace(u'\uFFFD', '?') if isinstance(x, str) else x)
                            condition = True
                else:
                    continue
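A minimal calling sketch, assuming data_dir and set_dir come from stewi.globals and that the BR_REPORTING flat files have already been downloaded into the RCRAInfo folder; the table name, path, and saved years are illustrative only.

# Hypothetical call: split the downloaded BR_REPORTING_2015 flat files into
# one tab-separated file per report cycle, skipping years already written out.
rcrainfo_path = set_dir(data_dir + '../../../RCRAInfo/')
organizing_files_by_year(['BR_REPORTING_2015'], rcrainfo_path, [2011, 2013])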
Example #3
import pandas as pd

from stewi.globals import data_dir, set_dir


def import_TRI_by_release_type(d, year):
    # Import TRI file
    external_dir = set_dir(data_dir + '../../../')
    tri_release_output_fieldnames = [
        'FacilityID', 'CAS', 'FlowName', 'Unit', 'FlowAmount',
        'Basis of Estimate', 'ReleaseType'
    ]
    tri = pd.DataFrame()
    for k, v in d.items():
        #create a data type dictionary
        dtype_dict = {
            'TRIFID': "str",
            'CHEMICAL NAME': "str",
            'CAS NUMBER': "str",
            'UNIT OF MEASURE': "str"
        }
        #If a basis of estimate field is present, set its type to string
        if len(v) > 5:
            dtype_dict[v[5]] = "str"
        if (k == 'offsiteland') | (k == 'offsiteother'):
            file = '3a'
        else:
            file = '1a'
        tri_csv = external_dir + 'TRI/US_' + file + '_' + year + '.txt'
        tri_part = pd.read_csv(
            tri_csv,
            sep='\t',
            header=0,
            usecols=v,
            dtype=dtype_dict,
            na_values=['NO'],
            error_bad_lines=False,
            low_memory=False,
            converters={v[4]: lambda x: pd.to_numeric(x, errors='coerce')})

        tri_part['ReleaseType'] = k
        tri_part.columns = tri_release_output_fieldnames
        tri = pd.concat([tri, tri_part])
    return tri
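A minimal sketch of the release-type dictionary this function expects, assuming the same file layout produced by Example #1; the key and column names below are illustrative placeholders rather than the actual TRI field names, and only the positions matter (index 4 is the amount column, index 5 an optional basis-of-estimate column).

# Hypothetical dictionary: each key is a release type, each value the columns to
# read; keys 'offsiteland'/'offsiteother' pull from file 3a, all others from 1a.
d_example = {
    'fugitiveair': ['TRIFID', 'CAS NUMBER', 'CHEMICAL NAME', 'UNIT OF MEASURE',
                    'FUGITIVE AIR RELEASE', 'BASIS OF ESTIMATE - FUGITIVE AIR'],
}
tri_fugitive = import_TRI_by_release_type(d_example, '2016')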
Example #4
def Generate_TRI_files_csv(TRIyear, Files):
    _config = config()['databases']['TRI']
    tri_url = _config['url']
    link_zip_TRI = link_zip(tri_url, _config['queries'], TRIyear)
    regex = re.compile(
        r'https://www3.epa.gov/tri/current/US_\d{4}_?(\d*)\.zip')
    tri_version = re.search(regex, link_zip_TRI).group(1)
    if not tri_version:
        tri_version = 'last'
    tri_required_fields = imp_fields(data_dir + 'TRI_required_fields.txt')
    keys = imp_fields(data_dir +
                      'TRI_keys.txt')  # the same function can be used
    import_facility = tri_required_fields[0:10]
    values = list()
    for p in range(len(keys)):
        start = 13 + 2 * p
        end = start + 1
        values.append(concat_req_field(tri_required_fields[start:end + 1]))
    # Create a dictionary that has the import fields for each release type to use in the import process
    import_dict = dict_create(keys, values)
    # Build the TRI DataFrame
    tri = import_TRI_by_release_type(import_dict, TRIyear)
    # drop NA for Amount, but leave in zeros
    tri = tri.dropna(subset=['FlowAmount'])
    tri = strip_coln_white_space(tri, 'Basis of Estimate')
    #Convert to float if there are errors - be careful with this line
    if tri['FlowAmount'].values.dtype != 'float64':
        tri['FlowAmount'] = pd.to_numeric(tri['FlowAmount'], errors='coerce')
    #Drop 0 for FlowAmount
    tri = tri[tri['FlowAmount'] != 0]
    # Import reliability scores for TRI
    tri_reliability_table = reliability_table[reliability_table['Source'] ==
                                              'TRI'].copy()
    tri_reliability_table.drop('Source', axis=1, inplace=True)
    #Merge with reliability table to get the DQI reliability score
    tri = pd.merge(tri,
                   tri_reliability_table,
                   left_on='Basis of Estimate',
                   right_on='Code',
                   how='left')
    # Fill NAs with 5 for DQI reliability score
    tri['DQI Reliability Score'] = tri['DQI Reliability Score'].fillna(value=5)
    # Drop unneeded columns
    tri.drop('Basis of Estimate', axis=1, inplace=True)
    tri.drop('Code', axis=1, inplace=True)
    # Replace source info with Context
    source_cnxt = data_dir + 'TRI_ReleaseType_to_Compartment.csv'
    source_to_context = pd.read_csv(source_cnxt)
    tri = pd.merge(tri, source_to_context, how='left')
    # Convert units to ref mass unit of kg
    # Create a new field to put converted amount in
    tri['Amount_kg'] = 0.0
    tri = unit_convert(tri, 'Amount_kg', 'Unit', 'Pounds', lb_kg, 'FlowAmount')
    tri = unit_convert(tri, 'Amount_kg', 'Unit', 'Grams', g_kg, 'FlowAmount')
    # drop old amount and units
    tri.drop('FlowAmount', axis=1, inplace=True)
    tri.drop('Unit', axis=1, inplace=True)
    # Rename cols to match reference format
    tri.rename(columns={'Amount_kg': 'FlowAmount'}, inplace=True)
    tri.rename(columns={'DQI Reliability Score': 'ReliabilityScore'},
               inplace=True)
    #Drop release type
    tri.drop('ReleaseType', axis=1, inplace=True)
    #Group by facility, flow and compartment to aggregate different release types
    grouping_vars = ['FacilityID', 'FlowName', 'CAS', 'Compartment']
    # Create a specialized weighted mean function to use for aggregation of reliability
    wm = lambda x: weight_mean(x, tri.loc[x.index, "FlowAmount"])
    # Groupby and aggregate with your dictionary:
    tri = tri.groupby(grouping_vars).agg({
        'FlowAmount': 'sum',
        'ReliabilityScore': wm
    })
    tri = tri.reset_index()

    #VALIDATE
    tri_national_totals = pd.read_csv(data_dir + 'TRI_' + TRIyear +
                                      '_NationalTotals.csv',
                                      header=0,
                                      dtype={"FlowAmount": np.float})
    tri_national_totals['FlowAmount_kg'] = 0
    tri_national_totals = unit_convert(tri_national_totals, 'FlowAmount_kg',
                                       'Unit', 'Pounds', 0.4535924,
                                       'FlowAmount')
    # drop old amount and units
    tri_national_totals.drop('FlowAmount', axis=1, inplace=True)
    tri_national_totals.drop('Unit', axis=1, inplace=True)
    # Rename cols to match reference format
    tri_national_totals.rename(columns={'FlowAmount_kg': 'FlowAmount'},
                               inplace=True)
    validation_result = validate_inventory(tri,
                                           tri_national_totals,
                                           group_by='flow',
                                           tolerance=5.0)
    write_validation_result('TRI', TRIyear, validation_result)
    #FLOWS
    flows = tri.groupby(['FlowName', 'CAS',
                         'Compartment']).count().reset_index()
    #stack by compartment
    flowsdf = flows[['FlowName', 'CAS', 'Compartment']].copy()
    flowsdf['FlowID'] = flowsdf['CAS']
    #export chemicals
    #!!!Still needs CAS number and FlowID
    flowsdf.to_csv(output_dir + 'flow/' + 'TRI_' + TRIyear + '.csv',
                   index=False)
    #FLOW BY FACILITY
    #drop CAS
    tri.drop(columns=['CAS'], inplace=True)
    tri_file_name = 'TRI_' + TRIyear + '.csv'
    tri.to_csv(output_dir + 'flowbyfacility/' + tri_file_name, index=False)
    #FACILITY
    ##Import and handle TRI facility data
    tri_facility = pd.read_csv(set_dir(data_dir + '../../../') + 'TRI/US_1a_' +
                               TRIyear + '.txt',
                               sep='\t',
                               header=0,
                               usecols=import_facility,
                               error_bad_lines=False,
                               low_memory=False)
    #get unique facilities
    tri_facility_unique_ids = pd.unique(tri_facility['TRIFID'])
    tri_facility_unique_rows = tri_facility.drop_duplicates()
    #Use group by to eliminate additional ID duplicates
    #tri_facility_unique_rows_agg = tri_facility_unique_rows.groupby(['TRIFID'])
    #tri_facility_final = tri_facility_unique_rows_agg.aggregate()
    tri_facility_final = tri_facility_unique_rows
    #rename columns
    TRI_facility_name_crosswalk = {
        'TRIFID': 'FacilityID',
        'FACILITY NAME': 'FacilityName',
        'FACILITY STREET': 'Address',
        'FACILITY CITY': 'City',
        'FACILITY COUNTY': 'County',
        'FACILITY STATE': 'State',
        'FACILITY ZIP CODE': 'Zip',
        'PRIMARY NAICS CODE': 'NAICS',
        'LATITUDE': 'Latitude',
        'LONGITUDE': 'Longitude'
    }
    tri_facility_final.rename(columns=TRI_facility_name_crosswalk,
                              inplace=True)
    tri_facility_final.to_csv(output_dir + 'facility/' + 'TRI_' + TRIyear +
                              '.csv',
                              index=False)
    # Record TRI metadata
    external_dir = set_dir(data_dir + '../../../')
    for file in Files:
        tri_csv = external_dir + 'TRI/US_' + file + '_' + TRIyear + '.txt'
        try:
            retrieval_time = os.path.getctime(tri_csv)
        except OSError:
            # Fall back to the current time if the source file is missing
            retrieval_time = time.time()
        tri_metadata['SourceAquisitionTime'] = time.ctime(retrieval_time)
        tri_metadata['SourceFileName'] = get_relpath(tri_csv)
        tri_metadata['SourceURL'] = tri_url
        tri_metadata['SourceVersion'] = tri_version
        write_metadata('TRI', TRIyear, tri_metadata)
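Generate_TRI_files_csv leans on a weight_mean helper to aggregate reliability scores by flow amount; the sketch below shows a minimal flow-weighted average of that kind, assuming behaviour similar to (but not taken from) the stewi implementation.

import numpy as np

def weight_mean(values, weights):
    # Flow-weighted mean of the reliability scores; fall back to a plain mean
    # when the weights sum to zero so np.average cannot divide by zero.
    if np.sum(weights) > 0:
        return np.average(values, weights=weights)
    return np.mean(values)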
Example #5
                        help='What RCRAInfo tables you want.\
                        Check:\
                        https://rcrainfopreprod.epa.gov/rcrainfo-help/application/publicHelp/index.htm',
                        required=False,
                        default=[None])

    args = parser.parse_args()

    #Metadata
    BR_meta = globals.inventory_metadata

    #RCRAInfo url
    _config = config()['databases']['RCRAInfo']
    RCRAfInfoflatfileURL = _config['url']

    RCRAInfopath = set_dir(data_dir + "../../../RCRAInfo/")

    ## Adds the specified Year to the BR_REPORTING table
    tables = args.Tables
    if 'BR_REPORTING' in tables:
        args.Tables[tables.index(
            'BR_REPORTING')] = 'BR_REPORTING' + '_' + args.Year

    if args.Option == 'A':

        query = _config['queries']['Table_of_tables']
        download_zip(RCRAfInfoflatfileURL, RCRAInfopath, args.Tables, query)

    elif args.Option == 'B':

        regex = re.compile(r'RCRAInfo_(\d{4})')
Example #6
#NEI import and process to Standardized EPA output format
#This script uses the NEI data exports from EIS.

from stewi.globals import (set_dir, output_dir, data_dir, write_metadata,
                           inventory_metadata, get_relpath, unit_convert,
                           validate_inventory, write_validation_result,
                           USton_kg, lb_kg)
import pandas as pd
import numpy as np
import os
import time

report_year = '2016'

external_dir = set_dir('../NEI/')

nei_required_fields = pd.read_table(data_dir + 'NEI_required_fields.csv',
                                    sep=',').fillna('Null')
nei_file_path = pd.read_table(data_dir + 'NEI_' + report_year +
                              '_file_path.csv',
                              sep=',').fillna('Null')


def read_data(source, file):
    #tmp = pd.Series(list(nei_required_fields[source]), index=list(nei_required_fields['StandardizedEPA']))
    file_result = pd.DataFrame(
        columns=list(nei_required_fields['StandardizedEPA']))
    # read nei file by chunks
    for file_chunk in pd.read_table(
            external_dir + file,
            sep=',',
            usecols=list(set(nei_required_fields[source]) - set(['Null'])),
            chunksize=100000,