Example #1
def process_incidents(remote=False,
                      land=False,
                      company_names=False,
                      companies=False,
                      test=False,
                      lang='en'):

    if remote:
        if lang == 'en':
            link = "https://www.cer-rec.gc.ca/en/safety-environment/industry-performance/interactive-pipeline/map/2020-12-31-incident-data.csv"
        else:
            link = "https://www.cer-rec.gc.ca/fr/securite-environnement/rendement-lindustrie/carte-interactive-pipelines/carte/2020-12-31-donnees-incidents.csv"
        print('downloading remote incidents file')
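        # note: error_bad_lines was removed in pandas 2.0; on pandas >= 1.3
        # the equivalent is on_bad_lines='skip'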
        df = pd.read_csv(link,
                         skiprows=1,
                         encoding="UTF-16",
                         error_bad_lines=False)
        df.to_csv("./raw_data/incidents.csv", index=False)
    elif test:
        print('reading test incidents file')
        if lang == 'en':
            df = pd.read_csv("./raw_data/test_data/incidents_en.csv",
                             skiprows=0,
                             encoding="UTF-8",
                             error_bad_lines=False)
            df = process_english(df)
        else:
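            # note: unlike the English branch, this reads the full local file
            # (no French fixture under test_data)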
            df = pd.read_csv("./raw_data/incidents_fr.csv",
                             skiprows=1,
                             encoding="UTF-16",
                             error_bad_lines=False)
            df = process_french(df)

    else:
        print('reading local incidents file')
        if lang == 'en':
            print('starting english incidents...')
            df = pd.read_csv("./raw_data/incidents_en.csv",
                             skiprows=0,
                             encoding="UTF-8",
                             error_bad_lines=False)
            df = process_english(df)
        else:
            print('starting french incidents...')
            df = pd.read_csv("./raw_data/incidents_fr.csv",
                             skiprows=1,
                             encoding="UTF-16",
                             error_bad_lines=False)
            df = process_french(df)

    # initial data processing
    df['Company'] = df['Company'].replace(company_rename())

    df['Approximate Volume Released'] = pd.to_numeric(
        df['Approximate Volume Released'], errors='coerce')
    df['Reported Date'] = pd.to_datetime(df['Reported Date'], errors='raise')

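    # drop columns not needed downstream ('Reported Date' is parsed above
    # but not carried into the output)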
    for delete in [
            'Significant', 'Release Type', 'Nearest Populated Centre',
            'Reported Date'
    ]:
        del df[delete]

    if company_names:
        print(get_company_names(df['Company']))

    # industryTrend = changes(df, volume=True)
    # perKm = incidentsPerKm(df)
    perKm = None

    if companies:
        company_files = companies
    else:
        company_files = [
            'NOVA Gas Transmission Ltd.', 'TransCanada PipeLines Limited',
            'Enbridge Pipelines Inc.', 'Enbridge Pipelines (NW) Inc.',
            'Enbridge Bakken Pipeline Company Inc.', 'Express Pipeline Ltd.',
            'Trans Mountain Pipeline ULC',
            'Trans Quebec and Maritimes Pipeline Inc.',
            'Trans-Northern Pipelines Inc.',
            'TransCanada Keystone Pipeline GP Ltd.', 'Westcoast Energy Inc.',
            'Alliance Pipeline Ltd.', 'PKM Cochin ULC',
            'Foothills Pipe Lines Ltd.', 'Southern Lights Pipeline',
            'Emera Brunswick Pipeline Company Ltd.',
            'Plains Midstream Canada ULC', 'Genesis Pipeline Canada Ltd.',
            'Montreal Pipe Line Limited', 'Trans-Northern Pipelines Inc.',
            'Kingston Midstream Westspur Limited',
            'Many Islands Pipe Lines (Canada) Limited',
            'Vector Pipeline Limited Partnership',
            'Maritimes & Northeast Pipeline Management Ltd.'
        ]

    for company in company_files:
        folder_name = company.replace(' ', '').replace('.', '')
        df_c = df[df['Company'] == company].copy().reset_index(drop=True)
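        # df_vol keeps only incidents with a recorded release volume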
        df_vol = df_c[~df_c['Approximate Volume Released'].isnull()]
        df_vol = df_vol.copy().reset_index(drop=True)
        thisCompanyData = {}
        if not df_vol.empty:
            # calculate metadata here, before non releases are filtered out
            meta = incidentMetaData(df, perKm, company, lang)
            # companyTrend = changes(df_vol, volume=False)
            # meta['trends'] = {"company": companyTrend, "industry": industryTrend}
            thisCompanyData['meta'] = meta
            del df_vol['Incident Types']
            del df_vol['Company']
            df_vol = optimizeJson(df_vol)
            thisCompanyData['events'] = df_vol.to_dict(orient='records')
            if not test:
                with open(
                        '../incidents/company_data/' + lang + '/' +
                        folder_name + '.json', 'w') as fp:
                    json.dump(thisCompanyData, fp)
        else:
            # there are no product release incidents
            thisCompanyData['events'] = df_vol.to_dict(orient='records')
            thisCompanyData['meta'] = {"companyName": company}
            if not test:
                with open(
                        '../incidents/company_data/' + lang + '/' +
                        folder_name + '.json', 'w') as fp:
                    json.dump(thisCompanyData, fp)

    return df_c, df_vol, meta
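
A minimal driver for this example (hypothetical, not from the source repo): with test=True the function reads the fixtures under ./raw_data/test_data/ and skips the json.dump writes, so it is safe to run standalone, assuming pandas and the repo helpers (process_english, company_rename, etc.) are importable.

# hypothetical usage sketch for Example #1
df_c, df_vol, meta = process_incidents(
    companies=['NOVA Gas Transmission Ltd.'],  # limit the loop to one company
    test=True,
    lang='en')
print(meta)
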
Example #2
def get_company_names():
    response = jsonify({'company': util.get_company_names()})
    response.headers.add('Access-Control-Allow-Origin', '*')

    return response
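
jsonify and response.headers.add are Flask/Werkzeug APIs, so this example reads as a Flask view. A hedged sketch of how it might be wired up (the route path, app object, and util import are assumptions, not from the listing):

from flask import Flask, jsonify
import util  # repo helper assumed by the example above

app = Flask(__name__)

@app.route('/company-names')  # hypothetical route
def get_company_names():
    # wrap the helper output in JSON and allow cross-origin reads
    response = jsonify({'company': util.get_company_names()})
    response.headers.add('Access-Control-Allow-Origin', '*')
    return response
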
Example #3
def process_conditions(remote=False,
                       nonStandard=True,
                       company_names=False,
                       companies=False,
                       test=False,
                       lang='en',
                       save=True):
    if remote:
        print('downloading remote conditions file')
        if lang == 'en':
            link = 'http://www.cer-rec.gc.ca/open/conditions/conditions.csv'
            df = pd.read_csv(link,
                             sep='\t',
                             lineterminator='\r',
                             encoding="UTF-16",
                             error_bad_lines=False)
            df = normalize_text(df,
                                ['Location', 'Short Project Name', 'Theme(s)'])
        else:
            link = 'http://www.cer-rec.gc.ca/open/conditions/conditions.csv'
            linkFR = 'https://www.cer-rec.gc.ca/ouvert/conditions/conditions.csv'
            df = pd.read_csv(link,
                             sep='\t',
                             lineterminator='\r',
                             encoding="UTF-16",
                             error_bad_lines=False)
            fr = pd.read_csv(linkFR,
                             sep='\t',
                             lineterminator='\r',
                             encoding="UTF-16",
                             error_bad_lines=False)
            df = process_french(df, fr)

    elif test:
        print('reading test conditions data')
        df = pd.read_csv('./raw_data/test_data/conditions.csv',
                         encoding="UTF-16",
                         sep='\t')
        df = normalize_text(df, ['Location', 'Short Project Name', 'Theme(s)'])
    else:
        print('reading local conditions data')
        if lang == 'en':
            df = pd.read_csv('./raw_data/conditions_en.csv',
                             encoding="UTF-16",
                             sep='\t')
            df = normalize_text(df,
                                ['Location', 'Short Project Name', 'Theme(s)'])
        else:
            df = pd.read_csv('./raw_data/conditions_en.csv',
                             encoding="UTF-16",
                             sep='\t')
            fr = pd.read_csv('./raw_data/conditions_fr.csv',
                             encoding="UTF-16",
                             sep='\t')
            df = process_french(df, fr)

    for date_col in ['Effective Date', 'Issuance Date', 'Sunset Date']:
        df[date_col] = pd.to_datetime(df[date_col])

    if not nonStandard:
        # only include non-standard conditions
        df = df[df['Condition Type'] != 'Standard']

    delete_cols = [
        'Condition', 'Condition Phase', 'Instrument Activity',
        'Condition Type', 'Condition Filing'
    ]

    for delete in delete_cols:
        del df[delete]

    for r in ['\n', '"']:
        df['Company'] = df['Company'].replace(r, '', regex=True)
    df['Company'] = [x.strip() for x in df['Company']]
    df['Condition Status'] = df['Condition Status'].astype('object')
    df['Condition Status'] = [str(x).strip() for x in df['Condition Status']]
    # preliminary processing
    df['Company'] = df['Company'].replace(company_rename())

    df = df[df['Short Project Name'] != "SAM/COM"].copy().reset_index(
        drop=True)
    df['Theme(s)'] = df['Theme(s)'].replace({"nan": "No theme specified"})

    regions_map = import_simplified()
    df = add_links(df)
    if company_names:
        print(get_company_names(df['Company']))

    if companies:  # used to set one company for testing
        company_files = companies
    else:
        company_files = [
            'NOVA Gas Transmission Ltd.', 'TransCanada PipeLines Limited',
            'Enbridge Pipelines Inc.', 'Enbridge Pipelines (NW) Inc.',
            'Express Pipeline Ltd.', 'Trans Mountain Pipeline ULC',
            'Trans Quebec and Maritimes Pipeline Inc.',
            'Trans-Northern Pipelines Inc.',
            'TransCanada Keystone Pipeline GP Ltd.', 'Westcoast Energy Inc.',
            'Alliance Pipeline Ltd.', 'PKM Cochin ULC',
            'Foothills Pipe Lines Ltd.', 'Southern Lights Pipeline',
            'Emera Brunswick Pipeline Company Ltd.',
            'Many Islands Pipe Lines (Canada) Limited',
            'Maritimes & Northeast Pipeline Management Ltd.',
            'Vector Pipeline Limited Partnership',
            'Plains Midstream Canada ULC',
            'Enbridge Bakken Pipeline Company Inc.',
            'Genesis Pipeline Canada Ltd.', 'Montreal Pipe Line Limited',
            'Kingston Midstream Westspur Limited',
            'Aurora Pipeline Company Ltd'
        ]

    for company in company_files:
        thisCompanyData = {}
        folder_name = company.replace(' ', '').replace('.', '')

        df_c = df[df['Company'] == company].copy().reset_index(drop=True)
        if not df_c.empty:
            # df_c = add_links(df_c, links)
            df_c['condition id'] = [
                str(ins) + '_' + str(cond) for ins, cond in zip(
                    df_c['Instrument Number'], df_c['Condition Number'])
            ]
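            # expand each condition into one row per region listed in its
            # comma-separated Location field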
            expanded_locations = []
            for unique in df_c['condition id']:
                row = df_c[df_c['condition id'] == unique].copy().reset_index(
                    drop=True)
                locations = [x.split(',') for x in row['Location']]
                for region in locations[0]:
                    regionProvince = region.strip().split('/')
                    row['id'] = regionProvince[0].strip()
                    row['Flat Province'] = regionProvince[-1].strip()
                    expanded_locations.append(row.copy())

            df_all = pd.concat(expanded_locations,
                               axis=0,
                               sort=False,
                               ignore_index=True)
            # calculate metadata here
            dfmeta, meta = conditionMetaData(df_all, folder_name)
            meta["build"] = True
            thisCompanyData['meta'] = meta
            shp, mapMeta = conditions_on_map(dfmeta, regions_map, folder_name,
                                             lang)

            thisCompanyData['regions'] = shp.to_json()
            thisCompanyData['mapMeta'] = mapMeta.to_dict(orient='records')
            if not test and save:
                with open(
                        '../conditions/company_data/' + lang + '/' +
                        folder_name + '.json', 'w') as fp:
                    json.dump(thisCompanyData, fp)
                print('completed+saved ' + lang + ' conditions: ' + company)
        else:
            meta = {"companyName": company}
            thisCompanyData = {
                'meta': {
                    "companyName": company,
                    "build": False
                },
                'regions': "{}",
                'mapMeta': []
            }

            if not test and save:
                with open(
                        '../conditions/company_data/' + lang + '/' +
                        folder_name + '.json', 'w') as fp:
                    json.dump(thisCompanyData, fp)
                print('completed+saved ' + lang + ' conditions: ' + company)

        # if not test:
        #     print('completed '+lang+' conditions: '+company)

    return df_c, shp, dfmeta, meta
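
As with Example #1, a hypothetical invocation; save=False keeps the run read-only even outside test mode, and passing companies narrows the loop to one operator:

# hypothetical usage sketch for Example #3
df_c, shp, dfmeta, meta = process_conditions(
    companies=['NOVA Gas Transmission Ltd.'],
    lang='en',
    save=False)
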
Example #4
def process_incidents(remote=False,
                      land=False,
                      company_names=False,
                      companies=False,
                      test=False):
    if remote:
        link = "https://www.cer-rec.gc.ca/en/safety-environment/industry-performance/interactive-pipeline/map/2020-12-31-comprehensive-incident-data.csv"
        print('downloading remote incidents file')
        df = pd.read_csv(link,
                         skiprows=1,
                         encoding="UTF-16",
                         error_bad_lines=False)
        df.to_csv("./raw_data/incidents.csv", index=False)
    elif test:
        print('reading test incidents file')
        df = pd.read_csv(
            "./raw_data/test_data/comprehensive-incident-data.csv",
            skiprows=0,
            encoding="UTF-8",
            error_bad_lines=False)
    else:
        print('reading local incidents file')
        df = pd.read_csv("./raw_data/comprehensive-incident-data.csv",
                         skiprows=0,
                         encoding='latin-1',
                         error_bad_lines=True)

    # normalize the volume column header: the raw file may use either the
    # superscript (m³) or plain (m3) form; rename ignores missing keys
    df = df.rename(
        columns={
            'Approximate Volume Released (m³)': 'Approximate Volume Released',
            'Approximate Volume Released (m3)': 'Approximate Volume Released'
        })

    # initial data processing
    df['Company'] = df['Company'].replace(company_rename())
    df['Company'] = [x if x in group1 else "Group 2" for x in df['Company']]

    df['Approximate Volume Released'] = pd.to_numeric(
        df['Approximate Volume Released'], errors='coerce')
    df['Reported Date'] = pd.to_datetime(df['Reported Date'], errors='raise')
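    # collapse low-frequency substances into "Other"; butane and propane
    # are grouped as "Natural Gas Liquids"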
    df['Substance'] = df['Substance'].replace({
        "Water": "Other",
        "Hydrogen Sulphide": "Other",
        "Amine": "Other",
        "Contaminated Water": "Other",
        "Potassium Hydroxide (caustic solution)": "Other",
        "Glycol": "Other",
        "Pulp slurry": "Other",
        "Sulphur": "Other",
        "Odourant": "Other",
        "Potassium Carbonate": "Other",
        "Waste Oil": "Other",
        "Produced Water": "Other",
        "Butane": "Natural Gas Liquids",
        "Mixed HVP Hydrocarbons": "Other",
        "Drilling Fluid": "Other",
        "Jet Fuel": "Other",
        "Gasoline": "Other",
        "Sulphur Dioxide": "Other",
        "Lube Oil": "Other",
        "Propane": "Natural Gas Liquids",
        "Fuel Gas": "Other",
        "Diesel Fuel": "Other"
    })

    if company_names:
        print(get_company_names(df['Company']))

    keep = [
        'Incident Number', 'Incident Types', 'Province', 'Company', 'Status',
        'Latitude', 'Longitude', 'Approximate Volume Released', 'Substance',
        'Year', 'What happened category', 'Why it happened category',
        'Activity being performed at time of incident',
        'How the incident was discovered', 'Incident type',
        'Residual effects on the environment', 'Number of fatalities',
        'Number of individuals injured', 'Off Company Property',
        'Was NEB Staff Deployed'
    ]

    for col in df.columns:
        if col not in keep:
            del df[col]

    df = df.rename(
        columns={
            'What happened category': 'What happened',
            'Why it happened category': 'Why it happened',
            'Activity being performed at time of incident':
            'Activity at time of incident',
            'How the incident was discovered': 'How was it discovered'
        })
    # df = df[~df['Approximate Volume Released'].isnull()].copy().reset_index(drop=True)
    fillZero = [
        'Approximate Volume Released', 'Number of fatalities',
        'Number of individuals injured'
    ]
    for f in fillZero:
        df[f] = df[f].fillna(0)

    fillOther = ['How was it discovered']
    for f in fillOther:
        df[f] = df[f].fillna("Other")

    textCols = [
        'Incident Number', 'Incident Types', 'Province', 'Company', 'Status',
        'Substance', 'What happened', 'Why it happened',
        'Activity at time of incident', 'How was it discovered',
        'Incident type', 'Residual effects on the environment',
        'Off Company Property', 'Was NEB Staff Deployed'
    ]

    for t in textCols:
        df[t] = [str(x).strip() for x in df[t]]

    meta = {}
    allCompanyData = {}
    allCompanyData['meta'] = meta
    allCompanyData['events'] = df.to_dict(orient='records')
    if not test:
        with open('../incidents/incident_releases.json', 'w') as fp:
            json.dump(allCompanyData, fp)

    return allCompanyData, df
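
This variant returns one combined structure instead of per-company files. A hypothetical smoke test (test=True reads the fixture and suppresses the incident_releases.json write):

# hypothetical usage sketch for Example #4
allCompanyData, df = process_incidents(test=True)
print(len(allCompanyData['events']), 'incident records')
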
Example #5
def process_conditions(remote=False,
                       sql=False,
                       non_standard=True,
                       company_names=False,
                       companies=False,
                       test=False,
                       save=True):
    if remote:
        print('downloading remote conditions file')
        link = 'http://www.cer-rec.gc.ca/open/conditions/conditions.csv'
        df = pd.read_csv(
            link,
            # sep='\t',
            # lineterminator='\r',
            encoding="latin-1",
            error_bad_lines=True)
        df = normalize_text(df, ['Location', 'Short Project Name', 'Theme(s)'])

    elif test:
        print('reading test conditions data')
        df = pd.read_csv('./raw_data/test_data/conditions.csv',
                         encoding="UTF-16",
                         sep='\t')
        df = normalize_text(df, ['Location', 'Short Project Name', 'Theme(s)'])
    else:
        print('reading local conditions data')
        df = pd.read_csv('./raw_data/conditions_en.csv',
                         encoding="UTF-16",
                         sep='\t')
        df = normalize_text(df, ['Location', 'Short Project Name', 'Theme(s)'])

    for date_col in ['Effective Date', 'Issuance Date', 'Sunset Date']:
        df[date_col] = pd.to_datetime(df[date_col])

    if not non_standard:
        # only include non-standard conditions
        df = df[df['Condition Type'] != 'Standard']

    delete_cols = [
        'Condition', 'Condition Phase', 'Instrument Activity',
        'Condition Type', 'Condition Filing'
    ]

    for delete in delete_cols:
        del df[delete]

    for r in ['\n', '"']:
        df['Company'] = df['Company'].replace(r, '', regex=True)
    df['Company'] = [x.strip() for x in df['Company']]
    df['Condition Status'] = df['Condition Status'].astype('object')
    df['Condition Status'] = [str(x).strip() for x in df['Condition Status']]
    # preliminary processing
    df['Company'] = df['Company'].replace(company_rename())
    df = apply_system_id(df, "Company")

    df = df[df['Short Project Name'] != "SAM/COM"].copy().reset_index(
        drop=True)

    df = add_links(df, sql)
    if company_names:
        print(get_company_names(df['Company']))

    df, region_replace, project_names = idify_conditions(df, sql)
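    # idify_conditions presumably swaps free-text regions and project names
    # for ids (helper not shown in this listing)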
    regions_map = import_simplified(region_replace)

    if companies:
        company_files = companies
    else:
        company_files = get_company_list("all")

    for company in company_files:
        try:
            df_c, shp, dfmeta, meta = process_company(df, company,
                                                      project_names,
                                                      regions_map, test, save)
            print("completed: " + company)
        except Exception:
            print("conditions error: " + company)
            raise

    return df_c, shp, dfmeta, meta
Example #6
def process_incidents(remote=False, land=False, company_names=False, companies=False, test=False):
    if remote:
        link = "https://www.cer-rec.gc.ca/open/incident/pipeline-incidents-data.csv"
        process_func = process_english
        print('downloading remote incidents file')
        df = pd.read_csv(link,
                         skiprows=0,
                         encoding="latin-1",
                         engine="python",
                         error_bad_lines=False)
        df = process_func(df)
        df.to_csv("./raw_data/incidents_"+"en"+".csv", index=False)
    elif test:
        print('reading test incidents file')
        path = "./raw_data/test_data/incidents_en.csv"
        process_func = process_english

        df = pd.read_csv(path,
                         skiprows=0,
                         encoding="utf-8",
                         error_bad_lines=False)
        df = process_func(df)

    else:
        print('reading local incidents file')
        path = "./raw_data/incidents_en.csv"
        process_func = process_english
        encoding = "latin-1"

        df = pd.read_csv(path,
                         skiprows=0,
                         encoding=encoding,
                         error_bad_lines=False)
        df = process_func(df)

    # initial data processing
    df['Company'] = df['Company'].replace(company_rename())
    df['Approximate Volume Released'] = pd.to_numeric(df['Approximate Volume Released'],
                                                      errors='coerce')

    df['Reported Date'] = pd.to_datetime(df['Reported Date'], errors='raise')

    for delete in ['Significant',
                   'Release Type',
                   'Nearest Populated Centre',
                   'Reported Date']:
        del df[delete]

    if company_names:
        print(get_company_names(df['Company']))

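    # per-km incident rates are not computed in this example (compare the
    # commented-out incidentsPerKm call in Example #1)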
    perKm = None

    if companies:
        company_files = companies
    else:
        company_files = ['NOVA Gas Transmission Ltd.',
                         'TransCanada PipeLines Limited',
                         'Enbridge Pipelines Inc.',
                         'Enbridge Pipelines (NW) Inc.',
                         'Enbridge Bakken Pipeline Company Inc.',
                         'Express Pipeline Ltd.',
                         'Trans Mountain Pipeline ULC',
                         'Trans Quebec and Maritimes Pipeline Inc.',
                         'Trans-Northern Pipelines Inc.',
                         'TransCanada Keystone Pipeline GP Ltd.',
                         'Westcoast Energy Inc.',
                         'Alliance Pipeline Ltd.',
                         'PKM Cochin ULC',
                         'Foothills Pipe Lines Ltd.',
                         'Southern Lights Pipeline',
                         'Emera Brunswick Pipeline Company Ltd.',
                         'Plains Midstream Canada ULC',
                         'Genesis Pipeline Canada Ltd.',
                         'Montreal Pipe Line Limited',
                         'Trans-Northern Pipelines Inc.',
                         'Kingston Midstream Westspur Limited',
                         'Many Islands Pipe Lines (Canada) Limited',
                         'Vector Pipeline Limited Partnership',
                         'Maritimes & Northeast Pipeline Management Ltd.',
                         'Aurora Pipeline Company Ltd']

    for company in company_files:
        folder_name = company.replace(' ', '').replace('.', '')
        df_c = df[df['Company'] == company].copy().reset_index(drop=True)
        df_vol = df_c[~df_c['Approximate Volume Released'].isnull()].copy().reset_index(drop=True)
        thisCompanyData = {}
        if not df_vol.empty:
            # calculate metadata here, before non releases are filtered out
            meta = incidentMetaData(df, perKm, company)
            thisCompanyData['meta'] = meta
            for delete in ['Incident Types', 'Company', 'why common', 'what common']:
                del df_vol[delete]
            df_vol = optimizeJson(df_vol)
            thisCompanyData['events'] = df_vol.to_dict(orient='records')
            if not test:
                with open('../incidents/company_data/'+folder_name+'.json', 'w') as fp:
                    json.dump(thisCompanyData, fp)
        else:
            # there are no product release incidents
            thisCompanyData['events'] = df_vol.to_dict(orient='records')
            thisCompanyData['meta'] = {"companyName": company}
            if not test:
                with open('../incidents/company_data/'+folder_name+'.json', 'w') as fp:
                    json.dump(thisCompanyData, fp)

    return df_c, df_vol, meta