Example #1
def process_throughput(points,
                       save=False,
                       sql=False,
                       commodity='Gas',
                       companies=False,
                       frequency='m'):

    if commodity == 'Gas':
        if frequency == "m":
            query = 'throughput_gas_monthly.sql'
        else:
            # only monthly gas data is handled; fail early instead of
            # hitting a NameError on an undefined query below
            raise ValueError("unsupported gas frequency: " + str(frequency))

        df = get_traffic_data(sql, query)
        df = df.rename(
            columns={
                'Capacity (1000 m3/d)': 'Capacity',
                'Throughput (1000 m3/d)': 'Throughput'
            })

        # Saturn corner case
        df = df.drop(df[(df['KeyPointID'] == "KP0036")
                        & (df['Throughput'] == 0)].index)
        units = "Bcf/d"

    else:
        query = 'throughput_oil_monthly.sql'
        df = get_traffic_data(sql, query)
        df = df.rename(
            columns={
                'Available Capacity (1000 m3/d)': 'Capacity',
                'Throughput (1000 m3/d)': 'Throughput'
            })
        df['Trade Type'] = [str(p).strip() for p in df['Product']]
        del df['Product']
        units = "Mb/d"

    df = conversion(df, commodity, ['Capacity', 'Throughput'], False, 0)
    df = df[df['Trade Type'] != "`"].copy().reset_index(drop=True)
    df = apply_trade_id(df)
    df['Date'] = pd.to_datetime(df['Date'])
    company_files = get_company_list(commodity)

    if companies:
        company_files = companies

    for company in company_files:
        try:
            this_company_data, df_c = process_company(df, company, commodity,
                                                      points, units, save)
            print("completed: " + company)
        except Exception:
            print("traffic error: " + company)
            raise
    # note: returns results for the last company processed in the loop
    return this_company_data, df_c
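
A minimal usage sketch (hedged: it assumes the SQL scripts, raw data, and helper functions referenced above are in place; the company name and the boolean `points` value are illustrative, not confirmed):

if __name__ == "__main__":
    # build gas throughput data from local files without saving output;
    # `points` is forwarded to process_company and is assumed boolean here
    company_data, df_company = process_throughput(points=True,
                                                  sql=False,
                                                  commodity='Gas',
                                                  companies=['TransCanada'])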
Example #2
def process_apportionment(save=False, sql=False, companies=False):

    if sql:
        df = get_data(os.getcwd(), "apportionment.sql", "PipelineInformation",
                      sql)
    else:
        print('reading local apportionment csv...')
        df = pd.read_csv("./raw_data/apportionment.csv")

    df = normalize_dates(df, ['Date'])
    df = normalize_text(df, ['Pipeline Name'])
    # enbridge processing
    df = df.drop(df[(df['Pipeline Name'] == 'EnbridgeMainline')
                    & (df['KeyPointID'].isin(['KP0016', 'KP0021']))].index)
    df = df.drop(df[(df['Pipeline Name'] == 'EnbridgeMainline')
                    & (df['Date'].dt.year < 2016)].index)
    # cochin processing
    df = df.drop(df[(df['Pipeline Name'] == 'Cochin')
                    & (df['KeyPointID'] != 'KP0018')].index)
    df = df[~df['Pipeline Name'].isin(
        ["SouthernLights", "Westpur", "TransNorthern"])].reset_index(drop=True)

    df = df.rename(columns={x: x.split("(")[0].strip() for x in df.columns})
    num_cols = [
        'Available Capacity', 'Original Nominations', 'Accepted Nominations',
        'Apportionment Percentage'
    ]
    df = normalize_numeric(df, num_cols, 2)
    df = conversion(df, "oil", num_cols[:-1], 2, False)
    df['Apportionment Percentage'] = df['Apportionment Percentage'].round(2)
    company_files = get_company_list("all")

    if companies:
        company_files = companies

    enbridge_points = get_enbridge_points(sql)
    df = sort_by_points(df)

    for company in company_files:
        try:
            this_company_data = process_company(df, company, enbridge_points,
                                                save)
            print("completed: " + company)
        except Exception:
            print("apportionment error: " + company)
            raise

    return this_company_data
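
The `conversion` helper called in the two examples above is not shown. Judging from the call sites (a commodity, a list of numeric columns, a rounding setting, and one more flag), a minimal sketch could look like the following; the metric-to-imperial factors are standard (1000 m3/d of gas ≈ 3.53147e-5 Bcf/d, 1000 m3/d of oil ≈ 6.2898 Mb/d, i.e. thousand barrels per day), but the signature is inferred, not confirmed:

GAS_FACTOR = 35.3147 / 1e6  # 1000 m3/d -> Bcf/d
OIL_FACTOR = 6.2898         # 1000 m3/d -> Mb/d

def conversion(df, commodity, cols, rounding, flag):
    # inferred sketch; the role of `flag` is unclear from the call sites
    factor = GAS_FACTOR if commodity.lower() == "gas" else OIL_FACTOR
    for col in cols:
        df[col] = df[col] * factor
        if rounding:
            df[col] = df[col].round(rounding)
    return df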
Example #3
def process_incidents(remote=False, companies=False, test=False):
    if remote:
        link = "https://www.cer-rec.gc.ca/open/incident/pipeline-incidents-data.csv"
        process_func = process_english
        print('downloading remote incidents file')
        # note: error_bad_lines was removed in pandas 2.0; newer versions
        # use on_bad_lines="skip" instead
        df = pd.read_csv(link,
                         skiprows=0,
                         encoding="latin-1",
                         engine="python",
                         error_bad_lines=False)
        df.to_csv("./raw_data/incidents_en.csv", index=False)
        df = process_func(df)

    elif test:
        print('reading test incidents file')
        path = "./raw_data/test_data/incidents_en.csv"
        process_func = process_english

        df = pd.read_csv(path,
                         skiprows=0,
                         encoding="utf-8",
                         error_bad_lines=False)
        df = process_func(df)

    else:
        print('reading local incidents file')
        path = "./raw_data/incidents_en.csv"
        process_func = process_english
        encoding = "latin-1"

        df = pd.read_csv(path,
                         skiprows=0,
                         encoding=encoding,
                         engine="python",
                         error_bad_lines=False)
        df = process_func(df)

    # initial data processing
    df['Approximate Volume Released'] = pd.to_numeric(
        df['Approximate Volume Released'], errors='coerce')

    df['Reported Date'] = pd.to_datetime(df['Reported Date'], errors='raise')

    for delete in [
            'Significant', 'Release Type', 'Nearest Populated Centre',
            'Reported Date'
    ]:
        del df[delete]

    if companies:
        company_files = companies
    else:
        company_files = get_company_list("all")

    for company in company_files:
        try:
            folder_name = company.replace(' ', '').replace('.', '')
            df_c = df[df['Company'] == company].copy().reset_index(drop=True)
            df_vol = df_c[df_c['Approximate Volume Released'].notnull()].copy()
            df_vol = df_vol.reset_index(drop=True)
            this_company_data = {}
            if not df_vol.empty:
                # calculate metadata here, before non-release incidents are filtered out
                meta = incident_meta_data(df, company)
                this_company_data['meta'] = meta
                for delete in [
                        'Incident Types', 'Company', 'why common',
                        'what common'
                ]:
                    del df_vol[delete]
                df_vol = optimize_json(df_vol)
                this_company_data['events'] = df_vol.to_dict(orient='records')
                this_company_data['meta']['build'] = True
                if not test:
                    with open(
                            '../data_output/incidents/' + folder_name +
                            '.json', 'w') as fp:
                        json.dump(this_company_data, fp)
            else:
                # there are no product release incidents
                this_company_data['events'] = df_vol.to_dict(orient='records')
                this_company_data['meta'] = {
                    "companyName": company,
                    "build": False
                }
                if not test:
                    with open(
                            '../data_output/incidents/' + folder_name +
                            '.json', 'w') as fp:
                        json.dump(this_company_data, fp)
            print("completed: " + company)
        except Exception:
            print("incidents error: " + company)

    # note: returns values from the last company processed in the loop
    return df_c, df_vol, meta
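
A usage sketch against the bundled test file (hedged: assumes ./raw_data/test_data/incidents_en.csv exists; the company name is illustrative):

df_c, df_vol, meta = process_incidents(
    test=True, companies=['NOVA Gas Transmission Ltd.'])
print(meta)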
Example #4
def process_remediation(sql=False,
                        remote=True,
                        companies=False,
                        test=False,
                        save=True):

    if test:
        print("reading test remediation test data")
        df = pd.read_csv(
            os.path.join(os.getcwd(), "raw_data", "test_data",
                         "remediation.csv"))
    elif remote:
        print("reading remote remediation file")
        df = pd.read_csv(
            "https://www.cer-rec.gc.ca/open/compliance/contamination.csv",
            encoding="latin-1",
            engine="python",
        )
        df.to_csv("./raw_data/remediation.csv")
    else:
        print("reading local remediation file")
        df = pd.read_csv("./raw_data/remediation.csv")

    contaminants = get_data(sql=sql,
                            script_loc=os.getcwd(),
                            query="remediationContaminants.sql",
                            db="dsql22cap")
    old = get_data(sql=sql,
                   script_loc=os.getcwd(),
                   query="remediation_pre_2018.sql",
                   db="dsql22cap")

    df = apply_contaminant_ids(df, contaminants)
    df["Contaminants at the Site"] = [["18"] if x == None else x
                                      for x in df["Contaminants at the Site"]]
    df["Site Within 30 Meters Of Waterbody"] = [
        True if x == "Yes" else False
        for x in df["Site Within 30 Meters Of Waterbody"]
    ]
    df = normalize_text(df, [
        'Applicable Land Use', 'Site Status', 'Activity At Time Of Discovery',
        'Pipeline Name', 'Facility Name'
    ])

    pipe_section = []
    na = "Not Specified"
    for pipe, section in zip(df['Pipeline Name'], df['Facility Name']):
        if pipe == na and section == na:
            pipe_section.append("ns")  # Not Specified
        elif pipe == na and section != na:
            pipe_section.append("f")  # Facility
        elif pipe != na and section == na:
            pipe_section.append("p")  # Pipeline
        else:
            # both pipe and section are specified
            pipe_section.append("pf")  # Pipeline and Facility

    df["ps"] = pipe_section
    del df['Pipeline Name']
    del df['Facility Name']

    # add ids
    land_use_ids = {
        "developed land - industrial": "dli",
        "developed land - small commercial": "dls",
        "developed land - residential": "dlr",
        "barren land": "bl",
        "shrub land": "sl",
        "vegetative barren": "vb",
        "forests": "f",
        "Agricultural Cropland": "ac",
        "water / wetlands": "w",
        "Tundra / Native Prairie / Parks": "t",
        "agricultural land": "al",
        "protected area": "pa",
        "non-developed land": "ndl"
    }

    status_ids = {
        "monitored": "m",
        "post-remediation monitoring": "prm",
        "facility monitoring": "fm",
        "ongoing remediation": "or",
        "site assessment": "sa",
        "risk managed": "rm"
    }

    activity_ids = {
        "maintenance": "m",
        "operation": "o",
        "construction": "c",
        "abandonment": "a"
    }

    df = idify(df, "Applicable Land Use", land_use_ids)
    df = idify(df, "Province", "region")
    df = idify(df, "Site Status", status_ids)
    df = idify(df, "Activity At Time Of Discovery", activity_ids)

    df['Final Submission Date'] = pd.to_datetime(df['Final Submission Date'])
    df['y'] = df['Final Submission Date'].dt.year

    # normalize missing values (None -> NaN) so the cleanup below is uniform
    df = df.fillna(value=np.nan)
    for ns in [
            'Applicable Land Use', 'Activity At Time Of Discovery',
            'Contaminants at the Site',
            'Initial Estimate of Contaminated Soil (m3)',
            'Site Within 30 Meters Of Waterbody', 'Site Status', 'Latitude',
            'Longitude'
    ]:
        df[ns] = [
            None if x in ["Not Specified", np.nan, "nan"] else x
            for x in df[ns]
        ]

    for numeric in [
            'Initial Estimate of Contaminated Soil (m3)', 'Latitude',
            'Longitude', 'y'
    ]:
        df[numeric] = df[numeric].replace(np.nan, -1)

    for int_numeric in ['y', 'Initial Estimate of Contaminated Soil (m3)']:
        df[int_numeric] = df[int_numeric].astype(int)

    df['loc'] = [[lat, long]
                 for lat, long in zip(df['Latitude'], df['Longitude'])]
    del df['Latitude']
    del df['Longitude']
    columns = {
        "Event ID": "id",
        "Site Status": "s",
        "Activity At Time Of Discovery": "a",
        "Province": "p",
        "Applicable Land Use": "use",
        "Contaminants at the Site": "c",
        "Initial Estimate of Contaminated Soil (m3)": "vol",
        "Site Within 30 Meters Of Waterbody": "w"
    }

    df = df.rename(columns=columns)
    # iterate over a copy of the column labels, since columns are deleted
    # inside the loop
    for col in list(df.columns):
        if col not in columns.values() and col not in [
                "Company Name", "Final Submission Date", "y", "ps", "loc"
        ]:
            del df[col]

    df['Company Name'] = df['Company Name'].replace(company_rename())
    df = apply_system_id(df, "Company Name")

    old["Company"] = old["Company"].replace(company_rename())
    old = apply_system_id(old, "Company")

    if companies:
        company_files = companies
    else:
        company_files = get_company_list("all")

    for company in company_files:
        try:
            folder_name = company.replace(' ', '').replace('.', '')
            df_c = df[df['Company Name'] == company].copy().reset_index(
                drop=True)
            this_company_data = {}

            if not df_c.empty:
                this_company_data["meta"] = meta(df_c, company, old)
                this_company_data["build"] = True
                this_company_data["data"] = optimize_json(df_c)
                if save and not test:
                    with open(
                            '../data_output/remediation/' + folder_name +
                            '.json', 'w') as fp:
                        json.dump(this_company_data, fp)
            else:
                this_company_data['data'] = df_c.to_dict(orient='records')
                this_company_data['meta'] = {"companyName": company}
                this_company_data["build"] = False
                if save and not test:
                    with open(
                            '../data_output/remediation/' + folder_name +
                            '.json', 'w') as fp:
                        json.dump(this_company_data, fp)
            print("completed: " + company)
        except Exception:
            print("remediation error: " + company)
            raise

    return df, this_company_data
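
`idify` shows up throughout this example but is not defined here. Based on the call sites (a column name plus either a short-id dict like status_ids, or the string "region"), a minimal sketch might be:

def idify(df, col, ids):
    # inferred sketch: shrink long category strings to short ids
    if isinstance(ids, dict):
        df[col] = df[col].replace(ids)
    elif ids == "region":
        # hypothetical province lookup; the real mapping is not shown
        df[col] = df[col].replace({"Alberta": "ab", "British Columbia": "bc",
                                   "Saskatchewan": "sk", "Ontario": "on"})
    return df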
Example #5
def process_conditions(remote=False,
                       sql=False,
                       non_standard=True,
                       company_names=False,
                       companies=False,
                       test=False,
                       save=True):
    if remote:
        print('downloading remote conditions file')
        link = 'http://www.cer-rec.gc.ca/open/conditions/conditions.csv'
        df = pd.read_csv(link,
                         encoding="latin-1",
                         error_bad_lines=True)
    elif test:
        print('reading test conditions data')
        df = pd.read_csv('./raw_data/test_data/conditions.csv',
                         encoding="UTF-16",
                         sep='\t')
    else:
        print('reading local conditions data')
        df = pd.read_csv('./raw_data/conditions_en.csv',
                         encoding="UTF-16",
                         sep='\t')

    df = normalize_text(df, ['Location', 'Short Project Name', 'Theme(s)'])

    for date_col in ['Effective Date', 'Issuance Date', 'Sunset Date']:
        df[date_col] = pd.to_datetime(df[date_col])

    if not non_standard:
        # only include non-standard conditions
        df = df[df['Condition Type'] != 'Standard']

    delete_cols = [
        'Condition', 'Condition Phase', 'Instrument Activity',
        'Condition Type', 'Condition Filing'
    ]

    for delete in delete_cols:
        del df[delete]

    for r in ['\n', '"']:
        df['Company'] = df['Company'].replace(r, '', regex=True)
    df['Company'] = [x.strip() for x in df['Company']]
    df['Condition Status'] = df['Condition Status'].astype('object')
    df['Condition Status'] = [str(x).strip() for x in df['Condition Status']]
    # preliminary processing
    df['Company'] = df['Company'].replace(company_rename())
    df = apply_system_id(df, "Company")

    df = df[df['Short Project Name'] != "SAM/COM"].copy().reset_index(
        drop=True)

    df = add_links(df, sql)
    if company_names:
        print(get_company_names(df['Company']))

    df, region_replace, project_names = idify_conditions(df, sql)
    regions_map = import_simplified(region_replace)

    if companies:
        company_files = companies
    else:
        company_files = get_company_list("all")

    for company in company_files:
        try:
            df_c, shp, dfmeta, meta = process_company(df, company,
                                                      project_names,
                                                      regions_map, test, save)
            print("completed: " + company)
        except Exception:
            print("conditions error: " + company)
            raise

    return df_c, shp, dfmeta, meta
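
A usage sketch (hedged: assumes the UTF-16 test CSV and the helpers above exist; the company name is illustrative):

df_c, shp, dfmeta, meta = process_conditions(test=True,
                                             save=False,
                                             companies=['TransCanada'])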
Example #6
def process_oandm(remote=False, companies=False, test=False):

    lang = "en"
    if remote:
        link = "https://can01.safelinks.protection.outlook.com/?url=https%3A%2F%2Fwww.cer-rec.gc.ca%2Fopen%2Foperations%2Foperation-and-maintenance-activity.csv&data=04%7C01%7CMichelle.Shabits%40cer-rec.gc.ca%7Cbbc3fece7b3a439e253908d8f9ec4eab%7C56e9b8d38a3549abbdfc27de59608f01%7C0%7C0%7C637534140608125634%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&sdata=HvG6KtuvEzJiNy4CZ4OyplKnfx2Zk5sPjUNNutoohic%3D&reserved=0"

        print('downloading remote oandm file')
        df = pd.read_csv(link,
                         skiprows=0,
                         encoding="latin-1",
                         engine="python",
                         error_bad_lines=False)

        df.to_csv("./raw_data/oandm_" + lang + ".csv", index=False)
    elif test:
        print('reading test oandm file')
        path = "./raw_data/test_data/oandm_en.csv"
        df = pd.read_csv(path,
                         skiprows=0,
                         encoding="utf-8",
                         error_bad_lines=False)

    else:
        print('reading local oandm file')
        # only English data is handled at the moment; fail early rather than
        # hitting a NameError on an undefined path below
        if lang == 'en':
            path = "./raw_data/oandm_en.csv"
            encoding = "utf-8"
        else:
            raise ValueError("unsupported language: " + lang)

        df = pd.read_csv(path,
                         skiprows=0,
                         encoding=encoding,
                         error_bad_lines=False)

    df = strip_cols(df)
    df = df.rename(columns={x: x.replace("\xa0", " ") for x in df.columns})
    df = df.replace({"Yes": "y", "No": "n"})
    # Event Number and nearest populated center should be deleted later
    # New Land Area Needed is probably the total land
    for delete in [
            'Company City', 'Company Postal Code',
            'Company Province/Territory', 'Circumstance(s)',
            'Result Of A Class Location Change',
            'Distance To Closest Building', 'Event Creation Date',
            'Submission Date', 'Pipeline Name', 'Pipeline Outside Diameter',
            'Pipeline Length', 'Commodity Carried', 'Facility Name',
            'Facility Type', 'New Permanent Land Needed',
            'Activity Acquiring New Private Land',
            'Activity Acquiring New Land Under Compliance',
            'Land Within Critical Habitat', 'Activity Crossing Water Body',
            'New Temporary Land Needed', 'Vehicle Crossing Count',
            'Provincial and federal authorities been consulted',
            'Activity Using Navigable Water',
            'Activity Following DFO Fish Measures For In Stream Work',
            'Navigable Water Frozen Or Dry',
            'Activity Following DFO Fish Measures For Crossing',
            'Ground Disturbance Near Water Required',
            'Navigable Water Activity Meeting Transport Canada Minor Works And Waters Order'
    ]:
        del df[delete]

    for date_col in df.columns:
        if "date" in date_col.lower():
            df[date_col] = pd.to_datetime(df[date_col])

    df['Company Name'] = df['Company Name'].replace(company_rename())
    df = apply_system_id(df, "Company Name")
    df = column_insights(df)
    df = df.rename(columns={
        "Species At Risk Present At Activity Site":
        "Species At Risk Present"
    })
    df = df[df['Commencement Date'].dt.year >= 2015].reset_index(drop=True)
    if companies:
        company_files = companies
    else:
        company_files = get_company_list("all")

    for company in company_files:
        try:
            folder_name = company.replace(' ', '').replace('.', '')
            df_c = df[df['Company Name'] == company].copy().reset_index(
                drop=True)
            df_c = df_c.drop_duplicates(subset=['Event Number'])
            this_company_data = {}
            if not df_c.empty:
                this_company_data["meta"] = metadata(df_c, company, test)
                this_company_data["build"] = True
                this_company_data["data"] = optimize_json(df_c)
                if not test:
                    with open('../data_output/oandm/' + folder_name + '.json',
                              'w') as fp:
                        json.dump(this_company_data, fp)
            else:
                # there are no o and m events
                this_company_data['data'] = df_c.to_dict(orient='records')
                this_company_data['meta'] = {"companyName": company}
                this_company_data["build"] = False
                if not test:
                    with open('../data_output/oandm/' + folder_name + '.json',
                              'w') as fp:
                        json.dump(this_company_data, fp)
            print("completed: " + company)
        except Exception:
            print("o&m error: " + company)
            raise
    return this_company_data
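
A usage sketch (hedged: nothing is written to disk when test=True; the company name is illustrative):

oandm_data = process_oandm(test=True, companies=['Enbridge Pipelines Inc.'])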
Example #7
def process_tolls_data(sql=True, companies=False, save=True, completed=None):
    # guard against the mutable-default-argument pitfall
    if completed is None:
        completed = []

    def generate_path_series(df, paths, series_col):
        path_series = []
        for path in paths:
            df_p = df[df["Path"] == path].copy().reset_index(drop=True)
            if not df_p.empty:
                path_series.append({
                    "pathName": path,
                    "series": process_path(df_p, series_col)
                })
        return path_series

    def find_series_col(df, company):
        products = sorted(list(set(df["Product"])))
        services = sorted(list(set(df["Service"])))
        units = list(set(df["Original Toll Unit"]))
        if len(products) > 1:
            product_filter = list(set(df["Product"]))
            product_filter = [[x, True] if x == "heavy crude" else [x, False]
                              for x in product_filter]
        else:
            product_filter = False

        if len(units) > 1:
            series_col = "Units"
            print("Multiple units for: " + company)
        elif len(products) > 1 and len(services) <= 1:
            series_col = "Product"
        elif len(services) > 1 and len(products) <= 1:
            series_col = "Service"
        elif len(services) <= 1 and len(products) <= 1:
            series_col = "Path"
        else:
            # multiple products and multiple services; filtering on two
            # columns is not supported, so default to Service
            series_col = "Service"
        # override series col if needed
        if company in ["Westcoast", "Keystone"]:
            series_col = "Path"

        return series_col, product_filter

    df, descriptions, toll_nums = get_tolls_data(sql)
    toll_nums = normalize_dates(toll_nums, ["s", "e"])
    df = normalize_text(df, [
        'Product', 'Path', 'Service', 'Original Toll Unit',
        'Converted Toll Unit'
    ])
    df = normalize_dates(df, ["Effective Start", "Effective End"])
    df = df[~df["Effective Start"].isnull()].copy().reset_index(drop=True)

    company_files = get_company_list()
    process_description(descriptions, save)

    if companies:
        company_files = companies
    for company in company_files:
        this_company_data = {}
        if company == "EnbridgeMainline":
            df_c = df[df["PipelineID"].isin(["EnbridgeMainline", "EnbridgeFSP", "EnbridgeLocal"])].copy().reset_index(drop=True)
        else:
            df_c = df[df["PipelineID"] == company].copy().reset_index(drop=True)

        (df_c, selected_paths, selectedService, path_filter, split_default,
         path_totals, decimals) = company_filter(df_c, company)
        meta = {"companyName": company}
        if not df_c.empty and company in completed:
            meta["build"] = True
            meta["pathTotals"] = path_totals
            meta["decimals"] = decimals
            paths = sorted(list(set(df_c["Path"])))
            services = sorted(list(set(df_c["Service"])))
            units = list(set(df_c["Original Toll Unit"]))
            meta["pathFilter"] = path_filter
            meta["split"] = {"default": split_default}
            if split_default:
                meta["split"]["buttons"] = list(set(df_c["split"]))
                path_series = {}
                meta["paths"], meta["seriesCol"], meta["products"], meta["services"], meta["units"], meta["tollNum"], meta["unitsFilter"] = {}, {}, {}, {}, {}, {}, {}
                if company == "EnbridgeMainline":
                    meta["splitDescription"] = {}
                else:
                    meta["splitDescription"] = False

                for split in list(set(df_c["split"])):
                    df_split = df_c[df_c["split"] == split].copy().reset_index(drop=True)
                    # add toll numbers
                    this_nums = toll_nums[toll_nums["PipelineID"] == list(df_split["PipelineID"])[0]].copy()
                    del this_nums["PipelineID"]
                    meta["tollNum"][split] = this_nums.to_dict(orient="records")
                    # add enbridge descriptions
                    if meta["splitDescription"] != False and split != "Enbridge Mainline":
                        current_definition = descriptions[descriptions["PipelineID"] ==list(df_split["PipelineID"])[0]]
                        meta["splitDescription"][split] = list(current_definition["Toll Description"])[0]


                    paths = sorted(list(set(df_split["Path"])))
                    services = sorted(list(set(df_split["Service"])))
                    units = list(set(df_split["Original Toll Unit"]))
                    series_col, product_filter = find_series_col(df_split, company)
                    if len(selected_paths) > 0:
                        meta["paths"][split] = [
                            [p, p in selected_paths[split]] for p in paths
                        ]
                    else:
                        meta["paths"][split] = [[p, True] for p in paths]
                    meta["products"][split] = product_filter
                    meta["seriesCol"][split] = series_col
                    meta["unitsFilter"][split] = units_filter(df_split)
                    if selectedService:
                        meta["services"][split] = [
                            [s, s == selectedService[split]] for s in services
                        ]
                    else:
                        meta["services"][split] = selectedService
                    meta["units"][split] = units
                    path_series[split] = generate_path_series(
                        df_split, paths, series_col)
            else:
                # add toll numbers
                this_nums = toll_nums[toll_nums["PipelineID"] == company].copy()
                del this_nums["PipelineID"]
                meta["tollNum"] = this_nums.to_dict(orient="records")
                series_col, product_filter = find_series_col(df_c, company)
                meta["products"] = product_filter
                meta["seriesCol"] = series_col
                meta["paths"] = [[p, True] if p in selected_paths else [p, False] for p in paths]
                meta["services"] = [[s, True] if s == selectedService else [s, False] for s in services]
                meta["units"] = units
                meta["unitsFilter"] = units_filter(df_c)
                path_series = generate_path_series(df_c, paths, series_col)

            this_company_data["meta"] = meta
            this_company_data["tolls"] = path_series
        else:
            meta["build"] = False
            this_company_data["meta"] = meta

        if save:
            with open('../data_output/tolls/' + company + '.json', 'w') as fp:
                json.dump(this_company_data, fp, default=str)

    return df_c, this_company_data
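
A usage sketch (hedged: assumes local tolls data is reachable with sql=False; "Keystone" is one of the systems special-cased in find_series_col above). Only companies listed in `completed` get a full build; everything else is written with build=False:

df_last, tolls_data = process_tolls_data(sql=False,
                                         companies=['Keystone'],
                                         save=False,
                                         completed=['Keystone'])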