Code Example #1
def main():
    #Create EIA API using your specific API key
    api_key = "3c109e8bc5897c4015d86e77e699ebc6"
    api = eia.API(api_key)
    #Declare desired series ID
    series_ID = 'TOTAL.SOT5PUS.A'
    global df
    df = retrieve_time_series(api, series_ID)
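Several of these examples call retrieve_time_series without defining it; a minimal sketch, assuming it simply wraps api.data_by_series in a DataFrame:

import pandas as pd
import eia

def retrieve_time_series(api, series_ID):
    #Sketch only: query the EIA API and return the series as a one-column DataFrame
    series_search = api.data_by_series(series=series_ID)
    return pd.DataFrame(series_search)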
Code Example #2
def __call_api(report, key="", series=None):
    api = eia.API(key)
    if series is None:
        series_search = api.data_by_series(
            series=commonData.dictEiaReports.get(report))
    else:
        series_search = api.data_by_series(series=series)

    result = pd.DataFrame(series_search)
    return result
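commonData.dictEiaReports is project-specific and not reproduced here; presumably it maps a friendly report name to an EIA series ID, e.g. (illustrative guess, not the project's actual mapping):

#Hypothetical shape of commonData.dictEiaReports
dictEiaReports = {
    "weekly_crude_stocks": "PET.WCESTUS1.W",
    "wti_spot_price": "PET.RWTC.D",
}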
Code Example #3
File: plot2graphs.py Project: ThorbjoernJonsson/EIA
def main():
    api_key = "YOUR_API_KEY_HERE"
    api = eia.API(api_key)
    #The eia package expects a bare series ID, not the full API URL
    #Crude oil stocks at Cushing, OK (total US stocks would be PET.WCESTUS1.W)
    series_ID1 = 'PET.W_EPC0_SAX_YCUOK_MBBL.W'
    #WTI crude oil daily spot price
    series_ID2 = 'PET.RWTC.D'
    df1 = retrieve_time_series(api, series_ID1)
    df2 = retrieve_time_series(api, series_ID2)
    plot_time_series(df1, df2)
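plot_time_series is defined elsewhere in the project; a minimal sketch, assuming matplotlib and one data column per frame:

import matplotlib.pyplot as plt

def plot_time_series(df1, df2):
    #Sketch only: stack the two retrieved series on a shared x-axis
    fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True)
    ax1.plot(df1.index, df1.iloc[:, 0])
    ax1.set_title(df1.columns[0])
    ax2.plot(df2.index, df2.iloc[:, 0])
    ax2.set_title(df2.columns[0])
    plt.tight_layout()
    plt.show()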
Code Example #4
def main():
    """
    Run main script
    """
    #Create EIA API using your specific API key
    api_key = "YOUR API KEY HERE"
    api = eia.API(api_key)
    #Declare desired series ID
    series_ID = 'EMISS.CO2-TOTV-TT-NG-TX.A'
    df = retrieve_time_series(api, series_ID)
    #Print the returned dataframe df
    print(df)
Code Example #5
def main():
    api_key = "5fbe8e00551266c048f84d7d28961828"
    api = eia.API(api_key)
    series_ID = 'EBA.PSEI-ALL.D.HL'
    df = retrieve_time_series(api, series_ID)

    # Cleaning the data
    df.reset_index(level=0, inplace=True)
    df.rename(columns={
        'index': 'Date',
        df.columns[1]: 'Electricity Demand'
    },
              inplace=True)
    df['Hour'] = df['Date'].str[10:12]
    df['Date'] = pd.to_datetime(df['Date'].str[:-9], format='%Y %m%d')
    #Add the hour back in (astype('timedelta64[h]') fails on string hours)
    df['Date'] = df['Date'] + pd.to_timedelta(df['Hour'].astype(int), unit='h')
Code Example #6
df_BAA = pd.read_excel('data/BA_Codes_930.xlsx', sheet_name='Table 1')
df_BAA.drop(df_BAA.index[:3], inplace=True)
df_BAA.rename(columns={
    'HOURLY AND DAILY BALANCING AUTHORITY': 'BAA_Acronym',
    'Unnamed: 1': 'BAA_Name',
    'Unnamed: 2': 'NRC_ID',
    'Unnamed: 3': 'Region'
},
              inplace=True)
BAA = np.array(df_BAA['BAA_Acronym'])  # pd.np was removed in pandas 1.0

#%%
#Use the EIA python call to pull an example dataset from the EIA API
#Use this example dataset to figure out how to format dates
api_key = "d365fe67a9ec71960d69102951ae474f"
api = eia.API(api_key)
series_search = api.data_by_series(series='EBA.PJM-ALL.TI.H')
df = pd.DataFrame(series_search)
df.index

#Convert dataframe index of date strings to a list of date strings for processing
date_list = df.index.tolist()

#Remove the last three characters from date strings like '2015 0701T05Z 01';
#datetime won't parse the duplicated day of month, so the trailing '01' has to go
dates_trimmed = [x[:-3] for x in date_list]

#Use datetime.strptime to parse each string into a datetime object
dates_formatted = [
    datetime.strptime(date, '%Y %m%dT%HZ') for date in dates_trimmed
]
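A natural next step (not shown in the scraped snippet) is to swap the parsed datetimes back in as the frame's index:

#Hedged continuation: replace the raw string index with the parsed datetimes
df.index = pd.DatetimeIndex(dates_formatted)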
Code Example #7
def main():
    api_key = "YOUR_API_KEY_HERE"
    api = eia.API(api_key)
    #The eia package expects a bare series ID, not the full API URL
    series_ID = 'PET.WCESTUS1.W'
    df = retrieve_time_series(api, series_ID)
    plot_time_series(df)
Code Example #8
import pandas as pd

import eia

import utils
import api_keys

if __name__ == "__main__":
    eia_api = eia.API(api_keys.eia)
    time_series_label = "TOTAL.COEXPUS.M"  # (C)rude (O)il (E)xport - (M)onthly
    eia_data = pd.DataFrame(eia_api.data_by_series(series=time_series_label))
    utils.clean_EIA_series(eia_data, column_label=time_series_label)
    utils.plot_time_series(
        eia_data,
        x_label="Date",
        x_unit="Year",
        y_label="U.S Exports of Crude Oil (Monthly)",
        y_unit="Thousand Barrels Per Day",
        column_name=time_series_label,
    )

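utils.clean_EIA_series lives in the project's utils module and is not reproduced here; a guess at its job, based on how it is called (the '%Y %m' key format is an assumption about the EIA monthly index strings):

def clean_EIA_series(df, column_label):
    #Hypothetical sketch: parse the EIA monthly index and keep the given label
    df.index = pd.to_datetime(df.index.str.strip(), format="%Y %m",
                              errors="coerce")
    df.rename(columns={df.columns[0]: column_label}, inplace=True)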
Code Example #9
def getEIAData(series_ID):
    api_key = "776d3a3fe9bf6dfbd47b9141d0059f79"
    api = eia.API(api_key)
    df = retrieve_time_series(api, series_ID)
    return df
Code Example #10
#pip install EIA-python
#pip install networkx
import numpy as np
import pandas as pd
import eia
import networkx as nx
import matplotlib.pyplot as plt

#Get API key from EIA website and pass into eia.API() method
apiKey = "5f54b3e66477e22ec068066b1de8026d"
api = eia.API(apiKey)

series_id_list = [
    "INTL.57-1-DZA-TBPD.M", "INTL.57-1-AGO-TBPD.M", "INTL.57-1-COG-TBPD.M",
    "INTL.57-1-COD-TBPD.M", "INTL.57-1-ECU-TBPD.M", "INTL.57-1-GNQ-TBPD.M",
    "INTL.57-1-GAB-TBPD.M", "INTL.57-1-IRN-TBPD.M", "INTL.57-1-IRQ-TBPD.M",
    "INTL.57-1-KWT-TBPD.M", "INTL.57-1-LBY-TBPD.M", "INTL.57-1-NGA-TBPD.M",
    "INTL.57-1-QAT-TBPD.M", "INTL.57-1-RUS-TBPD.M", "INTL.57-1-SAU-TBPD.M",
    "INTL.57-1-ARE-TBPD.M", "INTL.57-1-VEN-TBPD.M", "INTL.57-1-USA-TBPD.M"
]

#Call api.data_by_series() for each series ID and collect the results into one dataframe
df_list = [
    pd.DataFrame(api.data_by_series(series)) for series in series_id_list
]
oil_data = pd.concat(df_list, axis=1)

#Replace the API's '--' placeholders with NaN, then drop incomplete rows
oil_data = oil_data.replace("--", np.nan)
oil_data_reduced = oil_data.dropna()
oil_data_reduced
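The snippet imports networkx but stops before using it; a plausible continuation (an assumption, not the original project's code) links producers whose monthly output is strongly correlated:

#Assumed continuation: build a correlation network of producers
corr = oil_data_reduced.astype(float).corr()
G = nx.Graph()
for a in corr.columns:
    for b in corr.columns:
        if a < b and corr.loc[a, b] > 0.9:  # arbitrary threshold
            G.add_edge(a, b)
nx.draw(G, with_labels=True, font_size=6)
plt.show()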
Code Example #11
File: metadata_fetch.py Project: charlie9578/OpenOA
def fetch_eia(api_key, plant_id, file_path):
    """
    Read in EIA data of wind farm of interest
    - from EIA API for monthly productions, return monthly net energy generation time series
    - from local Excel files for wind farm metadata, return dictionary of metadata

    Args:
        api_key(:obj:`string`): 32-character user-specific API key, obtained from EIA
        plant_id(:obj:`string`): 5-character EIA power plant code
        file_path(:obj:`string`): directory with EIA metadata .xlsx files in 2017

    Returns:
        :obj:`pandas.Series`: monthly net energy generation in MWh
        :obj:`dictionary`: metadata of the wind farm with 'plant_id'

    """

    # EIA metadata

    plant_var_list = [
        "City",
        "Latitude",
        "Longitude",
        "Balancing Authority Name",
        "Transmission or Distribution System Owner",
    ]

    wind_var_list = [
        "Utility Name",
        "Plant Name",
        "State",
        "County",
        "Nameplate Capacity (MW)",
        "Operating Month",
        "Operating Year",
        "Number of Turbines",
        "Predominant Turbine Manufacturer",
        "Predominant Turbine Model Number",
        "Turbine Hub Height (Feet)",
    ]

    def meta_dic_fn(metafile, sheet, var_list):
        all_plant = pd.read_excel(file_path + metafile,
                                  sheet_name=sheet,
                                  skiprows=1)

        # np.int was removed in NumPy 1.24; the builtin int works here
        eia_plant = all_plant.loc[all_plant["Plant Code"] == int(
            plant_id)]  # specific wind farm

        if eia_plant.shape[0] == 0:  # Couldn't locate EIA ID in database
            raise Exception("Plant ID not found in EIA database")

        eia_info = eia_plant[var_list]  # select column
        eia_info = eia_info.reset_index(drop=True)  # reset index to 0
        eia_dic = eia_info.T.to_dict()  # convert to dictionary
        out_dic = eia_dic[
            0]  # remove extra level of dictionary, "0" in this case

        return out_dic

    # file path with 2017 EIA metadata files
    plant_dic = meta_dic_fn("2___Plant_Y2017.xlsx", "Plant", plant_var_list)
    wind_dic = meta_dic_fn("3_2_Wind_Y2017.xlsx", "Operable", wind_var_list)

    # convert feet to meter
    hubheight_meter = np.round(
        unit_conversion.convert_feet_to_meter(
            wind_dic["Turbine Hub Height (Feet)"]))
    wind_dic.update({"Turbine Hub Height (m)": hubheight_meter})
    wind_dic.pop("Turbine Hub Height (Feet)",
                 None)  # delete hub height in feet
    out_dic = plant_dic.copy()
    out_dic.update(wind_dic)  # append dictionary

    # EIA monthly energy production data

    api = eia.API(api_key)  # get data from EIA

    series_search_m = api.data_by_series(series="ELEC.PLANT.GEN.%s-ALL-ALL.M" %
                                         plant_id)
    eia_monthly = pd.DataFrame(
        series_search_m)  # net monthly energy generation of wind farm in MWh
    eia_monthly.columns = ["eia_monthly_mwh"]  # rename column
    eia_monthly = eia_monthly.set_index(pd.DatetimeIndex(
        eia_monthly.index))  # convert to DatetimeIndex

    return eia_monthly, out_dic
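A hypothetical call (the plant ID and directory are illustrative placeholders, not from the project):

#Hypothetical usage of fetch_eia
eia_monthly, plant_meta = fetch_eia(api_key="YOUR_32_CHAR_EIA_KEY",
                                    plant_id="56128",
                                    file_path="data/eia_metadata/")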
Code Example #12
def main():
    """
    Run main script
    """
    #Create EIA API using your specific API key
    api_key = "YOR API KEY HERE"
    api = eia.API(api_key)

    #Pull the electricity price data
    series_ID = 'ELEC.PRICE.TX-ALL.M'
    electricity_df = retrieve_time_series(api, series_ID)
    electricity_df.reset_index(level=0, inplace=True)
    #Rename the columns for easier analysis
    electricity_df.rename(columns={
        'index': 'Date',
        electricity_df.columns[1]: 'Electricity_Price'
    },
                          inplace=True)
    #Convert the Date column into a date object
    electricity_df['Date'] = pd.to_datetime(electricity_df['Date'])
    #Set Date as a Pandas DatetimeIndex
    electricity_df.index = pd.DatetimeIndex(electricity_df['Date'])
    #Decompose the time series into parts
    decompose_time_series(electricity_df['Electricity_Price'])

    #Pull in natural gas time series data
    series_ID = 'NG.N3035TX3.M'
    nat_gas_df = retrieve_time_series(api, series_ID)
    nat_gas_df.reset_index(level=0, inplace=True)
    #Rename the columns
    nat_gas_df.rename(columns={
        'index': 'Date',
        nat_gas_df.columns[1]: 'Nat_Gas_Price_MCF'
    },
                      inplace=True)
    #Convert the Date column into a date object
    nat_gas_df['Date'] = pd.to_datetime(nat_gas_df['Date'])
    #Set Date as a Pandas DatetimeIndex
    nat_gas_df.index = pd.DatetimeIndex(nat_gas_df['Date'])
    #Decompose the time series into parts
    decompose_time_series(nat_gas_df['Nat_Gas_Price_MCF'])

    #Merge the two time series together based on Date Index
    master_df = pd.merge(electricity_df['Electricity_Price'],
                         nat_gas_df['Nat_Gas_Price_MCF'],
                         left_index=True,
                         right_index=True)
    master_df.reset_index(level=0, inplace=True)

    #Plot the two variables in the same plot
    plt.plot(master_df['Date'],
             master_df['Electricity_Price'],
             label="Electricity_Price")
    plt.plot(master_df['Date'],
             master_df['Nat_Gas_Price_MCF'],
             label="Nat_Gas_Price")
    # Place a legend to the right of this smaller subplot.
    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
    plt.title('Natural Gas Price vs. TX Electricity Price over Time')
    plt.show()

    #Transform the columns using natural log
    master_df['Electricity_Price_Transformed'] = np.log(
        master_df['Electricity_Price'])
    master_df['Nat_Gas_Price_MCF_Transformed'] = np.log(
        master_df['Nat_Gas_Price_MCF'])

    #In order to make the time series stationary, difference the data by 1 month
    n = 1
    master_df['Electricity_Price_Transformed_Differenced'] = master_df[
        'Electricity_Price_Transformed'] - master_df[
            'Electricity_Price_Transformed'].shift(n)
    master_df['Nat_Gas_Price_MCF_Transformed_Differenced'] = master_df[
        'Nat_Gas_Price_MCF_Transformed'] - master_df[
            'Nat_Gas_Price_MCF_Transformed'].shift(n)

    #Run each differenced time series through the Augmented Dickey-Fuller test
    print('Augmented Dickey-Fuller Test: Electricity Price Time Series')
    augmented_dickey_fuller_statistics(
        master_df['Electricity_Price_Transformed_Differenced'].dropna())
    print('Augmented Dickey-Fuller Test: Natural Gas Price Time Series')
    augmented_dickey_fuller_statistics(
        master_df['Nat_Gas_Price_MCF_Transformed_Differenced'].dropna())

    #Convert the dataframe to a numpy array
    master_array = np.array(master_df[[
        'Electricity_Price_Transformed_Differenced',
        'Nat_Gas_Price_MCF_Transformed_Differenced'
    ]].dropna())

    #Generate a training and test set for building the model: 95/5 split
    training_set = master_array[:int(0.95 * (len(master_array)))]
    test_set = master_array[int(0.95 * (len(master_array))):]

    #Fit to a VAR model
    model = VAR(endog=training_set)
    model_fit = model.fit()
    #Print a summary of the model results
    print(model_fit.summary())

    #Compare the forecasted results to the real data
    #(forecast() takes the last k_ar observations as its starting values)
    prediction = model_fit.forecast(training_set[-model_fit.k_ar:],
                                    steps=len(test_set))

    #Merge the array data back into the master dataframe, and un-difference and back-transform
    data_with_predictions = pd.DataFrame(np.vstack(
        (training_set, prediction))).rename(
            columns={
                0: 'Electricity_Price_Transformed_Differenced_PostProcess',
                1: 'Nat_Gas_Price_MCF_Transformed_Differenced_PostProcess'
            })
    #Define which data is predicted and which isn't in the 'Predicted' column
    data_with_predictions.loc[:, 'Predicted'] = 1
    data_with_predictions.loc[(data_with_predictions.index >= 0) &
                              (data_with_predictions.index <=
                               (len(training_set) - 1)), 'Predicted'] = 0

    #Add a row of NaN at the beginning of the df
    data_with_predictions.loc[-1] = [None, None, None]  # adding a row
    data_with_predictions.index = data_with_predictions.index + 1  # shifting index
    data_with_predictions.sort_index(inplace=True)
    #Add back into the original dataframe
    master_df.loc[:,
                  'Electricity_Price_Transformed_Differenced_PostProcess'] = data_with_predictions[
                      'Electricity_Price_Transformed_Differenced_PostProcess']
    master_df.loc[:, 'Predicted'] = data_with_predictions['Predicted']

    #Un-difference the data
    for i in range(1, len(master_df.index) - 1):
        master_df.at[i, 'Electricity_Price_Transformed'] = master_df.at[
            i - 1, 'Electricity_Price_Transformed'] + master_df.at[
                i, 'Electricity_Price_Transformed_Differenced_PostProcess']

    #Back-transform the data
    master_df.loc[:, 'Predicted_Electricity_Price'] = np.exp(
        master_df['Electricity_Price_Transformed'])

    #Compare the forecasted data to the real data
    print(master_df[master_df['Predicted'] == 1][[
        'Date', 'Electricity_Price', 'Predicted_Electricity_Price'
    ]])

    #Evaluate the accuracy of the results after un-differencing and back-transformation
    calculate_model_accuracy_metrics(
        list(master_df[master_df['Predicted'] == 1]['Electricity_Price']),
        list(master_df[master_df['Predicted'] == 1]
             ['Predicted_Electricity_Price']))
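decompose_time_series, augmented_dickey_fuller_statistics, and calculate_model_accuracy_metrics are not reproduced on this page; minimal sketches, assuming monthly data and the usual statsmodels/sklearn tooling:

import matplotlib.pyplot as plt
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller
from sklearn.metrics import mean_absolute_error, mean_squared_error

def decompose_time_series(series):
    #Sketch: split into trend/seasonal/residual (period=12 assumes monthly data)
    result = seasonal_decompose(series.dropna(), model='additive', period=12)
    result.plot()
    plt.show()

def augmented_dickey_fuller_statistics(series):
    #Sketch: print the ADF test statistic and p-value
    stat, p_value = adfuller(series.values)[:2]
    print('ADF Statistic: %f' % stat)
    print('p-value: %f' % p_value)

def calculate_model_accuracy_metrics(actual, predicted):
    #Sketch: standard error metrics for the back-transformed forecast
    print('MAE:', mean_absolute_error(actual, predicted))
    print('RMSE:', mean_squared_error(actual, predicted) ** 0.5)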
Code Example #13
class UpdateParams:
    """
    The intention of this class is to obtain the latest LCOH parameters that
    are from online databases.
    Expects inputs of NG, PETRO, or COAL

    NG -> industrial price ($ per thousand cubic feet)
    COAL -> other industrial use price ($ per short ton)
    Petroleum -> residual fuel oil prices by area -> wholesale/resale price by
    all sellers annual ($ per gallon)
    
    CHECK UNITS BETWEEN SERIES - HENRY HUB, etc.

    """

    path = "./calculation_data"

    today = datetime.datetime.now()

    eia_api_key = "68ea6b4094e685e32ec986a8053568d9"

    api = eia.API(eia_api_key)

    eerc_esc = pd.read_csv(os.path.join(path, "EERC_Fuel_Esc.csv"),
                           index_col=["State"])

    @staticmethod
    def get_max_fp(state_abbr, fuel_type="NG", year=False):
        """Obtains max state-level fuel price"""

        if (not year):

            year = UpdateParams.today.year

        if fuel_type.upper() == "NG":

            series_ID = "NG.N3035" + state_abbr + "3.A"

        elif fuel_type.upper() == "COAL":

            series_ID = "COAL.COST." + state_abbr + "-10.A"

        elif fuel_type.upper() == "PETRO":

            series_ID = "PET.EMA_EPPR_PWA_S" + state_abbr + "_DPG.A"

        else:
            raise AssertionError("Please input a valid fuel_type")

        # Check if state-level available, if not return USA price
        try:
            fuel_series = UpdateParams.api.data_by_series(series=series_ID)

            dict_key = list(fuel_series.keys())[0]

            # if fuel price in state is empty return national price
            if all(v is None for v in list(fuel_series[dict_key].values())):

                return 0.0

        except KeyError:

            return 0.0

        j = 0

        while True:

            try:
                return fuel_series[dict_key][str(year - j) + "  "] / 1.0

            except (KeyError, TypeError):

                j += 1

    @staticmethod
    def get_fuel_price(state_abbr, fuel_type="NG", year=False):
        """Obtain fuel avgs on the annul, state scale from the EIA database."""

        if (not year):

            year = UpdateParams.today.year

        if fuel_type.upper() == "NG":

            series_ID = "NG.N3035" + state_abbr + "3.A"

            series_USA = "NG.RNGWHHD.A"

            series_LA = UpdateParams.api.data_by_series(series="NG.N3035" +
                                                        "LA" + "3.A")

            dict_key_LA = list(series_LA.keys())[0]

        elif fuel_type.upper() == "COAL":

            series_ID = "COAL.COST." + state_abbr + "-10.A"

            series_USA = "COAL.COST.US-10.A"

        elif fuel_type.upper() == "PETRO":
            # state level wholesale/resale price data ends 2011
            series_ID = "PET.EMA_EPPR_PWA_S" + state_abbr + "_DPG.A"

            series_USA = "PET.EMA_EPPR_PWG_NUS_DPG.A"

        else:
            raise AssertionError("Please input a valid fuel_type")

        fuel_series_USA = UpdateParams.api.data_by_series(series=series_USA)

        dict_key_USA = list(fuel_series_USA.keys())[0]

        # find latest USA value
        i = 0

        while True:

            try:
                fp_USA = fuel_series_USA[dict_key_USA][str(year - i) +
                                                       "  "] / 1.0

                break

            except (KeyError, TypeError):

                i += 1

        # Check if state-level available, if not return USA price
        try:
            fuel_series = UpdateParams.api.data_by_series(series=series_ID)

            dict_key = list(fuel_series.keys())[0]

            # if fuel price in state is empty return national price
            if all(v is None for v in list(fuel_series[dict_key].values())):

                return (fp_USA, year - i)

        except KeyError:

            return (fp_USA, year - i)

        j = 0

        # find latest year for state
        while True:

            try:
                fp_state = fuel_series[dict_key][str(year - j) + "  "] / 1.0

                break

            except (KeyError, TypeError):

                j += 1

        if fuel_type.upper() == "NG":
            # series_LA is just the actual series not a series ID
            fp_mult = fp_state / series_LA[dict_key_LA][str(year - j) + "  "]
            return (fp_mult * fp_USA / 1.037, year - j)

        # return USA value if 2 years more recent vs state
        if ((year - i) - (year - j) >= 2) | (fp_state >= fp_USA):

            return (fp_USA / 1.037, year - i)

        return (fp_state, year - j)

    @staticmethod
    def get_esc(state_abbr, fuel_type="NG"):
        """Grabs fuel esc from EERC"""

        temp_dict = {"NG": "Natural Gas", "COAL": "Coal", "PETRO": "Residual"}

        return UpdateParams.eerc_esc.loc[state_abbr, temp_dict[fuel_type]]

    @staticmethod
    def create_index():
        """ 
        https://fred.stlouisfed.org/series/WPU061 - producer price index csv
    
        https://www.chemengonline.com/pci - chemical eng cost index - by year
    
        """
        path = UpdateParams.path

        def remove_nonnumeric(string):
            dummy_var = float(re.sub(r'[^\d.]', '', string))
            return dummy_var

        def get_CE_index():

            cost_dict = {}

            index_list = [
                "CE INDEX", "Equipment", "Heat Exchangers and Tanks",
                "Process Machinery", "Pipe, valves and fittings",
                "Process Instruments", "Pumps and Compressors",
                "Electrical equipment", "Structural supports",
                "Construction Labor", "Buildings", "Engineering Supervision"
            ]

            # grab raw txt
            with open(os.path.join(path, "cost_index.txt"), "r") as file:
                text = file.read()

            # modify initial year here
            data = text.split("1978")

            # Remove the initial few words
            data.pop(0)
            cost_dict['1978'] = data[0:12]
            del data[0:12]
            data = data[0]

            # Go through text and grab data points as a function of year
            for i in range(1979, 2019):
                data = data.split(str(i))
                data.pop(0)
                cost_dict[str(i)] = data[0:12]
                del data[0:12]
                data = data[0]

            df = pd.DataFrame(cost_dict, index=index_list)

            return df.applymap(remove_nonnumeric)

        ce_index = get_CE_index()

        def get_ppi_inds():
            """https://www.bls.gov/developers/api_signature_v2.htm"""
            # noyears is the maximum number of years you can pull from api in 1 query
            noyears = 20

            # the last year that you want the data from - default is this year -1
            endyear = UpdateParams.today.year - 1

            # the first year that you want the data from, if not available NaN will be the value
            startyear = 1970

            noyear_list = [noyears] * ((endyear - startyear) // noyears) + [
                (endyear - startyear) % noyears + 1
            ]
            year_tracker = endyear

            df = pd.DataFrame(
                columns=list(map(str, list(range(startyear, endyear + 1)))))

            for noyears in noyear_list:

                headers = {'Content-type': 'application/json'}
                # please label your series as "PPI series id" : "df label name"
                series_list = \
                OrderedDict({
                        'WPU061': "Industrial Chemicals",
                        "PCU33241033241052": "Boiler",
                        "PCU333994333994": "Furnace",
                        "PCU333414333414": "Solar Field",
                        "PCU33361133361105": "CHP",
                        "WPU10250105": "Aluminum",
                        "WPU11790105": "BatteryStorage"
                        })
                data = json.dumps({
                    "seriesid":
                    list(series_list.keys()),
                    "annualaverage":
                    "true",
                    "startyear":
                    str(year_tracker - noyears + 1),
                    "endyear":
                    str(year_tracker),
                    "registrationkey":
                    "2ad8d1d2aa574a05a389c070bee5e070"
                })
                p = requests.post(
                    'https://api.bls.gov/publicAPI/v2/timeseries/data/',
                    data=data,
                    headers=headers)
                json_data = json.loads(p.text)

                pd_dict = {}

                for i in range(len(json_data["Results"]["series"])):
                    series_id = json_data["Results"]["series"][i]["seriesID"]
                    pd_dict[series_id] = [
                        j for j in json_data["Results"]["series"][i]["data"]
                        if j["periodName"] == "Annual"
                    ]

                for i in series_list.keys():
                    ser_vals = [j["value"] for j in pd_dict[i][::-1]]
                    ser_vals = [float("nan")
                                ] * (noyears - len(ser_vals)) + ser_vals
                    df.loc[series_list[i],
                           list(
                               map(
                                   str,
                                   list(
                                       range(year_tracker - noyears +
                                             1, year_tracker +
                                             1))))] = ser_vals

                year_tracker -= noyears

            return df

        ppi_index = get_ppi_inds()

        comb_index = pd.concat([ce_index, ppi_index], join="outer", sort=True)

        comb_index.to_csv(os.path.join(path, "cost_index_data.csv"))
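Because the methods are called on the class itself, a hypothetical invocation (the state and fuel type are the caller's choice) looks like:

#Hypothetical usage of UpdateParams
price, latest_year = UpdateParams.get_fuel_price("TX", fuel_type="NG")
escalation = UpdateParams.get_esc("TX", fuel_type="NG")
print(price, latest_year, escalation)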
Code Example #14
    def __init__(self, **kwargs):
        """Initialize, loading data."""
        import eia
        import googlemaps
        import zillow
        from shapely.geometry import Polygon

        self._p = kwargs.get('logger')

        load_canopy_polys = kwargs.get('load_canopy_polys', True)

        self._dir_name = os.path.dirname(os.path.realpath(__file__))

        # Load meta.json.
        self.prt('Loading meta file...')
        meta_path = os.path.join(self._dir_name, '..', 'meta.json')
        if os.path.exists(meta_path):
            with open(meta_path, 'r') as f:
                meta = json.load(f)
            self._imgs_mean = meta['mean']
            self._imgs_std = meta['std']
            self._train_count = meta['train_count']

        # Load parcel data.
        self._parcels_fname = self.download_file(self._PARCELS_URL)

        with open(self._parcels_fname, 'r') as f:
            self._parcels = json.load(f)

        self._parcel_polygons = [
            x.get('geometry', {}).get('coordinates', [])
            for x in self._parcels.get('features', [])
        ]

        self._parcel_polygons = list(
            filter(None, ([[y for y in x if len(y) >= 3]
                           for x in self._parcel_polygons])))

        self._parcel_polygons = [[Polygon(y) for y in x if len(y) >= 3]
                                 for x in self._parcel_polygons]

        with open(os.path.join(self._dir_name, '..', 'google.key'), 'r') as f:
            self._google_key = f.readline().strip()
        self._google_client = googlemaps.Client(key=self._google_key)

        with open(os.path.join(self._dir_name, '..', 'eia.key'), 'r') as f:
            self._eia_key = f.readline().strip()
        self._eia_client = eia.API(self._eia_key)

        with open(os.path.join(self._dir_name, '..', 'zillow.key'), 'r') as f:
            self._zillow_key = f.readline().strip()
        self._zillow_client = zillow.ValuationApi()

        self._cropsize = self._INPUT_IMG_SIZE - 2 * self._CROPPIX

        # Load canopy data.
        if not load_canopy_polys:
            return

        self._canopy_fname = os.path.join(self._dir_name, '..', 'geo',
                                          'ENVIRONMENTAL_TreeCanopy2014.json')
        with open(self._canopy_fname, 'r') as f:
            self._canopies = json.load(f)

        raw_canpols = [
            x.get('geometry', {}).get('coordinates', [])
            for x in self._canopies.get('features', [])
            if x.get('geometry', {}) is not None
        ]

        raw_canpols = list(
            filter(None,
                   ([[y for y in x if len(y) >= 3] for x in raw_canpols])))

        raw_canpols = [[
            Polygon([(a, b + self._LAT_OFFSET) for a, b in y]) for y in x
            if len(y) >= 3
        ] for x in raw_canpols]

        self._canopy_polygons = []
        for canpoly in tqdm(raw_canpols, desc='Extracting canopy polygons'):
            cps = [x.buffer(0) for x in canpoly]
            # extend() mutates the list in place and returns None, so don't assign it
            self._canopy_polygons.extend([
                a for b in [
                    list(x.geoms) if 'multi' in str(type(x)).lower() else [x]
                    for x in cps
                ] for a in b
            ])

        self._model = None
Code Example #15
def main():
    """
    Run main script
    """
    #Create EIA API using your specific API key
    api_key = 'API KEY HERE'
    api = eia.API(api_key)

    #Pull the electricity price data
    series_ID = 'EBA.TEX-ALL.D.H'
    electricity_demand_df = retrieve_time_series(api, series_ID)
    electricity_demand_df.reset_index(level=0, inplace=True)
    #Rename the columns for easier analysis
    electricity_demand_df.rename(columns={
        'index':
        'Date_Time',
        electricity_demand_df.columns[1]:
        'Electricity_Demand_MWh'
    },
                                 inplace=True)
    #Format the 'Date' column
    electricity_demand_df['Date_Time'] = electricity_demand_df[
        'Date_Time'].astype(str).str[:-4]
    #Remove the 'T' from the Date column
    electricity_demand_df['Date_Time'] = electricity_demand_df[
        'Date_Time'].str.replace('T', ' ')
    #Convert the Date column into a date object
    electricity_demand_df['Date_Time'] = pd.to_datetime(
        electricity_demand_df['Date_Time'], format='%Y %m%d %H')
    #Convert from UTC to Central Standard Time
    electricity_demand_df['Date_Time'] = electricity_demand_df[
        'Date_Time'].dt.tz_localize('UTC')
    electricity_demand_df['Date_Time'] = pd.to_datetime(
        electricity_demand_df['Date_Time'].dt.tz_convert(
            'US/Central').dt.strftime("%Y-%m-%d %H:%M:%S"))

    #Plot the data on a yearly basis, using 2019 as an example year
    plot_data(df=electricity_demand_df[
        (electricity_demand_df['Date_Time'] >= pd.to_datetime('2019-01-01'))
        & (electricity_demand_df['Date_Time'] < pd.to_datetime('2020-01-01'))],
              x_variable='Date_Time',
              y_variable='Electricity_Demand_MWh',
              title='TX Electricity Demand: 2019')
    #Plot the data on a monthly basis, using December 2017 as an example
    plot_data(df=electricity_demand_df[
        (electricity_demand_df['Date_Time'] >= pd.to_datetime('2017-12-01'))
        & (electricity_demand_df['Date_Time'] < pd.to_datetime('2018-01-01'))],
              x_variable='Date_Time',
              y_variable='Electricity_Demand_MWh',
              title='TX Electricity Demand: December 2017')
    #Plot the data on a weekly basis, using July 1-7, 2019 as an example
    plot_data(df=electricity_demand_df[
        (electricity_demand_df['Date_Time'] >= pd.to_datetime('2019-07-01'))
        & (electricity_demand_df['Date_Time'] < pd.to_datetime('2019-07-07'))],
              x_variable='Date_Time',
              y_variable='Electricity_Demand_MWh',
              title='TX Electricity Demand: Monday-Sunday July 1-7, 2019')
    #Pull the hour into an individual column
    electricity_demand_df['Hour'] = electricity_demand_df['Date_Time'].dt.hour
    #Pull the day of month for each reading
    electricity_demand_df['Day_Of_Month'] = electricity_demand_df[
        'Date_Time'].dt.day
    #Pull day of week for each reading
    electricity_demand_df['Day_Of_Week'] = electricity_demand_df[
        'Date_Time'].dt.day_name()
    #Pull the numeric value for day of the week
    electricity_demand_df['Day_Of_Week_Numeric'] = electricity_demand_df[
        'Date_Time'].dt.dayofweek + 1
    #Pull the week of the year (Series.dt.week has been removed; use isocalendar)
    electricity_demand_df['Week'] = electricity_demand_df[
        'Date_Time'].dt.isocalendar().week
    #Pull the month of the year
    electricity_demand_df['Month'] = electricity_demand_df[
        'Date_Time'].dt.month.apply(lambda x: calendar.month_abbr[x])
    #Pull the numeric value for month
    electricity_demand_df['Month_Numeric'] = electricity_demand_df[
        'Date_Time'].dt.month
    #Pull the year
    electricity_demand_df['Year'] = electricity_demand_df['Date_Time'].dt.year

    #Calculate the max demand for each date in the data set
    electricity_demand_df[
        'Peak_Demand_Hour_MWh_For_Day'] = electricity_demand_df.groupby(
            ['Day_Of_Month', 'Month', 'Year'],
            sort=False)['Electricity_Demand_MWh'].transform('max')

    #Create time series with just peak hourly data
    peak_demand_hour_df = electricity_demand_df[
        electricity_demand_df['Electricity_Demand_MWh'] ==
        electricity_demand_df['Peak_Demand_Hour_MWh_For_Day']]
    #Rename the 'Hour' column to 'Peak_Demand_Hour'
    peak_demand_hour_df = peak_demand_hour_df.rename(
        columns={'Hour': 'Peak_Demand_Hour'})
    #Create a histogram of counts by hour
    ax = peak_demand_hour_df['Peak_Demand_Hour'].value_counts().plot(
        kind='bar', title='Peak Demand Hour by Number of Occurrences')
    ax.set_xlabel("Demand Hour (0-23 hour)")
    ax.set_ylabel("Number of Occurrences")

    #Create a histogram of counts by peak demand hour, grouped by day of the week
    generate_histogram_of_aggregated_counts(
        peak_demand_hour_df,
        peak_demand_hour_column='Peak_Demand_Hour',
        group_by_column='Day_Of_Week_Numeric')
    #Create a histogram of counts by peak demand hour, grouped by month
    generate_histogram_of_aggregated_counts(
        peak_demand_hour_df,
        peak_demand_hour_column='Peak_Demand_Hour',
        group_by_column='Month_Numeric')
    #Subset the dataframe to only include the features and labels that we're going to use
    #in the random forest model
    peak_demand_hour_model = peak_demand_hour_df[[
        'Peak_Demand_Hour', 'Day_Of_Week', 'Week', 'Month'
    ]]
    #Convert the Week and Peak_Demand_Hour variables into categorical string variables (from numeric)
    peak_demand_hour_model.loc[:,
                               'Week'] = peak_demand_hour_model['Week'].apply(
                                   str)
    peak_demand_hour_model.loc[:,
                               'Peak_Demand_Hour'] = 'Hour ' + peak_demand_hour_model[
                                   'Peak_Demand_Hour'].apply(str)
    #Pull the counts per peak demand hour category
    counts_by_category = pd.DataFrame(
        peak_demand_hour_model.groupby('Peak_Demand_Hour')
        ['Peak_Demand_Hour'].count())
    #Isolate peak hour occurrences that occur more than 15 times
    more_than_15_occurrences = counts_by_category[
        counts_by_category['Peak_Demand_Hour'] > 15]
    #Filter the data set to only include instances with more than 15 occurrences--this is just to remove
    #any super anomalous cases from the model
    peak_demand_hour_model = peak_demand_hour_model[
        peak_demand_hour_model['Peak_Demand_Hour'].isin(
            list(more_than_15_occurrences.index))]
    #Remove the labels from the features
    features = peak_demand_hour_model.drop('Peak_Demand_Hour', axis=1)
    #One hot encode the categorical features
    features = pd.get_dummies(features)
    #Create labels
    labels = np.array(peak_demand_hour_model['Peak_Demand_Hour'])
    #Saving feature names for later use
    feature_list = list(features.columns)
    # Convert to numpy array
    features = np.array(features)
    # Split the data into training and testing sets
    train_features, test_features, train_labels, test_labels = train_test_split(
        features, labels, test_size=0.25, random_state=5)
    #Create the parameter grid for GridSearchCV, which tests all
    #hyperparameter combinations to find the optimal set
    parameter_grid = {
        'max_depth': [80, 90, 100, 110],
        'n_estimators': [700, 800, 900, 1000, 1100, 1200]
    }
    grid_search_rf(parameter_grid, train_features, train_labels)
    """
    Grid Search Outputs:
        Fitting 3 folds for each of 24 candidates, totalling 72 fits
        [Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
        [Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   25.3s
        [Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:   54.0s finished
        {'max_depth': 100, 'n_estimators': 1100}
    """
    #Plug in optimized model parameters into final RF model
    rf = RandomForestClassifier(n_estimators=1100,
                                max_depth=100,
                                random_state=1500)
    #Fit the model
    rf.fit(train_features, train_labels)
    # Use the forest's predict method on the test data
    print(
        confusion_matrix(test_labels,
                         rf.predict(test_features),
                         labels=[
                             'Hour 8', 'Hour 9', 'Hour 10', 'Hour 14',
                             'Hour 15', 'Hour 16', 'Hour 17', 'Hour 18',
                             'Hour 19', 'Hour 20', 'Hour 21'
                         ]))
    print(accuracy_score(test_labels,
                         rf.predict(test_features),
                         normalize=True,
                         sample_weight=None))
    #Obtain feature importances in the model
    feature_importances = pd.DataFrame(rf.feature_importances_,
                                       index=feature_list,
                                       columns=['importance'
                                                ]).sort_values('importance',
                                                               ascending=False)
    print(feature_importances)
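plot_data, generate_histogram_of_aggregated_counts, and grid_search_rf are project helpers not shown above; minimal sketches, assuming matplotlib and scikit-learn:

import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

def plot_data(df, x_variable, y_variable, title):
    #Sketch: simple line plot of one column against the timestamp column
    fig, ax = plt.subplots(figsize=(12, 5))
    ax.plot(df[x_variable], df[y_variable])
    ax.set_title(title)
    plt.show()

def generate_histogram_of_aggregated_counts(df, peak_demand_hour_column,
                                            group_by_column):
    #Sketch: one histogram of peak hours per value of the grouping column
    df.hist(column=peak_demand_hour_column, by=group_by_column, bins=24)
    plt.show()

def grid_search_rf(parameter_grid, train_features, train_labels):
    #Sketch: 3-fold grid search matching the logged output in the example
    grid_search = GridSearchCV(RandomForestClassifier(random_state=1500),
                               parameter_grid, cv=3, n_jobs=-1, verbose=2)
    grid_search.fit(train_features, train_labels)
    print(grid_search.best_params_)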