Example #1
def plot(sql_db='tycho_production'):
    """Generate EDA and prediction plots from the ETL tables (note: sql_db is currently unused)."""
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # ~~~~~~~~~~~~~ Read SQL ~~~~~~~~~~~~~~~~~~~
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    SQL = tycho.PostgreSQLCon()
    SQL.make_con()

    # --- Read in ETL ---
    merged = SQL.sql_to_pandas('etl_L3')
    gppd = SQL.sql_to_pandas('gppd_merged')

    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # ~~~~~~~~~~~~~~~~ Plot ~~~~~~~~~~~~~~~~~~~~
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    plot_cems_emissions(merged)
    plot_corr_heatmap(merged)
    plot_eda_pair(merged)
    plot_shap(merged)

    # --- Plot training data ---
    plot_map_plants(merged,
                    country='United States of America',
                    title='U.S. Power Plants Included in Tycho Training Set')

    plot_map_plants(gppd, country=config.PREDICT_COUNTRIES[0], tiles=10)

    # --- Plot Prediction ---
    plot_emission_factor(data_type='test', country='U.S.')
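
A minimal sketch of invoking this entry point from a script. The __main__ guard is illustrative rather than part of the original module, and it assumes the 'etl_L3' and 'gppd_merged' tables have already been populated by the ETL step (see Example #4).

if __name__ == '__main__':
    plot()
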
Example #2
def test_duplicates(ts_frequency=TS_FREQUENCY):
    """Assert that no plant has more samples than TS_DIVISOR (plus ~10% slack) allows."""
    # --- establish SQL Connection ---
    SQL = tycho.PostgreSQLCon(schema='test')
    SQL.make_con()

    # --- Read in ETL output from SQL ---
    merged = SQL.sql_to_pandas('etl_L3')

    # --- count samples for each plant_id_wri ---
    counts = merged.groupby('plant_id_wri',
                            as_index=False)['datetime_utc'].count()

    # --- test counts (allow ~10% slack over the expected TS_DIVISOR samples per plant) ---
    counts['not_valid'] = 0
    counts['upper_limit'] = int(TS_DIVISOR * 1.1)
    counts.loc[counts['datetime_utc'] > counts['upper_limit'], 'not_valid'] = 1
    assert counts['not_valid'].sum() == 0
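
A self-contained toy run of the same groupby/count validation pattern. The TS_DIVISOR value of 365 (daily samples over one year) is an assumption for illustration, not taken from the tycho config.

import pandas as pd

TS_DIVISOR = 365  # assumed: expected number of samples per plant
toy = pd.DataFrame({
    'plant_id_wri': ['A'] * 365 + ['B'] * 370,
    'datetime_utc': (list(pd.date_range('2019-01-01', periods=365)) +
                     list(pd.date_range('2019-01-01', periods=370))),
})
counts = toy.groupby('plant_id_wri', as_index=False)['datetime_utc'].count()
upper_limit = int(TS_DIVISOR * 1.1)  # 401, i.e. ~10% slack over a full year
assert (counts['datetime_utc'] > upper_limit).sum() == 0  # both plants pass
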
Example #3
def test_earthengine(test_n=5):
    """Compare merged dataframe earth engine to the raw data."""

    # --- establish SQL Connection ---
    SQL = tycho.PostgreSQLCon(schema='test')
    SQL.make_con()

    # --- Read in ETL output from SQL ---
    merged = SQL.sql_to_pandas('etl_L3')

    # --- subset to n ---
    keep_plants = merged.sample(frac=1).iloc[0:test_n]['plant_id_eia'].tolist()
    merged = merged.loc[merged['plant_id_eia'].isin(keep_plants)]

    # --- fetch earth engine ---
    fetcher = tycho.EarthEngineFetcherLite(read_cache=False, use_cache=False)
    earthengine = fetcher.fetch(merged)

    # --- pivot onto index ---
    index_df = merged[['plant_id_wri', 'datetime_utc']]

    merger = tycho.RemoteDataMerger()
    merger.earthengine = earthengine  # override pickle reading
    merger._pivot_buffers()
    merger._merge_pivot(index_df)
    merger._clean()
    earthengine_out = merger.merged

    # --- filter ---
    merged = merged[list(earthengine_out.columns)]

    # --- sort ---
    earthengine_out.sort_values(['datetime_utc', 'plant_id_eia'],
                                ascending=True,
                                inplace=True)
    earthengine_out.reset_index(drop=True, inplace=True)
    merged.sort_values(['datetime_utc', 'plant_id_eia'],
                       ascending=True,
                       inplace=True)
    merged.reset_index(drop=True, inplace=True)

    # --- compare ---
    assert_frame_equal(earthengine_out, merged)
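
A toy illustration of why both frames are sorted and re-indexed before the comparison: pandas' assert_frame_equal is order- and index-sensitive, so the shared sort plus reset_index puts both frames into a canonical form first.

import pandas as pd
from pandas.testing import assert_frame_equal

a = pd.DataFrame({'plant_id_eia': [2, 1], 'gross_load_mw': [5.0, 3.0]})
b = pd.DataFrame({'plant_id_eia': [1, 2], 'gross_load_mw': [3.0, 5.0]})

a = a.sort_values('plant_id_eia').reset_index(drop=True)
b = b.sort_values('plant_id_eia').reset_index(drop=True)
assert_frame_equal(a, b)  # passes only after the shared sort/reset
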
Example #4
def etl():

    # --- establish SQL Connection ---
    SQL = tycho.PostgreSQLCon()
    SQL.make_con()
    
    if config.RUN_PRE_EE:
        # --- Fetch EPA CEMS data if not present in 'data/CEMS/csvs' (as zip files) ---
        CemsFetch = tycho.EPACEMSFetcher()
        CemsFetch.fetch()

        # --- Load EIA 860/923 data from PUDL ---
        PudlLoad = tycho.PUDLLoader(SQL=SQL)
        PudlLoad.load()
        eightsixty = PudlLoad.eightsixty

        # --- load CEMS data from pickle, or construct dataframe from csvs ---
        CemsLoad = tycho.CEMSLoader(SQL=SQL)
        CemsLoad.load()
        cems = CemsLoad.cems

        # --- Load WRI Global Power Plant Database data from csv ---
        GppdLoad = tycho.GPPDLoader(SQL=SQL) 
        GppdLoad.load()
        gppd = GppdLoad.gppd
        
        # --- Merge eightsixty, gppd, cems together into a long_df ---
        TrainingMerge = tycho.TrainingDataMerger(eightsixty, gppd, cems)
        TrainingMerge.merge()
        df = TrainingMerge.df
        
        # --- Output to SQL ---
        SQL.pandas_to_sql(df, 'etl_pre_L3')
    
    else:
        df = SQL.sql_to_pandas('etl_pre_L3')
        df = df.loc[df['datetime_utc'] < pd.Timestamp(config.MAX_YEAR + 1, 1, 1)]

    # --- Only keep n_generators worth of plants ---
    if config.N_GENERATORS is not None:
        # --- Create list of plant_id_wri ---
        plant_ids = list(set(df['plant_id_wri']))
        plant_ids = sorted(plant_ids)

        # --- Shuffle ---
        random.Random(42).shuffle(plant_ids)

        # --- Subset list ---
        log.info(f"....reducing n_generators down from {len(plant_ids)} from config")
        keep = plant_ids[0:config.N_GENERATORS]

        # --- Subset df ---
        df = df.loc[df['plant_id_wri'].isin(keep)]
        log.info(f"....n_generators now {len(list(set(df['plant_id_wri'])))} from config")

    # --- Load Google Earth Engine data (such as weather and population) using df for dates ---
    EarthEngineFetch = tycho.EarthEngineFetcherLite()
    EarthEngineFetch.fetch(df)

    # --- Merge Remote Sensing (Earth Engine) Data onto df ---
    EarthEngineMerge = tycho.EarthEngineDataMergerLite()
    df = EarthEngineMerge.merge(df)
    
    if config.FETCH_S3:
        # --- fetch S3 data ---
        SentinelFetcher = tycho.S3Fetcher()
        SentinelFetcher.fetch(df)

    # --- merge S3 data together ---
    SentinelMerger = tycho.L3Merger()
    SentinelMerger.merge(df)

    if config.RUN_BAYES_OPT:
        # --- Optimize L3 Conical Parameters (distance and angle of emission measurement) ---
        SentinelOptimizer = tycho.L3Optimizer()
        SentinelOptimizer.optimize(df)

    # --- aggregate and merge onto df ---
    SentinelCalculator = tycho.L3Loader()
    df = SentinelCalculator.calculate(df)
    
    # --- Save to SQL ---
    SQL.pandas_to_sql(df, 'etl_L3')
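
A hypothetical top-level runner chaining the entry points shown in these examples. The ordering reflects the data dependencies (etl() writes 'etl_L3', which train() reads, and predict() loads the pipeline and model pickles that train() saves); the runner itself is not from the tycho source.

if __name__ == '__main__':
    etl()
    train_out_df, test_out_df = train(save_pickles=True)
    pred_out_df = predict()
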
Example #5
def database():
    SQL = tycho.PostgreSQLCon()
    SQL.make_con()
    wri_table(SQL)
    plants_view(SQL)
    country_view(SQL)
    dirtiest_plants_view(SQL)
Example #6
def train(save_pickles=True):

    # --- establish SQL Connection ---
    SQL = tycho.PostgreSQLCon()
    SQL.make_con()

    # --- Read in ETL output from SQL ---
    merged = SQL.sql_to_pandas('etl_L3')

    # --- Sanitize ---
    ColumnSanitize = tycho.ColumnSanitizer()
    clean = ColumnSanitize.sanitize(merged)

    # --- Create average lookup tables ---
    avg_table = tycho.calc_average_y_vals_per_MW(clean)

    # --- Split ---
    Splitter = tycho.FourWaySplit()
    X_train_df, X_test_df, y_train_all, y_test_all = Splitter.split(clean)

    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # ~~~~~~~~~~~~~~ Pipeline ~~~~~~~~~~~~~~~~~~
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    pandas_pipe = Pipeline(steps=[
        ('capacity', tycho.CapacityFeatures()),
        ('date', tycho.DateFeatures()),
        ('avg_values', tycho.ApplyAvgY(avg_table)),
        ('dropnull', tycho.DropNullColumns()),
        ('onehot', tycho.OneHotEncodeWithThresh()),
    ])

    numpy_pipe = Pipeline(steps=[
        ('imputer', SimpleImputer()),
        ('scaler', tycho.LowMemoryMinMaxScaler()),
    ])

    preprocess_pipe = Pipeline(steps=[
        ('pd', pandas_pipe),
        ('np', numpy_pipe),
    ])

    # --- Fit/transform ---
    X_train = preprocess_pipe.fit_transform(X_train_df)
    X_test = preprocess_pipe.transform(X_test_df)

    # --- Create complete dfs for output ---
    train_out_df = X_train_df[[
        'datetime_utc', 'plant_id_wri', 'estimated_generation_gwh',
        'primary_fuel'
    ]]
    train_out_df = pd.concat([train_out_df, y_train_all], axis='columns')

    test_out_df = X_test_df[[
        'datetime_utc', 'plant_id_wri', 'estimated_generation_gwh',
        'primary_fuel'
    ]]
    test_out_df = pd.concat([test_out_df, y_test_all], axis='columns')

    # --- output preprocessing pipe ---
    if save_pickles:
        out_path = os.path.join('models', config.TRAIN_MODEL)
        with open(os.path.join(out_path, 'pipe.pkl'), 'wb') as handle:
            pickle.dump(preprocess_pipe, handle)

    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # ~~~~~~~~~~~~ Train Model ~~~~~~~~~~~~~~~~~
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    for y_col in config.ML_Y_COLS:
        log.info('\n')
        log.info(f'....beginning fit for {y_col} using {config.TRAIN_MODEL}')

        # --- Subset y ---
        y_train = np.array(y_train_all[y_col])
        y_test = np.array(y_test_all[y_col])

        # --- Initialize Model ---
        if config.TRAIN_MODEL == 'lr':
            # sklearn removed LinearRegression's normalize kwarg; omitting it keeps the old default (False)
            model = LinearRegression(fit_intercept=True, n_jobs=-1)

        elif config.TRAIN_MODEL == 'bayes-lgbm':

            estimator = LGBMRegressor(
                random_state=1,
                n_jobs=12,
                verbose=-1,
                n_estimators=1000,  # 'num_iterations' is a LightGBM alias for n_estimators
                boosting_type='dart',  # 'boosting' is a LightGBM alias for boosting_type
                learning_rate=0.03,
                subsample=0.7,
            )

            lgbm_pbounds = {
                # 'boosting': ['gbdt', 'dart'],
                # 'learning_rate': (0.01, 1.),
                # 'n_estimators': (2, 2000),
                'max_depth': (3, 12),
                # 'min_child_weight': (0., 100.),
                # 'min_data_in_leaf': (1, 40),
                # large num_leaves helps improve accuracy but might lead to over-fitting
                'num_leaves': (2, 2000),
                # 'boosting_type': ['gbdt', 'dart'],  # for better accuracy -> try dart
                'objective': ['rmse', 'mae', 'tweedie'],
                # large max_bin helps improve accuracy but might slow down training progress
                'max_bin': (128, 10000),
                # 'colsample_bytree': (0.3, 1.),
                # 'subsample': (0.3, 1.),
                # 'reg_alpha': (0., 300.),
                # 'reg_lambda': (0., 300.),
            }

            model = tycho.BayesRegressor(estimator=estimator,
                                         pbounds=lgbm_pbounds)

        elif config.TRAIN_MODEL == 'bayes-xgb':

            estimator = XGBRegressor(random_state=1,
                                     nthread=12,
                                     tree_method='gpu_hist',
                                     single_precision_histogram=True,
                                     validate_parameters=True)

            xgb_pbounds = {
                'booster': ['dart', 'gbtree', 'gblinear'],
                'max_depth': (3, 11),
                # 'learning_rate': (0.1, 0.5),
                'subsample': (0.1, 1.),
                # 'sampling_method': ['uniform', 'gradient_based'],
                'colsample_bytree': (0.1, 1.),
                # 'colsample_bylevel': (0.1, 1.),
                # large max_bin helps improve accuracy but might slow down training progress
                'max_bin': (2, 10000),
                # 'grow_policy': ['depthwise', 'lossguide'],
                # 'min_child_weight': (0., 100),
                'reg_alpha': (0., 250.),
                'reg_lambda': (0., 250.),
                'gamma': (0., 10.),
                # 'objective': ['reg:tweedie'],
            }

            model = tycho.BayesRegressor(estimator=estimator,
                                         pbounds=xgb_pbounds)

        # --- Fit  ---
        model.fit(X_train, y_train)

        # --- Get best estimator ---
        y_train_pred = model.predict(X_train)
        log.info(f'........best train MAE for {y_col}: {mae(y_train, y_train_pred)}')
        log.info(f'........best train MAPE for {y_col}: {mape(y_train, y_train_pred)}')
        log.info(
            f'........average value for {y_col} is {y_train.mean()}, '
            f'MAE as a percent is {mae(y_train, y_train_pred) / y_train.mean()}'
        )

        # --- Predict on test ---
        y_pred = model.predict(X_test)
        log.info(f'........best test MAE for {y_col}: {mae(y_test, y_pred)}')
        log.info(f'........best test MAPE for {y_col}: {mape(y_test, y_pred)}')
        log.info(
            f'........average value for {y_col} is {y_test.mean()}, '
            f'MAE as a percent is {mae(y_test, y_pred) / y_test.mean()}'
        )

        if save_pickles:
            if config.TRAIN_MODEL == 'tpot':
                # --- Output model pipeline (the tpot model setup is not shown in this excerpt) ---
                model.export(
                    os.path.join(out_path, f'tpot_best_pipe_{y_col}.py'))
                best = model.fitted_pipeline_
                with open(
                        os.path.join(
                            out_path,
                            f'model_{y_col}_{config.TRAIN_MODEL}.pkl'),
                        'wb') as handle:
                    pickle.dump(best, handle)

            else:
                # --- Output model ---
                with open(
                        os.path.join(
                            out_path,
                            f'model_{y_col}_{config.TRAIN_MODEL}.pkl'),
                        'wb') as handle:
                    pickle.dump(model, handle)

        # --- save predictions to out dfs ---
        train_out_df[f'pred_{y_col}'] = y_train_pred
        test_out_df[f'pred_{y_col}'] = y_pred

    return train_out_df, test_out_df
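
A minimal, self-contained sketch of the nested-Pipeline pattern used above: a pandas-level stage feeding a numpy-level imputer/scaler. ColumnSelector is a hypothetical stand-in for tycho's custom pandas transforms (CapacityFeatures, OneHotEncodeWithThresh, etc.), and sklearn's MinMaxScaler stands in for LowMemoryMinMaxScaler.

import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

class ColumnSelector(BaseEstimator, TransformerMixin):
    """Keep numeric columns only (stand-in for the pandas-level transforms)."""
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X.select_dtypes('number')

preprocess_pipe = Pipeline(steps=[
    ('pd', Pipeline(steps=[('select', ColumnSelector())])),
    ('np', Pipeline(steps=[('imputer', SimpleImputer()),
                           ('scaler', MinMaxScaler())])),
])

toy = pd.DataFrame({'wri_capacity_mw': [100.0, None, 250.0],
                    'primary_fuel': ['Coal', 'Gas', 'Coal']})
X = preprocess_pipe.fit_transform(toy)  # nulls imputed, values scaled to [0, 1]
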
Example #7
def predict(plot=True):

    # --- establish SQL Connection ---
    SQL = tycho.PostgreSQLCon()
    SQL.make_con()

    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # ~~~~~~~~~~~~~~ Read GPPD ~~~~~~~~~~~~~~~~~
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    GPPDLoad = tycho.GPPDLoader(countries=config.PREDICT_COUNTRIES)
    GPPDLoad.load()  # loads plants for the configured countries
    gppd = GPPDLoad.gppd

    # --- Repeat rows for observations at TS_FREQUENCY ---
    df = apply_date_range_to_gppd(gppd)

    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # ~~~~~~ Download Earth Engine Data ~~~~~~~~
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # --- Load Google Earth Engine data (such as weather and population) using df for dates ---
    EarthEngineFetch = tycho.EarthEngineFetcherLite()
    EarthEngineFetch.fetch(df)

    # --- Merge Remote Sensing (Earth Engine) Data onto df ---
    EarthEngineMerge = tycho.EarthEngineDataMergerLite()
    df = EarthEngineMerge.merge(df)

    if config.FETCH_S3:
        # --- fetch S3 data ---
        SentinelFetcher = tycho.S3Fetcher()
        SentinelFetcher.fetch(df)

    # --- aggregate and merge onto df ---
    SentinelCalculator = tycho.L3Loader()
    merged = SentinelCalculator.calculate(df)

    # --- Sanitize ---
    ColumnSanitize = tycho.ColumnSanitizer()
    clean = ColumnSanitize.sanitize(merged)

    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # ~~~~~~~~~~~~~~~ Predict ~~~~~~~~~~~~~~~~~~
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    load_path = os.path.join('models', config.PREDICT_MODEL)

    with open(os.path.join(load_path, 'pipe.pkl'), 'rb') as handle:
        preprocess_pipe = pickle.load(handle)

    clean = preprocess_pipe.transform(clean)

    # --- Iterate through each target variable ---
    pred_out_df = merged[[
        'plant_id_wri', 'datetime_utc', 'country', 'latitude', 'longitude',
        'primary_fuel', 'estimated_generation_gwh', 'wri_capacity_mw'
    ]].copy()  # copy, so the per-target prediction columns below can be assigned safely

    log.info('....starting predictions')
    for y_col in config.ML_Y_COLS:

        with open(
                os.path.join(load_path,
                             f'model_{y_col}_{config.PREDICT_MODEL}.pkl'),
                'rb') as handle:
            model = pickle.load(handle)

        log.info(f'........predicting for {y_col}')
        y_pred = model.predict(clean)

        # --- Cap gross_load_mw by feasible capacity factor ---
        if y_col == 'gross_load_mw':
            estimated_cf = merged['estimated_generation_gwh'] / (
                merged['wri_capacity_mw'] * 365 * 24 / 1000)
            max_feasible_cf = (estimated_cf * 1.25).clip(upper=0.95)
            max_feasible_mwh = merged['wri_capacity_mw'] * max_feasible_cf * (
                365 / config.TS_DIVISOR) * 24
            assert len(y_pred) == len(max_feasible_mwh)
            y_pred = y_pred.clip(min=0, max=max_feasible_mwh)

        # --- Append prediction to dataframe ---
        pred_out_df[f'pred_{y_col}'] = y_pred

    # --- Write out dfs ---
    log.info('....writing out predictions')
    SQL.pandas_to_sql(pred_out_df, 'predictions')

    return pred_out_df
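
Worked toy numbers for the gross_load_mw cap above (all values illustrative; TS_DIVISOR = 365 assumes daily observations and is not taken from the tycho config):

wri_capacity_mw = 100.0           # plant size
estimated_generation_gwh = 350.0  # annual estimate from GPPD
TS_DIVISOR = 365                  # assumed: daily samples

estimated_cf = estimated_generation_gwh / (wri_capacity_mw * 365 * 24 / 1000)   # ~0.40
max_feasible_cf = min(estimated_cf * 1.25, 0.95)                                # ~0.50
max_feasible_mwh = wri_capacity_mw * max_feasible_cf * (365 / TS_DIVISOR) * 24  # ~1200 MWh per sample
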
Example #8
def test_CEMS(test_n=10):
    """Compare merged dataframe CEMS to the raw data."""

    # --- establish SQL Connection ---
    SQL = tycho.PostgreSQLCon(schema='test')
    SQL.make_con()

    # --- Read in ETL output from SQL ---
    merged = SQL.sql_to_pandas('etl_L3')

    # --- subset to n ---
    keep_plants = merged.sample(frac=1).iloc[0:test_n]['plant_id_eia'].tolist()
    merged = merged.loc[merged['plant_id_eia'].isin(keep_plants)]

    # --- fetch cems ---
    # ASSUMING FETCHING WORKS

    # --- load cems ---
    loader = tycho.CEMSLoader(ts_frequency=TS_FREQUENCY,
                              years=[2019],
                              clean_on_load=True,
                              use_pickle=False)
    loader._read_csvs()
    loader._clean_cems()
    cems = loader.cems

    # --- pivot onto index ---
    index_df = merged[['plant_id_eia', 'datetime_utc']]
    cems = cems.merge(index_df,
                      on=['plant_id_eia', 'datetime_utc'],
                      how='right')

    # --- check for missing dates from merged ---
    cems_subset = cems.loc[cems['plant_id_eia'].isin(keep_plants)]
    missing = cems_subset.merge(index_df,
                                on=['plant_id_eia', 'datetime_utc'],
                                how='left')
    missing = missing.loc[missing.isnull().sum(axis=1) > 0]
    assert len(missing) == 0

    # --- clean up ---
    keep_cols = [
        'datetime_utc', 'plant_id_eia', 'gross_load_mw', 'so2_lbs', 'nox_lbs',
        'co2_lbs', 'operational_time'
    ]
    cems = cems[keep_cols]
    merged = merged[keep_cols]

    # --- sort ---
    cems.sort_values(['datetime_utc', 'plant_id_eia'],
                     ascending=True,
                     inplace=True)
    cems.reset_index(drop=True, inplace=True)
    merged.sort_values(['datetime_utc', 'plant_id_eia'],
                       ascending=True,
                       inplace=True)
    merged.reset_index(drop=True, inplace=True)

    # --- compare ---
    assert_frame_equal(cems,
                       merged,
                       rtol=1e-3,  # tolerant float comparison (check_less_precise is deprecated)
                       check_dtype=False)
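
A toy illustration of the merge-then-null-check idiom used above to detect missing (plant, timestamp) rows: a row in the index with no CEMS match surfaces as NaNs after a right merge, so a nonzero null count flags a gap.

import pandas as pd

index_df = pd.DataFrame({
    'plant_id_eia': [1, 1],
    'datetime_utc': pd.to_datetime(['2019-01-01', '2019-01-02']),
})
cems = pd.DataFrame({
    'plant_id_eia': [1],
    'datetime_utc': pd.to_datetime(['2019-01-01']),
    'gross_load_mw': [5.0],
})

joined = cems.merge(index_df, on=['plant_id_eia', 'datetime_utc'], how='right')
gaps = joined.loc[joined.isnull().sum(axis=1) > 0]
assert len(gaps) == 1  # 2019-01-02 has no CEMS row
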