def plot(sql_db='tycho_production'):
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # ~~~~~~~~~~~~~ Read SQL ~~~~~~~~~~~~~~~~~~~
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    SQL = tycho.PostgreSQLCon()
    SQL.make_con()

    # --- Read in ETL ---
    merged = SQL.sql_to_pandas('etl_L3')
    gppd = SQL.sql_to_pandas('gppd_merged')

    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # ~~~~~~~~~~~~~~~~ Plot ~~~~~~~~~~~~~~~~~~~~
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    plot_cems_emissions(merged)
    plot_corr_heatmap(merged)
    plot_eda_pair(merged)
    plot_shap(merged)

    # --- Plot training data ---
    plot_map_plants(merged,
                    country='United States of America',
                    title='U.S. Power Plants Included in Tycho Training Set')
    plot_map_plants(gppd,
                    country=config.PREDICT_COUNTRIES[0],
                    tiles=10)

    # --- Plot Prediction ---
    plot_emission_factor(data_type='test', country='U.S.')
def test_duplicates(ts_frequency=TS_FREQUENCY):
    # --- establish SQL Connection ---
    SQL = tycho.PostgreSQLCon(schema='test')
    SQL.make_con()

    # --- Read in ETL Pickle ---
    merged = SQL.sql_to_pandas('etl_L3')

    # --- count samples for each plant_id_wri ---
    counts = merged.groupby('plant_id_wri', as_index=False)['datetime_utc'].count()

    # --- test counts ---
    counts['not_valid'] = 0
    counts['upper_limit'] = int(TS_DIVISOR * 1.1)
    counts.loc[counts['datetime_utc'] > counts['upper_limit'], 'not_valid'] = 1

    assert counts['not_valid'].sum() == 0
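# Context for the assertion above (an assumption, not stated in the source): the
# upper_limit presumes TS_DIVISOR is the number of TS_FREQUENCY periods in a year,
# so each plant_id_wri should contribute at most ~TS_DIVISOR rows (plus 10% slack).
# A minimal sketch of that presumed mapping, with hypothetical values:
PERIODS_PER_YEAR = {
    'H': 8760,    # hourly
    'D': 365,     # daily
    'W-SUN': 52,  # weekly
    'MS': 12,     # month start
}
# e.g. TS_DIVISOR = PERIODS_PER_YEAR[TS_FREQUENCY]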
def test_earthengine(test_n=5):
    """Compare merged dataframe earth engine to the raw data."""
    # --- establish SQL Connection ---
    SQL = tycho.PostgreSQLCon(schema='test')
    SQL.make_con()

    # --- Read in ETL Pickle ---
    merged = SQL.sql_to_pandas('etl_L3')

    # --- subset to n ---
    keep_plants = merged.sample(frac=1).iloc[0:test_n]['plant_id_eia'].tolist()
    merged = merged.loc[merged['plant_id_eia'].isin(keep_plants)]

    # --- fetch earth engine ---
    fetcher = tycho.EarthEngineFetcherLite(read_cache=False, use_cache=False)
    earthengine = fetcher.fetch(merged)

    # --- pivot onto index ---
    index_df = merged[['plant_id_wri', 'datetime_utc']]
    merger = tycho.RemoteDataMerger()  # instantiate (original was missing parentheses)
    merger.earthengine = earthengine  # override pickle reading
    merger._pivot_buffers()
    merger._merge_pivot(index_df)
    merger._clean()
    earthengine_out = merger.merged

    # --- filter ---
    merged = merged[list(earthengine_out.columns)]

    # --- sort ---
    earthengine_out.sort_values(['datetime_utc', 'plant_id_eia'], ascending=True, inplace=True)
    earthengine_out.reset_index(drop=True, inplace=True)
    merged.sort_values(['datetime_utc', 'plant_id_eia'], ascending=True, inplace=True)
    merged.reset_index(drop=True, inplace=True)

    # --- compare ---
    assert_frame_equal(earthengine_out, merged)
def etl():
    # --- establish SQL Connection ---
    SQL = tycho.PostgreSQLCon()
    SQL.make_con()

    if config.RUN_PRE_EE:
        # --- Fetch EPA CEMS data if not present in 'data/CEMS/csvs' (as zip files) ---
        CemsFetch = tycho.EPACEMSFetcher()
        CemsFetch.fetch()

        # --- Load EIA 860/923 data from PUDL ---
        PudlLoad = tycho.PUDLLoader(SQL=SQL)
        PudlLoad.load()
        eightsixty = PudlLoad.eightsixty

        # --- Load CEMS data from pickle, or construct dataframe from csvs ---
        CemsLoad = tycho.CEMSLoader(SQL=SQL)
        CemsLoad.load()
        cems = CemsLoad.cems

        # --- Load WRI Global Power Plant Database data from csv ---
        GppdLoad = tycho.GPPDLoader(SQL=SQL)
        GppdLoad.load()
        gppd = GppdLoad.gppd

        # --- Merge eightsixty, gppd, cems together into a long_df ---
        TrainingMerge = tycho.TrainingDataMerger(eightsixty, gppd, cems)
        TrainingMerge.merge()
        df = TrainingMerge.df

        # --- Output to SQL ---
        SQL.pandas_to_sql(df, 'etl_pre_L3')
    else:
        df = SQL.sql_to_pandas('etl_pre_L3')

    # --- Keep observations through config.MAX_YEAR (pd.Timestamp replaces deprecated pd.datetime) ---
    df = df.loc[df['datetime_utc'] < pd.Timestamp(config.MAX_YEAR + 1, 1, 1)]

    # --- Only keep n_generators worth of plants ---
    if tycho.config.N_GENERATORS is not None:
        # --- Create list of plant_id_wri ---
        plant_ids = list(set(df['plant_id_wri']))
        plant_ids = sorted(plant_ids)

        # --- Shuffle ---
        random.Random(42).shuffle(plant_ids)

        # --- Subset list ---
        log.info(f"....reducing n_generators down from {len(plant_ids)} from config")
        keep = plant_ids[0:config.N_GENERATORS]

        # --- Subset df ---
        df = df.loc[df['plant_id_wri'].isin(keep)]
        log.info(f"....n_generators now {len(list(set(df['plant_id_wri'])))} from config")

    # --- Load Google Earth Engine Data (such as weather and population) using df for dates ---
    EarthEngineFetch = tycho.EarthEngineFetcherLite()
    EarthEngineFetch.fetch(df)

    # --- Merge Remote Sensing (Earth Engine) Data onto df ---
    EarthEngineMerge = tycho.EarthEngineDataMergerLite()
    df = EarthEngineMerge.merge(df)

    if config.FETCH_S3:
        # --- fetch S3 data ---
        SentinelFetcher = tycho.S3Fetcher()
        SentinelFetcher.fetch(df)

        # --- merge S3 data together ---
        SentinelMerger = tycho.L3Merger()
        SentinelMerger.merge(df)

    if config.RUN_BAYES_OPT:
        # --- Optimize L3 Conical Parameters (distance and angle of emission measurement) ---
        SentinelOptimizer = tycho.L3Optimizer()
        SentinelOptimizer.optimize(df)

    # --- aggregate and merge onto df ---
    SentinelCalculator = tycho.L3Loader()
    df = SentinelCalculator.calculate(df)

    # --- Save to SQL ---
    SQL.pandas_to_sql(df, 'etl_L3')
def database():
    SQL = tycho.PostgreSQLCon()
    wri_table(SQL)
    plants_view(SQL)
    country_view(SQL)
    dirtiest_plants_view(SQL)
def train(save_pickles=True):
    # --- establish SQL Connection ---
    SQL = tycho.PostgreSQLCon()
    SQL.make_con()

    # --- Read in ETL Pickle ---
    merged = SQL.sql_to_pandas('etl_L3')

    # --- Sanitize ---
    ColumnSanitize = tycho.ColumnSanitizer()
    clean = ColumnSanitize.sanitize(merged)

    # --- Create average lookup tables ---
    avg_table = tycho.calc_average_y_vals_per_MW(clean)

    # --- Split ---
    Splitter = tycho.FourWaySplit()
    X_train_df, X_test_df, y_train_all, y_test_all = Splitter.split(clean)

    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # ~~~~~~~~~~~~~~ Pipeline ~~~~~~~~~~~~~~~~~~
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    pandas_pipe = Pipeline(steps=[
        ('capacity', tycho.CapacityFeatures()),
        ('date', tycho.DateFeatures()),
        ('avg_values', tycho.ApplyAvgY(avg_table)),
        ('dropnull', tycho.DropNullColumns()),
        ('onehot', tycho.OneHotEncodeWithThresh()),
    ])

    numpy_pipe = Pipeline(steps=[
        ('imputer', SimpleImputer()),
        ('scaler', tycho.LowMemoryMinMaxScaler()),
    ])

    preprocess_pipe = Pipeline(steps=[
        ('pd', pandas_pipe),
        ('np', numpy_pipe),
    ])

    # --- Fit/transform ---
    X_train = preprocess_pipe.fit_transform(X_train_df)
    X_test = preprocess_pipe.transform(X_test_df)

    # --- Create complete dfs for output ---
    train_out_df = X_train_df[['datetime_utc', 'plant_id_wri', 'estimated_generation_gwh', 'primary_fuel']]
    train_out_df = pd.concat([train_out_df, y_train_all], axis='columns')

    test_out_df = X_test_df[['datetime_utc', 'plant_id_wri', 'estimated_generation_gwh', 'primary_fuel']]
    test_out_df = pd.concat([test_out_df, y_test_all], axis='columns')

    # --- output preprocessing pipe ---
    if save_pickles:
        out_path = os.path.join('models', config.TRAIN_MODEL)
        with open(os.path.join(out_path, 'pipe.pkl'), 'wb') as handle:
            pickle.dump(preprocess_pipe, handle)

    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # ~~~~~~~~~~~~ Train Model ~~~~~~~~~~~~~~~~~
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    for y_col in config.ML_Y_COLS:
        log.info('\n')
        log.info(f'....beginning fit for {y_col} using {config.TRAIN_MODEL}')

        # --- Subset y ---
        y_train = np.array(y_train_all[y_col])
        y_test = np.array(y_test_all[y_col])

        # --- Initialize Model ---
        if config.TRAIN_MODEL == 'lr':
            model = LinearRegression(fit_intercept=True, normalize=False, n_jobs=-1)

        elif config.TRAIN_MODEL == 'bayes-lgbm':
            estimator = LGBMRegressor(
                random_state=1,
                n_jobs=12,
                verbose=-1,
                num_iterations=1000,
                boosting_type=None,
                learning_rate=0.03,
                subsample=0.7,
                boosting='dart',
            )

            lgbm_pbounds = {
                # 'boosting': ['gbdt', 'dart'],
                # 'learning_rate': (0.01, 1.),
                # 'n_estimators': (2, 2000),
                'max_depth': (3, 12),
                # 'min_child_weight': (0., 100.),
                # 'min_data_in_leaf': (1, 40),
                'num_leaves': (2, 2000),  # large num_leaves helps improve accuracy but might lead to over-fitting
                # 'boosting_type': ['gbdt', 'dart'],  # for better accuracy -> try dart
                'objective': ['rmse', 'mae', 'tweedie'],
                'max_bin': (128, 10000),  # large max_bin helps improve accuracy but might slow down training progress
                # 'colsample_bytree': (0.3, 1),
                # 'subsample': (0.3, 1.),
                # 'reg_alpha': (0., 300.),
                # 'reg_lambda': (0., 300.),
            }

            model = tycho.BayesRegressor(estimator=estimator, pbounds=lgbm_pbounds)

        elif config.TRAIN_MODEL == 'bayes-xgb':
            estimator = XGBRegressor(
                random_state=1,
                nthread=12,
                tree_method='gpu_hist',
                single_precision_histogram=True,
                validate_parameters=True,  # typo fixed (was 'validate_paramters')
            )

            xgb_pbounds = {
                'booster': ['dart', 'gbtree', 'gblinear'],
                'max_depth': (3, 11),
                # 'learning_rate': (0.1, 0.5),
                'subsample': (0.1, 1.),
                # 'sampling_method': ['uniform', 'gradient_based'],
                'colsample_bytree': (0.1, 1.),
                # 'colsample_bylevel': (0.1, 1.),
                'max_bin': (2, 10000),  # large max_bin helps improve accuracy but might slow down training progress
                # 'grow_policy': ['depthwise', 'lossguide'],
                # 'min_child_weight': (0., 100),
                'reg_alpha': (0., 250.),
                'reg_lambda': (0., 250.),
                'gamma': (0., 10.),
                # 'objective': ['reg:tweedie'],
            }

            model = tycho.BayesRegressor(estimator=estimator, pbounds=xgb_pbounds)

        # --- Fit ---
        model.fit(X_train, y_train)

        # --- Get best estimator ---
        y_train_pred = model.predict(X_train)
        log.info(f'........best train MAE for {y_col}: {mae(y_train, y_train_pred)}')
        log.info(f'........best train mape for {y_col}: {mape(y_train, y_train_pred)}')
        log.info(f'........average value for {y_col} is {y_train.mean()}, MAE as a percent is {mae(y_train, y_train_pred) / y_train.mean()}')

        # --- Predict on test ---
        y_pred = model.predict(X_test)
        log.info(f'........best test mae for {y_col}: {mae(y_test, y_pred)}')
        log.info(f'........best test mape for {y_col}: {mape(y_test, y_pred)}')
        log.info(f'........average value for {y_col} is {y_test.mean()}, MAE as a percent is {mae(y_test, y_pred) / y_test.mean()}')

        if save_pickles:
            if config.TRAIN_MODEL == 'tpot':
                # --- Output model pipeline ---
                model.export(os.path.join(out_path, f'tpot_best_pipe_{y_col}.py'))
                best = model.fitted_pipeline_
                with open(os.path.join(out_path, f'model_{y_col}_{config.TRAIN_MODEL}.pkl'), 'wb') as handle:
                    pickle.dump(best, handle)
            else:
                # --- Output model ---
                with open(os.path.join(out_path, f'model_{y_col}_{config.TRAIN_MODEL}.pkl'), 'wb') as handle:
                    pickle.dump(model, handle)

        # --- save predictions to out dfs ---
        train_out_df[f'pred_{y_col}'] = y_train_pred
        test_out_df[f'pred_{y_col}'] = y_pred

    return train_out_df, test_out_df
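# Note (assumption, not from the source): `mae` and `mape` are used for logging above
# but are not defined in this module. `mae` is presumably sklearn.metrics.mean_absolute_error;
# below is a minimal sketch of a `mape` helper consistent with how it is called, not the
# project's actual implementation.
import numpy as np


def mape(y_true, y_pred, epsilon=1e-9):
    """Mean absolute percentage error, guarding against division by zero (sketch)."""
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return float(np.mean(np.abs((y_true - y_pred) / np.clip(np.abs(y_true), epsilon, None))))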
def predict(plot=True):
    # --- establish SQL Connection ---
    SQL = tycho.PostgreSQLCon()
    SQL.make_con()

    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # ~~~~~~~~~~~~~~ Read GPPD ~~~~~~~~~~~~~~~~~
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    GPPDLoad = tycho.GPPDLoader(countries=config.PREDICT_COUNTRIES)
    GPPDLoad.load()  # loads countries
    gppd = GPPDLoad.gppd

    # --- Repeat rows for observations at TS_FREQUENCY ---
    df = apply_date_range_to_gppd(gppd)

    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # ~~~~~~ Download Earth Engine Data ~~~~~~~~
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # --- Load Google Earth Engine Data (such as weather and population) using df for dates ---
    EarthEngineFetch = tycho.EarthEngineFetcherLite()
    EarthEngineFetch.fetch(df)

    # --- Merge Remote Sensing (Earth Engine) Data onto df ---
    EarthEngineMerge = tycho.EarthEngineDataMergerLite()
    df = EarthEngineMerge.merge(df)

    if config.FETCH_S3:
        # --- fetch S3 data ---
        SentinelFetcher = tycho.S3Fetcher()
        SentinelFetcher.fetch(df)

    # --- aggregate and merge onto df ---
    SentinelCalculator = tycho.L3Loader()
    merged = SentinelCalculator.calculate(df)

    # --- Sanitize ---
    ColumnSanitize = tycho.ColumnSanitizer()
    clean = ColumnSanitize.sanitize(merged)

    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # ~~~~~~~~~~~~~~~ Predict ~~~~~~~~~~~~~~~~~~
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    load_path = os.path.join('models', config.PREDICT_MODEL)
    with open(os.path.join(load_path, 'pipe.pkl'), 'rb') as handle:
        preprocess_pipe = pickle.load(handle)

    clean = preprocess_pipe.transform(clean)

    # --- Iterate through each target variable ---
    pred_out_df = merged[['plant_id_wri', 'datetime_utc', 'country', 'latitude', 'longitude',
                          'primary_fuel', 'estimated_generation_gwh', 'wri_capacity_mw']]

    log.info('....starting predictions')
    for y_col in config.ML_Y_COLS:
        with open(os.path.join(load_path, f'model_{y_col}_{config.PREDICT_MODEL}.pkl'), 'rb') as handle:
            model = pickle.load(handle)

        log.info(f'........predicting for {y_col}')
        y_pred = model.predict(clean)

        # --- Cap gross_load_mw by feasible capacity factor ---
        if y_col == 'gross_load_mw':
            estimated_cf = merged['estimated_generation_gwh'] / (merged['wri_capacity_mw'] * 365 * 24 / 1000)
            max_feasible_cf = (estimated_cf * 1.25).clip(upper=0.95)
            max_feasible_mwh = merged['wri_capacity_mw'] * max_feasible_cf * (365 / config.TS_DIVISOR) * 24
            assert len(y_pred) == len(max_feasible_mwh)
            y_pred = y_pred.clip(min=0, max=max_feasible_mwh)

        # --- Append prediction to dataframe ---
        pred_out_df[f'pred_{y_col}'] = y_pred

    # --- Write out dfs ---
    log.info('....writing out predictions')
    SQL.pandas_to_sql(pred_out_df, 'predictions')

    return pred_out_df
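# Usage sketch (assumed orchestration, not taken from the source): based on the tables and
# pickles each step reads and writes, the pipeline stages above would typically run in this
# order. The __main__ guard here is illustrative only.
if __name__ == '__main__':
    etl()                                          # build and persist the 'etl_L3' table
    train_out_df, test_out_df = train()            # fit one model per target in config.ML_Y_COLS
    pred_out_df = predict()                        # load pickled pipe/models and write 'predictions'
    database()                                     # refresh SQL views
    plot()                                         # EDA and prediction figures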
def test_CEMS(test_n=10):
    """Compare merged dataframe CEMS to the raw data."""
    # --- establish SQL Connection ---
    SQL = tycho.PostgreSQLCon(schema='test')
    SQL.make_con()

    # --- Read in ETL Pickle ---
    merged = SQL.sql_to_pandas('etl_L3')

    # --- subset to n ---
    keep_plants = merged.sample(frac=1).iloc[0:test_n]['plant_id_eia'].tolist()
    merged = merged.loc[merged['plant_id_eia'].isin(keep_plants)]

    # --- fetch cems ---
    # ASSUMING FETCHING WORKS

    # --- load cems ---
    loader = tycho.CEMSLoader(ts_frequency=TS_FREQUENCY, years=[2019], clean_on_load=True, use_pickle=False)
    loader._read_csvs()
    loader._clean_cems()
    cems = loader.cems

    # --- pivot onto index ---
    index_df = merged[['plant_id_eia', 'datetime_utc']]
    cems = cems.merge(index_df, on=['plant_id_eia', 'datetime_utc'], how='right')

    # --- check for missing dates from merged ---
    cems_subset = cems.loc[cems['plant_id_eia'].isin(keep_plants)]
    missing = cems_subset.merge(index_df, on=['plant_id_eia', 'datetime_utc'], how='left')
    missing = missing.loc[missing.isnull().sum(axis=1) > 0]
    assert len(missing) == 0

    # --- clean up ---
    keep_cols = ['datetime_utc', 'plant_id_eia', 'gross_load_mw', 'so2_lbs',
                 'nox_lbs', 'co2_lbs', 'operational_time']
    cems = cems[keep_cols]
    merged = merged[keep_cols]

    # --- sort ---
    cems.sort_values(['datetime_utc', 'plant_id_eia'], ascending=True, inplace=True)
    cems.reset_index(drop=True, inplace=True)
    merged.sort_values(['datetime_utc', 'plant_id_eia'], ascending=True, inplace=True)
    merged.reset_index(drop=True, inplace=True)

    # --- compare ---
    assert_frame_equal(cems, merged, check_less_precise=True, check_dtype=False)