def combine_files_setup(self, outformat='pkl.gz'):
    """Combine the files of each site into a single data set and write it.

    Rows of ``self.files_df`` are grouped by their ``id`` column; the index
    of each group holds file names relative to ``self.path_to_read_dir``.
    For every group the files are concatenated, indexed by a tz-aware
    timestamp built from the Year/Month/Day/Hour/Minute columns, passed
    through ``cs_detection.ClearskyDetection`` to add the
    'Is clear NSRDB' and 'Clearsky GHI pvlib' columns, and written to
    ``self.path_to_write_dir`` as ``<id>.<outformat>``.

    Parameters
    ----------
    outformat : str
        Output format: 'pkl', 'pkl.gz' or 'csv'.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``outformat`` is not supported, or if the 'Time Zone' value in a
        file header is not a whole number of hours.
    """
    supported_formats = ('pkl', 'pkl.gz', 'csv')
    # Fail before doing any work; previously an unknown format wrote nothing
    # but still reported success.
    if outformat not in supported_formats:
        raise ValueError(
            'Unsupported outformat {!r}; expected one of {}'.format(
                outformat, supported_formats))

    time_cols = ['Year', 'Month', 'Day', 'Hour', 'Minute']
    for site_id, file_set in self.files_df.groupby(self.files_df['id']):
        files = [os.path.join(self.path_to_read_dir, f) for f in file_set.index]

        # The first rows of the first file carry the site metadata
        # (time zone, latitude, longitude, elevation).
        header = pd.read_csv(files[0], nrows=2)

        # 'Etc/GMT' zone names use the POSIX sign convention, which is the
        # opposite of the usual UTC offset: UTC-7 is 'Etc/GMT+7' and UTC+5
        # is 'Etc/GMT-5'.  Negate the offset instead of only swapping '-'
        # for '+', which produced an invalid name for positive offsets.
        utc_offset = float(header['Time Zone'][0])
        if not utc_offset.is_integer():
            raise ValueError(
                'Time Zone {!r} is not a whole number of hours'.format(
                    header['Time Zone'][0]))
        tz = 'Etc/GMT{:+d}'.format(-int(utc_offset))

        latitude = float(header['Latitude'][0])
        longitude = float(header['Longitude'][0])
        elevation = float(header['Elevation'][0])

        # Data rows start after the two metadata lines.
        df = pd.concat([pd.read_csv(f, skiprows=2) for f in files])
        df.index = pd.to_datetime(df[time_cols])
        df.index = df.index.tz_localize(tz)
        df = df.drop(time_cols, axis=1)

        # Add 'Is clear NSRDB' and 'Clearsky GHI pvlib' columns, then scale
        # the pvlib clear-sky curve against GHI using the NSRDB clear flag.
        detection = cs_detection.ClearskyDetection(df, copy=False, set_ghi_status=True)
        detection.set_nsrdb_sky_status(label='Is clear NSRDB')
        detection.generate_pvlib_clearsky(latitude, longitude, elevation, tz=tz)
        detection.scale_model('GHI', 'Clearsky GHI pvlib', 'Is clear NSRDB')
        df = detection.df

        out_path = os.path.join(self.path_to_write_dir, str(int(site_id))) + '.' + outformat
        if outformat == 'csv':
            df.to_csv(out_path)
        else:
            # pd.to_pickle infers gzip compression from a '.gz' suffix.
            pd.to_pickle(df, out_path)

    print('Files successfully written to {}'.format(self.path_to_write_dir))
len(nsrdb.df) # # Investigate input data # ## ABQ # In[43]: nsrdb = cs_detection.ClearskyDetection.read_pickle('abq_nsrdb_1.pkl.gz') nsrdb.df.index = nsrdb.df.index.tz_convert('MST') nsrdb.time_from_solar_noon('Clearsky GHI pvlib', 'tfn') # In[44]: train = cs_detection.ClearskyDetection(nsrdb.df, scale_col=None) train.trim_dates('01-01-2013', '01-01-2015') test = cs_detection.ClearskyDetection(nsrdb.df, scale_col=None) test.trim_dates('01-01-2015', None) # In[45]: clf = ensemble.RandomForestClassifier(random_state=42) # In[46]: feature_cols = [ 'tfn', 'abs_ideal_ratio_diff grad', 'abs_ideal_ratio_diff grad mean', 'abs_ideal_ratio_diff grad std', 'abs_ideal_ratio_diff grad second', 'abs_ideal_ratio_diff grad second mean', 'abs_ideal_ratio_diff grad second std',
# Run the daily iterative prediction on the test object and cast the result
# to booleans so it can be used as a mask (and negated with ~) below.
pred = test.iter_predict_daily(feature_cols, 'GHI', 'Clearsky GHI pvlib', clf, 3, by_day=True, multiproc=True)
pred = pred.astype(bool)

# In[127]:

vis = visualize.Visualizer()

# In[128]:

# Copy of the NSRDB SRRL data passed through intersection() with the ground
# index; its 'sky_status' column is compared against `pred` below.
# NOTE(review): the masks are applied to test.df, so this presumably relies on
# srrl_tmp.df and test.df sharing the same index - confirm.
srrl_tmp = cs_detection.ClearskyDetection(nsrdb_srrl.df)
srrl_tmp.intersection(ground.df.index)
vis.add_line_ser(test.df['GHI'], 'GHI')
vis.add_line_ser(test.df['Clearsky GHI pvlib'], 'GHI_cs')
# sky_status == 0 and pred True: flagged by the classifier only.
vis.add_circle_ser(test.df[(srrl_tmp.df['sky_status'] == 0) & (pred)]['GHI'], 'ML clear only')
# sky_status == 1 and pred False: flagged by NSRDB only.
vis.add_circle_ser(test.df[(srrl_tmp.df['sky_status'] == 1) & (~pred)]['GHI'], 'NSRDB clear only')
# sky_status == 1 and pred True: flagged by both.
vis.add_circle_ser(test.df[(srrl_tmp.df['sky_status'] == 1) & (pred)]['GHI'], 'ML+NSRDB clear only')
# vis.add_line_ser(test.df['abs_ideal_ratio_diff'] * 100)

# In[129]:

vis.show()
'GHI Clearsky GHI pvlib gradient second ratio min', 'GHI Clearsky GHI pvlib gradient second ratio max', 'GHI Clearsky GHI pvlib line length ratio', 'GHI Clearsky GHI pvlib line length ratio gradient', 'GHI Clearsky GHI pvlib line length ratio gradient second' ] target_cols = ['sky_status'] # # Train/test on NSRDB data to find optimal parameters # In[66]: train = cs_detection.ClearskyDetection(nsrdb.df) train.trim_dates('01-01-2010', '01-01-2015') test = cs_detection.ClearskyDetection(nsrdb.df) test.trim_dates('01-01-2015', None) # In[67]: train.scale_model('GHI', 'Clearsky GHI pvlib', 'sky_status') # In[68]: utils.calc_all_window_metrics(train.df, 3, meas_col='GHI', model_col='Clearsky GHI pvlib', overwrite=True)
# In[5]: ground.df.index[0], ground.df.index[-1] # In[6]: nsrdb.df.index[0], nsrdb.df.index[-1] # In[7]: ground2 = cs_detection.ClearskyDetection(ground.df, 'GHI', 'Clearsky GHI pvlib', solar_noon_col='abs(t-tnoon)') # In[8]: ground2.trim_dates('01-01-2002', '01-01-2015') ground2.df = ground2.df[ground2.df.index.minute % 30 == 0] # In[9]: nsrdb2 = cs_detection.ClearskyDetection(nsrdb.df, 'GHI', 'Clearsky GHI pvlib', 'sky_status', solar_noon_col='abs(t-tnoon)')
def split_df_by_date(obj, start, mid, end):
    """Build a (train, test) pair of ClearskyDetection objects from ``obj.df``.

    Each object is constructed from ``obj.df`` and then trimmed with
    ``trim_dates``: the first to ``(start, mid)``, the second to
    ``(mid, end)``.

    Parameters
    ----------
    obj : object exposing a ``df`` attribute
    start, mid, end : date bounds forwarded unchanged to ``trim_dates``

    Returns
    -------
    tuple
        ``(train, test)``
    """
    pieces = []
    # Same order as before: build and trim the training part, then the test part.
    for lower, upper in ((start, mid), (mid, end)):
        part = cs_detection.ClearskyDetection(obj.df)
        part.trim_dates(lower, upper)
        pieces.append(part)
    train_part, test_part = pieces
    return train_part, test_part
# In[3]: len(nsrdb.df) # # Train/test on NSRDB data to find optimal parameters # ## Default classifier # In[4]: train = cs_detection.ClearskyDetection(nsrdb.df, scale_col=None) train.trim_dates(None, '01-01-2015') test = cs_detection.ClearskyDetection(nsrdb.df, scale_col=None) test.trim_dates('01-01-2015', None) # In[5]: train.scale_model('GHI', 'Clearsky GHI pvlib', 'sky_status') # In[6]: clf = ensemble.RandomForestClassifier(random_state=42)
import pygal # # Train on default data # In[2]: detect_obj = cs_detection.ClearskyDetection.read_pickle('abq_nsrdb_1.pkl.gz', 'GHI', 'Clearsky GHI pvlib', 'sky_status') detect_obj.df.index = detect_obj.df.index.tz_convert('MST') # In[3]: train_obj = cs_detection.ClearskyDetection(detect_obj.df, 'GHI', 'Clearsky GHI pvlib', 'sky_status') train_obj.trim_dates(None, '01-01-2015') test_obj = cs_detection.ClearskyDetection(detect_obj.df, 'GHI', 'Clearsky GHI pvlib', 'sky_status') test_obj.trim_dates('01-01-2015', None) # In[4]: clf = ensemble.RandomForestClassifier(n_jobs=-1, n_estimators=32, random_state=42) # In[5]: clf = train_obj.fit_model(clf)
# In[8]: clf = nsrdb.fit_model(feature_cols, target_cols, clf) # Training vs the clearsky model in NSRDB is quite accurate. I don't really want to use this clearsky curve though since it's unavailable for ground based measurements. # ### Visualize # In[9]: train = cs_detection.ClearskyDetection(nsrdb.df) train.trim_dates(None, '01-01-2015') test = cs_detection.ClearskyDetection(nsrdb.df) test.trim_dates('01-01-2015', None) # In[10]: clf.fit(train.df[feature_cols], train.df[target_cols]) # In[11]: pred = clf.predict(test.df[feature_cols]).flatten()
# In[3]: len(nsrdb.df) # # Train/test on NSRDB data to find optimal parameters # ## Default classifier # In[4]: train = cs_detection.ClearskyDetection(nsrdb.df, scale_col=None) train.trim_dates(None, '01-01-2015') test = cs_detection.ClearskyDetection(nsrdb.df, scale_col=None) test.trim_dates('11-01-2015', '01-07-2015') # In[5]: train.scale_model('GHI', 'Clearsky GHI pvlib', 'sky_status') # In[6]: clf = ensemble.RandomForestClassifier(random_state=42)
target_cols = ['sky_status'] # # Align date ranges # In[6]: ground.df.index[0], ground.df.index[-1] # In[7]: nsrdb.df.index[0], nsrdb.df.index[-1] # In[8]: ground2 = cs_detection.ClearskyDetection(ground.df) # In[9]: ground2.trim_dates('01-01-2008', '01-01-2012') ground2.df = ground2.df[ground2.df.index.minute % 30 == 0] # In[10]: nsrdb2 = cs_detection.ClearskyDetection(nsrdb.df) # In[11]: nsrdb2.trim_dates('01-01-2008', '01-01-2012') nsrdb2.df = nsrdb2.df[nsrdb2.df.index.minute % 30 == 0]
import matplotlib
import pv_clf
import numpy as np
# IPython magics: notebook plotting backend and automatic module reloading.
get_ipython().magic('matplotlib notebook')
get_ipython().magic('load_ext autoreload')
get_ipython().magic('autoreload 2')

# In[507]:

# Load the pickled detection object and convert its index to 'MST'.
nsrdb = cs_detection.ClearskyDetection.read_pickle('abq_nsrdb_1.pkl.gz')
nsrdb.df.index = nsrdb.df.index.tz_convert('MST')
# Train object: no start bound, end '01-01-2015'.
# Test object: start '01-01-2015', no end bound.
train = cs_detection.ClearskyDetection(nsrdb.df)
train.trim_dates(None, '01-01-2015')
test = cs_detection.ClearskyDetection(nsrdb.df)
test.trim_dates('01-01-2015', None)

# In[508]:

# Stack index values, GHI and clear-sky GHI as three rows, then transpose so
# each row of X is one (timestamp, GHI, Clearsky GHI pvlib) sample.
X = np.asarray([train.df.index.values, train.df['GHI'].values, train.df['Clearsky GHI pvlib'].values]).T

# In[509]:

X.shape
nsrdb = cs_detection.ClearskyDetection.read_pickle('ornl_nsrdb_1.pkl.gz') nsrdb.df.index = nsrdb.df.index.tz_convert('EST') nsrdb.time_from_solar_noon('Clearsky GHI pvlib', 'tfn') # In[3]: len(nsrdb.df) # # Train/test on NSRDB data to find optimal parameters # ## Default classifier # In[4]: train = cs_detection.ClearskyDetection(nsrdb.df, scale_col=None) train.trim_dates('01-01-2010', '01-01-2015') test = cs_detection.ClearskyDetection(nsrdb.df, scale_col=None) test.trim_dates('01-01-2015', None) # In[5]: train.scale_model('GHI', 'Clearsky GHI pvlib', 'sky_status') # In[6]: clf = ensemble.RandomForestClassifier(random_state=42) # In[7]: utils.calc_all_window_metrics(train.df,
nsrdb.to_pickle('srrl_nsrdb_cloudy.pkl', overwrite=True) # In[14]: ground.to_pickle('srrl_ground_cloudy.pkl', overwrite=True) # # Science # In[16]: ground_small = cs_detection.ClearskyDetection(ground.df) # In[17]: ground_small.trim_dates('07-01-2006', '07-08-2006') # In[18]: vis = Visualizer() vis.add_line_ser(ground_small.df['GHI'], 'GHI') vis.add_line_ser(ground_small.df['Clearsky GHI pvlib'], 'GHIcs') vis.add_line_ser(ground_small.df['Total Cloud Cover [%]'], 'TCC')
# Display settings: 4-digit numpy printing, inline retina figures, larger font.
np.set_printoptions(precision=4)
get_ipython().magic('matplotlib inline')
get_ipython().magic("config InlineBackend.figure_format = 'retina'")
matplotlib.rcParams.update({'font.size': 16})
import warnings
# NOTE(review): this silences every warning for the rest of the session.
warnings.filterwarnings(action='ignore')
plt.close('all')

# Train on default data
# nsrdb = pd.read_pickle('abq_nsrdb_1.pkl.gz')
# Load the pickled detection object and convert its index to 'MST'.
detect_obj = cs_detection.ClearskyDetection.read_pickle('abq_nsrdb_1.pkl.gz', 'GHI', 'Clearsky GHI pvlib', 'sky_status')
detect_obj.df.index = detect_obj.df.index.tz_convert('MST')

# Train object: no start bound, end '01-01-2015'.
# Test object: start '01-01-2015', no end bound.
train_obj = cs_detection.ClearskyDetection(detect_obj.df, 'GHI', 'Clearsky GHI pvlib', 'sky_status')
train_obj.trim_dates(None, '01-01-2015')
test_obj = cs_detection.ClearskyDetection(detect_obj.df, 'GHI', 'Clearsky GHI pvlib', 'sky_status')
test_obj.trim_dates('01-01-2015', None)

# 32-tree random forest, depth capped at 10, all cores, fixed seed.
clf = ensemble.RandomForestClassifier(n_jobs=-1, n_estimators=32, max_depth=10, random_state=42)

# fit_model returns the classifier, which is rebound to clf.
clf = train_obj.fit_model(clf)

pred = test_obj.predict(clf)

# Accuracy and recall of the predictions against the 'sky_status' column.
print(metrics.accuracy_score(test_obj.df['sky_status'], pred))

print(metrics.recall_score(test_obj.df['sky_status'], pred))

cm = metrics.confusion_matrix(test_obj.df['sky_status'], pred)

visualize.plot_confusion_matrix2(cm, ('cloudy', 'clear'))

# Bar chart of feature importances, one bar per entry of
# clf.feature_importances_, labelled with test_obj.features_.
fig, ax = plt.subplots(figsize=(12, 8))
_ = ax.bar(range(len(clf.feature_importances_)), clf.feature_importances_)
_ = ax.set_xticks(range(len(clf.feature_importances_)))
_ = ax.set_xticklabels(test_obj.features_, rotation=45)
_ = ax.set_ylabel('Importance')
_ = ax.set_xlabel('Feature')
_ = fig.tight_layout()

# New figure; the 'sky_status' values are pulled out as a plain array.
fig, ax = plt.subplots(figsize=(12, 8))
nsrdb_mask = test_obj.df['sky_status'].values