def metrics(obs, pred, f, q, m):
    # obs - log(observed), pred - prediction, f - FIB, q - subset, m - model
    rsq = round(r2_score(obs, pred), 3)
    dw = round(durbin_watson(obs - pred), 3)  # Durbin-Watson
    rmse = round(np.sqrt(((pred - obs)**2).mean()), 3)  # Root Mean Square Error
    mape = 100 * round(abs((pred - obs) / obs).mean(), 3)  # Mean Absolute Percentage Error
    sens_spec = wqm.pred_eval(obs, pred,
                              thresh=np.log10(wqm.fib_thresh(f)))  # Sensitivity/Specificity
    auroc = round(HF_models.compute_AUROC(obs, pred, f), 3)  # Area Under the Receiver Operating Curve

    # Add subset q performance for model m to perf dataframe
    mets = [[rsq, dw, rmse, mape, auroc,
             sens_spec['Sensitivity'], sens_spec['Specificity'],
             sens_spec['Samples'], sens_spec['Exceedances']]]
    temp_perf = pd.DataFrame(data=mets,
                             columns=['Rsq', 'D-W', 'RMSE', 'MAPE', 'AUROC',
                                      'sens', 'spec', 'N', 'exc'],
                             index=[[q], [m]])
    return temp_perf
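# Usage sketch for metrics() (hypothetical arrays and labels; assumes the module's
# imports (numpy as np, pandas as pd, r2_score, durbin_watson, wqm, HF_models) are
# available, and that perf_df is the accumulating performance DataFrame):
#
#   obs = np.log10(np.array([12., 150., 30., 400., 10.]))    # log10 observed FIB
#   pred = np.log10(np.array([20., 120., 25., 500., 15.]))   # log10 predicted FIB
#   row = metrics(obs, pred, f='ENT', q='summer', m='MLR')
#   perf_df = pd.concat([perf_df, row])   # accumulate across subsets and models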
def compute_AUROC(y, y_pred, f):
    # Calculate AUROC given the observed and predicted SSS exceedances
    tune_range = np.arange(0.7, 2.25, 0.005)
    sens_spec = np.array([
        wqm.pred_eval(y, (y_pred * j), thresh=np.log10(wqm.fib_thresh(f)), tune=True)
        for j in tune_range
    ])
    tpr = sens_spec[:, 0]
    fpr = 1 - sens_spec[:, 1]
    auroc = auc(fpr, tpr)
    return auroc
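# Self-contained sketch of the ROC sweep idea behind compute_AUROC (synthetic,
# hypothetical data; does not call wqm.pred_eval). Each threshold gives one
# (sensitivity, 1 - specificity) point, and sklearn.metrics.auc integrates the
# (fpr, tpr) pairs with the trapezoidal rule, so fpr must be monotonic:
#
#   from sklearn.metrics import auc
#   import numpy as np
#   rng = np.random.default_rng(0)
#   exceed = rng.random(200) > 0.7                      # observed exceedances
#   score = exceed + rng.normal(0, 0.8, 200)            # noisy predictor
#   cuts = np.linspace(score.min(), score.max(), 100)   # decision thresholds
#   tpr = [np.mean(score[exceed] > c) for c in cuts]
#   fpr = [np.mean(score[~exceed] > c) for c in cuts]
#   print(auc(fpr, tpr))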
blr, blr_perf = wqm.fit(y_train, X_train_vs, model_type='blr')

#%% TUNE
tune_mlr = wqm.tune(y_train, X_train_vs, model=mlr, cm_perf=cm_train)
tune_blr = wqm.tune(y_train, X_train_vs, model=blr, cm_perf=cm_train)

#%% TRAIN/TEST PERFORMANCE
print('\n\n- - - | Metrics | - - -')

if np.isnan(tune_mlr):
    tune_mlr = 1
if np.isnan(tune_blr):
    tune_blr = 0.5

mlr_t_perf = wqm.pred_eval(y_train,
                           mlr.predict(X_train_vs) * tune_mlr,
                           thresh=np.log10(wqm.fib_thresh(f)))
train_perf_df = train_perf_df.append(pd.DataFrame(mlr_t_perf, index=['MLR-T']))

mlr_t_perf_test = wqm.pred_eval(y_test,
                                mlr.predict(X_test_vs) * tune_mlr,
                                thresh=np.log10(wqm.fib_thresh(f)))
test_perf_df = test_perf_df.append(pd.DataFrame(mlr_t_perf_test, index=['MLR-T']))

blr_t_perf = wqm.pred_eval(y_train > np.log10(wqm.fib_thresh(f)),
                           blr.predict_proba(X_train_vs)[:, 1] > tune_blr)
train_perf_df = train_perf_df.append(pd.DataFrame(blr_t_perf, index=['BLR-T']))

blr_t_perf_test = wqm.pred_eval(y_test > np.log10(wqm.fib_thresh(f)),
                                blr.predict_proba(X_test_vs)[:, 1] > tune_blr)
test_perf_df = test_perf_df.append(pd.DataFrame(blr_t_perf_test, index=['BLR-T']))
kurt = df_fib.kurtosis()
kurt.name = 'kurtosis'
df_stats = df_stats.append(kurt.T)

# At or Below Level of Quantification
bloq = (df_fib == 10).sum()
bloq.name = 'abloq'
df_stats = df_stats.append(bloq.T)
df_stats.loc['abloq_%'] = round(100 * df_stats.loc['abloq'] / df_stats.loc['N'], 1)

# Exceedances
exc = pd.Series()
for f in ['TC', 'FC', 'ENT']:
    exc[f] = (df_fib[f] > wqm.fib_thresh(f)).sum()
exc.name = 'exc'
df_stats = df_stats.append(exc.T)
df_stats.loc['exc_%'] = round(100 * df_stats.loc['exc'] / df_stats.loc['N'], 1)

# Shannon Entropy
shan = pd.Series()
for f in ['TC', 'FC', 'ENT']:
    vals, counts = np.unique(df_fib[f], return_counts=True)
    shan[f] = round(stats.entropy(counts / len(df_fib[f])), 3)
shan.name = 'Shannon'
df_stats = df_stats.append(shan.T)  # Append to basic stats df

# Transpose so each FIB variable is a row of statistics
df_stats = df_stats.T
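# Standalone sketch of the Shannon entropy calculation above (hypothetical FIB
# values; scipy.stats.entropy uses the natural log by default and re-normalizes
# the probability vector if it does not sum to 1):
#
#   from scipy import stats
#   import numpy as np
#   x = np.array([10, 10, 10, 20, 31, 74, 20, 10])
#   vals, counts = np.unique(x, return_counts=True)
#   print(stats.entropy(counts / len(x)))   # -sum(p * ln(p)) over unique values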
### INPUTS ###
case = 'LP3'
f = 'ENT'
model_types = ['RF']
model_color = {'MLR': 'b', 'GLS': 'g', 'RF': 'k', 'ANN': 'r'}

folder = '/Users/rtsearcy/Box/water_quality_modeling/thfs/EDA/summer2020/prediction'
case_folder = folder + '/test_cases/' + case + '/'

test_cases = pd.read_csv(os.path.join(folder, 'test_cases.csv'),
                         index_col=['test_case'])
train_events = test_cases.loc[case]['train_event'].split(',')

thresh = wqm.fib_thresh(f)

### Load data
df_train = pd.read_excel(os.path.join(case_folder,
                                      'train_test_subsets_' + f + '_' + case + '.xlsx'),
                         sheet_name='Train', index_col='dt', parse_dates=['dt'])
print('Train: ' + str(len(df_train)))

df_rm = pd.read_excel(os.path.join(case_folder,
                                   'train_test_subsets_' + f + '_' + case + '.xlsx'),
                      sheet_name='RM Test', index_col='dt', parse_dates=['dt'])
df_rm = df_rm[df_rm.index.year == df_rm.index[0].year]  # Remove Year 2 RM data
# Fit
mlr = sm.OLS(train['log' + f], sm.add_constant(X_train), hasconst=True).fit()
print(mlr.summary2())

# Tune (TBD)

# Eval (Train)
print('\nMetrics (Training)')
rmse = np.sqrt(((mlr.predict() - train['log' + f])**2).mean())
print('RMSE - ' + str(round(rmse, 3)))
mape = abs((mlr.predict() - train['log' + f]) / train['log' + f]).mean()
print('MAPE - ' + str(round(mape, 3)))
mlr_train_perf = wqm.pred_eval(train['log' + f], mlr.predict(),
                               thresh=np.log10(wqm.fib_thresh(f)))
print('AUROC - ' + str(round(compute_AUROC(train['log' + f], mlr.predict(), f), 3)))
print(mlr_train_perf)

# Eval (HF Test)
if len(hf_test) > 0:
    print('\nMetrics (HF Testing)')
    hf_test_pred = mlr.predict(sm.add_constant(hf_test_IV, has_constant='add'))
    rmse = np.sqrt(((hf_test_pred - hf_test['log' + f])**2).mean())
    print('RMSE - ' + str(round(rmse, 3)))
    mape = abs((hf_test_pred - hf_test['log' + f]) / hf_test['log' + f]).mean()
    print('MAPE - ' + str(round(mape, 3)))
    print('AUROC - ' + str(round(compute_AUROC(hf_test['log' + f], hf_test_pred, f), 3)))
    mlr_hft_perf = wqm.pred_eval(hf_test['log' + f], hf_test_pred,
                                 thresh=np.log10(wqm.fib_thresh(f)))
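# Sketch of why has_constant='add' is used when building the HF test design matrix
# (hypothetical array; by default statsmodels' add_constant skips adding the
# intercept column if the data already contain a constant-valued column, which
# would leave the prediction exog one column short of the fitted model):
#
#   import numpy as np
#   import statsmodels.api as sm
#   X = np.array([[1., 2.], [1., 3.], [1., 5.]])    # first column is already constant
#   sm.add_constant(X)                              # unchanged under the default 'skip'
#   sm.add_constant(X, has_constant='add')          # forces a new constant column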