def get_mse_from_n_feat(df_nonnan_nonzerot0, pred_dt, cfg_tds, model_path,
                        mod_bound=None, mod_name="", delete_RADAR_t0=False,
                        set_log_weight=False):
    """Score XGB models fitted on the n most important features.

    The XGB model previously pickled for this lead time is loaded, its
    features are ranked by gain, and one model per entry of
    ``get_n_feat_arr(model="xgb")`` is either fitted (and pickled) or read
    back from an existing pickle. The MSE and R2 of each model on the test
    split are stored as a pickled dataframe and appended to an HDF5 file.

    Parameters
    ----------
    df_nonnan_nonzerot0 : dataframe handed to ``ipt.get_model_input``.
        If ``set_log_weight`` is set, a column "s_weight" is written into it.
    pred_dt : integer lead time, used in file names and column labels.
    cfg_tds : not used by this function.
    model_path : directory all pickles / the HDF5 file are read from and
        written to.
    mod_bound : passed on as ``TRTRankt0_bound``; when not None the model
        name is prefixed with "_" and becomes part of all file names.
    mod_name : name of the model (see ``mod_bound``).
    delete_RADAR_t0 : choose the "no_radar_t0" instead of the "all"
        feature selection.
    set_log_weight : compute sample weights and forward the flag to
        ``fit_model_n_feat``.
    """
    print("Get dependence of MSE on n features for lead time t0 + %imin" % pred_dt, end="")
    if mod_bound is None:
        print(" (for all samples)")
    else:
        print(" (for %s)" % mod_name)
        mod_name = "_%s" % mod_name
    sys.stdout.flush()

    ## All files touched by this function:
    name_tag = (pred_dt, mod_name)
    mse_pkl_path = os.path.join(model_path,
                                "MSE_feature_count_gain_%i%s.pkl" % name_tag)
    full_model_path = os.path.join(model_path,
                                   "model_%i%s_t0diff_maxdepth6.pkl" % name_tag)
    nfeat_models_path = os.path.join(
        model_path, "models_%i%s_t0diff_maxdepth6_nfeat.pkl" % name_tag)
    mse_hdf_path = os.path.join(model_path, "MSE_feature_count_gain.h5")

    ## Ask whether the models should be re-fitted if MSE data already exists.
    ## Answering "n" only skips the fitting, the scores are always recomputed:
    refit_answer = "y"
    if os.path.exists(mse_pkl_path):
        refit_answer = ""
        while refit_answer not in ("y", "n"):
            refit_answer = raw_input(
                " MSE data exists alreay, get new one? [y/n] ")

    ## Sample weights for the XGB fit (stored as column in the input dataframe):
    if set_log_weight:
        rank_t0 = df_nonnan_nonzerot0["TRT_Rank|0"]
        rank_diff = df_nonnan_nonzerot0["TRT_Rank_diff|%i" % pred_dt]
        df_nonnan_nonzerot0["s_weight"] = feat.calc_sample_weight(rank_t0,
                                                                  rank_diff)

    ## Get train / test split of predictors and predictand:
    print(" Delete rows with TRT Rank close to zero at lead time")
    if delete_RADAR_t0:
        print(" Get predictor matrix X without RADAR variables at t0")
        X_feature_sel = "no_radar_t0"
    else:
        print(" Get predictor matrix X with RADAR variables at t0")
        X_feature_sel = "all"
    X_train, X_test, y_train, y_test = ipt.get_model_input(
        df_nonnan_nonzerot0,
        del_TRTeqZero_tpred=True,
        split_Xy_traintest=True,
        pred_dt=pred_dt,
        TRTRankt0_bound=mod_bound,
        check_for_nans=False,
        X_feature_sel=X_feature_sel)

    ## Load the model fitted to all features and rank its features by gain:
    print(" Load XGBmodel")
    with open(full_model_path, "rb") as f_in:
        xgb_model = pickle.load(f_in)
    gain_scores = xgb_model.get_booster().get_score(importance_type='gain')
    top_features_gain = pd.DataFrame.from_dict(gain_scores,
                                               orient="index",
                                               columns=["F_score"])
    top_features_gain = top_features_gain.sort_values(by=['F_score'],
                                                      ascending=False)

    ## Numbers of features for which a model is fitted:
    n_feat_arr = get_n_feat_arr(model="xgb")

    ## Fit (or load) one model per number of features:
    if refit_answer == "y":
        print(" Get models fitted with n top features")
        ls_models = []
        for n_feat in n_feat_arr:
            ls_models.append(
                fit_model_n_feat(X_train, y_train, top_features_gain, n_feat,
                                 n_feat_arr, set_log_weight=set_log_weight))
        print(" Save list of models as pickle to disk")
        with open(nfeat_models_path, "wb") as f_out:
            pickle.dump(ls_models, f_out, protocol=2)
    else:
        print(" Load existing models fitted with n top features")
        with open(nfeat_models_path, "rb") as f_in:
            ls_models = pickle.load(f_in)

    ## Score every model on the test data:
    print(" Get mean square error of models with n features")
    mse_values = []
    r2_values = []
    for n_feat, model in zip(n_feat_arr, ls_models):
        score = mse_r2_n_feat(X_test, y_test, top_features_gain, n_feat, model)
        mse_values.append(score[0])
        r2_values.append(score[1])
    df_mse_feat_count = pd.DataFrame.from_dict({
        "Feature Count": n_feat_arr,
        "MSE %imin%s" % name_tag: mse_values,
        "R2 %imin%s" % name_tag: r2_values
    })
    df_mse_feat_count = df_mse_feat_count.set_index("Feature Count")

    print(" Save dataframe with MSE to disk")
    with open(mse_pkl_path, "wb") as f_out:
        pickle.dump(df_mse_feat_count, f_out, protocol=2)

    ## Append MSE values to HDF5 file (created if not yet existing):
    print(" Append MSE values to HDF5 file")
    df_mse_feat_count.to_hdf(mse_hdf_path,
                             key="MSE_%imin%s" % name_tag,
                             mode="a",
                             format="t",
                             append=True)
def selected_model_fit(df_nonnan_nonzerot0, pred_dt, n_feat_ls, cfg_tds,
                       model_path, ls_mod_bound=[None], ls_model_names=[""]):
    """Get, store and return the XGB model using the selected feature count.

    For every (boundary, number of features, model name) triple the model
    with ``n_feat`` top features is taken from the pickled list of
    pre-fitted models if ``n_feat`` is part of ``get_n_feat_arr("xgb")``,
    otherwise it is fitted anew from the gain ranking of the model fitted
    to all features. The model is pickled to ``model_path`` and used to
    predict on the test split.

    Parameters
    ----------
    df_nonnan_nonzerot0 : dataframe handed to ``ipt.get_model_input``.
    pred_dt : integer lead time, used in file names and messages.
    n_feat_ls : number of features per model.
    cfg_tds : forwarded to ``plot_pred_vs_obs_core`` for the combined plot.
    model_path : directory models are read from and written to.
    ls_mod_bound : one ``TRTRankt0_bound`` per model (None: all samples).
        The default list is never modified by this function.
    ls_model_names : one name per model. The default list is never
        modified by this function.

    The three lists are iterated with ``zip``, i.e. up to the shortest one.

    Returns
    -------
    The model, if only a single model boundary was passed.

    Raises
    ------
    NotImplementedError
        If more than one model boundary was passed (after the models were
        saved and the combined plot was produced).
    """
    several_models = len(ls_mod_bound) > 1
    y_test_ls = []
    TRT_diff_pred_ls = []

    for mod_bound, n_feat, mod_name in zip(ls_mod_bound, n_feat_ls,
                                           ls_model_names):
        print("\nGet selected XGB model for prediction of lead time %imin" %
              (pred_dt), end="")
        if mod_bound is not None:
            print(" (%i features for model '%s')" % (n_feat, mod_name))
            mod_name = "_%s" % mod_name
        else:
            print(" (%i features for all samples)" % n_feat)
        sys.stdout.flush()

        ## Delete rows with TRT Rank close to zero at lead time:
        print(" Delete rows with TRT Rank close to zero at lead time")
        X_train, X_test, y_train, y_test = ipt.get_model_input(
            df_nonnan_nonzerot0,
            del_TRTeqZero_tpred=True,
            split_Xy_traintest=True,
            pred_dt=pred_dt,
            TRTRankt0_bound=mod_bound,
            check_for_nans=False,
        )

        ## NOTE(review): the element-wise comparison below presumes that
        ## get_n_feat_arr returns a numpy array - confirm.
        precalc_n_feat = get_n_feat_arr("xgb")
        if n_feat not in precalc_n_feat:
            ## No pre-fitted model with this number of features, fit it now.
            ## Load XGBmodel fitted to all features:
            print(" Load XGBmodel")
            with open(
                    os.path.join(
                        model_path, "model_%i%s_t0diff_maxdepth6.pkl" %
                        (pred_dt, mod_name)), "rb") as file:
                xgb_model = pickle.load(file)

            ## Order features by importance (gain):
            top_features_gain = pd.DataFrame.from_dict(
                xgb_model.get_booster().get_score(importance_type='gain'),
                orient="index",
                columns=["F_score"]).sort_values(by=['F_score'],
                                                 ascending=False)

            ## Fit model:
            model = fit_model_n_feat(X_train, y_train, top_features_gain,
                                     n_feat, np.array(n_feat))
        else:
            ## Pick the matching model from the list of pre-fitted models:
            with open(
                    os.path.join(
                        model_path, "models_%i%s_t0diff_maxdepth6_nfeat.pkl" %
                        (pred_dt, mod_name)), "rb") as file:
                model = pickle.load(file)[np.where(
                    precalc_n_feat == n_feat)[0][0]]

        ## Save the model to disk:
        model_saving_name = "model_%i%s_t0diff_maxdepth6_%ifeat_gain.pkl" % (
            pred_dt, mod_name, n_feat)
        with open(os.path.join(model_path, model_saving_name), "wb") as file:
            pickle.dump(model, file, protocol=2)

        ## Get features:
        features = model.get_booster().feature_names

        ## Make prediction on the test data:
        TRT_diff_pred = model.predict(X_test[features])

        ## Append to list of results for combined plot:
        if several_models:
            y_test_ls.append(y_test)
            TRT_diff_pred_ls.append(TRT_diff_pred)

    if several_models:
        ## Make combined plot of all sub-models:
        y_test_combi = pd.concat(y_test_ls, axis=0)
        pred_gain_combi = np.concatenate(TRT_diff_pred_ls)
        plot_pred_vs_obs_core(y_test_combi, pred_gain_combi, pred_dt,
                              "_%s" % "|".join(ls_model_names), cfg_tds)

        ## Models fitted to subsets are stored on disk but not returned:
        raise NotImplementedError(
            "Not yet implemented to use models fitted with "
            "TRT Rank subset for prediction, not returned")

    return model
## Get prediction leadtime from user (non-negative multiple of 5 required):
pred_dt = -1
while (pred_dt % 5 != 0 or pred_dt < 0):
    try:
        pred_dt = int(
            raw_input("For which lead time should comparison be made? "))
    except ValueError:
        ## Input was not an integer, ask again:
        pred_dt = -1

## Get features of largest models (ANN and XGB), i.e. the last list entries:
top_features_gain = features = ls_models_xgb[-1].get_booster().feature_names
xgb_model = ls_models_xgb[-1]
mlp_model = ls_models_mlp[-1].best_estimator_

## Get scores for the following number of features:
n_feat_arr = fit.get_n_feat_arr("xgb")

## Get training and testing data (non-normalised for XGBoost model) and the scores:
X_train_nonnorm, X_test_nonnorm, y_train_nonnorm, y_test_nonnorm = \
    ipt.get_model_input(df_nonnan, del_TRTeqZero_tpred=True,
                        split_Xy_traintest=True, pred_dt=pred_dt,
                        X_normalise=False, check_for_nans=False, verbose=True)
pred_xgb = xgb_model.predict(X_test_nonnorm[features])
fit.plot_pred_vs_obs_core(y_test_nonnorm, pred_xgb, pred_dt, "_xgb1000", cfg_tds)
MSE_r2_ls_xgb = [fit.mse_r2_n_feat(X_test_nonnorm, y_test_nonnorm, top_features_gain, n_feat, model)
                 for n_feat, model in zip(n_feat_arr[9:], ls_models_xgb[9:])]
## Drop the references to the non-normalised data before loading the next set:
del(X_train_nonnorm, X_test_nonnorm, y_train_nonnorm, y_test_nonnorm)

## Get training and testing data (normalised for ANN model) and the scores:
X_train_norm, X_test_norm, y_train_norm, y_test_norm, scaler = \
    ipt.get_model_input(df_nonnan, del_TRTeqZero_tpred=True,
                        split_Xy_traintest=True, pred_dt=pred_dt,
                        X_normalise=True, check_for_nans=False, verbose=True)
pred_mlp = mlp_model.predict(X_test_norm[features])
fit.plot_pred_vs_obs_core(y_test_norm, pred_mlp, pred_dt, "_mlp1000", cfg_tds)
## NOTE(review): unlike for XGB, the ANN model list is not sliced with [9:];
## presumably it only holds models for n_feat_arr[9:] - confirm.
MSE_r2_ls_mlp = [fit.mse_r2_n_feat(X_test_norm, y_test_norm, top_features_gain, n_feat, model)
                 for n_feat, model in zip(n_feat_arr[9:], ls_models_mlp)]
del(X_train_norm, X_test_norm, y_train_norm, y_test_norm)

## Get scores into dataframe:
def get_feature_importance(df_nonnan_nonzerot0,pred_dt,cfg_tds,model_path,mod_bound=None,
                           mod_name="",delete_RADAR_t0=False,set_log_weight=False,max_n_feat=60000):
    """Fit an XGB regressor (max_depth=6) to all features and plot importance.

    The fitted model is pickled to ``model_path`` and handed to
    ``plot_feature_importance``. If a model file for this lead time (and
    model name) already exists, the user is asked whether a new one should
    be fitted; on "n" the function returns without doing anything else.

    Parameters
    ----------
    df_nonnan_nonzerot0 : dataframe handed to ``ipt.get_model_input``.
        If ``set_log_weight`` is set, a column "s_weight" is written into it.
    pred_dt : integer lead time, used in file names and messages.
    cfg_tds : forwarded to ``plot_feature_importance``.
    model_path : directory the model pickle is written to.
    mod_bound : passed on as ``TRTRankt0_bound``; must have length 2 when
        given and requires a non-empty ``mod_name``.
    mod_name : model name; prefixed with "_" when ``mod_bound`` is given.
    delete_RADAR_t0 : choose the "no_radar_t0" instead of the "all"
        feature selection.
    set_log_weight : fit with sample weights.
    max_n_feat : maximum number of rows of X; larger inputs are randomly
        sub-sampled to this many rows.

    Raises
    ------
    ValueError
        If ``mod_bound`` is given without model name or has not length 2.
    """
    print("Get features for lead time t0 + %imin" % pred_dt, end="")
    if mod_bound is None:
        print(" (for all samples)")
    else:
        if mod_name=="":
            raise ValueError("Model name required")
        print(" (for %s)" % mod_name)
        mod_name = "_%s" % mod_name
        if len(mod_bound)!=2:
            raise ValueError("Model boundary list must have length 2")
    sys.stdout.flush()

    ## Location of the model fitted to all features:
    model_file = os.path.join(model_path,
                              "model_%i%s_t0diff_maxdepth6.pkl" % (pred_dt,mod_name))

    ## Check whether model already exists, if so let the user decide:
    if os.path.exists(model_file):
        fit_new = ""
        while fit_new not in ("y", "n"):
            fit_new = raw_input(" Model exists alreay, fit a new one? [y/n] ")
        if fit_new=="n":
            print(" Use existing one, return from this function")
            return

    ## Calculate sample weights for XGB fitting:
    if set_log_weight:
        rank_t0 = df_nonnan_nonzerot0["TRT_Rank|0"]
        rank_diff = df_nonnan_nonzerot0["TRT_Rank_diff|%i" % pred_dt]
        df_nonnan_nonzerot0["s_weight"] = calc_sample_weight(rank_t0, rank_diff)

    ## Get predictor matrix and predictand:
    print(" Delete rows with TRT Rank close to zero at lead time")
    if delete_RADAR_t0:
        print(" Get predictor matrix X without RADAR variables at t0")
        X_feature_sel = "no_radar_t0"
    else:
        print(" Get predictor matrix X with RADAR variables at t0")
        X_feature_sel = "all"
    X, y = ipt.get_model_input(df_nonnan_nonzerot0,
                               del_TRTeqZero_tpred=True,
                               split_Xy=True,
                               pred_dt=pred_dt,
                               TRTRankt0_bound=mod_bound,
                               X_feature_sel=X_feature_sel)
    ## Drop the local reference to the full dataframe:
    del df_nonnan_nonzerot0

    ## Limit the number of rows (same random state for X and y, so both
    ## samples are drawn at the same positions):
    if len(X)>max_n_feat:
        print(" *** Warning: Dataframe X probably to big to be converted, reduced to %i rows! ***" % max_n_feat)
        X = X.sample(n=max_n_feat,random_state=42)
        y = y.sample(n=max_n_feat,random_state=42)

    ## Setup model:
    ## NOTE(review): 'silent' and 'nthreads' are passed on as given; check
    ## them against the installed xgboost version.
    print(" Setup XGBmodel with max_depth = 6")
    xgb_model = xgb.XGBRegressor(max_depth=6,silent=False,n_jobs=6,nthreads=6)

    ## Separate the sample weights from the predictors:
    s_weights = None
    if set_log_weight:
        s_weights = X["s_weight"].values
        X = X.drop(labels="s_weight", axis=1)

    ## Train model and report the time it took:
    print(" Train XGBmodel")
    t_start = dt.datetime.now()
    xgb_model.fit(X, y, verbose=True, sample_weight=s_weights)
    t_elapsed = dt.datetime.now()-t_start
    print(" Elapsed time for XGBoost model fitting: %s" % (t_elapsed))

    ## Save model to disk:
    print(" Save XGBmodel to disk")
    with open(model_file,"wb") as f_out:
        pickle.dump(xgb_model,f_out,protocol=2)

    ## Plot feature importance:
    print(" Plot feature importance")
    plot_feature_importance(xgb_model,X,pred_dt,cfg_tds,mod_name)
## NOTE(review): the following statement looks like the tail of a prompt
## construct whose beginning lies outside this excerpt - confirm indentation.
ls_model_names.append(raw_input(" Please provide model name: "))

## Report the model boundaries in use; without any boundary a single
## placeholder entry (None / "") is added to both lists:
if len(ls_model_bound) > 0:
    print(" Using model boundaries:")
    for bound, name in zip(ls_model_bound, ls_model_names):
        print(" Model '%s': %s" % (name, bound))
    use_model_boundaries = True
else:
    print(" Using all samples")
    use_model_boundaries = False
    ls_model_bound.append(None)
    ls_model_names.append("")

## Plot XGB model weights (to push importance of strong TRT cells which are not decreasing):
print("\nPlotting XGB model weights")
## The plot is based on the model input for a fixed lead time of 10:
df_nonnan_nonzerot0t10 = ipt.get_model_input(df_nonnan_nonzerot0,
                                             del_TRTeqZero_tpred=True,
                                             pred_dt=10,
                                             check_for_nans=False)
feat.plot_XGB_model_weights(df_nonnan_nonzerot0t10, cfg_tds)
del (df_nonnan_nonzerot0t10)

## Ask (until answered with "y" or "n") whether the weights should be used:
use_XGB_model_weights = ""
while (use_XGB_model_weights not in ["y", "n"]):
    use_XGB_model_weights = raw_input(
        "Should XGB model weights be applied, see plot on disk [y/n]: ")
if use_XGB_model_weights == "y":
    print(" Apply model weights")
    XGB_mod_weight = True
else:
    print(" Apply no model weights")
    XGB_mod_weight = False

## Ask user whether Radar variables should be used at t0:
## Get path to the training dataframe and to the model saving location:
path_to_df = pth.file_path_reader("pandas training dataframe (nonnan)",
                                  user_argv_path)
model_path = pth.file_path_reader("model saving location")
print("\nLoading nonnan dataframe into RAM")
df_nonnan = pd.read_hdf(path_to_df, key="df_nonnan")

## Get lead-time from user:
ls_pred_dt = feat.get_pred_dt_ls("the ANN fit", cfg_op["timestep"],
                                 cfg_op["n_integ"])

## Loop over time-deltas:
for pred_dt in ls_pred_dt:
    ## Get normalised training and testing data:
    X_train, X_test, y_train, y_test, scaler = ipt.get_model_input(
        df_nonnan,
        del_TRTeqZero_tpred=True,
        split_Xy_traintest=True,
        X_normalise=True,
        pred_dt=pred_dt)

    ## Fit ANN model with all features but only two hidden layers (100, 50):
    print(
        "Fit ANN model with all features but only two hidden layers (100, 50)")
    mlp_allfeat = MLPRegressor(hidden_layer_sizes=(100, 50), verbose=True)
    mlp_allfeat.fit(X_train, y_train)

    ## Save the fitted ANN with the highest available pickle protocol.
    ## NOTE(review): 'mod_name' is not defined within this excerpt - verify
    ## that it is set before this loop.
    with open(
            os.path.join(
                model_path,
                "model_%i%s_t0diff_mlp_allfeat.pkl" % (pred_dt, mod_name)),
            "wb") as file:
        pickle.dump(mlp_allfeat, file, protocol=-1)
def make_model_evaluation(df_nonnan, model_path, ls_pred_dt, cfg_tds, cfg_op):
    """Evaluate the selected XGB models for all lead times.

    For every lead time the selected predictive model is applied to the
    test data, scatter plots of observed vs. predicted values are drawn,
    skill parameters are collected and a marker set is added to a common
    Taylor diagram. Afterwards the observed / predicted TRT Ranks are
    pickled for the operational probability matching, skill scores are
    plotted as function of lead time and time series are plotted for a
    hardcoded set of TRT cells occurring more than 25 times in the test
    data.

    Parameters
    ----------
    df_nonnan : dataframe handed to ``ipt.get_model_input`` and to the
        time series plotting functions.
    model_path : directory containing the pickled models
        ("model_dict_t0diff_maxdepth6_selfeat_gain.pkl" and one
        "model_<pred_dt>_t0diff_maxdepth6.pkl" per lead time).
    ls_pred_dt : list of integer lead times to evaluate.
    cfg_tds : configuration; the entry "fig_output_path" is read here, the
        object is also forwarded to the plotting functions.
    cfg_op : configuration; the entry "XGB_model_path" is the directory the
        file "TRT_Rank_obs_pred.pkl" is written to.

    Raises
    ------
    ValueError
        If the features of a selected model are not the top features (by
        gain) of the model fitted to all features.
    """
    cmap_pred_dt = plt.cm.get_cmap('viridis_r')

    ## Import dictionary with selected models:
    train_path_name = os.path.join(
        model_path, "model_dict_t0diff_maxdepth6_selfeat_gain.pkl")
    with open(train_path_name, "rb") as file:
        dict_sel_model = pickle.load(file)

    ## Figure number 1 receives the Taylor diagram of all lead times:
    plt.close()
    fig = plt.figure(num=1, figsize=(7, 6))

    ## Results collected over the lead times:
    X_test_ls = []
    Rank_obs_ls = []
    Rank_pred_XGB_ls = []
    Rank_pred_XGB_PM_ls = []
    df_param_ls_diff = []
    df_param_ls_rank = []
    df_param_ls_rank_PM = []
    df_param_ls_rank_pers = []

    ## Loop over lead times:
    for i, pred_dt in enumerate(ls_pred_dt):
        ## Only the testing data is needed for the evaluation:
        X_train, X_test, y_train, y_test = ipt.get_model_input(
            df_nonnan,
            del_TRTeqZero_tpred=True,
            split_Xy_traintest=True,
            X_normalise=False,
            pred_dt=pred_dt,
            check_for_nans=False,
            verbose=True)
        del (X_train, y_train)
        X_test_ls.append(X_test)

        ## Load XGB model fitted to all features and rank features by gain:
        with open(
                os.path.join(model_path,
                             "model_%i_t0diff_maxdepth6.pkl" % pred_dt),
                "rb") as file:
            xgb_model_feat = pickle.load(file)
        top_features = pd.DataFrame.from_dict(
            xgb_model_feat.get_booster().get_score(importance_type='gain'),
            orient="index",
            columns=["F_score"]).sort_values(by=['F_score'], ascending=False)

        ## Get specific predictive model for this leadtime:
        pred_model = dict_sel_model["pred_mod_%i" % pred_dt]

        ## Check that features agree:
        features_pred_model = pred_model.get_booster().feature_names
        n_features = len(features_pred_model)
        if set(features_pred_model) != set(top_features.index[:n_features]):
            raise ValueError(
                "Features of predictive model and top features of model fitted with all features do not agree"
            )

        ## Make prediction of TRT Rank differences:
        TRT_diff_pred = pred_model.predict(X_test[features_pred_model])

        ## Get set of different TRT Rank predictions:
        Rank_obs, Rank_pred_XGB, Rank_pred_XGB_PM, Rank_pred_pers, Rank_pred_pers_PM, \
            Rank_pred_diff, Diff_pred_XGB = get_obs_fcst_TRT_Rank(X_test["TRT_Rank|0"], TRT_diff_pred,
                                                                  y_test, X_test["TRT_Rank|-5"])
        Rank_obs_ls.append(Rank_obs)
        Rank_pred_XGB_ls.append(Rank_pred_XGB)
        Rank_pred_XGB_PM_ls.append(Rank_pred_XGB_PM)

        ## Plot scatterplots obs vs. predicted:
        plot_pred_vs_obs_core(y_test,
                              Diff_pred_XGB.values,
                              pred_dt,
                              "_XGB%i" % n_features,
                              cfg_tds,
                              outtype="TRT_Rank_diff")
        plot_pred_vs_obs_core(Rank_obs,
                              Rank_pred_XGB.values,
                              pred_dt,
                              "_XGB%i" % n_features,
                              cfg_tds,
                              outtype="TRT_Rank")
        plot_pred_vs_obs_core(Rank_obs,
                              Rank_pred_XGB_PM.values,
                              pred_dt,
                              "_XGB%i-ProbMatch" % n_features,
                              cfg_tds,
                              outtype="TRT_Rank")
        plot_pred_vs_obs_core(Rank_obs,
                              Rank_pred_pers.values,
                              pred_dt,
                              "_Pers",
                              cfg_tds,
                              outtype="TRT_Rank")
        plot_pred_vs_obs_core(Rank_obs,
                              Rank_pred_pers_PM.values,
                              pred_dt,
                              "_Pers-ProbMatch",
                              cfg_tds,
                              outtype="TRT_Rank")
        plot_pred_vs_obs_core(Rank_obs,
                              Rank_pred_diff.values,
                              pred_dt,
                              "_ConstDiff",
                              cfg_tds,
                              outtype="TRT_Rank")

        ## Calculate different term elements for R^2 / Brier Score calculation:
        df_param_ls_diff.append(
            get_R2_param(y_test.values, Diff_pred_XGB.values))
        df_param_ls_rank.append(
            get_R2_param(Rank_obs.values, Rank_pred_XGB.values))
        df_param_ls_rank_PM.append(
            get_R2_param(Rank_obs.values, Rank_pred_XGB_PM.values))
        df_param_ls_rank_pers.append(
            get_R2_param(Rank_obs.values, Rank_pred_pers.values))

        ## Calculate statistics for Taylor Diagram:
        stat_pred_XGB = sm.taylor_statistics(predicted=Rank_pred_XGB.values,
                                             reference=Rank_obs.values)
        stat_pred_XGB_PM = sm.taylor_statistics(
            predicted=Rank_pred_XGB_PM.values, reference=Rank_obs.values)
        stat_pred_pers = sm.taylor_statistics(
            predicted=Rank_pred_pers.values, reference=Rank_obs.values)

        ## Per statistic: entry [0] of the XGB result, followed by entry [1]
        ## of the XGB, XGB (PM) and persistence results:
        sdev, crmsd, ccoef = [
            np.array([
                stat_pred_XGB[stat_key][0], stat_pred_XGB[stat_key][1],
                stat_pred_XGB_PM[stat_key][1], stat_pred_pers[stat_key][1]
            ]) for stat_key in ("sdev", "crmsd", "ccoef")
        ]

        ## Plot Taylor Diagram (first lead time creates it, others overlay):
        col_point = cmap_pred_dt(float(i) / len(ls_pred_dt))
        col_point = (col_point[0], col_point[1], col_point[2], 0.8)
        plot_markerLabel = ["Obs", "+%imin" % pred_dt, "", ""]
        plot_markerLabelColor = "black"
        plot_markerLegend = 'on'
        plot_overlay = 'off' if i == 0 else 'on'
        ## The last lead time gets the labels of the prediction types:
        if i == len(ls_pred_dt) - 1:
            plot_markerLabelColor = None
            plot_markerLabel = ["Obs", "XGB", "XGB (PM)", "Persistance"]

        sm.taylor_diagram(
            sdev / sdev[0],
            crmsd,
            ccoef,
            styleOBS='-',
            colOBS='darkred',
            markerobs='o',
            titleOBS='Obs',
            markerLabel=plot_markerLabel,
            markerLabelColor=plot_markerLabelColor,
            alpha=0.1,
            markerColor=col_point,
            markerLegend=plot_markerLegend,
            axismax=1.2,
            markerSize=5,
            colRMS='grey',
            styleRMS='--',
            widthRMS=0.8,
            rincRMS=0.25,
            tickRMS=np.arange(0.25, 1.5, 0.25),
            colSTD='grey',
            styleSTD='-.',
            widthSTD=0.8,
            colCOR='grey',
            styleCOR=':',
            widthCOR=0.8,
            overlay=plot_overlay)

    ## Save Taylor Diagram:
    get_time_delta_colorbar(fig, ls_pred_dt, cmap_pred_dt,
                            [0.7, 0.5, 0.05, 0.3])
    plt.savefig(
        os.path.join(cfg_tds["fig_output_path"], "Taylor_Diagram_cmap.pdf"))
    plt.close()

    ## Combine observed, predicted, and predicted & PM TRT Ranks of all
    ## lead times into one dataframe (one column per type and lead time):
    print(
        "Save dataframe with observed, predicted, and predicted & PM TRT Ranks"
    )
    Rank_obs_df = pd.concat(Rank_obs_ls, axis=1, sort=True)
    Rank_obs_df.columns = [
        "TRT_Rank_obs|%i" % pred_dt for pred_dt in ls_pred_dt
    ]
    Rank_pred_XGB_df = pd.concat(Rank_pred_XGB_ls, axis=1, sort=True)
    Rank_pred_XGB_df.columns = [
        "TRT_Rank_pred|%i" % pred_dt for pred_dt in ls_pred_dt
    ]
    Rank_pred_XGB_PM_df = pd.concat(Rank_pred_XGB_PM_ls, axis=1, sort=True)
    Rank_pred_XGB_PM_df.columns = [
        "TRT_Rank_pred_PM|%i" % pred_dt for pred_dt in ls_pred_dt
    ]
    Rank_obs_pred_df = pd.concat(
        [Rank_obs_df, Rank_pred_XGB_df, Rank_pred_XGB_PM_df],
        axis=1,
        sort=True)

    ## Save dataframe with observed, predicted, and predicted & PM TRT Ranks for operational PM:
    op_path_name = os.path.join(cfg_op["XGB_model_path"],
                                "TRT_Rank_obs_pred.pkl")
    with open(op_path_name, "wb") as file:
        pickle.dump(Rank_obs_pred_df, file, protocol=2)
    print(" saved dict to 'XGB_model_path' location:\n %s" % op_path_name)
    prt_txt = """
    ---------------------------------------------------------------------------------
    The file 'TRT_Rank_obs_pred.pkl' in the directory
    '%s'
    is now used for the operational probability matching procedure, be aware of that!
    ---------------------------------------------------------------------------------\n""" % (
        cfg_op["XGB_model_path"])
    print(prt_txt)

    ## Plot skill scores as function of lead-time:
    df_R2_param_rank = pd.concat(df_param_ls_rank,
                                 axis=0).set_index(np.array(ls_pred_dt))
    df_R2_param_rank_PM = pd.concat(df_param_ls_rank_PM,
                                    axis=0).set_index(np.array(ls_pred_dt))
    df_R2_param_diff = pd.concat(df_param_ls_diff,
                                 axis=0).set_index(np.array(ls_pred_dt))
    df_R2_param_rank_pers = pd.concat(df_param_ls_rank_pers,
                                      axis=0).set_index(np.array(ls_pred_dt))
    plot_stats(df_R2_param_rank, "TRT_Rank", cfg_tds)
    plot_stats(df_R2_param_diff, "TRT_Rank_diff", cfg_tds)
    plot_stats_nice(df_R2_param_rank, "TRT_Rank", cfg_tds)
    plot_stats_nice(df_R2_param_diff, "TRT_Rank_diff", cfg_tds)
    plot_stats_nice(df_R2_param_rank_pers, "TRT_Rank_pers", cfg_tds)
    plot_stats_nice(df_R2_param_rank_PM, "TRT_Rank_PM", cfg_tds)

    ## Print IDs of long TRT cells in testing dataset (of the last lead time).
    ## The ID is the index label without its first 13 characters:
    print(
        "\nThese are the IDs of long TRT cells (>25 time steps) in the testing dataset:"
    )
    TRT_ID = [TRT_ID_i[13:] for TRT_ID_i in X_test_ls[-1].index.values]
    ## most_common() orders by decreasing count:
    TRT_ID_count_sort = Counter(TRT_ID).most_common()
    TRT_ID_count_sort_pd = pd.DataFrame(TRT_ID_count_sort,
                                        columns=["TRT_ID", "Count"])
    TRT_ID_count_sort_pd["Count"] = TRT_ID_count_sort_pd["Count"].astype(
        np.uint16)
    TRT_ID_long = TRT_ID_count_sort_pd.loc[TRT_ID_count_sort_pd["Count"] > 25]
    print(TRT_ID_long)

    ## Restrict the analysis to the hardcoded case study cells:
    TRT_ID_casestudy = [
        "2018080721250094", "2018080721300099", "2018080711400069",
        "2018080710200036"
    ]
    print(" Making analysis for TRT IDs (hardcoded!): %s" % TRT_ID_casestudy)
    TRT_ID_long_sel = TRT_ID_long.loc[TRT_ID_long['TRT_ID'].isin(
        TRT_ID_casestudy)]

    ## Features ("variable|time delta|statistic") shown per data source; row
    ## k belongs to the k-th lead time of [10, 20, 30] in the loop below:
    df_feature_ts_plot = pd.DataFrame.from_dict({
        "Radar":
        ["CZC_lt57dBZ|-45|SUM", "CZC_lt57dBZ|-45|SUM", "CZC_lt57dBZ|-45|SUM"],
        "Satellite": [
            "IR_097_stat|-20|PERC05", "IR_097_stat|-15|PERC01",
            "IR_097_stat|-20|MIN"
        ],
        "COSMO": [
            "CAPE_MU_stat|-10|PERC50", "CAPE_MU_stat|-5|PERC75",
            "CAPE_ML_stat|0|SUM"
        ],
        "Lightning": [
            "THX_densIC_stat|-30|SUM", "THX_curr_pos_stat|-40|SUM",
            "THX_curr_pos_stat|-30|SUM"
        ]
    })

    for i_sel in range(len(TRT_ID_long_sel)):
        TRT_cell = TRT_ID_long_sel.iloc[i_sel]
        print(" Working on cell %s" % TRT_cell["TRT_ID"])

        ## Time series of predictions (without and with probability matching):
        plot_pred_time_series(TRT_cell, df_nonnan, Rank_pred_XGB_ls,
                              ls_pred_dt, cfg_tds)
        plot_pred_time_series(TRT_cell,
                              df_nonnan,
                              Rank_pred_XGB_PM_ls,
                              ls_pred_dt,
                              cfg_tds,
                              path_addon="PM",
                              title_addon=" (PM)")
        plot_var_time_series_dt0_multiquant(TRT_cell, df_nonnan, cfg_tds)

        ## One 2x2 figure (one panel per data source) per lead time:
        for i_pred_dt, pred_dt in enumerate([10, 20, 30]):
            fig = plt.figure(figsize=[10, 6])
            ax_ls = [fig.add_subplot(2, 2, i_ax) for i_ax in (1, 2, 3, 4)]
            for i_source, source in enumerate(
                ["Radar", "Satellite", "COSMO", "Lightning"]):
                ls_feat_param = df_feature_ts_plot[source].iloc[
                    i_pred_dt].split("|")
                dt_highlight = int(ls_feat_param[1])
                past_dt = np.arange(-45, 0, 5) if dt_highlight != 0 else [0]
                ax_ls[i_source] = plot_var_time_series(
                    TRT_cell,
                    df_nonnan,
                    ls_feat_param[0],
                    ls_feat_param[2],
                    past_dt=past_dt,
                    dt_highlight=dt_highlight,
                    ax=ax_ls[i_source])
            plt.tight_layout()
            plt.savefig(
                os.path.join(
                    cfg_tds["fig_output_path"], "Feat_series_%i_%s.pdf" %
                    (pred_dt, TRT_cell["TRT_ID"])))
            plt.close()