Python get_model_input 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: coalition3.statlearn.inputprep

메소드/함수: get_model_input

hotexamples.com에서의 예제들: 7

Python get_model_input - 7개의 예제를 찾았습니다. 아래는 오픈소스 프로젝트에서 추출한 Python coalition3.statlearn.inputprep.get_model_input의 실제 사용 예제 중 평가가 가장 높은 것들입니다. 예제를 평가해 주시면 예제의 품질을 높이는 데 도움이 됩니다.

예제 #1

파일 보기

def get_mse_from_n_feat(df_nonnan_nonzerot0,
                        pred_dt,
                        cfg_tds,
                        model_path,
                        mod_bound=None,
                        mod_name="",
                        delete_RADAR_t0=False,
                        set_log_weight=False):
    """Score XGB models fitted with the n most important features.

    The function loads the pickled XGB model for lead time ``pred_dt`` from
    ``model_path``, ranks its features by gain, and either fits one model per
    entry of ``get_n_feat_arr(model="xgb")`` or loads the previously pickled
    list of such models. Each model is scored with ``mse_r2_n_feat`` on the
    test split; the resulting table is pickled and appended to
    ``MSE_feature_count_gain.h5`` in ``model_path``.

    Parameters:
        df_nonnan_nonzerot0: dataframe handed to ``ipt.get_model_input``.
            When ``set_log_weight`` is true an ``"s_weight"`` column is
            written into it in place.
        pred_dt: lead time, used in messages, column names and file names.
        cfg_tds: not used inside this function.
        model_path: directory the pickles / HDF5 file are read from and
            written to.
        mod_bound: forwarded as ``TRTRankt0_bound``; when not None,
            ``mod_name`` is prefixed with ``"_"`` and used as file suffix.
        mod_name: model name used in messages and file names.
        delete_RADAR_t0: selects ``X_feature_sel`` ``"no_radar_t0"`` instead
            of ``"all"``.
        set_log_weight: compute sample weights and forward the flag to
            ``fit_model_n_feat``.

    Returns:
        None. If the MSE pickle already exists the user is asked
        interactively whether the models should be fitted anew.
    """
    print("Get dependence of MSE on n features for lead time t0 + %imin" %
          pred_dt,
          end="")
    if mod_bound is None:
        print(" (for all samples)")
    else:
        print(" (for %s)" % mod_name)
        mod_name = "_%s" % mod_name
    sys.stdout.flush()

    ## Paths of all files touched below (suffix depends on the model name):
    mse_pkl_path = os.path.join(
        model_path, "MSE_feature_count_gain_%i%s.pkl" % (pred_dt, mod_name))
    full_model_path = os.path.join(
        model_path, "model_%i%s_t0diff_maxdepth6.pkl" % (pred_dt, mod_name))
    nfeat_models_path = os.path.join(
        model_path,
        "models_%i%s_t0diff_maxdepth6_nfeat.pkl" % (pred_dt, mod_name))
    mse_hdf_path = os.path.join(model_path, "MSE_feature_count_gain.h5")

    ## Ask the user only if MSE data is already on disk, else fit new models:
    calc_new_model = "y"
    if os.path.exists(mse_pkl_path):
        calc_new_model = ""
        while calc_new_model not in ("y", "n"):
            calc_new_model = raw_input(
                "  MSE data exists alreay, get new one? [y/n] ")

    ## Sample weights for XGB fitting (stored as column of the input frame):
    if set_log_weight:
        df_nonnan_nonzerot0["s_weight"] = feat.calc_sample_weight(
            df_nonnan_nonzerot0["TRT_Rank|0"],
            df_nonnan_nonzerot0["TRT_Rank_diff|%i" % pred_dt])

    ## Get train/test split, with or without RADAR variables at t0:
    print("  Delete rows with TRT Rank close to zero at lead time")
    if not delete_RADAR_t0:
        print("  Get predictor matrix X with RADAR variables at t0")
        X_feature_sel = "all"
    else:
        print("  Get predictor matrix X without RADAR variables at t0")
        X_feature_sel = "no_radar_t0"
    X_train, X_test, y_train, y_test = ipt.get_model_input(
        df_nonnan_nonzerot0,
        del_TRTeqZero_tpred=True,
        split_Xy_traintest=True,
        pred_dt=pred_dt,
        TRTRankt0_bound=mod_bound,
        check_for_nans=False,
        X_feature_sel=X_feature_sel)

    ## Load the XGB model fitted to all features:
    print("  Load XGBmodel")
    with open(full_model_path, "rb") as pkl_file:
        xgb_model = pickle.load(pkl_file)

    ## Rank features by gain, most important first:
    gain_scores = xgb_model.get_booster().get_score(importance_type='gain')
    gain_frame = pd.DataFrame.from_dict(gain_scores,
                                        orient="index",
                                        columns=["F_score"])
    top_features_gain = gain_frame.sort_values(by=['F_score'],
                                               ascending=False)

    ## Feature counts for which a model is fitted:
    n_feat_arr = get_n_feat_arr(model="xgb")

    ## Fit (and store) or load the models with n top features:
    if calc_new_model == "y":
        print("  Get models fitted with n top features")
        ls_models = []
        for n_feat in n_feat_arr:
            ls_models.append(
                fit_model_n_feat(X_train,
                                 y_train,
                                 top_features_gain,
                                 n_feat,
                                 n_feat_arr,
                                 set_log_weight=set_log_weight))
        print("    Save list of models as pickle to disk")
        with open(nfeat_models_path, "wb") as pkl_file:
            pickle.dump(ls_models, pkl_file, protocol=2)
    else:
        print("  Load existing models fitted with n top features")
        with open(nfeat_models_path, "rb") as pkl_file:
            ls_models = pickle.load(pkl_file)

    ## Score every model on the test split (first entry MSE, second R2):
    print("  Get mean square error of models with n features")
    MSE_r2_ls = []
    for n_feat, model in zip(n_feat_arr, ls_models):
        MSE_r2_ls.append(
            mse_r2_n_feat(X_test, y_test, top_features_gain, n_feat, model))
    mse_values = [score[0] for score in MSE_r2_ls]
    r2_values = [score[1] for score in MSE_r2_ls]
    df_mse_feat_count = pd.DataFrame.from_dict({
        "Feature Count": n_feat_arr,
        "MSE %imin%s" % (pred_dt, mod_name): mse_values,
        "R2 %imin%s" % (pred_dt, mod_name): r2_values
    })
    df_mse_feat_count = df_mse_feat_count.set_index("Feature Count")
    print("    Save dataframe with MSE to disk")
    with open(mse_pkl_path, "wb") as pkl_file:
        pickle.dump(df_mse_feat_count, pkl_file, protocol=2)

    ## Append the scores to the common HDF5 file:
    print("  Append MSE values to HDF5 file")
    df_mse_feat_count.to_hdf(mse_hdf_path,
                             key="MSE_%imin%s" % (pred_dt, mod_name),
                             mode="a",
                             format="t",
                             append=True)

예제 #2

파일 보기

def selected_model_fit(df_nonnan_nonzerot0,
                       pred_dt,
                       n_feat_ls,
                       cfg_tds,
                       model_path,
                       ls_mod_bound=None,
                       ls_model_names=None):
    """Get, store and return the XGB model with the selected feature count.

    For every (boundary, feature count, name) triple the model with
    ``n_feat`` top features is either taken from the pickled list of
    pre-fitted models (if ``n_feat`` is in ``get_n_feat_arr("xgb")``) or
    fitted anew with ``fit_model_n_feat``. The model is pickled to
    ``model_path`` and used to predict on the test split.

    Parameters:
        df_nonnan_nonzerot0: dataframe handed to ``ipt.get_model_input``.
        pred_dt: lead time, used in messages and file names.
        n_feat_ls: number of features per model (zipped with
            ``ls_mod_bound`` and ``ls_model_names``).
        cfg_tds: forwarded to ``plot_pred_vs_obs_core``.
        model_path: directory the model pickles are read from / written to.
        ls_mod_bound: list of boundaries forwarded as ``TRTRankt0_bound``;
            ``None`` (default) is treated as ``[None]``, i.e. one model
            for all samples.
        ls_model_names: list of model names; ``None`` (default) is treated
            as ``[""]``.

    Returns:
        The fitted model when a single model was requested.

    Raises:
        NotImplementedError: when more than one model boundary is given.
            The models are still fitted, saved and the combined plot is
            made before the exception is raised.
    """
    ## None sentinels instead of mutable default arguments:
    if ls_mod_bound is None:
        ls_mod_bound = [None]
    if ls_model_names is None:
        ls_model_names = [""]

    ## Results are only collected if several sub-models are combined:
    multi_model = len(ls_mod_bound) > 1
    if multi_model:
        y_test_ls = []
        TRT_diff_pred_ls = []

    for mod_bound, n_feat, mod_name in zip(ls_mod_bound, n_feat_ls,
                                           ls_model_names):
        print("\nGet selected XGB model for prediction of lead time %imin" %
              (pred_dt),
              end="")
        if mod_bound is not None:
            print(" (%i features for model '%s')" % (n_feat, mod_name))
            mod_name = "_%s" % mod_name
        else:
            print(" (%i features for all samples)" % n_feat)
        sys.stdout.flush()

        ## Delete rows with TRT Rank close to zero at lead time:
        print("  Delete rows with TRT Rank close to zero at lead time")
        X_train, X_test, y_train, y_test = ipt.get_model_input(
            df_nonnan_nonzerot0,
            del_TRTeqZero_tpred=True,
            split_Xy_traintest=True,
            pred_dt=pred_dt,
            TRTRankt0_bound=mod_bound,
            check_for_nans=False,
        )

        precalc_n_feat = get_n_feat_arr("xgb")

        if n_feat not in precalc_n_feat:
            ## No pre-fitted model with this feature count, fit a new one.
            ## Load XGBmodel fitted to all features:
            print("  Load XGBmodel")
            with open(
                    os.path.join(
                        model_path, "model_%i%s_t0diff_maxdepth6.pkl" %
                        (pred_dt, mod_name)), "rb") as pkl_file:
                xgb_model = pickle.load(pkl_file)

            ## Order features by importance (gain):
            top_features_gain = pd.DataFrame.from_dict(
                xgb_model.get_booster().get_score(importance_type='gain'),
                orient="index",
                columns=["F_score"]).sort_values(by=['F_score'],
                                                 ascending=False)

            ## Fit model:
            model = fit_model_n_feat(X_train, y_train, top_features_gain,
                                     n_feat, np.array(n_feat))
        else:
            ## Pick the pre-fitted model at the position of n_feat:
            with open(
                    os.path.join(
                        model_path, "models_%i%s_t0diff_maxdepth6_nfeat.pkl" %
                        (pred_dt, mod_name)), "rb") as pkl_file:
                model = pickle.load(pkl_file)[np.where(
                    precalc_n_feat == n_feat)[0][0]]

        ## Save the model to disk:
        model_saving_name = "model_%i%s_t0diff_maxdepth6_%ifeat_gain.pkl" % (
            pred_dt, mod_name, n_feat)
        with open(os.path.join(model_path, model_saving_name),
                  "wb") as pkl_file:
            pickle.dump(model, pkl_file, protocol=2)

        ## Get features:
        features = model.get_booster().feature_names

        ## Make prediction:
        TRT_diff_pred = model.predict(X_test[features])

        ## Append to list of results for combined plot:
        if multi_model:
            y_test_ls.append(y_test)
            TRT_diff_pred_ls.append(TRT_diff_pred)

    if multi_model:
        ## Make combined plot:
        y_test_combi = pd.concat(y_test_ls, axis=0)
        pred_gain_combi = np.concatenate(TRT_diff_pred_ls)
        plot_pred_vs_obs_core(y_test_combi, pred_gain_combi, pred_dt,
                              "_%s" % "|".join(ls_model_names), cfg_tds)

        ## Returning several sub-models is not supported:
        raise NotImplementedError(
            "Not yet implemented to used models fitted with "
            "TRT Rank subset for prediction, not returned")

    ## Return the single model:
    return model

예제 #3

파일 보기

## Get prediction leadtime from model:
## (ask again until a non-negative multiple of 5 is entered)
pred_dt = -1
while (pred_dt%5!=0 or pred_dt<0):
    pred_dt = int(raw_input("For which lead time should comparison be made? "))

## Get features of largest models (ANN and XGB)
top_features_gain = features = ls_models_xgb[-1].get_booster().feature_names
xgb_model  = ls_models_xgb[-1]
mlp_model  = ls_models_mlp[-1].best_estimator_

## Get scores for the following number of features:
n_feat_arr = fit.get_n_feat_arr("xgb")

## Get training and testing data (non-normalised for XGBoost model) and the scores:
X_train_nonnorm, X_test_nonnorm, y_train_nonnorm, \
        y_test_nonnorm = ipt.get_model_input(df_nonnan, del_TRTeqZero_tpred=True, split_Xy_traintest=True,
                                             pred_dt = pred_dt, X_normalise=False,check_for_nans=False,verbose=True)
pred_xgb = xgb_model.predict(X_test_nonnorm[features])
fit.plot_pred_vs_obs_core(y_test_nonnorm,pred_xgb,pred_dt,"_xgb1000",cfg_tds)
MSE_r2_ls_xgb = [fit.mse_r2_n_feat(X_test_nonnorm, y_test_nonnorm, top_features_gain, n_feat, model) for n_feat, model in zip(n_feat_arr[9:],ls_models_xgb[9:])]
## Free the non-normalised split before the normalised one is created:
del(X_train_nonnorm, X_test_nonnorm, y_train_nonnorm, y_test_nonnorm)

## Get training and testing data (normalised for ANN model) and the scores:
## (with X_normalise=True a fifth value, the scaler, is returned as well)
X_train_norm, X_test_norm, y_train_norm, \
    y_test_norm, scaler = ipt.get_model_input(df_nonnan, del_TRTeqZero_tpred=True, split_Xy_traintest=True,
                                              pred_dt = pred_dt, X_normalise=True,check_for_nans=False,verbose=True)
pred_mlp = mlp_model.predict(X_test_norm[features])
fit.plot_pred_vs_obs_core(y_test_norm,pred_mlp,pred_dt,"_mlp1000",cfg_tds)
## NOTE(review): unlike ls_models_xgb, ls_models_mlp is not sliced with [9:]
## here - confirm that it only holds models for n_feat_arr[9:].
MSE_r2_ls_mlp = [fit.mse_r2_n_feat(X_test_norm, y_test_norm, top_features_gain, n_feat, model) for n_feat, model in zip(n_feat_arr[9:],ls_models_mlp)]
del(X_train_norm, X_test_norm, y_train_norm, y_test_norm)

## Get scores into dataframe:

예제 #4

파일 보기

파일: feature.py 프로젝트: toowzh/coalition-3

def get_feature_importance(df_nonnan_nonzerot0,pred_dt,cfg_tds,model_path,mod_bound=None,
                           mod_name="",delete_RADAR_t0=False,set_log_weight=False,max_n_feat=60000):
    """Fit an XGB regressor (max_depth 6) to all features and plot its feature importance.

    The fitted model is pickled to ``model_path`` as
    ``model_<pred_dt><_mod_name>_t0diff_maxdepth6.pkl``. If that file already
    exists the user is asked interactively whether a new model should be
    fitted; on "n" the function returns without doing anything else.

    Parameters:
        df_nonnan_nonzerot0: dataframe handed to ``ipt.get_model_input``.
            When ``set_log_weight`` is true an ``"s_weight"`` column is
            written into it in place.
        pred_dt: lead time, used in messages and file names.
        cfg_tds: forwarded to ``plot_feature_importance``.
        model_path: directory the model pickle is written to.
        mod_bound: forwarded as ``TRTRankt0_bound``; must have length 2
            when given.
        mod_name: model name, required (non-empty) when ``mod_bound`` is given.
        delete_RADAR_t0: selects ``X_feature_sel`` ``"no_radar_t0"`` instead
            of ``"all"``.
        set_log_weight: fit with the ``"s_weight"`` column as sample weights.
        max_n_feat: maximum number of rows of X; larger inputs are
            down-sampled with a fixed random state.

    Returns:
        None.

    Raises:
        ValueError: if ``mod_bound`` is given without ``mod_name`` or does
            not have length 2.
    """
    print("Get features for lead time t0 + %imin" % pred_dt, end="")
    if mod_bound is None:
        print(" (for all samples)")
    else:
        if mod_name == "":
            raise ValueError("Model name required")
        print(" (for %s)" % mod_name)
        mod_name = "_%s" % mod_name
        if len(mod_bound) != 2:
            raise ValueError("Model boundary list must have length 2")
    sys.stdout.flush()

    ## Location of the model on disk:
    model_file = os.path.join(model_path, "model_%i%s_t0diff_maxdepth6.pkl" % (pred_dt, mod_name))

    ## Check whether model already exists (answer "n" keeps the existing one):
    if os.path.exists(model_file):
        use_existing = ""
        while use_existing not in ("y", "n"):
            use_existing = raw_input("  Model exists alreay, fit a new one? [y/n] ")
        if use_existing == "n":
            print("  Use existing one, return from this function")
            return

    ## Calculate sample weights for XGB fitting:
    if set_log_weight:
        rank_t0 = df_nonnan_nonzerot0["TRT_Rank|0"]
        rank_diff = df_nonnan_nonzerot0["TRT_Rank_diff|%i" % pred_dt]
        df_nonnan_nonzerot0["s_weight"] = calc_sample_weight(rank_t0, rank_diff)

    ## Get predictor matrix X and target y:
    print("  Delete rows with TRT Rank close to zero at lead time")
    if not delete_RADAR_t0:
        print("  Get predictor matrix X with RADAR variables at t0")
        X_feature_sel = "all"
    else:
        print("  Get predictor matrix X without RADAR variables at t0")
        X_feature_sel = "no_radar_t0"
    X, y = ipt.get_model_input(df_nonnan_nonzerot0,
                               del_TRTeqZero_tpred=True,
                               split_Xy=True,
                               pred_dt=pred_dt,
                               TRTRankt0_bound=mod_bound,
                               X_feature_sel=X_feature_sel)
    del df_nonnan_nonzerot0

    ## Reduce the number of rows if necessary (same random state for X and y):
    if len(X) > max_n_feat:
        print("   *** Warning: Dataframe X probably to big to be converted, reduced to %i rows! ***" % max_n_feat)
        X = X.sample(n=max_n_feat, random_state=42)
        y = y.sample(n=max_n_feat, random_state=42)

    ## Setup model:
    print("  Setup XGBmodel with max_depth = 6")
    xgb_model = xgb.XGBRegressor(max_depth=6, silent=False, n_jobs=6, nthreads=6)

    ## Take the sample weights out of the predictor matrix:
    s_weights = None
    if set_log_weight:
        s_weights = X["s_weight"].values
        X = X.drop(labels="s_weight", axis=1)

    ## Train model:
    print("  Train XGBmodel")
    d_start = dt.datetime.now()
    xgb_model.fit(X, y, verbose=True, sample_weight=s_weights)
    elapsed = dt.datetime.now() - d_start
    print("    Elapsed time for XGBoost model fitting: %s" % (elapsed))

    ## Save model to disk:
    print("  Save XGBmodel to disk")
    with open(model_file, "wb") as pkl_file:
        pickle.dump(xgb_model, pkl_file, protocol=2)

    ## Plot feature importance:
    print("  Plot feature importance")
    plot_feature_importance(xgb_model, X, pred_dt, cfg_tds, mod_name)

예제 #5

파일 보기

파일: script_XGB_featfit.py 프로젝트: toowzh/coalition-3

        ls_model_names.append(raw_input("  Please provide model name: "))
## Report the model boundaries; without any, a single model for all
## samples is configured (boundary None, empty name):
use_model_boundaries = len(ls_model_bound) > 0
if not use_model_boundaries:
    print("  Using all samples")
    ls_model_bound.append(None)
    ls_model_names.append("")
else:
    print("  Using model boundaries:")
    for bound, name in zip(ls_model_bound, ls_model_names):
        print("   Model '%s': %s" % (name, bound))

## Plot XGB model weights (to push importance of strong TRT cells which are not decreasing):
print("\nPlotting XGB model weights")
df_nonnan_nonzerot0t10 = ipt.get_model_input(df_nonnan_nonzerot0,
                                             pred_dt=10,
                                             del_TRTeqZero_tpred=True,
                                             check_for_nans=False)
feat.plot_XGB_model_weights(df_nonnan_nonzerot0t10, cfg_tds)
del df_nonnan_nonzerot0t10

## Let the user decide (after looking at the plot) whether to use weights:
use_XGB_model_weights = ""
while use_XGB_model_weights not in ("y", "n"):
    use_XGB_model_weights = raw_input(
        "Should XGB model weights be applied, see plot on disk [y/n]: ")
XGB_mod_weight = (use_XGB_model_weights == "y")
print("  Apply model weights" if XGB_mod_weight else "  Apply no model weights")

## Ask user whether Radar variables should be used at t0:

예제 #6

파일 보기

파일: script_ANN_fit.py 프로젝트: toowzh/coalition-3

## Ask for the input dataframe and the model output location:
path_to_df = pth.file_path_reader(
    "pandas training dataframe (nonnan)", user_argv_path)
model_path = pth.file_path_reader("model saving location")
print("\nLoading nonnan dataframe into RAM")
df_nonnan = pd.read_hdf(path_to_df, key="df_nonnan")

## Get lead-time from user:
ls_pred_dt = feat.get_pred_dt_ls(
    "the ANN fit", cfg_op["timestep"], cfg_op["n_integ"])

## Loop over time-deltas:
for pred_dt in ls_pred_dt:
    ## Get normalised training and testing data (scaler is returned as well):
    X_train, X_test, y_train, y_test, scaler = ipt.get_model_input(
        df_nonnan,
        pred_dt=pred_dt,
        X_normalise=True,
        split_Xy_traintest=True,
        del_TRTeqZero_tpred=True)

    ## Fit ANN model with all features but only two hidden layers (100, 50):
    print(
        "Fit ANN model with all features but only two hidden layers (100, 50)")
    mlp_allfeat = MLPRegressor(verbose=True, hidden_layer_sizes=(100, 50))
    mlp_allfeat.fit(X_train, y_train)

    ## Store the fitted model with the highest pickle protocol available.
    ## NOTE(review): mod_name is not assigned in the visible part of this
    ## script - confirm it is defined before this loop.
    with open(os.path.join(model_path,
                           "model_%i%s_t0diff_mlp_allfeat.pkl" %
                           (pred_dt, mod_name)), "wb") as file:
        pickle.dump(mlp_allfeat, file, protocol=pickle.HIGHEST_PROTOCOL)

예제 #7

파일 보기

파일: modeleval.py 프로젝트: toowzh/coalition-3

def make_model_evaluation(df_nonnan, model_path, ls_pred_dt, cfg_tds, cfg_op):
    """Evaluate the selected XGB models for all lead times and make the plots.

    For every lead time in ``ls_pred_dt`` the test split is obtained from
    ``ipt.get_model_input``, the XGB model fitted to all features and the
    selected predictive model are loaded, their features are cross-checked,
    predictions are made, and scatter plots, R^2 parameters and one set of
    Taylor-diagram markers are produced. After the loop the Taylor diagram is
    saved, the observed/predicted TRT Ranks are pickled to
    ``cfg_op["XGB_model_path"]``, skill scores are plotted, and time-series
    plots are made for a hard-coded list of case-study TRT IDs.

    Parameters:
        df_nonnan: dataframe handed to ``ipt.get_model_input`` and to the
            time-series plotting helpers.
        model_path: directory containing
            ``model_dict_t0diff_maxdepth6_selfeat_gain.pkl`` and the
            ``model_<pred_dt>_t0diff_maxdepth6.pkl`` files.
        ls_pred_dt: list of lead times to evaluate.
        cfg_tds: configuration forwarded to the plotting helpers; its
            ``"fig_output_path"`` entry is used for saving figures.
        cfg_op: configuration whose ``"XGB_model_path"`` entry is the
            location where ``TRT_Rank_obs_pred.pkl`` is written.

    Returns:
        None.

    Raises:
        ValueError: if the features of a predictive model are not the top
            features (by gain) of the model fitted with all features.
    """
    X_test_ls = []
    y_test_ls = []
    cmap_pred_dt = plt.cm.get_cmap('viridis_r')

    ## Import dictionary with selected models:
    train_path_name = os.path.join(
        model_path, "model_dict_t0diff_maxdepth6_selfeat_gain.pkl")
    with open(train_path_name, "rb") as file:
        dict_sel_model = pickle.load(file)

    ## All Taylor-diagram calls in the loop draw into this figure:
    plt.close()
    fig = plt.figure(num=1, figsize=(7, 6))

    ## Loop over lead times:
    for i, pred_dt in enumerate(ls_pred_dt):

        ## Initialise the result lists in the first iteration:
        if i == 0:
            xgb_model_ls = []
            pred_model_ls = []
            Rank_obs_ls = []
            top_features_ls = []
            df_param_ls_diff = []
            df_param_ls_rank = []
            df_param_ls_rank_PM = []
            df_param_ls_rank_pers = []
            Rank_pred_XGB_ls = []
            Rank_pred_XGB_PM_ls = []

        ## Re-use already collected test data if complete, else get it anew.
        ## NOTE(review): both lists start empty in this function and grow by
        ## one per iteration, so the first branch looks unreachable here.
        if len(X_test_ls) == len(ls_pred_dt) and len(y_test_ls) == len(
                ls_pred_dt):
            X_test = X_test_ls[i]
            y_test = y_test_ls[i]
        else:
            if i == 0:
                X_test_ls = []
                y_test_ls = []
            X_train, X_test, y_train, y_test = ipt.get_model_input(
                df_nonnan,
                del_TRTeqZero_tpred=True,
                split_Xy_traintest=True,
                X_normalise=False,
                pred_dt=pred_dt,
                check_for_nans=False,
                verbose=True)
            ## Only the test split is needed for the evaluation:
            del (X_train, y_train)
            X_test_ls.append(X_test)
            y_test_ls.append(y_test)

        ## Load XGB model fitted to all features:
        with open(
                os.path.join(model_path,
                             "model_%i_t0diff_maxdepth6.pkl" % pred_dt),
                "rb") as file:
            xgb_model_feat = pickle.load(file)
        xgb_model_ls.append(xgb_model_feat)

        ## Order features by importance (gain), most important first:
        top_features = pd.DataFrame.from_dict(
            xgb_model_feat.get_booster().get_score(importance_type='gain'),
            orient="index",
            columns=["F_score"]).sort_values(by=['F_score'], ascending=False)
        top_features_ls.append(top_features)

        ## Get specific predictive model for this leadtime:
        pred_model = dict_sel_model["pred_mod_%i" % pred_dt]
        pred_model_ls.append(pred_model)

        ## Check that features agree:
        features_pred_model = pred_model.get_booster().feature_names
        n_features = len(features_pred_model)
        if set(features_pred_model) != set(top_features.index[:n_features]):
            raise ValueError(
                "Features of predictive model and top features of model fitted with all features do not agree"
            )

        ## Make prediction of TRT Rank differences:
        TRT_diff_pred = pred_model.predict(X_test[features_pred_model])

        ## Get set of different TRT Rank predictions:
        Rank_obs, Rank_pred_XGB, Rank_pred_XGB_PM, Rank_pred_pers, Rank_pred_pers_PM, \
            Rank_pred_diff, Diff_pred_XGB = get_obs_fcst_TRT_Rank(X_test["TRT_Rank|0"], TRT_diff_pred, y_test, X_test["TRT_Rank|-5"])
        Rank_obs_ls.append(Rank_obs)
        Rank_pred_XGB_ls.append(Rank_pred_XGB)
        Rank_pred_XGB_PM_ls.append(Rank_pred_XGB_PM)

        ## Plot scatterplots obs vs. predicted:
        plot_pred_vs_obs_core(y_test,
                              Diff_pred_XGB.values,
                              pred_dt,
                              "_XGB%i" % n_features,
                              cfg_tds,
                              outtype="TRT_Rank_diff")
        plot_pred_vs_obs_core(Rank_obs,
                              Rank_pred_XGB.values,
                              pred_dt,
                              "_XGB%i" % n_features,
                              cfg_tds,
                              outtype="TRT_Rank")
        plot_pred_vs_obs_core(Rank_obs,
                              Rank_pred_XGB_PM.values,
                              pred_dt,
                              "_XGB%i-ProbMatch" % n_features,
                              cfg_tds,
                              outtype="TRT_Rank")
        plot_pred_vs_obs_core(Rank_obs,
                              Rank_pred_pers.values,
                              pred_dt,
                              "_Pers",
                              cfg_tds,
                              outtype="TRT_Rank")
        plot_pred_vs_obs_core(Rank_obs,
                              Rank_pred_pers_PM.values,
                              pred_dt,
                              "_Pers-ProbMatch",
                              cfg_tds,
                              outtype="TRT_Rank")
        plot_pred_vs_obs_core(Rank_obs,
                              Rank_pred_diff.values,
                              pred_dt,
                              "_ConstDiff",
                              cfg_tds,
                              outtype="TRT_Rank")

        ## Calculate different term elements for R^2 / Brier Score calculation:
        df_param_ls_diff.append(
            get_R2_param(y_test.values, Diff_pred_XGB.values))
        df_param_ls_rank.append(
            get_R2_param(Rank_obs.values, Rank_pred_XGB.values))
        df_param_ls_rank_PM.append(
            get_R2_param(Rank_obs.values, Rank_pred_XGB_PM.values))
        df_param_ls_rank_pers.append(
            get_R2_param(Rank_obs.values, Rank_pred_pers.values))

        ## Calculate statistics for Taylor Diagram:
        ## (stat_pred_pred_diff is only referenced in the commented-out
        ## lines below, stat_pred_pred_pers_PM is not used afterwards)
        stat_pred_XGB = sm.taylor_statistics(predicted=Rank_pred_XGB.values,
                                             reference=Rank_obs.values)
        stat_pred_XGB_PM = sm.taylor_statistics(
            predicted=Rank_pred_XGB_PM.values, reference=Rank_obs.values)
        stat_pred_pred_pers = sm.taylor_statistics(
            predicted=Rank_pred_pers.values, reference=Rank_obs.values)
        stat_pred_pred_diff = sm.taylor_statistics(
            predicted=Rank_pred_diff.values, reference=Rank_obs.values)
        stat_pred_pred_pers_PM = sm.taylor_statistics(
            predicted=Rank_pred_pers_PM.values, reference=Rank_obs.values)

        ## Collect four markers: observation, XGB, XGB (PM) and persistence.
        ## Presumably index 0 holds the reference and index 1 the predicted
        ## statistic - TODO confirm against the SkillMetrics documentation.
        sdev = np.array([
            stat_pred_XGB['sdev'][0], stat_pred_XGB['sdev'][1],
            stat_pred_XGB_PM['sdev'][1], stat_pred_pred_pers['sdev'][1]
        ])
        crmsd = np.array([
            stat_pred_XGB['crmsd'][0], stat_pred_XGB['crmsd'][1],
            stat_pred_XGB_PM['crmsd'][1], stat_pred_pred_pers['crmsd'][1]
        ])
        ccoef = np.array([
            stat_pred_XGB['ccoef'][0], stat_pred_XGB['ccoef'][1],
            stat_pred_XGB_PM['ccoef'][1], stat_pred_pred_pers['ccoef'][1]
        ])
        #sdev  = np.array([stat_pred_XGB['sdev'][0], stat_pred_XGB['sdev'][1], stat_pred_XGB_PM['sdev'][1], stat_pred_pred_pers['sdev'][1], stat_pred_pred_diff['sdev'][1]])
        #crmsd = np.array([stat_pred_XGB['crmsd'][0], stat_pred_XGB['crmsd'][1], stat_pred_XGB_PM['crmsd'][1], stat_pred_pred_pers['crmsd'][1], stat_pred_pred_diff['crmsd'][1]])
        #ccoef = np.array([stat_pred_XGB['ccoef'][0], stat_pred_XGB['ccoef'][1], stat_pred_XGB_PM['ccoef'][1], stat_pred_pred_pers['ccoef'][1], stat_pred_pred_diff['ccoef'][1]])

        ## Plot Taylor Diagram:
        ## Marker colour encodes the lead time (alpha channel set to 0.8):
        col_point = cmap_pred_dt(float(i) / len(ls_pred_dt))
        col_point = (col_point[0], col_point[1], col_point[2], 0.8)

        ## The first lead time creates the diagram, later ones are overlaid;
        ## the last lead time gets the legend-style marker labels:
        plot_markerLabel = ["Obs", "+%imin" % pred_dt, "", ""]
        plot_markerLabelColor = "black"
        if i == 0:
            plot_markerLegend = 'on'
            plot_overlay = 'off'
        else:
            plot_markerLegend = "on"
            plot_overlay = 'on'
            #plot_markerLabelColor = None
            if i == len(ls_pred_dt) - 1:
                plot_markerLabelColor = None
                plot_markerLabel = ["Obs", "XGB", "XGB (PM)", "Persistance"]

        ## Standard deviations are normalised by the first (observation) entry:
        sm.taylor_diagram(
            sdev / sdev[0],
            crmsd,
            ccoef,
            styleOBS='-',
            colOBS='darkred',
            markerobs='o',
            titleOBS='Obs',
            markerLabel=plot_markerLabel,
            markerLabelColor=plot_markerLabelColor,
            alpha=0.1,
            markerColor=col_point,
            markerLegend=plot_markerLegend,
            axismax=1.2,
            markerSize=5,
            colRMS='grey',
            styleRMS='--',
            widthRMS=0.8,
            rincRMS=0.25,
            tickRMS=np.arange(0.25, 1.5, 0.25),  #titleRMSangle = 110,
            colSTD='grey',
            styleSTD='-.',
            widthSTD=0.8,
            colCOR='grey',
            styleCOR=':',
            widthCOR=0.8,
            overlay=plot_overlay)

    ## Save Taylor Diagram:
    get_time_delta_colorbar(fig, ls_pred_dt, cmap_pred_dt,
                            [0.7, 0.5, 0.05, 0.3])
    plt.savefig(
        os.path.join(cfg_tds["fig_output_path"], "Taylor_Diagram_cmap.pdf"))
    plt.close()

    ## Plot histogram showing the effect of probability matching:
    ## (the histogram call is commented out, only the dataframe with one
    ## column per lead time is built)
    print(
        "Save dataframe with observed, predicted, and predicted & PM TRT Ranks"
    )
    Rank_obs_df = pd.concat(Rank_obs_ls, axis=1, sort=True)
    Rank_obs_df.columns = [
        "TRT_Rank_obs|%i" % pred_dt for pred_dt in ls_pred_dt
    ]
    Rank_pred_XGB_df = pd.concat(Rank_pred_XGB_ls, axis=1, sort=True)
    Rank_pred_XGB_df.columns = [
        "TRT_Rank_pred|%i" % pred_dt for pred_dt in ls_pred_dt
    ]
    Rank_pred_XGB_PM_df = pd.concat(Rank_pred_XGB_PM_ls, axis=1, sort=True)
    Rank_pred_XGB_PM_df.columns = [
        "TRT_Rank_pred_PM|%i" % pred_dt for pred_dt in ls_pred_dt
    ]
    #plot_hist_probmatch(Rank_pred_XGB_df, Rank_pred_XGB_PM_df)
    Rank_obs_pred_df = pd.concat(
        [Rank_obs_df, Rank_pred_XGB_df, Rank_pred_XGB_PM_df],
        axis=1,
        sort=True)

    ## Get dataframe with observed, predicted, and predicted & PM TRT Ranks for operational PM:
    ## (an existing file at this location is overwritten)
    op_path_name = os.path.join(cfg_op["XGB_model_path"],
                                "TRT_Rank_obs_pred.pkl")
    with open(op_path_name, "wb") as file:
        pickle.dump(Rank_obs_pred_df, file, protocol=2)
    print("  saved dict to 'XGB_model_path' location:\n    %s" % op_path_name)
    prt_txt = """
    ---------------------------------------------------------------------------------
        The file 'TRT_Rank_obs_pred.pkl' in the
        directory '%s'
        is now used for the operational probability matching procedure, be aware of
        that!
    ---------------------------------------------------------------------------------\n""" % (
        cfg_op["XGB_model_path"])
    print(prt_txt)

    ## Plot skill scores as function of lead-time:
    ## (one row per lead time, indexed by the lead time)
    df_R2_param_rank = pd.concat(df_param_ls_rank,
                                 axis=0).set_index(np.array(ls_pred_dt))
    df_R2_param_rank_PM = pd.concat(df_param_ls_rank_PM,
                                    axis=0).set_index(np.array(ls_pred_dt))
    df_R2_param_diff = pd.concat(df_param_ls_diff,
                                 axis=0).set_index(np.array(ls_pred_dt))
    df_R2_param_rank_pers = pd.concat(df_param_ls_rank_pers,
                                      axis=0).set_index(np.array(ls_pred_dt))
    plot_stats(df_R2_param_rank, "TRT_Rank", cfg_tds)
    plot_stats(df_R2_param_diff, "TRT_Rank_diff", cfg_tds)
    plot_stats_nice(df_R2_param_rank, "TRT_Rank", cfg_tds)
    plot_stats_nice(df_R2_param_diff, "TRT_Rank_diff", cfg_tds)
    plot_stats_nice(df_R2_param_rank_pers, "TRT_Rank_pers", cfg_tds)
    plot_stats_nice(df_R2_param_rank_PM, "TRT_Rank_PM", cfg_tds)

    ## Print IDs of long TRT cells in testing dataset:
    print(
        "\nThese are the IDs of long TRT cells (>25 time steps) in the testing dataset:"
    )
    ## Count how often each ID occurs in the index of the last test set.
    ## The first 13 characters of each index value are dropped - presumably
    ## a time-stamp prefix in front of the TRT ID, TODO confirm.
    TRT_ID = X_test_ls[-1].index
    TRT_ID = [TRT_ID_i[13:] for TRT_ID_i in TRT_ID.values]
    TRT_ID_count = Counter(TRT_ID)
    TRT_ID_count_sort = [
        (k, TRT_ID_count[k])
        for k in sorted(TRT_ID_count, key=TRT_ID_count.get, reverse=True)
    ]
    TRT_ID_count_sort_pd = pd.DataFrame(np.array(TRT_ID_count_sort),
                                        columns=["TRT_ID", "Count"])
    ## NOTE(review): 'inplace' does not look like a documented argument of
    ## Series.astype - confirm against the pandas version in use.
    TRT_ID_count_sort_pd["Count"] = TRT_ID_count_sort_pd["Count"].astype(
        np.uint16, inplace=True)
    TRT_ID_long = TRT_ID_count_sort_pd.loc[TRT_ID_count_sort_pd["Count"] > 25]
    print(TRT_ID_long)

    TRT_ID_casestudy = [
        "2018080721250094", "2018080721300099", "2018080711400069",
        "2018080710200036"
    ]
    print("  Making analysis for TRT IDs (hardcoded!): %s" % TRT_ID_casestudy)

    ## Only case-study cells which are also long cells in the test set:
    TRT_ID_long_sel = TRT_ID_long.loc[TRT_ID_long['TRT_ID'].isin(
        TRT_ID_casestudy)]
    ## One feature string per data source (column); the rows are picked
    ## below by the position of the lead time in [10, 20, 30]:
    df_feature_ts_plot = pd.DataFrame.from_dict({
        "Radar":
        ["CZC_lt57dBZ|-45|SUM", "CZC_lt57dBZ|-45|SUM", "CZC_lt57dBZ|-45|SUM"],
        "Satellite": [
            "IR_097_stat|-20|PERC05", "IR_097_stat|-15|PERC01",
            "IR_097_stat|-20|MIN"
        ],
        "COSMO": [
            "CAPE_MU_stat|-10|PERC50", "CAPE_MU_stat|-5|PERC75",
            "CAPE_ML_stat|0|SUM"
        ],
        "Lightning": [
            "THX_densIC_stat|-30|SUM", "THX_curr_pos_stat|-40|SUM",
            "THX_curr_pos_stat|-30|SUM"
        ]
    })
    for i_sel in range(len(TRT_ID_long_sel)):
        print("    Working on cell %s" % TRT_ID_long_sel.iloc[i_sel]["TRT_ID"])
        ## Time series of the predictions, without and with probability
        ## matching (PM):
        plot_pred_time_series(TRT_ID_long_sel.iloc[i_sel], df_nonnan,
                              Rank_pred_XGB_ls, ls_pred_dt, cfg_tds)
        plot_pred_time_series(TRT_ID_long_sel.iloc[i_sel],
                              df_nonnan,
                              Rank_pred_XGB_PM_ls,
                              ls_pred_dt,
                              cfg_tds,
                              path_addon="PM",
                              title_addon=" (PM)")

        plot_var_time_series_dt0_multiquant(TRT_ID_long_sel.iloc[i_sel],
                                            df_nonnan, cfg_tds)

        ## One 2x2 figure (one panel per data source) for each of the three
        ## hard-coded lead times:
        for i_pred_dt, pred_dt in enumerate([10, 20, 30]):
            fig = plt.figure(figsize=[10, 6])
            ax_rad = fig.add_subplot(2, 2, 1)
            ax_sat = fig.add_subplot(2, 2, 2)
            ax_cos = fig.add_subplot(2, 2, 3)
            ax_thx = fig.add_subplot(2, 2, 4)
            ax_ls = [ax_rad, ax_sat, ax_cos, ax_thx]
            #fig, axes = plt.subplots(2,2)
            #fig.set_size_inches(8,6)
            for i_source, source in enumerate(
                ["Radar", "Satellite", "COSMO", "Lightning"]):
                ## Split the feature string at "|" into three parts; the
                ## second one is an integer used as dt_highlight:
                ls_feat_param = df_feature_ts_plot[source].iloc[
                    i_pred_dt].split("|")
                ## If that integer is 0 only time step 0 is plotted, else
                ## the steps -45, -40, ..., -5:
                past_dt = np.arange(-45, 0,
                                    5) if int(ls_feat_param[1]) != 0 else [0]
                ax_ls[i_source] = plot_var_time_series(
                    TRT_ID_long_sel.iloc[i_sel],
                    df_nonnan,
                    ls_feat_param[0],
                    ls_feat_param[2],
                    past_dt=past_dt,
                    dt_highlight=int(ls_feat_param[1]),
                    ax=ax_ls[i_source])
            plt.tight_layout()
            plt.savefig(
                os.path.join(
                    cfg_tds["fig_output_path"], "Feat_series_%i_%s.pdf" %
                    (pred_dt, TRT_ID_long_sel.iloc[i_sel]["TRT_ID"])))
            plt.close()