Example #1
def train_logit_cv(model, event, predictors, predictors_logit, N):

    #Given a list of predictors (predictors_logit), train a logistic regression model N times
    # with cross-validation, and output test stats.
    #Also output conditional and threshold test stats (e.g. using total totals as a threshold),
    #	although the details of these tests must be changed in run_logit()

    import matplotlib.pyplot as plt

    pool = multiprocessing.Pool()

    if model == "era5":
        pss_df, df_aws, df_sta = optimise_pss("/g/data/eg3/ab4502/ExtremeWind/points/"+\
         "era5_allvars_2005_2018.pkl", T=1000, compute=False, l_thresh=2,\
         is_pss="hss", model_name="era5")
    elif model == "barra":
        pss_df, df_aws, df_sta = optimise_pss("/g/data/eg3/ab4502/ExtremeWind/points/"+\
         "barra_allvars_2005_2018.pkl", T=1000, compute=False, l_thresh=2,\
         is_pss="hss", model_name="barra")
    else:
        raise ValueError("Invalid model name")

    test_cond = True
    test_param = True
    normalised = False
    if event == "is_sta":
        iterable = itertools.product(np.arange(0,N), \
         [df_sta[np.append(predictors,event)]], \
         [predictors], [predictors_logit], [normalised], [test_cond], [test_param])
    elif event == "is_conv_aws":
        iterable = itertools.product(np.arange(0,N), \
         [df_aws[np.append(predictors,event)]], \
         [predictors], [predictors_logit], [normalised], [test_cond], [test_param])
    print("Training Logit...")
    res = pool.map(run_logit, iterable)

    csi_logit = [r[0] for r in res]
    pod_logit = [r[1] for r in res]
    far_logit = [r[2] for r in res]
    csi_cond = [r[3] for r in res]
    pod_cond = [r[4] for r in res]
    far_cond = [r[5] for r in res]
    csi_param = [r[6] for r in res]
    pod_param = [r[7] for r in res]
    far_param = [r[8] for r in res]

    print("Mean CSI (logit):", np.mean(csi_logit))
    print("Mean POD (logit):", np.mean(pod_logit))
    print("Mean FAR (logit):", np.mean(far_logit))

    plt.figure()
    plt.boxplot([csi_logit, csi_cond, csi_param],
                labels=["logit", "cond", "t_totals"])
    plt.figure()
    plt.boxplot([pod_logit, pod_cond, pod_param],
                labels=["logit", "cond", "t_totals"])
    plt.figure()
    plt.boxplot([far_logit, far_cond, far_param],
                labels=["logit", "cond", "t_totals"])
    plt.show()
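A minimal usage sketch, assuming the module-level imports (numpy as np, multiprocessing, itertools) and the helper functions optimise_pss() and run_logit() used above are available; the predictor names here are illustrative placeholders, not the repository's tuned sets:

#Hypothetical call: evaluate a five-variable ERA5 model for measured convective gusts
#over 100 cross-validated resamples
predictors = np.array(["ebwd", "lr13", "ml_el", "Umean03", "rhmin03"])
train_logit_cv("era5", "is_conv_aws", predictors, predictors, N=100)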
Example #2
def plot_roc():

	_, era5_aws, era5_sta = optimise_pss("/g/data/eg3/ab4502/ExtremeWind/points/"+\
			"era5_allvars_v3_2005_2018.pkl", T=1000, compute=False, l_thresh=2,\
			is_pss="hss", model_name="era5_v5")
	_, barra_aws, barra_sta = optimise_pss("/g/data/eg3/ab4502/ExtremeWind/points/"+\
			"barra_allvars_v3_2005_2018.pkl", T=1000, compute=False, l_thresh=2,\
			is_pss="hss", model_name="barra_fc_v5")

	barra_aws_preds = ["ebwd","lr13","ml_el","Umean03","rhmin03"]
	barra_sta_preds = ["ebwd","Umean800_600","lr13","rhmin13","ml_el"]
	era5_aws_preds = ["ebwd","Umean800_600","lr13","rhmin13","srhe_left","q_melting","eff_lcl"]
	era5_sta_preds = ["ml_cape","Umean06","ebwd","lr13"]
	logit = LogisticRegression(class_weight="balanced", solver="liblinear",\
		max_iter=1000)
	era5_aws.loc[:,"logit"] = logit.fit(era5_aws[era5_aws_preds], era5_aws["is_conv_aws"]).predict_proba(era5_aws[era5_aws_preds])[:,1]
	era5_sta.loc[:,"logit"] = logit.fit(era5_sta[era5_sta_preds], era5_sta["is_sta"]).predict_proba(era5_sta[era5_sta_preds])[:,1]
	barra_aws.loc[:,"logit"] = logit.fit(barra_aws[barra_aws_preds], barra_aws["is_conv_aws"]).predict_proba(barra_aws[barra_aws_preds])[:,1]
	barra_sta.loc[:,"logit"] = logit.fit(barra_sta[barra_sta_preds], barra_sta["is_sta"]).predict_proba(barra_sta[barra_sta_preds])[:,1]

	plt.close()
	plt.figure(figsize=[10,8])
	matplotlib.rcParams.update({'font.size': 12})
	plot_roc_fn(barra_aws, "t_totals", "is_conv_aws", -22000, 2000, -200, 60, 2, 2, 1, "BARRA Measured", "T-Totals")
	plot_roc_fn(barra_sta, "mlcape*s06", "is_sta", -6000, 1000, -200, 200, 2, 2, 2, "BARRA Reported", "MLCS6")
	plot_roc_fn(era5_aws, "t_totals", "is_conv_aws", -20000, 2000, -200, 60, 2, 2, 3, "ERA5 Measured", "T-Totals")
	plot_roc_fn(era5_sta, "dcp", "is_sta", -1000, 750, -200, 200, 2, 2, 4, "ERA5 Reported", "DCP")
	plt.savefig("figA5.eps", bbox_inches="tight", dpi=300)
Example #3
def plot_roc_barra_logit_aws(cv=True):

	_, barra_aws, barra_sta = optimise_pss("/g/data/eg3/ab4502/ExtremeWind/points/"+\
			"barra_allvars_v3_2005_2018.pkl", T=1000, compute=False, l_thresh=2,\
			is_pss="hss", model_name="barra_fc_v5")

	barra_aws = add_logit(barra_aws, ["ebwd"], "EBWD",cv)
	barra_aws = add_logit(barra_aws, ["ebwd","lr13"], "EBWD+LR13",cv)
	barra_aws = add_logit(barra_aws, ["ebwd","lr13","ml_el"], "EBWD+LR13+MLEL",cv)
	barra_aws = add_logit(barra_aws, ["ebwd","lr13","ml_el","Umean03"], "EBWD+LR13+MLEL+Umean03",cv)
	barra_aws = add_logit(barra_aws, ["ebwd","lr13","ml_el","Umean03","rhmin03"], "EBWD+LR13+MLEL+Umean03+RHMIN03",cv)
	barra_aws = add_logit(barra_aws, ["ebwd","lr13","Umean03"], "EBWD+LR13+Umean03",cv)

	plt.figure(figsize=[10,8])
	matplotlib.rcParams.update({'font.size': 12})
	temp_roc_plot(barra_aws,"EBWD",plt.get_cmap("Blues")(0.3),cv)
	temp_roc_plot(barra_aws,"EBWD+LR13",plt.get_cmap("Blues")(0.45),cv)
	temp_roc_plot(barra_aws,"EBWD+LR13+MLEL",plt.get_cmap("Blues")(0.6),cv)
	temp_roc_plot(barra_aws,"EBWD+LR13+MLEL+Umean03",plt.get_cmap("Blues")(0.75),cv)
	temp_roc_plot(barra_aws,"EBWD+LR13+MLEL+Umean03+RHMIN03",plt.get_cmap("Blues")(0.9),cv)
	temp_roc_plot(barra_aws,"EBWD+LR13+Umean03",plt.get_cmap("Blues")(0.99),cv)
	temp_roc_plot(barra_aws,"mlcape*s06",plt.get_cmap("Reds")(0.2),cv)
	temp_roc_plot(barra_aws,"eff_sherb",plt.get_cmap("Reds")(0.4),cv)
	temp_roc_plot(barra_aws,"dcp",plt.get_cmap("Reds")(0.6),cv)
	temp_roc_plot(barra_aws,"t_totals",plt.get_cmap("Reds")(0.8),cv)
    
	plt.legend(loc="lower right", fontsize="x-small")
	plt.axhline(0.667, color="k", linestyle="--")
	if cv:
		#Note: the "quality" kwarg was removed in Matplotlib 3.6; JPEG quality is passed via pil_kwargs
		plt.savefig("roc_cv.jpg", bbox_inches="tight", pil_kwargs={"quality": 95})
	else:
		plt.savefig("roc.jpg", bbox_inches="tight", pil_kwargs={"quality": 95})
Example #4
def fwd_selection(model, event, pval_choice):

    #Perform the following procedure:
    #   1) Load the model data for the given model and event
    #   2) Create an intercept-only model using statsmodels
    #   3) For each variable, add the variable to the model and assess the p-value
    #   4) Accept a model for the next round either by using the lowest p-value (pval_choice==True), or by using the highest HSS (pval_choice==False)

    print("INFO: Forward selection of variables for " + model + " using " +
          event)

    #Load diagnostics/events
    if model == "era5":
        pss_df, df_aws, df_sta = optimise_pss("/g/data/eg3/ab4502/ExtremeWind/points/"+\
         "era5_allvars_v2_2005_2018.pkl", T=1000, compute=False, l_thresh=2,\
         is_pss="hss", model_name="era5_v3")
    elif model == "barra":
        pss_df, df_aws, df_sta = optimise_pss("/g/data/eg3/ab4502/ExtremeWind/points/"+\
         "barra_allvars_2005_2018.pkl", T=1000, compute=False, l_thresh=2,\
         is_pss="hss", model_name="barra_fc_v3")
    else:
        raise ValueError("Invalid model name")

    #Set the correct dataframe based on event type
    if event == "is_sta":
        df = df_sta.reset_index().drop(columns="index")
    elif event == "is_conv_aws":
        df = df_aws.reset_index().drop(columns="index")

    #Test predictors are all "variables" available
    preds = np.array(['ml_cape', 'mu_cape', 'sb_cape',\
         'ml_cin', 'sb_cin', 'mu_cin', 'ml_lcl', 'mu_lcl', 'sb_lcl', 'eff_cape',\
         'eff_cin', 'eff_lcl', 'lr01', 'lr03', 'lr13', 'lr36', 'lr24', 'lr_freezing',\
         'lr_subcloud', 'qmean01', 'qmean03', 'qmean06', 'qmeansubcloud', 'q_melting',\
         'q1', 'q3', 'q6', 'rhmin01', 'rhmin03', 'rhmin13', 'rhminsubcloud', 'tei', 'wbz',\
         'mhgt', 'mu_el', 'ml_el', 'sb_el', 'eff_el', 'pwat', \
         'te_diff', 'dpd850', 'dpd700', 'dcape', 'ddraft_temp', 'sfc_thetae',\
         'srhe_left', 'srh01_left', 'srh03_left', 'srh06_left', 'ebwd', 's010', 's06',\
         's03', 's01', 's13', 's36', 'scld', 'U500', 'U10', 'U1', 'U3', 'U6', 'Ust_left',\
         'Usr01_left', 'Usr03_left', 'Usr06_left', 'Uwindinf', 'Umeanwindinf',\
         'Umean800_600', 'Umean06', 'Umean01', 'Umean03', 'wg10' ])
    if model == "era5":
        preds = np.append(preds, "cp")

    #Initialise things
    from plot_param import resample_events
    from statsmodels.tools.tools import add_constant
    from statsmodels.discrete.discrete_model import Logit
    import warnings
    warnings.simplefilter("ignore")
    logit = LogisticRegression(class_weight="balanced",
                               solver="liblinear",
                               max_iter=1000)
    pool = multiprocessing.Pool()
    N = 1000
    np.random.seed(seed=0)

    #Train an intercept-only baseline model with statsmodels
    mod = Logit(df[event], add_constant(df[preds])["const"]).fit()

    #Train an intercept-only model with sklearn and get the baseline HSS
    logit_mod = logit.fit(add_constant(df[preds])[["const"]], df[event])
    df["predict"] = logit_mod.predict_proba(
        add_constant(df[preds])[["const"]])[:, 1]
    iterable = itertools.product(np.linspace(0, 1,
                                             100), [df[["predict", event]]],
                                 ["predict"], [event], ["hss"])
    res2 = pool.map(pss, iterable)
    current_hss = np.max([res2[i][0] for i in np.arange(len(res2))])

    statsmod_preds = []
    statsmod_hss = []
    alph = 0.05
    is_pval = True  #Keep track of overall progress (i.e. whether or not to continue)
    while is_pval:
        pval_ls = []  #Keep track of the p-value of each individual added param
        is_pval_ls = []  #Keep track of whether all coefficients in the added-parameter model are significant
        hss_ls = []  #Keep track of the HSS
        hss_thresh = []  #Keep track of the HSS threshold
        for p in tqdm.tqdm(preds):
            if p not in statsmod_preds:
                mod = Logit(df[event], add_constant(df[statsmod_preds +
                                                       [p]])).fit(disp=False)
                param_pval = mod.summary2().tables[1].loc[p, "P>|z|"]
                pval = mod.summary2().tables[1].loc[:, "P>|z|"]
                pval_ls.append(param_pval)
                is_pval_ls.append(all(pval <= alph))
                if not pval_choice:
                    logit_mod = logit.fit(df[statsmod_preds + [p]], df[event])
                    df["predict"] = logit_mod.predict_proba(df[statsmod_preds +
                                                               [p]])[:, 1]
                    iterable = itertools.product(np.linspace(0, 1, 100),
                                                 [df[["predict", event]]],
                                                 ["predict"], [event], ["hss"])
                    res2 = pool.map(pss, iterable)
                    hss_ls.append(
                        np.max([res2[i][0] for i in np.arange(len(res2))]))
            else:
                pval_ls.append(1)
                is_pval_ls.append(False)
                hss_ls.append(0)
        #If using p-values to decide which variable to add, then choose the one with the minimum p-value
        if pval_choice:
            if (min(pval_ls) <= alph) & (is_pval_ls[np.argmin(pval_ls)]):
                is_pval = True
                statsmod_preds.append(preds[np.argmin(pval_ls)])
                print("INFO: There are " + str(np.sum(is_pval_ls)) +
                      " new models which add value based on p-value")
                print("INFO: The min p-value is " + str(np.min(pval_ls)) +
                      " based on " + preds[np.argmin(pval_ls)])
            else:
                print("INFO: Stopping at " + str(len(statsmod_preds)) +
                      " variables")
                is_pval = False
        #Else, use the optimised HSS to decide (note that a different module is used to fit the model)
        else:
            #If there is at least one predictor with a higher HSS and a significant coef.
            #(cast to arrays: comparing a plain list with a float raises a TypeError)
            if any((np.array(hss_ls) > current_hss) & np.array(is_pval_ls)):
                #Remove variables which add HSS but don't have a significant coef.
                for z in np.arange(len(is_pval_ls)):
                    if not is_pval_ls[z]:
                        hss_ls[z] = 0
                #Calculate bootstrapped HSS, and get the upper 5%
                if len(statsmod_preds) >= 1:
                    print("Bootstrapping the HSS to get confidence...")
                    logit_mod = logit.fit(df[statsmod_preds], df[event])
                    df["predict"] = logit_mod.predict_proba(
                        df[statsmod_preds])[:, 1]
                    iterable = itertools.product(np.linspace(0, 1, 100),
                                                 [df[["predict", event]]],
                                                 ["predict"], [event], ["hss"])
                    res2 = pool.map(pss, iterable)
                    hss_temp = [r[0] for r in res2]
                    hss_thresh = [r[1] for r in res2][np.argmax(hss_temp)]
                    hss_boot = []
                    event_ind, non_inds = resample_events(
                        df, event, N, df[event].sum())
                    for i in tqdm.tqdm(np.arange(N)):
                        iterable = itertools.product([hss_thresh],\
                         [df.iloc[np.append(event_ind[i], non_inds[i])][["predict", event]]],\
                         ["predict"], [event], ["hss"])
                        res2 = pool.map(pss, iterable)
                        hss_boot.append(res2[0][0])
                else:
                    hss_boot = [0]
                #If the HSS of the most skillful predictor is greater than the 95th percentile of the bootstrapped HSS, then select the predictor and keep going.
                #Else, halt the procedure
                if np.max(hss_ls) >= np.percentile(hss_boot, 95):
                    is_pval = True
                    statsmod_preds.append(preds[np.argmax(hss_ls)])
                    statsmod_hss.append(np.max(hss_ls))
                    print("INFO: There are "+str(np.sum((hss_ls > np.percentile(hss_boot, 95)) &\
                     (is_pval_ls)))+" new models which add value based on HSS and p-values")
                    print("INFO: The max HSS is " + str(np.max(hss_ls)) +
                          " based on " + preds[np.argmax(hss_ls)])
                    current_hss = max(hss_ls)
                else:
                    is_pval = False
                    print("INFO: Stopping at " + str(len(statsmod_preds)) +
                          " variables")
            else:
                is_pval = False
                print("INFO: Stopping at " + str(len(statsmod_preds)) +
                      " variables")

    #Now save the output
    logit_mod = logit.fit(df[statsmod_preds], df[event])
    mod = Logit(df[event], add_constant(df[statsmod_preds])).fit(disp=False)
    pval = mod.summary2().tables[1].loc[:, "P>|z|"]
    out_df = pd.DataFrame(
        {
            "coef": np.squeeze(logit_mod.coef_),
            #statsmod_hss is only populated in the HSS branch; pad with NaN otherwise
            "non_cv_hss": statsmod_hss if len(statsmod_hss) == len(statsmod_preds) else np.nan
        },
        index=statsmod_preds)
    out_df.loc["const", "coef"] = logit_mod.intercept_[0]
    out_df.loc["const", "non_cv_hss"] = 0
    out_df.loc[:, "p-val"] = pval
    if pval_choice:
        pval_str = "pval"
    else:
        pval_str = "hss"
    out_df.to_csv(
        "/g/data/eg3/ab4502/ExtremeWind/skill_scores/logit_fwd_sel_" + model +
        "_" + event + "_" + pval_str + "_v2.csv")
Example #5
def logit_predictor_test(model, event, preds, param, n_splits):

    #Test the performance of a set of predictors (preds) relative to a single diagnostic (param) with an optimised threshold
    #Output the mean HSS (over n_splits CVs), the absolute HSS (trained/tested on the same dataset), and the corresponding thresholds

    import warnings
    np.random.seed(seed=0)
    warnings.simplefilter("ignore")

    #Load diagnostics/events
    if model == "era5":
        pss_df, df_aws, df_sta = optimise_pss("/g/data/eg3/ab4502/ExtremeWind/points/"+\
         "era5_allvars_v2_2005_2018.pkl", T=1000, compute=False, l_thresh=2,\
         is_pss="hss", model_name="era5_v3")
    elif model == "barra":
        pss_df, df_aws, df_sta = optimise_pss("/g/data/eg3/ab4502/ExtremeWind/points/"+\
         "barra_allvars_2005_2018.pkl", T=1000, compute=False, l_thresh=2,\
         is_pss="hss", model_name="barra_fc_v3")
    else:
        raise ValueError("Invalid model name")

    #Set the correct dataframe based on event type
    if event == "is_sta":
        df = df_sta
    elif event == "is_conv_aws":
        df = df_aws

    train_dfs = []
    test_dfs = []
    split = StratifiedShuffleSplit(n_splits=n_splits,
                                   test_size=0.8,
                                   random_state=0)
    #Note the swapped names: with test_size=0.8, the larger (80%) partition is used for training
    for test_index, train_index in split.split(X=df, y=df[event]):
        train_dfs.append(df.iloc[train_index, :])
        test_dfs.append(df.iloc[test_index, :])

    res = []
    thresh = []
    param_thresh = []
    param_res = []
    test_thresh = np.linspace(df.loc[:, param].min(),
                              np.percentile(df.loc[:, param], 99.95), 1000)
    pool = multiprocessing.Pool()
    for i in tqdm.tqdm(np.arange(len(train_dfs))):
        temp_hss, _, temp_thresh, _ = logit_train(
            [i, train_dfs, test_dfs, preds, event])
        res.append(temp_hss)
        thresh.append(temp_thresh)

        iterable = itertools.product(test_thresh, [test_dfs[i]], [param],
                                     [event], ["hss"])
        res2 = pool.map(pss, iterable)
        temp_param_thresh = [res2[i][1] for i in np.arange(len(res2))]
        pss_p = [res2[i][0] for i in np.arange(len(res2))]
        param_res.append(np.max(pss_p))
        param_thresh.append(temp_param_thresh[np.argmax(pss_p)])

    hss_cv = np.mean(res)
    hss_min = np.min(res)
    hss_max = np.max(res)
    avg_thresh = np.mean(thresh)
    hss_param = np.mean(param_res)
    hss_param_max = np.max(param_res)
    hss_param_min = np.min(param_res)
    hss, _, opt_thresh, _ = logit_train([0, [df], [df], preds, event])
    print("logit_cv: ", hss_cv, param + " cv: ", np.mean(param_res),
          "logit total: ", hss)
    return hss, hss_cv, hss_min, hss_max, avg_thresh, opt_thresh, hss_param, param, hss_param_min, hss_param_max, np.mean(
        param_thresh)
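A hypothetical call (the predictor names are illustrative), unpacking the first few returned statistics:

out = logit_predictor_test("barra", "is_conv_aws",
                           ["ebwd", "lr13", "ml_el", "Umean03"], "t_totals", 16)
hss, hss_cv, hss_min, hss_max = out[0:4]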
Example #6
def colin_test():

    #Test the collinearity of the logistic regression predictor sets using VIFs (variance inflation factors)
    from sklearn.metrics import r2_score
    from scipy.stats import spearmanr

    #BARRA
    logit = LogisticRegression(class_weight="balanced", solver="liblinear")
    pss_df, df_aws, df_sta = optimise_pss("/g/data/eg3/ab4502/ExtremeWind/points/"+\
     "barra_allvars_2005_2018.pkl", T=1000, compute=False, l_thresh=2,\
     is_pss="hss", model_name="barra_fc_v3")
    #Convective AWS
    event = "is_conv_aws"
    preds = ["eff_lcl", "U1", "sb_cape", "lr13", "rhmin03", "lr36", "eff_cin"]
    vifs = [vif(np.array(df_aws[preds]), i) for i in np.arange(len(preds))]
    logit_mod = logit.fit(df_aws[preds], df_aws[event])
    df1 = pd.DataFrame({
        "VIF": vifs,
        "coefs": np.squeeze(logit_mod.coef_)
    },
                       index=preds)

    preds = ["eff_lcl", "U1", "sb_cape", "lr13", "rhmin03", "eff_cin"]
    vifs = [vif(np.array(df_aws[preds]), i) for i in np.arange(len(preds))]
    logit_mod = logit.fit(df_aws[preds], df_aws[event])
    df2 = pd.DataFrame({
        "VIF": vifs,
        "coefs": np.squeeze(logit_mod.coef_)
    },
                       index=preds)

    preds = ["ml_el", "Umean06", "lr36", "rhmin13", "dcape"]
    vifs = [vif(np.array(df_aws[preds]), i) for i in np.arange(len(preds))]
    logit_mod = logit.fit(df_aws[preds], df_aws[event])
    df3 = pd.DataFrame({
        "VIF": vifs,
        "coefs": np.squeeze(logit_mod.coef_)
    },
                       index=preds)

    preds = ["ml_el", "Umean06", "rhmin13", "dcape"]
    vifs = [vif(np.array(df_aws[preds]), i) for i in np.arange(len(preds))]
    logit_mod = logit.fit(df_aws[preds], df_aws[event])
    df4 = pd.DataFrame({
        "VIF": vifs,
        "coefs": np.squeeze(logit_mod.coef_)
    },
                       index=preds)

    (pd.concat([df1, df2], axis=1)).to_csv(
        "/g/data/eg3/ab4502/ExtremeWind/skill_scores/vif_barra_aws.csv",
        float_format="%.2e")
    print(pd.concat([df1, df2], axis=1))

    #Test CV HSS scores
    #preds = ["eff_lcl","U1","sb_cape","lr13","rhmin03","eff_cin"]
    #barra_aws = logit_predictor_test("barra", "is_conv_aws", preds, "t_totals", 16)

    #STA (use the STA dataframe and event here, rather than the AWS ones)
    preds = ["ml_cape", "Umean06", "eff_lcl", "scld"]
    vifs = [vif(np.array(df_sta[preds]), i) for i in np.arange(len(preds))]
    logit_mod = logit.fit(df_sta[preds], df_sta["is_sta"])
    df1 = pd.DataFrame({
        "VIF": vifs,
        "coefs": np.squeeze(logit_mod.coef_)
    },
                       index=preds)

    print(df1)

    #ERA5
    pss_df, df_aws, df_sta = optimise_pss("/g/data/eg3/ab4502/ExtremeWind/points/"+\
     "era5_allvars_v2_2005_2018.pkl", T=1000, compute=False, l_thresh=2,\
     is_pss="hss", model_name="era5")
    #Convective AWS
    preds = ["ml_el", "Umean03", "eff_lcl", "dpd700", "lr36", "rhmin01"]
    vifs = [vif(np.array(df_aws[preds]), i) for i in np.arange(len(preds))]
    logit_mod = logit.fit(df_aws[preds], df_aws[event])
    df1 = pd.DataFrame({
        "VIF": vifs,
        "coefs": np.squeeze(logit_mod.coef_)
    },
                       index=preds)
    preds = ["ml_el", "Umean03", "eff_lcl", "dpd700", "rhmin01"]
    vifs = [vif(np.array(df_aws[preds]), i) for i in np.arange(len(preds))]
    logit_mod = logit.fit(df_aws[preds], df_aws[event])
    df2 = pd.DataFrame({
        "VIF": vifs,
        "coefs": np.squeeze(logit_mod.coef_)
    },
                       index=preds)
    (pd.concat([df1, df2], axis=1)).to_csv(
        "/g/data/eg3/ab4502/ExtremeWind/skill_scores/vif_era5_aws.csv",
        float_format="%.2e")
    print(pd.concat([df1, df2], axis=1))
    #Test CV HSS scores
    #preds = ["ml_el","Umean03","eff_lcl","dpd700","rhmin01"]
    #era5_aws = logit_predictor_test("era5", "is_sta", preds, "t_totals", 16)

    #STA (again using the STA dataframe and event)
    preds = ["ml_cape", "Umean06", "srhe_left", "lr13"]
    vifs = [vif(np.array(df_sta[preds]), i) for i in np.arange(len(preds))]
    logit_mod = logit.fit(df_sta[preds], df_sta["is_sta"])
    df1 = pd.DataFrame({
        "VIF": vifs,
        "coefs": np.squeeze(logit_mod.coef_)
    },
                       index=preds)
    print(df1)
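The vif() called above is presumably statsmodels' variance_inflation_factor (the (array, column_index) call signature matches). A self-contained illustration on synthetic collinear data:

import numpy as np
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif

rng = np.random.default_rng(0)
x1 = rng.normal(size=200)
x2 = x1 + 0.1 * rng.normal(size=200)  #nearly collinear with x1
x3 = rng.normal(size=200)             #independent
X = np.column_stack([x1, x2, x3])

#VIFs near 1 indicate little collinearity; x1 and x2 should be flagged with large values
print([round(vif(X, i), 1) for i in range(X.shape[1])])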
Example #7
def rfe_selection_custom(event, model, K=5):

    #Use recursive feature elimination to find the N most important variables for logistic
    # regression

    #Load diagnostics and events
    if model == "era5":
        pss_df, df_aws, df_sta = optimise_pss("/g/data/eg3/ab4502/ExtremeWind/points/"+\
         "era5_allvars_2005_2018.pkl", T=1000, compute=False, l_thresh=2,\
         is_pss="hss", model_name="era5")
    elif model == "barra":
        pss_df, df_aws, df_sta = optimise_pss("/g/data/eg3/ab4502/ExtremeWind/points/"+\
         "barra_allvars_2005_2018.pkl", T=1000, compute=False, l_thresh=2,\
         is_pss="hss", model_name="barra")
    else:
        raise ValueError("Invalid model name")

    #Get preds by taking the unique diagnostics which are in the top 20 ranked HSS for each type
    # of event (lightning, SCW given lightning, STA, AWS)
    preds = np.empty(0)
    for i in [
            "pss_light", "pss_conv_aws", "pss_sta", "pss_conv_aws_cond_light"
    ]:
        preds = np.append(
            preds,
            pss_df.sort_values(i, ascending=False).index[0:20].values)
    preds = np.unique(preds)

    logit = LogisticRegression(class_weight="balanced",
                               solver="liblinear",
                               max_iter=1000)

    pool = multiprocessing.Pool()
    train_dfs = []
    test_dfs = []
    if event == "is_sta":
        train_dfs = []
        test_dfs = []
        split = StratifiedShuffleSplit(n_splits=K,
                                       test_size=0.8,
                                       random_state=0)
        for test_index, train_index in split.split(X=df_sta, y=df_sta[event]):
            train_dfs.append(df_sta.iloc[train_index, :])
            test_dfs.append(df_sta.iloc[test_index, :])
    elif event == "is_conv_aws":
        split = StratifiedShuffleSplit(n_splits=K,
                                       test_size=0.8,
                                       random_state=0)
        for test_index, train_index in split.split(X=df_aws, y=df_aws[event]):
            train_dfs.append(df_aws.iloc[train_index, :])
            test_dfs.append(df_aws.iloc[test_index, :])

    #Clip the data at the 99.9 and 0.1 percentiles
    if event == "is_sta":
        upper = np.percentile(df_sta[preds], 99.9, axis=0)
        lower = np.percentile(df_sta[preds], 0.1, axis=0)
        for p in preds:
            df_sta.loc[df_sta.loc[:, p] >= (upper[preds == p])[0],
                       p] = upper[preds == p]
            df_sta.loc[df_sta.loc[:, p] <= (lower[preds == p])[0],
                       p] = lower[preds == p]
    elif event == "is_conv_aws":
        upper = np.percentile(df_aws[preds], 99.9, axis=0)
        lower = np.percentile(df_aws[preds], 0.1, axis=0)
        for p in preds:
            df_aws.loc[df_aws.loc[:, p] >= (upper[preds == p])[0],
                       p] = upper[preds == p]
            df_aws.loc[df_aws.loc[:, p] <= (lower[preds == p])[0],
                       p] = lower[preds == p]

    scaler = RobustScaler()
    #DO THIS RECURSIVELY
    ###
    old_hss = -1
    new_hss = 0

    hss_all = []
    eliminated_preds = []
    #while new_hss > old_hss:
    while len(preds) > 1:
        temp_hss = new_hss

        coefs_list = []
        hss_list_ext = []
        for i in np.arange(K):
            print(i)
            mod = logit.fit(scaler.fit_transform(train_dfs[i][preds]),
                            train_dfs[i][event])
            #Transform the test fold with the scaler fitted on the training fold
            probs = mod.predict_proba(scaler.transform(
                test_dfs[i][preds]))[:, 1]

            hss_list_int = []
            for t in np.linspace(0, 1, 100):
                hits = float(((test_dfs[i][event] == 1) & (probs > t)).sum())
                misses = float(
                    ((test_dfs[i][event] == 1) & (probs <= t)).sum())
                fa = float(((test_dfs[i][event] == 0) & (probs > t)).sum())
                cn = float(((test_dfs[i][event] == 0) & (probs <= t)).sum())
                if (hits / (hits + misses)) > 0.66:
                    hss = ( 2*(hits*cn - misses*fa) ) / \
                     ( misses*misses + fa*fa + 2*hits*cn + (misses + fa) * \
                     (hits + cn) )
                else:
                    hss = 0
                hss_list_int.append(hss)
            coefs_list.append(np.squeeze(mod.coef_))
            hss_list_ext.append(np.max(hss_list_int))
        #Record the least important predictor (smallest mean absolute coefficient), then remove it
        drop_pred = preds[np.argmin(abs(np.stack(coefs_list).mean(axis=0)))]
        eliminated_preds.append(drop_pred)
        preds = preds[preds != drop_pred]
        new_hss = np.mean(hss_list_ext)
        #old_hss = temp_hss
        hss_all.append(new_hss)

    ###

    #Save the elimination order and the cross-validated HSS at each step
    #(there is no fitted RFE object here, unlike in rfe_selection())
    pd.DataFrame({"eliminated_preds": eliminated_preds, "hss": hss_all}).\
     to_csv("/g/data/eg3/ab4502/ExtremeWind/points/logit_preds_ranking_"+\
     model+"_"+event+".pkl", index=False)
Example #8
def rfe_selection(event, model, cv):

    #Use recursive feature elimination to find the N most important variables for logistic
    # regression

    #Load reanalysis data at stations, which has already been combined with event data
    if model == "era5":
        pss_df, df_aws, df_sta = optimise_pss("/g/data/eg3/ab4502/ExtremeWind/points/"+\
         "era5_allvars_2005_2018.pkl", T=1000, compute=False, l_thresh=2,\
         is_pss="hss", model_name="era5")
    elif model == "barra":
        pss_df, df_aws, df_sta = optimise_pss("/g/data/eg3/ab4502/ExtremeWind/points/"+\
         "barra_allvars_2005_2018_2.pkl", T=1000, compute=False, l_thresh=2,\
         is_pss="hss", model_name="barra_fc")
    else:
        raise ValueError("Invalid model name")

    #Get preds by taking the unique diagnostics which are in the top 20 ranked HSS for each type
    # of event (lightning, SCW given lightning, STA, AWS)
    #preds = np.empty(0)
    #for i in ["pss_light","pss_conv_aws","pss_sta","pss_conv_aws_cond_light"]:
    #	preds = np.append(preds,pss_df.sort_values(i, ascending=False).index[0:20].values)
    #preds = np.unique(preds)
    preds = np.array(['ml_cape', 'mu_cape', 'sb_cape',\
         'ml_cin', 'sb_cin', 'mu_cin', 'ml_lcl', 'mu_lcl', 'sb_lcl', 'eff_cape',\
         'eff_cin', 'eff_lcl', 'lr01', 'lr03', 'lr13', 'lr36', 'lr24', 'lr_freezing',\
         'lr_subcloud', 'qmean01', 'qmean03', 'qmean06', 'qmeansubcloud', 'q_melting',\
         'q1', 'q3', 'q6', 'rhmin01', 'rhmin03', 'rhmin13', 'rhminsubcloud', 'tei', 'wbz',\
         'mhgt', 'mu_el', 'ml_el', 'sb_el', 'eff_el', 'pwat', \
         'te_diff', 'dpd850', 'dpd700', 'dcape', 'ddraft_temp', 'sfc_thetae',\
         'srhe_left', 'srh01_left', 'srh03_left', 'srh06_left', 'ebwd', 's010', 's06',\
         's03', 's01', 's13', 's36', 'scld', 'U500', 'U10', 'U1', 'U3', 'U6', 'Ust_left',\
         'Usr01_left', 'Usr03_left', 'Usr06_left', 'Uwindinf', 'Umeanwindinf',\
         'Umean800_600', 'Umean06', 'Umean01', 'Umean03', 'wg10' ])

    #Clip the data at the 99.9 and 0.1 percentiles
    if event == "is_sta":
        upper = np.percentile(df_sta[preds], 99.9, axis=0)
        lower = np.percentile(df_sta[preds], 0.1, axis=0)
        for p in preds:
            df_sta.loc[df_sta.loc[:, p] >= (upper[preds == p])[0],
                       p] = upper[preds == p]
            df_sta.loc[df_sta.loc[:, p] <= (lower[preds == p])[0],
                       p] = lower[preds == p]
    elif event == "is_conv_aws":
        upper = np.percentile(df_aws[preds], 99.9, axis=0)
        lower = np.percentile(df_aws[preds], 0.1, axis=0)
        for p in preds:
            df_aws.loc[df_aws.loc[:, p] >= (upper[preds == p])[0],
                       p] = upper[preds == p]
            df_aws.loc[df_aws.loc[:, p] <= (lower[preds == p])[0],
                       p] = lower[preds == p]

    scaler = MinMaxScaler()
    logit = LogisticRegression(class_weight="balanced",
                               solver="liblinear",
                               max_iter=1000)

    if cv:
        rfecv = RFECV(estimator=logit, step=1, verbose=10,\
         scoring="roc_auc",n_jobs=-1,cv=5)
        if event == "is_sta":
            rfecv = rfecv.fit(scaler.fit_transform(df_sta[preds]),
                              df_sta[event])
        elif event == "is_conv_aws":
            rfecv = rfecv.fit(scaler.fit_transform(df_aws[preds]),
                              df_aws[event])
        pd.DataFrame({"preds":preds,"ranking":rfecv.ranking_, "support":rfecv.support_}).\
         to_csv("/g/data/eg3/ab4502/ExtremeWind/points/logit_preds_ranking_cv_"+\
         model+"_"+event+".pkl", index=False)
    else:
        rfe = RFE(estimator=logit, step=1, verbose=10,\
         n_features_to_select=1)
        if event == "is_sta":
            rfe = rfe.fit(scaler.fit_transform(df_sta[preds]), df_sta[event])
        elif event == "is_conv_aws":
            rfe = rfe.fit(scaler.fit_transform(df_aws[preds]), df_aws[event])
        pd.DataFrame({"preds":preds,"ranking":rfe.ranking_, "support":rfe.support_}).\
         to_csv("/g/data/eg3/ab4502/ExtremeWind/points/logit_preds_ranking_"+\
         model+"_"+event+".pkl", index=False)
Example #9
def logit_test(all_predictors, model, event, model_diagnostics=None):

    #Try every possible combination of the "all_predictors" predictors, and
    #	train a logistic model using cross-validation
    #10 variables = 1013 combinations (subset sizes 2 to 10)

    #Load diagnostics/events
    if model == "era5":
        pss_df, df_aws, df_sta = optimise_pss("/g/data/eg3/ab4502/ExtremeWind/points/"+\
         "era5_allvars_v2_2005_2018.pkl", T=1000, compute=False, l_thresh=2,\
         is_pss="hss", model_name="era5_v2")
    elif model == "barra":
        pss_df, df_aws, df_sta = optimise_pss("/g/data/eg3/ab4502/ExtremeWind/points/"+\
         "barra_allvars_2005_2018_2.pkl", T=1000, compute=False, l_thresh=2,\
         is_pss="hss", model_name="barra_fc")
    else:
        raise ValueError("Invalid model name")

    #Set the correct dataframe based on event type
    if event == "is_sta":
        df = df_sta
    elif event == "is_conv_aws":
        df = df_aws

    #Create 16 unique train/test datasets with balanced number of events
    pool = multiprocessing.Pool()
    train_dfs = []
    test_dfs = []
    split = StratifiedShuffleSplit(n_splits=16, test_size=0.8, random_state=0)
    for test_index, train_index in split.split(X=df, y=df[event]):
        train_dfs.append(df.iloc[train_index, :])
        test_dfs.append(df.iloc[test_index, :])

    #For each combination of variables, calculate the optimal pss/hss, and save the
    # scores and thresholds
    param_out = []
    hss_thresh_out = []
    pss_thresh_out = []
    hss_out = []
    pss_out = []
    for r in np.arange(2, 11):
        params = itertools.combinations(all_predictors, r)
        for predictors in params:
            print(predictors)
            start = dt.datetime.now()

            iterable = itertools.product(np.arange(16),\
             [train_dfs], [test_dfs], [predictors], [event])
            res = pool.map(logit_train, iterable)

            param_out.append(" ".join(predictors))
            hss_out.append(np.mean([res[i][0] for i in np.arange(16)]))
            pss_out.append(np.mean([res[i][1] for i in np.arange(16)]))
            hss_thresh_out.append(np.mean([res[i][2] for i in np.arange(16)]))
            pss_thresh_out.append(np.mean([res[i][3] for i in np.arange(16)]))

    pd.DataFrame({"predictors":param_out, "hss":hss_out, "pss":pss_out,\
     "pss_thresh":pss_thresh_out, "hss_thresh":hss_thresh_out}).\
     to_csv("/g/data/eg3/ab4502/ExtremeWind/points/logit_skill_"+model+"_v2_"+event+".csv",\
     index=False)

    #Now compute the same scores for the individual diagnostics, if any were given
    if model_diagnostics is not None:
        for p in model_diagnostics:
            print(p)
            hss_thresh_out = []
            hss_out = []
            for i in np.arange(len(test_dfs)):
                test_thresh = np.linspace(df.loc[:,p].min(), \
                 np.percentile(df.loc[:,p],99.95), 1000)
                iterable = itertools.product(test_thresh, [test_dfs[i]], [p],\
                 [event], ["hss"])
                res = pool.map(pss, iterable)
                thresh = [res[i][1] for i in np.arange(len(res))]
                pss_p = [res[i][0] for i in np.arange(len(res))]

                hss_out.append(np.max(pss_p))
                hss_thresh_out.append(thresh[np.argmax(np.array(pss_p))])
            pd.DataFrame({"predictors":[p], "hss":[np.mean(hss_out)], \
             "hss_thresh":[np.mean(hss_thresh_out)]}).\
             to_csv("/g/data/eg3/ab4502/ExtremeWind/points/"+p+\
              "_skill_"+model+"_"+event+".csv",\
             index=False)
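A hypothetical call (the predictor pool is illustrative); note that the number of fitted combinations grows combinatorially with the size of all_predictors:

logit_test(["ebwd", "lr13", "ml_el", "Umean03", "rhmin03"],
           "era5", "is_conv_aws", model_diagnostics=["t_totals", "dcp"])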
Example #10
def run_logit():

    #Fit fixed logistic regression models for BARRA and ERA5, and compare their optimised
    # HSS and thresholds with those of standard diagnostics (t_totals and dcp)

    #BARRA
    logit = LogisticRegression(class_weight="balanced", solver="liblinear")
    pss_df, df_aws, df_sta = optimise_pss("/g/data/eg3/ab4502/ExtremeWind/points/"+\
     "barra_allvars_2005_2018_2.pkl", T=1000, compute=False, l_thresh=2,\
     is_pss="hss", model_name="barra_fc")
    #Convective AWS
    preds = ["lr36", "lr_freezing", "ml_el", "s06", "srhe_left", "Umean06"]
    event = "is_conv_aws"
    p = "t_totals"
    logit_mod = logit.fit(df_aws[preds], df_aws[event])
    df_aws["logit"] = logit_mod.predict_proba(df_aws[preds])[:, 1]
    res = [
        pss([t, df_aws, "logit", event, "hss"])
        for t in np.linspace(0, 1, 100)
    ]
    hss = [res[i][0] for i in np.arange(len(res))]
    hss_thresh = [res[i][1] for i in np.arange(len(res))]
    hss_thresh_logit = hss_thresh[np.argmax(hss)]
    hss_logit = np.max(hss)
    res = [pss([t, df_aws, p, event, "hss"]) for t in \
     np.linspace(np.percentile(df_sta.loc[:,p],50),\
         np.percentile(df_sta.loc[:,p],99.5),100)]
    hss = [res[i][0] for i in np.arange(len(res))]
    hss_thresh = [res[i][1] for i in np.arange(len(res))]
    hss_thresh_p = hss_thresh[np.argmax(hss)]
    hss_p = np.max(hss)
    print("BARRA Conv AWS")
    print(p, "hss: ", hss_p, "thresh: ", hss_thresh_p)
    print("logit", "hss: ", hss_logit, "hss_thresh: ", hss_thresh_logit)
    #STA
    #preds = ["lr36","lr_freezing","mhgt","ml_el","s06","srhe_left","Umean06"]
    preds = ['qmean06', 'pwat', 'qmean01', 'sb_lcl', 'ddraft_temp']
    event = "is_sta"
    p = "dcp"
    logit_mod = logit.fit(df_aws[preds], df_aws[event])
    df_aws["logit"] = logit_mod.predict_proba(df_aws[preds])[:, 1]
    res = [
        pss([t, df_aws, "logit", event, "hss"])
        for t in np.linspace(0, 1, 100)
    ]
    hss = [res[i][0] for i in np.arange(len(res))]
    hss_thresh = [res[i][1] for i in np.arange(len(res))]
    hss_thresh_logit = hss_thresh[np.argmax(hss)]
    hss_logit = np.max(hss)
    res = [pss([t, df_aws, p, event, "hss"]) for t in \
     np.linspace(np.percentile(df_sta.loc[:,p],50),\
         np.percentile(df_sta.loc[:,p],99.5),100)]
    hss = [res[i][0] for i in np.arange(len(res))]
    hss_thresh = [res[i][1] for i in np.arange(len(res))]
    hss_thresh_p = hss_thresh[np.argmax(hss)]
    hss_p = np.max(hss)
    print("BARRA STA")
    print(p, "hss: ", hss_p, "thresh: ", hss_thresh_p)
    print("logit", "hss: ", hss_logit, "hss_thresh: ", hss_thresh_logit)

    #ERA5
    pss_df, df_aws, df_sta = optimise_pss("/g/data/eg3/ab4502/ExtremeWind/points/"+\
     "era5_allvars_2005_2018.pkl", T=1000, compute=False, l_thresh=2,\
     is_pss="hss", model_name="era5")
    #Convective AWS
    preds = ["lr36", "mhgt", "ml_el", "qmean01", "srhe_left", "Umean06"]
    event = "is_conv_aws"
    p = "t_totals"
    logit_mod = logit.fit(df_aws[preds], df_aws[event])
    df_aws["logit"] = logit_mod.predict_proba(df_aws[preds])[:, 1]
    res = [
        pss([t, df_aws, "logit", event, "hss"])
        for t in np.linspace(0, 1, 100)
    ]
    hss = [res[i][0] for i in np.arange(len(res))]
    hss_thresh = [res[i][1] for i in np.arange(len(res))]
    hss_thresh_logit = hss_thresh[np.argmax(hss)]
    hss_logit = np.max(hss)
    res = [pss([t, df_aws, p, event, "hss"]) for t in \
     np.linspace(np.percentile(df_sta.loc[:,p],50),\
         np.percentile(df_sta.loc[:,p],99.5),100)]
    hss = [res[i][0] for i in np.arange(len(res))]
    hss_thresh = [res[i][1] for i in np.arange(len(res))]
    hss_thresh_p = hss_thresh[np.argmax(hss)]
    hss_p = np.max(hss)
    print("ERA5 Conv AWS")
    print(p, "hss: ", hss_p, "thresh: ", hss_thresh_p)
    print("logit", "hss: ", hss_logit, "hss_thresh: ", hss_thresh_logit)
    #STA
    preds = ["lr36", "ml_cape", "srhe_left", "Umean06"]
    event = "is_sta"
    p = "dcp"
    logit_mod = logit.fit(df_aws[preds], df_aws[event])
    df_aws["logit"] = logit_mod.predict_proba(df_aws[preds])[:, 1]
    res = [
        pss([t, df_aws, "logit", event, "hss"])
        for t in np.linspace(0, 1, 100)
    ]
    hss = [res[i][0] for i in np.arange(len(res))]
    hss_thresh = [res[i][1] for i in np.arange(len(res))]
    hss_thresh_logit = hss_thresh[np.argmax(hss)]
    hss_logit = np.max(hss)
    res = [pss([t, df_aws, p, event, "hss"]) for t in \
     np.linspace(np.percentile(df_sta.loc[:,p],50),\
         np.percentile(df_sta.loc[:,p],99.5),100)]
    hss = [res[i][0] for i in np.arange(len(res))]
    hss_thresh = [res[i][1] for i in np.arange(len(res))]
    hss_thresh_p = hss_thresh[np.argmax(hss)]
    hss_p = np.max(hss)
    print("ERA5 STA")
    print(p, "hss: ", hss_p, "thresh: ", hss_thresh_p)
    print("logit", "hss: ", hss_logit, "hss_thresh: ", hss_thresh_logit)
Example #11
def rfe_selection(event, model, cv):

    #Use recursive feature elimination to find the N most important variables for logistic
    # regression

    #Load reanalysis data at stations, which has already been combined with event data
    if model == "era5":
        pss_df, df_aws, df_sta = optimise_pss("/g/data/eg3/ab4502/ExtremeWind/points/"+\
         "era5_allvars_2005_2018.pkl", T=1000, compute=False, l_thresh=2,\
         is_pss="hss", model_name="era5")
    elif model == "barra":
        pss_df, df_aws, df_sta = optimise_pss("/g/data/eg3/ab4502/ExtremeWind/points/"+\
         "barra_allvars_2005_2018.pkl", T=1000, compute=False, l_thresh=2,\
         is_pss="hss", model_name="barra")
    else:
        raise ValueError("Invalid model name")

    #Get preds by taking the unique diagnostics which are in the top 20 ranked HSS for each type
    # of event (lightning, SCW given lightning, STA, AWS)
    preds = np.empty(0)
    for i in [
            "pss_light", "pss_conv_aws", "pss_sta", "pss_conv_aws_cond_light"
    ]:
        preds = np.append(
            preds,
            pss_df.sort_values(i, ascending=False).index[0:20].values)
    preds = np.unique(preds)

    #Clip the data at the 99.9 and 0.1 percentiles
    if event == "is_sta":
        upper = np.percentile(df_sta[preds], 99.9, axis=0)
        lower = np.percentile(df_sta[preds], 0.1, axis=0)
        for p in preds:
            df_sta.loc[df_sta.loc[:, p] >= (upper[preds == p])[0],
                       p] = upper[preds == p]
            df_sta.loc[df_sta.loc[:, p] <= (lower[preds == p])[0],
                       p] = lower[preds == p]
    elif event == "is_conv_aws":
        upper = np.percentile(df_aws[preds], 99.9, axis=0)
        lower = np.percentile(df_aws[preds], 0.1, axis=0)
        for p in preds:
            df_aws.loc[df_aws.loc[:, p] >= (upper[preds == p])[0],
                       p] = upper[preds == p]
            df_aws.loc[df_aws.loc[:, p] <= (lower[preds == p])[0],
                       p] = lower[preds == p]

    scaler = MinMaxScaler()
    logit = LogisticRegression(class_weight="balanced",
                               solver="liblinear",
                               max_iter=1000)

    if cv:
        rfecv = RFECV(estimator=logit, step=1, verbose=10,\
         scoring="roc_auc",n_jobs=-1,cv=5)
        if event == "is_sta":
            rfecv = rfecv.fit(scaler.fit_transform(df_sta[preds]),
                              df_sta[event])
        elif event == "is_conv_aws":
            rfecv = rfecv.fit(scaler.fit_transform(df_aws[preds]),
                              df_aws[event])
        pd.DataFrame({"preds":preds,"ranking":rfecv.ranking_, "support":rfecv.support_}).\
         to_csv("/g/data/eg3/ab4502/ExtremeWind/points/logit_preds_ranking_cv_"+\
         model+"_"+event+".pkl", index=False)
    else:
        rfe = RFE(estimator=logit, step=1, verbose=10,\
         n_features_to_select=1)
        if event == "is_sta":
            rfe = rfe.fit(scaler.fit_transform(df_sta[preds]), df_sta[event])
        elif event == "is_conv_aws":
            rfe = rfe.fit(scaler.fit_transform(df_aws[preds]), df_aws[event])
        pd.DataFrame({"preds":preds,"ranking":rfe.ranking_, "support":rfe.support_}).\
         to_csv("/g/data/eg3/ab4502/ExtremeWind/points/logit_preds_ranking_"+\
         model+"_"+event+".pkl", index=False)
Example #12
        out.append(hits / (hits + misses))
        hits_out.append(hits)
        misses_out.append(misses)
        fa_out.append(fa)
        cn_out.append(cn)
    return np.array(out), pd.DataFrame({"pod":out}, index=v_list), pd.DataFrame({"hits":hits_out, "misses":misses_out, "false_alarms":fa_out,\
         "correct_negatives":cn_out}, index=v_list)


if __name__ == "__main__":

    #Settings
    N = 1000  #Bootstrap

    #Load HSS and thresholds for ERA5
    hss, df_aws, df_sta = optimise_pss("/g/data/eg3/ab4502/ExtremeWind/points/era5_allvars_v3_2005_2018.pkl",\
        T=1000, compute=True, l_thresh=2, is_pss="hss", model_name="era5_v5", time="floor")
    df_aws["logit"] = 1 / (
        1 +
        np.exp(-(df_aws["ebwd"] * 6.1e-2 + df_aws["Umean800_600"] * 1.5e-1 +
                 df_aws["lr13"] * 9.4e-1 + df_aws["rhmin13"] * 3.9e-2 +
                 df_aws["srhe_left"] * 1.7e-2 + df_aws["q_melting"] * 3.8e-1 +
                 df_aws["eff_lcl"] * 4.7e-4 - 1.3e+1)))
    df_aws["logit_sta"] = 1 / (
        1 +
        np.exp(-(df_aws["ebwd"] * 1.3e-1 + df_aws["Umean06"] * 1.7e-1 +
                 df_aws["ml_cape"] * 1.6e-3 + df_aws["lr13"] * 4.1e-1 - 5.6)))

    #Calculate optimal HSS for each logistic regression model, and add to the HSS dataframe for all other diagnostics
    #Measured model
    pool = multiprocessing.Pool()
    temp_df = df_aws.loc[:, ["is_conv_aws", "logit"]]
Example #13
    p_list = ["dcp", "mucape*s06", "mlcape*s06", "t_totals", "mu_cape"]

    #Load daily max obs
    l_thresh = 2
    df = pd.read_pickle("/g/data/eg3/ab4502/ExtremeWind/obs/aws/"+\
         "convective_wind_gust_aus_2005_2018.pkl")
    df = df[df.tc_affected == 0]
    df.loc[:, "is_conv_aws"] = np.where((df.wind_gust >= 25) & (df.lightning >= l_thresh) &\
         (df.tc_affected==0), 1, 0)
    df.loc[:, "is_sta"] = np.where((df.is_sta == 1) & (df.tc_affected == 0), 1,
                                   0)

    #Load reanalysis diagnostics, fit logistic regression to daily data, and apply to daily and
    # hourly diagnostics
    pss_df, mod_aws, mod_sta = optimise_pss("/g/data/eg3/ab4502/ExtremeWind/"+\
         "points/era5_allvars_v3_2005_2018.pkl",\
         T=1000, compute=False, l_thresh=2, is_pss="hss", model_name="era5_v5")
    mod_hourly = pd.read_pickle("/g/data/eg3/ab4502/ExtremeWind/points/"+\
        "era5_allvars_v3_2005_2018.pkl").dropna()

    #Plot seasonal and diurnal cycle using hourly reanalysis data, overwriting the "diurnal_df"
    # and "monthly_df" dataframes
    df["aws_hour"] = pd.DatetimeIndex(df.gust_time_lt).round("H").hour
    df["sta_hour"] = pd.DatetimeIndex(utc_to_lt(\
        df.rename(columns={"sta_date":"time","stn_name":"loc_id"})).time).round("H").hour
    df["month"] = pd.DatetimeIndex(df.gust_time_lt).month
    aws_hr = pd.DataFrame(np.unique(df[df["is_conv_aws"]==1]["aws_hour"], \
         return_counts=True)).T.set_index(0).\
         rename(columns={1:"Measured"})
    sta_hr = pd.DataFrame(np.unique(df[df["is_sta"]==1]["sta_hour"], \
         return_counts=True)).T.set_index(0).\