Example No. 1
import os
import pickle
import struct

import coalition3.inout.readconfig as cfg
## The aliases 'rxr' (xarray/NetCDF file loader) and 'stat' (statistics
## module) are coalition3-internal; their exact import paths are not shown
## in this listing.

def wrapper_fun_add_aux_static_variables(pkl_path, ds=None):
    cfg_set_input, cfg_var, cfg_var_combi = cfg.get_config_info_op()
    if ds is None:
        file_path = os.path.join(pkl_path, "Combined_stat_pixcount.pkl")
        if not os.path.exists(file_path):
            file_path = os.path.join(pkl_path, "nc/Combined_stat_pixcount.nc")
        ## Print here, since file_path is only defined when the dataset is
        ## read from disk (avoids a NameError if 'ds' is passed in):
        print(" Adding auxiliary variables to xarray object in file:\n   %s" %
              file_path)
        ds = rxr.xarray_file_loader(file_path)
    cfg_set_input["verbose"] = True

    ## Add auxiliary variables:
    ds = stat.add_aux_static_variables(ds, cfg_set_input)

    ## Save NetCDF:
    print("  Start saving NetCDF file")
    file_new = os.path.join(pkl_path, "nc/Combined_stat_pixcount_aux.nc")
    ds.to_netcdf(file_new)
    ## Optionally with compression:
    #ds.to_netcdf(file_new, encoding={'zlib': True, 'complevel': 5}, compute=True)

    ## Save Pickle:
    file_new = os.path.join(pkl_path, "Combined_stat_pixcount_aux.pkl")
    try:
        print("  Start saving pickle file")
        with open(file_new, "wb") as output_file:
            pickle.dump(ds, output_file, protocol=-1)
    except struct.error:
        print("   *** File to large to be saved as pickle ***")
    return (ds)
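
A minimal usage sketch (not part of the original listing; the directory path is illustrative):

## Hypothetical call: read "Combined_stat_pixcount" from a working directory
## and write back the version with auxiliary static variables:
ds_aux = wrapper_fun_add_aux_static_variables("/path/to/stat_output")
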
Example No. 2
import os
import sys
import pickle

import pandas as pd
import xgboost as xgb
import matplotlib.pylab as plt

import coalition3.inout.paths as pth
import coalition3.inout.readconfig as cfg
import coalition3.statlearn.fitting as fit
import coalition3.statlearn.inputprep as ipt

## Uncomment when running on Mac OS:
#os.environ['KMP_DUPLICATE_LIB_OK']='True'
    
## ============================================================================
## Get config info:
cfg_tds = cfg.get_config_info_tds()
cfg_op, __, __ = cfg.get_config_info_op()

## Load training dataframe:
user_argv_path = sys.argv[1] if len(sys.argv)==2 else None
path_to_df = pth.file_path_reader("pandas training dataframe (nonnan)", user_argv_path)
print("\nLoading nonnan dataframe into RAM")
df_nonnan = pd.read_hdf(path_to_df, key="df_nonnan")

## Load list with models:
model_path_xgb = pth.file_path_reader("XGBoost model list")
model_path_mlp = pth.file_path_reader("MLP model list")
with open(model_path_xgb,"rb") as file: ls_models_xgb = pickle.load(file)
with open(model_path_mlp,"rb") as file: ls_models_mlp = pickle.load(file)

## Get prediction leadtime from model:
pred_dt = -1
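
The listing ends before pred_dt is derived from the models. As an aside, the imported xgboost and matplotlib modules can be used to inspect the loaded models; a minimal sketch, assuming ls_models_xgb holds fitted XGBoost models (the pickle layout is not shown in the listing):

## Assumption: ls_models_xgb is a list of fitted xgboost models.
model_0 = ls_models_xgb[0]
xgb.plot_importance(model_0, max_num_features=20)  # top 20 features by weight
plt.show()
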
Example No. 3
import datetime

import numpy as np

import coalition3.inout.paths as pth
import coalition3.inout.readconfig as cfg
## 'rccs' (CCS4 field reader) is a coalition3-internal module; its exact
## import path is not shown in this listing.

## Prompt until a valid time point is entered:
user_time_point = None
while user_time_point is None:
    try:
        user_time_point = datetime.datetime.strptime(
            raw_input("Please enter a time point [%Y%m%d%H%M]: "),
            "%Y%m%d%H%M")
    except ValueError:
        print("  Date is not in the right format, please repeat")
        user_time_point = None
print("\nGet TRT values at time point %s" % user_time_point)

## Initialise empty lists for results:
TRT_rank_ls = []
COAL3_rank_median_sel_ls = []
COAL3_rank_median_all_ls = []

## Get config info
cfg_set_tds = cfg.get_config_info_tds()
cfg_set_input, cfg_var, cfg_var_combi = cfg.get_config_info_op()

## Initialise fields (CCS4 meshgrid and VIL, EchoTop and MaxEcho observations):
ccs4_CH = np.meshgrid(
    np.arange(255000, 965000, 1000) + 500,
    np.arange(-160000, 480000, 1000) + 500)
ET45 = rccs.get_vararr_t(user_time_point, "EZC45", cfg_set_input)
CZC = rccs.get_vararr_t(user_time_point, "CZC", cfg_set_input)
LZC = rccs.get_vararr_t(user_time_point, "LZC", cfg_set_input)
RZC = rccs.get_vararr_t(user_time_point, "RZC", cfg_set_input)
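
A short sanity check (not in the original script) illustrates the grid geometry: the two arange calls above span 710 km x 640 km at 1 km resolution, and the +500 offset places the coordinates at the cell centres.

## np.meshgrid(x, y) returns arrays of shape (len(y), len(x)),
## here 640 rows (south-north) x 710 columns (west-east):
assert ccs4_CH[0].shape == (640, 710)
assert ccs4_CH[1].shape == (640, 710)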

## Get TRT file:
filename = pth.path_creator(user_time_point, "TRT", "TRT", cfg_set_input)[0]
if len(filename) == 0: raise IOError("No TRT file found")
elif len(filename) > 1: raise IOError("More than one TRT file found")
file = open(filename[0], "r")
Example No. 4
from __future__ import print_function

import os
import datetime

import numpy as np
import pandas as pd

import coalition3.inout.readconfig as cfg
## 'path' (file path creation) and 'swisstrt' (TRT file reader) are
## coalition3-internal modules; their exact import paths are not shown in
## this listing.

def get_TRT_cell_info(dt_sampling_list,
                      cfg_set_tds,
                      cfg_set_input=None,
                      len_ini_df=None):
    """Get information on TRT cells within time period.
    
    Parameters:
    -----------
    
    len_ini_df : uint
        Length of initial dataframe (to setup the dataframe, if number of TRT cells
        exceeds this initial length, additional lines are appended, if there are fewer,
        the exceeding lines are deleted.
    """
    print("Estimate number of samples within training period")

    ## Get input data config file
    if cfg_set_input is None:
        cfg_set_input, cfg_var = cfg.get_config_info_op()

    ## Create empty DataFrame
    if len_ini_df is None: len_ini_df = len(dt_sampling_list) * 3
    ## Old:
    #df_cols = ["traj_ID","date","RANKr","area","lat","lon","iCH","jCH"]
    #samples_df = pd.DataFrame(np.zeros((len_ini_df,len(df_cols)))*np.nan,
    #                          columns=df_cols)
    ## New:
    #samples_df = Nip.df_empty(["traj_ID"]+cfg_set_input["TRT_cols"],[np.object]+cfg_set_input["TRT_dtype"])
    samples_ls = []

    #ind_df = 0; first_append = True; doy_temp = -1

    ## Loop over time steps to gather information on TRT cells at specific time step:
    t_start = datetime.datetime.now()
    t_exp = "(calculating)"
    for counter, sampling_time in enumerate(dt_sampling_list):
        perc_checked = np.round(
            (sampling_time.hour * 60 + sampling_time.minute) / 1440., 2)
        if counter % 100 == 0 and counter > 10 and perc_checked > 0:
            ## Extrapolate expected finishing time from progress so far
            ## (the perc_checked > 0 guard avoids a ZeroDivisionError):
            t_exp = (datetime.datetime.now() + \
                     (datetime.datetime.now() - t_start)*int((1-perc_checked)/perc_checked)).strftime("%d.%m.%Y %H:%M")

        print("  Check input data availability of date: %s - %3d%% (expected finishing time: %s) " % \
            (sampling_time.strftime("%d.%m.%Y"),100*perc_checked,t_exp), end='\r')

        ## Update time in config dict:
        cfg_set_input["t0"] = sampling_time
        t0 = cfg_set_input["t0"]
        cfg_set_input["t0_doy"] = t0.timetuple().tm_yday
        cfg_set_input["t0_str"] = t0.strftime("%Y%m%d%H%M")
        #if cfg_set_input["t0_doy"]%10==0 and cfg_set_input["t0_doy"]!=doy_temp:
        #    print("   For doy: %s" % cfg_set_input["t0_doy"])
        #    doy_temp = cfg_set_input["t0_doy"]

        ## Get file path to respective TRT file of time point sampling_time:
        filepaths, timestamps = path.path_creator(sampling_time, "TRT", "TRT",
                                                  cfg_set_input)

        ## In case file is not available, look for files just right before and after this timepoint
        ## (e.g. if no file available at 16:35, look at 16:25/16:30/16:40/16:45), otherwise skip this time point.
        if filepaths[0] is None:
            for dt_daily_shift_fac in [-1, 1, -2, 2]:
                sampling_time_temp = sampling_time + dt_daily_shift_fac * datetime.timedelta(
                    minutes=cfg_set_tds["dt_daily_shift"])
                filepaths_temp, timestamps = path.path_creator(
                    sampling_time_temp, "TRT", "TRT", cfg_set_input)
                if filepaths_temp[0] is not None:
                    filepaths = filepaths_temp
                    print("       Instead using dataset: %s" % filepaths[0])
                    break
        if filepaths[0] is None:
            print("       No files found, skip this timepoint")
            continue

        ## Read in TRT-info:
        traj_IDs, TRTcells, cell_mask = swisstrt.readRdt(filepaths[0])
        for traj_ID in traj_IDs:
            ## New:
            ## Keep only plain (non-dunder, non-callable) attributes of the
            ## TRT cell object; note the check must be on 'value', since a
            ## string key is never callable:
            dict_cellinfo = {
                key: value
                for key, value in TRTcells[traj_ID].__dict__.items()
                if not key.startswith('__') and not callable(value)
            }
            #cell_info_df  = pd.DataFrame.from_records([dict_cellinfo], index=[9])
            #samples_df_append = pd.DataFrame([[traj_ID]],columns=["traj_ID"],index=[9]).join(pd.DataFrame.from_records([dict_cellinfo],index=[9]))
            #samples_df = samples_df.append(samples_df_append, ignore_index=True, sort=True)
            samples_ls.append(
                pd.DataFrame([[traj_ID]], columns=["traj_ID"], index=[9]).join(
                    pd.DataFrame.from_records([dict_cellinfo], index=[9])))
            ## Old:
            """
            cell = TRTcells[traj_ID]
            cell_date = datetime.datetime.strptime(cell.date,"%Y%m%d%H%M")
            if ind_df <= len_ini_df-1:
                samples_df.iloc[ind_df,:] = [traj_ID,cell_date,cell.RANKr,cell.area,
                                             cell.lat,cell.lon,int(cell.iCH),int(cell.jCH)]
            else:            
                if first_append: print("   *** Start appending to dataframe at t = %s ***" % sampling_time)
                first_append = False
                samples_df = samples_df.append(pd.DataFrame([[traj_ID,cell_date,cell.RANKr,cell.area,
                                                              cell.lat,cell.lon,int(cell.iCH),int(cell.jCH)]],
                                               columns=["traj_ID","date","RANKr","area","lat","lon","iCH","jCH"]))
            ind_df += 1
            """

    samples_df = pd.concat(samples_ls)

    ## Only keep non-nan lines (where there are TRT cells):
    #print("   Length of dataframe before dropping of nans: %s" % samples_df.shape[0])
    #print("   Index of dataframe after filling: %s" % ind_df)
    samples_df = samples_df.dropna()
    print("   Length of dataframe after dropping of nans: %s" %
          samples_df.shape[0])
    print("   Number of different TRT cells: %s\n" %
          len(np.unique(samples_df["traj_ID"])))
    samples_df.info()  # .info() prints directly and returns None
    print(samples_df, "\n")
    pkl_path = os.path.join(cfg_set_tds["root_path_tds"],
                            u"Training_Dataset_Sampling.pkl")
    samples_df.to_pickle(pkl_path)
    print("   Dataframe saved in: %s" % pkl_path)
    return (samples_df)
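
A minimal usage sketch (not part of the original listing; the date range and the 5-minute sampling step are assumptions for illustration):

## Hypothetical call: gather TRT cell info for one day in 5-minute steps.
cfg_set_tds = cfg.get_config_info_tds()
t_start = datetime.datetime(2018, 7, 1, 0, 0)
dt_sampling_list = [t_start + datetime.timedelta(minutes=5 * i)
                    for i in range(288)]
samples_df = get_TRT_cell_info(dt_sampling_list, cfg_set_tds)
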
Example No. 5
from __future__ import print_function

import os

import numpy as np
import pandas as pd

import coalition3.inout.readconfig as cfg
## 'da2df' (3D DataArray -> 2D DataFrame) is a coalition3-internal helper;
## a sketch of its likely shape follows after this listing.

def convert_ds2df(ds, outpath, diff_option=None):
    ## Get time delta values (also just pos/neg ones):
    time_del = ds.time_delta.values
    neg_del = time_del[time_del < 0]
    neg0_del = time_del[time_del <= 0]
    pos_del = time_del[time_del > 0]
    pos0_del = time_del[time_del >= 0]

    ## Check for variables which cannot be converted yet (e.g. categorical variables):
    unconvertable_vars = [
        var for var in ds.data_vars if "CMA" in var or "CT" in var
    ]
    if len(unconvertable_vars) > 1:  # and unconvertable_vars[0]!='TOPO_ASPECT_stat':
        raise NotImplementedError("Categorical counting not yet implemented")

    ## Extract future TRT Ranks (target variable) and calculate
    ## Rank difference to t0.
    print("  Extract future TRT Ranks and pixel counts (treated seperately)")
    ds_TRTrank_val = ds[
        "TRT_Rank"]  #.where(ds["time_delta"]>0, drop=True).rename("TRT_Rank")
    ds_TRTrank_diff = ds_TRTrank_val - ds["TRT_Rank"].sel(
        time_delta=0)  #ds_TRTrank_val.sel(time_delta=slice(5,45)) - \
    ds_TRTrank_diff = ds_TRTrank_diff.rename("TRT_Rank_diff")

    ## Extract pixel counts of Radar variables with "nonmin" statistics:
    ds_pixc_radar = ds[[
        var[:-12] + u"_pixc" for var in ds.data_vars if "nonmin" in var
    ]]
    ds_pixc_radar = ds_pixc_radar.sel(
        pixel_count="PC_NONMIN").drop("pixel_count").where(
            ds_pixc_radar["time_delta"] <= 0, drop=True).astype(np.int16)

    ## Delete unwanted or already extracted (see above) variables
    ## (e.g. pixel counts, TRT_Ranks):
    drop_list = [var for var in ds.data_vars if "_pixc" in var]
    drop_list += [
        u"TRT_Rank", u"TRT_Rank_diff", u"TRT_domain_indices", u"pixel_count",
        u"TRT_cellcentre_indices", "date"
    ]
    ds_drop = ds.drop(drop_list)

    ## Extract TRT variables (CG, Dvel_x, ..) and solar time:
    print("  Extract 1D variables (TRT vars and solar time)")
    ds_1d = ds_drop[[
        var for var in ds_drop.data_vars if len(ds_drop[var].shape) < 2
    ]]

    ## Delete future values (time_delta > 0) and calculate absolute difference
    ## between statistics at t0 and time_delta < 0. Also, set NaN-values in
    ## "_nonmin" statistics to min_value:
    print("  Extract 2D variables (with 'time_delta' coordinate)")
    ds_23d = ds_drop[[
        var for var in ds_drop.data_vars if len(ds_drop[var].shape) >= 2
    ]]
    del (ds_drop)

    ## Choose between keeping absolute values, differences to t0, or
    ## differences between consecutive time steps:
    print_text = """
        \nHow should variables be treated over time:
          Option 1 -> Keep the absolute values of the statistics [path addon 'nodiff']
                      (e.g. MaxRZC(t0-45min), MaxRZC(t0-40min), .. , MaxRZC(t0))
          Option 2 -> Take the difference to the statistic at t0 and keep absolute value at t0 [path addon 't0diff']
                      (e.g. MaxRZC(t0)-MaxRZC(t-45min), MaxRZC(t0)-MaxRZC(t-40min), .. , MaxRZC(t0))
          Option 3 -> Take the difference between consecutive time steps (and keep absolute value at t0) [path addon 'dtdiff']
                      (e.g. MaxRZC(t0-40min)-MaxRZC(t0-45min), MaxRZC(t0-35min)-MaxRZC(t0-40min), .. , MaxRZC(t0))
    """
    if diff_option is None:
        print(print_text)
    while (diff_option != "1" and diff_option != "2" and diff_option != "3"):
        diff_option = str(raw_input("Which option do you choose? [1/2/3] "))

    ## Delete "future" values:
    print(
        "     Take difference to t0 value / set NaN to min_value in '_nonmin' statistics (TIME CONSUMING)"
    )
    ds_past = ds_23d.where(ds_23d["time_delta"] <= 0, drop=True)
    del (ds_23d)

    ## Take the difference:
    cfg_set, cfg_var, cfg_var_combi = cfg.get_config_info_op()
    for var in ds_past.data_vars:
        if diff_option == "2":
            if len(ds_past[var].sel(time_delta=0).values.shape) == 1:
                ## Special case for variable 'CZC_lt57dBZ'
                sub_val = ds_past[var].sel(time_delta=slice(
                    neg_del[0], neg_del[-1])).values - ds_past[var].sel(
                        time_delta=0).values[:, np.newaxis]
                #ds_past[var].values = np.concatenate([sub_val,ds_past[var].sel(time_delta=0).values[:,np.newaxis]],axis=1)
                ds_past[var].values = np.concatenate([
                    ds_past[var].sel(time_delta=0).values[:, np.newaxis],
                    sub_val
                ],
                                                     axis=1)
            else:
                sub_val = ds_past[var].sel(time_delta=slice(
                    neg_del[0], neg_del[-1])).values - ds_past[var].sel(
                        time_delta=0).values[:, np.newaxis, :]
                #ds_past[var].values = np.concatenate([sub_val,ds_past[var].sel(time_delta=0).values[:,np.newaxis,:]],axis=1)
                ds_past[var].values = np.concatenate([
                    ds_past[var].sel(time_delta=0).values[:, np.newaxis, :],
                    sub_val
                ],
                                                     axis=1)

        elif diff_option == "3":
            sub_val = ds_past[var].sel(
                time_delta=slice(neg_del[1], 0)).values - ds_past[var].sel(
                    time_delta=slice(neg_del[0], neg_del[-1])).values
            if len(ds_past[var].sel(time_delta=0).values.shape) == 1:
                ## Special case for variable 'CZC_lt57dBZ'
                ds_past[var].values = np.concatenate([
                    sub_val, ds_past[var].sel(time_delta=0).values[:,
                                                                   np.newaxis]
                ],
                                                     axis=1)
            else:
                ds_past[var].values = np.concatenate([
                    sub_val,
                    ds_past[var].sel(time_delta=0).values[:, np.newaxis, :]
                ],
                                                     axis=1)

        ## Set NaN-values in "_nonmin" statistics to min_value:
        if "_nonmin" in var:
            ds_past[var].values[np.isnan(ds_past[var].values)] = \
                cfg_set["minval_dict"][var[:-12]]

    ## Convert 3d dataarrays (xarray) to 2d dataframes (pandas) - TIME CONSUMING!
    print("  Converting 3D variables to dataframe (TIME CONSUMING)")
    df_list_3d = [
        da2df(ds_past[da], ds_past.data_vars) for da in ds_past.data_vars
        if len(ds_past[da].shape) == 3
    ]
    #df_list.compute()
    df_3d = pd.concat(
        df_list_3d,
        axis=1,
        copy=False,
        keys=[da for da in ds_past.data_vars if len(ds_past[da].shape) == 3])
    del (df_list_3d)

    ## Flatten the 3-level column MultiIndex into 'Variable|time_delta|statistic' strings:
    df_3d.columns.set_levels(df_3d.columns.levels[2].values.astype(np.unicode),
                             level=2,
                             inplace=True)
    df_3d.columns.rename("Variable", level=0, inplace=True)
    df_3d.columns = df_3d.columns.map('{0[0]}|{0[1]}|{0[2]}'.format)
    df_3d.index = df_3d.index.astype(np.unicode)
    #df_3d.to_hdf("df_23km_nd.h5",key="df_3d",mode="w",complevel=0)

    ## Convert 2d dataarrays (xarray) to 2d dataframes (pandas)
    print("  Converting 2D variables to dataframe")
    df_list_2d = [
        ds_past[u'CZC_lt57dBZ'].sel(
            time_delta=deltime).drop("time_delta").to_dataframe()
        for deltime in neg0_del
    ]
    df_list_colnames = [
        u'CZC_lt57dBZ|%i|SUM' % deltime for deltime in neg0_del
    ]
    for var in ds_pixc_radar.data_vars:
        df_list_2d += [
            ds_pixc_radar[var].sel(
                time_delta=deltime).drop("time_delta").to_dataframe()
            for deltime in neg0_del
        ]
        df_list_colnames += [
            u'%s_NONMIN|%i|SUM' % (var, deltime) for deltime in neg0_del
        ]
    df_2d = pd.concat(df_list_2d, axis=1, copy=False)
    df_2d.columns = df_list_colnames
    df_2d = df_2d.astype(np.int16)
    del (df_list_2d, df_list_colnames, ds_past, ds_pixc_radar)
    #df_2d.to_hdf("df_23km_nd.h5",key="df_2d",mode="a",complevel=0)

    df_list_TRT_val = [
        ds_TRTrank_val.sel(
            time_delta=deltime).drop("time_delta").to_dataframe()
        for deltime in time_del
    ]
    df_TRT_val = pd.concat(df_list_TRT_val, axis=1, copy=False)
    df_TRT_val.columns = [u'TRT_Rank|%i' % deltime for deltime in time_del]
    df_list_TRT_diff = [
        ds_TRTrank_diff.sel(
            time_delta=deltime).drop("time_delta").to_dataframe()
        for deltime in time_del
    ]
    df_TRT_diff = pd.concat(df_list_TRT_diff, axis=1, copy=False)
    df_TRT_diff.columns = [
        u'TRT_Rank_diff|%i' % deltime for deltime in time_del
    ]
    del (df_list_TRT_val, df_list_TRT_diff)
    #df_TRT.to_hdf("df_23km_nd.h5",key="df_TRT",mode="a",complevel=0)

    ## Convert 1d dataarrays (xarray) to 2d dataframes (pandas)
    print("  Converting 1D variables to dataframe")
    df_1d = ds_1d.to_dataframe()
    #df_1d.to_hdf("df_23km_nd.h5",key="df_1d",mode="a",complevel=0)

    ## Concatenate 3d/2d/1d dataframes and save to disk:
    print("  Concatenate into one big dataframe and save to disk")
    df = pd.concat([df_1d, df_2d, df_3d, df_TRT_val, df_TRT_diff],
                   axis=1,
                   copy=False)
    del (df_1d, df_2d, df_3d, df_TRT_val, df_TRT_diff)
    if diff_option == "1":
        path_addon = "nodiff"
    elif diff_option == "2":
        path_addon = "t0diff"
    elif diff_option == "3":
        path_addon = "dtdiff"
    outpath = "%s_%s.h5" % (os.path.splitext(outpath)[0], path_addon)
    df.to_hdf(outpath, key="df", mode="w", complevel=0)
    print("    Saving successful to file:\n      %s" % outpath)