Example #1
 def load_files(self, path_hashfile: str = None, hash_str: str = None):
     #%%
     if hash_str is None:
         hash_str = '{}_a{}_{}_{}'.format(self._name, self.alpha,
                                          self.distance_eps,
                                          self.min_area_in_degrees2)
     if path_hashfile is None:
         path_hashfile = functions_pp.get_download_path()
     f_name = None
     for root, dirs, files in os.walk(path_hashfile):
         for file in files:
             if re.findall(f'{hash_str}', file):
                 print(f'Found file {file}')
                 f_name = file
     if f_name is not None:
         filepath = os.path.join(path_hashfile, f_name)
         self.ds = core_pp.import_ds_lazy(filepath)
         self.corr_xr = self.ds['corr_xr']
         self.alpha = self.corr_xr.attrs['alpha']
         self.FDR_control = bool(self.corr_xr.attrs['FDR_control'])
         self.precur_arr = self.ds['precur_arr']
         # self._tfreq = self.precur_arr.attrs['_tfreq']
         if 'prec_labels' in self.ds.variables.keys():
             self.prec_labels = self.ds['prec_labels']
             self.distance_eps = self.prec_labels.attrs['distance_eps']
             self.min_area_in_degrees2 = self.prec_labels.attrs[
                 'min_area_in_degrees2']
             self.group_lag = bool(self.prec_labels.attrs['group_lag'])
             self.group_split = bool(self.prec_labels.attrs['group_split'])
         loaded = True
     else:
         print('No file that matches the hash_str or instance settings in '
               f'folder {path_hashfile}')
         loaded = False
     return loaded
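A minimal usage sketch for the method above; `precur` stands for an instance of whichever precursor class defines load_files, and the folder path is a placeholder:

# Hypothetical usage: try to reload a previously stored precursor dataset.
loaded = precur.load_files(path_hashfile='/path/to/hash_files', hash_str=None)
if loaded:
    print(precur.corr_xr.dims, precur.alpha)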
Example #2
def ds_oos_lindetrend(dsclust, df_splits, path):

    kwrgs_NaN_handling = {
        'missing_data_ts_to_nan': False,
        'extra_NaN_limit': False,
        'inter_method': False,
        'final_NaN_to_clim': False
    }
    years = list(range(1950, 2020))
    selbox = [253, 290, 28, 52]
    ds_raw = core_pp.import_ds_lazy(
        raw_filename,
        var='variable',
        selbox=selbox,
        kwrgs_NaN_handling=kwrgs_NaN_handling).rename({'z': 'time'})
    ds_raw.name = 'Soy_Yield'
    ds_raw['time'] = pd.to_datetime(
        [f'{y+1949}-01-01' for y in ds_raw.time.values])
    ds_raw = ds_raw.sel(time=core_pp.get_oneyr(ds_raw, *years))

    label = int(target_dataset.split('__')[-1])
    clusmask = dsclust['xrclustered'] == label
    ds_raw = ds_raw.where(clusmask)
    ds_out = utils_paper3.detrend_oos_3d(ds_raw,
                                         min_length=30,
                                         df_splits=df_splits,
                                         standardize=True,
                                         path=path)
    return ds_out
Example #3
def spatial_mean_clusters(var_filename, xrclust, kwrgs_load: dict = {}):
    #%%
    if type(var_filename) is str:
        xarray = core_pp.import_ds_lazy(var_filename, **kwrgs_load)
    elif type(var_filename) is xr.DataArray:
        xarray = var_filename
    else:
        raise TypeError('Give var_filename as str or xr.DataArray')

    labels = xrclust.values
    nparray = xarray.values
    track_names = []
    area_grid = find_precursors.get_area(xarray)
    regions_for_ts = list(np.unique(labels[~np.isnan(labels)]))
    a_wghts = area_grid / area_grid.mean()

    # this array will be the time series for each feature
    ts_clusters = np.zeros((xarray.shape[0], len(regions_for_ts)))

    # calculate area-weighted mean over labels
    for r in regions_for_ts:
        track_names.append(int(r))
        idx = regions_for_ts.index(r)
        # start with empty lonlat array
        B = np.zeros(xrclust.shape)
        # Mask everything except region of interest
        B[labels == r] = 1
        # Calculates how values inside region vary over time
        ts_clusters[:, idx] = np.nanmean(nparray[:, B == 1] * a_wghts[B == 1],
                                         axis=1)
    xrts = xr.DataArray(ts_clusters.T,
                        coords={
                            'cluster': track_names,
                            'time': xarray.time
                        },
                        dims=['cluster', 'time'])

    # extract selected setting for ts
    dims = list(xrclust.coords.keys())
    standard_dim = ['latitude', 'longitude', 'time', 'mask', 'cluster']
    dims = [d for d in dims if d not in standard_dim]
    if 'n_clusters' in dims:
        idx = dims.index('n_clusters')
        dims[idx] = 'ncl'
        xrclust = xrclust.rename({'n_clusters': dims[idx]}).copy()
    var1 = str(xrclust[dims[0]])
    dim1 = dims[0]
    xrts.attrs[dim1] = var1
    xrclust.attrs[dim1] = var1
    xrclust = xrclust.drop(dim1)
    if len(dims) == 2:
        var2 = int(xrclust[dims[1]])
        dim2 = dims[1]
        xrts.attrs[dim2] = var2
        xrclust.attrs[dim2] = var2
        xrclust = xrclust.drop(dim2)
    ds = xr.Dataset({'xrclustered': xrclust, 'ts': xrts})
    #%%
    return ds
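A short usage sketch, assuming `xrclustered` holds the labels produced by one of the clustering functions in this collection; the NetCDF path and selbox are placeholders:

# Hypothetical usage: area-weighted mean time series per cluster label.
ds_clusters = spatial_mean_clusters(
    '/path/to/preprocessed_field.nc', xrclustered,
    kwrgs_load={'selbox': (230, 300, 25, 50)})
ts = ds_clusters['ts']  # dims: (cluster, time)
print(ts.sel(cluster=1).values[:5])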
Example #4
def get_spatial_ma(var_filename, mask=None, kwrgs_l_spatial: dict = {}):
    '''
    var_filename must be a 3-d netcdf file containing only one variable.
    mask can be a netcdf file containing only a mask, or a latlon box in the
    format [west_lon, east_lon, south_lat, north_lat] in common west-east degrees.
    Built upon sklearn clustering. Available techniques are listed in
    sklearn.cluster.__dict__, e.g. KMeans or AgglomerativeClustering;
    kwrgs are technique dependent, see the sklearn docs.
    '''
    if mask is None:
        xarray = core_pp.import_ds_lazy(var_filename, **kwrgs_l_spatial)
        lons = xarray.longitude.values
        lats = xarray.latitude.values
        mask = [min(lons), max(lons), min(lats), max(lats)]
        print(f'no mask given, entire array of box {mask} will be clustered')
    if type(mask) is str:
        xrmask = core_pp.import_ds_lazy(mask, **kwrgs_l_spatial)
        if xrmask.attrs['is_DataArray'] == False:
            variables = list(xrmask.variables.keys())
            strvars = [' {} '.format(var) for var in variables]
            common_fields = ' time time_bnds longitude latitude lev lon lat level '
            var = [var for var in strvars if var not in common_fields]
            if len(var) != 0:
                var = var[0].replace(' ', '')
                npmask = xrmask[var].values
        else:
            npmask = xrmask.values
    elif type(mask) is list or type(mask) is tuple:
        xarray = core_pp.import_ds_lazy(var_filename, **kwrgs_l_spatial)
        selregion = core_pp.import_ds_lazy(var_filename, selbox=mask)
        lons_mask = list(selregion.longitude.values)
        lon_mask = [
            True if l in lons_mask else False for l in xarray.longitude
        ]
        lats_mask = list(selregion.latitude.values)
        lat_mask = [True if l in lats_mask else False for l in xarray.latitude]
        npmasklon = np.meshgrid(lon_mask, lat_mask)[0]
        npmasklat = np.meshgrid(lon_mask, lat_mask)[1]
        npmask = np.logical_and(npmasklon, npmasklat)
    elif type(mask) is type(xr.DataArray([0])):
        # lo_min = float(mask.longitude.min()); lo_max = float(mask.longitude.max())
        # la_min = float(mask.latitude.min()); la_max = float(mask.latitude.max())
        # selbox = (lo_min, lo_max, la_min, la_max)
        # selregion = core_pp.import_ds_lazy(var_filename, selbox=selbox)
        # selregion = selregion.where(mask)
        npmask = mask.values
    return npmask
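The three accepted mask types all reduce to the same boolean lat-lon array; a sketch with placeholder inputs:

# 1. latlon box (west_lon, east_lon, south_lat, north_lat)
npmask_box = get_spatial_ma('/path/to/field.nc', mask=[230, 300, 25, 50])
# 2. netcdf file that holds only the mask variable
npmask_nc = get_spatial_ma('/path/to/field.nc', mask='/path/to/mask.nc')
# 3. an xr.DataArray already in memory (e.g. a country mask)
# npmask_xr = get_spatial_ma('/path/to/field.nc', mask=xr_mask)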
Example #5
def get_spatial_ma(var_filename: str = None, mask=None, kwrgs_l_spatial: dict = {}):
    '''
    var_filename must be a 3-d netcdf file containing only one variable.
    mask can be a netcdf file containing only a mask, or a latlon box in the
    format [west_lon, east_lon, south_lat, north_lat] in common west-east degrees.
    '''
    if mask is None:
        xarray = core_pp.import_ds_lazy(var_filename, **kwrgs_l_spatial)
        lons = xarray.longitude.values
        lats = xarray.latitude.values
        mask = [min(lons), max(lons), min(lats), max(lats)]
        print(f'Loaded array with coordinates {mask}')
    if type(mask) is str:
        xrmask = core_pp.import_ds_lazy(mask, **kwrgs_l_spatial)
        if xrmask.attrs['is_DataArray'] == False:
            variables = list(xrmask.variables.keys())
            strvars = [' {} '.format(var) for var in variables]
            common_fields = ' time time_bnds longitude latitude lev lon lat level '
            var = [var for var in strvars if var not in common_fields]
            if len(var) != 0:
                var = var[0].replace(' ', '')
                npmask = xrmask[var].values
        else:
            npmask = xrmask.values
    # creates a subdomain within a larger domain
    elif type(mask) is list or type(mask) is tuple:
        xarray = core_pp.import_ds_lazy(var_filename, **kwrgs_l_spatial)
        selregion = core_pp.import_ds_lazy(var_filename, selbox=mask)
        lons_mask = list(selregion.longitude.values)
        lon_mask = [
            True if l in lons_mask else False for l in xarray.longitude
        ]
        lats_mask = list(selregion.latitude.values)
        lat_mask = [True if l in lats_mask else False for l in xarray.latitude]
        npmasklon = np.meshgrid(lon_mask, lat_mask)[0]
        npmasklat = np.meshgrid(lon_mask, lat_mask)[1]
        npmask = np.logical_and(npmasklon, npmasklat)
    elif type(mask) is type(xr.DataArray([0])):
        # lo_min = float(mask.longitude.min()); lo_max = float(mask.longitude.max())
        # la_min = float(mask.latitude.min()); la_max = float(mask.latitude.max())
        # selbox = (lo_min, lo_max, la_min, la_max)
        # selregion = core_pp.import_ds_lazy(var_filename, selbox=selbox)
        # selregion = selregion.where(mask)
        npmask = mask.values
    return npmask
Example #6
def update_dates(cls, ex):
    import os
    file_path = os.path.join(cls.path_pp, cls.filename_pp)
    kwrgs_pp = {'selbox':ex['selbox'],
                'loadleap':False }
    ds = core_pp.import_ds_lazy(file_path, **kwrgs_pp)

    temporal_freq = pd.Timedelta((ds['time'][1] - ds['time'][0]).values)
    cls.dates = pd.to_datetime(ds['time'].values)
    cls.temporal_freq = '{}days'.format(temporal_freq.days)
    return cls, ex
Example #7
def ENSO_34(file_path, ex, df_splits=None):
    #%%
    #    file_path = '/Users/semvijverberg/surfdrive/Data_era5/input_raw/sst_1979-2018_1_12_daily_2.5deg.nc'
    '''
    See http://www.cgd.ucar.edu/staff/cdeser/docs/deser.sstvariability.annrevmarsci10.pdf    
    '''
    if df_splits is None:
        RV = ex[ex['RV_name']]
        df_splits, ex = functions_pp.rand_traintest_years(RV, ex)
        seldates = None
    else:
        seldates = df_splits.loc[0].index

    kwrgs_pp = {
        'selbox': {
            'la_min': -5,  # select domain in degrees east
            'la_max': 5,
            'lo_min': -170,
            'lo_max': -120
        },
        'seldates': seldates
    }

    ds = core_pp.import_ds_lazy(file_path, **kwrgs_pp)

    to_freq = ex['tfreq']
    if to_freq != 1:
        ds, dates = functions_pp.time_mean_bins(ds,
                                                ex,
                                                to_freq=to_freq,
                                                seldays='all')
        ds['time'] = dates

    dates = pd.to_datetime(ds.time.values)
    splits = df_splits.index.levels[0]

    list_splits = []
    for s in splits:

        progress = 100 * (s + 1) / splits.size
        print(f"\rProgress ENSO traintest set {progress}%)", end="")

        data = functions_pp.area_weighted(ds).mean(dim=('latitude',
                                                        'longitude'))

        list_splits.append(
            pd.DataFrame(data=data.values,
                         index=dates,
                         columns=['0_900_ENSO34']))

    df_ENSO = pd.concat(list_splits, axis=0, keys=splits)
    #%%
    return df_ENSO
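A usage sketch; `ex` only needs the temporal-aggregation key here (tfreq=1 keeps daily resolution so time_mean_bins is skipped) and `df_splits` is assumed to come from a train-test routine such as an RGCPD instance's df_splits; the SST path is a placeholder:

ex = {'tfreq': 1}  # keep daily resolution
df_ENSO = ENSO_34('/path/to/sst_daily.nc', ex, df_splits=rg.df_splits)
print(df_ENSO['0_900_ENSO34'].head())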
Example #8
def load_precursor(ex):
    #%%
    dates_all = ex['dates_all']
    # =============================================================================
    # Load Precursor
    # =============================================================================
    prec_filename = os.path.join(ex['path_pp'], ex['filename_precur'])
    #    if ex['datafolder'] == 'EC':
    #        try:
    #            datesRV = func_CPPA.make_datestr(dates_all, ex,
    #                            ex['startyear'], ex['endyear'], lpyr=False)
    #            dates_prec = subset_dates(datesRV, ex)
    ##            varfullgl = func_CPPA.import_ds_lazy(prec_filename, ex, seldates=dates_prec)
    #        except:
    #            datesRV = func_CPPA.make_datestr(dates_all, ex,
    #                                    ex['startyear'], ex['endyear'], lpyr=True)
    #            dates_prec = subset_dates(datesRV, ex)
    #            varfullgl = func_CPPA.import_ds_lazy(prec_filename, ex, seldates=dates_prec)
    #    else:
    Prec_reg = functions_pp.import_ds_timemeanbins(prec_filename,
                                                   ex['tfreq'],
                                                   loadleap=True,
                                                   to_xarr=False,
                                                   seldates=ex['dates_all'])
    Prec_reg = core_pp.convert_longitude(Prec_reg, 'only_east')
    if ex['add_lsm']:
        kwrgs_2d = {'selbox': ex['selbox'], 'format_lon': 'only_east'}
        lsm_filename = os.path.join(ex['path_mask'], ex['mask_file'])
        lsm = core_pp.import_ds_lazy(lsm_filename, **kwrgs_2d)

        Prec_reg['lsm'] = (('latitude', 'longitude'), (lsm < 0.3).values)
        Prec_reg = Prec_reg.where(Prec_reg['lsm'])

    if 'exclude_yrs' in ex.keys():
        if len(ex['exclude_yrs']) != 0:
            print('excluding yr(s): {} from analysis'.format(
                ex['exclude_yrs']))

            all_yrs = np.unique(dates_all.year)
            yrs_keep = [y for y in all_yrs if y not in ex['exclude_yrs']]
            idx_yrs = [
                i for i in np.arange(dates_all.year.size)
                if dates_all.year[i] in yrs_keep
            ]
            #        dates_all = dates_all[idx_yrs]
            mask_all = np.zeros(dates_all.size, dtype=bool)
            mask_all[idx_yrs] = True
            dates_excl_yrs = dates_all[mask_all]
            Prec_reg = Prec_reg.sel(time=dates_excl_yrs)

    #%%
    return Prec_reg, ex
Example #9
def check(rg, list_of_name_path, cluster_nr):

    import matplotlib.pyplot as plt
    import core_pp

    t2m_path = list_of_name_path[0][1]
    t2m = core_pp.import_ds_lazy(t2m_path, format_lon='west_east')
    t2m_clus = t2m.sel(cluster=cluster_nr)

    sst_path = list_of_name_path[1][1]
    sst = core_pp.import_ds_lazy(sst_path, format_lon='west_east')

    swvl12_path = list_of_name_path[2][1]
    swvl12 = core_pp.import_ds_lazy(swvl12_path, format_lon='west_east')

    #example time series plot for first cluster
    plt.figure()
    t2m_clus.ts.plot()

    #check plot for sst
    plt.figure()
    sst[0].plot()

    #check plot for swvl
    plt.figure()
    swvl12[0].plot()

    # Check plot of clusters
    # if TVpath contains the xr.DataArray that is clustered beforehand, we can have a look at the spatial regions.
    ds = rg.get_clust(format_lon='west_east')
    fig = plot_maps.plot_labels(ds['xrclustered'],
                                kwrgs_plot={
                                    'col_dim': 'n_clusters',
                                    'title': 'Hierarchical Clustering',
                                    'cbar_tick_dict': {
                                        'labelsize': 8
                                    },
                                    'add_cfeature': 'BORDERS'
                                })
Example #10
def dendogram_clustering(var_filename,
                         mask=None,
                         q=70,
                         clustermethodkey='AgglomerativeClustering',
                         kwrgs={'n_clusters': 3}):

    xarray = core_pp.import_ds_lazy(var_filename)
    npmask = get_spatial_ma(var_filename, mask)
    xarray = binary_occurences_quantile(xarray, q=q)
    xrclustered, results = skclustering(xarray,
                                        npmask,
                                        clustermethodkey=clustermethodkey,
                                        kwrgs=kwrgs)
    return xrclustered, results
Example #11
def check_pp_done(cls, ex):
    #%%
    '''
    Check if a pre-processed netcdf file already exists
    '''
    # =============================================================================
    # load dataset lazy
    # =============================================================================

    import pandas as pd
    filename = os.path.join(ex['path_raw'], cls.filename)
    kwrgs_pp = {'selbox':ex['selbox'], 'loadleap':False}
    ds = core_pp.import_ds_lazy(filename, **kwrgs_pp)
    dates = pd.to_datetime(ds['time'].values)

    # =============================================================================
    # get time series that you request
    # =============================================================================

#    dates = timeseries_tofit_bins(ds, ex, seldays='part')[1]

    start_day = get_oneyr(dates)[0]
    end_day   = get_oneyr(dates)[-1]

    # =============================================================================
    # give appropriate name to output file
    # =============================================================================
    outfilename = cls.filename[:-3]+'.nc'
#    outfilename = outfilename.replace('daily', 'dt-{}days'.format(1))
    months = dict( {1:'jan',2:'feb',3:'mar',4:'apr',5:'may',6:'jun',7:'jul',
                         8:'aug',9:'sep',10:'okt',11:'nov',12:'dec' } )

    if ex['input_freq'] == 'daily':
        startdatestr = '_{}{}_'.format(start_day.day, months[start_day.month])
        enddatestr   = '_{}{}_'.format(end_day.day, months[end_day.month])
    elif ex['input_freq'] == 'monthly':
        startdatestr = '_{}_'.format(months[start_day.month])
        enddatestr   = '_{}_'.format(months[end_day.month])

    outfilename = outfilename.replace('_{}_'.format(1), startdatestr)
    outfilename = outfilename.replace('_{}_'.format(12), enddatestr)
    cls.filename_pp = outfilename
    cls.path_pp = ex['path_pp']
    outfile = os.path.join(ex['path_pp'], outfilename)
    cls.dates_fit_tfreq = dates
    print('output file of pp will be saved as: \n' + outfile)
    #%%
    return outfile, cls, ex
Example #12
def regrid_array(xr_or_filestr, to_grid, periodic=False):
    import functions_pp

    if type(xr_or_filestr) == str:
        xarray = core_pp.import_ds_lazy(xr_or_filestr)
        plot_maps.plot_corr_maps(xarray[0])
        xr_regrid = functions_pp.regrid_xarray(xarray,
                                               to_grid,
                                               periodic=periodic)
        plot_maps.plot_corr_maps(xr_regrid[0])
    else:
        plot_maps.plot_labels(xr_or_filestr)
        xr_regrid = functions_pp.regrid_xarray(xr_or_filestr,
                                               to_grid,
                                               periodic=periodic)
        plot_maps.plot_labels(xr_regrid)
        plot_maps.plot_labels(xr_regrid.where(xr_regrid.values == 3))
    return xr_regrid
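A usage sketch; the target grid is assumed to be given in degrees (as implied by the regrid_xarray call above) and the file path is a placeholder:

# Hypothetical: regrid a fine-resolution file to a 1-degree grid and inspect it.
xr_1deg = regrid_array('/path/to/field_0.25deg.nc', to_grid=1, periodic=True)
print(xr_1deg.longitude.size, xr_1deg.latitude.size)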
Example #13
def import_ds_timemeanbins(file_path, ex, loadleap=False, to_xarr=True,
                           seldates=None):


    kwrgs_pp = {'selbox':ex['selbox'],
                'loadleap':loadleap,
                'seldates':seldates }

    ds = core_pp.import_ds_lazy(file_path, **kwrgs_pp)
    to_freq = ex['tfreq']
    if to_freq != 1:
        ds, dates = time_mean_bins(ds, ex, to_freq=to_freq, seldays='part')
        ds['time'] = dates
#    print('temporal frequency \'dt\' is: \n{}'.format(dates[1]- dates[0]))
    if to_xarr:
        if type(ds) == type(xr.DataArray(data=[0])):
            ds = ds.squeeze()
        else:
            ds = ds.to_array().squeeze()

    return ds
Example #14
#
## In[ ]:
#

#rg.list_precur_pp

var_filename = '/Users/semvijverberg/surfdrive/Data_EC/input_pp/tas_2000-2159_1jan_31dec_daily_1.125deg.nc'
LSM = '/Users/semvijverberg/surfdrive/Data_EC/input_raw/mask_North_America_1.125deg.nc'
#%%
import make_country_mask
selbox = (225, 300, 20, 70)
xarray, Country = make_country_mask.create_mask(var_filename,
                                                kwrgs_load={'selbox': selbox},
                                                level='Countries')
mask_US = xarray.values == Country.US
lsm = core_pp.import_ds_lazy(LSM, selbox=selbox)
mask_US = np.logical_and(mask_US, (lsm > .3).values)
xr_mask = xarray.where(mask_US)
xr_mask.values[mask_US] = 1
xr_mask = xrmask_by_latlon(xr_mask, lonmin=237)
xr_mask = xrmask_by_latlon(xr_mask, lonmin=238, latmin=39)
xr_mask = xrmask_by_latlon(xr_mask, lonmin=239, latmin=38)
xr_mask = xrmask_by_latlon(xr_mask, lonmin=240, latmin=36)
plot_maps.plot_labels(xr_mask)

# In[9]:
# =============================================================================
# Clustering co-occurrence of anomalies
# =============================================================================
q = [80, 85, 90, 95]
n_clusters = [2, 3, 4, 5, 6, 7, 8]
Example #15
#%%
import make_country_mask

# xarray, Country = make_country_mask.create_mask(var_filename, kwrgs_load={'selbox':selbox}, level='Countries')
# mask_US = (xarray.values == Country.US)
# mask_US = make_country_mask.binary_erosion(mask_US)
# mask_US = make_country_mask.binary_erosion(mask_US)
# mask_US = make_country_mask.binary_opening(mask_US)
# xr_mask = xarray.where(mask_US)
# xr_mask.values[mask_US]  = 1
# xr_mask = cl.mask_latlon(xr_mask, latmax=63, lonmax=270)

selbox = (232, 295, 25, 50)
xr_mask = core_pp.import_ds_lazy(
    user_dir +
    '/surfdrive/Scripts/rasterio/mask_North_America_0.25deg_orig.nc',
    var='lsm',
    selbox=selbox)
xr_mask.values = make_country_mask.binary_erosion(xr_mask.values)
plot_maps.plot_labels(xr_mask)

# In[9]:
# =============================================================================
# Clustering co-occurrence of anomalies
# =============================================================================
q = [80, 85, 90, 95]
n_clusters = [2, 3, 4, 5, 6, 7, 8]
tfreq = 1
from time import time
t0 = time()
xrclustered, results = cl.dendogram_clustering(var_filename,
Example #16
def plot_ss2(agg_level, skillscores, col_wrap, metric=None):
    #%%
    import find_precursors

    cluster_nc_path = get_list_of_name_path(agg_level, 1)[0][1]
    ds = core_pp.import_ds_lazy(cluster_nc_path, format_lon='west_east')
    cluster_labels_org = ds.coords['cluster']
    ds = ds['xrclustered']

    #create list of skill score names
    skillscores_multi_idx = skillscores.index.levels
    ss_list = []
    for i in skillscores_multi_idx[1:][0]:
        for j in skillscores_multi_idx[1:][1]:
            ss_name = '{}_{}'.format(i, j)
            ss_list.append(ss_name)

    if metric is not None:  #only apply single metric
        ss_list = [metric]

    #add dimensions and coordinates
    xr_score = ds.copy()
    xr_score.attrs = {}
    list_xr = [xr_score.copy().expand_dims('metric', axis=0) for m in ss_list]
    xr_score = xr.concat(list_xr, dim='metric')
    xr_score['metric'] = ('metric', ss_list)
    list_xr = [
        xr_score.copy().expand_dims('target_month', axis=0)
        for m in skillscores.columns
    ]
    xr_score = xr.concat(list_xr, dim='target_month')
    xr_score['target_month'] = ('target_month', skillscores.columns)

    #replace labels with skillscores
    for metric_nr, metric in enumerate(xr_score.metric.values):
        test_or_train = metric[:metric.find("_")]
        ss = metric[metric.find("_") + 1:]
        for month_nr, month in enumerate(xr_score.target_month.values):
            #slice over metric, month in skill score df
            metric_cluster_dict = skillscores[month].xs(
                (test_or_train, ss), level=(1, 2)).to_dict()
            #replace cluster_labels with their skill score
            cluster_labels_new = [
                metric_cluster_dict.get(x, x)
                for x in cluster_labels_org.values
            ]
            #set all non replaced values of cluster labels to np.nan
            cluster_labels_new = [
                np.nan if isinstance(x, np.int32) else x
                for x in cluster_labels_new
            ]

            #replace values
            xarr_labels_to_replace = ds
            xr_score[month_nr,
                     metric_nr] = find_precursors.view_or_replace_labels(
                         xarr_labels_to_replace,
                         regions=list(cluster_labels_org.values),
                         replacement_labels=cluster_labels_new)

    #set col wrap and subtitles
    # col_wrap (int) determines the number of columns
    import math
    subtitles = [[] for i in range(
        int(math.ceil(xr_score.target_month.values.size / col_wrap)))]
    total_nr_fields = col_wrap * len(subtitles)
    j = -1
    for i, month in enumerate(xr_score.target_month.values):
        if i % col_wrap == 0:
            j += 1
        subtitles[j].append('{}, {}'.format(month, metric))
        if i == max(
                list(enumerate(xr_score.target_month.values))
        )[0] and total_nr_fields > xr_score.target_month.values.size:
            for k in range(total_nr_fields -
                           xr_score.target_month.values.size):
                subtitles[j].append('0')

    #plot
    fig = plot_maps.plot_corr_maps(xr_score,
                                   col_dim='target_month',
                                   row_dim='metric',
                                   size=4,
                                   clevels=np.arange(-.5, 0.51, .1),
                                   cbar_vert=-0.1,
                                   hspace=-0.2,
                                   subtitles=subtitles,
                                   col_wrap=col_wrap)
    #%%
    return fig
Example #17
def sklearn_clustering(var_filename,
                       mask=None,
                       kwrgs_load={},
                       clustermethodkey='DBSCAN',
                       kwrgs_clust={'eps': 600}):

    if 'selbox' in kwrgs_load.keys():
        if kwrgs_load['selbox'] is not None:
            mask = kwrgs_load.pop('selbox')
            print('mask overwritten with selbox list; both selbox and mask '
                  'were given and both adapt the domain over which to cluster.')
    kwrgs_l_spatial = {}  # kwrgs affecting spatial extent/format
    if 'format_lon' in kwrgs_load.keys():
        kwrgs_l_spatial['format_lon'] = kwrgs_load['format_lon']

    xarray = core_pp.import_ds_lazy(var_filename, **kwrgs_l_spatial)
    npmask = get_spatial_ma(var_filename,
                            mask,
                            kwrgs_l_spatial=kwrgs_l_spatial)

    kwrgs_loop = {k: i for k, i in kwrgs_clust.items() if type(i) == list}
    kwrgs_loop.update({k: i for k, i in kwrgs_load.items() if type(i) == list})

    if len(kwrgs_loop) == 1:
        # insert fake axes
        kwrgs_loop['fake'] = [0]
    if len(kwrgs_loop) >= 1:
        new_coords = []
        xrclustered = xarray[0].drop('time')
        for k, list_v in kwrgs_loop.items():  # in alphabetical order
            new_coords.append(k)
            dim_coords = {str(k): list_v}
            xrclustered = xrclustered.expand_dims(dim_coords).copy()
        new_coords = [
            d for d in xrclustered.dims if d not in ['latitude', 'longitude']
        ]
        results = []
        first_loop = kwrgs_loop[new_coords[0]]
        second_loop = kwrgs_loop[new_coords[1]]
        for i, v1 in enumerate(first_loop):
            for j, v2 in enumerate(second_loop):
                kwrgs = adjust_kwrgs(kwrgs_clust.copy(), new_coords, v1, v2)
                kwrgs_l = adjust_kwrgs(kwrgs_load.copy(), new_coords, v1, v2)
                print(
                    f"\rclustering {new_coords[0]}: {v1}, {new_coords[1]}: {v2} ",
                    end="")
                xarray = functions_pp.import_ds_timemeanbins(
                    var_filename, **kwrgs_l)

                xrclustered[i, j], result = skclustering(
                    xarray,
                    npmask,
                    clustermethodkey=clustermethodkey,
                    kwrgs=kwrgs)
                results.append(result)
        if 'fake' in new_coords:
            xrclustered = xrclustered.squeeze().drop('fake').copy()
    else:
        xrclustered, results = skclustering(xarray,
                                            npmask,
                                            clustermethodkey=clustermethodkey,
                                            kwrgs=kwrgs_clust)
    xrclustered.attrs['method'] = clustermethodkey
    xrclustered.attrs['kwrgs'] = str(kwrgs_clust)
    xrclustered.attrs['target'] = f'{xarray.name}'
    if 'hash' not in xrclustered.attrs.keys():
        xrclustered.attrs['hash'] = uuid.uuid4().hex[:5]
    return xrclustered, results
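A sketch of the looping behaviour above: any list-valued entry in kwrgs_clust or kwrgs_load becomes a dimension of the returned xrclustered. The path and parameter values are illustrative, and passing tfreq through kwrgs_load to import_ds_timemeanbins is an assumption:

xrclustered, results = sklearn_clustering(
    '/path/to/preprocessed_field.nc', mask=[230, 300, 25, 50],
    kwrgs_load={'tfreq': [5, 15]},
    clustermethodkey='DBSCAN', kwrgs_clust={'eps': [400, 600, 800]})
# xrclustered now carries the extra dimensions 'eps' and 'tfreq'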

Example #18
rg = RGCPD(list_of_name_path=list_of_name_path,
           list_for_MI=list_for_MI,
           list_import_ts=None,
           start_end_TVdate=start_end_TVdate,
           start_end_date=start_end_date,
           start_end_year=start_end_year,
           path_outmain=path_out_main,
           append_pathsub=append_main)

rg.pp_precursors()
rg.pp_TV(anomaly=False, detrend=True)
rg.traintest(method)

ds = core_pp.import_ds_lazy(rg.list_precur_pp[0][1])
season = ds.resample(time='QS-DEC').mean()
#%% on post-processed (anomaly, detrended) SST
import climate_indices
df_ENSO, ENSO_yrs, df_states = climate_indices.ENSO_34(rg.list_precur_pp[0][1],
                                  rg.df_splits.copy(),
                                  get_ENSO_states=True)

cycle = df_states[[f'EN_cycle']].loc[0]
print('El Nino yrs', list(cycle[cycle=='EN0'].dropna().index.year))
cycle = df_states[[f'LN_cycle']].loc[0]
print('La Nina yrs', list(cycle[cycle=='LN0'].dropna().index.year))

#%% Composites of Anderson 2017 ENSO states

for title in ['EN-1', 'EN0', 'EN+1', 'LN-1', 'LN0', 'LN+1']:
Example #19
def dendogram_clustering(var_filename: str = None,
                         mask=None,
                         kwrgs_load={},
                         clustermethodkey='AgglomerativeClustering',
                         kwrgs_clust={
                             'q': 70,
                             'n_clusters': 3
                         },
                         n_cpu=None):
    '''


    Parameters
    ----------
    var_filename : str
        path to pre-processed Netcdf file.
    mask : [xr.DataArray, path to netcdf file with mask, list or tuple], optional
        See get_spatial_ma?. The default is None.
    kwrgs_load : dict, optional
        See functions_pp.import_ds_timemeanbins? for parameters. The default is {}.
    clustermethodkey : str, optional
        See cluster.cluster.__dict__ for all sklearn cluster algorithms.
        The default is 'AgglomerativeClustering'.
    kwrgs_clust : dict, optional
        Note that q is in percentiles, i.e. 50 refers to the median.
        The default is {'q':70, 'n_clusters':3}.

    Returns
    ------
    xrclustered : xr.DataArray
    results : list of sklearn cluster method instances.

    '''
    if n_cpu is None:
        n_cpu = multiprocessing.cpu_count() - 1

    if 'selbox' in kwrgs_load.keys():
        if kwrgs_load['selbox'] is not None:
            mask = kwrgs_load.pop('selbox')
            print('mask overwritten because both selbox and mask were given; '
                  'both adapt the domain over which to cluster.')
    kwrgs_l_spatial = {}  # kwrgs affecting spatial extent/format
    if 'format_lon' in kwrgs_load.keys():
        kwrgs_l_spatial['format_lon'] = kwrgs_load['format_lon']

    xarray = core_pp.import_ds_lazy(var_filename, **kwrgs_l_spatial)
    npmask = get_spatial_ma(var_filename,
                            mask,
                            kwrgs_l_spatial=kwrgs_l_spatial)

    kwrgs_loop = {k: i for k, i in kwrgs_clust.items() if type(i) == list}
    kwrgs_loop_load = {k: i for k, i in kwrgs_load.items() if type(i) == list}
    kwrgs_loop.update(kwrgs_loop_load)
    q = kwrgs_clust['q']

    if len(kwrgs_loop) == 1:
        # insert fake axes
        kwrgs_loop['fake'] = [0]
    if len(kwrgs_loop) >= 1:

        new_coords = []
        xrclustered = xarray[0].drop('time')
        for k, list_v in kwrgs_loop.items():  # in alphabetical order
            new_coords.append(k)
            dim_coords = {str(k): list_v}
            xrclustered = xrclustered.expand_dims(dim_coords).copy()
        new_coords = [
            d for d in xrclustered.dims if d not in ['latitude', 'longitude']
        ]
        results = []
        first_loop = kwrgs_loop[new_coords[0]]
        second_loop = kwrgs_loop[new_coords[1]]
        comb = [[v1, v2] for v1, v2 in product(first_loop, second_loop)]

        def generator(var_filename, xarray, comb, new_coords, kwrgs_clust,
                      kwrgs_load, q):
            for v1, v2 in comb:
                kwrgs = adjust_kwrgs(kwrgs_clust.copy(), new_coords, v1, v2)
                kwrgs_l = adjust_kwrgs(kwrgs_load.copy(), new_coords, v1, v2)
                del kwrgs['q']
                print(
                    f"clustering {new_coords[0]}: {v1}, {new_coords[1]}: {v2}")
                yield kwrgs, kwrgs_l, v1, v2

        def execute_to_dict(var_filename, npmask, v1, v2, q, clustermethodkey,
                            kwrgs, kwrgs_l):

            # if reload: # some param has been adjusted
            xarray_ts = functions_pp.import_ds_timemeanbins(
                var_filename, **kwrgs_l)
            if type(q) is int:
                xarray = binary_occurences_quantile(xarray_ts, q=q)
            if type(q) is list:
                xarray = binary_occurences_quantile(xarray_ts, q=v2)

            xrclusteredij, result = skclustering(
                xarray, npmask, clustermethodkey=clustermethodkey, kwrgs=kwrgs)
            return {f'{v1}..{v2}': (xrclusteredij, result)}

        if n_cpu > 1:
            futures = []
            for kwrgs, kwrgs_l, v1, v2 in generator(var_filename, xarray, comb,
                                                    new_coords, kwrgs_clust,
                                                    kwrgs_load, q):
                futures.append(
                    delayed(execute_to_dict)(var_filename, npmask, v1, v2, q,
                                             clustermethodkey, kwrgs, kwrgs_l))
            output = Parallel(n_jobs=n_cpu, backend='loky')(futures)
        else:
            output = []
            for kwrgs, kwrgs_l, v1, v2 in generator(var_filename, xarray, comb,
                                                    new_coords, kwrgs_clust,
                                                    kwrgs_load, q):
                output.append(
                    execute_to_dict(var_filename, npmask, v1, v2, q,
                                    clustermethodkey, kwrgs, kwrgs_l))

        # unpack the output once the loop over parameters is done
        for run in output:
            for key, out in run.items():
                v1, v2 = float(key.split('..')[0]), float(key.split('..')[1])
                i, j = first_loop.index(v1), second_loop.index(v2)
                xrclustered[i, j] = xr.DataArray(
                    out[0],
                    dims=['latitude', 'longitude'],
                    coords=[xarray.latitude, xarray.longitude])

        if 'fake' in new_coords:
            xrclustered = xrclustered.squeeze().drop('fake').copy()
    else:
        del kwrgs_clust['q']
        npclustered, results = skclustering(xarray,
                                            npmask,
                                            clustermethodkey=clustermethodkey,
                                            kwrgs=kwrgs_clust)
        xrclustered = xr.DataArray(npclustered,
                                   dims=['latitude', 'longitude'],
                                   coords=[xarray.latitude, xarray.longitude])
    print('\n')
    xrclustered.attrs['method'] = clustermethodkey
    xrclustered.attrs['kwrgs'] = str(kwrgs_clust)
    xrclustered.attrs[
        'target'] = f'{xarray.name}_exceedances_of_{q}th_percentile'
    if 'hash' not in xrclustered.attrs.keys():
        xrclustered.attrs['hash'] = uuid.uuid4().hex[:5]
    return xrclustered, results
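A usage sketch matching the docstring above: cluster exceedances for several quantiles and cluster counts in parallel. The path is a placeholder and the parameter values are illustrative:

xrclustered, results = dendogram_clustering(
    '/path/to/preprocessed_t2m.nc', mask=[230, 300, 25, 50],
    kwrgs_load={'tfreq': 15},
    clustermethodkey='AgglomerativeClustering',
    kwrgs_clust={'q': [80, 90, 95], 'n_clusters': [4, 6, 8]},
    n_cpu=4)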
Example #20
# In[3]:

rg.pp_precursors()

# In[ ]:

rg.list_precur_pp

var_filename = rg.list_precur_pp[0][1]
region = 'USCAnew'

#%%

import pandas as pd

ds = core_pp.import_ds_lazy(var_filename)
ds.sel(time=core_pp.get_subdates(pd.to_datetime(ds.time.values),
                                 start_end_date=('06-01', '08-31'))).mean(
                                     dim='time').plot()

#%%

if region == 'USCAnew':
    selbox = (230, 300, 25, 70)
    TVpath = os.path.join(path_data, 'tfreq15_nc7_dendo_57db0USCA.nc')
    # np_array_xy = np.array([[-97, 39], [-89, 39], [-82, 40],
    #                        [-116,36], [-122,41], [-117,46]])
    np_array_xy = np.array([[-96, 36], [-92, 41], [-84, 35], [-84, 41],
                            [-114, 36], [-120, 36], [-122, 44], [-118, 48]])
    t, c = 15, 7
# elif region == 'USCA':
Example #21
def percentile_cluster(var_filename,
                       xrclust,
                       q=75,
                       tailmean=True,
                       selbox=None):
    xarray = core_pp.import_ds_lazy(var_filename, selbox=selbox)
    labels = xrclust.values
    nparray = xarray.values
    n_t = xarray.time.size
    track_names = []
    area_grid = find_precursors.get_area(xarray)
    regions_for_ts = list(np.unique(labels[~np.isnan(labels)]))

    if tailmean:
        tmp_wgts = (area_grid / area_grid.mean())[:, :]
        a_wghts = np.tile(tmp_wgts[None, :], (n_t, 1, 1))
    else:
        a_wghts = area_grid / area_grid.mean()
    # this array will be the time series for each feature
    ts_clusters = np.zeros((xarray.shape[0], len(regions_for_ts)))

    # calculate area-weighted mean over labels
    for r in regions_for_ts:
        track_names.append(int(r))
        idx = regions_for_ts.index(r)
        # start with empty lonlat array
        B = np.zeros(xrclust.shape)
        # Mask everything except region of interest
        B[labels == r] = 1
        # Calculates how values inside region vary over time
        if tailmean == False:
            ts_clusters[:, idx] = np.nanpercentile(nparray[:, B == 1] *
                                                   a_wghts[B == 1],
                                                   q=q,
                                                   axis=1)
        elif tailmean:
            # calc percentile over space for each timestep; note we will
            # have a time-varying spatial mask.
            ts_clusters[:, idx] = np.nanpercentile(nparray[:, B == 1],
                                                   q=q,
                                                   axis=1)
            # take a mean over all gridpoints that pass the percentile instead
            # of taking the single percentile value of a spatial region
            mask_B_perc = nparray[:, B == 1] > ts_clusters[:, idx, None]
            # the number of gridcells that pass the percentile value is not
            # always the same at every timestep. When that happens, we can no
            # longer reshape the array to (time, space) and take the mean over
            # the spatial axis.
            # check if the mask has the same size at every timestep
            cs_ = [
                int(mask_B_perc[t][mask_B_perc[t]].shape[0])
                for t in range(n_t)
            ]
            if np.unique(cs_).size != 1:
                # what is the most common size:
                common_shape = cs_[np.argmax(
                    [cs_.count(v) for v in np.unique(cs_)])]

                # convert all masks to most common size by randomly
                # adding/removing a True
                for t in range(n_t):
                    while mask_B_perc[t][
                            mask_B_perc[t]].shape[0] < common_shape:
                        mask_B_perc[t][np.argwhere(
                            mask_B_perc[t] == False)[0][0]] = True
                    while mask_B_perc[t][
                            mask_B_perc[t]].shape[0] > common_shape:
                        mask_B_perc[t][np.argwhere(
                            mask_B_perc[t] == True)[0][0]] = False

            nptimespacefull = nparray[:, B == 1].reshape(nparray.shape[0], -1)
            npuppertail = nptimespacefull[mask_B_perc]
            wghtsuppertail = a_wghts[:, B == 1][mask_B_perc]

            y = np.nanmean(npuppertail.reshape(n_t,-1) * \
                            wghtsuppertail.reshape(n_t,-1), axis =1)

            ts_clusters[:, idx] = y
    xrts = xr.DataArray(ts_clusters.T,
                        coords={
                            'cluster': track_names,
                            'time': xarray.time
                        },
                        dims=['cluster', 'time'])
    return xrts
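The tailmean branch above keeps, per timestep, only the gridcells above the spatial q-th percentile and averages those. A self-contained numpy sketch of that idea (without the area weighting or the shape equalisation):

import numpy as np

rng = np.random.default_rng(0)
field = rng.normal(size=(4, 10))    # (time, space) values inside one cluster
thresh = np.nanpercentile(field, q=75, axis=1, keepdims=True)
tail_mask = field > thresh          # time-varying spatial mask
tail_mean = np.array([field[t, tail_mask[t]].mean() for t in range(field.shape[0])])
print(tail_mean)                    # one tail-mean value per timestep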
Example #22
def PDO_temp(filename, ex, df_splits=None):
    #%%
    '''
    The PDO is calculated from all data points in the training years.
    Subsequently, the PDO pattern is projected onto sst.sel(time=dates_train)
    to enable retrieving the PDO timeseries for a subset of the year.
    It is similarly projected onto dates_test.
    From https://climatedataguide.ucar.edu/climate-data/pacific-decadal-oscillation-pdo-definition-and-indices
    '''

    if df_splits is None:
        RV = ex[ex['RV_name']]
        df_splits, ex = functions_pp.rand_traintest_years(RV, ex)

    kwrgs_pp = {
        'selbox': {
            'la_min': 20,  # select domain in degrees east
            'la_max': 65,
            'lo_min': 115,
            'lo_max': 250
        },
        'format_lon': 'only_east'
    }
    ds = core_pp.import_ds_lazy(filename, **kwrgs_pp)

    to_freq = ex['tfreq']
    if to_freq != 1:
        ds, dates = functions_pp.time_mean_bins(ds,
                                                ex,
                                                to_freq=to_freq,
                                                seldays='all')
        ds['time'] = dates

    dates = pd.to_datetime(ds.time.values)

    splits = df_splits.index.levels[0]
    data = np.zeros((splits.size, ds.latitude.size, ds.longitude.size))
    PDO_patterns = xr.DataArray(
        data,
        coords=[splits, ds.latitude.values, ds.longitude.values],
        dims=['split', 'latitude', 'longitude'])
    list_splits = []
    for s in splits:

        progress = 100 * (s + 1) / splits.size
        dates_train = df_splits.loc[s]['TrainIsTrue'][df_splits.loc[s]
                                                      ['TrainIsTrue']].index
        train_yrs = np.unique(dates_train.year)
        dates_all_train = pd.to_datetime(
            [d for d in dates if d.year in train_yrs])
        dates_test = df_splits.loc[s]['TrainIsTrue'][
            ~df_splits.loc[s]['TrainIsTrue']].index
        n = dates_train.size
        r = int(100 * n / df_splits.loc[s].index.size)
        print(
            f"\rProgress PDO traintest set {progress}%, trainsize=({n}dp, {r}%)",
            end="")

        PDO_patterns[s], solver, adjust_sign = get_PDO(
            ds.sel(time=dates_all_train))

        PDO_patterns[s] = PDO_patterns[s].interpolate_na(dim='longitude')
        data_train = find_precursors.calc_spatcov(ds.sel(time=dates_train),
                                                  PDO_patterns[s])
        data_test = find_precursors.calc_spatcov(ds.sel(time=dates_test),
                                                 PDO_patterns[s])

        df_test = pd.DataFrame(data=data_test.values,
                               index=dates_test,
                               columns=['0_901_PDO'])
        df_train = pd.DataFrame(data=data_train.values,
                                index=dates_train,
                                columns=['0_901_PDO'])

        df = pd.concat([df_test, df_train]).sort_index()
        list_splits.append(df)

    df_PDO = pd.concat(list_splits, axis=0, keys=splits)
    #%%
    return df_PDO
Example #23
        'y': 1.0,
        'fontsize': 18
    }
}
save = True
# rg.plot_maps_corr(var='z500', save=save,
#                   min_detect_gc=min_detect_gc,
#                   kwrgs_plot=kwrgs_plot,
#                   append_str=''.join(map(str, z500_green_bb))+TV+str(cluster_label))

z500 = rg.list_for_MI[0]
xrvals, xrmask = RGCPD._get_sign_splits_masked(z500.corr_xr, min_detect_gc,
                                               z500.corr_xr['mask'])
g = plot_maps.plot_corr_maps(xrvals, xrmask, **kwrgs_plot)

ds = core_pp.import_ds_lazy(TVpathtemp)
xrclustered = find_precursors.view_or_replace_labels(ds['xrclustered'],
                                                     cluster_label)
g.axes[0, 0].contour(xrclustered.longitude,
                     xrclustered.latitude,
                     np.isnan(xrclustered),
                     transform=ccrs.PlateCarree(),
                     levels=[0, 2],
                     linewidths=2,
                     linestyles=['solid'],
                     colors=['white'])
filename = os.path.join(rg.path_outsub1,
                        'z500vsRW_' + ''.join(map(str, z500_green_bb)))
g.fig.savefig(filename + '.pdf', bbox_inches='tight')
g.fig.savefig(filename + '.jpg', dpi=300, bbox_inches='tight')
Example #24
                                                kwrgs_load={'selbox': selbox},
                                                level='Countries')
if domain == 'USCA':
    mask_US_CA = np.logical_or(xarray.values == Country.US,
                               xarray.values == Country.CA)
elif domain == 'US':
    mask_US_CA = xarray.values == Country.US
# xr_mask =  xarray.where(mask_US_CA)
xr_mask = xarray.where(make_country_mask.binary_erosion(mask_US_CA))
# xr_mask =  xarray.where(make_country_mask.binary_erosion(np.nan_to_num(xr_mask)))
xr_mask.values[~np.isnan(xr_mask)] = 1
xr_mask = find_precursors.xrmask_by_latlon(xr_mask, upper_right=(270, 63))
# mask small Western US Island
xr_mask = find_precursors.xrmask_by_latlon(xr_mask, bottom_left=(228, 58))
# add Rocky mask
geo_surf_height = core_pp.import_ds_lazy(
    orography, var='z_NON_CDM', selbox=selbox) / 9.81
geo_surf_height = geo_surf_height.drop('time').drop('realization')
plot_maps.plot_corr_maps(geo_surf_height,
                         cmap=plt.cm.Oranges,
                         clevels=np.arange(0, 2600, 500))
max_height = 1500
mask_Rockies = geo_surf_height < max_height
plot_maps.plot_labels(mask_Rockies)
xr_mask = xr_mask.where(mask_Rockies)

plot_maps.plot_labels(xr_mask)

# In[9]:
# =============================================================================
# Clustering co-occurrence of anomalies for different tfreqs
# =============================================================================
Example #25
def PDO(filename, ex, df_splits=None):
    #%%
    '''
    The PDO is calculated from all data points in the training years.
    Subsequently, the PDO pattern is projected onto sst.sel(time=dates_train)
    to enable retrieving the PDO timeseries for a subset of the year.
    It is similarly projected onto dates_test.
    From https://climatedataguide.ucar.edu/climate-data/pacific-decadal-oscillation-pdo-definition-and-indices
    See http://www.cgd.ucar.edu/staff/cdeser/docs/deser.sstvariability.annrevmarsci10.pdf
    '''
    t0 = time()
    if df_splits is None:
        RV = ex[ex['RV_name']]
        df_splits, ex = functions_pp.rand_traintest_years(RV, ex)

    kwrgs_pp = {
        'selbox': {
            'la_min': 20,  # select domain in degrees east
            'la_max': 65,
            'lo_min': 115,
            'lo_max': 250
        },
        'format_lon': 'only_east'
    }
    ds = core_pp.import_ds_lazy(filename, **kwrgs_pp)

    to_freq = ex['tfreq']
    if to_freq != 1:
        ds, dates = functions_pp.time_mean_bins(ds,
                                                ex,
                                                to_freq=to_freq,
                                                seldays='all')
        ds['time'] = dates

    dates = pd.to_datetime(ds.time.values)

    splits = df_splits.index.levels[0]
    data = np.zeros((splits.size, ds.latitude.size, ds.longitude.size))
    PDO_patterns = xr.DataArray(
        data,
        coords=[splits, ds.latitude.values, ds.longitude.values],
        dims=['split', 'latitude', 'longitude'])

    def PDO_single_split(s, ds, df_splits, PDO_patterns):
        progress = 100 * (s + 1) / splits.size
        dates_train = df_splits.loc[s]['TrainIsTrue'][df_splits.loc[s]
                                                      ['TrainIsTrue']].index
        train_yrs = np.unique(dates_train.year)
        dates_all_train = pd.to_datetime(
            [d for d in dates if d.year in train_yrs])
        ###        dates_train_yrs = ###
        dates_test = df_splits.loc[s]['TrainIsTrue'][
            ~df_splits.loc[s]['TrainIsTrue']].index
        n = dates_train.size
        r = int(100 * n / df_splits.loc[s].index.size)
        print(
            f"\rProgress PDO traintest set {progress}%, trainsize=({n}dp, {r}%)",
            end="")

        PDO_pattern, solver, adjust_sign = get_PDO(
            ds.sel(time=dates_all_train))
        data_train = find_precursors.calc_spatcov(ds.sel(time=dates_train),
                                                  PDO_patterns[s])
        data_test = find_precursors.calc_spatcov(ds.sel(time=dates_test),
                                                 PDO_patterns[s])

        df_test = pd.DataFrame(data=data_test.values,
                               index=dates_test,
                               columns=['0_901_PDO'])
        df_train = pd.DataFrame(data=data_train.values,
                                index=dates_train,
                                columns=['0_901_PDO'])

        df = pd.concat([df_test, df_train]).sort_index()
        return (df, PDO_pattern)

    pool = ProcessPoolExecutor(os.cpu_count() - 1)  # amount of cores - 1
    futures = [
        pool.submit(PDO_single_split, s, ds, df_splits, PDO_patterns)
        for s in splits
    ]
    results = [future.result() for future in futures]

    list_splits = [r[0] for r in results]

    time_ = time() - t0
    print(time_ / 60)

    for s in splits:
        PDO_patterns[s] = results[s][1]

    df_PDO = pd.concat(list_splits, axis=0, keys=splits)
    #%%
    return df_PDO, PDO_patterns
Example #26
def create_mask(path, kwrgs_load={}, level='Continents'):
    '''
    Parameters
    ----------
    path: str
        full path to netcdf file for which you want to create a mask
    kwrgs_load : dict, optional
        See core_pp.import_ds_lazy? for keyword arguments.
    level : str, optional
        'Countries' or 'Continents'. The default is 'Continents'.

    Returns
    -------
    xr.DataArray
        mask with labels for each country.

    '''
    f_name = os.path.splitext(path)[0].split('/')[-1]
    folder_file = '/'.join(os.path.splitext(path)[0].split('/')[:-1])
    mask_dir = os.path.join(folder_file, 'masks')
    if os.path.isdir(mask_dir) != True: os.makedirs(mask_dir)
    mask_file = os.path.join(mask_dir, f_name + '_' + level)
    if 'selbox' in kwrgs_load.keys():
        lo_min, lo_max, la_min, la_max = kwrgs_load['selbox']
        domainstr = '_lats[{}_{}]_lons[{}_{}]'.format(int(la_min), int(la_max),
                                                      int(lo_min), int(lo_max))
        mask_file = mask_file + domainstr
    if os.path.exists(mask_file + '.nc'):
        return core_pp.import_ds_lazy(mask_file + '.nc'), Country

    ds = core_pp.import_ds_lazy(path, **kwrgs_load)

    # Load Coordinates and Normalize to ShapeFile Coordinates
    coordinates = era_coordinate_grid(ds)
    coordinates[..., 0][coordinates[..., 0] > 180] -= 360

    # Take Center of Grid Cell as Coordinate
    coordinates[..., 0] += (coordinates[0, 1, 0] - coordinates[0, 0, 0]) / 2
    coordinates[..., 1] += (coordinates[1, 0, 1] - coordinates[0, 0, 1]) / 2

    # Create Mask
    if level == 'Continents':
        mask = Continent_mask(coordinates.reshape(-1, 2)).reshape(
            coordinates.shape[:2])
    elif level == 'Countries':
        mask = country_mask(coordinates.reshape(-1, 2)).reshape(
            coordinates.shape[:2])

    country_code = [{
        k: Country.__getitem__(k).value
    } for k in Country._member_names_]
    # np.save(mask_file+'.npy', mask)
    if 'time' in ds.dims:
        mask_xr = ds.isel(time=0).copy().drop('time')
    else:
        mask_xr = ds.copy()
    mask_xr.name = 'country_mask'
    for dic in country_code:
        key, value = list(dic.items())[0]
        mask_xr.attrs[key] = value


#    mask_xr.attrs = {'country_code': country_code}
    mask_xr.values = mask
    mask_xr = mask_xr.astype(int)
    mask_xr.to_netcdf(mask_file + '.nc', mode='w')

    return mask_xr, Country
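A usage sketch, mirroring the call in Example #14: build (or reload) a country-level mask for the loading domain and select the US. The path is a placeholder:

xr_countries, Country = create_mask(
    '/path/to/field.nc', kwrgs_load={'selbox': (225, 300, 20, 70)},
    level='Countries')
mask_US = xr_countries.values == Country.US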
Example #27
                         **kwrgs_plot)
plt.savefig(os.path.join(rg.path_outsub1, f'snapshots_{var}_rm{rm}.pdf'))
#%% Correlation PNA-like RW with Wavenumber 6 phase 2 # only for eastern
import core_pp, find_precursors
values = []
if west_or_east == 'eastern':
    lags_list = range(-10,10)
    for lag in lags_list:
        selbox = (0,360,25,60)
        # selbox = (140,300,20,73)
        tfreq = 1
        # lag = 0
        dates_RV = core_pp.get_subdates(pd.to_datetime(rg.fulltso.time.values),
                                       start_end_date=rg.start_end_TVdate)
        RV_ts = rg.fulltso.sel(time=dates_RV)
        ds_v300 = core_pp.import_ds_lazy(rg.list_precur_pp[1][1])
        dslocal = core_pp.get_selbox(ds_v300, selbox=selbox)



        datesRW = core_pp.get_subdates(pd.to_datetime(dslocal.time.values),
                                       start_end_date=rg.start_end_TVdate)
        datesRW = datesRW + pd.Timedelta(f'{lag}d')
        dslocal = dslocal.sel(time=datesRW)

        wv6local = core_pp.get_selbox(xarray.sel(lag=5), selbox=selbox)
        patternlocal = wv6local.mean(dim='lag')
        ts = find_precursors.calc_spatcov(dslocal, patternlocal)
        ts_15, d = functions_pp.time_mean_bins(ts, tfreq, start_end_date=start_end_TVdate,
                                                   closed_on_date=start_end_TVdate[-1])
        RV_15, d = functions_pp.time_mean_bins(RV_ts, tfreq, start_end_date=start_end_TVdate,
Example #28
def PDO(filepath, df_splits=None, n_jobs=1):
    #%%
    '''
    The PDO is calculated from all data points in the training years.
    Subsequently, the PDO pattern is projected onto sst.sel(time=dates_train)
    to enable retrieving the PDO timeseries for a subset of the year.
    It is similarly projected onto dates_test.
    From https://climatedataguide.ucar.edu/climate-data/pacific-decadal-oscillation-pdo-definition-and-indices
    See http://www.cgd.ucar.edu/staff/cdeser/docs/deser.sstvariability.annrevmarsci10.pdf

    selbox has format of (lon_min, lon_max, lat_min, lat_max)
    '''
    t0 = time()
    #    old format selbox
    #    {'la_min':20, # select domain in degrees east
    #     'la_max':70,
    #     'lo_min':115,
    #     'lo_max':250},

    kwrgs_pp = {'selbox': (115, 250, 20, 70), 'format_lon': 'only_east'}

    ds = core_pp.import_ds_lazy(filepath, **kwrgs_pp)
    ds_monthly = ds.resample(time='M',
                             restore_coord_dims=False).mean(dim='time',
                                                            skipna=True)
    # ds_global = core_pp.import_ds_lazy(filepath)
    # ds.mean(dim=('latitude','longitude')) # global mean SST anomaly each timestep

    if df_splits is None:
        print('No train-test split')
        iterables = [np.array([0]), pd.to_datetime(ds.time.values)]
        df_splits = pd.DataFrame(data=np.ones(ds.time.size),
                                 index=pd.MultiIndex.from_product(iterables),
                                 columns=['TrainIsTrue'],
                                 dtype=bool)
    splits = df_splits.index.levels[0]
    data = np.zeros((splits.size, ds.latitude.size, ds.longitude.size))
    PDO_patterns = xr.DataArray(
        data,
        coords=[splits, ds.latitude.values, ds.longitude.values],
        dims=['split', 'latitude', 'longitude'])

    if n_jobs > 1:
        with ProcessPoolExecutor(max_workers=os.cpu_count()) as pool:
            futures = [
                pool.submit(PDO_single_split, s, ds_monthly, ds, df_splits)
                for s in range(splits.size)
            ]
            results = [future.result() for future in futures]
    else:
        results = [
            PDO_single_split(s, ds_monthly, ds, df_splits)
            for s in range(splits.size)
        ]

    list_PDO_ts = [r[0] for r in results]

    elapsed = time() - t0
    print(f'PDO computed in {elapsed / 60:.1f} minutes')

    for s in splits:
        PDO_patterns[s] = results[s][1]

    df_PDO = pd.concat(list_PDO_ts, axis=0, keys=splits)
    # merge df_splits
    df_PDO = df_PDO.merge(df_splits, left_index=True, right_index=True)
    if splits.size > 1:
        # train test splits should not be equal
        assert float((df_PDO.loc[1] - df_PDO.loc[0]).mean()) != 0, (
            'something '
            'went wrong with train test splits')
    #%%
    return df_PDO, PDO_patterns
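
# Hedged usage sketch (not part of the original function): the SST path is a
# placeholder, not a file from the source. With df_splits=None the function
# builds a single pseudo train-test split, so df_PDO holds one PDO timeseries
# and PDO_patterns one spatial pattern.
if __name__ == '__main__':
    sst_file = './sst_daily_anomalies.nc'  # hypothetical preprocessed SST file
    df_PDO, PDO_patterns = PDO(sst_file, df_splits=None, n_jobs=1)
    print(df_PDO.head())
    print(PDO_patterns.dims)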
Example #29
0
def spatial_valid(var_filename,
                  mask,
                  y_pred_all,
                  y_pred_c,
                  lags_i=None,
                  seldates=None,
                  clusters=None,
                  kwrgs_events=None,
                  alpha=0.05,
                  n_boot=0,
                  blocksize=10,
                  threshold_pred='upper_clim'):
    '''
    var_filename must be a 3-d netcdf file containing a single variable.
    mask can be a netcdf file containing only a mask, or a lat-lon box in the
    format [west_lon, east_lon, south_lat, north_lat] (degrees east).
    '''
    # Debug overrides from the original script, kept here as examples only;
    # left active they would shadow the function arguments.
    # var_filename = '/Users/semvijverberg/surfdrive/Data_era5/input_raw/preprocessed/t2mmax_US_1979-2018_1jan_31dec_daily_0.25deg.nc'
    # mask = '/Users/semvijverberg/surfdrive/Data_era5/input_raw/preprocessed/cluster_output.nc'

    if lags_i is None:
        lags_i = list(y_pred_all.columns)

    # load in daily xarray and mask
    xarray = core_pp.import_ds_lazy(var_filename)
    npmask = cl.get_spatial_ma(var_filename, mask)

    # process temporal info
    freq = (y_pred_c.index[1] - y_pred_c.index[0]).days
    if seldates is None:
        seldates = aggr_to_daily_dates(y_pred_c.index)
    # derive start_end_date from seldates so it is also defined when seldates
    # is passed in explicitly
    start = f'{seldates[0].month}-{seldates[0].day}'
    end = f'{seldates[-1].month}-{seldates[-1].day}'
    start_end_date = (start, end)
    xarray, dates = functions_pp.time_mean_bins(xarray,
                                                to_freq=freq,
                                                start_end_date=start_end_date)

    # if switching to event timeseries:
    if kwrgs_events is None:
        kwrgs_events = {'event_percentile': 66}
    # unpack other optional arguments for defining event timeseries
    kwrgs = {
        key: item
        for key, item in kwrgs_events.items() if key != 'event_percentile'
    }

    if clusters is None:
        clusters = list(np.unique(npmask[~np.isnan(npmask)]))
    elif type(clusters) is int:
        clusters = [clusters]

    dict_allclus = {}
    for clus in clusters:

        latloni = np.where(npmask == clus)
        latloni = [(latloni[0][i], latloni[1][i])
                   for i in range(latloni[0].size)]

        futures = {}
        with ProcessPoolExecutor(max_workers=max_cpu) as pool:

            for ll in latloni:
                xr_gridcell = xarray.isel(latitude=ll[0]).isel(longitude=ll[1])
                threshold = func_fc.Ev_threshold(
                    xr_gridcell, kwrgs_events['event_percentile'])
                y_i = func_fc.Ev_timeseries(xr_gridcell, threshold, **kwrgs)[0]

                futures[ll] = pool.submit(valid.get_metrics_sklearn,
                                          y_i.values,
                                          y_pred_all[lags_i],
                                          y_pred_c,
                                          alpha=alpha,
                                          n_boot=n_boot,
                                          blocksize=blocksize,
                                          threshold_pred=threshold_pred)
        results = {key: future.result() for key, future in futures.items()}
        dict_allclus[clus] = results

    # use the last computed result as a template to read off the available metrics
    df_valid = dict_allclus[clus][ll][0]
    metrics = np.unique(df_valid.index.get_level_values(0))
    lags_tf = [l * freq for l in lags_i]
    if freq != 1:
        # the last day of the time-mean bin is tfreq/2 later than the centered day
        lags_tf = [
            l_tf - int(freq / 2) if l_tf != 0 else 0 for l_tf in lags_tf
        ]

    for clus in clusters:
        results = dict_allclus[clus]
        xroutput = xarray.isel(time=lags_i).rename({'time': 'lag'})
        xroutput['lag'] = lags_tf
        xroutput = xroutput.expand_dims({'metric': metrics}, 0)
        npdata = np.array(np.zeros_like(xroutput), dtype='float32')
        for ll in latloni:
            df_valid = dict_allclus[clus][ll][0]
            for i, met in enumerate(metrics):
                lat_i = ll[0]
                lon_i = ll[1]
                npdata[i, :, lat_i, lon_i] = df_valid.loc[met].loc[met]
        xroutput.values = npdata

    plot_maps.plot_corr_maps(xroutput.where(npmask == clus),
                             row_dim='metric',
                             size=4,
                             clevels=np.arange(-1, 1.1, 0.2))
    BSS = xroutput.where(npmask == clus).sel(metric='BSS')
    plot_maps.plot_corr_maps(BSS,
                             row_dim='metric',
                             size=4,
                             clevels=np.arange(-0.25, 0.251, 0.05),
                             cbar_vert=-0.1)
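
# Hedged usage sketch (not part of the original function): the netcdf paths are
# placeholders and y_pred_all / y_pred_c are the prediction DataFrames produced
# earlier in the forecasting pipeline. The call validates the forecasts grid
# cell by grid cell within each cluster of the mask.
# spatial_valid('./t2mmax_daily.nc', './cluster_output.nc',
#               y_pred_all, y_pred_c, lags_i=[0, 1], n_boot=0)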
Example #30
0
def ENSO_34(filepath, df_splits=None, get_ENSO_states: bool = True):
    #%%
    #    file_path = '/Users/semvijverberg/surfdrive/Data_era5/input_raw/sst_1979-2018_1_12_daily_2.5deg.nc'
    '''
    See http://www.cgd.ucar.edu/staff/cdeser/docs/deser.sstvariability.annrevmarsci10.pdf
    selbox has format of (lon_min, lon_max, lat_min, lat_max)
    '''

    # if df_splits is None:
    #     seldates = None
    # else:
    #     seldates = df_splits.loc[0].index

    #    {'la_min':-5, # select domain in degrees east
    #     'la_max':5,
    #     'lo_min':-170,
    #     'lo_max':-120},

    kwrgs_pp = {
        'selbox': (190, 240, -5, 5),
        'format_lon': 'only_east',
        'seldates': None
    }

    ds = core_pp.import_ds_lazy(filepath, **kwrgs_pp)
    dates = pd.to_datetime(ds.time.values)
    data = functions_pp.area_weighted(ds).mean(dim=('latitude', 'longitude'))
    df_ENSO = pd.DataFrame(data=data.values, index=dates, columns=['ENSO34'])
    if df_splits is not None:
        splits = df_splits.index.levels[0]
        df_ENSO = pd.concat([df_ENSO] * splits.size, axis=0, keys=splits)

    if get_ENSO_states:
        '''
        Following Anderson 2017 - Life cycles of agriculturally relevant ENSO
        teleconnections in North and South America.
        http://doi.wiley.com/10.1002/joc.4916
        A year is classified as El Niño / La Niña when the mean boreal wintertime
        (October, November, December) SST anomaly in the Niño 3.4 region exceeds
        1 standard deviation.
        '''
        if hasattr(df_ENSO.index, 'levels'):
            df_ENSO_s = df_ENSO.loc[0]
        else:
            df_ENSO_s = df_ENSO
        dates = df_ENSO_s.index
        df_3monthmean = df_ENSO_s.rolling(3, center=True, min_periods=1).mean()
        std_ENSO = df_3monthmean.std()
        OND, groups = core_pp.get_subdates(dates,
                                           start_end_date=('10-01', '12-31'),
                                           returngroups=True)
        OND_ENSO = df_3monthmean.loc[OND].groupby(groups).mean()
        nino_yrs = OND_ENSO[OND_ENSO > df_3monthmean.mean() +
                            std_ENSO][:].dropna().index  #+ 1
        nina_yrs = OND_ENSO[OND_ENSO < df_3monthmean.mean() -
                            std_ENSO][:].dropna().index  #+ 1
        neutral = [
            y for y in OND_ENSO.index
            if y not in core_pp.flatten([nina_yrs, nino_yrs])
        ]
        states = {}
        for i, d in enumerate(dates):
            if d.year in nina_yrs:
                states[d.year] = -1
            if d.year in neutral:
                states[d.year] = 0
            if d.year in nino_yrs:
                states[d.year] = 1

        cycle_list = []
        for s, v in [('EN', 1), ('LN', -1)]:
            ENSO_cycle = {d.year: 0 for d in dates}
            for i, year in enumerate(np.unique(dates.year)):
                # d = dates[1]
                # if states[year] == v:
                #     s = 'EN'
                # elif states[year] == -1:
                #     s = 'LN'
                if states[year] == v:
                    ENSO_cycle[year] = f'{s}0'
                    if year - 1 in dates.year and states[year - 1] != v:
                        ENSO_cycle[year - 1] = f'{s}-1'
                    if year + 1 in dates.year and states[year + 1] != v:
                        ENSO_cycle[year + 1] = f'{s}+1'
            cycle_list.append(ENSO_cycle)

        time_index = pd.to_datetime([f'{y}-01-01' for y in states.keys()])
        df_state = pd.concat([
            pd.Series(states),
            pd.Series(cycle_list[0]),
            pd.Series(cycle_list[1])
        ],
                             axis=1,
                             keys=['state', 'EN_cycle', 'LN_cycle'])
        df_state.index = time_index

        if hasattr(df_ENSO.index, 'levels'):  # copy to other traintest splits
            df_state = pd.concat([df_state] * splits.size, keys=splits)

        composites = np.zeros(3, dtype=object)
        for i, yrs in enumerate([nina_yrs, neutral, nino_yrs]):
            composite = [d for d in dates if d.year in yrs]
            composites[i] = ds.sel(time=composite).mean(dim='time')
        composites = xr.concat(composites, dim='state')
        composites['state'] = ['Nina', 'Neutral', 'Nino']

        plot_maps.plot_corr_maps(composites, row_dim='state', hspace=0.5)
        out = df_ENSO, [
            np.array(nina_yrs),
            np.array(neutral),
            np.array(nino_yrs)
        ], df_state
    else:
        out = df_ENSO
    #%%
    return out
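
# Hedged usage sketch (not part of the original function): the SST path is a
# placeholder. With get_ENSO_states=True the function returns the Nino 3.4
# timeseries, arrays of La Nina / neutral / El Nino years, and a per-year state
# DataFrame, and plots composite SST maps for the three states.
# df_ENSO, (nina_yrs, neutral_yrs, nino_yrs), df_state = ENSO_34(
#     './sst_daily_anomalies.nc', df_splits=None, get_ENSO_states=True)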