Example #1
    def get_HM_data(self, filepath, dim='latitude'):
        self.filepath = filepath
        self.dim = dim
        if self.seldates is not None:
            self.kwrgs_load['seldates'] = self.seldates_ext
            self.ds_seldates = functions_pp.import_ds_timemeanbins(self.filepath, **self.kwrgs_load)
            ds_name = self.ds_seldates.name

            if self.rollingmeanwindow is not None:
                # apply rolling mean
                self.ds = self.ds_seldates.rolling(time=self.rollingmeanwindow).mean()
            else:
                self.ds = self.ds_seldates
            # calculating std based on seldates
            self.std = self.ds.sel(time=self.seldates).std(dim='time')
            if self.t_test:
                self.ds_all = self.ds.sel(time=self.seldates)
            # now that we have std over seldates, select dates for HM
            self.ds = self.ds.sel(time=np.concatenate(self.event_lagged))
        else:
            self.kwrgs_load['seldates'] = np.concatenate(self.event_lagged)
            self.ds = functions_pp.import_ds_timemeanbins(self.filepath, **self.kwrgs_load)
            ds_name = self.ds.name

        if self.name is None:
            self.name = ds_name

        if 'units' in self.ds.attrs:
            self.units = self.ds.attrs['units']

        if self.standardize:
            self.units = 'std [-]'
            self.ds = self.ds / self.std

        if self.event_dates is not None:
            self.xarray = self.ds.copy().rename({'time':'lag'})
            self.xarray = self.xarray.assign_coords(lag=np.concatenate(self.lag_axes))
        else:
            self.xarray = self.ds

        if self.zoomdim is not None:
            xarray_w = self.xarray.sel(latitude=slice(self.zoomdim[0],
                                                      self.zoomdim[1]))
            xarray_w = functions_pp.area_weighted(xarray_w)
        else:
            xarray_w = functions_pp.area_weighted(self.xarray)
        xarray_meandim = xarray_w.mean(dim=dim)
        self.xr_HM = xarray_meandim.groupby('lag').mean()
        if self.t_test:
            full = (self.ds_all/self.std).mean(dim=dim)
            self.xr_mask = self.xr_HM.astype(bool).copy()
            pvals = np.zeros_like(self.xr_mask.values, dtype=float)
            for i, lag in enumerate(self.xr_mask.lag.values):
                sample = xarray_meandim.sel(lag=lag)
                T, p, mask = Welchs_t_test(sample, full, equal_var=False)
                pvals[i] = p
            self.xr_mask.values = pvals
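Welchs_t_test itself is not shown in this example. For reference, a minimal sketch of such a per-lag test built on SciPy could look like the following; the signature, the alpha threshold, and the mask logic are assumptions, not the original wrapper:

import numpy as np
from scipy import stats

def Welchs_t_test(sample, full, equal_var=False, alpha=0.05):
    # Welch's t-test: scipy's ttest_ind with equal_var=False does not
    # assume equal population variances (hence 'Welch').
    T, p = stats.ttest_ind(np.asarray(sample), np.asarray(full),
                           equal_var=equal_var)
    mask = p <= alpha  # True where the lagged composite differs significantly
    return T, p, mask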
Example #2
    def get_ts(self, tfreq_ts=1, df_splits=None):
        if df_splits is None:
            df_splits = self.df_splits
        splits = self.eofs['split'].values
        neofs  = self.eofs['eof'].values
        ds = functions_pp.import_ds_timemeanbins(self.filepath,
                                                tfreq=tfreq_ts,
                                                selbox=self.selbox,
                                                start_end_date=self.start_end_date,
                                                start_end_year=self.start_end_year)
        df_data_s = np.zeros(splits.size, dtype=object)
        dates = pd.to_datetime(ds['time'].values)
        for s in splits:
            dfs = pd.DataFrame(columns=neofs, index=dates)
            for i, e in enumerate(neofs):
                pattern = self.eofs.sel(split=s, eof=e)
                data = find_precursors.calc_spatcov(ds, pattern)
                dfs[e] = pd.Series(data.values, index=dates)
                if i == neofs.size-1:
                    # after the last EOF, attach the train/test split columns
                    dfs = dfs.merge(df_splits.loc[s], left_index=True,
                                    right_index=True)
            df_data_s[s] = dfs
        self.df = pd.concat(list(df_data_s), keys=range(splits.size))
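Because the last line concatenates the per-split DataFrames with keys=range(splits.size), self.df ends up with a (split, time) MultiIndex. A hedged access sketch (the instance name eof and the merged column names are assumptions):

# Hypothetical usage: retrieve the EOF time series of train-test split 0.
df_split0 = eof.df.loc[0]
# df_splits columns (e.g. TrainIsTrue / RV_mask) were merged in per split,
# so the pure EOF columns are everything else:
eof_cols = [c for c in df_split0.columns if c not in ('TrainIsTrue', 'RV_mask')]
ts_eof0 = df_split0[eof_cols]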
Example #3
def load_precursor(ex):
    #%%
    dates_all = ex['dates_all']
    # =============================================================================
    # Load Precursor
    # =============================================================================
    prec_filename = os.path.join(ex['path_pp'], ex['filename_precur'])
    Prec_reg = functions_pp.import_ds_timemeanbins(prec_filename,
                                                   ex['tfreq'],
                                                   loadleap=True,
                                                   to_xarr=False,
                                                   seldates=ex['dates_all'])
    Prec_reg = core_pp.convert_longitude(Prec_reg, 'only_east')
    if ex['add_lsm']:
        kwrgs_2d = {'selbox': ex['selbox'], 'format_lon': 'only_east'}
        lsm_filename = os.path.join(ex['path_mask'], ex['mask_file'])
        lsm = core_pp.import_ds_lazy(lsm_filename, **kwrgs_2d)

        Prec_reg['lsm'] = (('latitude', 'longitude'), (lsm < 0.3).values)
        Prec_reg = Prec_reg.where(Prec_reg['lsm'])

    if 'exclude_yrs' in ex.keys():
        if len(ex['exclude_yrs']) != 0:
            print('excluding yr(s): {} from analysis'.format(
                ex['exclude_yrs']))

            all_yrs = np.unique(dates_all.year)
            yrs_keep = [y for y in all_yrs if y not in ex['exclude_yrs']]
            idx_yrs = [
                i for i in np.arange(dates_all.year.size)
                if dates_all.year[i] in yrs_keep
            ]
            # boolean mask keeping only the non-excluded years
            mask_all = np.zeros(dates_all.size, dtype=bool)
            mask_all[idx_yrs] = True
            dates_keep = dates_all[mask_all]
            Prec_reg = Prec_reg.sel(time=dates_keep)

    #%%
    return Prec_reg, ex
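A minimal sketch of the settings dictionary load_precursor expects; the keys are the ones read in the body above, while all values and paths are placeholders:

import pandas as pd

# Hypothetical 'ex' dictionary; only the keys read by load_precursor are set.
ex = {'dates_all': pd.date_range('1979-01-01', '2018-12-31', freq='D'),
      'path_pp': '/data/preprocessed',   # placeholder directory
      'filename_precur': 'sst.nc',       # placeholder file
      'tfreq': 10,                       # aggregation window
      'add_lsm': False,                  # skip the land-sea-mask branch
      'exclude_yrs': []}                 # keep all years
Prec_reg, ex = load_precursor(ex)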
Example #4
    def load_and_aggregate_precur(self, kwrgs_load):
        '''
        Wrapper to load a NetCDF file and aggregate it to n-mean bins or to a
        period mean, e.g. a DJF mean (see seasonal_mode.ipynb).

        Parameters
        ----------
        kwrgs_load : dict
            dictionary passed to functions_pp.import_ds_timemeanbins or
            to functions_pp.time_mean_periods.

        Returns
        -------
        None.

        '''
        name = self.name
        filepath = self.filepath

        # =============================================================================
        # Unpack non-default arguments
        # =============================================================================
        kwrgs = {'selbox': self.selbox, 'dailytomonths': self.dailytomonths}
        for key, value in kwrgs_load.items():
            if isinstance(value, list):
                # format [default, {var_name: value}] allows per-variable overrides
                kwrgs[key] = value[1].get(name, value[0])
            elif hasattr(self, key):
                # overwrite RGCPD parameters with MI-specific parameters
                kwrgs[key] = self.__dict__[key]
            else:
                kwrgs[key] = value
        if self.lag_as_gap:
            kwrgs['tfreq'] = 1
        self.kwrgs_load = kwrgs.copy()
        #===========================================
        # Precursor field
        #===========================================
        self.precur_arr = functions_pp.import_ds_timemeanbins(
            filepath, **kwrgs)

        if isinstance(self.lags[0], np.ndarray):
            self.precur_arr = functions_pp.time_mean_periods(
                self.precur_arr, self.lags, kwrgs_load['start_end_year'])
        return
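The unpacking loop above supports a convention where a kwrgs_load entry may be a two-element list [default, {var_name: value}], letting one variable override the default. A minimal sketch of just that convention (the variable name 'sst' and all values are hypothetical):

# Hypothetical kwrgs_load illustrating the [default, {name: override}] format.
kwrgs_load = {'selbox': [(230, 300, 25, 60), {'sst': (120, 260, -10, 60)}],
              'tfreq': 15}
name = 'sst'
kwrgs = {}
for key, value in kwrgs_load.items():
    if isinstance(value, list):
        kwrgs[key] = value[1].get(name, value[0])  # per-variable override
    else:
        kwrgs[key] = value
# kwrgs -> {'selbox': (120, 260, -10, 60), 'tfreq': 15}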
Example #5
        def execute_to_dict(var_filename, npmask, v1, v2, q, clustermethodkey,
                            kwrgs, kwrgs_l):

            xarray_ts = functions_pp.import_ds_timemeanbins(
                var_filename, **kwrgs_l)
            if isinstance(q, int):
                xarray = binary_occurences_quantile(xarray_ts, q=q)
            elif isinstance(q, list):
                # when q is a list of quantiles, v2 holds this iteration's value
                xarray = binary_occurences_quantile(xarray_ts, q=v2)

            xrclusteredij, result = skclustering(
                xarray, npmask, clustermethodkey=clustermethodkey, kwrgs=kwrgs)
            return {f'{v1}..{v2}': (xrclusteredij, result)}
Example #6
def calculate_corr_maps(RV,
                        df_splits,
                        kwrgs_load,
                        list_varclass=None,
                        lags=[0],
                        alpha=0.05,
                        FDR_control=True,
                        plot=True):

    #%%
    if list_varclass is None:
        list_varclass = []

    outdic_actors = dict()

    class act:
        def __init__(self, name, corr_xr, precur_arr):
            self.name = name
            self.corr_xr = corr_xr
            self.precur_arr = precur_arr
            self.lat_grid = precur_arr.latitude.values
            self.lon_grid = precur_arr.longitude.values
            self.area_grid = rgcpd.get_area(precur_arr)
            self.grid_res = abs(self.lon_grid[1] - self.lon_grid[0])

    for var_class in list_varclass:  # loop over all variables
        #===========================================
        # 3c) Precursor field
        #===========================================
        file_path = os.path.join(var_class.path_pp, var_class.filename_pp)
        precur_arr = functions_pp.import_ds_timemeanbins(file_path, **kwrgs_load)
        # =============================================================================
        # Calculate correlation
        # =============================================================================
        corr_xr = rgcpd.calc_corr_coeffs_new(precur_arr,
                                             RV,
                                             df_splits,
                                             lags=lags,
                                             alpha=alpha,
                                             FDR_control=FDR_control)

        # =============================================================================
        # Cluster into precursor regions
        # =============================================================================
        actor = act(var_class.name, corr_xr, precur_arr)

        outdic_actors[actor.name] = actor

    return outdic_actors
Example #7
def calculate_corr_maps(TV, df_splits, kwrgs_load, list_precur_pp=None,
                        lags=np.array([1]), alpha=0.05, FDR_control=True):
    '''
    tfreq : aggregate precursors into bins of window size = tfreq
    selbox : tuple of
            (degrees_east, degrees_west, degrees_south, degrees_north)
    loadleap : whether leap days should be loaded (yes/no)
    seldates : load only this selection of dates

    '''
    #%%
    if list_precur_pp is None:
        list_precur_pp = []

    outdic_precur = dict()
    class act:
        def __init__(self, name, corr_xr, precur_arr):
            self.name = name
            self.corr_xr = corr_xr
            self.precur_arr = precur_arr
            self.lat_grid = precur_arr.latitude.values
            self.lon_grid = precur_arr.longitude.values
            self.area_grid = get_area(precur_arr)
            self.grid_res = abs(self.lon_grid[1] - self.lon_grid[0])


    for name, filepath in list_precur_pp: # loop over all variables
        #===========================================
        # 3c) Precursor field
        #===========================================
        precur_arr = functions_pp.import_ds_timemeanbins(filepath, **kwrgs_load)
        # =============================================================================
        # Calculate correlation
        # =============================================================================
        corr_xr = calc_corr_coeffs_new(precur_arr, TV, df_splits, lags=lags,
                                       alpha=alpha, FDR_control=FDR_control)

        # =============================================================================
        # Cluster into precursor regions
        # =============================================================================
        actor = act(name, corr_xr, precur_arr)

        outdic_precur[actor.name] = actor

    return outdic_precur
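A minimal usage sketch; the precursor name, the file path, and the origin of TV, df_splits and kwrgs_load are placeholders, not values from the source:

# Hypothetical call: one precursor, correlated at lags 0 and 1.
list_precur_pp = [('sst', 'preprocessed/sst_1979-2018_daily_2.5deg.nc')]
outdic_precur = calculate_corr_maps(TV, df_splits, kwrgs_load,
                                    list_precur_pp=list_precur_pp,
                                    lags=np.array([0, 1]),
                                    alpha=0.05, FDR_control=True)
sst = outdic_precur['sst']     # act instance; sst.corr_xr holds the corr maps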
Example #8
def sklearn_clustering(var_filename,
                       mask=None,
                       kwrgs_load={},
                       clustermethodkey='DBSCAN',
                       kwrgs_clust={'eps': 600}):

    if 'selbox' in kwrgs_load.keys():
        if kwrgs_load['selbox'] is not None:
            mask = kwrgs_load.pop('selbox')
            print('mask overwritten with selbox list. Both selbox and mask '
                  'are given; both adapt the domain over which to cluster')
    kwrgs_l_spatial = {}  # kwrgs affecting spatial extent/format
    if 'format_lon' in kwrgs_load.keys():
        kwrgs_l_spatial['format_lon'] = kwrgs_load['format_lon']

    xarray = core_pp.import_ds_lazy(var_filename, **kwrgs_l_spatial)
    npmask = get_spatial_ma(var_filename,
                            mask,
                            kwrgs_l_spatial=kwrgs_l_spatial)

    # every parameter given as a list becomes a loop dimension
    kwrgs_loop = {k: v for k, v in kwrgs_clust.items() if isinstance(v, list)}
    kwrgs_loop.update({k: v for k, v in kwrgs_load.items()
                       if isinstance(v, list)})

    if len(kwrgs_loop) == 1:
        # insert fake axes
        kwrgs_loop['fake'] = [0]
    if len(kwrgs_loop) >= 1:
        new_coords = []
        xrclustered = xarray[0].drop('time')
        for k, list_v in kwrgs_loop.items():  # in alphabetical order
            new_coords.append(k)
            dim_coords = {str(k): list_v}
            xrclustered = xrclustered.expand_dims(dim_coords).copy()
        new_coords = [
            d for d in xrclustered.dims if d not in ['latitude', 'longitude']
        ]
        results = []
        first_loop = kwrgs_loop[new_coords[0]]
        second_loop = kwrgs_loop[new_coords[1]]
        for i, v1 in enumerate(first_loop):
            for j, v2 in enumerate(second_loop):
                kwrgs = adjust_kwrgs(kwrgs_clust.copy(), new_coords, v1, v2)
                kwrgs_l = adjust_kwrgs(kwrgs_load.copy(), new_coords, v1, v2)
                print(
                    f"\rclustering {new_coords[0]}: {v1}, {new_coords[1]}: {v2} ",
                    end="")
                xarray = functions_pp.import_ds_timemeanbins(
                    var_filename, **kwrgs_l)

                xrclustered[i, j], result = skclustering(
                    xarray,
                    npmask,
                    clustermethodkey=clustermethodkey,
                    kwrgs=kwrgs)
                results.append(result)
        if 'fake' in new_coords:
            xrclustered = xrclustered.squeeze().drop('fake').copy()
    else:
        xrclustered, results = skclustering(xarray,
                                            npmask,
                                            clustermethodkey=clustermethodkey,
                                            kwrgs=kwrgs_clust)
    xrclustered.attrs['method'] = clustermethodkey
    xrclustered.attrs['kwrgs'] = str(kwrgs_clust)
    xrclustered.attrs['target'] = f'{xarray.name}'
    if 'hash' not in xrclustered.attrs.keys():
        xrclustered.attrs['hash'] = uuid.uuid4().hex[:5]
    return xrclustered, results
Example #9
def sklearn_clustering(var_filename: str,
                       mask: Optional[np.ndarray] = None,
                       dimension: str = 'temporal',
                       kwrgs_load: Dict = {},
                       clustermethodkey: str = 'DBSCAN',
                       kwrgs_clust: Dict = {'eps': 600}):
    """
    Performs clustering for combinations of clustering parameters
    (kwrgs_clust) and variable-loading parameters (kwrgs_load).

    Parameters
    ----------

    var_filename : str
        path to .nc file

    mask : 2-d numpy.ndarray, optional
        mask with spatial coordinates to be clustered, the default is None

    dimension : str
        temporal or spatial, the default is temporal

    kwrgs_load : dict, optional
        keywords for loading variable, the default is {}; see functions_pp.import_ds_timemeanbins? for parameters

    clustermethodkey : str
        Built upon sklearn clustering. Available techniques are listed in
        sklearn.cluster.__dict__, e.g. KMeans or AgglomerativeClustering.
        See kwrgs_clust for algorithm parameters.

    kwrgs_clust : dict
        (algorithm dependent) dictionary of clustering parameters, the default is {'eps': 600}

    Returns
    -------
    xr_temporal: list of temporally clustered xarray objects
    xrclustered: spatially clustered xarray object
    results: list of sklearn.cluster objects
    """

    if 'selbox' in kwrgs_load.keys():
        assert isinstance(kwrgs_load['selbox'], list), 'selbox is not a list'
        assert len(kwrgs_load['selbox']) == 4, (
            'selbox list does not have shape [lon_min, lon_max, lat_min, lat_max]')

        # we can either give a mask for coordinates or just select a box with coordinates
        if kwrgs_load['selbox'] is not None and mask is not None:
            print('Mask overwritten with selbox list. Both selbox and mask '
                  'are given; both adapt the domain over which to cluster')
            mask = None

    kwrgs_l_spatial = {}  # kwrgs affecting spatial extent/format
    if 'format_lon' in kwrgs_load.keys():
        kwrgs_l_spatial['format_lon'] = kwrgs_load['format_lon']

    if 'selbox' in kwrgs_load.keys():
        kwrgs_l_spatial['selbox'] = kwrgs_load['selbox']

    # here we import an .nc file and convert it into an xarray object
    xarray = core_pp.import_ds_lazy(var_filename, **kwrgs_l_spatial)

    # here we create a numpy array mask for coordinates selected using mask (or selbox if selbox in kwrgs_load())
    npmask = get_spatial_ma(var_filename,
                            mask,
                            kwrgs_l_spatial=kwrgs_l_spatial)

    # every parameter given as a list becomes a loop dimension
    kwrgs_loop = {k: v for k, v in kwrgs_clust.items() if isinstance(v, list)}
    kwrgs_loop.update({k: v for k, v in kwrgs_load.items()
                       if isinstance(v, list)})
    if 'selbox' in kwrgs_loop.keys():
        kwrgs_loop.pop('selbox')

    if len(kwrgs_loop) == 1:
        # insert fake axes
        kwrgs_loop['fake'] = [0]

    if len(kwrgs_loop) >= 1:
        new_coords = []

        if dimension == 'spatial':
            xrclustered = xarray[0].drop('time')
        else:
            xrclustered = xarray

        for k, list_v in kwrgs_loop.items():  # in alphabetical order

            # new_coords contains keys from kwrgs_clust and kwrgs_load
            new_coords.append(k)

            # in every iteration of the loop, we create a dictionary using key and value from kwrgs_clust and kwrgs_load
            dim_coords = {str(k): list_v}

            # expanding the xarray dataset by dim_coords dictionaries
            xrclustered = xrclustered.expand_dims(dim_coords).copy()

        # create a list of coordinates/dimensions added in the for loop above (from kwrgs_clust and kwrgs_load)
        new_coords = [
            d for d in xrclustered.dims
            if d not in ['latitude', 'longitude', 'time']
        ]

        # to store sklearn objects
        results = []

        # separating kwrgs into lists to loop over
        first_loop = kwrgs_loop[new_coords[0]]
        second_loop = kwrgs_loop[new_coords[1]]

        if dimension == 'temporal':
            xr_temporal = np.empty(
                [len(first_loop), len(second_loop)], dtype=object)

        # loop over kwrgs_load and kwrgs_clust values
        for i, v1 in enumerate(first_loop):
            for j, v2 in enumerate(second_loop):

                # plug this (v1, v2) combination into the clustering kwargs
                kwrgs = adjust_kwrgs(kwrgs_clust.copy(), new_coords, v1, v2)
                # and into the loading kwargs
                kwrgs_l = adjust_kwrgs(kwrgs_load.copy(), new_coords, v1, v2)

                if 'tfreq' in kwrgs_l.keys():
                    assert isinstance(kwrgs_l['tfreq'],
                                      int), 'tfreq is not an integer'

                print(
                    f"\rclustering {new_coords[0]}: {v1}, {new_coords[1]}: {v2} ",
                    end="")
                xarray = functions_pp.import_ds_timemeanbins(
                    var_filename, **kwrgs_l)

                # updating xarray object and results - here change for supervised/unsupervised clustering
                if dimension == 'spatial':
                    xrclustered[i, j], result = skclustering(
                        xarray,
                        npmask,
                        clustermethodkey=clustermethodkey,
                        kwrgs=kwrgs,
                        dimension=dimension)
                else:
                    xr_temporal[i, j], result = skclustering(
                        xarray,
                        npmask,
                        clustermethodkey=clustermethodkey,
                        kwrgs=kwrgs,
                        dimension=dimension)

                    # storing arbitrary metadata for temporal clustering
                    xr_temporal[i, j].attrs['method'] = clustermethodkey
                    xr_temporal[i, j].attrs['target'] = f'{xarray.name}'
                    if new_coords[0] != 'fake':
                        xr_temporal[i, j].attrs[new_coords[0]] = v1
                    xr_temporal[i, j].attrs[new_coords[1]] = v2
                    if 'hash' not in xr_temporal[i, j].attrs.keys():
                        xr_temporal[i, j].attrs['hash'] = uuid.uuid4().hex[:5]

                results.append(result)

        if 'fake' in new_coords and dimension == 'spatial':
            xrclustered = xrclustered.squeeze().drop('fake').copy()

    else:
        xrclustered, results = skclustering(xarray,
                                            npmask,
                                            clustermethodkey=clustermethodkey,
                                            kwrgs=kwrgs_clust,
                                            dimension=dimension)
        return xrclustered, results

    # storing arbitrary metadata for spatial clustering
    xrclustered.attrs['method'] = clustermethodkey
    xrclustered.attrs['kwrgs'] = str(kwrgs_clust)
    xrclustered.attrs['target'] = f'{xarray.name}'
    if 'hash' not in xrclustered.attrs.keys():
        xrclustered.attrs['hash'] = uuid.uuid4().hex[:5]

    if dimension == 'temporal':
        return xr_temporal, results
    # dimension == 'spatial'
    else:
        return xrclustered, results
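A minimal usage sketch for the extended version above; the path and parameter values are hypothetical. Passing two parameters as lists turns them into loop dimensions, so the function clusters every combination:

# Hypothetical call: spatial clustering over all combinations of
# n_clusters (kwrgs_clust) and tfreq (kwrgs_load).
xrclustered, results = sklearn_clustering(
    'preprocessed/precip.nc',                    # placeholder path
    dimension='spatial',
    kwrgs_load={'tfreq': [5, 15]},               # list -> loop dimension
    clustermethodkey='AgglomerativeClustering',  # any sklearn.cluster class
    kwrgs_clust={'n_clusters': [4, 6]})          # list -> loop dimension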
Example #10
def sklearn_clustering(var_filename,
                       mask=None,
                       kwrgs_load={},
                       clustermethodkey='DBSCAN',
                       kwrgs_clust={'eps': 600}):

    if 'selbox' in kwrgs_load.keys():
        kwrgs_l = dict(selbox=kwrgs_load['selbox'])
    else:
        kwrgs_l = {}
    xarray = core_pp.import_ds_lazy(var_filename, **kwrgs_l)

    if 'selbox' in kwrgs_l.keys() and mask is None:
        npmask = np.ones_like(xarray[0].values, dtype=bool)
    else:
        npmask = get_spatial_ma(var_filename, mask)

    # every parameter given as a list becomes a loop dimension
    kwrgs_loop = {k: v for k, v in kwrgs_clust.items() if isinstance(v, list)}
    kwrgs_loop.update({k: v for k, v in kwrgs_load.items()
                       if isinstance(v, list)})

    if len(kwrgs_loop) == 1:
        # insert fake axes
        kwrgs_loop['fake'] = [0]
    if len(kwrgs_loop) >= 1:
        new_coords = []
        xrclustered = xarray[0].drop('time')
        for k, list_v in kwrgs_loop.items():  # in alphabetical order
            new_coords.append(k)
            dim_coords = {str(k): list_v}
            xrclustered = xrclustered.expand_dims(dim_coords).copy()
        new_coords = [
            d for d in xrclustered.dims if d not in ['latitude', 'longitude']
        ]
        results = []
        first_loop = kwrgs_loop[new_coords[0]]
        second_loop = kwrgs_loop[new_coords[1]]
        for i, v1 in enumerate(first_loop):
            for j, v2 in enumerate(second_loop):
                kwrgs = adjust_kwrgs(kwrgs_clust.copy(), new_coords, v1, v2)
                kwrgs_l = adjust_kwrgs(kwrgs_load.copy(), new_coords, v1, v2)
                print(
                    f"\rclustering {new_coords[0]}: {v1}, {new_coords[1]}: {v2} ",
                    end="")
                xarray = functions_pp.import_ds_timemeanbins(
                    var_filename, **kwrgs_l)

                xrclustered[i, j], result = skclustering(
                    xarray,
                    npmask,
                    clustermethodkey=clustermethodkey,
                    kwrgs=kwrgs)
                results.append(result)
        if 'fake' in new_coords:
            xrclustered = xrclustered.squeeze().drop('fake').copy()
    else:
        xrclustered, results = skclustering(xarray,
                                            npmask,
                                            clustermethodkey=clustermethodkey,
                                            kwrgs=kwrgs_clust)
    xrclustered.attrs['method'] = clustermethodkey
    xrclustered.attrs['kwrgs'] = str(kwrgs_clust)
    xrclustered.attrs['target'] = f'{xarray.name}'
    if 'hash' not in xrclustered.attrs.keys():
        xrclustered.attrs['hash'] = uuid.uuid4().hex[:5]
    return xrclustered, results
Example #11
def calculate_corr_maps(ex, map_proj):
    #%%
    # =============================================================================
    # Load the 'ex' dictionary with information on the pre-processed data
    # (variables, paths, filenames, etc.) and add RGCPD/Tigramite experiment settings
    # =============================================================================
    # Response Variable is what we want to predict
    RV = ex[ex['RV_name']]
    # time-cycle of data: total timesteps in one year
    ex['time_cycle'] = RV.dates[RV.dates.year == RV.startyear].size
    ex['time_range_all'] = [0, RV.dates.size]
    #==================================================================================
    # Start of experiment
    #==================================================================================

    # Define traintest:
    df_RVfullts = pd.DataFrame(RV.RVfullts.values,
                               index=pd.to_datetime(RV.RVfullts.time.values))
    df_RV_ts = pd.DataFrame(RV.RV_ts.values,
                            index=pd.to_datetime(RV.RV_ts.time.values))
    if ex['method'].startswith('ran_strat'):
        kwrgs_events = ex['kwrgs_events']
        RV = func_fc.RV_class(df_RVfullts, df_RV_ts, kwrgs_events)
    else:
        RV = func_fc.RV_class(df_RVfullts, df_RV_ts)
    if ex['import_prec_ts']:
        # Retrieve same train test split as imported ts
        path_data = ''.join(ex['precursor_ts'][0][1])
        df_splits = func_fc.load_hdf5(
            path_data)['df_data'].loc[:, ['TrainIsTrue', 'RV_mask']]
        test_yrs = functions_pp.get_testyrs(df_splits)
        df_splits, ex = functions_pp.rand_traintest_years(RV, ex, test_yrs)
        assert (np.equal(test_yrs,
                         ex['tested_yrs'])).all(), "Train test split not equal"
    else:
        df_splits, ex = functions_pp.rand_traintest_years(RV, ex)

    # =============================================================================
    # 2) DEFINE PRECURSOR COMMUNITIES:
    # =============================================================================
    # - calculate and plot pattern correlation for different fields
    # - create time-series over these regions
    #=====================================================================================
    outdic_actors = dict()

    class act:
        def __init__(self, name, corr_xr, precur_arr):
            self.name = name
            self.corr_xr = corr_xr
            self.precur_arr = precur_arr
            self.lat_grid = precur_arr.latitude.values
            self.lon_grid = precur_arr.longitude.values
            self.area_grid = rgcpd.get_area(precur_arr)
            self.grid_res = abs(self.lon_grid[1] - self.lon_grid[0])

    allvar = ex['vars'][0]  # list of all variable names
    for var in allvar[ex['excludeRV']:]:  # loop over all variables
        actor = ex[var]
        #===========================================
        # 3c) Precursor field
        #===========================================
        file_path = os.path.join(actor.path_pp, actor.filename_pp)
        precur_arr = functions_pp.import_ds_timemeanbins(file_path, ex)
        # =============================================================================
        # Calculate correlation
        # =============================================================================
        corr_xr = rgcpd.calc_corr_coeffs_new(precur_arr, RV, ex)

        # =============================================================================
        # Cluster into precursor regions
        # =============================================================================
        actor = act(var, corr_xr, precur_arr)
        actor, ex = rgcpd.cluster_DBSCAN_regions(actor, ex)
        if not np.isnan(actor.prec_labels.values).all():
            rgcpd.plot_regs_xarray(actor.prec_labels.copy(), ex)
        outdic_actors[var] = actor
        # =============================================================================
        # Plot
        # =============================================================================
        if not ex['plotin1fig']:
            plot_maps.plot_corr_maps(corr_xr, corr_xr['mask'], map_proj)
            fig_filename = '{}_corr_{}_vs_{}'.format(
                ex['params'], ex['RV_name'], var) + ex['file_type2']
            plt.savefig(os.path.join(ex['fig_path'], fig_filename),
                        bbox_inches='tight',
                        dpi=200)
            if not ex['showplot']:
                plt.close()

#%%
    return ex, outdic_actors
Example #12
    def get_pattern(self, filepath, df_splits=None):
        # filepath = '/Users/semvijverberg/surfdrive/ERA5/input_raw/preprocessed/sst_1979-2018_1jan_31dec_daily_2.5deg.nc'
        self.filepath = filepath

        if self.tfreq_EOF == 'monthly':
            self.ds_EOF = functions_pp.import_ds_timemeanbins(
                self.filepath, tfreq=1,
                selbox=self.selbox,
                dailytomonths=True,
                start_end_date=self.start_end_date,
                start_end_year=self.start_end_year)
        else:
            self.ds_EOF = functions_pp.import_ds_timemeanbins(
                self.filepath,
                tfreq=self.tfreq_EOF,
                selbox=self.selbox,
                start_end_date=self.start_end_date,
                start_end_year=self.start_end_year,
                closed_on_date=self.start_end_date[-1])
        if self.name is None:
            if hasattr(self.ds_EOF, 'name'):
                # take name of variable
                self.name = self.ds_EOF.name

        if df_splits is None:
            print('no train test splits for fitting EOF')
            data = np.zeros( (1, self.neofs, self.ds_EOF.latitude.size, self.ds_EOF.longitude.size) )
            coords = [[0], [f'EOF{n}_'+self.name for n in range(self.neofs)],
                      self.ds_EOF.latitude.values, self.ds_EOF.longitude.values]
            self.eofs = xr.DataArray(data,
                                coords=coords,
                                dims = ['split', 'eof', 'latitude', 'longitude'])
            solvers = []
            self.eofs[0,:], solver = self._get_EOF_xarray(self.ds_EOF, self.neofs)
            solvers.append(solver)
            self.solvers = solvers
        else:
            self.df_splits = df_splits
            splits = df_splits.index.levels[0]
            func = self._get_EOF_xarray
            if self.n_cpu > 1:
                try:
                    with ProcessPoolExecutor(max_workers=os.cpu_count()) as pool:
                        futures = []
                        for s in range(splits.size):
                            progress = int(100 * (s+1) / splits.size)
                            print(f"\rProgress traintest set {progress}%", end="")
                            futures.append(pool.submit(self._single_split, func,
                                                       self.ds_EOF, s, df_splits,
                                                       self.neofs))
                        # collect results only after all splits are submitted
                        results = [future.result() for future in futures]
                except Exception:
                    # fall back to serial computation
                    results = [self._single_split(func, self.ds_EOF, s,
                                                  df_splits, self.neofs)
                               for s in range(splits.size)]
            else:
                results = [self._single_split(func, self.ds_EOF, s, df_splits, self.neofs) for s in range(splits.size)]
            # unpack results
            data = np.zeros( (splits.size, self.neofs, self.ds_EOF.latitude.size,
                              self.ds_EOF.longitude.size) )
            coords = [splits, [f'0..{n+1}..EOF_'+self.name for n in range(self.neofs)],
                      self.ds_EOF.latitude.values, self.ds_EOF.longitude.values]
            self.eofs = xr.DataArray(data,
                                    coords=coords,
                                    dims = ['split', 'eof', 'latitude', 'longitude'])
            solvers = []
            for s in splits:
                self.eofs[s,:] = results[s][0]
                solvers.append(results[s][1])
            # ensure all splits share the sign convention of split 0
            mask_pos = (self.eofs[0] > self.eofs[0].mean())
            for s in splits:
                sign = np.sign(self.eofs[s].where(mask_pos).mean(axis=(1,2)))
                self.eofs[s,:] = sign * self.eofs[s,:]
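The closing loop flips each split's EOFs so they share the sign convention of split 0 (EOF signs are arbitrary up to a factor of -1). A standalone sketch of that trick with made-up data:

import numpy as np
import xarray as xr

# Toy EOFs: 2 splits x 1 eof on a 2x2 grid; split 1 has a flipped sign.
eofs = xr.DataArray(np.array([[[[1., 2.], [3., 4.]]],
                              [[[-1., -2.], [-3., -4.]]]]),
                    dims=['split', 'eof', 'latitude', 'longitude'])
# positive region of the reference split
mask_pos = eofs[0] > eofs[0].mean()
for s in range(eofs.sizes['split']):
    # flip split s if its mean over the reference positive region is negative
    sign = np.sign(eofs[s].where(mask_pos).mean(axis=(1, 2)))
    eofs[s, :] = sign * eofs[s, :]
# eofs[1] now matches the sign convention of eofs[0]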