def get_HM_data(self, filepath, dim='latitude'):
    self.filepath = filepath
    self.dim = dim
    if self.seldates is not None:
        self.kwrgs_load['seldates'] = self.seldates_ext
        self.ds_seldates = functions_pp.import_ds_timemeanbins(self.filepath,
                                                               **self.kwrgs_load)
        ds_name = self.ds_seldates.name
        if self.rollingmeanwindow is not None:
            # apply rolling mean
            self.ds = self.ds_seldates.rolling(time=self.rollingmeanwindow).mean()
        else:
            self.ds = self.ds_seldates
        # calculating std based on seldates
        self.std = self.ds.sel(time=self.seldates).std(dim='time')
        if self.t_test == True:
            self.ds_all = self.ds.sel(time=self.seldates)
        # now that we have std over seldates, select dates for HM
        self.ds = self.ds.sel(time=np.concatenate(self.event_lagged))
    else:
        self.kwrgs_load['seldates'] = np.concatenate(self.event_lagged)
        self.ds = functions_pp.import_ds_timemeanbins(self.filepath,
                                                      **self.kwrgs_load)
        ds_name = self.ds.name

    if self.name is None:
        self.name = ds_name
    if 'units' in list(self.ds.attrs.keys()):
        self.units = self.ds.attrs['units']
    if self.standardize:
        self.units = 'std [-]'
        self.ds = self.ds / self.std
    if self.event_dates is not None:
        self.xarray = self.ds.copy().rename({'time': 'lag'})
        self.xarray = self.xarray.assign_coords(lag=np.concatenate(self.lag_axes))
    else:
        self.xarray = self.ds
    if self.zoomdim is not None:
        xarray_w = self.xarray.sel(latitude=slice(self.zoomdim[0], self.zoomdim[1]))
        xarray_w = functions_pp.area_weighted(xarray_w)
    else:
        xarray_w = functions_pp.area_weighted(self.xarray)
    xarray_meandim = xarray_w.mean(dim=dim)
    self.xr_HM = xarray_meandim.groupby('lag').mean()
    if self.t_test:
        full = (self.ds_all / self.std).mean(dim=dim)
        self.xr_mask = self.xr_HM.astype(bool).copy()
        pvals = np.zeros_like(self.xr_mask.values, dtype=float)
        for i, lag in enumerate(self.xr_mask.lag.values):
            sample = xarray_meandim.sel(lag=lag)
            T, p, mask = Welchs_t_test(sample, full, equal_var=False)
            pvals[i] = p
        self.xr_mask.values = pvals
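# Hedged sketch (not part of the original module): Welchs_t_test, called in
# get_HM_data above, is defined elsewhere in the codebase. Assuming it wraps a
# two-sided Welch's t-test with unequal variances, the per-lag p-value could be
# reproduced with scipy as in the commented example below; the name welch_pvalue
# and the toy arrays are illustrative only.
#
#     from scipy import stats
#
#     def welch_pvalue(sample_1d, full_1d):
#         """p-value of a two-sided Welch's t-test between two 1-d samples."""
#         t_stat, p_val = stats.ttest_ind(sample_1d, full_1d, equal_var=False)
#         return p_val
#
#     rng = np.random.default_rng(0)
#     composite = rng.normal(0.5, 1.0, size=30)     # field values at one lag
#     climatology = rng.normal(0.0, 1.0, size=300)  # full seldates sample
#     print(welch_pvalue(composite, climatology))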
def get_ts(self, tfreq_ts=1, df_splits=None):
    if df_splits is None:
        df_splits = self.df_splits
    else:
        df_splits = df_splits
    splits = self.eofs['split'].values
    neofs = self.eofs['eof'].values
    ds = functions_pp.import_ds_timemeanbins(self.filepath, tfreq=tfreq_ts,
                                             selbox=self.selbox,
                                             start_end_date=self.start_end_date,
                                             start_end_year=self.start_end_year)
    df_data_s = np.zeros((splits.size), dtype=object)
    dates = pd.to_datetime(ds['time'].values)
    for s in splits:
        dfs = pd.DataFrame(columns=neofs, index=dates)
        for i, e in enumerate(neofs):
            pattern = self.eofs.sel(split=s, eof=e)
            data = find_precursors.calc_spatcov(ds, pattern)
            dfs[e] = pd.Series(data.values, index=dates)
            if i == neofs.size - 1:
                dfs = dfs.merge(df_splits.loc[s], left_index=True, right_index=True)
        df_data_s[s] = dfs
    self.df = pd.concat(list(df_data_s), keys=range(splits.size))
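# Hedged sketch (not part of the original module): find_precursors.calc_spatcov,
# used in get_ts above, is defined elsewhere. Assuming it computes, per timestep,
# the spatial covariance between the field and the (time-invariant) EOF pattern,
# a plain xarray equivalent could look like the commented example below; the name
# spatcov_sketch is illustrative only and area weighting is omitted.
#
#     def spatcov_sketch(field, pattern):
#         """Covariance over (latitude, longitude) between field(t) and pattern."""
#         field_anom = field - field.mean(dim=('latitude', 'longitude'))
#         pattern_anom = pattern - pattern.mean(dim=('latitude', 'longitude'))
#         return (field_anom * pattern_anom).mean(dim=('latitude', 'longitude'))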
def load_precursor(ex):
    #%%
    dates_all = ex['dates_all']
    # =============================================================================
    # Load Precursor
    # =============================================================================
    prec_filename = os.path.join(ex['path_pp'], ex['filename_precur'])
#    if ex['datafolder'] == 'EC':
#        try:
#            datesRV = func_CPPA.make_datestr(dates_all, ex,
#                                             ex['startyear'], ex['endyear'], lpyr=False)
#            dates_prec = subset_dates(datesRV, ex)
##            varfullgl = func_CPPA.import_ds_lazy(prec_filename, ex, seldates=dates_prec)
#        except:
#            datesRV = func_CPPA.make_datestr(dates_all, ex,
#                                             ex['startyear'], ex['endyear'], lpyr=True)
#            dates_prec = subset_dates(datesRV, ex)
#            varfullgl = func_CPPA.import_ds_lazy(prec_filename, ex, seldates=dates_prec)
#    else:
    Prec_reg = functions_pp.import_ds_timemeanbins(prec_filename, ex['tfreq'],
                                                   loadleap=True, to_xarr=False,
                                                   seldates=ex['dates_all'])
    Prec_reg = core_pp.convert_longitude(Prec_reg, 'only_east')

    if ex['add_lsm']:
        kwrgs_2d = {'selbox': ex['selbox'], 'format_lon': 'only_east'}
        lsm_filename = os.path.join(ex['path_mask'], ex['mask_file'])
        lsm = core_pp.import_ds_lazy(lsm_filename, **kwrgs_2d)
        Prec_reg['lsm'] = (('latitude', 'longitude'), (lsm < 0.3).values)
        Prec_reg = Prec_reg.where(Prec_reg['lsm'])

    if 'exclude_yrs' in ex.keys():
        if len(ex['exclude_yrs']) != 0:
            print('excluding yr(s): {} from analysis'.format(ex['exclude_yrs']))
            all_yrs = np.unique(dates_all.year)
            yrs_keep = [y for y in all_yrs if y not in ex['exclude_yrs']]
            idx_yrs = [i for i in np.arange(dates_all.year.size)
                       if dates_all.year[i] in yrs_keep]
#            dates_all = dates_all[idx_yrs]
            mask_all = np.zeros(dates_all.size, dtype=bool)
            mask_all[idx_yrs] = True
            dates_excl_yrs = dates_all[mask_all]
            Prec_reg = Prec_reg.sel(time=dates_excl_yrs)
    #%%
    return Prec_reg, ex
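# Hedged illustration (not part of the original module): the 'exclude_yrs' block
# above builds a boolean mask over dates_all and keeps only timesteps whose year
# is not excluded. A minimal standalone restatement with a pandas DatetimeIndex;
# the dates and excluded year below are placeholders.
#
#     dates = pd.date_range('2000-01-01', '2004-12-31', freq='D')
#     exclude_yrs = [2002]
#     keep = ~np.isin(dates.year, exclude_yrs)
#     dates_excl_yrs = dates[keep]   # all days except those in 2002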
def load_and_aggregate_precur(self, kwrgs_load):
    '''
    Wrapper to load in a NetCDF file and aggregate to n-mean bins or to a period
    mean, e.g. a DJF mean (see seasonal_mode.ipynb).

    Parameters
    ----------
    kwrgs_load : dict
        dictionary passed to functions_pp.import_ds_timemeanbins or to
        functions_pp.time_mean_periods.
    df_splits : pd.DataFrame, optional
        See class_RGCPD. The default is using the df_splits that was used for
        calculating the correlation map.

    Returns
    -------
    None.

    '''
    # precur = rg.list_for_MI[0] ; df_splits = rg.df_splits ; kwrgs_load = rg.kwrgs_load
    name = self.name
    filepath = self.filepath
    # for name, filepath in list_precur_pp: # loop over all variables
    # =============================================================================
    # Unpack non-default arguments
    # =============================================================================
    kwrgs = {'selbox': self.selbox, 'dailytomonths': self.dailytomonths}
    for key, value in kwrgs_load.items():
        if type(value) is list and name in value[1].keys():
            kwrgs[key] = value[1][name]
        elif type(value) is list and name not in value[1].keys():
            kwrgs[key] = value[0]  # plugging in default value
        elif hasattr(self, key):
            # Overwrite RGCPD parameters with MI specific parameters
            kwrgs[key] = self.__dict__[key]
        else:
            kwrgs[key] = value
    if self.lag_as_gap:
        kwrgs['tfreq'] = 1
    self.kwrgs_load = kwrgs.copy()
    #===========================================
    # Precursor field
    #===========================================
    self.precur_arr = functions_pp.import_ds_timemeanbins(filepath, **kwrgs)
    if type(self.lags[0]) == np.ndarray:
        tmp = functions_pp.time_mean_periods
        self.precur_arr = tmp(self.precur_arr, self.lags,
                              kwrgs_load['start_end_year'])
    return
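# Hedged illustration (not part of the original module) of how the loop above
# unpacks kwrgs_load values of the form [default, {precursor_name: value}]: a
# per-precursor override wins, otherwise the default in position 0 is used. The
# values below are illustrative placeholders, not package defaults.
#
#     kwrgs_load = {'tfreq': [15, {'sst': 30}], 'loadleap': False}
#     name = 'sst'
#     kwrgs = {}
#     for key, value in kwrgs_load.items():
#         if type(value) is list and name in value[1].keys():
#             kwrgs[key] = value[1][name]     # -> tfreq = 30 for 'sst'
#         elif type(value) is list:
#             kwrgs[key] = value[0]           # -> default for other precursors
#         else:
#             kwrgs[key] = value              # -> loadleap = False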
def execute_to_dict(var_filename, npmask, v1, v2, q, clustermethodkey, kwrgs, kwrgs_l):
    # if reload: # some param has been adjusted
    xarray_ts = functions_pp.import_ds_timemeanbins(var_filename, **kwrgs_l)
    if type(q) is int:
        xarray = binary_occurences_quantile(xarray_ts, q=q)
    if type(q) is list:
        xarray = binary_occurences_quantile(xarray_ts, q=v2)
    xrclusteredij, result = skclustering(xarray, npmask,
                                         clustermethodkey=clustermethodkey,
                                         kwrgs=kwrgs)
    return {f'{v1}..{v2}': (xrclusteredij, result)}
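# Hedged sketch (not part of the original module): binary_occurences_quantile,
# called above, is defined elsewhere. Assuming it flags values that exceed the
# q-th percentile of their time series, a plain numpy equivalent could be the
# commented example below; the function name is illustrative only.
#
#     def binary_above_quantile_sketch(arr, q=95):
#         """1 where arr exceeds its q-th percentile (computed over time), else 0."""
#         threshold = np.percentile(arr, q, axis=0)
#         return (arr > threshold).astype(int)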
def calculate_corr_maps(RV, df_splits, kwrgs_load, list_varclass=list, lags=[0],
                        alpha=0.05, FDR_control=True, plot=True):
    #%%
    outdic_actors = dict()

    class act:
        def __init__(self, name, corr_xr, precur_arr):
            self.name = name
            self.corr_xr = corr_xr
            self.precur_arr = precur_arr
            self.lat_grid = precur_arr.latitude.values
            self.lon_grid = precur_arr.longitude.values
            self.area_grid = rgcpd.get_area(precur_arr)
            self.grid_res = abs(self.lon_grid[1] - self.lon_grid[0])

    for var_class in list_varclass:  # loop over all variables
        #===========================================
        # 3c) Precursor field
        #===========================================
        file_path = os.path.join(var_class.path_pp, var_class.filename_pp)
        precur_arr = functions_pp.import_ds_timemeanbins(file_path, **kwrgs_load)
        # precur_arr = rgcpd.convert_longitude(precur_arr, 'only_east')
        # =============================================================================
        # Calculate correlation
        # =============================================================================
        corr_xr = rgcpd.calc_corr_coeffs_new(precur_arr, RV, df_splits, lags=lags,
                                             alpha=alpha, FDR_control=FDR_control)
        # =============================================================================
        # Cluster into precursor regions
        # =============================================================================
        actor = act(var_class.name, corr_xr, precur_arr)
        outdic_actors[actor.name] = actor
    return outdic_actors
def calculate_corr_maps(TV, df_splits, kwrgs_load, list_precur_pp=list,
                        lags=np.array([1]), alpha=0.05, FDR_control=True):
    '''
    tfreq : aggregate precursors with bins of window size = tfreq
    selbox : selbox is a tuple of:
             (degrees_east, degrees_west, degrees_south, degrees_north)
    loadleap : if the leap day should be loaded yes/no
    seldates : if a selection of dates should be loaded

    '''
    #%%
    outdic_precur = dict()

    class act:
        def __init__(self, name, corr_xr, precur_arr):
            self.name = name
            self.corr_xr = corr_xr
            self.precur_arr = precur_arr
            self.lat_grid = precur_arr.latitude.values
            self.lon_grid = precur_arr.longitude.values
            self.area_grid = get_area(precur_arr)
            self.grid_res = abs(self.lon_grid[1] - self.lon_grid[0])

    for name, filepath in list_precur_pp:  # loop over all variables
        #===========================================
        # 3c) Precursor field
        #===========================================
        precur_arr = functions_pp.import_ds_timemeanbins(filepath, **kwrgs_load)
        # =============================================================================
        # Calculate correlation
        # =============================================================================
        corr_xr = calc_corr_coeffs_new(precur_arr, TV, df_splits, lags=lags,
                                       alpha=alpha, FDR_control=FDR_control)
        # =============================================================================
        # Cluster into precursor regions
        # =============================================================================
        actor = act(name, corr_xr, precur_arr)
        outdic_precur[actor.name] = actor
    return outdic_precur
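# Hedged usage illustration (not part of the original module): the kwrgs_load
# dictionary documented above is forwarded to functions_pp.import_ds_timemeanbins
# for every (name, filepath) pair in list_precur_pp. The values and the file path
# below are illustrative placeholders, not defaults of the package.
#
#     kwrgs_load = {'tfreq': 15,
#                   'selbox': (0, 360, -10, 80),   # (east, west, south, north)
#                   'loadleap': False}
#     list_precur_pp = [('sst', 'path/to/sst_pp.nc')]
#     outdic_precur = calculate_corr_maps(TV, df_splits, kwrgs_load,
#                                         list_precur_pp=list_precur_pp,
#                                         lags=np.array([0, 1]), alpha=0.05)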
def sklearn_clustering(var_filename, mask=None, kwrgs_load={},
                       clustermethodkey='DBSCAN', kwrgs_clust={'eps': 600}):
    if 'selbox' in kwrgs_load.keys():
        if kwrgs_load['selbox'] is not None:
            mask = kwrgs_load.pop('selbox')
            print('mask overwritten with selbox list. Both selbox and mask are given.'
                  'Both adapt the domain over which to cluster')
    kwrgs_l_spatial = {}  # kwrgs affecting spatial extent/format
    if 'format_lon' in kwrgs_load.keys():
        kwrgs_l_spatial['format_lon'] = kwrgs_load['format_lon']
    xarray = core_pp.import_ds_lazy(var_filename, **kwrgs_l_spatial)
    npmask = get_spatial_ma(var_filename, mask, kwrgs_l_spatial=kwrgs_l_spatial)
    kwrgs_loop = {k: i for k, i in kwrgs_clust.items() if type(i) == list}
    [kwrgs_loop.update({k: i}) for k, i in kwrgs_load.items() if type(i) == list]
    if len(kwrgs_loop) == 1:
        # insert fake axes
        kwrgs_loop['fake'] = [0]
    if len(kwrgs_loop) >= 1:
        new_coords = []
        xrclustered = xarray[0].drop('time')
        for k, list_v in kwrgs_loop.items():  # in alphabetical order
            new_coords.append(k)
            dim_coords = {str(k): list_v}
            xrclustered = xrclustered.expand_dims(dim_coords).copy()
        new_coords = [d for d in xrclustered.dims
                      if d not in ['latitude', 'longitude']]
        results = []
        first_loop = kwrgs_loop[new_coords[0]]
        second_loop = kwrgs_loop[new_coords[1]]
        for i, v1 in enumerate(first_loop):
            for j, v2 in enumerate(second_loop):
                kwrgs = adjust_kwrgs(kwrgs_clust.copy(), new_coords, v1, v2)
                kwrgs_l = adjust_kwrgs(kwrgs_load.copy(), new_coords, v1, v2)
                print(f"\rclustering {new_coords[0]}: {v1}, {new_coords[1]}: {v2} ",
                      end="")
                xarray = functions_pp.import_ds_timemeanbins(var_filename, **kwrgs_l)
                xrclustered[i, j], result = skclustering(
                    xarray, npmask, clustermethodkey=clustermethodkey, kwrgs=kwrgs)
                results.append(result)
        if 'fake' in new_coords:
            xrclustered = xrclustered.squeeze().drop('fake').copy()
    else:
        xrclustered, results = skclustering(xarray, npmask,
                                            clustermethodkey=clustermethodkey,
                                            kwrgs=kwrgs_clust)
    xrclustered.attrs['method'] = clustermethodkey
    xrclustered.attrs['kwrgs'] = str(kwrgs_clust)
    xrclustered.attrs['target'] = f'{xarray.name}'
    if 'hash' not in xrclustered.attrs.keys():
        xrclustered.attrs['hash'] = uuid.uuid4().hex[:5]
    return xrclustered, results
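# Hedged sketch (not part of the original module): skclustering, called above, is
# defined elsewhere. Assuming it flattens the (time, latitude, longitude) field to
# one sample per gridcell inside npmask and fits an sklearn clustering algorithm,
# a minimal standalone equivalent could be the commented example below; the name
# cluster_gridcells_sketch and the choice of AgglomerativeClustering are
# illustrative only.
#
#     from sklearn.cluster import AgglomerativeClustering
#
#     def cluster_gridcells_sketch(data_3d, npmask, n_clusters=4):
#         """Cluster gridcells of a (time, lat, lon) numpy array on their time series."""
#         n_time = data_3d.shape[0]
#         samples = data_3d.reshape(n_time, -1).T[npmask.ravel()]  # (cells, time)
#         labels = AgglomerativeClustering(n_clusters=n_clusters).fit_predict(samples)
#         label_map = np.full(npmask.size, np.nan)
#         label_map[npmask.ravel()] = labels
#         return label_map.reshape(npmask.shape)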
def sklearn_clustering(var_filename: str, mask: Optional[np.ndarray] = None,
                       dimension: str = 'temporal', kwrgs_load: Dict = {},
                       clustermethodkey: str = 'DBSCAN',
                       kwrgs_clust: Dict = {'eps': 600}):
    """
    Performs clustering for combinations of clustering parameters (kwrgs_clust)
    and parameters for variable loading (kwrgs_load).

    Parameters
    ----------
    var_filename : str
        path to .nc file
    mask : 2-d numpy.ndarray, optional
        mask with spatial coordinates to be clustered, the default is None
    dimension : str
        temporal or spatial, the default is temporal
    kwrgs_load : dict, optional
        keywords for loading the variable, the default is {};
        see functions_pp.import_ds_timemeanbins? for parameters
    clustermethodkey : str
        Is built upon sklearn clustering. Techniques available are listed in
        sklearn.cluster.__dict__, e.g. KMeans or AgglomerativeClustering.
        See kwrgs_clust for algorithm parameters.
    kwrgs_clust : dict (algorithm dependent)
        dictionary of clustering parameters, the default is {'eps': 600}

    Returns
    -------
    xr_temporal : list of temporally clustered xarray objects
    xrclustered : spatially clustered xarray object
    results : list of sklearn.cluster objects
    """
    if 'selbox' in kwrgs_load.keys():
        assert isinstance(kwrgs_load['selbox'], list), 'selbox is not a list'
        assert len(kwrgs_load['selbox']) == 4, \
            'selbox list does not have shape [lon_min, lon_max, lat_min, lat_max]'

    # we can either give a mask for coordinates or just select a box with coordinates
    if 'selbox' in kwrgs_load.keys():
        if kwrgs_load['selbox'] is not None and mask is not None:
            print('Mask overwritten with selbox list. Both selbox and mask are given. '
                  'Both adapt the domain over which to cluster')
            mask = None
    kwrgs_l_spatial = {}  # kwrgs affecting spatial extent/format
    if 'format_lon' in kwrgs_load.keys():
        kwrgs_l_spatial['format_lon'] = kwrgs_load['format_lon']
    if 'selbox' in kwrgs_load.keys():
        kwrgs_l_spatial['selbox'] = kwrgs_load['selbox']
    # here we import an .nc file and convert it into an xarray object
    xarray = core_pp.import_ds_lazy(var_filename, **kwrgs_l_spatial)
    # here we create a numpy array mask for coordinates selected using mask
    # (or selbox if selbox in kwrgs_load)
    npmask = get_spatial_ma(var_filename, mask, kwrgs_l_spatial=kwrgs_l_spatial)
    # arguments loop
    kwrgs_loop = {k: i for k, i in kwrgs_clust.items() if type(i) == list}
    [kwrgs_loop.update({k: i}) for k, i in kwrgs_load.items() if type(i) == list]
    if 'selbox' in kwrgs_loop.keys():
        kwrgs_loop.pop('selbox')
    if len(kwrgs_loop) == 1:
        # insert fake axes
        kwrgs_loop['fake'] = [0]
    if len(kwrgs_loop) >= 1:
        new_coords = []
        if dimension == 'spatial':
            xrclustered = xarray[0].drop('time')
        else:
            xrclustered = xarray
        for k, list_v in kwrgs_loop.items():  # in alphabetical order
            # new_coords contains keys from kwrgs_clust and kwrgs_load
            new_coords.append(k)
            # in every iteration of the loop, we create a dictionary using the
            # key and value from kwrgs_clust and kwrgs_load
            dim_coords = {str(k): list_v}
            # expanding the xarray dataset by the dim_coords dictionaries
            xrclustered = xrclustered.expand_dims(dim_coords).copy()
        # create a list of coordinates/dimensions added in the for loop above
        # (from kwrgs_clust and kwrgs_load)
        new_coords = [d for d in xrclustered.dims
                      if d not in ['latitude', 'longitude', 'time']]
        # to store sklearn objects
        results = []
        # separating kwrgs into lists to loop over
        first_loop = kwrgs_loop[new_coords[0]]
        second_loop = kwrgs_loop[new_coords[1]]
        if dimension == 'temporal':
            xr_temporal = np.empty([len(first_loop), len(second_loop)], dtype=object)
        # if kwrgs_load is empty we can load in the xarray here -> it won't be changing
        # loop over kwrgs_load and kwrgs_clust values
        for i, v1 in enumerate(first_loop):
            for j, v2 in enumerate(second_loop):
                # create dictionaries of all possible combinations of kwrgs_load and kwrgs_clust
                kwrgs = adjust_kwrgs(kwrgs_clust.copy(), new_coords, v1, v2)
                # if we don't have any kwrgs_load we don't need it -> add an if statement
                # for memory optimization and move the 5 lines below into that if statement
                kwrgs_l = adjust_kwrgs(kwrgs_load.copy(), new_coords, v1, v2)
                if 'tfreq' in kwrgs_l.keys():
                    assert isinstance(kwrgs_l['tfreq'], int), 'tfreq is not an integer'
                print(f"\rclustering {new_coords[0]}: {v1}, {new_coords[1]}: {v2} ",
                      end="")
                xarray = functions_pp.import_ds_timemeanbins(var_filename, **kwrgs_l)
                # updating xarray object and results - here change for supervised/unsupervised clustering
                if dimension == 'spatial':
                    xrclustered[i, j], result = skclustering(
                        xarray, npmask, clustermethodkey=clustermethodkey,
                        kwrgs=kwrgs, dimension=dimension)
                else:
                    xr_temporal[i, j], result = skclustering(
                        xarray, npmask, clustermethodkey=clustermethodkey,
                        kwrgs=kwrgs, dimension=dimension)
                    # storing arbitrary metadata for temporal clustering
                    xr_temporal[i, j].attrs['method'] = clustermethodkey
                    xr_temporal[i, j].attrs['target'] = f'{xarray.name}'
                    if new_coords[0] != 'fake':
                        xr_temporal[i, j].attrs[new_coords[0]] = v1
                    xr_temporal[i, j].attrs[new_coords[1]] = v2
                    if 'hash' not in xr_temporal[i, j].attrs.keys():
                        xr_temporal[i, j].attrs['hash'] = uuid.uuid4().hex[:5]
                results.append(result)
        if 'fake' in new_coords and dimension == 'spatial':
            xrclustered = xrclustered.squeeze().drop('fake').copy()
    else:
        xrclustered, results = skclustering(xarray, npmask,
                                            clustermethodkey=clustermethodkey,
                                            kwrgs=kwrgs_clust, dimension=dimension)
        return xrclustered, results
    # storing arbitrary metadata for spatial clustering
    xrclustered.attrs['method'] = clustermethodkey
    xrclustered.attrs['kwrgs'] = str(kwrgs_clust)
    xrclustered.attrs['target'] = f'{xarray.name}'
    if 'hash' not in xrclustered.attrs.keys():
        xrclustered.attrs['hash'] = uuid.uuid4().hex[:5]
    if dimension == 'temporal':
        return xr_temporal, results
    # dimension == 'spatial'
    else:
        return xrclustered, results
def sklearn_clustering(var_filename, mask=None, kwrgs_load={},
                       clustermethodkey='DBSCAN', kwrgs_clust={'eps': 600}):
    if 'selbox' in kwrgs_load.keys():
        kwrgs_l = dict(selbox=kwrgs_load['selbox'])
    else:
        kwrgs_l = {}
    xarray = core_pp.import_ds_lazy(var_filename, **kwrgs_l)
    if 'selbox' in kwrgs_l.keys() and mask is None:
        npmask = np.ones_like(xarray[0].values, dtype=bool)
    else:
        npmask = get_spatial_ma(var_filename, mask)
    kwrgs_loop = {k: i for k, i in kwrgs_clust.items() if type(i) == list}
    [kwrgs_loop.update({k: i}) for k, i in kwrgs_load.items() if type(i) == list]
    if len(kwrgs_loop) == 1:
        # insert fake axes
        kwrgs_loop['fake'] = [0]
    if len(kwrgs_loop) >= 1:
        new_coords = []
        xrclustered = xarray[0].drop('time')
        for k, list_v in kwrgs_loop.items():  # in alphabetical order
            new_coords.append(k)
            dim_coords = {str(k): list_v}
            xrclustered = xrclustered.expand_dims(dim_coords).copy()
        new_coords = [d for d in xrclustered.dims
                      if d not in ['latitude', 'longitude']]
        results = []
        first_loop = kwrgs_loop[new_coords[0]]
        second_loop = kwrgs_loop[new_coords[1]]
        for i, v1 in enumerate(first_loop):
            for j, v2 in enumerate(second_loop):
                kwrgs = adjust_kwrgs(kwrgs_clust.copy(), new_coords, v1, v2)
                kwrgs_l = adjust_kwrgs(kwrgs_load.copy(), new_coords, v1, v2)
                print(f"\rclustering {new_coords[0]}: {v1}, {new_coords[1]}: {v2} ",
                      end="")
                xarray = functions_pp.import_ds_timemeanbins(var_filename, **kwrgs_l)
                xrclustered[i, j], result = skclustering(
                    xarray, npmask, clustermethodkey=clustermethodkey, kwrgs=kwrgs)
                results.append(result)
        if 'fake' in new_coords:
            xrclustered = xrclustered.squeeze().drop('fake').copy()
    else:
        xrclustered, results = skclustering(xarray, npmask,
                                            clustermethodkey=clustermethodkey,
                                            kwrgs=kwrgs_clust)
    xrclustered.attrs['method'] = clustermethodkey
    xrclustered.attrs['kwrgs'] = str(kwrgs_clust)
    xrclustered.attrs['target'] = f'{xarray.name}'
    if 'hash' not in xrclustered.attrs.keys():
        xrclustered.attrs['hash'] = uuid.uuid4().hex[:5]
    return xrclustered, results
def calculate_corr_maps(ex, map_proj):
    #%%
    # =============================================================================
    # Load 'exp' dictionary with information of pre-processed data (variables,
    # paths, filenames, etcetera..) and add RGCPD/Tigramite experiment settings
    # =============================================================================
    # Response Variable is what we want to predict
    RV = ex[ex['RV_name']]
    # time-cycle of data: total timesteps in one year
    ex['time_cycle'] = RV.dates[RV.dates.year == RV.startyear].size
    ex['time_range_all'] = [0, RV.dates.size]
    #==================================================================================
    # Start of experiment
    #==================================================================================

    # Define traintest:
    df_RVfullts = pd.DataFrame(RV.RVfullts.values,
                               index=pd.to_datetime(RV.RVfullts.time.values))
    df_RV_ts = pd.DataFrame(RV.RV_ts.values,
                            index=pd.to_datetime(RV.RV_ts.time.values))
    if ex['method'][:9] == 'ran_strat':
        kwrgs_events = ex['kwrgs_events']
        RV = func_fc.RV_class(df_RVfullts, df_RV_ts, kwrgs_events)
    else:
        RV = func_fc.RV_class(df_RVfullts, df_RV_ts)
    if ex['import_prec_ts']:
        # Retrieve same train test split as imported ts
        path_data = ''.join(ex['precursor_ts'][0][1])
        df_splits = func_fc.load_hdf5(path_data)['df_data'].loc[:, ['TrainIsTrue', 'RV_mask']]
        test_yrs = functions_pp.get_testyrs(df_splits)
        df_splits, ex = functions_pp.rand_traintest_years(RV, ex, test_yrs)
        assert (np.equal(test_yrs, ex['tested_yrs'])).all(), "Train test split not equal"
    else:
        df_splits, ex = functions_pp.rand_traintest_years(RV, ex)
    # =============================================================================
    # 2) DEFINE PRECURSOR COMMUNITIES:
    # =============================================================================
    # - calculate and plot pattern correlation for different fields
    # - create time-series over these regions
    #=====================================================================================
    outdic_actors = dict()

    class act:
        def __init__(self, name, corr_xr, precur_arr):
            self.name = name
            self.corr_xr = corr_xr
            self.precur_arr = precur_arr
            self.lat_grid = precur_arr.latitude.values
            self.lon_grid = precur_arr.longitude.values
            self.area_grid = rgcpd.get_area(precur_arr)
            self.grid_res = abs(self.lon_grid[1] - self.lon_grid[0])

    allvar = ex['vars'][0]  # list of all variable names
    for var in allvar[ex['excludeRV']:]:  # loop over all variables
        actor = ex[var]
        #===========================================
        # 3c) Precursor field
        #===========================================
        file_path = os.path.join(actor.path_pp, actor.filename_pp)
        precur_arr = functions_pp.import_ds_timemeanbins(file_path, ex)
        # precur_arr = rgcpd.convert_longitude(precur_arr, 'only_east')
        # =============================================================================
        # Calculate correlation
        # =============================================================================
        corr_xr = rgcpd.calc_corr_coeffs_new(precur_arr, RV, ex)
        # =============================================================================
        # Cluster into precursor regions
        # =============================================================================
        actor = act(var, corr_xr, precur_arr)
        actor, ex = rgcpd.cluster_DBSCAN_regions(actor, ex)
        if np.isnan(actor.prec_labels.values).all() == False:
            rgcpd.plot_regs_xarray(actor.prec_labels.copy(), ex)
        outdic_actors[var] = actor
        # =============================================================================
        # Plot
        # =============================================================================
        if ex['plotin1fig'] == False:
            plot_maps.plot_corr_maps(corr_xr, corr_xr['mask'], map_proj)
            fig_filename = '{}_corr_{}_vs_{}'.format(ex['params'], ex['RV_name'],
                                                     var) + ex['file_type2']
            plt.savefig(os.path.join(ex['fig_path'], fig_filename),
                        bbox_inches='tight', dpi=200)
            if ex['showplot'] == False:
                plt.close()
    #%%
    return ex, outdic_actors
def get_pattern(self, filepath, df_splits=None):
    # filepath = '/Users/semvijverberg/surfdrive/ERA5/input_raw/preprocessed/sst_1979-2018_1jan_31dec_daily_2.5deg.nc'
    self.filepath = filepath
    if self.tfreq_EOF == 'monthly':
        self.ds_EOF = functions_pp.import_ds_timemeanbins(self.filepath, tfreq=1,
                                                          selbox=self.selbox,
                                                          dailytomonths=True,
                                                          start_end_date=self.start_end_date,
                                                          start_end_year=self.start_end_year)
    else:
        self.ds_EOF = functions_pp.import_ds_timemeanbins(self.filepath,
                                                          tfreq=self.tfreq_EOF,
                                                          selbox=self.selbox,
                                                          start_end_date=self.start_end_date,
                                                          start_end_year=self.start_end_year,
                                                          closed_on_date=self.start_end_date[-1])
    if self.name is None:
        if hasattr(self.ds_EOF, 'name'):
            # take name of variable
            self.name = self.ds_EOF.name
    if df_splits is None:
        print('no train test splits for fitting EOF')
        data = np.zeros((1, self.neofs, self.ds_EOF.latitude.size,
                         self.ds_EOF.longitude.size))
        coords = [[0], [f'EOF{n}_' + self.name for n in range(self.neofs)],
                  self.ds_EOF.latitude.values, self.ds_EOF.longitude.values]
        self.eofs = xr.DataArray(data, coords=coords,
                                 dims=['split', 'eof', 'latitude', 'longitude'])
        solvers = []
        self.eofs[0, :], solver = self._get_EOF_xarray(self.ds_EOF, self.neofs)
        solvers.append(solver)
        self.solvers = solvers
    else:
        self.df_splits = df_splits
        splits = df_splits.index.levels[0]
        func = self._get_EOF_xarray
        if self.n_cpu > 1:
            try:
                with ProcessPoolExecutor(max_workers=os.cpu_count()) as pool:
                    futures = []
                    for s in range(splits.size):
                        progress = int(100 * (s + 1) / splits.size)
                        print(f"\rProgress traintest set {progress}%", end="")
                        futures.append(pool.submit(self._single_split, func,
                                                   self.ds_EOF, s, df_splits,
                                                   self.neofs))
                    results = [future.result() for future in futures]
                    pool.shutdown()
            except:
                results = [self._single_split(func, self.ds_EOF, s, df_splits, self.neofs)
                           for s in range(splits.size)]
        else:
            results = [self._single_split(func, self.ds_EOF, s, df_splits, self.neofs)
                       for s in range(splits.size)]
        # unpack results
        data = np.zeros((splits.size, self.neofs, self.ds_EOF.latitude.size,
                         self.ds_EOF.longitude.size))
        coords = [splits, [f'0..{n+1}..EOF_' + self.name for n in range(self.neofs)],
                  self.ds_EOF.latitude.values, self.ds_EOF.longitude.values]
        self.eofs = xr.DataArray(data, coords=coords,
                                 dims=['split', 'eof', 'latitude', 'longitude'])
        solvers = []
        for s in splits:
            self.eofs[s, :] = results[s][0]
            solvers.append(results[s][1])
            # ensure same sign
            mask_pos = (self.eofs[0] > self.eofs[0].mean())
            sign = np.sign(self.eofs[s].where(mask_pos).mean(axis=(1, 2)))
            self.eofs[s, :] = sign * self.eofs[s, :]
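# Hedged illustration (not part of the original module) of the sign convention
# enforced at the end of get_pattern: because EOFs are only defined up to a sign,
# each split's pattern is flipped so that it agrees with split 0 over the region
# where split 0 is positive. A minimal restatement for an eofs DataArray with
# dims (split, eof, latitude, longitude):
#
#     mask_pos = eofs[0] > eofs[0].mean()                  # reference region (split 0)
#     for s in range(eofs.shape[0]):
#         sign = np.sign(eofs[s].where(mask_pos).mean(axis=(1, 2)))
#         eofs[s, :] = sign * eofs[s, :]                    # align sign with split 0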