def pers_ano_to_extr(filename_ts, RV, kwrgs_events_daily, dict_experiments,
                     name_exp, name_model, n_boot):
    # load in daily timeseries
    RVfullts = np.load(filename_ts, encoding='latin1',
                       allow_pickle=True).item()['RVfullts95']

    # retrieve information on input timeseries
    import functions_pp
    dates = functions_pp.get_oneyr(RV.RV_ts.index)
    tfreq = (dates[1] - dates[0]).days
    start_date = dates[0] - pd.Timedelta(f'{tfreq/2}d')
    end_date = dates[-1] + pd.Timedelta(f'{-1+tfreq/2}d')
    yr_daily = pd.date_range(start=start_date, end=end_date,
                             freq=pd.Timedelta('1d'))
    ext_dates = functions_pp.make_dates(RV.RV_ts.index, yr_daily,
                                        RV.RV_ts.index.year[-1])

    df_RV_ts_e = pd.DataFrame(RVfullts.sel(time=ext_dates).values,
                              index=ext_dates, columns=['RV_ts'])
    df_RVfullts = pd.DataFrame(RVfullts.values,
                               index=pd.to_datetime(RVfullts.time.values),
                               columns=['RVfullts'])

    # make new class based on new kwrgs_events_daily
    RV_d = func_fc.RV_class(df_RVfullts, df_RV_ts_e, kwrgs_events_daily)
    # ensure that the bins on the daily timeseries match the original
    ex = dict(sstartdate=f'{yr_daily[0].month}-{yr_daily[0].day}',
              senddate=f'{yr_daily[-1].month}-{yr_daily[-1].day}',
              startyear=ext_dates.year[0],
              endyear=ext_dates.year[-1])
    RV_d.RV_bin, dates_gr = functions_pp.time_mean_bins(RV_d.RV_bin, ex, tfreq)
    RV_d.RV_bin[RV_d.RV_bin > 0] = 1
    RV_d.TrainIsTrue = RV.TrainIsTrue
    RV_d.RV_mask = RV.RV_mask
    # add new probability of event occurrence
    RV_d.prob_clim = func_fc.get_obs_clim(RV_d)

    dict_comparison = {}
    # loading model predicting persistent anomalies
    orig_event_perc = np.round(1 - float(RV.prob_clim.mean()), 2)
    new_name = '{}d mean +{}p to +{}p events'.format(
        tfreq, orig_event_perc, kwrgs_events_daily['event_percentile'])
    dict_sum = dict_experiments[name_exp]
    # note: this re-binds RV to the RV class stored with the experiment
    df_valid, RV, y_pred = dict_sum[name_model]

    blocksize = valid.get_bstrap_size(RV.RVfullts, plot=False)
    out = valid.get_metrics_sklearn(RV_d, y_pred, RV_d.prob_clim,
                                    n_boot=n_boot, blocksize=blocksize)
    df_valid, metrics_dict = out
    dict_comparison[new_name] = {name_model: (df_valid, RV_d, y_pred)}
    return dict_comparison
def _daily_to_aggr(df_data, daily_to_aggr: int = 1):
    import functions_pp
    if hasattr(df_data.index, 'levels'):
        # MultiIndex (split, time): aggregate each train-test split separately
        splits = df_data.index.levels[0]
        df_data_s = np.zeros(splits.size, dtype=object)
        for s in splits:
            df_data_s[s], dates_tobin = functions_pp.time_mean_bins(
                df_data.loc[s], tfreq=daily_to_aggr,
                start_end_date=None, start_end_year=None, verbosity=0)
        df_data_resample = pd.concat(list(df_data_s), keys=range(splits.size))
    else:
        df_data_resample, dates_tobin = functions_pp.time_mean_bins(
            df_data, tfreq=daily_to_aggr,
            start_end_date=None, start_end_year=None, verbosity=0)
    return df_data_resample, dates_tobin
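# Example usage of _daily_to_aggr (a minimal sketch; the DataFrame below is
# synthetic and the time_mean_bins keyword signature is assumed to match the
# call above):
def _example_daily_to_aggr():
    import numpy as np
    import pandas as pd
    dates = pd.date_range('2000-01-01', '2001-12-31', freq='D')
    df_daily = pd.DataFrame(np.random.randn(dates.size, 1),
                            index=dates, columns=['ts'])
    # aggregate the daily timeseries to 10-day means
    df_10d, dates_tobin = _daily_to_aggr(df_daily, daily_to_aggr=10)
    return df_10d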
def ENSO_34(file_path, ex, df_splits=None):
    #%%
    # file_path = '/Users/semvijverberg/surfdrive/Data_era5/input_raw/sst_1979-2018_1_12_daily_2.5deg.nc'
    '''
    ENSO3.4 index: area-weighted mean SST over the Nino3.4 box (5S-5N, 170W-120W).
    See http://www.cgd.ucar.edu/staff/cdeser/docs/deser.sstvariability.annrevmarsci10.pdf
    '''
    if df_splits is None:
        RV = ex[ex['RV_name']]
        df_splits, ex = functions_pp.rand_traintest_years(RV, ex)
        seldates = None
    else:
        seldates = df_splits.loc[0].index

    kwrgs_pp = {'selbox': {'la_min': -5,  # select domain in degrees east
                           'la_max': 5,
                           'lo_min': -170,
                           'lo_max': -120},
                'seldates': seldates}
    ds = core_pp.import_ds_lazy(file_path, **kwrgs_pp)

    to_freq = ex['tfreq']
    if to_freq != 1:
        ds, dates = functions_pp.time_mean_bins(ds, ex, to_freq=to_freq,
                                                seldays='all')
        ds['time'] = dates

    dates = pd.to_datetime(ds.time.values)
    # the index itself does not depend on the train-test split,
    # so the area-weighted mean is computed once
    data = functions_pp.area_weighted(ds).mean(dim=('latitude', 'longitude'))
    splits = df_splits.index.levels[0]
    list_splits = []
    for s in splits:
        progress = 100 * (s + 1) / splits.size
        print(f"\rProgress ENSO traintest set {progress}%", end="")
        list_splits.append(pd.DataFrame(data=data.values,
                                        index=dates,
                                        columns=['0_900_ENSO34']))
    df_ENSO = pd.concat(list_splits, axis=0, keys=splits)
    #%%
    return df_ENSO
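# Example call of ENSO_34 (a sketch; the netcdf path is hypothetical and `ex`
# only carries the 'tfreq' key that ENSO_34 reads when df_splits is given):
def _example_ENSO_34(df_splits):
    ex = {'tfreq': 15}  # aggregate daily SST to 15-day means
    sst_path = '/path/to/sst_1979-2018_daily_2.5deg.nc'  # hypothetical path
    return ENSO_34(sst_path, ex, df_splits=df_splits)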
def import_precur_ts(import_prec_ts, df_splits, to_freq,
                     start_end_date, start_end_year):
    '''
    import_prec_ts has format tuple (name, path_data)
    '''
    splits = df_splits.index.levels[0]
    df_data_ext_s = np.zeros(splits.size, dtype=object)
    counter = 0
    for i, (name, path_data) in enumerate(import_prec_ts):
        for s in range(splits.size):
            # skip first col because it is the RV ts
            df_data_e = func_fc.load_hdf5(path_data)['df_data'].iloc[:, 1:].loc[s]
            cols_ts = np.logical_or(df_data_e.dtypes == 'float64',
                                    df_data_e.dtypes == 'float32')
            cols_ext = list(df_data_e.columns[cols_ts])
            # cols_ext must be of format '{}_{int}_{}'
            lab_int = 100
            for j, c in enumerate(cols_ext):
                char = c.split('_')[1]
                if not char.isdigit():
                    cols_ext[j] = c.replace(char, str(lab_int)) + char
                    lab_int += 1
            df_data_ext_s[s] = df_data_e[cols_ext]
            tfreq_date_e = (df_data_e.index[1] - df_data_e.index[0]).days
            if to_freq != tfreq_date_e:
                try:
                    df_data_ext_s[s] = functions_pp.time_mean_bins(
                        df_data_ext_s[s], to_freq,
                        start_end_date, start_end_year)[0]
                except KeyError as e:
                    print('KeyError captured, likely the requested dates '
                          'given by start_end_date and start_end_year are not '
                          'found in external pandas timeseries.\n{}'.format(str(e)))
        if counter == 0:
            df_data_ext = pd.concat(list(df_data_ext_s),
                                    keys=range(splits.size))
        else:
            df_add = pd.concat(list(df_data_ext_s), keys=range(splits.size))
            df_data_ext = df_data_ext.merge(df_add, left_index=True,
                                            right_index=True)
        counter += 1
    return df_data_ext
def __init__(self, fullts: pd.DataFrame, RV_ts: pd.DataFrame,
             kwrgs_events: Union[dict, tuple],
             only_RV_events: bool = True,
             fit_model_dates: Tuple[str, str] = None):
    '''
    only_RV_events : bool. Decides whether to calculate the RV_bin on the
    whole fullts timeseries, or only on RV_ts
    '''
    #%%
    # self.RV_ts = pd.DataFrame(df_data[df_data.columns[0]][0][df_data['RV_mask'][0]])
    # self.fullts = pd.DataFrame(df_data[df_data.columns[0]][0])
    self.name = fullts.columns[0]
    self.RV_ts = RV_ts
    self.fullts = fullts
    self.dates_all = fullts.index
    self.dates_RV = RV_ts.index
    self.n_oneRVyr = self.dates_RV[self.dates_RV.year ==
                                   self.dates_RV.year[0]].size
    # infer the time aggregation from two consecutive (non-leap) dates
    nonleap = self.dates_all[~self.dates_all.is_leap_year]
    self.tfreq = (nonleap[1] - nonleap[0]).days
    if self.tfreq != 365 and self.tfreq != 1:
        self.dates_tobin = self.aggr_to_daily_dates(self.dates_RV,
                                                    tfreq=self.tfreq)

    def handle_fit_model_dates(dates_RV, dates_all, RV_ts, fit_model_dates):
        if fit_model_dates is None:
            # RV_ts and RV_ts_fit are equal if fit_model_dates is None
            bool_mask = [True if d in dates_RV else False for d in dates_all]
            fit_model_mask = pd.DataFrame(bool_mask,
                                          columns=['fit_model_mask'],
                                          index=dates_all)
            RV_ts_fit = RV_ts
            fit_dates = dates_RV
        else:
            startperiod, endperiod = fit_model_dates
            startyr = dates_all[0].year
            endyr = dates_all[-1].year
            # if dates_all.resolution == 'day':
            #     tfreq = (dates_all[1] - dates_all[0]).days
            start_end_date = (startperiod, endperiod)
            start_end_year = (startyr, endyr)
            fit_dates = core_pp.get_subdates(dates_all,
                                             start_end_date=start_end_date,
                                             start_end_year=start_end_year)
            bool_mask = [True if d in fit_dates else False for d in dates_all]
            fit_model_mask = pd.DataFrame(bool_mask,
                                          columns=['fit_model_mask'],
                                          index=dates_all)
            RV_ts_fit = fullts[fit_model_mask.values]
        return fit_model_mask, fit_dates, RV_ts_fit

    out = handle_fit_model_dates(self.dates_RV, self.dates_all,
                                 self.RV_ts, fit_model_dates)
    self.fit_model_mask, self.fit_dates, self.RV_ts_fit = out

    if kwrgs_events is not None:
        # make RV_bin for events based on aggregated daymeans
        if isinstance(kwrgs_events['window'], str) and \
                kwrgs_events['window'] == 'mean':
            # RV_ts and RV_ts_fit are equal if fit_model_dates is None
            self.threshold = Ev_threshold(self.RV_ts,
                                          kwrgs_events['event_percentile'])
            self.threshold_ts_fit = Ev_threshold(
                self.RV_ts_fit, kwrgs_events['event_percentile'])
            # unpack other optional arguments for defining event timeseries
            redun_keys = ['event_percentile', 'window']
            kwrgs = {key: item for key, item in kwrgs_events.items()
                     if key not in redun_keys}
            if only_RV_events:
                out = Ev_timeseries(self.RV_ts_fit,
                                    threshold=self.threshold_ts_fit, **kwrgs)
                self.RV_bin_fit, self.RV_dur = out
                self.RV_bin = self.RV_bin_fit.loc[self.dates_RV]
            else:
                out = Ev_timeseries(self.fullts,
                                    threshold=self.threshold, **kwrgs)
                self.RV_b_full, self.RV_dur = out
                self.RV_bin = self.RV_b_full.loc[self.dates_RV]
            self.freq_per_year = RV_class.get_freq_years(self)

        # make RV_bin for extremes occurring in a time window
        if type(kwrgs_events['window']) is pd.DataFrame:
            fullts = kwrgs_events['window']
            dates_RVe = self.aggr_to_daily_dates(self.dates_RV,
                                                 tfreq=self.tfreq)
            dates_alle = self.aggr_to_daily_dates(self.dates_all,
                                                  tfreq=self.tfreq)
            self.df_RV_ts_e = fullts.loc[dates_RVe]
            df_fullts_e = fullts.loc[dates_alle]

            out = handle_fit_model_dates(dates_RVe, dates_alle,
                                         self.df_RV_ts_e, fit_model_dates)
            self.fit_model_mask, self.fit_dates, self.RV_ts_fit_e = out
            # RV_ts and RV_ts_fit are equal if fit_model_dates is None
            self.threshold = Ev_threshold(self.df_RV_ts_e,
                                          kwrgs_events['event_percentile'])
            self.threshold_ts_fit = Ev_threshold(
                self.RV_ts_fit_e, kwrgs_events['event_percentile'])
            # unpack other optional arguments for defining event timeseries
            redun_keys = ['event_percentile', 'window']
            kwrgs = {key: item for key, item in kwrgs_events.items()
                     if key not in redun_keys}

            if only_RV_events:
                # RV_bin_fit is defined such that we can fit on RV_bin_fit
                # but validate on RV_bin
                out = Ev_timeseries(self.df_RV_ts_e,
                                    threshold=self.threshold_ts_fit, **kwrgs)
                self.RV_bin_fit_e, self.RV_dur = out
                self.RV_bin_e = self.RV_bin_fit_e.loc[dates_RVe]
            else:
                print('check code, not supported yet')

            # convert daily binary to window binary
            if self.tfreq != 1:
                self.RV_bin, dates_gr = functions_pp.time_mean_bins(
                    self.RV_bin_e.astype('float'), self.tfreq, None, None)
                self.RV_bin_fit, dates_gr = functions_pp.time_mean_bins(
                    self.RV_bin_fit_e.astype('float'), self.tfreq, None, None)
            else:
                print('Warning: tfreq must be larger than 1 to calculate '
                      'the window binary')

            # all bins with mean > 0 contained an 'extreme' event
            self.RV_bin_fit[self.RV_bin_fit > 0] = 1
            self.RV_bin[self.RV_bin > 0] = 1
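# Example construction of the RV class (a sketch with synthetic data; the
# 'window' and 'event_percentile' keys follow the branches above):
def _example_RV_class():
    import numpy as np
    import pandas as pd
    dates = pd.date_range('2000-01-01', '2001-12-31', freq='D')
    df_fullts = pd.DataFrame(np.random.randn(dates.size, 1),
                             index=dates, columns=['t2m'])
    # RV period: summer days only
    df_RV_ts = df_fullts[df_fullts.index.month.isin([6, 7, 8])]
    # events: aggregated means exceeding the 90th percentile
    kwrgs_events = {'event_percentile': 90, 'window': 'mean'}
    rv = RV_class(df_fullts, df_RV_ts, kwrgs_events)
    return rv.RV_bin  # binary event timeseries on the RV dates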
def spatial_valid(var_filename, mask, y_pred_all, y_pred_c, lags_i=None,
                  seldates=None, clusters=None, kwrgs_events=None,
                  alpha=0.05, n_boot=0, blocksize=10,
                  threshold_pred='upper_clim'):
    '''
    var_filename must be 3d netcdf file with only one variable
    mask can be nc file containing only a mask, or a latlon box in format
    [west_lon, east_lon, south_lat, north_lat] in common west-east degrees
    '''
    # debug overrides (these shadow the function arguments):
    # var_filename = '/Users/semvijverberg/surfdrive/Data_era5/input_raw/preprocessed/t2mmax_US_1979-2018_1jan_31dec_daily_0.25deg.nc'
    # mask = '/Users/semvijverberg/surfdrive/Data_era5/input_raw/preprocessed/cluster_output.nc'
    if lags_i is None:
        lags_i = list(y_pred_all.columns)

    # load in daily xarray and mask
    xarray = core_pp.import_ds_lazy(var_filename)
    npmask = cl.get_spatial_ma(var_filename, mask)
    # process temporal info
    freq = (y_pred_c.index[1] - y_pred_c.index[0]).days
    if seldates is None:
        seldates = aggr_to_daily_dates(y_pred_c.index)
    start = f'{seldates[0].month}-{seldates[0].day}'
    end = f'{seldates[-1].month}-{seldates[-1].day}'
    start_end_date = (start, end)
    xarray, dates = functions_pp.time_mean_bins(xarray, to_freq=freq,
                                                start_end_date=start_end_date)

    # if switching to event timeseries:
    if kwrgs_events is None:
        kwrgs_events = {'event_percentile': 66}
    # unpack other optional arguments for defining event timeseries
    kwrgs = {key: item for key, item in kwrgs_events.items()
             if key != 'event_percentile'}

    if clusters is None:
        clusters = list(np.unique(npmask[~np.isnan(npmask)]))
    elif type(clusters) is int:
        clusters = [clusters]

    dict_allclus = {}
    for clus in clusters:
        latloni = np.where(npmask == clus)
        latloni = [(latloni[0][i], latloni[1][i])
                   for i in range(latloni[0].size)]
        futures = {}
        with ProcessPoolExecutor(max_workers=max_cpu) as pool:
            for ll in latloni:
                xr_gridcell = xarray.isel(latitude=ll[0]).isel(longitude=ll[1])
                threshold = func_fc.Ev_threshold(
                    xr_gridcell, kwrgs_events['event_percentile'])
                y_i = func_fc.Ev_timeseries(xr_gridcell, threshold, **kwrgs)[0]
                futures[ll] = pool.submit(valid.get_metrics_sklearn,
                                          y_i.values, y_pred_all[lags_i],
                                          y_pred_c, alpha=alpha,
                                          n_boot=n_boot, blocksize=blocksize,
                                          threshold_pred=threshold_pred)
        results = {key: future.result() for key, future in futures.items()}
        dict_allclus[clus] = results

    df_valid = dict_allclus[clus][ll][0]
    metrics = np.unique(df_valid.index.get_level_values(0))
    lags_tf = [l * freq for l in lags_i]
    if freq != 1:
        # the last day of the time-mean bin is tfreq/2 later than the centered day
        lags_tf = [l_tf - int(freq / 2) if l_tf != 0 else 0
                   for l_tf in lags_tf]

    for clus in clusters:
        results = dict_allclus[clus]
        xroutput = xarray.isel(time=lags_i).rename({'time': 'lag'})
        xroutput['lag'] = lags_tf
        xroutput = xroutput.expand_dims({'metric': metrics}, 0)
        npdata = np.array(np.zeros_like(xroutput), dtype='float32')
        # iterate over the gridcells belonging to this cluster
        for ll in results.keys():
            df_valid = dict_allclus[clus][ll][0]
            for i, met in enumerate(metrics):
                lat_i, lon_i = ll
                npdata[i, :, lat_i, lon_i] = df_valid.loc[met].loc[met]
        xroutput.values = npdata
        plot_maps.plot_corr_maps(xroutput.where(npmask == clus),
                                 row_dim='metric', size=4,
                                 clevels=np.arange(-1, 1.1, 0.2))
        BSS = xroutput.where(npmask == clus).sel(metric='BSS')
        plot_maps.plot_corr_maps(BSS, row_dim='metric', size=4,
                                 clevels=np.arange(-0.25, 0.251, 0.05),
                                 cbar_vert=-0.1)
dates_RV = core_pp.get_subdates(pd.to_datetime(rg.fulltso.time.values),
                                start_end_date=rg.start_end_TVdate)
RV_ts = rg.fulltso.sel(time=dates_RV)
ds_v300 = core_pp.import_ds_lazy(rg.list_precur_pp[1][1])
dslocal = core_pp.get_selbox(ds_v300, selbox=selbox)
datesRW = core_pp.get_subdates(pd.to_datetime(dslocal.time.values),
                               start_end_date=rg.start_end_TVdate)
datesRW = datesRW + pd.Timedelta(f'{lag}d')
dslocal = dslocal.sel(time=datesRW)
wv6local = core_pp.get_selbox(xarray.sel(lag=5), selbox=selbox)
patternlocal = wv6local.mean(dim='lag')
ts = find_precursors.calc_spatcov(dslocal, patternlocal)
ts_15, d = functions_pp.time_mean_bins(ts, tfreq,
                                       start_end_date=start_end_TVdate,
                                       closed_on_date=start_end_TVdate[-1])
RV_15, d = functions_pp.time_mean_bins(RV_ts, tfreq,
                                       start_end_date=start_end_TVdate,
                                       closed_on_date=start_end_TVdate[-1])
corr_value = np.corrcoef(ts_15.values.squeeze(),
                         RV_15.values.squeeze())[0][1]
print('corr: {:.2f}'.format(corr_value))
values.append(corr_value)

plt.plot(range(-9, 10), values[1:])
# df_wv6 = ts_15.to_dataframe(name='wv6p2')
#%%
sst = rg.list_for_MI[2]
dates_years = functions_pp.get_oneyr(sst.df_splits.loc[0].index,
                                     *event_dates.year)
sst.precur_arr.sel(time=dates_years).mean(dim='time').plot(
    vmin=-.3, vmax=.3, cmap=plt.cm.RdBu_r)
def import_precur_ts(list_import_ts: List[tuple],
                     df_splits: pd.DataFrame,
                     start_end_date: Tuple[str, str],
                     start_end_year: Tuple[int, int],
                     start_end_TVdate: Tuple[str, str],
                     cols: list = None,
                     precur_aggr: int = 1):
    '''
    list_import_ts has format List[tuple], i.e. [(name, path_data)]
    '''
    #%%
    # df_splits = rg.df_splits
    splits = df_splits.index.levels[0]
    orig_traintest = functions_pp.get_testyrs(df_splits)
    df_data_ext_s = np.zeros(splits.size, dtype=object)
    counter = 0
    for i, (name, path_data) in enumerate(list_import_ts):
        df_data_e_all = functions_pp.load_hdf5(path_data)['df_data']
        if type(df_data_e_all) is pd.Series:
            df_data_e_all = pd.DataFrame(df_data_e_all)
        df_data_e_all = df_data_e_all.iloc[:, :]  # not sure why needed
        if cols is None:
            cols = list(df_data_e_all.columns[
                (df_data_e_all.dtypes != bool).values])
        elif type(cols) is str:
            cols = [cols]

        if hasattr(df_data_e_all.index, 'levels'):
            dates_subset = core_pp.get_subdates(df_data_e_all.loc[0].index,
                                                start_end_date, start_end_year)
            df_data_e_all = df_data_e_all.loc[pd.IndexSlice[:, dates_subset], :]
        else:
            dates_subset = core_pp.get_subdates(df_data_e_all.index,
                                                start_end_date, start_end_year)
            df_data_e_all = df_data_e_all.loc[dates_subset]

        if 'TrainIsTrue' in df_data_e_all.columns:
            _c = [k for k in df_splits.columns
                  if k in ['TrainIsTrue', 'RV_mask']]
            # check if traintest split is correct
            ext_traintest = functions_pp.get_testyrs(df_data_e_all[_c])
            _check_traintest = all(np.equal(core_pp.flatten(ext_traintest),
                                            core_pp.flatten(orig_traintest)))
            assert _check_traintest, ('Train test years of df_splits are not '
                                      'the same as imported timeseries')

        for s in range(splits.size):
            if 'TrainIsTrue' in df_data_e_all.columns:
                df_data_e = df_data_e_all.loc[s]
            else:
                df_data_e = df_data_e_all

            df_data_ext_s[s] = df_data_e[cols]
            tfreq_date_e = (df_data_e.index[1] - df_data_e.index[0]).days
            if precur_aggr != tfreq_date_e:
                try:
                    df_data_ext_s[s] = functions_pp.time_mean_bins(
                        df_data_ext_s[s], precur_aggr,
                        start_end_date, start_end_year,
                        start_end_TVdate=start_end_TVdate)[0]
                except KeyError as e:
                    print('KeyError captured, likely the requested dates '
                          'given by start_end_date and start_end_year are not '
                          'found in external pandas timeseries.\n{}'.format(
                              str(e)))
        print(f'loaded in external timeseries: {cols}')
        if counter == 0:
            df_data_ext = pd.concat(list(df_data_ext_s),
                                    keys=range(splits.size))
        else:
            df_add = pd.concat(list(df_data_ext_s), keys=range(splits.size))
            df_data_ext = df_data_ext.merge(df_add, left_index=True,
                                            right_index=True)
        counter += 1
        cols = None
    #%%
    return df_data_ext
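# Example call of import_precur_ts (a sketch; the .h5 path, column name and
# date windows are hypothetical placeholders):
def _example_import_precur_ts(df_splits):
    # df_splits: MultiIndex (split, time) DataFrame with a 'TrainIsTrue'
    # column, e.g. rg.df_splits from an RGCPD run
    list_import_ts = [('sm', '/path/to/precursor_ts.h5')]
    return import_precur_ts(list_import_ts, df_splits,
                            start_end_date=('01-01', '12-31'),
                            start_end_year=(1979, 2018),
                            start_end_TVdate=('06-24', '08-22'),
                            cols=None, precur_aggr=15)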
def run_PCMCI(ex, outdic_actors, s, df_splits, map_proj):
    #=========================================================================
    #
    # 4) PCMCI-algorithm
    #
    #=========================================================================
    # save output
    if ex['SaveTF'] == True:
        # from contextlib import redirect_stdout
        orig_stdout = sys.stdout
        # buffer print statement output to f
        if sys.version[:1] == '3':
            sys.stdout = f = io.StringIO()
        elif sys.version[:1] == '2':
            sys.stdout = f = open(os.path.join(ex['fig_subpath'], 'old.txt'),
                                  'w+')
    #%%
    # amount of text printed:
    verbosity = 3
    # alpha level for independence test within the pc procedure (finding parents)
    pc_alpha = ex['pcA_sets'][ex['pcA_set']]
    # alpha level for the multiple linear regression model while conditioning
    # on parents of parents
    alpha_level = ex['alpha_level_tig']
    print('run tigramite 4, run.pcmci')
    print('alpha level(s) for independence tests within the pc procedure '
          '(finding parents): {}'.format(pc_alpha))
    print('alpha level for multiple linear regression model while '
          'conditioning on parents of parents: {}'.format(
              ex['alpha_level_tig']))

    # retrieve traintest info
    traintest = df_splits
    # load Response Variable class
    RV = ex[ex['RV_name']]
    # create list with all actors, these will be merged into the fulldata array
    allvar = ex['vars'][0]
    var_names_corr = []
    actorlist = []
    cols = [[RV.name]]
    for var in allvar[:]:
        print(var)
        actor = outdic_actors[var]
        if actor.ts_corr[s].size != 0:
            ts_train = actor.ts_corr[s].values
            actorlist.append(ts_train)
            # create array which numbers the regions
            var_idx = allvar.index(var)
            n_regions = actor.ts_corr[s].shape[1]
            actor.var_info = [[i + 1, actor.ts_corr[s].columns[i], var_idx]
                              for i in range(n_regions)]
            # array of corresponding regions with var_names_corr
            # (first entry is RV)
            var_names_corr = var_names_corr + actor.var_info
            cols.append(list(actor.ts_corr[s].columns))
            index_dates = actor.ts_corr[s].index
    var_names_corr.insert(0, RV.name)

    # stack actor time-series together:
    fulldata = np.concatenate(tuple(actorlist), axis=1)
    print('There are {} regions in total'.format(fulldata.shape[1]))
    # add the full 1D time series of interest as first entry:
    fulldata = np.column_stack((RV.RVfullts, fulldata))
    df_data = pd.DataFrame(fulldata, columns=flatten(cols),
                           index=index_dates)

    if ex['import_prec_ts'] == True:
        var_names_full = var_names_corr.copy()
        for d in ex['precursor_ts']:
            path_data = d[1]
            if len(path_data) > 1:
                path_data = ''.join(list(path_data))
            # skip first col because it is the RV ts
            df_data_ext = func_fc.load_hdf5(
                path_data)['df_data'].iloc[:, 1:].loc[s]
            cols_ts = np.logical_or(df_data_ext.dtypes == 'float64',
                                    df_data_ext.dtypes == 'float32')
            cols_ext = list(df_data_ext.columns[cols_ts])
            # cols_ext must be of format '{}_{int}_{}'
            lab_int = 100
            for i, c in enumerate(cols_ext):
                char = c.split('_')[1]
                if not char.isdigit():
                    cols_ext[i] = c.replace(char, str(lab_int)) + char
                    lab_int += 1
            df_data_ext = df_data_ext[cols_ext]
            to_freq = ex['tfreq']
            if to_freq != 1:
                start_end_date = (ex['sstartdate'], ex['senddate'])
                start_end_year = (ex['startyear'], ex['endyear'])
                df_data_ext = functions_pp.time_mean_bins(df_data_ext,
                                                          to_freq,
                                                          start_end_date,
                                                          start_end_year,
                                                          seldays='part')[0]
            # expand var_names_corr
            n = var_names_full[-1][0] + 1
            add_n = n + len(cols_ext)
            n_var_idx = var_names_full[-1][-1] + 1
            for i in range(n, add_n):
                var_names_full.append([i, cols_ext[i - n], n_var_idx])
            df_data = df_data.merge(df_data_ext, left_index=True,
                                    right_index=True)
    else:
        var_names_full = var_names_corr

    bool_train = traintest.loc[s]['TrainIsTrue']
    bool_RV_train = np.logical_and(bool_train, traintest.loc[s]['RV_mask'])
    dates_train = traintest.loc[s]['TrainIsTrue'][bool_train].index
    dates_RV_train = traintest.loc[s]['TrainIsTrue'][bool_RV_train].index

    RVfull_train = RV.RVfullts.sel(time=dates_train)
    datesfull_train = pd.to_datetime(RVfull_train.time.values)
    data = df_data.loc[datesfull_train].values
    print(data.shape)

    # get RV datamask (same shape as data)
    data_mask = [True if d in dates_RV_train else False
                 for d in datesfull_train]
    data_mask = np.repeat(data_mask, data.shape[1]).reshape(data.shape)

    # add traintest mask to fulldata
    # dates_all = pd.to_datetime(RV.RVfullts.index)
    # dates_RV = pd.to_datetime(RV.RV_ts.index)
    dates_all = pd.to_datetime(RV.RVfullts.time.values)
    dates_RV = pd.to_datetime(RV.RV_ts.time.values)
    df_data['TrainIsTrue'] = [True if d in datesfull_train else False
                              for d in dates_all]
    df_data['RV_mask'] = [True if d in dates_RV else False for d in dates_all]

    # ========================================================================
    # tigramite 3
    # ========================================================================
    T, N = data.shape  # time, regions
    # ========================================================================
    # initialize dataframe object (needed for tigramite functions)
    # ========================================================================
    dataframe = pp.DataFrame(data=data, mask=data_mask,
                             var_names=var_names_full)
    # ========================================================================
    # pc algorithm: only parents for selected_variables are calculated
    # ========================================================================
    parcorr = ParCorr(significance='analytic', mask_type='y',
                      verbosity=verbosity)
    # ========================================================================
    # multiple testing problem:
    # ========================================================================
    pcmci = PCMCI(dataframe=dataframe, cond_ind_test=parcorr,
                  selected_variables=None, verbosity=4)
    # selected_variables : list of integers, optional (default: range(N))
    # Specify to estimate parents only for selected variables. If None is
    # passed, parents are estimated for all variables.
    # ========================================================================
    # selected_links = dictionary/None
    results = pcmci.run_pcmci(tau_max=ex['tigr_tau_max'], pc_alpha=pc_alpha,
                              tau_min=0,
                              max_combinations=ex['max_comb_actors'])

    q_matrix = pcmci.get_corrected_pvalues(p_matrix=results['p_matrix'],
                                           fdr_method='fdr_bh')
    pcmci.print_significant_links(p_matrix=results['p_matrix'],
                                  q_matrix=q_matrix,
                                  val_matrix=results['val_matrix'],
                                  alpha_level=alpha_level)
    # returns all parents, not just causal precursors (of lag > 0)
    sig = rgcpd.return_sign_parents(pcmci, pq_matrix=q_matrix,
                                    val_matrix=results['val_matrix'],
                                    alpha_level=alpha_level)
    all_parents = sig['parents']
    # link_matrix = sig['link_matrix']
    links_RV = all_parents[0]
    df = rgcpd.bookkeeping_precursors(links_RV, var_names_full)
    #%%
    rgcpd.print_particular_region_new(links_RV, var_names_corr, s,
                                      outdic_actors, map_proj, ex)
    #%%
    if ex['SaveTF'] == True:
        if sys.version[:1] == '3':
            fname = f's{s}_' + ex['params'] + '.txt'
            file = io.open(os.path.join(ex['fig_subpath'], fname), mode='w+')
            file.write(f.getvalue())
            file.close()
            f.close()
        elif sys.version[:1] == '2':
            f.close()
        sys.stdout = orig_stdout

    return df, df_data
def plot_ts_matric(df_init, win: int = None, lag=0, columns: list = None,
                   rename: dict = None, period='fullyear',
                   plot_sign_stars=True, fontsizescaler=0):
    #%%
    '''
    period = ['fullyear', 'RV_mask', 'RM_mask_lag60']
    '''
    if columns is None:
        columns = list(df_init.columns[(df_init.dtypes != bool).values])

    df_cols = df_init[columns]
    if hasattr(df_init.index, 'levels'):
        splits = df_init.index.levels[0]
        print('extracting RV dates from test set')
        dates_RV_orig = df_init.loc[0].index[df_init.loc[0]['RV_mask'] == True]
        TrainIsTrue = df_init['TrainIsTrue']
        dates_full_orig = df_init.loc[0].index
        list_test = []
        for s in range(splits.size):
            TestIsTrue = TrainIsTrue[s] == False
            list_test.append(df_cols.loc[s][TestIsTrue])
        df_test = pd.concat(list_test).sort_index()
    else:
        df_test = df_init
        dates_full_orig = df_init.index

    if lag != 0:
        # shift precursor vs. tmax
        for c in df_test.columns[1:]:
            df_test[c] = df_test[c].shift(periods=-lag)

    # bin means
    if win is not None:
        oneyr = get_oneyr(df_test.index)
        start_end_date = (f'{oneyr[0].month:02d}-{oneyr[0].day:02d}',
                          f'{oneyr[-1].month:02d}-{oneyr[-1].day:02d}')
        df_test = time_mean_bins(df_test, win,
                                 start_end_date=start_end_date)[0]

    if period == 'fullyear':
        dates_sel = dates_full_orig.strftime('%Y-%m-%d')
    if 'RV_mask' in df_init.columns:
        if period == 'RV_mask':
            dates_sel = dates_RV_orig.strftime('%Y-%m-%d')
        elif period == 'RM_mask_lag60':
            dates_sel = (dates_RV_orig -
                         pd.Timedelta(60, unit='d')).strftime('%Y-%m-%d')

    # after resampling, not all dates are in there:
    dates_sel = pd.to_datetime([d for d in dates_sel if d in df_test.index])
    df_period = df_test.loc[dates_sel, :].dropna()

    if rename is not None:
        df_period = df_period.rename(rename, axis=1)

    corr, sig_mask, pvals = corr_matrix_pval(df_period, alpha=0.01)
    # generate a mask for the upper triangle
    mask_tri = np.zeros_like(corr, dtype=bool)
    mask_tri[np.triu_indices_from(mask_tri)] = True
    mask_sig = mask_tri.copy()
    mask_sig[sig_mask == False] = True

    # removing meaningless row and column
    cols = corr.columns
    corr = corr.drop(cols[0], axis=0).drop(cols[-1], axis=1)
    mask_sig = mask_sig[1:, :-1]
    mask_tri = mask_tri[1:, :-1]

    # set up the matplotlib figure
    f, ax = plt.subplots(figsize=(10, 10))
    # generate a custom diverging colormap
    cmap = sns.diverging_palette(220, 10, n=9, l=30, as_cmap=True)
    ax = sns.heatmap(corr, ax=ax, mask=mask_tri, cmap=cmap, vmax=1E99,
                     center=0, square=True, linewidths=.5, annot=False,
                     annot_kws={'size': 30 + fontsizescaler}, cbar=False)

    if plot_sign_stars:
        sig_bold_labels = sig_bold_annot(corr, mask_sig)
    else:
        sig_bold_labels = corr.round(2).astype(str).values
    # draw the heatmap with the mask and correct aspect ratio
    ax = sns.heatmap(corr, ax=ax, mask=mask_tri, cmap=cmap, vmax=1,
                     center=0, square=True, linewidths=.5,
                     cbar_kws={"shrink": .8}, annot=sig_bold_labels,
                     annot_kws={'size': 30 + fontsizescaler}, cbar=False,
                     fmt='s')
    ax.tick_params(axis='both', labelsize=15 + fontsizescaler,
                   bottom=True, top=False, left=True, right=False,
                   labelbottom=True, labeltop=False,
                   labelleft=True, labelright=False)
    ax.set_xticklabels(corr.columns,
                       fontdict={'fontweight': 'bold',
                                 'fontsize': 20 + fontsizescaler})
    ax.set_yticklabels(corr.index,
                       fontdict={'fontweight': 'bold',
                                 'fontsize': 20 + fontsizescaler},
                       rotation=0)
    #%%
    return
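# Example call of plot_ts_matric (a sketch; the rename mapping is a
# hypothetical raw-label -> readable-label translation):
def _example_plot_ts_matric(df_data):
    # correlation matrix of 15-day means over the RV period, with
    # significance stars on correlations passing alpha=0.01
    plot_ts_matric(df_data, win=15, period='RV_mask',
                   rename={'0_901_PDO': 'PDO'})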
def __init__(self, fullts, RV_ts, kwrgs_events=None, only_RV_events=True,
             fit_model_dates=None):
    '''
    only_RV_events : bool. Decides whether to calculate the RV_bin on the
    whole fullts timeseries, or only on RV_ts
    '''
    #%%
    # self.RV_ts = pd.DataFrame(df_data[df_data.columns[0]][0][df_data['RV_mask'][0]])
    # self.fullts = pd.DataFrame(df_data[df_data.columns[0]][0])
    self.RV_ts = RV_ts
    self.fullts = fullts
    self.dates_all = fullts.index
    self.dates_RV = RV_ts.index
    self.n_oneRVyr = self.dates_RV[self.dates_RV.year ==
                                   self.dates_RV.year[0]].size
    self.tfreq = (self.dates_all[1] - self.dates_all[0]).days

    def handle_fit_model_dates(dates_RV, dates_all, RV_ts, fit_model_dates):
        if fit_model_dates is None:
            # RV_ts and RV_ts_fit are equal if fit_model_dates is None
            bool_mask = [True if d in dates_RV else False for d in dates_all]
            fit_model_mask = pd.DataFrame(bool_mask,
                                          columns=['fit_model_mask'],
                                          index=dates_all)
            RV_ts_fit = RV_ts
            fit_dates = dates_RV
        else:
            startperiod, endperiod = fit_model_dates
            startyr = dates_all[0].year
            endyr = dates_all[-1].year
            if dates_all.resolution == 'day':
                tfreq = (dates_all[1] - dates_all[0]).days
            ex = {'startperiod': startperiod, 'endperiod': endperiod,
                  'tfreq': tfreq}
            fit_dates = functions_pp.make_RVdatestr(dates_all, ex,
                                                    startyr, endyr)
            bool_mask = [True if d in fit_dates else False for d in dates_all]
            fit_model_mask = pd.DataFrame(bool_mask,
                                          columns=['fit_model_mask'],
                                          index=dates_all)
            RV_ts_fit = fullts[fit_model_mask.values]
        return fit_model_mask, fit_dates, RV_ts_fit

    out = handle_fit_model_dates(self.dates_RV, self.dates_all, self.RV_ts,
                                 fit_model_dates)
    self.fit_model_mask, self.fit_dates, self.RV_ts_fit = out

    # make RV_bin for events based on aggregated daymeans
    if kwrgs_events is not None and (type(kwrgs_events) is not tuple
                                     or self.tfreq == 1):
        if type(kwrgs_events) is tuple:
            kwrgs_events = kwrgs_events[1]
        # RV_ts and RV_ts_fit are equal if fit_model_dates is None
        self.threshold = func_fc.Ev_threshold(
            self.RV_ts, kwrgs_events['event_percentile'])
        self.threshold_ts_fit = func_fc.Ev_threshold(
            self.RV_ts_fit, kwrgs_events['event_percentile'])
        # unpack other optional arguments for defining event timeseries
        kwrgs = {key: item for key, item in kwrgs_events.items()
                 if key != 'event_percentile'}
        if only_RV_events:
            self.RV_bin_fit = func_fc.Ev_timeseries(
                self.RV_ts_fit, threshold=self.threshold_ts_fit, **kwrgs)[0]
            self.RV_bin = self.RV_bin_fit.loc[self.dates_RV]
        else:
            self.RV_b_full = func_fc.Ev_timeseries(
                self.fullts, threshold=self.threshold, **kwrgs)[0]
            self.RV_bin = self.RV_b_full.loc[self.dates_RV]
        self.freq = func_fc.get_freq_years(self.RV_bin)

    # make RV_bin for extremes occurring in a time window
    if (kwrgs_events is not None and type(kwrgs_events) is tuple
            and self.tfreq != 1):
        filename_ts = kwrgs_events[0]
        kwrgs_events_daily = kwrgs_events[1]
        # loading in daily timeseries
        fullts_xr = np.load(filename_ts, encoding='latin1',
                            allow_pickle=True).item()['RVfullts95']

        # retrieve information on input timeseries
        def aggr_to_daily_dates(dates_precur_data):
            dates = functions_pp.get_oneyr(dates_precur_data)
            tfreq = (dates[1] - dates[0]).days
            start_date = dates[0] - pd.Timedelta(f'{int(tfreq/2)}d')
            end_date = dates[-1] + pd.Timedelta(f'{int(-1+tfreq/2+0.5)}d')
            yr_daily = pd.date_range(start=start_date, end=end_date,
                                     freq=pd.Timedelta('1d'))
            years = np.unique(dates_precur_data.year)
            ext_dates = functions_pp.make_dates(yr_daily, years)
            return ext_dates

        dates_RVe = aggr_to_daily_dates(self.dates_RV)
        dates_alle = aggr_to_daily_dates(self.dates_all)

        df_RV_ts_e = pd.DataFrame(fullts_xr.sel(time=dates_RVe).values,
                                  index=dates_RVe, columns=['RV_ts'])
        df_fullts_e = pd.DataFrame(fullts_xr.sel(time=dates_alle).values,
                                   index=dates_alle, columns=['fullts'])

        out = handle_fit_model_dates(dates_RVe, dates_alle, df_RV_ts_e,
                                     fit_model_dates)
        self.fit_model_mask, self.fit_dates, self.RV_ts_fit_e = out
        # RV_ts and RV_ts_fit are equal if fit_model_dates is None
        self.threshold = func_fc.Ev_threshold(
            df_RV_ts_e, kwrgs_events_daily['event_percentile'])
        self.threshold_ts_fit = func_fc.Ev_threshold(
            self.RV_ts_fit_e, kwrgs_events_daily['event_percentile'])

        if only_RV_events:
            # RV_bin_fit is defined such that we can fit on RV_bin_fit
            # but validate on RV_bin
            self.RV_bin_fit = func_fc.Ev_timeseries(
                df_RV_ts_e, threshold=self.threshold_ts_fit,
                min_dur=kwrgs_events_daily['min_dur'],
                max_break=kwrgs_events_daily['max_break'],
                grouped=kwrgs_events_daily['grouped'])[0]
            self.RV_bin = self.RV_bin_fit.loc[dates_RVe]
        else:
            self.RV_b_full = func_fc.Ev_timeseries(
                self.fullts, threshold=self.threshold,
                min_dur=kwrgs_events_daily['min_dur'],
                max_break=kwrgs_events_daily['max_break'],
                grouped=kwrgs_events_daily['grouped'])[0]
            self.RV_bin = self.RV_b_full.loc[self.dates_RV]

        # convert daily binary to window probability binary
        if self.tfreq != 1:
            self.RV_bin, dates_gr = functions_pp.time_mean_bins(
                self.RV_bin.astype('float'), self.tfreq, None, None)
            self.RV_bin_fit, dates_gr = functions_pp.time_mean_bins(
                self.RV_bin_fit.astype('float'), self.tfreq, None, None)

        # all bins with mean > 0 contained an 'extreme' event
        self.RV_bin_fit[self.RV_bin_fit > 0] = 1
        self.RV_bin[self.RV_bin > 0] = 1
def PDO(filename, ex, df_splits=None):
    #%%
    '''
    The PDO is calculated based upon all data points in the training years.
    Subsequently, the PDO pattern is projected onto sst.sel(time=dates_train)
    to enable retrieving the PDO timeseries on a subset of the year. It is
    similarly projected onto the dates_test.
    From https://climatedataguide.ucar.edu/climate-data/pacific-decadal-oscillation-pdo-definition-and-indices
    See http://www.cgd.ucar.edu/staff/cdeser/docs/deser.sstvariability.annrevmarsci10.pdf
    '''
    t0 = time()
    if df_splits is None:
        RV = ex[ex['RV_name']]
        df_splits, ex = functions_pp.rand_traintest_years(RV, ex)

    kwrgs_pp = {'selbox': {'la_min': 20,  # select domain in degrees east
                           'la_max': 65,
                           'lo_min': 115,
                           'lo_max': 250},
                'format_lon': 'only_east'}
    ds = core_pp.import_ds_lazy(filename, **kwrgs_pp)

    to_freq = ex['tfreq']
    if to_freq != 1:
        ds, dates = functions_pp.time_mean_bins(ds, ex, to_freq=to_freq,
                                                seldays='all')
        ds['time'] = dates

    dates = pd.to_datetime(ds.time.values)
    splits = df_splits.index.levels[0]
    data = np.zeros((splits.size, ds.latitude.size, ds.longitude.size))
    PDO_patterns = xr.DataArray(data,
                                coords=[splits, ds.latitude.values,
                                        ds.longitude.values],
                                dims=['split', 'latitude', 'longitude'])

    def PDO_single_split(s, ds, df_splits, PDO_patterns):
        progress = 100 * (s + 1) / splits.size
        dates_train = df_splits.loc[s]['TrainIsTrue'][
            df_splits.loc[s]['TrainIsTrue']].index
        train_yrs = np.unique(dates_train.year)
        dates_all_train = pd.to_datetime([d for d in dates
                                          if d.year in train_yrs])
        dates_test = df_splits.loc[s]['TrainIsTrue'][
            ~df_splits.loc[s]['TrainIsTrue']].index
        n = dates_train.size
        r = int(100 * n / df_splits.loc[s].index.size)
        print(f"\rProgress PDO traintest set {progress}%, "
              f"trainsize=({n}dp, {r}%)", end="")

        PDO_pattern, solver, adjust_sign = get_PDO(
            ds.sel(time=dates_all_train))
        # project the freshly computed pattern (PDO_patterns is only
        # filled in once the results are collected below)
        data_train = find_precursors.calc_spatcov(ds.sel(time=dates_train),
                                                  PDO_pattern)
        data_test = find_precursors.calc_spatcov(ds.sel(time=dates_test),
                                                 PDO_pattern)
        df_test = pd.DataFrame(data=data_test.values, index=dates_test,
                               columns=['0_901_PDO'])
        df_train = pd.DataFrame(data=data_train.values, index=dates_train,
                                columns=['0_901_PDO'])
        df = pd.concat([df_test, df_train]).sort_index()
        return (df, PDO_pattern)

    pool = ProcessPoolExecutor(os.cpu_count() - 1)  # amount of cores - 1
    futures = [pool.submit(PDO_single_split, s, ds, df_splits, PDO_patterns)
               for s in splits]
    results = [future.result() for future in futures]
    list_splits = [r[0] for r in results]
    time_ = time() - t0
    print(time_ / 60)  # elapsed minutes
    for s in splits:
        PDO_patterns[s] = results[s][1]

    df_PDO = pd.concat(list_splits, axis=0, keys=splits)
    #%%
    return df_PDO, PDO_patterns
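# get_PDO is called above but not defined in this file. A minimal sketch of
# what it could look like, assuming the eofs package (eofs.xarray.Eof) and an
# SST anomaly DataArray with dims (time, latitude, longitude) as selected by
# kwrgs_pp above; the sign convention (positive PDO = cool central North
# Pacific) is enforced by checking the pattern near 45N, 175E:
def _get_PDO_sketch(sst_anom):
    import numpy as np
    from eofs.xarray import Eof
    # sqrt(cos(lat)) weights for equal-area weighting of the EOF
    coslat = np.cos(np.deg2rad(sst_anom.latitude.values)).clip(0., 1.)
    wgts = np.sqrt(coslat)[..., np.newaxis]
    solver = Eof(sst_anom, weights=wgts)
    # leading EOF of North Pacific SST anomalies as a covariance map
    pattern = solver.eofsAsCovariance(neofs=1).squeeze()
    # flip sign so the central North Pacific loading is negative
    adjust_sign = bool(pattern.sel(latitude=45., longitude=175.,
                                   method='nearest') > 0)
    if adjust_sign:
        pattern = -pattern
    return pattern, solver, adjust_sign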
def PDO_temp(filename, ex, df_splits=None):
    #%%
    '''
    The PDO is calculated based upon all data points in the training years.
    Subsequently, the PDO pattern is projected onto sst.sel(time=dates_train)
    to enable retrieving the PDO timeseries on a subset of the year. It is
    similarly projected onto the dates_test.
    From https://climatedataguide.ucar.edu/climate-data/pacific-decadal-oscillation-pdo-definition-and-indices
    '''
    if df_splits is None:
        RV = ex[ex['RV_name']]
        df_splits, ex = functions_pp.rand_traintest_years(RV, ex)

    kwrgs_pp = {'selbox': {'la_min': 20,  # select domain in degrees east
                           'la_max': 65,
                           'lo_min': 115,
                           'lo_max': 250},
                'format_lon': 'only_east'}
    ds = core_pp.import_ds_lazy(filename, **kwrgs_pp)

    to_freq = ex['tfreq']
    if to_freq != 1:
        ds, dates = functions_pp.time_mean_bins(ds, ex, to_freq=to_freq,
                                                seldays='all')
        ds['time'] = dates

    dates = pd.to_datetime(ds.time.values)
    splits = df_splits.index.levels[0]
    data = np.zeros((splits.size, ds.latitude.size, ds.longitude.size))
    PDO_patterns = xr.DataArray(data,
                                coords=[splits, ds.latitude.values,
                                        ds.longitude.values],
                                dims=['split', 'latitude', 'longitude'])
    list_splits = []
    for s in splits:
        progress = 100 * (s + 1) / splits.size
        dates_train = df_splits.loc[s]['TrainIsTrue'][
            df_splits.loc[s]['TrainIsTrue']].index
        train_yrs = np.unique(dates_train.year)
        dates_all_train = pd.to_datetime([d for d in dates
                                          if d.year in train_yrs])
        dates_test = df_splits.loc[s]['TrainIsTrue'][
            ~df_splits.loc[s]['TrainIsTrue']].index
        n = dates_train.size
        r = int(100 * n / df_splits.loc[s].index.size)
        print(f"\rProgress PDO traintest set {progress}%, "
              f"trainsize=({n}dp, {r}%)", end="")

        PDO_patterns[s], solver, adjust_sign = get_PDO(
            ds.sel(time=dates_all_train))
        PDO_patterns[s] = PDO_patterns[s].interpolate_na(dim='longitude')

        data_train = find_precursors.calc_spatcov(ds.sel(time=dates_train),
                                                  PDO_patterns[s])
        data_test = find_precursors.calc_spatcov(ds.sel(time=dates_test),
                                                 PDO_patterns[s])
        df_test = pd.DataFrame(data=data_test.values, index=dates_test,
                               columns=['0_901_PDO'])
        df_train = pd.DataFrame(data=data_train.values, index=dates_train,
                                columns=['0_901_PDO'])
        df = pd.concat([df_test, df_train]).sort_index()
        list_splits.append(df)

    df_PDO = pd.concat(list_splits, axis=0, keys=splits)
    #%%
    return df_PDO