def _check_y_fitmask(fit_masks, lag_i, base_lag):
    '''
    If lag_i is uneven, taking the mean over the RV period may result in a
    shorter y_fit (RV_mask) than the original RV_mask (where the time mean
    bins were done on its own time axis). Hence y_fit is redefined by adding
    lag_i + base_lag to the x_fit mask.

    Note: y_fit_mask and y_pred_mask are the same now.
    '''
    fit_masks_n = fit_masks.copy()
    y_fit = fit_masks['y_fit']
    x_fit = fit_masks['x_fit']
    y_dates_RV = x_fit[x_fit].index + pd.Timedelta(lag_i + base_lag, 'd')
    y_dates_pr = y_fit[y_fit].index
    mismatch = (functions_pp.get_oneyr(y_dates_pr)[0]
                - functions_pp.get_oneyr(y_dates_RV)[0]).days
    y_fit_corr = y_dates_RV + pd.Timedelta(mismatch, 'd')
    y_fit_mask = [True if d in y_fit_corr else False for d in x_fit.index]
    fit_masks_n.loc[:, 'y_fit'] = np.array(y_fit_mask)

    y_pred = fit_masks['y_pred']
    x_pred = fit_masks['x_pred']
    y_dates_RV = x_pred[x_pred].index + pd.Timedelta(lag_i + base_lag, 'd')
    y_dates_pr = y_pred[y_pred].index
    mismatch = (functions_pp.get_oneyr(y_dates_pr)[0]
                - functions_pp.get_oneyr(y_dates_RV)[0]).days
    y_pred_corr = y_dates_RV + pd.Timedelta(mismatch, 'd')
    y_pred_mask = [True if d in y_pred_corr else False for d in x_pred.index]
    fit_masks_n.loc[:, 'y_pred'] = np.array(y_pred_mask)

    size_y_fit = fit_masks_n['y_fit'][fit_masks_n['y_fit']].dropna().size
    assert size_y_fit == y_dates_RV.size, ('y_fit mask will not match RV '
                                           'dates length')
    return fit_masks_n
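
# Minimal usage sketch for _check_y_fitmask (illustrative, not from the
# original code base). It assumes `fit_masks` is the boolean-mask DataFrame
# produced by apply_shift_lag, with columns 'x_fit', 'y_fit', 'x_pred' and
# 'y_pred' on a daily DatetimeIndex; the lag values below are made up.
#
#   fit_masks = apply_shift_lag(df_splits.loc[0], 10)
#   fit_masks = _check_y_fitmask(fit_masks, lag_i=10, base_lag=0)
#   y_fit_dates = fit_masks.index[fit_masks['y_fit']]
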
def check_NaNs(field, ts):
    '''
    Return shortened timeseries of both field and ts if a few NaNs are
    detected at the boundary due to a large lag. At the boundary of the
    time-axis, large lags often result in NaNs due to missing data.
    Timesteps are removed from the timeseries if:
    1. the entire field is filled with NaNs;
    2. the number of NaN timesteps is less than a single year of datapoints.
    '''
    t = functions_pp.get_oneyr(field).size  # threshold of NaNs allowed
    field = np.reshape(field.values, (field.shape[0], -1))
    i = 0  # check NaNs in first year
    if bool(np.isnan(field[i]).all()):
        i += 1
        while bool(np.isnan(field[i]).all()):
            i += 1
            if i > t:
                raise ValueError('More NaNs detected than # of datapoints in '
                                 'single year')
    j = -1  # check NaNs in last year
    if bool(np.isnan(field[j]).all()):
        j -= 1
        while bool(np.isnan(field[j]).all()):
            j -= 1
            if j < -t:  # was `j < t`, which always raised for trailing NaNs
                raise ValueError('More NaNs detected than # of datapoints in '
                                 'single year')
    else:
        j = field.shape[0]
    return field[i:j], ts[i:j]
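
# Minimal usage sketch for check_NaNs (illustrative): a toy field with two
# all-NaN timesteps at the start, e.g. introduced by a large lag shift. It
# assumes functions_pp.get_oneyr accepts a DataArray with a time coordinate.
# Note the field is returned as a 2-D numpy array (time, flattened grid).
#
#   import numpy as np, pandas as pd, xarray as xr
#   time = pd.date_range('2000-01-01', periods=730, freq='D')
#   field = xr.DataArray(np.random.randn(730, 4, 4),
#                        coords={'time': time}, dims=('time', 'lat', 'lon'))
#   field[:2] = np.nan  # boundary NaNs
#   ts = pd.Series(np.random.randn(730), index=time)
#   field_trim, ts_trim = check_NaNs(field, ts)  # both lose the 2 NaN steps
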
def start_end_date_mean(df_data, start_end_date):
    # create mask to aggregate
    if hasattr(df_data.index, 'levels'):
        pd_dates = df_data.loc[0].index
    else:
        pd_dates = df_data.index
    subset_dates = core_pp.get_subdates(pd_dates, start_end_date)
    dates_to_aggr_mask = pd.Series(np.repeat(False, pd_dates.size),
                                   index=pd_dates)
    dates_to_aggr_mask.loc[subset_dates] = True
    if hasattr(df_data.index, 'levels'):
        years = df_data.loc[0][dates_to_aggr_mask].index.year
    else:
        years = df_data[dates_to_aggr_mask].index.year
    index = [functions_pp.get_oneyr(subset_dates, yr).mean() for yr in
             np.unique(years)]

    if hasattr(df_data.index, 'levels'):
        splits = df_data.index.levels[0]
        df_data_s = np.zeros((splits.size), dtype=object)
        for s in splits:
            df_s = df_data.loc[s]
            df_s = df_s[dates_to_aggr_mask].groupby(years).mean()
            df_s.index = pd.to_datetime(index)
            df_data_s[s] = df_s
        df_data_resample = pd.concat(list(df_data_s), keys=range(splits.size))
    else:
        df_data_resample = df_data[dates_to_aggr_mask].groupby(years).mean()
        df_data_resample.index = pd.to_datetime(index)
    return df_data_resample
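
# Minimal usage sketch for start_end_date_mean (illustrative): aggregate all
# timesteps inside the ('06-01', '08-31') window to one value per year,
# placed at the mean date of that window. df_data may carry a (split, time)
# MultiIndex or a plain DatetimeIndex; the window below is made up.
#
#   df_summer_mean = start_end_date_mean(df_data, ('06-01', '08-31'))
#   df_summer_mean.index  # one (mean) timestamp per year
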
def pers_ano_to_extr(filename_ts, RV, kwrgs_events_daily, dict_experiments,
                     name_exp, name_model, n_boot):
    # loading in daily timeseries
    RVfullts = np.load(filename_ts, encoding='latin1',
                       allow_pickle=True).item()['RVfullts95']

    # Retrieve information on input timeseries
    import functions_pp
    dates = functions_pp.get_oneyr(RV.RV_ts.index)
    tfreq = (dates[1] - dates[0]).days
    start_date = dates[0] - pd.Timedelta(f'{tfreq/2}d')
    end_date = dates[-1] + pd.Timedelta(f'{-1+tfreq/2}d')
    # pd.DatetimeIndex(start=..., end=...) is removed in modern pandas
    yr_daily = pd.date_range(start=start_date, end=end_date,
                             freq=pd.Timedelta('1d'))
    ext_dates = functions_pp.make_dates(RV.RV_ts.index, yr_daily,
                                        RV.RV_ts.index.year[-1])

    df_RV_ts_e = pd.DataFrame(RVfullts.sel(time=ext_dates).values,
                              index=ext_dates, columns=['RV_ts'])
    df_RVfullts = pd.DataFrame(RVfullts.values,
                               index=pd.to_datetime(RVfullts.time.values),
                               columns=['RVfullts'])
    # Make new class based on new kwrgs_events_daily
    RV_d = func_fc.RV_class(df_RVfullts, df_RV_ts_e, kwrgs_events_daily)
    # Ensure that the bins on the daily time series match the original
    ex = dict(sstartdate=f'{yr_daily[0].month}-{yr_daily[0].day}',
              senddate=f'{yr_daily[-1].month}-{yr_daily[-1].day}',
              startyear=ext_dates.year[0],
              endyear=ext_dates.year[-1])
    RV_d.RV_bin, dates_gr = functions_pp.time_mean_bins(RV_d.RV_bin, ex,
                                                        tfreq)
    RV_d.RV_bin[RV_d.RV_bin > 0] = 1
    RV_d.TrainIsTrue = RV.TrainIsTrue
    RV_d.RV_mask = RV.RV_mask
    # add new probability of event occurrence
    RV_d.prob_clim = func_fc.get_obs_clim(RV_d)
    dict_comparison = {}
    # loading model predicting pers. anomalies
    orig_event_perc = np.round(1 - float(RV.prob_clim.mean()), 2)
    new_name = '{}d mean +{}p to +{}p events'.format(
        tfreq, orig_event_perc, kwrgs_events_daily['event_percentile'])
    dict_sum = dict_experiments[name_exp]
    # was dict_sum[models[-1]]: `models` is undefined in this scope
    df_valid, RV, y_pred = dict_sum[name_model]
    blocksize = valid.get_bstrap_size(RV.RVfullts, plot=False)
    out = valid.get_metrics_sklearn(RV_d, y_pred, RV_d.prob_clim,
                                    n_boot=n_boot, blocksize=blocksize)
    df_valid, metrics_dict = out
    dict_comparison[new_name] = {name_model: (df_valid, RV_d, y_pred)}
    return dict_comparison
def aggr_to_daily_dates(dates_precur_data):
    dates = functions_pp.get_oneyr(dates_precur_data)
    tfreq = (dates[1] - dates[0]).days
    start_date = dates[0] - pd.Timedelta(f'{int(tfreq/2)}d')
    end_date = dates[-1] + pd.Timedelta(f'{int(-1+tfreq/2+0.5)}d')
    yr_daily = pd.date_range(start=start_date, end=end_date,
                             freq=pd.Timedelta('1d'))
    years = np.unique(dates_precur_data.year)
    ext_dates = functions_pp.make_dates(yr_daily, years)
    return ext_dates
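
# Minimal usage sketch for aggr_to_daily_dates (illustrative): expand e.g.
# 15-day-mean timestamps back to the daily dates covered by each bin. The
# example dates are made up.
#
#   dates_aggr = pd.to_datetime(['2000-06-08', '2000-06-23',
#                                '2001-06-08', '2001-06-23'])
#   daily_dates = aggr_to_daily_dates(dates_aggr)
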
def pp_calc_ts(precur, precur_aggr=None, kwrgs_load: dict=None,
               force_reload: bool=False, lags: list=None):
    '''
    Pre-process for calculating timeseries of precursor regions or pattern.
    '''
    #%%
    corr_xr = precur.corr_xr
    prec_labels = precur.prec_labels
    if lags is not None:
        lags = np.array(lags)  # ensure lag is np.ndarray
        corr_xr = corr_xr.sel(lag=lags).copy()
        prec_labels = prec_labels.sel(lag=lags).copy()
    else:
        lags = prec_labels.lag.values
    dates = pd.to_datetime(precur.precur_arr.time.values)
    oneyr = functions_pp.get_oneyr(dates)
    if oneyr.size == 1:  # single value per year precursor
        tfreq = 365
    else:
        tfreq = (oneyr[1] - oneyr[0]).days

    if precur_aggr is None and force_reload == False:
        precur_arr = precur.precur_arr
    else:
        if precur_aggr is not None:
            precur.tfreq = precur_aggr
        precur.load_and_aggregate_precur(kwrgs_load.copy())
        precur_arr = precur.precur_arr

    if type(precur.lags[0]) is np.ndarray and precur_aggr is None:
        precur.period_means_array = True
    else:
        precur.period_means_array = False

    if precur_arr.shape[-2:] != corr_xr.shape[-2:]:
        print('shape loaded precur_arr != corr map, matching coords')
        corr_xr, prec_labels = functions_pp.match_coords_xarrays(
            precur_arr, *[corr_xr, prec_labels])
    #%%
    return precur_arr, corr_xr, prec_labels
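
# Minimal usage sketch for pp_calc_ts (illustrative): reload the precursor at
# daily resolution (precur_aggr=1) and return the precursor array together
# with the correlation map and region labels on matching coordinates.
# `precur` is a class_BivariateMI instance; kwrgs_load is the load-keyword
# dict used elsewhere in this package.
#
#   precur_arr, corr_xr, prec_labels = pp_calc_ts(precur, precur_aggr=1,
#                                                 kwrgs_load=kwrgs_load)
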
def loop_get_spatcov(precur, precur_aggr=None, kwrgs_load: dict = None,
                     force_reload: bool = False, lags: list = None):

    name = precur.name
    use_sign_pattern = precur.use_sign_pattern
    corr_xr = precur.corr_xr
    prec_labels = precur.prec_labels
    splits = corr_xr.split
    if lags is not None:
        lags = np.array(lags)  # ensure lag is np.ndarray
        corr_xr = corr_xr.sel(lag=lags).copy()
        prec_labels = prec_labels.sel(lag=lags).copy()
    else:
        lags = prec_labels.lag.values

    dates = pd.to_datetime(precur.precur_arr.time.values)
    oneyr = functions_pp.get_oneyr(dates)
    if oneyr.size == 1:  # single value per year precursor
        tfreq = 365
    else:
        tfreq = (oneyr[1] - oneyr[0]).days

    if precur_aggr is None and force_reload == False:
        # use the precursor array with the temporal aggregation that was used
        # to create the correlation map. When tfreq == 365, aggregation
        # (one-value-per-year) is already done; the period used to aggregate
        # was defined by the lag.
        precur_arr = precur.precur_arr
    else:
        if precur_aggr is not None:
            precur.tfreq = precur_aggr
        precur.load_and_aggregate_precur(kwrgs_load.copy())
        precur_arr = precur.precur_arr
        precur.area_grid = find_precursors.get_area(precur_arr)

    if precur_arr.shape[-2:] != corr_xr.shape[-2:]:
        print('shape loaded precur_arr != corr map, matching coords')
        corr_xr, prec_labels = functions_pp.match_coords_xarrays(
            precur_arr, *[corr_xr, prec_labels])

    ts_sp = np.zeros((splits.size), dtype=object)
    for s in splits:
        ts_list = np.zeros((lags.size), dtype=list)
        track_names = []
        for il, lag in enumerate(lags):
            # if lag represents aggregation period:
            if type(precur.lags[il]) is np.ndarray and precur_aggr is None:
                precur_arr = precur.precur_arr.sel(lag=il)
            corr_vals = corr_xr.sel(split=s).isel(lag=il)
            mask = prec_labels.sel(split=s).isel(lag=il)
            pattern = corr_vals.where(~np.isnan(mask))
            if use_sign_pattern == True:
                pattern = np.sign(pattern)
            if np.isnan(pattern.values).all():
                # no regions of this variable and split
                nants = np.zeros((precur_arr.time.size, 1))
                nants[:] = np.nan
                ts_list[il] = nants
            else:
                # if normalize == True:
                #     spatcov_full = calc_spatcov(full_timeserie, pattern)
                #     mean = spatcov_full.sel(time=dates_train).mean(dim='time')
                #     std = spatcov_full.sel(time=dates_train).std(dim='time')
                #     spatcov_test = ((spatcov_full - mean) / std)
                # elif normalize == False:
                xrts = find_precursors.calc_spatcov(precur_arr, pattern)
                ts_list[il] = xrts.values[:, None]
            track_names.append(f'{lag}..0..{precur.name}' + '_sp')

        # concatenate timeseries of all lags
        tsCorr = np.concatenate(tuple(ts_list), axis=1)
        dates = pd.to_datetime(precur_arr.time.values)
        ts_sp[s] = pd.DataFrame(tsCorr, index=dates, columns=track_names)
    # df_sp = pd.concat(list(ts_sp), keys=range(splits.size))
    return ts_sp
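
# Minimal usage sketch for loop_get_spatcov (illustrative): one DataFrame of
# spatial-covariance timeseries per train-test split, with columns following
# the '{lag}..0..{name}_sp' convention used above.
#
#   ts_sp = loop_get_spatcov(precur, precur_aggr=1, kwrgs_load=kwrgs_load)
#   df_sp = pd.concat(list(ts_sp), keys=range(ts_sp.size))
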
def bivariateMI_map(self, precur_arr, df_splits, RV):
    #%%
    # precur_arr = self.precur_arr ; df_splits = rg.df_splits ; RV = rg.TV
    """
    This function calculates the correlation maps for precur_arr for
    different lags. Field significance is applied to test for correlation.

    RV_period: indices that match the response variable time series
    alpha: significance level

    A land-sea mask is assumed from setting all the NaN values to True
    (masked). For xrcorr['mask'], all gridcells which are significant are
    not masked, i.e. bool == False.
    """
    if type(self.lags) is np.ndarray and type(
            self.lags[0]) is not np.ndarray:
        self.lags = np.array(self.lags, dtype=np.int16)  # fix dtype
        self.lag_coordname = self.lags
    else:
        self.lag_coordname = np.arange(len(self.lags))  # for period_means
    n_lags = len(self.lags)
    lags = self.lags
    self.df_splits = df_splits  # add df_splits to self
    dates = self.df_splits.loc[0].index
    targetstepsoneyr = functions_pp.get_oneyr(RV.RV_ts)
    if type(self.lags[0]) == np.ndarray and targetstepsoneyr.size > 1:
        raise ValueError('Precursor and Target do not align.\n'
                         'One aggregated value taken for months '
                         f'{self.lags[0]}, while target timeseries has '
                         f'multiple timesteps per year:\n{targetstepsoneyr}')
    yrs_precur_arr = np.unique(precur_arr.time.dt.year)
    if np.unique(dates.year).size != yrs_precur_arr.size:
        raise ValueError('Number of years between precursor and Target '
                         'does not match. Check if precursor period is '
                         'crossyr, while target period is not. '
                         'Manually ensure start_end_year is aligned.')

    oneyr = functions_pp.get_oneyr(dates)
    if oneyr.size == 1:  # single value per year precursor
        self._tfreq = 365
    else:
        self._tfreq = (oneyr[1] - oneyr[0]).days

    n_spl = df_splits.index.levels[0].size
    # make new xarray to store results
    xrcorr = precur_arr.isel(time=0).drop('time').copy()
    orig_mask = np.isnan(precur_arr[1])
    if 'lag' not in xrcorr.dims:
        # add lags
        list_xr = [xrcorr.expand_dims('lag', axis=0) for i in range(n_lags)]
        xrcorr = xr.concat(list_xr, dim='lag')
        xrcorr['lag'] = ('lag', self.lag_coordname)
    # add train test split
    list_xr = [xrcorr.expand_dims('split', axis=0) for i in range(n_spl)]
    xrcorr = xr.concat(list_xr, dim='split')
    xrcorr['split'] = ('split', range(n_spl))
    xrpvals = xrcorr.copy()

    def MI_single_split(RV_ts, precur_train, s, alpha=.05, FDR_control=True):

        lat = precur_train.latitude.values
        lon = precur_train.longitude.values

        z = np.zeros((lat.size*lon.size, len(lags)))
        Corr_Coeff = np.ma.array(z, mask=z)
        pvals = np.ones((lat.size*lon.size, len(lags)))

        dates_RV = RV_ts.index
        for i, lag in enumerate(lags):
            if type(lag) is np.int16 and self.lag_as_gap == False:
                # dates_lag = functions_pp.func_dates_min_lag(dates_RV, self._tfreq*lag)[1]
                m = apply_shift_lag(self.df_splits.loc[s], lag)
                dates_lag = m[np.logical_and(m['TrainIsTrue'],
                                             m['x_fit'])].index
                corr_val, pval = self.func(precur_train.sel(time=dates_lag),
                                           RV_ts.values.squeeze(),
                                           **self.kwrgs_func)
            elif type(lag) == np.int16 and self.lag_as_gap == True:
                # if only shift tfreq, then gap=0
                datesdaily = RV.aggr_to_daily_dates(dates_RV,
                                                    tfreq=self._tfreq)
                dates_lag = functions_pp.func_dates_min_lag(
                    datesdaily, self._tfreq + lag)[1]
                tmb = functions_pp.time_mean_bins
                corr_val, pval = self.func(
                    tmb(precur_train.sel(time=dates_lag),
                        to_freq=self._tfreq)[0],
                    RV_ts.values.squeeze(),
                    **self.kwrgs_func)
            elif type(lag) == np.ndarray:
                corr_val, pval = self.func(precur_train.sel(lag=i),
                                           RV_ts.values.squeeze(),
                                           **self.kwrgs_func)

            mask = np.ones(corr_val.size, dtype=bool)
            if FDR_control == True:
                # test for field significance and mask insignificant values
                # FDR control:
                adjusted_pvalues = multicomp.multipletests(pval,
                                                           method='fdr_bh')
                ad_p = adjusted_pvalues[1]
                pvals[:, i] = ad_p
                mask[ad_p <= alpha] = False
            else:
                pvals[:, i] = pval
                mask[pval <= alpha] = False

            Corr_Coeff[:, i] = corr_val[:]
            Corr_Coeff[:, i].mask = mask

        Corr_Coeff = np.ma.array(data=Corr_Coeff[:, :],
                                 mask=Corr_Coeff.mask[:, :])
        Corr_Coeff = Corr_Coeff.reshape(lat.size, lon.size,
                                        len(lags)).swapaxes(2, 1).swapaxes(1, 0)
        pvals = pvals.reshape(lat.size, lon.size,
                              len(lags)).swapaxes(2, 1).swapaxes(1, 0)
        return Corr_Coeff, pvals

    print('\n{} - calculating correlation maps'.format(precur_arr.name))
    np_data = np.zeros_like(xrcorr.values)
    np_mask = np.zeros_like(xrcorr.values)
    np_pvals = np.zeros_like(xrcorr.values)

    RV_mask = df_splits.loc[0]['RV_mask']
    for s in xrcorr.split.values:
        progress = int(100 * (s+1) / n_spl)
        # =====================================================================
        # Train-test split methods: ['random_k_fold', 'leave_k_out',
        # 'no_train_test_split']
        # =====================================================================
        RV_train_mask = np.logical_and(RV_mask,
                                       df_splits.loc[s]['TrainIsTrue'])
        RV_ts = RV.fullts[RV_train_mask.values]
        TrainIsTrue = df_splits.loc[s]['TrainIsTrue'].values
        if self.lag_as_gap:
            # no clue why selecting all datapoints, changed 26-01-2021
            train_dates = df_splits.loc[s]['TrainIsTrue'][TrainIsTrue].index
            precur_train = precur_arr.sel(time=train_dates)
        else:
            precur_train = precur_arr[TrainIsTrue]  # only train data

        dates_RV = RV_ts.index
        n = dates_RV.size
        r = int(100 * n / RV.dates_RV.size)
        print(f"\rProgress traintest set {progress}%, trainsize=({n}dp, {r}%)",
              end="")

        ma_data, pvals = MI_single_split(RV_ts, precur_train.copy(), s,
                                         alpha=self.alpha,
                                         FDR_control=self.FDR_control)
        np_data[s] = ma_data.data
        np_mask[s] = ma_data.mask
        np_pvals[s] = pvals
    print("\n")
    xrcorr.values = np_data
    xrpvals.values = np_pvals
    mask = (('split', 'lag', 'latitude', 'longitude'), np_mask)
    xrcorr.coords['mask'] = mask
    # fill nans with mask = True
    xrcorr['mask'] = xrcorr['mask'].where(orig_mask == False,
                                          other=orig_mask).drop('time')
    #%%
    return xrcorr, xrpvals
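
# Minimal usage sketch for bivariateMI_map (illustrative). It is written as a
# method of the BivariateMI-style class holding lags, alpha, FDR_control and
# func; df_splits is the (split, time) MultiIndex DataFrame with
# 'TrainIsTrue' and 'RV_mask' columns, and RV the target-variable object
# (rg.df_splits and rg.TV in the commented debug line above).
#
#   xrcorr, xrpvals = precur.bivariateMI_map(precur.precur_arr,
#                                            rg.df_splits, rg.TV)
#   sig = xrcorr.where(xrcorr['mask'] == False)  # significant gridcells
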
from func_models import standardize_on_train

# summerdates = core_pp.get_subdates(dates, start_end_TVdate)
df_PDOsplit = df_PDO.loc[0]  # .loc[summerdates]
# standardize = preprocessing.StandardScaler()
# standardize.fit(df_PDOsplit[df_PDOsplit['TrainIsTrue'].values].values.reshape(-1,1))
# df_PDOsplit = pd.DataFrame(standardize.transform(df_PDOsplit['PDO'].values.reshape(-1,1)),
#                            index=df_PDOsplit.index, columns=['PDO'])
df_PDOsplit = df_PDOsplit[['PDO']].apply(standardize_on_train,
                                         args=[df_PDO.loc[0]['TrainIsTrue']],
                                         result_type='broadcast')

# Butterworth lowpass
yr = 2
dates = df_PDOsplit.index
freqraw = (dates[1] - dates[0]).days
window = int(yr * functions_pp.get_oneyr(dates).size)  # 2 year
fig, ax = plt.subplots(1, 1)
ax.plot_date(dates, df_PDOsplit.values, label=f'raw ({freqraw} daymeans)',
             alpha=.2, linestyle='solid', marker=None)
ax.plot_date(dates, filters.lowpass(df_PDOsplit, period=window),
             label='Butterworth', linestyle='solid', linewidth=1,
             marker=None)
df_PDOrm = df_PDOsplit.rolling(window=window, center=True,
                               min_periods=1).mean()
# ax.plot_date(dates, filters.lowpass(df_PDOrm, period=window),
#              label='Butterworth on rolling mean',
#              linestyle='solid', linewidth=1, marker=None)
ax.plot_date(dates, df_PDOrm, label='rolling mean', color='green',
             linestyle='solid', linewidth=1, marker=None)
ax.legend()
                                 start_end_date=rg.start_end_TVdate)
RV_ts = rg.fulltso.sel(time=dates_RV)
ds_v300 = core_pp.import_ds_lazy(rg.list_precur_pp[1][1])
dslocal = core_pp.get_selbox(ds_v300, selbox=selbox)
datesRW = core_pp.get_subdates(pd.to_datetime(dslocal.time.values),
                               start_end_date=rg.start_end_TVdate)
datesRW = datesRW + pd.Timedelta(f'{lag}d')
dslocal = dslocal.sel(time=datesRW)
wv6local = core_pp.get_selbox(xarray.sel(lag=5), selbox=selbox)
patternlocal = wv6local.mean(dim='lag')
ts = find_precursors.calc_spatcov(dslocal, patternlocal)
ts_15, d = functions_pp.time_mean_bins(ts, tfreq,
                                       start_end_date=start_end_TVdate,
                                       closed_on_date=start_end_TVdate[-1])
RV_15, d = functions_pp.time_mean_bins(RV_ts, tfreq,
                                       start_end_date=start_end_TVdate,
                                       closed_on_date=start_end_TVdate[-1])
corr_value = np.corrcoef(ts_15.values.squeeze(),
                         RV_15.values.squeeze())[0][1]
print('corr: {:.2f}'.format(corr_value))
values.append(corr_value)

plt.plot(range(-9, 10), values[1:])
# df_wv6 = ts_15.to_dataframe(name='wv6p2')
#%%
sst = rg.list_for_MI[2]
dates_years = functions_pp.get_oneyr(sst.df_splits.loc[0].index,
                                     *event_dates.year)
sst.precur_arr.sel(time=dates_years).mean(dim='time').plot(
    vmin=-.3, vmax=.3, cmap=plt.cm.RdBu_r)
def spatial_mean_regions(precur, precur_aggr=None, kwrgs_load: dict = None,
                         force_reload: bool = False, lags: list = None):
    '''
    Wrapper for calculating 1-d spatial mean timeseries per precursor region.

    Parameters
    ----------
    precur : class_BivariateMI instance
    precur_aggr : int, optional
        If None, the same precur_arr is used as for the correlation maps.
    kwrgs_load : dict, optional
        kwrgs to load in timeseries. See functions_pp.import_ds_timemeanbins
        or functions_pp.time_mean_period. The default is None.
    force_reload : bool, optional
        Force reload a different precursor array (precur_arr).
        The default is False.

    Returns
    -------
    ts_corr : np.ndarray of dtype object
        One pd.DataFrame per train-test split, each holding the
        (area-weighted) spatial-mean timeseries of every precursor region.
    '''
    #%%
    name = precur.name
    corr_xr = precur.corr_xr
    prec_labels = precur.prec_labels
    n_spl = corr_xr.split.size
    use_coef_wghts = precur.use_coef_wghts
    if lags is not None:
        lags = np.array(lags)  # ensure lag is np.ndarray
        corr_xr = corr_xr.sel(lag=lags).copy()
        prec_labels = prec_labels.sel(lag=lags).copy()
    else:
        lags = prec_labels.lag.values

    dates = pd.to_datetime(precur.precur_arr.time.values)
    oneyr = functions_pp.get_oneyr(dates)
    if oneyr.size == 1:  # single value per year precursor
        tfreq = 365
    else:
        tfreq = (oneyr[1] - oneyr[0]).days

    if precur_aggr is None and force_reload == False:
        # use the precursor array with the temporal aggregation that was used
        # to create the correlation map. When tfreq == 365, aggregation
        # (one-value-per-year) is already done; the period used to aggregate
        # was defined by the lag.
        precur_arr = precur.precur_arr
    else:
        if precur_aggr is not None:
            precur.tfreq = precur_aggr
        precur.load_and_aggregate_precur(kwrgs_load.copy())
        precur_arr = precur.precur_arr
        precur.area_grid = get_area(precur_arr)

    if precur_arr.shape[-2:] != corr_xr.shape[-2:]:
        print('shape loaded precur_arr != corr map, matching coords')
        corr_xr, prec_labels = functions_pp.match_coords_xarrays(
            precur_arr, *[corr_xr, prec_labels])

    ts_corr = np.zeros((n_spl), dtype=object)
    for s in range(n_spl):
        corr = corr_xr.isel(split=s)
        labels = prec_labels.isel(split=s)

        ts_list = np.zeros((lags.size), dtype=list)
        track_names = []
        for l_idx, lag in enumerate(lags):
            labels_lag = labels.sel(lag=lag).values
            # if lag represents aggregation period:
            if type(precur.lags[l_idx]) is np.ndarray and precur_aggr is None:
                precur_arr = precur.precur_arr.sel(lag=l_idx)

            regions_for_ts = list(np.unique(labels_lag[~np.isnan(labels_lag)]))
            a_wghts = precur.area_grid / precur.area_grid.mean()
            if use_coef_wghts:
                coef_wghts = abs(corr.sel(lag=lag)) / abs(
                    corr.sel(lag=lag)).max()
                a_wghts *= coef_wghts.values  # area & corr. value weighted

            # this array will be the time series for each feature
            ts_regions_lag_i = np.zeros((precur_arr.values.shape[0],
                                         len(regions_for_ts)))
            # track sign of each region
            sign_ts_regions = np.zeros(len(regions_for_ts))

            # calculate area-weighted mean over features
            for r in regions_for_ts:
                idx = regions_for_ts.index(r)
                # start with empty lonlat array
                B = np.zeros(labels_lag.shape)
                # Mask everything except region of interest
                B[labels_lag == r] = 1
                # # Calculates how values inside region vary over time, wgts vs anomaly
                # wgts_ano = meanbox[B==1] / meanbox[B==1].max()
                # ts_regions_lag_i[:,idx] = np.nanmean(actbox[:,B==1] * cos_box_array[:,B==1] * wgts_ano, axis=1)
                # Calculates how values inside region vary over time
                ts = np.nanmean(precur_arr.values[:, B == 1] * a_wghts[B == 1],
                                axis=1)

                # check for nans
                if ts[np.isnan(ts)].size != 0:
                    print(ts)
                    perc_nans = ts[np.isnan(ts)].size / ts.size
                    if perc_nans == 1:
                        # all NaNs
                        print(f'All timesteps were NaNs split {s}'
                              f' for region {r} at lag {lag}')
                    else:
                        print(f'{perc_nans} NaNs split {s}'
                              f' for region {r} at lag {lag}')
                track_names.append(f'{lag}..{int(r)}..{name}')
                ts_regions_lag_i[:, idx] = ts
                # get sign of region
                sign_ts_regions[idx] = np.sign(
                    np.mean(corr.isel(lag=l_idx).values[B == 1]))
            ts_list[l_idx] = ts_regions_lag_i

        dates = pd.to_datetime(precur_arr.time.values)
        tsCorr = np.concatenate(tuple(ts_list), axis=1)
        df_tscorr = pd.DataFrame(tsCorr, index=dates, columns=track_names)
        df_tscorr.name = str(s)
        ts_corr[s] = df_tscorr
    if any(df_tscorr.isna().values.flatten()):
        print('Warning: NaNs detected')
    #%%
    return ts_corr
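
# Minimal usage sketch for spatial_mean_regions (illustrative): one DataFrame
# per split with an (area-weighted) mean timeseries per labelled region,
# columns named '{lag}..{region}..{name}'.
#
#   ts_corr = spatial_mean_regions(precur, precur_aggr=1,
#                                  kwrgs_load=kwrgs_load)
#   df_regions = pd.concat(list(ts_corr), keys=range(ts_corr.size))
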
print('\n', month, '\n')
keys = [k for k in rg.df_data.columns[:-2]
        if k not in [rg.TV.name, 'PDO']]
if target == 'easterntemp':
    keys = [k for k in keys if int(k.split('..')[1]) in [1, 2]]

if remove_PDO:
    y_keys = [k for k in keys if 'sst' in k]
    rg.df_data[y_keys], fig = wPCMCI.df_data_remove_z(rg.df_data, z=['PDO'],
                                                      keys=y_keys,
                                                      standardize=False)
    fig_path = os.path.join(rg.path_outsub1, f'regressing_out_PDO_tf{month}')
    fig.savefig(fig_path + rg.figext, bbox_inches='tight')

if any(rg._df_count == rg.n_spl):
    # at least one timeseries always present
    oneyr = functions_pp.get_oneyr(
        rg.df_data['RV_mask'].loc[0][rg.df_data['RV_mask'].loc[0]])
    oneyrsize = oneyr.size
    if monthkeys.index(month) >= 1:
        nextyr = functions_pp.get_oneyr(
            rg.df_data['RV_mask'].loc[0][rg.df_data['RV_mask'].loc[0]])
        if nextyr.size != oneyrsize:
            raise ValueError

fc_mask = rg.df_data.iloc[:, -1].loc[0]  # .shift(lag, fill_value=False)
# rg.df_data = rg._replace_RV_mask(rg.df_data, replace_RV_mask=(fc_mask))
target_ts = rg.df_data.iloc[:, [0]].loc[0][fc_mask]
target_ts = (target_ts - target_ts.mean()) / target_ts.std()
# ScikitModel = scikitlinear.LassoCV
out_fit = rg.fit_df_data_ridge(target=target_ts,
plt.ylabel('frequency (1/year)')
RV.freq_per_year.plot(kind='bar')
fname = 'freq_per_year.png'
filename = os.path.join(ex['path_fig'], fname)
plt.savefig(filename)
#%% get timeseries:
# ERA5_filename = 'era5_t2mmax_US_1979-2018_averAggljacc0.25d_tf1_n4__to_t2mmax_US_tf1_selclus4_okt19.npy'
GHCND_filename = "PEP-T95TimeSeries.txt"
RV, ex = load_data.load_response_variable(ex)
T95_ERA5 = RV.RV_ts
ex['RV1d_ts_path'] = '/Users/semvijverberg/surfdrive/MckinRepl/RVts'
T95_GHCND, GHCND_dates = load_data.read_T95(GHCND_filename, ex)
dates = functions_pp.get_oneyr(RV.dates_RV, 2012)
shared_dates = functions_pp.get_oneyr(RV.dates_RV,
                                      *list(range(1982, 2016)))
#%%
data = np.stack([T95_GHCND.sel(time=shared_dates).values,
                 T95_ERA5.loc[shared_dates].values.squeeze()], axis=1)
df = pd.DataFrame(data, columns=['GHCND', 'ERA-5'], index=shared_dates)
dfplots.plot_oneyr_events(df, 'std', 2012)
plt.savefig(os.path.join(ex['path_fig'], 'timeseries_ERA5_GHCND.png'),
            bbox_inches='tight')
#%% Weighting features if they are extracted every run (training set),
# weighted by persistence of pattern over
def plot_ts_matric(df_init, win: int=None, lag=0, columns: list=None,
                   rename: dict=None, period='fullyear',
                   plot_sign_stars=True, fontsizescaler=0):
    #%%
    '''
    period : one of ['fullyear', 'RV_mask', 'RM_mask_lag60']; the latter two
    require an 'RV_mask' column in df_init.
    '''
    if columns is None:
        columns = list(df_init.columns[(df_init.dtypes != bool).values])

    df_cols = df_init[columns]
    if hasattr(df_init.index, 'levels'):
        splits = df_init.index.levels[0]
        print('extracting RV dates from test set')
        dates_RV_orig = df_init.loc[0].index[df_init.loc[0]['RV_mask'] == True]
        TrainIsTrue = df_init['TrainIsTrue']
        dates_full_orig = df_init.loc[0].index
        list_test = []
        for s in range(splits.size):
            TestIsTrue = TrainIsTrue[s] == False
            list_test.append(df_cols.loc[s][TestIsTrue])
        df_test = pd.concat(list_test).sort_index()
    else:
        df_test = df_init
        dates_full_orig = df_init.index

    if lag != 0:
        # shift precursor vs. tmax
        for c in df_test.columns[1:]:
            df_test[c] = df_test[c].shift(periods=-lag)

    # bin means
    if win is not None:
        oneyr = get_oneyr(df_test.index)
        start_end_date = (f'{oneyr[0].month:02d}-{oneyr[0].day:02d}',
                          f'{oneyr[-1].month:02d}-{oneyr[-1].day:02d}')
        df_test = time_mean_bins(df_test, win,
                                 start_end_date=start_end_date)[0]

    if period == 'fullyear':
        dates_sel = dates_full_orig.strftime('%Y-%m-%d')
    if 'RV_mask' in df_init.columns:
        if period == 'RV_mask':
            dates_sel = dates_RV_orig.strftime('%Y-%m-%d')
        elif period == 'RM_mask_lag60':
            dates_sel = (dates_RV_orig - pd.Timedelta(60, unit='d')
                         ).strftime('%Y-%m-%d')

    # after resampling, not all dates are in there:
    dates_sel = pd.to_datetime([d for d in dates_sel if d in df_test.index])
    df_period = df_test.loc[dates_sel, :].dropna()

    if rename is not None:
        df_period = df_period.rename(rename, axis=1)

    corr, sig_mask, pvals = corr_matrix_pval(df_period, alpha=0.01)

    # Generate a mask for the upper triangle
    mask_tri = np.zeros_like(corr, dtype=bool)  # np.bool removed in NumPy>=1.24
    mask_tri[np.triu_indices_from(mask_tri)] = True
    mask_sig = mask_tri.copy()
    mask_sig[sig_mask == False] = True

    # removing meaningless row and column
    cols = corr.columns
    corr = corr.drop(cols[0], axis=0).drop(cols[-1], axis=1)
    mask_sig = mask_sig[1:, :-1]
    mask_tri = mask_tri[1:, :-1]

    # Set up the matplotlib figure
    f, ax = plt.subplots(figsize=(10, 10))

    # Generate a custom diverging colormap
    cmap = sns.diverging_palette(220, 10, n=9, l=30, as_cmap=True)

    ax = sns.heatmap(corr, ax=ax, mask=mask_tri, cmap=cmap, vmax=1E99,
                     center=0, square=True, linewidths=.5, annot=False,
                     annot_kws={'size': 30+fontsizescaler}, cbar=False)

    if plot_sign_stars:
        sig_bold_labels = sig_bold_annot(corr, mask_sig)
    else:
        sig_bold_labels = corr.round(2).astype(str).values
    # Draw the heatmap with the mask and correct aspect ratio
    ax = sns.heatmap(corr, ax=ax, mask=mask_tri, cmap=cmap, vmax=1,
                     center=0, square=True, linewidths=.5,
                     cbar_kws={"shrink": .8},
                     annot=sig_bold_labels,
                     annot_kws={'size': 30+fontsizescaler},
                     cbar=False, fmt='s')

    ax.tick_params(axis='both', labelsize=15+fontsizescaler,
                   bottom=True, top=False, left=True, right=False,
                   labelbottom=True, labeltop=False,
                   labelleft=True, labelright=False)

    ax.set_xticklabels(corr.columns, fontdict={'fontweight': 'bold',
                                               'fontsize': 20+fontsizescaler})
    ax.set_yticklabels(corr.index, fontdict={'fontweight': 'bold',
                                             'fontsize': 20+fontsizescaler},
                       rotation=0)
    #%%
    return
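
# Minimal usage sketch for plot_ts_matric (illustrative): correlation matrix
# of the test-set timeseries in a (split, time) MultiIndex df_data, binned to
# 15-day means and restricted to the target (RV_mask) dates. The rename
# mapping is made up.
#
#   plot_ts_matric(rg.df_data, win=15, period='RV_mask',
#                  rename={'t2mmax': 'target'})
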
def plot_timeseries(y, timesteps: list=None,
                    selyears: Union[list, int]=None, title=None,
                    legend: bool=True, nth_xyear: int=10, ax=None):
    #%%
    if hasattr(y.index, 'levels'):
        y_ac = y.loc[0]
    else:
        y_ac = y

    if type(y_ac.index) == pd.core.indexes.datetimes.DatetimeIndex:
        datetimes = y_ac.index

    if timesteps is None and selyears is None:
        ac, con_int = autocorr_sm(y_ac)
        where = np.where(con_int[:, 0] < 0)[0]
        # has to be below 0 for n times (not necessarily consecutive):
        n = 1
        n_of_times = np.array([idx+1 - where[0] for idx in where])
        if n_of_times.size != 0:
            cutoff = where[np.where(n_of_times == n)[0][0]]
        else:
            cutoff = 100
        timesteps = min(y_ac.index.size, 10*cutoff)
        datetimes = y_ac.iloc[:timesteps].index
    if selyears is not None and timesteps is None:
        if type(selyears) is int:
            selyears = [selyears]
        datetimes = get_oneyr(y.index, *selyears)
    if timesteps is not None and selyears is None:
        datetimes = datetimes[:timesteps]

    if ax is None:
        fig, ax = plt.subplots(constrained_layout=True)

    if hasattr(y.index, 'levels'):
        for fold in y.index.levels[0]:
            if legend:
                label = f'f {fold+1}'; color = None; alpha = .5
            else:
                label = None; color = 'red'; alpha = .1
            ax.plot(datetimes, y.loc[fold, datetimes], alpha=alpha,
                    label=label, color=color)
        if legend:
            ax.legend(prop={'size': 6})
    else:
        ax.plot(datetimes, y.loc[datetimes])

    if nth_xyear is None:
        nth_xtick = round(len(ax.xaxis.get_ticklabels()) / 5)
        for n, label in enumerate(ax.xaxis.get_ticklabels()):
            if n % nth_xtick != 0:
                label.set_visible(False)
    else:
        ax.xaxis.set_major_locator(mdates.YearLocator(1))  # tick every year
        ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y'))  # format %Y
        for n, label in enumerate(ax.xaxis.get_ticklabels()):
            if n % nth_xyear != 0:
                label.set_visible(False)

    ax.tick_params(axis='both', which='major', labelsize=8)
    if title is not None:
        ax.set_title(title, fontsize=10)
    #%%
    return ax
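
# Minimal usage sketch for plot_timeseries (illustrative): plot two selected
# years of a single-split timeseries; the column selection is made up.
#
#   ax = plot_timeseries(rg.df_data.loc[0].iloc[:, 0],
#                        selyears=[2000, 2012], nth_xyear=1,
#                        title='target timeseries')
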
# standardize.fit(df_PDOsplit[df_PDOsplit['TrainIsTrue'].values].values.reshape(-1,1))
# df_PDOsplit = pd.DataFrame(standardize.transform(df_PDOsplit['PDO'].values.reshape(-1,1)),
#                            index=df_PDOsplit.index, columns=['PDO'])
df_PDOsplit = df_PDOsplit[['PDO']].apply(standardize_on_train,
                                         args=[df_PDO.loc[0]['TrainIsTrue']],
                                         result_type='broadcast')

# Butterworth lowpass
dates = df_PDOsplit.index
freqraw = (dates[1] - dates[0]).days
ls = ['solid', 'dotted', 'dashdot', 'dashed']
fig, ax = plt.subplots(1, 1)
list_dfPDO = [df_PDOsplit]
lowpass_yrs = [.25, .5, 1.0, 2.0]
for i, yr in enumerate(lowpass_yrs):
    window = int(yr * functions_pp.get_oneyr(dates).size)  # yr * steps per year
    if i == 0:
        ax.plot_date(dates, df_PDOsplit.values,
                     label=f'Raw ({freqraw} day means)',
                     alpha=.3, linestyle='solid', marker=None)
    df_PDObw = pd.Series(filters.lowpass(df_PDOsplit,
                                         period=window).squeeze(),
                         index=dates, name=f'PDO{yr}bw')
    ax.plot_date(dates, df_PDObw,
                 label=f'Butterworth {yr}-year low-pass',
                 color='red', linestyle=ls[i], linewidth=1, marker=None)
    df_PDOrm = df_PDOsplit.rolling(window=window, closed='right',
                                   min_periods=window).mean()
    df_PDOrm = df_PDOrm.rename({'PDO': f'PDO{yr}rm'}, axis=1)
    ax.plot_date(dates, df_PDOrm,
                 label=f'Rolling mean {yr}-year low-pass (closed right)',
                 color='green', linestyle=ls[i], linewidth=1, marker=None)
    list_dfPDO.append(df_PDObw)
    list_dfPDO.append(df_PDOrm)
ax.legend()