def handle_fit_model_dates(dates_RV, dates_all, RV_ts, fit_model_dates):
    """Build a boolean fit-mask over ``dates_all`` and the matching fit dates.

    Parameters
    ----------
    dates_RV : pd.DatetimeIndex
        Dates of the response variable (target) period.
    dates_all : pd.DatetimeIndex
        Full datetime axis the mask is defined on.
    RV_ts : timeseries of the response variable (aligned with ``dates_RV``).
    fit_model_dates : tuple or None
        ``(startperiod, endperiod)`` date strings selecting a wider fitting
        window, or None to fit on the RV dates themselves.

    Returns
    -------
    fit_model_mask : pd.DataFrame
        Single boolean column 'fit_model_mask' indexed by ``dates_all``.
    fit_dates : pd.DatetimeIndex of the dates used for fitting.
    RV_ts_fit : timeseries restricted to ``fit_dates``.
    """
    if fit_model_dates is None:
        # RV_ts and RV_ts_fit are equal if fit_model_dates is None
        fit_model_mask = pd.DataFrame(dates_all.isin(dates_RV),
                                      columns=['fit_model_mask'],
                                      index=dates_all)
        RV_ts_fit = RV_ts
        fit_dates = dates_RV
    else:
        startperiod, endperiod = fit_model_dates
        # Span the fitting window over all years present in dates_all.
        start_end_date = (startperiod, endperiod)
        start_end_year = (dates_all[0].year, dates_all[-1].year)
        fit_dates = core_pp.get_subdates(dates_all,
                                         start_end_date=start_end_date,
                                         start_end_year=start_end_year)
        fit_model_mask = pd.DataFrame(dates_all.isin(fit_dates),
                                      columns=['fit_model_mask'],
                                      index=dates_all)
        # NOTE(review): `fullts` is not defined in this function's scope —
        # presumably a module-level variable holding the full timeseries;
        # verify, otherwise this branch raises NameError.
        RV_ts_fit = fullts[fit_model_mask.values]
    return fit_model_mask, fit_dates, RV_ts_fit
def start_end_date_mean(df_data, start_end_date):
    """Average ``df_data`` over a fixed within-year date window, per year.

    For every year, rows whose dates fall inside ``start_end_date`` are
    averaged into one row; the resulting annual index is the mean date of
    each year's window. Works on a plain time-indexed frame as well as a
    (split, time) MultiIndex frame, in which case the aggregation is applied
    to every train/test split separately.
    """
    has_splits = hasattr(df_data.index, 'levels')
    pd_dates = df_data.loc[0].index if has_splits else df_data.index
    # Boolean mask over the full date axis selecting the aggregation window.
    subset_dates = core_pp.get_subdates(pd_dates, start_end_date)
    aggr_mask = pd.Series(np.repeat(False, pd_dates.size), index=pd_dates)
    aggr_mask.loc[subset_dates] = True
    if has_splits:
        years = df_data.loc[0][aggr_mask].index.year
    else:
        years = df_data[aggr_mask].index.year
    # One timestamp per year: the mean date of that year's window.
    annual_index = pd.to_datetime(
        [functions_pp.get_oneyr(subset_dates, yr).mean()
         for yr in np.unique(years)])
    if has_splits:
        splits = df_data.index.levels[0]
        resampled_per_split = []
        for s in splits:
            df_s = df_data.loc[s][aggr_mask].groupby(years).mean()
            df_s.index = annual_index
            resampled_per_split.append(df_s)
        df_data_resample = pd.concat(resampled_per_split,
                                     keys=range(splits.size))
    else:
        df_data_resample = df_data[aggr_mask].groupby(years).mean()
        df_data_resample.index = annual_index
    return df_data_resample
def _redefine_RV_mask(self, start_end_TVdate):
    """Overwrite the 'RV_mask' column of ``self.df_data`` for a new target period.

    Stores the previous target period in ``self.start_end_TVdate_orig``, then
    rebuilds the mask so that only dates inside ``start_end_TVdate`` are True,
    replicated across every train/test split.
    """
    self.df_data = self.df_data.copy()
    self.start_end_TVdate_orig = fcev._get_start_end_TVdate(self)
    self.start_end_TVdate = start_end_TVdate
    target_dates = core_pp.get_subdates(self.dates_df, start_end_TVdate,
                                        start_end_year=None)
    # Rebuild the mask for one split, then replicate it over all splits.
    single_split_mask = self.df_data['RV_mask'].loc[0].copy()
    single_split_mask.loc[:] = False
    single_split_mask.loc[target_dates] = True
    self.df_data['RV_mask'] = pd.concat([single_split_mask] * self.splits.size,
                                        keys=self.splits)
def select_period(self, df, targ_var_mask, start_date, end_date,
                  start_end_year, leap_year, rename=False):
    """Select the target-variable dates of ``df`` inside the given period.

    Parameters
    ----------
    df : pd.DataFrame
        (split, time) MultiIndex frame containing a boolean 'RV_mask' column.
    targ_var_mask : unused — kept for interface compatibility; verify callers.
    start_date, end_date, start_end_year, leap_year :
        Forwarded to ``get_subdates``.
    rename : False or mapping
        When truthy, passed to ``.rename(..., axis=1)`` on the result.

    Returns
    -------
    The subset of target-variable dates selected by ``get_subdates``
    (optionally renamed).
    """
    dates_target_var_origin = df.loc[0].index[df.loc[0]['RV_mask'] == True]
    # NOTE(review): the original computed and discarded a resampled frame;
    # the call is kept in case self.resample has side effects — verify.
    self.resample(df=df)
    df_period = get_subdates(dates_target_var_origin, start_date, end_date,
                             start_end_year, leap_year)
    if rename:
        df_period = df_period.rename(rename, axis=1)
    return df_period
# NOTE(review): fragment — this chunk opens mid-dict (plot kwargs) and ends
# mid-call; reconstruct alongside the neighbouring chunks before running.
               'hspace': .2, 'cbar_vert': .05,
               'clevels': np.arange(-.5, .51, .1)}
plot_maps.plot_corr_maps(xr_snap, row_dim='lag', col_dim='split', **kwrgs_plot)
plt.savefig(os.path.join(rg.path_outsub1, f'snapshots_{var}_rm{rm}.pdf'))
#%% Correlation PNA-like RW with Wavenumber 6 phase 2
# only for eastern
import core_pp, find_precursors
values = []
if west_or_east == 'eastern':
    lags_list = range(-10, 10)
    for lag in lags_list:
        selbox = (0, 360, 25, 60)
        # selbox = (140,300,20,73)
        tfreq = 1
        # lag = 0
        dates_RV = core_pp.get_subdates(pd.to_datetime(rg.fulltso.time.values),
                                        start_end_date=rg.start_end_TVdate)
        RV_ts = rg.fulltso.sel(time=dates_RV)
        ds_v300 = core_pp.import_ds_lazy(rg.list_precur_pp[1][1])
        dslocal = core_pp.get_selbox(ds_v300, selbox=selbox)
        datesRW = core_pp.get_subdates(pd.to_datetime(dslocal.time.values),
                                       start_end_date=rg.start_end_TVdate)
        # Shift the RW dates by `lag` days relative to the target dates.
        datesRW = datesRW + pd.Timedelta(f'{lag}d')
        dslocal = dslocal.sel(time=datesRW)
        wv6local = core_pp.get_selbox(xarray.sel(lag=5), selbox=selbox)
        patternlocal = wv6local.mean(dim='lag')
        ts = find_precursors.calc_spatcov(dslocal, patternlocal)
        # NOTE(review): call truncated here in this chunk.
        ts_15, d = functions_pp.time_mean_bins(ts, tfreq,
                                               start_end_date=start_end_TVdate,
rg.pp_precursors() # In[ ]: rg.list_precur_pp var_filename = rg.list_precur_pp[0][1] region = 'USCAnew' #%% import pandas as pd ds = core_pp.import_ds_lazy(var_filename) ds.sel(time=core_pp.get_subdates(pd.to_datetime(ds.time.values), start_end_date=('06-01', '08-31'))).mean( dim='time').plot() #%% if region == 'USCAnew': selbox = (230, 300, 25, 70) TVpath = os.path.join(path_data, 'tfreq15_nc7_dendo_57db0USCA.nc') # np_array_xy = np.array([[-97, 39], [-89, 39], [-82, 40], # [-116,36], [-122,41], [-117,46]]) np_array_xy = np.array([[-96, 36], [-92, 41], [-84, 35], [-84, 41], [-114, 36], [-120, 36], [-122, 44], [-118, 48]]) t, c = 15, 7 # elif region == 'USCA': # selbox = (230, 300, 25, 70) # TVpath = os.path.join(path_outmain, 'tf10_nc5_dendo_5dbee_USCA.nc')
def import_precur_ts(list_import_ts: List[tuple],
                     df_splits: pd.DataFrame,
                     start_end_date: Tuple[str, str],
                     start_end_year: Tuple[int, int],
                     start_end_TVdate: Tuple[str, str],
                     cols: list = None,
                     precur_aggr: int = 1):
    '''
    Import external precursor timeseries and align them with df_splits.

    list_import_ts has format List[tuples], [(name, path_data)].
    Each file is loaded, subset to the requested dates/years, optionally
    re-aggregated to ``precur_aggr`` days, replicated per train/test split
    and merged column-wise into one (split, time) MultiIndex DataFrame.

    cols : None selects all non-boolean columns of each file; a str or list
    restricts the selection (applies to the first file only — `cols` is
    reset to None after every file, as in the original logic).
    '''
    splits = df_splits.index.levels[0]
    orig_traintest = functions_pp.get_testyrs(df_splits)
    df_data_ext_s = np.zeros((splits.size), dtype=object)
    for i, (name, path_data) in enumerate(list_import_ts):
        df_data_e_all = functions_pp.load_hdf5(path_data)['df_data']
        if type(df_data_e_all) is pd.Series:
            df_data_e_all = pd.DataFrame(df_data_e_all)
        if cols is None:
            # Default: every non-boolean column (skips mask columns).
            cols = list(
                df_data_e_all.columns[(df_data_e_all.dtypes != bool).values])
        elif type(cols) is str:
            cols = [cols]
        # Subset the requested dates, handling both a plain time index and a
        # (split, time) MultiIndex.
        if hasattr(df_data_e_all.index, 'levels'):
            dates_subset = core_pp.get_subdates(df_data_e_all.loc[0].index,
                                                start_end_date, start_end_year)
            df_data_e_all = df_data_e_all.loc[pd.IndexSlice[:, dates_subset], :]
        else:
            dates_subset = core_pp.get_subdates(df_data_e_all.index,
                                                start_end_date, start_end_year)
            df_data_e_all = df_data_e_all.loc[dates_subset]

        if 'TrainIsTrue' in df_data_e_all.columns:
            _c = [k for k in df_splits.columns
                  if k in ['TrainIsTrue', 'RV_mask']]
            # check if traintest split is correct
            ext_traintest = functions_pp.get_testyrs(df_data_e_all[_c])
            _check_traintest = all(
                np.equal(core_pp.flatten(ext_traintest),
                         core_pp.flatten(orig_traintest)))
            assert _check_traintest, (
                'Train test years of df_splits are not the '
                'same as imported timeseries')

        for s in range(splits.size):
            if 'TrainIsTrue' in df_data_e_all.columns:
                df_data_e = df_data_e_all.loc[s]
            else:
                df_data_e = df_data_e_all
            df_data_ext_s[s] = df_data_e[cols]
            # Native time step (days) of the imported series.
            tfreq_date_e = (df_data_e.index[1] - df_data_e.index[0]).days
            if precur_aggr != tfreq_date_e:
                try:
                    df_data_ext_s[s] = functions_pp.time_mean_bins(
                        df_data_ext_s[s], precur_aggr, start_end_date,
                        start_end_year,
                        start_end_TVdate=start_end_TVdate)[0]
                except KeyError as e:
                    print('KeyError captured, likely the requested dates '
                          'given by start_end_date and start_end_year are '
                          'not found in external pandas timeseries.\n'
                          '{}'.format(str(e)))
        print(f'loaded in external timeseries: {cols}')
        if i == 0:
            df_data_ext = pd.concat(list(df_data_ext_s),
                                    keys=range(splits.size))
        else:
            df_add = pd.concat(list(df_data_ext_s), keys=range(splits.size))
            df_data_ext = df_data_ext.merge(df_add,
                                            left_index=True, right_index=True)
        # Reset so the next file's columns are auto-detected.
        cols = None
    return df_data_ext
# Fit a ridge model on df_prec at lag 0 and evaluate against persistence.
out = rgPDO.fit_df_data_ridge(target=target,
                              df_data=df_prec,
                              tau_min=0, tau_max=0,
                              kwrgs_model={'alphas': np.array([.01, .1, 1, 5, 10])})
predict = out[0].rename({0: 'AR1'}, axis=1)
lowPDO, df_lagmask = get_lagged_ts(rgPDO.df_data.copy(), 1, ['PDO0.5rm'])
# perPDO = rgPDO.df_data[keys_ext][persmask['x_fit']]
# persPDO[persmask['x_fit']] = persPDO[persmask['x_fit']]
# perPDO.index = rgPDO.df_data[rgPDO.df_data['RV_mask']].index
# Persistence benchmark: the 1-step lagged low-frequency PDO series.
perPDO = lowPDO.rename({'PDO1.0rm_2': 'persistence'}, axis=1)
perPDO = perPDO.loc[df_prec.index]
predict = predict.merge(perPDO, left_index=True, right_index=True)
# Restrict scoring to 1980-2020.
dates = core_pp.get_subdates(rgPDO.dates_TV, start_end_year=(1980, 2020))
predict = predict.loc[pd.IndexSlice[:, dates], :]
test = fc_utils.get_scores(predict,
                           score_func_list=[fc_utils.corrcoef,
                                            fc_utils.metrics.mean_squared_error])[2]
df_test = functions_pp.get_df_test(predict,
                                   df_splits=rgPDO.df_data.loc[predict.index][['TrainIsTrue', 'RV_mask']])
df_z = df_test[['AR1']]
# NOTE(review): the assignment above is immediately overwritten below, so the
# AR1 selection is dead code — confirm which df_z is intended.
df_z = lowPDO
# df_z = functions_pp.get_df_test(df_prec,
#                                 df_splits=rgPDO.df_data.loc[predict.index][['TrainIsTrue', 'RV_mask']])
# years = functions_pp.get_oneyr(df_z, *list(range(1980, 2020+1)))
# df_z = df_z.loc[years]
kwrgs_func = {'filepath': df_z, 'lag_z': 0}
# NOTE(review): fragment — the opening of this plotting call (and the loop over
# `yr` it apparently sits in) is outside this view.
                 legend=True, label=yr, alpha=alpha)
# ax.set_ylim(-.1,.1)
ax.hlines(y=0.5, xmin=0.05, xmax=.95, transform=ax.transAxes)
ax.set_xticks(range(df_yr.index.values.size))
xticklabels = ['{} {}'.format(*list(item)) for item in df_yr.index.tolist()]
ax.set_xticklabels(xticklabels, rotation=-45)
allyrs.append(list(df_yr['SM'].values))
#%%
# Annual-mean soil moisture over the 08-01..08-31 window.
summerdays = core_pp.get_subdates(df_T.mean(0, level=1).index,
                                  start_end_date=('08-01', '08-31'),
                                  start_end_year=(1980, 2018))
df_sum = df_T.mean(0, level=1).loc[summerdays]
# NOTE(review): variable name has a typo ("summmer") — kept as-is.
summmerSM = df_sum['SM'].groupby(df_sum.index.year).mean()
winterdays = core_pp.get_subdates(df_SST[['SST pattern']].mean(0, level=1).index,
                                  start_end_date=('01-01', '08-31'),
                                  start_end_year=(1979, 2017))
winterdays = functions_pp.func_dates_min_lag(winterdays, lag=92)[1]
df_win = df_SST[['SST pattern']].mean(0, level=1).loc[winterdays]
winterSST = df_win.groupby(df_win.index.year).mean().iloc[:-1]
falldays = core_pp.get_subdates(df_SST[['SST pattern']].mean(0, level=1).index,
                                start_end_date=('09-01', '12-31'),
                                start_end_year=(1980, 2018))
df_fall = df_SST[['SST pattern']].mean(0, level=1).loc[falldays]
# NOTE(review): this groups df_win, not df_fall — looks like a copy-paste
# slip; verify whether fallSST should be built from df_fall instead.
fallSST = df_win.groupby(df_win.index.year).mean().iloc[1:]
# NOTE(review): fragment — these two lines are the tail of a print(...) call
# whose opening is outside this view.
      'BSS {:.2f}\n'.format(df_train_m.mean(0).loc[0]['BSS']),
      'AUC {:.2f}'.format(df_train_m.mean(0).loc[0]['roc_auc_score']))
#%% Correlating both the gradient and absolute timeseries of ENSO with target
df_ENSO_s = df_ENSO.loc[0]
grad_w = 3   # gradient window (units of the series' time step)
gap = 6      # offset between the two windows being differenced
for month in range(1, 13):
    # Gradient proxy: shifted rolling mean minus current rolling mean.
    grad_ENSO = df_ENSO_s.shift(int(1 + grad_w/2 + gap)).rolling(
                    int(grad_w/2), center=True, min_periods=1).mean() - \
                df_ENSO_s.rolling(int(grad_w/2), center=True,
                                  min_periods=1).mean()
    # One sample per year: days 01..28 of the current month, 1951-2019.
    X_dates = core_pp.get_subdates(df_ENSO_s.index,
                                   start_end_date=(f'{month}-01', f'{month}-28'),
                                   start_end_year=(1951, 2019))
    target_data = rg.TV_ts[1:].values
    # df_ENSO_s.loc[X_dates].plot()
    corr_grad = np.corrcoef(grad_ENSO.loc[X_dates].values.squeeze(),
                            target_data)[0][1]
    corr_abs = np.corrcoef(df_ENSO_s.loc[X_dates].values.squeeze(),
                           target_data)[0][1]
    print('{:02d}'.format(month),
          'Gradient ENSO {:.2f}\n'.format(corr_grad),
          ' Absolute values {:.2f}'.format(corr_abs))
def ENSO_34(filepath, df_splits=None, get_ENSO_states: bool = True):
    #%%
    # file_path = '/Users/semvijverberg/surfdrive/Data_era5/input_raw/sst_1979-2018_1_12_daily_2.5deg.nc'
    '''
    Compute the NiƱo 3.4 SST index and (optionally) classify ENSO states.

    See http://www.cgd.ucar.edu/staff/cdeser/docs/deser.sstvariability.annrevmarsci10.pdf
    selbox has format of (lon_min, lon_max, lat_min, lat_max)

    Parameters: filepath points to an SST netCDF file; df_splits (optional)
    is a (split, time) MultiIndex frame whose splits the index is copied to;
    get_ENSO_states toggles the state/cycle classification below.
    Returns df_ENSO alone, or (df_ENSO, [nina, neutral, nino year arrays],
    df_state) when get_ENSO_states is True.
    '''
    # if df_splits is None:
    #     seldates = None
    # else:
    #     seldates = df_splits.loc[0].index
    # {'la_min':-5, # select domain in degrees east
    #  'la_max':5,
    #  'lo_min':-170,
    #  'lo_max':-120},
    kwrgs_pp = {'selbox': (190, 240, -5, 5),  # Nino-3.4 box, degrees east
                'format_lon': 'only_east',
                'seldates': None}
    ds = core_pp.import_ds_lazy(filepath, **kwrgs_pp)
    dates = pd.to_datetime(ds.time.values)
    # Area-weighted spatial mean over the box -> ENSO 3.4 index timeseries.
    data = functions_pp.area_weighted(ds).mean(dim=('latitude', 'longitude'))
    df_ENSO = pd.DataFrame(data=data.values, index=dates, columns=['ENSO34'])
    if df_splits is not None:
        splits = df_splits.index.levels[0]
        df_ENSO = pd.concat([df_ENSO] * splits.size, axis=0, keys=splits)
    if get_ENSO_states:
        '''
        From Anderson 2017 - Life cycles of agriculturally relevant ENSO
        teleconnections in North and South America.
        http://doi.wiley.com/10.1002/joc.4916
        mean boreal wintertime (October, November, December) SST anomaly
        amplitude in the NiƱo 3.4 region exceeded 1 of 2 standard deviation.
        '''
        if hasattr(df_ENSO.index, 'levels'):
            df_ENSO_s = df_ENSO.loc[0]
        else:
            df_ENSO_s = df_ENSO
        dates = df_ENSO_s.index
        df_3monthmean = df_ENSO_s.rolling(3, center=True, min_periods=1).mean()
        std_ENSO = df_3monthmean.std()
        # OND (Oct-Dec) mean per year decides the state of that year.
        OND, groups = core_pp.get_subdates(dates,
                                           start_end_date=('10-01', '12-31'),
                                           returngroups=True)
        OND_ENSO = df_3monthmean.loc[OND].groupby(groups).mean()
        nino_yrs = OND_ENSO[OND_ENSO > df_3monthmean.mean() + std_ENSO][:].dropna().index #+ 1
        nina_yrs = OND_ENSO[OND_ENSO < df_3monthmean.mean() - std_ENSO][:].dropna().index #+ 1
        neutral = [y for y in OND_ENSO.index
                   if y not in core_pp.flatten([nina_yrs, nino_yrs])]
        # Map each year to -1 (Nina) / 0 (neutral) / 1 (Nino).
        # NOTE(review): loop runs once per date but keys by year, so each
        # year is written redundantly many times.
        states = {}
        for i, d in enumerate(dates):
            if d.year in nina_yrs:
                states[d.year] = -1
            if d.year in neutral:
                states[d.year] = 0
            if d.year in nino_yrs:
                states[d.year] = 1
        # Label event years ('EN0'/'LN0') and their neighbours (-1/+1).
        cycle_list = []
        for s, v in [('EN', 1), ('LN', -1)]:
            ENSO_cycle = {d.year: 0 for d in dates}
            for i, year in enumerate(np.unique(dates.year)):
                # d = dates[1]
                # if states[year] == v:
                #     s = 'EN'
                # elif states[year] == -1:
                #     s = 'LN'
                if states[year] == v:
                    ENSO_cycle[year] = f'{s}0'
                    if year - 1 in dates.year and states[year - 1] != v:
                        ENSO_cycle[year - 1] = f'{s}-1'
                    if year + 1 in dates.year and states[year + 1] != v:
                        ENSO_cycle[year + 1] = f'{s}+1'
            cycle_list.append(ENSO_cycle)
        time_index = pd.to_datetime([f'{y}-01-01' for y in states.keys()])
        df_state = pd.concat([pd.Series(states),
                              pd.Series(cycle_list[0]),
                              pd.Series(cycle_list[1])],
                             axis=1, keys=['state', 'EN_cycle', 'LN_cycle'])
        df_state.index = time_index
        if hasattr(df_ENSO.index, 'levels'):  # copy to other traintest splits
            df_state = pd.concat([df_state] * splits.size, keys=splits)
        # Composite SST maps for the three states, then plot them.
        composites = np.zeros(3, dtype=object)
        for i, yrs in enumerate([nina_yrs, neutral, nino_yrs]):
            composite = [d for d in dates if d.year in yrs]
            composites[i] = ds.sel(time=composite).mean(dim='time')
        composites = xr.concat(composites, dim='state')
        composites['state'] = ['Nina', 'Neutral', 'Nino']
        plot_maps.plot_corr_maps(composites,
                                 row_dim='state', hspace=0.5)
        out = df_ENSO, [np.array(nina_yrs), np.array(neutral),
                        np.array(nino_yrs)], df_state
    else:
        out = df_ENSO
    #%%
    return out
# NOTE(review): fragment — the list `start_end_TVdates` opens outside this
# view (its first, presumably annual, entry is not visible here).
                      ('12-01', '02-28'), ('02-01', '05-30'), ('06-01', '08-31')]
seasons = ['{annual}', '{DJF}', '{MAM}', '{JJA}']
f, ax = plt.subplots(len(seasons), figsize=(10, 18), sharex=True)
only_summer = True
for p, startenddate in enumerate(start_end_TVdates):
    if only_summer:
        # Always aggregate df_RWE itself over the current season.
        seldates = core_pp.get_subdates(df_RWE.index,
                                        start_end_date=startenddate)
        df_RWE_am = df_RWE.loc[seldates].groupby(seldates.year).mean()
    else:
        idx = max(0, p - 1)
        _d = dfs[idx].mean(axis=0, level=1)
        seldates = core_pp.get_subdates(_d.index, start_end_date=startenddate)
        df_RWE_am = _d.loc[seldates].groupby(seldates.year).mean()
    seas = seasons[p]
    RWcolname = df_RWE_am.columns[0]
    df_RWE_am = df_RWE_am.rename({RWcolname: f'$RW_{seas}^E$'}, axis=1)
    # Merge with the annual-mean PDO and standardize both columns.
    df_merge = df_PDO_am.merge(df_RWE_am, left_index=True, right_index=True)
    df_merge = (df_merge - df_merge.mean(0)) / df_merge.std(0)