def prediction_wrapper(df_data, lags, target_ts=None, keys: list=None,
                       match_lag: bool=False, n_boot: int=1):
    '''
    Fit a cross-validated ridge regression on the columns in `keys` and return
    the predictions, verification scores and fitted models per train-test split.
    Note: relies on the module-level `rg`, `blocksize` and `no_info_fc` objects.
    '''
    # alphas = np.append(np.logspace(.1, 1.5, num=25), [250])
    alphas = np.logspace(.1, 1.5, num=25)
    kwrgs_model = {'scoring':'neg_mean_absolute_error',
                   'alphas':alphas, # large a, strong regul.
                   'normalize':False}

    if target_ts is None:
        # default target: first column of df_data, masked to the forecast dates
        fc_mask = df_data.iloc[:,-1].loc[0] #.shift(lag, fill_value=False)
        target_ts = df_data.iloc[:,[0]].loc[0][fc_mask]
    target_ts = (target_ts - target_ts.mean()) / target_ts.std()

    out = rg.fit_df_data_ridge(df_data=df_data,
                               target=target_ts,
                               keys=keys,
                               tau_min=min(lags), tau_max=max(lags),
                               kwrgs_model=kwrgs_model,
                               match_lag_region_to_lag_fc=match_lag,
                               transformer=fc_utils.standardize_on_train)
    prediction, weights, models_lags = out

    # get skill scores (benchmark: climatological mean of the target)
    clim_mean_temp = float(target_ts.mean())
    RMSE_SS = fc_utils.ErrorSkillScore(constant_bench=clim_mean_temp).RMSE
    MAE_SS = fc_utils.ErrorSkillScore(constant_bench=clim_mean_temp).MAE
    score_func_list = [RMSE_SS, fc_utils.corrcoef, MAE_SS]

    df_train_m, df_test_s_m, df_test_m, df_boot = \
        fc_utils.get_scores(prediction,
                            df_data.iloc[:,-2:],
                            score_func_list,
                            n_boot=n_boot,
                            blocksize=blocksize,
                            rng_seed=1)

    # build a descriptive row label from the keys
    index = np.unique(core_pp.flatten([k.split('_') for k in keys]))
    AR = [l for l in index if '..' not in l]
    AR = [l for l in AR if 'PDO' not in l]
    index = [k for k in index if k not in AR]
    df_test_m.index = ['AR' + ''.join(AR) + '_' + '_'.join(index)]

    n_splits = df_data.index.levels[0].size
    # test for high alpha
    for col in df_test_m.columns.levels[0]:
        cvfitalpha = [models_lags[f'lag_{col}'][f'split_{s}'].alpha_
                      for s in range(n_splits)]
        print('lag {} mean alpha {:.0f}'.format(col, np.mean(cvfitalpha)))
        maxalpha_c = list(cvfitalpha).count(alphas[-1])
        if maxalpha_c > n_splits/3:
            print(f'\nlag {col} alpha {int(np.mean(cvfitalpha))}')
            print(f'{maxalpha_c} splits are max alpha\n')
            # maximum regularization selected: no information in timeseries
            # df_test_m.loc[:,pd.IndexSlice[col, 'corrcoef']][:] = 0
            # df_boot.loc[:,pd.IndexSlice[col, 'corrcoef']][:] = 0
            no_info_fc.append(col)

    df_test = functions_pp.get_df_test(prediction.merge(df_data.iloc[:,-2:],
                                                        left_index=True,
                                                        right_index=True)).iloc[:,:-2]
    return (prediction, df_test, df_test_m, df_boot, models_lags, weights,
            df_test_s_m, df_train_m)
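# Hedged usage sketch (illustrative only, mirroring the commented example in the
# Predictions block below): assuming rg.df_data carries the (split, time)
# multi-index with 'TrainIsTrue' and 'RV_mask' as its last two columns and
# `keys` lists the precursor columns, a single-lag forecast would look like:
# prediction, df_test, df_test_m, df_boot, models_lags, weights, \
#     df_test_s_m, df_train_m = prediction_wrapper(rg.df_data.copy(),
#                                                  lags=np.array([0]),
#                                                  keys=keys, match_lag=False,
#                                                  n_boot=0)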
def import_precur_ts(list_import_ts: List[tuple],
                     df_splits: pd.DataFrame,
                     start_end_date: Tuple[str, str],
                     start_end_year: Tuple[int, int],
                     start_end_TVdate: Tuple[str, str],
                     cols: list = None,
                     precur_aggr: int = 1):
    '''
    Import external precursor timeseries and align them with the train-test
    splits in df_splits. list_import_ts has format List[tuple]: [(name, path_data)].
    '''
    #%%
    # df_splits = rg.df_splits
    splits = df_splits.index.levels[0]
    orig_traintest = functions_pp.get_testyrs(df_splits)
    df_data_ext_s = np.zeros((splits.size), dtype=object)
    counter = 0
    for i, (name, path_data) in enumerate(list_import_ts):
        df_data_e_all = functions_pp.load_hdf5(path_data)['df_data']
        if type(df_data_e_all) is pd.Series:
            df_data_e_all = pd.DataFrame(df_data_e_all)
        df_data_e_all = df_data_e_all.iloc[:, :]  # not sure why needed
        if cols is None:
            cols = list(df_data_e_all.columns[(df_data_e_all.dtypes != bool).values])
        elif type(cols) is str:
            cols = [cols]

        # subset the requested dates / years
        if hasattr(df_data_e_all.index, 'levels'):
            dates_subset = core_pp.get_subdates(df_data_e_all.loc[0].index,
                                                start_end_date, start_end_year)
            df_data_e_all = df_data_e_all.loc[pd.IndexSlice[:, dates_subset], :]
        else:
            dates_subset = core_pp.get_subdates(df_data_e_all.index,
                                                start_end_date, start_end_year)
            df_data_e_all = df_data_e_all.loc[dates_subset]

        if 'TrainIsTrue' in df_data_e_all.columns:
            _c = [k for k in df_splits.columns if k in ['TrainIsTrue', 'RV_mask']]
            # check if traintest split is correct
            ext_traintest = functions_pp.get_testyrs(df_data_e_all[_c])
            _check_traintest = all(np.equal(core_pp.flatten(ext_traintest),
                                            core_pp.flatten(orig_traintest)))
            assert _check_traintest, ('Train test years of df_splits are not the '
                                      'same as imported timeseries')

        for s in range(splits.size):
            if 'TrainIsTrue' in df_data_e_all.columns:
                df_data_e = df_data_e_all.loc[s]
            else:
                df_data_e = df_data_e_all
            df_data_ext_s[s] = df_data_e[cols]
            tfreq_date_e = (df_data_e.index[1] - df_data_e.index[0]).days
            # re-aggregate if the imported timeseries has a different timestep
            if precur_aggr != tfreq_date_e:
                try:
                    df_data_ext_s[s] = functions_pp.time_mean_bins(
                        df_data_ext_s[s], precur_aggr,
                        start_end_date, start_end_year,
                        start_end_TVdate=start_end_TVdate)[0]
                except KeyError as e:
                    print('KeyError captured, likely the requested dates '
                          'given by start_end_date and start_end_year are not '
                          'found in the external pandas timeseries.\n{}'.format(str(e)))

        print(f'loaded in external timeseries: {cols}')
        if counter == 0:
            df_data_ext = pd.concat(list(df_data_ext_s), keys=range(splits.size))
        else:
            df_add = pd.concat(list(df_data_ext_s), keys=range(splits.size))
            df_data_ext = df_data_ext.merge(df_add,
                                            left_index=True, right_index=True)
        counter += 1
        cols = None
    #%%
    return df_data_ext
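# Hedged usage sketch: the hdf5 path and column name are hypothetical; the file
# is assumed to contain a 'df_data' dataframe as written by functions_pp, and
# the date / aggregation settings are illustrative values only.
# df_PDO_ext = import_precur_ts([('PDO', 'path/to/PDO_timeseries.h5')],
#                               df_splits=rg.df_splits,
#                               start_end_date=None,
#                               start_end_year=(1980, 2020),
#                               start_end_TVdate=('06-01', '08-31'),
#                               cols='PDO', precur_aggr=15)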
def ENSO_34(filepath, df_splits=None, get_ENSO_states: bool = True):
    #%%
    # file_path = '/Users/semvijverberg/surfdrive/Data_era5/input_raw/sst_1979-2018_1_12_daily_2.5deg.nc'
    '''
    See http://www.cgd.ucar.edu/staff/cdeser/docs/deser.sstvariability.annrevmarsci10.pdf

    selbox has format of (lon_min, lon_max, lat_min, lat_max)
    '''
    # if df_splits is None:
    #     seldates = None
    # else:
    #     seldates = df_splits.loc[0].index

    # {'la_min':-5, # select domain in degrees east
    #  'la_max':5,
    #  'lo_min':-170,
    #  'lo_max':-120},
    kwrgs_pp = {'selbox': (190, 240, -5, 5),
                'format_lon': 'only_east',
                'seldates': None}

    ds = core_pp.import_ds_lazy(filepath, **kwrgs_pp)
    dates = pd.to_datetime(ds.time.values)
    # area-weighted mean SST over the Nino 3.4 box
    data = functions_pp.area_weighted(ds).mean(dim=('latitude', 'longitude'))
    df_ENSO = pd.DataFrame(data=data.values, index=dates, columns=['ENSO34'])
    if df_splits is not None:
        splits = df_splits.index.levels[0]
        df_ENSO = pd.concat([df_ENSO] * splits.size, axis=0, keys=splits)

    if get_ENSO_states:
        '''
        From Anderson 2017 - Life cycles of agriculturally relevant ENSO
        teleconnections in North and South America.
        http://doi.wiley.com/10.1002/joc.4916
        mean boreal wintertime (October, November, December) SST anomaly
        amplitude in the Nino 3.4 region exceeded 1 of 2 standard deviation.
        '''
        if hasattr(df_ENSO.index, 'levels'):
            df_ENSO_s = df_ENSO.loc[0]
        else:
            df_ENSO_s = df_ENSO
        dates = df_ENSO_s.index
        df_3monthmean = df_ENSO_s.rolling(3, center=True, min_periods=1).mean()
        std_ENSO = df_3monthmean.std()
        OND, groups = core_pp.get_subdates(dates,
                                           start_end_date=('10-01', '12-31'),
                                           returngroups=True)
        OND_ENSO = df_3monthmean.loc[OND].groupby(groups).mean()
        nino_yrs = OND_ENSO[OND_ENSO > df_3monthmean.mean() + std_ENSO][:].dropna().index  #+ 1
        nina_yrs = OND_ENSO[OND_ENSO < df_3monthmean.mean() - std_ENSO][:].dropna().index  #+ 1
        neutral = [y for y in OND_ENSO.index
                   if y not in core_pp.flatten([nina_yrs, nino_yrs])]
        # label each year as La Nina (-1), neutral (0) or El Nino (1)
        states = {}
        for i, d in enumerate(dates):
            if d.year in nina_yrs:
                states[d.year] = -1
            if d.year in neutral:
                states[d.year] = 0
            if d.year in nino_yrs:
                states[d.year] = 1

        cycle_list = []
        for s, v in [('EN', 1), ('LN', -1)]:
            ENSO_cycle = {d.year: 0 for d in dates}
            for i, year in enumerate(np.unique(dates.year)):
                # d = dates[1]
                # if states[year] == v:
                #     s = 'EN'
                # elif states[year] == -1:
                #     s = 'LN'
                if states[year] == v:
                    ENSO_cycle[year] = f'{s}0'
                    if year - 1 in dates.year and states[year - 1] != v:
                        ENSO_cycle[year - 1] = f'{s}-1'
                    if year + 1 in dates.year and states[year + 1] != v:
                        ENSO_cycle[year + 1] = f'{s}+1'
            cycle_list.append(ENSO_cycle)

        time_index = pd.to_datetime([f'{y}-01-01' for y in states.keys()])
        df_state = pd.concat([pd.Series(states),
                              pd.Series(cycle_list[0]),
                              pd.Series(cycle_list[1])],
                             axis=1, keys=['state', 'EN_cycle', 'LN_cycle'])
        df_state.index = time_index
        if hasattr(df_ENSO.index, 'levels'):  # copy to other traintest splits
            df_state = pd.concat([df_state] * splits.size, keys=splits)

        # composite mean SST maps for Nina / Neutral / Nino years
        composites = np.zeros(3, dtype=object)
        for i, yrs in enumerate([nina_yrs, neutral, nino_yrs]):
            composite = [d for d in dates if d.year in yrs]
            composites[i] = ds.sel(time=composite).mean(dim='time')
        composites = xr.concat(composites, dim='state')
        composites['state'] = ['Nina', 'Neutral', 'Nino']
        plot_maps.plot_corr_maps(composites, row_dim='state', hspace=0.5)

        out = df_ENSO, [np.array(nina_yrs), np.array(neutral),
                        np.array(nino_yrs)], df_state
    else:
        out = df_ENSO
    #%%
    return out
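# Hedged usage sketch: the SST file path is hypothetical; with
# get_ENSO_states=True the function also returns the (nina, neutral, nino)
# years and the yearly ENSO-state dataframe.
# df_ENSO, ENSO_years, df_ENSO_state = ENSO_34('path/to/sst_daily_2.5deg.nc',
#                                              df_splits=rg.df_splits,
#                                              get_ENSO_states=True)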
# Predictions
# =============================================================================
# out_regr2PDO = prediction_wrapper(df_data_r2PDO.copy(), keys=keys,
#                                   match_lag=match_lag, n_boot=n_boot)

dates = core_pp.get_subdates(rg.dates_TV, start_end_year=(1980, 2020))
target_ts_temp = rg.TV.RV_ts.loc[dates]
clim_mean_temp = float(target_ts_temp.mean())
RMSE_SS = fc_utils.ErrorSkillScore(constant_bench=clim_mean_temp).RMSE
MAE_SS = fc_utils.ErrorSkillScore(constant_bench=clim_mean_temp).MAE
score_func_list = [RMSE_SS, fc_utils.corrcoef, MAE_SS,
                   fc_utils.metrics.mean_absolute_error]

# predictions of temperature using SST regions
df_prec = merge_lagged_wrapper(rg.df_data.copy(), [1, 2, 3], keys)
df_prec = df_prec.loc[pd.IndexSlice[:, dates], :]
df_prec = df_prec.merge(rg.df_splits.loc[pd.IndexSlice[:, dates], :],
                        left_index=True, right_index=True)

keys1 = core_pp.flatten([[k + f'_{l}' for l in [1] for k in keys]])
SST1 = prediction_wrapper(df_prec, lags=np.array([0]),
                          target_ts=target_ts_temp,
                          keys=keys1, match_lag=False, n_boot=n_boot)
keys2 = core_pp.flatten([[k + f'_{l}' for l in [1, 2] for k in keys]])
SST2 = prediction_wrapper(df_prec, lags=np.array([0]),
                          target_ts=target_ts_temp,
                          keys=keys2, match_lag=False, n_boot=n_boot)
keys3 = core_pp.flatten([[k + f'_{l}' for l in [1, 2, 3] for k in keys]])
SST3 = prediction_wrapper(df_prec, lags=np.array([0]),
                          target_ts=target_ts_temp,
                          keys=keys3,
                          match_lag=False, n_boot=n_boot)  # same settings as SST1 and SST2
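# Hedged follow-up sketch: assuming each SSTn tuple follows the return order of
# prediction_wrapper (prediction, df_test, df_test_m, ...), the test-set skill
# of the three lag sets could be compared with, e.g.:
# df_skill_SST = pd.concat([SST1[2], SST2[2], SST3[2]])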
            if y == target_dataset:
                links.append([(j, -l) for l in range(t_min, t_max + 1)])
            elif y == z and 'sst' in y:
                # no autocorr for SST, do not condition on its own past
                pass  # links.append([(j, -l) for l in range(t_min, 1)])
            elif 'sst' in y and 'smi' not in z:
                # do not remove information of SMI from SST precursor, not physical
                links.append([(j, -l) for l in range(t_min, t_max + 1)])
            elif 'smi' in y:
                links.append([(j, -l) for l in range(t_min, t_max + 1)])
            else:
                links.append([(j, -l) for l in range(t_min, t_max + 1)])
        selected_links[i] = core_pp.flatten(links)
    selected_links_splits[s] = selected_links

tigr_function_call = 'run_pcmci'
kwrgs_tigr = {'tau_min': t_min, 'tau_max': t_max,
              'pc_alpha': .05,
              'max_conds_py': None,
              'max_conds_px': None,
              'max_combinations': 3,
              'selected_links': selected_links_splits}

rg.PCMCI_df_data(keys=keys,
                 tigr_function_call=tigr_function_call,