Example 1
def prediction_wrapper(df_data, lags, target_ts=None, keys: list=None, match_lag: bool=False,
                       n_boot: int=1):
    '''Fit a ridge regression per lag and return predictions plus skill scores.
    Relies on the module-level globals rg, blocksize and no_info_fc.'''
    # alphas = np.append(np.logspace(.1, 1.5, num=25), [250])
    alphas = np.logspace(.1, 1.5, num=25)
    kwrgs_model = {'scoring': 'neg_mean_absolute_error',
                   'alphas': alphas,  # larger alpha -> stronger regularization
                   'normalize': False}

    if target_ts is None:
        # default target: first column of split 0, masked to the forecast dates
        fc_mask = df_data.iloc[:, -1].loc[0]  # .shift(lag, fill_value=False)
        target_ts = df_data.iloc[:, [0]].loc[0][fc_mask]
    # standardize the target
    target_ts = (target_ts - target_ts.mean()) / target_ts.std()
    out = rg.fit_df_data_ridge(df_data=df_data,
                               target=target_ts,
                               keys=keys,
                               tau_min=min(lags), tau_max=max(lags),
                               kwrgs_model=kwrgs_model,
                               match_lag_region_to_lag_fc=match_lag,
                               transformer=fc_utils.standardize_on_train)

    prediction, weights, models_lags = out
    # skill scores relative to a constant climatological-mean benchmark
    clim_mean_temp = float(target_ts.mean())
    RMSE_SS = fc_utils.ErrorSkillScore(constant_bench=clim_mean_temp).RMSE
    MAE_SS = fc_utils.ErrorSkillScore(constant_bench=clim_mean_temp).MAE
    score_func_list = [RMSE_SS, fc_utils.corrcoef, MAE_SS]

    df_train_m, df_test_s_m, df_test_m, df_boot = fc_utils.get_scores(
        prediction,
        df_data.iloc[:, -2:],
        score_func_list,
        n_boot=n_boot,
        blocksize=blocksize,  # module-level global
        rng_seed=1)
    # build a descriptive row label: autoregressive (AR) terms vs. the
    # remaining (e.g. SST-region, PDO) predictors
    index = np.unique(core_pp.flatten([k.split('_') for k in keys]))
    AR = [l for l in index if '..' not in l]
    AR = [l for l in AR if 'PDO' not in l]
    index = [k for k in index if k not in AR]
    df_test_m.index = ['AR' + ''.join(AR) + '_' + '_'.join(index)]
    n_splits = df_data.index.levels[0].size
    # check whether cross-validation selected (near-)maximum regularization
    for col in df_test_m.columns.levels[0]:
        cvfitalpha = [models_lags[f'lag_{col}'][f'split_{s}'].alpha_ for s in range(n_splits)]
        print('lag {} mean alpha {:.0f}'.format(col, np.mean(cvfitalpha)))
        maxalpha_c = list(cvfitalpha).count(alphas[-1])
        if maxalpha_c > n_splits / 3:
            print(f'\nlag {col} alpha {int(np.mean(cvfitalpha))}')
            print(f'{maxalpha_c} splits are at max alpha\n')
            # maximum regularization selected: the predictors carry no
            # information for this lag
            # df_test_m.loc[:,pd.IndexSlice[col, 'corrcoef']][:] = 0
            # df_boot.loc[:,pd.IndexSlice[col, 'corrcoef']][:] = 0
            no_info_fc.append(col)  # module-level list collecting such lags
    df_test = functions_pp.get_df_test(prediction.merge(df_data.iloc[:, -2:],
                                                        left_index=True,
                                                        right_index=True)).iloc[:, :-2]
    return prediction, df_test, df_test_m, df_boot, models_lags, weights, df_test_s_m, df_train_m
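
For context, fc_utils.ErrorSkillScore is used above to score forecasts against a constant climatological benchmark. A minimal, self-contained sketch of that idea, assuming the common definition SS = 1 - RMSE_model / RMSE_benchmark (toy data; names and details are hypothetical, not the fc_utils implementation):

import numpy as np

def rmse(y_true, y_pred):
    return np.sqrt(np.mean((np.asarray(y_true) - np.asarray(y_pred)) ** 2))

def rmse_skill_score(y_true, y_pred, constant_bench):
    # benchmark forecast: always predict the climatological mean
    bench = np.full_like(np.asarray(y_true, dtype=float), constant_bench)
    return 1.0 - rmse(y_true, y_pred) / rmse(y_true, bench)

rng = np.random.default_rng(0)
y = rng.normal(size=100)                     # toy standardized target
y_hat = y + rng.normal(scale=0.5, size=100)  # toy forecast
print(rmse_skill_score(y, y_hat, constant_bench=float(y.mean())))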
Example 2
def import_precur_ts(list_import_ts: List[tuple],
                     df_splits: pd.DataFrame,
                     start_end_date: Tuple[str, str],
                     start_end_year: Tuple[int, int],
                     start_end_TVdate: Tuple[str, str],
                     cols: list = None,
                     precur_aggr: int = 1):
    '''
    list_import_ts is a List[tuple] of (name, path_data) pairs.
    '''
    #%%
    # df_splits = rg.df_splits

    splits = df_splits.index.levels[0]
    orig_traintest = functions_pp.get_testyrs(df_splits)
    df_data_ext_s = np.zeros((splits.size), dtype=object)
    counter = 0
    for i, (name, path_data) in enumerate(list_import_ts):

        df_data_e_all = functions_pp.load_hdf5(path_data)['df_data']
        if isinstance(df_data_e_all, pd.Series):
            df_data_e_all = pd.DataFrame(df_data_e_all)

        df_data_e_all = df_data_e_all.iloc[:, :]  # not sure why needed
        if cols is None:
            cols = list(
                df_data_e_all.columns[(df_data_e_all.dtypes != bool).values])
        elif isinstance(cols, str):
            cols = [cols]

        if hasattr(df_data_e_all.index, 'levels'):
            dates_subset = core_pp.get_subdates(df_data_e_all.loc[0].index,
                                                start_end_date, start_end_year)
            df_data_e_all = df_data_e_all.loc[pd.IndexSlice[:,
                                                            dates_subset], :]
        else:
            dates_subset = core_pp.get_subdates(df_data_e_all.index,
                                                start_end_date, start_end_year)
            df_data_e_all = df_data_e_all.loc[dates_subset]

        if 'TrainIsTrue' in df_data_e_all.columns:
            _c = [
                k for k in df_splits.columns
                if k in ['TrainIsTrue', 'RV_mask']
            ]
            # check that the imported train-test split matches df_splits
            ext_traintest = functions_pp.get_testyrs(df_data_e_all[_c])
            _check_traintest = all(
                np.equal(core_pp.flatten(ext_traintest),
                         core_pp.flatten(orig_traintest)))
            assert _check_traintest, (
                'Train-test years of df_splits do not match the '
                'imported timeseries')

        for s in range(splits.size):
            if 'TrainIsTrue' in df_data_e_all.columns:
                df_data_e = df_data_e_all.loc[s]
            else:
                df_data_e = df_data_e_all

            df_data_ext_s[s] = df_data_e[cols]
            tfreq_date_e = (df_data_e.index[1] - df_data_e.index[0]).days

            if precur_aggr != tfreq_date_e:
                try:
                    df_data_ext_s[s] = functions_pp.time_mean_bins(
                        df_data_ext_s[s],
                        precur_aggr,
                        start_end_date,
                        start_end_year,
                        start_end_TVdate=start_end_TVdate)[0]
                except KeyError as e:
                    print('KeyError captured; likely the dates requested via '
                          'start_end_date and start_end_year are not found '
                          'in the external pandas timeseries.\n{}'.format(
                              str(e)))
        print(f'loaded external timeseries: {cols}')

        if counter == 0:
            df_data_ext = pd.concat(list(df_data_ext_s),
                                    keys=range(splits.size))
        else:
            df_add = pd.concat(list(df_data_ext_s), keys=range(splits.size))
            df_data_ext = df_data_ext.merge(df_add,
                                            left_index=True,
                                            right_index=True)
        counter += 1
        cols = None  # reset so columns are auto-detected for the next file
    #%%
    return df_data_ext
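
The function above assumes df_splits and the stored data carry a two-level (split, time) MultiIndex. A toy illustration of that layout and the .loc[0] / index.levels[0] / pd.IndexSlice access patterns used throughout (hypothetical data):

import numpy as np
import pandas as pd

dates = pd.date_range('2000-01-01', periods=4, freq='D')
splits = range(2)  # two train-test splits
idx = pd.MultiIndex.from_product([splits, dates])
df = pd.DataFrame({'ts': np.arange(8, dtype=float),
                   'TrainIsTrue': [True, True, False, False] * 2},
                  index=idx)

print(df.index.levels[0])                      # the split level
print(df.loc[0])                               # split 0, indexed by date only
print(df.loc[pd.IndexSlice[:, dates[:2]], :])  # date subset across all splits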
Example 3
def ENSO_34(filepath, df_splits=None, get_ENSO_states: bool = True):
    #%%
    '''
    See http://www.cgd.ucar.edu/staff/cdeser/docs/deser.sstvariability.annrevmarsci10.pdf
    selbox has the format (lon_min, lon_max, lat_min, lat_max)
    '''
    # e.g. filepath = '/Users/semvijverberg/surfdrive/Data_era5/input_raw/sst_1979-2018_1_12_daily_2.5deg.nc'

    # Niño 3.4 domain: 5°S-5°N, 170°W-120°W, i.e. (190, 240) in degrees east
    kwrgs_pp = {
        'selbox': (190, 240, -5, 5),
        'format_lon': 'only_east',
        'seldates': None
    }

    ds = core_pp.import_ds_lazy(filepath, **kwrgs_pp)
    dates = pd.to_datetime(ds.time.values)
    data = functions_pp.area_weighted(ds).mean(dim=('latitude', 'longitude'))
    df_ENSO = pd.DataFrame(data=data.values, index=dates, columns=['ENSO34'])
    if df_splits is not None:
        splits = df_splits.index.levels[0]
        df_ENSO = pd.concat([df_ENSO] * splits.size, axis=0, keys=splits)

    if get_ENSO_states:
        '''
        From Anderson 2017 - Life cycles of agriculturally relevant ENSO
        teleconnections in North and South America.
        http://doi.wiley.com/10.1002/joc.4916
        Years are classified by whether the mean boreal wintertime (October,
        November, December) SST anomaly amplitude in the Niño 3.4 region
        exceeded 1 or 2 standard deviations.
        '''
        if hasattr(df_ENSO.index, 'levels'):
            df_ENSO_s = df_ENSO.loc[0]
        else:
            df_ENSO_s = df_ENSO
        dates = df_ENSO_s.index
        df_3monthmean = df_ENSO_s.rolling(3, center=True, min_periods=1).mean()
        std_ENSO = df_3monthmean.std()
        OND, groups = core_pp.get_subdates(dates,
                                           start_end_date=('10-01', '12-31'),
                                           returngroups=True)
        OND_ENSO = df_3monthmean.loc[OND].groupby(groups).mean()
        nino_yrs = OND_ENSO[OND_ENSO > df_3monthmean.mean() +
                            std_ENSO].dropna().index  # + 1
        nina_yrs = OND_ENSO[OND_ENSO < df_3monthmean.mean() -
                            std_ENSO].dropna().index  # + 1
        neutral = [
            y for y in OND_ENSO.index
            if y not in core_pp.flatten([nina_yrs, nino_yrs])
        ]
        states = {}
        for d in dates:
            if d.year in nina_yrs:
                states[d.year] = -1
            elif d.year in neutral:
                states[d.year] = 0
            elif d.year in nino_yrs:
                states[d.year] = 1

        cycle_list = []
        for s, v in [('EN', 1), ('LN', -1)]:
            ENSO_cycle = {d.year: 0 for d in dates}
            for year in np.unique(dates.year):
                if states[year] == v:
                    ENSO_cycle[year] = f'{s}0'
                    if year - 1 in dates.year and states[year - 1] != v:
                        ENSO_cycle[year - 1] = f'{s}-1'
                    if year + 1 in dates.year and states[year + 1] != v:
                        ENSO_cycle[year + 1] = f'{s}+1'
            cycle_list.append(ENSO_cycle)

        time_index = pd.to_datetime([f'{y}-01-01' for y in states.keys()])
        df_state = pd.concat([pd.Series(states),
                              pd.Series(cycle_list[0]),
                              pd.Series(cycle_list[1])],
                             axis=1,
                             keys=['state', 'EN_cycle', 'LN_cycle'])
        df_state.index = time_index

        if hasattr(df_ENSO.index, 'levels'):  # copy to other traintest splits
            df_state = pd.concat([df_state] * splits.size, keys=splits)

        composites = np.zeros(3, dtype=object)
        for i, yrs in enumerate([nina_yrs, neutral, nino_yrs]):
            composite = [d for d in dates if d.year in yrs]
            composites[i] = ds.sel(time=composite).mean(dim='time')
        composites = xr.concat(composites, dim='state')
        composites['state'] = ['Nina', 'Neutral', 'Nino']

        plot_maps.plot_corr_maps(composites, row_dim='state', hspace=0.5)
        out = df_ENSO, [
            np.array(nina_yrs),
            np.array(neutral),
            np.array(nino_yrs)
        ], df_state
    else:
        out = df_ENSO
    #%%
    return out
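
The year classification inside ENSO_34 boils down to a 3-month rolling mean thresholded at plus or minus one standard deviation over the OND season. A self-contained sketch of the same idea on toy monthly data (random data and illustrative thresholds, not the function above):

import numpy as np
import pandas as pd

dates = pd.date_range('1980-01-01', '1999-12-31', freq='MS')
rng = np.random.default_rng(1)
sst = pd.Series(rng.normal(size=dates.size), index=dates, name='ENSO34')

smooth = sst.rolling(3, center=True, min_periods=1).mean()
ond = smooth[smooth.index.month.isin([10, 11, 12])]
ond_mean = ond.groupby(ond.index.year).mean()  # one OND mean per year

thresh = smooth.std()
nino_yrs = ond_mean.index[ond_mean > smooth.mean() + thresh]
nina_yrs = ond_mean.index[ond_mean < smooth.mean() - thresh]
states = pd.Series(0, index=ond_mean.index, name='state')  # neutral by default
states[nino_yrs] = 1
states[nina_yrs] = -1
print(states.value_counts())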
Example 4
# Predictions
# =============================================================================
# out_regr2PDO = prediction_wrapper(df_data_r2PDO.copy(), keys=keys,
#                                  match_lag=match_lag, n_boot=n_boot)
dates = core_pp.get_subdates(rg.dates_TV, start_end_year=(1980,2020))
target_ts_temp = rg.TV.RV_ts.loc[dates]
clim_mean_temp = float(target_ts_temp.mean())
RMSE_SS = fc_utils.ErrorSkillScore(constant_bench=clim_mean_temp).RMSE
MAE_SS = fc_utils.ErrorSkillScore(constant_bench=clim_mean_temp).MAE
score_func_list = [RMSE_SS, fc_utils.corrcoef, MAE_SS, fc_utils.metrics.mean_absolute_error]
# predictions temp using SST regions
df_prec = merge_lagged_wrapper(rg.df_data.copy(), [1, 2, 3], keys)  # lag-1..3 predictor columns
df_prec = df_prec.loc[pd.IndexSlice[:, dates], :]
df_prec = df_prec.merge(rg.df_splits.loc[pd.IndexSlice[:, dates], :], left_index=True, right_index=True)

keys1 = core_pp.flatten([[k+f'_{l}' for l in [1] for k in keys]])
SST1 = prediction_wrapper(df_prec, lags=np.array([0]),
                          target_ts=target_ts_temp,
                          keys=keys1,
                          match_lag=False, n_boot=n_boot)

keys2 = core_pp.flatten([[k+f'_{l}' for l in [1,2] for k in keys]])
SST2 = prediction_wrapper(df_prec, lags=np.array([0]),
                          target_ts=target_ts_temp,
                          keys=keys2,
                          match_lag=False, n_boot=n_boot)

keys3 = core_pp.flatten([[k+f'_{l}' for l in [1,2,3] for k in keys]])
SST3 = prediction_wrapper(df_prec, lags=np.array([0]),
                          target_ts=target_ts_temp,
                          keys=keys3,
                          match_lag=False, n_boot=n_boot)
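
merge_lagged_wrapper is not shown here, but the keys1/keys2/keys3 construction implies it appends lag-suffixed copies of each predictor ('name_1', 'name_2', ...). A minimal sketch of that kind of lagging with pandas shift (toy data; the real helper may differ):

import numpy as np
import pandas as pd

dates = pd.date_range('2000-01-01', periods=6, freq='D')
df = pd.DataFrame({'x': np.arange(6, dtype=float)}, index=dates)

# column 'x_l' holds x shifted (lagged) by l time steps
lagged = pd.concat({f'x_{l}': df['x'].shift(l) for l in [1, 2, 3]}, axis=1)
print(lagged)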
Example 5
                # (snippet starts inside nested loops over train-test splits s
                # and variable pairs (y, z); links collects the allowed
                # (j, -lag) tuples for the PCMCI selected_links argument)
                if y == target_dataset:
                    links.append([(j, -l) for l in range(t_min, t_max + 1)])
                elif y == z and 'sst' in y:
                    # no autocorrelation for SST: do not condition it on its own past
                    pass
                    # links.append([(j, -l) for l in range(t_min, 1)])
                elif 'sst' in y and 'smi' not in z:
                    # do not remove SMI information from the SST precursor; not physical
                    links.append([(j, -l) for l in range(t_min, t_max + 1)])
                elif 'smi' in y:
                    links.append([(j, -l) for l in range(t_min, t_max + 1)])
            else:
                links.append([(j, -l) for l in range(t_min, t_max + 1)])

        selected_links[i] = core_pp.flatten(links)
    selected_links_splits[s] = selected_links

tigr_function_call = 'run_pcmci'
kwrgs_tigr = {
    'tau_min': t_min,
    'tau_max': t_max,
    'pc_alpha': .05,
    'max_conds_py': None,
    'max_conds_px': None,
    'max_combinations': 3,
    'selected_links': selected_links_splits
}

rg.PCMCI_df_data(keys=keys,
                 tigr_function_call=tigr_function_call,
                 kwrgs_tigr=kwrgs_tigr)
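
For reference, selected_links restricts which (variable, lag) pairs PCMCI may test as parents of each variable. A toy example of the {child_index: [(parent_index, -lag), ...]} mapping that the loop in this example builds per split (indices and restrictions are illustrative):

t_min, t_max = 1, 2
selected_links = {
    0: [(j, -l) for j in range(3)            # target: test all parents
        for l in range(t_min, t_max + 1)],
    1: [(0, -1), (1, -1)],                   # restricted parent set
    2: [],                                   # no links tested at all
}
print(selected_links)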