Python store_hdf_df Exemples, functions_pp.store_hdf_df Python Exemples

Exemple #1

0

Afficher le fichier

Fichier : wrapper_RGCPD_tig.py Projet : yangxhcaf/RGCPD

def store_ts(df_data, df_sum, dict_ds, outdic_actors, ex, add_spatcov=True):
    """Assemble the data/summary frames and optionally write them to HDF5.

    The output path ``<ex['fig_subpath']>/fulldata_<ex['params']>_<today>.h5``
    is always written back into ``ex['path_data']``, even when nothing is
    stored.

    Parameters
    ----------
    df_data : DataFrame indexed so that ``df_data.loc[s]`` selects split ``s``.
    df_sum : summary DataFrame, stored next to ``df_data``.
    dict_ds, outdic_actors : forwarded unchanged to ``rgcpd.get_spatcovs``.
    ex : experiment dict; reads ``params``, ``fig_subpath``, ``n_spl`` and
        ``store_format``; ``path_data`` is set as a side effect.
    add_spatcov : when True, spatial-covariance columns computed per split
        are merged into ``df_data`` and ``df_sum`` is extended via
        ``rgcpd.add_sp_info``.

    Returns
    -------
    None
    """
    date_stamp = datetime.datetime.today().strftime('%Y-%m-%d')
    base_name = 'fulldata_{}_{}'.format(ex['params'], date_stamp)
    ex['path_data'] = os.path.join(ex['fig_subpath'], base_name + '.h5')

    if not add_spatcov:
        dict_of_dfs = {'df_data': df_data, 'df_sum': df_sum}
    else:
        n_splits = ex['n_spl']
        # One spatial-covariance frame per split, in split order.
        spatcov_per_split = [
            rgcpd.get_spatcovs(dict_ds,
                               df_data.loc[split],
                               split,
                               outdic_actors,
                               normalize=True)
            for split in range(n_splits)
        ]
        # Re-attach the split number as the outer index level so the merge
        # below aligns on the same index as df_data.
        df_sp = pd.concat(spatcov_per_split, keys=range(n_splits))
        dict_of_dfs = {
            'df_data': pd.merge(df_data,
                                df_sp,
                                left_index=True,
                                right_index=True),
            'df_sum': rgcpd.add_sp_info(df_sum, df_sp),
        }

    # Only the 'hdf5' format triggers a write; any other value is a no-op.
    if ex['store_format'] == 'hdf5':
        functions_pp.store_hdf_df(dict_of_dfs, ex['path_data'])
        print('Data stored in \n{}'.format(ex['path_data']))

Exemple #2

0

Afficher le fichier

def store_ts(df_data, df_sum, dict_ds, filename):
    """Write ``df_data`` and ``df_sum`` to the HDF5 file ``filename``.

    The frames are stored under the keys ``'df_data'`` and ``'df_sum'`` via
    ``functions_pp.store_hdf_df``. ``dict_ds`` is accepted but not used here;
    it is kept so existing callers do not break.

    Returns
    -------
    None
    """
    import functions_pp

    functions_pp.store_hdf_df({'df_data': df_data, 'df_sum': df_sum},
                              filename)
    print('Data stored in \n{}'.format(filename))

Exemple #3

0

Afficher le fichier

#%% Get timeseries at specific points within gridcell
# Open the dataset restricted to `selbox` (presumably lazily, going by the
# helper's name - confirm against core_pp).
ds_t2m = core_pp.import_ds_lazy(var_filename, selbox=selbox)
# One row per requested (x, y) point, one column per time step.
npts = np.zeros((np_array_xy.shape[0], ds_t2m.time.size))
for i, xy in enumerate(np_array_xy):
    # 180 + (180 + x) == x + 360: shifts the x-coordinate by a full circle
    # before the lookup. NOTE(review): presumably xy[0] is a negative
    # (western) longitude and the dataset uses 0-360 - confirm.
    npts[i] = ds_t2m.sel(longitude=(180 + (180 + xy[0])), latitude=xy[1])

# Column labels of the form '<|x|>W-<y>N', one per point.
columns = [f'{abs(c[0])}W-{c[1]}N' for c in np_array_xy]
# Transpose so that rows are time steps and columns are points.
df_ts = pd.DataFrame(npts.T,
                     index=pd.to_datetime(ds_t2m.time.values),
                     columns=columns)

# Output file name embeds the hash stored in the clustering result's attrs.
TVpath = os.path.join(
    user_dir, 'surfdrive/Scripts/RGCPD/publications/NPJ_2021/data/',
    'df_ts_paper2_clustercorr_{}.h5'.format(xrclustered.attrs['hash']))

# Persist the point timeseries under the key 'df_ts'.
functions_pp.store_hdf_df({'df_ts': df_ts}, file_path=TVpath)
#%% Calculate corr maps

list_xr = []
for point in df_ts.columns:
    list_of_name_path = [
        ('', TVpath),
        ('t2m',
         root_data + '/input_raw/t2m_US_1979-2020_1_12_daily_0.25deg.nc')
    ]
    list_for_MI = [
        BivariateMI(name='t2m',
                    func=class_BivariateMI.corr_map,
                    alpha=.05,
                    FDR_control=True,
                    lags=np.array([0]))

Exemple #4

0

Afficher le fichier

                                 index=dates, name=f'PDO{yr}bw')
            ax.plot_date(dates, df_PDObw, label=f'Butterworth {yr}-year low-pass',
                    color='red',linestyle=ls[i], linewidth=1, marker=None)
            df_PDOrm = df_PDOsplit.rolling(window=window, closed='right', min_periods=window).mean()
            df_PDOrm = df_PDOrm.rename({'PDO':f'PDO{yr}rm'}, axis=1)
            ax.plot_date(dates, df_PDOrm,
                         label=f'Rolling mean {yr}-year low-pass (closed right)', color='green',linestyle=ls[i],
                         linewidth=1, marker=None)
            list_dfPDO.append(df_PDObw) ; list_dfPDO.append(df_PDOrm)
            ax.legend()

        filepath = os.path.join(path_out_main, 'Low-pass_filter.pdf')
        plt.savefig(filepath, bbox_inches='tight')
        df_PDOs = pd.concat(list_dfPDO,axis=1)

    functions_pp.store_hdf_df({'df_data':df_PDOs},
                              file_path=filepath_df_PDOs)
#%% Get ENSO 3.4 index
if 'parcorrENSO' == exper:
    try:
        df_ENSOs = functions_pp.load_hdf5(filepath_df_ENSO)['df_data']
    except:

        SST_pp_filepath = user_dir + '/surfdrive/ERA5/input_raw/preprocessed/sst_1979-2020_jan_dec_monthly_1.0deg.nc'

        if 'df_PDOsplit' not in globals():
            df_ENSO, ENSO_years, ENSO_cycle = climate_indices.ENSO_34(SST_pp_filepath)

        df_ENSO = (df_ENSO - df_ENSO.mean()) / df_ENSO.std()

        # Butter Lowpass
        dates = df_ENSO.index

Exemple #5

0

Afficher le fichier

Fichier : main.py Projet : semvijverberg/CPPA

        df_PDO, PDO_patterns = climate_indices.PDO(filepath, df_splits)
        df_data_lag = df_PDO.merge(df_data_lag,
                                   left_index=True,
                                   right_index=True)
        print('calculating ENSO')
        df_ENSO_34 = climate_indices.ENSO_34(filepath, df_splits)
        df_data_lag = df_ENSO_34.merge(df_data_lag,
                                       left_index=True,
                                       right_index=True)
    df_data_lag = add_RV(df_data_lag, RV)

    dict_of_dfs = {'df_data': df_data_lag}
    fname = '{}_{}_lag_{}_{}.h5'.format(ex['datafolder'], today, lag,
                                        ex['hash'])
    file_path = os.path.join(ex['path_data_out'], fname)
    functions_pp.store_hdf_df(dict_of_dfs, file_path)

#actor.ts_corr[ex['RV_name']] = pd.Series(RV.RVfullts.values, index=actor.ts_corr[0].index)
# Map projection shared by the correlation-map plots below.
central_lon_plots = 200
map_proj = ccrs.LambertCylindrical(central_longitude=central_lon_plots)
kwrgs_corr = {'clim': (-0.5, 0.5), 'hspace': -0.6}
pdfs_folder = os.path.join(ex['path_fig'], 'pdfs')
# exist_ok=True creates the folder only when missing, without the separate
# isdir() check (which could race with another process creating it).
os.makedirs(pdfs_folder, exist_ok=True)

f_format = '.png'
#lags_plot = [0, 20, 50]
lags_to_plot = lags_i
# Boolean mask of the grid cells with a positive precursor label, for the
# selected lags; passed to the plot routine as contour mask.
contour_mask = (CPPA_prec['prec_labels'] > 0).sel(
    lag=lags_to_plot).astype(bool)
plot_maps.plot_corr_maps(CPPA_prec.sel(lag=lags_to_plot), contour_mask,
                         map_proj, **kwrgs_corr)

Exemple #6

0

Afficher le fichier

def loop_analysis(agg_level,
                  n_lags,
                  kwrgs_MI,
                  fold_method,
                  n_jobs,
                  distinct_cl=None,
                  distinct_targetperiods=None):
    """Run define/process/forecast for every (cluster, target month) pair.

    Each pair is handled by a joblib worker. Skill scores of all pairs are
    summed into one MultiIndex frame, the test-set predictions are combined
    into one frame, and both are written to
    ``<main_dir>/Results/skillscores/<agg_level>_<fold_method>/df_skill_predictions.h5``.
    Each worker additionally writes its own skill scores to a csv file in
    that same folder.

    Parameters
    ----------
    agg_level : aggregation level; must be 'high', 'medium' or 'low' when
        ``distinct_cl`` is None (it selects the number of clusters). Also
        used in folder and file names.
    n_lags, kwrgs_MI : forwarded to ``define``.
    fold_method : forwarded to ``process``; also part of the folder name.
    n_jobs : number of parallel jobs given to ``joblib.Parallel``.
    distinct_cl : optional list of cluster labels; defaults to
        ``1 .. n_clusters`` of the aggregation level.
    distinct_targetperiods : optional list of ``('MM-DD', 'MM-DD')`` tuples
        taken from the twelve calendar-month periods; defaults to all twelve.

    Returns
    -------
    (df_ss_result_all, df_prediction_result, rg) where ``rg`` is the object
    returned by the last worker.
    """
    #%%
    # distinct_cl = cluster_numbers; distinct_targetperiods = TV_targetperiod
    #retrieve number of clusters with aggregation level
    if distinct_cl is None:
        ncl_dict = {'high': 20, 'medium': 42, 'low': 135}
        ncl = ncl_dict['{}'.format(agg_level)]
        cl_list = list(range(1, ncl + 1))
    else:
        cl_list = distinct_cl

    subfolder = f'{agg_level}_{fold_method}'

    #target periods, all or given
    # NOTE(review): February ends on 02-28, so 29 Feb of leap years falls
    # outside the target period - confirm this is intended.
    all_targetperiods = [('01-01', '01-31'), ('02-01', '02-28'),
                         ('03-01', '03-31'), ('04-01', '04-30'),
                         ('05-01', '05-31'), ('06-01', '06-30'),
                         ('07-01', '07-31'), ('08-01', '08-31'),
                         ('09-01', '09-30'), ('10-01', '10-31'),
                         ('11-01', '11-30'), ('12-01', '12-31')]

    if distinct_targetperiods is None:
        targetperiods = all_targetperiods
    else:
        targetperiods = distinct_targetperiods

    #create dictionary of periods and month names
    all_targetperiods_names_list = [
        'January', 'February', 'March', 'April', 'May', 'June', 'July',
        'August', 'September', 'October', 'November', 'December'
    ]
    zip_iterator = zip(all_targetperiods, all_targetperiods_names_list)
    all_targetperiods_dict = dict(zip_iterator)

    #create indices for multi index result dataframe
    # Six rows per cluster: (test, train) x (RMSE_SS, MAE_SS, corrcoef).
    row_idx_1_arr = np.array([val for val in cl_list for _ in range(6)
                              ])  # 1 1 1 1 1 1 2 2 2 2 2 2 ...
    row_idx_2 = ['test', 'train']
    row_idx_2_list = [val for val in row_idx_2
                      for _ in range(3)]  # test test test train train train
    if len(cl_list) > 1:
        row_idx_2_list += (
            len(cl_list) - 1
        ) * row_idx_2_list  # test test test train train train test test test train ...
    row_idx_2_arr = np.array(row_idx_2_list)
    row_idx_3_list = [
        'RMSE_SS', 'MAE_SS', 'corrcoef', 'RMSE_SS', 'MAE_SS', 'corrcoef'
    ]  #RMSE MAE corrcoef RMSE MAE corrcoef
    if len(cl_list) > 1:
        row_idx_3_list += (
            len(cl_list) - 1
        ) * row_idx_3_list  #RMSE MAE corrcoef RMSE MAE corrcoef RMSE MAE ...
    row_idx_3_arr = np.array(row_idx_3_list)
    row_arrays = [row_idx_1_arr, row_idx_2_arr, row_idx_3_arr
                  ]  #cluster, test/train, scores as row multi index
    column_array = [all_targetperiods_dict[x]
                    for x in targetperiods]  #month names as column index

    #initiate zeros ss_result dataframe, rows = scores per cluster, columns = months
    df_ss_result_all = pd.DataFrame(np.zeros(
        (len(row_arrays[0]), len(column_array)), dtype=float),
                                    index=row_arrays,
                                    columns=column_array)

    #initiate prediction_result dataframe, rows = not yet known, columns = target time series & prediction
    df_prediction_result = pd.DataFrame()

    #parallel function
    def parallel(cluster, month, agg_level, n_lags, kwrgs_MI, fold_method,
                 row_arrays, column_array, subfolder):
        """Forecast one (cluster, month) pair; return scores, predictions, rg."""
        #%%
        print(f'Starting cluster {cluster}, prediciting {month}')
        #get list_of_name_path
        list_of_name_path = get_list_of_name_path(agg_level, cluster)
        #run define
        rg, list_for_MI, lags, crossyr = define(list_of_name_path, month,
                                                n_lags, kwrgs_MI, subfolder)
        #run check (possible, not necessary)
        #check(rg, list_of_name_path, cluster)
        #run processing
        rg = process(rg, lags, fold_method, crossyr)
        #run forecast
        test_scores, train_scores, prediction = forecast(rg, crossyr)

        #store skill score results in df_ss_result dataframe
        # Full-size zero frame: only this pair's six cells are filled, so the
        # frames of all workers can simply be summed afterwards.
        df_ss_result = pd.DataFrame(np.zeros(
            (len(row_arrays[0]), len(column_array)), dtype=float),
                                    index=row_arrays,
                                    columns=column_array)
        for count, i in enumerate(
                row_idx_2_arr[:6]
        ):  #always loop over test test test train train train per cluster
            if count < 3:
                df_ss_result.loc[
                    (cluster, i, row_idx_3_arr[count]),
                    all_targetperiods_dict[month]] = test_scores[count]
            else:
                df_ss_result.loc[(cluster, i, row_idx_3_arr[count]),
                                 all_targetperiods_dict[month]] = train_scores[
                                     count - 3]

        #get test df actual and predictions
        test_df_pred = functions_pp.get_df_test(prediction,
                                                df_splits=pd.DataFrame(
                                                    rg.df_data.iloc[:, -2:]))

        #update dates
        # Shift the dates forward by (target month number - 1) month starts.
        delta = int(month[0][:2]) - 1
        date_list = test_df_pred.index.get_level_values(0).shift(delta,
                                                                 freq='MS')
        test_df_pred.set_index([date_list], inplace=True)

        #change column header of prediction to RV#ts_pred
        # Work on a list copy: mutating `columns.values` in place would
        # modify the array backing the (immutable) Index.
        new_columns = list(test_df_pred.columns)
        new_columns[1] = new_columns[0] + '_pred'
        test_df_pred.columns = new_columns

        #save intermediate cluster csv
        results_path = os.path.join(
            main_dir, 'Results', 'skillscores',
            f'{agg_level}_{fold_method}')  #path of results
        os.makedirs(results_path,
                    exist_ok=True)  # make folder if it doesn't exist
        df_ss_result.to_csv(
            os.path.join(
                results_path,
                str(cluster) + '_' + str(all_targetperiods_dict[month]) +
                '_ss_scores_' + agg_level +
                '.csv'))  #intermediate save skillscores per cluster to csv
        #%%
        return df_ss_result, test_df_pred, rg

    # Results come back ordered cluster-major, month-minor.
    with joblib.parallel_backend('loky'):
        results = joblib.Parallel(n_jobs=n_jobs)(joblib.delayed(
            parallel)(cluster, month, agg_level, n_lags, kwrgs_MI, fold_method,
                      row_arrays, column_array, subfolder)
                                                 for cluster in cl_list
                                                 for month in targetperiods)

    #append all results to one dataframe - skill scores
    for result in results:
        df_ss_result_all = df_ss_result_all + result[0].values

    #append all results to one dataframe - predictions
    n_periods = len(targetperiods)
    # First cluster: stack the per-month predictions row-wise. pd.concat
    # replaces DataFrame.append, which was removed in pandas 2.0.
    first_cluster_frames = [result[1] for result in results[:n_periods]]
    if first_cluster_frames:
        df_prediction_result = pd.concat(first_cluster_frames)
    # Remaining clusters: the first month of each cluster adds that cluster's
    # columns (left join); its later months fill those columns in place.
    for result_counter, result in enumerate(results[n_periods:],
                                            start=n_periods):
        if result_counter % n_periods == 0:
            df_prediction_result = df_prediction_result.join(result[1],
                                                             how='left')
        else:
            df_prediction_result.update(result[1], join='left')

    #return one rg
    rg = results[-1][-1]

    results_path = os.path.join(main_dir, 'Results', 'skillscores',
                                f'{agg_level}_{fold_method}')  #path of results
    functions_pp.store_hdf_df(
        {
            'df_ss_result': df_ss_result_all,
            'df_prediction_result': df_prediction_result
        },
        file_path=os.path.join(results_path, 'df_skill_predictions.h5'))
    #return df_ss_result dataframe and prediction
    #%%
    return df_ss_result_all, df_prediction_result, rg

Exemple #7

0

Afficher le fichier

    plt.savefig(filepath + '.png', dpi=200, bbox_inches='tight')

    df_ana.plot_ts_matric(df_data,
                          win=30,
                          columns=cols,
                          period='RV_mask',
                          plot_sign_stars=False,
                          fontsizescaler=-8)
    filepath = os.path.join(
        rg.path_outsub1,
        '30d_z500_' + '-'.join(map(str, z500_green_bb)) + rg.hash)
    plt.savefig(filepath + '.png', dpi=200, bbox_inches='tight')

    filepath = os.path.join(
        path_out_main, 'z500_' + '-'.join(map(str, z500_green_bb)) + rg.hash)
    functions_pp.store_hdf_df({'df_data': df_data}, filepath + '.h5')

#%% SST vs T
list_of_name_path = [(cluster_label, TVpath),
                     ('sst',
                      os.path.join(path_raw,
                                   'sst_1979-2020_1_12_daily_1.0deg.nc'))]

lags = np.array([0, 2])

list_for_MI = [
    BivariateMI(name='sst',
                func=class_BivariateMI.corr_map,
                alpha=.05,
                FDR_control=True,
                lags=lags,

Exemple #8

0

Afficher le fichier

    df_train_m, df_test_s_m, df_test_m, df_boot = verification_tuple

    m = models_lags[f'lag_{lag_}'][f'split_{0}']
    # plt.plot(kwrgs_model['alpha'], m.cv_results_['mean_test_score'])
    # plt.axvline(m.best_params_['alpha']) ; plt.show() ; plt.close()

    list_verification.append(verification_tuple)
    rg.verification_tuple = verification_tuple

#%% Plotting Continuous forecast

# Collect the predictions of all runs in rg_list and store them under the
# key 'df_predictions'; the file name carries the random seed.
df_preds_save = utils_paper3.df_predictions_for_plot(rg_list)
d_dfs = {'df_predictions': df_preds_save}
filepath_dfs = os.path.join(rg.path_outsub1,
                            f'predictions_s{seed}_continuous.h5')
functions_pp.store_hdf_df(d_dfs, filepath_dfs)

# Gather scores, bootstrap results and test frames from the
# 'verification_tuple' attribute of each run and store them in a second file.
df_scores, df_boot, df_tests = utils_paper3.df_scores_for_plot(
    rg_list, name_object='verification_tuple')
d_dfs = {'df_scores': df_scores, 'df_boot': df_boot, 'df_tests': df_tests}
filepath_dfs = os.path.join(rg.path_outsub1, f'scores_s{seed}_continuous.h5')
functions_pp.store_hdf_df(d_dfs, filepath_dfs)

# Read the scores file back. NOTE(review): the reloaded dict is not used in
# the lines below (the in-memory frames are plotted) - presumably a
# round-trip check; confirm.
d_dfs = functions_pp.load_hdf5(filepath_dfs)

# Plot the scores and, when `save` is truthy, write the figure next to the
# stored data using the run's figure extension.
f = utils_paper3.plot_scores_wrapper(df_scores, df_boot)
f_name = f'{method}_{seed}_cf_PacAtl'
fig_path = os.path.join(rg.path_outsub1, f_name) + rg.figext
if save:
    f.savefig(fig_path, bbox_inches='tight')