def Tb_evaluation():
    """Evaluate Tb innovation statistics for two SMOS data-assimilation runs.

    For every ISMN station, computes the unbiased RMSD between the assimilated
    observations (``obs_obs``) and the model forecasts (``obs_fcst``), plus the
    mean analysis ensemble standard deviation (sqrt of ``obs_anavar``), for a
    run with constant observation errors and a run with a spatially variable
    error file. One result row per station is appended to ``result_file``.
    """
    result_file = r'D:\work\LDAS\2018-06_rmse_uncertainty\Tb_evaluation\validation.csv'

    DA_const_err = LDAS_io('ObsFcstAna', 'US_M36_SMOS40_DA_cal_scaled')
    DA_varia_err = LDAS_io('ObsFcstAna', 'US_M36_SMOS40_DA_cal_scl_errfile')

    # Offsets translate global EASE-grid indices into domain-local ones.
    ismn = ISMN_io(col_offs=DA_const_err.grid.tilegrids.loc['domain', 'i_offg'],
                   row_offs=DA_const_err.grid.tilegrids.loc['domain', 'j_offg'])

    for i, (meta, ts_insitu) in enumerate(ismn.iter_stations()):
        logging.info('%i/%i' % (i, len(ismn.list)))

        res = pd.DataFrame(meta.copy()).transpose()
        col = meta.ease_col
        row = meta.ease_row

        for io, mode in zip([DA_const_err, DA_varia_err], ['const_err', 'varia_err']):
            # Hoist the grid-cell slices so each dataset is read only once.
            obs = io.timeseries['obs_obs'][:, row, col, :]
            fcst = io.timeseries['obs_fcst'][:, row, col, :]

            # Unbiased RMSD: subtract each series' temporal mean first.
            ubRMSD = np.sqrt((((obs - obs.mean()) - (fcst - fcst.mean()))**2).mean().values)
            # Mean analysis ensemble spread (obs_anavar holds variances).
            ensstd = np.sqrt(io.timeseries['obs_anavar'][:, row, col, :].mean()).values

            res['ubrmsd_' + mode] = ubRMSD
            res['ensstd_' + mode] = ensstd

        # Write the header only for the first station; append afterwards.
        if not os.path.isfile(result_file):
            res.to_csv(result_file, float_format='%0.4f')
        else:
            res.to_csv(result_file, float_format='%0.4f', mode='a', header=False)
def plot_cat_timeseries():
    """Plot anomaly time series for stations where calibrated DA clearly beats uncalibrated DA.

    Reads a validation CSV, selects stations where the correlation improvement
    of the calibrated run exceeds 0.2 in any soil layer, and saves one
    three-panel figure (surface / rootzone / profile anomalies) per station.
    """
    outpath = r'D:\work\LDAS\2018-02_scaling\_new\ismn_eval\timeseries'
    fname = r"D:\work\LDAS\2018-02_scaling\_new\ismn_eval\validation.csv"

    res = pd.read_csv(fname)

    # Skill difference (calibrated minus uncalibrated) per soil layer.
    diff_srf = res['corr_DA_cal_pent_ma_sm_surface'] - res[
        'corr_DA_uncal_pent_ma_sm_surface']
    diff_rz = res['corr_DA_cal_pent_ma_sm_rootzone'] - res[
        'corr_DA_uncal_pent_ma_sm_rootzone']
    diff_prof = res['corr_DA_cal_pent_ma_sm_profile'] - res[
        'corr_DA_uncal_pent_ma_sm_profile']

    # Keep stations with a substantial improvement in at least one layer.
    ind = (diff_srf > 0.2) | (diff_rz > 0.2) | (diff_prof > 0.2)
    res = res.loc[ind, ['network', 'station', 'lat', 'lon']]

    ismn = ISMN_io()
    cal = LDAS_io('xhourly', 'US_M36_SMOS_DA_calibrated_scaled')
    uncal = LDAS_io('xhourly', 'US_M36_SMOS_DA_nocal_scaled_pentadal')

    variables = ['sm_surface', 'sm_rootzone', 'sm_profile']

    for idx, stat in res.iterrows():
        # NOTE: `fname` is reused here as the per-station output figure path.
        fname = os.path.join(outpath, stat.network + '_' + stat.station + '.png')
        ts_ismn = ismn.read(stat.network, stat.station)
        lat = stat.lat
        lon = stat.lon

        plt.figure(figsize=(17, 9))
        for i, var in enumerate(variables):
            ax = plt.subplot(3, 1, i + 1)

            # Moving-average ('ma') anomalies; model timestamps shifted by
            # 2 hours -- presumably to align model output with the in situ
            # timestamp convention (TODO confirm).
            ts_cal = calc_anomaly(cal.read_ts(var, lon, lat), method='ma')
            ts_cal.index += pd.to_timedelta('2 hours')
            ts_uncal = calc_anomaly(uncal.read_ts(var, lon, lat), method='ma')
            ts_uncal.index += pd.to_timedelta('2 hours')

            df = pd.DataFrame({
                'cal': ts_cal,
                'uncal': ts_uncal,
                'insitu': calc_anomaly(ts_ismn[var], method='ma')
            }).dropna()
            if len(df) > 0:
                df.plot(ax=ax)
            else:
                continue

            title = 'R(ismn - cal) = %.2f , R(ismn - uncal) = %.2f' % (
                df.corr().loc['insitu', 'cal'], df.corr().loc['insitu', 'uncal'])

            ax.set_title(title, fontsize=12)
            ax.set_xlim('2010-01-01', '2016-01-01')
            ax.set_ylim(-0.3, 0.3)
            ax.set_xlabel('')

        plt.tight_layout()
        plt.savefig(fname, dpi=150)
        plt.close()
def run(part):
    """Validate multiple CCI soil-moisture products against ISMN stations.

    Computes Pearson correlation, p-value and sample count for every
    combination of CCI mode, product version, evaluation period and signal
    component (absolute / anomaly), for the slice of the station list assigned
    to worker ``part`` (1-based, of ``parts``). Results are checkpointed to a
    per-part CSV after every station.
    """
    parts = 6

    result_file = r'D:\work\ESA_CCI_SM\validation_%i.csv' % part

    cci = CCISM_io()
    ismn = ISMN_io()

    # ismn.list = ismn.list.iloc[100:120]

    # Split station list in 4 parts for parallelization
    subs = (np.arange(parts + 1) * len(ismn.list) / parts).astype('int')
    subs[-1] = len(ismn.list)
    start = subs[part - 1]
    end = subs[part]
    ismn.list = ismn.list.iloc[start:end, :]

    # Evaluation sub-periods (inclusive date ranges).
    periods = {
        'p1': ['2007-10-01', '2010-01-14'],
        'p2': ['2010-01-15', '2011-10-04'],
        'p3': ['2011-10-05', '2012-06-30'],
        'p4': ['2012-07-01', '2014-12-31']
    }
    freq = ['abs', 'anom']

    # Pre-build every result column name: metric_mode_version_period_freq.
    corr_tags = [
        'corr_' + m + '_' + v + '_' + p + '_' + f for m in cci.modes
        for v in cci.versions for p in periods.keys() for f in freq
    ]
    p_tags = [
        'p_' + m + '_' + v + '_' + p + '_' + f for m in cci.modes
        for v in cci.versions for p in periods.keys() for f in freq
    ]
    n_tags = [
        'n_' + m + '_' + v + '_' + p + '_' + f for m in cci.modes
        for v in cci.versions for p in periods.keys() for f in freq
    ]

    res = ismn.list.copy()
    res.drop(['ease_col', 'ease_row'], axis='columns', inplace=True)
    for col in corr_tags + p_tags:
        res[col] = np.nan
    for col in n_tags:
        res[col] = 0

    for i, (meta, ts_insitu) in enumerate(ismn.iter_stations()):
        print('%i/%i (Proc %i)' % (i, len(ismn.list), part))

        if ts_insitu is None:
            print('No in situ data for ' + meta.network + ' / ' + meta.station)
            continue
        # Restrict to the full evaluation window (start of p1 to end of p4).
        ts_insitu = ts_insitu[periods['p1'][0]:periods['p4'][1]]
        if len(ts_insitu) < 10:
            print('No in situ data for ' + meta.network + ' / ' + meta.station)
            continue

        df_insitu = pd.DataFrame(ts_insitu).dropna()
        df_insitu_anom = pd.DataFrame(calc_anomaly(ts_insitu)).dropna()

        for m in cci.modes:
            df_cci = cci.read(meta.lon, meta.lat, mode=m).dropna()
            if len(df_cci) < 10:
                print('No CCI ' + m + ' data for ' + meta.network + ' / ' +
                      meta.station)
                continue

            for f in freq:
                if f == 'abs':
                    matched = df_match(df_cci, df_insitu, window=0.5)
                else:
                    # NOTE: converts df_cci columns to anomalies IN PLACE, so
                    # this branch must run after the 'abs' matching above.
                    for v in cci.versions:
                        df_cci.loc[:, m + '_' + v] = calc_anomaly(
                            df_cci[m + '_' + v])
                    df_cci.dropna(inplace=True)
                    if (len(df_cci) < 10) | (len(df_insitu_anom) < 10):
                        print('No in situ or CCI ' + m + ' anomaly data for ' +
                              meta.network + ' / ' + meta.station)
                        continue
                    matched = df_match(df_cci, df_insitu_anom, window=0.5)

                data = df_cci.join(matched['insitu']).dropna()

                for p in periods.keys():
                    vals = data[periods[p][0]:periods[p][1]].values
                    n_matches = vals.shape[0]
                    if n_matches < 10:
                        continue
                    # Last column is 'insitu'; the first len(versions) columns
                    # are the CCI products in version order.
                    for k, v in enumerate(cci.versions):
                        corr, p_value = pearsonr(vals[:, k], vals[:, -1])
                        res.loc[meta.name,
                                'corr_' + m + '_' + v + '_' + p + '_' + f] = corr
                        res.loc[meta.name,
                                'p_' + m + '_' + v + '_' + p + '_' + f] = p_value
                        res.loc[meta.name,
                                'n_' + m + '_' + v + '_' + p + '_' + f] = n_matches

        # Checkpoint: rewrite the (cumulative) result table after each station.
        res.to_csv(result_file, float_format='%0.4f')
def EC_ascat_smap_ismn_ldas():
    """Extended collocation of in situ, ASCAT, SMAP, model OL and DA soil moisture.

    For each ISMN surface station, collocates five daily soil-moisture series
    (in situ, ASCAT, SMAP, LDAS open-loop, LDAS DA analysis), computes pairwise
    correlations and extended-collocation error correlations for three signal
    components ('absolute', 'longterm', 'shortterm'), and appends one result
    row per station to ``result_file``.

    Bug fixed: 'longterm'/'shortterm' anomalies are now always derived from the
    pristine absolute series. Previously they were derived from whatever the
    previous mode iteration left in the working variables, so 'shortterm'
    anomalies were computed from the 'longterm' anomaly series and the model
    series were clobbered across modes.
    """
    result_file = Path('/Users/u0116961/Documents/work/extended_collocation/ec_ascat_smap_ismn_ldas.csv')

    names = ['insitu', 'ascat', 'smap', 'ol', 'da']
    combs = list(combinations(names, 2))

    ds_ol = LDAS_io('xhourly', 'US_M36_SMAP_TB_OL_noScl').timeseries
    ds_da = LDAS_io('xhourly', 'US_M36_SMAP_TB_MadKF_DA_it11').timeseries
    # NOTE(review): ds_da_ana is never used below -- candidate for removal,
    # kept in case the LDAS_io constructor side effects are relied upon.
    ds_da_ana = LDAS_io('ObsFcstAna', 'US_M36_SMAP_TB_MadKF_DA_it11').timeseries['obs_ana']
    tg = LDAS_io().grid.tilegrids

    modes = ['absolute', 'longterm', 'shortterm']

    ismn = ISMN_io()
    # NOTE(review): skips the first 70 stations -- looks like a restart
    # leftover; confirm before a fresh full run.
    ismn.list = ismn.list.iloc[70::]
    ascat = HSAF_io()
    smap = SMAP_io()

    lut = pd.read_csv(Paths().lut, index_col=0)

    i = 0
    for meta, ts_insitu in ismn.iter_stations(surface_only=True):
        i += 1
        logging.info('%i/%i' % (i, len(ismn.list)))

        try:
            # Daily means over the study period; require at least 25 samples.
            if len(ts_insitu := ts_insitu['2015-04-01':'2020-04-01'].resample('1d').mean().dropna()) < 25:
                continue
        except Exception:
            continue

        res = pd.DataFrame(meta.copy()).transpose()
        col = meta.ease_col
        row = meta.ease_row
        colg = col + tg.loc['domain', 'i_offg']  # col / lon
        rowg = row + tg.loc['domain', 'j_offg']  # row / lat

        # Map the domain grid cell to SMAP / ASCAT grid point indices.
        tmp_lut = lut[(lut.ease2_col == colg) & (lut.ease2_row == rowg)]
        if len(tmp_lut) == 0:
            continue
        gpi_smap = tmp_lut.index.values[0]
        gpi_ascat = tmp_lut.ascat_gpi.values[0]

        try:
            ts_ascat = ascat.read(gpi_ascat, resample_time=False).resample('1d').mean().dropna()
            ts_ascat = ts_ascat[~ts_ascat.index.duplicated(keep='first')]
            ts_ascat.name = 'ASCAT'
        except Exception:
            continue

        ts_smap = smap.read(gpi_smap)

        if (ts_ascat is None) | (ts_smap is None):
            continue

        # Model series: keep only snow-free, unfrozen (> 277.15 K) time steps;
        # shift model timestamps by 2 hours to the analysis convention.
        ind = (ds_ol['snow_mass'][:, row, col].values == 0) & (ds_ol['soil_temp_layer1'][:, row, col].values > 277.15)
        ts_ol = ds_ol['sm_surface'][:, row, col].to_series().loc[ind].dropna()
        ts_ol.index += pd.to_timedelta('2 hours')

        ind = (ds_da['snow_mass'][:, row, col].values == 0) & (ds_da['soil_temp_layer1'][:, row, col].values > 277.15)
        ts_da = ds_da['sm_surface'][:, row, col].to_series().loc[ind].dropna()
        ts_da.index += pd.to_timedelta('2 hours')

        for mode in modes:
            if mode == 'absolute':
                ts_ins = ts_insitu.copy()
                ts_asc = ts_ascat.copy()
                ts_smp = ts_smap.copy()
                ts_oll = ts_ol.copy()
                ts_daa = ts_da.copy()
            else:
                # Always derive anomalies from the pristine absolute series.
                ts_ins = calc_anom(ts_insitu.copy(), longterm=(mode == 'longterm')).dropna()
                ts_asc = calc_anom(ts_ascat.copy(), longterm=(mode == 'longterm')).dropna()
                ts_smp = calc_anom(ts_smap.copy(), longterm=(mode == 'longterm')).dropna()
                ts_oll = calc_anom(ts_ol.copy(), longterm=(mode == 'longterm')).dropna()
                ts_daa = calc_anom(ts_da.copy(), longterm=(mode == 'longterm')).dropna()

            tmp = pd.DataFrame(dict(zip(names, [ts_ins, ts_asc, ts_smp, ts_oll, ts_daa]))).dropna()
            corr = tmp.corr()

            # Extended collocation: the three model-related series are assumed
            # to have mutually correlated errors.
            ec_res = ecol(tmp[['insitu', 'ascat', 'smap', 'ol', 'da']],
                          correlated=[['smap', 'ol'], ['smap', 'da'], ['ol', 'da']])

            res[f'len_{mode}'] = len(tmp)
            # NOTE(review): corr columns carry no mode suffix, so only the last
            # mode's pairwise correlations survive -- confirm this is intended.
            for c in combs:
                res[f'corr_{"_".join(c)}'] = corr.loc[c]
            res[f'err_corr_smap_ol_{mode}'] = ec_res['err_corr_smap_ol']
            res[f'err_corr_smap_da_{mode}'] = ec_res['err_corr_smap_da']
            res[f'err_corr_ol_da_{mode}'] = ec_res['err_corr_ol_da']

        # Header only for the first station; append afterwards.
        if not result_file.exists():
            res.to_csv(result_file, float_format='%0.4f')
        else:
            res.to_csv(result_file, float_format='%0.4f', mode='a', header=False)
def TCA_insitu_evaluation():
    """Triple-collocation evaluation of three LDAS runs against ASCAT and ISMN.

    For every ISMN station, matches each model run's surface soil moisture
    (snow-free, unfrozen time steps at analysis times only) with ASCAT and
    in situ series, runs triple collocation (TCA), and appends the RMSE and
    scaling (beta) estimates for the station to ``result_file``.
    """
    result_file = r'D:\work\LDAS\2018-06_rmse_uncertainty\TCA_evaluation\validation.csv'

    noDA = LDAS_io('xhourly', 'US_M36_SMOS40_noDA_cal_scaled')
    DA_const_err = LDAS_io('xhourly', 'US_M36_SMOS40_DA_cal_scaled')
    DA_varia_err = LDAS_io('xhourly', 'US_M36_SMOS40_DA_cal_scl_errfile')

    # Analysis time steps of the DA run; model series are restricted to these.
    t_ana = pd.DatetimeIndex(
        LDAS_io('ObsFcstAna', 'US_M36_SMOS40_DA_cal_scaled').timeseries.time.
        values).sort_values()

    ascat = HSAF_io()
    gpi_list = pd.read_csv(
        r"D:\data_sets\ASCAT\warp5_grid\pointlist_warp_conus.csv", index_col=0)

    ismn = ISMN_io(col_offs=noDA.grid.tilegrids.loc['domain', 'i_offg'],
                   row_offs=noDA.grid.tilegrids.loc['domain', 'j_offg'])

    runs = ['noDA', 'DA_const_err', 'DA_varia_err']
    tss = [noDA.timeseries, DA_const_err.timeseries, DA_varia_err.timeseries]

    variables = ['sm_surface', ]
    modes = ['absolute', ]

    for i, (meta, ts_insitu) in enumerate(ismn.iter_stations()):
        logging.info('%i/%i' % (i, len(ismn.list)))
        try:
            res = pd.DataFrame(meta.copy()).transpose()
            col = meta.ease_col
            row = meta.ease_row

            gpi = lonlat2gpi(meta.lon, meta.lat, gpi_list)
            ts_asc = ascat.read(gpi, resample_time=False)
            if ts_asc is None:
                continue
            ts_asc.name = 'ascat'
            ts_asc = pd.DataFrame(ts_asc)

            for var in variables:
                for mode in modes:
                    ts_ins = ts_insitu[var].dropna()
                    ts_ins.name = 'insitu'
                    ts_ins = pd.DataFrame(ts_ins)

                    for run, ts_model in zip(runs, tss):
                        # Snow-free, unfrozen (> 277.15 K) time steps only.
                        ind = (ts_model['snow_mass'][row, col].values == 0) & (
                            ts_model['soil_temp_layer1'][row, col].values > 277.15)
                        ts_mod = ts_model[var][row, col].to_series().loc[ind]
                        ts_mod.index += pd.to_timedelta('2 hours')
                        # Restrict to analysis times so all runs are compared
                        # on the same temporal support.
                        ts_mod = ts_mod.loc[t_ana].dropna()
                        ts_mod.name = 'model'
                        ts_mod = pd.DataFrame(ts_mod)

                        # Nearest-neighbour match ASCAT / in situ to the model
                        # time stamps within +/- 12 h.
                        matched = df_match(ts_mod, ts_asc, ts_ins, window=0.5)
                        data = ts_mod.join(matched[0][['ascat']]).join(
                            matched[1][['insitu']]).dropna()

                        # tc_res: (snr, rmse, beta) triplets in input order.
                        tc_res = TCA(data['model'].values, data['ascat'].values,
                                     data['insitu'].values)

                        res['RMSE_model_' + run + '_' + mode + '_' + var] = tc_res[1][0]
                        res['RMSE_ascat_' + run + '_' + mode + '_' + var] = tc_res[1][1]
                        res['RMSE_insitu_' + run + '_' + mode + '_' + var] = tc_res[1][2]
                        res['beta_ascat_' + run + '_' + mode + '_' + var] = tc_res[2][1]
                        res['beta_insitu_' + run + '_' + mode + '_' + var] = tc_res[2][2]
                        # NOTE: no run in the key -- holds the last run's length.
                        res['len_' + mode + '_' + var] = len(data)

            # Header only for the first station; append afterwards.
            if not os.path.isfile(result_file):
                res.to_csv(result_file, float_format='%0.4f')
            else:
                res.to_csv(result_file, float_format='%0.4f', mode='a', header=False)
        except Exception:
            # Best-effort per station, but log the failure instead of silently
            # swallowing it (the old bare `except:` also caught SystemExit /
            # KeyboardInterrupt).
            logging.exception('Station %s / %s failed' % (meta.network, meta.station))
            continue
def insitu_evaluation():
    """Skill evaluation of three LDAS runs against ISMN in situ soil moisture.

    For every ISMN station, variable and mode, computes sample length,
    significance-screened Pearson correlation, RMSD and unbiased RMSD between
    each model run and the in situ series at analysis times, appending one row
    per station to ``result_file``.
    """
    result_file = r'D:\work\LDAS\2018-06_rmse_uncertainty\insitu_evaluation\validation.csv'

    noDA = LDAS_io('xhourly', 'US_M36_SMOS40_noDA_cal_scaled')
    DA_const_err = LDAS_io('xhourly', 'US_M36_SMOS40_DA_cal_scaled')
    DA_varia_err = LDAS_io('xhourly', 'US_M36_SMOS40_DA_cal_scl_errfile')

    # Analysis time steps of the DA run; comparisons are restricted to these.
    t_ana = pd.DatetimeIndex(
        LDAS_io('ObsFcstAna', 'US_M36_SMOS40_DA_cal_scaled').timeseries.time.
        values).sort_values()

    ismn = ISMN_io(col_offs=noDA.grid.tilegrids.loc['domain', 'i_offg'],
                   row_offs=noDA.grid.tilegrids.loc['domain', 'j_offg'])

    runs = ['noDA', 'DA_const_err', 'DA_varia_err']
    tss = [noDA.timeseries, DA_const_err.timeseries, DA_varia_err.timeseries]

    variables = ['sm_surface', 'sm_rootzone', 'sm_profile']
    # modes = ['absolute','longterm','shortterm']
    modes = ['absolute', ]

    # ismn.list = ismn.list.iloc[101::]
    i = 0
    for meta, ts_insitu in ismn.iter_stations():
        i += 1
        logging.info('%i/%i' % (i, len(ismn.list)))

        res = pd.DataFrame(meta.copy()).transpose()
        col = meta.ease_col
        row = meta.ease_row

        for var in variables:
            for mode in modes:
                if mode == 'absolute':
                    ts_ref = ts_insitu[var].dropna()
                elif mode == 'mean':
                    # NOTE(review): dead branch -- 'mean' is not in `modes`.
                    ts_ref = calc_anomaly(ts_insitu[var], mode).dropna()
                else:
                    ts_ref = calc_anomaly(
                        ts_insitu[var],
                        method='moving_average',
                        longterm=(mode == 'longterm')).dropna()

                for run, ts_model in zip(runs, tss):
                    # Snow-free, unfrozen (> 277.15 K) time steps only.
                    ind = (ts_model['snow_mass'][row, col].values == 0) & (
                        ts_model['soil_temp_layer1'][row, col].values > 277.15)
                    ts_mod = ts_model[var][row, col].to_series().loc[ind]
                    ts_mod.index += pd.to_timedelta('2 hours')
                    # TODO: Make sure that time of netcdf file is correct!!

                    if mode == 'absolute':
                        ts_mod = ts_mod.dropna()
                    else:
                        ts_mod = calc_anomaly(
                            ts_mod,
                            method='moving_average',
                            longterm=mode == 'longterm').dropna()

                    tmp = pd.DataFrame({1: ts_ref, 2: ts_mod}).loc[t_ana, :].dropna()
                    # NOTE: no run in the key -- holds the last run's length.
                    res['len_' + mode + '_' + var] = len(tmp)

                    # Guard: pearsonr raises for fewer than 2 samples.
                    r, p = pearsonr(tmp[1], tmp[2]) if len(tmp) > 2 else (np.nan, np.nan)
                    # Keep R only if positive and significant at the 1% level.
                    res['corr_' + run + '_' + mode + '_' + var] = \
                        r if (r > 0) & (p < 0.01) else np.nan
                    res['rmsd_' + run + '_' + mode + '_' + var] = np.sqrt(
                        ((tmp[1] - tmp[2])**2).mean())
                    res['ubrmsd_' + run + '_' + mode + '_' + var] = np.sqrt(
                        (((tmp[1] - tmp[1].mean()) -
                          (tmp[2] - tmp[2].mean()))**2).mean())

        # Header only for the first station; append afterwards.
        if not os.path.isfile(result_file):
            res.to_csv(result_file, float_format='%0.4f')
        else:
            res.to_csv(result_file, float_format='%0.4f', mode='a', header=False)
def run_ismn_eval():
    """Evaluate several LDAS experiment runs against ISMN in situ data.

    For every station, variable and mode computes length, correlation and
    unbiased RMSD twice: over all matched time steps and over analysis time
    steps only (``ana_*`` columns). One row per station is appended to
    ``result_file``.
    """
    experiments = [['SMOSSMAP', 'short']]

    names = ['open_loop'] + ['MadKF_SMOS40'
                             ] + ['_'.join(exp) for exp in experiments]
    # Run IDs parallel to `names`; the experiment runs follow a common prefix.
    runs = ['US_M36_SMAP_TB_OL_noScl'] + [
        'US_M36_SMOS40_TB_MadKF_DA_it613'
    ] + [f'US_M36_SMAP_TB_DA_scl_{name}' for name in names[2::]]

    dss = [LDAS_io('xhourly', run).timeseries for run in runs]

    result_file = Path(
        '/Users/u0116961/Documents/work/LDAS/2020-03_scaling/validation/ismn_eval.csv'
    )

    # Analysis time steps of the reference (open-loop) run.
    t_ana = pd.DatetimeIndex(
        LDAS_io('ObsFcstAna', runs[0]).timeseries.time.values).sort_values()

    variables = ['sm_surface', 'sm_rootzone', 'sm_profile']
    modes = ['absolute', 'longterm', 'shortterm']

    ismn = ISMN_io()
    # NOTE(review): skips the first 70 stations -- looks like a restart
    # leftover; confirm before a fresh full run.
    ismn.list = ismn.list.iloc[70::]

    i = 0
    for meta, ts_insitu in ismn.iter_stations(surface_only=False):
        i += 1
        logging.info('%i/%i' % (i, len(ismn.list)))

        # Require at least 50 in situ samples within the study period.
        if len(ts_insitu := ts_insitu['2015-04-01':'2020-04-01']) < 50:
            continue

        res = pd.DataFrame(meta.copy()).transpose()
        col = meta.ease_col
        row = meta.ease_row

        for var in variables:
            for mode in modes:
                if mode == 'absolute':
                    ts_ref = ts_insitu[var].dropna()
                else:
                    ts_ref = calc_anom(ts_insitu[var],
                                       longterm=(mode == 'longterm')).dropna()

                for run, ts_model in zip(names, dss):
                    # Snow-free, unfrozen (> 277.15 K) time steps only; model
                    # timestamps shifted by 2 hours.
                    ind = (ts_model['snow_mass'][:, row, col].values == 0) & (
                        ts_model['soil_temp_layer1'][:, row, col].values >
                        277.15)
                    ts_mod = ts_model[var][:, row, col].to_series().loc[ind]
                    ts_mod.index += pd.to_timedelta('2 hours')

                    if mode == 'absolute':
                        ts_mod = ts_mod.dropna()
                    else:
                        ts_mod = calc_anom(
                            ts_mod, longterm=mode == 'longterm').dropna()

                    # --- Metrics over all matched time steps ---
                    tmp = pd.DataFrame({1: ts_ref, 2: ts_mod}).dropna()
                    res['len_' + mode + '_' + var] = len(tmp)
                    r, p = pearsonr(
                        tmp[1], tmp[2]) if len(tmp) > 10 else (np.nan, np.nan)
                    res['r_' + run + '_' + mode + '_' + var] = r
                    # res['p_' + run +'_' + mode + '_' + var] = p
                    # res['rmsd_' + run +'_' + mode + '_' + var] = np.sqrt(((tmp[1]-tmp[2])**2).mean())
                    res['ubrmsd_' + run + '_' + mode + '_' + var] = np.sqrt(
                        (((tmp[1] - tmp[1].mean()) -
                          (tmp[2] - tmp[2].mean()))**2).mean())

                    # --- Same metrics restricted to analysis time steps ---
                    tmp = pd.DataFrame({
                        1: ts_ref,
                        2: ts_mod
                    }).reindex(t_ana).dropna()
                    res['ana_len_' + mode + '_' + var] = len(tmp)
                    r, p = pearsonr(
                        tmp[1], tmp[2]) if len(tmp) > 10 else (np.nan, np.nan)
                    res['ana_r_' + run + '_' + mode + '_' + var] = r
                    # res['ana_p_' + run + '_' + mode + '_' + var] = p
                    # res['ana_rmsd_' + run +'_' + mode + '_' + var] = np.sqrt(((tmp[1]-tmp[2])**2).mean())
                    res['ana_ubrmsd_' + run + '_' + mode + '_' +
                        var] = np.sqrt((((tmp[1] - tmp[1].mean()) -
                                         (tmp[2] - tmp[2].mean()))**2).mean())

        # Header only for the first station; append afterwards.
        if not result_file.exists():
            res.to_csv(result_file, float_format='%0.4f')
        else:
            res.to_csv(result_file, float_format='%0.4f', mode='a', header=False)
def run(part):
    """Compare Kalman-filter variants (KF / EnKF / MadKF) at ISMN stations.

    For the slice of SCAN/USCRN stations assigned to worker ``part`` (1-based,
    of ``parts``), runs an API water-balance model open loop and three
    assimilation variants with differently estimated uncertainties, then
    bootstraps triple-collocation correlations of each analysis against SMOS
    and in situ data. One result row per station is appended to a per-part CSV.
    """
    parts = 15

    smos = SMOS_io()
    ismn = ISMN_io()
    ascat = HSAF_io(ext=None)
    mswep = MSWEP_io()

    # Median Q from MadKF API/CONUS run.
    Q_avg = 12.
    R_avg = 74.

    # Select only SCAN and USCRN
    ismn.list = ismn.list[(ismn.list.network == 'SCAN') |
                          (ismn.list.network == 'USCRN')]
    ismn.list.index = np.arange(len(ismn.list))

    # Split station list in 4 parts for parallelization
    subs = (np.arange(parts + 1) * len(ismn.list) / parts).astype('int')
    subs[-1] = len(ismn.list)
    start = subs[part - 1]
    end = subs[part]
    ismn.list = ismn.list.iloc[start:end, :]

    # Platform-dependent output location.
    if platform.system() == 'Windows':
        result_file = os.path.join('D:', 'work', 'MadKF', 'CONUS', 'ismn_eval',
                                   'result_part%i.csv' % part)
    elif platform.system() == 'Linux':
        result_file = os.path.join('/', 'scratch', 'leuven', '320', 'vsc32046',
                                   'output', 'MadKF', 'CONUS', 'ismn_eval',
                                   'result_part%i.csv' % part)
    else:
        result_file = os.path.join('/', 'work', 'MadKF', 'CONUS', 'ismn_eval',
                                   'parts2', 'result_part%i.csv' % part)

    dt = ['2010-01-01', '2015-12-31']

    for cnt, (station, insitu) in enumerate(ismn.iter_stations(surf_depth=0.1)):
        # station = ismn.list.loc[978,:]
        # insitu = ismn.read_first_surface_layer('SCAN','Los_Lunas_Pmc')
        print('%i / %i' % (cnt, len(ismn.list)))
        # if True:
        try:
            # Collocate MSWEP precipitation, ASCAT and SMOS at the station.
            gpi = lonlat2gpi(station.lon, station.lat, mswep.grid)
            mswep_idx = mswep.grid.index[mswep.grid.dgg_gpi == gpi][0]
            smos_gpi = mswep.grid.loc[mswep_idx, 'smos_gpi']

            precip = mswep.read(mswep_idx)
            sm_ascat = ascat.read(gpi)
            sm_smos = smos.read(smos_gpi) * 100.

            if (precip is None) | (sm_ascat is None) | (sm_smos is None) | (
                    insitu is None):
                continue

            # Short-term (moving-average) anomalies over the study period.
            precip = calc_anomaly(precip[dt[0]:dt[1]],
                                  method='moving_average',
                                  longterm=False)
            sm_ascat = calc_anomaly(sm_ascat[dt[0]:dt[1]],
                                    method='moving_average',
                                    longterm=False)
            sm_smos = calc_anomaly(sm_smos[dt[0]:dt[1]],
                                   method='moving_average',
                                   longterm=False)
            insitu = calc_anomaly(insitu[dt[0]:dt[1]].resample('1d').first(),
                                  method='moving_average',
                                  longterm=False).tz_localize(None)

            # Columns: 1 = forcing, 2 = ASCAT obs, 3 = SMOS, 4 = in situ.
            df = pd.DataFrame({
                1: precip,
                2: sm_ascat,
                3: sm_smos,
                4: insitu
            }, index=pd.date_range(dt[0], dt[1]))
            df.loc[np.isnan(df[1]), 1] = 0.  # missing forcing -> zero precip
            n = len(df)

            if len(df.dropna()) < 50:
                continue

            gamma = mswep.grid.loc[mswep_idx, 'gamma']
            api = API(gamma=gamma)

            # --- OL run ---
            x_OL = np.full(n, np.nan)
            model = deepcopy(api)
            for t, f in enumerate(precip.values):
                x = model.step(f)
                x_OL[t] = x

            # ----- Calculate uncertainties -----
            # convert (static) forcing to model uncertainty
            P_avg = Q_avg / (1 - gamma**2)

            # calculate TCA based uncertainty and scaling coefficients
            tmp_df = pd.DataFrame({
                1: x_OL,
                2: sm_ascat,
                3: sm_smos
            }, index=pd.date_range(dt[0], dt[1])).dropna()
            snr, r_tc, err, beta = tc(tmp_df)
            P_TC = err[0]**2
            Q_TC = P_TC * (1 - gamma**2)
            R_TC = (err[1] / beta[1])**2
            H_TC = beta[1]

            # Calculate RMSD based uncertainty
            R_rmsd = (np.nanmean(
                (tmp_df[1].values - H_TC * tmp_df[2].values)**2) - P_avg)
            if R_rmsd < 0:
                R_rmsd *= -1
            # -----------------------------------

            # ----- Run KF using TCA-based uncertainties -----
            api_kf = API(gamma=gamma, Q=Q_TC)
            x_kf, P, R_innov_kf, checkvar_kf, K_kf = \
                KF(api_kf, df[1].values.copy(), df[2].values.copy(), R_TC, H=H_TC)

            # ----- Run EnKF using static uncertainties -----
            forc_pert = ['normal', 'additive', Q_avg]
            obs_pert = ['normal', 'additive', R_avg]
            x_avg, P, R_innov_avg, checkvar_avg, K_avg = \
                EnKF(api, df[1].values.copy(), df[2].values.copy(), forc_pert, obs_pert, H=H_TC, n_ens=50)

            # ----- Run EnKF using RMSD-based uncertainties (corrected for model uncertainty) -----
            # forc_pert = ['normal', 'additive', Q_avg]
            # obs_pert = ['normal', 'additive', R_rmsd]
            # x_rmsd, P, R_innov_rmsd, checkvar_rmsd, K_rmsd = \
            #     EnKF(api, df[1].values.copy(), df[2].values.copy(), forc_pert, obs_pert, H=H_TC, n_ens=50)

            # ----- Run MadKF -----
            # Retry (up to 5 times) until the innovation variance check is
            # close to 1; keep the best attempt.
            # NOTE(review): `cnt` shadows the enumerate station counter here.
            cnt = 0
            checkvar_madkf = 9999.
            while ((checkvar_madkf < 0.95) |
                   (checkvar_madkf > 1.05)) & (cnt < 5):
                cnt += 1
                tmp_x_madkf, P_madkf, R_madkf, Q_madkf, H_madkf, R_innov_madkf, tmp_checkvar_madkf, K_madkf = \
                    MadKF(api, df[1].values.copy(), df[2].values.copy(), n_ens=100, n_iter=20)
                if abs(1 - tmp_checkvar_madkf) < abs(1 - checkvar_madkf):
                    checkvar_madkf = tmp_checkvar_madkf
                    x_madkf = tmp_x_madkf

            df['x_ol'] = x_OL
            df['x_kf'] = x_kf
            df['x_avg'] = x_avg
            # df['x_rmsd'] = x_rmsd
            df['x_madkf'] = x_madkf

            # tc_ol = tc(df[[4,3,'x_ol']])
            # tc_kf = tc(df[[4,3,'x_kf']])
            # tc_avg = tc(df[[4,3,'x_avg']])
            # tc_rmsd = tc(df[[4,3,'x_rmsd']])
            # tc_madkf = tc(df[[4,3,'x_madkf']])

            # Bootstrapped TC correlation confidence intervals
            # (lower / median / upper) for each analysis.
            ci_l_ol, ci_m_ol, ci_u_ol = bootstrap_tc(df[[4, 3, 'x_ol']])
            ci_l_kf, ci_m_kf, ci_u_kf = bootstrap_tc(df[[4, 3, 'x_kf']])
            ci_l_avg, ci_m_avg, ci_u_avg = bootstrap_tc(df[[4, 3, 'x_avg']])
            # ci_l_rmsd, ci_m_rmsd, ci_u_rmsd = bootstrap_tc(df[[4,3,'x_rmsd']])
            ci_l_madkf, ci_m_madkf, ci_u_madkf = bootstrap_tc(
                df[[4, 3, 'x_madkf']])

            corr = df.dropna().corr()
            n_all = len(df.dropna())

            result = pd.DataFrame(
                {
                    'lon': station.lon,
                    'lat': station.lat,
                    'network': station.network,
                    'station': station.station,
                    'gpi': gpi,
                    'n_all': n_all,
                    'Q_est_madkf': Q_madkf,
                    'R_est_madkf': R_madkf,
                    'corr_ol': corr[4]['x_ol'],
                    'corr_kf': corr[4]['x_kf'],
                    'corr_avg': corr[4]['x_avg'],
                    # 'corr_rmsd': corr[4]['x_rmsd'],
                    'corr_madkf': corr[4]['x_madkf'],
                    # 'snr_ol': tc_ol[0][2],
                    # 'snr_kf': tc_kf[0][2],
                    # 'snr_avg': tc_avg[0][2],
                    # 'snr_rmsd': tc_rmsd[0][2],
                    # 'snr_madkf': tc_madkf[0][2],
                    # 'r_ol': tc_ol[1][2],
                    # 'r_kf': tc_kf[1][2],
                    # 'r_avg': tc_avg[1][2],
                    # 'r_rmsd': tc_rmsd[1][2],
                    # 'r_madkf': tc_madkf[1][2],
                    # 'rmse_kf': tc_kf[2][2],
                    # 'rmse_avg': tc_avg[2][2],
                    # 'rmse_rmsd': tc_rmsd[2][2],
                    # 'rmse_madkf': tc_madkf[2][2],
                    # 'rmse_ol': tc_ol[2][2],
                    'r_ol_l': ci_l_ol,
                    'r_ol_m': ci_m_ol,
                    'r_ol_u': ci_u_ol,
                    'r_kf_l': ci_l_kf,
                    'r_kf_m': ci_m_kf,
                    'r_kf_u': ci_u_kf,
                    'r_avg_l': ci_l_avg,
                    'r_avg_m': ci_m_avg,
                    'r_avg_u': ci_u_avg,
                    # 'r_rmsd_l': ci_l_rmsd,
                    # 'r_rmsd_m': ci_m_rmsd,
                    # 'r_rmsd_u': ci_u_rmsd,
                    'r_madkf_l': ci_l_madkf,
                    'r_madkf_m': ci_m_madkf,
                    'r_madkf_u': ci_u_madkf,
                    'checkvar_kf': checkvar_kf,
                    'checkvar_avg': checkvar_avg,
                    # 'checkvar_rmsd': checkvar_rmsd,
                    'checkvar_madkf': checkvar_madkf,
                    'R_innov_kf': R_innov_kf,
                    'R_innov_avg': R_innov_avg,
                    # 'R_innov_rmsd': R_innov_rmsd,
                    'R_innov_madkf': R_innov_madkf
                },
                index=(station.name, ))

            # Header only for the first station; append afterwards.
            if (os.path.isfile(result_file) == False):
                result.to_csv(result_file, float_format='%0.4f')
            else:
                result.to_csv(result_file, float_format='%0.4f', mode='a', header=False)
        except:
            # NOTE(review): bare except silently skips any failure (including
            # KeyboardInterrupt) -- consider `except Exception` with logging.
            print('GPI failed.')
            continue

    ascat.close()
    mswep.close()
def plot_suspicious_stations(root):
    """Plot OL / DA / in situ time series for stations with inconsistent skill metrics.

    Iterates over stations in a pre-computed "suspicious" list (TCA R2 dropped
    while Pearson R2 improved), re-derives matched open-loop, DA, ASCAT and in
    situ series per mode/variable, and saves a line plot for each case where
    the triplet-matched Pearson skill difference is non-negative.

    NOTE(review): the `root` parameter is never used.
    """
    statlist = pd.read_csv('/Users/u0116961/Documents/work/MadKF/CLSM/suspicious_stations/station_list_r_diff.csv', index_col=0)

    # NOTE(review): rmsd_exps is computed but never used below.
    rmsd_root = 'US_M36_SMAP_TB_DA_SM_PROXY_'
    rmsd_exps = list(np.sort([x.name.split(rmsd_root)[1] for x in Path('/Users/u0116961/data_sets/LDASsa_runs').glob('*SM_PROXY*')]))

    ds_ol = LDAS_io('xhourly', 'US_M36_SMAP_TB_OL_scaled_4K_obserr').timeseries
    ds_da = LDAS_io('xhourly', 'US_M36_SMAP_TB_DA_scaled_4K_obserr').timeseries
    ts_ana = LDAS_io('ObsFcstAna', 'US_M36_SMAP_TB_DA_scaled_4K_obserr').timeseries['obs_obs']
    t_ana = pd.DatetimeIndex(ts_ana.time.values).sort_values()

    ascat = HSAF_io()
    gpi_list = pd.read_csv(ascat.root / 'warp5_grid' / 'pointlist_warp_conus.csv', index_col=0)

    ismn = ISMN_io()

    variables = ['sm_surface', 'sm_rootzone']
    modes = ['absolute', 'longterm', 'shortterm']

    # Restrict the ISMN list to the suspicious stations (index = network_station).
    ismn.list.index = ismn.list.network + '_' + ismn.list.station
    # NOTE(review): next line is a no-op (result discarded); the following
    # line performs the actual reindex.
    ismn.list.reindex(statlist.index)
    ismn.list = ismn.list.reindex(statlist.index)

    for i, (meta, ts_insitu) in enumerate(ismn.iter_stations(surface_only=False)):
        # NOTE(review): `tmp_res` is never assigned in this function, so this
        # resume-guard is never triggered.
        if 'tmp_res' in locals():
            if (meta.network in tmp_res) & (meta.station in tmp_res):
                print(f'Skipping {i}')
                continue
        try:
            res = pd.DataFrame(meta.copy()).transpose()
            col = meta.ease_col
            row = meta.ease_row

            gpi = lonlat2gpi(meta.lon, meta.lat, gpi_list)
            # Rescale ASCAT degree-of-saturation to volumetric units
            # (presumably porosity 0.6 -- TODO confirm).
            ts_ascat = ascat.read(gpi) / 100 * 0.6
            if ts_ascat is None:
                continue

            for mode in modes:
                for var in variables:
                    # Only plot cases where TCA R2 decreased but Pearson R2
                    # increased (the "suspicious" signature).
                    tmp = statlist[(statlist.network==meta.network)&(statlist.station==meta.station)]
                    dpr = tmp[f'diff_pearsonr2_{mode}_{var}'].values[0]
                    dtr = tmp[f'diff_tcar2_{mode}_{var}'].values[0]
                    if not ((dtr < 0) & (dpr > 0)):
                        continue

                    if mode == 'absolute':
                        ts_asc = ts_ascat.dropna()
                    else:
                        ts_asc = calc_anom(ts_ascat, longterm=(mode == 'longterm')).dropna()
                    ts_asc.name = 'ascat'
                    ts_asc = pd.DataFrame(ts_asc)

                    if mode == 'absolute':
                        ts_ins = ts_insitu[var].dropna()
                    else:
                        ts_ins = calc_anom(ts_insitu[var], longterm=(mode == 'longterm')).dropna()
                    ts_ins.name = 'insitu'
                    ts_ins = pd.DataFrame(ts_ins)

                    # Open loop: snow-free, unfrozen steps, shifted 2 h,
                    # restricted to times with at least one assimilated obs.
                    ind = (ds_ol['snow_mass'].isel(lat=row, lon=col).values == 0) & \
                          (ds_ol['soil_temp_layer1'].isel(lat=row, lon=col).values > 277.15)
                    ts_ol = ds_ol[var].isel(lat=row, lon=col).to_series().loc[ind]
                    ts_ol.index += pd.to_timedelta('2 hours')
                    ind_obs = np.bitwise_or.reduce(~np.isnan(ts_ana[:, :, row, col].values), 1)
                    if mode == 'absolute':
                        ts_ol = ts_ol.reindex(t_ana[ind_obs]).dropna()
                    else:
                        ts_ol = calc_anom(ts_ol.reindex(t_ana[ind_obs]), longterm=(mode == 'longterm')).dropna()
                    ts_ol.name = 'open_loop'
                    ts_ol = pd.DataFrame(ts_ol)

                    # DA analysis: same screening as the open loop.
                    ind = (ds_da['snow_mass'].isel(lat=row, lon=col).values == 0) & \
                          (ds_da['soil_temp_layer1'].isel(lat=row, lon=col).values > 277.15)
                    ts_da = ds_da[var].isel(lat=row, lon=col).to_series().loc[ind]
                    ts_da.index += pd.to_timedelta('2 hours')
                    ind_obs = np.bitwise_or.reduce(~np.isnan(ts_ana[:, :, row, col].values), 1)
                    if mode == 'absolute':
                        ts_da = ts_da.reindex(t_ana[ind_obs]).dropna()
                    else:
                        ts_da = calc_anom(ts_da.reindex(t_ana[ind_obs]), longterm=(mode == 'longterm')).dropna()
                    ts_da.name = 'DA_4K'
                    ts_da = pd.DataFrame(ts_da)

                    # Match all series to the open-loop time stamps (+/- 12 h).
                    matched = df_match(ts_ol, ts_da, ts_asc, ts_ins, window=0.5)
                    data = ts_ol.join(matched[0]['DA_4K']).join(matched[1]['ascat']).join(matched[2]['insitu']).dropna()

                    # Skip if DA skill (vs in situ) did not improve over OL.
                    dpr_triplets = data.corr()['DA_4K']['insitu'] - data.corr()['open_loop']['insitu']
                    if dpr_triplets < 0:
                        continue

                    f = plt.figure(figsize=(15, 5))
                    sns.lineplot(data=data[['open_loop', 'DA_4K', 'insitu']], dashes=False, linewidth=1.5, axes=plt.gca())
                    plt.title(f'{meta.network} / {meta.station} ({var}): d(Pearson R2) = {dpr_triplets:.3f} , d(TCA R2) = {dtr:.3f}')

                    fbase = Path('/Users/u0116961/Documents/work/MadKF/CLSM/suspicious_stations/timeseries')
                    fname = fbase / f'{mode}_{var}_{meta.network}_{meta.station}.png'
                    f.savefig(fname, dpi=300, bbox_inches='tight')
                    plt.close()
        except:
            # NOTE(review): bare except silently skips any failure (including
            # KeyboardInterrupt) -- consider `except Exception` with logging.
            continue
def run(part):
    """Estimate in situ R2 of CCI v04.4 products via triple collocation.

    For the slice of the ISMN station list assigned to worker ``part``
    (1-based, of ``parts``), matches CCI ACTIVE and PASSIVE products with
    in situ data (absolute values and anomalies) and stores the TC-derived
    in situ correlation sqrt(R2) per station in a per-part CSV, checkpointed
    after every station.
    """
    parts = 6

    result_file = r'D:\work\ESA_CCI_SM\ismn_r2\ismn_r2_part%i.csv' % part

    cci = CCISM_io()
    ismn = ISMN_io()

    # ismn.list = ismn.list.iloc[100:120]

    # Split station list in 4 parts for parallelization
    subs = (np.arange(parts + 1) * len(ismn.list) / parts).astype('int')
    subs[-1] = len(ismn.list)
    start = subs[part - 1]
    end = subs[part]
    ismn.list = ismn.list.iloc[start:end, :]

    freq = ['abs', 'anom']

    res = ismn.list.copy()
    res.drop(['ease_col', 'ease_row'], axis='columns', inplace=True)
    res['r_abs'] = np.nan
    res['r_anom'] = np.nan

    for i, (meta, ts_insitu) in enumerate(ismn.iter_stations()):
        print('%i/%i (Proc %i)' % (i, len(ismn.list), part))

        if ts_insitu is None:
            print('No in situ data for ' + meta.network + ' / ' + meta.station)
            continue
        ts_insitu = ts_insitu['2007-10-01':'2014-12-31']
        if len(ts_insitu) < 10:
            print('No in situ data for ' + meta.network + ' / ' + meta.station)
            continue

        df_insitu = pd.DataFrame(ts_insitu).dropna()
        df_insitu_anom = pd.DataFrame(calc_anomaly(ts_insitu)).dropna()

        df_cci = cci.read(meta.lon, meta.lat, version='v04.4',
                          mode=['ACTIVE', 'PASSIVE']).dropna()
        if len(df_cci) < 10:
            print('No CCI data for ' + meta.network + ' / ' + meta.station)
            continue

        for f in freq:
            if f == 'abs':
                matched = df_match(df_cci, df_insitu, window=0.5)
            else:
                # NOTE: converts df_cci columns to anomalies IN PLACE, so the
                # 'anom' pass must follow the 'abs' matching above.
                df_cci.loc[:, 'ACTIVE_v04.4'] = calc_anomaly(
                    df_cci['ACTIVE_v04.4'])
                df_cci.loc[:, 'PASSIVE_v04.4'] = calc_anomaly(
                    df_cci['PASSIVE_v04.4'])
                df_cci.dropna(inplace=True)
                if (len(df_cci) < 10) | (len(df_insitu_anom) < 10):
                    print('No in situ or CCI anomaly data for ' +
                          meta.network + ' / ' + meta.station)
                    continue
                matched = df_match(df_cci, df_insitu_anom, window=0.5)

            data = df_cci.join(matched['insitu']).dropna()
            if len(data) < 100:
                continue

            # Require all pairwise correlations positive and significant at
            # the 5% level before trusting the TC estimate.
            vals = data[['insitu', 'ACTIVE_v04.4']].values
            c1, p1 = pearsonr(vals[:, 0], vals[:, 1])
            vals = data[['insitu', 'PASSIVE_v04.4']].values
            c2, p2 = pearsonr(vals[:, 0], vals[:, 1])
            vals = data[['ACTIVE_v04.4', 'PASSIVE_v04.4']].values
            c3, p3 = pearsonr(vals[:, 0], vals[:, 1])
            if (c1 < 0) | (c2 < 0) | (c3 < 0) | (p1 > 0.05) | (p2 > 0.05) | (
                    p3 > 0.05):
                continue

            # TC returns (snr, r, err, ...) triplets; [1][2] is the in situ
            # correlation term -- sqrt gives the correlation estimate.
            res.loc[meta.name, 'r_' + f] = np.sqrt(tc(data)[1][2])

        # Checkpoint: rewrite the (cumulative) result table after each station.
        res.to_csv(result_file, float_format='%0.4f')