def run(): """ This in the main routine that parallelizes the validation """ # The processing will be parallelized on 30 kernels parts = 30 # Confidence intervals will be calculated at a 80% confidence level alpha = 0.80 # The validation will be done using all available sensors. sensors = ['ASCAT', 'SMOS', 'SMAP', 'MERRA2', 'ISMN'] res_path = Paths().result_root / ('CI%i' % (alpha * 100)) / ('_'.join(sensors)) if not res_path.exists(): res_path.mkdir(parents=True) # Parallelized processing p = Pool(parts) arg1 = np.arange(parts) + 1 arg2 = repeat(parts, parts) arg3 = repeat(sensors, parts) arg4 = repeat(alpha, parts) arg5 = repeat(res_path, parts) p.starmap(main, zip(arg1, arg2, arg3, arg4, arg5)) # merge in parallel generated results into one single result file. merge_result_files(res_path)
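# Minimal entry-point sketch (an assumption, not part of the original module):
# the parallel validation is typically launched by simply calling run() from a
# script guard, which is also required for multiprocessing.Pool on some platforms.
if __name__ == '__main__':
    run()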
def resample_ascat(): """ This resamples ASCAT data from the DGG grid onto the EASE2 grid and stores data for each grid cell into .csv files. A grid look-up table needs to be created first (method: ancillary.grid.create_lut). """ paths = Paths() # get a list of all CONUS gpis gpi_lut = pd.read_csv(paths.lut, index_col=0)[['ascat_gpi']] io = HSAF_io() # Store NN of EASE2 grid points into CSV files dir_out = paths.ascat / 'timeseries' if not dir_out.exists(): dir_out.mkdir() for gpi, lut in gpi_lut.iterrows(): Ser = io.read(lut['ascat_gpi']) if Ser is not None: Ser = Ser['2015-01-01':'2018-12-31'] if len(Ser) > 10: Ser.index = Ser.index.round( 'min') # round time steps to full minutes. fname = dir_out / ('%i.csv' % gpi) Ser.to_csv(fname, float_format='%.4f')
def generate_station_list(): """ This routine generates a list of available ISMN stations and the EASEv2 grid point they are located in. """ paths = Paths() io = ISMN_Interface(paths.ismn_raw) # get metadata indices of all stations that measure soil moisture within the first 10 cm idx = io.get_dataset_ids('soil moisture', min_depth=0.0, max_depth=0.1) df = pd.DataFrame({'network': io.metadata[idx]['network'], 'station': io.metadata[idx]['station'], 'lat': io.metadata[idx]['latitude'], 'lon': io.metadata[idx]['longitude'], 'ease2_gpi': np.zeros(len(idx)).astype('int')}, index=idx) # merge indices for stations that have multiple sensors within the first 10 cm duplicate_idx = df.groupby(df.columns.tolist()).apply(lambda x: '-'.join(['%i'% i for i in x.index])).values df.drop_duplicates(inplace=True) df.index = duplicate_idx # create EASEv2 grid domain grid = EASE2() lons, lats = np.meshgrid(grid.ease_lons, grid.ease_lats) lons = lons.flatten() lats = lats.flatten() # find EASEv2 grid points in which the individual stations are located for i, (idx, data) in enumerate(df.iterrows()): print('%i / %i' % (i, len(df))) r = (lons - data.lon) ** 2 + (lats - data.lat) ** 2 df.loc[idx, 'ease2_gpi'] = np.where((r - r.min()) < 0.0001)[0][0] df.to_csv(paths.ismn / 'station_list.csv')
def generate_station_list():
    paths = Paths()
    io = ISMN_Interface(paths.ismn / 'downloaded' / 'CONUS_20100101_20190101')

    # get metadata indices of all stations that measure soil moisture within the first 10 cm
    idx = io.get_dataset_ids('soil moisture', min_depth=0.0, max_depth=0.1)
    df = pd.DataFrame({'network': io.metadata[idx]['network'],
                       'station': io.metadata[idx]['station'],
                       'lat': io.metadata[idx]['latitude'],
                       'lon': io.metadata[idx]['longitude'],
                       'ease2_gpi': np.zeros(len(idx)).astype('int')}, index=idx)

    # merge indices for stations that have multiple sensors within the first 10 cm
    duplicate_idx = df.groupby(df.columns.tolist()).apply(
        lambda x: '-'.join(['%i' % i for i in x.index])).values
    df.drop_duplicates(inplace=True)
    df.index = duplicate_idx

    grid = EASE2()
    lons, lats = np.meshgrid(grid.ease_lons, grid.ease_lats)
    lons = lons.flatten()
    lats = lats.flatten()

    for i, (idx, data) in enumerate(df.iterrows()):
        print('%i / %i' % (i, len(df)))
        r = (lons - data.lon) ** 2 + (lats - data.lat) ** 2
        df.loc[idx, 'ease2_gpi'] = np.where((r - r.min()) < 0.0001)[0][0]

    df.to_csv(paths.ismn / 'station_list.csv')
def reformat_smap(): """ This extracts raw SMAP EASEv2 data and stores it into .csv files for later processing. A grid look-up table needs to be created first (method: ancillary.grid.create_lut). """ paths = Paths() # generate idx. array to map ease col/row to gpi n_row = 406 n_col = 964 idx_arr = np.arange(n_row * n_col, dtype='int64').reshape((n_row, n_col)) # get a list of all CONUS gpis ease_gpis = pd.read_csv(paths.lut, index_col=0).index.values # Collect orbit file list and extract date info from file name fdir = paths.smap_raw files = sorted(fdir.glob('*')) dates = pd.to_datetime([str(f)[-29:-14] for f in files]).round('min') # Array with ALL possible dates and ALL CONUS gpis res_arr = np.full((len(dates), len(ease_gpis)), np.nan) # Fill in result array from orbit files for i, f in enumerate(files): print("%i / %i" % (i, len(files))) tmp = h5py.File(fdir / f) row = tmp['Soil_Moisture_Retrieval_Data']['EASE_row_index'][:] col = tmp['Soil_Moisture_Retrieval_Data']['EASE_column_index'][:] idx = idx_arr[row, col] # Check for valid data within orbit files for res_ind, gpi in enumerate(ease_gpis): sm_ind = np.where(idx == gpi)[0] if len(sm_ind) > 0: qf = tmp['Soil_Moisture_Retrieval_Data'][ 'retrieval_qual_flag'][sm_ind[0]] if (qf == 0) | (qf == 8): res_arr[i, res_ind] = tmp['Soil_Moisture_Retrieval_Data'][ 'soil_moisture'][sm_ind[0]] tmp.close() # Write out valid time series of all CONIS GPIS into separate .csv files dir_out = paths.smap / 'timeseries' if not dir_out.exists(): dir_out.mkdir() for i, gpi in enumerate(ease_gpis): Ser = pd.Series(res_arr[:, i], index=dates).dropna() if len(Ser) > 0: Ser = Ser.groupby( Ser.index).last() # Make sure that no time duplicates exist! fname = dir_out / ('%i.csv' % gpi) Ser.to_csv(fname, float_format='%.4f')
def __init__(self, sensors=None):

    if sensors is None:
        self.sensors = ['ASCAT', 'SMOS', 'SMAP', 'MERRA2', 'ISMN']
    else:
        self.sensors = sensors

    self.root = Paths().data_root
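# Minimal usage sketch (an assumption, not part of the original class): a reader
# instance is queried per EASE2 grid point and returns the collocation modes and
# the corresponding matched data frames, as used in main() further below.
def _example_read(gpi):
    # 'gpi' is a hypothetical EASE2 grid point index taken from the look-up table
    io = reader(['ASCAT', 'SMOS', 'MERRA2', 'ISMN'])
    mode, dfs = io.read(gpi)
    return dict(zip(mode, dfs))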
def resample_timeseries():
    paths = Paths()
    io = ISMN_Interface(paths.ismn / 'downloaded' / 'CONUS_20100101_20190101')

    # get all stations / sensors for each grid cell.
    lut = pd.read_csv(paths.ismn / 'station_list.csv', index_col=0)
    lut = lut.groupby('ease2_gpi').apply(lambda x: '-'.join([i for i in x.index]))

    dir_out = paths.ismn / 'timeseries'

    for cnt, (gpi, indices) in enumerate(lut.items()):
        print('%i / %i' % (cnt, len(lut)))
        fname = dir_out / ('%i.csv' % gpi)
        idx = indices.split('-')

        # Only one station within grid cell
        if len(idx) == 1:
            try:
                ts = io.read_ts(int(idx[0]))
                ts = ts[ts['soil moisture_flag'] == 'G']['soil moisture']
                ts.tz_convert(None).to_csv(fname, float_format='%.4f')
            except:
                print('Corrupt file: ' + io.metadata[int(idx[0])]['filename'])

        # Multiple stations within grid cell
        else:
            df = []
            for i in idx:
                try:
                    ts = io.read_ts(int(i))
                    df += [ts[ts['soil moisture_flag'] == 'G']['soil moisture']]
                except:
                    print('Corrupt file: ' + io.metadata[int(i)]['filename'])
            if len(df) == 0:
                continue
            df = pd.concat(df, axis=1)
            df.columns = np.arange(len(df.columns))

            # match temporal mean and standard deviation to those of the station with the maximum temporal coverage
            n = np.array([len(df[i].dropna()) for i in df])
            ref = np.where(n == n.max())[0][0]
            for col in df:
                if col != ref:
                    df[col] = (df[col] - df[col].mean()) / df[col].std() * df[ref].std() + df[ref].mean()

            # Average measurements of all stations
            df.mean(axis='columns').tz_convert(None).to_csv(fname, float_format='%.4f')
def resample_merra2(part=1, parts=1):
    """
    This resamples MERRA-2 data from the MERRA grid onto the EASE2 grid and stores data
    for each grid cell into .csv files.

    A grid look-up table needs to be created first (method: ancillary.grid.create_lut).

    Parameters
    ----------
    part : int
        Data subset to be processed - data can be resampled in subsets for parallelization
        to speed up the processing.
    parts : int
        Number of parts in which to split the data for parallel processing.
        By default, all data are resampled at once.
    """

    paths = Paths()

    dir_out = paths.merra2 / 'timeseries'
    if not dir_out.exists():
        dir_out.mkdir()

    path = paths.merra2_raw
    files = np.array(sorted(path.glob('*')))

    ds = xr.open_mfdataset(files)
    lats = ds.lat.values
    lons = ds.lon.values
    dates = pd.to_datetime(ds.time.values)

    # get a list of all CONUS gpis
    gpi_lut = pd.read_csv(paths.lut, index_col=0)[['merra2_lon', 'merra2_lat']]

    # split domain for parallelization
    subs = (np.arange(parts + 1) * len(gpi_lut) / parts).astype('int')
    subs[-1] = len(gpi_lut)
    start = subs[part - 1]
    end = subs[part]
    gpi_lut = gpi_lut.iloc[start:end, :]

    # find and write all EASE2 NN grid points
    for i, (gpi, lut) in enumerate(gpi_lut.iterrows()):
        print("%i / %i" % (i, len(gpi_lut)))

        ind_lat = np.where(lats == lut['merra2_lat'])[0][0]
        ind_lon = np.where(lons == lut['merra2_lon'])[0][0]

        ts = ds['TSOIL1'][:, ind_lat, ind_lon] - 273.15
        swe = ds['SNOMAS'][:, ind_lat, ind_lon]
        ind_valid = ((ts >= 4) & (swe == 0)).values

        Ser = pd.Series(ds['SFMC'][ind_valid, ind_lat, ind_lon].values, index=dates[ind_valid])
        fname = dir_out / ('%i.csv' % gpi)
        Ser.to_csv(fname, float_format='%.4f')
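# Minimal sketch of how the part/parts arguments could be used to run the MERRA-2
# resampling in parallel (an assumption: this mirrors the Pool/starmap pattern of
# run() and is not part of the original module).
def run_resample_merra2_parallel(parts=4):
    from multiprocessing import Pool
    from itertools import repeat
    with Pool(parts) as p:
        # each worker processes one of the <parts> subsets of the grid cell list
        p.starmap(resample_merra2, zip(np.arange(parts) + 1, repeat(parts, parts)))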
def merge_result_files():
    sensors = ['ASCAT', 'SMOS', 'MERRA2', 'ISMN']
    # sensors = ['ASCAT', 'SMOS', 'SMAP', 'MERRA2', 'ISMN']

    paths = Paths()
    path = paths.result_root / ('_'.join(sensors))

    files = list(path.glob('**/*.csv'))

    result = pd.DataFrame()
    for f in files:
        tmp = pd.read_csv(f, index_col=0)
        result = pd.concat((result, tmp))
        f.unlink()

    result.sort_index().to_csv(path / 'result.csv', float_format='%0.3f')
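# Minimal sketch for loading the merged result file afterwards (an assumption,
# not part of the original module; the path mirrors the one built in
# merge_result_files() above).
def load_merged_results(sensors=('ASCAT', 'SMOS', 'MERRA2', 'ISMN')):
    path = Paths().result_root / ('_'.join(sensors))
    return pd.read_csv(path / 'result.csv', index_col=0)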
def reshuffle_ascat():
    paths = Paths()

    # get a list of all CONUS gpis
    gpi_lut = pd.read_csv(paths.lut, index_col=0)[['ascat_gpi']]

    io = HSAF_io()

    # Store NN of EASE2 grid points in .csv files
    dir_out = paths.ascat / 'resampled'

    for gpi, lut in gpi_lut.iterrows():
        Ser = io.read(lut['ascat_gpi'])
        if Ser is not None:
            Ser = Ser['2015-01-01':'2018-12-31']
            if len(Ser) > 10:
                Ser.index = Ser.index.round('min')
                fname = dir_out / ('%i.csv' % gpi)
                Ser.to_csv(fname, float_format='%.4f')
def __init__(self, version='h113', ext='h114'):
    paths = Paths()

    self.data_path = paths.ascat / version
    self.version = version.upper()

    grid = Dataset(paths.ascat / 'warp5_grid' / 'TUW_WARP5_grid_info_2_2.nc')
    self.gpis = grid['gpi'][:][grid['land_flag'][:] == 1]
    self.cells = grid['cell'][:][grid['land_flag'][:] == 1]
    grid.close()

    self.loaded_cell = None
    self.fid = None

    if ext is not None:
        self.ext = HSAF_io(version=ext, ext=None)
    else:
        self.ext = None
def generate_plots():
    sensors = ['ASCAT', 'SMOS', 'SMAP', 'MERRA2', 'ISMN']
    path = Paths().result_root / 'CI80' / ('_'.join(sensors))

    if not (path / 'plots').exists():
        (path / 'plots').mkdir()

    spatial_plot_n(path)
    spatial_plot_relative_metrics_ci_diff(path)
    boxplot_relative_metrics(path)
    spatial_plot_tca_diff(path)
    spatial_plot_tca_ci_diff(path, sensors)
    boxplot_tca(path, sensors)
    boxplot_relative_metrics_ismn(path)
    boxplot_tca_ismn(path, sensors)
def reshuffle_merra2(part=1):
    paths = Paths()

    dir_out = paths.merra2 / 'timeseries'

    path = paths.merra2 / 'raw' / '2015-2018'
    files = np.array(sorted(path.glob('*')))

    ds = xr.open_mfdataset(files)
    lats = ds.lat.values
    lons = ds.lon.values
    dates = pd.to_datetime(ds.time.values)

    # get a list of all CONUS gpis
    gpi_lut = pd.read_csv(paths.lut, index_col=0)[['merra2_lon', 'merra2_lat']]

    # split domain for parallelization
    parts = 2
    subs = (np.arange(parts + 1) * len(gpi_lut) / parts).astype('int')
    subs[-1] = len(gpi_lut)
    start = subs[part - 1]
    end = subs[part]
    gpi_lut = gpi_lut.iloc[start:end, :]

    # find and write all EASE2 NN grid points
    for i, (gpi, lut) in enumerate(gpi_lut.iterrows()):
        print("%i / %i" % (i, len(gpi_lut)))

        ind_lat = np.where(lats == lut['merra2_lat'])[0][0]
        ind_lon = np.where(lons == lut['merra2_lon'])[0][0]

        ts = ds['TSOIL1'][:, ind_lat, ind_lon] - 273.15
        swe = ds['SNOMAS'][:, ind_lat, ind_lon]
        ind_valid = ((ts >= 4) & (swe == 0)).values

        Ser = pd.Series(ds['SFMC'][ind_valid, ind_lat, ind_lon].values, index=dates[ind_valid])
        fname = dir_out / ('%i.csv' % gpi)
        Ser.to_csv(fname, float_format='%.4f')
def main(part, parts, sensors, alpha, res_path):
    """
    This calculates validation statistics for a subset of the study domain.

    Parameters
    ----------
    part : int
        Part of the subset to process
    parts : int
        Number of subsets to divide the study domain into
    sensors : list of str
        Sensors to be considered in the validation
    alpha : float [0,1]
        Confidence level of the confidence intervals
    res_path : pathlib.Path
        Path where to store the result file
    """

    result_file = res_path / ('result_%i.csv' % part)

    lut = pd.read_csv(Paths().lut, index_col=0)

    # Split grid cell list for parallelization
    subs = (np.arange(parts + 1) * len(lut) / parts).astype('int')
    subs[-1] = len(lut)
    start = subs[part - 1]
    end = subs[part]

    # Look-up table that contains the grid cells to iterate over
    lut = lut.iloc[start:end, :]

    io = reader(sensors)

    for cnt, (gpi, data) in enumerate(lut.iterrows()):
        print('%i / %i' % (cnt, len(lut)))

        # Get the template of data fields to store results into
        res = result_template(sensors, gpi)
        res.loc[gpi, 'col'] = data.ease2_col
        res.loc[gpi, 'row'] = data.ease2_row

        try:
            mode, dfs = io.read(gpi, calc_anom_lt=False)

            # Iterate over all data sets (absolute values and anomalies collocated with and without ISMN)
            for m, df in zip(mode, dfs):
                if df is not None:

                    # Only calculate corrected sample size once to speed up processing
                    res.loc[gpi, 'n_corr_' + m + '_tc'] = correct_n(df)

                    # check if current data set contains ISMN data or not.
                    scl = m[0:4]
                    if scl == 'grid':
                        res.loc[gpi, 'n_grid'] = len(df)
                    else:
                        res.loc[gpi, 'n_ismn'] = len(df)

                    b = bias(df, alpha=alpha)
                    R = Pearson_R(df, alpha=alpha, n_corr=b.loc[:, :, 'n_corr'])

                    # rescale all columns to MERRA2 before calculating ubRMSD
                    tmp_df = df.copy()
                    for col in sensors:
                        if (col == 'MERRA2') | (col not in tmp_df):
                            continue
                        tmp_df.loc[:, col] = ((tmp_df[col] - tmp_df[col].mean()) / tmp_df[col].std()) \
                                             * tmp_df['MERRA2'].std() + tmp_df['MERRA2'].mean()
                    ubrmsd = ubRMSD(tmp_df, alpha=alpha, n_corr=b.loc[:, :, 'n_corr'])

                    res.loc[gpi, 'n_' + scl] = len(df)

                    # calculate relative metrics for all pair-wise combinations
                    for t in list(combinations(df.columns.values, 2)):
                        res.loc[gpi, 'n_corr_' + m + '_' + '_'.join(t)] = R.loc[t[0], t[1], 'n_corr']
                        res.loc[gpi, 'bias_' + m + '_l_' + '_'.join(t)] = b.loc[t[0], t[1], 'CI_l_corr']
                        res.loc[gpi, 'bias_' + m + '_p_' + '_'.join(t)] = b.loc[t[0], t[1], 'bias']
                        res.loc[gpi, 'bias_' + m + '_u_' + '_'.join(t)] = b.loc[t[0], t[1], 'CI_u_corr']
                        res.loc[gpi, 'ubrmsd_' + m + '_l_' + '_'.join(t)] = ubrmsd.loc[t[0], t[1], 'CI_l_corr']
                        res.loc[gpi, 'ubrmsd_' + m + '_p_' + '_'.join(t)] = ubrmsd.loc[t[0], t[1], 'ubRMSD']
                        res.loc[gpi, 'ubrmsd_' + m + '_u_' + '_'.join(t)] = ubrmsd.loc[t[0], t[1], 'CI_u_corr']
                        res.loc[gpi, 'r_' + m + '_l_' + '_'.join(t)] = R.loc[t[0], t[1], 'CI_l_corr']
                        res.loc[gpi, 'r_' + m + '_p_' + '_'.join(t)] = R.loc[t[0], t[1], 'R']
                        res.loc[gpi, 'r_' + m + '_u_' + '_'.join(t)] = R.loc[t[0], t[1], 'CI_u_corr']
                        res.loc[gpi, 'p_' + m + '_p_' + '_'.join(t)] = R.loc[t[0], t[1], 'p']

                    # calculate TCA-based metrics for all triplets except those including both SMOS and SMAP
                    for t in list(combinations(df.columns.values, 3)):
                        if ('SMOS' in t) & ('SMAP' in t):
                            continue
                        tcstr = '_tc_' + '_'.join(t)
                        tca = TCA(df[list(t)], alpha=alpha)

                        # calculate TCA only for coarse-resolution data set triplets that have been collocated
                        # without ISMN data
                        if (scl != 'grid') | (t[2] != 'ISMN'):
                            for s in t:
                                res.loc[gpi, 'bias_' + m + '_p_' + s + tcstr] = tca.loc['beta_p', s]
                                res.loc[gpi, 'bias_' + m + '_l_' + s + tcstr] = tca.loc['beta_l', s]
                                res.loc[gpi, 'bias_' + m + '_m_' + s + tcstr] = tca.loc['beta_m', s]
                                res.loc[gpi, 'bias_' + m + '_u_' + s + tcstr] = tca.loc['beta_u', s]
                                res.loc[gpi, 'ubrmse_' + m + '_p_' + s + tcstr] = tca.loc['ubRMSE_p', s]
                                res.loc[gpi, 'ubrmse_' + m + '_l_' + s + tcstr] = tca.loc['ubRMSE_l', s]
                                res.loc[gpi, 'ubrmse_' + m + '_m_' + s + tcstr] = tca.loc['ubRMSE_m', s]
                                res.loc[gpi, 'ubrmse_' + m + '_u_' + s + tcstr] = tca.loc['ubRMSE_u', s]
                                res.loc[gpi, 'r2_' + m + '_p_' + s + tcstr] = tca.loc['r2_p', s]
                                res.loc[gpi, 'r2_' + m + '_l_' + s + tcstr] = tca.loc['r2_l', s]
                                res.loc[gpi, 'r2_' + m + '_m_' + s + tcstr] = tca.loc['r2_m', s]
                                res.loc[gpi, 'r2_' + m + '_u_' + s + tcstr] = tca.loc['r2_u', s]
        except:
            continue

        if not result_file.exists():
            res.to_csv(result_file, float_format='%0.3f')
        else:
            res.to_csv(result_file, float_format='%0.3f', mode='a', header=False)
def run_ascat_eval_part(part, parts, ref='ascat'):
    import numpy as np
    import pandas as pd
    from pathlib import Path
    from scipy.stats import pearsonr
    from pyldas.interface import GEOSldas_io
    from myprojects.readers.ascat import HSAF_io
    from myprojects.timeseries import calc_anom
    from validation_good_practice.ancillary.paths import Paths

    res_path = Path('~/Documents/work/MadKF/CLSM/SM_err_ratio/GEOSldas/validation_all').expanduser()
    if not res_path.exists():
        Path.mkdir(res_path, parents=True)

    result_file = res_path / ('ascat_eval_part%i.csv' % part)

    tc_res_pc = pd.read_csv(
        '/Users/u0116961/Documents/work/MadKF/CLSM/SM_err_ratio/GEOSldas/sm_validation/Pcorr/result.csv',
        index_col=0)
    tc_res_nopc = pd.read_csv(
        '/Users/u0116961/Documents/work/MadKF/CLSM/SM_err_ratio/GEOSldas/sm_validation/noPcorr/result.csv',
        index_col=0)

    lut = pd.read_csv(Paths().lut, index_col=0)

    # Split grid cell list for parallelization
    subs = (np.arange(parts + 1) * len(lut) / parts).astype('int')
    subs[-1] = len(lut)
    start = subs[part - 1]
    end = subs[part]

    # Look-up table that contains the grid cells to iterate over
    lut = lut.iloc[start:end, :]

    root = Path('/Users/u0116961/data_sets/GEOSldas_runs')
    runs = [run.name for run in root.glob('*_DA_SMAP_*')]
    names = [run[20::] for run in runs]
    runs += ['NLv4_M36_US_OL_Pcorr', 'NLv4_M36_US_OL_noPcorr']
    names += ['Pcorr_OL', 'noPcorr_OL']

    # names = ['OL_Pcorr', 'OL_noPcorr'] + \
    #         [f'DA_{pc}_{err}' for pc in ['Pcorr','noPcorr'] for err in ['4K','abs','anom_lt','anom_lst','anom_st']]
    # runs = ['NLv4_M36_US_OL_Pcorr', 'NLv4_M36_US_OL_noPcorr'] + \
    #        [f'NLv4_M36_US_DA_SMAP_{pc}_{err}' for pc in ['Pcorr','noPcorr'] for err in ['4K','abs','anom_lt','anom_lst','anom_st']]

    # names = ['OL_Pcorr', 'DA_Pcorr_LTST'] + \
    #         [f'DA_{pc}_{err}{mode}' for pc in ['Pcorr'] for err in ['4K','anom_lt', 'anom_lt_ScYH', 'anom_lst','anom_st'] for mode in ['', '_ScDY', '_ScYH']]
    #
    # runs = ['NLv4_M36_US_OL_Pcorr', 'NLv4_M36_US_DA_Pcorr_LTST'] + \
    #        [f'NLv4_M36_US_DA_SMAP_{pc}_{err}{mode}' for pc in ['Pcorr'] for err in ['4K','abs','anom_lt','anom_lst','anom_st'] for mode in ['', '_ScDY', '_ScYH']]

    dss = [GEOSldas_io('tavg3_1d_lnr_Nt', run).timeseries if 'DA' in run else
           GEOSldas_io('SMAP_L4_SM_gph', run).timeseries for run in runs]
    grid = GEOSldas_io('ObsFcstAna', runs[0]).grid

    ds_full = GEOSldas_io('SMAP_L4_SM_gph', 'NLv4_M36_US_OL_Pcorr').timeseries
    ds_full = ds_full.assign_coords({'time': ds_full['time'].values + pd.to_timedelta('2 hours')})

    ds_obs_smap = GEOSldas_io('ObsFcstAna', 'NLv4_M36_US_DA_SMAP_Pcorr_4K').timeseries['obs_obs']

    modes = ['abs', 'anom_lt', 'anom_st', 'anom_lst']

    ascat = HSAF_io()

    for cnt, (gpi, data) in enumerate(lut.iterrows()):
        print('%i / %i, gpi: %i' % (cnt, len(lut), gpi))

        col = int(data.ease2_col - grid.tilegrids.loc['domain', 'i_offg'])
        row = int(data.ease2_row - grid.tilegrids.loc['domain', 'j_offg'])

        res = pd.DataFrame(index=(gpi,))
        res['col'] = int(data.ease2_col)
        res['row'] = int(data.ease2_row)
        res['lcol'] = col
        res['lrow'] = row

        try:
            ts_ascat = ascat.read(data['ascat_gpi']).resample('1d').mean().dropna()
            ts_ascat = ts_ascat[~ts_ascat.index.duplicated(keep='first')]
            ts_ascat.name = 'ASCAT'
        except:
            continue

        try:
            t_df_smap = ds_obs_smap.sel(species=[1, 2]).isel(lat=row, lon=col).to_pandas()
            t_ana = t_df_smap[~np.isnan(t_df_smap[1]) | ~np.isnan(t_df_smap[2])].index
            t_ana = pd.Series(1, index=t_ana).resample('1d').mean().dropna().index
        except:
            t_ana = pd.DatetimeIndex([])

        var = 'sm_surface'

        for mode in modes:

            if mode == 'anom_lst':
                ts_ref = calc_anom(ts_ascat.copy(), mode='climatological').dropna()
            elif mode == 'anom_st':
                ts_ref = calc_anom(ts_ascat.copy(), mode='shortterm').dropna()
            elif mode == 'anom_lt':
                ts_ref = calc_anom(ts_ascat.copy(), mode='longterm').dropna()
            else:
                ts_ref = ts_ascat.dropna()

            for run, ts_model in zip(names, dss):

                try:
                    if 'noPcorr' in run:
                        r_asc = np.sqrt(tc_res_nopc.loc[gpi, f'r2_grid_{mode}_m_ASCAT_tc_ASCAT_SMAP_CLSM'])
                        r_mod = np.sqrt(tc_res_nopc.loc[gpi, f'r2_grid_{mode}_m_CLSM_tc_ASCAT_SMAP_CLSM'])
                    else:
                        r_asc = np.sqrt(tc_res_pc.loc[gpi, f'r2_grid_{mode}_m_ASCAT_tc_ASCAT_SMAP_CLSM'])
                        r_mod = np.sqrt(tc_res_pc.loc[gpi, f'r2_grid_{mode}_m_CLSM_tc_ASCAT_SMAP_CLSM'])
                except:
                    r_asc = np.nan
                    r_mod = np.nan

                ind_valid = ds_full.time.values[(ds_full['snow_depth'][:, row, col].values == 0) &
                                                (ds_full['soil_temp_layer1'][:, row, col].values > 277.15)]

                ts_mod = ts_model[var][:, row, col].to_series()
                ts_mod.index += pd.to_timedelta('2 hours')
                ts_mod = ts_mod.reindex(ind_valid)

                if mode == 'anom_lst':
                    ts_mod = calc_anom(ts_mod.copy(), mode='climatological').dropna()
                elif mode == 'anom_st':
                    ts_mod = calc_anom(ts_mod.copy(), mode='shortterm').dropna()
                elif mode == 'anom_lt':
                    ts_mod = calc_anom(ts_mod.copy(), mode='longterm').dropna()
                else:
                    ts_mod = ts_mod.dropna()
                ts_mod = ts_mod.resample('1d').mean()

                if 'OL_' in run:
                    res[f'r_tca_{run}_{mode}'] = r_mod

                tmp = pd.DataFrame({1: ts_ref, 2: ts_mod}).dropna()
                res[f'len_{run}_{mode}'] = len(tmp)
                r, p = pearsonr(tmp[1], tmp[2]) if len(tmp) > 10 else (np.nan, np.nan)
                res[f'r_{run}_{mode}'] = r
                res[f'p_{run}_{mode}'] = p
                res[f'r_corr_{run}_{mode}'] = min(r / r_asc, 1)

                tmp = pd.DataFrame({1: ts_ref, 2: ts_mod}).reindex(t_ana).dropna()
                res[f'ana_len_{run}_{mode}'] = len(tmp)
                r, p = pearsonr(tmp[1], tmp[2]) if len(tmp) > 10 else (np.nan, np.nan)
                res[f'ana_r_{run}_{mode}'] = r
                res[f'ana_p_{run}_{mode}'] = p
                res[f'ana_r_corr_{run}_{mode}'] = min(r / r_asc, 1)

        if not result_file.exists():
            res.to_csv(result_file, float_format='%0.3f')
        else:
            res.to_csv(result_file, float_format='%0.3f', mode='a', header=False)
def resample_ismn():
    """
    This resamples ISMN data onto the EASE2 grid and stores data for each grid cell into .csv files.
    If single grid cells contain multiple stations, they are averaged.

    A grid look-up table needs to be created first (method: ancillary.grid.create_lut).
    """

    paths = Paths()
    io = ISMN_Interface(paths.ismn_raw)

    # get all stations / sensors for each grid cell.
    lut = pd.read_csv(paths.ismn / 'station_list.csv', index_col=0)
    lut = lut.groupby('ease2_gpi').apply(lambda x: '-'.join([i for i in x.index]))

    dir_out = paths.ismn / 'timeseries'
    if not dir_out.exists():
        dir_out.mkdir()

    for cnt, (gpi, indices) in enumerate(lut.items()):
        print('%i / %i' % (cnt, len(lut)))
        fname = dir_out / ('%i.csv' % gpi)
        idx = indices.split('-')

        # Only one station within grid cell
        if len(idx) == 1:
            try:
                ts = io.read_ts(int(idx[0]))
                ts = ts[ts['soil moisture_flag'] == 'G']['soil moisture']  # Get only "good" data based on ISMN QC
                ts.tz_convert(None).to_csv(fname, float_format='%.4f')
            except:
                print('Corrupt file: ' + io.metadata[int(idx[0])]['filename'])

        # Multiple stations within grid cell
        else:
            df = []
            for i in idx:
                try:
                    ts = io.read_ts(int(i))
                    df += [ts[ts['soil moisture_flag'] == 'G']['soil moisture']]  # Get only "good" data based on ISMN QC
                except:
                    print('Corrupt file: ' + io.metadata[int(i)]['filename'])
            if len(df) == 0:
                continue
            df = pd.concat(df, axis=1)
            df.columns = np.arange(len(df.columns))

            # match temporal mean and standard deviation to those of the station with the maximum temporal coverage
            n = np.array([len(df[i].dropna()) for i in df])
            ref = np.where(n == n.max())[0][0]
            for col in df:
                if col != ref:
                    df[col] = (df[col] - df[col].mean()) / df[col].std() * df[ref].std() + df[ref].mean()

            # Average measurements of all stations
            df.mean(axis='columns').tz_convert(None).to_csv(fname, float_format='%.4f')
def run_ascat_eval_part(part, parts):
    res_path = Path('/Users/u0116961/Documents/work/LDAS/2020-03_scaling/validation')

    result_file = res_path / ('ascat_eval_part%i.csv' % part)

    lut = pd.read_csv(Paths().lut, index_col=0)

    # Split grid cell list for parallelization
    subs = (np.arange(parts + 1) * len(lut) / parts).astype('int')
    subs[-1] = len(lut)
    start = subs[part - 1]
    end = subs[part]

    # Look-up table that contains the grid cells to iterate over
    lut = lut.iloc[start:end, :]

    names = ['open_loop', 'SMOSSMAP_short', 'MadKF_SMOS40']
    runs = ['US_M36_SMAP_TB_OL_noScl',
            'US_M36_SMAP_TB_DA_scl_SMOSSMAP_short',
            'US_M36_SMOS40_TB_MadKF_DA_it613']

    dss = [LDAS_io('xhourly', run).timeseries for run in runs]
    grid = LDAS_io().grid

    # t_ana = pd.DatetimeIndex(LDAS_io('ObsFcstAna', runs[0]).timeseries.time.values).sort_values()
    ds_obs_smap = (LDAS_io('ObsFcstAna', 'US_M36_SMAP_TB_OL_noScl').timeseries['obs_ana'])
    ds_obs_smos = (LDAS_io('ObsFcstAna', 'US_M36_SMOS40_TB_MadKF_DA_it613').timeseries['obs_ana'])

    modes = ['absolute', 'longterm', 'shortterm']

    ascat = HSAF_io()

    for cnt, (gpi, data) in enumerate(lut.iterrows()):
        print('%i / %i' % (cnt, len(lut)))

        col = int(data.ease2_col - grid.tilegrids.loc['domain', 'i_offg'])
        row = int(data.ease2_row - grid.tilegrids.loc['domain', 'j_offg'])

        res = pd.DataFrame(index=(gpi,))
        res['col'] = int(data.ease2_col)
        res['row'] = int(data.ease2_row)
        res['lcol'] = col
        res['lrow'] = row

        try:
            ts_ascat = ascat.read(data['ascat_gpi'], resample_time=False).resample('1d').mean().dropna()
            ts_ascat = ts_ascat[~ts_ascat.index.duplicated(keep='first')]
            ts_ascat.name = 'ASCAT'
        except:
            continue

        t_df_smap = ds_obs_smap.sel(species=[1, 2]).isel(lat=row, lon=col).to_pandas()
        t_df_smos = ds_obs_smos.sel(species=[1, 2]).isel(lat=row, lon=col).to_pandas()
        t_ana_smap = t_df_smap[~np.isnan(t_df_smap[1]) | ~np.isnan(t_df_smap[2])].resample('1d').mean().index
        t_ana_smos = t_df_smos[~np.isnan(t_df_smos[1]) | ~np.isnan(t_df_smos[2])].resample('1d').mean().index

        var = 'sm_surface'

        for mode in modes:

            if mode == 'absolute':
                ts_ref = ts_ascat.copy()
            else:
                ts_ref = calc_anom(ts_ascat.copy(), longterm=(mode == 'longterm')).dropna()

            for run, ts_model in zip(names, dss):

                t_ana = t_ana_smos if run == 'MadKF_SMOS40' else t_ana_smap

                ind = ((ts_model['snow_mass'][:, row, col].values == 0) &
                       (ts_model['soil_temp_layer1'][:, row, col].values > 277.15))
                ts_mod = ts_model[var][:, row, col].to_series().loc[ind]
                ts_mod.index += pd.to_timedelta('2 hours')

                if mode == 'absolute':
                    ts_mod = ts_mod.dropna()
                else:
                    ts_mod = calc_anom(ts_mod, longterm=mode == 'longterm').dropna()
                ts_mod = ts_mod.resample('1d').mean()

                tmp = pd.DataFrame({1: ts_ref, 2: ts_mod}).dropna()
                res['len_' + run + '_' + mode] = len(tmp)
                r, p = pearsonr(tmp[1], tmp[2]) if len(tmp) > 10 else (np.nan, np.nan)
                res['r_' + run + '_' + mode] = r
                # res['p_' + run + '_' + mode] = p
                # res['rmsd_' + run + '_' + mode] = np.sqrt(((tmp[1] - tmp[2]) ** 2).mean())
                res['ubrmsd_' + run + '_' + mode] = np.sqrt(
                    (((tmp[1] - tmp[1].mean()) - (tmp[2] - tmp[2].mean())) ** 2).mean())

                tmp = pd.DataFrame({1: ts_ref, 2: ts_mod}).reindex(t_ana).dropna()
                res['ana_len_' + run + '_' + mode] = len(tmp)
                r, p = pearsonr(tmp[1], tmp[2]) if len(tmp) > 10 else (np.nan, np.nan)
                res['ana_r_' + run + '_' + mode] = r
                # res['ana_p_' + run + '_' + mode] = p
                # res['ana_rmsd_' + run + '_' + mode] = np.sqrt(((tmp[1] - tmp[2]) ** 2).mean())
                res['ana_ubrmsd_' + run + '_' + mode] = np.sqrt(
                    (((tmp[1] - tmp[1].mean()) - (tmp[2] - tmp[2].mean())) ** 2).mean())

        if not result_file.exists():
            res.to_csv(result_file, float_format='%0.3f')
        else:
            res.to_csv(result_file, float_format='%0.3f', mode='a', header=False)
def EC_ascat_smap_ismn_ldas():
    result_file = Path('/Users/u0116961/Documents/work/extended_collocation/ec_ascat_smap_ismn_ldas.csv')

    names = ['insitu', 'ascat', 'smap', 'ol', 'da']
    combs = list(combinations(names, 2))

    ds_ol = LDAS_io('xhourly', 'US_M36_SMAP_TB_OL_noScl').timeseries
    ds_da = LDAS_io('xhourly', 'US_M36_SMAP_TB_MadKF_DA_it11').timeseries
    ds_da_ana = LDAS_io('ObsFcstAna', 'US_M36_SMAP_TB_MadKF_DA_it11').timeseries['obs_ana']
    tg = LDAS_io().grid.tilegrids

    modes = ['absolute', 'longterm', 'shortterm']

    ismn = ISMN_io()
    ismn.list = ismn.list.iloc[70::]
    ascat = HSAF_io()
    smap = SMAP_io()

    lut = pd.read_csv(Paths().lut, index_col=0)

    i = 0
    for meta, ts_insitu in ismn.iter_stations(surface_only=True):
        i += 1
        logging.info('%i/%i' % (i, len(ismn.list)))

        try:
            if len(ts_insitu := ts_insitu['2015-04-01':'2020-04-01'].resample('1d').mean().dropna()) < 25:
                continue
        except:
            continue

        res = pd.DataFrame(meta.copy()).transpose()
        col = meta.ease_col
        row = meta.ease_row
        colg = col + tg.loc['domain', 'i_offg']  # col / lon
        rowg = row + tg.loc['domain', 'j_offg']  # row / lat

        tmp_lut = lut[(lut.ease2_col == colg) & (lut.ease2_row == rowg)]
        if len(tmp_lut) == 0:
            continue
        gpi_smap = tmp_lut.index.values[0]
        gpi_ascat = tmp_lut.ascat_gpi.values[0]

        try:
            ts_ascat = ascat.read(gpi_ascat, resample_time=False).resample('1d').mean().dropna()
            ts_ascat = ts_ascat[~ts_ascat.index.duplicated(keep='first')]
            ts_ascat.name = 'ASCAT'
        except:
            continue

        ts_smap = smap.read(gpi_smap)

        if (ts_ascat is None) | (ts_smap is None):
            continue

        ind = (ds_ol['snow_mass'][:, row, col].values == 0) & (ds_ol['soil_temp_layer1'][:, row, col].values > 277.15)
        ts_ol = ds_ol['sm_surface'][:, row, col].to_series().loc[ind].dropna()
        ts_ol.index += pd.to_timedelta('2 hours')

        ind = (ds_da['snow_mass'][:, row, col].values == 0) & (ds_da['soil_temp_layer1'][:, row, col].values > 277.15)
        ts_da = ds_da['sm_surface'][:, row, col].to_series().loc[ind].dropna()
        ts_da.index += pd.to_timedelta('2 hours')

        for mode in modes:

            if mode == 'absolute':
                ts_ins = ts_insitu.copy()
                ts_asc = ts_ascat.copy()
                ts_smp = ts_smap.copy()
                ts_ol = ts_ol.copy()
                ts_da = ts_da.copy()
            else:
                ts_ins = calc_anom(ts_ins.copy(), longterm=(mode == 'longterm')).dropna()
                ts_asc = calc_anom(ts_asc.copy(), longterm=(mode == 'longterm')).dropna()
                ts_smp = calc_anom(ts_smp.copy(), longterm=(mode == 'longterm')).dropna()
                ts_ol = calc_anom(ts_ol.copy(), longterm=(mode == 'longterm')).dropna()
                ts_da = calc_anom(ts_da.copy(), longterm=(mode == 'longterm')).dropna()

            tmp = pd.DataFrame(dict(zip(names, [ts_ins, ts_asc, ts_smp, ts_ol, ts_da]))).dropna()

            corr = tmp.corr()
            ec_res = ecol(tmp[['insitu', 'ascat', 'smap', 'ol', 'da']],
                          correlated=[['smap', 'ol'], ['smap', 'da'], ['ol', 'da']])

            res[f'len_{mode}'] = len(tmp)
            for c in combs:
                res[f'corr_{"_".join(c)}'] = corr.loc[c]
            res[f'err_corr_smap_ol_{mode}'] = ec_res['err_corr_smap_ol']
            res[f'err_corr_smap_da_{mode}'] = ec_res['err_corr_smap_da']
            res[f'err_corr_ol_da_{mode}'] = ec_res['err_corr_ol_da']

        if not result_file.exists():
            res.to_csv(result_file, float_format='%0.4f')
        else:
            res.to_csv(result_file, float_format='%0.4f', mode='a', header=False)
def main(part):
    parts = 30

    sensors = ['ASCAT', 'SMOS', 'MERRA2', 'ISMN']
    # sensors = ['ASCAT', 'SMOS', 'SMAP', 'MERRA2', 'ISMN']

    paths = Paths()
    result_file = paths.result_root / ('_'.join(sensors)) / ('result_%i.csv' % part)
    if not result_file.parent.exists():
        result_file.parent.mkdir(parents=True)

    lut = pd.read_csv(paths.lut, index_col=0)

    # Split grid cell list for parallelization
    subs = (np.arange(parts + 1) * len(lut) / parts).astype('int')
    subs[-1] = len(lut)
    start = subs[part - 1]
    end = subs[part]
    lut = lut.iloc[start:end, :]

    io = reader(sensors)

    for cnt, (gpi, data) in enumerate(lut.iterrows()):
        print('%i / %i' % (cnt, len(lut)))

        res = result_template(sensors, gpi)
        res.loc[gpi, 'col'] = data.ease2_col
        res.loc[gpi, 'row'] = data.ease2_row

        try:
            mode, dfs = io.read(gpi)

            for m, df in zip(mode, dfs):
                if df is not None:

                    res.loc[gpi, 'n_corr_' + m + '_tc'] = correct_n(df)

                    scl = m[0:4]
                    if scl == 'grid':
                        res.loc[gpi, 'n_grid'] = len(df)
                    else:
                        res.loc[gpi, 'n_ismn'] = len(df)

                    b = bias(df)
                    R = Pearson_R(df, n_corr=b.loc[:, :, 'n_corr'])

                    # rescale all columns to MERRA2 before calculating ubRMSD
                    tmp_df = df.copy()
                    # for col in ['ASCAT', 'SMOS', 'SMAP']:
                    for col in ['ASCAT', 'SMOS']:
                        tmp_df.loc[:, col] = ((tmp_df[col] - tmp_df[col].mean()) / tmp_df[col].std()) \
                                             * tmp_df['MERRA2'].std() + tmp_df['MERRA2'].mean()
                    ubrmsd = ubRMSD(tmp_df, n_corr=b.loc[:, :, 'n_corr'])

                    res.loc[gpi, 'n_' + scl] = len(df)

                    for t in list(combinations(df.columns.values, 2)):
                        res.loc[gpi, 'n_corr_' + m + '_' + '_'.join(t)] = R.loc[t[0], t[1], 'n_corr']
                        res.loc[gpi, 'bias_' + m + '_l_' + '_'.join(t)] = b.loc[t[0], t[1], 'CI_l_corr']
                        res.loc[gpi, 'bias_' + m + '_p_' + '_'.join(t)] = b.loc[t[0], t[1], 'bias']
                        res.loc[gpi, 'bias_' + m + '_u_' + '_'.join(t)] = b.loc[t[0], t[1], 'CI_u_corr']
                        res.loc[gpi, 'ubrmsd_' + m + '_l_' + '_'.join(t)] = ubrmsd.loc[t[0], t[1], 'CI_l_corr']
                        res.loc[gpi, 'ubrmsd_' + m + '_p_' + '_'.join(t)] = ubrmsd.loc[t[0], t[1], 'ubRMSD']
                        res.loc[gpi, 'ubrmsd_' + m + '_u_' + '_'.join(t)] = ubrmsd.loc[t[0], t[1], 'CI_u_corr']
                        res.loc[gpi, 'r_' + m + '_l_' + '_'.join(t)] = R.loc[t[0], t[1], 'CI_l_corr']
                        res.loc[gpi, 'r_' + m + '_p_' + '_'.join(t)] = R.loc[t[0], t[1], 'R']
                        res.loc[gpi, 'r_' + m + '_u_' + '_'.join(t)] = R.loc[t[0], t[1], 'CI_u_corr']
                        res.loc[gpi, 'p_' + m + '_p_' + '_'.join(t)] = R.loc[t[0], t[1], 'p']

                    for t in list(combinations(df.columns.values, 3)):
                        if ('SMOS' in t) & ('SMAP' in t):
                            continue
                        tcstr = '_tc_' + '_'.join(t)
                        tca = TCA(df[list(t)])

                        if (scl != 'grid') | (t[2] != 'ISMN'):
                            for s in t:
                                res.loc[gpi, 'bias_' + m + '_p_' + s + tcstr] = tca.loc['beta_p', s]
                                res.loc[gpi, 'bias_' + m + '_l_' + s + tcstr] = tca.loc['beta_l', s]
                                res.loc[gpi, 'bias_' + m + '_m_' + s + tcstr] = tca.loc['beta_m', s]
                                res.loc[gpi, 'bias_' + m + '_u_' + s + tcstr] = tca.loc['beta_u', s]
                                res.loc[gpi, 'ubrmse_' + m + '_p_' + s + tcstr] = tca.loc['ubRMSE_p', s]
                                res.loc[gpi, 'ubrmse_' + m + '_l_' + s + tcstr] = tca.loc['ubRMSE_l', s]
                                res.loc[gpi, 'ubrmse_' + m + '_m_' + s + tcstr] = tca.loc['ubRMSE_m', s]
                                res.loc[gpi, 'ubrmse_' + m + '_u_' + s + tcstr] = tca.loc['ubRMSE_u', s]
                                res.loc[gpi, 'r2_' + m + '_p_' + s + tcstr] = tca.loc['r2_p', s]
                                res.loc[gpi, 'r2_' + m + '_l_' + s + tcstr] = tca.loc['r2_l', s]
                                res.loc[gpi, 'r2_' + m + '_m_' + s + tcstr] = tca.loc['r2_m', s]
                                res.loc[gpi, 'r2_' + m + '_u_' + s + tcstr] = tca.loc['r2_u', s]
        except:
            continue

        if not result_file.exists():
            res.to_csv(result_file, float_format='%0.3f')
        else:
            res.to_csv(result_file, float_format='%0.3f', mode='a', header=False)
def run_ascat_eval_smos_part(part, parts, ref='ascat'):
    periods = [['2010-04-01', '2020-04-01'],
               ['2010-04-01', '2015-04-01'],
               ['2015-04-01', '2020-04-01'],
               ['2010-04-01', '2012-10-01'],
               ['2012-10-01', '2015-04-01'],
               ['2015-04-01', '2017-10-01'],
               ['2017-10-01', '2020-04-01']]

    res_path = Path('~/Documents/work/MadKF/CLSM/SMOS40/validation/multiperiod/ascat').expanduser()
    if not res_path.exists():
        Path.mkdir(res_path, parents=True)

    result_file = res_path / f'ascat_eval_smos_part{part}.csv'

    lut = pd.read_csv(Paths().lut, index_col=0)

    # Split grid cell list for parallelization
    subs = (np.arange(parts + 1) * len(lut) / parts).astype('int')
    subs[-1] = len(lut)
    start = subs[part - 1]
    end = subs[part]

    # Look-up table that contains the grid cells to iterate over
    lut = lut.iloc[start:end, :]

    names = ['open_loop'] + [f'SMOS40_it62{i}' for i in range(1, 5)]
    runs = ['US_M36_SMOS40_TB_OL_noScl'] + [f'US_M36_SMOS40_TB_MadKF_DA_it62{i}' for i in range(1, 5)]

    grid = LDAS_io('ObsFcstAna', runs[0]).grid

    dss_xhourly = [LDAS_io('xhourly', run).timeseries for run in runs]
    dss_obs_ana = [LDAS_io('ObsFcstAna', run).timeseries['obs_ana'] for run in runs]

    modes = ['absolute', 'longterm', 'shortterm']

    ascat = HSAF_io()

    for cnt, (gpi, data) in enumerate(lut.iterrows()):
        print('%i / %i' % (cnt, len(lut)))

        col = int(data.ease2_col - grid.tilegrids.loc['domain', 'i_offg'])
        row = int(data.ease2_row - grid.tilegrids.loc['domain', 'j_offg'])

        res = pd.DataFrame(index=(gpi,))
        res['col'] = int(data.ease2_col)
        res['row'] = int(data.ease2_row)
        res['lcol'] = col
        res['lrow'] = row

        try:
            ts_ascat = ascat.read(data['ascat_gpi'], resample_time=False).resample('1d').mean().dropna()
            ts_ascat = ts_ascat[~ts_ascat.index.duplicated(keep='first')]
            ts_ascat.name = 'ASCAT'
        except:
            continue

        dfs = [ds.sel(species=[1, 2]).isel(lat=row, lon=col).to_pandas().resample('1d').mean()
               for ds in dss_obs_ana]
        idx = [df[np.any(~np.isnan(df), axis=1)].index for df in dfs]
        t_ana = idx[0].intersection(idx[1]).intersection(idx[2]).intersection(idx[3])

        var = 'sm_surface'

        for mode in modes:

            if mode == 'absolute':
                ts_ref = ts_ascat.copy()
            else:
                ts_ref = calc_anom(ts_ascat.copy(), longterm=(mode == 'longterm')).dropna()

            for run, ts_model in zip(names, dss_xhourly):

                ind = ((ts_model['snow_mass'][:, row, col].values == 0) &
                       (ts_model['soil_temp_layer1'][:, row, col].values > 277.15))
                ts_mod = ts_model[var][:, row, col].to_series().loc[ind]
                ts_mod.index += pd.to_timedelta('2 hours')

                if mode == 'absolute':
                    ts_mod = ts_mod.dropna()
                else:
                    ts_mod = calc_anom(ts_mod, longterm=mode == 'longterm').dropna()
                ts_mod = ts_mod.reindex(t_ana).dropna()

                for i, p in enumerate(periods):
                    tmp = pd.DataFrame({1: ts_ref, 2: ts_mod})[p[0]:p[1]].dropna()
                    res[f'p{i}_len_{run}_{mode}'] = len(tmp)
                    r, p = pearsonr(tmp[1], tmp[2]) if len(tmp) > 10 else (np.nan, np.nan)
                    res[f'p{i}_r_{run}_{mode}'] = r

        if not result_file.exists():
            res.to_csv(result_file, float_format='%0.3f')
        else:
            res.to_csv(result_file, float_format='%0.3f', mode='a', header=False)
def reshuffle_smos():
    paths = Paths()

    # Collect all .nc files
    path = paths.smos / 'raw' / 'MIR_SMUDP2_nc'
    nc_files = sorted(path.glob('**/*.nc'))

    # Get time stamp as the mean of start-of-orbit and end-of-orbit
    sdate = pd.to_datetime([str(f)[-44:-29] for f in nc_files])
    edate = pd.to_datetime([str(f)[-28:-13] for f in nc_files])
    dates = (sdate + (edate - sdate) / 2.).round('min')

    # get a list of all CONUS gpis
    gpi_lut = pd.read_csv(paths.lut, index_col=0)['smos_gpi']
    ease_gpis = gpi_lut.index.values

    # Array with ALL possible dates and ALL CONUS gpis
    res_arr = np.full((len(dates), len(ease_gpis)), np.nan)

    # Fill in result array from orbit files
    for i, f in enumerate(nc_files):
        print("%i / %i" % (i, len(nc_files)))

        ds = Dataset(f)
        smos_gpis = ds.variables['Grid_Point_ID'][:]

        # Check for valid data within orbit files
        for res_ind, ease_gpi in enumerate(ease_gpis):
            smos_ind = np.where(smos_gpis == gpi_lut.loc[ease_gpi])[0]
            if len(smos_ind) > 0:
                sm = float(ds.variables['Soil_Moisture'][smos_ind])
                if np.isnan(sm) | (sm < 0.):
                    continue

                rfi = float(ds.variables['RFI_Prob'][smos_ind])
                chi_2_p = float(ds.variables['Chi_2_P'][smos_ind])
                valid = (rfi < 0.1) & (chi_2_p > 0.05)

                # cf = float(ds.variables['Confidence_Flags'][smos_ind])
                # if np.isnan(cf):
                #     continue
                # cf = int(cf)
                # sf = long(ds.variables['Science_Flags'][smos_ind])
                #
                # valid = ((cf & 1 << 1) | (cf & 1 << 2) | (cf & 1 << 4) | (cf & 1 << 5) | (cf & 1 << 6) |
                #          (sf & 1 << 5) | (sf & 1 << 16) | (sf & 1 << 18) | (sf & 1 << 19) | (sf & 1 << 26) == 0) & \
                #         (rfi < 0.1)

                if valid:
                    res_arr[i, res_ind] = sm
        ds.close()

    # Write out valid time series of all CONUS gpis into separate .csv files
    dir_out = paths.smos / 'timeseries'
    for i, gpi in enumerate(ease_gpis):
        Ser = pd.Series(res_arr[:, i], index=dates).dropna()
        if len(Ser) > 0:
            Ser = Ser.groupby(Ser.index).last()
            fname = dir_out / ('%i.csv' % gpi)
            Ser.to_csv(fname, float_format='%.4f')
def create_lut():
    initiate = True
    add_ascat = True
    add_smos = True
    add_merra = True

    paths = Paths()
    fname = paths.lut

    # Rough bounding coordinates to pre-clip CONUS for speeding up calculations
    lonmin = -125.
    lonmax = -66.5
    latmin = 24.5
    latmax = 49.5

    if initiate is True:
        grid = EASE2()
        lons, lats = np.meshgrid(grid.ease_lons, grid.ease_lats)
        cols, rows = np.meshgrid(np.arange(len(grid.ease_lons)), np.arange(len(grid.ease_lats)))

        lut = pd.DataFrame({'ease2_col': cols.flatten(),
                            'ease2_row': rows.flatten(),
                            'ease2_lon': lons.flatten(),
                            'ease2_lat': lats.flatten(),
                            'ascat_gpi': -1,
                            'ascat_lon': np.nan,
                            'ascat_lat': np.nan,
                            'smos_gpi': -1,
                            'smos_lon': np.nan,
                            'smos_lat': np.nan,
                            'merra2_lon': np.nan,
                            'merra2_lat': np.nan})

        lut = lut[(lut.ease2_lon >= lonmin) & (lut.ease2_lon <= lonmax) &
                  (lut.ease2_lat >= latmin) & (lut.ease2_lat <= latmax)]
    else:
        lut = pd.read_csv(fname, index_col=0)

    # ------------------------------------------------------------------------------------------------------------------
    # A list of ASCAT gpis over the USA can be exported from https://www.geo.tuwien.ac.at/dgg/index.php
    # This list is used here to restrict EASE2-grid cells to CONUS only.
    if add_ascat is True:
        ascat_gpis = pd.read_csv(paths.ascat / 'warp5_grid' / 'pointlist_United States of America_warp.csv',
                                 index_col=0)
        ascat_gpis = ascat_gpis[(ascat_gpis.lon >= lonmin) & (ascat_gpis.lon <= lonmax) &
                                (ascat_gpis.lat >= latmin) & (ascat_gpis.lat <= latmax)]
        ascat_gpis['ease2_gpi'] = -1
        ascat_gpis['r'] = -1

        # Get EASE grid indices and distance for each ASCAT grid cell
        for i, (idx, data) in enumerate(ascat_gpis.iterrows()):
            print('%i / %i' % (i, len(ascat_gpis)))
            r = (lut.ease2_lon - data.lon) ** 2 + (lut.ease2_lat - data.lat) ** 2
            ascat_gpis.loc[idx, 'ease2_gpi'] = lut[(r - r.min()) < 0.0001].index.values[0]
            ascat_gpis.loc[idx, 'r'] = r[(r - r.min()) < 0.0001].values[0]

        # Find the nearest matched ASCAT grid cell for each EASE grid cell
        for i, (idx, data) in enumerate(lut.iterrows()):
            print('%i / %i' % (i, len(lut)))
            matches = ascat_gpis[ascat_gpis.ease2_gpi == idx]
            if len(matches) > 0:
                match = matches[(matches.r - matches.r.min()) < 0.0001]
                lut.loc[idx, 'ascat_gpi'] = match.index.values[0]
                lut.loc[idx, 'ascat_lon'] = match['lon'].values[0]
                lut.loc[idx, 'ascat_lat'] = match['lat'].values[0]

        # Remove grid cells that don't have a closest ASCAT cell
        lut = lut[lut.ascat_gpi != -1]

    # ------------------------------------------------------------------------------------------------------------------
    # Read SMOS grid information and clip CONUS
    if add_smos is True:
        smos = pd.read_csv(paths.smos / 'smos_grid.txt', delim_whitespace=True,
                           names=['gpi', 'lon', 'lat', 'alt', 'wf'])
        smos = smos[(smos.lon >= lonmin) & (smos.lon <= lonmax) &
                    (smos.lat >= latmin) & (smos.lat <= latmax)]

        # Find closest SMOS gpis and append to the EASE lookup-table
        for i, (idx, data) in enumerate(lut.iterrows()):
            print('%i / %i' % (i, len(lut)))
            r = (smos.lon - data.ease2_lon) ** 2 + (smos.lat - data.ease2_lat) ** 2
            lut.loc[idx, 'smos_gpi'] = smos[(r - r.min()) < 0.0001]['gpi'].values[0]
            lut.loc[idx, 'smos_lon'] = smos[(r - r.min()) < 0.0001]['lon'].values[0]
            lut.loc[idx, 'smos_lat'] = smos[(r - r.min()) < 0.0001]['lat'].values[0]

    # ------------------------------------------------------------------------------------------------------------------
    # Read MERRA grid information (lats/lons taken from a CONUS netcdf image subset)
    if add_merra is True:
        merra = Dataset(paths.merra2 / 'raw' / '2015-2018' / 'MERRA2_400.tavg1_2d_lnd_Nx.20150101.SUB.nc')
        lons, lats = np.meshgrid(merra.variables['lon'][:].data, merra.variables['lat'][:].data)
        lons = lons.flatten()
        lats = lats.flatten()

        # Find closest MERRA gpis and append coordinates to the EASE lookup-table
        for i, (idx, data) in enumerate(lut.iterrows()):
            print('%i / %i' % (i, len(lut)))
            r = (lons - data.ease2_lon) ** 2 + (lats - data.ease2_lat) ** 2
            lut.loc[idx, 'merra2_lon'] = lons[np.where((r - r.min()) < 0.0001)]
            lut.loc[idx, 'merra2_lat'] = lats[np.where((r - r.min()) < 0.0001)]

    lut.to_csv(fname, float_format='%.6f')
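# Worked sketch of the nearest-neighbour search used throughout create_lut()
# (purely illustrative, an assumption rather than part of the original module).
# For a target coordinate, the squared planar distance to all candidate grid
# points is minimised and the index of the closest point is returned.
def nearest_gpi(lon, lat, lons, lats):
    # lons/lats are flattened 1D arrays of candidate grid point coordinates
    r = (lons - lon) ** 2 + (lats - lat) ** 2
    return np.where((r - r.min()) < 0.0001)[0][0]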
def resample_smos():
    """
    This resamples SMOS data from the SMOS grid onto the EASE2 grid and stores data
    for each grid cell into .csv files.

    A grid look-up table needs to be created first (method: ancillary.grid.create_lut).
    """

    paths = Paths()

    # Collect all .nc files
    path = paths.smos_raw
    nc_files = sorted(path.glob('**/*.nc'))

    # Get time stamp as the mean of start-of-orbit and end-of-orbit
    sdate = pd.to_datetime([str(f)[-44:-29] for f in nc_files])
    edate = pd.to_datetime([str(f)[-28:-13] for f in nc_files])
    dates = (sdate + (edate - sdate) / 2.).round('min')

    # get a list of all CONUS gpis
    gpi_lut = pd.read_csv(paths.lut, index_col=0)['smos_gpi']
    ease_gpis = gpi_lut.index.values

    # Array with ALL possible dates and ALL CONUS gpis
    res_arr = np.full((len(dates), len(ease_gpis)), np.nan)

    # Fill in result array from orbit files
    for i, f in enumerate(nc_files):
        print("%i / %i" % (i, len(nc_files)))

        ds = Dataset(f)
        smos_gpis = ds.variables['Grid_Point_ID'][:]

        # Check for valid data within orbit files
        for res_ind, ease_gpi in enumerate(ease_gpis):
            smos_ind = np.where(smos_gpis == gpi_lut.loc[ease_gpi])[0]
            if len(smos_ind) > 0:

                # extract soil moisture data
                sm = float(ds.variables['Soil_Moisture'][smos_ind])
                if np.isnan(sm) | (sm < 0.):
                    continue

                # Mask for RFI and Chi-2 flag
                rfi = float(ds.variables['RFI_Prob'][smos_ind])
                chi_2_p = float(ds.variables['Chi_2_P'][smos_ind])
                valid = (rfi < 0.1) & (chi_2_p > 0.05)

                if valid:
                    res_arr[i, res_ind] = sm
        ds.close()

    # Write out valid time series of all CONUS gpis into separate .csv files
    dir_out = paths.smos / 'timeseries'
    if not dir_out.exists():
        dir_out.mkdir()

    for i, gpi in enumerate(ease_gpis):
        Ser = pd.Series(res_arr[:, i], index=dates).dropna()
        if len(Ser) > 0:
            Ser = Ser.groupby(Ser.index).last()  # Make sure that no time duplicates exist!
            fname = dir_out / ('%i.csv' % gpi)
            Ser.to_csv(fname, float_format='%.4f')