def run():
    """ This in the main routine that parallelizes the validation """

    # The processing will be parallelized over 30 worker processes
    parts = 30

    # Confidence intervals will be calculated at an 80% confidence level
    alpha = 0.80

    # The validation will be done using all available sensors.
    sensors = ['ASCAT', 'SMOS', 'SMAP', 'MERRA2', 'ISMN']

    res_path = Paths().result_root / ('CI%i' %
                                      (alpha * 100)) / ('_'.join(sensors))
    if not res_path.exists():
        res_path.mkdir(parents=True)

    # Parallelized processing
    p = Pool(parts)
    arg1 = np.arange(parts) + 1
    arg2 = repeat(parts, parts)
    arg3 = repeat(sensors, parts)
    arg4 = repeat(alpha, parts)
    arg5 = repeat(res_path, parts)
    p.starmap(main, zip(arg1, arg2, arg3, arg4, arg5))

    # Merge the results generated in parallel into a single result file.
    merge_result_files(res_path)
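
# A minimal serial sketch of what the Pool.starmap call above dispatches, useful
# for debugging a single part without multiprocessing (the helper name is
# illustrative, not part of the original module):
def run_serial():
    parts = 30
    alpha = 0.80
    sensors = ['ASCAT', 'SMOS', 'SMAP', 'MERRA2', 'ISMN']
    res_path = Paths().result_root / ('CI%i' % (alpha * 100)) / ('_'.join(sensors))
    res_path.mkdir(parents=True, exist_ok=True)
    for part in range(1, parts + 1):
        main(part, parts, sensors, alpha, res_path)
    merge_result_files(res_path)
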
def resample_ascat():
    """
    This resamples ASCAT data from the DGG grid onto the EASE2 grid and stores data for each grid cell into .csv files.

    A grid look-up table needs to be created first (method: ancillary.grid.create_lut).

    """

    paths = Paths()

    # get a list of all CONUS gpis
    gpi_lut = pd.read_csv(paths.lut, index_col=0)[['ascat_gpi']]

    io = HSAF_io()

    # Store the nearest-neighbour time series for each EASE2 grid point into .csv files
    dir_out = paths.ascat / 'timeseries'
    if not dir_out.exists():
        dir_out.mkdir()

    for gpi, lut in gpi_lut.iterrows():
        Ser = io.read(lut['ascat_gpi'])
        if Ser is not None:
            Ser = Ser['2015-01-01':'2018-12-31']
            if len(Ser) > 10:
                Ser.index = Ser.index.round('min')  # round time steps to full minutes
                fname = dir_out / ('%i.csv' % gpi)
                Ser.to_csv(fname, float_format='%.4f')
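
# A minimal sketch of reading one of the per-grid-cell .csv files written above
# back into a pandas Series (assumes the header row that Series.to_csv writes in
# recent pandas versions; the helper name is illustrative):
def read_ascat_timeseries(gpi):
    fname = Paths().ascat / 'timeseries' / ('%i.csv' % gpi)
    df = pd.read_csv(fname, index_col=0, parse_dates=True)
    return df.squeeze(axis='columns')  # single data column -> Series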
Example #3
def generate_station_list():
    """ This routine generates a list of available ISMN stations and the EASEv2 grid point they are located in. """

    paths = Paths()

    io = ISMN_Interface(paths.ismn_raw)

    # get metadata indices of all stations that measure soil moisture within the first 10 cm
    idx = io.get_dataset_ids('soil moisture', min_depth=0.0, max_depth=0.1)
    df = pd.DataFrame({'network': io.metadata[idx]['network'],
                       'station': io.metadata[idx]['station'],
                       'lat': io.metadata[idx]['latitude'],
                       'lon': io.metadata[idx]['longitude'],
                       'ease2_gpi': np.zeros(len(idx)).astype('int')}, index=idx)

    # merge indices for stations that have multiple sensors within the first 10 cm
    duplicate_idx = df.groupby(df.columns.tolist()).apply(lambda x: '-'.join(['%i'% i for i in x.index])).values
    df.drop_duplicates(inplace=True)
    df.index = duplicate_idx

    # create EASEv2 grid domain
    grid = EASE2()
    lons, lats = np.meshgrid(grid.ease_lons, grid.ease_lats)
    lons = lons.flatten()
    lats = lats.flatten()

    # find EASEv2 grid points in which the individual stations are located
    for i, (idx, data) in enumerate(df.iterrows()):
        print('%i / %i' % (i, len(df)))
        r = (lons - data.lon) ** 2 + (lats - data.lat) ** 2
        df.loc[idx, 'ease2_gpi'] = np.where((r - r.min()) < 0.0001)[0][0]

    df.to_csv(paths.ismn / 'station_list.csv')
Example #4
def generate_station_list():

    paths = Paths()

    io = ISMN_Interface(paths.ismn / 'downloaded' / 'CONUS_20100101_20190101')

    # get metadata indices of all stations that measure soil moisture within the first 10 cm
    idx = io.get_dataset_ids('soil moisture', min_depth=0.0, max_depth=0.1)
    df = pd.DataFrame(
        {
            'network': io.metadata[idx]['network'],
            'station': io.metadata[idx]['station'],
            'lat': io.metadata[idx]['latitude'],
            'lon': io.metadata[idx]['longitude'],
            'ease2_gpi': np.zeros(len(idx)).astype('int')
        },
        index=idx)

    # merge indices for stations that have multiple sensors within the first 10 cm
    duplicate_idx = df.groupby(df.columns.tolist()).apply(
        lambda x: '-'.join(['%i' % i for i in x.index])).values
    df.drop_duplicates(inplace=True)
    df.index = duplicate_idx

    grid = EASE2()
    lons, lats = np.meshgrid(grid.ease_lons, grid.ease_lats)
    lons = lons.flatten()
    lats = lats.flatten()

    for i, (idx, data) in enumerate(df.iterrows()):
        print('%i / %i' % (i, len(df)))
        r = (lons - data.lon)**2 + (lats - data.lat)**2
        df.loc[idx, 'ease2_gpi'] = np.where((r - r.min()) < 0.0001)[0][0]

    df.to_csv(paths.ismn / 'station_list.csv')
Example #5
def reformat_smap():
    """
    This extracts raw SMAP EASEv2 data and stores it into .csv files for later processing.

    A grid look-up table needs to be created first (method: ancillary.grid.create_lut).

    """

    paths = Paths()

    # generate idx. array to map ease col/row to gpi
    n_row = 406
    n_col = 964
    idx_arr = np.arange(n_row * n_col, dtype='int64').reshape((n_row, n_col))

    # get a list of all CONUS gpis
    ease_gpis = pd.read_csv(paths.lut, index_col=0).index.values

    # Collect orbit file list and extract date info from file name
    fdir = paths.smap_raw
    files = sorted(fdir.glob('*'))
    dates = pd.to_datetime([str(f)[-29:-14] for f in files]).round('min')

    # Array with ALL possible dates and ALL CONUS gpis
    res_arr = np.full((len(dates), len(ease_gpis)), np.nan)

    # Fill in result array from orbit files
    for i, f in enumerate(files):
        print("%i / %i" % (i, len(files)))

        tmp = h5py.File(f, 'r')  # f is already a full path from fdir.glob('*')
        row = tmp['Soil_Moisture_Retrieval_Data']['EASE_row_index'][:]
        col = tmp['Soil_Moisture_Retrieval_Data']['EASE_column_index'][:]
        idx = idx_arr[row, col]

        # Check for valid data within orbit files
        for res_ind, gpi in enumerate(ease_gpis):
            sm_ind = np.where(idx == gpi)[0]
            if len(sm_ind) > 0:
                qf = tmp['Soil_Moisture_Retrieval_Data'][
                    'retrieval_qual_flag'][sm_ind[0]]
                if (qf == 0) | (qf == 8):
                    res_arr[i, res_ind] = tmp['Soil_Moisture_Retrieval_Data'][
                        'soil_moisture'][sm_ind[0]]

        tmp.close()

    # Write out valid time series of all CONUS GPIs into separate .csv files
    dir_out = paths.smap / 'timeseries'
    if not dir_out.exists():
        dir_out.mkdir()

    for i, gpi in enumerate(ease_gpis):
        Ser = pd.Series(res_arr[:, i], index=dates).dropna()
        if len(Ser) > 0:
            Ser = Ser.groupby(
                Ser.index).last()  # Make sure that no time duplicates exist!
            fname = dir_out / ('%i.csv' % gpi)
            Ser.to_csv(fname, float_format='%.4f')
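
# A small sketch of the flattened-index convention used in reformat_smap above:
# assuming gpis are row-major indices into the 406 x 964 EASE2 array (as implied
# by idx_arr), they convert to and from (row, col) with numpy:
def gpi_to_rowcol(gpi, n_row=406, n_col=964):
    # inverse of idx_arr = np.arange(n_row * n_col).reshape((n_row, n_col))
    return np.unravel_index(gpi, (n_row, n_col))


def rowcol_to_gpi(row, col, n_row=406, n_col=964):
    return np.ravel_multi_index((row, col), (n_row, n_col))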
Example #6
    def __init__(self, sensors=None):

        if sensors is None:
            self.sensors = ['ASCAT', 'SMOS', 'SMAP', 'MERRA2', 'ISMN']
        else:
            self.sensors = sensors

        self.root = Paths().data_root
Example #7
def resample_timeseries():

    paths = Paths()

    io = ISMN_Interface(paths.ismn / 'downloaded' / 'CONUS_20100101_20190101')

    # get all stations / sensors for each grid cell.
    lut = pd.read_csv(paths.ismn / 'station_list.csv', index_col=0)
    lut = lut.groupby('ease2_gpi').apply(
        lambda x: '-'.join([i for i in x.index]))

    dir_out = paths.ismn / 'timeseries'

    for cnt, (gpi, indices) in enumerate(lut.items()):
        print('%i / %i' % (cnt, len(lut)))

        fname = dir_out / ('%i.csv' % gpi)

        idx = indices.split('-')

        # Only one station within grid cell
        if len(idx) == 1:
            try:
                ts = io.read_ts(int(idx[0]))
                ts = ts[ts['soil moisture_flag'] == 'G']['soil moisture']
                ts.tz_convert(None).to_csv(fname, float_format='%.4f')
            except:
                print('Corrupt file: ' + io.metadata[int(idx[0])]['filename'])

        # Multiple stations within grid cell
        else:
            df = []
            for i in idx:
                try:
                    ts = io.read_ts(int(i))
                    df += [
                        ts[ts['soil moisture_flag'] == 'G']['soil moisture']
                    ]
                except:
                    print('Corrupt file: ' + io.metadata[int(i)]['filename'])
            if len(df) == 0:
                continue

            df = pd.concat(df, axis=1)
            df.columns = np.arange(len(df.columns))

            # match temporal mean and standard deviation to those of the station with the maximum temporal coverage
            n = np.array([len(df[i].dropna()) for i in df])
            ref = np.where(n == n.max())[0][0]
            for col in df:
                if col != ref:
                    df[col] = (df[col] - df[col].mean()) / df[col].std(
                    ) * df[ref].std() + df[ref].mean()

            # Average measurements of all stations
            df.mean(axis='columns').tz_convert(None).to_csv(
                fname, float_format='%.4f')
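
# A compact sketch of the mean/variance matching applied above, written as a
# standalone helper (the name is illustrative): the values of `src` are linearly
# rescaled so that their mean and standard deviation match those of `ref`.
def match_mean_std(src, ref):
    return (src - src.mean()) / src.std() * ref.std() + ref.mean()
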
def resample_merra2(part=1, parts=1):
    """
    This resamples MERRA-2 data from the MERRA grid onto the EASE2 grid and stores data for each grid cell into .csv files.

    A grid look-up table needs to be created first (method: ancillary.grid.create_lut).

    Parameters
    ----------
    part : int
        Data subset to be processed - Data can be resampled in subsets for parallelization to speed-up the processing.
    parts : int
        Number of parts in which to split the data for parallel processing.
        Per default, all data are resampled at once.

    """

    paths = Paths()

    dir_out = paths.merra2 / 'timeseries'
    if not dir_out.exists():
        dir_out.mkdir()

    path = paths.merra2_raw
    files = np.array(sorted(path.glob('*')))
    ds = xr.open_mfdataset(files)
    lats = ds.lat.values
    lons = ds.lon.values
    dates = pd.to_datetime(ds.time.values)

    # get a list of all CONUS gpis
    gpi_lut = pd.read_csv(paths.lut, index_col=0)[['merra2_lon','merra2_lat']]

    # split domain for parallelization
    subs = (np.arange(parts + 1) * len(gpi_lut) / parts).astype('int')
    subs[-1] = len(gpi_lut)
    start = subs[part - 1]
    end = subs[part]

    gpi_lut = gpi_lut.iloc[start:end,:]

    # find and write all EASE2 NN grid points
    for i, (gpi, lut) in enumerate(gpi_lut.iterrows()):
        print("%i / %i" % (i, len(gpi_lut)))

        ind_lat = np.where(lats == lut['merra2_lat'])[0][0]
        ind_lon = np.where(lons == lut['merra2_lon'])[0][0]

        ts = ds['TSOIL1'][:, ind_lat, ind_lon] - 273.15
        swe = ds['SNOMAS'][:, ind_lat, ind_lon]
        ind_valid = ((ts>=4)&(swe==0)).values

        Ser = pd.Series(ds['SFMC'][ind_valid, ind_lat, ind_lon].values, index=dates[ind_valid])
        fname = dir_out / ('%i.csv' % gpi)
        Ser.to_csv(fname, float_format='%.4f')
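
# A short worked example of the part/parts splitting used above: with 10 grid
# cells and parts=3, subs becomes [0, 3, 6, 10], so part=2 processes
# gpi_lut.iloc[3:6]. The helper below just reproduces that boundary computation
# (the name is illustrative):
def split_bounds(n_cells, part, parts):
    subs = (np.arange(parts + 1) * n_cells / parts).astype('int')
    subs[-1] = n_cells
    return subs[part - 1], subs[part]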
Example #9
def merge_result_files():

    sensors = ['ASCAT', 'SMOS', 'MERRA2', 'ISMN']
    # sensors = ['ASCAT', 'SMOS', 'SMAP', 'MERRA2', 'ISMN']

    paths = Paths()
    path = paths.result_root / ('_'.join(sensors))

    files = list(path.glob('**/*.csv'))

    # DataFrame.append was removed in pandas 2.0; collect the frames and concatenate once.
    frames = []
    for f in files:
        frames.append(pd.read_csv(f, index_col=0))
        f.unlink()
    result = pd.concat(frames) if frames else pd.DataFrame()

    result.sort_index().to_csv(path / 'result.csv', float_format='%0.3f')
Example #10
def reshuffle_ascat():

    paths = Paths()

    # get a list of all CONUS gpis
    gpi_lut = pd.read_csv(paths.lut, index_col=0)[['ascat_gpi']]

    io = HSAF_io()

    # Store the nearest-neighbour time series for each EASE2 grid point into .csv files
    dir_out = paths.ascat / 'resampled'
    if not dir_out.exists():
        dir_out.mkdir()
    for gpi, lut in gpi_lut.iterrows():
        Ser = io.read(lut['ascat_gpi'])
        if Ser is not None:
            Ser = Ser['2015-01-01':'2018-12-31']
            if len(Ser) > 10:
                Ser.index = Ser.index.round('min')
                fname = dir_out / ('%i.csv' % gpi)
                Ser.to_csv(fname, float_format='%.4f')
Example #11
    def __init__(self, version='h113', ext='h114'):

        paths = Paths()

        self.data_path = paths.ascat / version
        self.version = version.upper()

        grid = Dataset(paths.ascat / 'warp5_grid' /
                       'TUW_WARP5_grid_info_2_2.nc')
        self.gpis = grid['gpi'][:][grid['land_flag'][:] == 1]
        self.cells = grid['cell'][:][grid['land_flag'][:] == 1]
        grid.close()

        self.loaded_cell = None
        self.fid = None

        if ext is not None:
            self.ext = HSAF_io(version=ext, ext=None)
        else:
            self.ext = None
Example #12
def generate_plots():

    sensors = ['ASCAT', 'SMOS', 'SMAP', 'MERRA2', 'ISMN']

    path = Paths().result_root / 'CI80' / ('_'.join(sensors))

    if not (path / 'plots').exists():
        (path / 'plots').mkdir()

    spatial_plot_n(path)

    spatial_plot_relative_metrics_ci_diff(path)
    boxplot_relative_metrics(path)

    spatial_plot_tca_diff(path)
    spatial_plot_tca_ci_diff(path, sensors)
    boxplot_tca(path, sensors)

    boxplot_relative_metrics_ismn(path)
    boxplot_tca_ismn(path, sensors)
Example #13
def reshuffle_merra2(part=1):

    paths = Paths()

    dir_out = paths.merra2 / 'timeseries'

    path = paths.merra2 / 'raw' / '2015-2018'
    files = np.array(sorted(path.glob('*')))
    ds = xr.open_mfdataset(files)
    lats = ds.lat.values
    lons = ds.lon.values
    dates = pd.to_datetime(ds.time.values)

    # get a list of all CONUS gpis
    gpi_lut = pd.read_csv(paths.lut, index_col=0)[['merra2_lon', 'merra2_lat']]

    # split domain for parallelization
    parts = 2
    subs = (np.arange(parts + 1) * len(gpi_lut) / parts).astype('int')
    subs[-1] = len(gpi_lut)
    start = subs[part - 1]
    end = subs[part]

    gpi_lut = gpi_lut.iloc[start:end, :]

    # find and write all EASE2 NN grid points
    for i, (gpi, lut) in enumerate(gpi_lut.iterrows()):
        print("%i / %i" % (i, len(gpi_lut)))

        ind_lat = np.where(lats == lut['merra2_lat'])[0][0]
        ind_lon = np.where(lons == lut['merra2_lon'])[0][0]

        ts = ds['TSOIL1'][:, ind_lat, ind_lon] - 273.15
        swe = ds['SNOMAS'][:, ind_lat, ind_lon]
        ind_valid = ((ts >= 4) & (swe == 0)).values

        Ser = pd.Series(ds['SFMC'][ind_valid, ind_lat, ind_lon].values,
                        index=dates[ind_valid])
        fname = dir_out / ('%i.csv' % gpi)
        Ser.to_csv(fname, float_format='%.4f')
def main(part, parts, sensors, alpha, res_path):
    """
    This calculates validation statistics for a subset of the study domain.

    Parameters
    ----------
    part : int
        Index of the subset to process.
    parts : int
        Number of subsets into which the study domain is divided.
    sensors : list of str
        Sensors to be considered in the validation.
    alpha : float [0,1]
        Confidence level of the confidence intervals.
    res_path : pathlib.Path
        Path where the result file is stored.

    """

    result_file = res_path / ('result_%i.csv' % part)

    lut = pd.read_csv(Paths().lut, index_col=0)

    # Split grid cell list for parallelization
    subs = (np.arange(parts + 1) * len(lut) / parts).astype('int')
    subs[-1] = len(lut)
    start = subs[part - 1]
    end = subs[part]

    # Look-up table that contains the grid cells to iterate over
    lut = lut.iloc[start:end, :]

    io = reader(sensors)

    for cnt, (gpi, data) in enumerate(lut.iterrows()):
        print('%i / %i' % (cnt, len(lut)))

        # Get the template of data fields to store results into
        res = result_template(sensors, gpi)

        res.loc[gpi, 'col'] = data.ease2_col
        res.loc[gpi, 'row'] = data.ease2_row

        try:
            mode, dfs = io.read(gpi, calc_anom_lt=False)

            # Iterate over all data sets (absolute values and anomalies collocated with and without ISMN)
            for m, df in zip(mode, dfs):

                if df is not None:

                    # Only calculate corrected sample size once to speed up processing
                    res.loc[gpi, 'n_corr_' + m + '_tc'] = correct_n(df)

                    # check if current data set contains ISMN data or not.
                    scl = m[0:4]
                    if scl == 'grid':
                        res.loc[gpi, 'n_grid'] = len(df)
                    else:
                        res.loc[gpi, 'n_ismn'] = len(df)

                    b = bias(df, alpha=alpha)
                    R = Pearson_R(df,
                                  alpha=alpha,
                                  n_corr=b.loc[:, :, 'n_corr'])

                    # rescale all columns to MERRA2 before calculating ubRMSD
                    tmp_df = df.copy()
                    for col in sensors:
                        if (col == 'MERRA2') | (not col in tmp_df):
                            continue

                        tmp_df.loc[:, col] = (
                            (tmp_df[col] - tmp_df[col].mean()) /
                            tmp_df[col].std()
                        ) * tmp_df['MERRA2'].std() + tmp_df['MERRA2'].mean()
                    ubrmsd = ubRMSD(tmp_df,
                                    alpha=alpha,
                                    n_corr=b.loc[:, :, 'n_corr'])

                    res.loc[gpi, 'n_' + scl] = len(df)

                    # calculate relative metrics for all pair-wise combinations
                    for t in list(combinations(df.columns.values, 2)):

                        res.loc[gpi, 'n_corr_' + m + '_' +
                                '_'.join(t)] = R.loc[t[0], t[1], 'n_corr']

                        res.loc[gpi, 'bias_' + m + '_l_' +
                                '_'.join(t)] = b.loc[t[0], t[1], 'CI_l_corr']
                        res.loc[gpi, 'bias_' + m + '_p_' +
                                '_'.join(t)] = b.loc[t[0], t[1], 'bias']
                        res.loc[gpi, 'bias_' + m + '_u_' +
                                '_'.join(t)] = b.loc[t[0], t[1], 'CI_u_corr']

                        res.loc[gpi, 'ubrmsd_' + m + '_l_' +
                                '_'.join(t)] = ubrmsd.loc[t[0], t[1],
                                                          'CI_l_corr']
                        res.loc[gpi, 'ubrmsd_' + m + '_p_' +
                                '_'.join(t)] = ubrmsd.loc[t[0], t[1], 'ubRMSD']
                        res.loc[gpi, 'ubrmsd_' + m + '_u_' +
                                '_'.join(t)] = ubrmsd.loc[t[0], t[1],
                                                          'CI_u_corr']

                        res.loc[gpi, 'r_' + m + '_l_' +
                                '_'.join(t)] = R.loc[t[0], t[1], 'CI_l_corr']
                        res.loc[gpi,
                                'r_' + m + '_p_' + '_'.join(t)] = R.loc[t[0],
                                                                        t[1],
                                                                        'R']
                        res.loc[gpi, 'r_' + m + '_u_' +
                                '_'.join(t)] = R.loc[t[0], t[1], 'CI_u_corr']
                        res.loc[gpi,
                                'p_' + m + '_p_' + '_'.join(t)] = R.loc[t[0],
                                                                        t[1],
                                                                        'p']

                    # calculate TCA-based metrics for all triplets except those including both SMOS and SMAP
                    for t in list(combinations(df.columns.values, 3)):
                        if (('SMOS' in t) & ('SMAP' in t)):
                            continue

                        tcstr = '_tc_' + '_'.join(t)

                        tca = TCA(df[list(t)], alpha=alpha)

                        # calculate TCA only for coarse-resolution data sets triplets that have been collocated
                        # without ISMN data
                        if (scl != 'grid') | (t[2] != 'ISMN'):

                            for s in t:
                                res.loc[gpi, 'bias_' + m + '_p_' + s +
                                        tcstr] = tca.loc['beta_p', s]
                                res.loc[gpi, 'bias_' + m + '_l_' + s +
                                        tcstr] = tca.loc['beta_l', s]
                                res.loc[gpi, 'bias_' + m + '_m_' + s +
                                        tcstr] = tca.loc['beta_m', s]
                                res.loc[gpi, 'bias_' + m + '_u_' + s +
                                        tcstr] = tca.loc['beta_u', s]

                                res.loc[gpi, 'ubrmse_' + m + '_p_' + s +
                                        tcstr] = tca.loc['ubRMSE_p', s]
                                res.loc[gpi, 'ubrmse_' + m + '_l_' + s +
                                        tcstr] = tca.loc['ubRMSE_l', s]
                                res.loc[gpi, 'ubrmse_' + m + '_m_' + s +
                                        tcstr] = tca.loc['ubRMSE_m', s]
                                res.loc[gpi, 'ubrmse_' + m + '_u_' + s +
                                        tcstr] = tca.loc['ubRMSE_u', s]

                                res.loc[gpi, 'r2_' + m + '_p_' + s +
                                        tcstr] = tca.loc['r2_p', s]
                                res.loc[gpi, 'r2_' + m + '_l_' + s +
                                        tcstr] = tca.loc['r2_l', s]
                                res.loc[gpi, 'r2_' + m + '_m_' + s +
                                        tcstr] = tca.loc['r2_m', s]
                                res.loc[gpi, 'r2_' + m + '_u_' + s +
                                        tcstr] = tca.loc['r2_u', s]

        except:
            continue

        if not result_file.exists():
            res.to_csv(result_file, float_format='%0.3f')
        else:
            res.to_csv(result_file,
                       float_format='%0.3f',
                       mode='a',
                       header=False)
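
# The result_template helper used above is not shown in these examples; a
# minimal sketch of what it plausibly returns, judging from how `res` is used
# (a one-row DataFrame indexed by the gpi, with columns added lazily through
# res.loc[gpi, <field>] = value):
def result_template_sketch(sensors, gpi):
    return pd.DataFrame(index=[gpi])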
Example #15
def run_ascat_eval_part(part, parts, ref='ascat'):

    import numpy as np
    import pandas as pd

    from pathlib import Path
    from scipy.stats import pearsonr

    from pyldas.interface import GEOSldas_io
    from myprojects.readers.ascat import HSAF_io
    from myprojects.timeseries import calc_anom
    from validation_good_practice.ancillary.paths import Paths

    res_path = Path(
        '~/Documents/work/MadKF/CLSM/SM_err_ratio/GEOSldas/validation_all'
    ).expanduser()
    if not res_path.exists():
        Path.mkdir(res_path, parents=True)

    result_file = res_path / ('ascat_eval_part%i.csv' % part)

    tc_res_pc = pd.read_csv(
        '/Users/u0116961/Documents/work/MadKF/CLSM/SM_err_ratio/GEOSldas/sm_validation/Pcorr/result.csv',
        index_col=0)
    tc_res_nopc = pd.read_csv(
        '/Users/u0116961/Documents/work/MadKF/CLSM/SM_err_ratio/GEOSldas/sm_validation/noPcorr/result.csv',
        index_col=0)

    lut = pd.read_csv(Paths().lut, index_col=0)

    # Split grid cell list for parallelization
    subs = (np.arange(parts + 1) * len(lut) / parts).astype('int')
    subs[-1] = len(lut)
    start = subs[part - 1]
    end = subs[part]

    # Look-up table that contains the grid cells to iterate over
    lut = lut.iloc[start:end, :]

    root = Path('/Users/u0116961/data_sets/GEOSldas_runs')

    runs = [run.name for run in root.glob('*_DA_SMAP_*')]
    names = [run[20::] for run in runs]

    runs += ['NLv4_M36_US_OL_Pcorr', 'NLv4_M36_US_OL_noPcorr']
    names += ['Pcorr_OL', 'noPcorr_OL']

    # names = ['OL_Pcorr', 'OL_noPcorr'] + \
    #         [f'DA_{pc}_{err}' for pc in ['Pcorr','noPcorr'] for err in ['4K','abs','anom_lt','anom_lst','anom_st']]
    # runs = ['NLv4_M36_US_OL_Pcorr', 'NLv4_M36_US_OL_noPcorr' ] + \
    #     [f'NLv4_M36_US_DA_SMAP_{pc}_{err}' for pc in ['Pcorr','noPcorr'] for err in ['4K','abs','anom_lt','anom_lst','anom_st']]

    # names = ['OL_Pcorr', 'DA_Pcorr_LTST'] + \
    #         [f'DA_{pc}_{err}{mode}' for pc in ['Pcorr'] for err in ['4K','anom_lt', 'anom_lt_ScYH', 'anom_lst','anom_st'] for mode in ['', '_ScDY', '_ScYH']]
    #
    # runs = ['NLv4_M36_US_OL_Pcorr', 'NLv4_M36_US_DA_Pcorr_LTST'] + \
    #     [f'NLv4_M36_US_DA_SMAP_{pc}_{err}{mode}' for pc in ['Pcorr'] for err in ['4K','abs','anom_lt','anom_lst','anom_st'] for mode in ['', '_ScDY', '_ScYH']]

    dss = [
        GEOSldas_io('tavg3_1d_lnr_Nt', run).timeseries
        if 'DA' in run else GEOSldas_io('SMAP_L4_SM_gph', run).timeseries
        for run in runs
    ]
    grid = GEOSldas_io('ObsFcstAna', runs[0]).grid

    ds_full = GEOSldas_io('SMAP_L4_SM_gph', 'NLv4_M36_US_OL_Pcorr').timeseries
    ds_full = ds_full.assign_coords(
        {'time': ds_full['time'].values + pd.to_timedelta('2 hours')})

    ds_obs_smap = GEOSldas_io(
        'ObsFcstAna', 'NLv4_M36_US_DA_SMAP_Pcorr_4K').timeseries['obs_obs']

    modes = ['abs', 'anom_lt', 'anom_st', 'anom_lst']

    ascat = HSAF_io()

    for cnt, (gpi, data) in enumerate(lut.iterrows()):
        print('%i / %i, gpi: %i' % (cnt, len(lut), gpi))

        col = int(data.ease2_col - grid.tilegrids.loc['domain', 'i_offg'])
        row = int(data.ease2_row - grid.tilegrids.loc['domain', 'j_offg'])

        res = pd.DataFrame(index=(gpi, ))
        res['col'] = int(data.ease2_col)
        res['row'] = int(data.ease2_row)
        res['lcol'] = col
        res['lrow'] = row

        try:
            ts_ascat = ascat.read(
                data['ascat_gpi']).resample('1d').mean().dropna()
            ts_ascat = ts_ascat[~ts_ascat.index.duplicated(keep='first')]
            ts_ascat.name = 'ASCAT'
        except:
            continue

        try:
            t_df_smap = ds_obs_smap.sel(species=[1, 2]).isel(
                lat=row, lon=col).to_pandas()
            t_ana = t_df_smap[~np.isnan(t_df_smap[1])
                              | ~np.isnan(t_df_smap[2])].index
            t_ana = pd.Series(1,
                              index=t_ana).resample('1d').mean().dropna().index
        except:
            t_ana = pd.DatetimeIndex([])

        var = 'sm_surface'
        for mode in modes:

            if mode == 'anom_lst':
                ts_ref = calc_anom(ts_ascat.copy(),
                                   mode='climatological').dropna()
            elif mode == 'anom_st':
                ts_ref = calc_anom(ts_ascat.copy(), mode='shortterm').dropna()
            elif mode == 'anom_lt':
                ts_ref = calc_anom(ts_ascat.copy(), mode='longterm').dropna()
            else:
                ts_ref = ts_ascat.dropna()

            for run, ts_model in zip(names, dss):

                try:
                    if 'noPcorr' in run:
                        r_asc = np.sqrt(tc_res_nopc.loc[
                            gpi, f'r2_grid_{mode}_m_ASCAT_tc_ASCAT_SMAP_CLSM'])
                        r_mod = np.sqrt(tc_res_nopc.loc[
                            gpi, f'r2_grid_{mode}_m_CLSM_tc_ASCAT_SMAP_CLSM'])
                    else:
                        r_asc = np.sqrt(tc_res_pc.loc[
                            gpi, f'r2_grid_{mode}_m_ASCAT_tc_ASCAT_SMAP_CLSM'])
                        r_mod = np.sqrt(tc_res_pc.loc[
                            gpi, f'r2_grid_{mode}_m_CLSM_tc_ASCAT_SMAP_CLSM'])
                except:
                    r_asc = np.nan
                    r_mod = np.nan

                ind_valid = ds_full.time.values[
                    (ds_full['snow_depth'][:, row, col].values == 0) &
                    (ds_full['soil_temp_layer1'][:, row, col].values > 277.15)]

                ts_mod = ts_model[var][:, row, col].to_series()
                ts_mod.index += pd.to_timedelta('2 hours')
                ts_mod = ts_mod.reindex(ind_valid)

                if mode == 'anom_lst':
                    ts_mod = calc_anom(ts_mod.copy(),
                                       mode='climatological').dropna()
                elif mode == 'anom_st':
                    ts_mod = calc_anom(ts_mod.copy(),
                                       mode='shortterm').dropna()
                elif mode == 'anom_lt':
                    ts_mod = calc_anom(ts_mod.copy(), mode='longterm').dropna()
                else:
                    ts_mod = ts_mod.dropna()
                ts_mod = ts_mod.resample('1d').mean()

                if 'OL_' in run:
                    res[f'r_tca_{run}_{mode}'] = r_mod

                tmp = pd.DataFrame({1: ts_ref, 2: ts_mod}).dropna()
                res[f'len_{run}_{mode}'] = len(tmp)
                r, p = pearsonr(tmp[1], tmp[2]) if len(tmp) > 10 else (np.nan,
                                                                       np.nan)
                res[f'r_{run}_{mode}'] = r
                res[f'p_{run}_{mode}'] = p
                res[f'r_corr_{run}_{mode}'] = min(r / r_asc, 1)

                tmp = pd.DataFrame({
                    1: ts_ref,
                    2: ts_mod
                }).reindex(t_ana).dropna()
                res[f'ana_len_{run}_{mode}'] = len(tmp)
                r, p = pearsonr(tmp[1], tmp[2]) if len(tmp) > 10 else (np.nan,
                                                                       np.nan)
                res[f'ana_r_{run}_{mode}'] = r
                res[f'ana_p_{run}_{mode}'] = p
                res[f'ana_r_corr_{run}_{mode}'] = min(r / r_asc, 1)

        if not result_file.exists():
            res.to_csv(result_file, float_format='%0.3f')
        else:
            res.to_csv(result_file,
                       float_format='%0.3f',
                       mode='a',
                       header=False)
Example #16
def resample_ismn():
    """
    This resamples ISMN data onto the EASE2 grid and stores data for each grid cell into .csv files.
    If single grid cells contain multiple stations, they are averaged.

    A grid look-up table needs to be created first (method: ancillary.grid.create_lut).

    """

    paths = Paths()

    io = ISMN_Interface(paths.ismn_raw)

    # get all stations / sensors for each grid cell.
    lut = pd.read_csv(paths.ismn / 'station_list.csv',index_col=0)
    lut = lut.groupby('ease2_gpi').apply(lambda x: '-'.join([i for i in x.index]))

    dir_out = paths.ismn / 'timeseries'
    if not dir_out.exists():
        dir_out.mkdir()

    for cnt, (gpi, indices) in enumerate(lut.items()):
        print('%i / %i' % (cnt, len(lut)))

        fname = dir_out / ('%i.csv' % gpi)

        idx = indices.split('-')

        # Only one station within grid cell
        if len(idx) == 1:
            try:
                ts = io.read_ts(int(idx[0]))
                ts = ts[ts['soil moisture_flag'] == 'G']['soil moisture'] # Get only "good" data based on ISMN QC
                ts.tz_convert(None).to_csv(fname, float_format='%.4f')
            except:
                print('Corrupt file: ' + io.metadata[int(idx[0])]['filename'])

        # Multiple stations within grid cell
        else:
            df = []
            for i in idx:
                try:
                    ts = io.read_ts(int(i))
                    df += [ts[ts['soil moisture_flag'] == 'G']['soil moisture']] # Get only "good" data based on ISMN QC
                except:
                    print('Corrupt file: ' + io.metadata[int(i)]['filename'])
            if len(df) == 0:
                continue

            df = pd.concat(df, axis=1)
            df.columns = np.arange(len(df.columns))

            # match temporal mean and standard deviation to those of the station with the maximum temporal coverage
            n = np.array([len(df[i].dropna()) for i in df])
            ref = np.where(n==n.max())[0][0]
            for col in df:
                if col != ref:
                    df[col] = (df[col] - df[col].mean())/df[col].std() * df[ref].std() + df[ref].mean()

            # Average measurements of all stations
            df.mean(axis='columns').tz_convert(None).to_csv(fname, float_format='%.4f')
Example #17
def run_ascat_eval_part(part, parts):

    res_path = Path(
        '/Users/u0116961/Documents/work/LDAS/2020-03_scaling/validation')
    result_file = res_path / ('ascat_eval_part%i.csv' % part)

    lut = pd.read_csv(Paths().lut, index_col=0)

    # Split grid cell list for parallelization
    subs = (np.arange(parts + 1) * len(lut) / parts).astype('int')
    subs[-1] = len(lut)
    start = subs[part - 1]
    end = subs[part]

    # Look-up table that contains the grid cells to iterate over
    lut = lut.iloc[start:end, :]

    names = ['open_loop', 'SMOSSMAP_short', 'MadKF_SMOS40']
    runs = [
        'US_M36_SMAP_TB_OL_noScl', 'US_M36_SMAP_TB_DA_scl_SMOSSMAP_short',
        'US_M36_SMOS40_TB_MadKF_DA_it613'
    ]

    dss = [LDAS_io('xhourly', run).timeseries for run in runs]
    grid = LDAS_io().grid

    # t_ana = pd.DatetimeIndex(LDAS_io('ObsFcstAna', runs[0]).timeseries.time.values).sort_values()
    ds_obs_smap = (LDAS_io('ObsFcstAna',
                           'US_M36_SMAP_TB_OL_noScl').timeseries['obs_ana'])
    ds_obs_smos = (LDAS_io(
        'ObsFcstAna', 'US_M36_SMOS40_TB_MadKF_DA_it613').timeseries['obs_ana'])

    modes = ['absolute', 'longterm', 'shortterm']

    ascat = HSAF_io()

    for cnt, (gpi, data) in enumerate(lut.iterrows()):
        print('%i / %i' % (cnt, len(lut)))

        col = int(data.ease2_col - grid.tilegrids.loc['domain', 'i_offg'])
        row = int(data.ease2_row - grid.tilegrids.loc['domain', 'j_offg'])

        res = pd.DataFrame(index=(gpi, ))
        res['col'] = int(data.ease2_col)
        res['row'] = int(data.ease2_row)
        res['lcol'] = col
        res['lrow'] = row

        try:
            ts_ascat = ascat.read(
                data['ascat_gpi'],
                resample_time=False).resample('1d').mean().dropna()
            ts_ascat = ts_ascat[~ts_ascat.index.duplicated(keep='first')]
            ts_ascat.name = 'ASCAT'
        except:
            continue

        t_df_smap = ds_obs_smap.sel(species=[1, 2]).isel(lat=row,
                                                         lon=col).to_pandas()
        t_df_smos = ds_obs_smos.sel(species=[1, 2]).isel(lat=row,
                                                         lon=col).to_pandas()
        t_ana_smap = t_df_smap[~np.isnan(t_df_smap[1])
                               | ~np.isnan(t_df_smap[2])].resample(
                                   '1d').mean().index
        t_ana_smos = t_df_smos[~np.isnan(t_df_smos[1])
                               | ~np.isnan(t_df_smos[2])].resample(
                                   '1d').mean().index

        var = 'sm_surface'
        for mode in modes:

            if mode == 'absolute':
                ts_ref = ts_ascat.copy()
            else:
                ts_ref = calc_anom(ts_ascat.copy(),
                                   longterm=(mode == 'longterm')).dropna()

            for run, ts_model in zip(names, dss):

                t_ana = t_ana_smos if run == 'MadKF_SMOS40' else t_ana_smap

                ind = (ts_model['snow_mass'][:, row, col].values == 0) & (
                    ts_model['soil_temp_layer1'][:, row, col].values > 277.15)
                ts_mod = ts_model[var][:, row, col].to_series().loc[ind]
                ts_mod.index += pd.to_timedelta('2 hours')

                if mode == 'absolute':
                    ts_mod = ts_mod.dropna()
                else:
                    ts_mod = calc_anom(ts_mod,
                                       longterm=mode == 'longterm').dropna()
                ts_mod = ts_mod.resample('1d').mean()

                tmp = pd.DataFrame({1: ts_ref, 2: ts_mod}).dropna()
                res['len_' + run + '_' + mode] = len(tmp)
                r, p = pearsonr(tmp[1], tmp[2]) if len(tmp) > 10 else (np.nan,
                                                                       np.nan)
                res['r_' + run + '_' + mode] = r
                # res['p_' + run + '_' + mode] = p
                # res['rmsd_' + run + '_' + mode] = np.sqrt(((tmp[1] - tmp[2]) ** 2).mean())
                res['ubrmsd_' + run + '_' + mode] = np.sqrt(
                    (((tmp[1] - tmp[1].mean()) -
                      (tmp[2] - tmp[2].mean()))**2).mean())

                tmp = pd.DataFrame({
                    1: ts_ref,
                    2: ts_mod
                }).reindex(t_ana).dropna()
                res['ana_len_' + run + '_' + mode] = len(tmp)
                r, p = pearsonr(tmp[1], tmp[2]) if len(tmp) > 10 else (np.nan,
                                                                       np.nan)
                res['ana_r_' + run + '_' + mode] = r
                # res['ana_p_' + run + '_' + mode] = p
                # res['ana_rmsd_' + run + '_' + mode] = np.sqrt(((tmp[1] - tmp[2]) ** 2).mean())
                res['ana_ubrmsd_' + run + '_' + mode] = np.sqrt(
                    (((tmp[1] - tmp[1].mean()) -
                      (tmp[2] - tmp[2].mean()))**2).mean())

        if not result_file.exists():
            res.to_csv(result_file, float_format='%0.3f')
        else:
            res.to_csv(result_file,
                       float_format='%0.3f',
                       mode='a',
                       header=False)
Example #18
def EC_ascat_smap_ismn_ldas():

    result_file = Path('/Users/u0116961/Documents/work/extended_collocation/ec_ascat_smap_ismn_ldas.csv')

    names = ['insitu', 'ascat', 'smap', 'ol', 'da']
    combs = list(combinations(names, 2))

    ds_ol = LDAS_io('xhourly', 'US_M36_SMAP_TB_OL_noScl').timeseries
    ds_da = LDAS_io('xhourly', 'US_M36_SMAP_TB_MadKF_DA_it11').timeseries
    ds_da_ana = LDAS_io('ObsFcstAna', 'US_M36_SMAP_TB_MadKF_DA_it11').timeseries['obs_ana']
    tg = LDAS_io().grid.tilegrids

    modes = ['absolute','longterm','shortterm']

    ismn = ISMN_io()
    ismn.list = ismn.list.iloc[70::]
    ascat = HSAF_io()
    smap = SMAP_io()

    lut = pd.read_csv(Paths().lut, index_col=0)

    i = 0
    for meta, ts_insitu in ismn.iter_stations(surface_only=True):
        i += 1
        logging.info('%i/%i' % (i, len(ismn.list)))

        try:
            if len(ts_insitu := ts_insitu['2015-04-01':'2020-04-01'].resample('1d').mean().dropna()) < 25:
                continue
        except:
            continue

        res = pd.DataFrame(meta.copy()).transpose()
        col = meta.ease_col
        row = meta.ease_row

        colg = col + tg.loc['domain', 'i_offg']  # col / lon
        rowg = row + tg.loc['domain', 'j_offg']  # row / lat

        tmp_lut = lut[(lut.ease2_col == colg) & (lut.ease2_row == rowg)]
        if len(tmp_lut) == 0:
            continue

        gpi_smap = tmp_lut.index.values[0]
        gpi_ascat = tmp_lut.ascat_gpi.values[0]

        try:
            ts_ascat = ascat.read(gpi_ascat, resample_time=False).resample('1d').mean().dropna()
            ts_ascat = ts_ascat[~ts_ascat.index.duplicated(keep='first')]
            ts_ascat.name = 'ASCAT'
        except:
            continue

        ts_smap = smap.read(gpi_smap)

        if (ts_ascat is None) | (ts_smap is None):
            continue

        ind = (ds_ol['snow_mass'][:, row, col].values == 0)&(ds_ol['soil_temp_layer1'][:, row, col].values > 277.15)
        ts_ol = ds_ol['sm_surface'][:, row, col].to_series().loc[ind].dropna()
        ts_ol.index += pd.to_timedelta('2 hours')

        ind = (ds_da['snow_mass'][:, row, col].values == 0)&(ds_da['soil_temp_layer1'][:, row, col].values > 277.15)
        ts_da = ds_da['sm_surface'][:, row, col].to_series().loc[ind].dropna()
        ts_da.index += pd.to_timedelta('2 hours')

        for mode in modes:

            # Derive anomalies from the original series each time (rather than from the
            # previous mode's output) and keep ts_ol / ts_da untouched by using separate
            # per-mode variables.
            if mode == 'absolute':
                ts_ins = ts_insitu.copy()
                ts_asc = ts_ascat.copy()
                ts_smp = ts_smap.copy()
                ts_ol_m = ts_ol.copy()
                ts_da_m = ts_da.copy()
            else:
                ts_ins = calc_anom(ts_insitu.copy(), longterm=(mode == 'longterm')).dropna()
                ts_asc = calc_anom(ts_ascat.copy(), longterm=(mode == 'longterm')).dropna()
                ts_smp = calc_anom(ts_smap.copy(), longterm=(mode == 'longterm')).dropna()
                ts_ol_m = calc_anom(ts_ol.copy(), longterm=(mode == 'longterm')).dropna()
                ts_da_m = calc_anom(ts_da.copy(), longterm=(mode == 'longterm')).dropna()

            tmp = pd.DataFrame(dict(zip(names, [ts_ins, ts_asc, ts_smp, ts_ol_m, ts_da_m]))).dropna()

            corr = tmp.corr()
            ec_res = ecol(tmp[['insitu', 'ascat', 'smap', 'ol', 'da']], correlated=[['smap', 'ol'], ['smap', 'da'], ['ol', 'da']])

            res[f'len_{mode}'] = len(tmp)
            for c in combs:
                # key by mode so that the three modes do not overwrite each other
                res[f'corr_{"_".join(c)}_{mode}'] = corr.loc[c]
            res[f'err_corr_smap_ol_{mode}'] = ec_res['err_corr_smap_ol']
            res[f'err_corr_smap_da_{mode}'] = ec_res['err_corr_smap_da']
            res[f'err_corr_ol_da_{mode}'] = ec_res['err_corr_ol_da']

        if not result_file.exists():
            res.to_csv(result_file, float_format='%0.4f')
        else:
            res.to_csv(result_file, float_format='%0.4f', mode='a', header=False)
Example #19
def main(part):

    parts = 30

    sensors = ['ASCAT', 'SMOS', 'MERRA2', 'ISMN']
    # sensors = ['ASCAT', 'SMOS', 'SMAP', 'MERRA2', 'ISMN']

    paths = Paths()

    result_file = paths.result_root / ('_'.join(sensors)) / ('result_%i.csv' %
                                                             part)
    if not result_file.parent.exists():
        result_file.parent.mkdir(parents=True)

    lut = pd.read_csv(paths.lut, index_col=0)

    # Split grid cell list for parallelization
    subs = (np.arange(parts + 1) * len(lut) / parts).astype('int')
    subs[-1] = len(lut)
    start = subs[part - 1]
    end = subs[part]

    lut = lut.iloc[start:end, :]

    io = reader(sensors)

    for cnt, (gpi, data) in enumerate(lut.iterrows()):
        print('%i / %i' % (cnt, len(lut)))

        res = result_template(sensors, gpi)

        res.loc[gpi, 'col'] = data.ease2_col
        res.loc[gpi, 'row'] = data.ease2_row

        try:
            mode, dfs = io.read(gpi)

            for m, df in zip(mode, dfs):

                if df is not None:

                    res.loc[gpi, 'n_corr_' + m + '_tc'] = correct_n(df)

                    scl = m[0:4]

                    if scl == 'grid':
                        res.loc[gpi, 'n_grid'] = len(df)
                    else:
                        res.loc[gpi, 'n_ismn'] = len(df)

                    b = bias(df)
                    R = Pearson_R(df, n_corr=b.loc[:, :, 'n_corr'])

                    # rescale all columns to MERRA2 before calculating ubRMSD
                    tmp_df = df.copy()
                    # for col in ['ASCAT','SMOS','SMAP']:
                    for col in ['ASCAT', 'SMOS']:
                        tmp_df.loc[:, col] = (
                            (tmp_df[col] - tmp_df[col].mean()) /
                            tmp_df[col].std()
                        ) * tmp_df['MERRA2'].std() + tmp_df['MERRA2'].mean()
                    ubrmsd = ubRMSD(tmp_df, n_corr=b.loc[:, :, 'n_corr'])

                    res.loc[gpi, 'n_' + scl] = len(df)

                    for t in list(combinations(df.columns.values, 2)):

                        res.loc[gpi, 'n_corr_' + m + '_' +
                                '_'.join(t)] = R.loc[t[0], t[1], 'n_corr']

                        res.loc[gpi, 'bias_' + m + '_l_' +
                                '_'.join(t)] = b.loc[t[0], t[1], 'CI_l_corr']
                        res.loc[gpi, 'bias_' + m + '_p_' +
                                '_'.join(t)] = b.loc[t[0], t[1], 'bias']
                        res.loc[gpi, 'bias_' + m + '_u_' +
                                '_'.join(t)] = b.loc[t[0], t[1], 'CI_u_corr']

                        res.loc[gpi, 'ubrmsd_' + m + '_l_' +
                                '_'.join(t)] = ubrmsd.loc[t[0], t[1],
                                                          'CI_l_corr']
                        res.loc[gpi, 'ubrmsd_' + m + '_p_' +
                                '_'.join(t)] = ubrmsd.loc[t[0], t[1], 'ubRMSD']
                        res.loc[gpi, 'ubrmsd_' + m + '_u_' +
                                '_'.join(t)] = ubrmsd.loc[t[0], t[1],
                                                          'CI_u_corr']

                        res.loc[gpi, 'r_' + m + '_l_' +
                                '_'.join(t)] = R.loc[t[0], t[1], 'CI_l_corr']
                        res.loc[gpi,
                                'r_' + m + '_p_' + '_'.join(t)] = R.loc[t[0],
                                                                        t[1],
                                                                        'R']
                        res.loc[gpi, 'r_' + m + '_u_' +
                                '_'.join(t)] = R.loc[t[0], t[1], 'CI_u_corr']
                        res.loc[gpi,
                                'p_' + m + '_p_' + '_'.join(t)] = R.loc[t[0],
                                                                        t[1],
                                                                        'p']

                    for t in list(combinations(df.columns.values, 3)):

                        if (('SMOS' in t) & ('SMAP' in t)):
                            continue

                        tcstr = '_tc_' + '_'.join(t)

                        tca = TCA(df[list(t)])

                        if (scl != 'grid') | (t[2] != 'ISMN'):

                            for s in t:
                                res.loc[gpi, 'bias_' + m + '_p_' + s +
                                        tcstr] = tca.loc['beta_p', s]
                                res.loc[gpi, 'bias_' + m + '_l_' + s +
                                        tcstr] = tca.loc['beta_l', s]
                                res.loc[gpi, 'bias_' + m + '_m_' + s +
                                        tcstr] = tca.loc['beta_m', s]
                                res.loc[gpi, 'bias_' + m + '_u_' + s +
                                        tcstr] = tca.loc['beta_u', s]

                                res.loc[gpi, 'ubrmse_' + m + '_p_' + s +
                                        tcstr] = tca.loc['ubRMSE_p', s]
                                res.loc[gpi, 'ubrmse_' + m + '_l_' + s +
                                        tcstr] = tca.loc['ubRMSE_l', s]
                                res.loc[gpi, 'ubrmse_' + m + '_m_' + s +
                                        tcstr] = tca.loc['ubRMSE_m', s]
                                res.loc[gpi, 'ubrmse_' + m + '_u_' + s +
                                        tcstr] = tca.loc['ubRMSE_u', s]

                                res.loc[gpi, 'r2_' + m + '_p_' + s +
                                        tcstr] = tca.loc['r2_p', s]
                                res.loc[gpi, 'r2_' + m + '_l_' + s +
                                        tcstr] = tca.loc['r2_l', s]
                                res.loc[gpi, 'r2_' + m + '_m_' + s +
                                        tcstr] = tca.loc['r2_m', s]
                                res.loc[gpi, 'r2_' + m + '_u_' + s +
                                        tcstr] = tca.loc['r2_u', s]

        except:
            continue

        if not result_file.exists():
            res.to_csv(result_file, float_format='%0.3f')
        else:
            res.to_csv(result_file,
                       float_format='%0.3f',
                       mode='a',
                       header=False)
Example #20
def run_ascat_eval_smos_part(part, parts, ref='ascat'):

    periods = [
        ['2010-04-01', '2020-04-01'],
        ['2010-04-01', '2015-04-01'],
        ['2015-04-01', '2020-04-01'],
        ['2010-04-01', '2012-10-01'],
        ['2012-10-01', '2015-04-01'],
        ['2015-04-01', '2017-10-01'],
        ['2017-10-01', '2020-04-01'],
    ]

    res_path = Path(
        f'~/Documents/work/MadKF/CLSM/SMOS40/validation/multiperiod/ascat'
    ).expanduser()
    if not res_path.exists():
        Path.mkdir(res_path, parents=True)

    result_file = res_path / f'ascat_eval_smos_part{part}.csv'

    lut = pd.read_csv(Paths().lut, index_col=0)

    # Split grid cell list for parallelization
    subs = (np.arange(parts + 1) * len(lut) / parts).astype('int')
    subs[-1] = len(lut)
    start = subs[part - 1]
    end = subs[part]

    # Look-up table that contains the grid cells to iterate over
    lut = lut.iloc[start:end, :]

    names = ['open_loop'] + [f'SMOS40_it62{i}' for i in range(1, 5)]
    runs = ['US_M36_SMOS40_TB_OL_noScl'
            ] + [f'US_M36_SMOS40_TB_MadKF_DA_it62{i}' for i in range(1, 5)]

    grid = LDAS_io('ObsFcstAna', runs[0]).grid
    dss_xhourly = [LDAS_io('xhourly', run).timeseries for run in runs]
    dss_obs_ana = [
        LDAS_io('ObsFcstAna', run).timeseries['obs_ana'] for run in runs
    ]

    modes = ['absolute', 'longterm', 'shortterm']

    ascat = HSAF_io()

    for cnt, (gpi, data) in enumerate(lut.iterrows()):
        print('%i / %i' % (cnt, len(lut)))

        col = int(data.ease2_col - grid.tilegrids.loc['domain', 'i_offg'])
        row = int(data.ease2_row - grid.tilegrids.loc['domain', 'j_offg'])

        res = pd.DataFrame(index=(gpi, ))
        res['col'] = int(data.ease2_col)
        res['row'] = int(data.ease2_row)
        res['lcol'] = col
        res['lrow'] = row

        try:
            ts_ascat = ascat.read(
                data['ascat_gpi'],
                resample_time=False).resample('1d').mean().dropna()
            ts_ascat = ts_ascat[~ts_ascat.index.duplicated(keep='first')]
            ts_ascat.name = 'ASCAT'
        except:
            continue

        dfs = [
            ds.sel(species=[1, 2]).isel(
                lat=row, lon=col).to_pandas().resample('1d').mean()
            for ds in dss_obs_ana
        ]
        idx = [df[np.any(~np.isnan(df), axis=1)].index for df in dfs]

        t_ana = idx[0].intersection(idx[1]).intersection(idx[2]).intersection(
            idx[3])

        var = 'sm_surface'
        for mode in modes:

            if mode == 'absolute':
                ts_ref = ts_ascat.copy()
            else:
                ts_ref = calc_anom(ts_ascat.copy(),
                                   longterm=(mode == 'longterm')).dropna()

            for run, ts_model in zip(names, dss_xhourly):

                ind = (ts_model['snow_mass'][:, row, col].values == 0) & (
                    ts_model['soil_temp_layer1'][:, row, col].values > 277.15)
                ts_mod = ts_model[var][:, row, col].to_series().loc[ind]
                ts_mod.index += pd.to_timedelta('2 hours')

                if mode == 'absolute':
                    ts_mod = ts_mod.dropna()
                else:
                    ts_mod = calc_anom(ts_mod,
                                       longterm=mode == 'longterm').dropna()
                ts_mod = ts_mod.reindex(t_ana).dropna()

                for i, p in enumerate(periods):
                    tmp = pd.DataFrame({
                        1: ts_ref,
                        2: ts_mod
                    })[p[0]:p[1]].dropna()
                    res[f'p{i}_len_{run}_{mode}'] = len(tmp)
                    r, p = pearsonr(
                        tmp[1], tmp[2]) if len(tmp) > 10 else (np.nan, np.nan)
                    res[f'p{i}_r_{run}_{mode}'] = r

        if not result_file.exists():
            res.to_csv(result_file, float_format='%0.3f')
        else:
            res.to_csv(result_file,
                       float_format='%0.3f',
                       mode='a',
                       header=False)
Example #21
def reshuffle_smos():

    paths = Paths()

    # Collect all nc files
    path = paths.smos / 'raw' / 'MIR_SMUDP2_nc'
    nc_files = sorted(path.glob('**/*.nc'))

    # Get time stamp as the mean of start-of-orbit and end-of-orbit
    sdate = pd.to_datetime([str(f)[-44:-29] for f in nc_files])
    edate = pd.to_datetime([str(f)[-28:-13] for f in nc_files])
    dates = (sdate + (edate - sdate) / 2.).round('min')

    # get a list of all CONUS gpis
    gpi_lut = pd.read_csv(paths.lut, index_col=0)['smos_gpi']
    ease_gpis = gpi_lut.index.values

    # Array with ALL possible dates and ALL CONUS gpis
    res_arr = np.full((len(dates), len(ease_gpis)), np.nan)

    # Fill in result array from orbit files
    for i, f in enumerate(nc_files):
        print("%i / %i" % (i, len(nc_files)))

        ds = Dataset(f)
        smos_gpis = ds.variables['Grid_Point_ID'][:]

        # Check for valid data within orbit files
        for res_ind, ease_gpi in enumerate(ease_gpis):
            smos_ind = np.where(smos_gpis == gpi_lut.loc[ease_gpi])[0]
            if len(smos_ind) > 0:

                sm = float(ds.variables['Soil_Moisture'][smos_ind])
                if np.isnan(sm) | (sm < 0.):
                    continue

                rfi = float(ds.variables['RFI_Prob'][smos_ind])
                chi_2_p = float(ds.variables['Chi_2_P'][smos_ind])
                valid = (rfi < 0.1) & (chi_2_p > 0.05)

                # cf = float(ds.variables['Confidence_Flags'][smos_ind])
                # if np.isnan(cf):
                #     continue
                # cf = int(cf)
                # sf = long(ds.variables['Science_Flags'][smos_ind])
                #
                # valid = ((cf & 1 << 1) | (cf & 1 << 2) | (cf & 1 << 4) | (cf & 1 << 5) | (cf & 1 << 6) |
                #         (sf & 1 << 5) | (sf & 1 << 16) | (sf & 1 << 18) | (sf & 1 << 19) | (sf & 1 << 26) == 0) & \
                #         (rfi < 0.1)

                if valid:
                    res_arr[i, res_ind] = sm

        ds.close()

    # Write out valid time series of all CONUS GPIs into separate .csv files
    dir_out = paths.smos / 'timeseries'
    for i, gpi in enumerate(ease_gpis):
        Ser = pd.Series(res_arr[:, i], index=dates).dropna()
        if len(Ser) > 0:
            Ser = Ser.groupby(Ser.index).last()
            fname = dir_out / ('%i.csv' % gpi)
            Ser.to_csv(fname, float_format='%.4f')
Example #22
def create_lut():

    initiate = True
    add_ascat = True
    add_smos = True
    add_merra = True

    paths = Paths()

    fname = paths.lut

    # Rough bounding coordinates to pre-clip CONUS for speeding up calculations
    lonmin = -125.
    lonmax = -66.5
    latmin = 24.5
    latmax = 49.5

    if initiate is True:
        grid = EASE2()
        lons, lats = np.meshgrid(grid.ease_lons, grid.ease_lats)
        cols, rows = np.meshgrid(np.arange(len(grid.ease_lons)),
                                 np.arange(len(grid.ease_lats)))

        lut = pd.DataFrame({
            'ease2_col': cols.flatten(),
            'ease2_row': rows.flatten(),
            'ease2_lon': lons.flatten(),
            'ease2_lat': lats.flatten(),
            'ascat_gpi': -1,
            'ascat_lon': np.nan,
            'ascat_lat': np.nan,
            'smos_gpi': -1,
            'smos_lon': np.nan,
            'smos_lat': np.nan,
            'merra2_lon': np.nan,
            'merra2_lat': np.nan
        })

        lut = lut[(lut.ease2_lon >= lonmin) & (lut.ease2_lon <= lonmax) &
                  (lut.ease2_lat >= latmin) & (lut.ease2_lat <= latmax)]
    else:
        lut = pd.read_csv(fname, index_col=0)

    # ------------------------------------------------------------------------------------------------------------------
    # A list of ASCAT gpis over the USA can be exported from https://www.geo.tuwien.ac.at/dgg/index.php
    # This list is used here to restrict EASE2-grid cells to CONUS only.
    if add_ascat is True:
        ascat_gpis = pd.read_csv(paths.ascat / 'warp5_grid' /
                                 'pointlist_United States of America_warp.csv',
                                 index_col=0)
        ascat_gpis = ascat_gpis[(ascat_gpis.lon >= lonmin)
                                & (ascat_gpis.lon <= lonmax) &
                                (ascat_gpis.lat >= latmin) &
                                (ascat_gpis.lat <= latmax)]

        ascat_gpis['ease2_gpi'] = -1
        ascat_gpis['r'] = -1

        # Get EASE grid indices and distances for each ASCAT grid cell
        for i, (idx, data) in enumerate(ascat_gpis.iterrows()):
            print('%i / %i' % (i, len(ascat_gpis)))

            r = (lut.ease2_lon - data.lon)**2 + (lut.ease2_lat - data.lat)**2
            ascat_gpis.loc[idx, 'ease2_gpi'] = lut[(
                r - r.min()) < 0.0001].index.values[0]
            ascat_gpis.loc[idx, 'r'] = r[(r - r.min()) < 0.0001].values[0]

        # Find the nearest matching ASCAT grid cell for each EASE grid cell
        for i, (idx, data) in enumerate(lut.iterrows()):
            print('%i / %i' % (i, len(lut)))

            matches = ascat_gpis[ascat_gpis.ease2_gpi == idx]
            if len(matches) > 0:
                match = matches[(matches.r - matches.r.min()) < 0.0001]
                lut.loc[idx, 'ascat_gpi'] = match.index.values[0]
                lut.loc[idx, 'ascat_lon'] = match['lon'].values[0]
                lut.loc[idx, 'ascat_lat'] = match['lat'].values[0]

        # Remove grid cells that don't have a matching ASCAT cell
        lut = lut[lut.ascat_gpi != -1]

    # ------------------------------------------------------------------------------------------------------------------
    # Read SMOS grid information and clip CONUS
    if add_smos is True:
        smos = pd.read_csv(paths.smos / 'smos_grid.txt',
                           delim_whitespace=True,
                           names=['gpi', 'lon', 'lat', 'alt', 'wf'])
        smos = smos[(smos.lon >= lonmin) & (smos.lon <= lonmax) &
                    (smos.lat >= latmin) & (smos.lat <= latmax)]

        # Find closest SMOS gpis and append to the EASE lookup-table
        for i, (idx, data) in enumerate(lut.iterrows()):
            print('%i / %i' % (i, len(lut)))

            r = (smos.lon - data.ease2_lon)**2 + (smos.lat - data.ease2_lat)**2
            lut.loc[idx,
                    'smos_gpi'] = smos[(r - r.min()) < 0.0001]['gpi'].values[0]
            lut.loc[idx,
                    'smos_lon'] = smos[(r - r.min()) < 0.0001]['lon'].values[0]
            lut.loc[idx,
                    'smos_lat'] = smos[(r - r.min()) < 0.0001]['lat'].values[0]

    # ------------------------------------------------------------------------------------------------------------------
    # Read MERRA grid information (lats/lons taken from a CONUS netcdf image subset)
    if add_merra is True:
        merra = Dataset(paths.merra2 / 'raw' / '2015-2018' /
                        'MERRA2_400.tavg1_2d_lnd_Nx.20150101.SUB.nc')
        lons, lats = np.meshgrid(merra.variables['lon'][:].data,
                                 merra.variables['lat'][:].data)
        lons = lons.flatten()
        lats = lats.flatten()

        # Find closest MERRA gpis and append coordinates to the EASE lookup-table
        for i, (idx, data) in enumerate(lut.iterrows()):
            print('%i / %i' % (i, len(lut)))

            r = (lons - data.ease2_lon)**2 + (lats - data.ease2_lat)**2
            lut.loc[idx, 'merra2_lon'] = lons[np.where((r - r.min()) < 0.0001)]
            lut.loc[idx, 'merra2_lat'] = lats[np.where((r - r.min()) < 0.0001)]

    lut.to_csv(fname, float_format='%.6f')
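
# The nearest-neighbour searches above pick the point minimising the squared
# lon/lat distance via the `(r - r.min()) < 0.0001` mask; an equivalent and
# slightly more direct formulation with numpy.argmin (a sketch only, not a
# drop-in replacement for the pandas-indexed variants above):
def nearest_index(lons, lats, lon0, lat0):
    r = (np.asarray(lons) - lon0) ** 2 + (np.asarray(lats) - lat0) ** 2
    return int(np.argmin(r))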
Example #23
def resample_smos():
    """
    This resamples SMOS data from the SMOS grid onto the EASE2 grid and stores data for each grid cell into .csv files.

    A grid look-up table needs to be created first (method: ancillary.grid.create_lut).

    """

    paths = Paths()

    # Collect all .nc files
    path = paths.smos_raw
    nc_files = sorted(path.glob('**/*.nc'))

    # Get time stamp as the mean of start-of-orbit and end-of-orbit
    sdate = pd.to_datetime([str(f)[-44:-29] for f in nc_files])
    edate = pd.to_datetime([str(f)[-28:-13] for f in nc_files])
    dates = (sdate + (edate - sdate) / 2.).round('min')

    # get a list of all CONUS gpis
    gpi_lut = pd.read_csv(paths.lut, index_col=0)['smos_gpi']
    ease_gpis = gpi_lut.index.values

    # Array with ALL possible dates and ALL CONUS gpis
    res_arr = np.full((len(dates), len(ease_gpis)), np.nan)

    # Fill in result array from orbit files
    for i, f in enumerate(nc_files):
        print("%i / %i" % (i, len(nc_files)))

        ds = Dataset(f)
        smos_gpis = ds.variables['Grid_Point_ID'][:]

        # Check for valid data within orbit files
        for res_ind, ease_gpi in enumerate(ease_gpis):
            smos_ind = np.where(smos_gpis == gpi_lut.loc[ease_gpi])[0]
            if len(smos_ind) > 0:

                # extract soil moisture data
                sm = float(ds.variables['Soil_Moisture'][smos_ind])
                if np.isnan(sm) | (sm < 0.):
                    continue

                # Mask for RFI and Chi-2 flag
                rfi = float(ds.variables['RFI_Prob'][smos_ind])
                chi_2_p = float(ds.variables['Chi_2_P'][smos_ind])
                valid = (rfi < 0.1) & (chi_2_p > 0.05)

                if valid:
                    res_arr[i, res_ind] = sm

        ds.close()

    # Write out valid time series of all CONUS GPIs into separate .csv files
    dir_out = paths.smos / 'timeseries'
    if not dir_out.exists():
        dir_out.mkdir()

    for i, gpi in enumerate(ease_gpis):
        Ser = pd.Series(res_arr[:, i], index=dates).dropna()
        if len(Ser) > 0:
            Ser = Ser.groupby(
                Ser.index).last()  # Make sure that no time duplicates exist!
            fname = dir_out / ('%i.csv' % gpi)
            Ser.to_csv(fname, float_format='%.4f')