Example #1
def calc_gamma_map():
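    # Estimate the gamma parameter (the API-model autocorrelation used in the
    # later examples) for each MSWEP grid cell from collocated precipitation
    # and ASCAT soil moisture anomalies, and write the result to .csv.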

    fname = r"D:\data_sets\MSWEP_V21\data\grid_new.csv"

    ascat = HSAF_io()
    mswep = MSWEP_io()

    mswep.grid['gamma'] = np.nan

    for i, (precip, info) in enumerate(mswep.iter_gp()):
        print(i)

        if len(precip.dropna()) == 0:
            continue
        try:
            precip = calc_anomaly(precip, method='harmonic', longterm=False)
            sm = calc_anomaly(ascat.read(
                info.dgg_gpi)['2007-01-01':'2016-12-31'],
                              method='harmonic',
                              longterm=False)
            ts = pd.concat((precip, sm), axis=1).values
            mswep.grid.loc[info.name,
                           'gamma'] = estimate_gamma(ts[:, 0], ts[:, 1])
        except Exception:
            continue

    mswep.grid.dropna().to_csv(fname)
Example #2
def read_data():
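    # Read collocated time series for a single pixel (i_lat / i_lon of the
    # Copernicus DMP grid): MERRA2 surface soil moisture, Copernicus dry
    # matter productivity (DMP), and ASCAT sigma40 backscatter.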

    i_lat = 750
    i_lon = 750

    ascat = HSAF_io()
    merra2 = Dataset('/Users/u0116961/data_sets/MERRA2/MERRA2_timeseries.nc4')

    with Dataset(
            '/Users/u0116961/data_sets/DMP_COPERNICUS/DMP_COPERNICUS_timeseries.nc'
    ) as ds:
        time = pd.DatetimeIndex(
            num2date(ds['time'][:],
                     units=ds['time'].units,
                     only_use_python_datetimes=True,
                     only_use_cftime_datetimes=False))
        dmp_ts = pd.DataFrame({'DMP': ds['DMP'][:, i_lat, i_lon]}, index=time)
        lat = ds['lat'][i_lat].data
        lon = ds['lon'][i_lon].data

    ind_lat = abs(merra2['lat'][:] - lat).argmin()
    ind_lon = abs(merra2['lon'][:] - lon).argmin()
    gpi_ascat = ascat.latlon2gpi(lat, lon)

    time = pd.DatetimeIndex(
        num2date(merra2['time'][:],
                 units=merra2['time'].units,
                 only_use_python_datetimes=True,
                 only_use_cftime_datetimes=False))
    df = pd.DataFrame(
        {
            'time': time,
            'sm': merra2['SFMC'][:, ind_lat, ind_lon],
            'DMP': dmp_ts.reindex(time).values.flatten() / 10,
            'sig40_ascat': ascat.read(gpi_ascat, resample_time=True,
                                      var='sigma40').reindex(time).values
        },
        index=time)
    merra2.close()
    ascat.close()

    return df
Example #3
def run_ascat_eval_part(part, parts, ref='ascat'):
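    # Evaluate GEOSldas OL/DA runs against ASCAT for one parallelized part of
    # the grid cell list: skill in absolute values and in long-term,
    # short-term, and climatological anomalies, with and without restriction
    # to analysis times, including TCA-based correction of the correlations.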

    import numpy as np
    import pandas as pd

    from pathlib import Path
    from scipy.stats import pearsonr

    from pyldas.interface import GEOSldas_io
    from myprojects.readers.ascat import HSAF_io
    from myprojects.timeseries import calc_anom
    from validation_good_practice.ancillary.paths import Paths

    res_path = Path(
        '~/Documents/work/MadKF/CLSM/SM_err_ratio/GEOSldas/validation_all'
    ).expanduser()
    if not res_path.exists():
        Path.mkdir(res_path, parents=True)

    result_file = res_path / ('ascat_eval_part%i.csv' % part)

    tc_res_pc = pd.read_csv(
        '/Users/u0116961/Documents/work/MadKF/CLSM/SM_err_ratio/GEOSldas/sm_validation/Pcorr/result.csv',
        index_col=0)
    tc_res_nopc = pd.read_csv(
        '/Users/u0116961/Documents/work/MadKF/CLSM/SM_err_ratio/GEOSldas/sm_validation/noPcorr/result.csv',
        index_col=0)

    lut = pd.read_csv(Paths().lut, index_col=0)

    # Split grid cell list for parallelization
    subs = (np.arange(parts + 1) * len(lut) / parts).astype('int')
    subs[-1] = len(lut)
    start = subs[part - 1]
    end = subs[part]
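    # e.g., with len(lut) == 1000 and parts == 4, subs == [0, 250, 500, 750, 1000],
    # so part 1 covers rows 0:250, part 2 rows 250:500, etc. (parts are 1-based).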

    # Look-up table that contains the grid cells to iterate over
    lut = lut.iloc[start:end, :]

    root = Path('/Users/u0116961/data_sets/GEOSldas_runs')

    runs = [run.name for run in root.glob('*_DA_SMAP_*')]
    names = [run[20::] for run in runs]

    runs += ['NLv4_M36_US_OL_Pcorr', 'NLv4_M36_US_OL_noPcorr']
    names += ['Pcorr_OL', 'noPcorr_OL']

    # names = ['OL_Pcorr', 'OL_noPcorr'] + \
    #         [f'DA_{pc}_{err}' for pc in ['Pcorr','noPcorr'] for err in ['4K','abs','anom_lt','anom_lst','anom_st']]
    # runs = ['NLv4_M36_US_OL_Pcorr', 'NLv4_M36_US_OL_noPcorr' ] + \
    #     [f'NLv4_M36_US_DA_SMAP_{pc}_{err}' for pc in ['Pcorr','noPcorr'] for err in ['4K','abs','anom_lt','anom_lst','anom_st']]

    # names = ['OL_Pcorr', 'DA_Pcorr_LTST'] + \
    #         [f'DA_{pc}_{err}{mode}' for pc in ['Pcorr'] for err in ['4K','anom_lt', 'anom_lt_ScYH', 'anom_lst','anom_st'] for mode in ['', '_ScDY', '_ScYH']]
    #
    # runs = ['NLv4_M36_US_OL_Pcorr', 'NLv4_M36_US_DA_Pcorr_LTST'] + \
    #     [f'NLv4_M36_US_DA_SMAP_{pc}_{err}{mode}' for pc in ['Pcorr'] for err in ['4K','abs','anom_lt','anom_lst','anom_st'] for mode in ['', '_ScDY', '_ScYH']]

    dss = [
        GEOSldas_io('tavg3_1d_lnr_Nt', run).timeseries
        if 'DA' in run else GEOSldas_io('SMAP_L4_SM_gph', run).timeseries
        for run in runs
    ]
    grid = GEOSldas_io('ObsFcstAna', runs[0]).grid

    ds_full = GEOSldas_io('SMAP_L4_SM_gph', 'NLv4_M36_US_OL_Pcorr').timeseries
    ds_full = ds_full.assign_coords(
        {'time': ds_full['time'].values + pd.to_timedelta('2 hours')})

    ds_obs_smap = GEOSldas_io(
        'ObsFcstAna', 'NLv4_M36_US_DA_SMAP_Pcorr_4K').timeseries['obs_obs']

    modes = ['abs', 'anom_lt', 'anom_st', 'anom_lst']

    ascat = HSAF_io()

    for cnt, (gpi, data) in enumerate(lut.iterrows()):
        print('%i / %i, gpi: %i' % (cnt, len(lut), gpi))

        col = int(data.ease2_col - grid.tilegrids.loc['domain', 'i_offg'])
        row = int(data.ease2_row - grid.tilegrids.loc['domain', 'j_offg'])

        res = pd.DataFrame(index=(gpi, ))
        res['col'] = int(data.ease2_col)
        res['row'] = int(data.ease2_row)
        res['lcol'] = col
        res['lrow'] = row

        try:
            ts_ascat = ascat.read(
                data['ascat_gpi']).resample('1d').mean().dropna()
            ts_ascat = ts_ascat[~ts_ascat.index.duplicated(keep='first')]
            ts_ascat.name = 'ASCAT'
        except Exception:
            continue

        try:
            t_df_smap = ds_obs_smap.sel(species=[1, 2]).isel(
                lat=row, lon=col).to_pandas()
            t_ana = t_df_smap[~np.isnan(t_df_smap[1])
                              | ~np.isnan(t_df_smap[2])].index
            t_ana = pd.Series(1,
                              index=t_ana).resample('1d').mean().dropna().index
        except Exception:
            t_ana = pd.DatetimeIndex([])

        var = 'sm_surface'
        for mode in modes:

            if mode == 'anom_lst':
                ts_ref = calc_anom(ts_ascat.copy(),
                                   mode='climatological').dropna()
            elif mode == 'anom_st':
                ts_ref = calc_anom(ts_ascat.copy(), mode='shortterm').dropna()
            elif mode == 'anom_lt':
                ts_ref = calc_anom(ts_ascat.copy(), mode='longterm').dropna()
            else:
                ts_ref = ts_ascat.dropna()

            for run, ts_model in zip(names, dss):

                try:
                    if 'noPcorr' in run:
                        r_asc = np.sqrt(tc_res_nopc.loc[
                            gpi, f'r2_grid_{mode}_m_ASCAT_tc_ASCAT_SMAP_CLSM'])
                        r_mod = np.sqrt(tc_res_nopc.loc[
                            gpi, f'r2_grid_{mode}_m_CLSM_tc_ASCAT_SMAP_CLSM'])
                    else:
                        r_asc = np.sqrt(tc_res_pc.loc[
                            gpi, f'r2_grid_{mode}_m_ASCAT_tc_ASCAT_SMAP_CLSM'])
                        r_mod = np.sqrt(tc_res_pc.loc[
                            gpi, f'r2_grid_{mode}_m_CLSM_tc_ASCAT_SMAP_CLSM'])
                except Exception:
                    r_asc = np.nan
                    r_mod = np.nan

                ind_valid = ds_full.time.values[
                    (ds_full['snow_depth'][:, row, col].values == 0) &
                    (ds_full['soil_temp_layer1'][:, row, col].values > 277.15)]

                ts_mod = ts_model[var][:, row, col].to_series()
                ts_mod.index += pd.to_timedelta('2 hours')
                ts_mod = ts_mod.reindex(ind_valid)

                if mode == 'anom_lst':
                    ts_mod = calc_anom(ts_mod.copy(),
                                       mode='climatological').dropna()
                elif mode == 'anom_st':
                    ts_mod = calc_anom(ts_mod.copy(),
                                       mode='shortterm').dropna()
                elif mode == 'anom_lt':
                    ts_mod = calc_anom(ts_mod.copy(), mode='longterm').dropna()
                else:
                    ts_mod = ts_mod.dropna()
                ts_mod = ts_mod.resample('1d').mean()

                if 'OL_' in run:
                    res[f'r_tca_{run}_{mode}'] = r_mod

                tmp = pd.DataFrame({1: ts_ref, 2: ts_mod}).dropna()
                res[f'len_{run}_{mode}'] = len(tmp)
                r, p = pearsonr(tmp[1], tmp[2]) if len(tmp) > 10 else (np.nan, np.nan)
                res[f'r_{run}_{mode}'] = r
                res[f'p_{run}_{mode}'] = p
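                # Attenuation correction: divide by the TCA-derived correlation
                # of ASCAT with the unknown truth, capped at 1.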
                res[f'r_corr_{run}_{mode}'] = min(r / r_asc, 1)

                tmp = pd.DataFrame({
                    1: ts_ref,
                    2: ts_mod
                }).reindex(t_ana).dropna()
                res[f'ana_len_{run}_{mode}'] = len(tmp)
                r, p = pearsonr(tmp[1], tmp[2]) if len(tmp) > 10 else (np.nan, np.nan)
                res[f'ana_r_{run}_{mode}'] = r
                res[f'ana_p_{run}_{mode}'] = p
                res[f'ana_r_corr_{run}_{mode}'] = min(r / r_asc, 1)

        if not result_file.exists():
            res.to_csv(result_file, float_format='%0.3f')
        else:
            res.to_csv(result_file,
                       float_format='%0.3f',
                       mode='a',
                       header=False)
Example #4
def run_ascat_eval_smos_part(part, parts, ref='ascat'):
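    # Same ASCAT-based evaluation as above, but for the SMOS MadKF runs and
    # over several validation sub-periods.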

    periods = [
        ['2010-04-01', '2020-04-01'],
        ['2010-04-01', '2015-04-01'],
        ['2015-04-01', '2020-04-01'],
        ['2010-04-01', '2012-10-01'],
        ['2012-10-01', '2015-04-01'],
        ['2015-04-01', '2017-10-01'],
        ['2017-10-01', '2020-04-01'],
    ]

    res_path = Path(
        '~/Documents/work/MadKF/CLSM/SMOS40/validation/multiperiod/ascat'
    ).expanduser()
    if not res_path.exists():
        Path.mkdir(res_path, parents=True)

    result_file = res_path / f'ascat_eval_smos_part{part}.csv'

    lut = pd.read_csv(Paths().lut, index_col=0)

    # Split grid cell list for parallelization
    subs = (np.arange(parts + 1) * len(lut) / parts).astype('int')
    subs[-1] = len(lut)
    start = subs[part - 1]
    end = subs[part]

    # Look-up table that contains the grid cells to iterate over
    lut = lut.iloc[start:end, :]

    names = ['open_loop'] + [f'SMOS40_it62{i}' for i in range(1, 5)]
    runs = ['US_M36_SMOS40_TB_OL_noScl'] + \
           [f'US_M36_SMOS40_TB_MadKF_DA_it62{i}' for i in range(1, 5)]

    grid = LDAS_io('ObsFcstAna', runs[0]).grid
    dss_xhourly = [LDAS_io('xhourly', run).timeseries for run in runs]
    dss_obs_ana = [
        LDAS_io('ObsFcstAna', run).timeseries['obs_ana'] for run in runs
    ]

    modes = ['absolute', 'longterm', 'shortterm']

    ascat = HSAF_io()

    for cnt, (gpi, data) in enumerate(lut.iterrows()):
        print('%i / %i' % (cnt, len(lut)))

        col = int(data.ease2_col - grid.tilegrids.loc['domain', 'i_offg'])
        row = int(data.ease2_row - grid.tilegrids.loc['domain', 'j_offg'])

        res = pd.DataFrame(index=(gpi, ))
        res['col'] = int(data.ease2_col)
        res['row'] = int(data.ease2_row)
        res['lcol'] = col
        res['lrow'] = row

        try:
            ts_ascat = ascat.read(
                data['ascat_gpi'],
                resample_time=False).resample('1d').mean().dropna()
            ts_ascat = ts_ascat[~ts_ascat.index.duplicated(keep='first')]
            ts_ascat.name = 'ASCAT'
        except Exception:
            continue

        dfs = [
            ds.sel(species=[1, 2]).isel(
                lat=row, lon=col).to_pandas().resample('1d').mean()
            for ds in dss_obs_ana
        ]
        idx = [df[np.any(~np.isnan(df), axis=1)].index for df in dfs]

        # Common analysis times across all runs
        t_ana = idx[0]
        for ix in idx[1:]:
            t_ana = t_ana.intersection(ix)

        var = 'sm_surface'
        for mode in modes:

            if mode == 'absolute':
                ts_ref = ts_ascat.copy()
            else:
                ts_ref = calc_anom(ts_ascat.copy(),
                                   longterm=(mode == 'longterm')).dropna()

            for run, ts_model in zip(names, dss_xhourly):

                ind = (ts_model['snow_mass'][:, row, col].values == 0) & (
                    ts_model['soil_temp_layer1'][:, row, col].values > 277.15)
                ts_mod = ts_model[var][:, row, col].to_series().loc[ind]
                ts_mod.index += pd.to_timedelta('2 hours')

                if mode == 'absolute':
                    ts_mod = ts_mod.dropna()
                else:
                    ts_mod = calc_anom(ts_mod,
                                       longterm=(mode == 'longterm')).dropna()
                ts_mod = ts_mod.reindex(t_ana).dropna()

                for i, per in enumerate(periods):
                    tmp = pd.DataFrame({
                        1: ts_ref,
                        2: ts_mod
                    })[per[0]:per[1]].dropna()
                    res[f'p{i}_len_{run}_{mode}'] = len(tmp)
                    r, p = pearsonr(tmp[1], tmp[2]) if len(tmp) > 10 else (np.nan, np.nan)
                    res[f'p{i}_r_{run}_{mode}'] = r

        if not result_file.exists():
            res.to_csv(result_file, float_format='%0.3f')
        else:
            res.to_csv(result_file,
                       float_format='%0.3f',
                       mode='a',
                       header=False)
Example #5
def EC_ascat_smap_ismn_ldas():
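    # Extended collocation (EC) analysis of in situ, ASCAT, SMAP, open-loop,
    # and DA soil moisture at ISMN stations: pairwise correlations plus error
    # cross-correlations of the three model-related estimates.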

    result_file = Path('/Users/u0116961/Documents/work/extended_collocation/ec_ascat_smap_ismn_ldas.csv')

    names = ['insitu', 'ascat', 'smap', 'ol', 'da']
    combs = list(combinations(names, 2))

    ds_ol = LDAS_io('xhourly', 'US_M36_SMAP_TB_OL_noScl').timeseries
    ds_da = LDAS_io('xhourly', 'US_M36_SMAP_TB_MadKF_DA_it11').timeseries
    ds_da_ana = LDAS_io('ObsFcstAna', 'US_M36_SMAP_TB_MadKF_DA_it11').timeseries['obs_ana']
    tg = LDAS_io().grid.tilegrids

    modes = ['absolute','longterm','shortterm']

    ismn = ISMN_io()
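    # NOTE: presumably resumes a partially processed station list by skipping
    # the first 70 stations.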
    ismn.list = ismn.list.iloc[70::]
    ascat = HSAF_io()
    smap = SMAP_io()

    lut = pd.read_csv(Paths().lut, index_col=0)

    i = 0
    for meta, ts_insitu in ismn.iter_stations(surface_only=True):
        i += 1
        logging.info('%i/%i' % (i, len(ismn.list)))

        try:
            if len(ts_insitu := ts_insitu['2015-04-01':'2020-04-01'].resample('1d').mean().dropna()) < 25:
                continue
        except Exception:
            continue

        res = pd.DataFrame(meta.copy()).transpose()
        col = meta.ease_col
        row = meta.ease_row

        colg = col + tg.loc['domain', 'i_offg']  # col / lon
        rowg = row + tg.loc['domain', 'j_offg']  # row / lat

        tmp_lut = lut[(lut.ease2_col == colg) & (lut.ease2_row == rowg)]
        if len(tmp_lut) == 0:
            continue

        gpi_smap = tmp_lut.index.values[0]
        gpi_ascat = tmp_lut.ascat_gpi.values[0]

        try:
            ts_ascat = ascat.read(gpi_ascat, resample_time=False).resample('1d').mean().dropna()
            ts_ascat = ts_ascat[~ts_ascat.index.duplicated(keep='first')]
            ts_ascat.name = 'ASCAT'
        except Exception:
            continue

        ts_smap = smap.read(gpi_smap)

        if (ts_ascat is None) | (ts_smap is None):
            continue

        ind = (ds_ol['snow_mass'][:, row, col].values == 0) & \
              (ds_ol['soil_temp_layer1'][:, row, col].values > 277.15)
        ts_ol = ds_ol['sm_surface'][:, row, col].to_series().loc[ind].dropna()
        ts_ol.index += pd.to_timedelta('2 hours')

        ind = (ds_da['snow_mass'][:, row, col].values == 0) & \
              (ds_da['soil_temp_layer1'][:, row, col].values > 277.15)
        ts_da = ds_da['sm_surface'][:, row, col].to_series().loc[ind].dropna()
        ts_da.index += pd.to_timedelta('2 hours')

        for mode in modes:

            # Derive each mode's series from the original data, not from the
            # previous mode's output.
            if mode == 'absolute':
                ts_ins = ts_insitu.copy()
                ts_asc = ts_ascat.copy()
                ts_smp = ts_smap.copy()
                ts_o = ts_ol.copy()
                ts_d = ts_da.copy()
            else:
                ts_ins = calc_anom(ts_insitu.copy(), longterm=(mode == 'longterm')).dropna()
                ts_asc = calc_anom(ts_ascat.copy(), longterm=(mode == 'longterm')).dropna()
                ts_smp = calc_anom(ts_smap.copy(), longterm=(mode == 'longterm')).dropna()
                ts_o = calc_anom(ts_ol.copy(), longterm=(mode == 'longterm')).dropna()
                ts_d = calc_anom(ts_da.copy(), longterm=(mode == 'longterm')).dropna()

            tmp = pd.DataFrame(dict(zip(names, [ts_ins, ts_asc, ts_smp, ts_o, ts_d]))).dropna()

            corr = tmp.corr()
            ec_res = ecol(tmp[['insitu', 'ascat', 'smap', 'ol', 'da']], correlated=[['smap', 'ol'], ['smap', 'da'], ['ol', 'da']])

            res[f'len_{mode}'] = len(tmp)
            for c in combs:
                res[f'corr_{"_".join(c)}_{mode}'] = corr.loc[c]
            res[f'err_corr_smap_ol_{mode}'] = ec_res['err_corr_smap_ol']
            res[f'err_corr_smap_da_{mode}'] = ec_res['err_corr_smap_da']
            res[f'err_corr_ol_da_{mode}'] = ec_res['err_corr_ol_da']

        if not result_file.exists():
            res.to_csv(result_file, float_format='%0.4f')
        else:
            res.to_csv(result_file, float_format='%0.4f', mode='a', header=False)
Example #6
def TCA_insitu_evaluation():
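    # Triple collocation analysis (TCA) of model (no-DA / DA runs), ASCAT, and
    # in situ soil moisture at ISMN stations: RMSEs and scaling coefficients
    # per run, mode, and variable.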

    result_file = r'D:\work\LDAS\2018-06_rmse_uncertainty\TCA_evaluation\validation.csv'

    noDA = LDAS_io('xhourly', 'US_M36_SMOS40_noDA_cal_scaled')

    DA_const_err = LDAS_io('xhourly', 'US_M36_SMOS40_DA_cal_scaled')
    DA_varia_err = LDAS_io('xhourly', 'US_M36_SMOS40_DA_cal_scl_errfile')

    t_ana = pd.DatetimeIndex(
        LDAS_io('ObsFcstAna',
                'US_M36_SMOS40_DA_cal_scaled').timeseries.time.values
    ).sort_values()

    ascat = HSAF_io()
    gpi_list = pd.read_csv(
        r"D:\data_sets\ASCAT\warp5_grid\pointlist_warp_conus.csv", index_col=0)

    ismn = ISMN_io(col_offs=noDA.grid.tilegrids.loc['domain', 'i_offg'],
                   row_offs=noDA.grid.tilegrids.loc['domain', 'j_offg'])

    runs = ['noDA', 'DA_const_err', 'DA_varia_err']
    tss = [noDA.timeseries, DA_const_err.timeseries, DA_varia_err.timeseries]

    variables = ['sm_surface']
    modes = ['absolute']

    for i, (meta, ts_insitu) in enumerate(ismn.iter_stations()):
        logging.info('%i/%i' % (i, len(ismn.list)))

        try:

            res = pd.DataFrame(meta.copy()).transpose()
            col = meta.ease_col
            row = meta.ease_row

            gpi = lonlat2gpi(meta.lon, meta.lat, gpi_list)

            ts_asc = ascat.read(gpi, resample_time=False)
            if ts_asc is None:
                continue
            ts_asc.name = 'ascat'
            ts_asc = pd.DataFrame(ts_asc)

            for var in variables:
                for mode in modes:

                    ts_ins = ts_insitu[var].dropna()
                    ts_ins.name = 'insitu'
                    ts_ins = pd.DataFrame(ts_ins)

                    for run, ts_model in zip(runs, tss):

                        # Time is the leading dimension of the xhourly cube.
                        ind = (ts_model['snow_mass'][:, row, col].values == 0) & (
                            ts_model['soil_temp_layer1'][:, row, col].values > 277.15)
                        ts_mod = ts_model[var][:, row, col].to_series().loc[ind]
                        ts_mod.index += pd.to_timedelta('2 hours')
                        ts_mod = ts_mod.loc[t_ana].dropna()
                        ts_mod.name = 'model'
                        ts_mod = pd.DataFrame(ts_mod)

                        matched = df_match(ts_mod, ts_asc, ts_ins, window=0.5)
                        data = ts_mod.join(matched[0][['ascat']]) \
                                     .join(matched[1][['insitu']]).dropna()

                        tc_res = TCA(data['model'].values,
                                     data['ascat'].values,
                                     data['insitu'].values)

                        res['RMSE_model_' + run + '_' + mode + '_' +
                            var] = tc_res[1][0]
                        res['RMSE_ascat_' + run + '_' + mode + '_' +
                            var] = tc_res[1][1]
                        res['RMSE_insitu_' + run + '_' + mode + '_' +
                            var] = tc_res[1][2]

                        res['beta_ascat_' + run + '_' + mode + '_' +
                            var] = tc_res[2][1]
                        res['beta_insitu_' + run + '_' + mode + '_' +
                            var] = tc_res[2][2]

                        res['len_' + mode + '_' + var] = len(data)

            if not os.path.isfile(result_file):
                res.to_csv(result_file, float_format='%0.4f')
            else:
                res.to_csv(result_file,
                           float_format='%0.4f',
                           mode='a',
                           header=False)

        except Exception:
            continue
Example #7
def reformat_ascat():
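    # Resample ASCAT soil moisture onto the NoahMP Belgium grid: build a
    # nearest-neighbour look-up table (WARP gpi per grid cell), write a
    # time-series-chunked NetCDF file, then re-chunk it into an image-chunked
    # file via ncks.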

    outfile_ts = '/data_sets/LIS/ASCAT/timeseries.nc'
    outfile_img = '/data_sets/LIS/ASCAT/images.nc'

    with rasterio.open('/data_sets/LIS/NoahMP_belgium/mask.tif') as ds:
        mask = np.flipud(ds.read()[0, :, :])

    with Dataset('/data_sets/LIS/NoahMP_belgium/images.nc') as ds:
        lats = ds.variables['lat'][:, :]
        lons = ds.variables['lon'][:, :]
        timeunit = ds['time'].units
        dates = ds['time'][:]
        pydates = pd.to_datetime(num2date(dates, units=timeunit))

    io = HSAF_io()
    gpis = pd.read_csv(
        '/data_sets/LIS/NoahMP_belgium/pointlist_Belgium_warp.csv',
        index_col=0)

    lats.mask[mask == 0] = True
    lons.mask[mask == 0] = True
    inds = np.where(~lats.mask)

    tmp_list = pd.DataFrame({
        'row': inds[0],
        'col': inds[1],
        'gpi': np.full(len(inds[0]), 0, dtype='int64'),
        'cell': np.full(len(inds[0]), 0, dtype='int64')
    })
    for idx, data in tmp_list.iterrows():
        print('%i / %i' % (idx + 1, len(tmp_list)))
        r, c = data['row'], data['col']
        gpi = ((gpis.lat - lats[r, c])**2 +
               (gpis.lon - lons[r, c])**2).idxmin()
        tmp_list.loc[idx, 'gpi'] = gpi
        tmp_list.loc[idx, 'cell'] = gpis.loc[gpi, 'cell']
    tmp_list.to_csv('/data_sets/LIS/NoahMP_belgium/tmp_list.csv')
    # tmp_list = pd.read_csv('/data_sets/LIS/NoahMP_belgium/tmp_list.csv', index_col=0)

    with Dataset(outfile_ts, mode='w') as res:

        res.createDimension('lat', lats.shape[0])
        res.createDimension('lon', lons.shape[1])
        res.createDimension('time', len(dates))

        # 'ds' is already closed here, so take the dtypes from the in-memory arrays.
        res.createVariable('lat',
                           lats.dtype,
                           dimensions=('lat', 'lon'),
                           chunksizes=(1, 1),
                           zlib=True)
        res.createVariable('lon',
                           lons.dtype,
                           dimensions=('lat', 'lon'),
                           chunksizes=(1, 1),
                           zlib=True)
        res.createVariable('time',
                           dates.dtype,
                           dimensions=('time', ),
                           chunksizes=(len(dates), ),
                           zlib=True)
        res.variables['lat'][:, :] = lats
        res.variables['lon'][:, :] = lons
        res.variables['time'][:] = dates

        # Coordinate attributes following CF-conventions
        res.variables['time'].setncatts({
            'long_name': 'time',
            'units': timeunit
        })
        res.variables['lon'].setncatts({
            'long_name': 'longitude',
            'units': 'degrees_east'
        })
        res.variables['lat'].setncatts({
            'long_name': 'latitude',
            'units': 'degrees_north'
        })

        res.createVariable('SoilMoisture',
                           'float32',
                           dimensions=('time', 'lat', 'lon'),
                           chunksizes=(len(dates), 1, 1),
                           zlib=True)
        res.variables['SoilMoisture'].setncatts({'missing_value': -9999})

        i = 0
        for cell in tmp_list['cell'].unique():
            for gpi in tmp_list.loc[tmp_list['cell'] == cell, 'gpi'].unique():
                print('%i / %i' % (i, len(tmp_list)))

                cell_gpi_list = tmp_list.loc[tmp_list['gpi'] == gpi]

                try:
                    # Reindex (rather than label-select) so that dates missing
                    # after resampling become NaN and are filled with -9999 below.
                    ts = io.read(gpi, resample_time=False).resample(
                        '6h').mean().reindex(pydates).values
                    np.place(ts, np.isnan(ts), -9999)
                    for idx, data in cell_gpi_list.iterrows():
                        i += 1
                        res.variables['SoilMoisture'][:, data['row'],
                                                      data['col']] = ts

                except Exception:
                    print('gpi %i failed' % gpi)
                    continue

    cmdBase = 'ncks -4 -L 4 --cnk_dmn time,1 --cnk_dmn lat,%i --cnk_dmn lon,%i ' % lats.shape
    cmd = ' '.join([cmdBase, outfile_ts, outfile_img])
    os.system(cmd)
Example #8
def run_ascat_eval_part(part, parts):
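    # ASCAT-based evaluation of LDAS runs (open loop, SMOS/SMAP-scaled DA,
    # MadKF DA) for one part of the grid cell list: correlation and ubRMSD,
    # with and without restriction to analysis times.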

    res_path = Path(
        '/Users/u0116961/Documents/work/LDAS/2020-03_scaling/validation')
    result_file = res_path / ('ascat_eval_part%i.csv' % part)

    lut = pd.read_csv(Paths().lut, index_col=0)

    # Split grid cell list for parallelization
    subs = (np.arange(parts + 1) * len(lut) / parts).astype('int')
    subs[-1] = len(lut)
    start = subs[part - 1]
    end = subs[part]

    # Look-up table that contains the grid cells to iterate over
    lut = lut.iloc[start:end, :]

    names = ['open_loop', 'SMOSSMAP_short', 'MadKF_SMOS40']
    runs = [
        'US_M36_SMAP_TB_OL_noScl', 'US_M36_SMAP_TB_DA_scl_SMOSSMAP_short',
        'US_M36_SMOS40_TB_MadKF_DA_it613'
    ]

    dss = [LDAS_io('xhourly', run).timeseries for run in runs]
    grid = LDAS_io().grid

    # t_ana = pd.DatetimeIndex(LDAS_io('ObsFcstAna', runs[0]).timeseries.time.values).sort_values()
    ds_obs_smap = (LDAS_io('ObsFcstAna',
                           'US_M36_SMAP_TB_OL_noScl').timeseries['obs_ana'])
    ds_obs_smos = (LDAS_io(
        'ObsFcstAna', 'US_M36_SMOS40_TB_MadKF_DA_it613').timeseries['obs_ana'])

    modes = ['absolute', 'longterm', 'shortterm']

    ascat = HSAF_io()

    for cnt, (gpi, data) in enumerate(lut.iterrows()):
        print('%i / %i' % (cnt, len(lut)))

        col = int(data.ease2_col - grid.tilegrids.loc['domain', 'i_offg'])
        row = int(data.ease2_row - grid.tilegrids.loc['domain', 'j_offg'])

        res = pd.DataFrame(index=(gpi, ))
        res['col'] = int(data.ease2_col)
        res['row'] = int(data.ease2_row)
        res['lcol'] = col
        res['lrow'] = row

        try:
            ts_ascat = ascat.read(
                data['ascat_gpi'],
                resample_time=False).resample('1d').mean().dropna()
            ts_ascat = ts_ascat[~ts_ascat.index.duplicated(keep='first')]
            ts_ascat.name = 'ASCAT'
        except Exception:
            continue

        t_df_smap = ds_obs_smap.sel(species=[1, 2]).isel(lat=row,
                                                         lon=col).to_pandas()
        t_df_smos = ds_obs_smos.sel(species=[1, 2]).isel(lat=row,
                                                         lon=col).to_pandas()
        t_ana_smap = t_df_smap[~np.isnan(t_df_smap[1])
                               | ~np.isnan(t_df_smap[2])].resample(
                                   '1d').mean().index
        t_ana_smos = t_df_smos[~np.isnan(t_df_smos[1])
                               | ~np.isnan(t_df_smos[2])].resample(
                                   '1d').mean().index

        var = 'sm_surface'
        for mode in modes:

            if mode == 'absolute':
                ts_ref = ts_ascat.copy()
            else:
                ts_ref = calc_anom(ts_ascat.copy(),
                                   longterm=(mode == 'longterm')).dropna()

            for run, ts_model in zip(names, dss):

                t_ana = t_ana_smos if run == 'MadKF_SMOS40' else t_ana_smap

                ind = (ts_model['snow_mass'][:, row, col].values == 0) & (
                    ts_model['soil_temp_layer1'][:, row, col].values > 277.15)
                ts_mod = ts_model[var][:, row, col].to_series().loc[ind]
                ts_mod.index += pd.to_timedelta('2 hours')

                if mode == 'absolute':
                    ts_mod = ts_mod.dropna()
                else:
                    ts_mod = calc_anom(ts_mod,
                                       longterm=(mode == 'longterm')).dropna()
                ts_mod = ts_mod.resample('1d').mean()

                tmp = pd.DataFrame({1: ts_ref, 2: ts_mod}).dropna()
                res['len_' + run + '_' + mode] = len(tmp)
                r, p = pearsonr(tmp[1], tmp[2]) if len(tmp) > 10 else (np.nan, np.nan)
                res['r_' + run + '_' + mode] = r
                # res['p_' + run + '_' + mode] = p
                # res['rmsd_' + run + '_' + mode] = np.sqrt(((tmp[1] - tmp[2]) ** 2).mean())
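                # Unbiased RMSD: RMSD after removing each series' mean.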
                res['ubrmsd_' + run + '_' + mode] = np.sqrt(
                    (((tmp[1] - tmp[1].mean()) -
                      (tmp[2] - tmp[2].mean()))**2).mean())

                tmp = pd.DataFrame({
                    1: ts_ref,
                    2: ts_mod
                }).reindex(t_ana).dropna()
                res['ana_len_' + run + '_' + mode] = len(tmp)
                r, p = pearsonr(tmp[1], tmp[2]) if len(tmp) > 10 else (np.nan, np.nan)
                res['ana_r_' + run + '_' + mode] = r
                # res['ana_p_' + run + '_' + mode] = p
                # res['ana_rmsd_' + run + '_' + mode] = np.sqrt(((tmp[1] - tmp[2]) ** 2).mean())
                res['ana_ubrmsd_' + run + '_' + mode] = np.sqrt(
                    (((tmp[1] - tmp[1].mean()) -
                      (tmp[2] - tmp[2].mean()))**2).mean())

        if not result_file.exists():
            res.to_csv(result_file, float_format='%0.3f')
        else:
            res.to_csv(result_file,
                       float_format='%0.3f',
                       mode='a',
                       header=False)
Example #9
def run(part):
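    # ISMN-based evaluation of API-model assimilation experiments (KF with
    # TCA-based uncertainties, EnKF with static uncertainties, MadKF) using
    # MSWEP precipitation as forcing and ASCAT as the assimilated observation;
    # in situ and SMOS serve as independent references for bootstrapped TCA.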

    parts = 15

    smos = SMOS_io()
    ismn = ISMN_io()
    ascat = HSAF_io(ext=None)
    mswep = MSWEP_io()

    # Median Q/R from MadKF API/CONUS run.
    Q_avg = 12.
    R_avg = 74.

    # Select only SCAN and USCRN
    ismn.list = ismn.list[(ismn.list.network == 'SCAN') |
                          (ismn.list.network == 'USCRN')]
    ismn.list.index = np.arange(len(ismn.list))

    # Split station list into parts for parallelization
    subs = (np.arange(parts + 1) * len(ismn.list) / parts).astype('int')
    subs[-1] = len(ismn.list)
    start = subs[part - 1]
    end = subs[part]
    ismn.list = ismn.list.iloc[start:end, :]

    if platform.system() == 'Windows':
        result_file = os.path.join('D:\\', 'work', 'MadKF', 'CONUS',
                                   'ismn_eval', 'result_part%i.csv' % part)
    elif platform.system() == 'Linux':
        result_file = os.path.join('/', 'scratch', 'leuven', '320', 'vsc32046',
                                   'output', 'MadKF', 'CONUS', 'ismn_eval',
                                   'result_part%i.csv' % part)
    else:
        result_file = os.path.join('/', 'work', 'MadKF', 'CONUS', 'ismn_eval',
                                   'parts2', 'result_part%i.csv' % part)

    dt = ['2010-01-01', '2015-12-31']

    for cnt, (station,
              insitu) in enumerate(ismn.iter_stations(surf_depth=0.1)):

        # station = ismn.list.loc[978,:]
        # insitu = ismn.read_first_surface_layer('SCAN','Los_Lunas_Pmc')

        print('%i / %i' % (cnt, len(ismn.list)))

        # if True:
        try:
            gpi = lonlat2gpi(station.lon, station.lat, mswep.grid)
            mswep_idx = mswep.grid.index[mswep.grid.dgg_gpi == gpi][0]
            smos_gpi = mswep.grid.loc[mswep_idx, 'smos_gpi']

            precip = mswep.read(mswep_idx)
            sm_ascat = ascat.read(gpi)
            sm_smos = smos.read(smos_gpi) * 100.

            if (precip is None) | (sm_ascat is None) | (sm_smos is None) | (
                    insitu is None):
                continue

            precip = calc_anomaly(precip[dt[0]:dt[1]],
                                  method='moving_average',
                                  longterm=False)
            sm_ascat = calc_anomaly(sm_ascat[dt[0]:dt[1]],
                                    method='moving_average',
                                    longterm=False)
            sm_smos = calc_anomaly(sm_smos[dt[0]:dt[1]],
                                   method='moving_average',
                                   longterm=False)
            insitu = calc_anomaly(insitu[dt[0]:dt[1]].resample('1d').first(),
                                  method='moving_average',
                                  longterm=False).tz_localize(None)

            df = pd.DataFrame({
                1: precip,
                2: sm_ascat,
                3: sm_smos,
                4: insitu
            },
                              index=pd.date_range(dt[0], dt[1]))
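            # Treat missing precipitation as zero forcing for the API model.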
            df.loc[np.isnan(df[1]), 1] = 0.
            n = len(df)

            if len(df.dropna()) < 50:
                continue
            gamma = mswep.grid.loc[mswep_idx, 'gamma']
            api = API(gamma=gamma)

            # --- OL run ---
            x_OL = np.full(n, np.nan)
            model = deepcopy(api)
            for t, f in enumerate(precip.values):
                x = model.step(f)
                x_OL[t] = x

            # ----- Calculate uncertainties -----
            # convert (static) forcing to model uncertainty
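            # (stationary variance of an AR(1) process with coefficient gamma
            # and noise variance Q: P = Q / (1 - gamma**2))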
            P_avg = Q_avg / (1 - gamma**2)

            # calculate TCA based uncertainty and scaling coefficients
            tmp_df = pd.DataFrame({
                1: x_OL,
                2: sm_ascat,
                3: sm_smos
            },
                                  index=pd.date_range(dt[0], dt[1])).dropna()
            snr, r_tc, err, beta = tc(tmp_df)
            P_TC = err[0]**2
            Q_TC = P_TC * (1 - gamma**2)
            R_TC = (err[1] / beta[1])**2
            H_TC = beta[1]

            # Calculate RMSD based uncertainty
            R_rmsd = (np.nanmean(
                (tmp_df[1].values - H_TC * tmp_df[2].values)**2) - P_avg)
            if R_rmsd < 0:
                R_rmsd *= -1
            # -----------------------------------

            # ----- Run KF using TCA-based uncertainties -----
            api_kf = API(gamma=gamma, Q=Q_TC)
            x_kf, P, R_innov_kf, checkvar_kf, K_kf = \
                KF(api_kf, df[1].values.copy(), df[2].values.copy(), R_TC, H=H_TC)

            # ----- Run EnKF using static uncertainties -----
            forc_pert = ['normal', 'additive', Q_avg]
            obs_pert = ['normal', 'additive', R_avg]
            x_avg, P, R_innov_avg, checkvar_avg, K_avg = \
                EnKF(api, df[1].values.copy(), df[2].values.copy(), forc_pert, obs_pert, H=H_TC, n_ens=50)

            # ----- Run EnKF using RMSD-based uncertainties (corrected for model uncertainty) -----
            # forc_pert = ['normal', 'additive', Q_avg]
            # obs_pert = ['normal', 'additive', R_rmsd]
            # x_rmsd, P, R_innov_rmsd, checkvar_rmsd, K_rmsd = \
            #     EnKF(api, df[1].values.copy(), df[2].values.copy(), forc_pert, obs_pert, H=H_TC, n_ens=50)

            # ----- Run MadKF -----
            n_tries = 0
            checkvar_madkf = 9999.
            while (checkvar_madkf < 0.95 or checkvar_madkf > 1.05) and n_tries < 5:
                n_tries += 1
                tmp_x_madkf, P_madkf, tmp_R_madkf, tmp_Q_madkf, H_madkf, tmp_R_innov_madkf, tmp_checkvar_madkf, K_madkf = \
                    MadKF(api, df[1].values.copy(), df[2].values.copy(), n_ens=100, n_iter=20)
                # Keep the iteration whose innovation variance ratio is closest to 1.
                if abs(1 - tmp_checkvar_madkf) < abs(1 - checkvar_madkf):
                    checkvar_madkf = tmp_checkvar_madkf
                    x_madkf = tmp_x_madkf
                    Q_madkf, R_madkf = tmp_Q_madkf, tmp_R_madkf
                    R_innov_madkf = tmp_R_innov_madkf

            df['x_ol'] = x_OL
            df['x_kf'] = x_kf
            df['x_avg'] = x_avg
            # df['x_rmsd'] = x_rmsd
            df['x_madkf'] = x_madkf

            # tc_ol = tc(df[[4,3,'x_ol']])
            # tc_kf = tc(df[[4,3,'x_kf']])
            # tc_avg = tc(df[[4,3,'x_avg']])
            # tc_rmsd = tc(df[[4,3,'x_rmsd']])
            # tc_madkf = tc(df[[4,3,'x_madkf']])

            ci_l_ol, ci_m_ol, ci_u_ol = bootstrap_tc(df[[4, 3, 'x_ol']])
            ci_l_kf, ci_m_kf, ci_u_kf = bootstrap_tc(df[[4, 3, 'x_kf']])
            ci_l_avg, ci_m_avg, ci_u_avg = bootstrap_tc(df[[4, 3, 'x_avg']])
            # ci_l_rmsd, ci_m_rmsd, ci_u_rmsd = bootstrap_tc(df[[4,3,'x_rmsd']])
            ci_l_madkf, ci_m_madkf, ci_u_madkf = bootstrap_tc(
                df[[4, 3, 'x_madkf']])

            corr = df.dropna().corr()
            n_all = len(df.dropna())

            result = pd.DataFrame(
                {
                    'lon': station.lon,
                    'lat': station.lat,
                    'network': station.network,
                    'station': station.station,
                    'gpi': gpi,
                    'n_all': n_all,
                    'Q_est_madkf': Q_madkf,
                    'R_est_madkf': R_madkf,
                    'corr_ol': corr[4]['x_ol'],
                    'corr_kf': corr[4]['x_kf'],
                    'corr_avg': corr[4]['x_avg'],
                    # 'corr_rmsd': corr[4]['x_rmsd'],
                    'corr_madkf': corr[4]['x_madkf'],
                    # 'snr_ol': tc_ol[0][2],
                    # 'snr_kf': tc_kf[0][2],
                    # 'snr_avg': tc_avg[0][2],
                    # 'snr_rmsd': tc_rmsd[0][2],
                    # 'snr_madkf': tc_madkf[0][2],
                    # 'r_ol': tc_ol[1][2],
                    # 'r_kf': tc_kf[1][2],
                    # 'r_avg': tc_avg[1][2],
                    # 'r_rmsd': tc_rmsd[1][2],
                    # 'r_madkf': tc_madkf[1][2],
                    # 'rmse_kf': tc_kf[2][2],
                    # 'rmse_avg': tc_avg[2][2],
                    # 'rmse_rmsd': tc_rmsd[2][2],
                    # 'rmse_madkf': tc_madkf[2][2],
                    # 'rmse_ol': tc_ol[2][2],
                    'r_ol_l': ci_l_ol,
                    'r_ol_m': ci_m_ol,
                    'r_ol_u': ci_u_ol,
                    'r_kf_l': ci_l_kf,
                    'r_kf_m': ci_m_kf,
                    'r_kf_u': ci_u_kf,
                    'r_avg_l': ci_l_avg,
                    'r_avg_m': ci_m_avg,
                    'r_avg_u': ci_u_avg,
                    # 'r_rmsd_l': ci_l_rmsd,
                    # 'r_rmsd_m': ci_m_rmsd,
                    # 'r_rmsd_u': ci_u_rmsd,
                    'r_madkf_l': ci_l_madkf,
                    'r_madkf_m': ci_m_madkf,
                    'r_madkf_u': ci_u_madkf,
                    'checkvar_kf': checkvar_kf,
                    'checkvar_avg': checkvar_avg,
                    # 'checkvar_rmsd': checkvar_rmsd,
                    'checkvar_madkf': checkvar_madkf,
                    'R_innov_kf': R_innov_kf,
                    'R_innov_avg': R_innov_avg,
                    # 'R_innov_rmsd': R_innov_rmsd,
                    'R_innov_madkf': R_innov_madkf
                },
                index=(station.name, ))

            if not os.path.isfile(result_file):
                result.to_csv(result_file, float_format='%0.4f')
            else:
                result.to_csv(result_file,
                              float_format='%0.4f',
                              mode='a',
                              header=False)
        except Exception:
            print('GPI failed.')
            continue

    ascat.close()
    mswep.close()
Example #10
def plot_suspicious_stations(root):
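    # Plot OL / DA / in situ time series for "suspicious" stations, i.e. those
    # where the TCA-based and Pearson-R2-based skill differences of DA vs. OL
    # disagree in sign.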

    statlist = pd.read_csv('/Users/u0116961/Documents/work/MadKF/CLSM/suspicious_stations/station_list_r_diff.csv', index_col=0)

    rmsd_root = 'US_M36_SMAP_TB_DA_SM_PROXY_'
    rmsd_exps = list(np.sort([x.name.split(rmsd_root)[1] for x in Path('/Users/u0116961/data_sets/LDASsa_runs').glob('*SM_PROXY*')]))

    ds_ol = LDAS_io('xhourly', 'US_M36_SMAP_TB_OL_scaled_4K_obserr').timeseries
    ds_da = LDAS_io('xhourly', 'US_M36_SMAP_TB_DA_scaled_4K_obserr').timeseries

    ts_ana = LDAS_io('ObsFcstAna', 'US_M36_SMAP_TB_DA_scaled_4K_obserr').timeseries['obs_obs']
    t_ana = pd.DatetimeIndex(ts_ana.time.values).sort_values()

    ascat = HSAF_io()
    gpi_list = pd.read_csv(ascat.root / 'warp5_grid' / 'pointlist_warp_conus.csv', index_col=0)

    ismn = ISMN_io()

    variables = ['sm_surface', 'sm_rootzone']
    modes = ['absolute', 'longterm', 'shortterm']

    ismn.list.index = ismn.list.network + '_' + ismn.list.station
    ismn.list = ismn.list.reindex(statlist.index)

    for i, (meta, ts_insitu) in enumerate(ismn.iter_stations(surface_only=False)):
        if 'tmp_res' in locals():
            if (meta.network in tmp_res) & (meta.station in tmp_res):
                print(f'Skipping {i}')
                continue

        try:
            res = pd.DataFrame(meta.copy()).transpose()
            col = meta.ease_col
            row = meta.ease_row

            gpi = lonlat2gpi(meta.lon, meta.lat, gpi_list)

            ts_ascat = ascat.read(gpi)
            if ts_ascat is None:
                continue
            # Rescale ASCAT degree of saturation [%] to volumetric soil
            # moisture, assuming a porosity of 0.6.
            ts_ascat = ts_ascat / 100 * 0.6

            for mode in modes:
                for var in variables:

                    tmp = statlist[(statlist.network == meta.network) &
                                   (statlist.station == meta.station)]
                    dpr = tmp[f'diff_pearsonr2_{mode}_{var}'].values[0]
                    dtr = tmp[f'diff_tcar2_{mode}_{var}'].values[0]

                    if not ((dtr < 0) & (dpr > 0)):
                        continue

                    if mode == 'absolute':
                        ts_asc = ts_ascat.dropna()
                    else:
                        ts_asc = calc_anom(ts_ascat, longterm=(mode == 'longterm')).dropna()
                    ts_asc.name = 'ascat'
                    ts_asc = pd.DataFrame(ts_asc)

                    if mode == 'absolute':
                        ts_ins = ts_insitu[var].dropna()
                    else:
                        ts_ins = calc_anom(ts_insitu[var], longterm=(mode == 'longterm')).dropna()
                    ts_ins.name = 'insitu'
                    ts_ins = pd.DataFrame(ts_ins)

                    ind = (ds_ol['snow_mass'].isel(lat=row, lon=col).values == 0) & \
                          (ds_ol['soil_temp_layer1'].isel(lat=row, lon=col).values > 277.15)
                    ts_ol = ds_ol[var].isel(lat=row, lon=col).to_series().loc[ind]
                    ts_ol.index += pd.to_timedelta('2 hours')
                    ind_obs = np.bitwise_or.reduce(~np.isnan(ts_ana[:, :, row, col].values), 1)
                    if mode == 'absolute':
                        ts_ol = ts_ol.reindex(t_ana[ind_obs]).dropna()
                    else:
                        ts_ol = calc_anom(ts_ol.reindex(t_ana[ind_obs]), longterm=(mode == 'longterm')).dropna()
                    ts_ol.name = 'open_loop'
                    ts_ol = pd.DataFrame(ts_ol)

                    ind = (ds_da['snow_mass'].isel(lat=row, lon=col).values == 0) & \
                          (ds_da['soil_temp_layer1'].isel(lat=row, lon=col).values > 277.15)
                    ts_da = ds_da[var].isel(lat=row, lon=col).to_series().loc[ind]
                    ts_da.index += pd.to_timedelta('2 hours')
                    ind_obs = np.bitwise_or.reduce(~np.isnan(ts_ana[:, :, row, col].values), 1)
                    if mode == 'absolute':
                        ts_da = ts_da.reindex(t_ana[ind_obs]).dropna()
                    else:
                        ts_da = calc_anom(ts_da.reindex(t_ana[ind_obs]), longterm=(mode == 'longterm')).dropna()
                    ts_da.name = 'DA_4K'
                    ts_da = pd.DataFrame(ts_da)

                    matched = df_match(ts_ol, ts_da, ts_asc, ts_ins, window=0.5)
                    data = ts_ol.join(matched[0]['DA_4K']).join(matched[1]['ascat']).join(matched[2]['insitu']).dropna()

                    dpr_triplets = data.corr()['DA_4K']['insitu'] - data.corr()['open_loop']['insitu']
                    if dpr_triplets < 0:
                        continue

                    f = plt.figure(figsize=(15, 5))
                    sns.lineplot(data=data[['open_loop', 'DA_4K', 'insitu']],
                                 dashes=False, linewidth=1.5, ax=plt.gca())
                    plt.title(f'{meta.network} / {meta.station} ({var}): d(Pearson R2) = {dpr_triplets:.3f} , d(TCA R2) = {dtr:.3f}')

                    fbase = Path('/Users/u0116961/Documents/work/MadKF/CLSM/suspicious_stations/timeseries')
                    fname = fbase / f'{mode}_{var}_{meta.network}_{meta.station}.png'
                    f.savefig(fname, dpi=300, bbox_inches='tight')
                    plt.close()

        except Exception:
            continue
Example #11
def noahmp_version_comparison(part, parts):
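    # Compare LIS Noah-MP 3.6 and 4.0.1 output (mean/stddev differences and R2
    # per variable and layer) and evaluate both against ASCAT and SMAP via
    # extended collocation, for one part of the land grid cells.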

    result_file = Path(
        f'/Users/u0116961/Documents/work/LIS/noahmp_version_comparison/result_part{part}.csv'
    )
    if not result_file.parent.exists():
        Path.mkdir(result_file.parent, parents=True)

    ascat = HSAF_io()
    smap = SMAP_io()

    noah3 = Dataset('/Users/u0116961/data_sets/LIS/noahmp36/timeseries.nc')
    noah4 = Dataset('/Users/u0116961/data_sets/LIS/noahmp401/timeseries.nc')

    lats = noah3['lat'][:, :]
    lons = noah3['lon'][:, :]

    ind_lat, ind_lon = np.where(~lats.mask)

    # Split grid cell list for parallelization
    subs = (np.arange(parts + 1) * len(ind_lat) / parts).astype('int')
    subs[-1] = len(ind_lat)
    start = subs[part - 1]
    end = subs[part]

    # Look-up table that contains the grid cells to iterate over
    ind_lat = ind_lat[start:end]
    ind_lon = ind_lon[start:end]

    for i, (i_r, i_c) in enumerate(zip(ind_lat, ind_lon), start=1):
        logging.info(f'{i} / {len(ind_lat)}')

        lat = lats[i_r, i_c]
        lon = lons[i_r, i_c]

        res = pd.DataFrame({'lat': lat, 'lon': lon}, index=(i, ))

        for v in [
                'SM1', 'SM2', 'SM3', 'SM4', 'ST1', 'ST2', 'ST3', 'ST4', 'LAI',
                'SWE'
        ]:
            if ('SM' in v) | ('ST' in v):
                res[f'mdiff_{v}'], res[f'sdiff_{v}'], res[f'r2_{v}'] = \
                    stats(noah4[v[0:2]][:, int(v[-1])-1, i_r, i_c], noah3[v[0:2]][:, int(v[-1])-1, i_r, i_c])
            else:
                res[f'mdiff_{v}'], res[f'sdiff_{v}'], res[f'r2_{v}'] = \
                    stats(noah4[v][:, i_r, i_c], noah3[v][:, i_r, i_c])

        time = pd.DatetimeIndex(
            num2date(noah3['time'][:],
                     units=noah3['time'].units,
                     only_use_python_datetimes=True,
                     only_use_cftime_datetimes=False))
        df = pd.DataFrame(
            {
                'noahmp36': noah3['SM'][:, 0, i_r, i_c],
                'noahmp401': noah4['SM'][:, 0, i_r, i_c]
            },
            index=time)

        ts_ascat = ascat.read(lat, lon)
        if ts_ascat is None:
            ts_ascat = pd.Series(name='ascat', dtype='float64')

        ts_smap = smap.read(lat, lon)
        if ts_smap is None:
            ts_smap = pd.Series(name='smap', dtype='float64')

        df = pd.concat((df, ts_ascat, ts_smap), axis='columns').dropna()

        for mode in ['abs', 'anom']:
            if mode == 'anom':
                for c in df.columns.values:
                    df[c] = calc_anom(df[c], longterm=False)

            res[f'len_{mode}'] = len(df)

            ec_res = ecol(df, correlated=[['noahmp36', 'noahmp401']])
            for c in df.columns.values:
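                # Convert the SNR estimate [dB] to linear units; the TC-based
                # R2 with respect to the unknown truth is snr / (1 + snr).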
                snr = 10**(ec_res[f'snr_{c}'] / 10)
                res[f'tcr2_{mode}_{c}'] = snr / (1 + snr)

        if not result_file.exists():
            res.to_csv(result_file, float_format='%0.4f')
        else:
            res.to_csv(result_file,
                       float_format='%0.4f',
                       mode='a',
                       header=False)
Example #12
def run(cell=None, gpi=None):
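    # Per-cell comparison of KF / 2D-KF / EnKF / MadKF assimilation of ASCAT
    # (and SMOS for the 2D KF) into the API model, using TCA-, RMSD-, and
    # static uncertainty estimates; writes diagnostics per grid point.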

    if (cell is None) and (gpi is None):
        print('No cell/gpi specified.')
        return

    smos = SMOS_io()
    ascat = HSAF_io(ext=None)
    mswep = MSWEP_io()

    if gpi is not None:
        cell = mswep.gpi2cell(gpi)

    # Median Q/R from TC run.
    Q_avg = 12.
    R_avg = 74.

    if platform.system() == 'Windows':
        result_file = os.path.join('D:\\', 'work', 'MadKF', 'CONUS',
                                   'result_%04i.csv' % cell)
    else:
        result_file = os.path.join('/', 'scratch', 'leuven', '320', 'vsc32046',
                                   'output', 'MadKF', 'CONUS',
                                   'result_%04i.csv' % cell)

    dt = ['2010-01-01', '2015-12-31']

    for data, info in mswep.iter_cell(cell, gpis=gpi):

        # print info.name
        # if True:
        try:
            precip = mswep.read(info.name)
            sm_ascat = ascat.read(info.dgg_gpi)
            sm_smos = smos.read(info.smos_gpi) * 100.

            if (precip is None) | (sm_ascat is None) | (sm_smos is None):
                continue

            precip = calc_anomaly(precip[dt[0]:dt[1]],
                                  method='moving_average',
                                  longterm=False)
            sm_ascat = calc_anomaly(sm_ascat[dt[0]:dt[1]],
                                    method='moving_average',
                                    longterm=False)
            sm_smos = calc_anomaly(sm_smos[dt[0]:dt[1]],
                                   method='moving_average',
                                   longterm=False)

            api = API(gamma=info.gamma)

            # Regularize time steps
            df = pd.DataFrame({
                1: precip,
                2: sm_ascat,
                3: sm_smos
            },
                              index=pd.date_range(dt[0], dt[1]))

            n_inv_precip = len(np.where(np.isnan(df[1]))[0])
            n_inv_ascat = len(np.where(np.isnan(df[2]))[0])
            n_inv_smos = len(np.where(np.isnan(df[3]))[0])
            n_inv_asc_smo = len(np.where(np.isnan(df[2]) & np.isnan(df[3]))[0])

            df.loc[np.isnan(df[1]), 1] = 0.

            # --- get OL ts  ---
            OL = np.full(len(precip), np.nan)
            model = API(gamma=info.gamma)
            for t, f in enumerate(df[1].values):
                x = model.step(f)
                OL[t] = x

            # collocate OL and satellite data sets.
            df2 = pd.DataFrame({
                1: OL,
                2: sm_ascat,
                3: sm_smos
            },
                               index=pd.date_range(dt[0], dt[1])).dropna()

            # ----- Calculate uncertainties -----
            # convert (static) forcing to model uncertainty
            P_avg = Q_avg / (1 - info.gamma**2)

            # calculate TCA based uncertainty and scaling coefficients
            snr, err, beta = tcol_snr(df2[1].values, df2[2].values,
                                      df2[3].values)
            P_TC = err[0]**2
            Q_TC = P_TC * (1 - info.gamma**2)
            R_TC = (err[1] / beta[1])**2
            H_TC = beta[1]

            # Calculate RMSD based uncertainty
            R_rmsd = (np.nanmean(
                (df2[1].values - H_TC * df2[2].values)**2) - P_avg)
            if R_rmsd < 0:
                R_rmsd *= -1
            # -----------------------------------

            # ----- Run 2D KF (ASCAT + SMOS jointly) using TCA-based uncertainties -----
            api_kf = API(gamma=info.gamma, Q=Q_TC)
            R_2D = np.array([(err[1] / beta[1])**2, (err[2] / beta[2])**2])
            H_2D = np.array([beta[1]**(-1), beta[2]**(-1)])
            x_2d, P, checkvar1_2d, checkvar2_2d, checkvar3_2d, K1_2d, K2_2d = \
                KF_2D(api_kf, df[1].values.copy(), df[2].values.copy(), df[3].values.copy(), R_2D, H=H_2D)

            # ----- Run KF using TCA-based uncertainties -----
            api_kf = API(gamma=info.gamma, Q=Q_TC)
            x_kf, P, R_innov_kf, checkvar_kf, K_kf = \
                KF(api_kf, df[1].values.copy(), df[2].values.copy(), R_TC, H=H_TC)

            # ----- Run EnKF using TCA-based uncertainties -----
            forc_pert = ['normal', 'additive', Q_TC]
            obs_pert = ['normal', 'additive', R_TC]
            x_tc, P, R_innov_tc, checkvar_tc, K_tc = \
                EnKF(api, df[1].values.copy(), df[2].values.copy(), forc_pert, obs_pert, H=H_TC, n_ens=50)

            # ----- Run EnKF using static uncertainties -----
            forc_pert = ['normal', 'additive', Q_avg]
            obs_pert = ['normal', 'additive', R_avg]
            x_avg, P, R_innov_avg, checkvar_avg, K_avg = \
                EnKF(api, df[1].values.copy(), df[2].values.copy(), forc_pert, obs_pert, H=H_TC, n_ens=50)

            # ----- Run EnKF using RMSD-based uncertainties (corrected for model uncertainty) -----
            t = timeit.default_timer()
            forc_pert = ['normal', 'additive', Q_avg]
            obs_pert = ['normal', 'additive', R_rmsd]
            x_rmsd, P, R_innov_rmsd, checkvar_rmsd, K_rmsd = \
                EnKF(api, df[1].values.copy(), df[2].values.copy(), forc_pert, obs_pert, H=H_TC, n_ens=50)
            t_enkf = timeit.default_timer() - t

            # ----- Run MadKF -----
            t = timeit.default_timer()
            x_madkf, P, R_madkf, Q_madkf, H_madkf, R_innov_madkf, checkvar_madkf, K_madkf = \
                MadKF(api, df[1].values.copy(), df[2].values.copy(), n_ens=100, n_iter=20)
            t_madkf = timeit.default_timer() - t

            # TC evaluation of assimilation results
            # df3 = pd.DataFrame({1: x_tc, 2: x_avg, 3: x_rmsd, 4: x_madkf, 5: sm_ascat, 6: sm_smos}, index=pd.date_range(dt[0], dt[1])).dropna()
            #
            # rmse_ana_tc = tcol_snr(df3[1].values, df3[5].values, df3[6].values)[1][0]
            # rmse_ana_avg = tcol_snr(df3[2].values, df3[5].values, df3[6].values)[1][0]
            # rmse_ana_rmsd = tcol_snr(df3[3].values, df3[5].values, df3[6].values)[1][0]
            # rmse_ana_madkf = tcol_snr(df3[4].values, df3[5].values, df3[6].values)[1][0]

            result = pd.DataFrame(
                {
                    'lon': info.lon,
                    'lat': info.lat,
                    'col': info.col,
                    'row': info.row,
                    'P_tc': P_TC,
                    'Q_tc': Q_TC,
                    'R_tc': R_TC,
                    'H_tc': H_TC,
                    'K_tc': K_tc,
                    'R_innov_tc': R_innov_tc,
                    'checkvar_tc': checkvar_tc,
                    'K_kf': K_kf,
                    'R_innov_kf': R_innov_kf,
                    'checkvar_kf': checkvar_kf,
                    'K1_2d': K1_2d,
                    'K2_2d': K2_2d,
                    'checkvar1_2d': checkvar1_2d,
                    'checkvar2_2d': checkvar2_2d,
                    'checkvar3_2d': checkvar3_2d,
                    'P_avg': P_avg,
                    'Q_avg': Q_avg,
                    'R_avg': R_avg,
                    'K_avg': K_avg,
                    'R_innov_avg': R_innov_avg,
                    'checkvar_avg': checkvar_avg,
                    'R_rmsd': R_rmsd,
                    'K_rmsd': K_rmsd,
                    'R_innov_rmsd': R_innov_rmsd,
                    'checkvar_rmsd': checkvar_rmsd,
                    'P_madkf': Q_madkf / (1 - info.gamma**2),
                    'Q_madkf': Q_madkf,
                    'R_madkf': R_madkf,
                    'H_madkf': H_madkf,
                    'K_madkf': K_madkf,
                    'R_innov_madkf': R_innov_madkf,
                    'checkvar_madkf': checkvar_madkf,
                    't_enkf': t_enkf,
                    't_madkf': t_madkf,
                    'n_inv_precip': n_inv_precip,
                    'n_inv_ascat': n_inv_ascat,
                    'n_inv_smos': n_inv_smos,
                    'n_inv_asc_smo': n_inv_asc_smo
                },
                index=(info.name, ))

            if not os.path.isfile(result_file):
                result.to_csv(result_file, float_format='%0.4f')
            else:
                result.to_csv(result_file,
                              float_format='%0.4f',
                              mode='a',
                              header=False)
        except Exception:
            print('GPI failed.')
            continue

    ascat.close()
    mswep.close()