# Imports assumed for the pytesmo test excerpts in this listing (not shown in
# the original snippets):
from datetime import datetime

import numpy as np
import numpy.testing as nptest
import pandas as pd
import pytest

import pytesmo.temporal_matching as tmatching


def test_df_match_match_on_window_border():
    """
    Test matching if a value lies exactly on the window border.
    """

    ref_df = pd.DataFrame({"data": np.arange(5)}, index=pd.date_range(datetime(2007, 1, 1, 0),
                                                                      "2007-01-05", freq="D"))
    match_df = pd.DataFrame({"matched_data": np.arange(4)},
                            index=[datetime(2007, 1, 1, 9),
                                   datetime(2007, 1, 2, 9),
                                   datetime(2007, 1, 3, 12),
                                   datetime(2007, 1, 5, 9)])
    matched = tmatching.df_match(ref_df, match_df, window=0.5)

    nptest.assert_allclose(
        np.array([0.375, 0.375, 0.5, -0.5, 0.375]), matched.distance.values)
    nptest.assert_allclose([0, 1, 2, 2, 3], matched.matched_data)

    # test asym_window keyword
    matched = tmatching.df_match(
        ref_df, match_df, window=0.5, asym_window="<=")

    nptest.assert_allclose(
        np.array([0.375, 0.375, 0.5, np.nan, 0.375]), matched.distance.values)
    nptest.assert_allclose([0, 1, 2, np.nan, 3], matched.matched_data)

    matched = tmatching.df_match(
        ref_df, match_df, window=0.5, asym_window=">=")

    nptest.assert_allclose(
        np.array([0.375, 0.375, np.nan, -0.5, 0.375]), matched.distance.values)
    nptest.assert_allclose([0, 1, np.nan, 2, 3], matched.matched_data)
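
For orientation, the distance column returned by df_match holds the matched timestamp minus the reference timestamp in fractional days, which is what the assertions above encode: a 09:00 observation matched to a 00:00 reference gives 9/24 = 0.375, and a match half a day before the reference gives -0.5. A minimal sketch of the border case, reusing the imports above:

ref = pd.DataFrame({"data": [0]},
                   index=pd.DatetimeIndex([datetime(2007, 1, 4, 0)]))
obs = pd.DataFrame({"matched_data": [0]},
                   index=pd.DatetimeIndex([datetime(2007, 1, 3, 12)]))

# the nearest observation lies exactly half a day before the reference
matched = tmatching.df_match(ref, obs, window=0.5)
print(matched.distance.values)  # -> [-0.5]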
Example #2
    def match(self, reference, *args):
        """
        takes reference and other dataframe and returnes a joined Dataframe
        in this case the reference dataset for the grid is also the
        temporal reference dataset
        """
        matched_datasets = temp_match.df_match(reference, *args, dropna=True,
                                               dropduplicates=True,
                                               window=self.window)

        if not isinstance(matched_datasets, tuple):
            matched_datasets = [matched_datasets]

        matched_data = pd.DataFrame(reference)

        for match in matched_datasets:
            if LooseVersion(pd.__version__) < LooseVersion('0.23'):
                match = match.drop(('index', ''), axis=1)
            else:
                match = match.drop('index', axis=1)
                
            match = match.drop('distance', axis=1)
            matched_data = matched_data.join(match)

        return matched_data.dropna(how='all')
def test_df_match_borders():
    """
    Border values can be problematic for temporal matching.

    See issue #51
    """

    ref_df = pd.DataFrame(
        {"data": np.arange(5)},
        index=pd.date_range(datetime(2007, 1, 1, 0), "2007-01-05", freq="D"),
    )
    match_df = pd.DataFrame(
        {"matched_data": np.arange(5)},
        index=[
            datetime(2007, 1, 1, 9),
            datetime(2007, 1, 2, 9),
            datetime(2007, 1, 3, 9),
            datetime(2007, 1, 4, 9),
            datetime(2007, 1, 5, 9),
        ],
    )
    matched = tmatching.df_match(ref_df, match_df)

    nptest.assert_allclose(np.array([0.375, 0.375, 0.375, 0.375, 0.375]),
                           matched.distance.values)
    nptest.assert_allclose(np.arange(5), matched.matched_data)
def test_df_match_match_on_window_border():
    """
    Test matching if a value lies exactly on the window border.
    """

    ref_df = pd.DataFrame(
        {"data": np.arange(5)},
        index=pd.date_range(datetime(2007, 1, 1, 0), "2007-01-05", freq="D"),
    )
    match_df = pd.DataFrame(
        {"matched_data": np.arange(4)},
        index=[
            datetime(2007, 1, 1, 9),
            datetime(2007, 1, 2, 9),
            datetime(2007, 1, 3, 12),
            datetime(2007, 1, 5, 9),
        ],
    )
    with pytest.deprecated_call():
        matched = tmatching.df_match(ref_df, match_df, window=0.5)

    nptest.assert_allclose(np.array([0.375, 0.375, 0.5, -0.5, 0.375]),
                           matched.distance.values)
    nptest.assert_allclose([0, 1, 2, 2, 3], matched.matched_data)

    # test asym_window keyword
    with pytest.deprecated_call():
        matched = tmatching.df_match(ref_df,
                                     match_df,
                                     window=0.5,
                                     asym_window="<=")

    nptest.assert_allclose(np.array([0.375, 0.375, 0.5, np.nan, 0.375]),
                           matched.distance.values)
    nptest.assert_allclose([0, 1, 2, np.nan, 3], matched.matched_data)

    with pytest.deprecated_call():
        matched = tmatching.df_match(ref_df,
                                     match_df,
                                     window=0.5,
                                     asym_window=">=")

    nptest.assert_allclose(np.array([0.375, 0.375, np.nan, -0.5, 0.375]),
                           matched.distance.values)
    nptest.assert_allclose([0, 1, np.nan, 2, 3], matched.matched_data)
Example #6
def PCA(Ser1, Ser2, window=1):
    """
    Joint PCA of two temporally matched series. Returns the matched data with
    principal components appended, plus the eigenvalues and eigenvectors.
    """
    df1 = pd.DataFrame(Ser1).dropna()
    df1.columns = ['ds1']
    df2 = pd.DataFrame(Ser2).dropna()
    df2.columns = ['ds2']

    # require at least 10 data points in each input series
    if (len(df1) < 10) or (len(df2) < 10):
        return pd.DataFrame(columns=['PC-1', 'PC-2']), None, None

    if len(df1) < len(df2):
        matched = df_match(df1, df2, window=window)
        df = df1.join(matched['ds2']).dropna()
    else:
        matched = df_match(df2, df1, window=window)
        df = df2.join(matched['ds1']).dropna()

    if len(df) < 10:
        return pd.DataFrame(columns=['PC-1', 'PC-2']), None, None

    X = df.values.copy()
    X_mean = X.mean(axis=0)
    X -= X_mean

    C = (X.T @ X) / (len(X)-1)
    eigen_vals, eigen_vecs = np.linalg.eig(C)

    # Rotate Eigenvectors 180 degrees if major PC is pointing in the "wrong" direction.
    if (np.sign(eigen_vecs[:,np.argmax(eigen_vals)]).sum() == -2) & (np.sign(np.corrcoef(X.T)[0,1]) == 1):
        eigen_vecs *= -1

    X_pca = X @ eigen_vecs
    if eigen_vals[0] < eigen_vals[1]:
        X_pca = np.roll(X_pca, 1, axis=1)
    X_pca[:,0] += X_mean.mean()

    df_pca = pd.DataFrame(X_pca, columns=['PC-1', 'PC-2'], index=df.index)
    return pd.concat((df, df_pca), axis='columns'), eigen_vals, eigen_vecs
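
A minimal usage sketch for the PCA function above, on synthetic series (values and the 3-hour index offset are invented for illustration; the offset gives the temporal matching something to do):

import numpy as np
import pandas as pd

idx = pd.date_range('2007-01-01', periods=100, freq='D')
rng = np.random.default_rng(42)
ser1 = pd.Series(rng.normal(size=100), index=idx)
# second series: correlated with the first, observed 3 hours later
ser2 = pd.Series(ser1.values + rng.normal(scale=0.1, size=100),
                 index=idx + pd.Timedelta(3, 'h'))

df_pca, eigen_vals, eigen_vecs = PCA(ser1, ser2, window=1)
print(df_pca[['PC-1', 'PC-2']].head())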
Example #7
    def match_reverse(self, reference, other):
        """
        takes reference and other dataframe and returnes a joined Dataframe
        in this case the reference dataset for the grid is also the
        temporal reference dataset
        """
        # temporal match comparison to reference TimeSeries
        try:
            matched_ref = temp_match.df_match(other, reference,
                                              window=self.window, dropna=True)
        except ValueError:
            return pd.DataFrame()
        matched_ref = matched_ref.drop(['distance', 'index'], axis=1)

        return matched_ref.join(other)
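
A standalone sketch of the same reverse-matching idea outside the class, assuming pytesmo's temporal_matching module and a window in days (the data frames are invented for illustration):

import pandas as pd
import pytesmo.temporal_matching as temp_match

other = pd.DataFrame({'other': range(3)},
                     index=pd.date_range('2007-01-01 09:00', periods=3, freq='D'))
reference = pd.DataFrame({'ref': range(3)},
                         index=pd.date_range('2007-01-01', periods=3, freq='D'))

# match the reference onto the other dataset's timestamps, then join
matched_ref = temp_match.df_match(other, reference, window=0.5, dropna=True)
joined = matched_ref.drop(['distance', 'index'], axis=1).join(other)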
def test_df_match_borders_unequal_query_points():
    """
    Border values can be problematic for temporal matching.

    See issue #51
    """

    ref_df = pd.DataFrame({"data": np.arange(5)}, index=pd.date_range(datetime(2007, 1, 1, 0),
                                                                      "2007-01-05", freq="D"))
    match_df = pd.DataFrame({"matched_data": np.arange(4)},
                            index=[datetime(2007, 1, 1, 9),
                                   datetime(2007, 1, 2, 9),
                                   datetime(2007, 1, 4, 9),
                                   datetime(2007, 1, 5, 9)])
    matched = tmatching.df_match(ref_df, match_df)

    nptest.assert_allclose(
        np.array([0.375, 0.375, -0.625, 0.375, 0.375]), matched.distance.values)
    nptest.assert_allclose(np.array([0, 1, 1, 2, 3]), matched.matched_data)
Example #10
def collocate(df):
    """
    Collocates the columns of a pd.DataFrame. Data points are resampled to reference time stemps with 24 hr distance,
    which are optimized to maximize the number of matches.

    Parameters
    ----------
    df : pd.DataFrame
        Input Dataframe

    """

    res_df = pd.DataFrame(columns=df.columns.values)

    # Test each hour of the day as potential reference time step to optimize the number of collocated data points
    for d in np.arange(24):

        # Create reference time steps for the respective reference hour of the day of this iteration
        ref_df = pd.DataFrame(index=pd.date_range(df.index.min().date(),
                                                  df.index.max().date()) +
                              pd.Timedelta(d, 'h'))

        # Find the NN to the reference time steps for each data set
        args = [df[col].dropna() for col in df]
        matched = df_match(ref_df, *args, window=0.5)
        if len(df.columns) == 1:
            ref_df[df.columns.values[0]] = matched[df.columns.values[0]]
        else:
            for i, col in enumerate(df):
                ref_df[col] = matched[i][col]
        ref_df.dropna(inplace=True)

        # Check if collocation at this hour gave more temporal matches than collocation at the previous hour
        if len(ref_df) > len(res_df):
            res_df = ref_df.copy()

    return res_df
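
A hedged usage sketch for collocate, with two synthetic columns observed at different hours of the day (data invented for illustration; df_match is assumed importable from pytesmo.temporal_matching as in the example):

import numpy as np
import pandas as pd

idx_a = pd.date_range('2007-01-01 09:00', periods=30, freq='D')
idx_b = pd.date_range('2007-01-01 10:00', periods=30, freq='D')
df = pd.concat([pd.Series(np.arange(30.0), index=idx_a, name='a'),
                pd.Series(np.arange(30.0), index=idx_b, name='b')],
               axis='columns')

# both columns are matched to the daily reference hour with the most matches
collocated = collocate(df)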
Example #11
def TCA_insitu_evaluation():

    result_file = r'D:\work\LDAS\2018-06_rmse_uncertainty\TCA_evaluation\validation.csv'

    noDA = LDAS_io('xhourly', 'US_M36_SMOS40_noDA_cal_scaled')

    DA_const_err = LDAS_io('xhourly', 'US_M36_SMOS40_DA_cal_scaled')
    DA_varia_err = LDAS_io('xhourly', 'US_M36_SMOS40_DA_cal_scl_errfile')

    t_ana = pd.DatetimeIndex(
        LDAS_io('ObsFcstAna', 'US_M36_SMOS40_DA_cal_scaled').timeseries.time.
        values).sort_values()

    ascat = HSAF_io()
    gpi_list = pd.read_csv(
        r"D:\data_sets\ASCAT\warp5_grid\pointlist_warp_conus.csv", index_col=0)

    ismn = ISMN_io(col_offs=noDA.grid.tilegrids.loc['domain', 'i_offg'],
                   row_offs=noDA.grid.tilegrids.loc['domain', 'j_offg'])

    runs = ['noDA', 'DA_const_err', 'DA_varia_err']
    tss = [noDA.timeseries, DA_const_err.timeseries, DA_varia_err.timeseries]

    variables = [
        'sm_surface',
    ]
    modes = [
        'absolute',
    ]

    for i, (meta, ts_insitu) in enumerate(ismn.iter_stations()):
        logging.info('%i/%i' % (i, len(ismn.list)))

        try:

            res = pd.DataFrame(meta.copy()).transpose()
            col = meta.ease_col
            row = meta.ease_row

            gpi = lonlat2gpi(meta.lon, meta.lat, gpi_list)

            ts_asc = ascat.read(gpi, resample_time=False)
            if ts_asc is None:
                continue
            ts_asc.name = 'ascat'
            ts_asc = pd.DataFrame(ts_asc)

            for var in variables:
                for mode in modes:

                    ts_ins = ts_insitu[var].dropna()
                    ts_ins.name = 'insitu'
                    ts_ins = pd.DataFrame(ts_ins)

                    for run, ts_model in zip(runs, tss):

                        ind = ((ts_model['snow_mass'][row, col].values == 0) &
                               (ts_model['soil_temp_layer1'][row, col].values > 277.15))
                        ts_mod = ts_model[var][row, col].to_series().loc[ind]
                        ts_mod.index += pd.to_timedelta('2 hours')
                        ts_mod = ts_mod.loc[t_ana].dropna()
                        ts_mod.name = 'model'
                        ts_mod = pd.DataFrame(ts_mod)

                        matched = df_match(ts_mod, ts_asc, ts_ins, window=0.5)
                        data = (ts_mod.join(matched[0][['ascat']])
                                      .join(matched[1][['insitu']])
                                      .dropna())

                        tc_res = TCA(data['model'].values,
                                     data['ascat'].values,
                                     data['insitu'].values)

                        res['RMSE_model_' + run + '_' + mode + '_' +
                            var] = tc_res[1][0]
                        res['RMSE_ascat_' + run + '_' + mode + '_' +
                            var] = tc_res[1][1]
                        res['RMSE_insitu_' + run + '_' + mode + '_' +
                            var] = tc_res[1][2]

                        res['beta_ascat_' + run + '_' + mode + '_' +
                            var] = tc_res[2][1]
                        res['beta_insitu_' + run + '_' + mode + '_' +
                            var] = tc_res[2][2]

                        res['len_' + mode + '_' + var] = len(data)

            if not os.path.isfile(result_file):
                res.to_csv(result_file, float_format='%0.4f')
            else:
                res.to_csv(result_file,
                           float_format='%0.4f',
                           mode='a',
                           header=False)

        except Exception:
            continue
def temp_resam(df, startdate, enddate, freq):
    ref_dr = pd.date_range(startdate, enddate, freq=freq)
    ref_df = pd.DataFrame(index=ref_dr)

    matched_df = tm.df_match(ref_df, df)
    return matched_df

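A minimal call sketch for temp_resam with synthetic, irregularly sampled input (values invented for illustration; pandas and the script's tm import are assumed in scope):

import pandas as pd

irregular_idx = pd.to_datetime(['2007-01-01 09:00',
                                '2007-01-02 10:30',
                                '2007-01-04 08:00'])
irregular_df = pd.DataFrame({'sm': [0.1, 0.2, 0.3]}, index=irregular_idx)

# nearest-neighbour match of the irregular series onto a regular daily grid
daily = temp_resam(irregular_df, '2007-01-01', '2007-01-05', 'D')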

if __name__ == '__main__':

    cur_path = os.path.abspath(os.path.curdir)
    input_dataidx_file = os.path.join(cur_path, 'data/ascatssf/0673')

    gpi = 720360

    dat_obj = datasets.DatasetTs(input_dataidx_file)

    print(dat_obj.dat_data.dtype)
    print(type(dat_obj.dat_data), dat_obj.dat_data.shape)

    gpi_data = dat_obj.read_ts(gpi)
    print(gpi_data.dtype, gpi_data.shape)

    ref_dr = pd.date_range('1970-01-01 12:00:00', '2016-01-01 12:00:00', freq='D')
    ref_df = pd.DataFrame(index=ref_dr)

    ascat_dr = julian.julian2datetimeindex(gpi_data['jd'])
    ascat_df = pd.DataFrame(gpi_data, index=ascat_dr)
    matched_df = tm.df_match(ref_df, ascat_df)

    matched_df.plot()
    plt.show()

Example #14
def plot_suspicious_stations(root):

    statlist = pd.read_csv('/Users/u0116961/Documents/work/MadKF/CLSM/suspicious_stations/station_list_r_diff.csv', index_col=0)

    rmsd_root = 'US_M36_SMAP_TB_DA_SM_PROXY_'
    rmsd_exps = list(np.sort([x.name.split(rmsd_root)[1] for x in Path('/Users/u0116961/data_sets/LDASsa_runs').glob('*SM_PROXY*')]))

    ds_ol = LDAS_io('xhourly', 'US_M36_SMAP_TB_OL_scaled_4K_obserr').timeseries
    ds_da = LDAS_io('xhourly', 'US_M36_SMAP_TB_DA_scaled_4K_obserr').timeseries

    ts_ana = LDAS_io('ObsFcstAna', 'US_M36_SMAP_TB_DA_scaled_4K_obserr').timeseries['obs_obs']
    t_ana = pd.DatetimeIndex(ts_ana.time.values).sort_values()

    ascat = HSAF_io()
    gpi_list = pd.read_csv(ascat.root / 'warp5_grid' / 'pointlist_warp_conus.csv', index_col=0)

    ismn = ISMN_io()

    variables = ['sm_surface', 'sm_rootzone']
    modes = ['absolute', 'longterm', 'shortterm']

    ismn.list.index = ismn.list.network + '_' + ismn.list.station
    ismn.list = ismn.list.reindex(statlist.index)

    for i, (meta, ts_insitu) in enumerate(ismn.iter_stations(surface_only=False)):
        if 'tmp_res' in locals():
            if (meta.network in tmp_res) and (meta.station in tmp_res):
                print(f'Skipping {i}')
                continue

        try:
            res = pd.DataFrame(meta.copy()).transpose()
            col = meta.ease_col
            row = meta.ease_row

            gpi = lonlat2gpi(meta.lon, meta.lat, gpi_list)

            ts_ascat = ascat.read(gpi) / 100 * 0.6
            if ts_ascat is None:
                continue

            for mode in modes:
                for var in variables:

                    tmp = statlist[(statlist.network==meta.network)&(statlist.station==meta.station)]
                    dpr = tmp[f'diff_pearsonr2_{mode}_{var}'].values[0]
                    dtr = tmp[f'diff_tcar2_{mode}_{var}'].values[0]

                    if not ((dtr < 0) & (dpr > 0)):
                        continue

                    if mode == 'absolute':
                        ts_asc = ts_ascat.dropna()
                    else:
                        ts_asc = calc_anom(ts_ascat, longterm=(mode == 'longterm')).dropna()
                    ts_asc.name = 'ascat'
                    ts_asc = pd.DataFrame(ts_asc)

                    if mode == 'absolute':
                        ts_ins = ts_insitu[var].dropna()
                    else:
                        ts_ins = calc_anom(ts_insitu[var], longterm=(mode == 'longterm')).dropna()
                    ts_ins.name = 'insitu'
                    ts_ins = pd.DataFrame(ts_ins)

                    ind = (ds_ol['snow_mass'].isel(lat=row, lon=col).values == 0) & \
                          (ds_ol['soil_temp_layer1'].isel(lat=row, lon=col).values > 277.15)
                    ts_ol = ds_ol[var].isel(lat=row, lon=col).to_series().loc[ind]
                    ts_ol.index += pd.to_timedelta('2 hours')
                    ind_obs = np.bitwise_or.reduce(~np.isnan(ts_ana[:, :, row, col].values), 1)
                    if mode == 'absolute':
                        ts_ol = ts_ol.reindex(t_ana[ind_obs]).dropna()
                    else:
                        ts_ol = calc_anom(ts_ol.reindex(t_ana[ind_obs]), longterm=(mode == 'longterm')).dropna()
                    ts_ol.name = 'open_loop'
                    ts_ol = pd.DataFrame(ts_ol)

                    ind = (ds_da['snow_mass'].isel(lat=row, lon=col).values == 0) & \
                          (ds_da['soil_temp_layer1'].isel(lat=row, lon=col).values > 277.15)
                    ts_da = ds_da[var].isel(lat=row, lon=col).to_series().loc[ind]
                    ts_da.index += pd.to_timedelta('2 hours')
                    ind_obs = np.bitwise_or.reduce(~np.isnan(ts_ana[:, :, row, col].values), 1)
                    if mode == 'absolute':
                        ts_da = ts_da.reindex(t_ana[ind_obs]).dropna()
                    else:
                        ts_da = calc_anom(ts_da.reindex(t_ana[ind_obs]), longterm=(mode == 'longterm')).dropna()
                    ts_da.name = 'DA_4K'
                    ts_da = pd.DataFrame(ts_da)

                    matched = df_match(ts_ol, ts_da, ts_asc, ts_ins, window=0.5)
                    data = ts_ol.join(matched[0]['DA_4K']).join(matched[1]['ascat']).join(matched[2]['insitu']).dropna()

                    dpr_triplets = data.corr()['DA_4K']['insitu'] - data.corr()['open_loop']['insitu']
                    if dpr_triplets < 0:
                        continue

                    f = plt.figure(figsize=(15, 5))
                    sns.lineplot(data=data[['open_loop', 'DA_4K', 'insitu']], dashes=False, linewidth=1.5, ax=plt.gca())
                    plt.title(f'{meta.network} / {meta.station} ({var}): d(Pearson R2) = {dpr_triplets:.3f} , d(TCA R2) = {dtr:.3f}')

                    fbase = Path('/Users/u0116961/Documents/work/MadKF/CLSM/suspicious_stations/timeseries')
                    fname = fbase / f'{mode}_{var}_{meta.network}_{meta.station}.png'
                    f.savefig(fname, dpi=300, bbox_inches='tight')
                    plt.close()

        except Exception:
            continue
Example #15
def mask_data():

    era_matched = temp_match.df_match(ascat_data, era_interim_data, window=1)

    ascat_masked = ascat_data[(era_matched['snow_depth'] <= mask['snow_depth'])
                              & (era_matched['st_l1'] > mask['st_l1'])
                              & (era_matched['air_temp'] > mask['air_temp'])]

    if mask['use_ssf']:
        ascat_masked = ascat_masked[ascat_masked['ssf'] == 1]

    ascat_masked = ascat_masked[[ascat_label, 'jd']]

    relevant_depth = None
    ISMN_station = ISMN.get_station_by_id(station_id)
    for depth in ISMN_station.sm_depths:
        if float(depth.depth_from) - 0.05 < 0.001:
            relevant_depth = depth

    if relevant_depth is None:
        return 0, -1

    ISMN_data = ISMN_station.get_soil_moisture_for_depth(relevant_depth,
                                                         start_date=datetime(
                                                             2007, 1, 1))

    sensor = ISMN_data.keys()[0]
    ISMN_data = ISMN_data[sensor]
    ISMN_ts_name = 'insitu sm %.2f - %.2f m sensor: ' % (float(
        relevant_depth.depth_from), float(relevant_depth.depth_to)) + sensor

    era_insitu_matched = temp_match.df_match(ISMN_data,
                                             era_interim_data,
                                             window=1)

    insitu_masked = ISMN_data[
        (era_insitu_matched['snow_depth'] <= mask['snow_depth'])
        & (era_insitu_matched['st_l1'] > mask['st_l1'])
        & (era_insitu_matched['air_temp'] > mask['air_temp'])]

    if mask['use_ssf']:
        ascat_insitu_matched = temp_match.df_match(insitu_masked,
                                                   ascat_data,
                                                   window=1)
        insitu_masked = insitu_masked[ascat_insitu_matched['ssf'] == 1]

    ISMN_data = insitu_masked[['insitu', 'jd']]

    # slice to the same period as the in situ data
    era_matched = era_matched[scaled_data.index[0]:scaled_data.index[-1]]

    era_matched.rename(columns={
        'st_l1': 'soil temperature layer 1',
        'air_temp': '2m air temperature'
    },
                       inplace=True)

    era_matched = era_matched[[
        'snow_depth', 'soil temperature layer 1', '2m air temperature'
    ]]

    era_labels, era_values = era_matched.to_dygraph_format()

    masking_data = {'labels': masking_labels, 'data': masking_values}
Example #16
def run(part):

    parts = 6

    result_file = r'D:\work\ESA_CCI_SM\validation_%i.csv' % part

    cci = CCISM_io()
    ismn = ISMN_io()

    # ismn.list = ismn.list.iloc[100:120]

    # Split the station list into `parts` chunks for parallelization
    subs = (np.arange(parts + 1) * len(ismn.list) / parts).astype('int')
    subs[-1] = len(ismn.list)
    start = subs[part - 1]
    end = subs[part]
    ismn.list = ismn.list.iloc[start:end, :]

    periods = {
        'p1': ['2007-10-01', '2010-01-14'],
        'p2': ['2010-01-15', '2011-10-04'],
        'p3': ['2011-10-05', '2012-06-30'],
        'p4': ['2012-07-01', '2014-12-31']
    }

    freq = ['abs', 'anom']

    corr_tags = [
        'corr_' + m + '_' + v + '_' + p + '_' + f for m in cci.modes
        for v in cci.versions for p in periods.keys() for f in freq
    ]
    p_tags = [
        'p_' + m + '_' + v + '_' + p + '_' + f for m in cci.modes
        for v in cci.versions for p in periods.keys() for f in freq
    ]
    n_tags = [
        'n_' + m + '_' + v + '_' + p + '_' + f for m in cci.modes
        for v in cci.versions for p in periods.keys() for f in freq
    ]

    res = ismn.list.copy()
    res.drop(['ease_col', 'ease_row'], axis='columns', inplace=True)
    for col in corr_tags + p_tags:
        res[col] = np.nan
    for col in n_tags:
        res[col] = 0

    for i, (meta, ts_insitu) in enumerate(ismn.iter_stations()):
        print('%i/%i (Proc %i)' % (i, len(ismn.list), part))

        if ts_insitu is None:
            print('No in situ data for ' + meta.network + ' / ' + meta.station)
            continue
        ts_insitu = ts_insitu[periods['p1'][0]:periods['p4'][1]]
        if len(ts_insitu) < 10:
            print('No in situ data for ' + meta.network + ' / ' + meta.station)
            continue
        df_insitu = pd.DataFrame(ts_insitu).dropna()
        df_insitu_anom = pd.DataFrame(calc_anomaly(ts_insitu)).dropna()

        for m in cci.modes:
            df_cci = cci.read(meta.lon, meta.lat, mode=m).dropna()
            if len(df_cci) < 10:
                print('No CCI ' + m + ' data for ' + meta.network + ' / ' +
                      meta.station)
                continue

            for f in freq:
                if f == 'abs':
                    matched = df_match(df_cci, df_insitu, window=0.5)
                else:
                    for v in cci.versions:
                        df_cci.loc[:, m + '_' + v] = calc_anomaly(
                            df_cci[m + '_' + v])
                    df_cci.dropna(inplace=True)
                    if (len(df_cci) < 10) | (len(df_insitu_anom) < 10):
                        print('No in situ or CCI ' + m + ' anomaly data for ' +
                              meta.network + ' / ' + meta.station)
                        continue
                    matched = df_match(df_cci, df_insitu_anom, window=0.5)

                data = df_cci.join(matched['insitu']).dropna()

                for p in periods.keys():
                    vals = data[periods[p][0]:periods[p][1]].values

                    n_matches = vals.shape[0]
                    if n_matches < 10:
                        continue
                    for k, v in enumerate(cci.versions):
                        corr, p_value = pearsonr(vals[:, k], vals[:, -1])
                        res.loc[meta.name, 'corr_' + m + '_' + v + '_' + p +
                                '_' + f] = corr
                        res.loc[meta.name, 'p_' + m + '_' + v + '_' + p + '_' +
                                f] = p_value
                        res.loc[meta.name, 'n_' + m + '_' + v + '_' + p + '_' +
                                f] = n_matches

    res.to_csv(result_file, float_format='%0.4f')
Example #17
            mask_frozen_prob=5,
            mask_snow_prob=5)

        #drop nan values before doing any matching
        ascat_time_series.data = ascat_time_series.data.dropna()
        ISMN_time_series.data = ISMN_time_series.data.dropna()

        #rename the soil moisture column in ISMN_time_series.data to insitu_sm
        #to clearly differentiate the time series when they are plotted together
        ISMN_time_series.data.rename(columns={'soil moisture': label_insitu},
                                     inplace=True)

        #get ISMN data that was observed within +- 1 hour (1/24 day) of the ASCAT observation
        #do not include those indexes where no observation was found
        matched_ISMN_data = temp_match.df_match(ascat_time_series.data,
                                                ISMN_time_series.data,
                                                window=1 / 24.,
                                                dropna=True)
        #matched ISMN data is now a dataframe with the same datetime index
        #as ascat_time_series.data and the nearest insitu observation

        #temporal matching also includes distance information
        #but we are not interested in it right now so let's drop it
        matched_ISMN_data = matched_ISMN_data.drop(['distance'], axis=1)

        #this joins the SSM column of the ASCAT data to the matched ISMN data
        matched_data = matched_ISMN_data.join(
            ascat_time_series.data[label_ascat])

        #the plot shows that ISMN and ASCAT are observed in different units
        matched_data.plot(secondary_y=[label_ascat])
        plt.show()
Example #19
def run(part):

    parts = 6

    result_file = r'D:\work\ESA_CCI_SM\ismn_r2\ismn_r2_part%i.csv' % part

    cci = CCISM_io()
    ismn = ISMN_io()

    # ismn.list = ismn.list.iloc[100:120]

    # Split the station list into `parts` chunks for parallelization
    subs = (np.arange(parts + 1) * len(ismn.list) / parts).astype('int')
    subs[-1] = len(ismn.list)
    start = subs[part - 1]
    end = subs[part]
    ismn.list = ismn.list.iloc[start:end, :]

    freq = ['abs', 'anom']

    res = ismn.list.copy()
    res.drop(['ease_col', 'ease_row'], axis='columns', inplace=True)
    res['r_abs'] = np.nan
    res['r_anom'] = np.nan

    for i, (meta, ts_insitu) in enumerate(ismn.iter_stations()):
        print('%i/%i (Proc %i)' % (i, len(ismn.list), part))

        if ts_insitu is None:
            print('No in situ data for ' + meta.network + ' / ' + meta.station)
            continue
        ts_insitu = ts_insitu['2007-10-01':'2014-12-31']
        if len(ts_insitu) < 10:
            print('No in situ data for ' + meta.network + ' / ' + meta.station)
            continue
        df_insitu = pd.DataFrame(ts_insitu).dropna()
        df_insitu_anom = pd.DataFrame(calc_anomaly(ts_insitu)).dropna()

        df_cci = cci.read(meta.lon,
                          meta.lat,
                          version='v04.4',
                          mode=['ACTIVE', 'PASSIVE']).dropna()
        if len(df_cci) < 10:
            print('No CCI data for ' + meta.network + ' / ' + meta.station)
            continue

        for f in freq:
            if f == 'abs':
                matched = df_match(df_cci, df_insitu, window=0.5)
            else:
                df_cci.loc[:, 'ACTIVE_v04.4'] = calc_anomaly(
                    df_cci['ACTIVE_v04.4'])
                df_cci.loc[:, 'PASSIVE_v04.4'] = calc_anomaly(
                    df_cci['PASSIVE_v04.4'])
                df_cci.dropna(inplace=True)
                if (len(df_cci) < 10) | (len(df_insitu_anom) < 10):
                    print('No in situ or CCI anomaly data for ' +
                          meta.network + ' / ' + meta.station)
                    continue
                matched = df_match(df_cci, df_insitu_anom, window=0.5)

            data = df_cci.join(matched['insitu']).dropna()

            if len(data) < 100:
                continue

            vals = data[['insitu', 'ACTIVE_v04.4']].values
            c1, p1 = pearsonr(vals[:, 0], vals[:, 1])
            vals = data[['insitu', 'PASSIVE_v04.4']].values
            c2, p2 = pearsonr(vals[:, 0], vals[:, 1])
            vals = data[['ACTIVE_v04.4', 'PASSIVE_v04.4']].values
            c3, p3 = pearsonr(vals[:, 0], vals[:, 1])

            if (c1 < 0) | (c2 < 0) | (c3 < 0) | (p1 > 0.05) | (p2 > 0.05) | (
                    p3 > 0.05):
                continue

            res.loc[meta.name, 'r_' + f] = np.sqrt(tc(data)[1][2])

    res.to_csv(result_file, float_format='%0.4f')