Example #1
import numpy as np
import numpy.testing as nptest

# Assumed imports: these snippets exercise pytesmo's triple collocation metrics.
import pytesmo.metrics as met


def test_tcol_snr():
    """
    Test the triple-collocation-based estimation of the signal-to-noise
    ratio, absolute errors, and rescaling coefficients.
    """

    n = 1000000

    mean_signal = 0.3
    sig_signal = 0.2
    signal = np.random.normal(mean_signal, sig_signal, n)

    sig_err_x = 0.02
    sig_err_y = 0.07
    sig_err_z = 0.04
    err_x = np.random.normal(0, sig_err_x, n)
    err_y = np.random.normal(0, sig_err_y, n)
    err_z = np.random.normal(0, sig_err_z, n)

    alpha_y = 0.2
    alpha_z = 0.5

    beta_y = 0.9
    beta_z = 1.6

    x = signal + err_x
    y = alpha_y + beta_y * (signal + err_y)
    z = alpha_z + beta_z * (signal + err_z)

    beta_pred = 1. / np.array((1, beta_y, beta_z))
    err_pred = np.array((sig_err_x, sig_err_y, sig_err_z))
    snr_pred = np.array(((sig_signal / sig_err_x), (sig_signal / sig_err_y),
                         (sig_signal / sig_err_z)))

    snr, err, beta = met.tcol_snr(x, y, z, ref_ind=0)

    nptest.assert_almost_equal(beta, beta_pred, decimal=2)
    nptest.assert_almost_equal(err, err_pred, decimal=2)
    nptest.assert_almost_equal(np.sqrt(10**(snr / 10.)), snr_pred, decimal=1)
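The final assertion converts the SNR returned in decibels back to a linear ratio of standard deviations before comparing it with sig_signal / sig_err. A minimal standalone sketch of that conversion (plain NumPy, nothing pytesmo-specific):

import numpy as np

def snr_db_to_std_ratio(snr_db):
    # SNR_dB = 10 * log10(var_signal / var_noise), so the corresponding
    # ratio of standard deviations is sqrt(10 ** (SNR_dB / 10)).
    return np.sqrt(10.0 ** (np.asarray(snr_db) / 10.0))

# sig_signal / sig_err_x = 0.2 / 0.02 = 10, which corresponds to 20 dB:
print(snr_db_to_std_ratio(20.0))  # -> 10.0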
Example #2
def test_tcol_error():
    """
    Test the triple collocation error estimation based on
    a random signal and error.
    Also compare the results to the other method
    """

    n = 1000000

    signal = np.sin(np.linspace(0, 2 * np.pi, n))

    sig_err_x = 0.02
    sig_err_y = 0.07
    sig_err_z = 0.04
    err_pred = np.array((sig_err_x, sig_err_y, sig_err_z))
    err_x = np.random.normal(0, sig_err_x, n)
    err_y = np.random.normal(0, sig_err_y, n)
    err_z = np.random.normal(0, sig_err_z, n)

    alpha_y = 0.2
    alpha_z = 0.5

    beta_y = 0.9
    beta_z = 1.6

    x = signal + err_x
    y = alpha_y + beta_y * (signal + err_y)
    z = alpha_z + beta_z * (signal + err_z)

    snr, err, beta = met.tcol_snr(x, y, z, ref_ind=0)
    # classical triple collocation errors use scaled (removed alpha and beta)
    # input arrays
    ex, ey, ez = met.tcol_error(signal + err_x, signal + err_y, signal + err_z)

    nptest.assert_almost_equal(err, np.array([ex, ey, ez]), decimal=2)
    nptest.assert_almost_equal(err_pred, np.array([ex, ey, ez]), decimal=2)
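tcol_error is called on the unscaled inputs (signal + err) because the classical estimator assumes all three series share the same climatology. A minimal sketch of that classical estimator, assuming the usual covariance-based formulation (the pytesmo implementation may differ in detail):

import numpy as np

def tcol_error_sketch(x, y, z):
    # Classical triple collocation: with a common signal and mutually
    # independent zero-mean errors, the cross products isolate each
    # error variance, e.g. E[(x - y)(x - z)] = var(err_x).
    e_x = np.sqrt(np.mean((x - y) * (x - z)))
    e_y = np.sqrt(np.mean((y - x) * (y - z)))
    e_z = np.sqrt(np.mean((z - x) * (z - y)))
    return e_x, e_y, e_z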
Example #3
    def calc_metrics(self, data, gpi_info):
        """
        calculates the desired statistics

        Parameters
        ----------
        data : pandas.DataFrame
            with 3 columns, the first column is the reference dataset
            named 'ref'
            the second and third column are the datasets to compare against
            named 'k1 and k2'
        gpi_info : tuple
            Grid point info (i.e. gpi, lon, lat)
        """
        dataset = super(HSAF_Metrics, self).calc_metrics(data, gpi_info)

        for season in self.seasons:

            if season != 'ALL':
                subset = self.month_to_season[data.index.month] == season
            else:
                subset = np.ones(len(data), dtype=bool)

            # number of observations
            n_obs = subset.sum()
            if n_obs < self.min_obs:
                continue
            dataset['{:}_n_obs'.format(season)][0] = n_obs

            # get single dataset metrics
            # calculate SNR
            x = data[self.df_columns[0]].values[subset]
            y = data[self.df_columns[1]].values[subset]
            z = data[self.df_columns[2]].values[subset]

            snr, err, beta = metrics.tcol_snr(x, y, z)

            for i, name in enumerate(self.ds_names):
                dataset['{:}_{:}_snr'.format(name, season)][0] = snr[i]
                dataset['{:}_{:}_err_var'.format(name, season)][0] = err[i]
                dataset['{:}_{:}_beta'.format(name, season)][0] = beta[i]

            # calculate Pearson correlation
            pearson_R, pearson_p = df_metrics.pearsonr(data)
            pearson_R = pearson_R._asdict()
            pearson_p = pearson_p._asdict()

            # calculate Spearman correlation
            spea_rho, spea_p = df_metrics.spearmanr(data)
            spea_rho = spea_rho._asdict()
            spea_p = spea_p._asdict()

            # scale data to reference in order to calculate absolute metrics
            data_scaled = scale(data, method='min_max')

            # calculate bias
            bias_nT = df_metrics.bias(data_scaled)
            bias_dict = bias_nT._asdict()

            # calculate ubRMSD
            ubRMSD_nT = df_metrics.ubrmsd(data_scaled)
            ubRMSD_dict = ubRMSD_nT._asdict()

            for tds_name in self.tds_names:
                R = pearson_R[tds_name]
                p_R = pearson_p[tds_name]
                rho = spea_rho[tds_name]
                p_rho = spea_p[tds_name]
                bias = bias_dict[tds_name]
                ubRMSD = ubRMSD_dict[tds_name]

                split_tds_name = tds_name.split('_and_')
                tds_name_key = "{:}_{:}".format(
                    self.ds_names_lut[split_tds_name[0]],
                    self.ds_names_lut[split_tds_name[1]])

                dataset['{:}_{:}_R'.format(tds_name_key, season)][0] = R
                dataset['{:}_{:}_p_R'.format(tds_name_key, season)][0] = p_R
                dataset['{:}_{:}_rho'.format(tds_name_key, season)][0] = rho
                dataset['{:}_{:}_p_rho'.format(tds_name_key, season)][0] = \
                    p_rho
                dataset['{:}_{:}_bias'.format(tds_name_key, season)][0] = bias
                dataset['{:}_{:}_ubrmsd'.format(tds_name_key, season)][0] = \
                    ubRMSD

        return dataset
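The repeated ._asdict() calls work because the df_metrics helpers return one value per column pair, packed into a namedtuple whose field names join the two column names with '_and_' (which is why the code later does tds_name.split('_and_')). A small sketch of that pattern with a hypothetical result tuple:

from collections import namedtuple

# Hypothetical result for a DataFrame with columns 'ref', 'k1' and 'k2':
PairResult = namedtuple('PairResult', ['ref_and_k1', 'ref_and_k2', 'k1_and_k2'])
pearson_R = PairResult(0.9, 0.8, 0.7)

for tds_name, r in pearson_R._asdict().items():
    ds_a, ds_b = tds_name.split('_and_')
    print(ds_a, ds_b, r)  # e.g.: ref k1 0.9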
Example #4
    def calc_metrics(self, data, gpi_info):
        """
        calculates the desired statistics

        Parameters
        ----------
        data : pandas.DataFrame
            with >2 columns, the first column is the reference dataset
            named 'ref'
            other columns are the data sets to compare against named 'other_i'
        gpi_info : tuple
            of (gpi, lon, lat)

        Notes
        -----
        Kendall tau is calculation is optional at the moment
        because the scipy implementation is very slow which is problematic for
        global comparisons
        """

        dataset = copy.deepcopy(self.result_template)

        dataset['gpi'][0] = gpi_info[0]
        dataset['lon'][0] = gpi_info[1]
        dataset['lat'][0] = gpi_info[2]

        # number of observations
        subset = np.ones(len(data), dtype=bool)

        n_obs = subset.sum()
        if n_obs < 10:
            return dataset

        dataset['n_obs'][0] = n_obs

        # calculate Pearson correlation
        pearson_R, pearson_p = df_metrics.pearsonr(data)
        pearson_R = pearson_R._asdict()
        pearson_p = pearson_p._asdict()

        # calculate Spearman correlation
        spea_rho, spea_p = df_metrics.spearmanr(data)
        spea_rho = spea_rho._asdict()
        spea_p = spea_p._asdict()

        # calculate bias
        bias_nT = df_metrics.bias(data)
        bias_dict = bias_nT._asdict()

        # calculate RMSD
        rmsd = df_metrics.rmsd(data)
        rmsd_dict = rmsd._asdict()

        # calculate MSE
        mse, mse_corr, mse_bias, mse_var = df_metrics.mse(data)
        mse_dict = mse._asdict()
        mse_corr_dict = mse_corr._asdict()
        mse_bias_dict = mse_bias._asdict()
        mse_var_dict = mse_var._asdict()

        # calculate tau
        if self.calc_tau:
            tau, p_tau = df_metrics.kendalltau(data)
            tau_dict = tau._asdict()
            p_tau_dict = p_tau._asdict()
        else:
            tau = p_tau = p_tau_dict = tau_dict = None

        # calculate ubRMSD
        ubRMSD_nT = df_metrics.ubrmsd(data)
        ubRMSD_dict = ubRMSD_nT._asdict()

        # get single dataset metrics
        # calculate SNR
        x = data[self.df_columns[0]].values[subset]
        y = data[self.df_columns[1]].values[subset]
        z = data[self.df_columns[2]].values[subset]

        snr, err, beta = metrics.tcol_snr(x, y, z)

        for i, name in enumerate(self.ds_names):
            dataset['{:}_snr'.format(name)][0] = snr[i]
            dataset['{:}_err_var'.format(name)][0] = err[i]
            dataset['{:}_beta'.format(name)][0] = beta[i]

        for tds_name in self.tds_names:
            R = pearson_R[tds_name]
            p_R = pearson_p[tds_name]
            rho = spea_rho[tds_name]
            p_rho = spea_p[tds_name]
            bias = bias_dict[tds_name]
            mse = mse_dict[tds_name]
            mse_corr = mse_corr_dict[tds_name]
            mse_bias = mse_bias_dict[tds_name]
            mse_var = mse_var_dict[tds_name]
            rmsd = rmsd_dict[tds_name]
            ubRMSD = ubRMSD_dict[tds_name]
            if tau_dict and p_tau_dict:
                tau = tau_dict[tds_name]
                p_tau = p_tau_dict[tds_name]

            split_tds_name = tds_name.split('_and_')
            tds_name_key = "{:}_{:}".format(
                self.ds_names_lut[split_tds_name[0]],
                self.ds_names_lut[split_tds_name[1]])

            dataset['R_between_{:}'.format(tds_name_key)][0] = R
            dataset['p_R_between_{:}'.format(tds_name_key)][0] = p_R
            dataset['rho_between_{:}'.format(tds_name_key)][0] = rho
            dataset['p_rho_between_{:}'.format(tds_name_key)][0] = p_rho
            dataset['bias_between_{:}'.format(tds_name_key)][0] = bias
            dataset['mse_between_{:}'.format(tds_name_key)][0] = mse
            dataset['mse_corr_between_{:}'.format(tds_name_key)][0] = mse_corr
            dataset['mse_bias_between_{:}'.format(tds_name_key)][0] = mse_bias
            dataset['mse_var_between_{:}'.format(tds_name_key)][0] = mse_var
            dataset['rmsd_between_{:}'.format(tds_name_key)][0] = rmsd
            dataset['ubRMSD_between_{:}'.format(tds_name_key)][0] = ubRMSD

            if self.calc_tau:
                dataset['tau_between_{:}'.format(tds_name_key)][0] = tau
                dataset['p_tau_between_{:}'.format(tds_name_key)][0] = p_tau

        return dataset
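As the docstring notes, Kendall tau is optional because the scipy implementation is slow for global comparisons. For reference, the underlying per-pair computation is a single scipy call that returns the correlation and its p-value (sketch with synthetic data):

import numpy as np
from scipy import stats

rng = np.random.default_rng(0)
x = rng.normal(size=1000)
y = x + rng.normal(size=1000)

tau, p_tau = stats.kendalltau(x, y)  # correlation and two-sided p-value
print(tau, p_tau)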
Example #5
def run(cell=None, gpi=None):

    if (cell is None) and (gpi is None):
        print('No cell/gpi specified.')
        return

    smos = SMOS_io()
    ascat = HSAF_io(ext=None)
    mswep = MSWEP_io()

    if gpi is not None:
        cell = mswep.gpi2cell(gpi)

    # Median Q/R from TC run.
    Q_avg = 12.
    R_avg = 74.

    if platform.system() == 'Windows':
        # Note: os.path.join('D:', ...) yields a drive-relative path on
        # Windows; 'D:\\' anchors it at the drive root.
        result_file = os.path.join('D:\\', 'work', 'MadKF', 'CONUS',
                                   'result_%04i.csv' % cell)
    else:
        result_file = os.path.join('/', 'scratch', 'leuven', '320', 'vsc32046',
                                   'output', 'MadKF', 'CONUS',
                                   'result_%04i.csv' % cell)

    dt = ['2010-01-01', '2015-12-31']

    for data, info in mswep.iter_cell(cell, gpis=gpi):

        try:
            precip = mswep.read(info.name)
            sm_ascat = ascat.read(info.dgg_gpi)
            sm_smos = smos.read(info.smos_gpi) * 100.

            if (precip is None) or (sm_ascat is None) or (sm_smos is None):
                continue

            precip = calc_anomaly(precip[dt[0]:dt[1]],
                                  method='moving_average',
                                  longterm=False)
            sm_ascat = calc_anomaly(sm_ascat[dt[0]:dt[1]],
                                    method='moving_average',
                                    longterm=False)
            sm_smos = calc_anomaly(sm_smos[dt[0]:dt[1]],
                                   method='moving_average',
                                   longterm=False)

            api = API(gamma=info.gamma)

            # Regularize time steps
            df = pd.DataFrame({1: precip, 2: sm_ascat, 3: sm_smos},
                              index=pd.date_range(dt[0], dt[1]))

            n_inv_precip = np.isnan(df[1]).sum()
            n_inv_ascat = np.isnan(df[2]).sum()
            n_inv_smos = np.isnan(df[3]).sum()
            n_inv_asc_smo = (np.isnan(df[2]) & np.isnan(df[3])).sum()

            df.loc[np.isnan(df[1]), 1] = 0.

            # --- get OL ts  ---
            OL = np.full(len(precip), np.nan)
            model = API(gamma=info.gamma)
            for t, f in enumerate(df[1].values):
                x = model.step(f)
                OL[t] = x

            # collocate OL and satellite data sets.
            df2 = pd.DataFrame({1: OL, 2: sm_ascat, 3: sm_smos},
                               index=pd.date_range(dt[0], dt[1])).dropna()

            # ----- Calculate uncertainties -----
            # convert (static) forcing to model uncertainty
            P_avg = Q_avg / (1 - info.gamma**2)

            # calculate TCA based uncertainty and scaling coefficients
            snr, err, beta = tcol_snr(df2[1].values, df2[2].values,
                                      df2[3].values)
            P_TC = err[0]**2
            Q_TC = P_TC * (1 - info.gamma**2)
            R_TC = (err[1] / beta[1])**2
            H_TC = beta[1]

            # Calculate RMSD-based uncertainty (sign-flipped if negative)
            R_rmsd = abs(np.nanmean(
                (df2[1].values - H_TC * df2[2].values)**2) - P_avg)
            # -----------------------------------

            # ----- Run 2D KF using TCA-based uncertainties -----
            api_kf = API(gamma=info.gamma, Q=Q_TC)
            R_2D = np.array([(err[1] / beta[1])**2, (err[2] / beta[2])**2])
            H_2D = np.array([beta[1]**(-1), beta[2]**(-1)])
            x_2d, P, checkvar1_2d, checkvar2_2d, checkvar3_2d, K1_2d, K2_2d = \
                KF_2D(api_kf, df[1].values.copy(), df[2].values.copy(), df[3].values.copy(), R_2D, H=H_2D)

            # ----- Run KF using TCA-based uncertainties -----
            api_kf = API(gamma=info.gamma, Q=Q_TC)
            x_kf, P, R_innov_kf, checkvar_kf, K_kf = \
                KF(api_kf, df[1].values.copy(), df[2].values.copy(), R_TC, H=H_TC)

            # ----- Run EnKF using TCA-based uncertainties -----
            forc_pert = ['normal', 'additive', Q_TC]
            obs_pert = ['normal', 'additive', R_TC]
            x_tc, P, R_innov_tc, checkvar_tc, K_tc = \
                EnKF(api, df[1].values.copy(), df[2].values.copy(), forc_pert, obs_pert, H=H_TC, n_ens=50)

            # ----- Run EnKF using static uncertainties -----
            forc_pert = ['normal', 'additive', Q_avg]
            obs_pert = ['normal', 'additive', R_avg]
            x_avg, P, R_innov_avg, checkvar_avg, K_avg = \
                EnKF(api, df[1].values.copy(), df[2].values.copy(), forc_pert, obs_pert, H=H_TC, n_ens=50)

            # ----- Run EnKF using RMSD-based uncertainties (corrected for model uncertainty) -----
            t = timeit.default_timer()
            forc_pert = ['normal', 'additive', Q_avg]
            obs_pert = ['normal', 'additive', R_rmsd]
            x_rmsd, P, R_innov_rmsd, checkvar_rmsd, K_rmsd = \
                EnKF(api, df[1].values.copy(), df[2].values.copy(), forc_pert, obs_pert, H=H_TC, n_ens=50)
            t_enkf = timeit.default_timer() - t

            # ----- Run MadKF -----
            t = timeit.default_timer()
            x_madkf, P, R_madkf, Q_madkf, H_madkf, R_innov_madkf, checkvar_madkf, K_madkf = \
                MadKF(api, df[1].values.copy(), df[2].values.copy(), n_ens=100, n_iter=20)
            t_madkf = timeit.default_timer() - t


            result = pd.DataFrame(
                {
                    'lon': info.lon,
                    'lat': info.lat,
                    'col': info.col,
                    'row': info.row,
                    'P_tc': P_TC,
                    'Q_tc': Q_TC,
                    'R_tc': R_TC,
                    'H_tc': H_TC,
                    'K_tc': K_tc,
                    'R_innov_tc': R_innov_tc,
                    'checkvar_tc': checkvar_tc,
                    'K_kf': K_kf,
                    'R_innov_kf': R_innov_kf,
                    'checkvar_kf': checkvar_kf,
                    'K1_2d': K1_2d,
                    'K2_2d': K2_2d,
                    'checkvar1_2d': checkvar1_2d,
                    'checkvar2_2d': checkvar2_2d,
                    'checkvar3_2d': checkvar3_2d,
                    'P_avg': P_avg,
                    'Q_avg': Q_avg,
                    'R_avg': R_avg,
                    'K_avg': K_avg,
                    'R_innov_avg': R_innov_avg,
                    'checkvar_avg': checkvar_avg,
                    'R_rmsd': R_rmsd,
                    'K_rmsd': K_rmsd,
                    'R_innov_rmsd': R_innov_rmsd,
                    'checkvar_rmsd': checkvar_rmsd,
                    'P_madkf': Q_madkf / (1 - info.gamma**2),
                    'Q_madkf': Q_madkf,
                    'R_madkf': R_madkf,
                    'H_madkf': H_madkf,
                    'K_madkf': K_madkf,
                    'R_innov_madkf': R_innov_madkf,
                    'checkvar_madkf': checkvar_madkf,
                    't_enkf': t_enkf,
                    't_madkf': t_madkf,
                    'n_inv_precip': n_inv_precip,
                    'n_inv_ascat': n_inv_ascat,
                    'n_inv_smos': n_inv_smos,
                    'n_inv_asc_smo': n_inv_asc_smo
                },
                index=(info.name, ))

            if not os.path.isfile(result_file):
                result.to_csv(result_file, float_format='%0.4f')
            else:
                result.to_csv(result_file,
                              float_format='%0.4f',
                              mode='a',
                              header=False)
        except Exception:
            print('GPI failed.')
            continue

    ascat.close()
    mswep.close()
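The conversion P_avg = Q_avg / (1 - info.gamma**2) (and likewise P_madkf from Q_madkf) is the steady-state variance of an AR(1) model x_t = gamma * x_{t-1} + w_t with Var(w) = Q: iterating Var(x_t) = gamma**2 * Var(x_{t-1}) + Q to its fixed point gives P = Q / (1 - gamma**2). A quick numerical check of that identity:

import numpy as np

gamma, Q, n = 0.9, 12., 200000
rng = np.random.default_rng(0)

x = np.zeros(n)
w = rng.normal(0., np.sqrt(Q), n)
for t in range(1, n):
    x[t] = gamma * x[t - 1] + w[t]

print(np.var(x[1000:]))        # empirical steady-state variance
print(Q / (1. - gamma ** 2))   # analytical value, ~63.2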
Example #6
    def calc_metrics(self, data, gpi_info):
        """
        calculates the desired statistics

        Parameters
        ----------
        data : pandas.DataFrame
            with 3 columns, the first column is the reference dataset
            named 'ref'
            the second and third column are the datasets to compare against
            named 'k1 and k2'
        gpi_info : tuple
            Grid point info (i.e. gpi, lon, lat)
        """
        dataset = copy.deepcopy(self.result_template)

        dataset['gpi'][0] = gpi_info[0]
        dataset['lon'][0] = gpi_info[1]
        dataset['lat'][0] = gpi_info[2]

        for season in self.seasons:

            if season != 'ALL':
                subset = self.month_to_season[data.index.month] == season
            else:
                subset = np.ones(len(data), dtype=bool)

            # number of observations
            n_obs = subset.sum()
            if n_obs < 10:
                continue
            dataset['{:}_n_obs'.format(season)][0] = n_obs

            # get single dataset metrics
            # calculate SNR
            x = data[self.df_columns[0]].values[subset]
            y = data[self.df_columns[1]].values[subset]
            z = data[self.df_columns[2]].values[subset]

            snr, err, beta = metrics.tcol_snr(x, y, z)

            for i, name in enumerate(self.ds_names):
                dataset['{:}_{:}_snr'.format(name, season)][0] = snr[i]
                dataset['{:}_{:}_err_var'.format(name, season)][0] = err[i]
                dataset['{:}_{:}_beta'.format(name, season)][0] = beta[i]

            # calculate Pearson correlation
            pearson_R, pearson_p = df_metrics.pearsonr(data)
            pearson_R = pearson_R._asdict()
            pearson_p = pearson_p._asdict()

            # calculate Spearman correlation
            spea_rho, spea_p = df_metrics.spearmanr(data)
            spea_rho = spea_rho._asdict()
            spea_p = spea_p._asdict()

            # scale data to reference in order to calculate absolute metrics
            data_scaled = scale(data, method='min_max')

            # calculate bias
            bias_nT = df_metrics.bias(data_scaled)
            bias_dict = bias_nT._asdict()

            # calculate ubRMSD
            ubRMSD_nT = df_metrics.ubrmsd(data_scaled)
            ubRMSD_dict = ubRMSD_nT._asdict()

            for tds_name in self.tds_names:
                R = pearson_R[tds_name]
                p_R = pearson_p[tds_name]
                rho = spea_rho[tds_name]
                p_rho = spea_p[tds_name]
                bias = bias_dict[tds_name]
                ubRMSD = ubRMSD_dict[tds_name]

                split_tds_name = tds_name.split('_and_')
                tds_name_key = "{:}_{:}".format(self.ds_names_lut[
                    split_tds_name[0]],
                    self.ds_names_lut[
                    split_tds_name[1]])

                dataset['{:}_{:}_R'.format(tds_name_key, season)][0] = R
                dataset['{:}_{:}_p_R'.format(tds_name_key, season)][0] = p_R
                dataset['{:}_{:}_rho'.format(tds_name_key, season)][0] = rho
                dataset['{:}_{:}_p_rho'.format(tds_name_key, season)][0] = \
                    p_rho
                dataset['{:}_{:}_bias'.format(tds_name_key, season)][0] = bias
                dataset['{:}_{:}_ubrmsd'.format(tds_name_key, season)][0] = \
                    ubRMSD

        return dataset