Ejemplo n.º 1
0
    def calc_metrics(self, data, gpi_info):
        """
        Compute the pairwise comparison metrics for one grid point.

        Parameters
        ----------
        data : pd.DataFrame
            with >2 columns, the first column is the reference dataset
            named 'ref' other columns are the datasets to compare against
            named 'other_i'
        gpi_info : tuple
            of (gpi, lon, lat), passed on to the parent implementation

        Returns
        -------
        results
            The structure returned by the parent ``calc_metrics`` with the
            first element of every computed metric entry filled in. It is
            returned as received from the parent when ``data`` has fewer
            than ``self.min_obs`` rows.

        Notes
        -----
        Kendall's tau is only calculated when ``self.calc_tau`` is set,
        because the scipy implementation is very slow which is problematic
        for global comparisons.
        """

        results = super(IntercomparisonMetrics,
                        self).calc_metrics(data, gpi_info)

        # every row counts as an observation, so the mask is all True
        n_obs = np.ones(len(data), dtype=bool).sum()
        if n_obs < self.min_obs:
            return results

        results['n_obs'][0] = n_obs

        # Pearson correlation
        r_nt, p_r_nt = df_metrics.pearsonr(data)
        r_by_pair = r_nt._asdict()
        p_r_by_pair = p_r_nt._asdict()

        # Spearman correlation
        rho_nt, p_rho_nt = df_metrics.spearmanr(data)
        rho_by_pair = rho_nt._asdict()
        p_rho_by_pair = p_rho_nt._asdict()

        # bias
        bias_by_pair = df_metrics.bias(data)._asdict()

        # RMSD
        rmsd_by_pair = df_metrics.rmsd(data)._asdict()

        # MSE and its decomposition
        mse_nt, mse_corr_nt, mse_bias_nt, mse_var_nt = df_metrics.mse(data)
        mse_by_pair = mse_nt._asdict()
        mse_corr_by_pair = mse_corr_nt._asdict()
        mse_bias_by_pair = mse_bias_nt._asdict()
        mse_var_by_pair = mse_var_nt._asdict()

        # RSS
        rss_by_pair = df_metrics.RSS(data)._asdict()

        # Kendall's tau (optional)
        if self.calc_tau:
            tau_value, p_tau_value = df_metrics.kendalltau(data)
            tau_by_pair = tau_value._asdict()
            p_tau_by_pair = p_tau_value._asdict()
        else:
            tau_value = p_tau_value = None
            tau_by_pair = p_tau_by_pair = None

        # ubRMSD is always calculated on data scaled with 'mean_std';
        # all other metrics above use the data as passed in
        ubrmsd_by_pair = df_metrics.ubrmsd(
            scale(data, method='mean_std'))._asdict()

        # metric name used in the result key -> per-pair values
        pair_metrics = (
            ('R', r_by_pair),
            ('p_R', p_r_by_pair),
            ('rho', rho_by_pair),
            ('p_rho', p_rho_by_pair),
            ('BIAS', bias_by_pair),
            ('mse', mse_by_pair),
            ('mse_corr', mse_corr_by_pair),
            ('mse_bias', mse_bias_by_pair),
            ('mse_var', mse_var_by_pair),
            ('RMSD', rmsd_by_pair),
            ('urmsd', ubrmsd_by_pair),
            ('RSS', rss_by_pair),
        )

        for pair in self.tds_names:
            # collect all values of this pair before anything is stored
            pair_values = [(metric, by_pair[pair])
                           for metric, by_pair in pair_metrics]

            if tau_by_pair and p_tau_by_pair:
                tau_value = tau_by_pair[pair]
                p_tau_value = p_tau_by_pair[pair]

            # translate the two dataset names of the pair via the lookup
            # table and join them again with the same separator
            parts = pair.split(self.ds_names_split)
            pair_key = self.ds_names_split.join(
                [self.ds_names_lut[parts[0]], self.ds_names_lut[parts[1]]])

            for metric, value in pair_values:
                results[self.metric_ds_split.join([metric,
                                                   pair_key])][0] = value

            if self.calc_tau:
                results[self.metric_ds_split.join(['tau',
                                                   pair_key])][0] = tau_value
                results[self.metric_ds_split.join(['p_tau',
                                                   pair_key])][0] = p_tau_value

        return results
Ejemplo n.º 2
0
    def calc_metrics(self, data, gpi_info):
        """
        Compute pairwise and triple collocation statistics for one point.

        Parameters
        ----------
        data : pandas.DataFrame
            with >2 columns, the first column is the reference dataset
            named 'ref'
            other columns are the data sets to compare against named 'other_i'.
            The first three entries of ``self.df_columns`` are used as
            column names for the triple collocation.
        gpi_info : tuple
            of (gpi, lon, lat)

        Returns
        -------
        out
            Deep copy of ``self.result_template`` with the first element of
            every computed entry filled in. When ``data`` has fewer than
            10 rows only 'gpi', 'lon' and 'lat' are set.

        Notes
        -----
        Kendall's tau is only calculated when ``self.calc_tau`` is set,
        because the scipy implementation is very slow which is problematic
        for global comparisons.
        """

        out = copy.deepcopy(self.result_template)

        for position, field in enumerate(('gpi', 'lon', 'lat')):
            out[field][0] = gpi_info[position]

        # every row is used, hence an all-True mask
        use_rows = np.ones(len(data), dtype=bool)
        n_obs = use_rows.sum()
        if n_obs < 10:
            return out

        out['n_obs'][0] = n_obs

        # Pearson correlation
        r_nt, p_r_nt = df_metrics.pearsonr(data)
        r_by_pair, p_r_by_pair = r_nt._asdict(), p_r_nt._asdict()

        # Spearman correlation
        rho_nt, p_rho_nt = df_metrics.spearmanr(data)
        rho_by_pair, p_rho_by_pair = rho_nt._asdict(), p_rho_nt._asdict()

        # bias
        bias_by_pair = df_metrics.bias(data)._asdict()

        # RMSD
        rmsd_by_pair = df_metrics.rmsd(data)._asdict()

        # MSE and its decomposition
        mse_nt, mse_corr_nt, mse_bias_nt, mse_var_nt = df_metrics.mse(data)
        mse_by_pair = mse_nt._asdict()
        mse_corr_by_pair = mse_corr_nt._asdict()
        mse_bias_by_pair = mse_bias_nt._asdict()
        mse_var_by_pair = mse_var_nt._asdict()

        # Kendall's tau (optional)
        if self.calc_tau:
            tau_value, p_tau_value = df_metrics.kendalltau(data)
            tau_by_pair = tau_value._asdict()
            p_tau_by_pair = p_tau_value._asdict()
        else:
            tau_value = p_tau_value = None
            tau_by_pair = p_tau_by_pair = None

        # ubRMSD is calculated on the data as passed in (no scaling)
        ubrmsd_by_pair = df_metrics.ubrmsd(data)._asdict()

        # single dataset metrics from triple collocation
        x, y, z = (data[self.df_columns[i]].values[use_rows]
                   for i in range(3))
        snr, err, beta = metrics.tcol_snr(x, y, z)

        single_templates = (
            ('{:}_snr', snr),
            ('{:}_err_var', err),
            ('{:}_beta', beta),
        )
        for idx, ds_name in enumerate(self.ds_names):
            for template, values in single_templates:
                out[template.format(ds_name)][0] = values[idx]

        # result key template -> per-pair values
        pair_templates = (
            ('R_between_{:}', r_by_pair),
            ('p_R_between_{:}', p_r_by_pair),
            ('rho_between_{:}', rho_by_pair),
            ('p_rho_between_{:}', p_rho_by_pair),
            ('bias_between_{:}', bias_by_pair),
            ('mse_between_{:}', mse_by_pair),
            ('mse_corr_between_{:}', mse_corr_by_pair),
            ('mse_bias_between_{:}', mse_bias_by_pair),
            ('mse_var_between_{:}', mse_var_by_pair),
            ('rmsd_between_{:}', rmsd_by_pair),
            ('ubRMSD_between_{:}', ubrmsd_by_pair),
        )

        for pair in self.tds_names:
            # collect all values of this pair before anything is stored
            pair_values = [(template, by_pair[pair])
                           for template, by_pair in pair_templates]

            if tau_by_pair and p_tau_by_pair:
                tau_value = tau_by_pair[pair]
                p_tau_value = p_tau_by_pair[pair]

            # translate both dataset names of the pair via the lookup table
            names = pair.split('_and_')
            pair_key = "{:}_{:}".format(
                self.ds_names_lut[names[0]],
                self.ds_names_lut[names[1]])

            for template, value in pair_values:
                out[template.format(pair_key)][0] = value

            if self.calc_tau:
                out['tau_between_{:}'.format(pair_key)][0] = tau_value
                out['p_tau_between_{:}'.format(pair_key)][0] = p_tau_value

        return out
Ejemplo n.º 3
0
    def calc_metrics(self, data, gpi_info):
        """
        Calculate Triple Collocation metrics

        Parameters
        ----------
        data : pd.DataFrame
            with >2 columns. One column must be named ``self.ref_name``; its
            position is passed to the triple collocation as reference index.
        gpi_info : tuple
            of (gpi, lon, lat) or, when ``self.metadata_template`` is not
            None, of (gpi, lon, lat, metadata) where ``metadata`` provides a
            value for every key of ``self.metadata_template``.

        Returns
        -------
        dataset
            Deep copy of ``self.result_template`` with the first element of
            every computed entry filled in. When ``data`` has fewer than
            ``self.min_obs`` rows only 'gpi', 'lon', 'lat' and the metadata
            entries are set.

        Notes
        -----
        Kendall's tau is only calculated when ``self.calc_tau`` is set,
        because the scipy implementation is very slow which is problematic
        for global comparisons.
        """

        dataset = copy.deepcopy(self.result_template)

        dataset['gpi'][0] = gpi_info[0]
        dataset['lon'][0] = gpi_info[1]
        dataset['lat'][0] = gpi_info[2]

        if self.metadata_template is not None:
            # the template only defines which keys to copy, the values
            # themselves come from the 4th element of gpi_info
            for key in self.metadata_template:
                dataset[key][0] = gpi_info[3][key]

        # number of observations, every row of data is counted
        n_obs = len(data)
        if n_obs < self.min_obs:
            return dataset

        dataset['n_obs'][0] = n_obs

        # calculate Pearson correlation
        pearson_R, pearson_p = df_metrics.pearsonr(data)
        pearson_R, pearson_p = pearson_R._asdict(), pearson_p._asdict()
        # calculate Spearman correlation
        spea_rho, spea_p = df_metrics.spearmanr(data)
        spea_rho, spea_p = spea_rho._asdict(), spea_p._asdict()
        # calculate bias
        bias_dict = df_metrics.bias(data)._asdict()
        # calculate RMSD
        rmsd_dict = df_metrics.rmsd(data)._asdict()
        # calculate MSE
        mse, mse_corr, mse_bias, mse_var = df_metrics.mse(data)
        mse_dict = mse._asdict()
        mse_corr_dict = mse_corr._asdict()
        mse_bias_dict = mse_bias._asdict()
        mse_var_dict = mse_var._asdict()
        # calculate RSS
        rss_dict = df_metrics.RSS(data)._asdict()
        # calculate ubRMSD, always on data scaled with 'mean_std'
        # todo: we could use the TC derived scaling parameters here?
        data_scaled = scale(data, method='mean_std')
        ubRMSD_dict = df_metrics.ubrmsd(data_scaled)._asdict()

        # metric name used in the result key -> per-pair values
        pair_metrics = [
            ('R', pearson_R),
            ('p_R', pearson_p),
            ('rho', spea_rho),
            ('p_rho', spea_p),
            ('BIAS', bias_dict),
            ('mse', mse_dict),
            ('mse_corr', mse_corr_dict),
            ('mse_bias', mse_bias_dict),
            ('mse_var', mse_var_dict),
            ('RMSD', rmsd_dict),
            ('urmsd', ubRMSD_dict),
            ('RSS', rss_dict),
        ]

        # calculate tau (optional)
        if self.calc_tau:
            tau, p_tau = df_metrics.kendalltau(data)
            pair_metrics.append(('tau', tau._asdict()))
            pair_metrics.append(('p_tau', p_tau._asdict()))

        # calculate TC metrics
        ref_ind = np.where(np.array(data.columns) == self.ref_name)[0][0]
        snrs, err_stds, betas = df_metrics.tcol_snr(data, ref_ind=ref_ind)
        tc_metrics = (
            ('snr', self._tc_res_dict(snrs)),
            ('err_std', self._tc_res_dict(err_stds)),
            ('beta', self._tc_res_dict(betas)),
        )

        # store TC results
        for thds_name in self.thds_names:
            split_thds_name = thds_name.split(self.ds_names_split)
            thds_name_key = self.ds_names_split.join([
                self.ds_names_lut[split_thds_name[0]],
                self.ds_names_lut[split_thds_name[1]],
                self.ds_names_lut[split_thds_name[2]]
            ])

            for metr, res_dict in tc_metrics:
                for ds, ds_res in res_dict[thds_name].items():
                    m_ds = "{}_{}".format(metr, self.ds_names_lut[ds])
                    n = '{}{}{}'.format(m_ds, self.metric_ds_split,
                                        thds_name_key)
                    # results without a matching entry are skipped
                    if n in dataset:
                        dataset[n][0] = ds_res

        # Store basic metrics results
        for tds_name in self.tds_names:
            split_tds_name = tds_name.split(self.ds_names_split)
            tds_name_key = self.ds_names_split.join([
                self.ds_names_lut[split_tds_name[0]],
                self.ds_names_lut[split_tds_name[1]]
            ])

            for metric, values in pair_metrics:
                key = self.metric_ds_split.join([metric, tds_name_key])
                dataset[key][0] = values[tds_name]

        return dataset
Ejemplo n.º 4
0
    def calc_metrics(self, data, gpi_info):
        """
        Fill a copy of the result template with comparison statistics.

        Parameters
        ----------
        data : pandas.DataFrame
            with >2 columns, the first column is the reference dataset
            named 'ref'
            other columns are the data sets to compare against named 'other_i'.
            The columns named by the first three entries of
            ``self.df_columns`` are the inputs of the triple collocation.
        gpi_info : tuple
            of (gpi, lon, lat)

        Returns
        -------
        result
            Deep copy of ``self.result_template`` with the first element of
            every computed entry filled in. When ``data`` has fewer than
            10 rows only 'gpi', 'lon' and 'lat' are set.

        Notes
        -----
        Kendall's tau is only calculated when ``self.calc_tau`` is set,
        because the scipy implementation is very slow which is problematic
        for global comparisons.
        """

        result = copy.deepcopy(self.result_template)

        def store(template, label, value):
            # write value into the first slot of the entry named by template
            result[template.format(label)][0] = value

        result['gpi'][0] = gpi_info[0]
        result['lon'][0] = gpi_info[1]
        result['lat'][0] = gpi_info[2]

        # number of observations; all rows are selected by the mask
        row_mask = np.ones(len(data), dtype=bool)
        n_obs = row_mask.sum()
        if n_obs < 10:
            return result

        result['n_obs'][0] = n_obs

        # calculate Pearson correlation
        pearson_R, pearson_p = (
            nt._asdict() for nt in df_metrics.pearsonr(data))

        # calculate Spearman correlation
        spea_rho, spea_p = (
            nt._asdict() for nt in df_metrics.spearmanr(data))

        # calculate bias
        bias_dict = df_metrics.bias(data)._asdict()

        # calculate RMSD
        rmsd_dict = df_metrics.rmsd(data)._asdict()

        # calculate MSE
        mse_dict, mse_corr_dict, mse_bias_dict, mse_var_dict = (
            nt._asdict() for nt in df_metrics.mse(data))

        # calculate tau (optional)
        if self.calc_tau:
            tau, p_tau = df_metrics.kendalltau(data)
            tau_dict = tau._asdict()
            p_tau_dict = p_tau._asdict()
        else:
            tau = p_tau = None
            tau_dict = p_tau_dict = None

        # calculate ubRMSD on the data as passed in (no scaling)
        ubRMSD_dict = df_metrics.ubrmsd(data)._asdict()

        # get single dataset metrics
        # calculate SNR
        first, second, third = (self.df_columns[0], self.df_columns[1],
                                self.df_columns[2])
        x = data[first].values[row_mask]
        y = data[second].values[row_mask]
        z = data[third].values[row_mask]

        snr, err, beta = metrics.tcol_snr(x, y, z)

        for i, name in enumerate(self.ds_names):
            store('{:}_snr', name, snr[i])
            store('{:}_err_var', name, err[i])
            store('{:}_beta', name, beta[i])

        for tds_name in self.tds_names:
            # look up every value of this pair before storing any of them
            pair_values = (
                ('R_between_{:}', pearson_R[tds_name]),
                ('p_R_between_{:}', pearson_p[tds_name]),
                ('rho_between_{:}', spea_rho[tds_name]),
                ('p_rho_between_{:}', spea_p[tds_name]),
                ('bias_between_{:}', bias_dict[tds_name]),
                ('mse_between_{:}', mse_dict[tds_name]),
                ('mse_corr_between_{:}', mse_corr_dict[tds_name]),
                ('mse_bias_between_{:}', mse_bias_dict[tds_name]),
                ('mse_var_between_{:}', mse_var_dict[tds_name]),
                ('rmsd_between_{:}', rmsd_dict[tds_name]),
                ('ubRMSD_between_{:}', ubRMSD_dict[tds_name]),
            )
            if tau_dict and p_tau_dict:
                tau = tau_dict[tds_name]
                p_tau = p_tau_dict[tds_name]

            # map both names of the pair through the lookup table
            split_tds_name = tds_name.split('_and_')
            lut = self.ds_names_lut
            tds_name_key = "{:}_{:}".format(lut[split_tds_name[0]],
                                            lut[split_tds_name[1]])

            for template, value in pair_values:
                store(template, tds_name_key, value)

            if self.calc_tau:
                store('tau_between_{:}', tds_name_key, tau)
                store('p_tau_between_{:}', tds_name_key, p_tau)

        return result