Example #1
    def calc_metrics(self, data, gpi_info):
        """
        Calculate the desired statistics.

        Parameters
        ----------
        data : pd.DataFrame
            with >2 columns; the first column is the reference dataset,
            named 'ref', the other columns are the datasets to compare
            against, named 'other_i'
        gpi_info : tuple
            of (gpi, lon, lat)

        Notes
        -----
        Kendall tau calculation is optional at the moment
        because the scipy implementation is very slow, which is problematic
        for global comparisons
        """

        dataset = super(IntercomparisonMetrics,
                        self).calc_metrics(data, gpi_info)

        subset = np.ones(len(data), dtype=bool)

        n_obs = subset.sum()
        if n_obs < self.min_obs:
            return dataset

        dataset['n_obs'][0] = n_obs

        # calculate Pearson correlation
        pearson_R, pearson_p = df_metrics.pearsonr(data)
        pearson_R, pearson_p = pearson_R._asdict(), pearson_p._asdict()

        # calculate Spearman correlation
        spea_rho, spea_p = df_metrics.spearmanr(data)
        spea_rho, spea_p = spea_rho._asdict(), spea_p._asdict()

        # calculate bias
        bias_nT = df_metrics.bias(data)
        bias_dict = bias_nT._asdict()

        # calculate RMSD
        rmsd = df_metrics.rmsd(data)
        rmsd_dict = rmsd._asdict()

        # calculate MSE
        mse, mse_corr, mse_bias, mse_var = df_metrics.mse(data)
        mse_dict, mse_corr_dict, mse_bias_dict, mse_var_dict = \
            mse._asdict(), mse_corr._asdict(), mse_bias._asdict(), mse_var._asdict()

        # calculate RSS
        rss = df_metrics.RSS(data)
        rss_dict = rss._asdict()

        # calculate tau
        if self.calc_tau:
            tau, p_tau = df_metrics.kendalltau(data)
            tau_dict, p_tau_dict = tau._asdict(), p_tau._asdict()
        else:
            tau = p_tau = p_tau_dict = tau_dict = None

        # calculate ubRMSD
        # the data is always scaled with mean/std before computing ubRMSD;
        # no further scaling is performed
        data_scaled = scale(data, method='mean_std')
        ubRMSD_nT = df_metrics.ubrmsd(data_scaled)
        ubRMSD_dict = ubRMSD_nT._asdict()

        for tds_name in self.tds_names:
            R, p_R = pearson_R[tds_name], pearson_p[tds_name]
            rho, p_rho = spea_rho[tds_name], spea_p[tds_name]
            bias = bias_dict[tds_name]
            mse = mse_dict[tds_name]
            mse_corr = mse_corr_dict[tds_name]
            mse_bias = mse_bias_dict[tds_name]
            mse_var = mse_var_dict[tds_name]
            rmsd = rmsd_dict[tds_name]
            ubRMSD = ubRMSD_dict[tds_name]
            rss = rss_dict[tds_name]

            if tau_dict and p_tau_dict:
                tau = tau_dict[tds_name]
                p_tau = p_tau_dict[tds_name]

            split_tds_name = tds_name.split(self.ds_names_split)
            tds_name_key = self.ds_names_split.join([
                self.ds_names_lut[split_tds_name[0]],
                self.ds_names_lut[split_tds_name[1]]
            ])

            dataset[self.metric_ds_split.join(['R', tds_name_key])][0] = R
            dataset[self.metric_ds_split.join(['p_R', tds_name_key])][0] = p_R
            dataset[self.metric_ds_split.join(['rho', tds_name_key])][0] = rho
            dataset[self.metric_ds_split.join(['p_rho',
                                               tds_name_key])][0] = p_rho
            dataset[self.metric_ds_split.join(['BIAS',
                                               tds_name_key])][0] = bias
            dataset[self.metric_ds_split.join(['mse', tds_name_key])][0] = mse
            dataset[self.metric_ds_split.join(['mse_corr',
                                               tds_name_key])][0] = mse_corr
            dataset[self.metric_ds_split.join(['mse_bias',
                                               tds_name_key])][0] = mse_bias
            dataset[self.metric_ds_split.join(['mse_var',
                                               tds_name_key])][0] = mse_var
            dataset[self.metric_ds_split.join(['RMSD',
                                               tds_name_key])][0] = rmsd
            dataset[self.metric_ds_split.join(['urmsd',
                                               tds_name_key])][0] = ubRMSD
            dataset[self.metric_ds_split.join(['RSS', tds_name_key])][0] = rss

            if self.calc_tau:
                dataset[self.metric_ds_split.join(['tau',
                                                   tds_name_key])][0] = tau
                dataset[self.metric_ds_split.join(['p_tau',
                                                   tds_name_key])][0] = p_tau

        return dataset
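
The examples above rely on df_metrics returning one named tuple per metric, with one field per column pair. A minimal sketch of that behaviour (assuming pytesmo's df_metrics module and a hypothetical three-column frame) might look like:

import numpy as np
import pandas as pd
from pytesmo import df_metrics

# hypothetical frame: reference column 'ref' plus two comparison datasets
df = pd.DataFrame({'ref': np.random.rand(100),
                   'other_1': np.random.rand(100),
                   'other_2': np.random.rand(100)})

r, p = df_metrics.pearsonr(df)
# each result is a named tuple with one field per column pair;
# _asdict() turns it into a dict keyed by pair names such as 'ref_and_other_1'
print(r._asdict())
print(p._asdict())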
Example #2
    def calc_metrics(self, data, gpi_info):
        """
        Calculate the desired statistics.

        Parameters
        ----------
        data : pandas.DataFrame
            with >2 columns; the first column is the reference dataset,
            named 'ref', the other columns are the data sets to compare
            against, named 'other_i'
        gpi_info : tuple
            of (gpi, lon, lat)

        Notes
        -----
        Kendall tau calculation is optional at the moment
        because the scipy implementation is very slow, which is problematic
        for global comparisons
        """

        dataset = copy.deepcopy(self.result_template)

        dataset['gpi'][0] = gpi_info[0]
        dataset['lon'][0] = gpi_info[1]
        dataset['lat'][0] = gpi_info[2]

        # number of observations
        subset = np.ones(len(data), dtype=bool)

        n_obs = subset.sum()
        if n_obs < 10:
            return dataset

        dataset['n_obs'][0] = n_obs

        # calculate Pearson correlation
        pearson_R, pearson_p = df_metrics.pearsonr(data)
        pearson_R = pearson_R._asdict()
        pearson_p = pearson_p._asdict()

        # calculate Spearman correlation
        spea_rho, spea_p = df_metrics.spearmanr(data)
        spea_rho = spea_rho._asdict()
        spea_p = spea_p._asdict()

        # calculate bias
        bias_nT = df_metrics.bias(data)
        bias_dict = bias_nT._asdict()

        # calculate RMSD
        rmsd = df_metrics.rmsd(data)
        rmsd_dict = rmsd._asdict()

        # calculate MSE
        mse, mse_corr, mse_bias, mse_var = df_metrics.mse(data)
        mse_dict = mse._asdict()
        mse_corr_dict = mse_corr._asdict()
        mse_bias_dict = mse_bias._asdict()
        mse_var_dict = mse_var._asdict()

        # calculate tau
        if self.calc_tau:
            tau, p_tau = df_metrics.kendalltau(data)
            tau_dict = tau._asdict()
            p_tau_dict = p_tau._asdict()
        else:
            tau = p_tau = p_tau_dict = tau_dict = None

        # calculate ubRMSD on the unscaled data (no mean/std scaling here)
        ubRMSD_nT = df_metrics.ubrmsd(data)
        ubRMSD_dict = ubRMSD_nT._asdict()

        # get single dataset metrics
        # calculate SNR
        x = data[self.df_columns[0]].values[subset]
        y = data[self.df_columns[1]].values[subset]
        z = data[self.df_columns[2]].values[subset]

        snr, err, beta = metrics.tcol_snr(x, y, z)

        for i, name in enumerate(self.ds_names):
            dataset['{:}_snr'.format(name)][0] = snr[i]
            dataset['{:}_err_var'.format(name)][0] = err[i]
            dataset['{:}_beta'.format(name)][0] = beta[i]

        for tds_name in self.tds_names:
            R = pearson_R[tds_name]
            p_R = pearson_p[tds_name]
            rho = spea_rho[tds_name]
            p_rho = spea_p[tds_name]
            bias = bias_dict[tds_name]
            mse = mse_dict[tds_name]
            mse_corr = mse_corr_dict[tds_name]
            mse_bias = mse_bias_dict[tds_name]
            mse_var = mse_var_dict[tds_name]
            rmsd = rmsd_dict[tds_name]
            ubRMSD = ubRMSD_dict[tds_name]
            if tau_dict and p_tau_dict:
                tau = tau_dict[tds_name]
                p_tau = p_tau_dict[tds_name]

            split_tds_name = tds_name.split('_and_')
            tds_name_key = "{:}_{:}".format(
                self.ds_names_lut[split_tds_name[0]],
                self.ds_names_lut[split_tds_name[1]])

            dataset['R_between_{:}'.format(tds_name_key)][0] = R
            dataset['p_R_between_{:}'.format(tds_name_key)][0] = p_R
            dataset['rho_between_{:}'.format(tds_name_key)][0] = rho
            dataset['p_rho_between_{:}'.format(tds_name_key)][0] = p_rho
            dataset['bias_between_{:}'.format(tds_name_key)][0] = bias
            dataset['mse_between_{:}'.format(tds_name_key)][0] = mse
            dataset['mse_corr_between_{:}'.format(tds_name_key)][0] = mse_corr
            dataset['mse_bias_between_{:}'.format(tds_name_key)][0] = mse_bias
            dataset['mse_var_between_{:}'.format(tds_name_key)][0] = mse_var
            dataset['rmsd_between_{:}'.format(tds_name_key)][0] = rmsd
            dataset['ubRMSD_between_{:}'.format(tds_name_key)][0] = ubRMSD

            if self.calc_tau:
                dataset['tau_between_{:}'.format(tds_name_key)][0] = tau
                dataset['p_tau_between_{:}'.format(tds_name_key)][0] = p_tau

        return dataset
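
The single-dataset part of this example feeds three collocated series into metrics.tcol_snr and reads back one SNR, error estimate and scaling factor per input. A small synthetic sketch of that call (toy data, not from the example):

import numpy as np
from pytesmo import metrics

# three noisy, collocated views of the same synthetic signal
truth = np.random.rand(1000)
x = truth + 0.10 * np.random.randn(1000)
y = 0.8 * truth + 0.15 * np.random.randn(1000)
z = 1.2 * truth + 0.20 * np.random.randn(1000)

snr, err, beta = metrics.tcol_snr(x, y, z)
for i, name in enumerate(('x', 'y', 'z')):
    # one SNR, error estimate and scaling factor per dataset
    print(name, snr[i], err[i], beta[i])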
Example #3
    def calc_metrics(self, data, gpi_info):
        """
        Calculate Triple Collocation metrics

        Parameters
        ----------
        data : pd.DataFrame
            with >2 columns; the first column is the reference dataset, named 'ref',
            the other columns are the data sets to compare against, named 'other_i'
        gpi_info : tuple
            of (gpi, lon, lat)

        Notes
        -----
        Kendall tau calculation is optional at the moment
        because the scipy implementation is very slow, which is problematic
        for global comparisons
        """

        dataset = copy.deepcopy(self.result_template)

        dataset['gpi'][0] = gpi_info[0]
        dataset['lon'][0] = gpi_info[1]
        dataset['lat'][0] = gpi_info[2]

        if self.metadata_template is not None:
            for key, value in self.metadata_template.items():
                dataset[key][0] = gpi_info[3][key]

        # number of observations
        subset = np.ones(len(data), dtype=bool)

        n_obs = subset.sum()
        if n_obs < self.min_obs:
            return dataset

        dataset['n_obs'][0] = n_obs

        # calculate Pearson correlation
        pearson_R, pearson_p = df_metrics.pearsonr(data)
        pearson_R, pearson_p = pearson_R._asdict(), pearson_p._asdict()
        # calculate Spearman correlation
        spea_rho, spea_p = df_metrics.spearmanr(data)
        spea_rho, spea_p = spea_rho._asdict(), spea_p._asdict()
        # calculate bias
        bias_nT = df_metrics.bias(data)
        bias_dict = bias_nT._asdict()
        # calculate RMSD
        rmsd = df_metrics.rmsd(data)
        rmsd_dict = rmsd._asdict()
        # calculate MSE
        mse, mse_corr, mse_bias, mse_var = df_metrics.mse(data)
        mse_dict = mse._asdict()
        mse_corr_dict = mse_corr._asdict()
        mse_bias_dict = mse_bias._asdict()
        mse_var_dict = mse_var._asdict()
        # calculate RSS
        rss = df_metrics.RSS(data)
        rss_dict = rss._asdict()
        # calculate ubRMSD
        # todo: we could use the TC derived scaling parameters here?
        data_scaled = scale(data, method='mean_std')
        ubRMSD_nT = df_metrics.ubrmsd(data_scaled)
        ubRMSD_dict = ubRMSD_nT._asdict()
        # calculate tau
        if self.calc_tau:
            tau, p_tau = df_metrics.kendalltau(data)
            tau_dict, p_tau_dict = tau._asdict(), p_tau._asdict()
        else:
            tau = p_tau = p_tau_dict = tau_dict = None
        # calculate TC metrics
        ref_ind = np.where(np.array(data.columns) == self.ref_name)[0][0]
        snrs, err_stds, betas = df_metrics.tcol_snr(data, ref_ind=ref_ind)
        snr_dict = self._tc_res_dict(snrs)
        err_std_dict = self._tc_res_dict(err_stds)
        beta_dict = self._tc_res_dict(betas)

        # store TC results
        for thds_name in self.thds_names:
            snr = snr_dict[thds_name]
            err_std = err_std_dict[thds_name]
            beta = beta_dict[thds_name]

            split_thds_name = thds_name.split(self.ds_names_split)
            thds_name_key = self.ds_names_split.join([
                self.ds_names_lut[split_thds_name[0]],
                self.ds_names_lut[split_thds_name[1]],
                self.ds_names_lut[split_thds_name[2]]
            ])

            for metr, res in dict(snr=snr, err_std=err_std, beta=beta).items():
                for ds, ds_res in res.items():
                    m_ds = "{}_{}".format(metr, self.ds_names_lut[ds])
                    n = '{}{}{}'.format(m_ds, self.metric_ds_split,
                                        thds_name_key)
                    if n in dataset.keys():
                        dataset[n][0] = ds_res

        # Store basic metrics results
        for tds_name in self.tds_names:
            R, p_R = pearson_R[tds_name], pearson_p[tds_name]
            rho, p_rho = spea_rho[tds_name], spea_p[tds_name]
            bias = bias_dict[tds_name]
            mse = mse_dict[tds_name]
            mse_corr = mse_corr_dict[tds_name]
            mse_bias = mse_bias_dict[tds_name]
            mse_var = mse_var_dict[tds_name]
            rmsd = rmsd_dict[tds_name]
            ubRMSD = ubRMSD_dict[tds_name]
            rss = rss_dict[tds_name]

            if tau_dict and p_tau_dict:
                tau = tau_dict[tds_name]
                p_tau = p_tau_dict[tds_name]

            split_tds_name = tds_name.split(self.ds_names_split)
            tds_name_key = self.ds_names_split.join([
                self.ds_names_lut[split_tds_name[0]],
                self.ds_names_lut[split_tds_name[1]]
            ])

            dataset[self.metric_ds_split.join(['R', tds_name_key])][0] = R
            dataset[self.metric_ds_split.join(['p_R', tds_name_key])][0] = p_R
            dataset[self.metric_ds_split.join(['rho', tds_name_key])][0] = rho
            dataset[self.metric_ds_split.join(['p_rho',
                                               tds_name_key])][0] = p_rho
            dataset[self.metric_ds_split.join(['BIAS',
                                               tds_name_key])][0] = bias
            dataset[self.metric_ds_split.join(['mse', tds_name_key])][0] = mse
            dataset[self.metric_ds_split.join(['mse_corr',
                                               tds_name_key])][0] = mse_corr
            dataset[self.metric_ds_split.join(['mse_bias',
                                               tds_name_key])][0] = mse_bias
            dataset[self.metric_ds_split.join(['mse_var',
                                               tds_name_key])][0] = mse_var
            dataset[self.metric_ds_split.join(['RMSD',
                                               tds_name_key])][0] = rmsd
            dataset[self.metric_ds_split.join(['urmsd',
                                               tds_name_key])][0] = ubRMSD
            dataset[self.metric_ds_split.join(['RSS', tds_name_key])][0] = rss

            if self.calc_tau:
                dataset[self.metric_ds_split.join(['tau',
                                                   tds_name_key])][0] = tau
                dataset[self.metric_ds_split.join(['p_tau',
                                                   tds_name_key])][0] = p_tau

        return dataset
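
The result keys above are assembled from configurable separators and a dataset-name lookup table. A tiny stand-alone illustration (hypothetical separator and LUT values, mirroring the '_and_' / '_between_' pattern used in the other examples):

ds_names_split = '_and_'
metric_ds_split = '_between_'
ds_names_lut = {'ref': 'ASCAT', 'k1': 'ISMN'}  # hypothetical lookup table

tds_name = 'ref_and_k1'
split_tds_name = tds_name.split(ds_names_split)
tds_name_key = ds_names_split.join(ds_names_lut[p] for p in split_tds_name)
key = metric_ds_split.join(['R', tds_name_key])
print(key)  # R_between_ASCAT_and_ISMN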
Example #4
        plt.show()
        
        # calculate correlation coefficients, RMSD, bias, Nash Sutcliffe
        x, y = scaled_data[label_ascat].values, scaled_data[label_insitu].values
        
        print("ISMN time series:", ISMN_time_series)
        print("compared to")
        print(ascat_time_series)
        print("Results:")
        
        # df_metrics takes a DataFrame as input and automatically
        # calculates the metric on all combinations of columns
        # and returns a named tuple for easy printing
        print(df_metrics.pearsonr(scaled_data))
        print("Spearman's (rho,p_value)", metrics.spearmanr(x, y))
        print("Kendalls's (tau,p_value)", metrics.kendalltau(x, y))
        print(df_metrics.kendalltau(scaled_data))
        print(df_metrics.rmsd(scaled_data))
        print("Bias", metrics.bias(x, y))
        print("Nash Sutcliffe", metrics.nash_sutcliffe(x, y))
        
        
    i += 1

    # only show the first 2 stations, otherwise this program would run a long time
    # and produce a lot of plots
    if i >= 2:
        break


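The snippet above mixes two APIs: pytesmo.metrics works on a single pair of arrays, while df_metrics applies the same metric to every column pair of a DataFrame. A short side-by-side sketch of that distinction (synthetic data; column names assumed):

import numpy as np
import pandas as pd
from pytesmo import metrics, df_metrics

x = np.random.rand(200)
y = np.random.rand(200)

# array API: one metric per call on one pair of series
print("Bias", metrics.bias(x, y))
print("Nash Sutcliffe", metrics.nash_sutcliffe(x, y))

# DataFrame API: the same metric across all column pairs at once
df = pd.DataFrame({'ascat': x, 'insitu': y})
print(df_metrics.bias(df))  # named tuple with a field 'ascat_and_insitu'
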
Example #5
def optimise(params,
             timespan=('2009-01', '2009-12'),
             gpi=None,
             rescaling=None):
    """
    This function optimises the parameters vegetation water content
    'm_veg', soil moisture 'm_soil' and, if specified, a third optional
    parameter. The third optional parameter can either be sand 'sand',
    clay 'clay', fractional root mean square height 'f_rms',
    stem volume 's_vol' or temperature 'temp'.

    Parameters
    ----------
    params : dict
        Model parameters. At least
        four of the following parameters need to be specified if an optional
        parameter has been selected, otherwise all of them need to be
        specified: 'sand', 'clay', 'f_rms', 'temp', 's_vol'
    gpi : int, optional
        Grid point index. If specified, it will read data from datapool.

    Returns
    -------
    df : pandas.DataFrame
        Optimised soil moisture, vegetation water content and, if specified,
        optional optimised parameter.
    """

    if gpi is None:
        ts_resam = pd.read_csv(os.path.join("data", "2011528_2009.csv"),
                               index_col=0,
                               parse_dates=True)[timespan[0]:timespan[1]]
        gpi = 2011528
    else:
        ts_resam = read_resam(gpi)[timespan[0]:timespan[1]]

    m_veg_x0 = params.pop('m_veg_x0')
    m_soil_x0 = params.pop('m_soil_x0')
    columns = ['m_veg', 'm_soil']

    x0 = np.array([m_veg_x0, m_soil_x0])

    df = pd.DataFrame(index=ts_resam.index, columns=columns)
    df = df.fillna(np.nan)
    # optimise m_soil and m_veg
    for index, row in ts_resam.iterrows():

        ascat_inc = np.array(row[['incf', 'incm', 'inca']].tolist())
        ascat_sig = \
            db2lin(np.array(row[['sigf', 'sigm', 'siga']].tolist()))

        args = (ascat_inc, ascat_sig, params, '')
        res = minimize(sig_sqr_diff, x0, args=args, method='Nelder-Mead')

        if res['success']:
            df.loc[index, 'm_veg'] = res['x'][0]
            df.loc[index, 'm_soil'] = res['x'][1]

    str_static_p = ', '.join("%s: %r" % t for t in locals().items())

    str_static_p += ",\nm_veg_x0 = {:.2f}, m_soil_x0 = {:.2f}".format(
        m_veg_x0, m_soil_x0)

    ismn_file = os.path.join(
        'data',
        'ARM_ARM_Larned_sm_0.050000_0.050000_Water-Matric-Potential-Sensor-229L-W_20090101_20140527.stm'
    )
    ismn_data = ismn_readers.read_data(ismn_file)
    insitu = pd.DataFrame(ismn_data.data['soil moisture']).rename(
        columns={'soil moisture': 'insitu'})
    gldas = pd.read_csv(os.path.join('data', 'GLDAS_737602.csv'),
                        parse_dates=True,
                        index_col=0)
    gldas.rename(columns={'086_L1': 'gldas'}, inplace=True)
    gldas = pd.DataFrame(gldas['gldas'])
    ascat = pd.DataFrame(df['m_soil']).rename(columns={'m_soil': 'ascat'})

    matched = temp_match.matching(ascat, insitu, gldas)

    if rescaling is not None:
        scaled = scaling.scale(matched, rescaling, reference_index=1)
    else:
        scaled = matched

    metrics = OrderedDict()
    metrics['bias'] = df_metrics.bias(scaled)
    metrics['pearson'] = df_metrics.pearsonr(scaled)
    metrics['kendall'] = df_metrics.kendalltau(scaled)
    metrics['ubrmsd'] = df_metrics.ubrmsd(scaled)
    metrics['var_ratio'] = df_var_ratio(scaled)
    tcol_error = df_metrics.tcol_error(scaled)._asdict()

    ts_title = "Soil moisture. "
    if rescaling is not None:
        ts_title = ' '.join([ts_title, 'Rescaling: %s.' % rescaling])
    else:
        ts_title = ' '.join([ts_title, 'No rescaling.'])

    axes = scaled.plot(subplots=True, title=ts_title, figsize=(18, 8))

    # these are matplotlib.patch.Patch properties
    props = dict(facecolor='white', alpha=0)

    columns = ('ascat-insitu', 'ascat-gldas', 'insitu-gldas')
    row_labels = [
        'bias', 'pearson R', 'kendall tau', 'unbiased RMSD', 'variance ratio'
    ]
    cell_text = []
    for metric in metrics:
        metric_values = metrics[metric]
        if isinstance(metric_values, tuple):
            metric_values = metric_values[0]
        metric_values = metric_values._asdict()
        cell_text.append([
            "%.2f" % metric_values['ascat_and_insitu'],
            "%.2f" % metric_values['ascat_and_gldas'],
            "%.2f" % metric_values['insitu_and_gldas']
        ])

    table = plt.table(cellText=cell_text,
                      colLabels=columns,
                      colWidths=[0.1, 0.1, 0.1],
                      rowLabels=row_labels,
                      loc='bottom',
                      bbox=(0.2, -1.25, 0.5, 0.8))

    tcol_table = plt.table(cellText=[[
        "%.2f" % tcol_error['ascat'],
        "%.2f" % tcol_error['gldas'],
        "%.2f" % tcol_error['insitu']
    ]],
                           colLabels=('ascat', 'gldas', 'insitu'),
                           colWidths=[0.1, 0.1, 0.1],
                           rowLabels=['Triple collocation error'],
                           loc='bottom',
                           bbox=(0.2, -1.65, 0.5, 0.3))

    plt.subplots_adjust(left=0.08, bottom=0.35)

    axes = scatter_matrix(scaled)
    axes.flat[0].figure.suptitle(ts_title)

    # only draw 1:1 line if scaling was applied
    if rescaling is not None:
        for j, ax in enumerate(axes.flatten()):

            if np.remainder(j + 1, 3 + 1) != 1:
                min_x, max_x = ax.get_xlim()
                min_y, max_y = ax.get_ylim()
                # find minimum lower left coordinate and maximum upper right
                min_ll = min([min_x, min_y])
                max_ur = max([max_x, max_y])
                ax.plot([min_ll, max_ur], [min_ll, max_ur], '--', c='0.6')

    return df
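
The per-timestamp loop above fits (m_veg, m_soil) by Nelder-Mead minimisation of sig_sqr_diff. A self-contained toy of the same pattern (the forward model below is a made-up stand-in, not the real sig_sqr_diff):

import numpy as np
from scipy.optimize import minimize

def sig_sqr_diff(x, inc, sig, params, flag):
    # toy stand-in: squared misfit between a fake backscatter model
    # and the observed values; the real cost function differs
    m_veg, m_soil = x
    model = m_veg * np.cos(inc) + m_soil
    return np.sum((model - sig) ** 2)

inc = np.deg2rad(np.array([25.0, 35.0, 45.0]))
sig = np.array([0.12, 0.10, 0.08])
res = minimize(sig_sqr_diff, x0=np.array([0.3, 0.2]),
               args=(inc, sig, {}, ''), method='Nelder-Mead')
print(res.x, res.success)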
Example #6
        plt.show()
        
        # calculate correlation coefficients, RMSD, bias, Nash Sutcliffe
        x, y = scaled_data[label_ascat].values, scaled_data[label_insitu].values
        
        print "ISMN time series:", ISMN_time_series
        print "compared to"
        print ascat_time_series
        print "Results:"
        
        # df_metrics takes a DataFrame as input and automatically
        # calculates the metric on all combinations of columns
        # and returns a named tuple for easy printing
        print(df_metrics.pearsonr(scaled_data))
        print("Spearman's (rho,p_value)", metrics.spearmanr(x, y))
        print("Kendall's (tau,p_value)", metrics.kendalltau(x, y))
        print(df_metrics.kendalltau(scaled_data))
        print(df_metrics.rmsd(scaled_data))
        print("Bias", metrics.bias(x, y))
        print("Nash Sutcliffe", metrics.nash_sutcliffe(x, y))
        
        
    i += 1
    
    # only show the first 2 stations, otherwise this program would run a long time
    # and produce a lot of plots
    if i >= 2:
        break    


def optimise(params,
             timespan=('2009-01', '2009-12'), gpi=None, rescaling=None):
    """
    This function optimises the parameters vegetation water content
    'm_veg', soil moisture 'm_soil' and, if specified, a third optional
    parameter. The third optional parameter can either be sand 'sand',
    clay 'clay', fractional root mean square height 'f_rms',
    stem volume 's_vol' or temperature 'temp'.

    Parameters
    ----------
    params : dict
        Model parameters. At least
        four of the following parameters need to be specified if an optional
        parameter has been selected, otherwise all of them need to be
        specified: 'sand', 'clay', 'f_rms', 'temp', 's_vol'
    gpi : int, optional
        Grid point index. If specified, it will read data from datapool.

    Returns
    -------
    df : pandas.DataFrame
        Optimised soil moisture, vegetation water content and, if specified,
        optional optimised parameter.
    """

    if gpi is None:
        ts_resam = pd.read_csv(os.path.join("data", "2011528_2009.csv"), index_col=0,
                               parse_dates=True)[timespan[0]:timespan[1]]
        gpi = 2011528
    else:
        ts_resam = read_resam(gpi)[timespan[0]:timespan[1]]

    m_veg_x0 = params.pop('m_veg_x0')
    m_soil_x0 = params.pop('m_soil_x0')
    columns = ['m_veg', 'm_soil']

    x0 = np.array([m_veg_x0, m_soil_x0])

    df = pd.DataFrame(index=ts_resam.index, columns=columns)
    df = df.fillna(np.nan)
    # optimise m_soil and m_veg
    for index, row in ts_resam.iterrows():

        ascat_inc = np.array(row[['incf', 'incm', 'inca']].tolist())
        ascat_sig = \
            db2lin(np.array(row[['sigf', 'sigm', 'siga']].tolist()))

        args = (ascat_inc, ascat_sig, params, '')
        res = minimize(sig_sqr_diff, x0, args=args, method='Nelder-Mead')

        if res['success']:
            df.loc[index, 'm_veg'] = res['x'][0]
            df.loc[index, 'm_soil'] = res['x'][1]

    str_static_p = ', '.join("%s: %r" % t for t in locals().items())

    str_static_p += ",\nm_veg_x0 = {:.2f}, m_soil_x0 = {:.2f}".format(m_veg_x0, m_soil_x0)

    ismn_file = os.path.join('data', 'ARM_ARM_Larned_sm_0.050000_0.050000_Water-Matric-Potential-Sensor-229L-W_20090101_20140527.stm')
    ismn_data = ismn_readers.read_data(ismn_file)
    insitu = pd.DataFrame(ismn_data.data['soil moisture']).rename(columns={'soil moisture': 'insitu'})
    gldas = pd.read_csv(os.path.join('data', 'GLDAS_737602.csv'), parse_dates=True, index_col=0)
    gldas.rename(columns={'086_L1': 'gldas'}, inplace=True)
    gldas = pd.DataFrame(gldas['gldas'])
    ascat = pd.DataFrame(df['m_soil']).rename(columns={'m_soil': 'ascat'})

    matched = temp_match.matching(ascat, insitu, gldas)

    if rescaling is not None:
        scaled = scaling.scale(matched, rescaling, reference_index=1)
    else:
        scaled = matched

    metrics = OrderedDict()
    metrics['bias'] = df_metrics.bias(scaled)
    metrics['pearson'] = df_metrics.pearsonr(scaled)
    metrics['kendall'] = df_metrics.kendalltau(scaled)
    metrics['ubrmsd'] = df_metrics.ubrmsd(scaled)
    metrics['var_ratio'] = df_var_ratio(scaled)
    tcol_error = df_metrics.tcol_error(scaled)._asdict()

    ts_title = "Soil moisture. "
    if rescaling is not None:
        ts_title = ' '.join([ts_title, 'Rescaling: %s.' % rescaling])
    else:
        ts_title = ' '.join([ts_title, 'No rescaling.'])

    axes = scaled.plot(subplots=True, title=ts_title, figsize=(18, 8))

    # these are matplotlib.patch.Patch properties
    props = dict(facecolor='white', alpha=0)

    columns = ('ascat-insitu', 'ascat-gldas', 'insitu-gldas')
    row_labels = ['bias', 'pearson R', 'kendall tau', 'unbiased RMSD', 'variance ratio']
    cell_text = []
    for metric in metrics:
        metric_values = metrics[metric]
        if isinstance(metric_values, tuple):
            metric_values = metric_values[0]
        metric_values = metric_values._asdict()
        cell_text.append(["%.2f" % metric_values['ascat_and_insitu'],
                              "%.2f" % metric_values['ascat_and_gldas'],
                              "%.2f" % metric_values['insitu_and_gldas']])

    table = plt.table(
              cellText=cell_text,
              colLabels=columns,
              colWidths=[0.1, 0.1, 0.1],
              rowLabels=row_labels, loc='bottom',
              bbox=(0.2, -1.25, 0.5, 0.8))

    tcol_table = plt.table(
              cellText=[["%.2f" % tcol_error['ascat'],
                         "%.2f" % tcol_error['gldas'],
                         "%.2f" % tcol_error['insitu']]],
              colLabels=('ascat', 'gldas', 'insitu'),
              colWidths=[0.1, 0.1, 0.1],
              rowLabels=['Triple collocation error'], loc='bottom',
              bbox=(0.2, -1.65, 0.5, 0.3))

    plt.subplots_adjust(left=0.08, bottom=0.35)

    axes = scatter_matrix(scaled)
    axes.flat[0].figure.suptitle(ts_title)

    # only draw 1:1 line if scaling was applied
    if rescaling is not None:
        for j, ax in enumerate(axes.flatten()):

            if np.remainder(j + 1, 3 + 1) != 1:
                min_x, max_x = ax.get_xlim()
                min_y, max_y = ax.get_ylim()
                # find minimum lower left coordinate and maximum upper right
                min_ll = min([min_x, min_y])
                max_ur = max([max_x, max_y])
                ax.plot([min_ll, max_ur], [min_ll, max_ur], '--', c='0.6')

    return df
Example #8
    def calc_metrics(self, data, gpi_info):
        """
        Calculate the desired statistics.

        Parameters
        ----------
        data : pandas.DataFrame
            with >2 columns; the first column is the reference dataset,
            named 'ref', the other columns are the data sets to compare
            against, named 'other_i'
        gpi_info : tuple
            of (gpi, lon, lat)

        Notes
        -----
        Kendall tau calculation is optional at the moment
        because the scipy implementation is very slow, which is problematic
        for global comparisons
        """

        dataset = copy.deepcopy(self.result_template)

        dataset['gpi'][0] = gpi_info[0]
        dataset['lon'][0] = gpi_info[1]
        dataset['lat'][0] = gpi_info[2]

        # number of observations
        subset = np.ones(len(data), dtype=bool)

        n_obs = subset.sum()
        if n_obs < 10:
            return dataset

        dataset['n_obs'][0] = n_obs


        # calculate Pearson correlation
        pearson_R, pearson_p = df_metrics.pearsonr(data)
        pearson_R = pearson_R._asdict()
        pearson_p = pearson_p._asdict()

        # calculate Spearman correlation
        spea_rho, spea_p = df_metrics.spearmanr(data)
        spea_rho = spea_rho._asdict()
        spea_p = spea_p._asdict()

        # calculate bias
        bias_nT = df_metrics.bias(data)
        bias_dict = bias_nT._asdict()

        # calculate RMSD
        rmsd = df_metrics.rmsd(data)
        rmsd_dict = rmsd._asdict()

        # calculate MSE
        mse, mse_corr, mse_bias, mse_var = df_metrics.mse(data)
        mse_dict = mse._asdict()
        mse_corr_dict = mse_corr._asdict()
        mse_bias_dict = mse_bias._asdict()
        mse_var_dict = mse_var._asdict()

        # calculate tau
        if self.calc_tau:
            tau, p_tau = df_metrics.kendalltau(data)
            tau_dict = tau._asdict()
            p_tau_dict = p_tau._asdict()
        else:
            tau = p_tau = p_tau_dict = tau_dict = None

        # calculate ubRMSD on the unscaled data (no mean/std scaling here)
        ubRMSD_nT = df_metrics.ubrmsd(data)
        ubRMSD_dict = ubRMSD_nT._asdict()

        # get single dataset metrics
        # calculate SNR
        x = data[self.df_columns[0]].values[subset]
        y = data[self.df_columns[1]].values[subset]
        z = data[self.df_columns[2]].values[subset]

        snr, err, beta = metrics.tcol_snr(x, y, z)

        for i, name in enumerate(self.ds_names):
            dataset['{:}_snr'.format(name)][0] = snr[i]
            dataset['{:}_err_var'.format(name)][0] = err[i]
            dataset['{:}_beta'.format(name)][0] = beta[i]


        for tds_name in self.tds_names:
            R = pearson_R[tds_name]
            p_R = pearson_p[tds_name]
            rho = spea_rho[tds_name]
            p_rho = spea_p[tds_name]
            bias = bias_dict[tds_name]
            mse = mse_dict[tds_name]
            mse_corr = mse_corr_dict[tds_name]
            mse_bias = mse_bias_dict[tds_name]
            mse_var = mse_var_dict[tds_name]
            rmsd = rmsd_dict[tds_name]
            ubRMSD = ubRMSD_dict[tds_name]
            if tau_dict and p_tau_dict:
                tau = tau_dict[tds_name]
                p_tau = p_tau_dict[tds_name]


            split_tds_name = tds_name.split('_and_')
            tds_name_key = "{:}_{:}".format(
                self.ds_names_lut[split_tds_name[0]],
                self.ds_names_lut[split_tds_name[1]])

            dataset['R_between_{:}'.format(tds_name_key)][0] = R
            dataset['p_R_between_{:}'.format(tds_name_key)][0] = p_R
            dataset['rho_between_{:}'.format(tds_name_key)][0] = rho
            dataset['p_rho_between_{:}'.format(tds_name_key)][0] = p_rho
            dataset['bias_between_{:}'.format(tds_name_key)][0] = bias
            dataset['mse_between_{:}'.format(tds_name_key)][0] = mse
            dataset['mse_corr_between_{:}'.format(tds_name_key)][0] = mse_corr
            dataset['mse_bias_between_{:}'.format(tds_name_key)][0] = mse_bias
            dataset['mse_var_between_{:}'.format(tds_name_key)][0] = mse_var
            dataset['rmsd_between_{:}'.format(tds_name_key)][0] = rmsd
            dataset['ubRMSD_between_{:}'.format(tds_name_key)][0] = ubRMSD

            if self.calc_tau:
                dataset['tau_between_{:}'.format(tds_name_key)][0] = tau
                dataset['p_tau_between_{:}'.format(tds_name_key)][0] = p_tau

        return dataset
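
All calc_metrics variants above start from copy.deepcopy(self.result_template) and write into position 0 of one-element arrays. A minimal sketch of that template pattern (hypothetical field layout, mirroring the keys used above):

import copy
import numpy as np

# hypothetical template: one-element arrays per result field
result_template = {
    'gpi': np.array([-1], dtype=np.int32),
    'lon': np.full(1, np.nan),
    'lat': np.full(1, np.nan),
    'n_obs': np.array([0], dtype=np.int32),
}

dataset = copy.deepcopy(result_template)  # fresh, independent copy per grid point
dataset['gpi'][0] = 12345
dataset['lon'][0], dataset['lat'][0] = 16.37, 48.21
print(dataset)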