Example #1
def test_scale(method):

    n = 1000
    x = np.arange(n)
    y = np.arange(n) * 0.5

    df = pd.DataFrame({'x': x, 'y': y}, columns=['x', 'y'])
    if method in ["lin_cdf_match", "cdf_match"]:
        with pytest.deprecated_call():
            df_scaled = scaling.scale(df, method=method, reference_index=0)
    else:
        df_scaled = scaling.scale(df, method=method, reference_index=0)
    nptest.assert_almost_equal(df_scaled['x'].values, df_scaled['y'].values)
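For reference, the mean/std matching exercised by this test can be written in a few lines. A minimal sketch (not the pytesmo implementation) that reproduces the assertion above for method='mean_std':

import numpy as np
import pandas as pd

def mean_std_rescale(src, ref):
    # Shift and stretch src so its mean and standard deviation
    # match those of the reference series.
    return (src - src.mean()) / src.std() * ref.std() + ref.mean()

x = pd.Series(np.arange(1000, dtype=float))
y = x * 0.5
# y rescaled to x recovers x exactly, as the test expects
np.testing.assert_almost_equal(mean_std_rescale(y, x).values, x.values)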
Example #2
def calc_rho(ascat_ssm, FP_df, hoal_df):
    # multiply ASCAT with porosity (0.54) to get same units
    ascat_ssm['ssm_ascat'] = ascat_ssm['ssm_ascat']*0.54
    
    matched_data = matching(ascat_ssm, FP_df['Parrot_vwc'], 
                            hoal_df['HOAL_sm0.05'])
    matched_data.plot()
    plt.title('Matched data: ASCAT, FP, HOAL')
    plt.show()
    
    data_together = scale(matched_data)#, method="mean_std")
    
    ascat_rho = metrics.spearmanr(data_together['Parrot_vwc'].iloc[:-3], 
                                  data_together['ssm_ascat'].iloc[:-3])
    
    hoal_rho_sm = metrics.spearmanr(data_together['Parrot_vwc'].iloc[:-3], 
                                    data_together['HOAL_sm0.05'].iloc[:-3])
    
    exclude = ['HOAL_ts0.05', 'air_temperature_celsius', 'par_umole_m2s',
               'merge_key']
    data_together.loc[:, data_together.columns.difference(exclude)].plot()
    plt.title('Satellite and in-situ soil moisture, HOAL Petzenkirchen, station 22',
              fontsize=24)
              #+'\n rho_ASCAT_Parrot: '+str(np.round(ascat_rho[0],3))+
              #', rho_HOAL_Parrot: '+str(np.round(hoal_rho_sm[0],3)))
    plt.ylabel('Volumetric Water Content [%]',fontsize=20)
    plt.tick_params(axis='both', which='major', labelsize=18)
    plt.ylim([0,60])
    plt.show()
Example #3
def plot_alltogether(time_lag,
                     lon,
                     lat,
                     ts1,
                     ts2,
                     scale_ts=False,
                     save_fig=False,
                     *args):

    matched_data = temp_match.matching(ts1, ts2, *args)
    if len(matched_data) == 0:
        print "Empty dataset."
        return
    if scale_ts:
        matched_data = scaling.scale(matched_data, method="mean_std")

    matched_data.plot(figsize=(15, 5))
    plt.title('SWI and Vegetation indices comparison (rescaled)')
    if save_fig:
        plt.savefig("C:\\Users\\i.pfeil\\Desktop\\TS_plots\\lon_" + str(lon) +
                    "_lat_" + str(lat) + '_' + str(time_lag) + ".png",
                    bbox_inches='tight')
        plt.clf()
    else:
        plt.show()
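temp_match.matching, used above, collocates one or more time series to the reference series' timestamps via nearest-neighbour matching. A hedged sketch with toy data; the window argument is in fractions of a day, as suggested by the window=1 / 24. call in Example #14:

import numpy as np
import pandas as pd
import pytesmo.temporal_matching as temp_match

ref = pd.DataFrame({'ref': np.arange(5.0)},
                   index=pd.date_range('2020-01-01', periods=5, freq='D'))
other = pd.DataFrame({'other': np.arange(5.0)},
                     index=pd.date_range('2020-01-01 01:00',
                                         periods=5, freq='D'))
# keep only matches within a 2-hour window of the reference timestamps
matched = temp_match.matching(ref, other, window=2 / 24.)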
Example #4
def calc_rho(ascat_ssm, FP_df, hoal_df):
    # multiply ASCAT with porosity (0.54) to get same units
    ascat_ssm['ssm_ascat'] = ascat_ssm['ssm_ascat'] * 0.54

    matched_data = matching(ascat_ssm, FP_df['Parrot_vwc'],
                            hoal_df['HOAL_sm0.05'])
    matched_data.plot()
    plt.title('Matched data: ASCAT, FP, HOAL')
    plt.show()

    data_together = scale(matched_data)  #, method="mean_std")

    ascat_rho = metrics.spearmanr(data_together['Parrot_vwc'].iloc[:-3],
                                  data_together['ssm_ascat'].iloc[:-3])

    hoal_rho_sm = metrics.spearmanr(data_together['Parrot_vwc'].iloc[:-3],
                                    data_together['HOAL_sm0.05'].iloc[:-3])

    exclude = [
        'HOAL_ts0.05', 'air_temperature_celsius', 'par_umole_m2s', 'merge_key'
    ]
    data_together.loc[:, data_together.columns.difference(exclude)].plot()
    plt.title(
        'Satellite and in-situ soil moisture, HOAL Petzenkirchen, station 22',
        fontsize=24)
    #+'\n rho_ASCAT_Parrot: '+str(np.round(ascat_rho[0],3))+
    #', rho_HOAL_Parrot: '+str(np.round(hoal_rho_sm[0],3)))
    plt.ylabel('Volumetric Water Content [%]', fontsize=20)
    plt.tick_params(axis='both', which='major', labelsize=18)
    plt.ylim([0, 60])
    plt.show()
Example #5
    def scale(self, data, reference_index, gpi_info):
        """
        Scale all columns in data to the
        column at the reference_index.

        Parameters
        ----------
        data: pandas.DataFrame
            temporally matched dataset
        reference_index: int
            Which column of the data contains the
            scaling reference.
        gpi_info: tuple
            tuple of at least (gpi, lon, lat),
            where gpi is the grid point index
            on the grid of this scaler.

        Raises
        ------
        ValueError
            if scaling is not successful
        """
        return scaling.scale(data,
                             method=self.method,
                             reference_index=reference_index)
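A hedged usage sketch for the adapter above; the DefaultScaler name and import path follow pytesmo's validation framework, but the exact constructor signature is an assumption here:

import numpy as np
import pandas as pd
# assumption: the scale() method above lives on a scaler class like this one
from pytesmo.validation_framework.data_scalers import DefaultScaler

matched_df = pd.DataFrame({'a': np.arange(10.0),
                           'b': np.arange(10.0) * 2.0 + 1.0})
scaler = DefaultScaler(method='mean_std')
# rescale all columns to column 0; gpi_info is accepted but not used here
scaled_df = scaler.scale(matched_df, reference_index=0,
                         gpi_info=(0, 0.0, 0.0))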
Example #6
    def scale(self, data, reference_index, gpi_info):
        """
        Scale all columns in data to the
        column at the reference_index.

        Parameters
        ----------
        data: pandas.DataFrame
            temporally matched dataset
        reference_index: int
            Which column of the data contains the
            scaling reference.
        gpi_info: tuple
            tuple of at least (gpi, lon, lat),
            where gpi is the grid point index
            on the grid of this scaler.

        Raises
        ------
        ValueError
            if scaling is not successful
        """
        return scaling.scale(data,
                             method=self.method,
                             reference_index=reference_index)
Example #7
def test_scale(method):

    n = 1000
    x = np.arange(n)
    y = np.arange(n) * 0.5

    df = pd.DataFrame({'x': x, 'y': y}, columns=['x', 'y'])
    df_scaled = scaling.scale(df, method=method, reference_index=0)
    nptest.assert_almost_equal(df_scaled['x'].values, df_scaled['y'].values)
Example #8
def test_scale(method):

    n = 1000
    x = np.arange(n)
    y = np.arange(n) * 0.5

    df = pd.DataFrame({'x': x, 'y': y}, columns=['x', 'y'])
    df_scaled = scaling.scale(df,
                              method=method,
                              reference_index=0)
    nptest.assert_almost_equal(df_scaled['x'].values,
                               df_scaled['y'].values)
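The test functions above receive method as an argument, which points to a parametrized test. A plausible decorator (the method list is an assumption, informed by the deprecation check for 'lin_cdf_match' and 'cdf_match' in Example #1):

import pytest

@pytest.mark.parametrize("method", ["linreg", "mean_std", "min_max",
                                    "lin_cdf_match", "cdf_match"])
def test_scale(method):
    ...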
Example #9
def rescale_df(ascat_ssm, FP_df):
    ascat_ssm['ssm_ascat'] = ascat_ssm['ssm_ascat'] * 0.54
    ascat_ssm.plot()
    plt.show()

    matched_data = matching(ascat_ssm, FP_df['Parrot_vwc'])
    matched_data.plot()
    plt.show()

    scaled_data = scale(matched_data, method="mean_std")

    scaled_data.plot()
    plt.title('Satellite and in-situ soil moisture, HOAL Petzenkirchen')
    plt.ylabel('Volumetric Water Content [%]')
    plt.show()
Example #10
def rescale_df(ascat_ssm, FP_df):
    ascat_ssm['ssm_ascat'] = ascat_ssm['ssm_ascat']*0.54
    ascat_ssm.plot()
    plt.show()
    
    matched_data = matching(ascat_ssm, FP_df['Parrot_vwc'])
    matched_data.plot()
    plt.show()
    
    scaled_data = scale(matched_data, method="mean_std")
    
    scaled_data.plot()
    plt.title('Satellite and in-situ soil moisture, HOAL Petzenkirchen')
    plt.ylabel('Volumetric Water Content [%]')
    plt.show()
Example #11
def rescale_df(ascat_ssm, FP_df, hoal_df):
    ascat_ssm['ssm_ascat'] = ascat_ssm['ssm_ascat']*0.54
    #ascat_ssm.plot()
    #plt.show()
    
    matched_data = matching(ascat_ssm, FP_df['Parrot_vwc'], hoal_df)
    matched_data.plot()
    plt.title('Matched data: ASCAT, FP, HOAL')
    plt.show()
    
    scaled_data = scale(matched_data)#, method="mean_std")
    
    scaled_data.plot()
    plt.title('Satellite and in-situ soil moisture, HOAL Petzenkirchen')
    plt.ylabel('Volumetric Water Content [%]')
    plt.ylim([0,60])
    plt.show()
Example #12
def rescale_df(ascat_ssm, FP_df, hoal_df):
    ascat_ssm['ssm_ascat'] = ascat_ssm['ssm_ascat'] * 0.54
    #ascat_ssm.plot()
    #plt.show()

    matched_data = matching(ascat_ssm, FP_df['Parrot_vwc'], hoal_df)
    matched_data.plot()
    plt.title('Matched data: ASCAT, FP, HOAL')
    plt.show()

    scaled_data = scale(matched_data)  #, method="mean_std")

    scaled_data.plot()
    plt.title('Satellite and in-situ soil moisture, HOAL Petzenkirchen')
    plt.ylabel('Volumetric Water Content [%]')
    plt.ylim([0, 60])
    plt.show()
Example #13
def optimise(params,
             timespan=('2009-01', '2009-12'), gpi=None, rescaling=None):
    """
    This function optimises the parameters vegetation water content
    'm_veg', soil moisture 'm_soil' and, if specified, a third optional
    parameter. The third optional parameter can either be sand 'sand',
    clay 'clay', fractional root mean square height 'f_rms',
    stem volume 's_vol' or temperature 'temp'.

    Parameters
    ----------
    params : list of dicts
        Model parameters. At least four of the following parameters need
        to be specified if an optional parameter has been selected,
        otherwise all of them need to be specified: 'sand', 'clay',
        'f_rms', 'temp', 's_vol'
    gpi : int, optional
        Grid point index. If specified, it will read data from datapool.

    Returns
    -------
    df : pandas.DataFrame
        Optimised soil moisture, vegetation water content and, if specified,
        optional optimised parameter.
    """

    if gpi is None:
        ts_resam = pd.read_csv(os.path.join("data", "2011528_2009.csv"), index_col=0,
                               parse_dates=True)[timespan[0]:timespan[1]]
        gpi = 2011528
    else:
        ts_resam = read_resam(gpi)[timespan[0]:timespan[1]]

    m_veg_x0 = params.pop('m_veg_x0')
    m_soil_x0 = params.pop('m_soil_x0')
    columns = ['m_veg', 'm_soil']

    x0 = np.array([m_veg_x0, m_soil_x0])

    df = pd.DataFrame(index=ts_resam.index, columns=columns)
    df = df.fillna(np.nan)
    # optimise  m_soil and m_veg
    for index, row in ts_resam.iterrows():

        ascat_inc = np.array(row[['incf', 'incm', 'inca']].tolist())
        ascat_sig = \
            db2lin(np.array(row[['sigf', 'sigm', 'siga']].tolist()))

        args = (ascat_inc, ascat_sig, params, '')
        res = minimize(sig_sqr_diff, x0, args=args, method='Nelder-Mead')

        if res['success']:
            df.loc[index, 'm_veg'] = res['x'][0]
            df.loc[index, 'm_soil'] = res['x'][1]

    str_static_p = \
        ', '.join("%s: %r" % t for t in locals().items())

    str_static_p += ",\nm_veg_x0 = {:.2f}, m_soil_x0 = {:.2f}".format(m_veg_x0, m_soil_x0)

    ismn_file = os.path.join('data', 'ARM_ARM_Larned_sm_0.050000_0.050000_Water-Matric-Potential-Sensor-229L-W_20090101_20140527.stm')
    ismn_data = ismn_readers.read_data(ismn_file)
    insitu = pd.DataFrame(ismn_data.data['soil moisture']).rename(columns={'soil moisture': 'insitu'})
    gldas = pd.read_csv(os.path.join('data', 'GLDAS_737602.csv'), parse_dates=True, index_col=0)
    gldas.rename(columns={'086_L1': 'gldas'}, inplace=True)
    gldas = pd.DataFrame(gldas['gldas'])
    ascat = pd.DataFrame(df['m_soil']).rename(columns={'m_soil': 'ascat'})

    matched = temp_match.matching(ascat, insitu, gldas)

    if rescaling is not None:
        scaled = scaling.scale(matched, rescaling, reference_index=1)
    else:
        scaled = matched

    metrics = OrderedDict()
    metrics['bias'] = df_metrics.bias(scaled)
    metrics['pearson'] = df_metrics.pearsonr(scaled)
    metrics['kendall'] = df_metrics.kendalltau(scaled)
    metrics['ubrmsd'] = df_metrics.ubrmsd(scaled)
    metrics['var_ratio'] = df_var_ratio(scaled)
    tcol_error = df_metrics.tcol_error(scaled)._asdict()

    ts_title = "Soil moisture. "
    if rescaling is not None:
        ts_title = ' '.join([ts_title, 'Rescaling: %s.' % rescaling])
    else:
        ts_title = ' '.join([ts_title, 'No rescaling.'])

    axes = scaled.plot(subplots=True, title=ts_title, figsize=(18, 8))

    # these are matplotlib.patch.Patch properties
    props = dict(facecolor='white', alpha=0)

    columns = ('ascat-insitu', 'ascat-gldas', 'insitu-gldas')
    row_labels = ['bias', 'pearson R', 'kendall tau', 'unbiased RMSD', 'variance ratio']
    cell_text = []
    for metric in metrics:
        metric_values = metrics[metric]
        if isinstance(metric_values, tuple):
            metric_values = metric_values[0]
        metric_values = metric_values._asdict()
        cell_text.append(["%.2f" % metric_values['ascat_and_insitu'],
                          "%.2f" % metric_values['ascat_and_gldas'],
                          "%.2f" % metric_values['insitu_and_gldas']])

    table = plt.table(
              cellText=cell_text,
              colLabels=columns,
              colWidths=[0.1, 0.1, 0.1],
              rowLabels=row_labels, loc='bottom',
              bbox=(0.2, -1.25, 0.5, 0.8))

    tcol_table = plt.table(
              cellText=[["%.2f" % tcol_error['ascat'],
                         "%.2f" % tcol_error['gldas'],
                         "%.2f" % tcol_error['insitu']]],
              colLabels=('ascat', 'gldas', 'insitu'),
              colWidths=[0.1, 0.1, 0.1],
              rowLabels=['Triple collocation error'], loc='bottom',
              bbox=(0.2, -1.65, 0.5, 0.3))

    plt.subplots_adjust(left=0.08, bottom=0.35)

    axes = scatter_matrix(scaled)
    axes.flat[0].figure.suptitle(ts_title)

    # only draw 1:1 line if scaling was applied
    if rescaling is not None:
        for j, ax in enumerate(axes.flatten()):

            if np.remainder(j + 1, 3 + 1) != 1:
                min_x, max_x = ax.get_xlim()
                min_y, max_y = ax.get_ylim()
                # find minimum lower left coordinate and maximum upper right
                min_ll = min([min_x, min_y])
                max_ur = max([max_x, max_y])
                ax.plot([min_ll, max_ur], [min_ll, max_ur], '--', c='0.6')

    return df
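db2lin, used in the optimisation loop above, converts backscatter from decibels to linear power before fitting. A one-line sketch of the standard relation (assumed to match the helper imported here):

import numpy as np

def db2lin(sigma_db):
    # standard dB -> linear conversion: 10 ** (dB / 10)
    return 10.0 ** (np.asarray(sigma_db) / 10.0)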
Example #14
 matched_data = temp_match.matching(ascat_time_series.data, ISMN_time_series.data,
                                         window=1 / 24.)
 # matched ISMN data is now a dataframe with the same datetime index
 # as ascat_time_series.data and the nearest insitu observation      
 
 # continue only with relevant columns
 matched_data = matched_data[[label_ascat, label_insitu]]
 
 # the plot shows that ISMN and ASCAT are observed in different units
 matched_data.plot(figsize=(15, 5), secondary_y=[label_ascat],
                   title='temporally merged data')
 plt.show()
 
 # this takes the matched_data DataFrame and scales all columns to the 
 # column with the given reference_index, in this case in situ 
 scaled_data = scaling.scale(matched_data, method='lin_cdf_match',
                                  reference_index=1)
 
 # now the scaled ascat data and insitu_sm are in the same space    
 scaled_data.plot(figsize=(15, 5), title='scaled data')
 plt.show()
 
 plt.scatter(scaled_data[label_ascat].values, scaled_data[label_insitu].values)
 plt.xlabel(label_ascat)
 plt.ylabel(label_insitu)
 plt.show()
 
 # calculate correlation coefficients, RMSD, bias, Nash Sutcliffe
 x, y = scaled_data[label_ascat].values, scaled_data[label_insitu].values
 
 print("ISMN time series:", ISMN_time_series)
 print("compared to")
Example #15
    def perform_validation(self,
                           df_dict,
                           gpi_info):
        """
        Perform the validation for one grid point index and return the
        matched datasets as well as the calculated metrics.

        Parameters
        ----------
        df_dict: dict of pandas.DataFrames
            DataFrames read by the data readers for each dataset
        gpi_info: tuple
            tuple of at least (gpi, lon, lat)

        Returns
        -------
        matched_n: dict of pandas.DataFrames
            temporally matched data stored by (n, k) tuples
        results: dict
            Dictionary of calculated metrics stored by dataset combination tuples.
        used_data: dict
            The DataFrame used for calculation of each set of metrics.
        """
        results = {}
        used_data = {}
        matched_n = {}

        if self.masking_dm is not None:
            ref_df = df_dict[self.temporal_ref]
            masked_ref_df = self.mask_dataset(ref_df,
                                              gpi_info)
            if len(masked_ref_df) == 0:
                return matched_n, results, used_data

            df_dict[self.temporal_ref] = masked_ref_df

        matched_n = self.temporal_match_datasets(df_dict)

        for n, k in self.metrics_c:
            n_matched_data = matched_n[(n, k)]
            if len(n_matched_data) == 0:
                continue
            result_names = get_result_names(self.data_manager.ds_dict,
                                            self.temporal_ref,
                                            n=k)
            for data, result_key in self.k_datasets_from(n_matched_data,
                                                         result_names):

                if len(data) == 0:
                    continue
                # at this stage we can drop the column multiindex and just use
                # the dataset name
                data.columns = data.columns.droplevel(level=1)
                # Rename the columns to 'ref', 'k1', 'k2', ...
                rename_dict = {}
                f = lambda x: "k{}".format(x) if x > 0 else 'ref'
                for i, r in enumerate(result_key):
                    rename_dict[r[0]] = f(i)
                data.rename(columns=rename_dict, inplace=True)

                if self.scaling is not None:
                    # get scaling index by finding the column in the
                    # DataFrame that belongs to the scaling reference
                    scaling_index = data.columns.tolist().index(
                        rename_dict[self.scaling_ref])
                    try:
                        data = scaling.scale(data,
                                             method=self.scaling,
                                             reference_index=scaling_index)
                    except ValueError:
                        continue

                if result_key not in results.keys():
                    results[result_key] = []

                metrics_calculator = self.metrics_c[(n, k)]
                used_data[result_key] = data
                metrics = metrics_calculator(data, gpi_info)
                results[result_key].append(metrics)

        return matched_n, results, used_data
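In isolation, the reference-index lookup and scaling call from the loop above reduce to the following; the toy DataFrame with the renamed 'ref'/'k1'/'k2' columns is hypothetical:

import numpy as np
import pandas as pd
import pytesmo.scaling as scaling

data = pd.DataFrame({'ref': np.arange(10.0),
                     'k1': np.arange(10.0) * 2.0 + 1.0,
                     'k2': np.arange(10.0) * 0.3})
scaling_index = data.columns.tolist().index('ref')  # -> 0
data = scaling.scale(data, method='mean_std', reference_index=scaling_index)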
Example #16
    def calc_metrics(self, data, gpi_info):
        """
        calculates the desired statistics

        Parameters
        ----------
        data : pandas.DataFrame
            with 3 columns; the first column is the reference dataset
            named 'ref', the second and third columns are the datasets
            to compare against, named 'k1' and 'k2'
        gpi_info : tuple
            Grid point info (i.e. gpi, lon, lat)
        """
        dataset = super(HSAF_Metrics, self).calc_metrics(data, gpi_info)

        for season in self.seasons:

            if season != 'ALL':
                subset = self.month_to_season[data.index.month] == season
            else:
                subset = np.ones(len(data), dtype=bool)

            # number of observations
            n_obs = subset.sum()
            if n_obs < self.min_obs:
                continue
            dataset['{:}_n_obs'.format(season)][0] = n_obs

            # get single dataset metrics
            # calculate SNR
            x = data[self.df_columns[0]].values[subset]
            y = data[self.df_columns[1]].values[subset]
            z = data[self.df_columns[2]].values[subset]

            snr, err, beta = metrics.tcol_snr(x, y, z)

            for i, name in enumerate(self.ds_names):
                dataset['{:}_{:}_snr'.format(name, season)][0] = snr[i]
                dataset['{:}_{:}_err_var'.format(name, season)][0] = err[i]
                dataset['{:}_{:}_beta'.format(name, season)][0] = beta[i]

            # calculate Pearson correlation
            pearson_R, pearson_p = df_metrics.pearsonr(data)
            pearson_R = pearson_R._asdict()
            pearson_p = pearson_p._asdict()

            # calculate Spearman correlation
            spea_rho, spea_p = df_metrics.spearmanr(data)
            spea_rho = spea_rho._asdict()
            spea_p = spea_p._asdict()

            # scale data to reference in order to calculate absolute metrics
            data_scaled = scale(data, method='min_max')

            # calculate bias
            bias_nT = df_metrics.bias(data_scaled)
            bias_dict = bias_nT._asdict()

            # calculate ubRMSD
            ubRMSD_nT = df_metrics.ubrmsd(data_scaled)
            ubRMSD_dict = ubRMSD_nT._asdict()

            for tds_name in self.tds_names:
                R = pearson_R[tds_name]
                p_R = pearson_p[tds_name]
                rho = spea_rho[tds_name]
                p_rho = spea_p[tds_name]
                bias = bias_dict[tds_name]
                ubRMSD = ubRMSD_dict[tds_name]

                split_tds_name = tds_name.split('_and_')
                tds_name_key = "{:}_{:}".format(
                    self.ds_names_lut[split_tds_name[0]],
                    self.ds_names_lut[split_tds_name[1]])

                dataset['{:}_{:}_R'.format(tds_name_key, season)][0] = R
                dataset['{:}_{:}_p_R'.format(tds_name_key, season)][0] = p_R
                dataset['{:}_{:}_rho'.format(tds_name_key, season)][0] = rho
                dataset['{:}_{:}_p_rho'.format(tds_name_key, season)][0] = \
                    p_rho
                dataset['{:}_{:}_bias'.format(tds_name_key, season)][0] = bias
                dataset['{:}_{:}_ubrmsd'.format(tds_name_key, season)][0] = \
                    ubRMSD

        return dataset
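For reference, min_max scaling (applied above before computing the absolute metrics bias and ubRMSD) linearly maps each column onto the reference column's range. A minimal sketch, not the pytesmo implementation:

def min_max_rescale(src, ref):
    # map src's [min, max] linearly onto ref's [min, max]
    return ((src - src.min()) / (src.max() - src.min())
            * (ref.max() - ref.min()) + ref.min())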
Example #17
 matched_data = temp_match.matching(ascat_time_series.data, ISMN_time_series.data,
                                         window=1 / 24.)
 # matched ISMN data is now a dataframe with the same datetime index
 # as ascat_time_series.data and the nearest insitu observation      
 
 # continue only with relevant columns
 matched_data = matched_data[[label_ascat, label_insitu]]
 
 # the plot shows that ISMN and ASCAT are observed in different units
 matched_data.plot(figsize=(15, 5), secondary_y=[label_ascat],
                   title='temporally merged data')
 plt.show()
 
 # this takes the matched_data DataFrame and scales all columns to the 
 # column with the given reference_index, in this case in situ 
 scaled_data = scaling.scale(matched_data, method='lin_cdf_match',
                                  reference_index=1)
 
 # now the scaled ascat data and insitu_sm are in the same space    
 scaled_data.plot(figsize=(15, 5), title='scaled data')
 plt.show()
 
 plt.scatter(scaled_data[label_ascat].values, scaled_data[label_insitu].values)
 plt.xlabel(label_ascat)
 plt.ylabel(label_insitu)
 plt.show()
 
 # calculate correlation coefficients, RMSD, bias, Nash Sutcliffe
 x, y = scaled_data[label_ascat].values, scaled_data[label_insitu].values
 
 print "ISMN time series:", ISMN_time_series
 print "compared to"
Example #18
def validate(params,
             timespan=('2009-01', '2009-12'), gpi=None, rescaling=None,
             y_axis_range=None):
    """
    This function optimises the parameters vegetation water content
    'm_veg', soil moisture 'm_soil' and, if specified, a third optional
    parameter. The third optional parameter can either be sand 'sand',
    clay 'clay', fractional root mean square height 'f_rms',
    stem volume 's_vol' or temperature 'temp'.

    Parameters
    ----------
    params : list of dicts
        Model parameters. At least four of the following parameters need
        to be specified if an optional parameter has been selected,
        otherwise all of them need to be specified: 'sand', 'clay',
        'f_rms', 'temp', 's_vol'
    timespan : tuple, optional
        timespan to analyze
    gpi : int, optional
        Grid point index. If specified, it will read data from datapool.
    rescaling : string, optional
        rescaling method, one of 'min_max', 'linreg', 'mean_std' and 'lin_cdf_match'
        Default: None
        insitu is the reference to which is scaled
    y_axis_range : tuple, optional
        specify (min, max) of y axis


    Returns
    -------
    df : pandas.DataFrame
        Optimised soil moisture, vegetation water content and, if specified,
        optional optimised parameter.
    """

    unit_dict = {'freq': 'GHz',
                 'sand': '',
                 'clay': '',
                 'temp': r'$^\circ$C',
                 'eps': '',
                 'theta': r'$^\circ$',
                 'f_rms': '',
                 'sig_bare': 'dB',
                 'm_soil': '%',
                 'm_veg': '%',
                 'm_soil_x0': '%',
                 'm_veg_x0': '%',
                 's_vol': '$m^3ha^{-1}$',
                 'sig_canopy': 'dB',
                 'sig_for': 'dB',
                 'sig_floor': 'dB',
                 'polarization': ''}

    param_should = ['sand', 'clay', 'temp',
                    's_vol', 'f_rms',
                    'm_veg_x0', 'm_soil_x0']

    for param in param_should:
        assert param in params.keys()

    if gpi is None:
        ts_resam = pd.read_csv(
            os.path.join(os.path.split(os.path.abspath(__file__))[0],
                         'data', '2011528_2009.csv'),
            index_col=0, parse_dates=True)[timespan[0]:timespan[1]]
        gpi = 2011528
    else:
        ts_resam = read_resam(gpi)[timespan[0]:timespan[1]]

    m_veg_x0 = params.pop('m_veg_x0')
    m_soil_x0 = params.pop('m_soil_x0')
    columns = ['m_veg', 'm_soil']

    x0 = np.array([m_veg_x0, m_soil_x0])

    df = pd.DataFrame(index=ts_resam.index, columns=columns)
    df = df.fillna(np.nan)
    # optimise  m_soil and m_veg
    for index, row in ts_resam.iterrows():

        ascat_inc = np.array(row[['incf', 'incm', 'inca']].tolist())
        ascat_sig = \
            db2lin(np.array(row[['sigf', 'sigm', 'siga']].tolist()))

        args = (ascat_inc, ascat_sig, params, '')
        res = minimize(sig_sqr_diff, x0, args=args, method='Nelder-Mead')

        if res['success']:
            df.loc[index, 'm_veg'] = res['x'][0]
            df.loc[index, 'm_soil'] = res['x'][1]

    str_static_p = \
        ', '.join("%s: %r" % t for t in locals().items())

    str_static_p += ",\nm_veg_x0 = {:.2f}, m_soil_x0 = {:.2f}".format(
        m_veg_x0, m_soil_x0)

    ismn_file = os.path.join(
        os.path.split(os.path.abspath(__file__))[0], 'data',
        'ARM_ARM_Larned_sm_0.050000_0.050000_Water-Matric-Potential-Sensor-229L-W_20090101_20140527.stm')
    ismn_data = ismn_readers.read_data(ismn_file)
    insitu = pd.DataFrame(ismn_data.data['soil moisture']).rename(
        columns={'soil moisture': 'insitu'})
    gldas = pd.read_csv(
        os.path.join(os.path.split(os.path.abspath(__file__))[0],
                     'data', 'GLDAS_737602.csv'),
        parse_dates=True, index_col=0)
    gldas.rename(columns={'086_L1': 'gldas'}, inplace=True)
    gldas = pd.DataFrame(gldas['gldas']) / 100.0
    ascat = pd.DataFrame(df['m_soil']).rename(columns={'m_soil': 'ascat'})

    matched = temp_match.matching(ascat, insitu, gldas)

    if rescaling is not None:
        scaled = scaling.scale(matched, rescaling, reference_index=1)
    else:
        scaled = matched

    metrics = OrderedDict()
    metrics['bias'] = df_metrics.bias(scaled)
    metrics['pearson'] = df_metrics.pearsonr(scaled)
    metrics['spearman'] = df_metrics.spearmanr(scaled)
    metrics['ubrmsd'] = df_metrics.rmsd(scaled)
    metrics['std_ratio'] = df_std_ratio(scaled)
    tcol_error = df_metrics.tcol_error(scaled)._asdict()

    ts_title = "Soil moisture. "
    if rescaling is not None:
        ts_title = ' '.join([ts_title, 'Rescaling: %s.' % rescaling])
        rmsd_title = 'unbiased RMSD'
    else:
        ts_title = ' '.join([ts_title, 'No rescaling.'])
        rmsd_title = 'RMSD'


    axes = scaled.plot(title=ts_title, figsize=(18, 8))
    plt.legend()

    # these are matplotlib.patch.Patch properties
    props = dict(facecolor='white', alpha=0)

    columns = ('ascat-insitu', 'ascat-gldas', 'insitu-gldas')
    row_labels = ['bias', 'pearson R', 'spearman rho', rmsd_title, 'stddev ratio']
    cell_text = []
    for metric in metrics:
        metric_values = metrics[metric]
        if isinstance(metric_values, tuple):
            metric_values = metric_values[0]
        metric_values = metric_values._asdict()
        cell_text.append(["%.2f" % metric_values['ascat_and_insitu'],
                          "%.2f" % metric_values['ascat_and_gldas'],
                          "%.2f" % metric_values['insitu_and_gldas']])


    table = plt.table(
              cellText=cell_text,
              colLabels=columns,
              colWidths=[0.1, 0.1, 0.1],
              rowLabels=row_labels, loc='bottom',
              bbox=(0.2, -0.5, 0.5, 0.3))

    tcol_table = plt.table(
              cellText=[["%.2f" % tcol_error['ascat'],
                         "%.2f" % tcol_error['gldas'],
                         "%.2f" % tcol_error['insitu']]],
              colLabels=('ascat      ', 'gldas      ', 'insitu      '),
              colWidths=[0.1, 0.1, 0.1],
              rowLabels=['Triple collocation error'], loc='bottom',
              bbox=(0.2, -0.6, 0.5, 0.1))

    plt.subplots_adjust(left=0.08, bottom=0.35, right=0.85)
    plt.draw()
    if y_axis_range is not None:
        axes.set_ylim(y_axis_range)

    params['m_veg_x0'] = m_veg_x0
    params['m_soil_x0'] = m_soil_x0

    infotext = []
    for label in sorted(param_should):
        infotext.append('%s = %s %s' % (label, params[label], unit_dict[label]))

    infotext = '\n'.join(infotext)

    # place a text box in upper left in axes coords
    axes.text(1.03, 1, infotext, transform=axes.transAxes, fontsize=12,
              verticalalignment='top', bbox=props)

    axes = scatter_matrix(scaled)
    axes.flat[0].figure.suptitle(ts_title)

    # draw the 1:1 line (drawn here regardless of rescaling)
    for j, ax in enumerate(axes.flatten()):
        if y_axis_range is not None:
            ax.set_xlim(y_axis_range)

        if np.remainder(j + 1, 3 + 1) != 1:
            if y_axis_range is not None:
                ax.set_ylim(y_axis_range)
            min_x, max_x = ax.get_xlim()
            min_y, max_y = ax.get_ylim()
            # find minimum lower left coordinate and maximum upper right
            min_ll = min([min_x, min_y])
            max_ur = max([max_x, max_y])
            ax.plot([min_ll, max_ur], [min_ll, max_ur], '--', c='0.6')
Example #19
    def perform_validation(self, df_dict, gpi_info):
        """
        Perform the validation for one grid point index and return the
        matched datasets as well as the calculated metrics.

        Parameters
        ----------
        df_dict: dict of pandas.DataFrames
            DataFrames read by the data readers for each dataset
        gpi_info: tuple
            tuple of at least (gpi, lon, lat)

        Returns
        -------
        matched_n: dict of pandas.DataFrames
            temporally matched data stored by (n, k) tuples
        results: dict
            Dictionary of calculated metrics stored by dataset combination tuples.
        used_data: dict
            The DataFrame used for calculation of each set of metrics.
        """
        results = {}
        used_data = {}
        matched_n = {}

        if self.masking_dm is not None:
            ref_df = df_dict[self.temporal_ref]
            masked_ref_df = self.mask_dataset(ref_df, gpi_info)
            if len(masked_ref_df) == 0:
                return matched_n, results, used_data

            df_dict[self.temporal_ref] = masked_ref_df

        matched_n = self.temporal_match_datasets(df_dict)

        for n, k in self.metrics_c:
            n_matched_data = matched_n[(n, k)]
            if len(n_matched_data) == 0:
                continue
            result_names = get_result_names(self.data_manager.ds_dict,
                                            self.temporal_ref,
                                            n=k)
            for data, result_key in self.k_datasets_from(
                    n_matched_data, result_names):

                if len(data) == 0:
                    continue
                # at this stage we can drop the column multiindex and just use
                # the dataset name
                data.columns = data.columns.droplevel(level=1)
                # Rename the columns to 'ref', 'k1', 'k2', ...
                rename_dict = {}
                f = lambda x: "k{}".format(x) if x > 0 else 'ref'
                for i, r in enumerate(result_key):
                    rename_dict[r[0]] = f(i)
                data.rename(columns=rename_dict, inplace=True)

                if self.scaling is not None:
                    # get scaling index by finding the column in the
                    # DataFrame that belongs to the scaling reference
                    scaling_index = data.columns.tolist().index(
                        rename_dict[self.scaling_ref])
                    try:
                        data = scaling.scale(data,
                                             method=self.scaling,
                                             reference_index=scaling_index)
                    except ValueError:
                        continue

                if result_key not in results.keys():
                    results[result_key] = []

                metrics_calculator = self.metrics_c[(n, k)]
                used_data[result_key] = data
                metrics = metrics_calculator(data, gpi_info)
                results[result_key].append(metrics)

        return matched_n, results, used_data
Example #20
    def calc(self, job):
        """
        Takes either a cell or a gpi_info tuple and performs the validation.

        Parameters
        ----------
        job : object
            Job of type that self.get_processing_jobs() returns.

        Returns
        -------
        compact_results : dict of dicts
            Keys: result names, combinations of
                  (referenceDataset.column, otherDataset.column)
            Values: dict containing the elements returned by metrics_calculator
        """
        result_names = self.data_manager.get_results_names()
        results = {}

        if self.cell_based_jobs:
            process_gpis, process_lons, process_lats = self.data_manager.\
                reference_grid.grid_points_for_cell(job)
        else:
            process_gpis, process_lons, process_lats = [
                job[0]], [job[1]], [job[2]]

        for gpi_info in zip(process_gpis, process_lons, process_lats):
            # if processing is cell based gpi_metainfo is limited to gpi, lon,
            # lat at the moment
            if self.cell_based_jobs:
                gpi_meta = gpi_info
            else:
                gpi_meta = job

            ref_dataframe = self.data_manager.read_reference(gpi_info[0])
            # if no reference data available continue with the next gpi
            if ref_dataframe is None:
                continue

            other_dataframes = {}
            for other_name in self.data_manager.other_name:
                grids_compatible = self.data_manager.datasets[
                    other_name]['grids_compatible']
                if grids_compatible:
                    other_dataframe = self.data_manager.read_other(
                        other_name, gpi_info[0])
                elif self.luts[other_name] is not None:
                    other_gpi = self.luts[other_name][gpi_info[0]]
                    if other_gpi == -1:
                        continue
                    other_dataframe = self.data_manager.read_other(
                        other_name, other_gpi)
                else:
                    other_dataframe = self.data_manager.read_other(
                        other_name, gpi_info[1], gpi_info[2])

                if other_dataframe is not None:
                    other_dataframes[other_name] = other_dataframe

            # if no other data available continue with the next gpi
            if len(other_dataframes) == 0:
                continue

            joined_data = {}
            for other in other_dataframes.keys():
                joined = self.temp_matching(ref_dataframe,
                                            other_dataframes[other])

                if len(joined) != 0:
                    joined_data[other] = joined

            if len(joined_data) == 0:
                continue

            # compute results for each combination of (ref, other) columns
            rescaled_data = {}
            for result in result_names:
                ref_col = result[0].split('.')[1]
                other_col = result[1].split('.')[1]
                other_name = result[1].split('.')[0]

                try:
                    data = joined_data[other_name][
                        [ref_col, other_col]].dropna()
                except KeyError:
                    continue

                data.rename(
                    columns={ref_col: 'ref', other_col: 'other'}, inplace=True)

                if len(data) == 0:
                    continue

                if self.scaling is not None:
                    try:
                        data = scaling.scale(
                            data, method=self.scaling,
                            reference_index=self.scale_to_index)
                        rescaled_data[other_name] = data
                    except ValueError:
                        continue

                if result not in results.keys():
                    results[result] = []

                results[result].append(self.calc_metrics(data, gpi_meta))

        compact_results = {}
        for key in results.keys():
            compact_results[key] = {}
            for field_name in results[key][0].keys():
                entries = []
                for result in results[key]:
                    entries.append(result[field_name][0])
                compact_results[key][field_name] = \
                    np.array(entries, dtype=results[key][0][field_name].dtype)

        if self.data_postproc is not None:
            self.data_postproc(compact_results, rescaled_data)

        return compact_results
Example #21
    def calc_metrics(self, data, gpi_info):
        """
        Calculate Triple Collocation metrics

        Parameters
        ----------
        data : pd.DataFrame
            with >2 columns; the first column is the reference dataset
            named 'ref', the other columns are the data sets to compare
            against, named 'other_i'
        gpi_info : tuple
            of (gpi, lon, lat)

        Notes
        -----
        Kendall tau calculation is optional at the moment
        because the scipy implementation is very slow, which is
        problematic for global comparisons
        """

        dataset = copy.deepcopy(self.result_template)

        dataset['gpi'][0] = gpi_info[0]
        dataset['lon'][0] = gpi_info[1]
        dataset['lat'][0] = gpi_info[2]

        if self.metadata_template is not None:
            for key, value in self.metadata_template.items():
                dataset[key][0] = gpi_info[3][key]

        # number of observations
        subset = np.ones(len(data), dtype=bool)

        n_obs = subset.sum()
        if n_obs < self.min_obs:
            return dataset

        dataset['n_obs'][0] = n_obs

        # calculate Pearson correlation
        pearson_R, pearson_p = df_metrics.pearsonr(data)
        pearson_R, pearson_p = pearson_R._asdict(), pearson_p._asdict()
        # calculate Spearman correlation
        spea_rho, spea_p = df_metrics.spearmanr(data)
        spea_rho, spea_p = spea_rho._asdict(), spea_p._asdict()
        # calculate bias
        bias_nT = df_metrics.bias(data)
        bias_dict = bias_nT._asdict()
        # calculate RMSD
        rmsd = df_metrics.rmsd(data)
        rmsd_dict = rmsd._asdict()
        # calculate MSE
        mse, mse_corr, mse_bias, mse_var = df_metrics.mse(data)
        mse_dict = mse._asdict()
        mse_corr_dict = mse_corr._asdict()
        mse_bias_dict = mse_bias._asdict()
        mse_var_dict = mse_var._asdict()
        # calculate RSS
        rss = df_metrics.RSS(data)
        rss_dict = rss._asdict()
        # calculate ubRMSD
        # todo: we could use the TC derived scaling parameters here?
        data_scaled = scale(data, method='mean_std')
        ubRMSD_nT = df_metrics.ubrmsd(data_scaled)
        ubRMSD_dict = ubRMSD_nT._asdict()
        # calculate tau
        if self.calc_tau:
            tau, p_tau = df_metrics.kendalltau(data)
            tau_dict, p_tau_dict = tau._asdict(), p_tau._asdict()
        else:
            tau = p_tau = p_tau_dict = tau_dict = None
        # calculate TC metrics
        ref_ind = np.where(np.array(data.columns) == self.ref_name)[0][0]
        snrs, err_stds, betas = df_metrics.tcol_snr(data, ref_ind=ref_ind)
        snr_dict = self._tc_res_dict(snrs)
        err_std_dict = self._tc_res_dict(err_stds)
        beta_dict = self._tc_res_dict(betas)

        # store TC results
        for thds_name in self.thds_names:
            snr = snr_dict[thds_name]
            err_std = err_std_dict[thds_name]
            beta = beta_dict[thds_name]

            split_thds_name = thds_name.split(self.ds_names_split)
            thds_name_key = self.ds_names_split.join([
                self.ds_names_lut[split_thds_name[0]],
                self.ds_names_lut[split_thds_name[1]],
                self.ds_names_lut[split_thds_name[2]]
            ])

            for metr, res in dict(snr=snr, err_std=err_std, beta=beta).items():
                for ds, ds_res in res.items():
                    m_ds = "{}_{}".format(metr, self.ds_names_lut[ds])
                    n = '{}{}{}'.format(m_ds, self.metric_ds_split,
                                        thds_name_key)
                    if n in dataset.keys():
                        dataset[n][0] = ds_res

        # Store basic metrics results
        for tds_name in self.tds_names:
            R, p_R = pearson_R[tds_name], pearson_p[tds_name]
            rho, p_rho = spea_rho[tds_name], spea_p[tds_name]
            bias = bias_dict[tds_name]
            mse = mse_dict[tds_name]
            mse_corr = mse_corr_dict[tds_name]
            mse_bias = mse_bias_dict[tds_name]
            mse_var = mse_var_dict[tds_name]
            rmsd = rmsd_dict[tds_name]
            ubRMSD = ubRMSD_dict[tds_name]
            rss = rss_dict[tds_name]

            if tau_dict and p_tau_dict:
                tau = tau_dict[tds_name]
                p_tau = p_tau_dict[tds_name]

            split_tds_name = tds_name.split(self.ds_names_split)
            tds_name_key = self.ds_names_split.join([
                self.ds_names_lut[split_tds_name[0]],
                self.ds_names_lut[split_tds_name[1]]
            ])

            dataset[self.metric_ds_split.join(['R', tds_name_key])][0] = R
            dataset[self.metric_ds_split.join(['p_R', tds_name_key])][0] = p_R
            dataset[self.metric_ds_split.join(['rho', tds_name_key])][0] = rho
            dataset[self.metric_ds_split.join(['p_rho',
                                               tds_name_key])][0] = p_rho
            dataset[self.metric_ds_split.join(['BIAS',
                                               tds_name_key])][0] = bias
            dataset[self.metric_ds_split.join(['mse', tds_name_key])][0] = mse
            dataset[self.metric_ds_split.join(['mse_corr',
                                               tds_name_key])][0] = mse_corr
            dataset[self.metric_ds_split.join(['mse_bias',
                                               tds_name_key])][0] = mse_bias
            dataset[self.metric_ds_split.join(['mse_var',
                                               tds_name_key])][0] = mse_var
            dataset[self.metric_ds_split.join(['RMSD',
                                               tds_name_key])][0] = rmsd
            dataset[self.metric_ds_split.join(['urmsd',
                                               tds_name_key])][0] = ubRMSD
            dataset[self.metric_ds_split.join(['RSS', tds_name_key])][0] = rss

            if self.calc_tau:
                dataset[self.metric_ds_split.join(['tau',
                                                   tds_name_key])][0] = tau
                dataset[self.metric_ds_split.join(['p_tau',
                                                   tds_name_key])][0] = p_tau

        return dataset
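A note on the ubRMSD step above: mean/std matching removes the bias (and the variance difference) between the columns, so the RMSD of the scaled data is an unbiased RMSD. The standard definition, as a standalone sketch rather than the pytesmo implementation:

import numpy as np

def ubrmsd(x, y):
    # unbiased RMSD: RMSD after removing each series' mean
    return np.sqrt(np.mean(((x - np.mean(x)) - (y - np.mean(y))) ** 2))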
Example #22
def optimise(params,
             timespan=('2009-01', '2009-12'),
             gpi=None,
             rescaling=None):
    """
    This function optimises the parameters vegetation water content
    'm_veg', soil moisture 'm_soil' and, if specified, a third optional
    parameter. The third optional parameter can either be sand 'sand',
    clay 'clay', fractional root mean square height 'f_rms',
    stem volume 's_vol' or temperature 'temp'.

    Parameters
    ----------
    params : list of dicts
        Model parameters. At least four of the following parameters need
        to be specified if an optional parameter has been selected,
        otherwise all of them need to be specified: 'sand', 'clay',
        'f_rms', 'temp', 's_vol'
    gpi : int, optional
        Grid point index. If specified, it will read data from datapool.

    Returns
    -------
    df : pandas.DataFrame
        Optimised soil moisture, vegetation water content and, if specified,
        optional optimised parameter.
    """

    if gpi is None:
        ts_resam = pd.read_csv(os.path.join("data", "2011528_2009.csv"),
                               index_col=0,
                               parse_dates=True)[timespan[0]:timespan[1]]
        gpi = 2011528
    else:
        ts_resam = read_resam(gpi)[timespan[0]:timespan[1]]

    m_veg_x0 = params.pop('m_veg_x0')
    m_soil_x0 = params.pop('m_soil_x0')
    columns = ['m_veg', 'm_soil']

    x0 = np.array([m_veg_x0, m_soil_x0])

    df = pd.DataFrame(index=ts_resam.index, columns=columns)
    df = df.fillna(np.nan)
    # optimise  m_soil and m_veg
    for index, row in ts_resam.iterrows():

        ascat_inc = np.array(row[['incf', 'incm', 'inca']].tolist())
        ascat_sig = \
            db2lin(np.array(row[['sigf', 'sigm', 'siga']].tolist()))

        args = (ascat_inc, ascat_sig, params, '')
        res = minimize(sig_sqr_diff, x0, args=args, method='Nelder-Mead')

        if res['success']:
            df.loc[index, 'm_veg'] = res['x'][0]
            df.loc[index, 'm_soil'] = res['x'][1]

    str_static_p = \
        ', '.join("%s: %r" % t for t in locals().items())

    str_static_p += ",\nm_veg_x0 = {:.2f}, m_soil_x0 = {:.2f}".format(
        m_veg_x0, m_soil_x0)

    ismn_file = os.path.join(
        'data',
        'ARM_ARM_Larned_sm_0.050000_0.050000_Water-Matric-Potential-Sensor-229L-W_20090101_20140527.stm'
    )
    ismn_data = ismn_readers.read_data(ismn_file)
    insitu = pd.DataFrame(ismn_data.data['soil moisture']).rename(
        columns={'soil moisture': 'insitu'})
    gldas = pd.read_csv(os.path.join('data', 'GLDAS_737602.csv'),
                        parse_dates=True,
                        index_col=0)
    gldas.rename(columns={'086_L1': 'gldas'}, inplace=True)
    gldas = pd.DataFrame(gldas['gldas'])
    ascat = pd.DataFrame(df['m_soil']).rename(columns={'m_soil': 'ascat'})

    matched = temp_match.matching(ascat, insitu, gldas)

    if rescaling is not None:
        scaled = scaling.scale(matched, rescaling, reference_index=1)
    else:
        scaled = matched

    metrics = OrderedDict()
    metrics['bias'] = df_metrics.bias(scaled)
    metrics['pearson'] = df_metrics.pearsonr(scaled)
    metrics['kendall'] = df_metrics.kendalltau(scaled)
    metrics['ubrmsd'] = df_metrics.ubrmsd(scaled)
    metrics['var_ratio'] = df_var_ratio(scaled)
    tcol_error = df_metrics.tcol_error(scaled)._asdict()

    ts_title = "Soil moisture. "
    if rescaling is not None:
        ts_title = ' '.join([ts_title, 'Rescaling: %s.' % rescaling])
    else:
        ts_title = ' '.join([ts_title, 'No rescaling.'])

    axes = scaled.plot(subplots=True, title=ts_title, figsize=(18, 8))

    # these are matplotlib.patch.Patch properties
    props = dict(facecolor='white', alpha=0)

    columns = ('ascat-insitu', 'ascat-gldas', 'insitu-gldas')
    row_labels = [
        'bias', 'pearson R', 'kendall tau', 'unbiased RMSD', 'variance ratio'
    ]
    cell_text = []
    for metric in metrics:
        metric_values = metrics[metric]
        if isinstance(metric_values, tuple):
            metric_values = metric_values[0]
        metric_values = metric_values._asdict()
        cell_text.append([
            "%.2f" % metric_values['ascat_and_insitu'],
            "%.2f" % metric_values['ascat_and_gldas'],
            "%.2f" % metric_values['insitu_and_gldas']
        ])

    table = plt.table(cellText=cell_text,
                      colLabels=columns,
                      colWidths=[0.1, 0.1, 0.1],
                      rowLabels=row_labels,
                      loc='bottom',
                      bbox=(0.2, -1.25, 0.5, 0.8))

    tcol_table = plt.table(cellText=[[
        "%.2f" % tcol_error['ascat'],
        "%.2f" % tcol_error['gldas'],
        "%.2f" % tcol_error['insitu']
    ]],
                           colLabels=('ascat', 'gldas', 'insitu'),
                           colWidths=[0.1, 0.1, 0.1],
                           rowLabels=['Triple collocation error'],
                           loc='bottom',
                           bbox=(0.2, -1.65, 0.5, 0.3))

    plt.subplots_adjust(left=0.08, bottom=0.35)

    axes = scatter_matrix(scaled)
    axes.flat[0].figure.suptitle(ts_title)

    # only draw 1:1 line if scaling was applied
    if rescaling is not None:
        for j, ax in enumerate(axes.flatten()):

            if np.remainder(j + 1, 3 + 1) != 1:
                min_x, max_x = ax.get_xlim()
                min_y, max_y = ax.get_ylim()
                # find minimum lower left coordinate and maximum upper right
                min_ll = min([min_x, min_y])
                max_ur = max([max_x, max_y])
                ax.plot([min_ll, max_ur], [min_ll, max_ur], '--', c='0.6')

    return df
Example #23
    def calc_metrics(self, data, gpi_info):
        """
        calculates the desired statistics

        Parameters
        ----------
        data : pd.DataFrame
            with >2 columns; the first column is the reference dataset
            named 'ref', the other columns are the datasets to compare
            against, named 'other_i'
        gpi_info : tuple
            of (gpi, lon, lat)

        Notes
        -----
        Kendall tau calculation is optional at the moment
        because the scipy implementation is very slow, which is
        problematic for global comparisons
        """

        dataset = super(IntercomparisonMetrics,
                        self).calc_metrics(data, gpi_info)

        subset = np.ones(len(data), dtype=bool)

        n_obs = subset.sum()
        if n_obs < self.min_obs:
            return dataset

        dataset['n_obs'][0] = n_obs

        # calculate Pearson correlation
        pearson_R, pearson_p = df_metrics.pearsonr(data)
        pearson_R, pearson_p = pearson_R._asdict(), pearson_p._asdict()

        # calculate Spearman correlation
        spea_rho, spea_p = df_metrics.spearmanr(data)
        spea_rho, spea_p = spea_rho._asdict(), spea_p._asdict()

        # calculate bias
        bias_nT = df_metrics.bias(data)
        bias_dict = bias_nT._asdict()

        # calculate RMSD
        rmsd = df_metrics.rmsd(data)
        rmsd_dict = rmsd._asdict()

        # calculate MSE
        mse, mse_corr, mse_bias, mse_var = df_metrics.mse(data)
        mse_dict, mse_corr_dict, mse_bias_dict, mse_var_dict = \
            mse._asdict(), mse_corr._asdict(), mse_bias._asdict(), mse_var._asdict()

        # calculate RSS
        rss = df_metrics.RSS(data)
        rss_dict = rss._asdict()

        # calculate tau
        if self.calc_tau:
            tau, p_tau = df_metrics.kendalltau(data)
            tau_dict, p_tau_dict = tau._asdict(), p_tau._asdict()
        else:
            tau = p_tau = p_tau_dict = tau_dict = None

        # No extra scaling is performed here.
        # always scale for ubRMSD with mean std
        # calculate ubRMSD
        data_scaled = scale(data, method='mean_std')
        ubRMSD_nT = df_metrics.ubrmsd(data_scaled)
        ubRMSD_dict = ubRMSD_nT._asdict()

        for tds_name in self.tds_names:
            R, p_R = pearson_R[tds_name], pearson_p[tds_name]
            rho, p_rho = spea_rho[tds_name], spea_p[tds_name]
            bias = bias_dict[tds_name]
            mse = mse_dict[tds_name]
            mse_corr = mse_corr_dict[tds_name]
            mse_bias = mse_bias_dict[tds_name]
            mse_var = mse_var_dict[tds_name]
            rmsd = rmsd_dict[tds_name]
            ubRMSD = ubRMSD_dict[tds_name]
            rss = rss_dict[tds_name]

            if tau_dict and p_tau_dict:
                tau = tau_dict[tds_name]
                p_tau = p_tau_dict[tds_name]

            split_tds_name = tds_name.split(self.ds_names_split)
            tds_name_key = self.ds_names_split.join([
                self.ds_names_lut[split_tds_name[0]],
                self.ds_names_lut[split_tds_name[1]]
            ])

            dataset[self.metric_ds_split.join(['R', tds_name_key])][0] = R
            dataset[self.metric_ds_split.join(['p_R', tds_name_key])][0] = p_R
            dataset[self.metric_ds_split.join(['rho', tds_name_key])][0] = rho
            dataset[self.metric_ds_split.join(['p_rho',
                                               tds_name_key])][0] = p_rho
            dataset[self.metric_ds_split.join(['BIAS',
                                               tds_name_key])][0] = bias
            dataset[self.metric_ds_split.join(['mse', tds_name_key])][0] = mse
            dataset[self.metric_ds_split.join(['mse_corr',
                                               tds_name_key])][0] = mse_corr
            dataset[self.metric_ds_split.join(['mse_bias',
                                               tds_name_key])][0] = mse_bias
            dataset[self.metric_ds_split.join(['mse_var',
                                               tds_name_key])][0] = mse_var
            dataset[self.metric_ds_split.join(['RMSD',
                                               tds_name_key])][0] = rmsd
            dataset[self.metric_ds_split.join(['urmsd',
                                               tds_name_key])][0] = ubRMSD
            dataset[self.metric_ds_split.join(['RSS', tds_name_key])][0] = rss

            if self.calc_tau:
                dataset[self.metric_ds_split.join(['tau',
                                                   tds_name_key])][0] = tau
                dataset[self.metric_ds_split.join(['p_tau',
                                                   tds_name_key])][0] = p_tau

        return dataset
Example #24
    def calc_metrics(self, data, gpi_info):
        """
        calculates the desired statistics

        Parameters
        ----------
        data : pandas.DataFrame
            with 3 columns; the first column is the reference dataset
            named 'ref', the second and third columns are the datasets
            to compare against, named 'k1' and 'k2'
        gpi_info : tuple
            Grid point info (i.e. gpi, lon, lat)
        """
        dataset = copy.deepcopy(self.result_template)

        dataset['gpi'][0] = gpi_info[0]
        dataset['lon'][0] = gpi_info[1]
        dataset['lat'][0] = gpi_info[2]

        for season in self.seasons:

            if season != 'ALL':
                subset = self.month_to_season[data.index.month] == season
            else:
                subset = np.ones(len(data), dtype=bool)

            # number of observations
            n_obs = subset.sum()
            if n_obs < 10:
                continue
            dataset['{:}_n_obs'.format(season)][0] = n_obs

            # get single dataset metrics
            # calculate SNR
            x = data[self.df_columns[0]].values[subset]
            y = data[self.df_columns[1]].values[subset]
            z = data[self.df_columns[2]].values[subset]

            snr, err, beta = metrics.tcol_snr(x, y, z)

            for i, name in enumerate(self.ds_names):
                dataset['{:}_{:}_snr'.format(name, season)][0] = snr[i]
                dataset['{:}_{:}_err_var'.format(name, season)][0] = err[i]
                dataset['{:}_{:}_beta'.format(name, season)][0] = beta[i]

            # calculate Pearson correlation
            pearson_R, pearson_p = df_metrics.pearsonr(data)
            pearson_R = pearson_R._asdict()
            pearson_p = pearson_p._asdict()

            # calculate Spearman correlation
            spea_rho, spea_p = df_metrics.spearmanr(data)
            spea_rho = spea_rho._asdict()
            spea_p = spea_p._asdict()

            # scale data to reference in order to calculate absolute metrics
            data_scaled = scale(data, method='min_max')

            # calculate bias
            bias_nT = df_metrics.bias(data_scaled)
            bias_dict = bias_nT._asdict()

            # calculate ubRMSD
            ubRMSD_nT = df_metrics.ubrmsd(data_scaled)
            ubRMSD_dict = ubRMSD_nT._asdict()

            for tds_name in self.tds_names:
                R = pearson_R[tds_name]
                p_R = pearson_p[tds_name]
                rho = spea_rho[tds_name]
                p_rho = spea_p[tds_name]
                bias = bias_dict[tds_name]
                ubRMSD = ubRMSD_dict[tds_name]

                split_tds_name = tds_name.split('_and_')
                tds_name_key = "{:}_{:}".format(self.ds_names_lut[
                    split_tds_name[0]],
                    self.ds_names_lut[
                    split_tds_name[1]])

                dataset['{:}_{:}_R'.format(tds_name_key, season)][0] = R
                dataset['{:}_{:}_p_R'.format(tds_name_key, season)][0] = p_R
                dataset['{:}_{:}_rho'.format(tds_name_key, season)][0] = rho
                dataset['{:}_{:}_p_rho'.format(tds_name_key, season)][0] = \
                    p_rho
                dataset['{:}_{:}_bias'.format(tds_name_key, season)][0] = bias
                dataset['{:}_{:}_ubrmsd'.format(tds_name_key, season)][0] = \
                    ubRMSD

        return dataset