def calc_metrics(self, data, gpi_info):
    """
    Calculate the pairwise comparison statistics for one grid point.

    Parameters
    ----------
    data : pd.DataFrame
        with >2 columns, the first column is the reference dataset
        named 'ref', other columns are the datasets to compare against
        named 'other_i'
    gpi_info : tuple
        of (gpi, lon, lat)

    Returns
    -------
    dataset : object returned by the parent class ``calc_metrics``
        Element 0 of every pairwise metric entry is filled in. If
        ``data`` has fewer than ``self.min_obs`` rows it is returned
        without any metric being written.

    Notes
    -----
    Kendall tau calculation is optional at the moment because the scipy
    implementation is very slow, which is problematic for global
    comparisons.
    """
    dataset = super(IntercomparisonMetrics, self).calc_metrics(data, gpi_info)

    # every row counts as an observation, no mask is needed to count them
    n_obs = len(data)
    if n_obs < self.min_obs:
        return dataset

    dataset['n_obs'][0] = n_obs

    pearson_R, pearson_p = df_metrics.pearsonr(data)
    spea_rho, spea_p = df_metrics.spearmanr(data)
    bias = df_metrics.bias(data)
    rmsd = df_metrics.rmsd(data)
    mse, mse_corr, mse_bias, mse_var = df_metrics.mse(data)
    rss = df_metrics.RSS(data)

    if self.calc_tau:
        tau, p_tau = df_metrics.kendalltau(data)

    # ubRMSD is the only metric that is computed on scaled data; the
    # scaling method is always 'mean_std'
    ubrmsd = df_metrics.ubrmsd(scale(data, method='mean_std'))

    # (metric name used in the result key, values keyed by dataset pair)
    per_pair = [
        ('R', pearson_R._asdict()),
        ('p_R', pearson_p._asdict()),
        ('rho', spea_rho._asdict()),
        ('p_rho', spea_p._asdict()),
        ('BIAS', bias._asdict()),
        ('mse', mse._asdict()),
        ('mse_corr', mse_corr._asdict()),
        ('mse_bias', mse_bias._asdict()),
        ('mse_var', mse_var._asdict()),
        ('RMSD', rmsd._asdict()),
        ('urmsd', ubrmsd._asdict()),
        ('RSS', rss._asdict()),
    ]
    if self.calc_tau:
        per_pair.append(('tau', tau._asdict()))
        per_pair.append(('p_tau', p_tau._asdict()))

    for tds_name in self.tds_names:
        # translate the two dataset names of the pair through the lookup
        # table before building the result keys
        split_tds_name = tds_name.split(self.ds_names_split)
        tds_name_key = self.ds_names_split.join([
            self.ds_names_lut[split_tds_name[0]],
            self.ds_names_lut[split_tds_name[1]],
        ])

        for metric, values in per_pair:
            key = self.metric_ds_split.join([metric, tds_name_key])
            dataset[key][0] = values[tds_name]

    return dataset
def calc_metrics(self, data, gpi_info):
    """
    Calculate Triple Collocation and pairwise metrics for one grid point.

    Parameters
    ----------
    data : pd.DataFrame
        with >2 columns, the first column is the reference dataset
        named 'ref', other columns are the data sets to compare against
        named 'other_i'
    gpi_info : tuple
        of (gpi, lon, lat). If ``self.metadata_template`` is not None,
        a fourth element is read that maps every key of the metadata
        template to its value.

    Returns
    -------
    dataset : deep copy of ``self.result_template``
        Element 0 of the entries is filled in. If ``data`` has fewer
        than ``self.min_obs`` rows only gpi, lon, lat and the metadata
        entries are set.

    Notes
    -----
    Kendall tau calculation is optional at the moment because the scipy
    implementation is very slow, which is problematic for global
    comparisons.
    """
    dataset = copy.deepcopy(self.result_template)

    dataset['gpi'][0] = gpi_info[0]
    dataset['lon'][0] = gpi_info[1]
    dataset['lat'][0] = gpi_info[2]

    if self.metadata_template is not None:
        for key in self.metadata_template:
            dataset[key][0] = gpi_info[3][key]

    # every row counts as an observation, no mask is needed to count them
    n_obs = len(data)
    if n_obs < self.min_obs:
        return dataset

    dataset['n_obs'][0] = n_obs

    pearson_R, pearson_p = df_metrics.pearsonr(data)
    spea_rho, spea_p = df_metrics.spearmanr(data)
    bias = df_metrics.bias(data)
    rmsd = df_metrics.rmsd(data)
    mse, mse_corr, mse_bias, mse_var = df_metrics.mse(data)
    rss = df_metrics.RSS(data)

    # ubRMSD is computed on data scaled with the 'mean_std' method
    # todo: we could use the TC derived scaling parameters here?
    ubrmsd = df_metrics.ubrmsd(scale(data, method='mean_std'))

    if self.calc_tau:
        tau, p_tau = df_metrics.kendalltau(data)

    # calculate TC metrics, the reference is located by its column name
    ref_ind = np.where(np.array(data.columns) == self.ref_name)[0][0]
    snrs, err_stds, betas = df_metrics.tcol_snr(data, ref_ind=ref_ind)
    tc_results = [
        ('snr', self._tc_res_dict(snrs)),
        ('err_std', self._tc_res_dict(err_stds)),
        ('beta', self._tc_res_dict(betas)),
    ]

    # store TC results
    for thds_name in self.thds_names:
        per_metric = [(metr, res[thds_name]) for metr, res in tc_results]

        split_thds_name = thds_name.split(self.ds_names_split)
        thds_name_key = self.ds_names_split.join([
            self.ds_names_lut[split_thds_name[0]],
            self.ds_names_lut[split_thds_name[1]],
            self.ds_names_lut[split_thds_name[2]],
        ])

        for metr, res in per_metric:
            for ds, ds_res in res.items():
                m_ds = "{}_{}".format(metr, self.ds_names_lut[ds])
                n = '{}{}{}'.format(m_ds, self.metric_ds_split,
                                    thds_name_key)
                # results without a matching entry in the template are
                # skipped
                if n in dataset:
                    dataset[n][0] = ds_res

    # store basic metrics results
    # (metric name used in the result key, values keyed by dataset pair)
    per_pair = [
        ('R', pearson_R._asdict()),
        ('p_R', pearson_p._asdict()),
        ('rho', spea_rho._asdict()),
        ('p_rho', spea_p._asdict()),
        ('BIAS', bias._asdict()),
        ('mse', mse._asdict()),
        ('mse_corr', mse_corr._asdict()),
        ('mse_bias', mse_bias._asdict()),
        ('mse_var', mse_var._asdict()),
        ('RMSD', rmsd._asdict()),
        ('urmsd', ubrmsd._asdict()),
        ('RSS', rss._asdict()),
    ]
    if self.calc_tau:
        per_pair.append(('tau', tau._asdict()))
        per_pair.append(('p_tau', p_tau._asdict()))

    for tds_name in self.tds_names:
        split_tds_name = tds_name.split(self.ds_names_split)
        tds_name_key = self.ds_names_split.join([
            self.ds_names_lut[split_tds_name[0]],
            self.ds_names_lut[split_tds_name[1]],
        ])

        for metric, values in per_pair:
            key = self.metric_ds_split.join([metric, tds_name_key])
            dataset[key][0] = values[tds_name]

    return dataset
def calc_metrics(self, data, gpi_info):
    """
    Compute seasonal triple collocation and pairwise statistics.

    Parameters
    ----------
    data : pandas.DataFrame
        with 3 columns, the first column is the reference dataset
        named 'ref', the second and third column are the datasets to
        compare against named 'k1 and k2'
    gpi_info : tuple
        Grid point info (i.e. gpi, lon, lat)

    Returns
    -------
    dataset : object returned by the parent class ``calc_metrics``
        Element 0 of the seasonal entries is filled in for every
        season that has at least ``self.min_obs`` observations.
    """
    dataset = super(HSAF_Metrics, self).calc_metrics(data, gpi_info)

    tc_templates = ('{:}_{:}_snr', '{:}_{:}_err_var', '{:}_{:}_beta')

    for season in self.seasons:
        # 'ALL' selects every row, any other season selects the rows
        # whose month maps to that season
        if season == 'ALL':
            subset = np.ones(len(data), dtype=bool)
        else:
            subset = self.month_to_season[data.index.month] == season

        n_obs = subset.sum()
        if n_obs < self.min_obs:
            continue
        dataset['{:}_n_obs'.format(season)][0] = n_obs

        # single dataset metrics from metrics.tcol_snr, using only the
        # rows of the current season
        x, y, z = (data[self.df_columns[pos]].values[subset]
                   for pos in range(3))
        tc_values = metrics.tcol_snr(x, y, z)
        for i, name in enumerate(self.ds_names):
            for template, values in zip(tc_templates, tc_values):
                dataset[template.format(name, season)][0] = values[i]

        # NOTE(review): the pairwise metrics below are computed from the
        # complete ``data`` and not from the rows selected by ``subset``,
        # so every season passing the n_obs check receives the same
        # values - confirm whether this is intended.
        r_nt, p_r_nt = df_metrics.pearsonr(data)
        r_by_pair, p_r_by_pair = r_nt._asdict(), p_r_nt._asdict()

        rho_nt, p_rho_nt = df_metrics.spearmanr(data)
        rho_by_pair, p_rho_by_pair = rho_nt._asdict(), p_rho_nt._asdict()

        # bias and ubRMSD are computed on data scaled with 'min_max'
        data_scaled = scale(data, method='min_max')
        bias_by_pair = df_metrics.bias(data_scaled)._asdict()
        ubrmsd_by_pair = df_metrics.ubrmsd(data_scaled)._asdict()

        for tds_name in self.tds_names:
            pair_results = (
                ('{:}_{:}_R', r_by_pair[tds_name]),
                ('{:}_{:}_p_R', p_r_by_pair[tds_name]),
                ('{:}_{:}_rho', rho_by_pair[tds_name]),
                ('{:}_{:}_p_rho', p_rho_by_pair[tds_name]),
                ('{:}_{:}_bias', bias_by_pair[tds_name]),
                ('{:}_{:}_ubrmsd', ubrmsd_by_pair[tds_name]),
            )

            parts = tds_name.split('_and_')
            tds_name_key = "{:}_{:}".format(self.ds_names_lut[parts[0]],
                                            self.ds_names_lut[parts[1]])

            for template, value in pair_results:
                dataset[template.format(tds_name_key, season)][0] = value

    return dataset
def calc_metrics(self, data, gpi_info):
    """
    Compute pairwise statistics and triple collocation results.

    Parameters
    ----------
    data : pandas.DataFrame
        with >2 columns, the first column is the reference dataset
        named 'ref', other columns are the data sets to compare against
        named 'other_i'
    gpi_info : tuple
        of (gpi, lon, lat)

    Returns
    -------
    dataset : deep copy of ``self.result_template``
        Element 0 of the entries is filled in. With fewer than 10
        observations only gpi, lon and lat are set.

    Notes
    -----
    Kendall tau calculation is optional at the moment because the scipy
    implementation is very slow, which is problematic for global
    comparisons.
    """
    dataset = copy.deepcopy(self.result_template)
    for pos, field in enumerate(('gpi', 'lon', 'lat')):
        dataset[field][0] = gpi_info[pos]

    # all rows are used; the mask is needed again for the TC inputs
    subset = np.ones(len(data), dtype=bool)
    n_obs = subset.sum()
    if n_obs < 10:
        return dataset
    dataset['n_obs'][0] = n_obs

    r_nt, p_r_nt = df_metrics.pearsonr(data)
    r_by_pair, p_r_by_pair = r_nt._asdict(), p_r_nt._asdict()

    rho_nt, p_rho_nt = df_metrics.spearmanr(data)
    rho_by_pair, p_rho_by_pair = rho_nt._asdict(), p_rho_nt._asdict()

    bias_by_pair = df_metrics.bias(data)._asdict()
    rmsd_by_pair = df_metrics.rmsd(data)._asdict()

    mse_parts = df_metrics.mse(data)
    mse_nt, mse_corr_nt, mse_bias_nt, mse_var_nt = mse_parts
    mse_by_pair = mse_nt._asdict()
    mse_corr_by_pair = mse_corr_nt._asdict()
    mse_bias_by_pair = mse_bias_nt._asdict()
    mse_var_by_pair = mse_var_nt._asdict()

    if self.calc_tau:
        tau, p_tau = df_metrics.kendalltau(data)
        tau_by_pair, p_tau_by_pair = tau._asdict(), p_tau._asdict()
    else:
        tau = p_tau = tau_by_pair = p_tau_by_pair = None

    # ubRMSD is computed on the unscaled data
    ubrmsd_by_pair = df_metrics.ubrmsd(data)._asdict()

    # single dataset metrics from metrics.tcol_snr
    x, y, z = (data[self.df_columns[pos]].values[subset]
               for pos in range(3))
    snr, err, beta = metrics.tcol_snr(x, y, z)
    tc_results = (
        ('{:}_snr', snr),
        ('{:}_err_var', err),
        ('{:}_beta', beta),
    )
    for i, name in enumerate(self.ds_names):
        for template, values in tc_results:
            dataset[template.format(name)][0] = values[i]

    for tds_name in self.tds_names:
        pair_results = [
            ('R_between_{:}', r_by_pair[tds_name]),
            ('p_R_between_{:}', p_r_by_pair[tds_name]),
            ('rho_between_{:}', rho_by_pair[tds_name]),
            ('p_rho_between_{:}', p_rho_by_pair[tds_name]),
            ('bias_between_{:}', bias_by_pair[tds_name]),
            ('mse_between_{:}', mse_by_pair[tds_name]),
            ('mse_corr_between_{:}', mse_corr_by_pair[tds_name]),
            ('mse_bias_between_{:}', mse_bias_by_pair[tds_name]),
            ('mse_var_between_{:}', mse_var_by_pair[tds_name]),
            ('rmsd_between_{:}', rmsd_by_pair[tds_name]),
            ('ubRMSD_between_{:}', ubrmsd_by_pair[tds_name]),
        ]

        if tau_by_pair and p_tau_by_pair:
            tau = tau_by_pair[tds_name]
            p_tau = p_tau_by_pair[tds_name]

        parts = tds_name.split('_and_')
        tds_name_key = "{:}_{:}".format(self.ds_names_lut[parts[0]],
                                        self.ds_names_lut[parts[1]])

        for template, value in pair_results:
            dataset[template.format(tds_name_key)][0] = value

        if self.calc_tau:
            dataset['tau_between_{:}'.format(tds_name_key)][0] = tau
            dataset['p_tau_between_{:}'.format(tds_name_key)][0] = p_tau

    return dataset
def optimise(params, timespan=('2009-01', '2009-12'), gpi=None,
             rescaling=None):
    """
    Optimise vegetation water content 'm_veg' and soil moisture 'm_soil'
    for every observation and plot the result against reference data.

    Parameters
    ----------
    params : dict
        Model parameters. Must contain the start values 'm_veg_x0' and
        'm_soil_x0'; all remaining entries are handed on unchanged to
        ``sig_sqr_diff``. The dict passed in is not modified.
    timespan : tuple of str, optional
        Start and end label used to slice the time series.
    gpi : int, optional
        Grid point index. If specified, the time series is read with
        ``read_resam(gpi)``, otherwise from 'data/2011528_2009.csv'.
    rescaling : optional
        Scaling method handed to ``scaling.scale`` (reference_index=1).
        None (default) disables rescaling.

    Returns
    -------
    df : pandas.DataFrame
        Optimised 'm_veg' and 'm_soil' per time stamp. Rows for which
        the optimisation did not succeed stay NaN.

    Notes
    -----
    As a side effect a time series plot with two metric tables and a
    scatter matrix are drawn.
    """
    if gpi is None:
        ts_resam = pd.read_csv(os.path.join("data", "2011528_2009.csv"),
                               index_col=0,
                               parse_dates=True)[timespan[0]:timespan[1]]
    else:
        ts_resam = read_resam(gpi)[timespan[0]:timespan[1]]

    # pop from a copy, otherwise a second call with the same dict fails
    # because the start values are gone
    params = dict(params)
    m_veg_x0 = params.pop('m_veg_x0')
    m_soil_x0 = params.pop('m_soil_x0')
    x0 = np.array([m_veg_x0, m_soil_x0])

    df = pd.DataFrame(index=ts_resam.index, columns=['m_veg', 'm_soil'],
                      dtype=float)

    # optimise m_soil and m_veg
    for index, row in ts_resam.iterrows():
        ascat_inc = np.array(row[['incf', 'incm', 'inca']].tolist())
        ascat_sig = db2lin(np.array(row[['sigf', 'sigm', 'siga']].tolist()))

        args = (ascat_inc, ascat_sig, params, '')
        res = minimize(sig_sqr_diff, x0, args=args, method='Nelder-Mead')

        if res['success']:
            # .loc writes into df itself; chained df[col][index] = ...
            # may write into a temporary copy
            df.loc[index, 'm_veg'] = res['x'][0]
            df.loc[index, 'm_soil'] = res['x'][1]

    ismn_file = os.path.join(
        'data',
        'ARM_ARM_Larned_sm_0.050000_0.050000_Water-Matric-Potential-Sensor-229L-W_20090101_20140527.stm'
    )
    ismn_data = ismn_readers.read_data(ismn_file)
    insitu = pd.DataFrame(ismn_data.data['soil moisture']).rename(
        columns={'soil moisture': 'insitu'})

    gldas = pd.read_csv(os.path.join('data', 'GLDAS_737602.csv'),
                        parse_dates=True, index_col=0)
    gldas.rename(columns={'086_L1': 'gldas'}, inplace=True)
    gldas = pd.DataFrame(gldas['gldas'])

    ascat = pd.DataFrame(df['m_soil']).rename(columns={'m_soil': 'ascat'})

    matched = temp_match.matching(ascat, insitu, gldas)

    if rescaling is not None:
        scaled = scaling.scale(matched, rescaling, reference_index=1)
    else:
        scaled = matched

    metrics = OrderedDict()
    metrics['bias'] = df_metrics.bias(scaled)
    metrics['pearson'] = df_metrics.pearsonr(scaled)
    metrics['kendall'] = df_metrics.kendalltau(scaled)
    metrics['ubrmsd'] = df_metrics.ubrmsd(scaled)
    metrics['var_ratio'] = df_var_ratio(scaled)
    tcol_error = df_metrics.tcol_error(scaled)._asdict()

    ts_title = "Soil moisture. "
    if rescaling is not None:
        ts_title = ' '.join([ts_title, 'Rescaling: %s.' % rescaling])
    else:
        ts_title = ' '.join([ts_title, 'No rescaling.'])

    scaled.plot(subplots=True, title=ts_title, figsize=(18, 8))

    columns = ('ascat-insitu', 'ascat-gldas', 'insitu-gldas')
    row_labels = [
        'bias', 'pearson R', 'kendall tau', 'unbiased RMSD', 'variance ratio'
    ]

    cell_text = []
    for metric_values in metrics.values():
        # Exact type check on purpose: a plain tuple holds several
        # results of which only the first is shown. Results exposing
        # _asdict are presumably namedtuples, i.e. tuple subclasses,
        # which isinstance() would match as well.
        if type(metric_values) is tuple:
            metric_values = metric_values[0]
        metric_values = metric_values._asdict()
        cell_text.append([
            "%.2f" % metric_values['ascat_and_insitu'],
            "%.2f" % metric_values['ascat_and_gldas'],
            "%.2f" % metric_values['insitu_and_gldas']
        ])

    plt.table(cellText=cell_text,
              colLabels=columns,
              colWidths=[0.1, 0.1, 0.1],
              rowLabels=row_labels,
              loc='bottom',
              bbox=(0.2, -1.25, 0.5, 0.8))

    plt.table(cellText=[[
        "%.2f" % tcol_error['ascat'],
        "%.2f" % tcol_error['gldas'],
        "%.2f" % tcol_error['insitu']
    ]],
              colLabels=('ascat', 'gldas', 'insitu'),
              colWidths=[0.1, 0.1, 0.1],
              rowLabels=['Triple collocation error'],
              loc='bottom',
              bbox=(0.2, -1.65, 0.5, 0.3))

    plt.subplots_adjust(left=0.08, bottom=0.35)

    axes = scatter_matrix(scaled)
    axes.flat[0].figure.suptitle(ts_title)

    # only draw 1:1 line if scaling was applied
    if rescaling is not None:
        for j, ax in enumerate(axes.flatten()):
            # skips j = 0, 4, 8, ... - presumably the diagonal panels of
            # a 3x3 scatter matrix
            if np.remainder(j + 1, 3 + 1) != 1:
                min_x, max_x = ax.get_xlim()
                min_y, max_y = ax.get_ylim()

                # find minimum lower left coordinate and maximum upper
                # right
                min_ll = min(min_x, min_y)
                max_ur = max(max_x, max_y)

                ax.plot([min_ll, max_ur], [min_ll, max_ur], '--', c='0.6')

    return df
def optimise(params, timespan=('2009-01', '2009-12'), gpi=None,
             rescaling=None):
    """
    Optimise vegetation water content 'm_veg' and soil moisture 'm_soil'
    for every observation and plot the result against reference data.

    Parameters
    ----------
    params : dict
        Model parameters. Must contain the start values 'm_veg_x0' and
        'm_soil_x0'; all remaining entries are handed on unchanged to
        ``sig_sqr_diff``. The dict passed in is not modified.
    timespan : tuple of str, optional
        Start and end label used to slice the time series.
    gpi : int, optional
        Grid point index. If specified, the time series is read with
        ``read_resam(gpi)``, otherwise from 'data/2011528_2009.csv'.
    rescaling : optional
        Scaling method handed to ``scaling.scale`` (reference_index=1).
        None (default) disables rescaling.

    Returns
    -------
    df : pandas.DataFrame
        Optimised 'm_veg' and 'm_soil' per time stamp. Rows for which
        the optimisation did not succeed stay NaN.

    Notes
    -----
    As a side effect a time series plot with two metric tables and a
    scatter matrix are drawn.
    """
    if gpi is None:
        ts_resam = pd.read_csv(os.path.join("data", "2011528_2009.csv"),
                               index_col=0,
                               parse_dates=True)[timespan[0]:timespan[1]]
    else:
        ts_resam = read_resam(gpi)[timespan[0]:timespan[1]]

    # pop from a copy, otherwise a second call with the same dict fails
    # because the start values are gone
    params = dict(params)
    m_veg_x0 = params.pop('m_veg_x0')
    m_soil_x0 = params.pop('m_soil_x0')
    x0 = np.array([m_veg_x0, m_soil_x0])

    df = pd.DataFrame(index=ts_resam.index, columns=['m_veg', 'm_soil'],
                      dtype=float)

    # optimise m_soil and m_veg
    for index, row in ts_resam.iterrows():
        ascat_inc = np.array(row[['incf', 'incm', 'inca']].tolist())
        ascat_sig = db2lin(np.array(row[['sigf', 'sigm', 'siga']].tolist()))

        args = (ascat_inc, ascat_sig, params, '')
        res = minimize(sig_sqr_diff, x0, args=args, method='Nelder-Mead')

        if res['success']:
            # .loc writes into df itself; chained df[col][index] = ...
            # may write into a temporary copy
            df.loc[index, 'm_veg'] = res['x'][0]
            df.loc[index, 'm_soil'] = res['x'][1]

    ismn_file = os.path.join(
        'data',
        'ARM_ARM_Larned_sm_0.050000_0.050000_Water-Matric-Potential-Sensor-229L-W_20090101_20140527.stm'
    )
    ismn_data = ismn_readers.read_data(ismn_file)
    insitu = pd.DataFrame(ismn_data.data['soil moisture']).rename(
        columns={'soil moisture': 'insitu'})

    gldas = pd.read_csv(os.path.join('data', 'GLDAS_737602.csv'),
                        parse_dates=True, index_col=0)
    gldas.rename(columns={'086_L1': 'gldas'}, inplace=True)
    gldas = pd.DataFrame(gldas['gldas'])

    ascat = pd.DataFrame(df['m_soil']).rename(columns={'m_soil': 'ascat'})

    matched = temp_match.matching(ascat, insitu, gldas)

    if rescaling is not None:
        scaled = scaling.scale(matched, rescaling, reference_index=1)
    else:
        scaled = matched

    metrics = OrderedDict()
    metrics['bias'] = df_metrics.bias(scaled)
    metrics['pearson'] = df_metrics.pearsonr(scaled)
    metrics['kendall'] = df_metrics.kendalltau(scaled)
    metrics['ubrmsd'] = df_metrics.ubrmsd(scaled)
    metrics['var_ratio'] = df_var_ratio(scaled)
    tcol_error = df_metrics.tcol_error(scaled)._asdict()

    ts_title = "Soil moisture. "
    if rescaling is not None:
        ts_title = ' '.join([ts_title, 'Rescaling: %s.' % rescaling])
    else:
        ts_title = ' '.join([ts_title, 'No rescaling.'])

    scaled.plot(subplots=True, title=ts_title, figsize=(18, 8))

    columns = ('ascat-insitu', 'ascat-gldas', 'insitu-gldas')
    row_labels = [
        'bias', 'pearson R', 'kendall tau', 'unbiased RMSD', 'variance ratio'
    ]

    cell_text = []
    for metric_values in metrics.values():
        # Exact type check on purpose: a plain tuple holds several
        # results of which only the first is shown. Results exposing
        # _asdict are presumably namedtuples, i.e. tuple subclasses,
        # which isinstance() would match as well.
        if type(metric_values) is tuple:
            metric_values = metric_values[0]
        metric_values = metric_values._asdict()
        cell_text.append([
            "%.2f" % metric_values['ascat_and_insitu'],
            "%.2f" % metric_values['ascat_and_gldas'],
            "%.2f" % metric_values['insitu_and_gldas']
        ])

    plt.table(cellText=cell_text,
              colLabels=columns,
              colWidths=[0.1, 0.1, 0.1],
              rowLabels=row_labels,
              loc='bottom',
              bbox=(0.2, -1.25, 0.5, 0.8))

    plt.table(cellText=[[
        "%.2f" % tcol_error['ascat'],
        "%.2f" % tcol_error['gldas'],
        "%.2f" % tcol_error['insitu']
    ]],
              colLabels=('ascat', 'gldas', 'insitu'),
              colWidths=[0.1, 0.1, 0.1],
              rowLabels=['Triple collocation error'],
              loc='bottom',
              bbox=(0.2, -1.65, 0.5, 0.3))

    plt.subplots_adjust(left=0.08, bottom=0.35)

    axes = scatter_matrix(scaled)
    axes.flat[0].figure.suptitle(ts_title)

    # only draw 1:1 line if scaling was applied
    if rescaling is not None:
        for j, ax in enumerate(axes.flatten()):
            # skips j = 0, 4, 8, ... - presumably the diagonal panels of
            # a 3x3 scatter matrix
            if np.remainder(j + 1, 3 + 1) != 1:
                min_x, max_x = ax.get_xlim()
                min_y, max_y = ax.get_ylim()

                # find minimum lower left coordinate and maximum upper
                # right
                min_ll = min(min_x, min_y)
                max_ur = max(max_x, max_y)

                ax.plot([min_ll, max_ur], [min_ll, max_ur], '--', c='0.6')

    return df
def calc_metrics(self, data, gpi_info):
    """
    Compute pairwise statistics and triple collocation results.

    Parameters
    ----------
    data : pandas.DataFrame
        with >2 columns, the first column is the reference dataset
        named 'ref', other columns are the data sets to compare against
        named 'other_i'
    gpi_info : tuple
        of (gpi, lon, lat)

    Returns
    -------
    dataset : deep copy of ``self.result_template``
        Element 0 of the entries is filled in. With fewer than 10
        observations only gpi, lon and lat are set.

    Notes
    -----
    Kendall tau calculation is optional at the moment because the scipy
    implementation is very slow, which is problematic for global
    comparisons.
    """
    dataset = copy.deepcopy(self.result_template)
    for pos, field in enumerate(('gpi', 'lon', 'lat')):
        dataset[field][0] = gpi_info[pos]

    # all rows are used; the mask is needed again for the TC inputs
    subset = np.ones(len(data), dtype=bool)
    n_obs = subset.sum()
    if n_obs < 10:
        return dataset
    dataset['n_obs'][0] = n_obs

    r_nt, p_r_nt = df_metrics.pearsonr(data)
    r_by_pair, p_r_by_pair = r_nt._asdict(), p_r_nt._asdict()

    rho_nt, p_rho_nt = df_metrics.spearmanr(data)
    rho_by_pair, p_rho_by_pair = rho_nt._asdict(), p_rho_nt._asdict()

    bias_by_pair = df_metrics.bias(data)._asdict()
    rmsd_by_pair = df_metrics.rmsd(data)._asdict()

    mse_parts = df_metrics.mse(data)
    mse_nt, mse_corr_nt, mse_bias_nt, mse_var_nt = mse_parts
    mse_by_pair = mse_nt._asdict()
    mse_corr_by_pair = mse_corr_nt._asdict()
    mse_bias_by_pair = mse_bias_nt._asdict()
    mse_var_by_pair = mse_var_nt._asdict()

    if self.calc_tau:
        tau, p_tau = df_metrics.kendalltau(data)
        tau_by_pair, p_tau_by_pair = tau._asdict(), p_tau._asdict()
    else:
        tau = p_tau = tau_by_pair = p_tau_by_pair = None

    # ubRMSD is computed on the unscaled data
    ubrmsd_by_pair = df_metrics.ubrmsd(data)._asdict()

    # single dataset metrics from metrics.tcol_snr
    x, y, z = (data[self.df_columns[pos]].values[subset]
               for pos in range(3))
    snr, err, beta = metrics.tcol_snr(x, y, z)
    tc_results = (
        ('{:}_snr', snr),
        ('{:}_err_var', err),
        ('{:}_beta', beta),
    )
    for i, name in enumerate(self.ds_names):
        for template, values in tc_results:
            dataset[template.format(name)][0] = values[i]

    for tds_name in self.tds_names:
        pair_results = [
            ('R_between_{:}', r_by_pair[tds_name]),
            ('p_R_between_{:}', p_r_by_pair[tds_name]),
            ('rho_between_{:}', rho_by_pair[tds_name]),
            ('p_rho_between_{:}', p_rho_by_pair[tds_name]),
            ('bias_between_{:}', bias_by_pair[tds_name]),
            ('mse_between_{:}', mse_by_pair[tds_name]),
            ('mse_corr_between_{:}', mse_corr_by_pair[tds_name]),
            ('mse_bias_between_{:}', mse_bias_by_pair[tds_name]),
            ('mse_var_between_{:}', mse_var_by_pair[tds_name]),
            ('rmsd_between_{:}', rmsd_by_pair[tds_name]),
            ('ubRMSD_between_{:}', ubrmsd_by_pair[tds_name]),
        ]

        if tau_by_pair and p_tau_by_pair:
            tau = tau_by_pair[tds_name]
            p_tau = p_tau_by_pair[tds_name]

        parts = tds_name.split('_and_')
        tds_name_key = "{:}_{:}".format(self.ds_names_lut[parts[0]],
                                        self.ds_names_lut[parts[1]])

        for template, value in pair_results:
            dataset[template.format(tds_name_key)][0] = value

        if self.calc_tau:
            dataset['tau_between_{:}'.format(tds_name_key)][0] = tau
            dataset['p_tau_between_{:}'.format(tds_name_key)][0] = p_tau

    return dataset
def calc_metrics(self, data, gpi_info):
    """
    Compute seasonal triple collocation and pairwise statistics.

    Parameters
    ----------
    data : pandas.DataFrame
        with 3 columns, the first column is the reference dataset
        named 'ref', the second and third column are the datasets to
        compare against named 'k1 and k2'
    gpi_info : tuple
        Grid point info (i.e. gpi, lon, lat)

    Returns
    -------
    dataset : deep copy of ``self.result_template``
        gpi, lon and lat are always set. Element 0 of the seasonal
        entries is filled in for every season that has at least 10
        observations.
    """
    dataset = copy.deepcopy(self.result_template)
    for pos, field in enumerate(('gpi', 'lon', 'lat')):
        dataset[field][0] = gpi_info[pos]

    tc_templates = ('{:}_{:}_snr', '{:}_{:}_err_var', '{:}_{:}_beta')

    for season in self.seasons:
        # 'ALL' selects every row, any other season selects the rows
        # whose month maps to that season
        if season == 'ALL':
            subset = np.ones(len(data), dtype=bool)
        else:
            subset = self.month_to_season[data.index.month] == season

        n_obs = subset.sum()
        if n_obs < 10:
            continue
        dataset['{:}_n_obs'.format(season)][0] = n_obs

        # single dataset metrics from metrics.tcol_snr, using only the
        # rows of the current season
        x, y, z = (data[self.df_columns[pos]].values[subset]
                   for pos in range(3))
        tc_values = metrics.tcol_snr(x, y, z)
        for i, name in enumerate(self.ds_names):
            for template, values in zip(tc_templates, tc_values):
                dataset[template.format(name, season)][0] = values[i]

        # NOTE(review): the pairwise metrics below are computed from the
        # complete ``data`` and not from the rows selected by ``subset``,
        # so every season passing the n_obs check receives the same
        # values - confirm whether this is intended.
        r_nt, p_r_nt = df_metrics.pearsonr(data)
        r_by_pair, p_r_by_pair = r_nt._asdict(), p_r_nt._asdict()

        rho_nt, p_rho_nt = df_metrics.spearmanr(data)
        rho_by_pair, p_rho_by_pair = rho_nt._asdict(), p_rho_nt._asdict()

        # bias and ubRMSD are computed on data scaled with 'min_max'
        data_scaled = scale(data, method='min_max')
        bias_by_pair = df_metrics.bias(data_scaled)._asdict()
        ubrmsd_by_pair = df_metrics.ubrmsd(data_scaled)._asdict()

        for tds_name in self.tds_names:
            pair_results = (
                ('{:}_{:}_R', r_by_pair[tds_name]),
                ('{:}_{:}_p_R', p_r_by_pair[tds_name]),
                ('{:}_{:}_rho', rho_by_pair[tds_name]),
                ('{:}_{:}_p_rho', p_rho_by_pair[tds_name]),
                ('{:}_{:}_bias', bias_by_pair[tds_name]),
                ('{:}_{:}_ubrmsd', ubrmsd_by_pair[tds_name]),
            )

            parts = tds_name.split('_and_')
            tds_name_key = "{:}_{:}".format(self.ds_names_lut[parts[0]],
                                            self.ds_names_lut[parts[1]])

            for template, value in pair_results:
                dataset[template.format(tds_name_key, season)][0] = value

    return dataset