def calc_metrics(self, data, gpi_info):
    """
    Calculates the desired statistics.

    Parameters
    ----------
    data : pd.DataFrame
        with >2 columns, the first column is the reference dataset
        named 'ref'; other columns are the datasets to compare
        against, named 'other_i'
    gpi_info : tuple
        of (gpi, lon, lat)

    Notes
    -----
    Kendall tau calculation is optional at the moment because the
    scipy implementation is very slow, which is problematic for
    global comparisons.
    """
    dataset = super(IntercomparisonMetrics, self).calc_metrics(
        data, gpi_info)

    subset = np.ones(len(data), dtype=bool)

    n_obs = subset.sum()
    if n_obs < self.min_obs:
        return dataset
    dataset['n_obs'][0] = n_obs

    # calculate Pearson correlation
    pearson_R, pearson_p = df_metrics.pearsonr(data)
    pearson_R, pearson_p = pearson_R._asdict(), pearson_p._asdict()

    # calculate Spearman correlation
    spea_rho, spea_p = df_metrics.spearmanr(data)
    spea_rho, spea_p = spea_rho._asdict(), spea_p._asdict()

    # calculate bias
    bias_nT = df_metrics.bias(data)
    bias_dict = bias_nT._asdict()

    # calculate RMSD
    rmsd = df_metrics.rmsd(data)
    rmsd_dict = rmsd._asdict()

    # calculate MSE
    mse, mse_corr, mse_bias, mse_var = df_metrics.mse(data)
    mse_dict, mse_corr_dict, mse_bias_dict, mse_var_dict = \
        mse._asdict(), mse_corr._asdict(), mse_bias._asdict(), \
        mse_var._asdict()

    # calculate RSS
    rss = df_metrics.RSS(data)
    rss_dict = rss._asdict()

    # calculate tau
    if self.calc_tau:
        tau, p_tau = df_metrics.kendalltau(data)
        tau_dict, p_tau_dict = tau._asdict(), p_tau._asdict()
    else:
        tau = p_tau = p_tau_dict = tau_dict = None

    # No extra scaling is performed here.
    # always scale for ubRMSD with mean std
    # calculate ubRMSD
    data_scaled = scale(data, method='mean_std')
    ubRMSD_nT = df_metrics.ubrmsd(data_scaled)
    ubRMSD_dict = ubRMSD_nT._asdict()

    for tds_name in self.tds_names:
        R, p_R = pearson_R[tds_name], pearson_p[tds_name]
        rho, p_rho = spea_rho[tds_name], spea_p[tds_name]
        bias = bias_dict[tds_name]
        mse = mse_dict[tds_name]
        mse_corr = mse_corr_dict[tds_name]
        mse_bias = mse_bias_dict[tds_name]
        mse_var = mse_var_dict[tds_name]
        rmsd = rmsd_dict[tds_name]
        ubRMSD = ubRMSD_dict[tds_name]
        rss = rss_dict[tds_name]

        if tau_dict and p_tau_dict:
            tau = tau_dict[tds_name]
            p_tau = p_tau_dict[tds_name]

        split_tds_name = tds_name.split(self.ds_names_split)
        tds_name_key = self.ds_names_split.join([
            self.ds_names_lut[split_tds_name[0]],
            self.ds_names_lut[split_tds_name[1]]
        ])

        dataset[self.metric_ds_split.join(['R', tds_name_key])][0] = R
        dataset[self.metric_ds_split.join(['p_R', tds_name_key])][0] = p_R
        dataset[self.metric_ds_split.join(['rho', tds_name_key])][0] = rho
        dataset[self.metric_ds_split.join(['p_rho', tds_name_key])][0] = p_rho
        dataset[self.metric_ds_split.join(['BIAS', tds_name_key])][0] = bias
        dataset[self.metric_ds_split.join(['mse', tds_name_key])][0] = mse
        dataset[self.metric_ds_split.join(['mse_corr', tds_name_key])][0] = mse_corr
        dataset[self.metric_ds_split.join(['mse_bias', tds_name_key])][0] = mse_bias
        dataset[self.metric_ds_split.join(['mse_var', tds_name_key])][0] = mse_var
        dataset[self.metric_ds_split.join(['RMSD', tds_name_key])][0] = rmsd
        dataset[self.metric_ds_split.join(['urmsd', tds_name_key])][0] = ubRMSD
        dataset[self.metric_ds_split.join(['RSS', tds_name_key])][0] = rss

        if self.calc_tau:
            dataset[self.metric_ds_split.join(['tau', tds_name_key])][0] = tau
            dataset[self.metric_ds_split.join(['p_tau', tds_name_key])][0] = p_tau

    return dataset
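# --- Illustrative sketch, not library code: how the df_metrics helpers
# used above behave on a toy DataFrame. The column names 'ref', 'k1',
# 'k2' and the data are made up; the '<a>_and_<b>' pair naming follows
# the convention visible elsewhere in this module.

import numpy as np
import pandas as pd

import pytesmo.df_metrics as df_metrics

toy = pd.DataFrame(np.random.rand(100, 3), columns=['ref', 'k1', 'k2'])

# each helper returns one named tuple per column pair; _asdict() turns
# it into the {pair_name: value} lookup used in the loop above
R, p = df_metrics.pearsonr(toy)
print(R._asdict())  # e.g. {'ref_and_k1': ..., 'ref_and_k2': ..., 'k1_and_k2': ...}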
def calc_metrics(self, data, gpi_info):
    """
    Calculates the desired statistics.

    Parameters
    ----------
    data : pandas.DataFrame
        with >2 columns, the first column is the reference dataset
        named 'ref'; other columns are the data sets to compare
        against, named 'other_i'
    gpi_info : tuple
        of (gpi, lon, lat)

    Notes
    -----
    Kendall tau calculation is optional at the moment because the
    scipy implementation is very slow, which is problematic for
    global comparisons.
    """
    dataset = copy.deepcopy(self.result_template)

    dataset['gpi'][0] = gpi_info[0]
    dataset['lon'][0] = gpi_info[1]
    dataset['lat'][0] = gpi_info[2]

    # number of observations
    subset = np.ones(len(data), dtype=bool)

    n_obs = subset.sum()
    if n_obs < 10:
        return dataset
    dataset['n_obs'][0] = n_obs

    # calculate Pearson correlation
    pearson_R, pearson_p = df_metrics.pearsonr(data)
    pearson_R = pearson_R._asdict()
    pearson_p = pearson_p._asdict()

    # calculate Spearman correlation
    spea_rho, spea_p = df_metrics.spearmanr(data)
    spea_rho = spea_rho._asdict()
    spea_p = spea_p._asdict()

    # calculate bias
    bias_nT = df_metrics.bias(data)
    bias_dict = bias_nT._asdict()

    # calculate RMSD
    rmsd = df_metrics.rmsd(data)
    rmsd_dict = rmsd._asdict()

    # calculate MSE
    mse, mse_corr, mse_bias, mse_var = df_metrics.mse(data)
    mse_dict = mse._asdict()
    mse_corr_dict = mse_corr._asdict()
    mse_bias_dict = mse_bias._asdict()
    mse_var_dict = mse_var._asdict()

    # calculate tau
    if self.calc_tau:
        tau, p_tau = df_metrics.kendalltau(data)
        tau_dict = tau._asdict()
        p_tau_dict = p_tau._asdict()
    else:
        tau = p_tau = p_tau_dict = tau_dict = None

    # data_scaled = scale(data, method='mean_std')

    # calculate ubRMSD
    ubRMSD_nT = df_metrics.ubrmsd(data)
    ubRMSD_dict = ubRMSD_nT._asdict()

    # get single dataset metrics
    # calculate SNR
    x = data[self.df_columns[0]].values[subset]
    y = data[self.df_columns[1]].values[subset]
    z = data[self.df_columns[2]].values[subset]

    snr, err, beta = metrics.tcol_snr(x, y, z)

    for i, name in enumerate(self.ds_names):
        dataset['{:}_snr'.format(name)][0] = snr[i]
        dataset['{:}_err_var'.format(name)][0] = err[i]
        dataset['{:}_beta'.format(name)][0] = beta[i]

    for tds_name in self.tds_names:
        R = pearson_R[tds_name]
        p_R = pearson_p[tds_name]
        rho = spea_rho[tds_name]
        p_rho = spea_p[tds_name]
        bias = bias_dict[tds_name]
        mse = mse_dict[tds_name]
        mse_corr = mse_corr_dict[tds_name]
        mse_bias = mse_bias_dict[tds_name]
        mse_var = mse_var_dict[tds_name]
        rmsd = rmsd_dict[tds_name]
        ubRMSD = ubRMSD_dict[tds_name]

        if tau_dict and p_tau_dict:
            tau = tau_dict[tds_name]
            p_tau = p_tau_dict[tds_name]

        split_tds_name = tds_name.split('_and_')
        tds_name_key = "{:}_{:}".format(
            self.ds_names_lut[split_tds_name[0]],
            self.ds_names_lut[split_tds_name[1]])

        dataset['R_between_{:}'.format(tds_name_key)][0] = R
        dataset['p_R_between_{:}'.format(tds_name_key)][0] = p_R
        dataset['rho_between_{:}'.format(tds_name_key)][0] = rho
        dataset['p_rho_between_{:}'.format(tds_name_key)][0] = p_rho
        dataset['bias_between_{:}'.format(tds_name_key)][0] = bias
        dataset['mse_between_{:}'.format(tds_name_key)][0] = mse
        dataset['mse_corr_between_{:}'.format(tds_name_key)][0] = mse_corr
        dataset['mse_bias_between_{:}'.format(tds_name_key)][0] = mse_bias
        dataset['mse_var_between_{:}'.format(tds_name_key)][0] = mse_var
        dataset['rmsd_between_{:}'.format(tds_name_key)][0] = rmsd
        dataset['ubRMSD_between_{:}'.format(tds_name_key)][0] = ubRMSD

        if self.calc_tau:
            dataset['tau_between_{:}'.format(tds_name_key)][0] = tau
            dataset['p_tau_between_{:}'.format(tds_name_key)][0] = p_tau

    return dataset
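# --- Illustrative sketch, not library code: the triple collocation
# call used above, applied to three synthetic series that share a
# common signal and carry independent noise (the TC assumptions).
# Noise levels are made up for demonstration.

import numpy as np

import pytesmo.metrics as metrics

n = 1000
signal = np.random.rand(n)
x = signal + np.random.normal(0, 0.05, n)
y = signal + np.random.normal(0, 0.10, n)
z = signal + np.random.normal(0, 0.15, n)

# one value per input dataset, in input order: SNR, error estimate
# and scaling parameter beta (stored per dataset in the loop above)
snr, err, beta = metrics.tcol_snr(x, y, z)
print(snr, err, beta)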
def calc_metrics(self, data, gpi_info):
    """
    Calculate Triple Collocation metrics.

    Parameters
    ----------
    data : pd.DataFrame
        with >2 columns, the first column is the reference dataset
        named 'ref'; other columns are the data sets to compare
        against, named 'other_i'
    gpi_info : tuple
        of (gpi, lon, lat)

    Notes
    -----
    Kendall tau calculation is optional at the moment because the
    scipy implementation is very slow, which is problematic for
    global comparisons.
    """
    dataset = copy.deepcopy(self.result_template)

    dataset['gpi'][0] = gpi_info[0]
    dataset['lon'][0] = gpi_info[1]
    dataset['lat'][0] = gpi_info[2]

    if self.metadata_template is not None:
        for key, value in self.metadata_template.items():
            dataset[key][0] = gpi_info[3][key]

    # number of observations
    subset = np.ones(len(data), dtype=bool)

    n_obs = subset.sum()
    if n_obs < self.min_obs:
        return dataset
    dataset['n_obs'][0] = n_obs

    # calculate Pearson correlation
    pearson_R, pearson_p = df_metrics.pearsonr(data)
    pearson_R, pearson_p = pearson_R._asdict(), pearson_p._asdict()

    # calculate Spearman correlation
    spea_rho, spea_p = df_metrics.spearmanr(data)
    spea_rho, spea_p = spea_rho._asdict(), spea_p._asdict()

    # calculate bias
    bias_nT = df_metrics.bias(data)
    bias_dict = bias_nT._asdict()

    # calculate RMSD
    rmsd = df_metrics.rmsd(data)
    rmsd_dict = rmsd._asdict()

    # calculate MSE
    mse, mse_corr, mse_bias, mse_var = df_metrics.mse(data)
    mse_dict = mse._asdict()
    mse_corr_dict = mse_corr._asdict()
    mse_bias_dict = mse_bias._asdict()
    mse_var_dict = mse_var._asdict()

    # calculate RSS
    rss = df_metrics.RSS(data)
    rss_dict = rss._asdict()

    # calculate ubRMSD
    # todo: we could use the TC derived scaling parameters here?
    data_scaled = scale(data, method='mean_std')
    ubRMSD_nT = df_metrics.ubrmsd(data_scaled)
    ubRMSD_dict = ubRMSD_nT._asdict()

    # calculate tau
    if self.calc_tau:
        tau, p_tau = df_metrics.kendalltau(data)
        tau_dict, p_tau_dict = tau._asdict(), p_tau._asdict()
    else:
        tau = p_tau = p_tau_dict = tau_dict = None

    # calculate TC metrics
    ref_ind = np.where(np.array(data.columns) == self.ref_name)[0][0]
    snrs, err_stds, betas = df_metrics.tcol_snr(data, ref_ind=ref_ind)
    snr_dict = self._tc_res_dict(snrs)
    err_std_dict = self._tc_res_dict(err_stds)
    beta_dict = self._tc_res_dict(betas)

    # store TC results
    for thds_name in self.thds_names:
        snr = snr_dict[thds_name]
        err_std = err_std_dict[thds_name]
        beta = beta_dict[thds_name]

        split_thds_name = thds_name.split(self.ds_names_split)
        thds_name_key = self.ds_names_split.join([
            self.ds_names_lut[split_thds_name[0]],
            self.ds_names_lut[split_thds_name[1]],
            self.ds_names_lut[split_thds_name[2]]
        ])

        for metr, res in dict(snr=snr, err_std=err_std,
                              beta=beta).items():
            for ds, ds_res in res.items():
                m_ds = "{}_{}".format(metr, self.ds_names_lut[ds])
                n = '{}{}{}'.format(m_ds, self.metric_ds_split,
                                    thds_name_key)
                if n in dataset.keys():
                    dataset[n][0] = ds_res

    # store basic metrics results
    for tds_name in self.tds_names:
        R, p_R = pearson_R[tds_name], pearson_p[tds_name]
        rho, p_rho = spea_rho[tds_name], spea_p[tds_name]
        bias = bias_dict[tds_name]
        mse = mse_dict[tds_name]
        mse_corr = mse_corr_dict[tds_name]
        mse_bias = mse_bias_dict[tds_name]
        mse_var = mse_var_dict[tds_name]
        rmsd = rmsd_dict[tds_name]
        ubRMSD = ubRMSD_dict[tds_name]
        rss = rss_dict[tds_name]

        if tau_dict and p_tau_dict:
            tau = tau_dict[tds_name]
            p_tau = p_tau_dict[tds_name]

        split_tds_name = tds_name.split(self.ds_names_split)
        tds_name_key = self.ds_names_split.join([
            self.ds_names_lut[split_tds_name[0]],
            self.ds_names_lut[split_tds_name[1]]
        ])

        dataset[self.metric_ds_split.join(['R', tds_name_key])][0] = R
        dataset[self.metric_ds_split.join(['p_R', tds_name_key])][0] = p_R
        dataset[self.metric_ds_split.join(['rho', tds_name_key])][0] = rho
        dataset[self.metric_ds_split.join(['p_rho', tds_name_key])][0] = p_rho
        dataset[self.metric_ds_split.join(['BIAS', tds_name_key])][0] = bias
        dataset[self.metric_ds_split.join(['mse', tds_name_key])][0] = mse
        dataset[self.metric_ds_split.join(['mse_corr', tds_name_key])][0] = mse_corr
        dataset[self.metric_ds_split.join(['mse_bias', tds_name_key])][0] = mse_bias
        dataset[self.metric_ds_split.join(['mse_var', tds_name_key])][0] = mse_var
        dataset[self.metric_ds_split.join(['RMSD', tds_name_key])][0] = rmsd
        dataset[self.metric_ds_split.join(['urmsd', tds_name_key])][0] = ubRMSD
        dataset[self.metric_ds_split.join(['RSS', tds_name_key])][0] = rss

        if self.calc_tau:
            dataset[self.metric_ds_split.join(['tau', tds_name_key])][0] = tau
            dataset[self.metric_ds_split.join(['p_tau', tds_name_key])][0] = p_tau

    return dataset
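# --- Illustrative sketch, not library code: how the result keys
# assembled above look for concrete names. The separator values mirror
# the '_between_' / '_and_' convention used by the older implementation
# above; the dataset names are hypothetical.

ds_names_split, metric_ds_split = '_and_', '_between_'

thds_name_key = ds_names_split.join(['ASCAT', 'ISMN', 'GLDAS'])
tc_key = '{}{}{}'.format('snr_ASCAT', metric_ds_split, thds_name_key)
pair_key = metric_ds_split.join(
    ['R', ds_names_split.join(['ASCAT', 'ISMN'])])

print(tc_key)    # snr_ASCAT_between_ASCAT_and_ISMN_and_GLDAS
print(pair_key)  # R_between_ASCAT_and_ISMN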
plt.show()

# calculate correlation coefficients, RMSD, bias, Nash Sutcliffe
x, y = scaled_data[label_ascat].values, scaled_data[label_insitu].values

print("ISMN time series:", ISMN_time_series)
print("compared to")
print(ascat_time_series)
print("Results:")

# df_metrics takes a DataFrame as input and automatically
# calculates the metric on all combinations of columns,
# returning a named tuple for easy printing
print(df_metrics.pearsonr(scaled_data))
print("Spearman's (rho,p_value)", metrics.spearmanr(x, y))
print("Kendall's (tau,p_value)", metrics.kendalltau(x, y))
print(df_metrics.kendalltau(scaled_data))
print(df_metrics.rmsd(scaled_data))
print("Bias", metrics.bias(x, y))
print("Nash Sutcliffe", metrics.nash_sutcliffe(x, y))

i += 1

# only show the first 2 stations, otherwise this program would run
# a long time and produce a lot of plots
if i >= 2:
    break
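# --- Illustrative addition, not part of the example script: pulling
# individual numbers out of the named tuples instead of printing them
# wholesale. Pair fields follow the '<col1>_and_<col2>' convention.
pearson_result = df_metrics.pearsonr(scaled_data)
for pair, r in pearson_result[0]._asdict().items():
    print("%s: R=%.3f" % (pair, r))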
def optimise(params, timespan=('2009-01', '2009-12'), gpi=None,
             rescaling=None):
    """
    Optimises the parameters vegetation water content 'm_veg',
    soil moisture 'm_soil' and, if specified, a third optional
    parameter. The third optional parameter can either be sand
    'sand', clay 'clay', fractional root mean square height 'f_rms',
    stem volume 's_vol' or temperature 'temp'.

    Parameters
    ----------
    params : dict
        Model parameters. At least four of the following parameters
        need to be specified if an optional parameter has been
        selected, otherwise all of them need to be specified:
        'sand', 'clay', 'f_rms', 'temp', 's_vol'
    timespan : tuple of str, optional
        Start and end of the period to optimise over.
    gpi : int, optional
        Grid point index. If specified, it will read data from
        datapool.
    rescaling : str, optional
        Rescaling method applied to the temporally matched data.

    Returns
    -------
    df : pandas.DataFrame
        Optimised soil moisture, vegetation water content and, if
        specified, optional optimised parameter.
    """
    if gpi is None:
        ts_resam = pd.read_csv(
            os.path.join("data", "2011528_2009.csv"),
            index_col=0, parse_dates=True)[timespan[0]:timespan[1]]
        gpi = 2011528
    else:
        ts_resam = read_resam(gpi)[timespan[0]:timespan[1]]

    m_veg_x0 = params.pop('m_veg_x0')
    m_soil_x0 = params.pop('m_soil_x0')

    columns = ['m_veg', 'm_soil']
    x0 = np.array([m_veg_x0, m_soil_x0])

    df = pd.DataFrame(index=ts_resam.index, columns=columns)
    df = df.fillna(np.nan)

    # optimise m_soil and m_veg
    for index, row in ts_resam.iterrows():
        ascat_inc = np.array(row[['incf', 'incm', 'inca']].tolist())
        ascat_sig = \
            db2lin(np.array(row[['sigf', 'sigm', 'siga']].tolist()))

        args = (ascat_inc, ascat_sig, params, '')
        res = minimize(sig_sqr_diff, x0, args=args,
                       method='Nelder-Mead')

        if res['success']:
            # use .loc to avoid chained-assignment issues
            df.loc[index, 'm_veg'] = res['x'][0]
            df.loc[index, 'm_soil'] = res['x'][1]

    str_static_p = \
        ', '.join("%s: %r" % t for t in locals().items())
    str_static_p += ",\nm_veg_x0 = {:.2f}, m_soil_x0 = {:.2f}".format(
        m_veg_x0, m_soil_x0)

    ismn_file = os.path.join(
        'data',
        'ARM_ARM_Larned_sm_0.050000_0.050000_Water-Matric-Potential-Sensor-229L-W_20090101_20140527.stm'
    )
    ismn_data = ismn_readers.read_data(ismn_file)
    insitu = pd.DataFrame(ismn_data.data['soil moisture']).rename(
        columns={'soil moisture': 'insitu'})

    gldas = pd.read_csv(os.path.join('data', 'GLDAS_737602.csv'),
                        parse_dates=True, index_col=0)
    gldas.rename(columns={'086_L1': 'gldas'}, inplace=True)
    gldas = pd.DataFrame(gldas['gldas'])

    ascat = pd.DataFrame(df['m_soil']).rename(
        columns={'m_soil': 'ascat'})

    matched = temp_match.matching(ascat, insitu, gldas)
    if rescaling is not None:
        scaled = scaling.scale(matched, rescaling, reference_index=1)
    else:
        scaled = matched

    metrics = OrderedDict()
    metrics['bias'] = df_metrics.bias(scaled)
    metrics['pearson'] = df_metrics.pearsonr(scaled)
    metrics['kendall'] = df_metrics.kendalltau(scaled)
    metrics['ubrmsd'] = df_metrics.ubrmsd(scaled)
    metrics['var_ratio'] = df_var_ratio(scaled)
    tcol_error = df_metrics.tcol_error(scaled)._asdict()

    ts_title = "Soil moisture. "
    if rescaling is not None:
        ts_title = ' '.join([ts_title, 'Rescaling: %s.' % rescaling])
    else:
        ts_title = ' '.join([ts_title, 'No rescaling.'])
    axes = scaled.plot(subplots=True, title=ts_title, figsize=(18, 8))

    # these are matplotlib.patch.Patch properties
    props = dict(facecolor='white', alpha=0)

    columns = ('ascat-insitu', 'ascat-gldas', 'insitu-gldas')
    row_labels = ['bias', 'pearson R', 'kendall tau',
                  'unbiased RMSD', 'variance ratio']

    cell_text = []
    for metric in metrics:
        metric_values = metrics[metric]
        if type(metric_values) == tuple:
            metric_values = metric_values[0]
        metric_values = metric_values._asdict()
        cell_text.append(["%.2f" % metric_values['ascat_and_insitu'],
                          "%.2f" % metric_values['ascat_and_gldas'],
                          "%.2f" % metric_values['insitu_and_gldas']])

    table = plt.table(cellText=cell_text,
                      colLabels=columns,
                      colWidths=[0.1, 0.1, 0.1],
                      rowLabels=row_labels,
                      loc='bottom',
                      bbox=(0.2, -1.25, 0.5, 0.8))
    tcol_table = plt.table(cellText=[["%.2f" % tcol_error['ascat'],
                                      "%.2f" % tcol_error['gldas'],
                                      "%.2f" % tcol_error['insitu']]],
                           colLabels=('ascat', 'gldas', 'insitu'),
                           colWidths=[0.1, 0.1, 0.1],
                           rowLabels=['Triple collocation error'],
                           loc='bottom',
                           bbox=(0.2, -1.65, 0.5, 0.3))
    plt.subplots_adjust(left=0.08, bottom=0.35)

    axes = scatter_matrix(scaled)
    axes.flat[0].figure.suptitle(ts_title)

    # only draw 1:1 line if scaling was applied
    if rescaling is not None:
        for j, ax in enumerate(axes.flatten()):
            if np.remainder(j + 1, 3 + 1) != 1:
                min_x, max_x = ax.get_xlim()
                min_y, max_y = ax.get_ylim()
                # find minimum lower left coordinate and maximum
                # upper right
                min_ll = min([min_x, min_y])
                max_ur = max([max_x, max_y])
                ax.plot([min_ll, max_ur], [min_ll, max_ur],
                        '--', c='0.6')

    return df
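# --- Illustrative usage sketch with hypothetical parameter values:
# 'm_veg_x0' and 'm_soil_x0' are popped off as Nelder-Mead starting
# values; the remaining keys are the static model parameters listed
# in the docstring. All values below are made up for demonstration.

example_params = {
    'm_veg_x0': 0.5,   # initial vegetation water content guess
    'm_soil_x0': 0.2,  # initial soil moisture guess
    'sand': 0.3,
    'clay': 0.3,
    'f_rms': 0.5,
    'temp': 15.0,
    's_vol': 2.0,
}
result = optimise(example_params, timespan=('2009-01', '2009-12'),
                  rescaling='mean_std')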