def test_scale(method):
    n = 1000
    x = np.arange(n)
    y = np.arange(n) * 0.5
    df = pd.DataFrame({'x': x, 'y': y}, columns=['x', 'y'])
    if method in ["lin_cdf_match", "cdf_match"]:
        with pytest.deprecated_call():
            df_scaled = scaling.scale(df, method=method, reference_index=0)
    else:
        df_scaled = scaling.scale(df, method=method, reference_index=0)
    nptest.assert_almost_equal(df_scaled['x'].values, df_scaled['y'].values)
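# A standalone, assumed sketch of what the deprecation branch above verifies:
# scaling.scale with the deprecated 'lin_cdf_match' method still rescales the
# columns but emits a deprecation warning (outside pytest it can be caught
# with the warnings module).
import warnings

import numpy as np
import numpy.testing as nptest
import pandas as pd

from pytesmo import scaling

df = pd.DataFrame({'x': np.arange(1000), 'y': np.arange(1000) * 0.5},
                  columns=['x', 'y'])

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    df_scaled = scaling.scale(df, method="lin_cdf_match", reference_index=0)

print(any(issubclass(w.category, (DeprecationWarning,
                                  PendingDeprecationWarning))
          for w in caught))
nptest.assert_almost_equal(df_scaled['x'].values, df_scaled['y'].values)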
def plot_alltogether(time_lag, lon, lat, ts1, ts2, scale_ts=False,
                     save_fig=False, *args):
    matched_data = temp_match.matching(ts1, ts2, *args)
    if len(matched_data) == 0:
        print("Empty dataset.")
        return
    if scale_ts:
        matched_data = scaling.scale(matched_data, method="mean_std")
    matched_data.plot(figsize=(15, 5))
    plt.title('SWI and Vegetation indices comparison (rescaled)')
    if save_fig:
        plt.savefig("C:\\Users\\i.pfeil\\Desktop\\TS_plots\\lon_" + str(lon) +
                    "_lat_" + str(lat) + '_' + str(time_lag) + ".png",
                    bbox_inches='tight')
        plt.clf()
    else:
        plt.show()
def calc_rho(ascat_ssm, FP_df, hoal_df):
    # multiply ASCAT with porosity (0.54) to get same units
    ascat_ssm['ssm_ascat'] = ascat_ssm['ssm_ascat'] * 0.54
    matched_data = matching(ascat_ssm, FP_df['Parrot_vwc'],
                            hoal_df['HOAL_sm0.05'])
    matched_data.plot()
    plt.title('Matched data: ASCAT, FP, HOAL')
    plt.show()

    data_together = scale(matched_data)  # , method="mean_std"
    ascat_rho = metrics.spearmanr(data_together['Parrot_vwc'].iloc[:-3],
                                  data_together['ssm_ascat'].iloc[:-3])
    hoal_rho_sm = metrics.spearmanr(data_together['Parrot_vwc'].iloc[:-3],
                                    data_together['HOAL_sm0.05'].iloc[:-3])

    exclude = ['HOAL_ts0.05', 'air_temperature_celsius', 'par_umole_m2s',
               'merge_key']
    # .ix is removed in current pandas; .loc does the same label selection
    data_together.loc[:, data_together.columns.difference(exclude)].plot()
    plt.title('Satellite and in-situ soil moisture, HOAL Petzenkirchen, station 22',
              fontsize=24)
    # +'\n rho_ASCAT_Parrot: '+str(np.round(ascat_rho[0], 3))+
    # ', rho_HOAL_Parrot: '+str(np.round(hoal_rho_sm[0], 3)))
    plt.ylabel('Volumetric Water Content [%]', fontsize=20)
    plt.tick_params(axis='both', which='major', labelsize=18)
    plt.ylim([0, 60])
    plt.show()
def scale(self, data, reference_index, gpi_info):
    """
    Scale all columns in data to the column at the reference_index.

    Parameters
    ----------
    data: pandas.DataFrame
        temporally matched dataset
    reference_index: int
        Which column of the data contains the scaling reference.
    gpi_info: tuple
        tuple of at least (gpi, lon, lat).
        Where gpi has to be the grid point index of the grid of
        this scaler.

    Raises
    ------
    ValueError
        if scaling is not successful
    """
    return scaling.scale(data, method=self.method,
                         reference_index=reference_index)
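# A minimal usage sketch for the wrapper above. The class name, constructor
# and gpi_info tuple below are illustrative assumptions; only the call into
# pytesmo.scaling.scale mirrors the method as shown.
import numpy as np
import pandas as pd

from pytesmo import scaling


class ExampleScaler(object):
    """Illustrative stand-in for a scaler object holding self.method."""

    def __init__(self, method):
        self.method = method

    def scale(self, data, reference_index, gpi_info):
        return scaling.scale(data, method=self.method,
                             reference_index=reference_index)


df = pd.DataFrame({'ref': np.arange(100, dtype=float),
                   'other': np.arange(100, dtype=float) * 0.5 + 10.0})
scaler = ExampleScaler(method='mean_std')
scaled = scaler.scale(df, reference_index=0, gpi_info=(0, 16.0, 48.0))
print(scaled.mean())  # 'other' now shares the mean/std of 'ref'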
def test_scale(method):
    n = 1000
    x = np.arange(n)
    y = np.arange(n) * 0.5
    df = pd.DataFrame({'x': x, 'y': y}, columns=['x', 'y'])
    df_scaled = scaling.scale(df, method=method, reference_index=0)
    nptest.assert_almost_equal(df_scaled['x'].values, df_scaled['y'].values)
def rescale_df(ascat_ssm, FP_df):
    ascat_ssm['ssm_ascat'] = ascat_ssm['ssm_ascat'] * 0.54
    ascat_ssm.plot()
    plt.show()
    matched_data = matching(ascat_ssm, FP_df['Parrot_vwc'])
    matched_data.plot()
    plt.show()
    scaled_data = scale(matched_data, method="mean_std")
    scaled_data.plot()
    plt.title('Satellite and in-situ soil moisture, HOAL Petzenkirchen')
    plt.ylabel('Volumetric Water Content [%]')
    plt.show()
def rescale_df(ascat_ssm, FP_df, hoal_df):
    ascat_ssm['ssm_ascat'] = ascat_ssm['ssm_ascat'] * 0.54
    #ascat_ssm.plot()
    #plt.show()
    matched_data = matching(ascat_ssm, FP_df['Parrot_vwc'], hoal_df)
    matched_data.plot()
    plt.title('Matched data: ASCAT, FP, HOAL')
    plt.show()
    scaled_data = scale(matched_data)  # , method="mean_std"
    scaled_data.plot()
    plt.title('Satellite and in-situ soil moisture, HOAL Petzenkirchen')
    plt.ylabel('Volumetric Water Content [%]')
    plt.ylim([0, 60])
    plt.show()
matched_data = temp_match.matching(ascat_time_series.data,
                                   ISMN_time_series.data,
                                   window=1 / 24.)
# matched ISMN data is now a dataframe with the same datetime index
# as ascat_time_series.data and the nearest insitu observation

# continue only with relevant columns
matched_data = matched_data[[label_ascat, label_insitu]]

# the plot shows that ISMN and ASCAT are observed in different units
matched_data.plot(figsize=(15, 5), secondary_y=[label_ascat],
                  title='temporally merged data')
plt.show()

# this takes the matched_data DataFrame and scales all columns to the
# column with the given reference_index, in this case in situ
scaled_data = scaling.scale(matched_data, method='lin_cdf_match',
                            reference_index=1)

# now the scaled ascat data and insitu_sm are in the same space
scaled_data.plot(figsize=(15, 5), title='scaled data')
plt.show()

plt.scatter(scaled_data[label_ascat].values, scaled_data[label_insitu].values)
plt.xlabel(label_ascat)
plt.ylabel(label_insitu)
plt.show()

# calculate correlation coefficients, RMSD, bias, Nash Sutcliffe
x, y = scaled_data[label_ascat].values, scaled_data[label_insitu].values

print("ISMN time series:", ISMN_time_series)
print("compared to")
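# The closing comment above announces correlation, RMSD, bias and
# Nash-Sutcliffe; a minimal sketch of how those could be computed from the
# x/y arrays already extracted, assuming the scalar functions of
# pytesmo.metrics (spearmanr, pearsonr, rmsd, bias, nash_sutcliffe):
from pytesmo import metrics

print("Spearman rho:", metrics.spearmanr(x, y))
print("Pearson R:", metrics.pearsonr(x, y))
print("RMSD:", metrics.rmsd(x, y))
print("Bias:", metrics.bias(x, y))
print("Nash-Sutcliffe:", metrics.nash_sutcliffe(x, y))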
def calc_metrics(self, data, gpi_info):
    """
    calculates the desired statistics

    Parameters
    ----------
    data : pandas.DataFrame
        with 3 columns, the first column is the reference dataset
        named 'ref',
        the second and third column are the datasets to compare
        against, named 'k1' and 'k2'
    gpi_info : tuple
        Grid point info (i.e. gpi, lon, lat)
    """
    dataset = super(HSAF_Metrics, self).calc_metrics(data, gpi_info)

    for season in self.seasons:

        if season != 'ALL':
            subset = self.month_to_season[data.index.month] == season
        else:
            subset = np.ones(len(data), dtype=bool)

        # number of observations
        n_obs = subset.sum()
        if n_obs < self.min_obs:
            continue
        dataset['{:}_n_obs'.format(season)][0] = n_obs

        # get single dataset metrics
        # calculate SNR
        x = data[self.df_columns[0]].values[subset]
        y = data[self.df_columns[1]].values[subset]
        z = data[self.df_columns[2]].values[subset]

        snr, err, beta = metrics.tcol_snr(x, y, z)

        for i, name in enumerate(self.ds_names):
            dataset['{:}_{:}_snr'.format(name, season)][0] = snr[i]
            dataset['{:}_{:}_err_var'.format(name, season)][0] = err[i]
            dataset['{:}_{:}_beta'.format(name, season)][0] = beta[i]

        # calculate Pearson correlation
        pearson_R, pearson_p = df_metrics.pearsonr(data)
        pearson_R = pearson_R._asdict()
        pearson_p = pearson_p._asdict()

        # calculate Spearman correlation
        spea_rho, spea_p = df_metrics.spearmanr(data)
        spea_rho = spea_rho._asdict()
        spea_p = spea_p._asdict()

        # scale data to reference in order to calculate absolute metrics
        data_scaled = scale(data, method='min_max')

        # calculate bias
        bias_nT = df_metrics.bias(data_scaled)
        bias_dict = bias_nT._asdict()

        # calculate ubRMSD
        ubRMSD_nT = df_metrics.ubrmsd(data_scaled)
        ubRMSD_dict = ubRMSD_nT._asdict()

        for tds_name in self.tds_names:
            R = pearson_R[tds_name]
            p_R = pearson_p[tds_name]
            rho = spea_rho[tds_name]
            p_rho = spea_p[tds_name]
            bias = bias_dict[tds_name]
            ubRMSD = ubRMSD_dict[tds_name]

            split_tds_name = tds_name.split('_and_')
            tds_name_key = "{:}_{:}".format(
                self.ds_names_lut[split_tds_name[0]],
                self.ds_names_lut[split_tds_name[1]])

            dataset['{:}_{:}_R'.format(tds_name_key, season)][0] = R
            dataset['{:}_{:}_p_R'.format(tds_name_key, season)][0] = p_R
            dataset['{:}_{:}_rho'.format(tds_name_key, season)][0] = rho
            dataset['{:}_{:}_p_rho'.format(tds_name_key, season)][0] = \
                p_rho
            dataset['{:}_{:}_bias'.format(tds_name_key, season)][0] = bias
            dataset['{:}_{:}_ubrmsd'.format(tds_name_key, season)][0] = \
                ubRMSD

    return dataset
def validate(params, timespan=('2009-01', '2009-12'), gpi=None,
             rescaling=None, y_axis_range=None):
    """
    This function optimises the parameters vegetation water content
    'm_veg', soil moisture 'm_soil' and, if specified, a third optional
    parameter. The third optional parameter can either be sand 'sand',
    clay 'clay', fractional root mean square height 'f_rms', stem volume
    's_vol' or temperature 'temp'.

    Parameters
    ----------
    params : list of dicts
        Model parameters. At least four of the following parameters need
        to be specified if an optional parameter has been selected,
        otherwise all of them need to be specified:
        'sand', 'clay', 'f_rms', 'temp', 's_vol'
    timespan : tuple, optional
        timespan to analyze
    gpi : int, optional
        Grid point index. If specified, it will read data from datapool.
    rescaling : string, optional
        rescaling method, one of 'min_max', 'linreg', 'mean_std' and
        'lin_cdf_match'. Default: None.
        insitu is the reference to which is scaled
    y_axis_range : tuple, optional
        specify (min, max) of y axis

    Returns
    -------
    df : pandas.DataFrame
        Optimised soil moisture, vegetation water content and, if
        specified, optional optimised parameter.
    """
    unit_dict = {'freq': 'GHz',
                 'sand': '',
                 'clay': '',
                 'temp': '$^\circ$C',
                 'eps': '',
                 'theta': '$^\circ$',
                 'f_rms': '',
                 'sig_bare': 'dB',
                 'm_soil': '%',
                 'm_veg': '%',
                 'm_soil_x0': '%',
                 'm_veg_x0': '%',
                 's_vol': '$m^3ha^{-1}$',
                 'sig_canopy': 'dB',
                 'sig_for': 'dB',
                 'sig_floor': 'dB',
                 'polarization': ''}

    param_should = ['sand', 'clay', 'temp', 's_vol', 'f_rms',
                    'm_veg_x0', 'm_soil_x0']
    for param in param_should:
        assert param in params.keys()

    if gpi is None:
        ts_resam = pd.read_csv(
            os.path.join(os.path.split(os.path.abspath(__file__))[0],
                         'data', '2011528_2009.csv'),
            index_col=0, parse_dates=True)[timespan[0]:timespan[1]]
        gpi = 2011528
    else:
        ts_resam = read_resam(gpi)[timespan[0]:timespan[1]]

    m_veg_x0 = params.pop('m_veg_x0')
    m_soil_x0 = params.pop('m_soil_x0')

    columns = ['m_veg', 'm_soil']
    x0 = np.array([m_veg_x0, m_soil_x0])

    df = pd.DataFrame(index=ts_resam.index, columns=columns)
    df = df.fillna(np.nan)

    # optimise m_soil and m_veg
    for index, row in ts_resam.iterrows():
        ascat_inc = np.array(row[['incf', 'incm', 'inca']].tolist())
        ascat_sig = \
            db2lin(np.array(row[['sigf', 'sigm', 'siga']].tolist()))

        args = (ascat_inc, ascat_sig, params, '')
        res = minimize(sig_sqr_diff, x0, args=args, method='Nelder-Mead')

        if res['success'] == True:
            df['m_veg'][index] = res['x'][0]
            df['m_soil'][index] = res['x'][1]

    str_static_p = \
        ', '.join("%s: %r" % t for t in locals().iteritems())
    str_static_p += ",\nm_veg_x0 = {:.2f}, m_soil_x0 = {:.2f}".format(
        m_veg_x0, m_soil_x0)

    ismn_file = os.path.join(
        os.path.split(os.path.abspath(__file__))[0], 'data',
        'ARM_ARM_Larned_sm_0.050000_0.050000_Water-Matric-Potential-Sensor-229L-W_20090101_20140527.stm')
    ismn_data = ismn_readers.read_data(ismn_file)
    insitu = pd.DataFrame(ismn_data.data['soil moisture']).rename(
        columns={'soil moisture': 'insitu'})

    gldas = pd.read_csv(
        os.path.join(os.path.split(os.path.abspath(__file__))[0], 'data',
                     'GLDAS_737602.csv'),
        parse_dates=True, index_col=0)
    gldas.rename(columns={'086_L1': 'gldas'}, inplace=True)
    gldas = pd.DataFrame(gldas['gldas']) / 100.0

    ascat = pd.DataFrame(df['m_soil']).rename(columns={'m_soil': 'ascat'})

    matched = temp_match.matching(ascat, insitu, gldas)
    if rescaling is not None:
        scaled = scaling.scale(matched, rescaling, reference_index=1)
    else:
        scaled = matched

    metrics = OrderedDict()
    metrics['bias'] = df_metrics.bias(scaled)
    metrics['pearson'] = df_metrics.pearsonr(scaled)
    metrics['spearman'] = df_metrics.spearmanr(scaled)
    metrics['ubrmsd'] = df_metrics.rmsd(scaled)
    metrics['std_ratio'] = df_std_ratio(scaled)
    tcol_error = df_metrics.tcol_error(scaled)._asdict()

    ts_title = "Soil moisture. "
    if rescaling is not None:
        ts_title = ' '.join([ts_title, 'Rescaling: %s.' % rescaling])
        rmsd_title = 'unbiased RMSD'
    else:
        ts_title = ' '.join([ts_title, 'No rescaling.'])
        rmsd_title = 'RMSD'

    axes = scaled.plot(title=ts_title, figsize=(18, 8))
    plt.legend()

    # these are matplotlib.patch.Patch properties
    props = dict(facecolor='white', alpha=0)

    columns = ('ascat-insitu', 'ascat-gldas', 'insitu-gldas')
    row_labels = ['bias', 'pearson R', 'spearman rho', rmsd_title,
                  'stddev ratio']

    cell_text = []
    for metric in metrics:
        metric_values = metrics[metric]
        if type(metric_values) == tuple:
            metric_values = metric_values[0]
        metric_values = metric_values._asdict()
        cell_text.append(["%.2f" % metric_values['ascat_and_insitu'],
                          "%.2f" % metric_values['ascat_and_gldas'],
                          "%.2f" % metric_values['insitu_and_gldas']])

    table = plt.table(cellText=cell_text,
                      colLabels=columns,
                      colWidths=[0.1, 0.1, 0.1],
                      rowLabels=row_labels,
                      loc='bottom',
                      bbox=(0.2, -0.5, 0.5, 0.3))
    tcol_table = plt.table(cellText=[["%.2f" % tcol_error['ascat'],
                                      "%.2f" % tcol_error['gldas'],
                                      "%.2f" % tcol_error['insitu']]],
                           colLabels=('ascat ', 'gldas ', 'insitu '),
                           colWidths=[0.1, 0.1, 0.1],
                           rowLabels=['Triple collocation error'],
                           loc='bottom',
                           bbox=(0.2, -0.6, 0.5, 0.1))
    plt.subplots_adjust(left=0.08, bottom=0.35, right=0.85)
    plt.draw()

    if y_axis_range is not None:
        axes.set_ylim(y_axis_range)

    params['m_veg_x0'] = m_veg_x0
    params['m_soil_x0'] = m_soil_x0

    infotext = []
    for label in sorted(param_should):
        infotext.append('%s = %s %s' % (label, params[label],
                                        unit_dict[label]))
    infotext = '\n'.join(infotext)

    # place a text box in upper left in axes coords
    axes.text(1.03, 1, infotext, transform=axes.transAxes, fontsize=12,
              verticalalignment='top', bbox=props)

    axes = scatter_matrix(scaled)
    axes.flat[0].figure.suptitle(ts_title)

    # only draw 1:1 line if scaling was applied
    for j, ax in enumerate(axes.flatten()):
        if y_axis_range is not None:
            ax.set_xlim(y_axis_range)
        if np.remainder(j + 1, 3 + 1) != 1:
            if y_axis_range is not None:
                ax.set_ylim(y_axis_range)
            min_x, max_x = ax.get_xlim()
            min_y, max_y = ax.get_ylim()
            # find minimum lower left coordinate and maximum upper right
            min_ll = min([min_x, min_y])
            max_ur = max([max_x, max_y])
            ax.plot([min_ll, max_ur], [min_ll, max_ur], '--', c='0.6')
def perform_validation(self, df_dict, gpi_info):
    """
    Perform the validation for one grid point index and return the
    matched datasets as well as the calculated metrics.

    Parameters
    ----------
    df_dict: dict of pandas.DataFrames
        DataFrames read by the data readers for each dataset
    gpi_info: tuple
        tuple of at least (gpi, lon, lat)

    Returns
    -------
    matched_n: dict of pandas.DataFrames
        temporally matched data stored by (n, k) tuples
    results: dict
        Dictionary of calculated metrics stored by dataset combination
        tuples.
    used_data: dict
        The DataFrame used for calculation of each set of metrics.
    """
    results = {}
    used_data = {}
    matched_n = {}

    if self.masking_dm is not None:
        ref_df = df_dict[self.temporal_ref]
        masked_ref_df = self.mask_dataset(ref_df, gpi_info)
        if len(masked_ref_df) == 0:
            return matched_n, results, used_data

        df_dict[self.temporal_ref] = masked_ref_df

    matched_n = self.temporal_match_datasets(df_dict)

    for n, k in self.metrics_c:
        n_matched_data = matched_n[(n, k)]
        if len(n_matched_data) == 0:
            continue
        result_names = get_result_names(self.data_manager.ds_dict,
                                        self.temporal_ref, n=k)
        for data, result_key in self.k_datasets_from(n_matched_data,
                                                     result_names):

            if len(data) == 0:
                continue

            # at this stage we can drop the column multiindex and just use
            # the dataset name
            data.columns = data.columns.droplevel(level=1)

            # Rename the columns to 'ref', 'k1', 'k2', ...
            rename_dict = {}
            f = lambda x: "k{}".format(x) if x > 0 else 'ref'
            for i, r in enumerate(result_key):
                rename_dict[r[0]] = f(i)
            data.rename(columns=rename_dict, inplace=True)

            if self.scaling is not None:
                # get scaling index by finding the column in the
                # DataFrame that belongs to the scaling reference
                scaling_index = data.columns.tolist().index(
                    rename_dict[self.scaling_ref])
                try:
                    data = scaling.scale(data,
                                         method=self.scaling,
                                         reference_index=scaling_index)
                except ValueError:
                    continue

            if result_key not in results.keys():
                results[result_key] = []

            metrics_calculator = self.metrics_c[(n, k)]
            used_data[result_key] = data
            metrics = metrics_calculator(data, gpi_info)
            results[result_key].append(metrics)

    return matched_n, results, used_data
def calc(self, job):
    """
    Takes either a cell or a gpi_info tuple and performs the validation.

    Parameters
    ----------
    job : object
        Job of type that self.get_processing_jobs() returns.

    Returns
    -------
    compact_results : dict of dicts
        Keys: result names, combinations of
              (referenceDataset.column, otherDataset.column)
        Values: dict containing the elements returned by
                metrics_calculator
    """
    result_names = self.data_manager.get_results_names()
    results = {}

    if self.cell_based_jobs:
        process_gpis, process_lons, process_lats = self.data_manager.\
            reference_grid.grid_points_for_cell(job)
    else:
        process_gpis, process_lons, process_lats = \
            [job[0]], [job[1]], [job[2]]

    for gpi_info in zip(process_gpis, process_lons, process_lats):
        # if processing is cell based gpi_metainfo is limited to gpi, lon,
        # lat at the moment
        if self.cell_based_jobs:
            gpi_meta = gpi_info
        else:
            gpi_meta = job

        ref_dataframe = self.data_manager.read_reference(gpi_info[0])
        # if no reference data available continue with the next gpi
        if ref_dataframe is None:
            continue

        other_dataframes = {}
        for other_name in self.data_manager.other_name:
            grids_compatible = self.data_manager.datasets[
                other_name]['grids_compatible']
            if grids_compatible:
                other_dataframe = self.data_manager.read_other(
                    other_name, gpi_info[0])
            elif self.luts[other_name] is not None:
                other_gpi = self.luts[other_name][gpi_info[0]]
                if other_gpi == -1:
                    continue
                other_dataframe = self.data_manager.read_other(
                    other_name, other_gpi)
            else:
                other_dataframe = self.data_manager.read_other(
                    other_name, gpi_info[1], gpi_info[2])

            if other_dataframe is not None:
                other_dataframes[other_name] = other_dataframe

        # if no other data available continue with the next gpi
        if len(other_dataframes) == 0:
            continue

        joined_data = {}
        for other in other_dataframes.keys():
            joined = self.temp_matching(ref_dataframe,
                                        other_dataframes[other])
            if len(joined) != 0:
                joined_data[other] = joined

        if len(joined_data) == 0:
            continue

        # compute results for each combination of (ref, other) columns
        rescaled_data = {}
        for result in result_names:
            ref_col = result[0].split('.')[1]
            other_col = result[1].split('.')[1]
            other_name = result[1].split('.')[0]

            try:
                data = joined_data[other_name][
                    [ref_col, other_col]].dropna()
            except KeyError:
                continue

            data.rename(columns={ref_col: 'ref', other_col: 'other'},
                        inplace=True)

            if len(data) == 0:
                continue

            if self.scaling is not None:
                try:
                    data = scaling.scale(
                        data, method=self.scaling,
                        reference_index=self.scale_to_index)
                    rescaled_data[other_name] = data
                except ValueError:
                    continue

            if result not in results.keys():
                results[result] = []

            results[result].append(self.calc_metrics(data, gpi_meta))

    compact_results = {}
    for key in results.keys():
        compact_results[key] = {}
        for field_name in results[key][0].keys():
            entries = []
            for result in results[key]:
                entries.append(result[field_name][0])
            compact_results[key][field_name] = \
                np.array(entries, dtype=results[key][0][field_name].dtype)

    if self.data_postproc is not None:
        self.data_postproc(compact_results, rescaled_data)

    return compact_results
def calc_metrics(self, data, gpi_info):
    """
    Calculate Triple Collocation metrics

    Parameters
    ----------
    data : pd.DataFrame
        with >2 columns, the first column is the reference dataset
        named 'ref',
        other columns are the data sets to compare against named 'other_i'
    gpi_info : tuple
        of (gpi, lon, lat)

    Notes
    -----
    Kendall tau calculation is optional at the moment
    because the scipy implementation is very slow, which is problematic
    for global comparisons
    """
    dataset = copy.deepcopy(self.result_template)

    dataset['gpi'][0] = gpi_info[0]
    dataset['lon'][0] = gpi_info[1]
    dataset['lat'][0] = gpi_info[2]

    if self.metadata_template is not None:
        for key, value in self.metadata_template.items():
            dataset[key][0] = gpi_info[3][key]

    # number of observations
    subset = np.ones(len(data), dtype=bool)

    n_obs = subset.sum()
    if n_obs < self.min_obs:
        return dataset

    dataset['n_obs'][0] = n_obs

    # calculate Pearson correlation
    pearson_R, pearson_p = df_metrics.pearsonr(data)
    pearson_R, pearson_p = pearson_R._asdict(), pearson_p._asdict()
    # calculate Spearman correlation
    spea_rho, spea_p = df_metrics.spearmanr(data)
    spea_rho, spea_p = spea_rho._asdict(), spea_p._asdict()
    # calculate bias
    bias_nT = df_metrics.bias(data)
    bias_dict = bias_nT._asdict()
    # calculate RMSD
    rmsd = df_metrics.rmsd(data)
    rmsd_dict = rmsd._asdict()
    # calculate MSE
    mse, mse_corr, mse_bias, mse_var = df_metrics.mse(data)
    mse_dict = mse._asdict()
    mse_corr_dict = mse_corr._asdict()
    mse_bias_dict = mse_bias._asdict()
    mse_var_dict = mse_var._asdict()
    # calculate RSS
    rss = df_metrics.RSS(data)
    rss_dict = rss._asdict()
    # calculate ubRMSD
    # todo: we could use the TC derived scaling parameters here?
    data_scaled = scale(data, method='mean_std')
    ubRMSD_nT = df_metrics.ubrmsd(data_scaled)
    ubRMSD_dict = ubRMSD_nT._asdict()
    # calculate tau
    if self.calc_tau:
        tau, p_tau = df_metrics.kendalltau(data)
        tau_dict, p_tau_dict = tau._asdict(), p_tau._asdict()
    else:
        tau = p_tau = p_tau_dict = tau_dict = None

    # calculate TC metrics
    ref_ind = np.where(np.array(data.columns) == self.ref_name)[0][0]
    snrs, err_stds, betas = df_metrics.tcol_snr(data, ref_ind=ref_ind)
    snr_dict = self._tc_res_dict(snrs)
    err_std_dict = self._tc_res_dict(err_stds)
    beta_dict = self._tc_res_dict(betas)

    # store TC results
    for thds_name in self.thds_names:
        snr = snr_dict[thds_name]
        err_std = err_std_dict[thds_name]
        beta = beta_dict[thds_name]

        split_thds_name = thds_name.split(self.ds_names_split)
        thds_name_key = self.ds_names_split.join(
            [self.ds_names_lut[split_thds_name[0]],
             self.ds_names_lut[split_thds_name[1]],
             self.ds_names_lut[split_thds_name[2]]])

        for metr, res in dict(snr=snr, err_std=err_std,
                              beta=beta).items():
            for ds, ds_res in res.items():
                m_ds = "{}_{}".format(metr, self.ds_names_lut[ds])
                n = '{}{}{}'.format(m_ds, self.metric_ds_split,
                                    thds_name_key)
                if n in dataset.keys():
                    dataset[n][0] = ds_res

    # Store basic metrics results
    for tds_name in self.tds_names:
        R, p_R = pearson_R[tds_name], pearson_p[tds_name]
        rho, p_rho = spea_rho[tds_name], spea_p[tds_name]
        bias = bias_dict[tds_name]
        mse = mse_dict[tds_name]
        mse_corr = mse_corr_dict[tds_name]
        mse_bias = mse_bias_dict[tds_name]
        mse_var = mse_var_dict[tds_name]
        rmsd = rmsd_dict[tds_name]
        ubRMSD = ubRMSD_dict[tds_name]
        rss = rss_dict[tds_name]

        if tau_dict and p_tau_dict:
            tau = tau_dict[tds_name]
            p_tau = p_tau_dict[tds_name]

        split_tds_name = tds_name.split(self.ds_names_split)
        tds_name_key = self.ds_names_split.join(
            [self.ds_names_lut[split_tds_name[0]],
             self.ds_names_lut[split_tds_name[1]]])

        dataset[self.metric_ds_split.join(['R', tds_name_key])][0] = R
        dataset[self.metric_ds_split.join(['p_R', tds_name_key])][0] = p_R
        dataset[self.metric_ds_split.join(['rho', tds_name_key])][0] = rho
        dataset[self.metric_ds_split.join(
            ['p_rho', tds_name_key])][0] = p_rho
        dataset[self.metric_ds_split.join(
            ['BIAS', tds_name_key])][0] = bias
        dataset[self.metric_ds_split.join(['mse', tds_name_key])][0] = mse
        dataset[self.metric_ds_split.join(
            ['mse_corr', tds_name_key])][0] = mse_corr
        dataset[self.metric_ds_split.join(
            ['mse_bias', tds_name_key])][0] = mse_bias
        dataset[self.metric_ds_split.join(
            ['mse_var', tds_name_key])][0] = mse_var
        dataset[self.metric_ds_split.join(
            ['RMSD', tds_name_key])][0] = rmsd
        dataset[self.metric_ds_split.join(
            ['urmsd', tds_name_key])][0] = ubRMSD
        dataset[self.metric_ds_split.join(['RSS', tds_name_key])][0] = rss
        if self.calc_tau:
            dataset[self.metric_ds_split.join(
                ['tau', tds_name_key])][0] = tau
            dataset[self.metric_ds_split.join(
                ['p_tau', tds_name_key])][0] = p_tau

    return dataset
def optimise(params, timespan=('2009-01', '2009-12'), gpi=None,
             rescaling=None):
    """
    This function optimises the parameters vegetation water content
    'm_veg', soil moisture 'm_soil' and, if specified, a third optional
    parameter. The third optional parameter can either be sand 'sand',
    clay 'clay', fractional root mean square height 'f_rms', stem volume
    's_vol' or temperature 'temp'.

    Parameters
    ----------
    params : list of dicts
        Model parameters. At least four of the following parameters need
        to be specified if an optional parameter has been selected,
        otherwise all of them need to be specified:
        'sand', 'clay', 'f_rms', 'temp', 's_vol'
    gpi : int, optional
        Grid point index. If specified, it will read data from datapool.

    Returns
    -------
    df : pandas.DataFrame
        Optimised soil moisture, vegetation water content and, if
        specified, optional optimised parameter.
    """
    if gpi is None:
        ts_resam = pd.read_csv(os.path.join("data", "2011528_2009.csv"),
                               index_col=0,
                               parse_dates=True)[timespan[0]:timespan[1]]
        gpi = 2011528
    else:
        ts_resam = read_resam(gpi)[timespan[0]:timespan[1]]

    m_veg_x0 = params.pop('m_veg_x0')
    m_soil_x0 = params.pop('m_soil_x0')

    columns = ['m_veg', 'm_soil']
    x0 = np.array([m_veg_x0, m_soil_x0])

    df = pd.DataFrame(index=ts_resam.index, columns=columns)
    df = df.fillna(np.nan)

    # optimise m_soil and m_veg
    for index, row in ts_resam.iterrows():
        ascat_inc = np.array(row[['incf', 'incm', 'inca']].tolist())
        ascat_sig = \
            db2lin(np.array(row[['sigf', 'sigm', 'siga']].tolist()))

        args = (ascat_inc, ascat_sig, params, '')
        res = minimize(sig_sqr_diff, x0, args=args, method='Nelder-Mead')

        if res['success'] == True:
            df['m_veg'][index] = res['x'][0]
            df['m_soil'][index] = res['x'][1]

    str_static_p = \
        ', '.join("%s: %r" % t for t in locals().iteritems())
    str_static_p += ",\nm_veg_x0 = {:.2f}, m_soil_x0 = {:.2f}".format(
        m_veg_x0, m_soil_x0)

    ismn_file = os.path.join(
        'data',
        'ARM_ARM_Larned_sm_0.050000_0.050000_Water-Matric-Potential-Sensor-229L-W_20090101_20140527.stm')
    ismn_data = ismn_readers.read_data(ismn_file)
    insitu = pd.DataFrame(ismn_data.data['soil moisture']).rename(
        columns={'soil moisture': 'insitu'})

    gldas = pd.read_csv(os.path.join('data', 'GLDAS_737602.csv'),
                        parse_dates=True, index_col=0)
    gldas.rename(columns={'086_L1': 'gldas'}, inplace=True)
    gldas = pd.DataFrame(gldas['gldas'])

    ascat = pd.DataFrame(df['m_soil']).rename(columns={'m_soil': 'ascat'})

    matched = temp_match.matching(ascat, insitu, gldas)
    if rescaling is not None:
        scaled = scaling.scale(matched, rescaling, reference_index=1)
    else:
        scaled = matched

    metrics = OrderedDict()
    metrics['bias'] = df_metrics.bias(scaled)
    metrics['pearson'] = df_metrics.pearsonr(scaled)
    metrics['kendall'] = df_metrics.kendalltau(scaled)
    metrics['ubrmsd'] = df_metrics.ubrmsd(scaled)
    metrics['var_ratio'] = df_var_ratio(scaled)
    tcol_error = df_metrics.tcol_error(scaled)._asdict()

    ts_title = "Soil moisture. "
    if rescaling is not None:
        ts_title = ' '.join([ts_title, 'Rescaling: %s.' % rescaling])
    else:
        ts_title = ' '.join([ts_title, 'No rescaling.'])

    axes = scaled.plot(subplots=True, title=ts_title, figsize=(18, 8))

    # these are matplotlib.patch.Patch properties
    props = dict(facecolor='white', alpha=0)

    columns = ('ascat-insitu', 'ascat-gldas', 'insitu-gldas')
    row_labels = ['bias', 'pearson R', 'kendall tau', 'unbiased RMSD',
                  'variance ratio']

    cell_text = []
    for metric in metrics:
        metric_values = metrics[metric]
        if type(metric_values) == tuple:
            metric_values = metric_values[0]
        metric_values = metric_values._asdict()
        cell_text.append(["%.2f" % metric_values['ascat_and_insitu'],
                          "%.2f" % metric_values['ascat_and_gldas'],
                          "%.2f" % metric_values['insitu_and_gldas']])

    table = plt.table(cellText=cell_text,
                      colLabels=columns,
                      colWidths=[0.1, 0.1, 0.1],
                      rowLabels=row_labels,
                      loc='bottom',
                      bbox=(0.2, -1.25, 0.5, 0.8))
    tcol_table = plt.table(cellText=[["%.2f" % tcol_error['ascat'],
                                      "%.2f" % tcol_error['gldas'],
                                      "%.2f" % tcol_error['insitu']]],
                           colLabels=('ascat', 'gldas', 'insitu'),
                           colWidths=[0.1, 0.1, 0.1],
                           rowLabels=['Triple collocation error'],
                           loc='bottom',
                           bbox=(0.2, -1.65, 0.5, 0.3))
    plt.subplots_adjust(left=0.08, bottom=0.35)

    axes = scatter_matrix(scaled)
    axes.flat[0].figure.suptitle(ts_title)

    # only draw 1:1 line if scaling was applied
    if rescaling is not None:
        for j, ax in enumerate(axes.flatten()):
            if np.remainder(j + 1, 3 + 1) != 1:
                min_x, max_x = ax.get_xlim()
                min_y, max_y = ax.get_ylim()
                # find minimum lower left coordinate and maximum upper right
                min_ll = min([min_x, min_y])
                max_ur = max([max_x, max_y])
                ax.plot([min_ll, max_ur], [min_ll, max_ur], '--', c='0.6')

    return df
def calc_metrics(self, data, gpi_info):
    """
    calculates the desired statistics

    Parameters
    ----------
    data : pd.DataFrame
        with >2 columns, the first column is the reference dataset
        named 'ref',
        other columns are the datasets to compare against named 'other_i'
    gpi_info : tuple
        of (gpi, lon, lat)

    Notes
    -----
    Kendall tau calculation is optional at the moment
    because the scipy implementation is very slow, which is problematic
    for global comparisons
    """
    dataset = super(IntercomparisonMetrics, self).calc_metrics(
        data, gpi_info)

    subset = np.ones(len(data), dtype=bool)

    n_obs = subset.sum()
    if n_obs < self.min_obs:
        return dataset

    dataset['n_obs'][0] = n_obs

    # calculate Pearson correlation
    pearson_R, pearson_p = df_metrics.pearsonr(data)
    pearson_R, pearson_p = pearson_R._asdict(), pearson_p._asdict()
    # calculate Spearman correlation
    spea_rho, spea_p = df_metrics.spearmanr(data)
    spea_rho, spea_p = spea_rho._asdict(), spea_p._asdict()
    # calculate bias
    bias_nT = df_metrics.bias(data)
    bias_dict = bias_nT._asdict()
    # calculate RMSD
    rmsd = df_metrics.rmsd(data)
    rmsd_dict = rmsd._asdict()
    # calculate MSE
    mse, mse_corr, mse_bias, mse_var = df_metrics.mse(data)
    mse_dict, mse_corr_dict, mse_bias_dict, mse_var_dict = \
        mse._asdict(), mse_corr._asdict(), mse_bias._asdict(), \
        mse_var._asdict()
    # calculate RSS
    rss = df_metrics.RSS(data)
    rss_dict = rss._asdict()
    # calculate tau
    if self.calc_tau:
        tau, p_tau = df_metrics.kendalltau(data)
        tau_dict, p_tau_dict = tau._asdict(), p_tau._asdict()
    else:
        tau = p_tau = p_tau_dict = tau_dict = None

    # No extra scaling is performed here.
    # always scale for ubRMSD with mean std
    # calculate ubRMSD
    data_scaled = scale(data, method='mean_std')
    ubRMSD_nT = df_metrics.ubrmsd(data_scaled)
    ubRMSD_dict = ubRMSD_nT._asdict()

    for tds_name in self.tds_names:
        R, p_R = pearson_R[tds_name], pearson_p[tds_name]
        rho, p_rho = spea_rho[tds_name], spea_p[tds_name]
        bias = bias_dict[tds_name]
        mse = mse_dict[tds_name]
        mse_corr = mse_corr_dict[tds_name]
        mse_bias = mse_bias_dict[tds_name]
        mse_var = mse_var_dict[tds_name]
        rmsd = rmsd_dict[tds_name]
        ubRMSD = ubRMSD_dict[tds_name]
        rss = rss_dict[tds_name]

        if tau_dict and p_tau_dict:
            tau = tau_dict[tds_name]
            p_tau = p_tau_dict[tds_name]

        split_tds_name = tds_name.split(self.ds_names_split)
        tds_name_key = self.ds_names_split.join(
            [self.ds_names_lut[split_tds_name[0]],
             self.ds_names_lut[split_tds_name[1]]])

        dataset[self.metric_ds_split.join(['R', tds_name_key])][0] = R
        dataset[self.metric_ds_split.join(['p_R', tds_name_key])][0] = p_R
        dataset[self.metric_ds_split.join(['rho', tds_name_key])][0] = rho
        dataset[self.metric_ds_split.join(
            ['p_rho', tds_name_key])][0] = p_rho
        dataset[self.metric_ds_split.join(
            ['BIAS', tds_name_key])][0] = bias
        dataset[self.metric_ds_split.join(['mse', tds_name_key])][0] = mse
        dataset[self.metric_ds_split.join(
            ['mse_corr', tds_name_key])][0] = mse_corr
        dataset[self.metric_ds_split.join(
            ['mse_bias', tds_name_key])][0] = mse_bias
        dataset[self.metric_ds_split.join(
            ['mse_var', tds_name_key])][0] = mse_var
        dataset[self.metric_ds_split.join(
            ['RMSD', tds_name_key])][0] = rmsd
        dataset[self.metric_ds_split.join(
            ['urmsd', tds_name_key])][0] = ubRMSD
        dataset[self.metric_ds_split.join(['RSS', tds_name_key])][0] = rss
        if self.calc_tau:
            dataset[self.metric_ds_split.join(
                ['tau', tds_name_key])][0] = tau
            dataset[self.metric_ds_split.join(
                ['p_tau', tds_name_key])][0] = p_tau

    return dataset
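# The method above always rescales with 'mean_std' before computing ubRMSD.
# After mean/std matching the mean difference between the columns is zero, so
# the unbiased RMSD of the scaled pair coincides with its plain RMSD. A
# minimal sketch of that relation, assuming pytesmo.metrics.rmsd/ubrmsd with
# an (x, y) array signature; the data below are synthetic.
import numpy as np
import pandas as pd

from pytesmo import metrics, scaling

rng = np.random.default_rng(0)
x = rng.normal(0.25, 0.05, 500)                    # reference series
y = 0.6 * x + 0.10 + rng.normal(0.0, 0.02, 500)    # biased, damped copy

scaled = scaling.scale(pd.DataFrame({'ref': x, 'other': y}),
                       method='mean_std', reference_index=0)

# identical up to floating point noise once the bias is removed
print(metrics.rmsd(scaled['ref'].values, scaled['other'].values))
print(metrics.ubrmsd(scaled['ref'].values, scaled['other'].values))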
def calc_metrics(self, data, gpi_info):
    """
    calculates the desired statistics

    Parameters
    ----------
    data : pandas.DataFrame
        with 3 columns, the first column is the reference dataset
        named 'ref',
        the second and third column are the datasets to compare
        against, named 'k1' and 'k2'
    gpi_info : tuple
        Grid point info (i.e. gpi, lon, lat)
    """
    dataset = copy.deepcopy(self.result_template)

    dataset['gpi'][0] = gpi_info[0]
    dataset['lon'][0] = gpi_info[1]
    dataset['lat'][0] = gpi_info[2]

    for season in self.seasons:

        if season != 'ALL':
            subset = self.month_to_season[data.index.month] == season
        else:
            subset = np.ones(len(data), dtype=bool)

        # number of observations
        n_obs = subset.sum()
        if n_obs < 10:
            continue
        dataset['{:}_n_obs'.format(season)][0] = n_obs

        # get single dataset metrics
        # calculate SNR
        x = data[self.df_columns[0]].values[subset]
        y = data[self.df_columns[1]].values[subset]
        z = data[self.df_columns[2]].values[subset]

        snr, err, beta = metrics.tcol_snr(x, y, z)

        for i, name in enumerate(self.ds_names):
            dataset['{:}_{:}_snr'.format(name, season)][0] = snr[i]
            dataset['{:}_{:}_err_var'.format(name, season)][0] = err[i]
            dataset['{:}_{:}_beta'.format(name, season)][0] = beta[i]

        # calculate Pearson correlation
        pearson_R, pearson_p = df_metrics.pearsonr(data)
        pearson_R = pearson_R._asdict()
        pearson_p = pearson_p._asdict()

        # calculate Spearman correlation
        spea_rho, spea_p = df_metrics.spearmanr(data)
        spea_rho = spea_rho._asdict()
        spea_p = spea_p._asdict()

        # scale data to reference in order to calculate absolute metrics
        data_scaled = scale(data, method='min_max')

        # calculate bias
        bias_nT = df_metrics.bias(data_scaled)
        bias_dict = bias_nT._asdict()

        # calculate ubRMSD
        ubRMSD_nT = df_metrics.ubrmsd(data_scaled)
        ubRMSD_dict = ubRMSD_nT._asdict()

        for tds_name in self.tds_names:
            R = pearson_R[tds_name]
            p_R = pearson_p[tds_name]
            rho = spea_rho[tds_name]
            p_rho = spea_p[tds_name]
            bias = bias_dict[tds_name]
            ubRMSD = ubRMSD_dict[tds_name]

            split_tds_name = tds_name.split('_and_')
            tds_name_key = "{:}_{:}".format(
                self.ds_names_lut[split_tds_name[0]],
                self.ds_names_lut[split_tds_name[1]])

            dataset['{:}_{:}_R'.format(tds_name_key, season)][0] = R
            dataset['{:}_{:}_p_R'.format(tds_name_key, season)][0] = p_R
            dataset['{:}_{:}_rho'.format(tds_name_key, season)][0] = rho
            dataset['{:}_{:}_p_rho'.format(tds_name_key, season)][0] = \
                p_rho
            dataset['{:}_{:}_bias'.format(tds_name_key, season)][0] = bias
            dataset['{:}_{:}_ubrmsd'.format(tds_name_key, season)][0] = \
                ubRMSD

    return dataset
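# The block above rescales with 'min_max' so that bias and ubRMSD are
# computed in a common value range. A minimal sketch with synthetic data;
# the column names and values are illustrative, only scaling.scale and the
# df_metrics calls mirror the usage above.
import numpy as np
import pandas as pd

from pytesmo import df_metrics, scaling

df = pd.DataFrame({'ref': np.linspace(0.10, 0.40, 200),
                   'k1': np.linspace(10.0, 60.0, 200),   # different unit
                   'k2': np.linspace(0.05, 0.45, 200)})

data_scaled = scaling.scale(df, method='min_max', reference_index=0)
print(df_metrics.bias(data_scaled))    # bias after range matching
print(df_metrics.ubrmsd(data_scaled))  # ubRMSD after range matching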