def test_ubrmsd():
    """
    Test for ubrmsd
    """
    # example 1
    x = np.arange(10)
    y = np.arange(10) + 2

    ubrmsd_pred = 0
    ubrmsd_obs = met.ubrmsd(x, y)

    nptest.assert_equal(ubrmsd_obs, ubrmsd_pred)
    # also check consistency with direct formula
    ubrmsd_direct = np.sqrt(met.rmsd(x, y) ** 2 - met.bias(x, y) ** 2)
    nptest.assert_equal(ubrmsd_obs, ubrmsd_direct)

    # example 2, with outlier
    x = np.arange(10)
    y = np.arange(10) + 2
    y[-1] = 100.

    ubrmsd_pred = 26.7
    ubrmsd_obs = met.ubrmsd(x, y)

    nptest.assert_almost_equal(ubrmsd_obs, ubrmsd_pred, 6)
    # also check consistency with direct formula
    ubrmsd_direct = np.sqrt(met.rmsd(x, y) ** 2 - met.bias(x, y) ** 2)
    nptest.assert_almost_equal(ubrmsd_obs, ubrmsd_direct)
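# A minimal standalone sketch (not part of the test suite) of the identity the
# test above checks: with bias = mean(x) - mean(y), the unbiased RMSD is the
# RMSD of the mean-removed series, so ubRMSD**2 == RMSD**2 - bias**2 exactly.
import numpy as np

x = np.arange(10, dtype=float)
y = x + 2.0
y[-1] = 100.0

rmsd_ = np.sqrt(np.mean((x - y) ** 2))
bias_ = np.mean(x) - np.mean(y)
ubrmsd_ = np.sqrt(np.mean(((x - x.mean()) - (y - y.mean())) ** 2))

# ubrmsd_ is also the (population) standard deviation of the differences
assert np.isclose(ubrmsd_, np.std(x - y))
assert np.isclose(ubrmsd_ ** 2, rmsd_ ** 2 - bias_ ** 2)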
def test_RollingMetrics():
    """
    Test RollingMetrics.
    """
    df = make_some_data()
    df['ref'] += np.random.rand(len(df))
    df['k1'] += np.random.rand(len(df))
    data = df[['ref', 'k1']]

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")  # many warnings due to test data

        metriccalc = RollingMetrics(other_name='k1')
        dataset = metriccalc.calc_metrics(data, gpi_info=(0, 0, 0),
                                          center=False)

    # test pearson r
    ref_array = df['ref'].rolling('30d').corr(df['k1'])
    np.testing.assert_almost_equal(dataset['R'][0], ref_array.values)

    # test rmsd
    indexer = np.arange(30)[None, :] + np.arange(len(df) - 30)[:, None]
    rmsd_arr = []
    for i in range(indexer.shape[0]):
        rmsd_arr.append(
            metrics.rmsd(df['ref'][indexer[i, :]],
                         df['k1'][indexer[i, :]]))

    rmsd_arr = np.array(rmsd_arr)
    np.testing.assert_almost_equal(dataset['RMSD'][0][29:-1], rmsd_arr)
def test_RollingMetrics():
    """
    Test RollingMetrics.
    """
    df = make_some_data()
    df["ref"] += np.random.rand(len(df))
    df["k1"] += np.random.rand(len(df))
    data = df[["ref", "k1"]]

    metriccalc = RollingMetrics(other_name="k1")
    dataset = metriccalc.calc_metrics(data, gpi_info=(0, 0, 0), center=False)

    # test pearson r
    ref_array = df["ref"].rolling("30d").corr(df["k1"])
    np.testing.assert_almost_equal(dataset["R"][0], ref_array.values)

    # test rmsd
    indexer = np.arange(30)[None, :] + np.arange(len(df) - 30)[:, None]
    rmsd_arr = []
    for i in range(indexer.shape[0]):
        rmsd_arr.append(
            metrics.rmsd(df["ref"][indexer[i, :]].values,
                         df["k1"][indexer[i, :]].values))

    rmsd_arr = np.array(rmsd_arr)
    np.testing.assert_almost_equal(dataset["RMSD"][0][29:-1], rmsd_arr)
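# A small standalone sketch (names here are illustrative, not from the test
# suite) of the indexer trick used above: broadcasting a row vector of
# within-window offsets against a column vector of window start positions
# gives a (n_windows, window) matrix of row indices, i.e. all overlapping
# windows at once.
import numpy as np

window = 3
values = np.arange(10)
indexer = np.arange(window)[None, :] + np.arange(len(values) - window)[:, None]

print(indexer[:3])
# [[0 1 2]
#  [1 2 3]
#  [2 3 4]]
windows = values[indexer]      # shape (7, 3), one row per rolling window
print(windows.mean(axis=1))    # e.g. a rolling mean computed in one shot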
def test_rmsd_mse():
    """
    Test for rmsd and mse
    """
    # example 1
    x = np.random.randn(1000)
    y = np.random.randn(1000)

    rmsd_pred = met.rmsd(x, y)
    mse_pred, _, _, _ = met.mse(x, y)

    nptest.assert_almost_equal(rmsd_pred ** 2, mse_pred, 6)
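# A standalone numerical sketch (plain NumPy, independent of pytesmo) of the
# relationship the test above relies on, plus the usual split of the MSE into
# bias, variance and correlation terms. That met.mse partitions the error into
# exactly these three terms is an assumption; the identity mse == rmsd**2 and
# the algebraic decomposition below are exact.
import numpy as np

x = np.random.randn(1000)
y = np.random.randn(1000)

mse = np.mean((x - y) ** 2)
rmsd_ = np.sqrt(mse)

mse_bias = (np.mean(x) - np.mean(y)) ** 2
mse_var = (np.std(x) - np.std(y)) ** 2
r = np.corrcoef(x, y)[0, 1]
mse_corr = 2 * np.std(x) * np.std(y) * (1 - r)

assert np.isclose(rmsd_ ** 2, mse)
assert np.isclose(mse, mse_bias + mse_var + mse_corr)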
def test_rmsd():
    """
    Test for rmsd
    """
    # example 1
    x = np.arange(10)
    y = np.arange(10) + 2

    rmsd_pred = 2.
    rmsd_obs = met.rmsd(x, y)

    nptest.assert_equal(rmsd_obs, rmsd_pred)

    # example 2, with outlier
    x = np.arange(10)
    y = np.arange(10) + 2
    y[-1] = 100.

    rmsd_pred = np.sqrt(831.7)
    rmsd_obs = met.rmsd(x, y)

    nptest.assert_almost_equal(rmsd_obs, rmsd_pred, 6)
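# Where the expected value sqrt(831.7) in example 2 comes from: nine of the
# differences y - x equal 2, and the outlier turns the last one into
# 100 - 9 = 91, so the mean squared difference is (9 * 2**2 + 91**2) / 10.
expected_mse = (9 * 2 ** 2 + 91 ** 2) / 10
print(expected_mse)          # 831.7
print(expected_mse ** 0.5)   # ~28.84, the expected RMSD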
def calc_metrics(self, data, gpi_info):
    """
    calculates the desired statistics

    Parameters
    ----------
    data : pandas.DataFrame
        with 2 columns, the first column is the reference dataset
        named 'ref'
        the second column the dataset to compare against named 'other'
    gpi_info : tuple
        of (gpi, lon, lat)

    Notes
    -----
    Kendall tau calculation is optional at the moment
    because the scipy implementation is very slow, which is problematic
    for global comparisons
    """
    dataset = copy.deepcopy(self.result_template)

    dataset["n_obs"][0] = len(data)
    dataset["gpi"][0] = gpi_info[0]
    dataset["lon"][0] = gpi_info[1]
    dataset["lat"][0] = gpi_info[2]

    if len(data) < 10:
        return dataset

    x, y = data["ref"].values, data[self.other_name].values
    R, p_R = metrics.pearsonr(x, y)
    rho, p_rho = metrics.spearmanr(x, y)
    RMSD = metrics.rmsd(x, y)
    BIAS = metrics.bias(x, y)

    dataset["R"][0], dataset["p_R"][0] = R, p_R
    dataset["rho"][0], dataset["p_rho"][0] = rho, p_rho
    dataset["RMSD"][0] = RMSD
    dataset["BIAS"][0] = BIAS

    if self.calc_tau:
        tau, p_tau = metrics.kendalltau(x, y)
        dataset["tau"][0], dataset["p_tau"][0] = tau, p_tau

    return dataset
def calc_metrics(self, data, gpi_info):
    """
    calculates the desired statistics

    Parameters
    ----------
    data : pandas.DataFrame
        with 2 columns, the first column is the reference dataset
        named 'ref'
        the second column the dataset to compare against named 'other'
    gpi_info : tuple
        of (gpi, lon, lat)

    Notes
    -----
    Kendall tau is not calculated at the moment
    because the scipy implementation is very slow, which is problematic
    for global comparisons
    """
    dataset = copy.deepcopy(self.result_template)

    dataset['n_obs'][0] = len(data)
    dataset['gpi'][0] = gpi_info[0]
    dataset['lon'][0] = gpi_info[1]
    dataset['lat'][0] = gpi_info[2]

    if len(data) < 10:
        return dataset

    x, y = data['ref'].values, data['other'].values
    R, p_R = metrics.pearsonr(x, y)
    rho, p_rho = metrics.spearmanr(x, y)
    # tau, p_tau = metrics.kendalltau(x, y)

    RMSD = metrics.rmsd(x, y)
    BIAS = metrics.bias(x, y)

    dataset['R'][0], dataset['p_R'][0] = R, p_R
    dataset['rho'][0], dataset['p_rho'][0] = rho, p_rho
    # dataset['tau'][0], dataset['p_tau'][0] = tau, p_tau
    dataset['RMSD'][0] = RMSD
    dataset['BIAS'][0] = BIAS

    return dataset
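# A minimal standalone sketch of what the calculators above compute for a
# single grid point, using only pytesmo metric functions that already appear
# in the code above; the DataFrame and its column values are made up for
# illustration.
import numpy as np
import pandas as pd
from pytesmo import metrics

data = pd.DataFrame({
    'ref': np.random.rand(100),
    'other': np.random.rand(100),
})

x, y = data['ref'].values, data['other'].values
results = {
    'n_obs': len(data),
    'R': metrics.pearsonr(x, y)[0],
    'rho': metrics.spearmanr(x, y)[0],
    'RMSD': metrics.rmsd(x, y),
    'BIAS': metrics.bias(x, y),
}
print(results)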
def compare_data(ismn_data, validation_data, scaling='linreg', anomaly=None):
    """
    Compare data from an ISMN station to the defined validation datasets.

    Parameters
    ----------
    ismn_data: pandas.DataFrame
        Data from the ISMN used as a reference
    validation_data: dict
        Dictionary of pandas.DataFrames, one for each dataset to
        compare against
    scaling: string, optional
        Scaling method to use.
    anomaly: string
        If set then the validation is done for anomalies.
    """
    insitu_label = 'soil moisture'

    if anomaly is not None:
        # note: this branch operates on ascat_masked / ISMN_data, which are
        # not defined inside this function but in the surrounding script
        if anomaly == 'climatology':
            ascat_clim = anomaly_calc.calc_climatology(
                ascat_masked[ascat_label])
            insitu_clim = anomaly_calc.calc_climatology(
                ismn_data['soil moisture'])

            ascat_anom = anomaly_calc.calc_anomaly(
                ascat_masked[ascat_label], climatology=ascat_clim)
            ascat_masked[ascat_label] = ascat_anom.values

            insitu_anom = anomaly_calc.calc_anomaly(
                ISMN_data['insitu'], climatology=insitu_clim)
            ISMN_data['insitu'] = insitu_anom.values

        if anomaly == 'average':
            ascat_anom = anomaly_calc.calc_anomaly(ascat_masked[ascat_label])
            ascat_masked[ascat_label] = ascat_anom.values

            insitu_anom = anomaly_calc.calc_anomaly(ISMN_data['insitu'])
            ISMN_data['insitu'] = insitu_anom.values

        ascat_masked = ascat_masked.dropna()
        ISMN_data = ISMN_data.dropna()

    for dname in validation_data:
        vdata = validation_data[dname]
        vdata_label = 'cci_sm'

        matched_data = temp_match.matching(ismn_data, vdata, window=1)

        if scaling != 'noscale' and scaling != 'porosity':
            scaled_data = scale.add_scaled(matched_data,
                                           label_in=vdata_label,
                                           label_scale=insitu_label,
                                           method=scaling)
            scaled_label = vdata_label + '_scaled_' + scaling
            scaled_data = scaled_data[[insitu_label, scaled_label]]

        elif scaling == 'noscale':
            scaled_data = matched_data[[insitu_label, vdata_label]]
            scaled_label = vdata_label

        # scaled_data.rename(columns={'insitu': ISMN_ts_name}, inplace=True)

        labels, values = scaled_data.to_dygraph_format()

        ascat_insitu = {'labels': labels, 'data': values}

        x = scaled_data[insitu_label].values
        y = scaled_data[scaled_label].values

        kendall, p_kendall = sc_stats.kendalltau(x.tolist(), y.tolist())
        spearman, p_spearman = sc_stats.spearmanr(x, y)
        pearson, p_pearson = sc_stats.pearsonr(x, y)
        rmsd = metrics.rmsd(x, y)
        bias = metrics.bias(y, x)
        mse, mse_corr, mse_bias, mse_var = metrics.mse(x, y)

        statistics = {
            'kendall': {'v': '%.2f' % kendall, 'p': '%.4f' % p_kendall},
            'spearman': {'v': '%.2f' % spearman, 'p': '%.4f' % p_spearman},
            'pearson': {'v': '%.2f' % pearson, 'p': '%.4f' % p_pearson},
            'bias': '%.4f' % bias,
            'rmsd': {'rmsd': '%.4f' % np.sqrt(mse),
                     'rmsd_corr': '%.4f' % np.sqrt(mse_corr),
                     'rmsd_bias': '%.4f' % np.sqrt(mse_bias),
                     'rmsd_var': '%.4f' % np.sqrt(mse_var)},
            'mse': {'mse': '%.4f' % mse,
                    'mse_corr': '%.4f' % mse_corr,
                    'mse_bias': '%.4f' % mse_bias,
                    'mse_var': '%.4f' % mse_var}
        }

        scaling_options = {'noscale': 'No scaling',
                           'porosity': 'Scale using porosity',
                           'linreg': 'Linear Regression',
                           'mean_std': 'Mean - standard deviation',
                           'min_max': 'Minimum,maximum',
                           'lin_cdf_match': 'Piecewise <br> linear CDF matching',
                           'cdf_match': 'CDF matching'}

        settings = {
            'scaling': scaling_options[scaling],
            # 'snow_depth': mask['snow_depth'],
            # 'surface_temp': mask['st_l1'],
            # 'air_temp': mask['air_temp']
        }

        era_data = {'labels': [], 'data': []}

        output_data = {'validation_data': ascat_insitu,
                       'masking_data': era_data,
                       'statistics': statistics,
                       'settings': settings}

    return output_data, 1
scaled_data.plot(secondary_y=[label_ascat])
plt.show()

plt.scatter(matched_data[scaled_ascat_label].values,
            matched_data[label_insitu].values)
plt.xlabel(scaled_ascat_label)
plt.ylabel(label_insitu)
plt.show()

# calculate correlation coefficients, RMSD, bias, Nash Sutcliffe
x, y = matched_data[scaled_ascat_label].values, matched_data[
    label_insitu].values

print("ISMN time series:", ISMN_time_series)
print("compared to")
print(ascat_time_series)
print("Results:")
print("Pearson's (R, p_value)", metrics.pearsonr(x, y))
print("Spearman's (rho, p_value)", metrics.spearmanr(x, y))
print("Kendall's (tau, p_value)", metrics.kendalltau(x, y))
print("RMSD", metrics.rmsd(x, y))
print("Bias", metrics.bias(x, y))
print("Nash Sutcliffe", metrics.nash_sutcliffe(x, y))

i += 1

# only show the first 2 stations, otherwise this program would run a long time
# and produce a lot of plots
if i >= 2:
    break
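# A short standalone sketch of the Nash-Sutcliffe efficiency printed above
# (plain NumPy; treating x as the observed and y as the modelled series is an
# assumption about the argument convention):
#
#     NSE = 1 - sum((x - y)**2) / sum((x - mean(x))**2)
#
import numpy as np

x = np.array([0.20, 0.25, 0.30, 0.28, 0.22])   # observed (made-up values)
y = np.array([0.22, 0.24, 0.27, 0.30, 0.21])   # modelled (made-up values)

nse = 1 - np.sum((x - y) ** 2) / np.sum((x - np.mean(x)) ** 2)
print(nse)   # 1 is a perfect match, values <= 0 mean no skill beyond the mean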
def _calc_validation_metrics(self):
    """
    Calculate vertical metrics between candidate and reference using
    pytesmo. Currently implemented: bias, mad, rmsd, nrmsd.

    Returns
    -------
    df_validation_metrics: pd.DataFrame
        Data Frame that contains the metrics between the candidate and
        reference for the two groups and the full frame
    """
    df_validation_metrics = pd.DataFrame()

    for group_no, subset_data in enumerate([self.set0, self.set1,
                                            self.setfull]):
        if group_no in [0, 1]:
            group = 'group%i' % group_no
        else:
            group = 'FRAME'

        if 'bias' in self.metrics:
            if any([subset_data[col].empty for col in
                    [self.candidate_name, self.reference_name]]):
                bias = np.nan
            else:
                bias = metrics.bias(subset_data[self.reference_name].values,
                                    subset_data[self.candidate_name].values)
            df_validation_metrics.at['bias', '%s' % group] = bias

        if 'mad' in self.metrics:
            if any([subset_data[col].empty for col in
                    [self.candidate_name, self.reference_name]]):
                mad = np.nan
            else:
                mad = metrics.mad(subset_data[self.reference_name].values,
                                  subset_data[self.candidate_name].values)
            df_validation_metrics.at['mad', '%s' % group] = mad

        if 'rmsd' in self.metrics:
            if any([subset_data[col].empty for col in
                    [self.candidate_name, self.reference_name]]):
                rmsd = np.nan
            else:
                rmsd = metrics.rmsd(subset_data[self.reference_name].values,
                                    subset_data[self.candidate_name].values)
            df_validation_metrics.at['rmsd', '%s' % group] = rmsd

        if 'nrmsd' in self.metrics:
            if any([subset_data[col].empty for col in
                    [self.candidate_name, self.reference_name]]):
                nrmsd = np.nan
            else:
                nrmsd = metrics.nrmsd(subset_data[self.reference_name].values,
                                      subset_data[self.candidate_name].values)
            df_validation_metrics.at['nrmsd', '%s' % group] = nrmsd

        if 'PearsonR' in self.metrics:
            if any([subset_data[col].empty for col in
                    [self.candidate_name, self.reference_name]]):
                pr, pp = np.nan, np.nan
            else:
                with warnings.catch_warnings():  # suppress scipy warnings
                    warnings.filterwarnings('ignore')
                    pr, pp = metrics.pearsonr(
                        subset_data[self.reference_name].values,
                        subset_data[self.candidate_name].values)
            df_validation_metrics.at['PearsonR', '%s' % group] = pr
            df_validation_metrics.at['Pp', '%s' % group] = pp

        if 'SpearmanR' in self.metrics:
            if any([subset_data[col].empty for col in
                    [self.candidate_name, self.reference_name]]):
                sr, sp = np.nan, np.nan
            else:
                with warnings.catch_warnings():  # suppress scipy warnings
                    warnings.filterwarnings('ignore')
                    sr, sp = metrics.spearmanr(
                        subset_data[self.reference_name].values,
                        subset_data[self.candidate_name].values)
            df_validation_metrics.at['SpearmanR', '%s' % group] = sr
            df_validation_metrics.at['Sp', '%s' % group] = sp

    return df_validation_metrics
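# A compact standalone sketch of the same pattern (column and group names are
# made up, two synthetic subsets plus the full frame): loop over the subsets
# and fill one DataFrame cell per metric and group via DataFrame.at, as the
# method above does.
import numpy as np
import pandas as pd
from pytesmo import metrics

rng = np.random.default_rng(0)
full = pd.DataFrame({'candidate': rng.random(200),
                     'reference': rng.random(200)})
subsets = {'group0': full.iloc[:100], 'group1': full.iloc[100:], 'FRAME': full}

df_validation_metrics = pd.DataFrame()
for group, subset in subsets.items():
    ref = subset['reference'].values
    cand = subset['candidate'].values
    df_validation_metrics.at['bias', group] = metrics.bias(ref, cand)
    df_validation_metrics.at['rmsd', group] = metrics.rmsd(ref, cand)

print(df_validation_metrics)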