def find_pvalue_violation_indices_continuous(n, U, S, R, max_pvalue, max_pvalue_policy):
    """Collect index pairs whose Welch t-test p-value exceeds ``max_pvalue``.

    ``U``, ``S`` and ``R`` hold per-cell means, standard deviations and sample
    counts.  The policy selects which cell pairs are compared:
    "all" compares every earlier cell against every later candidate cell,
    "consecutive" compares each cell only against column ``i + 1`` of later rows.

    Returns a list of ``([i, t], [j, k])`` index pairs violating the threshold.
    """
    violations = []
    if max_pvalue_policy == "all":
        for i in range(n - 1):
            for t in range(i + 1):
                mean_a, std_a, nobs_a = U[i][t], S[i][t], R[i][t]
                for j in range(i + 1, n):
                    for k in range(i + 1, j + 1):
                        # Positional False => Welch's unequal-variance test.
                        pval = stats.ttest_ind_from_stats(
                            mean_a, std_a, nobs_a,
                            U[j][k], S[j][k], R[j][k], False)[1]
                        if pval > max_pvalue:
                            violations.append(([i, t], [j, k]))
    elif max_pvalue_policy == "consecutive":
        for i in range(n - 1):
            for k in range(i + 1):
                mean_a, std_a, nobs_a = U[i][k], S[i][k], R[i][k]
                for j in range(i + 1, n):
                    pval = stats.ttest_ind_from_stats(
                        mean_a, std_a, nobs_a,
                        U[j][i + 1], S[j][i + 1], R[j][i + 1], False)[1]
                    if pval > max_pvalue:
                        violations.append(([i, k], [j, i + 1]))
    return violations
def t_test(group1, group2):
    """Two-sample t-test preceded by an F-test for equality of variances.

    An F-test on the sample-corrected variances decides whether the pooled
    (equal-variance) or Welch (unequal-variance) t-test is applied.

    Returns [mean1, std1, mean2, std2, f_pvalue, statistic, pvalue], where
    std1/std2 are the *population* standard deviations of each group.
    """
    mean1, mean2 = np.mean(group1), np.mean(group2)
    std1, std2 = np.std(group1), np.std(group2)
    nobs1, nobs2 = len(group1), len(group2)
    # Rescale population std to sample std (ddof=1) via sqrt(n / (n - 1)).
    modified_std1 = np.sqrt(np.float32(nobs1) / np.float32(nobs1 - 1)) * std1
    modified_std2 = np.sqrt(np.float32(nobs2) / np.float32(nobs2 - 1)) * std2
    # F-test: ratio of sample variances against an F(n1-1, n2-1) distribution.
    f1 = np.square(modified_std1) / np.square(modified_std2)
    fp = 1 - f.cdf(f1, nobs1 - 1, nobs2 - 1)
    # Variances look equal when the F-test fails to reject at the 5% level.
    (statistic, pvalue) = stats.ttest_ind_from_stats(
        mean1=mean1, std1=modified_std1, nobs1=nobs1,
        mean2=mean2, std2=modified_std2, nobs2=nobs2,
        equal_var=fp > 0.05)
    return [mean1, std1, mean2, std2, fp, statistic, pvalue]
def compute_statistics(dataframe, n_iterations, run_test=True, csv_out='comparison_models.csv'):
    """Compare model performances using pairwise t-tests from summary statistics.

    Groups the results dataframe (generated by ``scripts/automate_training.py``
    with ``--run-test``) by ``path_output``, prints the per-model average and
    standard deviation dataframes, then computes a matrix of p-values between
    every pair of configurations.  All statistics can be combined and stored
    in a csv.

    Usage example::

        ivadomed_compare_models -df results.csv -n 2 --run_test

    Args:
        dataframe (pandas.DataFrame): Dataframe of results generated by
            automate_training. Flag: ``--dataframe``, ``-df``
        n_iterations (int): Number of times each experiment (i.e. set of
            parameters) was run. Flag: ``--n_iteration``, ``-n``
        run_test (int): If True the comparison uses the testing sub-dataset
            performances, otherwise the training/validation ones.
            Flag: ``--run_test``
        csv_out (string): Output csv name to store computed values
            (e.g., df.csv); skipped when None. Flag: ``-o``, ``--output``
    """
    avg = dataframe.groupby(['path_output']).mean()
    std = dataframe.groupby(['path_output']).std()
    print("Average dataframe")
    print(avg)
    print("Standard deviation dataframe")
    print(std)

    config_logs = list(avg.index.values)
    # Metric column depends on which sub-dataset is being compared.
    metric = "test_dice" if run_test else "best_validation_dice"
    p_values = np.zeros((len(config_logs), len(config_logs)))
    for i, confA in enumerate(config_logs):
        for j, confB in enumerate(config_logs):
            p_values[i, j] = ttest_ind_from_stats(
                mean1=avg.loc[confA][metric], std1=std.loc[confA][metric],
                nobs1=n_iterations,
                mean2=avg.loc[confB][metric], std2=std.loc[confB][metric],
                nobs2=n_iterations).pvalue

    p_df = pd.DataFrame(p_values, index=config_logs, columns=config_logs)
    print("P-values dataframe")
    print(p_df)
    if csv_out is not None:
        # The "Unnamed: 0" column corresponds to the run number, so remove it
        # and add prefixes for better readability.
        df_concat = pd.concat(
            [avg.add_prefix('avg_').drop(['avg_Unnamed: 0'], axis=1),
             std.add_prefix('std_').drop(['std_Unnamed: 0'], axis=1),
             p_df.add_prefix('p-value_')], axis=1)
        df_concat.to_csv(csv_out)
def gradeStats(coursetitle, dataframe):
    """Print per-color (green/gold/black) grade statistics for one course.

    Looks up the course column in ``dataframe``, walks the module-level
    ``grid`` rows accumulating grades per row color (``row[0]``) while
    skipping NaN entries, then prints each color's standard deviation and the
    pairwise t-test p-values between colors.
    """
    index = dataframe.columns.values.tolist().index(coursetitle)
    totals = {'green': 0, 'gold': 0, 'black': 0}
    counts = {'green': 0, 'gold': 0, 'black': 0}
    values = {'green': [], 'gold': [], 'black': []}
    for row in grid:
        # NaN != NaN, so this comparison skips missing grades.  (Fix: removed
        # a stray line-continuation backslash that broke the syntax here.)
        # Add "and row[index] != 0" to also exclude pass/fail grades.
        if row[index] == row[index] and row[0] in totals:
            totals[row[0]] += row[index]
            counts[row[0]] += 1
            values[row[0]].append(row[index])
    # Average defaults to 0 when a color has no grades; narrow the previous
    # bare excepts to the only error division can realistically raise.
    avgs = {}
    for color in ('green', 'gold', 'black'):
        try:
            avgs[color] = totals[color] / counts[color]
        except ZeroDivisionError:
            avgs[color] = 0
    green_avg, gold_avg, black_avg = avgs['green'], avgs['gold'], avgs['black']
    green_stdev = np.std(values['green'])
    gold_stdev = np.std(values['gold'])
    black_stdev = np.std(values['black'])
    green_gold_ttest = stats.ttest_ind_from_stats(
        green_avg, green_stdev, counts['green'], gold_avg, gold_stdev, counts['gold'])
    green_black_ttest = stats.ttest_ind_from_stats(
        green_avg, green_stdev, counts['green'], black_avg, black_stdev, counts['black'])
    gold_black_ttest = stats.ttest_ind_from_stats(
        gold_avg, gold_stdev, counts['gold'], black_avg, black_stdev, counts['black'])
    print('green stdev in %sis ' % coursetitle, green_stdev)
    print('gold stdev in %sis ' % coursetitle, gold_stdev)
    print('black stdev in %sis ' % coursetitle, black_stdev)
    print()
    print('green/gold has p = ', green_gold_ttest[1])
    print('green/black has p = ', green_black_ttest[1])
    print('gold/black has p = ', gold_black_ttest[1])
    print()
def calculate_stats():
    """Load positive/negative/neutral tweet stats and print pairwise t-tests.

    Reads the three csv files from ``data/``, then for each of the first six
    numeric columns prints the t-test of positive vs negative (1), positive vs
    neutral (2) and negative vs neutral (3).
    """
    positive_stats = pd.read_csv('data/positive.csv').drop(['Unnamed: 0'], axis=1)
    negative_stats = pd.read_csv('data/negative.csv').drop(['Unnamed: 0'], axis=1)
    neither_stats = pd.read_csv('data/neither.csv').drop(['Unnamed: 0'], axis=1)
    print()
    names = ['num_retweets', 'num_retweets_norm', 'num_faves',
             'num_faves_norm', 'steps', 'followers', 'sentiment']
    # Convert each frame to an ndarray once instead of per column.
    # Fix: np.float was removed in NumPy 1.24 -- use the builtin float.
    pos = np.array(positive_stats)
    neg = np.array(negative_stats)
    neu = np.array(neither_stats)
    print('-------------------------T - test:-------------------------')
    for i in range(0, 6):
        print('###', names[i], '###')
        m1 = np.mean(pos[:, i].astype(float))
        std1 = np.std(pos[:, i].astype(float))
        num1 = len(pos)
        m2 = np.mean(neg[:, i].astype(float))
        std2 = np.std(neg[:, i].astype(float))
        num2 = len(neg)
        m3 = np.mean(neu[:, i].astype(float))
        std3 = np.std(neu[:, i].astype(float))
        num3 = len(neu)
        print('1', ss.ttest_ind_from_stats(m1, std1, num1, m2, std2, num2))
        print('2', ss.ttest_ind_from_stats(m1, std1, num1, m3, std3, num3))
        print('3', ss.ttest_ind_from_stats(m2, std2, num2, m3, std3, num3))
def extract_poly_values(site, sensor, folder, all_touched=True, print_p=False):
    """Extract plot values from images and t-test cultural vs natural polygons.

    For every ``*.tif`` in the site/sensor/folder VI directory, extracts zonal
    statistics for the natural and cultural shapefiles and runs Welch's t-test
    per vegetation index.

    Returns:
        (results_natural, results_cultural, pList): per-VI stats dicts and the
        list of p-values, one per VI key.
    """
    from scipy.stats import ttest_ind_from_stats
    viFolder = os.path.join('/Volumes/RASMUS_1/Satellite/remains_sites', site, sensor, folder)
    imgList = glob.glob(viFolder + '/*.tif')
    results_natural = {}
    results_cultural = {}
    for img in imgList:
        # calculate natural background
        inShpNat = '/Volumes/RASMUS_1/Satellite/analysis/shapefiles/' + site + '_natural_' + sensor + '_poly.shp'
        natural = extract_from_img(inShpNat, img, band=1, all_touched=all_touched)
        # calculate cultural
        inShpCul = '/Volumes/RASMUS_1/Satellite/analysis/shapefiles/' + site + '_cultural_' + sensor + '_poly.shp'
        cultural = extract_from_img(inShpCul, img, band=1, all_touched=all_touched)
        # Vegetation-index name: last "_"-separated chunk without ".tif".
        imgSplit = img.split('_')
        vi = imgSplit[-1][:-4]
        results_natural[vi] = natural[0]
        results_cultural[vi] = cultural[0]
    pList = []
    for key in results_natural:
        if print_p:
            # Fix: Python 2 print statement -- the rest of the file is Python 3.
            print(key)
        t, p = ttest_ind_from_stats(results_natural[key]['mean'],
                                    results_natural[key]['std'],
                                    results_natural[key]['count'],
                                    results_cultural[key]['mean'],
                                    results_cultural[key]['std'],
                                    results_cultural[key]['count'],
                                    equal_var=False)
        pList.append(p)
    return results_natural, results_cultural, pList
def _is_significant_slice(slice_metric: float, slice_std_dev: float,
                          slice_weight: float, base_metric: float,
                          base_std_dev: float, base_weight: float,
                          comparison_type: Text, alpha: float) -> Tuple[bool, float]:
    """Welch's t-test of a slice metric against the base metric.

    Returns (is_significant, one_sided_p_value), where significance means the
    one-sided p-value is below ``alpha``.
    """
    # Preconditions for a meaningful t-test.
    assert base_std_dev > 0, ('base_std_dev must be positive, but got '
                              '{}.'.format(base_std_dev))
    assert slice_std_dev > 0, ('slice_std_dev must be positive, but got '
                               '{}.'.format(slice_std_dev))
    assert base_weight > 1, ('base_weight must be greater than 1, but got '
                             '{}.'.format(base_weight))
    assert slice_weight > 1, ('slice_weight must be greater than 1, but got '
                              '{}.'.format(slice_weight))
    try:
        p_value_two_sided = stats.ttest_ind_from_stats(
            slice_metric, slice_std_dev, slice_weight,
            base_metric, base_std_dev, base_weight, equal_var=False)[1]
    except ZeroDivisionError:
        raise ZeroDivisionError(
            'invalid ttest for params: slice_metric={}, '
            'slice_std_dev={}, slice_weight={}, '
            'base_metric={}, base_std_dev={}, base_weight={}, '.format(
                slice_metric, slice_std_dev, slice_weight, base_metric,
                base_std_dev, base_weight))
    metric_diff = slice_metric - base_metric
    # Direction of the difference determines the one-sided conversion.
    one_sided_p_value = _two_sided_to_one_sided_pvalue(
        p_value_two_sided, metric_diff, comparison_type=comparison_type)
    return one_sided_p_value < alpha, one_sided_p_value
def calc_t_pvalue(test_group_average: np.float64,
                  test_group_stdev: np.float64,
                  test_group_nobs: np.float64,
                  control_group_average: np.float64,
                  control_group_stdev: np.float64,
                  control_group_nobs: np.float64) -> np.float64:
    """Performs the T-test to compare two averages.

    Uses Welch's unequal-variance form of the test.

    Args:
        test_group_average: Average KPI value for the test group.
        test_group_stdev: Standard deviation of KPI value for the test group.
        test_group_nobs: Number of observations in the test group.
        control_group_average: Average KPI value for the control group.
        control_group_stdev: Standard deviation of KPI value for the control group.
        control_group_nobs: Number of observations in the control group.

    Returns:
        p-value from the T-test.
    """
    result = stats.ttest_ind_from_stats(
        mean1=test_group_average,
        std1=test_group_stdev,
        nobs1=test_group_nobs,
        mean2=control_group_average,
        std2=control_group_stdev,
        nobs2=control_group_nobs,
        equal_var=False)
    return result.pvalue
def print_individual_p_values():
    """Print Welch's t-test p-values computed manually and via scipy.

    Uses Welch's unequal-variance t statistic (Welch, Biometrika, 1947) with
    Welch-Satterthwaite effective degrees of freedom (Satterthwaite, 1946,
    Biometrics Bulletin), then cross-checks the manual two-sided p-values
    against scipy's ttest_ind_from_stats.
    """
    F_WT_err, F_WT_mean, N_WT, _, F_PC_mean, F_PC_err, N_PC, _ = errs_and_N()
    # Rupture force is approximately normal at fixed loading rate, so Welch's
    # formula applies: t = (mean_WT - mean_PC) / sqrt(s1^2/N1 + s2^2/N2).
    pooled_err = np.sqrt(F_WT_err ** 2 / N_WT + F_PC_err ** 2 / N_PC)
    t = (F_WT_mean - F_PC_mean) / pooled_err
    # Welch-Satterthwaite effective degrees of freedom.
    dof_denom = (F_WT_err ** 4 / (N_WT ** 2 * (N_WT - 1)) +
                 F_PC_err ** 4 / (N_PC ** 2 * (N_PC - 1)))
    v = pooled_err ** 4 / dof_denom
    # One-sided p-value from the t distribution, doubled for two-sided.
    p_value_one_sided = 1 - t_distribution.cdf(t, df=v)
    p_value_two_sided = 2 * p_value_one_sided
    # As a check, compute the same thing with scipy for each condition.
    scipy_results = [ttest_ind_from_stats(
        mean1=F_WT_mean[i], std1=F_WT_err[i], nobs1=N_WT[i],
        mean2=F_PC_mean[i], std2=F_PC_err[i], nobs2=N_PC[i],
        equal_var=False) for i in range(3)]
    p_values = [result[1] for result in scipy_results]
    print("Manually calculated p-values: " +
          ",".join(["{:.3g}".format(p) for p in p_value_two_sided]))
    print("Automatically calculated p-values: " +
          ",".join(["{:.3g}".format(p) for p in p_values]))
def student_ttest(X, y, threshold=None, percentile=None):
    """Perform per-feature Student's t-tests between the two classes in ``y``.

    Args:
        X (pd.DataFrame): samples x features.
        y (pd.Series): binary labels aligned with X's rows.
        threshold: keep features with p-value strictly below this value.
        percentile: keep the top fraction (0-1) of features by p-value.
            Exactly one of ``threshold`` / ``percentile`` must be given.

    Returns:
        list: selected feature names sorted by ascending p-value.

    Raises:
        ValueError: if neither or both of threshold/percentile are given.
    """
    # Fix: use explicit None checks so falsy-but-valid values such as
    # threshold=0.0 are still treated as "provided".
    if (threshold is None) == (percentile is None):
        raise ValueError('exactly one of threshold or percentile must be given')
    labels = y.unique()
    p_feature_data = X.loc[y == labels[0], :]
    n_feature_data = X.loc[y == labels[1], :]
    # Vectorized per-column t-test from each group's summary statistics.
    _, p_value = ttest_ind_from_stats(
        p_feature_data.mean(axis=0), p_feature_data.std(axis=0),
        p_feature_data.shape[0],
        n_feature_data.mean(axis=0), n_feature_data.std(axis=0),
        n_feature_data.shape[0])
    sorted_pvalue = pd.Series(data=p_value, index=X.columns).sort_values(ascending=True)
    if threshold is not None:
        return sorted_pvalue[sorted_pvalue < threshold].index.tolist()
    return sorted_pvalue.iloc[:int(sorted_pvalue.shape[0] * percentile)].index.tolist()
def ttest_sub(mean_1, std_1, nyears_1, mean_2, std_2, nyears_2, equal_var=True):
    """
    Sub-routine to call ttest_ind_from_stats from scipy.

    Broadcasts the integer year counts to arrays shaped like ``mean_1`` so
    array-valued means/stds are handled elementwise, and returns the p-value
    (matching the shape of the inputs).

    NOTE(review): nobs is passed as nyears - 1, not nyears -- presumably a
    deliberate degrees-of-freedom choice; confirm with the original authors.
    NOTE(review): the ``equal_var`` parameter is accepted but never forwarded
    to scipy (which therefore uses its default) -- confirm intent.
    """
    # Shape the observation counts like the input means.
    # https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ttest_ind_from_stats.html
    nobs1_arr = (int(nyears_1) - 1) * np.ones_like(mean_1)
    nobs2_arr = (int(nyears_2) - 1) * np.ones_like(mean_1)
    return ttest_ind_from_stats(mean_1, std_1, nobs1_arr,
                                mean_2, std_2, nobs2_arr)[1]
def simulate_binned_t_test(lmb1, t1, lmb2, t2, bin_size=1.0):
    """
    Simulates data from two Poisson distributions, bins them as per the
    bin-size and finds the p-value by passing the binned AIR estimate vectors
    to a two-sided t-test (halved to make it one-sided).

    args:
        lmb1: The failure rate for first population.
        t1: The time observed for first population.
        lmb2: The failure rate for second population.
        t2: The time observed for second population.
        bin_size: The bins into which data is partitioned.

    Returns:
        The one-sided p-value, or None when either population has < 2 bins.
    """
    num_bins1 = int(t1 / bin_size)
    num_bins2 = int(t2 / bin_size)
    if num_bins1 < 2 or num_bins2 < 2:
        print("Not enough bins!")
        return
    n1 = poisson.rvs(lmb1 * t1 / num_bins1, size=num_bins1)
    n2 = poisson.rvs(lmb2 * t2 / num_bins2, size=num_bins2)
    mean1 = np.mean(n1 / bin_size)
    std1 = np.std(n1 / bin_size)
    mean2 = np.mean(n2 / bin_size)
    std2 = np.std(n2 / bin_size)
    # Fix: use the actual number of bins as the sample sizes; the original
    # hard-coded nobs1=nobs2=20 regardless of how many bins were simulated.
    p_val = stats.ttest_ind_from_stats(mean1=mean1, std1=std1, nobs1=num_bins1,
                                       mean2=mean2, std2=std2, nobs2=num_bins2,
                                       equal_var=False).pvalue / 2
    return p_val
def find_min_errors(summary_df, agg_level_name, ttest_pval_th=0.95):
    """
    Runs a pair-wise Welch's t-test between minimum 'mean' value of each group
    of agg_level_name. For any row whose mean is NOT significantly different
    than the minimum error in its group, is_min will be set to True.

    Args:
        summary_df: [In/Out] an aggregate dataframe; must have "mean", "std"
            and "count" columns.
        agg_level_name: a level name in summary_df.
        ttest_pval_th: p-value threshold (default: 0.95).
    """
    from scipy.stats import ttest_ind_from_stats
    summary_df["is_min"] = False
    # Per-group minima broadcast back to each row.  Fix: select the columns
    # with a list -- tuple selection ("mean", "std", "count") was deprecated
    # and then removed in modern pandas.
    # NOTE(review): transform("min") takes the column-wise min of std/count
    # too, which may not come from the same row as the minimum mean -- confirm
    # this is the intended statistic.
    min_at_level = summary_df.groupby(agg_level_name)[["mean", "std", "count"]].transform("min")
    for index, row in summary_df.iterrows():
        t_val, p_val = ttest_ind_from_stats(
            mean1=min_at_level.loc[index]["mean"],
            std1=min_at_level.loc[index]["std"],
            nobs1=min_at_level.loc[index]["count"],
            mean2=row["mean"], std2=row["std"], nobs2=row["count"],
            equal_var=False)
        # A high p-value means this row is indistinguishable from the minimum.
        summary_df.at[index, "is_min"] = p_val >= ttest_pval_th
def _bivariate_student_orderability_from_moments(mu, var, nTrial, type="stat"):
    '''
    :param mu: 1D Array of temporal means for every channel
    :param var: 1D Array of temporal variances for every channel
    :param nTrial: number of trials used to compute temporal moments. Necessary for T-test
    :param type: Depending on output type, returns either T-Test statistic or p-value
    :return: Symmetric (nNode x nNode) matrix of pairwise test results, NaN on the diagonal
    '''
    resultIdx = 0 if type == "stat" else 1
    nNode = len(mu)
    std = np.sqrt(var)
    result = np.full((nNode, nNode), np.nan)
    # Fill the upper triangle with Welch's t-test outcomes, then mirror it.
    for i in range(nNode):
        for j in range(i + 1, nNode):
            pairTest = ttest_ind_from_stats(mu[i], std[i], nTrial,
                                            mu[j], std[j], nTrial,
                                            equal_var=False)
            result[i][j] = pairTest[resultIdx]
            result[j][i] = result[i][j]
    return result
def end_batch(self, batch_meta):
    """Finish a batch: t-test it against history and report to Kapacitor.

    Runs Welch's t-test between the accumulated history statistics and the
    just-finished batch, writes a point carrying the t statistic and p-value
    back to Kapacitor, and folds the batch into the history only when the
    batch looked normal (p-value above the alpha threshold).
    """
    # Default to "no evidence of change" when there is no history yet.
    # Fix: also default t; it was previously unbound on the first batch
    # (history.n == 0) and crashed the fieldsDouble["t"] assignment below.
    t = 0.0
    pvalue = 1.0
    if self._history.n != 0:
        # Perform Welch's t test
        t, pvalue = stats.ttest_ind_from_stats(self._history.mean,
                                               self._history.stddev(),
                                               self._history.n,
                                               self._batch.mean,
                                               self._batch.stddev(),
                                               self._batch.n,
                                               equal_var=False)
    # Send pvalue point back to Kapacitor
    response = udf_pb2.Response()
    response.point.time = batch_meta.tmax
    response.point.name = batch_meta.name
    response.point.group = batch_meta.group
    response.point.tags.update(batch_meta.tags)
    response.point.fieldsDouble["t"] = t
    response.point.fieldsDouble["pvalue"] = pvalue
    self._agent.write_response(response)
    # Update historical stats with batch, but only if it was normal.
    if pvalue > self._alpha:
        for value in self._batch._window:
            self._history.update(value)
def runTTest(con_samples, exp_samples):
    """Welch's t-test per key position between control and experimental dicts.

    Each dict maps a key to a list of samples.  One test is run per control
    key, using population variances converted to standard deviations.

    Returns (tstats, pvalues) lists, one entry per control key.
    """
    con_n = len(con_samples[list(con_samples.keys())[0]])
    con_means = [np.mean(con_samples[key]) for key in con_samples]
    con_var = [np.var(con_samples[key]) for key in con_samples]
    exp_n = len(exp_samples[list(exp_samples.keys())[0]])
    exp_means = []
    exp_var = []
    for key in exp_samples:
        # NOTE(review): exp_n is overwritten on every key (the last key wins)
        # while con_n comes from the first key -- confirm this asymmetry.
        exp_n = len(exp_samples[key])
        exp_means.append(np.mean(exp_samples[key]))
        exp_var.append(np.var(exp_samples[key]))
    tstats = []
    pvalues = []
    for i, (con_mean, con_v) in enumerate(zip(con_means, con_var)):
        t, p = ttest_ind_from_stats(con_mean, np.sqrt(con_v), con_n,
                                    exp_means[i], np.sqrt(exp_var[i]), exp_n,
                                    equal_var=False)
        tstats.append(t)
        pvalues.append(p)
    return tstats, pvalues
def __do_comparison(expression_values1, weights1, day1, expression_values2, weights2, day2,
                    features, fraction_expressed_ratio_add=0.0001):
    """Weighted Welch's t-test comparison of two expression snapshots.

    Computes weighted means/variances per feature, the (weighted) fraction of
    cells expressing each feature, fold changes, t statistics and
    FDR-corrected p-values, returning everything as one DataFrame indexed by
    ``features``.
    """
    mean1 = np.average(expression_values1, weights=weights1, axis=0)
    mean2 = np.average(expression_values2, weights=weights2, axis=0)
    fraction_expressed1 = weights1.dot(expression_values1 > 0)
    fraction_expressed2 = weights2.dot(expression_values2 > 0)
    # Regularized ratio so zero fractions do not divide by zero.
    fraction_expressed_diff = ((fraction_expressed1 + fraction_expressed_ratio_add) /
                               (fraction_expressed2 + fraction_expressed_ratio_add))
    # Weighted variances around the weighted means.
    variance1 = np.average((expression_values1 - mean1) ** 2, weights=weights1, axis=0)
    variance2 = np.average((expression_values2 - mean2) ** 2, weights=weights2, axis=0)
    with np.errstate(invalid="ignore"):
        scores, ttest_pvals = stats.ttest_ind_from_stats(
            mean1=mean1, std1=np.sqrt(variance1), nobs1=len(weights1),
            mean2=mean2, std2=np.sqrt(variance2), nobs2=len(weights2),
            equal_var=False)  # Welch's
    # Undefined tests (e.g. zero variance) are treated as "no difference".
    scores[np.isnan(scores)] = 0
    ttest_pvals[np.isnan(ttest_pvals)] = 1
    fold_change = np.exp(mean1 - mean2)
    return pd.DataFrame(
        index=features,
        data={'fold_change': fold_change,
              'mean1': mean1,
              'mean2': mean2,
              'fraction_expressed1': fraction_expressed1,
              'fraction_expressed2': fraction_expressed2,
              't_score': scores,
              't_pval': ttest_pvals,
              't_fdr': statsmodels.stats.multitest.multipletests(ttest_pvals)[1],
              'fraction_expressed_ratio': fraction_expressed_diff,
              'day1': day1,
              'day2': day2})
def _system_tests_continuous(self, bin_str, n_records_a, mean_a, std_a,
                             n_records_e, mean_e, std_e):
    """Per-bin Welch's t-tests between systems A and E.

    Stores the A/E means on the instance and builds ``self._df_tests`` with
    one row per bin holding the counts, moments, t statistic and p-value.
    """
    self._metric_a = mean_a
    self._metric_e = mean_e
    stat_list = []
    pval_list = []
    for idx in range(len(bin_str)):
        # Positional False => Welch's unequal-variance test.
        stat, pval = stats.ttest_ind_from_stats(
            mean_a[idx], std_a[idx], n_records_a[idx],
            mean_e[idx], std_e[idx], n_records_e[idx], False)
        stat_list.append(stat)
        pval_list.append(pval)
    self._df_tests = pd.DataFrame({
        "Bin": bin_str,
        "Count A": n_records_a,
        "Count E": n_records_e,
        "Mean A": mean_a,
        "Mean E": mean_e,
        "Std A": std_a,
        "Std E": std_e,
        "statistic": stat_list,
        "p-value": pval_list
    })
def main():
    """CLI entry point: run Welch's t-test from summary stats and report.

    Parses means/stds/observation counts for two samples, prints the t
    statistic and p-value, then prints whether the null hypothesis is
    rejected for the requested one-sided test.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--means", nargs=2, type=float)
    parser.add_argument("--stds", nargs=2, type=float)
    parser.add_argument("--observations", nargs=2, type=int)
    parser.add_argument("--alpha", default=0.05, type=float)
    parser.add_argument("--test_type", "--test-type", default="greater-than",
                        choices=("greater-than", "less-than"))
    args = parser.parse_args()
    mean1, mean2 = args.means
    std1, std2 = args.stds
    nobs1, nobs2 = args.observations
    t, p = stats.ttest_ind_from_stats(mean1, std1, nobs1,
                                      mean2, std2, nobs2,
                                      equal_var=False)
    print(t)
    print(p)
    if args.test_type == "greater-than":
        print(greater_than_reject_null(t, p, args.alpha))
    else:
        print(less_than_reject_null(t, p, args.alpha))
def compare_tests(n=1e4, alpha=np.array([.01, .25, .3, .4, .45, .5]), lmb=12.0, mu=12.0):
    """Compare rejection rates of a one-sided t-test vs a Poisson-difference test.

    Repeatedly simulates 20 Poisson counts per population, applies both the
    Poisson rate-difference test and a one-sided Welch t-test at each alpha,
    and prints the fraction of simulations each test rejects (t-test first).
    """
    t_reject = np.zeros(len(alpha))
    pois_reject = np.zeros(len(alpha))
    for _ in range(int(n)):
        t = 10e3 / 4 / 1000
        s = 10e3 / 4 / 1000
        n1s = poisson.rvs(lmb * t, size=20)
        n2s = poisson.rvs(mu * s, size=20)
        rate1 = n1s / t
        rate2 = n2s / s
        # Poisson difference-of-rates test on the pooled counts.
        n1 = sum(n1s)
        n2 = sum(n2s)
        d = n2 / (20 * s) - n1 / (20 * t)
        lmb_mix = (n1 + n2) / (t + s) / 20
        p_val2 = pois_diff_sf(d, lmb_mix, lmb_mix, t, s)
        pois_reject += p_val2 < alpha
        # One-sided Welch t-test on the per-interval rate estimates.
        t_score = stats.ttest_ind_from_stats(
            mean1=np.mean(rate1), std1=np.std(rate1), nobs1=20,
            mean2=np.mean(rate2), std2=np.std(rate2), nobs2=20,
            equal_var=False)
        t_reject += t_score.pvalue / 2 < alpha
    print(t_reject / n)
    print(pois_reject / n)
def tTestPredictor(self, blueTeamList, redTeamList):
    """Predict the winning alliance via a t-test over summed team statistics.

    Sums each alliance's mean score, combines standard deviations in
    quadrature, then t-tests blue vs red.  Returns a (winner, p-value) tuple
    where winner is "blue"/"red"/"neither", or "-1" when the t statistic is
    NaN and fails every comparison.
    """
    # TODO: Create testing Function
    blueStats = [0, 0, 0]
    redStats = [0, 0, 0]
    for allianceStats, teamList in ((blueStats, blueTeamList), (redStats, redTeamList)):
        for team in teamList:
            T = Team(team, self.year, self.authKey)
            teamStats = T.totalScoreStats(["mean", "std"])
            allianceStats[0] += teamStats[0]
            # Combine standard deviations in quadrature.
            allianceStats[1] = np.sqrt(pow(allianceStats[1], 2) + pow(teamStats[1], 2))
            allianceStats[2] += teamStats[2]
    tVal, p = sStats.ttest_ind_from_stats(blueStats[0], blueStats[1], blueStats[2],
                                          redStats[0], redStats[1], redStats[2])
    if tVal > 0:
        return "blue", p
    elif tVal < 0:
        return "red", p
    elif tVal == 0:
        return "neither", p
    else:
        # A NaN t statistic fails all three comparisons above.
        return "-1", p
def score(self, X, nbhds, nn_matrix=None):
    """Score features by local-vs-global expression, as signed -log p-values.

    For each neighborhood, compares its mean feature expression against the
    global mean with a t-test from summary statistics, converts the p-values
    to information (-log p), applies the optional corrector, and signs each
    entry by whether the neighborhood mean is above or below the global mean.

    Args:
        X: expression matrix; assumes a scipy sparse cells-x-features matrix
           (it is indexed by rows and supports .todense()) -- TODO confirm.
        nbhds: list of neighbor-index lists, one per neighborhood; the first
           entry's length is used as the per-neighborhood sample size k.
        nn_matrix: optional precomputed (len(nbhds) x n_cells) adjacency
           matrix of the NN graph; built from nbhds when None.

    Returns:
        csr_matrix of shape (len(nbhds), n_features) of signed -log p-values.
    """
    k = len(nbhds[0])
    if nn_matrix is None:
        data = np.ones(np.sum([len(x) for x in nbhds]))
        col_ind = [item for sublist in nbhds for item in sublist]
        row_ind = [
            i for i, sublist in enumerate(nbhds) for item in sublist
        ]
        # sparse adjacency matrix of NN graph
        nn_matrix = csr_matrix((data, (row_ind, col_ind)),
                               shape=(len(nbhds), X.shape[0]))
    # get mean gene expressions within each neighborhood; this matrix may be less sparse
    # NOTE(review): .astype('int') truncates summed expression before the
    # division -- presumably intentional for count data; confirm.
    mean_nbhd_exprs = (nn_matrix * X).astype('int').multiply(
        1 / nn_matrix.sum(axis=1)).tocsr()
    vars = np.zeros((len(nbhds), X.shape[1]))
    for i in range(len(nbhds)):  # gotta go cell by cell
        nbrs = np.array(nbhds[i]).flatten()
        gene_diffs = np.power(
            (X[nbrs, :].todense() - mean_nbhd_exprs[i, :].todense()),
            2)  # diffs of gene expression
        vars[i, :] = gene_diffs.mean(axis=0)
    vars = csr_matrix(vars)
    # Global per-feature means, tiled to one row per neighborhood.
    global_means = np.tile(X.mean(axis=0), (len(nbhds), 1))
    # sign is pos if mean is higher, negative otherwise.
    signs = 2 * (mean_nbhd_exprs.todense() >= global_means).astype('int') - 1
    global_var = np.tile(np.var(X.todense(), axis=0), (len(nbhds), 1))
    # Sample sizes: all cells globally, k cells locally.
    nobs_global = np.tile(X.shape[0], (len(nbhds), X.shape[1]))
    nobs_local = np.tile(k, (len(nbhds), X.shape[1]))
    # Vectorized t-test over every (neighborhood, feature) pair at once.
    wts = ttest_ind_from_stats(
        mean1=mean_nbhd_exprs.todense().flatten(),
        std1=np.array(np.sqrt(vars.todense()).flatten()),
        nobs1=np.array(nobs_local).flatten(),
        mean2=np.array(global_means).flatten(),
        std2=np.array(np.sqrt(global_var)).flatten(),
        nobs2=np.array(nobs_global).flatten()).pvalue.reshape(
            (len(nbhds), X.shape[1]))
    np.nan_to_num(wts, copy=False, nan=1.0)  # nans become pval 1
    wts[wts == 0] = sys.float_info.min  # remove zeros
    if self.corrector is not None:
        wts = self.corrector.correct(wts)
    wts = -1 * np.log(wts)  # convert to info
    np.nan_to_num(wts, copy=False, nan=1.0)  # nans become pval 1
    wts = np.multiply(signs, wts)  # negative if underexpressed
    return (csr_matrix(wts))
def welchsTest(self, mean1, mean2, std1, std2, sampleSize1, sampleSize2):
    """Welch's t statistic for two samples given their summary statistics.

    Returns the t statistic.  When the test is undefined (NaN, e.g. equal
    means with zero variance) falls back to comparing the means, and returns
    0.0 when scipy raises.
    """
    try:
        t = stats.ttest_ind_from_stats(
            mean1, std1, sampleSize1,
            mean2, std2, sampleSize2,
            False).statistic  # False means the variances are unequal
        # Fix: the original used `t != np.nan`, which is always True because
        # NaN compares unequal to everything (including itself), so the
        # intended fallback never triggered.
        return t if not np.isnan(t) else mean1 > mean2
    except Exception:
        # Narrowed from a bare except; keep the best-effort 0.0 fallback.
        return 0.0
def run_studenttest3(dataset_1, dataset_2):
    """Two-sided Student's t-test (equal variances) between two datasets.

    Computes per-dataset means and (population) standard deviations and
    returns the p-value of the pooled-variance t-test.
    """
    mean1 = np.mean(dataset_1, axis=0)
    mean2 = np.mean(dataset_2, axis=0)
    std1 = np.std(dataset_1, axis=0)
    # Fix: std2 was computed from dataset_1 (copy-paste bug).
    std2 = np.std(dataset_2, axis=0)
    nobs1 = len(dataset_1)
    nobs2 = len(dataset_2)
    t, p = stats.ttest_ind_from_stats(mean1, std1, nobs1,
                                      mean2, std2, nobs2,
                                      equal_var=True)
    return p
def solution(array1, array2, s_level=0.05):
    '''
    True, if set1 and set2 are significantly different.
    False, if they are significantly not different.

    Demonstrates three equivalent ways to compute Welch's t-test; the decision
    is taken on the p-value from the explicit formulas (method 3).
    '''
    arr_a = np.array(array1)
    arr_b = np.array(array2)
    # Descriptive statistics (sample variance, ddof=1) for both sets.
    array1_bar, array2_bar = arr_a.mean(), arr_b.mean()
    array1_var, array2_var = arr_a.var(ddof=1), arr_b.var(ddof=1)
    array1_N, array2_N = arr_a.size, arr_b.size
    array1_dof, array2_dof = array1_N - 1, array2_N - 1
    # Method 1: scipy.stats.ttest_ind on the raw samples.
    t_statistics, p_value = ttest_ind(a=arr_a, b=arr_b)
    print("Using scipy.stats.ttest_ind.: t = %g p = %g" % (t_statistics, p_value))
    # Method 2: the same test from summary statistics (Welch).
    t_statistics, p_value = ttest_ind_from_stats(array1_bar, np.sqrt(array1_var), array1_N,
                                                 array2_bar, np.sqrt(array2_var), array2_N,
                                                 equal_var=False)
    print("Using ttest_ind_from_stats: t = %g p = %g" % (t_statistics, p_value))
    # Method 3: the Welch formulas written out directly.
    t_statistics = (array1_bar - array2_bar) / np.sqrt(array1_var / array1_N +
                                                       array2_var / array2_N)
    dof = (array1_var / array1_N + array2_var / array2_N) ** 2 / (
        array1_var ** 2 / (array1_N ** 2 * array1_dof) +
        array2_var ** 2 / (array2_N ** 2 * array2_dof))
    p_value = 2 * stdtr(dof, -np.abs(t_statistics))
    print("Using formulas : t = %g p = %g" % (t_statistics, p_value))
    if p_value > (s_level / 2):
        # Failed to reject null. Both means are not significantly different.
        return False
    # Reject Null. Both means are significantly different.
    return True
def gen_voxel_msn(centers_list, label_eg, label_cg):
    """Voxel-wise t statistic between the experimental and control groups.

    Walks the centers, pulling each center's (mean, std, count) per group via
    get_center_voxel_msn_by_label, and returns the t statistic for the first
    center that has data in both groups (None if no center qualifies).

    NOTE(review): despite the "gen_" name this returns a single value rather
    than yielding per-center results -- confirm only the first center's t
    value is wanted.
    """
    for center in centers_list:
        mean_eg, std_eg, count_eg = get_center_voxel_msn_by_label(center, label_eg)
        mean_cg, std_cg, count_cg = get_center_voxel_msn_by_label(center, label_cg)
        if count_eg and count_cg:
            t, p = ttest_ind_from_stats(mean_eg, std_eg, count_eg,
                                        mean_cg, std_cg, count_cg)
            return t
def plot_results(df, sid):
    """Bar-plot simple vs complex enhancer fold-changes and annotate the t-test.

    Splits ``df`` into "simple" and "complex" architecture rows, draws grouped
    bars with error bars, runs Welch's t-test between the two groups from
    their summary statistics, puts the p-value in the x-label, and saves the
    figure as a pdf.

    Relies on module globals: amber, faded_green (colors) and RE (output
    directory prefix) -- assumes each group has exactly one row, since
    ``.item()`` is used below; TODO confirm.
    """
    simple = df.loc[df.arch.str.contains("simple"),
                    ["arch", "FoldChange_Med", "yerr", "log2_FCmed",
                     "log2yerr", "Observed", "fold_change_std"]]
    simMed, simErr = simple.FoldChange_Med, simple.yerr
    complexenh = df.loc[df.arch.str.contains("complex"),
                        ["arch", "FoldChange_Med", "yerr", "log2_FCmed",
                         "log2yerr", "Observed", "fold_change_std"]]
    comMed, comErr = complexenh.FoldChange_Med, complexenh.yerr
    sids = simple.arch
    ind = np.arange(len(comMed))  # the x locations for the groups
    width = 0.2  # the width of the bars
    barWidth = 0.25
    fig, ax = plt.subplots(figsize=(6, 6))
    # Set position of bar on X axis
    r1 = np.arange(len(comMed))
    r2 = [x + barWidth for x in r1]
    # Make the plot
    plt.bar(r1, simMed, color=amber, width=barWidth, edgecolor='white',
            label='simple', yerr=simErr)
    plt.bar(r2, comMed, color=faded_green, width=barWidth, edgecolor='white',
            label='complexenh', yerr=comErr)
    # Welch's t-test between the simple and complex fold-change distributions.
    result, p = stats.ttest_ind_from_stats(mean1=simple.FoldChange_Med.item(),
                                           std1=simple.fold_change_std.item(),
                                           nobs1=simple.Observed.item(),
                                           mean2=complexenh.FoldChange_Med.item(),
                                           std2=complexenh.fold_change_std.item(),
                                           nobs2=complexenh.Observed.item(),
                                           equal_var=False)
    plt.xlabel("%s, p = %s" % (sid, p))
    plt.ylabel("Fold-change")
    plt.xticks([r + barWidth for r in range(len(comMed))], sids, fontsize=14)
    from matplotlib.ticker import MultipleLocator
    import matplotlib.ticker as ticker
    #ticks = ticker.FuncFormatter(lambda x, pos: '{0:g}'.format(2**x))
    #ax.yaxis.set_major_formatter(ticks)
    # Label each bar with its height ("p" here is a bar patch, shadowing the
    # p-value used above).
    for p in ax.patches:
        ax.annotate("%.2fx" % p.get_height(),
                    (p.get_x() + p.get_width() / 2., p.get_height()-0.05),
                    ha='left', va='bottom', color='gray', xytext=(0, 10),
                    textcoords='offset points')
    # Create legend & Show graphic
    plt.legend(bbox_to_anchor=(1.45, 1))
    ax.yaxis.set_major_locator(MultipleLocator(1))
    sns.set("poster")
    plt.savefig("%sfig4-ROADMAP_%s_matched_GWAS_2019_LDex_p5e-8_untrimmed.pdf" % (RE, sid))
    plt.show()
def two_sample_ttest_descriptive_statistic(mean1, std1, n1, mean2, std2, n2, equal_var=True):
    """
    T-test for means of two independent samples from descriptive statistics.

    This is a two-sided test for the null hypothesis that 2 independent
    samples have identical average (expected) values.

    :param mean1: mean of the first sample
    :param std1: standard deviation of the first sample
    :param n1: size of the first sample
    :param mean2: mean of the second sample
    :param std2: standard deviation of the second sample
    :param n2: size of the second sample
    :param equal_var: pooled test if True, Welch's test otherwise
    :return: scipy result carrying ``statistic`` and ``pvalue``
    """
    return stats.ttest_ind_from_stats(mean1, std1, n1, mean2, std2, n2, equal_var)
def ttest(groups, gm1, gm2, metric):
    """Student's t-test on ``metric`` between two groups of a pandas groupby.

    Pulls the two named groups out of ``groups`` and runs the equal-variance
    t-test from their sample means, standard deviations and sizes.  Returns
    the scipy result (statistic, pvalue).
    """
    first = groups.get_group(gm1)
    second = groups.get_group(gm2)
    return stats.ttest_ind_from_stats(
        mean1=first[metric].mean(), std1=first[metric].std(), nobs1=len(first),
        mean2=second[metric].mean(), std2=second[metric].std(), nobs2=len(second))
def generate_tables(pickle_files, main_body):
    """Build a LaTeX results table from pickled experiment results.

    Aggregates per-dataset means/stds over methods (via the sibling
    raw_result_table and string_table helpers), bolds the best method and any
    method statistically tied with it (Welch's t-test, p >= 0.05), and returns
    the concatenated table as a LaTeX string.

    Assumes raw_table rows are indexed by trial number, so the number of
    trials is max(index) + 1 -- TODO confirm.
    """
    # get raw results
    raw_table = raw_result_table(pickle_files, main_body)
    raw_table = raw_table.replace('V3AE-Uniform', 'V3AE-MLE')
    # aggregate processed results into a table
    table = None
    for data in raw_table['Data'].unique():
        # clean up the name
        new_name = data.replace('_', ' ')
        raw_table.loc[raw_table.Data == data, 'Data'] = new_name
        data = new_name
        # compute means and standard deviations over methods
        experiment = raw_table[raw_table.Data == data]
        groups = ['Data', 'Method'] if main_body else ['Data', 'Method', 'BatchNorm']
        mean = pd.DataFrame(experiment.groupby(groups, sort=False).mean())
        std = pd.DataFrame(experiment.groupby(groups, sort=False).std(ddof=1))
        # build string table
        df = string_table(mean.copy(deep=True), std.copy(deep=True))
        # bold winners if sufficient trials
        n_trials = max(experiment.index) + 1
        if n_trials >= 2:
            # loop over the metrics
            for (metric, order) in [('LL', 'max'), ('RMSE', 'min')]:  #, ('Entropy', 'min')]:
                # get top performer
                i_best = np.argmax(mean[metric]) if order == 'max' else np.argmin(mean[metric])
                # get null hypothesis
                null_mean = mean[metric].to_numpy()[i_best]
                null_std = std[metric].to_numpy()[i_best]
                # compute p-values (Welch's t-test of each method vs the best)
                ms = zip([m for m in mean[metric].to_numpy().tolist()],
                         [s for s in std[metric].to_numpy().tolist()])
                p = [ttest_ind_from_stats(null_mean, null_std, n_trials, m, s, n_trials, False)[-1]
                     for (m, s) in ms]
                # bold statistical ties for best
                for i in range(df.shape[0]):
                    if i == i_best or p[i] >= 0.05:
                        df.loc[mean[metric].index[i], metric] = \
                            '\\textbf{' + df.loc[mean[metric].index[i], metric] + '}'
        # concatenate experiment to results table
        if main_body:
            table = pd.concat([table, df.unstack(level=0).T.swaplevel(0, 1)])
        else:
            table = pd.concat([table, df])
    return table.to_latex(escape=False)
def maybe_balance(force=False):
    """Check whether the pickled train/test splits are balanced.

    For each corresponding (train, test) dataset pair from the module-level
    ``train_datasets``/``test_datasets`` lists, loads both pickles and prints
    the two-sample t-test comparing their means.  Does nothing unless
    ``force`` is True.

    Args:
        force (bool): run the comparison only when explicitly requested.
    """
    # Guard clause instead of nesting the whole body under the condition;
    # `force == True` comparison replaced with plain truthiness.
    if not force:
        return
    for train_dataset, test_dataset in zip(train_datasets, test_datasets):
        print(train_dataset + " " + test_dataset)
        # `with` closes the files; the explicit close() calls were redundant.
        with open(train_dataset, 'rb') as f:
            train_set = pickle.load(f)
        with open(test_dataset, 'rb') as g:
            test_set = pickle.load(g)
        # result = stats.ttest_ind_from_stats(-0.128243,0.443109,52912,-0.132556,0.44502,1873)
        result = stats.ttest_ind_from_stats(np.mean(train_set), np.std(train_set), train_set.shape[0],
                                            np.mean(test_set), np.std(test_set), test_set.shape[0])
        print(result)
def t_test():
    """Compare published pWSBM vs. ModelR edge-weight prediction errors.

    Builds a small table of mean error (``*_m``), its spread (``*_s``) and
    sample size for four network datasets, then adds:
      * ``reduction`` — relative error reduction of ModelR over pWSBM;
      * ``p_value``  — two-sided t-test (equal variances, from summary stats).

    Returns:
        pandas.DataFrame: the table with the two derived columns appended.
    """
    data_sets = ["Airport", "Collaboration", "Congress", "Forum"]
    models = ["pWSBM_m", "pWSBM_s", "ModelR_m", "ModelR_s", "sample_size"]
    errors = pandas.DataFrame(
        [
            [0.0486, 0.0006, 0.0131, 0.001, 25],
            [0.0407, 0.0001, 0.0303, 0.001, 25],
            [0.0571, 0.0004, 0.0369, 0.003, 25],
            [0.0726, 0.0003, 0.0376, 0.001, 25],
        ],
        data_sets,
        models,
    )
    print(errors)
    # Use explicit column labels instead of positional Series indexing
    # (integer fallback indexing on a label-indexed Series is deprecated
    # in pandas and was fragile once columns were appended).
    errors["reduction"] = errors.apply(
        lambda record: (record["pWSBM_m"] - record["ModelR_m"]) / record["pWSBM_m"], axis=1
    )
    errors["p_value"] = errors.apply(
        lambda record: stats.ttest_ind_from_stats(
            record["pWSBM_m"], record["pWSBM_s"], record["sample_size"],
            record["ModelR_m"], record["ModelR_s"], record["sample_size"],
        )[1],
        axis=1,
    )
    return errors
def end_batch(self, batch_meta):
    """Test the finished batch against historical stats and report to Kapacitor.

    Runs Welch's t-test (unequal variances) between the accumulated history
    and the batch that just ended, writes a point carrying the ``t`` statistic
    and ``pvalue`` back through the agent, and folds the batch into the
    history only when it looks normal (pvalue > alpha).

    Args:
        batch_meta: batch metadata (tmax/name/group/tags) from Kapacitor.
    """
    # Defaults for the no-history case: treat the first batch as normal.
    # BUGFIX: previously `t` was only assigned inside the if-branch, so the
    # first batch (history.n == 0) raised NameError when writing fieldsDouble["t"].
    t = 0.0
    pvalue = 1.0
    if self._history.n != 0:
        # Perform Welch's t test
        t, pvalue = stats.ttest_ind_from_stats(
            self._history.mean, self._history.stddev(), self._history.n,
            self._batch.mean, self._batch.stddev(), self._batch.n,
            equal_var=False)

    # Send pvalue point back to Kapacitor
    response = udf_pb2.Response()
    response.point.time = batch_meta.tmax
    response.point.name = batch_meta.name
    response.point.group = batch_meta.group
    response.point.tags.update(batch_meta.tags)
    response.point.fieldsDouble["t"] = t
    response.point.fieldsDouble["pvalue"] = pvalue
    self._agent.write_response(response)

    # Update historical stats with batch, but only if it was normal.
    if pvalue > self._alpha:
        for value in self._batch._window:
            self._history.update(value)
def main():
    """Plot projected seasonal changes (future minus current) of a NEMO
    Great-Lakes field (LSWT by default), masking grid points where the change
    is not significant per a Welch t-test at ``pval_crit``.
    """
    current_start_year = 1981
    current_end_year = 2010
    future_start_year = 2070
    future_end_year = 2099
    LABEL_CURRENT = "Current"
    LABEL_FUTURE = "Future"
    # significance level for masking non-significant changes
    pval_crit = 0.05
    label_to_period = {
        LABEL_CURRENT: (current_start_year, current_end_year),
        LABEL_FUTURE: (future_start_year, future_end_year)
    }
    # one "season" per selected month, keyed by the month's English name
    season_to_months = OrderedDict()
    selected_months = [11, 12, 1]
    for i in selected_months:
        season_to_months[calendar.month_name[i]] = [i, ]
    print(season_to_months)
    nemo_icefrac_vname = "soicecov"
    nemo_sst_vname = "sosstsst"
    # variable to plot (NEMO surface temperature by default)
    vname = nemo_sst_vname
    exp_label = "cc_canesm2_nemo_offline"
    nemo_managers_coupled_cc_slices_canesm2_rcp85 = OrderedDict([
        (LABEL_CURRENT, NemoYearlyFilesManager(folder="/HOME/huziy/skynet3_rech1/CRCM5_outputs/cc_canesm2_rcp85_gl/coupled-GL-current_CanESM2/CRCMNEMO_GL_CanESM2_RCP85", suffix="grid_T.nc")),
        (LABEL_FUTURE, NemoYearlyFilesManager(folder="/HOME/huziy/skynet3_rech1/CRCM5_outputs/cc_canesm2_rcp85_gl/coupled-GL-future_CanESM2/CRCMNEMO_GL_CanESM2_RCP85_future", suffix="grid_T.nc"))
    ])
    # NOTE(review): current and future point to the same folder here — the
    # manager presumably selects years internally; confirm before relying on it.
    nemo_managers_offline_cc_canesm2_rcp85 = OrderedDict([
        (LABEL_CURRENT, NemoYearlyFilesManager(folder="/HOME/huziy/skynet3_rech1/NEMO_OFFICIAL/Simulations/cc_canesm2_nemo_offline_gathered_corrected_from_guillimin", suffix="grid_T.nc")),
        (LABEL_FUTURE, NemoYearlyFilesManager(folder="/HOME/huziy/skynet3_rech1/NEMO_OFFICIAL/Simulations/cc_canesm2_nemo_offline_gathered_corrected_from_guillimin", suffix="grid_T.nc"))
    ])
    # nemo_managers = OrderedDict([
    #     (LABEL_CURRENT, NemoYearlyFilesManager(folder="/BIG1/huziy/CRCM5_NEMO_coupled_sim_nemo_outputs/NEMO", suffix="grid_T.nc")),
    #     (LABEL_FUTURE, NemoYearlyFilesManager(folder="/BIG1/huziy/CRCM5_NEMO_coupled_sim_nemo_outputs/NEMO", suffix="grid_T.nc")),
    # ])
    nemo_managers = nemo_managers_offline_cc_canesm2_rcp85
    # calculate cc for LSWT and ice cover
    # Calculate seasonal mean projected changes
    label_to_data = OrderedDict()
    lons, lats = None, None
    for label, manager in nemo_managers.items():
        assert isinstance(manager, NemoYearlyFilesManager)
        start_year, end_year = label_to_period[label]
        # (mean, std, nobs) per season, ready for ttest_ind_from_stats
        label_to_data[label] = manager.get_seasonal_clim_fields_with_ttest_data(
            start_year=start_year, end_year=end_year,
            season_to_months=season_to_months, varname=vname)
        if lons is None:
            lons, lats = manager.lons, manager.lats
    # ----------- plot the plots
    # plot_utils.apply_plot_params(font_size=10, width_cm=8 * len(season_to_months), height_cm=5)
    map = Basemap(llcrnrlon=-93, llcrnrlat=41, urcrnrlon=-73, urcrnrlat=48.5,
                  projection='lcc', lat_1=33, lat_2=45, lon_0=-90,
                  resolution='i', area_thresh=10000)
    xx, yy = map(lons, lats)
    fig = plt.figure()
    gs = GridSpec(nrows=1, ncols=len(season_to_months), wspace=0.02, hspace=0.02)
    for col, season in enumerate(season_to_months):
        mean_c, std_c, nobs_c = label_to_data[LABEL_CURRENT][season]
        mean_f, std_f, nobs_f = label_to_data[LABEL_FUTURE][season]
        cc = mean_f - mean_c
        # Welch's t-test, elementwise over the grid
        tval, pval = ttest_ind_from_stats(mean_c, std_c, nobs_c, mean_f, std_f, nobs_f, equal_var=False)
        cc = np.ma.masked_where(pval > pval_crit, cc)
        clevs = vname_to_clevs_diff[vname]
        cmap = cm.get_cmap("bwr", len(clevs) - 1)
        bn = BoundaryNorm(clevs, len(clevs) - 1)
        ax = fig.add_subplot(gs[0, col])
        im = map.pcolormesh(xx, yy, cc, cmap=cmap, norm=bn, ax=ax)
        cb = map.colorbar(im, location="bottom")
        # show the colorbar only on the first panel
        cb.ax.set_visible(col == 0)
        map.drawcoastlines(linewidth=0.3)
        ax.set_frame_on(False)
        ax.set_title(season)
        if col == 0:
            ax.set_ylabel("F - C")
    # create the image folder if it does not exist yet
    if not img_folder.exists():
        img_folder.mkdir()
    # NOTE(review): format string has 4 placeholders but 6 arguments — the
    # current/future start years are silently dropped from the filename.
    fname = "{}_{}_{}vs{}.png".format(exp_label, vname, future_start_year, future_end_year,
                                      current_start_year, current_end_year)
    fig.savefig(str(img_folder / fname), bbox_inches="tight", dpi=300)
def main():
    """Plot seasonal climatologies of two CRCM5 runs (Hostetler vs. NEMO
    coupled) and their differences, with a per-gridpoint Welch t-test used to
    mask non-significant differences.  One figure per variable is saved.
    """
    start_year = 1980
    end_year = 2009
    HL_LABEL = "CRCM5_HL"
    NEMO_LABEL = "CRCM5_NEMO"
    # critical p-value for the ttest aka significance level
    # NOTE(review): p_crit = 1 means no difference is ever masked as
    # non-significant — presumably intentional for this figure; confirm.
    p_crit = 1
    vars_of_interest = [
        # T_AIR_2M,
        # TOTAL_PREC,
        # SWE,
        default_varname_mappings.LATENT_HF,
        default_varname_mappings.SENSIBLE_HF,
        default_varname_mappings.LWRAD_DOWN,
        default_varname_mappings.SWRAD_DOWN
        # LAKE_ICE_FRACTION
    ]
    coastline_width = 0.3
    vname_to_seasonmonths_map = {
        SWE: OrderedDict([("November", [11]), ("December", [12]), ("January", [1, ])]),
        LAKE_ICE_FRACTION: OrderedDict([
            ("December", [12]), ("January", [1, ]), ("February", [2, ]), ("March", [3, ]), ("April", [4, ])]),
        T_AIR_2M: season_to_months,
        TOTAL_PREC: season_to_months,
    }
    # set season to months mappings (default to the module-level seasons)
    for vname in vars_of_interest:
        if vname not in vname_to_seasonmonths_map:
            vname_to_seasonmonths_map[vname] = season_to_months
    sim_configs = {
        HL_LABEL: RunConfig(data_path="/RECH2/huziy/coupling/GL_440x260_0.1deg_GL_with_Hostetler/Samples_selected",
                            start_year=start_year, end_year=end_year, label=HL_LABEL),
        NEMO_LABEL: RunConfig(data_path="/RECH2/huziy/coupling/coupled-GL-NEMO1h_30min/selected_fields",
                              start_year=start_year, end_year=end_year, label=NEMO_LABEL),
    }
    sim_labels = [HL_LABEL, NEMO_LABEL]
    vname_to_level = {
        T_AIR_2M: VerticalLevel(1, level_kinds.HYBRID),
        U_WE: VerticalLevel(1, level_kinds.HYBRID),
        V_SN: VerticalLevel(1, level_kinds.HYBRID),
        default_varname_mappings.LATENT_HF: VerticalLevel(5, level_kinds.ARBITRARY),
        default_varname_mappings.SENSIBLE_HF: VerticalLevel(5, level_kinds.ARBITRARY),
    }
    # Try to get the land_fraction for masking if necessary
    land_fraction = None
    try:
        first_ts_file = Path(sim_configs[HL_LABEL].data_path).parent / "pm1979010100_00000000p"
        land_fraction = get_land_fraction(first_timestep_file=first_ts_file)
    except Exception as err:
        # NOTE(review): `raise err` defeats the try/except (the `pass` below
        # is unreachable) — the land-fraction lookup is effectively mandatory.
        raise err
        pass
    # Calculations
    # prepare params for interpolation
    lons_t, lats_t, bsmap = get_target_lons_lats_basemap(sim_configs[HL_LABEL])
    # get a subdomain of the simulation domain
    nx, ny = lons_t.shape
    # NOTE(review): nx // 1.5 and ny / 1.8 are floats — assumes IndexSubspace
    # tolerates non-integer bounds; confirm.
    iss = IndexSubspace(i_start=20, j_start=10, i_end=nx // 1.5, j_end=ny / 1.8)
    # just to change basemap limits
    lons_t, lats_t, bsmap = get_target_lons_lats_basemap(sim_configs[HL_LABEL], sub_space=iss)
    xt, yt, zt = lat_lon.lon_lat_to_cartesian(lons_t.flatten(), lats_t.flatten())
    vname_map = {}
    vname_map.update(default_varname_mappings.vname_map_CRCM5)
    # Read and calculate simulated seasonal means
    mod_label_to_vname_to_season_to_std = {}
    mod_label_to_vname_to_season_to_nobs = {}
    sim_data = defaultdict(dict)
    for label, r_config in sim_configs.items():
        store_config = {
            "base_folder": r_config.data_path,
            "data_source_type": data_source_types.SAMPLES_FOLDER_FROM_CRCM_OUTPUT_VNAME_IN_FNAME,
            "varname_mapping": vname_map,
            "level_mapping": vname_to_level,
            "offset_mapping": default_varname_mappings.vname_to_offset_CRCM5,
            "multiplier_mapping": default_varname_mappings.vname_to_multiplier_CRCM5,
        }
        dm = DataManager(store_config=store_config)
        mod_label_to_vname_to_season_to_std[label] = {}
        mod_label_to_vname_to_season_to_nobs[label] = {}
        interp_indices = None
        for vname in vars_of_interest:
            # --
            end_year_for_current_var = end_year
            # SWE observations stop in 1996, so clamp the model period to match
            if vname == SWE:
                end_year_for_current_var = min(1996, end_year)
            # --
            seas_to_year_to_mean = dm.get_seasonal_means(varname_internal=vname,
                                                         start_year=start_year,
                                                         end_year=end_year_for_current_var,
                                                         season_to_months=vname_to_seasonmonths_map[vname])
            # get the climatology
            seas_to_clim = {seas: np.array(list(y_to_means.values())).mean(axis=0)
                            for seas, y_to_means in seas_to_year_to_mean.items()}
            sim_data[label][vname] = seas_to_clim
            # nearest-neighbour interpolation weights, computed once per sim
            if interp_indices is None:
                _, interp_indices = dm.get_kdtree().query(list(zip(xt, yt, zt)))
            season_to_std = {}
            mod_label_to_vname_to_season_to_std[label][vname] = season_to_std
            season_to_nobs = {}
            mod_label_to_vname_to_season_to_nobs[label][vname] = season_to_nobs
            for season in seas_to_clim:
                interpolated_field = seas_to_clim[season].flatten()[interp_indices].reshape(lons_t.shape)
                seas_to_clim[season] = interpolated_field
                # calculate standard deviations of the interpolated fields
                season_to_std[season] = np.asarray(
                    [field.flatten()[interp_indices].reshape(lons_t.shape)
                     for field in seas_to_year_to_mean[season].values()]).std(axis=0)
                # calculate numobs for the ttest
                season_to_nobs[season] = np.ones_like(lons_t) * len(seas_to_year_to_mean[season])
    # Plotting: interpolate to the same grid and plot obs and biases
    xx, yy = bsmap(lons_t, lats_t)
    lons_t[lons_t > 180] -= 360
    for vname in vars_of_interest:
        field_mask = maskoceans(lons_t, lats_t, np.zeros_like(lons_t), inlands=vname in [SWE]).mask
        field_mask_lakes = maskoceans(lons_t, lats_t, np.zeros_like(lons_t), inlands=True).mask
        plot_utils.apply_plot_params(width_cm=11 * len(vname_to_seasonmonths_map[vname]),
                                     height_cm=20, font_size=8)
        fig = plt.figure()
        # one row per simulation plus one row for the difference
        nrows = len(sim_configs) + 1
        ncols = len(vname_to_seasonmonths_map[vname])
        gs = GridSpec(nrows=nrows, ncols=ncols)
        # plot the fields
        for current_row, sim_label in enumerate(sim_labels):
            for col, season in enumerate(vname_to_seasonmonths_map[vname]):
                field = sim_data[sim_label][vname][season]
                ax = fig.add_subplot(gs[current_row, col])
                if current_row == 0:
                    ax.set_title(season)
                clevs = get_clevs(vname)
                if clevs is not None:
                    bnorm = BoundaryNorm(clevs, len(clevs) - 1)
                    cmap = cm.get_cmap("viridis", len(clevs) - 1)
                else:
                    cmap = "viridis"
                    bnorm = None
                # lakes stay visible for T/precip/SWE; masked otherwise
                the_mask = field_mask_lakes if vname in [T_AIR_2M, TOTAL_PREC, SWE] else field_mask
                to_plot = np.ma.masked_where(the_mask, field) * internal_name_to_multiplier[vname]
                # temporary plot the actual values
                cs = bsmap.contourf(xx, yy, to_plot, ax=ax, levels=get_clevs(vname),
                                    cmap=cmap, norm=bnorm, extend="both")
                bsmap.drawcoastlines(linewidth=coastline_width)
                bsmap.colorbar(cs, ax=ax)
                if col == 0:
                    ax.set_ylabel("{}".format(sim_label))
        # plot differences between the fields
        for col, season in enumerate(vname_to_seasonmonths_map[vname]):
            field = sim_data[NEMO_LABEL][vname][season] - sim_data[HL_LABEL][vname][season]
            ax = fig.add_subplot(gs[-1, col])
            clevs = get_clevs(vname + "biasdiff")
            if clevs is not None:
                bnorm = BoundaryNorm(clevs, len(clevs) - 1)
                cmap = cm.get_cmap("bwr", len(clevs) - 1)
            else:
                cmap = "bwr"
                bnorm = None
            to_plot = field * internal_name_to_multiplier[vname]
            # to_plot = np.ma.masked_where(field_mask, field) * internal_name_to_multiplier[vname]
            # ttest
            a = sim_data[NEMO_LABEL][vname][season]  # Calculate the simulation data back from biases
            std_a = mod_label_to_vname_to_season_to_std[NEMO_LABEL][vname][season]
            nobs_a = mod_label_to_vname_to_season_to_nobs[NEMO_LABEL][vname][season]
            b = sim_data[HL_LABEL][vname][season]  # Calculate the simulation data back from biases
            std_b = mod_label_to_vname_to_season_to_std[HL_LABEL][vname][season]
            nobs_b = mod_label_to_vname_to_season_to_nobs[HL_LABEL][vname][season]
            t, p = ttest_ind_from_stats(mean1=a, std1=std_a, nobs1=nobs_a,
                                        mean2=b, std2=std_b, nobs2=nobs_b, equal_var=False)
            # Mask non-significant differences as given by the ttest
            to_plot = np.ma.masked_where(p > p_crit, to_plot)
            # mask the points with not sufficient land fraction
            if land_fraction is not None and vname in [SWE, ]:
                to_plot = np.ma.masked_where(land_fraction < 0.05, to_plot)
                # print("land fractions for large differences ", land_fraction[to_plot > 30])
            cs = bsmap.contourf(xx, yy, to_plot, ax=ax, extend="both",
                                levels=get_clevs(vname + "biasdiff"), cmap=cmap, norm=bnorm)
            bsmap.drawcoastlines(linewidth=coastline_width)
            bsmap.colorbar(cs, ax=ax)
            if col == 0:
                ax.set_ylabel("{}\n-\n{}".format(NEMO_LABEL, HL_LABEL))
        fig.tight_layout()
        # save a figure per variable
        img_file = "seasonal_differences_noobs_{}_{}_{}-{}.png".format(
            vname, "-".join([s for s in vname_to_seasonmonths_map[vname]]), start_year, end_year)
        img_file = img_folder.joinpath(img_file)
        fig.savefig(str(img_file), dpi=300)
        plt.close(fig)
def main():
    """Validate CRCM5 NEI simulations against Daymet monthly observations.

    For each simulation and variable (TT, PR), computes seasonal climatologies
    for model and (interpolated) observations, masks model-minus-obs biases
    that are not significant at ``pval_crit`` (Welch t-test), computes RMSE
    and spatial correlation summaries, and plots the seasonal deltas.
    """
    img_folder = Path("nei_validation")
    if not img_folder.exists():
        img_folder.mkdir()
    # significance level for masking non-significant biases
    pval_crit = 0.1
    var_names = ["TT", "PR"]
    # var_names = ["PR"]
    seasons = OrderedDict([
        ("DJF", MonthPeriod(12, 3)),
        ("MAM", MonthPeriod(3, 3)),
        ("JJA", MonthPeriod(6, 3)),
        ("SON", MonthPeriod(9, 3)),
    ])
    sim_paths = OrderedDict()
    start_year = 1980
    end_year = 2010
    sim_paths["WC_0.44deg_default"] = Path("/HOME/huziy/skynet3_rech1/CRCM5_outputs/NEI/diags/NEI_WC0.44deg_default/Diagnostics")
    sim_paths["WC_0.44deg_ctem+frsoil+dyngla"] = Path("/HOME/huziy/skynet3_rech1/CRCM5_outputs/NEI/diags/debug_NEI_WC0.44deg_Crr1/Diagnostics")
    sim_paths["WC_0.11deg_ctem+frsoil+dyngla"] = Path("/snow3/huziy/NEI/WC/NEI_WC0.11deg_Crr1/Diagnostics")
    # -- daymet monthly
    daymet_vname_to_path = {
        "prcp": "/HOME/data/Validation/Daymet/Monthly_means/NetCDF/daymet_v3_prcp_monttl_*_na.nc4",
        "tavg": "/HOME/huziy/skynet3_rech1/obs_data/daymet_tavg_monthly/daymet_v3_tavg_monavg_*_na_nc4classic.nc4"
    }
    vname_to_daymet_vname = {
        "PR": "prcp",
        "TT": "tavg"
    }
    plot_utils.apply_plot_params(font_size=14)
    basemap_for_obs = None  # NOTE(review): assigned but never used below
    # plot simulation data
    for sim_label, sim_path in sim_paths.items():
        manager_mod = DiagCrcmManager(data_dir=sim_path)
        for vname in var_names:
            daymet_vname = vname_to_daymet_vname[vname]
            manager_obs = HighResDataManager(path=daymet_vname_to_path[daymet_vname], vname=daymet_vname)
            # (mean, std, nobs) per season for the model...
            seas_to_clim_mod = manager_mod.get_seasonal_means_with_ttest_stats(
                season_to_monthperiod=seasons, start_year=start_year, end_year=end_year,
                vname=vname, vertical_level=var_name_to_level[vname],
                data_file_prefix=var_name_to_file_prefix[vname]
            )
            # ...and for the observations, interpolated to the model grid
            seas_to_clim_obs = manager_obs.get_seasonal_means_with_ttest_stats_interpolated_to(
                manager_mod.lons, manager_mod.lats, season_to_monthperiod=seasons,
                start_year=start_year, end_year=end_year,
                convert_monthly_accumulators_to_daily=(vname == "PR")
            )
            season_to_diff = OrderedDict()
            season_to_summary_stats = OrderedDict()
            for season in seas_to_clim_mod:
                mod_mean, mod_std, mod_n = seas_to_clim_mod[season]
                obs_mean, obs_std, obs_n = seas_to_clim_obs[season]
                if vname == "PR":
                    # Convert model data to mm/day from M/s
                    # NOTE(review): in-place *= mutates arrays that may be
                    # cached inside the manager — confirm this is safe.
                    mod_mean *= 1000 * 3600 * 24
                    mod_std *= 1000 * 3600 * 24
                # Welch's t-test, elementwise over the grid
                tval, pval = ttest_ind_from_stats(mod_mean, mod_std, mod_n,
                                                  obs_mean, obs_std, obs_n, equal_var=False)
                # keep only points where obs are defined (assumes obs_mean is
                # a masked array)
                valid_points = ~(obs_mean.mask | np.isnan(obs_mean))
                mod_1d = mod_mean[valid_points]
                obs_1d = obs_mean[valid_points]
                rms = (((mod_1d - obs_1d) ** 2).sum() / len(mod_1d)) ** 0.5
                spat_corr, p_spat_corr = stats.pearsonr(mod_1d, obs_1d)
                season_to_summary_stats[season] = f"RMSE={rms:.1f}\nr={spat_corr:.2f}\nPVr={p_spat_corr:.2f}"
                season_to_diff[season] = []
                # bias field with non-significant points masked out
                season_to_diff[season].append(np.ma.masked_where(pval >= pval_crit, mod_mean - obs_mean))  # mask not-significant biases
                season_to_diff[season].append(mod_std - obs_std)
                # third entry: sentinel consumed by _plot_seasonal_deltas —
                # NOTE(review): meaning of -1 not visible here; check callee.
                season_to_diff[season].append(-1)
            _plot_seasonal_deltas(
                seas_data=season_to_diff,
                data_label="{}_{}-{}".format(sim_label, start_year, end_year),
                img_dir=img_folder,
                map=manager_mod.get_basemap(resolution="i", area_thresh=area_thresh_km2),
                lons=manager_mod.lons, lats=manager_mod.lats, vname=vname,
                var_name_to_mul={"TT": 1, "PR": 1},
                seas_to_stats=season_to_summary_stats
            )
def main():
    """Plot the CanESM2 JJA precipitation climate change signal (future minus
    current, mm/day) over a Quebec domain, hatching areas where the change is
    significant (p <= 0.05) per a t-test on day-weighted seasonal means.
    """
    # get the data for basemap
    crcm_data_path = "/RESCUE/skynet3_rech1/huziy/hdf_store/quebec_0.1_crcm5-hcd-rl.hdf5"
    bmp_info = analysis.get_basemap_info_from_hdf(file_path=crcm_data_path)
    season_key = "JJA"
    season_to_months = OrderedDict([(season_key, [6, 7, 8])])
    # number of days per calendar month, used to weight monthly means
    month_to_ndays = {m: _month_to_ndays(m) for m in range(1, 13)}
    #
    current_filepath = (
        "/RESCUE/skynet3_rech1/huziy/GCM_outputs/CanESM2/pr_Amon_CanESM2_historical_r1i1p1_185001-200512.nc"
    )
    future_filepath = "/RESCUE/skynet3_rech1/huziy/GCM_outputs/CanESM2/pr_Amon_CanESM2_rcp85_r1i1p1_200601-210012.nc"
    Period = namedtuple("Period", ["start_year", "end_year"])
    current = Period(start_year=1980, end_year=2010)
    future = Period(start_year=2070, end_year=2100)
    ds = xr.open_mfdataset([current_filepath, future_filepath])
    # select the season
    ds = ds.isel(time=ds["time.season"] == season_key)
    # select the data for the current and future periods
    years = ds["time.year"]
    pr_current = ds.isel(time=(years >= current.start_year) & (years <= current.end_year)).pr
    pr_future = ds.isel(time=(years >= future.start_year) & (years <= future.end_year)).pr
    assert isinstance(pr_current, xr.DataArray)
    # day-count weights so the seasonal mean honours unequal month lengths
    weights_current = xr.DataArray(
        [month_to_ndays[m] for m in pr_current["time.month"].values], coords=[pr_current.time]
    )
    # NOTE(review): weights are normalized over the WHOLE period, then summed
    # per year — each yearly sum is mean * (days in that year's season) /
    # (total days); relative year-to-year values are preserved. Confirm this
    # scaling is intended for the t-test inputs.
    weights_current = weights_current / weights_current.sum()
    weights_future = xr.DataArray([month_to_ndays[m] for m in pr_future["time.month"].values],
                                  coords=[pr_future.time])
    weights_future = weights_future / weights_future.sum()
    # seasonal means
    pr_current_smean = (pr_current * weights_current).groupby("time.year").sum(dim="time")
    pr_future_smean = (pr_future * weights_future).groupby("time.year").sum(dim="time")
    # climatology and stds
    pr_current_clim = pr_current_smean.mean(dim="year")
    pr_current_std = pr_current_smean.std(dim="year")
    pr_future_clim = pr_future_smean.mean(dim="year")
    pr_future_std = pr_future_smean.std(dim="year")
    # calculate significance
    n_current = current.end_year - current.start_year + 1
    n_future = future.end_year - future.start_year + 1
    # NOTE(review): defaults to equal_var=True (Student's t) here, unlike the
    # Welch tests used elsewhere in this file — confirm which is intended.
    tval, pval = stats.ttest_ind_from_stats(
        pr_current_clim.values, pr_current_std.values, nobs1=n_current,
        mean2=pr_future_clim.values, std2=pr_future_std.values, nobs2=n_future,
    )
    print(weights_current[:3].values, weights_current[:3].sum())
    print(pr_current_smean.shape)
    print(pr_future.shape)
    print(pr_current.shape)
    print(ds["time.year"][-12:])
    # do the plotting
    plot_utils.apply_plot_params()
    fig = plt.figure()
    b = bmp_info.basemap
    xx, yy = bmp_info.get_proj_xy()
    lons, lats = np.meshgrid(ds.lon, ds.lat)
    xg, yg = b(lons, lats)
    # GCM cells falling inside the regional-model projection box
    dom_mask = (xg >= xx[0, 0]) & (xg <= xx[-1, -1]) & (yg >= yy[0, 0]) & (yg <= yy[-1, -1])
    i_list, j_list = np.where(dom_mask)
    imax, jmax = i_list.max(), j_list.max()
    imin, jmin = i_list.min(), j_list.min()
    # widen the mask by a margin on each side
    marginx, marginy = 10, 10
    imax += marginx
    jmax += marginy
    imin -= marginx
    jmin -= marginy
    # NOTE(review): if imin/jmin go negative the slice wraps around — assumes
    # the domain sits well inside the global grid; confirm.
    dom_mask[imin:imax, jmin:jmax] = True
    print(pr_current_clim.shape)
    print(ds.lon.shape)
    cchange = (pr_future_clim - pr_current_clim) * 24 * 3600  # Convert to mm/day
    cchange = np.ma.masked_where(~dom_mask, cchange)
    # cchange = np.ma.masked_where(pval > 0.1, cchange)
    plt.title("{}, (mm/day)".format(season_key))
    im = b.contourf(xg, yg, cchange)
    cb = b.colorbar(im)
    # hatch significant areas (significance itself shown via the 0-level band)
    sign = np.ma.masked_where(~dom_mask, pval <= 0.05)
    cs = b.contourf(xg, yg, sign, levels=[0, 0.5, 1], hatches=["/", None, None], colors="none")
    b.drawcoastlines()
    # create a legend for the contour set
    artists, labels = cs.legend_elements()
    plt.legend([artists[0]], ["not sign. (pvalue > 0.05)"], handleheight=2)
    img_folder = "cc-paper-comments"
    fig.savefig(
        os.path.join(img_folder, "canesm_cc_{}_precip.png".format(season_key)),
        dpi=common_plot_params.FIG_SAVE_DPI / 2,
        bbox_inches="tight",
    )
    plt.show()
def main(vars_of_interest=None):
    """Validate a CRCM5 Manitoba-Hudson run against gridded observations.

    Computes observed and simulated seasonal climatologies on a common grid,
    plots the obs fields and the model biases (model minus obs) restricted to
    the basins of interest, masking biases deemed non-significant by a Welch
    t-test at ``p_crit``.  Saves one figure per variable.

    Args:
        vars_of_interest: list of internal variable names to process;
            defaults to ``[TOTAL_PREC]``.
    """
    # Validation with CRU (temp, precip) and CMC SWE
    # obs_data_path = Path("/RESCUE/skynet3_rech1/huziy/obs_data_for_HLES/interploated_to_the_same_grid/GL_0.1_452x260/anusplin+_interpolated_tt_pr.nc")
    obs_data_path = Path("/HOME/huziy/skynet3_rech1/obs_data/mh_churchill_nelson_obs_fields")
    CRU_PRECIP = True
    sim_id = "mh_0.44"
    add_shp_files = [
        default_domains.MH_BASINS_PATH,
        constants.upstream_station_boundaries_shp_path[sim_id]
    ]
    start_year = 1981
    end_year = 2009
    MODEL_LABEL = "CRCM5 (0.44)"
    # critical p-value for the ttest aka significance level
    # p_crit = 0.05
    # NOTE(review): p_crit = 1 disables the significance masking below —
    # presumably intentional for this figure; confirm.
    p_crit = 1
    coastlines_width = 0.3
    vars_of_interest_default = [
        # T_AIR_2M,
        TOTAL_PREC,
        # SWE,
        # LAKE_ICE_FRACTION
    ]
    if vars_of_interest is None:
        vars_of_interest = vars_of_interest_default
    vname_to_seasonmonths_map = {
        SWE: OrderedDict([("DJF", [12, 1, 2])]),
        T_AIR_2M: season_to_months,
        TOTAL_PREC: OrderedDict([("Annual", [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])])  # season_to_months,
    }
    sim_configs = {
        MODEL_LABEL: RunConfig(data_path="/RECH2/huziy/BC-MH/bc_mh_044deg/Samples",
                               start_year=start_year, end_year=end_year, label=MODEL_LABEL),
    }
    grid_config = default_domains.bc_mh_044
    sim_labels = [MODEL_LABEL, ]
    vname_to_level = {
        T_AIR_2M: VerticalLevel(1, level_kinds.HYBRID),
        U_WE: VerticalLevel(1, level_kinds.HYBRID),
        V_SN: VerticalLevel(1, level_kinds.HYBRID),
        SWE: VerticalLevel(-1, level_kinds.ARBITRARY)
    }
    # internal name -> name in the observation files
    vname_map = {
        default_varname_mappings.TOTAL_PREC: "pre",
        default_varname_mappings.T_AIR_2M: "tmp",
        default_varname_mappings.SWE: "SWE"
    }
    filename_prefix_mapping = {
        default_varname_mappings.SWE: "pm",
        default_varname_mappings.TOTAL_PREC: "pm",
        default_varname_mappings.T_AIR_2M: "dm"
    }
    # Try to get the land_fraction for masking if necessary
    land_fraction = None
    try:
        land_fraction = get_land_fraction(sim_configs[MODEL_LABEL])
    except Exception:
        # best-effort: land fraction is optional for these variables
        pass
    # Calculations
    # prepare params for interpolation
    lons_t, lats_t, bsmap = get_target_lons_lats_basemap(sim_configs[MODEL_LABEL])
    bsmap, reg_of_interest_mask = grid_config.get_basemap_using_shape_with_polygons_of_interest(
        lons=lons_t, lats=lats_t, shp_path=default_domains.MH_BASINS_PATH,
        mask_margin=2, resolution="i")
    xt, yt, zt = lat_lon.lon_lat_to_cartesian(lons_t.flatten(), lats_t.flatten())
    obs_multipliers = default_varname_mappings.vname_to_multiplier_CRCM5.copy()
    # Read and calculate observed seasonal means
    store_config = {
        "base_folder": obs_data_path.parent if not obs_data_path.is_dir() else obs_data_path,
        "data_source_type": data_source_types.ALL_VARS_IN_A_FOLDER_IN_NETCDF_FILES_OPEN_EACH_FILE_SEPARATELY,
        "varname_mapping": vname_map,
        "level_mapping": vname_to_level,
        "offset_mapping": default_varname_mappings.vname_to_offset_CRCM5,
        "multiplier_mapping": obs_multipliers,
    }
    obs_dm = DataManager(store_config=store_config)
    obs_data = {}
    # need to save it for ttesting
    obs_vname_to_season_to_std = {}
    obs_vname_to_season_to_nobs = {}
    interp_indices = None
    for vname in vars_of_interest:
        # --
        end_year_for_current_var = end_year
        # SWE observations stop in 1996, so clamp the period for SWE
        if vname == SWE:
            end_year_for_current_var = min(1996, end_year)
        # --
        seas_to_year_to_mean = obs_dm.get_seasonal_means(varname_internal=vname,
                                                         start_year=start_year,
                                                         end_year=end_year_for_current_var,
                                                         season_to_months=vname_to_seasonmonths_map[vname])
        seas_to_clim = {seas: np.array(list(y_to_means.values())).mean(axis=0)
                        for seas, y_to_means in seas_to_year_to_mean.items()}
        # convert precip from mm/month (CRU) to mm/day
        if vname in [TOTAL_PREC] and CRU_PRECIP:
            for seas in seas_to_clim:
                seas_to_clim[seas] *= 1. / (365.25 / 12)
                seas_to_clim[seas] = np.ma.masked_where(np.isnan(seas_to_clim[seas]), seas_to_clim[seas])
                print("{}: min={}, max={}".format(seas, seas_to_clim[seas].min(), seas_to_clim[seas].max()))
        obs_data[vname] = seas_to_clim
        # nearest-neighbour mapping from obs grid to model grid, computed once
        if interp_indices is None:
            _, interp_indices = obs_dm.get_kdtree().query(list(zip(xt, yt, zt)))
        # need for ttests
        season_to_std = {}
        obs_vname_to_season_to_std[vname] = season_to_std
        season_to_nobs = {}
        obs_vname_to_season_to_nobs[vname] = season_to_nobs
        for season in seas_to_clim:
            seas_to_clim[season] = seas_to_clim[season].flatten()[interp_indices].reshape(lons_t.shape)
            # save the yearly means for ttesting
            season_to_std[season] = np.asarray(
                [field.flatten()[interp_indices].reshape(lons_t.shape)
                 for field in seas_to_year_to_mean[season].values()]).std(axis=0)
            season_to_nobs[season] = np.ones_like(lons_t) * len(seas_to_year_to_mean[season])
    plt.show()
    # Read and calculate simulated seasonal mean biases
    mod_label_to_vname_to_season_to_std = {}
    mod_label_to_vname_to_season_to_nobs = {}
    model_data_multipliers = defaultdict(lambda: 1)
    # model precip is in m/s; convert to mm/day
    model_data_multipliers[TOTAL_PREC] = 1000 * 24 * 3600
    sim_data = defaultdict(dict)
    for label, r_config in sim_configs.items():
        store_config = {
            "base_folder": r_config.data_path,
            "data_source_type": data_source_types.SAMPLES_FOLDER_FROM_CRCM_OUTPUT,
            "varname_mapping": default_varname_mappings.vname_map_CRCM5,
            "level_mapping": vname_to_level,
            "offset_mapping": default_varname_mappings.vname_to_offset_CRCM5,
            "multiplier_mapping": model_data_multipliers,
            "filename_prefix_mapping": filename_prefix_mapping
        }
        dm = DataManager(store_config=store_config)
        mod_label_to_vname_to_season_to_std[label] = {}
        mod_label_to_vname_to_season_to_nobs[label] = {}
        interp_indices = None
        for vname in vars_of_interest:
            # --
            end_year_for_current_var = end_year
            if vname == SWE:
                end_year_for_current_var = min(1996, end_year)
            # --
            seas_to_year_to_mean = dm.get_seasonal_means(varname_internal=vname,
                                                         start_year=start_year,
                                                         end_year=end_year_for_current_var,
                                                         season_to_months=vname_to_seasonmonths_map[vname])
            # get the climatology
            seas_to_clim = {seas: np.array(list(y_to_means.values())).mean(axis=0)
                            for seas, y_to_means in seas_to_year_to_mean.items()}
            sim_data[label][vname] = seas_to_clim
            if interp_indices is None:
                _, interp_indices = dm.get_kdtree().query(list(zip(xt, yt, zt)))
            season_to_std = {}
            mod_label_to_vname_to_season_to_std[label][vname] = season_to_std
            season_to_nobs = {}
            mod_label_to_vname_to_season_to_nobs[label][vname] = season_to_nobs
            for season in seas_to_clim:
                interpolated_field = seas_to_clim[season].flatten()[interp_indices].reshape(lons_t.shape)
                # store the BIAS (model minus obs), not the raw climatology
                seas_to_clim[season] = interpolated_field - obs_data[vname][season]
                # calculate standard deviations of the interpolated fields
                season_to_std[season] = np.asarray(
                    [field.flatten()[interp_indices].reshape(lons_t.shape)
                     for field in seas_to_year_to_mean[season].values()]).std(axis=0)
                # calculate numobs for the ttest
                season_to_nobs[season] = np.ones_like(lons_t) * len(seas_to_year_to_mean[season])
    xx, yy = bsmap(lons_t, lats_t)
    lons_t[lons_t > 180] -= 360
    field_mask = maskoceans(lons_t, lats_t, np.zeros_like(lons_t)).mask
    for vname in vars_of_interest:
        # NOTE(review): field_mask is overwritten (not restored) for non-SWE
        # variables — fine while SWE is processed first or absent; confirm if
        # the variable order changes.
        if vname not in [SWE]:
            field_mask = np.zeros_like(field_mask, dtype=bool)
        # Plotting: interpolate to the same grid and plot obs and biases
        plot_utils.apply_plot_params(width_cm=32 / 4 * (len(vname_to_seasonmonths_map[vname])),
                                     height_cm=25 / 3.0 * (len(sim_configs) + 1),
                                     font_size=8 * len(vname_to_seasonmonths_map[vname]))
        fig = plt.figure()
        # fig.suptitle(internal_name_to_title[vname] + "\n")
        nrows = len(sim_configs) + 2
        ncols = len(vname_to_seasonmonths_map[vname])
        gs = GridSpec(nrows=nrows, ncols=ncols)
        # Plot the obs fields
        current_row = 0
        for col, season in enumerate(vname_to_seasonmonths_map[vname]):
            field = obs_data[vname][season]
            ax = fig.add_subplot(gs[current_row, col])
            ax.set_title(season)
            to_plot = np.ma.masked_where(field_mask, field) * internal_name_to_multiplier[vname]
            clevs = get_clevs(vname)
            to_plot = np.ma.masked_where(~reg_of_interest_mask, to_plot)
            if clevs is not None:
                bnorm = BoundaryNorm(clevs, len(clevs) - 1)
                cmap = cm.get_cmap("Blues", len(clevs) - 1)
            else:
                cmap = "jet"
                bnorm = None
            bsmap.drawmapboundary(fill_color="0.75")
            # cs = bsmap.contourf(xx, yy, to_plot, ax=ax, levels=get_clevs(vname), norm=bnorm, cmap=cmap)
            cs = bsmap.pcolormesh(xx, yy, to_plot, ax=ax, norm=bnorm, cmap=internal_name_to_cmap[vname])
            bsmap.drawcoastlines(linewidth=coastlines_width)
            # bsmap.drawstates(linewidth=0.1)
            # bsmap.drawcountries(linewidth=0.2)
            bsmap.colorbar(cs, ax=ax)
            # overlay the MH basins outline on the obs panel
            i = 0
            bsmap.readshapefile(str(add_shp_files[i])[:-4], "field_{}".format(i), linewidth=0.5, color="m")
            if col == 0:
                ax.set_ylabel("Obs")
        # plot the biases
        for sim_label in sim_labels:
            current_row += 1
            for col, season in enumerate(vname_to_seasonmonths_map[vname]):
                field = sim_data[sim_label][vname][season]
                ax = fig.add_subplot(gs[current_row, col])
                clevs = get_clevs(vname + "bias")
                if clevs is not None:
                    bnorm = BoundaryNorm(clevs, len(clevs) - 1)
                    cmap = cm.get_cmap("bwr", len(clevs) - 1)
                else:
                    cmap = "bwr"
                    bnorm = None
                to_plot = np.ma.masked_where(field_mask, field) * internal_name_to_multiplier[vname]
                # ttest
                a = sim_data[sim_label][vname][season] + obs_data[vname][season]  # Calculate the simulation data back from biases
                std_a = mod_label_to_vname_to_season_to_std[sim_label][vname][season]
                nobs_a = mod_label_to_vname_to_season_to_nobs[sim_label][vname][season]
                b = obs_data[vname][season]
                std_b = obs_vname_to_season_to_std[vname][season]
                nobs_b = obs_vname_to_season_to_nobs[vname][season]
                t, p = ttest_ind_from_stats(mean1=a, std1=std_a, nobs1=nobs_a,
                                            mean2=b, std2=std_b, nobs2=nobs_b, equal_var=False)
                # Mask non-significant differences as given by the ttest
                to_plot = np.ma.masked_where(p > p_crit, to_plot)
                # only focus on the basins of interest
                to_plot = np.ma.masked_where(~reg_of_interest_mask, to_plot)
                # cs = bsmap.contourf(xx, yy, to_plot, ax=ax, extend="both", levels=get_clevs(vname + "bias"), cmap=cmap, norm=bnorm)
                bsmap.drawmapboundary(fill_color="0.75")
                cs = bsmap.pcolormesh(xx, yy, to_plot, ax=ax, cmap=cmap, norm=bnorm)
                bsmap.drawcoastlines(linewidth=coastlines_width)
                bsmap.colorbar(cs, ax=ax, extend="both")
                # overlay the remaining shapefiles (station boundaries)
                for i, shp in enumerate(add_shp_files[1:], start=1):
                    bsmap.readshapefile(str(shp)[:-4], "field_{}".format(i), linewidth=0.5, color="k")
                if col == 0:
                    ax.set_ylabel("{}\n-\nObs.".format(sim_label))
        fig.tight_layout()
        # save a figure per variable
        img_file = "seasonal_biases_{}_{}_{}-{}.png".format(
            vname, "-".join([s for s in vname_to_seasonmonths_map[vname]]), start_year, end_year)
        if not img_folder.exists():
            img_folder.mkdir(parents=True)
        img_file = img_folder / img_file
        fig.savefig(str(img_file), bbox_inches="tight", dpi=300)
        plt.close(fig)
def main():
    """Plot seasonal climatologies of observations, model biases and bias differences.

    For every variable in ``vars_of_interest`` this:
      1. reads observed seasonal means (ANUSPLIN-based file), interpolates them
         to the simulation grid and keeps per-season std / nobs for t-testing;
      2. reads each simulation's seasonal means and stores (sim - obs) biases
         plus their std / nobs;
      3. plots, per season: the obs field, the first simulation's bias, and the
         NEMO-HL bias difference, masking grid points where a Welch t-test
         (``ttest_ind_from_stats``, equal_var=False) gives p > ``p_crit``;
      4. saves one PNG per variable into ``img_folder`` (module-level).

    NOTE(review): relies on module-level names (RunConfig, DataManager,
    get_target_lons_lats_basemap, variable-name constants, img_folder, ...)
    defined elsewhere in this file/package.
    """
    obs_data_path = Path("/RESCUE/skynet3_rech1/huziy/obs_data_for_HLES/interploated_to_the_same_grid/GL_0.1_452x260/anusplin+_interpolated_tt_pr.nc")

    start_year = 1980
    end_year = 2010

    HL_LABEL = "CRCM5_HL"
    NEMO_LABEL = "CRCM5_NEMO"

    # critical p-value for the ttest aka significance level
    p_crit = 0.1

    vars_of_interest = [
        # T_AIR_2M,
        # TOTAL_PREC,
        # SWE,
        LAKE_ICE_FRACTION
    ]

    coastline_width = 0.3

    vname_to_seasonmonths_map = {
        SWE: OrderedDict([("November", [11]), ("December", [12]), ("January", [1, ])]),
        LAKE_ICE_FRACTION: OrderedDict([("February", [2, ]), ("March", [3, ]), ]),
        T_AIR_2M: season_to_months,
        TOTAL_PREC: OrderedDict([("Winter", [12, 1, 2]), ("Summer", [6, 7, 8]), ])
    }

    sim_configs = {
        HL_LABEL: RunConfig(data_path="/RECH2/huziy/coupling/GL_440x260_0.1deg_GL_with_Hostetler/Samples_selected",
                            start_year=start_year, end_year=end_year, label=HL_LABEL),
        NEMO_LABEL: RunConfig(data_path="/RECH2/huziy/coupling/coupled-GL-NEMO1h_30min/selected_fields",
                              start_year=start_year, end_year=end_year, label=NEMO_LABEL),
    }

    sim_labels = [HL_LABEL, NEMO_LABEL]

    vname_to_level = {
        T_AIR_2M: VerticalLevel(1, level_kinds.HYBRID),
        U_WE: VerticalLevel(1, level_kinds.HYBRID),
        V_SN: VerticalLevel(1, level_kinds.HYBRID),
    }

    # Try to get the land_fraction for masking if necessary
    land_fraction = None
    try:
        first_ts_file = Path(sim_configs[HL_LABEL].data_path).parent / "pm1979010100_00000000p"
        land_fraction = get_land_fraction(first_timestep_file=first_ts_file)
    except Exception:
        # NOTE(review): the original caught the exception only to re-raise it
        # (with a dead `pass` after). Kept propagating to preserve behavior,
        # using a bare `raise` to retain the original traceback.
        raise

    # Calculations

    # prepare params for interpolation
    lons_t, lats_t, bsmap = get_target_lons_lats_basemap(sim_configs[HL_LABEL])

    # get a subdomain of the simulation domain
    nx, ny = lons_t.shape
    # FIX: `j_end` used true division (ny / 2) yielding a float grid index,
    # inconsistent with `i_end=nx // 2`; use integer division for both.
    iss = IndexSubspace(i_start=20, j_start=20, i_end=nx // 2, j_end=ny // 2)

    # just to change basemap limits
    lons_t, lats_t, bsmap = get_target_lons_lats_basemap(sim_configs[HL_LABEL], sub_space=iss,
                                                         resolution="i", area_thresh=2000)

    xt, yt, zt = lat_lon.lon_lat_to_cartesian(lons_t.flatten(), lats_t.flatten())

    vname_map = {}
    vname_map.update(default_varname_mappings.vname_map_CRCM5)

    # Read and calculate observed seasonal means
    store_config = {
        "base_folder": obs_data_path.parent,
        "data_source_type": data_source_types.ALL_VARS_IN_A_FOLDER_IN_NETCDF_FILES_OPEN_EACH_FILE_SEPARATELY,
        "varname_mapping": vname_map,
        "level_mapping": vname_to_level,
        "offset_mapping": default_varname_mappings.vname_to_offset_CRCM5,
        "multiplier_mapping": default_varname_mappings.vname_to_multiplier_CRCM5,
    }

    obs_dm = DataManager(store_config=store_config)

    obs_data = {}
    # need to save it for ttesting
    obs_vname_to_season_to_std = {}
    obs_vname_to_season_to_nobs = {}

    interp_indices = None
    for vname in vars_of_interest:
        # --
        end_year_for_current_var = end_year
        if vname == SWE:
            # observed SWE is only available up to 1996 -- TODO confirm
            end_year_for_current_var = min(1996, end_year)

        # --
        seas_to_year_to_mean = obs_dm.get_seasonal_means(varname_internal=vname,
                                                         start_year=start_year,
                                                         end_year=end_year_for_current_var,
                                                         season_to_months=vname_to_seasonmonths_map[vname])

        # climatology = mean over the yearly seasonal means
        seas_to_clim = {seas: np.array(list(y_to_means.values())).mean(axis=0)
                        for seas, y_to_means in seas_to_year_to_mean.items()}

        obs_data[vname] = seas_to_clim

        # kdtree query is grid-dependent only, do it once per data manager
        if interp_indices is None:
            _, interp_indices = obs_dm.get_kdtree().query(list(zip(xt, yt, zt)))

        # need for ttests
        season_to_std = {}
        obs_vname_to_season_to_std[vname] = season_to_std

        season_to_nobs = {}
        obs_vname_to_season_to_nobs[vname] = season_to_nobs

        for season in seas_to_clim:
            # nearest-neighbour interpolation to the target grid
            seas_to_clim[season] = seas_to_clim[season].flatten()[interp_indices].reshape(lons_t.shape)

            # save the yearly means for ttesting
            season_to_std[season] = np.asarray(
                [field.flatten()[interp_indices].reshape(lons_t.shape)
                 for field in seas_to_year_to_mean[season].values()]).std(axis=0)

            season_to_nobs[season] = np.ones_like(lons_t) * len(seas_to_year_to_mean[season])

    # Read and calculate simulated seasonal mean biases
    mod_label_to_vname_to_season_to_std = {}
    mod_label_to_vname_to_season_to_nobs = {}

    sim_data = defaultdict(dict)
    for label, r_config in sim_configs.items():
        store_config = {
            "base_folder": r_config.data_path,
            "data_source_type": data_source_types.SAMPLES_FOLDER_FROM_CRCM_OUTPUT_VNAME_IN_FNAME,
            "varname_mapping": vname_map,
            "level_mapping": vname_to_level,
            "offset_mapping": default_varname_mappings.vname_to_offset_CRCM5,
            "multiplier_mapping": default_varname_mappings.vname_to_multiplier_CRCM5,
        }

        dm = DataManager(store_config=store_config)

        mod_label_to_vname_to_season_to_std[label] = {}
        mod_label_to_vname_to_season_to_nobs[label] = {}

        interp_indices = None
        for vname in vars_of_interest:
            # --
            end_year_for_current_var = end_year
            if vname == SWE:
                end_year_for_current_var = min(1996, end_year)

            # --
            seas_to_year_to_mean = dm.get_seasonal_means(varname_internal=vname,
                                                         start_year=start_year,
                                                         end_year=end_year_for_current_var,
                                                         season_to_months=vname_to_seasonmonths_map[vname])

            # get the climatology
            seas_to_clim = {seas: np.array(list(y_to_means.values())).mean(axis=0)
                            for seas, y_to_means in seas_to_year_to_mean.items()}

            sim_data[label][vname] = seas_to_clim

            if interp_indices is None:
                _, interp_indices = dm.get_kdtree().query(list(zip(xt, yt, zt)))

            season_to_std = {}
            mod_label_to_vname_to_season_to_std[label][vname] = season_to_std

            season_to_nobs = {}
            mod_label_to_vname_to_season_to_nobs[label][vname] = season_to_nobs

            for season in seas_to_clim:
                interpolated_field = seas_to_clim[season].flatten()[interp_indices].reshape(lons_t.shape)
                # store the bias (model - obs), not the raw model climatology
                seas_to_clim[season] = interpolated_field - obs_data[vname][season]

                # calculate standard deviations of the interpolated fields
                season_to_std[season] = np.asarray(
                    [field.flatten()[interp_indices].reshape(lons_t.shape)
                     for field in seas_to_year_to_mean[season].values()]).std(axis=0)

                # calculate numobs for the ttest
                season_to_nobs[season] = np.ones_like(lons_t) * len(seas_to_year_to_mean[season])

    # Plotting: interpolate to the same grid and plot obs and biases
    xx, yy = bsmap(lons_t, lats_t)
    lons_t[lons_t > 180] -= 360

    draw_only_first_sim_biases = True

    for vname in vars_of_interest:
        # ocean mask (optionally including inland lakes for SWE)
        field_mask = maskoceans(lons_t, lats_t, np.zeros_like(lons_t), inlands=vname in [SWE]).mask
        field_mask_lakes = maskoceans(lons_t, lats_t, np.zeros_like(lons_t), inlands=True).mask

        nrows = len(sim_configs) + 2 - 1 * int(draw_only_first_sim_biases)
        ncols = len(vname_to_seasonmonths_map[vname])

        plot_utils.apply_plot_params(width_cm=8 * len(vname_to_seasonmonths_map[vname]),
                                     height_cm=4.5 * nrows, font_size=8)
        fig = plt.figure()
        gs = GridSpec(nrows=nrows, ncols=ncols, hspace=0.2, wspace=0.02)

        extend = "both" if vname not in [LAKE_ICE_FRACTION] else "neither"

        # Plot the obs fields
        current_row = 0
        for col, season in enumerate(vname_to_seasonmonths_map[vname]):
            field = obs_data[vname][season]
            ax = fig.add_subplot(gs[current_row, col])
            # ax.set_title(season)

            the_mask = field_mask_lakes if vname in [T_AIR_2M, TOTAL_PREC, SWE] else field_mask
            to_plot = np.ma.masked_where(the_mask, field) * internal_name_to_multiplier[vname]

            clevs = get_clevs(vname)
            if clevs is not None:
                bnorm = BoundaryNorm(clevs, len(clevs) - 1)
                cmap = cm.get_cmap("viridis", len(clevs) - 1)
            else:
                cmap = "viridis"
                bnorm = None

            cs = bsmap.contourf(xx, yy, to_plot, ax=ax, levels=clevs, norm=bnorm, cmap=cmap)
            bsmap.drawcoastlines(linewidth=coastline_width)
            cb = bsmap.colorbar(cs, ax=ax, location="bottom")

            ax.set_frame_on(vname not in [LAKE_ICE_FRACTION, ])
            # show the colorbar only once per row
            cb.ax.set_visible(col == 0)
            if col == 0:
                ax.set_ylabel("Obs")

        # plot the biases
        for sim_label in sim_labels:
            current_row += 1
            for col, season in enumerate(vname_to_seasonmonths_map[vname]):
                field = sim_data[sim_label][vname][season]
                ax = fig.add_subplot(gs[current_row, col])

                clevs = get_clevs(vname + "bias")
                if clevs is not None:
                    bnorm = BoundaryNorm(clevs, len(clevs) - 1)
                    cmap = cm.get_cmap("bwr", len(clevs) - 1)
                else:
                    cmap = "bwr"
                    bnorm = None

                the_mask = field_mask_lakes if vname in [T_AIR_2M, TOTAL_PREC, SWE] else field_mask
                to_plot = np.ma.masked_where(the_mask, field) * internal_name_to_multiplier[vname]

                # ttest
                a = sim_data[sim_label][vname][season] + obs_data[vname][season]  # Calculate the simulation data back from biases
                std_a = mod_label_to_vname_to_season_to_std[sim_label][vname][season]
                nobs_a = mod_label_to_vname_to_season_to_nobs[sim_label][vname][season]

                b = obs_data[vname][season]
                std_b = obs_vname_to_season_to_std[vname][season]
                nobs_b = obs_vname_to_season_to_nobs[vname][season]

                t, p = ttest_ind_from_stats(mean1=a, std1=std_a, nobs1=nobs_a,
                                            mean2=b, std2=std_b, nobs2=nobs_b, equal_var=False)

                # Mask non-significant differences as given by the ttest
                to_plot = np.ma.masked_where(p > p_crit, to_plot)

                # temporary plot the actual values
                cs = bsmap.contourf(xx, yy, to_plot, ax=ax, extend=extend,
                                    levels=get_clevs(vname + "bias"), cmap=cmap, norm=bnorm)
                bsmap.drawcoastlines(linewidth=coastline_width)
                cb = bsmap.colorbar(cs, ax=ax, location="bottom")

                ax.set_frame_on(vname not in [LAKE_ICE_FRACTION, ])
                cb.ax.set_visible(False)
                if col == 0:
                    ax.set_ylabel("{}\n-\nObs.".format(sim_label))

            # draw biases only for the first simulation
            if draw_only_first_sim_biases:
                break

        # plot differences between the biases
        current_row += 1
        for col, season in enumerate(vname_to_seasonmonths_map[vname]):
            field = sim_data[NEMO_LABEL][vname][season] - sim_data[HL_LABEL][vname][season]
            ax = fig.add_subplot(gs[current_row, col])

            clevs = get_clevs(vname + "bias")
            if clevs is not None:
                bnorm = BoundaryNorm(clevs, len(clevs) - 1)
                cmap = cm.get_cmap("bwr", len(clevs) - 1)
            else:
                cmap = "bwr"
                bnorm = None

            to_plot = field * internal_name_to_multiplier[vname]
            # to_plot = np.ma.masked_where(field_mask, field) * internal_name_to_multiplier[vname]

            # ttest
            a = sim_data[NEMO_LABEL][vname][season] + obs_data[vname][season]  # Calculate the simulation data back from biases
            std_a = mod_label_to_vname_to_season_to_std[NEMO_LABEL][vname][season]
            nobs_a = mod_label_to_vname_to_season_to_nobs[NEMO_LABEL][vname][season]

            b = sim_data[HL_LABEL][vname][season] + obs_data[vname][season]  # Calculate the simulation data back from biases
            std_b = mod_label_to_vname_to_season_to_std[HL_LABEL][vname][season]
            nobs_b = mod_label_to_vname_to_season_to_nobs[HL_LABEL][vname][season]

            t, p = ttest_ind_from_stats(mean1=a, std1=std_a, nobs1=nobs_a,
                                        mean2=b, std2=std_b, nobs2=nobs_b, equal_var=False)

            # Mask non-significant differences as given by the ttest
            to_plot = np.ma.masked_where(p > p_crit, to_plot)

            # mask the points with not sufficient land fraction
            if land_fraction is not None and vname in [SWE, ]:
                to_plot = np.ma.masked_where(land_fraction < 0.1, to_plot)
                # print("land fractions for large differences ", land_fraction[to_plot > 30])

            cs = bsmap.contourf(xx, yy, to_plot, ax=ax, extend=extend, levels=clevs, cmap=cmap, norm=bnorm)
            bsmap.drawcoastlines(linewidth=coastline_width)
            cb = bsmap.colorbar(cs, ax=ax, location="bottom")

            ax.text(0.99, 1.1, season, va="top", ha="right", fontsize=16, transform=ax.transAxes)

            cb.ax.set_visible(col == 0)
            assert isinstance(ax, Axes)
            ax.set_frame_on(False)
            if col == 0:
                ax.set_ylabel("{}\n-\n{}".format(NEMO_LABEL, HL_LABEL))

        # fig.tight_layout()
        # save a figure per variable
        img_file = "seasonal_biases_{}_{}_{}-{}.png".format(vname,
                                                            "-".join(vname_to_seasonmonths_map[vname]),
                                                            start_year, end_year)
        img_file = img_folder.joinpath(img_file)

        fig.savefig(str(img_file), dpi=300, bbox_inches="tight")
        plt.close(fig)
from scipy.stats import ttest_ind_from_stats

# Build mean / std tables: rows are the three word classes, columns the four
# eigen-neighbor categories. `fishmeans`/`fishstds` etc. are defined earlier
# in the file (not visible here).
df = pd.DataFrame([fishmeans, birdmeans, birddropmeans])
dfstd = pd.DataFrame([fishstds, birdstds, birddropstds])
df.columns = ['TT', 'TF', 'FT', 'FF']
df.index = ['fish', 'bird', '!bird']
dfstd.columns = ['TT', 'TF', 'FT', 'FF']
dfstd.index = ['fish', 'bird', '!bird']

# FIX: Python 2 `print` statements were syntax errors under Python 3,
# which the rest of this file targets (pathlib, etc.).
print('12 samples')
print('eigen-neighbors')

# Pairwise Welch-style t-tests between every pair of columns for each row,
# assuming 12 samples per cell (mean/std summary statistics only).
ttest = []
for i in range(len(df)):
    for j, col in enumerate(df.columns):
        for k in df.columns:
            # index [1] of ttest_ind_from_stats is the p-value
            p = ttest_ind_from_stats(df[col][i], dfstd[col][i], 12,
                                     df[k][i], dfstd[k][i], 12)[1]
            ttest.append([df.index[i], col, k, p])
            # print('{0}, {1},{2},{3}'.format(*ttest[-1]))

ttest = pd.DataFrame(ttest)
# NOTE(review): 'eigen-neigbor' typo kept -- it is a runtime column name.
ttest.columns = ['word', 'eigen-neigbor 1', 'eigen-neigbor 2', 'p']
# FIX: DataFrame.sort() was removed in pandas 0.20; sort_values is the
# replacement. As in the original, the sorted copy is not assigned
# (presumably displayed interactively in a notebook -- TODO confirm).
ttest.sort_values('p')
# ttest_ind_from_stats(df['TT']['fish'], dfstd['TT']['fish'], 12, df['TF']['fish'], dfstd['TF']['fish'], 12)


def print_def(self, metric, rank=0, col='def'):
    ''' would need to pass in a dataframe. self.conx deprecated '''
    #
    #print a