def get_significance_and_changes_for_months(months = range(1, 13),
                                            folder_path = "data/streamflows/hydrosheds_euler9"):
    """
    Test whether the future means differ significantly from the current ones.

    For every member id in ``members.all_current`` the means returned by
    ``data_select.get_means_over_months_for_each_year`` are collected for the
    member's own file and for the file of its paired future member
    (``members.current2future``). The values of all members are pooled into
    one "current" and one "future" sample, which are compared with
    ``scipy.stats.ttest_ind`` along axis 0.

    :param months: months passed on to
        ``data_select.get_means_over_months_for_each_year``
    :param folder_path: folder holding the data files; a file is assigned to
        a member when the file name starts with the member id
    :return: tuple ``(is_sign, change)``: ``is_sign`` is a boolean vector
        (size n_grid_cells) that is True where the p-value is below 0.05,
        ``change`` is the change of the future mean with respect to the
        current mean, in percent
    :raises KeyError: if no file in ``folder_path`` starts with a required
        member id
    """
    # The folder listing does not depend on the member, so read it only once.
    file_names = os.listdir(folder_path)

    id_to_file_path = {}
    for the_id in members.all_members:
        for file_name in file_names:
            if file_name.startswith(the_id):
                # deliberately no break: if several files match, the last one wins
                id_to_file_path[the_id] = os.path.join(folder_path, file_name)

    current_means = []
    future_means = []
    for the_id in members.all_current:
        current_file = id_to_file_path[the_id]
        streamflows, times, _, _ = data_select.get_data_from_file(current_file)
        current_mean_dict = data_select.get_means_over_months_for_each_year(
            times, streamflows, months=months)
        current_means.extend(current_mean_dict.values())

        future_file = id_to_file_path[members.current2future[the_id]]
        streamflows, times, _, _ = data_select.get_data_from_file(future_file)
        future_mean_dict = data_select.get_means_over_months_for_each_year(
            times, streamflows, months=months)
        future_means.extend(future_mean_dict.values())

    current_means = np.array(current_means)
    future_means = np.array(future_means)
    print(future_means.shape)

    # only the p-value is needed, the t statistic is discarded
    _, p = stats.ttest_ind(current_means, future_means, axis=0)
    is_sign = p < 0.05  # significance to the 5% confidence level

    # from here on the names hold the sample means along axis 0
    current_means = np.mean(current_means, axis=0)
    future_means = np.mean(future_means, axis=0)
    print(future_means.shape)

    print("number of significant points =  %d" % int(np.sum(is_sign)))
    return is_sign, (future_means - current_means) / current_means * 100.0
def do_bootstrap_for_simulation_mean(sim_id = "aet",
                                     folder_path = "data/streamflows/hydrosheds_euler9",
                                     months = range(1, 13), n_samples = 1000):
    """
    Estimate, by bootstrap, the standard deviation of the mean for one simulation.

    The rows of the data matrix (one row per entry of the dictionary returned
    by ``data_select.get_means_over_months_for_each_year``, ordered by key)
    are resampled with replacement ``n_samples`` times; the mean over the
    resampled rows is computed each time and the standard deviation of those
    means is taken along the sample axis.

    The result is cached on disk (pickle) under the path given by
    ``_get_cache_file_path``; when that file exists it is loaded and returned
    without any computation.

    :param sim_id: simulation id; the first file in ``folder_path`` whose name
        starts with it is used
    :param folder_path: folder holding the data files
    :param months: months passed on to
        ``data_select.get_means_over_months_for_each_year``
    :param n_samples: number of bootstrap samples
    :return: ``MeansAndDeviation`` object holding the means of the original
        data (``means_for_domain``) and the bootstrap standard deviations
        (``standard_devs_for_domain``)
    """
    cache_file = _get_cache_file_path(sim_id=sim_id, months=months)
    if os.path.isfile(cache_file):
        # NOTE: unpickling can execute arbitrary code, the cache file must be trusted.
        # Binary mode + "with": pickles must not go through newline translation
        # and the handle is closed deterministically.
        with open(cache_file, "rb") as cache:
            return pickle.load(cache)

    # determine path to the file with data (first match wins)
    filePath = None
    for f in os.listdir(folder_path):
        if f.startswith(sim_id):
            filePath = os.path.join(folder_path, f)
            break
    # NOTE(review): filePath is still None here when no file matches sim_id;
    # it is passed on unchanged -- confirm how get_data_from_file handles None.

    streamflow, times, _, _ = data_select.get_data_from_file(filePath)

    # for each year and for each gridcell get mean value for the period
    means_dict = data_select.get_means_over_months_for_each_year(times, streamflow, months=months)
    data_matrix = np.array([means_dict[key] for key in sorted(means_dict)])
    print("data_matrix.shape =  %s" % (data_matrix.shape,))

    # Row indices for all bootstrap samples. randint draws every row with the
    # same probability; the former round(rand * (n_rows - 1)) picked the first
    # and the last row only half as often as the others.
    n_rows = data_matrix.shape[0]
    index_matrix = np.random.randint(0, n_rows, size=(n_samples, n_rows))

    means_matrix = np.zeros((n_samples, streamflow.shape[1]))  # n_samples x n_points
    for i in range(n_samples):
        means_matrix[i, :] = np.mean(data_matrix[index_matrix[i, :], :], axis=0)

    m_holder = MeansAndDeviation(sim_id=sim_id,
                                 means_for_domain=np.mean(data_matrix, axis=0),
                                 standard_devs_for_domain=np.std(means_matrix, axis=0))
    with open(cache_file, "wb") as cache:
        pickle.dump(m_holder, cache)
    return m_holder
def get_std_and_mean_using_bootstrap_for_merged_means(sim_ids = None,
                                                      folder_path = "data/streamflows/hydrosheds_euler9",
                                                      months = range(1, 13), n_samples = 1000):
    """
    Bootstrap the standard deviation of the mean merged over several simulations.

    For every selected file the rows of its data matrix (one row per entry of
    the dictionary returned by
    ``data_select.get_means_over_months_for_each_year``, ordered by key) are
    resampled with replacement ``n_samples`` times and averaged. The resampled
    means are then averaged over the files, and the standard deviation of
    these merged means is taken along the sample axis.

    The result is cached on disk (pickle) under the path given by
    ``_get_cache_file_path``; when that file exists it is loaded and returned
    without any computation.

    :param sim_ids: container of simulation ids; a file is selected when the
        part of its name before the first "_" is in ``sim_ids``
    :param folder_path: folder holding the data files
    :param months: months passed on to
        ``data_select.get_means_over_months_for_each_year``
    :param n_samples: number of bootstrap samples
    :return: tuple ``(std_result, mean_result)``: the bootstrap standard
        deviations and the mean of the original data over files and rows
    :raises ValueError: if no file in ``folder_path`` matches ``sim_ids``
    """
    cache_file = _get_cache_file_path(months=months, sim_ids=sim_ids)
    if os.path.isfile(cache_file):
        # NOTE: unpickling can execute arbitrary code, the cache file must be trusted.
        with open(cache_file, "rb") as cache:
            # the cache stores a list; return a tuple like the non-cached path does
            return tuple(pickle.load(cache))

    # determine paths to the files with data
    filePaths = [os.path.join(folder_path, f) for f in os.listdir(folder_path)
                 if f.split("_")[0] in sim_ids]
    if not filePaths:
        # without this check the code below fails with an obscure IndexError
        raise ValueError("no files for simulations %s found in %s" % (sim_ids, folder_path))

    real_means = []          # original (not resampled) data of every file
    members_boot_means = []  # n_files x n_samples x n_points
    for file_path in filePaths:
        streamflow, times, _, _ = data_select.get_data_from_file(file_path)

        # for each year and for each gridcell get mean value for the period
        means_dict = data_select.get_means_over_months_for_each_year(times, streamflow, months=months)
        data_matrix = np.array([means_dict[key] for key in sorted(means_dict)])
        real_means.append(data_matrix)  # kept to calculate the mean of the merged data

        n_rows = data_matrix.shape[0]
        boot_means = []
        for _ in range(n_samples):
            # draw n_rows row indices with replacement and average those rows
            index_vector = np.random.randint(0, n_rows, n_rows)
            boot_means.append(np.mean(data_matrix[index_vector, :], axis=0))
        members_boot_means.append(boot_means)

    # take average over members
    members_boot_means = np.array(members_boot_means)
    print(members_boot_means.shape)
    boot_means = members_boot_means.mean(axis=0)  # nsamples x npoints

    # debug output for two fixed points; skipped when the domain is too small
    # for them (they used to raise IndexError in that case)
    for column in (499, 19):
        if column < boot_means.shape[1]:
            print(boot_means[:, column])

    assert boot_means.shape[0] == n_samples, boot_means.shape
    print("boot_means.shape =  %s" % (boot_means.shape,))

    std_result = np.std(boot_means, axis=0)
    # mean over files first, then over the rows of the data matrix
    mean_result = np.array(real_means).mean(axis=0).mean(axis=0)

    with open(cache_file, "wb") as cache:
        pickle.dump([std_result, mean_result], cache)
    return std_result, mean_result