def plot_diff_between_files(file1, file2, i_array, j_array):
    data1 = data_select.get_data_from_file(file1)
    data2 = data_select.get_data_from_file(file2)
    the_diff = np.mean(data2 - data1, axis = 0)
    plot_data(the_diff, i_array, j_array, name = 'the_diff',
              title = 'AEX, difference between \n %s \n and \n %s' % (file2, file1))
    pass
def get_significance_and_changes_for_months(months = range(1, 13),
                                            folder_path = "data/streamflows/hydrosheds_euler9"):
    """
    Returns a boolean vector of size n_grid_cells, where True means a significant
    change and False a non-significant one, together with the change in percent
    with respect to the current climate mean.
    """
    current_means = []
    future_means = []

    id_to_file_path = {}
    for the_id in members.all_members:
        for file_name in os.listdir(folder_path):
            if file_name.startswith(the_id):
                id_to_file_path[the_id] = os.path.join(folder_path, file_name)

    for the_id in members.all_current:
        current_file = id_to_file_path[the_id]
        streamflows, times, i_indices, j_indices = data_select.get_data_from_file(current_file)
        current_mean_dict = data_select.get_means_over_months_for_each_year(times, streamflows, months = months)
        current_means.extend(current_mean_dict.values())

        future_file = id_to_file_path[members.current2future[the_id]]
        streamflows, times, i_indices, j_indices = data_select.get_data_from_file(future_file)
        future_mean_dict = data_select.get_means_over_months_for_each_year(times, streamflows, months = months)
        future_means.extend(future_mean_dict.values())

    current_means = np.array(current_means)
    future_means = np.array(future_means)
    print future_means.shape

    t, p = stats.ttest_ind(current_means, future_means, axis = 0)
    is_sign = p < 0.05  # significant at the 5% level

    current_means = np.mean(current_means, axis = 0)
    future_means = np.mean(future_means, axis = 0)
    print future_means.shape
    print "number of significant points = ", sum(map(int, is_sign))
    return is_sign, (future_means - current_means) / current_means * 100.0
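# A minimal usage sketch (not part of the original analysis): the spring months below are
# an example choice, and the masking step simply illustrates one way the significance mask
# and the percent change could be combined before plotting.
def _example_significant_spring_changes():
    is_sign, change_percent = get_significance_and_changes_for_months(months = [3, 4, 5])
    # hide the cells where the change is not statistically significant
    change_percent = np.ma.masked_where(np.logical_not(is_sign), change_percent)
    print "significant cells: %d of %d" % (int(np.sum(is_sign)), len(is_sign))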
def compare_simulations(path1, path2, label1 = '1', label2 = '2', field_name = 'water_discharge'):
    data1, time1, i_indices, j_indices = data_select.get_data_from_file(path1, field_name)
    data2, time2, i_indices, j_indices = data_select.get_data_from_file(path2, field_name)

    the_mins1 = np.min(data1, axis = 0)
    the_mins2 = np.min(data2, axis = 0)
    the_maxs1 = np.max(data1, axis = 0)
    the_maxs2 = np.max(data2, axis = 0)
    the_means1 = np.mean(data1, axis = 0)
    the_means2 = np.mean(data2, axis = 0)

    # scatter plot for means
    plt.subplots_adjust(hspace = 0.5)
    plt.subplot(2, 2, 1)
    plt.title('means', override)  # `override` is assumed to be a module-level dict of title font properties
    plt.scatter(the_means1, the_means2, linewidth = 0)
    plt.xlabel(label1)
    plt.ylabel(label2)
    x = plt.xlim()
    plt.plot(x, x, color = 'k')
    plt.grid(True)

    # scatter plot for minimums
    plt.subplot(2, 2, 2)
    plt.title('minimums', override)
    plt.scatter(the_mins1, the_mins2, linewidth = 0)
    plt.xlabel(label1)
    plt.ylabel(label2)
    x = plt.xlim()
    plt.plot(x, x, color = 'k')
    plt.grid(True)

    # scatter plot for maximums
    plt.subplot(2, 2, 3)
    plt.title('maximums', override)
    plt.scatter(the_maxs1, the_maxs2, linewidth = 0)
    plt.xlabel(label1)
    plt.ylabel(label2)
    x = plt.xlim()
    plt.plot(x, x, color = 'k')
    plt.grid(True)

    plt.savefig('{0}_{1}_scatter.png'.format(label1, label2), bbox_inches = 'tight')
    pass
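# Hedged usage sketch: the two NetCDF paths below are placeholders (not files known to
# exist in this repository); the call only illustrates the expected arguments.
def _example_compare_two_runs():
    compare_simulations(
        "data/streamflows/run_a/aex_discharge_1970_01_01_00_00.nc",  # hypothetical path
        "data/streamflows/run_b/aex_discharge_1970_01_01_00_00.nc",  # hypothetical path
        label1 = "run_a", label2 = "run_b", field_name = "water_discharge")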
def compare_and_plot(path_to_folder = 'data/streamflows/hydrosheds_euler9'):
    """
    Calculates interannual variability (standard deviations) of the annual means for each
    pair of members and plots their ratios:
    create annual mean matrices -> calculate standard deviations for future and current
    climate -> plot the ratios of the variations for the members and the standard
    deviation of the control run.
    """
    member_to_path = get_member_to_path_mapping(path_to_folder)

    plot_utils.apply_plot_params(aspect_ratio = 1.5)
    plt.figure()
    plot_marks = ['a', 'b', 'c', 'd', 'e']
    subplot_count = 1
    for current_id, plot_mark in zip(members.current_ids, plot_marks):
        future_id = members.current2future[current_id]
        path_c = member_to_path[current_id]
        path_f = member_to_path[future_id]

        stfl_c, times_c, i_indices, j_indices = data_select.get_data_from_file(path_c)
        stfl_f, times_f, i_indices, j_indices = data_select.get_data_from_file(path_f)

        means_c = calculate_annual_means(times_c, stfl_c)
        means_f = calculate_annual_means(times_f, stfl_f)

        std_c = np.std(means_c, axis = 0)
        std_f = np.std(means_f, axis = 0)

        f_values = std_f / std_c
        plt.subplot(3, 2, subplot_count)
        plot_subplot(i_indices, j_indices, f_values, mark = plot_mark)
        subplot_count += 1

    # plot variability for the control simulation
    plt.subplot(3, 2, subplot_count)
    stfl_c, times_c, i_indices, j_indices = data_select.get_data_from_file(path_c)
    means_c = calculate_annual_means(times_c, stfl_c)
    std_c = np.std(means_c, axis = 0)
    plot_subplot(i_indices, j_indices, std_c, mark = 'f')

    super_title = 'a-e: Changes in interannual variability ($\\sigma_{\\rm future}/ \\sigma_{\\rm current}$). \n'
    super_title += 'f: Interannual variability of the control simulation'
    plt.suptitle(super_title)
    plt.show()
def plot_cv_for_seasonal_mean(folder_path = "data/streamflows/hydrosheds_euler9",
                              member_ids = None,
                              file_name_pattern = "%s_discharge_2041_01_01_00_00.nc",
                              months = range(1, 13),
                              out_file_name = "cv_for_annual_mean.png",
                              max_value = None):
    """
    Calculate and plot the coefficient of variation (cv = sigma / mu) of the seasonal
    (or annual) mean values across the ensemble members.
    """
    plt.figure()
    times = None
    i_indices = None
    j_indices = None
    x_min, x_max = None, None
    y_min, y_max = None, None

    seasonal_means = []
    for i, the_id in enumerate(member_ids):
        fName = file_name_pattern % the_id
        fPath = os.path.join(folder_path, fName)

        if not i:
            # read the coordinates and times only once, from the first member
            data, times, i_indices, j_indices = data_select.get_data_from_file(fPath)
            interest_x = x[i_indices, j_indices]
            interest_y = y[i_indices, j_indices]
            x_min, x_max, y_min, y_max = _get_limits(interest_x = interest_x, interest_y = interest_y)
        else:
            data = data_select.get_field_from_file(path = fPath)

        assert data is not None, "i = %d " % i

        if len(months) == 12:
            the_seasonal_mean = np.mean(data, axis = 0)
        else:
            bool_vector = map(lambda t: t.month in months, times)
            indices = np.where(bool_vector)
            the_seasonal_mean = np.mean(data[indices[0], :], axis = 0)

        seasonal_means.append(the_seasonal_mean)

    seasonal_means = np.array(seasonal_means)
    mu = np.mean(seasonal_means, axis = 0)
    sigma = np.std(seasonal_means, axis = 0)
    cv = sigma / mu

    cMap = mpl.cm.get_cmap(name = "jet_r", lut = 30)
    cMap.set_over(color = "0.5")

    to_plot = np.ma.masked_all(x.shape)
    for the_index, i, j in zip(xrange(len(i_indices)), i_indices, j_indices):
        to_plot[i, j] = cv[the_index]

    basemap.pcolormesh(x, y, to_plot.copy(), cmap = cMap, vmin = 0, vmax = max_value)
    basemap.drawcoastlines(linewidth = 0.5)
    plt.colorbar(ticks = LinearLocator(numticks = 11), format = "%.1e")
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.savefig(out_file_name, bbox_inches = "tight")
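# Hedged usage sketch: the season, colour-scale cap and output file name are example
# values only; the member ids are taken from the `members` module used throughout this package.
def _example_plot_spring_cv():
    plot_cv_for_seasonal_mean(member_ids = members.all_current,
                              months = [3, 4, 5],               # MAM season
                              out_file_name = "cv_for_mam_mean.png",
                              max_value = 0.2)                  # arbitrary colour-scale cap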
def __init__(self, path = "", spinup_years = None):
    self.spinup_years = spinup_years
    dataCollection = data_select.get_data_from_file(path = path)
    self.data = dataCollection[0]
    self.times = dataCollection[1]
    self.x_indices = dataCollection[2]
    self.y_indices = dataCollection[3]
    self.label = "%d years" % spinup_years
    self._select_first_year_data()
def get_dispersion_between_members(files):
    datas = []
    for path in files:
        data = data_select.get_data_from_file(path)
        datas.append(data)

    nt, ncell = datas[0].shape
    nmembers = len(datas)
    all_data = np.zeros((nmembers, nt, ncell))
    for i, the_data in enumerate(datas):
        all_data[i, :, :] = the_data[:, :]

    # standard deviation across members, averaged over time -> one value per grid cell
    return np.mean(np.std(all_data, axis = 0), axis = 0)
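# Toy illustration on synthetic data (an assumption, not model output) of what
# get_dispersion_between_members computes: the ensemble standard deviation at each
# time step and cell, averaged over time, giving one dispersion value per cell.
def _example_dispersion_on_synthetic_data():
    nmembers, nt, ncell = 5, 365, 10
    all_data = np.random.rand(nmembers, nt, ncell)
    dispersion = np.mean(np.std(all_data, axis = 0), axis = 0)  # shape: (ncell,)
    print dispersion.shape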
def plot_mean_hydrograph_with_gw_ouflow(data_path = 'data/streamflows/hydrosheds_euler10_spinup100yrs'):
    # TODO: implement
    basins = infocell.get_basins_with_cells_connected_using_hydrosheds_data()
    basinName = 'RDO'
    theBasin = None
    for basin in basins:
        # @type basin Basin
        if basin.name == basinName:
            theBasin = basin
            break

    data = data_select.get_data_from_file(path = data_path, field_name = 'gw_outflow')
    gw_outflow = data[0]
    times = data[1]
    x_index = data[2]
    y_index = data[3]

    data = data_select.get_data_from_file(path = data_path, field_name = 'surface_runoff')
    surface_runoff = data[0]

    for t in times:
        pass
def do_bootstrap_for_simulation_mean(sim_id = "aet",
                                     folder_path = "data/streamflows/hydrosheds_euler9",
                                     months = range(1, 13),
                                     n_samples = 1000):
    """
    Returns an object containing the means for the domain and the standard deviations
    of those means estimated by bootstrap.
    """
    cache_file = _get_cache_file_path(sim_id = sim_id, months = months)
    if os.path.isfile(cache_file):
        return pickle.load(open(cache_file))

    # determine the path to the file with the data
    filePath = None
    for f in os.listdir(folder_path):
        if f.startswith(sim_id):
            filePath = os.path.join(folder_path, f)
            break

    streamflow, times, i_indices, j_indices = data_select.get_data_from_file(filePath)

    # for each year and each grid cell, get the mean value over the selected months
    means_dict = data_select.get_means_over_months_for_each_year(times, streamflow, months = months)
    means_sorted_in_time = map(lambda x: x[1], sorted(means_dict.items(), key = lambda x: x[0]))
    data_matrix = np.array(means_sorted_in_time)
    print "data_matrix.shape = ", data_matrix.shape

    # generate bootstrap indices (resample years with replacement)
    index_matrix = np.random.rand(n_samples, data_matrix.shape[0])
    index_matrix *= (data_matrix.shape[0] - 1)
    index_matrix = index_matrix.round().astype(int)

    means_matrix = np.zeros((n_samples, streamflow.shape[1]))  # n_samples x n_points
    for i in xrange(n_samples):
        means_matrix[i, :] = np.mean(data_matrix[index_matrix[i, :], :], axis = 0)

    m_holder = MeansAndDeviation(sim_id = sim_id,
                                 means_for_domain = np.mean(data_matrix, axis = 0),
                                 standard_devs_for_domain = np.std(means_matrix, axis = 0))

    pickle.dump(m_holder, open(cache_file, mode = "w"))
    return m_holder
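# Hedged usage sketch: the sim id and the winter months are example arguments, and the
# attribute names below assume that MeansAndDeviation stores its constructor keywords
# (sim_id, means_for_domain, standard_devs_for_domain) as attributes of the same name.
def _example_bootstrap_for_winter_mean():
    holder = do_bootstrap_for_simulation_mean(sim_id = "aex", months = [12, 1, 2])
    print holder.sim_id
    print "domain means (first 5 cells): ", holder.means_for_domain[:5]
    print "bootstrap stds (first 5 cells): ", holder.standard_devs_for_domain[:5]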
def plot_annual_extremes(data_path = 'data/streamflows/VplusFmask_newton/aex_discharge_1970_01_01_00_00.nc',
                         start_date = datetime(1970, 1, 1, 0, 0, 0),
                         end_date = datetime(2000, 1, 1, 0, 0, 0)):
    streamflows, times, i_array, j_array = data_select.get_data_from_file(data_path)

    # 1-day low flows over the whole year
    period_start_month = 1
    period_end_month = 12
    the_minima = data_select.get_minimums_for_domain(streamflows, times,
                                                     start_date = start_date, end_date = end_date,
                                                     start_month = period_start_month,
                                                     end_month = period_end_month,
                                                     duration_days = 1)
    plot_data_2d.plot_data(the_minima, i_array, j_array, name = "minima", title = "min",
                           digits = 1, color_map = mpl.cm.get_cmap("OrRd", 10),
                           minmax = (None, None), units = "m**3/s")

    # 7-day high flows over April-June
    period_start_month = 4
    period_end_month = 6
    the_maximums = data_select.get_maximums_for_domain(streamflows, times,
                                                       start_date = start_date, end_date = end_date,
                                                       start_month = period_start_month,
                                                       end_month = period_end_month,
                                                       duration_days = 7)
    plot_data_2d.plot_data(the_maximums, i_array, j_array, name = "maxima", title = "max",
                           digits = 1, color_map = mpl.cm.get_cmap("OrRd", 10),
                           minmax = (None, None), units = "m**3/s")
    pass
def get_meanof_means_and_stds_from_files(files):
    """
    Returns, for each grid cell, the temporal mean and the temporal standard deviation
    averaged over the given files.
    """
    mean = None
    stdevs = None
    if not len(files):
        return

    for path in files:
        data = data_select.get_data_from_file(path)
        if mean is None:
            mean = np.zeros(data.shape[1])
            stdevs = np.zeros(data.shape[1])
        mean += np.mean(data, axis = 0)
        stdevs += np.std(data, axis = 0)

    mean /= float(len(files))
    stdevs /= float(len(files))
    print 'max deviation: ', np.max(stdevs)
    assert mean.shape[0] == data.shape[1]
    return mean, stdevs
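# Hedged usage sketch: the file list below is illustrative (built from the member ids and
# the folder/pattern used elsewhere in this module); only the shapes of the results are shown.
def _example_mean_of_means():
    files = ["data/streamflows/hydrosheds_euler9/%s_discharge_1970_01_01_00_00.nc" % the_id
             for the_id in members.all_current]
    mean, stdevs = get_meanof_means_and_stds_from_files(files)
    print mean.shape, stdevs.shape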
def plot_ratio(path = 'data/streamflows/hydrosheds_euler10_spinup100yrs/aex_discharge_1970_01_01_00_00.nc',
               min_lon = None, max_lon = None,
               min_lat = None, max_lat = None):
    res = data_select.get_data_from_file(path = path, field_name = 'surface_runoff')
    surf_runoff, times, x_indices, y_indices = res

    total_runoff = data_select.get_field_from_file(path, field_name = 'subsurface_runoff')
    total_runoff = surf_runoff + total_runoff

    if min_lon is not None:
        lons = data_select.get_field_from_file(path, field_name = 'longitude')
        lats = data_select.get_field_from_file(path, field_name = 'latitude')
        lons_1d = lons[x_indices, y_indices]
        lats_1d = lats[x_indices, y_indices]

        # keep only the cells inside the lon/lat window (combine both conditions)
        condition = (lons_1d >= min_lon) & (lons_1d <= max_lon)
        condition &= (lats_1d >= min_lat) & (lats_1d <= max_lat)

        surf_runoff = surf_runoff[:, condition]
        total_runoff = total_runoff[:, condition]

    mean_surf_runoff = np.mean(surf_runoff, axis = 1)   # mean in space
    mean_total_runoff = np.mean(total_runoff, axis = 1)

    stamp_dates = map(lambda d: swe.toStampYear(d, stamp_year = 2000), times)
    t1, v1 = get_mean_for_day_of_year(stamp_dates, mean_surf_runoff)
    plt.plot(t1, v1, label = 'surface runoff', linewidth = 3)

    t2, v2 = get_mean_for_day_of_year(stamp_dates, mean_total_runoff)
    plt.plot(t2, v2, label = 'total runoff', linewidth = 3)

    plt.legend()
    plt.gca().xaxis.set_major_formatter(mpl.dates.DateFormatter('%b'))
    plt.show()
def compare_means(member_id = 'aex', data_folder1 = '', label1 = '', data_folder2 = '', label2 = ''):
    basin_path = 'data/infocell/amno180x172_basins.nc'
    basin_indices = lowflow.read_basin_indices(basin_path)

    # find the data files for the member in both folders
    for f in os.listdir(data_folder1):
        if f.lower().startswith(member_id):
            path1 = os.path.join(data_folder1, f)
    for f in os.listdir(data_folder2):
        if f.lower().startswith(member_id):
            path2 = os.path.join(data_folder2, f)

    discharge_1, times1, i_list, j_list = data_select.get_data_from_file(path1, 'water_discharge')
    discharge_2, times2, i_list, j_list = data_select.get_data_from_file(path2, 'water_discharge')

    # daily climatology (one value per day of a stamp year) for each grid cell
    discharge_values_1 = []
    discharge_values_2 = []
    for pos in range(discharge_1.shape[1]):
        dates, discharge_tmp = pe.average_for_each_day_of_year(times1, discharge_1[:, pos], year = 2000)
        discharge_values_1.append(np.array(discharge_tmp))

        dates, discharge_tmp = pe.average_for_each_day_of_year(times2, discharge_2[:, pos], year = 2000)
        discharge_values_2.append(np.array(discharge_tmp))

    # sum the hydrographs of the cells belonging to each basin
    basin_to_discharge_1 = {}
    basin_to_discharge_2 = {}
    the_zip = zip(i_list, j_list, discharge_values_1, discharge_values_2)
    for basin in basin_indices:
        for i, j, d_1, d_2 in the_zip:
            if basin.mask[i, j] == 1:
                if basin_to_discharge_1.has_key(basin):
                    basin_to_discharge_1[basin] += d_1
                    basin_to_discharge_2[basin] += d_2
                else:
                    basin_to_discharge_1[basin] = d_1
                    basin_to_discharge_2[basin] = d_2

    # convert basin sums to basin means
    for basin in basin_to_discharge_1.keys():
        n = float(basin.get_number_of_cells())
        basin_to_discharge_1[basin] /= n
        basin_to_discharge_2[basin] /= n

    plt.figure()
    n = 1
    for basin, d in basin_to_discharge_1.iteritems():
        plt.subplot(7, 3, n)
        plt.title(basin.name)
        discharge_line_1, = plt.plot(dates, d, linewidth = 2, color = 'b')
        discharge_line_2, = plt.plot(dates, basin_to_discharge_2[basin], linewidth = 2, color = 'r')
        #runoff_line = plt.plot(dates, basin_to_runoff[basin])
        ax = plt.gca()
        ax.xaxis.set_major_locator(mpl.dates.MonthLocator(bymonth = range(2, 13, 2)))
        ax.xaxis.set_major_formatter(mpl.dates.DateFormatter('%b'))
        n += 1

    plt.figlegend([discharge_line_1, discharge_line_2], [label1, label2], 'upper right')
    plt.savefig('{0}_hydrographs.png'.format(member_id), bbox_inches = 'tight')
    pass
def plot_basin_mean_hydrograph(current_id = 'aex', future_id = None,
                               data_folder = 'data/streamflows/hydrosheds_euler7',
                               current_start_date = None, current_end_date = None,
                               future_start_date = None, future_end_date = None):
    basin_path = 'data/infocell/amno180x172_basins.nc'
    basin_indices = lowflow.read_basin_indices(basin_path)

    # find the data files for the current and future members
    for f in os.listdir(data_folder):
        if f.lower().startswith(current_id):
            path_current = os.path.join(data_folder, f)
        if f.lower().startswith(future_id):
            path_future = os.path.join(data_folder, f)

    discharge_current, times_current, i_list, j_list = data_select.get_data_from_file(path_current, 'water_discharge')
    discharge_future, times_future, i_list, j_list = data_select.get_data_from_file(path_future, 'water_discharge')

    # daily climatology for each grid cell, current and future periods
    discharge_values_current = []
    discharge_values_future = []
    for pos in range(discharge_current.shape[1]):
        dates, discharge1 = pe.average_for_each_day_of_year(times_current, discharge_current[:, pos],
                                                            start_date = current_start_date,
                                                            end_date = current_end_date, year = 2000)
        discharge_values_current.append(np.array(discharge1))

        dates, discharge1 = pe.average_for_each_day_of_year(times_future, discharge_future[:, pos],
                                                            start_date = future_start_date,
                                                            end_date = future_end_date, year = 2000)
        discharge_values_future.append(np.array(discharge1))

    # sum the hydrographs of the cells belonging to each basin
    basin_to_discharge_current = {}
    basin_to_discharge_future = {}
    the_zip = zip(i_list, j_list, discharge_values_current, discharge_values_future)
    for basin in basin_indices:
        for i, j, d_current, d_future in the_zip:
            if basin.mask[i, j] == 1:
                if basin_to_discharge_current.has_key(basin):
                    basin_to_discharge_current[basin] += d_current
                    basin_to_discharge_future[basin] += d_future
                else:
                    basin_to_discharge_current[basin] = d_current
                    basin_to_discharge_future[basin] = d_future

    # convert basin sums to basin means
    for basin in basin_to_discharge_current.keys():
        n = float(basin.get_number_of_cells())
        basin_to_discharge_current[basin] /= n
        basin_to_discharge_future[basin] /= n

    plt.figure()
    n = 1
    plt.subplots_adjust(hspace = 0.5)
    for basin, d in basin_to_discharge_current.iteritems():
        plt.subplot(7, 3, n)
        plt.title(basin.name)
        discharge_line_current, = plt.plot(dates, d, linewidth = 2, color = 'b')
        discharge_line_future, = plt.plot(dates, basin_to_discharge_future[basin], linewidth = 2, color = 'r')
        plt.ylabel('${\\rm m^3/s}$')
        #runoff_line = plt.plot(dates, basin_to_runoff[basin])
        ax = plt.gca()
        ax.xaxis.set_major_locator(mpl.dates.MonthLocator(bymonth = range(2, 13, 2)))
        ax.xaxis.set_major_formatter(mpl.dates.DateFormatter('%b'))
        n += 1

    plt.figlegend([discharge_line_current, discharge_line_future], ['current', 'future'], 'upper right')
    plt.savefig('{0}_{1}_hydrographs.png'.format(current_id, future_id), bbox_inches = 'tight')
    pass
def plot_seasonal_mean_streamflows(folder_path = "data/streamflows/hydrosheds_euler9",
                                   member_ids = None,
                                   file_name_pattern = "%s_discharge_1970_01_01_00_00.nc",
                                   months = range(1, 13),
                                   out_file_name = "annual_means.png"):
    print months
    if member_ids is None:
        return

    i_indices = None
    j_indices = None
    times = None
    x_min, x_max = None, None
    y_min, y_max = None, None

    the_seasonal_mean_list = []
    for i, the_id in enumerate(member_ids):
        fName = file_name_pattern % the_id
        fPath = os.path.join(folder_path, fName)
        print fPath

        data, times, i_indices, j_indices = data_select.get_data_from_file(fPath)
        if not i:
            interest_x = x[i_indices, j_indices]
            interest_y = y[i_indices, j_indices]
            x_min, x_max, y_min, y_max = _get_limits(interest_x = interest_x, interest_y = interest_y)

        assert data is not None, "i = %d " % i

        if len(months) == 12:
            the_seasonal_mean = np.mean(data, axis = 0)
        else:
            bool_vector = map(lambda t: t.month in months, times)  # take only the months of interest
            indices = np.where(bool_vector)
            print indices[0].shape
            print len(indices)
            the_seasonal_mean = np.mean(data[indices[0], :], axis = 0)
            print data.shape
            print "data = ", data[indices[0], :].shape
            print "mean = ", the_seasonal_mean.shape
            print sum(map(int, bool_vector))

        the_seasonal_mean_list.append(the_seasonal_mean)

    print np.array(the_seasonal_mean_list).shape

    plot_utils.apply_plot_params(aspect_ratio = 0.8)
    plt.figure()
    plt.subplots_adjust(hspace = 0.1, wspace = 0.3)

    max_value = np.array(the_seasonal_mean_list).max()
    cMap = mpl.cm.get_cmap(name = "jet_r", lut = 18)
    for k, a_seasonal_mean in enumerate(the_seasonal_mean_list):
        plt.subplot(2, len(member_ids) // 2 + 1, k + 1)
        to_plot = np.ma.masked_all(x.shape)
        for the_index, i, j in zip(xrange(len(i_indices)), i_indices, j_indices):
            to_plot[i, j] = a_seasonal_mean[the_index]

        basemap.pcolormesh(x, y, to_plot.copy(), cmap = cMap, vmin = 0, vmax = max_value)
        basemap.drawcoastlines(linewidth = 0.5)
        plt.colorbar(ticks = LinearLocator(numticks = 7), format = "%d")
        plt.xlim(x_min, x_max)
        plt.ylim(y_min, y_max)

    #plt.savefig(str(k+1)+"_"+out_file_name)
    plt.savefig(out_file_name)
    pass
def plot_mean_extreme_flow(folder_path = "data/streamflows/hydrosheds_euler9",
                           member_ids = None,
                           file_name_pattern = "%s_discharge_1970_01_01_00_00.nc",
                           out_file_name = "annual_means.png",
                           high = True,
                           start_month = 1, end_month = 12):
    """
    Plot the mean extreme (1-day high or 15-day low) flow over time.
    """
    if member_ids is None:
        return

    i_indices = None
    j_indices = None
    times = None
    x_min, x_max = None, None
    y_min, y_max = None, None

    the_extreme_list = []
    for i, the_id in enumerate(member_ids):
        fName = file_name_pattern % the_id
        fPath = os.path.join(folder_path, fName)

        if not i:
            # read the coordinates and times only once, from the first member
            data, times, i_indices, j_indices = data_select.get_data_from_file(fPath)
            interest_x = x[i_indices, j_indices]
            interest_y = y[i_indices, j_indices]
            x_min, x_max, y_min, y_max = _get_limits(interest_x = interest_x, interest_y = interest_y)
        else:
            data = data_select.get_field_from_file(path = fPath)

        assert data is not None, "i = %d " % i

        if high:
            extremes = data_select.get_list_of_annual_maximums_for_domain(data, times,
                                                                          start_month = start_month,
                                                                          end_month = end_month)
        else:
            extremes = data_select.get_list_of_annual_minimums_for_domain(data, times,
                                                                          event_duration = timedelta(days = 15),
                                                                          start_month = start_month,
                                                                          end_month = end_month)

        the_extreme_list.append(np.mean(extremes, axis = 0))

    print "shape of extremes list ", np.array(the_extreme_list).shape

    plot_utils.apply_plot_params(aspect_ratio = 0.8)
    plt.figure()
    plt.subplots_adjust(hspace = 0.1, wspace = 0.3)
    for k, the_extreme_mean in enumerate(the_extreme_list):
        plt.subplot(2, len(member_ids) // 2 + 1, k + 1)
        to_plot = np.ma.masked_all(x.shape)
        for the_index, i, j in zip(xrange(len(i_indices)), i_indices, j_indices):
            to_plot[i, j] = the_extreme_mean[the_index]

        basemap.pcolormesh(x, y, to_plot.copy(), cmap = mpl.cm.get_cmap(name = "jet_r", lut = 18),
                           vmin = 0, vmax = 1.5)
        basemap.drawcoastlines(linewidth = 0.5)
        plt.colorbar(ticks = LinearLocator(numticks = 7), format = "%.2e")
        plt.xlim(x_min, x_max)
        plt.ylim(y_min, y_max)

    plt.savefig(out_file_name)

    # plot cv for the extremes (done here for performance, so the extremes are not fetched again)
    max_value = 0.1
    plot_utils.apply_plot_params(width_pt = 600)
    plt.figure()
    extreme_means = np.array(the_extreme_list)
    mu = np.mean(extreme_means, axis = 0)
    sigma = np.std(extreme_means, axis = 0)
    cv = sigma / mu

    to_plot = np.ma.masked_all(x.shape)
    for the_index, i, j in zip(xrange(len(i_indices)), i_indices, j_indices):
        to_plot[i, j] = cv[the_index]

    basemap.pcolormesh(x, y, to_plot.copy(), cmap = mpl.cm.get_cmap(name = "jet_r", lut = 30),
                       vmin = 0, vmax = max_value)
    basemap.drawcoastlines(linewidth = 0.5)
    plt.colorbar(ticks = LinearLocator(numticks = 11), format = "%.1e")
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.savefig("cv_" + out_file_name)
    pass
def main():
    """
    Plot mean daily hydrographs (modelled vs. observed) for the selected stations,
    annotate them with skill scores (Nash-Sutcliffe, correlation) and save the figure.
    """
    skip_ids = ['081007', '081002', "042607", "090605"]

    # comment the next line to plot for all ensemble members
    members.current_ids = []

    #pylab.rcParams.update(params)
    path_format = 'data/streamflows/hydrosheds_euler9/%s_discharge_1970_01_01_00_00.nc'
    #path_format = "data/streamflows/hydrosheds_rk4_changed_partiotioning/%s_discharge_1970_01_01_00_00.nc"
    #path_format = "data/streamflows/piloted_by_ecmwf/ecmwf_nearest_neighbor_discharge_1970_01_01_00_00.nc"

    path_to_analysis_driven = path_format % members.control_id

    simIdToData = {}
    simIdToTimes = {}
    for the_id in members.current_ids:
        thePath = path_format % the_id
        [simIdToData[the_id], simIdToTimes[the_id], i_list, j_list] = data_select.get_data_from_file(thePath)

    old = True  # in the old version drainage and lon/lats in the file are 1D
    [data, times, i_list, j_list] = data_select.get_data_from_file(path_to_analysis_driven)

    cell_list = []
    ij_to_cell = {}
    prev_cell_indices = []
    tot_rof = None
    if old:
        #surf_rof = data_select.get_data_from_file(path_format % ("aex",), field_name="")
        the_path = path_format % ("aex")
        static_data_path = "data/streamflows/hydrosheds_euler9/infocell9.nc"

        # ntimes x ncells
        tot_rof = data_select.get_field_from_file(the_path, field_name = "total_runoff")
        cell_areas = data_select.get_field_from_file(static_data_path, field_name = "AREA")

        # convert the runoff to m^3/s
        tot_rof *= 1.0e6 * cell_areas[i_list, j_list] / 1.0e3

        flow_dir_values = data_select.get_field_from_file(static_data_path,
                                                          field_name = "flow_direction_value")[i_list, j_list]

        cell_list = map(lambda i, j, the_id: Cell(id = the_id, ix = i, jy = j),
                        i_list, j_list, xrange(len(i_list)))
        ij_to_cell = dict(zip(zip(i_list, j_list), cell_list))

        for ix, jy, aCell, dir_val in zip(i_list, j_list, cell_list, flow_dir_values):
            i_next, j_next = direction_and_value.to_indices(ix, jy, dir_val)
            the_key = (i_next, j_next)
            if ij_to_cell.has_key(the_key):
                next_cell = ij_to_cell[the_key]
            else:
                next_cell = None
            assert isinstance(aCell, Cell)
            aCell.set_next(next_cell)

        # determine the list of indices of the previous (upstream) cells for each cell;
        # in this case they are equal to the cell ids
        for aCell in cell_list:
            assert isinstance(aCell, Cell)
            prev_cells = aCell.get_upstream_cells()
            prev_cell_indices.append(map(lambda c: c.id, prev_cells))
            prev_cell_indices[-1].append(aCell.id)

    if not old:
        da_2d = data_select.get_field_from_file(path_to_analysis_driven, 'accumulation_area')
        lons = data_select.get_field_from_file(path_to_analysis_driven, field_name = 'longitude')
        lats = data_select.get_field_from_file(path_to_analysis_driven, field_name = 'latitude')
    else:
        lons = polar_stereographic.lons
        lats = polar_stereographic.lats
        da_2d = np.zeros(lons.shape)
        drainage = data_select.get_field_from_file(path_to_analysis_driven, 'drainage')
        for i, j, theDa in zip(i_list, j_list, drainage):
            da_2d[i, j] = theDa

    data_step = timedelta(days = 1)

    stations_dump = 'stations_dump.bin'
    if os.path.isfile(stations_dump):
        print 'unpickling'
        stations = pickle.load(open(stations_dump))
    else:
        stations = read_station_data()
        pickle.dump(stations, open(stations_dump, 'w'))

    # Did this to solve text encoding issues
    # reload(sys)
    # sys.setdefaultencoding('iso-8859-1')

    selected_stations = []
    selected_model_values = []
    selected_station_values = []
    grid_drainages = []
    grid_lons = []
    grid_lats = []

    plot_utils.apply_plot_params(width_pt = None, font_size = 9, aspect_ratio = 2.5)
    #plot_utils.apply_plot_params(font_size=9, width_pt=None)

    ncols = 2
    gs = gridspec.GridSpec(5, ncols)
    fig = plt.figure()
    assert isinstance(fig, Figure)
    current_subplot = 0

    label1 = "modelled"
    label2 = "observed"

    line1 = None
    line2 = None
    lines_for_mems = None
    labels_for_mems = None

    #fig.subplots_adjust(hspace = 0.9, wspace = 0.4, top = 0.9)

    index_objects = []
    for index, i, j in zip(range(len(i_list)), i_list, j_list):
        index_objects.append(IndexObject(positionIndex = index, i = i, j = j))

    # sort by latitude
    index_objects.sort(key = lambda x: x.j, reverse = True)

    # simulation id to continuous data map
    simIdToContData = {}
    for the_id in members.all_current:
        simIdToContData[the_id] = {}

    for indexObj in index_objects:
        i = indexObj.i
        j = indexObj.j
        # @type indexObj IndexObject
        index = indexObj.positionIndex
        station = get_corresponding_station(lons[i, j], lats[i, j], da_2d[i, j], stations)
        if station is None or station in selected_stations:
            continue

        # if you want to compare with stations, add their ids to the selected list
        if station.id not in selected_station_ids:
            continue

        # skip some stations
        if station.id in skip_ids:
            continue

        # try now to find the point with the closest drainage area
        # current_diff = np.abs(station.drainage_km2 - da_2d[i, j])
        # for di in xrange(-1, 2):
        #     for dj in xrange(-1, 2):
        #         the_diff = np.abs(station.drainage_km2 - da_2d[i + di, j + dj])
        #         if the_diff < current_diff:  # select a different grid point
        #             current_diff = the_diff
        #             i = i + di
        #             j = j + dj
        #             indexObj.i = i
        #             indexObj.j = j

        # found station, plot data
        print station.name

        start_date = max(np.min(times), np.min(station.dates))
        end_date = min(np.max(times), np.max(station.dates))

        if start_date.day > 1 or start_date.month > 1:
            start_date = datetime(start_date.year + 1, 1, 1, 0, 0, 0)

        if end_date.day < 31 or end_date.month < 12:
            end_date = datetime(end_date.year - 1, 12, 31, 0, 0, 0)

        if end_date < start_date:
            continue

        # select data for years that do not have gaps
        start_year = start_date.year
        end_year = end_date.year

        continuous_station_data = {}
        continuous_model_data = {}
        num_of_continuous_years = 0
        for year in xrange(start_year, end_year + 1):
            # @type station Station
            station_data = station.get_continuous_dataseries_for_year(year)
            if len(station_data) >= 365:
                num_of_continuous_years += 1

                # save station data
                for d, v in station_data.iteritems():
                    continuous_station_data[d] = v

                # save model data
                for t_index, t in enumerate(times):
                    if t.year > year:
                        break
                    if t.year < year:
                        continue
                    continuous_model_data[t] = data[t_index, index]

                # fill the map sim id -> continuous model data
                for the_id in members.current_ids:
                    # save model data
                    for t_index, t in enumerate(simIdToTimes[the_id]):
                        if t.year > year:
                            break
                        if t.year < year:
                            continue
                        simIdToContData[the_id][t] = simIdToData[the_id][t_index, index]

        # if the length of the continuous observations is less than 10 years, skip the station
        if len(continuous_station_data) < 3650:
            continue

        print 'Number of continuous years for station %s is %d ' % (station.id, num_of_continuous_years)

        # skip stations with less than 20 years of usable data
        # if num_of_continuous_years < 2:
        #     continue

        selected_stations.append(station)

        # plot_total_precip_for_upstream(i_index = i, j_index = j, station_id = station.id,
        #                                subplot_count = current_subplot,
        #                                start_date = datetime(1980, 01, 01, 00),
        #                                end_date = datetime(1996, 12, 31, 00))

        # tmp (if there is no need to replot streamflow)
        # current_subplot += 1
        # continue

        ## Calculate means for each day of the year;
        ## as a stamp year we use 2001, ignoring the leap year
        stamp_year = 2001
        start_day = datetime(stamp_year, 1, 1, 0, 0, 0)
        stamp_dates = []
        mean_data_model = []
        mean_data_station = []

        simIdToMeanModelData = {}
        for the_id in members.all_current:
            simIdToMeanModelData[the_id] = []

        for day_number in xrange(365):
            the_day = start_day + day_number * data_step
            stamp_dates.append(the_day)

            model_data_for_day = []
            station_data_for_day = []

            # select model data for each simulation and day,
            # and then save the mean for each day
            simIdToModelDataForDay = {}
            for the_id in members.current_ids:
                simIdToModelDataForDay[the_id] = []

            for year in xrange(start_year, end_year + 1):
                the_date = datetime(year, the_day.month, the_day.day,
                                    the_day.hour, the_day.minute, the_day.second)
                if continuous_station_data.has_key(the_date):
                    model_data_for_day.append(continuous_model_data[the_date])
                    station_data_for_day.append(continuous_station_data[the_date])
                    for the_id in members.current_ids:
                        simIdToModelDataForDay[the_id].append(simIdToContData[the_id][the_date])

            assert len(station_data_for_day) > 0
            mean_data_model.append(np.mean(model_data_for_day))
            mean_data_station.append(np.mean(station_data_for_day))
            for the_id in members.current_ids:
                simIdToMeanModelData[the_id].append(np.mean(simIdToModelDataForDay[the_id]))

        # skip stations with small discharge
        # if np.max(mean_data_station) < 300:
        #     continue

        row = current_subplot // ncols
        col = current_subplot % ncols
        ax = fig.add_subplot(gs[row, col])
        assert isinstance(ax, Axes)
        current_subplot += 1

        # put the "Streamflow" label on the y-axis
        if row == 0 and col == 0:
            ax.annotate("Streamflow (${\\rm m^3/s}$)", (0.025, 0.7), xycoords = "figure fraction",
                        rotation = 90, va = "top", ha = "center")

        selected_dates = sorted(continuous_station_data.keys())

        unrouted_stfl = get_unrouted_streamflow_for(selected_dates = selected_dates,
                                                    all_dates = times,
                                                    tot_runoff = tot_rof,
                                                    cell_indices = prev_cell_indices[index])

        unrouted_daily_normals = data_select.get_means_for_stamp_dates(stamp_dates,
                                                                       all_dates = selected_dates,
                                                                       all_data = unrouted_stfl)

        # Calculate the Nash-Sutcliffe coefficient
        mean_data_model = np.array(mean_data_model)
        mean_data_station = np.array(mean_data_station)

        #mod = _get_monthly_means(stamp_dates, mean_data_model)
        #sta = _get_monthly_means(stamp_dates, mean_data_station)

        month_dates = [datetime(stamp_year, m, 1) for m in xrange(1, 13)]

        line1, = ax.plot(stamp_dates, mean_data_model, linewidth = 3, color = "b")
        #line1, = ax.plot(month_dates, mod, linewidth = 3, color = "b")
        upper_model = np.max(mean_data_model)

        line2, = ax.plot(stamp_dates, mean_data_station, linewidth = 3, color = "r")
        #line2, = ax.plot(month_dates, sta, linewidth = 3, color = "r")
        #line3, = ax.plot(stamp_dates, unrouted_daily_normals, linewidth = 3, color = "y")

        mod = mean_data_model
        sta = mean_data_station

        ns = 1.0 - np.sum((mod - sta) ** 2) / np.sum((sta - np.mean(sta)) ** 2)
        if np.abs(ns) < 0.001:
            ns = 0
        corr_coef = np.corrcoef([mod, sta])[0, 1]

        ns_unr = 1.0 - np.sum((unrouted_daily_normals - sta) ** 2) / np.sum((sta - np.mean(sta)) ** 2)
        corr_unr = np.corrcoef([unrouted_daily_normals, sta])[0, 1]

        da_diff = (da_2d[i, j] - station.drainage_km2) / station.drainage_km2 * 100

        ax.annotate("ns = %.2f\nr = %.2f" % (ns, corr_coef), (0.95, 0.90), xycoords = "axes fraction",
                    va = "top", ha = "right",
                    font_properties = FontProperties(size = 9))

        # plot member simulation data
        lines_for_mems = []
        labels_for_mems = []
        #lines_for_mems.append(line3)
        #labels_for_mems.append("Unrouted total runoff")
        for the_id in members.current_ids:
            the_line, = ax.plot(stamp_dates, simIdToMeanModelData[the_id], "--", linewidth = 3)
            lines_for_mems.append(the_line)
            labels_for_mems.append(the_id)

        ## calculate the mean error
        means_for_members = []
        for the_id in members.current_ids:
            means_for_members.append(np.mean(simIdToMeanModelData[the_id]))

        upper_station = np.max(mean_data_station)
        upper_unr = np.max(unrouted_daily_normals)

        upper = np.max([upper_model, upper_station])
        upper = round(upper / 100) * 100
        half = round(0.5 * upper / 100) * 100
        if upper <= 100:
            upper = 100
            half = upper / 2

        print half, upper
        print 10 * '='
        ax.set_yticks([0, half, upper])

        assert isinstance(station, Station)
        print("i = {0}, j = {1}".format(indexObj.i, indexObj.j))
        print(lons[i, j], lats[i, j])
        print("id = {0}, da_sta = {1}, da_mod = {2}, diff = {3} %".format(station.id, station.drainage_km2,
                                                                          da_2d[i, j], da_diff))

        grid_drainages.append(da_2d[i, j])
        grid_lons.append(lons[i, j])
        grid_lats.append(lats[i, j])

        selected_station_values.append(mean_data_station)
        selected_model_values.append(mean_data_model)

        # plot_swe_for_upstream(i_index = i, j_index = j, station_id = station.id)
        # plt.ylabel("${\\rm m^3/s}$")

        west_east = 'W' if station.longitude < 0 else 'E'
        north_south = 'N' if station.latitude > 0 else 'S'
        title_data = (station.id, np.abs(station.longitude), west_east,
                      np.abs(station.latitude), north_south)
        ax.set_title('%s: (%3.1f%s, %3.1f%s)' % title_data)

        date_ticks = []
        for month in xrange(1, 13):
            the_date = datetime(stamp_year, month, 1)
            date_ticks.append(the_date)
            date_ticks.append(the_date + timedelta(days = 15))
        ax.xaxis.set_ticks(date_ticks)

        major_ticks = ax.xaxis.get_major_ticks()
        for imtl, mtl in enumerate(major_ticks):
            mtl.tick1line.set_visible(imtl % 2 == 0)
            mtl.tick2line.set_visible(imtl % 2 == 0)
            mtl.label1On = (imtl % 4 == 1)

        # ax.xaxis.set_major_locator(
        #     mpl.dates.MonthLocator(bymonth = range(2, 13, 2))
        # )
        ax.xaxis.set_major_formatter(mpl.dates.DateFormatter('%b'))

    lines = [line1]
    lines.extend(lines_for_mems)
    lines.append(line2)
    lines = tuple(lines)

    labels = [label1]
    labels.extend(labels_for_mems)
    labels.append(label2)
    labels = tuple(labels)

    fig.legend(lines, labels, 'lower right', ncol = 1)

    # fig.text(0.05, 0.5, "Streamflow (${\\rm m^3/s}$)",
    #          rotation = 90, ha = 'center', va = 'center')

    fig.tight_layout(pad = 2)
    fig.savefig('performance_error.png')

    # assert len(selected_dates_with_gw[0]) == len(selected_station_dates[0])

    do_skill_calculation = True
    if do_skill_calculation:
        calculate_skills(selected_stations, stamp_dates,
                         selected_station_values, selected_model_values,
                         grid_drainages, grid_lons, grid_lats)

    do_plot_selected_stations = True
    if do_plot_selected_stations:
        plot_selected_stations(selected_stations, use_warpimage = False, plot_ts = False,
                               i_list = i_list, j_list = j_list)
def get_station_and_corresponding_model_data(path = 'data/streamflows/hydrosheds_euler10_spinup100yrs/aex_discharge_1970_01_01_00_00.nc'):
    result = {}
    saved_selected_stations_file = 'selected_stations_and_model_data.bin'
    if os.path.isfile(saved_selected_stations_file):
        result = pickle.load(open(saved_selected_stations_file))
    else:
        print 'getting data from file ', path
        [data, times, i_list, j_list] = data_select.get_data_from_file(path)

        drainage_area = data_select.get_field_from_file(path, field_name = 'accumulation_area')
        if drainage_area is not None:
            lons = data_select.get_field_from_file(path, field_name = 'longitude')
            lats = data_select.get_field_from_file(path, field_name = 'latitude')
            da_2d = drainage_area
        else:
            drainage_area = data_select.get_field_from_file(path, field_name = 'drainage')
            da_2d = np.zeros(polar_stereographic.xs.shape)
            lons = polar_stereographic.lons
            lats = polar_stereographic.lats
            for index, i, j in zip(range(len(i_list)), i_list, j_list):
                da_2d[i, j] = drainage_area[index]

        stations_dump = 'stations_dump.bin'
        if os.path.isfile(stations_dump):
            print 'unpickling'
            stations = pickle.load(open(stations_dump))
        else:
            stations = read_station_data()
            pickle.dump(stations, open(stations_dump, 'w'))

        reload(sys)
        sys.setdefaultencoding('iso-8859-1')

        selected_stations = []
        for index, i, j in zip(range(len(i_list)), i_list, j_list):
            station = get_corresponding_station(lons[i, j], lats[i, j], da_2d[i, j], stations)
            if station is None or station in selected_stations:
                continue
            selected_stations.append(station)

            data_point = ModelPoint(times, data[:, index])
            result[station] = data_point

            print '=' * 20
            print station.get_timeseries_length(), station.id
            # found station, plot data
            print station.name
            print station.id

        pickle.dump(result, open(saved_selected_stations_file, 'wb'))

    # for station, point in result.iteritems():
    #     plt.plot(station.dates, station.values, label = station.name)
    # plt.legend()
    # plt.show()

    assert len(result) > 0
    return result
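# Hedged usage sketch: shows one way the station -> ModelPoint mapping could be consumed;
# only station attributes already used above (id, name) are accessed here, since the
# internal attribute names of ModelPoint are not shown in this module.
def _example_list_selected_stations():
    result = get_station_and_corresponding_model_data()
    for station, model_point in result.iteritems():
        # model_point wraps the times and modelled values for the grid cell matched to the station
        print station.id, station.name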
def compare_means(member = 'aet', my_data_path = '',
                  start_date = datetime(1961, 1, 1, 0, 0),
                  end_date = datetime(1990, 12, 31, 0, 0)):
    streamflows, times, i_array, j_array = data_select.get_data_from_file(my_data_path)

    event_duration = timedelta(days = 1)
    my_data = data_select.get_list_of_annual_maximums_for_domain(streamflows, times,
                                                                 start_date = start_date,
                                                                 end_date = end_date,
                                                                 start_month = 1, end_month = 12,
                                                                 event_duration = event_duration)

    data_path = 'data/streamflows/Vincent_annual_max/mapHIGH_{0}.txt'.format(member)
    v = VincentMaximumsReader(data_path = data_path)

    the_format = '{0}: i = {1}, j = {2}, min = {3}, max = {4}, mean = {5}'
    vmeans = []
    vmins = []
    vmaxs = []

    # my_data = 500 * np.ones((10, 547))
    for i, j, the_index in zip(i_array, j_array, range(my_data.shape[1])):
        data = my_data[:, the_index]
        print the_format.format('Sasha', i, j, np.min(data), np.max(data), np.mean(data))

        data = v.get_data_at(i + 1, j + 1)
        print the_format.format('Vincent', i, j, np.min(data), np.max(data), np.mean(data))
        vmeans.append(np.mean(data))
        vmins.append(np.min(data))
        vmaxs.append(np.max(data))
        print '=' * 30

    # scatter plot for means
    plt.subplots_adjust(hspace = 0.5)
    plt.subplot(2, 2, 1)
    plt.title('annual maximums, \n average for each grid point', override)
    plt.scatter(vmeans, np.mean(my_data, axis = 0), linewidth = 0)
    plt.xlabel('Vincent')
    plt.ylabel('Sasha')
    x = plt.xlim()
    plt.plot(x, x, color = 'k')
    plt.grid(True)

    # scatter plot for minimums
    plt.subplot(2, 2, 2)
    plt.title('annual maximums, \n minimum for each grid point', override)
    plt.scatter(vmins, np.min(my_data, axis = 0), linewidth = 0)
    plt.xlabel('Vincent')
    plt.ylabel('Sasha')
    x = plt.xlim()
    plt.plot(x, x, color = 'k')
    plt.grid(True)

    # scatter plot for maximums
    plt.subplot(2, 2, 3)
    plt.title('annual maximums, \n maximum for each grid point', override)
    plt.scatter(vmaxs, np.max(my_data, axis = 0), linewidth = 0)
    plt.xlabel('Vincent')
    plt.ylabel('Sasha')
    x = plt.xlim()
    plt.plot(x, x, color = 'k')
    plt.grid(True)

    plt.savefig('{0}_scatter_max.png'.format(member), bbox_inches = 'tight')
def init_from_path(self, path = ''):
    self._data, self.times, \
        self.i_indices, self.j_indices = data_select.get_data_from_file(path)
    pass
def main(data_path = DEFAULT_PATH):
    # get the data into memory
    [data, times, x_indices, y_indices] = data_select.get_data_from_file(data_path)
    the_mean = np.mean(data, axis = 0)

    lons2d, lats2d = polar_stereographic.lons, polar_stereographic.lats
    lons = lons2d[x_indices, y_indices]
    lats = lats2d[x_indices, y_indices]

    # workstation and colour map
    wres = Ngl.Resources()
    wres.wkColorMap = "BlGrYeOrReVi200"
    wks_type = "ps"
    wks = Ngl.open_wks(wks_type, "test_pyngl", wres)

    # plot resources
    res = Ngl.Resources()
    res.cnFillMode = "RasterFill"
    #res.cnFillOn = True                 # Turn on contour fill
    #res.cnMonoFillPattern = True        # Turn solid fill back on.
    #res.cnMonoFillColor = False         # Use multiple colors.
    res.cnLineLabelsOn = False           # Turn off line labels.
    res.cnInfoLabelOn = False            # Turn off informational label.
    res.pmLabelBarDisplayMode = "Always" # Turn on label bar.
    res.cnLinesOn = False                # Turn off contour lines.

    res.mpProjection = "LambertConformal"
    res.mpDataBaseVersion = "MediumRes"

    # res.mpLimitMode = "LatLon"         # limit map via lat/lon
    # res.mpMinLatF = np.min(lats)       # map area
    # res.mpMaxLatF = np.max(lats)       # latitudes
    # res.mpMinLonF = np.min(lons)       # and
    # res.mpMaxLonF = np.max(lons)       # longitudes

    print np.min(lons), np.max(lons)

    res.tiMainFont = 26
    res.tiXAxisFont = 26
    res.tiYAxisFont = 26

    res.sfXArray = lons2d
    res.sfYArray = lats2d

    # Set title resources.
    res.tiMainString = "Logarithm of mean annual streamflow m**3/s"

    to_plot = np.ma.masked_all(lons2d.shape)
    to_plot[x_indices, y_indices] = np.log(the_mean[:])
    # for i, j, v in zip(x_indices, y_indices, the_mean):
    #     to_plot[i, j] = v

    Ngl.contour_map(wks, to_plot[:, :], res)
    Ngl.end()
    pass
def get_std_and_mean_using_bootstrap_for_merged_means(sim_ids = None,
                                                      folder_path = "data/streamflows/hydrosheds_euler9",
                                                      months = range(1, 13),
                                                      n_samples = 1000):
    """
    Returns the bootstrap standard deviations and the means of the merged
    (over the given simulations) seasonal means for the domain.
    """
    cache_file = _get_cache_file_path(months = months, sim_ids = sim_ids)
    if os.path.isfile(cache_file):
        return pickle.load(open(cache_file))

    # determine the paths to the files with the data
    filePaths = []
    for f in os.listdir(folder_path):
        if f.split("_")[0] in sim_ids:
            filePath = os.path.join(folder_path, f)
            filePaths.append(filePath)

    boot_means = []
    real_means = []
    index_matrix = None
    all_means = []

    members_boot_means = []
    for file_path in filePaths:
        streamflow, times, i_indices, j_indices = data_select.get_data_from_file(file_path)

        # for each year and each grid cell, get the mean value over the selected months
        means_dict = data_select.get_means_over_months_for_each_year(times, streamflow, months = months)
        means_sorted_in_time = map(lambda x: x[1], sorted(means_dict.items(), key = lambda x: x[0]))
        data_matrix = np.array(means_sorted_in_time)
        real_means.append(data_matrix)  # save modelled means, in order to calculate the mean of the merged data
        #print "data_matrix.shape = ", data_matrix.shape

        boot_means = []
        for i in xrange(n_samples):
            # generate indices (resample years with replacement)
            index_vector = np.random.randint(0, data_matrix.shape[0], data_matrix.shape[0])
            # average the 30 bootstrapped annual means
            boot_means.append(np.mean(data_matrix[index_vector, :], axis = 0))

        members_boot_means.append(boot_means)

    # take the average over members
    print np.array(members_boot_means).shape
    boot_means = np.array(members_boot_means).mean(axis = 0)  # n_samples x n_points
    print boot_means[:, 499]
    print boot_means[:, 19]

    assert boot_means.shape[0] == n_samples, boot_means.shape
    print "boot_means.shape = ", boot_means.shape

    std_result = np.std(boot_means, axis = 0)
    mean_result = np.array(real_means).mean(axis = 0).mean(axis = 0)
    pickle.dump([std_result, mean_result], open(cache_file, mode = "w"))
    return std_result, mean_result
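# Hedged usage sketch: the simulation ids come from the `members` module used above, and
# the unpacking mirrors the (std, mean) tuple returned by the function; the printed ratio
# is just one way the bootstrap spread could be summarised.
def _example_merged_bootstrap():
    std_result, mean_result = get_std_and_mean_using_bootstrap_for_merged_means(
        sim_ids = members.all_current, months = range(1, 13), n_samples = 1000)
    print "max relative bootstrap std over the domain: ", np.max(std_result / mean_result)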