def eof_correlation(eof_filepath, mask_filepath):
    """
    Correlate every EOF grid-point series with area-mean precipitation.

    ERA5 data for `mask_filepath` is averaged over latitude and longitude to
    give a `tp` series. The `EOF` variable read from `eof_filepath` is pivoted
    to one column per (latitude, longitude) pair, shifted back by 12 hours and
    merged with `tp` on time. Rows containing NaNs are dropped before the
    correlations with `tp` and the p-values (from `pvalue`) are computed.

    Inputs
        eof_filepath: path of the dataset holding the `EOF` variable
        mask_filepath: mask handed to `era5.download_data`

    Outputs
        filepath: location of the CSV with the "corr" and "pvalue" columns,
            string
    """
    print("processing precipitation")
    era5_ds = era5.download_data(mask_filepath, xarray=True)
    mean_tp = era5_ds.mean(dim=["latitude", "longitude"]).tp
    mean_tp = mean_tp.assign_coords(time=(mean_tp.time.astype("datetime64")))
    tp_df = mean_tp.to_dataframe()

    print("processing EOF")
    eof_field = xr.open_dataset(eof_filepath).EOF
    eof_field = eof_field.assign_coords(
        time=(eof_field.time.astype("datetime64")))
    # One column per grid point, one row per time step
    eof_wide = pd.pivot_table(
        eof_field.to_dataframe(),
        values="EOF",
        index=["time"],
        columns=["latitude", "longitude"],
    )
    eof_wide = eof_wide.reset_index()
    # Shift the EOF timestamps back by 12 hours before merging on time
    eof_wide["time"] -= np.timedelta64(12, "h")

    print("combining")
    merged = pd.merge_ordered(tp_df, eof_wide, on="time").dropna()
    result = merged.corrwith(merged["tp"]).to_frame(name="corr")
    result["pvalue"] = pvalue(merged)

    filepath = "_Data/EOF_corr_pval.csv"
    result.to_csv(filepath)
    return filepath
def spatial_autocorr(variable, mask_filepath):  # TODO
    """
    Plot correlation maps of detrended precipitation for three grid points.

    The `tp` column of the downloaded data is pivoted to one column per
    (latitude, longitude) pair, detrended along time and correlated column
    against column. For each of three reference points a map of its
    correlation with every other grid point is drawn.

    Inputs
        variable: currently unused, the `tp` column is always analysed
        mask_filepath: mask handed to `era5.download_data`
    """
    raw_df = era5.download_data(mask_filepath)

    # Rows: time steps, columns: (latitude, longitude) pairs
    by_point = pd.pivot_table(
        raw_df, values="tp", index=["latitude", "longitude"],
        columns=["time"]).T

    # NOTE(review): `.corr()` requires `detrend` to return a DataFrame-like
    # object; confirm which `detrend` is imported by this module.
    corr_table = detrend(by_point, axis=0).corr()
    print(corr_table)

    # Reference points (latitude, longitude): Khyber, Gilgit, Ngari
    reference_points = [(34.5, 73.0), (36.0, 75.0), (33.5, 79.0)]
    reference_corrs = [corr_table.loc[point] for point in reference_points]

    for point_corr in reference_corrs:
        # Back to a latitude x longitude grid for plotting
        flat = point_corr.reset_index().droplevel(1, axis=1)
        grid = flat.pivot(index="latitude", columns="longitude")
        grid = grid.droplevel(0, axis=1)
        corr_da = xr.DataArray(data=grid, name="Correlation")

        # Plot
        plt.figure()
        ax = plt.subplot(projection=ccrs.PlateCarree())
        ax.set_extent([71, 83, 30, 38])
        corr_da.plot(
            x="longitude",
            y="latitude",
            add_colorbar=True,
            ax=ax,
            vmin=-1,
            vmax=1,
            cmap="coolwarm",
            cbar_kwargs={"pad": 0.10},
        )
        ax.gridlines(draw_labels=True)
        ax.set_xlabel("Longitude")
        ax.set_ylabel("Latitude")
        plt.show()
def random_location_generator(location, N=50):
    """
    Draw N random grid points from the data downloaded for `location`.

    Points are picked by independent random integer indices into the table
    of unique (lat, lon) pairs, so the same point can appear more than once.

    Inputs
        location: passed to `era5.download_data`
        N, optional: number of points to draw, integer

    Outputs
        coord_list: list of [lat, lon] pairs
    """
    data = era5.download_data(location)
    unique_points = data[["lat", "lon"]].reset_index().drop_duplicates(
        subset=["lat", "lon"])
    picks = np.random.randint(len(unique_points), size=N)
    chosen_rows = (unique_points.iloc[i] for i in picks)
    return [[row["lat"], row["lon"]] for row in chosen_rows]
def averaged_timeseries(mask_filepath, variable="tp",
                        longname="Total precipitation [m/day]"):
    """
    Plot the mean of `variable` per time step.

    Inputs
        mask_filepath: mask handed to `era5.download_data`
        variable, optional: column to average, string
        longname, optional: y-axis label, string
    """
    df = era5.download_data(mask_filepath)

    # Copy the selection so the conversion below writes to an independent
    # frame instead of a slice of `df` (avoids SettingWithCopyWarning).
    df_var = df[["time", variable]].copy()

    # pandas >= 2.0 rejects the unit-less `np.datetime64` dtype in `astype`;
    # the explicit nanosecond unit gives the same result on older versions.
    df_var["time"] = df_var["time"].astype("datetime64[ns]")

    df_mean = df_var.groupby("time").mean()

    df_mean.plot()
    plt.title("Upper Indus Basin")
    plt.ylabel(longname)
    plt.xlabel("Year")
    plt.grid(True)
    plt.show()
def tp_vs(variable, longname="", location='uib', time=False):
    """
    Scatter plot of total precipitation against another variable.

    Inputs
        variable: name of the column plotted on the x-axis, string
        longname, optional: x-axis label, string
        location, optional: either a string (a mask is looked up and applied)
            or a [latitude, longitude] pair (nearest grid point is used)
        time, optional: when a string is passed, only the sixth-from-last
            time step is plotted
    """
    ds = era5.download_data(location, xarray=True)

    if isinstance(location, str):
        # `find_mask` is taken from `location_sel`, as in the other functions
        # of this module that look up a mask.
        mask_filepath = location_sel.find_mask(location)
        masked_ds = location_sel.apply_mask(ds, mask_filepath)
    else:
        masked_ds = ds.interp(coords={
            "lon": location[1],
            "lat": location[0]
        }, method="nearest")

    if isinstance(time, str):
        # Select from the already masked/located data; selecting from `ds`
        # would silently discard the location choice made above.
        # NOTE(review): the content of `time` is not used, any string selects
        # index -6; confirm this is intended.
        masked_ds = masked_ds.isel(time=-6)

    multiindex_df = masked_ds.to_dataframe()
    df_clean = multiindex_df.dropna().reset_index()
    df_var = df_clean[["tp", variable]].dropna()

    # Plot
    df_var.plot.scatter(x=variable, y="tp", alpha=0.2, c="b")
    if isinstance(location, str):
        plt.title(location)
    else:
        plt.title(str(location[0]) + '°N, ' + str(location[1]) + '°E')
    plt.ylabel("Total precipitation [mm/day]")
    plt.xlabel(longname)
    plt.grid(True)
    plt.show()
def input_correlation_heatmap():
    """
    Plot correlation heatmap for model inputs.

    Uses the module-level `mask_filepath`. Lagged copies of the N34, NAO and
    N4 columns (shifted by 393 rows) are added before the correlation matrix
    is computed; only the lower triangle is drawn.
    """
    df = era5.download_data(mask_filepath, all_var=True)

    # create lags
    for index_name in ("N34", "NAO", "N4"):
        df[index_name + "-1"] = df[index_name].shift(periods=393)

    df = df.drop(columns=["time"])
    df_clean = df.dropna()
    df_sorted = df_clean.sort_index(axis=1)

    corr = df_sorted.corr()

    sns.set(style="white")
    plt.subplots(figsize=(11, 9))
    cmap = sns.diverging_palette(220, 10, as_cmap=True)

    # Mask for the upper triangle. The builtin `bool` replaces the `np.bool`
    # alias, which was removed in NumPy 1.24.
    mask = np.triu(np.ones_like(corr, dtype=bool))

    sns.heatmap(
        corr,
        mask=mask,
        cmap=cmap,
        center=0,
        vmin=-1,
        vmax=1,
        fmt="0.2f",
        square=True,
        linewidths=0.5,
        annot=True,
        annot_kws={"size": 5},
        cbar_kws={"shrink": 0.5},
    )
    plt.title("Correlation plot for Upper Indus Basin")
    plt.show()
def annual_map(location, variable, year, cumulative=False):
    """
    Map `variable` over one year.

    By default the standard deviation over time is shown; with
    `cumulative=True` the output of `cumulative_monthly` is summed over time
    instead. Values are multiplied by 1000 before processing.

    Inputs
        location: passed to `era5.download_data`
        variable: variable(s) selected from the downloaded dataset
        year: year to plot, integer
        cumulative, optional: plot the summed field instead of the standard
            deviation, boolean
    """
    dataset = era5.download_data(location, xarray=True)

    start = str(year) + "-01-16T12:00:00"
    end = str(year + 1) + "-01-01T12:00:00"
    scaled = dataset.sel(time=slice(start, end))[variable] * 1000  # to mm/day

    if cumulative is True:
        ds_final = cumulative_monthly(scaled).sum(dim="time")
    else:
        ds_final = scaled.std(dim="time")  # TODO weighted mean

    print(ds_final)

    plt.figure()
    ax = plt.subplot(projection=ccrs.PlateCarree())
    ax.set_extent([71, 83, 30, 38])

    # NOTE(review): the colourbar label mentions the standard deviation even
    # when `cumulative` is True; confirm the wording for that mode.
    colorbar_options = {
        "label": "Precipitation standard deviation [mm/day]",
        "extend": "neither",
        "pad": 0.10
    }
    g = ds_final["tp_0001"].plot(
        cmap="magma_r", vmin=0.001, cbar_kwargs=colorbar_options)
    g.cmap.set_under("white")

    ax.coastlines()
    ax.gridlines(draw_labels=True)
    ax.set_xlabel("Longitude")
    ax.set_ylabel("Latitude")
    plt.title("Upper Indus Basin Total Precipitation " + str(year) + "\n \n")
    plt.show()
def monthly_PDF(timeseries, variable="tp", longname=""):
    """
    Plot one histogram of `variable` per month on a 3 x 4 grid.

    Each panel shows a density histogram with a dashed vertical line at the
    95th percentile. Data comes from the module-level `mask_filepath`.

    Inputs
        timeseries: iterable of objects exposing `tp` and `plot_legend`;
            they are collected into a table that is not used by the plot
        variable, optional: column to plot, string
        longname, optional: x-axis label, string
    """
    combined_df = pd.DataFrame()
    for ts in timeseries:
        legend = ts.plot_legend
        flat = ts.tp.to_dataframe(name=legend).dropna().reset_index()
        combined_df[legend] = flat.drop(["time", "lon", "lat"], axis=1)[legend]

    data = era5.download_data(mask_filepath).dropna()
    reduced_df = data[["time", variable]].reset_index()

    # Fractional part of the time value scaled to twelve units per year.
    # NOTE(review): the groups below rely on exact float equality with the
    # integers 1..12; verify this against the time encoding of the data.
    reduced_df["time"] = (reduced_df["time"] -
                          np.floor(reduced_df["time"])) * 12

    grouped_dfs = [
        reduced_df[reduced_df["time"] == m][variable] for m in range(1, 13)
    ]

    # PDF for each month, panels are filled column by column
    _fig, axs = plt.subplots(3, 4, sharex=True, sharey=True)
    for i, month_values in enumerate(grouped_dfs):
        col, row = divmod(i, 3)
        panel = axs[row, col]
        panel.hist(month_values, density=True)
        panel.set_title(month_dict[i])
        panel.xaxis.set_tick_params(which="both", labelbottom=True)
        panel.yaxis.set_tick_params(which="both", labelbottom=True)
        panel.set_xlabel(longname)
        panel.set_ylabel("Probability density")
        panel.axvline(
            np.percentile(month_values, 95),
            color="k",
            linestyle="dashed",
            linewidth=1,
            label="95th percentile",
        )

    plt.legend(loc="upper center", bbox_to_anchor=(0.5, -0.30))
    plt.show()
def point_model(location, number=None, EDA_average=False, maxyear=None):
    """
    Build training, validation and test arrays for a point model.

    The feature columns are, in order: time, d2m, tcwv and N34. The target is
    the log-transformed total precipitation (`tp`).

    Inputs
        location: either a string (one location is drawn with
            `sa.random_location_sampler`) or a [latitude, longitude] pair
            (the nearest grid point is used)
        number, optional: specify desired ensemble run, integer
        EDA_average, optional: specify if you want average of low resolution
            ensemble runs, boolean. Takes precedence over `number`.
        maxyear, optional: keep only rows with time below this value

    Outputs
        xtrain: training feature vector, numpy array
        xval: validation feature vector, numpy array
        xtest: testing feature vector, numpy array
        ytrain: training output vector, numpy array
        yval: validation output vector, numpy array
        ytest: testing output vector, numpy array
    """
    # One exclusive chain: previously the plain download in the final `else`
    # overwrote the ensemble member selected with `number`.
    if EDA_average:
        da_ensemble = era5.download_data(location, xarray=True, ensemble=True)
        da = da_ensemble.mean(dim="number")
    elif number is not None:
        da_ensemble = era5.download_data(location, xarray=True, ensemble=True)
        da = da_ensemble.sel(number=number).drop("number")
    else:
        da = era5.download_data(location, xarray=True)

    # `location is str` compared against the type object and was never true
    if isinstance(location, str):
        multiindex_df = da.to_dataframe()
        df_clean = multiindex_df.dropna().reset_index()
        df_location = sa.random_location_sampler(df_clean)
        df = df_location.drop(columns=["lat", "lon", "slor", "anor", "z"])
    else:
        da_location = da.interp(coords={
            "lat": location[0],
            "lon": location[1]
        }, method="nearest")
        multiindex_df = da_location.to_dataframe()
        df_clean = multiindex_df.dropna().reset_index()
        df = df_clean.drop(columns=["lat", "lon", "slor", "anor", "z"])

    if maxyear is not None:
        # Filter the rows; the old code assigned the filtered frame to the
        # "time" column instead. Copy so later column writes are safe.
        df = df[df["time"] < maxyear].copy()

    df["time"] = df["time"] - 1970  # to years
    df["tp"] = log_transform(df["tp"])
    df = df[["time", "d2m", "tcwv", "N34", "tp"]]  # format order

    # Split at 70% of the largest time value. `>=` on the evaluation side
    # keeps rows lying exactly on the threshold, which were dropped before.
    threshold = df["time"].max() * 0.7

    # Keep first of 70% for training
    train_df = df[df["time"] < threshold]
    xtrain = train_df.drop(columns=["tp"]).values
    ytrain = train_df["tp"].values

    # Last 30% for evaluation
    eval_df = df[df["time"] >= threshold]
    x_eval = eval_df.drop(columns=["tp"]).values
    y_eval = eval_df["tp"].values

    # Validation and test data, split in order without shuffling
    xval, xtest, yval, ytest = train_test_split(x_eval,
                                                y_eval,
                                                test_size=0.3333,
                                                shuffle=False)

    return xtrain, xval, xtest, ytrain, yval, ytest
def areal_model(location, number=None, EDA_average=False, length=3000,
                seed=42, maxyear=None):
    """
    Build training, validation and test arrays for an areal model.

    Points are drawn from the masked area with
    `sa.random_location_and_time_sampler`. The feature columns are, in order:
    time, lat, lon, slor, anor, z, d2m, tcwv and N34. The target is the
    log-transformed total precipitation (`tp`).

    Inputs
        location: specify area to train model
        number, optional: specify desired ensemble run, integer
        EDA_average, optional: specify if you want average of low resolution
            ensemble runs, boolean. Takes precedence over `number`.
        length, optional: specify number of points to sample, integer
        seed, optional: specify seed, integer. Used for the sampling and for
            the shuffled validation/test split.
        maxyear, optional: keep only data with time below `maxyear + 1`

    Outputs
        xtrain: training feature vector, numpy array
        xval: validation feature vector, numpy array
        xtest: testing feature vector, numpy array
        ytrain: training output vector, numpy array
        yval: validation output vector, numpy array
        ytest: testing output vector, numpy array
    """
    # One exclusive chain: previously the plain download in the final `else`
    # overwrote the ensemble member selected with `number`.
    if EDA_average:
        da_ensemble = era5.download_data(location, xarray=True, ensemble=True)
        da = da_ensemble.mean(dim="number")
    elif number is not None:
        da_ensemble = era5.download_data(location, xarray=True, ensemble=True)
        da = da_ensemble.sel(number=number).drop("number")
    else:
        da = era5.download_data(location, xarray=True)

    # apply mask
    mask_filepath = location_sel.find_mask(location)
    masked_da = location_sel.apply_mask(da, mask_filepath)

    if maxyear is not None:
        masked_da = masked_da.where(da.time < maxyear + 1, drop=True)

    multiindex_df = masked_da.to_dataframe()
    df_clean = multiindex_df.dropna().reset_index()
    df = sa.random_location_and_time_sampler(df_clean,
                                             length=length,
                                             seed=seed)

    df["time"] = df["time"] - 1970
    df["tp"] = log_transform(df["tp"])
    df = df[[
        "time", "lat", "lon", "slor", "anor", "z", "d2m", "tcwv", "N34", "tp"
    ]]

    # Split at 70% of the largest time value. `>=` on the evaluation side
    # keeps rows lying exactly on the threshold, which were dropped before.
    threshold = df['time'].max() * 0.7

    # Keep first of 70% for training
    train_df = df[df['time'] < threshold]
    xtrain = train_df.drop(columns=['tp']).values
    ytrain = train_df['tp'].values

    # Last 30% for evaluation
    eval_df = df[df['time'] >= threshold]
    x_eval = eval_df.drop(columns=['tp']).values
    y_eval = eval_df['tp'].values

    # Validation and test data. The shuffle is seeded so that repeated calls
    # with the same `seed` return the same split.
    xval, xtest, yval, ytest = train_test_split(x_eval,
                                                y_eval,
                                                test_size=0.3333,
                                                shuffle=True,
                                                random_state=seed)

    return xtrain, xval, xtest, ytrain, yval, ytest
def areal_model_eval(location, number=None, EDA_average=False, length=3000,
                     seed=42, minyear=1979, maxyear=2020):
    """
    Return data to evaluate an areal model for a location and time period.

    The feature columns are, in order: time, lat, lon, slor, anor, z, d2m,
    tcwv and N34. The target is the log-transformed total precipitation
    (`tp`).

    Inputs
        location: either a string (the matching mask is applied and points
            are drawn with `sa.random_location_and_time_sampler`) or a
            [latitude, longitude] pair (the nearest grid point is used)
        number, optional: specify desired ensemble run, integer
        EDA_average, optional: specify if you want average of low resolution
            ensemble runs, boolean. Takes precedence over `number`.
        length, optional: number of points to sample for a string location,
            integer
        seed, optional: seed for the sampler, integer
        minyear, optional: start of the time slice
        maxyear, optional: end of the time slice

    Outputs
        xtr: evaluation feature vector, numpy array
        ytr: evaluation output vector, numpy array
    """
    # One exclusive chain: previously the plain download in the final `else`
    # overwrote the ensemble member selected with `number`.
    if EDA_average:
        da_ensemble = era5.download_data(location, xarray=True, ensemble=True)
        da = da_ensemble.mean(dim="number")
    elif number is not None:
        da_ensemble = era5.download_data(location, xarray=True, ensemble=True)
        da = da_ensemble.sel(number=number).drop("number")
    else:
        da = era5.download_data(location, xarray=True)

    sliced_da = da.sel(time=slice(minyear, maxyear))

    if isinstance(location, str):
        mask_filepath = location_sel.find_mask(location)
        masked_da = location_sel.apply_mask(sliced_da, mask_filepath)
        # Use the masked, time-sliced data. The old code immediately replaced
        # this frame with `da.to_dataframe()`, discarding mask and slice.
        multiindex_df = masked_da.to_dataframe()
        df_clean = multiindex_df.dropna().reset_index()
        df = sa.random_location_and_time_sampler(df_clean,
                                                 length=length,
                                                 seed=seed)
    else:
        da_location = sliced_da.interp(coords={
            "lat": location[0],
            "lon": location[1]
        }, method="nearest")
        multiindex_df = da_location.to_dataframe()
        df = multiindex_df.dropna().reset_index()

    df["time"] = df["time"] - 1970  # to years
    df["tp"] = log_transform(df["tp"])
    df = df[[
        "time", "lat", "lon", "slor", "anor", "z", "d2m", "tcwv", "N34", "tp"
    ]]  # format order

    xtr = df.drop(columns=["tp"]).values
    ytr = df["tp"].values

    return xtr, ytr
def cluster_correlation_heatmap():
    """
    Plot correlation heatmap for the three clusters.

    For each cluster mask, CGTI columns lagged by 1 to 6 rows are added, the
    "expver" and "time" columns are removed and the lower triangle of the
    correlation matrix is drawn.
    """
    masks = ["Khyber_mask.nc", "Gilgit_mask.nc", "Ngari_mask.nc"]
    # NOTE(review): the titles are not in the same order as the mask files
    # (e.g. "Khyber_mask.nc" is titled "Gilgit regime"); the original pairing
    # is kept here, confirm whether it is intended.
    names = ["Gilgit regime", "Ngari regime", "Khyber regime"]

    for mask_name, title in zip(masks, names):
        cluster_df = era5.download_data(mask_name)

        # create lags
        for lag in range(1, 7):
            cluster_df["CGTI-" + str(lag)] = cluster_df["CGTI"].shift(
                periods=lag)

        # Drop the non-feature columns; the old code called the DataFrame
        # itself (`cluster_df(columns=...)`), which raises a TypeError.
        df = cluster_df.drop(columns=["expver", "time"])
        df_clean = df.dropna()
        df_sorted = df_clean.sort_index(axis=1)

        # Correlation matrix
        corr = df_sorted.corr()

        # Plot
        sns.set(style="white")
        plt.subplots(figsize=(11, 9))
        cmap = sns.diverging_palette(220, 10, as_cmap=True)

        # Mask for the upper triangle. The builtin `bool` replaces the
        # `np.bool` alias, which was removed in NumPy 1.24.
        mask = np.triu(np.ones_like(corr, dtype=bool))

        # Draw the heatmap with the mask and correct aspect ratio
        sns.heatmap(
            corr,
            mask=mask,
            cmap=cmap,
            center=0,
            vmin=-1,
            vmax=1,
            square=True,
            linewidths=0.5,
            cbar_kws={"shrink": 0.5},
        )
        plt.title(title + "\n")
        plt.show()
from maps.plot_data import cumulative_monthly # Filepaths mask_filepath = "_Data/Masks/ERA5_Upper_Indus_mask.nc" dem_filepath = "_Data/elev.0.25-deg.nc" # Function inputs # Digital Elevation Model data dem = xr.open_dataset(dem_filepath) dem_da = (dem.data).sum(dim="time") sliced_dem = dem_da.sel(lat=slice(38, 30), lon=slice(71.25, 82.75)) # Precipitation data da = era5.download_data(mask_filepath, xarray=True) tp_da = da.tp # Decade list decades = [1980, 1990, 2000, 2010] # Cluster list N = np.arange(2, 11, 1) def seasonal_clusters(tp_da, sliced_dem, N, decades): """ K-means clustering of precipitation data as a function of seasons, decades and number of clusters. Returns spatial graphs, overlayed with the local topography contours.