def areal_model_eval(location, number=None, EDA_average=False, minyear=1979,
                     maxyear=2020):
    """
    Return data to evaluate an areal model at a given location, area and
    time period.

    Features: time, latitude, longitude, slope of sub-gridscale orography
    (slor), angle of sub-gridscale orography (anor), orography (z), 2m
    dewpoint temperature (d2m), total column water vapour (tcwv) and
    Nino 3.4 index (N34). Target: log-transformed total precipitation (tp).

    Inputs
        location: basin name (str) or [latitude, longitude], list of floats
        number, optional: desired ensemble run, integer
        EDA_average, optional: average the low-resolution ensemble runs,
            boolean
        minyear, maxyear, optional: bounds of the evaluation time period

    Outputs
        xtr: evaluation feature vector, numpy array
        ytr: evaluation output vector, numpy array
    """
    # BUG FIX: the three data sources are mutually exclusive. Originally the
    # `else` paired only with the EDA_average check, so data selected via
    # `number` was silently overwritten by the deterministic download.
    if number is not None:
        da_ensemble = dd.download_data(location, xarray=True, ensemble=True)
        da = da_ensemble.sel(number=number).drop("number")
    elif EDA_average:
        da_ensemble = dd.download_data(location, xarray=True, ensemble=True)
        da = da_ensemble.mean(dim="number")
    else:
        da = dd.download_data(location, xarray=True)

    sliced_da = da.sel(time=slice(minyear, maxyear))

    if isinstance(location, str):
        mask_filepath = find_mask(location)
        masked_da = dd.apply_mask(sliced_da, mask_filepath)
        # BUG FIX: the original immediately overwrote this dataframe with
        # `da.to_dataframe()`, discarding both the mask and the time slice.
        multiindex_df = masked_da.to_dataframe()
        df = multiindex_df.dropna().reset_index()
    else:
        da_location = sliced_da.interp(
            coords={"lat": location[0], "lon": location[1]},
            method="nearest")
        multiindex_df = da_location.to_dataframe()
        df = multiindex_df.dropna().reset_index()

    df["time"] = df["time"] - 1970  # to years
    df["tp"] = log_transform(df["tp"])
    df = df[["time", "lat", "lon", "slor", "anor", "z",
             "d2m", "tcwv", "N34", "tp"]]  # format order

    xtr = df.drop(columns=["tp"]).values
    ytr = df["tp"].values
    return xtr, ytr
def tp_vs(mask_filepath, variable, mask=None, longname=""):
    """
    Scatter plot of total precipitation against a given variable for the
    Upper Indus Basin.

    Inputs
        mask_filepath: path to the masked data file
        variable: column name to plot total precipitation against, string
        mask, optional: mask applied to freshly updated CDS monthly data;
            when None, data is read from mask_filepath instead
        longname, optional: x-axis label, string

    Outputs
        None (shows a matplotlib figure)
    """
    if mask is None:
        df = dd.download_data(mask_filepath)
    else:
        cds_filepath = fd.update_cds_monthly_data()
        da = dd.apply_mask(cds_filepath, mask)
        df = da.to_dataframe().reset_index()

    df = df[["time", "tp", variable]]
    df_var = df.dropna()

    # Plot
    df_var.plot.scatter(x=variable, y="tp", alpha=0.2, c="b")
    plt.title("Upper Indus Basin")
    plt.ylabel("Total precipitation [m/day]")
    plt.xlabel(longname)
    plt.grid(True)
    plt.show()
def eof_correlation(eof_filepath, mask_filepath):
    """
    Correlate basin-mean precipitation with an EOF field and save the
    per-cell correlations and p-values to CSV.

    Inputs
        eof_filepath: path to a dataset containing an `EOF` variable
        mask_filepath: path to the masked precipitation data

    Outputs
        filepath of the saved CSV ("Data/EOF_corr_pval.csv")
    """
    print("processing precipitation")
    precip_da = dd.download_data(mask_filepath, xarray=True)
    basin_tp = precip_da.mean(dim=["latitude", "longitude"]).tp
    basin_tp = basin_tp.assign_coords(time=(basin_tp.time.astype("datetime64")))
    tp_df = basin_tp.to_dataframe()

    print("processing EOF")
    eof_ds = xr.open_dataset(eof_filepath).EOF
    eof = eof_ds.assign_coords(time=(eof_ds.time.astype("datetime64")))
    # One column per (latitude, longitude) cell, indexed by time.
    eof_wide = pd.pivot_table(eof.to_dataframe(), values="EOF",
                              index=["time"],
                              columns=["latitude", "longitude"]).reset_index()
    # Shift timestamps back 12 hours to line up with the precipitation index.
    eof_wide["time"] -= np.timedelta64(12, "h")

    print("combining")
    merged = pd.merge_ordered(tp_df, eof_wide, on="time").dropna()
    corr_df = merged.corrwith(merged["tp"]).to_frame(name="corr")
    corr_df["pvalue"] = pvalue(merged)

    filepath = "Data/EOF_corr_pval.csv"
    corr_df.to_csv(filepath)
    return filepath
def spatial_autocorr(variable, mask_filepath):  # TODO
    """
    Plots spatial autocorrelation of detrended precipitation against three
    reference points (Khyber, Gilgit, Ngari).

    Inputs
        variable: variable name (currently unused — tp is hard-coded below)
        mask_filepath: path to the masked data

    Outputs
        None (shows matplotlib figures)
    """
    df = dd.download_data(mask_filepath)

    # detrend: pivot to (cell x time), transpose so time runs down the rows,
    # remove the linear trend per cell, then correlate cells pairwise.
    # NOTE(review): `detrend` is assumed to return a DataFrame here (corr()
    # is called on its result) — confirm it is not scipy.signal.detrend,
    # which returns a plain ndarray.
    table = pd.pivot_table(df, values="tp",
                           index=["latitude", "longitude"],
                           columns=["time"])
    trans_table = table.T
    detrended_table = detrend(trans_table, axis=0)
    corr_table = detrended_table.corr()
    print(corr_table)

    # Correlation of every cell with three fixed reference locations.
    corr_khyber = corr_table.loc[(34.5, 73.0)]
    corr_gilgit = corr_table.loc[(36.0, 75.0)]
    corr_ngari = corr_table.loc[(33.5, 79.0)]
    corr_list = [corr_khyber, corr_gilgit, corr_ngari]

    for corr in corr_list:
        # Reshape the correlation series back onto a lat/lon grid.
        df_reset = corr.reset_index().droplevel(1, axis=1)
        df_pv = df_reset.pivot(index="latitude", columns="longitude")
        df_pv = df_pv.droplevel(0, axis=1)
        da = xr.DataArray(data=df_pv, name="Correlation")

        # Plot
        plt.figure()
        ax = plt.subplot(projection=ccrs.PlateCarree())
        ax.set_extent([71, 83, 30, 38])
        g = da.plot(
            x="longitude",
            y="latitude",
            add_colorbar=True,
            ax=ax,
            vmin=-1,
            vmax=1,
            cmap="coolwarm",
            cbar_kwargs={"pad": 0.10},
        )
        ax.gridlines(draw_labels=True)
        ax.set_xlabel("Longitude")
        ax.set_ylabel("Latitude")
        plt.show()
def averaged_timeseries(mask_filepath, variable="tp",
                        longname="Total precipitation [m/day]"):
    """
    Plot the basin-averaged timeseries of a variable for the Upper Indus
    Basin.

    Inputs
        mask_filepath: path to the masked data
        variable, optional: column to average and plot, string
        longname, optional: y-axis label, string

    Outputs
        None (shows a matplotlib figure)
    """
    df = dd.download_data(mask_filepath)

    # BUG FIX: copy before mutating — the original assigned into a slice of
    # `df` (chained assignment), which raises SettingWithCopyWarning and can
    # silently fail to update the column.
    df_var = df[["time", variable]].copy()
    df_var["time"] = df_var["time"].astype(np.datetime64)
    df_mean = df_var.groupby("time").mean()

    df_mean.plot()
    plt.title("Upper Indus Basin")
    plt.ylabel(longname)
    plt.xlabel("Year")
    plt.grid(True)
    plt.show()
def uib_sample_linreg():
    """ Plots sample timeseries for UIB clusters """
    # Open data
    mask_filepath = "Data/Masks/ERA5_Upper_Indus_mask.nc"
    tp = dd.download_data(mask_filepath, xarray=True)
    tp_da = tp.tp * 1000  # convert from m/day to mm/day

    ## Data: nearest-neighbour samples at one point per cluster
    sample_points = [
        {"lon": 75, "lat": 36},    # Gilgit
        {"lon": 81, "lat": 32},    # Ngari
        {"lon": 73, "lat": 34.5},  # Khyber
    ]
    timeseries = [tp_da.interp(coords=point, method="nearest")
                  for point in sample_points]
    linear_models = [lin_reg(series) for series in timeseries]

    linreg_plot(timeseries, linear_models)
def input_correlation_heatmap():
    """
    Plot a correlation heatmap of the model input variables for the Upper
    Indus Basin, including lagged copies of the climate indices.

    Relies on the module-level `mask_filepath` constant.

    Outputs
        None (shows a matplotlib figure)
    """
    df = dd.download_data(mask_filepath, all_var=True)

    # Create lagged copies of the climate indices.
    # NOTE(review): 393 rows is presumably one year's worth of
    # (grid cell, month) samples — confirm against the download shape.
    for index in ["N34", "NAO", "N4"]:
        df[index + "-1"] = df[index].shift(periods=393)

    df = df.drop(columns=["time"])
    df_clean = df.dropna()
    df_sorted = df_clean.sort_index(axis=1)
    corr = df_sorted.corr()

    sns.set(style="white")
    plt.subplots(figsize=(11, 9))
    cmap = sns.diverging_palette(220, 10, as_cmap=True)
    # BUG FIX: np.bool was removed in NumPy 1.24+; use the builtin bool.
    mask = np.triu(np.ones_like(
        corr, dtype=bool))  # generate a mask for the upper triangle
    sns.heatmap(
        corr,
        mask=mask,
        cmap=cmap,
        center=0,
        vmin=-1,
        vmax=1,
        fmt="0.2f",
        square=True,
        linewidths=0.5,
        annot=True,
        annot_kws={"size": 5},
        cbar_kws={"shrink": 0.5},
    )
    plt.title("Correlation plot for Upper Indus Basin")
    plt.show()
def areal_model(location, number=None, EDA_average=False, length=3000,
                seed=42):
    """
    Outputs test, validation and training data for total precipitation as a
    function of time, 2m dewpoint temperature, angle of sub-gridscale
    orography, orography, slope of sub-gridscale orography, total column
    water vapour and Nino 3.4 index for a given number of randomly sampled
    data points in a given basin.

    Inputs
        location: specify area to train model
        number, optional: specify desired ensemble run, integer
        EDA_average, optional: specify if you want average of low resolution
            ensemble runs, boolean
        length, optional: specify number of points to sample, integer
        seed, optional: specify seed, integer

    Outputs
        xtrain: training feature vector, numpy array
        xval: validation feature vector, numpy array
        xtest: testing feature vector, numpy array
        ytrain: training output vector, numpy array
        yval: validation output vector, numpy array
        ytest: testing output vector, numpy array
    """
    # BUG FIX: the three data sources are mutually exclusive. Originally the
    # `else` paired only with the EDA_average check, so data selected via
    # `number` was silently overwritten by the deterministic download.
    if number is not None:
        da_ensemble = dd.download_data(location, xarray=True, ensemble=True)
        da = da_ensemble.sel(number=number).drop("number")
    elif EDA_average:
        da_ensemble = dd.download_data(location, xarray=True, ensemble=True)
        da = da_ensemble.mean(dim="number")
    else:
        da = dd.download_data(location, xarray=True)

    # apply mask and randomly sample `length` points
    mask_filepath = find_mask(location)
    masked_da = dd.apply_mask(da, mask_filepath)
    multiindex_df = masked_da.to_dataframe()
    df_clean = multiindex_df.dropna().reset_index()
    df = sa.random_location_and_time_sampler(df_clean, length=length,
                                             seed=seed)

    df["time"] = df["time"] - 1970  # to years
    df["tp"] = log_transform(df["tp"])
    df = df[["time", "lat", "lon", "slor", "anor", "z",
             "d2m", "tcwv", "N34", "tp"]]  # format order

    # Remove last 10% of time for testing
    # NOTE(review): rows exactly at df["time"].max() * 0.9 fall in neither
    # split (> vs <) — confirm this is intentional.
    test_df = df[df["time"] > df["time"].max() * 0.9]
    xtest = test_df.drop(columns=["tp"]).values
    ytest = test_df["tp"].values

    # Remaining 90% split chronologically (no shuffle) into training and
    # validation data
    tr_df = df[df["time"] < df["time"].max() * 0.9]
    xtr = tr_df.drop(columns=["tp"]).values
    ytr = tr_df["tp"].values
    xtrain, xval, ytrain, yval = train_test_split(
        xtr, ytr, test_size=0.30, shuffle=False)

    return xtrain, xval, xtest, ytrain, yval, ytest
def point_model(location, number=None, EDA_average=False):
    """
    Outputs test, validation and training data for total precipitation as a
    function of time, 2m dewpoint temperature, total column water vapour and
    Nino 3.4 index for a single point.

    Inputs
        location: basin name (str, a random point is sampled) or
            [latitude, longitude], list of floats
        number, optional: specify desired ensemble run, integer
        EDA_average, optional: specify if you want average of low resolution
            ensemble runs, boolean

    Outputs
        xtrain: training feature vector, numpy array
        xval: validation feature vector, numpy array
        xtest: testing feature vector, numpy array
        ytrain: training output vector, numpy array
        yval: validation output vector, numpy array
        ytest: testing output vector, numpy array
    """
    # BUG FIX: the three data sources are mutually exclusive. Originally the
    # `else` paired only with the EDA_average check, so data selected via
    # `number` was silently overwritten by the deterministic download.
    if number is not None:
        da_ensemble = dd.download_data(location, xarray=True, ensemble=True)
        da = da_ensemble.sel(number=number).drop("number")
    elif EDA_average:
        da_ensemble = dd.download_data(location, xarray=True, ensemble=True)
        da = da_ensemble.mean(dim="number")
    else:
        da = dd.download_data(location, xarray=True)

    # BUG FIX: `location is str` compared the value against the type object
    # and was always False for a basin-name string, making the random-sample
    # branch unreachable; isinstance was intended.
    if isinstance(location, str):
        multiindex_df = da.to_dataframe()
        df_clean = multiindex_df.dropna().reset_index()
        df_location = sa.random_location_sampler(df_clean)
        df = df_location.drop(columns=["lat", "lon", "slor", "anor", "z"])
    else:
        da_location = da.interp(
            coords={"lat": location[0], "lon": location[1]},
            method="nearest")
        multiindex_df = da_location.to_dataframe()
        df_clean = multiindex_df.dropna().reset_index()
        df = df_clean.drop(columns=["lat", "lon", "slor", "anor", "z"])

    df["time"] = df["time"] - 1970  # to years
    df["tp"] = log_transform(df["tp"])
    df = df[["time", "d2m", "tcwv", "N34", "tp"]]  # format order

    # Keep first 70% of the time period for training
    train_df = df[df["time"] < df["time"].max() * 0.7]
    xtrain = train_df.drop(columns=["tp"]).values
    ytrain = train_df["tp"].values

    # Last 30% for evaluation
    eval_df = df[df["time"] > df["time"].max() * 0.7]
    x_eval = eval_df.drop(columns=["tp"]).values
    y_eval = eval_df["tp"].values

    # Split evaluation chronologically into validation and test data
    xval, xtest, yval, ytest = train_test_split(
        x_eval, y_eval, test_size=0.3333, shuffle=False)

    return xtrain, xval, xtest, ytrain, yval, ytest
def cluster_correlation_heatmap():
    """
    Plot correlation heatmaps for the three UIB cluster regimes, including
    1-6 month lags of the CGTI index.

    Outputs
        None (shows one matplotlib figure per cluster)
    """
    masks = ["Khyber_mask.nc", "Gilgit_mask.nc", "Ngari_mask.nc"]
    # NOTE(review): the title order does not match the mask order (masks[0]
    # is Khyber but names[0] is "Gilgit regime") — confirm whether the
    # offset is intentional or the labels are swapped.
    names = ["Gilgit regime", "Ngari regime", "Khyber regime"]

    for i in range(3):
        cluster_df = dd.download_data(masks[i])

        # create 1-6 month lags of the CGTI index
        for lag in range(1, 7):
            cluster_df["CGTI-" + str(lag)] = (
                cluster_df["CGTI"].shift(periods=lag))

        # BUG FIX: the original called the DataFrame
        # (`cluster_df(columns=...)`, a TypeError); drop was intended.
        df = cluster_df.drop(columns=["expver", "time"])
        df_clean = df.dropna()
        df_sorted = df_clean.sort_index(axis=1)

        # Correlation matrix
        corr = df_sorted.corr()

        # Plot
        sns.set(style="white")
        plt.subplots(figsize=(11, 9))
        cmap = sns.diverging_palette(220, 10, as_cmap=True)
        # Draw the heatmap with the mask and correct aspect ratio.
        # BUG FIX: np.bool was removed in NumPy 1.24+; use the builtin bool.
        mask = np.triu(np.ones_like(
            corr, dtype=bool))  # generate a mask for the upper triangle
        sns.heatmap(
            corr,
            mask=mask,
            cmap=cmap,
            center=0,
            vmin=-1,
            vmax=1,
            square=True,
            linewidths=0.5,
            cbar_kws={"shrink": 0.5},
        )
        plt.title(names[i] + "\n")
        plt.show()
import DataDownloader as dd import Maps as maps ## Filepaths mask_filepath = "Data/ERA5_Upper_Indus_mask.nc" dem_filepath = "Data/elev.0.25-deg.nc" ## Function inputs ### Digital Elevation Model data dem = xr.open_dataset(dem_filepath) dem_da = (dem.data).sum(dim="time") sliced_dem = dem_da.sel(lat=slice(38, 30), lon=slice(71.25, 82.75)) ### Precipitation data da = dd.download_data(mask_filepath, xarray=True) tp_da = da.tp ### Decade list decades = [1980, 1990, 2000, 2010] ### Cluster list N = np.arange(2, 11, 1) def seasonal_clusters(tp_da, sliced_dem, N, decades): """ K-means clustering of precipitation data as a function of seasons, decades and number of clusters. Returns spatial graphs, overlayed with the local topography contours. Inputs: