def tp_vs(mask_filepath, variable, mask=None, longname=""):
    """ Scatter plot of total precipitation against a given variable over the Upper Indus Basin """
    if mask is None:
        df = dd.download_data(mask_filepath)
    else:
        cds_filepath = fd.update_cds_monthly_data()
        da = dd.apply_mask(cds_filepath, mask)
        df = da.to_dataframe().reset_index()

    df = df[["time", "tp", variable]]
    # df_mean = df.groupby('time').mean()
    # gilgit = ds.interp(coords={'longitude': 74.4584, 'latitude': 35.8884}, method='nearest')
    df_var = df.dropna()

    # Plot
    df_var.plot.scatter(x=variable, y="tp", alpha=0.2, c="b")
    plt.title("Upper Indus Basin")
    plt.ylabel("Total precipitation [m/day]")
    plt.xlabel(longname)
    plt.grid(True)
    plt.show()
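# Hedged usage sketch (not from the source): assuming the DataDownloader (dd) module and
# matplotlib are imported as above, a scatter of precipitation against 2m dewpoint
# temperature could be drawn like this. The mask path matches the one used later in this
# file; "d2m" and the axis label are illustrative choices.
tp_vs("Data/ERA5_Upper_Indus_mask.nc", "d2m", longname="2m dewpoint temperature [K]")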
def areal_model_eval(location, number=None, EDA_average=False, minyear=1979, maxyear=2020):
    """
    Returns data to evaluate an areal model at a given location, area and time period.

    Variables: total precipitation as a function of time, 2m dewpoint temperature,
    angle of sub-gridscale orography, orography, slope of sub-gridscale orography,
    total column water vapour, Nino 3.4, Nino 4 and NAO index for a given point or
    masked area.

    Inputs
        location: name of a masked area (string) or [latitude, longitude] coordinates (list of floats)
        number, optional: specify desired ensemble run, integer
        EDA_average, optional: specify if you want the average of the low resolution ensemble runs, boolean
        minyear, maxyear, optional: bounds of the evaluation period, integers

    Outputs
        x_tr: evaluation feature vector, numpy array
        y_tr: evaluation output vector, numpy array
    """
    if number is not None:
        da_ensemble = dd.download_data(location, xarray=True, ensemble=True)
        da = da_ensemble.sel(number=number).drop("number")
    elif EDA_average:
        da_ensemble = dd.download_data(location, xarray=True, ensemble=True)
        da = da_ensemble.mean(dim="number")
    else:
        da = dd.download_data(location, xarray=True)

    sliced_da = da.sel(time=slice(minyear, maxyear))

    if isinstance(location, str):
        mask_filepath = find_mask(location)
        masked_da = dd.apply_mask(sliced_da, mask_filepath)
        multiindex_df = masked_da.to_dataframe()
        df = multiindex_df.dropna().reset_index()
    else:
        da_location = sliced_da.interp(
            coords={"lat": location[0], "lon": location[1]}, method="nearest")
        multiindex_df = da_location.to_dataframe()
        df = multiindex_df.dropna().reset_index()

    df["time"] = df["time"] - 1970  # to years
    df["tp"] = log_transform(df["tp"])
    df = df[["time", "lat", "lon", "slor", "anor", "z",
             "d2m", "tcwv", "N34", "tp"]]  # format order

    xtr = df.drop(columns=["tp"]).values
    ytr = df["tp"].values

    return xtr, ytr
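# Hedged usage sketch (not from the source): assuming dd.download_data and the mask files
# referenced elsewhere in this file are available, evaluation data for a single grid point
# could be pulled like this. The coordinates and year bounds are illustrative values only.
x_eval, y_eval = areal_model_eval([36.0, 75.0], minyear=1990, maxyear=2005)
print(x_eval.shape, y_eval.shape)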
def eof_correlation(eof_filepath, mask_filepath):
    """ Returns plot and DataArray of areas with p < 0.05 """
    print("processing precipitation")
    da = dd.download_data(mask_filepath, xarray=True)
    tp_ds = da.mean(dim=["latitude", "longitude"]).tp
    tp = tp_ds.assign_coords(time=(tp_ds.time.astype("datetime64")))
    tp_df = tp.to_dataframe()

    print("processing EOF")
    eof_da = xr.open_dataset(eof_filepath)
    eof_ds = eof_da.EOF
    eof = eof_ds.assign_coords(time=(eof_ds.time.astype("datetime64")))
    eof_df = eof.to_dataframe()
    eof_pv = pd.pivot_table(eof_df, values="EOF", index=["time"],
                            columns=["latitude", "longitude"])
    eof_reset = eof_pv.reset_index()
    eof_reset["time"] -= np.timedelta64(12, "h")

    print("combining")
    df_combined = pd.merge_ordered(tp_df, eof_reset, on="time")
    df_clean = df_combined.dropna()

    corr_s = df_clean.corrwith(df_clean["tp"])
    corr_df = corr_s.to_frame(name="corr")
    corr_df["pvalue"] = pvalue(df_clean)

    filepath = "Data/EOF_corr_pval.csv"
    corr_df.to_csv(filepath)

    return filepath
def StartSystem():
    """ Start each subsystem """
    HoldManager.Start()
    DataDownloader.Start()
    BalanceManager.Start()
    OrderManager.Start()
def change_maps(data_filepath, mask_filepath, variable):
    """ Maps of average annual change from 1979 to 1989, 1999, 2009 and 2019 """
    da = dd.apply_mask(data_filepath, mask_filepath)
    da_var = da[variable] * 1000  # to mm/day

    da_1979 = da_var.sel(time=slice("1979-01-16T12:00:00", "1980-01-01T12:00:00"))
    da_processed = cumulative_monthly(da_1979)
    basin_1979_sum = da_processed.sum(dim="time")

    basin_1989 = da_var.sel(time=slice("1989-01-01T12:00:00", "1990-01-01T12:00:00"))
    basin_1989_sum = cumulative_monthly(basin_1989).sum(dim="time")
    basin_1989_change = basin_1989_sum / basin_1979_sum - 1

    basin_1999 = da_var.sel(time=slice("1999-01-01T12:00:00", "2000-01-01T12:00:00"))
    basin_1999_sum = cumulative_monthly(basin_1999).sum(dim="time")
    basin_1999_change = basin_1999_sum / basin_1979_sum - 1

    basin_2009 = da_var.sel(time=slice("2009-01-01T12:00:00", "2010-01-01T12:00:00"))
    basin_2009_sum = cumulative_monthly(basin_2009).sum(dim="time")
    basin_2009_change = basin_2009_sum / basin_1979_sum - 1

    basin_2019 = da_var.sel(time=slice("2019-01-01T12:00:00", "2020-01-01T12:00:00"))
    basin_2019_sum = cumulative_monthly(basin_2019).sum(dim="time")
    basin_2019_change = basin_2019_sum / basin_1979_sum - 1

    basin_changes = xr.concat(
        [basin_1989_change, basin_1999_change, basin_2009_change, basin_2019_change],
        pd.Index(["1989", "1999", "2009", "2019"], name="year"),
    )

    g = basin_changes.plot(
        x="longitude",
        y="latitude",
        col="year",
        col_wrap=2,
        subplot_kws={"projection": ccrs.PlateCarree()},
        cbar_kwargs={
            "label": "Precipitation change",
            "format": tck.PercentFormatter(xmax=1.0),
        },
    )

    for ax in g.axes.flat:
        ax.coastlines()
        ax.gridlines()
        ax.set_extent([71, 83, 30, 38])
        ax.add_feature(cf.BORDERS)

    plt.show()
def single_location_comparison(model_filepath, lat, lon):
    """ Plots model outputs for given coordinates over time """
    era5_ds = dd.collect_ERA5()
    cmip_ds = dd.collect_CMIP5()
    cordex_ds = dd.collect_CORDEX()
    cru_ds = dd.collect_CRU()
    # aphro_ds = dd.collect_APHRO()

    era5_ts = select_coords(era5_ds, lat, lon)
    cmip_ts = select_coords(cmip_ds, lat, lon)
    cordex_ts = select_coords(cordex_ds, lat, lon)
    cru_ts = select_coords(cru_ds, lat, lon)
    # aphro_ts = select_coords(aphro_ds, lat, lon)

    timeseries = [era5_ts, cmip_ts, cordex_ts, cru_ts]  # , aphro_ts]

    xtr, y_gpr_t, y_std_t = model_prep([lat, lon], model_filepath)

    tims.benchmarking_plot(timeseries, xtr, y_gpr_t, y_std_t)
    dataset_stats(timeseries, xtr, y_gpr_t, y_std_t)
    corr.dataset_correlation(timeseries, y_gpr_t)
    pdf.benchmarking_plot(timeseries, y_gpr_t)
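# Hedged usage sketch (illustrative, not from the source): the model filepath is a
# hypothetical placeholder for whatever saved model model_prep expects; the coordinates
# correspond to the Gilgit sample point used elsewhere in this file.
single_location_comparison("Models/uib_gp_model", lat=36.0, lon=75.0)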
def basin_comparison(model_filepath, location):
    """ Plots model outputs for a given basin over time """
    era5_ds = dd.collect_ERA5()
    cmip_ds = dd.collect_CMIP5()
    cordex_ds = dd.collect_CORDEX()
    cru_ds = dd.collect_CRU()
    # aphro_ds = dd.collect_APHRO()

    era5_bs = select_basin(era5_ds, location)
    cmip_bs = select_basin(cmip_ds, location)
    cordex_bs = select_basin(cordex_ds, location)
    cru_bs = select_basin(cru_ds, location)
    # aphro_bs = select_basin(aphro_ds, location)

    basins = [era5_bs, cmip_bs, cordex_bs, cru_bs]  # , aphro_bs]

    xtr, y_gpr_t, y_std_t = model_prep(location, model_filepath)

    tims.benchmarking_plot(basins, xtr, y_gpr_t, y_std_t)
    dataset_stats(basins, xtr, y_gpr_t, y_std_t)
    corr.dataset_correlation(basins, y_gpr_t)
    pdf.benchmarking_plot(basins, y_gpr_t)
def merge_csv(data_dir, out_dir, plot=False, ver=False):
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    os.chdir(out_dir)

    files = sorted(os.listdir(data_dir))
    syear, eyear = int(files[0].split('_')[3]), int(files[-1].split('_')[3])
    smonth, emonth = int(files[0].split('_')[4][0:-4]), int(files[-1].split('_')[4][0:-4])
    name = files[0].split('_')[0:3]

    if os.path.exists(out_dir+'/'+name[0]+'_'+name[1]+'_'+name[2]+'_merged.csv'):
        return name, 'no errors'

    # Merge csv files into one pandas dataframe
    def read_append(data_dir, names, name, year, month, ver=False):
        path = data_dir+'/'+name[0]+'_'+name[1]+'_'+name[2]+'_%s_%s.csv' % (year, month)
        frame = pd.read_csv(path, header=14, parse_dates=['Date/Time'], index_col=['Date/Time'])
        names = names.append(frame)
        if ver:
            print(path)
        return names

    years = range(syear, eyear+1)
    months, smonths, emonths = range(1, 12+1), range(smonth, 12+1), range(1, emonth+1)
    names = pd.DataFrame()

    for year in years:
        if year == eyear:
            for month in emonths:
                try:
                    names = read_append(data_dir, names, name, year, month)
                except ValueError as e:
                    print(e)
                    return name, e
                except IOError as e:
                    print(e)
                    dd.downloader(data_dir, name[0], name[1], name[2], 1, month, year, verbose='on')
                    names = read_append(data_dir, names, name, year, month)
                    return name, e
def spatial_autocorr(variable, mask_filepath):  # TODO
    """ Plots spatial autocorrelation """
    df = dd.download_data(mask_filepath)

    # detrend
    table = pd.pivot_table(df, values="tp", index=["latitude", "longitude"],
                           columns=["time"])
    trans_table = table.T
    detrended_table = detrend(trans_table, axis=0)
    corr_table = detrended_table.corr()
    print(corr_table)

    corr_khyber = corr_table.loc[(34.5, 73.0)]
    corr_gilgit = corr_table.loc[(36.0, 75.0)]
    corr_ngari = corr_table.loc[(33.5, 79.0)]
    corr_list = [corr_khyber, corr_gilgit, corr_ngari]

    for corr in corr_list:
        df_reset = corr.reset_index().droplevel(1, axis=1)
        df_pv = df_reset.pivot(index="latitude", columns="longitude")
        df_pv = df_pv.droplevel(0, axis=1)
        da = xr.DataArray(data=df_pv, name="Correlation")

        # Plot
        plt.figure()
        ax = plt.subplot(projection=ccrs.PlateCarree())
        ax.set_extent([71, 83, 30, 38])
        g = da.plot(
            x="longitude",
            y="latitude",
            add_colorbar=True,
            ax=ax,
            vmin=-1,
            vmax=1,
            cmap="coolwarm",
            cbar_kwargs={"pad": 0.10},
        )
        ax.gridlines(draw_labels=True)
        ax.set_xlabel("Longitude")
        ax.set_ylabel("Latitude")

    plt.show()
def averaged_timeseries(mask_filepath, variable="tp",
                        longname="Total precipitation [m/day]"):
    """ Timeseries for the Upper Indus Basin """
    df = dd.download_data(mask_filepath)

    df_var = df[["time", variable]]
    df_var["time"] = df_var["time"].astype(np.datetime64)
    df_mean = df_var.groupby("time").mean()

    df_mean.plot()
    plt.title("Upper Indus Basin")
    plt.ylabel(longname)
    plt.xlabel("Year")
    plt.grid(True)
    plt.show()
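# Hedged usage sketch (not from the source): plot the basin-averaged precipitation
# timeseries using the mask file referenced elsewhere in this file.
averaged_timeseries("Data/ERA5_Upper_Indus_mask.nc")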
def uib_sample_linreg():
    """ Plots sample timeseries for UIB clusters """
    # Open data
    mask_filepath = "Data/Masks/ERA5_Upper_Indus_mask.nc"
    tp = dd.download_data(mask_filepath, xarray=True)
    tp_da = tp.tp * 1000  # convert from m/day to mm/day

    ## Data
    gilgit = tp_da.interp(coords={"lon": 75, "lat": 36}, method="nearest")
    ngari = tp_da.interp(coords={"lon": 81, "lat": 32}, method="nearest")
    khyber = tp_da.interp(coords={"lon": 73, "lat": 34.5}, method="nearest")
    timeseries = [gilgit, ngari, khyber]

    gilgit_linear_model = lin_reg(gilgit)
    ngari_linear_model = lin_reg(ngari)
    khyber_linear_model = lin_reg(khyber)
    linear_models = [gilgit_linear_model, ngari_linear_model, khyber_linear_model]

    linreg_plot(timeseries, linear_models)
def input_correlation_heatmap():
    df = dd.download_data(mask_filepath, all_var=True)

    # create lags
    df["N34-1"] = df["N34"].shift(periods=393)
    df["NAO-1"] = df["NAO"].shift(periods=393)
    df["N4-1"] = df["N4"].shift(periods=393)

    df = df.drop(columns=["time"])
    df_clean = df.dropna()
    df_sorted = df_clean.sort_index(axis=1)
    corr = df_sorted.corr()

    sns.set(style="white")
    plt.subplots(figsize=(11, 9))
    cmap = sns.diverging_palette(220, 10, as_cmap=True)
    mask = np.triu(np.ones_like(corr, dtype=bool))  # generate a mask for the upper triangle

    sns.heatmap(
        corr,
        mask=mask,
        cmap=cmap,
        center=0,
        vmin=-1,
        vmax=1,
        fmt="0.2f",
        square=True,
        linewidths=0.5,
        annot=True,
        annot_kws={"size": 5},
        cbar_kws={"shrink": 0.5},
    )

    plt.title("Correlation plot for Upper Indus Basin")
    plt.show()
def annual_map(data_filepath, mask_filepath, variable, year, cumulative=False):
    """ Annual map """
    da = dd.apply_mask(data_filepath, mask_filepath)
    ds_year = da.sel(time=slice(str(year) + "-01-16T12:00:00",
                                str(year + 1) + "-01-01T12:00:00"))
    ds_var = ds_year[variable] * 1000  # to mm/day

    if cumulative is True:
        ds_processed = cumulative_monthly(ds_var)
        ds_final = ds_processed.sum(dim="time")
    else:
        ds_final = ds_var.std(dim="time")  # TODO weighted mean

    print(ds_final)

    plt.figure()
    ax = plt.subplot(projection=ccrs.PlateCarree())
    ax.set_extent([71, 83, 30, 38])
    g = ds_final["tp_0001"].plot(
        cmap="magma_r",
        vmin=0.001,
        cbar_kwargs={
            "label": "Precipitation standard deviation [mm/day]",
            "extend": "neither",
            "pad": 0.10,
        })
    g.cmap.set_under("white")
    # ax.add_feature(cf.BORDERS)
    ax.coastlines()
    ax.gridlines(draw_labels=True)
    ax.set_xlabel("Longitude")
    ax.set_ylabel("Latitude")

    plt.title("Upper Indus Basin Total Precipitation " + str(year) + "\n \n")
    plt.show()
def StopSystem():
    """ Stop each subsystem """
    OrderManager.Stop()
    DataDownloader.Stop()
#print DataDownloader.downloader(wd='/home/nbrown/Desktop',stationID='1706',interval='hourly',day='14',month='7',year='2001',verbose='off')

# Test multipleDownloads
l = [['51157', 'hourly', '1', '1', '2015'], ['51157', 'hourly', '1', '2', '2015'],
     ['51157', 'hourly', '1', '3', '2015'], ['51157', 'hourly', '1', '4', '2015'],
     ['51157', 'hourly', '1', '5', '2015'], ['51157', 'hourly', '1', '6', '2015'],
     ['51157', 'hourly', '1', '7', '2015'], ['51157', 'hourly', '1', '8', '2015'],
     ['51157', 'hourly', '1', '9', '2015'], ['51157', 'hourly', '1', '10', '2015'],
     ['51157', 'hourly', '1', '11', '2015'], ['51157', 'hourly', '1', '12', '2015']]
#DataDownloader.multipleDownloads('/home/nbrown/Desktop',l)

# Test findStations
wd = '/home/nbrown/Desktop'
a = DataDownloader.findStations(DataDownloader.genStationsDict(wd), name='montreal',
                                interval='hourly', tp=['1950', '2014'], verbose='on')
b = DataDownloader.genDownloadList(a, verbose='on')

# Test: Check that downloader gives the right errors on wrong inputs or handles input conversion properly
def select_basin(dataset, location):
    """ Apply the basin mask for the given location and slice to the comparison period """
    mask_filepath = dp.find_mask(location)
    basin = dd.apply_mask(dataset, mask_filepath)
    basin = basin.sel(time=slice(1990, 2005))
    return basin
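# Hedged usage sketch (illustrative, not from the source): dd.collect_ERA5 and dp.find_mask
# are assumed available as used above; 'uib' stands in for whatever location key find_mask expects.
era5_ds = dd.collect_ERA5()
uib_basin = select_basin(era5_ds, 'uib')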
def get_coord(stationName, stationInvFileDir):
    """ Look up a station's latitude, longitude and elevation in the station inventory """
    stationInv = dd.genStationsDict(stationInvFileDir, downloadNew=False, ver=False)
    lat, lon, elev = stationInv[stationName][5], stationInv[stationName][6], stationInv[stationName][9]
    return lat, lon, elev
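# Hedged usage sketch (not from the source): the station name is an illustrative key;
# genStationsDict is assumed to return a dict keyed by station name with coordinates at
# the indices used above. The inventory directory matches one used later in this file.
lat, lon, elev = get_coord('MONTREAL', '/home/nbrown/Desktop/test')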
import numpy as np
import xarray as xr

import DataDownloader as dd
import Maps as maps

## Filepaths
mask_filepath = "Data/ERA5_Upper_Indus_mask.nc"
dem_filepath = "Data/elev.0.25-deg.nc"

## Function inputs

### Digital Elevation Model data
dem = xr.open_dataset(dem_filepath)
dem_da = (dem.data).sum(dim="time")
sliced_dem = dem_da.sel(lat=slice(38, 30), lon=slice(71.25, 82.75))

### Precipitation data
da = dd.download_data(mask_filepath, xarray=True)
tp_da = da.tp

### Decade list
decades = [1980, 1990, 2000, 2010]

### Cluster list
N = np.arange(2, 11, 1)


def seasonal_clusters(tp_da, sliced_dem, N, decades):
    """
    K-means clustering of precipitation data as a function of seasons, decades and
    number of clusters. Returns spatial graphs, overlaid with the local topography
    contours.

    Inputs:
    'DataDownloader',   # runs on multiple cpu
    'DataExtractor',    # runs on multiple cpu
    'RoleUpdater',
    'DataProcessing',   # runs on multiple cpu
    'DataShuffling',    # runs on multiple cpu
    'Learner',          # runs on gpu
    'BestPicks',
]

if __name__ == '__main__':

    if 'PlayersListing' in to_execute:
        import PlayersListing
        PlayersListing.run(m)

    if 'DataDownloader' in to_execute:
        import DataDownloader
        DataDownloader.run(m)

    if 'DataExtractor' in to_execute:
        import DataExtractor
        DataExtractor.run(m, cpu)

    if 'RoleUpdater' in to_execute:
        import RoleUpdater
        RoleUpdater.run(m)

    if 'DataProcessing' in to_execute:
        import DataProcessing
        DataProcessing.run(m, cpu)

    if 'DataShuffling' in to_execute:
        import DataShuffling
        DataShuffling.run(m, shuffling_files, keep_for_testing, cpu)

    if 'Learner' in to_execute:
        import Learner
        Learner.run(m, n, restore)
files = sorted(os.listdir('/home/nbrown/Desktop/plots'))
files[:] = [x for x in files if '_merged.csv' in x]

lats, lons, elevs = np.empty(0), np.empty(0), np.empty(0)
count = 1.
for f in files:
    f = f.split('_')[0]
    try:
        lat, lon, elev = get_coord(f, '/home/nbrown/Desktop/test')
        lats = np.append(lats, lat)
        lons = np.append(lons, lon)
        elevs = np.append(elevs, elev)
    except KeyError as e:
        print(e)
        pass
    dd.update_progress(count/len(files))
    count = count + 1.

fig1 = plt.figure(figsize=(6, 9.5))
ax = plt.subplot(111, projection=ccrs.Mollweide(central_longitude=-95))
SC = ax.scatter(lons, lats, marker='o', transform=ccrs.PlateCarree())
#cbar = plt.colorbar(CS, cmap='coolwarm', orientation='horizontal')
plt.gca().coastlines(resolution='50m')
plt.grid()
fig1.show()
def cluster_correlation_heatmap():
    masks = ["Khyber_mask.nc", "Gilgit_mask.nc", "Ngari_mask.nc"]
    names = ["Gilgit regime", "Ngari regime", "Khyber regime"]

    for i in range(3):
        cluster_df = dd.download_data(masks[i])

        # create lags
        cluster_df["CGTI-1"] = cluster_df["CGTI"].shift(periods=1)
        cluster_df["CGTI-2"] = cluster_df["CGTI"].shift(periods=2)
        cluster_df["CGTI-3"] = cluster_df["CGTI"].shift(periods=3)
        cluster_df["CGTI-4"] = cluster_df["CGTI"].shift(periods=4)
        cluster_df["CGTI-5"] = cluster_df["CGTI"].shift(periods=5)
        cluster_df["CGTI-6"] = cluster_df["CGTI"].shift(periods=6)
        """
        df_combined['N34-1'] = df_combined['N34'].shift(periods=1)
        df_combined['N34-2'] = df_combined['N34'].shift(periods=2)
        df_combined['N34-3'] = df_combined['N34'].shift(periods=3)
        df_combined['N34-4'] = df_combined['N34'].shift(periods=4)
        df_combined['N34-5'] = df_combined['N34'].shift(periods=5)
        df_combined['N34-6'] = df_combined['N34'].shift(periods=6)

        df_combined['N4-1'] = df_combined['N4'].shift(periods=1)
        df_combined['N4-2'] = df_combined['N4'].shift(periods=2)
        df_combined['N4-3'] = df_combined['N4'].shift(periods=3)
        df_combined['N4-4'] = df_combined['N4'].shift(periods=4)
        df_combined['N4-5'] = df_combined['N4'].shift(periods=5)
        df_combined['N4-6'] = df_combined['N4'].shift(periods=6)
        """
        df = cluster_df.drop(columns=["expver", "time"])
        df_clean = df.dropna()
        df_sorted = df_clean.sort_index(axis=1)

        # Correlation matrix
        corr = df_sorted.corr()

        # Plot
        sns.set(style="white")
        plt.subplots(figsize=(11, 9))
        cmap = sns.diverging_palette(220, 10, as_cmap=True)

        # Draw the heatmap with the mask and correct aspect ratio
        mask = np.triu(np.ones_like(corr, dtype=bool))  # generate a mask for the upper triangle
        sns.heatmap(
            corr,
            mask=mask,
            cmap=cmap,
            center=0,
            vmin=-1,
            vmax=1,
            square=True,
            linewidths=0.5,
            cbar_kws={"shrink": 0.5},
        )
        plt.title(names[i] + "\n")
        plt.show()
                    return name, e
                except IOError as e:
                    print(e)
                    dd.downloader(data_dir, name[0], name[1], name[2], 1, month, year, verbose='on')
                    names = read_append(data_dir, names, name, year, month)
                    return name, e
        elif year == syear:
            for month in smonths:
                try:
                    names = read_append(data_dir, names, name, year, month)
                except ValueError as e:
                    print(e)
                    return name, e
                except IOError as e:
                    print(e)
                    dd.downloader(data_dir, name[0], name[1], name[2], 1, month, year, verbose='on')
                    names = read_append(data_dir, names, name, year, month)
                    return name, e
        else:
            for month in months:
                try:
                    names = read_append(data_dir, names, name, year, month)
                except ValueError as e:
                    print(e)
                    return name, e
                except IOError as e:
                    print(e)
                    dd.downloader(data_dir, name[0], name[1], name[2], 1, month, year, verbose='on')
                    names = read_append(data_dir, names, name, year, month)
                    return name, e

    names.to_csv(out_dir+'/'+name[0]+'_'+name[1]+'_'+name[2]+'_merged.csv')
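# Hedged usage sketch for merge_csv defined earlier (not from the source): both directories
# are illustrative placeholders following the layout the function expects, i.e. filenames of
# the form <name>_<name>_<name>_<year>_<month>.csv in data_dir.
name, status = merge_csv('/home/nbrown/Desktop/data', '/home/nbrown/Desktop/plots')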
def areal_model(location, number=None, EDA_average=False, length=3000, seed=42):
    """
    Outputs test, validation and training data for total precipitation as a function
    of time, 2m dewpoint temperature, angle of sub-gridscale orography, orography,
    slope of sub-gridscale orography, total column water vapour and Nino 3.4 index,
    for a given number of randomly sampled data points in a given basin.

    Inputs
        location: specify area to train model
        number, optional: specify desired ensemble run, integer
        EDA_average, optional: specify if you want the average of the low resolution ensemble runs, boolean
        length, optional: specify number of points to sample, integer
        seed, optional: specify seed, integer

    Outputs
        x_train: training feature vector, numpy array
        y_train: training output vector, numpy array
        x_val: validation feature vector, numpy array
        y_val: validation output vector, numpy array
        x_test: testing feature vector, numpy array
        y_test: testing output vector, numpy array
    """
    if number is not None:
        da_ensemble = dd.download_data(location, xarray=True, ensemble=True)
        da = da_ensemble.sel(number=number).drop("number")
    elif EDA_average:
        da_ensemble = dd.download_data(location, xarray=True, ensemble=True)
        da = da_ensemble.mean(dim="number")
    else:
        da = dd.download_data(location, xarray=True)

    # apply mask
    mask_filepath = find_mask(location)
    masked_da = dd.apply_mask(da, mask_filepath)

    multiindex_df = masked_da.to_dataframe()
    df_clean = multiindex_df.dropna().reset_index()
    df = sa.random_location_and_time_sampler(df_clean, length=length, seed=seed)

    df["time"] = df["time"] - 1970
    df["tp"] = log_transform(df["tp"])
    df = df[["time", "lat", "lon", "slor", "anor", "z",
             "d2m", "tcwv", "N34", "tp"]]

    # Remove last 10% of time for testing
    test_df = df[df["time"] > df["time"].max() * 0.9]
    xtest = test_df.drop(columns=["tp"]).values
    ytest = test_df["tp"].values

    # Training and validation data
    tr_df = df[df["time"] < df["time"].max() * 0.9]
    xtr = tr_df.drop(columns=["tp"]).values
    ytr = tr_df["tp"].values
    xtrain, xval, ytrain, yval = train_test_split(
        xtr, ytr, test_size=0.30, shuffle=False)  # Training and validation data

    """
    # Keep first 70% of time for training
    train_df = df[df['time'] < df['time'].max()*0.7]
    xtrain = train_df.drop(columns=['tp']).values
    ytrain = train_df['tp'].values

    # Last 30% for evaluation
    eval_df = df[df['time'] > df['time'].max()*0.7]
    x_eval = eval_df.drop(columns=['tp']).values
    y_eval = eval_df['tp'].values

    # Training and validation data
    xval, xtest, yval, ytest = train_test_split(x_eval, y_eval, test_size=0.3333, shuffle=True)
    """

    return xtrain, xval, xtest, ytrain, yval, ytest
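# Hedged usage sketch (not from the source): 'uib' is an illustrative location key, assumed
# to be resolvable by find_mask and dd.download_data; length and seed mirror the defaults.
xtrain, xval, xtest, ytrain, yval, ytest = areal_model('uib', length=3000, seed=42)
print(xtrain.shape, xval.shape, xtest.shape)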
def point_model(location, number=None, EDA_average=False):
    """
    Outputs test, validation and training data for total precipitation as a function
    of time, 2m dewpoint temperature, angle of sub-gridscale orography, orography,
    slope of sub-gridscale orography, total column water vapour, Nino 3.4, Nino 4
    and NAO index for a single point.

    Inputs
        location: name of a masked area (string) or [latitude, longitude] coordinates (list of floats)
        number, optional: specify desired ensemble run, integer
        EDA_average, optional: specify if you want the average of the low resolution ensemble runs, boolean

    Outputs
        x_train: training feature vector, numpy array
        y_train: training output vector, numpy array
        x_val: validation feature vector, numpy array
        y_val: validation output vector, numpy array
        x_test: testing feature vector, numpy array
        y_test: testing output vector, numpy array
    """
    if number is not None:
        da_ensemble = dd.download_data(location, xarray=True, ensemble=True)
        da = da_ensemble.sel(number=number).drop("number")
    elif EDA_average:
        da_ensemble = dd.download_data(location, xarray=True, ensemble=True)
        da = da_ensemble.mean(dim="number")
    else:
        da = dd.download_data(location, xarray=True)

    if isinstance(location, str):
        multiindex_df = da.to_dataframe()
        df_clean = multiindex_df.dropna().reset_index()
        df_location = sa.random_location_sampler(df_clean)
        df = df_location.drop(columns=["lat", "lon", "slor", "anor", "z"])
    else:
        da_location = da.interp(coords={"lat": location[0], "lon": location[1]},
                                method="nearest")
        multiindex_df = da_location.to_dataframe()
        df_clean = multiindex_df.dropna().reset_index()
        df = df_clean.drop(columns=["lat", "lon", "slor", "anor", "z"])

    df["time"] = df["time"] - 1970  # to years
    df["tp"] = log_transform(df["tp"])
    df = df[["time", "d2m", "tcwv", "N34", "tp"]]  # format order

    # Keep first 70% of time for training
    train_df = df[df["time"] < df["time"].max() * 0.7]
    xtrain = train_df.drop(columns=["tp"]).values
    ytrain = train_df["tp"].values

    # Last 30% for evaluation
    eval_df = df[df["time"] > df["time"].max() * 0.7]
    x_eval = eval_df.drop(columns=["tp"]).values
    y_eval = eval_df["tp"].values

    # Validation and test data
    xval, xtest, yval, ytest = train_test_split(x_eval, y_eval, test_size=0.3333, shuffle=False)

    return xtrain, xval, xtest, ytrain, yval, ytest
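# Hedged usage sketch (not from the source): build a single-point data split at the Khyber
# sample coordinates used elsewhere in this file; purely illustrative.
xtrain, xval, xtest, ytrain, yval, ytest = point_model([34.5, 73.0])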
declineProbe = None
riseProbe = None

if __name__ == '__main__':
    InitSystem()
    StartSystem()
    logStr = "All System Started!"
    Log.Print(logStr)
    Log.Info(Const.logFile, logStr)

    while True:
        if Terminated():
            break

        if DataDownloader.DataValid() and len(DataDownloader.realTimeBids) > 0:
            currBidPrice = DataDownloader.realTimeBids[-1]
            TryToSell(currBidPrice)

            currAskPrice = DataDownloader.realTimeAsks[-1]
            if declineProbe is None and riseProbe is None:
                declineProbe = SetProbe(currAskPrice, 0)
                riseProbe = SetProbe(currAskPrice, 1)
            else:
                if len(DataDownloader.realTimeAsks) < 60:
                    time.sleep(0.5)
                    continue

                meanPrice = np.mean(DataDownloader.realTimeAsks[-10:])
                if declineProbe.Triggered(meanPrice):
                    declineProbe = SetProbe(currAskPrice, 0, declineProbe)