def shuffle_split_rf(latlon_dict, Cvar_dict, shapefile, file_path_elev,
                     elev_array, idx_list, rep, res=10000):
    '''Shuffle-split cross-validation with 50/50 training test split

    For every replication the usable stations are randomly split in two, a
    random forest (features: projected x, projected y, elevation) is fitted
    to the training half and predicted over the study-area grid, and the
    absolute error is taken at the grid cell of each withheld station.

    Parameters
    ----------
    latlon_dict : dictionary
        the latitude and longitudes of the daily/hourly stations
    Cvar_dict : dictionary
        dictionary of weather variable values for each station
    shapefile : string
        path to the study area shapefile
    file_path_elev : string
        path to the elevation lookup file
    elev_array : ndarray
        not read by this function (the elevation grid is rebuilt from the
        lookup file); kept so existing callers keep working
    idx_list : int
        position of the elevation column in the lookup file
    rep : int
        number of replications
    res : int
        grid cell size in the units of the projected coordinate system
        ('esri:102001'), default 10000

    Returns
    ----------
    float
        - MAE estimate for entire surface (average of replications)

    Raises
    ----------
    ZeroDivisionError
        if a replication has no withheld station on the grid, or rep is 0
    '''
    proj = pyproj.Proj('esri:102001')

    # We can't just use Cvar_dict.keys() because some stations do not have
    # valid lat/lon
    stations_input = [code for code in Cvar_dict if code in latlon_dict]
    stations = np.array(stations_input)
    sorted_stations = sorted(stations_input)

    # Projected (x, y) of every usable station; used to find its grid cell
    projected_lat_lon = {}
    for station_name in stations_input:
        loc = latlon_dict[station_name]
        easting, northing = proj(loc[1], loc[0])
        projected_lat_lon[station_name] = [float(easting), float(northing)]

    # total_bounds yields plain scalars (minx, miny, maxx, maxy), avoiding
    # the deprecated int()/float() conversion of single-element Series
    xmin, ymin, xmax, ymax = gpd.read_file(shapefile).total_bounds
    num_col = int((xmax - xmin) / res) + 1
    num_row = int((ymax - ymin) / res) + 1

    replicate_errors = []
    for _ in range(rep):
        # Split the stations in two
        # Won't be exactly 50/50 if uneven num stations
        splits = ShuffleSplit(n_splits=1, train_size=.5)
        train_index, test_index = next(splits.split(stations))
        train_stations = stations[train_index]
        test_stations = stations[test_index]
        withheld = set(test_stations.tolist())

        # They can't overlap
        if withheld.intersection(train_stations.tolist()):
            print('Error, the train and test sets overlap!')
            sys.exit()

        train_names = [name for name in sorted_stations
                       if name not in withheld]
        lat = np.array([float(latlon_dict[name][0]) for name in train_names])
        lon = np.array([float(latlon_dict[name][1]) for name in train_names])
        z = np.array([Cvar_dict[name] for name in train_names])

        # We need to project to a projected system before building the grid
        xProj, yProj = proj(lon, lat)

        # NOTE(review): the grid spans the shapefile bounds *and* the
        # training stations, while the pixel index below is measured from
        # the shapefile bounds only; they agree only when every training
        # station lies inside the bounds - confirm this is intended.
        yProj_extent = np.append(yProj, [ymax, ymin])
        xProj_extent = np.append(xProj, [xmax, xmin])
        Yi = np.linspace(np.min(yProj_extent), np.max(yProj_extent), num_row)
        Xi = np.linspace(np.min(xProj_extent), np.max(xProj_extent), num_col)
        Xi, Yi = np.meshgrid(Xi, Yi)

        # The elevation function takes a list of (x, y) tuples
        grid_points = [
            tuple(pt)
            for pt in np.array((Xi.flatten(), Yi.flatten())).T.tolist()
        ]

        # Get the elevations from the lookup file; the prediction features
        # follow the order of the keys the lookup returns
        elev_grd_dict = GD.finding_data_frm_lookup(grid_points,
                                                   file_path_elev, idx_list)
        grid_x = []
        grid_y = []
        grid_elev = []
        for key in elev_grd_dict.keys():
            grid_x.append(key[0])
            grid_y.append(key[1])
            grid_elev.append(elev_grd_dict[key])

        # Repeat process for just the stations not the whole grid
        elev_dict = GD.finding_data_frm_lookup(zip(xProj, yProj),
                                               file_path_elev, idx_list)
        source_elev = np.array([elev_dict[key] for key in zip(xProj, yProj)])

        df_trainX = pd.DataFrame({
            'xProj': xProj,
            'yProj': yProj,
            'elevS': source_elev,
            'var': z
        })
        df_testX = pd.DataFrame({
            'Xi': np.array(grid_x),
            'Yi': np.array(grid_y),
            'elev': np.array(grid_elev)
        })

        reg = RandomForestRegressor(n_estimators=100,
                                    max_features='sqrt',
                                    random_state=1)
        X_train = np.array(df_trainX[['xProj', 'yProj', 'elevS']])
        X_test = np.array(df_testX[['Xi', 'Yi', 'elev']])
        # 1-D target: same fit as a column vector, without sklearn's
        # DataConversionWarning on every replication
        reg.fit(X_train, np.array(df_trainX['var']))
        rf_grid = reg.predict(X_test).reshape(num_row, num_col)

        # Calc the absolute error at the pixel of each withheld station
        absolute_error_dictionary = {}
        for statLoc in test_stations:
            coord_pair = projected_lat_lon[statLoc]
            x_orig = int((coord_pair[0] - xmin) / res)
            y_orig = int((coord_pair[1] - ymin) / res)
            # Skip stations that fall off the grid. A negative index must be
            # rejected explicitly: numpy would wrap it around to the
            # opposite edge instead of raising IndexError.
            if not (0 <= x_orig < num_col and 0 <= y_orig < num_row):
                continue
            interpolated_val = rf_grid[y_orig][x_orig]
            original_val = Cvar_dict[statLoc]
            absolute_error_dictionary[statLoc] = abs(interpolated_val -
                                                     original_val)

        # average of all the withheld stations
        replicate_errors.append(
            sum(absolute_error_dictionary.values()) /
            len(absolute_error_dictionary))

    # average of all the replications
    overall_error = sum(replicate_errors) / rep
    return overall_error
def spatial_kfold_rf(idw_example_grid, loc_dict, Cvar_dict, shapefile,
                     file_path_elev, elev_array, idx_list,
                     block_num, blocking_type, return_error):
    '''Spatially blocked k-fold cross-validation procedure for RF

    The stations are grouped into clusters or blocks; each group in turn is
    withheld, a random forest (features: projected x, projected y,
    elevation) is fitted to the remaining stations and predicted over the
    study-area grid, and the absolute error is taken at the grid cell of
    each withheld station.

    Parameters
    ----------
    idw_example_grid : ndarray
        used for reference of study area grid size
    loc_dict : dictionary
        the latitude and longitudes of the daily/hourly stations
    Cvar_dict : dictionary
        dictionary of weather variable values for each station
    shapefile : string
        path to the study area shapefile
    file_path_elev : string
        path to the elevation lookup file
    elev_array : ndarray
        not read by this function (the elevation grid is rebuilt from the
        lookup file); kept so existing callers keep working
    idx_list : int
        position of the elevation column in the lookup file
    block_num : int
        number of blocks/clusters
    blocking_type : string
        whether to use clusters ('cluster') or blocks ('block')
    return_error : bool
        whether or not to return the error dictionary

    Returns
    ----------
    int
        - Return the block number just so we can later write it into the
        file to keep track
    float
        - MAE estimate for entire surface
    dictionary
        - if return_error = True, a dictionary of the absolute error at each
        station when its fold was left out

    Raises
    ----------
    IndexError
        if a withheld station falls outside the interpolation grid
    '''
    # Selecting blocknum
    if blocking_type == 'cluster':
        cluster = c3d.spatial_cluster(loc_dict, Cvar_dict, shapefile,
                                      block_num, file_path_elev, idx_list,
                                      False, False, False)
    elif blocking_type == 'block':
        # Get the numpy array that delineates the blocks
        np_array_blocks = mbk.make_block(idw_example_grid, block_num)
        # Now get the dictionary
        cluster = mbk.sorting_stations(np_array_blocks, shapefile, loc_dict,
                                       Cvar_dict)
    else:
        print('That is not a valid blocking method')
        sys.exit()

    proj = pyproj.Proj('esri:102001')

    # Only stations that have both a value and a location are usable
    valid_stations = [name for name in Cvar_dict if name in loc_dict]
    sorted_stations = sorted(valid_stations)

    # Projected (x, y) of every usable station; used to find its grid cell
    projected_lat_lon = {}
    for station_name in valid_stations:
        loc = loc_dict[station_name]
        easting, northing = proj(loc[1], loc[0])
        projected_lat_lon[station_name] = [float(easting), float(northing)]

    # total_bounds yields plain scalars (minx, miny, maxx, maxy), avoiding
    # the deprecated int()/float() conversion of single-element Series
    xmin, ymin, xmax, ymax = gpd.read_file(shapefile).total_bounds
    pixelHeight = 10000
    pixelWidth = 10000
    # NOTE(review): shuffle_split_rf sizes its grid with int(...) + 1 cells
    # per axis; here it is int(...) - confirm which one is intended.
    num_col = int((xmax - xmin) / pixelHeight)
    num_row = int((ymax - ymin) / pixelWidth)

    groups_complete = []  # keep a record of the folds already evaluated
    absolute_error_dictionary = {}

    for group in cluster.values():
        if group in groups_complete:
            continue
        groups_complete.append(group)
        station_list = [k for k, v in cluster.items() if v == group]
        held_out = set(station_list)

        train_names = [name for name in sorted_stations
                       if name not in held_out]
        lat = np.array([float(loc_dict[name][0]) for name in train_names])
        lon = np.array([float(loc_dict[name][1]) for name in train_names])
        z = np.array([Cvar_dict[name] for name in train_names])

        # We need to project to a projected system before building the grid
        xProj, yProj = proj(lon, lat)

        yProj_extent = np.append(yProj, [ymax, ymin])
        xProj_extent = np.append(xProj, [xmax, xmin])
        Yi = np.linspace(np.min(yProj_extent), np.max(yProj_extent), num_row)
        Xi = np.linspace(np.min(xProj_extent), np.max(xProj_extent), num_col)
        Xi, Yi = np.meshgrid(Xi, Yi)

        # The elevation function takes a list of (x, y) tuples
        grid_points = [
            tuple(pt)
            for pt in np.array((Xi.flatten(), Yi.flatten())).T.tolist()
        ]

        # Get the elevations from the lookup file; the prediction features
        # follow the order of the keys the lookup returns
        elev_grd_dict = GD.finding_data_frm_lookup(grid_points,
                                                   file_path_elev, idx_list)
        grid_x = []
        grid_y = []
        grid_elev = []
        for key in elev_grd_dict.keys():
            grid_x.append(key[0])
            grid_y.append(key[1])
            grid_elev.append(elev_grd_dict[key])

        # Repeat process for just the stations not the whole grid
        elev_dict = GD.finding_data_frm_lookup(zip(xProj, yProj),
                                               file_path_elev, idx_list)
        source_elev = np.array([elev_dict[key] for key in zip(xProj, yProj)])

        df_trainX = pd.DataFrame({
            'xProj': xProj,
            'yProj': yProj,
            'elevS': source_elev,
            'var': z
        })
        df_testX = pd.DataFrame({
            'Xi': np.array(grid_x),
            'Yi': np.array(grid_y),
            'elev': np.array(grid_elev)
        })

        reg = RandomForestRegressor(n_estimators=100,
                                    max_features='sqrt',
                                    random_state=1)
        X_train = np.array(df_trainX[['xProj', 'yProj', 'elevS']])
        X_test = np.array(df_testX[['Xi', 'Yi', 'elev']])
        # 1-D target: same fit as a column vector, without sklearn's
        # DataConversionWarning on every fold
        reg.fit(X_train, np.array(df_trainX['var']))
        rf_grid = reg.predict(X_test).reshape(num_row, num_col)

        # Calc the absolute error at the pixel of each withheld station
        for statLoc in station_list:
            coord_pair = projected_lat_lon[statLoc]
            x_orig = int((coord_pair[0] - xmin) / pixelHeight)  # lon
            y_orig = int((coord_pair[1] - ymin) / pixelWidth)  # lat
            # numpy wraps negative indices around to the opposite edge, so
            # fail loudly the same way an index past the far edge does
            if x_orig < 0 or y_orig < 0:
                raise IndexError(
                    'Station %s lies outside the interpolation grid' %
                    statLoc)
            interpolated_val = rf_grid[y_orig][x_orig]
            original_val = Cvar_dict[statLoc]
            absolute_error_dictionary[statLoc] = abs(interpolated_val -
                                                     original_val)

    # average of all the withheld stations
    MAE = sum(absolute_error_dictionary.values()) / \
        len(absolute_error_dictionary)
    if return_error:
        return block_num, MAE, absolute_error_dictionary
    else:
        return block_num, MAE
def cross_validate_rf(latlon_dict, Cvar_dict, shapefile, file_path_elev,
                      elev_array, idx_list, pass_to_plot):
    '''Leave-one-out cross-validation procedure for RF

    Each usable station in turn is withheld, a random forest (features:
    projected x, projected y, elevation) is fitted to the remaining stations
    and predicted over the study-area grid, and the error is taken at the
    grid cell of the withheld station.

    Parameters
    ----------
    latlon_dict : dictionary
        the latitude and longitudes of the stations
    Cvar_dict : dictionary
        dictionary of weather variable values for each station
    shapefile : string
        path to the study area shapefile, including its name
    file_path_elev : string
        path to the elevation lookup file
    elev_array : ndarray
        not read by this function (the elevation grid is rebuilt from the
        lookup file); kept so existing callers keep working
    idx_list : int
        position of the elevation column in the lookup file
    pass_to_plot : bool
        whether you will be plotting the error and need a version without
        absolute value error (i.e. fire season days)

    Returns
    ----------
    dictionary
        - a dictionary of the absolute error at each station when it was
        left out
    dictionary
        - if pass_to_plot = True, returns a dictionary without the absolute
        value of the error, for example for plotting fire season error

    Raises
    ----------
    IndexError
        if a withheld station falls outside the interpolation grid
    '''
    proj = pyproj.Proj('esri:102001')

    # Only stations that have both a value and a location are usable
    station_name_list = [name for name in Cvar_dict if name in latlon_dict]
    sorted_stations = sorted(station_name_list)

    # Projected (x, y) of every usable station; used to find its grid cell
    projected_lat_lon = {}
    for station_name in station_name_list:
        loc = latlon_dict[station_name]
        easting, northing = proj(loc[1], loc[0])
        projected_lat_lon[station_name] = [float(easting), float(northing)]

    # total_bounds yields plain scalars (minx, miny, maxx, maxy), avoiding
    # the deprecated int()/float() conversion of single-element Series
    xmin, ymin, xmax, ymax = gpd.read_file(shapefile).total_bounds
    pixelHeight = 10000
    pixelWidth = 10000
    # NOTE(review): shuffle_split_rf sizes its grid with int(...) + 1 cells
    # per axis; here it is int(...) - confirm which one is intended.
    num_col = int((xmax - xmin) / pixelHeight)
    num_row = int((ymax - ymin) / pixelWidth)

    absolute_error_dictionary = {}  # for plotting
    no_absolute_value_dict = {}  # to see whether under or over estimation

    for station_name_hold_back in station_name_list:
        train_names = [name for name in sorted_stations
                       if name != station_name_hold_back]
        lat = np.array([float(latlon_dict[name][0]) for name in train_names])
        lon = np.array([float(latlon_dict[name][1]) for name in train_names])
        z = np.array([Cvar_dict[name] for name in train_names])

        # We need to project to a projected system before building the grid
        xProj, yProj = proj(lon, lat)

        yProj_extent = np.append(yProj, [ymax, ymin])
        xProj_extent = np.append(xProj, [xmax, xmin])
        Yi = np.linspace(np.min(yProj_extent), np.max(yProj_extent), num_row)
        Xi = np.linspace(np.min(xProj_extent), np.max(xProj_extent), num_col)
        Xi, Yi = np.meshgrid(Xi, Yi)

        # The elevation function takes a list of (x, y) tuples
        grid_points = [
            tuple(pt)
            for pt in np.array((Xi.flatten(), Yi.flatten())).T.tolist()
        ]

        # Get the elevations from the lookup file; the prediction features
        # follow the order of the keys the lookup returns
        elev_grd_dict = GD.finding_data_frm_lookup(grid_points,
                                                   file_path_elev, idx_list)
        grid_x = []
        grid_y = []
        grid_elev = []
        for key in elev_grd_dict.keys():
            grid_x.append(key[0])
            grid_y.append(key[1])
            grid_elev.append(elev_grd_dict[key])

        # Repeat process for just the stations not the whole grid
        elev_dict = GD.finding_data_frm_lookup(zip(xProj, yProj),
                                               file_path_elev, idx_list)
        source_elev = np.array([elev_dict[key] for key in zip(xProj, yProj)])

        df_trainX = pd.DataFrame({
            'xProj': xProj,
            'yProj': yProj,
            'elevS': source_elev,
            'var': z
        })
        df_testX = pd.DataFrame({
            'Xi': np.array(grid_x),
            'Yi': np.array(grid_y),
            'elev': np.array(grid_elev)
        })

        reg = RandomForestRegressor(n_estimators=100,
                                    max_features='sqrt',
                                    random_state=1)
        X_train = np.array(df_trainX[['xProj', 'yProj', 'elevS']])
        X_test = np.array(df_testX[['Xi', 'Yi', 'elev']])
        # 1-D target: same fit as a column vector, without sklearn's
        # DataConversionWarning for every held-out station
        reg.fit(X_train, np.array(df_trainX['var']))
        rf_grid = reg.predict(X_test).reshape(num_row, num_col)

        # Calc the error at the pixel of the withheld station
        coord_pair = projected_lat_lon[station_name_hold_back]
        x_orig = int((coord_pair[0] - xmin) / pixelHeight)  # lon
        y_orig = int((coord_pair[1] - ymin) / pixelWidth)  # lat
        # numpy wraps negative indices around to the opposite edge, so fail
        # loudly the same way an index past the far edge does
        if x_orig < 0 or y_orig < 0:
            raise IndexError(
                'Station %s lies outside the interpolation grid' %
                station_name_hold_back)

        interpolated_val = rf_grid[y_orig][x_orig]
        original_val = Cvar_dict[station_name_hold_back]
        signed_error = interpolated_val - original_val
        absolute_error_dictionary[station_name_hold_back] = abs(signed_error)
        no_absolute_value_dict[station_name_hold_back] = signed_error

    if pass_to_plot:
        return absolute_error_dictionary, no_absolute_value_dict
    else:
        return absolute_error_dictionary
def random_forest_interpolator(latlon_dict, Cvar_dict, input_date, var_name,
                               shapefile, show, file_path_elev, idx_list,
                               expand_area, res=10000):
    '''Random forest interpolation

    Fits a random forest (features: projected x, projected y, elevation) to
    the stations that fall inside the study area and predicts it over a
    regular grid covering that area.

    Parameters
    ----------
    latlon_dict : dictionary
        the latitude and longitudes of the stations
    Cvar_dict : dictionary
        dictionary of weather variable values for each station
    input_date : string
        the date you want to interpolate for
    var_name : string
        the name of the variable you are interpolating
    shapefile : string
        path to the study area shapefile, including its name
    show : bool
        whether you want to plot a map
    file_path_elev : string
        path to the elevation lookup file
    idx_list : int
        position of the elevation column in the lookup file
    expand_area : bool
        function will expand the study area so that more stations are taken
        into account (200 km)
    res : int
        grid cell size in the units of the projected coordinate system
        ('esri:102001'), default 10000

    Returns
    ----------
    ndarray
        - the array of values for the interpolated surface
    list
        - the bounds of the array surface, for use in other functions, as
        [min y, max y, max x, min x]
    '''
    na_map = gpd.read_file(shapefile)
    # total_bounds yields plain scalars (minx, miny, maxx, maxy), avoiding
    # label-indexing and int()/float() conversion of single-element Series
    xmin, ymin, xmax, ymax = na_map.total_bounds
    if expand_area:
        xmax = xmax + 200000
        xmin = xmin - 200000
        ymax = ymax + 200000
        ymin = ymin - 200000

    proj = pyproj.Proj('esri:102001')

    lat = []
    lon = []
    Cvar = []
    for station_name in Cvar_dict:
        if station_name not in latlon_dict:
            continue
        loc = latlon_dict[station_name]
        latitude = loc[0]
        longitude = loc[1]
        # Filter out stations outside of grid
        proj_x, proj_y = proj(longitude, latitude)
        if ymin <= proj_y <= ymax and xmin <= proj_x <= xmax:
            lat.append(float(latitude))
            lon.append(float(longitude))
            Cvar.append(Cvar_dict[station_name])

    y = np.array(lat)
    x = np.array(lon)
    z = np.array(Cvar)

    num_col = int((xmax - xmin) / res)
    num_row = int((ymax - ymin) / res)

    # We need to project to a projected system before building the grid
    xProj, yProj = proj(x, y)

    yProj_extent = np.append(yProj, [ymax, ymin])
    xProj_extent = np.append(xProj, [xmax, xmin])

    Yi = np.linspace(np.min(yProj_extent), np.max(yProj_extent), num_row + 1)
    Xi = np.linspace(np.min(xProj_extent), np.max(xProj_extent), num_col + 1)
    Xi, Yi = np.meshgrid(Xi, Yi)

    maxmin = [
        np.min(yProj_extent),
        np.max(yProj_extent),
        np.max(xProj_extent),
        np.min(xProj_extent)
    ]

    # The elevation function takes a list of (x, y) tuples
    grid_points = [
        tuple(pt) for pt in np.array((Xi.flatten(), Yi.flatten())).T.tolist()
    ]

    # Get the elevations from the lookup file; the prediction features
    # follow the order of the keys the lookup returns
    elev_grd_dict = GD.finding_data_frm_lookup(grid_points, file_path_elev,
                                               idx_list)
    grid_x = []
    grid_y = []
    grid_elev = []
    for key in elev_grd_dict.keys():
        grid_x.append(key[0])
        grid_y.append(key[1])
        grid_elev.append(elev_grd_dict[key])

    # Repeat process for just the stations not the whole grid
    elev_dict = GD.finding_data_frm_lookup(zip(xProj, yProj), file_path_elev,
                                           idx_list)
    source_elev = np.array([elev_dict[key] for key in zip(xProj, yProj)])

    df_trainX = pd.DataFrame({
        'xProj': xProj,
        'yProj': yProj,
        'elevS': source_elev,
        'var': z
    })
    df_testX = pd.DataFrame({
        'Xi': np.array(grid_x),
        'Yi': np.array(grid_y),
        'elev': np.array(grid_elev)
    })

    reg = RandomForestRegressor(n_estimators=100,
                                max_features='sqrt',
                                random_state=1)
    X_train = np.array(df_trainX[['xProj', 'yProj', 'elevS']])
    X_test = np.array(df_testX[['Xi', 'Yi', 'elev']])
    # 1-D target: same fit as a column vector, without sklearn's
    # DataConversionWarning
    reg.fit(X_train, np.array(df_trainX['var']))
    rf_grid = reg.predict(X_test).reshape(num_row + 1, num_col + 1)

    if show:
        fig, ax = plt.subplots(figsize=(15, 15))

        plt.imshow(rf_grid,
                   extent=(xProj_extent.min() - 1, xProj_extent.max() + 1,
                           yProj_extent.max() - 1, yProj_extent.min() + 1))
        # The study-area outline was already read above; no need to re-read
        na_map.plot(ax=ax,
                    color='white',
                    edgecolor='k',
                    linewidth=2,
                    zorder=10,
                    alpha=0.1)

        plt.scatter(xProj, yProj, c=z, edgecolors='k')

        plt.gca().invert_yaxis()
        cbar = plt.colorbar()
        cbar.set_label(var_name)

        title = 'RF Interpolation for %s on %s' % (var_name, input_date)
        fig.suptitle(title, fontsize=14)
        plt.xlabel('Longitude')
        plt.ylabel('Latitude')

        plt.show()

    return rf_grid, maxmin
def spatial_cluster(loc_dict, Cvar_dict, shapefile, cluster_num,
                    file_path_elev, idx_list, plot_2D, plot_3D, return_all):
    '''Spatial clustering based on scikit learn's agglomerative clustering

    Stations are clustered on (projected x, projected y, elevation) with
    Ward linkage, constrained by a 10-nearest-neighbour connectivity graph.

    Parameters
    ----------
    loc_dict : dictionary
        the latitude and longitudes of the daily/hourly stations
    Cvar_dict : dictionary
        dictionary of weather variable values for each station
    shapefile : string
        path to the study area shapefile
    cluster_num : int
        number of clusters
    file_path_elev : string
        path to the elevation lookup file
    idx_list : int
        position of the elevation column in the lookup file
    plot_2D : bool
        whether to plot maps of the clusters in 2d
    plot_3D : bool
        whether to plot maps of the clusters in 3d
    return_all : bool
        whether or not to return all the outputs (needed for selecting
        cluster size)

    Returns
    ----------
    dictionary
        - a dictionary of cluster that each station is in (cluster numbers
        start at 1)
    If return_all = True, a tuple (label, Xelev, station_class) is returned
    instead: the raw label array, the clustered (x, y, elevation) array and
    the dictionary above.
    '''
    proj = pyproj.Proj('esri:102001')

    # Projected coordinates to send to the elevation lookup, plus an index
    # from projected (lat, lon) back to the station(s) located there
    send_to_tuple = []
    stations_by_coord = {}
    for station in Cvar_dict:
        if station not in loc_dict:
            continue
        coord = loc_dict[station]
        Plon1, Plat1 = proj(coord[1], coord[0])  # longitude,lat
        Plat = float(Plat1)
        Plon = float(Plon1)
        send_to_tuple.append((Plon, Plat))
        stations_by_coord.setdefault((Plat, Plon), []).append(station)

    # Get the elevations from the lookup file
    elev_grd_dict = GD.finding_data_frm_lookup(send_to_tuple, file_path_elev,
                                               idx_list)

    # One row per looked-up point: (lon, lat, elev); the keys are each
    # lon lat pair
    Xelev = np.array([[keys[0], keys[1], elev_grd_dict[keys]]
                      for keys in elev_grd_dict.keys()])

    # This is where we make the connectivity graph based on elevation
    connectivity = kneighbors_graph(Xelev, 10, include_self=False)
    n_clusters = cluster_num
    linkage = 'ward'

    model = AgglomerativeClustering(linkage=linkage,
                                    connectivity=connectivity,
                                    n_clusters=n_clusters)
    model.fit(Xelev)  # fit with lat lon elev
    label = model.labels_

    if plot_3D:
        fig = plt.figure()
        ax = p3.Axes3D(fig)
        ax.view_init(7, -80)
        for lab in np.unique(label):
            ax.scatter(Xelev[label == lab, 0],
                       Xelev[label == lab, 1],
                       Xelev[label == lab, 2],
                       color=plt.cm.jet(float(lab) / np.max(label + 1)),
                       s=20,
                       edgecolor='k')
        plt.title('With connectivity constraints, Elevation inc.')
        ax.set_xlabel('Longitude')
        ax.set_ylabel('Latitude')
        ax.set_zlabel('Elevation (m)')

        plt.show()

    # This is where we make the connectivity graph where we can see on the map
    if plot_2D:
        fig, ax = plt.subplots(figsize=(15, 15))
        na_map = gpd.read_file(shapefile)

        na_map.plot(ax=ax, color='white', edgecolor='k', linewidth=1, alpha=1)

        plt.scatter(Xelev[:, 0],
                    Xelev[:, 1],
                    c=model.labels_,
                    cmap=plt.cm.tab20b,
                    s=20,
                    edgecolor='k')

        ax.tick_params(axis='both',
                       which='both',
                       bottom=False,
                       top=False,
                       labelbottom=False,
                       right=False,
                       left=False,
                       labelleft=False)
        ax.ticklabel_format(useOffset=False, style='plain')

        plt.show()

    # Make a dictionary with each class. Every station sharing a clustered
    # coordinate gets that point's label, however many stations share it.
    station_class = {}
    for row, station_label in zip(Xelev, label):
        matches = stations_by_coord.get((row[1], row[0]), [])
        if not matches:
            # The lookup returned a coordinate no station projects to
            print('No station matches clustered point (%s, %s)' %
                  (row[0], row[1]))
            continue
        for station in matches:
            # We add 1, because for the random selection the groups start
            # at 1
            station_class[station] = station_label + 1

    if return_all:
        return label, Xelev, station_class
    else:
        return station_class
def spatial_groups_rf(idw_example_grid, loc_dict, Cvar_dict, shapefile,
                      blocknum, nfolds, replacement, dictionary_Groups,
                      file_path_elev, idx_list, expand_area):
    '''Stratified shuffle-split cross-validation procedure

    For every fold a set of stations is drawn with
    Eval.select_random_station and withheld, a random forest (features:
    projected x, projected y, elevation) is fitted to the remaining in-grid
    stations and predicted over the study-area grid, and the absolute error
    is taken at the grid cell of each withheld station.

    Parameters
    ----------
    idw_example_grid : ndarray
        used for reference of study area grid size (not read by this
        function)
    loc_dict : dictionary
        the latitude and longitudes of the daily/hourly stations
    Cvar_dict : dictionary
        dictionary of weather variable values for each station
    shapefile : string
        path to the study area shapefile
    blocknum : int
        number of blocks/clusters
    nfolds : int
        number of folds to create (essentially repetitions)
    replacement : bool
        whether or not to use replacement between folds, should usually be
        true
    dictionary_Groups : dictionary
        dictionary of what groups (clusters) the stations belong to
    file_path_elev : string
        path to the elevation lookup file
    idx_list : int
        position of the elevation column in the lookup file
    expand_area : bool
        function will expand the study area so that more stations are taken
        into account (200 km)

    Returns
    ----------
    float
        - MAE estimate for entire surface (average of the folds)

    Raises
    ----------
    KeyError
        if a withheld station lies outside the (possibly expanded) grid
    '''
    # If not using replacement, keep a record of what we have done
    station_list_used = []

    # total_bounds yields plain scalars (minx, miny, maxx, maxy), avoiding
    # label-indexing and int()/float() conversion of single-element Series
    xmin, ymin, xmax, ymax = gpd.read_file(shapefile).total_bounds
    if expand_area:
        xmax = xmax + 200000
        xmin = xmin - 200000
        ymax = ymax + 200000
        ymin = ymin - 200000

    proj = pyproj.Proj('esri:102001')

    # Projected (x, y) of every usable station inside the grid; stations
    # outside of the grid are filtered out here once for all folds
    projected_lat_lon = {}
    for station_name in Cvar_dict:
        if station_name not in loc_dict:
            continue
        loc = loc_dict[station_name]
        easting, northing = proj(loc[1], loc[0])
        if ymin <= northing <= ymax and xmin <= easting <= xmax:
            projected_lat_lon[station_name] = [float(easting),
                                               float(northing)]
    sorted_stations = sorted(projected_lat_lon)

    pixelHeight = 10000
    pixelWidth = 10000
    # NOTE(review): the grid ends up with int(...) + 2 points per axis (the
    # + 1 here and another + 1 in linspace/reshape below), one more than
    # shuffle_split_rf uses - confirm this is intended.
    num_col = int((xmax - xmin) / pixelHeight) + 1
    num_row = int((ymax - ymin) / pixelWidth) + 1

    fold_errors = []
    for _ in range(nfolds):
        station_list = list(
            Eval.select_random_station(dictionary_Groups, blocknum,
                                       replacement,
                                       station_list_used).values())
        if not replacement:
            station_list_used.append(list(station_list))
        held_out = set(station_list)

        # This is the step where we hold back the fold
        train_names = [name for name in sorted_stations
                       if name not in held_out]
        lat = np.array([float(loc_dict[name][0]) for name in train_names])
        lon = np.array([float(loc_dict[name][1]) for name in train_names])
        z = np.array([Cvar_dict[name] for name in train_names])

        # We need to project to a projected system before building the grid
        xProj, yProj = proj(lon, lat)

        yProj_extent = np.append(yProj, [ymax, ymin])
        xProj_extent = np.append(xProj, [xmax, xmin])

        Yi = np.linspace(np.min(yProj_extent), np.max(yProj_extent),
                         num_row + 1)
        Xi = np.linspace(np.min(xProj_extent), np.max(xProj_extent),
                         num_col + 1)
        Xi, Yi = np.meshgrid(Xi, Yi)

        # The elevation function takes a list of (x, y) tuples
        grid_points = [
            tuple(pt)
            for pt in np.array((Xi.flatten(), Yi.flatten())).T.tolist()
        ]

        # Get the elevations from the lookup file; the prediction features
        # follow the order of the keys the lookup returns
        elev_grd_dict = GD.finding_data_frm_lookup(grid_points,
                                                   file_path_elev, idx_list)
        grid_x = []
        grid_y = []
        grid_elev = []
        for key in elev_grd_dict.keys():
            grid_x.append(key[0])
            grid_y.append(key[1])
            grid_elev.append(elev_grd_dict[key])

        # Repeat process for just the stations not the whole grid
        elev_dict = GD.finding_data_frm_lookup(zip(xProj, yProj),
                                               file_path_elev, idx_list)
        source_elev = np.array([elev_dict[key] for key in zip(xProj, yProj)])

        df_trainX = pd.DataFrame({
            'xProj': xProj,
            'yProj': yProj,
            'elevS': source_elev,
            'var': z
        })
        df_testX = pd.DataFrame({
            'Xi': np.array(grid_x),
            'Yi': np.array(grid_y),
            'elev': np.array(grid_elev)
        })

        reg = RandomForestRegressor(n_estimators=100,
                                    max_features='sqrt',
                                    random_state=1)
        X_train = np.array(df_trainX[['xProj', 'yProj', 'elevS']])
        X_test = np.array(df_testX[['Xi', 'Yi', 'elev']])
        # 1-D target: same fit as a column vector, without sklearn's
        # DataConversionWarning on every fold
        reg.fit(X_train, np.array(df_trainX['var']))
        rf_grid = reg.predict(X_test).reshape(num_row + 1, num_col + 1)

        # Compare at the pixel of each withheld station
        absolute_error_dictionary = {}
        for statLoc in station_list:
            coord_pair = projected_lat_lon[statLoc]
            x_orig = int((coord_pair[0] - float(xmin)) / pixelHeight)  # lon
            y_orig = int((coord_pair[1] - float(ymin)) / pixelWidth)  # lat
            interpolated_val = rf_grid[y_orig][x_orig]
            original_val = Cvar_dict[statLoc]
            absolute_error_dictionary[statLoc] = abs(interpolated_val -
                                                     original_val)

        # average of all the withheld stations
        fold_errors.append(
            sum(absolute_error_dictionary.values()) /
            len(absolute_error_dictionary))

    # average of all the runs
    overall_error = sum(fold_errors) / nfolds
    return overall_error
def shuffle_split_IDEW(latlon_dict, Cvar_dict, shapefile, file_path_elev,
                       elev_array, idx_list, d, rep, res=10000):
    '''Shuffle-split cross-validation for IDEW with a 50/50 train/test split

    For each replication the usable stations are split in two at random,
    an IDEW surface is built from the training half and the absolute error
    is measured at the pixel underneath every withheld station.

    Parameters
    ----------
    latlon_dict : dictionary
        the latitude and longitudes of the daily/hourly stations
    Cvar_dict : dictionary
        dictionary of weather variable values for each station
    shapefile : string
        path to the study area shapefile
    file_path_elev : string
        path to the elevation lookup file
    elev_array : ndarray
        array for elevation, create using IDEW interpolation (this is a trick to speed up code)
    idx_list : int
        position of the elevation column in the lookup file
    d : int
        the weighting for IDW interpolation
    rep : int
        number of replications
    res : int
        pixel size used to build the grid, in the units of the projected
        coordinate system (default 10000)

    Returns
    ----------
    float
        - MAE estimate for entire surface (average of replications)
    '''
    # One projection object, reused for every station and replication
    albers = pyproj.Proj('esri:102001')

    # We can't just use Cvar_dict.keys() because some stations do not have
    # valid lat/lon
    stations_input = [
        code for code in Cvar_dict.keys() if code in latlon_dict.keys()
    ]
    stations = np.array(stations_input)
    # Training stations are always handed to the interpolator in sorted order
    ordered_names = sorted(stations_input)

    # Projected [x, y] of every usable station, used to find its pixel
    projected_xy = {}
    for station_name in stations_input:
        loc = latlon_dict[station_name]
        easting, northing = albers(loc[1], loc[0])
        projected_xy[station_name] = [float(easting), float(northing)]

    # The study area does not change between replications, so read it once
    na_map = gpd.read_file(shapefile)
    xmin, ymin, xmax, ymax = (float(v) for v in na_map.total_bounds)
    num_col = int((xmax - xmin) / res)
    num_row = int((ymax - ymin) / res)

    error_dictionary = {}
    for count in range(1, int(rep) + 1):
        # Won't be exactly 50/50 if uneven num stations
        splits = ShuffleSplit(n_splits=1, train_size=.5)
        train_index, test_index = next(splits.split(stations))
        train_stations = stations[train_index].tolist()
        test_stations = stations[test_index].tolist()

        # They can't overlap
        held_out = set(test_stations)
        if held_out.intersection(train_stations):
            print('Error, the train and test sets overlap!')
            sys.exit()

        train_names = [n for n in ordered_names if n not in held_out]
        y = np.array([float(latlon_dict[n][0]) for n in train_names])
        x = np.array([float(latlon_dict[n][1]) for n in train_names])
        z = np.array([Cvar_dict[n] for n in train_names])

        # We need to project to a projected system before making distance
        # matrix
        xProj, yProj = albers(x, y)

        # The grid spans the training stations plus the study-area corners
        yProj_extent = np.append(yProj, [ymax, ymin])
        xProj_extent = np.append(xProj, [xmax, xmin])
        Yi = np.linspace(np.min(yProj_extent), np.max(yProj_extent), num_row)
        Xi = np.linspace(np.min(xProj_extent), np.max(xProj_extent), num_col)
        Xi, Yi = np.meshgrid(Xi, Yi)
        Xi, Yi = Xi.flatten(), Yi.flatten()

        # Euclidean distance from every station (rows) to every cell (cols)
        distance_matrix = np.hypot(np.subtract.outer(xProj, Xi),
                                   np.subtract.outer(yProj, Yi))
        # A zero distance yields inf on purpose; it is replaced right below
        with np.errstate(divide='ignore'):
            weights = 1 / (distance_matrix**d)
        # Making sure to assign the value of the weather station above the
        # pixel directly to the pixel underneath
        weights[np.isinf(weights)] = 1 / (1.0E-50)
        weights /= weights.sum(axis=0)

        # Elevation of each training station, keyed by its projected coords
        elev_dict = GD.finding_data_frm_lookup(zip(xProj, yProj),
                                               file_path_elev, idx_list)
        source_elev = np.array([elev_dict[key] for key in zip(xProj, yProj)])

        # Vertical (elevation) distance from every station to every cell
        station_elev = np.vstack(source_elev).T[0]
        grid_elev = np.vstack(elev_array).T[0]
        dist_not2 = np.absolute(np.subtract.outer(station_elev, grid_elev))
        with np.errstate(divide='ignore'):
            weights2 = 1 / (dist_not2**d)
        # In the case of no elevation change
        weights2[np.isinf(weights2)] = 1
        weights2 /= weights2.sum(axis=0)

        # NOTE(review): elev_array must hold exactly num_row * num_col values
        # for this sum to broadcast; IDEW() in this file sizes its grid with
        # an extra +1 row/column - confirm the caller's array matches.
        fin = 0.8 * np.dot(weights.T, z) + 0.2 * np.dot(weights2.T, z)
        fin = fin.reshape(num_row, num_col)

        # Compare the surface with the observation at each withheld station
        absolute_error_dictionary = {}
        for statLoc in test_stations:
            coord_pair = projected_xy[statLoc]
            x_orig = int((coord_pair[0] - xmin) / res)  # lon
            y_orig = int((coord_pair[1] - ymin) / res)  # lat
            interpolated_val = fin[y_orig][x_orig]
            original_val = Cvar_dict[statLoc]
            absolute_error_dictionary[statLoc] = abs(interpolated_val -
                                                     original_val)

        # average of all the withheld stations
        error_dictionary[count] = (sum(absolute_error_dictionary.values()) /
                                   len(absolute_error_dictionary))

    # average of all the replications
    overall_error = sum(error_dictionary.values()) / rep
    return overall_error
def spatial_kfold_IDEW(loc_dict, Cvar_dict, shapefile, file_path_elev,
                       elev_array, idx_list, d, block_num, blocking_type,
                       idw_example_grid=None, res=10000):
    '''Spatially blocked k-folds cross-validation procedure for IDEW

    Each block/cluster of stations is withheld in turn, an IDEW surface is
    built from the remaining stations and the absolute error is measured at
    the pixel underneath every withheld station.

    Parameters
    ----------
    loc_dict : dictionary
        the latitude and longitudes of the daily/hourly stations
    Cvar_dict : dictionary
        dictionary of weather variable values for each station
    shapefile : string
        path to the study area shapefile
    file_path_elev : string
        path to the elevation lookup file
    elev_array : ndarray
        array for elevation, create using IDEW interpolation (this is a trick to speed up code)
    idx_list : int
        position of the elevation column in the lookup file
    d : int
        the weighting for IDW interpolation
    block_num : int
        number of blocks/clusters
    blocking_type : string
        whether to use clusters ('cluster') or blocks ('block')
    idw_example_grid : ndarray
        used for reference of study area grid size; required when
        blocking_type is 'block', ignored otherwise
    res : int
        pixel size used to build the grid, in the units of the projected
        coordinate system (default 10000)

    Returns
    ----------
    int
        - the block number, returned so it can later be written to file
    float
        - MAE estimate for entire surface

    Raises
    ----------
    ValueError
        if blocking_type is 'block' and idw_example_grid was not supplied
    '''
    if blocking_type == 'cluster':
        cluster = c3d.spatial_cluster(loc_dict, Cvar_dict, shapefile,
                                      block_num, file_path_elev, idx_list,
                                      False, False, False)
    elif blocking_type == 'block':
        if idw_example_grid is None:
            raise ValueError(
                "idw_example_grid is required when blocking_type is 'block'")
        # Get the numpy array that delineates the blocks
        np_array_blocks = mbk.make_block(idw_example_grid, block_num)
        # Now get the dictionary
        cluster = mbk.sorting_stations(np_array_blocks, shapefile, loc_dict,
                                       Cvar_dict)
    else:
        print('That is not a valid blocking method')
        sys.exit()

    # One projection object, reused for every station and fold
    albers = pyproj.Proj('esri:102001')

    # Only stations that have both a value and a location are usable
    usable_names = [
        name for name in Cvar_dict.keys() if name in loc_dict.keys()
    ]
    # Training stations are always handed to the interpolator in sorted order
    ordered_names = sorted(usable_names)

    # Projected [x, y] of every usable station, used to find its pixel
    projected_xy = {}
    for station_name in usable_names:
        loc = loc_dict[station_name]
        easting, northing = albers(loc[1], loc[0])
        projected_xy[station_name] = [float(easting), float(northing)]

    # The study area does not change between folds, so read it once
    na_map = gpd.read_file(shapefile)
    xmin, ymin, xmax, ymax = (float(v) for v in na_map.total_bounds)
    num_col = int((xmax - xmin) / res)
    num_row = int((ymax - ymin) / res)

    groups_complete = []  # groups that have already been withheld
    absolute_error_dictionary = {}

    for group in cluster.values():
        if group in groups_complete:
            continue
        groups_complete.append(group)
        # Every station assigned to this group is withheld together
        station_list = [k for k, v in cluster.items() if v == group]
        held_out = set(station_list)

        train_names = [n for n in ordered_names if n not in held_out]
        y = np.array([float(loc_dict[n][0]) for n in train_names])
        x = np.array([float(loc_dict[n][1]) for n in train_names])
        z = np.array([Cvar_dict[n] for n in train_names])

        # We need to project to a projected system before making distance
        # matrix
        xProj, yProj = albers(x, y)

        # The grid spans the training stations plus the study-area corners
        yProj_extent = np.append(yProj, [ymax, ymin])
        xProj_extent = np.append(xProj, [xmax, xmin])
        Yi = np.linspace(np.min(yProj_extent), np.max(yProj_extent), num_row)
        Xi = np.linspace(np.min(xProj_extent), np.max(xProj_extent), num_col)
        Xi, Yi = np.meshgrid(Xi, Yi)
        Xi, Yi = Xi.flatten(), Yi.flatten()

        # Euclidean distance from every station (rows) to every cell (cols)
        distance_matrix = np.hypot(np.subtract.outer(xProj, Xi),
                                   np.subtract.outer(yProj, Yi))
        # A zero distance yields inf on purpose; it is replaced right below
        with np.errstate(divide='ignore'):
            weights = 1 / (distance_matrix**d)
        # Making sure to assign the value of the weather station above the
        # pixel directly to the pixel underneath
        weights[np.isinf(weights)] = 1 / (1.0E-50)
        weights /= weights.sum(axis=0)

        # Elevation of each training station, keyed by its projected coords
        elev_dict = GD.finding_data_frm_lookup(zip(xProj, yProj),
                                               file_path_elev, idx_list)
        source_elev = np.array([elev_dict[key] for key in zip(xProj, yProj)])

        # Vertical (elevation) distance from every station to every cell
        station_elev = np.vstack(source_elev).T[0]
        grid_elev = np.vstack(elev_array).T[0]
        dist_not2 = np.absolute(np.subtract.outer(station_elev, grid_elev))
        with np.errstate(divide='ignore'):
            weights2 = 1 / (dist_not2**d)
        # In the case of no elevation change
        weights2[np.isinf(weights2)] = 1
        weights2 /= weights2.sum(axis=0)

        # NOTE(review): elev_array must hold exactly num_row * num_col values
        # for this sum to broadcast; IDEW() in this file sizes its grid with
        # an extra +1 row/column - confirm the caller's array matches.
        fin = 0.8 * np.dot(weights.T, z) + 0.2 * np.dot(weights2.T, z)
        fin = fin.reshape(num_row, num_col)

        # Compare the surface with the observation at each withheld station
        for statLoc in station_list:
            coord_pair = projected_xy[statLoc]
            x_orig = int((coord_pair[0] - xmin) / res)  # lon
            y_orig = int((coord_pair[1] - ymin) / res)  # lat
            interpolated_val = fin[y_orig][x_orig]
            original_val = Cvar_dict[statLoc]
            absolute_error_dictionary[statLoc] = abs(interpolated_val -
                                                     original_val)

    # average of all the withheld stations
    MAE = (sum(absolute_error_dictionary.values()) /
           len(absolute_error_dictionary))

    return block_num, MAE
def IDEW(latlon_dict, Cvar_dict, input_date, var_name, shapefile, show,
         file_path_elev, idx_list, d, expand_area, res=10000):
    '''Inverse distance elevation weighting

    Parameters
    ----------
    latlon_dict : dictionary
        the latitude and longitudes of the stations
    Cvar_dict : dictionary
        dictionary of weather variable values for each station
    input_date : string
        the date you want to interpolate for
    var_name : string
        the name of the variable you are interpolating
    shapefile : string
        path to the study area shapefile, including its name
    show : bool
        whether you want to plot a map
    file_path_elev : string
        path to the elevation lookup file
    idx_list : int
        position of the elevation column in the lookup file
    d : int
        the weighting for IDW interpolation
    expand_area : bool
        if True, the study area bounding box is grown by 200000 on every
        side before stations are selected and the grid is built
    res : int
        pixel size used to build the grid, in the units of the projected
        coordinate system (default 10000)

    Returns
    ----------
    ndarray
        - the array of values for the interpolated surface
    list
        - the bounds of the array surface, for use in other functions, as
          [min y, max y, max x, min x]
    ndarray
        - elevation array (for use in the random forest module)
    '''
    albers = pyproj.Proj('esri:102001')  # Canada Albers Equal Area

    na_map = gpd.read_file(shapefile)
    # Bounding box of the shapefile, optionally grown on every side
    minx, miny, maxx, maxy = (float(v) for v in na_map.total_bounds)
    buffer = 200000 if expand_area else 0
    xmax = maxx + buffer
    xmin = minx - buffer
    ymax = maxy + buffer
    ymin = miny - buffer

    # Collect each station exactly once, keeping only those inside the
    # (possibly expanded) bounding box. The lists used to be pre-filled with
    # every station before this filter ran, so nothing was actually removed
    # and in-bounds stations were counted twice.
    lat = []
    lon = []
    Cvar = []
    for station_name in Cvar_dict.keys():
        # Make sure the station is present in the latlon dict
        if station_name in latlon_dict.keys():
            loc = latlon_dict[station_name]
            latitude = loc[0]
            longitude = loc[1]
            proj_coord = albers(longitude, latitude)
            # Filter out stations outside of grid
            if (ymin <= proj_coord[1] <= ymax
                    and xmin <= proj_coord[0] <= xmax):
                lat.append(float(latitude))
                lon.append(float(longitude))
                Cvar.append(Cvar_dict[station_name])

    y = np.array(lat)
    x = np.array(lon)
    z = np.array(Cvar)

    num_col = int((xmax - xmin) / res) + 1
    num_row = int((ymax - ymin) / res) + 1

    # We need to project to a projected system before making distance matrix
    xProj, yProj = albers(x, y)

    # Add the bounding box coords to the dataset so we can extrapolate the
    # interpolation to cover whole area
    yProj_extent = np.append(yProj, [ymax, ymin])
    xProj_extent = np.append(xProj, [xmax, xmin])

    # Get the value for lat lon in each cell we just made
    Yi = np.linspace(np.min(yProj_extent), np.max(yProj_extent), num_row)
    Xi = np.linspace(np.min(xProj_extent), np.max(xProj_extent), num_col)
    # Make a rectangular grid (because eventually we will map the values)
    Xi, Yi = np.meshgrid(Xi, Yi)
    # Then we flatten the arrays for easier processing
    Xi, Yi = Xi.flatten(), Yi.flatten()

    # We will later return this for use in other functions
    maxmin = [
        np.min(yProj_extent),
        np.max(yProj_extent),
        np.max(xProj_extent),
        np.min(xProj_extent)
    ]

    # Euclidean distance from every station (rows) to every cell (cols)
    distance_matrix = np.hypot(np.subtract.outer(xProj, Xi),
                               np.subtract.outer(yProj, Yi))
    # A zero distance yields inf on purpose; it is replaced right below
    with np.errstate(divide='ignore'):
        weights = 1 / (distance_matrix**d)
    # Making sure to assign the value of the weather station above the pixel
    # directly to the pixel underneath
    weights[np.isinf(weights)] = 1 / (1.0E-50)
    weights /= weights.sum(axis=0)  # The weights must add up to 1

    # Elevation weights
    # Lon (X) goes in first for a REASON. It has to do with order in the
    # lookup file.
    # The elevation function takes tuples of plain floats
    send_to_tuple = [tuple(pt) for pt in np.array((Xi, Yi)).T.tolist()]
    # Get the elevations from the lookup file
    elev_grd_dict = GD.finding_data_frm_lookup(send_to_tuple, file_path_elev,
                                               idx_list)
    # make an elevation array, in the order the lookup returned the cells
    elev_array = np.array(list(elev_grd_dict.values()))

    # Get the elevations for the stations
    elev_dict = GD.finding_data_frm_lookup(zip(xProj, yProj), file_path_elev,
                                           idx_list)
    source_elev = np.array([elev_dict[key] for key in zip(xProj, yProj)])

    # Get distance in terms of the elevation (vertical distance) from the
    # station to the point to be interpolated; only the magnitude matters
    station_elev = np.vstack(source_elev).T[0]
    grid_elev = np.vstack(elev_array).T[0]
    dist_not2 = np.absolute(np.subtract.outer(station_elev, grid_elev))
    with np.errstate(divide='ignore'):
        weights2 = 1 / (dist_not2**d)  # Get the inverse distance weight
    # In the case of no elevation change
    weights2[np.isinf(weights2)] = 1
    weights2 /= weights2.sum(axis=0)  # Make weights add up to 1

    # Weight distance as 0.8 and elevation as 0.2
    fin = 0.8 * np.dot(weights.T, z) + 0.2 * np.dot(weights2.T, z)
    idew_grid = fin.reshape(num_row, num_col)  # Reshape the final array

    if show:  # Plot if show == True
        fig, ax = plt.subplots(figsize=(15, 15))
        # NOTE(review): the image drawn here is the elevation grid, not
        # idew_grid, although the title reads 'IDEW Interpolation' - confirm
        # this is intended.
        plt.imshow(elev_array.reshape(num_row, num_col),
                   extent=(xProj_extent.min() - 1, xProj_extent.max() + 1,
                           yProj_extent.max() - 1, yProj_extent.min() + 1))
        na_map.plot(ax=ax,
                    color='white',
                    edgecolor='k',
                    linewidth=2,
                    zorder=10,
                    alpha=0.1)

        plt.scatter(xProj, yProj, c=z, edgecolors='k')

        plt.gca().invert_yaxis()
        cbar = plt.colorbar()
        cbar.set_label(var_name)

        title = 'IDEW Interpolation for %s on %s' % (var_name, input_date)
        fig.suptitle(title, fontsize=14)
        plt.xlabel('Longitude')
        plt.ylabel('Latitude')

        plt.show()

    return idew_grid, maxmin, elev_array
def cross_validate_IDEW(latlon_dict, Cvar_dict, shapefile, file_path_elev,
                        elev_array, idx_list, d, res=10000):
    '''Leave-one-out cross-validation procedure for IDEW

    Each usable station is withheld in turn, an IDEW surface is built from
    all the other stations and the absolute error is measured at the pixel
    underneath the withheld station.

    Parameters
    ----------
    latlon_dict : dictionary
        the latitude and longitudes of the stations
    Cvar_dict : dictionary
        dictionary of weather variable values for each station
    shapefile : string
        path to the study area shapefile, including its name
    file_path_elev : string
        path to the elevation lookup file
    elev_array : ndarray
        array for elevation, create using IDEW interpolation (this is a trick to speed up code)
    idx_list : int
        position of the elevation column in the lookup file
    d : int
        the weighting for IDW interpolation
    res : int
        pixel size used to build the grid, in the units of the projected
        coordinate system (default 10000)

    Returns
    ----------
    dictionary
        - a dictionary of the absolute error at each station when it was left out
    '''
    # One projection object, reused for every station
    albers = pyproj.Proj('esri:102001')

    # Only stations that have both a value and a location are usable
    station_name_list = [
        name for name in Cvar_dict.keys() if name in latlon_dict.keys()
    ]

    # Projected [x, y] of every usable station, used to find its pixel
    projected_xy = {}
    for station_name in station_name_list:
        loc = latlon_dict[station_name]
        easting, northing = albers(loc[1], loc[0])
        projected_xy[station_name] = [float(easting), float(northing)]

    # Project all stations once, in sorted order, and pre-make the elev_dict
    # to speed up code. Each fold below just drops one entry from these.
    ordered_names = sorted(station_name_list)
    yO = np.array([float(latlon_dict[n][0]) for n in ordered_names])
    xO = np.array([float(latlon_dict[n][1]) for n in ordered_names])
    xProjO, yProjO = albers(xO, yO)
    elev_dict = GD.finding_data_frm_lookup(zip(xProjO, yProjO),
                                           file_path_elev, idx_list)

    # The study area does not change between folds, so read it once
    na_map = gpd.read_file(shapefile)
    xmin, ymin, xmax, ymax = (float(v) for v in na_map.total_bounds)
    num_col = int((xmax - xmin) / res)
    num_row = int((ymax - ymin) / res)

    # The elevation grid is the same for every fold
    grid_elev = np.vstack(elev_array).T[0]

    absolute_error_dictionary = {}
    for station_name_hold_back in station_name_list:
        # Positions of every station except the one being withheld
        keep = [
            i for i, name in enumerate(ordered_names)
            if name != station_name_hold_back
        ]
        xProj = xProjO[keep]
        yProj = yProjO[keep]
        z = np.array([Cvar_dict[ordered_names[i]] for i in keep])

        # The grid spans the training stations plus the study-area corners
        yProj_extent = np.append(yProj, [ymax, ymin])
        xProj_extent = np.append(xProj, [xmax, xmin])
        Yi = np.linspace(np.min(yProj_extent), np.max(yProj_extent), num_row)
        Xi = np.linspace(np.min(xProj_extent), np.max(xProj_extent), num_col)
        Xi, Yi = np.meshgrid(Xi, Yi)
        Xi, Yi = Xi.flatten(), Yi.flatten()

        # Euclidean distance from every station (rows) to every cell (cols)
        distance_matrix = np.hypot(np.subtract.outer(xProj, Xi),
                                   np.subtract.outer(yProj, Yi))
        # A zero distance yields inf on purpose; it is replaced right below
        with np.errstate(divide='ignore'):
            weights = 1 / (distance_matrix**d)
        # Making sure to assign the value of the weather station above the
        # pixel directly to the pixel underneath
        weights[np.isinf(weights)] = 1 / (1.0E-50)
        weights /= weights.sum(axis=0)

        # Elevation of each training station from the pre-made dictionary
        source_elev = np.array([elev_dict[key] for key in zip(xProj, yProj)])

        # Vertical (elevation) distance from every station to every cell
        station_elev = np.vstack(source_elev).T[0]
        dist_not2 = np.absolute(np.subtract.outer(station_elev, grid_elev))
        with np.errstate(divide='ignore'):
            weights2 = 1 / (dist_not2**d)
        # In the case of no elevation change
        weights2[np.isinf(weights2)] = 1
        weights2 /= weights2.sum(axis=0)

        # NOTE(review): elev_array must hold exactly num_row * num_col values
        # for this sum to broadcast; IDEW() in this file sizes its grid with
        # an extra +1 row/column - confirm the caller's array matches.
        fin = 0.8 * np.dot(weights.T, z) + 0.2 * np.dot(weights2.T, z)
        fin = fin.reshape(num_row, num_col)

        # Locate the pixel underneath the withheld station
        coord_pair = projected_xy[station_name_hold_back]
        x_orig = int((coord_pair[0] - xmin) / res)  # lon
        y_orig = int((coord_pair[1] - ymin) / res)  # lat
        interpolated_val = fin[y_orig][x_orig]

        # Get the original value
        original_val = Cvar_dict[station_name_hold_back]

        # Calc the difference
        absolute_error_dictionary[station_name_hold_back] = abs(
            interpolated_val - original_val)

    return absolute_error_dictionary
def spatial_groups_IDEW(idw_example_grid, loc_dict, Cvar_dict, shapefile, d,
                        blocknum, nfolds, replacement, dictionary_Groups,
                        file_path_elev, idx_list, elev_array, res=10000):
    '''Stratified shuffle-split cross-validation procedure

    In every fold a set of stations chosen by Eval.select_random_station is
    withheld, an IDEW surface is built from the remaining stations and the
    absolute error is measured at the pixel underneath each withheld station.

    Parameters
    ----------
    idw_example_grid : ndarray
        used for reference of study area grid size (not read by this
        function; kept so existing callers keep working)
    loc_dict : dictionary
        the latitude and longitudes of the daily/hourly stations
    Cvar_dict : dictionary
        dictionary of weather variable values for each station
    shapefile : string
        path to the study area shapefile
    d : int
        the weighting for IDW interpolation
    blocknum : int
        number of blocks/clusters
    nfolds : int
        number of folds to create (essentially repetitions)
    replacement : bool
        whether or not to use replacement between folds, should usually be true
    dictionary_Groups : dictionary
        dictionary of what groups (clusters) the stations belong to
    file_path_elev : string
        path to the elevation lookup file
    idx_list : int
        position of the elevation column in the lookup file
    elev_array : ndarray
        array for elevation, create using IDEW interpolation (this is a trick to speed up code)
    res : int
        pixel size used to build the grid, in the units of the projected
        coordinate system (default 10000)

    Returns
    ----------
    float
        - MAE averaged over all the folds
    '''
    # If not using replacement, keep a record of what we have done
    station_list_used = []

    # One projection object, reused for every station and fold
    albers = pyproj.Proj('esri:102001')

    # Only stations that have both a value and a location are usable
    usable_names = [
        name for name in Cvar_dict.keys() if name in loc_dict.keys()
    ]

    # Projected [x, y] of every usable station, used to find its pixel
    projected_xy = {}
    for station_name in usable_names:
        loc = loc_dict[station_name]
        easting, northing = albers(loc[1], loc[0])
        projected_xy[station_name] = [float(easting), float(northing)]

    # Project all stations once, in sorted order, and premake the elevation
    # dictionary to speed up code. It is no longer rebuilt inside each fold.
    ordered_names = sorted(usable_names)
    yO = np.array([float(loc_dict[n][0]) for n in ordered_names])
    xO = np.array([float(loc_dict[n][1]) for n in ordered_names])
    xProjO, yProjO = albers(xO, yO)
    elev_dict = GD.finding_data_frm_lookup(zip(xProjO, yProjO),
                                           file_path_elev, idx_list)

    # The study area does not change between folds, so read it once
    na_map = gpd.read_file(shapefile)
    xmin, ymin, xmax, ymax = (float(v) for v in na_map.total_bounds)
    num_col = int((xmax - xmin) / res)
    num_row = int((ymax - ymin) / res)

    # The elevation grid is the same for every fold
    grid_elev = np.vstack(elev_array).T[0]

    error_dictionary = {}
    for count in range(1, int(nfolds) + 1):
        # Stations withheld in this fold
        station_list = list(
            Eval.select_random_station(dictionary_Groups, blocknum,
                                       replacement,
                                       station_list_used).values())
        if not replacement:
            station_list_used.append(list(station_list))
        held_out = set(station_list)

        # Positions of every station that stays in the training set
        keep = [
            i for i, name in enumerate(ordered_names)
            if name not in held_out
        ]
        xProj = xProjO[keep]
        yProj = yProjO[keep]
        z = np.array([Cvar_dict[ordered_names[i]] for i in keep])

        # The grid spans the training stations plus the study-area corners
        yProj_extent = np.append(yProj, [ymax, ymin])
        xProj_extent = np.append(xProj, [xmax, xmin])
        Yi = np.linspace(np.min(yProj_extent), np.max(yProj_extent), num_row)
        Xi = np.linspace(np.min(xProj_extent), np.max(xProj_extent), num_col)
        Xi, Yi = np.meshgrid(Xi, Yi)
        Xi, Yi = Xi.flatten(), Yi.flatten()

        # Euclidean distance from every station (rows) to every cell (cols)
        distance_matrix = np.hypot(np.subtract.outer(xProj, Xi),
                                   np.subtract.outer(yProj, Yi))
        # A zero distance yields inf on purpose; it is replaced right below
        with np.errstate(divide='ignore'):
            weights = 1 / (distance_matrix**d)
        # Making sure to assign the value of the weather station above the
        # pixel directly to the pixel underneath
        weights[np.isinf(weights)] = 1 / (1.0E-50)
        weights /= weights.sum(axis=0)

        # Elevation of each training station from the premade dictionary
        source_elev = np.array([elev_dict[key] for key in zip(xProj, yProj)])

        # Vertical (elevation) distance from every station to every cell
        station_elev = np.vstack(source_elev).T[0]
        dist_not2 = np.absolute(np.subtract.outer(station_elev, grid_elev))
        with np.errstate(divide='ignore'):
            weights2 = 1 / (dist_not2**d)
        # In the case of no elevation change
        weights2[np.isinf(weights2)] = 1
        weights2 /= weights2.sum(axis=0)

        # NOTE(review): elev_array must hold exactly num_row * num_col values
        # for this sum to broadcast; IDEW() in this file sizes its grid with
        # an extra +1 row/column - confirm the caller's array matches.
        fin = 0.8 * np.dot(weights.T, z) + 0.2 * np.dot(weights2.T, z)
        fin = fin.reshape(num_row, num_col)

        # Compare at a certain point
        absolute_error_dictionary = {}
        for statLoc in station_list:
            coord_pair = projected_xy[statLoc]
            x_orig = int((coord_pair[0] - xmin) / res)  # lon
            y_orig = int((coord_pair[1] - ymin) / res)  # lat
            interpolated_val = fin[y_orig][x_orig]
            original_val = Cvar_dict[statLoc]
            absolute_error_dictionary[statLoc] = abs(interpolated_val -
                                                     original_val)

        # average of all the withheld stations
        error_dictionary[count] = (sum(absolute_error_dictionary.values()) /
                                   len(absolute_error_dictionary))

    # average of all the runs
    overall_error = sum(error_dictionary.values()) / nfolds
    return overall_error
def GPR_interpolator(latlon_dict, Cvar_dict, input_date, var_name, shapefile, show,
                     file_path_elev, idx_list, expand_area, kernel_object, restarts,
                     report_params, optimizer, param_initiate=None, cov_type='RBF',
                     res=10000):
    '''Base interpolator function for gaussian process regression

    Stations are projected to 'esri:102001', filtered to the (optionally
    expanded) bounds of the study area, and a GPR model is fitted on
    [x, y, elevation] and evaluated on a regular grid covering the study area.

    Parameters
    ----------
    latlon_dict : dictionary
        the latitude and longitudes of the stations
    Cvar_dict : dictionary
        dictionary of weather variable values for each station
    input_date : string
        the date you want to interpolate for (only used in the plot title)
    var_name : string
        name of the weather variable (only used to label the plot)
    shapefile : string
        path to the study area shapefile, including its name
    show : bool
        whether you want to plot a map
    file_path_elev : string
        file path to the elevation lookup file
    idx_list : list
        the index of the elevation data column in the lookup file
    expand_area : bool
        function will expand the study area so that more stations are taken
        into account (200 km)
    kernel_object : list
        list whose first element is a string describing the kernel you want
        to use; only read when param_initiate is None, otherwise can be an
        empty list
    restarts : int
        number of times to restart to avoid local optima
    report_params : bool
        if True, outputs optimized values for kernel hyperparameters
    optimizer : bool
        if False, fix parameters of covariance function
    param_initiate : list
        input parameters needed to start optimization, controls extent of the
        spatial autocorrelation modelled by the process. Whether the spatial
        autocorrelation is the same in all directions will depend on the
        inputs for parameters; you need to input the parameters of the
        function (distribution) as a vector not a scalar since we are working
        in 3d (latitude, longitude, elevation). The vector must be len=3
        because this corresponds to the [x,y,z] if we are using an
        anisotropic distribution ...for isotropic 1d, [1] (or if 2
        parameters, [[1],[1]]), for anisotropic, will be [1,1,1] or
        [[1,1],[1,1],[1,1]].
        How it is read, per cov_type:
        'RBF' -> length_scale = param_initiate[0];
        'RationalQuadratic' -> length_scale = param_initiate[0][0],
        alpha = param_initiate[0][1];
        'Matern' -> length_scale = param_initiate[0], nu = param_initiate[1]
    cov_type : str
        type of covariance function to use if have not specified a kernel
        object: 'RBF', 'RationalQuadratic' or 'Matern'
    res : int
        pixel size of the output grid, in the units of the projected
        coordinate system

    Returns
    ----------
    kernel
        - the fitted kernel (reg.kernel_), if report_params is True
    tuple
        - otherwise (gpr_grid, maxmin): the ndarray of interpolated values
        with shape (num_row+1, num_col+1) and the grid extent as
        [min y, max y, max x, min x]

    Raises
    ----------
    ValueError
        if no station falls inside the study area, if cov_type is not one of
        the supported names, or if 'Matern' is requested without nu
    '''
    na_map = gpd.read_file(shapefile)
    bounds = na_map.bounds

    # Pull plain floats out of the bounds table once, so that every later
    # comparison and int() conversion works on scalars rather than on
    # single-element pandas Series.
    buffer_dist = 200000 if expand_area else 0
    xmax = float(bounds['maxx'].iloc[0]) + buffer_dist
    xmin = float(bounds['minx'].iloc[0]) - buffer_dist
    ymax = float(bounds['maxy'].iloc[0]) + buffer_dist
    ymin = float(bounds['miny'].iloc[0]) - buffer_dist

    projector = pyproj.Proj('esri:102001')

    lat = []
    lon = []
    Cvar = []
    for station_name, cvar_val in Cvar_dict.items():
        if station_name not in latlon_dict:
            continue
        loc = latlon_dict[station_name]
        latitude = loc[0]
        longitude = loc[1]
        # Filter out stations outside of grid
        proj_x, proj_y = projector(longitude, latitude)
        if ymin <= proj_y <= ymax and xmin <= proj_x <= xmax:
            lat.append(float(latitude))
            lon.append(float(longitude))
            Cvar.append(cvar_val)

    if not Cvar:
        raise ValueError('No stations with valid coordinates fall inside the study area')

    station_lat = np.array(lat)
    station_lon = np.array(lon)
    z = np.array(Cvar)

    num_col = int((xmax - xmin) / res)
    num_row = int((ymax - ymin) / res)

    # We need to project to a projected system before making distance matrix
    xProj, yProj = projector(station_lon, station_lat)

    yProj_extent = np.append(yProj, [ymax, ymin])
    xProj_extent = np.append(xProj, [xmax, xmin])

    Yi = np.linspace(np.min(yProj_extent), np.max(yProj_extent), num_row + 1)
    Xi = np.linspace(np.min(xProj_extent), np.max(xProj_extent), num_col + 1)
    Xi, Yi = np.meshgrid(Xi, Yi)
    Xi, Yi = Xi.flatten(), Yi.flatten()

    maxmin = [np.min(yProj_extent), np.max(yProj_extent),
              np.max(xProj_extent), np.min(xProj_extent)]

    # Elevation for every grid node; the lookup function takes (x, y) tuples
    grid_points = list(zip(Xi.tolist(), Yi.tolist()))
    elev_grd_dict = GD.finding_data_frm_lookup(
        grid_points, file_path_elev, idx_list)

    # The keys of the returned dictionary are the coordinate pairs
    Xi1_grd = np.array([point[0] for point in elev_grd_dict])
    Yi1_grd = np.array([point[1] for point in elev_grd_dict])
    elev_array = np.array(list(elev_grd_dict.values()))

    # Elevation for the stations themselves
    elev_dict = GD.finding_data_frm_lookup(
        zip(xProj, yProj), file_path_elev, idx_list)
    source_elev = np.array([elev_dict[point] for point in zip(xProj, yProj)])

    df_trainX = pd.DataFrame(
        {'xProj': xProj, 'yProj': yProj, 'elevS': source_elev, 'var': z})
    df_testX = pd.DataFrame(
        {'Xi': Xi1_grd, 'Yi': Yi1_grd, 'elev': elev_array})

    if param_initiate is not None:
        # Build only the kernel that was asked for: each kernel indexes
        # param_initiate differently, so constructing all of them up front
        # fails for inputs that are valid for the requested kernel.
        if cov_type == 'RBF':
            kernel = 1.0 * RBF(length_scale=param_initiate[0])
        elif cov_type == 'RationalQuadratic':
            kernel = 1.0 * RationalQuadratic(
                length_scale=param_initiate[0][0], alpha=param_initiate[0][1])
        elif cov_type == 'Matern':
            if len(param_initiate) < 2:
                raise ValueError(
                    "cov_type 'Matern' needs param_initiate = [length_scale, nu]")
            # The Matern kernel is used whether or not the hyperparameters
            # are optimized; with optimizer=False they stay at these values.
            kernel = 1.0 * Matern(length_scale=param_initiate[0],
                                  nu=param_initiate[1],
                                  length_scale_bounds=(1000, 500000))
        else:
            raise ValueError(
                "cov_type must be 'RBF', 'RationalQuadratic' or 'Matern', got %r"
                % (cov_type,))

        if optimizer:
            reg = GaussianProcessRegressor(
                kernel=kernel, normalize_y=True, n_restarts_optimizer=restarts)
        else:
            # optimizer=None keeps the kernel hyperparameters fixed
            reg = GaussianProcessRegressor(
                kernel=kernel, normalize_y=True, n_restarts_optimizer=restarts,
                optimizer=None)
    else:
        # SECURITY NOTE: eval() executes arbitrary code; kernel_object must
        # only ever come from a trusted source (never from user input).
        kernel = eval(kernel_object[0])
        reg = GaussianProcessRegressor(
            kernel=kernel, normalize_y=True, n_restarts_optimizer=0,
            optimizer=None)

    y_train = np.array(df_trainX['var']).reshape(-1, 1)
    X_train = np.array(df_trainX[['xProj', 'yProj', 'elevS']])
    X_test = np.array(df_testX[['Xi', 'Yi', 'elev']])

    reg.fit(X_train, y_train)
    fitted_params = reg.kernel_
    score = reg.score(X_train, y_train)
    print(fitted_params)
    print(score)

    Zi = reg.predict(X_test)
    gpr_grid = Zi.reshape(num_row + 1, num_col + 1)

    if show:
        fig, ax = plt.subplots(figsize=(15, 15))

        plt.imshow(gpr_grid, extent=(xProj_extent.min() - 1, xProj_extent.max() + 1,
                                     yProj_extent.max() - 1, yProj_extent.min() + 1))
        na_map.plot(ax=ax, color='white', edgecolor='k',
                    linewidth=2, zorder=10, alpha=0.1)

        plt.scatter(xProj, yProj, c=z, edgecolors='k')

        plt.gca().invert_yaxis()
        cbar = plt.colorbar()
        cbar.set_label(var_name)

        title = 'GPR Interpolation for %s on %s' % (var_name, input_date)
        fig.suptitle(title, fontsize=14)
        plt.xlabel('Longitude')
        plt.ylabel('Latitude')

        plt.show()

    if report_params:
        return fitted_params
    return gpr_grid, maxmin
def cross_validate_gpr(latlon_dict, Cvar_dict, shapefile, file_path_elev, elev_array, idx_list, cov_function):
    '''Leave-one-out cross-validation procedure for GPR

    Each station that has both a value and coordinates is held back in turn;
    a GPR model with fixed hyperparameters is fitted on the remaining
    stations, predicted on a 10 km grid, and the grid value at the pixel
    containing the held-back station is compared to its observed value.

    Parameters
    ----------
    latlon_dict : dictionary
        the latitude and longitudes of the stations
    Cvar_dict : dictionary
        dictionary of weather variable values for each station
    shapefile : string
        path to the study area shapefile, including its name
    file_path_elev : string
        path to the elevation lookup file
    elev_array : ndarray
        accepted for interface compatibility only: the value passed in is
        not read, the grid elevations are looked up again from
        file_path_elev for every held-back station
    idx_list : int
        position of the elevation column in the lookup file
    cov_function : list
        list containing a string that describes the input covariance function,
        similar to: ['316**2 * Matern(length_scale=[5e+05, 5e+05, 6.01e+03], nu=0.5)']

    Returns
    ----------
    dictionary
        - a dictionary of the absolute error at each station when it was left out
    '''
    projector = pyproj.Proj('esri:102001')

    # Only stations that have both a value and a location can take part
    station_name_list = [name for name in Cvar_dict if name in latlon_dict]
    training_order = sorted(station_name_list)

    # Projected [x, y] of every station, used to find its pixel later on
    projected_lat_lon = {}
    for station_name in station_name_list:
        loc = latlon_dict[station_name]
        proj_x, proj_y = projector(loc[1], loc[0])
        projected_lat_lon[station_name] = [float(proj_x), float(proj_y)]

    # The study area and grid size do not depend on the held-back station,
    # so read the shapefile and derive them once.
    na_map = gpd.read_file(shapefile)
    bounds = na_map.bounds
    xmax = float(bounds['maxx'].iloc[0])
    xmin = float(bounds['minx'].iloc[0])
    ymax = float(bounds['maxy'].iloc[0])
    ymin = float(bounds['miny'].iloc[0])

    pixelHeight = 10000
    pixelWidth = 10000

    num_col = int((xmax - xmin) / pixelHeight) + 1
    num_row = int((ymax - ymin) / pixelWidth) + 1

    # SECURITY NOTE: eval() executes arbitrary code; cov_function must only
    # ever come from a trusted source (never from user input).
    # The kernel is built once: with optimizer=None its hyperparameters stay
    # fixed, and the regressor fits on its own copy of the kernel.
    kernel = eval(cov_function[0])

    absolute_error_dictionary = {}
    for station_name_hold_back in station_name_list:

        train_names = [name for name in training_order
                       if name != station_name_hold_back]
        station_lat = np.array([float(latlon_dict[name][0]) for name in train_names])
        station_lon = np.array([float(latlon_dict[name][1]) for name in train_names])
        z = np.array([Cvar_dict[name] for name in train_names])

        # We need to project to a projected system before making distance matrix
        xProj, yProj = projector(station_lon, station_lat)

        # Stations are not filtered to the study area here, so the grid
        # extent can grow beyond the shapefile bounds and may differ
        # between held-back stations.
        yProj_extent = np.append(yProj, [ymax, ymin])
        xProj_extent = np.append(xProj, [xmax, xmin])

        Yi = np.linspace(np.min(yProj_extent), np.max(yProj_extent), num_row)
        Xi = np.linspace(np.min(xProj_extent), np.max(xProj_extent), num_col)
        Xi, Yi = np.meshgrid(Xi, Yi)
        Xi, Yi = Xi.flatten(), Yi.flatten()

        # Elevation for every grid node; the lookup function takes (x, y) tuples
        grid_points = list(zip(Xi.tolist(), Yi.tolist()))
        elev_grd_dict = GD.finding_data_frm_lookup(
            grid_points, file_path_elev, idx_list)

        # The keys of the returned dictionary are the coordinate pairs
        Xi1_grd = np.array([point[0] for point in elev_grd_dict])
        Yi1_grd = np.array([point[1] for point in elev_grd_dict])
        grid_elev = np.array(list(elev_grd_dict.values()))

        # Elevation for the training stations
        elev_dict = GD.finding_data_frm_lookup(
            zip(xProj, yProj), file_path_elev, idx_list)
        source_elev = np.array([elev_dict[point] for point in zip(xProj, yProj)])

        df_trainX = pd.DataFrame(
            {'xProj': xProj, 'yProj': yProj, 'elevS': source_elev, 'var': z})
        df_testX = pd.DataFrame(
            {'Xi': Xi1_grd, 'Yi': Yi1_grd, 'elev': grid_elev})

        reg = GaussianProcessRegressor(
            kernel=kernel, normalize_y=True, n_restarts_optimizer=0,
            optimizer=None)

        y_train = np.array(df_trainX['var']).reshape(-1, 1)
        X_train = np.array(df_trainX[['xProj', 'yProj', 'elevS']])
        X_test = np.array(df_testX[['Xi', 'Yi', 'elev']])

        reg.fit(X_train, y_train)

        Zi = reg.predict(X_test)
        gpr_grid = Zi.reshape(num_row, num_col)

        # Compare the prediction with the observation at the pixel that
        # contains the held-back station.
        # NOTE(review): the pixel index is measured from the shapefile
        # bounds; a station outside those bounds gives a negative or
        # out-of-range index - confirm that such stations are excluded
        # upstream.
        coord_pair = projected_lat_lon[station_name_hold_back]
        x_orig = int((coord_pair[0] - xmin) / pixelHeight)  # lon
        y_orig = int((coord_pair[1] - ymin) / pixelWidth)  # lat

        interpolated_val = gpr_grid[y_orig][x_orig]
        original_val = Cvar_dict[station_name_hold_back]
        absolute_error_dictionary[station_name_hold_back] = abs(
            interpolated_val - original_val)

    return absolute_error_dictionary