def test_krige():
    """Smoke-test ``Krige`` inside ``GridSearchCV`` for every method/variogram pair.

    For each ``(method, variogram_model)`` pair yielded by
    ``_method_and_vergiogram()`` a 5-fold grid search with a single candidate
    is fitted on random data, and the resulting scores are checked against
    loose lower bounds.
    """
    # dummy data
    np.random.seed(1)
    X = np.random.randint(0, 400, size=(20, 3)).astype(float)
    y = 5 * np.random.rand(20)

    for m, v in _method_and_vergiogram():
        param_dict = {"method": [m], "variogram_model": [v]}

        estimator = GridSearchCV(
            Krige(),
            param_dict,
            n_jobs=-1,
            pre_dispatch="2*n_jobs",
            verbose=False,
            # "mean_train_score" is only recorded in cv_results_ when this is
            # requested explicitly (scikit-learn >= 0.21 defaults to False).
            return_train_score=True,
            cv=5,
        )
        # run the gridsearch: the 2D methods only get the first two
        # coordinate columns, every other method gets all three.
        if m in ["ordinary", "universal"]:
            estimator.fit(X=X[:, :2], y=y)
        else:
            estimator.fit(X=X, y=y)

        if hasattr(estimator, "best_score_"):
            if m in threed_krige:
                assert estimator.best_score_ > -10.0
            else:
                assert estimator.best_score_ > -3.0
        if hasattr(estimator, "cv_results_"):
            # cv_results_ holds one entry per candidate; reduce explicitly
            # instead of relying on the truthiness of an array.
            assert np.all(estimator.cv_results_["mean_train_score"] > 0)
def SelectModel(data):
    """Grid-search universal-kriging settings for ``data`` and print the outcome.

    Parameters
    ----------
    data : array-like supporting 2-D slicing
        Columns 0 and 1 are used as the kriging coordinates and column 2 as
        the target values.

    Returns
    -------
    None
        The best score, best parameters and selected ``cv_results_`` entries
        are printed, nothing is returned.
    """
    from pykrige.compat import GridSearchCV
    from pykrige.rk import Krige

    # Set up parameters
    param_dict = {
        "method": ["universal"],
        "variogram_model": ["exponential", "spherical"],
        "nlags": [10, 15],
        "lagdist": [1000],
        "weight": [True],
        "n_closest_points": [0],
        "anisotropy_scaling": [1, 1.5, 2],
        "anisotropy_angle": [10, 15],
    }

    estimator = GridSearchCV(
        Krige(),
        param_dict,
        verbose=True,
        error_score=np.nan,
        # 'mean_train_score' is printed below; it is only present in
        # cv_results_ when train scores are requested explicitly.
        return_train_score=True,
    )

    # Run gridsearch
    estimator.fit(X=data[:, 0:2], y=data[:, 2])

    if hasattr(estimator, 'best_score_'):
        print('best_score R² = {:.3f}'.format(estimator.best_score_))
        print('best_params = ', estimator.best_params_)

    print('\nCV results::')
    if hasattr(estimator, 'cv_results_'):
        for key in [
                'mean_test_score',
                'mean_train_score',
                'param_method',
                'param_variogram_model',
        ]:
            print(' - {} : {}'.format(key, estimator.cv_results_[key]))
def krige_grid_search(BEC_df):
    """Grid-search 3D kriging method and variogram model for ``BEC_df``.

    Parameters
    ----------
    BEC_df : pandas.DataFrame
        Must provide the columns ``'X'``, ``'Y'`` and ``'Z'`` (used as the
        three kriging coordinates) and ``'BEC'`` (used as the target).

    Returns
    -------
    dict
        ``best_params_`` of the fitted grid search.
    """
    # 3D Kring param opt
    param_dict3d = {
        "method": ["ordinary3d", "universal3d"],
        "variogram_model": ["linear", "power", "gaussian", "spherical"],
    }

    # NOTE(review): ``iid`` was removed from GridSearchCV in scikit-learn
    # 0.24; it is kept so results stay unchanged on the versions this was
    # written for - drop it when upgrading.
    estimator = GridSearchCV(
        Krige(),
        param_dict3d,
        verbose=False,
        cv=5,
        iid=False,
        # 'mean_train_score' is printed below; it is only present in
        # cv_results_ when train scores are requested explicitly.
        return_train_score=True,
    )

    # Data
    X3 = BEC_df[['X', 'Y', 'Z']].values
    y = BEC_df['BEC'].values

    # run the gridsearch
    estimator.fit(X=X3, y=y)

    if hasattr(estimator, 'best_score_'):
        print('best_score R2 = {:.3f}'.format(estimator.best_score_))
        print('best_params = ', estimator.best_params_)

    print('\nCV results::')
    if hasattr(estimator, 'cv_results_'):
        for key in [
                'mean_test_score',
                'mean_train_score',
                'param_method',
                'param_variogram_model',
        ]:
            print(' - {} : {}'.format(key, estimator.cv_results_[key]))

    return estimator.best_params_
def test_krige():
    """Smoke-test ``Krige`` inside ``GridSearchCV`` for every method/variogram pair.

    For each ``(method, variogram_model)`` pair yielded by
    ``_method_and_vergiogram()`` a 5-fold grid search with a single candidate
    is fitted on random data, and the resulting scores are checked against
    loose lower bounds.
    """
    # dummy data
    np.random.seed(1)
    X = np.random.randint(0, 400, size=(20, 3)).astype(float)
    y = 5 * np.random.rand(20)

    for m, v in _method_and_vergiogram():
        param_dict = {'method': [m], 'variogram_model': [v]}

        # NOTE(review): ``iid`` was removed from GridSearchCV in scikit-learn
        # 0.24; kept here so behaviour is unchanged on older versions.
        estimator = GridSearchCV(
            Krige(),
            param_dict,
            n_jobs=-1,
            iid=False,
            pre_dispatch='2*n_jobs',
            verbose=False,
            # 'mean_train_score' is only recorded in cv_results_ when this is
            # requested explicitly (scikit-learn >= 0.21 defaults to False).
            return_train_score=True,
            cv=5,
        )
        # run the gridsearch: the 2D methods only get the first two
        # coordinate columns, every other method gets all three.
        if m in ['ordinary', 'universal']:
            estimator.fit(X=X[:, :2], y=y)
        else:
            estimator.fit(X=X, y=y)

        if hasattr(estimator, 'best_score_'):
            if m in threed_krige:
                assert estimator.best_score_ > -10.0
            else:
                assert estimator.best_score_ > -3.0
        if hasattr(estimator, 'cv_results_'):
            # cv_results_ holds one entry per candidate; reduce explicitly
            # instead of relying on the truthiness of an array.
            assert np.all(estimator.cv_results_['mean_train_score'] > 0)
def test_gridsearch_cv_variogram_parameters():
    """Check that ``GridSearchCV`` can search over ``variogram_parameters``.

    Two explicit parameter dicts for the linear variogram are offered; the
    test verifies that both candidates show up in ``cv_results_`` and that
    the ``slope=1.0, nugget=1.0`` candidate is selected. Only the selected
    parameters are asserted, not the score itself.
    """
    param_dict3d = {
        "method": ["ordinary3d"],
        "variogram_model": ["linear"],
        "variogram_parameters": [
            {'slope': 1.0, 'nugget': 1.0},
            {'slope': 2.0, 'nugget': 1.0},
        ],
    }

    estimator = GridSearchCV(Krige(), param_dict3d, verbose=True)

    # dummy data
    seed = np.random.RandomState(42)
    X3 = 400. * (1 + seed.rand(100, 3))
    y = 5 * seed.rand(100)

    # run the gridsearch
    estimator.fit(X=X3, y=y)

    # Expected best parameters, compared by key so the check does not depend
    # on the iteration order of the parameter dict.
    expected_best = {'slope': 1.0, 'nugget': 1.0}

    if hasattr(estimator, 'best_params_'):
        assert len(estimator.cv_results_['param_variogram_parameters']) == len(
            param_dict3d["variogram_parameters"])
        assert estimator.best_params_["variogram_parameters"] == expected_best
def test_gridsearch_cv_variogram_parameters():
    """Debug run of a grid search over explicit ``variogram_parameters``.

    Fits the search on seeded random data and prints the candidates, the best
    score/parameters and how they compare with the expected values. Nothing
    is asserted; the comparisons are only printed.
    """
    candidates = [
        {'slope': 1.0, 'nugget': 1.0},
        {'slope': 2.0, 'nugget': 1.0},
    ]
    param_dict3d = {
        "method": ["ordinary3d"],
        "variogram_model": ["linear"],
        "variogram_parameters": candidates,
    }
    estimator = GridSearchCV(Krige(), param_dict3d, verbose=True)

    # dummy data (coordinates are drawn before the targets, from one stream)
    seed = np.random.RandomState(42)
    X3 = 400. * (1 + seed.rand(100, 3))
    y = 5 * seed.rand(100)

    # run the gridsearch
    estimator.fit(X=X3, y=y)

    # Expected best parameters and score
    best_params = [1.0, 1.0]
    best_score = round(Decimal(-0.462479373589), 12)

    searched = estimator.cv_results_['param_variogram_parameters']
    print("\n\n###########", searched, "\n\n", len(searched))

    if hasattr(estimator, 'best_score_'):
        print('best_score R² = {:.3f}'.format(estimator.best_score_))
        print('best_params = ', estimator.best_params_)

        rounded_score = round(Decimal(estimator.best_score_), 12)
        print(rounded_score, best_score)
        print("\n\n best score :", rounded_score == best_score)

    if hasattr(estimator, 'best_params_'):
        n_searched = len(estimator.cv_results_['param_variogram_parameters'])
        print("\nlen param vario :",
              n_searched == len(param_dict3d["variogram_parameters"]))

        chosen = estimator.best_params_["variogram_parameters"]
        for position, name in enumerate(chosen):
            print("\n\negal parm vario :",
                  chosen[name] == best_params[position])
def interpolate_at_one_dt(new_hdf, H, predict_df=None, dem_path=awd_path,
                          ppd=50):
    """Krige the station values in ``new_hdf`` and re-apply the altitude factor.

    The values (first column of every row of ``new_hdf``; the lapse rate is
    expected to be already removed) are placed on the nearest node of a
    lat/lon mesh, an ordinary-kriging model with a spherical variogram is
    fitted on the populated nodes, and the prediction is multiplied by
    ``exp(-altitude / H)``.

    Parameters
    ----------
    new_hdf : pandas.DataFrame
        One row per station with ``'lat'`` and ``'lon'`` entries; the value
        to interpolate is read with ``row.iloc[0]``.
    H : number
        Scale used in the ``exp(-altitude / H)`` factor; must be positive.
    predict_df : pandas.DataFrame, optional
        When given, must have ``lat``, ``lon`` and ``alt`` columns and the
        result is a copy of it with ``'interpolated'`` and
        ``'interpolated_lr_fixed'`` columns added. When ``None`` the whole
        mesh is interpolated.
    dem_path : path-like
        Passed to ``coarse_dem`` (only used when ``predict_df`` is ``None``).
    ppd : int
        Mesh resolution, passed to ``create_lat_lon_mesh`` as
        ``points_per_degree``.

    Returns
    -------
    The interpolated mesh (a copy of the mesh object) when ``predict_df`` is
    ``None``, otherwise the augmented copy of ``predict_df``.

    Raises
    ------
    AssertionError
        If ``H`` is not positive.
    """
    from aux_gps import coarse_dem
    import numpy as np
    from pykrige.rk import Krige

    # Checked up front so a bad H fails before the costly kriging fit.
    assert H > 0

    # create mesh and load DEM:
    da = create_lat_lon_mesh(points_per_degree=ppd)  # 500?

    # populate the empty mesh grid with stations data:
    for _, row in new_hdf.iterrows():
        lat = da.sel(lat=row['lat'], method='nearest').lat.values
        lon = da.sel(lon=row['lon'], method='nearest').lon.values
        da.loc[{'lat': lat, 'lon': lon}] = row.iloc[0]

    c = np.linspace(min(da.lat.values), max(da.lat.values), da.shape[0])
    r = np.linspace(min(da.lon.values), max(da.lon.values), da.shape[1])
    rr, cc = np.meshgrid(r, c)

    # only mesh nodes that received a station value are used for fitting
    vals = ~np.isnan(da.values)
    X = np.column_stack([rr[vals], cc[vals]])
    rr_cc_as_cols = np.column_stack([rr.flatten(), cc.flatten()])
    y = da.values[vals]

    model = Krige(method='ordinary', variogram_model='spherical',
                  verbose=True)
    model.fit(X, y)

    if predict_df is None:
        # i.e., interpolate to all map coords:
        interpolated = model.predict(rr_cc_as_cols).reshape(da.values.shape)
        da_inter = da.copy(data=interpolated)
        awd = coarse_dem(da, dem_path=dem_path)
        da_inter *= np.exp(-1.0 * awd / H)
        return da_inter

    # NOTE(review): the prediction points are evenly spaced values between
    # the min and max lat/lon of predict_df, not the rows' own coordinates -
    # confirm this is intended before relying on per-row results.
    predict_lats = np.linspace(predict_df.lat.min(), predict_df.lat.max(),
                               predict_df.lat.values.shape[0])
    predict_lons = np.linspace(predict_df.lon.min(), predict_df.lon.max(),
                               predict_df.lon.values.shape[0])
    predict_lons_lats_as_cols = np.column_stack([predict_lons, predict_lats])
    interpolated = model.predict(predict_lons_lats_as_cols).reshape(
        (predict_lats.shape))

    df_inter = predict_df.copy()
    df_inter['interpolated'] = interpolated
    # fix for lapse rate:
    df_inter['interpolated_lr_fixed'] = df_inter['interpolated'] * np.exp(
        -1.0 * df_inter['alt'] / H)
    return df_inter
def Interpolating_models_ims(time='2013-10-19T22:00:00', var='TD', plot=True,
                             gis_path=gis_path, method='okrig',
                             dem_path=work_yuval / 'AW3D30', lapse_rate=5.,
                             cv=None, rms=None, gridsearch=False):
    """Interpolate one station snapshot of ``var`` onto a lat/lon map.

    The snapshot returned by ``geo_pandas_time_snapshot`` is placed on the
    nearest nodes of a lat/lon mesh, a regression model selected by
    ``method`` is fitted on the populated nodes and evaluated on the whole
    mesh.

    Parameters
    ----------
    time : str
        Snapshot timestamp, passed to ``geo_pandas_time_snapshot``.
    var : str
        Snapshot column to interpolate. ``'TD'`` enables the lapse-rate
        handling and the altitude/temperature regression figure.
    plot : bool
        Draw the interpolated map (and, when ``rms`` or ``cv`` is given, a
        residual figure).
    gis_path : path-like
        Directory containing the shapefiles used for the map.
    method : str
        One of ``'gp-rbf'``, ``'gp-qr'``, ``'knn'``, ``'svr'``, ``'okrig'``,
        ``'ukrig'``.
    dem_path : path-like
        Passed to ``coarse_dem``.
    lapse_rate : float, ``'auto'`` or None
        For ``var == 'TD'``: ``lapse_rate * alt / 1000`` is added to the
        station values before fitting and subtracted (using the DEM) from
        the map afterwards. ``'auto'`` takes the absolute slope of a linear
        fit of TD against altitude. ``None`` switches to fitting on 3D
        geocentric coordinates instead.
    cv : dict or None
        ``{'kfold': n}``, ``{'rkfold': [n_splits, n_repeats]}`` or a dict
        with a ``'loo'`` key.
    rms : object or None
        Any non-None value (with ``cv`` None) prints the MSE of the map at
        the station nodes.
    gridsearch : bool
        Run a ``GridSearchCV`` over kriging method and variogram model
        instead of producing a map.

    Returns
    -------
    The interpolated mesh, or the fitted ``GridSearchCV`` when
    ``gridsearch`` is true.

    Raises
    ------
    ValueError
        If ``method`` is not recognised and a model is actually needed.
    """
    from sklearn.gaussian_process import GaussianProcessRegressor
    from sklearn.neighbors import KNeighborsRegressor
    from pykrige.rk import Krige
    import numpy as np
    from sklearn.svm import SVR
    from sklearn.metrics import mean_squared_error
    from aux_gps import coarse_dem
    import seaborn as sns
    import matplotlib.pyplot as plt
    import pyproj
    from pykrige.compat import GridSearchCV

    lla = pyproj.Proj(proj='latlong', ellps='WGS84', datum='WGS84')
    ecef = pyproj.Proj(proj='geocent', ellps='WGS84', datum='WGS84')

    def parse_cv(cv):
        """Turn the ``cv`` dict into a scikit-learn splitter.

        Returns ``None`` when the dict has none of the known keys.
        """
        from sklearn.model_selection import KFold
        from sklearn.model_selection import RepeatedKFold
        from sklearn.model_selection import LeaveOneOut
        if 'kfold' in cv.keys():
            n_splits = cv['kfold']
            print('CV is KFold with n_splits={}'.format(n_splits))
            return KFold(n_splits=n_splits)
        if 'rkfold' in cv.keys():
            n_splits = cv['rkfold'][0]
            n_repeats = cv['rkfold'][1]
            print('CV is ReapetedKFold with n_splits={},'.format(n_splits) +
                  ' n_repeates={}'.format(n_repeats))
            return RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats,
                                 random_state=42)
        if 'loo' in cv.keys():
            return LeaveOneOut()

    da = create_lat_lon_mesh(points_per_degree=250)  # 500?
    # Altitudes on the mesh, kept as a plain array from here on.
    awd = coarse_dem(da, dem_path=dem_path).values
    geo_snap = geo_pandas_time_snapshot(var=var, datetime=time, plot=False)

    if var == 'TD':
        slope, _ = np.polyfit(geo_snap['alt'].values, geo_snap['TD'].values,
                              1)
        if lapse_rate == 'auto':
            lapse_rate = np.abs(slope) * 1000
        # NOTE(review): the figure is drawn for every 'TD' call, not only
        # for lapse_rate='auto' - confirm against the intended behaviour.
        fig, ax_lapse = plt.subplots(figsize=(10, 6))
        sns.regplot(data=geo_snap, x='alt', y='TD', color='r',
                    scatter_kws={'color': 'b'}, ax=ax_lapse)
        suptitle = time.replace('T', ' ')
        ax_lapse.set_xlabel('Altitude [m]')
        ax_lapse.set_ylabel('Temperature [degC]')
        if lapse_rate is not None:
            # a None lapse rate cannot be formatted with '{:.2f}'
            ax_lapse.text(0.5, 0.95,
                          'Lapse_rate: {:.2f} degC/km'.format(lapse_rate),
                          horizontalalignment='center',
                          verticalalignment='center',
                          transform=ax_lapse.transAxes, fontsize=12,
                          color='k', fontweight='bold')
        ax_lapse.grid()
        ax_lapse.set_title(suptitle, fontsize=14, fontweight='bold')

    # Put every station on its nearest mesh node, removing the altitude
    # trend first when a lapse rate applies.
    remove_lapse = lapse_rate is not None and var == 'TD'
    alts = []
    for _, row in geo_snap.iterrows():
        lat = da.sel(lat=row['lat'], method='nearest').lat.values
        lon = da.sel(lon=row['lon'], method='nearest').lon.values
        alt = row['alt']
        if remove_lapse:
            da.loc[{'lat': lat, 'lon': lon}] = row[var] + \
                lapse_rate * alt / 1000.0
        else:
            da.loc[{'lat': lat, 'lon': lon}] = row[var]
        alts.append(alt)

    c = np.linspace(min(da.lat.values), max(da.lat.values), da.shape[0])
    r = np.linspace(min(da.lon.values), max(da.lon.values), da.shape[1])
    rr, cc = np.meshgrid(r, c)
    vals = ~np.isnan(da.values)
    if lapse_rate is None:
        # No lapse rate: fit on 3D geocentric coordinates instead.
        # NOTE(review): ``alts`` is in station order while ``rr[vals]`` is
        # in mesh order - verify the two line up.
        Xrr, Ycc, Z = pyproj.transform(lla, ecef, rr[vals], cc[vals],
                                       np.array(alts), radians=False)
        X = np.column_stack([Xrr, Ycc, Z])
        XX, YY, ZZ = pyproj.transform(lla, ecef, rr, cc, awd, radians=False)
        rr_cc_as_cols = np.column_stack(
            [XX.flatten(), YY.flatten(), ZZ.flatten()])
    else:
        X = np.column_stack([rr[vals], cc[vals]])
        rr_cc_as_cols = np.column_stack([rr.flatten(), cc.flatten()])
    y = da.values[vals]

    model = None
    if method == 'gp-rbf':
        from sklearn.gaussian_process.kernels import RBF
        from sklearn.gaussian_process.kernels import WhiteKernel
        kernel = 1.0 * RBF(length_scale=0.25, length_scale_bounds=(1e-2, 1e3)) \
            + WhiteKernel(noise_level=0.01, noise_level_bounds=(1e-10, 1e+1))
        model = GaussianProcessRegressor(alpha=0.0, kernel=kernel,
                                         n_restarts_optimizer=5,
                                         random_state=42, normalize_y=True)
    elif method == 'gp-qr':
        from sklearn.gaussian_process.kernels import RationalQuadratic
        from sklearn.gaussian_process.kernels import WhiteKernel
        kernel = RationalQuadratic(length_scale=100.0) \
            + WhiteKernel(noise_level=0.01, noise_level_bounds=(1e-10, 1e+1))
        model = GaussianProcessRegressor(alpha=0.0, kernel=kernel,
                                         n_restarts_optimizer=5,
                                         random_state=42, normalize_y=True)
    elif method == 'knn':
        model = KNeighborsRegressor(n_neighbors=5, weights='distance')
    elif method == 'svr':
        # 'auto' is what the removed 'auto_deprecated' placeholder computed.
        model = SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
                    gamma='auto', kernel='rbf', max_iter=-1, shrinking=True,
                    tol=0.001, verbose=False)
    elif method == 'okrig':
        model = Krige(method='ordinary', variogram_model='spherical',
                      verbose=True)
    elif method == 'ukrig':
        model = Krige(method='universal', variogram_model='linear',
                      verbose=True)
    elif not gridsearch:
        # the grid search builds its own estimator, everything else needs one
        raise ValueError('unknown interpolation method: {!r}'.format(method))

    if cv is not None and not gridsearch:
        from sklearn import metrics
        cv = parse_cv(cv)
        ytests = []
        ypreds = []
        for train_idx, test_idx in cv.split(X):
            X_train, X_test = X[train_idx], X[test_idx]  # requires arrays
            y_train, y_test = y[train_idx], y[test_idx]
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            # collect the out-of-fold predictions of every split so they can
            # be scored and plotted together
            ytests += list(y_test)
            ypreds += list(y_pred)
        true_vals = np.array(ytests)
        predicted = np.array(ypreds)
        r2 = metrics.r2_score(ytests, ypreds)
        ms_error = metrics.mean_squared_error(ytests, ypreds)
        print("R^2: {:.5f}%, MSE: {:.5f}".format(r2 * 100, ms_error))

    if gridsearch:
        if cv is not None:
            # with cv=None GridSearchCV falls back to its default splitter
            cv = parse_cv(cv)
        param_dict = {
            "method": ["ordinary", "universal"],
            "variogram_model": ["linear", "power", "gaussian", "spherical"],
        }
        estimator = GridSearchCV(Krige(), param_dict, verbose=True, cv=cv,
                                 scoring='neg_mean_absolute_error',
                                 return_train_score=True, n_jobs=1)
        estimator.fit(X, y)
        if hasattr(estimator, 'best_score_'):
            print('best_score = {:.3f}'.format(estimator.best_score_))
            print('best_params = ', estimator.best_params_)
        return estimator

    # Final fit on all populated nodes, evaluated on the whole mesh.
    model.fit(X, y)
    interpolated = model.predict(rr_cc_as_cols).reshape(da.values.shape)
    da_inter = da.copy(data=interpolated)
    if lapse_rate is not None and var == 'TD':
        # put the altitude trend back using the DEM altitudes
        da_inter -= lapse_rate * awd / 1000.0

    if rms is not None and cv is None:
        predicted = []
        true_vals = []
        for _, row in geo_snap.iterrows():
            lat = da.sel(lat=row['lat'], method='nearest').lat.values
            lon = da.sel(lon=row['lon'], method='nearest').lon.values
            pred = da_inter.loc[{'lat': lat, 'lon': lon}].values.item()
            predicted.append(pred)
            true_vals.append(row[var])
        predicted = np.array(predicted)
        true_vals = np.array(true_vals)
        ms_error = mean_squared_error(true_vals, predicted)
        print("MSE: {:.5f}".format(ms_error))

    if plot:
        import salem
        from salem import DataLevels, Map
        import matplotlib.pyplot as plt
        f, ax = plt.subplots(figsize=(6, 10))
        shdf = salem.read_shapefile(gis_path / 'Israel_and_Yosh.shp')
        shdf.crs = {'init': 'epsg:4326'}
        dsr = da_inter.salem.roi(shape=shdf)
        grid = da_inter.salem.grid
        sm = Map(grid)
        sm.set_data(dsr)
        sm.set_shapefile(gis_path / 'gis_osm_water_a_free_1.shp',
                         edgecolor='k')
        sm.set_cmap(cm='rainbow')
        sm.visualize(
            ax=ax,
            title='Israel {} interpolated temperature from IMS'.format(method),
            cbar_title='degC')
        # overlay the stations, coloured with the same levels as the map
        dl = DataLevels(geo_snap[var], levels=sm.levels)
        dl.set_cmap(sm.cmap)
        x, y = sm.grid.transform(geo_snap.lon.values, geo_snap.lat.values)
        ax.scatter(x, y, color=dl.to_rgb(), s=20, edgecolors='k',
                   linewidths=0.5)
        suptitle = time.replace('T', ' ')
        f.suptitle(suptitle, fontsize=14, fontweight='bold')

        # NOTE(review): the residual figure is drawn only together with the
        # map (inside ``if plot``) - confirm against the intended behaviour.
        if (rms is not None or cv is not None) and (not gridsearch):
            f, ax = plt.subplots(1, 2, figsize=(12, 6))
            sns.scatterplot(x=true_vals, y=predicted, ax=ax[0], marker='.',
                            s=100)
            resid = predicted - true_vals
            sns.distplot(resid, bins=5, color='c', label='residuals',
                         ax=ax[1])
            rmean = np.mean(resid)
            rmse = np.sqrt(mean_squared_error(true_vals, predicted))
            plt.axvline(rmean, color='r', linestyle='dashed', linewidth=1)
            _, max_ = plt.ylim()
            plt.text(rmean + rmean / 10, max_ - max_ / 10,
                     'Mean: {:.2f}, RMSE: {:.2f}'.format(rmean, rmse))
            f.tight_layout()
    return da_inter
""" import numpy as np from pykrige.rk import Krige from pykrige.compat import GridSearchCV # 2D Kring param opt param_dict = { "method": ["ordinary", "universal"], "variogram_model": ["linear", "power", "gaussian", "spherical"], # "nlags": [4, 6, 8], # "weight": [True, False] } estimator = GridSearchCV(Krige(), param_dict, verbose=True) # dummy data X = np.random.randint(0, 400, size=(100, 2)).astype(float) y = 5 * np.random.rand(100) # run the gridsearch estimator.fit(X=X, y=y) if hasattr(estimator, 'best_score_'): print('best_score R² = {:.3f}'.format(estimator.best_score_)) print('best_params = ', estimator.best_params_) print('\nCV results::') if hasattr(estimator, 'cv_results_'): for key in [
def kriging_per_row(all_data_daily_slice):
    """Fill missing snow values row by row with 3D kriging.

    For every row of ``all_data_daily_slice`` the non-null entries are used
    to predict the null ones:

    * fewer than two non-null entries: the previously produced row is
      repeated (nothing is added if there is none yet);
    * at least 90 % of the non-null entries are zero: missing values are set
      to zero without kriging;
    * otherwise a grid search over 3D kriging method and variogram model
      picks the model used for the prediction, falling back to ordinary 3D
      kriging with a gaussian variogram when a ``ValueError`` is raised.

    Parameters
    ----------
    all_data_daily_slice : pandas.DataFrame
        Iterated with ``iterrows``; each row is merged back onto this frame
        by index.
        NOTE(review): the merged frame must carry 'Longitude_Metres',
        'Latiitude_Metres' and 'ElevationRelative' columns - confirm this
        frame really provides them.

    Returns
    -------
    pandas.DataFrame
        One row per processed input row, columns ordered by
        ``stn_data.ID``; empty when no row could be produced.
    """
    param_dict3d = {
        "method": ["ordinary3d", "universal3d"],
        "variogram_model": ["linear", "power", "gaussian", "spherical"],
    }
    # Rows are collected and concatenated once at the end; DataFrame.append
    # no longer exists in pandas 2.x and re-copied the frame on every call.
    rows = []

    for index, row_under_observation in all_data_daily_slice.iterrows():
        row_under_observation = pd.DataFrame(row_under_observation)
        # merge using station ids as indices
        snow_amt_with_locn = all_data_daily_slice.merge(
            row_under_observation, left_index=True, right_index=True)
        snow_amt_with_locn = snow_amt_with_locn.rename(
            columns={index: 'snow_adj_inches'})
        snow_amt_with_locn['snow_adj_mters'] = (
            snow_amt_with_locn['snow_adj_inches'] * 0.0254)

        # containing non null values
        snow_amt_with_locn_notnull = snow_amt_with_locn.dropna()
        # containing null values (a new frame, so no in-place drop on a slice)
        missing = snow_amt_with_locn['snow_adj_inches'].isnull()
        snow_amount_null = snow_amt_with_locn[missing].drop(
            ['snow_adj_mters'], axis=1)

        if snow_amt_with_locn_notnull.shape[0] < 2:
            # nothing (or a single value) to krige from: repeat the last row
            if rows:
                rows.append(rows[-1])
            continue

        lons = numpy.array(snow_amt_with_locn_notnull['Longitude_Metres'])
        lons = lons[~numpy.isnan(lons)]
        lats = numpy.array(snow_amt_with_locn_notnull['Latiitude_Metres'])
        lats = lats[~numpy.isnan(lats)]
        elev = numpy.array(snow_amt_with_locn_notnull['ElevationRelative'])
        snow_amount = numpy.array(snow_amt_with_locn_notnull['snow_adj_mters'])

        # count the number of zeros in snow_amount
        zero_count = (snow_amount == 0.0).sum()
        zero_count_fraction = zero_count / snow_amount.shape[0]

        if numpy.all(snow_amount == 0.0) or zero_count_fraction >= 0.9:
            # (almost) no snow anywhere: skip kriging, predict zeros
            predicted_Values = numpy.zeros(snow_amount_null.shape[0])
        else:
            lons_null = numpy.array(snow_amount_null['Longitude_Metres'])
            lats_null = numpy.array(snow_amount_null['Latiitude_Metres'])
            elev_null = numpy.array(snow_amount_null['ElevationRelative'])

            X = numpy.array(snow_amt_with_locn_notnull[
                ['Longitude_Metres', 'Latiitude_Metres', 'ElevationRelative']])
            y = numpy.array(snow_amt_with_locn_notnull['snow_adj_mters'])

            # perform grid search to identify the good fitting variogram
            estimator = GridSearchCV(Krige(), param_dict3d, verbose=False)
            try:
                estimator.fit(X=X, y=y, verbose=False)
                # find the best kriging technique:
                if hasattr(estimator, 'best_score_'):
                    print('best_score R²={}'.format(
                        round(estimator.best_score_, 2)))
                    print('best_params = ', estimator.best_params_)
                best = estimator.best_params_
                if best['method'] == 'universal3d':
                    kriger = UniversalKriging3D
                else:
                    kriger = OrdinaryKriging3D
                model = kriger(lons, lats, elev, snow_amount,
                               variogram_model=best['variogram_model'])
                predicted_Values, variance_locn = model.execute(
                    'points', lons_null, lats_null, elev_null)
            except ValueError:
                # fall back to a fixed model when the search/prediction fails
                sim3d = OrdinaryKriging3D(lons, lats, elev, snow_amount,
                                          variogram_model='gaussian')
                predicted_Values, variance_locn = sim3d.execute(
                    'points', lons_null, lats_null, elev_null)

        predicted_snow_values = pd.DataFrame(
            predicted_Values,
            index=snow_amount_null.index.values.tolist(),
            columns=['snow_adj_mters'])
        interplated_df = pd.merge(predicted_snow_values, snow_amount_null,
                                  left_index=True, right_index=True)
        final_row = pd.concat([snow_amt_with_locn_notnull, interplated_df])
        final_row_snow_transpose = final_row[['snow_adj_mters']].T
        # order the stations as in stn_data
        final_row_snow_transpose = final_row_snow_transpose[
            stn_data.ID.values.tolist()]
        rows.append(final_row_snow_transpose)

    return pd.concat(rows) if rows else pd.DataFrame()
def KrigeCV(P, lags, date, metric, output):
    """Tune 2D kriging with a grid search, krige onto a grid and export it.

    Parameters
    ----------
    P : 2-D array
        Columns 0 and 1 are the coordinates, column 2 the values.
    lags : list
        Candidate values for the ``nlags`` parameter.
    date, metric
        Used (via ``str``) to build the output file name and copied into the
        returned list.
    output : str
        ``'ASCII'`` writes ``<date>_<metric>.asc``, ``'Shapefile'`` writes
        ``<date>_<metric>.shp``; any other value only prints a message.

    Returns
    -------
    list
        ``[date, metric, best_score_, best_params_]`` of the grid search.
    """
    # create dictionary for gridsearch to use in parameter tuning
    param_dict = {
        "method": ["ordinary", "universal"],
        "variogram_model":
        ["exponential", "gaussian", "linear", "power", "spherical"],
        "nlags": lags,
        "weight": [True, False],
    }

    # This cv=2 could be adjusted
    estimator = GridSearchCV(Krige(), param_dict, verbose=False, cv=2)

    X = P[:, 0:2]  # select x variables
    y = P[:, 2]  # select y variable

    estimator.fit(X=X, y=y)

    if hasattr(estimator, 'best_score_'):
        print('best_score R² = {:.3f}'.format(estimator.best_score_))
        print('Optimal Lags: {}'.format(estimator.best_params_['nlags']))
        print('best_params = ', estimator.best_params_)

    # define grid
    dist = .15
    x_lo, x_hi = math.floor(min(P[:, 0])), math.ceil(max(P[:, 0]))
    y_lo, y_hi = math.floor(min(P[:, 1])), math.ceil(max(P[:, 1]))
    gridx = np.arange(x_lo, x_hi, dist)
    gridy = np.arange(y_lo, y_hi, dist)

    best = estimator.best_params_
    krigers = {'universal': UniversalKriging, 'ordinary': OrdinaryKriging}
    kriger = krigers.get(best['method'])

    if kriger is None:
        print('Kriging method not recognized as Universal or Ordinary')
    else:
        # perform kriging with params chosen by gridsearch
        model = kriger(P[:, 0],
                       P[:, 1],
                       P[:, 2],
                       variogram_model=best['variogram_model'],
                       nlags=best['nlags'],
                       weight=best['weight'],
                       verbose=False,
                       enable_plotting=True)
        z, ss = model.execute('grid', gridx, gridy)

        if output == 'ASCII':
            filename = str(date) + '_' + str(metric) + '.asc'
            kt.write_asc_grid(gridx, gridy, z, filename=filename)
        elif output == 'Shapefile':
            # Grid shifted by half a cell on each side; these arrays were
            # previously only present as commented-out code, so this branch
            # failed with a NameError.
            gridxShape = np.arange(x_lo - (.5 * dist), x_hi + (.5 * dist),
                                   dist)
            gridyShape = np.arange(y_lo - (.5 * dist), y_hi + (.5 * dist),
                                   dist)
            geo_df = OutputShape(z, gridxShape, gridyShape)
            filename = str(date) + '_' + str(metric) + '.shp'
            geo_df.to_file(filename)
        else:
            print("output parameter must be 'ASCII' or 'Shapefile'. ")

    # save Rsquared and other outputs/parameters
    return [date, metric, estimator.best_score_, estimator.best_params_]
def kriging_per_row(all_data_daily_slice):
    """Interpolate missing snow_adj values (in metres) with 3D kriging.

    Each row of ``all_data_daily_slice`` holds the snow adjusted values of
    the stations for one date. Null values are predicted from the non-null
    ones:

    * fewer than two non-null entries: the previously produced row is
      repeated (nothing is added if there is none yet);
    * at least 90 % of the non-null entries are zero: missing values are set
      to zero without kriging;
    * otherwise a grid search over the candidates in the module-level
      ``param_dict3d`` picks the kriging method and variogram model used for
      the prediction, falling back to ordinary 3D kriging with a gaussian
      variogram when a ``ValueError`` is raised.

    The accumulated result is written to ``'f12k.csv'``; nothing is
    returned.
    NOTE(review): the CSV is written once after the loop - confirm it was
    not meant to be written per iteration.

    Parameters
    ----------
    all_data_daily_slice : pandas.DataFrame
        Iterated with ``iterrows``; each row is merged onto the module-level
        ``daily_adj_snow_stn_gpd`` frame by index.
    """
    # Rows are collected and concatenated once at the end; DataFrame.append
    # no longer exists in pandas 2.x and re-copied the frame on every call.
    rows = []

    for index, row_under_observation in all_data_daily_slice.iterrows():
        row_under_observation = pd.DataFrame(row_under_observation)
        # merge using station ids as indices
        snow_amt_with_locn = daily_adj_snow_stn_gpd.merge(
            row_under_observation, left_index=True, right_index=True)
        snow_amt_with_locn = snow_amt_with_locn.rename(
            columns={index: 'snow_adj_inches'})
        # same unit uniformity
        snow_amt_with_locn['snow_adj_mters'] = (
            snow_amt_with_locn['snow_adj_inches'] * 0.0254)

        # containing non null values
        snow_amt_with_locn_notnull = snow_amt_with_locn.dropna()
        # containing null values (a new frame, so no in-place drop on a slice)
        missing = snow_amt_with_locn['snow_adj_inches'].isnull()
        snow_amount_null = snow_amt_with_locn[missing].drop(
            ['snow_adj_mters'], axis=1)

        if snow_amt_with_locn_notnull.shape[0] < 2:
            # if (nearly) all values are missing for a given day, reuse the
            # data of the previous day
            if rows:
                rows.append(rows[-1])
            continue

        # 3d kriging interpolation:
        lons = numpy.array(snow_amt_with_locn_notnull['Longitude_Metres'])
        lons = lons[~numpy.isnan(lons)]
        lats = numpy.array(snow_amt_with_locn_notnull['Latiitude_Metres'])
        lats = lats[~numpy.isnan(lats)]
        elev = numpy.array(snow_amt_with_locn_notnull['ElevationRelative'])
        snow_amount = numpy.array(snow_amt_with_locn_notnull['snow_adj_mters'])

        # count the number of zeros in snow_amount
        zero_count = (snow_amount == 0.0).sum()
        zero_count_fraction = zero_count / snow_amount.shape[0]

        if numpy.all(snow_amount == 0.0) or zero_count_fraction >= 0.9:
            # replace the remaining null values with 0 ; skip kriging here
            predicted_Values = numpy.zeros(snow_amount_null.shape[0])
        else:
            lons_null = numpy.array(snow_amount_null['Longitude_Metres'])
            lats_null = numpy.array(snow_amount_null['Latiitude_Metres'])
            elev_null = numpy.array(snow_amount_null['ElevationRelative'])

            # group the coordinates into a single numpy array
            X = numpy.array(snow_amt_with_locn_notnull[
                ['Longitude_Metres', 'Latiitude_Metres', 'ElevationRelative']])
            y = numpy.array(snow_amt_with_locn_notnull['snow_adj_mters'])

            # perform grid search to identify the good fitting variogram
            estimator = GridSearchCV(Krige(), param_dict3d, verbose=False)
            try:
                estimator.fit(X=X, y=y, verbose=False)
                # find the best kriging technique:
                if hasattr(estimator, 'best_score_'):
                    print('best_score R² = {:.3f}'.format(
                        estimator.best_score_))
                    print('best_params = ', estimator.best_params_)
                best = estimator.best_params_
                if best['method'] == 'universal3d':
                    kriger = UniversalKriging3D
                else:
                    kriger = OrdinaryKriging3D
                model = kriger(lons, lats, elev, snow_amount,
                               variogram_model=best['variogram_model'])
                predicted_Values, variance_locn = model.execute(
                    'points', lons_null, lats_null, elev_null)
            except ValueError:
                # Due to some data preprocessing the input values of
                # latitude, longitude and snow_adj can become extremely small
                # or large, resulting in NaN or INF values. Ordinary kriging
                # with a gaussian variogram did not give this error, so it is
                # used for these edge cases.
                sim3d = OrdinaryKriging3D(lons, lats, elev, snow_amount,
                                          variogram_model='gaussian')
                predicted_Values, variance_locn = sim3d.execute(
                    'points', lons_null, lats_null, elev_null)

        predicted_snow_values = pd.DataFrame(
            predicted_Values,
            index=snow_amount_null.index.values.tolist(),
            columns=['snow_adj_mters'])
        interplated_df = pd.merge(predicted_snow_values, snow_amount_null,
                                  left_index=True, right_index=True)
        final_row = pd.concat([snow_amt_with_locn_notnull, interplated_df])
        # take the transpose and order the stations as in stn_data
        final_row_snow_transpose = final_row[['snow_adj_mters']].T
        final_row_snow_transpose = final_row_snow_transpose[
            stn_data.ID.values.tolist()]
        rows.append(final_row_snow_transpose)

    interpolated_values = pd.concat(rows) if rows else pd.DataFrame()
    interpolated_values.to_csv('f12k.csv')
""" import numpy as np from pykrige.rk import Krige from sklearn.model_selection import GridSearchCV # 2D Kring param opt param_dict = { "method": ["ordinary", "universal"], "variogram_model": ["linear", "power", "gaussian", "spherical"], # "nlags": [4, 6, 8], # "weight": [True, False] } estimator = GridSearchCV(Krige(), param_dict, verbose=True, return_train_score=True) # dummy data X = np.random.randint(0, 400, size=(100, 2)).astype(float) y = 5 * np.random.rand(100) # run the gridsearch estimator.fit(X=X, y=y) if hasattr(estimator, "best_score_"): print("best_score R² = {:.3f}".format(estimator.best_score_)) print("best_params = ", estimator.best_params_)