Example #1
0
def krige_grid_search(BEC_df):
    """Grid-search 3D kriging settings for the ``BEC`` column of *BEC_df*.

    Every combination of kriging method (``ordinary3d`` / ``universal3d``)
    and variogram model listed below is scored with 5-fold cross-validation.
    The best score, the best parameters and a short summary of the CV
    results are printed.

    Parameters
    ----------
    BEC_df : DataFrame-like
        Must provide the columns ``X``, ``Y`` and ``Z`` (used as features)
        and ``BEC`` (used as target); each is read through ``.values``.

    Returns
    -------
    dict
        ``estimator.best_params_`` of the fitted grid search.
    """
    # 3D kriging parameter grid
    param_dict3d = {
        "method": ["ordinary3d", "universal3d"],
        "variogram_model": ["linear", "power", "gaussian", "spherical"],
        # "nlags": [4, 6, 8],
        # "weight": [True, False]
    }

    # ``iid`` is no longer passed: scikit-learn removed the argument, so
    # supplying it raises TypeError on current releases.
    # ``return_train_score=True`` is required for the 'mean_train_score'
    # entry printed below; without it the lookup raises KeyError.
    estimator = GridSearchCV(Krige(),
                             param_dict3d,
                             verbose=False,
                             cv=5,
                             return_train_score=True)

    # Data
    X3 = BEC_df[['X', 'Y', 'Z']].values
    y = BEC_df['BEC'].values

    # run the gridsearch
    estimator.fit(X=X3, y=y)

    if hasattr(estimator, 'best_score_'):
        print('best_score R2 = {:.3f}'.format(estimator.best_score_))
        print('best_params = ', estimator.best_params_)

    print('\nCV results::')
    if hasattr(estimator, 'cv_results_'):
        for key in [
                'mean_test_score', 'mean_train_score', 'param_method',
                'param_variogram_model'
        ]:
            print(' - {} : {}'.format(key, estimator.cv_results_[key]))

    return estimator.best_params_
Example #2
0
def SelectModel(data):
    """Grid-search universal-kriging settings on *data* and print the results.

    Parameters
    ----------
    data : 2-D array
        Columns 0 and 1 are used as features, column 2 as the target
        (``data[:, 0:2]`` / ``data[:, 2]``).

    Returns
    -------
    None
        The outcome is only printed.
    """
    from pykrige.compat import GridSearchCV
    from pykrige.rk import Krige
    # Set up parameters
    param_dict = {
        "method": ["universal"],
        "variogram_model": ["exponential", "spherical"],
        "nlags": [10, 15],
        "lagdist": [1000],
        "weight": [True],
        "n_closest_points": [0],
        "anisotropy_scaling": [1, 1.5, 2],
        "anisotropy_angle": [10, 15]
    }
    # return_train_score=True is needed for the 'mean_train_score' entry
    # printed below; it is absent from cv_results_ by default, which made
    # the summary loop fail with KeyError.
    estimator = GridSearchCV(Krige(),
                             param_dict,
                             verbose=True,
                             error_score=np.nan,
                             return_train_score=True)
    # Run gridsearch
    estimator.fit(X=data[:, 0:2], y=data[:, 2])

    if hasattr(estimator, 'best_score_'):
        print('best_score R² = {:.3f}'.format(estimator.best_score_))
        print('best_params = ', estimator.best_params_)
    print('\nCV results::')
    if hasattr(estimator, 'cv_results_'):
        for key in [
                'mean_test_score', 'mean_train_score', 'param_method',
                'param_variogram_model'
        ]:
            print(' - {} : {}'.format(key, estimator.cv_results_[key]))
    return
Example #3
0
def test_krige():
    """Smoke-test ``Krige`` inside ``GridSearchCV`` for each method/variogram pair.

    The pairs come from ``_method_and_vergiogram()``.  The 2D methods
    (``ordinary`` / ``universal``) are fitted on the first two feature
    columns only, all other methods on the full three columns.
    """
    # dummy data
    np.random.seed(1)
    X = np.random.randint(0, 400, size=(20, 3)).astype(float)
    y = 5 * np.random.rand(20)

    for m, v in _method_and_vergiogram():
        param_dict = {"method": [m], "variogram_model": [v]}

        # return_train_score=True is required for the 'mean_train_score'
        # assertion below; the key is not produced by default.
        estimator = GridSearchCV(
            Krige(),
            param_dict,
            n_jobs=-1,
            pre_dispatch="2*n_jobs",
            verbose=False,
            return_train_score=True,
            cv=5,
        )
        # run the gridsearch
        if m in ["ordinary", "universal"]:
            estimator.fit(X=X[:, :2], y=y)
        else:
            estimator.fit(X=X, y=y)
        if hasattr(estimator, "best_score_"):
            if m in threed_krige:
                assert estimator.best_score_ > -10.0
            else:
                assert estimator.best_score_ > -3.0
        if hasattr(estimator, "cv_results_"):
            # np.all keeps the check valid for any number of candidates
            # instead of relying on the truth value of a one-element array.
            assert np.all(estimator.cv_results_["mean_train_score"] > 0)
Example #4
0
def test_krige():
    """Smoke-test ``Krige`` inside ``GridSearchCV`` for each method/variogram pair.

    The pairs come from ``_method_and_vergiogram()``.  The 2D methods
    (``ordinary`` / ``universal``) are fitted on the first two feature
    columns only, all other methods on the full three columns.
    """
    # dummy data
    np.random.seed(1)
    X = np.random.randint(0, 400, size=(20, 3)).astype(float)
    y = 5 * np.random.rand(20)

    for m, v in _method_and_vergiogram():
        param_dict = {'method': [m], 'variogram_model': [v]}

        # ``iid`` is no longer passed (the argument was removed from
        # scikit-learn and now raises TypeError).  return_train_score=True
        # is required for the 'mean_train_score' assertion below.
        estimator = GridSearchCV(
            Krige(),
            param_dict,
            n_jobs=-1,
            pre_dispatch='2*n_jobs',
            verbose=False,
            return_train_score=True,
            cv=5,
        )
        # run the gridsearch
        if m in ['ordinary', 'universal']:
            estimator.fit(X=X[:, :2], y=y)
        else:
            estimator.fit(X=X, y=y)
        if hasattr(estimator, 'best_score_'):
            if m in threed_krige:
                assert estimator.best_score_ > -10.0
            else:
                assert estimator.best_score_ > -3.0
        if hasattr(estimator, 'cv_results_'):
            # np.all keeps the check valid for any number of candidates
            # instead of relying on the truth value of a one-element array.
            assert np.all(estimator.cv_results_['mean_train_score'] > 0)
Example #5
0
def test_gridsearch_cv_variogram_parameters():
    """Check that explicit ``variogram_parameters`` can be grid-searched.

    Two parameter sets for the linear variogram are offered to the 3D
    ordinary kriging estimator; the search must evaluate both and select
    the one with slope 1.0 and nugget 1.0.
    """
    param_dict3d = {
        "method": ["ordinary3d"],
        "variogram_model": ["linear"],
        "variogram_parameters": [{
            'slope': 1.0,
            'nugget': 1.0
        }, {
            'slope': 2.0,
            'nugget': 1.0
        }]
    }

    estimator = GridSearchCV(Krige(), param_dict3d, verbose=True)

    # dummy data (fixed seed so the winning candidate is reproducible)
    seed = np.random.RandomState(42)
    X3 = 400. * (1 + seed.rand(100, 3))
    y = 5 * seed.rand(100)

    # run the gridsearch
    estimator.fit(X=X3, y=y)

    # Expected best parameters.  Compared as a mapping so the check does
    # not depend on the iteration order of the parameter dict.
    expected_best = {'slope': 1.0, 'nugget': 1.0}

    if hasattr(estimator, 'best_params_'):
        assert len(estimator.cv_results_['param_variogram_parameters']) == len(
            param_dict3d["variogram_parameters"])
        assert estimator.best_params_["variogram_parameters"] == expected_best
Example #6
0
def test_gridsearch_cv_variogram_parameters():
    """Debug variant: grid-search two linear variogram settings and print diagnostics.

    Nothing is asserted here; the comparisons against the expected best
    parameters and best score are only printed.
    """
    candidate_variograms = [
        {'slope': 1.0, 'nugget': 1.0},
        {'slope': 2.0, 'nugget': 1.0},
    ]
    param_dict3d = {
        "method": ["ordinary3d"],
        "variogram_model": ["linear"],
        "variogram_parameters": candidate_variograms,
    }

    estimator = GridSearchCV(Krige(), param_dict3d, verbose=True)

    # reproducible dummy data
    rng = np.random.RandomState(42)
    coords = 400. * (1 + rng.rand(100, 3))
    targets = 5 * rng.rand(100)

    # run the gridsearch
    estimator.fit(X=coords, y=targets)

    tried = estimator.cv_results_['param_variogram_parameters']
    print("\n\n###########", tried, "\n\n", len(tried))

    has_score = hasattr(estimator, 'best_score_')
    if has_score:
        print('best_score R² = {:.3f}'.format(estimator.best_score_))
        print('best_params = ', estimator.best_params_)

    # reference values the output is compared against
    expected_params = [1.0, 1.0]
    expected_score = round(Decimal(-0.462479373589), 12)

    if has_score:
        found_score = round(Decimal(estimator.best_score_), 12)
        print(found_score, expected_score)
        print("\n\n best score :", found_score == expected_score)

    if hasattr(estimator, 'best_params_'):
        n_tried = len(estimator.cv_results_['param_variogram_parameters'])
        n_offered = len(param_dict3d["variogram_parameters"])
        print("\nlen param vario :", n_tried == n_offered)

        best_variogram = estimator.best_params_["variogram_parameters"]
        for position, name in enumerate(best_variogram):
            print("\n\negal parm vario :",
                  best_variogram[name] == expected_params[position])
0
def Interpolating_models_ims(time='2013-10-19T22:00:00',
                             var='TD',
                             plot=True,
                             gis_path=gis_path,
                             method='okrig',
                             dem_path=work_yuval / 'AW3D30',
                             lapse_rate=5.,
                             cv=None,
                             rms=None,
                             gridsearch=False):
    """Interpolate one time snapshot of station data onto a lat/lon mesh.

    The station values of ``var`` at ``time`` are written onto the nearest
    nodes of a regular lat/lon mesh, the model selected by ``method`` is
    fitted to those nodes and then evaluated on every node of the mesh.

    Parameters
    ----------
    time : str
        Timestamp forwarded to ``geo_pandas_time_snapshot``; also used (with
        ``'T'`` replaced by a space) as figure title.
    var : str
        Snapshot column to interpolate.  ``'TD'`` additionally enables the
        lapse-rate handling and the value-vs-altitude regression plot.
    plot : bool
        Draw the interpolated field (and, when ``rms`` or ``cv`` is given,
        a predicted-vs-true scatter and a residual histogram).
    gis_path : path-like
        Directory containing ``Israel_and_Yosh.shp`` and
        ``gis_osm_water_a_free_1.shp``.
    method : str
        ``'gp-rbf'``, ``'gp-qr'``, ``'knn'``, ``'svr'``, ``'okrig'`` or
        ``'ukrig'``.
    dem_path : path-like
        Not used by this function.
    lapse_rate : float, ``'auto'`` or None
        Rate (labelled degC/km in the plot).  For ``var == 'TD'`` each
        station value is shifted by ``lapse_rate * alt / 1000`` before
        fitting and the interpolated field is shifted back with the coarse
        DEM.  ``'auto'`` takes the absolute slope of a linear fit of ``TD``
        against altitude.  ``None`` disables the correction and fits on 3D
        geocentric coordinates instead of lon/lat.
    cv : dict or None
        ``{'kfold': n}``, ``{'rkfold': [n_splits, n_repeats]}`` or a dict
        containing the key ``'loo'``.  ``None`` means no manual
        cross-validation (and the estimator's default splitter when
        ``gridsearch`` is set).
    rms : object or None
        When not None and ``cv`` is None, the MSE between the interpolated
        field and the station values is printed.
    gridsearch : bool
        Run a ``GridSearchCV`` over kriging method and variogram model and
        return the fitted search object instead of a map.

    Returns
    -------
    The fitted ``GridSearchCV`` when ``gridsearch`` is true, otherwise a copy
    of the mesh holding the interpolated values.
    """
    # TODO: try 1d modeling first, like T=f(lat)
    from sklearn.gaussian_process import GaussianProcessRegressor
    from sklearn.neighbors import KNeighborsRegressor
    from pykrige.rk import Krige
    import numpy as np
    from sklearn.svm import SVR
    from sklearn.metrics import mean_squared_error
    from aux_gps import coarse_dem
    import seaborn as sns
    import matplotlib.pyplot as plt
    import pyproj
    from pykrige.compat import GridSearchCV
    lla = pyproj.Proj(proj='latlong', ellps='WGS84', datum='WGS84')
    ecef = pyproj.Proj(proj='geocent', ellps='WGS84', datum='WGS84')

    def parse_cv(cv):
        """Translate the ``cv`` dict into an sklearn splitter.

        ``None`` is passed through so that callers can hand the result
        straight to ``GridSearchCV``.  A dict without a known key also
        yields ``None`` (unchanged from the previous behaviour).
        """
        from sklearn.model_selection import KFold
        from sklearn.model_selection import RepeatedKFold
        from sklearn.model_selection import LeaveOneOut
        if cv is None:
            # Previously this crashed on ``None.keys()`` when gridsearch
            # was requested without an explicit cv.
            return None
        if 'kfold' in cv:
            n_splits = cv['kfold']
            print('CV is KFold with n_splits={}'.format(n_splits))
            return KFold(n_splits=n_splits)
        if 'rkfold' in cv:
            n_splits = cv['rkfold'][0]
            n_repeats = cv['rkfold'][1]
            print('CV is ReapetedKFold with n_splits={},'.format(n_splits) +
                  ' n_repeates={}'.format(n_repeats))
            return RepeatedKFold(n_splits=n_splits,
                                 n_repeats=n_repeats,
                                 random_state=42)
        if 'loo' in cv:
            return LeaveOneOut()
        return None

    da = create_lat_lon_mesh(points_per_degree=250)  # 500?
    # Coarse DEM on the mesh, kept as a plain array from here on.
    awd = coarse_dem(da)
    awd = awd.values
    geo_snap = geo_pandas_time_snapshot(var=var, datetime=time, plot=False)
    if var == 'TD':
        # Linear fit of TD against altitude; the slope gives the 'auto' rate.
        [a, b] = np.polyfit(geo_snap['alt'].values, geo_snap['TD'].values, 1)
        if lapse_rate == 'auto':
            lapse_rate = np.abs(a) * 1000
        fig, ax_lapse = plt.subplots(figsize=(10, 6))
        sns.regplot(data=geo_snap,
                    x='alt',
                    y='TD',
                    color='r',
                    scatter_kws={'color': 'b'},
                    ax=ax_lapse)
        suptitle = time.replace('T', ' ')
        ax_lapse.set_xlabel('Altitude [m]')
        ax_lapse.set_ylabel('Temperature [degC]')
        if lapse_rate is not None:
            # Formatting None with '{:.2f}' raises TypeError, so the label
            # is only drawn when a rate is actually in use.
            ax_lapse.text(0.5,
                          0.95,
                          'Lapse_rate: {:.2f} degC/km'.format(lapse_rate),
                          horizontalalignment='center',
                          verticalalignment='center',
                          transform=ax_lapse.transAxes,
                          fontsize=12,
                          color='k',
                          fontweight='bold')
        ax_lapse.grid()
        ax_lapse.set_title(suptitle, fontsize=14, fontweight='bold')

    # Write every station value onto its nearest mesh node.
    use_lapse = lapse_rate is not None and var == 'TD'
    alts = []
    for i, row in geo_snap.iterrows():
        lat = da.sel(lat=row['lat'], method='nearest').lat.values
        lon = da.sel(lon=row['lon'], method='nearest').lon.values
        alt = row['alt']
        if use_lapse:
            da.loc[{'lat': lat, 'lon': lon}] = row[var] + \
                lapse_rate * alt / 1000.0
        else:
            da.loc[{'lat': lat, 'lon': lon}] = row[var]
        alts.append(alt)

    c = np.linspace(min(da.lat.values), max(da.lat.values), da.shape[0])
    r = np.linspace(min(da.lon.values), max(da.lon.values), da.shape[1])
    rr, cc = np.meshgrid(r, c)
    vals = ~np.isnan(da.values)  # nodes that received a station value
    if lapse_rate is None:
        # NOTE(review): ``alts`` is in station-iteration order while
        # ``rr[vals]`` / ``cc[vals]`` are in mesh order, and several stations
        # may fall on the same node -- confirm these arrays line up.
        Xrr, Ycc, Z = pyproj.transform(lla,
                                       ecef,
                                       rr[vals],
                                       cc[vals],
                                       np.array(alts),
                                       radians=False)
        X = np.column_stack([Xrr, Ycc, Z])
        # ``awd`` is already an ndarray here; the former ``awd.values``
        # raised AttributeError.
        XX, YY, ZZ = pyproj.transform(lla,
                                      ecef,
                                      rr,
                                      cc,
                                      awd,
                                      radians=False)
        rr_cc_as_cols = np.column_stack(
            [XX.flatten(), YY.flatten(),
             ZZ.flatten()])
    else:
        X = np.column_stack([rr[vals], cc[vals]])
        rr_cc_as_cols = np.column_stack([rr.flatten(), cc.flatten()])
    y = da.values[vals]

    if method == 'gp-rbf':
        from sklearn.gaussian_process.kernels import RBF
        from sklearn.gaussian_process.kernels import WhiteKernel
        kernel = 1.0 * RBF(length_scale=0.25, length_scale_bounds=(1e-2, 1e3)) \
            + WhiteKernel(noise_level=0.01, noise_level_bounds=(1e-10, 1e+1))
        model = GaussianProcessRegressor(alpha=0.0,
                                         kernel=kernel,
                                         n_restarts_optimizer=5,
                                         random_state=42,
                                         normalize_y=True)
    elif method == 'gp-qr':
        from sklearn.gaussian_process.kernels import RationalQuadratic
        from sklearn.gaussian_process.kernels import WhiteKernel
        kernel = RationalQuadratic(length_scale=100.0) \
            + WhiteKernel(noise_level=0.01, noise_level_bounds=(1e-10, 1e+1))
        model = GaussianProcessRegressor(alpha=0.0,
                                         kernel=kernel,
                                         n_restarts_optimizer=5,
                                         random_state=42,
                                         normalize_y=True)
    elif method == 'knn':
        model = KNeighborsRegressor(n_neighbors=5, weights='distance')
    elif method == 'svr':
        # 'auto_deprecated' was a transitional placeholder that behaved like
        # 'auto' and is rejected by current scikit-learn releases.
        model = SVR(C=1.0,
                    cache_size=200,
                    coef0=0.0,
                    degree=3,
                    epsilon=0.1,
                    gamma='auto',
                    kernel='rbf',
                    max_iter=-1,
                    shrinking=True,
                    tol=0.001,
                    verbose=False)
    elif method == 'okrig':
        model = Krige(method='ordinary',
                      variogram_model='spherical',
                      verbose=True)
    elif method == 'ukrig':
        model = Krige(method='universal',
                      variogram_model='linear',
                      verbose=True)

    if cv is not None and not gridsearch:
        # Manual cross-validation: collect out-of-fold predictions so they
        # can be scored together and plotted further down.
        from sklearn import metrics
        cv = parse_cv(cv)
        ytests = []
        ypreds = []
        for train_idx, test_idx in cv.split(X):
            X_train, X_test = X[train_idx], X[test_idx]  # requires arrays
            y_train, y_test = y[train_idx], y[test_idx]
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            ytests += list(y_test)
            ypreds += list(y_pred)
        true_vals = np.array(ytests)
        predicted = np.array(ypreds)
        r2 = metrics.r2_score(ytests, ypreds)
        ms_error = metrics.mean_squared_error(ytests, ypreds)
        print("R^2: {:.5f}%, MSE: {:.5f}".format(r2 * 100, ms_error))

    if gridsearch:
        cv = parse_cv(cv)
        param_dict = {
            "method": ["ordinary", "universal"],
            "variogram_model": ["linear", "power", "gaussian", "spherical"],
            # "nlags": [4, 6, 8],
            # "weight": [True, False]
        }
        estimator = GridSearchCV(Krige(),
                                 param_dict,
                                 verbose=True,
                                 cv=cv,
                                 scoring='neg_mean_absolute_error',
                                 return_train_score=True,
                                 n_jobs=1)
        estimator.fit(X, y)
        if hasattr(estimator, 'best_score_'):
            print('best_score = {:.3f}'.format(estimator.best_score_))
            print('best_params = ', estimator.best_params_)

        return estimator

    # Final fit on all station nodes, then predict the whole mesh.
    model.fit(X, y)
    interpolated = model.predict(rr_cc_as_cols).reshape(da.values.shape)
    da_inter = da.copy(data=interpolated)
    if lapse_rate is not None and var == 'TD':
        # Undo the altitude shift applied to the station values.
        da_inter -= lapse_rate * awd / 1000.0
    if (rms is not None and cv is None):
        predicted = []
        true_vals = []
        for i, row in geo_snap.iterrows():
            lat = da.sel(lat=row['lat'], method='nearest').lat.values
            lon = da.sel(lon=row['lon'], method='nearest').lon.values
            pred = da_inter.loc[{'lat': lat, 'lon': lon}].values.item()
            true = row[var]
            predicted.append(pred)
            true_vals.append(true)
        predicted = np.array(predicted)
        true_vals = np.array(true_vals)
        ms_error = mean_squared_error(true_vals, predicted)
        print("MSE: {:.5f}".format(ms_error))
    if plot:
        import salem
        from salem import DataLevels, Map
        f, ax = plt.subplots(figsize=(6, 10))
        shdf = salem.read_shapefile(gis_path / 'Israel_and_Yosh.shp')
        shdf.crs = {'init': 'epsg:4326'}
        # Mask the field to the shapefile region but draw on the full grid.
        dsr = da_inter.salem.roi(shape=shdf)
        grid = dsr.salem.grid
        grid = da_inter.salem.grid
        sm = Map(grid)
        sm.set_data(dsr)
        sm.set_shapefile(gis_path / 'gis_osm_water_a_free_1.shp',
                         edgecolor='k')  # , facecolor='aqua')
        sm.set_cmap(cm='rainbow')
        sm.visualize(
            ax=ax,
            title='Israel {} interpolated temperature from IMS'.format(method),
            cbar_title='degC')
        # Overlay the stations, coloured with the same levels as the map.
        dl = DataLevels(geo_snap[var], levels=sm.levels)
        dl.set_cmap(sm.cmap)
        x, y = sm.grid.transform(geo_snap.lon.values, geo_snap.lat.values)
        ax.scatter(x,
                   y,
                   color=dl.to_rgb(),
                   s=20,
                   edgecolors='k',
                   linewidths=0.5)
        suptitle = time.replace('T', ' ')
        f.suptitle(suptitle, fontsize=14, fontweight='bold')
        if (rms is not None or cv is not None) and (not gridsearch):
            f, ax = plt.subplots(1, 2, figsize=(12, 6))
            sns.scatterplot(x=true_vals,
                            y=predicted,
                            ax=ax[0],
                            marker='.',
                            s=100)
            resid = predicted - true_vals
            sns.distplot(resid, bins=5, color='c', label='residuals', ax=ax[1])
            rmean = np.mean(resid)
            rstd = np.std(resid)
            rmedian = np.median(resid)
            rmse = np.sqrt(mean_squared_error(true_vals, predicted))
            plt.axvline(rmean, color='r', linestyle='dashed', linewidth=1)
            _, max_ = plt.ylim()
            plt.text(rmean + rmean / 10, max_ - max_ / 10,
                     'Mean: {:.2f}, RMSE: {:.2f}'.format(rmean, rmse))
            f.tight_layout()
    return da_inter
Example #8
0
"""

import numpy as np
from pykrige.rk import Krige
from pykrige.compat import GridSearchCV

# 2D kriging parameter optimisation: score every combination of kriging
# method and variogram model with GridSearchCV on random dummy data.

# Candidate settings; the commented-out entries are not searched.
param_dict = {
    "method": ["ordinary", "universal"],
    "variogram_model": ["linear", "power", "gaussian", "spherical"],
    # "nlags": [4, 6, 8],
    # "weight": [True, False]
}

estimator = GridSearchCV(Krige(), param_dict, verbose=True)

# dummy data: 100 points with two float features and a target in [0, 5).
# No seed is set, so the data (and the scores) differ from run to run.
X = np.random.randint(0, 400, size=(100, 2)).astype(float)
y = 5 * np.random.rand(100)

# run the gridsearch
estimator.fit(X=X, y=y)

# Report the winning combination and its cross-validated score.
if hasattr(estimator, 'best_score_'):
    print('best_score R² = {:.3f}'.format(estimator.best_score_))
    print('best_params = ', estimator.best_params_)

print('\nCV results::')
if hasattr(estimator, 'cv_results_'):
    for key in [
	def kriging_per_row(all_data_daily_slice):
	    """Fill the missing snow values of every row using 3D kriging.

	    Each row of *all_data_daily_slice* is merged (on the index) with the
	    frame itself, converted from inches to metres and split into rows with
	    and without a value.  When at least two values are known, the missing
	    ones are predicted: as zeros if 90 % or more of the known values are
	    zero, otherwise by 3D kriging with the method / variogram model picked
	    by a grid search (falling back to ordinary 3D kriging with a gaussian
	    variogram when the search or the prediction raises ValueError).  With
	    fewer than two known values the previously produced row is repeated.

	    NOTE(review): the original indentation was mangled -- the result
	    assembly sat inside the ``except`` branch and the fallback ``else`` was
	    attached to the ``for`` loop.  The nesting below was reconstructed from
	    the data flow; please confirm it matches the intended behaviour.

	    Parameters
	    ----------
	    all_data_daily_slice : pandas.DataFrame
	        Iterated row by row; the merged frame is expected to provide the
	        columns ``Longitude_Metres``, ``Latiitude_Metres`` and
	        ``ElevationRelative``.

	    Returns
	    -------
	    pandas.DataFrame
	        One row per processed input row holding ``snow_adj_mters`` for the
	        station ids in ``stn_data.ID``.
	    """
	    param_dict3d = {
	        "method": ["ordinary3d", "universal3d"],
	        "variogram_model": ["linear", "power", "gaussian", "spherical"],
	    }
	    interpolated_values = pd.DataFrame()

	    for index, row_under_observation in all_data_daily_slice.iterrows():
	        row_under_observation = pd.DataFrame(row_under_observation)

	        # merge using station ids as indices
	        snow_amt_with_locn = all_data_daily_slice.merge(
	            row_under_observation, left_index=True, right_index=True)
	        snow_amt_with_locn.rename(columns={index: 'snow_adj_inches'},
	                                  inplace=True)
	        # inches -> metres
	        snow_amt_with_locn['snow_adj_mters'] = (
	            snow_amt_with_locn['snow_adj_inches'] * 0.0254)

	        # rows containing non-null values
	        snow_amt_with_locn_notnull = snow_amt_with_locn.dropna()

	        # rows whose value is missing; a new frame is built instead of
	        # dropping in place on a slice of the merged frame
	        missing_mask = snow_amt_with_locn['snow_adj_inches'].isnull()
	        snow_amount_null = snow_amt_with_locn[missing_mask].drop(
	            ['snow_adj_mters'], axis=1)

	        if snow_amt_with_locn_notnull.shape[0] < 2:
	            # Not enough observations to krige: repeat the previous row.
	            last_row = interpolated_values.tail(1)
	            interpolated_values = pd.concat([interpolated_values, last_row])
	            continue

	        lons = numpy.array(snow_amt_with_locn_notnull['Longitude_Metres'])
	        lons = lons[~numpy.isnan(lons)]
	        lats = numpy.array(snow_amt_with_locn_notnull['Latiitude_Metres'])
	        lats = lats[~numpy.isnan(lats)]
	        elev = numpy.array(snow_amt_with_locn_notnull['ElevationRelative'])
	        snow_amount = numpy.array(
	            snow_amt_with_locn_notnull['snow_adj_mters'])

	        # fraction of known values that are exactly zero
	        zero_count = (snow_amount == 0.0).sum()
	        zero_count_fraction = zero_count / snow_amount.shape[0]

	        if numpy.all(snow_amount == 0.0) or zero_count_fraction >= 0.9:
	            # (Almost) no snow anywhere: predict zeros without kriging.
	            predicted_Values = numpy.zeros(snow_amount_null.shape[0])
	        else:
	            lons_null = numpy.array(snow_amount_null['Longitude_Metres'])
	            lats_null = numpy.array(snow_amount_null['Latiitude_Metres'])
	            elev_null = numpy.array(snow_amount_null['ElevationRelative'])
	            X = numpy.array(snow_amt_with_locn_notnull[
	                ['Longitude_Metres', 'Latiitude_Metres',
	                 'ElevationRelative']])
	            y = numpy.array(snow_amt_with_locn_notnull['snow_adj_mters'])
	            # perform grid search to identify the best fitting variogram
	            estimator = GridSearchCV(Krige(), param_dict3d, verbose=False)

	            try:
	                estimator.fit(X=X, y=y, verbose=False)
	                if hasattr(estimator, 'best_score_'):
	                    print('best_score R²={}'.format(
	                        round(estimator.best_score_, 2)))
	                    print('best_params = ', estimator.best_params_)

	                best_variogram = estimator.best_params_['variogram_model']
	                if estimator.best_params_['method'] == 'universal3d':
	                    kriger = UniversalKriging3D(
	                        lons, lats, elev, snow_amount,
	                        variogram_model=best_variogram)
	                else:
	                    kriger = OrdinaryKriging3D(
	                        lons, lats, elev, snow_amount,
	                        variogram_model=best_variogram)
	                predicted_Values, variance_locn = kriger.execute(
	                    'points', lons_null, lats_null, elev_null)
	            except ValueError:
	                # Fall back to a fixed configuration when the search or
	                # the chosen model cannot be evaluated.
	                sim3d = OrdinaryKriging3D(lons, lats, elev, snow_amount,
	                                          variogram_model='gaussian')
	                predicted_Values, variance_locn = sim3d.execute(
	                    'points', lons_null, lats_null, elev_null)

	        # Combine known and predicted values into one output row.
	        predicted_snow_values = pd.DataFrame(
	            predicted_Values,
	            index=snow_amount_null.index.values.tolist(),
	            columns=['snow_adj_mters'])
	        interplated_df = pd.merge(predicted_snow_values, snow_amount_null,
	                                  left_index=True, right_index=True)
	        final_row = pd.concat([snow_amt_with_locn_notnull, interplated_df])

	        final_row_snow_transpose = final_row[['snow_adj_mters']].T
	        final_row_snow_transpose = final_row_snow_transpose[
	            stn_data.ID.values.tolist()]
	        # DataFrame.append was removed from pandas; pd.concat is the
	        # supported equivalent.
	        interpolated_values = pd.concat(
	            [interpolated_values, final_row_snow_transpose])

	    return interpolated_values
Example #10
0
def KrigeCV(P, lags, date, metric, output):
    """Tune 2D kriging by grid search, krige *P* onto a grid and write it out.

    Parameters
    ----------
    P : 2-D array
        Columns 0 and 1 are the coordinates, column 2 the values.
    lags : list
        Candidate values for the ``nlags`` kriging parameter.
    date, metric :
        Converted with ``str`` and joined as ``<date>_<metric>`` to build the
        output file name; also returned.
    output : str
        ``'ASCII'`` writes an ``.asc`` grid, ``'Shapefile'`` writes a
        ``.shp`` file; any other value only prints a message.

    Returns
    -------
    list
        ``[date, metric, best_score_, best_params_]`` of the grid search.
    """
    # parameter grid used by the grid search
    param_dict = {
        "method": ["ordinary", "universal"],
        "variogram_model":
        ["exponential", "gaussian", "linear", "power", "spherical"],
        "nlags":
        lags,
        "weight": [True, False]
    }
    estimator = GridSearchCV(Krige(), param_dict, verbose=False,
                             cv=2)  # this cv=2 could be adjusted
    X = P[:, 0:2]  # coordinates
    y = P[:, 2]  # values
    estimator.fit(X=X, y=y)
    if hasattr(estimator, 'best_score_'):
        print('best_score R² = {:.3f}'.format(estimator.best_score_))
        print('Optimal Lags: {}'.format(estimator.best_params_['nlags']))
        print('best_params = ', estimator.best_params_)

    # define grid: integer bounding box of the points, sampled every `dist`
    dist = .15
    x_lo, x_hi = math.floor(min(P[:, 0])), math.ceil(max(P[:, 0]))
    y_lo, y_hi = math.floor(min(P[:, 1])), math.ceil(max(P[:, 1]))
    gridx = np.arange(x_lo, x_hi, dist)
    gridy = np.arange(y_lo, y_hi, dist)

    best = estimator.best_params_
    if best['method'] == 'universal':
        kriging_cls = UniversalKriging
    elif best['method'] == 'ordinary':
        kriging_cls = OrdinaryKriging
    else:
        kriging_cls = None

    if kriging_cls is None:
        print('Kriging method not recognized as Universal or Ordinary')
    else:
        # perform kriging with the parameters chosen by the grid search
        model = kriging_cls(
            P[:, 0],
            P[:, 1],
            P[:, 2],
            variogram_model=best['variogram_model'],
            nlags=best['nlags'],
            weight=best['weight'],
            verbose=False,
            enable_plotting=True
        )
        z, ss = model.execute('grid', gridx, gridy)
        if output == 'ASCII':
            filename = str(date) + '_' + str(metric) + '.asc'
            kt.write_asc_grid(gridx, gridy, z, filename=filename)
        elif output == 'Shapefile':
            # These grids were only present as commented-out code, so this
            # branch raised NameError.  They are the kriging grid extended by
            # half a step on both ends.
            # NOTE(review): presumably the cell edges OutputShape expects --
            # confirm against OutputShape.
            gridxShape = np.arange(x_lo - (.5 * dist), x_hi + (.5 * dist),
                                   dist)
            gridyShape = np.arange(y_lo - (.5 * dist), y_hi + (.5 * dist),
                                   dist)
            geo_df = OutputShape(z, gridxShape, gridyShape)
            filename = str(date) + '_' + str(metric) + '.shp'
            geo_df.to_file(filename)
        else:
            print("output parameter must be 'ASCII' or 'Shapefile'. ")

    # R-squared and the chosen parameters, together with the run identifiers
    return [date, metric, estimator.best_score_, estimator.best_params_]
Example #11
0
def kriging_per_row(all_data_daily_slice):
  '''
  Fill in missing per-station snow values, one input row at a time, using 3D kriging.

  Each row of ``all_data_daily_slice`` is joined (on the station-id index) with
  the module-level station table ``daily_adj_snow_stn_gpd``. The row's values
  are treated as inches and converted to metres (x 0.0254). Stations whose
  value is null are then predicted from the stations that have a value:

  * If 90% or more of the known values are exactly 0, the missing values are
    set to 0 and kriging is skipped.
  * Otherwise a grid search over ``param_dict3d`` picks the kriging method
    ('universal3d', else ordinary 3D) and the variogram model, and that model
    predicts the missing stations from (longitude, latitude, elevation).
  * If the grid search or the kriging raises ``ValueError``, ordinary 3D
    kriging with a gaussian variogram is used instead.
  * If fewer than two stations have a value, the previously produced row is
    repeated.

  The accumulated result (one row per input row, columns ordered as
  ``stn_data.ID``) is written to 'f12k.csv' after every processed row.

  Parameters
  ----------
  all_data_daily_slice : pandas.DataFrame
      One row per observation; columns are station ids matching the index of
      ``daily_adj_snow_stn_gpd``. NOTE(review): rows are presumably dates -
      confirm against the caller.
  '''
  interpolated_values = pd.DataFrame()

  for index, row_under_observation in all_data_daily_slice.iterrows():
    # One column (named after the row label), indexed by station id.
    row_frame = pd.DataFrame(row_under_observation)

    # Attach the station coordinates by joining on the station-id index.
    snow_amt_with_locn = daily_adj_snow_stn_gpd.merge(
        row_frame, left_index=True, right_index=True)
    snow_amt_with_locn = snow_amt_with_locn.rename(
        columns={index: 'snow_adj_inches'})
    # Work in metres from here on.
    snow_amt_with_locn['snow_adj_mters'] = (
        snow_amt_with_locn['snow_adj_inches'] * 0.0254)

    # Stations with a complete record (dropna removes any row holding a NaN,
    # so the coordinate arrays built from this frame are already NaN-free).
    snow_amt_with_locn_notnull = snow_amt_with_locn.dropna()

    # Stations to predict. The metres column is dropped so that the predicted
    # column can be merged back in without a name clash; drop() returns a new
    # frame, which avoids mutating a slice of snow_amt_with_locn in place.
    missing_mask = snow_amt_with_locn['snow_adj_inches'].isnull()
    snow_amount_null = snow_amt_with_locn[missing_mask].drop(
        columns=['snow_adj_mters'])

    # Kriging needs at least two known points.
    if snow_amt_with_locn_notnull.shape[0] > 1:
      snow_amount = numpy.array(snow_amt_with_locn_notnull['snow_adj_mters'])
      zero_count_fraction = (snow_amount == 0.0).sum() / snow_amount.shape[0]

      if zero_count_fraction >= 0.9:
        # (Almost) no snow anywhere: fill the gaps with 0 and skip kriging.
        predicted_Values = numpy.zeros(snow_amount_null.shape[0])
      else:
        lons = numpy.array(snow_amt_with_locn_notnull['Longitude_Metres'])
        lats = numpy.array(snow_amt_with_locn_notnull['Latiitude_Metres'])
        elev = numpy.array(snow_amt_with_locn_notnull['ElevationRelative'])

        lons_null = numpy.array(snow_amount_null['Longitude_Metres'])
        lats_null = numpy.array(snow_amount_null['Latiitude_Metres'])
        elev_null = numpy.array(snow_amount_null['ElevationRelative'])

        # Known coordinates grouped into one (n_stations, 3) array for the
        # grid search.
        X = numpy.array(snow_amt_with_locn_notnull[
            ['Longitude_Metres', 'Latiitude_Metres', 'ElevationRelative']])
        y = snow_amount

        estimator = GridSearchCV(Krige(), param_dict3d, verbose=False)

        try:
          # Grid search for the best-fitting method / variogram model.
          estimator.fit(X=X, y=y, verbose=False)
          if hasattr(estimator, 'best_score_'):
            print('best_score R² = {:.3f}'.format(estimator.best_score_))
            print('best_params = ', estimator.best_params_)

          if estimator.best_params_['method'] == 'universal3d':
            kriging_class = UniversalKriging3D
          else:
            kriging_class = OrdinaryKriging3D

          kriging_model = kriging_class(
              lons, lats, elev, snow_amount,
              variogram_model=estimator.best_params_['variogram_model'])
          predicted_Values, _ = kriging_model.execute(
              'points', lons_null, lats_null, elev_null)

        except ValueError:
          # Per the original author: preprocessing can leave coordinates or
          # snow values extremely small or large, producing NaN/inf inside the
          # kriging system. Ordinary kriging with a gaussian variogram did not
          # show this problem, so it is the fallback for these edge cases.
          fallback_model = OrdinaryKriging3D(
              lons, lats, elev, snow_amount, variogram_model='gaussian')
          predicted_Values, _ = fallback_model.execute(
              'points', lons_null, lats_null, elev_null)

      predicted_snow_values = pd.DataFrame(
          predicted_Values,
          index=snow_amount_null.index.values.tolist(),
          columns=['snow_adj_mters'])

      # Re-attach coordinates to the predictions, then stack known + predicted.
      interplated_df = pd.merge(
          predicted_snow_values, snow_amount_null,
          left_index=True, right_index=True)
      final_row = pd.concat([snow_amt_with_locn_notnull, interplated_df])

      # Back to a single wide row, with columns in the order of stn_data.ID.
      final_row_snow_transpose = final_row[['snow_adj_mters']].T
      final_row_snow_transpose = final_row_snow_transpose[
          stn_data.ID.values.tolist()]

      # DataFrame.append was removed in pandas 2.0; pd.concat is the
      # equivalent row-wise append.
      interpolated_values = pd.concat(
          [interpolated_values, final_row_snow_transpose])

    else:
      # Zero or one known station: nothing to krige from, so repeat the
      # previously produced row (a no-op when no row exists yet).
      last_row = interpolated_values.tail(1)
      interpolated_values = pd.concat([interpolated_values, last_row])

    # Persist progress after every processed row.
    interpolated_values.to_csv('f12k.csv')