def test_maxlag_change():
    """Capping maxlag at the last default bin edge gives (almost) the same variogram."""
    coords, values = _get_pan_sample()

    # reference estimate without an explicit maxlag
    reference = skg.Variogram(coords, values)

    # same sample, limited to the upper edge of the reference's last bin
    capped = skg.Variogram(coords, values, maxlag=reference.bins[-1])

    assert_array_almost_equal(
        reference.experimental,
        capped.experimental,
        decimal=1
    )
def test_raster_metric():
    """Exercise RasterEquidistantMetricSpace on a 100x100 grid and feed it to a Variogram."""
    # Generate a gridded dataset
    shape = (100, 100)
    np.random.seed(42)
    vals = np.random.normal(0, 1, size=shape)

    # Coordinates
    x = np.arange(0, shape[0])
    y = np.arange(0, shape[1])
    xx, yy = np.meshgrid(x, y)

    # Flatten everything because we don't care about the 2D at this point
    coords = np.dstack((xx.flatten(), yy.flatten())).squeeze()
    vals = vals.flatten()

    # Run the computation
    rems = skg.RasterEquidistantMetricSpace(coords, shape=shape, extent=(x[0],x[-1],y[0],y[-1]), samples=10, runs=10,
                                            rnd=42, verbose=True)

    # Minimal check of the output
    assert rems.max_dist == pytest.approx(140,rel=0.01)
    assert rems.res == pytest.approx(1, rel=0.0001)
    # use the public class location; the `scipy.sparse.csr` submodule is a
    # deprecated private namespace
    assert isinstance(rems.dists, scipy.sparse.csr_matrix)
    assert rems.dists.shape == (10000, 10000)

    # Check the random state provides the same final center
    assert all(rems._centers[-1] == np.array([62, 52]))

    # Check the interface with a Variogram object works
    V = skg.Variogram(rems, vals)

    assert V.bin_count is not None
    # Check the variogram is always the same with the random state given
    assert V.experimental[0] == pytest.approx(0.89,0.01)

    # Check that the routines are robust to very few data points in the grid (e.g., from nodata values)
    coords_sub = coords[0::1000]
    vals_sub = vals[0::1000]
    rems_sub = skg.RasterEquidistantMetricSpace(coords_sub, shape=shape, extent=(x[0],x[-1],y[0],y[-1]), samples=100, runs=10,
                                                rnd=42)
    V = skg.Variogram(rems_sub, vals_sub)

    # Check with a single isolated point possibly being used as center
    coords_sub = np.concatenate(([coords[0]], coords[-10:]))
    vals_sub = np.concatenate(([vals[0]], vals[-10:]))
    rems_sub = skg.RasterEquidistantMetricSpace(coords_sub, shape=shape, extent=(x[0],x[-1],y[0],y[-1]), samples=100, runs=11,
                                                rnd=42)
    V = skg.Variogram(rems_sub, vals_sub)
def variograms(data, geometries, window=7, N=10, estimator='matheron', maxlag='median', binify='uniform', cm=plt.cm.Reds, styles=['-b', '--k', ':k'], rank=True):
    """Estimate one variogram per moving window for every data set.

    Parameters
    ----------
    data : sequence of pandas.DataFrame
        One frame per data set; rows are windowed, columns are averaged
        within each window.
    geometries : sequence
        One coordinate table per data set; it is re-indexed by the column
        labels that survive the window average.
    window : int
        Number of consecutive rows averaged into one window.
    N : int
        Passed to ``skg.Variogram`` as ``n_lags``.
    estimator : str or callable
        Passed to ``skg.Variogram``. ``'entropy'`` and ``'dispersion'`` are
        replaced by the functions ``entropy_f`` and ``dispersion``.
    maxlag, binify
        Passed to ``skg.Variogram`` as ``maxlag`` and ``bin_func``.
    cm, styles
        Not used by this function; kept for interface compatibility.
    rank : bool
        If True, each row is replaced by its percentile ranks before
        windowing.

    Returns
    -------
    list of list of skg.Variogram
        ``result[j][i]`` is the variogram of window start ``i`` in data set ``j``.
    """
    assert len(data) == len(geometries)

    # override skgstat's entropy method for using global bins
    if estimator == 'entropy':
        estimator = entropy_f
    elif estimator == 'dispersion':
        estimator = dispersion

    result = [[] for _ in range(len(data))]

    for j, (df, geom) in enumerate(zip(data, geometries)):
        n_windows = len(df.index) - window

        # Rank once per data set. Percentile ranking is idempotent, so this
        # yields the same frames as re-ranking before every window.
        if rank and n_windows > 0:
            df = df.rank(axis=1, pct=True)

        # NOTE(review): range() stops before the last full window
        # (start == len - window) - confirm this is intended.
        for start in range(n_windows):
            df_window = df.iloc[start:start + window].mean().dropna()

            # keep only locations that have both coordinates and a value
            c = geom.reindex(df_window.index).dropna()
            values = df_window.reindex(c.index).dropna()

            result[j].append(
                skg.Variogram(coordinates=c.values, values=values.values,
                              n_lags=N, estimator=estimator, maxlag=maxlag,
                              bin_func=binify)
            )

    return result
def test_likelihood():
    """
    Call the likelihood function and make sure that minimizing it moves
    the parameters of the pancake variogram away from the initial guess.
    """
    # variogram from the tutorial
    coords, values = skg.data.pancake(300, seed=42).get('sample')
    vario = skg.Variogram(coords, values, bin_func='scott', maxlag=0.7)

    # objective function for the optimizer
    like = li.get_likelihood(vario)

    # quantities the initial guess and the bounds are derived from
    mean_separation = vario.distance.mean()
    sample_variance = vario.values.var()

    # initial guess
    p0 = np.array([mean_separation, sample_variance, 0.1 * sample_variance])

    # restrict the search space
    bounds = [
        [0, vario.bins[-1]],
        [0, 3 * sample_variance],
        [0, 2.9 * sample_variance],
    ]

    res = minimize(like, p0, bounds=bounds, method='SLSQP')

    # the optimizer must have moved away from the initial guess
    assert not np.allclose(res.x, p0, rtol=1e-3)
def fit(self, setImageHoled):
    """
    Estimate a variogram for each holed image and combine them into averaged
    coefficients that are stored on ``self.cof`` for later use.

    The original author marks this routine as NOT finished.

    param setImageHoled: 3darray, collection of several holed images
    return: first element of the result of ``V.fit`` (also stored as ``self.cof``)
    """
    # NOTE(review): for a 3D stack shape[0] is presumably the number of images,
    # not an image axis - confirm this grid is the intended one.
    y = range(0,setImageHoled.shape[1])
    x = range(0,setImageHoled.shape[0])
    xv,yv = np.meshgrid(x,y)
    for index in range (len(setImageHoled)):
        lx = len(setImageHoled[0])
        ImageHoled = setImageHoled[index].reshape( (lx,lx) )
        # columns: x coordinate, y coordinate, pixel value
        complete = np.concatenate((xv.reshape( (-1,1) ), yv.reshape( (-1,1) ), ImageHoled.reshape( (-1,1) ) ), axis=1 )
        # keep only the pixels with a finite value
        ok = np.isfinite(complete[:,2])
        carte = complete[ok,:]
        coordinates = carte[:,:2]
        values_no_nan = carte[:,2]
        # NOTE(review): as laid out here the accumulator is re-zeroed on every
        # iteration, so nothing is summed across images - confirm.
        coef_plot = np.zeros([len(setImageHoled),2])
        # random row indices (np.random.choice draws with replacement by default)
        nb_sample = np.random.choice(range(carte.shape[0]), self.nbPoints)
        V = skg.Variogram(coordinates = coordinates[nb_sample], values = values_no_nan[nb_sample], normalize=False, model='spherical')
        # NOTE(review): coef_plot has one row per image while V.bins and
        # V.experimental presumably have one entry per lag class - verify shapes.
        coef_plot[:,0] += V.bins
        coef_plot[:,1] += V.experimental
    coef_plot = coef_plot/len(setImageHoled)
    # fitted on the last variogram object created in the loop
    self.cof = V.fit(coef_plot[:,0], coef_plot[:,1])[0]
    V.plot(show = False)
    return self.cof
def test_sparse_matrix_no_warning():
    """Accessing triangular_distance_matrix on a sparse MetricSpace must not fail."""
    # max_dist=5 makes the distance matrix really sparse
    vario = skg.Variogram(skg.MetricSpace(rcoords, max_dist=5), rvals)

    # plain attribute access is the whole test
    vario.triangular_distance_matrix
def test_invalid_dist_func():
    """A Variogram whose dist_func differs from the MetricSpace metric must raise."""
    # instantiate metric space
    ms = skg.MetricSpace(rcoords, dist_metric='euclidean')

    with pytest.raises(AttributeError) as e:
        skg.Variogram(ms, rvals, dist_func='cityblock')

    # Checked after the block: statements following the raising call inside
    # the `with` never run. `in` needs the message text, not the exception.
    assert 'Distance metric' in str(e.value)
def test_dense_matrix_warning():
    """triangular_distance_matrix on a dense MetricSpace must raise a RuntimeWarning."""
    dense = skg.MetricSpace(rcoords)

    # check the warning
    with pytest.raises(RuntimeWarning) as w:
        V = skg.Variogram(dense, rvals)
        V.triangular_distance_matrix

    # Checked after the block: statements following the raising call inside
    # the `with` never run. `in` needs the message text, not the exception.
    assert 'Only available' in str(w.value)
def predict(self, imageHoled, showVariogram = False, calculSp = False, imageMasked = None):
    """
    Calculates the missing values of the 2darray imageHoled using the ordinary kriging method

    param imageHoled: 2darray, NaN entries are the values to estimate
    param showVariogram: boolean, show the experimental and the analytical variogram if true
                         (not read by the active code; only the commented-out plot call mentions it)
    param calculSp: boolean, calculates the standard deviation of each kriged point if true
    param imageMasked: 2darray, only read when calculSp is true; its flattened values
                       are the starting content of the array stored in self.sp
    return: self.Im, the image reshaped to (nx, ny) with the NaN entries replaced
    """
    y = range(0,imageHoled.shape[1])
    x = range(0,imageHoled.shape[0])
    # NOTE(review): np.meshgrid(x, y) yields grids of shape (len(y), len(x)) while
    # imageHoled is flattened as (len(x), len(y)); presumably only consistent for
    # square images - confirm.
    xv,yv = np.meshgrid(x,y)
    ny = len(y)
    nx = len(x)
    # columns: x coordinate, y coordinate, pixel value
    full_image = np.concatenate((xv.reshape( (-1,1) ), yv.reshape( (-1,1) ), imageHoled.reshape( (-1,1) ) ), axis=1 )
    # known pixels (finite) and pixels to estimate (NaN); +/-inf falls in neither mask
    ok = np.isfinite(full_image[:,2])
    nok = np.isnan(full_image[:,2])
    carte = full_image[ok,:]
    coordinates = carte[:,:2]
    values_no_nan = carte[:,2]
    point_nan = full_image[nok,:2]
    # random row indices (np.random.choice draws with replacement by default)
    nb_sample = np.random.choice(range(carte.shape[0]),self.nbPoints)
    # Construction of the theoretical semivariogram using scikit-gstat
    self.V = skg.Variogram(coordinates = coordinates[nb_sample], values = values_no_nan[nb_sample], normalize = False, model = self.model)
    self.V.cof = self.V.fit(self.V.bins, self.V.experimental)[0]
    #self.V.plot(show = False, cof=self.V.cof)  # error with show=showVariogram ?
    A = self._build_A(carte)
    F = np.empty(len(point_nan))
    if calculSp:
        sp2 = np.zeros([len(point_nan)])
        carte_Sp = np.concatenate((xv.reshape( (-1,1) ), yv.reshape( (-1,1) ), imageMasked.reshape( (-1,1) ) ), axis=1 )
    for k in range(len(point_nan)):
        # Euclidean distance from the k-th missing pixel to every known pixel
        D = np.sqrt(np.sum(np.square(point_nan[k]-carte[:,:2]),axis=1))
        # only neighbours closer than self.distance enter the right-hand side
        D_dist = D[D<self.distance]
        B = self._build_B(D_dist)
        # element [1] of _maskDistance is used as the system matrix and element [0]
        # as a row selector below; presumably both are restricted to D < self.distance
        W = np.linalg.solve(self._maskDistance(A,D)[1] ,B)
        if calculSp:
            sp2[k] = sum(W*B)
        # renormalise the weights; the last entry is excluded
        # (presumably the Lagrange multiplier - TODO confirm)
        W = W/np.sum(W[:-1])
        F[k] = np.sum(W[:-1]*carte[self._maskDistance(A,D)[0][:-1],2])
    if calculSp:
        carte_Sp[nok,2] = sp2
        self.sp = np.reshape(carte_Sp[:,2],(nx,ny),order ='C')
    full_image[nok,2] = F
    self.Im = np.reshape(full_image[:,2],(nx,ny),order='C')
    return self.Im
def variogram_nugget(df):
    """Return the nugget of a variogram fitted to *df*, or NaN if it cannot be built.

    All columns but the last are used as coordinates, the last column as
    values. NaN is returned when the first two column labels are equal, when
    the label check below fails, or when constructing the variogram raises.
    """
    if df.columns[0] == df.columns[1]:
        return np.nan

    df = df.dropna()

    # NOTE(review): this looks at the *label* of the first column (distinct
    # characters of its name), not at the column's values - presumably
    # `df[df.columns[0]]` was meant; behaviour kept as-is, confirm.
    if len(set(df.columns[0])) < 2:
        return np.nan

    coordinates = df[df.columns[:-1]]
    values = df[df.columns[-1]]

    try:
        V = skg.Variogram(coordinates=coordinates, values=values)
    except Exception:
        # best effort: any estimation failure maps to NaN, but
        # KeyboardInterrupt / SystemExit are no longer swallowed
        return np.nan

    return V.describe()["nugget"]
def get_spatial_corr(argsin):
    """Estimate experimental variograms of one pack for several maxlag cutoffs.

    Parameters
    ----------
    argsin : tuple
        ``(coords, vals, i, cutoffs, nlags, nmax)``: coordinates, values, pack
        index (only used in the progress message), maxlag cutoffs, number of
        lag classes, and the maximum number of points to use.

    Returns
    -------
    exps, bins, counts : numpy.ndarray
        Arrays of shape ``(len(cutoffs), nlags)``, one row per cutoff. All
        three are entirely NaN if any variogram could not be built.
    """
    coords, vals, i, cutoffs, nlags, nmax = argsin

    # thin the sample without replacement when it is larger than nmax
    if len(coords) > nmax:
        subset = np.random.choice(len(coords), nmax, replace=False)
        coords = coords[subset]
        vals = vals[subset]

    print('Drawing variograms for pack ' + str(i + 1) + ' with ' + str(len(coords)) + ' points.')

    arr_shape = (len(cutoffs), nlags)
    exps, bins, counts = (np.full(arr_shape, np.nan) for _ in range(3))

    for row, cutoff in enumerate(cutoffs):
        try:
            # commented "ignoring maxlag" in skgstat/binning.py
            V = skg.Variogram(coordinates=coords, values=vals, n_lags=nlags,
                              maxlag=cutoff, normalize=False,
                              model='exponential')
        except Exception:
            # one failing cutoff invalidates the whole pack
            return tuple(np.full(arr_shape, np.nan) for _ in range(3))

        # pad the per-class pair counts with zeros up to nlags
        count = np.zeros(nlags)
        tmp_count = np.fromiter((g.size for g in V.lag_classes()), dtype=int)
        count[0:len(tmp_count)] = tmp_count

        exps[row, :] = V.experimental
        bins[row, :] = V.bins
        counts[row, :] = count

    return exps, bins, counts
).transform_filter(
    (alt.datum.state != "PR") & (alt.datum.state != "VI")
).add_selection(
    select_city
)

# Layer background, connections and points into one chart and write it as HTML
cht = (background + connections + points).configure_view(stroke=None)
cht.save("/cdn/altair.html")

from IPython.display import HTML
HTML("<iframe src=altair.html width=900 height=600>")

# Per-origin flight count and number of flights with a positive departure delay
flights_delayed = spark.sql("SELECT Origin, count(1) as total, sum(case when DepDelay > 0 then 1 else 0 end) as delayed from flights group by Origin").toPandas()
# Share of delayed departures per origin
delayed_fraction = flights_delayed["delayed"].astype("float64") / flights_delayed["total"]

import matplotlib.pyplot as plt
import numpy as np
#plt.figure()
#plt.hist(delayed_fraction, bins=100)

import pandas as pd
# Left join keeps every origin, including those without a match in `airports`
airports_with_delays = pd.merge(flights_delayed, airports, left_on=['Origin'], right_on=['iata'], how='left')

# Investigate the spatial dependence of propensity to delay
# with a variogram. The variogram doesn't show strong evidence
# of spatial dependence.
import skgstat
# Coordinates are stacked as (long, lat) columns. Note that `v` holds the
# return value of .plot(), not the Variogram instance.
v = skgstat.Variogram(np.vstack([airports_with_delays["long"], airports_with_delays["lat"]]).transpose(), delayed_fraction).plot()
#
# .. math::
#
#     \gamma (h) = \frac{1}{2N(h)} * \sum_{i=1}^{N(h)}(Z(x_i) - Z(x_{i + h}))^2
#
# For more details, please refer to the `User Guide <https://mmaelicke.github.io/scikit-gstat/userguide/variogram.html#experimental-variograms>`_.
#
# The :class:`Variogram <skgstat.Variogram>` class takes at least two arguments.
# The :func:`coordinates <skgstat.Variogram.coordinates>` and the :func:`values <skgstat.Variogram.values>` observed at these locations.
# If you use older versions, you should also explicitly set the ``normalize`` parameter, as its default value changed to ``False`` in
# version `0.2.8`. This attribute affects only the plotting, not the variogram values.
# Additionally, the number of bins is set to 15, because we have fairly many observations and the default value of 10 is unnecessarily small.
# The ``maxlag`` sets the maximum distance for the last bin. As we have no other additional information about expected correlation lengths,
# we can either set nothing and use the full distance matrix, or set it e.g. to ``'median'``, which will restrict the distance matrix to only
# use half of all combinations.
V = skg.Variogram(coords, vals.flatten(), maxlag='median', n_lags=15, normalize=False)
fig = V.plot(show=False)

# %%
# The upper subplot shows the histogram for the count of point-pairs in each lag class. You can see various things here:
#
# * As expected, there is a clear spatial dependency, because semi-variance increases with distance (blue dots)
# * The default `spherical` variogram model is well fitted to the experimental data
# * The shape of the dependency is **not** captured quite well, but fair enough for this example
# * The first two bins are not well captured, suggesting either the use of a nugget, or a different model.
#
# The sill of the variogram should correspond with the field variance.
# The field is unknown, but we can compare the sill to the *sample* variance:
print('Sample variance: %.2f Variogram sill: %.2f' % (vals.flatten().var(), V.describe()['sill']))

# %%
# The ``describe`` method will return the most important parameters as a dictionary.
# %% # make a nice overview of the samples fig, axes = plt.subplots(1, 3, figsize=(18, 5)) for data, ax in zip((data1, data2, data3), axes.flatten()): plot_scatter(data, ax) # %% # 2.2 Comparing estimators # ------------------------ # calculate all variograms, but use only the second one V1 = skg.Variogram(data2[['x', 'y']].values, data2.v.values, normalize=False, n_lags=8, use_nugget=True) V2 = skg.Variogram(data2[['x', 'y']].values, data2.v.values, normalize=False, n_lags=15, use_nugget=True) V3 = skg.Variogram(data2[['x', 'y']].values, data2.v.values, normalize=False, n_lags=25, use_nugget=True) vario = V2
# Load a artificial random field, generated from a Gaussian covariance function, # with a 2x larger range in x-axis direction: ac, av = skg.data.aniso(N=300, seed=42).get('sample') # %% # Load the TERENO soil temperature data from Fersch et al. (2020): with open('./data/tereno_fendt/tereno.json', 'r') as js: data_obj = json.load(js) coords = np.array(data_obj['coordinates']) vals = np.array(data_obj['values']) print(data_obj['description']) # %% # Estimate a variogram, with a few more lag classes, as there are enough observation points available. V = skg.Variogram(c, v, n_lags=25) print(V) # %% # Estimate the directional variogram with a few more lag classes and an azimuth # of 90°. The tolerance is set rather low to illustrate the graphs better # (fewer point connections.): DV = skg.DirectionalVariogram(ac, av, n_lags=20, azimuth=40., tolerance=15.0) print(DV) # %% # Estimate the spatio-temporal variogram with a product-sum model. # Only every 6th hour is taken into account to decrease the memory footprint. # If you use the full dataset, you need ^120 GiB RAM. # The marginal variograms are kept as they are. STV = skg.SpaceTimeVariogram(coords,
coords = np.column_stack((x, y)) # %% # In the example, :any:`gstools.variogram.vario_estimate` is used to estimate the empirical variogram: # # .. code-block:: python # # # estimate the variogram of the field # bin_center, gamma = gs.vario_estimate((x, y), field) # # # Here, we can use :class:`skg.Variogram <skgstat.Variogram>`. # From the shown arguments, :func:`estimator <skgstat.Variogram.estimator>` and # :func:`bin_func <skgstat.Variogram.bin_func>` are using the default values: V = skg.Variogram(coords, field, n_lags=21, estimator='matheron', maxlag=45, bin_func='even') bin_center, gamma = V.get_empirical(bin_center=True) # %% # And finally, the exact same code from the GSTools docs can be called: # fit the variogram with a stable model. (no nugget fitted) fit_model = gs.Stable(dim=2) fit_model.fit_variogram(bin_center, gamma, nugget=False) # %% # Output the model ax = fit_model.plot(x_max=max(bin_center)) ax.scatter(bin_center, gamma) print(fit_model)
                         name='samples'))
fig.update_layout(width=450, height=450, template='plotly_white')
fig

# %%
# 5.2 Lag class binning - fixed ``N``
# -----------------------------------
# Apply different lag class binning methods and visualize their histograms.
# In this section, the distance matrix between all point pair combinations
# ``(NxN)`` is binned using each method. The plots visualize the histogram of
# the distance matrix of the variogram, **not** the variogram lag classes
# themselves.
N = 15

# use a nugget
V = skg.Variogram(coords, vals, n_lags=N, use_nugget=True)

# %%
# 5.2.1 default :func:`'even' <skgstat.binning.even_width_lags>` lag classes
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# The default binning method will find ``N`` equidistant bins. This is the
# default behavior and used in almost all geostatistical publications.
# It should not be used without a ``maxlag`` (like done in the plot below).

# apply binning
bins, _ = skg.binning.even_width_lags(V.distance, N, None)  # third argument None: presumably "no maxlag" - confirm

# get the histogram
count, _ = np.histogram(V.distance, bins=bins)

fig = go.Figure(go.Bar(x=bins, y=count),
warnings.filterwarnings('ignore')

# %%
# We use the pancake dataset, sampled at 300 random locations to produce a quite dense sample.
c, v = skg.data.pancake(N=300, seed=42).get('sample')

# %%
# First off, the variogram is calculated. We use Scott's rule to determine
# the number of lag classes, explicitly set Trust-Region Reflective as
# fitting method (although it is the default) and limit the distance matrix to
# 70% of the maximum separating distance.
# Additionally, we capture the processing time for the whole variogram
# estimation. Note that this also includes the calculation of the
# distance matrix, which is a mutual step.
t1 = time()
V = skg.Variogram(c,v, bin_func='scott', maxlag=0.7, fit_method='trf')
t2 = time() # get time for full analysis, including fit
print(f"Processing time: {round((t2 - t1) * 1000)} ms")
print(V)
fig = V.plot()

# %%
# Maximum likelihood using SciKit-GStat
# -------------------------------------
# First step to perform the fitting is to make initial guesses for the parameters.
# Here, we take the mean separating distance for the effective range, the sample
# variance for the sill and 10% of the sample variance for the nugget.
# To improve performance and runtime, we also define a boundary to restrict
# the parameter space.

# base initial guess on separating distance and sample variance
def interpolate_arrays(param_name, variograms, positions, values, new_positions, variogram_args=None, kriging_args=None):
    """Helper function for interpolate() that generates, stores and uses
    variograms the same way, but uses data from explicitly supplied
    arrays for both generating the variogram and for kriging.

    Parameters
    -----------
    param_name : str
        Name of variogram to use
    variograms : pandas.DataFrame
        Dataframe with a column "variogram" (dtype object) containing
        variogram descriptions and an index containing variogram names.
        This dataframe can be empty (or not have a row for the requested
        name), and if so will be populated with a new variogram.
    positions : np.ndarray[N,2]
        Positions to generate variogram for.
    values : np.ndarray[N]
        Values to generate variogram for.
    new_positions : np.ndarray[M,2]
        Positions to interpolate (krige) values to.
    variogram_args : dict, optional
        Extra arguments to skgstat.Variogram
    kriging_args : dict, optional
        Extra arguments to skgstat.OrdinaryKriging

    Returns
    -------
    values : np.ndarray[M]
        Interpolated values. All NaN if ``values`` is empty or all NaN; the
        constant itself if all non-NaN input values are equal.
    variance : np.ndarray[M]
        Kriging variance (``kriging.sigma``); NaN or 0 for the two
        short-circuit cases above.
    """
    # None instead of shared mutable default dicts
    variogram_args = {} if variogram_args is None else variogram_args
    kriging_args = {} if kriging_args is None else kriging_args

    n_new = len(new_positions)

    # Nothing to interpolate from. `.all()` also covers empty input, where the
    # previous `.min()` raised.
    if np.isnan(values).all():
        return np.full(n_new, np.nan), np.full(n_new, np.nan)
    # A constant field needs no kriging: same value everywhere, zero variance.
    if np.nanmin(values) == np.nanmax(values):
        return np.full(n_new, np.nanmax(values)), np.full(n_new, 0)

    if param_name not in variograms.index:
        logger.debug("...Generating variogram for %s...", param_name)
        variogram = skgstat.Variogram(positions, values, **variogram_args)
        # store a description next to the experimental values
        desc = variogram.describe()
        desc["experimental"] = list(variogram.experimental)
        desc["bins"] = list(variogram.bins)
        variograms.loc[param_name] = {
            "variogram": {
                "type": "skgstat.Variogram",
                "values": desc
            }
        }
    else:
        # reuse the stored description instead of re-estimating
        variogram = variograms.loc[param_name, "variogram"]["values"]

    kriging = skgstat.OrdinaryKriging(variogram, coordinates=positions, values=values, **kriging_args)
    values = kriging.transform(new_positions[:, 0], new_positions[:, 1])
    variance = kriging.sigma
    return values, variance
def variogram(
    coordinates: np.ndarray,
    values: np.ndarray,
    estimator: typing.Estimator = 'matheron',
    model: typing.Model = 'spherical',
    dist_func: typing.DistFunc = 'euclidean',
    bin_func: typing.BinFunc = 'even',
    fit_method: typing.FitMethod = 'trf',
    fit_sigma: typing.FitSigma = None,
    use_nugget: bool = False,
    maxlag: typing.Maxlag = None,
    n_lags: typing.Union[int, None] = 10,
    return_type: typing.Literal['object', 'describe', 'plot', 'distance_difference', 'location_trend', 'scattergram'] = 'object',
    **kwargs
) -> skg.Variogram:
    """
    Estimate a Variogram with scikit-gstat and return it in the requested form.

    All arguments except ``return_type`` are handed to
    :class:`Variogram <skgstat.Variogram>`; see that class for their meaning.

    Parameters
    ----------
    return_type : str
        ``'object'`` returns the Variogram instance, ``'describe'`` the
        dictionary produced by its ``describe`` method. Any other value is
        forwarded as ``plot_type`` to the variogram plot function.

    Returns
    -------
    variogram : skgstat.Variogram
        If the return type is `'object'`
    plot : plotly.Figure, matplotlib.pyplot.Figure
        If the return type is one of the plots
    description : dict
        If the return type is `'describe'`

    """
    vario = skg.Variogram(
        coordinates=coordinates,
        values=values,
        estimator=estimator,
        model=model,
        dist_func=dist_func,
        bin_func=bin_func,
        fit_method=fit_method,
        fit_sigma=fit_sigma,
        use_nugget=use_nugget,
        maxlag=maxlag,
        n_lags=n_lags,
        **kwargs
    )

    if return_type == 'describe':
        return vario.describe(short=False, flat=False)

    if return_type != 'object':
        # every remaining return type is a plot; kwargs double as plot arguments
        make_plot = plot_function_loader('variogram')
        return make_plot(
            func_args={'variogram': vario, 'plot_type': return_type},
            plot_args=kwargs
        )

    return vario
plt.colorbar(art, ax=ax) # run fig, axes = plt.subplots(1, 3, figsize=(18, 5)) for data, ax in zip((data1, data2, data3), axes.flatten()): plot_scatter(data, ax) # %% # 3.2 Comparing theoretical models # -------------------------------- # One of the features of :mod:`skgstat` is the fact that it is programmed object oriented. # That means, we can just instantiate a :class:`Variogram <skgstat.Variogram>` object # and start changing arguments until it models spatial dependency in our observations well. V1 = skg.Variogram(data1[['x', 'y']].values, data1.v.values, maxlag='median', normalize=False) V1.plot(show=False) # %% # Plot the others as well V2 = skg.Variogram(data2[['x', 'y']].values, data2.v.values, maxlag='median', normalize=False) V3 = skg.Variogram(data3[['x', 'y']].values, data3.v.values, maxlag='median', normalize=False) fig, _a = plt.subplots(1, 3, figsize=(12, 3), sharey=True)