def generic_gridder(day, df, idx):
    """Generic gridding algorithm for easy variables"""
    data = df[idx].values
    coordinates = (df["lon"].values, df["lat"].values)
    region = [XAXIS[0], XAXIS[-1], YAXIS[0], YAXIS[-1]]
    projection = pyproj.Proj(proj="merc", lat_ts=df["lat"].mean())
    spacing = 0.5
    chain = vd.Chain([
        ("mean", vd.BlockReduce(np.mean, spacing=spacing * 111e3)),
        ("spline", vd.Spline(damping=1e-10, mindist=100e3)),
    ])
    train, test = vd.train_test_split(
        projection(*coordinates), data, random_state=0
    )
    chain.fit(*train)
    score = chain.score(*test)
    shape = (len(YAXIS), len(XAXIS))
    grid = chain.grid(
        region=region,
        shape=shape,
        projection=projection,
        dims=["latitude", "longitude"],
        data_names=["precip"],
    )
    res = grid.to_array()
    res = np.ma.where(res < 0, 0, res)
    print(
        "%s %s rows for %s column min:%.3f max:%.3f score: %.3f"
        % (day, len(df.index), idx, np.nanmin(res), np.nanmax(res), score)
    )
    return masked_array(res, mpunits("inch"))
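# A minimal, hypothetical usage sketch of generic_gridder above. The grid axes,
# the DataFrame, and the module-level imports are assumptions made for
# illustration (mpunits is assumed to be metpy.units.units), not part of the
# original code:
import numpy as np
import pandas as pd
import pyproj
import verde as vd
from metpy.units import masked_array, units as mpunits

XAXIS = np.arange(-104.0, -80.0, 0.25)  # hypothetical longitude axis
YAXIS = np.arange(36.0, 50.0, 0.25)  # hypothetical latitude axis

rng = np.random.default_rng(42)
obs = pd.DataFrame(
    {
        "lon": rng.uniform(XAXIS[0], XAXIS[-1], 200),
        "lat": rng.uniform(YAXIS[0], YAXIS[-1], 200),
        "precip": rng.gamma(2.0, 0.1, 200),  # fake rainfall, in inches
    }
)
gridded = generic_gridder("2020-01-01", obs, "precip")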
def validation(sample_block_size=500, test_size=0.1):
    begin = process_time()
    print("model validation begin")
    train, test = vd.train_test_split(
        coordinates,
        dados[feature],
        test_size=test_size,
        spacing=sample_block_size,
    )
    chain.fit(*train)
    score = chain.score(*test)
    print(score)
    timelapse(begin, "model validation")
    return score
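# validation() relies on several module-level names. A minimal sketch of what
# those could look like; every definition below is an assumption made for
# illustration, not the original setup:
from time import process_time

import numpy as np
import pandas as pd
import verde as vd


def timelapse(begin, label):
    # hypothetical helper: report elapsed CPU time for a labelled step
    print("%s took %.1f s" % (label, process_time() - begin))


rng = np.random.default_rng(0)
dados = pd.DataFrame(
    {
        "x": rng.uniform(0, 1e5, 300),  # projected coordinates, in meters
        "y": rng.uniform(0, 1e5, 300),
        "depth": rng.normal(size=300),  # fake observations
    }
)
feature = "depth"  # hypothetical column to validate
coordinates = (dados["x"].values, dados["y"].values)
chain = vd.Chain(
    [
        ("mean", vd.BlockReduce(np.mean, spacing=500)),
        ("spline", vd.Spline()),
    ]
)
score = validation(sample_block_size=500, test_size=0.1)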
# Fetch the wind speed data from Texas.
data = vd.datasets.fetch_texas_wind()
print(data.head())

# Separate out some of the data into utility variables
coordinates = (data.longitude.values, data.latitude.values)
region = vd.get_region(coordinates)
# Use a Mercator projection because Spline is a Cartesian gridder
projection = pyproj.Proj(proj="merc", lat_ts=data.latitude.mean())

# Split the data into a training and testing set. We'll fit the gridder on the training
# set and use the testing set to evaluate how well the gridder is performing.
train, test = vd.train_test_split(
    projection(*coordinates),
    (data.wind_speed_east_knots, data.wind_speed_north_knots),
    random_state=2,
)

# We'll make a 20 arc-minute grid
spacing = 20 / 60

# Chain together a blocked mean to avoid aliasing, a polynomial trend (Spline usually
# requires de-trended data), and finally a Spline for each component. Notice that
# BlockReduce can work on multicomponent data without the use of Vector.
chain = vd.Chain(
    [
        ("mean", vd.BlockReduce(np.mean, spacing * 111e3)),
        ("trend", vd.Vector([vd.Trend(degree=1) for i in range(2)])),
        (
            "spline",
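# The chain above breaks off mid-definition. Independently of how it ends, the
# comment about BlockReduce deserves a demonstration: its filter method accepts
# a tuple of data arrays and reduces each component per block. A minimal sketch,
# assuming the snippet above has already run:
reducer = vd.BlockReduce(np.mean, spacing * 111e3)
block_coords, block_data = reducer.filter(
    projection(*coordinates),
    (data.wind_speed_east_knots, data.wind_speed_north_knots),
)
print(len(block_data))  # 2: one block-reduced array per component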
#
# We can't evaluate a gridder on the data that went into fitting it. The true test of a
# model is if it can correctly predict data that it hasn't seen before. scikit-learn has
# the :func:`sklearn.model_selection.train_test_split` function to separate a dataset
# into two parts: one for fitting the model (called *training* data) and a separate one
# for evaluating the model (called *testing* data). Using it with spatial data would
# involve some tedious array conversions so Verde implements
# :func:`verde.train_test_split` which does the same thing but takes coordinates and
# data arrays instead.
#
# The split is done randomly so we specify a seed for the random number generator to
# guarantee that we'll get the same result every time we run this example. You probably
# don't want to do that for real data. We'll keep 30% of the data to use for testing.

train, test = vd.train_test_split(
    proj_coords, data.air_temperature_c, test_size=0.3, random_state=0
)

print(train)
print(test)

plt.figure(figsize=(8, 6))
ax = plt.axes()
ax.set_title("Air temperature measurements for Texas")
ax.plot(train[0][0], train[0][1], ".r", label="train")
ax.plot(test[0][0], test[0][1], ".b", label="test")
ax.legend()
ax.set_aspect("equal")
plt.tight_layout()
plt.show()

########################################################################################
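# The step that naturally follows the split above: fit a gridder on the
# training set and score it on the testing set. A minimal sketch (the plain
# Spline with default parameters is an assumption for illustration):
spline = vd.Spline()
spline.fit(*train)
print("R² score on unseen data: {:.3f}".format(spline.score(*test)))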
# vary smoothly but have different uncertainties.
spacing = 5 / 60  # 5 arc-minutes

chain = vd.Chain(
    [
        ("mean", vd.BlockMean(spacing=spacing * 111e3, uncertainty=True)),
        ("spline", vd.Spline(damping=1e-10)),
    ]
)
print(chain)

# Split the data into a training and testing set. We'll use the training set to grid the
# data and the testing set to validate our spline model. Weights need to be
# 1/uncertainty**2 for the error propagation in BlockMean to work.
train, test = vd.train_test_split(
    projection(*coordinates),
    data.velocity_up,
    weights=1 / data.std_up ** 2,
    random_state=0,
)

# Fit the model on the training set
chain.fit(*train)

# And calculate an R^2 score coefficient on the testing set. The best possible score
# (perfect prediction) is 1. This can tell us how good our spline is at predicting data
# that was not in the input dataset.
score = chain.score(*test)
print("\nScore: {:.3f}".format(score))

# Create a grid of the vertical velocity and mask it to only show points close to the
# actual data.
region = vd.get_region(coordinates)
grid_full = chain.grid(
    region=region,
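# The grid call above breaks off mid-statement. A minimal sketch of how it is
# typically completed and masked, per the comment about showing only points
# near the data (the data_names and maxdist values here are assumptions):
grid_full = chain.grid(
    region=region,
    spacing=spacing,
    projection=projection,
    dims=["latitude", "longitude"],
    data_names=["velocity"],
)
grid = vd.distance_mask(
    coordinates,
    maxdist=spacing * 2 * 111e3,
    grid=grid_full,
    projection=projection,
)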
def interp(df, mask, var='biomass', spacing=4000):
    """
    Grid a set of lat/lon points to a grid defined by mask

    Parameters
    ----------
    df : pd.DataFrame
        Data points to be gridded in the form of a Pandas DataFrame with
        columns ``lat``, ``lon``, and ``var``.
    mask : xr.DataArray
        Target grid definition. Must include a pyproj parsable crs attribute
        (e.g. ``mask.attrs['crs']``). Data should be between 0 and 1.
    var : str
        Name of column in df to grid.
    spacing : float
        Grid spacing in units defined by the mask's crs.

    Returns
    -------
    grid : xr.DataArray
        Gridded data from df.
    """
    import verde as vd

    # extract the projection and grid info
    region = [mask.x.data[0], mask.x.data[-1], mask.y.data[-1], mask.y.data[0]]
    projection = pyproj.Proj(mask.attrs['crs'])
    coordinates = (df.lon.values, df.lat.values)
    proj_coords = projection(*coordinates)

    # split for validation... this may belong outside of this function
    train, test = vd.train_test_split(
        projection(*coordinates),
        df[var],
        random_state=RANDOM_SEED,
    )

    # fit the gridder
    chain = vd.Chain(
        [
            ('mean', vd.BlockReduce(np.mean, spacing=spacing * 0.25, region=region)),
            ('nearest', vd.ScipyGridder(method='linear')),
        ]
    )
    chain.fit(*train)
    # y_pred = chain.predict(test[0])
    # fit_score = score(test[1][0], y_pred)

    # make the grid
    grid = chain.grid(spacing=spacing, region=region, data_names=[var], dims=('y', 'x'))
    grid = vd.distance_mask(
        proj_coords,
        maxdist=4 * spacing,
        grid=grid,
    )
    grid = np.flipud(grid[var]) * mask
    grid.name = var
    return grid
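# A hypothetical usage sketch of interp above. The mask geometry, the CRS
# string, the data values, and RANDOM_SEED are all made up for illustration:
import numpy as np
import pandas as pd
import pyproj
import xarray as xr

RANDOM_SEED = 42  # interp expects this at module level

x = np.arange(-400e3, 100e3, 4000)
y = np.arange(1400e3, 700e3, -4000)  # descending y, matching the flipud above
mask = xr.DataArray(
    np.ones((y.size, x.size)),
    coords={"y": y, "x": x},
    dims=("y", "x"),
    attrs={"crs": "+proj=aea +lat_1=29.5 +lat_2=45.5 +lat_0=23 +lon_0=-96 +datum=WGS84"},
)
rng = np.random.default_rng(RANDOM_SEED)
df = pd.DataFrame(
    {
        "lon": rng.uniform(-100, -95, 500),
        "lat": rng.uniform(30, 35, 500),
        "biomass": rng.uniform(0, 1, 500),
    }
)
grid = interp(df, mask, var="biomass")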
import numpy as np
import pyproj
import verde as vd
import erizo as ez

# Fetch the GPS data from the U.S. West coast that is shipped with Verde. We'll
# grid only the horizontal components of the velocities
data = vd.datasets.fetch_california_gps()
coordinates = (data.longitude.values, data.latitude.values)
region = vd.get_region(coordinates)

# Use a Mercator projection because Elastic2D is a Cartesian gridder
projection = pyproj.Proj(proj="merc", lat_ts=data.latitude.mean())

# Split the data into a training and testing set. We'll fit the gridder on the
# training set and use the testing set to evaluate how well the gridder is
# performing.
train, test = vd.train_test_split(
    projection(*coordinates),
    (data.velocity_east, data.velocity_north),
    random_state=0,
)

# We'll make a 10 arc-minute grid in the end.
spacing = 10 / 60

# Chain together a blocked mean to avoid aliasing, a polynomial trend to take
# care of the increase toward the coast, and finally the vector gridder using
# Poisson's ratio 0.5 to couple the two horizontal components.
chain = vd.Chain([
    ("mean", vd.BlockReduce(np.mean, spacing * 111e3)),
    ("trend", vd.Vector([vd.Trend(degree=1) for i in range(2)])),
    ("spline", ez.Elastic2D(poisson=0.5, mindist=10e3)),
])

# Fit on the training data
chain.fit(*train)
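# What typically follows the fit: score on the testing set and produce a
# geographic grid of both velocity components. A minimal sketch (the dims and
# data_names below are assumptions, not from the original script):
score = chain.score(*test)
print("R² score on testing set: {:.3f}".format(score))
grid = chain.grid(
    region=region,
    spacing=spacing,
    projection=projection,
    dims=["latitude", "longitude"],
    data_names=["east_velocity", "north_velocity"],
)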
spacing = 15 / 60

# Now we can chain a blocked mean and spline together. The Spline can be regularized
# by setting the damping coefficient (should be positive). It's also a good idea to set
# the minimum distance to the average data spacing to avoid singularities in the spline.
chain = vd.Chain([
    ("mean", vd.BlockReduce(np.mean, spacing=spacing * 111e3)),
    ("spline", vd.Spline(damping=1e-10, mindist=100e3)),
])
print(chain)

# We can evaluate model performance by splitting the data into a training and testing
# set. We'll use the training set to grid the data and the testing set to validate our
# spline model.
train, test = vd.train_test_split(
    projection(*coordinates), data.air_temperature_c, random_state=0
)

# Fit the model on the training set
chain.fit(*train)

# And calculate an R^2 score coefficient on the testing set. The best possible score
# (perfect prediction) is 1. This can tell us how good our spline is at predicting data
# that was not in the input dataset.
score = chain.score(*test)
print("\nScore: {:.3f}".format(score))

# Now we can create a geographic grid of air temperature by providing a projection
# function to the grid method and mask points that are too far from the observations
grid_full = chain.grid(
    region=region,
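# A single random split can be lucky or unlucky. A minimal sketch of k-fold
# cross-validation with verde.cross_val_score as a more robust check, assuming
# the names above are defined (the default CV strategy is used):
scores = vd.cross_val_score(chain, projection(*coordinates), data.air_temperature_c)
print("k-fold scores:", scores)
print("Mean score: {:.3f}".format(np.mean(scores)))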
import matplotlib.pyplot as plt
import cartopy.crs as ccrs
import verde as vd

# Let's split the Baja California shipborne bathymetry data
data = vd.datasets.fetch_baja_bathymetry()
coordinates = (data.longitude, data.latitude)
values = data.bathymetry_m

# Assign 20% of the data to the testing set.
test_size = 0.2

# Split the data randomly into training and testing. Set the random state
# (seed) so that we get the same result if running this code again.
train, test = vd.train_test_split(
    coordinates, values, test_size=test_size, random_state=123
)
# train and test are tuples = (coordinates, data, weights).
print("Train and test size for random splits:", train[0][0].size, test[0][0].size)

# A different strategy is to first assign the data to blocks and then split the
# blocks randomly. To do this, specify the size of the blocks using the
# 'spacing' argument.
train_block, test_block = vd.train_test_split(
    coordinates,
    values,
    spacing=10 / 60,
    test_size=test_size,
    random_state=213,
)
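# Why blocked splits matter: with a purely random split, testing points sit
# right next to training points, which tends to inflate the score. A minimal
# sketch comparing the two splits with the same pipeline (the chain below is an
# assumption; it block-reduces first so the spline fit stays small, and fits
# directly on geographic coordinates for brevity):
import numpy as np

chain = vd.Chain(
    [
        ("mean", vd.BlockReduce(np.mean, spacing=0.5)),
        ("spline", vd.Spline()),
    ]
)
chain.fit(*train)
print("Random split score: {:.3f}".format(chain.score(*test)))
chain.fit(*train_block)
print("Blocked split score: {:.3f}".format(chain.score(*test_block)))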
########################################################################################
# Gridding
# --------
#
# You can use :class:`verde.Vector` to create multi-component gridders out of
# :class:`verde.Spline` the same way as we did for trends. In this case, each component
# is treated separately.
#
# We can start by splitting the data into training and testing sets (see
# :ref:`model_selection`). Notice that :func:`verde.train_test_split` works for
# multicomponent data automatically.
train, test = vd.train_test_split(
    coordinates=proj_coords,
    data=(data.velocity_east, data.velocity_north),
    weights=(1 / data.std_east**2, 1 / data.std_north**2),
    random_state=1,
)

########################################################################################
# Now we can make a 2-component spline. Since :class:`verde.Vector` implements
# ``fit``, ``predict``, and ``filter``, we can use it in a :class:`verde.Chain` to build
# a pipeline.
#
# We need to use a bit of damping so that the weights can be taken into account. Splines
# without damping provide a perfect fit to the data and ignore the weights as a
# consequence.
chain = vd.Chain(
    [
        ("mean", vd.BlockMean(spacing=spacing * 111e3, uncertainty=True)),
        ("trend", vd.Vector([vd.Trend(1), vd.Trend(1)])),
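# The chain above is cut off mid-definition. A minimal sketch of how such a
# 2-component pipeline is typically completed, fit, and scored (the damping
# value and the Vector-of-Splines step are assumptions, not the original):
chain = vd.Chain(
    [
        ("mean", vd.BlockMean(spacing=spacing * 111e3, uncertainty=True)),
        ("trend", vd.Vector([vd.Trend(1), vd.Trend(1)])),
        ("spline", vd.Vector([vd.Spline(damping=1e-10) for _ in range(2)])),
    ]
)
chain.fit(*train)
print("Score: {:.3f}".format(chain.score(*test)))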