from time import process_time


def cross_validation():
    """Run blocked k-fold cross-validation on ``chain`` and plot the fold scores."""
    begin = process_time()
    print("cross validation begin")
    # Split the data into spatial blocks before k-folding to avoid overly optimistic
    # scores caused by correlation between neighboring points.
    cv = vd.BlockKFold(spacing=100, n_splits=10, shuffle=True)
    scores = vd.cross_val_score(chain, coordinates, dados[feature], cv=cv)
    # Show the distribution of the scores across the folds.
    plt.figure()
    plt.hist(scores, bins="auto")
    print("cross validation end")
    timelapse(begin)
    return scores
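########################################################################################
# The function above relies on several names (``chain``, ``coordinates``, ``dados``,
# ``feature``, and ``timelapse``) that are defined elsewhere. Purely as a hypothetical
# sketch of what that setup might look like (the values, column names, and helper below
# are illustrative assumptions, not the original definitions):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
# Hypothetical table with projected coordinates (in meters) and one data column.
dados = pd.DataFrame(
    {
        "easting": rng.uniform(0, 5000, 500),
        "northing": rng.uniform(0, 5000, 500),
        "temperature": rng.normal(20, 2, 500),
    }
)
feature = "temperature"
coordinates = (dados.easting.values, dados.northing.values)

# A simple Verde pipeline: remove a linear trend, then interpolate with a spline.
chain = vd.Chain([("trend", vd.Trend(degree=1)), ("spline", vd.Spline())])


def timelapse(begin):
    # Hypothetical helper: report CPU time elapsed since ``begin``.
    print("elapsed: {:.2f} s".format(process_time() - begin))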
# Use 1 as a seed instead of 0
train_other, test_other = vd.train_test_split(
    proj_coords, data.air_temperature_c, test_size=0.3, random_state=1
)
print("R² score with seed 1:", spline.fit(*train_other).score(*test_other))

########################################################################################
# Cross-validation
# ----------------
#
# A more robust way of scoring the gridders is to use the function
# :func:`verde.cross_val_score`, which (by default) uses a `k-fold cross-validation
# <https://en.wikipedia.org/wiki/Cross-validation_(statistics)#k-fold_cross-validation>`__.
# It will split the data *k* times and return the score on each *fold*. We can then take
# a mean of these scores.

scores = vd.cross_val_score(spline, proj_coords, data.air_temperature_c)
print("k-fold scores:", scores)
print("Mean score:", np.mean(scores))

########################################################################################
# That is not a very good score, so clearly the default arguments for
# :class:`~verde.Spline` aren't suitable for this dataset. We could try different
# combinations manually until we get a good score (a single manual trial is sketched
# below). A better way is to do this automatically.
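########################################################################################
# For instance, a single manual trial could look like the following sketch. The
# ``damping`` value here is only an illustrative guess, not a recommended setting:

spline_damped = vd.Spline(damping=1e-8)
score_damped = np.mean(
    vd.cross_val_score(spline_damped, proj_coords, data.air_temperature_c)
)
print("Mean score with damping=1e-8:", score_damped)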
# Use Mercator projection because Spline is a Cartesian gridder
projection = pyproj.Proj(proj="merc", lat_ts=data.latitude.mean())
proj_coords = projection(data.longitude.values, data.latitude.values)
region = vd.get_region((data.longitude, data.latitude))
# The desired grid spacing in degrees (converted to meters later using 1 degree approx. 111 km)
spacing = 15 / 60

########################################################################################
# Before we begin tuning, let's reiterate what the results were with the default
# parameters.

spline_default = vd.Spline()
score_default = np.mean(
    vd.cross_val_score(spline_default, proj_coords, data.air_temperature_c)
)
spline_default.fit(proj_coords, data.air_temperature_c)
print("R² with defaults:", score_default)

########################################################################################
# Tuning
# ------
#
# :class:`~verde.Spline` has many parameters that can be set to modify the final result.
# Mainly the ``damping`` regularization parameter and the ``mindist`` "fudge factor"
# which smooths the solution. Would changing the default values give us a better score?
#
# We can answer this question by changing the values in our ``spline`` and
# re-evaluating the model score repeatedly for different values of these parameters.
# Let's test the following combinations:
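########################################################################################
# As a rough sketch, using placeholder values for ``damping`` and ``mindist``
# (illustrative assumptions, not a recommended grid), a brute-force search over such
# combinations could look like this:

import itertools

# Candidate values are illustrative placeholders only.
dampings = [None, 1e-4, 1e-3]
mindists = [10e3, 50e3, 100e3]

best_score = -np.inf
best_params = None
for damping, mindist in itertools.product(dampings, mindists):
    candidate = vd.Spline(damping=damping, mindist=mindist)
    score = np.mean(
        vd.cross_val_score(candidate, proj_coords, data.air_temperature_c)
    )
    print("damping={} mindist={} mean score={:.3f}".format(damping, mindist, score))
    if score > best_score:
        best_score = score
        best_params = dict(damping=damping, mindist=mindist)

print("Best parameters:", best_params)
print("Best mean score:", best_score)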
########################################################################################
# You can also use most cross-validation splitter classes from
# :mod:`sklearn.model_selection` by specifying the ``cv`` argument. For example, if we
# want to shuffle and then split the data *n* times
# (:class:`sklearn.model_selection.ShuffleSplit`):

from sklearn.model_selection import ShuffleSplit

shuffle = ShuffleSplit(n_splits=10, test_size=0.3, random_state=0)

scores = vd.cross_val_score(
    vd.Spline(), proj_coords, data.air_temperature_c, cv=shuffle
)
print("shuffle scores:", scores)
print("Mean score:", np.mean(scores))
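########################################################################################
# For spatial data, randomly splitting individual points can overestimate performance
# because nearby observations are highly correlated. As a rough sketch (not part of the
# original tutorial), Verde's :class:`verde.BlockKFold` can be passed through the same
# ``cv`` argument to split the data into spatial blocks first; the 1 degree ~ 111 km
# block size below is an illustrative choice:

blocked_kfold = vd.BlockKFold(
    spacing=1 * 111e3, n_splits=10, shuffle=True, random_state=0
)

scores = vd.cross_val_score(
    vd.Spline(), proj_coords, data.air_temperature_c, cv=blocked_kfold
)
print("blocked k-fold scores:", scores)
print("Mean score:", np.mean(scores))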