Example #1
from time import process_time

import matplotlib.pyplot as plt
import verde as vd

def cross_validation():
    # `chain`, `coordinates`, `dados`, `feature`, and `timelapse` come from
    # the surrounding project.
    begin = process_time()
    print("cross validation begin")
    # Blocked folds guard against inflated scores on spatially correlated data
    cv = vd.BlockKFold(spacing=100, n_splits=10, shuffle=True)
    scores = vd.cross_val_score(chain, coordinates, dados[feature], cv=cv)
    plt.figure()
    plt.hist(scores, bins='auto')
    print("cross validation end")
    timelapse(begin)
    return scores
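
# A minimal sketch of the ``timelapse`` helper this snippet assumes (hypothetical;
# the original project defines its own version):
def timelapse(begin):
    # Print the CPU time elapsed since ``begin`` (from process_time)
    print("elapsed: {:.2f} s".format(process_time() - begin))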
Example #2
# Use 1 as a seed instead of 0
train_other, test_other = vd.train_test_split(proj_coords,
                                              data.air_temperature_c,
                                              test_size=0.3,
                                              random_state=1)
print("R² score with seed 1:", spline.fit(*train_other).score(*test_other))

########################################################################################
# A more robust way of scoring the gridders is to use the function
# :func:`verde.cross_val_score`, which (by default) performs `k-fold cross-validation
# <https://en.wikipedia.org/wiki/Cross-validation_(statistics)#k-fold_cross-validation>`__.
# It splits the data *k* times and returns the score on each *fold*. We can then take
# the mean of these scores.

scores = vd.cross_val_score(spline, proj_coords, data.air_temperature_c)
print("k-fold scores:", scores)
print("Mean score:", np.mean(scores))

########################################################################################
# That is not a very good score, so clearly the default arguments for
# :class:`~verde.Spline` aren't suitable for this dataset. We could try different
# combinations manually until we get a good score (see the sketch below). A better
# way is to do this automatically.
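
########################################################################################
# For example, a single manual attempt could look like this (the ``damping`` and
# ``mindist`` values are illustrative guesses, not tuned values):

spline_guess = vd.Spline(damping=1e-8, mindist=50e3)
score_guess = np.mean(
    vd.cross_val_score(spline_guess, proj_coords, data.air_temperature_c))
print("R² with a manual guess:", score_guess)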

########################################################################################
# Tuning
# ------
#
# :class:`~verde.Spline` has many parameters that can be set to modify the final result.
# Mainly the ``damping`` regularization parameter and the ``mindist`` "fudge factor"
# which smooths the solution.
Example #3
# Use Mercator projection because Spline is a Cartesian gridder
projection = pyproj.Proj(proj="merc", lat_ts=data.latitude.mean())
proj_coords = projection(data.longitude.values, data.latitude.values)

region = vd.get_region((data.longitude, data.latitude))
# The desired grid spacing in degrees (convert to meters where needed using
# 1 degree approx. 111 km)
spacing = 15 / 60
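
########################################################################################
# The degree-to-meter conversion happens wherever the spacing is used with the
# projected Cartesian coordinates, e.g. (illustrative):

spacing_m = spacing * 111e3  # 15 arc-minutes ≈ 27.75 km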

########################################################################################
# Before we begin tuning, let's reiterate what the results were with the default
# parameters.

spline_default = vd.Spline()
score_default = np.mean(
    vd.cross_val_score(spline_default, proj_coords, data.air_temperature_c))
spline_default.fit(proj_coords, data.air_temperature_c)
print("R² with defaults:", score_default)

########################################################################################
# Tuning
# ------
#
# :class:`~verde.Spline` has many parameters that can be set to modify the final result.
# Mainly the ``damping`` regularization parameter and the ``mindist`` "fudge factor"
# which smooths the solution. Would changing the default values give us a better score?
#
# We can answer this question by changing the values in our ``spline`` and
# re-evaluating the model score repeatedly for different values of these parameters.
# Let's test the following combinations:
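
########################################################################################
# A sketch of that search (the candidate values below are illustrative, not
# necessarily the grid used in the original tutorial):

import itertools

dampings = [None, 1e-4, 1e-3, 1e-2]
mindists = [5e3, 10e3, 50e3, 100e3]
parameter_sets = [
    dict(damping=combo[0], mindist=combo[1])
    for combo in itertools.product(dampings, mindists)
]
mean_scores = []
for params in parameter_sets:
    mean_scores.append(np.mean(
        vd.cross_val_score(vd.Spline(**params), proj_coords,
                           data.air_temperature_c)))
best = int(np.argmax(mean_scores))
print("Best mean score:", mean_scores[best], "with", parameter_sets[best])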
Example #4
train_other, test_other = vd.train_test_split(proj_coords,
                                              data.air_temperature_c,
                                              test_size=0.3,
                                              random_state=1)

print("R² score with seed 1:",
      vd.Spline().fit(*train_other).score(*test_other))

########################################################################################
# Cross-validation
# ----------------
#
# A more robust way of scoring the gridders is to use the function
# :func:`verde.cross_val_score`, which performs `k-fold cross-validation
# <https://en.wikipedia.org/wiki/Cross-validation_(statistics)#k-fold_cross-validation>`__
# by default. It splits the data *k* times and returns the score on each *fold*. We
# can then take the mean of these scores.

scores = vd.cross_val_score(vd.Spline(), proj_coords, data.air_temperature_c)
print("k-fold scores:", scores)
print("Mean score:", np.mean(scores))

########################################################################################
# You can also use most cross-validation splitter classes from
# :mod:`sklearn.model_selection` by specifying the ``cv`` argument. For example, if we
# want to shuffle then split the data *n* times
# (:class:`sklearn.model_selection.ShuffleSplit`):

from sklearn.model_selection import ShuffleSplit

shuffle = ShuffleSplit(n_splits=10, test_size=0.3, random_state=0)

scores = vd.cross_val_score(vd.Spline(),
                            proj_coords,
                            data.air_temperature_c,
                            cv=shuffle)
print("shuffle scores:", scores)
print("Mean score:", np.mean(scores))