Code Example #1
File: test_loss.py Project: subhayuroy/scikit-learn
        ("squared_error", -2.0, 42),
        ("squared_error", 117.0, 1.05),
        ("squared_error", 0.0, 0.0),
        # The argmin of binomial_loss for y_true=0 and y_true=1 is resp.
        # -inf and +inf due to logit, cf. "complete separation". Therefore, we
        # use 0 < y_true < 1.
        ("binomial_loss", 0.3, 0.1),
        ("binomial_loss", -12, 0.2),
        ("binomial_loss", 30, 0.9),
        ("poisson_loss", 12.0, 1.0),
        ("poisson_loss", 0.0, 2.0),
        ("poisson_loss", -22.0, 10.0),
    ],
)
@pytest.mark.skipif(
    sp_version == parse_version("1.2.0"),
    reason="bug in scipy 1.2.0, see scipy issue #9608",
)
@skip_if_32bit
def test_derivatives(loss, x0, y_true):
    """Test that gradients are zero at the minimum of the loss.

    We check this on a single value/sample using Halley's method with the
    first and second order derivatives computed by the Loss instance.
    Note that methods of Loss instances operate on arrays while the newton
    root finder expects a scalar or a one-element array for this purpose.
    """
    loss = _LOSSES[loss](sample_weight=None)
    y_true = np.array([y_true], dtype=np.float64)
    x0 = np.array([x0], dtype=np.float64)
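The excerpt ends here. The docstring suggests the test then minimizes the loss with scipy.optimize.newton (passing a second derivative selects Halley's method) and checks that the gradient vanishes at the optimum. A hedged sketch of such a continuation; the method names `loss.loss`, `loss.gradient` and `loss.gradient_hessian` are assumptions about the private `_loss` module, not verbatim project code:

    from scipy.optimize import newton

    def func(x):
        # per-sample loss at raw prediction x (one-element array)
        return loss.loss(y_true=y_true, raw_prediction=x)

    def fprime(x):
        return loss.gradient(y_true=y_true, raw_prediction=x)

    def fprime2(x):
        # gradient_hessian is assumed to return (gradient, hessian)
        return loss.gradient_hessian(y_true=y_true, raw_prediction=x)[1]

    # Supplying fprime2 makes newton() use Halley's method.
    optimum = newton(func, x0=x0, fprime=fprime, fprime2=fprime2, maxiter=70)
    assert np.allclose(fprime(optimum), 0, atol=5e-7)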
Code Example #2
# Generate a signal
y = np.linspace(0, resolution - 1, resolution)
first_quarter = y < resolution / 4
y[first_quarter] = 3.0
y[np.logical_not(first_quarter)] = -1.0

# List the different sparse coding methods in the following format:
# (title, transform_algorithm, transform_alpha,
#  transform_n_nonzero_coefs, color)
estimators = [
    ("OMP", "omp", None, 15, "navy"),
    ("Lasso", "lasso_lars", 2, None, "turquoise"),
]
lw = 2
# Avoid FutureWarning about default value change when numpy >= 1.14
lstsq_rcond = None if np_version >= parse_version("1.14") else -1

plt.figure(figsize=(13, 6))
for subplot, (D, title) in enumerate(
        zip((D_fixed, D_multi), ("fixed width", "multiple widths"))):
    plt.subplot(1, 2, subplot + 1)
    plt.title("Sparse coding against %s dictionary" % title)
    plt.plot(y, lw=lw, linestyle="--", label="Original signal")
    # Do a wavelet approximation
    for title, algo, alpha, n_nonzero, color in estimators:
        coder = SparseCoder(
            dictionary=D,
            transform_n_nonzero_coefs=n_nonzero,
            transform_alpha=alpha,
            transform_algorithm=algo,
        )
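The inner loop is cut off after constructing the coder; the same example appears again as Code Example #13, where it continues with the transform and the reconstruction error:

        x = coder.transform(y.reshape(1, -1))
        density = len(np.flatnonzero(x))
        x = np.ravel(np.dot(x, D))
        squared_error = np.sum((y - x) ** 2)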
Code Example #3
# doc/modules/clustering.rst and use sklearn from the local folder rather than
# the one from site-packages.

import platform
import sys

import pytest
from _pytest.doctest import DoctestItem

from sklearn.utils import _IS_32BIT
from sklearn.externals import _pilutil
from sklearn._min_dependencies import PYTEST_MIN_VERSION
from sklearn.utils.fixes import np_version, parse_version


if parse_version(pytest.__version__) < parse_version(PYTEST_MIN_VERSION):
    raise ImportError('Your version of pytest is too old, you should have '
                      'at least pytest >= {} installed.'
                      .format(PYTEST_MIN_VERSION))


def pytest_addoption(parser):
    parser.addoption("--skip-network", action="store_true", default=False,
                     help="skip network tests")


def pytest_collection_modifyitems(config, items):
    for item in items:
        # FeatureHasher is not compatible with PyPy
        if (item.name.endswith(('_hash.FeatureHasher',
                                'text.HashingVectorizer'))
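The condition is cut off here; the full hook appears in Code Example #5 below and continues:

                and platform.python_implementation() == 'PyPy'):
            marker = pytest.mark.skip(
                reason='FeatureHasher is not compatible with PyPy')
            item.add_marker(marker)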
Code Example #4
"""
Class and functions to segment cells.
"""

import bigfish.stack as stack

from .utils import thresholding
from .postprocess import label_instances
from .postprocess import clean_segmentation

import numpy as np
from scipy import ndimage as ndi

import skimage
from sklearn.utils.fixes import parse_version
if parse_version(skimage.__version__) < parse_version("0.17.0"):
    from skimage.morphology import watershed
else:
    from skimage.segmentation import watershed


# ### Unet models ###

def unet_distance_edge_double():
    """Load a pretrained Unet model to predict foreground and a distance map
    to edge from nucleus and cell images.

    Returns
    -------
    model : ``tensorflow.keras.model`` object
        Pretrained Unet model.
Code Example #5
File: conftest.py Project: cozek/scikit-learn
def pytest_collection_modifyitems(config, items):
    """Called after collect is completed.

    Parameters
    ----------
    config : pytest config
    items : list of collected items
    """
    run_network_tests = environ.get('SKLEARN_SKIP_NETWORK_TESTS', '1') == '0'
    skip_network = pytest.mark.skip(
        reason="test is enabled when SKLEARN_SKIP_NETWORK_TESTS=0")

    # download datasets during collection to avoid thread unsafe behavior
    # when running pytest in parallel with pytest-xdist
    dataset_features_set = set(dataset_fetchers)
    datasets_to_download = set()

    for item in items:
        if not hasattr(item, "fixturenames"):
            continue
        item_fixtures = set(item.fixturenames)
        dataset_to_fetch = item_fixtures & dataset_features_set
        if not dataset_to_fetch:
            continue

        if run_network_tests:
            datasets_to_download |= dataset_to_fetch
        else:
            # network tests are skipped
            item.add_marker(skip_network)

    # Only download datasets on the first worker spawned by pytest-xdist
    # to avoid thread unsafe behavior. If pytest-xdist is not used, we still
    # download before tests run.
    worker_id = environ.get("PYTEST_XDIST_WORKER", "gw0")
    if worker_id == "gw0" and run_network_tests:
        for name in datasets_to_download:
            dataset_fetchers[name]()

    for item in items:
        # FeatureHasher is not compatible with PyPy
        if (item.name.endswith(('_hash.FeatureHasher',
                                'text.HashingVectorizer'))
                and platform.python_implementation() == 'PyPy'):
            marker = pytest.mark.skip(
                reason='FeatureHasher is not compatible with PyPy')
            item.add_marker(marker)
        # Known failure with GradientBoostingClassifier on ARM64
        elif (item.name.endswith('GradientBoostingClassifier')
                and platform.machine() == 'aarch64'):

            marker = pytest.mark.xfail(
                reason=(
                    'known failure. See '
                    'https://github.com/scikit-learn/scikit-learn/issues/17797'  # noqa
                )
            )
            item.add_marker(marker)

    # numpy changed the str/repr formatting of numpy arrays in 1.14. We want to
    # run doctests only for numpy >= 1.14.
    skip_doctests = False
    try:
        if np_version < parse_version('1.14'):
            reason = 'doctests are only run for numpy >= 1.14'
            skip_doctests = True
        elif _IS_32BIT:
            reason = ('doctests are only run when the default numpy int is '
                      '64 bits.')
            skip_doctests = True
        elif sys.platform.startswith("win32"):
            reason = ("doctests are not run for Windows because numpy arrays "
                      "repr is inconsistent across platforms.")
            skip_doctests = True
    except ImportError:
        pass

    if skip_doctests:
        skip_marker = pytest.mark.skip(reason=reason)

        for item in items:
            if isinstance(item, DoctestItem):
                item.add_marker(skip_marker)
    elif not _pilutil.pillow_installed:
        skip_marker = pytest.mark.skip(reason="pillow (or PIL) not installed!")
        for item in items:
            if item.name in [
                    "sklearn.feature_extraction.image.PatchExtractor",
                    "sklearn.feature_extraction.image.extract_patches_2d"]:
                item.add_marker(skip_marker)
Code Example #6
                extrapolation="periodic",
            ),
        ),
        ("ols", LinearRegression(fit_intercept=intercept)),
    ])
    pipe.fit(X, f(X[:, 0]))

    # Generate larger array to check periodic extrapolation
    X_ = np.linspace(-1, 2, 301)[:, None]
    predictions = pipe.predict(X_)
    assert_allclose(predictions, f(X_[:, 0]), atol=0.01, rtol=0.01)
    assert_allclose(predictions[0:100], predictions[100:200], rtol=1e-3)


@pytest.mark.skipif(
    sp_version < parse_version("1.0.0"),
    reason="Periodic extrapolation not yet implemented for BSpline.",
)
def test_spline_transformer_periodic_spline_backport():
    """Test that the backport of extrapolate="periodic" works correctly"""
    X = np.linspace(-2, 3.5, 10)[:, None]
    degree = 2

    # Use periodic extrapolation backport in SplineTransformer
    transformer = SplineTransformer(degree=degree,
                                    extrapolation="periodic",
                                    knots=[[-1.0], [0.0], [1.0]])
    Xt = transformer.fit_transform(X)

    # Use periodic extrapolation in BSpline
    coef = np.array([[1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0]])
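The comparison against SciPy is cut off here; a sketch of how it might proceed with scipy.interpolate.BSpline and extrapolate="periodic" (the knot vector below is illustrative, chosen to match the transformer's knots [-1, 0, 1], and is not necessarily the project's actual values):

    from scipy.interpolate import BSpline

    # Periodic knot vector whose base interval matches the transformer's knots (illustrative).
    t = np.linspace(-3, 3, 7)
    spl = BSpline(t, coef, degree, extrapolate="periodic")
    assert_allclose(Xt, spl(X[:, 0]))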
Code Example #7
print(__doc__)

###############################################################################
# Synthetic example
###############################################################################

from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeCV
from sklearn.compose import TransformedTargetRegressor
from sklearn.metrics import median_absolute_error, r2_score
from sklearn.utils.fixes import parse_version

# `normed` is being deprecated in favor of `density` in histograms
if parse_version(matplotlib.__version__) >= parse_version('2.1'):
    density_param = {'density': True}
else:
    density_param = {'normed': True}

###############################################################################
# A synthetic random regression problem is generated. The targets ``y`` are
# modified by: (i) translating all targets such that all entries are
# non-negative and (ii) applying an exponential function to obtain non-linear
# targets which cannot be fitted using a simple linear model.
#
# Therefore, a logarithmic (`np.log1p`) and an exponential function
# (`np.expm1`) will be used to transform the targets before training a linear
# regression model and using it for prediction.

X, y = make_regression(n_samples=10000, noise=100, random_state=0)
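The excerpt stops right after generating the raw problem; a sketch of the target transformation described in the comments above (the scaling factor of 200 is illustrative, not necessarily the original example's value):

# Shift targets to be non-negative, then exponentiate them (illustrative scaling).
y = np.expm1((y + abs(y.min())) / 200)
y_trans = np.log1p(y)  # transformed targets, e.g. for plotting their distribution

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# TransformedTargetRegressor applies log1p to y before fitting and expm1 to predictions.
regr_trans = TransformedTargetRegressor(regressor=RidgeCV(),
                                        func=np.log1p, inverse_func=np.expm1)
regr_trans.fit(X_train, y_train)
print(r2_score(y_test, regr_trans.predict(X_test)))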
Code Example #8
            {
                "solver_options": "blah"
            },
            "Invalid value for argument solver_options",
        ),
    ],
)
def test_init_parameters_validation(X_y_data, params, err_msg):
    """Test that invalid init parameters raise errors."""
    X, y = X_y_data
    with pytest.raises(ValueError, match=err_msg):
        QuantileRegressor(**params).fit(X, y)


@pytest.mark.parametrize("solver", ("highs-ds", "highs-ipm", "highs"))
@pytest.mark.skipif(sp_version >= parse_version('1.6.0'),
                    reason="Solvers are available as of scipy 1.6.0")
def test_too_new_solver_methods_raise_error(X_y_data, solver):
    """Test that highs solver raises for scipy<1.6.0."""
    X, y = X_y_data
    with pytest.raises(ValueError, match="scipy>=1.6.0"):
        QuantileRegressor(solver=solver).fit(X, y)


@pytest.mark.parametrize(
    "quantile, alpha, intercept, coef",
    [
        # for 50% quantile w/o regularization, any slope in [1, 10] is okay
        [0.5, 0, 1, None],
        # if positive error costs more, the slope is maximal
        [0.51, 0, 1, 10],
Code Example #9
            {
                "solver_options": "blah"
            },
            "Invalid value for argument solver_options",
        ),
    ],
)
def test_init_parameters_validation(X_y_data, params, err_msg):
    """Test that invalid init parameters raise errors."""
    X, y = X_y_data
    with pytest.raises(ValueError, match=err_msg):
        QuantileRegressor(**params).fit(X, y)


@pytest.mark.skipif(
    sp_version < parse_version("1.3.0"),
    reason="Solver 'revised simplex' is only available with scipy>=1.3.0",
)
@pytest.mark.parametrize("solver", ["interior-point", "revised simplex"])
def test_incompatible_solver_for_sparse_input(X_y_data, solver):
    X, y = X_y_data
    X_sparse = sparse.csc_matrix(X)
    err_msg = (
        f"Solver {solver} does not support sparse X. Use solver 'highs' for example."
    )
    with pytest.raises(ValueError, match=err_msg):
        QuantileRegressor(solver=solver).fit(X_sparse, y)


@pytest.mark.parametrize("solver", ("highs-ds", "highs-ipm", "highs"))
@pytest.mark.skipif(
Code Example #10
    clust = OPTICS(min_cluster_size=len(X) + 1)
    with pytest.raises(ValueError, match="must be no greater than the "):
        clust.fit(X)


def test_processing_order():
    # Ensure that we consider all unprocessed points,
    # not only direct neighbors, when picking the next point.
    Y = [[0], [10], [-10], [25]]
    clust = OPTICS(min_samples=3, max_eps=15).fit(Y)
    assert_array_equal(clust.reachability_, [np.inf, 10, 10, 15])
    assert_array_equal(clust.core_distances_, [10, 15, np.inf, np.inf])
    assert_array_equal(clust.ordering_, [0, 1, 2, 3])


@pytest.mark.skipif(sp_version >= parse_version("1.6.0")
                    and (platform.machine() == "aarch64" or
                         (sys.platform == "linux" and _IS_32BIT)),
                    reason=("Test fails for SciPy 1.6.0 on ARM and on 32-bit "
                            "linux. See #19111"))
def test_compare_to_ELKI():
    # Expected values, computed with (future) ELKI 0.7.5 using:
    # java -jar elki.jar cli -dbc.in csv -dbc.filter FixedDBIDsFilter
    #   -algorithm clustering.optics.OPTICSHeap -optics.minpts 5
    # where the FixedDBIDsFilter gives 0-indexed ids.
    r1 = [
        np.inf, 1.0574896366427478, 0.7587934993548423, 0.7290174038973836,
        0.7290174038973836, 0.7290174038973836, 0.6861627576116127,
        0.7587934993548423, 0.9280118450166668, 1.1748022534146194,
        3.3355455741292257, 0.49618389254482587, 0.2552805046961355,
        0.2552805046961355, 0.24944622248445714, 0.24944622248445714,
Code Example #11
# and :class:`~sklearn.linear_model.LinearRegression`.
#
# Fitting a `QuantileRegressor`
# -----------------------------
#
# In this section, we want to estimate the conditional median as well as
# a low and high quantile fixed at 5% and 95%, respectively. Thus, we will get
# three linear models, one for each quantile.
#
# We will use the quantiles at 5% and 95% to find the outliers in the training
# sample beyond the central 90% interval.
from sklearn.utils.fixes import sp_version, parse_version

# This line is to avoid incompatibility with older SciPy versions.
# You should use `solver="highs"` with recent versions of SciPy.
solver = "highs" if sp_version >= parse_version("1.6.0") else "interior-point"

# %%
from sklearn.linear_model import QuantileRegressor

quantiles = [0.05, 0.5, 0.95]
predictions = {}
out_bounds_predictions = np.zeros_like(y_true_mean, dtype=np.bool_)
for quantile in quantiles:
    qr = QuantileRegressor(quantile=quantile, alpha=0, solver=solver)
    y_pred = qr.fit(X, y_normal).predict(X)
    predictions[quantile] = y_pred

    if quantile == min(quantiles):
        out_bounds_predictions = np.logical_or(out_bounds_predictions,
                                               y_pred >= y_normal)
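The loop is cut inside the first branch; the symmetric branch for the upper quantile presumably follows, along these lines:

    elif quantile == max(quantiles):
        out_bounds_predictions = np.logical_or(out_bounds_predictions,
                                               y_pred <= y_normal)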
Code Example #12
def plot_kde_1d():
    # `normed` is being deprecated in favor of `density` in histograms
    if parse_version(matplotlib.__version__) >= parse_version('2.1'):
        density_param = {'density': True}
    else:
        density_param = {'normed': True}

    # ----------------------------------------------------------------------
    # Plot the progression of histograms to kernels
    np.random.seed(1)
    N = 20
    X = np.concatenate((np.random.normal(0, 1, int(0.3 * N)),
                        np.random.normal(5, 1, int(0.7 * N))))[:, np.newaxis]
    X_plot = np.linspace(-5, 10, 1000)[:, np.newaxis]
    bins = np.linspace(-5, 10, 10)

    fig, ax = plt.subplots(2, 2, sharex=True, sharey=True)
    fig.subplots_adjust(hspace=0.05, wspace=0.05)

    # histogram 1
    ax[0, 0].hist(X[:, 0], bins=bins, fc='#AAAAFF', **density_param)
    ax[0, 0].text(-3.5, 0.31, "Histogram")

    # histogram 2
    ax[0, 1].hist(X[:, 0], bins=bins + 0.75, fc='#AAAAFF', **density_param)
    ax[0, 1].text(-3.5, 0.31, "Histogram, bins shifted")

    # tophat KDE
    kde = KernelDensity(kernel='tophat', bandwidth=0.75).fit(X)
    log_dens = kde.score_samples(X_plot)
    ax[1, 0].fill(X_plot[:, 0], np.exp(log_dens), fc='#AAAAFF')
    ax[1, 0].text(-3.5, 0.31, "Tophat Kernel Density")

    # Gaussian KDE
    kde = KernelDensity(kernel='gaussian', bandwidth=0.75).fit(X)
    log_dens = kde.score_samples(X_plot)
    ax[1, 1].fill(X_plot[:, 0], np.exp(log_dens), fc='#AAAAFF')
    ax[1, 1].text(-3.5, 0.31, "Gaussian Kernel Density")

    for axi in ax.ravel():
        axi.plot(X[:, 0], np.full(X.shape[0], -0.01), '+k')
        axi.set_xlim(-4, 9)
        axi.set_ylim(-0.02, 0.34)

    for axi in ax[:, 0]:
        axi.set_ylabel('Normalized Density')

    for axi in ax[1, :]:
        axi.set_xlabel('x')

    # ----------------------------------------------------------------------
    # Plot all available kernels
    X_plot = np.linspace(-6, 6, 1000)[:, None]
    X_src = np.zeros((1, 1))

    fig, ax = plt.subplots(2, 3, sharex=True, sharey=True)
    fig.subplots_adjust(left=0.05, right=0.95, hspace=0.05, wspace=0.05)

    def format_func(x, loc):
        if x == 0:
            return '0'
        elif x == 1:
            return 'h'
        elif x == -1:
            return '-h'
        else:
            return '%ih' % x

    for i, kernel in enumerate([
            'gaussian', 'tophat', 'epanechnikov', 'exponential', 'linear',
            'cosine'
    ]):
        axi = ax.ravel()[i]
        log_dens = KernelDensity(
            kernel=kernel).fit(X_src).score_samples(X_plot)
        axi.fill(X_plot[:, 0], np.exp(log_dens), '-k', fc='#AAAAFF')
        axi.text(-2.6, 0.95, kernel)

        axi.xaxis.set_major_formatter(plt.FuncFormatter(format_func))
        axi.xaxis.set_major_locator(plt.MultipleLocator(1))
        axi.yaxis.set_major_locator(plt.NullLocator())

        axi.set_ylim(0, 1.05)
        axi.set_xlim(-2.9, 2.9)

    ax[0, 1].set_title('Available Kernels')

    # ----------------------------------------------------------------------
    # Plot a 1D density example
    N = 100
    np.random.seed(1)
    X = np.concatenate((np.random.normal(0, 1, int(0.3 * N)),
                        np.random.normal(5, 1, int(0.7 * N))))[:, np.newaxis]

    X_plot = np.linspace(-5, 10, 1000)[:, np.newaxis]

    true_dens = (0.3 * norm(0, 1).pdf(X_plot[:, 0]) +
                 0.7 * norm(5, 1).pdf(X_plot[:, 0]))

    fig, ax = plt.subplots()
    ax.fill(X_plot[:, 0],
            true_dens,
            fc='black',
            alpha=0.2,
            label='input distribution')
    colors = ['navy', 'cornflowerblue', 'darkorange']
    kernels = ['gaussian', 'tophat', 'epanechnikov']
    lw = 2

    for color, kernel in zip(colors, kernels):
        kde = KernelDensity(kernel=kernel, bandwidth=0.5).fit(X)
        log_dens = kde.score_samples(X_plot)
        ax.plot(X_plot[:, 0],
                np.exp(log_dens),
                color=color,
                lw=lw,
                linestyle='-',
                label="kernel = '{0}'".format(kernel))

    ax.text(6, 0.38, "N={0} points".format(N))

    ax.legend(loc='upper left')
    ax.plot(X[:, 0], -0.005 - 0.01 * np.random.random(X.shape[0]), '+k')

    ax.set_xlim(-4, 9)
    ax.set_ylim(-0.02, 0.4)
    plt.show()
Code Example #13
                for w in (10, 50, 100, 500, 1000))]

# Generate a signal
y = np.linspace(0, resolution - 1, resolution)
first_quarter = y < resolution / 4
y[first_quarter] = 3.
y[np.logical_not(first_quarter)] = -1.

# List the different sparse coding methods in the following format:
# (title, transform_algorithm, transform_alpha,
#  transform_n_nonzero_coefs, color)
estimators = [('OMP', 'omp', None, 15, 'navy'),
              ('Lasso', 'lasso_lars', 2, None, 'turquoise'), ]
lw = 2
# Avoid FutureWarning about default value change when numpy >= 1.14
lstsq_rcond = None if np_version >= parse_version('1.14') else -1

plt.figure(figsize=(13, 6))
for subplot, (D, title) in enumerate(zip((D_fixed, D_multi),
                                         ('fixed width', 'multiple widths'))):
    plt.subplot(1, 2, subplot + 1)
    plt.title('Sparse coding against %s dictionary' % title)
    plt.plot(y, lw=lw, linestyle='--', label='Original signal')
    # Do a wavelet approximation
    for title, algo, alpha, n_nonzero, color in estimators:
        coder = SparseCoder(dictionary=D, transform_n_nonzero_coefs=n_nonzero,
                            transform_alpha=alpha, transform_algorithm=algo)
        x = coder.transform(y.reshape(1, -1))
        density = len(np.flatnonzero(x))
        x = np.ravel(np.dot(x, D))
        squared_error = np.sum((y - x) ** 2)
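The excerpt ends with the reconstruction error; the original example presumably plots the reconstruction next, roughly:

        plt.plot(x, color=color, lw=lw,
                 label="%s: %s nonzero coefs,\n%.2f error"
                       % (title, density, squared_error))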
Code Example #14
def default_solver():
    return "highs" if sp_version >= parse_version("1.6.0") else "interior-point"
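The decorator appears to have been cut off by the excerpt; a sketch of how such a helper is typically declared and consumed as a pytest fixture (the test function below is illustrative, not from the project):

@pytest.fixture
def default_solver():
    # Pick the solver appropriate for the installed SciPy version.
    return "highs" if sp_version >= parse_version("1.6.0") else "interior-point"

def test_quantile_fit_runs(X_y_data, default_solver):
    X, y = X_y_data
    QuantileRegressor(quantile=0.5, alpha=0, solver=default_solver).fit(X, y)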
Code Example #15
import numpy as np
from scipy.ndimage.filters import gaussian_filter

import matplotlib.pyplot as plt

import skimage
from skimage.data import coins
from skimage.transform import rescale

from sklearn.feature_extraction.image import grid_to_graph
from sklearn.cluster import AgglomerativeClustering
from sklearn.utils.fixes import parse_version

# these were introduced in skimage-0.14
if parse_version(skimage.__version__) >= parse_version('0.14'):
    rescale_params = {'anti_aliasing': False, 'multichannel': False}
else:
    rescale_params = {}


def plot_coin_ward_segmentation():
    # Generate data
    orig_coins = coins()

    # Resize it to 20% of the original size to speed up the processing
    # Applying a Gaussian filter for smoothing prior to down-scaling
    # reduces aliasing artifacts.
    smoothened_coins = gaussian_filter(orig_coins, sigma=2)
    rescaled_coins = rescale(smoothened_coins,
                             0.2,
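The call is cut mid-argument; matching the same code in Code Example #21, it presumably completes as:

    rescaled_coins = rescale(smoothened_coins, 0.2, mode="reflect",
                             **rescale_params)
    X = np.reshape(rescaled_coins, (-1, 1))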
Code Example #16
# mypy error: Variable "DEFAULT_JOBLIB_BACKEND" is not valid type
class MyBackend(DEFAULT_JOBLIB_BACKEND):  # type: ignore
    def __init__(self, *args, **kwargs):
        self.count = 0
        super().__init__(*args, **kwargs)

    def start_call(self):
        self.count += 1
        return super().start_call()


joblib.register_parallel_backend('testing', MyBackend)


@pytest.mark.skipif(parse_version(joblib.__version__) < parse_version('0.12'),
                    reason='tests not yet supported in joblib <0.12')
@skip_if_no_parallel
def test_backend_respected():
    clf = RandomForestClassifier(n_estimators=10, n_jobs=2)

    with joblib.parallel_backend("testing") as (ba, n_jobs):
        clf.fit(X, y)

    assert ba.count > 0

    # predict_proba requires shared memory. Ensure that's honored.
    with joblib.parallel_backend("testing") as (ba, _):
        clf.predict_proba(X)

    assert ba.count == 0
Code Example #17
def test_pipeline_memory():
    X = iris.data
    y = iris.target
    cachedir = mkdtemp()
    try:
        if parse_version(joblib.__version__) < parse_version('0.12'):
            # Deal with change of API in joblib
            memory = joblib.Memory(cachedir=cachedir, verbose=10)
        else:
            memory = joblib.Memory(location=cachedir, verbose=10)
        # Test with Transformer + SVC
        clf = SVC(probability=True, random_state=0)
        transf = DummyTransf()
        pipe = Pipeline([('transf', clone(transf)), ('svc', clf)])
        cached_pipe = Pipeline([('transf', transf), ('svc', clf)],
                               memory=memory)

        # Memoize the transformer at the first fit
        cached_pipe.fit(X, y)
        pipe.fit(X, y)
        # Get the time stamp of the transformer in the cached pipeline
        ts = cached_pipe.named_steps['transf'].timestamp_
        # Check that cached_pipe and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe.predict(X))
        assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X))
        assert_array_equal(pipe.predict_log_proba(X),
                           cached_pipe.predict_log_proba(X))
        assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y))
        assert_array_equal(pipe.named_steps['transf'].means_,
                           cached_pipe.named_steps['transf'].means_)
        assert not hasattr(transf, 'means_')
        # Check that we are reading the cache while fitting
        # a second time
        cached_pipe.fit(X, y)
        # Check that cached_pipe and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe.predict(X))
        assert_array_equal(pipe.predict_proba(X), cached_pipe.predict_proba(X))
        assert_array_equal(pipe.predict_log_proba(X),
                           cached_pipe.predict_log_proba(X))
        assert_array_equal(pipe.score(X, y), cached_pipe.score(X, y))
        assert_array_equal(pipe.named_steps['transf'].means_,
                           cached_pipe.named_steps['transf'].means_)
        assert ts == cached_pipe.named_steps['transf'].timestamp_
        # Create a new pipeline with cloned estimators
        # Check that even changing the name step does not affect the cache hit
        clf_2 = SVC(probability=True, random_state=0)
        transf_2 = DummyTransf()
        cached_pipe_2 = Pipeline([('transf_2', transf_2), ('svc', clf_2)],
                                 memory=memory)
        cached_pipe_2.fit(X, y)

        # Check that cached_pipe and pipe yield identical results
        assert_array_equal(pipe.predict(X), cached_pipe_2.predict(X))
        assert_array_equal(pipe.predict_proba(X),
                           cached_pipe_2.predict_proba(X))
        assert_array_equal(pipe.predict_log_proba(X),
                           cached_pipe_2.predict_log_proba(X))
        assert_array_equal(pipe.score(X, y), cached_pipe_2.score(X, y))
        assert_array_equal(pipe.named_steps['transf'].means_,
                           cached_pipe_2.named_steps['transf_2'].means_)
        assert ts == cached_pipe_2.named_steps['transf_2'].timestamp_
    finally:
        shutil.rmtree(cachedir)
Code Example #18
)
@pytest.mark.parametrize(
    "estimator",
    [
        LinearRegression,
        Ridge,
        RidgeCV,
        RidgeClassifier,
        RidgeClassifierCV,
        BayesianRidge,
        ARDRegression,
    ],
)
# FIXME remove test in 1.2
@pytest.mark.xfail(
    sys.platform == "darwin" and np_version < parse_version("1.22"),
    reason="https://github.com/scikit-learn/scikit-learn/issues/21395",
)
def test_linear_model_normalize_deprecation_message(estimator, normalize,
                                                    n_warnings,
                                                    warning_category):
    # check that we issue a FutureWarning when normalize was set in
    # linear model
    rng = check_random_state(0)
    n_samples = 200
    n_features = 2
    X = rng.randn(n_samples, n_features)
    X[X < 0.1] = 0.0
    y = rng.rand(n_samples)
    if is_classifier(estimator):
        y = np.sign(y)
Code Example #19
import matplotlib
import matplotlib.pyplot as plt

from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeCV
from sklearn.compose import TransformedTargetRegressor
from sklearn.metrics import median_absolute_error, r2_score
from sklearn.utils.fixes import parse_version

# %%
# Synthetic example
##############################################################################

# `normed` is being deprecated in favor of `density` in histograms
if parse_version(matplotlib.__version__) >= parse_version("2.1"):
    density_param = {"density": True}
else:
    density_param = {"normed": True}

# %%
# A synthetic random regression dataset is generated. The targets ``y`` are
# modified by:
#
#   1. translating all targets such that all entries are
#      non-negative (by adding the absolute value of the lowest ``y``) and
#   2. applying an exponential function to obtain non-linear
#      targets which cannot be fitted using a simple linear model.
#
# Therefore, a logarithmic (`np.log1p`) and an exponential function
# (`np.expm1`) will be used to transform the targets before training a linear
Code Example #20
V = rng.random_sample((d, d))
VI = np.dot(V, V.T)


METRICS_DEFAULT_PARAMS = [
    ("euclidean", {}),
    ("cityblock", {}),
    ("minkowski", dict(p=(1, 1.5, 2, 3))),
    ("chebyshev", {}),
    ("seuclidean", dict(V=(rng.random_sample(d),))),
    ("mahalanobis", dict(VI=(VI,))),
    ("hamming", {}),
    ("canberra", {}),
    ("braycurtis", {}),
]
if sp_version >= parse_version("1.8.0.dev0"):
    # Starting from scipy 1.8.0.dev0, minkowski now accepts w, the weighting
    # parameter directly and using it is preferred over using wminkowski.
    METRICS_DEFAULT_PARAMS.append(
        ("minkowski", dict(p=(1, 1.5, 3), w=(rng.random_sample(d),))),
    )
else:
    # For previous versions of scipy, this was possible through a dedicated
    # metric (deprecated in 1.6 and removed in 1.8).
    METRICS_DEFAULT_PARAMS.append(
        ("wminkowski", dict(p=(1, 1.5, 3), w=(rng.random_sample(d),))),
    )


def check_cdist(metric, kwargs, X1, X2):
    if metric == "wminkowski":
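The helper is truncated right at the wminkowski branch; a sketch of how that branch might handle the SciPy deprecation described in the comments above (scipy.spatial.distance.cdist, sklearn's DistanceMetric and assert_allclose are assumed to be imported earlier in the file):

        # wminkowski emits a DeprecationWarning on scipy >= 1.6 before its removal in 1.8.
        if sp_version >= parse_version("1.6.0"):
            with pytest.warns(DeprecationWarning):
                D_scipy = cdist(X1, X2, metric, **kwargs)
        else:
            D_scipy = cdist(X1, X2, metric, **kwargs)
    else:
        D_scipy = cdist(X1, X2, metric, **kwargs)

    # Compare sklearn's DistanceMetric against scipy's cdist.
    dm = DistanceMetric.get_metric(metric, **kwargs)
    assert_allclose(dm.pairwise(X1, X2), D_scipy, atol=1e-10)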
Code Example #21
import numpy as np
from scipy.ndimage.filters import gaussian_filter

import matplotlib.pyplot as plt

import skimage
from skimage.data import coins
from skimage.transform import rescale

from sklearn.feature_extraction.image import grid_to_graph
from sklearn.cluster import AgglomerativeClustering
from sklearn.utils.fixes import parse_version

# these were introduced in skimage-0.14
if parse_version(skimage.__version__) >= parse_version("0.14"):
    rescale_params = {"anti_aliasing": False, "multichannel": False}
else:
    rescale_params = {}

# #############################################################################
# Generate data
orig_coins = coins()

# Resize it to 20% of the original size to speed up the processing
# Applying a Gaussian filter for smoothing prior to down-scaling
# reduces aliasing artifacts.
smoothened_coins = gaussian_filter(orig_coins, sigma=2)
rescaled_coins = rescale(smoothened_coins,
                         0.2,
                         mode="reflect",
Code Example #22
    'loss, x0, y_true',
    [
        ('least_squares', -2., 42),
        ('least_squares', 117., 1.05),
        ('least_squares', 0., 0.),
        # The argmin of binary_crossentropy for y_true=0 and y_true=1 is resp. -inf
        # and +inf due to logit, cf. "complete separation". Therefore, we use
        # 0 < y_true < 1.
        ('binary_crossentropy', 0.3, 0.1),
        ('binary_crossentropy', -12, 0.2),
        ('binary_crossentropy', 30, 0.9),
        ('poisson', 12., 1.),
        ('poisson', 0., 2.),
        ('poisson', -22., 10.),
    ])
@pytest.mark.skipif(sp_version == parse_version('1.2.0'),
                    reason='bug in scipy 1.2.0, see scipy issue #9608')
@skip_if_32bit
def test_derivatives(loss, x0, y_true):
    # Check that gradients are zero when the loss is minimized on a single
    # value/sample using Halley's method with the first and second order
    # derivatives computed by the Loss instance.
    # Note that methods of Loss instances operate on arrays while the newton
    # root finder expects a scalar or a one-element array for this purpose.

    loss = _LOSSES[loss](sample_weight=None)
    y_true = np.array([y_true], dtype=Y_DTYPE)
    x0 = np.array([x0], dtype=Y_DTYPE).reshape(1, 1)
    get_gradients, get_hessians = get_derivatives_helper(loss)

    def func(x: np.ndarray) -> np.ndarray:
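The excerpt ends at the function signature; based on the comments, the function presumably returns the pointwise loss, and its first and second derivatives are handed to scipy.optimize.newton (passing fprime2 selects Halley's method). A hedged sketch of that continuation; `pointwise_loss` and the exact newton arguments are assumptions, not verbatim project code:

        return loss.pointwise_loss(y_true, x)

    def fprime(x: np.ndarray) -> np.ndarray:
        return get_gradients(y_true, x)

    def fprime2(x: np.ndarray) -> np.ndarray:
        return get_hessians(y_true, x)

    optimum = newton(func, x0=x0, fprime=fprime, fprime2=fprime2,
                     maxiter=70, tol=2e-8)
    # The gradient must vanish at the loss minimum.
    assert np.allclose(get_gradients(y_true, optimum), 0, atol=1e-7)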