("squared_error", -2.0, 42), ("squared_error", 117.0, 1.05), ("squared_error", 0.0, 0.0), # The argmin of binomial_loss for y_true=0 and y_true=1 is resp. # -inf and +inf due to logit, cf. "complete separation". Therefore, we # use 0 < y_true < 1. ("binomial_loss", 0.3, 0.1), ("binomial_loss", -12, 0.2), ("binomial_loss", 30, 0.9), ("poisson_loss", 12.0, 1.0), ("poisson_loss", 0.0, 2.0), ("poisson_loss", -22.0, 10.0), ], ) @pytest.mark.skipif( sp_version == parse_version("1.2.0"), reason="bug in scipy 1.2.0, see scipy issue #9608", ) @skip_if_32bit def test_derivatives(loss, x0, y_true): """Test that gradients are zero at the minimum of the loss. We check this on a single value/sample using Halley's method with the first and second order derivatives computed by the Loss instance. Note that methods of Loss instances operate on arrays while the newton root finder expects a scalar or a one-element array for this purpose. """ loss = _LOSSES[loss](sample_weight=None) y_true = np.array([y_true], dtype=np.float64) x0 = np.array([x0], dtype=np.float64)
# Generate a signal y = np.linspace(0, resolution - 1, resolution) first_quarter = y < resolution / 4 y[first_quarter] = 3.0 y[np.logical_not(first_quarter)] = -1.0 # List the different sparse coding methods in the following format: # (title, transform_algorithm, transform_alpha, # transform_n_nozero_coefs, color) estimators = [ ("OMP", "omp", None, 15, "navy"), ("Lasso", "lasso_lars", 2, None, "turquoise"), ] lw = 2 # Avoid FutureWarning about default value change when numpy >= 1.14 lstsq_rcond = None if np_version >= parse_version("1.14") else -1 plt.figure(figsize=(13, 6)) for subplot, (D, title) in enumerate( zip((D_fixed, D_multi), ("fixed width", "multiple widths"))): plt.subplot(1, 2, subplot + 1) plt.title("Sparse coding against %s dictionary" % title) plt.plot(y, lw=lw, linestyle="--", label="Original signal") # Do a wavelet approximation for title, algo, alpha, n_nonzero, color in estimators: coder = SparseCoder( dictionary=D, transform_n_nonzero_coefs=n_nonzero, transform_alpha=alpha, transform_algorithm=algo, )
# doc/modules/clustering.rst and use sklearn from the local folder rather than # the one from site-packages. import platform import sys import pytest from _pytest.doctest import DoctestItem from sklearn.utils import _IS_32BIT from sklearn.externals import _pilutil from sklearn._min_dependencies import PYTEST_MIN_VERSION from sklearn.utils.fixes import np_version, parse_version if parse_version(pytest.__version__) < parse_version(PYTEST_MIN_VERSION): raise ImportError('Your version of pytest is too old, you should have ' 'at least pytest >= {} installed.' .format(PYTEST_MIN_VERSION)) def pytest_addoption(parser): parser.addoption("--skip-network", action="store_true", default=False, help="skip network tests") def pytest_collection_modifyitems(config, items): for item in items: # FeatureHasher is not compatible with PyPy if (item.name.endswith(('_hash.FeatureHasher', 'text.HashingVectorizer'))
""" Class and functions to segment cells. """ import bigfish.stack as stack from .utils import thresholding from .postprocess import label_instances from .postprocess import clean_segmentation import numpy as np from scipy import ndimage as ndi import skimage from sklearn.utils.fixes import parse_version if parse_version(skimage.__version__) < parse_version("0.17.0"): from skimage.morphology import watershed else: from skimage.segmentation import watershed # ### Unet models ### def unet_distance_edge_double(): """Load a pretrained Unet model to predict foreground and a distance map to edge from nucleus and cell images. Returns ------- model : ``tensorflow.keras.model`` object Pretrained Unet model.
def pytest_collection_modifyitems(config, items):
    """Adjust the collected test items once collection has finished.

    Three things happen, in order: items requesting a dataset-fetching
    fixture are either skipped or get their dataset downloaded up front;
    items matching the PyPy / ARM64 name patterns below are marked; and
    doctests (or the pillow-dependent image doctests) are skipped on setups
    where they are not run.

    Parameters
    ----------
    config : pytest config
    items : list of collected items
    """
    network_enabled = environ.get('SKLEARN_SKIP_NETWORK_TESTS', '1') == '0'
    network_skip_marker = pytest.mark.skip(
        reason="test is enabled when SKLEARN_SKIP_NETWORK_TESTS=0")

    # download datasets during collection to avoid thread unsafe behavior
    # when running pytest in parallel with pytest-xdist
    fetcher_names = set(dataset_fetchers)
    pending_downloads = set()

    for item in items:
        if not hasattr(item, "fixturenames"):
            continue
        requested = set(item.fixturenames) & fetcher_names
        if not requested:
            continue
        if not network_enabled:
            # network tests are skipped
            item.add_marker(network_skip_marker)
        else:
            pending_downloads.update(requested)

    # Only download datasets on the first worker spawned by pytest-xdist
    # to avoid thread unsafe behavior. If pytest-xdist is not used, we still
    # download before tests run.
    xdist_worker = environ.get("PYTEST_XDIST_WORKER", "gw0")
    if xdist_worker == "gw0" and network_enabled:
        for dataset_name in pending_downloads:
            dataset_fetchers[dataset_name]()

    # The platform does not change while iterating, so query it once.
    on_pypy = platform.python_implementation() == 'PyPy'
    on_arm64 = platform.machine() == 'aarch64'
    hasher_suffixes = ('_hash.FeatureHasher', 'text.HashingVectorizer')

    for item in items:
        # FeatureHasher is not compatible with PyPy
        if item.name.endswith(hasher_suffixes) and on_pypy:
            item.add_marker(pytest.mark.skip(
                reason='FeatureHasher is not compatible with PyPy'))
        # Known failure on with GradientBoostingClassifier on ARM64
        elif item.name.endswith('GradientBoostingClassifier') and on_arm64:
            item.add_marker(pytest.mark.xfail(
                reason=(
                    'know failure. See '
                    'https://github.com/scikit-learn/scikit-learn/issues/17797'  # noqa
                )
            ))

    # numpy changed the str/repr formatting of numpy arrays in 1.14. We want to
    # run doctests only for numpy >= 1.14.
    # A reason of None means "do not skip doctests".
    doctest_skip_reason = None
    try:
        if np_version < parse_version('1.14'):
            doctest_skip_reason = 'doctests are only run for numpy >= 1.14'
        elif _IS_32BIT:
            doctest_skip_reason = (
                'doctest are only run when the default numpy int is '
                '64 bits.')
        elif sys.platform.startswith("win32"):
            doctest_skip_reason = (
                "doctests are not run for Windows because numpy arrays "
                "repr is inconsistent across platforms.")
    except ImportError:
        # NOTE(review): nothing visible in this block looks able to raise
        # ImportError; the handler is kept as-is -- confirm before removing.
        pass

    if doctest_skip_reason is not None:
        doctest_marker = pytest.mark.skip(reason=doctest_skip_reason)
        for item in items:
            if isinstance(item, DoctestItem):
                item.add_marker(doctest_marker)
    elif not _pilutil.pillow_installed:
        # Doctests do run here, but these two are skipped without pillow.
        pillow_marker = pytest.mark.skip(
            reason="pillow (or PIL) not installed!")
        pillow_doctests = (
            "sklearn.feature_extraction.image.PatchExtractor",
            "sklearn.feature_extraction.image.extract_patches_2d",
        )
        for item in items:
            if item.name in pillow_doctests:
                item.add_marker(pillow_marker)
extrapolation="periodic", ), ), ("ols", LinearRegression(fit_intercept=intercept)), ]) pipe.fit(X, f(X[:, 0])) # Generate larger array to check periodic extrapolation X_ = np.linspace(-1, 2, 301)[:, None] predictions = pipe.predict(X_) assert_allclose(predictions, f(X_[:, 0]), atol=0.01, rtol=0.01) assert_allclose(predictions[0:100], predictions[100:200], rtol=1e-3) @pytest.mark.skipif( sp_version < parse_version("1.0.0"), reason="Periodic extrapolation not yet implemented for BSpline.", ) def test_spline_transformer_periodic_spline_backport(): """Test that the backport of extrapolate="periodic" works correctly""" X = np.linspace(-2, 3.5, 10)[:, None] degree = 2 # Use periodic extrapolation backport in SplineTransformer transformer = SplineTransformer(degree=degree, extrapolation="periodic", knots=[[-1.0], [0.0], [1.0]]) Xt = transformer.fit_transform(X) # Use periodic extrapolation in BSpline coef = np.array([[1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0]])
print(__doc__) ############################################################################### # Synthetic example ############################################################################### from sklearn.datasets import make_regression from sklearn.model_selection import train_test_split from sklearn.linear_model import RidgeCV from sklearn.compose import TransformedTargetRegressor from sklearn.metrics import median_absolute_error, r2_score from sklearn.utils.fixes import parse_version # `normed` is being deprecated in favor of `density` in histograms if parse_version(matplotlib.__version__) >= parse_version('2.1'): density_param = {'density': True} else: density_param = {'normed': True} ############################################################################### # A synthetic random regression problem is generated. The targets ``y`` are # modified by: (i) translating all targets such that all entries are # non-negative and (ii) applying an exponential function to obtain non-linear # targets which cannot be fitted using a simple linear model. # # Therefore, a logarithmic (`np.log1p`) and an exponential function # (`np.expm1`) will be used to transform the targets before training a linear # regression model and using it for prediction. X, y = make_regression(n_samples=10000, noise=100, random_state=0)
{ "solver_options": "blah" }, "Invalid value for argument solver_options", ), ], ) def test_init_parameters_validation(X_y_data, params, err_msg): """Test that invalid init parameters raise errors.""" X, y = X_y_data with pytest.raises(ValueError, match=err_msg): QuantileRegressor(**params).fit(X, y) @pytest.mark.parametrize("solver", ("highs-ds", "highs-ipm", "highs")) @pytest.mark.skipif(sp_version >= parse_version('1.6.0'), reason="Solvers are available as of scipy 1.6.0") def test_too_new_solver_methods_raise_error(X_y_data, solver): """Test that highs solver raises for scipy<1.6.0.""" X, y = X_y_data with pytest.raises(ValueError, match="scipy>=1.6.0"): QuantileRegressor(solver=solver).fit(X, y) @pytest.mark.parametrize( "quantile, alpha, intercept, coef", [ # for 50% quantile w/o regularization, any slope in [1, 10] is okay [0.5, 0, 1, None], # if positive error costs more, the slope is maximal [0.51, 0, 1, 10],
{ "solver_options": "blah" }, "Invalid value for argument solver_options", ), ], ) def test_init_parameters_validation(X_y_data, params, err_msg): """Test that invalid init parameters raise errors.""" X, y = X_y_data with pytest.raises(ValueError, match=err_msg): QuantileRegressor(**params).fit(X, y) @pytest.mark.skipif( sp_version < parse_version("1.3.0"), reason="Solver 'revised simplex' is only available with of scipy>=1.3.0", ) @pytest.mark.parametrize("solver", ["interior-point", "revised simplex"]) def test_incompatible_solver_for_sparse_input(X_y_data, solver): X, y = X_y_data X_sparse = sparse.csc_matrix(X) err_msg = ( f"Solver {solver} does not support sparse X. Use solver 'highs' for example." ) with pytest.raises(ValueError, match=err_msg): QuantileRegressor(solver=solver).fit(X_sparse, y) @pytest.mark.parametrize("solver", ("highs-ds", "highs-ipm", "highs")) @pytest.mark.skipif(
clust = OPTICS(min_cluster_size=len(X) + 1) with pytest.raises(ValueError, match="must be no greater than the "): clust.fit(X) def test_processing_order(): # Ensure that we consider all unprocessed points, # not only direct neighbors. when picking the next point. Y = [[0], [10], [-10], [25]] clust = OPTICS(min_samples=3, max_eps=15).fit(Y) assert_array_equal(clust.reachability_, [np.inf, 10, 10, 15]) assert_array_equal(clust.core_distances_, [10, 15, np.inf, np.inf]) assert_array_equal(clust.ordering_, [0, 1, 2, 3]) @pytest.mark.skipif(sp_version >= parse_version("1.6.0") and (platform.machine() == "aarch64" or (sys.platform == "linux" and _IS_32BIT)), reason=("Test fails for SciPy 1.6.0 on ARM and on 32-bit " "linux. See #19111")) def test_compare_to_ELKI(): # Expected values, computed with (future) ELKI 0.7.5 using: # java -jar elki.jar cli -dbc.in csv -dbc.filter FixedDBIDsFilter # -algorithm clustering.optics.OPTICSHeap -optics.minpts 5 # where the FixedDBIDsFilter gives 0-indexed ids. r1 = [ np.inf, 1.0574896366427478, 0.7587934993548423, 0.7290174038973836, 0.7290174038973836, 0.7290174038973836, 0.6861627576116127, 0.7587934993548423, 0.9280118450166668, 1.1748022534146194, 3.3355455741292257, 0.49618389254482587, 0.2552805046961355, 0.2552805046961355, 0.24944622248445714, 0.24944622248445714,
# and :class:`~sklearn.linear_model.LinearRegression`. # # Fitting a `QuantileRegressor` # ----------------------------- # # In this section, we want to estimate the conditional median as well as # a low and high quantile fixed at 5% and 95%, respectively. Thus, we will get # three linear models, one for each quantile. # # We will use the quantiles at 5% and 95% to find the outliers in the training # sample beyond the central 90% interval. from sklearn.utils.fixes import sp_version, parse_version # This is line is to avoid incompatibility if older SciPy version. # You should use `solver="highs"` with recent version of SciPy. solver = "highs" if sp_version >= parse_version("1.6.0") else "interior-point" # %% from sklearn.linear_model import QuantileRegressor quantiles = [0.05, 0.5, 0.95] predictions = {} out_bounds_predictions = np.zeros_like(y_true_mean, dtype=np.bool_) for quantile in quantiles: qr = QuantileRegressor(quantile=quantile, alpha=0, solver=solver) y_pred = qr.fit(X, y_normal).predict(X) predictions[quantile] = y_pred if quantile == min(quantiles): out_bounds_predictions = np.logical_or(out_bounds_predictions, y_pred >= y_normal)
def plot_kde_1d():
    """Draw three 1D kernel density illustration figures and show them.

    The figures are, in order: two histograms next to a tophat and a
    Gaussian kernel density estimate of the same sample; the shape of each
    kernel listed below, fitted on a single point at zero; and three kernel
    density estimates overlaid on the mixture density used as reference.
    """
    # `normed` is being deprecated in favor of `density` in histograms
    has_density_kwarg = (
        parse_version(matplotlib.__version__) >= parse_version('2.1'))
    density_param = (
        {'density': True} if has_density_kwarg else {'normed': True})

    # ----------------------------------------------------------------------
    # Plot the progression of histograms to kernels
    np.random.seed(1)
    N = 20
    # The two draws must stay in this order to reproduce the same sample.
    first_mode = np.random.normal(0, 1, int(0.3 * N))
    second_mode = np.random.normal(5, 1, int(0.7 * N))
    X = np.concatenate((first_mode, second_mode))[:, np.newaxis]
    X_plot = np.linspace(-5, 10, 1000)[:, np.newaxis]
    bins = np.linspace(-5, 10, 10)

    fig, ax = plt.subplots(2, 2, sharex=True, sharey=True)
    fig.subplots_adjust(hspace=0.05, wspace=0.05)

    # top row: the same histogram with two different bin offsets
    histogram_panels = (
        (ax[0, 0], bins, "Histogram"),
        (ax[0, 1], bins + 0.75, "Histogram, bins shifted"),
    )
    for panel, edges, caption in histogram_panels:
        panel.hist(X[:, 0], bins=edges, fc='#AAAAFF', **density_param)
        panel.text(-3.5, 0.31, caption)

    # bottom row: tophat KDE, then Gaussian KDE
    kde_panels = (
        (ax[1, 0], 'tophat', "Tophat Kernel Density"),
        (ax[1, 1], 'gaussian', "Gaussian Kernel Density"),
    )
    for panel, kernel_name, caption in kde_panels:
        estimator = KernelDensity(kernel=kernel_name, bandwidth=0.75).fit(X)
        log_density = estimator.score_samples(X_plot)
        panel.fill(X_plot[:, 0], np.exp(log_density), fc='#AAAAFF')
        panel.text(-3.5, 0.31, caption)

    for panel in ax.ravel():
        # mark the sample positions just below the x axis
        panel.plot(X[:, 0], np.full(X.shape[0], -0.01), '+k')
        panel.set_xlim(-4, 9)
        panel.set_ylim(-0.02, 0.34)

    for panel in ax[:, 0]:
        panel.set_ylabel('Normalized Density')

    for panel in ax[1, :]:
        panel.set_xlabel('x')

    # ----------------------------------------------------------------------
    # Plot all available kernels
    X_plot = np.linspace(-6, 6, 1000)[:, None]
    X_src = np.zeros((1, 1))

    fig, ax = plt.subplots(2, 3, sharex=True, sharey=True)
    fig.subplots_adjust(left=0.05, right=0.95, hspace=0.05, wspace=0.05)

    def format_func(x, loc):
        # Tick labels are written as multiples of "h".
        for tick, label in ((0, '0'), (1, 'h'), (-1, '-h')):
            if x == tick:
                return label
        return '%ih' % x

    kernel_names = [
        'gaussian', 'tophat', 'epanechnikov',
        'exponential', 'linear', 'cosine',
    ]
    for panel, kernel_name in zip(ax.ravel(), kernel_names):
        log_density = KernelDensity(
            kernel=kernel_name).fit(X_src).score_samples(X_plot)
        panel.fill(X_plot[:, 0], np.exp(log_density), '-k', fc='#AAAAFF')
        panel.text(-2.6, 0.95, kernel_name)

        panel.xaxis.set_major_formatter(plt.FuncFormatter(format_func))
        panel.xaxis.set_major_locator(plt.MultipleLocator(1))
        panel.yaxis.set_major_locator(plt.NullLocator())

        panel.set_ylim(0, 1.05)
        panel.set_xlim(-2.9, 2.9)

    ax[0, 1].set_title('Available Kernels')

    # ----------------------------------------------------------------------
    # Plot a 1D density example
    N = 100
    np.random.seed(1)
    first_mode = np.random.normal(0, 1, int(0.3 * N))
    second_mode = np.random.normal(5, 1, int(0.7 * N))
    X = np.concatenate((first_mode, second_mode))[:, np.newaxis]

    X_plot = np.linspace(-5, 10, 1000)[:, np.newaxis]

    # Reference curve: the same 0.3 / 0.7 mixture of N(0, 1) and N(5, 1).
    true_dens = (0.3 * norm(0, 1).pdf(X_plot[:, 0])
                 + 0.7 * norm(5, 1).pdf(X_plot[:, 0]))

    fig, ax = plt.subplots()
    ax.fill(X_plot[:, 0], true_dens, fc='black', alpha=0.2,
            label='input distribution')

    line_width = 2
    styled_kernels = (
        ('navy', 'gaussian'),
        ('cornflowerblue', 'tophat'),
        ('darkorange', 'epanechnikov'),
    )
    for color, kernel_name in styled_kernels:
        estimator = KernelDensity(kernel=kernel_name, bandwidth=0.5).fit(X)
        log_density = estimator.score_samples(X_plot)
        ax.plot(X_plot[:, 0], np.exp(log_density), color=color,
                lw=line_width, linestyle='-',
                label="kernel = '{0}'".format(kernel_name))

    ax.text(6, 0.38, "N={0} points".format(N))

    ax.legend(loc='upper left')
    # sample positions, with random vertical jitter below the axis
    ax.plot(X[:, 0], -0.005 - 0.01 * np.random.random(X.shape[0]), '+k')

    ax.set_xlim(-4, 9)
    ax.set_ylim(-0.02, 0.4)
    plt.show()
for w in (10, 50, 100, 500, 1000))] # Generate a signal y = np.linspace(0, resolution - 1, resolution) first_quarter = y < resolution / 4 y[first_quarter] = 3. y[np.logical_not(first_quarter)] = -1. # List the different sparse coding methods in the following format: # (title, transform_algorithm, transform_alpha, # transform_n_nozero_coefs, color) estimators = [('OMP', 'omp', None, 15, 'navy'), ('Lasso', 'lasso_lars', 2, None, 'turquoise'), ] lw = 2 # Avoid FutureWarning about default value change when numpy >= 1.14 lstsq_rcond = None if np_version >= parse_version('1.14') else -1 plt.figure(figsize=(13, 6)) for subplot, (D, title) in enumerate(zip((D_fixed, D_multi), ('fixed width', 'multiple widths'))): plt.subplot(1, 2, subplot + 1) plt.title('Sparse coding against %s dictionary' % title) plt.plot(y, lw=lw, linestyle='--', label='Original signal') # Do a wavelet approximation for title, algo, alpha, n_nonzero, color in estimators: coder = SparseCoder(dictionary=D, transform_n_nonzero_coefs=n_nonzero, transform_alpha=alpha, transform_algorithm=algo) x = coder.transform(y.reshape(1, -1)) density = len(np.flatnonzero(x)) x = np.ravel(np.dot(x, D)) squared_error = np.sum((y - x) ** 2)
def default_solver():
    """Return the solver name to use by default for the SciPy in use.

    Returns
    -------
    str
        ``"highs"`` when ``sp_version`` is at least 1.6.0, otherwise
        ``"interior-point"``.
    """
    if sp_version >= parse_version("1.6.0"):
        return "highs"
    return "interior-point"
import numpy as np from scipy.ndimage.filters import gaussian_filter import matplotlib.pyplot as plt import skimage from skimage.data import coins from skimage.transform import rescale from sklearn.feature_extraction.image import grid_to_graph from sklearn.cluster import AgglomerativeClustering from sklearn.utils.fixes import parse_version # these were introduced in skimage-0.14 if parse_version(skimage.__version__) >= parse_version('0.14'): rescale_params = {'anti_aliasing': False, 'multichannel': False} else: rescale_params = {} def plot_coin_ward_segmentation(): # Generate data orig_coins = coins() # Resize it to 20% of the original size to speed up the processing # Applying a Gaussian filter for smoothing prior to down-scaling # reduces aliasing artifacts. smoothened_coins = gaussian_filter(orig_coins, sigma=2) rescaled_coins = rescale(smoothened_coins, 0.2,
# mypy error: Variable "DEFAULT_JOBLIB_BACKEND" is not valid type
class MyBackend(DEFAULT_JOBLIB_BACKEND):  # type: ignore
    """Backend that counts how many times ``start_call`` is invoked."""

    def __init__(self, *args, **kwargs):
        # The counter is set before delegating to the parent initializer.
        self.count = 0
        super().__init__(*args, **kwargs)

    def start_call(self):
        """Record one more call, then defer to the parent implementation."""
        self.count += 1
        return super().start_call()


joblib.register_parallel_backend('testing', MyBackend)


@pytest.mark.skipif(
    parse_version(joblib.__version__) < parse_version('0.12'),
    reason='tests not yet supported in joblib <0.12',
)
@skip_if_no_parallel
def test_backend_respected():
    """Check that ``fit`` goes through the 'testing' backend.

    ``predict_proba`` is expected not to trigger any ``start_call`` on it.
    """
    forest = RandomForestClassifier(n_estimators=10, n_jobs=2)

    with joblib.parallel_backend("testing") as (fit_backend, _):
        forest.fit(X, y)

    assert fit_backend.count > 0

    # predict_proba requires shared memory. Ensure that's honored.
    with joblib.parallel_backend("testing") as (predict_backend, _):
        forest.predict_proba(X)

    assert predict_backend.count == 0
def test_pipeline_memory():
    """Check that a pipeline with ``memory`` matches an uncached pipeline.

    The cached pipeline is fitted twice, and a second cached pipeline with
    a renamed transformer step is fitted once. Each time the predictions,
    scores and transformer ``means_`` must equal those of the plain
    pipeline, and the transformer timestamp must not change after the
    first fit.
    """
    X, y = iris.data, iris.target
    cachedir = mkdtemp()
    try:
        # Deal with change of API in joblib
        if parse_version(joblib.__version__) < parse_version('0.12'):
            memory_kwargs = {'cachedir': cachedir}
        else:
            memory_kwargs = {'location': cachedir}
        memory = joblib.Memory(verbose=10, **memory_kwargs)

        # Test with Transformer + SVC
        clf = SVC(probability=True, random_state=0)
        transf = DummyTransf()
        pipe = Pipeline([('transf', clone(transf)), ('svc', clf)])
        cached_pipe = Pipeline([('transf', transf), ('svc', clf)],
                               memory=memory)

        def check_identical_results(candidate, transf_step):
            # Compare `candidate` with the uncached reference `pipe`.
            for method in ('predict', 'predict_proba', 'predict_log_proba'):
                assert_array_equal(getattr(pipe, method)(X),
                                   getattr(candidate, method)(X))
            assert_array_equal(pipe.score(X, y), candidate.score(X, y))
            assert_array_equal(pipe.named_steps['transf'].means_,
                               candidate.named_steps[transf_step].means_)

        # Memoize the transformer at the first fit
        cached_pipe.fit(X, y)
        pipe.fit(X, y)
        # Get the time stamp of the transformer in the cached pipeline
        ts = cached_pipe.named_steps['transf'].timestamp_
        # Check that cached_pipe and pipe yield identical results
        check_identical_results(cached_pipe, 'transf')
        assert not hasattr(transf, 'means_')

        # Check that we are reading the cache while fitting
        # a second time
        cached_pipe.fit(X, y)
        # Check that cached_pipe and pipe yield identical results
        check_identical_results(cached_pipe, 'transf')
        assert ts == cached_pipe.named_steps['transf'].timestamp_

        # Create a new pipeline with cloned estimators
        # Check that even changing the name step does not affect the cache hit
        clf_2 = SVC(probability=True, random_state=0)
        transf_2 = DummyTransf()
        cached_pipe_2 = Pipeline([('transf_2', transf_2), ('svc', clf_2)],
                                 memory=memory)
        cached_pipe_2.fit(X, y)

        # Check that cached_pipe and pipe yield identical results
        check_identical_results(cached_pipe_2, 'transf_2')
        assert ts == cached_pipe_2.named_steps['transf_2'].timestamp_
    finally:
        shutil.rmtree(cachedir)
) @pytest.mark.parametrize( "estimator", [ LinearRegression, Ridge, RidgeCV, RidgeClassifier, RidgeClassifierCV, BayesianRidge, ARDRegression, ], ) # FIXME remove test in 1.2 @pytest.mark.xfail( sys.platform == "darwin" and np_version < parse_version("1.22"), reason="https://github.com/scikit-learn/scikit-learn/issues/21395", ) def test_linear_model_normalize_deprecation_message(estimator, normalize, n_warnings, warning_category): # check that we issue a FutureWarning when normalize was set in # linear model rng = check_random_state(0) n_samples = 200 n_features = 2 X = rng.randn(n_samples, n_features) X[X < 0.1] = 0.0 y = rng.rand(n_samples) if is_classifier(estimator): y = np.sign(y)
import matplotlib import matplotlib.pyplot as plt from sklearn.datasets import make_regression from sklearn.model_selection import train_test_split from sklearn.linear_model import RidgeCV from sklearn.compose import TransformedTargetRegressor from sklearn.metrics import median_absolute_error, r2_score from sklearn.utils.fixes import parse_version # %% # Synthetic example ############################################################################## # `normed` is being deprecated in favor of `density` in histograms if parse_version(matplotlib.__version__) >= parse_version("2.1"): density_param = {"density": True} else: density_param = {"normed": True} # %% # A synthetic random regression dataset is generated. The targets ``y`` are # modified by: # # 1. translating all targets such that all entries are # non-negative (by adding the absolute value of the lowest ``y``) and # 2. applying an exponential function to obtain non-linear # targets which cannot be fitted using a simple linear model. # # Therefore, a logarithmic (`np.log1p`) and an exponential function # (`np.expm1`) will be used to transform the targets before training a linear
V = rng.random_sample((d, d)) VI = np.dot(V, V.T) METRICS_DEFAULT_PARAMS = [ ("euclidean", {}), ("cityblock", {}), ("minkowski", dict(p=(1, 1.5, 2, 3))), ("chebyshev", {}), ("seuclidean", dict(V=(rng.random_sample(d),))), ("mahalanobis", dict(VI=(VI,))), ("hamming", {}), ("canberra", {}), ("braycurtis", {}), ] if sp_version >= parse_version("1.8.0.dev0"): # Starting from scipy 1.8.0.dev0, minkowski now accepts w, the weighting # parameter directly and using it is preferred over using wminkowski. METRICS_DEFAULT_PARAMS.append( ("minkowski", dict(p=(1, 1.5, 3), w=(rng.random_sample(d),))), ) else: # For previous versions of scipy, this was possible through a dedicated # metric (deprecated in 1.6 and removed in 1.8). METRICS_DEFAULT_PARAMS.append( ("wminkowski", dict(p=(1, 1.5, 3), w=(rng.random_sample(d),))), ) def check_cdist(metric, kwargs, X1, X2): if metric == "wminkowski":
import numpy as np from scipy.ndimage.filters import gaussian_filter import matplotlib.pyplot as plt import skimage from skimage.data import coins from skimage.transform import rescale from sklearn.feature_extraction.image import grid_to_graph from sklearn.cluster import AgglomerativeClustering from sklearn.utils.fixes import parse_version # these were introduced in skimage-0.14 if parse_version(skimage.__version__) >= parse_version("0.14"): rescale_params = {"anti_aliasing": False, "multichannel": False} else: rescale_params = {} # ############################################################################# # Generate data orig_coins = coins() # Resize it to 20% of the original size to speed up the processing # Applying a Gaussian filter for smoothing prior to down-scaling # reduces aliasing artifacts. smoothened_coins = gaussian_filter(orig_coins, sigma=2) rescaled_coins = rescale(smoothened_coins, 0.2, mode="reflect",
'loss, x0, y_true', [ ('least_squares', -2., 42), ('least_squares', 117., 1.05), ('least_squares', 0., 0.), # The argmin of binary_crossentropy for y_true=0 and y_true=1 is resp. -inf # and +inf due to logit, cf. "complete separation". Therefore, we use # 0 < y_true < 1. ('binary_crossentropy', 0.3, 0.1), ('binary_crossentropy', -12, 0.2), ('binary_crossentropy', 30, 0.9), ('poisson', 12., 1.), ('poisson', 0., 2.), ('poisson', -22., 10.), ]) @pytest.mark.skipif(sp_version == parse_version('1.2.0'), reason='bug in scipy 1.2.0, see scipy issue #9608') @skip_if_32bit def test_derivatives(loss, x0, y_true): # Check that gradients are zero when the loss is minimized on a single # value/sample using Halley's method with the first and second order # derivatives computed by the Loss instance. # Note that methods of Loss instances operate on arrays while the newton # root finder expects a scalar or a one-element array for this purpose. loss = _LOSSES[loss](sample_weight=None) y_true = np.array([y_true], dtype=Y_DTYPE) x0 = np.array([x0], dtype=Y_DTYPE).reshape(1, 1) get_gradients, get_hessians = get_derivatives_helper(loss) def func(x: np.ndarray) -> np.ndarray: