def check_transformer_pickle(name, Transformer):
    X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]],
                      random_state=0, n_features=2, cluster_std=0.1)
    n_samples, n_features = X.shape
    X = StandardScaler().fit_transform(X)
    X -= X.min()
    # catch deprecation warnings
    with warnings.catch_warnings(record=True):
        transformer = Transformer()
    if not hasattr(transformer, 'transform'):
        return
    set_random_state(transformer)
    set_fast_parameters(transformer)

    # fit
    if name in CROSS_DECOMPOSITION:
        random_state = np.random.RandomState(seed=12345)
        y_ = np.vstack([y, 2 * y + random_state.randint(2, size=len(y))])
        y_ = y_.T
    else:
        y_ = y

    transformer.fit(X, y_)
    X_pred = transformer.fit(X, y_).transform(X)
    pickled_transformer = pickle.dumps(transformer)
    unpickled_transformer = pickle.loads(pickled_transformer)
    pickled_X_pred = unpickled_transformer.transform(X)

    assert_array_almost_equal(pickled_X_pred, X_pred)

def test_set_random_state():
    lda = LDA()
    tree = DecisionTreeClassifier()
    # LDA doesn't have random state: smoke test
    set_random_state(lda, 3)
    set_random_state(tree, 3)
    assert_equal(tree.random_state, 3)

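# The helper under test, `set_random_state`, is used throughout this file.
# A minimal sketch of its contract follows, assuming the historical
# scikit-learn testing utility: it seeds only estimators that expose a
# `random_state` parameter, which is why passing an LDA instance above is a
# pure smoke test. This is an illustrative reimplementation, not the
# library's actual code.
def _set_random_state_sketch(estimator, random_state=0):
    """Seed `estimator` if it exposes a `random_state` parameter."""
    if "random_state" in estimator.get_params():
        estimator.set_params(random_state=random_state)
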
def test_various_scoring_on_tuples_learners(estimator, build_dataset,
                                            with_preprocessor):
    """Tests that scikit-learn's scoring returns something finite for
    scorings other than the default one. (The list of scikit-learn's scores
    can be found in sklearn.metrics.scorer.) For each type of output
    (predict, predict_proba, decision_function), we test a bunch of scores.
    We only test on pairs learners because quadruplets don't have a y
    argument.
    """
    input_data, labels, preprocessor, _ = build_dataset(with_preprocessor)
    estimator = clone(estimator)
    estimator.set_params(preprocessor=preprocessor)
    set_random_state(estimator)

    # scores that need a predict function: every tuples learner should have a
    # predict function (whether the pair is of positive samples or negative
    # samples)
    for scoring in ['accuracy', 'f1']:
        check_score_is_finite(scoring, estimator, input_data, labels)
    # scores that need a predict_proba:
    if hasattr(estimator, "predict_proba"):
        for scoring in ['neg_log_loss', 'brier_score']:
            check_score_is_finite(scoring, estimator, input_data, labels)
    # scores that need a decision_function: every tuples learner should have a
    # decision function (the metric between points)
    for scoring in ['roc_auc', 'average_precision', 'precision', 'recall']:
        check_score_is_finite(scoring, estimator, input_data, labels)

def test_get_metric_works_does_not_raise(estimator, build_dataset):
    """Tests that the metric returned by get_metric does not raise errors (or
    warnings) similarly to the distance functions in scipy.spatial.distance"""
    input_data, labels, _, X = build_dataset()
    model = clone(estimator)
    set_random_state(model)
    model.fit(input_data, labels)
    metric = model.get_metric()

    list_test_get_metric_doesnt_raise = [(X[0], X[1]),
                                         (X[0].tolist(), X[1].tolist()),
                                         (X[0][None], X[1][None])]

    for u, v in list_test_get_metric_doesnt_raise:
        with pytest.warns(None) as record:
            metric(u, v)
        assert len(record) == 0

    # Test that the scalar case works
    model.transformer_ = np.array([3.1])
    metric = model.get_metric()
    for u, v in [(5, 6.7), ([5], [6.7]), ([[5]], [[6.7]])]:
        with pytest.warns(None) as record:
            metric(u, v)
        assert len(record) == 0

def test_set_random_state():
    lda = LinearDiscriminantAnalysis()
    tree = DecisionTreeClassifier()
    # Linear Discriminant Analysis doesn't have random state: smoke test
    set_random_state(lda, 3)
    set_random_state(tree, 3)
    assert_equal(tree.random_state, 3)

def check_regressors_int(name, Regressor, X, y):
    if name == 'OrthogonalMatchingPursuitCV':
        # FIXME: This test is unstable on Travis, see issue #3190.
        check_skip_travis()
    rnd = np.random.RandomState(0)
    # catch deprecation warnings
    with warnings.catch_warnings(record=True):
        # separate estimators to control random seeds
        regressor_1 = Regressor()
        regressor_2 = Regressor()
    set_random_state(regressor_1)
    set_random_state(regressor_2)

    if name in ('_PLS', 'PLSCanonical', 'PLSRegression'):
        y_ = np.vstack([y, 2 * y + rnd.randint(2, size=len(y))])
        y_ = y_.T
    else:
        y_ = y

    # fit
    regressor_1.fit(X, y_)
    pred1 = regressor_1.predict(X)
    regressor_2.fit(X, y_.astype(np.float64))
    pred2 = regressor_2.predict(X)
    assert_array_almost_equal(pred1, pred2, 2, name)

def check_classifiers_input_shapes(name, Classifier):
    iris = load_iris()
    X, y = iris.data, iris.target
    X, y = shuffle(X, y, random_state=1)
    X = StandardScaler().fit_transform(X)
    # catch deprecation warnings
    with warnings.catch_warnings(record=True):
        classifier = Classifier()
    set_fast_parameters(classifier)
    set_random_state(classifier)
    # fit
    classifier.fit(X, y)
    y_pred = classifier.predict(X)

    set_random_state(classifier)
    # Check that when a 2D y is given, a DataConversionWarning is
    # raised
    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter("always", DataConversionWarning)
        warnings.simplefilter("ignore", RuntimeWarning)
        classifier.fit(X, y[:, np.newaxis])
    msg = "expected 1 DataConversionWarning, got: %s" % (
        ", ".join([str(w_x) for w_x in w]))
    assert_equal(len(w), 1, msg)
    assert_array_equal(y_pred, classifier.predict(X))

def test_regressors_int():
    # test if regressors can cope with integer labels (by converting them to
    # float)
    regressors = all_estimators(type_filter='regressor')
    boston = load_boston()
    X, y = boston.data, boston.target
    X, y = shuffle(X, y, random_state=0)
    X = StandardScaler().fit_transform(X)
    y = np.random.randint(2, size=X.shape[0])
    for name, Reg in regressors:
        if Reg in dont_test or Reg in (CCA,):
            continue
        # catch deprecation warnings
        with warnings.catch_warnings(record=True):
            # separate estimators to control random seeds
            reg1 = Reg()
            reg2 = Reg()
        set_random_state(reg1)
        set_random_state(reg2)

        if Reg in (_PLS, PLSCanonical, PLSRegression):
            y_ = np.vstack([y, 2 * y + np.random.randint(2, size=len(y))])
            y_ = y_.T
        else:
            y_ = y

        # fit
        reg1.fit(X, y_)
        pred1 = reg1.predict(X)
        reg2.fit(X, y_.astype(np.float64))
        pred2 = reg2.predict(X)
        assert_array_almost_equal(pred1, pred2, 2, name)

def test_class_weight_classifiers():
    # test that class_weight works and that the semantics are consistent
    classifiers = all_estimators(type_filter="classifier")

    with warnings.catch_warnings(record=True):
        classifiers = [c for c in classifiers
                       if "class_weight" in c[1]().get_params().keys()]

    for n_centers in [2, 3]:
        # create a very noisy dataset
        X, y = make_blobs(centers=n_centers, random_state=0, cluster_std=20)
        X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                            test_size=0.5,
                                                            random_state=0)
        for name, Classifier in classifiers:
            if name == "NuSVC":
                # the sparse version has a parameter that doesn't do anything
                continue
            if name.endswith("NB"):
                # NaiveBayes classifiers have a somewhat different interface.
                # FIXME SOON!
                continue
            if n_centers == 2:
                class_weight = {0: 1000, 1: 0.0001}
            else:
                class_weight = {0: 1000, 1: 0.0001, 2: 0.0001}

            with warnings.catch_warnings(record=True):
                classifier = Classifier(class_weight=class_weight)
            if hasattr(classifier, "n_iter"):
                classifier.set_params(n_iter=100)

            set_random_state(classifier)
            classifier.fit(X_train, y_train)
            y_pred = classifier.predict(X_test)
            assert_greater(np.mean(y_pred == 0), 0.9)

def test_regressors_int():
    # test if regressors can cope with integer labels (by converting them to
    # float)
    regressors = all_estimators(type_filter="regressor")
    X, _ = _boston_subset()
    X = X[:50]
    rnd = np.random.RandomState(0)
    y = rnd.randint(3, size=X.shape[0])
    for name, Regressor in regressors:
        if name in dont_test or name in ("CCA",):
            continue
        # catch deprecation warnings
        with warnings.catch_warnings(record=True):
            # separate estimators to control random seeds
            regressor_1 = Regressor()
            regressor_2 = Regressor()
        set_random_state(regressor_1)
        set_random_state(regressor_2)

        if name in ("_PLS", "PLSCanonical", "PLSRegression"):
            y_ = np.vstack([y, 2 * y + rnd.randint(2, size=len(y))])
            y_ = y_.T
        else:
            y_ = y

        # fit
        regressor_1.fit(X, y_)
        pred1 = regressor_1.predict(X)
        regressor_2.fit(X, y_.astype(np.float64))
        pred2 = regressor_2.predict(X)
        assert_array_almost_equal(pred1, pred2, 2, name)

def check_regressors_train(name, Regressor, X, y):
    if name == 'OrthogonalMatchingPursuitCV':
        # FIXME: This test is unstable on Travis, see issue #3190.
        check_skip_travis()
    rnd = np.random.RandomState(0)
    # catch deprecation warnings
    with warnings.catch_warnings(record=True):
        regressor = Regressor()
    if not hasattr(regressor, 'alphas') and hasattr(regressor, 'alpha'):
        # linear regressors need to set alpha, but not generalized CV ones
        regressor.alpha = 0.01

    # raises error on malformed input for fit
    assert_raises(ValueError, regressor.fit, X, y[:-1])
    # fit
    if name in ('PLSCanonical', 'PLSRegression', 'CCA'):
        y_ = np.vstack([y, 2 * y + rnd.randint(2, size=len(y))])
        y_ = y_.T
    else:
        y_ = y
    set_random_state(regressor)
    regressor.fit(X, y_)
    regressor.predict(X)

    # TODO: find out why PLS and CCA fail. RANSAC is random
    # and furthermore assumes the presence of outliers, hence
    # skipped
    if name not in ('PLSCanonical', 'CCA', 'RANSACRegressor'):
        assert_greater(regressor.score(X, y_), 0.5)

def check_regressors_train(name, Regressor):
    X, y = _boston_subset()
    y = StandardScaler().fit_transform(y)  # X is already scaled
    y = multioutput_estimator_convert_y_2d(name, y)
    rnd = np.random.RandomState(0)
    # catch deprecation warnings
    with warnings.catch_warnings(record=True):
        regressor = Regressor()
    set_fast_parameters(regressor)
    if not hasattr(regressor, 'alphas') and hasattr(regressor, 'alpha'):
        # linear regressors need to set alpha, but not generalized CV ones
        regressor.alpha = 0.01
    if name == 'PassiveAggressiveRegressor':
        regressor.C = 0.01

    # raises error on malformed input for fit
    assert_raises(ValueError, regressor.fit, X, y[:-1])
    # fit
    if name in CROSS_DECOMPOSITION:
        y_ = np.vstack([y, 2 * y + rnd.randint(2, size=len(y))])
        y_ = y_.T
    else:
        y_ = y
    set_random_state(regressor)
    regressor.fit(X, y_)
    regressor.fit(X.tolist(), y_.tolist())
    regressor.predict(X)

    # TODO: find out why PLS and CCA fail. RANSAC is random
    # and furthermore assumes the presence of outliers, hence
    # skipped
    if name not in ('PLSCanonical', 'CCA', 'RANSACRegressor'):
        print(regressor)
        assert_greater(regressor.score(X, y_), 0.5)

def check_estimators_overwrite_params(name, Estimator, X, y):
    with warnings.catch_warnings(record=True):
        # catch deprecation warnings
        estimator = Estimator()

    if hasattr(estimator, 'batch_size'):
        # FIXME
        # for MiniBatchDictLearning
        estimator.batch_size = 1

    if name in ['GaussianRandomProjection', 'SparseRandomProjection']:
        # Due to the jl lemma and very few samples, the number
        # of components of the random matrix projection will be
        # greater than the number of features.
        # So we impose a smaller number (avoid "auto" mode)
        estimator = Estimator(n_components=1)
    elif name == "SelectKBest":
        estimator = Estimator(k=1)

    set_random_state(estimator)

    params = estimator.get_params()
    estimator.fit(X, y)
    new_params = estimator.get_params()
    for k, v in params.items():
        assert_false(np.any(new_params[k] != v),
                     "Estimator %s changes its parameter %s"
                     " from %s to %s during fit."
                     % (name, k, v, new_params[k]))

def check_class_weight_classifiers(name, Classifier):
    if name == "NuSVC":
        # the sparse version has a parameter that doesn't do anything
        raise SkipTest
    if name.endswith("NB"):
        # NaiveBayes classifiers have a somewhat different interface.
        # FIXME SOON!
        raise SkipTest

    for n_centers in [2, 3]:
        # create a very noisy dataset
        X, y = make_blobs(centers=n_centers, random_state=0, cluster_std=20)
        X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                            test_size=.5,
                                                            random_state=0)
        n_centers = len(np.unique(y_train))

        if n_centers == 2:
            class_weight = {0: 1000, 1: 0.0001}
        else:
            class_weight = {0: 1000, 1: 0.0001, 2: 0.0001}

        with warnings.catch_warnings(record=True):
            classifier = Classifier(class_weight=class_weight)
        if hasattr(classifier, "n_iter"):
            classifier.set_params(n_iter=100)
        if hasattr(classifier, "min_weight_fraction_leaf"):
            classifier.set_params(min_weight_fraction_leaf=0.01)

        set_random_state(classifier)
        classifier.fit(X_train, y_train)
        y_pred = classifier.predict(X_test)
        assert_greater(np.mean(y_pred == 0), 0.89)

def check_clustering(name, Alg):
    X, y = make_blobs(n_samples=50, random_state=1)
    X, y = shuffle(X, y, random_state=7)
    X = StandardScaler().fit_transform(X)
    n_samples, n_features = X.shape
    # catch deprecation and neighbors warnings
    with warnings.catch_warnings(record=True):
        alg = Alg()
    set_fast_parameters(alg)
    if hasattr(alg, "n_clusters"):
        alg.set_params(n_clusters=3)
    set_random_state(alg)
    if name == 'AffinityPropagation':
        alg.set_params(preference=-100)
        alg.set_params(max_iter=100)

    # fit
    alg.fit(X)
    # with lists
    alg.fit(X.tolist())

    assert_equal(alg.labels_.shape, (n_samples,))
    pred = alg.labels_
    assert_greater(adjusted_rand_score(pred, y), 0.4)
    # fit another time with ``fit_predict`` and compare results
    if name == 'SpectralClustering':
        # there is no way to make Spectral clustering deterministic :(
        return
    set_random_state(alg)
    with warnings.catch_warnings(record=True):
        pred2 = alg.fit_predict(X)
    assert_array_equal(pred, pred2)

def check_class_weight_auto_linear_classifier(name, Classifier):
    """Test class weights with non-contiguous class labels."""
    X = np.array([[-1.0, -1.0], [-1.0, 0], [-.8, -1.0],
                  [1.0, 1.0], [1.0, 0.0]])
    y = [1, 1, 1, -1, -1]

    with warnings.catch_warnings(record=True):
        classifier = Classifier()
    if hasattr(classifier, "n_iter"):
        # This is a very small dataset, default n_iter are likely to prevent
        # convergence
        classifier.set_params(n_iter=1000)
    set_random_state(classifier)

    # Let the model compute the class frequencies
    classifier.set_params(class_weight='auto')
    coef_auto = classifier.fit(X, y).coef_.copy()

    # Count each label occurrence to reweight manually
    mean_weight = (1. / 3 + 1. / 2) / 2
    class_weight = {
        1: 1. / 3 / mean_weight,
        -1: 1. / 2 / mean_weight,
    }
    classifier.set_params(class_weight=class_weight)
    coef_manual = classifier.fit(X, y).coef_.copy()

    assert_array_almost_equal(coef_auto, coef_manual)

def check_samplers_pandas(name, Sampler):
    pd = pytest.importorskip("pandas")
    # Check that the samplers handle pandas dataframe and pandas series
    X, y = make_classification(n_samples=1000, n_classes=3, n_informative=4,
                               weights=[0.2, 0.3, 0.5], random_state=0)
    X_pd = pd.DataFrame(X)
    sampler = Sampler()
    if isinstance(Sampler(), SMOTE):
        samplers = [
            Sampler(random_state=0, kind=kind)
            for kind in ('regular', 'borderline1', 'borderline2', 'svm')
        ]
    elif isinstance(Sampler(), NearMiss):
        samplers = [Sampler(version=version) for version in (1, 2, 3)]
    else:
        samplers = [Sampler()]

    for sampler in samplers:
        # FIXME: in 0.6 set the random_state for all
        if name not in DONT_HAVE_RANDOM_STATE:
            set_random_state(sampler)
        X_res_pd, y_res_pd = sampler.fit_resample(X_pd, y)
        X_res, y_res = sampler.fit_resample(X, y)
        assert_allclose(X_res_pd, X_res)
        assert_allclose(y_res_pd, y_res)

def check_pipeline_consistency(name, Estimator):
    if name in ('CCA', 'LocallyLinearEmbedding', 'KernelPCA') and _is_32bit():
        # Those transformers yield non-deterministic output when executed on
        # a 32bit Python. The same transformers are stable on 64bit Python.
        # FIXME: try to isolate a minimalistic reproduction case only
        # depending on scipy and/or maybe generate a test dataset that does
        # not cause such unstable behaviors.
        msg = name + ' is non deterministic on 32bit Python'
        raise SkipTest(msg)

    # check that make_pipeline(est) gives same score as est
    X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]],
                      random_state=0, n_features=2, cluster_std=0.1)
    X -= X.min()
    y = multioutput_estimator_convert_y_2d(name, y)
    estimator = Estimator()
    set_fast_parameters(estimator)
    set_random_state(estimator)
    pipeline = make_pipeline(estimator)
    estimator.fit(X, y)
    pipeline.fit(X, y)
    funcs = ["score", "fit_transform"]
    for func_name in funcs:
        func = getattr(estimator, func_name, None)
        if func is not None:
            func_pipeline = getattr(pipeline, func_name)
            result = func(X, y)
            result_pipe = func_pipeline(X, y)
            assert_array_almost_equal(result, result_pipe)

def check_class_weight_balanced_linear_classifier(name, Classifier):
    """Test class weights with non-contiguous class labels."""
    X = np.array([[-1.0, -1.0], [-1.0, 0], [-.8, -1.0],
                  [1.0, 1.0], [1.0, 0.0]])
    y = np.array([1, 1, 1, -1, -1])

    with warnings.catch_warnings(record=True):
        classifier = Classifier()
    if hasattr(classifier, "n_iter"):
        # This is a very small dataset, default n_iter are likely to prevent
        # convergence
        classifier.set_params(n_iter=1000)
    set_random_state(classifier)

    # Let the model compute the class frequencies
    classifier.set_params(class_weight='balanced')
    coef_balanced = classifier.fit(X, y).coef_.copy()

    # Count each label occurrence to reweight manually
    n_samples = len(y)
    n_classes = float(len(np.unique(y)))
    class_weight = {1: n_samples / (np.sum(y == 1) * n_classes),
                    -1: n_samples / (np.sum(y == -1) * n_classes)}
    classifier.set_params(class_weight=class_weight)
    coef_manual = classifier.fit(X, y).coef_.copy()

    assert_array_almost_equal(coef_balanced, coef_manual)

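# Hedged cross-check of the manual reweighting above: the weights
# n_samples / (n_classes * count_c) are what scikit-learn's own helper
# computes (assuming sklearn.utils.class_weight.compute_class_weight with
# its keyword-only signature; older releases took positional arguments).
import numpy as np
from sklearn.utils.class_weight import compute_class_weight

_y = np.array([1, 1, 1, -1, -1])
_classes = np.array([-1, 1])
# For this y: 5 / (2 * 2) = 1.25 for class -1 and 5 / (2 * 3) = 5/6 for 1.
_weights = compute_class_weight("balanced", classes=_classes, y=_y)
assert np.allclose(_weights, [5 / 4, 5 / 6])
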
def check_estimators_pickle(name, Estimator):
    """Test that we can pickle all estimators"""
    check_methods = ["predict", "transform", "decision_function",
                     "predict_proba"]

    X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]],
                      random_state=0, n_features=2, cluster_std=0.1)

    # some estimators can't do features less than 0
    X -= X.min()

    # some estimators only take multioutputs
    y = multioutput_estimator_convert_y_2d(name, y)

    # catch deprecation warnings
    with warnings.catch_warnings(record=True):
        estimator = Estimator()

    set_random_state(estimator)
    set_fast_parameters(estimator)
    estimator.fit(X, y)

    result = dict()
    for method in check_methods:
        if hasattr(estimator, method):
            result[method] = getattr(estimator, method)(X)

    # pickle and unpickle!
    pickled_estimator = pickle.dumps(estimator)
    unpickled_estimator = pickle.loads(pickled_estimator)

    for method in result:
        unpickled_result = getattr(unpickled_estimator, method)(X)
        assert_array_almost_equal(result[method], unpickled_result)

def check_target_type(name, Estimator):
    X = np.random.random((20, 2))
    y = np.linspace(0, 1, 20)
    estimator = Estimator()
    set_random_state(estimator)
    with warns(UserWarning, match='should be of types'):
        estimator.fit(X, y)

def check_estimators_overwrite_params(name, Estimator):
    X, y = make_blobs(random_state=0, n_samples=9)
    y = multioutput_estimator_convert_y_2d(name, y)
    # some want non-negative input
    X -= X.min()

    with warnings.catch_warnings(record=True):
        # catch deprecation warnings
        estimator = Estimator()

    set_fast_parameters(estimator)
    set_random_state(estimator)

    # Make a physical copy of the original estimator parameters before
    # fitting.
    params = estimator.get_params()
    original_params = deepcopy(params)

    # Fit the model
    estimator.fit(X, y)

    # Compare the state of the model parameters with the original parameters
    new_params = estimator.get_params()
    for param_name, original_value in original_params.items():
        new_value = new_params[param_name]

        # We should never change or mutate the internal state of input
        # parameters by default. To check this we use the joblib.hash function
        # that introspects recursively any subobjects to compute a checksum.
        # The only exception to this rule of immutable constructor parameters
        # is possible RandomState instance but in this check we explicitly
        # fixed the random_state params recursively to be integer seeds.
        assert_equal(hash(new_value), hash(original_value),
                     "Estimator %s should not change or mutate "
                     " the parameter %s from %s to %s during fit."
                     % (name, param_name, original_value, new_value))

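# Why joblib.hash (referenced in the comment above) rather than `==`: it
# recursively checksums nested objects, so in-place mutation of an array
# parameter is detected even when the object identity is unchanged. A
# minimal illustration, assuming the standalone `joblib` package:
import numpy as np
from joblib import hash as joblib_hash

_param = np.arange(5)
_checksum_before = joblib_hash(_param)
_param[0] = 99  # in-place mutation, same object identity
assert joblib_hash(_param) != _checksum_before
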
def check_classifiers_classes(name, Classifier):
    X, y = make_blobs(n_samples=30, random_state=0, cluster_std=0.1)
    X, y = shuffle(X, y, random_state=7)
    X = StandardScaler().fit_transform(X)
    # We need to make sure that we have non negative data, for things
    # like NMF
    X -= X.min() - .1
    y_names = np.array(["one", "two", "three"])[y]

    for y_names in [y_names, y_names.astype('O')]:
        if name in ["LabelPropagation", "LabelSpreading"]:
            # TODO some complication with -1 label
            y_ = y
        else:
            y_ = y_names

        classes = np.unique(y_)
        # catch deprecation warnings
        with warnings.catch_warnings(record=True):
            classifier = Classifier()
        if name == 'BernoulliNB':
            classifier.set_params(binarize=X.mean())
        set_fast_parameters(classifier)
        set_random_state(classifier)
        # fit
        classifier.fit(X, y_)

        y_pred = classifier.predict(X)
        # training set performance
        assert_array_equal(np.unique(y_), np.unique(y_pred))
        if np.any(classifier.classes_ != classes):
            print("Unexpected classes_ attribute for %r: "
                  "expected %s, got %s" %
                  (classifier, classes, classifier.classes_))

def check_estimators_overwrite_params(name, Estimator):
    X, y = make_blobs(random_state=0, n_samples=9)
    y = multioutput_estimator_convert_y_2d(name, y)
    # some want non-negative input
    X -= X.min()

    with warnings.catch_warnings(record=True):
        # catch deprecation warnings
        estimator = Estimator()

    if name == 'MiniBatchDictLearning' or name == 'MiniBatchSparsePCA':
        # FIXME
        # for MiniBatchDictLearning and MiniBatchSparsePCA
        estimator.batch_size = 1

    set_fast_parameters(estimator)
    set_random_state(estimator)

    params = estimator.get_params()
    estimator.fit(X, y)
    new_params = estimator.get_params()
    for k, v in params.items():
        assert_false(np.any(new_params[k] != v),
                     "Estimator %s changes its parameter %s"
                     " from %s to %s during fit."
                     % (name, k, v, new_params[k]))

def check_regressors_int(name, Regressor):
    X, _ = _boston_subset()
    X = X[:50]
    rnd = np.random.RandomState(0)
    y = rnd.randint(3, size=X.shape[0])
    y = multioutput_estimator_convert_y_2d(name, y)
    rnd = np.random.RandomState(0)
    # catch deprecation warnings
    with warnings.catch_warnings(record=True):
        # separate estimators to control random seeds
        regressor_1 = Regressor()
        regressor_2 = Regressor()
    set_fast_parameters(regressor_1)
    set_fast_parameters(regressor_2)
    set_random_state(regressor_1)
    set_random_state(regressor_2)

    if name in CROSS_DECOMPOSITION:
        y_ = np.vstack([y, 2 * y + rnd.randint(2, size=len(y))])
        y_ = y_.T
    else:
        y_ = y

    # fit
    regressor_1.fit(X, y_)
    pred1 = regressor_1.predict(X)
    regressor_2.fit(X, y_.astype(np.float64))
    pred2 = regressor_2.predict(X)
    assert_array_almost_equal(pred1, pred2, 2, name)

def check_supervised_y_2d(name, Estimator):
    if "MultiTask" in name:
        # These only work on 2d, so this test makes no sense
        return
    rnd = np.random.RandomState(0)
    X = rnd.uniform(size=(10, 3))
    y = np.arange(10) % 3
    # catch deprecation warnings
    with warnings.catch_warnings(record=True):
        estimator = Estimator()
    set_fast_parameters(estimator)
    set_random_state(estimator)
    # fit
    estimator.fit(X, y)
    y_pred = estimator.predict(X)

    set_random_state(estimator)
    # Check that when a 2D y is given, a DataConversionWarning is
    # raised
    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter("always", DataConversionWarning)
        warnings.simplefilter("ignore", RuntimeWarning)
        estimator.fit(X, y[:, np.newaxis])
    y_pred_2d = estimator.predict(X)
    msg = "expected 1 DataConversionWarning, got: %s" % (
        ", ".join([str(w_x) for w_x in w]))
    if name not in MULTI_OUTPUT:
        # check that we warned if we don't support multi-output
        assert_greater(len(w), 0, msg)
        assert_true("DataConversionWarning('A column-vector y"
                    " was passed when a 1d array was expected" in msg)
    assert_array_almost_equal(y_pred.ravel(), y_pred_2d.ravel())

def test_transformers_pickle():
    # test if transformers do something sensible on training set
    # also test all shapes / shape errors
    transformers = all_estimators(type_filter='transformer')
    X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]],
                      random_state=0, n_features=2, cluster_std=0.1)
    n_samples, n_features = X.shape
    X = StandardScaler().fit_transform(X)
    X -= X.min()

    succeeded = True

    for name, Transformer in transformers:
        if name in dont_test:
            continue
        # catch deprecation warnings
        with warnings.catch_warnings(record=True):
            transformer = Transformer()
        if not hasattr(transformer, 'transform'):
            continue
        set_random_state(transformer)
        if hasattr(transformer, 'compute_importances'):
            transformer.compute_importances = True

        if name == "SelectKBest":
            # SelectKBest has a default of k=10
            # which is more features than we have.
            transformer.k = 1
        elif name in ['GaussianRandomProjection', 'SparseRandomProjection']:
            # Due to the jl lemma and very few samples, the number
            # of components of the random matrix projection will be greater
            # than the number of features.
            # So we impose a smaller number (avoid "auto" mode)
            transformer.n_components = 1

        # fit
        if name in ('PLSCanonical', 'PLSRegression', 'CCA', 'PLSSVD'):
            random_state = np.random.RandomState(seed=12345)
            y_ = np.vstack([y, 2 * y + random_state.randint(2, size=len(y))])
            y_ = y_.T
        else:
            y_ = y

        transformer.fit(X, y_)
        X_pred = transformer.fit(X, y_).transform(X)
        pickled_transformer = pickle.dumps(transformer)
        unpickled_transformer = pickle.loads(pickled_transformer)
        pickled_X_pred = unpickled_transformer.transform(X)

        try:
            assert_array_almost_equal(pickled_X_pred, X_pred)
        except Exception as exc:
            succeeded = False
            print("Transformer %s doesn't predict the same value "
                  "after pickling" % name)
            raise exc

    assert_true(succeeded)

def _check_transformer(name, Transformer, X, y):
    n_samples, n_features = np.asarray(X).shape
    # catch deprecation warnings
    with warnings.catch_warnings(record=True):
        transformer = Transformer()
    set_random_state(transformer)
    if name == "KernelPCA":
        transformer.remove_zero_eig = False
    set_fast_parameters(transformer)

    # fit
    if name in CROSS_DECOMPOSITION:
        y_ = np.c_[y, y]
        y_[::2, 1] *= 2
    else:
        y_ = y

    transformer.fit(X, y_)
    X_pred = transformer.fit_transform(X, y=y_)
    if isinstance(X_pred, tuple):
        for x_pred in X_pred:
            assert_equal(x_pred.shape[0], n_samples)
    else:
        assert_equal(X_pred.shape[0], n_samples)

    if hasattr(transformer, 'transform'):
        if name in CROSS_DECOMPOSITION:
            X_pred2 = transformer.transform(X, y_)
            X_pred3 = transformer.fit_transform(X, y=y_)
        else:
            X_pred2 = transformer.transform(X)
            X_pred3 = transformer.fit_transform(X, y=y_)
        if isinstance(X_pred, tuple) and isinstance(X_pred2, tuple):
            for x_pred, x_pred2, x_pred3 in zip(X_pred, X_pred2, X_pred3):
                assert_array_almost_equal(
                    x_pred, x_pred2, 2,
                    "fit_transform and transform outcomes not consistent"
                    " in %s" % Transformer)
                assert_array_almost_equal(
                    x_pred, x_pred3, 2,
                    "consecutive fit_transform outcomes not consistent"
                    " in %s" % Transformer)
        else:
            assert_array_almost_equal(
                X_pred, X_pred2, 2,
                "fit_transform and transform outcomes not consistent in %s"
                % Transformer)
            assert_array_almost_equal(
                X_pred, X_pred3, 2,
                "consecutive fit_transform outcomes not consistent in %s"
                % Transformer)

        # raises error on malformed input for transform
        if hasattr(X, 'T'):
            # If it's not an array, it does not have a 'T' property
            assert_raises(ValueError, transformer.transform, X.T)

def _check_transformer(name, Transformer, X, y):
    if name in ("CCA", "LocallyLinearEmbedding", "KernelPCA") and _is_32bit():
        # Those transformers yield non-deterministic output when executed on
        # a 32bit Python. The same transformers are stable on 64bit Python.
        # FIXME: try to isolate a minimalistic reproduction case only
        # depending on numpy & scipy and/or maybe generate a test dataset
        # that does not cause such unstable behaviors.
        msg = name + " is non deterministic on 32bit Python"
        raise SkipTest(msg)
    n_samples, n_features = np.asarray(X).shape
    # catch deprecation warnings
    with warnings.catch_warnings(record=True):
        transformer = Transformer()
    set_random_state(transformer)
    set_fast_parameters(transformer)

    # fit
    if name in CROSS_DECOMPOSITION:
        y_ = np.c_[y, y]
        y_[::2, 1] *= 2
    else:
        y_ = y

    transformer.fit(X, y_)
    X_pred = transformer.fit_transform(X, y=y_)
    if isinstance(X_pred, tuple):
        for x_pred in X_pred:
            assert_equal(x_pred.shape[0], n_samples)
    else:
        assert_equal(X_pred.shape[0], n_samples)

    if hasattr(transformer, "transform"):
        if name in CROSS_DECOMPOSITION:
            X_pred2 = transformer.transform(X, y_)
            X_pred3 = transformer.fit_transform(X, y=y_)
        else:
            X_pred2 = transformer.transform(X)
            X_pred3 = transformer.fit_transform(X, y=y_)
        if isinstance(X_pred, tuple) and isinstance(X_pred2, tuple):
            for x_pred, x_pred2, x_pred3 in zip(X_pred, X_pred2, X_pred3):
                assert_array_almost_equal(
                    x_pred, x_pred2, 2,
                    "fit_transform and transform outcomes not consistent"
                    " in %s" % Transformer)
                assert_array_almost_equal(
                    x_pred, x_pred3, 2,
                    "consecutive fit_transform outcomes not consistent"
                    " in %s" % Transformer)
        else:
            assert_array_almost_equal(
                X_pred, X_pred2, 2,
                "fit_transform and transform outcomes not consistent in %s"
                % Transformer)
            assert_array_almost_equal(
                X_pred, X_pred3, 2,
                "consecutive fit_transform outcomes not consistent in %s"
                % Transformer)

        # raises error on malformed input for transform
        if hasattr(X, "T"):
            # If it's not an array, it does not have a 'T' property
            assert_raises(ValueError, transformer.transform, X.T)

def check_transformer(name, Transformer, X, y):
    n_samples, n_features = X.shape
    # catch deprecation warnings
    with warnings.catch_warnings(record=True):
        transformer = Transformer()
    set_random_state(transformer)
    if hasattr(transformer, "compute_importances"):
        transformer.compute_importances = True

    if name == "SelectKBest":
        # SelectKBest has a default of k=10
        # which is more features than we have.
        transformer.k = 1
    elif name in ["GaussianRandomProjection", "SparseRandomProjection"]:
        # Due to the jl lemma and very few samples, the number
        # of components of the random matrix projection will be greater
        # than the number of features.
        # So we impose a smaller number (avoid "auto" mode)
        transformer.n_components = 1
    elif name == "MiniBatchDictionaryLearning":
        transformer.set_params(n_iter=5)  # default = 1000
    elif name == "KernelPCA":
        transformer.remove_zero_eig = False

    # fit
    if name in ("PLSCanonical", "PLSRegression", "CCA", "PLSSVD"):
        y_ = np.c_[y, y]
        y_[::2, 1] *= 2
    else:
        y_ = y

    transformer.fit(X, y_)
    X_pred = transformer.fit_transform(X, y=y_)
    if isinstance(X_pred, tuple):
        for x_pred in X_pred:
            assert_equal(x_pred.shape[0], n_samples)
    else:
        assert_equal(X_pred.shape[0], n_samples)

    if hasattr(transformer, "transform"):
        if name in ("PLSCanonical", "PLSRegression", "CCA", "PLSSVD"):
            X_pred2 = transformer.transform(X, y_)
            X_pred3 = transformer.fit_transform(X, y=y_)
        else:
            X_pred2 = transformer.transform(X)
            X_pred3 = transformer.fit_transform(X, y=y_)
        if isinstance(X_pred, tuple) and isinstance(X_pred2, tuple):
            for x_pred, x_pred2, x_pred3 in zip(X_pred, X_pred2, X_pred3):
                assert_array_almost_equal(
                    x_pred, x_pred2, 2,
                    "fit_transform not correct in %s" % Transformer)
                assert_array_almost_equal(
                    x_pred3, x_pred2, 2,
                    "fit_transform not correct in %s" % Transformer)
        else:
            assert_array_almost_equal(
                X_pred, X_pred2, 2,
                "fit_transform not correct in %s" % Transformer)
            assert_array_almost_equal(
                X_pred3, X_pred2, 2,
                "fit_transform not correct in %s" % Transformer)

        # raises error on malformed input for transform
        assert_raises(ValueError, transformer.transform, X.T)

def _sample(self):
    random_state = check_random_state(self.random_state)
    if self.sampler is None:
        self.sampler_ = RandomUnderSampler(return_indices=True,
                                           random_state=random_state)
    else:
        if not hasattr(self.sampler, 'return_indices'):
            raise ValueError("'sampler' needs to return the indices of "
                             "the samples selected. Provide a sampler "
                             "which has an attribute 'return_indices'.")
        self.sampler_ = clone(self.sampler)
        self.sampler_.set_params(return_indices=True)
        set_random_state(self.sampler_, random_state)

    _, _, self.indices_ = self.sampler_.fit_sample(self.X, self.y)
    # shuffle the indices since the sampler packs them by class
    random_state.shuffle(self.indices_)

def check_classifiers_input_shapes(name, Classifier, X, y):
    # catch deprecation warnings
    with warnings.catch_warnings(record=True):
        classifier = Classifier()
    set_random_state(classifier)
    # fit
    classifier.fit(X, y)
    y_pred = classifier.predict(X)

    set_random_state(classifier)
    # Check that when a 2D y is given, a DataConversionWarning is
    # raised
    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter("always", DataConversionWarning)
        classifier.fit(X, y[:, np.newaxis])
    assert_equal(len(w), 1)
    assert_array_equal(y_pred, classifier.predict(X))

def check_non_transformer_estimators_n_iter(name, estimator,
                                            multi_output=False):
    # Check that all iterative solvers run for more than one iteration
    iris = load_iris()
    X, y_ = iris.data, iris.target

    if multi_output:
        y_ = y_[:, np.newaxis]

    set_random_state(estimator, 0)
    if name == 'AffinityPropagation':
        estimator.fit(X)
    else:
        estimator.fit(X, y_)
    assert_greater(estimator.n_iter_, 0)

def _sample(self):
    random_state = check_random_state(self.random_state)
    if self.sampler is None:
        self.sampler_ = RandomUnderSampler(random_state=random_state)
    else:
        self.sampler_ = clone(self.sampler)
        # FIXME: Remove in 0.6
        if self.sampler_.__class__.__name__ not in DONT_HAVE_RANDOM_STATE:
            set_random_state(self.sampler_, random_state)

    self.sampler_.fit_resample(self.X, self.y)
    if not hasattr(self.sampler_, 'sample_indices_'):
        raise ValueError("'sampler' needs to have an attribute "
                         "'sample_indices_'.")
    self.indices_ = self.sampler_.sample_indices_
    # shuffle the indices since the sampler packs them by class
    random_state.shuffle(self.indices_)

def test_get_metric_equivalent_to_explicit_mahalanobis(estimator,
                                                       build_dataset):
    """Tests that using the get_metric method of mahalanobis metric learners
    is equivalent to explicitly calling scipy's mahalanobis metric
    """
    rng = np.random.RandomState(42)
    input_data, labels, _, X = build_dataset()
    model = clone(estimator)
    set_random_state(model)
    model.fit(input_data, labels)
    metric = model.get_metric()
    n_features = X.shape[1]
    a, b = (rng.randn(n_features), rng.randn(n_features))
    expected_dist = mahalanobis(a[None], b[None],
                                VI=model.get_mahalanobis_matrix())
    assert_allclose(metric(a, b), expected_dist, rtol=1e-15)

def test_score_pairs_dim(estimator, build_dataset):
    # scoring of 3D arrays should return 1D array (several tuples),
    # and scoring of 2D arrays (one tuple) should return an error (like
    # scikit-learn's error when scoring 1D arrays)
    input_data, labels, _, X = build_dataset()
    model = clone(estimator)
    set_random_state(model)
    model.fit(input_data, labels)
    tuples = np.array(list(product(X, X)))
    assert model.score_pairs(tuples).shape == (tuples.shape[0],)
    context = make_context(estimator)
    msg = ("3D array of formed tuples expected{}. Found 2D array "
           "instead:\ninput={}. Reshape your data and/or use a preprocessor.\n"
           .format(context, tuples[1]))
    with pytest.raises(ValueError) as raised_error:
        model.score_pairs(tuples[1])
    assert str(raised_error.value) == msg

def test_class_weight_auto_classifies():
    # test that class_weight="auto" improves f1-score
    classifiers = all_estimators(type_filter='classifier')

    with warnings.catch_warnings(record=True):
        classifiers = [c for c in classifiers
                       if 'class_weight' in c[1]().get_params().keys()]

    for n_classes, weights in zip([2, 3], [[.8, .2], [.8, .1, .1]]):
        # create unbalanced dataset
        X, y = make_classification(n_classes=n_classes, n_samples=200,
                                   n_features=10, weights=weights,
                                   random_state=0, n_informative=n_classes)
        X = StandardScaler().fit_transform(X)
        X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                            test_size=.5,
                                                            random_state=0)
        for name, Classifier in classifiers:
            if name == "NuSVC":
                # the sparse version has a parameter that doesn't do anything
                continue

            if name.startswith("RidgeClassifier"):
                # RidgeClassifier behaves unexpectedly
                # FIXME!
                continue

            if name.endswith("NB"):
                # NaiveBayes classifiers have a somewhat different interface.
                # FIXME SOON!
                continue

            with warnings.catch_warnings(record=True):
                classifier = Classifier()
            if hasattr(classifier, "n_iter"):
                classifier.set_params(n_iter=100)

            set_random_state(classifier)
            classifier.fit(X_train, y_train)
            y_pred = classifier.predict(X_test)

            classifier.set_params(class_weight='auto')
            classifier.fit(X_train, y_train)
            y_pred_auto = classifier.predict(X_test)
            assert_greater(f1_score(y_test, y_pred_auto),
                           f1_score(y_test, y_pred))

def test_check_input_pairs_learners_invalid_y(estimator, build_dataset,
                                              with_preprocessor):
    """checks that the only allowed labels for learning pairs are +1 and -1"""
    input_data, labels, _, X = build_dataset()
    wrong_labels_list = [labels + 0.5,
                         np.random.RandomState(42).randn(len(labels)),
                         np.random.RandomState(42).choice([0, 1],
                                                          size=len(labels))]
    model = clone(estimator)
    set_random_state(model)

    expected_msg = ("When training on pairs, the labels (y) should contain "
                    "only values in [-1, 1]. Found an incorrect value.")

    for wrong_labels in wrong_labels_list:
        with pytest.raises(ValueError) as raised_error:
            model.fit(input_data, wrong_labels)
        assert str(raised_error.value) == expected_msg

def test_predict_monotonous(estimator, build_dataset, with_preprocessor):
    """Test that there is a threshold distance separating points labeled as
    similar and points labeled as dissimilar
    """
    input_data, labels, preprocessor, _ = build_dataset(with_preprocessor)
    estimator = clone(estimator)
    estimator.set_params(preprocessor=preprocessor)
    set_random_state(estimator)
    pairs_train, pairs_test, y_train, y_test = train_test_split(input_data,
                                                                labels)
    estimator.fit(pairs_train, y_train)
    distances = estimator.score_pairs(pairs_test)
    predictions = estimator.predict(pairs_test)
    min_dissimilar = np.min(distances[predictions == -1])
    max_similar = np.max(distances[predictions == 1])
    assert max_similar <= min_dissimilar
    separator = np.mean([min_dissimilar, max_similar])
    assert (predictions[distances > separator] == -1).all()
    assert (predictions[distances < separator] == 1).all()

def check_estimators_dtypes(name, estimator_orig):
    rnd = np.random.RandomState(0)
    X_train_32 = 3 * rnd.uniform(size=(20, 5)).astype(np.float32)
    X_train_64 = X_train_32.astype(np.float64)
    X_train_int_64 = X_train_32.astype(np.int64)
    X_train_int_32 = X_train_32.astype(np.int32)
    y = (X_train_int_64[:, 0] < 1).astype(int)

    methods = ["predict", "transform", "decision_function", "predict_proba"]

    for X_train in [X_train_32, X_train_64, X_train_int_64, X_train_int_32]:
        estimator = clone(estimator_orig)
        set_random_state(estimator, 1)
        estimator.fit(X_train, y)

        for method in methods:
            if hasattr(estimator, method):
                getattr(estimator, method)(X_train)

def check_estimators_fit_returns_self(name, estimator_orig,
                                      readonly_memmap=False):
    """Check if self is returned when calling fit"""
    X, y = make_blobs(random_state=0, n_samples=9, n_features=4)
    y = (y > 1).astype(int)
    # some want non-negative input
    X -= X.min()
    X = pairwise_estimator_convert_X(X, estimator_orig)

    estimator = clone(estimator_orig)
    y = multioutput_estimator_convert_y_2d(estimator, y)

    if readonly_memmap:
        X, y = create_memmap_backed_data([X, y])

    set_random_state(estimator)
    assert estimator.fit(X, y) is estimator

def check_class_weight_classifiers(name, Classifier, X_train, y_train,
                                   X_test, y_test):
    n_centers = len(np.unique(y_train))

    if n_centers == 2:
        class_weight = {0: 1000, 1: 0.0001}
    else:
        class_weight = {0: 1000, 1: 0.0001, 2: 0.0001}

    with warnings.catch_warnings(record=True):
        classifier = Classifier(class_weight=class_weight)
    if hasattr(classifier, "n_iter"):
        classifier.set_params(n_iter=100)

    set_random_state(classifier)
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    assert_greater(np.mean(y_pred == 0), 0.9)

def check_fit_score_takes_y(name, Estimator):
    # check that all estimators accept an optional y
    # in fit and score so they can be used in pipelines
    rnd = np.random.RandomState(0)
    X = rnd.uniform(size=(10, 3))
    y = np.arange(10) % 3
    y = multioutput_estimator_convert_y_2d(name, y)
    estimator = Estimator()
    set_fast_parameters(estimator)
    set_random_state(estimator)
    funcs = ["fit", "score", "partial_fit", "fit_predict", "fit_transform"]

    for func_name in funcs:
        func = getattr(estimator, func_name, None)
        if func is not None:
            func(X, y)
            args = inspect.getargspec(func).args
            assert_true(args[2] in ["y", "Y"])

def test_cross_validation_manual_vs_scikit(estimator, build_dataset,
                                           with_preprocessor):
    """Tests that if we make a manual cross-validation, the result will be the
    same as scikit-learn's cross-validation (some code for generating the
    folds is taken from scikit-learn).
    """
    if any(hasattr(estimator, method) for method in ["predict", "score"]):
        input_data, labels, preprocessor, _ = build_dataset(with_preprocessor)
        estimator = clone(estimator)
        estimator.set_params(preprocessor=preprocessor)
        set_random_state(estimator)
        n_splits = 3
        kfold = KFold(shuffle=False, n_splits=n_splits)
        n_samples = input_data.shape[0]
        fold_sizes = (n_samples // n_splits) * np.ones(n_splits, dtype=int)
        fold_sizes[:n_samples % n_splits] += 1
        current = 0
        scores, predictions = [], np.zeros(input_data.shape[0])
        for fold_size in fold_sizes:
            start, stop = current, current + fold_size
            current = stop
            test_slice = slice(start, stop)
            train_mask = np.ones(input_data.shape[0], bool)
            train_mask[test_slice] = False
            y_train, y_test = labels[train_mask], labels[test_slice]
            estimator.fit(*remove_y_quadruplets(estimator,
                                                input_data[train_mask],
                                                y_train))
            if hasattr(estimator, "score"):
                scores.append(estimator.score(*remove_y_quadruplets(
                    estimator, input_data[test_slice], y_test)))
            if hasattr(estimator, "predict"):
                predictions[test_slice] = estimator.predict(
                    input_data[test_slice])
        if hasattr(estimator, "score"):
            assert all(scores == cross_val_score(
                estimator,
                *remove_y_quadruplets(estimator, input_data, labels),
                cv=kfold))
        if hasattr(estimator, "predict"):
            assert all(predictions == cross_val_predict(
                estimator,
                *remove_y_quadruplets(estimator, input_data, labels),
                cv=kfold))

def check_samplers_multiclass_ova(name, Sampler):
    # Check that a multiclass target leads to the same results as OVA encoding
    X, y = make_classification(n_samples=1000, n_classes=3, n_informative=4,
                               weights=[0.2, 0.3, 0.5], random_state=0)
    y_ova = label_binarize(y, np.unique(y))
    sampler = Sampler()
    # FIXME: in 0.6 set the random_state for all
    if name not in DONT_HAVE_RANDOM_STATE:
        set_random_state(sampler)
    X_res, y_res = sampler.fit_resample(X, y)
    X_res_ova, y_res_ova = sampler.fit_resample(X, y_ova)
    assert_allclose(X_res, X_res_ova)
    if issubclass(Sampler, BaseEnsembleSampler):
        for batch_y, batch_y_ova in zip(y_res, y_res_ova):
            assert type_of_target(batch_y_ova) == type_of_target(y_ova)
            assert_allclose(batch_y, batch_y_ova.argmax(axis=1))
    else:
        assert type_of_target(y_res_ova) == type_of_target(y_ova)
        assert_allclose(y_res, y_res_ova.argmax(axis=1))

def test_estimators_overwrite_params():
    # test whether any estimator overwrites its init parameters during fit
    for est_type in ["classifier", "regressor", "transformer"]:
        estimators = all_estimators(type_filter=est_type)
        X, y = make_blobs(random_state=0, n_samples=9)
        # some want non-negative input
        X -= X.min()
        for name, Estimator in estimators:
            if (name in dont_test
                    or name in ['CCA', '_CCA', 'PLSCanonical',
                                'PLSRegression', 'PLSSVD',
                                'GaussianProcess']):
                # FIXME!
                # in particular GaussianProcess!
                continue
            with warnings.catch_warnings(record=True):
                # catch deprecation warnings
                estimator = Estimator()

            if hasattr(estimator, 'batch_size'):
                # FIXME
                # for MiniBatchDictLearning
                estimator.batch_size = 1

            if name in ['GaussianRandomProjection',
                        'SparseRandomProjection']:
                # Due to the jl lemma and very few samples, the number
                # of components of the random matrix projection will be
                # greater than the number of features.
                # So we impose a smaller number (avoid "auto" mode)
                estimator = Estimator(n_components=1)

            set_random_state(estimator)

            params = estimator.get_params()
            estimator.fit(X, y)
            new_params = estimator.get_params()
            for k, v in params.items():
                assert_false(np.any(new_params[k] != v),
                             "Estimator %s changes its parameter %s"
                             " from %s to %s during fit."
                             % (name, k, v, new_params[k]))

def check_estimators_dtypes(name, Estimator):
    rnd = np.random.RandomState(0)
    X_train_32 = 3 * rnd.uniform(size=(20, 5)).astype(np.float32)
    X_train_64 = X_train_32.astype(np.float64)
    X_train_int_64 = X_train_32.astype(np.int64)
    X_train_int_32 = X_train_32.astype(np.int32)
    y = X_train_int_64[:, 0]
    y = multioutput_estimator_convert_y_2d(name, y)
    for X_train in [X_train_32, X_train_64, X_train_int_64, X_train_int_32]:
        with warnings.catch_warnings(record=True):
            estimator = Estimator()
        set_fast_parameters(estimator)
        set_random_state(estimator, 1)
        estimator.fit(X_train, y)

        for method in ["predict", "transform", "decision_function",
                       "predict_proba"]:
            if hasattr(estimator, method):
                getattr(estimator, method)(X_train)

def check_estimators_not_an_array(name, Estimator, X, y):
    if name in ('CCA', '_PLS', 'PLSCanonical', 'PLSRegression'):
        raise SkipTest
    # catch deprecation warnings
    with warnings.catch_warnings(record=True):
        # separate estimators to control random seeds
        regressor_1 = Estimator()
        regressor_2 = Estimator()
    set_random_state(regressor_1)
    set_random_state(regressor_2)

    y_ = NotAnArray(np.asarray(y))

    # fit
    regressor_1.fit(X, y_)
    pred1 = regressor_1.predict(X)
    regressor_2.fit(X, y)
    pred2 = regressor_2.predict(X)
    assert_array_almost_equal(pred1, pred2, 2, name)

def test_get_metric_raises_error(estimator, build_dataset):
    """Tests that the metric returned by get_metric raises errors similar to
    the distance functions in scipy.spatial.distance"""
    input_data, labels, _, X = build_dataset()
    model = clone(estimator)
    set_random_state(model)
    model.fit(input_data, labels)
    metric = model.get_metric()

    list_test_get_metric_raises = [(X[0].tolist() + [5.2], X[1]),
                                   # vectors with different dimensions
                                   (X[0:4], X[1:5]),  # 2D vectors
                                   (X[0].tolist() + [5.2],
                                    X[1].tolist() + [7.2])]
    # vectors of same dimension but incompatible with what the metric learner
    # was trained on
    for u, v in list_test_get_metric_raises:
        with pytest.raises(ValueError):
            metric(u, v)

def test_class_weight_classifiers():
    # test that class_weight works and that the semantics are consistent
    classifiers = all_estimators(type_filter='classifier')

    with warnings.catch_warnings(record=True):
        classifiers = [
            c for c in classifiers
            if 'class_weight' in c[1]().get_params().keys()
        ]

    for n_centers in [2, 3]:
        # create a very noisy dataset
        X, y = make_blobs(centers=n_centers, random_state=0, cluster_std=20)
        X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                            test_size=.5,
                                                            random_state=0)
        for name, Clf in classifiers:
            if name == "NuSVC":
                # the sparse version has a parameter that doesn't do anything
                continue

            if name.startswith("RidgeClassifier"):
                # RidgeClassifier shows unexpected behavior
                # FIXME!
                continue

            if name.endswith("NB"):
                # NaiveBayes classifiers have a somewhat different interface.
                # FIXME SOON!
                continue

            if n_centers == 2:
                class_weight = {0: 1000, 1: 0.0001}
            else:
                class_weight = {0: 1000, 1: 0.0001, 2: 0.0001}

            with warnings.catch_warnings(record=True):
                clf = Clf(class_weight=class_weight)
            if hasattr(clf, "n_iter"):
                clf.set_params(n_iter=100)

            set_random_state(clf)
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            assert_greater(np.mean(y_pred == 0), 0.9)

def test_accuracy_toy_example(estimator, build_dataset):
    """Test that the default scoring for triplets (accuracy) works on some
    toy example"""
    triplets, _, _, X = build_dataset(with_preprocessor=False)
    estimator = clone(estimator)
    set_random_state(estimator)
    estimator.fit(triplets)
    # We take the two first points and we build 4 regularly spaced points on
    # the line they define, so that it's easy to build triplets of different
    # similarities.
    X_test = X[0] + np.arange(4)[:, np.newaxis] * (X[0] - X[1]) / 4
    triplets_test = np.array(
        [[X_test[0], X_test[2], X_test[1]],
         [X_test[1], X_test[3], X_test[0]],
         [X_test[1], X_test[2], X_test[3]],
         [X_test[3], X_test[0], X_test[2]]])
    # we force the transformation to be identity so that we control what it
    # does
    estimator.components_ = np.eye(X.shape[1])
    assert estimator.score(triplets_test) == 0.25

def check_transformer_n_iter(name, estimator):
    if name in CROSS_DECOMPOSITION:
        # Check using default data
        X = [[0., 0., 1.], [1., 0., 0.], [2., 2., 2.], [2., 5., 4.]]
        y_ = [[0.1, -0.2], [0.9, 1.1], [0.1, -0.5], [0.3, -0.2]]
    else:
        X, y_ = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]],
                           random_state=0, n_features=2, cluster_std=0.1)
        X -= X.min() - 0.1

    set_random_state(estimator, 0)
    estimator.fit(X, y_)

    # These return a n_iter per component.
    if name in CROSS_DECOMPOSITION:
        for iter_ in estimator.n_iter_:
            assert_greater(iter_, 1)
    else:
        assert_greater(estimator.n_iter_, 1)

def check_samplers_sparse(name, Sampler):
    # check that sparse matrices can be passed through the sampler leading to
    # the same results as dense
    X, y = make_classification(n_samples=1000, n_classes=3, n_informative=4,
                               weights=[0.2, 0.3, 0.5], random_state=0)
    X_sparse = sparse.csr_matrix(X)
    if isinstance(Sampler(), SMOTE):
        samplers = [
            Sampler(random_state=0, kind=kind)
            for kind in ('regular', 'borderline1', 'borderline2', 'svm')
        ]
    elif isinstance(Sampler(), NearMiss):
        samplers = [Sampler(version=version) for version in (1, 2, 3)]
    elif isinstance(Sampler(), ClusterCentroids):
        # set KMeans to full since it supports sparse and dense
        samplers = [
            Sampler(random_state=0,
                    voting='soft',
                    estimator=KMeans(random_state=1, algorithm='full'))
        ]
    else:
        samplers = [Sampler()]

    for sampler in samplers:
        # FIXME: in 0.6 set the random_state for all
        if name not in DONT_HAVE_RANDOM_STATE:
            set_random_state(sampler)
        X_res_sparse, y_res_sparse = sampler.fit_resample(X_sparse, y)
        X_res, y_res = sampler.fit_resample(X, y)
        if not isinstance(sampler, BaseEnsembleSampler):
            assert sparse.issparse(X_res_sparse)
            assert_allclose(X_res_sparse.A, X_res)
            assert_allclose(y_res_sparse, y_res)
        else:
            for x_sp, x, y_sp, y in zip(X_res_sparse, X_res,
                                        y_res_sparse, y_res):
                assert sparse.issparse(x_sp)
                assert_allclose(x_sp.A, x)
                assert_allclose(y_sp, y)

def test_get_metric_is_independent_from_metric_learner(estimator,
                                                       build_dataset):
    """Tests that the get_metric method returns a function that is independent
    from the original metric learner"""
    input_data, labels, _, X = build_dataset()
    model = clone(estimator)
    set_random_state(model)

    # we fit the metric learner on it and then we compute the metric on some
    # points
    model.fit(input_data, labels)
    metric = model.get_metric()
    score = metric(X[0], X[1])

    # then we refit the estimator on another dataset
    model.fit(np.sin(input_data), labels)

    # we recompute the distance between the two points: it should be the same
    score_bis = metric(X[0], X[1])
    assert score_bis == score

def test_score_pairs_pairwise(estimator, build_dataset):
    # Computing pairwise scores should return a euclidean distance matrix.
    input_data, labels, _, X = build_dataset()
    n_samples = 20
    X = X[:n_samples]
    model = clone(estimator)
    set_random_state(model)
    model.fit(*remove_y_quadruplets(estimator, input_data, labels))

    pairwise = model.score_pairs(np.array(list(product(X, X)))) \
        .reshape(n_samples, n_samples)

    check_is_distance_matrix(pairwise)

    # a necessary condition for euclidean distance matrices: (see
    # https://en.wikipedia.org/wiki/Euclidean_distance_matrix)
    assert np.linalg.matrix_rank(pairwise ** 2) <= min(X.shape) + 2

    # assert that this distance is coherent with pdist on embeddings
    assert_array_almost_equal(squareform(pairwise), pdist(model.transform(X)))

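# Why the rank bound in the test above holds: for points x_1, ..., x_n in
# R^d, the squared-distance matrix has entries
#   D_ij = ||x_i||^2 + ||x_j||^2 - 2 * x_i . x_j,
# i.e. the sum of a rank-1 matrix (squared norms repeated along rows), its
# transpose, and a Gram matrix of rank at most d, so rank(D) <= d + 2.
# A quick standalone check of that fact:
import numpy as np

_rng = np.random.RandomState(0)
_pts = _rng.randn(20, 3)  # n=20 points in d=3
_sq_dists = ((_pts[:, None, :] - _pts[None, :, :]) ** 2).sum(-1)
assert np.linalg.matrix_rank(_sq_dists) <= 3 + 2
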
def check_multiprocessing_idempotent(Estimator):
    # Check that running an estimator on a single process is no different to
    # running it on multiple processes. We also check that we can set
    # n_jobs=-1 to make use of all CPUs. The test is not really necessary
    # though, as we rely on joblib for parallelization and can trust that it
    # works as expected.
    estimator = _construct_instance(Estimator)
    params = estimator.get_params()

    if "n_jobs" in params:
        results = dict()
        args = dict()

        # run on a single process
        estimator = _construct_instance(Estimator)
        estimator.set_params(n_jobs=1)
        set_random_state(estimator)
        args["fit"] = _make_args(estimator, "fit")
        estimator.fit(*args["fit"])

        # compute and store results
        for method in NON_STATE_CHANGING_METHODS:
            if hasattr(estimator, method):
                args[method] = _make_args(estimator, method)
                results[method] = getattr(estimator, method)(*args[method])

        # run on multiple processes, reusing the same input arguments
        estimator = _construct_instance(Estimator)
        estimator.set_params(n_jobs=-1)
        set_random_state(estimator)
        estimator.fit(*args["fit"])

        # compute and compare results
        for method in results:
            if hasattr(estimator, method):
                result = getattr(estimator, method)(*args[method])
                _assert_array_equal(
                    results[method],
                    result,
                    err_msg="Results are not equal for n_jobs=1 and "
                            "n_jobs=-1",
                )

def check_estimators_overwrite_params(name, Estimator):
    X, y = make_blobs(random_state=0, n_samples=9)
    y = multioutput_estimator_convert_y_2d(name, y)
    # some want non-negative input
    X -= X.min()

    with warnings.catch_warnings(record=True):
        # catch deprecation warnings
        estimator = Estimator()

    if name == 'MiniBatchDictLearning' or name == 'MiniBatchSparsePCA':
        # FIXME
        # for MiniBatchDictLearning and MiniBatchSparsePCA
        estimator.batch_size = 1

    set_fast_parameters(estimator)
    set_random_state(estimator)

    # Make a physical copy of the original estimator parameters before
    # fitting.
    params = estimator.get_params()
    original_params = deepcopy(params)

    # Fit the model
    estimator.fit(X, y)

    # Compare the state of the model parameters with the original parameters
    new_params = estimator.get_params()
    for param_name, original_value in original_params.items():
        new_value = new_params[param_name]

        # We should never change or mutate the internal state of input
        # parameters by default. To check this we use the joblib.hash function
        # that introspects recursively any subobjects to compute a checksum.
        # The only exception to this rule of immutable constructor parameters
        # is possible RandomState instance but in this check we explicitly
        # fixed the random_state params recursively to be integer seeds.
        assert_equal(
            hash(new_value), hash(original_value),
            "Estimator %s should not change or mutate "
            " the parameter %s from %s to %s during fit."
            % (name, param_name, original_value, new_value))

def test_array_like_inputs(estimator, build_dataset, with_preprocessor):
    """Test that metric-learners can have as input (of all functions that are
    applied on data) any array-like object."""
    input_data, labels, preprocessor, X = build_dataset(with_preprocessor)

    # we subsample the data for the test to be more efficient
    input_data, _, labels, _ = train_test_split(input_data, labels,
                                                train_size=20)
    X = X[:10]

    estimator = clone(estimator)
    estimator.set_params(preprocessor=preprocessor)
    set_random_state(estimator)
    input_variants, label_variants = generate_array_like(input_data, labels)
    for input_variant in input_variants:
        for label_variant in label_variants:
            estimator.fit(*remove_y_quadruplets(estimator, input_variant,
                                                label_variant))
        if hasattr(estimator, "predict"):
            estimator.predict(input_variant)
        if hasattr(estimator, "predict_proba"):
            # in anticipation: if some estimator gains a predict_proba, or if
            # people contribute new algorithms with one, it will be checked
            # automatically
            estimator.predict_proba(input_variant)
        if hasattr(estimator, "decision_function"):
            estimator.decision_function(input_variant)
        if hasattr(estimator, "score"):
            for label_variant in label_variants:
                estimator.score(*remove_y_quadruplets(estimator,
                                                      input_variant,
                                                      label_variant))

    X_variants, _ = generate_array_like(X)
    for X_variant in X_variants:
        estimator.transform(X_variant)

    pairs = np.array([[X[0], X[1]], [X[0], X[2]]])
    pairs_variants, _ = generate_array_like(pairs)
    for pairs_variant in pairs_variants:
        estimator.score_pairs(pairs_variant)

def test_singular_array_init_or_prior_strictpd(estimator, build_dataset, w0):
    """Tests that when using a custom array init (or prior), it returns the
    appropriate error if it is singular, for algorithms
    that need a strictly PD prior or init (see
    https://github.com/scikit-learn-contrib/metric-learn/issues/202 and
    https://github.com/scikit-learn-contrib/metric-learn/pull/195#issuecomment
    -492332451)
    """
    matrices_to_set = []
    if hasattr(estimator, 'init'):
        matrices_to_set.append('init')
    if hasattr(estimator, 'prior'):
        matrices_to_set.append('prior')

    rng = np.random.RandomState(42)
    input_data, labels, _, X = build_dataset()
    for param in matrices_to_set:
        model = clone(estimator)
        set_random_state(model)

        P = ortho_group.rvs(X.shape[1], random_state=rng)
        w = np.abs(rng.randn(X.shape[1]))
        w[0] = w0
        M = P.dot(np.diag(w)).dot(P.T)
        if hasattr(model, 'init'):
            model.set_params(init=M)
        if hasattr(model, 'prior'):
            model.set_params(prior=M)
        if not hasattr(model, 'prior') and not hasattr(model, 'init'):
            raise RuntimeError("Neither prior or init could be set in the "
                               "model.")
        msg = ("You should provide a strictly positive definite "
               "matrix as `{}`. This one is not definite. Try another"
               " {}, or an algorithm that does not "
               "require the {} to be strictly positive definite.".format(
                   *(param,) * 3))
        with pytest.raises(LinAlgError) as raised_err:
            model.fit(input_data, labels)
        assert str(raised_err.value) == msg

def check_classifiers_input_shapes(name, Classifier):
    iris = load_iris()
    X, y = iris.data, iris.target
    X, y = shuffle(X, y, random_state=1)
    X = StandardScaler().fit_transform(X)
    # catch deprecation warnings
    with warnings.catch_warnings(record=True):
        classifier = Classifier()
    set_fast_parameters(classifier)
    set_random_state(classifier)
    # fit
    classifier.fit(X, y)
    y_pred = classifier.predict(X)

    set_random_state(classifier)
    # Check that when a 2D y is given, a DataConversionWarning is
    # raised
    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter("always", DataConversionWarning)
        classifier.fit(X, y[:, np.newaxis])
    assert_equal(len(w), 1)
    assert_array_equal(y_pred, classifier.predict(X))