def test_one_hot_encoder_specified_categories():
    X = np.array([['a', 'b']], dtype=object).T

    enc = OneHotEncoder(categories=[['a', 'b', 'c']])
    exp = np.array([[1., 0., 0.],
                    [0., 1., 0.]])
    assert_array_equal(enc.fit_transform(X).toarray(), exp)
    assert enc.categories[0] == ['a', 'b', 'c']
    assert enc.categories_[0].tolist() == ['a', 'b', 'c']
    assert np.issubdtype(enc.categories_[0].dtype, np.str_)

    # unsorted passed categories raises for now
    enc = OneHotEncoder(categories=[['c', 'b', 'a']])
    msg = re.escape('Unsorted categories are not yet supported')
    assert_raises_regex(ValueError, msg, enc.fit_transform, X)

    # multiple columns
    X = np.array([['a', 'b'], [0, 2]], dtype=object).T
    enc = OneHotEncoder(categories=[['a', 'b', 'c'], [0, 1, 2]])
    exp = np.array([[1., 0., 0., 1., 0., 0.],
                    [0., 1., 0., 0., 0., 1.]])
    assert_array_equal(enc.fit_transform(X).toarray(), exp)
    assert enc.categories_[0].tolist() == ['a', 'b', 'c']
    assert np.issubdtype(enc.categories_[0].dtype, np.str_)
    assert enc.categories_[1].tolist() == [0, 1, 2]
    assert np.issubdtype(enc.categories_[1].dtype, np.integer)

    # when specifying categories manually, unknown categories should already
    # raise when fitting
    X = np.array([['a', 'b', 'c']]).T
    enc = OneHotEncoder(categories=[['a', 'b']])
    assert_raises(ValueError, enc.fit, X)
    enc = OneHotEncoder(categories=[['a', 'b']], handle_unknown='ignore')
    exp = np.array([[1., 0.], [0., 1.], [0., 0.]])
    assert_array_equal(enc.fit(X).transform(X).toarray(), exp)

def test_one_hot_encoder_unsorted_categories():
    X = np.array([['a', 'b']], dtype=object).T

    # unsorted passed categories raises for now
    enc = OneHotEncoder(categories=[['c', 'b', 'a']])
    msg = re.escape('Unsorted categories are not yet supported')
    assert_raises_regex(ValueError, msg, enc.fit_transform, X)

def test_ridgecv_store_cv_values():
    rng = np.random.RandomState(42)

    n_samples = 8
    n_features = 5
    x = rng.randn(n_samples, n_features)
    alphas = [1e-1, 1e0, 1e1]
    n_alphas = len(alphas)

    r = RidgeCV(alphas=alphas, cv=None, store_cv_values=True)

    # with len(y.shape) == 1
    y = rng.randn(n_samples)
    r.fit(x, y)
    assert r.cv_values_.shape == (n_samples, n_alphas)

    # with len(y.shape) == 2
    n_targets = 3
    y = rng.randn(n_samples, n_targets)
    r.fit(x, y)
    assert r.cv_values_.shape == (n_samples, n_targets, n_alphas)

    r = RidgeCV(cv=3, store_cv_values=True)
    assert_raises_regex(ValueError, 'cv!=None and store_cv_values',
                        r.fit, x, y)

def test_fit_predict_on_pipeline_without_fit_predict():
    # tests that a pipeline does not have fit_predict method when final
    # step of pipeline does not have fit_predict defined
    scaler = StandardScaler()
    pca = PCA(svd_solver="full")
    pipe = Pipeline([("scaler", scaler), ("pca", pca)])
    assert_raises_regex(AttributeError,
                        "'PCA' object has no attribute 'fit_predict'",
                        getattr, pipe, "fit_predict")

def check_dtype_object(name, Estimator):
    # check that estimators treat dtype object as numeric if possible
    rng = np.random.RandomState(0)
    X = rng.rand(40, 10).astype(object)
    y = (X[:, 0] * 4).astype(int)
    y = multioutput_estimator_convert_y_2d(name, y)
    with warnings.catch_warnings():
        estimator = Estimator()
    set_fast_parameters(estimator)

    estimator.fit(X, y)
    if hasattr(estimator, "predict"):
        estimator.predict(X)

    if hasattr(estimator, "transform"):
        estimator.transform(X)

    try:
        estimator.fit(X, y.astype(object))
    except Exception as e:
        if "Unknown label type" not in str(e):
            raise

    X[0, 0] = {'foo': 'bar'}
    msg = "argument must be a string or a number"
    assert_raises_regex(TypeError, msg, estimator.fit, X, y)

def test_one_hot_encoder_invalid_params():
    enc = OneHotEncoder(drop='second')
    assert_raises_regex(
        ValueError,
        "Wrong input for parameter `drop`.",
        enc.fit, [["Male"], ["Female"]])

    enc = OneHotEncoder(handle_unknown='ignore', drop='first')
    assert_raises_regex(
        ValueError,
        "`handle_unknown` must be 'error'",
        enc.fit, [["Male"], ["Female"]])

    enc = OneHotEncoder(drop='first')
    assert_raises_regex(
        ValueError,
        "The handling of integer data will change in version",
        enc.fit, [[1], [2]])

    enc = OneHotEncoder(drop='first', categories='auto')
    assert_no_warnings(enc.fit_transform, [[1], [2]])

    enc = OneHotEncoder(drop=np.asarray('b', dtype=object))
    assert_raises_regex(
        ValueError,
        "Wrong input for parameter `drop`.",
        enc.fit,
        [['abc', 2, 55], ['def', 1, 55], ['def', 3, 59]])

    enc = OneHotEncoder(drop=['ghi', 3, 59])
    assert_raises_regex(
        ValueError,
        "The following categories were supposed",
        enc.fit,
        [['abc', 2, 55], ['def', 1, 55], ['def', 3, 59]])

def test_graphviz_errors():
    # Check for errors of export_graphviz
    clf = DecisionTreeClassifier(max_depth=3, min_samples_split=2)

    # Check not-fitted decision tree error
    out = StringIO()
    assert_raises(NotFittedError, export_graphviz, clf, out)

    clf.fit(X, y)

    # Check if it errors when length of feature_names
    # mismatches with number of features
    message = ("Length of feature_names, "
               "1 does not match number of features, 2")
    assert_raise_message(ValueError, message, export_graphviz, clf, None,
                         feature_names=["a"])

    message = ("Length of feature_names, "
               "3 does not match number of features, 2")
    assert_raise_message(ValueError, message, export_graphviz, clf, None,
                         feature_names=["a", "b", "c"])

    # Check class_names error
    out = StringIO()
    assert_raises(IndexError, export_graphviz, clf, out, class_names=[])

    # Check precision error
    out = StringIO()
    assert_raises_regex(ValueError, "should be greater or equal",
                        export_graphviz, clf, out, precision=-1)
    assert_raises_regex(ValueError, "should be an integer",
                        export_graphviz, clf, out, precision="1")

def test_bad_pyfunc_metric():
    def wrong_distance(x, y):
        return "1"

    X = np.ones((5, 2))
    assert_raises_regex(TypeError,
                        "Custom distance function must accept two vectors",
                        BallTree, X, metric=wrong_distance)

def test_check_class_weight_balanced_linear_classifier():
    # check that ill-computed balanced weights raises an exception
    assert_raises_regex(AssertionError,
                        "Classifier estimator_name is not computing"
                        " class_weight=balanced properly.",
                        check_class_weight_balanced_linear_classifier,
                        'estimator_name',
                        BadBalancedWeightsClassifier)

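For context, a minimal sketch of what the BadBalancedWeightsClassifier fixture could look like; the class name comes from the test above, but this implementation is an assumption, not the actual fixture from the test suite:

# Hypothetical fixture: a linear classifier that silently ignores
# class_weight='balanced', so the balanced-weights check must fail.
from sklearn.linear_model import LogisticRegression


class BadBalancedWeightsClassifier(LogisticRegression):
    def fit(self, X, y):
        original = self.class_weight
        if original == 'balanced':
            # Deliberate bug: drop the requested balancing for this fit.
            self.class_weight = None
        try:
            super().fit(X, y)
        finally:
            self.class_weight = original
        return self
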
def test_k_means_n_init():
    rnd = np.random.RandomState(0)
    X = rnd.normal(size=(40, 2))

    # two regression tests on bad n_init argument
    # previous bug: n_init <= 0 threw non-informative TypeError (#3858)
    assert_raises_regex(ValueError, "n_init", KMeans(n_init=0).fit, X)
    assert_raises_regex(ValueError, "n_init", KMeans(n_init=-1).fit, X)

def test_check_estimators_unfitted():
    # check that a ValueError/AttributeError is raised when calling predict
    # on an unfitted estimator
    msg = "AttributeError or ValueError not raised by predict"
    assert_raises_regex(AssertionError, msg, check_estimators_unfitted,
                        "estimator", NoSparseClassifier)

    # check that CorrectNotFittedError inherits from either ValueError
    # or AttributeError
    check_estimators_unfitted("estimator", CorrectNotFittedErrorClassifier)

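A minimal sketch of the CorrectNotFittedErrorClassifier fixture assumed above: its predict raises sklearn's NotFittedError, which subclasses both ValueError and AttributeError, so check_estimators_unfitted accepts it. The implementation details are an assumption:

import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.exceptions import NotFittedError


class CorrectNotFittedErrorClassifier(BaseEstimator, ClassifierMixin):
    def fit(self, X, y):
        # Mark the estimator as fitted via a trailing-underscore attribute.
        self.coef_ = np.zeros(np.asarray(X).shape[1])
        return self

    def predict(self, X):
        if not hasattr(self, 'coef_'):
            # NotFittedError inherits from ValueError and AttributeError.
            raise NotFittedError("This estimator is not fitted yet.")
        return np.zeros(np.asarray(X).shape[0])
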
def test_gen_even_slices():
    # check that gen_even_slices contains all samples
    some_range = range(10)
    joined_range = list(chain(*[some_range[slice] for slice in
                                gen_even_slices(10, 3)]))
    assert_array_equal(some_range, joined_range)

    # check that passing a negative n_packs raises an error
    slices = gen_even_slices(10, -1)
    assert_raises_regex(ValueError, "gen_even_slices got n_packs=-1, must be"
                        " >=1", next, slices)

def test_fit_predict_on_pipeline_without_fit_predict():
    # tests that a pipeline does not have fit_predict method when final
    # step of pipeline does not have fit_predict defined
    scaler = StandardScaler()
    pca = PCA()
    pipe = Pipeline([('scaler', scaler), ('pca', pca)])
    assert_raises_regex(AttributeError,
                        "'PCA' object has no attribute 'fit_predict'",
                        getattr, pipe, 'fit_predict')

def test_precompute_invalid_argument():
    X, y, _, _ = build_dataset()
    for clf in [ElasticNetCV(precompute="invalid"),
                LassoCV(precompute="invalid")]:
        assert_raises_regex(ValueError, ".*should be.*True.*False.*auto.*"
                            "array-like.*Got 'invalid'", clf.fit, X, y)

    # Precompute = 'auto' is not supported for ElasticNet
    assert_raises_regex(ValueError, ".*should be.*True.*False.*array-like.*"
                        "Got 'auto'",
                        ElasticNet(precompute="auto").fit, X, y)

def test_check_classification_targets():
    for y_type in EXAMPLES.keys():
        if y_type in ["unknown", "continuous", 'continuous-multioutput']:
            for example in EXAMPLES[y_type]:
                msg = 'Unknown label type: '
                assert_raises_regex(ValueError, msg,
                                    check_classification_targets, example)
        else:
            for example in EXAMPLES[y_type]:
                check_classification_targets(example)

def test_regression_metrics_at_limits():
    assert_almost_equal(mean_squared_error([0.], [0.]), 0.00, 2)
    assert_almost_equal(mean_squared_log_error([0.], [0.]), 0.00, 2)
    assert_almost_equal(mean_absolute_error([0.], [0.]), 0.00, 2)
    assert_almost_equal(median_absolute_error([0.], [0.]), 0.00, 2)
    assert_almost_equal(explained_variance_score([0.], [0.]), 1.00, 2)
    assert_almost_equal(r2_score([0., 1], [0., 1]), 1.00, 2)
    assert_raises_regex(ValueError,
                        "Mean Squared Logarithmic Error cannot be "
                        "used when targets contain negative values.",
                        mean_squared_log_error, [-1.], [-1.])

def test__check_reg_targets_exception():
    invalid_multioutput = 'this_value_is_not_valid'
    expected_message = ("Allowed 'multioutput' string values are.+"
                        "You provided multioutput={!r}".format(
                            invalid_multioutput))
    assert_raises_regex(ValueError, expected_message,
                        _check_reg_targets, [1, 2, 3], [[1], [2], [3]],
                        invalid_multioutput)

def test_pipeline_with_cache_attribute():
    X = np.array([[1, 2]])
    pipe = Pipeline([('transf', Transf()), ('clf', Mult())],
                    memory=DummyMemory())
    pipe.fit(X, y=None)
    dummy = WrongDummyMemory()
    pipe = Pipeline([('transf', Transf()), ('clf', Mult())],
                    memory=dummy)
    assert_raises_regex(ValueError, "'memory' should be None, a string or"
                        " have the same interface as joblib.Memory."
                        " Got memory='{}' instead.".format(dummy),
                        pipe.fit, X)

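A sketch of the two memory fixtures this test assumes: DummyMemory exposes a joblib.Memory-like cache method, while WrongDummyMemory does not, which is what makes the second Pipeline fail validation. The exact fixture bodies are assumptions:

class DummyMemory:
    # Mimics the only part of the joblib.Memory interface Pipeline needs.
    def cache(self, func):
        return func


class WrongDummyMemory:
    # No cache method, so Pipeline rejects it as a memory argument.
    pass
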
def test_multi_target_sample_weights_api():
    X = [[1, 2, 3], [4, 5, 6]]
    y = [[3.141, 2.718], [2.718, 3.141]]
    w = [0.8, 0.6]

    rgr = MultiOutputRegressor(Lasso())
    assert_raises_regex(ValueError, "does not support sample weights",
                        rgr.fit, X, y, w)

    # no exception should be raised if the base estimator supports weights
    rgr = MultiOutputRegressor(GradientBoostingRegressor(random_state=0))
    rgr.fit(X, y, w)

def test_ovr_partial_fit_exceptions():
    ovr = OneVsRestClassifier(MultinomialNB())
    X = np.abs(np.random.randn(14, 2))
    y = [1, 1, 1, 1, 2, 3, 3, 0, 0, 2, 3, 1, 2, 3]
    ovr.partial_fit(X[:7], y[:7], np.unique(y))
    # A new class value which was not in the first call of partial_fit
    # It should raise ValueError
    y1 = [5] + y[7:-1]
    assert_raises_regex(ValueError,
                        r"Mini-batch contains \[.+\] while classes"
                        r" must be subset of \[.+\]",
                        ovr.partial_fit, X=X[7:], y=y1)

def test_randomized_lasso_error_memory():
    scaling = 0.3
    selection_threshold = 0.5
    tempdir = 5
    clf = RandomizedLasso(verbose=False, alpha=[1, 0.8], random_state=42,
                          scaling=scaling,
                          selection_threshold=selection_threshold,
                          memory=tempdir)
    assert_raises_regex(ValueError, "'memory' should either be a string or"
                        " a sklearn.utils.Memory instance",
                        clf.fit, X, y)

def test_scale_input_finiteness_validation():
    # Check that non-finite inputs raise ValueError
    X = [np.nan, 5, 6, 7, 8]
    assert_raises_regex(ValueError,
                        "Input contains NaN, infinity or a value too large",
                        scale, X)

    X = [np.inf, 5, 6, 7, 8]
    assert_raises_regex(ValueError,
                        "Input contains NaN, infinity or a value too large",
                        scale, X)

def test_ordinal_encoder_inverse():
    X = [['abc', 2, 55], ['def', 1, 55]]
    enc = OrdinalEncoder()
    X_tr = enc.fit_transform(X)
    exp = np.array(X, dtype=object)
    assert_array_equal(enc.inverse_transform(X_tr), exp)

    # incorrect shape raises
    X_tr = np.array([[0, 1, 1, 2], [1, 0, 1, 0]])
    msg = re.escape('Shape of the passed X data is not correct')
    assert_raises_regex(ValueError, msg, enc.inverse_transform, X_tr)

def test_pipeline_wrong_memory():
    # Test that an error is raised when memory is not a string or a Memory
    # instance
    iris = load_iris()
    X = iris.data
    y = iris.target
    # Define memory as an integer
    memory = 1
    cached_pipe = Pipeline([('transf', DummyTransf()), ('svc', SVC())],
                           memory=memory)
    assert_raises_regex(ValueError, "'memory' should be None, a string or"
                        " have the same interface as joblib.Memory."
                        " Got memory='1' instead.",
                        cached_pipe.fit, X, y)

def test_check_non_negative(retype):
    # `retype` is supplied by the test's parametrization (e.g. np.asarray
    # or a scipy.sparse matrix constructor).
    A = np.array([[1, 1, 0, 0],
                  [1, 1, 0, 0],
                  [0, 0, 0, 0],
                  [0, 0, 0, 0]])
    X = retype(A)
    check_non_negative(X, "")
    X = retype([[0, 0], [0, 0]])
    check_non_negative(X, "")

    A[0, 0] = -1
    X = retype(A)
    assert_raises_regex(ValueError, "Negative ", check_non_negative, X, "")

def test_pipeline_wrong_memory():
    # Test that an error is raised when memory is not a string or a Memory
    # instance
    iris = load_iris()
    X = iris.data
    y = iris.target
    # Define memory as an integer
    memory = 1
    cached_pipe = Pipeline([('transf', DummyTransf()), ('svc', SVC())],
                           memory=memory)
    assert_raises_regex(ValueError, "'memory' should either be a string or a"
                        " sklearn.externals.joblib.Memory instance, got",
                        cached_pipe.fit, X, y)

def test_check_no_fit_attributes_set_in_init():
    class NonConformantEstimator(object):
        def __init__(self):
            self.you_should_not_set_this_ = None

    msg = ("By convention, attributes ending with '_'.+"
           'should not be initialized in the constructor.+'
           "Attribute 'you_should_not_set_this_' was found.+"
           'in estimator estimator_name')
    assert_raises_regex(AssertionError, msg,
                        check_no_fit_attributes_set_in_init,
                        'estimator_name', NonConformantEstimator)

def test_check_estimator():
    # tests that the estimator actually fails on "bad" estimators.
    # not a complete test of all checks, which are very extensive.

    # check that we have a set_params and can clone
    msg = "it does not implement a 'get_params' methods"
    assert_raises_regex(TypeError, msg, check_estimator, object)
    # check that we have a fit method
    msg = "object has no attribute 'fit'"
    assert_raises_regex(AttributeError, msg, check_estimator, BaseEstimator)
    # check that fit does input validation
    msg = "TypeError not raised by fit"
    assert_raises_regex(AssertionError, msg, check_estimator,
                        BaseBadClassifier)
    # check that predict does input validation (doesn't accept dicts in input)
    msg = "Estimator doesn't check for NaN and inf in predict"
    assert_raises_regex(AssertionError, msg, check_estimator, NoCheckinPredict)

    # check for sparse matrix input handling
    msg = "Estimator type doesn't seem to fail gracefully on sparse data"
    # the check for sparse input handling prints to the stdout,
    # instead of raising an error, so as not to remove the original traceback.
    # that means we need to jump through some hoops to catch it.
    old_stdout = sys.stdout
    string_buffer = StringIO()
    sys.stdout = string_buffer
    try:
        check_estimator(NoSparseClassifier)
    except:
        pass
    finally:
        sys.stdout = old_stdout
    assert_true(msg in string_buffer.getvalue())

    # doesn't error on actual estimator
    check_estimator(AdaBoostClassifier)

def test_sparse_validate_centers():
    from sklearn.datasets import load_iris

    iris = load_iris()
    X = iris.data

    # Get a local optimum
    centers = KMeans(n_clusters=4).fit(X).cluster_centers_

    # Test that a ValueError is raised for validate_center_shape
    classifier = KMeans(n_clusters=3, init=centers, n_init=1)

    msg = (r"The shape of the initial centers \(\(4L?, 4L?\)\) "
           "does not match the number of clusters 3")
    assert_raises_regex(ValueError, msg, classifier.fit, X)

def test_novelty_errors():
    X = iris.data

    # check errors for novelty=False
    clf = neighbors.LocalOutlierFactor()
    clf.fit(X)
    # predict, decision_function and score_samples raise AttributeError
    for method in ['predict', 'decision_function', 'score_samples']:
        msg = '{} is not available when novelty=False'.format(method)
        assert_raises_regex(AttributeError, msg, getattr, clf, method)

    # check errors for novelty=True
    clf = neighbors.LocalOutlierFactor(novelty=True)
    msg = 'fit_predict is not available when novelty=True'
    assert_raises_regex(AttributeError, msg, getattr, clf, 'fit_predict')

def test_check_estimator_transformer_no_mixin():
    # check that TransformerMixin is not required for transformer tests to run
    assert_raises_regex(AttributeError, '.*fit_transform.*',
                        check_estimator, BadTransformerWithoutMixin())

def check_samplers_no_fit_error(name, Sampler):
    sampler = Sampler()
    X = np.random.random((20, 2))
    y = np.array([1] * 5 + [0] * 15)
    assert_raises_regex(NotFittedError, "instance is not fitted yet.",
                        sampler.sample, X, y)

def test_check_estimator():
    # tests that the estimator actually fails on "bad" estimators.
    # not a complete test of all checks, which are very extensive.

    # check that we have a set_params and can clone
    msg = "it does not implement a 'get_params' methods"
    assert_raises_regex(TypeError, msg, check_estimator, object)
    # check that we have a fit method
    msg = "object has no attribute 'fit'"
    assert_raises_regex(AttributeError, msg, check_estimator, BaseEstimator)
    # check that fit does input validation
    msg = "TypeError not raised"
    assert_raises_regex(AssertionError, msg, check_estimator,
                        BaseBadClassifier)
    # check that predict does input validation (doesn't accept dicts in input)
    msg = "Estimator doesn't check for NaN and inf in predict"
    assert_raises_regex(AssertionError, msg, check_estimator, NoCheckinPredict)
    # check that estimator state does not change
    # at transform/predict/predict_proba time
    msg = 'Estimator changes __dict__ during predict'
    assert_raises_regex(AssertionError, msg, check_estimator, ChangesDict)
    # check that `fit` only changes attributes that
    # are private (start with an _ or end with a _).
    msg = (r'Estimator changes public attribute\(s\) during the fit method.'
           ' Estimators are only allowed to change attributes started'
           ' or ended with _, but wrong_attribute changed')
    assert_raises_regex(AssertionError, msg, check_estimator,
                        ChangesWrongAttribute)
    # check that `fit` doesn't add any public attribute
    msg = (r'Estimator adds public attribute\(s\) during the fit method.'
           ' Estimators are only allowed to add private attributes'
           ' either started with _ or ended'
           ' with _ but wrong_attribute added')
    assert_raises_regex(AssertionError, msg, check_estimator,
                        SetsWrongAttribute)

    # check for sparse matrix input handling
    name = NoSparseClassifier.__name__
    msg = ("Estimator " + name + " doesn't seem to fail gracefully on"
           " sparse data")
    # the check for sparse input handling prints to the stdout,
    # instead of raising an error, so as not to remove the original traceback.
    # that means we need to jump through some hoops to catch it.
    old_stdout = sys.stdout
    string_buffer = StringIO()
    sys.stdout = string_buffer
    try:
        check_estimator(NoSparseClassifier)
    except:
        pass
    finally:
        sys.stdout = old_stdout
    assert msg in string_buffer.getvalue()

def test_iht_fit_sample_wrong_class_obj():
    from sklearn.cluster import KMeans
    est = KMeans()
    iht = InstanceHardnessThreshold(estimator=est, random_state=RND_SEED)
    assert_raises_regex(ValueError, "Invalid parameter `estimator`",
                        iht.fit_sample, X, Y)

def test_ncr_wrong_nn_obj():
    nn = 'rnd'
    ncr = NeighbourhoodCleaningRule(return_indices=True,
                                    random_state=RND_SEED,
                                    n_neighbors=nn)
    assert_raises_regex(ValueError, "has to be one of",
                        ncr.fit_sample, X, Y)

def test_smote_wrong_kind():
    kind = 'rnd'
    smote = SMOTE(kind=kind, random_state=RND_SEED)
    assert_raises_regex(ValueError, "Unknown kind for SMOTE",
                        smote.fit_sample, X, Y)

def test_invalid_dimension():
    assert_raises_regex(ValueError, "has to be a list or tuple",
                        space_check_dimension, "23")
    assert_raises_regex(ValueError, "Invalid dimension",
                        space_check_dimension, (23,))

def test_check_estimator():
    # tests that the estimator actually fails on "bad" estimators.
    # not a complete test of all checks, which are very extensive.

    # check that we have a set_params and can clone
    msg = "it does not implement a 'get_params' methods"
    assert_raises_regex(TypeError, msg, check_estimator, object)
    # check that we have a fit method
    msg = "object has no attribute 'fit'"
    assert_raises_regex(AttributeError, msg, check_estimator, BaseEstimator)
    # check that fit does input validation
    msg = "TypeError not raised"
    assert_raises_regex(AssertionError, msg, check_estimator,
                        BaseBadClassifier)
    # check that sample_weights in fit accepts pandas.Series type
    try:
        from pandas import Series  # noqa
        msg = ("Estimator NoSampleWeightPandasSeriesType raises error if "
               "'sample_weight' parameter is of type pandas.Series")
        assert_raises_regex(
            ValueError, msg, check_estimator, NoSampleWeightPandasSeriesType)
    except ImportError:
        pass
    # check that predict does input validation (doesn't accept dicts in input)
    msg = "Estimator doesn't check for NaN and inf in predict"
    assert_raises_regex(AssertionError, msg, check_estimator, NoCheckinPredict)
    # check that estimator state does not change
    # at transform/predict/predict_proba time
    msg = 'Estimator changes __dict__ during predict'
    assert_raises_regex(AssertionError, msg, check_estimator, ChangesDict)

    # check for sparse matrix input handling
    name = NoSparseClassifier.__name__
    msg = ("Estimator " + name
           + " doesn't seem to fail gracefully on sparse data")
    # the check for sparse input handling prints to the stdout,
    # instead of raising an error, so as not to remove the original traceback.
    # that means we need to jump through some hoops to catch it.
    old_stdout = sys.stdout
    string_buffer = StringIO()
    sys.stdout = string_buffer
    try:
        check_estimator(NoSparseClassifier)
    except:
        pass
    finally:
        sys.stdout = old_stdout
    assert_true(msg in string_buffer.getvalue())

    # doesn't error on actual estimator
    check_estimator(AdaBoostClassifier)
    check_estimator(MultiTaskElasticNet)

def test_assert_raises_msg():
    with assert_raises_regex(AssertionError, 'Hello world'):
        with assert_raises(ValueError, msg='Hello world'):
            pass

def test_multi_output_classification_partial_fit_no_first_classes_exception():
    sgd_linear_clf = SGDClassifier(loss='log', random_state=1)
    multi_target_linear = MultiOutputClassifier(sgd_linear_clf)
    assert_raises_regex(ValueError,
                        "classes must be passed on the first call "
                        "to partial_fit.",
                        multi_target_linear.partial_fit, X, y)

def test_check_array_complex_data_error():
    X = np.array([[1 + 2j, 3 + 4j, 5 + 7j], [2 + 3j, 4 + 5j, 6 + 7j]])
    assert_raises_regex(
        ValueError, "Complex data not supported", check_array, X)

    # list of lists
    X = [[1 + 2j, 3 + 4j, 5 + 7j], [2 + 3j, 4 + 5j, 6 + 7j]]
    assert_raises_regex(
        ValueError, "Complex data not supported", check_array, X)

    # tuple of tuples
    X = ((1 + 2j, 3 + 4j, 5 + 7j), (2 + 3j, 4 + 5j, 6 + 7j))
    assert_raises_regex(
        ValueError, "Complex data not supported", check_array, X)

    # list of np arrays
    X = [np.array([1 + 2j, 3 + 4j, 5 + 7j]),
         np.array([2 + 3j, 4 + 5j, 6 + 7j])]
    assert_raises_regex(
        ValueError, "Complex data not supported", check_array, X)

    # tuple of np arrays
    X = (np.array([1 + 2j, 3 + 4j, 5 + 7j]),
         np.array([2 + 3j, 4 + 5j, 6 + 7j]))
    assert_raises_regex(
        ValueError, "Complex data not supported", check_array, X)

    # dataframe
    X = MockDataFrame(
        np.array([[1 + 2j, 3 + 4j, 5 + 7j], [2 + 3j, 4 + 5j, 6 + 7j]]))
    assert_raises_regex(
        ValueError, "Complex data not supported", check_array, X)

    # sparse matrix
    X = sp.coo_matrix([[0, 1 + 2j], [0, 0]])
    assert_raises_regex(
        ValueError, "Complex data not supported", check_array, X)

def test_real_distance_out_of_range():
    real = Real(1, 10)
    assert_raises_regex(RuntimeError, "compute distance for values within",
                        real.distance, 11, 10)

def test_invalid_dimension():
    assert_raises_regex(ValueError, "has to be a list or tuple",
                        space_check_dimension, "23")
    # single value fixes dimension of space
    space_check_dimension((23,))

def check_valid_transformation(klass):
    assert klass(2, 30, transform="normalize")
    assert klass(2, 30, transform="identity")
    assert_raises_regex(ValueError, "should be 'normalize' or 'identity'",
                        klass, 2, 30, transform='not a valid transform name')

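Example call, assuming the scikit-optimize dimension classes whose constructors take (low, high, transform=...):

from skopt.space import Integer, Real

# Both Real and Integer accept the 'normalize'/'identity' transforms and
# reject anything else, so the check applies to either class.
check_valid_transformation(Real)
check_valid_transformation(Integer)
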
def test_check_estimator():
    # tests that the estimator actually fails on "bad" estimators.
    # not a complete test of all checks, which are very extensive.

    # check that we have a set_params and can clone
    msg = "it does not implement a 'get_params' methods"
    assert_raises_regex(TypeError, msg, check_estimator, object)
    assert_raises_regex(TypeError, msg, check_estimator, object())
    # check that we have a fit method
    msg = "object has no attribute 'fit'"
    assert_raises_regex(AttributeError, msg, check_estimator, BaseEstimator)
    assert_raises_regex(AttributeError, msg, check_estimator, BaseEstimator())
    # check that fit does input validation
    msg = "TypeError not raised"
    assert_raises_regex(AssertionError, msg, check_estimator,
                        BaseBadClassifier)
    assert_raises_regex(AssertionError, msg, check_estimator,
                        BaseBadClassifier())
    # check that sample_weights in fit accepts pandas.Series type
    try:
        from pandas import Series  # noqa
        msg = ("Estimator NoSampleWeightPandasSeriesType raises error if "
               "'sample_weight' parameter is of type pandas.Series")
        assert_raises_regex(ValueError, msg, check_estimator,
                            NoSampleWeightPandasSeriesType)
    except ImportError:
        pass
    # check that predict does input validation (doesn't accept dicts in input)
    msg = "Estimator doesn't check for NaN and inf in predict"
    assert_raises_regex(AssertionError, msg, check_estimator, NoCheckinPredict)
    assert_raises_regex(AssertionError, msg, check_estimator,
                        NoCheckinPredict())
    # check that estimator state does not change
    # at transform/predict/predict_proba time
    msg = 'Estimator changes __dict__ during predict'
    assert_raises_regex(AssertionError, msg, check_estimator, ChangesDict)
    # check that `fit` only changes attributes that
    # are private (start with an _ or end with a _).
    msg = ('Estimator ChangesWrongAttribute should not change or mutate '
           'the parameter wrong_attribute from 0 to 1 during fit.')
    assert_raises_regex(AssertionError, msg, check_estimator,
                        ChangesWrongAttribute)
    check_estimator(ChangesUnderscoreAttribute)
    # check that `fit` doesn't add any public attribute
    msg = (r'Estimator adds public attribute\(s\) during the fit method.'
           ' Estimators are only allowed to add private attributes'
           ' either started with _ or ended'
           ' with _ but wrong_attribute added')
    assert_raises_regex(AssertionError, msg, check_estimator,
                        SetsWrongAttribute)
    # check for sparse matrix input handling
    name = NoSparseClassifier.__name__
    msg = "Estimator %s doesn't seem to fail gracefully on sparse data" % name
    # the check for sparse input handling prints to the stdout,
    # instead of raising an error, so as not to remove the original traceback.
    # that means we need to jump through some hoops to catch it.
    old_stdout = sys.stdout
    string_buffer = StringIO()
    sys.stdout = string_buffer
    try:
        check_estimator(NoSparseClassifier)
    except:
        pass
    finally:
        sys.stdout = old_stdout
    assert_true(msg in string_buffer.getvalue())

    # doesn't error on actual estimator
    check_estimator(AdaBoostClassifier)
    check_estimator(AdaBoostClassifier())
    check_estimator(MultiTaskElasticNet)
    check_estimator(MultiTaskElasticNet())

def test_one_hot_encoder_sparse():
    # Test OneHotEncoder's fit and transform.
    X = [[3, 2, 1], [0, 1, 1]]
    enc = OneHotEncoder()
    with ignore_warnings(category=(DeprecationWarning, FutureWarning)):
        # discover max values automatically
        X_trans = enc.fit_transform(X).toarray()
        assert_equal(X_trans.shape, (2, 5))
        assert_array_equal(enc.active_features_,
                           np.where([1, 0, 0, 1, 0, 1, 1, 0, 1])[0])
        assert_array_equal(enc.feature_indices_, [0, 4, 7, 9])

    # check outcome
    assert_array_equal(X_trans,
                       [[0., 1., 0., 1., 1.],
                        [1., 0., 1., 0., 1.]])

    # max value given as 3
    # enc = assert_warns(DeprecationWarning, OneHotEncoder, n_values=4)
    enc = OneHotEncoder(n_values=4)
    with ignore_warnings(category=DeprecationWarning):
        X_trans = enc.fit_transform(X)
        assert_equal(X_trans.shape, (2, 4 * 3))
        assert_array_equal(enc.feature_indices_, [0, 4, 8, 12])

    # max value given per feature
    # enc = assert_warns(DeprecationWarning, OneHotEncoder, n_values=[3, 2, 2])
    enc = OneHotEncoder(n_values=[3, 2, 2])
    with ignore_warnings(category=DeprecationWarning):
        X = [[1, 0, 1], [0, 1, 1]]
        X_trans = enc.fit_transform(X)
        assert_equal(X_trans.shape, (2, 3 + 2 + 2))
        assert_array_equal(enc.n_values_, [3, 2, 2])
    # check that testing with larger feature works:
    X = np.array([[2, 0, 1], [0, 1, 1]])
    enc.transform(X)

    # test that an error is raised when out of bounds:
    X_too_large = [[0, 2, 1], [0, 1, 1]]
    assert_raises(ValueError, enc.transform, X_too_large)
    error_msg = r"unknown categorical feature present \[2\] during transform"
    assert_raises_regex(ValueError, error_msg, enc.transform, X_too_large)
    with ignore_warnings(category=DeprecationWarning):
        assert_raises(ValueError, OneHotEncoder(n_values=2).fit_transform, X)

    # test that error is raised when wrong number of features
    assert_raises(ValueError, enc.transform, X[:, :-1])

    # test that error is raised when wrong number of features in fit
    # with prespecified n_values
    with ignore_warnings(category=DeprecationWarning):
        assert_raises(ValueError, enc.fit, X[:, :-1])
    # test exception on wrong init param (a type instead of a value)
    with ignore_warnings(category=DeprecationWarning):
        assert_raises(TypeError, OneHotEncoder(n_values=int).fit, X)

    enc = OneHotEncoder()
    # test negative input to fit
    with ignore_warnings(category=FutureWarning):
        assert_raises(ValueError, enc.fit, [[0], [-1]])

    # test negative input to transform
    with ignore_warnings(category=FutureWarning):
        enc.fit([[0], [1]])
    assert_raises(ValueError, enc.transform, [[0], [-1]])

def test_invalid_drop_length(drop):
    # `drop` is supplied by the test's parametrization.
    enc = OneHotEncoder(drop=drop)
    assert_raises_regex(
        ValueError,
        "`drop` should have length equal to the number",
        enc.fit,
        [['abc', 2, 55], ['def', 1, 55], ['def', 3, 59]])

def test_ratio_minority_under_sampling():
    assert_raises_regex(ValueError, "'ratio'='minority' cannot be used with"
                        " under-sampler.",
                        check_ratio, 'minority', np.array([1, 2, 3]),
                        'under-sampling')

def test_pipeline_init():
    # Test the various init parameters of the pipeline.
    assert_raises(TypeError, Pipeline)
    # Check that we can't instantiate pipelines with objects without fit
    # method
    assert_raises_regex(TypeError,
                        'Last step of Pipeline should implement fit. '
                        '.*NoFit.*',
                        Pipeline, [('clf', NoFit())])
    # Smoke test with only an estimator
    clf = NoTrans()
    pipe = Pipeline([('svc', clf)])
    assert_equal(pipe.get_params(deep=True),
                 dict(svc__a=None, svc__b=None, svc=clf,
                      **pipe.get_params(deep=False)))

    # Check that params are set
    pipe.set_params(svc__a=0.1)
    assert_equal(clf.a, 0.1)
    assert_equal(clf.b, None)
    # Smoke test the repr:
    repr(pipe)

    # Test with two objects
    clf = SVC()
    filter1 = SelectKBest(f_classif)
    pipe = Pipeline([('anova', filter1), ('svc', clf)])

    # Check that we can't instantiate with non-transformers on the way
    # Note that NoTrans implements fit, but not transform
    assert_raises_regex(TypeError,
                        'All intermediate steps should be transformers'
                        '.*\\bNoTrans\\b.*',
                        Pipeline, [('t', NoTrans()), ('svc', clf)])

    # Check that params are set
    pipe.set_params(svc__C=0.1)
    assert_equal(clf.C, 0.1)
    # Smoke test the repr:
    repr(pipe)

    # Check that params are not set when naming them wrong
    assert_raises(ValueError, pipe.set_params, anova__C=0.1)

    # Test clone
    pipe2 = clone(pipe)
    assert_false(pipe.named_steps['svc'] is pipe2.named_steps['svc'])

    # Check that apart from estimators, the parameters are the same
    params = pipe.get_params(deep=True)
    params2 = pipe2.get_params(deep=True)

    for x in pipe.get_params(deep=False):
        params.pop(x)

    for x in pipe2.get_params(deep=False):
        params2.pop(x)

    # Remove estimators that were copied
    params.pop('svc')
    params.pop('anova')
    params2.pop('svc')
    params2.pop('anova')
    assert_equal(params, params2)

def test_cnn_fit_sample_with_wrong_object():
    knn = 'rnd'
    cnn = CondensedNearestNeighbour(random_state=RND_SEED, n_neighbors=knn)
    assert_raises_regex(ValueError, "has to be a int or an ",
                        cnn.fit_sample, X, Y)

def test_load_with_offsets_error():
    assert_raises_regex(ValueError, "n_features is required",
                        load_svmlight_file, datafile, offset=3, length=3)

def test_init_parameters_validation(GradientBoosting, X, y):
    # GradientBoosting, X and y are supplied by the test's parametrization.
    assert_raises_regex(
        ValueError,
        "Loss blah is not supported for",
        GradientBoosting(loss='blah').fit, X, y
    )

    for learning_rate in (-1, 0):
        assert_raises_regex(
            ValueError,
            f"learning_rate={learning_rate} must be strictly positive",
            GradientBoosting(learning_rate=learning_rate).fit, X, y
        )

    assert_raises_regex(
        ValueError,
        "max_iter=0 must not be smaller than 1",
        GradientBoosting(max_iter=0).fit, X, y
    )

    assert_raises_regex(
        ValueError,
        "max_leaf_nodes=0 should not be smaller than 1",
        GradientBoosting(max_leaf_nodes=0).fit, X, y
    )

    assert_raises_regex(
        ValueError,
        "max_depth=0 should not be smaller than 1",
        GradientBoosting(max_depth=0).fit, X, y
    )

    assert_raises_regex(
        ValueError,
        "min_samples_leaf=0 should not be smaller than 1",
        GradientBoosting(min_samples_leaf=0).fit, X, y
    )

    assert_raises_regex(
        ValueError,
        "l2_regularization=-1 must be positive",
        GradientBoosting(l2_regularization=-1).fit, X, y
    )

    for max_bins in (1, 257):
        assert_raises_regex(
            ValueError,
            f"max_bins={max_bins} should be no smaller than 2 and no larger",
            GradientBoosting(max_bins=max_bins).fit, X, y
        )

    assert_raises_regex(
        ValueError,
        "max_bins is set to 4 but the data is pre-binned with 256 bins",
        GradientBoosting(max_bins=4).fit, X.astype(np.uint8), y
    )

    assert_raises_regex(
        ValueError,
        "n_iter_no_change=-1 must be positive",
        GradientBoosting(n_iter_no_change=-1).fit, X, y
    )

    for validation_split in (-1, 0):
        assert_raises_regex(
            ValueError,
            f"validation_split={validation_split} must be strictly positive",
            GradientBoosting(validation_split=validation_split).fit, X, y
        )

    assert_raises_regex(
        ValueError,
        "tol=-1 must not be smaller than 0",
        GradientBoosting(tol=-1).fit, X, y
    )

def test_nearmiss_wrong_version():
    version = 1000
    nm = NearMiss(version=version, random_state=RND_SEED)
    assert_raises_regex(ValueError, "must be 1, 2 or 3",
                        nm.fit_sample, X, Y)

def test_raise_isinstance_error():
    var = 10.0
    assert_raises_regex(ValueError, "has to be one of",
                        raise_isinstance_error, 'var', [int], var)

def test_oss_with_wrong_object():
    knn = 'rnd'
    oss = OneSidedSelection(random_state=RND_SEED, n_neighbors=knn)
    assert_raises_regex(ValueError, "has to be a int",
                        oss.fit_sample, X, Y)

def test_enn_not_good_object():
    nn = 'rnd'
    enn = EditedNearestNeighbours(n_neighbors=nn, random_state=RND_SEED,
                                  kind_sel='mode')
    assert_raises_regex(ValueError, "has to be one of",
                        enn.fit_sample, X, Y)

def test_sample_weight_length():
    # check that an error is raised when passing sample weights
    # with an incompatible shape
    km = KMeans(n_clusters=n_clusters, random_state=42)
    assert_raises_regex(ValueError, r'len\(sample_weight\)',
                        km.fit, X, sample_weight=np.ones(2))

def test_check_estimator():
    # tests that the estimator actually fails on "bad" estimators.
    # not a complete test of all checks, which are very extensive.

    # check that we have a set_params and can clone
    msg = "it does not implement a 'get_params' methods"
    assert_raises_regex(TypeError, msg, check_estimator, object)
    assert_raises_regex(TypeError, msg, check_estimator, object())
    # check that values returned by get_params match set_params
    msg = "get_params result does not match what was passed to set_params"
    assert_raises_regex(AssertionError, msg, check_estimator,
                        ModifiesValueInsteadOfRaisingError())
    assert_warns(UserWarning, check_estimator, RaisesErrorInSetParams())
    assert_raises_regex(AssertionError, msg, check_estimator,
                        ModifiesAnotherValue())
    # check that we have a fit method
    msg = "object has no attribute 'fit'"
    assert_raises_regex(AttributeError, msg, check_estimator, BaseEstimator)
    assert_raises_regex(AttributeError, msg, check_estimator, BaseEstimator())
    # check that fit does input validation
    msg = "ValueError not raised"
    assert_raises_regex(AssertionError, msg, check_estimator,
                        BaseBadClassifier)
    assert_raises_regex(AssertionError, msg, check_estimator,
                        BaseBadClassifier())
    # check that sample_weights in fit accepts pandas.Series type
    try:
        from pandas import Series  # noqa
        msg = ("Estimator NoSampleWeightPandasSeriesType raises error if "
               "'sample_weight' parameter is of type pandas.Series")
        assert_raises_regex(
            ValueError, msg, check_estimator, NoSampleWeightPandasSeriesType)
    except ImportError:
        pass
    # check that predict does input validation (doesn't accept dicts in input)
    msg = "Estimator doesn't check for NaN and inf in predict"
    assert_raises_regex(AssertionError, msg, check_estimator, NoCheckinPredict)
    assert_raises_regex(AssertionError, msg, check_estimator,
                        NoCheckinPredict())
    # check that estimator state does not change
    # at transform/predict/predict_proba time
    msg = 'Estimator changes __dict__ during predict'
    assert_raises_regex(AssertionError, msg, check_estimator, ChangesDict)
    # check that `fit` only changes attributes that
    # are private (start with an _ or end with a _).
    msg = ('Estimator ChangesWrongAttribute should not change or mutate '
           'the parameter wrong_attribute from 0 to 1 during fit.')
    assert_raises_regex(AssertionError, msg, check_estimator,
                        ChangesWrongAttribute)
    check_estimator(ChangesUnderscoreAttribute)
    # check that `fit` doesn't add any public attribute
    msg = (r'Estimator adds public attribute\(s\) during the fit method.'
           ' Estimators are only allowed to add private attributes'
           ' either started with _ or ended'
           ' with _ but wrong_attribute added')
    assert_raises_regex(AssertionError, msg, check_estimator,
                        SetsWrongAttribute)
    # check for invariant method
    name = NotInvariantPredict.__name__
    method = 'predict'
    msg = ("{method} of {name} is not invariant when applied "
           "to a subset.").format(method=method, name=name)
    assert_raises_regex(AssertionError, msg, check_estimator,
                        NotInvariantPredict)
    # check for sparse matrix input handling
    name = NoSparseClassifier.__name__
    msg = "Estimator %s doesn't seem to fail gracefully on sparse data" % name
    # the check for sparse input handling prints to the stdout,
    # instead of raising an error, so as not to remove the original traceback.
    # that means we need to jump through some hoops to catch it.
    old_stdout = sys.stdout
    string_buffer = StringIO()
    sys.stdout = string_buffer
    try:
        check_estimator(NoSparseClassifier)
    except:
        pass
    finally:
        sys.stdout = old_stdout
    assert msg in string_buffer.getvalue()

    # Large indices test on bad estimator
    msg = ('Estimator LargeSparseNotSupportedClassifier doesn\'t seem to '
           r'support \S{3}_64 matrix, and is not failing gracefully.*')
    assert_raises_regex(AssertionError, msg, check_estimator,
                        LargeSparseNotSupportedClassifier)

    # non-regression test for estimators transforming to sparse data
    check_estimator(SparseTransformer())

    # doesn't error on actual estimator
    check_estimator(AdaBoostClassifier)
    check_estimator(AdaBoostClassifier())
    check_estimator(MultiTaskElasticNet)
    check_estimator(MultiTaskElasticNet())

def test_ada_wrong_nn_obj():
    nn = 'rnd'
    ada = ADASYN(random_state=RND_SEED, n_neighbors=nn)
    assert_raises_regex(ValueError, "has to be one of",
                        ada.fit_sample, X, Y)