def check_transformer_pickle(name, Transformer): X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]], random_state=0, n_features=2, cluster_std=0.1) n_samples, n_features = X.shape X = StandardScaler().fit_transform(X) X -= X.min() # catch deprecation warnings with warnings.catch_warnings(record=True): transformer = Transformer() if not hasattr(transformer, 'transform'): return set_random_state(transformer) set_fast_parameters(transformer) # fit if name in CROSS_DECOMPOSITION: random_state = np.random.RandomState(seed=12345) y_ = np.vstack([y, 2 * y + random_state.randint(2, size=len(y))]) y_ = y_.T else: y_ = y transformer.fit(X, y_) X_pred = transformer.fit(X, y_).transform(X) pickled_transformer = pickle.dumps(transformer) unpickled_transformer = pickle.loads(pickled_transformer) pickled_X_pred = unpickled_transformer.transform(X) assert_array_almost_equal(pickled_X_pred, X_pred)
def test_set_random_state(): lda = LDA() tree = DecisionTreeClassifier() # LDA doesn't have random state: smoke test set_random_state(lda, 3) set_random_state(tree, 3) assert_equal(tree.random_state, 3)
def check_classifiers_input_shapes(name, Classifier): iris = load_iris() X, y = iris.data, iris.target X, y = shuffle(X, y, random_state=1) X = StandardScaler().fit_transform(X) # catch deprecation warnings with warnings.catch_warnings(record=True): classifier = Classifier() set_fast_parameters(classifier) set_random_state(classifier) # fit classifier.fit(X, y) y_pred = classifier.predict(X) set_random_state(classifier) # Check that when a 2D y is given, a DataConversionWarning is # raised with warnings.catch_warnings(record=True) as w: warnings.simplefilter("always", DataConversionWarning) warnings.simplefilter("ignore", RuntimeWarning) classifier.fit(X, y[:, np.newaxis]) msg = "expected 1 DataConversionWarning, got: %s" % ( ", ".join([str(w_x) for w_x in w])) assert_equal(len(w), 1, msg) assert_array_equal(y_pred, classifier.predict(X))
def check_regressors_int(name, Regressor): X, _ = _boston_subset() X = X[:50] rnd = np.random.RandomState(0) y = rnd.randint(3, size=X.shape[0]) y = multioutput_estimator_convert_y_2d(name, y) rnd = np.random.RandomState(0) # catch deprecation warnings with warnings.catch_warnings(record=True): # separate estimators to control random seeds regressor_1 = Regressor() regressor_2 = Regressor() set_fast_parameters(regressor_1) set_fast_parameters(regressor_2) set_random_state(regressor_1) set_random_state(regressor_2) if name in CROSS_DECOMPOSITION: y_ = np.vstack([y, 2 * y + rnd.randint(2, size=len(y))]) y_ = y_.T else: y_ = y # fit regressor_1.fit(X, y_) pred1 = regressor_1.predict(X) regressor_2.fit(X, y_.astype(np.float)) pred2 = regressor_2.predict(X) assert_array_almost_equal(pred1, pred2, 2, name)
def check_regressors_train(name, Regressor): X, y = _boston_subset() y = StandardScaler().fit_transform(y) # X is already scaled y = multioutput_estimator_convert_y_2d(name, y) rnd = np.random.RandomState(0) # catch deprecation warnings with warnings.catch_warnings(record=True): regressor = Regressor() set_fast_parameters(regressor) if not hasattr(regressor, 'alphas') and hasattr(regressor, 'alpha'): # linear regressors need to set alpha, but not generalized CV ones regressor.alpha = 0.01 if name == 'PassiveAggressiveRegressor': regressor.C = 0.01 # raises error on malformed input for fit assert_raises(ValueError, regressor.fit, X, y[:-1]) # fit if name in CROSS_DECOMPOSITION: y_ = np.vstack([y, 2 * y + rnd.randint(2, size=len(y))]) y_ = y_.T else: y_ = y set_random_state(regressor) regressor.fit(X, y_) regressor.fit(X.tolist(), y_.tolist()) regressor.predict(X) # TODO: find out why PLS and CCA fail. RANSAC is random # and furthermore assumes the presence of outliers, hence # skipped if name not in ('PLSCanonical', 'CCA', 'RANSACRegressor'): print(regressor) assert_greater(regressor.score(X, y_), 0.5)
def check_pipeline_consistency(name, Estimator): if name in ('CCA', 'LocallyLinearEmbedding', 'KernelPCA') and _is_32bit(): # Those transformers yield non-deterministic output when executed on # a 32bit Python. The same transformers are stable on 64bit Python. # FIXME: try to isolate a minimalistic reproduction case only depending # scipy and/or maybe generate a test dataset that does not # cause such unstable behaviors. msg = name + ' is non deterministic on 32bit Python' raise SkipTest(msg) # check that make_pipeline(est) gives same score as est X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]], random_state=0, n_features=2, cluster_std=0.1) X -= X.min() y = multioutput_estimator_convert_y_2d(name, y) estimator = Estimator() set_fast_parameters(estimator) set_random_state(estimator) pipeline = make_pipeline(estimator) estimator.fit(X, y) pipeline.fit(X, y) funcs = ["score", "fit_transform"] for func_name in funcs: func = getattr(estimator, func_name, None) if func is not None: func_pipeline = getattr(pipeline, func_name) result = func(X, y) result_pipe = func_pipeline(X, y) assert_array_almost_equal(result, result_pipe)
def check_class_weight_auto_linear_classifier(name, Classifier): """Test class weights with non-contiguous class labels.""" X = np.array([[-1.0, -1.0], [-1.0, 0], [-.8, -1.0], [1.0, 1.0], [1.0, 0.0]]) y = [1, 1, 1, -1, -1] with warnings.catch_warnings(record=True): classifier = Classifier() if hasattr(classifier, "n_iter"): # This is a very small dataset, default n_iter are likely to prevent # convergence classifier.set_params(n_iter=1000) set_random_state(classifier) # Let the model compute the class frequencies classifier.set_params(class_weight='auto') coef_auto = classifier.fit(X, y).coef_.copy() # Count each label occurrence to reweight manually mean_weight = (1. / 3 + 1. / 2) / 2 class_weight = { 1: 1. / 3 / mean_weight, -1: 1. / 2 / mean_weight, } classifier.set_params(class_weight=class_weight) coef_manual = classifier.fit(X, y).coef_.copy() assert_array_almost_equal(coef_auto, coef_manual)
def check_clustering(name, Alg): X, y = make_blobs(n_samples=50, random_state=1) X, y = shuffle(X, y, random_state=7) X = StandardScaler().fit_transform(X) n_samples, n_features = X.shape # catch deprecation and neighbors warnings with warnings.catch_warnings(record=True): alg = Alg() set_fast_parameters(alg) if hasattr(alg, "n_clusters"): alg.set_params(n_clusters=3) set_random_state(alg) if name == 'AffinityPropagation': alg.set_params(preference=-100) alg.set_params(max_iter=100) # fit alg.fit(X) # with lists alg.fit(X.tolist()) assert_equal(alg.labels_.shape, (n_samples,)) pred = alg.labels_ assert_greater(adjusted_rand_score(pred, y), 0.4) # fit another time with ``fit_predict`` and compare results if name is 'SpectralClustering': # there is no way to make Spectral clustering deterministic :( return set_random_state(alg) with warnings.catch_warnings(record=True): pred2 = alg.fit_predict(X) assert_array_equal(pred, pred2)
def check_estimators_empty_data_messages(name, Estimator): e = Estimator() set_fast_parameters(e) set_random_state(e, 1) X_zero_samples = np.empty(0).reshape(0, 3) # The precise message can change depending on whether X or y is # validated first. Let us test the type of exception only: assert_raises(ValueError, e.fit, X_zero_samples, []) X_zero_features = np.empty(0).reshape(3, 0) # the following y should be accepted by both classifiers and regressors # and ignored by unsupervised models y = multioutput_estimator_convert_y_2d(name, np.array([1, 0, 1])) msg = "0 feature(s) (shape=(3, 0)) while a minimum of 1 is required." assert_raise_message(ValueError, msg, e.fit, X_zero_features, y)
def check_non_transformer_estimators_n_iter(name, estimator, multi_output=False): # Check if all iterative solvers, run for more than one iteratiom iris = load_iris() X, y_ = iris.data, iris.target if multi_output: y_ = y_[:, np.newaxis] set_random_state(estimator, 0) if name == 'AffinityPropagation': estimator.fit(X) else: estimator.fit(X, y_) assert_greater(estimator.n_iter_, 0)
def check_class_weight_auto_classifiers(name, Classifier, X_train, y_train, X_test, y_test, weights): with warnings.catch_warnings(record=True): classifier = Classifier() if hasattr(classifier, "n_iter"): classifier.set_params(n_iter=100) set_random_state(classifier) classifier.fit(X_train, y_train) y_pred = classifier.predict(X_test) classifier.set_params(class_weight='auto') classifier.fit(X_train, y_train) y_pred_auto = classifier.predict(X_test) assert_greater(f1_score(y_test, y_pred_auto, average='weighted'), f1_score(y_test, y_pred, average='weighted'))
def check_fit_score_takes_y(name, Estimator): # check that all estimators accept an optional y # in fit and score so they can be used in pipelines rnd = np.random.RandomState(0) X = rnd.uniform(size=(10, 3)) y = np.arange(10) % 3 y = multioutput_estimator_convert_y_2d(name, y) estimator = Estimator() set_fast_parameters(estimator) set_random_state(estimator) funcs = ["fit", "score", "partial_fit", "fit_predict", "fit_transform"] for func_name in funcs: func = getattr(estimator, func_name, None) if func is not None: func(X, y) args = inspect.getargspec(func).args assert_true(args[2] in ["y", "Y"])
def check_estimators_dtypes(name, Estimator): rnd = np.random.RandomState(0) X_train_32 = 3 * rnd.uniform(size=(20, 5)).astype(np.float32) X_train_64 = X_train_32.astype(np.float64) X_train_int_64 = X_train_32.astype(np.int64) X_train_int_32 = X_train_32.astype(np.int32) y = X_train_int_64[:, 0] y = multioutput_estimator_convert_y_2d(name, y) for X_train in [X_train_32, X_train_64, X_train_int_64, X_train_int_32]: with warnings.catch_warnings(record=True): estimator = Estimator() set_fast_parameters(estimator) set_random_state(estimator, 1) estimator.fit(X_train, y) for method in ["predict", "transform", "decision_function", "predict_proba"]: if hasattr(estimator, method): getattr(estimator, method)(X_train)
def check_transformer_n_iter(name, estimator): if name in CROSS_DECOMPOSITION: # Check using default data X = [[0., 0., 1.], [1., 0., 0.], [2., 2., 2.], [2., 5., 4.]] y_ = [[0.1, -0.2], [0.9, 1.1], [0.1, -0.5], [0.3, -0.2]] else: X, y_ = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]], random_state=0, n_features=2, cluster_std=0.1) X -= X.min() - 0.1 set_random_state(estimator, 0) estimator.fit(X, y_) # These return a n_iter per component. if name in CROSS_DECOMPOSITION: for iter_ in estimator.n_iter_: assert_greater(iter_, 1) else: assert_greater(estimator.n_iter_, 1)
def check_estimators_overwrite_params(name, Estimator): X, y = make_blobs(random_state=0, n_samples=9) y = multioutput_estimator_convert_y_2d(name, y) # some want non-negative input X -= X.min() with warnings.catch_warnings(record=True): # catch deprecation warnings estimator = Estimator() if name == 'MiniBatchDictLearning' or name == 'MiniBatchSparsePCA': # FIXME # for MiniBatchDictLearning and MiniBatchSparsePCA estimator.batch_size = 1 set_fast_parameters(estimator) set_random_state(estimator) # Make a physical copy of the orginal estimator parameters before fitting. params = estimator.get_params() original_params = deepcopy(params) # Fit the model estimator.fit(X, y) # Compare the state of the model parameters with the original parameters new_params = estimator.get_params() for param_name, original_value in original_params.items(): new_value = new_params[param_name] # We should never change or mutate the internal state of input # parameters by default. To check this we use the joblib.hash function # that introspects recursively any subobjects to compute a checksum. # The only exception to this rule of immutable constructor parameters # is possible RandomState instance but in this check we explicitly # fixed the random_state params recursively to be integer seeds. assert_equal(hash(new_value), hash(original_value), "Estimator %s should not change or mutate " " the parameter %s from %s to %s during fit." % (name, param_name, original_value, new_value))
def check_estimators_data_not_an_array(name, Estimator, X, y): if name in CROSS_DECOMPOSITION: raise SkipTest # catch deprecation warnings with warnings.catch_warnings(record=True): # separate estimators to control random seeds estimator_1 = Estimator() estimator_2 = Estimator() set_fast_parameters(estimator_1) set_fast_parameters(estimator_2) set_random_state(estimator_1) set_random_state(estimator_2) y_ = NotAnArray(np.asarray(y)) X_ = NotAnArray(np.asarray(X)) # fit estimator_1.fit(X_, y_) pred1 = estimator_1.predict(X_) estimator_2.fit(X, y) pred2 = estimator_2.predict(X) assert_array_almost_equal(pred1, pred2, 2, name)
def check_class_weight_classifiers(name, Classifier): for n_centers in [2, 3]: # create a very noisy dataset X, y = make_blobs(centers=n_centers, random_state=0, cluster_std=20) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5, random_state=0) n_centers = len(np.unique(y_train)) if n_centers == 2: class_weight = {0: 1000, 1: 0.0001} else: class_weight = {0: 1000, 1: 0.0001, 2: 0.0001} with warnings.catch_warnings(record=True): classifier = Classifier(class_weight=class_weight) if hasattr(classifier, "n_iter"): classifier.set_params(n_iter=100) if hasattr(classifier, "min_weight_fraction_leaf"): classifier.set_params(min_weight_fraction_leaf=0.01) set_random_state(classifier) classifier.fit(X_train, y_train) y_pred = classifier.predict(X_test) assert_greater(np.mean(y_pred == 0), 0.89)
def _check_transformer(name, Transformer, X, y): if name in ('CCA', 'LocallyLinearEmbedding', 'KernelPCA') and _is_32bit(): # Those transformers yield non-deterministic output when executed on # a 32bit Python. The same transformers are stable on 64bit Python. # FIXME: try to isolate a minimalistic reproduction case only depending # on numpy & scipy and/or maybe generate a test dataset that does not # cause such unstable behaviors. msg = name + ' is non deterministic on 32bit Python' raise SkipTest(msg) n_samples, n_features = np.asarray(X).shape # catch deprecation warnings with warnings.catch_warnings(record=True): transformer = Transformer() set_random_state(transformer) set_fast_parameters(transformer) # fit if name in CROSS_DECOMPOSITION: y_ = np.c_[y, y] y_[::2, 1] *= 2 else: y_ = y transformer.fit(X, y_) X_pred = transformer.fit_transform(X, y=y_) if isinstance(X_pred, tuple): for x_pred in X_pred: assert_equal(x_pred.shape[0], n_samples) else: assert_equal(X_pred.shape[0], n_samples) if hasattr(transformer, 'transform'): if name in CROSS_DECOMPOSITION: X_pred2 = transformer.transform(X, y_) X_pred3 = transformer.fit_transform(X, y=y_) else: X_pred2 = transformer.transform(X) X_pred3 = transformer.fit_transform(X, y=y_) if isinstance(X_pred, tuple) and isinstance(X_pred2, tuple): for x_pred, x_pred2, x_pred3 in zip(X_pred, X_pred2, X_pred3): assert_array_almost_equal( x_pred, x_pred2, 2, "fit_transform and transform outcomes not consistent in %s" % Transformer) assert_array_almost_equal( x_pred, x_pred3, 2, "consecutive fit_transform outcomes not consistent in %s" % Transformer) else: assert_array_almost_equal( X_pred, X_pred2, 2, "fit_transform and transform outcomes not consistent in %s" % Transformer) assert_array_almost_equal( X_pred, X_pred3, 2, "consecutive fit_transform outcomes not consistent in %s" % Transformer) # raises error on malformed input for transform if hasattr(X, 'T'): # If it's not an array, it does not have a 'T' property assert_raises(ValueError, transformer.transform, X.T)
def check_estimators_nan_inf(name, Estimator): rnd = np.random.RandomState(0) X_train_finite = rnd.uniform(size=(10, 3)) X_train_nan = rnd.uniform(size=(10, 3)) X_train_nan[0, 0] = np.nan X_train_inf = rnd.uniform(size=(10, 3)) X_train_inf[0, 0] = np.inf y = np.ones(10) y[:5] = 0 y = multioutput_estimator_convert_y_2d(name, y) error_string_fit = "Estimator doesn't check for NaN and inf in fit." error_string_predict = ("Estimator doesn't check for NaN and inf in" " predict.") error_string_transform = ("Estimator doesn't check for NaN and inf in" " transform.") for X_train in [X_train_nan, X_train_inf]: # catch deprecation warnings with warnings.catch_warnings(record=True): estimator = Estimator() set_fast_parameters(estimator) set_random_state(estimator, 1) # try to fit try: estimator.fit(X_train, y) except ValueError as e: if 'inf' not in repr(e) and 'NaN' not in repr(e): print(error_string_fit, Estimator, e) traceback.print_exc(file=sys.stdout) raise e except Exception as exc: print(error_string_fit, Estimator, exc) traceback.print_exc(file=sys.stdout) raise exc else: raise AssertionError(error_string_fit, Estimator) # actually fit estimator.fit(X_train_finite, y) # predict if hasattr(estimator, "predict"): try: estimator.predict(X_train) except ValueError as e: if 'inf' not in repr(e) and 'NaN' not in repr(e): print(error_string_predict, Estimator, e) traceback.print_exc(file=sys.stdout) raise e except Exception as exc: print(error_string_predict, Estimator, exc) traceback.print_exc(file=sys.stdout) else: raise AssertionError(error_string_predict, Estimator) # transform if hasattr(estimator, "transform"): try: estimator.transform(X_train) except ValueError as e: if 'inf' not in repr(e) and 'NaN' not in repr(e): print(error_string_transform, Estimator, e) traceback.print_exc(file=sys.stdout) raise e except Exception as exc: print(error_string_transform, Estimator, exc) traceback.print_exc(file=sys.stdout) else: raise AssertionError(error_string_transform, Estimator)
def check_classifiers_train(name, Classifier): X_m, y_m = make_blobs(random_state=0) X_m, y_m = shuffle(X_m, y_m, random_state=7) X_m = StandardScaler().fit_transform(X_m) # generate binary problem from multi-class one y_b = y_m[y_m != 2] X_b = X_m[y_m != 2] for (X, y) in [(X_m, y_m), (X_b, y_b)]: # catch deprecation warnings classes = np.unique(y) n_classes = len(classes) n_samples, n_features = X.shape with warnings.catch_warnings(record=True): classifier = Classifier() if name in ['BernoulliNB', 'MultinomialNB']: X -= X.min() set_fast_parameters(classifier) set_random_state(classifier) # raises error on malformed input for fit assert_raises(ValueError, classifier.fit, X, y[:-1]) # fit classifier.fit(X, y) # with lists classifier.fit(X.tolist(), y.tolist()) assert_true(hasattr(classifier, "classes_")) y_pred = classifier.predict(X) assert_equal(y_pred.shape, (n_samples,)) # training set performance if name not in ['BernoulliNB', 'MultinomialNB']: assert_greater(accuracy_score(y, y_pred), 0.83) # raises error on malformed input for predict assert_raises(ValueError, classifier.predict, X.T) if hasattr(classifier, "decision_function"): try: # decision_function agrees with predict decision = classifier.decision_function(X) if n_classes is 2: assert_equal(decision.shape, (n_samples,)) dec_pred = (decision.ravel() > 0).astype(np.int) assert_array_equal(dec_pred, y_pred) if n_classes is 3: assert_equal(decision.shape, (n_samples, n_classes)) assert_array_equal(np.argmax(decision, axis=1), y_pred) # raises error on malformed input assert_raises(ValueError, classifier.decision_function, X.T) # raises error on malformed input for decision_function assert_raises(ValueError, classifier.decision_function, X.T) except NotImplementedError: pass if hasattr(classifier, "predict_proba"): # predict_proba agrees with predict y_prob = classifier.predict_proba(X) assert_equal(y_prob.shape, (n_samples, n_classes)) assert_array_equal(np.argmax(y_prob, axis=1), y_pred) # check that probas for all classes sum to one assert_array_almost_equal(np.sum(y_prob, axis=1), np.ones(n_samples)) # raises error on malformed input assert_raises(ValueError, classifier.predict_proba, X.T) # raises error on malformed input for predict_proba assert_raises(ValueError, classifier.predict_proba, X.T)