def check_clustering(name, Alg):
    X, y = make_blobs(n_samples=50, random_state=1)
    X, y = shuffle(X, y, random_state=7)
    X = StandardScaler().fit_transform(X)
    n_samples, n_features = X.shape
    # catch deprecation and neighbors warnings
    with warnings.catch_warnings(record=True):
        alg = Alg()
    set_fast_parameters(alg)
    if hasattr(alg, "n_clusters"):
        alg.set_params(n_clusters=3)
    set_random_state(alg)
    if name == 'AffinityPropagation':
        alg.set_params(preference=-100)
        alg.set_params(max_iter=100)

    # fit
    alg.fit(X)
    # with lists
    alg.fit(X.tolist())

    assert_equal(alg.labels_.shape, (n_samples,))
    pred = alg.labels_
    assert_greater(adjusted_rand_score(pred, y), 0.4)
    # fit another time with ``fit_predict`` and compare results
    if name == 'SpectralClustering':
        # there is no way to make Spectral clustering deterministic :(
        return
    set_random_state(alg)
    with warnings.catch_warnings(record=True):
        pred2 = alg.fit_predict(X)
    assert_array_equal(pred, pred2)

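# A minimal driver sketch (not part of the original checks): per-estimator
# checks such as ``check_clustering`` are typically consumed by a nose-style
# generator test, as in ``test_all_estimators`` below. The
# ``type_filter='cluster'`` argument to ``all_estimators`` is assumed to be
# available here.
def test_clustering():
    for name, Alg in sorted(all_estimators(type_filter='cluster')):
        yield check_clustering, name, Alg
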
def check_regressors_train(name, Regressor):
    X, y = _boston_subset()
    y = StandardScaler().fit_transform(y)  # X is already scaled
    y = multioutput_estimator_convert_y_2d(name, y)
    rnd = np.random.RandomState(0)
    # catch deprecation warnings
    with warnings.catch_warnings(record=True):
        regressor = Regressor()
    set_fast_parameters(regressor)
    if not hasattr(regressor, 'alphas') and hasattr(regressor, 'alpha'):
        # linear regressors need to set alpha, but not generalized CV ones
        regressor.alpha = 0.01
    if name == 'PassiveAggressiveRegressor':
        regressor.C = 0.01

    # raises error on malformed input for fit
    assert_raises(ValueError, regressor.fit, X, y[:-1])
    # fit
    if name in CROSS_DECOMPOSITION:
        y_ = np.vstack([y, 2 * y + rnd.randint(2, size=len(y))])
        y_ = y_.T
    else:
        y_ = y
    set_random_state(regressor)
    regressor.fit(X, y_)
    regressor.fit(X.tolist(), y_.tolist())
    regressor.predict(X)

    # TODO: find out why PLS and CCA fail. RANSAC is random
    # and furthermore assumes the presence of outliers, hence
    # skipped
    if name not in ('PLSCanonical', 'CCA', 'RANSACRegressor'):
        print(regressor)
        assert_greater(regressor.score(X, y_), 0.5)

def test_all_estimators():
    # Test that estimators are default-constructible, clonable
    # and have working repr.
    estimators = all_estimators(include_meta_estimators=True)

    # Meta sanity-check to make sure that the estimator introspection runs
    # properly
    assert_greater(len(estimators), 0)

    for name, Estimator in estimators:
        # some can just not be sensibly default constructed
        yield check_parameters_default_constructible, name, Estimator

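# A rough illustration (an assumption, not the actual
# ``check_parameters_default_constructible`` implementation) of the three
# properties the yielded check is meant to verify: default construction,
# cloning, and a working ``repr``.
def _example_default_constructible_check(name, Estimator):
    from sklearn.base import clone
    estimator = Estimator()   # default-constructible
    clone(estimator)          # clonable
    repr(estimator)           # repr does not raise
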
def check_class_weight_auto_classifiers(name, Classifier, X_train, y_train,
                                        X_test, y_test, weights):
    with warnings.catch_warnings(record=True):
        classifier = Classifier()
    if hasattr(classifier, "n_iter"):
        classifier.set_params(n_iter=100)

    set_random_state(classifier)
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)

    classifier.set_params(class_weight='auto')
    classifier.fit(X_train, y_train)
    y_pred_auto = classifier.predict(X_test)
    assert_greater(f1_score(y_test, y_pred_auto, average='weighted'),
                   f1_score(y_test, y_pred, average='weighted'))

def check_non_transformer_estimators_n_iter(name, estimator,
                                            multi_output=False):
    # Check that all iterative solvers run for more than one iteration
    iris = load_iris()
    X, y_ = iris.data, iris.target

    if multi_output:
        y_ = y_[:, np.newaxis]

    set_random_state(estimator, 0)
    if name == 'AffinityPropagation':
        estimator.fit(X)
    else:
        estimator.fit(X, y_)
    assert_greater(estimator.n_iter_, 0)

def check_transformer_n_iter(name, estimator):
    if name in CROSS_DECOMPOSITION:
        # Check using default data
        X = [[0., 0., 1.], [1., 0., 0.], [2., 2., 2.], [2., 5., 4.]]
        y_ = [[0.1, -0.2], [0.9, 1.1], [0.1, -0.5], [0.3, -0.2]]
    else:
        X, y_ = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]],
                           random_state=0, n_features=2, cluster_std=0.1)
        X -= X.min() - 0.1

    set_random_state(estimator, 0)
    estimator.fit(X, y_)

    # These estimators return an n_iter per component.
    if name in CROSS_DECOMPOSITION:
        for iter_ in estimator.n_iter_:
            assert_greater(iter_, 1)
    else:
        assert_greater(estimator.n_iter_, 1)

def check_class_weight_classifiers(name, Classifier):
    for n_centers in [2, 3]:
        # create a very noisy dataset
        X, y = make_blobs(centers=n_centers, random_state=0, cluster_std=20)
        X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                            test_size=.5,
                                                            random_state=0)
        n_centers = len(np.unique(y_train))

        if n_centers == 2:
            class_weight = {0: 1000, 1: 0.0001}
        else:
            class_weight = {0: 1000, 1: 0.0001, 2: 0.0001}

        with warnings.catch_warnings(record=True):
            classifier = Classifier(class_weight=class_weight)
        if hasattr(classifier, "n_iter"):
            classifier.set_params(n_iter=100)
        if hasattr(classifier, "min_weight_fraction_leaf"):
            classifier.set_params(min_weight_fraction_leaf=0.01)

        set_random_state(classifier)
        classifier.fit(X_train, y_train)
        y_pred = classifier.predict(X_test)
        assert_greater(np.mean(y_pred == 0), 0.89)

def test_program_init_method():
    """'full' should create longer and deeper programs than other methods"""
    params = {'function_set': [add2, sub2, mul2, div2, sqrt1, log1, abs1,
                               max2, min2],
              'arities': {1: [sqrt1, log1, abs1],
                          2: [add2, sub2, mul2, div2, max2, min2]},
              'init_depth': (2, 6),
              'n_features': 10,
              'const_range': (-1.0, 1.0),
              'metric': 'mean absolute error',
              'p_point_replace': 0.05,
              'parsimony_coefficient': 0.1}
    random_state = check_random_state(415)

    programs = []
    for i in range(20):
        programs.append(_Program(init_method='full',
                                 random_state=random_state, **params))
    full_length = np.mean([gp.length_ for gp in programs])
    full_depth = np.mean([gp.depth_ for gp in programs])

    programs = []
    for i in range(20):
        programs.append(_Program(init_method='half and half',
                                 random_state=random_state, **params))
    hnh_length = np.mean([gp.length_ for gp in programs])
    hnh_depth = np.mean([gp.depth_ for gp in programs])

    programs = []
    for i in range(20):
        programs.append(_Program(init_method='grow',
                                 random_state=random_state, **params))
    grow_length = np.mean([gp.length_ for gp in programs])
    grow_depth = np.mean([gp.depth_ for gp in programs])

    assert_greater(full_length, hnh_length)
    assert_greater(hnh_length, grow_length)
    assert_greater(full_depth, hnh_depth)
    assert_greater(hnh_depth, grow_depth)

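# A minimal usage sketch (an assumption about gplearn's public API, mirroring
# the ``_Program`` parameters exercised above): the same initialisation
# settings are exposed on the public estimator.
from gplearn.genetic import SymbolicRegressor

est = SymbolicRegressor(init_method='full',     # vs. 'grow' / 'half and half'
                        init_depth=(2, 6),
                        parsimony_coefficient=0.1,
                        random_state=415)
# est.fit(X, y) would then evolve programs initialised with the 'full' method.
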
def check_classifiers_train(name, Classifier):
    X_m, y_m = make_blobs(random_state=0)
    X_m, y_m = shuffle(X_m, y_m, random_state=7)
    X_m = StandardScaler().fit_transform(X_m)
    # generate binary problem from multi-class one
    y_b = y_m[y_m != 2]
    X_b = X_m[y_m != 2]
    for (X, y) in [(X_m, y_m), (X_b, y_b)]:
        classes = np.unique(y)
        n_classes = len(classes)
        n_samples, n_features = X.shape
        # catch deprecation warnings
        with warnings.catch_warnings(record=True):
            classifier = Classifier()
        if name in ['BernoulliNB', 'MultinomialNB']:
            X -= X.min()
        set_fast_parameters(classifier)
        set_random_state(classifier)
        # raises error on malformed input for fit
        assert_raises(ValueError, classifier.fit, X, y[:-1])

        # fit
        classifier.fit(X, y)
        # with lists
        classifier.fit(X.tolist(), y.tolist())
        assert_true(hasattr(classifier, "classes_"))
        y_pred = classifier.predict(X)
        assert_equal(y_pred.shape, (n_samples,))

        # training set performance
        if name not in ['BernoulliNB', 'MultinomialNB']:
            assert_greater(accuracy_score(y, y_pred), 0.83)

        # raises error on malformed input for predict
        assert_raises(ValueError, classifier.predict, X.T)

        if hasattr(classifier, "decision_function"):
            try:
                # decision_function agrees with predict
                decision = classifier.decision_function(X)
                if n_classes == 2:
                    assert_equal(decision.shape, (n_samples,))
                    dec_pred = (decision.ravel() > 0).astype(int)
                    assert_array_equal(dec_pred, y_pred)
                if n_classes == 3:
                    assert_equal(decision.shape, (n_samples, n_classes))
                    assert_array_equal(np.argmax(decision, axis=1), y_pred)
                # raises error on malformed input for decision_function
                assert_raises(ValueError,
                              classifier.decision_function, X.T)
            except NotImplementedError:
                pass

        if hasattr(classifier, "predict_proba"):
            # predict_proba agrees with predict
            y_prob = classifier.predict_proba(X)
            assert_equal(y_prob.shape, (n_samples, n_classes))
            assert_array_equal(np.argmax(y_prob, axis=1), y_pred)
            # check that probas for all classes sum to one
            assert_array_almost_equal(np.sum(y_prob, axis=1),
                                      np.ones(n_samples))
            # raises error on malformed input for predict_proba
            assert_raises(ValueError, classifier.predict_proba, X.T)