def check_transformer_pickle(name, Transformer): X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]], random_state=0, n_features=2, cluster_std=0.1) n_samples, n_features = X.shape X = StandardScaler().fit_transform(X) X -= X.min() # catch deprecation warnings with warnings.catch_warnings(record=True): transformer = Transformer() if not hasattr(transformer, 'transform'): return set_random_state(transformer) set_fast_parameters(transformer) # fit if name in CROSS_DECOMPOSITION: random_state = np.random.RandomState(seed=12345) y_ = np.vstack([y, 2 * y + random_state.randint(2, size=len(y))]) y_ = y_.T else: y_ = y transformer.fit(X, y_) X_pred = transformer.fit(X, y_).transform(X) pickled_transformer = pickle.dumps(transformer) unpickled_transformer = pickle.loads(pickled_transformer) pickled_X_pred = unpickled_transformer.transform(X) assert_array_almost_equal(pickled_X_pred, X_pred)
def test_feature_union(): # basic sanity check for feature union iris = load_iris() X = iris.data X -= X.mean(axis=0) y = iris.target svd = TruncatedSVD(n_components=2, random_state=0) select = SelectKBest(k=1) fs = FeatureUnion([("svd", svd), ("select", select)]) fs.fit(X, y) X_transformed = fs.transform(X) assert_equal(X_transformed.shape, (X.shape[0], 3)) # check if it does the expected thing assert_array_almost_equal(X_transformed[:, :-1], svd.fit_transform(X)) assert_array_equal(X_transformed[:, -1], select.fit_transform(X, y).ravel()) # test if it also works for sparse input # We use a different svd object to control the random_state stream fs = FeatureUnion([("svd", svd), ("select", select)]) X_sp = sparse.csr_matrix(X) X_sp_transformed = fs.fit_transform(X_sp, y) assert_array_almost_equal(X_transformed, X_sp_transformed.toarray()) # test setting parameters fs.set_params(select__k=2) assert_equal(fs.fit_transform(X, y).shape, (X.shape[0], 4)) # test it works with transformers missing fit_transform fs = FeatureUnion([("mock", TransfT()), ("svd", svd), ("select", select)]) X_transformed = fs.fit_transform(X, y) assert_equal(X_transformed.shape, (X.shape[0], 8))
def test_RadiusNeighborsRegressor_multioutput_with_uniform_weight(): """Test radius neighbors in multi-output regression (uniform weight)""" rng = check_random_state(0) n_features = 5 n_samples = 40 n_output = 4 X = rng.rand(n_samples, n_features) y = rng.rand(n_samples, n_output) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) for algorithm, weights in product(ALGORITHMS, [None, 'uniform']): rnn = neighbors. RadiusNeighborsRegressor(weights=weights, algorithm=algorithm) rnn.fit(X_train, y_train) neigh_idx = rnn.radius_neighbors(X_test, return_distance=False) y_pred_idx = np.array([np.mean(y_train[idx], axis=0) for idx in neigh_idx]) y_pred_idx = np.array(y_pred_idx) y_pred = rnn.predict(X_test) assert_equal(y_pred_idx.shape, y_test.shape) assert_equal(y_pred.shape, y_test.shape) assert_array_almost_equal(y_pred, y_pred_idx)
def test_kneighbors_classifier_predict_proba(): """Test KNeighborsClassifier.predict_proba() method""" X = np.array([[0, 2, 0], [0, 2, 1], [2, 0, 0], [2, 2, 0], [0, 0, 2], [0, 0, 1]]) y = np.array([4, 4, 5, 5, 1, 1]) cls = neighbors.KNeighborsClassifier(n_neighbors=3, p=1) # cityblock dist cls.fit(X, y) y_prob = cls.predict_proba(X) real_prob = np.array([[0, 2. / 3, 1. / 3], [1. / 3, 2. / 3, 0], [1. / 3, 0, 2. / 3], [0, 1. / 3, 2. / 3], [2. / 3, 1. / 3, 0], [2. / 3, 1. / 3, 0]]) assert_array_equal(real_prob, y_prob) # Check that it also works with non integer labels cls.fit(X, y.astype(str)) y_prob = cls.predict_proba(X) assert_array_equal(real_prob, y_prob) # Check that it works with weights='distance' cls = neighbors.KNeighborsClassifier( n_neighbors=2, p=1, weights='distance') cls.fit(X, y) y_prob = cls.predict_proba(np.array([[0, 2, 0], [2, 2, 2]])) real_prob = np.array([[0, 1, 0], [0, 0.4, 0.6]]) assert_array_almost_equal(real_prob, y_prob)
def test_feature_union_weights(): # test feature union with transformer weights iris = load_iris() X = iris.data y = iris.target pca = RandomizedPCA(n_components=2, random_state=0) select = SelectKBest(k=1) # test using fit followed by transform fs = FeatureUnion([("pca", pca), ("select", select)], transformer_weights={"pca": 10}) fs.fit(X, y) X_transformed = fs.transform(X) # test using fit_transform fs = FeatureUnion([("pca", pca), ("select", select)], transformer_weights={"pca": 10}) X_fit_transformed = fs.fit_transform(X, y) # test it works with transformers missing fit_transform fs = FeatureUnion([("mock", TransfT()), ("pca", pca), ("select", select)], transformer_weights={"mock": 10}) X_fit_transformed_wo_method = fs.fit_transform(X, y) # check against expected result # We use a different pca object to control the random_state stream assert_array_almost_equal(X_transformed[:, :-1], 10 * pca.fit_transform(X)) assert_array_equal(X_transformed[:, -1], select.fit_transform(X, y).ravel()) assert_array_almost_equal(X_fit_transformed[:, :-1], 10 * pca.fit_transform(X)) assert_array_equal(X_fit_transformed[:, -1], select.fit_transform(X, y).ravel()) assert_equal(X_fit_transformed_wo_method.shape, (X.shape[0], 7))
def test_RadiusNeighborsClassifier_multioutput(): """Test k-NN classifier on multioutput data""" rng = check_random_state(0) n_features = 2 n_samples = 40 n_output = 3 X = rng.rand(n_samples, n_features) y = rng.randint(0, 3, (n_samples, n_output)) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) weights = [None, 'uniform', 'distance', _weight_func] for algorithm, weights in product(ALGORITHMS, weights): # Stack single output prediction y_pred_so = [] for o in range(n_output): rnn = neighbors.RadiusNeighborsClassifier(weights=weights, algorithm=algorithm) rnn.fit(X_train, y_train[:, o]) y_pred_so.append(rnn.predict(X_test)) y_pred_so = np.vstack(y_pred_so).T assert_equal(y_pred_so.shape, y_test.shape) # Multioutput prediction rnn_mo = neighbors.RadiusNeighborsClassifier(weights=weights, algorithm=algorithm) rnn_mo.fit(X_train, y_train) y_pred_mo = rnn_mo.predict(X_test) assert_equal(y_pred_mo.shape, y_test.shape) assert_array_almost_equal(y_pred_mo, y_pred_so)
def test_sparse_design_with_sample_weights(): # Sample weights must work with sparse matrices n_sampless = [2, 3] n_featuress = [3, 2] rng = np.random.RandomState(42) sparse_matrix_converters = [sp.coo_matrix, sp.csr_matrix, sp.csc_matrix, sp.lil_matrix, sp.dok_matrix ] sparse_ridge = Ridge(alpha=1., fit_intercept=False) dense_ridge = Ridge(alpha=1., fit_intercept=False) for n_samples, n_features in zip(n_sampless, n_featuress): X = rng.randn(n_samples, n_features) y = rng.randn(n_samples) sample_weights = rng.randn(n_samples) ** 2 + 1 for sparse_converter in sparse_matrix_converters: X_sparse = sparse_converter(X) sparse_ridge.fit(X_sparse, y, sample_weight=sample_weights) dense_ridge.fit(X, y, sample_weight=sample_weights) assert_array_almost_equal(sparse_ridge.coef_, dense_ridge.coef_, decimal=6)
def test_primal_dual_relationship(): y = y_diabetes.reshape(-1, 1) coef = _solve_cholesky(X_diabetes, y, alpha=[1e-2]) K = np.dot(X_diabetes, X_diabetes.T) dual_coef = _solve_cholesky_kernel(K, y, alpha=[1e-2]) coef2 = np.dot(X_diabetes.T, dual_coef).T assert_array_almost_equal(coef, coef2)
def test_labels_assignment_and_inertia(): # pure numpy implementation as easily auditable reference gold # implementation rng = np.random.RandomState(42) noisy_centers = centers + rng.normal(size=centers.shape) labels_gold = - np.ones(n_samples, dtype=np.int) mindist = np.empty(n_samples) mindist.fill(np.infty) for center_id in range(n_clusters): dist = np.sum((X - noisy_centers[center_id]) ** 2, axis=1) labels_gold[dist < mindist] = center_id mindist = np.minimum(dist, mindist) inertia_gold = mindist.sum() assert_true((mindist >= 0.0).all()) assert_true((labels_gold != -1).all()) # perform label assignment using the dense array input x_squared_norms = (X ** 2).sum(axis=1) labels_array, inertia_array = _labels_inertia( X, x_squared_norms, noisy_centers) assert_array_almost_equal(inertia_array, inertia_gold) assert_array_equal(labels_array, labels_gold) # perform label assignment using the sparse CSR input x_squared_norms_from_csr = row_norms(X_csr, squared=True) labels_csr, inertia_csr = _labels_inertia( X_csr, x_squared_norms_from_csr, noisy_centers) assert_array_almost_equal(inertia_csr, inertia_gold) assert_array_equal(labels_csr, labels_gold)
def test_class_weights(): # Test class weights. X = np.array([[-1.0, -1.0], [-1.0, 0], [-.8, -1.0], [1.0, 1.0], [1.0, 0.0]]) y = [1, 1, 1, -1, -1] clf = RidgeClassifier(class_weight=None) clf.fit(X, y) assert_array_equal(clf.predict([[0.2, -1.0]]), np.array([1])) # we give a small weights to class 1 clf = RidgeClassifier(class_weight={1: 0.001}) clf.fit(X, y) # now the hyperplane should rotate clock-wise and # the prediction on this point should shift assert_array_equal(clf.predict([[0.2, -1.0]]), np.array([-1])) # check if class_weight = 'balanced' can handle negative labels. clf = RidgeClassifier(class_weight='balanced') clf.fit(X, y) assert_array_equal(clf.predict([[0.2, -1.0]]), np.array([1])) # class_weight = 'balanced', and class_weight = None should return # same values when y has equal number of all labels X = np.array([[-1.0, -1.0], [-1.0, 0], [-.8, -1.0], [1.0, 1.0]]) y = [1, 1, -1, -1] clf = RidgeClassifier(class_weight=None) clf.fit(X, y) clfa = RidgeClassifier(class_weight='balanced') clfa.fit(X, y) assert_equal(len(clfa.classes_), 2) assert_array_almost_equal(clf.coef_, clfa.coef_) assert_array_almost_equal(clf.intercept_, clfa.intercept_)
def test_cross_val_predict_with_method(): iris = load_iris() X, y = iris.data, iris.target X, y = shuffle(X, y, random_state=0) classes = len(set(y)) kfold = KFold(len(iris.target)) methods = ['decision_function', 'predict_proba', 'predict_log_proba'] for method in methods: est = LogisticRegression() predictions = cross_val_predict(est, X, y, method=method) assert_equal(len(predictions), len(y)) expected_predictions = np.zeros([len(y), classes]) func = getattr(est, method) # Naive loop (should be same as cross_val_predict): for train, test in kfold.split(X, y): est.fit(X[train], y[train]) expected_predictions[test] = func(X[test]) predictions = cross_val_predict(est, X, y, method=method, cv=kfold) assert_array_almost_equal(expected_predictions, predictions)
def test_enet_l1_ratio(): # Test that an error message is raised if an estimator that # uses _alpha_grid is called with l1_ratio=0 msg = ("Automatic alpha grid generation is not supported for l1_ratio=0. " "Please supply a grid by providing your estimator with the " "appropriate `alphas=` argument.") X = np.array([[1, 2, 4, 5, 8], [3, 5, 7, 7, 8]]).T y = np.array([12, 10, 11, 21, 5]) assert_raise_message(ValueError, msg, ElasticNetCV( l1_ratio=0, random_state=42).fit, X, y) assert_raise_message(ValueError, msg, MultiTaskElasticNetCV( l1_ratio=0, random_state=42).fit, X, y[:, None]) # Test that l1_ratio=0 is allowed if we supply a grid manually alphas = [0.1, 10] estkwds = {'alphas': alphas, 'random_state': 42} est_desired = ElasticNetCV(l1_ratio=0.00001, **estkwds) est = ElasticNetCV(l1_ratio=0, **estkwds) with ignore_warnings(): est_desired.fit(X, y) est.fit(X, y) assert_array_almost_equal(est.coef_, est_desired.coef_, decimal=5) est_desired = MultiTaskElasticNetCV(l1_ratio=0.00001, **estkwds) est = MultiTaskElasticNetCV(l1_ratio=0, **estkwds) with ignore_warnings(): est.fit(X, y[:, None]) est_desired.fit(X, y[:, None]) assert_array_almost_equal(est.coef_, est_desired.coef_, decimal=5)
def test_skewed_chi2_sampler(): """test that RBFSampler approximates kernel on random data""" # compute exact kernel c = 0.03 # appreviations for easier formular X_c = (X + c)[:, np.newaxis, :] Y_c = (Y + c)[np.newaxis, :, :] # we do it in log-space in the hope that it's more stable # this array is n_samples_x x n_samples_y big x n_features log_kernel = ((np.log(X_c) / 2.) + (np.log(Y_c) / 2.) + np.log(2.) - np.log(X_c + Y_c)) # reduce to n_samples_x x n_samples_y by summing over features in log-space kernel = np.exp(log_kernel.sum(axis=2)) # approximate kernel mapping transform = SkewedChi2Sampler(skewedness=c, n_components=1000, random_state=42) X_trans = transform.fit_transform(X) Y_trans = transform.transform(Y) kernel_approx = np.dot(X_trans, Y_trans.T) assert_array_almost_equal(kernel, kernel_approx, 1) # test error is raised on negative input Y_neg = Y.copy() Y_neg[0, 0] = -1 assert_raises(ValueError, transform.transform, Y_neg)
def test_inverse_transform(): # Test FastICA.inverse_transform n_features = 10 n_samples = 100 n1, n2 = 5, 10 rng = np.random.RandomState(0) X = rng.random_sample((n_samples, n_features)) expected = {(True, n1): (n_features, n1), (True, n2): (n_features, n2), (False, n1): (n_features, n2), (False, n2): (n_features, n2)} for whiten in [True, False]: for n_components in [n1, n2]: n_components_ = (n_components if n_components is not None else X.shape[1]) ica = FastICA(n_components=n_components, random_state=rng, whiten=whiten) with warnings.catch_warnings(record=True): # catch "n_components ignored" warning Xt = ica.fit_transform(X) expected_shape = expected[(whiten, n_components_)] assert_equal(ica.mixing_.shape, expected_shape) X2 = ica.inverse_transform(Xt) assert_equal(X.shape, X2.shape) # reversibility test in non-reduction case if n_components == X.shape[1]: assert_array_almost_equal(X, X2)
def check_regressors_pickle(name, Regressor): X, y = _boston_subset() y = StandardScaler().fit_transform(y) # X is already scaled y = multioutput_estimator_convert_y_2d(name, y) rnd = np.random.RandomState(0) # catch deprecation warnings with warnings.catch_warnings(record=True): regressor = Regressor() set_fast_parameters(regressor) if not hasattr(regressor, 'alphas') and hasattr(regressor, 'alpha'): # linear regressors need to set alpha, but not generalized CV ones regressor.alpha = 0.01 if name in CROSS_DECOMPOSITION: y_ = np.vstack([y, 2 * y + rnd.randint(2, size=len(y))]) y_ = y_.T else: y_ = y regressor.fit(X, y_) y_pred = regressor.predict(X) # store old predictions pickled_regressor = pickle.dumps(regressor) unpickled_regressor = pickle.loads(pickled_regressor) pickled_y_pred = unpickled_regressor.predict(X) assert_array_almost_equal(pickled_y_pred, y_pred)
def check_class_weight_auto_linear_classifier(name, Classifier): """Test class weights with non-contiguous class labels.""" X = np.array([[-1.0, -1.0], [-1.0, 0], [-.8, -1.0], [1.0, 1.0], [1.0, 0.0]]) y = [1, 1, 1, -1, -1] with warnings.catch_warnings(record=True): classifier = Classifier() if hasattr(classifier, "n_iter"): # This is a very small dataset, default n_iter are likely to prevent # convergence classifier.set_params(n_iter=1000) set_random_state(classifier) # Let the model compute the class frequencies classifier.set_params(class_weight='auto') coef_auto = classifier.fit(X, y).coef_.copy() # Count each label occurrence to reweight manually mean_weight = (1. / 3 + 1. / 2) / 2 class_weight = { 1: 1. / 3 / mean_weight, -1: 1. / 2 / mean_weight, } classifier.set_params(class_weight=class_weight) coef_manual = classifier.fit(X, y).coef_.copy() assert_array_almost_equal(coef_auto, coef_manual)
def test_nystroem_approximation(): # some basic tests rnd = np.random.RandomState(0) X = rnd.uniform(size=(10, 4)) # With n_components = n_samples this is exact X_transformed = Nystroem(n_components=X.shape[0]).fit_transform(X) K = rbf_kernel(X) assert_array_almost_equal(np.dot(X_transformed, X_transformed.T), K) trans = Nystroem(n_components=2, random_state=rnd) X_transformed = trans.fit(X).transform(X) assert_equal(X_transformed.shape, (X.shape[0], 2)) # test callable kernel linear_kernel = lambda X, Y: np.dot(X, Y.T) trans = Nystroem(n_components=2, kernel=linear_kernel, random_state=rnd) X_transformed = trans.fit(X).transform(X) assert_equal(X_transformed.shape, (X.shape[0], 2)) # test that available kernels fit and transform kernels_available = kernel_metrics() for kern in kernels_available: trans = Nystroem(n_components=2, kernel=kern, random_state=rnd) X_transformed = trans.fit(X).transform(X) assert_equal(X_transformed.shape, (X.shape[0], 2))
def check_pipeline_consistency(name, Estimator): if name in ('CCA', 'LocallyLinearEmbedding', 'KernelPCA') and _is_32bit(): # Those transformers yield non-deterministic output when executed on # a 32bit Python. The same transformers are stable on 64bit Python. # FIXME: try to isolate a minimalistic reproduction case only depending # scipy and/or maybe generate a test dataset that does not # cause such unstable behaviors. msg = name + ' is non deterministic on 32bit Python' raise SkipTest(msg) # check that make_pipeline(est) gives same score as est X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]], random_state=0, n_features=2, cluster_std=0.1) X -= X.min() y = multioutput_estimator_convert_y_2d(name, y) estimator = Estimator() set_fast_parameters(estimator) set_random_state(estimator) pipeline = make_pipeline(estimator) estimator.fit(X, y) pipeline.fit(X, y) funcs = ["score", "fit_transform"] for func_name in funcs: func = getattr(estimator, func_name, None) if func is not None: func_pipeline = getattr(pipeline, func_name) result = func(X, y) result_pipe = func_pipeline(X, y) assert_array_almost_equal(result, result_pipe)
def check_multioutput(name): # Check estimators on multi-output problems. X_train = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [-2, 1], [-1, 1], [-1, 2], [2, -1], [1, -1], [1, -2]] y_train = [[-1, 0], [-1, 0], [-1, 0], [1, 1], [1, 1], [1, 1], [-1, 2], [-1, 2], [-1, 2], [1, 3], [1, 3], [1, 3]] X_test = [[-1, -1], [1, 1], [-1, 1], [1, -1]] y_test = [[-1, 0], [1, 1], [-1, 2], [1, 3]] est = FOREST_ESTIMATORS[name](random_state=0, bootstrap=False) y_pred = est.fit(X_train, y_train).predict(X_test) assert_array_almost_equal(y_pred, y_test) if name in FOREST_CLASSIFIERS: with np.errstate(divide="ignore"): proba = est.predict_proba(X_test) assert_equal(len(proba), 2) assert_equal(proba[0].shape, (4, 2)) assert_equal(proba[1].shape, (4, 4)) log_proba = est.predict_log_proba(X_test) assert_equal(len(log_proba), 2) assert_equal(log_proba[0].shape, (4, 2)) assert_equal(log_proba[1].shape, (4, 4))
def test_binary_perplexity_stability(): # Binary perplexity search should be stable. # The binary_search_perplexity had a bug wherein the P array # was uninitialized, leading to sporadically failing tests. k = 10 n_samples = 100 random_state = check_random_state(0) distances = random_state.randn(n_samples, 2).astype(np.float32) # Distances shouldn't be negative distances = np.abs(distances.dot(distances.T)) np.fill_diagonal(distances, 0.0) last_P = None neighbors_nn = np.argsort(distances, axis=1)[:, :k].astype(np.int64) for _ in range(100): P = _binary_search_perplexity(distances.copy(), neighbors_nn.copy(), 3, verbose=0) P1 = _joint_probabilities_nn(distances, neighbors_nn, 3, verbose=0) # Convert the sparse matrix to a dense one for testing P1 = P1.toarray() if last_P is None: last_P = P last_P1 = P1 else: assert_array_almost_equal(P, last_P, decimal=4) assert_array_almost_equal(P1, last_P1, decimal=4)
def test_multinomial_grad_hess(): rng = np.random.RandomState(0) n_samples, n_features, n_classes = 100, 5, 3 X = rng.randn(n_samples, n_features) w = rng.rand(n_classes, n_features) Y = np.zeros((n_samples, n_classes)) ind = np.argmax(np.dot(X, w.T), axis=1) Y[range(0, n_samples), ind] = 1 w = w.ravel() sample_weights = np.ones(X.shape[0]) grad, hessp = _multinomial_grad_hess(w, X, Y, alpha=1., sample_weight=sample_weights) # extract first column of hessian matrix vec = np.zeros(n_features * n_classes) vec[0] = 1 hess_col = hessp(vec) # Estimate hessian using least squares as done in # test_logistic_grad_hess e = 1e-3 d_x = np.linspace(-e, e, 30) d_grad = np.array([ _multinomial_grad_hess(w + t * vec, X, Y, alpha=1., sample_weight=sample_weights)[0] for t in d_x ]) d_grad -= d_grad.mean(axis=0) approx_hess_col = linalg.lstsq(d_x[:, np.newaxis], d_grad)[0].ravel() assert_array_almost_equal(hess_col, approx_hess_col)
def test_classification_sample_weight(): X = [[0], [0], [1]] y = [0, 1, 0] sample_weight = [0.1, 1., 0.1] clf = DummyClassifier().fit(X, y, sample_weight) assert_array_almost_equal(clf.class_prior_, [0.2 / 1.2, 1. / 1.2])
def test_intercept_logistic_helper(): n_samples, n_features = 10, 5 X, y = make_classification(n_samples=n_samples, n_features=n_features, random_state=0) # Fit intercept case. alpha = 1. w = np.ones(n_features + 1) grad_interp, hess_interp = _logistic_grad_hess(w, X, y, alpha) loss_interp = _logistic_loss(w, X, y, alpha) # Do not fit intercept. This can be considered equivalent to adding # a feature vector of ones, i.e column of one vectors. X_ = np.hstack((X, np.ones(10)[:, np.newaxis])) grad, hess = _logistic_grad_hess(w, X_, y, alpha) loss = _logistic_loss(w, X_, y, alpha) # In the fit_intercept=False case, the feature vector of ones is # penalized. This should be taken care of. assert_almost_equal(loss_interp + 0.5 * (w[-1] ** 2), loss) # Check gradient. assert_array_almost_equal(grad_interp[:n_features], grad[:n_features]) assert_almost_equal(grad_interp[-1] + alpha * w[-1], grad[-1]) rng = np.random.RandomState(0) grad = rng.rand(n_features + 1) hess_interp = hess_interp(grad) hess = hess(grad) assert_array_almost_equal(hess_interp[:n_features], hess[:n_features]) assert_almost_equal(hess_interp[-1] + alpha * grad[-1], hess[-1])
def test_write_parameters(): # Test that we can write to coef_ and intercept_ clf = LogisticRegression(random_state=0) clf.fit(X, Y1) clf.coef_[:] = 0 clf.intercept_[:] = 0 assert_array_almost_equal(clf.decision_function(X), 0)
def check_regressors_int(name, Regressor): X, _ = _boston_subset() X = X[:50] rnd = np.random.RandomState(0) y = rnd.randint(3, size=X.shape[0]) y = multioutput_estimator_convert_y_2d(name, y) rnd = np.random.RandomState(0) # catch deprecation warnings with warnings.catch_warnings(record=True): # separate estimators to control random seeds regressor_1 = Regressor() regressor_2 = Regressor() set_fast_parameters(regressor_1) set_fast_parameters(regressor_2) set_random_state(regressor_1) set_random_state(regressor_2) if name in CROSS_DECOMPOSITION: y_ = np.vstack([y, 2 * y + rnd.randint(2, size=len(y))]) y_ = y_.T else: y_ = y # fit regressor_1.fit(X, y_) pred1 = regressor_1.predict(X) regressor_2.fit(X, y_.astype(np.float)) pred2 = regressor_2.predict(X) assert_array_almost_equal(pred1, pred2, 2, name)
def test_logistic_regressioncv_class_weights(): X, y = make_classification(n_samples=20, n_features=20, n_informative=10, n_classes=3, random_state=0) # Test the liblinear fails when class_weight of type dict is # provided, when it is multiclass. However it can handle # binary problems. clf_lib = LogisticRegressionCV(class_weight={0: 0.1, 1: 0.2}, solver='liblinear') assert_raises(ValueError, clf_lib.fit, X, y) y_ = y.copy() y_[y == 2] = 1 clf_lib.fit(X, y_) assert_array_equal(clf_lib.classes_, [0, 1]) # Test for class_weight=auto X, y = make_classification(n_samples=20, n_features=20, n_informative=10, random_state=0) clf_lbf = LogisticRegressionCV(solver='lbfgs', fit_intercept=False, class_weight='auto') clf_lbf.fit(X, y) clf_lib = LogisticRegressionCV(solver='liblinear', fit_intercept=False, class_weight='auto') clf_lib.fit(X, y) assert_array_almost_equal(clf_lib.coef_, clf_lbf.coef_, decimal=4)
def test_predict_iris(): # Test logistic regression with the iris dataset n_samples, n_features = iris.data.shape target = iris.target_names[iris.target] # Test that both multinomial and OvR solvers handle # multiclass data correctly and give good accuracy # score (>0.95) for the training data. for clf in [LogisticRegression(C=len(iris.data)), LogisticRegression(C=len(iris.data), solver='lbfgs', multi_class='multinomial'), LogisticRegression(C=len(iris.data), solver='newton-cg', multi_class='multinomial'), LogisticRegression(C=len(iris.data), solver='sag', tol=1e-2, multi_class='ovr', random_state=42), LogisticRegression(C=len(iris.data), solver='saga', tol=1e-2, multi_class='ovr', random_state=42) ]: clf.fit(iris.data, target) assert_array_equal(np.unique(target), clf.classes_) pred = clf.predict(iris.data) assert_greater(np.mean(pred == target), .95) probabilities = clf.predict_proba(iris.data) assert_array_almost_equal(probabilities.sum(axis=1), np.ones(n_samples)) pred = iris.target_names[probabilities.argmax(axis=1)] assert_greater(np.mean(pred == target), .95)
def test_consistency_path(): """Test that the path algorithm is consistent""" rng = np.random.RandomState(0) X = np.concatenate((rng.randn(100, 2) + [1, 1], rng.randn(100, 2))) y = [1] * 100 + [-1] * 100 Cs = np.logspace(0, 4, 10) f = ignore_warnings # can't test with fit_intercept=True since LIBLINEAR # penalizes the intercept for method in ('lbfgs', 'newton-cg', 'liblinear'): coefs, Cs = f(logistic_regression_path)( X, y, Cs=Cs, fit_intercept=False, tol=1e-16, solver=method) for i, C in enumerate(Cs): lr = LogisticRegression(C=C, fit_intercept=False, tol=1e-16) lr.fit(X, y) lr_coef = lr.coef_.ravel() assert_array_almost_equal(lr_coef, coefs[i], decimal=4) # test for fit_intercept=True for method in ('lbfgs', 'newton-cg', 'liblinear'): Cs = [1e3] coefs, Cs = f(logistic_regression_path)( X, y, Cs=Cs, fit_intercept=True, tol=1e-4, solver=method) lr = LogisticRegression(C=Cs[0], fit_intercept=True, tol=1e-4, intercept_scaling=10000) lr.fit(X, y) lr_coef = np.concatenate([lr.coef_.ravel(), lr.intercept_]) assert_array_almost_equal(lr_coef, coefs[0], decimal=4)
def test_liblinear_random_state(): X, y = make_classification(n_samples=20) lr1 = LogisticRegression(random_state=0) lr1.fit(X, y) lr2 = LogisticRegression(random_state=0) lr2.fit(X, y) assert_array_almost_equal(lr1.coef_, lr2.coef_)
def test_logistic_regression_class_weights(): # Multinomial case: remove 90% of class 0 X = iris.data[45:, :] y = iris.target[45:] solvers = ("lbfgs", "newton-cg") class_weight_dict = _compute_class_weight_dictionary(y) for solver in solvers: clf1 = LogisticRegression(solver=solver, multi_class="multinomial", class_weight="balanced") clf2 = LogisticRegression(solver=solver, multi_class="multinomial", class_weight=class_weight_dict) clf1.fit(X, y) clf2.fit(X, y) assert_array_almost_equal(clf1.coef_, clf2.coef_, decimal=4) # Binary case: remove 90% of class 0 and 100% of class 2 X = iris.data[45:100, :] y = iris.target[45:100] solvers = ("lbfgs", "newton-cg", "liblinear") class_weight_dict = _compute_class_weight_dictionary(y) for solver in solvers: clf1 = LogisticRegression(solver=solver, multi_class="ovr", class_weight="balanced") clf2 = LogisticRegression(solver=solver, multi_class="ovr", class_weight=class_weight_dict) clf1.fit(X, y) clf2.fit(X, y) assert_array_almost_equal(clf1.coef_, clf2.coef_, decimal=6)
def _check_transformer(name, Transformer, X, y): if name in ('CCA', 'LocallyLinearEmbedding', 'KernelPCA') and _is_32bit(): # Those transformers yield non-deterministic output when executed on # a 32bit Python. The same transformers are stable on 64bit Python. # FIXME: try to isolate a minimalistic reproduction case only depending # on numpy & scipy and/or maybe generate a test dataset that does not # cause such unstable behaviors. msg = name + ' is non deterministic on 32bit Python' raise SkipTest(msg) n_samples, n_features = np.asarray(X).shape # catch deprecation warnings with warnings.catch_warnings(record=True): transformer = Transformer() set_random_state(transformer) if name == "KernelPCA": transformer.remove_zero_eig = False set_fast_parameters(transformer) # fit if name in CROSS_DECOMPOSITION: y_ = np.c_[y, y] y_[::2, 1] *= 2 else: y_ = y transformer.fit(X, y_) X_pred = transformer.fit_transform(X, y=y_) if isinstance(X_pred, tuple): for x_pred in X_pred: assert_equal(x_pred.shape[0], n_samples) else: assert_equal(X_pred.shape[0], n_samples) if hasattr(transformer, 'transform'): if name in CROSS_DECOMPOSITION: X_pred2 = transformer.transform(X, y_) X_pred3 = transformer.fit_transform(X, y=y_) else: X_pred2 = transformer.transform(X) X_pred3 = transformer.fit_transform(X, y=y_) if isinstance(X_pred, tuple) and isinstance(X_pred2, tuple): for x_pred, x_pred2, x_pred3 in zip(X_pred, X_pred2, X_pred3): assert_array_almost_equal( x_pred, x_pred2, 2, "fit_transform and transform outcomes not consistent in %s" % Transformer) assert_array_almost_equal( x_pred, x_pred3, 2, "consecutive fit_transform outcomes not consistent in %s" % Transformer) else: assert_array_almost_equal( X_pred, X_pred2, 2, "fit_transform and transform outcomes not consistent in %s" % Transformer) assert_array_almost_equal( X_pred, X_pred3, 2, "consecutive fit_transform outcomes not consistent in %s" % Transformer) # raises error on malformed input for transform if hasattr(X, 'T'): # If it's not an array, it does not have a 'T' property assert_raises(ValueError, transformer.transform, X.T)
def test_linear_kernel(): rng = np.random.RandomState(0) X = rng.random_sample((5, 4)) K = linear_kernel(X, X) # the diagonal elements of a linear kernel are their squared norm assert_array_almost_equal(K.flat[::6], [linalg.norm(x)**2 for x in X])
def test_roc_curve_toydata(): # Binary classification y_true = [0, 1] y_score = [0, 1] tpr, fpr, _ = roc_curve(y_true, y_score) roc_auc = roc_auc_score(y_true, y_score) assert_array_almost_equal(tpr, [0, 0, 1]) assert_array_almost_equal(fpr, [0, 1, 1]) assert_almost_equal(roc_auc, 1.) y_true = [0, 1] y_score = [1, 0] tpr, fpr, _ = roc_curve(y_true, y_score) roc_auc = roc_auc_score(y_true, y_score) assert_array_almost_equal(tpr, [0, 1, 1]) assert_array_almost_equal(fpr, [0, 0, 1]) assert_almost_equal(roc_auc, 0.) y_true = [1, 0] y_score = [1, 1] tpr, fpr, _ = roc_curve(y_true, y_score) roc_auc = roc_auc_score(y_true, y_score) assert_array_almost_equal(tpr, [0, 1]) assert_array_almost_equal(fpr, [0, 1]) assert_almost_equal(roc_auc, 0.5) y_true = [1, 0] y_score = [1, 0] tpr, fpr, _ = roc_curve(y_true, y_score) roc_auc = roc_auc_score(y_true, y_score) assert_array_almost_equal(tpr, [0, 0, 1]) assert_array_almost_equal(fpr, [0, 1, 1]) assert_almost_equal(roc_auc, 1.) y_true = [1, 0] y_score = [0.5, 0.5] tpr, fpr, _ = roc_curve(y_true, y_score) roc_auc = roc_auc_score(y_true, y_score) assert_array_almost_equal(tpr, [0, 1]) assert_array_almost_equal(fpr, [0, 1]) assert_almost_equal(roc_auc, .5) y_true = [0, 0] y_score = [0.25, 0.75] # assert UndefinedMetricWarning because of no positive sample in y_true tpr, fpr, _ = assert_warns(UndefinedMetricWarning, roc_curve, y_true, y_score) assert_raises(ValueError, roc_auc_score, y_true, y_score) assert_array_almost_equal(tpr, [0., 0.5, 1.]) assert_array_almost_equal(fpr, [np.nan, np.nan, np.nan]) y_true = [1, 1] y_score = [0.25, 0.75] # assert UndefinedMetricWarning because of no negative sample in y_true tpr, fpr, _ = assert_warns(UndefinedMetricWarning, roc_curve, y_true, y_score) assert_raises(ValueError, roc_auc_score, y_true, y_score) assert_array_almost_equal(tpr, [np.nan, np.nan, np.nan]) assert_array_almost_equal(fpr, [0., 0.5, 1.]) # Multi-label classification task y_true = np.array([[0, 1], [0, 1]]) y_score = np.array([[0, 1], [0, 1]]) assert_raises(ValueError, roc_auc_score, y_true, y_score, average="macro") assert_raises(ValueError, roc_auc_score, y_true, y_score, average="weighted") assert_almost_equal(roc_auc_score(y_true, y_score, average="samples"), 1.) assert_almost_equal(roc_auc_score(y_true, y_score, average="micro"), 1.) y_true = np.array([[0, 1], [0, 1]]) y_score = np.array([[0, 1], [1, 0]]) assert_raises(ValueError, roc_auc_score, y_true, y_score, average="macro") assert_raises(ValueError, roc_auc_score, y_true, y_score, average="weighted") assert_almost_equal(roc_auc_score(y_true, y_score, average="samples"), 0.5) assert_almost_equal(roc_auc_score(y_true, y_score, average="micro"), 0.5) y_true = np.array([[1, 0], [0, 1]]) y_score = np.array([[0, 1], [1, 0]]) assert_almost_equal(roc_auc_score(y_true, y_score, average="macro"), 0) assert_almost_equal(roc_auc_score(y_true, y_score, average="weighted"), 0) assert_almost_equal(roc_auc_score(y_true, y_score, average="samples"), 0) assert_almost_equal(roc_auc_score(y_true, y_score, average="micro"), 0) y_true = np.array([[1, 0], [0, 1]]) y_score = np.array([[0.5, 0.5], [0.5, 0.5]]) assert_almost_equal(roc_auc_score(y_true, y_score, average="macro"), .5) assert_almost_equal(roc_auc_score(y_true, y_score, average="weighted"), .5) assert_almost_equal(roc_auc_score(y_true, y_score, average="samples"), .5) assert_almost_equal(roc_auc_score(y_true, y_score, average="micro"), .5)
def test_ward_linkage_tree_return_distance(): # Test return_distance option on linkage and ward trees # test that return_distance when set true, gives same # output on both structured and unstructured clustering. n, p = 10, 5 rng = np.random.RandomState(0) connectivity = np.ones((n, n)) for i in range(5): X = .1 * rng.normal(size=(n, p)) X -= 4. * np.arange(n)[:, np.newaxis] X -= X.mean(axis=1)[:, np.newaxis] out_unstructured = ward_tree(X, return_distance=True) out_structured = ward_tree(X, connectivity=connectivity, return_distance=True) # get children children_unstructured = out_unstructured[0] children_structured = out_structured[0] # check if we got the same clusters assert_array_equal(children_unstructured, children_structured) # check if the distances are the same dist_unstructured = out_unstructured[-1] dist_structured = out_structured[-1] assert_array_almost_equal(dist_unstructured, dist_structured) for linkage in ['average', 'complete']: structured_items = linkage_tree(X, connectivity=connectivity, linkage=linkage, return_distance=True)[-1] unstructured_items = linkage_tree(X, linkage=linkage, return_distance=True)[-1] structured_dist = structured_items[-1] unstructured_dist = unstructured_items[-1] structured_children = structured_items[0] unstructured_children = unstructured_items[0] assert_array_almost_equal(structured_dist, unstructured_dist) assert_array_almost_equal(structured_children, unstructured_children) # test on the following dataset where we know the truth # taken from scipy/cluster/tests/hierarchy_test_data.py X = np.array([[1.43054825, -7.5693489], [6.95887839, 6.82293382], [2.87137846, -9.68248579], [7.87974764, -6.05485803], [8.24018364, -6.09495602], [7.39020262, 8.54004355]]) # truth linkage_X_ward = np.array([[3., 4., 0.36265956, 2.], [1., 5., 1.77045373, 2.], [0., 2., 2.55760419, 2.], [6., 8., 9.10208346, 4.], [7., 9., 24.7784379, 6.]]) linkage_X_complete = np.array([[3., 4., 0.36265956, 2.], [1., 5., 1.77045373, 2.], [0., 2., 2.55760419, 2.], [6., 8., 6.96742194, 4.], [7., 9., 18.77445997, 6.]]) linkage_X_average = np.array([[3., 4., 0.36265956, 2.], [1., 5., 1.77045373, 2.], [0., 2., 2.55760419, 2.], [6., 8., 6.55832839, 4.], [7., 9., 15.44089605, 6.]]) n_samples, n_features = np.shape(X) connectivity_X = np.ones((n_samples, n_samples)) out_X_unstructured = ward_tree(X, return_distance=True) out_X_structured = ward_tree(X, connectivity=connectivity_X, return_distance=True) # check that the labels are the same assert_array_equal(linkage_X_ward[:, :2], out_X_unstructured[0]) assert_array_equal(linkage_X_ward[:, :2], out_X_structured[0]) # check that the distances are correct assert_array_almost_equal(linkage_X_ward[:, 2], out_X_unstructured[4]) assert_array_almost_equal(linkage_X_ward[:, 2], out_X_structured[4]) linkage_options = ['complete', 'average'] X_linkage_truth = [linkage_X_complete, linkage_X_average] for (linkage, X_truth) in zip(linkage_options, X_linkage_truth): out_X_unstructured = linkage_tree(X, return_distance=True, linkage=linkage) out_X_structured = linkage_tree(X, connectivity=connectivity_X, linkage=linkage, return_distance=True) # check that the labels are the same assert_array_equal(X_truth[:, :2], out_X_unstructured[0]) assert_array_equal(X_truth[:, :2], out_X_structured[0]) # check that the distances are correct assert_array_almost_equal(X_truth[:, 2], out_X_unstructured[4]) assert_array_almost_equal(X_truth[:, 2], out_X_structured[4])
def test_multitarget(): """ Assure that estimators receiving multidimensional y do the right thing """ X = diabetes.data Y = np.vstack([diabetes.target, diabetes.target ** 2]).T n_targets = Y.shape[1] for estimator in (linear_model.LassoLars(), linear_model.Lars()): estimator.fit(X, Y) Y_pred = estimator.predict(X) Y_dec = estimator.decision_function(X) assert_array_almost_equal(Y_pred, Y_dec) alphas, active, coef, path = (estimator.alphas_, estimator.active_, estimator.coef_, estimator.coef_path_) for k in range(n_targets): estimator.fit(X, Y[:, k]) y_pred = estimator.predict(X) assert_array_almost_equal(alphas[k], estimator.alphas_) assert_array_almost_equal(active[k], estimator.active_) assert_array_almost_equal(coef[k], estimator.coef_) assert_array_almost_equal(path[k], estimator.coef_path_) assert_array_almost_equal(Y_pred[:, k], y_pred)
def test_pls(): d = load_linnerud() X = d.data Y = d.target # 1) Canonical (symmetric) PLS (PLS 2 blocks canonical mode A) # =========================================================== # Compare 2 algo.: nipals vs. svd # ------------------------------ pls_bynipals = pls_.PLSCanonical(n_components=X.shape[1]) pls_bynipals.fit(X, Y) pls_bysvd = pls_.PLSCanonical(algorithm="svd", n_components=X.shape[1]) pls_bysvd.fit(X, Y) # check equalities of loading (up to the sign of the second column) assert_array_almost_equal( pls_bynipals.x_loadings_, np.multiply(pls_bysvd.x_loadings_, np.array([1, -1, 1])), decimal=5, err_msg="nipals and svd implementation lead to different x loadings") assert_array_almost_equal( pls_bynipals.y_loadings_, np.multiply(pls_bysvd.y_loadings_, np.array([1, -1, 1])), decimal=5, err_msg="nipals and svd implementation lead to different y loadings") # Check PLS properties (with n_components=X.shape[1]) # --------------------------------------------------- plsca = pls_.PLSCanonical(n_components=X.shape[1]) plsca.fit(X, Y) T = plsca.x_scores_ P = plsca.x_loadings_ Wx = plsca.x_weights_ U = plsca.y_scores_ Q = plsca.y_loadings_ Wy = plsca.y_weights_ def check_ortho(M, err_msg): K = np.dot(M.T, M) assert_array_almost_equal(K, np.diag(np.diag(K)), err_msg=err_msg) # Orthogonality of weights # ~~~~~~~~~~~~~~~~~~~~~~~~ check_ortho(Wx, "x weights are not orthogonal") check_ortho(Wy, "y weights are not orthogonal") # Orthogonality of latent scores # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ check_ortho(T, "x scores are not orthogonal") check_ortho(U, "y scores are not orthogonal") # Check X = TP' and Y = UQ' (with (p == q) components) # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # center scale X, Y Xc, Yc, x_mean, y_mean, x_std, y_std =\ pls_._center_scale_xy(X.copy(), Y.copy(), scale=True) assert_array_almost_equal(Xc, np.dot(T, P.T), err_msg="X != TP'") assert_array_almost_equal(Yc, np.dot(U, Q.T), err_msg="Y != UQ'") # Check that rotations on training data lead to scores # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Xr = plsca.transform(X) assert_array_almost_equal(Xr, plsca.x_scores_, err_msg="rotation on X failed") Xr, Yr = plsca.transform(X, Y) assert_array_almost_equal(Xr, plsca.x_scores_, err_msg="rotation on X failed") assert_array_almost_equal(Yr, plsca.y_scores_, err_msg="rotation on Y failed") # "Non regression test" on canonical PLS # -------------------------------------- # The results were checked against the R-package plspm pls_ca = pls_.PLSCanonical(n_components=X.shape[1]) pls_ca.fit(X, Y) x_weights = np.array([[-0.61330704, 0.25616119, -0.74715187], [-0.74697144, 0.11930791, 0.65406368], [-0.25668686, -0.95924297, -0.11817271]]) assert_array_almost_equal(pls_ca.x_weights_, x_weights) x_rotations = np.array([[-0.61330704, 0.41591889, -0.62297525], [-0.74697144, 0.31388326, 0.77368233], [-0.25668686, -0.89237972, -0.24121788]]) assert_array_almost_equal(pls_ca.x_rotations_, x_rotations) y_weights = np.array([[+0.58989127, 0.7890047, 0.1717553], [+0.77134053, -0.61351791, 0.16920272], [-0.23887670, -0.03267062, 0.97050016]]) assert_array_almost_equal(pls_ca.y_weights_, y_weights) y_rotations = np.array([[+0.58989127, 0.7168115, 0.30665872], [+0.77134053, -0.70791757, 0.19786539], [-0.23887670, -0.00343595, 0.94162826]]) assert_array_almost_equal(pls_ca.y_rotations_, y_rotations) # 2) Regression PLS (PLS2): "Non regression test" # =============================================== # The results were checked against the R-packages plspm, misOmics and pls pls_2 = pls_.PLSRegression(n_components=X.shape[1]) pls_2.fit(X, Y) x_weights = np.array([[-0.61330704, -0.00443647, 0.78983213], [-0.74697144, -0.32172099, -0.58183269], [-0.25668686, 0.94682413, -0.19399983]]) assert_array_almost_equal(pls_2.x_weights_, x_weights) x_loadings = np.array([[-0.61470416, -0.24574278, 0.78983213], [-0.65625755, -0.14396183, -0.58183269], [-0.51733059, 1.00609417, -0.19399983]]) assert_array_almost_equal(pls_2.x_loadings_, x_loadings) y_weights = np.array([[+0.32456184, 0.29892183, 0.20316322], [+0.42439636, 0.61970543, 0.19320542], [-0.13143144, -0.26348971, -0.17092916]]) assert_array_almost_equal(pls_2.y_weights_, y_weights) y_loadings = np.array([[+0.32456184, 0.29892183, 0.20316322], [+0.42439636, 0.61970543, 0.19320542], [-0.13143144, -0.26348971, -0.17092916]]) assert_array_almost_equal(pls_2.y_loadings_, y_loadings) # 3) Another non-regression test of Canonical PLS on random dataset # ================================================================= # The results were checked against the R-package plspm n = 500 p_noise = 10 q_noise = 5 # 2 latents vars: np.random.seed(11) l1 = np.random.normal(size=n) l2 = np.random.normal(size=n) latents = np.array([l1, l1, l2, l2]).T X = latents + np.random.normal(size=4 * n).reshape((n, 4)) Y = latents + np.random.normal(size=4 * n).reshape((n, 4)) X = np.concatenate( (X, np.random.normal(size=p_noise * n).reshape(n, p_noise)), axis=1) Y = np.concatenate( (Y, np.random.normal(size=q_noise * n).reshape(n, q_noise)), axis=1) np.random.seed(None) pls_ca = pls_.PLSCanonical(n_components=3) pls_ca.fit(X, Y) x_weights = np.array([[0.65803719, 0.19197924, 0.21769083], [0.7009113, 0.13303969, -0.15376699], [0.13528197, -0.68636408, 0.13856546], [0.16854574, -0.66788088, -0.12485304], [-0.03232333, -0.04189855, 0.40690153], [0.1148816, -0.09643158, 0.1613305], [0.04792138, -0.02384992, 0.17175319], [-0.06781, -0.01666137, -0.18556747], [-0.00266945, -0.00160224, 0.11893098], [-0.00849528, -0.07706095, 0.1570547], [-0.00949471, -0.02964127, 0.34657036], [-0.03572177, 0.0945091, 0.3414855], [0.05584937, -0.02028961, -0.57682568], [0.05744254, -0.01482333, -0.17431274]]) assert_array_almost_equal(pls_ca.x_weights_, x_weights) x_loadings = np.array([[0.65649254, 0.1847647, 0.15270699], [0.67554234, 0.15237508, -0.09182247], [0.19219925, -0.67750975, 0.08673128], [0.2133631, -0.67034809, -0.08835483], [-0.03178912, -0.06668336, 0.43395268], [0.15684588, -0.13350241, 0.20578984], [0.03337736, -0.03807306, 0.09871553], [-0.06199844, 0.01559854, -0.1881785], [0.00406146, -0.00587025, 0.16413253], [-0.00374239, -0.05848466, 0.19140336], [0.00139214, -0.01033161, 0.32239136], [-0.05292828, 0.0953533, 0.31916881], [0.04031924, -0.01961045, -0.65174036], [0.06172484, -0.06597366, -0.1244497]]) assert_array_almost_equal(pls_ca.x_loadings_, x_loadings) y_weights = np.array([[0.66101097, 0.18672553, 0.22826092], [0.69347861, 0.18463471, -0.23995597], [0.14462724, -0.66504085, 0.17082434], [0.22247955, -0.6932605, -0.09832993], [0.07035859, 0.00714283, 0.67810124], [0.07765351, -0.0105204, -0.44108074], [-0.00917056, 0.04322147, 0.10062478], [-0.01909512, 0.06182718, 0.28830475], [0.01756709, 0.04797666, 0.32225745]]) assert_array_almost_equal(pls_ca.y_weights_, y_weights) y_loadings = np.array([[0.68568625, 0.1674376, 0.0969508], [0.68782064, 0.20375837, -0.1164448], [0.11712173, -0.68046903, 0.12001505], [0.17860457, -0.6798319, -0.05089681], [0.06265739, -0.0277703, 0.74729584], [0.0914178, 0.00403751, -0.5135078], [-0.02196918, -0.01377169, 0.09564505], [-0.03288952, 0.09039729, 0.31858973], [0.04287624, 0.05254676, 0.27836841]]) assert_array_almost_equal(pls_ca.y_loadings_, y_loadings) # Orthogonality of weights # ~~~~~~~~~~~~~~~~~~~~~~~~ check_ortho(pls_ca.x_weights_, "x weights are not orthogonal") check_ortho(pls_ca.y_weights_, "y weights are not orthogonal") # Orthogonality of latent scores # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ check_ortho(pls_ca.x_scores_, "x scores are not orthogonal") check_ortho(pls_ca.y_scores_, "y scores are not orthogonal")
def check_ortho(M, err_msg): K = np.dot(M.T, M) assert_array_almost_equal(K, np.diag(np.diag(K)), err_msg=err_msg)
def test_rbf_kernel(): rng = np.random.RandomState(0) X = rng.random_sample((5, 4)) K = rbf_kernel(X, X) # the diagonal elements of a rbf kernel are 1 assert_array_almost_equal(K.flat[::6], np.ones(5))
def test_paired_manhattan_distances(): # Check the paired manhattan distances computation X = [[0], [0]] Y = [[1], [2]] D = paired_manhattan_distances(X, Y) assert_array_almost_equal(D, [1., 2.])
def test_kernel_symmetry(kernel): # Valid kernels should be symmetric rng = np.random.RandomState(0) X = rng.random_sample((5, 4)) K = kernel(X, X) assert_array_almost_equal(K, K.T, 15)
def test_pairwise_distances_argmin_min(): # Check pairwise minimum distances computation for any metric X = [[0], [1]] Y = [[-2], [3]] Xsp = dok_matrix(X) Ysp = csr_matrix(Y, dtype=np.float32) expected_idx = [0, 1] expected_vals = [2, 2] expected_vals_sq = [4, 4] # euclidean metric idx, vals = pairwise_distances_argmin_min(X, Y, metric="euclidean") idx2 = pairwise_distances_argmin(X, Y, metric="euclidean") assert_array_almost_equal(idx, expected_idx) assert_array_almost_equal(idx2, expected_idx) assert_array_almost_equal(vals, expected_vals) # sparse matrix case idxsp, valssp = pairwise_distances_argmin_min(Xsp, Ysp, metric="euclidean") assert_array_almost_equal(idxsp, expected_idx) assert_array_almost_equal(valssp, expected_vals) # We don't want np.matrix here assert_equal(type(idxsp), np.ndarray) assert_equal(type(valssp), np.ndarray) # euclidean metric squared idx, vals = pairwise_distances_argmin_min(X, Y, metric="euclidean", metric_kwargs={"squared": True}) assert_array_almost_equal(idx, expected_idx) assert_array_almost_equal(vals, expected_vals_sq) # Non-euclidean scikit-learn metric idx, vals = pairwise_distances_argmin_min(X, Y, metric="manhattan") idx2 = pairwise_distances_argmin(X, Y, metric="manhattan") assert_array_almost_equal(idx, expected_idx) assert_array_almost_equal(idx2, expected_idx) assert_array_almost_equal(vals, expected_vals) # sparse matrix case idxsp, valssp = pairwise_distances_argmin_min(Xsp, Ysp, metric="manhattan") assert_array_almost_equal(idxsp, expected_idx) assert_array_almost_equal(valssp, expected_vals) # Non-euclidean Scipy distance (callable) idx, vals = pairwise_distances_argmin_min(X, Y, metric=minkowski, metric_kwargs={"p": 2}) assert_array_almost_equal(idx, expected_idx) assert_array_almost_equal(vals, expected_vals) # Non-euclidean Scipy distance (string) idx, vals = pairwise_distances_argmin_min(X, Y, metric="minkowski", metric_kwargs={"p": 2}) assert_array_almost_equal(idx, expected_idx) assert_array_almost_equal(vals, expected_vals) # Compare with naive implementation rng = np.random.RandomState(0) X = rng.randn(97, 149) Y = rng.randn(111, 149) dist = pairwise_distances(X, Y, metric="manhattan") dist_orig_ind = dist.argmin(axis=0) dist_orig_val = dist[dist_orig_ind, range(len(dist_orig_ind))] dist_chunked_ind, dist_chunked_val = pairwise_distances_argmin_min( X, Y, axis=0, metric="manhattan") np.testing.assert_almost_equal(dist_orig_ind, dist_chunked_ind, decimal=7) np.testing.assert_almost_equal(dist_orig_val, dist_chunked_val, decimal=7) # Test batch_size deprecation warning assert_warns_message(DeprecationWarning, "version 0.22", pairwise_distances_argmin_min, X, Y, batch_size=500, metric='euclidean')
def test_pairwise_distances(): # Test the pairwise_distance helper function. rng = np.random.RandomState(0) # Euclidean distance should be equivalent to calling the function. X = rng.random_sample((5, 4)) S = pairwise_distances(X, metric="euclidean") S2 = euclidean_distances(X) assert_array_almost_equal(S, S2) # Euclidean distance, with Y != X. Y = rng.random_sample((2, 4)) S = pairwise_distances(X, Y, metric="euclidean") S2 = euclidean_distances(X, Y) assert_array_almost_equal(S, S2) # Test with tuples as X and Y X_tuples = tuple([tuple([v for v in row]) for row in X]) Y_tuples = tuple([tuple([v for v in row]) for row in Y]) S2 = pairwise_distances(X_tuples, Y_tuples, metric="euclidean") assert_array_almost_equal(S, S2) # "cityblock" uses scikit-learn metric, cityblock (function) is # scipy.spatial. S = pairwise_distances(X, metric="cityblock") S2 = pairwise_distances(X, metric=cityblock) assert_equal(S.shape[0], S.shape[1]) assert_equal(S.shape[0], X.shape[0]) assert_array_almost_equal(S, S2) # The manhattan metric should be equivalent to cityblock. S = pairwise_distances(X, Y, metric="manhattan") S2 = pairwise_distances(X, Y, metric=cityblock) assert_equal(S.shape[0], X.shape[0]) assert_equal(S.shape[1], Y.shape[0]) assert_array_almost_equal(S, S2) # Test cosine as a string metric versus cosine callable # The string "cosine" uses sklearn.metric, # while the function cosine is scipy.spatial S = pairwise_distances(X, Y, metric="cosine") S2 = pairwise_distances(X, Y, metric=cosine) assert_equal(S.shape[0], X.shape[0]) assert_equal(S.shape[1], Y.shape[0]) assert_array_almost_equal(S, S2) # Test with sparse X and Y, # currently only supported for Euclidean, L1 and cosine. X_sparse = csr_matrix(X) Y_sparse = csr_matrix(Y) S = pairwise_distances(X_sparse, Y_sparse, metric="euclidean") S2 = euclidean_distances(X_sparse, Y_sparse) assert_array_almost_equal(S, S2) S = pairwise_distances(X_sparse, Y_sparse, metric="cosine") S2 = cosine_distances(X_sparse, Y_sparse) assert_array_almost_equal(S, S2) S = pairwise_distances(X_sparse, Y_sparse.tocsc(), metric="manhattan") S2 = manhattan_distances(X_sparse.tobsr(), Y_sparse.tocoo()) assert_array_almost_equal(S, S2) S2 = manhattan_distances(X, Y) assert_array_almost_equal(S, S2) # Test with scipy.spatial.distance metric, with a kwd kwds = {"p": 2.0} S = pairwise_distances(X, Y, metric="minkowski", **kwds) S2 = pairwise_distances(X, Y, metric=minkowski, **kwds) assert_array_almost_equal(S, S2) # same with Y = None kwds = {"p": 2.0} S = pairwise_distances(X, metric="minkowski", **kwds) S2 = pairwise_distances(X, metric=minkowski, **kwds) assert_array_almost_equal(S, S2) # Test that scipy distance metrics throw an error if sparse matrix given assert_raises(TypeError, pairwise_distances, X_sparse, metric="minkowski") assert_raises(TypeError, pairwise_distances, X, Y_sparse, metric="minkowski") # Test that a value error is raised if the metric is unknown assert_raises(ValueError, pairwise_distances, X, Y, metric="blah")
def test_precision_recall_curve_toydata(): with np.errstate(all="raise"): # Binary classification y_true = [0, 1] y_score = [0, 1] p, r, _ = precision_recall_curve(y_true, y_score) auc_prc = average_precision_score(y_true, y_score) assert_array_almost_equal(p, [1, 1]) assert_array_almost_equal(r, [1, 0]) assert_almost_equal(auc_prc, 1.) y_true = [0, 1] y_score = [1, 0] p, r, _ = precision_recall_curve(y_true, y_score) auc_prc = average_precision_score(y_true, y_score) assert_array_almost_equal(p, [0.5, 0., 1.]) assert_array_almost_equal(r, [1., 0., 0.]) # Here we are doing a terrible prediction: we are always getting # it wrong, hence the average_precision_score is the accuracy at # chance: 50% assert_almost_equal(auc_prc, 0.5) y_true = [1, 0] y_score = [1, 1] p, r, _ = precision_recall_curve(y_true, y_score) auc_prc = average_precision_score(y_true, y_score) assert_array_almost_equal(p, [0.5, 1]) assert_array_almost_equal(r, [1., 0]) assert_almost_equal(auc_prc, .5) y_true = [1, 0] y_score = [1, 0] p, r, _ = precision_recall_curve(y_true, y_score) auc_prc = average_precision_score(y_true, y_score) assert_array_almost_equal(p, [1, 1]) assert_array_almost_equal(r, [1, 0]) assert_almost_equal(auc_prc, 1.) y_true = [1, 0] y_score = [0.5, 0.5] p, r, _ = precision_recall_curve(y_true, y_score) auc_prc = average_precision_score(y_true, y_score) assert_array_almost_equal(p, [0.5, 1]) assert_array_almost_equal(r, [1, 0.]) assert_almost_equal(auc_prc, .5) y_true = [0, 0] y_score = [0.25, 0.75] assert_raises(Exception, precision_recall_curve, y_true, y_score) assert_raises(Exception, average_precision_score, y_true, y_score) y_true = [1, 1] y_score = [0.25, 0.75] p, r, _ = precision_recall_curve(y_true, y_score) assert_almost_equal(average_precision_score(y_true, y_score), 1.) assert_array_almost_equal(p, [1., 1., 1.]) assert_array_almost_equal(r, [1, 0.5, 0.]) # Multi-label classification task y_true = np.array([[0, 1], [0, 1]]) y_score = np.array([[0, 1], [0, 1]]) assert_raises(Exception, average_precision_score, y_true, y_score, average="macro") assert_raises(Exception, average_precision_score, y_true, y_score, average="weighted") assert_almost_equal( average_precision_score(y_true, y_score, average="samples"), 1.) assert_almost_equal( average_precision_score(y_true, y_score, average="micro"), 1.) y_true = np.array([[0, 1], [0, 1]]) y_score = np.array([[0, 1], [1, 0]]) assert_raises(Exception, average_precision_score, y_true, y_score, average="macro") assert_raises(Exception, average_precision_score, y_true, y_score, average="weighted") assert_almost_equal( average_precision_score(y_true, y_score, average="samples"), 0.75) assert_almost_equal( average_precision_score(y_true, y_score, average="micro"), 0.5) y_true = np.array([[1, 0], [0, 1]]) y_score = np.array([[0, 1], [1, 0]]) assert_almost_equal( average_precision_score(y_true, y_score, average="macro"), 0.5) assert_almost_equal( average_precision_score(y_true, y_score, average="weighted"), 0.5) assert_almost_equal( average_precision_score(y_true, y_score, average="samples"), 0.5) assert_almost_equal( average_precision_score(y_true, y_score, average="micro"), 0.5) y_true = np.array([[1, 0], [0, 1]]) y_score = np.array([[0.5, 0.5], [0.5, 0.5]]) assert_almost_equal( average_precision_score(y_true, y_score, average="macro"), 0.5) assert_almost_equal( average_precision_score(y_true, y_score, average="weighted"), 0.5) assert_almost_equal( average_precision_score(y_true, y_score, average="samples"), 0.5) assert_almost_equal( average_precision_score(y_true, y_score, average="micro"), 0.5)
def check_classifiers_train(name, Classifier): X_m, y_m = make_blobs(random_state=0) X_m, y_m = shuffle(X_m, y_m, random_state=7) X_m = StandardScaler().fit_transform(X_m) # generate binary problem from multi-class one y_b = y_m[y_m != 2] X_b = X_m[y_m != 2] for (X, y) in [(X_m, y_m), (X_b, y_b)]: # catch deprecation warnings classes = np.unique(y) n_classes = len(classes) n_samples, n_features = X.shape with warnings.catch_warnings(record=True): classifier = Classifier() if name in ['BernoulliNB', 'MultinomialNB']: X -= X.min() set_fast_parameters(classifier) # raises error on malformed input for fit assert_raises(ValueError, classifier.fit, X, y[:-1]) # fit classifier.fit(X, y) # with lists classifier.fit(X.tolist(), y.tolist()) assert_true(hasattr(classifier, "classes_")) y_pred = classifier.predict(X) assert_equal(y_pred.shape, (n_samples, )) # training set performance if name not in ['BernoulliNB', 'MultinomialNB']: assert_greater(accuracy_score(y, y_pred), 0.85) # raises error on malformed input for predict assert_raises(ValueError, classifier.predict, X.T) if hasattr(classifier, "decision_function"): try: # decision_function agrees with predict: decision = classifier.decision_function(X) if n_classes is 2: assert_equal(decision.shape, (n_samples, )) dec_pred = (decision.ravel() > 0).astype(np.int) assert_array_equal(dec_pred, y_pred) if (n_classes is 3 and not isinstance(classifier, BaseLibSVM)): # 1on1 of LibSVM works differently assert_equal(decision.shape, (n_samples, n_classes)) assert_array_equal(np.argmax(decision, axis=1), y_pred) # raises error on malformed input assert_raises(ValueError, classifier.decision_function, X.T) # raises error on malformed input for decision_function assert_raises(ValueError, classifier.decision_function, X.T) except NotImplementedError: pass if hasattr(classifier, "predict_proba"): # predict_proba agrees with predict: y_prob = classifier.predict_proba(X) assert_equal(y_prob.shape, (n_samples, n_classes)) assert_array_equal(np.argmax(y_prob, axis=1), y_pred) # check that probas for all classes sum to one assert_array_almost_equal(np.sum(y_prob, axis=1), np.ones(n_samples)) # raises error on malformed input assert_raises(ValueError, classifier.predict_proba, X.T) # raises error on malformed input for predict_proba assert_raises(ValueError, classifier.predict_proba, X.T)
def test_mnnb(): """Test Multinomial Naive Bayes classification. This checks that MultinomialNB implements fit and predict and returns correct values for a simple toy dataset. """ for X in [X2, scipy.sparse.csr_matrix(X2)]: # Check the ability to predict the learning set. clf = MultinomialNB() assert_raises(ValueError, clf.fit, -X, y2) y_pred = clf.fit(X, y2).predict(X) assert_array_equal(y_pred, y2) # Verify that np.log(clf.predict_proba(X)) gives the same results as # clf.predict_log_proba(X) y_pred_proba = clf.predict_proba(X) y_pred_log_proba = clf.predict_log_proba(X) assert_array_almost_equal(np.log(y_pred_proba), y_pred_log_proba, 8) # Check that incremental fitting yields the same results clf2 = MultinomialNB() clf2.partial_fit(X[:2], y2[:2], classes=np.unique(y2)) clf2.partial_fit(X[2:5], y2[2:5]) clf2.partial_fit(X[5:], y2[5:]) y_pred2 = clf2.predict(X) assert_array_equal(y_pred2, y2) y_pred_proba2 = clf2.predict_proba(X) y_pred_log_proba2 = clf2.predict_log_proba(X) assert_array_almost_equal(np.log(y_pred_proba2), y_pred_log_proba2, 8) assert_array_almost_equal(y_pred_proba2, y_pred_proba) assert_array_almost_equal(y_pred_log_proba2, y_pred_log_proba) # Partial fit on the whole data at once should be the same as fit too clf3 = MultinomialNB() clf3.partial_fit(X, y2, classes=np.unique(y2)) y_pred3 = clf3.predict(X) assert_array_equal(y_pred3, y2) y_pred_proba3 = clf3.predict_proba(X) y_pred_log_proba3 = clf3.predict_log_proba(X) assert_array_almost_equal(np.log(y_pred_proba3), y_pred_log_proba3, 8) assert_array_almost_equal(y_pred_proba3, y_pred_proba) assert_array_almost_equal(y_pred_log_proba3, y_pred_log_proba)
def test_paired_euclidean_distances(): """ Check the paired Euclidean distances computation""" X = [[0], [0]] Y = [[1], [2]] D = paired_euclidean_distances(X, Y) assert_array_almost_equal(D, [1., 2.])
def test_pairwise_kernels(): """ Test the pairwise_kernels helper function. """ def callable_rbf_kernel(x, y, **kwds): """ Callable version of pairwise.rbf_kernel. """ K = rbf_kernel(np.atleast_2d(x), np.atleast_2d(y), **kwds) return K rng = np.random.RandomState(0) X = rng.random_sample((5, 4)) Y = rng.random_sample((2, 4)) # Test with all metrics that should be in PAIRWISE_KERNEL_FUNCTIONS. test_metrics = ["rbf", "sigmoid", "polynomial", "linear", "chi2", "additive_chi2"] for metric in test_metrics: function = PAIRWISE_KERNEL_FUNCTIONS[metric] # Test with Y=None K1 = pairwise_kernels(X, metric=metric) K2 = function(X) assert_array_almost_equal(K1, K2) # Test with Y=Y K1 = pairwise_kernels(X, Y=Y, metric=metric) K2 = function(X, Y=Y) assert_array_almost_equal(K1, K2) # Test with tuples as X and Y X_tuples = tuple([tuple([v for v in row]) for row in X]) Y_tuples = tuple([tuple([v for v in row]) for row in Y]) K2 = pairwise_kernels(X_tuples, Y_tuples, metric=metric) assert_array_almost_equal(K1, K2) # Test with sparse X and Y X_sparse = csr_matrix(X) Y_sparse = csr_matrix(Y) if metric in ["chi2", "additive_chi2"]: # these don't support sparse matrices yet assert_raises(ValueError, pairwise_kernels, X_sparse, Y=Y_sparse, metric=metric) continue K1 = pairwise_kernels(X_sparse, Y=Y_sparse, metric=metric) assert_array_almost_equal(K1, K2) # Test with a callable function, with given keywords. metric = callable_rbf_kernel kwds = {} kwds['gamma'] = 0.1 K1 = pairwise_kernels(X, Y=Y, metric=metric, **kwds) K2 = rbf_kernel(X, Y=Y, **kwds) assert_array_almost_equal(K1, K2) # callable function, X=Y K1 = pairwise_kernels(X, Y=X, metric=metric, **kwds) K2 = rbf_kernel(X, Y=X, **kwds) assert_array_almost_equal(K1, K2)
def test_pairwise_distances(): """ Test the pairwise_distance helper function. """ rng = np.random.RandomState(0) # Euclidean distance should be equivalent to calling the function. X = rng.random_sample((5, 4)) S = pairwise_distances(X, metric="euclidean") S2 = euclidean_distances(X) assert_array_almost_equal(S, S2) # Euclidean distance, with Y != X. Y = rng.random_sample((2, 4)) S = pairwise_distances(X, Y, metric="euclidean") S2 = euclidean_distances(X, Y) assert_array_almost_equal(S, S2) # Test with tuples as X and Y X_tuples = tuple([tuple([v for v in row]) for row in X]) Y_tuples = tuple([tuple([v for v in row]) for row in Y]) S2 = pairwise_distances(X_tuples, Y_tuples, metric="euclidean") assert_array_almost_equal(S, S2) # "cityblock" uses sklearn metric, cityblock (function) is scipy.spatial. S = pairwise_distances(X, metric="cityblock") S2 = pairwise_distances(X, metric=cityblock) assert_equal(S.shape[0], S.shape[1]) assert_equal(S.shape[0], X.shape[0]) assert_array_almost_equal(S, S2) # The manhattan metric should be equivalent to cityblock. S = pairwise_distances(X, Y, metric="manhattan") S2 = pairwise_distances(X, Y, metric=cityblock) assert_equal(S.shape[0], X.shape[0]) assert_equal(S.shape[1], Y.shape[0]) assert_array_almost_equal(S, S2) # manhattan does not support sparse matrices atm. assert_raises(ValueError, pairwise_distances, csr_matrix(X), metric="manhattan") # Low-level function for manhattan can divide in blocks to avoid # using too much memory during the broadcasting S3 = manhattan_distances(X, Y, size_threshold=10) assert_array_almost_equal(S, S3) # Test cosine as a string metric versus cosine callable # "cosine" uses sklearn metric, cosine (function) is scipy.spatial S = pairwise_distances(X, Y, metric="cosine") S2 = pairwise_distances(X, Y, metric=cosine) assert_equal(S.shape[0], X.shape[0]) assert_equal(S.shape[1], Y.shape[0]) assert_array_almost_equal(S, S2) # Tests that precomputed metric returns pointer to, and not copy of, X. S = np.dot(X, X.T) S2 = pairwise_distances(S, metric="precomputed") assert_true(S is S2) # Test with sparse X and Y, # currently only supported for euclidean and cosine X_sparse = csr_matrix(X) Y_sparse = csr_matrix(Y) S = pairwise_distances(X_sparse, Y_sparse, metric="euclidean") S2 = euclidean_distances(X_sparse, Y_sparse) assert_array_almost_equal(S, S2) S = pairwise_distances(X_sparse, Y_sparse, metric="cosine") S2 = cosine_distances(X_sparse, Y_sparse) assert_array_almost_equal(S, S2) # Test with scipy.spatial.distance metric, with a kwd kwds = {"p": 2.0} S = pairwise_distances(X, Y, metric="minkowski", **kwds) S2 = pairwise_distances(X, Y, metric=minkowski, **kwds) assert_array_almost_equal(S, S2) # same with Y = None kwds = {"p": 2.0} S = pairwise_distances(X, metric="minkowski", **kwds) S2 = pairwise_distances(X, metric=minkowski, **kwds) assert_array_almost_equal(S, S2) # Test that scipy distance metrics throw an error if sparse matrix given assert_raises(TypeError, pairwise_distances, X_sparse, metric="minkowski") assert_raises(TypeError, pairwise_distances, X, Y_sparse, metric="minkowski") # Test that a value error is raised if the metric is unkown assert_raises(ValueError, pairwise_distances, X, Y, metric="blah")
def test_explained_variance(): # Check that PCA output has unit-variance rng = np.random.RandomState(0) n_samples = 100 n_features = 80 X = rng.randn(n_samples, n_features) pca = PCA(n_components=2, svd_solver='full').fit(X) apca = PCA(n_components=2, svd_solver='arpack', random_state=0).fit(X) assert_array_almost_equal(pca.explained_variance_, apca.explained_variance_, 1) assert_array_almost_equal(pca.explained_variance_ratio_, apca.explained_variance_ratio_, 3) rpca = PCA(n_components=2, svd_solver='randomized', random_state=42).fit(X) assert_array_almost_equal(pca.explained_variance_, rpca.explained_variance_, 1) assert_array_almost_equal(pca.explained_variance_ratio_, rpca.explained_variance_ratio_, 1) # compare to empirical variances X_pca = pca.transform(X) assert_array_almost_equal(pca.explained_variance_, np.var(X_pca, axis=0)) X_pca = apca.transform(X) assert_array_almost_equal(apca.explained_variance_, np.var(X_pca, axis=0)) X_rpca = rpca.transform(X) assert_array_almost_equal(rpca.explained_variance_, np.var(X_rpca, axis=0), decimal=1) # Same with correlated data X = datasets.make_classification(n_samples, n_features, n_informative=n_features-2, random_state=rng)[0] pca = PCA(n_components=2).fit(X) rpca = PCA(n_components=2, svd_solver='randomized', random_state=rng).fit(X) assert_array_almost_equal(pca.explained_variance_ratio_, rpca.explained_variance_ratio_, 5)
def test_pairwise_distances_argmin_min(): """ Check pairwise minimum distances computation for any metric""" X = [[0], [1]] Y = [[-1], [2]] Xsp = dok_matrix(X) Ysp = csr_matrix(Y) # euclidean metric D, E = pairwise_distances_argmin_min(X, Y, metric="euclidean") D2 = pairwise_distances_argmin(X, Y, metric="euclidean") assert_array_almost_equal(D, [0, 1]) assert_array_almost_equal(D2, [0, 1]) assert_array_almost_equal(D, [0, 1]) assert_array_almost_equal(E, [1., 1.]) # sparse matrix case Dsp, Esp = pairwise_distances_argmin_min(Xsp, Ysp, metric="euclidean") assert_array_equal(Dsp, D) assert_array_equal(Esp, E) # We don't want np.matrix here assert_equal(type(Dsp), np.ndarray) assert_equal(type(Esp), np.ndarray) # Non-euclidean sklearn metric D, E = pairwise_distances_argmin_min(X, Y, metric="manhattan") D2 = pairwise_distances_argmin(X, Y, metric="manhattan") assert_array_almost_equal(D, [0, 1]) assert_array_almost_equal(D2, [0, 1]) assert_array_almost_equal(E, [1., 1.]) # sparse matrix case assert_raises(ValueError, pairwise_distances_argmin_min, Xsp, Ysp, metric="manhattan") # Non-euclidean Scipy distance (callable) D, E = pairwise_distances_argmin_min(X, Y, metric=minkowski, metric_kwargs={"p": 2}) assert_array_almost_equal(D, [0, 1]) assert_array_almost_equal(E, [1., 1.]) # Non-euclidean Scipy distance (string) D, E = pairwise_distances_argmin_min(X, Y, metric="minkowski", metric_kwargs={"p": 2}) assert_array_almost_equal(D, [0, 1]) assert_array_almost_equal(E, [1., 1.]) # Compare with naive implementation rng = np.random.RandomState(0) X = rng.randn(97, 149) Y = rng.randn(111, 149) dist = pairwise_distances(X, Y, metric="manhattan") dist_orig_ind = dist.argmin(axis=0) dist_orig_val = dist[dist_orig_ind, range(len(dist_orig_ind))] dist_chunked_ind, dist_chunked_val = pairwise_distances_argmin_min( X, Y, axis=0, metric="manhattan", batch_size=50) np.testing.assert_almost_equal(dist_orig_ind, dist_chunked_ind, decimal=7) np.testing.assert_almost_equal(dist_orig_val, dist_chunked_val, decimal=7)
def test_lasso_toy(): """ Test Lasso on a toy example for various values of alpha. When validating this against glmnet notice that glmnet divides it against nobs. """ X = [[-1], [0], [1]] Y = [-1, 0, 1] # just a straight line T = [[2], [3], [4]] # test sample clf = Lasso(alpha=1e-8) clf.fit(X, Y) pred = clf.predict(T) assert_array_almost_equal(clf.coef_, [1]) assert_array_almost_equal(pred, [2, 3, 4]) assert_almost_equal(clf.dual_gap_, 0) clf = Lasso(alpha=0.1) clf.fit(X, Y) pred = clf.predict(T) assert_array_almost_equal(clf.coef_, [.85]) assert_array_almost_equal(pred, [1.7, 2.55, 3.4]) assert_almost_equal(clf.dual_gap_, 0) clf = Lasso(alpha=0.5) clf.fit(X, Y) pred = clf.predict(T) assert_array_almost_equal(clf.coef_, [.25]) assert_array_almost_equal(pred, [0.5, 0.75, 1.]) assert_almost_equal(clf.dual_gap_, 0) clf = Lasso(alpha=1) clf.fit(X, Y) pred = clf.predict(T) assert_array_almost_equal(clf.coef_, [.0]) assert_array_almost_equal(pred, [0, 0, 0]) assert_almost_equal(clf.dual_gap_, 0)
def test_singular_values(): # Check that the PCA output has the correct singular values rng = np.random.RandomState(0) n_samples = 100 n_features = 80 X = rng.randn(n_samples, n_features) pca = PCA(n_components=2, svd_solver='full', random_state=rng).fit(X) apca = PCA(n_components=2, svd_solver='arpack', random_state=rng).fit(X) rpca = PCA(n_components=2, svd_solver='randomized', random_state=rng).fit(X) assert_array_almost_equal(pca.singular_values_, apca.singular_values_, 12) assert_array_almost_equal(pca.singular_values_, rpca.singular_values_, 1) assert_array_almost_equal(apca.singular_values_, rpca.singular_values_, 1) # Compare to the Frobenius norm X_pca = pca.transform(X) X_apca = apca.transform(X) X_rpca = rpca.transform(X) assert_array_almost_equal(np.sum(pca.singular_values_**2.0), np.linalg.norm(X_pca, "fro")**2.0, 12) assert_array_almost_equal(np.sum(apca.singular_values_**2.0), np.linalg.norm(X_apca, "fro")**2.0, 12) assert_array_almost_equal(np.sum(rpca.singular_values_**2.0), np.linalg.norm(X_rpca, "fro")**2.0, 0) # Compare to the 2-norms of the score vectors assert_array_almost_equal(pca.singular_values_, np.sqrt(np.sum(X_pca**2.0, axis=0)), 12) assert_array_almost_equal(apca.singular_values_, np.sqrt(np.sum(X_apca**2.0, axis=0)), 12) assert_array_almost_equal(rpca.singular_values_, np.sqrt(np.sum(X_rpca**2.0, axis=0)), 2) # Set the singular values and see what we get back rng = np.random.RandomState(0) n_samples = 100 n_features = 110 X = rng.randn(n_samples, n_features) pca = PCA(n_components=3, svd_solver='full', random_state=rng) apca = PCA(n_components=3, svd_solver='arpack', random_state=rng) rpca = PCA(n_components=3, svd_solver='randomized', random_state=rng) X_pca = pca.fit_transform(X) X_pca /= np.sqrt(np.sum(X_pca**2.0, axis=0)) X_pca[:, 0] *= 3.142 X_pca[:, 1] *= 2.718 X_hat = np.dot(X_pca, pca.components_) pca.fit(X_hat) apca.fit(X_hat) rpca.fit(X_hat) assert_array_almost_equal(pca.singular_values_, [3.142, 2.718, 1.0], 14) assert_array_almost_equal(apca.singular_values_, [3.142, 2.718, 1.0], 14) assert_array_almost_equal(rpca.singular_values_, [3.142, 2.718, 1.0], 14)
def check_sparse_input(name, X, X_sparse, y): ForestEstimator = FOREST_ESTIMATORS[name] dense = ForestEstimator(random_state=0, max_depth=2).fit(X, y) sparse = ForestEstimator(random_state=0, max_depth=2).fit(X_sparse, y) assert_array_almost_equal(sparse.apply(X), dense.apply(X)) if name in FOREST_CLASSIFIERS or name in FOREST_REGRESSORS: assert_array_almost_equal(sparse.predict(X), dense.predict(X)) assert_array_almost_equal(sparse.feature_importances_, dense.feature_importances_) if name in FOREST_CLASSIFIERS: assert_array_almost_equal(sparse.predict_proba(X), dense.predict_proba(X)) assert_array_almost_equal(sparse.predict_log_proba(X), dense.predict_log_proba(X)) if name in FOREST_TRANSFORMERS: assert_array_almost_equal( sparse.transform(X).toarray(), dense.transform(X).toarray()) assert_array_almost_equal( sparse.fit_transform(X).toarray(), dense.fit_transform(X).toarray())
def test_enet_toy(): """ Test ElasticNet for various parameters of alpha and l1_ratio. Actually, the parameters alpha = 0 should not be allowed. However, we test it as a border case. ElasticNet is tested with and without precomputed Gram matrix """ X = np.array([[-1.], [0.], [1.]]) Y = [-1, 0, 1] # just a straight line T = [[2.], [3.], [4.]] # test sample # this should be the same as lasso clf = ElasticNet(alpha=1e-8, l1_ratio=1.0) clf.fit(X, Y) pred = clf.predict(T) assert_array_almost_equal(clf.coef_, [1]) assert_array_almost_equal(pred, [2, 3, 4]) assert_almost_equal(clf.dual_gap_, 0) clf = ElasticNet(alpha=0.5, l1_ratio=0.3, max_iter=100, precompute=False) clf.fit(X, Y) pred = clf.predict(T) assert_array_almost_equal(clf.coef_, [0.50819], decimal=3) assert_array_almost_equal(pred, [1.0163, 1.5245, 2.0327], decimal=3) assert_almost_equal(clf.dual_gap_, 0) clf.set_params(max_iter=100, precompute=True) clf.fit(X, Y) # with Gram pred = clf.predict(T) assert_array_almost_equal(clf.coef_, [0.50819], decimal=3) assert_array_almost_equal(pred, [1.0163, 1.5245, 2.0327], decimal=3) assert_almost_equal(clf.dual_gap_, 0) clf.set_params(max_iter=100, precompute=np.dot(X.T, X)) clf.fit(X, Y) # with Gram pred = clf.predict(T) assert_array_almost_equal(clf.coef_, [0.50819], decimal=3) assert_array_almost_equal(pred, [1.0163, 1.5245, 2.0327], decimal=3) assert_almost_equal(clf.dual_gap_, 0) clf = ElasticNet(alpha=0.5, l1_ratio=0.5) clf.fit(X, Y) pred = clf.predict(T) assert_array_almost_equal(clf.coef_, [0.45454], 3) assert_array_almost_equal(pred, [0.9090, 1.3636, 1.8181], 3) assert_almost_equal(clf.dual_gap_, 0)
def test_sgd_proba(self): # Check SGD.predict_proba # Hinge loss does not allow for conditional prob estimate. # We cannot use the factory here, because it defines predict_proba # anyway. clf = SGDClassifier(loss="hinge", alpha=0.01, n_iter=10).fit(X, Y) assert_false(hasattr(clf, "predict_proba")) assert_false(hasattr(clf, "predict_log_proba")) # log and modified_huber losses can output probability estimates # binary case for loss in ["log", "modified_huber"]: clf = self.factory(loss="modified_huber", alpha=0.01, n_iter=10) clf.fit(X, Y) p = clf.predict_proba([[3, 2]]) assert_true(p[0, 1] > 0.5) p = clf.predict_proba([[-1, -1]]) assert_true(p[0, 1] < 0.5) p = clf.predict_log_proba([[3, 2]]) assert_true(p[0, 1] > p[0, 0]) p = clf.predict_log_proba([[-1, -1]]) assert_true(p[0, 1] < p[0, 0]) # log loss multiclass probability estimates clf = self.factory(loss="log", alpha=0.01, n_iter=10).fit(X2, Y2) d = clf.decision_function([[.1, -.1], [.3, .2]]) p = clf.predict_proba([[.1, -.1], [.3, .2]]) assert_array_equal(np.argmax(p, axis=1), np.argmax(d, axis=1)) assert_almost_equal(p[0].sum(), 1) assert_true(np.all(p[0] >= 0)) p = clf.predict_proba([[-1, -1]]) d = clf.decision_function([[-1, -1]]) assert_array_equal(np.argsort(p[0]), np.argsort(d[0])) l = clf.predict_log_proba([[3, 2]]) p = clf.predict_proba([[3, 2]]) assert_array_almost_equal(np.log(p), l) l = clf.predict_log_proba([[-1, -1]]) p = clf.predict_proba([[-1, -1]]) assert_array_almost_equal(np.log(p), l) # Modified Huber multiclass probability estimates; requires a separate # test because the hard zero/one probabilities may destroy the # ordering present in decision_function output. clf = self.factory(loss="modified_huber", alpha=0.01, n_iter=10) clf.fit(X2, Y2) d = clf.decision_function([[3, 2]]) p = clf.predict_proba([[3, 2]]) if not isinstance(self, SparseSGDClassifierTestCase): assert_equal(np.argmax(d, axis=1), np.argmax(p, axis=1)) else: # XXX the sparse test gets a different X2 (?) assert_equal(np.argmin(d, axis=1), np.argmin(p, axis=1)) # the following sample produces decision_function values < -1, # which would cause naive normalization to fail (see comment # in SGDClassifier.predict_proba) x = X.mean(axis=0) d = clf.decision_function([x]) if np.all(d < -1): # XXX not true in sparse test case (why?) p = clf.predict_proba([x]) assert_array_almost_equal(p[0], [1 / 3.] * 3)
def check_memory_layout(name, dtype): # Check that it works no matter the memory layout est = FOREST_ESTIMATORS[name](random_state=0, bootstrap=False) # Nothing X = np.asarray(iris.data, dtype=dtype) y = iris.target assert_array_almost_equal(est.fit(X, y).predict(X), y) # C-order X = np.asarray(iris.data, order="C", dtype=dtype) y = iris.target assert_array_almost_equal(est.fit(X, y).predict(X), y) # F-order X = np.asarray(iris.data, order="F", dtype=dtype) y = iris.target assert_array_almost_equal(est.fit(X, y).predict(X), y) # Contiguous X = np.ascontiguousarray(iris.data, dtype=dtype) y = iris.target assert_array_almost_equal(est.fit(X, y).predict(X), y) if est.base_estimator.splitter in SPARSE_SPLITTERS: # csr matrix X = csr_matrix(iris.data, dtype=dtype) y = iris.target assert_array_almost_equal(est.fit(X, y).predict(X), y) # csc_matrix X = csc_matrix(iris.data, dtype=dtype) y = iris.target assert_array_almost_equal(est.fit(X, y).predict(X), y) # coo_matrix X = coo_matrix(iris.data, dtype=dtype) y = iris.target assert_array_almost_equal(est.fit(X, y).predict(X), y) # Strided X = np.asarray(iris.data[::3], dtype=dtype) y = iris.target[::3] assert_array_almost_equal(est.fit(X, y).predict(X), y)
def test_discrete_prior(): """Test whether class priors are properly set. """ for cls in [BernoulliNB, MultinomialNB]: clf = cls().fit(X2, y2) assert_array_almost_equal(np.log(np.array([2, 2, 2]) / 6.0), clf.class_log_prior_, 8)
def test_dump(): X_sparse, y_dense = load_svmlight_file(datafile) X_dense = X_sparse.toarray() y_sparse = sp.csr_matrix(y_dense) # slicing a csr_matrix can unsort its .indices, so test that we sort # those correctly X_sliced = X_sparse[np.arange(X_sparse.shape[0])] y_sliced = y_sparse[np.arange(y_sparse.shape[0])] for X in (X_sparse, X_dense, X_sliced): for y in (y_sparse, y_dense, y_sliced): for zero_based in (True, False): for dtype in [np.float32, np.float64, np.int32]: f = BytesIO() # we need to pass a comment to get the version info in; # LibSVM doesn't grok comments so they're not put in by # default anymore. if sp.issparse(y) and y.shape[0] == 1: # make sure y's shape is: (n_samples, n_labels) # when it is sparse y = y.T dump_svmlight_file(X.astype(dtype), y, f, comment="test", zero_based=zero_based) f.seek(0) comment = f.readline() try: comment = str(comment, "utf-8") except TypeError: # fails in Python 2.x pass assert_in("scikit-learn %s" % sklearn.__version__, comment) comment = f.readline() try: comment = str(comment, "utf-8") except TypeError: # fails in Python 2.x pass assert_in(["one", "zero"][zero_based] + "-based", comment) X2, y2 = load_svmlight_file(f, dtype=dtype, zero_based=zero_based) assert_equal(X2.dtype, dtype) assert_array_equal(X2.sorted_indices().indices, X2.indices) X2_dense = X2.toarray() if dtype == np.float32: # allow a rounding error at the last decimal place assert_array_almost_equal(X_dense.astype(dtype), X2_dense, 4) assert_array_almost_equal(y_dense.astype(dtype), y2, 4) else: # allow a rounding error at the last decimal place assert_array_almost_equal(X_dense.astype(dtype), X2_dense, 15) assert_array_almost_equal(y_dense.astype(dtype), y2, 15)
def test_lasso_lars_vs_R_implementation(): # Test that sklearn LassoLars implementation agrees with the LassoLars # implementation available in R (lars library) under the following # scenarios: # 1) fit_intercept=False and normalize=False # 2) fit_intercept=True and normalize=True # Let's generate the data used in the bug report 7778 y = np.array( [-6.45006793, -3.51251449, -8.52445396, 6.12277822, -19.42109366]) x = np.array( [[0.47299829, 0, 0, 0, 0], [0.08239882, 0.85784863, 0, 0, 0], [0.30114139, -0.07501577, 0.80895216, 0, 0], [-0.01460346, -0.1015233, 0.0407278, 0.80338378, 0], [-0.69363927, 0.06754067, 0.18064514, -0.0803561, 0.40427291]]) X = x.T ########################################################################### # Scenario 1: Let's compare R vs sklearn when fit_intercept=False and # normalize=False ########################################################################### # # The R result was obtained using the following code: # # library(lars) # model_lasso_lars = lars(X, t(y), type="lasso", intercept=FALSE, # trace=TRUE, normalize=FALSE) # r = t(model_lasso_lars$beta) # r = np.array([[ 0, 0, 0, 0, 0, -79.810362809499026, -83.528788732782829, -83.777653739190711, -83.784156932888934, -84.033390591756657 ], [0, 0, 0, 0, -0.476624256777266, 0, 0, 0, 0, 0.025219751009936], [ 0, -3.577397088285891, -4.702795355871871, -7.016748621359461, -7.614898471899412, -0.336938391359179, 0, 0, 0.001213370600853, 0.048162321585148 ], [ 0, 0, 0, 2.231558436628169, 2.723267514525966, 2.811549786389614, 2.813766976061531, 2.817462468949557, 2.817368178703816, 2.816221090636795 ], [ 0, 0, -1.218422599914637, -3.457726183014808, -4.021304522060710, -45.827461592423745, -47.776608869312305, -47.911561610746404, -47.914845922736234, -48.039562334265717 ]]) model_lasso_lars = linear_model.LassoLars(alpha=0, fit_intercept=False, normalize=False) model_lasso_lars.fit(X, y) skl_betas = model_lasso_lars.coef_path_ assert_array_almost_equal(r, skl_betas, decimal=12) ########################################################################### ########################################################################### # Scenario 2: Let's compare R vs sklearn when fit_intercept=True and # normalize=True # # Note: When normalize is equal to True, R returns the coefficients in # their original units, that is, they are rescaled back, whereas sklearn # does not do that, therefore, we need to do this step before comparing # their results. ########################################################################### # # The R result was obtained using the following code: # # library(lars) # model_lasso_lars2 = lars(X, t(y), type="lasso", intercept=TRUE, # trace=TRUE, normalize=TRUE) # r2 = t(model_lasso_lars2$beta) r2 = np.array( [[0, 0, 0, 0, 0], [0, 0, 0, 8.371887668009453, 19.463768371044026], [0, 0, 0, 0, 9.901611055290553], [ 0, 7.495923132833733, 9.245133544334507, 17.389369207545062, 26.971656815643499 ], [0, 0, -1.569380717440311, -5.924804108067312, -7.996385265061972]]) model_lasso_lars2 = linear_model.LassoLars(alpha=0, fit_intercept=True, normalize=True) model_lasso_lars2.fit(X, y) skl_betas2 = model_lasso_lars2.coef_path_ # Let's rescale back the coefficients returned by sklearn before comparing # against the R result (read the note above) temp = X - np.mean(X, axis=0) normx = np.sqrt(np.sum(temp**2, axis=0)) skl_betas2 /= normx[:, np.newaxis] assert_array_almost_equal(r2, skl_betas2, decimal=12)
def test_fastica_simple(add_noise=False): """ Test the FastICA algorithm on very simple data. """ rng = np.random.RandomState(0) # scipy.stats uses the global RNG: np.random.seed(0) n_samples = 1000 # Generate two sources: s1 = (2 * np.sin(np.linspace(0, 100, n_samples)) > 0) - 1 s2 = stats.t.rvs(1, size=n_samples) s = np.c_[s1, s2].T center_and_norm(s) s1, s2 = s # Mixing angle phi = 0.6 mixing = np.array([[np.cos(phi), np.sin(phi)], [np.sin(phi), -np.cos(phi)]]) m = np.dot(mixing, s) if add_noise: m += 0.1 * rng.randn(2, 1000) center_and_norm(m) # function as fun arg def g_test(x): return x ** 3, (3 * x ** 2).mean(axis=-1) algos = ['parallel', 'deflation'] nls = ['logcosh', 'exp', 'cube', g_test] whitening = [True, False] for algo, nl, whiten in itertools.product(algos, nls, whitening): if whiten: k_, mixing_, s_ = fastica(m.T, fun=nl, algorithm=algo) assert_raises(ValueError, fastica, m.T, fun=np.tanh, algorithm=algo) else: X = PCA(n_components=2, whiten=True).fit_transform(m.T) k_, mixing_, s_ = fastica(X, fun=nl, algorithm=algo, whiten=False) assert_raises(ValueError, fastica, X, fun=np.tanh, algorithm=algo) s_ = s_.T # Check that the mixing model described in the docstring holds: if whiten: assert_almost_equal(s_, np.dot(np.dot(mixing_, k_), m)) center_and_norm(s_) s1_, s2_ = s_ # Check to see if the sources have been estimated # in the wrong order if abs(np.dot(s1_, s2)) > abs(np.dot(s1_, s1)): s2_, s1_ = s_ s1_ *= np.sign(np.dot(s1_, s1)) s2_ *= np.sign(np.dot(s2_, s2)) # Check that we have estimated the original sources if not add_noise: assert_almost_equal(np.dot(s1_, s1) / n_samples, 1, decimal=2) assert_almost_equal(np.dot(s2_, s2) / n_samples, 1, decimal=2) else: assert_almost_equal(np.dot(s1_, s1) / n_samples, 1, decimal=1) assert_almost_equal(np.dot(s2_, s2) / n_samples, 1, decimal=1) # Test FastICA class _, _, sources_fun = fastica(m.T, fun=nl, algorithm=algo, random_state=0) ica = FastICA(fun=nl, algorithm=algo, random_state=0) sources = ica.fit_transform(m.T) assert_equal(ica.components_.shape, (2, 2)) assert_equal(sources.shape, (1000, 2)) assert_array_almost_equal(sources_fun, sources) assert_array_almost_equal(sources, ica.transform(m.T)) assert_equal(ica.mixing_.shape, (2, 2)) for fn in [np.tanh, "exp(-.5(x^2))"]: ica = FastICA(fun=fn, algorithm=algo, random_state=0) assert_raises(ValueError, ica.fit, m.T) assert_raises(TypeError, FastICA(fun=moves.xrange(10)).fit, m.T)