def test_partial_fit_classification():
    # Test partial_fit on classification.
    # `partial_fit` should yield the same results as `fit` for binary and
    # multi-class classification.
    for X, y in classification_datasets:
        mlp = MLPClassifier(solver='sgd', max_iter=100, random_state=1,
                            tol=0, alpha=1e-5, learning_rate_init=0.2)

        with ignore_warnings(category=ConvergenceWarning):
            mlp.fit(X, y)
        pred1 = mlp.predict(X)

        mlp = MLPClassifier(solver='sgd', random_state=1, alpha=1e-5,
                            learning_rate_init=0.2)
        for i in range(100):
            mlp.partial_fit(X, y, classes=np.unique(y))
        pred2 = mlp.predict(X)

        assert_array_equal(pred1, pred2)
        assert mlp.score(X, y) > 0.95

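# A minimal usage sketch (not part of the original tests): `partial_fit` also
# supports incremental learning over mini-batches, e.g. for data that does not
# fit in memory. `classes` must list every possible label on the first call.
# The helper name and batch count below are illustrative assumptions.
def _sketch_partial_fit_minibatches(X, y, n_batches=10):
    streaming_clf = MLPClassifier(solver='sgd', random_state=1)
    all_classes = np.unique(y)
    for X_batch, y_batch in zip(np.array_split(X, n_batches),
                                np.array_split(y, n_batches)):
        streaming_clf.partial_fit(X_batch, y_batch, classes=all_classes)
    return streaming_clf
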
def plot_on_dataset(X, y, ax, name):
    # for each dataset, plot learning for each learning strategy
    print("\nlearning on dataset %s" % name)
    ax.set_title(name)

    X = MinMaxScaler().fit_transform(X)
    mlps = []
    if name == "digits":
        # digits is larger but converges fairly quickly
        max_iter = 15
    else:
        max_iter = 400

    for label, param in zip(labels, params):
        print("training: %s" % label)
        mlp = MLPClassifier(random_state=0, max_iter=max_iter, **param)

        # some parameter combinations will not converge as can be seen on the
        # plots so they are ignored here
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", category=ConvergenceWarning,
                                    module="mrex")
            mlp.fit(X, y)

        mlps.append(mlp)
        print("Training set score: %f" % mlp.score(X, y))
        print("Training set loss: %f" % mlp.loss_)

    for mlp, label, args in zip(mlps, labels, plot_args):
        ax.plot(mlp.loss_curve_, label=label, **args)

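# `plot_on_dataset` expects module-level `labels`, `params`, and `plot_args`
# describing one training strategy per entry. A minimal sketch of how they
# could be defined follows; the specific strategies and plot styles are
# illustrative assumptions, not necessarily the ones used in this example.
_sketch_params = [
    {"solver": "sgd", "learning_rate": "constant", "learning_rate_init": 0.2,
     "momentum": 0},
    {"solver": "sgd", "learning_rate": "adaptive", "learning_rate_init": 0.2,
     "momentum": 0.9, "nesterovs_momentum": True},
    {"solver": "adam", "learning_rate_init": 0.01},
]
_sketch_labels = ["constant learning-rate", "adaptive with momentum", "adam"]
_sketch_plot_args = [{"c": "red", "linestyle": "-"},
                     {"c": "green", "linestyle": "-"},
                     {"c": "blue", "linestyle": "-"}]
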
def test_multilabel_classification():
    # Test that multi-label classification works as expected.
    # test fit method
    X, y = make_multilabel_classification(n_samples=50, random_state=0,
                                          return_indicator=True)
    mlp = MLPClassifier(solver='lbfgs', hidden_layer_sizes=50, alpha=1e-5,
                        max_iter=150, random_state=0, activation='logistic',
                        learning_rate_init=0.2)
    mlp.fit(X, y)
    assert mlp.score(X, y) > 0.97

    # test partial fit method
    mlp = MLPClassifier(solver='sgd', hidden_layer_sizes=50, max_iter=150,
                        random_state=0, activation='logistic', alpha=1e-5,
                        learning_rate_init=0.2)
    for i in range(100):
        mlp.partial_fit(X, y, classes=[0, 1, 2, 3, 4])
    assert mlp.score(X, y) > 0.9

    # Make sure early stopping still works now that splitting is stratified
    # by default (it is disabled for multilabel classification)
    mlp = MLPClassifier(early_stopping=True)
    mlp.fit(X, y).predict(X)

def test_tolerance():
    # Test tolerance.
    # It should force the solver to exit the loop when it converges.
    X = [[3, 2], [1, 6]]
    y = [1, 0]
    clf = MLPClassifier(tol=0.5, max_iter=3000, solver='sgd')
    clf.fit(X, y)
    assert clf.max_iter > clf.n_iter_

def test_adaptive_learning_rate():
    X = [[3, 2], [1, 6]]
    y = [1, 0]
    clf = MLPClassifier(tol=0.5, max_iter=3000, solver='sgd',
                        learning_rate='adaptive')
    clf.fit(X, y)

    assert clf.max_iter > clf.n_iter_
    assert 1e-6 > clf._optimizer.learning_rate

def test_early_stopping_stratified():
    # Make sure data splitting for early stopping is stratified
    X = [[1, 2], [2, 3], [3, 4], [4, 5]]
    y = [0, 0, 0, 1]

    mlp = MLPClassifier(early_stopping=True)
    with pytest.raises(
            ValueError,
            match='The least populated class in y has only 1 member'):
        mlp.fit(X, y)

def test_sparse_matrices():
    # Test that sparse and dense input matrices output the same results.
    X = X_digits_binary[:50]
    y = y_digits_binary[:50]
    X_sparse = csr_matrix(X)
    mlp = MLPClassifier(solver='lbfgs', hidden_layer_sizes=15,
                        random_state=1)
    mlp.fit(X, y)
    pred1 = mlp.predict(X)
    mlp.fit(X_sparse, y)
    pred2 = mlp.predict(X_sparse)
    assert_almost_equal(pred1, pred2)
    pred1 = mlp.predict(X)
    pred2 = mlp.predict(X_sparse)
    assert_array_equal(pred1, pred2)

def test_early_stopping():
    X = X_digits_binary[:100]
    y = y_digits_binary[:100]
    tol = 0.2
    clf = MLPClassifier(tol=tol, max_iter=3000, solver='sgd',
                        early_stopping=True)
    clf.fit(X, y)
    assert clf.max_iter > clf.n_iter_

    valid_scores = clf.validation_scores_
    best_valid_score = clf.best_validation_score_
    assert max(valid_scores) == best_valid_score
    assert best_valid_score + tol > valid_scores[-2]
    assert best_valid_score + tol > valid_scores[-1]

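# Sketch (illustrative, not part of the original tests): with
# early_stopping=True the classifier holds out part of the training data as a
# validation set (`validation_fraction`) and stops once the validation score
# has not improved by at least `tol` for `n_iter_no_change` consecutive
# epochs. The settings and helper name below are illustrative.
def _sketch_early_stopping(X, y):
    clf = MLPClassifier(solver='sgd', early_stopping=True,
                        validation_fraction=0.1, n_iter_no_change=10,
                        tol=1e-4, max_iter=3000)
    clf.fit(X, y)
    # validation_scores_ records the held-out score at each epoch
    return clf.n_iter_, clf.best_validation_score_, clf.validation_scores_
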
@pytest.mark.parametrize('X,y', classification_datasets)
def test_lbfgs_classification_maxfun(X, y):
    # Test lbfgs parameter max_fun.
    # It should independently limit the number of iterations for lbfgs.
    max_fun = 10
    # classification tests
    for activation in ACTIVATION_TYPES:
        mlp = MLPClassifier(solver='lbfgs', hidden_layer_sizes=50,
                            max_iter=150, max_fun=max_fun, shuffle=True,
                            random_state=1, activation=activation)
        with pytest.warns(ConvergenceWarning):
            mlp.fit(X, y)
        assert max_fun >= mlp.n_iter_

def test_verbose_sgd():
    # Test verbose.
    X = [[3, 2], [1, 6]]
    y = [1, 0]
    clf = MLPClassifier(solver='sgd', max_iter=2, verbose=10,
                        hidden_layer_sizes=2)
    old_stdout = sys.stdout
    sys.stdout = output = StringIO()

    with ignore_warnings(category=ConvergenceWarning):
        clf.fit(X, y)
        clf.partial_fit(X, y)

    sys.stdout = old_stdout
    assert 'Iteration' in output.getvalue()

def test_predict_proba_multiclass():
    # Test that predict_proba works as expected for multi class.
    X = X_digits_multi[:10]
    y = y_digits_multi[:10]

    clf = MLPClassifier(hidden_layer_sizes=5)
    with ignore_warnings(category=ConvergenceWarning):
        clf.fit(X, y)
    y_proba = clf.predict_proba(X)
    y_log_proba = clf.predict_log_proba(X)

    (n_samples, n_classes) = y.shape[0], np.unique(y).size

    proba_max = y_proba.argmax(axis=1)
    proba_log_max = y_log_proba.argmax(axis=1)

    assert y_proba.shape == (n_samples, n_classes)
    assert_array_equal(proba_max, proba_log_max)
    assert_array_equal(y_log_proba, np.log(y_proba))

def test_n_iter_no_change():
    # test n_iter_no_change using binary data set
    # the classifying fitting process is not prone to loss curve fluctuations
    X = X_digits_binary[:100]
    y = y_digits_binary[:100]
    tol = 0.01
    max_iter = 3000

    # test multiple n_iter_no_change
    for n_iter_no_change in [2, 5, 10, 50, 100]:
        clf = MLPClassifier(tol=tol, max_iter=max_iter, solver='sgd',
                            n_iter_no_change=n_iter_no_change)
        clf.fit(X, y)

        # validate n_iter_no_change
        assert clf._no_improvement_count == n_iter_no_change + 1
        assert max_iter > clf.n_iter_

def test_alpha():
    # Test that larger alpha yields weights closer to zero
    X = X_digits_binary[:100]
    y = y_digits_binary[:100]

    alpha_vectors = []
    alpha_values = np.arange(2)
    absolute_sum = lambda x: np.sum(np.abs(x))

    for alpha in alpha_values:
        mlp = MLPClassifier(hidden_layer_sizes=10, alpha=alpha,
                            random_state=1)
        with ignore_warnings(category=ConvergenceWarning):
            mlp.fit(X, y)
        alpha_vectors.append(np.array([absolute_sum(mlp.coefs_[0]),
                                       absolute_sum(mlp.coefs_[1])]))

    for i in range(len(alpha_values) - 1):
        assert (alpha_vectors[i] > alpha_vectors[i + 1]).all()

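# Sketch (illustrative): `alpha` is the strength of the L2 penalty. The
# regularised objective adds a term of roughly the form
# alpha/2 * sum(||W_l||^2), scaled by the number of samples (the exact scaling
# here is an assumption about the implementation), which is why larger alpha
# shrinks the fitted coefficients toward zero.
def _sketch_l2_penalty(mlp, alpha, n_samples):
    squared_norms = sum(np.sum(W ** 2) for W in mlp.coefs_)
    return 0.5 * alpha * squared_norms / n_samples
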
@pytest.mark.parametrize('X,y', classification_datasets)
def test_lbfgs_classification(X, y):
    # Test lbfgs on classification.
    # It should achieve a score higher than 0.95 for the binary and
    # multi-class versions of the digits dataset.
    X_train = X[:150]
    y_train = y[:150]
    X_test = X[150:]
    expected_shape_dtype = (X_test.shape[0], y_train.dtype.kind)

    for activation in ACTIVATION_TYPES:
        mlp = MLPClassifier(solver='lbfgs', hidden_layer_sizes=50,
                            max_iter=150, shuffle=True, random_state=1,
                            activation=activation)
        mlp.fit(X_train, y_train)
        y_predict = mlp.predict(X_test)
        assert mlp.score(X_train, y_train) > 0.95
        assert ((y_predict.shape[0], y_predict.dtype.kind) ==
                expected_shape_dtype)

def test_learning_rate_warmstart():
    # Tests that warm_start reuses past solutions.
    X = [[3, 2], [1, 6], [5, 6], [-2, -4]]
    y = [1, 1, 1, 0]
    for learning_rate in ["invscaling", "constant"]:
        mlp = MLPClassifier(solver='sgd', hidden_layer_sizes=4,
                            learning_rate=learning_rate, max_iter=1,
                            power_t=0.25, warm_start=True)
        with ignore_warnings(category=ConvergenceWarning):
            mlp.fit(X, y)
            prev_eta = mlp._optimizer.learning_rate
            mlp.fit(X, y)
            post_eta = mlp._optimizer.learning_rate

        if learning_rate == 'constant':
            assert prev_eta == post_eta
        elif learning_rate == 'invscaling':
            assert (mlp.learning_rate_init / pow(8 + 1, mlp.power_t) ==
                    post_eta)

def test_predict_proba_multilabel():
    # Test that predict_proba works as expected for multilabel.
    # Multilabel should not use softmax which makes probabilities sum to 1
    X, Y = make_multilabel_classification(n_samples=50, random_state=0,
                                          return_indicator=True)
    n_samples, n_classes = Y.shape

    clf = MLPClassifier(solver='lbfgs', hidden_layer_sizes=30,
                        random_state=0)
    clf.fit(X, Y)
    y_proba = clf.predict_proba(X)

    assert y_proba.shape == (n_samples, n_classes)
    assert_array_equal(y_proba > 0.5, Y)

    y_log_proba = clf.predict_log_proba(X)
    proba_max = y_proba.argmax(axis=1)
    proba_log_max = y_log_proba.argmax(axis=1)

    assert (y_proba.sum(1) - 1).dot(y_proba.sum(1) - 1) > 1e-10
    assert_array_equal(proba_max, proba_log_max)
    assert_array_equal(y_log_proba, np.log(y_proba))

def test_predict_proba_binary():
    # Test that predict_proba works as expected for binary class.
    X = X_digits_binary[:50]
    y = y_digits_binary[:50]

    clf = MLPClassifier(hidden_layer_sizes=5, activation='logistic',
                        random_state=1)
    with ignore_warnings(category=ConvergenceWarning):
        clf.fit(X, y)
    y_proba = clf.predict_proba(X)
    y_log_proba = clf.predict_log_proba(X)

    (n_samples, n_classes) = y.shape[0], 2

    proba_max = y_proba.argmax(axis=1)
    proba_log_max = y_log_proba.argmax(axis=1)

    assert y_proba.shape == (n_samples, n_classes)
    assert_array_equal(proba_max, proba_log_max)
    assert_array_equal(y_log_proba, np.log(y_proba))

    assert roc_auc_score(y, y_proba[:, 1]) == 1.0

def test_n_iter_no_change_inf():
    # test n_iter_no_change using binary data set
    # the fitting process should go to max_iter iterations
    X = X_digits_binary[:100]
    y = y_digits_binary[:100]

    # set a ridiculous tolerance
    # this should always trigger _update_no_improvement_count()
    tol = 1e9

    # fit
    n_iter_no_change = np.inf
    max_iter = 3000
    clf = MLPClassifier(tol=tol, max_iter=max_iter, solver='sgd',
                        n_iter_no_change=n_iter_no_change)
    clf.fit(X, y)

    # validate n_iter_no_change doesn't cause early stopping
    assert clf.n_iter_ == max_iter

    # validate _update_no_improvement_count() was always triggered
    assert clf._no_improvement_count == clf.n_iter_ - 1

def test_warm_start():
    X = X_iris
    y = y_iris

    y_2classes = np.array([0] * 75 + [1] * 75)
    y_3classes = np.array([0] * 40 + [1] * 40 + [2] * 70)
    y_3classes_alt = np.array([0] * 50 + [1] * 50 + [3] * 50)
    y_4classes = np.array([0] * 37 + [1] * 37 + [2] * 38 + [3] * 38)
    y_5classes = np.array([0] * 30 + [1] * 30 + [2] * 30 + [3] * 30 +
                          [4] * 30)

    # No error raised
    clf = MLPClassifier(hidden_layer_sizes=2, solver='lbfgs',
                        warm_start=True).fit(X, y)
    clf.fit(X, y)
    clf.fit(X, y_3classes)

    for y_i in (y_2classes, y_3classes_alt, y_4classes, y_5classes):
        clf = MLPClassifier(hidden_layer_sizes=2, solver='lbfgs',
                            warm_start=True).fit(X, y)
        message = ('warm_start can only be used where `y` has the same '
                   'classes as in the previous call to fit.'
                   ' Previously got [0 1 2], `y` has %s' % np.unique(y_i))
        assert_raise_message(ValueError, message, clf.fit, X, y_i)

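# Sketch (illustrative, not part of the original tests): with warm_start=True
# each call to fit() continues from the previously learned coefficients
# instead of re-initialising them, which is useful for training in several
# rounds on the same set of classes. The helper name and settings are
# illustrative only.
def _sketch_warm_start_rounds(X, y, n_rounds=3):
    clf = MLPClassifier(hidden_layer_sizes=10, max_iter=20, warm_start=True)
    for _ in range(n_rounds):
        clf.fit(X, y)  # resumes from the current coefs_ / intercepts_
    return clf
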
X, y = fetch_openml('mnist_784', version=1, return_X_y=True)
X = X / 255.

# rescale the data, use the traditional train/test split
X_train, X_test = X[:60000], X[60000:]
y_train, y_test = y[:60000], y[60000:]

mlp = MLPClassifier(hidden_layer_sizes=(50,), max_iter=10, alpha=1e-4,
                    solver='sgd', verbose=10, random_state=1,
                    learning_rate_init=.1)

mlp.fit(X_train, y_train)
print("Training set score: %f" % mlp.score(X_train, y_train))
print("Test set score: %f" % mlp.score(X_test, y_test))

fig, axes = plt.subplots(4, 4)
# use global min / max to ensure all weights are shown on the same scale
vmin, vmax = mlp.coefs_[0].min(), mlp.coefs_[0].max()
for coef, ax in zip(mlp.coefs_[0].T, axes.ravel()):
    ax.matshow(coef.reshape(28, 28), cmap=plt.cm.gray, vmin=.5 * vmin,
               vmax=.5 * vmax)
    ax.set_xticks(())
    ax.set_yticks(())

plt.show()

def test_gradient():
    # Test gradient.
    # This makes sure that the activation functions and their derivatives
    # are correct. The numerical and analytical computation of the gradient
    # should be close.
    for n_labels in [2, 3]:
        n_samples = 5
        n_features = 10
        random_state = np.random.RandomState(seed=42)
        X = random_state.rand(n_samples, n_features)
        y = 1 + np.mod(np.arange(n_samples) + 1, n_labels)
        Y = LabelBinarizer().fit_transform(y)

        for activation in ACTIVATION_TYPES:
            mlp = MLPClassifier(activation=activation, hidden_layer_sizes=10,
                                solver='lbfgs', alpha=1e-5,
                                learning_rate_init=0.2, max_iter=1,
                                random_state=1)
            mlp.fit(X, y)

            theta = np.hstack([l.ravel() for l in
                               mlp.coefs_ + mlp.intercepts_])

            layer_units = ([X.shape[1]] + [mlp.hidden_layer_sizes] +
                           [mlp.n_outputs_])

            activations = []
            deltas = []
            coef_grads = []
            intercept_grads = []

            activations.append(X)
            for i in range(mlp.n_layers_ - 1):
                activations.append(np.empty((X.shape[0],
                                             layer_units[i + 1])))
                deltas.append(np.empty((X.shape[0],
                                        layer_units[i + 1])))

                fan_in = layer_units[i]
                fan_out = layer_units[i + 1]
                coef_grads.append(np.empty((fan_in, fan_out)))
                intercept_grads.append(np.empty(fan_out))

            # analytically compute the gradients
            def loss_grad_fun(t):
                return mlp._loss_grad_lbfgs(t, X, Y, activations, deltas,
                                            coef_grads, intercept_grads)

            [value, grad] = loss_grad_fun(theta)
            numgrad = np.zeros(np.size(theta))
            n = np.size(theta, 0)
            E = np.eye(n)
            epsilon = 1e-5
            # numerically compute the gradients
            for i in range(n):
                dtheta = E[:, i] * epsilon
                numgrad[i] = ((loss_grad_fun(theta + dtheta)[0] -
                               loss_grad_fun(theta - dtheta)[0]) /
                              (epsilon * 2.0))
            assert_almost_equal(numgrad, grad)

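# The numerical check above uses the central-difference approximation
#   dJ/dtheta_i ~= (J(theta + eps * e_i) - J(theta - eps * e_i)) / (2 * eps)
# A minimal self-contained sketch of the same idea for an arbitrary scalar
# loss function (helper name and defaults are illustrative):
def _sketch_numerical_gradient(loss_fun, theta, epsilon=1e-5):
    numgrad = np.zeros_like(theta)
    for i in range(theta.size):
        e_i = np.zeros_like(theta)
        e_i[i] = epsilon
        numgrad[i] = ((loss_fun(theta + e_i) - loss_fun(theta - e_i)) /
                      (2 * epsilon))
    return numgrad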