def test_check_accuracy_on_digits(): # Non regression test to make sure that any further refactoring / optim # of the NB models do not harm the performance on a slightly non-linearly # separable dataset X, y = load_digits(return_X_y=True) binary_3v8 = np.logical_or(y == 3, y == 8) X_3v8, y_3v8 = X[binary_3v8], y[binary_3v8] # Multinomial NB scores = cross_val_score(MultinomialNB(alpha=10), X, y, cv=10) assert scores.mean() > 0.86 scores = cross_val_score(MultinomialNB(alpha=10), X_3v8, y_3v8, cv=10) assert scores.mean() > 0.94 # Bernoulli NB scores = cross_val_score(BernoulliNB(alpha=10), X > 4, y, cv=10) assert scores.mean() > 0.83 scores = cross_val_score(BernoulliNB(alpha=10), X_3v8 > 4, y_3v8, cv=10) assert scores.mean() > 0.92 # Gaussian NB scores = cross_val_score(GaussianNB(), X, y, cv=10) assert scores.mean() > 0.77 scores = cross_val_score(GaussianNB(var_smoothing=0.1), X, y, cv=10) assert scores.mean() > 0.89 scores = cross_val_score(GaussianNB(), X_3v8, y_3v8, cv=10) assert scores.mean() > 0.86
def test_load_digits(): digits = load_digits() assert digits.data.shape == (1797, 64) assert numpy.unique(digits.target).size == 10 # test return_X_y option check_return_X_y(digits, partial(load_digits))
def test_pca_score_consistency_solvers(svd_solver): # Check the consistency of score between solvers X, _ = datasets.load_digits(return_X_y=True) pca_full = PCA(n_components=30, svd_solver='full', random_state=0) pca_other = PCA(n_components=30, svd_solver=svd_solver, random_state=0) pca_full.fit(X) pca_other.fit(X) assert_allclose(pca_full.score(X), pca_other.score(X), rtol=5e-6)
def test_pca_sanity_noise_variance(svd_solver): # Sanity check for the noise_variance_. For more details see # https://github.com/scikit-learn/scikit-learn/issues/7568 # https://github.com/scikit-learn/scikit-learn/issues/8541 # https://github.com/scikit-learn/scikit-learn/issues/8544 X, _ = datasets.load_digits(return_X_y=True) pca = PCA(n_components=30, svd_solver=svd_solver, random_state=0) pca.fit(X) assert np.all((pca.explained_variance_ - pca.noise_variance_) >= 0)
def test_adaboost_consistent_predict(algorithm): # check that predict_proba and predict give consistent results # regression test for: # https://github.com/scikit-learn/scikit-learn/issues/14084 X_train, X_test, y_train, y_test = train_test_split( *datasets.load_digits(return_X_y=True), random_state=42) model = AdaBoostClassifier(algorithm=algorithm, random_state=42) model.fit(X_train, y_train) assert_array_equal(np.argmax(model.predict_proba(X_test), axis=1), model.predict(X_test))
def get_data(N, D, dataset='dense'): if dataset == 'dense': np.random.seed(0) return np.random.random((N, D)) elif dataset == 'digits': X, _ = datasets.load_digits(return_X_y=True) i = np.argsort(X[0])[::-1] X = X[:, i] return X[:N, :D] else: raise ValueError("invalid dataset: %s" % dataset)
def test_unsorted_indices(): # test that the result with sorted and unsorted indices in csr is the same # we use a subset of digits as iris, blobs or make_classification didn't # show the problem X, y = load_digits(return_X_y=True) X_test = sparse.csr_matrix(X[50:100]) X, y = X[:50], y[:50] X_sparse = sparse.csr_matrix(X) coef_dense = svm.SVC(kernel='linear', probability=True, random_state=0).fit(X, y).coef_ sparse_svc = svm.SVC(kernel='linear', probability=True, random_state=0).fit(X_sparse, y) coef_sorted = sparse_svc.coef_ # make sure dense and sparse SVM give the same result assert_array_almost_equal(coef_dense, coef_sorted.toarray()) # reverse each row's indices def scramble_indices(X): new_data = [] new_indices = [] for i in range(1, len(X.indptr)): row_slice = slice(*X.indptr[i - 1:i + 1]) new_data.extend(X.data[row_slice][::-1]) new_indices.extend(X.indices[row_slice][::-1]) return sparse.csr_matrix((new_data, new_indices, X.indptr), shape=X.shape) X_sparse_unsorted = scramble_indices(X_sparse) X_test_unsorted = scramble_indices(X_test) assert not X_sparse_unsorted.has_sorted_indices assert not X_test_unsorted.has_sorted_indices unsorted_svc = svm.SVC(kernel='linear', probability=True, random_state=0).fit(X_sparse_unsorted, y) coef_unsorted = unsorted_svc.coef_ # make sure unsorted indices give same result assert_array_almost_equal(coef_unsorted.toarray(), coef_sorted.toarray()) assert_array_almost_equal(sparse_svc.predict_proba(X_test_unsorted), sparse_svc.predict_proba(X_test))
# License: BSD 3 clause print(__doc__) # Standard scientific Python imports import matplotlib.pyplot as plt import numpy as np from time import time # Import datasets, classifiers and performance metrics from mrex import datasets, svm, pipeline from mrex.kernel_approximation import (RBFSampler, Nystroem) from mrex.decomposition import PCA # The digits dataset digits = datasets.load_digits(n_class=9) ################################################################## # Timing and accuracy plots # -------------------------------------------------- # To apply an classifier on this data, we need to flatten the image, to # turn the data in a (samples, feature) matrix: n_samples = len(digits.data) data = digits.data / 16. data -= data.mean(axis=0) # We learn the digits on the first half of the digits data_train, targets_train = (data[:n_samples // 2], digits.target[:n_samples // 2]) # Now predict the value of the digit on the second half:
# some parameter combinations will not converge as can be seen on the # plots so they are ignored here with warnings.catch_warnings(): warnings.filterwarnings("ignore", category=ConvergenceWarning, module="mrex") mlp.fit(X, y) mlps.append(mlp) print("Training set score: %f" % mlp.score(X, y)) print("Training set loss: %f" % mlp.loss_) for mlp, label, args in zip(mlps, labels, plot_args): ax.plot(mlp.loss_curve_, label=label, **args) fig, axes = plt.subplots(2, 2, figsize=(15, 10)) # load / generate some toy datasets iris = datasets.load_iris() X_digits, y_digits = datasets.load_digits(return_X_y=True) data_sets = [(iris.data, iris.target), (X_digits, y_digits), datasets.make_circles(noise=0.2, factor=0.5, random_state=1), datasets.make_moons(noise=0.3, random_state=0)] for ax, data, name in zip(axes.ravel(), data_sets, ['iris', 'digits', 'circles', 'moons']): plot_on_dataset(*data, ax=ax, name=name) fig.legend(ax.get_lines(), labels, ncol=3, loc="upper center") plt.show()
# larger number of dimensions ``n_components``. # # - for the 20 newsgroups dataset some 500 documents with 100k # features in total are projected using a sparse random matrix to smaller # euclidean spaces with various values for the target number of dimensions # ``n_components``. # # The default dataset is the digits dataset. To run the example on the twenty # newsgroups dataset, pass the --twenty-newsgroups command line argument to # this script. if '--twenty-newsgroups' in sys.argv: # Need an internet connection hence not enabled by default data = fetch_20newsgroups_vectorized().data[:500] else: data = load_digits().data[:500] ########################################################## # For each value of ``n_components``, we plot: # # - 2D distribution of sample pairs with pairwise distances in original # and projected spaces as x and y axis respectively. # # - 1D histogram of the ratio of those distances (projected / original). n_samples, n_features = data.shape print("Embedding %d samples with dim %d using various random projections" % (n_samples, n_features)) n_components_range = np.array([300, 1000, 10000]) dists = euclidean_distances(data, squared=True).ravel()
def exp(solvers, penalty, single_target, n_samples=30000, max_iter=20, dataset='rcv1', n_jobs=1, skip_slow=False): dtypes_mapping = { "float64": np.float64, "float32": np.float32, } if dataset == 'rcv1': rcv1 = fetch_rcv1() lbin = LabelBinarizer() lbin.fit(rcv1.target_names) X = rcv1.data y = rcv1.target y = lbin.inverse_transform(y) le = LabelEncoder() y = le.fit_transform(y) if single_target: y_n = y.copy() y_n[y > 16] = 1 y_n[y <= 16] = 0 y = y_n elif dataset == 'digits': X, y = load_digits(return_X_y=True) if single_target: y_n = y.copy() y_n[y < 5] = 1 y_n[y >= 5] = 0 y = y_n elif dataset == 'iris': iris = load_iris() X, y = iris.data, iris.target elif dataset == '20newspaper': ng = fetch_20newsgroups_vectorized() X = ng.data y = ng.target if single_target: y_n = y.copy() y_n[y > 4] = 1 y_n[y <= 16] = 0 y = y_n X = X[:n_samples] y = y[:n_samples] out = Parallel(n_jobs=n_jobs, mmap_mode=None)( delayed(fit_single)(solver, X, y, penalty=penalty, single_target=single_target, dtype=dtype, C=1, max_iter=max_iter, skip_slow=skip_slow) for solver in solvers for dtype in dtypes_mapping.values()) res = [] idx = 0 for dtype_name in dtypes_mapping.keys(): for solver in solvers: if not (skip_slow and solver == 'lightning' and penalty == 'l1'): lr, times, train_scores, test_scores, accuracies = out[idx] this_res = dict(solver=solver, penalty=penalty, dtype=dtype_name, single_target=single_target, times=times, train_scores=train_scores, test_scores=test_scores, accuracies=accuracies) res.append(this_res) idx += 1 with open('bench_saga.json', 'w+') as f: json.dump(res, f)
""" print(__doc__) from time import time import numpy as np import matplotlib.pyplot as plt from mrex import metrics from mrex.cluster import KMeans from mrex.datasets import load_digits from mrex.decomposition import PCA from mrex.preprocessing import scale np.random.seed(42) X_digits, y_digits = load_digits(return_X_y=True) data = scale(X_digits) n_samples, n_features = data.shape n_digits = len(np.unique(y_digits)) labels = y_digits sample_size = 300 print("n_digits: %d, \t n_samples %d, \t n_features %d" % (n_digits, n_samples, n_features)) print(82 * '_') print('init\t\ttime\tinertia\thomo\tcompl\tv-meas\tARI\tAMI\tsilhouette')
a digit classification task. .. note:: See also :ref:`sphx_glr_auto_examples_feature_selection_plot_rfe_with_cross_validation.py` """ print(__doc__) from mrex.svm import SVC from mrex.datasets import load_digits from mrex.feature_selection import RFE import matplotlib.pyplot as plt # Load the digits dataset digits = load_digits() X = digits.images.reshape((len(digits.images), -1)) y = digits.target # Create the RFE object and rank each pixel svc = SVC(kernel="linear", C=1) rfe = RFE(estimator=svc, n_features_to_select=1, step=1) rfe.fit(X, y) ranking = rfe.ranking_.reshape(digits.images[0].shape) # Plot pixel ranking plt.matshow(ranking, cmap=plt.cm.Blues) plt.colorbar() plt.title("Ranking of pixels with RFE") plt.show()
# Authors: Vighnesh Birodkar <*****@*****.**> # Raghav RV <*****@*****.**> # License: BSD 3 clause import time import numpy as np import matplotlib.pyplot as plt from mrex import ensemble from mrex import datasets from mrex.model_selection import train_test_split print(__doc__) data_list = [datasets.load_iris(), datasets.load_digits()] data_list = [(d.data, d.target) for d in data_list] data_list += [datasets.make_hastie_10_2()] names = ['Iris Data', 'Digits Data', 'Hastie Data'] n_gb = [] score_gb = [] time_gb = [] n_gbes = [] score_gbes = [] time_gbes = [] n_estimators = 500 for X, y in data_list: X_train, X_test, y_train, y_test = train_test_split(X,
from mrex.datasets import load_digits, load_boston, load_iris from mrex.datasets import make_regression, make_multilabel_classification from mrex.exceptions import ConvergenceWarning from io import StringIO from mrex.metrics import roc_auc_score from mrex.neural_network import MLPClassifier from mrex.neural_network import MLPRegressor from mrex.preprocessing import LabelBinarizer from mrex.preprocessing import StandardScaler, MinMaxScaler from scipy.sparse import csr_matrix from mrex.utils.testing import assert_raises, ignore_warnings from mrex.utils.testing import assert_raise_message ACTIVATION_TYPES = ["identity", "logistic", "tanh", "relu"] X_digits, y_digits = load_digits(n_class=3, return_X_y=True) X_digits_multi = MinMaxScaler().fit_transform(X_digits[:200]) y_digits_multi = y_digits[:200] X_digits, y_digits = load_digits(n_class=2, return_X_y=True) X_digits_binary = MinMaxScaler().fit_transform(X_digits[:200]) y_digits_binary = y_digits[:200] classification_datasets = [(X_digits_multi, y_digits_multi), (X_digits_binary, y_digits_binary)] boston = load_boston() Xboston = StandardScaler().fit_transform(boston.data)[:200]
def test_load_digits_n_class_lt_10(): digits = load_digits(9) assert digits.data.shape == (1617, 64) assert numpy.unique(digits.target).size == 9
simultaneously using grid search, but pick only the ones deemed most important. """ print(__doc__) import numpy as np from time import time from scipy.stats import randint as sp_randint from mrex.model_selection import GridSearchCV from mrex.model_selection import RandomizedSearchCV from mrex.datasets import load_digits from mrex.ensemble import RandomForestClassifier # get some data X, y = load_digits(return_X_y=True) # build a classifier clf = RandomForestClassifier(n_estimators=20) # Utility function to report best scores def report(results, n_top=3): for i in range(1, n_top + 1): candidates = np.flatnonzero(results['rank_test_score'] == i) for candidate in candidates: print("Model with rank: {0}".format(i)) print("Mean validation score: {0:.3f} (std: {1:.3f})".format( results['mean_test_score'][candidate], results['std_test_score'][candidate])) print("Parameters: {0}".format(results['params'][candidate]))
============================================= Cross-validation on Digits Dataset Exercise ============================================= A tutorial exercise using Cross-validation with an SVM on the Digits dataset. This exercise is used in the :ref:`cv_generators_tut` part of the :ref:`model_selection_tut` section of the :ref:`stat_learn_tut_index`. """ print(__doc__) import numpy as np from mrex.model_selection import cross_val_score from mrex import datasets, svm X, y = datasets.load_digits(return_X_y=True) svc = svm.SVC(kernel='linear') C_s = np.logspace(-10, 0, 10) scores = list() scores_std = list() for C in C_s: svc.C = C this_scores = cross_val_score(svc, X, y, n_jobs=1) scores.append(np.mean(this_scores)) scores_std.append(np.std(this_scores)) # Do the plotting import matplotlib.pyplot as plt plt.figure()
print(__doc__) # Authors: Clay Woolam <*****@*****.**> # License: BSD import numpy as np import matplotlib.pyplot as plt from scipy import stats from mrex import datasets from mrex.semi_supervised import label_propagation from mrex.metrics import confusion_matrix, classification_report digits = datasets.load_digits() rng = np.random.RandomState(2) indices = np.arange(len(digits.data)) rng.shuffle(indices) X = digits.data[indices[:340]] y = digits.target[indices[:340]] images = digits.images[indices[:340]] n_total_samples = len(y) n_labeled_points = 40 indices = np.arange(n_total_samples) unlabeled_set = indices[n_labeled_points:]
import sys import re import numpy as np from scipy.sparse import csc_matrix, csr_matrix, lil_matrix from mrex.utils.testing import (assert_almost_equal, assert_array_equal) from mrex.datasets import load_digits from io import StringIO from mrex.neural_network import BernoulliRBM from mrex.utils.validation import assert_all_finite Xdigits, _ = load_digits(return_X_y=True) Xdigits -= Xdigits.min() Xdigits /= Xdigits.max() def test_fit(): X = Xdigits.copy() rbm = BernoulliRBM(n_components=64, learning_rate=0.1, batch_size=10, n_iter=7, random_state=9) rbm.fit(X) assert_almost_equal(rbm.score_samples(X).mean(), -21., decimal=0) # in-place tricks shouldn't have modified X assert_array_equal(X, Xdigits)