def test_shuffle_on_ndim_equals_three():
    def to_tuple(A):    # to make the inner arrays hashable
        return tuple(tuple(tuple(C) for C in B) for B in A)

    A = np.array([[[1, 2], [3, 4]], [[5, 6], [7, 8]]])  # A.shape = (2, 2, 2)
    S = set(to_tuple(A))
    shuffle(A)  # shouldn't raise a ValueError for dim = 3
    assert set(to_tuple(A)) == S
def test_extract_xi():
    # small and easy test (no clusters around other clusters),
    # but with clear noise data.
    rng = np.random.RandomState(0)
    n_points_per_cluster = 5

    C1 = [-5, -2] + .8 * rng.randn(n_points_per_cluster, 2)
    C2 = [4, -1] + .1 * rng.randn(n_points_per_cluster, 2)
    C3 = [1, -2] + .2 * rng.randn(n_points_per_cluster, 2)
    C4 = [-2, 3] + .3 * rng.randn(n_points_per_cluster, 2)
    C5 = [3, -2] + .6 * rng.randn(n_points_per_cluster, 2)
    C6 = [5, 6] + .2 * rng.randn(n_points_per_cluster, 2)

    X = np.vstack((C1, C2, C3, C4, C5, np.array([[100, 100]]), C6))
    expected_labels = np.r_[[2] * 5, [0] * 5, [1] * 5, [3] * 5, [1] * 5,
                            -1, [4] * 5]
    X, expected_labels = shuffle(X, expected_labels, random_state=rng)

    clust = OPTICS(min_samples=3, min_cluster_size=2,
                   max_eps=20, cluster_method='xi',
                   xi=0.4).fit(X)
    assert_array_equal(clust.labels_, expected_labels)

    # check float min_samples and min_cluster_size
    clust = OPTICS(min_samples=0.1, min_cluster_size=0.08,
                   max_eps=20, cluster_method='xi',
                   xi=0.4).fit(X)
    assert_array_equal(clust.labels_, expected_labels)

    X = np.vstack((C1, C2, C3, C4, C5, np.array([[100, 100]] * 2), C6))
    expected_labels = np.r_[[1] * 5, [3] * 5, [2] * 5, [0] * 5, [2] * 5,
                            -1, -1, [4] * 5]
    X, expected_labels = shuffle(X, expected_labels, random_state=rng)

    clust = OPTICS(min_samples=3, min_cluster_size=3,
                   max_eps=20, cluster_method='xi',
                   xi=0.3).fit(X)
    # this may fail if the predecessor correction is not at work!
    assert_array_equal(clust.labels_, expected_labels)

    C1 = [[0, 0], [0, 0.1], [0, -.1], [0.1, 0]]
    C2 = [[10, 10], [10, 9], [10, 11], [9, 10]]
    C3 = [[100, 100], [100, 90], [100, 110], [90, 100]]
    X = np.vstack((C1, C2, C3))
    expected_labels = np.r_[[0] * 4, [1] * 4, [2] * 4]
    X, expected_labels = shuffle(X, expected_labels, random_state=rng)

    clust = OPTICS(min_samples=2, min_cluster_size=2,
                   max_eps=np.inf, cluster_method='xi',
                   xi=0.04).fit(X)
    assert_array_equal(clust.labels_, expected_labels)
def test_small_trainset():
    # Make sure that the small trainset is stratified and has the expected
    # length (10k samples)
    n_samples = 20000
    original_distrib = {0: 0.1, 1: 0.2, 2: 0.3, 3: 0.4}
    rng = np.random.RandomState(42)
    X = rng.randn(n_samples).reshape(n_samples, 1)
    y = [[class_] * int(prop * n_samples)
         for (class_, prop) in original_distrib.items()]
    y = shuffle(np.concatenate(y))
    gb = HistGradientBoostingClassifier()

    # Compute the small training set
    X_small, y_small = gb._get_small_trainset(X, y, seed=42)

    # Compute the class distribution in the small training set
    unique, counts = np.unique(y_small, return_counts=True)
    small_distrib = {class_: count / 10000
                     for (class_, count) in zip(unique, counts)}

    # Test that the small training set has the expected length
    assert X_small.shape[0] == 10000
    assert y_small.shape[0] == 10000

    # Test that the class distributions in the whole dataset and in the small
    # training set are identical
    assert small_distrib == pytest.approx(original_distrib)
def test_ovr_partial_fit():
    # Test if partial_fit is working as intended
    X, y = shuffle(iris.data, iris.target, random_state=0)
    ovr = OneVsRestClassifier(MultinomialNB())
    ovr.partial_fit(X[:100], y[:100], np.unique(y))
    ovr.partial_fit(X[100:], y[100:])
    pred = ovr.predict(X)
    ovr2 = OneVsRestClassifier(MultinomialNB())
    pred2 = ovr2.fit(X, y).predict(X)

    assert_almost_equal(pred, pred2)
    assert len(ovr.estimators_) == len(np.unique(y))
    assert np.mean(y == pred) > 0.65

    # Test when mini-batches don't have all classes
    # with SGDClassifier
    X = np.abs(np.random.randn(14, 2))
    y = [1, 1, 1, 1, 2, 3, 3, 0, 0, 2, 3, 1, 2, 3]

    ovr = OneVsRestClassifier(
        SGDClassifier(max_iter=1, tol=None, shuffle=False, random_state=0))
    ovr.partial_fit(X[:7], y[:7], np.unique(y))
    ovr.partial_fit(X[7:], y[7:])
    pred = ovr.predict(X)
    ovr1 = OneVsRestClassifier(
        SGDClassifier(max_iter=1, tol=None, shuffle=False, random_state=0))
    pred1 = ovr1.fit(X, y).predict(X)
    assert np.mean(pred == y) == np.mean(pred1 == y)

    # test that partial_fit only exists if the underlying estimator has it:
    ovr = OneVsRestClassifier(SVC())
    assert not hasattr(ovr, "partial_fit")
def generate_dataset(n_train, n_test, n_features, noise=0.1, verbose=False):
    """Generate a regression dataset with the given parameters."""
    if verbose:
        print("generating dataset...")

    X, y, coef = make_regression(n_samples=n_train + n_test,
                                 n_features=n_features, noise=noise,
                                 coef=True)

    random_seed = 13
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=n_train, test_size=n_test, random_state=random_seed)
    X_train, y_train = shuffle(X_train, y_train, random_state=random_seed)

    X_scaler = StandardScaler()
    X_train = X_scaler.fit_transform(X_train)
    X_test = X_scaler.transform(X_test)

    y_scaler = StandardScaler()
    y_train = y_scaler.fit_transform(y_train[:, None])[:, 0]
    y_test = y_scaler.transform(y_test[:, None])[:, 0]

    gc.collect()
    if verbose:
        print("ok")
    return X_train, y_train, X_test, y_test
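# A minimal usage sketch of the helper above. The sizes are arbitrary,
# chosen only for illustration; they are not taken from the original script.
X_train, y_train, X_test, y_test = generate_dataset(
    n_train=1000, n_test=200, n_features=10, verbose=True)
assert X_train.shape == (1000, 10)
assert X_test.shape == (200, 10)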
def test_cluster_hierarchy_():
    rng = np.random.RandomState(0)
    n_points_per_cluster = 100
    C1 = [0, 0] + 2 * rng.randn(n_points_per_cluster, 2)
    C2 = [0, 0] + 50 * rng.randn(n_points_per_cluster, 2)
    X = np.vstack((C1, C2))
    X = shuffle(X, random_state=0)

    clusters = OPTICS(min_samples=20, xi=.1).fit(X).cluster_hierarchy_
    assert clusters.shape == (2, 2)
    diff = np.sum(clusters - np.array([[0, 99], [0, 199]]))
    assert diff / len(X) < 0.05
def load_mnist(n_samples=None, class_0='0', class_1='8'):
    """Load MNIST, select two classes, shuffle and return only n_samples."""
    # Load data from http://openml.org/d/554
    mnist = fetch_openml('mnist_784', version=1)

    # take only two classes for binary classification
    mask = np.logical_or(mnist.target == class_0, mnist.target == class_1)

    X, y = shuffle(mnist.data[mask], mnist.target[mask], random_state=42)
    if n_samples is not None:
        X, y = X[:n_samples], y[:n_samples]
    return X, y
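# A minimal usage sketch, assuming network access to openml.org. Note that
# fetch_openml('mnist_784', version=1) returns string targets, which is why
# the class labels default to the strings '0' and '8'.
X, y = load_mnist(n_samples=1000)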
def test_permutation_invariance():
    # check that fit is permutation invariant.
    # regression test for a bug where sample weights were not sorted
    # along with x
    ir = IsotonicRegression()
    x = [1, 2, 3, 4, 5, 6, 7]
    y = [1, 41, 51, 1, 2, 5, 24]
    sample_weight = [1, 2, 3, 4, 5, 6, 7]
    x_s, y_s, sample_weight_s = shuffle(x, y, sample_weight, random_state=0)
    y_transformed = ir.fit_transform(x, y, sample_weight=sample_weight)
    y_transformed_s = ir.fit(x_s, y_s,
                             sample_weight=sample_weight_s).transform(x)

    assert_array_equal(y_transformed, y_transformed_s)
def make_data(random_state, n_samples_per_center, grid_size, scale):
    """Generate Gaussian blobs centered on a grid_size x grid_size grid."""
    random_state = check_random_state(random_state)
    centers = np.array([[i, j]
                        for i in range(grid_size)
                        for j in range(grid_size)])
    n_clusters_true, n_features = centers.shape

    noise = random_state.normal(
        scale=scale, size=(n_samples_per_center, centers.shape[1]))
    X = np.concatenate([c + noise for c in centers])
    y = np.concatenate([[i] * n_samples_per_center
                        for i in range(n_clusters_true)])
    return shuffle(X, y, random_state=random_state)
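# A minimal usage sketch: a 3 x 3 grid of clusters with 100 points each
# (the values are illustrative, not taken from the original script).
X, y = make_data(random_state=0, n_samples_per_center=100,
                 grid_size=3, scale=0.1)
assert X.shape == (900, 2)
assert len(np.unique(y)) == 9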
def test_shuffle_dont_convert_to_array():
    # Check that shuffle does not try to convert to numpy arrays with float
    # dtypes and lets any indexable data structure pass through.
    a = ['a', 'b', 'c']
    b = np.array(['a', 'b', 'c'], dtype=object)
    c = [1, 2, 3]
    d = MockDataFrame(np.array([['a', 0],
                                ['b', 1],
                                ['c', 2]],
                               dtype=object))
    e = sp.csc_matrix(np.arange(6).reshape(3, 2))
    a_s, b_s, c_s, d_s, e_s = shuffle(a, b, c, d, e, random_state=0)

    assert a_s == ['c', 'b', 'a']
    assert type(a_s) == list

    assert_array_equal(b_s, ['c', 'b', 'a'])
    assert b_s.dtype == object

    assert c_s == [3, 2, 1]
    assert type(c_s) == list

    assert_array_equal(d_s, np.array([['c', 2],
                                      ['b', 1],
                                      ['a', 0]],
                                     dtype=object))
    assert type(d_s) == MockDataFrame

    assert_array_equal(e_s.toarray(), np.array([[4, 5],
                                                [2, 3],
                                                [0, 1]]))
# Author: Peter Prettenhofer <*****@*****.**>
#
# License: BSD 3 clause

import numpy as np
import matplotlib.pyplot as plt

from mrex import ensemble
from mrex import datasets
from mrex.utils import shuffle
from mrex.metrics import mean_squared_error

# #############################################################################
# Load data
boston = datasets.load_boston()
X, y = shuffle(boston.data, boston.target, random_state=13)
X = X.astype(np.float32)
offset = int(X.shape[0] * 0.9)
X_train, y_train = X[:offset], y[:offset]
X_test, y_test = X[offset:], y[offset:]

# #############################################################################
# Fit regression model
params = {'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 2,
          'learning_rate': 0.01, 'loss': 'ls'}
clf = ensemble.GradientBoostingRegressor(**params)
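# The excerpt stops after constructing the estimator. A minimal, assumed
# continuation of the usual fit/predict flow, using the mean_squared_error
# already imported above:
clf.fit(X_train, y_train)
mse = mean_squared_error(y_test, clf.predict(X_test))
print("MSE: %.4f" % mse)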
# Common random state
rng = np.random.RandomState(0)

# Toy sample
X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]]
y_class = ["foo", "foo", "foo", 1, 1, 1]    # test string class labels
y_regr = [-1, -1, -1, 1, 1, 1]

T = [[-1, -1], [2, 2], [3, 2]]
y_t_class = ["foo", 1, 1]
y_t_regr = [-1, 1, 1]

# Load the iris dataset and randomly permute it
iris = datasets.load_iris()
perm = rng.permutation(iris.target.size)
iris.data, iris.target = shuffle(iris.data, iris.target, random_state=rng)

# Load the boston dataset and randomly permute it
boston = datasets.load_boston()
boston.data, boston.target = shuffle(boston.data, boston.target,
                                     random_state=rng)


def test_samme_proba():
    # Test the `_samme_proba` helper function.

    # Define some example (bad) `predict_proba` output.
    probs = np.array([[1, 1e-6, 0],
                      [0.19, 0.6, 0.2],
                      [-999, 0.51, 0.5],
                      [1e-6, 1, 1e-9]])
    probs /= np.abs(probs.sum(axis=1))[:, np.newaxis]
# unweighted, but with repeated samples
X = [[1, 2, 3], [1, 2, 3], [4, 5, 6]]
y = [[3.141, 2.718], [3.141, 2.718], [2.718, 3.141]]
rgr = MultiOutputRegressor(GradientBoostingRegressor(random_state=0))
rgr.fit(X, y)

X_test = [[1.5, 2.5, 3.5], [3.5, 4.5, 5.5]]
assert_almost_equal(rgr.predict(X_test), rgr_w.predict(X_test))


# Import the data
iris = datasets.load_iris()
# create multiple targets by randomly shuffling copies of y and stacking them
X = iris.data
y1 = iris.target
y2 = shuffle(y1, random_state=1)
y3 = shuffle(y1, random_state=2)
y = np.column_stack((y1, y2, y3))
n_samples, n_features = X.shape
n_outputs = y.shape[1]
n_classes = len(np.unique(y1))
classes = list(map(np.unique, (y1, y2, y3)))


def test_multi_output_classification_partial_fit_parallelism():
    sgd_linear_clf = SGDClassifier(loss='log', random_state=1, max_iter=5)
    mor = MultiOutputClassifier(sgd_linear_clf, n_jobs=4)
    mor.partial_fit(X, y, classes)
    est1 = mor.estimators_[0]
    mor.partial_fit(X, y)
    est2 = mor.estimators_[0]
# Imports needed by this excerpt (the surrounding file's header is not shown)
from time import time

import numpy as np

from mrex.cluster import KMeans
from mrex.datasets import load_sample_image
from mrex.metrics import pairwise_distances_argmin
from mrex.utils import shuffle

# Number of quantized colors; defined near the top of the full example
# (the value here is assumed for illustration).
n_colors = 64

# Load the Summer Palace photo
china = load_sample_image("china.jpg")

# Convert to floats instead of the default 8 bits integer coding. Dividing by
# 255 is important so that plt.imshow works well on float data (values need
# to be in the range [0, 1])
china = np.array(china, dtype=np.float64) / 255

# Load Image and transform to a 2D numpy array.
w, h, d = original_shape = tuple(china.shape)
assert d == 3
image_array = np.reshape(china, (w * h, d))

print("Fitting model on a small sub-sample of the data")
t0 = time()
image_array_sample = shuffle(image_array, random_state=0)[:1000]
kmeans = KMeans(n_clusters=n_colors, random_state=0).fit(image_array_sample)
print("done in %0.3fs." % (time() - t0))

# Get labels for all points
print("Predicting color indices on the full image (k-means)")
t0 = time()
labels = kmeans.predict(image_array)
print("done in %0.3fs." % (time() - t0))


codebook_random = shuffle(image_array, random_state=0)[:n_colors]
print("Predicting color indices on the full image (random)")
t0 = time()
labels_random = pairwise_distances_argmin(codebook_random,
                                          image_array,
                                          axis=0)
print("done in %0.3fs." % (time() - t0))
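# A hypothetical helper (not part of the excerpt above) showing how the
# quantized image could be rebuilt from a codebook and the per-pixel labels:
def recreate_image(codebook, labels, w, h):
    """Rebuild the (w, h, d) image from the codebook and per-pixel labels."""
    return codebook[labels].reshape(w, h, -1)

# e.g. plt.imshow(recreate_image(kmeans.cluster_centers_, labels, w, h))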