def test_set_pipeline_step_passthrough(passthrough):
    """Check that Pipeline steps can be replaced by ``passthrough``.

    Three multiplying steps are chained; each scenario disables one or more
    of them and verifies that the pipeline output equals the product of the
    multipliers that remain active.
    """
    X = np.array([[1]])
    y = np.array([1])
    mult2, mult3, mult5 = Mult(mult=2), Mult(mult=3), Mult(mult=5)

    def make():
        return Pipeline([("m2", mult2), ("m3", mult3), ("last", mult5)])

    def check_roundtrip(pipe, product):
        # transform, predict and inverse_transform must all agree on the
        # product of the multipliers that are still active
        assert_array_equal([[product]], pipe.fit_transform(X, y))
        assert_array_equal([product], pipe.fit(X).predict(X))
        assert_array_equal(X, pipe.inverse_transform([[product]]))

    pipeline = make()
    check_roundtrip(pipeline, 2 * 3 * 5)

    # disable the middle step
    pipeline.set_params(m3=passthrough)
    check_roundtrip(pipeline, 2 * 5)

    expected_params = {
        "steps": pipeline.steps,
        "m2": mult2,
        "m3": passthrough,
        "last": mult5,
        "memory": None,
        "m2__mult": 2,
        "last__mult": 5,
        "verbose": False,
    }
    assert pipeline.get_params(deep=True) == expected_params

    # disable the first step as well
    pipeline.set_params(m2=passthrough)
    check_roundtrip(pipeline, 5)

    # for other methods, ensure no AttributeErrors on None:
    for method in (
        "predict_proba",
        "predict_log_proba",
        "decision_function",
        "transform",
        "score",
    ):
        getattr(pipeline, method)(X)

    # re-enable the first step
    pipeline.set_params(m2=mult2)
    check_roundtrip(pipeline, 2 * 5)

    # disable the final step: mult2 and mult3 are active
    pipeline = make()
    pipeline.set_params(last=passthrough)
    exp = 6
    pipeline.fit(X, y)
    pipeline.transform(X)
    assert_array_equal([[exp]], pipeline.fit(X, y).transform(X))
    assert_array_equal([[exp]], pipeline.fit_transform(X, y))
    assert_array_equal(X, pipeline.inverse_transform([[exp]]))
    with raises(AttributeError, match="has no attribute 'predict'"):
        getattr(pipeline, "predict")

    # Check 'passthrough' step at construction time
    pipeline = Pipeline([("m2", mult2), ("m3", passthrough), ("last", mult5)])
    check_roundtrip(pipeline, 2 * 5)
def test_set_pipeline_step_passthrough(passthrough):
    """Verify Pipeline behaviour when steps are set to ``passthrough``.

    Each scenario checks ``fit_transform``, ``predict`` and
    ``inverse_transform`` against the product of the multipliers of the steps
    that are still active.
    """
    X = np.array([[1]])
    y = np.array([1])
    mult2 = Mult(mult=2)
    mult3 = Mult(mult=3)
    mult5 = Mult(mult=5)

    def make():
        return Pipeline([('m2', mult2), ('m3', mult3), ('last', mult5)])

    def assert_pipeline_matches(pipe, expected):
        # the three public entry points must be mutually consistent
        assert_array_equal([[expected]], pipe.fit_transform(X, y))
        assert_array_equal([expected], pipe.fit(X).predict(X))
        assert_array_equal(X, pipe.inverse_transform([[expected]]))

    pipeline = make()
    assert_pipeline_matches(pipeline, 2 * 3 * 5)

    pipeline.set_params(m3=passthrough)
    assert_pipeline_matches(pipeline, 2 * 5)
    expected_params = {
        'steps': pipeline.steps,
        'm2': mult2,
        'm3': passthrough,
        'last': mult5,
        'memory': None,
        'm2__mult': 2,
        'last__mult': 5,
        'verbose': False,
    }
    assert pipeline.get_params(deep=True) == expected_params

    pipeline.set_params(m2=passthrough)
    assert_pipeline_matches(pipeline, 5)

    # for other methods, ensure no AttributeErrors on None:
    for method in ('predict_proba', 'predict_log_proba',
                   'decision_function', 'transform', 'score'):
        getattr(pipeline, method)(X)

    pipeline.set_params(m2=mult2)
    assert_pipeline_matches(pipeline, 2 * 5)

    pipeline = make()
    pipeline.set_params(last=passthrough)
    # mult2 and mult3 are active
    exp = 6
    assert_array_equal([[exp]], pipeline.fit(X, y).transform(X))
    assert_array_equal([[exp]], pipeline.fit_transform(X, y))
    assert_array_equal(X, pipeline.inverse_transform([[exp]]))

    msg = "'str' object has no attribute 'predict'"
    with pytest.raises(AttributeError, match=msg):
        getattr(pipeline, 'predict')

    # Check 'passthrough' step at construction time
    pipeline = Pipeline(
        [('m2', mult2), ('m3', passthrough), ('last', mult5)])
    assert_pipeline_matches(pipeline, 2 * 5)
def test_string_labels():
    """DummyClassifier("most_frequent") must cope with string targets."""
    n_samples = 5
    X = [[0]] * n_samples
    y = ["paris", "paris", "tokyo", "amsterdam", "berlin"]

    clf = DummyClassifier(strategy="most_frequent")
    clf.fit(X, y)

    # "paris" is the only label that appears twice
    predictions = clf.predict(X)
    assert_array_equal(predictions, ["paris"] * n_samples)
def test_dbscan_core_samples_toy(algorithm):
    """Check core samples and labels on a 1-D toy set for growing min_samples."""
    X = [[0], [2], [3], [4], [6], [8], [10]]
    n_samples = len(X)

    # (min_samples, expected core sample indices, expected labels)
    scenarios = [
        # Degenerate case: every sample is a core sample, either with its own
        # cluster or including other close core samples.
        (1, np.arange(n_samples), [0, 1, 1, 1, 2, 3, 4]),
        # With eps=1 and min_samples=2 only the 3 samples from the denser area
        # are core samples. All other points are isolated and considered
        # noise.
        (2, [1, 2, 3], [-1, 0, 0, 0, -1, -1, -1]),
        # Only the sample in the middle of the dense area is core. Its two
        # neighbors are edge samples. Remaining samples are noise.
        (3, [2], [-1, 0, 0, 0, -1, -1, -1]),
        # It's no longer possible to extract core samples with eps=1:
        # everything is noise.
        (4, [], np.full(n_samples, -1.)),
    ]

    for min_samples, expected_core, expected_labels in scenarios:
        core_samples, labels = dbscan(
            X, algorithm=algorithm, eps=1, min_samples=min_samples)
        assert_array_equal(core_samples, expected_core)
        assert_array_equal(labels, expected_labels)
def test_ward_linkage_tree_return_distance():
    """Test the ``return_distance`` option of ``ward_tree`` and ``linkage_tree``.

    Two things are verified:

    * with a fully-connected connectivity matrix, the structured and
      unstructured code paths give the same output on random data;
    * on a small dataset with known linkage matrices (taken from
      scipy/cluster/tests/hierarchy_test_data.py), children and distances
      match the reference values.
    """
    n, p = 10, 5
    rng = np.random.RandomState(0)

    connectivity = np.ones((n, n))
    for i in range(5):
        X = .1 * rng.normal(size=(n, p))
        X -= 4. * np.arange(n)[:, np.newaxis]
        X -= X.mean(axis=1)[:, np.newaxis]

        out_unstructured = ward_tree(X, return_distance=True)
        out_structured = ward_tree(X, connectivity=connectivity,
                                   return_distance=True)

        # get children
        children_unstructured = out_unstructured[0]
        children_structured = out_structured[0]

        # check if we got the same clusters
        assert_array_equal(children_unstructured, children_structured)

        # check if the distances are the same
        dist_unstructured = out_unstructured[-1]
        dist_structured = out_structured[-1]

        assert_array_almost_equal(dist_unstructured, dist_structured)

        for linkage in ['average', 'complete', 'single']:
            # NOTE(review): the trailing [-1] already selects the distance
            # array, so the "dist"/"children" values below are really the
            # last and first merge distances, not the children array. The
            # original comparison is kept as is; tightening it to compare the
            # actual children should be confirmed against the implementation
            # first.
            structured_items = linkage_tree(
                X, connectivity=connectivity, linkage=linkage,
                return_distance=True)[-1]
            unstructured_items = linkage_tree(
                X, linkage=linkage, return_distance=True)[-1]
            structured_dist = structured_items[-1]
            unstructured_dist = unstructured_items[-1]
            structured_children = structured_items[0]
            unstructured_children = unstructured_items[0]
            assert_array_almost_equal(structured_dist, unstructured_dist)
            assert_array_almost_equal(
                structured_children, unstructured_children)

    # test on the following dataset where we know the truth
    # taken from scipy/cluster/tests/hierarchy_test_data.py
    X = np.array([[1.43054825, -7.5693489],
                  [6.95887839, 6.82293382],
                  [2.87137846, -9.68248579],
                  [7.87974764, -6.05485803],
                  [8.24018364, -6.09495602],
                  [7.39020262, 8.54004355]])
    # truth
    linkage_X_ward = np.array([[3., 4., 0.36265956, 2.],
                               [1., 5., 1.77045373, 2.],
                               [0., 2., 2.55760419, 2.],
                               [6., 8., 9.10208346, 4.],
                               [7., 9., 24.7784379, 6.]])

    linkage_X_complete = np.array(
        [[3., 4., 0.36265956, 2.],
         [1., 5., 1.77045373, 2.],
         [0., 2., 2.55760419, 2.],
         [6., 8., 6.96742194, 4.],
         [7., 9., 18.77445997, 6.]])

    linkage_X_average = np.array(
        [[3., 4., 0.36265956, 2.],
         [1., 5., 1.77045373, 2.],
         [0., 2., 2.55760419, 2.],
         [6., 8., 6.55832839, 4.],
         [7., 9., 15.44089605, 6.]])

    n_samples, n_features = np.shape(X)
    connectivity_X = np.ones((n_samples, n_samples))

    out_X_unstructured = ward_tree(X, return_distance=True)
    out_X_structured = ward_tree(X, connectivity=connectivity_X,
                                 return_distance=True)

    # check that the labels are the same
    assert_array_equal(linkage_X_ward[:, :2], out_X_unstructured[0])
    assert_array_equal(linkage_X_ward[:, :2], out_X_structured[0])

    # check that the distances are correct
    assert_array_almost_equal(linkage_X_ward[:, 2], out_X_unstructured[4])
    assert_array_almost_equal(linkage_X_ward[:, 2], out_X_structured[4])

    # Each linkage is paired explicitly with its reference matrix. The
    # previous code zipped three linkage names against two truth tables, so
    # 'single' was silently dropped; no reference data exists for it here.
    linkage_truth = [
        ('complete', linkage_X_complete),
        ('average', linkage_X_average),
    ]
    for linkage, X_truth in linkage_truth:
        out_X_unstructured = linkage_tree(
            X, return_distance=True, linkage=linkage)
        out_X_structured = linkage_tree(
            X, connectivity=connectivity_X, linkage=linkage,
            return_distance=True)

        # check that the labels are the same
        assert_array_equal(X_truth[:, :2], out_X_unstructured[0])
        assert_array_equal(X_truth[:, :2], out_X_structured[0])

        # check that the distances are correct
        assert_array_almost_equal(X_truth[:, 2], out_X_unstructured[4])
        assert_array_almost_equal(X_truth[:, 2], out_X_structured[4])
def test_inverse_binarize_multiclass():
    """Check _inverse_binarize_multiclass on a small sparse indicator matrix."""
    indicator = csr_matrix([[0, 1, 0],
                            [-1, 0, -1],
                            [0, 0, 0]])
    classes = np.arange(3)

    got = _inverse_binarize_multiclass(indicator, classes)

    expected = np.array([1, 1, 0])
    assert_array_equal(got, expected)
def test_predict_equal_labels(algo):
    """predict on the training data must reproduce the fitted labels_."""
    estimator = KMeans(random_state=13, n_init=1, max_iter=1, algorithm=algo)
    estimator.fit(X)

    predicted = estimator.predict(X)
    assert_array_equal(predicted, estimator.labels_)
def test_the_extract_xi_labels(ordering, clusters, expected):
    """_extract_xi_labels must return the expected labels for each case."""
    assert_array_equal(_extract_xi_labels(ordering, clusters), expected)
def test_extract_xi(global_dtype):
    """Check xi-based cluster extraction of OPTICS on small synthetic data.

    Small and easy test (no clusters around other clusters) but with clear
    noise data.
    """
    rng = np.random.RandomState(0)
    n_points_per_cluster = 5

    def xi_labels(data, **params):
        # fit OPTICS with the xi extraction method and return its labels
        return OPTICS(cluster_method="xi", **params).fit(data).labels_

    # the blobs are drawn in this exact order to keep the RNG stream stable
    C1 = [-5, -2] + 0.8 * rng.randn(n_points_per_cluster, 2)
    C2 = [4, -1] + 0.1 * rng.randn(n_points_per_cluster, 2)
    C3 = [1, -2] + 0.2 * rng.randn(n_points_per_cluster, 2)
    C4 = [-2, 3] + 0.3 * rng.randn(n_points_per_cluster, 2)
    C5 = [3, -2] + 0.6 * rng.randn(n_points_per_cluster, 2)
    C6 = [5, 6] + 0.2 * rng.randn(n_points_per_cluster, 2)

    # --- one isolated noise point ---
    stacked = np.vstack((C1, C2, C3, C4, C5, np.array([[100, 100]]), C6))
    X = stacked.astype(global_dtype, copy=False)
    expected_labels = np.r_[[2] * 5, [0] * 5, [1] * 5, [3] * 5, [1] * 5,
                            -1, [4] * 5]
    X, expected_labels = shuffle(X, expected_labels, random_state=rng)

    labels = xi_labels(X, min_samples=3, min_cluster_size=2, max_eps=20,
                       xi=0.4)
    assert_array_equal(labels, expected_labels)

    # check float min_samples and min_cluster_size
    labels = xi_labels(X, min_samples=0.1, min_cluster_size=0.08, max_eps=20,
                       xi=0.4)
    assert_array_equal(labels, expected_labels)

    # --- two coincident noise points ---
    stacked = np.vstack(
        (C1, C2, C3, C4, C5, np.array([[100, 100]] * 2), C6))
    X = stacked.astype(global_dtype, copy=False)
    expected_labels = np.r_[[1] * 5, [3] * 5, [2] * 5, [0] * 5, [2] * 5,
                            -1, -1, [4] * 5]
    X, expected_labels = shuffle(X, expected_labels, random_state=rng)

    labels = xi_labels(X, min_samples=3, min_cluster_size=3, max_eps=20,
                       xi=0.3)
    # this may fail if the predecessor correction is not at work!
    assert_array_equal(labels, expected_labels)

    # --- three well separated groups at very different scales ---
    C1 = [[0, 0], [0, 0.1], [0, -0.1], [0.1, 0]]
    C2 = [[10, 10], [10, 9], [10, 11], [9, 10]]
    C3 = [[100, 100], [100, 90], [100, 110], [90, 100]]
    X = np.vstack((C1, C2, C3)).astype(global_dtype, copy=False)
    expected_labels = np.r_[[0] * 4, [1] * 4, [2] * 4]
    X, expected_labels = shuffle(X, expected_labels, random_state=rng)

    labels = xi_labels(X, min_samples=2, min_cluster_size=2, max_eps=np.inf,
                       xi=0.04)
    assert_array_equal(labels, expected_labels)
def test_isotonic_regression():
    """Check isotonic_regression / IsotonicRegression on small inputs.

    Covers known outputs of the function, consistency between ``fit``,
    ``transform``, ``fit_transform`` and ``predict``, invariance under a
    permutation of the samples, and the degenerate case of constant ``x``.
    """
    y = np.array([3, 7, 5, 9, 8, 7, 10])
    y_ = np.array([3, 6, 6, 8, 8, 8, 10])
    assert_array_equal(y_, isotonic_regression(y))

    y = np.array([10, 0, 2])
    y_ = np.array([4, 4, 4])
    assert_array_equal(y_, isotonic_regression(y))

    x = np.arange(len(y))
    ir = IsotonicRegression(y_min=0., y_max=1.)
    ir.fit(x, y)
    assert_array_equal(ir.fit(x, y).transform(x), ir.fit_transform(x, y))
    assert_array_equal(ir.transform(x), ir.predict(x))

    # check that it is immune to permutation
    # A seeded generator is used instead of the global numpy RNG so that the
    # permutation, and therefore the test, is reproducible.
    perm = np.random.RandomState(0).permutation(len(y))
    ir = IsotonicRegression(y_min=0., y_max=1.)
    assert_array_equal(ir.fit_transform(x[perm], y[perm]),
                       ir.fit_transform(x, y)[perm])
    assert_array_equal(ir.transform(x[perm]), ir.transform(x)[perm])

    # check we don't crash when all x are equal:
    ir = IsotonicRegression()
    assert_array_equal(ir.fit_transform(np.ones(len(x)), y), np.mean(y))
def test_compare_to_ELKI():
    """Compare OPTICS output with reference values computed by ELKI.

    Two configurations are checked against hard-coded reference arrays
    (reachability ``r*``, ordering ``o*`` and predecessors ``p*``):
    ``min_samples=5`` with the default ``max_eps`` and ``min_samples=5`` with
    ``max_eps=0.5``. The data ``X`` comes from the enclosing module.
    """
    # Expected values, computed with (future) ELKI 0.7.5 using:
    # java -jar elki.jar cli -dbc.in csv -dbc.filter FixedDBIDsFilter
    #   -algorithm clustering.optics.OPTICSHeap -optics.minpts 5
    # where the FixedDBIDsFilter gives 0-indexed ids.
    r1 = [
        np.inf, 1.0574896366427478, 0.7587934993548423, 0.7290174038973836,
        0.7290174038973836, 0.7290174038973836, 0.6861627576116127,
        0.7587934993548423, 0.9280118450166668, 1.1748022534146194,
        3.3355455741292257, 0.49618389254482587, 0.2552805046961355,
        0.2552805046961355, 0.24944622248445714, 0.24944622248445714,
        0.24944622248445714, 0.2552805046961355, 0.2552805046961355,
        0.3086779122185853, 4.163024452756142, 1.623152630340929,
        0.45315840475822655, 0.25468325192031926, 0.2254004358159971,
        0.18765711877083036, 0.1821471333893275, 0.1821471333893275,
        0.18765711877083036, 0.18765711877083036, 0.2240202988740153,
        1.154337614548715, 1.342604473837069, 1.323308536402633,
        0.8607514948648837, 0.27219111215810565, 0.13260875220533205,
        0.13260875220533205, 0.09890587675958984, 0.09890587675958984,
        0.13548790801634494, 0.1575483940837384, 0.17515137170530226,
        0.17575920159442388, 0.27219111215810565, 0.6101447895405373,
        1.3189208094864302, 1.323308536402633, 2.2509184159764577,
        2.4517810628594527, 3.675977064404973, 3.8264795626020365,
        2.9130735341510614, 2.9130735341510614, 2.9130735341510614,
        2.9130735341510614, 2.8459300127258036, 2.8459300127258036,
        2.8459300127258036, 3.0321982337972537,
    ]
    o1 = [
        0, 3, 6, 4, 7, 8, 2, 9, 5, 1, 31, 30, 32, 34, 33, 38, 39, 35, 37, 36,
        44, 21, 23, 24, 22, 25, 27, 29, 26, 28, 20, 40, 45, 46, 10, 15, 11,
        13, 17, 19, 18, 12, 16, 14, 47, 49, 43, 48, 42, 41, 53, 57, 51, 52,
        56, 59, 54, 55, 58, 50,
    ]
    p1 = [
        -1, 0, 3, 6, 6, 6, 8, 3, 7, 5, 1, 31, 30, 30, 34, 34, 34, 32, 32, 37,
        36, 44, 21, 23, 24, 22, 25, 25, 22, 22, 22, 21, 40, 45, 46, 10, 15,
        15, 13, 13, 15, 11, 19, 15, 10, 47, 12, 45, 14, 43, 42, 53, 57, 57,
        57, 57, 59, 59, 59, 58,
    ]

    # Tests against known extraction array
    # Does NOT work with metric='euclidean', because sklearn euclidean has
    # worse numeric precision. 'minkowski' is slower but more accurate.
    clust1 = OPTICS(min_samples=5).fit(X)

    assert_array_equal(clust1.ordering_, np.array(o1))
    assert_array_equal(clust1.predecessor_[clust1.ordering_], np.array(p1))
    assert_allclose(clust1.reachability_[clust1.ordering_], np.array(r1))
    # ELKI currently does not print the core distances (which are not used
    # much in literature), but we can at least ensure to have this
    # consistency: the reachability of every point except the first in the
    # ordering is at least the core distance of its predecessor.
    for i in clust1.ordering_[1:]:
        assert clust1.reachability_[i] >= clust1.core_distances_[
            clust1.predecessor_[i]]

    # Expected values, computed with (future) ELKI 0.7.5 using
    # (this reference set is compared below against a fit with max_eps=0.5)
    r2 = [
        np.inf, np.inf, np.inf, np.inf, np.inf, np.inf, np.inf, np.inf,
        np.inf, np.inf, np.inf, 0.27219111215810565, 0.13260875220533205,
        0.13260875220533205, 0.09890587675958984, 0.09890587675958984,
        0.13548790801634494, 0.1575483940837384, 0.17515137170530226,
        0.17575920159442388, 0.27219111215810565, 0.4928068613197889,
        np.inf, 0.2666183922512113, 0.18765711877083036, 0.1821471333893275,
        0.1821471333893275, 0.1821471333893275, 0.18715928772277457,
        0.18765711877083036, 0.18765711877083036, 0.25468325192031926,
        np.inf, 0.2552805046961355, 0.2552805046961355, 0.24944622248445714,
        0.24944622248445714, 0.24944622248445714, 0.2552805046961355,
        0.2552805046961355, 0.3086779122185853, 0.34466409325984865,
        np.inf, np.inf, np.inf, np.inf, np.inf, np.inf, np.inf, np.inf,
        np.inf, np.inf, np.inf, np.inf, np.inf, np.inf, np.inf, np.inf,
        np.inf, np.inf,
    ]
    o2 = [
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 11, 13, 17, 19, 18, 12, 16, 14,
        47, 46, 20, 22, 25, 23, 27, 29, 24, 26, 28, 21, 30, 32, 34, 33, 38,
        39, 35, 37, 36, 31, 40, 41, 42, 43, 44, 45, 48, 49, 50, 51, 52, 53,
        54, 55, 56, 57, 58, 59,
    ]
    p2 = [
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 10, 15, 15, 13, 13, 15,
        11, 19, 15, 10, 47, -1, 20, 22, 25, 25, 25, 25, 22, 22, 23, -1, 30,
        30, 34, 34, 34, 32, 32, 37, 38, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1,
    ]
    clust2 = OPTICS(min_samples=5, max_eps=0.5).fit(X)

    assert_array_equal(clust2.ordering_, np.array(o2))
    assert_array_equal(clust2.predecessor_[clust2.ordering_], np.array(p2))
    assert_allclose(clust2.reachability_[clust2.ordering_], np.array(r2))

    # Core distances that are within max_eps must agree between both fits.
    index = np.where(clust1.core_distances_ <= 0.5)[0]
    assert_allclose(clust1.core_distances_[index],
                    clust2.core_distances_[index])
def test_isotonic_regression_reversed():
    """With increasing=False the fitted values must be non-increasing."""
    y = np.array([10, 9, 10, 7, 6, 6.1, 5])
    positions = np.arange(len(y))

    model = IsotonicRegression(increasing=False)
    y_ = model.fit_transform(positions, y)

    # every value must be at least as large as its successor
    head, tail = y_[:-1], y_[1:]
    is_non_increasing = (head - tail) >= 0
    assert_array_equal(np.ones(head.shape), is_non_increasing)
def test_pipeline_memory_sampler():
    """Check that a memory-cached Pipeline with a sampler matches an uncached one.

    The cached pipeline is fitted twice and a second cached pipeline with a
    renamed step is fitted once; in every case the outputs must equal those of
    the uncached pipeline and the sampler timestamp must stay unchanged,
    showing that the cached fit was reused.
    """
    X, y = make_classification(
        n_classes=2,
        class_sep=2,
        weights=[0.1, 0.9],
        n_informative=3,
        n_redundant=1,
        flip_y=0,
        n_features=20,
        n_clusters_per_class=1,
        n_samples=5000,
        random_state=0,
    )
    cachedir = mkdtemp()
    try:
        memory = Memory(cachedir, verbose=10)
        # Test with Transformer + SVC
        clf = SVC(gamma="scale", probability=True, random_state=0)
        transf = DummySampler()
        pipe = Pipeline([("transf", clone(transf)), ("svc", clf)])
        cached_pipe = Pipeline([("transf", transf), ("svc", clf)], memory=memory)

        def check_same_results(cached, step_name):
            # Check that the cached pipeline and pipe yield identical results
            assert_array_equal(pipe.predict(X), cached.predict(X))
            assert_array_equal(pipe.predict_proba(X), cached.predict_proba(X))
            assert_array_equal(
                pipe.predict_log_proba(X), cached.predict_log_proba(X)
            )
            assert_array_equal(pipe.score(X, y), cached.score(X, y))
            assert_array_equal(
                pipe.named_steps["transf"].means_,
                cached.named_steps[step_name].means_,
            )

        # Memoize the transformer at the first fit
        cached_pipe.fit(X, y)
        pipe.fit(X, y)
        # Get the time stamp of the transformer in the cached pipeline
        expected_ts = cached_pipe.named_steps["transf"].timestamp_
        check_same_results(cached_pipe, "transf")
        assert not hasattr(transf, "means_")

        # Check that we are reading the cache while fitting
        # a second time
        cached_pipe.fit(X, y)
        check_same_results(cached_pipe, "transf")
        assert cached_pipe.named_steps["transf"].timestamp_ == expected_ts

        # Create a new pipeline with cloned estimators
        # Check that even changing the name step does not affect the cache hit
        clf_2 = SVC(gamma="scale", probability=True, random_state=0)
        transf_2 = DummySampler()
        cached_pipe_2 = Pipeline(
            [("transf_2", transf_2), ("svc", clf_2)], memory=memory
        )
        cached_pipe_2.fit(X, y)

        check_same_results(cached_pipe_2, "transf_2")
        assert cached_pipe_2.named_steps["transf_2"].timestamp_ == expected_ts
    finally:
        shutil.rmtree(cachedir)
def test_pipeline_memory_transformer():
    """Check that a memory-cached Pipeline with a transformer matches an uncached one.

    Runs on the iris data. The cached pipeline is fitted twice and a second
    cached pipeline with a renamed step is fitted once; outputs must equal
    those of the uncached pipeline and the transformer timestamp must stay
    unchanged, showing that the cached fit was reused.
    """
    iris = load_iris()
    X, y = iris.data, iris.target
    cachedir = mkdtemp()
    try:
        memory = Memory(cachedir, verbose=10)
        # Test with Transformer + SVC
        clf = SVC(gamma="scale", probability=True, random_state=0)
        transf = DummyTransf()
        pipe = Pipeline([("transf", clone(transf)), ("svc", clf)])
        cached_pipe = Pipeline([("transf", transf), ("svc", clf)], memory=memory)

        def assert_matches_uncached(cached, step_name):
            # Check that the cached pipeline and pipe yield identical results
            assert_array_equal(pipe.predict(X), cached.predict(X))
            assert_array_equal(pipe.predict_proba(X), cached.predict_proba(X))
            assert_array_equal(
                pipe.predict_log_proba(X), cached.predict_log_proba(X)
            )
            assert_array_equal(pipe.score(X, y), cached.score(X, y))
            assert_array_equal(
                pipe.named_steps["transf"].means_,
                cached.named_steps[step_name].means_,
            )

        # Memoize the transformer at the first fit
        cached_pipe.fit(X, y)
        pipe.fit(X, y)
        # Get the time stamp of the transformer in the cached pipeline
        expected_ts = cached_pipe.named_steps["transf"].timestamp_
        assert_matches_uncached(cached_pipe, "transf")
        assert not hasattr(transf, "means_")

        # Check that we are reading the cache while fitting
        # a second time
        cached_pipe.fit(X, y)
        assert_matches_uncached(cached_pipe, "transf")
        assert cached_pipe.named_steps["transf"].timestamp_ == expected_ts

        # Create a new pipeline with cloned estimators
        # Check that even changing the name step does not affect the cache hit
        clf_2 = SVC(gamma="scale", probability=True, random_state=0)
        transf_2 = DummyTransf()
        cached_pipe_2 = Pipeline(
            [("transf_2", transf_2), ("svc", clf_2)], memory=memory
        )
        cached_pipe_2.fit(X, y)

        assert_matches_uncached(cached_pipe_2, "transf_2")
        assert cached_pipe_2.named_steps["transf_2"].timestamp_ == expected_ts
    finally:
        shutil.rmtree(cachedir)
def test_multilabel_binarizer_non_integer_labels():
    """MultiLabelBinarizer must handle string and tuple labels.

    Both ``fit_transform`` and ``fit().transform()`` are exercised for each
    input, and unhashable labels are expected to raise ``TypeError``.
    """
    tuple_classes = _to_object_array([(1, ), (2, ), (3, )])
    inputs = [
        ([('2', '3'), ('1', ), ('1', '2')], ['1', '2', '3']),
        ([('b', 'c'), ('a', ), ('a', 'b')], ['a', 'b', 'c']),
        ([((2, ), (3, )), ((1, ), ), ((1, ), (2, ))], tuple_classes),
    ]
    indicator_mat = np.array([[0, 1, 1],
                              [1, 0, 0],
                              [1, 1, 0]])

    # the two equivalent ways of fitting and transforming in one go
    binarize_variants = (
        lambda binarizer, data: binarizer.fit_transform(data),
        lambda binarizer, data: binarizer.fit(data).transform(data),
    )

    for inp, classes in inputs:
        for binarize in binarize_variants:
            mlb = MultiLabelBinarizer()
            assert_array_equal(binarize(mlb, inp), indicator_mat)
            assert_array_equal(mlb.classes_, classes)
            assert_array_equal(mlb.inverse_transform(indicator_mat), inp)

    # dict labels are not hashable and must be rejected
    mlb = MultiLabelBinarizer()
    with pytest.raises(TypeError):
        mlb.fit_transform([({}), ({}, {'a': 'b'})])
def test_additive_chi2_sampler():
    """Test that AdditiveChi2Sampler approximates the kernel on random data.

    Also checks that dense and sparse inputs give the same transform, that
    negative input and unsupported ``sample_steps`` raise ``ValueError``, and
    that ``sample_interval_`` is set as expected after ``fit``. ``X`` and
    ``Y`` are taken from the enclosing module.
    """
    # compute exact kernel
    # abbreviations for easier formula
    X_ = X[:, np.newaxis, :]
    Y_ = Y[np.newaxis, :, :]

    large_kernel = 2 * X_ * Y_ / (X_ + Y_)

    # reduce to n_samples_x x n_samples_y by summing over features
    kernel = large_kernel.sum(axis=2)

    # approximate kernel mapping
    transform = AdditiveChi2Sampler(sample_steps=3)
    X_trans = transform.fit_transform(X)
    Y_trans = transform.transform(Y)

    kernel_approx = np.dot(X_trans, Y_trans.T)

    assert_array_almost_equal(kernel, kernel_approx, 1)

    X_sp_trans = transform.fit_transform(csr_matrix(X))
    Y_sp_trans = transform.transform(csr_matrix(Y))

    # .toarray() is used rather than the ``.A`` shorthand, which is
    # deprecated in SciPy and unavailable on sparse arrays.
    assert_array_equal(X_trans, X_sp_trans.toarray())
    assert_array_equal(Y_trans, Y_sp_trans.toarray())

    # test error is raised on negative input
    Y_neg = Y.copy()
    Y_neg[0, 0] = -1
    msg = "Negative values in data passed to"
    with pytest.raises(ValueError, match=msg):
        transform.transform(Y_neg)

    # test error on invalid sample_steps
    transform = AdditiveChi2Sampler(sample_steps=4)
    msg = re.escape("If sample_steps is not in [1, 2, 3],"
                    " you need to provide sample_interval")
    with pytest.raises(ValueError, match=msg):
        transform.fit(X)

    # test that the sample interval is set correctly
    sample_steps_available = [1, 2, 3]
    for sample_steps in sample_steps_available:

        # test that the sample_interval is initialized correctly
        transform = AdditiveChi2Sampler(sample_steps=sample_steps)
        assert transform.sample_interval is None

        # test that the sample_interval is changed in the fit method
        transform.fit(X)
        assert transform.sample_interval_ is not None

    # test that the sample_interval is set correctly
    sample_interval = 0.3
    transform = AdditiveChi2Sampler(sample_steps=4,
                                    sample_interval=sample_interval)
    assert transform.sample_interval == sample_interval
    transform.fit(X)
    assert transform.sample_interval_ == sample_interval
def test_multilabel_binarizer_non_unique():
    """Repeated labels within one sample must be binarized only once."""
    samples = [(1, 1, 1, 0)]
    expected = np.array([[1, 1]])

    binarized = MultiLabelBinarizer().fit_transform(samples)
    assert_array_equal(binarized, expected)
def test_one_hot_encoder_inverse_if_binary():
    """inverse_transform must round-trip when drop='if_binary' is used."""
    X = np.array([['Male', 1],
                  ['Female', 3],
                  ['Female', 2]], dtype=object)

    encoder = OneHotEncoder(drop='if_binary', sparse=False)
    encoded = encoder.fit_transform(X)

    recovered = encoder.inverse_transform(encoded)
    assert_array_equal(recovered, X)
def test_minibatch_update_consistency():
    """Check that dense and sparse minibatch update give the same results.

    ``X``, ``X_csr`` and ``centers`` are taken from the enclosing module.
    """
    rng = np.random.RandomState(42)
    old_centers = centers + rng.normal(size=centers.shape)

    new_centers = old_centers.copy()
    new_centers_csr = old_centers.copy()

    n_centers = new_centers.shape[0]
    weight_sums = np.zeros(n_centers, dtype=np.double)
    weight_sums_csr = np.zeros(n_centers, dtype=np.double)

    x_squared_norms = (X**2).sum(axis=1)
    x_squared_norms_csr = row_norms(X_csr, squared=True)

    n_features = centers.shape[1]
    buffer = np.zeros(n_features, dtype=np.double)
    buffer_csr = np.zeros(n_features, dtype=np.double)

    # extract a small minibatch
    batch = slice(None, 10)
    X_mb = X[batch]
    X_mb_csr = X_csr[batch]
    x_mb_squared_norms = x_squared_norms[batch]
    x_mb_squared_norms_csr = x_squared_norms_csr[batch]

    sample_weight_mb = np.ones(X_mb.shape[0], dtype=np.double)

    def run_update(data, norms, moving_centers, sums, scratch):
        # perform one minibatch step in place on ``moving_centers`` and
        # verify that it reduces the inertia on that same batch
        before, diff = _mini_batch_step(
            data, sample_weight_mb, norms, moving_centers, sums, scratch,
            1, None, random_reassign=False)
        assert before > 0.0

        # compute the new inertia on the same batch to check that it
        # decreased
        assigned, after = _labels_inertia(
            data, sample_weight_mb, norms, moving_centers)
        assert after > 0.0
        assert after < before

        # check that the incremental difference computation is matching the
        # final observed value
        effective_diff = np.sum((moving_centers - old_centers)**2)
        assert_almost_equal(diff, effective_diff)
        return before, diff, assigned, after

    # step 1: compute the dense minibatch update
    old_inertia, incremental_diff, labels, new_inertia = run_update(
        X_mb, x_mb_squared_norms, new_centers, weight_sums, buffer)

    # step 2: compute the sparse minibatch update
    (old_inertia_csr, incremental_diff_csr,
     labels_csr, new_inertia_csr) = run_update(
        X_mb_csr, x_mb_squared_norms_csr, new_centers_csr, weight_sums_csr,
        buffer_csr)

    # step 3: check that sparse and dense updates lead to the same results
    assert_array_equal(labels, labels_csr)
    assert_array_almost_equal(new_centers, new_centers_csr)
    assert_almost_equal(incremental_diff, incremental_diff_csr)
    assert_almost_equal(old_inertia, old_inertia_csr)
    assert_almost_equal(new_inertia, new_inertia_csr)
def test_ordinal_encoder(X):
    """OrdinalEncoder must return the expected codes, honouring ``dtype``."""
    expected_codes = np.array([[0, 1, 0], [1, 0, 0]], dtype='int64')

    # default output dtype: compared against the float64 version of the codes
    default_encoder = OrdinalEncoder()
    assert_array_equal(default_encoder.fit_transform(X),
                       expected_codes.astype('float64'))

    # explicit integer dtype
    int_encoder = OrdinalEncoder(dtype='int64')
    assert_array_equal(int_encoder.fit_transform(X), expected_codes)
def test_weighted_dbscan():
    """Check the handling of ``sample_weight`` in dbscan and DBSCAN.

    Covers validation of the weight vector, the effect of weights on which
    samples become core samples, equivalence with sample repetition, and
    agreement between the function, the precomputed-metric path and the
    estimator API. ``X`` is taken from the enclosing module.
    """
    two_points = [[0], [1]]

    # ensure sample_weight is validated
    for bad_weights in ([2], [2, 3, 4]):
        with pytest.raises(ValueError):
            dbscan(two_points, sample_weight=bad_weights)

    def core_indices(weights, **kwargs):
        # core sample indices for the two-point toy problem, min_samples=6
        return dbscan(two_points, sample_weight=weights, min_samples=6,
                      **kwargs)[0]

    toy_cases = [
        # ensure sample_weight has an effect
        ([], None, {}),
        ([], [5, 5], {}),
        ([0], [6, 5], {}),
        ([0, 1], [6, 6], {}),
        # points within eps of each other:
        ([0, 1], [5, 1], {"eps": 1.5}),
        # and effect of non-positive and non-integer sample_weight:
        ([], [5, 0], {"eps": 1.5}),
        ([0, 1], [5.9, 0.1], {"eps": 1.5}),
        ([0, 1], [6, 0], {"eps": 1.5}),
        ([], [6, -1], {"eps": 1.5}),
    ]
    for expected_core, weights, extra in toy_cases:
        assert_array_equal(expected_core, core_indices(weights, **extra))

    # for non-negative sample_weight, cores should be identical to repetition
    rng = np.random.RandomState(42)
    sample_weight = rng.randint(0, 5, X.shape[0])
    core1, label1 = dbscan(X, sample_weight=sample_weight)
    assert len(label1) == len(X)

    X_repeated = np.repeat(X, sample_weight, axis=0)
    core_repeated, label_repeated = dbscan(X_repeated)
    core_repeated_mask = np.zeros(X_repeated.shape[0], dtype=bool)
    core_repeated_mask[core_repeated] = True
    core_mask = np.zeros(X.shape[0], dtype=bool)
    core_mask[core1] = True
    assert_array_equal(np.repeat(core_mask, sample_weight),
                       core_repeated_mask)

    # sample_weight should work with precomputed distance matrix
    D = pairwise_distances(X)
    core3, label3 = dbscan(D, sample_weight=sample_weight,
                           metric='precomputed')
    assert_array_equal(core1, core3)
    assert_array_equal(label1, label3)

    # sample_weight should work with estimator
    est = DBSCAN().fit(X, sample_weight=sample_weight)
    assert_array_equal(core1, est.core_sample_indices_)
    assert_array_equal(label1, est.labels_)

    est = DBSCAN()
    label5 = est.fit_predict(X, sample_weight=sample_weight)
    assert_array_equal(core1, est.core_sample_indices_)
    assert_array_equal(label1, label5)
    assert_array_equal(label1, est.labels_)
def test_one_hot_encoder_drop_manual(missing_value):
    """Check OneHotEncoder with an explicit per-feature list of dropped categories.

    The last feature's dropped category is ``missing_value``; when that is
    NaN, comparisons involving it are done with ``is_scalar_nan`` instead of
    equality.
    """
    cats_to_drop = ['def', 12, 3, 56, missing_value]
    X = [['abc', 12, 2, 55, 'a'],
         ['def', 12, 1, 55, 'a'],
         ['def', 12, 3, 56, missing_value]]

    enc = OneHotEncoder(drop=cats_to_drop)
    trans = enc.fit_transform(X).toarray()
    exp = [[1, 0, 1, 1, 1],
           [0, 1, 0, 1, 1],
           [0, 0, 0, 0, 0]]
    assert_array_equal(trans, exp)

    dropped_cats = []
    for cat, feature in zip(enc.categories_, enc.drop_idx_):
        dropped_cats.append(cat[feature])
    X_inv_trans = enc.inverse_transform(trans)
    X_array = np.array(X, dtype=object)

    if not is_scalar_nan(cats_to_drop[-1]):
        # plain values: direct equality works everywhere
        assert_array_equal(dropped_cats, cats_to_drop)
        assert_array_equal(X_array, X_inv_trans)
        return

    # last value is np.nan
    assert_array_equal(dropped_cats[:-1], cats_to_drop[:-1])
    assert is_scalar_nan(dropped_cats[-1])
    assert is_scalar_nan(cats_to_drop[-1])
    # do not include the last column which includes missing values
    assert_array_equal(X_array[:, :-1], X_inv_trans[:, :-1])

    # check last column is the missing value
    assert_array_equal(X_array[-1, :-1], X_inv_trans[-1, :-1])
    assert is_scalar_nan(X_array[-1, -1])
    assert is_scalar_nan(X_inv_trans[-1, -1])
def test_agglomerative_clustering():
    """Check ``AgglomerativeClustering`` across linkages and affinities.

    For every linkage the test verifies that 10 clusters are found with a
    cache directory, that disabling ``compute_full_tree`` gives the same
    labelling, that fitting without connectivity still yields 10 clusters,
    and that a connectivity matrix of the wrong shape is rejected.  It then
    checks that ward linkage rejects a non-euclidean affinity, that a fully
    connected structure agrees with the unstructured result for each paired
    distance, and that a precomputed distance matrix gives the same labels.
    """
    # Check that we obtain the correct number of clusters with
    # agglomerative clustering.
    rng = np.random.RandomState(0)
    mask = np.ones([10, 10], dtype=bool)
    n_samples = 100
    X = rng.randn(n_samples, 50)
    connectivity = grid_to_graph(*mask.shape)
    for linkage in ("ward", "complete", "average", "single"):
        clustering = AgglomerativeClustering(n_clusters=10,
                                             connectivity=connectivity,
                                             linkage=linkage)
        clustering.fit(X)

        # test caching
        # Create the directory before entering ``try``: if mkdtemp() itself
        # fails there is nothing to remove, and the ``finally`` clause must
        # not reference an unbound ``tempdir`` (which would mask the error).
        tempdir = mkdtemp()
        try:
            clustering = AgglomerativeClustering(
                n_clusters=10, connectivity=connectivity,
                memory=tempdir,
                linkage=linkage)
            clustering.fit(X)
            labels = clustering.labels_
            assert np.size(np.unique(labels)) == 10
        finally:
            shutil.rmtree(tempdir)

        # Turn caching off now
        clustering = AgglomerativeClustering(
            n_clusters=10, connectivity=connectivity, linkage=linkage)
        # Check that we obtain the same solution with early-stopping of the
        # tree building
        clustering.compute_full_tree = False
        clustering.fit(X)
        assert_almost_equal(normalized_mutual_info_score(clustering.labels_,
                                                         labels), 1)
        clustering.connectivity = None
        clustering.fit(X)
        assert np.size(np.unique(clustering.labels_)) == 10

        # Check that we raise a ValueError when the connectivity matrix
        # (truncated here to 10 x 10) does not match the n_samples rows of X
        clustering = AgglomerativeClustering(
            n_clusters=10,
            connectivity=sparse.lil_matrix(
                connectivity.toarray()[:10, :10]),
            linkage=linkage)
        with pytest.raises(ValueError):
            clustering.fit(X)

    # Test that using ward with another metric than euclidean raises an
    # exception
    clustering = AgglomerativeClustering(
        n_clusters=10,
        connectivity=connectivity.toarray(),
        affinity="manhattan",
        linkage="ward")
    with pytest.raises(ValueError):
        clustering.fit(X)

    # Test using another metric than euclidean works with linkage complete
    for affinity in PAIRED_DISTANCES:
        # Compare our (structured) implementation to scipy
        clustering = AgglomerativeClustering(
            n_clusters=10,
            connectivity=np.ones((n_samples, n_samples)),
            affinity=affinity,
            linkage="complete")
        clustering.fit(X)
        clustering2 = AgglomerativeClustering(
            n_clusters=10,
            connectivity=None,
            affinity=affinity,
            linkage="complete")
        clustering2.fit(X)
        assert_almost_equal(normalized_mutual_info_score(clustering2.labels_,
                                                         clustering.labels_),
                            1)

    # Test that using a distance matrix (affinity = 'precomputed') has same
    # results (with connectivity constraints)
    clustering = AgglomerativeClustering(n_clusters=10,
                                         connectivity=connectivity,
                                         linkage="complete")
    clustering.fit(X)
    X_dist = pairwise_distances(X)
    clustering2 = AgglomerativeClustering(n_clusters=10,
                                          connectivity=connectivity,
                                          affinity='precomputed',
                                          linkage="complete")
    clustering2.fit(X_dist)
    assert_array_equal(clustering.labels_, clustering2.labels_)
def test_multilabel_binarizer_empty_sample():
    """A sample with no labels must map to an all-zero indicator row."""
    label_sets = [[1, 2], [1], []]
    expected_indicator = np.array([[1, 1],
                                   [1, 0],
                                   [0, 0]])
    binarizer = MultiLabelBinarizer()
    assert_array_equal(binarizer.fit_transform(label_sets),
                       expected_indicator)
def test_pipeline_memory():
    """Check that a ``Pipeline`` with ``memory`` caches its transformer.

    A cached pipeline is compared against an uncached one after a first
    fit, after a second fit (the transformer timestamp must not change),
    and against a second cached pipeline whose transformer step has a
    different name (the timestamp must again be the same).
    """
    X = iris.data
    y = iris.target

    def assert_same_results(reference, candidate, candidate_step):
        # Compare every prediction-like output and the fitted means of the
        # transformer step (named 'transf' in the reference pipeline).
        assert_array_equal(reference.predict(X), candidate.predict(X))
        assert_array_equal(reference.predict_proba(X),
                           candidate.predict_proba(X))
        assert_array_equal(reference.predict_log_proba(X),
                           candidate.predict_log_proba(X))
        assert_array_equal(reference.score(X, y), candidate.score(X, y))
        assert_array_equal(reference.named_steps['transf'].means_,
                           candidate.named_steps[candidate_step].means_)

    cachedir = mkdtemp()
    try:
        # Deal with change of API in joblib
        if parse_version(joblib.__version__) < parse_version('0.12'):
            location_kwarg = {'cachedir': cachedir}
        else:
            location_kwarg = {'location': cachedir}
        memory = joblib.Memory(verbose=10, **location_kwarg)

        # Test with Transformer + SVC
        clf = SVC(probability=True, random_state=0)
        transf = DummyTransf()
        pipe = Pipeline([('transf', clone(transf)), ('svc', clf)])
        cached_pipe = Pipeline([('transf', transf), ('svc', clf)],
                               memory=memory)

        # Memoize the transformer at the first fit
        cached_pipe.fit(X, y)
        pipe.fit(X, y)
        # Get the time stamp of the transformer in the cached pipeline
        first_fit_timestamp = cached_pipe.named_steps['transf'].timestamp_
        # Check that cached_pipe and pipe yield identical results
        assert_same_results(pipe, cached_pipe, 'transf')
        assert not hasattr(transf, 'means_')

        # Check that we are reading the cache while fitting
        # a second time
        cached_pipe.fit(X, y)
        # Check that cached_pipe and pipe yield identical results
        assert_same_results(pipe, cached_pipe, 'transf')
        assert (first_fit_timestamp
                == cached_pipe.named_steps['transf'].timestamp_)

        # Create a new pipeline with cloned estimators
        # Check that even changing the name step does not affect the cache hit
        clf_2 = SVC(probability=True, random_state=0)
        transf_2 = DummyTransf()
        cached_pipe_2 = Pipeline([('transf_2', transf_2), ('svc', clf_2)],
                                 memory=memory)
        cached_pipe_2.fit(X, y)

        # Check that cached_pipe and pipe yield identical results
        assert_same_results(pipe, cached_pipe_2, 'transf_2')
        assert (first_fit_timestamp
                == cached_pipe_2.named_steps['transf_2'].timestamp_)
    finally:
        shutil.rmtree(cachedir)
def test_multilabel_binarizer_given_classes():
    """Check ``MultiLabelBinarizer`` when ``classes`` is passed explicitly.

    The column order must follow the given classes, an extra unused class
    adds an all-zero column, a one-shot iterator can be passed to both
    ``fit`` and ``transform``, and duplicate classes raise ``ValueError``.
    """
    inp = [(2, 3), (1, ), (1, 2)]
    indicator_mat = np.array([[0, 1, 1],
                              [1, 0, 0],
                              [1, 0, 1]])

    # fit_transform() first, then fit().transform()
    binarize_variants = (
        lambda est, samples: est.fit_transform(samples),
        lambda est, samples: est.fit(samples).transform(samples),
    )
    for binarize in binarize_variants:
        mlb = MultiLabelBinarizer(classes=[1, 3, 2])
        assert_array_equal(binarize(mlb, inp), indicator_mat)
        assert_array_equal(mlb.classes_, [1, 3, 2])

    # ensure works with extra class
    mlb = MultiLabelBinarizer(classes=[4, 1, 3, 2])
    with_extra_column = np.hstack(([[0], [0], [0]], indicator_mat))
    assert_array_equal(mlb.fit_transform(inp), with_extra_column)
    assert_array_equal(mlb.classes_, [4, 1, 3, 2])

    # ensure fit is no-op as iterable is not consumed
    inp = iter(inp)
    mlb = MultiLabelBinarizer(classes=[1, 3, 2])
    assert_array_equal(mlb.fit(inp).transform(inp), indicator_mat)

    # ensure a ValueError is thrown if given duplicate classes
    err_msg = ("The classes argument contains duplicate classes. Remove "
               "these duplicates before passing them to MultiLabelBinarizer.")
    mlb = MultiLabelBinarizer(classes=[1, 3, 2, 3])
    with pytest.raises(ValueError, match=err_msg):
        mlb.fit(inp)
def test_set_feature_union_step_drop():
    """Check that ``FeatureUnion`` steps can be set to ``'drop'``.

    Steps are dropped one at a time through ``set_params``, restored, and
    finally given as ``'drop'`` at construction time.  After each change the
    transformed output and the feature names are checked, and the fitting
    and transforming calls must not emit any warning.
    """
    # Function-scope import so the block does not depend on the file header.
    import warnings

    mult2 = Mult(2)
    mult2.get_feature_names = lambda: ['x2']
    mult3 = Mult(3)
    mult3.get_feature_names = lambda: ['x3']
    X = np.asarray([[1]])

    ft = FeatureUnion([('m2', mult2), ('m3', mult3)])
    assert_array_equal([[2, 3]], ft.fit(X).transform(X))
    assert_array_equal([[2, 3]], ft.fit_transform(X))
    assert ['m2__x2', 'm3__x3'] == ft.get_feature_names()

    # ``pytest.warns(None)`` is deprecated in pytest 7 and rejected by
    # pytest 8.  Recording with the standard library under an "always"
    # filter captures every warning in the same way, so ``assert not
    # record`` still means "no warning was emitted".
    with warnings.catch_warnings(record=True) as record:
        warnings.simplefilter("always")
        ft.set_params(m2='drop')
        assert_array_equal([[3]], ft.fit(X).transform(X))
        assert_array_equal([[3]], ft.fit_transform(X))
    assert ['m3__x3'] == ft.get_feature_names()
    assert not record

    with warnings.catch_warnings(record=True) as record:
        warnings.simplefilter("always")
        ft.set_params(m3='drop')
        assert_array_equal([[]], ft.fit(X).transform(X))
        assert_array_equal([[]], ft.fit_transform(X))
    assert [] == ft.get_feature_names()
    assert not record

    with warnings.catch_warnings(record=True) as record:
        warnings.simplefilter("always")
        # check we can change back
        ft.set_params(m3=mult3)
        assert_array_equal([[3]], ft.fit(X).transform(X))
    assert not record

    with warnings.catch_warnings(record=True) as record:
        warnings.simplefilter("always")
        # Check 'drop' step at construction time
        ft = FeatureUnion([('m2', 'drop'), ('m3', mult3)])
        assert_array_equal([[3]], ft.fit(X).transform(X))
        assert_array_equal([[3]], ft.fit_transform(X))
    assert ['m3__x3'] == ft.get_feature_names()
    assert not record
def test_label_binarizer():
    """Check ``LabelBinarizer`` on one-class, two-class and multi-class input.

    For each case the fitted ``classes_``, the binarized output and the
    round trip through ``inverse_transform`` are verified; the one-class
    case is checked for both dense and sparse output.
    """
    # one-class case defaults to negative label
    single_class = ["pos", "pos", "pos", "pos"]
    single_class_expected = np.array([[0, 0, 0, 0]]).T

    # For dense case:
    dense_lb = LabelBinarizer(sparse_output=False)
    dense_out = dense_lb.fit_transform(single_class)
    assert_array_equal(dense_lb.classes_, ["pos"])
    assert_array_equal(single_class_expected, dense_out)
    assert_array_equal(dense_lb.inverse_transform(dense_out), single_class)

    # For sparse case:
    sparse_lb = LabelBinarizer(sparse_output=True)
    sparse_out = sparse_lb.fit_transform(single_class)
    assert issparse(sparse_out)
    assert_array_equal(sparse_lb.classes_, ["pos"])
    assert_array_equal(single_class_expected, sparse_out.toarray())
    assert_array_equal(sparse_lb.inverse_transform(sparse_out.toarray()),
                       single_class)

    # The remaining cases reuse one dense binarizer, refitted each time.
    lb = LabelBinarizer(sparse_output=False)

    # two-class case
    two_class = ["neg", "pos", "pos", "neg"]
    two_class_expected = np.array([[0, 1, 1, 0]]).T
    two_class_out = lb.fit_transform(two_class)
    assert_array_equal(lb.classes_, ["neg", "pos"])
    assert_array_equal(two_class_expected, two_class_out)

    to_invert = np.array([[1, 0],
                          [0, 1],
                          [0, 1],
                          [1, 0]])
    assert_array_equal(lb.inverse_transform(to_invert), two_class)

    # multi-class case
    multi_class = ["spam", "ham", "eggs", "ham", "0"]
    multi_class_expected = np.array([[0, 0, 0, 1],
                                     [0, 0, 1, 0],
                                     [0, 1, 0, 0],
                                     [0, 0, 1, 0],
                                     [1, 0, 0, 0]])
    multi_class_out = lb.fit_transform(multi_class)
    assert_array_equal(lb.classes_, ['0', 'eggs', 'ham', 'spam'])
    assert_array_equal(multi_class_expected, multi_class_out)
    assert_array_equal(lb.inverse_transform(multi_class_out), multi_class)
def test_fit_transform(strategy, expected):
    """Fit a 3-bin ordinal ``KBinsDiscretizer`` and check its output.

    The discretizer is fitted on the module-level ``X`` with the given
    ``strategy`` and the transformed ``X`` must equal ``expected``.
    """
    discretizer = KBinsDiscretizer(
        n_bins=3,
        encode='ordinal',
        strategy=strategy,
    )
    discretizer.fit(X)
    binned = discretizer.transform(X)
    assert_array_equal(expected, binned)
def test_unique_labels():
    """Check ``unique_labels`` on multiclass and multilabel-indicator input.

    Verifies the labels returned for single and multiple inputs, and that
    ``ValueError`` is raised when called without arguments or with the
    incompatible input combinations exercised below.
    """
    # No input at all
    with pytest.raises(ValueError):
        unique_labels()

    # Multiclass problem
    assert_array_equal(unique_labels(range(10)), np.arange(10))
    assert_array_equal(unique_labels(np.arange(10)), np.arange(10))
    assert_array_equal(unique_labels([4, 0, 2]), np.array([0, 2, 4]))

    # Multilabel indicator
    assert_array_equal(
        unique_labels(np.array([[0, 0, 1],
                                [1, 0, 1],
                                [0, 0, 0]])),
        np.arange(3))
    assert_array_equal(
        unique_labels(np.array([[0, 0, 1],
                                [0, 0, 0]])),
        np.arange(3))

    # Several arrays passed
    assert_array_equal(unique_labels([4, 0, 2], range(5)), np.arange(5))
    assert_array_equal(unique_labels((0, 1, 2), (0, ), (2, 1)), np.arange(3))

    # Border line case with binary indicator matrix
    with pytest.raises(ValueError):
        unique_labels([4, 0, 2], np.ones((5, 5)))
    with pytest.raises(ValueError):
        unique_labels(np.ones((5, 4)), np.ones((5, 5)))

    assert_array_equal(unique_labels(np.ones((4, 5)), np.ones((5, 5))),
                       np.arange(5))