def test_discretenb_predict_proba(): """Test discrete NB classes' probability scores""" # The 100s below distinguish Bernoulli from multinomial. X_bernoulli = [[1, 100, 0], [0, 1, 0], [0, 100, 1]] X_multinomial = [[0, 1], [1, 3], [4, 0]] # Confirm that the 100s above distinguish Bernoulli from multinomial y = [0, 0, 1] cls_b = BernoulliNB().fit(X_bernoulli, y) cls_m = MultinomialNB().fit(X_bernoulli, y) assert_not_equal(cls_b.predict(X_bernoulli)[-1], cls_m.predict(X_bernoulli)[-1]) # test binary case (1-d output) y = [0, 0, 2] # 2 is regression test for binary case, 02e673 for cls, X in zip([BernoulliNB, MultinomialNB], [X_bernoulli, X_multinomial]): clf = cls().fit(X, y) assert_equal(clf.predict(X[-1]), 2) assert_equal(clf.predict_proba(X[0]).shape, (1, 2)) assert_array_almost_equal(clf.predict_proba(X[:2]).sum(axis=1), np.array([1., 1.]), 6) # test multiclass case (2-d output, must sum to one) y = [0, 1, 2] for cls, X in zip([BernoulliNB, MultinomialNB], [X_bernoulli, X_multinomial]): clf = cls().fit(X, y) assert_equal(clf.predict_proba(X[0]).shape, (1, 3)) assert_equal(clf.predict_proba(X[:2]).shape, (2, 3)) assert_almost_equal(np.sum(clf.predict_proba(X[1])), 1) assert_almost_equal(np.sum(clf.predict_proba(X[-1])), 1) assert_almost_equal(np.sum(np.exp(clf.class_log_prior_)), 1) assert_almost_equal(np.sum(np.exp(clf.intercept_)), 1)
def _dump_svmlight(X, y, f, one_based, comment, query_id): is_sp = int(hasattr(X, "tocsr")) if X.dtype.kind == 'i': value_pattern = u("%d:%d") else: value_pattern = u("%d:%.16g") line_pattern = u("%s") line_pattern += u(" %s\n") for i in range(X.shape[0]): if is_sp: span = slice(X.indptr[i], X.indptr[i + 1]) row = zip(X.indices[span], X.data[span]) else: nz = X[i] != 0 row = zip(np.where(nz)[0], X[i, nz]) s = " ".join(value_pattern % (j + one_based, x) for j, x in row) label = "" first = True for l in y[i]: if not first: label += "," label += str(int(l)) first = False feat = (label, s) f.write((line_pattern % feat).encode('ascii'))
def test_discretenb_predict_proba(): # Test discrete NB classes' probability scores # The 100s below distinguish Bernoulli from multinomial. # FIXME: write a test to show this. X_bernoulli = [[1, 100, 0], [0, 1, 0], [0, 100, 1]] X_multinomial = [[0, 1], [1, 3], [4, 0]] # test binary case (1-d output) y = [0, 0, 2] # 2 is regression test for binary case, 02e673 for cls, X in zip([BernoulliNB, MultinomialNB], [X_bernoulli, X_multinomial]): clf = cls().fit(X, y) assert_equal(clf.predict(X[-1:]), 2) assert_equal(clf.predict_proba([X[0]]).shape, (1, 2)) assert_array_almost_equal(clf.predict_proba(X[:2]).sum(axis=1), np.array([1., 1.]), 6) # test multiclass case (2-d output, must sum to one) y = [0, 1, 2] for cls, X in zip([BernoulliNB, MultinomialNB], [X_bernoulli, X_multinomial]): clf = cls().fit(X, y) assert_equal(clf.predict_proba(X[0:1]).shape, (1, 3)) assert_equal(clf.predict_proba(X[:2]).shape, (2, 3)) assert_almost_equal(np.sum(clf.predict_proba([X[1]])), 1) assert_almost_equal(np.sum(clf.predict_proba([X[-1]])), 1) assert_almost_equal(np.sum(np.exp(clf.class_log_prior_)), 1) assert_almost_equal(np.sum(np.exp(clf.intercept_)), 1)
def test_make_union(): pca = PCA() mock = TransfT() fu = make_union(pca, mock) names, transformers = zip(*fu.transformer_list) assert_equal(names, ("pca", "transft")) assert_equal(transformers, (pca, mock))
def _set_oob_score(self, X, y): n_samples = y.shape[0] predictions = np.zeros((n_samples,)) n_predictions = np.zeros((n_samples,)) for estimator, samples, features in zip(self.estimators_, self.estimators_samples_, self.estimators_features_): mask = np.ones(n_samples, dtype=np.bool) mask[samples] = False predictions[mask] += estimator.predict((X[mask, :])[:, features]) n_predictions[mask] += 1 if (n_predictions == 0).any(): warn("Some inputs do not have OOB scores. " "This probably means too few estimators were used " "to compute any reliable oob estimates.") n_predictions[n_predictions == 0] = 1 predictions /= n_predictions self.oob_prediction_ = predictions self.oob_score_ = r2_score(y, predictions)
def constrained_GMM(X, K, iter_total, fixed_means=None, aligned_covs=None, cov_minmax=None): """ Wrap the sklearn mixture of gaussains. Take one iter at a time, apply the constraint after each step. """ assert (fixed_means is not None) | (aligned_covs is not None) | \ (cov_minmax is not None), '\n\nWhy are you using this code??\n' if cov_minmax is not None: raise ValueError('contraints on variance size not implemented') model = GMM(n_components=K, covariance_type='full', n_iter=1, n_init=1, params='wmc', init_params='wmc') model.fit(X) for i in range(iter_total + 1): if fixed_means is not None: ind = np.array(fixed_means) < np.inf model.means_[ind] = fixed_means[ind] if aligned_covs is not None: for info in aligned_covs: c, inds = zip(info) s = model.covars_[c, inds, inds] model.covars_[c, inds, :] = 0.0 model.covars_[c, :, inds] = 0.0 model.covars_[c, inds, inds] = s if i < iter_total: model = set_new_GMM(model) model.fit(X) return model
def test_make_union(): pca = PCA(svd_solver='full') mock = Transf() fu = make_union(pca, mock) names, transformers = zip(*fu.transformer_list) assert_equal(names, ("pca", "transf")) assert_equal(transformers, (pca, mock))
def test_knn_version_consistency(): if not have_flann: raise SkipTest("No flann, so skipping knn tests.") if not have_accel: raise SkipTest("No skl-groups-accel, so skipping version consistency.") n = 20 for dim in [1, 7]: np.random.seed(47) bags = Features([np.random.randn(np.random.randint(30, 100), dim) for _ in xrange(n)]) div_funcs = ('kl', 'js', 'renyi:.9', 'l2', 'tsallis:.8') Ks = (3, 4) get_est = partial(KNNDivergenceEstimator, div_funcs=div_funcs, Ks=Ks) results = {} for version in ('fast', 'slow', 'best'): est = get_est(version=version) results[version] = res = est.fit_transform(bags) assert res.shape == (len(div_funcs), len(Ks), n, n) assert np.all(np.isfinite(res)) for df, fast, slow in zip(div_funcs, results['fast'], results['slow']): assert_array_almost_equal( fast, slow, decimal=1 if df == 'js' else 5, err_msg="({}, dim {})".format(df, dim)) # TODO: debug JS differences est = get_est(version='fast', n_jobs=-1) res = est.fit_transform(bags) assert np.all(results['fast'] == res) est = get_est(version='slow', n_jobs=-1) res = est.fit_transform(bags) assert np.all(results['slow'] == res)
def test_class_weight_auto_classifiers(): """Test that class_weight="auto" improves f1-score""" # This test is broken; its success depends on: # * a rare fortuitous RNG seed for make_classification; and # * the use of binary F1 over a seemingly arbitrary positive class for two # datasets, and weighted average F1 for the third. # Its expectations need to be clarified and reimplemented. raise SkipTest("This test requires redefinition") classifiers = all_estimators(type_filter="classifier") clean_warning_registry() with warnings.catch_warnings(record=True): classifiers = [c for c in classifiers if "class_weight" in c[1]().get_params().keys()] for n_classes, weights in zip([2, 3], [[0.8, 0.2], [0.8, 0.1, 0.1]]): # create unbalanced dataset X, y = make_classification( n_classes=n_classes, n_samples=200, n_features=10, weights=weights, random_state=0, n_informative=n_classes ) X = StandardScaler().fit_transform(X) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0) for name, Classifier in classifiers: if ( name != "NuSVC" # the sparse version has a parameter that doesn't do anything and not name.startswith("RidgeClassifier") # RidgeClassifier behaves unexpected # FIXME! and not name.endswith("NB") ): # NaiveBayes classifiers have a somewhat different interface. # FIXME SOON! yield (check_class_weight_auto_classifiers, name, Classifier, X_train, y_train, X_test, y_test, weights)
def test_make_moons(): X, y = make_moons(3, shuffle=False) for x, label in zip(X, y): center = [0.0, 0.0] if label == 0 else [1.0, 0.5] dist_sqr = ((x - center) ** 2).sum() assert_almost_equal(dist_sqr, 1.0, err_msg="Point is not on expected unit circle")
def test_shuffle_kfold_stratifiedkfold_reproducibility(): # Check that when the shuffle is True multiple split calls produce the # same split when random_state is set X = np.ones(15) # Divisible by 3 y = [0] * 7 + [1] * 8 X2 = np.ones(16) # Not divisible by 3 y2 = [0] * 8 + [1] * 8 kf = KFold(3, shuffle=True, random_state=0) skf = StratifiedKFold(3, shuffle=True, random_state=0) for cv in (kf, skf): np.testing.assert_equal(list(cv.split(X, y)), list(cv.split(X, y))) np.testing.assert_equal(list(cv.split(X2, y2)), list(cv.split(X2, y2))) kf = KFold(3, shuffle=True) skf = StratifiedKFold(3, shuffle=True) for cv in (kf, skf): for data in zip((X, X2), (y, y2)): try: np.testing.assert_equal(list(cv.split(*data)), list(cv.split(*data))) except AssertionError: pass else: raise AssertionError("The splits for data, %s, are same even " "when random state is not set" % data)
def _log_multivariate_normal_density_full(X, means, covars, min_covar=1.e-7): """Log probability for full covariance matrices. """ from scipy import linalg if hasattr(linalg, 'solve_triangular'): # only in scipy since 0.9 solve_triangular = linalg.solve_triangular else: # slower, but works solve_triangular = linalg.solve n_samples, n_dim = X.shape nmix = len(means) log_prob = np.empty((n_samples, nmix)) for c, (mu, cv) in enumerate(zip(means, covars)): try: cv_chol = linalg.cholesky(cv, lower=True) except linalg.LinAlgError: # The model is most probabily stuck in a component with too # few observations, we need to reinitialize this components cv_chol = linalg.cholesky(cv + min_covar * np.eye(n_dim), lower=True) cv_log_det = 2 * np.sum(np.log(np.diagonal(cv_chol))) cv_sol = solve_triangular(cv_chol, (X - mu).T, lower=True).T log_prob[:, c] = - .5 * (np.sum(cv_sol ** 2, axis=1) + n_dim * np.log(2 * np.pi) + cv_log_det) return log_prob
def _log_multivariate_normal_density_full(X, means, covars, min_covar=1.e-7): """Log probability for full covariance matrices. """ n_samples, n_dim = X.shape nmix = len(means) log_prob = np.empty((n_samples, nmix)) for c, (mu, cv) in enumerate(zip(means, covars)): try: cv_chol = linalg.cholesky(cv, lower=True) except linalg.LinAlgError: # The model is most probably stuck in a component with too # few observations, we need to reinitialize this components try: cv_chol = linalg.cholesky(cv + min_covar * np.eye(n_dim), lower=True) except linalg.LinAlgError: raise ValueError("'covars' must be symmetric, " "positive-definite") cv_log_det = 2 * np.sum(np.log(np.diagonal(cv_chol))) cv_sol = linalg.solve_triangular(cv_chol, (X - mu).T, lower=True).T log_prob[:, c] = - .5 * (np.sum(cv_sol ** 2, axis=1) + n_dim * np.log(2 * np.pi) + cv_log_det) return log_prob
def test_cross_val_generator_mask_indices_same(): # Test that the cross validation generators return the same results when # indices=True and when indices=False y = np.array([0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2]) labels = np.array([1, 1, 2, 3, 3, 3, 4]) loo_mask = cval.LeaveOneOut(5, indices=False) loo_ind = cval.LeaveOneOut(5, indices=True) lpo_mask = cval.LeavePOut(10, 2, indices=False) lpo_ind = cval.LeavePOut(10, 2, indices=True) kf_mask = cval.KFold(10, 5, indices=False, shuffle=True, random_state=1) kf_ind = cval.KFold(10, 5, indices=True, shuffle=True, random_state=1) skf_mask = cval.StratifiedKFold(y, 3, indices=False) skf_ind = cval.StratifiedKFold(y, 3, indices=True) lolo_mask = cval.LeaveOneLabelOut(labels, indices=False) lolo_ind = cval.LeaveOneLabelOut(labels, indices=True) lopo_mask = cval.LeavePLabelOut(labels, 2, indices=False) lopo_ind = cval.LeavePLabelOut(labels, 2, indices=True) for cv_mask, cv_ind in [(loo_mask, loo_ind), (lpo_mask, lpo_ind), (kf_mask, kf_ind), (skf_mask, skf_ind), (lolo_mask, lolo_ind), (lopo_mask, lopo_ind)]: for (train_mask, test_mask), (train_ind, test_ind) in \ zip(cv_mask, cv_ind): assert_array_equal(np.where(train_mask)[0], train_ind) assert_array_equal(np.where(test_mask)[0], test_ind)
def _check_transformer(name, Transformer, X, y): if name in ("CCA", "LocallyLinearEmbedding", "KernelPCA") and _is_32bit(): # Those transformers yield non-deterministic output when executed on # a 32bit Python. The same transformers are stable on 64bit Python. # FIXME: try to isolate a minimalistic reproduction case only depending # on numpy & scipy and/or maybe generate a test dataset that does not # cause such unstable behaviors. msg = name + " is non deterministic on 32bit Python" raise SkipTest(msg) n_samples, n_features = np.asarray(X).shape # catch deprecation warnings with warnings.catch_warnings(record=True): transformer = Transformer() set_random_state(transformer) set_fast_parameters(transformer) # fit if name in CROSS_DECOMPOSITION: y_ = np.c_[y, y] y_[::2, 1] *= 2 else: y_ = y transformer.fit(X, y_) X_pred = transformer.fit_transform(X, y=y_) if isinstance(X_pred, tuple): for x_pred in X_pred: assert_equal(x_pred.shape[0], n_samples) else: assert_equal(X_pred.shape[0], n_samples) if hasattr(transformer, "transform"): if name in CROSS_DECOMPOSITION: X_pred2 = transformer.transform(X, y_) X_pred3 = transformer.fit_transform(X, y=y_) else: X_pred2 = transformer.transform(X) X_pred3 = transformer.fit_transform(X, y=y_) if isinstance(X_pred, tuple) and isinstance(X_pred2, tuple): for x_pred, x_pred2, x_pred3 in zip(X_pred, X_pred2, X_pred3): assert_array_almost_equal( x_pred, x_pred2, 2, "fit_transform and transform outcomes not consistent in %s" % Transformer ) assert_array_almost_equal( x_pred, x_pred3, 2, "consecutive fit_transform outcomes not consistent in %s" % Transformer ) else: assert_array_almost_equal( X_pred, X_pred2, 2, "fit_transform and transform outcomes not consistent in %s" % Transformer ) assert_array_almost_equal( X_pred, X_pred3, 2, "consecutive fit_transform outcomes not consistent in %s" % Transformer ) # raises error on malformed input for transform if hasattr(X, "T"): # If it's not an array, it does not have a 'T' property assert_raises(ValueError, transformer.transform, X.T)
def _check_transformer(name, Transformer, X, y): n_samples, n_features = np.asarray(X).shape # catch deprecation warnings with warnings.catch_warnings(record=True): transformer = Transformer() set_random_state(transformer) if name == "KernelPCA": transformer.remove_zero_eig = False set_fast_parameters(transformer) # fit if name in CROSS_DECOMPOSITION: y_ = np.c_[y, y] y_[::2, 1] *= 2 else: y_ = y transformer.fit(X, y_) X_pred = transformer.fit_transform(X, y=y_) if isinstance(X_pred, tuple): for x_pred in X_pred: assert_equal(x_pred.shape[0], n_samples) else: assert_equal(X_pred.shape[0], n_samples) if hasattr(transformer, 'transform'): if name in CROSS_DECOMPOSITION: X_pred2 = transformer.transform(X, y_) X_pred3 = transformer.fit_transform(X, y=y_) else: X_pred2 = transformer.transform(X) X_pred3 = transformer.fit_transform(X, y=y_) if isinstance(X_pred, tuple) and isinstance(X_pred2, tuple): for x_pred, x_pred2, x_pred3 in zip(X_pred, X_pred2, X_pred3): assert_array_almost_equal( x_pred, x_pred2, 2, "fit_transform and transform outcomes not consistent in %s" % Transformer) assert_array_almost_equal( x_pred, x_pred3, 2, "consecutive fit_transform outcomes not consistent in %s" % Transformer) else: assert_array_almost_equal( X_pred, X_pred2, 2, "fit_transform and transform outcomes not consistent in %s" % Transformer) assert_array_almost_equal( X_pred, X_pred3, 2, "consecutive fit_transform outcomes not consistent in %s" % Transformer) # raises error on malformed input for transform if hasattr(X, 'T'): # If it's not an array, it does not have a 'T' property assert_raises(ValueError, transformer.transform, X.T)
def test_shuffle_stratifiedkfold(): # Check that shuffling is happening when requested, and for proper # sample coverage labels = [0] * 20 + [1] * 20 kf0 = list(cval.StratifiedKFold(labels, 5, shuffle=True, random_state=0)) kf1 = list(cval.StratifiedKFold(labels, 5, shuffle=True, random_state=1)) for (_, test0), (_, test1) in zip(kf0, kf1): assert_true(set(test0) != set(test1)) check_cv_coverage(kf0, expected_n_iter=5, n_samples=40)
def test_make_multilabel_classification_return_indicator(): for allow_unlabeled, min_length in zip((True, False), (0, 1)): X, Y = make_multilabel_classification(n_samples=25, n_features=20, n_classes=3, random_state=0, return_indicator=True, allow_unlabeled=allow_unlabeled) assert_equal(X.shape, (25, 20), "X shape mismatch") assert_equal(Y.shape, (25, 3), "Y shape mismatch") assert_true(np.all(np.sum(Y, axis=0) > min_length))
def test_make_multilabel_classification_return_indicator_sparse(): for allow_unlabeled, min_length in zip((True, False), (0, 1)): X, Y = make_multilabel_classification(n_samples=25, n_features=20, n_classes=3, random_state=0, return_indicator='sparse', allow_unlabeled=allow_unlabeled) assert_equal(X.shape, (25, 20), "X shape mismatch") assert_equal(Y.shape, (25, 3), "Y shape mismatch") assert_true(sp.issparse(Y))
def test_make_multilabel_classification(): for allow_unlabeled, min_length in zip((True, False), (0, 1)): X, Y = make_multilabel_classification(n_samples=100, n_features=20, n_classes=3, random_state=0, allow_unlabeled=allow_unlabeled) assert_equal(X.shape, (100, 20), "X shape mismatch") if not allow_unlabeled: assert_equal(max([max(y) for y in Y]), 2) assert_equal(min([len(y) for y in Y]), min_length) assert_true(max([len(y) for y in Y]) <= 3)
def test_shuffle_stratifiedkfold(): # Check that shuffling is happening when requested, and for proper # sample coverage X_40 = np.ones(40) y = [0] * 20 + [1] * 20 kf0 = StratifiedKFold(5, shuffle=True, random_state=0) kf1 = StratifiedKFold(5, shuffle=True, random_state=1) for (_, test0), (_, test1) in zip(kf0.split(X_40, y), kf1.split(X_40, y)): assert_not_equal(set(test0), set(test1)) check_cv_coverage(kf0, X_40, y, labels=None, expected_n_iter=5)
def test_grid_search_cv_results(): X, y = make_classification(n_samples=50, n_features=4, random_state=42) n_splits = 3 n_grid_points = 6 params = [dict(kernel=['rbf', ], C=[1, 10], gamma=[0.1, 1]), dict(kernel=['poly', ], degree=[1, 2])] grid_search = GridSearchCV(SVC(), cv=n_splits, iid=False, param_grid=params) grid_search.fit(X, y) grid_search_iid = GridSearchCV(SVC(), cv=n_splits, iid=True, param_grid=params) grid_search_iid.fit(X, y) param_keys = ('param_C', 'param_degree', 'param_gamma', 'param_kernel') score_keys = ('mean_test_score', 'mean_train_score', 'rank_test_score', 'split0_test_score', 'split1_test_score', 'split2_test_score', 'split0_train_score', 'split1_train_score', 'split2_train_score', 'std_test_score', 'std_train_score', 'mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time') n_candidates = n_grid_points for search, iid in zip((grid_search, grid_search_iid), (False, True)): assert_equal(iid, search.iid) cv_results = search.cv_results_ # Check if score and timing are reasonable assert_true(all(cv_results['rank_test_score'] >= 1)) assert_true(all(cv_results[k] >= 0) for k in score_keys if k is not 'rank_test_score') assert_true(all(cv_results[k] <= 1) for k in score_keys if 'time' not in k and k is not 'rank_test_score') # Check cv_results structure check_cv_results_array_types(cv_results, param_keys, score_keys) check_cv_results_keys(cv_results, param_keys, score_keys, n_candidates) # Check masking cv_results = grid_search.cv_results_ n_candidates = len(grid_search.cv_results_['params']) assert_true(all((cv_results['param_C'].mask[i] and cv_results['param_gamma'].mask[i] and not cv_results['param_degree'].mask[i]) for i in range(n_candidates) if cv_results['param_kernel'][i] == 'linear')) assert_true(all((not cv_results['param_C'].mask[i] and not cv_results['param_gamma'].mask[i] and cv_results['param_degree'].mask[i]) for i in range(n_candidates) if cv_results['param_kernel'][i] == 'rbf')) check_cv_results_grid_scores_consistency(search)
def test_make_blobs(): cluster_stds = np.array([0.05, 0.2, 0.4]) cluster_centers = np.array([[0.0, 0.0], [1.0, 1.0], [0.0, 1.0]]) X, y = make_blobs(random_state=0, n_samples=50, n_features=2, centers=cluster_centers, cluster_std=cluster_stds) assert_equal(X.shape, (50, 2), "X shape mismatch") assert_equal(y.shape, (50,), "y shape mismatch") assert_equal(np.unique(y).shape, (3,), "Unexpected number of blobs") for i, (ctr, std) in enumerate(zip(cluster_centers, cluster_stds)): assert_almost_equal((X[y == i] - ctr).std(), std, 1, "Unexpected std")
def check_parameters_default_constructible(name, Estimator): classifier = LDA() # test default-constructibility # get rid of deprecation warnings with warnings.catch_warnings(record=True): if name in META_ESTIMATORS: estimator = Estimator(classifier) else: estimator = Estimator() # test cloning clone(estimator) # test __repr__ repr(estimator) # test that set_params returns self assert_true(estimator.set_params() is estimator) # test if init does nothing but set parameters # this is important for grid_search etc. # We get the default parameters from init and then # compare these against the actual values of the attributes. # this comes from getattr. Gets rid of deprecation decorator. init = getattr(estimator.__init__, 'deprecated_original', estimator.__init__) try: args, varargs, kws, defaults = inspect.getargspec(init) except TypeError: # init is not a python function. # true for mixins return params = estimator.get_params() if name in META_ESTIMATORS: # they need a non-default argument args = args[2:] else: args = args[1:] if args: # non-empty list assert_equal(len(args), len(defaults)) else: return for arg, default in zip(args, defaults): assert_in(type(default), [str, int, float, bool, tuple, type(None), np.float64, types.FunctionType, Memory]) if arg not in params.keys(): # deprecated parameter, not in get_params assert_true(default is None) continue if isinstance(params[arg], np.ndarray): assert_array_equal(params[arg], default) else: assert_equal(params[arg], default)
def test_make_blobs_n_samples_list_with_centers(): n_samples = [20, 20, 20] centers = np.array([[0.0, 0.0], [1.0, 1.0], [0.0, 1.0]]) cluster_stds = np.array([0.05, 0.2, 0.4]) X, y = make_blobs(n_samples=n_samples, centers=centers, cluster_std=cluster_stds, random_state=0) assert X.shape == (sum(n_samples), 2), "X shape mismatch" assert all(np.bincount(y, minlength=len(n_samples)) == n_samples), \ "Incorrect number of samples per blob" for i, (ctr, std) in enumerate(zip(centers, cluster_stds)): assert_almost_equal((X[y == i] - ctr).std(), std, 1, "Unexpected std")
def test_cross_validator_with_default_params(): n_samples = 4 n_unique_groups = 4 n_splits = 2 p = 2 n_shuffle_splits = 10 # (the default value) X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]]) X_1d = np.array([1, 2, 3, 4]) y = np.array([1, 1, 2, 2]) groups = np.array([1, 2, 3, 4]) loo = LeaveOneOut() lpo = LeavePOut(p) kf = KFold(n_splits) skf = StratifiedKFold(n_splits) lolo = LeaveOneGroupOut() lopo = LeavePGroupsOut(p) ss = ShuffleSplit(random_state=0) ps = PredefinedSplit([1, 1, 2, 2]) # n_splits = np of unique folds = 2 loo_repr = "LeaveOneOut()" lpo_repr = "LeavePOut(p=2)" kf_repr = "KFold(n_splits=2, random_state=None, shuffle=False)" skf_repr = "StratifiedKFold(n_splits=2, random_state=None, shuffle=False)" lolo_repr = "LeaveOneGroupOut()" lopo_repr = "LeavePGroupsOut(n_groups=2)" ss_repr = ("ShuffleSplit(n_splits=10, random_state=0, test_size=0.1, " "train_size=None)") ps_repr = "PredefinedSplit(test_fold=array([1, 1, 2, 2]))" n_splits_expected = [n_samples, comb(n_samples, p), n_splits, n_splits, n_unique_groups, comb(n_unique_groups, p), n_shuffle_splits, 2] for i, (cv, cv_repr) in enumerate(zip( [loo, lpo, kf, skf, lolo, lopo, ss, ps], [loo_repr, lpo_repr, kf_repr, skf_repr, lolo_repr, lopo_repr, ss_repr, ps_repr])): # Test if get_n_splits works correctly assert_equal(n_splits_expected[i], cv.get_n_splits(X, y, groups)) # Test if the cross-validator works as expected even if # the data is 1d np.testing.assert_equal(list(cv.split(X, y, groups)), list(cv.split(X_1d, y, groups))) # Test that train, test indices returned are integers for train, test in cv.split(X, y, groups): assert_equal(np.asarray(train).dtype.kind, 'i') assert_equal(np.asarray(train).dtype.kind, 'i') # Test if the repr works without any errors assert_equal(cv_repr, repr(cv))
def _parallel_predict_proba(estimators, estimators_features, X, n_classes, combination, estimators_weight): """Private function used to compute (proba-)predictions within a job.""" n_samples = X.shape[0] proba = np.zeros((n_samples, n_classes)) for estimator, features, weight in zip(estimators, estimators_features, estimators_weight): proba_estimator = estimator.predict_proba(X[:, features]) if combination in ['weighted_voting', 'weighted_bmr']: proba += proba_estimator * weight else: proba += proba_estimator return proba
def test_shuffle_split(): ss1 = cval.ShuffleSplit(10, test_size=0.2, random_state=0) ss2 = cval.ShuffleSplit(10, test_size=2, random_state=0) ss3 = cval.ShuffleSplit(10, test_size=np.int32(2), random_state=0) for typ in six.integer_types: ss4 = cval.ShuffleSplit(10, test_size=typ(2), random_state=0) for t1, t2, t3, t4 in zip(ss1, ss2, ss3, ss4): assert_array_equal(t1[0], t2[0]) assert_array_equal(t2[0], t3[0]) assert_array_equal(t3[0], t4[0]) assert_array_equal(t1[1], t2[1]) assert_array_equal(t2[1], t3[1]) assert_array_equal(t3[1], t4[1])
def staged_decision_function(self, X): """Compute decision function of ``X`` for each boosting iteration. This method allows monitoring (i.e. determine error on testing set) after each boosting iteration. Parameters ---------- X : array-like of shape = [n_samples, n_features] The input samples. Returns ------- score : generator of array, shape = [n_samples, k] The decision function of the input samples. The order of outputs is the same of that of the `classes_` attribute. Binary classification is a special cases with ``k == 1``, otherwise ``k==n_classes``. For binary classification, values closer to -1 or 1 mean more like the first or second class in ``classes_``, respectively. """ self._check_fitted() X = np.asarray(X) n_classes = self.n_classes_ classes = self.classes_[:, np.newaxis] pred = None norm = 0. for weight, estimator in zip(self.estimator_weights_, self.estimators_): norm += weight if self.algorithm == 'SAMME.R': # The weights are all 1. for SAMME.R current_pred = _samme_proba(estimator, n_classes, X) else: # elif self.algorithm == "SAMME": current_pred = estimator.predict(X) current_pred = (current_pred == classes).T * weight if pred is None: pred = current_pred else: pred += current_pred if n_classes == 2: tmp_pred = np.copy(pred) tmp_pred[:, 0] *= -1 yield (tmp_pred / norm).sum(axis=1) else: yield pred / norm
def test_leave_label_out_changing_labels(): # Check that LeaveOneLabelOut and LeavePLabelOut work normally if # the labels variable is changed before calling __iter__ labels = np.array([0, 1, 2, 1, 1, 2, 0, 0]) labels_changing = np.array(labels, copy=True) lolo = cval.LeaveOneLabelOut(labels) lolo_changing = cval.LeaveOneLabelOut(labels_changing) lplo = cval.LeavePLabelOut(labels, p=2) lplo_changing = cval.LeavePLabelOut(labels_changing, p=2) labels_changing[:] = 0 for llo, llo_changing in [(lolo, lolo_changing), (lplo, lplo_changing)]: for (train, test), (train_chan, test_chan) in zip(llo, llo_changing): assert_array_equal(train, train_chan) assert_array_equal(test, test_chan)
def test_shuffle_kfold(): # Check the indices are shuffled properly kf = KFold(3) kf2 = KFold(3, shuffle=True, random_state=0) kf3 = KFold(3, shuffle=True, random_state=1) X = np.ones(300) all_folds = np.zeros(300) for (tr1, te1), (tr2, te2), (tr3, te3) in zip(kf.split(X), kf2.split(X), kf3.split(X)): for tr_a, tr_b in combinations((tr1, tr2, tr3), 2): # Assert that there is no complete overlap assert_not_equal(len(np.intersect1d(tr_a, tr_b)), len(tr1)) # Set all test indices in successive iterations of kf2 to 1 all_folds[te2] = 1 # Check that all indices are returned in the different test folds assert_equal(sum(all_folds), 300)
def test_leave_group_out_changing_groups(): # Check that LeaveOneGroupOut and LeavePGroupsOut work normally if # the groups variable is changed before calling split groups = np.array([0, 1, 2, 1, 1, 2, 0, 0]) X = np.ones(len(groups)) groups_changing = np.array(groups, copy=True) lolo = LeaveOneGroupOut().split(X, groups=groups) lolo_changing = LeaveOneGroupOut().split(X, groups=groups) lplo = LeavePGroupsOut(n_groups=2).split(X, groups=groups) lplo_changing = LeavePGroupsOut(n_groups=2).split(X, groups=groups) groups_changing[:] = 0 for llo, llo_changing in [(lolo, lolo_changing), (lplo, lplo_changing)]: for (train, test), (train_chan, test_chan) in zip(llo, llo_changing): assert_array_equal(train, train_chan) assert_array_equal(test, test_chan) # n_splits = no of 2 (p) group combinations of the unique groups = 3C2 = 3 assert_equal(3, LeavePGroupsOut(n_groups=2).get_n_splits(X, y, groups)) # n_splits = no of unique groups (C(uniq_lbls, 1) = n_unique_groups) assert_equal(3, LeaveOneGroupOut().get_n_splits(X, y, groups))
def test_make_multilabel_classification_return_indicator(): for allow_unlabeled, min_length in zip((True, False), (0, 1)): X, Y = make_multilabel_classification(n_samples=25, n_features=20, n_classes=3, random_state=0, allow_unlabeled=allow_unlabeled) assert_equal(X.shape, (25, 20), "X shape mismatch") assert_equal(Y.shape, (25, 3), "Y shape mismatch") assert_true(np.all(np.sum(Y, axis=0) > min_length)) # Also test return_distributions and return_indicator with True X2, Y2, p_c, p_w_c = make_multilabel_classification( n_samples=25, n_features=20, n_classes=3, random_state=0, allow_unlabeled=allow_unlabeled, return_distributions=True) assert_array_equal(X, X2) assert_array_equal(Y, Y2) assert_equal(p_c.shape, (3,)) assert_almost_equal(p_c.sum(), 1) assert_equal(p_w_c.shape, (20, 3)) assert_almost_equal(p_w_c.sum(axis=0), [1] * 3)
def _log_multivariate_normal_density_full(X, means, covars, min_covar=1.e-7): """Log probability for full covariance matrices. """ n_samples, n_dim = X.shape nmix = len(means) log_prob = np.empty((n_samples, nmix)) for c, (mu, cv) in enumerate(zip(means, covars)): try: cv_chol = linalg.cholesky(cv, lower=True) except linalg.LinAlgError: # The model is most probably stuck in a component with too # few observations, we need to reinitialize this components cv_chol = linalg.cholesky(cv + min_covar * np.eye(n_dim), lower=True) cv_log_det = 2 * np.sum(np.log(np.diagonal(cv_chol))) cv_sol = linalg.solve_triangular(cv_chol, (X - mu).T, lower=True).T log_prob[:, c] = -.5 * (np.sum(cv_sol**2, axis=1) + n_dim * np.log(2 * np.pi) + cv_log_det) return log_prob
def test_leave_label_out_changing_labels(): # Check that LeaveOneLabelOut and LeavePLabelOut work normally if # the labels variable is changed before calling split labels = np.array([0, 1, 2, 1, 1, 2, 0, 0]) X = np.ones(len(labels)) labels_changing = np.array(labels, copy=True) lolo = LeaveOneLabelOut().split(X, labels=labels) lolo_changing = LeaveOneLabelOut().split(X, labels=labels) lplo = LeavePLabelOut(n_labels=2).split(X, labels=labels) lplo_changing = LeavePLabelOut(n_labels=2).split(X, labels=labels) labels_changing[:] = 0 for llo, llo_changing in [(lolo, lolo_changing), (lplo, lplo_changing)]: for (train, test), (train_chan, test_chan) in zip(llo, llo_changing): assert_array_equal(train, train_chan) assert_array_equal(test, test_chan) # n_splits = no of 2 (p) label combinations of the unique labels = 3C2 = 3 assert_equal(3, LeavePLabelOut(n_labels=2).get_n_splits(X, y, labels)) # n_splits = no of unique labels (C(uniq_lbls, 1) = n_unique_labels) assert_equal(3, LeaveOneLabelOut().get_n_splits(X, y, labels))
def test_grid_search_results(): X, y = make_classification(n_samples=50, n_features=4, random_state=42) n_folds = 3 n_grid_points = 6 params = [dict(kernel=['rbf', ], C=[1, 10], gamma=[0.1, 1]), dict(kernel=['poly', ], degree=[1, 2])] grid_search = GridSearchCV(SVC(), cv=n_folds, iid=False, param_grid=params) grid_search.fit(X, y) grid_search_iid = GridSearchCV(SVC(), cv=n_folds, iid=True, param_grid=params) grid_search_iid.fit(X, y) param_keys = ('param_C', 'param_degree', 'param_gamma', 'param_kernel') score_keys = ('test_mean_score', 'test_rank_score', 'test_split0_score', 'test_split1_score', 'test_split2_score', 'test_std_score') n_candidates = n_grid_points for search, iid in zip((grid_search, grid_search_iid), (False, True)): assert_equal(iid, search.iid) results = search.results_ # Check results structure check_results_array_types(results, param_keys, score_keys) check_results_keys(results, param_keys, score_keys, n_candidates) # Check masking results = grid_search.results_ n_candidates = len(grid_search.results_['params']) assert_true(all((results['param_C'].mask[i] and results['param_gamma'].mask[i] and not results['param_degree'].mask[i]) for i in range(n_candidates) if results['param_kernel'][i] == 'linear')) assert_true(all((not results['param_C'].mask[i] and not results['param_gamma'].mask[i] and results['param_degree'].mask[i]) for i in range(n_candidates) if results['param_kernel'][i] == 'rbf')) check_results_grid_scores_consistency(search)
def test_ovr_multilabel_dataset(): base_clf = MultinomialNB(alpha=1) for au, prec, recall in zip((True, False), (0.65, 0.74), (0.72, 0.84)): X, Y = datasets.make_multilabel_classification(n_samples=100, n_features=20, n_classes=5, n_labels=2, length=50, allow_unlabeled=au, random_state=0) X_train, Y_train = X[:80], Y[:80] X_test, Y_test = X[80:], Y[80:] clf = OneVsRestClassifier(base_clf).fit(X_train, Y_train) Y_pred = clf.predict(X_test) assert_true(clf.multilabel_) assert_almost_equal(precision_score(Y_test, Y_pred, average="micro"), prec, decimal=2) assert_almost_equal(recall_score(Y_test, Y_pred, average="micro"), recall, decimal=2)
def feature_importances_(self): """Return the feature importances (the higher, the more important the feature). Returns ------- feature_importances_ : array, shape = [n_features] """ if self.estimators_ is None or len(self.estimators_) == 0: raise ValueError("Estimator not fitted, " "call `fit` before `feature_importances_`.") try: norm = self.estimator_weights_.sum() return (sum(weight * clf.feature_importances_ for weight, clf in zip(self.estimator_weights_, self.estimators_)) / norm) except AttributeError: raise AttributeError("Unable to compute feature importances " "since base_estimator does not have a " "feature_importances_ attribute")
def predict_proba(self, X): """Predict class probabilities for X. The predicted class probabilities of an input sample is computed as the weighted mean predicted class probabilities of the classifiers in the ensemble. Parameters ---------- X : {array-like, sparse matrix} of shape = [n_samples, n_features] The training input samples. Sparse matrix can be CSC, CSR, COO, DOK, or LIL. DOK and LIL are converted to CSR. Returns ------- p : array of shape = [n_samples] The class probabilities of the input samples. The order of outputs is the same of that of the `classes_` attribute. """ check_is_fitted(self, "n_classes_") n_classes = self.n_classes_ X = self._validate_X_predict(X) if n_classes == 1: return np.ones((X.shape[0], 1)) if self.algorithm == 'SAMME.R': # The weights are all 1. for SAMME.R proba = sum(_samme_proba(estimator, n_classes, X) for estimator in self.estimators_) else: # self.algorithm == "SAMME" proba = sum(estimator.predict_proba(X) * w for estimator, w in zip(self.estimators_, self.estimator_weights_)) proba /= self.estimator_weights_.sum() proba = np.exp((1. / (n_classes - 1)) * proba) normalizer = proba.sum(axis=1)[:, np.newaxis] normalizer[normalizer == 0.0] = 1.0 proba /= normalizer return proba
def test_train_test_split(): X = np.arange(100).reshape((10, 10)) X_s = coo_matrix(X) y = np.arange(10) # simple test split = train_test_split(X, y, test_size=None, train_size=.5) X_train, X_test, y_train, y_test = split assert_equal(len(y_test), len(y_train)) # test correspondence of X and y assert_array_equal(X_train[:, 0], y_train * 10) assert_array_equal(X_test[:, 0], y_test * 10) # don't convert lists to anything else by default split = train_test_split(X, X_s, y.tolist()) X_train, X_test, X_s_train, X_s_test, y_train, y_test = split assert_true(isinstance(y_train, list)) assert_true(isinstance(y_test, list)) # allow nd-arrays X_4d = np.arange(10 * 5 * 3 * 2).reshape(10, 5, 3, 2) y_3d = np.arange(10 * 7 * 11).reshape(10, 7, 11) split = train_test_split(X_4d, y_3d) assert_equal(split[0].shape, (7, 5, 3, 2)) assert_equal(split[1].shape, (3, 5, 3, 2)) assert_equal(split[2].shape, (7, 7, 11)) assert_equal(split[3].shape, (3, 7, 11)) # test stratification option y = np.array([1, 1, 1, 1, 2, 2, 2, 2]) for test_size, exp_test_size in zip([2, 4, 0.25, 0.5, 0.75], [2, 4, 2, 4, 6]): train, test = train_test_split(y, test_size=test_size, stratify=y, random_state=0) assert_equal(len(test), exp_test_size) assert_equal(len(test) + len(train), len(y)) # check the 1:1 ratio of ones and twos in the data is preserved assert_equal(np.sum(train == 1), np.sum(train == 2))
def decision_function(self, X): """Compute the decision function of ``X``. Parameters ---------- X : array-like of shape = [n_samples, n_features] The input samples. Returns ------- score : array, shape = [n_samples, k] The decision function of the input samples. The order of outputs is the same of that of the `classes_` attribute. Binary classification is a special cases with ``k == 1``, otherwise ``k==n_classes``. For binary classification, values closer to -1 or 1 mean more like the first or second class in ``classes_``, respectively. """ self._check_fitted() X = np.asarray(X) n_classes = self.n_classes_ classes = self.classes_[:, np.newaxis] pred = None if self.algorithm == 'SAMME.R': # The weights are all 1. for SAMME.R pred = sum( _samme_proba(estimator, n_classes, X) for estimator in self.estimators_) else: # self.algorithm == "SAMME" pred = sum((estimator.predict(X) == classes).T * w for estimator, w in zip(self.estimators_, self.estimator_weights_)) pred /= self.estimator_weights_.sum() if n_classes == 2: pred[:, 0] *= -1 return pred.sum(axis=1) return pred
def _incremental_fit_estimator(estimator, X, y, classes, train, test, train_sizes, scorer, verbose): """Train estimator on training subsets incrementally and compute scores.""" train_scores, test_scores = [], [] partitions = zip(train_sizes, np.split(train, train_sizes)[:-1]) for n_train_samples, partial_train in partitions: train_subset = train[:n_train_samples] # NOTE: wrapper patch X_train, y_train = _patch_split(estimator, X, y, train_subset) X_partial_train, y_partial_train = _patch_split(estimator, X, y, partial_train) X_test, y_test = _patch_split(estimator, X, y, test, train_subset) if y_partial_train is None: estimator.partial_fit(X_partial_train, classes=classes) else: estimator.partial_fit(X_partial_train, y_partial_train, classes=classes) train_scores.append(_score(estimator, X_train, y_train, scorer)) test_scores.append(_score(estimator, X_test, y_test, scorer)) return np.array((train_scores, test_scores)).T
def test_grid_search_score_consistency(): # test that correct scores are used clf = LinearSVC(random_state=0) X, y = make_blobs(random_state=0, centers=2) Cs = [.1, 1, 10] for score in ['f1', 'roc_auc']: grid_search = GridSearchCV(clf, {'C': Cs}, scoring=score) grid_search.fit(X, y) cv = StratifiedKFold(n_folds=3, y=y) for C, scores in zip(Cs, grid_search.grid_scores_): clf.set_params(C=C) scores = scores[2] # get the separate runs from grid scores i = 0 for train, test in cv: clf.fit(X[train], y[train]) if score == "f1": correct_score = f1_score(y[test], clf.predict(X[test])) elif score == "roc_auc": dec = clf.decision_function(X[test]) correct_score = roc_auc_score(y[test], dec) assert_almost_equal(correct_score, scores[i]) i += 1
def _parallel_predict_log_proba(estimators, estimators_features, X, n_classes): """Private function used to compute log probabilities within a job.""" n_samples = X.shape[0] log_proba = np.empty((n_samples, n_classes)) log_proba.fill(-np.inf) all_classes = np.arange(n_classes, dtype=np.int) for estimator, features in zip(estimators, estimators_features): log_proba_estimator = estimator.predict_log_proba(X[:, features]) if n_classes == len(estimator.classes_): log_proba = logaddexp(log_proba, log_proba_estimator) else: log_proba[:, estimator.classes_] = logaddexp( log_proba[:, estimator.classes_], log_proba_estimator[:, range(len(estimator.classes_))]) missing = np.setdiff1d(all_classes, estimator.classes_) log_proba[:, missing] = logaddexp(log_proba[:, missing], -np.inf) return log_proba
def _set_oob_score(self, X, y): n_classes_ = self.n_classes_ classes_ = self.classes_ n_samples = y.shape[0] predictions = np.zeros((n_samples, n_classes_)) for estimator, samples, features in zip(self.estimators_, self.estimators_samples_, self.estimators_features_): mask = np.ones(n_samples, dtype=np.bool) mask[samples] = False if hasattr(estimator, "predict_proba"): predictions[mask, :] += estimator.predict_proba( (X[mask, :])[:, features]) else: p = estimator.predict((X[mask, :])[:, features]) j = 0 for i in range(n_samples): if mask[i]: predictions[i, p[j]] += 1 j += 1 if (predictions.sum(axis=1) == 0).any(): warn("Some inputs do not have OOB scores. " "This probably means too few estimators were used " "to compute any reliable oob estimates.") oob_decision_function = (predictions / predictions.sum(axis=1)[:, np.newaxis]) oob_score = accuracy_score( y, classes_.take(np.argmax(predictions, axis=1))) self.oob_decision_function_ = oob_decision_function self.oob_score_ = oob_score
def predict_proba(self, X): """Predict class probabilities for X. The predicted class probabilities of an input sample is computed as the weighted mean predicted class probabilities of the classifiers in the ensemble. Parameters ---------- X : array-like of shape = [n_samples, n_features] The input samples. Returns ------- p : array of shape = [n_samples] The class probabilities of the input samples. The order of outputs is the same of that of the `classes_` attribute. """ X = np.asarray(X) n_classes = self.n_classes_ if self.algorithm == 'SAMME.R': # The weights are all 1. for SAMME.R proba = sum( _samme_proba(estimator, n_classes, X) for estimator in self.estimators_) else: # self.algorithm == "SAMME" proba = sum( estimator.predict_proba(X) * w for estimator, w in zip( self.estimators_, self.estimator_weights_)) proba /= self.estimator_weights_.sum() proba = np.exp((1. / (n_classes - 1)) * proba) normalizer = proba.sum(axis=1)[:, np.newaxis] normalizer[normalizer == 0.0] = 1.0 proba /= normalizer return proba
def _log_multivariate_normal_density(X, means, covars, min_covar=1.e-7): """Log probability for covariance matrices.""" n_samples, n_dim = X.shape K = len(means) log_prob = np.empty((n_samples, K)) for c, (mu, cv) in enumerate(zip(means, covars)): try: cv_chol = linalg.cholesky(cv, lower=True) except linalg.LinAlgError: # The model is most probably stuck in a component with too # few observations, we need to reinitialize this components try: cv_chol = linalg.cholesky(cv + min_covar * np.eye(n_dim), lower=True) except linalg.LinAlgError: raise ValueError("'covars' must be symmetric, " "positive-definite") # find the precision via determinant formula and cholesky decomposition cv_log_det = 2 * np.sum(np.log(np.diagonal(cv_chol))) cv_sol = linalg.solve_triangular(cv_chol, (X - mu).T, lower=True).T log_prob[:, c] = -.5 * (np.sum(cv_sol**2, axis=1) + n_dim * np.log(2 * np.pi) + cv_log_det) return log_prob
def test_knn_version_consistency(): if not have_flann: raise SkipTest("No flann, so skipping knn tests.") if not have_accel: raise SkipTest("No skl-groups-accel, so skipping version consistency.") n = 20 for dim in [1, 7]: np.random.seed(47) bags = Features([ np.random.randn(np.random.randint(30, 100), dim) for _ in xrange(n) ]) div_funcs = ('kl', 'js', 'renyi:.9', 'l2', 'tsallis:.8') Ks = (3, 4) get_est = partial(KNNDivergenceEstimator, div_funcs=div_funcs, Ks=Ks) results = {} for version in ('fast', 'slow', 'best'): est = get_est(version=version) results[version] = res = est.fit_transform(bags) assert res.shape == (len(div_funcs), len(Ks), n, n) assert np.all(np.isfinite(res)) for df, fast, slow in zip(div_funcs, results['fast'], results['slow']): assert_array_almost_equal(fast, slow, decimal=1 if df == 'js' else 5, err_msg="({}, dim {})".format(df, dim)) # TODO: debug JS differences est = get_est(version='fast', n_jobs=-1) res = est.fit_transform(bags) assert np.all(results['fast'] == res) est = get_est(version='slow', n_jobs=-1) res = est.fit_transform(bags) assert np.all(results['slow'] == res)
def _set_oob_score(self, X, y): n_samples = y.shape[0] n_classes_ = self.n_classes_ classes_ = self.classes_ predictions = np.zeros((n_samples, n_classes_)) for estimator, samples, features in zip(self.estimators_, self.estimators_samples_, self.estimators_features_): # Create mask for OOB samples mask = ~samples if hasattr(estimator, "predict_proba"): predictions[mask, :] += estimator.predict_proba( (X[mask, :])[:, features]) else: p = estimator.predict((X[mask, :])[:, features]) j = 0 for i in range(n_samples): if mask[i]: predictions[i, p[j]] += 1 j += 1 if (predictions.sum(axis=1) == 0).any(): warn("Some inputs do not have OOB scores. " "This probably means too few estimators were used " "to compute any reliable oob estimates.") oob_decision_function = (predictions / predictions.sum(axis=1)[:, np.newaxis]) oob_score = roc_auc_score(y, predictions[:,1]) self.oob_decision_function_ = oob_decision_function self.oob_score_ = oob_score
def test_random_search_results(): # Make a dataset with a lot of noise to get various kind of prediction # errors across CV folds and parameter settings X, y = make_classification(n_samples=200, n_features=100, n_informative=3, random_state=0) # scipy.stats dists now supports `seed` but we still support scipy 0.12 # which doesn't support the seed. Hence the assertions in the test for # random_search alone should not depend on randomization. n_folds = 3 n_search_iter = 30 params = dict(C=expon(scale=10), gamma=expon(scale=0.1)) random_search = RandomizedSearchCV(SVC(), n_iter=n_search_iter, cv=n_folds, iid=False, param_distributions=params) random_search.fit(X, y) random_search_iid = RandomizedSearchCV(SVC(), n_iter=n_search_iter, cv=n_folds, iid=True, param_distributions=params) random_search_iid.fit(X, y) param_keys = ('param_C', 'param_gamma') score_keys = ('test_mean_score', 'test_rank_score', 'test_split0_score', 'test_split1_score', 'test_split2_score', 'test_std_score') n_cand = n_search_iter for search, iid in zip((random_search, random_search_iid), (False, True)): assert_equal(iid, search.iid) results = search.results_ # Check results structure check_results_array_types(results, param_keys, score_keys) check_results_keys(results, param_keys, score_keys, n_cand) # For random_search, all the param array vals should be unmasked assert_false(any(results['param_C'].mask) or any(results['param_gamma'].mask)) check_results_grid_scores_consistency(search)
def _parallel_predict_proba(estimators, estimators_features, X, n_classes): """Private function used to compute (proba-)predictions within a job.""" n_samples = X.shape[0] proba = np.zeros((n_samples, n_classes)) for estimator, features in zip(estimators, estimators_features): if hasattr(estimator, "predict_proba"): proba_estimator = estimator.predict_proba(X[:, features]) if n_classes == len(estimator.classes_): proba += proba_estimator else: proba[:, estimator.classes_] += \ proba_estimator[:, range(len(estimator.classes_))] else: # Resort to voting predictions = estimator.predict(X[:, features]) for i in range(n_samples): proba[i, predictions[i]] += 1 return proba
def test_class_weight_auto_classifiers(): """Test that class_weight="auto" improves f1-score""" # This test is broken; its success depends on: # * a rare fortuitous RNG seed for make_classification; and # * the use of binary F1 over a seemingly arbitrary positive class for two # datasets, and weighted average F1 for the third. # Its expectations need to be clarified and reimplemented. raise SkipTest('This test requires redefinition') classifiers = all_estimators(type_filter='classifier') clean_warning_registry() with warnings.catch_warnings(record=True): classifiers = [c for c in classifiers if 'class_weight' in c[1]().get_params().keys()] for n_classes, weights in zip([2, 3], [[.8, .2], [.8, .1, .1]]): # create unbalanced dataset X, y = make_classification(n_classes=n_classes, n_samples=200, n_features=10, weights=weights, random_state=0, n_informative=n_classes) X = StandardScaler().fit_transform(X) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5, random_state=0) for name, Classifier in classifiers: if (name != "NuSVC" # the sparse version has a parameter that doesn't do anything and not name.startswith("RidgeClassifier") # RidgeClassifier behaves unexpected # FIXME! and not name.endswith("NB")): # NaiveBayes classifiers have a somewhat different interface. # FIXME SOON! yield (check_class_weight_auto_classifiers, name, Classifier, X_train, y_train, X_test, y_test, weights)
def _evaluate_oob_savings(self, X, y, cost_mat): """Private function used to calculate the OOB Savings of each estimator.""" estimators_weight = [] for estimator, samples, features in zip(self.estimators_, self.estimators_samples_, self.estimators_features_): # Test if all examples where used for training if not np.any(~samples): # Then use training oob_pred = estimator.predict(X[:, features]) oob_savings = max(0, savings_score(y, oob_pred, cost_mat)) else: # Then use OOB oob_pred = estimator.predict((X[~samples])[:, features]) oob_savings = max(0, savings_score(y[~samples], oob_pred, cost_mat[~samples])) estimators_weight.append(oob_savings) # Control in case were all weights are 0 if sum(estimators_weight) == 0: self.estimators_weight_ = np.ones(len(estimators_weight)) / len(estimators_weight) else: self.estimators_weight_ = (np.array(estimators_weight) / sum(estimators_weight)).tolist() return self
def test_make_circles(): factor = 0.3 for (n_samples, n_outer, n_inner) in [(7, 3, 4), (8, 4, 4)]: # Testing odd and even case, because in the past make_circles always # created an even number of samples. X, y = make_circles(n_samples, shuffle=False, noise=None, factor=factor) assert_equal(X.shape, (n_samples, 2), "X shape mismatch") assert_equal(y.shape, (n_samples,), "y shape mismatch") center = [0.0, 0.0] for x, label in zip(X, y): dist_sqr = ((x - center) ** 2).sum() dist_exp = 1.0 if label == 0 else factor**2 assert_almost_equal(dist_sqr, dist_exp, err_msg="Point is not on expected circle") assert_equal(X[y == 0].shape, (n_outer, 2), "Samples not correctly distributed across circles.") assert_equal(X[y == 1].shape, (n_inner, 2), "Samples not correctly distributed across circles.") assert_raises(ValueError, make_circles, factor=-0.01) assert_raises(ValueError, make_circles, factor=1.)
def _fit_and_score_multisignal(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score=False, return_parameters=False, return_n_test_samples=False, return_times=False, error_score='raise', logger=logger): """Fit estimator and compute scores for a given dataset split. Parameters ---------- estimator : estimator object implementing 'fit' The object to use to fit the data. X : array-like of shape at least 2D The data to fit. y : array-like, optional, default: None The target variable to try to predict in the case of supervised learning. scorer : A single callable or dict mapping scorer name to the callable If it is a single callable, the return value for ``train_scores`` and ``test_scores`` is a single float. For a dict, it should be one mapping the scorer name to the scorer callable object / function. The callable object / fn should have signature ``scorer(estimator, X, y)``. train : array-like, shape (n_train_samples,) Indices of training samples. test : array-like, shape (n_test_samples,) Indices of test samples. verbose : integer The verbosity level. error_score : 'raise' (default) or numeric Value to assign to the score if an error occurs in estimator fitting. If set to 'raise', the error is raised. If a numeric value is given, FitFailedWarning is raised. This parameter does not affect the refit step, which will always raise the error. parameters : dict or None Parameters to be set on the estimator. fit_params : dict or None Parameters that will be passed to ``estimator.fit``. return_train_score : boolean, optional, default: False Compute and return score on training set. return_parameters : boolean, optional, default: False Return parameters that has been used for the estimator. return_n_test_samples : boolean, optional, default: False Whether to return the ``n_test_samples`` return_times : boolean, optional, default: False Whether to return the fit/score times. Returns ------- train_scores : dict of scorer name -> float, optional Score on training set (for all the scorers), returned only if `return_train_score` is `True`. test_scores : dict of scorer name -> float, optional Score on testing set (for all the scorers). n_test_samples : int Number of test samples. fit_time : float Time spent for fitting in seconds. score_time : float Time spent for scoring in seconds. parameters : dict or None, optional The parameters that have been evaluated. """ if verbose > 1: if parameters is None: msg = '' else: msg = '%s' % (', '.join('%s=%s' % (k, v) for k, v in parameters.items())) logger.info("[CV] %s %s" % (msg, (64 - len(msg)) * '.')) # Adjust length of sample weights fit_params = fit_params if fit_params is not None else {} fit_params = dict([(k, _index_param_value(X, v, train)) for k, v in fit_params.items()]) test_scores = {} train_scores = {} if parameters is not None: estimator.set_params(**parameters) start_time = time.time() X_train, y_train = _safe_split_multisignal(estimator, X, y, train) X_test, y_test = _safe_split_multisignal(estimator, X, y, test, train) is_multimetric = not callable(scorer) n_scorers = len(scorer.keys()) if is_multimetric else 1 try: if y_train is None: estimator.fit(X_train, **fit_params) else: estimator.fit(X_train, y_train, **fit_params) except Exception as e: # Note fit time as time until error fit_time = time.time() - start_time score_time = 0.0 if error_score == 'raise': raise elif isinstance(error_score, numbers.Number): if is_multimetric: test_scores = dict(zip(scorer.keys(), [error_score, ] * n_scorers)) if return_train_score: train_scores = dict(zip(scorer.keys(), [error_score, ] * n_scorers)) else: test_scores = error_score if return_train_score: train_scores = error_score logger.warning("Classifier fit failed. The score on this train-test" " partition for these parameters will be set to %f. " "Details: \n%r" % (error_score, e), FitFailedWarning) else: raise ValueError("error_score must be the string 'raise' or a" " numeric value. (Hint: if using 'raise', please" " make sure that it has been spelled correctly.)") else: fit_time = time.time() - start_time # _score will return dict if is_multimetric is True test_scores = _score(estimator, X_test, y_test, scorer, is_multimetric) score_time = time.time() - start_time - fit_time if return_train_score: train_scores = _score(estimator, X_train, y_train, scorer, is_multimetric) if verbose > 2: if is_multimetric: for scorer_name, score in test_scores.items(): msg += ", %s=%s" % (scorer_name, score) else: msg += ", score=%s" % test_scores if verbose > 1: total_time = score_time + fit_time end_msg = "%s, total=%s" % (msg, short_format_time(total_time)) logger.info("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg)) ret = [train_scores, test_scores] if return_train_score else [test_scores] if return_n_test_samples: ret.append(_num_samples(X_test)) if return_times: ret.extend([fit_time, score_time]) if return_parameters: ret.append(parameters) return ret
def _make_test_folds(self, X, y=None, groups=None): """ Args: self (?): X (ndarray): data y (ndarray): labels(default = None) groups (None): (default = None) Returns: ?: test_folds CommandLine: python -m ibeis.algo.verif.sklearn_utils _make_test_folds Example: >>> # DISABLE_DOCTEST >>> from ibeis.algo.verif.sklearn_utils import * # NOQA >>> import utool as ut >>> rng = ut.ensure_rng(0) >>> groups = [1, 1, 3, 4, 2, 2, 7, 8, 8] >>> y = [1, 1, 1, 1, 2, 2, 2, 3, 3] >>> X = np.empty((len(y), 0)) >>> self = StratifiedGroupKFold(random_state=rng) >>> skf_list = list(self.split(X=X, y=y, groups=groups)) """ # if self.shuffle: # rng = check_random_state(self.random_state) # else: # rng = self.random_state n_splits = self.n_splits y = np.asarray(y) n_samples = y.shape[0] import utool as ut # y_counts = bincount(y_inversed) # min_classes_ = np.min(y_counts) # if np.all(self.n_splits > y_counts): # raise ValueError("All the n_groups for individual classes" # " are less than n_splits=%d." # % (self.n_splits)) # if self.n_splits > min_classes_: # warnings.warn(("The least populated class in y has only %d" # " members, which is too few. The minimum" # " number of groups for any class cannot" # " be less than n_splits=%d." # % (min_classes_, self.n_splits)), Warning) unique_y, y_inversed = np.unique(y, return_inverse=True) n_classes = max(unique_y) + 1 unique_groups, group_idxs = ut.group_indices(groups) # grouped_ids = list(grouping.keys()) grouped_y = ut.apply_grouping(y, group_idxs) grouped_y_counts = np.array([ bincount(y_, minlength=n_classes) for y_ in grouped_y]) target_freq = grouped_y_counts.sum(axis=0) target_ratio = target_freq / target_freq.sum() # Greedilly choose the split assignment that minimizes the local # * squared differences in target from actual frequencies # * and best equalizes the number of items per fold # Distribute groups with most members first split_freq = np.zeros((n_splits, n_classes)) # split_ratios = split_freq / split_freq.sum(axis=1) split_ratios = np.ones(split_freq.shape) / split_freq.shape[1] split_diffs = ((split_freq - target_ratio) ** 2).sum(axis=1) sortx = np.argsort(grouped_y_counts.sum(axis=1))[::-1] grouped_splitx = [] for count, group_idx in enumerate(sortx): # print('---------\n') group_freq = grouped_y_counts[group_idx] cand_freq = split_freq + group_freq cand_ratio = cand_freq / cand_freq.sum(axis=1)[:, None] cand_diffs = ((cand_ratio - target_ratio) ** 2).sum(axis=1) # Compute loss losses = [] # others = np.nan_to_num(split_diffs) other_diffs = np.array([ sum(split_diffs[x + 1:]) + sum(split_diffs[:x]) for x in range(n_splits) ]) # penalize unbalanced splits ratio_loss = other_diffs + cand_diffs # penalize heavy splits freq_loss = split_freq.sum(axis=1) freq_loss = freq_loss / freq_loss.sum() losses = ratio_loss + freq_loss # print('group_freq = %r' % (group_freq,)) # print('freq_loss = %s' % (ut.repr2(freq_loss, precision=2),)) # print('ratio_loss = %s' % (ut.repr2(ratio_loss, precision=2),)) #------- splitx = np.argmin(losses) # print('losses = %r, splitx=%r' % (losses, splitx)) split_freq[splitx] = cand_freq[splitx] split_ratios[splitx] = cand_ratio[splitx] split_diffs[splitx] = cand_diffs[splitx] grouped_splitx.append(splitx) # if count > 4: # break # else: # print('split_freq = \n' + # ut.repr2(split_freq, precision=2, suppress_small=True)) # print('target_ratio = \n' + # ut.repr2(target_ratio, precision=2, suppress_small=True)) # print('split_ratios = \n' + # ut.repr2(split_ratios, precision=2, suppress_small=True)) # print(ut.dict_hist(grouped_splitx)) # final_ratio_loss = ((split_ratios - target_ratio) ** 2).sum(axis=1) # print('split_freq = \n' + # ut.repr2(split_freq, precision=3, suppress_small=True)) # print('target_ratio = \n' + # ut.repr2(target_ratio, precision=3, suppress_small=True)) # print('split_ratios = \n' + # ut.repr2(split_ratios, precision=3, suppress_small=True)) # print(ut.dict_hist(grouped_splitx)) test_folds = np.empty(n_samples, dtype=np.int) for group_idx, splitx in zip(sortx, grouped_splitx): idxs = group_idxs[group_idx] test_folds[idxs] = splitx return test_folds
def _parallel_predict_regression(estimators, estimators_features, X): """Private function used to compute predictions within a job.""" return sum( estimator.predict(X[:, features]) for estimator, features in zip(estimators, estimators_features))
def _parallel_decision_function(estimators, estimators_features, X): """Private function used to compute decisions within a job.""" return sum( estimator.decision_function(X[:, features]) for estimator, features in zip(estimators, estimators_features))
def _check_transformer(name, Transformer, X, y): if name in ('CCA', 'LocallyLinearEmbedding', 'KernelPCA') and _is_32bit(): # Those transformers yield non-deterministic output when executed on # a 32bit Python. The same transformers are stable on 64bit Python. # FIXME: try to isolate a minimalistic reproduction case only depending # on numpy & scipy and/or maybe generate a test dataset that does not # cause such unstable behaviors. msg = name + ' is non deterministic on 32bit Python' raise SkipTest(msg) n_samples, n_features = np.asarray(X).shape # catch deprecation warnings with warnings.catch_warnings(record=True): transformer = Transformer() set_random_state(transformer) if name == "KernelPCA": transformer.remove_zero_eig = False set_fast_parameters(transformer) # fit if name in CROSS_DECOMPOSITION: y_ = np.c_[y, y] y_[::2, 1] *= 2 else: y_ = y transformer.fit(X, y_) X_pred = transformer.fit_transform(X, y=y_) if isinstance(X_pred, tuple): for x_pred in X_pred: assert_equal(x_pred.shape[0], n_samples) else: assert_equal(X_pred.shape[0], n_samples) if hasattr(transformer, 'transform'): if name in CROSS_DECOMPOSITION: X_pred2 = transformer.transform(X, y_) X_pred3 = transformer.fit_transform(X, y=y_) else: X_pred2 = transformer.transform(X) X_pred3 = transformer.fit_transform(X, y=y_) if isinstance(X_pred, tuple) and isinstance(X_pred2, tuple): for x_pred, x_pred2, x_pred3 in zip(X_pred, X_pred2, X_pred3): assert_array_almost_equal( x_pred, x_pred2, 2, "fit_transform and transform outcomes not consistent in %s" % Transformer) assert_array_almost_equal( x_pred, x_pred3, 2, "consecutive fit_transform outcomes not consistent in %s" % Transformer) else: assert_array_almost_equal( X_pred, X_pred2, 2, "fit_transform and transform outcomes not consistent in %s" % Transformer) assert_array_almost_equal( X_pred, X_pred3, 2, "consecutive fit_transform outcomes not consistent in %s" % Transformer) # raises error on malformed input for transform if hasattr(X, 'T'): # If it's not an array, it does not have a 'T' property assert_raises(ValueError, transformer.transform, X.T)