def test_auc_score_non_binary_class():
    """Check that ``roc_auc_score`` raises for non-binary ``y_true``.

    The same set of assertions is executed twice: once directly, and once
    after ``clean_warning_registry()`` inside a recording
    ``warnings.catch_warnings`` context.
    """
    undefined_msg = "ROC AUC score is not defined"
    multiclass_msg = "multiclass format is not supported"

    def _assert_rejected():
        rng = check_random_state(404)
        y_pred = rng.rand(10)

        # y_true made of a single class value (0, 1 or -1)
        single_class_targets = (
            np.zeros(10, dtype="int"),
            np.ones(10, dtype="int"),
            -np.ones(10, dtype="int"),
        )
        for y_true in single_class_targets:
            assert_raise_message(ValueError, undefined_msg,
                                 roc_auc_score, y_true, y_pred)

        # y_true made of three different class values
        y_true = rng.randint(0, 3, size=10)
        assert_raise_message(ValueError, multiclass_msg,
                             roc_auc_score, y_true, y_pred)

    _assert_rejected()

    clean_warning_registry()
    with warnings.catch_warnings(record=True):
        _assert_rejected()
def test_sample_weight_invariance(n_samples=50):
    """Yield sample-weight invariance checks for each applicable metric.

    Three data layouts are generated in turn (binary, multiclass and
    multilabel indicator).  For every metric that is not skipped, a tuple
    ``(named_check, name, metric, y_true, y_x)`` is yielded, where ``y_x``
    is the score array for names in ``THRESHOLDED_METRICS`` and the
    predicted labels otherwise.
    """
    # binary
    rng = check_random_state(0)
    y_true = rng.randint(0, 2, size=(n_samples, ))
    y_pred = rng.randint(0, 2, size=(n_samples, ))
    y_score = rng.random_sample(size=(n_samples,))
    for name in ALL_METRICS:
        skipped = (name in METRICS_WITHOUT_SAMPLE_WEIGHT or
                   name in METRIC_UNDEFINED_BINARY)
        if skipped:
            continue
        metric = ALL_METRICS[name]
        y_other = y_score if name in THRESHOLDED_METRICS else y_pred
        yield (_named_check(check_sample_weight_invariance, name), name,
               metric, y_true, y_other)

    # multiclass
    rng = check_random_state(0)
    y_true = rng.randint(0, 5, size=(n_samples, ))
    y_pred = rng.randint(0, 5, size=(n_samples, ))
    y_score = rng.random_sample(size=(n_samples, 5))
    for name in ALL_METRICS:
        skipped = (name in METRICS_WITHOUT_SAMPLE_WEIGHT or
                   name in METRIC_UNDEFINED_BINARY_MULTICLASS)
        if skipped:
            continue
        metric = ALL_METRICS[name]
        y_other = y_score if name in THRESHOLDED_METRICS else y_pred
        yield (_named_check(check_sample_weight_invariance, name), name,
               metric, y_true, y_other)

    # multilabel indicator
    _, ya = make_multilabel_classification(n_features=1, n_classes=20,
                                           random_state=0, n_samples=100,
                                           allow_unlabeled=False)
    _, yb = make_multilabel_classification(n_features=1, n_classes=20,
                                           random_state=1, n_samples=100,
                                           allow_unlabeled=False)
    y_true = np.vstack([ya, yb])
    y_pred = np.vstack([ya, ya])
    # Deliberately continues the generator seeded for the multiclass data.
    y_score = rng.randint(1, 4, size=y_true.shape)

    multilabel_names = (MULTILABELS_METRICS + THRESHOLDED_MULTILABEL_METRICS +
                        MULTIOUTPUT_METRICS)
    for name in multilabel_names:
        if name in METRICS_WITHOUT_SAMPLE_WEIGHT:
            continue
        metric = ALL_METRICS[name]
        y_other = y_score if name in THRESHOLDED_METRICS else y_pred
        yield (_named_check(check_sample_weight_invariance, name), name,
               metric, y_true, y_other)
def __init__(self, configuration, random_state=None):
    """Store the configuration and build the random state.

    Parameters
    ----------
    configuration :
        Stored unchanged on ``self.configuration``.
    random_state : optional
        Passed to ``check_random_state``; when ``None`` the fixed seed ``1``
        is used instead.
    """
    self.configuration = configuration
    # A missing random_state falls back to the fixed seed 1.
    seed = 1 if random_state is None else random_state
    self.random_state = check_random_state(seed)
def test_RadiusNeighborsRegressor_multioutput_with_uniform_weight():
    """Test radius neighbors in multi-output regression (uniform weight)"""
    n_samples, n_features, n_output = 40, 5, 4

    rng = check_random_state(0)
    X = rng.rand(n_samples, n_features)
    y = rng.rand(n_samples, n_output)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    for algorithm, weights in product(ALGORITHMS, [None, 'uniform']):
        rnn = neighbors.RadiusNeighborsRegressor(weights=weights,
                                                 algorithm=algorithm)
        rnn.fit(X_train, y_train)

        # Reference prediction: plain mean of the training targets that
        # fall inside the radius of each test point.
        neigh_idx = rnn.radius_neighbors(X_test, return_distance=False)
        expected = np.array([np.mean(y_train[idx], axis=0)
                             for idx in neigh_idx])

        y_pred = rnn.predict(X_test)

        assert_equal(expected.shape, y_test.shape)
        assert_equal(y_pred.shape, y_test.shape)
        assert_array_almost_equal(y_pred, expected)
def endless_permutations(N, random_state=None):
    """
    Generate an endless stream of integers drawn from successive random
    permutations of the set [0, ..., N).

    The first N values sweep through the whole set without replacement;
    the (N+1)th value starts a fresh permutation, and so on.

    Parameters
    ----------
    N: int
        the length of the set
    random_state: int or RandomState, optional
        random seed

    Yields
    ------
    int:
        a random int from the set [0, ..., N)
    """
    rng = check_random_state(random_state)
    while True:
        # One full pass over a new permutation per outer iteration.
        for index in rng.permutation(N):
            yield index
def fit(self, x, y):
    """Fit the projection and the two per-class kernel density estimates.

    The smallest and largest values of ``y`` are stored as ``self.a`` and
    ``self.b``.  ``x`` is projected with the weights returned by
    ``self._find_best_w``; a Gaussian ``KernelDensity`` is fitted on the
    projected points of each of the two labels.  Finally the number of
    label changes along ``self.predict(x, True)`` is stored in ``self.k``.
    """
    rng = check_random_state(self.random_state)
    self.a = np.min(y)
    self.b = np.max(y)
    self.w = self._find_best_w(x, y, rng)

    projected = np.dot(self.w, x.T).T
    points_a = projected[y == self.a]
    points_b = projected[y == self.b]

    kernel_scale = self.base_objective.gamma * 1.06

    def _bandwidth(points):
        # silverman's rule of the thumb
        return kernel_scale * len(points) ** (-1.0 / 5) * np.std(points)

    bandwidth_a = _bandwidth(points_a)
    bandwidth_b = _bandwidth(points_b)
    self.kde_a = KernelDensity(
        kernel='gaussian', bandwidth=bandwidth_a).fit(points_a.reshape(-1, 1))
    self.kde_b = KernelDensity(
        kernel='gaussian', bandwidth=bandwidth_b).fit(points_b.reshape(-1, 1))

    # Extremes of the projection and the labels assigned to them.
    self.min_v = min(projected)
    self.max_v = max(projected)
    self.min_c = self._density_classification(self.min_v)
    self.max_c = self._density_classification(self.max_v)

    # Count how many times the predicted label changes between neighbours.
    labels = self.predict(x, True)
    previous = labels[0]
    changes = 0
    for current in labels[1:]:
        if current != previous:
            changes += 1
            previous = current
    self.k = changes
def setUp(self):
    """Load iris and store a copy shuffled with a fixed seed."""
    dataset = datasets.load_iris()
    # Fixed seed so every test sees the same sample ordering.
    order = check_random_state(0).permutation(dataset.target.size)
    dataset.data = dataset.data[order]
    dataset.target = dataset.target[order]
    self.iris = dataset
def test_RadiusNeighborsClassifier_multioutput():
    """Test the radius neighbors classifier on multioutput data"""
    n_samples, n_features, n_output = 40, 2, 3

    rng = check_random_state(0)
    X = rng.rand(n_samples, n_features)
    y = rng.randint(0, 3, (n_samples, n_output))
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    weight_options = [None, 'uniform', 'distance', _weight_func]
    for algorithm, weights in product(ALGORITHMS, weight_options):
        # Stack single output prediction: one classifier per target column
        single_output_preds = []
        for column in range(n_output):
            single = neighbors.RadiusNeighborsClassifier(weights=weights,
                                                         algorithm=algorithm)
            single.fit(X_train, y_train[:, column])
            single_output_preds.append(single.predict(X_test))
        y_pred_so = np.vstack(single_output_preds).T
        assert_equal(y_pred_so.shape, y_test.shape)

        # Multioutput prediction: one classifier for all columns at once
        multi = neighbors.RadiusNeighborsClassifier(weights=weights,
                                                    algorithm=algorithm)
        multi.fit(X_train, y_train)
        y_pred_mo = multi.predict(X_test)

        assert_equal(y_pred_mo.shape, y_test.shape)
        assert_array_almost_equal(y_pred_mo, y_pred_so)
def check_alternative_lrap_implementation(lrap_score, n_classes=5,
                                          n_samples=20, random_state=0):
    """Compare ``label_ranking_average_precision_score`` with ``_my_lrap``.

    Both implementations are evaluated on the same multilabel target, first
    with scores from ``sparse_random_matrix`` (which contain ties) and then
    with uniformly drawn scores, and must agree in both cases.

    Note: ``lrap_score`` is accepted but not used by this check.
    """
    _, y_true = make_multilabel_classification(n_features=1,
                                               allow_unlabeled=False,
                                               random_state=random_state,
                                               n_classes=n_classes,
                                               n_samples=n_samples)

    def _assert_same_score(scores):
        reference = label_ranking_average_precision_score(y_true, scores)
        alternative = _my_lrap(y_true, scores)
        assert_almost_equal(reference, alternative)

    # Score with ties
    tied_scores = sparse_random_matrix(n_components=y_true.shape[0],
                                       n_features=y_true.shape[1],
                                       random_state=random_state)
    if hasattr(tied_scores, "toarray"):
        tied_scores = tied_scores.toarray()
    _assert_same_score(tied_scores)

    # Uniform score
    rng = check_random_state(random_state)
    _assert_same_score(rng.uniform(size=(n_samples, n_classes)))
def test_symmetry():
    """Test the symmetry of score and loss functions"""
    rng = check_random_state(0)
    y_true = rng.randint(0, 2, size=(20, ))
    y_pred = rng.randint(0, 2, size=(20, ))

    # We shouldn't forget any metrics: the categories must cover all of them
    categorised = set(SYMMETRIC_METRICS).union(NOT_SYMMETRIC_METRICS,
                                               THRESHOLDED_METRICS,
                                               METRIC_UNDEFINED_MULTICLASS)
    assert_equal(categorised, set(ALL_METRICS))

    # ... and no metric may be listed as both symmetric and not symmetric
    overlap = set(SYMMETRIC_METRICS).intersection(set(NOT_SYMMETRIC_METRICS))
    assert_equal(overlap, set([]))

    # Symmetric metric
    for name in SYMMETRIC_METRICS:
        metric = ALL_METRICS[name]
        forward = metric(y_true, y_pred)
        backward = metric(y_pred, y_true)
        assert_almost_equal(forward, backward,
                            err_msg="%s is not symmetric" % name)

    # Not symmetric metrics
    for name in NOT_SYMMETRIC_METRICS:
        metric = ALL_METRICS[name]
        differs = np.any(metric(y_true, y_pred) != metric(y_pred, y_true))
        assert_true(differs, msg="%s seems to be symmetric" % name)
def _fit(self, gn):
    """Scale ``gn``, run a randomized SVD on its transpose and store results.

    Sets ``explained_variance_``, ``explained_variance_ratio_`` and
    ``components_`` on ``self``.

    Returns
    -------
    u, s, v :
        The three arrays returned by ``randomized_svd``.
    """
    from sklearn.utils.validation import check_random_state
    from sklearn.utils.extmath import randomized_svd

    # apply scaling
    scaled = self.scaler_.fit(gn).transform(gn)

    # transpose for svd
    # TODO eliminate need for transposition
    x = scaled.T
    n_samples, _ = x.shape

    # singular value decomposition
    rng = check_random_state(self.random_state)
    u, s, v = randomized_svd(x, self.n_components,
                             n_iter=self.iterated_power,
                             random_state=rng)

    # calculate explained variance
    explained = (s ** 2) / n_samples
    total = np.var(x, axis=0).sum()
    self.explained_variance_ = explained
    self.explained_variance_ratio_ = explained / total

    # store components
    self.components_ = v

    return u, s, v
def test_iris(self):
    """Check consistency on dataset iris.

    The iris data is shuffled with a fixed seed, a ``CFClassifier`` is
    fitted on it, and both the training accuracy and the ROC AUC on the
    samples whose target is not 2 must reach at least 0.97.
    """
    # also load the iris dataset
    # and randomly permute it
    iris = datasets.load_iris()
    rng = check_random_state(0)
    perm = rng.permutation(iris.target.size)
    iris.data = iris.data[perm]
    iris.target = iris.target[perm]

    clf = CFClassifier("")
    clf.fit(iris.data, iris.target)

    self.assertTrue(os.path.isfile(clf.forest))

    preds = clf.predict(iris.data)
    predicted_ratio = (float(np.sum(preds == iris.target)) /
                       float(len(iris.target)))
    # Call form of print: valid on both Python 2 and Python 3 (the bare
    # ``print x`` statement is a SyntaxError on Python 3).
    print(predicted_ratio)
    self.assertGreaterEqual(predicted_ratio, .97)

    probs = clf.predict_proba(iris.data)
    # Restrict to the samples whose target is not 2 and score column 1.
    bin_idx = iris.target != 2
    roc_auc = roc_auc_score(iris.target[bin_idx], probs[bin_idx, 1])
    self.assertGreaterEqual(roc_auc, .97)
def fit(self, x, y):
    """Fit the projection and the downstream classifier on binary data.

    Builds the ``DCS_kd`` objective, instantiates the classifier named by
    ``self.classifier`` ('KDE', 'SVM' or 'KNN'), finds the projection with
    ``self._find_best_w`` and fits the classifier on ``self.transform(x)``.

    Raises
    ------
    NotImplementedError
        If ``y`` holds more than two distinct values, or if
        ``self.classifier`` is not one of the supported names.
    """
    n_labels = len(set(y))
    if n_labels > 2:
        raise NotImplementedError('Currently MELM supports only binary datasets')

    self.base_objective = DCS_kd(gamma=self.gamma, k=self.k,
                                 covariance_estimator=self.covariance_estimator)

    kind = self.classifier
    if kind == 'KDE':
        self.clf = KDE(gamma=self.gamma)
    elif kind == 'SVM':
        self.clf = SVM()
    elif kind == 'KNN':
        self.clf = KNN()
    else:
        raise NotImplementedError('%s classifier is not implemented' % self.classifier)

    rng = check_random_state(self.random_state)

    # The two labels are taken to be the extremes of y.
    self.a = min(y)
    self.b = max(y)
    self.classes_ = np.array([self.a, self.b])

    self.w = self._find_best_w(x, y, rng)
    self.clf.fit(self.transform(x), y)
def predict(self, X, Y_possible):
    """Pick one interval for each entry of ``Y_possible``.

    A random generator is created from ``self.random_state`` only when
    ``self.method == "random"``; otherwise ``None`` is passed down.  ``X``
    is accepted but not used here.

    Returns
    -------
    list
        One ``self._predict_interval(...)`` result per entry of
        ``Y_possible``, in order.
    """
    rng = (check_random_state(self.random_state)
           if self.method == "random" else None)

    predictions = []
    for possible_intervals in Y_possible:
        predictions.append(self._predict_interval(possible_intervals, rng))
    return predictions
def _predict_interval(self, possible_intervals, rng=None):
    """Select one element of ``possible_intervals`` according to the method.

    Parameters
    ----------
    possible_intervals : sequence
        Candidates to choose from.
    rng : optional
        Random generator used when ``self.method == "random"``; when
        ``None`` one is built from ``self.random_state``.

    Returns
    -------
    The element at index ``len(possible_intervals) // 2`` for the
    ``"center"`` method, a randomly indexed element for the ``"random"``
    method, and ``None`` for any other method value.
    """
    if self.method == "center":
        # Floor division keeps the index an int on Python 3, where ``/``
        # would yield a float and indexing would raise TypeError.
        return possible_intervals[len(possible_intervals) // 2]
    elif self.method == "random":
        if rng is None:
            rng = check_random_state(self.random_state)
        return possible_intervals[rng.randint(len(possible_intervals))]
def test_symmetry():
    """Test the symmetry of score and loss functions."""
    rng = check_random_state(0)
    y_true = rng.randint(0, 2, size=(20, ))
    y_pred = rng.randint(0, 2, size=(20, ))

    # We shouldn't forget any metrics: the categories must cover all of them
    categorised = SYMMETRIC_METRICS.union(
        NOT_SYMMETRIC_METRICS, set(THRESHOLDED_METRICS),
        METRIC_UNDEFINED_BINARY_MULTICLASS)
    assert_equal(categorised, set(ALL_METRICS))

    # ... and the symmetric / not symmetric groups must be disjoint
    overlap = SYMMETRIC_METRICS.intersection(NOT_SYMMETRIC_METRICS)
    assert_equal(overlap, set([]))

    # Symmetric metric
    for name in SYMMETRIC_METRICS:
        metric = ALL_METRICS[name]
        forward = metric(y_true, y_pred)
        backward = metric(y_pred, y_true)
        assert_allclose(forward, backward,
                        err_msg="%s is not symmetric" % name)

    # Not symmetric metrics
    for name in NOT_SYMMETRIC_METRICS:
        metric = ALL_METRICS[name]

        # use context manager to supply custom error message
        with assert_raises(AssertionError) as cm:
            assert_array_equal(metric(y_true, y_pred), metric(y_pred, y_true))
            # Only reached when the arrays compared equal, i.e. exactly
            # when the context manager is about to report a failure.
            cm.msg = ("%s seems to be symmetric" % name)
def __init__(self, shuffle_factor=0.05, not_shuffled_columns=None,
             random_state=None):
    """Store the shuffling parameters.

    Parameters
    ----------
    shuffle_factor : default 0.05
        Stored unchanged on ``self.shuffle_factor``.
    not_shuffled_columns : optional
        Stored on ``self.not_shuffled_columns``; ``None`` becomes a new
        empty list.
    random_state : optional
        Converted with ``check_random_state`` and stored on
        ``self.random_state``.
    """
    self.shuffle_factor = shuffle_factor
    self.random_state = check_random_state(random_state)
    # Fresh list per instance when the caller supplies nothing.
    self.not_shuffled_columns = (
        [] if not_shuffled_columns is None else not_shuffled_columns)
def test_thresholded_invariance_string_vs_numbers_labels(name):
    """Ensure that thresholded metrics with string labels are invariant.

    For metrics listed in ``METRIC_UNDEFINED_BINARY`` the string inputs are
    instead expected to raise ``ValueError``.
    """
    rng = check_random_state(0)
    y1 = rng.randint(0, 2, size=(20, ))
    y2 = rng.randint(0, 2, size=(20, ))

    y1_str = np.array(["eggs", "spam"])[y1]
    pos_label_str = "spam"

    with ignore_warnings():
        metric = THRESHOLDED_METRICS[name]
        if name in METRIC_UNDEFINED_BINARY:
            # TODO those metrics doesn't support string label yet
            assert_raises(ValueError, metric, y1_str, y2)
            assert_raises(ValueError, metric, y1_str.astype('O'), y2)
        else:
            # Ugly, but handle case with a pos_label and label
            if name in METRICS_WITH_POS_LABEL:
                metric_str = partial(metric, pos_label=pos_label_str)
            else:
                metric_str = metric

            measure_with_number = metric(y1, y2)

            measure_with_str = metric_str(y1_str, y2)
            assert_array_equal(measure_with_number, measure_with_str,
                               err_msg="{0} failed string vs number "
                                       "invariance test".format(name))

            measure_with_strobj = metric_str(y1_str.astype('O'), y2)
            assert_array_equal(measure_with_number, measure_with_strobj,
                               err_msg="{0} failed string object vs number "
                                       "invariance test".format(name))
def test_sample_order_invariance_multilabel_and_multioutput():
    """Check that shuffling the samples leaves each metric unchanged.

    Multilabel metrics are checked on predictions, thresholded multilabel
    metrics on scores, and multioutput metrics on both.
    """
    rng = check_random_state(0)

    # Generate some data
    y_true = rng.randint(0, 2, size=(20, 25))
    y_pred = rng.randint(0, 2, size=(20, 25))
    y_score = rng.normal(size=y_true.shape)

    y_true_shuffle, y_pred_shuffle, y_score_shuffle = shuffle(
        y_true, y_pred, y_score, random_state=0)

    def _assert_invariant(name, values, shuffled_values):
        metric = ALL_METRICS[name]
        assert_allclose(metric(y_true, values),
                        metric(y_true_shuffle, shuffled_values),
                        err_msg="%s is not sample order invariant" % name)

    for name in MULTILABELS_METRICS:
        _assert_invariant(name, y_pred, y_pred_shuffle)

    for name in THRESHOLDED_MULTILABEL_METRICS:
        _assert_invariant(name, y_score, y_score_shuffle)

    for name in MULTIOUTPUT_METRICS:
        _assert_invariant(name, y_score, y_score_shuffle)
        _assert_invariant(name, y_pred, y_pred_shuffle)
def check_importances(name, criterion, X, y):
    """Check ``feature_importances_`` of the forest estimator ``name``.

    Verifies the number of important features, that the importances are
    unchanged after setting ``n_jobs=2``, and that they are non-negative
    and almost unaffected by rescaling the sample weights.
    """
    ForestEstimator = FOREST_ESTIMATORS[name]

    def _new_forest():
        return ForestEstimator(n_estimators=20, criterion=criterion,
                               random_state=0)

    est = _new_forest()
    est.fit(X, y)
    importances = est.feature_importances_
    n_important = np.sum(importances > 0.1)

    assert_equal(importances.shape[0], 10)
    assert_equal(n_important, 3)

    # Check with parallel
    importances = est.feature_importances_
    est.set_params(n_jobs=2)
    importances_parallel = est.feature_importances_
    assert_array_almost_equal(importances, importances_parallel)

    # Check with sample weights
    sample_weight = check_random_state(0).randint(1, 10, len(X))
    est = _new_forest()
    est.fit(X, y, sample_weight=sample_weight)
    importances = est.feature_importances_
    assert_true(np.all(importances >= 0.0))

    # Rescaling every weight by the same factor must barely move them.
    for scale in [0.5, 10, 100]:
        est = _new_forest()
        est.fit(X, y, sample_weight=scale * sample_weight)
        importances_bis = est.feature_importances_
        assert_less(np.abs(importances - importances_bis).mean(), 0.001)
def test_binary_clf_curve():
    """Check that ``precision_recall_curve`` rejects a three-class target."""
    rng = check_random_state(404)
    # Labels drawn from {0, 1, 2}, i.e. not a binary problem.
    y_true = rng.randint(0, 3, size=10)
    y_pred = rng.rand(10)
    expected_msg = "multiclass format is not supported"
    assert_raise_message(ValueError, expected_msg,
                         precision_recall_curve, y_true, y_pred)
def check_importances(X, y, name, criterion):
    """Check ``feature_importances_`` of the forest estimator ``name``.

    Verifies the number of important features, the deprecated ``transform``
    feature selection, that the importances are unchanged after setting
    ``n_jobs=2``, and that they are non-negative and almost unaffected by
    rescaling the sample weights.
    """
    ForestEstimator = FOREST_ESTIMATORS[name]

    est = ForestEstimator(n_estimators=20, criterion=criterion,
                          random_state=0)
    est.fit(X, y)
    importances = est.feature_importances_
    n_important = np.sum(importances > 0.1)

    assert_equal(importances.shape[0], 10)
    assert_equal(n_important, 3)

    # XXX: Remove this test in 0.19 after transform support to estimators
    # is removed.
    X_new = assert_warns(
        DeprecationWarning, est.transform, X, threshold="mean")
    # The selection must keep at least one column and drop at least one.
    # (The former ``assert_less(0 < a, b)`` compared a bool against ``b``
    # and therefore could never fail.)
    assert_less(0, X_new.shape[1])
    assert_less(X_new.shape[1], X.shape[1])

    # Check with parallel
    importances = est.feature_importances_
    est.set_params(n_jobs=2)
    importances_parallel = est.feature_importances_
    assert_array_almost_equal(importances, importances_parallel)

    # Check with sample weights
    sample_weight = check_random_state(0).randint(1, 10, len(X))
    est = ForestEstimator(n_estimators=20, random_state=0,
                          criterion=criterion)
    est.fit(X, y, sample_weight=sample_weight)
    importances = est.feature_importances_
    assert_true(np.all(importances >= 0.0))

    # Rescaling every weight by the same factor must barely move them.
    for scale in [0.5, 10, 100]:
        est = ForestEstimator(n_estimators=20, random_state=0,
                              criterion=criterion)
        est.fit(X, y, sample_weight=scale * sample_weight)
        importances_bis = est.feature_importances_
        assert_less(np.abs(importances - importances_bis).mean(), 0.001)
def check_explicit_sparse_zeros(tree, max_depth=3, n_features=10):
    """Check that explicitly stored zeros do not change the fitted tree.

    A square matrix containing explicitly stored zeros is built from random
    (data, indices, indptr) triplets and interpreted both as CSC (training)
    and CSR (test).  Trees fitted on the dense and the sparse training data
    must be equal and must agree on every dense/sparse test combination.
    """
    TreeEstimator = ALL_TREES[tree]

    # n_samples set n_feature to ease construction of a simultaneous
    # construction of a csr and csc matrix
    n_samples = n_features
    samples = np.arange(n_samples)

    # Generate X, y
    rng = check_random_state(0)
    index_chunks = []
    value_chunks = []
    indptr = [0]
    for _ in range(n_features):
        n_nonzero = rng.binomial(n_samples, 0.5)
        index_chunks.append(rng.permutation(samples)[:n_nonzero])
        # Values in {-1, 0, 1, 2}: some stored entries are exactly zero.
        value_chunks.append(rng.binomial(3, 0.5, size=(n_nonzero, )) - 1)
        indptr.append(indptr[-1] + n_nonzero)

    indices = np.concatenate(index_chunks)
    data = np.array(np.concatenate(value_chunks), dtype=np.float32)
    shape = (n_samples, n_features)
    X_sparse = csc_matrix((data, indices, indptr), shape=shape)
    X = X_sparse.toarray()
    X_sparse_test = csr_matrix((data, indices, indptr), shape=shape)
    X_test = X_sparse_test.toarray()
    y = rng.randint(0, 3, size=(n_samples, ))

    # Ensure that X_sparse_test owns its data, indices and indptr array
    X_sparse_test = X_sparse_test.copy()

    # Ensure that we have explicit zeros
    assert_greater((X_sparse.data == 0.).sum(), 0)
    assert_greater((X_sparse_test.data == 0.).sum(), 0)

    # Perform the comparison
    dense_tree = TreeEstimator(random_state=0, max_depth=max_depth).fit(X, y)
    sparse_tree = TreeEstimator(random_state=0,
                                max_depth=max_depth).fit(X_sparse, y)

    assert_tree_equal(dense_tree.tree_, sparse_tree.tree_,
                      "{0} with dense and sparse format gave different "
                      "trees".format(tree))

    test_inputs = (X_test, X_sparse_test)
    for X1, X2 in product(test_inputs, test_inputs):
        assert_array_almost_equal(sparse_tree.tree_.apply(X1),
                                  dense_tree.tree_.apply(X2))
        assert_array_almost_equal(sparse_tree.apply(X1),
                                  dense_tree.apply(X2))
        assert_array_almost_equal(sparse_tree.apply(X1),
                                  sparse_tree.tree_.apply(X1))
        assert_array_almost_equal(sparse_tree.predict(X1),
                                  dense_tree.predict(X2))

        if tree in CLF_TREES:
            assert_array_almost_equal(sparse_tree.predict_proba(X1),
                                      dense_tree.predict_proba(X2))
def setUp(self):
    """Load the iris dataset and delete ``*.csv`` files in the working dir.

    The dataset is stored unshuffled on ``self.iris``.  The previous
    version also created an unused random state and re-assigned
    ``iris.data`` / ``iris.target`` to themselves; both were no-ops and
    have been removed.
    """
    self.iris = datasets.load_iris()
    # Remove every CSV file matched in the current working directory.
    for csv_file in glob.glob("*.csv"):
        os.remove(csv_file)
def _get_folds_column(self, length):
    """Return an array assigning a fold number to each of ``length`` rows.

    The seed for the shuffled ``KFold`` split is drawn once from
    ``self.random_state`` and cached in ``self._random_number``, so later
    calls reuse the same seed.

    Returns
    -------
    numpy array of shape (length,) holding, for every position, the index
    of the fold whose second index set contains it.
    """
    if self._random_number is None:
        rng = check_random_state(self.random_state)
        self._random_number = rng.randint(0, 100000)

    splitter = KFold(length, self.n_folds, shuffle=True,
                     random_state=self._random_number)
    folds_column = numpy.zeros(length)
    for fold_number, (_, held_out) in enumerate(splitter):
        folds_column[held_out] = fold_number
    return folds_column
def testSubset(test):
    """Select five rows of ``test`` at random, with a fixed seed.

    Row indices are drawn with ``randint``, so a row may be picked more
    than once.

    Returns
    -------
    (subset, n_faces) : the selected rows and the number of rows drawn.
    """
    n_faces = 5
    picked_rows = check_random_state(4).randint(test.shape[0],
                                                size=(n_faces, ))
    return test[picked_rows, :], n_faces
def test_regression_sample_weight_invariance(name):
    """Run the sample-weight invariance check for the metric ``name``
    on random regression targets."""
    n_samples = 50
    rng = check_random_state(0)
    # regression: targets and predictions are uniform random floats
    y_true = rng.random_sample(size=(n_samples,))
    y_pred = rng.random_sample(size=(n_samples,))
    check_sample_weight_invariance(name, ALL_METRICS[name], y_true, y_pred)
def test_only_constant_features():
    """Every tree fitted on an all-zero ``X`` must have depth 0."""
    rng = check_random_state(0)
    X = np.zeros((10, 20))
    y = rng.randint(0, 2, (10, ))
    for TreeEstimator in ALL_TREES.values():
        est = TreeEstimator(random_state=0)
        est.fit(X, y)
        # No feature can separate the samples, so no split is possible.
        assert_equal(est.tree_.max_depth, 0)
def _check_params(self):
    """Fill in defaults and validate the estimator parameters.

    ``self.loss`` defaults to ``AdaLossFunction()`` and ``self.random_state``
    is replaced by the result of ``check_random_state``.

    Raises
    ------
    AssertionError
        If the loss is not an ``AbstractLossFunction``, ``n_estimators`` is
        not positive, or ``subsample`` lies outside ``(0, 1]``.
    """
    if self.loss is None:
        self.loss = AdaLossFunction()

    # Losses from sklearn are not allowed
    loss_is_supported = isinstance(self.loss, AbstractLossFunction)
    assert loss_is_supported, \
        'LossFunction should be derived from AbstractLossFunction'

    has_estimators = self.n_estimators > 0
    assert has_estimators, 'n_estimators should be positive'

    self.random_state = check_random_state(self.random_state)

    subsample_in_range = 0 < self.subsample <= 1.0
    assert subsample_in_range, 'subsample should be in the interval (0, 1]'
def pairwise_transform(X, Y, limit=1.0, random_state=None):
    """Form comparable pairs with interval-annotated entries.

    Each sample is paired with later samples whose interval lies strictly
    before or strictly after its own, and the signed feature differences
    are stacked.

    Parameters
    ----------
    X: array-like, shape (n_samples x n_features)
        The feature representation of the instances.

    Y: array_like, shape (n_samples x 2)
        The lower and upper bounds of the interval of each instance.

    limit: float,
        Ratio (between 0 and 1) of how many pairs to form with each input
        sample. Use this to reduce computing time and memory usage,
        at the cost of approximation error.
        If, for a given sample, there are 100 samples before and 100
        samples after, and limit=0.1, then 10 + 10 transformed pairs will
        be created.  The number of pairs is truncated to an integer.

    random_state: int, RandomState or None, optional
        Passed to ``check_random_state``; drives the subsampling of pairs.

    Returns
    -------
    Stacked difference rows: a sparse matrix when ``X`` is sparse, a dense
    array otherwise.

    Raises
    ------
    ValueError
        If ``Y`` does not have exactly two columns, or if no pair can be
        formed.
    """
    X = check_array(X, accept_sparse='csr')
    Y = check_array(Y, accept_sparse=None)
    rng = check_random_state(random_state)
    if Y.shape[1] != 2:
        raise ValueError("Y must have two columns, represeting the lower "
                         "and upper bound of the interval for each entry.")

    chunks = []
    for k, (x, (y_min, y_max)) in enumerate(zip(X, Y)):
        # Only look at later samples so each pair is formed once.
        X_rest, Y_rest = X[1 + k:], Y[1 + k:]
        before = Y_rest[:, 1] < y_min
        after = Y_rest[:, 0] > y_max
        # ``choice`` needs an integer size; truncate the scaled counts
        # (a float size raises TypeError on current NumPy).
        n_before = int(np.sum(before) * limit)
        n_after = int(np.sum(after) * limit)
        if n_before:
            before = np.where(before)[0]
            before = rng.choice(before, n_before, replace=False)
            X_bef = X_rest[before].copy()
            chunks.append(_safe_sparse_add_row(X_bef, -x))
        if n_after:
            after = np.where(after)[0]
            after = rng.choice(after, n_after, replace=False)
            X_aft = X_rest[after].copy()
            # Sign flipped so "after" pairs point the opposite way.
            chunks.append(-(_safe_sparse_add_row(X_aft, -x)))

    if len(chunks):
        return sp.vstack(chunks) if sp.issparse(X) else np.vstack(chunks)
    else:
        raise ValueError("Empty slice: no pairs can be formed.")
def test_recursion_decision_tree_vs_forest_and_gbdt(seed):
    """Make sure that the recursion method gives the same results on a
    DecisionTreeRegressor and a GradientBoostingRegressor or a
    RandomForestRegressor with 1 tree and equivalent parameters.
    """
    rng = np.random.RandomState(seed)

    # Purely random dataset to avoid correlated features
    n_samples, n_features = 1000, 5
    X = rng.randn(n_samples, n_features)
    y = rng.randn(n_samples) * 10

    # The 'init' estimator for GBDT (here the average prediction) isn't taken
    # into account with the recursion method, for technical reasons. We set
    # the mean to 0 to that this 'bug' doesn't have any effect.
    y = y - y.mean()

    # set max_depth not too high to avoid splits with same gain but different
    # features
    max_depth = 5

    tree_seed = 0
    forest = RandomForestRegressor(n_estimators=1, max_features=None,
                                   bootstrap=False, max_depth=max_depth,
                                   random_state=tree_seed)
    # The forest will use ensemble.base._set_random_states to set the
    # random_state of the tree sub-estimator. We simulate this here to have
    # equivalent estimators.
    int32_max = np.iinfo(np.int32).max
    equiv_random_state = check_random_state(tree_seed).randint(int32_max)
    gbdt = GradientBoostingRegressor(n_estimators=1, learning_rate=1,
                                     criterion='mse', max_depth=max_depth,
                                     random_state=equiv_random_state)
    tree = DecisionTreeRegressor(max_depth=max_depth,
                                 random_state=equiv_random_state)

    for estimator in (forest, gbdt, tree):
        estimator.fit(X, y)

    # sanity check: if the trees aren't the same, the PD values won't be equal
    try:
        assert_is_subtree(tree.tree_, gbdt[0, 0].tree_)
        assert_is_subtree(tree.tree_, forest[0].tree_)
    except AssertionError:
        # For some reason the trees aren't exactly equal on 32bits, so the PDs
        # cannot be equal either. See
        # https://github.com/scikit-learn/scikit-learn/issues/8853
        assert _IS_32BIT, "this should only fail on 32 bit platforms"
        return

    grid = rng.randn(50).reshape(-1, 1)
    for feature_index in range(n_features):
        features = np.array([feature_index], dtype=np.int32)

        pdp_forest = _partial_dependence_recursion(forest, grid, features)
        pdp_gbdt = _partial_dependence_recursion(gbdt, grid, features)
        pdp_tree = _partial_dependence_recursion(tree, grid, features)

        np.testing.assert_allclose(pdp_gbdt, pdp_tree)
        np.testing.assert_allclose(pdp_forest, pdp_tree)
norm_diff(X - A, norm=2, random_state=0) / X_spectral_norm) f = scalable_frobenius_norm_discrepancy(X, U, s, V) all_frobenius[label].append(f / X_fro_norm) if len(all_time) == 0: raise ValueError("No tests ran. Aborting.") if enable_spectral_norm: title = "normalized spectral norm diff vs running time" scatter_time_vs_s(all_time, all_spectral, datasets, title) title = "normalized Frobenius norm diff vs running time" scatter_time_vs_s(all_time, all_frobenius, datasets, title) if __name__ == "__main__": random_state = check_random_state(1234) power_iter = np.arange(0, 6) n_comps = 50 for dataset_name in datasets: X = get_data(dataset_name) if X is None: continue print(" >>>>>> Benching sklearn and fbpca on %s %d x %d" % (dataset_name, X.shape[0], X.shape[1])) bench_a( X, dataset_name, power_iter, n_oversamples=2,
hidden_layer_sizes) # X, y = mlp_estimator._validate_input(X, y, incremental) n_samples, n_features = X.shape # Ensure y is 2D # TODO:保证array为两维,即输入的y应该是np.array([[1, 2, 3]])这才是1行3列的array # if y.ndim == 1: # y = y.reshape((-1, 1)) mlp_estimator.n_outputs_ = y.shape[1] layer_units = ([n_features] + hidden_layer_sizes + [mlp_estimator.n_outputs_]) # check random state mlp_estimator._random_state = check_random_state(mlp_estimator.random_state) incremental = False if not hasattr(mlp_estimator, 'coefs_') or (not mlp_estimator.warm_start and not incremental): # First time training the model mlp_estimator._initialize(y, layer_units) # lbfgs does not support mini-batches if mlp_estimator.solver == 'lbfgs': batch_size = n_samples elif mlp_estimator.batch_size == 'auto': batch_size = min(200, n_samples) else: if mlp_estimator.batch_size < 1 or mlp_estimator.batch_size > n_samples: warnings.warn("Got `batch_size` less than 1 or larger than "
def check_explicit_sparse_zeros(tree, max_depth=3, n_features=10):
    """Check that explicitly stored zeros do not change the fitted tree.

    A square matrix containing explicitly stored zeros is built from random
    (data, indices, indptr) triplets and interpreted both as CSC (training)
    and CSR (test).  Trees fitted on the dense and the sparse training data
    must be equal and must agree (apply, decision path, predictions) on
    every dense/sparse test combination.
    """
    TreeEstimator = ALL_TREES[tree]

    # n_samples set n_feature to ease construction of a simultaneous
    # construction of a csr and csc matrix
    n_samples = n_features
    samples = np.arange(n_samples)

    # Generate X, y
    rng = check_random_state(0)
    index_chunks = []
    value_chunks = []
    indptr = [0]
    for _ in range(n_features):
        n_nonzero = rng.binomial(n_samples, 0.5)
        index_chunks.append(rng.permutation(samples)[:n_nonzero])
        # Values in {-1, 0, 1, 2}: some stored entries are exactly zero.
        value_chunks.append(rng.binomial(3, 0.5, size=(n_nonzero, )) - 1)
        indptr.append(indptr[-1] + n_nonzero)

    indices = np.concatenate(index_chunks)
    data = np.array(np.concatenate(value_chunks), dtype=np.float32)
    shape = (n_samples, n_features)
    X_sparse = csc_matrix((data, indices, indptr), shape=shape)
    X = X_sparse.toarray()
    X_sparse_test = csr_matrix((data, indices, indptr), shape=shape)
    X_test = X_sparse_test.toarray()
    y = rng.randint(0, 3, size=(n_samples, ))

    # Ensure that X_sparse_test owns its data, indices and indptr array
    X_sparse_test = X_sparse_test.copy()

    # Ensure that we have explicit zeros
    assert_greater((X_sparse.data == 0.).sum(), 0)
    assert_greater((X_sparse_test.data == 0.).sum(), 0)

    # Perform the comparison
    dense_tree = TreeEstimator(random_state=0, max_depth=max_depth).fit(X, y)
    sparse_tree = TreeEstimator(random_state=0,
                                max_depth=max_depth).fit(X_sparse, y)

    assert_tree_equal(
        dense_tree.tree_, sparse_tree.tree_,
        "{0} with dense and sparse format gave different "
        "trees".format(tree))

    test_inputs = (X_test, X_sparse_test)
    for X1, X2 in product(test_inputs, test_inputs):
        assert_array_almost_equal(sparse_tree.tree_.apply(X1),
                                  dense_tree.tree_.apply(X2))
        assert_array_almost_equal(sparse_tree.apply(X1),
                                  dense_tree.apply(X2))
        assert_array_almost_equal(sparse_tree.apply(X1),
                                  sparse_tree.tree_.apply(X1))

        assert_array_almost_equal(
            sparse_tree.tree_.decision_path(X1).toarray(),
            dense_tree.tree_.decision_path(X2).toarray())
        assert_array_almost_equal(
            sparse_tree.decision_path(X1).toarray(),
            dense_tree.decision_path(X2).toarray())
        assert_array_almost_equal(
            sparse_tree.decision_path(X1).toarray(),
            sparse_tree.tree_.decision_path(X1).toarray())

        assert_array_almost_equal(sparse_tree.predict(X1),
                                  dense_tree.predict(X2))

        if tree in CLF_TREES:
            assert_array_almost_equal(sparse_tree.predict_proba(X1),
                                      dense_tree.predict_proba(X2))
def test_print_overloading_estimator():
    """Check that printing a fitted estimator results in 'pretty' output"""

    def _printed(obj):
        # Return what ``print(obj)`` writes, with sys.stdout temporarily
        # swapped for an in-memory buffer and always restored afterwards.
        saved_stdout = sys.stdout
        try:
            buffer = StringIO()
            sys.stdout = buffer
            print(obj)
            return buffer.getvalue().strip()
        finally:
            sys.stdout = saved_stdout

    rng = check_random_state(415)
    X = np.reshape(rng.uniform(size=50), (5, 10))
    y = rng.uniform(size=5)

    # Check the regressor
    est = SymbolicRegressor(generations=2, random_state=0)

    # Unfitted
    output_unfitted = _printed(est)

    # Fitted
    est.fit(X, y)
    output_fitted = _printed(est)
    output_program = _printed(est._program)

    assert_true(output_unfitted != output_fitted)
    assert_true(output_unfitted == est.__repr__())
    assert_true(output_fitted == output_program)

    # Check the transformer
    est = SymbolicTransformer(generations=2, random_state=0)

    # Unfitted
    output_unfitted = _printed(est)

    # Fitted
    est.fit(X, y)
    output_fitted = _printed(est)

    # Expected layout: one program per line, without the list's quotes.
    programs = str([gp.__str__() for gp in est])
    output_program = _printed(programs.replace("',", ",\n").replace("'", ""))

    assert_true(output_unfitted != output_fitted)
    assert_true(output_unfitted == est.__repr__())
    assert_true(output_fitted == output_program)
def seed(self, seed=None):
    """Same as parent method but passing a RandomState instance is allowed.

    The generator built by ``check_random_state(seed)`` is stored on
    ``self.np_random``.

    Returns
    -------
    list
        A one-element list holding the ``seed`` argument as given.
    """
    generator = check_random_state(seed)
    self.np_random = generator
    return [seed]
def _fit(self, X):
    """Fit the PCA model on ``X`` and return the SVD factors.

    Centers ``X`` with its column means, runs either an exact SVD
    (``svd_solver`` in {"full", "tsqr"}) or a compressed/randomized SVD,
    derives the explained variance, its ratio and the noise variance, and
    stores the computed results on ``self``.

    Parameters
    ----------
    X : array with two dimensions, (n_samples, n_features)
        Data to fit. NOTE(review): presumably a dask array, since
        ``da.linalg`` and ``compute`` are used -- confirm against caller.

    Returns
    -------
    U, S, V
        The (sign-flipped) SVD factors, not yet computed.

    Raises
    ------
    ValueError
        If ``svd_solver`` is unknown or ``n_components`` is out of range.
    NotImplementedError
        If ``n_components`` lies strictly between 0 and 1.
    TypeError
        If ``X`` is sparse.
    """
    solvers = {"full", "auto", "tsqr", "randomized"}
    solver = self.svd_solver

    if solver not in solvers:
        raise ValueError("Invalid solver '{}'. Must be one of {}".format(
            solver, solvers))

    # Handle n_components==None
    if self.n_components is None:
        # TODO: handle nan shapes
        n_components = min(X.shape)
    elif 0 < self.n_components < 1:
        raise NotImplementedError("Fractional 'n_components' is not "
                                  "currently supported")
    else:
        n_components = self.n_components

    n_samples, n_features = X.shape

    if solver == "auto":
        # Small problem, just call full PCA
        if max(X.shape) <= 500:
            solver = "full"
        elif n_components >= 1 and n_components < 0.8 * min(X.shape):
            solver = "randomized"
        # This is also the case of n_components in (0,1)
        else:
            solver = "full"

    # The randomized solver needs at least one component.
    if solver == "randomized":
        lower_limit = 1
    else:
        lower_limit = 0

    if not (min(n_samples, n_features) >= n_components >= lower_limit):
        msg = ("n_components={} must be between {} and "
               "min(n_samples, n_features)={} with "
               "svd_solver='{}'".format(n_components, lower_limit,
                                        min(n_samples, n_features), solver))
        raise ValueError(msg)

    if sp.issparse(X):
        raise TypeError("Cannot fit PCA on sparse 'X'")

    self.mean_ = X.mean(0)
    X -= self.mean_

    if solver in {"full", "tsqr"}:
        U, S, V = da.linalg.svd(X)
    else:
        # randomized: draw an integer seed from the estimator's random state
        random_state = check_random_state(self.random_state)
        seed = random_state.randint(np.iinfo("int32").max)
        n_power_iter = self.iterated_power
        U, S, V = da.linalg.svd_compressed(X,
                                           n_components,
                                           n_power_iter=n_power_iter,
                                           seed=seed)
    U, V = svd_flip(U, V)

    explained_variance = (S**2) / (n_samples - 1)
    components, singular_values = V, S

    if solver == "randomized":
        # The compressed SVD only yields n_components singular values, so the
        # total variance has to come from the data itself.
        total_var = X.var(ddof=1, axis=0).sum()
    else:
        total_var = explained_variance.sum()
    explained_variance_ratio = explained_variance / total_var

    # TODO: n_components = 'mle' and fractional n_components are not handled
    # here (fractional values are rejected above).

    # Compute noise covariance using Probabilistic PCA model
    # The sigma2 maximum likelihood (cf. eq. 12.46)
    if n_components < min(n_features, n_samples):
        if solver == "randomized":
            noise_variance = (total_var.sum() - explained_variance.sum()) / (
                min(n_features, n_samples) - n_components)
        else:
            noise_variance = explained_variance[n_components:].mean()
    else:
        noise_variance = 0.0

    # Evaluate everything in a single `compute` call.
    (
        self.n_samples_,
        self.n_features_,
        self.n_components_,
        self.components_,
        self.explained_variance_,
        self.explained_variance_ratio_,
        self.singular_values_,
        self.noise_variance_,
    ) = compute(
        n_samples,
        n_features,
        n_components,
        components,
        explained_variance,
        explained_variance_ratio,
        singular_values,
        noise_variance,
    )

    # The exact solvers return every component; keep the requested ones only.
    if solver != "randomized":
        self.components_ = self.components_[:n_components]
        self.explained_variance_ = self.explained_variance_[:n_components]
        self.explained_variance_ratio_ = (
            self.explained_variance_ratio_[:n_components])
        self.singular_values_ = self.singular_values_[:n_components]

    return U, S, V
# Module-level fixtures.
# NOTE(review): `T` / `true_result` look like query points and their expected
# predictions for a toy problem defined outside this view -- confirm.
T = [[-1, -1], [2, 2], [3, 2]]
true_result = [-1, 1, 1]

# Larger classification sample used for testing feature importances
X_large, y_large = datasets.make_classification(n_samples=500,
                                                n_features=10,
                                                n_informative=3,
                                                n_redundant=0,
                                                n_repeated=0,
                                                shuffle=False,
                                                random_state=0)

# also load the iris dataset
# and randomly permute it
iris = datasets.load_iris()
# A single seeded generator is shared by the iris and boston permutations
# below, so the order of these statements determines the resulting data.
rng = check_random_state(0)
perm = rng.permutation(iris.target.size)
iris.data = iris.data[perm]
iris.target = iris.target[perm]

# also load the boston dataset
# and randomly permute it
boston = datasets.load_boston()
perm = rng.permutation(boston.target.size)
boston.data = boston.data[perm]
boston.target = boston.target[perm]

# also make a hastie_10_2 dataset
hastie_X, hastie_y = datasets.make_hastie_10_2(n_samples=20, random_state=1)
# Features are cast to float32.
hastie_X = hastie_X.astype(np.float32)
# # License: BSD 3 clause import numpy as np from sklearn.datasets import load_boston, load_breast_cancer from sklearn.metrics import mean_absolute_error from sklearn.utils.testing import assert_equal, assert_raises from sklearn.utils.validation import check_random_state from gplearn.genetic import SymbolicRegressor, SymbolicClassifier from gplearn.genetic import SymbolicTransformer from gplearn.fitness import make_fitness, _mean_square_error # load the breast cancer dataset and randomly permute it cancer = load_breast_cancer() perm = check_random_state(0).permutation(cancer.target.size) cancer.data = cancer.data[perm] cancer.target = cancer.target[perm] # load the boston dataset and randomly permute it boston = load_boston() perm = check_random_state(0).permutation(boston.target.size) boston.data = boston.data[perm] boston.target = boston.target[perm] def test_validate_fitness(): """Check that valid fitness measures are accepted & invalid raise error""" # Check arg count checks _ = make_fitness(function=_mean_square_error, greater_is_better=True)
# variable called 'model'. Don't actually train or do anything else # with it yet: # # .. your code here .. from sklearn import linear_model model = linear_model.LinearRegression() # # INFO: There are 50 takes of each clip. You want to pull out just one # of them, randomly, and that one will NOT be used in the training of # your model. In other words, the one file we'll be testing / scoring # on will be an unseen sample, independent to the rest of your # training set: from sklearn.utils.validation import check_random_state rng = check_random_state(7) # Leave this alone until you've submitted your lab random_idx = rng.randint(zero.shape[0]) test = zero[random_idx] train = np.delete(zero, [random_idx], axis=0) # # TODO: Print out the shape of train, and the shape of test # train will be shaped: [n_samples, n_audio_samples], where # n_audio_samples are the 'features' of the audio file # train will be shaped [n_audio_features], since it is a single # sample (audio file, e.g. observation). # # .. your code here .. print("train.shape = \n", train.shape) print("test.shape = \n", test.shape)
def _initialize_metric_mahalanobis(input, init='identity', random_state=None,
                                   return_inverse=False, strict_pd=False,
                                   matrix_name='matrix'):
    """Build the matrix used as prior or initialization of a Mahalanobis metric.

    Parameters
    ----------
    input : array-like
        The input samples (can be tuples or regular samples). Only its last
        axis length (``n_features``) is used, except for
        ``init='covariance'`` where the samples themselves are used.

    init : string or numpy array, optional (default='identity')
        Specification for the matrix to initialize. Possible options are
        'identity', 'covariance', 'random', and a numpy array of shape
        (n_features, n_features).

        'identity'
            An identity matrix of shape (n_features, n_features).

        'covariance'
            The (pseudo-)inverse covariance matrix (raises an error if the
            covariance matrix is not definite and `strict_pd == True`).

        'random'
            A random positive definite (PD) matrix of shape
            `(n_features, n_features)`, generated using
            `sklearn.datasets.make_spd_matrix`.

        numpy array
            A symmetric matrix of shape (n_features, n_features) that is
            used as such (after being copied). It must be strictly PD if
            `strict_pd == True`.

    random_state : int or `numpy.RandomState` or None, optional (default=None)
        A pseudo random number generator object or a seed for it if int.
        Used to draw the matrix when ``init='random'``.

    return_inverse : bool, optional (default=False)
        Whether to also return the (pseudo-)inverse of the matrix.

    strict_pd : bool, optional (default=False)
        Whether to enforce that the matrix is definite. If True and the
        matrix (or the covariance, for ``init='covariance'``) is not
        definite, a `LinAlgError` is raised.

    matrix_name : str, optional (default='matrix')
        The name of the matrix used (example: 'init', 'prior'). Will be used
        in error messages.

    Returns
    -------
    M, or (M, M_inv) : `numpy.ndarray`
        The initial matrix to use M, and its inverse if
        `return_inverse=True`.

    Raises
    ------
    ValueError
        If `init` is an array of the wrong shape, is not symmetric, or is
        not one of the supported options.
    LinAlgError
        If `strict_pd` is True and the relevant matrix is not definite.
    """
    n_features = input.shape[-1]
    init_is_array = isinstance(init, np.ndarray)

    # ---- validate `init` -------------------------------------------------
    if init_is_array:
        # Work on a copy so that later updates of the metric never modify the
        # array supplied by the caller.
        init = check_array(init, copy=True)

        expected_shape = (n_features, ) * 2
        if init.shape != expected_shape:
            raise ValueError('The input dimensionality {} of the given '
                             'mahalanobis matrix `{}` must match the '
                             'dimensionality of the given inputs ({}).'
                             .format(init.shape, matrix_name, n_features))

        if not np.allclose(init, init.T):
            raise ValueError("`{}` is not symmetric.".format(matrix_name))
    elif init not in ('identity', 'covariance', 'random'):
        raise ValueError(
            "`{}` must be 'identity', 'covariance', 'random' "
            "or a numpy array of shape (n_features, n_features)."
            .format(matrix_name))

    random_state = check_random_state(random_state)

    # ---- user-supplied matrix --------------------------------------------
    if init_is_array:
        M = init
        w, V = eigh(M, check_finite=False)
        if not _check_sdp_from_eigen(w):
            if strict_pd:
                raise LinAlgError(
                    "You should provide a strictly positive definite "
                    "matrix as `{}`. This one is not definite. Try another"
                    " {}, or an algorithm that does not "
                    "require the {} to be strictly positive definite."
                    .format(*((matrix_name, ) * 3)))
            if return_inverse:
                warnings.warn('The initialization matrix is not invertible: '
                              'using the pseudo-inverse instead.')
        if not return_inverse:
            return M
        return M, _pseudo_inverse_from_eig(w, V)

    # ---- identity --------------------------------------------------------
    if init == 'identity':
        M = np.eye(n_features, n_features)
        if not return_inverse:
            return M
        return M, M.copy()

    # ---- inverse covariance ----------------------------------------------
    if init == 'covariance':
        # if the input are tuples, we need to form an X by deduplication
        X = np.unique(np.vstack(input), axis=0) if input.ndim == 3 else input

        # atleast2d is necessary to deal with scalar covariance matrices
        M_inv = np.atleast_2d(np.cov(X, rowvar=False))
        w, V = eigh(M_inv, check_finite=False)
        if not _check_sdp_from_eigen(w):
            if strict_pd:
                raise LinAlgError(
                    "Unable to get a true inverse of the covariance "
                    "matrix since it is not definite. Try another "
                    "`{}`, or an algorithm that does not "
                    "require the `{}` to be strictly positive definite."
                    .format(*((matrix_name, ) * 2)))
            # NOTE(review): the second and third literals join without a
            # space ("instead.To make"); text left unchanged here in case
            # callers match on the message.
            warnings.warn(
                'The covariance matrix is not invertible: '
                'using the pseudo-inverse instead.'
                'To make the covariance matrix invertible'
                ' you can remove any linearly dependent features and/or '
                'reduce the dimensionality of your input, '
                'for instance using `sklearn.decomposition.PCA` as a '
                'preprocessing step.')
        M = _pseudo_inverse_from_eig(w, V)
        if not return_inverse:
            return M
        return M, M_inv

    # ---- random SPD matrix -----------------------------------------------
    if init == 'random':
        # we need to create a random symmetric matrix
        M = make_spd_matrix(n_features, random_state=random_state)
        if not return_inverse:
            return M
        # we use pinvh even if we know the matrix is definite, just because
        # we need the returned matrix to be symmetric (and sometimes
        # np.linalg.inv returns not symmetric inverses of symmetric matrices)
        # TODO: there might be a more efficient method to do so
        return M, pinvh(M)
def test_symbolic_regressor():
    """Check that SymbolicRegressor example works"""

    def _target(X):
        # Ground-truth relationship the regressor is expected to recover.
        return X[:, 0]**2 - X[:, 1]**2 + X[:, 1] - 1

    rng = check_random_state(0)
    # Training data is drawn before test data from the same generator.
    X_train = rng.uniform(-1, 1, 100).reshape(50, 2)
    y_train = _target(X_train)
    X_test = rng.uniform(-1, 1, 100).reshape(50, 2)
    y_test = _target(X_test)

    est_gp = SymbolicRegressor(population_size=5000,
                               generations=20,
                               stopping_criteria=0.01,
                               p_crossover=0.7,
                               p_subtree_mutation=0.1,
                               p_hoist_mutation=0.05,
                               p_point_mutation=0.1,
                               max_samples=0.9,
                               parsimony_coefficient=0.01,
                               random_state=0)
    est_gp.fit(X_train, y_train)

    # Evolution is expected to stop early, after 7 generations.
    assert_equal(len(est_gp._programs), 7)
    assert_equal(est_gp.__str__(),
                 'sub(add(-0.999, X1), mul(sub(X1, X0), add(X0, X1)))')
    assert_almost_equal(est_gp.score(X_test, y_test), 0.99999, decimal=5)

    # Graphviz export of the final program.
    best_dot = est_gp._program.export_graphviz()
    expected_best_dot = (
        'digraph program {\nnode [style=filled]\n0 [label="sub", '
        'fillcolor="#136ed4"] ;\n1 [label="add", fillcolor="#136ed4"] '
        ';\n2 [label="-0.999", fillcolor="#60a6f6"] ;\n3 [label="X1", '
        'fillcolor="#60a6f6"] ;\n1 -> 3 ;\n1 -> 2 ;\n4 [label="mul", '
        'fillcolor="#136ed4"] ;\n5 [label="sub", fillcolor="#136ed4"] '
        ';\n6 [label="X1", fillcolor="#60a6f6"] ;\n7 [label="X0", '
        'fillcolor="#60a6f6"] ;\n5 -> 7 ;\n5 -> 6 ;\n8 [label="add", '
        'fillcolor="#136ed4"] ;\n9 [label="X0", fillcolor="#60a6f6"] '
        ';\n10 [label="X1", fillcolor="#60a6f6"] ;\n8 -> 10 ;\n8 -> 9 '
        ';\n4 -> 8 ;\n4 -> 5 ;\n0 -> 4 ;\n0 -> 1 ;\n}')
    assert_equal(best_dot, expected_best_dot)

    # Genealogy recorded on the final program.
    assert_equal(
        est_gp._program.parents, {
            'method': 'Crossover',
            'parent_idx': 1555,
            'parent_nodes': range(1, 4),
            'donor_idx': 78,
            'donor_nodes': []
        })

    # Donor program, looked up in the second-to-last generation.
    donor_idx = est_gp._program.parents['donor_idx']
    donor_fade = est_gp._program.parents['donor_nodes']
    donor = est_gp._programs[-2][donor_idx]
    assert_equal(donor.__str__(), 'add(-0.999, X1)')
    assert_almost_equal(donor.fitness_, 0.351803319075)
    donor_dot = donor.export_graphviz(fade_nodes=donor_fade)
    expected_donor_dot = (
        'digraph program {\nnode [style=filled]\n0 [label="add", '
        'fillcolor="#136ed4"] ;\n1 [label="-0.999", '
        'fillcolor="#60a6f6"] ;\n2 [label="X1", fillcolor="#60a6f6"] '
        ';\n0 -> 2 ;\n0 -> 1 ;\n}')
    assert_equal(donor_dot, expected_donor_dot)

    # Parent program, looked up in the second-to-last generation; the nodes
    # listed in 'parent_nodes' are expected to be rendered in the faded color.
    parent_idx = est_gp._program.parents['parent_idx']
    parent_fade = est_gp._program.parents['parent_nodes']
    parent = est_gp._programs[-2][parent_idx]
    assert_equal(parent.__str__(),
                 'sub(sub(X1, 0.939), mul(sub(X1, X0), add(X0, X1)))')
    assert_almost_equal(parent.fitness_, 0.17080204042)
    parent_dot = parent.export_graphviz(fade_nodes=parent_fade)
    expected_parent_dot = (
        'digraph program {\nnode [style=filled]\n0 [label="sub", '
        'fillcolor="#136ed4"] ;\n1 [label="sub", fillcolor="#cecece"] '
        ';\n2 [label="X1", fillcolor="#cecece"] ;\n3 [label="0.939", '
        'fillcolor="#cecece"] ;\n1 -> 3 ;\n1 -> 2 ;\n4 [label="mul", '
        'fillcolor="#136ed4"] ;\n5 [label="sub", fillcolor="#136ed4"] '
        ';\n6 [label="X1", fillcolor="#60a6f6"] ;\n7 [label="X0", '
        'fillcolor="#60a6f6"] ;\n5 -> 7 ;\n5 -> 6 ;\n8 [label="add", '
        'fillcolor="#136ed4"] ;\n9 [label="X0", fillcolor="#60a6f6"] '
        ';\n10 [label="X1", fillcolor="#60a6f6"] ;\n8 -> 10 ;\n8 -> 9 '
        ';\n4 -> 8 ;\n4 -> 5 ;\n0 -> 4 ;\n0 -> 1 ;\n}')
    assert_equal(parent_dot, expected_parent_dot)
def _validate_train_parms(self, train_set, train_lab, classes=None):
    """Validate the training data and (lazily) initialize the prototypes.

    On the first call (``self.initial_fit`` is true) the class labels are
    taken from ``classes`` when given, otherwise from ``train_lab``, and the
    prototype arrays ``self.w_`` / ``self.c_w_`` are allocated. On every call
    the prototypes of classes that are not initialized yet and that occur in
    ``train_lab`` are set to the class mean plus uniform noise in [-1, 1).

    Parameters
    ----------
    train_set : array-like, shape (n_samples, n_features)
        Training samples.
    train_lab : array-like, shape (n_samples,)
        Labels of the training samples.
    classes : sequence or numpy array, optional (default=None)
        All class labels. Only used on the first call; when ``None`` or
        empty, the labels found in ``train_lab`` are used instead.

    Returns
    -------
    train_set, train_lab : the validated inputs.
    random_state : the random number generator built from
        ``self.random_state``.

    Raises
    ------
    ValueError
        If ``prototypes_per_class`` is not positive or does not match the
        number of classes, or if ``initial_prototypes`` has the wrong shape
        or labels that do not match the classes.
    """
    random_state = validation.check_random_state(self.random_state)
    train_set, train_lab = validation.check_X_y(train_set, train_lab)

    if self.initial_fit:
        # `classes` may be a numpy array, whose truth value is ambiguous, so
        # test for presence explicitly instead of relying on truthiness.
        if classes is not None and len(classes) > 0:
            self.classes_ = np.asarray(classes)
        else:
            self.classes_ = unique_labels(train_lab)
        self.protos_initialized = np.zeros(self.classes_.size)

    nb_classes = len(self.classes_)
    nb_features = train_set.shape[1]

    # set prototypes per class
    if isinstance(self.prototypes_per_class, int):
        # Zero prototypes is rejected as well, matching both the error
        # message and the check applied to array-valued settings below.
        if self.prototypes_per_class <= 0:
            raise ValueError("prototypes_per_class must be a positive int")
        # nb_ppc = number of protos per class
        nb_ppc = np.ones([nb_classes],
                         dtype='int') * self.prototypes_per_class
    else:
        nb_ppc = validation.column_or_1d(
            validation.check_array(self.prototypes_per_class,
                                   ensure_2d=False, dtype='int'))
        if nb_ppc.min() <= 0:
            raise ValueError(
                "values in prototypes_per_class must be positive")
        if nb_ppc.size != nb_classes:
            raise ValueError("length of prototypes per class"
                             " does not fit the number of classes\n"
                             "classes=%d\n"
                             "length=%d" % (nb_classes, nb_ppc.size))

    # initialize prototypes
    if self.initial_prototypes is None:
        if self.initial_fit:
            self.w_ = np.empty([np.sum(nb_ppc), nb_features],
                               dtype=np.double)
            self.c_w_ = np.empty([nb_ppc.sum()], dtype=self.classes_.dtype)

        present_labels = None  # labels in this batch, computed on demand
        pos = 0
        for class_idx, label in enumerate(self.classes_):
            nb_prot = nb_ppc[class_idx]
            if self.protos_initialized[class_idx] == 0:
                if present_labels is None:
                    present_labels = unique_labels(train_lab)
                if label in present_labels:
                    mean = np.mean(train_set[train_lab == label, :], 0)
                    self.w_[pos:pos + nb_prot] = mean + (
                        random_state.rand(nb_prot, nb_features) * 2 - 1)
                    # A NaN prototype means initialization failed; leave the
                    # class flagged as uninitialized so a later call retries.
                    if math.isnan(self.w_[pos, 0]):
                        print('null: ', class_idx)
                        self.protos_initialized[class_idx] = 0
                    else:
                        self.protos_initialized[class_idx] = 1
                    self.c_w_[pos:pos + nb_prot] = label
            pos += nb_prot
    else:
        # The last column of `initial_prototypes` holds the prototype labels.
        x = validation.check_array(self.initial_prototypes)
        self.w_ = x[:, :-1]
        self.c_w_ = x[:, -1]
        if self.w_.shape != (np.sum(nb_ppc), nb_features):
            raise ValueError("the initial prototypes have wrong shape\n"
                             "found=(%d,%d)\n"
                             "expected=(%d,%d)" %
                             (self.w_.shape[0], self.w_.shape[1],
                              nb_ppc.sum(), nb_features))
        if set(self.c_w_) != set(self.classes_):
            raise ValueError(
                "prototype labels and test data classes do not match\n"
                "classes={}\n"
                "prototype labels={}\n".format(self.classes_, self.c_w_))

    if self.initial_fit:
        self.initial_fit = False

    return train_set, train_lab, random_state
def func_consensus(data, n_boot=1000, ci=95, seed=None):
    """
    Calculates thresholded group consensus functional connectivity graph

    All time series in `data` are concatenated and a group correlation matrix
    is computed from the extended series. Bootstrapped samples of length `T`
    are then drawn from the concatenated series to estimate percentile bounds
    for every correlation. Correlations whose bounds share the same sign are
    retained; the others are set to zero.

    If `n_boot` is set to 0 or None a simple, group-averaged functional
    connectivity matrix is estimated, instead.

    Parameters
    ----------
    data : (N, T, S) array_like (or a list of S arrays, each shaped as (N, T))
        Pre-processed functional time series, where `N` is the number of
        nodes, `T` is the number of volumes in the time series, and `S` is
        the number of subjects.
    n_boot : int, optional
        Number of bootstraps for which to generate correlation. Default: 1000
    ci : (0, 100) float, optional
        Confidence interval for which to assess the reliability of
        correlations with bootstraps. The percentiles `100 - ci` and `ci` of
        the bootstrap distribution are used as bounds. Default: 95
    seed : int, optional
        Random seed. Default: None

    Returns
    -------
    consensus : (N, N) numpy.ndarray
        Thresholded, group-level correlation matrix

    Raises
    ------
    ValueError
        If `ci` is outside of [0, 100].

    References
    ----------
    Mišić, B., Betzel, R. F., Nematzadeh, A., Goni, J., Griffa, A., Hagmann,
    P., Flammini, A., Ahn, Y.-Y., & Sporns, O. (2015). Cooperative and
    competitive spreading dynamics on the human connectome. Neuron, 86(6),
    1518-1529.
    """
    # check inputs
    rs = check_random_state(seed)
    if ci > 100 or ci < 0:
        raise ValueError("`ci` must be between 0 and 100.")

    given_as_list = isinstance(data, list)

    # no bootstrapping requested: average the per-subject correlations
    if n_boot == 0 or n_boot is None:
        if given_as_list:
            subject_corrs = [np.corrcoef(sub) for sub in data]
        else:
            n_subjects = data.shape[-1]
            subject_corrs = [
                np.corrcoef(data[..., sub]) for sub in range(n_subjects)
            ]
        return np.mean(subject_corrs, axis=0)

    # concatenate all subjects along the time axis
    if given_as_list:
        collapsed_data = np.hstack(data)
        nsample = int(collapsed_data.shape[-1] / len(data))
    else:
        collapsed_data = data.reshape((len(data), -1), order='F')
        nsample = data.shape[1]
    n_timepoints = collapsed_data.shape[-1]

    consensus = np.corrcoef(collapsed_data)

    # the bootstraps only store the upper triangle to save on memory usage
    triu_inds = np.triu_indices_from(consensus, k=1)
    bootstrapped_corrmat = np.zeros((len(triu_inds[0]), n_boot))

    # each bootstrap resamples `nsample` time points (with replacement) from
    # the concatenated series and recomputes the correlation matrix
    for boot in range(n_boot):
        inds = rs.randint(n_timepoints, size=nsample)
        resampled = np.corrcoef(collapsed_data[:, inds])
        bootstrapped_corrmat[..., boot] = resampled[triu_inds]

    # percentile bounds per connection; the bootstrap array is not needed
    # afterwards so it may be overwritten
    bootstrapped_ci = np.percentile(bootstrapped_corrmat, [100 - ci, ci],
                                    axis=-1, overwrite_input=True)

    # a connection is unreliable when the signs of its two bounds sum to
    # zero (i.e., the interval crosses zero); those entries are zeroed in
    # both triangles of the consensus matrix
    unreliable = np.sign(bootstrapped_ci).sum(axis=0) == 0
    mask = np.zeros_like(consensus, dtype=bool)
    mask[triu_inds] = unreliable
    consensus[mask | mask.T] = 0

    return consensus
def fit(self, X, y):
    """Build a forest of trees from the training set (X, y) using random
    intervals and spectral features.

    One estimator is fitted per interval. The first interval spans the whole
    series; the remaining ones are drawn at random. For each interval the
    autocorrelation (``acf``) and power-spectrum (``ps``) features of every
    instance are concatenated and used to fit a clone of
    ``self.base_estimator``.

    Parameters
    ----------
    X : array-like or sparse matrix of shape = [n_instances, series_length]
        or shape = [n_instances, n_columns]
        The training input samples. If a Pandas data frame is passed it must
        have a single column (i.e. univariate classification); RISE has no
        bespoke method for multivariate classification as yet.
    y : array-like, shape = [n_instances]
        The class labels.

    Returns
    -------
    self : object
    """
    X, y = check_X_y(X, y, enforce_univariate=True, coerce_to_numpy=True)
    X = X.squeeze(1)
    n_instances, self.series_length = X.shape

    rng = check_random_state(self.random_state)

    self.estimators_ = []
    self.n_classes = np.unique(y).shape[0]
    self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]

    # Interval bounds, one (start, end) row per estimator.
    self.intervals = np.zeros((self.n_estimators, 2), dtype=int)
    self.intervals[0][0] = 0
    self.intervals[0][1] = self.series_length
    for est_idx in range(1, self.n_estimators):
        start = rng.randint(self.series_length - self.min_interval)
        self.intervals[est_idx][0] = start
        self.intervals[est_idx][1] = rng.randint(start + self.min_interval,
                                                 self.series_length)

    # Check lag against global properties
    self.acf_lag_ = self.acf_lag
    longest_lag = self.series_length - self.acf_min_values
    if self.acf_lag > longest_lag:
        self.acf_lag_ = longest_lag
    if self.acf_lag < 0:
        self.acf_lag_ = 1

    self.lags = np.zeros(self.n_estimators, dtype=int)
    for est_idx in range(self.n_estimators):
        start = self.intervals[est_idx][0]
        end = self.intervals[est_idx][1]

        # Clip the lag to what this particular interval allows.
        temp_lag = min(self.acf_lag_, end - start - self.acf_min_values)
        if temp_lag < 0:
            temp_lag = 1
        self.lags[est_idx] = int(temp_lag)

        acf_x = np.empty(shape=(n_instances, self.lags[est_idx]))
        ps_len = (end - start) / 2
        ps_x = np.empty(shape=(n_instances, int(ps_len)))
        for row in range(n_instances):
            segment = X[row, start:end]
            acf_x[row] = acf(segment, temp_lag)
            ps_x[row] = ps(segment)
        transformed_x = np.concatenate((acf_x, ps_x), axis=1)

        tree = clone(self.base_estimator)
        # set random state, but not the same, so that estimators vary
        tree_seed = rng.randint(np.iinfo(np.int32).max)
        tree.set_params(random_state=tree_seed)
        tree.fit(transformed_x, y)
        self.estimators_.append(tree)

    self._is_fitted = True
    return self
def test_format_invariance_with_1d_vectors(name):
    """Check that metric ``name`` does not depend on the 1d input container.

    The metric value computed on two random binary vectors must be the same
    whether the inputs are lists, 1d arrays, column vectors, or a mix of
    them. Mixing a row vector with any other representation must raise a
    ValueError.
    """
    rng = check_random_state(0)
    y1 = rng.randint(0, 2, size=(20, ))
    y2 = rng.randint(0, 2, size=(20, ))

    y1_list, y2_list = list(y1), list(y2)

    y1_1d, y2_1d = np.array(y1), np.array(y2)
    assert_array_equal(y1_1d.ndim, 1)
    assert_array_equal(y2_1d.ndim, 1)

    y1_column, y2_column = (np.reshape(y1_1d, (-1, 1)),
                            np.reshape(y2_1d, (-1, 1)))
    y1_row, y2_row = np.reshape(y1_1d, (1, -1)), np.reshape(y2_1d, (1, -1))

    # (first argument, second argument, failure message template)
    equivalent_inputs = [
        (y1_list, y2_list,
         "%s is not representation invariant with list"),
        (y1_1d, y2_1d,
         "%s is not representation invariant with np-array-1d"),
        (y1_column, y2_column,
         "%s is not representation invariant with np-array-column"),
        # Mix format support
        (y1_1d, y2_list,
         "%s is not representation invariant with mix np-array-1d and list"),
        (y1_list, y2_1d,
         "%s is not representation invariant with mix np-array-1d and list"),
        (y1_1d, y2_column,
         "%s is not representation invariant with mix "
         "np-array-1d and np-array-column"),
        (y1_column, y2_1d,
         "%s is not representation invariant with mix "
         "np-array-1d and np-array-column"),
        (y1_list, y2_column,
         "%s is not representation invariant with mix "
         "list and np-array-column"),
        (y1_column, y2_list,
         "%s is not representation invariant with mix "
         "list and np-array-column"),
    ]

    # These mix representations aren't allowed
    rejected_inputs = [
        (y1_1d, y2_row),
        (y1_row, y2_1d),
        (y1_list, y2_row),
        (y1_row, y2_list),
        (y1_column, y2_row),
        (y1_row, y2_column),
    ]

    with ignore_warnings():
        metric = ALL_METRICS[name]
        measure = metric(y1, y2)

        for first, second, template in equivalent_inputs:
            assert_allclose(metric(first, second), measure,
                            err_msg=template % name)

        for first, second in rejected_inputs:
            assert_raises(ValueError, metric, first, second)

        # NB: We do not test for y1_row, y2_row as these may be
        # interpreted as multilabel or multioutput data.
        row_pair_is_valid = name in (MULTIOUTPUT_METRICS |
                                     THRESHOLDED_MULTILABEL_METRICS |
                                     MULTILABELS_METRICS)
        if not row_pair_is_valid:
            assert_raises(ValueError, metric, y1_row, y2_row)
def test_invariance_string_vs_numbers_labels():
    """Check that metrics give the same result for string and integer labels.

    Two random binary label vectors are mapped to the strings "eggs"/"spam".
    Classification metrics must return the same value for the numeric
    vectors, the string vectors and their object-dtype versions. Thresholded
    metrics are checked with string ``y_true`` against the numeric second
    argument; those listed in METRIC_UNDEFINED_BINARY are expected to raise
    a ValueError instead.
    """
    random_state = check_random_state(0)
    y1 = random_state.randint(0, 2, size=(20, ))
    y2 = random_state.randint(0, 2, size=(20, ))

    y1_str = np.array(["eggs", "spam"])[y1]
    y2_str = np.array(["eggs", "spam"])[y2]

    pos_label_str = "spam"
    labels_str = ["eggs", "spam"]

    for name, metric in CLASSIFICATION_METRICS.items():
        if name in METRIC_UNDEFINED_BINARY_MULTICLASS:
            continue

        measure_with_number = metric(y1, y2)

        # Ugly, but handle case with a pos_label and label
        metric_str = metric
        if name in METRICS_WITH_POS_LABEL:
            metric_str = partial(metric_str, pos_label=pos_label_str)

        measure_with_str = metric_str(y1_str, y2_str)
        assert_array_equal(measure_with_number, measure_with_str,
                           err_msg="{0} failed string vs number invariance "
                                   "test".format(name))

        measure_with_strobj = metric_str(y1_str.astype('O'),
                                         y2_str.astype('O'))
        assert_array_equal(measure_with_number, measure_with_strobj,
                           err_msg="{0} failed string object vs number "
                                   "invariance test".format(name))

        if name in METRICS_WITH_LABELS:
            # Repeat both checks with the label set passed explicitly.
            metric_str = partial(metric_str, labels=labels_str)
            measure_with_str = metric_str(y1_str, y2_str)
            assert_array_equal(measure_with_number, measure_with_str,
                               err_msg="{0} failed string vs number "
                                       "invariance test".format(name))

            measure_with_strobj = metric_str(y1_str.astype('O'),
                                             y2_str.astype('O'))
            assert_array_equal(measure_with_number, measure_with_strobj,
                               err_msg="{0} failed string vs number "
                                       "invariance test".format(name))

    for name, metric in THRESHOLDED_METRICS.items():
        if name not in METRIC_UNDEFINED_BINARY:
            # Ugly, but handle case with a pos_label and label
            metric_str = metric
            if name in METRICS_WITH_POS_LABEL:
                metric_str = partial(metric_str, pos_label=pos_label_str)

            measure_with_number = metric(y1, y2)
            measure_with_str = metric_str(y1_str, y2)
            assert_array_equal(measure_with_number, measure_with_str,
                               err_msg="{0} failed string vs number "
                                       "invariance test".format(name))

            # Use the pos_label-aware callable here too, so the object-dtype
            # check exercises the same configuration as the str-dtype check.
            measure_with_strobj = metric_str(y1_str.astype('O'), y2)
            assert_array_equal(measure_with_number, measure_with_strobj,
                               err_msg="{0} failed string object vs number "
                                       "invariance test".format(name))
        else:
            # TODO those metrics doesn't support string label yet
            assert_raises(ValueError, metric, y1_str, y2)
            assert_raises(ValueError, metric, y1_str.astype('O'), y2)
def select(self, competences):
    """Select the most competent classifier for the classification of the
    query sample given the competence level estimates. Four selection schemes
    are available.

    Best : The base classifier with the highest competence level is selected.
    In cases where more than one base classifier achieves the same competence
    level, the one with the lowest index is selected. This method is the
    standard for the LCA, OLA, MLA techniques.

    Diff : Select the base classifier that is significantly better than the
    others in the pool (when the difference between its competence level and
    the competence level of the other base classifiers is higher than a
    predefined threshold). If no base classifier is significantly better, the
    base classifier is selected randomly among the members with equivalent
    competence level.

    Random : Selects a random base classifier among all base classifiers that
    achieved the same (maximum) competence level.

    ALL : all base classifiers with the max competence level estimates are
    selected (note that in this case the DCS technique becomes a DES
    technique).

    Parameters
    ----------
    competences : array of shape = [n_samples, n_classifiers]
        Competence level estimated for each base classifier and test example.
        A 1-D array is treated as a single sample.

    Returns
    -------
    selected_classifiers : array of shape [n_samples]
        Indices of the selected base classifier for each sample. If the
        selection_method is set to 'all', a boolean matrix is returned,
        containing True for the selected base classifiers, otherwise false.
        An empty list is returned for an unrecognized selection_method.
    """
    if competences.ndim < 2:
        competences = competences.reshape(1, -1)

    n_samples = competences.shape[0]
    selected_classifiers = []
    best_index = np.argmax(competences, axis=1)

    if self.selection_method == 'best':
        # Select the classifier with highest competence level
        selected_classifiers = best_index

    elif self.selection_method == 'diff':
        # Selects a base classifier if its competence level is significant
        # better than the rest. If there is no such classifier, select
        # randomly a base model.
        #
        # the best classifier will always have diff < diff_thresh. In a
        # case it is superior than all others, it will be the only member
        # selected. Otherwise, a random classifier from this list is
        # selected.
        rng = check_random_state(self.random_state)
        best_competence = competences[np.arange(n_samples), best_index]
        diff = best_competence.reshape(-1, 1) - competences

        # Builtin `int` is used as dtype: the `np.int` alias no longer exists
        # in recent NumPy releases.
        selected_classifiers = np.zeros(n_samples, dtype=int)
        for row in range(n_samples):
            indices = np.flatnonzero(diff[row, :] < self.diff_thresh)
            if indices.size == 0:
                indices = range(self.n_classifiers_)
            selected_classifiers[row] = rng.choice(indices)

    elif self.selection_method == 'random':
        rng = check_random_state(self.random_state)
        best_competence = competences[np.arange(n_samples), best_index]

        selected_classifiers = np.zeros(n_samples, dtype=int)
        for row in range(n_samples):
            # Select a random classifier among all with same competence
            # level
            indices = np.flatnonzero(
                competences[row, :] == best_competence[row])
            selected_classifiers[row] = rng.choice(indices)

    elif self.selection_method == 'all':
        # select all base classifiers with max competence estimates.
        max_value = np.max(competences, axis=1)
        selected_classifiers = (
            competences == max_value.reshape(n_samples, -1))

    return selected_classifiers
def test_format_invariance_with_1d_vectors():
    # Each metric must give the same value for lists, 1-d arrays and column
    # vectors (and mixes of them), and must reject mixes involving a row
    # vector.
    rng = check_random_state(0)
    y1 = rng.randint(0, 2, size=(20, ))
    y2 = rng.randint(0, 2, size=(20, ))

    y1_list, y2_list = list(y1), list(y2)

    y1_1d, y2_1d = np.array(y1), np.array(y2)
    assert_equal(y1_1d.ndim, 1)
    assert_equal(y2_1d.ndim, 1)
    y1_column, y2_column = (np.reshape(y1_1d, (-1, 1)),
                            np.reshape(y2_1d, (-1, 1)))
    y1_row, y2_row = (np.reshape(y1_1d, (1, -1)),
                      np.reshape(y2_1d, (1, -1)))

    # (first argument, second argument, description used in the message)
    equivalent_inputs = (
        (y1_list, y2_list, "list"),
        (y1_1d, y2_1d, "np-array-1d"),
        (y1_column, y2_column, "np-array-column"),
        # Mix format support
        (y1_1d, y2_list, "mix np-array-1d and list"),
        (y1_list, y2_1d, "mix np-array-1d and list"),
        (y1_1d, y2_column, "mix np-array-1d and np-array-column"),
        (y1_column, y2_1d, "mix np-array-1d and np-array-column"),
        (y1_list, y2_column, "mix list and np-array-column"),
        (y1_column, y2_list, "mix list and np-array-column"),
    )

    # These mix representations aren't allowed
    rejected_inputs = (
        (y1_1d, y2_row),
        (y1_row, y2_1d),
        (y1_list, y2_row),
        (y1_row, y2_list),
        (y1_column, y2_row),
        (y1_row, y2_column),
    )

    # Metrics for which (row, row) may legitimately be read as multilabel
    # or multioutput data.
    row_row_accepted = (MULTIOUTPUT_METRICS + THRESHOLDED_MULTILABEL_METRICS +
                        MULTILABELS_METRICS)

    for name, metric in ALL_METRICS.items():
        if name in METRIC_UNDEFINED_BINARY_MULTICLASS:
            continue

        measure = metric(y1, y2)

        for first, second, description in equivalent_inputs:
            assert_almost_equal(
                metric(first, second), measure,
                err_msg="%s is not representation invariant with %s"
                        % (name, description))

        for first, second in rejected_inputs:
            assert_raises(ValueError, metric, first, second)

        # NB: (y1_row, y2_row) is only required to fail for metrics that
        # support neither multilabel nor multioutput data.
        if name not in row_row_accepted:
            assert_raises(ValueError, metric, y1_row, y2_row)
def fit(self, X, y, check_input=True):
    """Build a similarity forest regressor from the training set (X, y).

    Parameters
    ----------
    X : array-like of any type, as long as suitable similarity function
        is provided
        The training input samples.
    y : array-like, shape = [n_samples]
        The training outputs.
    check_input : bool, default=True
        When True, X and y are validated with ``check_X_y`` and
        ``check_array`` before fitting.

    Returns
    -------
    self : object

    Raises
    ------
    ValueError
        If the criterion is 'theil' or 'atkinson' and y contains a
        negative value, or if ``n_directions`` is not an int.
    """
    # Check input
    if check_input:
        # Check that X and y have correct shape
        X, y = check_X_y(X, y)
        # Non-empty 2D array containing only finite values
        X = check_array(X)

    # Check if provided similarity function applies to input
    X = self._validate_X_predict(X, check_input)

    # Convert before the sign check so that plain sequences also work when
    # check_input=False.
    y = np.atleast_1d(y)

    if self.criterion in ('theil', 'atkinson') and not np.all(y >= 0):
        raise ValueError(
            'When using Theil or Atkinson indexes, one need to make sure y has all positive values'
        )

    # Validate parameters before any fitted state is written.
    if not isinstance(self.n_directions, int):
        raise ValueError('n_directions parameter must be an int')

    self.base_estimator_ = SimilarityTreeRegressor
    random_state = check_random_state(self.random_state)

    # Default similarity functions: dot product or rbf kernel
    if self.sim_function == 'dot':
        self.sim_function = dot_product
    elif self.sim_function == 'rbf':
        self.sim_function = rbf

    self.oob_score_ = 0.0
    self.X_ = X
    self.y_ = y
    self.random_state_ = random_state

    self.estimators_ = [
        SimilarityTreeRegressor(
            n_directions=self.n_directions,
            sim_function=self.sim_function,
            random_state=self.random_state,
            max_depth=self.max_depth,
            min_samples_split=self.min_samples_split,
            min_samples_leaf=self.min_samples_leaf,
            discriminative_sampling=self.discriminative_sampling,
            criterion=self.criterion,
            gamma=self.gamma)
        for _ in range(self.n_estimators)
    ]

    # Fit the trees in worker processes. The pool is always shut down, even
    # when a worker raises, so processes are not leaked.
    pool = Pool(processes=4)
    try:
        self.estimators_ = pool.map(self.fit_tree_, self.estimators_)
    finally:
        pool.close()
        pool.join()

    # NOTE(review): oob_score_ is reset to 0.0 above and only averaged here;
    # whether fit_tree_ can accumulate into it across processes should be
    # confirmed.
    if self.oob_score:
        self.oob_score_ /= self.n_estimators

    assert len(self.estimators_) == self.n_estimators
    self.is_fitted_ = True

    return self
def __init__(self, input_shape=(100, 28 * 28), random_state=None,
             batch_size=100, num_layers=4, num_units_per_layer=(10, 10, 10),
             dropout_per_layer=(0.5, 0.5, 0.5),
             std_per_layer=(0.005, 0.005, 0.005), num_output_units=2,
             dropout_output=0.5, learning_rate=0.01, lambda2=1e-4,
             momentum=0.9, beta1=0.9, beta2=0.9, rho=0.95, solver='adam',
             num_epochs=2, lr_policy='fixed', gamma=0.01, power=1.0,
             epoch_step=1, activation_per_layer=('relu', ) * 3,
             weight_init_per_layer=('henormal', ) * 3,
             leakiness_per_layer=(1. / 3., ) * 3,
             tanh_alpha_per_layer=(2. / 3., ) * 3,
             tanh_beta_per_layer=(1.7159, ) * 3, is_sparse=False,
             is_binary=False, is_regression=False, is_multilabel=False):
    """Store the hyper-parameters and build the network and training function.

    The constructor stacks ``num_layers - 1`` dropout + dense layers on an
    input layer, adds a dropout + dense output layer, builds an
    L2-regularised mean loss, selects the update rule named by ``solver``
    (anything unrecognised falls back to plain SGD) and compiles
    ``self.train_fn``. ``self.update_function`` is obtained from
    ``self._policy_function()``.
    """
    float_dtype = theano.config.floatX

    def as_float_array(value):
        # numpy value in theano's configured float dtype
        return np.asarray(value, dtype=float_dtype)

    def as_float_tensor(value):
        # theano cast to the configured float dtype
        return T.cast(value, dtype=float_dtype)

    # ---- hyper-parameters ------------------------------------------------
    self.random_state = random_state
    self.batch_size = batch_size
    self.input_shape = input_shape
    self.num_layers = num_layers
    self.num_units_per_layer = num_units_per_layer
    self.dropout_per_layer = as_float_array(dropout_per_layer)
    self.num_output_units = num_output_units
    self.dropout_output = as_float_tensor(dropout_output)
    self.activation_per_layer = activation_per_layer
    self.weight_init_per_layer = weight_init_per_layer
    self.std_per_layer = as_float_array(std_per_layer)
    self.leakiness_per_layer = as_float_array(leakiness_per_layer)
    self.tanh_alpha_per_layer = as_float_array(tanh_alpha_per_layer)
    self.tanh_beta_per_layer = as_float_array(tanh_beta_per_layer)
    self.momentum = as_float_tensor(momentum)
    self.init_learning_rate = as_float_array(learning_rate)
    self.learning_rate = as_float_array(learning_rate)
    self.lambda2 = as_float_tensor(lambda2)
    self.beta1 = as_float_tensor(beta1)
    self.beta2 = as_float_tensor(beta2)
    self.rho = as_float_tensor(rho)
    self.num_epochs = num_epochs
    self.lr_policy = lr_policy
    self.gamma = as_float_array(gamma)
    self.power = as_float_array(power)
    self.epoch_step = as_float_array(epoch_step)
    self.is_binary = is_binary
    self.is_regression = is_regression
    self.is_multilabel = is_multilabel
    self.is_sparse = is_sparse
    self.solver = solver

    # Binary and multilabel tasks share the sigmoid / binary cross-entropy
    # setup below.
    sigmoid_task = self.is_binary or self.is_multilabel

    # ---- symbolic inputs -------------------------------------------------
    # A dense matrix is used whether or not ``is_sparse`` is set.
    input_var = T.matrix('inputs')
    if sigmoid_task or self.is_regression:
        target_var = T.matrix('targets')
    else:
        target_var = T.ivector('targets')

    if DEBUG:
        if self.is_binary:
            print("... using binary loss")
        if self.is_multilabel:
            print("... using multilabel prediction")
        if self.is_regression:
            print("... using regression loss")
        print("... building network!")
        print("Input shape:", input_shape)
        print("... with number of epochs:")
        print(num_epochs)

    # Added for reproducibility
    seed = check_random_state(self.random_state)
    lasagne.random.set_rng(seed)

    # ---- network ---------------------------------------------------------
    self.network = lasagne.layers.InputLayer(shape=input_shape,
                                             input_var=input_var)

    # Hidden layers: dropout followed by a dense layer
    for layer_idx in range(num_layers - 1):
        init_weight = self._choose_weight_init(layer_idx)
        activation_function = self._choose_activation(layer_idx)
        dropped = lasagne.layers.dropout(
            self.network, p=self.dropout_per_layer[layer_idx])
        self.network = lasagne.layers.DenseLayer(
            dropped,
            num_units=self.num_units_per_layer[layer_idx],
            W=init_weight,
            b=lasagne.init.Constant(val=0.0),
            nonlinearity=activation_function)

    # Output nonlinearity and loss both depend on the task type
    if self.is_regression:
        output_activation = lasagne.nonlinearities.linear
    elif sigmoid_task:
        output_activation = lasagne.nonlinearities.sigmoid
    else:
        output_activation = lasagne.nonlinearities.softmax

    output_input = lasagne.layers.dropout(self.network,
                                          p=self.dropout_output)
    self.network = lasagne.layers.DenseLayer(
        output_input,
        num_units=self.num_output_units,
        W=lasagne.init.GlorotNormal(),
        b=lasagne.init.Constant(),
        nonlinearity=output_activation)

    prediction = lasagne.layers.get_output(self.network)

    if self.is_regression:
        loss_function = lasagne.objectives.squared_error
    elif sigmoid_task:
        loss_function = lasagne.objectives.binary_crossentropy
    else:
        loss_function = lasagne.objectives.categorical_crossentropy

    # Mean loss (for every task type) plus an L2 penalty on all layers'
    # parameters.
    loss = T.mean(loss_function(prediction, target_var), dtype=float_dtype)
    l2_penalty = self.lambda2 * lasagne.regularization.regularize_network_params(
        self.network, lasagne.regularization.l2)
    loss += l2_penalty

    params = lasagne.layers.get_all_params(self.network, trainable=True)

    # Symbolic scalar learning rate for the loss & updates function
    lr_scalar = T.scalar('lr', dtype=float_dtype)

    # ---- update rule -----------------------------------------------------
    if solver == "nesterov":
        updates = lasagne.updates.nesterov_momentum(
            loss, params, learning_rate=lr_scalar, momentum=self.momentum)
    elif solver == "adam":
        updates = lasagne.updates.adam(loss, params,
                                       learning_rate=lr_scalar,
                                       beta1=self.beta1,
                                       beta2=self.beta2)
    elif solver == "adadelta":
        updates = lasagne.updates.adadelta(loss, params,
                                           learning_rate=lr_scalar,
                                           rho=self.rho)
    elif solver == "adagrad":
        updates = lasagne.updates.adagrad(loss, params,
                                          learning_rate=lr_scalar)
    elif solver == "momentum":
        updates = lasagne.updates.momentum(loss, params,
                                           learning_rate=lr_scalar,
                                           momentum=self.momentum)
    elif solver == "smorm3s":
        updates = smorm3s(loss, params, learning_rate=lr_scalar)
    else:
        # "sgd" and every unrecognised solver name
        updates = lasagne.updates.sgd(loss, params,
                                      learning_rate=lr_scalar)

    # Validation was removed, as auto-sklearn handles that. If this net is
    # to be used independently, validation accuracy has to be included.
    if DEBUG:
        print("... compiling theano functions")
    self.train_fn = theano.function([input_var, target_var, lr_scalar],
                                    loss,
                                    updates=updates,
                                    allow_input_downcast=True,
                                    profile=False,
                                    on_unused_input='warn',
                                    name='train_fn')
    if DEBUG:
        print('... compiling update function')
    self.update_function = self._policy_function()
def _validate_train_parms(self, train_set, train_lab, classes=None):
    """Validate the training data and set up the prototypes.

    On the initial fit the class list is taken from ``classes`` (when given
    and non-empty) or derived from ``train_lab``. Prototypes of a class are
    initialised the first time that class appears in ``train_lab``, unless
    ``self.initial_prototypes`` supplies them explicitly.

    Parameters
    ----------
    train_set : array-like, shape = [n_samples, n_features]
        Training samples.
    train_lab : array, shape = [n_samples] (ravelled before validation)
        Training labels.
    classes : array-like or None, optional
        All class labels; only consulted while ``self.initial_fit`` is set.

    Returns
    -------
    train_set, train_lab : the validated inputs returned by ``check_X_y``.

    Raises
    ------
    ValueError
        If the labels are not the integers 0..n_classes-1, if
        ``prototypes_per_class`` is invalid, if an initialised prototype is
        NaN, or if ``initial_prototypes`` does not match the expected shape
        or classes.
    """
    random_state = validation.check_random_state(self.random_state)
    train_set, train_lab = validation.check_X_y(train_set,
                                                train_lab.ravel())

    if self.initial_fit:
        # Explicit None/length test: the truth value of a numpy array is
        # ambiguous for several elements and False for ``array([0])``.
        if classes is not None and len(classes) > 0:
            self.classes_ = np.asarray(classes)
        else:
            self.classes_ = unique_labels(train_lab)
        self.protos_initialized = np.zeros(self.classes_.size)

    # Validate that labels have correct format
    for i in range(len(self.classes_)):
        if i not in self.classes_:
            raise ValueError(
                'Labels have to be ascending int, starting at 0, got {}'
                .format(self.classes_))

    nb_classes = len(self.classes_)
    nb_features = train_set.shape[1]

    # set prototypes per class
    if isinstance(self.prototypes_per_class, int):
        # Same number of prototypes for every class; zero is rejected too,
        # matching the check applied to the list form below.
        if self.prototypes_per_class <= 0:
            raise ValueError('prototypes_per_class must be a positive int')
        # nb_ppc = number of protos per class
        nb_ppc = np.ones([nb_classes],
                         dtype='int') * self.prototypes_per_class
    elif isinstance(self.prototypes_per_class, list):
        # Individual number of prototypes per class - not fully supported
        # yet
        nb_ppc = validation.column_or_1d(
            validation.check_array(self.prototypes_per_class,
                                   ensure_2d=False, dtype='int'))
        if nb_ppc.min() <= 0:
            raise ValueError(
                'values in prototypes_per_class must be positive')
        if nb_ppc.size != nb_classes:
            raise ValueError(
                'length of prototypes_per_class'
                ' does not fit the number of classes\n'
                'classes=%d\n'
                'length=%d' % (nb_classes, nb_ppc.size))
    else:
        raise ValueError('Invalid data type for prototypes_per_class, '
                         'must be int or list of int')

    # initialize prototypes
    if self.initial_prototypes is None:
        if self.initial_fit:
            self.w_ = np.empty([np.sum(nb_ppc), nb_features],
                               dtype=np.double)
            self.c_w_ = np.empty([nb_ppc.sum()],
                                 dtype=self.classes_.dtype)

        # Labels present in this batch; computed at most once, and only if
        # some class still lacks prototypes.
        present_labels = None
        pos = 0
        for act_class_idx in range(len(self.classes_)):
            act_class = self.classes_[act_class_idx]
            nb_prot = nb_ppc[act_class_idx]  # prototypes for this class

            if self.protos_initialized[act_class_idx] == 0:
                if present_labels is None:
                    present_labels = unique_labels(train_lab)

                if act_class in present_labels:
                    mean = np.mean(
                        train_set[train_lab == act_class, :], 0)

                    if self.prototypes_per_class == 1:
                        # If only one prototype we init it to mean
                        self.w_[pos:pos + nb_prot] = mean
                    else:
                        # else we add some random noise to distribute them
                        self.w_[pos:pos + nb_prot] = mean + (
                            random_state.rand(nb_prot, nb_features) * 2 - 1)

                    if math.isnan(self.w_[pos, 0]):
                        raise ValueError(
                            'Prototype on position {} for class {} is NaN.'
                            .format(pos, act_class))
                    self.protos_initialized[act_class_idx] = 1
                    self.c_w_[pos:pos + nb_prot] = act_class

            pos += nb_prot
    else:
        # Last column of initial_prototypes holds the prototype labels
        x = validation.check_array(self.initial_prototypes)
        self.w_ = x[:, :-1]
        self.c_w_ = x[:, -1]
        if self.w_.shape != (np.sum(nb_ppc), nb_features):
            raise ValueError("the initial prototypes have wrong shape\n"
                             "found=(%d,%d)\n"
                             "expected=(%d,%d)" % (
                                 self.w_.shape[0], self.w_.shape[1],
                                 nb_ppc.sum(), nb_features))
        if set(self.c_w_) != set(self.classes_):
            raise ValueError(
                "prototype labels and test data classes do not match\n"
                "classes={}\n"
                "prototype labels={}\n".format(self.classes_, self.c_w_))

    if self.initial_fit:
        if self.gradient_descent == 'adadelta':
            self.squared_mean_gradient = np.zeros_like(self.w_)
            self.squared_mean_step = np.zeros_like(self.w_)
        self.initial_fit = False

    return train_set, train_lab
def make_forecasting_problem(n_timepoints=50, random_state=None):
    """Return a random univariate series for forecasting tests.

    Parameters
    ----------
    n_timepoints : int, default=50
        Length of the generated series.
    random_state : None, int or RandomState, default=None
        Passed to ``check_random_state`` to obtain the generator.

    Returns
    -------
    pd.Series
        ``n_timepoints`` values drawn with ``rng.random``, indexed by the
        integers ``0 .. n_timepoints - 1`` (int64 index).
    """
    rng = check_random_state(random_state)
    # ``pd.Int64Index`` no longer exists in pandas >= 2.0; an explicit int64
    # ``pd.Index`` yields the equivalent index on old and new versions.
    index = pd.Index(np.arange(n_timepoints), dtype="int64")
    return pd.Series(rng.random(size=n_timepoints), index=index)
def fit(self, X, y, check_input=True):
    """Build a similarity tree regressor from the training set (X, y).

    The node stores the mean of ``y`` as its prediction and its impurity
    under ``self.criterion``. Unless a stopping rule applies, it searches
    the sampled directions for the split with the lowest impurity and fits
    a child tree on each side.

    Parameters
    ----------
    X : array-like of any type, as long as suitable similarity function
        is provided
        The training input samples.
    y : array-like, shape = [n_samples]
        The training outputs.
    check_input : bool, default=True
        When True, X and y are validated with ``check_X_y`` and
        ``check_array``. Child nodes are fitted with ``check_input=False``.

    Returns
    -------
    self : object

    Raises
    ------
    ValueError
        If the criterion is 'theil' or 'atkinson' and y contains a
        negative value, if ``n_directions`` is not an int, if the criterion
        is unknown, or if an improving split leaves one side empty.
    """
    # Check input
    if check_input:
        # Check that X and y have correct shape
        X, y = check_X_y(X, y)
        # Non-empty 2D array containing only finite values
        X = check_array(X)

    # Check if provided similarity function applies to input
    X = self._validate_X_predict(X, check_input)

    if self.criterion in ('theil', 'atkinson') and not np.all(y >= 0):
        raise ValueError(
            'When using Theil or Atkinson indexes, one need to make sure y has all positive values'
        )

    # Check parameters
    random_state = check_random_state(self.random_state)

    if not isinstance(self.n_directions, int):
        raise ValueError('n_directions parameter must be an int')

    # Reset node state
    self._lhs = None
    self._rhs = None
    self._p = None
    self._q = None
    self._similarities = []
    self._split_point = -np.inf
    self._value = None
    self._is_leaf = False
    self.is_fitted_ = False
    self._impurity = None

    # The node is identified by its object id
    self._node_id = id(self)

    # Value of prediction
    self._value = np.mean(y)

    # Current node's impurity
    if self.criterion == 'variance':
        self._impurity = np.var(y)
    elif self.criterion == 'theil':
        self._impurity = theil(y)
    elif self.criterion == 'atkinson':
        self._impurity = atkinson(y)
    else:
        raise ValueError('Unknown split criterion')

    # Stopping rules, evaluated in order: single sample, pure node, maximum
    # depth reached, too few samples to split.
    if (y.size == 1
            or self._is_pure(y)
            or (self.max_depth is not None
                and self.depth == self.max_depth)
            or len(y) <= self.min_samples_split):
        self._is_leaf = True
        self.is_fitted_ = True
        return self

    # Sample n_direction discriminative directions and find the best one
    best_impurity = np.inf
    best_split_point = None
    best_p = None
    best_q = None
    similarities = []
    for i, j in self._sample_directions(random_state, y,
                                        self.n_directions):
        impurity, split_point, curr_similarities = find_split(
            X, y, X[i], X[j], self.criterion, self.sim_function,
            gamma=self.gamma)

        if impurity < best_impurity:
            best_impurity = impurity
            best_p = X[i]
            best_q = X[j]
            best_split_point = split_point
            similarities = curr_similarities

    # No split found, or the best one does not improve impurity: stop
    # growing the tree here.
    if best_split_point is None or not (
            self._impurity - best_impurity > 0.0):
        self.is_fitted_ = True
        self._is_leaf = True
        return self

    self._split_point = best_split_point
    self._p = best_p
    self._q = best_q
    self._similarities = np.array(similarities, dtype=np.float32)

    e = 0.000000001
    # Left- and right-hand side partitioning. Both comparisons use the
    # tolerance ``e``, so samples within ``e`` of the split point are
    # assigned to both sides.
    lhs_idxs = np.nonzero(
        self._similarities - self._split_point < e)[0]
    rhs_idxs = np.nonzero(
        self._similarities - self._split_point > -e)[0]

    if not (len(lhs_idxs) > 0 and len(rhs_idxs) > 0):
        raise ValueError(
            'Left- and right-hand-side indexes havn\'t been found, '
            'even though the split had been found')

    params = self.get_params()
    params['depth'] += 1
    self._lhs = SimilarityTreeRegressor(**params).fit(
        X[lhs_idxs], y[lhs_idxs], check_input=False)
    self._rhs = SimilarityTreeRegressor(**params).fit(
        X[rhs_idxs], y[rhs_idxs], check_input=False)

    # Internal nodes are fitted too; previously only leaves set this flag.
    self.is_fitted_ = True
    return self
def fit(self, X, y):
    """Fit the dynamic selection model.

    Sets up the pool of classifiers (training a ``BaggingClassifier`` on a
    split of the data when no pool was supplied), encodes the labels,
    stores the dynamic selection dataset (DSEL) and fits the region of
    competence algorithm on it.

    Parameters
    ----------
    X : array of shape = [n_samples, n_features]
        The input data.

    y : array of shape = [n_samples]
        Class labels of each example in X.

    Returns
    -------
    self
    """
    self.random_state_ = check_random_state(self.random_state)

    # X and y must have consistent lengths.
    X, y = check_X_y(X, y)

    if self.pool_classifiers is not None:
        # A pool was supplied: it must already be fitted, and all of the
        # data is used as DSEL.
        self._check_base_classifier_fitted()
        self.pool_classifiers_ = self.pool_classifiers
        dsel_data, dsel_labels = X, y
    else:
        # No pool supplied: hold part of the data out as DSEL and train a
        # bagging ensemble on the remainder.
        if len(X) < 2:
            raise ValueError('More than one sample is needed '
                             'if the pool of classifiers is not informed.')

        train_data, dsel_data, train_labels, dsel_labels = train_test_split(
            X, y, test_size=self.DSEL_perc, random_state=self.random_state_)

        bagging = BaggingClassifier(random_state=self.random_state_)
        self.pool_classifiers_ = bagging
        bagging.fit(train_data, train_labels)

    self.n_classifiers_ = len(self.pool_classifiers_)

    # Validate the input parameters against the fitted pool.
    self._validate_parameters()

    # Label encoding: check the pool's encoder, then fit ours on all labels.
    self.check_label_encoder()
    self._setup_label_encoder(y)
    dsel_labels = self.enc_.transform(dsel_labels)
    self._set_dsel(dsel_data, dsel_labels)

    # Validate k, then build and fit the region of competence estimator.
    self._validate_k()
    self._set_region_of_competence_algorithm()
    self._fit_region_competence(dsel_data, dsel_labels)

    # Validate the instance hardness settings when they are enabled.
    if self.with_IH:
        self._validate_ih()

    return self
def _initialize_components(n_components, input, y=None, init='auto', verbose=False, random_state=None, has_classes=True): """Returns the initial transformation to be used depending on the arguments. Parameters ---------- n_components : int The number of components to take. (Note: it should have been checked before, meaning it should not be None and it should be a value in [1, X.shape[1]]) input : array-like The input samples (can be tuples or regular samples). y : array-like or None The input labels (or not if there are no labels). init : string or numpy array, optional (default='auto') Initialization of the linear transformation. Possible options are 'auto', 'pca', 'lda', 'identity', 'random', and a numpy array of shape (n_features_a, n_features_b). 'auto' Depending on ``n_components``, the most reasonable initialization will be chosen. If ``n_components <= n_classes`` we use 'lda' (see the description of 'lda' init), as it uses labels information. If not, but ``n_components < min(n_features, n_samples)``, we use 'pca', as it projects data onto meaningful directions (those of higher variance). Otherwise, we just use 'identity'. 'pca' ``n_components`` principal components of the inputs passed to :meth:`fit` will be used to initialize the transformation. (See `sklearn.decomposition.PCA`) 'lda' ``min(n_components, n_classes)`` most discriminative components of the inputs passed to :meth:`fit` will be used to initialize the transformation. (If ``n_components > n_classes``, the rest of the components will be zero.) (See `sklearn.discriminant_analysis.LinearDiscriminantAnalysis`). This initialization is possible only if `has_classes == True`. 'identity' The identity matrix. If ``n_components`` is strictly smaller than the dimensionality of the inputs passed to :meth:`fit`, the identity matrix will be truncated to the first ``n_components`` rows. 'random' The initial transformation will be a random array of shape `(n_components, n_features)`. 
Each value is sampled from the standard normal distribution. numpy array n_features_b must match the dimensionality of the inputs passed to :meth:`fit` and n_features_a must be less than or equal to that. If ``n_components`` is not None, n_features_a must match it. verbose : bool Whether to print the details of the initialization or not. random_state : int or `numpy.RandomState` or None, optional (default=None) A pseudo random number generator object or a seed for it if int. If ``init='random'``, ``random_state`` is used to initialize the random transformation. If ``init='pca'``, ``random_state`` is passed as an argument to PCA when initializing the transformation. has_classes : bool (default=True) Whether the labels are in fact classes. If true, this will allow to use the 'lda' initialization. Returns ------- init_components : `numpy.ndarray` The initial transformation to use. """ # if we are doing a regression we cannot use lda: n_features = input.shape[-1] authorized_inits = ['auto', 'pca', 'identity', 'random'] if has_classes: authorized_inits.append('lda') if isinstance(init, np.ndarray): # we copy the array, so that if we update the metric, we don't want to # update the init init = check_array(init, copy=True) # Assert that init.shape[1] = X.shape[1] if init.shape[1] != n_features: raise ValueError( 'The input dimensionality ({}) of the given ' 'linear transformation `init` must match the ' 'dimensionality of the given inputs `X` ({}).'.format( init.shape[1], n_features)) # Assert that init.shape[0] <= init.shape[1] if init.shape[0] > init.shape[1]: raise ValueError( 'The output dimensionality ({}) of the given ' 'linear transformation `init` cannot be ' 'greater than its input dimensionality ({}).'.format( init.shape[0], init.shape[1])) # Assert that self.n_components = init.shape[0] if n_components != init.shape[0]: raise ValueError('The preferred dimensionality of the ' 'projected space `n_components` ({}) does' ' not match the output dimensionality of ' 
'the given linear transformation ' '`init` ({})!'.format(n_components, init.shape[0])) elif init not in authorized_inits: raise ValueError( "`init` must be '{}' " "or a numpy array of shape (n_components, n_features).".format( "', '".join(authorized_inits))) random_state = check_random_state(random_state) if isinstance(init, np.ndarray): return init n_samples = input.shape[0] if init == 'auto': if has_classes: n_classes = len(np.unique(y)) else: n_classes = -1 init = _auto_select_init(has_classes, n_features, n_samples, n_components, n_classes) if init == 'identity': return np.eye(n_components, input.shape[-1]) elif init == 'random': return random_state.randn(n_components, input.shape[-1]) elif init in {'pca', 'lda'}: init_time = time.time() if init == 'pca': pca = PCA(n_components=n_components, random_state=random_state) if verbose: print('Finding principal components... ') sys.stdout.flush() pca.fit(input) transformation = pca.components_ elif init == 'lda': lda = LinearDiscriminantAnalysis(n_components=n_components) if verbose: print('Finding most discriminative components... ') sys.stdout.flush() lda.fit(input, y) transformation = lda.scalings_.T[:n_components] if verbose: print('done in {:5.2f}s'.format(time.time() - init_time)) return transformation
def test_sample_weight_invariance(n_samples=50):
    # Yield one ``check_sample_weight_invariance`` case per metric that
    # accepts sample weights, for regression, binary, multiclass and
    # multilabel-indicator targets in turn.
    random_state = check_random_state(0)

    # regression
    y_true = random_state.random_sample(size=(n_samples, ))
    y_pred = random_state.random_sample(size=(n_samples, ))
    for name, metric in ALL_METRICS.items():
        if (name not in REGRESSION_METRICS or
                name in METRICS_WITHOUT_SAMPLE_WEIGHT):
            continue
        yield (_named_check(check_sample_weight_invariance, name), name,
               metric, y_true, y_pred)

    # binary
    random_state = check_random_state(0)
    y_true = random_state.randint(0, 2, size=(n_samples, ))
    y_pred = random_state.randint(0, 2, size=(n_samples, ))
    y_score = random_state.random_sample(size=(n_samples, ))
    for name, metric in ALL_METRICS.items():
        if (name in REGRESSION_METRICS or
                name in METRICS_WITHOUT_SAMPLE_WEIGHT or
                name in METRIC_UNDEFINED_BINARY):
            continue
        # Thresholded metrics are fed scores, the others hard predictions.
        y_other = y_score if name in THRESHOLDED_METRICS else y_pred
        yield (_named_check(check_sample_weight_invariance, name), name,
               metric, y_true, y_other)

    # multiclass
    random_state = check_random_state(0)
    y_true = random_state.randint(0, 5, size=(n_samples, ))
    y_pred = random_state.randint(0, 5, size=(n_samples, ))
    y_score = random_state.random_sample(size=(n_samples, 5))
    for name, metric in ALL_METRICS.items():
        if (name in REGRESSION_METRICS or
                name in METRICS_WITHOUT_SAMPLE_WEIGHT or
                name in METRIC_UNDEFINED_BINARY_MULTICLASS):
            continue
        y_other = y_score if name in THRESHOLDED_METRICS else y_pred
        yield (_named_check(check_sample_weight_invariance, name), name,
               metric, y_true, y_other)

    # multilabel indicator
    _, ya = make_multilabel_classification(n_features=1, n_classes=20,
                                           random_state=0, n_samples=100,
                                           allow_unlabeled=False)
    _, yb = make_multilabel_classification(n_features=1, n_classes=20,
                                           random_state=1, n_samples=100,
                                           allow_unlabeled=False)
    y_true = np.vstack([ya, yb])
    y_pred = np.vstack([ya, ya])
    # Drawn from the generator left over from the multiclass section.
    y_score = random_state.randint(1, 4, size=y_true.shape)

    multilabel_names = (MULTILABELS_METRICS + THRESHOLDED_MULTILABEL_METRICS +
                        MULTIOUTPUT_METRICS)
    for name in multilabel_names:
        if name in METRICS_WITHOUT_SAMPLE_WEIGHT:
            continue
        metric = ALL_METRICS[name]
        y_other = y_score if name in THRESHOLDED_METRICS else y_pred
        yield (_named_check(check_sample_weight_invariance, name), name,
               metric, y_true, y_other)
def fit(self, X, y, check_input=True):
    """Build a similarity tree classifier from the training set (X, y).

    The node stores the class frequencies of ``y`` as its value. Unless a
    stopping rule applies, it searches the sampled directions for the split
    with the lowest gini impurity and fits a child tree on each side.

    Parameters
    ----------
    X : array-like of any type, as long as suitable similarity function
        is provided
        The training input samples.
    y : array-like, shape = [n_samples]
        The labels.
    check_input : bool, default=True
        When True, X and y are validated with ``check_X_y`` and
        ``check_array``. Child nodes are fitted with ``check_input=False``.

    Returns
    -------
    self : object

    Raises
    ------
    ValueError
        If ``n_directions`` is not an int, or if the node's class
        probabilities do not sum to approximately one.
    """
    # Check input
    if check_input:
        # Check that X and y have correct shape
        X, y = check_X_y(X, y)
        # Non-empty 2D array containing only finite values
        X = check_array(X)

    # Check if provided similarity function applies to input
    X = self._validate_X_predict(X, check_input)
    y = np.atleast_1d(y)

    is_classification = is_classifier(self)
    if is_classification:
        check_classification_targets(y)
        y = np.copy(y)

    # Child nodes receive the parent's class list through ``classes`` so
    # that every node uses the same class ordering.
    if self.classes is None:
        self.classes_ = unique_labels(y)
    else:
        self.classes_ = self.classes
    self.n_classes_ = len(self.classes_)

    # Check parameters
    if self.random_state is not None:
        random_state = check_random_state(self.random_state)
    else:
        random_state = np.random.RandomState()

    if not isinstance(self.n_directions, int):
        raise ValueError('n_directions parameter must be an int')

    # Reset node state
    self._lhs = None
    self._rhs = None
    self._p = None
    self._q = None
    self._similarities = []
    self._split_point = -np.inf
    self._value = None
    self._is_leaf = False
    self.is_fitted_ = False
    self.n_ = len(y)

    # The node is identified by its object id
    self._node_id = id(self)

    # Value of prediction: class frequencies, each offset by a tiny constant
    probs = np.ones(shape=self.n_classes_)
    for i, c in enumerate(self.classes_):
        count = np.where(y == c)[0].size
        probs[i] = count / len(y) + 0.000000001

    self._value = probs
    self._class_prediction = self.classes_[np.argmax(self._value)]

    if not 1.0 - 0.00001 <= self._value.sum() <= 1.0 + 0.00001:
        raise ValueError('Wrong node class probability values.')

    # Stopping rules, evaluated in order: pure node, single sample, maximum
    # depth reached. A leaf is a fully fitted node, so the flag is set on
    # this path as well.
    if (self._is_pure(y)
            or len(y) == 1
            or (self.max_depth is not None
                and self.depth == self.max_depth)):
        self._is_leaf = True
        self.is_fitted_ = True
        return self

    # Sample n_direction discriminative split directions and find the best
    # one
    best_impurity = 1.0
    best_split_point = -np.inf
    best_p = None
    best_q = None
    similarities = []
    for i, j in self._sample_directions(random_state, y,
                                        self.n_directions):
        impurity, split_point, curr_similarities = find_split(
            X, y, X[i], X[j], 'gini', self.sim_function, gamma=self.gamma)

        if impurity < best_impurity:
            best_impurity = impurity
            best_split_point = split_point
            best_p = X[i]
            best_q = X[j]
            similarities = curr_similarities

    if best_impurity < 1.0:
        self._split_point = best_split_point
        self._p = best_p
        self._q = best_q
        self._similarities = np.array(similarities)
        self._impurity = best_impurity

        # Left- and right-hand side partitioning
        lhs_idxs = np.nonzero(self._similarities <= self._split_point)[0]
        rhs_idxs = np.nonzero(self._similarities > self._split_point)[0]

        if len(lhs_idxs) > 0 and len(rhs_idxs) > 0:
            params = self.get_params()
            params['depth'] += 1
            params['classes'] = self.classes_
            self._lhs = SimilarityTreeClassifier(**params).fit(
                X[lhs_idxs], y[lhs_idxs], check_input=False)
            self._rhs = SimilarityTreeClassifier(**params).fit(
                X[rhs_idxs], y[rhs_idxs], check_input=False)
        else:
            # Degenerate split: one side is empty
            self._is_leaf = True
    else:
        # No sampled direction improved on the initial impurity. The node
        # has no children, so it has to act as a leaf.
        self._is_leaf = True

    self.is_fitted_ = True
    return self
def fit(self, X, y):
    """Build a forest of trees from the training set (X, y) using random
    intervals and summary features.

    Parameters
    ----------
    X : array-like or sparse matrix of shape = [n_instances, series_length]
        or shape = [n_instances, n_columns]
        The training input samples. If a Pandas data frame is passed it
        must have a single column (i.e. univariate classification). TSF has
        no bespoke method for multivariate classification as yet.
    y : array-like, shape = [n_instances]
        The class labels.

    Returns
    -------
    self : object

    Notes
    -----
    There is no ``unequal`` argument: the flag is derived here and stored as
    ``self.unequal``. It is set to True when input validation raises
    ``ValueError``, in which case X is accessed through ``X.iloc`` and
    intervals are drawn per series instead of from one common length.
    """
    # Only the validation step decides between the two paths. Keeping the
    # try body this small stops an unrelated ValueError raised later from
    # being mistaken for "unequal length input".
    # TODO: catch a more specific error once validation exposes one.
    try:
        X_checked, y_checked = check_X_y(
            X,
            y,
            enforce_univariate=not self.capabilities["multivariate"],
            coerce_to_numpy=True,
        )
        X_checked = X_checked.squeeze(1)
    except ValueError:
        self.unequal = True
    else:
        X, y = X_checked, y_checked
        self.unequal = False

    rng = check_random_state(self.random_state)
    self.n_classes = np.unique(y).shape[0]
    self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]

    if self.unequal:
        n_instances = X.shape[0]
        self.intervals_ = []
        for i in range(self.n_estimators):
            # Cycle over the rows. The row count is the modulus (X.size
            # would count every cell), and the first column is addressed
            # positionally rather than through a label lookup.
            series_length = X.iloc[i % n_instances, 0].size
            n_intervals = int(math.sqrt(series_length))
            if n_intervals == 0:
                n_intervals = 1
            # NOTE(review): this lowers the min_interval hyper-parameter in
            # place, so a short series also affects later estimators; kept
            # as in the original behaviour.
            if series_length < self.min_interval:
                self.min_interval = series_length
            self.intervals_.append(
                _get_intervals(n_intervals, self.min_interval,
                               series_length, rng))
    else:
        # Number of instances, length of each series
        n_instances, self.series_length = X.shape
        self.n_intervals = int(math.sqrt(self.series_length))
        if self.n_intervals == 0:
            self.n_intervals = 1
        if self.series_length < self.min_interval:
            self.min_interval = self.series_length
        self.intervals_ = [
            _get_intervals(self.n_intervals, self.min_interval,
                           self.series_length, rng)
            for _ in range(self.n_estimators)
        ]

    # One estimator per interval set, fitted through joblib.
    self.estimators_ = Parallel(n_jobs=self.n_jobs)(
        delayed(_fit_estimator)(
            X, y, self.base_estimator, self.intervals_[i],
            self.random_state, self.unequal)
        for i in range(self.n_estimators))

    self._is_fitted = True
    return self
def fit(self, X, y, check_input=True):
    """Build a forest of trees from the training set (X, y)

    Parameters
    ----------
    X : array-like matrix of shape = [n_samples, n_features]
        The training data samples.
    y : array-like matrix of shape = [n_samples,]
        The training data labels.
    check_input : bool, default=True
        When True, X and y are validated before fitting. The individual
        trees are always fitted with ``check_input=False``.

    Returns
    -------
    self : object.

    Raises
    ------
    ValueError
        If ``n_directions`` is not an int.
    """
    # Check input
    if check_input:
        # Check that X and y have correct shape
        X, y = check_X_y(X, y)
        # Input validation, check it to be a non-empty 2D array containing
        # only finite values
        X = check_array(X)
        # Check if provided similarity function applies to input
        X = self._validate_X_predict(X, check_input)

    y = np.atleast_1d(y)
    if is_classifier(self):
        check_classification_targets(y)
    # np.array copies by default, so the labels are detached from the
    # caller's object with a single call.
    y = np.array(y)

    self.classes_ = unique_labels(y)
    self.n_classes_ = self.classes_.shape[0]
    self.base_estimator_ = SimilarityTreeClassifier

    # Check parameters
    if self.random_state is not None:
        random_state = check_random_state(self.random_state)
    else:
        random_state = np.random.RandomState()

    if not isinstance(self.n_directions, int):
        raise ValueError('n_directions parameter must be an int')

    # Default similarity functions: dot product or rbf kernel
    # NOTE(review): this replaces the string hyper-parameter with the
    # callable on the estimator itself; kept because code outside this
    # method may read self.sim_function after fit - confirm.
    if self.sim_function == 'dot':
        self.sim_function = dot_product
    elif self.sim_function == 'rbf':
        self.sim_function = rbf

    self.oob_score_ = 0.0
    self.estimators_ = []
    n_samples = len(y)
    for _ in range(self.n_estimators):
        # Every tree gets the same hyper-parameters, max_depth included, so
        # the depth limit also applies when bootstrap is enabled.
        tree = SimilarityTreeClassifier(classes=self.classes_,
                                        n_directions=self.n_directions,
                                        sim_function=self.sim_function,
                                        random_state=self.random_state,
                                        max_depth=self.max_depth,
                                        gamma=self.gamma)
        if self.bootstrap:
            # Draw n_samples row indices with replacement for this tree.
            idxs = random_state.choice(np.arange(n_samples), n_samples,
                                       replace=True)
            tree.fit(X[idxs], y[idxs], check_input=False)
        else:
            tree.fit(X, y, check_input=False)
        self.estimators_.append(tree)

        if self.bootstrap and self.oob_score:
            # Score the tree on the rows its bootstrap sample left out.
            idxs_oob = np.setdiff1d(np.arange(y.size), idxs)
            self.oob_score_ += tree.score(X[idxs_oob], y[idxs_oob])

    # Average of the per-tree out-of-bag scores.
    if self.oob_score:
        self.oob_score_ /= self.n_estimators

    assert len(self.estimators_) == self.n_estimators
    self.is_fitted_ = True
    return self