def test_distribution(): rng = check_random_state(12321) # Single variable with 4 values X = rng.randint(0, 4, size=(1000, 1)) y = rng.rand(1000) n_trees = 500 clf = ExtraTreesRegressor(n_estimators=n_trees, random_state=42).fit(X, y) uniques = defaultdict(int) for tree in clf.estimators_: tree = "".join(("%d,%d/" % (f, int(t)) if f >= 0 else "-") for f, t in zip(tree.tree_.feature, tree.tree_.threshold)) uniques[tree] += 1 uniques = sorted([(1. * count / n_trees, tree) for tree, count in uniques.items()]) # On a single variable problem where X_0 has 4 equiprobable values, there # are 5 ways to build a random tree. The more compact (0,1/0,0/--0,2/--) of # them has probability 1/3 while the 4 others have probability 1/6. assert len(uniques) == 5 assert 0.20 > uniques[0][0] # Rough approximation of 1/6. assert 0.20 > uniques[1][0] assert 0.20 > uniques[2][0] assert 0.20 > uniques[3][0] assert uniques[4][0] > 0.3 assert uniques[4][1] == "0,1/0,0/--0,2/--" # Two variables, one with 2 values, one with 3 values X = np.empty((1000, 2)) X[:, 0] = np.random.randint(0, 2, 1000) X[:, 1] = np.random.randint(0, 3, 1000) y = rng.rand(1000) clf = ExtraTreesRegressor(max_features=1, random_state=1).fit(X, y) uniques = defaultdict(int) for tree in clf.estimators_: tree = "".join(("%d,%d/" % (f, int(t)) if f >= 0 else "-") for f, t in zip(tree.tree_.feature, tree.tree_.threshold)) uniques[tree] += 1 uniques = [(count, tree) for tree, count in uniques.items()] assert len(uniques) == 8
def test_parallel_train(): rng = check_random_state(12321) n_samples, n_features = 80, 30 X_train = rng.randn(n_samples, n_features) y_train = rng.randint(0, 2, n_samples) clfs = [ RandomForestClassifier(n_estimators=20, n_jobs=n_jobs, random_state=12345).fit(X_train, y_train) for n_jobs in [1, 2, 3, 8, 16, 32] ] X_test = rng.randn(n_samples, n_features) probas = [clf.predict_proba(X_test) for clf in clfs] for proba1, proba2 in zip(probas, probas[1:]): assert_array_almost_equal(proba1, proba2)
def check_importances(name, criterion, dtype, tolerance): # cast as dype X = X_large.astype(dtype, copy=False) y = y_large.astype(dtype, copy=False) ForestEstimator = FOREST_ESTIMATORS[name] est = ForestEstimator(n_estimators=10, criterion=criterion, random_state=0) est.fit(X, y) importances = est.feature_importances_ # The forest estimator can detect that only the first 3 features of the # dataset are informative: n_important = np.sum(importances > 0.1) assert importances.shape[0] == 10 assert n_important == 3 assert np.all(importances[:3] > 0.1) # Check with parallel importances = est.feature_importances_ est.set_params(n_jobs=2) importances_parallel = est.feature_importances_ assert_array_almost_equal(importances, importances_parallel) # Check with sample weights sample_weight = check_random_state(0).randint(1, 10, len(X)) est = ForestEstimator(n_estimators=10, random_state=0, criterion=criterion) est.fit(X, y, sample_weight=sample_weight) importances = est.feature_importances_ assert np.all(importances >= 0.0) for scale in [0.5, 100]: est = ForestEstimator(n_estimators=10, random_state=0, criterion=criterion) est.fit(X, y, sample_weight=scale * sample_weight) importances_bis = est.feature_importances_ assert np.abs(importances - importances_bis).mean() < tolerance
norm_diff(X - A, norm=2) / X_spectral_norm) f = scalable_frobenius_norm_discrepancy(X, U, s, V) all_frobenius[label].append(f / X_fro_norm) if len(all_time) == 0: raise ValueError("No tests ran. Aborting.") if enable_spectral_norm: title = "normalized spectral norm diff vs running time" scatter_time_vs_s(all_time, all_spectral, datasets, title) title = "normalized Frobenius norm diff vs running time" scatter_time_vs_s(all_time, all_frobenius, datasets, title) if __name__ == '__main__': random_state = check_random_state(1234) power_iter = np.linspace(0, 6, 7, dtype=int) n_comps = 50 for dataset_name in datasets: X = get_data(dataset_name) if X is None: continue print(" >>>>>> Benching mrex and fbpca on %s %d x %d" % (dataset_name, X.shape[0], X.shape[1])) bench_a(X, dataset_name, power_iter, n_oversamples=2, n_comps=np.minimum(n_comps, np.min(X.shape)))
# toy sample X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]] y = [-1, -1, -1, 1, 1, 1] T = [[-1, -1], [2, 2], [3, 2]] true_result = [-1, 1, 1] # Larger classification sample used for testing feature importances X_large, y_large = datasets.make_classification( n_samples=500, n_features=10, n_informative=3, n_redundant=0, n_repeated=0, shuffle=False, random_state=0) # also load the iris dataset # and randomly permute it iris = datasets.load_iris() rng = check_random_state(0) perm = rng.permutation(iris.target.size) iris.data = iris.data[perm] iris.target = iris.target[perm] # also load the boston dataset # and randomly permute it boston = datasets.load_boston() perm = rng.permutation(boston.target.size) boston.data = boston.data[perm] boston.target = boston.target[perm] # also make a hastie_10_2 dataset hastie_X, hastie_y = datasets.make_hastie_10_2(n_samples=20, random_state=1) hastie_X = hastie_X.astype(np.float32)