def test_random_hasher():
    """Random forest hashing on the circles dataset.

    The hashed representation should be linearly separable, even after
    projection down to two PCA components.
    """
    hasher = RandomForestEmbedding(n_estimators=30, random_state=0)
    X, y = datasets.make_circles(factor=0.5)
    X_transformed = hasher.fit_transform(X)

    # fit(...) followed by transform(...) must agree with fit_transform(...)
    hasher = RandomForestEmbedding(n_estimators=30, random_state=0)
    assert_array_equal(hasher.fit(X).transform(X).toarray(),
                       X_transformed.toarray())

    # exactly one leaf is active per data point per tree of the forest
    assert_equal(X_transformed.shape[0], X.shape[0])
    assert_array_equal(X_transformed.sum(axis=1), hasher.n_estimators)

    # reduce to two dimensions and verify linear separability there
    pca = RandomizedPCA(n_components=2)
    X_reduced = pca.fit_transform(X_transformed)

    linear_clf = LinearSVC()
    linear_clf.fit(X_reduced, y)
    assert_equal(linear_clf.score(X_reduced, y), 1.)
space with an ExtraTreesClassifier forests learned on the original data. """ import pylab as pl import numpy as np from sklearn.datasets import make_circles from sklearn.ensemble import RandomForestEmbedding, ExtraTreesClassifier from sklearn.decomposition import RandomizedPCA from sklearn.naive_bayes import BernoulliNB # make a synthetic dataset X, y = make_circles(factor=0.5, random_state=0, noise=0.05) # use RandomForestEmbedding to transform data hasher = RandomForestEmbedding(n_estimators=10, random_state=0, max_depth=3) X_transformed = hasher.fit_transform(X) # Visualize result using PCA pca = RandomizedPCA(n_components=2) X_reduced = pca.fit_transform(X_transformed) # Learn a Naive Bayes classifier on the transformed data nb = BernoulliNB() nb.fit(X_transformed, y) # Learn an ExtraTreesClassifier for comparison trees = ExtraTreesClassifier(max_depth=3, n_estimators=10, random_state=0) trees.fit(X, y)