from time import time

import matplotlib.pyplot as plt

from mrex.datasets import fetch_olivetti_faces
from mrex.ensemble import ExtraTreesClassifier

# Number of cores to use to perform parallel fitting of the forest model
n_jobs = 1

# Load the faces dataset
data = fetch_olivetti_faces()
X, y = data.data, data.target

mask = y < 5  # Limit to 5 classes
X = X[mask]
y = y[mask]

# Build a forest and compute the pixel importances
print("Fitting ExtraTreesClassifier on faces data with %d cores..." % n_jobs)
t0 = time()
forest = ExtraTreesClassifier(n_estimators=1000,
                              max_features=128,
                              n_jobs=n_jobs,
                              random_state=0)

forest.fit(X, y)
print("done in %0.3fs" % (time() - t0))
importances = forest.feature_importances_
importances = importances.reshape(data.images[0].shape)

# Plot pixel importances
plt.matshow(importances, cmap=plt.cm.hot)
plt.title("Pixel importances with forests of trees")
plt.show()
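# The n_jobs parameter above controls how many cores joblib uses to fit the
# trees in parallel. A hypothetical timing comparison (not part of the
# original example) refits the same forest with n_jobs=-1, which means
# "use all available cores":
t0 = time()
ExtraTreesClassifier(n_estimators=1000, max_features=128,
                     n_jobs=-1, random_state=0).fit(X, y)
print("done in %0.3fs with n_jobs=-1" % (time() - t0))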
import numpy as np
import matplotlib.pyplot as plt

from mrex.datasets import load_iris
from mrex.ensemble import (RandomForestClassifier, ExtraTreesClassifier,
                           AdaBoostClassifier)
from mrex.tree import DecisionTreeClassifier

# Parameters
n_classes = 3
n_estimators = 30
cmap = plt.cm.RdYlBu
plot_step = 0.02  # fine step width for decision surface contours
plot_step_coarser = 0.5  # step widths for coarse classifier guesses
RANDOM_SEED = 13  # fix the seed on each iteration

# Load data
iris = load_iris()

plot_idx = 1

models = [DecisionTreeClassifier(max_depth=None),
          RandomForestClassifier(n_estimators=n_estimators),
          ExtraTreesClassifier(n_estimators=n_estimators),
          AdaBoostClassifier(DecisionTreeClassifier(max_depth=3),
                             n_estimators=n_estimators)]

for pair in ([0, 1], [0, 2], [2, 3]):
    for model in models:
        # We only take the two corresponding features
        X = iris.data[:, pair]
        y = iris.target

        # Shuffle
        idx = np.arange(X.shape[0])
        np.random.seed(RANDOM_SEED)
        np.random.shuffle(idx)
        X = X[idx]
        y = y[idx]
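        # plot_step is defined above but unused in this excerpt; in the full
        # decision-surface example it sets the resolution of the prediction
        # grid. A hypothetical continuation (it assumes the model is simply
        # fit on the shuffled X, y):
        model.fit(X, y)

        x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
        y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
        xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
                             np.arange(y_min, y_max, plot_step))
        Z = model.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
        plt.contourf(xx, yy, Z, cmap=cmap)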
from itertools import combinations, product

import numpy as np
from numpy.testing import assert_almost_equal
from scipy.special import comb

from mrex.ensemble import ExtraTreesClassifier


def test_importances_asymptotic():
    # Check whether variable importances of totally randomized trees
    # converge towards their theoretical values (See Louppe et al,
    # Understanding variable importances in forests of randomized trees, 2013).
    def binomial(k, n):
        return 0 if k < 0 or k > n else comb(int(n), int(k), exact=True)

    def entropy(samples):
        n_samples = len(samples)
        entropy = 0.

        for count in np.bincount(samples):
            p = 1. * count / n_samples

            if p > 0:
                entropy -= p * np.log2(p)

        return entropy

    def mdi_importance(X_m, X, y):
        n_samples, n_features = X.shape

        features = list(range(n_features))
        features.pop(X_m)
        values = [np.unique(X[:, i]) for i in range(n_features)]

        imp = 0.

        for k in range(n_features):
            # Weight of each B of size k
            coef = 1. / (binomial(k, n_features) * (n_features - k))

            # For all B of size k
            for B in combinations(features, k):
                # For all values B=b
                for b in product(*[values[B[j]] for j in range(k)]):
                    mask_b = np.ones(n_samples, dtype=bool)

                    for j in range(k):
                        mask_b &= X[:, B[j]] == b[j]

                    X_, y_ = X[mask_b, :], y[mask_b]
                    n_samples_b = len(X_)

                    if n_samples_b > 0:
                        children = []

                        for xi in values[X_m]:
                            mask_xi = X_[:, X_m] == xi
                            children.append(y_[mask_xi])

                        imp += (coef
                                * (1. * n_samples_b / n_samples)  # P(B=b)
                                * (entropy(y_) -
                                   sum([entropy(c) * len(c) / n_samples_b
                                        for c in children])))

        return imp

    data = np.array([[0, 0, 1, 0, 0, 1, 0, 1],
                     [1, 0, 1, 1, 1, 0, 1, 2],
                     [1, 0, 1, 1, 0, 1, 1, 3],
                     [0, 1, 1, 1, 0, 1, 0, 4],
                     [1, 1, 0, 1, 0, 1, 1, 5],
                     [1, 1, 0, 1, 1, 1, 1, 6],
                     [1, 0, 1, 0, 0, 1, 0, 7],
                     [1, 1, 1, 1, 1, 1, 1, 8],
                     [1, 1, 1, 1, 0, 1, 1, 9],
                     [1, 1, 1, 0, 1, 1, 1, 0]])

    X, y = np.array(data[:, :7], dtype=bool), data[:, 7]
    n_features = X.shape[1]

    # Compute true importances
    true_importances = np.zeros(n_features)

    for i in range(n_features):
        true_importances[i] = mdi_importance(i, X, y)

    # Estimate importances with totally randomized trees
    clf = ExtraTreesClassifier(n_estimators=500,
                               max_features=1,
                               criterion="entropy",
                               random_state=0).fit(X, y)

    importances = sum(tree.tree_.compute_feature_importances(normalize=False)
                      for tree in clf.estimators_) / clf.n_estimators

    # Check correctness
    assert_almost_equal(entropy(y), sum(importances))
    assert np.abs(true_importances - importances).mean() < 0.01
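# For reference, mdi_importance above evaluates the closed-form expression
# from Louppe et al. (2013) for the asymptotic MDI importance of a feature
# X_m in totally randomized trees:
#
#   Imp(X_m) = sum_{k=0}^{p-1} 1 / (C(p, k) * (p - k))
#              * sum_{B in P_k(V^{-m})} I(X_m; Y | B)
#
# where p is the number of features, P_k(V^{-m}) is the set of size-k subsets
# B of the other features, and I(X_m; Y | B) is the conditional mutual
# information, expanded in the code as a P(B=b)-weighted sum of entropy
# decreases over all joint values B = b.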
    # Create train-test split (as [Joachims, 2006])
    print("Creating train-test split...")
    n_train = 60000
    X_train = X[:n_train]
    y_train = y[:n_train]
    X_test = X[n_train:]
    y_test = y[n_train:]

    return X_train, X_test, y_train, y_test


ESTIMATORS = {
    "dummy": DummyClassifier(),
    'CART': DecisionTreeClassifier(),
    'ExtraTrees': ExtraTreesClassifier(),
    'RandomForest': RandomForestClassifier(),
    'Nystroem-SVM': make_pipeline(
        Nystroem(gamma=0.015, n_components=1000), LinearSVC(C=100)),
    'SampledRBF-SVM': make_pipeline(
        RBFSampler(gamma=0.015, n_components=1000), LinearSVC(C=100)),
    'LogisticRegression-SAG': LogisticRegression(solver='sag', tol=1e-1,
                                                 C=1e4),
    'LogisticRegression-SAGA': LogisticRegression(solver='saga', tol=1e-1,
                                                  C=1e4),
    'MultilayerPerceptron': MLPClassifier(
        hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4,
        solver='sgd', learning_rate_init=0.2, momentum=0.9, verbose=1,
        tol=1e-4, random_state=1),
    'MLP-adam': MLPClassifier(
        hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4,
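# The split above assumes X and y already hold the 70000 MNIST samples in
# their standard order (60000 train followed by 10000 test). A minimal loader
# sketch, assuming an OpenML fetcher with the same signature as
# scikit-learn's fetch_openml:
from mrex.datasets import fetch_openml

X, y = fetch_openml('mnist_784', version=1, return_X_y=True)
y = y.astype(int)  # labels arrive as strings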
    X_test = X[n_train:]
    y_test = y[n_train:]

    # Standardize first 10 features (the numerical ones);
    # the remaining binary columns are left untouched
    mean = X_train.mean(axis=0)
    std = X_train.std(axis=0)
    mean[10:] = 0.0
    std[10:] = 1.0
    X_train = (X_train - mean) / std
    X_test = (X_test - mean) / std

    return X_train, X_test, y_train, y_test


ESTIMATORS = {
    'GBRT': GradientBoostingClassifier(n_estimators=250),
    'ExtraTrees': ExtraTreesClassifier(n_estimators=20),
    'RandomForest': RandomForestClassifier(n_estimators=20),
    'CART': DecisionTreeClassifier(min_samples_split=5),
    'SGD': SGDClassifier(alpha=0.001),
    'GaussianNB': GaussianNB(),
    'liblinear': LinearSVC(loss="squared_hinge", penalty="l2", C=1000,
                           dual=False, tol=1e-3),
    'SAG': LogisticRegression(solver='sag', max_iter=2, C=1000)
}


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--classifiers',
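# A minimal driver loop for these estimators (a sketch, not the original
# script's full argument handling; it assumes X_train, X_test, y_train,
# y_test come from the loader above): fit each model, time it, and report
# test-set error.
from time import time

for name, estimator in ESTIMATORS.items():
    t0 = time()
    estimator.fit(X_train, y_train)
    train_time = time() - t0

    t0 = time()
    y_pred = estimator.predict(X_test)
    test_time = time() - t0

    error = (y_pred != y_test).mean()
    print("%s: train %.2fs, test %.2fs, error %.4f"
          % (name, train_time, test_time, error))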
import numpy as np
import matplotlib.pyplot as plt

from mrex.datasets import make_classification
from mrex.ensemble import ExtraTreesClassifier

# Build a classification task using 3 informative features
X, y = make_classification(n_samples=1000,
                           n_features=10,
                           n_informative=3,
                           n_redundant=0,
                           n_repeated=0,
                           n_classes=2,
                           random_state=0,
                           shuffle=False)

# Build a forest and compute the feature importances
forest = ExtraTreesClassifier(n_estimators=250,
                              random_state=0)

forest.fit(X, y)
importances = forest.feature_importances_
std = np.std([tree.feature_importances_ for tree in forest.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")

for f in range(X.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

# Plot the feature importances of the forest
plt.figure()
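# The excerpt stops right after plt.figure(); a minimal continuation that
# uses the per-tree standard deviations computed above as error bars:
plt.title("Feature importances")
plt.bar(range(X.shape[1]), importances[indices],
        color="r", yerr=std[indices], align="center")
plt.xticks(range(X.shape[1]), indices)
plt.xlim([-1, X.shape[1]])
plt.show()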
import argparse

from mrex.dummy import DummyClassifier
from mrex.metrics import accuracy_score
from mrex.utils.validation import check_array
from mrex.ensemble import RandomForestClassifier
from mrex.ensemble import ExtraTreesClassifier
from mrex.ensemble import AdaBoostClassifier
from mrex.linear_model import LogisticRegression
from mrex.naive_bayes import MultinomialNB

ESTIMATORS = {
    "dummy": DummyClassifier(),
    "random_forest": RandomForestClassifier(max_features="sqrt",
                                            min_samples_split=10),
    "extra_trees": ExtraTreesClassifier(max_features="sqrt",
                                        min_samples_split=10),
    "logistic_regression": LogisticRegression(),
    "naive_bayes": MultinomialNB(),
    "adaboost": AdaBoostClassifier(n_estimators=10),
}


###############################################################################
# Data

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('-e',
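# MultinomialNB and the "sqrt" max_features settings suggest a sparse
# bag-of-words benchmark. Assuming the target is the vectorized 20 newsgroups
# corpus (an assumption; the loader below mirrors scikit-learn's
# fetch_20newsgroups_vectorized), the data step would look like:
from mrex.datasets import fetch_20newsgroups_vectorized

data_train = fetch_20newsgroups_vectorized(subset="train")
data_test = fetch_20newsgroups_vectorized(subset="test")
X_train, y_train = data_train.data, data_train.target
X_test, y_test = data_test.data, data_test.target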
import matplotlib.pyplot as plt

from mrex.datasets import make_circles
from mrex.decomposition import TruncatedSVD
from mrex.ensemble import RandomTreesEmbedding, ExtraTreesClassifier
from mrex.naive_bayes import BernoulliNB

# make a synthetic dataset
X, y = make_circles(factor=0.5, random_state=0, noise=0.05)

# use RandomTreesEmbedding to transform data
hasher = RandomTreesEmbedding(n_estimators=10, random_state=0, max_depth=3)
X_transformed = hasher.fit_transform(X)

# Visualize result after dimensionality reduction using truncated SVD
svd = TruncatedSVD(n_components=2)
X_reduced = svd.fit_transform(X_transformed)

# Learn a Naive Bayes classifier on the transformed data
nb = BernoulliNB()
nb.fit(X_transformed, y)

# Learn an ExtraTreesClassifier for comparison
trees = ExtraTreesClassifier(max_depth=3, n_estimators=10, random_state=0)
trees.fit(X, y)

# scatter plot of original and reduced data
fig = plt.figure(figsize=(9, 8))
ax = plt.subplot(221)
ax.scatter(X[:, 0], X[:, 1], c=y, s=50, edgecolor='k')
ax.set_title("Original Data (2d)")
ax.set_xticks(())
ax.set_yticks(())

ax = plt.subplot(222)
ax.scatter(X_reduced[:, 0], X_reduced[:, 1], c=y, s=50, edgecolor='k')
ax.set_title("Truncated SVD reduction (2d) of transformed data (%dd)" %
             X_transformed.shape[1])
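# A quick sanity check (not part of the original figure code): compare the
# training accuracy of the two models. Note that new points would first be
# mapped with hasher.transform before calling nb.predict.
print("BernoulliNB on hashed data: %.3f" % nb.score(X_transformed, y))
print("ExtraTrees on raw data:     %.3f" % trees.score(X, y))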