def _partition(self, X, y, variables, n_samples, depth=0):
    """Recursively grow one node of a randomized ID3 tree.

    Parameters
    ----------
    X : array of shape (n_node, n_features)
        Categorical input samples reaching this node.
    y : array of shape (n_node,)
        Integer class labels (0 .. self.n_classes_ - 1) for those samples.
    variables : list of int
        Feature indices not yet used on the path from the root; the chosen
        split variable is popped before recursing so it is not reused deeper.
    n_samples : int
        Total number of samples at the root, used for the P(B=b) weight.
    depth : int
        Current depth, checked against ``self.max_depth``.

    Returns
    -------
    Leaf: ``(class_probabilities, n_node)``.
    Internal node: ``(split_feature, weighted_info_gain, children_subtrees)``.
    """
    rng = self.random_state_

    # Leaf: no candidate variables left, or maximum depth reached.
    if len(variables) == 0 or (self.max_depth is not None
                               and depth >= self.max_depth):
        values = 1. * np.bincount(y, minlength=self.n_classes_) / len(y)
        return (values, len(y))

    # Internal node: split on the best of k randomly drawn candidates.
    else:
        variables = copy.copy(variables)  # private copy; winner popped below
        n_variables = len(variables)
        n_node = len(X)

        best = None
        best_score = -np.inf
        best_children = None

        # Draw min(k, n_variables) candidate variables without replacement.
        features = (rng.permutation(n_variables))[:min(self.k, n_variables)]

        # entropy(y) is identical for every candidate split: hoist it out
        # of the loop instead of recomputing it per feature.
        entropy_node = entropy(y)

        for i in features:
            X_i = variables[i]
            children = []

            for xi in self.values_[X_i]:
                mask_xi = X[:, X_i] == xi
                # Count once (the original evaluated sum(mask_xi) three
                # times per category value, an O(n) pass each time).
                n_xi = np.count_nonzero(mask_xi)

                if n_xi > 0:
                    children.append((X[mask_xi], y[mask_xi], n_xi))

            # Importance-style score: P(B=b) * information gain of the split.
            score = ((1. * n_node / n_samples)  # P(B=b)
                     * (entropy_node
                        - sum([1. * entropy(c_y) * c_n / n_node
                               for _, c_y, c_n in children])))

            if score > best_score:
                best = i
                best_score = score
                best_children = children

        # Remove the winning variable so deeper nodes cannot reuse it.
        X_i = variables.pop(best)

        return (X_i,
                best_score,
                [self._partition(c_X, c_y, variables, n_samples,
                                 depth=depth + 1)
                 for c_X, c_y, _ in best_children])
models = [("TRT", partial(RandomizedID3Ensemble, base_estimator=RandomizedID3Classifier(k=1))), ("ETs K=1", partial(ExtraTreesClassifier, max_features=1, criterion="entropy")), ("ETs K=3", partial(ExtraTreesClassifier, max_features=3, criterion="entropy")), ("ETs K=5", partial(ExtraTreesClassifier, max_features=5, criterion="entropy")), ("RF K=1", partial(RandomForestClassifier, max_features=1, bootstrap=True, criterion="entropy")), ("RF K=3", partial(RandomForestClassifier, max_features=3, bootstrap=True, criterion="entropy")), ("RF K=5", partial(RandomForestClassifier, max_features=5, bootstrap=True, criterion="entropy")),] n_repeat = 5 r = {} for i in range(n_repeat): print "Iteration", i X, y = generate_strobl_null(n_samples=120) print entropy(y) for name, cls in models: f = feature_importances(X, y, cls=cls, n_trees=500) if i == 0: r[name] = np.array(f) else: r[name] += np.array(f) print name, np.sum(f) for name in r: r[name] /= n_repeat # Convert to pandas and plot
# NOTE(review): the line below duplicates the tail of the experiment script
# above verbatim and begins mid-expression (inside a keyword argument of a
# list element), so it is syntactically incomplete on its own — presumably a
# chunking/copy-paste artifact. Confirm against the full file and remove it;
# left untouched here.
criterion="entropy")), ("RF K=5", partial(RandomForestClassifier, max_features=5, bootstrap=True, criterion="entropy")), ] n_repeat = 5 r = {} for i in range(n_repeat): print "Iteration", i X, y = generate_strobl_null(n_samples=120) print entropy(y) for name, cls in models: f = feature_importances(X, y, cls=cls, n_trees=500) if i == 0: r[name] = np.array(f) else: r[name] += np.array(f) print name, np.sum(f) for name in r: r[name] /= n_repeat # Convert to pandas and plot