def test_min_gain_split_non_negative_value(self):
    self.tree_builder = TreeBuilder(split_criterion=GiniCriterion(),
                                    split_chooser=BestSplitChooser(),
                                    feature_selection=AllFeatureSelection(),
                                    min_gain_split=1)
    self.assertEqual(self.tree_builder.min_gain_split, 1)
def test_min_gain_split_exception_negative_value(self):
    with self.assertRaises(ValueError):
        self.tree_builder = TreeBuilder(split_criterion=GiniCriterion(),
                                        split_chooser=BestSplitChooser(),
                                        feature_selection=AllFeatureSelection(),
                                        min_gain_split=-1)
def test_min_samples_split_positive_value_greater_than_one(self):
    self.tree_builder = TreeBuilder(split_criterion=GiniCriterion(),
                                    split_chooser=BestSplitChooser(),
                                    feature_selection=AllFeatureSelection(),
                                    min_samples_split=2)
    self.assertEqual(self.tree_builder.min_samples_split, 2)
def test_min_samples_split_exception_less_than_two_instances(self):
    with self.assertRaises(ValueError):
        self.tree_builder = TreeBuilder(split_criterion=GiniCriterion(),
                                        split_chooser=BestSplitChooser(),
                                        feature_selection=AllFeatureSelection(),
                                        min_samples_split=1)
def test_min_samples_leaf_positive_value(self):
    self.tree_builder = TreeBuilder(split_criterion=GiniCriterion(),
                                    split_chooser=BestSplitChooser(),
                                    feature_selection=AllFeatureSelection(),
                                    min_samples_leaf=1)
    self.assertEqual(self.tree_builder.min_samples_leaf, 1)
def test_min_samples_leaf_exception_none_value(self):
    with self.assertRaises(ValueError):
        self.tree_builder = TreeBuilder(split_criterion=GiniCriterion(),
                                        split_chooser=BestSplitChooser(),
                                        feature_selection=AllFeatureSelection(),
                                        min_samples_leaf=None)
class DecisionTreeClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self,
                 max_depth=None,
                 split_chooser='best',
                 split_criterion='gini',
                 min_samples_leaf=1,
                 min_samples_split=2,
                 feature_selection='all',
                 feature_prob=None,
                 min_gain_split=0):
        """
        Builds a decision tree for a classification problem.

        :param max_depth: <int> or <None> Defines the maximum depth of the tree
        :param split_chooser: <string> The name of the split chooser:
                            "best" for selecting the best possible split
                            "rand" for selecting a random split
        :param split_criterion: <string> The name of the split criterion:
                            "gini" for selecting the Gini criterion
                            "entropy" for selecting the Entropy criterion
        :param min_samples_leaf: <int> Minimum number of instances to place in a leaf
        :param min_samples_split: <int> Minimum number of instances to consider creating a split
        :param feature_selection: <string> The name of the feature selection criterion:
                            "all" for selecting all features as candidate features
                            "log" for selecting log(n)+1 candidate features
                            "prob" for selecting the candidate features according to their probabilities
        :param feature_prob: <list> Feature probabilities
        :param min_gain_split: <float> Minimum gain value to consider splitting a node
        """
        # Classifier parameters
        self._tree = None
        self._n_features = None
        self._n_instances = None
        self._tree_builder = None
        self._encoder = None
        self._n_classes = None

        # Tree parameters
        self._max_depth = None
        self._min_samples_leaf = None
        self._min_samples_split = None
        self._feature_prob = None
        self._min_gain_split = None
        self._split_chooser = None
        self._split_criterion = None
        self._feature_selection = None

        if max_depth is None or max_depth > 0:
            self._max_depth = max_depth
        else:
            raise ValueError('The depth of the tree must be greater than 0.')

        if min_samples_leaf is not None and min_samples_leaf > 0:
            self._min_samples_leaf = min_samples_leaf
        else:
            raise ValueError('The minimum number of instances to place in a leaf must be greater than 0.')

        if min_samples_split is not None and min_samples_split > 1:
            self._min_samples_split = min_samples_split
        else:
            raise ValueError('The minimum number of instances to make a split must be greater than 1.')

        if feature_prob is None or (utils.check_array_sum_one(feature_prob)
                                    and utils.check_positive_array(feature_prob)):
            self._feature_prob = feature_prob
        else:
            raise ValueError('The feature probabilities must be positive values and their sum must be one.')

        if min_gain_split is not None and min_gain_split >= 0:
            self._min_gain_split = min_gain_split
        else:
            raise ValueError('The minimum gain to make a split must be greater than or equal to 0.')

        if split_chooser is not None:
            self._split_chooser = resolve_split_selection(split_chooser)
        else:
            raise ValueError('The split chooser cannot be None.')

        if split_criterion is not None:
            self._split_criterion = resolve_split_criterion(split_criterion)
        else:
            raise ValueError('The split criterion cannot be None.')

        if feature_selection is not None:
            self._feature_selection = resolve_feature_selection(feature_selection)
        else:
            raise ValueError('The feature selection criterion cannot be None.')

    @property
    def max_depth(self):
        return self._max_depth

    @max_depth.setter
    def max_depth(self, max_depth):
        self._max_depth = max_depth

    @property
    def min_samples_leaf(self):
        return self._min_samples_leaf

    @min_samples_leaf.setter
    def min_samples_leaf(self, min_samples_leaf):
        self._min_samples_leaf = min_samples_leaf

    @property
    def min_samples_split(self):
        return self._min_samples_split

    @min_samples_split.setter
    def min_samples_split(self, min_samples_split):
        self._min_samples_split = min_samples_split

    @property
    def feature_prob(self):
        return self._feature_prob

    @feature_prob.setter
    def feature_prob(self, feature_prob):
        self._feature_prob = feature_prob

    @property
    def min_gain_split(self):
        return self._min_gain_split

    @min_gain_split.setter
    def min_gain_split(self, min_gain_split):
        self._min_gain_split = min_gain_split

    @property
    def split_chooser(self):
        return self._split_chooser.name

    @split_chooser.setter
    def split_chooser(self, split_chooser):
        self._split_chooser = split_chooser

    @property
    def split_criterion(self):
        return self._split_criterion.name

    @split_criterion.setter
    def split_criterion(self, split_criterion):
        self._split_criterion = split_criterion

    @property
    def feature_selection(self):
        return self._feature_selection.name

    @feature_selection.setter
    def feature_selection(self, feature_selection):
        self._feature_selection = feature_selection

    def fit(self, X, y):
        """
        Trains the decision tree classifier with (X, y).

        :param X: <numpy ndarray> An array containing the feature vectors
        :param y: <numpy array> An array containing the target features
        :return: self
        """
        X, y = check_X_y(X, y, dtype=None)

        self._encoder = LabelEncoder()
        y = self._encoder.fit_transform(y)

        self._n_instances, self._n_features = X.shape
        self._n_classes = utils.count_classes(y)

        self._tree_builder = TreeBuilder(split_criterion=self._split_criterion,
                                         feature_prob=self._feature_prob,
                                         feature_selection=self._feature_selection,
                                         max_depth=self._max_depth,
                                         min_samples_leaf=self._min_samples_leaf,
                                         min_gain_split=self._min_gain_split,
                                         min_samples_split=self._min_samples_split,
                                         split_chooser=self._split_chooser)
        self._tree = self._tree_builder.build_tree(X, y, self._n_classes)
        return self

    def predict(self, X, check_input=True):
        """
        Predicts the classes for the new instances in X.

        :param X: <numpy ndarray> An array containing the feature vectors
        :param check_input: <bool> Whether the input array must be checked
        :return: <numpy array>
        """
        if check_input:
            X = self._validate_predict(X, check_input=check_input)

        sample_size, features_count = X.shape
        result = np.zeros(sample_size, dtype=int)
        for i in range(sample_size):
            x = X[i]
            result[i] = self._tree.predict(x)
        return self._encoder.inverse_transform(result)

    def predict_proba(self, X, check_input=True):
        """
        Predicts the class distribution probabilities for the new instances in X.

        :param X: <numpy ndarray> An array containing the feature vectors
        :param check_input: <bool> Whether the input array must be checked
        :return: <list>
        """
        if check_input:
            X = self._validate_predict(X, check_input=check_input)

        sample_size, features_count = X.shape
        result = list(range(sample_size))
        for i in range(sample_size):
            x = X[i]
            result[i] = self._tree.predict_proba(x)
        return result

    def _validate_predict(self, X, check_input):
        """
        Validates X whenever one tries to predict or predict_proba.

        :param X: <numpy ndarray> An array containing the feature vectors
        :param check_input: <bool> Whether the input array must be checked
        :return: <numpy ndarray>
        """
        if self._tree is None:
            raise NotFittedError("Estimator not fitted, "
                                 "call `fit` before exploiting the model.")
        if check_input:
            X = check_array(X, dtype=None)

        n_features = X.shape[1]
        if self._n_features != n_features:
            raise ValueError("Number of features of the model must match the input. "
                             "Model n_features is %s and input n_features is %s."
                             % (self._n_features, n_features))
        return X
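# --- Usage sketch (illustrative only, not part of the source) ---
# DecisionTreeClassifier follows the scikit-learn estimator protocol, so it
# can be fitted and queried like any other estimator. The iris dataset and
# the train/test split below are assumptions made purely for this example.
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

tree_clf = DecisionTreeClassifier(max_depth=5, split_criterion='entropy')
tree_clf.fit(X_train, y_train)
labels = tree_clf.predict(X_test)        # class labels, decoded by the LabelEncoder
dists = tree_clf.predict_proba(X_test)   # per-instance class distributions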
def test_max_depth_none_value(self):
    self.tree_builder = TreeBuilder(split_criterion=GiniCriterion(),
                                    split_chooser=BestSplitChooser(),
                                    feature_selection=AllFeatureSelection(),
                                    max_depth=None)
    self.assertIsNone(self.tree_builder.max_depth)
def setUp(self):
    self.tree_builder = TreeBuilder(split_criterion=GiniCriterion(),
                                    split_chooser=BestSplitChooser(),
                                    feature_selection=AllFeatureSelection())
class TreeBuilderTest(TestCase):
    def setUp(self):
        self.tree_builder = TreeBuilder(split_criterion=GiniCriterion(),
                                        split_chooser=BestSplitChooser(),
                                        feature_selection=AllFeatureSelection())

    def tearDown(self):
        pass

    def test_build_tree_only_root(self):
        n_classes = 1
        x = np.array([1, 1]).reshape((2, 1))
        y = np.array([0, 0])
        returned = self.tree_builder.build_tree(x, y, n_classes)
        expected_length = 1
        expected_root_samples = [2]
        self.assertEqual(len(returned.nodes), expected_length)
        self.assertEqual(returned.nodes[returned.last_node_id].samples, expected_root_samples)
        self.assertIsInstance(returned.nodes[returned.last_node_id], DecisionLeaf)

    def test_build_tree(self):
        n_classes = 2
        x = np.array(['A', 'B', 'A', 'B', 'B', 'C', 'A', 'C', 'B']).reshape((3, 3))
        y = np.array([1, 1, 0])
        expected_length = 3
        expected_root_value = 'B'
        expected_root_feature_id = 1
        returned = self.tree_builder.build_tree(x, y, n_classes)
        self.assertEqual(len(returned.nodes), expected_length)
        self.assertEqual(returned.nodes[0].value, expected_root_value)
        self.assertEqual(returned.nodes[0].feature_id, expected_root_feature_id)
        self.assertEqual([returned.nodes[1].result, returned.nodes[2].result], [1, 0])
        self.assertIsInstance(returned.nodes[0], DecisionForkCategorical)

    def test_build_tree_recursive_all_same_class_two_classes(self):
        x = np.array(['A', 'B', 'A', 'B', 'B', 'C', 'A', 'C', 'B']).reshape((3, 3))
        y = np.array([1, 1, 1])
        self.tree_builder._n_classes = 2
        tree = DecisionTree(n_features=3)
        tree.last_node_id = tree.root()
        self.tree_builder._build_tree_recursive(tree, tree.last_node_id, x, y, depth=1)
        expected_length = 1
        expected_root_samples = [0, 3]
        self.assertEqual(len(tree.nodes), expected_length)
        self.assertIsInstance(tree.nodes[tree.last_node_id], DecisionLeaf)
        self.assertEqual(tree.nodes[tree.last_node_id].samples, expected_root_samples)

    def test_build_tree_recursive_min_samples_split(self):
        x = np.array(['A', 'B', 'A', 'B', 'B', 'C', 'A', 'C', 'B']).reshape((3, 3))
        y = np.array([1, 1, 0])
        self.tree_builder._n_classes = 2
        self.tree_builder._min_samples_split = 4
        tree = DecisionTree(n_features=3)
        tree.last_node_id = tree.root()
        self.tree_builder._build_tree_recursive(tree, tree.last_node_id, x, y, depth=1)
        expected_length = 1
        expected_root_samples = [1, 2]
        self.assertEqual(len(tree.nodes), expected_length)
        self.assertIsInstance(tree.nodes[tree.last_node_id], DecisionLeaf)
        self.assertEqual(tree.nodes[tree.last_node_id].samples, expected_root_samples)

    def test_build_tree_recursive_max_depth(self):
        x = np.array(['A', 'B', 'A', 'B', 'B', 'C', 'A', 'C', 'B']).reshape((3, 3))
        y = np.array([1, 1, 0])
        self.tree_builder._n_classes = 2
        self.tree_builder._max_depth = 0
        tree = DecisionTree(n_features=3)
        tree.last_node_id = tree.root()
        self.tree_builder._build_tree_recursive(tree, tree.last_node_id, x, y, depth=1)
        expected_length = 1
        expected_root_samples = [1, 2]
        self.assertEqual(len(tree.nodes), expected_length)
        self.assertIsInstance(tree.nodes[tree.last_node_id], DecisionLeaf)
        self.assertEqual(tree.nodes[tree.last_node_id].samples, expected_root_samples)

    def test_build_tree_recursive(self):
        x = np.array([0, 1, 0, 1, 1, 2, 0, 2, 1]).reshape((3, 3))
        y = np.array([1, 1, 0])
        self.tree_builder._n_classes = 2
        tree = DecisionTree(n_features=3)
        tree.last_node_id = tree.root()
        self.tree_builder._build_tree_recursive(tree, tree.last_node_id, x, y, depth=1)
        expected_length = 3
        expected_root_feature_id = 1
        expected_root_value = 1.5
        self.assertEqual(len(tree.nodes), expected_length)
        self.assertIsInstance(tree.nodes[0], DecisionForkNumerical)
        self.assertEqual(tree.nodes[0].feature_id, expected_root_feature_id)
        self.assertEqual(tree.nodes[0].value, expected_root_value)
        self.assertEqual([tree.nodes[1].result, tree.nodes[2].result], [1, 0])

    def test_find_best_split_categorical(self):
        x = np.array(['A', 'B', 'A', 'B', 'B', 'C', 'A', 'C', 'B']).reshape((3, 3))
        y = np.array([1, 1, 0])
        expected_split_value = 'B'
        expected_split_feature_id = 1
        expected_split_gain = 0.44
        returned_split = self.tree_builder._find_split(x, y, 3)
        self.assertEqual(returned_split.value, expected_split_value)
        self.assertEqual(returned_split.feature_id, expected_split_feature_id)
        self.assertAlmostEqual(returned_split.gain, expected_split_gain, places=2)

    def test_find_best_split_numerical(self):
        x = np.array([0, 1, 0, 1, 1, 2, 0, 2, 1]).reshape((3, 3))
        y = np.array([1, 1, 0])
        expected_split_value = 1.5
        expected_split_feature_id = 1
        expected_split_gain = 0.44
        returned_split = self.tree_builder._find_split(x, y, 3)
        self.assertEqual(returned_split.value, expected_split_value)
        self.assertEqual(returned_split.feature_id, expected_split_feature_id)
        self.assertAlmostEqual(returned_split.gain, expected_split_gain, places=2)

    def test_find_best_split_without_examples(self):
        x = np.array([]).reshape((0, 0))
        y = np.array([])
        expected_split = None
        returned_split = self.tree_builder._find_split(x, y, 0)
        self.assertEqual(returned_split, expected_split)
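# --- Test-running sketch (illustrative only, not part of the source) ---
# A minimal way to run the TreeBuilderTest suite programmatically; the usual
# alternative is `python -m unittest <module>`, where the module name is not
# shown in the source and is therefore left unspecified here.
import unittest

suite = unittest.defaultTestLoader.loadTestsFromTestCase(TreeBuilderTest)
unittest.TextTestRunner(verbosity=2).run(suite)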
class ProactiveForestClassifier(DecisionForestClassifier):
    def __init__(self,
                 n_estimators=100,
                 bootstrap=True,
                 max_depth=None,
                 split_chooser='best',
                 split_criterion='gini',
                 min_samples_leaf=1,
                 feature_selection='log',
                 feature_prob=None,
                 min_gain_split=0,
                 min_samples_split=2,
                 alpha=0.1):
        """
        Builds a proactive forest for a classification problem.

        :param n_estimators: <int> Number of trees in the forest
        :param bootstrap: <bool> Whether to use bagging or not
        :param max_depth: <int> or <None> Defines the maximum depth of the tree
        :param split_chooser: <string> The name of the split chooser:
                            "best" for selecting the best possible split
                            "rand" for selecting a random split
        :param split_criterion: <string> The name of the split criterion:
                            "gini" for selecting the Gini criterion
                            "entropy" for selecting the Entropy criterion
        :param min_samples_leaf: <int> Minimum number of instances to place in a leaf
        :param feature_selection: <string> The name of the feature selection criterion:
                            "all" for selecting all features as candidate features
                            "log" for selecting log(n)+1 candidate features
                            "prob" for selecting the candidate features according to their probabilities
        :param feature_prob: <list> Feature probabilities
        :param min_gain_split: <float> Minimum gain value to consider splitting a node
        :param min_samples_split: <int> Minimum number of instances to consider creating a split
        :param alpha: <float> Diversity rate. It can take values from (0, 1]
        """
        if 0 < alpha <= 1:
            self.alpha = alpha
        else:
            raise ValueError("The diversity rate can only take values from (0, 1].")

        super().__init__(n_estimators=n_estimators,
                         bootstrap=bootstrap,
                         max_depth=max_depth,
                         split_chooser=split_chooser,
                         split_criterion=split_criterion,
                         min_samples_leaf=min_samples_leaf,
                         feature_selection=feature_selection,
                         feature_prob=feature_prob,
                         min_gain_split=min_gain_split,
                         min_samples_split=min_samples_split)

    def fit(self, X, y):
        """
        Trains the decision forest classifier with (X, y).

        :param X: <numpy ndarray> An array containing the feature vectors
        :param y: <numpy array> An array containing the target features
        :return: self
        """
        X, y = check_X_y(X, y, dtype=None)

        self._encoder = LabelEncoder()
        y = self._encoder.fit_transform(y)

        self._n_instances, self._n_features = X.shape
        self._n_classes = utils.count_classes(y)
        self._trees = []

        if self._bootstrap:
            set_generator = BaggingSet(self._n_instances)
        else:
            set_generator = SimpleSet(self._n_instances)

        ledger = FIProbabilityLedger(probabilities=self._feature_prob,
                                     n_features=self._n_features,
                                     alpha=self.alpha)

        self._tree_builder = TreeBuilder(split_criterion=self._split_criterion,
                                         feature_prob=ledger.probabilities,
                                         feature_selection=self._feature_selection,
                                         max_depth=self._max_depth,
                                         min_samples_leaf=self._min_samples_leaf,
                                         min_gain_split=self._min_gain_split,
                                         min_samples_split=self._min_samples_split,
                                         split_chooser=self._split_chooser)

        for i in range(1, self._n_estimators + 1):
            ids = set_generator.training_ids()
            X_new = X[ids]
            y_new = y[ids]

            new_tree = self._tree_builder.build_tree(X_new, y_new, self._n_classes)

            if self._bootstrap:
                validation_ids = set_generator.oob_ids()
                if validation_ids:
                    # Weight each tree by its accuracy on the out-of-bag instances.
                    new_tree.weight = accuracy_score(y[validation_ids],
                                                     self._predict_on_tree(X[validation_ids], new_tree))

            self._trees.append(new_tree)
            set_generator.clear()

            # Penalize the features used by this tree so that later trees are
            # steered toward different features; the penalty grows with the
            # fraction of the ensemble built so far.
            rate = i / self._n_estimators
            ledger.update_probabilities(new_tree, rate=rate)
            self._tree_builder.feature_prob = ledger.probabilities

        return self
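# --- Usage sketch (illustrative only, not part of the source) ---
# ProactiveForestClassifier adds only the `alpha` diversity rate on top of
# DecisionForestClassifier: after each tree is built, the FIProbabilityLedger
# lowers the selection probabilities of the features that tree relied on, so
# later trees are pushed toward other features. The wine dataset below is an
# assumption made for this example.
from sklearn.datasets import load_wine

X, y = load_wine(return_X_y=True)
pf = ProactiveForestClassifier(n_estimators=50, alpha=0.3)
pf.fit(X, y)
print(pf.trees_mean_weight())                        # mean OOB accuracy of the trees
print(pf.diversity_measure(X, y, diversity='pcd'))   # percentage of correct diversity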
class DecisionForestClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self,
                 n_estimators=100,
                 bootstrap=True,
                 max_depth=None,
                 split_chooser='best',
                 split_criterion='gini',
                 min_samples_leaf=1,
                 feature_selection='log',
                 feature_prob=None,
                 min_gain_split=0,
                 min_samples_split=2):
        """
        Builds a decision forest for a classification problem.

        :param n_estimators: <int> Number of trees in the forest
        :param bootstrap: <bool> Whether to use bagging or not
        :param max_depth: <int> or <None> Defines the maximum depth of the tree
        :param split_chooser: <string> The name of the split chooser:
                            "best" for selecting the best possible split
                            "rand" for selecting a random split
        :param split_criterion: <string> The name of the split criterion:
                            "gini" for selecting the Gini criterion
                            "entropy" for selecting the Entropy criterion
        :param min_samples_leaf: <int> Minimum number of instances to place in a leaf
        :param feature_selection: <string> The name of the feature selection criterion:
                            "all" for selecting all features as candidate features
                            "log" for selecting log(n)+1 candidate features
                            "prob" for selecting the candidate features according to their probabilities
        :param feature_prob: <list> Feature probabilities
        :param min_gain_split: <float> Minimum gain value to consider splitting a node
        :param min_samples_split: <int> Minimum number of instances to consider creating a split
        """
        self._trees = None
        self._n_features = None
        self._n_instances = None
        self._tree_builder = None
        self._n_classes = None
        self._encoder = None

        # Ensemble parameters
        self._n_estimators = None
        self._bootstrap = None

        # Tree parameters
        self._max_depth = None
        self._min_samples_leaf = None
        self._min_samples_split = None
        self._feature_prob = None
        self._min_gain_split = None
        self._split_chooser = None
        self._split_criterion = None
        self._feature_selection = None

        # n_estimators must be a positive int; a None value would break the
        # training loop in fit().
        if n_estimators is not None and n_estimators > 0:
            self._n_estimators = n_estimators
        else:
            raise ValueError('The number of trees must be greater than 0.')

        if bootstrap is not None:
            self._bootstrap = bootstrap
        else:
            raise ValueError('The value of bootstrap cannot be None.')

        if max_depth is None or max_depth > 0:
            self._max_depth = max_depth
        else:
            raise ValueError('The depth of the tree must be greater than 0.')

        if min_samples_leaf is not None and min_samples_leaf > 0:
            self._min_samples_leaf = min_samples_leaf
        else:
            raise ValueError('The minimum number of instances to place in a leaf must be greater than 0.')

        if min_samples_split is not None and min_samples_split > 1:
            self._min_samples_split = min_samples_split
        else:
            raise ValueError('The minimum number of instances to make a split must be greater than 1.')

        if feature_prob is None or (utils.check_array_sum_one(feature_prob)
                                    and utils.check_positive_array(feature_prob)):
            self._feature_prob = feature_prob
        else:
            raise ValueError('The feature probabilities must be positive values and their sum must be one.')

        if min_gain_split is not None and min_gain_split >= 0:
            self._min_gain_split = min_gain_split
        else:
            raise ValueError('The minimum gain to make a split must be greater than or equal to 0.')

        if split_chooser is not None:
            self._split_chooser = resolve_split_selection(split_chooser)
        else:
            raise ValueError('The split chooser cannot be None.')

        if split_criterion is not None:
            self._split_criterion = resolve_split_criterion(split_criterion)
        else:
            raise ValueError('The split criterion cannot be None.')

        if feature_selection is not None:
            self._feature_selection = resolve_feature_selection(feature_selection)
        else:
            raise ValueError('The feature selection criterion cannot be None.')

    @property
    def n_estimators(self):
        return self._n_estimators

    @n_estimators.setter
    def n_estimators(self, n_estimators):
        self._n_estimators = n_estimators

    @property
    def bootstrap(self):
        return self._bootstrap

    @bootstrap.setter
    def bootstrap(self, bootstrap):
        self._bootstrap = bootstrap

    @property
    def max_depth(self):
        return self._max_depth

    @max_depth.setter
    def max_depth(self, max_depth):
        self._max_depth = max_depth

    @property
    def min_samples_leaf(self):
        return self._min_samples_leaf

    @min_samples_leaf.setter
    def min_samples_leaf(self, min_samples_leaf):
        self._min_samples_leaf = min_samples_leaf

    @property
    def min_samples_split(self):
        return self._min_samples_split

    @min_samples_split.setter
    def min_samples_split(self, min_samples_split):
        self._min_samples_split = min_samples_split

    @property
    def feature_prob(self):
        return self._feature_prob

    @feature_prob.setter
    def feature_prob(self, feature_prob):
        self._feature_prob = feature_prob

    @property
    def min_gain_split(self):
        return self._min_gain_split

    @min_gain_split.setter
    def min_gain_split(self, min_gain_split):
        self._min_gain_split = min_gain_split

    @property
    def split_chooser(self):
        return self._split_chooser.name

    @split_chooser.setter
    def split_chooser(self, split_chooser):
        self._split_chooser = split_chooser

    @property
    def split_criterion(self):
        return self._split_criterion.name

    @split_criterion.setter
    def split_criterion(self, split_criterion):
        self._split_criterion = split_criterion

    @property
    def feature_selection(self):
        return self._feature_selection.name

    @feature_selection.setter
    def feature_selection(self, feature_selection):
        self._feature_selection = feature_selection

    def fit(self, X, y):
        """
        Trains the decision forest classifier with (X, y).

        :param X: <numpy ndarray> An array containing the feature vectors
        :param y: <numpy array> An array containing the target features
        :return: self
        """
        X, y = check_X_y(X, y, dtype=None)

        self._encoder = LabelEncoder()
        y = self._encoder.fit_transform(y)

        self._n_instances, self._n_features = X.shape
        self._n_classes = utils.count_classes(y)
        self._trees = []

        if self._bootstrap:
            set_generator = BaggingSet(self._n_instances)
        else:
            set_generator = SimpleSet(self._n_instances)

        self._tree_builder = TreeBuilder(split_criterion=self._split_criterion,
                                         feature_prob=self._feature_prob,
                                         feature_selection=self._feature_selection,
                                         max_depth=self._max_depth,
                                         min_samples_leaf=self._min_samples_leaf,
                                         min_gain_split=self._min_gain_split,
                                         min_samples_split=self._min_samples_split,
                                         split_chooser=self._split_chooser)

        for _ in range(self._n_estimators):
            ids = set_generator.training_ids()
            X_new = X[ids]
            y_new = y[ids]

            new_tree = self._tree_builder.build_tree(X_new, y_new, self._n_classes)

            if self._bootstrap:
                validation_ids = set_generator.oob_ids()
                if validation_ids:
                    # Weight each tree by its accuracy on the out-of-bag instances.
                    new_tree.weight = accuracy_score(y[validation_ids],
                                                     self._predict_on_tree(X[validation_ids], new_tree))

            self._trees.append(new_tree)
            set_generator.clear()

        return self

    def predict(self, X, check_input=True):
        """
        Predicts the classes for the new instances in X.

        :param X: <numpy ndarray> An array containing the feature vectors
        :param check_input: <bool> Whether the input array must be checked
        :return: <numpy array>
        """
        if check_input:
            X = self._validate(X, check_input=check_input)

        voter = PerformanceWeightingVoter(self._trees, self._n_classes)
        sample_size, features_count = X.shape
        result = np.zeros(sample_size, dtype=int)
        for i in range(sample_size):
            x = X[i]
            result[i] = voter.predict(x)
        return self._encoder.inverse_transform(result)

    def predict_proba(self, X, check_input=True):
        """
        Predicts the class distribution probabilities for the new instances in X.

        :param X: <numpy ndarray> An array containing the feature vectors
        :param check_input: <bool> Whether the input array must be checked
        :return: <list>
        """
        if check_input:
            X = self._validate(X, check_input=check_input)

        voter = PerformanceWeightingVoter(self._trees, self._n_classes)
        sample_size, features_count = X.shape
        result = list(range(sample_size))
        for i in range(sample_size):
            x = X[i]
            result[i] = voter.predict_proba(x)
        return result

    def feature_importances(self):
        """
        Calculates the feature importances according to Breiman (2001).

        :return: <numpy array>
        """
        importances = np.zeros(self._n_features)
        for tree in self._trees:
            importances += tree.feature_importances()
        importances /= len(self._trees)
        return importances

    def trees_mean_weight(self):
        """
        Calculates the mean weight of the trees in the forest.

        :return: <float>
        """
        weights = [tree.weight for tree in self._trees]
        mean_weight = np.mean(weights)
        return mean_weight

    def diversity_measure(self, X, y, diversity='pcd'):
        """
        Calculates the diversity measure for the forest.

        :param X: <numpy ndarray> An array containing the feature vectors
        :param y: <numpy array> An array containing the target features
        :param diversity: <string> The type of diversity to be calculated:
                            "pcd" for Percentage of Correct Diversity
                            "qstat" for QStatistic Diversity
        :return: <float>
        """
        X, y = check_X_y(X, y, dtype=None)
        y = self._encoder.transform(y)

        if diversity == 'pcd':
            metric = PercentageCorrectDiversity()
        elif diversity == 'qstat':
            metric = QStatisticDiversity()
        else:
            raise ValueError("It was not possible to recognize the diversity measure.")

        forest_diversity = metric.get_measure(self._trees, X, y)
        return forest_diversity

    def _validate(self, X, check_input):
        """
        Validates X whenever one tries to predict or predict_proba.

        :param X: <numpy ndarray> An array containing the feature vectors
        :param check_input: <bool> Whether the input array must be checked
        :return: <numpy ndarray>
        """
        if self._trees is None:
            raise NotFittedError("Estimator not fitted, "
                                 "call `fit` before exploiting the model.")
        if check_input:
            X = check_array(X, dtype=None)

        n_features = X.shape[1]
        if self._n_features != n_features:
            raise ValueError("Number of features of the model must match the input. "
                             "Model n_features is %s and input n_features is %s."
                             % (self._n_features, n_features))
        return X

    def _predict_on_tree(self, X, tree, check_input=True):
        """
        Predicts the classes for the new instances in X using a single tree.

        :param X: <numpy ndarray> An array containing the feature vectors
        :param tree: <DecisionTree> The tree in which to predict
        :param check_input: <bool> Whether the input array must be checked
        :return: <numpy array>
        """
        if check_input:
            X = self._validate(X, check_input=check_input)

        sample_size, features_count = X.shape
        result = np.zeros(sample_size, dtype=int)
        for i in range(sample_size):
            x = X[i]
            result[i] = tree.predict(x)
        return result
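# --- Usage sketch (illustrative only, not part of the source) ---
# DecisionForestClassifier trains n_estimators trees (on bootstrap samples
# when bootstrap=True) and aggregates them with a performance-weighted vote.
# The breast-cancer dataset below is an assumption made for this example.
from sklearn.datasets import load_breast_cancer

X, y = load_breast_cancer(return_X_y=True)
forest = DecisionForestClassifier(n_estimators=20, feature_selection='log')
forest.fit(X, y)
print(forest.feature_importances())                       # Breiman-style mean importances
print(forest.diversity_measure(X, y, diversity='qstat'))  # Q-statistic diversity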
def test_split_chooser_admissible_value(self):
    self.tree_builder = TreeBuilder(split_criterion=GiniCriterion(),
                                    split_chooser=BestSplitChooser(),
                                    feature_selection=AllFeatureSelection())
    self.assertIsInstance(self.tree_builder.split_chooser, BestSplitChooser)
def test_split_criterion_exception_none_value(self):
    with self.assertRaises(ValueError):
        self.tree_builder = TreeBuilder(split_criterion=None,
                                        split_chooser=BestSplitChooser(),
                                        feature_selection=AllFeatureSelection())
def test_max_depth_positive_value(self):
    self.tree_builder = TreeBuilder(split_criterion=GiniCriterion(),
                                    split_chooser=BestSplitChooser(),
                                    feature_selection=AllFeatureSelection(),
                                    max_depth=1)
    self.assertEqual(self.tree_builder.max_depth, 1)