Example no. 1
    def fit(self, X, y):
        """
        Trains the decision tree classifier with (X, y).

        :param X: <numpy ndarray> An array containing the feature vectors
        :param y: <numpy array> An array containing the target values
        :return: self
        """
        X, y = check_X_y(X, y, dtype=None)
        self._encoder = LabelEncoder()
        y = self._encoder.fit_transform(y)
        self._n_instances, self._n_features = X.shape
        self._n_classes = utils.count_classes(y)

        self._tree_builder = TreeBuilder(split_criterion=self._split_criterion,
                                         feature_prob=self._feature_prob,
                                         feature_selection=self._feature_selection,
                                         max_depth=self._max_depth,
                                         min_samples_leaf=self._min_samples_leaf,
                                         min_gain_split=self._min_gain_split,
                                         min_samples_split=self._min_samples_split,
                                         split_chooser=self._split_chooser)
        self._tree = self._tree_builder.build_tree(X, y, self._n_classes)

        return self
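
For context, fit relies on scikit-learn's LabelEncoder so the tree internally works with integer classes 0..K-1 and decodes them back at prediction time. A quick illustration of that encoding step (standard scikit-learn API, nothing specific to this library):

import numpy as np
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
y = encoder.fit_transform(np.array(['cat', 'dog', 'cat', 'bird']))
print(y)                                  # [1 2 1 0] (classes sorted alphabetically)
print(encoder.inverse_transform([0, 1]))  # ['bird' 'cat']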
Example no. 2
    def fit(self, X, y):
        """
        Trains the decision forest classifier with (X, y).

        :param X: <numpy ndarray> An array containing the feature vectors
        :param y: <numpy array> An array containing the target values
        :return: self
        """
        X, y = check_X_y(X, y, dtype=None)
        self._encoder = LabelEncoder()
        y = self._encoder.fit_transform(y)
        self._n_instances, self._n_features = X.shape
        self._n_classes = utils.count_classes(y)
        self._trees = []

        if self._bootstrap:
            set_generator = BaggingSet(self._n_instances)
        else:
            set_generator = SimpleSet(self._n_instances)

        ledger = FIProbabilityLedger(probabilities=self._feature_prob, n_features=self._n_features, alpha=self.alpha)

        self._tree_builder = TreeBuilder(split_criterion=self._split_criterion,
                                         feature_prob=ledger.probabilities,
                                         feature_selection=self._feature_selection,
                                         max_depth=self._max_depth,
                                         min_samples_leaf=self._min_samples_leaf,
                                         min_gain_split=self._min_gain_split,
                                         min_samples_split=self._min_samples_split,
                                         split_chooser=self._split_chooser)

        for i in range(1, self._n_estimators+1):

            ids = set_generator.training_ids()
            X_new = X[ids]
            y_new = y[ids]

            new_tree = self._tree_builder.build_tree(X_new, y_new, self._n_classes)

            if self._bootstrap:
                validation_ids = set_generator.oob_ids()
                if validation_ids:
                    new_tree.weight = accuracy_score(y[validation_ids], self._predict_on_tree(X[validation_ids], new_tree))

            self._trees.append(new_tree)
            set_generator.clear()

            rate = i/self._n_estimators
            ledger.update_probabilities(new_tree, rate=rate)
            self._tree_builder.feature_prob = ledger.probabilities

        return self
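
The loop above refreshes the tree builder's feature probabilities after every tree through FIProbabilityLedger. The ledger's actual update rule is not shown in this source; the sketch below is only a plausible illustration of the idea, down-weighting the features the newest tree already used, scaled by the diversity rate alpha and the build progress rate, then renormalizing, so later trees are nudged toward unexplored features:

import numpy as np

def update_probabilities_sketch(probabilities, used_feature_ids, alpha, rate):
    # Illustration only: not FIProbabilityLedger's actual rule.
    # Penalize features the latest tree used, proportionally to alpha and
    # to the fraction of the ensemble built so far, then renormalize.
    probs = np.asarray(probabilities, dtype=float).copy()
    probs[used_feature_ids] *= (1.0 - alpha * rate)
    return probs / probs.sum()

# Feature 0 was used by the latest tree, halfway through the ensemble:
print(update_probabilities_sketch([0.25, 0.25, 0.25, 0.25],
                                  used_feature_ids=[0], alpha=0.1, rate=0.5))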
Example no. 3
 def test_min_gain_split_non_negative_value(self):
     self.tree_builder = TreeBuilder(split_criterion=GiniCriterion(), split_chooser=BestSplitChooser(),
                                     feature_selection=AllFeatureSelection(), min_gain_split=1)
     self.assertEqual(self.tree_builder.min_gain_split, 1)
Example no. 4
 def test_min_gain_split_exception_negative_value(self):
     with self.assertRaises(ValueError):
         self.tree_builder = TreeBuilder(split_criterion=GiniCriterion(), split_chooser=BestSplitChooser(),
                                         feature_selection=AllFeatureSelection(), min_gain_split=-1)
Example no. 5
 def test_min_samples_split_positive_value_greater_than_one(self):
     self.tree_builder = TreeBuilder(split_criterion=GiniCriterion(), split_chooser=BestSplitChooser(),
                                     feature_selection=AllFeatureSelection(), min_samples_split=2)
     self.assertEqual(self.tree_builder.min_samples_split, 2)
Example no. 6
 def test_min_samples_split_exception_less_than_two_instances(self):
     with self.assertRaises(ValueError):
         self.tree_builder = TreeBuilder(split_criterion=GiniCriterion(), split_chooser=BestSplitChooser(),
                                         feature_selection=AllFeatureSelection(), min_samples_split=1)
Example no. 7
 def test_min_samples_leaf_positive_value(self):
     self.tree_builder = TreeBuilder(split_criterion=GiniCriterion(), split_chooser=BestSplitChooser(),
                                     feature_selection=AllFeatureSelection(), min_samples_leaf=1)
     self.assertEqual(self.tree_builder.min_samples_leaf, 1)
Example no. 8
 def test_min_samples_leaf_exception_none_value(self):
     with self.assertRaises(ValueError):
         self.tree_builder = TreeBuilder(split_criterion=GiniCriterion(), split_chooser=BestSplitChooser(),
                                         feature_selection=AllFeatureSelection(), min_samples_leaf=None)
Example no. 9
class DecisionTreeClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self,
                 max_depth=None,
                 split_chooser='best',
                 split_criterion='gini',
                 min_samples_leaf=1,
                 min_samples_split=2,
                 feature_selection='all',
                 feature_prob=None,
                 min_gain_split=0):
        """
        Builds a decision tree for a classification problem.

        :param max_depth: <int> or <None> Defines the maximum depth of the tree
        :param split_chooser: <string> The name of the split chooser:
                            "best" for selecting the best possible split
                            "rand" for selecting a random split
        :param split_criterion: <string> The name of the split criterion:
                            "gini" for selecting the Gini criterion
                            "entropy" for selecting the Entropy criterion
        :param min_samples_leaf: <int> Minimum number of instances required in a leaf
        :param min_samples_split: <int> Minimum number of instances required to consider splitting a node
        :param feature_selection: <string> The name of the feature selection criteria:
                            "all" for considering all features as split candidates
                            "log" for considering log(n)+1 features as split candidates
                            "prob" for selecting candidate features according to their probabilities
        :param feature_prob: <list> Feature probabilities
        :param min_gain_split: <float> Minimum gain required to consider splitting a node
        """
        # Classifier parameters
        self._tree = None
        self._n_features = None
        self._n_instances = None
        self._tree_builder = None
        self._encoder = None
        self._n_classes = None

        # Tree parameters
        self._max_depth = None
        self._min_samples_leaf = None
        self._min_samples_split = None
        self._feature_prob = None
        self._min_gain_split = None
        self._split_chooser = None
        self._split_criterion = None
        self._feature_selection = None

        if max_depth is None or max_depth > 0:
            self._max_depth = max_depth
        else:
            raise ValueError('The depth of the tree must be greater than 0.')

        if min_samples_leaf is not None and min_samples_leaf > 0:
            self._min_samples_leaf = min_samples_leaf
        else:
            raise ValueError('The minimum number of instances to place in a leaf must be greater than 0.')

        if min_samples_split is not None and min_samples_split > 1:
            self._min_samples_split = min_samples_split
        else:
            raise ValueError('The minimum number of instances to make a split must be greater than 1.')

        if feature_prob is None or (utils.check_array_sum_one(feature_prob) and
                                    utils.check_positive_array(feature_prob)):
            self._feature_prob = feature_prob
        else:
            raise ValueError('The feature probabilities must be positive values that sum to one.')

        if min_gain_split is not None and min_gain_split >= 0:
            self._min_gain_split = min_gain_split
        else:
            raise ValueError('The minimum gain to make a split must be greater than or equal to 0.')

        if split_chooser is not None:
            self._split_chooser = resolve_split_selection(split_chooser)
        else:
            raise ValueError('The split chooser cannot be None.')

        if split_criterion is not None:
            self._split_criterion = resolve_split_criterion(split_criterion)
        else:
            raise ValueError('The split criterion cannot be None.')

        if feature_selection is not None:
            self._feature_selection = resolve_feature_selection(feature_selection)
        else:
            raise ValueError('The feature selection criteria cannot be None.')

    @property
    def max_depth(self):
        return self._max_depth

    @max_depth.setter
    def max_depth(self, max_depth):
        self._max_depth = max_depth

    @property
    def min_samples_leaf(self):
        return self._min_samples_leaf

    @min_samples_leaf.setter
    def min_samples_leaf(self, min_samples_leaf):
        self._min_samples_leaf = min_samples_leaf

    @property
    def min_samples_split(self):
        return self._min_samples_split

    @min_samples_split.setter
    def min_samples_split(self, min_samples_split):
        self._min_samples_split = min_samples_split

    @property
    def feature_prob(self):
        return self._feature_prob

    @feature_prob.setter
    def feature_prob(self, feature_prob):
        self._feature_prob = feature_prob

    @property
    def min_gain_split(self):
        return self._min_gain_split

    @min_gain_split.setter
    def min_gain_split(self, min_gain_split):
        self._min_gain_split = min_gain_split

    @property
    def split_chooser(self):
        return self._split_chooser.name

    @split_chooser.setter
    def split_chooser(self, split_chooser):
        self._split_chooser = split_chooser

    @property
    def split_criterion(self):
        return self._split_criterion.name

    @split_criterion.setter
    def split_criterion(self, split_criterion):
        self._split_criterion = split_criterion

    @property
    def feature_selection(self):
        return self._feature_selection.name

    @feature_selection.setter
    def feature_selection(self, feature_selection):
        self._feature_selection = feature_selection

    def fit(self, X, y):
        """
        Trains the decision tree classifier with (X, y).

        :param X: <numpy ndarray> An array containing the feature vectors
        :param y: <numpy array> An array containing the target values
        :return: self
        """
        X, y = check_X_y(X, y, dtype=None)
        self._encoder = LabelEncoder()
        y = self._encoder.fit_transform(y)
        self._n_instances, self._n_features = X.shape
        self._n_classes = utils.count_classes(y)

        self._tree_builder = TreeBuilder(split_criterion=self._split_criterion,
                                         feature_prob=self._feature_prob,
                                         feature_selection=self._feature_selection,
                                         max_depth=self._max_depth,
                                         min_samples_leaf=self._min_samples_leaf,
                                         min_gain_split=self._min_gain_split,
                                         min_samples_split=self._min_samples_split,
                                         split_chooser=self._split_chooser)
        self._tree = self._tree_builder.build_tree(X, y, self._n_classes)

        return self

    def predict(self, X, check_input=True):
        """
        Predicts the classes for the new instances in X.

        :param X: <numpy ndarray> An array containing the feature vectors
        :param check_input: <bool> Whether the input array must be checked
        :return: <numpy array>
        """
        if check_input:
            X = self._validate_predict(X, check_input=check_input)

        sample_size, features_count = X.shape
        result = np.zeros(sample_size, dtype=int)
        for i in range(sample_size):
            x = X[i]
            result[i] = self._tree.predict(x)
        return self._encoder.inverse_transform(result)

    def predict_proba(self, X, check_input=True):
        """
        Predicts the class distribution probabilities for the new instances in X.

        :param X: <numpy ndarray> An array containing the feature vectors
        :param check_input: <bool> Whether the input array must be checked
        :return: <list> A list with one class-probability distribution per instance
        """
        if check_input:
            X = self._validate_predict(X, check_input=check_input)

        sample_size, features_count = X.shape
        result = [None] * sample_size
        for i in range(sample_size):
            x = X[i]
            result[i] = self._tree.predict_proba(x)
        return result

    def _validate_predict(self, X, check_input):
        """
        Validate X whenever one tries to predict or predict_proba.

        :param X: <numpy ndarray> An array containing the feature vectors
        :param check_input: <bool> Whether the input array must be checked
        :return: <numpy ndarray> The validated X
        """
        if self._tree is None:
            raise NotFittedError("Estimator not fitted, "
                                 "call `fit` before exploiting the model.")

        if check_input:
            X = check_array(X, dtype=None)

        n_features = X.shape[1]
        if self._n_features != n_features:
            raise ValueError("Number of features of the model must "
                             " match the input. Model n_features is %s and "
                             " input n_features is %s "
                             % (self._n_features, n_features))

        return X
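
A short end-to-end sketch of the classifier above on a toy dataset. The import path is a hypothetical placeholder; only the class shown above is assumed:

import numpy as np
from proactive_forest.estimator import DecisionTreeClassifier  # hypothetical path

X = np.array([[0, 1], [1, 0], [1, 1], [0, 0]])
y = np.array(['pos', 'neg', 'pos', 'neg'])

tree = DecisionTreeClassifier(split_criterion='entropy', max_depth=3)
tree.fit(X, y)
print(tree.predict(X))        # labels decoded back to 'pos'/'neg'
print(tree.predict_proba(X))  # one class-probability distribution per instance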
Example no. 10
 def test_max_depth_none_value(self):
     self.tree_builder = TreeBuilder(split_criterion=GiniCriterion(), split_chooser=BestSplitChooser(),
                                     feature_selection=AllFeatureSelection(), max_depth=None)
     self.assertIsNone(self.tree_builder.max_depth)
Example no. 11
 def setUp(self):
     self.tree_builder = TreeBuilder(split_criterion=GiniCriterion(),
                                     split_chooser=BestSplitChooser(),
                                     feature_selection=AllFeatureSelection())
Example no. 12
class TreeBuilderTest(TestCase):
    def setUp(self):
        self.tree_builder = TreeBuilder(split_criterion=GiniCriterion(),
                                        split_chooser=BestSplitChooser(),
                                        feature_selection=AllFeatureSelection())

    def tearDown(self):
        pass

    def test_build_tree_only_root(self):
        n_classes = 1
        x = np.array([1, 1]).reshape((2, 1))
        y = np.array([0, 0])
        returned = self.tree_builder.build_tree(x, y, n_classes)
        expected_length = 1
        expected_root_samples = [2]

        self.assertEqual(len(returned.nodes), expected_length)
        self.assertEqual(returned.nodes[returned.last_node_id].samples, expected_root_samples)
        self.assertIsInstance(returned.nodes[returned.last_node_id], DecisionLeaf)

    def test_build_tree(self):
        n_classes = 2
        x = np.array(['A', 'B', 'A', 'B', 'B', 'C', 'A', 'C', 'B']).reshape((3, 3))
        y = np.array([1, 1, 0])
        expected_length = 3
        expected_root_value = 'B'
        expected_root_feature_id = 1

        returned = self.tree_builder.build_tree(x, y, n_classes)
        self.assertEqual(len(returned.nodes), expected_length)
        self.assertEqual(returned.nodes[0].value, expected_root_value)
        self.assertEqual(returned.nodes[0].feature_id, expected_root_feature_id)
        self.assertEqual([returned.nodes[1].result, returned.nodes[2].result], [1, 0])
        self.assertIsInstance(returned.nodes[0], DecisionForkCategorical)

    def test_build_tree_recursive_all_same_class_two_classes(self):
        x = np.array(['A', 'B', 'A', 'B', 'B', 'C', 'A', 'C', 'B']).reshape((3, 3))
        y = np.array([1, 1, 1])
        self.tree_builder._n_classes = 2
        tree = DecisionTree(n_features=3)
        tree.last_node_id = tree.root()
        self.tree_builder._build_tree_recursive(tree, tree.last_node_id, x, y, depth=1)
        expected_length = 1
        expected_root_samples = [0, 3]

        self.assertEqual(len(tree.nodes), expected_length)
        self.assertIsInstance(tree.nodes[tree.last_node_id], DecisionLeaf)
        self.assertEqual(tree.nodes[tree.last_node_id].samples, expected_root_samples)

    def test_build_tree_recursive_min_samples_split(self):
        x = np.array(['A', 'B', 'A', 'B', 'B', 'C', 'A', 'C', 'B']).reshape((3, 3))
        y = np.array([1, 1, 0])
        self.tree_builder._n_classes = 2
        self.tree_builder._min_samples_split = 4
        tree = DecisionTree(n_features=3)
        tree.last_node_id = tree.root()
        self.tree_builder._build_tree_recursive(tree, tree.last_node_id, x, y, depth=1)
        expected_length = 1
        expected_root_samples = [1, 2]

        self.assertEqual(len(tree.nodes), expected_length)
        self.assertIsInstance(tree.nodes[tree.last_node_id], DecisionLeaf)
        self.assertEqual(tree.nodes[tree.last_node_id].samples, expected_root_samples)

    def test_build_tree_recursive_max_depth(self):
        x = np.array(['A', 'B', 'A', 'B', 'B', 'C', 'A', 'C', 'B']).reshape((3, 3))
        y = np.array([1, 1, 0])
        self.tree_builder._n_classes = 2
        self.tree_builder._max_depth = 0
        tree = DecisionTree(n_features=3)
        tree.last_node_id = tree.root()
        self.tree_builder._build_tree_recursive(tree, tree.last_node_id, x, y, depth=1)
        expected_length = 1
        expected_root_samples = [1, 2]

        self.assertEqual(len(tree.nodes), expected_length)
        self.assertIsInstance(tree.nodes[tree.last_node_id], DecisionLeaf)
        self.assertEqual(tree.nodes[tree.last_node_id].samples, expected_root_samples)

    def test_build_tree_recursive(self):
        x = np.array([0, 1, 0, 1, 1, 2, 0, 2, 1]).reshape((3, 3))
        y = np.array([1, 1, 0])
        self.tree_builder._n_classes = 2
        tree = DecisionTree(n_features=3)
        tree.last_node_id = tree.root()
        self.tree_builder._build_tree_recursive(tree, tree.last_node_id, x, y, depth=1)
        expected_length = 3
        expected_root_feature_id = 1
        expected_root_value = 1.5

        self.assertEqual(len(tree.nodes), expected_length)
        self.assertIsInstance(tree.nodes[0], DecisionForkNumerical)
        self.assertEqual(tree.nodes[0].feature_id, expected_root_feature_id)
        self.assertEqual(tree.nodes[0].value, expected_root_value)
        self.assertEqual([tree.nodes[1].result, tree.nodes[2].result], [1, 0])

    def test_find_best_split_categorical(self):
        x = np.array(['A', 'B', 'A', 'B', 'B', 'C', 'A', 'C', 'B']).reshape((3, 3))
        y = np.array([1, 1, 0])
        expected_split_value = 'B'
        expected_split_feature_id = 1
        expected_split_gain = 0.44
        returned_split = self.tree_builder._find_split(x, y, 3)

        self.assertEqual(returned_split.value, expected_split_value)
        self.assertEqual(returned_split.feature_id, expected_split_feature_id)
        self.assertAlmostEqual(returned_split.gain, expected_split_gain, places=2)

    def test_find_best_split_numerical(self):
        x = np.array([0, 1, 0, 1, 1, 2, 0, 2, 1]).reshape((3, 3))
        y = np.array([1, 1, 0])
        expected_split_value = 1.5
        expected_split_feature_id = 1
        expected_split_gain = 0.44
        returned_split = self.tree_builder._find_split(x, y, 3)

        self.assertEqual(returned_split.value, expected_split_value)
        self.assertEqual(returned_split.feature_id, expected_split_feature_id)
        self.assertAlmostEqual(returned_split.gain, expected_split_gain, places=2)

    def test_find_best_split_without_examples(self):
        x = np.array([]).reshape((0, 0))
        y = np.array([])
        expected_split = None
        returned_split = self.tree_builder._find_split(x, y, 0)

        self.assertEqual(returned_split, expected_split)
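
The test methods above assume imports that the excerpts omit. A minimal sketch of the test module's header, with hypothetical package paths (the source does not show the actual layout):

import numpy as np
from unittest import TestCase

# Hypothetical paths; adjust to the real package layout.
from proactive_forest.tree_builder import TreeBuilder
from proactive_forest.tree import (DecisionTree, DecisionLeaf,
                                   DecisionForkCategorical, DecisionForkNumerical)
from proactive_forest.criteria import GiniCriterion
from proactive_forest.chooser import BestSplitChooser
from proactive_forest.selection import AllFeatureSelection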
Example no. 13
class ProactiveForestClassifier(DecisionForestClassifier):
    def __init__(self,
                 n_estimators=100,
                 bootstrap=True,
                 max_depth=None,
                 split_chooser='best',
                 split_criterion='gini',
                 min_samples_leaf=1,
                 feature_selection='log',
                 feature_prob=None,
                 min_gain_split=0,
                 min_samples_split=2,
                 alpha=0.1):
        """
        Builds a proactive forest for a classification problem.

        :param n_estimators: <int> Number of trees in the forest
        :param bootstrap: <bool> Whether to use bagging or not
        :param max_depth: <int> or <None> Defines the maximum depth of the tree
        :param split_chooser: <string> The name of the split chooser:
                            "best" for selecting the best possible split
                            "rand" for selecting a random split
        :param split_criterion: <string> The name of the split criterion:
                            "gini" for selecting the Gini criterion
                            "entropy" for selecting the Entropy criterion
        :param min_samples_leaf: <int> Minimum number of instances required in a leaf
        :param feature_selection: <string> The name of the feature selection criteria:
                            "all" for considering all features as split candidates
                            "log" for considering log(n)+1 features as split candidates
                            "prob" for selecting candidate features according to their probabilities
        :param feature_prob: <list> Feature probabilities
        :param min_gain_split: <float> Minimum gain required to consider splitting a node
        :param min_samples_split: <int> Minimum number of instances required to consider splitting a node
        :param alpha: <float> Diversity rate, taking values in (0, 1]
        """
        if 0 < alpha <= 1:
            self.alpha = alpha
        else:
            raise ValueError("The diversity rate can only take values from (0, 1].")
        super().__init__(n_estimators=n_estimators,
                         bootstrap=bootstrap,
                         max_depth=max_depth,
                         split_chooser=split_chooser,
                         split_criterion=split_criterion,
                         min_samples_leaf=min_samples_leaf,
                         feature_selection=feature_selection,
                         feature_prob=feature_prob,
                         min_gain_split=min_gain_split,
                         min_samples_split=min_samples_split
                         )

    def fit(self, X, y):
        """
        Trains the decision forest classifier with (X, y).

        :param X: <numpy ndarray> An array containing the feature vectors
        :param y: <numpy array> An array containing the target values
        :return: self
        """
        X, y = check_X_y(X, y, dtype=None)
        self._encoder = LabelEncoder()
        y = self._encoder.fit_transform(y)
        self._n_instances, self._n_features = X.shape
        self._n_classes = utils.count_classes(y)
        self._trees = []

        if self._bootstrap:
            set_generator = BaggingSet(self._n_instances)
        else:
            set_generator = SimpleSet(self._n_instances)

        ledger = FIProbabilityLedger(probabilities=self._feature_prob, n_features=self._n_features, alpha=self.alpha)

        self._tree_builder = TreeBuilder(split_criterion=self._split_criterion,
                                         feature_prob=ledger.probabilities,
                                         feature_selection=self._feature_selection,
                                         max_depth=self._max_depth,
                                         min_samples_leaf=self._min_samples_leaf,
                                         min_gain_split=self._min_gain_split,
                                         min_samples_split=self._min_samples_split,
                                         split_chooser=self._split_chooser)

        for i in range(1, self._n_estimators+1):

            ids = set_generator.training_ids()
            X_new = X[ids]
            y_new = y[ids]

            new_tree = self._tree_builder.build_tree(X_new, y_new, self._n_classes)

            if self._bootstrap:
                validation_ids = set_generator.oob_ids()
                if validation_ids:
                    new_tree.weight = accuracy_score(y[validation_ids], self._predict_on_tree(X[validation_ids], new_tree))

            self._trees.append(new_tree)
            set_generator.clear()

            rate = i/self._n_estimators
            ledger.update_probabilities(new_tree, rate=rate)
            self._tree_builder.feature_prob = ledger.probabilities

        return self
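
A usage sketch for the proactive variant. Relative to DecisionForestClassifier the only new knob is alpha, the diversity rate; the import path is hypothetical:

import numpy as np
from proactive_forest.estimator import ProactiveForestClassifier  # hypothetical path

rng = np.random.RandomState(0)
X = rng.rand(120, 5)
y = (X[:, 0] + X[:, 1] > 1).astype(int)

pf = ProactiveForestClassifier(n_estimators=20, alpha=0.5)  # stronger diversity push
pf.fit(X, y)
print(pf.predict(X[:5]))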
Example no. 14
class DecisionForestClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self,
                 n_estimators=100,
                 bootstrap=True,
                 max_depth=None,
                 split_chooser='best',
                 split_criterion='gini',
                 min_samples_leaf=1,
                 feature_selection='log',
                 feature_prob=None,
                 min_gain_split=0,
                 min_samples_split=2):
        """
        Builds a decision forest for a classification problem.

        :param n_estimators: <int> Number of trees in the forest
        :param bootstrap: <bool> Whether to use bagging or not
        :param max_depth: <int> or <None> Defines the maximum depth of the tree
        :param split_chooser: <string> The name of the split chooser:
                            "best" for selecting the best possible split
                            "rand" for selecting a random split
        :param split_criterion: <string> The name of the split criterion:
                            "gini" for selecting the Gini criterion
                            "entropy" for selecting the Entropy criterion
        :param min_samples_leaf: <int> Minimum number of instances required in a leaf
        :param feature_selection: <string> The name of the feature selection criteria:
                            "all" for considering all features as split candidates
                            "log" for considering log(n)+1 features as split candidates
                            "prob" for selecting candidate features according to their probabilities
        :param feature_prob: <list> Feature probabilities
        :param min_gain_split: <float> Minimum gain required to consider splitting a node
        :param min_samples_split: <int> Minimum number of instances required to consider splitting a node
        """
        self._trees = None
        self._n_features = None
        self._n_instances = None
        self._tree_builder = None
        self._n_classes = None
        self._encoder = None

        # Ensemble parameters
        self._n_estimators = None
        self._bootstrap = None

        # Tree parameters
        self._max_depth = None
        self._min_samples_leaf = None
        self._min_samples_split = None
        self._feature_prob = None
        self._min_gain_split = None
        self._split_chooser = None
        self._split_criterion = None
        self._feature_selection = None

        if n_estimators is not None and n_estimators > 0:
            self._n_estimators = n_estimators
        else:
            raise ValueError('The number of trees must be greater than 0.')

        if bootstrap is not None:
            self._bootstrap = bootstrap
        else:
            raise ValueError('The value of bootstrap cannot be None.')

        if max_depth is None or max_depth > 0:
            self._max_depth = max_depth
        else:
            raise ValueError('The depth of the tree must be greater than 0.')

        if min_samples_leaf is not None and min_samples_leaf > 0:
            self._min_samples_leaf = min_samples_leaf
        else:
            raise ValueError('The minimum number of instances to place in a leaf must be greater than 0.')

        if min_samples_split is not None and min_samples_split > 1:
            self._min_samples_split = min_samples_split
        else:
            raise ValueError('The minimum number of instances to make a split must be greater than 1.')

        if feature_prob is None or (utils.check_array_sum_one(feature_prob) and
                                    utils.check_positive_array(feature_prob)):
            self._feature_prob = feature_prob
        else:
            raise ValueError('The feature probabilities must be positive values that sum to one.')

        if min_gain_split is not None and min_gain_split >= 0:
            self._min_gain_split = min_gain_split
        else:
            raise ValueError('The minimum gain to make a split must be greater than or equal to 0.')

        if split_chooser is not None:
            self._split_chooser = resolve_split_selection(split_chooser)
        else:
            raise ValueError('The split chooser cannot be None.')

        if split_criterion is not None:
            self._split_criterion = resolve_split_criterion(split_criterion)
        else:
            raise ValueError('The split criterion cannot be None.')

        if feature_selection is not None:
            self._feature_selection = resolve_feature_selection(feature_selection)
        else:
            raise ValueError('The feature selection criteria cannot be None.')

    @property
    def n_estimators(self):
        return self._n_estimators

    @n_estimators.setter
    def n_estimators(self, n_estimators):
        self._n_estimators = n_estimators

    @property
    def bootstrap(self):
        return self._bootstrap

    @bootstrap.setter
    def bootstrap(self, bootstrap):
        self._bootstrap = bootstrap

    @property
    def max_depth(self):
        return self._max_depth

    @max_depth.setter
    def max_depth(self, max_depth):
        self._max_depth = max_depth

    @property
    def min_samples_leaf(self):
        return self._min_samples_leaf

    @min_samples_leaf.setter
    def min_samples_leaf(self, min_samples_leaf):
        self._min_samples_leaf = min_samples_leaf

    @property
    def min_samples_split(self):
        return self._min_samples_split

    @min_samples_split.setter
    def min_samples_split(self, min_samples_split):
        self._min_samples_split = min_samples_split

    @property
    def feature_prob(self):
        return self._feature_prob

    @feature_prob.setter
    def feature_prob(self, feature_prob):
        self._feature_prob = feature_prob

    @property
    def min_gain_split(self):
        return self._min_gain_split

    @min_gain_split.setter
    def min_gain_split(self, min_gain_split):
        self._min_gain_split = min_gain_split

    @property
    def split_chooser(self):
        return self._split_chooser.name

    @split_chooser.setter
    def split_chooser(self, split_chooser):
        self._split_chooser = split_chooser

    @property
    def split_criterion(self):
        return self._split_criterion.name

    @split_criterion.setter
    def split_criterion(self, split_criterion):
        self._split_criterion = split_criterion

    @property
    def feature_selection(self):
        return self._feature_selection.name

    @feature_selection.setter
    def feature_selection(self, feature_selection):
        self._feature_selection = feature_selection

    def fit(self, X, y):
        """
        Trains the decision forest classifier with (X, y).

        :param X: <numpy ndarray> An array containing the feature vectors
        :param y: <numpy array> An array containing the target values
        :return: self
        """
        X, y = check_X_y(X, y, dtype=None)
        self._encoder = LabelEncoder()
        y = self._encoder.fit_transform(y)
        self._n_instances, self._n_features = X.shape
        self._n_classes = utils.count_classes(y)
        self._trees = []

        if self._bootstrap:
            set_generator = BaggingSet(self._n_instances)
        else:
            set_generator = SimpleSet(self._n_instances)

        self._tree_builder = TreeBuilder(split_criterion=self._split_criterion,
                                         feature_prob=self._feature_prob,
                                         feature_selection=self._feature_selection,
                                         max_depth=self._max_depth,
                                         min_samples_leaf=self._min_samples_leaf,
                                         min_gain_split=self._min_gain_split,
                                         min_samples_split=self._min_samples_split,
                                         split_chooser=self._split_chooser)

        for _ in range(self._n_estimators):
            ids = set_generator.training_ids()
            X_new = X[ids]
            y_new = y[ids]

            new_tree = self._tree_builder.build_tree(X_new, y_new, self._n_classes)

            if self._bootstrap:
                validation_ids = set_generator.oob_ids()
                if validation_ids:
                    new_tree.weight = accuracy_score(y[validation_ids], self._predict_on_tree(X[validation_ids], new_tree))

            self._trees.append(new_tree)
            set_generator.clear()

        return self

    def predict(self, X, check_input=True):
        """
        Predicts the classes for the new instances in X.

        :param X: <numpy ndarray> An array containing the feature vectors
        :param check_input: <bool> Whether the input array must be checked
        :return: <numpy array>
        """
        if check_input:
            X = self._validate(X, check_input=check_input)

        voter = PerformanceWeightingVoter(self._trees, self._n_classes)

        sample_size, features_count = X.shape
        result = np.zeros(sample_size, dtype=int)
        for i in range(sample_size):
            x = X[i]
            result[i] = voter.predict(x)
        return self._encoder.inverse_transform(result)

    def predict_proba(self, X, check_input=True):
        """
        Predicts the class distribution probabilities for the new instances in X.

        :param X: <numpy ndarray> An array containing the feature vectors
        :param check_input: <bool> Whether the input array must be checked
        :return: <list> A list with one class-probability distribution per instance
        """
        if check_input:
            X = self._validate(X, check_input=check_input)

        voter = PerformanceWeightingVoter(self._trees, self._n_classes)

        sample_size, features_count = X.shape
        result = [None] * sample_size
        for i in range(sample_size):
            x = X[i]
            result[i] = voter.predict_proba(x)
        return result

    def feature_importances(self):
        """
        Calculates the feature importances according to Breiman 2001.

        :return: <numpy array>
        """
        importances = np.zeros(self._n_features)
        for tree in self._trees:
            importances += tree.feature_importances()
        importances /= len(self._trees)
        return importances

    def trees_mean_weight(self):
        """
        Calculates the mean weight of the trees in the forest.

        :return: <float>
        """
        weights = [tree.weight for tree in self._trees]
        mean_weight = np.mean(weights)
        return mean_weight

    def diversity_measure(self, X, y, diversity='pcd'):
        """
        Calculates the diversity measure for the forest.

        :param X: <numpy ndarray> An array containing the feature vectors
        :param y: <numpy array> An array containing the target values
        :param diversity: <string> The type of diversity to be calculated
                        "pcd" for Percentage of Correct Diversity
                        "qstat" for QStatistic Diversity
        :return: <float>
        """
        X, y = check_X_y(X, y, dtype=None)
        y = self._encoder.transform(y)

        if diversity == 'pcd':
            metric = PercentageCorrectDiversity()
        elif diversity == 'qstat':
            metric = QStatisticDiversity()
        else:
            raise ValueError("It was not possible to recognize the diversity measure.")

        forest_diversity = metric.get_measure(self._trees, X, y)
        return forest_diversity

    def _validate(self, X, check_input):
        """
        Validate X whenever one tries to predict or predict_proba.

        :param X: <numpy ndarray> An array containing the feature vectors
        :param check_input: <bool> Whether the input array must be checked
        :return: <numpy ndarray> The validated X
        """
        if self._trees is None:
            raise NotFittedError("Estimator not fitted, "
                                 "call `fit` before exploiting the model.")

        if check_input:
            X = check_array(X, dtype=None)

        n_features = X.shape[1]
        if self._n_features != n_features:
            raise ValueError("Number of features of the model must "
                             " match the input. Model n_features is %s and "
                             " input n_features is %s "
                             % (self._n_features, n_features))

        return X

    def _predict_on_tree(self, X, tree, check_input=True):
        """
        Predicts the classes for the new instances in X.

        :param X: <numpy ndarray> An array containing the feature vectors
        :param tree: <DecisionTree> The tree in which to predict
        :param check_input: <bool> Whether the input array must be checked
        :return: <numpy array>
        """
        if check_input:
            X = self._validate(X, check_input=check_input)

        sample_size, features_count = X.shape
        result = np.zeros(sample_size, dtype=int)
        for i in range(sample_size):
            x = X[i]
            result[i] = tree.predict(x)
        return result
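
Finally, a sketch that exercises the forest's reporting helpers on a standard dataset; the import path is hypothetical:

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from proactive_forest.estimator import DecisionForestClassifier  # hypothetical path

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

forest = DecisionForestClassifier(n_estimators=25, feature_selection='log')
forest.fit(X_train, y_train)
print(forest.predict(X_test)[:10])
print(forest.feature_importances())   # Breiman-style mean importances
print(forest.trees_mean_weight())     # mean out-of-bag accuracy of the trees
print(forest.diversity_measure(X_test, y_test, diversity='pcd'))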
Example no. 15
 def test_split_chooser_admissible_value(self):
     self.tree_builder = TreeBuilder(split_criterion=GiniCriterion(), split_chooser=BestSplitChooser(),
                                     feature_selection=AllFeatureSelection())
     self.assertIsInstance(self.tree_builder.split_chooser, BestSplitChooser)
Example no. 16
 def test_split_criterion_exception_none_value(self):
     with self.assertRaises(ValueError):
         self.tree_builder = TreeBuilder(split_criterion=None, split_chooser=BestSplitChooser(),
                                         feature_selection=AllFeatureSelection())
Example no. 17
 def test_max_depth_positive_value(self):
     self.tree_builder = TreeBuilder(split_criterion=GiniCriterion(), split_chooser=BestSplitChooser(),
                                     feature_selection=AllFeatureSelection(), max_depth=1)
     self.assertEqual(self.tree_builder.max_depth, 1)