def _build_tree(self, X, y, current_depth=0):
        with pyRAPL.Measurement('Build_tree', output=csv_output):
            """ Recursive method which builds out the decision tree and splits X and respective y on the feature of X which (based on impurity) best separates the data"""
            largest_impurity = 0
            best_criteria = None  # Feature index and threshold
            best_sets = None  # Subsets of the data

            # Check if expansion of y is needed
            if len(np.shape(y)) == 1:
                y = np.expand_dims(y, axis=1)

            # Add y as last column of X
            Xy = np.concatenate((X, y), axis=1)

            n_samples, n_features = np.shape(X)

            if n_samples >= self.min_samples_split and current_depth <= self.max_depth:
                # Calculate the impurity for each feature
                for feature_i in range(n_features):
                    # All values of feature_i
                    feature_values = np.expand_dims(X[:, feature_i], axis=1)
                    unique_values = np.unique(feature_values)

                    # Iterate through all unique values of feature column i and
                    # calculate the impurity
                    for threshold in unique_values:
                        # Divide X and y depending on if the feature value of X at index feature_i
                        # meets the threshold
                        Xy1, Xy2 = divide_on_feature(Xy, feature_i, threshold)

                        if len(Xy1) > 0 and len(Xy2) > 0:
                            # Select the y-values of the two sets
                            y1 = Xy1[:, n_features:]
                            y2 = Xy2[:, n_features:]

                            # Calculate impurity
                            impurity = self._impurity_calculation(y, y1, y2)

                            # If this threshold resulted in a higher information gain than previously
                            # recorded save the threshold value and the feature
                            # index
                            if impurity > largest_impurity:
                                largest_impurity = impurity
                                best_criteria = {
                                    "feature_i": feature_i,
                                    "threshold": threshold
                                }
                                best_sets = {
                                    "leftX":
                                    Xy1[:, :n_features],  # X of left subtree
                                    "lefty":
                                    Xy1[:, n_features:],  # y of left subtree
                                    "rightX":
                                    Xy2[:, :n_features],  # X of right subtree
                                    "righty":
                                    Xy2[:, n_features:]  # y of right subtree
                                }

            if largest_impurity > self.min_impurity:
                # Build subtrees for the right and left branches
                true_branch = self._build_tree(best_sets["leftX"],
                                               best_sets["lefty"],
                                               current_depth + 1)
                false_branch = self._build_tree(best_sets["rightX"],
                                                best_sets["righty"],
                                                current_depth + 1)
                return DecisionNode(feature_i=best_criteria["feature_i"],
                                    threshold=best_criteria["threshold"],
                                    true_branch=true_branch,
                                    false_branch=false_branch)

            # We're at leaf => determine value
            leaf_value = self._leaf_value_calculation(y)

            return DecisionNode(value=leaf_value)
        csv_output.save()
Ejemplo n.º 2
0
    def _build_tree(self, X, y, current_depth=0):
        """ Recursive method which builds out the decision tree and splits X and respective y
        on the feature of X which (based on impurity) best separates the data"""

        largest_impurity = 0
        best_criteria = None    # Feature index and threshold
        best_sets = None        # Subsets of the data

        # Check if expansion of y is needed
        if len(np.shape(y)) == 1:
            y = np.expand_dims(y, axis=1)

        # Add y as last column of X
        Xy = np.concatenate((X, y), axis=1)

        n_samples, n_features = np.shape(X)

        if n_samples >= self.min_samples_split and current_depth <= self.max_depth:
            # Calculate the impurity for each feature
            for feature_i in range(n_features):
                # All values of feature_i
                feature_values = np.expand_dims(X[:, feature_i], axis=1)
                unique_values = np.unique(feature_values)

                # Iterate through all unique values of feature column i and
                # calculate the impurity
                for threshold in unique_values:
                    # Divide X and y depending on if the feature value of X at index feature_i
                    # meets the threshold
                    Xy1, Xy2 = divide_on_feature(Xy, feature_i, threshold)

                    if len(Xy1) > 0 and len(Xy2) > 0:
                        # Select the y-values of the two sets
                        y1 = Xy1[:, n_features:]
                        y2 = Xy2[:, n_features:]

                        # Calculate impurity
                        impurity = self._impurity_calculation(y, y1, y2)

                        # If this threshold resulted in a higher information gain than previously
                        # recorded save the threshold value and the feature
                        # index
                        if impurity > largest_impurity:
                            largest_impurity = impurity
                            best_criteria = {"feature_i": feature_i, "threshold": threshold}
                            best_sets = {
                                "leftX": Xy1[:, :n_features],   # X of left subtree
                                "lefty": Xy1[:, n_features:],   # y of left subtree
                                "rightX": Xy2[:, :n_features],  # X of right subtree
                                "righty": Xy2[:, n_features:]   # y of right subtree
                                }

        if largest_impurity > self.min_impurity:
            # Build subtrees for the right and left branches
            true_branch = self._build_tree(best_sets["leftX"], best_sets["lefty"], current_depth + 1)
            false_branch = self._build_tree(best_sets["rightX"], best_sets["righty"], current_depth + 1)
            return DecisionNode(feature_i=best_criteria["feature_i"], threshold=best_criteria[
                                "threshold"], true_branch=true_branch, false_branch=false_branch)

        # We're at leaf => determine value
        leaf_value = self._leaf_value_calculation(y)

        return DecisionNode(value=leaf_value)