Example 1
    def _calculate_variance_reduction(self, y, y_1, y_2):
        var_tot = calculate_variance(np.expand_dims(y, axis=1))
        var_1 = calculate_variance(np.expand_dims(y_1, axis=1))
        var_2 = calculate_variance(np.expand_dims(y_2, axis=1))
        frac_1 = len(y_1) / len(y)
        frac_2 = len(y_2) / len(y)

        # Calculate the variance reduction
        variance_reduction = var_tot - (frac_1 * var_1 + frac_2 * var_2)

        return variance_reduction
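All four examples call a calculate_variance helper that is defined elsewhere in their projects. Judging only from the call sites (a 2-D array in, per-column variances out), a minimal sketch of a compatible implementation could be:

import numpy as np

def calculate_variance(X):
    # A minimal sketch inferred from the call sites above: X is a 2-D
    # array and the helper returns the (population) variance of each
    # column. The real implementations in the source projects may differ.
    X = np.asarray(X, dtype=float)
    return np.var(X, axis=0)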
Example 2
    def _calculate_variance_reduction(self, y, y1, y2):
        var_tot = calculate_variance(y)
        var_1 = calculate_variance(y1)
        var_2 = calculate_variance(y2)
        frac_1 = len(y1) / len(y)
        frac_2 = len(y2) / len(y)

        # Calculate the variance reduction per output column
        variance_reduction = var_tot - (frac_1 * var_1 + frac_2 * var_2)

        # Sum the per-column reductions into a single scalar score
        return sum(variance_reduction)
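Unlike Example 1, this variant returns sum(variance_reduction), which only makes sense if calculate_variance returns one variance per output column, i.e. if y here can be 2-D (multi-output regression). A small illustration under that assumption, using np.var(..., axis=0) in place of the helper:

import numpy as np

# Hypothetical two-output targets split into two halves.
y = np.array([[1.0, 10.0], [2.0, 20.0], [3.0, 30.0], [4.0, 40.0]])
y1, y2 = y[:2], y[2:]

var_tot = np.var(y, axis=0)   # one variance per output column
weighted = (len(y1) / len(y)) * np.var(y1, axis=0) \
    + (len(y2) / len(y)) * np.var(y2, axis=0)

# The per-column reductions are summed into one scalar split score,
# which is what the trailing sum(...) in Example 2 produces.
print(sum(var_tot - weighted))  # 101.0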
Example 3
    def _build_tree(self, X, y):

        largest_variance_reduction = 0
        best_criteria = None  # Feature index and threshold
        best_sets = None  # Subsets of the data

        # Add y as last column of X
        X_y = np.concatenate((X, np.expand_dims(y, axis=1)), axis=1)

        n_samples, n_features = np.shape(X)

        if n_samples >= self.min_samples_split:
            # Calculate the variance reduction for each feature
            for feature_i in range(n_features):
                # All values of feature_i
                feature_values = np.expand_dims(X[:, feature_i], axis=1)
                unique_values = np.unique(feature_values)

                # Iterate through all unique values of feature column i and
                # calculate the variance reduction
                for threshold in unique_values:
                    Xy_1, Xy_2 = divide_on_feature(X_y, feature_i, threshold)
                    # If one subset is empty there is no point in
                    # calculating the variance reduction
                    if len(Xy_1) > 0 and len(Xy_2) > 0:

                        y_1 = Xy_1[:, -1]
                        y_2 = Xy_2[:, -1]

                        # Calculate the variance of the target values of the
                        # full set and of the two split sets
                        var_tot = calculate_variance(np.expand_dims(y, axis=1))
                        var_1 = calculate_variance(np.expand_dims(y_1, axis=1))
                        var_2 = calculate_variance(np.expand_dims(y_2, axis=1))
                        frac_1 = len(y_1) / len(y)
                        frac_2 = len(y_2) / len(y)

                        # Calculate the variance reduction, weighting each
                        # subset's variance by its share of the samples
                        variance_reduction = var_tot - (frac_1 * var_1 + frac_2 * var_2)

                        # If this threshold resulted in a larger variance
                        # reduction than previously recorded, save the
                        # threshold value and the feature index
                        if variance_reduction > largest_variance_reduction:
                            largest_variance_reduction = variance_reduction
                            best_criteria = {
                                "feature_i": feature_i,
                                "threshold": threshold
                            }
                            best_sets = {
                                "left_branch": Xy_1,
                                "right_branch": Xy_2
                            }

        # If the variance reduction is large enough we build the tree deeper
        if self.current_depth < self.max_depth and largest_variance_reduction > self.min_var_red:
            # X - all columns but the last; y - the last column
            leftX, leftY = best_sets["left_branch"][:, :-1], best_sets["left_branch"][:, -1]
            rightX, rightY = best_sets["right_branch"][:, :-1], best_sets["right_branch"][:, -1]
            # Increase the shared depth counter before recursing and restore
            # it afterwards, so that max_depth is enforced on every branch
            self.current_depth += 1
            true_branch = self._build_tree(leftX, leftY)
            false_branch = self._build_tree(rightX, rightY)
            self.current_depth -= 1
            return RegressionNode(feature_i=best_criteria["feature_i"],
                                  threshold=best_criteria["threshold"],
                                  true_branch=true_branch,
                                  false_branch=false_branch)

        # Set y prediction for this leaf as the mean
        # of the y training data values of this leaf
        return RegressionNode(value=np.mean(y))
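Examples 3 and 4 also depend on a divide_on_feature helper that is not shown. Assuming the conventional behavior for this kind of tree code (rows whose feature meets the threshold go to the first subset, the rest to the second), a sketch might look like:

import numpy as np

def divide_on_feature(Xy, feature_i, threshold):
    # A sketch under the assumptions above, not the projects' actual code.
    # Numeric thresholds split on >=, categorical ones on equality.
    if isinstance(threshold, (int, float)):
        mask = Xy[:, feature_i] >= threshold
    else:
        mask = Xy[:, feature_i] == threshold
    return Xy[mask], Xy[~mask]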
Example 4
    def _build_tree(self, X, y, current_depth=0):

        largest_variance_reduction = 0
        best_criteria = None    # Feature index and threshold
        best_sets = None        # Subsets of the data

        # Add y as last column of X
        X_y = np.concatenate((X, np.expand_dims(y, axis=1)), axis=1)

        n_samples, n_features = np.shape(X)

        if n_samples >= self.min_samples_split:
            # Calculate the variance reduction for each feature
            for feature_i in range(n_features):
                # All values of feature_i
                feature_values = np.expand_dims(X[:, feature_i], axis=1)
                unique_values = np.unique(feature_values)

                # Candidate split points: the midpoint of each
                # consecutive pair of unique values
                x = unique_values
                split_points = [(x[i-1] + x[i]) / 2 for i in range(1, len(x))]

                # Iterate through all candidate split points of feature
                # column i and calculate the variance reduction
                for threshold in split_points:
                    Xy_1, Xy_2 = divide_on_feature(X_y, feature_i, threshold)

                    if len(Xy_1) > 0 and len(Xy_2) > 0:

                        y_1 = Xy_1[:, -1]
                        y_2 = Xy_2[:, -1]

                        var_tot = calculate_variance(np.expand_dims(y, axis=1))
                        var_1 = calculate_variance(np.expand_dims(y_1, axis=1))
                        var_2 = calculate_variance(np.expand_dims(y_2, axis=1))
                        frac_1 = len(y_1) / len(y)
                        frac_2 = len(y_2) / len(y)

                        # Calculate the variance reduction
                        variance_reduction = var_tot - (frac_1 * var_1 + frac_2 * var_2)

                        # If this threshold resulted in a larger variance reduction
                        # than previously recorded we save the feature index and
                        # threshold and the two sets
                        if variance_reduction > largest_variance_reduction:
                            largest_variance_reduction = variance_reduction
                            best_criteria = {
                                "feature_i": feature_i, "threshold": threshold}
                            best_sets = {
                                "left_branch": Xy_1, "right_branch": Xy_2}

        # If the variance reduction is large enough we build the tree deeper
        if current_depth < self.max_depth and largest_variance_reduction > self.min_var_red:
            # X - all columns but the last; y - the last column
            leftX, leftY = best_sets["left_branch"][:, :-1], best_sets["left_branch"][:, -1]
            rightX, rightY = best_sets["right_branch"][:, :-1], best_sets["right_branch"][:, -1]
            true_branch = self._build_tree(leftX, leftY, current_depth + 1)
            false_branch = self._build_tree(rightX, rightY, current_depth + 1)
            return RegressionNode(feature_i=best_criteria["feature_i"],
                                  threshold=best_criteria["threshold"],
                                  true_branch=true_branch,
                                  false_branch=false_branch)

        # Set y prediction for this leaf as the mean
        # of the y training data values of this leaf
        return RegressionNode(value=np.mean(y))
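Finally, both _build_tree variants construct RegressionNode objects whose class is not included in the listing. A minimal node consistent with the keyword arguments used above (the real classes may carry extra fields) could be:

class RegressionNode:
    # Minimal sketch matching the constructor calls in Examples 3 and 4.
    def __init__(self, feature_i=None, threshold=None, value=None,
                 true_branch=None, false_branch=None):
        self.feature_i = feature_i        # index of the feature tested at this node
        self.threshold = threshold        # value the feature is compared against
        self.value = value                # leaf prediction (mean of y); None for internal nodes
        self.true_branch = true_branch    # subtree where the test holds
        self.false_branch = false_branch  # subtree where it does not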