def _build_tree(self, X, y, current_depth=0):

        largest_impurity = 0
        best_criteria = None    # Feature index and threshold
        best_sets = None        # Subsets of the data

        expand_needed = len(np.shape(y)) == 1
        if expand_needed:
            y = np.expand_dims(y, axis=1)

        # Add y as last column of X
        X_y = np.concatenate((X, y), axis=1)

        n_samples, n_features = np.shape(X)

        if n_samples >= self.min_samples_split and current_depth <= self.max_depth:
            # Calculate the impurity for each feature
            for feature_i in range(n_features):
                # All values of feature_i
                feature_values = np.expand_dims(X[:, feature_i], axis=1)
                unique_values = np.unique(feature_values)

                # Iterate through all unique values of feature column i and
                # calculate the impurity
                for threshold in unique_values:
                    Xy_1, Xy_2 = divide_on_feature(X_y, feature_i, threshold)
                    
                    if len(Xy_1) > 0 and len(Xy_2) > 0:
                        y_1 = Xy_1[:, n_features:]
                        y_2 = Xy_2[:, n_features:]

                        # Calculate impurity
                        impurity = self._impurity_calculation(y, y_1, y_2)

                        # If this threshold resulted in a higher impurity gain than
                        # previously recorded, save the threshold value and the
                        # feature index
                        if impurity > largest_impurity:
                            largest_impurity = impurity
                            best_criteria = {
                                "feature_i": feature_i, "threshold": threshold}
                            best_sets = {
                                "left_branch": Xy_1, "right_branch": Xy_2}


        if largest_impurity > self.min_impurity:
            leftX = best_sets["left_branch"][:, :n_features]
            leftY = best_sets["left_branch"][:, n_features:]    # X - all cols. but last, y - last
            rightX = best_sets["right_branch"][:, :n_features]
            rightY = best_sets["right_branch"][:, n_features:]    # X - all cols. but last, y - last
            true_branch = self._build_tree(leftX, leftY, current_depth + 1)
            false_branch = self._build_tree(rightX, rightY, current_depth + 1)
            return DecisionNode(feature_i=best_criteria["feature_i"], threshold=best_criteria[
                                "threshold"], true_branch=true_branch, false_branch=false_branch)

        # We're at a leaf => determine value
        leaf_value = self._leaf_value_calculation(y)

        return DecisionNode(value=leaf_value)
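
Every one of these examples leans on a divide_on_feature helper that is never shown. A minimal sketch, assuming numeric thresholds are matched with >= and categorical values with == (the rule in the original helper may differ), might look like this:

import numpy as np

def divide_on_feature(X, feature_i, threshold):
    """Split the rows of X into two subsets based on column feature_i."""
    if isinstance(threshold, (int, float, np.integer, np.floating)):
        # Numeric feature: samples meeting the threshold go to the first subset
        split_func = lambda sample: sample[feature_i] >= threshold
    else:
        # Categorical feature: exact matches go to the first subset
        split_func = lambda sample: sample[feature_i] == threshold

    X_1 = np.array([sample for sample in X if split_func(sample)])
    X_2 = np.array([sample for sample in X if not split_func(sample)])
    return X_1, X_2
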
    def _build_tree(self, X, y, current_depth=0):
        # print('Start building the tree')
        largest_impurity = 0
        best_criteria = None
        best_sets = None

        if len(np.shape(y)) == 1:
            y = np.expand_dims(y, axis=1)


        Xy = np.concatenate((X, y), axis=1)
        n_samples, n_features = np.shape(X)

        if current_depth <= self.max_depth and n_samples >= self.min_samples_split:
            # for-loop: Calculate the impurity for each feature
            for feature_i in range(n_features):
                # Get all unique values in the current feature_i
                feature_values = np.expand_dims(X[:, feature_i], axis=1)
                unique_values = np.unique(feature_values)

                for threshold in unique_values:
                    # Divide X and y depending on if the feature value of X at index feature_i meets the threshold
                    Xy1, Xy2 = divide_on_feature(Xy, feature_i, threshold)

                    # Confirm that the split actually divided the data; if either subset is empty, skip this threshold
                    if len(Xy1) > 0 and len(Xy2) > 0:

                        y1 = Xy1[:, n_features:]
                        y2 = Xy2[:, n_features:]

                        # Calculate the current impurity
                        impurity = self._impurity_calculation(y, y1, y2)

                        if impurity > largest_impurity:
                            largest_impurity = impurity
                            best_criteria = {'feature_i': feature_i, 'threshold': threshold}
                            best_sets = {
                                'leftX': Xy1[:, :n_features],
                                'lefty': Xy1[:, n_features:],
                                'rightX': Xy2[:, :n_features],
                                'righty': Xy2[:, n_features:]
                            }

        # We don't have to keep splitting until no impurity improvement is possible; we simply stop once the best gain falls below min_impurity
        if largest_impurity > self.min_impurity:
            # Build subtrees for the right and left branches
            # true_branch is simply the 'left' branch
            true_branch = self._build_tree(best_sets['leftX'], best_sets['lefty'], current_depth + 1)
            false_branch = self._build_tree(best_sets['rightX'], best_sets['righty'], current_depth + 1)
            return DecisionNode(feature_i=best_criteria['feature_i'], threshold=best_criteria['threshold'],
                                true_branch=true_branch, false_branch=false_branch)

        # Note where results are returned: decision nodes above, leaf nodes below

        # we are at leaf node, so we determine the value
        leaf_value = self._leaf_value_calculation(y)
        return DecisionNode(value=leaf_value)
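
The DecisionNode that these methods return is also assumed rather than defined here. A plain container along the following lines would be enough for the two examples above (note that two of the later examples construct it with a label keyword instead of value):

class DecisionNode:
    """One node of the tree: either an internal split or a leaf."""
    def __init__(self, feature_i=None, threshold=None, value=None,
                 true_branch=None, false_branch=None):
        self.feature_i = feature_i        # Index of the feature tested at this node
        self.threshold = threshold        # Threshold the feature value is compared against
        self.value = value                # Prediction stored at a leaf, None for internal nodes
        self.true_branch = true_branch    # Subtree for samples that satisfy the threshold
        self.false_branch = false_branch  # Subtree for the remaining samples
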
Example #3
    def _build_tree(self, X, y):

        largest_variance_reduction = 0
        best_criteria = None  # Feature index and threshold
        best_sets = None  # Subsets of the data

        # Add y as last column of X
        X_y = np.concatenate((X, np.expand_dims(y, axis=1)), axis=1)

        n_samples, n_features = np.shape(X)

        if n_samples >= self.min_samples_split:
            # Calculate the variance reduction for each feature
            for feature_i in range(n_features):
                # All values of feature_i
                feature_values = np.expand_dims(X[:, feature_i], axis=1)
                unique_values = np.unique(feature_values)

                # Iterate through all unique values of feature column i and
                # calculate the variance reduction
                for threshold in unique_values:
                    Xy_1, Xy_2 = divide_on_feature(X_y, feature_i, threshold)
                    # If either subset is empty there is no point in
                    # calculating the variance reduction
                    if len(Xy_1) > 0 and len(Xy_2) > 0:

                        X_1 = Xy_1[:, :-1]
                        X_2 = Xy_2[:, :-1]

                        # Calculate the variance of the data set and the
                        # two split sets
                        var_tot = calculate_variance(X)
                        var_1 = calculate_variance(X_1)
                        var_2 = calculate_variance(X_2)

                        # Calculate the variance reduction
                        variance_reduction = var_tot - (var_1 + var_2)

                        # If this threshold resulted in a larger variance reduction
                        # than previously recorded, save the threshold value and the
                        # feature index
                        if variance_reduction > largest_variance_reduction:
                            largest_variance_reduction = variance_reduction
                            best_criteria = {
                                "feature_i": feature_i,
                                "threshold": threshold
                            }
                            best_sets = {
                                "left_branch": Xy_1,
                                "right_branch": Xy_2
                            }

        # If we have any variance reduction to go by we build the tree deeper
        if self.current_depth < self.max_depth and largest_variance_reduction > self.min_var_red:
            leftX, leftY = best_sets["left_branch"][:, :-1], best_sets[
                "left_branch"][:, -1]  # X - all cols. but last, y - last
            rightX, rightY = best_sets["right_branch"][:, :-1], best_sets[
                "right_branch"][:, -1]  # X - all cols. but last, y - last
            true_branch = self._build_tree(leftX, leftY)
            false_branch = self._build_tree(rightX, rightY)
            self.current_depth += 1
            return RegressionNode(feature_i=best_criteria["feature_i"],
                                  threshold=best_criteria["threshold"],
                                  true_branch=true_branch,
                                  false_branch=false_branch)

        # Set y prediction for this leaf as the mean
        # of the y training data values of this leaf
        return RegressionNode(value=np.mean(y))
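
calculate_variance is another helper that is assumed here. A small sketch that returns the total (population) variance of its input, summed over columns so the comparisons above stay scalar, could be:

import numpy as np

def calculate_variance(X):
    """Total population variance of X, summed over its columns."""
    X = np.asarray(X, dtype=float)
    if X.ndim == 1:
        X = X.reshape(-1, 1)          # Treat a 1-D input as a single column
    mean = np.mean(X, axis=0)
    # Per-column variance, then summed into a single number
    return float(np.sum(np.mean((X - mean) ** 2, axis=0)))
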
Example #4
    def _build_tree(self, X, y):
        # Calculate the entropy by the label values
        entropy = calculate_entropy(y)

        highest_info_gain = 0
        best_criteria = None  # Feature index and threshold
        best_sets = None  # Subsets of the data

        # Add y as last column of X
        X_y = np.concatenate((X, np.expand_dims(y, axis=1)), axis=1)

        n_samples, n_features = np.shape(X)

        if n_samples >= self.min_samples_split:
            # Calculate the information gain for each feature
            for feature_i in range(n_features):
                # All values of feature_i
                feature_values = np.expand_dims(X[:, feature_i], axis=1)
                unique_values = np.unique(feature_values)

                # Iterate through all unique values of feature column i and
                # calculate the information gain
                for threshold in unique_values:
                    Xy_1, Xy_2 = divide_on_feature(X_y, feature_i, threshold)
                    # If either subset is empty there is no point in calculating the information gain
                    if len(Xy_1) > 0 and len(Xy_2) > 0:
                        # Calculate information gain
                        p = len(Xy_1) / n_samples
                        y1 = Xy_1[:, -1]
                        y2 = Xy_2[:, -1]
                        info_gain = entropy - p * calculate_entropy(y1) - (
                            1 - p) * calculate_entropy(y2)

                        # If this threshold resulted in a higher information gain than previously
                        # recorded save the threshold value and the feature index
                        if info_gain > highest_info_gain:
                            highest_info_gain = info_gain
                            best_criteria = {
                                "feature_i": feature_i,
                                "threshold": threshold
                            }
                            best_sets = {
                                "left_branch": Xy_1,
                                "right_branch": Xy_2
                            }

        # If we have any information gain to go by we build the tree deeper
        if self.current_depth < self.max_depth and highest_info_gain > self.min_gain:
            leftX, leftY = best_sets["left_branch"][:, :-1], best_sets[
                "left_branch"][:, -1]  # X - all cols. but last, y - last
            rightX, rightY = best_sets["right_branch"][:, :-1], best_sets[
                "right_branch"][:, -1]  # X - all cols. but last, y - last
            true_branch = self._build_tree(leftX, leftY)
            false_branch = self._build_tree(rightX, rightY)
            self.current_depth += 1
            return DecisionNode(feature_i=best_criteria["feature_i"],
                                threshold=best_criteria["threshold"],
                                true_branch=true_branch,
                                false_branch=false_branch)
        # There's no recorded information gain so we are at a leaf
        most_common = None
        max_count = 0
        for label in np.unique(y):
            count = len(y[y == label])
            if count > max_count:
                most_common = label
                max_count = count
        return DecisionNode(label=most_common)
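
calculate_entropy, used by the two classification variants, is likewise assumed. A minimal sketch of the Shannon entropy of a label vector might be:

import numpy as np

def calculate_entropy(y):
    """Shannon entropy (base 2) of the label vector y."""
    y = np.asarray(y).flatten()
    entropy = 0.0
    for label in np.unique(y):
        p = len(y[y == label]) / len(y)
        entropy += -p * np.log2(p)
    return entropy
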
	def _build_tree(self, X, y):
		# Calculate the entropy by the label values
		entropy = calculate_entropy(y)

		# Save the best information gain
		highest_info_gain = 0
		best_criteria = None	# Feature index and threshold
		best_sets = None		# Subsets of the data

		# Add y as last column of X
		X_y = np.concatenate((X, np.expand_dims(y, axis=1)), axis=1)

		n_samples, n_features = np.shape(X)

		if n_samples >= self.min_samples_split:
			# Calculate the information gain for each feature
			for feature_i in range(n_features):
				# All values of feature_i
				feature_values = np.expand_dims(X[:, feature_i], axis=1)
				unique_values = np.unique(feature_values)

				# Iterate through all unique values of feature column i and
				# calculate the informaion gain
				for threshold in unique_values:

					Xy_1, Xy_2 = divide_on_feature(X_y, feature_i, threshold)
					# Sanity check: the split must account for every sample
					if np.shape(X_y)[0] != np.shape(Xy_1)[0] + np.shape(Xy_2)[0]:
						print("Aj")
						sys.exit(0)

					# If either subset is empty there is no point in calculating the information gain
					if len(Xy_1) > 0 and len(Xy_2) > 0:
						# Calculate information gain
						p = len(Xy_1) / n_samples
						y1 = Xy_1[:,-1]
						y2 = Xy_2[:,-1]
						info_gain = entropy - p * calculate_entropy(y1) - (1 - p) * calculate_entropy(y2)

						# If this threshold resulted in a higher information gain than previously
						# recorded save the threshold value and the feature index
						if info_gain > highest_info_gain:
							highest_info_gain = info_gain
							best_criteria = {"feature_i": feature_i, "threshold": threshold}
							best_sets = np.array([Xy_1, Xy_2])

		# If we have any information gain to go by we build the tree deeper
		if self.current_depth < self.max_depth and highest_info_gain > self.min_gain:
			X_1, y_1 = best_sets[0][:, :-1], best_sets[0][:, -1]
			X_2, y_2 = best_sets[1][:, :-1], best_sets[1][:, -1]
			true_branch = self._build_tree(X_1, y_1)
			false_branch = self._build_tree(X_2, y_2)
			self.current_depth += 1
			return DecisionNode(feature_i=best_criteria["feature_i"], threshold=best_criteria["threshold"], true_branch=true_branch, false_branch=false_branch)
		# There's no recorded information gain so we are at a leaf
		most_common = None
		max_count = 0
		for label in np.unique(y):
			count = len(y[y == label])
			if count > max_count:
				most_common = label
				max_count = count
		return DecisionNode(label=most_common)
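
The first two examples delegate the leaf value to self._leaf_value_calculation, which is never shown either. For classification that is typically a majority vote (essentially the loop that closes the two examples above), and for regression the mean of y; hypothetical stand-ins for both:

import numpy as np

def majority_vote(y):
    """Most common label in y; a typical leaf value for classification trees."""
    labels, counts = np.unique(y, return_counts=True)
    return labels[np.argmax(counts)]

def mean_of_y(y):
    """Mean of y; a typical leaf value for regression trees."""
    return np.mean(y)
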
Example #6
    def _build_tree(self, X, y, current_depth=0):

        largest_variance_reduction = 0
        best_criteria = None    # Feature index and threshold
        best_sets = None        # Subsets of the data

        # Add y as last column of X
        X_y = np.concatenate((X, np.expand_dims(y, axis=1)), axis=1)

        n_samples, n_features = np.shape(X)

        if n_samples >= self.min_samples_split:
            # Calculate the variance reduction for each feature
            for feature_i in range(n_features):
                # All values of feature_i
                feature_values = np.expand_dims(X[:, feature_i], axis=1)
                unique_values = np.unique(feature_values)

                # Use the midpoint of every consecutive pair of unique
                # values as a candidate split point
                x = unique_values
                split_points = [(x[i - 1] + x[i]) / 2 for i in range(1, len(x))]

                # Iterate through all unique values of feature column i and
                # calculate the variance reduction
                for threshold in split_points:
                    Xy_1, Xy_2 = divide_on_feature(X_y, feature_i, threshold)

                    if len(Xy_1) > 0 and len(Xy_2) > 0:

                        y_1 = Xy_1[:, -1]
                        y_2 = Xy_2[:, -1]

                        var_tot = calculate_variance(np.expand_dims(y, axis=1))
                        var_1 = calculate_variance(np.expand_dims(y_1, axis=1))
                        var_2 = calculate_variance(np.expand_dims(y_2, axis=1))
                        frac_1 = len(y_1) / len(y)
                        frac_2 = len(y_2) / len(y)

                        # Calculate the variance reduction
                        variance_reduction = var_tot - (frac_1 * var_1 + frac_2 * var_2)

                        # If this threshold resulted in a larger variance reduction than
                        # previously registered we save the feature index and threshold
                        # and the two sets
                        if variance_reduction > largest_variance_reduction:
                            largest_variance_reduction = variance_reduction
                            best_criteria = {
                                "feature_i": feature_i, "threshold": threshold}
                            best_sets = {
                                "left_branch": Xy_1, "right_branch": Xy_2}

        # If we have any variance reduction to go by we build the tree deeper
        if current_depth < self.max_depth and largest_variance_reduction > self.min_var_red:
            leftX, leftY = best_sets["left_branch"][
                :, :-1], best_sets["left_branch"][:, -1]    # X - all cols. but last, y - last
            rightX, rightY = best_sets["right_branch"][
                :, :-1], best_sets["right_branch"][:, -1]    # X - all cols. but last, y - last
            true_branch = self._build_tree(leftX, leftY, current_depth + 1)
            false_branch = self._build_tree(rightX, rightY, current_depth + 1)
            return RegressionNode(feature_i=best_criteria["feature_i"], threshold=best_criteria[
                                "threshold"], true_branch=true_branch, false_branch=false_branch)

        # Set y prediction for this leaf as the mean
        # of the y training data values of this leaf
        return RegressionNode(value=np.mean(y))
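
None of the examples show how _build_tree is driven or how the finished tree is used for prediction. A minimal fit/predict sketch on the same tree class, assuming the DecisionNode layout from earlier and numeric thresholds compared with >=, might look like this (the method names are hypothetical):

    def fit(self, X, y):
        """Hypothetical driver: build the tree and keep a reference to its root."""
        self.root = self._build_tree(X, y)

    def predict_value(self, x, node=None):
        """Walk the tree for a single sample x until a leaf value is reached."""
        if node is None:
            node = self.root
        if node.value is not None:            # Leaf: return the stored prediction
            return node.value
        feature_value = x[node.feature_i]     # Value of the feature this node splits on
        branch = node.false_branch
        if isinstance(feature_value, (int, float, np.integer, np.floating)):
            if feature_value >= node.threshold:
                branch = node.true_branch
        elif feature_value == node.threshold:
            branch = node.true_branch
        return self.predict_value(x, branch)

    def predict(self, X):
        """Predict one value per sample in X."""
        return [self.predict_value(sample) for sample in X]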