def _handle_value_not_trained_for(self):
    """Return a fallback label for a feature value absent from training.

    When a sample carries a feature value never observed during training,
    the tree has no branch for it.  The current strategy is a global
    majority vote over the whole training set; a finer-grained approach
    would vote only among the samples reaching the node where the unknown
    value appeared.

    Returns:
        label: The best-guess label (most common in the training data).
    """
    all_labels = self.training_set.get_labels()
    return collection_utils.get_most_common(all_labels)
def _handle_value_not_trained_for(self):
    """Return a best-guess label when a feature value was never trained on.

    The tree cannot route a sample whose feature value did not occur in
    the training data.  As a simple fallback we answer with the overall
    most common training label; restricting the vote to samples that
    would reach the node in question could be more accurate.

    Returns:
        label: The best guess at the label.
    """
    training_labels = self.training_set.get_labels()
    return collection_utils.get_most_common(training_labels)
def _build_tree_recursively(dataset):
    """Recursively construct a decision (sub)tree from *dataset*.

    Args:
        dataset: model.DataSet
            The data at the current level of the tree.  Deeper levels
            receive progressively filtered subsets of the original set.

    Returns:
        current_root: Node
            The root of the subtree built for this level; the outermost
            call therefore yields the root of the whole tree.
    """
    labels = dataset.get_labels()
    distinct_labels = set(labels)

    # Pure node: every remaining sample agrees on one label.
    if len(distinct_labels) == 1:
        return Node(distinct_labels.pop())

    # Features exhausted: fall back to a majority vote.
    if len(dataset.feature_list()) == 0:
        return Node(get_most_common(labels))

    # Otherwise split on the best remaining feature and recurse.
    best_feature = choose_feature_to_split(dataset)
    current_root = Node(best_feature)
    for feature_value in dataset.get_feature_values(best_feature):
        branch_data = dataset.value_filter(best_feature, feature_value)
        branch_data = branch_data.drop_column(best_feature)
        current_root.add_child(feature_value,
                               _build_tree_recursively(branch_data))
    return current_root
def _build_tree_recursively(dataset):
    """Build one level of the decision tree and recurse into its children.

    Args:
        dataset: model.DataSet
            Data visible at this level; child calls operate on subsets
            filtered by the chosen split feature's values.

    Returns:
        current_root: Node
            Root node of the level being processed — the first call
            returns the tree root, nested calls return child nodes.
    """
    label_values = dataset.get_labels()
    unique = set(label_values)

    if len(unique) == 1:
        # All samples share a label; this branch is finished.
        return Node(unique.pop())

    if len(dataset.feature_list()) == 0:
        # Nothing left to split on — predict the majority label.
        return Node(get_most_common(label_values))

    split_on = choose_feature_to_split(dataset)
    node = Node(split_on)
    for value in dataset.get_feature_values(split_on):
        # Keep only matching rows, then drop the consumed feature column.
        remaining = dataset.value_filter(split_on, value).drop_column(split_on)
        node.add_child(value, _build_tree_recursively(remaining))
    return node
def test_get_most_common_empty(self):
    """An empty collection has no most-common element, so None is returned."""
    self.assertIsNone(collection_utils.get_most_common([]))
def test_get_most_common(self):
    """The element occurring most often ("a", 3 of 5) is returned."""
    items = ["a", "b", "a", "a", "b"]
    self.assertEqual(collection_utils.get_most_common(items), "a")