Example #1
0
 def _handle_value_not_trained_for(self):
     """
     Fallback used when a sample carries a feature value that never 
     appeared in the training set, so the tree has no branch for it.
     
     The fallback is deliberately simple: pick the label that occurs most 
     often across the whole training data set.  A possible refinement would 
     be to restrict the vote to the training samples that would reach the 
     node where the unknown value was encountered.
     
     Returns:
       label:
         The best guess at the label.
     """
     # Labels of the full training set, not just those reaching this node.
     all_training_labels = self.training_set.get_labels()
     return collection_utils.get_most_common(all_training_labels)
Example #2
0
 def _handle_value_not_trained_for(self):
     """
     Produces a label for a sample whose feature value was not seen during 
     training and therefore has no matching branch in the tree.
     
     For now the guess is the most common label over the entire training 
     data set.  Narrowing the vote to the samples that would arrive at the 
     node where the unrecognized value showed up might give better results.
     
     Returns:
       label:
         The best guess at the label.
     """
     labels = self.training_set.get_labels()
     best_guess = collection_utils.get_most_common(labels)
     return best_guess
Example #3
0
def _build_tree_recursively(dataset):
    """
    Recursive worker that constructs the decision tree one level at a time.
    
    Args:
      dataset: model.DataSet
        The data visible at the current level.  Each recursive call receives 
        a subset filtered on one value of the split feature, with that 
        feature's column dropped.
    
    Returns:
      current_root: Node
        The node rooting the level being processed.  The outermost call 
        returns the root of the whole tree; nested calls return the child 
        nodes attached beneath it.
    """
    distinct_labels = set(dataset.get_labels())
    if len(distinct_labels) == 1:
        # Every remaining sample agrees on the label: emit a leaf.
        (only_label,) = distinct_labels
        return Node(only_label)

    if len(dataset.feature_list()) == 0:
        # Nothing left to split on: emit a leaf with the majority label.
        majority_label = get_most_common(dataset.get_labels())
        return Node(majority_label)

    # Otherwise branch on the chosen feature, one child per observed value.
    split_feature = choose_feature_to_split(dataset)
    current_root = Node(split_feature)

    for feature_value in dataset.get_feature_values(split_feature):
        matching_rows = dataset.value_filter(split_feature, feature_value)
        child_data = matching_rows.drop_column(split_feature)
        child_node = _build_tree_recursively(child_data)
        current_root.add_child(feature_value, child_node)

    return current_root
Example #4
0
def _build_tree_recursively(dataset):
    """
    Builds the decision tree for the given data by recursing on subsets.
    
    Args:
      dataset: model.DataSet
        The data at the current level of the tree.  Deeper levels are given 
        subsets filtered by a split-feature value, minus that feature.
    
    Returns:
      current_root: Node
        The node at the top of the level being processed: the tree root for 
        the first call, and a child node for every nested call.
    """
    unique_labels = set(dataset.get_labels())
    if len(unique_labels) == 1:
        # Only one label remains, so this branch ends in a leaf.
        sole_label = next(iter(unique_labels))
        return Node(sole_label)

    remaining_features = dataset.feature_list()
    if len(remaining_features) == 0:
        # Out of features: fall back to the most common label as a leaf.
        return Node(get_most_common(dataset.get_labels()))

    # A further split is possible.
    feature = choose_feature_to_split(dataset)
    branch = Node(feature)

    for observed in dataset.get_feature_values(feature):
        narrowed = dataset.value_filter(feature, observed)
        subtree = _build_tree_recursively(narrowed.drop_column(feature))
        branch.add_child(observed, subtree)

    return branch
 def test_get_most_common_empty(self):
     # With nothing to count, get_most_common is expected to yield None.
     most_common = collection_utils.get_most_common([])
     self.assertIsNone(most_common)
 def test_get_most_common(self):
     # "a" occurs three times and "b" twice, so "a" is the expected winner.
     winner = collection_utils.get_most_common(["a", "b", "a", "a", "b"])
     self.assertEqual(winner, "a")
Example #7
0
 def test_get_most_common_empty(self):
     # An empty input has no most common element; None is the expected result.
     empty_input = []
     result = collection_utils.get_most_common(empty_input)
     self.assertIsNone(result)
Example #8
0
 def test_get_most_common(self):
     # Three "a" entries against two "b" entries.
     sample = ["a", "b", "a", "a", "b"]
     result = collection_utils.get_most_common(sample)
     self.assertEqual(result, "a")