def _put(self, key, val, currentNode):
    """Recursively insert (key, val) into the subtree rooted at currentNode.

    Keys smaller than the current node's key descend left; all other
    keys (including duplicates) descend right. A new TreeNode is
    attached once an empty slot on the chosen side is reached.
    """
    # Decide which side of the current node the key belongs on.
    if key < currentNode.key:
        branch, occupied = 'left_child', currentNode.has_left_child()
    else:
        branch, occupied = 'right_child', currentNode.has_right_child()

    if occupied:
        # Keep descending along the chosen side.
        self._put(key, val, getattr(currentNode, branch))
    else:
        # Empty slot found: hang the new node here.
        setattr(currentNode, branch,
                tree_node.TreeNode(key, val, parent=currentNode))
# Example #2  (scraper separator — original text "Beispiel #2 / 0" commented out so the file parses)
    def train_tree(self):
        """Train the decision tree on self.training_examples.

        Normalizes the example weights, creates a fresh root node at
        depth 0 bounded by self.maxdepth, then grows the tree by
        recursively creating children from the root.
        """
        # Weights must be normalized before the root node sees the data.
        self.normalize_weight()

        # Root holds all training examples; depth starts at 0.
        self.root = tree_node.TreeNode(
            self.training_examples, 0, self.maxdepth)

        # Grow the rest of the tree recursively from the root.
        self.root.create_children()
def update_tree(window, in_tree, genome_num):
    """Insert the ordered gene list `window` into the tree at `in_tree`.

    Each node counts how many distinct genomes its prefix occurs in:
    a node already updated for `genome_num` (tracked via last_update)
    is not incremented again for the same genome.
    """
    head = window[0]
    if head in in_tree.children:
        node = in_tree.children[head]
        # Count this genome only once per node.
        if node.last_update != genome_num:
            node.inc(1)
            node.last_update = genome_num
    else:
        # First time this gene follows the current prefix: new child
        # starts with a count of 1.
        node = tree_node.TreeNode(head, 1, in_tree)
        node.last_update = genome_num
        in_tree.children[head] = node

    # Recurse with the rest of the window along the matching child.
    if len(window) > 1:
        update_tree(window[1:], node, genome_num)
# Example #4  (scraper separator — original text "Beispiel #4 / 0" commented out so the file parses)
def create_fake_tree_node(contingency_table):
    """Create a fake TreeNode whose statistics match `contingency_table`.

    A placeholder Dataset is fabricated (nothing is loaded from disk)
    carrying only the class/sample counts implied by the table, and the
    node's contingency tables are injected by hand rather than computed.
    """
    # Empty dataset shell: load_dataset=False skips any real loading.
    fake_dataset = dataset.Dataset(
        None, None, None, None, None, load_dataset=False)
    fake_dataset.num_classes = contingency_table.shape[1]
    fake_dataset.num_samples = np.sum(contingency_table)

    # Skip automatic table calculation; we supply the table ourselves.
    fake_tree_node = tree_node.TreeNode(
        fake_dataset, [True], calculate_contingency_tables=False)

    # Row sums give the number of samples per attribute value.
    samples_per_value = np.sum(contingency_table, axis=1)
    fake_tree_node.contingency_tables = [
        tree_node.ContingencyTable(contingency_table, samples_per_value)
    ]
    return fake_tree_node
def create_tree(data_set, length, min_sup=1):
    """Build an FP-tree from `data_set` ({genome_id: list of windows}).

    A gene is "frequent" when it occurs in at least `min_sup` genomes
    (duplicates within a genome count once). Windows containing more
    than `length` frequent genes are inserted into the tree, ordered by
    descending genome frequency. Returns the root node, or None when no
    gene meets the minimum support.
    """
    # First pass: for each gene, count the number of genomes containing it.
    header_table = {}
    for genome, windows in data_set.items():
        genes_in_genome = {gene for window in windows for gene in window}
        for gene in genes_in_genome:
            header_table[gene] = header_table.get(gene, 0) + 1

    # Drop genes below the minimum support threshold.
    header_table = {g: c for g, c in header_table.items() if c >= min_sup}
    freq_gene_set = set(header_table)

    if not freq_gene_set:
        return None  # no gene is frequent enough to build a tree

    ret_tree = tree_node.TreeNode('Null Set', 1, None)  # tree root

    # Second pass: insert each window that has enough frequent genes.
    for genome, windows in data_set.items():
        for window in windows:
            # Map each frequent gene in the window to its genome count;
            # dict keys guarantee each gene appears once in the path.
            filtered = {gene: header_table[gene]
                        for gene in window if gene in freq_gene_set}

            if len(filtered) > length:
                # Order genes by descending frequency across genomes.
                ordered_path = [gene for gene, _ in
                                sorted(filtered.items(),
                                       key=lambda kv: kv[1],
                                       reverse=True)]
                update_tree(ordered_path, ret_tree, genome)

    return ret_tree
 def put(self, key, val):
     """Insert key/val into the tree and grow its size by one.

     An empty tree gets a new root node; otherwise the pair is handed
     to the recursive _put helper starting at the root.
     """
     if not self.root:
         # Tree is empty: the new node becomes the root.
         self.root = tree_node.TreeNode(key, val)
     else:
         self._put(key, val, self.root)
     self.size += 1
# Example #7  (scraper separator — original text "Beispiel #7 / 0" commented out so the file parses)
def _make_leaf(label, depth):
    """Create a leaf TreeNode carrying `label` at `depth`."""
    leaf = tree_node.TreeNode()
    leaf.depth = depth
    leaf.label = label
    print("returning a leaf at depth", depth)
    return leaf


def grow_decision_tree(examples, attributes, default, depth):
    """Recursively grow a decision tree over real-valued features.

    Args:
        examples: DataFrame whose column 0 holds the categorical label
            and whose remaining columns hold numeric features.
        attributes: candidate attributes, passed unchanged to every
            recursive call (features are real-valued, so an attribute
            may be split on again at deeper levels).
        default: label to use when this subtree must become a leaf
            without a clear majority of its own.
        depth: current depth, checked against MAX_DEPTH.

    Returns:
        A TreeNode: either a leaf (label set) or an internal node with
        split_on / split_value and left/right children.
    """
    print("starting to process a node at depth", depth)

    # Stopping conditions — each yields a leaf.
    if len(examples) <= MIN_EXAMPLES:        # too few examples left
        return _make_leaf(default, depth)
    if examples.iloc[:, 0].nunique() == 1:   # all labels identical
        return _make_leaf(examples.iloc[0, 0], depth)
    if depth >= MAX_DEPTH:                   # depth budget exhausted
        return _make_leaf(default, depth)

    best = choose_best_attr(attributes, examples)
    # None in either slot means no acceptable split was found.
    if best[0] is None or best[1] is None:
        return _make_leaf(default, depth)

    # Internal node with the chosen split.
    tree = tree_node.TreeNode()
    tree.depth = depth
    tree.split_on = best[0]
    tree.split_value = best[1]

    # Partition examples on the split attribute/value.
    left_examples = examples[examples.iloc[:, tree.split_on] <= tree.split_value]
    right_examples = examples[examples.iloc[:, tree.split_on] > tree.split_value]

    # Majority label per side (first mode on ties). Fall back to the
    # parent's default when a side is empty: Series.mode() on an empty
    # Series returns an empty Series, so [0] would raise.
    left_label = (left_examples.iloc[:, 0].mode()[0]
                  if len(left_examples) else default)
    right_label = (right_examples.iloc[:, 0].mode()[0]
                   if len(right_examples) else default)

    # Pass the entire attribute set down: features are real-valued.
    tree.left_child = grow_decision_tree(left_examples, attributes,
                                         left_label, depth + 1)
    tree.right_child = grow_decision_tree(right_examples, attributes,
                                          right_label, depth + 1)

    print("finishing a node at depth", depth)
    return tree