def _put(self, key, val, currentNode):
    # Walk down the tree recursively: keys smaller than the current node go
    # left, everything else goes right, until an empty child slot is found.
    if key < currentNode.key:
        if currentNode.has_left_child():
            self._put(key, val, currentNode.left_child)
        else:
            currentNode.left_child = tree_node.TreeNode(key, val, parent=currentNode)
    else:
        if currentNode.has_right_child():
            self._put(key, val, currentNode.right_child)
        else:
            currentNode.right_child = tree_node.TreeNode(
                key, val, parent=currentNode)
def train_tree(self):
    # first normalize weight of training examples before you feed them to root
    self.normalize_weight()
    self.root = tree_node.TreeNode(self.training_examples, 0, self.maxdepth)
    # Create children recursively
    self.root.create_children()
    return
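# train_tree above relies on its enclosing class for normalize_weight(),
# training_examples and maxdepth, and on tree_node.TreeNode(examples, depth,
# maxdepth) growing itself via create_children(). The skeleton below is only a
# hypothetical sketch of that interface for illustration; attribute names such
# as `weight` are assumptions, not part of the snippet.
class _SketchTreeTrainer:
    def __init__(self, training_examples, maxdepth):
        self.training_examples = training_examples  # examples carrying weights
        self.maxdepth = maxdepth                    # depth limit handed to the root node
        self.root = None                            # filled in by train_tree

    def normalize_weight(self):
        # Rescale example weights so they sum to 1 (a common convention; the
        # real implementation is not shown above).
        total = sum(ex.weight for ex in self.training_examples)
        for ex in self.training_examples:
            ex.weight /= total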
def update_tree(window, in_tree, genome_num):
    if window[0] in in_tree.children:  # window[0] is already a child of in_tree
        if in_tree.children[window[0]].last_update != genome_num:
            in_tree.children[window[0]].inc(1)  # increase count by one
            in_tree.children[window[0]].last_update = genome_num
    else:  # add window[0] to in_tree.children
        in_tree.children[window[0]] = tree_node.TreeNode(window[0], 1, in_tree)
        in_tree.children[window[0]].last_update = genome_num
    if len(window) > 1:  # recurse with the remaining ordered items
        update_tree(window[1:], in_tree.children[window[0]], genome_num)
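# update_tree expects a node type with a (name, count, parent) constructor, a
# `children` dict, a `last_update` marker and an `inc()` method. The class
# below is a minimal sketch of that interface for illustration only; it is not
# the project's actual tree_node.TreeNode.
class _SketchFPTreeNode:
    def __init__(self, name, count, parent):
        self.name = name          # gene, or 'Null Set' for the root
        self.count = count        # number of genomes supporting this path
        self.parent = parent      # link back toward the root
        self.children = {}        # {gene: _SketchFPTreeNode}
        self.last_update = None   # genome id that last touched this node

    def inc(self, amount):
        # Increase the support count of this node.
        self.count += amount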
def create_fake_tree_node(contingency_table):
    """Creates a fake TreeNode consistent with the given contingency table."""
    num_classes = contingency_table.shape[1]
    fake_dataset = dataset.Dataset(None, None, None, None, None, load_dataset=False)
    fake_dataset.num_classes = num_classes
    fake_dataset.num_samples = np.sum(contingency_table)
    fake_tree_node = tree_node.TreeNode(fake_dataset,
                                        [True],
                                        calculate_contingency_tables=False)
    num_samples_per_value = np.sum(contingency_table, axis=1)
    fake_tree_node.contingency_tables = [
        tree_node.ContingencyTable(contingency_table, num_samples_per_value)
    ]
    return fake_tree_node
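# Hedged usage sketch for create_fake_tree_node. It still needs the project's
# `dataset` and `tree_node` modules plus numpy; only the shape convention
# (rows = attribute values, columns = classes) is taken from the code above,
# and the numbers are made up.
import numpy as np

# 3 attribute values x 2 classes, 60 samples in total.
table = np.array([[10, 5],
                  [20, 5],
                  [15, 5]])
node = create_fake_tree_node(table)
# node.contingency_tables now holds a single tree_node.ContingencyTable built
# from `table` and its row sums [15, 25, 20].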
def create_tree(data_set, length, min_sup=1):
    header_table = {}  # {gene: no. of genomes it appears in}
    # This pass counts frequency of occurrence.
    for genome in data_set:  # genome id
        local_set = set()
        for window in data_set[genome]:
            for gene in window:
                local_set.add(gene)
        for gene in local_set:
            header_table[gene] = header_table.get(gene, 0) + 1
    # Now header_table is {gene: no. of genomes it appears in}.
    for gene in list(header_table):  # remove genes not meeting min_sup
        if header_table[gene] < min_sup:
            del header_table[gene]
    freq_gene_set = set(header_table.keys())
    if len(freq_gene_set) == 0:
        return None  # no genes meet min support --> get out
    ret_tree = tree_node.TreeNode('Null Set', 1, None)  # create the tree root
    for genome, windows in data_set.items():
        for window in windows:
            filtered_window_d = {}  # {gene: no. of genomes it's in}
            for gene in window:
                if gene in freq_gene_set:
                    filtered_window_d[gene] = header_table[gene]
            # Using a dict makes sure each gene appears only once in the ordered items.
            if len(filtered_window_d) > length:
                # This window has more than `length` frequent genes.
                ordered_path = [v[0] for v in sorted(filtered_window_d.items(),
                                                     key=lambda p: p[1],
                                                     reverse=True)]
                # ordered_path is the window ordered by descending frequency of
                # each gene across the genomes.
                update_tree(ordered_path, ret_tree, genome)  # populate tree with ordered_path
    return ret_tree
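# Hedged usage sketch for create_tree. The input format is inferred from the
# loops above: data_set maps a genome id to its windows, and each window is an
# iterable of gene identifiers. The genome ids and gene names are made up.
toy_data_set = {
    'genome_1': [['geneA', 'geneB', 'geneC'], ['geneB', 'geneC']],
    'genome_2': [['geneA', 'geneB'], ['geneB', 'geneC', 'geneD']],
    'genome_3': [['geneA', 'geneC', 'geneD']],
}

# Keep windows with more than one frequent gene, and require a gene to appear
# in at least two genomes to count as frequent.
fp_tree = create_tree(toy_data_set, length=1, min_sup=2)
if fp_tree is not None:
    # The root ('Null Set') fans out into the most frequent genes first.
    print(sorted(fp_tree.children.keys()))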
def put(self, key, val):
    # Public insert: recurse from the root if the tree is non-empty,
    # otherwise the new node becomes the root.
    if self.root:
        self._put(key, val, self.root)
    else:
        self.root = tree_node.TreeNode(key, val)
    self.size += 1
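# put()/_put() above assume a node type with key/val payloads, left_child /
# right_child / parent links, and has_left_child() / has_right_child()
# helpers. The class below is a minimal sketch of that interface for
# illustration; the project's real tree_node.TreeNode may differ.
class _SketchBSTNode:
    def __init__(self, key, val, parent=None):
        self.key = key
        self.val = val
        self.parent = parent
        self.left_child = None
        self.right_child = None

    def has_left_child(self):
        return self.left_child is not None

    def has_right_child(self):
        return self.right_child is not None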
def grow_decision_tree(examples, attributes, default, depth):
    # For this case, we have numeric (real-valued) features and a categorical label.
    print("starting to process a node at depth", depth)

    # Check stopping conditions - if one holds, return a leaf node.
    # No examples left, or fewer than the minimum:
    if len(examples) <= MIN_EXAMPLES:
        leaf = tree_node.TreeNode()
        leaf.depth = depth
        leaf.label = default
        print("returning a leaf at depth", depth)
        return leaf

    # All examples have the same label:
    if examples.iloc[:, 0].nunique() == 1:
        # return the label of the first one, since they're all the same
        leaf = tree_node.TreeNode()
        leaf.depth = depth
        leaf.label = examples.iloc[0, 0]
        print("returning a leaf at depth", depth)
        return leaf

    # No attributes left (not relevant now, may be implemented later).

    # Maximum depth reached:
    if depth >= MAX_DEPTH:
        leaf = tree_node.TreeNode()
        leaf.depth = depth
        leaf.label = default
        print("returning a leaf at depth", depth)
        return leaf

    # Else, grow recursively.
    best = choose_best_attr(attributes, examples)

    # Check for None - if present, no acceptable split was found, so make this a leaf.
    if best[0] is None or best[1] is None:
        leaf = tree_node.TreeNode()
        leaf.depth = depth
        leaf.label = default
        print("returning a leaf at depth", depth)
        return leaf

    # Otherwise there is an acceptable split.
    tree = tree_node.TreeNode()
    tree.depth = depth
    tree.split_on = best[0]
    tree.split_value = best[1]

    # Split the examples on the chosen attribute and threshold.
    left_examples = examples[examples.iloc[:, tree.split_on] <= tree.split_value]
    right_examples = examples[examples.iloc[:, tree.split_on] > tree.split_value]

    # If there are multiple modes, arbitrarily choose the first one.
    left_label = left_examples.iloc[:, 0].mode()[0]
    right_label = right_examples.iloc[:, 0].mode()[0]

    # Pass the entire set of attributes because they are real-valued.
    tree.left_child = grow_decision_tree(left_examples, attributes, left_label, depth + 1)
    tree.right_child = grow_decision_tree(right_examples, attributes, right_label, depth + 1)

    print("finishing a node at depth", depth)
    return tree
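# Hedged usage sketch for grow_decision_tree. From the iloc calls above,
# column 0 of `examples` holds the categorical label and the remaining columns
# hold numeric features; MIN_EXAMPLES, MAX_DEPTH, choose_best_attr and
# tree_node.TreeNode must already be defined in this module. The data below is
# made up, and treating `attributes` as feature column indices is an
# assumption about choose_best_attr's expected input.
import pandas as pd

toy_examples = pd.DataFrame({
    'label': ['a', 'a', 'b', 'b'],
    'x1':    [0.1, 0.4, 2.3, 1.9],
    'x2':    [5.0, 4.2, 0.7, 1.1],
})
toy_attributes = [1, 2]                          # feature column indices
toy_default = toy_examples['label'].mode()[0]    # majority label as fallback
toy_root = grow_decision_tree(toy_examples, toy_attributes, toy_default, depth=0)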