def on_exp(self, target, option, names, values):
    """
    exp : NUMBER
        | IDENTIFIER
        | LPAREN exp RPAREN
        | LBRACKET exp RBRACKET
        | LCBRACKET exp RCBRACKET
        | unary
        | binary
        | nary
    """
    # NOTE(review): the docstring above is the grammar production handled
    # here and looks like it is read by the parser generator -- confirm before
    # rewording it. ``option`` is the index of the matched alternative and
    # ``values`` holds the values of the matched symbols, in order.
    if option == 0:  # rule: NUMBER
        # TODO: A bit hacky, this achieves long integers and floats.
        text = values[0]
        if '.' in text:
            return Leaf(float(text))
        return Leaf(int(text))
    elif option == 1:  # rule: IDENTIFIER
        return Leaf(values[0])
    elif 2 <= option <= 4:
        # rule: LPAREN exp RPAREN | LBRACKET exp RBRACKET
        #       | LCBRACKET exp RCBRACKET
        # Flag the inner expression as parenthesised only when its
        # precedence value is greater than TIMES_PRED.
        inner = values[1]
        inner.parens = pred(inner) > TIMES_PRED
        return inner
    elif 5 <= option <= 7:  # rule: unary | binary | nary
        # The sub-rule handler already produced the node; pass it through.
        return values[0]

    raise BisonSyntaxError('Unsupported option %d in target "%s".'
                           % (option, target))  # pragma: nocover
def __build_tree(self, X, y, attribute_list):
    """Recursively build the decision (sub)tree for the samples ``X, y``.

    Recursion stops with a ``Leaf`` when only one class is left, when no
    attribute is left to split on, or when the most frequent class reaches
    ``self.stop_threshold`` as a fraction of ``y``.

    :param X: samples of the current branch.
    :param y: labels of ``X``; must provide ``value_counts()`` (a pandas
        Series).
    :param attribute_list: attributes still available for splitting. It is
        not modified; each recursion level works on its own copy.
    :return: a ``Leaf`` or the ``Node`` that roots the built subtree.
    """
    # Only one class left
    left_classes = set(y)
    if len(left_classes) == 1:
        return Leaf(next(iter(left_classes)))

    # No attribute left to split on, Get most probable class
    if not attribute_list:
        return Leaf(tools.get_majority_vote(y))

    # Early stop when the dominant class is frequent enough.
    # value_counts() sorts by descending frequency, so the dominant count is
    # the first entry *by position*. Indexing with ``[0]`` is a label lookup
    # on the class labels, which raised KeyError or picked the wrong class.
    class_counts = y.value_counts()
    if len(class_counts) > 0:
        if class_counts.iloc[0] / len(y) >= self.stop_threshold:
            return Leaf(tools.get_majority_vote(y))

    # Normal condition
    base_class = super(DecisionTreeContinuous, self)
    best_attribute, is_continuous = base_class.select_attribute(
        X, y, attribute_list, metric='naive')

    # Create node
    root = Node()

    # Update the attribute list on a copy: removing from the shared list
    # would also hide attributes from sibling branches and from the caller.
    remaining_attributes = list(attribute_list)
    remaining_attributes.remove(best_attribute)

    if is_continuous:
        # Continuous data -> Train an SVM classifier
        clf, X_list, y_list, class_list = self.__split_continuous_data(X, y)
        # NOTE: each class becomes a leaf directly; the per-class data and
        # labels are not recursed on in this implementation.
        for _data, _labels, c in zip(X_list, y_list, class_list):
            root.add_son(
                Question(is_continuous, best_attribute, c, clf), Leaf(c))
    else:
        # Categorical data
        # get the branchs and their attribute_value
        data_list, val_list = base_class.split_categorical_data(
            X, best_attribute)
        branch_data = base_class.get_clean_branchs(data_list, val_list, y)

        # Add branchs to the node
        for data, labels, attribute_val_list in branch_data:
            root.add_son(
                Question(is_continuous, best_attribute, attribute_val_list),
                self.__build_tree(data, labels, remaining_attributes))

    # Return node
    return root
def build():
    """Generate random records, round-trip them through data.bin, rebuild the tree.

    Reads the record count from ``n`` and the upper key bound from ``m``
    (module-level objects exposing ``get()``), samples that many distinct
    keys from ``0..q``, writes each key with a random 12-byte string to
    ``data.bin``, reads the file back, inserts every record into the
    module-level ``tree`` and shows the printed tree in ``z``.

    Any failure is printed with its traceback and shown in a message box.
    """
    try:
        p = int(n.get())
        q = int(m.get())
        resultList = random.sample(range(0, q + 1), p)

        # One record = 4-byte int key followed by a 12-byte string.
        with open('data.bin', 'wb') as w:
            for key in resultList:
                w.write(struct.pack('i', key))
                w.write(struct.pack(
                    '12s', generate_random_str().encode('utf-8')))

        records = []
        with open('data.bin', 'rb') as w:
            for _ in range(p):
                key = struct.unpack('i', w.read(4))[0]
                payload = struct.unpack('12s', w.read(12))[0]
                # str() of a bytes object is its repr (b'...'); the slice
                # drops the leading "b'" and the trailing quote.
                records.append(str(key) + ' ' + str(payload)[2:-1])

        # NOTE(review): the fourth Leaf argument is hard-coded to 4 here --
        # confirm whether it should follow the tree's own setting.
        tree.root = Leaf(None, None, None, 4)
        for record in records:
            fields = record.split(' ')
            tree.insert(int(fields[0]), fields[1])

        # Clearing the output widget is best-effort. Only ordinary errors are
        # ignored, so SystemExit/KeyboardInterrupt are no longer swallowed.
        try:
            z.delete(1.0, "end")
        except Exception:
            pass

        tree.r = ''
        tree.print_tree(tree.root, ' ', 0)
        z.insert("insert", tree.r)
        # Title: "Friendly reminder"; message: "Created successfully".
        tk.messagebox.showinfo(title='温馨提醒', message='创建成功')
    except Exception as e:
        traceback.print_exc()
        tk.messagebox.showinfo(title='温馨提醒', message=e)
def __id3(self, dataset: Dataset, parent_dataset: Dataset, depth: int) -> Union[Node, Leaf]:
    """Performs the ID3 machine learning algorithm and returns the constructed decision tree.

    ID3 algorithm constructs the decision tree in which each node corresponds to some feature, and branches
    leading to the child nodes correspond to distinct values of the feature in the provided dataset. When
    constructing a new node, the dataset is grouped by its most discriminatory feature, which provides most
    information gain on the subsequent split. Leaf nodes of the tree correspond to class labels.

    If there is no depth limit, the ID3 algorithm is guaranteed to construct a tree which perfectly
    classifies all examples from the given dataset, provided there are no examples with equal features but
    different labels.

    :param dataset: dataset for which the subtree of the decision tree is constructed.
    :param parent_dataset: dataset of the parent node in the decision tree. If the current execution of the
        method is the first one, parent dataset should be equal to the training dataset (root node has no
        parent)
    :param depth: depth of the node to be constructed. If called on the whole training dataset, depth
        should be 0
    :return: instance of Node corresponding to the root of the decision tree for the given dataset.
        Alternatively, returns a Leaf node with predicted class label if the depth limit is reached, or the
        dataset is empty, or all examples in the dataset are classified with the same value, or there are
        no features left.
    """
    # An empty dataset has no labels of its own, so it is resolved first --
    # also when the depth limit is reached -- with the parent's most
    # frequent label.
    if len(dataset) == 0:
        return Leaf(parent_dataset.most_frequent_label)

    depth_limit_reached = self.__max_depth is not None and depth >= self.__max_depth

    # depth limit reached, all examples have the same label, or there are no
    # features left in the dataset
    if depth_limit_reached or len(dataset.label_space) == 1 or len(dataset.feature_names) == 0:
        return Leaf(dataset.most_frequent_label)

    mdf: str = dataset.most_discriminatory_feature
    sub_datasets: dict[str, Dataset] = dataset.group_by_feature(mdf)
    node: Node = Node(mdf, dataset.most_frequent_label)

    # for each child, create the corresponding decision subtree; the current
    # dataset becomes the child's parent dataset
    for feature_value, sub_dataset in sub_datasets.items():
        child_node: Union[Node, Leaf] = self.__id3(sub_dataset, dataset, depth + 1)
        node.add_child(feature_value, child_node)

    return node
def computeFrequencies(self, s):
    """Return one ``(frequency, index, Leaf(symbol))`` tuple per distinct symbol of ``s``.

    Symbols appear in the order ``Counter(s)`` yields them; ``index`` is the
    position of the tuple in the returned list.
    """
    # The original author noted a problem when comparing a leaf with a node;
    # the running index presumably keeps tuple comparison from ever reaching
    # the Leaf object when two frequencies are equal.
    return [
        (count, position, Leaf(symbol))
        for position, (symbol, count) in enumerate(Counter(s).items())
    ]
def match_to_node(matcher: BaseMatcher) -> BaseNode:
    """
    Builds the child node for the rows selected by a matcher.

    ``X``, ``y``, ``attribute``, ``child_attributes`` and ``popular_class`` are not parameters; they are read
    from the surrounding scope.

    :param matcher: Matcher applied to the column ``X[attribute]``
    :return: Subtree built from the matching rows, or a leaf predicting ``popular_class`` when nothing matches
    """
    selected = matcher.is_match(X[attribute])
    subset_X, subset_y = X[selected], y[selected]

    if subset_X.size != 0:
        return DecisionTree.__build_tree(subset_X, subset_y, child_attributes, matcher)

    # Nothing matched: fall back to the popular class
    return Leaf(matcher, popular_class)
def on_integral_bounds(self, target, option, names, values):
    """
    integral_bounds : INTEGRAL SUB exp
    """
    # NOTE(review): the docstring above is the grammar production handled
    # here and looks like it is read by the parser generator -- confirm before
    # rewording it.
    if option != 0:
        # Only one alternative exists; anything else yields None.
        return None

    # rule: INTEGRAL SUB exp
    bounds = values[2]

    if bounds.is_op(OP_POW):
        # A power node carries both bounds; its own negation count is moved
        # onto the lower bound.
        lbnd, ubnd = bounds
        lbnd.negated += bounds.negated
    else:
        # Only a lower bound was given: the upper bound is INFINITY.
        lbnd, ubnd = bounds, Leaf(INFINITY)

    apply_operator_negation(values[1], lbnd)

    return lbnd, ubnd
def reduced_error_pruning(current_node, X, y):
    """Prune the subtree rooted at ``current_node`` with reduced-error pruning.

    X, y are validation sets.

    The subtree is replaced by a majority-vote ``Leaf`` when that leaf is at
    least as accurate on ``X, y`` as the subtree; otherwise every son is
    pruned recursively. The input tree is left untouched: a pruned deep copy
    is returned.

    :param current_node: root of the subtree to prune.
    :param X: validation samples.
    :param y: validation labels.
    :return: ``current_node`` itself (leaf or empty ``y``), a new ``Leaf``,
        or a pruned copy of ``current_node``.
    """
    if isinstance(current_node, Leaf) or len(y) == 0:
        return current_node

    current_accuracy = current_node.get_accuracy(X, y)

    # Candidate replacement: a single leaf predicting the majority label.
    candidate_leaf = Leaf(tools.get_majority_vote(y))
    if candidate_leaf.get_accuracy(X, y) >= current_accuracy:
        return candidate_leaf

    # Keep the subtree, but work on a copy so the caller's tree is not
    # modified while the sons are rebuilt.
    pruned_node = copy.deepcopy(current_node)

    # Son management
    node_sons = pruned_node.sons
    pruned_node.sons = []
    for question, son in node_sons:
        # A leaf son comes back unchanged from the recursive call, so every
        # son is added exactly once (leaf sons used to be added twice).
        # NOTE(review): each son is evaluated on the full validation set, not
        # only on the rows that reach it -- confirm this is intended.
        pruned_node.add_son(question, reduced_error_pruning(son, X, y))

    return pruned_node
def __init__(self, max_p):
    """Create an empty tree.

    :param max_p: stored as ``self.max_p`` and passed as the fourth
        argument of the root ``Leaf``.
    """
    self.max_p = max_p
    # Text buffer, initially empty (presumably filled when the tree is
    # printed -- confirm against print_tree).
    self.r = ''
    # The tree starts as a single leaf whose first three arguments are None.
    self.root = Leaf(None, None, None, self.max_p)
def __build_tree(self, X, y, attribute_list):
    """Recursively build the decision (sub)tree for the samples ``X, y``.

    Recursion stops with a ``Leaf`` when only one class is left, when no
    attribute is left to split on, or when the most frequent class reaches
    ``self.stop_threshold`` as a fraction of ``y``. Otherwise the best
    attribute is selected and the node is split: two branches (``<=`` and
    ``>`` the candidate split) for a continuous attribute, one branch per
    group of values for a categorical attribute.

    :param X: samples of the current branch.
    :param y: labels of ``X``; must provide ``value_counts()`` and ``loc``
        (a pandas Series).
    :param attribute_list: attributes still available for splitting. It is
        not modified; each recursion level works on its own copy.
    :return: a ``Leaf`` or the ``Node`` that roots the built subtree.
    """
    # Only one class left
    left_classes = set(y)
    if len(left_classes) == 1:
        return Leaf(next(iter(left_classes)))

    # No attribute left to split on, Get most probable class
    if not attribute_list:
        return Leaf(tools.get_majority_vote(y))

    # Early stop when the dominant class is frequent enough.
    # value_counts() sorts by descending frequency, so the dominant count is
    # the first entry *by position*. Indexing with ``[0]`` is a label lookup
    # on the class labels, which raised KeyError or picked the wrong class.
    class_counts = y.value_counts()
    if len(class_counts) > 0:
        if class_counts.iloc[0] / len(y) >= self.stop_threshold:
            return Leaf(tools.get_majority_vote(y))

    # Normal condition
    base_class = super(DecisionTreeClassic, self)
    best_attribute, candidate_split = base_class.select_attribute(
        X, y, attribute_list, metric='naive')

    # Create node
    root = Node()

    # Update the attribute list on a copy: removing from the shared list
    # would also hide attributes from sibling branches and from the caller.
    remaining_attributes = list(attribute_list)
    remaining_attributes.remove(best_attribute)

    if candidate_split is not None:
        # Continuous data
        # Get less_or_equal and greater_than branchs
        le_branch, gt_branch = self.__split_continuous_data(
            X, best_attribute, candidate_split)

        # Get branchs labels
        y_le = y.loc[le_branch.index]
        y_gt = y.loc[gt_branch.index]

        # Add branchs
        # Less or equal branch
        root.add_son(
            QuestionClassic(best_attribute, [candidate_split], operator.le),
            self.__build_tree(le_branch, y_le, remaining_attributes))
        # Greater than branch
        root.add_son(
            QuestionClassic(best_attribute, [candidate_split], operator.gt),
            self.__build_tree(gt_branch, y_gt, remaining_attributes))
    else:
        # Categorical data
        # get the branchs and their attribute_value
        data_list, val_list = self.split_categorical_data(X, best_attribute)
        branch_data = self.get_clean_branchs(data_list, val_list, y)

        # Add branchs to the node
        for branch, labels, attribute_val_list in branch_data:
            root.add_son(
                QuestionClassic(best_attribute, attribute_val_list,
                                operator.eq),
                self.__build_tree(branch, labels, remaining_attributes))

    # Return node
    return root
def __build_tree(X: DataFrame, y: Series, attributes: List[str], matcher: BaseMatcher = None) -> BaseNode:
    """
    Recursively builds a tree by greedily creating nodes based on the attribute with the highest information gain.

    Columns whose dtype is ``float64`` are split in two around the midpoint threshold with the highest gain;
    every other column gets one child per distinct value.

    :param X: The remaining values (at least one row)
    :param y: Classes corresponding to X
    :param attributes: The remaining attributes to choose from
    :param matcher: Matcher that checks the parent nodes attribute
    :return: Root of subtree that classifies values based on the remaining attributes
    """
    assert X.shape[0] == y.shape[0] and X.shape[0] > 0

    # Check for leaf: Only one class remains
    class_groups: GroupBy = y.groupby(y)
    classes = list(class_groups.groups.keys())
    if len(class_groups.groups) == 1:
        return Leaf(matcher, classes[0])

    # Check for no attributes left
    # (sorted(...)[-1] is kept on purpose: it fixes which class wins a tie)
    popular_class = sorted(class_groups, key=lambda key_value: key_value[1].size)[-1][0]
    if not any(True for _ in attributes):
        return Leaf(matcher, popular_class)

    # Number of samples at this node. The subset weights in gain() must be
    # relative to it; X.size would be rows * columns.
    sample_count = y.size

    def entropy(subset: Series) -> float:
        """ Measures the uncertainty in subset """
        if subset.size == 0:
            # An empty subset carries no uncertainty (and would divide by zero)
            return 0.0
        probs = (sum(1 for entry in subset if entry == label) / subset.size for label in classes)
        return -sum(prob * log2(prob) if prob > 0 else 0 for prob in probs)

    def gain(subsets: Iterator[Series]):
        """ Measures the information gain of splitting into subsets """
        return entropy(y) - sum(subset.size / sample_count * entropy(subset) for subset in subsets)

    def discrete_gain(attr: str) -> float:
        """ Measures the information gain of splitting into all values at an attribute """
        return gain(y[X[attr] == val] for val in set(X[attr]))

    def continuous_gain(attr: str, threshold: float) -> float:
        """ Measure the information gain of splitting along a threshold at an attribute """
        return gain([y[X[attr] <= threshold], y[X[attr] > threshold]])

    def candidate_thresholds(attr: str) -> List[float]:
        """ Midpoints between consecutive distinct values; empty when the column is constant """
        return [0.5 * (low + high) for low, high in pairwise(sorted(set(X[attr])))]

    def attribute_gain(attr: str) -> float:
        """ Measures the information gain based on the type of attribute """
        if X.dtypes[attr] == float64:
            # Find highest gain achievable with any threshold; a constant
            # column cannot be split and therefore gains nothing
            return max((continuous_gain(attr, threshold) for threshold in candidate_thresholds(attr)),
                       default=0.0)

        # Find highest gain achievable by splitting at all values
        return discrete_gain(attr)

    # Find highest information gain attribute
    attributes = sorted(attributes, key=attribute_gain)
    attribute = attributes[-1]
    child_attributes = attributes[:-1]

    def generate_matchers() -> Iterator[BaseMatcher]:
        """ Generates matchers for the chosen attribute """
        if X.dtypes[attribute] == float64:
            thresholds = candidate_thresholds(attribute)
            if thresholds:
                # Find threshold with highest information gain
                threshold = sorted(thresholds, key=lambda candidate: continuous_gain(attribute, candidate))[-1]
            else:
                # Constant column: use its only value, so one matcher selects
                # every row and the other none (which becomes a leaf).
                # NOTE(review): assumes the two matchers partition the rows
                # around the threshold -- confirm against ContinuousMatcher.
                threshold = X[attribute].iloc[0]

            # Generate matchers for <= and > than threshold
            for threshold_direction in [False, True]:
                yield ContinuousMatcher(threshold, threshold_direction)
        else:
            # Create matchers for all values in discrete case
            for child_value in set(X[attribute]):
                yield DiscreteMatcher(child_value)

    def match_to_node(matcher: BaseMatcher) -> BaseNode:
        """ Converts a matcher into a node """
        child_mask = matcher.is_match(X[attribute])
        child_X = X[child_mask]
        child_y = y[child_mask]
        if child_X.size == 0:
            return Leaf(matcher, popular_class)
        else:
            return DecisionTree.__build_tree(child_X, child_y, child_attributes, matcher)

    return Node(matcher, attribute, list(map(match_to_node, generate_matchers())), popular_class)
def __init__(self, name, description):
    """Initialise the instance through ``Leaf.__init__``.

    :param name: forwarded unchanged to ``Leaf.__init__``.
    :param description: forwarded unchanged to ``Leaf.__init__``.
    """
    # Explicit base-class call; no additional state is set here.
    Leaf.__init__(self, name, description)
def on_binary(self, target, option, names, values):
    """
    binary : exp TIMES exp
           | exp PLUS exp
           | exp EQ exp
           | exp AND exp
           | exp OR exp
           | exp DIVIDE exp
           | exp MINUS exp
           | exp POW exp
           | exp SUB exp
    """
    # NOTE(review): the docstring above is the grammar production handled
    # here and looks like it is read by the parser generator -- confirm before
    # rewording it. ``option`` is the index of the matched alternative and
    # ``values`` holds the values of the three matched symbols.
    if option == 0:  # rule: exp TIMES exp
        lhs = values[0]
        product = Node(values[1], lhs, values[2])

        # A negation on an unparenthesised left operand is moved onto the
        # product itself.
        if lhs.negated and not lhs.parens:
            product.negated += lhs.negated
            lhs.negated = 0

        return product
    elif 1 <= option <= 4:  # rule: exp {PLUS,EQ,AND,OR} exp
        return Node(values[1], values[0], values[2])
    elif option == 5:  # rule: exp DIVIDE exp
        numerator, denominator = values[0], values[2]

        # Same negation lifting as for TIMES, applied to the result node.
        lifted = 0
        if numerator.negated and not numerator.parens:
            lifted, numerator.negated = numerator.negated, 0

        if numerator.is_op(OP_MUL) and denominator.is_op(OP_MUL):
            d_top, fx = numerator
            d_bottom, x = denominator
            is_derivative = (d_top.is_identifier('d')
                             and d_bottom.is_identifier('d')
                             and x.is_identifier())

            if is_derivative:
                # (d (fx)) / (dx)
                return Node(OP_DXDER, fx, x, negated=lifted)

        return Node(OP_DIV, numerator, denominator, negated=lifted)
    elif option == 6:  # rule: exp MINUS exp
        # Subtraction becomes addition of the negated right operand.
        subtrahend = values[2]
        subtrahend.negated += 1

        # Explicit call the hook handler on the created unary negation.
        self.hook_handler('unary', 0, names, values, subtrahend)

        return Node(OP_ADD, values[0], subtrahend)
    elif option == 7:  # rule: exp POW exp
        exponent = values[2]
        apply_operator_negation(values[1], exponent)

        return Node(OP_POW, values[0], exponent)
    elif option == 8:  # rule: exp SUB exp
        bounds = values[2]

        if bounds.is_op(OP_POW):
            # A power node carries both bounds; its negation count is moved
            # onto the lower bound.
            lbnd, ubnd = bounds
            lbnd.negated += bounds.negated
        else:
            # Only a lower bound was given: the upper bound is INFINITY.
            lbnd, ubnd = bounds, Leaf(INFINITY)

        # Every character of the operator token beyond the first adds one
        # negation to the lower bound.
        lbnd.negated += len(values[1]) - 1

        return Node(OP_INT_DEF, values[0], lbnd, ubnd)

    raise BisonSyntaxError('Unsupported option %d in target "%s".'
                           % (option, target))  # pragma: nocover
def on_unary(self, target, option, names, values):
    """
    unary : MINUS exp %prec NEG
          | FUNCTION exp
          | FUNCTION_PAREN exp RPAREN
          | raised_function exp %prec FUNCTION
          | DERIVATIVE exp
          | exp PRIME
          | INTEGRAL exp
          | integral_bounds exp %prec INTEGRAL
          | PIPE exp PIPE
          | LOGARITHM exp
          | logarithm_subscript exp %prec LOGARITHM
          | TIMES exp
    """
    # NOTE(review): the docstring above is the grammar production handled
    # here and looks like it is read by the parser generator -- confirm before
    # rewording it. ``option`` is the index of the matched alternative and
    # ``values`` holds the values of the matched symbols, in order.
    if option == 0:  # rule: MINUS exp
        negated_exp = values[1]
        negated_exp.negated += 1

        return negated_exp
    elif option in (1, 2):  # rule: FUNCTION exp | FUNCTION_PAREN exp RPAREN
        # The FUNCTION_PAREN token ends with one extra character, which is
        # dropped together with any whitespace before it.
        if option == 1:
            fun = values[0]
        else:
            fun = values[0][:-1].rstrip()

        # A comma node is unpacked into separate function arguments.
        operand = values[1]
        arguments = tuple(operand) if operand.is_op(OP_COMMA) else (operand,)

        return Node(fun, *arguments)
    elif option == 3:  # rule: raised_function exp
        func, exponent = values[0]

        operand = values[1]
        arguments = tuple(operand) if operand.is_op(OP_COMMA) else (operand,)

        return Node(OP_POW, Node(func, *arguments), exponent)
    elif option == 4:  # rule: DERIVATIVE exp
        # DERIVATIVE looks like 'd/d*x' -> extract the 'x'
        variable = Leaf(values[0][-1])

        return Node(OP_DXDER, values[1], variable)
    elif option == 5:  # rule: exp PRIME
        return Node(OP_PRIME, values[0])
    elif option == 6:  # rule: INTEGRAL exp
        fx, x = find_integration_variable(values[1])

        return Node(OP_INT, fx, x)
    elif option == 7:  # rule: integral_bounds exp
        lbnd, ubnd = values[0]
        fx, x = find_integration_variable(values[1])

        return Node(OP_INT, fx, x, lbnd, ubnd)
    elif option == 8:  # rule: PIPE exp PIPE
        return Node(OP_ABS, values[1])
    elif option == 9:  # rule: LOGARITHM exp
        operand = values[1]

        if operand.is_op(OP_COMMA):
            # The comma node's children become the logarithm's operands.
            return Node(OP_LOG, *operand)

        base = E if values[0] == 'ln' else DEFAULT_LOGARITHM_BASE

        return Node(OP_LOG, operand, Leaf(base))
    elif option == 10:  # rule: logarithm_subscript exp
        if values[1].is_op(OP_COMMA):
            raise BisonSyntaxError('Shortcut logarithm base "log_%s" does '
                                   'not support additional arguments.'
                                   % (values[0]))

        return Node(OP_LOG, values[1], values[0])
    elif option == 11:  # rule: TIMES exp
        # The leading operator is discarded; the expression passes through.
        return values[1]

    raise BisonSyntaxError('Unsupported option %d in target "%s".'
                           % (option, target))  # pragma: nocover