Example 1
File: parser.py Project: smvv/trs
    def on_exp(self, target, option, names, values):
        """
        exp : NUMBER
            | IDENTIFIER
            | LPAREN exp RPAREN
            | LBRACKET exp RBRACKET
            | LCBRACKET exp RCBRACKET
            | unary
            | binary
            | nary
        """
        if option == 0:  # rule: NUMBER
            # TODO: A bit hacky, this achieves long integers and floats.
            value = float(values[0]) if '.' in values[0] else int(values[0])
            return Leaf(value)

        if option == 1:  # rule: IDENTIFIER
            return Leaf(values[0])

        if 2 <= option <= 4:  # rule: LPAREN exp RPAREN | LBRACKET exp RBRACKET
            #       | LCBRACKET exp RCBRACKET
            values[1].parens = pred(values[1]) > TIMES_PRED
            return values[1]

        if 5 <= option <= 7:  # rule: unary | binary | nary
            return values[0]

        raise BisonSyntaxError('Unsupported option %d in target "%s".' %
                               (option, target))  # pragma: nocover
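The NUMBER branch above decides between int and float with a simple substring test. A standalone sketch of that heuristic (the helper name parse_number is hypothetical, not part of the project):

def parse_number(token):
    # Mirrors the NUMBER rule: a '.' selects float parsing, anything else
    # becomes an int (Python ints are arbitrary-precision, hence "long").
    return float(token) if '.' in token else int(token)

assert parse_number('42') == 42
assert parse_number('3.14') == 3.14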
Example 2
 def __build_tree(self, X, y, attribute_list):
     # Only one class left
     left_classes = set(y)
     if len(left_classes) == 1:
         label = list(left_classes)[0]
         return Leaf(label)
     elif not attribute_list:
         # No attributes left to split on; get the most probable class
         label = tools.get_majority_vote(y)
         return Leaf(label)
     else:
         # This check is wrapped in a try block because of a pandas bug
         # affecting value_counts
         try:
             percent = y.value_counts()[0] / len(y)
             if percent >= self.stop_threshold:
                 label = tools.get_majority_vote(y)
                 return Leaf(label)
         except Exception as e:
             print(str(e))
         # Normal condition
         base_class = super(DecisionTreeContinuous, self)
         best_attribute, is_continuous = base_class.select_attribute(
             X, y, attribute_list, metric='naive')
         # Create node
         root = Node()
         # Update the attribute list
         attribute_list.remove(best_attribute)
         if is_continuous:
             # Continuous data -> Train an SVM classifier
             clf, X_list, y_list, class_list = self.__split_continuous_data(
                 X, y)
             for data, labels, c in zip(X_list, y_list, class_list):
                 root.add_son(
                     Question(is_continuous, best_attribute, c, clf),
                     Leaf(c))
                 #root.add_son(Question(is_continuous, best_attribute, c, clf),
                 #    self.__build_tree(data, labels, attribute_list))
         else:
             # Categorical data
             # get the branches and their attribute values
             data_list, val_list = base_class.split_categorical_data(
                 X, best_attribute)
             branch_data = base_class.get_clean_branchs(
                 data_list, val_list, y)
             # Add branches to the node
             for data, labels, attribute_val_list in branch_data:
                 root.add_son(
                     Question(is_continuous, best_attribute,
                              attribute_val_list),
                     self.__build_tree(data, labels, attribute_list))
         # Return node
         return root
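The tree code above only relies on Leaf holding a label and Node collecting (question, subtree) pairs via add_son. A minimal sketch of containers consistent with that usage (the real project classes may differ; these definitions are assumptions):

class Leaf:
    def __init__(self, label):
        self.label = label  # predicted class for this branch

class Node:
    def __init__(self):
        self.sons = []  # list of (question, subtree) pairs

    def add_son(self, question, subtree):
        self.sons.append((question, subtree))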
Example 3
def build():
    try:
        p = int(n.get())
        q = int(m.get())
        resultList = random.sample(range(0, q + 1), p)
        # Write p records: a 4-byte int followed by a 12-byte UTF-8 string.
        with open('data.bin', 'wb') as w:
            for i in range(p):
                w.write(struct.pack('i', resultList[i]))
                w.write(struct.pack('12s', generate_random_str().encode('utf-8')))

        l = []
        # Read the records back and format them as "<int> <string>" lines.
        with open('data.bin', 'rb') as f:
            for i in range(p):
                t = str(struct.unpack('i', f.read(4))[0])
                t = t + ' ' + str(struct.unpack('12s', f.read(12))[0])[2:-1]
                l.append(t)
        tree.root = Leaf(None, None, None, 4)
        for l1 in l:
            l1 = l1.split(' ')
            tree.insert(int(l1[0]), l1[1])
        try:
            z.delete(1.0, "end")  # clear the text widget if it has content
        except Exception:
            pass
        tree.r = ''
        tree.print_tree(tree.root, '   ', 0)
        z.insert("insert", tree.r)
        # Message strings: '温馨提醒' = "Friendly reminder", '创建成功' = "Created successfully"
        tk.messagebox.showinfo(title='温馨提醒', message='创建成功')
    except Exception as e:
        traceback.print_exc()
        tk.messagebox.showinfo(title='温馨提醒', message=str(e))
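Each record written above is a fixed 16 bytes: a 4-byte native int followed by a 12-byte string that struct.pack pads with NUL bytes. A self-contained round-trip check of that layout (assuming the usual 4-byte native int):

import struct

record = struct.pack('i', 7) + struct.pack('12s', b'hello')
assert len(record) == 16
num, = struct.unpack('i', record[:4])
raw, = struct.unpack('12s', record[4:])
assert num == 7 and raw.rstrip(b'\x00') == b'hello'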
Example 4
    def __id3(self, dataset: Dataset, parent_dataset: Dataset,
              depth: int) -> Union[Node, Leaf]:
        """Performs the ID3 machine learning algorithm and returns the constructed decision tree.

        ID3 algorithm constructs the decision tree in which each node corresponds to some feature, and
        branches leading to the child nodes correspond to distinct values of the feature in the provided
        dataset. When constructing a new node, the dataset is grouped by its most discriminatory feature,
        which provides most information gain on the subsequent split. Leaf nodes of the tree correspond
        to class labels. if there is no depth limit, the id3 algorithm is guaranteed to construct a tree
        which perfectly classifies all examples from the given dataset, provided there are no examples
        with equal features but different labels.

        :param dataset: dataset for which the subtree of the decision tree is constructed.
        :param parent_dataset: dataset of the parent node in the decision tree. If the current execution of the
        method is the first one, parent dataset should be equal to the training dataset (root node has no parent)
        :param depth: depth of the node to be constructed. If called on the whole training dataset, depth should be 0
        :return: instance of Node corresponding to the root of the decision tree for the given dataset. Alternatively,
                returns a Leaf node with predicted class label if the depth limit is reached, or the dataset is
                empty, or all examples in the dataset are classified with the same value.
        """

        if self.__max_depth is None or depth < self.__max_depth:  # depth limit not reached
            if len(dataset) == 0:  # empty dataset
                # predict the most frequent label of the parent
                return Leaf(parent_dataset.most_frequent_label)
            elif len(dataset.label_space) == 1 or len(dataset.feature_names) == 0:
                # all examples have the same label or there are no features left in the dataset
                return Leaf(dataset.most_frequent_label)
            else:
                mdf: str = dataset.most_discriminatory_feature
                sub_datasets: dict[str, Dataset] = dataset.group_by_feature(mdf)
                node: Node = Node(mdf, dataset.most_frequent_label)
                # for each child, create the corresponding decision subtree
                for feature_value, sub_dataset in sub_datasets.items():
                    feature_value: str
                    sub_dataset: Dataset

                    child_node: Union[Node, Leaf] = self.__id3(
                        sub_dataset, dataset, depth + 1)  # create the subtree
                    node.add_child(feature_value, child_node)  # add as a child
                return node
        else:  # depth limit reached
            return Leaf(dataset.most_frequent_label)
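Per the docstring, the first call passes the training set as its own parent with depth 0. A hedged sketch of a public entry point wrapping the private method (fit and the __tree attribute are assumptions, not taken from the project):

    def fit(self, training_set: Dataset) -> None:
        # The root has no parent, so the training set doubles as parent_dataset.
        self.__tree = self.__id3(training_set, training_set, 0)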
Example 5
 def computeFrequencies(self, s):
     h = []
     for ch, freq in Counter(s).items():
         # len(h) is a unique tie-breaker: with equal frequencies, tuple
         # comparison falls back to this index instead of comparing Leaf
         # objects, which do not define an ordering.
         h.append((freq, len(h), Leaf(ch)))
     return h
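The middle element is what keeps a later heapq on these tuples safe: with equal frequencies, tuple comparison falls through to the unique index and never reaches the Leaf objects. A runnable illustration, with plain characters standing in for Leaf instances:

import heapq
from collections import Counter

entries = [(freq, i, ch) for i, (ch, freq) in enumerate(Counter('abracadabra').items())]
heapq.heapify(entries)  # never compares the payloads: (freq, i) is already unique
assert entries[0][0] == 1  # a least-frequent character sits on top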
Example 6
 def match_to_node(matcher: BaseMatcher) -> BaseNode:
     """
     Converts a matcher into a node
     """
     child_mask = matcher.is_match(X[attribute])
     child_X = X[child_mask]
     child_y = y[child_mask]
     if child_X.size == 0:
         return Leaf(matcher, popular_class)
     else:
         return DecisionTree.__build_tree(child_X, child_y, child_attributes, matcher)
Example 7
File: parser.py Project: smvv/trs
    def on_integral_bounds(self, target, option, names, values):
        """
        integral_bounds : INTEGRAL SUB exp
        """
        if option == 0:  # rule: INTEGRAL SUB exp
            if values[2].is_op(OP_POW):
                lbnd, ubnd = values[2]
                lbnd.negated += values[2].negated
            else:
                lbnd = values[2]
                ubnd = Leaf(INFINITY)

            apply_operator_negation(values[1], lbnd)
            return lbnd, ubnd
Example 8
def reduced_error_pruning(current_node, X, y):
    """ X, y are validation sets """
    if isinstance(current_node, Leaf) or len(y) == 0:
        return current_node

    current_accuracy = current_node.get_accuracy(X, y)
    node_copy = copy.deepcopy(current_node)
    majority_label = tools.get_majority_vote(y)
    current_node = Leaf(majority_label)
    new_accuracy = current_node.get_accuracy(X, y)
    if new_accuracy >= current_accuracy:
        return current_node
    else:
        current_node = node_copy
        # Son management
        node_sons = copy.deepcopy(current_node.sons)
        current_node.sons = []
        for question, son in node_sons:
            # If the son is already a leaf, keep it; otherwise prune it recursively
            if isinstance(son, Leaf):
                current_node.add_son(question, son)
            else:
                current_node.add_son(question, reduced_error_pruning(son, X, y))
        return current_node
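Both the pruning above and the builders in Examples 2 and 9 lean on tools.get_majority_vote. A plausible one-liner for it (an assumption about the helper, not the project's actual code):

from collections import Counter

def get_majority_vote(y):
    # Most common label in y; Counter breaks ties by first occurrence.
    return Counter(y).most_common(1)[0][0]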
Example 9
 def __init__(self, max_p):
     self.max_p = max_p
     self.root = Leaf(None, None, None, self.max_p)
     self.r = ''
 def __build_tree(self, X, y, attribute_list):
     # Only one class left
     left_classes = set(y)
     if len(left_classes) == 1:
         label = list(left_classes)[0]
         return Leaf(label)
     elif not attribute_list:
         # No attributes left to split on; get the most probable class
         label = tools.get_majority_vote(y)
         return Leaf(label)
     else:
         # This check is wrapped in a try block because of a pandas bug
         # affecting value_counts
         try:
             percent = y.value_counts()[0] / len(y)
             if percent >= self.stop_threshold:
                 label = tools.get_majority_vote(y)
                 return Leaf(label)
         except Exception as e:
             print(str(e))
         # Normal condition
         base_class = super(DecisionTreeClassic, self)
         best_attribute, candidate_split = base_class.select_attribute(
             X, y, attribute_list, metric='naive')
         # Create node
         root = Node()
         # Update the attribute list
         attribute_list.remove(best_attribute)
         if candidate_split is not None:
             # Continuous data
             # Get less_or_equal and greater_than branches
             le_branch, gt_branch = self.__split_continuous_data(
                 X, best_attribute, candidate_split)
             # Get branch labels
             y_le = y.loc[le_branch.index]
             y_gt = y.loc[gt_branch.index]
             # Add branches
             # Less or equal branch
             root.add_son(
                 QuestionClassic(best_attribute, [candidate_split],
                                 operator.le),
                 self.__build_tree(le_branch, y_le, attribute_list))
             # Greater than branch
             root.add_son(
                 QuestionClassic(best_attribute, [candidate_split],
                                 operator.gt),
                 self.__build_tree(gt_branch, y_gt, attribute_list))
         else:
             # Categorical data
             # get the branches and their attribute values
             data_list, val_list = self.split_categorical_data(
                 X, best_attribute)
             branch_data = self.get_clean_branchs(data_list, val_list, y)
             # Add branches to the node
             for branch, labels, attribute_val_list in branch_data:
                 root.add_son(
                     QuestionClassic(best_attribute, attribute_val_list,
                                     operator.eq),
                     self.__build_tree(branch, labels, attribute_list))
         # Return node
         return root
Example 11
    def __build_tree(X: DataFrame, y: Series, attributes: List[str], matcher: BaseMatcher = None) -> BaseNode:
        """
        Recursively builds a tree by greedily creating nodes based on the attribute with the highest information gain.
        :param X: The remaining values
        :param y: Classes corresponding to X
        :param attributes: The remaining attributes to choose from
        :param matcher: Matcher that checks the parent nodes attribute
        :return: Root of subtree that classifies values based on the remaining attributes
        """
        assert X.shape[0] == y.shape[0] and X.shape[0] > 0

        # Check for leaf: Only one class remains
        class_groups: GroupBy = y.groupby(y)
        classes = list(class_groups.groups.keys())
        if len(class_groups.groups) == 1:
            return Leaf(matcher, classes[0])

        # Check for no attributes left
        popular_class = sorted(class_groups, key=lambda key_value: key_value[1].size)[-1][0]
        if not any(True for _ in attributes):
            return Leaf(matcher, popular_class)

        def entropy(subset: Series) -> float:
            """
            Measures the uncertainty (in bits) of the class labels in subset
            """
            probs = map(lambda label: len(list(filter(lambda entry: entry == label, subset))) / subset.size, classes)
            return -sum(map(lambda prob: prob * log2(prob) if prob > 0 else 0, probs))

        def gain(subsets: Iterator[Series]):
            """
            Measures the information gain of splitting y into subsets
            """
            # Weight by the number of examples (y.size), not X.size, which
            # for a DataFrame counts rows times columns.
            return entropy(y) - sum(map(lambda subset: subset.size / y.size * entropy(subset), subsets))

        def discrete_gain(attr: str) -> float:
            """
            Measures the information gain of splitting into all values at an attribute
            """
            return gain(map(lambda val: y[X[attr] == val], set(X[attr])))

        def continuous_gain(attr: str, threshold: float) -> float:
            """
            Measure the information gain of splitting along a threshold at an attribute
            """
            return gain([y[X[attr] <= threshold], y[X[attr] > threshold]])

        def attribute_gain(attr: str) -> float:
            """
            Measures the information gain based on the type of attribute
            """
            attr_type = X.dtypes[attr]
            if attr_type == float64:
                # Find highest gain achievable with any threshold
                thresholds = map(lambda value: 0.5 * (value[0] + value[1]), pairwise(sorted(set(X[attr]))))
                return max(map(lambda threshold: continuous_gain(attr, threshold), thresholds))

            # Find highest gain achievable by splitting at all values
            return discrete_gain(attr)

        # Find highest information gain attribute
        attributes = sorted(attributes, key=attribute_gain)
        attribute = attributes[-1]
        child_attributes = attributes[:-1]

        def generate_matchers() -> Iterator[BaseMatcher]:
            """
            Generates matchers for the chosen attribute
            """
            if X.dtypes[attribute] == float64:
                # Find threshold with highest information gain
                thresholds = map(lambda value: 0.5 * (value[0] + value[1]), pairwise(sorted(set(X[attribute]))))
                threshold = sorted(thresholds, key=lambda threshold: continuous_gain(attribute, threshold))[-1]

                # Generate matchers for <= and > than threshold
                for threshold_direction in [False, True]:
                    yield ContinuousMatcher(threshold, threshold_direction)
            else:
                # Create matchers for all values in discrete case
                for child_value in set(X[attribute]):
                    yield DiscreteMatcher(child_value)

        def match_to_node(matcher: BaseMatcher) -> BaseNode:
            """
            Converts a matcher into a node
            """
            child_mask = matcher.is_match(X[attribute])
            child_X = X[child_mask]
            child_y = y[child_mask]
            if child_X.size == 0:
                return Leaf(matcher, popular_class)
            else:
                return DecisionTree.__build_tree(child_X, child_y, child_attributes, matcher)

        return Node(matcher, attribute, list(map(match_to_node, generate_matchers())), popular_class)
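A quick sanity check of the entropy helper on toy label sets: a 50/50 mix carries exactly one bit of uncertainty, a pure set carries none. (Plain lists stand in for the pandas Series used above.)

from math import log2

def entropy(labels):
    probs = [labels.count(c) / len(labels) for c in set(labels)]
    return -sum(p * log2(p) for p in probs if p > 0)

assert entropy(['a', 'a', 'b', 'b']) == 1.0
assert entropy(['a', 'a', 'a']) == 0.0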
Example 12
 def __init__(self, name, description):
     Leaf.__init__(self, name, description)
Example 13
File: parser.py Project: smvv/trs
    def on_binary(self, target, option, names, values):
        """
        binary : exp TIMES exp
               | exp PLUS exp
               | exp EQ exp
               | exp AND exp
               | exp OR exp
               | exp DIVIDE exp
               | exp MINUS exp
               | exp POW exp
               | exp SUB exp
        """

        if option == 0:  # rule: exp TIMES exp
            first = values[0]
            node = Node(values[1], first, values[2])

            if first.negated and not first.parens:
                node.negated += first.negated
                first.negated = 0

            return node

        if 1 <= option <= 4:  # rule: exp {PLUS,EQ,AND,OR} exp
            return Node(values[1], values[0], values[2])

        if option == 5:  # rule: exp DIVIDE exp
            top = values[0]
            bottom = values[2]
            negated = 0

            if top.negated and not top.parens:
                negated = top.negated
                top.negated = 0

            if top.is_op(OP_MUL) and bottom.is_op(OP_MUL):
                dtop, fx = top
                dbot, x = bottom

                if dtop.is_identifier('d') and dbot.is_identifier('d') \
                        and x.is_identifier():
                    # (d (fx)) / (dx)
                    return Node(OP_DXDER, fx, x, negated=negated)

            return Node(OP_DIV, top, bottom, negated=negated)

        if option == 6:  # rule: exp MINUS exp
            right = values[2]
            right.negated += 1

            # Explicitly call the hook handler on the created unary negation.
            self.hook_handler('unary', 0, names, values, right)

            return Node(OP_ADD, values[0], right)

        if option == 7:  # rule: exp POW exp
            apply_operator_negation(values[1], values[2])
            return Node(OP_POW, values[0], values[2])

        if option == 8:  # rule: exp SUB exp
            bounds = values[2]

            if bounds.is_op(OP_POW):
                lbnd, ubnd = bounds
                lbnd.negated += bounds.negated
            else:
                lbnd = bounds
                ubnd = Leaf(INFINITY)

            lbnd.negated += len(values[1]) - 1

            return Node(OP_INT_DEF, values[0], lbnd, ubnd)

        raise BisonSyntaxError('Unsupported option %d in target "%s".' %
                               (option, target))  # pragma: nocover
Example 14
File: parser.py Project: smvv/trs
    def on_unary(self, target, option, names, values):
        """
        unary : MINUS exp %prec NEG
              | FUNCTION exp
              | FUNCTION_PAREN exp RPAREN
              | raised_function exp %prec FUNCTION
              | DERIVATIVE exp
              | exp PRIME
              | INTEGRAL exp
              | integral_bounds exp %prec INTEGRAL
              | PIPE exp PIPE
              | LOGARITHM exp
              | logarithm_subscript exp %prec LOGARITHM
              | TIMES exp
        """

        if option == 0:  # rule: MINUS exp
            values[1].negated += 1

            return values[1]

        if option in (1, 2):  # rule: FUNCTION exp | FUNCTION_PAREN exp RPAREN
            fun = values[0] if option == 1 else values[0][:-1].rstrip()

            if values[1].is_op(OP_COMMA):
                return Node(fun, *values[1])

            return Node(fun, values[1])

        if option == 3:  # rule: raised_function exp
            func, exponent = values[0]

            if values[1].is_op(OP_COMMA):
                return Node(OP_POW, Node(func, *values[1]), exponent)

            return Node(OP_POW, Node(func, values[1]), exponent)

        if option == 4:  # rule: DERIVATIVE exp
            # DERIVATIVE looks like 'd/d*x' -> extract the 'x'
            return Node(OP_DXDER, values[1], Leaf(values[0][-1]))

        if option == 5:  # rule: exp PRIME
            return Node(OP_PRIME, values[0])

        if option == 6:  # rule: INTEGRAL exp
            fx, x = find_integration_variable(values[1])
            return Node(OP_INT, fx, x)

        if option == 7:  # rule: integral_bounds exp
            lbnd, ubnd = values[0]
            fx, x = find_integration_variable(values[1])
            return Node(OP_INT, fx, x, lbnd, ubnd)

        if option == 8:  # rule: PIPE exp PIPE
            return Node(OP_ABS, values[1])

        if option == 9:  # rule: LOGARITHM exp
            if values[1].is_op(OP_COMMA):
                return Node(OP_LOG, *values[1])

            if values[0] == 'ln':
                base = E
            else:
                base = DEFAULT_LOGARITHM_BASE

            return Node(OP_LOG, values[1], Leaf(base))

        if option == 10:  # rule: logarithm_subscript exp
            if values[1].is_op(OP_COMMA):
                raise BisonSyntaxError('Shortcut logarithm base "log_%s" does '
                                       'not support additional arguments.' %
                                       (values[0]))

            return Node(OP_LOG, values[1], values[0])

        if option == 11:  # rule: TIMES exp
            return values[1]

        raise BisonSyntaxError('Unsupported option %d in target "%s".' %
                               (option, target))  # pragma: nocover