def CoherentFusionDecisionTree(dTree1, node, dTree2):
    """adding the coherent part of tree dTree2 to node 'node' of tree dTree1"""
    dtree1 = copy.deepcopy(dTree1)
    dtree2 = copy.deepcopy(dTree2)

    # turn the target node of dtree1 into a leaf and extract the rule
    # (the feature/threshold/side conjunction) that leads to it
    leaf = lib_tree.cut_into_leaf2(dtree1, node)
    rule = lib_tree.extract_rule(dtree1, leaf)

    # keep only the part of dtree2 coherent with that rule, then graft it
    ForceCoherence(dtree2, rule, node=0)
    lib_tree.fusionDecisionTree(dtree1, leaf, dtree2)

    return dtree1
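
A minimal usage sketch for the function above, assuming `numpy`, `copy`, the `lib_tree` helpers and `ForceCoherence` are importable as in the source module; the toy data and the node index are hypothetical:

# --- usage sketch (toy data and node index are hypothetical) ---
import numpy as np
from sklearn.tree import DecisionTreeClassifier

rng = np.random.RandomState(0)
X1, y1 = rng.rand(100, 4), rng.randint(0, 2, 100)
X2, y2 = rng.rand(100, 4), rng.randint(0, 2, 100)

dTree1 = DecisionTreeClassifier(max_depth=3).fit(X1, y1)
dTree2 = DecisionTreeClassifier(max_depth=3).fit(X2, y2)

# graft the part of dTree2 coherent with the rule leading to node 1 of dTree1
fused = CoherentFusionDecisionTree(dTree1, 1, dTree2)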
Example #2
def RandomOnePrune(dtree, policy=None, depth_thresh=2):
    """Prune one randomly chosen internal node of dtree (in place)."""
    all_inds = np.arange(dtree.tree_.feature.size)
    depths = depth_array(dtree, all_inds)

    # candidate nodes must be internal (feature == -2 marks a leaf)
    internal = set(np.where(dtree.tree_.feature != -2)[0])

    if policy == 'exp':
        # additionally keep only nodes deeper than an exponentially drawn
        # bound, which biases the choice towards the deepest nodes
        mx_d = max(depths)
        t = np.random.exponential(mx_d / 2)
        candidates = (internal
                      & set(np.where(depths >= depth_thresh)[0])
                      & set(np.where(depths >= int(mx_d - t))[0]))
    elif policy is None:
        candidates = internal & set(np.where(depths >= depth_thresh)[0])
    else:
        # any other policy: strict depth threshold
        candidates = internal & set(np.where(depths > depth_thresh)[0])

    inds = list(candidates)
    if len(inds) != 0:
        node = np.random.choice(inds)
        cut_into_leaf2(dtree, node)

    return dtree
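
A short usage sketch, assuming a fitted scikit-learn tree and that the `depth_array` and `cut_into_leaf2` helpers used above are in scope; the toy data are hypothetical:

# --- usage sketch (toy data are hypothetical) ---
import numpy as np
from sklearn.tree import DecisionTreeClassifier

rng = np.random.RandomState(0)
X, y = rng.rand(200, 5), rng.randint(0, 3, 200)
dtree = DecisionTreeClassifier().fit(X, y)

# prune one random internal node at depth >= 2 (in place)
RandomOnePrune(dtree, depth_thresh=2)
# or bias the draw towards the deepest nodes
RandomOnePrune(dtree, policy='exp')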
def eq_rec_tree(dtree_or,
                actual_new_node,
                dtree_new=None,
                actual_rule=None,
                K_union_rules=None,
                actual_reaching_class=None,
                considered_splits=None,
                max_depth=None,
                from_depth=None,
                on_subtrees=False,
                subtrees_nodes=None,
                finishing_features=(),
                smallest_tree=False):

    if from_depth is not None:
        from_depth = int(from_depth)
        if from_depth < 1:
            print('WARNING: given from_depth < 1!')
        else:
            nodes_depth = np.array(nodes_in_depth(dtree_or, from_depth))
            on_subtrees = True
            subtrees_nodes = nodes_depth

    if actual_new_node == 0:
        if on_subtrees:
            if subtrees_nodes is None:
                print('WARNING: no subtrees specified!')

            dtree_new = copy.deepcopy(dtree_or)

            for i in subtrees_nodes:
                r = extract_rule(dtree_or, i)
                subtree = sub_tree(dtree_or, i)

                cut_node, b_ = search_rule(dtree_new, r)
                fusion_node = cut_into_leaf2(dtree_new, cut_node)

                subeqtree = eq_rec_tree(subtree,
                                        0,
                                        max_depth=max_depth,
                                        finishing_features=finishing_features,
                                        smallest_tree=smallest_tree)
                dtree_new = fusionDecisionTree(dtree_new, fusion_node,
                                               subeqtree)

            return dtree_new
        else:
            leaves, rules = extract_leaves_rules(dtree_or)

            if K_union_rules is None:
                # K_union_rules[c] collects the rules of every leaf
                # predicting class c
                K_union_rules = np.zeros(dtree_or.n_classes_, dtype=object)
                for c in range(dtree_or.n_classes_):
                    K_union_rules[c] = list()
                for k, l in enumerate(leaves):
                    c = int(np.argmax(dtree_or.tree_.value[l, :, :]))
                    K_union_rules[c].append(rules[k])

            # collect the (feature, threshold) pair of every internal node
            all_splits = np.zeros(dtree_or.tree_.node_count - leaves.size,
                                  dtype=[("phi", '<i8'), ("th", '<f8')])

            compt = 0
            for i in range(dtree_or.tree_.node_count):
                if i not in leaves:
                    all_splits[compt] = (dtree_or.tree_.feature[i],
                                         dtree_or.tree_.threshold[i])
                    compt = compt + 1

            considered_splits = all_splits

            actual_reaching_class = dtree_or.classes_.astype(int)
            dtree_new = CreateFullNewTree(dtree_or)

    if actual_rule is not None:
        phi_actual, th_actual, b_actual = actual_rule
    else:
        phi_actual = np.array([])
        th_actual = np.array([])
        b_actual = np.array([])

    if len(actual_reaching_class) > 1:

        # Warning: no equivalence guarantee if a max_depth is specified
        if (max_depth is not None and actual_rule is not None
                and actual_rule[0].size >= int(max_depth)):

            for c in actual_reaching_class:
                dtree_new.tree_.value[actual_new_node, :, c] = 1
                add_to_parents(dtree_new, actual_new_node,
                               dtree_new.tree_.value[actual_new_node])

            dtree_new.tree_.n_node_samples[actual_new_node] = len(
                actual_reaching_class)
            dtree_new.tree_.weighted_n_node_samples[actual_new_node] = len(
                actual_reaching_class)

        else:

            if len(finishing_features) > 0:
                particular_splits, other_considered_splits = filter_feature(
                    considered_splits, finishing_features)

                if other_considered_splits.size == 0:
                    considered_splits = particular_splits
                else:
                    considered_splits = other_considered_splits
            ###
            if smallest_tree:
                gains = EntropyGainFromClasses(actual_rule,
                                               actual_reaching_class,
                                               considered_splits,
                                               K_union_rules,
                                               n_cl=dtree_or.n_classes_)
                p = np.zeros(considered_splits.size)
                p[gains == np.amax(gains)] = 1
                p = p / sum(p)
            else:
                p = np.ones(considered_splits.size)
                p = p / sum(p)
            ###

            phi, th = new_random_split(p, considered_splits)

            dtree_new.tree_.feature[actual_new_node] = phi
            dtree_new.tree_.threshold[actual_new_node] = th

            new_rule_l = (np.concatenate((phi_actual, np.array([phi]))),
                          np.concatenate((th_actual, np.array([th]))),
                          np.concatenate((b_actual, np.array([-1]))))
            new_rule_r = (np.concatenate((phi_actual, np.array([phi]))),
                          np.concatenate((th_actual, np.array([th]))),
                          np.concatenate((b_actual, np.array([1]))))

            if len(finishing_features) > 0:

                if particular_splits.size == 0:
                    considered_splits = other_considered_splits
                elif other_considered_splits.size == 0:
                    considered_splits = particular_splits
                else:
                    considered_splits = np.concatenate(
                        (particular_splits, other_considered_splits))

            considered_splits_l = all_coherent_splits(new_rule_l,
                                                      considered_splits)
            considered_splits_r = all_coherent_splits(new_rule_r,
                                                      considered_splits)

            reach_class_l = list()
            reach_class_r = list()

            for c, K in enumerate(dtree_or.classes_.astype(int)):
                for r in K_union_rules[c]:
                    if not isdisj(r, new_rule_l):
                        reach_class_l.append(c)
                    if not isdisj(r, new_rule_r):
                        reach_class_r.append(c)

            reach_class_l = list(set(reach_class_l))
            reach_class_r = list(set(reach_class_r))

            dtree_new, child_l = add_child_leaf(dtree_new, actual_new_node, -1)
            dtree_new = eq_rec_tree(dtree_or,
                                    child_l,
                                    dtree_new,
                                    actual_rule=new_rule_l,
                                    K_union_rules=K_union_rules,
                                    actual_reaching_class=reach_class_l,
                                    considered_splits=considered_splits_l,
                                    max_depth=max_depth,
                                    from_depth=from_depth,
                                    on_subtrees=on_subtrees,
                                    subtrees_nodes=subtrees_nodes,
                                    finishing_features=finishing_features,
                                    smallest_tree=smallest_tree)

            dtree_new, child_r = add_child_leaf(dtree_new, actual_new_node, 1)
            dtree_new = eq_rec_tree(dtree_or,
                                    child_r,
                                    dtree_new,
                                    actual_rule=new_rule_r,
                                    K_union_rules=K_union_rules,
                                    actual_reaching_class=reach_class_r,
                                    considered_splits=considered_splits_r,
                                    max_depth=max_depth,
                                    from_depth=from_depth,
                                    on_subtrees=on_subtrees,
                                    subtrees_nodes=subtrees_nodes,
                                    finishing_features=finishing_features,
                                    smallest_tree=smallest_tree)

    elif len(actual_reaching_class) == 1:

        c = actual_reaching_class[0]

        dtree_new.tree_.value[actual_new_node, :, c] = 1
        add_to_parents(dtree_new, actual_new_node,
                       dtree_new.tree_.value[actual_new_node])
        dtree_new.tree_.n_node_samples[actual_new_node] = 1
        dtree_new.tree_.weighted_n_node_samples[actual_new_node] = 1

    else:
        print('ERROR: 0 data reach this node!')

    if actual_new_node == 0:
        dtree_new.max_depth = dtree_new.tree_.max_depth

    return dtree_new
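
A usage sketch for eq_rec_tree, assuming the helper functions it calls (`extract_leaves_rules`, `CreateFullNewTree`, `add_child_leaf`, `all_coherent_splits`, ...) are importable; the fitted tree below is hypothetical:

# --- usage sketch (the fitted tree is hypothetical) ---
import numpy as np
from sklearn.tree import DecisionTreeClassifier

rng = np.random.RandomState(0)
X, y = rng.rand(300, 4), rng.randint(0, 2, 300)
source = DecisionTreeClassifier(max_depth=4).fit(X, y)

# rebuild, from the root (node 0), a random tree that predicts exactly
# like `source` while reusing only the splits of `source`
equivalent = eq_rec_tree(source, 0)
# greedily favour entropy-reducing splits to get a smaller equivalent tree
small = eq_rec_tree(source, 0, smallest_tree=True)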
Example #4
def SER(dTree,
        node,
        X_target_node,
        y_target_node,
        original_ser=True,
        no_red_on_cl=False,
        cl_no_red=None,
        no_ext_on_cl=False,
        cl_no_ext=None,
        ext_cond=None,
        leaf_loss_quantify=False,
        leaf_loss_threshold=None,
        coeffs=None,
        root_source_values=None,
        Nkmin=None):

    # Deep copy of value
    old_values = dTree.tree_.value[node].copy()
    maj_class = np.argmax(dTree.tree_.value[node, :].copy())

    if cl_no_red is None:
        old_size_cl_no_red = 0
    else:
        old_size_cl_no_red = np.sum(dTree.tree_.value[node][:, cl_no_red])

    # `cl` is the protected class when one of the protection flags is set
    if no_red_on_cl or no_ext_on_cl:
        if no_ext_on_cl:
            cl = cl_no_ext[0]
        if no_red_on_cl:
            cl = cl_no_red[0]

    if (leaf_loss_quantify and (no_red_on_cl or no_ext_on_cl)
            and maj_class == cl and dTree.tree_.feature[node] == -2):

        ps_rf = dTree.tree_.value[node, 0, :] / np.sum(
            dTree.tree_.value[node, 0, :])
        p1_in_l = dTree.tree_.value[node, 0, cl] / root_source_values[cl]
        cond1 = np.power(1 - p1_in_l, Nkmin) > leaf_loss_threshold
        cond2 = np.argmax(np.multiply(coeffs, ps_rf)) == cl

    ### VALUES UPDATE ###
    val = np.zeros((dTree.n_outputs_, dTree.n_classes_))

    for i in range(dTree.n_classes_):
        val[:, i] = list(y_target_node).count(i)

    dTree.tree_.value[node] = val
    dTree.tree_.n_node_samples[node] = np.sum(val)
    dTree.tree_.weighted_n_node_samples[node] = np.sum(val)

    if dTree.tree_.feature[node] == -2:
        if original_ser:
            if y_target_node.size > 0 and len(set(list(y_target_node))) > 1:
                # the predicted class changes automatically with the target
                # data through the value updates above
                DT_to_add = DecisionTreeClassifier()
                # to make a complete tree (the parameter name depends on the
                # scikit-learn version)
                if hasattr(DT_to_add, 'min_impurity_decrease'):
                    DT_to_add.min_impurity_decrease = 0
                else:
                    DT_to_add.min_impurity_split = 0
                DT_to_add.fit(X_target_node, y_target_node)
                lib_tree.fusionDecisionTree(dTree, node, DT_to_add)

            return node, False

        else:
            bool_no_red = False
            cond_extension = False

            if y_target_node.size > 0:
                # Extension
                if not no_ext_on_cl:
                    DT_to_add = DecisionTreeClassifier()
                    # to make a complete tree (the parameter name depends on
                    # the scikit-learn version)
                    if hasattr(DT_to_add, 'min_impurity_decrease'):
                        DT_to_add.min_impurity_decrease = 0
                    else:
                        DT_to_add.min_impurity_split = 0
                    DT_to_add.fit(X_target_node, y_target_node)
                    lib_tree.fusionDecisionTree(dTree, node, DT_to_add)
                else:
                    cond_maj = (maj_class not in cl_no_ext)
                    cond_sub_target = (ext_cond
                                       and maj_class in y_target_node
                                       and maj_class in cl_no_ext)
                    cond_leaf_loss = (leaf_loss_quantify and maj_class == cl
                                      and not (cond1 and cond2))

                    cond_extension = (cond_maj or cond_sub_target
                                      or cond_leaf_loss)

                    if cond_extension:
                        DT_to_add = DecisionTreeClassifier()
                        # to make a complete tree (the parameter name depends
                        # on the scikit-learn version)
                        if hasattr(DT_to_add, 'min_impurity_decrease'):
                            DT_to_add.min_impurity_decrease = 0
                        else:
                            DT_to_add.min_impurity_split = 0
                        DT_to_add.fit(X_target_node, y_target_node)
                        lib_tree.fusionDecisionTree(dTree, node, DT_to_add)
                    else:
                        # It is hard not to introduce inconsistencies in the
                        # values when leaving the leaves intact this way.
                        # That said, it has no impact on the decision tree we
                        # want to obtain (it does affect the probabilistic
                        # tree).
                        dTree.tree_.value[node] = old_values
                        dTree.tree_.n_node_samples[node] = np.sum(old_values)
                        dTree.tree_.weighted_n_node_samples[node] = np.sum(
                            old_values)
                        lib_tree.add_to_parents(dTree, node, old_values)
                        if no_red_on_cl:
                            bool_no_red = True

            # no red protection with values
            if (no_red_on_cl and y_target_node.size == 0
                    and old_size_cl_no_red > 0 and maj_class in cl_no_red):

                if leaf_loss_quantify:

                    if cond1 and cond2:
                        dTree.tree_.value[node] = old_values
                        dTree.tree_.n_node_samples[node] = np.sum(old_values)
                        dTree.tree_.weighted_n_node_samples[node] = np.sum(
                            old_values)
                        lib_tree.add_to_parents(dTree, node, old_values)
                        bool_no_red = True
                else:
                    dTree.tree_.value[node] = old_values
                    dTree.tree_.n_node_samples[node] = np.sum(old_values)
                    dTree.tree_.weighted_n_node_samples[node] = np.sum(
                        old_values)
                    lib_tree.add_to_parents(dTree, node, old_values)
                    bool_no_red = True

            return node, bool_no_red

    ### Left / right target computation ###
    phi = dTree.tree_.feature[node]
    th = dTree.tree_.threshold[node]
    bool_test = X_target_node[:, phi] <= th
    not_bool_test = X_target_node[:, phi] > th

    ind_left = np.where(bool_test)[0]
    ind_right = np.where(not_bool_test)[0]

    X_target_node_left = X_target_node[ind_left]
    y_target_node_left = y_target_node[ind_left]

    X_target_node_right = X_target_node[ind_right]
    y_target_node_right = y_target_node[ind_right]

    if original_ser:
        new_node_left, bool_no_red_l = SER(dTree,
                                           dTree.tree_.children_left[node],
                                           X_target_node_left,
                                           y_target_node_left,
                                           original_ser=True)
        node, b = lib_tree.find_parent(dTree, new_node_left)

        new_node_right, bool_no_red_r = SER(dTree,
                                            dTree.tree_.children_right[node],
                                            X_target_node_right,
                                            y_target_node_right,
                                            original_ser=True)
        node, b = lib_tree.find_parent(dTree, new_node_right)

    else:
        new_node_left, bool_no_red_l = SER(
            dTree,
            dTree.tree_.children_left[node],
            X_target_node_left,
            y_target_node_left,
            original_ser=False,
            no_red_on_cl=no_red_on_cl,
            cl_no_red=cl_no_red,
            no_ext_on_cl=no_ext_on_cl,
            cl_no_ext=cl_no_ext,
            leaf_loss_quantify=leaf_loss_quantify,
            leaf_loss_threshold=leaf_loss_threshold,
            coeffs=coeffs,
            root_source_values=root_source_values,
            Nkmin=Nkmin)

        node, b = lib_tree.find_parent(dTree, new_node_left)

        new_node_right, bool_no_red_r = SER(
            dTree,
            dTree.tree_.children_right[node],
            X_target_node_right,
            y_target_node_right,
            original_ser=False,
            no_red_on_cl=no_red_on_cl,
            cl_no_red=cl_no_red,
            no_ext_on_cl=no_ext_on_cl,
            cl_no_ext=cl_no_ext,
            leaf_loss_quantify=leaf_loss_quantify,
            leaf_loss_threshold=leaf_loss_threshold,
            coeffs=coeffs,
            root_source_values=root_source_values,
            Nkmin=Nkmin)

        node, b = lib_tree.find_parent(dTree, new_node_right)

    if original_ser:
        bool_no_red = False
    else:
        bool_no_red = bool_no_red_l or bool_no_red_r

    le = lib_tree.leaf_error(dTree.tree_, node)
    e = lib_tree.error(dTree.tree_, node)

    if le <= e:
        if original_ser:
            node = lib_tree.cut_into_leaf2(dTree, node)
        else:
            if no_red_on_cl:
                if not bool_no_red:
                    node = lib_tree.cut_into_leaf2(dTree, node)
                # else: pruning is skipped to protect the class cl_no_red
            else:
                node = lib_tree.cut_into_leaf2(dTree, node)

    if dTree.tree_.feature[node] != -2:
        if original_ser:
            if ind_left.size == 0:
                node = lib_tree.cut_from_left_right(dTree, node, -1)

            if ind_right.size == 0:
                node = lib_tree.cut_from_left_right(dTree, node, 1)
        else:
            if no_red_on_cl:
                if ind_left.size == 0 and np.sum(dTree.tree_.value[
                        dTree.tree_.children_left[node]]) == 0:
                    node = lib_tree.cut_from_left_right(dTree, node, -1)

                if ind_right.size == 0 and np.sum(dTree.tree_.value[
                        dTree.tree_.children_right[node]]) == 0:
                    node = lib_tree.cut_from_left_right(dTree, node, 1)
            else:
                if ind_left.size == 0:
                    node = lib_tree.cut_from_left_right(dTree, node, -1)

                if ind_right.size == 0:
                    node = lib_tree.cut_from_left_right(dTree, node, 1)

    return node, bool_no_red
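
A usage sketch for SER (structure expansion/reduction), assuming a tree fitted on source data and a labelled target sample; everything below is hypothetical:

# --- usage sketch (source and target data are hypothetical) ---
import numpy as np
from sklearn.tree import DecisionTreeClassifier

rng = np.random.RandomState(0)
X_source, y_source = rng.rand(500, 6), rng.randint(0, 2, 500)
X_target, y_target = rng.rand(50, 6), rng.randint(0, 2, 50)

tree = DecisionTreeClassifier().fit(X_source, y_source)

# adapt the source tree to the target data, starting the recursion at
# the root (node 0); the tree is modified in place
root, _ = SER(tree, 0, X_target, y_target, original_ser=True)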
Example #5
def STRUT(decisiontree,
          node_index,
          X_target_node,
          Y_target_node,
          adapt_prop=False,
          coeffs=(1, 1),
          use_divergence=True,
          measure_default_IG=True):

    phi = decisiontree.tree_.feature[node_index]
    classes = decisiontree.classes_
    threshold = decisiontree.tree_.threshold[node_index]

    current_class_distribution = lib_tree.compute_class_distribution(
        classes, Y_target_node)

    decisiontree.tree_.weighted_n_node_samples[node_index] = Y_target_node.size
    decisiontree.tree_.impurity[node_index] = lib_tree.GINI(
        current_class_distribution)
    decisiontree.tree_.n_node_samples[node_index] = Y_target_node.size

    # if the node is a leaf, just update its value and exit
    if decisiontree.tree_.children_left[node_index] == -2:

        decisiontree.tree_.value[node_index] = current_class_distribution
        return node_index

    # prune the node if no target sample reaches it
    is_reached_update = (current_class_distribution.sum() != 0)
    prune_cond = not is_reached_update

    if prune_cond:
        p, b = lib_tree.find_parent(decisiontree, node_index)
        node_index = lib_tree.cut_from_left_right(decisiontree, p, b)
        return node_index

    # update tree.value with target data
    decisiontree.tree_.value[node_index] = current_class_distribution

    # Only one class is present in the node -> terminal leaf
    if (current_class_distribution > 0).sum() == 1:

        node_index = lib_tree.cut_into_leaf2(decisiontree, node_index)
        return node_index

    # update threshold
    if type(threshold) is np.float64:
        Q_source_l, Q_source_r = lib_tree.get_children_distributions(
            decisiontree, node_index)
        Sl = np.sum(Q_source_l)
        Sr = np.sum(Q_source_r)

        if adapt_prop:
            # rescale the source child distributions with the target class
            # proportions (Sl and Sr are already computed above)
            Slt = Y_target_node.size
            Srt = Y_target_node.size

            D = np.sum(np.multiply(coeffs, Q_source_l))
            Q_source_l = (Slt / Sl) * np.multiply(coeffs,
                                                  np.divide(Q_source_l, D))
            D = np.sum(np.multiply(coeffs, Q_source_r))
            Q_source_r = (Srt / Sr) * np.multiply(coeffs,
                                                  np.divide(Q_source_r, D))

        Q_source_parent = lib_tree.get_node_distribution(
            decisiontree, node_index)

        t1 = threshold_selection(Q_source_parent,
                                 Q_source_l.copy(),
                                 Q_source_r.copy(),
                                 X_target_node,
                                 Y_target_node,
                                 phi,
                                 classes,
                                 use_divergence=use_divergence,
                                 measure_default_IG=measure_default_IG)
        Q_target_l, Q_target_r = lib_tree.compute_Q_children_target(
            X_target_node, Y_target_node, phi, t1, classes)
        DG_t1 = lib_tree.DG(Q_source_l.copy(), Q_source_r.copy(), Q_target_l,
                            Q_target_r)
        t2 = threshold_selection(Q_source_parent,
                                 Q_source_r.copy(),
                                 Q_source_l.copy(),
                                 X_target_node,
                                 Y_target_node,
                                 phi,
                                 classes,
                                 use_divergence=use_divergence,
                                 measure_default_IG=measure_default_IG)

        Q_target_l, Q_target_r = lib_tree.compute_Q_children_target(
            X_target_node, Y_target_node, phi, t2, classes)
        DG_t2 = lib_tree.DG(Q_source_r.copy(), Q_source_l.copy(), Q_target_l,
                            Q_target_r)

        if DG_t1 >= DG_t2:
            decisiontree.tree_.threshold[node_index] = t1
        else:
            decisiontree.tree_.threshold[node_index] = t2
            # swap children
            old_child_r_id = decisiontree.tree_.children_right[node_index]
            decisiontree.tree_.children_right[
                node_index] = decisiontree.tree_.children_left[node_index]
            decisiontree.tree_.children_left[node_index] = old_child_r_id

    if decisiontree.tree_.children_left[node_index] != -1:
        # route the target data through the (possibly updated) threshold
        threshold = decisiontree.tree_.threshold[node_index]
        index_X_child_l = X_target_node[:, phi] <= threshold
        X_target_child_l = X_target_node[index_X_child_l, :]
        Y_target_child_l = Y_target_node[index_X_child_l]

        node_index = STRUT(decisiontree,
                           decisiontree.tree_.children_left[node_index],
                           X_target_child_l,
                           Y_target_child_l,
                           adapt_prop=adapt_prop,
                           coeffs=coeffs,
                           use_divergence=use_divergence,
                           measure_default_IG=measure_default_IG)

        ## IMPORTANT: the recursion may have rebuilt part of the tree, so
        ## recover the current node as the parent of the returned child
        node_index, b = lib_tree.find_parent(decisiontree, node_index)

    if decisiontree.tree_.children_right[node_index] != -1:
        # route the target data through the (possibly updated) threshold
        threshold = decisiontree.tree_.threshold[node_index]
        index_X_child_r = X_target_node[:, phi] > threshold
        X_target_child_r = X_target_node[index_X_child_r, :]
        Y_target_child_r = Y_target_node[index_X_child_r]

        node_index = STRUT(decisiontree,
                           decisiontree.tree_.children_right[node_index],
                           X_target_child_r,
                           Y_target_child_r,
                           adapt_prop=adapt_prop,
                           coeffs=coeffs,
                           use_divergence=use_divergence,
                           measure_default_IG=measure_default_IG)

        ## IMPORTANT: same parent recovery as for the left child
        node_index, b = lib_tree.find_parent(decisiontree, node_index)

    return node_index
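
A usage sketch for STRUT (structure transfer), under the same hypothetical source/target setting; it recursively re-tunes the thresholds of the source tree on the target sample:

# --- usage sketch (source and target data are hypothetical) ---
import numpy as np
from sklearn.tree import DecisionTreeClassifier

rng = np.random.RandomState(0)
X_source, y_source = rng.rand(500, 6), rng.randint(0, 2, 500)
X_target, y_target = rng.rand(50, 6), rng.randint(0, 2, 50)

tree = DecisionTreeClassifier().fit(X_source, y_source)

# re-select the threshold of every internal node on the target data,
# starting at the root (node 0); the tree is modified in place
STRUT(tree, 0, X_target, y_target, use_divergence=True)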