def CoherentFusionDecisionTree(dTree1, node, dTree2):
    """Graft the part of tree dTree2 that is coherent with node `node`
    of tree dTree1 onto that node, and return the fused tree."""
    dtree1 = copy.deepcopy(dTree1)
    dtree2 = copy.deepcopy(dTree2)
    # Turn the target node into a leaf and recover the rule leading to it.
    leaf = lib_tree.cut_into_leaf2(dtree1, node)
    rule = lib_tree.extract_rule(dtree1, leaf)
    # Restrict dtree2 to the splits coherent with that rule, then graft it.
    ForceCoherence(dtree2, rule, node=0)
    lib_tree.fusionDecisionTree(dtree1, leaf, dtree2)
    return dtree1
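
# Usage sketch (illustrative addition, not part of the original module):
# fuse the coherent part of a second tree under a node of a first tree.
# Assumes the module-level imports (copy, numpy as np) and lib_tree are
# available; the choice of node below is arbitrary.
def _example_coherent_fusion():
    from sklearn.tree import DecisionTreeClassifier
    rng = np.random.RandomState(0)
    X1, X2 = rng.rand(100, 3), rng.rand(100, 3)
    y1 = (X1[:, 0] > 0.5).astype(int)
    y2 = (X2[:, 1] > 0.5).astype(int)
    t1 = DecisionTreeClassifier(max_depth=3).fit(X1, y1)
    t2 = DecisionTreeClassifier(max_depth=3).fit(X2, y2)
    # Graft the coherent part of t2 under the left child of t1's root.
    node = t1.tree_.children_left[0]
    return CoherentFusionDecisionTree(t1, node, t2)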
def RandomOnePrune(dtree, policy=None, depth_thresh=2):
    """Cut one randomly chosen internal node of `dtree` into a leaf.

    policy=None  : uniform choice among internal nodes of depth >= depth_thresh.
    policy='exp' : bias the choice toward deep nodes with an exponential draw.
    other        : like None, but with a strict depth > depth_thresh test.
    """
    all_inds = np.arange(0, dtree.tree_.feature.size)
    depths = depth_array(dtree, all_inds)
    mx_d = max(depths)
    # feature == -2 marks leaves in sklearn trees, so != -2 selects splits.
    inds1 = np.where(dtree.tree_.feature != -2)[0]
    if policy is None:
        inds2 = np.where(depths >= depth_thresh)[0]
        inds = list(set(inds1).intersection(set(inds2)))
    elif policy == 'exp':
        inds2 = np.where(depths >= depth_thresh)[0]
        # Keep nodes deeper than mx_d - t, with t ~ Exp(mean = mx_d / 2).
        t = np.random.exponential(mx_d / 2)
        inds3 = np.where(depths >= int(mx_d - t))[0]
        inds = list(set(inds1).intersection(set(inds2)).intersection(set(inds3)))
    else:
        inds2 = np.where(depths > depth_thresh)[0]
        inds = list(set(inds1).intersection(set(inds2)))
    if len(inds) != 0:
        node = np.random.choice(inds)
        cut_into_leaf2(dtree, node)
    # Return the (possibly unchanged) tree even when no node qualifies.
    return dtree
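
# Usage sketch (illustrative addition): prune one random internal node,
# here with the depth-biased 'exp' policy. RandomOnePrune modifies the tree
# in place, so we work on a copy. Assumes numpy and copy as imported above.
def _example_random_one_prune():
    from sklearn.datasets import load_iris
    from sklearn.tree import DecisionTreeClassifier
    X, y = load_iris(return_X_y=True)
    clf = DecisionTreeClassifier(random_state=0).fit(X, y)
    pruned = RandomOnePrune(copy.deepcopy(clf), policy='exp', depth_thresh=2)
    return pruned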
def eq_rec_tree(dtree_or,
                actual_new_node,
                dtree_new=None,
                actual_rule=None,
                K_union_rules=None,
                actual_reaching_class=None,
                considered_splits=None,
                max_depth=None,
                from_depth=None,
                on_subtrees=False,
                subtrees_nodes=None,
                finishing_features=list(),
                smallest_tree=False):
    """Recursively build a new tree equivalent to `dtree_or` by redrawing
    coherent splits (uniformly at random, or greedily when `smallest_tree`
    is set)."""
    if from_depth is not None:
        from_depth = int(from_depth)
        if from_depth < 1:
            print('WARNING: given depth < 1!')
        else:
            # Rebuild only the subtrees rooted at the given depth.
            nodes_depth = np.array(nodes_in_depth(dtree_or, from_depth))
            on_subtrees = True
            subtrees_nodes = nodes_depth

    if actual_new_node == 0:
        if on_subtrees:
            if subtrees_nodes is None:
                print('WARNING: no subtrees specified!')
            dtree_new = copy.deepcopy(dtree_or)
            for i in subtrees_nodes:
                r = extract_rule(dtree_or, i)
                subtree = sub_tree(dtree_or, i)
                cut_node, b_ = search_rule(dtree_new, r)
                fusion_node = cut_into_leaf2(dtree_new, cut_node)
                subeqtree = eq_rec_tree(subtree,
                                        0,
                                        max_depth=max_depth,
                                        finishing_features=finishing_features,
                                        smallest_tree=smallest_tree)
                dtree_new = fusionDecisionTree(dtree_new, fusion_node,
                                               subeqtree)
            return dtree_new
        else:
            leaves, rules = extract_leaves_rules(dtree_or)
            if K_union_rules is None:
                # For each class, collect the union of leaf rules predicting it.
                K_union_rules = np.zeros(dtree_or.n_classes_, dtype=object)
                for c, K in enumerate(dtree_or.classes_.astype(int)):
                    K_union_rules[c] = list()
                for k, l in enumerate(leaves):
                    c = int(np.argmax(dtree_or.tree_.value[l, :, :]))
                    K_union_rules[c].append(rules[k])
            # Every internal split of the source tree is a candidate.
            all_splits = np.zeros(dtree_or.tree_.node_count - leaves.size,
                                  dtype=[("phi", '<i8'), ("th", '<f8')])
            compt = 0
            for i in range(dtree_or.tree_.node_count):
                if i not in leaves:
                    all_splits[compt] = (dtree_or.tree_.feature[i],
                                         dtree_or.tree_.threshold[i])
                    compt = compt + 1
            considered_splits = all_splits
            actual_reaching_class = dtree_or.classes_.astype(int)
            dtree_new = CreateFullNewTree(dtree_or)

    if actual_rule is not None:
        phi_actual, th_actual, b_actual = actual_rule
    else:
        phi_actual, th_actual, b_actual = np.array([]), np.array([]), np.array([])

    if len(actual_reaching_class) > 1:
        # Warning: no equivalence guarantee if a max_depth is specified.
        if (max_depth is not None) and (actual_rule is not None) \
                and actual_rule[0].size >= int(max_depth):
            # Depth budget exhausted: close the node as a multi-class leaf.
            for c in actual_reaching_class:
                dtree_new.tree_.value[actual_new_node, :, c] = 1
            add_to_parents(dtree_new, actual_new_node,
                           dtree_new.tree_.value[actual_new_node])
            dtree_new.tree_.n_node_samples[actual_new_node] = len(
                actual_reaching_class)
            dtree_new.tree_.weighted_n_node_samples[actual_new_node] = len(
                actual_reaching_class)
        else:
            if len(finishing_features) > 0:
                # Postpone splits on the finishing features for as long as
                # other features remain available.
                particular_splits, other_considered_splits = filter_feature(
                    considered_splits, finishing_features)
                if other_considered_splits.size == 0:
                    considered_splits = particular_splits
                else:
                    considered_splits = other_considered_splits
            if smallest_tree:
                # Greedy choice: keep only the maximal-entropy-gain splits.
                gains = EntropyGainFromClasses(actual_rule,
                                               actual_reaching_class,
                                               considered_splits,
                                               K_union_rules,
                                               n_cl=dtree_or.n_classes_)
                p = np.zeros(considered_splits.size)
                p[gains == np.amax(gains)] = 1
                p = p / sum(p)
            else:
                # Uniform choice among the coherent candidate splits.
                p = np.ones(considered_splits.size)
                p = p / sum(p)
            phi, th = new_random_split(p, considered_splits)
            dtree_new.tree_.feature[actual_new_node] = phi
            dtree_new.tree_.threshold[actual_new_node] = th
            new_rule_l = (np.concatenate((phi_actual, np.array([phi]))),
                          np.concatenate((th_actual, np.array([th]))),
                          np.concatenate((b_actual, np.array([-1]))))
            new_rule_r = (np.concatenate((phi_actual, np.array([phi]))),
                          np.concatenate((th_actual, np.array([th]))),
                          np.concatenate((b_actual, np.array([1]))))
            if len(finishing_features) > 0:
                # Re-merge the candidate pools for the children.
                if particular_splits.size == 0:
                    considered_splits = other_considered_splits
                elif other_considered_splits.size == 0:
                    considered_splits = particular_splits
                else:
                    considered_splits = np.concatenate(
                        (particular_splits, other_considered_splits))
            considered_splits_l = all_coherent_splits(new_rule_l,
                                                      considered_splits)
            considered_splits_r = all_coherent_splits(new_rule_r,
                                                      considered_splits)
            # Classes still reachable on each side of the new split.
            reach_class_l = list()
            reach_class_r = list()
            for c, K in enumerate(dtree_or.classes_.astype(int)):
                for r in K_union_rules[c]:
                    if not isdisj(r, new_rule_l):
                        reach_class_l.append(c)
                    if not isdisj(r, new_rule_r):
                        reach_class_r.append(c)
            reach_class_l = list(set(reach_class_l))
            reach_class_r = list(set(reach_class_r))
            dtree_new, child_l = add_child_leaf(dtree_new, actual_new_node, -1)
            dtree_new = eq_rec_tree(dtree_or,
                                    child_l,
                                    dtree_new,
                                    actual_rule=new_rule_l,
                                    K_union_rules=K_union_rules,
                                    actual_reaching_class=reach_class_l,
                                    considered_splits=considered_splits_l,
                                    max_depth=max_depth,
                                    from_depth=from_depth,
                                    on_subtrees=on_subtrees,
                                    subtrees_nodes=subtrees_nodes,
                                    finishing_features=finishing_features,
                                    smallest_tree=smallest_tree)
            dtree_new, child_r = add_child_leaf(dtree_new, actual_new_node, 1)
            dtree_new = eq_rec_tree(dtree_or,
                                    child_r,
                                    dtree_new,
                                    actual_rule=new_rule_r,
                                    K_union_rules=K_union_rules,
                                    actual_reaching_class=reach_class_r,
                                    considered_splits=considered_splits_r,
                                    max_depth=max_depth,
                                    from_depth=from_depth,
                                    on_subtrees=on_subtrees,
                                    subtrees_nodes=subtrees_nodes,
                                    finishing_features=finishing_features,
                                    smallest_tree=smallest_tree)
    elif len(actual_reaching_class) == 1:
        # A single reachable class: terminal leaf.
        c = actual_reaching_class[0]
        dtree_new.tree_.value[actual_new_node, :, c] = 1
        add_to_parents(dtree_new, actual_new_node,
                       dtree_new.tree_.value[actual_new_node])
        dtree_new.tree_.n_node_samples[actual_new_node] = 1
        dtree_new.tree_.weighted_n_node_samples[actual_new_node] = 1
    else:
        print('ERROR: 0 data points!')

    if actual_new_node == 0:
        dtree_new.max_depth = dtree_new.tree_.max_depth
    return dtree_new
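
# Usage sketch (illustrative addition): rebuild a tree equivalent to a fitted
# source tree, here favouring the greedy `smallest_tree` split choice. The
# recursion is started at the root of the new tree (node 0).
def _example_eq_rec_tree():
    from sklearn.datasets import load_iris
    from sklearn.tree import DecisionTreeClassifier
    X, y = load_iris(return_X_y=True)
    clf = DecisionTreeClassifier(max_depth=3, random_state=0).fit(X, y)
    return eq_rec_tree(clf, 0, smallest_tree=True)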
def _fit_full_target_tree(X, y):
    """Fit an unconstrained tree on the target data (expansion step).

    Factored out of SER, where the same expansion tree was built in three
    places; the parameter name depends on the scikit-learn version."""
    dt = DecisionTreeClassifier()
    try:
        dt.min_impurity_decrease = 0
    except AttributeError:
        # Older scikit-learn releases expose min_impurity_split instead.
        dt.min_impurity_split = 0
    dt.fit(X, y)
    return dt


def SER(dTree,
        node,
        X_target_node,
        y_target_node,
        original_ser=True,
        no_red_on_cl=False,
        cl_no_red=None,
        no_ext_on_cl=False,
        cl_no_ext=None,
        ext_cond=None,
        leaf_loss_quantify=False,
        leaf_loss_threshold=None,
        coeffs=None,
        root_source_values=None,
        Nkmin=None):
    """Structure Expansion / Reduction (SER) of `dTree`, in place, on the
    target data reaching `node`. Returns (node, bool_no_red)."""
    # Deep copy of the node values.
    old_values = dTree.tree_.value[node].copy()
    maj_class = np.argmax(dTree.tree_.value[node, :].copy())

    if cl_no_red is None:
        old_size_cl_no_red = 0
    else:
        old_size_cl_no_red = np.sum(dTree.tree_.value[node][:, cl_no_red])

    # These flags are booleans (the original tested `is not None`, which was
    # always true); the inner tests do the real gating.
    if no_red_on_cl or no_ext_on_cl:
        if no_ext_on_cl:
            cl = cl_no_ext[0]
        if no_red_on_cl:
            cl = cl_no_red[0]

    # cond1 / cond2 are only defined here; every later use is guarded by
    # leaf_loss_quantify, so they are never read when undefined.
    if leaf_loss_quantify and ((no_red_on_cl or no_ext_on_cl)
                               and maj_class == cl) \
            and dTree.tree_.feature[node] == -2:
        ps_rf = dTree.tree_.value[node, 0, :] / sum(dTree.tree_.value[node, 0, :])
        p1_in_l = dTree.tree_.value[node, 0, cl] / root_source_values[cl]
        cond1 = np.power(1 - p1_in_l, Nkmin) > leaf_loss_threshold
        cond2 = np.argmax(np.multiply(coeffs, ps_rf)) == cl

    ### VALUES UPDATE ###
    val = np.zeros((dTree.n_outputs_, dTree.n_classes_))
    for i in range(dTree.n_classes_):
        val[:, i] = list(y_target_node).count(i)
    dTree.tree_.value[node] = val
    dTree.tree_.n_node_samples[node] = np.sum(val)
    dTree.tree_.weighted_n_node_samples[node] = np.sum(val)

    def restore_node_values():
        # Put the source values back and propagate them to the parents.
        dTree.tree_.value[node] = old_values
        dTree.tree_.n_node_samples[node] = np.sum(old_values)
        dTree.tree_.weighted_n_node_samples[node] = np.sum(old_values)
        lib_tree.add_to_parents(dTree, node, old_values)

    if dTree.tree_.feature[node] == -2:  # leaf node
        if original_ser:
            if y_target_node.size > 0 and len(set(list(y_target_node))) > 1:
                # The predicted class follows the target data automatically
                # through the value update above.
                DT_to_add = _fit_full_target_tree(X_target_node, y_target_node)
                lib_tree.fusionDecisionTree(dTree, node, DT_to_add)
            return node, False
        else:
            bool_no_red = False
            cond_extension = False
            if y_target_node.size > 0:
                # Expansion.
                if not no_ext_on_cl:
                    DT_to_add = _fit_full_target_tree(X_target_node,
                                                      y_target_node)
                    lib_tree.fusionDecisionTree(dTree, node, DT_to_add)
                else:
                    cond_maj = (maj_class not in cl_no_ext)
                    cond_sub_target = ext_cond and (maj_class in y_target_node) \
                        and (maj_class in cl_no_ext)
                    cond_leaf_loss = leaf_loss_quantify and maj_class == cl \
                        and not (cond1 and cond2)
                    cond_extension = cond_maj or cond_sub_target or cond_leaf_loss
                    if cond_extension:
                        DT_to_add = _fit_full_target_tree(X_target_node,
                                                          y_target_node)
                        lib_tree.fusionDecisionTree(dTree, node, DT_to_add)
                    else:
                        # It is hard to avoid value inconsistencies when
                        # leaving the leaves untouched this way. It has no
                        # impact on the decision tree we want to obtain,
                        # though (it does impact the probabilistic tree).
                        restore_node_values()
                        if no_red_on_cl:
                            bool_no_red = True
            # No-reduction protection through the values.
            if no_red_on_cl and y_target_node.size == 0 \
                    and old_size_cl_no_red > 0 and maj_class in cl_no_red:
                if leaf_loss_quantify:
                    if cond1 and cond2:
                        restore_node_values()
                        bool_no_red = True
                else:
                    restore_node_values()
                    bool_no_red = True
            return node, bool_no_red

    ### Left / right target subsets ###
    bool_test = X_target_node[:, dTree.tree_.feature[node]] \
        <= dTree.tree_.threshold[node]
    not_bool_test = X_target_node[:, dTree.tree_.feature[node]] \
        > dTree.tree_.threshold[node]
    ind_left = np.where(bool_test)[0]
    ind_right = np.where(not_bool_test)[0]
    X_target_node_left = X_target_node[ind_left]
    y_target_node_left = y_target_node[ind_left]
    X_target_node_right = X_target_node[ind_right]
    y_target_node_right = y_target_node[ind_right]

    if original_ser:
        new_node_left, bool_no_red_l = SER(dTree,
                                           dTree.tree_.children_left[node],
                                           X_target_node_left,
                                           y_target_node_left,
                                           original_ser=True)
        # Node indices may shift during the recursion: recover this node.
        node, b = lib_tree.find_parent(dTree, new_node_left)
        new_node_right, bool_no_red_r = SER(dTree,
                                            dTree.tree_.children_right[node],
                                            X_target_node_right,
                                            y_target_node_right,
                                            original_ser=True)
        node, b = lib_tree.find_parent(dTree, new_node_right)
    else:
        new_node_left, bool_no_red_l = SER(
            dTree, dTree.tree_.children_left[node], X_target_node_left,
            y_target_node_left, original_ser=False,
            no_red_on_cl=no_red_on_cl, cl_no_red=cl_no_red,
            no_ext_on_cl=no_ext_on_cl, cl_no_ext=cl_no_ext,
            leaf_loss_quantify=leaf_loss_quantify,
            leaf_loss_threshold=leaf_loss_threshold, coeffs=coeffs,
            root_source_values=root_source_values, Nkmin=Nkmin)
        node, b = lib_tree.find_parent(dTree, new_node_left)
        new_node_right, bool_no_red_r = SER(
            dTree, dTree.tree_.children_right[node], X_target_node_right,
            y_target_node_right, original_ser=False,
            no_red_on_cl=no_red_on_cl, cl_no_red=cl_no_red,
            no_ext_on_cl=no_ext_on_cl, cl_no_ext=cl_no_ext,
            leaf_loss_quantify=leaf_loss_quantify,
            leaf_loss_threshold=leaf_loss_threshold, coeffs=coeffs,
            root_source_values=root_source_values, Nkmin=Nkmin)
        node, b = lib_tree.find_parent(dTree, new_node_right)

    if original_ser:
        bool_no_red = False
    else:
        bool_no_red = bool_no_red_l or bool_no_red_r

    # Reduction: prune when the leaf error is no worse than the subtree error.
    le = lib_tree.leaf_error(dTree.tree_, node)
    e = lib_tree.error(dTree.tree_, node)
    if le <= e:
        if original_ser:
            node = lib_tree.cut_into_leaf2(dTree, node)
        else:
            if no_red_on_cl:
                if not bool_no_red:
                    node = lib_tree.cut_into_leaf2(dTree, node)
                # else: pruning is avoided to protect cl_no_red
            else:
                node = lib_tree.cut_into_leaf2(dTree, node)

    # Drop children that no target data (and, if protected, no value) reaches.
    if dTree.tree_.feature[node] != -2:
        if original_ser:
            if ind_left.size == 0:
                node = lib_tree.cut_from_left_right(dTree, node, -1)
            if ind_right.size == 0:
                node = lib_tree.cut_from_left_right(dTree, node, 1)
        else:
            if no_red_on_cl:
                if ind_left.size == 0 and np.sum(dTree.tree_.value[
                        dTree.tree_.children_left[node]]) == 0:
                    node = lib_tree.cut_from_left_right(dTree, node, -1)
                if ind_right.size == 0 and np.sum(dTree.tree_.value[
                        dTree.tree_.children_right[node]]) == 0:
                    node = lib_tree.cut_from_left_right(dTree, node, 1)
            else:
                if ind_left.size == 0:
                    node = lib_tree.cut_from_left_right(dTree, node, -1)
                if ind_right.size == 0:
                    node = lib_tree.cut_from_left_right(dTree, node, 1)

    return node, bool_no_red
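
# Usage sketch (illustrative addition): adapt a source-trained tree to a
# small target sample with SER, starting from the root (node 0). SER works
# in place, so the source tree is copied first.
def _example_ser():
    from sklearn.tree import DecisionTreeClassifier
    rng = np.random.RandomState(0)
    # Source task.
    X_src = rng.rand(300, 4)
    y_src = (X_src[:, 0] > 0.5).astype(int)
    src_tree = DecisionTreeClassifier(max_depth=5).fit(X_src, y_src)
    # Target task: same labels, shifted decision boundary.
    X_tgt = rng.rand(30, 4)
    y_tgt = (X_tgt[:, 0] > 0.3).astype(int)
    tgt_tree = copy.deepcopy(src_tree)
    SER(tgt_tree, 0, X_tgt, y_tgt, original_ser=True)
    return tgt_tree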
def STRUT(decisiontree,
          node_index,
          X_target_node,
          Y_target_node,
          adapt_prop=False,
          coeffs=[1, 1],
          use_divergence=True,
          measure_default_IG=True):
    """STRUcture Transfer (STRUT): update the node values and thresholds of
    `decisiontree`, in place, with the target data reaching `node_index`."""
    phi = decisiontree.tree_.feature[node_index]
    classes = decisiontree.classes_
    threshold = decisiontree.tree_.threshold[node_index]

    current_class_distribution = lib_tree.compute_class_distribution(
        classes, Y_target_node)
    decisiontree.tree_.weighted_n_node_samples[node_index] = Y_target_node.size
    decisiontree.tree_.impurity[node_index] = lib_tree.GINI(
        current_class_distribution)
    decisiontree.tree_.n_node_samples[node_index] = Y_target_node.size

    # Leaf node: update the value and exit. sklearn marks missing children
    # with -1; the original test against -2 (the `feature` sentinel) could
    # never fire, so we accept any negative child index here.
    if decisiontree.tree_.children_left[node_index] < 0:
        decisiontree.tree_.value[node_index] = current_class_distribution
        return node_index

    # Prune the subtree if no target data reaches this node.
    is_reached_update = (current_class_distribution.sum() != 0)
    if not is_reached_update:
        p, b = lib_tree.find_parent(decisiontree, node_index)
        node_index = lib_tree.cut_from_left_right(decisiontree, p, b)
        return node_index

    # Update tree.value with the target data.
    decisiontree.tree_.value[node_index] = current_class_distribution

    # Only one class present in the node -> terminal leaf.
    if (current_class_distribution > 0).sum() == 1:
        node_index = lib_tree.cut_into_leaf2(decisiontree, node_index)
        return node_index

    # Update the threshold.
    if type(threshold) is np.float64:
        Q_source_l, Q_source_r = lib_tree.get_children_distributions(
            decisiontree, node_index)
        Sl = np.sum(Q_source_l)
        Sr = np.sum(Q_source_r)
        if adapt_prop:
            # Reweight the source children distributions with coeffs.
            Slt = Y_target_node.size
            Srt = Y_target_node.size
            D = np.sum(np.multiply(coeffs, Q_source_l))
            Q_source_l = (Slt / Sl) * np.multiply(coeffs,
                                                  np.divide(Q_source_l, D))
            D = np.sum(np.multiply(coeffs, Q_source_r))
            Q_source_r = (Srt / Sr) * np.multiply(coeffs,
                                                  np.divide(Q_source_r, D))

        Q_source_parent = lib_tree.get_node_distribution(
            decisiontree, node_index)

        # Candidate threshold t1: children in the source order.
        t1 = threshold_selection(Q_source_parent,
                                 Q_source_l.copy(),
                                 Q_source_r.copy(),
                                 X_target_node,
                                 Y_target_node,
                                 phi,
                                 classes,
                                 use_divergence=use_divergence,
                                 measure_default_IG=measure_default_IG)
        Q_target_l, Q_target_r = lib_tree.compute_Q_children_target(
            X_target_node, Y_target_node, phi, t1, classes)
        DG_t1 = lib_tree.DG(Q_source_l.copy(), Q_source_r.copy(),
                            Q_target_l, Q_target_r)

        # Candidate threshold t2: source children swapped.
        t2 = threshold_selection(Q_source_parent,
                                 Q_source_r.copy(),
                                 Q_source_l.copy(),
                                 X_target_node,
                                 Y_target_node,
                                 phi,
                                 classes,
                                 use_divergence=use_divergence,
                                 measure_default_IG=measure_default_IG)
        Q_target_l, Q_target_r = lib_tree.compute_Q_children_target(
            X_target_node, Y_target_node, phi, t2, classes)
        DG_t2 = lib_tree.DG(Q_source_r.copy(), Q_source_l.copy(),
                            Q_target_l, Q_target_r)

        if DG_t1 >= DG_t2:
            decisiontree.tree_.threshold[node_index] = t1
        else:
            decisiontree.tree_.threshold[node_index] = t2
            # Swap the children to match the swapped divergence gain.
            old_child_r_id = decisiontree.tree_.children_right[node_index]
            decisiontree.tree_.children_right[node_index] = \
                decisiontree.tree_.children_left[node_index]
            decisiontree.tree_.children_left[node_index] = old_child_r_id

    if decisiontree.tree_.children_left[node_index] != -1:
        # Target data routed to the left child by the updated threshold.
        threshold = decisiontree.tree_.threshold[node_index]
        index_X_child_l = X_target_node[:, phi] <= threshold
        X_target_child_l = X_target_node[index_X_child_l, :]
        Y_target_child_l = Y_target_node[index_X_child_l]
        node_index = STRUT(decisiontree,
                           decisiontree.tree_.children_left[node_index],
                           X_target_child_l,
                           Y_target_child_l,
                           adapt_prop=adapt_prop,
                           coeffs=coeffs,
                           use_divergence=use_divergence,
                           measure_default_IG=measure_default_IG)
        # IMPORTANT: node indices may shift; recover the parent.
        node_index, b = lib_tree.find_parent(decisiontree, node_index)

    if decisiontree.tree_.children_right[node_index] != -1:
        # Target data routed to the right child by the updated threshold.
        threshold = decisiontree.tree_.threshold[node_index]
        index_X_child_r = X_target_node[:, phi] > threshold
        X_target_child_r = X_target_node[index_X_child_r, :]
        Y_target_child_r = Y_target_node[index_X_child_r]
        node_index = STRUT(decisiontree,
                           decisiontree.tree_.children_right[node_index],
                           X_target_child_r,
                           Y_target_child_r,
                           adapt_prop=adapt_prop,
                           coeffs=coeffs,
                           use_divergence=use_divergence,
                           measure_default_IG=measure_default_IG)
        # IMPORTANT: node indices may shift; recover the parent.
        node_index, b = lib_tree.find_parent(decisiontree, node_index)

    return node_index
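
# Usage sketch (illustrative addition): re-tune the thresholds of a
# source-trained tree on target data with STRUT, starting from the root
# (node 0). STRUT also works in place; the default coeffs=[1, 1] matches
# the two classes used here.
def _example_strut():
    from sklearn.tree import DecisionTreeClassifier
    rng = np.random.RandomState(1)
    X_src = rng.rand(300, 4)
    y_src = (X_src[:, 0] > 0.5).astype(int)
    src_tree = DecisionTreeClassifier(max_depth=5).fit(X_src, y_src)
    X_tgt = rng.rand(50, 4)
    y_tgt = (X_tgt[:, 0] > 0.35).astype(int)
    tgt_tree = copy.deepcopy(src_tree)
    STRUT(tgt_tree, 0, X_tgt, y_tgt, use_divergence=True)
    return tgt_tree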