Example #1
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# Adaboost and ClassificationTree are assumed to come from this project's own modules.

def main():
    print('--- Adaboost ---')
    data = datasets.load_digits()
    X, y = data.data, data.target

    digit1 = 1
    digit2 = 8

    idx = np.append(np.where(y == digit1)[0], np.where(y == digit2)[0])
    y = data.target[idx]

    y[y == digit1] = 1
    y[y == digit2] = -1

    X = data.data[idx]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)

    clf = Adaboost(n_estimators=5)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    acc = accuracy_score(y_pred, y_test)

    clf_tree = ClassificationTree()
    clf_tree.fit(X_train, y_train)
    y_pred_tree = clf_tree.predict(X_test)
    acc_tree = accuracy_score(y_pred_tree, y_test)

    print("Adaboost_Accuracy:", acc)
    print("Tree_Accuracy:", acc_tree)
Example #2
    def fit(self, X, y):
        n_samples, n_features = X.shape[0], X.shape[1]

        # Initialize each sample's weight uniformly
        w = np.full(n_samples, (1 / n_samples))
        # Store each weak classifier
        self.clfs = []

        for i in tqdm(range(self.n_clfs)):
            # Instantiate a weak classifier
            clf = ClassificationTree()
            # Train the classifier
            clf.fit(X, y)
            # Get its predictions on the training set
            y_pred = clf.predict(X)
            # Compute the training error
            print(accuracy_score(y, y_pred))
            error = sum(w[y != y_pred])
            # For a classifier whose error rate exceeds 0.5, since this is a binary problem
            # (Adaboost only handles binary classification), we can flip its predictions so
            # that the error rate becomes 1 - error < 0.5.
            print(error)
            if error > 0.5:
                self.polarity[i] = -1
                y_pred *= -1
                error = 1 - error
            self.alphas[i] = 0.5 * np.log((1.0 - error) / (error + 1e-10))
            predictions = np.array(self.polarity[i] * y_pred)
            w *= np.exp(-self.alphas[i] * y * predictions)
            w /= sum(w)
            self.clfs.append(clf)
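
The matching predict step is not part of this excerpt; below is a minimal sketch, assuming the clfs, alphas and polarity attributes populated by the fit above (a weighted vote whose sign gives the {+1, -1} label):

    def predict(self, X):
        # Accumulate each weak learner's (possibly flipped) vote, weighted by its alpha
        agg = np.zeros(X.shape[0])
        for i, clf in enumerate(self.clfs):
            agg += self.alphas[i] * self.polarity[i] * clf.predict(X)
        # The sign of the weighted sum is the predicted {+1, -1} label
        return np.sign(agg)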
Example #3
    def evolve(self, X, y, n_iter=7, min_size_prune=1):
        self.X = X
        self.y = y
        for i in range(n_iter):
            #print("TAO iter ", i, " di ", n_iter)
            for depth in reversed(range(self.classification_tree.depth + 1)):
                #print("Ottimizzo depth", depth, "....")
                T = self.classification_tree
                nodes = ClassificationTree.get_nodes_at_depth(depth, T)
                #print ([node.id for node in nodes])

                for node in nodes:
                    self.optimize_nodes(node)
                #pool = Pool(4)
                #pool.map(self.optimize_nodes, nodes)
                #pool.close()
                #pool.join()

                #for node in nodes:
                #self.optimize_nodes(node)

                # Reassign the data indices associated with each node
                self.classification_tree.build_idxs_of_subtree(
                    X, range(len(X)), T.tree[0], oblique=T.oblique)
        # Perform the final pruning to remove dead branches and pure subtrees
        #self.prune(min_size = min_size_prune)
        ClassificationTree.restore_tree(self.classification_tree)
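
get_nodes_at_depth is called here but not shown in the excerpt; a plausible breadth-first sketch, assuming each node stores its depth and its left_node/right_node children as in the other examples:

    @staticmethod
    def get_nodes_at_depth(depth, tree):
        # Walk the tree from the root (stored at tree.tree[0]) and collect
        # every node whose depth equals the requested level.
        nodes = []
        stack = [tree.tree[0]]
        while stack:
            node = stack.pop()
            if node is None:
                continue
            if node.depth == depth:
                nodes.append(node)
            elif node.depth < depth:
                stack.append(node.left_node)
                stack.append(node.right_node)
        return nodes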
Example #4
    def optimize_node_parallel(self, node, X, y, alfa, complexity):

        rho = 0
        if node.is_leaf:
            rho = 1
        error_best = ClassificationTree.misclassification_loss(
            node, X, y, node.data_idxs,
            self.classification_tree.oblique) + alfa * complexity

        #print("prima parallel")
        # Try the split
        if self.classification_tree.oblique:
            node_para, error_para = self.best_svm_split(
                node, X, y, alfa, complexity)
        else:
            node_para, error_para = self.best_parallel_split(
                node, X, y, alfa, complexity)
        #print(node.id, "  fatto parallel plit")

        if error_para < error_best:
            #print("error para migliore")
            error_best = error_para
            ClassificationTree.replace_node(node, node_para,
                                            self.classification_tree)

        # This check is needed because, in the SVM case, if the node is a pure leaf we cannot
        # fit an SVM, since it rightly expects more than one label. No generality is lost: if
        # the node were a pure leaf, creating two new children and optimizing would give one
        # pure leaf child and one child with no points, with the same loss as the parent, so
        # the original leaf would never end up being branched.
        if node_para.left_node != None and node_para.right_node != None:
            # Error using the left child; if it is better, replace the node with it
            error_lower = ClassificationTree.misclassification_loss(
                node_para.left_node, X, y, node.data_idxs,
                self.classification_tree.oblique) + alfa * (complexity + rho)
            if error_lower < error_best:
                #print("error lower migliore")
                ClassificationTree.replace_node(node, node_para.left_node,
                                                self.classification_tree)
                error_best = error_lower

            # Error using the right child; if it is better, replace the node with it
            error_upper = ClassificationTree.misclassification_loss(
                node_para.right_node, X, y, node.data_idxs,
                self.classification_tree.oblique) + alfa * (complexity + rho)
            if error_upper < error_best:
                #print("error upper migliore")
                ClassificationTree.replace_node(node, node_para.right_node,
                                                self.classification_tree)
                error_best = error_upper
        return node
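
misclassification_loss is used throughout these examples but its body is not shown; since accuracies are reported elsewhere as 1 - loss, a reasonable sketch is the fraction of the selected points that the subtree rooted at node misclassifies (predict_label and its signature are taken from Example #5, the rest is an assumption):

    @staticmethod
    def misclassification_loss(node, X, y, idxs, oblique):
        # Fraction of the points in idxs whose predicted label differs from the target
        idxs = list(idxs)
        if len(idxs) == 0:
            return 0.0
        predictions = ClassificationTree.predict_label(X[idxs], node, oblique)
        return np.count_nonzero(predictions - y[idxs]) / len(idxs)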
Example #5
def zero_one_loss(node, data, targets):
    # For every point in the node, check where the node would route it;
    # if the point ends up in a child that leads to a misclassification, it counts as an error.
    predictions = ClassificationTree.predict_label(data, node, False)
    n_misclassified = np.count_nonzero(targets - predictions)
    return n_misclassified
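
The subtraction trick above assumes numeric labels, so any mismatch yields a nonzero entry; for example:

targets = np.array([0, 1, 1, 0])
predictions = np.array([0, 1, 0, 0])
np.count_nonzero(targets - predictions)   # 1 misclassified point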
Example #6
    def evolve(self, X, y, alfa=0, max_iteration=1):
        ClassificationTree.restore_tree(self.classification_tree)
        complexity = self.classification_tree.n_leaves - 1
        T = self.classification_tree.tree
        self.X = X
        self.y = y
        i = 0

        while (i < max_iteration):
            optimized = []
            error_prev = ClassificationTree.misclassification_loss(
                T[0], X, y, T[0].data_idxs,
                self.classification_tree.oblique) + alfa * complexity

            values = list(self.classification_tree.tree.keys())
            random.shuffle(values)
            #print(values)
            #print("values: ", values)
            while (len(values) > 0):
                #print(complexity)
                node_id = values.pop()
                optimized.append(node_id)
                #print("optimizing node:", node_id)
                self.optimize_node_parallel(T[node_id], X, y, alfa, complexity)
                #print("nodo ottimizzato:  ", node_id)
                ids = ClassificationTree.restore_tree(self.classification_tree)
                complexity = self.classification_tree.n_leaves - 1
                #print("complexity: ", complexity)
                #print("ids: ", ids)
                values = list(set(ids) - set(optimized))
                #print("values dopo restore:  ", values)
                self.classification_tree.build_idxs_of_subtree(
                    X, range(len(X)), T[0], self.classification_tree.oblique)
                error_curr = ClassificationTree.misclassification_loss(
                    T[0], X, y, T[0].data_idxs,
                    self.classification_tree.oblique) + alfa * complexity
            #print(self.max_id)
            #print("i-->", i, "node: ", node_id)
            #for node_id in to_delete:
            #self.delete_node(node_id)

            i += 1
            #print("Ottimizzato nodi algoritmo ls: ", i, " volte")
            if np.abs(error_curr - error_prev) < 1e-01:
                break
Example #7
    def best_svm_split(self, node, X, y, alfa, complexity):
        T = self.classification_tree
        # Create a new dummy subtree rooted at this node
        new_node = TreeNode.copy_node(node)
        #if new_node.is_leaf:
        #complexity += 1
        rho = 0
        data = X[new_node.data_idxs]
        label = y[new_node.data_idxs]
        if not all(i == label[0] for i in label) and len(data) > 0:
            # This fits a one-vs-rest multiclass SVM
            clf = LinearSVC(tol=1e-6,
                            C=10,
                            max_iter=10000,
                            loss='squared_hinge',
                            penalty='l1',
                            dual=False)
            clf.fit(data, label)
            #for n_class in range(n_classes):
            #n_misclassified = np.count_nonzero(label-np.sign(np.dot(data, clf.coef_[n_class].T)+clf.intercept_[n_class]))
            # TODO: work out how to pick the best coefficients among the trained hyperplanes
            weights = clf.coef_.reshape((len(X[0])), )
            intercept = clf.intercept_
            if new_node.is_leaf:
                ClassificationTree.create_new_children(node,
                                                       X,
                                                       y,
                                                       self.max_id,
                                                       None,
                                                       None,
                                                       oblique=T.oblique,
                                                       weights=weights,
                                                       intercept=intercept)
                rho = 1
            else:
                new_node.weights = weights
                new_node.intercept = intercept
            return new_node, ClassificationTree.misclassification_loss(
                new_node, X, y, new_node.data_idxs,
                T.oblique) + alfa * (complexity + rho)
        return new_node, np.inf
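
For oblique trees the weights/intercept stored above replace the single-feature test; a hedged sketch of the routing rule the rest of the code presumably applies when a point descends the tree (the function name and the sign convention are assumptions):

def route_point(node, x, oblique):
    # Axis-parallel nodes compare one feature against a threshold;
    # oblique nodes compare the learned hyperplane value against zero.
    if oblique:
        go_left = np.dot(x, node.weights) + node.intercept <= 0
    else:
        go_left = x[node.feature] <= node.threshold
    return node.left_node if go_left else node.right_node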
def genetic_tree_optimization(n_trees, n_iter, depth, X, labels, oblique, X_valid, y_valid, CR=0.95, l = 1):
    clf = RandomForestClassifier(n_estimators = n_trees, max_depth=depth, random_state=0, min_samples_leaf= 4)
    clf.fit(X, labels)
    random_trees = clf.estimators_
    trees = []
    best_loss = np.inf
    best_tree = None
    for tree in random_trees:
        T = ClassificationTree(oblique = oblique)
        T.initialize_from_CART(X, labels, tree)
        tao_opt(T, X, labels)
        trees.append(T)
        ClassificationTree.build_idxs_of_subtree(X, range(len(labels)), T.tree[0], oblique)
        #ClassificationTree.restore_tree(T)
    #multi_optimize_tao(trees, X, labels)

    best_loss = np.inf
    best_tree = None
    for i in range(n_iter):
        print("Iter: ", i)
        #multi_optimize_evolution(trees, X, labels, CR)

        for k, tree in enumerate(trees):
            #optimize_evolution(tree, trees, X, labels, X_valid, y_valid, CR)
            trial = mutation(tree, trees, CR, X, labels, depth)
            tao_opt(trial, X, labels)
            trial_loss = regularized_loss(trial.tree[0], X, labels, X_valid, y_valid, range(len(labels)), oblique, l=l)
            loss = regularized_loss(tree.tree[0], X, labels, X_valid, y_valid, range(len(labels)), oblique, l=l)
            if trial_loss < loss:
                # Replace the parent in the population; rebinding the loop variable
                # alone would not update the trees list
                trees[k] = trial
                tree = trial
                loss = trial_loss
            if loss < best_loss:
                best_loss = loss
                best_tree = tree
                print ("best loss: ", best_loss)
                print("loss train best: ", 1-ClassificationTree.misclassification_loss(best_tree.tree[0], X, labels, range(len(labels)), oblique))
                print("loss valid: ", 1-ClassificationTree.misclassification_loss(best_tree.tree[0], X_valid, y_valid, range(len(y_valid)), oblique))

    print("ritorno loss train best: ", 1-ClassificationTree.misclassification_loss(best_tree.tree[0], X, labels, range(len(labels)), oblique))
    print("ritono loss valid: ", 1-ClassificationTree.misclassification_loss(best_tree.tree[0], X_valid, y_valid, range(len(y_valid)), oblique))
    return best_tree, best_loss
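
A minimal call, mirroring how the test driver further below invokes this function (the variable names are placeholders for any numeric training/validation split):

# Hypothetical usage: evolve 50 depth-3 axis-parallel trees for 5 generations
best_tree, best_loss = genetic_tree_optimization(
    n_trees=50, n_iter=5, depth=3,
    X=X_train, labels=y_train, oblique=False,
    X_valid=X_valid, y_valid=y_valid, CR=0.95, l=1)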
def crossover(tree, trees, CR, X):
    partners = random.sample(trees, 3)
    partners = [ClassificationTree.copy_tree(partners[0]), ClassificationTree.copy_tree(partners[1]), ClassificationTree.copy_tree(partners[2])]
    tree_nodes = list(tree.tree.values())
    d = random.choice(range(1, tree.depth))   # pick a random non-root depth level of this tree
    #for d in reversed(range(tree.depth-1)):
    nodes_at_depth = ClassificationTree.get_nodes_at_depth(d, tree)
    other_nodes_at_depth = ClassificationTree.get_nodes_at_depth(d, partners[0])
    #other_nodes_at_depth.extend(ClassificationTree.get_nodes_at_depth(d, partners[1]))
    #other_nodes_at_depth.extend(ClassificationTree.get_nodes_at_depth(d, partners[2]))
    #for node in nodes_at_depth:
    node = random.choice(nodes_at_depth)
    p = np.random.uniform()
    if p < CR:
        choice = random.choice(other_nodes_at_depth)

        if choice.left_node != None and not choice.left_node.is_leaf and choice.right_node != None and not choice.right_node.is_leaf:
            p2 = np.random.uniform()
            if p2 < 0.5:
                node.feature = choice.left_node.feature
                node.threshold = choice.left_node.threshold
                #ClassificationTree.replace_node(node, choice.left_node, tree)
            else:
                node.feature = choice.right_node.feature
                node.threshold = choice.right_node.threshold
                #ClassificationTree.replace_node(node, choice.right_node, tree)

        else:
            #ClassificationTree.replace_node(node, choice, tree)
            node.feature = choice.feature
            node.threshold = choice.threshold
        '''
        if choice.parent_id != -1:
            node.feature = tree.tree[choice.parent_id].feature
            node.threshold = tree.tree[choice.parent_id].threshold
        '''

    else:
        node.feature = random.choice(range(len(X[0])))
        node.threshold = 0
Example #10
def best_split(node, X, labels):
    error_best = np.inf
    j = node.feature

    vals = {}
    best_node = None
    for point_idx in node.data_idxs:
        vals[point_idx] = X[point_idx, j]

    values = sorted(vals.items(), key=lambda x: x[1])
    sorted_indexes = [idx for idx, _ in values]
    #print ("len sort indx: ", len(sorted_indexes))
    thresh = 0.5 * X[sorted_indexes[0], j]
    node.threshold = thresh
    ClassificationTree.create_new_children(node, X, labels, node.id, j, thresh,
                                           False)

    actual_loss = zero_one_loss(node, X, labels)
    # Loop over every value of this feature and try every possible split
    i = 0
    while i < len(sorted_indexes):
        if i < len(sorted_indexes) - 1:
            thresh = 0.5 * (X[sorted_indexes[i], j] +
                            X[sorted_indexes[i + 1], j])
        else:
            thresh = 1.5 * X[sorted_indexes[i], j]
        node.threshold = thresh
        ClassificationTree.create_new_children(node, X, labels, node.id, j,
                                               thresh, False)

        actual_loss = zero_one_loss(node, X, labels)
        if actual_loss < error_best:
            error_best = actual_loss
            best_left = node.left_node
            best_right = node.right_node
            best_t = thresh
        i += 1

    return best_t, best_left, best_right
def optimize_evolution(tree, trees, X, labels, X_valid, y_valid, CR):
    trial = ClassificationTree.copy_tree(tree)
    '''
    n_nodes = len(X[0])/(2**tree.depth - 1)
    if n_nodes < 1:
        n_nodes = 1


    p = np.random.uniform()
    for node in nodes:
        if p < CR and not node.is_leaf:
            node.feature = np.random.randint(0, len(X[0]))
            node.threshold = np.random.uniform(np.min(X[node.feature]),np.max(X[node.feature]))
    '''
    crossover(trial, trees, CR, X)
    ClassificationTree.build_idxs_of_subtree(X, range(len(labels)), trial.tree[0], oblique)
    #partners = random.sample(trees, 2)
    #optimize_crossover(trial, ClassificationTree.copy_tree(partners[0]), ClassificationTree.copy_tree(partners[1]), depth, X, labels, oblique)
    tao_opt(trial, X, labels)
    loss = regularized_loss(trial.tree[0], X, labels, X_valid, y_valid, range(len(labels)), oblique)
    # If the new individual is better than its parent, swap them
    if loss < regularized_loss(tree.tree[0], X, labels, X_valid, y_valid, range(len(labels)), oblique):
    #if loss < ClassificationTree.misclassification_loss(tree.tree[0], X, labels, range(len(labels)), oblique):
        tree = trial   # note: this only rebinds the local name, it does not modify the caller's tree
Example #12
    def __init__(self, n_estimators=10, feature_num=None, min_samples_split=2, min_impurity=1e-7,
                 max_depth=10):
        self.n_estimators = n_estimators                            # number of decision trees
        self.min_samples_split = min_samples_split                  # minimum number of samples required to split a node
        self.min_impurity = min_impurity                            # minimum impurity (i.e. maximum purity) at which to stop splitting
        self.max_depth = max_depth                                  # maximum depth of each tree
        self.feature_num = feature_num                              # number of features used by each tree
        

        self.trees = []
        # Instantiate each decision tree
        for _ in range(self.n_estimators):
            tree = ClassificationTree(min_impurity=self.min_impurity,
                                      min_samples_split=self.min_samples_split,
                                      max_depth=self.max_depth)
            self.trees.append(tree)
        self.feature_index = []
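
This excerpt stops after __init__; a minimal sketch of how feature_num and feature_index might be used at fit time, assuming bootstrap rows and a random feature subset per tree (the bagging fit shown further below belongs to a different forest implementation):

    def fit(self, X, y):
        n_samples, n_features = X.shape
        if self.feature_num is None:
            # A common default: roughly the square root of the feature count
            self.feature_num = max(1, int(np.sqrt(n_features)))
        for tree in self.trees:
            # Bootstrap sample of the rows and a random subset of the columns
            rows = np.random.choice(n_samples, n_samples, replace=True)
            cols = np.random.choice(n_features, self.feature_num, replace=False)
            self.feature_index.append(cols)
            tree.fit(X[rows][:, cols], y[rows])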
def optimize_crossover(tree_a, tree_b, tree_c, depth, X, labels, oblique):
    d = random.choice(range(depth))
    #for d in reversed(range(depth)):
    nodes_a = ClassificationTree.get_nodes_at_depth(d, tree_a)
    nodes_a = [node for node in nodes_a if not node.is_leaf]
    all_nodes = ClassificationTree.get_nodes_at_depth(d, tree_a)
    all_nodes.extend(ClassificationTree.get_nodes_at_depth(d, tree_b))
    all_nodes.extend(ClassificationTree.get_nodes_at_depth(d, tree_c))
    all_nodes = [node for node in all_nodes if not node.is_leaf]
    #print (node.id for node in nodes_a)
    #worst = nodes_a[np.argmax([ClassificationTree.misclassification_loss(node, X, labels, node.data_idxs, oblique) for node in nodes_a])]
    for node in nodes_a:
        if len(node.data_idxs) > 0:
            best = find_best_branch(all_nodes,  node.data_idxs, X, labels, oblique)
            best.data_idxs = node.data_idxs
            ClassificationTree.replace_node(node, best, tree_a)
    def fit(self, X, y):
        """
        Grows a forest of decision trees based on the num_trees
        attribute.

        Parameters
        ----------
        X : N x D matrix of real or ordinal values
        y : size-N vector of real values or labels, one per corresponding
            row of X
        """

        data = np.column_stack((X, y))
        self.forest = np.empty(shape=self.num_trees, dtype='object')
        sample_size = int(X.shape[0] * self.sample_percentage)

        for i in range(self.num_trees):
            sample = data[np.random.choice(data.shape[0], sample_size, replace=True)]

            sampled_X = sample[:, :sample.shape[1] - 1]
            sampled_y = sample[:, sample.shape[1] - 1]

            if isinstance(self, RegressionForest):
                tree = RegressionTree(
                    max_depth=self.max_depth,
                    min_size=self.min_size,
                    in_forest=True)
            else:
                tree = ClassificationTree(
                    cost_func=self.cost_func,
                    max_depth=self.max_depth,
                    min_size=self.min_size,
                    in_forest=True)

            tree.fit(sampled_X, sampled_y)
            self.forest[i] = tree
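
The matching predict is not shown; for the classification case, a short majority-vote sketch over self.forest (attribute name taken from the fit above) would be:

    def predict(self, X):
        # Collect one prediction per tree, then take the most frequent label per row
        votes = np.array([tree.predict(X) for tree in self.forest])
        predictions = []
        for column in votes.T:
            labels, counts = np.unique(column, return_counts=True)
            predictions.append(labels[np.argmax(counts)])
        return np.array(predictions)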
Example #15
    def prune(self, min_size=1):
        # First check whether there are pure subtrees:
        # walk the tree and look for branch nodes that only receive points of a single label.
        # Then check whether the current node is dead.
        T = self.classification_tree
        stack = [T.tree[0]]
        while (stack):
            actual = stack.pop()
            if len(actual.data_idxs) > 0:
                # If the node is pure
                if not actual.is_leaf and all(
                        i == self.y[actual.data_idxs[0]]
                        for i in self.y[actual.data_idxs]):
                    # Turn this node into a leaf whose value is that label
                    actual.is_leaf = True
                    actual.left_node = None
                    actual.right_node = None
                    actual.left_node_id = -1
                    actual.right_node_id = -1
                    actual.value = self.y[actual.data_idxs[0]]

                # If the node has a dead child, replace the parent with the other child
                elif not actual.is_leaf and len(
                        actual.left_node.data_idxs) < min_size:
                    stack.append(actual.right_node)
                    ClassificationTree.replace_node(actual, actual.right_node,
                                                    T)
                elif not actual.is_leaf and len(
                        actual.right_node.data_idxs) < min_size:
                    stack.append(actual.left_node)
                    ClassificationTree.replace_node(actual, actual.left_node,
                                                    T)
                elif not actual.is_leaf:
                    stack.append(actual.right_node)
                    stack.append(actual.left_node)
            ClassificationTree.restore_tree(T)
Example #16
    y_train = label[0:valid_id]

    X_valid = data[valid_id:]
    y_valid = label[valid_id:]
    '''
    X_train, X_valid, y_train, y_valid = train_test_split(data,
                                                          label,
                                                          stratify=label,
                                                          test_size=0.2)

    clf = DecisionTreeClassifier(random_state=0,
                                 max_depth=3,
                                 min_samples_leaf=10)
    clf.fit(X_train, y_train)

    T = ClassificationTree(oblique=False)
    T.initialize_from_CART(X_train, y_train, clf)
    T.compute_prob(X_train, y_train)
    cart_auc_train += T.auc(X_train, y_train)
    cart_auc_valid += T.auc(X_valid, y_valid)
    #tao_train_score+=1-T.misclassification_loss(X_train, y_train, T.tree[0])
    #print ("score before: ", tao_train_score)
    #x = data[8]
    #print (T.predict_label(x.reshape((1, -1)), 0))
    #print (clf.predict(x.reshape((1, -1))))
    #print ("x--->", x)
    #print(T.get_path_to(x, 0))
    #T.print_tree_structure()

    #print ("T acc -> ", 1-T.misclassification_loss(data, label, T.tree[0]))
    #print ("clf acc -> ", clf.score(data, label))
Example #17
    def best_parallel_split(self, node, X, y, alfa, complexity):

        T = self.classification_tree

        # Create a new dummy subtree rooted at this node
        new_node = TreeNode.copy_node(node)
        error_best = np.inf
        #if new_node.is_leaf:
        #complexity += 1
        was_leaf = False
        improve = False
        rho = 0
        if new_node.is_leaf:
            was_leaf = True
            rho = 1

        if new_node.data_idxs:
            for j in range(len(X[0])):

                # Take the j-th component of every point in the node and sort
                # them in increasing order
                vals = {}
                for point_idx in new_node.data_idxs:
                    vals[point_idx] = X[point_idx, j]

                values = sorted(vals.items(), key=lambda x: x[1])
                sorted_indexes = [idx for idx, _ in values]

                #plt.scatter(X[sorted_indexes, j], range(len(values)), s=0.4, c=list(correct_classification_tuples.values()))
                #plt.show()
                new_node.feature = j
                #if j==2:
                #base = actual_loss
                #actual_loss = self.binary_loss(node_id, X, y, sorted_indexes[i], correct_classification_tuples[sorted_indexes[i]], actual_loss, thresh)
                #print ("loss: ", actual_loss, "n punti: ", len(care_points_idxs))
                #print("vecchia: ", self.vecchia_loss(node_id, X, y, care_points_idxs, correct_classification_tuples, thresh))
                # Loop over every value of this feature and try every possible split
                '''
                new_node.threshold = 0.5*X[sorted_indexes[0], j]
                '''
                i = -1
                actual_loss = ClassificationTree.misclassification_loss(
                    new_node, X, y, sorted_indexes, self.classification_tree.
                    oblique) + alfa * (complexity + rho)
                while i < len(sorted_indexes):

                    pre_thresh = new_node.threshold
                    #print("Ottimizzo best parallel: ", i*100/len(sorted_indexes))

                    if i < 0:
                        thresh = 0.5 * X[sorted_indexes[0], j]
                    elif i < len(sorted_indexes) - 1:
                        thresh = 0.5 * (X[sorted_indexes[i], j] +
                                        X[sorted_indexes[i + 1], j])
                    else:
                        thresh = 1.5 * X[sorted_indexes[i], j]

                    new_node.threshold = thresh
                    '''
                    #Se il nodo da ottimizzare era una foglia allora dobbiamo creare le foglie figlie
                    #queste vengono ottimizzate subito per maggioranza
                    if was_leaf:
                        self.create_new_children(new_node, j, thresh)
                        inc, k = self.binary_loss(X, new_node, sorted_indexes, i, y, pre_thresh)
                        actual_loss += inc
                    else:
                        inc, k = self.binary_loss(X, new_node, sorted_indexes, i, y, pre_thresh)
                        actual_loss += inc

                    '''
                    # If the node being optimized was a leaf, we must create its child leaves;
                    # these are immediately set by majority vote
                    if was_leaf:
                        ClassificationTree.create_new_children(
                            new_node,
                            X,
                            y,
                            self.max_id,
                            j,
                            thresh,
                            oblique=T.oblique)

                    actual_loss = ClassificationTree.misclassification_loss(
                        new_node, X, y, sorted_indexes,
                        self.classification_tree.oblique) + alfa * (
                            complexity + rho)

                    if actual_loss < error_best:
                        improve = True
                        error_best = actual_loss
                        best_t = thresh
                        best_feature = j
                    i += 1

            #print ("error best: ", error_best)
            new_node.threshold = best_t
            new_node.feature = best_feature
            if was_leaf and improve:
                self.max_id += 2
        return new_node, error_best
def regularized_loss(node, X, labels, X_valid, y_valid, idxs, oblique, n_classes=2, l=0.9):
    #print(len(node.data_idxs))
    # Training misclassification loss plus a validation-set term weighted by l
    return (ClassificationTree.misclassification_loss(node, X, labels, idxs, oblique)
            + l * ClassificationTree.misclassification_loss(node, X_valid, y_valid, range(len(y_valid)), oblique))  #+ l*gini(node, X, labels, idxs, oblique, n_classes)
def test(n_runs, ls_train, ls_test, svm_train, svm_test, random_train, random_test, cart_train, tao_train, global_train, cart_test, tao_test, global_test):
    for run in range(n_runs):
        depth = 3
        oblique = False
        n_trees = 200
        n_iter = 5
        data = np.load('cancer_train.npy')
        y = np.load('cancer_label.npy')
        print ("Run -> ", run)
        idx = np.random.permutation(len(data))
        data = data[idx]
        y = y[idx]
        train_split = 0.50
        valid_split = 0.75
        #data = dataset.data[idx]
        #label = dataset.target[idx]
        train_id = int(len(data)*train_split)
        valid_id = int(len(data)*valid_split)
        X = data[0:train_id]
        labels = y[0:train_id]

        X_valid = data[train_id:valid_id]
        y_valid = y[train_id:valid_id]

        X_test = data[valid_id:]
        y_test = y[valid_id:]

        #CART
        clf = DecisionTreeClassifier(random_state=0, max_depth=depth, min_samples_leaf=4)
        clf.fit(X, labels)


        #TAO
        T = ClassificationTree(oblique = oblique)
        T.initialize_from_CART(X, labels, clf)
        tao = TAO(T)
        tao.evolve(X, labels)

        T.print_tree_structure()
        #LS

        '''
        L = ClassificationTree(oblique = oblique)
        L.initialize_from_CART(X, labels, clf)
        ls = LocalSearch(L)
        ls.evolve(X, labels, alfa=1000000, max_iteration=10)
        '''


        #SVM
        svm = LinearSVC(tol=1e-6, max_iter=10000, dual=False)
        svm.fit(X, labels)

        #RandomForest
        random_for = RandomForestClassifier(n_estimators = n_trees, max_depth=depth, random_state=0, min_samples_leaf= 4)
        random_for.fit(X, labels)

        #Genetic
        best_t, best_loss = genetic_tree_optimization(n_trees, n_iter, depth, X, labels, oblique, X_valid, y_valid, CR = 0, l = 0)
        #best_t.print_tree_structure()

        best_t.print_tree_structure()
        #Train Score
        cart_train.append(clf.score(X, labels))
        #ls_train.append(1-ClassificationTree.misclassification_loss(L.tree[0], X, labels, range(len(labels)), oblique))
        tao_train.append(1-ClassificationTree.misclassification_loss(T.tree[0], X, labels, range(len(labels)), oblique))
        global_train.append(1-ClassificationTree.misclassification_loss(best_t.tree[0], X, labels, range(len(labels)), oblique))
        svm_train.append(svm.score(X, labels))
        random_train.append(random_for.score(X, labels))

        #Test Score
        cart_test.append(clf.score(X_test, y_test))
        #ls_test.append(1-ClassificationTree.misclassification_loss(L.tree[0], X_test, y_test, range(len(y_test)), oblique))
        tao_test.append(1-ClassificationTree.misclassification_loss(T.tree[0], X_test, y_test, range(len(y_test)), oblique))
        global_test.append(1-ClassificationTree.misclassification_loss(best_t.tree[0], X_test, y_test, range(len(y_test)), oblique))
        svm_test.append(svm.score(X_test, y_test))
        random_test.append(random_for.score(X_test, y_test))
Example #20
    def TAO_best_parallel_split(self, node_id, X, y, care_points_idxs,
                                correct_classification_tuples, T):
        #print ("node id:", node_id)
        error_best = np.inf
        best_t = T.tree[node_id].threshold
        best_feature = T.tree[node_id].feature
        if self.train_on_all_features:
            features = range(len(X[0]))
        else:
            features = ClassificationTree.get_features(T)

        for j in features:
            #print(j)
            # Take the j-th component of every point and sort them
            # in increasing order
            vals = {}
            for point_idx in care_points_idxs:
                vals[point_idx] = X[point_idx, j]

            values = sorted(vals.items(), key=lambda x: x[1])
            sorted_indexes = [idx for idx, _ in values]
            #plt.scatter(X[sorted_indexes, j], range(len(values)), s=0.4, c=list(correct_classification_tuples.values()))
            #plt.show()
            T.tree[node_id].feature = j
            thresh = 0.5 * (X[sorted_indexes[0], j] + X[sorted_indexes[1], j])
            actual_loss, start = self.zero_one_loss(
                node_id, X, y, sorted_indexes, correct_classification_tuples,
                thresh, sorted_indexes)

            #if j==2:
            #base = actual_loss
            #actual_loss = self.binary_loss(node_id, X, y, sorted_indexes[i], correct_classification_tuples[sorted_indexes[i]], actual_loss, thresh)
            #print ("loss: ", actual_loss, "n punti: ", len(care_points_idxs))
            #print("vecchia: ", self.vecchia_loss(node_id, X, y, care_points_idxs, correct_classification_tuples, thresh))
            # Loop over every value of this feature and try every possible split
            if actual_loss < error_best:
                error_best = actual_loss
                best_t = thresh
                best_feature = j
            i = start
            while i < len(sorted_indexes) - 1:

                thresh = 0.5 * (X[sorted_indexes[i], j] +
                                X[sorted_indexes[i + 1], j])

                #sum, k = self.binary_loss(X, j, sorted_indexes, i, correct_classification_tuples)
                #actual_loss += sum

                # Some debug prints
                '''
                if j==2 and node_id==3:
                    ones = [correct_classification_tuples[k] for k in sorted_indexes]
                    print("base: ", base, "  n punti:", len(sorted_indexes), "    loss:", actual_loss, "     thresh:", thresh, "     x:", X[sorted_indexes[i], j], "    prediction: ", correct_classification_tuples[sorted_indexes[i]])
                    print(X[sorted_indexes, j])
                    print()
                '''
                #actual_loss = self.zero_one_loss(node_id, X, y, care_points_idxs, correct_classification_tuples, thresh)

                s, k = self.binary_loss(X, j, sorted_indexes, i,
                                        correct_classification_tuples)
                #print ("dopo binary")
                actual_loss += s
                if actual_loss < error_best:
                    error_best = actual_loss
                    best_t = thresh
                    best_feature = j
                i += k
            '''
            #print(error_best)
            #errors = [self.binary_loss(node_id, X, y, care_points_idxs, correct_classification_tuples, threshes[i]) for i in range(len(values)-1)]
            #min_index = np.argmin(errors)
        '''
        # More debug prints
        #if node_id==1:
        #print("finale: ", error_best)
        #print ("ones:", ones, " len:  ", len(ones), "   care_p_len:", len(care_points_idxs))
        #print("best_feature:", best_feature, "   best_thresh:", best_t)

        return best_t, best_feature
Example #21
                "node %s." % (
                    actual_node.depth * "\t",
                    actual_node.id,
                    actual_node.parent_id,
                    actual_node.left_node_id,
                    actual_node.feature,
                    actual_node.threshold,
                    actual_node.right_node_id,
                ))
            stack.append(actual_node.left_node)
            stack.append(actual_node.right_node)

    spear_train += 1 - zero_one_loss(node, X, labels) / len(labels)
    spear_valid += 1 - zero_one_loss(node, X_valid, y_valid) / len(y_valid)

    clf = DecisionTreeClassifier(random_state=0,
                                 max_depth=3,
                                 min_samples_leaf=4)
    clf.fit(X, labels)
    clf_train += clf.score(X, labels)
    clf_valid += clf.score(X_valid, y_valid)

    L = ClassificationTree(oblique=False)
    L.initialize_from_CART(X, labels, clf)

    L.print_tree_structure()
print("clf train: ", clf_train / 30)
print("spearman train: ", spear_train / 30)
print("clf valid: ", clf_valid / 30)
print("spearman valid: ", spear_valid / 30)