Example #1
    def optimize_node_parallel(self, node, X, y, alfa, complexity):

        rho = 0
        if node.is_leaf:
            rho = 1
        error_best = ClassificationTree.misclassification_loss(
            node, X, y, node.data_idxs,
            self.classification_tree.oblique) + alfa * complexity

        #print("prima parallel")
        #Provo lo split
        if self.classification_tree.oblique:
            node_para, error_para = self.best_svm_split(
                node, X, y, alfa, complexity)
        else:
            node_para, error_para = self.best_parallel_split(
                node, X, y, alfa, complexity)
        #print(node.id, "  parallel split done")

        if error_para < error_best:
            #print("error para migliore")
            error_best = error_para
            ClassificationTree.replace_node(node, node_para,
                                            self.classification_tree)

        #This check is needed because, in the SVM case, a pure leaf cannot be split:
        #the SVM rightly expects more than one label. No generality is lost, since
        #branching a pure leaf and optimizing the two new children would give one
        #pure leaf child and one child without points, with the same loss as the
        #parent. So in the end the original leaf would not be branched anyway.
        if node_para.left_node is not None and node_para.right_node is not None:
            #Error of the left child: if it is better, replace the node with it
            error_lower = ClassificationTree.misclassification_loss(
                node_para.left_node, X, y, node.data_idxs,
                self.classification_tree.oblique) + alfa * (complexity + rho)
            if error_lower < error_best:
                #print("error lower migliore")
                ClassificationTree.replace_node(node, node_para.left_node,
                                                self.classification_tree)
                error_best = error_lower

            #Error of the right child: if it is better, replace the node with it
            error_upper = ClassificationTree.misclassification_loss(
                node_para.right_node, X, y, node.data_idxs,
                self.classification_tree.oblique) + alfa * (complexity + rho)
            if error_upper < error_best:
                #print("error upper migliore")
                ClassificationTree.replace_node(node, node_para.right_node,
                                                self.classification_tree)
                error_best = error_upper
        return node
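
A note on the logic above: optimize_node_parallel compares the cost of keeping the node as it is, replacing it with the best new split, or collapsing it into its left or right child, and keeps the cheapest option. A minimal standalone sketch of that selection step, with hypothetical move names and made-up costs (not part of the original classes):

def pick_best_move(current_cost, candidate_costs):
    #Return the cheapest move; the node is kept when nothing improves on current_cost
    best_name, best_cost = "keep", current_cost
    for name, cost in candidate_costs.items():
        if cost < best_cost:
            best_name, best_cost = name, cost
    return best_name, best_cost

print(pick_best_move(0.30, {"split": 0.22, "left_child": 0.35, "right_child": 0.41}))
#-> ('split', 0.22)
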
Example #2
    def evolve(self, X, y, alfa=0, max_iteration=1):
        ClassificationTree.restore_tree(self.classification_tree)
        complexity = self.classification_tree.n_leaves - 1
        T = self.classification_tree.tree
        self.X = X
        self.y = y
        i = 0

        while i < max_iteration:
            optimized = []
            error_prev = ClassificationTree.misclassification_loss(
                T[0], X, y, T[0].data_idxs,
                self.classification_tree.oblique) + alfa * complexity

            values = list(self.classification_tree.tree.keys())
            random.shuffle(values)
            #print(values)
            #print("values: ", values)
            while len(values) > 0:
                #print(complexity)
                node_id = values.pop()
                optimized.append(node_id)
                #print("optimizing node:", node_id)
                self.optimize_node_parallel(T[node_id], X, y, alfa, complexity)
                #print("nodo ottimizzato:  ", node_id)
                ids = ClassificationTree.restore_tree(self.classification_tree)
                complexity = self.classification_tree.n_leaves - 1
                #print("complexity: ", complexity)
                #print("ids: ", ids)
                values = list(set(ids) - set(optimized))
                #print("values dopo restore:  ", values)
                self.classification_tree.build_idxs_of_subtree(
                    X, range(len(X)), T[0], self.classification_tree.oblique)
                error_curr = ClassificationTree.misclassification_loss(
                    T[0], X, y, T[0].data_idxs,
                    self.classification_tree.oblique) + alfa * complexity
            #print(self.max_id)
            #print("i-->", i, "node: ", node_id)
            #for node_id in to_delete:
            #self.delete_node(node_id)

            i += 1
            #print("Ottimizzato nodi algoritmo ls: ", i, " volte")
            if np.abs(error_curr - error_prev) < 1e-01:
                break
def genetic_tree_optimization(n_trees, n_iter, depth, X, labels, oblique, X_valid, y_valid, CR=0.95, l=1):
    clf = RandomForestClassifier(n_estimators=n_trees, max_depth=depth, random_state=0, min_samples_leaf=4)
    clf.fit(X, labels)
    random_trees = clf.estimators_
    trees = []
    best_loss = np.inf
    best_tree = None
    for tree in random_trees:
        T = ClassificationTree(oblique = oblique)
        T.initialize_from_CART(X, labels, tree)
        tao_opt(T, X, labels)
        trees.append(T)
        ClassificationTree.build_idxs_of_subtree(X, range(len(labels)), T.tree[0], oblique)
        #ClassificationTree.restore_tree(T)
    #multi_optimize_tao(trees, X, labels)

    best_loss = np.inf
    best_tree = None
    for i in range(n_iter):
        print("Iter: ", i)
        #multi_optimize_evolution(trees, X, labels, CR)

        for idx, tree in enumerate(trees):
            #optimize_evolution(tree, trees, X, labels, X_valid, y_valid, CR)
            trial = mutation(tree, trees, CR, X, labels, depth)
            tao_opt(trial, X, labels)
            trial_loss = regularized_loss(trial.tree[0], X, labels, X_valid, y_valid, range(len(labels)), oblique, l=l)
            loss = regularized_loss(tree.tree[0], X, labels, X_valid, y_valid, range(len(labels)), oblique, l=l)
            if trial_loss < loss:
                #The trial is better: it replaces its parent in the population
                trees[idx] = tree = trial
                loss = trial_loss
            if loss < best_loss:
                best_loss = loss
                best_tree = tree
                print("best loss: ", best_loss)
                print("train acc of best: ", 1 - ClassificationTree.misclassification_loss(best_tree.tree[0], X, labels, range(len(labels)), oblique))
                print("valid acc of best: ", 1 - ClassificationTree.misclassification_loss(best_tree.tree[0], X_valid, y_valid, range(len(y_valid)), oblique))

    print("ritorno loss train best: ", 1-ClassificationTree.misclassification_loss(best_tree.tree[0], X, labels, range(len(labels)), oblique))
    print("ritono loss valid: ", 1-ClassificationTree.misclassification_loss(best_tree.tree[0], X_valid, y_valid, range(len(y_valid)), oblique))
    return best_tree, best_loss
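
The loop above follows the usual greedy selection of differential-evolution-style searches: each mutated trial tree replaces its parent only when its regularized loss improves. A minimal sketch of that selection step over plain loss values (made-up numbers, no tree objects):

population_losses = [0.40, 0.35, 0.50]   #current loss of each tree in the population
trial_losses = [0.38, 0.36, 0.30]        #loss of the mutated trial built from each tree

for idx, (loss, trial_loss) in enumerate(zip(population_losses, trial_losses)):
    if trial_loss < loss:                #keep the better of parent and trial
        population_losses[idx] = trial_loss

print(population_losses)
#-> [0.38, 0.35, 0.3]
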
Example #4
 def best_svm_split(self, node, X, y, alfa, complexity):
     T = self.classification_tree
     #Create a new fictitious subtree rooted at this node
     new_node = TreeNode.copy_node(node)
     #if new_node.is_leaf:
     #complexity += 1
     rho = 0
     data = X[new_node.data_idxs]
     label = y[new_node.data_idxs]
     if len(data) > 0 and not all(i == label[0] for i in label):
         #This fits a multiclass one-vs-rest linear SVM
         clf = LinearSVC(tol=1e-6,
                         C=10,
                         max_iter=10000,
                         loss='squared_hinge',
                         penalty='l1',
                         dual=False)
         clf.fit(data, label)
         #for n_class in range(n_classes):
         #n_misclassified = np.count_nonzero(label-np.sign(np.dot(data, clf.coef_[n_class].T)+clf.intercept_[n_class]))
         #TODO: figure out how to pick the best coefficients among the trained hyperplanes
         weights = clf.coef_.reshape((len(X[0])), )
         intercept = clf.intercept_
         if new_node.is_leaf:
             ClassificationTree.create_new_children(node,
                                                    X,
                                                    y,
                                                    self.max_id,
                                                    None,
                                                    None,
                                                    oblique=T.oblique,
                                                    weights=weights,
                                                    intercept=intercept)
             rho = 1
         else:
             new_node.weights = weights
             new_node.intercept = intercept
         return new_node, ClassificationTree.misclassification_loss(
             new_node, X, y, new_node.data_idxs,
             T.oblique) + alfa * (complexity + rho)
     return new_node, np.inf
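
For reference, the coef_/intercept_ pair that best_svm_split turns into an oblique split comes straight from scikit-learn's LinearSVC. A minimal sketch on made-up two-class data, using the same solver settings as above (the toy data and variable names are illustrative only):

import numpy as np
from sklearn.svm import LinearSVC

rng = np.random.default_rng(0)
#Two Gaussian clouds, roughly separable by an oblique hyperplane
data = np.vstack([rng.normal(0.0, 0.5, size=(20, 2)),
                  rng.normal(2.0, 0.5, size=(20, 2))])
label = np.array([0] * 20 + [1] * 20)

clf = LinearSVC(tol=1e-6, C=10, max_iter=10000,
                loss='squared_hinge', penalty='l1', dual=False)
clf.fit(data, label)

weights = clf.coef_.reshape((data.shape[1],))   #direction of the oblique split
intercept = clf.intercept_                      #offset of the split
print(weights, intercept)
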
Example #5
    def best_parallel_split(self, node, X, y, alfa, complexity):

        T = self.classification_tree

        #Create a new fictitious subtree rooted at this node
        new_node = TreeNode.copy_node(node)
        error_best = np.inf
        #if new_node.is_leaf:
        #complexity += 1
        was_leaf = False
        improve = False
        rho = 0
        if new_node.is_leaf:
            was_leaf = True
            rho = 1

        if new_node.data_idxs:
            for j in range(len(X[0])):

                #Take the j-th component of every point in the node and sort the
                #values in ascending order
                vals = {}
                for point_idx in new_node.data_idxs:
                    vals[point_idx] = X[point_idx, j]

                values = sorted(vals.items(), key=lambda x: x[1])
                sorted_indexes = [pair[0] for pair in values]

                #plt.scatter(X[sorted_indexes, j], range(len(values)), s=0.4, c=list(correct_classification_tuples.values()))
                #plt.show()
                new_node.feature = j
                #if j==2:
                #base = actual_loss
                #actual_loss = self.binary_loss(node_id, X, y, sorted_indexes[i], correct_classification_tuples[sorted_indexes[i]], actual_loss, thresh)
                #print ("loss: ", actual_loss, "n punti: ", len(care_points_idxs))
                #print("vecchia: ", self.vecchia_loss(node_id, X, y, care_points_idxs, correct_classification_tuples, thresh))
                #Loop over every value of that component and try all the
                #possible splits
                '''
                new_node.threshold = 0.5*X[sorted_indexes[0], j]
                '''
                i = -1
                actual_loss = ClassificationTree.misclassification_loss(
                    new_node, X, y, sorted_indexes, self.classification_tree.
                    oblique) + alfa * (complexity + rho)
                while i < len(sorted_indexes):

                    pre_thresh = new_node.threshold
                    #print("Ottimizzo best parallel: ", i*100/len(sorted_indexes))

                    if i < 0:
                        thresh = 0.5 * X[sorted_indexes[0], j]
                    elif i < len(sorted_indexes) - 1:
                        thresh = 0.5 * (X[sorted_indexes[i], j] +
                                        X[sorted_indexes[i + 1], j])
                    else:
                        thresh = 1.5 * X[sorted_indexes[i], j]

                    new_node.threshold = thresh
                    '''
                    #If the node being optimized was a leaf we must create its child leaves;
                    #these are immediately optimized by majority vote
                    if was_leaf:
                        self.create_new_children(new_node, j, thresh)
                        inc, k = self.binary_loss(X, new_node, sorted_indexes, i, y, pre_thresh)
                        actual_loss += inc
                    else:
                        inc, k = self.binary_loss(X, new_node, sorted_indexes, i, y, pre_thresh)
                        actual_loss += inc

                    '''
                    #If the node being optimized was a leaf we must create its child leaves;
                    #these are immediately optimized by majority vote
                    if was_leaf:
                        ClassificationTree.create_new_children(
                            new_node,
                            X,
                            y,
                            self.max_id,
                            j,
                            thresh,
                            oblique=T.oblique)

                    actual_loss = ClassificationTree.misclassification_loss(
                        new_node, X, y, sorted_indexes,
                        self.classification_tree.oblique) + alfa * (
                            complexity + rho)

                    if actual_loss < error_best:
                        improve = True
                        error_best = actual_loss
                        best_t = thresh
                        best_feature = j
                    i += 1

            #print ("error best: ", error_best)
            new_node.threshold = best_t
            new_node.feature = best_feature
            if was_leaf and improve:
                self.max_id += 2
        return new_node, error_best
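
The thresholds enumerated by best_parallel_split for a given feature are the midpoints between consecutive sorted values, plus one threshold below the smallest value and one above the largest. A small standalone sketch of that enumeration for a single feature column (hypothetical helper, not part of the class):

def candidate_thresholds(column):
    #Half the minimum, midpoints of consecutive sorted values, and 1.5 times the maximum
    vals = sorted(column)
    thresholds = [0.5 * vals[0]]
    thresholds += [0.5 * (a + b) for a, b in zip(vals[:-1], vals[1:])]
    thresholds.append(1.5 * vals[-1])
    return thresholds

print(candidate_thresholds([3.0, 1.0, 2.0]))
#-> [0.5, 1.5, 2.5, 4.5]
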
Example #6
    T.print_tree_structure()
    tao = TAO(T)
    tao.evolve(X_train, y_train)
    T.compute_prob(X_train, y_train)
    tao_auc_train += T.auc(X_train, y_train)
    tao_auc_valid += T.auc(X_valid, y_valid)

    L = ClassificationTree(oblique=False)
    L.initialize_from_CART(X_train, y_train, clf)
    ls = LocalSearch(L)
    ls.evolve(X_train, y_train, alfa=1000000, max_iteration=10)
    L.compute_prob(X_train, y_train)
    ls_auc_train += L.auc(X_train, y_train)
    ls_auc_valid += L.auc(X_valid, y_valid)
    clf_train_score += clf.score(X_train, y_train)
    tao_train_score += 1 - T.misclassification_loss(
        T.tree[0], X_train, y_train, range(len(X_train)), T.oblique)
    ls_train_score += 1 - L.misclassification_loss(
        L.tree[0], X_train, y_train, range(len(X_train)), L.oblique)
    clf_valid_score += clf.score(X_valid, y_valid)
    tao_valid_score += 1 - T.misclassification_loss(
        T.tree[0], X_valid, y_valid, range(len(X_valid)), T.oblique)
    ls_valid_score += 1 - L.misclassification_loss(
        L.tree[0], X_valid, y_valid, range(len(X_valid)), L.oblique)

#to_delete = ls.evolve(data, label)
#T.print_tree_structure()
#for (id, node) in T.tree.items():
#print("Dopo: node ", id, " items -->", node.data_idxs)

print("LS train acc -> ", ls_train_score / n_trial, "Depth: ", L.depth)
print("TAO train acc -> ", tao_train_score / n_trial, "Depth: ", T.depth)
def test(n_runs, ls_train, ls_test, svm_train, svm_test, random_train, random_test, cart_train, tao_train, global_train, cart_test, tao_test, global_test):
    for run in range(n_runs):
        depth = 3
        oblique = False
        n_trees = 200
        n_iter = 5
        data = np.load('cancer_train.npy')
        y = np.load('cancer_label.npy')
        print ("Run -> ", run)
        idx = np.random.permutation(len(data))
        data = data[idx]
        y = y[idx]
        train_split = 0.50
        valid_split = 0.75
        #data = dataset.data[idx]
        #label = dataset.target[idx]
        train_id = int(len(data)*train_split)
        valid_id = int(len(data)*valid_split)
        X = data[0:train_id]
        labels = y[0:train_id]

        X_valid = data[train_id:valid_id]
        y_valid = y[train_id:valid_id]

        X_test = data[valid_id:]
        y_test = y[valid_id:]

        #CART
        clf = DecisionTreeClassifier(random_state=0, max_depth=depth, min_samples_leaf=4)
        clf.fit(X, labels)


        #TAO
        T = ClassificationTree(oblique = oblique)
        T.initialize_from_CART(X, labels, clf)
        tao = TAO(T)
        tao.evolve(X, labels)

        T.print_tree_structure()
        #LS

        '''
        L = ClassificationTree(oblique = oblique)
        L.initialize_from_CART(X, labels, clf)
        ls = LocalSearch(L)
        ls.evolve(X, labels, alfa=1000000, max_iteration=10)
        '''


        #SVM
        svm = LinearSVC(tol=1e-6, max_iter=10000, dual=False)
        svm.fit(X, labels)

        #RandomForest
        random_for = RandomForestClassifier(n_estimators = n_trees, max_depth=depth, random_state=0, min_samples_leaf= 4)
        random_for.fit(X, labels)

        #Genetic
        best_t, best_loss = genetic_tree_optimization(n_trees, n_iter, depth, X, labels, oblique, X_valid, y_valid, CR = 0, l = 0)
        #best_t.print_tree_structure()

        best_t.print_tree_structure()
        #Train Score
        cart_train.append(clf.score(X, labels))
        #ls_train.append(1-ClassificationTree.misclassification_loss(L.tree[0], X, labels, range(len(labels)), oblique))
        tao_train.append(1-ClassificationTree.misclassification_loss(T.tree[0], X, labels, range(len(labels)), oblique))
        global_train.append(1-ClassificationTree.misclassification_loss(best_t.tree[0], X, labels, range(len(labels)), oblique))
        svm_train.append(svm.score(X, labels))
        random_train.append(random_for.score(X, labels))

        #Test Score
        cart_test.append(clf.score(X_test, y_test))
        #ls_test.append(1-ClassificationTree.misclassification_loss(L.tree[0], X_test, y_test, range(len(y_test)), oblique))
        tao_test.append(1-ClassificationTree.misclassification_loss(T.tree[0], X_test, y_test, range(len(y_test)), oblique))
        global_test.append(1-ClassificationTree.misclassification_loss(best_t.tree[0], X_test, y_test, range(len(y_test)), oblique))
        svm_test.append(svm.score(X_test, y_test))
        random_test.append(random_for.score(X_test, y_test))
def regularized_loss(node, X, labels, X_valid, y_valid, idxs, oblique, n_classes=2, l=0.9):
    #print(len(node.data_idxs))
    #Training misclassification plus l times the validation misclassification
    #(a gini term was considered at some point: + l*gini(node, X, labels, idxs, oblique, n_classes))
    return (ClassificationTree.misclassification_loss(node, X, labels, idxs, oblique)
            + l * ClassificationTree.misclassification_loss(node, X_valid, y_valid, range(len(y_valid)), oblique))
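
The weighting itself is simple enough to check by hand; a minimal sketch with made-up error rates (0.10 on training, 0.18 on validation, l = 0.9):

train_error, valid_error, l = 0.10, 0.18, 0.9
combined = train_error + l * valid_error
print(round(combined, 3))
#-> 0.262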