def genetic_tree_optimization(n_trees, n_iter, depth, X, labels, oblique,
                              X_valid, y_valid, CR=0.95, l=1):
    clf = RandomForestClassifier(n_estimators=n_trees, max_depth=depth,
                                 random_state=0, min_samples_leaf=4)
    clf.fit(X, labels)
    random_trees = clf.estimators_
    trees = []
    best_loss = np.inf
    best_tree = None
    for tree in random_trees:
        T = ClassificationTree(oblique=oblique)
        T.initialize_from_CART(X, labels, tree)
        tao_opt(T, X, labels)
        trees.append(T)
        # Cache, on each node, the indices of the training points routed through it
        ClassificationTree.build_idxs_of_subtree(X, range(len(labels)), T.tree[0], oblique)
        #ClassificationTree.restore_tree(T)
    #multi_optimize_tao(trees, X, labels)

    for i in range(n_iter):
        print("Iter: ", i)
        #multi_optimize_evolution(trees, X, labels, CR)

        for j, tree in enumerate(trees):
            #optimize_evolution(tree, trees, X, labels, X_valid, y_valid, CR)
            trial = mutation(tree, trees, CR, X, labels, depth)
            tao_opt(trial, X, labels)
            trial_loss = regularized_loss(trial.tree[0], X, labels, X_valid, y_valid, range(len(labels)), oblique, l=l)
            loss = regularized_loss(tree.tree[0], X, labels, X_valid, y_valid, range(len(labels)), oblique, l=l)
            if trial_loss < loss:
                # Replace the population member itself; rebinding only the loop
                # variable would leave the trial out of the population
                trees[j] = tree = trial
                loss = trial_loss
                #print("improved")
            if loss < best_loss:
                best_loss = loss
                best_tree = tree
                print("best loss: ", best_loss)
                print("best train accuracy: ", 1-ClassificationTree.misclassification_loss(best_tree.tree[0], X, labels, range(len(labels)), oblique))
                print("valid accuracy: ", 1-ClassificationTree.misclassification_loss(best_tree.tree[0], X_valid, y_valid, range(len(y_valid)), oblique))

    print("ritorno loss train best: ", 1-ClassificationTree.misclassification_loss(best_tree.tree[0], X, labels, range(len(labels)), oblique))
    print("ritono loss valid: ", 1-ClassificationTree.misclassification_loss(best_tree.tree[0], X_valid, y_valid, range(len(y_valid)), oblique))
    return best_tree, best_loss
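The helpers tao_opt, mutation, and regularized_loss live elsewhere in the repo. Two minimal sketches inferred purely from the call sites above; the names match the calls, but the bodies are assumptions, not the source's definitions:

def tao_opt(T, X, labels):
    # Assumed thin wrapper: test() below drives TAO in exactly this way.
    tao = TAO(T)
    tao.evolve(X, labels)

def regularized_loss(root, X, labels, X_valid, y_valid, idxs, oblique, l=1):
    # Assumed blend of train and validation misclassification; with l=0
    # (as test() passes) this reduces to the pure training loss.
    train_loss = ClassificationTree.misclassification_loss(root, X, labels, idxs, oblique)
    valid_loss = ClassificationTree.misclassification_loss(root, X_valid, y_valid, range(len(y_valid)), oblique)
    return train_loss + l * valid_loss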
Example #2
    # Tail of a truncated comment block: a manual split kept disabled
    # in favor of the stratified train_test_split below.
    # X_valid = data[valid_id:]
    # y_valid = label[valid_id:]
    X_train, X_valid, y_train, y_valid = train_test_split(data,
                                                          label,
                                                          stratify=label,
                                                          test_size=0.2)

    clf = DecisionTreeClassifier(random_state=0,
                                 max_depth=3,
                                 min_samples_leaf=10)
    clf.fit(X_train, y_train)

    T = ClassificationTree(oblique=False)
    T.initialize_from_CART(X_train, y_train, clf)
    T.compute_prob(X_train, y_train)
    # cart_auc_train / cart_auc_valid accumulate over the (truncated) outer loop
    cart_auc_train += T.auc(X_train, y_train)
    cart_auc_valid += T.auc(X_valid, y_valid)
    #tao_train_score+=1-T.misclassification_loss(X_train, y_train, T.tree[0])
    #print ("score before: ", tao_train_score)
    #x = data[8]
    #print (T.predict_label(x.reshape((1, -1)), 0))
    #print (clf.predict(x.reshape((1, -1))))
    #print ("x--->", x)
    #print(T.get_path_to(x, 0))
    #T.print_tree_structure()

    #print ("T acc -> ", 1-T.misclassification_loss(data, label, T.tree[0]))
    #print ("clf acc -> ", clf.score(data, label))
    #node_id = 4
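The commented-out checks above hint at a parity test between the converted tree and the CART it was built from; a minimal live version (the sample index is illustrative):

x = X_train[0]
print(T.predict_label(x.reshape((1, -1)), 0))  # ClassificationTree, predicted from the root
print(clf.predict(x.reshape((1, -1))))         # should agree right after initialize_from_CART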
Example #3
                "node %s." % (
                    actual_node.depth * "\t",
                    actual_node.id,
                    actual_node.parent_id,
                    actual_node.left_node_id,
                    actual_node.feature,
                    actual_node.threshold,
                    actual_node.right_node_id,
                ))
            stack.append(actual_node.left_node)
            stack.append(actual_node.right_node)

    # zero_one_loss here appears to be the repo's helper (error count for the
    # tree rooted at node), not sklearn's metric; 1 - errors/size is accuracy
    spear_train += 1 - zero_one_loss(node, X, labels) / len(labels)
    spear_valid += 1 - zero_one_loss(node, X_valid, y_valid) / len(y_valid)

    clf = DecisionTreeClassifier(random_state=0,
                                 max_depth=3,
                                 min_samples_leaf=4)
    clf.fit(X, labels)
    clf_train += clf.score(X, labels)
    clf_valid += clf.score(X_valid, y_valid)

    L = ClassificationTree(oblique=False)
    L.initialize_from_CART(X, labels, clf)

    L.print_tree_structure()
print("clf train: ", clf_train / 30)
print("spearman train: ", spear_train / 30)
print("clf valid: ", clf_valid / 30)
print("spearman valid: ", spear_valid / 30)
def test(n_runs, ls_train, ls_test, svm_train, svm_test, random_train,
         random_test, cart_train, tao_train, global_train, cart_test,
         tao_test, global_test):
    for run in range(n_runs):
        depth = 3
        oblique = False
        n_trees = 200
        n_iter = 5
        data = np.load('cancer_train.npy')
        y = np.load('cancer_label.npy')
        print ("Run -> ", run)
        idx = np.random.permutation(len(data))
        data = data[idx]
        y = y[idx]
        train_split = 0.50
        valid_split = 0.75
        #data = dataset.data[idx]
        #label = dataset.target[idx]
        train_id = int(len(data)*train_split)
        valid_id = int(len(data)*valid_split)
        X = data[0:train_id]
        labels = y[0:train_id]

        X_valid = data[train_id:valid_id]
        y_valid = y[train_id:valid_id]

        X_test = data[valid_id:]
        y_test = y[valid_id:]

        #CART
        clf = DecisionTreeClassifier(random_state=0, max_depth=depth, min_samples_leaf=4)
        clf.fit(X, labels)


        #TAO
        T = ClassificationTree(oblique=oblique)
        T.initialize_from_CART(X, labels, clf)
        tao = TAO(T)
        tao.evolve(X, labels)

        T.print_tree_structure()
        #LS

        '''
        L = ClassificationTree(oblique = oblique)
        L.initialize_from_CART(X, labels, clf)
        ls = LocalSearch(L)
        ls.evolve(X, labels, alfa=1000000, max_iteration=10)
        '''


        #SVM
        svm = LinearSVC(tol=1e-6, max_iter=10000, dual=False)
        svm.fit(X, labels)

        #RandomForest
        random_for = RandomForestClassifier(n_estimators=n_trees, max_depth=depth,
                                            random_state=0, min_samples_leaf=4)
        random_for.fit(X, labels)

        #Genetic
        best_t, best_loss = genetic_tree_optimization(n_trees, n_iter, depth, X, labels,
                                                      oblique, X_valid, y_valid, CR=0, l=0)
        best_t.print_tree_structure()
        #Train Score
        cart_train.append(clf.score(X, labels))
        #ls_train.append(1-ClassificationTree.misclassification_loss(L.tree[0], X, labels, range(len(labels)), oblique))
        tao_train.append(1-ClassificationTree.misclassification_loss(T.tree[0], X, labels, range(len(labels)), oblique))
        global_train.append(1-ClassificationTree.misclassification_loss(best_t.tree[0], X, labels, range(len(labels)), oblique))
        svm_train.append(svm.score(X, labels))
        random_train.append(random_for.score(X, labels))

        #Test Score
        cart_test.append(clf.score(X_test, y_test))
        #ls_test.append(1-ClassificationTree.misclassification_loss(L.tree[0], X_test, y_test, range(len(y_test)), oblique))
        tao_test.append(1-ClassificationTree.misclassification_loss(T.tree[0], X_test, y_test, range(len(y_test)), oblique))
        global_test.append(1-ClassificationTree.misclassification_loss(best_t.tree[0], X_test, y_test, range(len(y_test)), oblique))
        svm_test.append(svm.score(X_test, y_test))
        random_test.append(random_for.score(X_test, y_test))
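A hypothetical driver for test(); the dict and list names here are illustrative, since the real script's setup was not included in the extract:

if __name__ == "__main__":
    # Hypothetical harness: every metric list below is ours, not the source's.
    metrics = {name: [] for name in (
        "ls_train", "ls_test", "svm_train", "svm_test", "random_train",
        "random_test", "cart_train", "tao_train", "global_train",
        "cart_test", "tao_test", "global_test")}
    test(5, metrics["ls_train"], metrics["ls_test"], metrics["svm_train"],
         metrics["svm_test"], metrics["random_train"], metrics["random_test"],
         metrics["cart_train"], metrics["tao_train"], metrics["global_train"],
         metrics["cart_test"], metrics["tao_test"], metrics["global_test"])
    for name, scores in metrics.items():
        if scores:
            print(name, np.mean(scores))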