Example no. 1
    def fit(self, X, y):
        # Requires numpy as np, tqdm, and sklearn.metrics.accuracy_score
        n_samples, n_features = X.shape

        # Initialize every sample's weight uniformly
        w = np.full(n_samples, (1 / n_samples))
        # Store each weak classifier
        self.clfs = []

        for i in tqdm(range(self.n_clfs)):
            # Instantiate a weak classifier
            clf = ClassificationTree()
            # Train it on the training set
            clf.fit(X, y)
            # Collect its predictions on the training data
            y_pred = clf.predict(X)
            # Debug: training accuracy of this weak learner
            print(accuracy_score(y, y_pred))
            # Weighted training error
            error = sum(w[y != y_pred])
            # For a classifier with error above 0.5, since this is a binary
            # problem (this AdaBoost only handles binary classification), we
            # can flip its predictions so the error becomes 1 - error < 0.5
            print(error)
            if error > 0.5:
                self.polarity[i] = -1
                y_pred *= -1
                error = 1 - error
            self.alphas[i] = 0.5 * np.log((1.0 - error) / (error + 1e-10))
            # y_pred is already flipped when polarity is -1, so use it directly
            # here; multiplying by the polarity again would undo the flip
            w *= np.exp(-self.alphas[i] * y * y_pred)
            w /= sum(w)
            self.clfs.append(clf)
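For context, a minimal sketch of the matching predict step, assuming self.alphas, self.polarity, and self.clfs are populated as in the fit above (the sign-of-weighted-votes rule and the method name are assumptions, not part of the original example):

    def predict(self, X):
        # Weighted vote: each weak classifier's prediction is flipped by its
        # recorded polarity and scaled by its alpha; the sign of the sum
        # gives the final +1/-1 label.
        agg = np.zeros(X.shape[0])
        for alpha, polarity, clf in zip(self.alphas, self.polarity, self.clfs):
            agg += alpha * polarity * clf.predict(X)
        return np.sign(agg)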
Example no. 2
def main():
    print('--- Adaboost ---')
    data = datasets.load_digits()
    X, y = data.data, data.target

    digit1 = 1
    digit2 = 8

    idx = np.append(np.where(y == digit1)[0], np.where(y == digit2)[0])
    y = data.target[idx]

    y[y == digit1] = 1
    y[y == digit2] = -1

    X = data.data[idx]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)

    clf = Adaboost(n_estimators=5)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    acc = accuracy_score(y_test, y_pred)

    clf_tree = ClassificationTree()
    clf_tree.fit(X_train, y_train)
    y_pred_tree = clf_tree.predict(X_test)
    acc_tree = accuracy_score(y_test, y_pred_tree)

    print("Adaboost_Accuracy:", acc)
    print("Tree_Accuracy:", acc_tree)
Example no. 3
    def __init__(self, n_estimators=10, feature_num=None, min_samples_split=2, min_impurity=1e-7,
                 max_depth=10):
        self.n_estimators = n_estimators                            # number of decision trees
        self.min_samples_split = min_samples_split                  # minimum number of samples required to split a node
        self.min_impurity = min_impurity                            # minimum impurity, i.e. maximum purity
        self.max_depth = max_depth                                  # maximum tree depth
        self.feature_num = feature_num                              # number of features used by each tree

        self.trees = []
        # Instantiate each decision tree
        for _ in range(self.n_estimators):
            tree = ClassificationTree(min_impurity=self.min_impurity,
                                      min_samples_split=self.min_samples_split,
                                      max_depth=self.max_depth)
            self.trees.append(tree)
        self.feature_index = []
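A minimal sketch of how a fit for this forest could use these attributes, drawing a bootstrap sample per tree and recording the sampled columns in self.feature_index (the sampling logic is an assumption, not the original implementation):

    def fit(self, X, y):
        n_samples, n_features = X.shape
        # Default to sqrt(D) features per tree when feature_num is not given
        feature_num = self.feature_num or int(np.sqrt(n_features))
        for tree in self.trees:
            # Bootstrap sample of the rows
            rows = np.random.choice(n_samples, n_samples, replace=True)
            # Random column subset for this tree, remembered for prediction
            cols = np.random.choice(n_features, feature_num, replace=False)
            self.feature_index.append(cols)
            tree.fit(X[rows][:, cols], y[rows])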
Example no. 4

def genetic_tree_optimization(n_trees, n_iter, depth, X, labels, oblique, X_valid, y_valid, CR=0.95, l=1):
    clf = RandomForestClassifier(n_estimators=n_trees, max_depth=depth, random_state=0, min_samples_leaf=4)
    clf.fit(X, labels)
    random_trees = clf.estimators_
    trees = []
    for tree in random_trees:
        T = ClassificationTree(oblique=oblique)
        T.initialize_from_CART(X, labels, tree)
        tao_opt(T, X, labels)
        trees.append(T)
        ClassificationTree.build_idxs_of_subtree(X, range(len(labels)), T.tree[0], oblique)
        #ClassificationTree.restore_tree(T)
    #multi_optimize_tao(trees, X, labels)

    best_loss = np.inf
    best_tree = None
    for i in range(n_iter):
        print("Iter: ", i)
        #multi_optimize_evolution(trees, X, labels, CR)

        for j, tree in enumerate(trees):
            #optimize_evolution(tree, trees, X, labels, X_valid, y_valid, CR)
            trial = mutation(tree, trees, CR, X, labels, depth)
            tao_opt(trial, X, labels)
            trial_loss = regularized_loss(trial.tree[0], X, labels, X_valid, y_valid, range(len(labels)), oblique, l=l)
            loss = regularized_loss(tree.tree[0], X, labels, X_valid, y_valid, range(len(labels)), oblique, l=l)
            if trial_loss < loss:
                # Store the improved trial back into the population; rebinding
                # the loop variable alone would not update the trees list
                trees[j] = tree = trial
                loss = trial_loss
            if loss < best_loss:
                best_loss = loss
                best_tree = tree
                print("best loss: ", best_loss)
                print("best train accuracy: ", 1 - ClassificationTree.misclassification_loss(best_tree.tree[0], X, labels, range(len(labels)), oblique))
                print("best valid accuracy: ", 1 - ClassificationTree.misclassification_loss(best_tree.tree[0], X_valid, y_valid, range(len(y_valid)), oblique))

    print("ritorno loss train best: ", 1-ClassificationTree.misclassification_loss(best_tree.tree[0], X, labels, range(len(labels)), oblique))
    print("ritono loss valid: ", 1-ClassificationTree.misclassification_loss(best_tree.tree[0], X_valid, y_valid, range(len(y_valid)), oblique))
    return best_tree, best_loss
Example no. 5

    def fit(self, X, y):
        """
        Grows a forest of decision trees based off the num_trees
        attribute

        Parameters
        ----------
        X : N x D matrix of real or ordinal values
        y : size N vector consisting of either real values or labels for corresponding
        index in X
        """

        data = np.column_stack((X, y))
        self.forest = np.empty(shape=self.num_trees, dtype='object')
        sample_size = int(X.shape[0] * self.sample_percentage)

        for i in range(self.num_trees):
            sample = data[np.random.choice(data.shape[0], sample_size, replace=True)]

            # Split the bootstrap sample (not the full data) back into X and y
            sampled_X = sample[:, :sample.shape[1] - 1]
            sampled_y = sample[:, sample.shape[1] - 1]

            if isinstance(self, RegressionForest):
                tree = RegressionTree(
                    max_depth=self.max_depth,
                    min_size=self.min_size,
                    in_forest=True)
            else:
                tree = ClassificationTree(
                    cost_func=self.cost_func,
                    max_depth=self.max_depth,
                    min_size=self.min_size,
                    in_forest=True)

            tree.fit(sampled_X, sampled_y)
            self.forest[i] = tree
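A minimal sketch of the matching predict, assuming each stored tree exposes predict and classification labels are combined by majority vote (the aggregation rule is an assumption, not the original implementation):

    def predict(self, X):
        # Stack every tree's predictions: shape (num_trees, N)
        all_preds = np.array([tree.predict(X) for tree in self.forest])
        if isinstance(self, RegressionForest):
            # Regression: average the trees
            return all_preds.mean(axis=0)
        # Classification: majority vote per sample
        voted = []
        for col in all_preds.T:
            values, counts = np.unique(col, return_counts=True)
            voted.append(values[np.argmax(counts)])
        return np.array(voted)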
Example no. 6
    '''
    y_train = label[0:valid_id]

    X_valid = data[valid_id:]
    y_valid = label[valid_id:]
    '''
    X_train, X_valid, y_train, y_valid = train_test_split(data,
                                                          label,
                                                          stratify=label,
                                                          test_size=0.2)

    clf = DecisionTreeClassifier(random_state=0,
                                 max_depth=3,
                                 min_samples_leaf=10)
    clf.fit(X_train, y_train)

    T = ClassificationTree(oblique=False)
    T.initialize_from_CART(X_train, y_train, clf)
    T.compute_prob(X_train, y_train)
    cart_auc_train += T.auc(X_train, y_train)
    cart_auc_valid += T.auc(X_valid, y_valid)
    #tao_train_score+=1-T.misclassification_loss(X_train, y_train, T.tree[0])
    #print ("score before: ", tao_train_score)
    #x = data[8]
    #print (T.predict_label(x.reshape((1, -1)), 0))
    #print (clf.predict(x.reshape((1, -1))))
    #print ("x--->", x)
    #print(T.get_path_to(x, 0))
    #T.print_tree_structure()

    #print ("T acc -> ", 1-T.misclassification_loss(data, label, T.tree[0]))
    #print ("clf acc -> ", clf.score(data, label))
Example no. 7

def test(n_runs, ls_train, ls_test, svm_train, svm_test, random_train, random_test, cart_train, tao_train, global_train, cart_test, tao_test, global_test):
    for run in range(n_runs):
        depth = 3
        oblique = False
        n_trees = 200
        n_iter = 5
        data = np.load('cancer_train.npy')
        y = np.load('cancer_label.npy')
        print ("Run -> ", run)
        idx = np.random.permutation(len(data))
        data = data[idx]
        y = y[idx]
        train_split = 0.50
        valid_split = 0.75
        #data = dataset.data[idx]
        #label = dataset.target[idx]
        train_id = int(len(data)*train_split)
        valid_id = int(len(data)*valid_split)
        X = data[0:train_id]
        labels = y[0:train_id]

        X_valid = data[train_id:valid_id]
        y_valid = y[train_id:valid_id]

        X_test = data[valid_id:]
        y_test = y[valid_id:]

        #CART
        clf = DecisionTreeClassifier(random_state=0, max_depth=depth, min_samples_leaf=4)
        clf.fit(X, labels)


        #TAO
        T = ClassificationTree(oblique=oblique)
        T.initialize_from_CART(X, labels, clf)
        tao = TAO(T)
        tao.evolve(X, labels)

        T.print_tree_structure()
        #LS

        '''
        L = ClassificationTree(oblique = oblique)
        L.initialize_from_CART(X, labels, clf)
        ls = LocalSearch(L)
        ls.evolve(X, labels, alfa=1000000, max_iteration=10)
        '''


        #SVM
        svm = LinearSVC(tol=1e-6, max_iter=10000, dual=False)
        svm.fit(X, labels)

        #RandomForest
        random_for = RandomForestClassifier(n_estimators=n_trees, max_depth=depth, random_state=0, min_samples_leaf=4)
        random_for.fit(X, labels)

        #Genetic
        best_t, best_loss = genetic_tree_optimization(n_trees, n_iter, depth, X, labels, oblique, X_valid, y_valid, CR=0, l=0)
        #best_t.print_tree_structure()

        best_t.print_tree_structure()
        #Train Score
        cart_train.append(clf.score(X, labels))
        #ls_train.append(1-ClassificationTree.misclassification_loss(L.tree[0], X, labels, range(len(labels)), oblique))
        tao_train.append(1-ClassificationTree.misclassification_loss(T.tree[0], X, labels, range(len(labels)), oblique))
        global_train.append(1-ClassificationTree.misclassification_loss(best_t.tree[0], X, labels, range(len(labels)), oblique))
        svm_train.append(svm.score(X, labels))
        random_train.append(random_for.score(X, labels))

        #Test Score
        cart_test.append(clf.score(X_test, y_test))
        #ls_test.append(1-ClassificationTree.misclassification_loss(L.tree[0], X_test, y_test, range(len(y_test)), oblique))
        tao_test.append(1-ClassificationTree.misclassification_loss(T.tree[0], X_test, y_test, range(len(y_test)), oblique))
        global_test.append(1-ClassificationTree.misclassification_loss(best_t.tree[0], X_test, y_test, range(len(y_test)), oblique))
        svm_test.append(svm.score(X_test, y_test))
        random_test.append(random_for.score(X_test, y_test))
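A hedged usage sketch for this test harness, passing fresh score lists and summarizing them afterwards (the averaging step is an assumption; the original does not show how the lists are consumed):

ls_train, ls_test = [], []
svm_train, svm_test = [], []
random_train, random_test = [], []
cart_train, cart_test = [], []
tao_train, tao_test = [], []
global_train, global_test = [], []

test(10, ls_train, ls_test, svm_train, svm_test, random_train, random_test,
     cart_train, tao_train, global_train, cart_test, tao_test, global_test)

print("CART test mean:", np.mean(cart_test))
print("TAO test mean:", np.mean(tao_test))
print("Genetic test mean:", np.mean(global_test))
print("SVM test mean:", np.mean(svm_test))
print("RandomForest test mean:", np.mean(random_test))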