Esempi in Python per RandomForestClassifier, esempi in Python per random_forest.RandomForestClassifier

Esempio n. 1

0

Mostra file

def test_forest():
    """Fit a small random forest on the seeds CSV and print hold-out results.

    Reads ``./data_set/seeds.csv``, treats the last column as the label and
    the remaining columns as features, holds out one third of the rows
    (``random_state=7``), fits a 3-estimator forest and prints the
    predictions, the held-out labels and the accuracy score.

    NOTE(review): the printed headings appear to describe the output as
    training-data predictions/accuracy, but the values come from the
    held-out split -- confirm the wording is intended.
    """
    # Load the data.
    samples = np.array(pd.read_csv('./data_set/seeds.csv'))
    features, labels = samples[:, :-1], samples[:, -1]

    partition = train_test_split(features,
                                 labels,
                                 test_size=1 / 3.,
                                 random_state=7)
    fit_features, eval_features, fit_labels, eval_labels = partition

    # Build the model.
    forest_options = {
        'n_estimators': 3,
        'criterion': 'gini',
        'max_features': 'sqrt',
        'max_depth': 20,
    }
    forest = RandomForestClassifier(**forest_options)

    # Grow the ensemble of decision trees.
    forest.fit(fit_features, fit_labels)

    print('rf_model.predict...begin...')
    predicted = forest.predict(eval_features)
    for heading, values in (('训练数据的预测概率向量：', predicted),
                            ('真实标签 ：', eval_labels)):
        print(heading)
        print(values)
    print('训练数据的预测准确度：')
    print(accuracy_score(eval_labels, predicted))

Esempio n. 2

0

Mostra file

File: mnist.py Progetto: ayarimatsui/Random_Forest

def grid_search_RF():
    """Grid-search tree count and bootstrap ratio on the digits dataset.

    Splits the digits data 70/30 (``random_state=0``), fits one forest with
    ``max_depth=5`` per (trees_num, bootstrap) combination and keeps the
    combination with the highest test accuracy. Ties keep the earlier
    combination because the comparison is strict.

    Returns:
        tuple: ``(best_trees_num, best_bootstrap)``; both stay ``None`` if no
        combination scores above 0.
    """
    digits = datasets.load_digits()
    X_train, X_test, y_train, y_test = train_test_split(
        digits['data'], digits['target'], test_size=0.3, random_state=0)

    # Candidate numbers of decision trees in the forest.
    tree_counts = [16, 32, 64, 128]
    # Candidate ratios of bootstrap-resampled data relative to the original
    # amount of data.
    bootstrap_ratios = [0.1, 0.3, 0.5, 0.7, 0.9]
    combinations = [(count, ratio)
                    for count in tree_counts
                    for ratio in bootstrap_ratios]

    best_acc = 0
    best_trees_num = best_bootstrap = None
    with tqdm(total=len(combinations), desc='Progress') as pbar:
        for count, ratio in combinations:
            forest = RandomForestClassifier(trees_num=count, max_depth=5, bootstrap=ratio)
            forest.fit(X_train, y_train)
            score = forest.accuracy_score(X_test, y_test)
            if score > best_acc:
                best_acc, best_trees_num, best_bootstrap = score, count, ratio
            pbar.update(1)

    print('best acc : {:.4f}    best trees_num : {}     best bootstrap : {}'.format(best_acc, best_trees_num, best_bootstrap))

    return best_trees_num, best_bootstrap

Esempio n. 3

0

Mostra file

def test_rf_classification():
    """Fit a 100-estimator forest on iris and print its hold-out accuracy.

    The split is delegated to ``split_train_test`` and the score to
    ``cal_accuracy``; array shapes are printed before and after the split.
    """
    dataset = datasets.load_iris()
    features = dataset.data
    labels = dataset.target
    print(features.shape, labels.shape)

    train_X, train_y, test_X, test_y = split_train_test(features, labels)
    print(train_X.shape, train_y.shape, test_X.shape, test_y.shape)

    forest = RandomForestClassifier(n_estimators=100)
    forest.fit(train_X, train_y)
    predicted = forest.predict(test_X)
    accuracy = cal_accuracy(test_y, predicted)
    print('accuracy: ', accuracy)

Esempio n. 4

0

Mostra file

def visualize(model, max_depth=None):
    """Plot the decision regions of a classifier on the iris petal features.

    Args:
        model: ``'decision_tree'`` selects ``DecisionTreeClassifier``; any
            other value selects ``RandomForestClassifier``. The value is
            also embedded in the output file name.
        max_depth: Depth limit forwarded to the classifier. When ``None``
            the classifier is built without a ``max_depth`` argument.

    The figure is saved under ``figures/iris/`` and then closed.
    """
    iris = datasets.load_iris()
    petal_features = iris['data'][:, 2:]
    targets = iris['target']

    # With max_depth=None the tree depth is left unlimited. The iris dataset
    # has few samples and classes, so an unlimited depth costs little time.
    classifier_cls = (DecisionTreeClassifier
                      if model == 'decision_tree'
                      else RandomForestClassifier)
    depth_kwargs = {} if max_depth is None else {'max_depth': max_depth}
    clf = classifier_cls(**depth_kwargs)

    clf.fit(petal_features, targets)

    # Compute the range covered by the data, widened by 1 on each side
    # (lower bounds are clamped at 0).
    first, second = petal_features[:, 0], petal_features[:, 1]
    x_min = max(0, first.min() - 1)
    y_min = max(0, second.min() - 1)
    x_max = first.max() + 1
    y_max = second.max() + 1

    # Build mesh coordinates over the range of the training data.
    grid_interval = 0.2
    x_ticks = np.arange(x_min, x_max, grid_interval)
    y_ticks = np.arange(y_min, y_max, grid_interval)
    xx, yy = np.meshgrid(x_ticks, y_ticks)

    # Let the fitted model classify every mesh point ...
    mesh_points = np.c_[xx.ravel(), yy.ravel()]
    Z = clf.predict(mesh_points)
    # ... and draw the per-point result as filled contours.
    plt.contourf(xx, yy, Z.reshape(xx.shape), cmap=plt.cm.rainbow, alpha=0.4)

    # Plot the data points as well, one scatter call per class.
    for label in np.unique(targets):
        members = targets == label
        plt.scatter(first[members], second[members])

    feature_names = iris['feature_names']
    plt.xlabel(feature_names[2])
    plt.ylabel(feature_names[3])

    if max_depth is None:
        title = 'Max Depth : No Limitation'
        out_path = 'figures/iris/{}_no_limit.png'.format(model)
    else:
        title = 'Max Depth : ' + str(max_depth)
        out_path = 'figures/iris/{}_depth_{}.png'.format(model, max_depth)
    plt.title(title)
    plt.savefig(out_path)

    plt.close()

Esempio n. 5

0

Mostra file

File: mnist.py Progetto: ayarimatsui/Random_Forest

def compare_performance(trees_num, max_depth, bootstrap):
    """Print timing and accuracy of a decision tree and a random forest.

    Both models are fitted on a 70/30 split of the digits dataset
    (``random_state=0``). For each model the fit time, the prediction time on
    the test split, and the train/test accuracy are printed.

    Args:
        trees_num: Forwarded to ``RandomForestClassifier(trees_num=...)``.
        max_depth: Depth limit used for both models.
        bootstrap: Forwarded to ``RandomForestClassifier(bootstrap=...)``.
    """
    digits = datasets.load_digits()
    X_train, X_test, y_train, y_test = train_test_split(
        digits['data'], digits['target'], test_size=0.3, random_state=0)

    timing_report = '学習時間 : {:.6f} [sec]     推論時間 : {:.6f} [sec]'
    accuracy_report = 'train accuracy : {:.4f}     test_accuracy : {:.4f}'

    # Factories keep construction lazy, so each model is only created after
    # its banner has been printed (decision tree first, then random forest).
    contenders = (
        ('#####　決定木の性能  #####',
         lambda: DecisionTreeClassifier(max_depth=max_depth)),
        ('#####　ランダムフォレストの性能　#####',
         lambda: RandomForestClassifier(trees_num=trees_num, max_depth=max_depth, bootstrap=bootstrap)),
    )

    for banner, build in contenders:
        print(banner)
        estimator = build()

        started = time.time()  # training start
        estimator.fit(X_train, y_train)
        fit_seconds = time.time() - started  # training time

        started = time.time()  # inference start
        estimator.predict(X_test)
        predict_seconds = time.time() - started  # inference time
        print(timing_report.format(fit_seconds, predict_seconds))

        train_score = estimator.accuracy_score(X_train, y_train)
        test_score = estimator.accuracy_score(X_test, y_test)
        print(accuracy_report.format(train_score, test_score))

Esempio n. 6

0

Mostra file

    def random_forest_classification(self, train_data):
        """Fit a 10-estimator random forest on ``train_data`` and persist it.

        The last column of ``train_data`` is used as the label and the other
        columns as features. The distinct labels are stored in
        ``self.k_class_``, the fitted model in ``self.rf_model``, and the
        model is then handed to ``save_model_rf_model``.
        """
        features, labels = train_data[:, :-1], train_data[:, -1]

        print("train_X ......type.....")
        print(type(features))

        # Distinct label values seen in the training data.
        self.k_class_ = list(set(labels))

        forest_options = {
            'n_estimators': 10,
            'criterion': 'gini',
            'max_features': 'sqrt',
            'max_depth': 20,
        }
        forest = RandomForestClassifier(**forest_options)
        forest.fit(features, labels)

        # Keep the model on the instance, then save it.
        self.rf_model = forest
        save_model_rf_model(forest)

Esempio n. 7

0

Mostra file

    def _compare_sklearn_model(self,
                               features,
                               labels,
                               categorical_features,
                               feature_names,
                               model_config=None):
        """Fit the local and the scikit-learn forest and print their accuracy.

        Labels are label-encoded, the listed categorical columns are
        factorized, missing values are imputed, and the data is split 70/30
        (no fixed seed, so results vary between runs). Each model is then
        fitted and its repr, training accuracy and test accuracy are printed.

        Args:
            features: Feature table indexable by column name -- presumably a
                pandas DataFrame; confirm against callers. NOTE: the
                categorical columns are overwritten in place.
            labels: Target values; passed through ``LabelEncoder``.
            categorical_features: Names of the categorical columns.
            feature_names: List of all column names, used to translate the
                categorical names into positional indices.
            model_config: Optional keyword arguments shared by both models.
                ``None`` (the default) means "no extra arguments".
        """
        # The default of None cannot be unpacked with ``**`` (TypeError), so
        # normalise it to an empty mapping before building the models.
        if model_config is None:
            model_config = {}

        label_encoder = LabelEncoder()
        labels = label_encoder.fit_transform(labels)

        # Encode categorical features
        for categorical_feature in categorical_features:
            features[categorical_feature] = pd.factorize(
                features[categorical_feature])[0]

        # Find categorical feature indices
        categorical_feature_indeces = [
            feature_names.index(categorical_feature)
            for categorical_feature in categorical_features
        ]

        imputer = Imputer()
        features = imputer.fit_transform(features)

        features_training, features_test, labels_training, labels_test = train_test_split(
            features, labels, test_size=0.3)

        model_vanilearn = RandomForestClassifier(
            categorical_feature_indeces=categorical_feature_indeces,
            **model_config)
        model_sklearn = RandomForestClassifierSklearn(**model_config)

        for model in [model_sklearn, model_vanilearn]:

            model.fit(features_training, labels_training)

            predictions_training = model.predict(features_training)
            predictions_test = model.predict(features_test)

            accuracy_training = accuracy_score(labels_training,
                                               predictions_training)
            accuracy_test = accuracy_score(labels_test, predictions_test)

            print("")
            print(model)
            print(accuracy_training)
            print(accuracy_test)

Esempio n. 8

0

Mostra file

File: improvement.py Progetto: Boon-Yang/Decision-Tree-From-Scratch

def train_and_predict(x_train, y_train, x_test, x_val, y_val):
    """Train the improved classifier and return its predictions for x_test.

    A ``RandomForestClassifier`` is configured with a fixed hyperparameter
    set, fitted on the training data and used to predict ``x_test``. The
    validation arguments are accepted for interface compatibility but are
    not used by this implementation.

    Args:
        x_train (numpy.ndarray): Training instances, shape (N, K), where N is
            the number of instances and K the number of attributes.
        y_train (numpy.ndarray): Class labels, shape (N, ); each element is a
            str.
        x_test (numpy.ndarray): Test instances, shape (M, K).
        x_val (numpy.ndarray): Validation instances, shape (L, K). Unused.
        y_val (numpy.ndarray): Validation labels, shape (L, ). Unused.

    Returns:
        Whatever ``forest.predict(x_test)`` returns -- the predicted labels
        for the test instances.
    """
    # Hyperparameter set the forest is trained with (described in the
    # original code as the best set found).
    best_hyperparameters = {
        'feature_sel': True,
        'cross_val': False,
        'max_tree_depth': 13,
        'min_sample_size': 2,
        'num_trees': 20,
    }

    classifier = RandomForestClassifier()
    classifier.update_hyperparameters(**best_hyperparameters)
    classifier.fit(x_train, y_train)

    return classifier.predict(x_test)

Esempio n. 9

0

Mostra file

File: mnist.py Progetto: ayarimatsui/Random_Forest

def compare_depth():
    """Plot train/test accuracy against max depth for a tree and a forest.

    Uses a 70/30 split of the digits dataset (``random_state=0``). The
    forest's ``trees_num`` and ``bootstrap`` come from ``grid_search_RF()``.
    For every depth limit from 0 to 20 both models are fitted and scored;
    the four accuracy curves are saved to
    ``figures/mnist/max_depth_&_accuracy.png``.
    """
    digits = datasets.load_digits()
    X_train, X_test, y_train, y_test = train_test_split(
        digits['data'], digits['target'], test_size=0.3, random_state=0)

    # Look up the best hyperparameters for the random forest.
    trees_num, bootstrap = grid_search_RF()

    # Sweep the depth limit of the decision trees over 0..20.
    depths = list(range(21))

    tree_train_curve, tree_test_curve = [], []
    forest_train_curve, forest_test_curve = [], []

    for depth in tqdm(depths):
        print('***** max_depth = {} *****'.format(depth))

        # Decision tree
        tree = DecisionTreeClassifier(max_depth=depth)
        tree.fit(X_train, y_train)
        tree_train = tree.accuracy_score(X_train, y_train)
        tree_test = tree.accuracy_score(X_test, y_test)
        # Record the accuracies.
        tree_train_curve.append(tree_train)
        tree_test_curve.append(tree_test)
        print('決定木       train accuracy : {:.4f}     test_accuracy : {:.4f}'.format(tree_train, tree_test))

        # Random forest
        forest = RandomForestClassifier(trees_num=trees_num, max_depth=depth, bootstrap=bootstrap)
        forest.fit(X_train, y_train)
        forest_train = forest.accuracy_score(X_train, y_train)
        forest_test = forest.accuracy_score(X_test, y_test)
        # Record the accuracies.
        forest_train_curve.append(forest_train)
        forest_test_curve.append(forest_test)
        print('ランダムフォレスト       train accuracy : {:.4f}     test_accuracy : {:.4f}'.format(forest_train, forest_test))

    # Draw the graph: one line per (model, split) pair.
    curves = (
        (tree_train_curve, 'Decision Tree - train accuracy', 'r'),
        (tree_test_curve, 'Decision Tree - test accuracy', 'g'),
        (forest_train_curve, 'Random Forest - train accuracy', 'y'),
        (forest_test_curve, 'Random Forest - test accuracy', 'b'),
    )
    for series, legend_label, line_color in curves:
        plt.plot(depths, series, label=legend_label, color=line_color)

    plt.xlabel('Max Depth')
    plt.ylabel('Accuracy')
    plt.xlim(0, 20)
    plt.xticks(np.arange(0, 21, 2))
    plt.ylim(0, 1.0)
    plt.legend(loc='lower right')
    plt.title('Max Depth of Decision Trees and Accuracy')
    # Save the graph.
    plt.savefig('figures/mnist/max_depth_&_accuracy.png')

Esempio n. 10

0

Mostra file

def main():
    """Benchmark a decision tree and a random forest on iris per depth limit.

    Uses a 70/30 split of the iris dataset (``random_state=0``). For each
    depth limit in 1, 2, 3 and ``None`` (no limit) the fit time, prediction
    time and train/test accuracy of both models are printed, and
    ``visualize`` is called once per model type.
    """
    iris = datasets.load_iris()
    X_train, X_test, y_train, y_test = train_test_split(iris['data'],
                                                        iris['target'],
                                                        test_size=0.3,
                                                        random_state=0)

    # Each entry: (lazy model factory, timing report, accuracy report).
    # Factories defer construction so the models are created in the same
    # order as the reports are printed: decision tree, then random forest.
    contenders = (
        (lambda limit: DecisionTreeClassifier(max_depth=limit),
         '決定木       学習時間 : {:.6f} [sec]     推論時間 : {:.6f} [sec]',
         '決定木       train accuracy : {:.4f}     test_accuracy : {:.4f}'),
        (lambda limit: RandomForestClassifier(max_depth=limit),
         'ランダムフォレスト       学習時間 : {:.6f} [sec]     推論時間 : {:.6f} [sec]',
         'ランダムフォレスト       train accuracy : {:.4f}     test_accuracy : {:.4f}'),
    )

    # Examine depth limits 1 to 3 and the unlimited case.
    for depth in (1, 2, 3, None):
        print('######### max_depth = {} #########'.format(depth))

        # Using all features: measure accuracy, training time, inference
        # time and generalisation for each model.
        for build, timing_report, accuracy_report in contenders:
            estimator = build(depth)

            started = time.time()  # training start
            estimator.fit(X_train, y_train)
            fit_seconds = time.time() - started  # training time

            started = time.time()  # inference start
            estimator.predict(X_test)
            predict_seconds = time.time() - started  # inference time
            print(timing_report.format(fit_seconds, predict_seconds))

            train_score = estimator.accuracy_score(X_train, y_train)
            test_score = estimator.accuracy_score(X_test, y_test)
            print(accuracy_report.format(train_score, test_score))

        # Restrict to two features (petal length, petal width) and visualise
        # the result in two dimensions.
        visualize('decision_tree', max_depth=depth)
        visualize('random_forest', max_depth=depth)

Esempio n. 11

0

Mostra file

    def test_random_foerst_fit_predict(self):
        """Fit a 100-estimator forest on an XOR pattern and check predictions.

        The training set is an 11-row pattern repeated six times (66 rows):
        the label is 1 exactly when the two binary features differ. After
        fitting, the four distinct inputs must be predicted as [0, 1, 1, 0].
        """
        model = RandomForestClassifier(n_estimators=100)

        # One repetition: 2 x [0, 0], 3 x [0, 1], 3 x [1, 0], 3 x [1, 1].
        block_features = np.array(
            [[0, 0]] * 2 + [[0, 1]] * 3 + [[1, 0]] * 3 + [[1, 1]] * 3)
        # Matching labels: 0 for equal features, 1 for differing features.
        block_labels = np.array([0] * 2 + [1] * 6 + [0] * 3)

        repetitions = 6
        features = np.tile(block_features, (repetitions, 1))
        labels = np.tile(block_labels, repetitions)

        model.fit(features, labels)

        # Debugging aid: iterate over model._models to pretty-print each
        # tree's _node, or call tree.predict on the four query points below.
        query_points = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
        predictions = model.predict(query_points)

        self.assertEqual(predictions.tolist(), [0, 1, 1, 0])

Esempio n. 12

0

Mostra file

import reader
import numpy as np
from random_forest import RandomForestClassifier
import metrics

# Read the three data files; each call is unpacked into an (x, y) pair.
# Presumably x holds the instances and y the labels -- confirm against
# reader.read_from_csv.
x_train, y_train = reader.read_from_csv("data/train_full.txt")
x_val, y_val = reader.read_from_csv("data/validation.txt")
x_test, y_test = reader.read_from_csv("data/test.txt")

# Forest created with the constructor's default arguments.
forest = RandomForestClassifier()
"""
# 4.2.3 starts here
forest.update_hyperparameters(num_trees=10, k_fold=10, cross_val=True)
forest.fit(x_train, y_train)

pred_val = forest.predict(x_val)
acc_val_1 = metrics.accuracy(pred_val, y_val)

print(f"10 tree Random Forest Validation Accuracy, ",
      f"without feature selection or sampling: {acc_val_1}")

forest.update_hyperparameters(feature_sel=True, cross_val=True)
forest.fit(x_train, y_train)
pred_val = forest.predict(x_val)
acc_val_2 = metrics.accuracy(pred_val, y_val)

print(f"10 tree Random Forest Validation Accuracy, ",
      f"with feature selection: {acc_val_2}")

forest.update_hyperparameters(feature_sel=False, cross_val=False)
forest.fit(x_train, y_train)