def test_forest():
    """Fit a three-tree random forest on the seeds CSV and print hold-out results.

    The CSV at ``./data_set/seeds.csv`` is read with pandas; every column except
    the last is used as a feature and the last column as the label. One third of
    the rows is held out (``random_state=7``) and used for prediction.

    NOTE(review): the printed captions talk about "training data" and a
    "probability vector", but the values printed are label predictions for the
    held-out split. The captions are left untouched here.
    """
    # Load the data.
    samples = np.array(pd.read_csv('./data_set/seeds.csv'))
    features, labels = samples[:, :-1], samples[:, -1]
    train_X, test_X, train_y, y_true = train_test_split(
        features, labels, test_size=1 / 3., random_state=7)

    # Build the model and fit it (this creates the ensemble of decision trees).
    forest_params = {
        'n_estimators': 3,
        'criterion': 'gini',
        'max_features': 'sqrt',
        'max_depth': 20,
    }
    rf_model = RandomForestClassifier(**forest_params)
    rf_model.fit(train_X, train_y)

    print('rf_model.predict...begin...')
    pre_result = rf_model.predict(test_X)

    # Caption / value pairs, printed in the original order.
    for caption, value in (('训练数据的预测概率向量:', pre_result),
                           ('真实标签 :', y_true)):
        print(caption)
        print(value)
    print('训练数据的预测准确度:')
    print(accuracy_score(y_true, pre_result))
def grid_search_RF():
    """Grid-search tree count and bootstrap ratio for a random forest on digits.

    Every combination of the candidate tree counts and bootstrap ratios is
    fitted with ``max_depth=5`` on a 70/30 split (``random_state=0``) and scored
    with the model's own ``accuracy_score`` on the 30% split.

    NOTE(review): the split used for selecting the hyperparameters is the same
    one named ``X_test``/``y_test``; there is no separate validation split.

    Returns:
        tuple: ``(best_trees_num, best_bootstrap)``. Both are ``None`` if no
        candidate reaches an accuracy strictly greater than 0.
    """
    digits = datasets.load_digits()
    X_train, X_test, y_train, y_test = train_test_split(
        digits['data'], digits['target'], test_size=0.3, random_state=0)

    # Candidate numbers of decision trees in the forest.
    trees_num_list = [16, 32, 64, 128]
    # Candidate ratios of bootstrap-resampled data relative to the original data.
    bootstrap_list = [0.1, 0.3, 0.5, 0.7, 0.9]
    # Same visiting order as a nested loop: tree count outer, bootstrap inner.
    candidates = [(trees, ratio)
                  for trees in trees_num_list
                  for ratio in bootstrap_list]

    best_acc, best_trees_num, best_bootstrap = 0, None, None
    with tqdm(total=len(candidates), desc='Progress') as pbar:
        for trees_num, bootstrap in candidates:
            forest = RandomForestClassifier(
                trees_num=trees_num, max_depth=5, bootstrap=bootstrap)
            forest.fit(X_train, y_train)
            acc = forest.accuracy_score(X_test, y_test)
            if acc > best_acc:
                best_acc, best_trees_num, best_bootstrap = acc, trees_num, bootstrap
            pbar.update(1)

    print('best acc : {:.4f} best trees_num : {} best bootstrap : {}'.format(
        best_acc, best_trees_num, best_bootstrap))
    return best_trees_num, best_bootstrap
def test_rf_classification():
    """Fit a 100-estimator random forest on iris and print its accuracy.

    The data is split with ``split_train_test`` and the score is computed with
    ``cal_accuracy``; the shapes of the full data and of each split part are
    printed along the way.
    """
    iris = datasets.load_iris()
    X = iris.data
    y = iris.target
    print(X.shape, y.shape)

    # split_train_test returns the parts in (train_X, train_y, test_X, test_y) order.
    train_X, train_y, test_X, test_y = split_train_test(X, y)
    print(train_X.shape, train_y.shape, test_X.shape, test_y.shape)

    forest = RandomForestClassifier(n_estimators=100)
    forest.fit(train_X, train_y)
    accuracy = cal_accuracy(test_y, forest.predict(test_X))
    print('accuracy: ', accuracy)
def visualize(model, max_depth=None):
    """Plot the decision regions of a classifier on the two iris petal features.

    The classifier is fitted on columns 2 and 3 of the iris data, evaluated on a
    regular grid covering the data range, and the predicted regions plus the
    training points are saved as a PNG under ``figures/iris/``. The output
    directory is created if it does not exist yet.

    Args:
        model: ``'decision_tree'`` selects ``DecisionTreeClassifier``; any other
            value selects ``RandomForestClassifier``. The value is also used in
            the output file name.
        max_depth: Depth limit passed to the classifier. When ``None`` the
            classifier is built with no arguments, i.e. with its own default.
            For iris the data is small enough that an unlimited depth does not
            cost much time.
    """
    import os

    iris_dataset = datasets.load_iris()
    petal_features = iris_dataset['data'][:, 2:]
    targets = iris_dataset['target']

    if model == 'decision_tree':
        classifier_cls = DecisionTreeClassifier
    else:
        classifier_cls = RandomForestClassifier
    # Only forward max_depth when a limit was requested, so the classifier's
    # own default applies otherwise.
    if max_depth is None:
        clf = classifier_cls()
    else:
        clf = classifier_cls(max_depth=max_depth)
    clf.fit(petal_features, targets)

    # Plot range: the data range widened by 1 on each side, clamped at 0 below.
    x_min = max(0, petal_features[:, 0].min() - 1)
    y_min = max(0, petal_features[:, 1].min() - 1)
    x_max = petal_features[:, 0].max() + 1
    y_max = petal_features[:, 1].max() + 1

    # Build a mesh of coordinates over that range.
    grid_interval = 0.2
    xx, yy = np.meshgrid(np.arange(x_min, x_max, grid_interval),
                         np.arange(y_min, y_max, grid_interval))

    # Classify every mesh point with the fitted model and draw the regions.
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    plt.contourf(xx, yy, Z.reshape(xx.shape), cmap=plt.cm.rainbow, alpha=0.4)

    # Overlay the training samples, one scatter call per class.
    for c in np.unique(targets):
        class_mask = targets == c
        plt.scatter(petal_features[class_mask, 0], petal_features[class_mask, 1])

    feature_names = iris_dataset['feature_names']
    plt.xlabel(feature_names[2])
    plt.ylabel(feature_names[3])

    # savefig does not create missing directories; make sure the target exists.
    os.makedirs('figures/iris', exist_ok=True)
    if max_depth is None:
        plt.title('Max Depth : No Limitation')
        plt.savefig('figures/iris/{}_no_limit.png'.format(model))
    else:
        plt.title('Max Depth : ' + str(max_depth))
        plt.savefig('figures/iris/{}_depth_{}.png'.format(model, max_depth))
    plt.close()
def compare_performance(trees_num, max_depth, bootstrap):
    """Compare a decision tree and a random forest on the digits dataset.

    Both models are fitted on the same 70/30 split (``random_state=0``). For
    each model the fit time, the prediction time on the test split, and the
    train/test accuracy are printed.

    Args:
        trees_num: Passed to ``RandomForestClassifier`` as ``trees_num``.
        max_depth: Depth limit passed to both classifiers.
        bootstrap: Passed to ``RandomForestClassifier`` as ``bootstrap``.
    """
    dataset = datasets.load_digits()
    X_train, X_test, y_train, y_test = train_test_split(
        dataset['data'], dataset['target'], test_size=0.3, random_state=0)

    # Decision tree
    print('##### 決定木の性能 #####')
    decision_tree = DecisionTreeClassifier(max_depth=max_depth)
    _report_fit_predict_performance(decision_tree, X_train, X_test, y_train, y_test)

    # Random forest
    print('##### ランダムフォレストの性能 #####')
    random_forest = RandomForestClassifier(
        trees_num=trees_num, max_depth=max_depth, bootstrap=bootstrap)
    _report_fit_predict_performance(random_forest, X_train, X_test, y_train, y_test)


def _report_fit_predict_performance(model, X_train, X_test, y_train, y_test):
    """Fit ``model``, time fit and predict, and print timings and accuracies."""
    # perf_counter is a monotonic high-resolution clock meant for measuring
    # durations; time.time() can jump with system clock adjustments.
    fit_start = time.perf_counter()
    model.fit(X_train, y_train)
    fit_time = time.perf_counter() - fit_start

    predict_start = time.perf_counter()
    model.predict(X_test)  # only the duration is of interest here
    predict_time = time.perf_counter() - predict_start

    print('学習時間 : {:.6f} [sec] 推論時間 : {:.6f} [sec]'.format(fit_time, predict_time))
    train_accuracy = model.accuracy_score(X_train, y_train)
    test_accuracy = model.accuracy_score(X_test, y_test)
    print('train accuracy : {:.4f} test_accuracy : {:.4f}'.format(train_accuracy, test_accuracy))
def random_forest_classification(self, train_data):
    """Fit a random forest on ``train_data``, keep it on the instance and save it.

    Args:
        train_data: Array indexed as ``[:, :-1]`` for the features and
            ``[:, -1]`` for the labels (last column is the label).

    Side effects:
        Sets ``self.k_class_`` to the list of distinct labels, sets
        ``self.rf_model`` to the fitted model, prints the type of the feature
        array, and passes the model to ``save_model_rf_model``.
    """
    train_X, train_y = train_data[:, :-1], train_data[:, -1]
    print("train_X ......type.....")
    print(type(train_X))
    self.k_class_ = list(set(train_y))

    forest_params = {
        'n_estimators': 10,
        'criterion': 'gini',
        'max_features': 'sqrt',
        'max_depth': 20,
    }
    rf_model = RandomForestClassifier(**forest_params)
    rf_model.fit(train_X, train_y)

    # Keep the fitted model on the instance, then save it.
    self.rf_model = rf_model
    save_model_rf_model(rf_model)
def _compare_sklearn_model(self, features, labels, categorical_features, feature_names, model_config=None):
    """Fit the project forest and the sklearn forest on the same split and print both scores.

    Args:
        features: Column-indexable container of features. The categorical
            columns are overwritten in place with their factorized codes.
        labels: Class labels; encoded with ``LabelEncoder`` before training.
        categorical_features: Names of the categorical columns in ``features``.
        feature_names: List of all feature names, used to look up the positions
            of the categorical columns.
        model_config: Optional dict of keyword arguments passed to both model
            constructors. ``None`` (the default) means no extra arguments.
    """
    # The default used to be unpacked directly (`**None`), which raises
    # TypeError; treat None as "no extra configuration".
    if model_config is None:
        model_config = {}

    labels = LabelEncoder().fit_transform(labels)

    # Encode categorical features as integer codes.
    for categorical_feature in categorical_features:
        features[categorical_feature] = pd.factorize(
            features[categorical_feature])[0]

    # Positions of the categorical features within feature_names.
    categorical_feature_indices = [
        feature_names.index(categorical_feature)
        for categorical_feature in categorical_features
    ]

    features = Imputer().fit_transform(features)

    features_training, features_test, labels_training, labels_test = train_test_split(
        features, labels, test_size=0.3)

    model_vanilearn = RandomForestClassifier(
        categorical_feature_indeces=categorical_feature_indices,
        **model_config)
    model_sklearn = RandomForestClassifierSklearn(**model_config)

    for model in [model_sklearn, model_vanilearn]:
        model.fit(features_training, labels_training)
        predictions_training = model.predict(features_training)
        predictions_test = model.predict(features_test)
        accuracy_training = accuracy_score(labels_training, predictions_training)
        accuracy_test = accuracy_score(labels_test, predictions_test)

        print("")
        print(model)
        print(accuracy_training)
        print(accuracy_test)
def train_and_predict(x_train, y_train, x_test, x_val, y_val):
    """Train the improved classifier (a random forest) and predict on ``x_test``.

    ``x_train``/``y_train`` are used for fitting and ``x_test`` for prediction.
    The validation arguments are accepted for interface compatibility but are
    not used by this implementation.

    Args:
        x_train (numpy.ndarray): Training instances, shape (N, K), where N is
            the number of instances and K the number of attributes.
        y_train (numpy.ndarray): Class labels, shape (N, ); each element is a str.
        x_test (numpy.ndarray): Test instances, shape (M, K).
        x_val (numpy.ndarray): Validation instances, shape (L, K). Unused.
        y_val (numpy.ndarray): Validation labels, shape (L, ). Unused.

    Returns:
        The result of ``forest.predict(x_test)`` (expected to be the predicted
        labels for the M test instances).
    """
    # Hyperparameter set the forest is trained with (chosen as the best one).
    best_hyperparameters = {
        'feature_sel': True,
        'cross_val': False,
        'max_tree_depth': 13,
        'min_sample_size': 2,
        'num_trees': 20,
    }

    forest = RandomForestClassifier()
    forest.update_hyperparameters(**best_hyperparameters)
    forest.fit(x_train, y_train)

    return forest.predict(x_test)
def compare_depth():
    """Plot train/test accuracy against the depth limit for both model types.

    The digits dataset is split 70/30 (``random_state=0``). The random forest's
    tree count and bootstrap ratio come from ``grid_search_RF()``. For every
    depth limit from 0 to 20 a decision tree and a random forest are fitted and
    their train/test accuracies recorded, printed, and finally plotted to
    ``figures/mnist/max_depth_&_accuracy.png``. The output directory is created
    if needed and the figure is closed after saving.
    """
    import os

    dataset = datasets.load_digits()
    X_train, X_test, y_train, y_test = train_test_split(
        dataset['data'], dataset['target'], test_size=0.3, random_state=0)

    # Find the best hyperparameters for the random forest.
    trees_num, bootstrap = grid_search_RF()

    # Vary the depth limit of the trees: 0 through 20.
    depth_list = list(range(21))

    dt_train_acc_list = []
    dt_test_acc_list = []
    rf_train_acc_list = []
    rf_test_acc_list = []

    for depth in tqdm(depth_list):
        print('***** max_depth = {} *****'.format(depth))

        # Decision tree
        decision_tree = DecisionTreeClassifier(max_depth=depth)
        decision_tree.fit(X_train, y_train)
        dt_train_accuracy = decision_tree.accuracy_score(X_train, y_train)
        dt_test_accuracy = decision_tree.accuracy_score(X_test, y_test)
        dt_train_acc_list.append(dt_train_accuracy)
        dt_test_acc_list.append(dt_test_accuracy)
        print('決定木 train accuracy : {:.4f} test_accuracy : {:.4f}'.format(dt_train_accuracy, dt_test_accuracy))

        # Random forest
        random_forest = RandomForestClassifier(trees_num=trees_num, max_depth=depth, bootstrap=bootstrap)
        random_forest.fit(X_train, y_train)
        rf_train_accuracy = random_forest.accuracy_score(X_train, y_train)
        rf_test_accuracy = random_forest.accuracy_score(X_test, y_test)
        rf_train_acc_list.append(rf_train_accuracy)
        rf_test_acc_list.append(rf_test_accuracy)
        print('ランダムフォレスト train accuracy : {:.4f} test_accuracy : {:.4f}'.format(rf_train_accuracy, rf_test_accuracy))

    # Draw the four accuracy curves.
    plt.plot(depth_list, dt_train_acc_list, label='Decision Tree - train accuracy', color='r')
    plt.plot(depth_list, dt_test_acc_list, label='Decision Tree - test accuracy', color='g')
    plt.plot(depth_list, rf_train_acc_list, label='Random Forest - train accuracy', color='y')
    plt.plot(depth_list, rf_test_acc_list, label='Random Forest - test accuracy', color='b')

    plt.xlabel('Max Depth')
    plt.ylabel('Accuracy')
    plt.xlim(0, 20)
    plt.xticks(np.arange(0, 21, 2))
    plt.ylim(0, 1.0)
    plt.legend(loc='lower right')
    plt.title('Max Depth of Decision Trees and Accuracy')

    # Save the graph. savefig does not create missing directories.
    os.makedirs('figures/mnist', exist_ok=True)
    plt.savefig('figures/mnist/max_depth_&_accuracy.png')
    # Close the figure so later pyplot calls do not draw onto these axes.
    plt.close()
def main():
    """Compare a decision tree and a random forest on iris for several depth limits.

    For each depth limit in ``[1, 2, 3, None]`` both models are fitted on all
    features of a 70/30 split (``random_state=0``); fit time, prediction time
    and train/test accuracy are printed. Afterwards ``visualize`` is called for
    both model types with the same depth limit to draw the decision regions on
    the two petal features.
    """
    dataset = datasets.load_iris()
    X_train, X_test, y_train, y_test = train_test_split(
        dataset['data'], dataset['target'], test_size=0.3, random_state=0)

    # (classifier class, timing message, accuracy message) for each model,
    # in the order they are evaluated.
    model_reports = (
        (DecisionTreeClassifier,
         '決定木 学習時間 : {:.6f} [sec] 推論時間 : {:.6f} [sec]',
         '決定木 train accuracy : {:.4f} test_accuracy : {:.4f}'),
        (RandomForestClassifier,
         'ランダムフォレスト 学習時間 : {:.6f} [sec] 推論時間 : {:.6f} [sec]',
         'ランダムフォレスト train accuracy : {:.4f} test_accuracy : {:.4f}'),
    )

    # Depth limits 1 to 3, plus None for "no limit".
    depth_list = [1, 2, 3, None]
    for depth in depth_list:
        print('######### max_depth = {} #########'.format(depth))

        # Accuracy, fit time, prediction time and generalization with all features.
        for model_cls, timing_message, accuracy_message in model_reports:
            model = model_cls(max_depth=depth)

            # perf_counter is a monotonic high-resolution clock intended for
            # measuring durations, unlike the wall-clock time.time().
            fit_start = time.perf_counter()
            model.fit(X_train, y_train)
            fit_time = time.perf_counter() - fit_start

            predict_start = time.perf_counter()
            model.predict(X_test)  # only the duration is of interest here
            predict_time = time.perf_counter() - predict_start
            print(timing_message.format(fit_time, predict_time))

            train_accuracy = model.accuracy_score(X_train, y_train)
            test_accuracy = model.accuracy_score(X_test, y_test)
            print(accuracy_message.format(train_accuracy, test_accuracy))

        # Restrict to two features (petal length, petal width) and plot in 2-D.
        visualize('decision_tree', max_depth=depth)
        visualize('random_forest', max_depth=depth)
def test_random_foerst_fit_predict(self):
    """Fit a 100-estimator forest on repeated XOR samples and check its predictions.

    The training set is an 11-row pattern repeated six times (66 rows); each
    row's label is the XOR of its two binary features. The fitted model must
    predict ``[0, 1, 1, 0]`` for the four possible inputs.
    """
    # One period of the training data and its labels.
    pattern_features = [
        [0, 0], [0, 0],
        [0, 1], [0, 1], [0, 1],
        [1, 0], [1, 0], [1, 0],
        [1, 1], [1, 1], [1, 1],
    ]
    pattern_labels = [0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0]
    repeats = 6

    features = np.array(pattern_features * repeats)
    labels = np.array(pattern_labels * repeats)

    model = RandomForestClassifier(n_estimators=100)
    model.fit(features, labels)

    all_inputs = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
    predictions = model.predict(all_inputs)
    self.assertEqual(predictions.tolist(), [0, 1, 1, 0])
import reader import numpy as np from random_forest import RandomForestClassifier import metrics x_train, y_train = reader.read_from_csv("data/train_full.txt") x_val, y_val = reader.read_from_csv("data/validation.txt") x_test, y_test = reader.read_from_csv("data/test.txt") forest = RandomForestClassifier() """ # 4.2.3 starts here forest.update_hyperparameters(num_trees=10, k_fold=10, cross_val=True) forest.fit(x_train, y_train) pred_val = forest.predict(x_val) acc_val_1 = metrics.accuracy(pred_val, y_val) print(f"10 tree Random Forest Validation Accuracy, ", f"without feature selection or sampling: {acc_val_1}") forest.update_hyperparameters(feature_sel=True, cross_val=True) forest.fit(x_train, y_train) pred_val = forest.predict(x_val) acc_val_2 = metrics.accuracy(pred_val, y_val) print(f"10 tree Random Forest Validation Accuracy, ", f"with feature selection: {acc_val_2}") forest.update_hyperparameters(feature_sel=False, cross_val=False) forest.fit(x_train, y_train)