def _measure_classifier(classifier, X_train, X_test, y_train, y_test):
    """Fit ``classifier``, then print its timings and accuracies.

    Args:
        classifier: Object exposing ``fit(X, y)``, ``predict(X)`` and
            ``accuracy_score(X, y)``.
        X_train: Training samples passed to ``fit`` and ``accuracy_score``.
        X_test: Held-out samples passed to ``predict`` and ``accuracy_score``.
        y_train: Labels for ``X_train``.
        y_test: Labels for ``X_test``.
    """
    # perf_counter() is a monotonic, high-resolution clock, so the measured
    # intervals cannot be skewed by system clock adjustments.
    fit_start = time.perf_counter()
    classifier.fit(X_train, y_train)
    fit_time = time.perf_counter() - fit_start

    predict_start = time.perf_counter()
    classifier.predict(X_test)  # only the latency matters; the result is unused
    predict_time = time.perf_counter() - predict_start

    print('学習時間 : {:.6f} [sec] 推論時間 : {:.6f} [sec]'.format(fit_time, predict_time))

    train_accuracy = classifier.accuracy_score(X_train, y_train)
    test_accuracy = classifier.accuracy_score(X_test, y_test)
    print('train accuracy : {:.4f} test_accuracy : {:.4f}'.format(train_accuracy, test_accuracy))


def compare_performance(trees_num, max_depth, bootstrap):
    """Compare a single decision tree against a random forest on the digits data.

    The data returned by ``datasets.load_digits()`` is split 70/30
    (``test_size=0.3``, ``random_state=0``). Each model is fitted on the
    training part, and its training time, inference time, train accuracy and
    test accuracy are printed.

    Args:
        trees_num: Forwarded to ``RandomForestClassifier`` as ``trees_num``.
        max_depth: Depth limit forwarded to both classifiers.
        bootstrap: Forwarded to ``RandomForestClassifier`` as ``bootstrap``.
    """
    dataset = datasets.load_digits()
    X_train, X_test, y_train, y_test = train_test_split(
        dataset['data'], dataset['target'], test_size=0.3, random_state=0)

    # Decision tree
    print('##### 決定木の性能 #####')
    decision_tree = DecisionTreeClassifier(max_depth=max_depth)
    _measure_classifier(decision_tree, X_train, X_test, y_train, y_test)

    # Random forest
    print('##### ランダムフォレストの性能 #####')
    random_forest = RandomForestClassifier(
        trees_num=trees_num, max_depth=max_depth, bootstrap=bootstrap)
    _measure_classifier(random_forest, X_train, X_test, y_train, y_test)
def test_forest():
    """Train a small random forest on the seeds CSV and print its results.

    Reads ``./data_set/seeds.csv``, treats the last column as the label and
    all other columns as features, holds out one third of the rows
    (``random_state=7``), fits a 3-tree forest and prints the predictions,
    the true labels and the accuracy on the held-out rows.
    """
    # Load the data and separate the feature columns from the label column.
    frame = pd.read_csv('./data_set/seeds.csv')
    samples = np.array(frame)
    features = samples[:, :-1]
    labels = samples[:, -1]
    train_X, test_X, train_y, y_true = train_test_split(
        features, labels, test_size=1 / 3., random_state=7)

    # Configure the model, then fit it (this builds the ensemble of trees).
    forest_params = {
        'n_estimators': 3,
        'criterion': 'gini',
        'max_features': 'sqrt',
        'max_depth': 20,
    }
    rf_model = RandomForestClassifier(**forest_params)
    rf_model.fit(train_X, train_y)

    print('rf_model.predict...begin...')
    pre_result = rf_model.predict(test_X)

    # NOTE: the captions below mention "training data", but the values shown
    # are the predictions and accuracy for the held-out split.
    print('训练数据的预测概率向量:')
    print(pre_result)
    print('真实标签 :')
    print(y_true)
    print('训练数据的预测准确度:')
    print(accuracy_score(y_true, pre_result))
def test_rf_classification():
    """Fit a 100-tree random forest on the iris data and print its accuracy.

    The samples from ``datasets.load_iris()`` are partitioned by
    ``split_train_test``; the array shapes are printed before and after the
    split, and the accuracy computed by ``cal_accuracy`` on the test part is
    printed last.
    """
    iris = datasets.load_iris()
    features = iris.data
    targets = iris.target
    print(features.shape, targets.shape)

    train_X, train_y, test_X, test_y = split_train_test(features, targets)
    print(train_X.shape, train_y.shape, test_X.shape, test_y.shape)

    forest = RandomForestClassifier(n_estimators=100)
    forest.fit(train_X, train_y)

    predicted = forest.predict(test_X)
    print('accuracy: ', cal_accuracy(test_y, predicted))
def train_and_predict(x_train, y_train, x_test, x_val, y_val):
    """Train the improved classifier and predict labels for the test set.

    A ``RandomForestClassifier`` is configured with the best hyperparameter
    set, fitted on ``x_train``/``y_train`` and used to predict ``x_test``.
    The validation arguments are accepted for interface compatibility but are
    not used here.

    Args:
        x_train (numpy.ndarray): Training instances, numpy array of shape
            (N, K), where N is the number of instances and K is the number
            of attributes.
        y_train (numpy.ndarray): Class labels, numpy array of shape (N, ).
            Each element is a str.
        x_test (numpy.ndarray): Test instances, numpy array of shape (M, K),
            where M is the number of test instances.
        x_val (numpy.ndarray): Validation instances, numpy array of shape
            (L, K), where L is the number of validation instances. Unused.
        y_val (numpy.ndarray): Class labels of the validation set, numpy
            array of shape (L, ). Unused.

    Returns:
        The value returned by the forest's ``predict(x_test)``, i.e. the
        predicted labels for the test instances.
    """
    # Best hyperparameter set for the forest.
    best_hyperparameters = {
        "feature_sel": True,
        "cross_val": False,
        "max_tree_depth": 13,
        "min_sample_size": 2,
        "num_trees": 20,
    }

    classifier = RandomForestClassifier()
    classifier.update_hyperparameters(**best_hyperparameters)
    classifier.fit(x_train, y_train)

    return classifier.predict(x_test)
def _report_performance(label, classifier, X_train, X_test, y_train, y_test):
    """Fit ``classifier`` and print its timings and accuracies, prefixed by ``label``.

    Args:
        label: Text placed at the start of both printed lines.
        classifier: Object exposing ``fit(X, y)``, ``predict(X)`` and
            ``accuracy_score(X, y)``.
        X_train: Training samples passed to ``fit`` and ``accuracy_score``.
        X_test: Held-out samples passed to ``predict`` and ``accuracy_score``.
        y_train: Labels for ``X_train``.
        y_test: Labels for ``X_test``.
    """
    # perf_counter() is a monotonic, high-resolution clock, so the measured
    # intervals cannot be skewed by system clock adjustments.
    fit_start = time.perf_counter()
    classifier.fit(X_train, y_train)
    fit_time = time.perf_counter() - fit_start

    predict_start = time.perf_counter()
    classifier.predict(X_test)  # only the latency matters; the result is unused
    predict_time = time.perf_counter() - predict_start

    print('{} 学習時間 : {:.6f} [sec] 推論時間 : {:.6f} [sec]'.format(
        label, fit_time, predict_time))

    train_accuracy = classifier.accuracy_score(X_train, y_train)
    test_accuracy = classifier.accuracy_score(X_test, y_test)
    print('{} train accuracy : {:.4f} test_accuracy : {:.4f}'.format(
        label, train_accuracy, test_accuracy))


def main():
    """Benchmark a decision tree and a random forest on iris at several depths.

    The data returned by ``datasets.load_iris()`` is split 70/30
    (``test_size=0.3``, ``random_state=0``). For each depth limit in
    ``[1, 2, 3, None]`` both models are fitted on all features, their
    training time, inference time and train/test accuracy are printed, and
    ``visualize`` is called once per model with the same depth.
    """
    dataset = datasets.load_iris()
    X_train, X_test, y_train, y_test = train_test_split(
        dataset['data'], dataset['target'], test_size=0.3, random_state=0)

    # Examine depth limits of 1 to 3 as well as the unlimited case.
    depth_list = [1, 2, 3, None]
    for depth in depth_list:
        print('######### max_depth = {} #########'.format(depth))

        # Using all features: accuracy, training time, inference time and
        # generalization (train vs. test accuracy).
        _report_performance(
            '決定木', DecisionTreeClassifier(max_depth=depth),
            X_train, X_test, y_train, y_test)
        _report_performance(
            'ランダムフォレスト', RandomForestClassifier(max_depth=depth),
            X_train, X_test, y_train, y_test)

        # Per the original note, visualize() restricts the data to two
        # features (petal length, petal width) and plots in two dimensions.
        visualize('decision_tree', max_depth=depth)
        visualize('random_forest', max_depth=depth)
def test_random_foerst_fit_predict(self):
    """Check that a 100-tree forest learns an XOR-style mapping.

    The training set is an 11-row pattern over the four binary input
    combinations, repeated six times (66 rows), where the label is 1 exactly
    when the two inputs differ. The forest must predict ``[0, 1, 1, 0]`` for
    the inputs ``[0, 0]``, ``[0, 1]``, ``[1, 0]``, ``[1, 1]``.
    """
    pattern_features = np.array([
        [0, 0], [0, 0],
        [0, 1], [0, 1], [0, 1],
        [1, 0], [1, 0], [1, 0],
        [1, 1], [1, 1], [1, 1],
    ])
    pattern_labels = np.array([0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0])

    repetitions = 6
    features = np.tile(pattern_features, (repetitions, 1))
    labels = np.tile(pattern_labels, repetitions)

    model = RandomForestClassifier(n_estimators=100)
    model.fit(features, labels)

    # Debugging hint: iterate over model._models to inspect each tree's
    # _node structure or its individual predict() output.
    queries = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
    predictions = model.predict(queries)
    self.assertEqual(predictions.tolist(), [0, 1, 1, 0])
f"with feature selection and sampling: {acc_val_4}") results = np.asarray([acc_val_1, acc_val_2, acc_val_3, acc_val_4]) print(f"Best result was achieved with setup {np.argmax(results) + 1}") """ # 4.2.4 starts here # Trees start following the best tree model from improvement 1 forest.update_hyperparameters(feature_sel=True, cross_val=False, max_tree_depth=13, min_sample_size=3, num_trees=10) forest.fit(x_train, y_train) pred_val = forest.predict(x_val) acc_val_5 = metrics.accuracy(pred_val, y_val) print(f"10 Best Tree Random Forest Validation Accuracy, ", f"with feature selection and sampling: {acc_val_5}") # start by tuning the trees used param_space = { "max_tree_depth": [x for x in range(13, 15)], "min_sample_size": [y for y in range(2, 4)], "num_trees": [10, 20] } best_param = metrics.grid_search(forest, x_train, y_train, x_val,