import pandas as pd

from rf import RandomForest


def search_param(xTrain, yTrain, xTest, yTest):
    perf = pd.DataFrame()
    # grid search scored by OOB error, with the tree count fixed at 250
    for crit in ['gini', 'entropy']:
        for mf in range(5, 12):
            for md in range(1, 6):
                for mls in range(5, 8):
                    print(crit, mf, md, mls)
                    rf = RandomForest(250, mf, crit, md, mls)
                    oobErr = rf.train(xTrain, yTrain)
                    # record the OOB error at each tree count for this configuration
                    tmpDF = pd.DataFrame.from_dict(oobErr, orient='index', columns=['err'])
                    tmpDF['nest'] = tmpDF.index
                    tmpDF['crit'] = crit
                    tmpDF['mf'] = mf
                    tmpDF['md'] = md
                    tmpDF['mls'] = mls
                    perf = pd.concat([perf, tmpDF])
    # clean up the index of the combined dataframe
    perf = perf.reset_index(drop=True)
    return perf
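# A minimal sketch (not part of the original script) of how the `perf` frame
# returned by search_param might be consumed: pick the row with the lowest
# OOB error. The column names follow search_param above; the helper name
# `best_config` is hypothetical.
def best_config(perf):
    best = perf.loc[perf['err'].idxmin()]
    # hyperparameters of the best run, plus the tree count at which it occurred
    return best[['crit', 'mf', 'md', 'mls', 'nest']], best['err']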
def main():
    # dataset paths
    iris_path = 'DataAnalysisProjectDesign/Experiment2/iris_train.arff'
    adult_path = 'DataAnalysisProjectDesign/Experiment2/adult_train.arff'
    # ask the user which dataset to use and how many trees to build
    data_choice = input('Enter 1 for iris; Enter 2 for adult:')
    dt_num = int(input('Enter your expected tree number:'))
    path = select_dataset(data_choice, iris_path, adult_path)
    # load the data and impute missing values
    data_obj = Data(path)
    data_obj.load_data()
    data_obj.fill_missing_data()
    # build and evaluate the random forest
    rf = RandomForest(data=data_obj, dt_num=dt_num)
    rf.bagging()
    rf.train_rf()
    correct_rate, conf_mat = rf.test_rf()
    return dt_num, correct_rate, conf_mat
import matplotlib.pyplot as plt

from rf import RandomForest, file_to_numpy

xTrain = file_to_numpy("q4xTrain.csv")
yTrain = file_to_numpy("q4yTrain.csv")
xTest = file_to_numpy("q4xTest.csv")
yTest = file_to_numpy("q4yTest.csv")

# sweep the forest size and record the final OOB error for each size
nests = range(1, 100)
results = []
for nest in nests:
    rf = RandomForest(nest=nest)
    res = rf.train(xTrain, yTrain)
    results.append(res[-1])
plt.xlabel("number of trees used")
plt.ylabel("OOB Error")
plt.plot(nests, results)
plt.savefig("nestcount")
plt.close()

# plot how the OOB error evolves as trees are added to a single 300-tree forest
rf = RandomForest(nest=300)
res = rf.train(xTrain, yTrain)
plt.plot(range(300), res)
plt.savefig("progress")
plt.close()

# sweep the number of features considered per split; the loop body below
# mirrors the nest sweep above ("maxFeat" is an assumed keyword name)
maxFeats = range(1, xTrain.shape[1])
results = []
for featset_length in maxFeats:
    rf = RandomForest(nest=300, maxFeat=featset_length)
    res = rf.train(xTrain, yTrain)
    results.append(res[-1])
plt.xlabel("max features per split")
plt.ylabel("OOB Error")
plt.plot(maxFeats, results)
plt.savefig("maxfeats")
plt.close()
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import make_blobs, make_regression
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.model_selection import train_test_split

# RandomForest, DecisionTree, and GradientBoostedDecisionTree are the local
# implementations under comparison; their import path depends on the project layout.


def ensemble_diff_plot():
    fig, axes = plt.subplots(3, 3)
    fig.set_size_inches(10, 10)
    for ax in axes.flatten():
        n_ex = 100
        n_trees = 50
        n_feats = np.random.randint(2, 100)
        max_depth_d = np.random.randint(1, 100)
        max_depth_r = np.random.randint(1, 10)
        classifier = np.random.choice([True, False])
        if classifier:
            # create a classification problem
            n_classes = np.random.randint(2, 10)
            X, Y = make_blobs(n_samples=n_ex, centers=n_classes, n_features=2)
            X, X_test, Y, Y_test = train_test_split(X, Y, test_size=0.3)
            n_feats = min(n_feats, X.shape[1])

            def loss(yp, y):
                return accuracy_score(yp, y)

            # initialize the models
            criterion = np.random.choice(["entropy", "gini"])
            mine = RandomForest(
                classifier=classifier,
                n_feats=n_feats,
                n_trees=n_trees,
                criterion=criterion,
                max_depth=max_depth_r,
            )
            mine_d = DecisionTree(criterion=criterion, max_depth=max_depth_d, classifier=classifier)
            mine_g = GradientBoostedDecisionTree(
                n_iter=10,
                max_depth=max_depth_d,
                classifier=classifier,
                learning_rate=1,
                loss="crossentropy",
                step_size="constant",
                split_criterion=criterion,
            )
        else:
            # create a regression problem
            X, Y = make_regression(n_samples=n_ex, n_features=1)
            X, X_test, Y, Y_test = train_test_split(X, Y, test_size=0.3)
            n_feats = min(n_feats, X.shape[1])

            # initialize the models
            criterion = "mse"
            loss = mean_squared_error
            mine = RandomForest(
                criterion=criterion,
                n_feats=n_feats,
                n_trees=n_trees,
                max_depth=max_depth_r,
                classifier=classifier,
            )
            mine_d = DecisionTree(criterion=criterion, max_depth=max_depth_d, classifier=classifier)
            mine_g = GradientBoostedDecisionTree(
                n_iter=10,
                max_depth=max_depth_d,
                classifier=classifier,
                learning_rate=1,
                loss="mse",
                step_size="adaptive",
                split_criterion=criterion,
            )

        # fit all three models
        mine.fit(X, Y)
        mine_d.fit(X, Y)
        mine_g.fit(X, Y)

        # get predictions on the test set
        y_pred_mine_test = mine.predict(X_test)
        y_pred_mine_test_d = mine_d.predict(X_test)
        y_pred_mine_test_g = mine_g.predict(X_test)

        loss_mine_test = loss(y_pred_mine_test, Y_test)
        loss_mine_test_d = loss(y_pred_mine_test_d, Y_test)
        loss_mine_test_g = loss(y_pred_mine_test_g, Y_test)

        if classifier:
            # pick one of the three models at random and plot its predicted classes
            entries = [
                ("RF", loss_mine_test, y_pred_mine_test),
                ("DT", loss_mine_test_d, y_pred_mine_test_d),
                ("GB", loss_mine_test_g, y_pred_mine_test_g),
            ]
            (lbl, test_loss, preds) = entries[np.random.randint(3)]
            ax.set_title("{} Accuracy: {:.2f}%".format(lbl, test_loss * 100))
            for i in np.unique(Y_test):
                ax.scatter(
                    X_test[preds == i, 0].flatten(),
                    X_test[preds == i, 1].flatten(),
                )
        else:
            # plot the three regression fits over a dense grid of inputs
            X_ax = np.linspace(
                np.min(X_test.flatten()) - 1, np.max(X_test.flatten()) + 1, 100
            ).reshape(-1, 1)
            y_pred_mine_test = mine.predict(X_ax)
            y_pred_mine_test_d = mine_d.predict(X_ax)
            y_pred_mine_test_g = mine_g.predict(X_ax)

            ax.scatter(X_test.flatten(), Y_test.flatten(), c="b", alpha=0.5)
            ax.plot(X_ax.flatten(), y_pred_mine_test_g.flatten(), label="GB", color="red")
            ax.plot(X_ax.flatten(), y_pred_mine_test.flatten(), label="RF", color="cornflowerblue")
            ax.plot(X_ax.flatten(), y_pred_mine_test_d.flatten(), label="DT", color="yellowgreen")
            ax.set_title(
                "GB: {:.1f} / RF: {:.1f} / DT: {:.1f}".format(
                    loss_mine_test_g, loss_mine_test, loss_mine_test_d
                )
            )
            ax.legend()
        ax.xaxis.set_ticklabels([])
        ax.yaxis.set_ticklabels([])
    # plt.savefig("plot.png", dpi=300)
    plt.show()
    plt.close("all")
import numpy as np
from sklearn.datasets import make_blobs, make_regression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.model_selection import train_test_split

# RandomForest is the local implementation under test; its import path
# depends on the project layout.


def test_RandomForest():
    np.random.seed(12345)
    i = 1
    while True:
        n_ex = np.random.randint(2, 100)
        n_feats = np.random.randint(2, 100)
        n_trees = np.random.randint(2, 100)
        max_depth = np.random.randint(1, 5)
        classifier = np.random.choice([True, False])
        if classifier:
            # create a classification problem
            n_classes = np.random.randint(2, 10)
            X, Y = make_blobs(
                n_samples=n_ex, centers=n_classes, n_features=n_feats, random_state=i
            )
            X, X_test, Y, Y_test = train_test_split(X, Y, test_size=0.3, random_state=i)

            def loss(yp, y):
                return 1 - accuracy_score(yp, y)

            # initialize the models
            criterion = np.random.choice(["entropy", "gini"])
            mine = RandomForest(
                classifier=classifier,
                n_feats=n_feats,
                n_trees=n_trees,
                criterion=criterion,
                max_depth=max_depth,
            )
            gold = RandomForestClassifier(
                n_estimators=n_trees,
                max_features=n_feats,
                criterion=criterion,
                max_depth=max_depth,
                bootstrap=True,
            )
        else:
            # create a regression problem
            X, Y = make_regression(n_samples=n_ex, n_features=n_feats, random_state=i)
            X, X_test, Y, Y_test = train_test_split(X, Y, test_size=0.3, random_state=i)

            # initialize the models
            criterion = "mse"
            loss = mean_squared_error
            mine = RandomForest(
                criterion=criterion,
                n_feats=n_feats,
                n_trees=n_trees,
                max_depth=max_depth,
                classifier=classifier,
            )
            gold = RandomForestRegressor(
                n_estimators=n_trees,
                max_features=n_feats,
                criterion=criterion,
                max_depth=max_depth,
                bootstrap=True,
            )

        print("Trial {}".format(i))
        print("\tClassifier={}, criterion={}".format(classifier, criterion))
        print("\tmax_depth={}, n_feats={}, n_ex={}".format(max_depth, n_feats, n_ex))
        if classifier:
            print("\tn_classes: {}".format(n_classes))

        # fit both models
        mine.fit(X, Y)
        gold.fit(X, Y)

        # compare losses on the training set
        y_pred_mine = mine.predict(X)
        y_pred_gold = gold.predict(X)
        loss_mine = loss(y_pred_mine, Y)
        loss_gold = loss(y_pred_gold, Y)

        # compare losses on the test set
        y_pred_mine_test = mine.predict(X_test)
        y_pred_gold_test = gold.predict(X_test)
        loss_mine_test = loss(y_pred_mine_test, Y_test)
        loss_gold_test = loss(y_pred_gold_test, Y_test)

        try:
            np.testing.assert_almost_equal(loss_mine, loss_gold)
            print("\tLoss on training: {}".format(loss_mine))
        except AssertionError as e:
            print("\tTraining losses not equal:\n{}".format(e))

        try:
            np.testing.assert_almost_equal(loss_mine_test, loss_gold_test)
            print("\tLoss on test: {}".format(loss_mine_test))
        except AssertionError as e:
            print("\tTest losses not equal:\n{}".format(e))

        print("PASSED")
        i += 1
import numpy as np
from sklearn.datasets import make_blobs, make_regression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.model_selection import train_test_split

# Variant of the test above with fixed hyperparameters instead of random ones.


def test_RandomForest():
    np.random.seed(12345)
    i = 1
    while True:
        n_ex = np.random.randint(2, 100)
        n_feats = 50  # np.random.randint(2, 100)
        n_trees = 20  # np.random.randint(2, 100)
        max_depth = 3  # np.random.randint(1, 5)
        classifier = True  # np.random.choice([True, False])
        if classifier:
            # create a classification problem
            n_classes = np.random.randint(2, 10)
            X, Y = make_blobs(
                n_samples=n_ex, centers=n_classes, n_features=n_feats, random_state=i
            )
            X, X_test, Y, Y_test = train_test_split(X, Y, test_size=0.3, random_state=i)

            def loss(yp, y):
                return 1 - accuracy_score(yp, y)

            # initialize the models
            criterion = "gini"  # np.random.choice(["entropy", "gini"])
            mine = RandomForest(
                classifier=classifier,
                n_feats=n_feats,
                n_trees=n_trees,
                criterion=criterion,
                max_depth=max_depth,
            )
            # n_estimators: the number of sub-trees to build before taking the
            # majority vote (or average) for a prediction. More trees give the
            # model better performance but make the code slower; choose the
            # highest value your processor can handle, since it makes the
            # predictions better and more stable.
            # max_features: the maximum number of features a single decision
            # tree is allowed to use. Python offers several options, including:
            #   Auto/None: simply use all features for every tree, so no tree
            #     is restricted in any way.
            #   sqrt: each sub-tree may use the square root of the total
            #     feature count; e.g. with 100 features, each sub-tree gets
            #     only 10 of them ("log2" is a similar option).
            #   0.2: each sub-tree may use 20% of the features; the "0.X"
            #     format lets you grant any x% of the features.
            gold = RandomForestClassifier(
                n_estimators=n_trees,
                max_features=n_feats,
                criterion=criterion,
                max_depth=max_depth,
                bootstrap=True,
            )
        else:
            # create a regression problem
            X, Y = make_regression(n_samples=n_ex, n_features=n_feats, random_state=i)
            X, X_test, Y, Y_test = train_test_split(X, Y, test_size=0.3, random_state=i)

            # initialize the models
            criterion = "mse"
            loss = mean_squared_error
            mine = RandomForest(
                criterion=criterion,
                n_feats=n_feats,
                n_trees=n_trees,
                max_depth=max_depth,
                classifier=classifier,
            )
            gold = RandomForestRegressor(
                n_estimators=n_trees,
                max_features=n_feats,
                criterion=criterion,
                max_depth=max_depth,
                bootstrap=True,
            )

        print("Trial {}".format(i))
        print("\tClassifier={}, criterion={}".format(classifier, criterion))
        print("\tmax_depth={}, n_feats={}, n_ex={}".format(max_depth, n_feats, n_ex))
        if classifier:
            print("\tn_classes: {}".format(n_classes))

        # fit both models
        mine.fit(X, Y)
        gold.fit(X, Y)

        # compare accuracy on the training set
        y_pred_mine = mine.predict(X)
        y_pred_gold = gold.predict(X)
        loss_mine = loss(y_pred_mine, Y)
        loss_gold = loss(y_pred_gold, Y)
        print(f"train data mine acc: {1 - loss_mine}")
        print(f"train data sklearn acc: {1 - loss_gold}")

        # compare accuracy on the test set
        y_pred_mine_test = mine.predict(X_test)
        y_pred_gold_test = gold.predict(X_test)
        loss_mine_test = loss(y_pred_mine_test, Y_test)
        loss_gold_test = loss(y_pred_gold_test, Y_test)
        print(f"test data mine acc: {1 - loss_mine_test}")
        print(f"test data sklearn acc: {1 - loss_gold_test}")

        # exact-loss assertions against sklearn are disabled in favor of the
        # accuracy printouts above
        # try:
        #     np.testing.assert_almost_equal(loss_mine, loss_gold)
        #     print("\tLoss on training: {}".format(loss_mine))
        # except AssertionError as e:
        #     print("\tTraining losses not equal:\n{}".format(e))
        # try:
        #     np.testing.assert_almost_equal(loss_mine_test, loss_gold_test)
        #     print("\tLoss on test: {}".format(loss_mine_test))
        # except AssertionError as e:
        #     print("\tTest losses not equal:\n{}".format(e))

        print("PASSED")
        i += 1
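# A small, self-contained illustration (assumed, not from the original file) of
# the max_features options described in the comments above, using sklearn's
# RandomForestClassifier. Each fitted tree exposes its resolved feature count
# as the max_features_ attribute.
from sklearn.datasets import make_blobs
from sklearn.ensemble import RandomForestClassifier

X_demo, y_demo = make_blobs(n_samples=200, centers=3, n_features=100, random_state=0)
for mf in [None, "sqrt", "log2", 0.2, 10]:
    clf = RandomForestClassifier(n_estimators=20, max_features=mf, random_state=0)
    clf.fit(X_demo, y_demo)
    # with 100 features: None -> 100, "sqrt" -> 10, "log2" -> 6, 0.2 -> 20, 10 -> 10
    print(mf, clf.estimators_[0].max_features_)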
import sklearn.metrics as skm

from rf import RandomForest


def eval_opt(xTrain, yTrain, xTest, yTest):
    # train a forest with the best hyperparameters found by the grid search
    bst = RandomForest(47, 7, 'gini', 4, 5)
    bst.train(xTrain, yTrain)  # fit on the training data before predicting
    ypred = bst.predict(xTest)
    # report the test error rate
    print(1 - skm.accuracy_score(yTest, ypred))