def fit(self, X, Y):
    """Fit an AdaBoost-style ensemble of decision-tree weak learners.

    Parameters
    ----------
    X : ndarray of shape (N, M)
        Training features.
    Y : ndarray of shape (N,) or (N, 1)
        Labels. The exponential re-weighting rule below assumes they are
        encoded as -1/+1.
    """
    N, M = X.shape
    # Work with an explicit column vector so comparisons/broadcasts below
    # stay (N, 1) instead of silently broadcasting to (N, N).
    Y_col = np.asarray(Y).reshape(-1, 1)

    self.learners = np.empty((self.n_iter, 1), dtype=object)
    self.alpha = np.ones((self.n_iter, 1))
    # FIX: this initialization was commented out, so `self.weights` was
    # undefined on its first use below. Start from a uniform distribution.
    self.weights = np.mat(np.ones((N, 1)) / N)  # 样本权重 (sample weights)

    for i in range(0, self.n_iter):  # one weak learner per boosting round
        # FIX: boosting against 0/1 error needs a classification tree;
        # the original passed classifier=False with an entropy criterion.
        # NOTE(review): `DecisionTree.fit` takes no sample weights here, so
        # each round refits the same unweighted data and the weights only
        # influence alpha — confirm against the tree's interface.
        t = DecisionTree(classifier=True,
                         max_depth=self.max_depth,
                         criterion="entropy")
        t.fit(X, Y)
        self.learners[i] = t

        Y_pred = np.asarray(t.predict(X)).reshape(-1, 1)
        errArr = np.mat(np.ones((N, 1)))
        errArr[Y_pred == Y_col] = 0  # 0 where correct, 1 where wrong
        # FIX: the weighted 0/1 error is the scalar w^T e; the original
        # `self.weights * errArr` was an invalid (N,1)x(N,1) matmul.
        weightedError = float(self.weights.T * errArr)
        # FIX: `np.max(weightedError, np.inf)` passed np.inf as the `axis`
        # argument (TypeError). The classic guard is max(err, eps), which
        # avoids division by zero when the learner classifies perfectly.
        self.alpha[i] = float(
            0.5 * np.log((1.0 - weightedError) / max(weightedError, 1e-16)))

        # Up-weight misclassified samples, down-weight correct ones,
        # then renormalize back to a distribution.
        expon = np.multiply(-1 * self.alpha[i] * np.mat(Y_col), Y_pred)
        self.weights = np.multiply(self.weights, np.exp(expon))
        self.weights = self.weights / self.weights.sum()
def fit(self, x):
    """Grow the ensemble: one decision tree per random subsample of ``x``.

    Each tree is configured with the forest's hyper-parameters, trained on
    a fresh subsample drawn via ``self.subsample``, and stored on
    ``self.trees``; the fitted list is also returned to the caller.
    """
    def _grow_one():
        # A fresh learner per iteration; each sees its own subsample.
        learner = DecisionTree(
            max_depth=self.max_depth,
            min_size=self.min_size,
            features_ratio=self.features_ratio,
        )
        learner.fit(self.subsample(x, self.sampling_ratio))
        return learner

    self.trees = [_grow_one() for _ in range(self.num_trees)]
    return self.trees
def fit(self, X, Y):
    """Fit the gradient-boosted decision-tree ensemble.

    Parameters
    ----------
    X : ndarray of shape (N, M)
        Training features.
    Y : ndarray of shape (N,) or (N, K)
        Targets. For classification, labels are one-hot encoded; for
        regression, a 1-D Y is reshaped to a single column.

    Raises
    ------
    ValueError
        If ``self.loss`` is not one of "mse" or "crossentropy".
    """
    if self.loss == "mse":
        loss = MSELoss()
    elif self.loss == "crossentropy":
        loss = CrossEntropyLoss()
    else:
        # FIX: an unrecognized loss previously left `loss` unbound and
        # only surfaced later as a confusing NameError.
        raise ValueError("Unrecognized loss: {}".format(self.loss))

    # convert Y to one_hot if not already
    if self.classifier:
        Y = to_one_hot(Y.flatten())
    else:
        Y = Y.reshape(-1, 1) if len(Y.shape) == 1 else Y

    N, M = X.shape
    self.out_dims = Y.shape[1]
    self.learners = np.empty((self.n_iter, self.out_dims), dtype=object)
    self.weights = np.ones((self.n_iter, self.out_dims))
    # every learner after the base estimator is shrunk by the learning rate
    self.weights[1:, :] *= self.learning_rate

    # fit the base estimator (round 0) for each output dimension
    Y_pred = np.zeros((N, self.out_dims))
    for k in range(self.out_dims):
        t = loss.base_estimator()
        t.fit(X, Y[:, k])
        Y_pred[:, k] += t.predict(X)
        self.learners[0, k] = t

    # incrementally fit each learner on the negative gradient of the loss
    # wrt the previous fit (pseudo-residuals)
    for i in range(1, self.n_iter):
        for k in range(self.out_dims):
            y, y_pred = Y[:, k], Y_pred[:, k]
            neg_grad = -1 * loss.grad(y, y_pred)

            # use MSE as the surrogate loss when fitting to negative gradients
            t = DecisionTree(classifier=False,
                             max_depth=self.max_depth,
                             criterion="mse")

            # fit current learner to negative gradients
            t.fit(X, neg_grad)
            self.learners[i, k] = t

            # compute step size and weight for the current learner
            step = 1.0
            h_pred = t.predict(X)
            if self.step_size == "adaptive":
                step = loss.line_search(y, y_pred, h_pred)

            # update weights and our overall prediction for Y
            self.weights[i, k] *= step
            Y_pred[:, k] += self.weights[i, k] * h_pred
def fit(self, X, Y):
    """
    Build the forest: draw `n_trees` bootstrap resamples of the training
    data and fit an independent decision tree to each one.
    """
    def _grow_tree():
        # Each tree trains on its own bootstrap sample of (X, Y).
        X_boot, Y_boot = bootstrap_sample(X, Y)
        learner = DecisionTree(
            n_feats=self.n_feats,
            max_depth=self.max_depth,
            criterion=self.criterion,
            classifier=self.classifier,
        )
        learner.fit(X_boot, Y_boot)
        return learner

    self.trees = [_grow_tree() for _ in range(self.n_trees)]
def test_regressor():
    """Sanity-check the from-scratch `DecisionTree` regressor against
    sklearn's `DecisionTreeRegressor` on the Boston housing data,
    printing each model's held-out MSE (expected values noted inline).

    NOTE(review): `load_boston` was deprecated in scikit-learn 1.0 and
    removed in 1.2 — this check only runs on older sklearn versions.
    """
    boston = load_boston()
    X, Xtest, Y, Ytest = train_test_split(boston.data,
                                          boston.target,
                                          test_size=0.2,
                                          random_state=0)

    # reference implementation: sklearn's regressor
    regressor = DecisionTreeRegressor(random_state=0)
    regressor.fit(X, Y)
    y_pred = regressor.predict(Xtest)
    err = mean_squared_error(Ytest, y_pred)
    print(err)  # 32.41637254901961

    # from-scratch implementation; function-local import keeps the module
    # importable even when `dt` is absent
    from dt import DecisionTree
    mine = DecisionTree(criterion="mse", classifier=False)
    mine.fit(X, Y)
    y_pred = mine.predict(Xtest)
    err = mean_squared_error(Ytest, y_pred)
    print(err)  # 32.74450980392157
def models():
    """Compare the from-scratch decision tree with sklearn's classifier.

    Trains both models on the first two iris features (matched
    hyper-parameters: depth 2, min split 10), then prints each model's
    accuracy score and the zipped (scratch, sklearn, truth) predictions.
    """
    iris = load_iris()
    feats = iris.data[:, :2]   # only the first two features
    labels = iris.target
    X_train, X_test, y_train, y_test = train_test_split(
        feats, labels, random_state=42)

    # --- from-scratch decision tree ---
    scratch_dt_model = DecisionTree(max_depth=2, min_splits=10)
    scratch_dt_model.fit(X_train, y_train)
    scratch_dt_model_pred = scratch_dt_model.pred(X_test)

    # --- sklearn decision tree ---
    sk_dt_model = DecisionTreeClassifier(max_depth=2, min_samples_split=10)
    sk_dt_model.fit(X_train, y_train)
    sk_dt_model_pred = sk_dt_model.predict(X_test)

    # --- results ---
    print("Scratch Model Accuracy : {0}".format(
        acc_score(scratch_dt_model_pred, y_test)))
    print("SK-Learn Model Accuracy : {0}".format(
        acc_score(sk_dt_model_pred, y_test)))
    # side-by-side predictions plus the ground-truth label
    print(list(zip(scratch_dt_model_pred, sk_dt_model_pred, y_test)))
def ensemble_diff_plot():
    """Draw a 3x3 grid comparing RandomForest (RF), DecisionTree (DT) and
    GradientBoostedDecisionTree (GB) on randomly generated problems.

    Each cell independently draws random hyper-parameters and a random
    problem type: classification (2-D blobs; one model is picked at random
    and its test predictions scattered by class) or regression (1-D data;
    all three fitted curves are overlaid on the test scatter).
    """
    fig, axes = plt.subplots(3, 3)
    fig.set_size_inches(10, 10)
    for ax in axes.flatten():
        # random problem size / hyper-parameters for this cell
        n_ex = 100
        n_trees = 50
        n_feats = np.random.randint(2, 100)
        max_depth_d = np.random.randint(1, 100)  # DT / GB depth
        max_depth_r = np.random.randint(1, 10)   # RF depth
        classifier = np.random.choice([True, False])
        if classifier:
            # create classification problem
            n_classes = np.random.randint(2, 10)
            X, Y = make_blobs(n_samples=n_ex, centers=n_classes, n_features=2)
            X, X_test, Y, Y_test = train_test_split(X, Y, test_size=0.3)
            n_feats = min(n_feats, X.shape[1])

            # loss = accuracy (higher is better in this branch)
            def loss(yp, y):
                return accuracy_score(yp, y)

            # initialize models
            criterion = np.random.choice(["entropy", "gini"])
            mine = RandomForest(
                classifier=classifier,
                n_feats=n_feats,
                n_trees=n_trees,
                criterion=criterion,
                max_depth=max_depth_r,
            )
            mine_d = DecisionTree(criterion=criterion,
                                  max_depth=max_depth_d,
                                  classifier=classifier)
            mine_g = GradientBoostedDecisionTree(
                n_iter=10,
                max_depth=max_depth_d,
                classifier=classifier,
                learning_rate=1,
                loss="crossentropy",
                step_size="constant",
                split_criterion=criterion,
            )
        else:
            # create regression problem
            X, Y = make_regression(n_samples=n_ex, n_features=1)
            X, X_test, Y, Y_test = train_test_split(X, Y, test_size=0.3)
            n_feats = min(n_feats, X.shape[1])

            # loss = MSE (lower is better in this branch)
            criterion = "mse"
            loss = mean_squared_error
            mine = RandomForest(
                criterion=criterion,
                n_feats=n_feats,
                n_trees=n_trees,
                max_depth=max_depth_r,
                classifier=classifier,
            )
            mine_d = DecisionTree(criterion=criterion,
                                  max_depth=max_depth_d,
                                  classifier=classifier)
            mine_g = GradientBoostedDecisionTree(
                # n_trees=n_trees,
                n_iter=10,
                max_depth=max_depth_d,
                classifier=classifier,
                learning_rate=1,
                loss="mse",
                step_size="adaptive",
                split_criterion=criterion,
            )

        # fit 'em
        mine.fit(X, Y)
        mine_d.fit(X, Y)
        mine_g.fit(X, Y)

        # get preds on test set
        y_pred_mine_test = mine.predict(X_test)
        y_pred_mine_test_d = mine_d.predict(X_test)
        y_pred_mine_test_g = mine_g.predict(X_test)

        loss_mine_test = loss(y_pred_mine_test, Y_test)
        loss_mine_test_d = loss(y_pred_mine_test_d, Y_test)
        loss_mine_test_g = loss(y_pred_mine_test_g, Y_test)

        if classifier:
            # pick one of the three models at random and scatter its
            # test predictions, one color per predicted class
            entries = [("RF", loss_mine_test, y_pred_mine_test),
                       ("DT", loss_mine_test_d, y_pred_mine_test_d),
                       ("GB", loss_mine_test_g, y_pred_mine_test_g)]
            (lbl, test_loss, preds) = entries[np.random.randint(3)]
            ax.set_title("{} Accuracy: {:.2f}%".format(lbl, test_loss * 100))
            for i in np.unique(Y_test):
                ax.scatter(
                    X_test[preds == i, 0].flatten(),
                    X_test[preds == i, 1].flatten(),
                    # s=0.5,
                )
        else:
            # evaluate each model on a dense grid spanning the test range
            # so the fitted curves can be drawn as smooth lines
            X_ax = np.linspace(
                np.min(X_test.flatten()) - 1,
                np.max(X_test.flatten()) + 1, 100).reshape(-1, 1)
            y_pred_mine_test = mine.predict(X_ax)
            y_pred_mine_test_d = mine_d.predict(X_ax)
            y_pred_mine_test_g = mine_g.predict(X_ax)

            ax.scatter(X_test.flatten(), Y_test.flatten(), c="b",
                       alpha=0.5)  # s=0.5)
            ax.plot(
                X_ax.flatten(),
                y_pred_mine_test_g.flatten(),
                # linewidth=0.5,
                label="GB".format(n_trees, n_feats, max_depth_d),
                color="red",
            )
            ax.plot(
                X_ax.flatten(),
                y_pred_mine_test.flatten(),
                # linewidth=0.5,
                label="RF".format(n_trees, n_feats, max_depth_r),
                color="cornflowerblue",
            )
            ax.plot(
                X_ax.flatten(),
                y_pred_mine_test_d.flatten(),
                # linewidth=0.5,
                label="DT".format(max_depth_d),
                color="yellowgreen",
            )
            ax.set_title("GB: {:.1f} / RF: {:.1f} / DT: {:.1f} ".format(
                loss_mine_test_g, loss_mine_test, loss_mine_test_d))
            ax.legend()
        ax.xaxis.set_ticklabels([])
        ax.yaxis.set_ticklabels([])
    # plt.savefig("plot.png", dpi=300)
    plt.show()
    plt.close("all")
def dt_plot():
    """Draw a 2x2 grid of from-scratch DecisionTree fits on random problems.

    `classifier` is currently drawn from [True] only, so every cell shows a
    classification problem (test points scattered by predicted class); the
    regression branch is kept for when False is re-enabled.
    """
    fig, axes = plt.subplots(2, 2)
    # fig.set_size_inches(10,10)
    for ax in axes.flatten():
        # random problem size / hyper-parameters for this cell
        n_ex = 100
        n_trees = 50  # belongs to rf (unused here)
        n_feats = np.random.randint(2, 100)
        max_depth_d = np.random.randint(1, 100)
        max_depth_r = np.random.randint(1, 10)  # belongs to rf (unused here)
        classifier = np.random.choice([True])  #, False

        # generate samples for the chosen problem (classification --> label,
        # regression --> continuous target)
        if classifier:
            # create classification problem
            n_classes = np.random.randint(2, 10)
            X, Y = make_blobs(n_samples=n_ex, centers=n_classes, n_features=2)
            X, X_test, Y, Y_test = train_test_split(X, Y, test_size=0.3)
            n_feats = min(n_feats, X.shape[1])

            # loss = accuracy (higher is better in this branch)
            def loss(yp, y):
                return accuracy_score(yp, y)

            # initialize model
            criterion = np.random.choice(["entropy", "gini"])
            mine_d = DecisionTree(criterion=criterion,
                                  max_depth=max_depth_d,
                                  classifier=classifier)
        else:
            # create regression problem
            X, Y = make_regression(n_samples=n_ex, n_features=1)
            X, X_test, Y, Y_test = train_test_split(X, Y, test_size=0.3)
            n_feats = min(n_feats, X.shape[1])

            # loss = MSE (lower is better in this branch)
            criterion = "mse"
            loss = mean_squared_error
            mine_d = DecisionTree(criterion=criterion,
                                  max_depth=max_depth_d,
                                  classifier=classifier)

        # fit 'em
        mine_d.fit(X, Y)

        # get preds on test set
        y_pred_mine_test_d = mine_d.predict(X_test)
        loss_mine_test_d = loss(y_pred_mine_test_d, Y_test)

        if classifier:
            # single-entry list mirrors the ensemble plot's random pick
            entries = [("DT", loss_mine_test_d, y_pred_mine_test_d)]
            (lbl, test_loss, preds) = entries[np.random.randint(1)]
            ax.set_title("{} Accuracy: {:.2f}%".format(lbl, test_loss * 100))
            for i in np.unique(Y_test):
                ax.scatter(
                    X_test[preds == i, 0].flatten(),
                    X_test[preds == i, 1].flatten(),
                    # s=0.5,
                )
        else:
            # dense grid over the test range for a smooth fitted curve
            X_ax = np.linspace(
                np.min(X_test.flatten()) - 1,
                np.max(X_test.flatten()) + 1, 100).reshape(-1, 1)
            y_pred_mine_test_d = mine_d.predict(X_ax)

            ax.scatter(X_test.flatten(), Y_test.flatten(), c="b",
                       alpha=0.5)  # s=0.5)
            ax.plot(
                X_ax.flatten(),
                y_pred_mine_test_d.flatten(),
                # linewidth=0.5,
                label="DT".format(max_depth_d),
                color="yellowgreen",
            )
            ax.set_title("DT: {:.1f} ".format(loss_mine_test_d))
            ax.legend()
        ax.xaxis.set_ticklabels([])
        ax.yaxis.set_ticklabels([])
    # plt.savefig("plot.png", dpi=300)
    plt.show()
def test_DecisionTree():
    """Fuzz-test the from-scratch DecisionTree against sklearn's trees.

    Runs forever (stop manually): each trial draws a random problem
    (classification via blobs, or regression) and random hyper-parameters,
    fits both implementations, and compares train/test losses with
    `assert_almost_equal`, printing any mismatch instead of raising.
    """
    i = 1
    np.random.seed(12345)
    while True:
        # random problem size / hyper-parameters for this trial
        n_ex = np.random.randint(2, 100)
        n_feats = np.random.randint(2, 100)
        max_depth = np.random.randint(1, 5)

        classifier = np.random.choice([True, False])
        if classifier:
            # create classification problem
            n_classes = np.random.randint(2, 10)
            X, Y = make_blobs(
                n_samples=n_ex, centers=n_classes, n_features=n_feats, random_state=i
            )
            X, X_test, Y, Y_test = train_test_split(X, Y, test_size=0.3, random_state=i)

            # loss = misclassification rate (so lower is better, like MSE)
            def loss(yp, y):
                return 1 - accuracy_score(yp, y)

            criterion = np.random.choice(["entropy", "gini"])
            mine = DecisionTree(
                classifier=classifier, max_depth=max_depth, criterion=criterion
            )
            gold = DecisionTreeClassifier(
                criterion=criterion,
                max_depth=max_depth,
                splitter="best",
                random_state=i,
            )
        else:
            # create regression problem
            X, Y = make_regression(n_samples=n_ex, n_features=n_feats, random_state=i)
            X, X_test, Y, Y_test = train_test_split(X, Y, test_size=0.3, random_state=i)

            # initialize model
            criterion = "mse"
            loss = mean_squared_error
            mine = DecisionTree(
                criterion=criterion, max_depth=max_depth, classifier=classifier
            )
            gold = DecisionTreeRegressor(
                criterion=criterion, max_depth=max_depth, splitter="best"
            )

        print("Trial {}".format(i))
        print("\tClassifier={}, criterion={}".format(classifier, criterion))
        print("\tmax_depth={}, n_feats={}, n_ex={}".format(max_depth, n_feats, n_ex))
        if classifier:
            print("\tn_classes: {}".format(n_classes))

        # fit 'em
        mine.fit(X, Y)
        gold.fit(X, Y)

        # get preds on training set
        y_pred_mine = mine.predict(X)
        y_pred_gold = gold.predict(X)

        loss_mine = loss(y_pred_mine, Y)
        loss_gold = loss(y_pred_gold, Y)

        # get preds on test set
        y_pred_mine_test = mine.predict(X_test)
        y_pred_gold_test = gold.predict(X_test)

        loss_mine_test = loss(y_pred_mine_test, Y_test)
        loss_gold_test = loss(y_pred_gold_test, Y_test)

        # compare training losses; report mismatches without aborting the loop
        try:
            np.testing.assert_almost_equal(loss_mine, loss_gold)
            print("\tLoss on training: {}".format(loss_mine))
        except AssertionError as e:
            print("\tTraining losses not equal:\n{}".format(e))

        # compare test losses the same way
        try:
            np.testing.assert_almost_equal(loss_mine_test, loss_gold_test)
            print("\tLoss on test: {}".format(loss_mine_test))
        except AssertionError as e:
            print("\tTest losses not equal:\n{}".format(e))
        i += 1
accuracy = np.sum(y_true == y_pred) / len(y_true) return accuracy df = pd.read_csv('Training.csv') X = df.iloc[:, 0:132].values y = df.iloc[:, -1].values labelencoder_Y = LabelEncoder() y = labelencoder_Y.fit_transform(y) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234) clf = DecisionTree(max_depth=10) clf.fit(X_train, y_train) y_pred = clf.predict(X_test) acc = accuracy(y_test, y_pred) print ("Accuracy:", acc*100, "%") # from matplotlib import pyplot as plt def saveModel(): with open("DTModel", "wb") as f: pickle.dump(clf, f) saveModel() # header = ['itching','skin_rash','nodal_skin_eruptions','continuous_sneezing','shivering','chills','joint_pain', # 'stomach_pain','acidity','ulcers_on_tongue','muscle_wasting','vomiting','burning_micturition',