def fit(self, X, Y):
        N, M = X.shape
        self.learners = np.empty((self.n_iter, 1), dtype=object)
        self.alpha = np.ones((self.n_iter, 1))  # per-learner vote weights
        self.weights = np.mat(np.ones((N, 1)) / N)  # sample weights
        Y_col = np.asarray(Y).reshape(-1, 1)

        for i in range(0, self.n_iter):  # one weak learner per boosting round
            # fit a shallow classification tree as the weak learner
            # (note: this DecisionTree.fit takes no sample weights, so the
            # re-weighting below only affects each learner's vote weight)
            t = DecisionTree(classifier=True,
                             max_depth=self.max_depth,
                             criterion="entropy")
            t.fit(X, Y)
            self.learners[i] = t

            # weighted 0-1 error of the current learner
            Y_pred = np.asarray(t.predict(X)).reshape(-1, 1)
            errArr = np.mat(np.ones((N, 1)))
            errArr[Y_pred == Y_col] = 0
            weightedError = float(self.weights.T * errArr)

            # AdaBoost vote weight: 0.5 * ln((1 - err) / err), with a small
            # floor on err to avoid division by zero
            self.alpha[i] = 0.5 * np.log(
                (1.0 - weightedError) / max(weightedError, 1e-16))

            # re-weight samples: misclassified examples (Y * Y_pred < 0 for
            # {-1, +1} labels) get larger weights in the next round
            expon = np.multiply(-self.alpha[i] * Y_col, Y_pred)
            self.weights = np.multiply(self.weights, np.exp(expon))
            self.weights = self.weights / self.weights.sum()
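The snippet stops at fit; a minimal matching predict sketch (not part of the original, and it assumes {-1, +1} labels from each weak learner) takes the sign of the alpha-weighted vote:

def predict(self, X):
    # sign of the alpha-weighted sum of weak-learner votes
    agg = np.zeros((X.shape[0], 1))
    for i in range(self.n_iter):
        h = np.asarray(self.learners[i, 0].predict(X)).reshape(-1, 1)
        agg += self.alpha[i] * h
    return np.sign(agg)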
Example #2
def fit(self, x):
    trees = []
    for _ in range(self.num_trees):
        # grow each tree on an independent random subsample of the data
        tree = DecisionTree(max_depth=self.max_depth,
                            min_size=self.min_size,
                            features_ratio=self.features_ratio)
        subsample = self.subsample(x, self.sampling_ratio)
        tree.fit(subsample)
        trees.append(tree)
    self.trees = trees
    return trees
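self.subsample is referenced but not defined in this snippet; one plausible sketch (an assumption, not the original helper) draws a random fraction of the rows without replacement:

def subsample(self, x, sampling_ratio):
    # keep a random sampling_ratio fraction of the rows; assumes x is a
    # NumPy array with the label in its last column, as tree.fit expects
    n = int(round(len(x) * sampling_ratio))
    idx = np.random.choice(len(x), size=n, replace=False)
    return x[idx]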
Example #3
    def fit(self, X, Y):
        if self.loss == "mse":
            loss = MSELoss()
        elif self.loss == "crossentropy":
            loss = CrossEntropyLoss()
        else:
            raise ValueError("Unrecognized loss: {}".format(self.loss))

        # convert Y to one_hot if not already
        if self.classifier:
            Y = to_one_hot(Y.flatten())
        else:
            Y = Y.reshape(-1, 1) if len(Y.shape) == 1 else Y

        N, M = X.shape
        self.out_dims = Y.shape[1]
        self.learners = np.empty((self.n_iter, self.out_dims), dtype=object)
        self.weights = np.ones((self.n_iter, self.out_dims))
        self.weights[1:, :] *= self.learning_rate

        # fit the base estimator
        Y_pred = np.zeros((N, self.out_dims))
        for k in range(self.out_dims):
            t = loss.base_estimator()
            t.fit(X, Y[:, k])
            Y_pred[:, k] += t.predict(X)
            self.learners[0, k] = t

        # incrementally fit each learner on the negative gradient of the loss
        # wrt the previous fit (pseudo-residuals)
        for i in range(1, self.n_iter):
            for k in range(self.out_dims):
                y, y_pred = Y[:, k], Y_pred[:, k]
                neg_grad = -1 * loss.grad(y, y_pred)

                # use MSE as the surrogate loss when fitting to negative gradients
                t = DecisionTree(classifier=False,
                                 max_depth=self.max_depth,
                                 criterion="mse")

                # fit current learner to negative gradients
                t.fit(X, neg_grad)
                self.learners[i, k] = t

                # compute step size and weight for the current learner
                step = 1.0
                h_pred = t.predict(X)
                if self.step_size == "adaptive":
                    step = loss.line_search(y, y_pred, h_pred)

                # update weights and our overall prediction for Y
                self.weights[i, k] *= step
                Y_pred[:, k] += self.weights[i, k] * h_pred
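For completeness, a predict sketch consistent with this fit (the actual method may differ) sums each learner's weighted output and, for classifiers, maps the one-hot scores back to labels:

def predict(self, X):
    # accumulate the weighted prediction of every learner, per output dim
    Y_pred = np.zeros((X.shape[0], self.out_dims))
    for i in range(self.n_iter):
        for k in range(self.out_dims):
            Y_pred[:, k] += self.weights[i, k] * self.learners[i, k].predict(X)
    # classifiers: argmax over the one-hot dimensions recovers the label
    return Y_pred.argmax(axis=1) if self.classifier else Y_pred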
Example #4
def fit(self, X, Y):
    """
    Create `n_trees`-worth of bootstrapped samples from the training data
    and use each to fit a separate decision tree.
    """
    self.trees = []
    for _ in range(self.n_trees):
        X_samp, Y_samp = bootstrap_sample(X, Y)
        tree = DecisionTree(
            n_feats=self.n_feats,
            max_depth=self.max_depth,
            criterion=self.criterion,
            classifier=self.classifier,
        )
        tree.fit(X_samp, Y_samp)
        self.trees.append(tree)
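bootstrap_sample is used but not defined here; the standard sketch (assumed, not the original code) samples N rows with replacement:

def bootstrap_sample(X, Y):
    # draw N examples with replacement to form one bootstrap replicate
    N = X.shape[0]
    idx = np.random.choice(N, N, replace=True)
    return X[idx], Y[idx]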
Example #5
def test_regressor():
    from sklearn.datasets import load_boston
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import mean_squared_error
    from sklearn.tree import DecisionTreeRegressor

    # NOTE: load_boston was removed in scikit-learn 1.2; this snippet
    # requires an older scikit-learn release
    boston = load_boston()
    X, Xtest, Y, Ytest = train_test_split(boston.data,
                                          boston.target,
                                          test_size=0.2,
                                          random_state=0)

    regressor = DecisionTreeRegressor(random_state=0)
    regressor.fit(X, Y)
    y_pred = regressor.predict(Xtest)
    err = mean_squared_error(Ytest, y_pred)
    print(err)  # 32.41637254901961

    from dt import DecisionTree
    mine = DecisionTree(criterion="mse", classifier=False)
    mine.fit(X, Y)
    y_pred = mine.predict(Xtest)
    err = mean_squared_error(Ytest, y_pred)
    print(err)  # 32.74450980392157
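On scikit-learn 1.2+, a minimal replacement sketch uses the California housing data instead (assuming the same scratch DecisionTree from dt):

def test_regressor_california():
    from sklearn.datasets import fetch_california_housing
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import mean_squared_error
    from dt import DecisionTree

    # fetch_california_housing downloads the dataset on first use
    housing = fetch_california_housing()
    X, Xtest, Y, Ytest = train_test_split(housing.data,
                                          housing.target,
                                          test_size=0.2,
                                          random_state=0)
    mine = DecisionTree(criterion="mse", classifier=False)
    mine.fit(X, Y)
    print(mean_squared_error(Ytest, mine.predict(Xtest)))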
Example #6
def models():  # models function
    iris = load_iris()  # load the sklearn iris data set
    feature = iris.data[:, :2]  # set the features of the data
    label = iris.target  # set the label as the target
    X_train, X_test, y_train, y_test = train_test_split(
        feature, label, random_state=42)  # split the data into train and test
    """
    ### Created Decision Tree Model ###
    """
    scratch_dt_model = DecisionTree(
        max_depth=2,  # create our decision tree model with params
        min_splits=10)
    scratch_dt_model.fit(X_train, y_train)  # fit the model
    scratch_dt_model_pred = scratch_dt_model.pred(
        X_test)  # create predictions from the model
    """
    ### Sklearn Decision Tree Model ###
    """
    sk_dt_model = DecisionTreeClassifier(
        max_depth=2,  # use the decision tree model from Sklearn with params
        min_samples_split=10)
    sk_dt_model.fit(X_train, y_train)  # fit the model
    sk_dt_model_pred = sk_dt_model.predict(
        X_test)  # create predictions from the model
    """
    ### Results ###
    """
    print("Scratch Model Accuracy : {0}".format(
        acc_score(scratch_dt_model_pred,
                  y_test)))  # print the scratch models accuracy score
    print("SK-Learn Model Accuracy : {0}".format(
        acc_score(sk_dt_model_pred,
                  y_test)))  # print the sklearn models accuracy score
    print(
        list(zip(scratch_dt_model_pred, sk_dt_model_pred, y_test))
    )  # print the scratch models prediction, sklearn models prediction, and the actual value
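The acc_score helper above is not shown; a minimal stand-in with the same call signature (a plain accuracy function; the original may differ) could be:

import numpy as np

def acc_score(y_pred, y_true):
    # fraction of predictions that match the true labels
    return np.mean(np.asarray(y_pred) == np.asarray(y_true))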
Example #7
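The remaining examples omit their imports; a plausible set (the scratch-model import paths are assumptions about the repo layout) is:

import numpy as np
from matplotlib import pyplot as plt
from sklearn.datasets import make_blobs, make_regression
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

# DecisionTree, RandomForest, and GradientBoostedDecisionTree are the
# scratch implementations shown in the earlier examples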
def ensemble_diff_plot():
    fig, axes = plt.subplots(3, 3)
    fig.set_size_inches(10, 10)
    for ax in axes.flatten():
        n_ex = 100
        n_trees = 50
        n_feats = np.random.randint(2, 100)
        max_depth_d = np.random.randint(1, 100)
        max_depth_r = np.random.randint(1, 10)

        classifier = np.random.choice([True, False])
        if classifier:
            # create classification problem
            n_classes = np.random.randint(2, 10)
            X, Y = make_blobs(n_samples=n_ex, centers=n_classes, n_features=2)
            X, X_test, Y, Y_test = train_test_split(X, Y, test_size=0.3)
            n_feats = min(n_feats, X.shape[1])

            # define the evaluation metric (accuracy for classification)
            def loss(yp, y):
                return accuracy_score(yp, y)

            # initialize the models
            criterion = np.random.choice(["entropy", "gini"])
            mine = RandomForest(
                classifier=classifier,
                n_feats=n_feats,
                n_trees=n_trees,
                criterion=criterion,
                max_depth=max_depth_r,
            )
            mine_d = DecisionTree(criterion=criterion,
                                  max_depth=max_depth_d,
                                  classifier=classifier)
            mine_g = GradientBoostedDecisionTree(
                n_iter=10,
                max_depth=max_depth_d,
                classifier=classifier,
                learning_rate=1,
                loss="crossentropy",
                step_size="constant",
                split_criterion=criterion,
            )

        else:
            # create regression problem
            X, Y = make_regression(n_samples=n_ex, n_features=1)
            X, X_test, Y, Y_test = train_test_split(X, Y, test_size=0.3)
            n_feats = min(n_feats, X.shape[1])

            # initialize model
            criterion = "mse"
            loss = mean_squared_error
            mine = RandomForest(
                criterion=criterion,
                n_feats=n_feats,
                n_trees=n_trees,
                max_depth=max_depth_r,
                classifier=classifier,
            )
            mine_d = DecisionTree(criterion=criterion,
                                  max_depth=max_depth_d,
                                  classifier=classifier)
            mine_g = GradientBoostedDecisionTree(
                # n_trees=n_trees,
                n_iter=10,
                max_depth=max_depth_d,
                classifier=classifier,
                learning_rate=1,
                loss="mse",
                step_size="adaptive",
                split_criterion=criterion,
            )

        # fit 'em
        mine.fit(X, Y)
        mine_d.fit(X, Y)
        mine_g.fit(X, Y)

        # get preds on test set
        y_pred_mine_test = mine.predict(X_test)
        y_pred_mine_test_d = mine_d.predict(X_test)
        y_pred_mine_test_g = mine_g.predict(X_test)

        loss_mine_test = loss(y_pred_mine_test, Y_test)
        loss_mine_test_d = loss(y_pred_mine_test_d, Y_test)
        loss_mine_test_g = loss(y_pred_mine_test_g, Y_test)

        if classifier:
            entries = [("RF", loss_mine_test, y_pred_mine_test),
                       ("DT", loss_mine_test_d, y_pred_mine_test_d),
                       ("GB", loss_mine_test_g, y_pred_mine_test_g)]
            (lbl, test_loss, preds) = entries[np.random.randint(3)]
            ax.set_title("{} Accuracy: {:.2f}%".format(lbl, test_loss * 100))
            for i in np.unique(Y_test):
                ax.scatter(
                    X_test[preds == i, 0].flatten(),
                    X_test[preds == i, 1].flatten(),
                    #  s=0.5,
                )
        else:
            X_ax = np.linspace(
                np.min(X_test.flatten()) - 1,
                np.max(X_test.flatten()) + 1, 100).reshape(-1, 1)
            y_pred_mine_test = mine.predict(X_ax)
            y_pred_mine_test_d = mine_d.predict(X_ax)
            y_pred_mine_test_g = mine_g.predict(X_ax)

            ax.scatter(X_test.flatten(), Y_test.flatten(), c="b", alpha=0.5)
            ax.plot(
                X_ax.flatten(),
                y_pred_mine_test_g.flatten(),
                label="GB",
                color="red",
            )
            ax.plot(
                X_ax.flatten(),
                y_pred_mine_test.flatten(),
                label="RF",
                color="cornflowerblue",
            )
            ax.plot(
                X_ax.flatten(),
                y_pred_mine_test_d.flatten(),
                label="DT",
                color="yellowgreen",
            )
            ax.set_title("GB: {:.1f} / RF: {:.1f} / DT: {:.1f} ".format(
                loss_mine_test_g, loss_mine_test, loss_mine_test_d))
            ax.legend()
        ax.xaxis.set_ticklabels([])
        ax.yaxis.set_ticklabels([])
    # plt.savefig("plot.png", dpi=300)
    plt.show()
    plt.close("all")
Example #8
def dt_plot():
    fig, axes = plt.subplots(2, 2)
    # fig.set_size_inches(10,10)
    for ax in axes.flatten():
        n_ex = 100
        n_trees = 50  # carried over from the random-forest variant; unused here
        n_feats = np.random.randint(2, 100)
        max_depth_d = np.random.randint(1, 100)
        max_depth_r = np.random.randint(1, 10)  # carried over; unused here

        classifier = np.random.choice([True])  # add False to re-enable the regression branch
        # generate samples for the chosen problem type
        # (classification -> discrete labels, regression -> continuous targets)
        if classifier:
            # create classification problem
            n_classes = np.random.randint(2, 10)
            X, Y = make_blobs(n_samples=n_ex, centers=n_classes, n_features=2)
            X, X_test, Y, Y_test = train_test_split(X, Y, test_size=0.3)
            n_feats = min(n_feats, X.shape[1])

            # define the evaluation metric (accuracy for classification)
            def loss(yp, y):
                return accuracy_score(yp, y)

            # initialize the model
            criterion = np.random.choice(["entropy", "gini"])
            mine_d = DecisionTree(criterion=criterion,
                                  max_depth=max_depth_d,
                                  classifier=classifier)
        else:
            # create regression problem
            X, Y = make_regression(n_samples=n_ex, n_features=1)
            X, X_test, Y, Y_test = train_test_split(X, Y, test_size=0.3)
            n_feats = min(n_feats, X.shape[1])

            # initialize model
            criterion = "mse"
            loss = mean_squared_error
            mine_d = DecisionTree(criterion=criterion,
                                  max_depth=max_depth_d,
                                  classifier=classifier)

        # fit 'em
        mine_d.fit(X, Y)
        # get preds on test set
        y_pred_mine_test_d = mine_d.predict(X_test)

        loss_mine_test_d = loss(y_pred_mine_test_d, Y_test)

        if classifier:
            lbl, test_loss, preds = "DT", loss_mine_test_d, y_pred_mine_test_d
            ax.set_title("{} Accuracy: {:.2f}%".format(lbl, test_loss * 100))
            for i in np.unique(Y_test):
                ax.scatter(
                    X_test[preds == i, 0].flatten(),
                    X_test[preds == i, 1].flatten(),
                    # s=0.5,
                )
        else:
            X_ax = np.linspace(
                np.min(X_test.flatten()) - 1,
                np.max(X_test.flatten()) + 1, 100).reshape(-1, 1)
            y_pred_mine_test_d = mine_d.predict(X_ax)

            ax.scatter(X_test.flatten(), Y_test.flatten(), c="b", alpha=0.5)
            ax.plot(
                X_ax.flatten(),
                y_pred_mine_test_d.flatten(),
                label="DT",
                color="yellowgreen",
            )
            ax.set_title("DT: {:.1f} ".format(loss_mine_test_d))
            ax.legend()
        ax.xaxis.set_ticklabels([])
        ax.yaxis.set_ticklabels([])
    # plt.savefig("plot.png", dpi=300)
    plt.show()
Example #9
def test_DecisionTree():
    i = 1
    np.random.seed(12345)
    while True:  # fuzz loop: compare against sklearn until interrupted
        n_ex = np.random.randint(2, 100)
        n_feats = np.random.randint(2, 100)
        max_depth = np.random.randint(1, 5)

        classifier = np.random.choice([True, False])
        if classifier:
            # create classification problem
            n_classes = np.random.randint(2, 10)
            X, Y = make_blobs(
                n_samples=n_ex, centers=n_classes, n_features=n_feats, random_state=i
            )
            X, X_test, Y, Y_test = train_test_split(X, Y, test_size=0.3, random_state=i)

            # initialize model
            def loss(yp, y):
                return 1 - accuracy_score(yp, y)

            criterion = np.random.choice(["entropy", "gini"])
            mine = DecisionTree(
                classifier=classifier, max_depth=max_depth, criterion=criterion
            )
            gold = DecisionTreeClassifier(
                criterion=criterion,
                max_depth=max_depth,
                splitter="best",
                random_state=i,
            )
        else:
            # create regression problem
            X, Y = make_regression(n_samples=n_ex, n_features=n_feats, random_state=i)
            X, X_test, Y, Y_test = train_test_split(X, Y, test_size=0.3, random_state=i)

            # initialize model
            criterion = "mse"
            loss = mean_squared_error
            mine = DecisionTree(
                criterion=criterion, max_depth=max_depth, classifier=classifier
            )
            gold = DecisionTreeRegressor(
                criterion=criterion, max_depth=max_depth, splitter="best"
            )

        print("Trial {}".format(i))
        print("\tClassifier={}, criterion={}".format(classifier, criterion))
        print("\tmax_depth={}, n_feats={}, n_ex={}".format(max_depth, n_feats, n_ex))
        if classifier:
            print("\tn_classes: {}".format(n_classes))

        # fit 'em
        mine.fit(X, Y)
        gold.fit(X, Y)

        # get preds on training set
        y_pred_mine = mine.predict(X)
        y_pred_gold = gold.predict(X)

        loss_mine = loss(y_pred_mine, Y)
        loss_gold = loss(y_pred_gold, Y)

        # get preds on test set
        y_pred_mine_test = mine.predict(X_test)
        y_pred_gold_test = gold.predict(X_test)

        loss_mine_test = loss(y_pred_mine_test, Y_test)
        loss_gold_test = loss(y_pred_gold_test, Y_test)

        try:
            np.testing.assert_almost_equal(loss_mine, loss_gold)
            print("\tLoss on training: {}".format(loss_mine))
        except AssertionError as e:
            print("\tTraining losses not equal:\n{}".format(e))

        try:
            np.testing.assert_almost_equal(loss_mine_test, loss_gold_test)
            print("\tLoss on test: {}".format(loss_mine_test))
        except AssertionError as e:
            print("\tTest losses not equal:\n{}".format(e))
        i += 1

def accuracy(y_true, y_pred):
    # fraction of predictions that match the true labels
    return np.sum(y_true == y_pred) / len(y_true)

import pickle

import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
# DecisionTree is the scratch implementation from the preceding examples

df = pd.read_csv('Training.csv')

X = df.iloc[:, 0:132].values
y = df.iloc[:, -1].values
labelencoder_Y = LabelEncoder()
y = labelencoder_Y.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)

clf = DecisionTree(max_depth=10)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
acc = accuracy(y_test, y_pred)

print("Accuracy:", acc * 100, "%")


def saveModel():
    # persist the trained tree with pickle
    with open("DTModel", "wb") as f:
        pickle.dump(clf, f)


saveModel()
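A matching loader sketch (assuming the DTModel file written by saveModel above) simply unpickles the tree:

def loadModel(path="DTModel"):
    # restore a tree previously saved with saveModel()
    with open(path, "rb") as f:
        return pickle.load(f)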

# header = ['itching','skin_rash','nodal_skin_eruptions','continuous_sneezing','shivering','chills','joint_pain',
#  'stomach_pain','acidity','ulcers_on_tongue','muscle_wasting','vomiting','burning_micturition',