Example no. 1
 def bootstrap(self, train_data, train_label):
     # Draw a bootstrap sample: n indices sampled with replacement
     index = np.random.randint(0, len(train_data), len(train_data))
     x = train_data[index]
     y = train_label[index]
     # Fit one decision tree on the resampled data
     clf = DecisionTreeClassifier(t=self.alpha)
     clf.train(x, y)
     return clf
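
A minimal sketch of how an ensemble class might drive a helper like the one above: train n_trees bootstrapped trees and keep them for later voting. The class name, constructor arguments, and the assumption that bootstrap() belongs to this same class are all illustrative, not taken from the snippet.

class BaggedTrees:
    # Hypothetical container for the bootstrap() method shown above.
    def __init__(self, n_trees=10, alpha=0.1):
        self.n_trees = n_trees
        self.alpha = alpha
        self.trees = []

    def fit(self, train_data, train_label):
        # One bootstrap resample and one fitted tree per ensemble member
        self.trees = [self.bootstrap(train_data, train_label)
                      for _ in range(self.n_trees)]
        return self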
Example no. 2
def question4():
    """
        This function is used for question4.
        The function will find the best M value for early pruning (ID3 with M value) using K-Cross-Validation.
        Then, this function will fit a DecisionTreeClassifier (with ID3 and the best M value found an) on the entire
        train-set and print its loss on the test-set ("10 times loss").
        """
    M_values = [1, 3, 6, 8, 10, 15, 20, 25, 30, 40, 50, 75, 100, 125, 150]
    X_train, y_train = get_dataset(data_set=DataSet.TRAIN_SET)

    # Find best M using new loss function
    best_M = find_best_M(M_values, ten_times_penalty, minimize=True)

    # Train model on entire train-set using the best M found
    dt = DecisionTreeClassifier().use_alg(ID3_with_early_pruning,
                                          extra_args={
                                              "M": best_M
                                          }).fit(X_train, y_train)

    # Test model
    X_test, y_test = get_dataset(DataSet.TEST_SET)
    y_hat = dt.predict(X_test)

    # Print Results
    print(ten_times_penalty(y_hat, y_test))
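
The ten_times_penalty evaluation function is referenced but not shown. A plausible minimal sketch, assuming the "10 times loss" weights false negatives ten times as heavily as false positives (an assumption, not confirmed by the snippet) and that the first argument is the prediction vector, as in the call above:

import numpy as np

def ten_times_penalty(y_hat, y_true, positive_label=1):
    # Assumed definition: false negatives cost 10, false positives cost 1,
    # normalized by the number of examples. positive_label is hypothetical.
    y_hat, y_true = np.asarray(y_hat), np.asarray(y_true)
    fp = np.sum((y_hat == positive_label) & (y_true != positive_label))
    fn = np.sum((y_hat != positive_label) & (y_true == positive_label))
    return (fp + 10 * fn) / len(y_true)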
Example no. 3
 def fit(self, X, y):
     self.classifiers_ = []
     self.alphas_ = np.zeros(self.n_learners)
     self.n_outputs_ = np.unique(y)  # unique class labels seen during training
     X_ = self.__get_values(X)
     y_ = self.__get_values(y)
     # Start with uniform sample weights
     weights = np.full(len(X_), 1 / len(X_))
     for i in range(self.n_learners):
         model = DecisionTreeClassifier(
             self.tol,
             self.max_depth,
             self.min_members,
             self.criterion,
             self.split_method,
             self.max_features
         )
         model.fit(X_, y_, weights)
         self.classifiers_.append(model)
         y_pred = model.predict(X_)
         wrong_pred = y_ != y_pred
         # Weighted error of this learner under the current sample weights
         weighted_error = np.sum(weights[wrong_pred]) / np.sum(weights)
         # Learner weight; the epsilon guards against division by zero
         alpha = np.log((1 - weighted_error) / (weighted_error + 1e-8))
         # Up-weight the misclassified samples and renormalize
         weights[wrong_pred] *= np.exp(alpha)
         weights /= np.sum(weights)
         self.alphas_[i] = alpha
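
The matching prediction step is not shown. A hedged sketch of how it could aggregate the learners with their alphas_ (alpha-weighted voting); it is meant to live inside the same class and assumes each learner's predict returns one label per row:

import numpy as np

def predict(self, X):
    # Hypothetical companion to the fit() above: alpha-weighted majority vote.
    X_ = self.__get_values(X)
    preds = np.array([clf.predict(X_) for clf in self.classifiers_])  # (n_learners, n_samples)
    classes = np.unique(preds)
    scores = np.zeros((len(classes), preds.shape[1]))
    for alpha, row in zip(self.alphas_, preds):
        for ci, c in enumerate(classes):
            # Each learner adds its alpha to the class it voted for
            scores[ci] += alpha * (row == c)
    return classes[np.argmax(scores, axis=0)]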
Example no. 4
 def train_tree(self, data):
     """
     Trains a single tree and returns it.
     :param data: pair whose first element is the tree's index and whose second element is the training dataset.
     :return: fitted DecisionTreeClassifier object.
     """
     tree = DecisionTreeClassifier(max_depth=self.max_depth)
     tree.fit(data[1])
     return tree
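
The (index, dataset) shape of the data argument suggests train_tree is meant to be mapped over pre-built bootstrap samples, for example via a process pool. An illustrative sketch under that assumption; the n_estimators attribute and bootstrap_sample helper are hypothetical names, not from the snippet:

from multiprocessing import Pool

def fit(self, X, y):
    # Hypothetical driver: one (index, bootstrap sample) pair per tree,
    # handed to the train_tree() method above via a worker pool.
    jobs = [(i, self.bootstrap_sample(X, y)) for i in range(self.n_estimators)]
    with Pool() as pool:
        self.trees = pool.map(self.train_tree, jobs)
    return self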
Example no. 5
    def fit(self, X, y):
        self.forest = []
        n_samples = len(y)
        # Size of each tree's training subset, as a fraction of the full set
        n_sub_samples = int(round(n_samples * self.bootstrap))
        for i in range(self.n_estimators):
            # Reshuffle and take the first n_sub_samples rows for this tree
            X, y = shuffle_in_unison(X, y)
            X_subset = [X[j] for j in range(n_sub_samples)]
            y_subset = [y[j] for j in range(n_sub_samples)]

            tree = DecisionTreeClassifier(self.max_features, self.max_depth,
                                          self.min_samples_split)
            tree.fit(X_subset, y_subset)
            self.forest.append(tree)
Example no. 6
    def fit(self, X, y):
        self.forest = []
        n_samples = len(y)
        n_sub_samples = int(round(n_samples * self.bootstrap))
        for i in range(self.n_estimators):
            X, y = shuffle_in_unison(X, y)
            X_subset = [X[i] for i in range(n_sub_samples)]
            y_subset = [y[i] for i in range(n_sub_samples)]

            tree = DecisionTreeClassifier(self.max_features, self.max_depth,
                                          self.min_samples_split)
            tree.fit(X_subset, y_subset)
            self.forest.append(tree)
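
Neither of the two fit() variants above shows prediction. A hedged sketch of the usual companion method for such a forest, assuming each tree exposes a predict method that accepts a list of samples (an assumption, not the repositories' actual code):

from collections import Counter

def predict(self, X):
    # Hypothetical companion to the fit() methods above: plain majority vote.
    predictions = []
    for sample in X:
        votes = [tree.predict([sample])[0] for tree in self.forest]
        predictions.append(Counter(votes).most_common(1)[0][0])
    return predictions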
Example no. 7
def question1() -> None:
    """
    This function is used for question 1.
    Running this function fits a DecisionTreeClassifier with the ID3 algorithm and prints its accuracy on the test set.
    """
    # Train model
    X_train, y_train = get_dataset(data_set=DataSet.TRAIN_SET)
    dt = DecisionTreeClassifier().use_alg(ID3).fit(X_train, y_train)

    # Test model
    X_test, y_test = get_dataset(DataSet.TEST_SET)
    y_hat = dt.predict(X_test)

    # Print Results
    print(classification_rate(y_hat, y_test))
Example no. 8
 def fit(self, X, y):
     self.classifiers_ = []
     X_ = self.__get_values(X)
     y_ = self.__get_values(y)
     for _ in range(self.n_trees):
         # Bootstrap sample of row indices for this tree
         sample = self.__get_sample(X.shape[0])
         model = DecisionTreeClassifier(
             self.tol, 
             self.max_depth, 
             self.min_members, 
             self.criterion, 
             self.split_method, 
             self.max_features
         )
         model.fit(X_[sample], y_[sample])
         self.classifiers_.append(model)
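
The private __get_sample helper is not shown. A minimal sketch of its assumed behavior, drawing a bootstrap sample of row indices (with replacement) of the same size as the data; this is an assumption and the method would live inside the same class:

import numpy as np

def __get_sample(self, n_rows):
    # Assumed behavior: sample n_rows row indices uniformly, with replacement
    return np.random.randint(0, n_rows, size=n_rows)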
Example no. 9
    def fit(self, X, y):
        if len(X) != len(y):
            raise ValueError("Dimension error: len(X) != len(y).")
        if len(y) == 0:
            raise ValueError("Cannot fit model without examples!")

        # Build trees
        centroids = []
        trees = []
        stds = []
        for tree_id in range(self.N):
            indices = sample_without_replacement(
                n_population=len(X),
                n_samples=int(self.p * len(X)),
                method="reservoir_sampling",
                random_state=ID + KNNForestClassifier.n_classifiers +
                tree_id)  # Choose train indices
            mean = X[indices].mean(axis=0)
            std = np.sqrt(np.mean(((X[indices] - mean)**2), axis=0))
            trees.append(DecisionTreeClassifier().use_alg(ID3).fit(
                std_normalization(X[indices], mean, std), y[indices]))
            centroids.append(mean)
            stds.append(std)

        self.centroids = np.array(centroids)
        self._stds = np.array(stds)
        self.trees = trees
        return self
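
Prediction for this centroid-based forest is not shown. A hedged sketch of one plausible scheme: normalize the query with each candidate tree's statistics, pick the k trees whose centroids are closest to the query, and take a majority vote. The value of k, the Euclidean metric, and the trees' predict behavior are all assumptions:

import numpy as np
from collections import Counter

def predict(self, X, k=7):
    # Hypothetical companion to fit(): the k nearest centroids vote per sample.
    predictions = []
    for x in X:
        distances = np.linalg.norm(self.centroids - x, axis=1)
        nearest = np.argsort(distances)[:k]
        votes = [
            self.trees[i].predict(
                std_normalization(x.reshape(1, -1), self.centroids[i], self._stds[i]))[0]
            for i in nearest
        ]
        predictions.append(Counter(votes).most_common(1)[0][0])
    return np.array(predictions)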
Example no. 10
def find_best_M(M_values: Sequence[int],
                evaluate_fn: callable,
                minimize: bool = False,
                return_score: bool = False):
    print(f"Searching for best params for ID3.")
    print(f"Looking for best M value.")
    print(f"Trying the following values: {M_values}.")
    X_train, y_train = get_dataset(data_set=DataSet.TRAIN_SET)
    avg_scores = []

    for M in M_values:
        print(f"\nRunning cross-validation for param M = {M}")
        model = DecisionTreeClassifier().use_alg(ID3_with_early_pruning,
                                                 extra_args={"M": M})
        avg_score = k_cross_validation(model, X_train, y_train, evaluate_fn)
        avg_scores.append(avg_score)
        print(f"Average validation score for this M: {avg_score}\n")

    scores = np.array(avg_scores)
    best_idx = int(np.argmin(scores)) if minimize else int(np.argmax(scores))
    best_M, best_score = M_values[best_idx], avg_scores[best_idx]

    print(f"Best M found: M = {best_M}")
    print(f"Average score for best M is: {best_score}")

    if return_score:
        return best_M, avg_scores
    return best_M
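
k_cross_validation is referenced above but not shown. A minimal sketch of the assumed behavior: split the training data into K folds, fit on K-1 folds, score the held-out fold with evaluate_fn, and average. K=5, the seed, and the assumption that X and y are NumPy arrays and that the model can be refit are all illustrative choices:

import numpy as np

def k_cross_validation(model, X, y, evaluate_fn, k=5, seed=0):
    # Assumed signature, matching the call in find_best_M above.
    rng = np.random.default_rng(seed)
    indices = rng.permutation(len(X))
    folds = np.array_split(indices, k)
    scores = []
    for i in range(k):
        val_idx = folds[i]
        train_idx = np.concatenate([folds[j] for j in range(k) if j != i])
        model.fit(X[train_idx], y[train_idx])
        y_hat = model.predict(X[val_idx])
        scores.append(evaluate_fn(y_hat, y[val_idx]))
    return float(np.mean(scores))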
Example no. 11
def experiment():
    """ It is best to use `python ID3.py -h` to get all necessary information, but if not possible see documentation on
    question3() function in this file. """
    M_values = [1, 3, 6, 8, 10, 15, 20, 25, 30, 40, 50, 75, 100, 125, 150]
    X_train, y_train = get_dataset(data_set=DataSet.TRAIN_SET)

    best_M, avg_accuracies = find_best_M(M_values,
                                         classification_rate,
                                         minimize=False,
                                         return_score=True)
    plot_graph(M_values, avg_accuracies, "Average Accuracy per M", "M",
               "Average Accuracy")

    # Train the model on the entire train-set
    dt = DecisionTreeClassifier().use_alg(ID3_with_early_pruning,
                                          extra_args={
                                              "M": best_M
                                          }).fit(X_train, y_train)

    # Predict and evaluate
    X_test, y_test = get_dataset(DataSet.TEST_SET)
    y_hat = dt.predict(X_test)
    print(classification_rate(y_hat, y_test))
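
plot_graph is referenced above but not shown. A minimal matplotlib sketch under the assumed signature (x values, y values, title, x label, y label):

import matplotlib.pyplot as plt

def plot_graph(x_values, y_values, title, x_label, y_label):
    # Assumed helper: simple line plot of the cross-validation results.
    plt.figure()
    plt.plot(x_values, y_values, marker="o")
    plt.title(title)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.grid(True)
    plt.show()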
Example no. 12
def main():
    # loading the wine_dataset
    wine_ds = dt.load_wine()
    # iris_ds = dt.load_iris()
    X = wine_ds.data
    Y = wine_ds.target
    # X = iris_ds.data
    # Y = iris_ds.target
    # X.head(5),Y.head(5)
    
    # X = np.array([[0,1,0],
    #               [0,0,0],
    #               [0,0,1],
    #               [1,1,0],
    #               [1,1,1],
    #               [1,0,1],
    #               [1,0,0],
    #               [2,1,0],
    #               [2,0,1],
    #               [2,0,0]])
    # Y = np.array([0,
    #               0,
    #               0,
    #               0,
    #               0,
    #               1,
    #               1,
    #               1,
    #               1,
    #               1])
    
    # Discretize every feature column of X from continuous values
    # into class labels
    for i in range(X.shape[1]):
        X[:, i] = np.array(get_labelled_data(X[:, i]))
    # Now we divide it in training and testing data
    x_train, x_test, y_train, y_test = train_test_split(X, Y, shuffle=True, test_size=0.30, random_state=3)

    # x_train, x_test, y_train, y_test = X, X, Y, Y
    
    # Now feed the training data to the decision tree classifier
    clf1 = DecisionTreeClassifier()
    clf1.fit(x_train, y_train)
    ypred = clf1.predict(x_test)
    print(f"Score is : {clf1.score(ypred, y_test)}")
    get_tree_pdf("wine_ds_n.pdf", clf1.get_root())
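
get_labelled_data is used above to turn each continuous feature column into discrete classes, but it is not shown. A plausible sketch, assuming quantile binning into a small fixed number of classes (the bin count is an assumption):

import numpy as np

def get_labelled_data(column, n_bins=3):
    # Assumed behavior: quantile-bin a continuous column into integer class labels.
    column = np.asarray(column, dtype=float)
    edges = np.quantile(column, np.linspace(0, 1, n_bins + 1)[1:-1])
    return np.digitize(column, edges).tolist()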
Example no. 13
from DecisionTree import DecisionTreeClassifier
from sklearn.datasets import load_iris, load_wine
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn import tree
import matplotlib.pyplot as plt
import seaborn as sns

# Test w/ Iris dataset using my class
dataset = load_iris()
X, y = dataset.data, dataset.target
clf_iris = DecisionTreeClassifier(max_depth=5)
# Replace the integer targets with string labels to exercise the ordinal encoding below
y = ["one" if val == 1 or val == 2 else "zero" for val in y]
y = np.array(y)
# Need to ordinally encode strings to integers
if "int" not in str(y.dtype):
    # Reshape y array so it works w/ ordinal encoder
    y = y.reshape(-1, 1)
    encoder = OrdinalEncoder()
    y = encoder.fit_transform(y)
y = y.astype(int)
y = y.reshape(-1)

clf_iris.fit(X, y)
temp = np.array([[3, 2, 1, .5]])
print("My Iris DT:")
clf_iris.print_tree()
print("------------------------------------------------------")
Example no. 14
 def train_tree(self, data):
     logging.info("Training tree {}".format(data[0] + 1))
     tree = DecisionTreeClassifier(max_depth=self.max_depth)
     tree.fit(data[1])
     return tree
Example no. 15
 def train_tree(self, data):
     tree = DecisionTreeClassifier(max_depth=self.max_depth)
     # Simple progress counter printed as each tree is trained
     self.count += 1
     print(self.count)
     tree.fit(data)
     return tree
Example no. 16
 def train_tree(self, data):
     if (data[0] + 1) % 25 == 0:
         print("Training Tree {}".format(data[0] + 1))
     tree = DecisionTreeClassifier(max_depth=self.max_depth)
     tree.fit(data[1])
     return tree