def bootstrap(self, train_data, train_label):
    # Draw a bootstrap sample (len(train_data) indices, with replacement)
    # and fit a single tree on it.
    index = np.random.randint(0, len(train_data), len(train_data))
    x = train_data[index]
    y = train_label[index]
    clf = DecisionTreeClassifier(t=self.alpha)
    clf.train(x, y)
    return clf
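# A minimal usage sketch for the method above: a bagging-style ensemble that
# calls bootstrap() once per learner and predicts by majority vote. The names
# self.n_trees, self.trees, and the per-tree predict interface are
# assumptions, not part of the original snippet.
def fit_ensemble(self, train_data, train_label):
    self.trees = [self.bootstrap(train_data, train_label)
                  for _ in range(self.n_trees)]

def predict_ensemble(self, test_data):
    votes = np.array([t.predict(test_data) for t in self.trees])
    # Majority vote per test sample (assumes integer class labels).
    return np.array([np.bincount(col).argmax() for col in votes.T.astype(int)])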
def question4():
    """
    This function is used for question4.

    The function will find the best M value for early pruning (ID3 with an M
    value) using K-cross-validation. Then, this function will fit a
    DecisionTreeClassifier (with ID3 and the best M value found) on the
    entire train-set and print its loss on the test-set ("10 times loss").
    """
    M_values = [1, 3, 6, 8, 10, 15, 20, 25, 30, 40, 50, 75, 100, 125, 150]
    X_train, y_train = get_dataset(data_set=DataSet.TRAIN_SET)

    # Find the best M using the penalized loss function
    best_M = find_best_M(M_values, ten_times_penalty, minimize=True)

    # Train the model on the entire train-set using the best M found
    dt = DecisionTreeClassifier().use_alg(ID3_with_early_pruning,
                                          extra_args={"M": best_M}).fit(X_train, y_train)

    # Test the model
    X_test, y_test = get_dataset(DataSet.TEST_SET)
    y_hat = dt.predict(X_test)

    # Print results
    print(ten_times_penalty(y_hat, y_test))
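# ten_times_penalty is not shown in this section. A plausible sketch, assuming
# the "10 times loss" weights false negatives ten times heavier than false
# positives; the exact weights and the 0/1 label encoding are assumptions:
def ten_times_penalty(y_hat, y_true, fn_weight=10, fp_weight=1):
    y_hat, y_true = np.asarray(y_hat), np.asarray(y_true)
    false_negatives = np.sum((y_hat != y_true) & (y_true == 1))
    false_positives = np.sum((y_hat != y_true) & (y_true == 0))
    return (fn_weight * false_negatives + fp_weight * false_positives) / len(y_true)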
def fit(self, X, y):
    self.classifiers_ = []
    self.alphas_ = np.zeros(self.n_learners)
    self.n_outputs_ = np.unique(y)
    X_ = self.__get_values(X)
    y_ = self.__get_values(y)

    # Start with uniform sample weights.
    weights = np.full(len(X_), 1 / len(X_))
    for i in range(self.n_learners):
        model = DecisionTreeClassifier(self.tol, self.max_depth,
                                       self.min_members, self.criterion,
                                       self.split_method, self.max_features)
        model.fit(X_, y_, weights)
        self.classifiers_.append(model)

        # Weighted error of this learner on the training set.
        y_pred = model.predict(X_)
        wrong_pred = y_ != y_pred
        weighted_error = np.sum(weights[wrong_pred]) / np.sum(weights)

        # Learner weight; the epsilon in the denominator guards against
        # division by zero when the learner is perfect on the weighted sample.
        alpha = np.log((1 - weighted_error) / (weighted_error + 1e-8))

        # Up-weight misclassified samples and renormalize.
        weights[wrong_pred] *= np.exp(alpha)
        weights /= np.sum(weights)
        self.alphas_[i] = alpha
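# A matching predict is not shown. A minimal sketch of weighted voting
# consistent with the fit above; it is meant to live inside the same class so
# the name-mangled __get_values helper resolves, and everything beyond the
# attributes set in fit is an assumption:
def predict(self, X):
    X_ = self.__get_values(X)
    classes = self.n_outputs_  # unique labels recorded during fit
    # Accumulate each learner's alpha on the class it votes for.
    scores = np.zeros((len(X_), len(classes)))
    for alpha, clf in zip(self.alphas_, self.classifiers_):
        votes = clf.predict(X_)
        for j, c in enumerate(classes):
            scores[votes == c, j] += alpha
    return classes[np.argmax(scores, axis=1)]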
def train_tree(self, data): """ Trains a singular tree and returns that tree. :param data: list representing the tree index being trained and the dataset being used for training. :return: DecisionNodeClassifier object. """ tree = DecisionTreeClassifier(max_depth=self.max_depth) tree.fit(data[1]) return tree
def fit(self, X, y):
    self.forest = []
    n_samples = len(y)
    # self.bootstrap is the fraction of samples drawn for each tree.
    n_sub_samples = int(round(n_samples * self.bootstrap))

    for _ in range(self.n_estimators):
        # Reshuffle, then take the first n_sub_samples rows as this tree's subset.
        X, y = shuffle_in_unison(X, y)
        X_subset = [X[i] for i in range(n_sub_samples)]
        y_subset = [y[i] for i in range(n_sub_samples)]

        tree = DecisionTreeClassifier(self.max_features, self.max_depth,
                                      self.min_samples_split)
        tree.fit(X_subset, y_subset)
        self.forest.append(tree)
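# shuffle_in_unison is not defined in this section. A minimal sketch, assuming
# it applies one random permutation to both arrays so rows stay paired:
import numpy as np

def shuffle_in_unison(X, y):
    X, y = np.asarray(X), np.asarray(y)
    permutation = np.random.permutation(len(y))
    return X[permutation], y[permutation]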
def question1() -> None:
    """
    This function is used for question1.

    Running this function will fit a DecisionTreeClassifier with the ID3
    algorithm and print its accuracy on the test-set.
    """
    # Train model
    X_train, y_train = get_dataset(data_set=DataSet.TRAIN_SET)
    dt = DecisionTreeClassifier().use_alg(ID3).fit(X_train, y_train)

    # Test model
    X_test, y_test = get_dataset(DataSet.TEST_SET)
    y_hat = dt.predict(X_test)

    # Print results
    print(classification_rate(y_hat, y_test))
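# classification_rate is not shown here. A one-line sketch, assuming it is
# plain accuracy (the fraction of matching predictions):
def classification_rate(y_hat, y_true):
    return np.mean(np.asarray(y_hat) == np.asarray(y_true))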
def fit(self, X, y):
    self.classifiers_ = []
    X_ = self.__get_values(X)
    y_ = self.__get_values(y)

    for _ in range(self.n_trees):
        # Each tree is trained on its own random sample of rows.
        sample = self.__get_sample(X.shape[0])
        model = DecisionTreeClassifier(self.tol, self.max_depth,
                                       self.min_members, self.criterion,
                                       self.split_method, self.max_features)
        model.fit(X_[sample], y_[sample])
        self.classifiers_.append(model)
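# __get_sample is not shown. A minimal sketch, assuming it draws a bootstrap
# sample of row indices (with replacement) of the same size as the dataset:
def __get_sample(self, n_rows):
    return np.random.randint(0, n_rows, size=n_rows)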
def fit(self, X, y):
    if len(X) != len(y):
        raise ValueError("Dimension error! len(X) != len(y).")
    if len(y) <= 0:
        raise ValueError("Cannot fit model without examples!")

    # Build the trees, remembering each tree's centroid and std so that
    # test samples can later be normalized the same way.
    centroids = []
    trees = []
    stds = []
    for tree_id in range(self.N):
        # Choose train indices for this tree (a p-fraction, without replacement)
        indices = sample_without_replacement(
            n_population=len(X),
            n_samples=int(self.p * len(X)),
            method="reservoir_sampling",
            random_state=ID + KNNForestClassifier.n_classifiers + tree_id)
        mean = X[indices].mean(axis=0)
        std = np.sqrt(np.mean((X[indices] - mean) ** 2, axis=0))
        trees.append(DecisionTreeClassifier().use_alg(ID3).fit(
            std_normalization(X[indices], mean, std), y[indices]))
        centroids.append(mean)
        stds.append(std)

    self.centroids = np.array(centroids)
    self._stds = np.array(stds)
    self.trees = trees
    return self
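# std_normalization is not defined in this section. A minimal sketch, assuming
# standard z-score normalization with the given per-feature mean and std; the
# epsilon guarding zero-variance features is an addition:
def std_normalization(X, mean, std):
    return (X - mean) / (std + 1e-12)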
def find_best_M(M_values: Sequence[int], evaluate_fn: callable,
                minimize: bool = False, return_score: bool = False):
    print("Searching for the best params for ID3.")
    print("Looking for the best M value.")
    print(f"Trying the following values: {M_values}.")

    X_train, y_train = get_dataset(data_set=DataSet.TRAIN_SET)
    avg_scores = []
    for M in M_values:
        print(f"\nRunning cross-validation for param M = {M}")
        model = DecisionTreeClassifier().use_alg(ID3_with_early_pruning,
                                                 extra_args={"M": M})
        avg_score = k_cross_validation(model, X_train, y_train, evaluate_fn)
        avg_scores.append(avg_score)
        print(f"Average validation score for this M: {avg_score}\n")

    if minimize:
        best_M = M_values[int(np.argmin(np.array(avg_scores)))]
        best_score = np.min(avg_scores)
    else:
        best_M = M_values[int(np.argmax(np.array(avg_scores)))]
        best_score = np.max(avg_scores)

    print(f"Best M found: M = {best_M}")
    print(f"Average score for best M: {best_score}")
    if return_score:
        return best_M, avg_scores
    return best_M
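# k_cross_validation is not shown in this section. A minimal sketch, assuming
# a standard K-fold loop that returns the mean validation score; the fold
# count K=5 and the model's refit behavior are assumptions:
def k_cross_validation(model, X, y, evaluate_fn, K=5):
    X, y = np.asarray(X), np.asarray(y)
    folds = np.array_split(np.random.permutation(len(y)), K)
    scores = []
    for k in range(K):
        val_idx = folds[k]
        train_idx = np.concatenate([folds[j] for j in range(K) if j != k])
        model.fit(X[train_idx], y[train_idx])
        scores.append(evaluate_fn(model.predict(X[val_idx]), y[val_idx]))
    return float(np.mean(scores))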
def experiment():
    """
    It is best to use `python ID3.py -h` to get all the necessary information,
    but if that is not possible, see the documentation of the question3()
    function in this file.
    """
    M_values = [1, 3, 6, 8, 10, 15, 20, 25, 30, 40, 50, 75, 100, 125, 150]
    X_train, y_train = get_dataset(data_set=DataSet.TRAIN_SET)

    best_M, avg_accuracies = find_best_M(M_values, classification_rate,
                                         minimize=False, return_score=True)
    plot_graph(M_values, avg_accuracies, "Average Accuracy per M", "M",
               "Average Accuracy")

    # Train the model on the entire train-set
    dt = DecisionTreeClassifier().use_alg(ID3_with_early_pruning,
                                          extra_args={"M": best_M}).fit(X_train, y_train)

    # Predict and evaluate
    X_test, y_test = get_dataset(DataSet.TEST_SET)
    y_hat = dt.predict(X_test)
    print(classification_rate(y_hat, y_test))
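# plot_graph is not defined in this section. A minimal matplotlib sketch,
# assuming the (x, y, title, x_label, y_label) signature used above:
import matplotlib.pyplot as plt

def plot_graph(x, y, title, x_label, y_label):
    plt.plot(x, y, marker="o")
    plt.title(title)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.show()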
def main():
    # Load the wine dataset
    wine_ds = dt.load_wine()
    # iris_ds = dt.load_iris()
    X = wine_ds.data
    Y = wine_ds.target
    # X = iris_ds.data
    # Y = iris_ds.target

    # A small hand-crafted dataset kept around for debugging:
    # X = np.array([[0, 1, 0], [0, 0, 0], [0, 0, 1], [1, 1, 0], [1, 1, 1],
    #               [1, 0, 1], [1, 0, 0], [2, 1, 0], [2, 0, 1], [2, 0, 0]])
    # Y = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1])

    # Convert every feature column of X from continuous to class-based values
    for i in range(X.shape[1]):
        X[:, i] = np.array(get_labelled_data(X[:, i]))

    # Divide into training and testing data
    x_train, x_test, y_train, y_test = train_test_split(
        X, Y, shuffle=True, test_size=0.30, random_state=3)
    # x_train, x_test, y_train, y_test = X, X, Y, Y

    # Feed the training data to the decision tree classifier
    clf1 = DecisionTreeClassifier()
    clf1.fit(x_train, y_train)
    ypred = clf1.predict(x_test)
    print(f"Score is : {clf1.score(ypred, y_test)}")
    get_tree_pdf("wine_ds_n.pdf", clf1.get_root())
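# get_labelled_data is not shown. A plausible sketch, assuming it bins one
# continuous feature column into a small number of discrete classes; the bin
# count of 3 is an assumption:
def get_labelled_data(column, n_bins=3):
    column = np.asarray(column, dtype=float)
    edges = np.linspace(column.min(), column.max(), n_bins + 1)[1:-1]
    return np.digitize(column, edges)  # class labels 0 .. n_bins-1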
from DecisionTree import DecisionTreeClassifier
from sklearn.datasets import load_iris, load_wine
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn import tree
import matplotlib.pyplot as plt
import seaborn as sns

# Test w/ Iris dataset using my class
dataset = load_iris()
X, y = dataset.data, dataset.target
clf_iris = DecisionTreeClassifier(max_depth=5)

# Test making target classes strings instead of integers
y = ["one" if val == 1 or val == 2 else "zero" for val in y]
y = np.array(y)

# Need to ordinally encode strings to integers
if "int" not in str(y.dtype):
    # Reshape y so it works w/ the ordinal encoder
    y = y.reshape(-1, 1)
    encoder = OrdinalEncoder()
    y = encoder.fit_transform(y)
    y = y.astype(int)
    y = y.reshape(y.size, )

clf_iris.fit(X, y)
temp = np.array([[3, 2, 1, .5]])
print("My Iris DT:")
clf_iris.print_tree()
print("------------------------------------------------------")
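# sklearn's tree is imported above as DTC but not used in this fragment. A
# minimal sketch of the comparison that presumably follows, fitting sklearn's
# implementation on the same data for a side-by-side check (this continuation
# is an assumption, not part of the original script):
clf_sklearn = DTC(max_depth=5)
clf_sklearn.fit(X, y)
print("sklearn Iris DT:")
print(tree.export_text(clf_sklearn))
print("Prediction on temp sample:", clf_sklearn.predict(temp))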
def train_tree(self, data):
    logging.info("Training tree {}".format(data[0] + 1))
    tree = DecisionTreeClassifier(max_depth=self.max_depth)
    tree.fit(data[1])
    return tree
def train_tree(self, data):
    tree = DecisionTreeClassifier(max_depth=self.max_depth)
    # Progress counter: report how many trees have been trained so far.
    self.count += 1
    print(self.count)
    tree.fit(data)
    return tree
def train_tree(self, data):
    # Report progress every 25 trees.
    if (data[0] + 1) % 25 == 0:
        print("Training Tree {}".format(data[0] + 1))
    tree = DecisionTreeClassifier(max_depth=self.max_depth)
    tree.fit(data[1])
    return tree
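# In the train_tree variants above, `data` is an (index, dataset) pair, which
# suggests the trees are trained by mapping over enumerated bootstrap samples.
# A minimal sketch of such a driver, assuming a multiprocessing.Pool and a
# bootstrap_samples list prepared elsewhere (both are assumptions):
from multiprocessing import Pool

def train_forest(self, bootstrap_samples):
    with Pool() as pool:
        # Each worker receives (tree index, sample) and returns a fitted tree.
        self.forest = pool.map(self.train_tree, enumerate(bootstrap_samples))
    return self.forest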