def fit(self, X, Y):
    # if self.loss == "mse":
    #     loss = MSELoss()
    # elif self.loss == "0_1_err":
    #     loss = ClassifyLoss()
    N, M = X.shape
    self.learners = np.empty((self.n_iter, 1), dtype=object)
    self.alpha = np.ones((self.n_iter, 1))
    self.weights = np.mat(np.ones((N, 1)) / N)  # sample weights
    Y_pred = np.zeros((N, 1))

    for i in range(0, self.n_iter):  # fit one tree per boosting iteration
        # fit a weak learner to the training data
        t = DecisionTree(classifier=True, max_depth=self.max_depth,
                         criterion="entropy")
        t.fit(X, Y)
        self.learners[i] = t
        Y_pred = t.predict(X)

        # weighted 0-1 error of the current learner
        errArr = np.mat(np.ones((N, 1)))
        errArr[Y_pred == Y] = 0
        weightedError = self.weights.T * errArr

        # learner weight; clamp the error away from zero to avoid division by zero
        self.alpha[i] = float(0.5 * np.log(
            (1.0 - weightedError) / max(weightedError, 1e-16)))

        # re-weight the samples and renormalize
        expon = np.multiply(-1 * self.alpha[i] * np.mat(Y).T, Y_pred)
        self.weights = np.multiply(self.weights, np.exp(expon))
        self.weights = self.weights / self.weights.sum()
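# For reference, a minimal standalone sketch of the AdaBoost-style update the
# loop above performs (toy arrays; labels and predictions assumed to be in {-1, +1}):
import numpy as np

y = np.array([1, 1, -1, 1, -1])            # toy labels
h = np.array([1, -1, -1, 1, -1])           # toy weak-learner predictions
w = np.full(len(y), 1.0 / len(y))          # uniform sample weights

err = np.sum(w[h != y])                    # weighted 0-1 error
alpha = 0.5 * np.log((1.0 - err) / max(err, 1e-16))

w = w * np.exp(-alpha * y * h)             # up-weight misclassified samples
w /= w.sum()                               # renormalize to a distribution
print(alpha, w)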
def test_test():
    dt = DecisionTree(open('data/dt_train.txt'))
    assert dt.test({
        'age': '<=30',
        'income': 'low',
        'student': 'no',
        'credit_rating': 'fair',
    }) == 'no'
def train():
    print "Training: %s" % options.filename
    training = pandas.read_csv(options.input, sep=' ', header=1, skiprows=[0, 2])
    tree = DecisionTree(training, 'poisonouse')
    tree.grow(0.01)
    pickle.dump(tree, open(options.filename, 'wb'))
    print "DONE"
def fit(self, x):
    trees = []
    for _ in range(self.num_trees):
        tree = DecisionTree(max_depth=self.max_depth,
                            min_size=self.min_size,
                            features_ratio=self.features_ratio)
        subsample = self.subsample(x, self.sampling_ratio)
        tree.fit(subsample)
        trees.append(tree)
    self.trees = trees
    return trees
def fit(self, X, Y):
    if self.loss == "mse":
        loss = MSELoss()
    elif self.loss == "crossentropy":
        loss = CrossEntropyLoss()

    # convert Y to one-hot if not already
    if self.classifier:
        Y = to_one_hot(Y.flatten())
    else:
        Y = Y.reshape(-1, 1) if len(Y.shape) == 1 else Y

    N, M = X.shape
    self.out_dims = Y.shape[1]
    self.learners = np.empty((self.n_iter, self.out_dims), dtype=object)
    self.weights = np.ones((self.n_iter, self.out_dims))
    self.weights[1:, :] *= self.learning_rate

    # fit the base estimator
    Y_pred = np.zeros((N, self.out_dims))
    for k in range(self.out_dims):
        t = loss.base_estimator()
        t.fit(X, Y[:, k])
        Y_pred[:, k] += t.predict(X)
        self.learners[0, k] = t

    # incrementally fit each learner on the negative gradient of the loss
    # wrt the previous fit (pseudo-residuals)
    for i in range(1, self.n_iter):
        for k in range(self.out_dims):
            y, y_pred = Y[:, k], Y_pred[:, k]
            neg_grad = -1 * loss.grad(y, y_pred)

            # use MSE as the surrogate loss when fitting to negative gradients
            t = DecisionTree(classifier=False, max_depth=self.max_depth,
                             criterion="mse")

            # fit current learner to negative gradients
            t.fit(X, neg_grad)
            self.learners[i, k] = t

            # compute step size and weight for the current learner
            step = 1.0
            h_pred = t.predict(X)
            if self.step_size == "adaptive":
                step = loss.line_search(y, y_pred, h_pred)

            # update weights and our overall prediction for Y
            self.weights[i, k] *= step
            Y_pred[:, k] += self.weights[i, k] * h_pred
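# A minimal sketch of the matching predict step, assuming predictions are the
# weighted sum of every learner's output (mirroring how Y_pred is accumulated in
# fit above); the argmax for the classifier case is an assumption:
def predict(self, X):
    Y_pred = np.zeros((X.shape[0], self.out_dims))
    for i in range(self.n_iter):
        for k in range(self.out_dims):
            Y_pred[:, k] += self.weights[i, k] * self.learners[i, k].predict(X)
    # for classifiers, return the class with the largest summed score (assumption)
    return Y_pred.argmax(axis=1) if self.classifier else Y_pred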
def test_testfile():
    from io import StringIO
    dt = DecisionTree(open('data/dt_train.txt'))
    output = StringIO()
    dt.testfile(open('data/dt_test.txt'), output)
    contents = output.getvalue()
    for line in contents.split("\n"):
        if not len(line):
            continue
        last_elm = line.split("\t")[-1]
        assert last_elm in ('Class:buys_computer', 'yes', 'no')
def fit(self, X, Y):
    """
    Create `n_trees`-worth of bootstrapped samples from the training data
    and use each to fit a separate decision tree.
    """
    self.trees = []
    for _ in range(self.n_trees):
        X_samp, Y_samp = bootstrap_sample(X, Y)
        tree = DecisionTree(
            n_feats=self.n_feats,
            max_depth=self.max_depth,
            criterion=self.criterion,
            classifier=self.classifier,
        )
        tree.fit(X_samp, Y_samp)
        self.trees.append(tree)
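# `bootstrap_sample` is referenced above but not shown; a minimal sketch, assuming
# it draws N rows with replacement from the training data (names are illustrative):
import numpy as np

def bootstrap_sample(X, Y):
    # sample N row indices with replacement so each tree sees a resampled copy
    # of the training set (some rows repeated, some left out)
    N = X.shape[0]
    idxs = np.random.choice(N, N, replace=True)
    return X[idxs], Y[idxs]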
def accuracy_test(rows):
    """Performs leave-one-out cross-validation on the dataset passed."""
    # Remove the header (label) row
    rows = rows[1:]
    total_accuracy = 0.0
    total_elements = len(rows)

    for row in list(rows):  # iterate over a copy so we can mutate `rows`
        singled_out = row
        rows.remove(row)
        remaining_rows = rows
        actual_label = row[-1]

        dt = DecisionTree(remaining_rows)
        node = dt.build_tree()
        # node = build_tree(remaining_rows)
        result = dt.classify(singled_out, node)

        # unambiguous prediction: either 'yes' or 'no'
        if len(result) == 1:
            if actual_label in result:
                total_accuracy += 1
        # both 'yes' and 'no' predicted with some probability
        else:
            prob_correct = result[actual_label]
            if actual_label == 'yes':
                incorrect_label = 'no'
            else:
                incorrect_label = 'yes'
            prob_incorrect = result[incorrect_label]
            total_prob = prob_correct + prob_incorrect
            accuracy_for_this_test = prob_correct / total_prob
            total_accuracy += accuracy_for_this_test

        # put the held-out row back before the next iteration
        rows.append(singled_out)

    final_accuracy = total_accuracy / total_elements
    return final_accuracy
def test_regressor():
    boston = load_boston()
    X, Xtest, Y, Ytest = train_test_split(boston.data, boston.target,
                                          test_size=0.2, random_state=0)

    regressor = DecisionTreeRegressor(random_state=0)
    regressor.fit(X, Y)
    y_pred = regressor.predict(Xtest)
    err = mean_squared_error(Ytest, y_pred)
    print(err)  # 32.41637254901961

    from dt import DecisionTree
    mine = DecisionTree(criterion="mse", classifier=False)
    mine.fit(X, Y)
    y_pred = mine.predict(Xtest)
    err = mean_squared_error(Ytest, y_pred)
    print(err)  # 32.74450980392157
def test_tree():
    dt = DecisionTree(open('data/dt_train.txt'))
    assert dt.tree[0] == 'age'
    assert dt.tree[1]['<=30'][0] == 'student'
    assert dt.tree[1]['<=30'][1]['yes'] == 'yes'
    assert dt.tree[1]['<=30'][1]['no'] == 'no'
    assert dt.tree[1]['31...40'] == 'yes'
    assert dt.tree[1]['>40'][0] == 'credit_rating'
    assert dt.tree[1]['>40'][1]['excellent'] == 'no'
    assert dt.tree[1]['>40'][1]['fair'] == 'yes'
def models():
    iris = load_iris()  # load the sklearn iris data set
    feature = iris.data[:, :2]  # set the features of the data
    label = iris.target  # set the label as the target
    X_train, X_test, y_train, y_test = train_test_split(
        feature, label, random_state=42)  # split the data into train and test

    # ### Scratch Decision Tree Model ###
    scratch_dt_model = DecisionTree(max_depth=2,  # create our decision tree model with params
                                    min_splits=10)
    scratch_dt_model.fit(X_train, y_train)  # fit the model
    scratch_dt_model_pred = scratch_dt_model.pred(X_test)  # create predictions from the model

    # ### Sklearn Decision Tree Model ###
    sk_dt_model = DecisionTreeClassifier(max_depth=2,  # use the decision tree model from sklearn with params
                                         min_samples_split=10)
    sk_dt_model.fit(X_train, y_train)  # fit the model
    sk_dt_model_pred = sk_dt_model.predict(X_test)  # create predictions from the model

    # ### Results ###
    print("Scratch Model Accuracy : {0}".format(
        acc_score(scratch_dt_model_pred, y_test)))  # the scratch model's accuracy score
    print("SK-Learn Model Accuracy : {0}".format(
        acc_score(sk_dt_model_pred, y_test)))  # the sklearn model's accuracy score
    # print the scratch model's prediction, the sklearn model's prediction, and the actual value
    print(list(zip(scratch_dt_model_pred, sk_dt_model_pred, y_test)))
args = parser.parse_args()

# load the train and test data
xTrain = pd.read_csv(args.xTrain)
yTrain = pd.read_csv(args.yTrain)
xTest = pd.read_csv(args.xTest)
yTest = pd.read_csv(args.yTest)

# create an instance of the decision tree using gini
maxDepth = list(np.arange(1, 10))
minLeaf = list(np.arange(1, 10))
yDepth = np.ones((9, 2))
yLeaf = np.ones((9, 2))

for m in maxDepth:
    dt = DecisionTree('gini', m, 5)
    trainauc, testauc = dt_train_test(dt, xTrain, yTrain, xTest, yTest)
    yDepth[m - 1, 0] = trainauc
    yDepth[m - 1, 1] = testauc

for l in minLeaf:
    dt = DecisionTree('gini', 5, l)
    trainauc, testauc = dt_train_test(dt, xTrain, yTrain, xTest, yTest)
    yLeaf[l - 1, 0] = trainauc
    yLeaf[l - 1, 1] = testauc

plt.subplot(1, 2, 1)
plt.plot(maxDepth, yDepth[:, 0], color='g', label='Train')
plt.plot(maxDepth, yDepth[:, 1], color='r', label='Test')
plt.xlabel('Tree Depth')
plt.ylabel('Accuracy')
def test_DecisionTree():
    i = 1
    np.random.seed(12345)
    while True:
        n_ex = np.random.randint(2, 100)
        n_feats = np.random.randint(2, 100)
        max_depth = np.random.randint(1, 5)
        classifier = np.random.choice([True, False])

        if classifier:
            # create classification problem
            n_classes = np.random.randint(2, 10)
            X, Y = make_blobs(
                n_samples=n_ex, centers=n_classes, n_features=n_feats, random_state=i
            )
            X, X_test, Y, Y_test = train_test_split(X, Y, test_size=0.3, random_state=i)

            # initialize model
            def loss(yp, y):
                return 1 - accuracy_score(yp, y)

            criterion = np.random.choice(["entropy", "gini"])
            mine = DecisionTree(
                classifier=classifier, max_depth=max_depth, criterion=criterion
            )
            gold = DecisionTreeClassifier(
                criterion=criterion,
                max_depth=max_depth,
                splitter="best",
                random_state=i,
            )
        else:
            # create regression problem
            X, Y = make_regression(n_samples=n_ex, n_features=n_feats, random_state=i)
            X, X_test, Y, Y_test = train_test_split(X, Y, test_size=0.3, random_state=i)

            # initialize model
            criterion = "mse"
            loss = mean_squared_error
            mine = DecisionTree(
                criterion=criterion, max_depth=max_depth, classifier=classifier
            )
            gold = DecisionTreeRegressor(
                criterion=criterion, max_depth=max_depth, splitter="best"
            )

        print("Trial {}".format(i))
        print("\tClassifier={}, criterion={}".format(classifier, criterion))
        print("\tmax_depth={}, n_feats={}, n_ex={}".format(max_depth, n_feats, n_ex))
        if classifier:
            print("\tn_classes: {}".format(n_classes))

        # fit 'em
        mine.fit(X, Y)
        gold.fit(X, Y)

        # get preds on training set
        y_pred_mine = mine.predict(X)
        y_pred_gold = gold.predict(X)

        loss_mine = loss(y_pred_mine, Y)
        loss_gold = loss(y_pred_gold, Y)

        # get preds on test set
        y_pred_mine_test = mine.predict(X_test)
        y_pred_gold_test = gold.predict(X_test)

        loss_mine_test = loss(y_pred_mine_test, Y_test)
        loss_gold_test = loss(y_pred_gold_test, Y_test)

        try:
            np.testing.assert_almost_equal(loss_mine, loss_gold)
            print("\tLoss on training: {}".format(loss_mine))
        except AssertionError as e:
            print("\tTraining losses not equal:\n{}".format(e))

        try:
            np.testing.assert_almost_equal(loss_mine_test, loss_gold_test)
            print("\tLoss on test: {}".format(loss_mine_test))
        except AssertionError as e:
            print("\tTest losses not equal:\n{}".format(e))

        i += 1
save = pickle.load(f)
X_train = save['X_train']
y_train = save['y_train']
X_test = save['X_test']
y_test = save['y_test']
del save

print('X_train ', X_train.shape)
print('y_train', y_train.shape)
print('X_test ', X_test.shape)

y_train = np.reshape(y_train, (y_train.shape[0], 1))
dataset = np.concatenate((X_train, y_train), axis=1)

# Instance of the Decision Tree object
a = DecisionTree(headers, 5)

X_tr = dataset[0:6400]
y_tr = y_train[0:6400]
X_val = X_train[6400:8000]
y_val = y_train[6400:8000]
y_val = np.reshape(y_val, (y_val.shape[0], ))

t = a.train(dataset)

# Saving the trained model (binary mode is required for dill)
dill.dump(a, open("vamshi.model", "wb"))
v = dill.load(open("vamshi.model", "rb"))

y_pred = a.predict(None, X_val)
def create_decision_tree(
    local_attributes: np.ndarray,
    local_data: np.ndarray,
    local_output: np.ndarray,
) -> DecisionTree:
    """
    Recursively build a decision tree (ID3-style).

    In each recursion we compute, for every remaining attribute, the list of
    unique values it can take. From those attributes we choose the one with the
    lowest average conditional entropy, obtaining its index and per-partition
    entropies, and create the node associated with it.

    For each partition (value) of the chosen attribute:
      - if the partition entropy is 0, every row in the partition has the same
        output, so we create a terminal leaf whose value is that output;
      - otherwise we filter the remaining attributes and the output down to the
        rows belonging to this partition, preparing "data", "attributes", and
        "output" for the next recursion. If no further attribute is left to
        choose, we create a leaf whose value is the count of each output class
        in the partition; otherwise we recurse to build the child and append it
        to the chosen node's children.

    :param local_attributes: array of column names
    :param local_data: array of data for each attribute
    :param local_output: array of outputs, one per row
    :return: a node of the tree along with its children
    """
    nonlocal output_set
    nonlocal node_index

    possible_values_for_attribute = np.array(
        [np.array(list(set(x))) for x in local_data], dtype=object)
    current_best_attribute_index, node_values = get_best_attribute_index(
        local_attributes, local_data, local_output, output_set,
        possible_values_for_attribute)

    current_node = DecisionTree(
        value=local_attributes[current_best_attribute_index],
        node_id=node_index)
    node_index += 1

    for i, v in enumerate(node_values):
        specific_condition = possible_values_for_attribute[
            current_best_attribute_index][i]

        if node_values[i] == 0:
            # zero partition entropy: every row with this value shares the same
            # output, so create a leaf carrying that output
            for value in range(len(local_data[current_best_attribute_index])):
                if local_data[current_best_attribute_index][value] == specific_condition:
                    node_output = local_output[value]
                    child = DecisionTree(node_id=node_index,
                                         value=node_output,
                                         is_leaf=True,
                                         parent_edge=specific_condition,
                                         parent=current_node.node_id,
                                         parent_value=current_node.value)
                    node_index += 1
        else:
            # keep only the rows whose value matches the current partition
            filter_array = []
            for value in local_data[current_best_attribute_index]:
                filter_array.append(value == specific_condition)

            future_attributes = np.copy(local_attributes)
            future_attributes = np.delete(future_attributes,
                                          obj=current_best_attribute_index,
                                          axis=0)
            future_data = np.copy(local_data)
            future_data = np.delete(future_data,
                                    obj=current_best_attribute_index,
                                    axis=0)
            aux = []
            for future_row in range(len(future_data)):
                aux.append(future_data[future_row][filter_array])
            future_data = np.array(aux)
            future_output = np.copy(local_output)[filter_array]

            if len(future_attributes) == 1:
                # no further attribute to split on: create a leaf holding the
                # count of each output class in this partition
                unique, counts = np.unique(future_output, return_counts=True)
                value = str(dict(zip(unique, counts)))
                child = DecisionTree(node_id=node_index,
                                     is_leaf=True,
                                     value=value,
                                     parent_edge=specific_condition,
                                     parent=current_node.node_id,
                                     parent_value=current_node.value)
                node_index += 1
            else:
                child = create_decision_tree(future_attributes, future_data,
                                             future_output)
                child.parent_edge = specific_condition
                child.parent = current_node.node_id
                child.parent_value = current_node.value

        current_node.add_children(child)

    return current_node
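# `get_best_attribute_index` is not shown here; a minimal sketch of the average
# conditional entropy it is described as minimizing (the helper name and signature
# are hypothetical, not taken from the original code):
import numpy as np

def average_conditional_entropy(column, output):
    """H(output | attribute) for a single attribute column (illustrative only)."""
    total = len(output)
    avg = 0.0
    for value in np.unique(column):
        subset = output[column == value]                  # rows in this partition
        _, counts = np.unique(subset, return_counts=True)
        probs = counts / counts.sum()
        partition_entropy = -np.sum(probs * np.log2(probs))
        avg += (len(subset) / total) * partition_entropy  # weight by partition size
    return avg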
import pandas as pd
from importlib import import_module
from sklearn.model_selection import train_test_split
from dt import DecisionTree

dataset_names = ['iris', 'german', 'page_blocks', 'seeds', 'wine']
results = []
columns = ['trial', 'name', 'prune_method', 'loss', 'acc']

for i in range(1, 6):
    for name in dataset_names:
        dataset = import_module('milksets.' + name)
        X_train, X_test, y_train, y_test = train_test_split(
            *dataset.load(), test_size=0.2, random_state=i)
        for prune_method in ['Reduce', 'Pessim', 'Comp']:
            for loss in ['entropy', 'gini', '0-1']:
                tr = DecisionTree(X_train, y_train, prune_method, loss, name=name)
                # Use alpha = 0 by default, since it's the best choice by experiment.
                tr.postprune()
                acc = tr.score(X_test, y_test)
                results.append([i, name, prune_method, loss, acc])

df = pd.DataFrame(results, columns=columns)
df.to_csv('prune_loss_test.csv', index=False)
def accuracy(y_true, y_pred):
    accuracy = np.sum(y_true == y_pred) / len(y_true)
    return accuracy


df = pd.read_csv('Training.csv')
X = df.iloc[:, 0:132].values
y = df.iloc[:, -1].values

labelencoder_Y = LabelEncoder()
y = labelencoder_Y.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=1234)

clf = DecisionTree(max_depth=10)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

acc = accuracy(y_test, y_pred)
print("Accuracy:", acc * 100, "%")
# from matplotlib import pyplot as plt


def saveModel():
    with open("DTModel", "wb") as f:
        pickle.dump(clf, f)


saveModel()
# header = ['itching','skin_rash','nodal_skin_eruptions','continuous_sneezing','shivering','chills','joint_pain',
def main():
    if args.modelIdx == '1':
        model = DecisionTree()
    elif args.modelIdx == '2':
        model = BaggedDecisionTrees(n_estimators=50)
    elif args.modelIdx == '3':
        model = RandomForest(n_estimators=50)
    elif args.modelIdx == '4':
        model = BoostedDecisionTrees(n_estimators=50)
    elif args.modelIdx == '5':
        model = SupportVectorMachine()
    elif args.modelIdx == 'A1':
        models = ['DT', 'BDT', 'BODT', 'RF', 'SVM']
        tssp = [0.025, 0.05, 0.125, 0.25]
        num_words = [1000]
        max_depth = [10]
        n_estimators = [50]
        analysis_1(models, tssp, num_words, max_depth, n_estimators, debug=False)
        return
    elif args.modelIdx == 'A2':
        models = ['DT', 'BDT', 'BODT', 'RF', 'SVM']
        tssp = [0.25]
        num_words = [200, 500, 1000, 1500]
        max_depth = [10]
        n_estimators = [50]
        analysis_2(models, tssp, num_words, max_depth, n_estimators, debug=False)
        return
    elif args.modelIdx == 'A3':
        models = ['DT', 'BDT', 'BODT', 'RF', 'SVM']
        tssp = [0.25]
        num_words = [1000]
        max_depth = [5, 10, 15, 20]
        n_estimators = [50]
        analysis_3(models, tssp, num_words, max_depth, n_estimators, debug=False)
        return
    elif args.modelIdx == 'A4':
        models = ['DT', 'BDT', 'BODT', 'RF', 'SVM']
        tssp = [0.25]
        num_words = [1000]
        max_depth = [10]
        n_estimators = [10, 25, 50, 100]
        analysis_4(models, tssp, num_words, max_depth, n_estimators, debug=False)
        return
    else:
        return
    model.train_from_csv(args.trainingDataFilename)
    model.test_from_csv(args.testDataFilename)
def test_majority_voting():
    dt = DecisionTree(open('tests/one_attr.txt'))
    assert dt.tree[1]['<=30'] == 'no'
    assert dt.tree[1]['31...40'] == 'yes'
    assert dt.tree[1]['>40'] == 'yes'
def test_info():
    assert abs(DecisionTree.info(9, 5) - 0.94) <= 0.001
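# 0.94 is the entropy of a 9-vs-5 class split; a quick check of the expected
# value (the `info(9, 5)` signature is taken from the test above):
import math

p_yes, p_no = 9 / 14, 5 / 14
info = -p_yes * math.log2(p_yes) - p_no * math.log2(p_no)
print(round(info, 3))  # 0.94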
def dt_plot():
    fig, axes = plt.subplots(2, 2)
    # fig.set_size_inches(10, 10)
    for ax in axes.flatten():
        n_ex = 100
        n_trees = 50  # belongs to rf (unused here)
        n_feats = np.random.randint(2, 100)
        max_depth_d = np.random.randint(1, 100)
        max_depth_r = np.random.randint(1, 10)  # belongs to rf (unused here)
        classifier = np.random.choice([True])  # , False

        # generate samples for the chosen problem type (classification -> labels,
        # regression -> continuous targets)
        if classifier:
            # create classification problem
            n_classes = np.random.randint(2, 10)
            X, Y = make_blobs(n_samples=n_ex, centers=n_classes, n_features=2)
            X, X_test, Y, Y_test = train_test_split(X, Y, test_size=0.3)
            n_feats = min(n_feats, X.shape[1])

            # initialize model
            def loss(yp, y):
                return accuracy_score(yp, y)

            criterion = np.random.choice(["entropy", "gini"])
            mine_d = DecisionTree(criterion=criterion,
                                  max_depth=max_depth_d,
                                  classifier=classifier)
        else:
            # create regression problem
            X, Y = make_regression(n_samples=n_ex, n_features=1)
            X, X_test, Y, Y_test = train_test_split(X, Y, test_size=0.3)
            n_feats = min(n_feats, X.shape[1])

            # initialize model
            criterion = "mse"
            loss = mean_squared_error
            mine_d = DecisionTree(criterion=criterion,
                                  max_depth=max_depth_d,
                                  classifier=classifier)

        # fit 'em
        mine_d.fit(X, Y)

        # get preds on test set
        y_pred_mine_test_d = mine_d.predict(X_test)
        loss_mine_test_d = loss(y_pred_mine_test_d, Y_test)

        if classifier:
            entries = [("DT", loss_mine_test_d, y_pred_mine_test_d)]
            (lbl, test_loss, preds) = entries[np.random.randint(1)]
            ax.set_title("{} Accuracy: {:.2f}%".format(lbl, test_loss * 100))
            for i in np.unique(Y_test):
                ax.scatter(
                    X_test[preds == i, 0].flatten(),
                    X_test[preds == i, 1].flatten(),
                    # s=0.5,
                )
        else:
            X_ax = np.linspace(
                np.min(X_test.flatten()) - 1,
                np.max(X_test.flatten()) + 1, 100).reshape(-1, 1)
            y_pred_mine_test_d = mine_d.predict(X_ax)

            ax.scatter(X_test.flatten(), Y_test.flatten(), c="b", alpha=0.5)  # s=0.5
            ax.plot(
                X_ax.flatten(),
                y_pred_mine_test_d.flatten(),
                # linewidth=0.5,
                label="DT",
                color="yellowgreen",
            )
            ax.set_title("DT: {:.1f}".format(loss_mine_test_d))

        ax.legend()
        ax.xaxis.set_ticklabels([])
        ax.yaxis.set_ticklabels([])

    # plt.savefig("plot.png", dpi=300)
    plt.show()
def ensemble_diff_plot():
    fig, axes = plt.subplots(3, 3)
    fig.set_size_inches(10, 10)
    for ax in axes.flatten():
        n_ex = 100
        n_trees = 50
        n_feats = np.random.randint(2, 100)
        max_depth_d = np.random.randint(1, 100)
        max_depth_r = np.random.randint(1, 10)
        classifier = np.random.choice([True, False])

        if classifier:
            # create classification problem
            n_classes = np.random.randint(2, 10)
            X, Y = make_blobs(n_samples=n_ex, centers=n_classes, n_features=2)
            X, X_test, Y, Y_test = train_test_split(X, Y, test_size=0.3)
            n_feats = min(n_feats, X.shape[1])

            # initialize models
            def loss(yp, y):
                return accuracy_score(yp, y)

            criterion = np.random.choice(["entropy", "gini"])
            mine = RandomForest(
                classifier=classifier,
                n_feats=n_feats,
                n_trees=n_trees,
                criterion=criterion,
                max_depth=max_depth_r,
            )
            mine_d = DecisionTree(criterion=criterion,
                                  max_depth=max_depth_d,
                                  classifier=classifier)
            mine_g = GradientBoostedDecisionTree(
                n_iter=10,
                max_depth=max_depth_d,
                classifier=classifier,
                learning_rate=1,
                loss="crossentropy",
                step_size="constant",
                split_criterion=criterion,
            )
        else:
            # create regression problem
            X, Y = make_regression(n_samples=n_ex, n_features=1)
            X, X_test, Y, Y_test = train_test_split(X, Y, test_size=0.3)
            n_feats = min(n_feats, X.shape[1])

            # initialize models
            criterion = "mse"
            loss = mean_squared_error
            mine = RandomForest(
                criterion=criterion,
                n_feats=n_feats,
                n_trees=n_trees,
                max_depth=max_depth_r,
                classifier=classifier,
            )
            mine_d = DecisionTree(criterion=criterion,
                                  max_depth=max_depth_d,
                                  classifier=classifier)
            mine_g = GradientBoostedDecisionTree(
                # n_trees=n_trees,
                n_iter=10,
                max_depth=max_depth_d,
                classifier=classifier,
                learning_rate=1,
                loss="mse",
                step_size="adaptive",
                split_criterion=criterion,
            )

        # fit 'em
        mine.fit(X, Y)
        mine_d.fit(X, Y)
        mine_g.fit(X, Y)

        # get preds on test set
        y_pred_mine_test = mine.predict(X_test)
        y_pred_mine_test_d = mine_d.predict(X_test)
        y_pred_mine_test_g = mine_g.predict(X_test)

        loss_mine_test = loss(y_pred_mine_test, Y_test)
        loss_mine_test_d = loss(y_pred_mine_test_d, Y_test)
        loss_mine_test_g = loss(y_pred_mine_test_g, Y_test)

        if classifier:
            entries = [("RF", loss_mine_test, y_pred_mine_test),
                       ("DT", loss_mine_test_d, y_pred_mine_test_d),
                       ("GB", loss_mine_test_g, y_pred_mine_test_g)]
            (lbl, test_loss, preds) = entries[np.random.randint(3)]
            ax.set_title("{} Accuracy: {:.2f}%".format(lbl, test_loss * 100))
            for i in np.unique(Y_test):
                ax.scatter(
                    X_test[preds == i, 0].flatten(),
                    X_test[preds == i, 1].flatten(),
                    # s=0.5,
                )
        else:
            X_ax = np.linspace(
                np.min(X_test.flatten()) - 1,
                np.max(X_test.flatten()) + 1, 100).reshape(-1, 1)
            y_pred_mine_test = mine.predict(X_ax)
            y_pred_mine_test_d = mine_d.predict(X_ax)
            y_pred_mine_test_g = mine_g.predict(X_ax)

            ax.scatter(X_test.flatten(), Y_test.flatten(), c="b", alpha=0.5)  # s=0.5
            ax.plot(
                X_ax.flatten(),
                y_pred_mine_test_g.flatten(),
                # linewidth=0.5,
                label="GB",
                color="red",
            )
            ax.plot(
                X_ax.flatten(),
                y_pred_mine_test.flatten(),
                # linewidth=0.5,
                label="RF",
                color="cornflowerblue",
            )
            ax.plot(
                X_ax.flatten(),
                y_pred_mine_test_d.flatten(),
                # linewidth=0.5,
                label="DT",
                color="yellowgreen",
            )
            ax.set_title("GB: {:.1f} / RF: {:.1f} / DT: {:.1f}".format(
                loss_mine_test_g, loss_mine_test, loss_mine_test_d))

        ax.legend()
        ax.xaxis.set_ticklabels([])
        ax.yaxis.set_ticklabels([])

    # plt.savefig("plot.png", dpi=300)
    plt.show()
    plt.close("all")
xTrain = pd.read_csv(args.xTrain)
yTrain = pd.read_csv(args.yTrain)
xTest = pd.read_csv(args.xTest)
yTest = pd.read_csv(args.yTest)

# create an instance of the decision tree using gini
maxDepth = np.arange(1, 10)
minLeaf = np.arange(1, 10)
trainauc = np.ones((9, 9))
testauc = np.ones((9, 9))
X, Y = np.meshgrid(maxDepth, minLeaf)

for m in maxDepth:
    for l in minLeaf:
        dt = DecisionTree('gini', m, l)
        trainauc[m - 1, l - 1], testauc[m - 1, l - 1] = dt_train_test(
            dt, xTrain, yTrain, xTest, yTest)

fig = plt.figure(figsize=(9, 5))
ax = plt.axes(projection="3d")
ax.plot_wireframe(X, Y, trainauc, color='g', label='Train')
ax.plot_wireframe(X, Y, testauc, color='r', label='Test')
ax.set_title('3D plot of accuracy using Gini')
ax.set_xlabel('Min Leaf Samples')
ax.set_ylabel('Tree Depth')
ax.set_zlabel('Accuracy')
ax.legend()
plt.savefig('q1c.eps', format='eps', dpi=1000)
plt.show()
def cross_validate(csv_file_name, losses_file_name, models, tssp, num_words,
                   max_depth, n_estimators, debug=False):
    '''
    Perform 10-fold incremental cross validation.
    '''
    total_num = 2000
    lists_of_dict = []
    setups = [(p, w, d, t) for p in tssp for w in num_words
              for d in max_depth for t in n_estimators]
    losses = zeros((5, len(setups), 10))    # #models, #cases, #folds
    sklosses = zeros((2, len(setups), 10))

    # Generate temporary CV files
    generate_train_and_test_files_cv(csv_file_name, 10)
    for i in range(10):
        lists_of_dict.append(csv_to_dict('cv%d.dat' % (i)))

    i = 0
    for prop, nwords, maxdep, ntrees in setups:
        for j in range(10):
            # Construct the train set
            training_lists_of_dict = lists_of_dict[:j] + lists_of_dict[j + 1:]
            training_list_of_dict = [
                item for sublist in training_lists_of_dict for item in sublist
            ]
            testing_list_of_dict = lists_of_dict[j]

            # Randomly select samples
            random_indices = permutation(len(training_list_of_dict))
            random_indices = random_indices[:int(total_num * prop)]
            training_list_of_dict = [
                training_list_of_dict[k] for k in random_indices
            ]

            # Find the word features
            feature_words = construct_word_feature(training_list_of_dict, nwords)

            # Extract features and labels
            training_X, training_y = extract_word_feature_and_label(
                training_list_of_dict, feature_words)
            testing_X, testing_y = extract_word_feature_and_label(
                testing_list_of_dict, feature_words)

            # DT
            if 'DT' in models:
                dt = DecisionTree(max_depth=maxdep)
                t1 = time.time()
                dt.train(training_X, training_y)
                t2 = time.time()
                losses[0, i, j] = dt.test(testing_X, testing_y)
                if debug:
                    print "DT training: %fs, testing: %f" % (t2 - t1, time.time() - t2)

            # BDT
            if 'BDT' in models:
                bdt = BaggedDecisionTrees(max_depth=maxdep, n_estimators=ntrees)
                t1 = time.time()
                bdt.train(training_X, training_y)
                t2 = time.time()
                losses[1, i, j] = bdt.test(testing_X, testing_y)
                if debug:
                    print "BDT training: %fs, testing: %f" % (t2 - t1, time.time() - t2)

            # BODT
            if 'BODT' in models:
                bodt = BoostedDecisionTrees(max_depth=maxdep, n_estimators=ntrees)
                bodt.train(training_X, training_y)
                t2 = time.time()
                losses[2, i, j] = bodt.test(testing_X, testing_y)

            # RF
            if 'RF' in models:
                rf = RandomForest(max_depth=maxdep, n_estimators=ntrees)
                rf.train(training_X, training_y)
                losses[3, i, j] = rf.test(testing_X, testing_y)

            # SVM
            if 'SVM' in models:
                svm = SupportVectorMachine()
                svm.train(training_X, training_y)
                losses[4, i, j] = svm.test(testing_X, testing_y)

            # Library functions
            if debug:
                training_y[training_y == 0] = -1
                testing_y[testing_y == 0] = -1
                skdt = skDecisionTree(max_depth=maxdep, min_samples_split=10)
                skdt.fit(training_X.T, training_y)
                sklosses[0, i, j] = 1 - skdt.score(testing_X.T, testing_y)
                print "ZERO-ONE-LOSS-SKDT %.4f" % sklosses[0, i, j]
                skrf = skRandomForest(max_depth=maxdep, n_estimators=ntrees,
                                      min_samples_split=10)
                skrf.fit(training_X.T, training_y)
                sklosses[1, i, j] = 1 - skrf.score(testing_X.T, testing_y)
                print "ZERO-ONE-LOSS-SKRF %.4f" % sklosses[1, i, j]
        i += 1

    save(losses_file_name, losses)
    save('debug_' + losses_file_name, sklosses)
if __name__ == '__main__':
    data, filename = process_file()
    print("This is a decision tree classifier program")
    print("\nThe dataset you've chosen is {}".format(filename))
    print("\nIf you want to experiment with a different dataset,")
    print("please quit this program and enter:")
    print("`python driver.py data_file.txt`")
    print("You can choose from any files under the 'dataset/' directory.")

    t = DecisionTree(data)
    t.build_tree()
    print("\n\n...Successfully built a classifier based on the dataset")
    print("\nIf you want to print the tree, hit 1. Otherwise, hit `enter`:\n")
    see_tree = input()
    if see_tree == '1':
        print("Classification Tree from {} data".format(filename))
        print("=============================================================")
        t.print_tree()
        print("=============================================================")

    print("You can randomly choose a data point, and this program can classify")
    print("that data point for you. Hit 1 if you want to test a random example")