# print('\nTesting root low low impurity') # assert dt.root.low.low.impurity == 0 # print('passed') # print('\nTesting root low high impurity') # assert dt.root.low.high.impurity == 0 # print('passed') print('Splitting Tests') df = DataFrame.from_array( [[1, 11, 'A'], [1, 12, 'A'], [2, 11, 'A'], [1, 13, 'B'], [2, 13, 'B'], [3, 13, 'B'], [3, 11, 'B']], columns=['x', 'y', 'class']) dt = DecisionTree(split_metric='gini') dt.initialize(df) dt.split() dt.split() assert dt.root.high.row_indices == [3, 4, 5] assert dt.root.low.low.row_indices == [0, 1, 2] assert dt.root.low.high.row_indices == [6] print('passed') dt = DecisionTree(split_metric='gini') dt.fit(df) assert dt.root.high.row_indices == [3, 4, 5] assert dt.root.low.low.row_indices == [0, 1, 2] assert dt.root.low.high.row_indices == [6] print('passed')
# .......................... X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5) # Rescaled labels {-1, 1} rescaled_y_train = 2 * y_train - np.ones(np.shape(y_train)) rescaled_y_test = 2 * y_test - np.ones(np.shape(y_test)) # ....... # SETUP # ....... adaboost = Adaboost(n_clf=8) naive_bayes = NaiveBayes() knn = KNN(k=4) logistic_regression = LogisticRegression() mlp = MultilayerPerceptron(n_hidden=20) perceptron = Perceptron() decision_tree = DecisionTree() random_forest = RandomForest(n_estimators=150) support_vector_machine = SupportVectorMachine(C=1, kernel=rbf_kernel) lda = LDA() # ........ # TRAIN # ........ print "Training:" print "\tAdaboost" adaboost.fit(X_train, rescaled_y_train) print "\tNaive Bayes" naive_bayes.fit(X_train, y_train) print "\tLogistic Regression" logistic_regression.fit(X_train, y_train) print "\tLDA"
def _cross_validate(model, x_train, y_train, fold_message, with_auc=False):
    """Fit and score ``model`` with 5-fold cross-validation.

    Args:
        model: Estimator exposing ``fit(x, y)`` and ``predict(x)``.
        x_train: Feature DataFrame; folds are selected with ``.iloc``.
        y_train: Label Series, indexed with the fold index arrays.
        fold_message: Progress line printed before each fold is fitted.
        with_auc: Also average ``roc_auc_score`` over the folds.

    Returns:
        List holding the mean accuracy, precision, recall and F1 score (in
        that order), followed by the mean AUC when ``with_auc`` is true.
    """
    metrics = []
    aucs = []
    for fit_index, val_index in KFold(n_splits=5).split(x_train, y_train):
        x_fit, y_fit = x_train.iloc[fit_index], y_train[fit_index]
        x_val, y_val = x_train.iloc[val_index], y_train[val_index]
        print(fold_message)
        model.fit(x_fit, y_fit)
        y_pred_val = model.predict(x_val)
        metrics.append(get_metrics(y_val, y_pred_val))
        if with_auc:
            aucs.append(roc_auc_score(y_val, y_pred_val))

    # get_metrics yields (accuracy, precision, recall, f1) per fold.
    accuracies, precisions, recalls, f1_scores = zip(*metrics)
    means = [np.mean(accuracies), np.mean(precisions),
             np.mean(recalls), np.mean(f1_scores)]
    if with_auc:
        means.append(np.mean(aucs))
    return means


def _print_mean_metrics(header, means):
    """Print ``header`` followed by one labelled line per averaged metric."""
    labels = ('avg accuracy:', 'avg precision:', 'avg recall:',
              'avg f1 score:', 'avg auc:')
    print(header)
    # zip stops at the shorter sequence, so the AUC line only appears when
    # an AUC mean was actually computed.
    for label, value in zip(labels, means):
        print(label, value)


def main():
    """Command-line entry point: train, cross-validate and write predictions.

    Each ``--flag`` enables one independent stage; when the script is run
    without any argument, ``--make_predictions`` is switched on. Training
    data is read from ``DATA_DIR`` and prediction files are written to
    ``PREDICTIONS_DIR``.
    """
    parser = ArgumentParser()
    for flag in ('--decision_tree', '--knn', '--knn_cv', '--decision_tree_cv',
                 '--make_predictions', '--add_index'):
        parser.add_argument(flag, action='store_true')
    args = parser.parse_args()
    if len(sys.argv) == 1:
        args.make_predictions = True

    x_train = pd.read_csv(os.path.join(DATA_DIR, 'x_train.csv'), header=None)
    x_test = pd.read_csv(os.path.join(DATA_DIR, 'x_test.csv'), header=None)
    y_train = pd.read_csv(os.path.join(DATA_DIR, 'y_train.csv'), header=None)
    y_train = y_train.T.squeeze()  # make it a Series

    # decision tree
    if args.decision_tree:
        print('running decision tree, max_depth=5 min_size=5')
        tree = DecisionTree(max_depth=5, min_size=5)
        tree.fit(x_train, y_train)
        y_pred = tree.predict(x_test)
        pd.Series(y_pred).to_csv(
            os.path.join(PREDICTIONS_DIR, 'decision_tree_predictions.csv'),
            index=False, header=False)

    # knn
    if args.knn:
        print('running knn, n_neighbors=5')
        knn = KNN(n_neighbors=5)
        knn.fit(x_train, y_train)
        y_pred = knn.predict(x_test)
        pd.Series(y_pred).to_csv(
            os.path.join(PREDICTIONS_DIR, 'knn_predictions.csv'),
            index=False, header=False)

    if args.knn_cv:
        for n_neighbors in [3, 5, 10, 20, 25]:
            means = _cross_validate(
                KNN(n_neighbors=n_neighbors), x_train, y_train,
                f'running knn cv, n_neighbors={n_neighbors}')
            _print_mean_metrics(
                f'knn cv metrics for n_neighbors={n_neighbors}:', means)

    if args.decision_tree_cv:
        for max_depth in [3, 6, 9, 12, 15]:
            # The progress line previously interpolated ``n_neighbors``,
            # which is undefined unless --knn_cv ran first and is the wrong
            # value when it did; it must report ``max_depth``.
            means = _cross_validate(
                DecisionTree(max_depth=max_depth, min_size=5),
                x_train, y_train,
                f'running decision tree cv, max_depth={max_depth} min_size=5')
            _print_mean_metrics(
                f'decision tree cv metrics for max_depth={max_depth} min_size=5:',
                means)

    if args.make_predictions:
        # The hyperparameters below were selected offline with a k-fold grid
        # search over max_depth in np.arange(3, 11) and
        # min_size in [2, 3, 5, 8, 10], scored by F1:
        # {'max_depth': 10, 'min_size': 3}
        final_model = DecisionTree(max_depth=10, min_size=3)
        means = _cross_validate(
            final_model, x_train, y_train,
            'running best model cv, decision tree with max_depth=10 min_size=3',
            with_auc=True)
        _print_mean_metrics('best model cv metrics:', means)

        # Refit on the full training set before predicting the test set.
        final_model.fit(x_train, y_train)
        y_pred_train = final_model.predict(x_train)
        accuracy, precision, recall, f1 = get_metrics(y_train, y_pred_train)
        auc = roc_auc_score(y_train, y_pred_train)
        print('best model training set metrics:')
        print('accuracy:', accuracy)
        print('precision:', precision)
        print('recall:', recall)
        print('f1 score:', f1)
        print('auc:', auc)

        y_pred = final_model.predict(x_test)
        pd.Series(y_pred).to_csv(os.path.join(PREDICTIONS_DIR, 'best.csv'),
                                 index=False, header=False)

    if args.add_index:
        best_path = os.path.join(PREDICTIONS_DIR, 'best.csv')
        y_pred = pd.read_csv(best_path, header=None)
        # Writing without index=False adds the row index as a first column.
        y_pred.to_csv(best_path, header=None)
        print('finished adding index')
def evaluate_performance():
    '''
    Evaluate classifier accuracy on the SPECTF data set.

    Only the decision tree is evaluated so far (one reference trial); the
    random forest and logistic regression entries are placeholders.

    Return:
      a 3x2 matrix giving the performance, with the following entries:
      stats[0,0] = mean accuracy of decision tree
      stats[0,1] = std deviation of decision tree accuracy
      stats[1,0] = mean accuracy of random forest
      stats[1,1] = std deviation of random forest accuracy
      stats[2,0] = mean accuracy of logistic regression
      stats[2,1] = std deviation of logistic regression accuracy
    '''
    # Load Data: column 0 holds the label, the remaining columns the features.
    filename = 'data/SPECTF.dat'
    data = np.loadtxt(filename, delimiter=',')
    X = data[:, 1:]
    y = np.array([data[:, 0]]).T
    n = X.shape[0]

    # One accuracy per completed trial; previously this list was read but
    # never created, so the function always died with a NameError.
    all_accuracies = []

    for trial in range(1000):
        # TODO: shuffle for each of the trials.
        # the following code is for reference only.
        idx = np.arange(n)
        np.random.seed(13)
        np.random.shuffle(idx)
        X = X[idx]
        y = y[idx]

        # TODO: write your own code to split data (for cross validation)
        # the code here is for your reference.
        # Train on the first 100 instances and test on the remainder; the
        # former 1:101 / 101: slices silently dropped instance 0.
        Xtrain = X[:100, :]
        Xtest = X[100:, :]
        ytrain = y[:100, :]
        ytest = y[100:, :]

        # train the decision tree
        classifier = DecisionTree(100)
        classifier.fit(Xtrain, ytrain)

        # output predictions on the remaining data
        y_pred = classifier.predict(Xtest)
        all_accuracies.append(accuracy_score(ytest, y_pred))
        break  # reference implementation: stop after a single trial

    # summarise the decision tree accuracy over the trials that ran
    meanDecisionTreeAccuracy = np.mean(all_accuracies)
    stddevDecisionTreeAccuracy = np.std(all_accuracies)

    # TODO: update these statistics based on the results of your experiment
    meanLogisticRegressionAccuracy = 0
    stddevLogisticRegressionAccuracy = 0
    meanRandomForestAccuracy = 0
    stddevRandomForestAccuracy = 0

    # Three classifiers are reported, so three rows are needed; a 2x2 matrix
    # made the row-2 assignments below raise IndexError.
    stats = np.zeros((3, 2))
    stats[0, 0] = meanDecisionTreeAccuracy
    stats[0, 1] = stddevDecisionTreeAccuracy
    stats[1, 0] = meanRandomForestAccuracy
    stats[1, 1] = stddevRandomForestAccuracy
    stats[2, 0] = meanLogisticRegressionAccuracy
    stats[2, 1] = stddevLogisticRegressionAccuracy
    return stats
def run(retrain=True):
    """Print size, accuracy and split statistics for several decision trees.

    With ``retrain`` equal to ``False`` the trees are loaded from pickles
    under ``./model``; otherwise a baseline tree is fitted, followed by one
    tree per maximum-height option and one per information-gain threshold.
    """
    rule = "-----------------------------"
    # Kept as an equality test (not ``not retrain``) so that values such as
    # ``None`` keep selecting the retraining path.
    use_saved = retrain == False  # noqa: E712

    def report(model):
        """Print height, leaf count, accuracies and per-bin split counts."""
        print("Height: {} | Terminal Nodes: {}".format(model.height,
                                                       model.leaves))
        print("Train Accuracy: ", model.accuracy(train_set, train_labels))
        print("Validation Accuracy: ",
              model.accuracy(validation_set, validation_labels))
        print("Test Accuracy: ", model.accuracy(test_set, test_labels))
        print("Number of times an attribute is used as the splitting function:")
        print("Feature (Polarity Bin): Frequency")
        for column in range(feature_count):
            print('%.2f' % bins[column], ': ',
                  model.attribute_frequency[column])

    baseline = DecisionTree()
    if use_saved:
        baseline = DecisionTree().load('./model/tree.pkl')
    else:
        baseline.fit(train_set, train_labels)

    # Without Early Stopping
    print(rule)
    print("Without Early Stopping")
    print(rule)
    report(baseline)
    print(rule)
    print(rule)

    if use_saved:
        # With Early Stopping: pre-trained trees, one per stopping criterion.
        saved_trees = (
            ('./model/early_stopping/tree_height.pkl',
             "Early Stopping by Max Height = 40"),
            ('./model/early_stopping/tree_ig.pkl',
             "Early Stopping by Information Gain Threshold = 0.01"),
        )
        for pickle_path, title in saved_trees:
            loaded = DecisionTree().load(pickle_path)
            print(title)
            print(rule)
            report(loaded)
            print(rule)
            print(rule)
        return

    for height in [0, 5, 10, 20, 40, 60]:
        capped = DecisionTree(max_height=height)
        capped.fit(train_set, train_labels)
        print("Early Stopping by Max Height = {}".format(height))
        print(rule)
        report(capped)
        print(rule)

    for ig in [0, 1e-4, 1e-3, 1e-2]:
        thresholded = DecisionTree(ig_threshold=ig)
        thresholded.fit(train_set, train_labels)
        print("Early Stopping by Information Gain = {}".format(ig))
        print(rule)
        report(thresholded)
        print(rule)
def test_entropy(self):
    """Labels that all belong to one class have an entropy of zero."""
    single_class_labels = np.array([1, 1])
    tree = DecisionTree()
    entropy = tree._entropy(single_class_labels)
    self.assertAlmostEqual(entropy, 0)
def test_iterate(self):
    """``_iterate`` on a small labelled data set returns a ``DecisionNode``."""
    decision_tree = DecisionTree()
    data = np.array([[1], [2], [3]])
    labels = np.array([0, 0, 1])
    node = decision_tree._iterate(data, labels)
    # assertIsInstance reports the offending object and expected type on
    # failure, unlike assertTrue(isinstance(...)) which only says
    # "False is not true".
    self.assertIsInstance(node, DecisionNode)
def randomize(self):
    """Append ``self.size`` new ``DecisionTree(6, self.limits)`` genes."""
    remaining = self.size
    while remaining > 0:
        self.genes.append(DecisionTree(6, self.limits))
        remaining -= 1
def mutate(self):
    """Replace one randomly chosen gene, with probability 1 in 11.

    A draw from ``random.randint(0, 10)`` decides whether to mutate; only a
    draw of 0 does. The chosen slot receives a new
    ``DecisionTree(5, self.limits)``.
    """
    if random.randint(0, 10) != 0:
        return
    slot = random.randint(0, self.size - 1)
    self.genes[slot] = DecisionTree(5, self.limits)
"Rating": "category"} if __name__ == "__main__": # Move to data directory #cwd = os.getcwd() #os.chdir(cwd + "\\data") for i in range(1, 5): # Read in synth_data synth_data = pd.read_csv("data/synthetic-{}.csv".format(i), names=synth_data_names, dtype=synth_data_types) print("synthetic-{} data".format(i)) # Train tree (default bin values of 10) tree = DecisionTree(10) tree.fit(synth_data, limit=3) print(RenderTree(tree.root)) results = tree.predict(synth_data) num_correct = sum(row["label"] == synth_data.at[index, "label"] for index, row in results.iterrows()) accuracy = num_correct / len(synth_data) print("accuracy: {}".format(accuracy)) # Plot decision surface of best decision tree for current data; # Following example plots from scikitlearn: # https://scikit-learn.org/0.15/auto_examples/tree/plot_iris.html # Make subplot plt.subplot(2, 2, i, title="synthetic-{} data".format(i))
def __init__(self, num_trees):
    """Store the tree count and build that many default decision trees."""
    self.num_trees = num_trees
    trees = []
    for _ in range(num_trees):
        trees.append(DecisionTree())
    self.decision_trees = trees
counts_array = np.zeros((len(X), len(self._classes))) # n_samples x n_classes for row_index, count in enumerate(counts): for class_index, class_ in enumerate(self._classes): counts_array[row_index, class_index] = count[class_] proba = counts_array / self._n_trees # 規格化する return proba if __name__ == '__main__': iris = datasets.load_iris() X = iris.data y = iris.target X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42) dt = DecisionTree() dt.fit(X_train, y_train) rf = RandomForest() rf.fit(X_train, y_train) print('DecisionTree: ') # dt_predicted_y_train = dt.predict(X_train) # print(' predicted_y_train: {}'.format(dt_predicted_y_train)) # print(' (actual) : {}'.format(y_train)) print(' score_train: {}'.format(dt.score(X_train, y_train))) # dt_predicted_y_test = dt.predict(X_test) # print(' predicted_y_test: {}'.format(dt_predicted_y_test)) # print(' (actual) : {}'.format(y_test)) print(' score_test: {}'.format(dt.score(X_test, y_test)))
f.tight_layout() return f def plot3d(data, classes): f = plt.figure() ax = f.add_subplot(111, projection='3d') ax.scatter([p[0] for p in data], [p[1] for p in data], [p[2] for p in data], c=[classlabel for classlabel in classes]) return f if __name__ == '__main__': data, classes = load_data() f = plot3d(data, classes) f = plot2d(data, classes) tree = DecisionTree(data, classes, maxDepth=2) v1 = np.array([4.1, -0.1, 2.2]) v2 = np.array([6.1, 0.4, 1.3]) print("Prediction for {}: {}".format(v1, tree.predict(v1))) print("Prediction for {}: {}".format(v2, tree.predict(v2))) newdata = list() newclasses = list() for x1 in np.linspace(0, 10, 10): for x2 in np.linspace(-1, 1, 10): for x3 in np.linspace(0, 6, 10): v = np.array([x1, x2, x3]) newdata.append(v) newclasses.append(tree.predict(v))
action='store', dest='file', help='name of the dataset', required=True) argparser.add_argument( '-m', '-measure', action='store', dest='measure', help= 'Choose criterion, it can be information_gain or gini. By default: information_gain', default='information_gain') argparser.add_argument('-p', '-prune', action='store', dest='prune', help='Set prune. By default: ""', default="") results = argparser.parse_args() dataset = pd.read_csv(f'datasets/{results.file}') attribute_list = list(dataset.columns)[:-1] x_data, y_data = np.split(dataset, [-1], axis=1) x_train, x_test, y_train, y_test = train_test_split(x_data, y_data) decision_tree = DecisionTree(results.measure, results.prune) decision_tree.fit(x_train, y_train, attribute_list) print(decision_tree.score(x_test, y_test))
def __init__(self, num_trees):
    """Create an ensemble holding ``num_trees`` default ``DecisionTree``s."""
    self.num_trees = num_trees
    ensemble = []
    while len(ensemble) < num_trees:
        ensemble.append(DecisionTree())
    self.decision_trees = ensemble
def evaluate_performance():
    '''
    Measure decision tree and random forest accuracy on the SPECTF data.

    Runs 1,000 trials; each trial reshuffles the data, trains on the first
    100 instances and scores on the rest.

    Return:
      a 2x2 matrix with the following entries:
      stats[0,0] = mean accuracy of decision tree
      stats[0,1] = std deviation of decision tree accuracy
      stats[1,0] = mean accuracy of random forest
      stats[1,1] = std deviation of random forest accuracy
    '''
    # Load Data: first column is the label, the rest are features.
    data = np.loadtxt('data/SPECTF.dat', delimiter=',')
    X = data[:, 1:]
    y = np.array([data[:, 0]]).T
    n = X.shape[0]

    split_at = 100  # number of leading instances used for training
    accuracies_tree = []
    accuracies_forest = []

    for trial in range(1000):
        if trial % 100 == 0:
            print(trial)  # progress marker every 100 trials

        # Shuffle features and labels with the same permutation.
        order = np.arange(n)
        np.random.shuffle(order)
        X = X[order]
        y = y[order]

        Xtrain, Xtest = X[:split_at, :], X[split_at:, :]
        ytrain, ytest = y[:split_at, :], y[split_at:, :]

        # The tree is fitted on rows of features with the label appended.
        train_rows = np.hstack((Xtrain, ytrain)).tolist()
        tree_model = DecisionTree(100)
        fitted_tree = tree_model.fit(train_rows)
        tree_predictions = tree_model.predict(Xtest, fitted_tree, [])
        accuracies_tree.append(accuracy_score(ytest, tree_predictions))

        forest_model = RandomForest(10, 100)
        fitted_forest = forest_model.fit(Xtrain, ytrain)
        forest_predictions, _ = forest_model.predict(Xtest, fitted_forest)
        accuracies_forest.append(accuracy_score(ytest, forest_predictions))

    # Row 0: decision tree, row 1: random forest; columns are (mean, std).
    stats = np.array(
        [[np.mean(accuracies_tree), np.std(accuracies_tree)],
         [np.mean(accuracies_forest), np.std(accuracies_forest)]],
        dtype=float)
    return stats
# PLOT RESULT utils.plotClassifier(model, X, y) fname = os.path.join("..", "figs", "q2_2_decisionBoundary.pdf") plt.savefig(fname) print("\nFigure saved as '%s'" % fname) elif question == "2.3": # 1. Load citiesSmall dataset dataset = load_dataset("citiesSmall.pkl") X = dataset["X"] y = dataset["y"] # 2. Evaluate decision tree model = DecisionTree(max_depth=2) model.fit(X, y) y_pred = model.spredict(X) error = np.mean(y_pred != y) print("Error: %.3f" % error) elif question == "2.4": dataset = load_dataset("citiesSmall.pkl") X = dataset["X"] y = dataset["y"] print("n = %d" % X.shape[0]) depths = np.arange(1, 15) # depths to try
def get_data_set():
    """Load ``xg.csv`` into a list of per-row sample dictionaries.

    Returns:
        A list with one dict per CSV row holding ``'id'`` (first column),
        ``'class'`` (last column) and ``'feature'`` (a list of every column
        in between).
    """
    data_set = list()
    df = pd.read_csv('xg.csv')
    for _, row in df.iterrows():
        # Use .iloc for positional access: integer keys on a Series with
        # string labels rely on a deprecated fallback in Series.__getitem__.
        data_set.append({
            'id': row.iloc[0],
            'class': row.iloc[-1],
            'feature': list(row.iloc[1:-1]),
        })
    return data_set


if __name__ == '__main__':
    d_set = get_data_set()
    train_set = d_set[0: 10]
    test_set = d_set[10:]
    # train_set = d_set
    print(len(train_set))
    print(len(test_set))
    dt = DecisionTree(train_set, [0, 1, 2, 3, 4, 5])
    t = dt.get_tree_dict()
    # Close the output file deterministically instead of leaking the handle.
    with open('p.json', 'w') as tree_file:
        json.dump(t, tree_file)
    correction = sum(
        1 for data in test_set if data['class'] == dt.predict(data))
    # Fewer than 11 rows leaves the test set empty; avoid dividing by zero.
    accuracy = correction / len(test_set) if test_set else 0.0
    print('accuracy', accuracy)
def test_square_loss(self):
    """Identical labels produce a square loss of zero."""
    constant_labels = np.array([0, 0])
    tree = DecisionTree()
    loss = tree._square_loss(constant_labels)
    self.assertAlmostEqual(loss, 0)
te_error = np.mean(y_pred != y_test) print("Training error: %.3f" % tr_error) print("Testing error: %.3f" % te_error) elif question == "1.1": with open(os.path.join('..', 'data', 'citiesSmall.pkl'), 'rb') as f: dataset = pickle.load(f) X, y = dataset["X"], dataset["y"] X_test, y_test = dataset["Xtest"], dataset["ytest"] depths = np.arange(1, 15) # depths to try my_tree_training_errors = np.zeros(depths.size) for i, max_depth in enumerate(depths): model = DecisionTree(max_depth=max_depth) model.fit(X, y) y_pred = model.predict(X) my_tree_training_errors[i] = np.mean(y_pred != y) plt.plot(depths, my_tree_training_errors, label="trainingerrorrate") my_tree_testing_errors = np.zeros(depths.size) for i, max_depth in enumerate(depths): model = DecisionTree(max_depth=max_depth) model.fit(X_test, y_test) y_pred = model.predict(X_test) my_tree_testing_errors[i] = np.mean(y_pred != y_test) plt.plot(depths, my_tree_testing_errors, label="testingerrorrate") plt.xlabel("Depth of tree") plt.ylabel("Classification error")
def test_gini(self):
    """Two labels from two different classes give a Gini value of 0.5."""
    balanced_labels = np.array([0, 1])
    tree = DecisionTree()
    impurity = tree._gini(balanced_labels)
    self.assertAlmostEqual(impurity, 0.5)
correct = 0 training_df = DataFrame.from_array(training_set, ['bmi', 'weight', 'class']) decision_tree.fit(training_df) for test in testing_set: test_dict = {'bmi': test[0], 'weight': test[1]} if forest: prediction = decision_tree.predict(test_dict) else: prediction = decision_tree.classify(test_dict) if prediction == test[2]: correct += 1 return correct, len(testing_set) dt = DecisionTree('gini') total_correct = 0 total = 0 for i in range(len(splits[0])): print(i + 1, 'testing set') results = run_tests(splits[0][i], splits[1][i], dt) total += results[1] total_correct += results[0] print(total_correct, total) forests = [1, 10, 100, 1000] for num in forests: dt = RandomForest(num)
def __init__(self, num_trees, max_depth=-1, min_gain=0):
    """Build ``num_trees`` default decision trees and record the settings.

    ``max_depth`` and ``min_gain`` are only stored on the instance here;
    this constructor does not pass them to the individual trees.
    """
    self.num_trees = num_trees
    forest = []
    for _ in range(num_trees):
        forest.append(DecisionTree())
    self.decision_trees = forest
    self.max_depth = max_depth
    self.min_gain = min_gain
new_feature_matrix = np.concatenate((feature_matrix, target.T), axis=1) # print(len(new_feature_matrix[0])) # print(len(feature_matrix)) # print(len(feature_matrix[0])) with open('feature_matrix.csv', 'w', newline='') as myfile: wr = csv.writer(myfile, quoting=csv.QUOTE_ALL) for row in feature_matrix: wr.writerow(row) df = pd.DataFrame(new_feature_matrix) df = df.iloc[1:] # print(df.head()) decision_model = DecisionTree(df) X_train, X_test, y_train, y_test = decision_model.split_dataset() gini_clf = decision_model.train_gini(X_train, y_train) entropy_clf = decision_model.train_entropy(X_train, y_train) print("using gini index:") gini_pred = decision_model.prediction(X_test, gini_clf) decision_model.accuracy(y_test, gini_pred) print("using entropy:") entropy_pred = decision_model.prediction(X_test, entropy_clf) decision_model.accuracy(y_test, entropy_pred) filename = input(f"Please enter the file name:\n") path = os.getcwd() test_data = Preprocess(path + '\\' + filename)
['bmi', 'weight', 'class']) decision_tree.fit(training_df) for test in testing_set: test_dict = {'bmi': test[0], 'weight': test[1]} if forest: prediction = decision_tree.predict(test_dict) else: prediction = decision_tree.classify(test_dict) if prediction == test[2]: correct += 1 else: indices.append(df.to_array().index(test)) return correct, len(testing_set) dt = DecisionTree('gini', max_depth=4) total_correct = 0 total = 0 # for i in range(len(splits[0])): # if i == 0: # print(len(splits[1][i])) # print(i+1,'testing set') print('HALFWAY TRAINING SET') results = run_tests(splits[0], splits[1], dt) total += results[1] total_correct += results[0] dt_results = {} dt_results['dt_gini'] = round(total_correct / total, 4)
utils.plotClassifier(model, X, y) fname = os.path.join("..", "figs", "q2_1_decisionBoundary.pdf") plt.savefig(fname) print("\nFigure saved as '%s'" % fname) elif question == "2.3": # Q2.3 - Decision Tree with depth 2 # 1. Load citiesSmall dataset dataset = utils.load_dataset("citiesSmall") X = dataset["X"] y = dataset["y"] # 2. Evaluate decision tree model = DecisionTree(max_depth=2) model.fit(X, y) y_pred = model.predict(X) error = np.mean(y_pred != y) print("Error: %.3f" % error) elif question == "2.4": dataset = utils.load_dataset("citiesSmall") X = dataset["X"] y = dataset["y"] print("n = %d" % X.shape[0]) depths = np.arange(1,15) # depths to try
# split data into train, test train, test = train_test_split(data, test_size = 0.2, random_state = 42) # use libary decision tree to fit data and predict start = time.time() clf = tree.DecisionTreeClassifier() clf = clf.fit(train[:,:-1], train[:,-1]) sktree_predictions = clf.predict(test[:,:-1]) end = time.time() score = accuracy_score(test[:,-1], sktree_predictions) print(f'Time for sklearn decision tree to finish {datafetcher.name} ({datafetcher.feature} features, {datafetcher.len} observations) fitting and prediction is {end-start:.4f} seconds. Accuracy score is {score}.') # tree.plot_tree(clf) # use our decision tree to fit data and predict # prepare data for my tree start = time.time() my_tree = DecisionTree() my_tree.fit(header[:-1], train) my_predictions = [] i = 0 for observation in test: predicted = my_tree.predict(my_tree.tree, observation) my_predictions.append(predicted) end = time.time() my_score = accuracy_score(test[:, -1], my_predictions) print(f'Time for my decisioin tree to finish {datafetcher.name} ({datafetcher.feature} features, {datafetcher.len} observations) fitting and prediction is {end-start:.4f} seconds. Accuracy score is {my_score}.') # my_tree.print_tree()
plt.xlabel("x-coordinate") plt.ylabel("y-coordinate") plt.legend() fname = os.path.join("..", "figs", "q6_3_decisionBoundary.pdf") plt.savefig(fname) print("\nFigure saved as '%s'" % fname) elif question == "6.4": with open(os.path.join('..', 'data', 'citiesSmall.pkl'), 'rb') as f: dataset = pickle.load(f) X = dataset["X"] y = dataset["y"] model = DecisionTree(max_depth=2, stump_class=DecisionStumpInfoGain) model.fit(X, y) y_pred = model.predict(X) error = np.mean(y_pred != y) print("Error: %.3f" % error) utils.plotClassifier(model, X, y) plt.xlabel("x-coordinate") plt.ylabel("y-coordinate") plt.legend() fname = os.path.join("..", "figs", "q6_4_decisionBoundary.pdf") plt.savefig(fname)
def __init__(self, num_trees):
    """Remember ``num_trees`` and instantiate one ``DecisionTree`` per slot."""
    self.num_trees = num_trees
    self.decision_trees = list(
        DecisionTree() for _ in range(num_trees))
def create_tree(verbose=False):
    """
    Train decision trees over a range of ``n_min`` values and keep the one
    with the lowest score on the tuning period.

    Dates up to 2016-04-30 form the training set, later dates up to
    2017-04-30 the tuning set, and everything after that the testing set.

    Parameters
    ----------
    verbose: boolean
        When true, print the scores for every candidate, render each tree,
        and print the final tuning and testing scores.

    Returns
    -------
    tree: DecisionTree
        The tree with the lowest tuning score, or None if no candidate
        scored below the initial bound of 1e10.
    """
    date_format = '%Y-%m-%d'

    # Load the data.
    trips = get_trips()
    arrival_times_df = get_arrival_times(trips)
    # Assume nan means that the train is late.
    arrival_times_df.fillna(value=30, inplace=True)

    # Split the data into training, tuning and testing sets.
    training_dates = []
    tuning_dates = []
    testing_dates = []
    last_training_day = datetime.datetime.strptime('2016-04-30', date_format)
    last_tuning_day = datetime.datetime.strptime('2017-04-30', date_format)
    for datestr in arrival_times_df.columns:
        this_date = datetime.datetime.strptime(datestr, date_format)
        if this_date <= last_training_day:
            training_dates.append(datestr)
        # elif, not if: otherwise every training date also lands in the
        # tuning set and the tuning score is contaminated by training data.
        elif this_date <= last_tuning_day:
            tuning_dates.append(datestr)
        else:
            testing_dates.append(datestr)

    training_df = arrival_times_df.loc[:, training_dates]
    tuning_df = arrival_times_df.loc[:, tuning_dates]
    testing_df = arrival_times_df.loc[:, testing_dates]

    training_features_df = create_features(list(training_df.columns))
    judge = Judge(training_df)

    # Tune our hyperparameter.
    # Iterate over values for n_min; a lower tuning score is better.
    best_tuning_score = 1e10
    best_n_min = 0
    best_tree = None
    for n_min in range(10, 100, 10):
        tree = DecisionTree(err_fn=judge.find_total_absolute_deviation,
                            n_min=n_min)
        tree.train(training_features_df)
        training_score = evaluate(tree, training_df)
        tuning_score = evaluate(tree, tuning_df)

        if tuning_score < best_tuning_score:
            best_tuning_score = tuning_score
            best_n_min = n_min
            best_tree = tree

        if verbose:
            print('n_min', n_min)
            print('training', training_score)
            print('tuning', tuning_score)
            tree.render()

    testing_score = evaluate(best_tree, testing_df)

    if verbose:
        print('best_n_min', best_n_min)
        print('best_tuning', best_tuning_score)
        print('testing score', testing_score)

    return best_tree