-
Notifications
You must be signed in to change notification settings - Fork 0
/
decision_tree_boosted.py
65 lines (57 loc) · 2.82 KB
/
decision_tree_boosted.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import numpy as np
import pandas as pd
import decision_tree
import cross_vali
import data_preprocess as dp
def sample(data, weight):
sample_indices = np.random.choice(len(data), size=len(data), replace=True, p=weight)
bootstrap_x = data.iloc[sample_indices, :] # sampled training x
bootstrap_y = data.iloc[sample_indices, -1] # sampled training y
bootstrap_weight = [weight[s] for s in sample_indices] # sampled training weight
return bootstrap_x, bootstrap_y, bootstrap_weight
def classify_boosting(train, test, n_rounds):
weight = np.full(len(train), 1 / len(train)) # weight for all records
boosting_sum = np.zeros((n_rounds, len(test))) # sum matrix for all rounds
restart = False
for i in range(n_rounds):
# for each round, sample the training set with replacement according to weight
bootstrap_train_x, bootstrap_train_y, bootstrap_weight = sample(train, weight)
# generate decision tree
dt = decision_tree.create_tree(bootstrap_train_x, 0, 3)
# apply decision tree on training dataset
result_train = train.apply(lambda r: decision_tree.predict(r, dt), axis=1)
# apply decision tree on testing dataset
result_test = test.apply(lambda r: decision_tree.predict(r, dt), axis=1)
# miss = 1 if misclassified else 0
miss = np.logical_xor(result_train.values, bootstrap_train_y)
# error = sum(miss(i)*weight(i))/sum(weight(i))
error = np.sum(np.multiply(weight, miss)) / np.sum(weight)
# if error > 0.5 then start over
if (error > 0.5):
restart = True
break
# alpha(classifier weight) = 1/2 * ln(1- error / error).
alpha = 0.5 * np.log((1 - error) / error)
# calculate sum of alpha * y_test for this round
boosting_sum[i,:] = np.multiply([float(1 if r > 0 else -1) for r in result_test], alpha)
# update weight
# new weight = e ^ (alpha * miss)
weight = np.multiply(weight, np.exp([float(1 if m > 0 else -1) * alpha for m in miss]))
# normalize weight
weight = [float(w) / sum(weight) for w in weight]
if not restart:
# get final prediction based on the sum of weighted prediction of all rounds
classification = np.sign(boosting_sum.sum(axis=0))
classification = [1 if c > 0 else 0 for c in classification] # convert -1s to 0s
return classification
else:
return classify_boosting(train, test, n_rounds)
def boosting(x, y, test, n_rounds):
train = np.concatenate((np.array(x),np.array([y]).T), axis=1)
train = pd.DataFrame(train)
test = pd.DataFrame(test)
test['label'] = 0 # here I fill the label column with dummmy values to make its dimension the same as the training set
return classify_boosting(train, test, n_rounds)
if __name__ == "__main__":
x, y = dp.data_read("project3_dataset1.txt")
cross_vali.cross_validation(x, y, boosting, 2)