########### AdaBoostClassifier on Real Input and Discrete Output ###################
import numpy as np
import pandas as pd

from tree.base import DecisionTree
from metrics import *
# AdaBoostClassifier is the project's own implementation; its import path is
# not shown in this snippet.
print("-----------------------------------------------------------")
print("Decision stump on random data")
print("-----------------------------------------------------------")
N = 30
P = 2
NUM_OP_CLASSES = 2
n_estimators = 3
X = pd.DataFrame(np.abs(np.random.randn(N, P)))
y = pd.Series(np.random.randint(NUM_OP_CLASSES, size=N), dtype="category")

criteria = 'information_gain'
tree = DecisionTree(criterion=criteria)
n_samples = X.shape[0]
sample_weights = [1 / n_samples] * n_samples  # uniform weights for a single stump
tree.fit(X, y, sample_weights)
yhat = pd.Series(tree.predict(X))
print('Criteria :', criteria)
print('Accuracy: ', accuracy(yhat, y))
for cls in y.unique():
    print("***Class :" + str(cls) + "***")
    print('Precision: ', precision(yhat, y, cls))
    print('Recall: ', recall(yhat, y, cls))
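
# accuracy, precision, and recall come from the project's metrics module, which
# is not shown here. A minimal sketch consistent with how they are called in
# these snippets (hypothetical `_sketch` names, not the original implementation):
import numpy as np

def accuracy_sketch(y_hat, y):
    # fraction of predictions matching the ground truth
    return float(np.mean(np.asarray(y_hat) == np.asarray(y)))

def precision_sketch(y_hat, y, cls):
    # among samples predicted as `cls`, the fraction that truly are `cls`
    y_hat, y = np.asarray(y_hat), np.asarray(y)
    predicted = y_hat == cls
    return float(np.mean(y[predicted] == cls)) if predicted.any() else 0.0

def recall_sketch(y_hat, y, cls):
    # among samples that truly are `cls`, the fraction predicted as `cls`
    y_hat, y = np.asarray(y_hat), np.asarray(y)
    actual = y == cls
    return float(np.mean(y_hat[actual] == cls)) if actual.any() else 0.0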

print("-----------------------------------------------------------")
print("Adaboost on random data")
print("-----------------------------------------------------------")

Classifier_AB = AdaBoostClassifier(base_estimator=tree,
                                   n_estimators=n_estimators)
Classifier_AB.fit(X, y)
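
# The original snippet ends right after fit; a plausible evaluation step,
# mirroring the stump metrics above (a sketch, not part of the original code):
y_hat_ab = pd.Series(Classifier_AB.predict(X))
print('Accuracy: ', accuracy(y_hat_ab, y))
for cls in y.unique():
    print("***Class :" + str(cls) + "***")
    print('Precision: ', precision(y_hat_ab, y, cls))
    print('Recall: ', recall(y_hat_ab, y, cls))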
########### Decision Tree on the Real Estate Dataset ###################
# `estate`, `split`, and `max_depth` are not defined in this snippet; plausible
# values, following the real-estate example further below (assumptions):
estate = pd.read_csv('Real_estate.csv', index_col='No', dtype=float)
split = 0.7      # assumed 70:30 train-test split
max_depth = 5    # assumed depth cap; adjust as needed
shuffled = estate.sample(frac=1).reset_index(drop=True)

# Preprocessing
X = shuffled.iloc[:, :-1]
y = shuffled.iloc[:, -1]
len_estate = len(y)

# Splitting data
split_idx = int(split * len_estate)
X_train, y_train = X.loc[:split_idx], y.loc[:split_idx]
X_test = X.loc[split_idx + 1:].reset_index(drop=True)
y_test = y.loc[split_idx + 1:].reset_index(drop=True)

# Learning tree
print("Please wait for some time, it takes time, you can change max depth if it takes too long time.")
tree = DecisionTree(criterion="information_gain", max_depth=max_depth)
tree.fit(X_train, y_train)
tree.plot()

# Printing accuracies for different depths
for depth in range(2, max_depth+1):
    y_hat = tree.predict(X_test, max_depth=depth)
    print("Depth: ", depth)
    print('\tRMSE: ', rmse(y_hat, y_test))
    print('\tMAE: ', mae(y_hat, y_test))
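
# Optional: collect the per-depth test errors and plot them (a sketch; assumes
# matplotlib, which the later examples import, and the same predict API):
import matplotlib.pyplot as plt

depths = list(range(2, max_depth + 1))
test_rmse = [rmse(tree.predict(X_test, max_depth=d), y_test) for d in depths]
plt.plot(depths, test_rmse, marker='o')
plt.xlabel('max depth')
plt.ylabel('test RMSE')
plt.title('Test RMSE vs. tree depth')
plt.show()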

# Decision Tree Regressor from Sci-kit learn
dt = DecisionTreeRegressor(random_state=0)
dt.fit(X_train, y_train)
y_hat = pd.Series(dt.predict(X_test))

print('Sklearn RMSE: ', rmse(y_hat, y_test))
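
# rmse and mae also come from the project's metrics module; a minimal sketch
# consistent with how they are called here (hypothetical `_sketch` names to
# avoid shadowing the real functions):
import numpy as np

def rmse_sketch(y_hat, y):
    # root-mean-squared error between predictions and targets
    return float(np.sqrt(np.mean((np.asarray(y_hat) - np.asarray(y)) ** 2)))

def mae_sketch(y_hat, y):
    # mean absolute error
    return float(np.mean(np.abs(np.asarray(y_hat) - np.asarray(y))))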
########### Example 3 ###################
import numpy as np
import pandas as pd

from tree.base import DecisionTree
from metrics import *

np.random.seed(42)
# Test case 1
# Real Input and Real Output

N = 30
P = 5
X = pd.DataFrame(np.random.randn(N, P))
y = pd.Series(np.random.randn(N))


for criteria in ['information_gain', 'gini_index']:
    tree = DecisionTree(criterion=criteria)  # split based on the current criterion
    tree.fit(X, y)
    y_hat = tree.predict(X)
    tree.plot()
    print('Criteria :', criteria)
    print('RMSE: ', rmse(y_hat, y))
    print('MAE: ', mae(y_hat, y))

# Test case 2
# Real Input and Discrete Output

N = 30
P = 5
X = pd.DataFrame(np.random.randn(N, P))
y = pd.Series(np.random.randint(P, size = N), dtype="category")

for criteria in ['information_gain', 'gini_index']:
    tree = DecisionTree(criterion=criteria)
    tree.fit(X, y)
    y_hat = tree.predict(X)
    tree.plot()
    print('Criteria :', criteria)
    print('Accuracy: ', accuracy(y_hat, y))
    for cls in y.unique():
        print('Precision: ', precision(y_hat, y, cls))
        print('Recall: ', recall(y_hat, y, cls))
########### Example 4 ###################
# This snippet starts mid-file: X and y (iris features and string labels) are
# loaded earlier in the original.
n_estimators = 3  # assumed, as in the first example
# Binarise the labels: virginica vs. everything else
y = y.where(y == 'Iris-virginica', 'not virginica')
N = len(y)
t = int(np.floor(0.6*N))
X_train = X.iloc[:t,:]
y_train = y[:t]
X_test = X.iloc[t:,:]
y_test = list(y[t:])
y_test = pd.Series(y_test)
criteria = 'information_gain'
tree = DecisionTree(criterion=criteria,max_depth=1)
Classifier_AB = AdaBoostClassifier(base_estimator=tree, n_estimators=n_estimators)
Classifier_AB.fit(X_train, y_train)
y_hat = Classifier_AB.predict(X_test)
# [fig1, fig2] = Classifier_AB.plot()
print('Criteria :', criteria)
print('Accuracy: ', accuracy(y_hat, y_test))
for cls in y.unique():
    print("***Class :" + str(cls) + "***")
    print('Precision: ', precision(y_hat, y_test, cls))
    print('Recall: ', recall(y_hat, y_test, cls))

print("\nDECISION STUMP")
tree.fit(X_train, y_train, np.ones(t)/t)  # weights must match the t training samples
y_hat = tree.predict(X_test)
print('Criteria :', criteria)
print('Accuracy: ', accuracy(y_hat, y_test))
for cls in y.unique():
    print("***Class :" + str(cls) + "***")
    print('Precision: ', precision(y_hat, y_test, cls))
    print('Recall: ', recall(y_hat, y_test, cls))
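
# For reference, a minimal sketch of the discrete AdaBoost update that an
# AdaBoostClassifier over depth-1 stumps typically implements. This is NOT the
# project's implementation; it uses sklearn stumps and assumes labels in {-1, +1}:
import numpy as np
from sklearn.tree import DecisionTreeClassifier

def adaboost_fit_sketch(X, y, n_estimators=3):
    n = len(y)
    w = np.ones(n) / n                         # uniform initial sample weights
    stumps, alphas = [], []
    for _ in range(n_estimators):
        stump = DecisionTreeClassifier(max_depth=1)
        stump.fit(X, y, sample_weight=w)
        pred = stump.predict(X)
        err = np.clip(np.sum(w * (pred != y)), 1e-10, 1 - 1e-10)  # weighted error
        alpha = 0.5 * np.log((1 - err) / err)  # stump weight from weighted error
        w *= np.exp(-alpha * y * pred)         # up-weight misclassified samples
        w /= w.sum()
        stumps.append(stump)
        alphas.append(alpha)
    return stumps, alphas

def adaboost_predict_sketch(stumps, alphas, X):
    # sign of the alpha-weighted vote over all stumps
    votes = sum(a * s.predict(X) for s, a in zip(stumps, alphas))
    return np.sign(votes)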
########### Example 5 ###################
# `data` (a DataFrame with the target in its last column) is loaded earlier in
# the original file. 70:30 train-test split:
train_test_split = int(0.7 * data.shape[0])

X = data.iloc[:train_test_split, :-1]
X_test = data.iloc[train_test_split:, :-1]
y = data.iloc[:train_test_split, -1]
y_test = data.iloc[train_test_split:, -1]


maxdepth = 4

# Building a decision tree with my model
criteria = 'information_gain'
mytree = DecisionTree(criterion=criteria, max_depth=maxdepth) #Split based on Inf. Gain
mytree.fit(X, y)
mytree.plot()

print("My Model")
y_hat = mytree.predict(X)
print("Train Scores:")
print('\tRMSE: ', rmse(y_hat, y))
print('\tMAE: ', mae(y_hat, y))

y_test_hat = mytree.predict(X_test)
print("Test Scores:")
print('\tRMSE: ', rmse(y_test_hat, y_test))
print('\tMAE: ', mae(y_test_hat, y_test))
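
# As in the other examples, a sklearn baseline could be run on the same split
# for comparison (a sketch; reuses the X/y split defined above):
from sklearn.tree import DecisionTreeRegressor

sk_tree = DecisionTreeRegressor(max_depth=maxdepth, random_state=0)
sk_tree.fit(X, y)
y_sk = pd.Series(sk_tree.predict(X_test))
print('Sklearn RMSE: ', rmse(y_sk, y_test))
print('Sklearn MAE: ', mae(y_sk, y_test))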

###################################################################################
########### Example 6 ###################
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeRegressor

from tree.base import DecisionTree
from metrics import *

np.random.seed(42)

# Read real-estate data set
# ...
#
estate = pd.read_csv('Real_estate.csv', index_col='No', dtype=float)
estate = estate.sample(frac=1).reset_index(drop=True)
split_at = int(0.3 * estate.shape[0])  # note: 30% train, 70% test in this example
X_train = estate.iloc[:split_at, :-1]
y_train = estate.iloc[:split_at, -1]
X_test = estate.iloc[split_at:, :-1]
y_test = estate.iloc[split_at:, -1]

model = DecisionTree(max_depth=2)
model.fit(X_train, y_train)
y_out = model.predict(X_test)
print("Rmse is: ", rmse(y_out, y_test))
print("Mae is: ", mae(y_out, y_test))

model2 = DecisionTreeRegressor(max_depth=2)
model2.fit(X_train, y_train)
y_out = model2.predict(X_test)
print("Rmse of Sklearn is: ", rmse(y_out, y_test))
print("Mae of Sklearn is: ", mae(y_out, y_test))
np.random.seed(42)

# Read IRIS data set
# ...

#
dataset = load_iris()
X, y = dataset.data, dataset.target

#from sklearn.utils import shuffle
#X, y = shuffle(X, y, random_state=0)

print("fit model for iris dataset for 70-30 division")

clf = DecisionTree(criterion="a", max_depth=5)
clf.fit(pd.DataFrame(X[0:120]), pd.Series(y[0:120], dtype="category"))
y_test = pd.Series(y[120:])
y_hat = pd.Series(clf.predict(pd.DataFrame(X[120:])))
print("Accuracy", accuracy(y_hat, y_test))

for cls in y_test.unique():
    print('Precision for class', cls, ':', precision(y_hat, y_test, cls))
    print('Recall for class', cls, ':', recall(y_hat, y_test, cls))


def cross_validation_5_fold(X, y, depth):
    X_original = X
    y_original = y

    clf = DecisionTree(criterion="a", max_depth=depth)