Example #1
from time import time  # CreateFakeData and DecisionTree are assumed to be defined elsewhere

def find_Time(case):
    # Fit/predict times while varying N (P fixed at 5) and while varying P (N fixed at 100).
    axis_Nf = [0] * 20  # fit times for varying N
    axis_Np = [0] * 20  # predict times for varying N
    axis_Pf = [0] * 11  # fit times for varying P
    axis_Pp = [0] * 11  # predict times for varying P
    print("Started 1")
    for i in range(100, 500, 20):
        X, y = CreateFakeData(i, 5, case)  # P fixed at 5
        mod = DecisionTree()
        st1 = time()
        mod.fit(X, y)
        ed1 = time()
        st2 = time()
        y_ = mod.predict(X)
        ed2 = time()
        axis_Nf[(i - 100) // 20] = ed1 - st1
        axis_Np[(i - 100) // 20] = ed2 - st2
    print("Started 2")
    for i in range(2, 24, 2):
        X, y = CreateFakeData(100, i, case)  # N fixed at 100
        mod = DecisionTree()
        st1 = time()
        mod.fit(X, y)
        ed1 = time()
        st2 = time()
        y_ = mod.predict(X)
        ed2 = time()
        axis_Pf[(i - 2) // 2] = ed1 - st1
        axis_Pp[(i - 2) // 2] = ed2 - st2
    return axis_Nf, axis_Np, axis_Pf, axis_Pp
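A possible way to use the timings above, as a minimal sketch (assumes matplotlib; the x-axis mirrors the first loop in find_Time):

import matplotlib.pyplot as plt

axis_Nf, axis_Np, axis_Pf, axis_Pp = find_Time(1)
plt.plot(range(100, 500, 20), axis_Nf, label='fit')
plt.plot(range(100, 500, 20), axis_Np, label='predict')
plt.xlabel('N (P fixed at 5)')
plt.ylabel('time (s)')
plt.legend()
plt.show()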
Example #2
import time  # createFakeData, DecisionTree and plotTimings are assumed to be defined elsewhere

def analyseTime(case):
    assert (1 <= case <= 4)
    fitTimes = {'N': list(), 'P': list(), 'time': list()}
    predictTimes = {'N': list(), 'P': list(), 'time': list()}
    for N in range(40, 50):
        for P in range(2, 10):
            print("Running with N", N, "and P", P)
            X, y = createFakeData(N, P, case)
            tree = DecisionTree(criterion="information_gain", max_depth=3)

            startTime = time.time()
            tree.fit(X, y)
            endTime = time.time()
            fitTimes['N'].append(N)
            fitTimes['P'].append(P)
            fitTimes['time'].append(endTime - startTime)

            startTime = time.time()
            y_hat = tree.predict(X)
            endTime = time.time()
            predictTimes['N'].append(N)
            predictTimes['P'].append(P)
            predictTimes['time'].append(endTime - startTime)

    plotTimings(fitTimes)
    plotTimings(predictTimes)
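plotTimings is not shown in this example; a minimal sketch consistent with the dict layout above (an assumption, not the original helper):

import matplotlib.pyplot as plt

def plotTimings(timings):
    # Scatter time against N, colouring points by P.
    sc = plt.scatter(timings['N'], timings['time'], c=timings['P'])
    plt.colorbar(sc, label='P')
    plt.xlabel('N')
    plt.ylabel('time (s)')
    plt.show()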
def nested_cross_validation(dataset, y):
    # Outer 5-fold split (150 rows, 30-row folds); an inner 4-fold split selects max_depth.
    for i in range(5):
        test = dataset[30 * i:30 * (i + 1)]
        test_label = y[30 * i:30 * (i + 1)]
        if i == 0:
            train = dataset[30 * (i + 1):]
            train_label = y[30 * (i + 1):]
        else:
            train = np.append(dataset[0:30 * i], dataset[30 * (i + 1):], axis=0)
            train_label = np.append(y[0:30 * i], y[30 * (i + 1):], axis=0)
        accuracy_validation = {}
        for depth in range(1, 11):
            avg_acc = 0
            for j in range(4):
                validation = train[30 * j:30 * (j + 1)]
                validation_label = train_label[30 * j:30 * (j + 1)]
                train_new = np.append(train[30 * (j + 1):], train[0:30 * j], axis=0)
                train_new_label = np.append(train_label[30 * (j + 1):],
                                            train_label[0:30 * j], axis=0)
                tree = DecisionTree(criterion="gini_index", max_depth=depth)
                train_new = pd.DataFrame(train_new)
                train_new_label = pd.Series(train_new_label, dtype="category")
                tree.fit(train_new, train_new_label)
                avg_acc += accuracy(tree.predict(pd.DataFrame(validation)),
                                    validation_label)
            accuracy_validation[depth] = avg_acc / 4
        best_depth = max(accuracy_validation, key=accuracy_validation.get)
        tree = DecisionTree(criterion="gini_index", max_depth=best_depth)
        tree.fit(pd.DataFrame(train), pd.Series(train_label, dtype="category"))
        print("Accuracy is", accuracy(tree.predict(pd.DataFrame(test)), test_label),
              "for iteration", i + 1, "; the depth of the optimal tree is", best_depth)
Example #4
def my_regr(X, y, max_depth=5, criterion="information_gain"):
    """Function to train and predict on estate dataset using my decision tree"""
    clf = DecisionTree(criterion=criterion, max_depth=max_depth)

    clf.fit(pd.DataFrame(X[0:330]), pd.Series(y[0:330]))

    # clf.plot()

    y = y[330:]

    y_hat = clf.predict(pd.DataFrame(X[330:]))

    y = pd.Series(y)

    print(rmse(y_hat, y))
    print(mae(y_hat, y))
def train_and_predict(X, y, max_depth=15):
    """Function to train and predict iris using my decision tree"""
    clf = DecisionTree(criterion="information_gain", max_depth=max_depth)

    clf.fit(pd.DataFrame(X[0:120]), pd.Series(y[0:120], dtype="category"))

    y = y[120:]

    y_hat = clf.predict(pd.DataFrame(X[120:]))

    print("Accuracy", accuracy(pd.Series(y_hat), pd.Series(y)))

    y = pd.Series(y)

    for cls in y.unique():
        print('Precision: ', cls, " : ", precision(y_hat, y, cls))
        print('Recall: ', cls, " : ", recall(y_hat, y, cls))
def fit(self, X, y):
    self.data = X
    self.labels = y
    self.classes = list(set(y))
    n = len(X)
    weights = [1 / n for _ in range(n)]
    for estimator in range(self.n_estimators):
        self.clfs.append(X)
        self.clfsy.append(y)
        Dtree = DecisionTree("information_gain", max_depth=1)
        Dtree.fit(X, y, sample_weights=weights)
        self.estimators_list.append(Dtree)
        # Weighted training error of the stump
        err = 0
        for i in range(n):
            if Dtree.predict(X.iloc[[i]]).iloc[0] != y.iloc[i]:
                err += weights[i]
        alpha = 0.5 * math.log((1 - err) / err)  # natural log, as in standard AdaBoost
        self.alphas.append(alpha)
        # Up-weight misclassified samples, down-weight the rest
        for i in range(n):
            if Dtree.predict(X.iloc[[i]]).iloc[0] != y.iloc[i]:
                weights[i] = weights[i] * math.exp(alpha)
            else:
                weights[i] = weights[i] * math.exp(-alpha)
        # Normalise the weights
        total = sum(weights)
        weights = [w / total for w in weights]
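The matching predict is not part of this snippet; a minimal sketch of the usual weighted vote, assuming the estimators_list, alphas and classes stored above, and Series-returning predictions:

def predict(self, X):
    # Each stump votes for its predicted class with weight alpha;
    # the class with the largest total wins.
    scores = [{c: 0.0 for c in self.classes} for _ in range(len(X))]
    for alpha, clf in zip(self.alphas, self.estimators_list):
        preds = clf.predict(X)
        for i in range(len(X)):
            scores[i][preds.iloc[i]] += alpha
    return pd.Series([max(s, key=s.get) for s in scores])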
def nested_cross(data, y, k1=5, k2=4):
    val1 = len(data) // k1
    depths = list(range(2, 10))
    for i in range(k1):
        y_test = y[val1 * i:val1 * (i + 1)]
        x_test = data[val1 * i:val1 * (i + 1)]
        x_train = np.append(data[0:val1 * i], data[val1 * (i + 1):], axis=0)
        y_train = np.append(y[0:val1 * i], y[val1 * (i + 1):], axis=0)
        acc = []
        for depth in depths:
            s = 0
            val2 = len(x_train) // k2
            for j in range(k2):
                x_val_test = x_train[val2 * j:val2 * (j + 1)]
                y_val_test = y_train[val2 * j:val2 * (j + 1)]
                x_val_train = np.append(x_train[0:val2 * j],
                                        x_train[val2 * (j + 1):],
                                        axis=0)
                y_val_train = np.append(y_train[0:val2 * j],
                                        y_train[val2 * (j + 1):],
                                        axis=0)
                tree = DecisionTree("information_gain", max_depth=depth)
                x_val_train = pd.DataFrame(x_val_train)
                y_val_train = pd.Series(y_val_train, dtype="category")
                x_val_test = pd.DataFrame(x_val_test)
                y_val_test = pd.Series(y_val_test, dtype="category")
                x_val_train.dtype = "d"  # flag apparently consumed by the custom tree (real input)
                x_val_test.dtype = "d"
                tree.fit(x_val_train, y_val_train)
                s += accuracy(np.array(y_val_test),
                              np.array(tree.predict(x_val_test)))
            acc.append(s / k2)
        best_acc = max(acc)
        best_depth = depths[acc.index(best_acc)]
        tree = DecisionTree("information_gain", max_depth=best_depth)
        print("Best Accuracy is : - " + str(best_acc))
        print("At Depth : - " + str(best_depth))
    "data", "Real estate valuation data set.xlsx"))
shuffled = estate.sample(frac=1).reset_index(drop=True)

# Preprocessing
X = shuffled.iloc[:, :-1].squeeze()
y = (shuffled.iloc[:, -1:]).T.squeeze()
len_estate = len(y)

# Splitting data
split_idx = int(split * len_estate)  # `split` (train fraction) and `max_depth` are defined earlier
X_train, y_train = X.loc[:split_idx], y.loc[:split_idx]
X_test = X.loc[split_idx + 1:].reset_index(drop=True)
y_test = y.loc[split_idx + 1:].reset_index(drop=True)

# Learning tree
print("Please wait for some time, it takes time, you can change max depth if it takes too long time.")
tree = DecisionTree(criterion="information_gain", max_depth=max_depth)
tree.fit(X_train, y_train)
tree.plot()

# Printing accuracies for different depths
for depth in range(2, max_depth+1):
    y_hat = tree.predict(X_test, max_depth=depth)
    print("Depth: ", depth)
    print('\tRMSE: ', rmse(y_hat, y_test))
    print('\tMAE: ', mae(y_hat, y_test))

# Decision Tree Regressor from Sci-kit learn
dt = DecisionTreeRegressor(random_state=0)
dt.fit(X_train, y_train)
y_hat = pd.Series(dt.predict(X_test))
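The sklearn regressor above is fitted but never scored; a small addition for comparison (assuming the same rmse and mae helpers used earlier in the script):

print('Sklearn RMSE: ', rmse(y_hat, y_test))
print('Sklearn MAE: ', mae(y_hat, y_test))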
data = pd.DataFrame(load_iris()["data"])  # assumed: the iris feature matrix
n = len(data)
y = pd.DataFrame(load_iris()["target"])

X_train, X_test, Y_train, Y_test = train_test_split(data, y)
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
Y_train = Y_train.reset_index(drop=True)
Y_test = Y_test.reset_index(drop=True)

X_train.dtype = "da"
Y_train.dtype = "category"
X_test.dtype = "d"
X_train1 = X_train.copy()
X_train1.dtype = "d"

tree = DecisionTree("information_gain", max_depth=6)
tree.fit(X_train1, Y_train)
tree.root
y_pred = tree.predict(X_test)
print(accuracy(np.array(Y_test), np.array(y_pred)))

d_tree_sklearn = tree5.DecisionTreeClassifier()  # classification target, so a classifier rather than a regressor
d_tree_sklearn = d_tree_sklearn.fit(X_train, Y_train)
y_sklearn = d_tree_sklearn.predict(X_test)
print(accuracy(np.array(y_sklearn), np.array(Y_test)))

npy = np.array(Y_train)
classes = set()
for i in range(len(npy)):
    classes.add(npy[i][0])
def five_fold_validation(X, y, depth=5):
    """Function to do five fold cross validation on iris"""

    accs = []

    # last 5th chunk as test data
    clf = DecisionTree(criterion="information_gain", max_depth=depth)
    clf.fit(pd.DataFrame(X[0:120]), pd.Series(y[0:120], dtype="category"))
    y_hat = clf.predict(pd.DataFrame(X[120:]))
    accs.append(accuracy(pd.Series(y_hat), pd.Series(y[120:])))

    # 3rd chunk as test data
    clf = DecisionTree(criterion="information_gain", max_depth=depth)
    pass_X = pd.DataFrame(np.append(X[90:], X[0:60], axis=0))
    pass_y = pd.Series(np.append(y[90:], y[0:60], axis=0), dtype="category")
    clf.fit(pass_X, pass_y)
    y_hat = clf.predict(pd.DataFrame(X[60:90]))
    accs.append(accuracy(pd.Series(y_hat), pd.Series(y[60:90])))

    # 4th chunk as test data
    clf = DecisionTree(criterion="information_gain", max_depth=depth)
    clf.fit(pd.DataFrame(np.append(X[120:], X[0:90], axis=0)),
            pd.Series(np.append(y[120:], y[0:90], axis=0), dtype="category"))
    y_hat = clf.predict(pd.DataFrame(X[90:120]))
    accs.append(accuracy(pd.Series(y_hat), pd.Series(y[90:120])))

    # 1st chunk as test data
    clf = DecisionTree(criterion="information_gain", max_depth=depth)
    clf.fit(pd.DataFrame(X[30:]), pd.Series(y[30:], dtype="category"))
    y_hat = clf.predict(pd.DataFrame(X[0:30]))
    accs.append(accuracy(pd.Series(y_hat), pd.Series(y[0:30])))

    # 2nd chunk as test data
    clf = DecisionTree(criterion="information_gain", max_depth=depth)
    clf.fit(pd.DataFrame(np.append(X[0:30], X[60:], axis=0)),
            pd.Series(np.append(y[0:30], y[60:], axis=0), dtype="category"))
    y_hat = clf.predict(pd.DataFrame(X[30:60]))
    accs.append(accuracy(pd.Series(y_hat), pd.Series(y[30:60])))

    print("Individual Accuracies:")
    print(*accs)
    print("Average Accuracy:")
    avg = sum(accs) / 5
    print(avg)
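The five hand-written folds above all follow one pattern; an equivalent loop-based sketch (assuming 150 rows split into 30-row chunks, as the function does):

def five_fold_validation_loop(X, y, depth=5):
    accs = []
    for k in range(5):
        lo, hi = 30 * k, 30 * (k + 1)
        X_tr = pd.DataFrame(np.append(X[:lo], X[hi:], axis=0))
        y_tr = pd.Series(np.append(y[:lo], y[hi:], axis=0), dtype="category")
        clf = DecisionTree(criterion="information_gain", max_depth=depth)
        clf.fit(X_tr, y_tr)
        y_hat = clf.predict(pd.DataFrame(X[lo:hi]))
        accs.append(accuracy(pd.Series(y_hat), pd.Series(y[lo:hi])))
    print("Average Accuracy:", sum(accs) / 5)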
Example #11

df = pd.read_excel('realestate.xlsx')
df = df.drop('No',axis=1)
columns = list(df.columns)
columns.remove('y')

Dtree = tree.DecisionTreeRegressor()
X = df[columns]
y = df['y']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
print("Scikit-learn decision tree")
Dtree.fit(X_train, y_train)
pred = Dtree.predict(X_test)
print("MAE and STDDEV are", mean_absolute_error(pred, y_test), np.std(np.abs(pred - y_test)))


print("Our Decision Tree")
tree = DecisionTree(criterion="a",max_depth=5)
tree.fit(X_train,y_train)
y_hat = tree.predict(X_test)
print("RMSE",rmse(y_hat,y_test))
print("MAE",mae(y_hat,y_test))

np.random.seed(42)

# Read real-estate data set
# ...
#
data = pd.read_csv(r'C:\Users\Anshuman Yadav\Documents\Real.csv')
X_train, X_test, Y_train, Y_test = train_test_split(data[data.columns[1:-1]],
                                                    data[data.columns[-1]])
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
Y_train = Y_train.reset_index(drop=True)
Y_test = Y_test.reset_index(drop=True)
X_train.dtype = "d"
X_test.dtype = "d"

tree = DecisionTree("ad", max_depth=25)
tree.fit(X_train, Y_train)
tree.root
y_pred = tree.predict(X_test)
print("MAE my tree : -")
print(mae(np.array(Y_test), np.array(y_pred)))
print("MSE my tree : -")
print(rmse(np.array(Y_test), np.array(y_pred)))

d_tree_sklearn = tree5.DecisionTreeRegressor()
d_tree_sklearn = d_tree_sklearn.fit(X_train, Y_train)
y_sklearn = d_tree_sklearn.predict(X_test)
print("MAE sklearn : -")
print(mae(np.array(Y_test), np.array(y_sklearn)))
print("MSE sklearn : -")
print(rmse(np.array(Y_test), np.array(y_sklearn)))
Example #13

# 70:30 train test split
split_idx = int(0.7 * data.shape[0])

X = data.iloc[:split_idx, :-1]
X_test = data.iloc[split_idx:, :-1]
y = data.iloc[:split_idx, -1]
y_test = data.iloc[split_idx:, -1]


maxdepth = 4

# Building Decision Tree based on my model
criteria = 'information_gain'
mytree = DecisionTree(criterion=criteria, max_depth=maxdepth) #Split based on Inf. Gain
mytree.fit(X, y)
mytree.plot()

print("My Model")
y_hat = mytree.predict(X)
print("Train Scores:")
print('\tRMSE: ', rmse(y_hat, y))
print('\tMAE: ', mae(y_hat, y))

y_test_hat = mytree.predict(X_test)
print("Test Scores:")
print('\tRMSE: ', rmse(y_test_hat, y_test))
print('\tMAE: ', mae(y_test_hat, y_test))

###################################################################################
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tree.base import DecisionTree
from metrics import *
from pprint import pprint

np.random.seed(42)

# Read IRIS data set
# ...
# 

tree = DecisionTree(criterion='information_gain', max_depth=10)  # split based on information gain
tree.output = "category"
tree.input = "real"
df = pd.read_csv("iris.data",
                 names=['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'label'])
train_data, test_data = tree.train_test_split(df)
sub_tree = tree.decision_tree_algorithm(train_data)
tree.tree = sub_tree
rows, columns = test_data.values.shape
y_hat = tree.predict(test_data.iloc[:, 0:columns - 1])
y = test_data.iloc[:, -1]
print('Accuracy: ', accuracy(y_hat, y))
for cls in y.unique():
    print('Class Name: ',cls)
    print('Precision: ', precision(y_hat, y, cls))
    print('Recall: ', recall(y_hat, y, cls))
    print()

# Build feature and label lists from the raw iris rows
# (iris_data is assumed to hold 5-element rows: four features, then the label).
X_data = []
y_data = []
for row in iris_data:
    X_data.append(row[0:4])
    y_data.append(row[4])
X_data = pd.DataFrame(data=X_data)
y_data = pd.Series(data=y_data, dtype="category")


# Defining the train-test split
split_idx = int(0.7 * len(iris_data))

X = X_data.iloc[:split_idx, :]
X_test = X_data.iloc[split_idx:, :]
y = y_data.iloc[:split_idx]
y_test = y_data.iloc[split_idx:]

# Training and Testing
for criteria in ['information_gain', 'gini_index']:
    tree = DecisionTree(criterion=criteria, max_depth=3)
    # Build Decision Tree
    tree.fit(X, y)
    #Predict
    y_hat = tree.predict(X)
    y_test_hat = tree.predict(X_test)
    tree.plot()
    print('Criteria :', criteria)
    print('Train Accuracy: ', accuracy(y_hat, y))
    print('Test Accuracy: ', accuracy(y_test_hat, y_test))
    # Precision and Recall for each class
    for cls in y.unique():
        print("Class =",cls)
        print('Precision: ', precision(y_test_hat, y_test, cls))
        print('Recall: ', recall(y_test_hat, y_test, cls))
Example #16
                if classes[key]>m:
                    finclass = key
                    m = classes[key]
            final_preds.append(finclass)
        return pd.Series(final_preds,dtype="category")
        

N = 30
P = 2
NUM_OP_CLASSES = 2
n_estimators = 3
X = pd.DataFrame(np.abs(np.random.randn(N, P)))
y = pd.Series(np.random.randint(NUM_OP_CLASSES, size = N), dtype="category")

criteria = 'information_gain'
tree = DecisionTree(criterion=criteria, max_depth=1)
Classifier_AB = AdaBoostClassifier(base_estimator=tree, n_estimators=n_estimators)
Classifier_AB.fit(X, y)
y_hat = Classifier_AB.predict(X)
# [fig1, fig2] = Classifier_AB.plot()
print('Criteria :', criteria)
print('Accuracy: ', accuracy(y_hat, y))
for cls in y.unique():
    print('Precision: ', precision(y_hat, y, cls))
    print('Recall: ', recall(y_hat, y, cls))

print('\nIRIS DATASET')

##### AdaBoostClassifier on Iris data set using the entire data set with sepal width and petal width as the two features
dataset = pd.read_csv("tree/iris.data",delimiter=",",header=None)
a = dataset[1]
# N = 30
# P = 5

############################################################################################################
#                                       DISCRETE INPUT, DISCRETE OUTPUT (DIDO)
fit_time = []
predict_time = []
for N in range(2, 10):
    for P in range(100, 120):
        # five categorical features, each with P possible values, N samples
        X = pd.DataFrame({
            i: pd.Series(np.random.randint(P, size=N), dtype="category")
            for i in range(5)
        })
        y = pd.Series(np.random.randint(P, size=N), dtype="category")
        tree = DecisionTree(criterion="a",
                            max_depth=8)  #Split based on Inf. Gain
        start = time.time()
        tree.fit(X, y)
        end = time.time()
        fit_time.append(end - start)

        start = time.time()
        y_hat = tree.predict(X)
        end = time.time()
        predict_time.append(end - start)
plt.plot(fit_time)
plt.ylabel('DIDO : Fit time', fontsize=16)
plt.show()

plt.plot(predict_time)
plt.ylabel('DIDO : Predict time', fontsize=16)
plt.show()
Example #18
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tree.base import DecisionTree
from metrics import *

np.random.seed(42)
# Test case 1
# Real Input and Real Output

N = 30
P = 5
X = pd.DataFrame(np.random.randn(N, P))
y = pd.Series(np.random.randn(N))

for criteria in ['information_gain', 'gini_index']:
    # Split based on Inf. Gain
    tree = DecisionTree(criterion=criteria)
    tree.fit(X, y)
    y_hat = tree.predict(X)
    tree.plot()
    print('Criteria :', criteria)
    print('RMSE: ', rmse(y_hat, y))
    print('MAE: ', mae(y_hat, y))

# Test case 2
# Real Input and Discrete Output

N = 30
P = 5
X = pd.DataFrame(np.random.randn(N, P))
y = pd.Series(np.random.randint(P, size=N), dtype="category")
Example #19
import pandas as pd
import matplotlib.pyplot as plt
from tree.base import DecisionTree
from metrics import *

from sklearn import tree as sktree
from sklearn import metrics
from sklearn.model_selection import train_test_split
import numpy as np
from pprint import pprint

np.random.seed(42)

# Read real-estate data set
# ...
#
tree = DecisionTree(criterion='information_gain',
                    max_depth=10)  #Split based on Inf. Gain
tree.output = "discrete"
tree.input = "discrete"
df = pd.read_excel("Real estate valuation data set.xlsx",
                   names=[
                       'No', 'tran_date', 'age', 'distance_mrt', 'stores',
                       'lat', 'long', 'price'
                   ])
df = df.drop('No', axis=1)
train_data, test_data = tree.train_test_split(df)
sub_tree = tree.regression_tree_algorithm(train_data)
print(sub_tree)
tree.tree = sub_tree
rows, columns = test_data.values.shape
y_hat = tree.predict(test_data.iloc[:, 0:columns - 1])
y = test_data.iloc[:, -1]
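The example stops before reporting error; a plausible final step using the helpers imported from metrics:

print('RMSE: ', rmse(y_hat, y))
print('MAE: ', mae(y_hat, y))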
# ..
# Function to create fake data (take inspiration from usage.py)
# ...
# ..other functions

# Test case 1
# Real Input and Real Output

N = 50
P = 5
X = pd.DataFrame(np.random.randn(N, P))
y = pd.Series(np.random.randn(N))

try:
    for criteria in ['information_gain', 'gini_index']:
        tree = DecisionTree(criterion=criteria,max_depth=10) #Split based on Inf. Gain
        start = timeit.default_timer()
        tree.fit(X, y)
        stop = timeit.default_timer()
        print('Real Input and Real Output Time - Build Tree: ', stop - start)
        start = timeit.default_timer()
        y_hat = tree.predict(X)
        stop = timeit.default_timer()
        print('Real Input and Real Output Time - Predict Tree: ', stop - start)
except Exception:
    pass

# Test case 2
# Real Input and Discrete Output

X = pd.DataFrame(np.random.randn(N, P))
from ensemble.bagging import BaggingClassifier
from tree.base import DecisionTree
# Or use sklearn decision tree
from linearRegression.linearRegression import LinearRegression

########### BaggingClassifier ###################

N = 30
P = 2
NUM_OP_CLASSES = 2
n_estimators = 3
X = pd.DataFrame(np.abs(np.random.randn(N, P)))
y = pd.Series(np.random.randint(NUM_OP_CLASSES, size=N), dtype="category")

criteria = 'information_gain'
tree = DecisionTree(criterion=criteria)
Classifier_B = BaggingClassifier(base_estimator=tree,
                                 n_estimators=n_estimators)
Classifier_B.fit(X, y)
y_hat = Classifier_B.predict(X)
# [fig1, fig2] = Classifier_B.plot()
Classifier_B.plot()
print('Criteria :', criteria)
print('Accuracy: ', accuracy(y_hat, y))
print()
for cls in y.unique():
    print('Class: ', cls)
    print('Precision: ', precision(y_hat, y, cls))
    print('Recall: ', recall(y_hat, y, cls))
    print()
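BaggingClassifier comes from the local ensemble.bagging module; its core idea in a minimal sketch (bootstrap resampling plus a per-sample majority vote; an illustration, not the module's actual code):

import copy
import numpy as np
import pandas as pd

class SimpleBagging:
    def __init__(self, base_estimator, n_estimators=3):
        self.base_estimator = base_estimator
        self.n_estimators = n_estimators
        self.models = []

    def fit(self, X, y):
        n = len(X)
        for _ in range(self.n_estimators):
            idx = np.random.randint(n, size=n)  # bootstrap sample, with replacement
            model = copy.deepcopy(self.base_estimator)
            model.fit(X.iloc[idx].reset_index(drop=True),
                      y.iloc[idx].reset_index(drop=True))
            self.models.append(model)

    def predict(self, X):
        preds = pd.DataFrame([list(m.predict(X)) for m in self.models])
        return preds.mode(axis=0).iloc[0]  # majority vote per column (sample)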
def cross_validation_5_fold(X, y, depth):
    X_original = X
    y_original = y

    clf = DecisionTree(criterion="a", max_depth=depth)
    clf.fit(pd.DataFrame(X[0:120]), pd.Series(y[0:120], dtype="category"))
    y = y[120:]
    y_hat = clf.predict(pd.DataFrame(X[120:]))
    print(accuracy(pd.Series(y_hat), pd.Series(y)))

    X = X_original
    y = y_original

    clf = DecisionTree(criterion="a", max_depth=depth)
    clf.fit(pd.DataFrame(np.append(X[90:], X[0:60], axis=0)),
            pd.Series(np.append(y[90:], y[0:60], axis=0), dtype="category"))
    y = y[60:90]
    y_hat = clf.predict(pd.DataFrame(X[60:90]))
    print(accuracy(pd.Series(y_hat), pd.Series(y)))

    X = X_original
    y = y_original

    clf = DecisionTree(criterion="a", max_depth=depth)
    clf.fit(pd.DataFrame(np.append(X[120:], X[0:90], axis=0)),
            pd.Series(np.append(y[120:], y[0:90], axis=0), dtype="category"))
    y = y[90:120]
    y_hat = clf.predict(pd.DataFrame(X[90:120]))
    print(accuracy(pd.Series(y_hat), pd.Series(y)))

    X = X_original
    y = y_original

    clf = DecisionTree(criterion="a", max_depth=depth)
    clf.fit(pd.DataFrame(X[30:]), pd.Series(y[30:], dtype="category"))
    y = y[0:30]
    y_hat = clf.predict(pd.DataFrame(X[0:30]))
    print(accuracy(pd.Series(y_hat), pd.Series(y)))

    X = X_original
    y = y_original

    clf = DecisionTree(criterion="a", max_depth=depth)
    clf.fit(pd.DataFrame(np.append(X[0:30], X[60:], axis=0)),
            pd.Series(np.append(y[0:30], y[60:], axis=0), dtype="category"))
    y = y[30:60]
    y_hat = clf.predict(pd.DataFrame(X[30:60]))
    print(accuracy(pd.Series(y_hat), pd.Series(y)))
from tree.base import DecisionTree
from metrics import *

np.random.seed(42)

# Read IRIS data set
# ...
#
iris = pd.read_csv('iris.csv')
iris = iris.sample(frac=1).reset_index(drop=True)
split_at = int(0.7 * (iris.shape[0]))
X_train = iris.iloc[:split_at, :-1]
y_train = iris.iloc[:split_at, -1]
X_test = iris.iloc[split_at:, :-1]
y_test = iris.iloc[split_at:, -1]
model = DecisionTree()
model.fit(X_train, y_train)
y_out = model.predict(X_test)
print("Accuracy is: ", accuracy(y_out, y_test))
for group in np.unique(y_test):
    print("Precision of {} is: {}".format(group,
                                          precision(y_out, y_test, group)))
    print("Recal of {} is: {}".format(group, recall(y_out, y_test, group)))

# Accuracy across all five folds
fold = int(0.2 * (iris.shape[0]))
for i in range(5):
    n_split1 = i * fold
    n_split2 = n_split1 + fold
    X_test1 = iris.iloc[n_split1:n_split2, :-1].reset_index(drop=True)
    y_test1 = pd.Series(list(iris.iloc[n_split1:n_split2, -1]))
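    # Plausible completion of the fold body (assumed), following the 70:30 run above:
    X_train1 = pd.concat([iris.iloc[:n_split1, :-1],
                          iris.iloc[n_split2:, :-1]]).reset_index(drop=True)
    y_train1 = pd.Series(list(iris.iloc[:n_split1, -1]) +
                         list(iris.iloc[n_split2:, -1]))
    model = DecisionTree()
    model.fit(X_train1, y_train1)
    y_out1 = model.predict(X_test1)
    print("Fold", i + 1, "accuracy:", accuracy(y_out1, y_test1))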
np.random.seed(42)

# Read IRIS data set
# ...

#
dataset = load_iris()
X, y = dataset.data, dataset.target

#from sklearn.utils import shuffle
#X, y = shuffle(X, y, random_state=0)

print("fit model for iris dataset for 70-30 division")

clf = DecisionTree(criterion="a", max_depth=5)
clf.fit(pd.DataFrame(X[0:120]), pd.Series(y[0:120], dtype="category"))
y = y[120:]
y_hat = clf.predict(pd.DataFrame(X[120:]))
print("Accuracy", accuracy(pd.Series(y_hat), pd.Series(y)))
y = pd.Series(y)

for cls in y.unique():
    print('Precision: for class ', cls, " : ", precision(y_hat, y, cls))
    print('Recall: ', cls, " : ", recall(y_hat, y, cls))


def cross_validation_5_fold(X, y, depth):
    X_original = X
    y_original = y
# ...
# ..other functions
""" Case: RIRO"""

learning_time = list()
predict_time = list()

for N in range(1, 7):
    for P in range(6, 42):
        X = pd.DataFrame(np.random.randn(N, P))
        y = pd.Series(np.random.randn(N))

        start_time = time.time()
        tree = DecisionTree(criterion="information_gain")
        tree.fit(X, y)
        end_time = time.time()

        learning_time.append(end_time - start_time)

        start_time = time.time()
        y_hat = tree.predict(X)
        end_time = time.time()

        predict_time.append(end_time - start_time)
plt.plot(learning_time)
plt.ylabel('RIRO : Fit time', fontsize=16)
plt.show()

plt.plot(predict_time)
plt.ylabel('RIRO : Predict time', fontsize=16)
plt.show()
Example #26
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tree.base import DecisionTree
from metrics import *
from sklearn.tree import DecisionTreeRegressor

np.random.seed(42)

# Read real-estate data set
# ...
#
estate = pd.read_csv('Real_estate.csv', index_col='No', dtype=float)
estate = estate.sample(frac=1).reset_index(drop=True)
split_at = int(0.3 * (estate.shape[0]))
X_train = estate.iloc[:split_at, :-1]
y_train = estate.iloc[:split_at, -1]
X_test = estate.iloc[split_at:, :-1]
y_test = estate.iloc[split_at:, -1]

model = DecisionTree(max_depth=2)
model.fit(X_train, y_train)
y_out = model.predict(X_test)
print("Rmse is: ", rmse(y_out, y_test))
print("Mae is: ", mae(y_out, y_test))

model2 = DecisionTreeRegressor(max_depth=2)
model2.fit(X_train, y_train)
y_out = model2.predict(X_test)
print("Rmse of Sklearn is: ", rmse(y_out, y_test))
print("Mae of Sklearn is: ", mae(y_out, y_test))
Example #27
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tree.base import DecisionTree
from metrics import *

np.random.seed(42)

N = 30
P = 5
X = pd.DataFrame({
    i: pd.Series(np.random.randint(P, size=N), dtype="category")
    for i in range(5)
})
y = pd.Series(np.random.randint(P, size=N), dtype="category")

print('\n\n##Discrete Input and Discrete Output##')
for criteria in ['information_gain']:
    tree = DecisionTree(criterion=criteria,
                        max_depth=np.inf)  #Split based on Inf. Gain
    tree.fit(X, y)
    y_hat = tree.predict(X)
    tree.plot()
    print('Criteria :', criteria)
    print('Accuracy: ', accuracy(y_hat, y))
    for cls in y.unique():
        print(cls)
        print('Precision: ', precision(y_hat, y, cls))
        print('Recall: ', recall(y_hat, y, cls))
Example #28
def experiment(N=[
    30,
], M=[
    5,
], exp="sklearn", ios=[
    "dido",
]):
    '''
    Runs a new experiment while varying one parameter, either M or N.
    N: list of sample counts
    M: list of attribute counts
    exp: "sklearn" or "mine"
    ios: list of input/output types; each can be rido, dido, riro, or diro
    '''
    results = np.zeros((len(ios), len(N), len(M), 4))

    assert (len(N) == 1 or len(M) == 1)
    pbar = tqdm(total=len(ios) * len(N) * len(M))
    for io in ios:
        for n in N:
            for m in M:
                X, y = datagen(N=n, M=m, io=io)

                if exp == "sklearn":
                    if io == "diro" or io == "riro":
                        tree = DecisionTreeRegressor()
                    else:
                        tree = DecisionTreeClassifier(criterion="entropy")
                else:
                    exp = "MyTree"
                    tree = DecisionTree()
                start = datetime.now()
                tree.fit(X, y)
                end = datetime.now()
                learn = (end - start).total_seconds()

                start = datetime.now()
                tree.predict(X)
                end = datetime.now()
                predict = (end - start).total_seconds()

                results[ios.index(io), N.index(n),
                        M.index(m)] = np.array([n, m, learn, predict])
                pbar.update(1)

    # Plotting learning-time results
    plt.figure()
    if len(N) > 1 or len(M) > 1:
        if len(N) > 1:
            for io in ios:
                plt.plot(results[ios.index(io), :, 0, 0],
                         results[ios.index(io), :, 0, 2],
                         label=io)
            plt.title("Learning Plot for N vs time for " + exp)
            plt.xlabel("Varying N")

        else:
            for io in ios:
                plt.plot(results[ios.index(io), 0, :, 1],
                         results[ios.index(io), 0, :, 2],
                         label=io)
            plt.title("Learning Plot for M vs time for " + exp)
            plt.xlabel("Varying M")
    plt.legend()
    plt.ylabel("Time taken in seconds")
    plt.savefig(os.path.join(experiments, "learn.png"))

    # Plotting prediction-time results
    plt.figure()
    if len(N) > 1 or len(M) > 1:
        if len(N) > 1:
            for io in ios:
                plt.plot(results[ios.index(io), :, 0, 0],
                         results[ios.index(io), :, 0, 3],
                         label=io)
            plt.title("Prediction Plot for N vs time for " + exp)
            plt.xlabel("Varying N")

        else:
            for io in ios:
                plt.plot(results[ios.index(io), 0, :, 1],
                         results[ios.index(io), 0, :, 3],
                         label=io)
            plt.title("Prediction Plot for M vs time for " + exp)
            plt.xlabel("Varying M")
    plt.legend()
    plt.ylabel("Time taken in seconds")
    plt.savefig(os.path.join(experiments, "predict.png"))
    return results
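A hedged usage example, sweeping N for the sklearn baseline (the argument values are illustrative):

results = experiment(N=list(range(30, 330, 30)), M=[5], exp="sklearn",
                     ios=["dido", "rido", "diro", "riro"])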
def train_test_split(df):
    # Shuffle row positions, then take a 60:40 train-test split.
    index = np.arange(df.shape[0])
    np.random.shuffle(index)
    n_rows, _ = df.values.shape
    train_size = int(n_rows * 0.6)
    train_data = df.loc[index][0:train_size]
    test_data = df.loc[index][train_size:]
    return train_data, test_data


df=pd.read_csv("iris.data",names=['sepal_length','sepal_width','petal_length','petal_width','label'])
df = df[['sepal_width','petal_width', 'label']]
df = df.replace({'label' : {'Iris-setosa': 0, 'Iris-versicolor': 0, 'Iris-virginica': 1}})
train_data,test_data = train_test_split(df)
X = train_data[['sepal_width','petal_width']]
y = train_data[['label']]['label']

criteria = 'information_gain'  # assumed, as in the earlier random-data AdaBoost demo
n_estimators = 3               # assumed
tree_iris = DecisionTree(criterion=criteria, max_depth=1)
Classifier_AB_iris = AdaBoostClassifier(base_estimator=tree_iris, n_estimators=n_estimators)
Classifier_AB_iris.fit(X, y)
y_hat = Classifier_AB_iris.predict(X)
#[fig1, fig2] = Classifier_AB.plot()
print('Criteria :', criteria)
print('Accuracy: ', accuracy(y_hat, y))
print()
for cls in y.unique():
    if cls == 1:
        print('Category: Iris-virginica')
    else:
        print('Category: Not Iris-virginica')
    print('Precision: ', precision(y_hat, y, cls))
    print('Recall: ', recall(y_hat, y, cls))
    print()
np.random.seed(42)

########### AdaBoostClassifier on Real Input and Discrete Output ###################
print("-----------------------------------------------------------")
print("Decision stump on random data")
print("-----------------------------------------------------------")
N = 30
P = 2
NUM_OP_CLASSES = 2
n_estimators = 3
X = pd.DataFrame(np.abs(np.random.randn(N, P)))
y = pd.Series(np.random.randint(NUM_OP_CLASSES, size=N), dtype="category")

criteria = 'information_gain'
tree = DecisionTree(criterion=criteria, max_depth=1)  # a single decision stump
n_samples = X.shape[0]
sample_weights = [1 / n_samples] * n_samples
tree.fit(X, y, sample_weights)
yhat = pd.Series(tree.predict(X))
print('Criteria :', criteria)
print('Accuracy: ', accuracy(yhat, y))
for cls in y.unique():
    print("***Class :" + str(cls) + "***")
    print('Precision: ', precision(yhat, y, cls))
    print('Recall: ', recall(yhat, y, cls))

print("-----------------------------------------------------------")
print("Adaboost on random data")
print("-----------------------------------------------------------")