def find_Time(case):
    # Timing arrays: vary N with P fixed (fit/predict), then vary P with N fixed.
    axis_Nf = [0] * 20  # fit times for varying N (range(100, 500, 20) -> 20 values)
    axis_Np = [0] * 20  # predict times for varying N
    axis_Pf = [0] * 11  # fit times for varying P (range(2, 24, 2) -> 11 values)
    axis_Pp = [0] * 11  # predict times for varying P
    print("Started 1")
    for i in range(100, 500, 20):
        X, y = CreateFakeData(i, 5, case)  # we fix P = 5
        mod = DecisionTree()
        st1 = time()
        mod.fit(X, y)
        ed1 = time()
        st2 = time()
        y_ = mod.predict(X)
        ed2 = time()
        axis_Nf[(i - 100) // 20] = ed1 - st1
        axis_Np[(i - 100) // 20] = ed2 - st2
    print("Started 2")
    for i in range(2, 24, 2):
        X, y = CreateFakeData(100, i, case)  # we fix N = 100
        mod = DecisionTree()
        st1 = time()
        mod.fit(X, y)
        ed1 = time()
        st2 = time()
        y_ = mod.predict(X)
        ed2 = time()
        axis_Pf[(i - 2) // 2] = ed1 - st1
        axis_Pp[(i - 2) // 2] = ed2 - st2
    return axis_Nf, axis_Np, axis_Pf, axis_Pp
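# A hypothetical usage sketch (not part of the original file): plot the arrays
# returned by find_Time, assuming matplotlib and CreateFakeData are available.
# The x-axes mirror the loop ranges inside find_Time.
import matplotlib.pyplot as plt

axis_Nf, axis_Np, axis_Pf, axis_Pp = find_Time(case=1)
Ns = list(range(100, 500, 20))  # matches the N loop above
Ps = list(range(2, 24, 2))      # matches the P loop above

plt.plot(Ns, axis_Nf, label="fit")
plt.plot(Ns, axis_Np, label="predict")
plt.xlabel("N (P fixed at 5)")
plt.ylabel("time (s)")
plt.legend()
plt.show()

plt.plot(Ps, axis_Pf, label="fit")
plt.plot(Ps, axis_Pp, label="predict")
plt.xlabel("P (N fixed at 100)")
plt.ylabel("time (s)")
plt.legend()
plt.show()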
def analyseTime(case):
    assert 1 <= case <= 4
    fitTimes = {'N': [], 'P': [], 'time': []}
    predictTimes = {'N': [], 'P': [], 'time': []}
    for N in range(40, 50):
        for P in range(2, 10):
            print("Running with N", N, "and P", P)
            X, y = createFakeData(N, P, case)
            tree = DecisionTree(criterion="information_gain", max_depth=3)
            startTime = time.time()
            tree.fit(X, y)
            endTime = time.time()
            fitTimes['N'].append(N)
            fitTimes['P'].append(P)
            fitTimes['time'].append(endTime - startTime)
            startTime = time.time()
            y_hat = tree.predict(X)
            endTime = time.time()
            predictTimes['N'].append(N)
            predictTimes['P'].append(P)
            predictTimes['time'].append(endTime - startTime)
    plotTimings(fitTimes)
    plotTimings(predictTimes)
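# plotTimings is called above but not defined in this snippet. Below is a
# minimal sketch of one plausible implementation, assuming the dict layout
# built above ('N', 'P', 'time') and matplotlib's 3D axes; the name and
# signature come from the call sites, everything else is an assumption.
import matplotlib.pyplot as plt

def plotTimings(timings):
    fig = plt.figure()
    ax = fig.add_subplot(projection="3d")
    ax.scatter(timings['N'], timings['P'], timings['time'])
    ax.set_xlabel('N')
    ax.set_ylabel('P')
    ax.set_zlabel('time (s)')
    plt.show()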
def nested_cross_validation(dataset, y):
    # Outer 5-fold CV over 150 samples (folds of 30); inner 4-fold CV over the
    # 120 training samples to pick the best depth.
    for i in range(5):
        test = dataset[30 * i:30 * (i + 1)]
        test_label = y[30 * i:30 * (i + 1)]
        train = np.append(dataset[0:30 * i], dataset[30 * (i + 1):], axis=0)
        train_label = np.append(y[0:30 * i], y[30 * (i + 1):], axis=0)
        accuracy_validation = {}
        for depth in range(1, 11):
            avg_acc = 0
            for j in range(4):
                validation = train[30 * j:30 * (j + 1)]
                validation_label = train_label[30 * j:30 * (j + 1)]
                train_new = np.append(train[30 * (j + 1):], train[0:30 * j], axis=0)
                train_new_label = np.append(train_label[30 * (j + 1):], train_label[0:30 * j], axis=0)
                tree = DecisionTree(criterion="gini_index", max_depth=depth)
                train_new = pd.DataFrame(train_new)
                train_new_label = pd.Series(train_new_label, dtype="category")
                tree.fit(train_new, train_new_label)
                avg_acc += accuracy(tree.predict(pd.DataFrame(validation)), validation_label)
            accuracy_validation[depth] = avg_acc / 4
        value = max(accuracy_validation, key=accuracy_validation.get)
        tree = DecisionTree(criterion="gini_index", max_depth=value)
        tree.fit(pd.DataFrame(train), pd.Series(train_label, dtype="category"))
        print("Accuracy is", accuracy(tree.predict(pd.DataFrame(test)), test_label),
              "for iteration", i + 1, ". The depth of the optimal tree is", value)
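# A hypothetical invocation of nested_cross_validation on the iris data set
# (150 rows, so 5 outer folds of 30). Assumes scikit-learn is available; the
# data is shuffled first so folds are class-balanced.
from sklearn.datasets import load_iris
from sklearn.utils import shuffle

iris = load_iris()
X_shuf, y_shuf = shuffle(iris.data, iris.target, random_state=42)
nested_cross_validation(X_shuf, y_shuf)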
def my_regr(X, y, max_depth=5, criterion="information_gain"):
    """Train and predict on the real-estate dataset using my decision tree."""
    clf = DecisionTree(criterion=criterion, max_depth=max_depth)
    clf.fit(pd.DataFrame(X[0:330]), pd.Series(y[0:330]))
    # clf.plot()
    y = pd.Series(y[330:])
    y_hat = clf.predict(pd.DataFrame(X[330:]))
    print("RMSE:", rmse(y_hat, y))
    print("MAE:", mae(y_hat, y))
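# rmse and mae come from the project's metrics module, which is not shown
# here; minimal sketches of what they are assumed to compute, for reference
# only (not the project's actual code).
import numpy as np
import pandas as pd

def rmse_sketch(y_hat, y):
    """Root mean squared error between predictions and targets."""
    return float(np.sqrt(np.mean((np.asarray(y_hat) - np.asarray(y)) ** 2)))

def mae_sketch(y_hat, y):
    """Mean absolute error between predictions and targets."""
    return float(np.mean(np.abs(np.asarray(y_hat) - np.asarray(y))))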
def train_and_predict(X, y, max_depth=15):
    """Train and predict on iris using my decision tree (first 120 rows train, rest test)."""
    clf = DecisionTree(criterion="information_gain", max_depth=max_depth)
    clf.fit(pd.DataFrame(X[0:120]), pd.Series(y[0:120], dtype="category"))
    y = pd.Series(y[120:])
    y_hat = clf.predict(pd.DataFrame(X[120:]))
    print("Accuracy", accuracy(pd.Series(y_hat), y))
    for cls in y.unique():
        print('Precision: ', cls, " : ", precision(y_hat, y, cls))
        print('Recall: ', cls, " : ", recall(y_hat, y, cls))
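# accuracy, precision, and recall also come from the metrics module; the
# sketches below show the per-class definitions assumed by the loops above
# (again illustrative, not the project's code). The max(..., 1) guards avoid
# division by zero when a class is never predicted / never present.
def accuracy_sketch(y_hat, y):
    y_hat, y = np.asarray(y_hat), np.asarray(y)
    return float(np.mean(y_hat == y))

def precision_sketch(y_hat, y, cls):
    y_hat, y = np.asarray(y_hat), np.asarray(y)
    predicted = (y_hat == cls)
    return float(np.sum(y[predicted] == cls) / max(np.sum(predicted), 1))

def recall_sketch(y_hat, y, cls):
    y_hat, y = np.asarray(y_hat), np.asarray(y)
    actual = (y == cls)
    return float(np.sum(y_hat[actual] == cls) / max(np.sum(actual), 1))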
def fit(self, X, y):
    self.data = X
    self.labels = y
    self.classes = list(set(y))
    n = len(X)
    weights = [1 / n] * n
    for estimator in range(self.n_estimators):
        self.clfs.append(X)
        self.clfsy.append(y)
        Dtree = DecisionTree("information_gain", max_depth=1)
        Dtree.fit(X, y, sample_weights=weights)
        self.estimators_list.append(Dtree)
        # Weighted training error of this stump
        y_pred = list(Dtree.predict(X))
        err = sum(weights[i] for i in range(n) if y_pred[i] != y[i])
        # AdaBoost stage weight (natural log, as in the standard algorithm)
        alpha = 0.5 * math.log((1 - err) / err)
        self.alphas.append(alpha)
        # Up-weight misclassified samples, down-weight correct ones
        for i in range(n):
            if y_pred[i] != y[i]:
                weights[i] *= math.exp(alpha)
            else:
                weights[i] *= math.exp(-alpha)
        # Normalise the weights
        total = sum(weights)
        weights = [w / total for w in weights]
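# The matching predict method is not shown in this snippet. Below is a minimal
# sketch of the standard AdaBoost decision rule over the state stored by fit
# (model.alphas, model.estimators_list, model.classes); it is a hypothetical
# illustration, not necessarily the author's implementation.
import numpy as np
import pandas as pd

def adaboost_predict_sketch(model, X):
    """For each sample, return the class with the largest alpha-weighted vote."""
    classes = list(model.classes)
    votes = np.zeros((len(X), len(classes)))
    for alpha, clf in zip(model.alphas, model.estimators_list):
        pred = np.asarray(clf.predict(X))
        for j, c in enumerate(classes):
            votes[:, j] += alpha * (pred == c)
    return pd.Series([classes[j] for j in votes.argmax(axis=1)])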
def nested_cross(data, y, k1=5, k2=4):
    # Outer k1-fold CV; inner k2-fold CV over the training split to pick a depth.
    val1 = len(data) // k1
    for i in range(k1):
        y_test = y[val1 * i:val1 * (i + 1)]
        x_test = data[val1 * i:val1 * (i + 1)]
        x_train = np.append(data[0:val1 * i], data[val1 * (i + 1):], axis=0)
        y_train = np.append(y[0:val1 * i], y[val1 * (i + 1):], axis=0)
        acc = []
        for depth in range(2, 10):
            s = 0
            val2 = len(x_train) // k2
            for j in range(k2):
                x_val_test = pd.DataFrame(x_train[val2 * j:val2 * (j + 1)])
                y_val_test = pd.DataFrame(y_train[val2 * j:val2 * (j + 1)])
                x_val_train = pd.DataFrame(np.append(x_train[0:val2 * j], x_train[val2 * (j + 1):], axis=0))
                y_val_train = pd.DataFrame(np.append(y_train[0:val2 * j], y_train[val2 * (j + 1):], axis=0))
                # Ad-hoc type flags read by the custom DecisionTree
                x_val_train.dtype = "sda"
                y_val_train.dtype = "category"
                x_val_test.dtype = "sda"
                y_val_test.dtype = "category"
                tree = DecisionTree("information_gain", max_depth=depth)
                tree.fit(x_val_train, y_val_train)
                s += accuracy(np.array(y_val_test), np.array(tree.predict(x_val_test)))
            acc.append(s / k2)
        best_acc = max(acc)
        best_depth = acc.index(best_acc) + 2  # depths start at 2
        print("Best Accuracy is: " + str(best_acc))
        print("At Depth: " + str(best_depth))
        # Retrain at the best depth on the full outer training split and test
        tree = DecisionTree("information_gain", max_depth=best_depth)
        x_tr, y_tr, x_te = pd.DataFrame(x_train), pd.DataFrame(y_train), pd.DataFrame(x_test)
        x_tr.dtype = "sda"
        y_tr.dtype = "category"
        x_te.dtype = "sda"
        tree.fit(x_tr, y_tr)
        print("Test Accuracy: " + str(accuracy(np.array(y_test), np.array(tree.predict(x_te)))))
"data", "Real estate valuation data set.xlsx")) shuffled = estate.sample(frac=1).reset_index(drop=True) # Preprocessing X = shuffled.iloc[:, :-1].squeeze() y = (shuffled.iloc[:, -1:]).T.squeeze() len_estate = len(y) # Splitting data X_train, y_train = X.loc[:split*len_estate], y.loc[:split*len_estate] X_test, y_test = X.loc[split*len_estate+1:].reset_index( drop=True), y.loc[split*len_estate+1:].reset_index(drop=True) # Learning tree print("Please wait for some time, it takes time, you can change max depth if it takes too long time.") tree = DecisionTree(criterion="information_gain", max_depth=max_depth) tree.fit(X_train, y_train) tree.plot() # Printing accuracies for different depths for depth in range(2, max_depth+1): y_hat = tree.predict(X_test, max_depth=depth) print("Depth: ", depth) print('\tRMSE: ', rmse(y_hat, y_test)) print('\tMAE: ', mae(y_hat, y_test)) # Decision Tree Regressor from Sci-kit learn dt = DecisionTreeRegressor(random_state=0) dt.fit(X_train, y_train) y_hat = pd.Series(dt.predict(X_test))
n = len(data)
y = pd.DataFrame(load_iris()["target"])
X_train, X_test, Y_train, Y_test = train_test_split(data, y)
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
Y_train = Y_train.reset_index(drop=True)
Y_test = Y_test.reset_index(drop=True)

# Ad-hoc type flags read by the custom DecisionTree
X_train.dtype = "da"
Y_train.dtype = "category"
X_test.dtype = "d"
X_train1 = X_train.copy()
X_train1.dtype = "d"

tree = DecisionTree("information_gain", max_depth=6)
tree.fit(X_train1, Y_train)
y_pred = tree.predict(X_test)
print(accuracy(np.array(Y_test), np.array(y_pred)))

# scikit-learn baseline (a classifier, since the iris labels are categorical)
d_tree_sklearn = tree5.DecisionTreeClassifier()
d_tree_sklearn = d_tree_sklearn.fit(X_train, Y_train)
y_sklearn = d_tree_sklearn.predict(X_test)
print(accuracy(np.array(y_sklearn), np.array(Y_test)))

classes = set(np.array(Y_train).ravel())
def five_fold_validation(X, y, depth=5):
    """Five-fold cross-validation on iris (150 rows, chunks of 30)."""
    accs = []

    # 5th chunk (rows 120-149) as test data
    clf = DecisionTree(criterion="information_gain", max_depth=depth)
    clf.fit(pd.DataFrame(X[0:120]), pd.Series(y[0:120], dtype="category"))
    y_hat = clf.predict(pd.DataFrame(X[120:]))
    accs.append(accuracy(pd.Series(y_hat), pd.Series(y[120:])))

    # 3rd chunk (rows 60-89) as test data
    clf = DecisionTree(criterion="information_gain", max_depth=depth)
    pass_X = pd.DataFrame(np.append(X[90:], X[0:60], axis=0))
    pass_y = pd.Series(np.append(y[90:], y[0:60], axis=0), dtype="category")
    clf.fit(pass_X, pass_y)
    y_hat = clf.predict(pd.DataFrame(X[60:90]))
    accs.append(accuracy(pd.Series(y_hat), pd.Series(y[60:90])))

    # 4th chunk (rows 90-119) as test data
    clf = DecisionTree(criterion="information_gain", max_depth=depth)
    clf.fit(pd.DataFrame(np.append(X[120:], X[0:90], axis=0)),
            pd.Series(np.append(y[120:], y[0:90], axis=0), dtype="category"))
    y_hat = clf.predict(pd.DataFrame(X[90:120]))
    accs.append(accuracy(pd.Series(y_hat), pd.Series(y[90:120])))

    # 1st chunk (rows 0-29) as test data
    clf = DecisionTree(criterion="information_gain", max_depth=depth)
    clf.fit(pd.DataFrame(X[30:]), pd.Series(y[30:], dtype="category"))
    y_hat = clf.predict(pd.DataFrame(X[0:30]))
    accs.append(accuracy(pd.Series(y_hat), pd.Series(y[0:30])))

    # 2nd chunk (rows 30-59) as test data
    clf = DecisionTree(criterion="information_gain", max_depth=depth)
    clf.fit(pd.DataFrame(np.append(X[0:30], X[60:], axis=0)),
            pd.Series(np.append(y[0:30], y[60:], axis=0), dtype="category"))
    y_hat = clf.predict(pd.DataFrame(X[30:60]))
    accs.append(accuracy(pd.Series(y_hat), pd.Series(y[30:60])))

    print("Individual Accuracies:")
    print(*accs)
    print("Average Accuracy:")
    print(sum(accs) / 5)
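# The five hand-written folds above all follow one pattern; a compact,
# equivalent loop-based sketch (same 30-row folds, same metric) for comparison:
def five_fold_validation_loop(X, y, depth=5):
    accs = []
    for k in range(5):
        lo, hi = 30 * k, 30 * (k + 1)
        # Train on everything outside rows [lo, hi), test on rows [lo, hi)
        X_tr = pd.DataFrame(np.append(X[:lo], X[hi:], axis=0))
        y_tr = pd.Series(np.append(y[:lo], y[hi:], axis=0), dtype="category")
        clf = DecisionTree(criterion="information_gain", max_depth=depth)
        clf.fit(X_tr, y_tr)
        y_hat = clf.predict(pd.DataFrame(X[lo:hi]))
        accs.append(accuracy(pd.Series(y_hat), pd.Series(y[lo:hi])))
    print("Average Accuracy:", sum(accs) / 5)
    return accs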
df = pd.read_excel('realestate.xlsx')
df = df.drop('No', axis=1)

X = df.drop('y', axis=1)
y = df['y']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

print("Scikit-learn decision tree")
Dtree = tree.DecisionTreeRegressor()
Dtree.fit(X_train, y_train)
pred = Dtree.predict(X_test)
print("MAE and STDDEV are", mean_absolute_error(pred, y_test), np.std(np.abs(pred - y_test)))

print("Our Decision Tree")
my_tree = DecisionTree(criterion="a", max_depth=5)  # criterion string appears to be a placeholder for regression
my_tree.fit(X_train, y_train)
y_hat = my_tree.predict(X_test)
print("RMSE", rmse(y_hat, y_test))
print("MAE", mae(y_hat, y_test))
np.random.seed(42)

# Read real-estate data set
# ...
# data = pd.read_csv(r'C:\Users\Anshuman Yadav\Documents\Real.csv')
X_train, X_test, Y_train, Y_test = train_test_split(data[data.columns[1:-1]], data[data.columns[-1]])
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
Y_train = Y_train.reset_index(drop=True)
Y_test = Y_test.reset_index(drop=True)

# Ad-hoc type flags read by the custom DecisionTree
X_train.dtype = "d"
X_test.dtype = "d"

tree = DecisionTree("ad", max_depth=25)  # criterion string appears to be a placeholder for regression
tree.fit(X_train, Y_train)
y_pred = tree.predict(X_test)
print("MAE my tree:")
print(mae(np.array(Y_test), np.array(y_pred)))
print("RMSE my tree:")
print(rmse(np.array(Y_test), np.array(y_pred)))

d_tree_sklearn = tree5.DecisionTreeRegressor()
d_tree_sklearn = d_tree_sklearn.fit(X_train, Y_train)
y_sklearn = d_tree_sklearn.predict(X_test)
print("MAE sklearn:")
print(mae(np.array(Y_test), np.array(y_sklearn)))
print("RMSE sklearn:")
print(rmse(np.array(Y_test), np.array(y_sklearn)))
# 70:30 train-test split
split_idx = int(0.7 * data.shape[0])
X = data.iloc[:split_idx, :-1]
X_test = data.iloc[split_idx:, :-1]
y = data.iloc[:split_idx, -1]
y_test = data.iloc[split_idx:, -1]

maxdepth = 4

# Building a decision tree with my model
criteria = 'information_gain'
mytree = DecisionTree(criterion=criteria, max_depth=maxdepth)  # split based on information gain
mytree.fit(X, y)
mytree.plot()

print("My Model")
y_hat = mytree.predict(X)
print("Train Scores:")
print('\tRMSE: ', rmse(y_hat, y))
print('\tMAE: ', mae(y_hat, y))
y_test_hat = mytree.predict(X_test)
print("Test Scores:")
print('\tRMSE: ', rmse(y_test_hat, y_test))
print('\tMAE: ', mae(y_test_hat, y_test))

###################################################################################
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tree.base import DecisionTree
from metrics import *
from pprint import pprint

np.random.seed(42)

# Read IRIS data set
# ...
#
tree = DecisionTree(criterion='information_gain', max_depth=10)  # split based on information gain
tree.output = "category"
tree.input = "real"
df = pd.read_csv("iris.data", names=['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'label'])
train_data, test_data = tree.train_test_split(df)
sub_tree = tree.decision_tree_algorithm(train_data)
tree.tree = sub_tree

rows, columns = test_data.values.shape
y_hat = tree.predict(test_data.iloc[:, 0:columns - 1])
y = test_data.iloc[:, -1]
print('Accuracy: ', accuracy(y_hat, y))
for cls in y.unique():
    print('Class Name: ', cls)
    print('Precision: ', precision(y_hat, y, cls))
    print('Recall: ', recall(y_hat, y, cls))
    print()
# Build feature/label containers from the raw iris rows
X_data, y_data = [], []
for row in iris_data:
    X_data.append(row[0:4])
    y_data.append(row[4])
X_data = pd.DataFrame(data=X_data)
y_data = pd.Series(data=y_data, dtype="category")

# Defining train-test split
split_idx = int(0.7 * len(iris_data))
X = X_data.iloc[:split_idx, :]
X_test = X_data.iloc[split_idx:, :]
y = y_data.iloc[:split_idx]
y_test = y_data.iloc[split_idx:]

# Training and testing
for criteria in ['information_gain', 'gini_index']:
    tree = DecisionTree(criterion=criteria, max_depth=3)
    # Build decision tree
    tree.fit(X, y)
    # Predict
    y_hat = tree.predict(X)
    y_test_hat = tree.predict(X_test)
    tree.plot()
    print('Criteria :', criteria)
    print('Train Accuracy: ', accuracy(y_hat, y))
    print('Test Accuracy: ', accuracy(y_test_hat, y_test))
    # Precision and recall for each class
    for cls in y.unique():
        print("Class =", cls)
        print('Precision: ', precision(y_test_hat, y_test, cls))
        print('Recall: ', recall(y_test_hat, y_test, cls))
        # Tail of the AdaBoost predict method: for each sample, pick the class
        # with the largest weighted vote.
        for key in classes:
            if classes[key] > m:
                finclass = key
                m = classes[key]
        final_preds.append(finclass)
    return pd.Series(final_preds, dtype="category")


N = 30
P = 2
NUM_OP_CLASSES = 2
n_estimators = 3
X = pd.DataFrame(np.abs(np.random.randn(N, P)))
y = pd.Series(np.random.randint(NUM_OP_CLASSES, size=N), dtype="category")

criteria = 'information_gain'
tree = DecisionTree(criterion=criteria, max_depth=1)
Classifier_AB = AdaBoostClassifier(base_estimator=tree, n_estimators=n_estimators)
Classifier_AB.fit(X, y)
y_hat = Classifier_AB.predict(X)
# [fig1, fig2] = Classifier_AB.plot()
print('Criteria :', criteria)
print('Accuracy: ', accuracy(y_hat, y))
for cls in y.unique():
    print('Precision: ', precision(y_hat, y, cls))
    print('Recall: ', recall(y_hat, y, cls))

print('\nIRIS DATASET')
# AdaBoostClassifier on the iris data set, using the entire data set with
# sepal width and petal width as the two features
dataset = pd.read_csv("tree/iris.data", delimiter=",", header=None)
a = dataset[1]
# N = 30
# P = 5
############################################################################################################
# DISCRETE INPUT, DISCRETE OUTPUT
fit_time = []
predict_time = []
for N in range(2, 10):
    for P in range(100, 120):
        # Note: P controls the number of categories here; the frame always has 5 columns
        X = pd.DataFrame({
            i: pd.Series(np.random.randint(P, size=N), dtype="category")
            for i in range(5)
        })
        y = pd.Series(np.random.randint(P, size=N), dtype="category")
        tree = DecisionTree(criterion="a", max_depth=8)  # criterion string appears to be a placeholder
        start = time.time()
        tree.fit(X, y)
        end = time.time()
        fit_time.append(end - start)
        start = time.time()
        y_hat = tree.predict(X)
        end = time.time()
        predict_time.append(end - start)

plt.plot(fit_time)
plt.ylabel('DIDO : Fit time', fontsize=16)
plt.show()
plt.plot(predict_time)
plt.ylabel('DIDO : Predict time', fontsize=16)
plt.show()
import matplotlib.pyplot as plt
from tree.base import DecisionTree
from metrics import *

np.random.seed(42)

# Test case 1
# Real Input and Real Output
N = 30
P = 5
X = pd.DataFrame(np.random.randn(N, P))
y = pd.Series(np.random.randn(N))

for criteria in ['information_gain', 'gini_index']:
    # Split based on Inf. Gain
    tree = DecisionTree(criterion=criteria)
    tree.fit(X, y)
    y_hat = tree.predict(X)
    tree.plot()
    print('Criteria :', criteria)
    print('RMSE: ', rmse(y_hat, y))
    print('MAE: ', mae(y_hat, y))

# Test case 2
# Real Input and Discrete Output
N = 30
P = 5
X = pd.DataFrame(np.random.randn(N, P))
y = pd.Series(np.random.randint(P, size=N), dtype="category")
import matplotlib.pyplot as plt
from tree.base import DecisionTree
from metrics import *
from sklearn import tree as sktree
from sklearn import metrics
from sklearn.model_selection import train_test_split
import numpy as np
from pprint import pprint

np.random.seed(42)

# Read real-estate data set
# ...
#
tree = DecisionTree(criterion='information_gain', max_depth=10)  # split based on information gain
tree.output = "real"  # real-valued target (regression)
tree.input = "real"
df = pd.read_excel("Real estate valuation data set.xlsx",
                   names=['No', 'tran_date', 'age', 'distance_mrt', 'stores', 'lat', 'long', 'price'])
df = df.drop('No', axis=1)
train_data, test_data = tree.train_test_split(df)
sub_tree = tree.regression_tree_algorithm(train_data)  # fit on the training split only
print(sub_tree)
tree.tree = sub_tree
rows, columns = test_data.values.shape
y_hat = tree.predict(test_data.iloc[:, 0:columns - 1])
y = test_data.iloc[:, -1]
# ..
# Function to create fake data (take inspiration from usage.py)
# ...
# ..other functions

# Test case 1
# Real Input and Real Output
N = 50
P = 5
X = pd.DataFrame(np.random.randn(N, P))
y = pd.Series(np.random.randn(N))
try:
    for criteria in ['information_gain', 'gini_index']:
        tree = DecisionTree(criterion=criteria, max_depth=10)  # split based on information gain
        start = timeit.default_timer()
        tree.fit(X, y)
        stop = timeit.default_timer()
        print('Real Input and Real Output Time - Build Tree: ', stop - start)
        start = timeit.default_timer()
        y_hat = tree.predict(X)
        stop = timeit.default_timer()
        print('Real Input and Real Output Time - Predict Tree: ', stop - start)
except Exception:
    pass

# Test case 2
# Real Input and Discrete Output
X = pd.DataFrame(np.random.randn(N, P))
from ensemble.bagging import BaggingClassifier
from tree.base import DecisionTree  # or use the sklearn decision tree
from linearRegression.linearRegression import LinearRegression

########### BaggingClassifier ###################
N = 30
P = 2
NUM_OP_CLASSES = 2
n_estimators = 3
X = pd.DataFrame(np.abs(np.random.randn(N, P)))
y = pd.Series(np.random.randint(NUM_OP_CLASSES, size=N), dtype="category")

criteria = 'information_gain'
tree = DecisionTree(criterion=criteria)
Classifier_B = BaggingClassifier(base_estimator=tree, n_estimators=n_estimators)
Classifier_B.fit(X, y)
y_hat = Classifier_B.predict(X)
# [fig1, fig2] = Classifier_B.plot()
print('Criteria :', criteria)
print('Accuracy: ', accuracy(y_hat, y))
print()
for cls in y.unique():
    print('Class: ', cls)
    print('Precision: ', precision(y_hat, y, cls))
    print('Recall: ', recall(y_hat, y, cls))
    print()
def cross_validation_5_fold(X, y, depth):
    X_original = X
    y_original = y

    # 5th chunk (rows 120-149) as test data
    clf = DecisionTree(criterion="a", max_depth=depth)
    clf.fit(pd.DataFrame(X[0:120]), pd.Series(y[0:120], dtype="category"))
    y = y[120:]
    y_hat = clf.predict(pd.DataFrame(X[120:]))
    print(accuracy(pd.Series(y_hat), pd.Series(y)))

    X = X_original
    y = y_original
    # 3rd chunk (rows 60-89) as test data
    clf = DecisionTree(criterion="a", max_depth=depth)
    clf.fit(pd.DataFrame(np.append(X[90:], X[0:60], axis=0)),
            pd.Series(np.append(y[90:], y[0:60], axis=0), dtype="category"))
    y = y[60:90]
    y_hat = clf.predict(pd.DataFrame(X[60:90]))
    print(accuracy(pd.Series(y_hat), pd.Series(y)))

    X = X_original
    y = y_original
    # 4th chunk (rows 90-119) as test data
    clf = DecisionTree(criterion="a", max_depth=depth)
    clf.fit(pd.DataFrame(np.append(X[120:], X[0:90], axis=0)),
            pd.Series(np.append(y[120:], y[0:90], axis=0), dtype="category"))
    y = y[90:120]
    y_hat = clf.predict(pd.DataFrame(X[90:120]))
    print(accuracy(pd.Series(y_hat), pd.Series(y)))

    X = X_original
    y = y_original
    # 1st chunk (rows 0-29) as test data
    clf = DecisionTree(criterion="a", max_depth=depth)
    clf.fit(pd.DataFrame(X[30:]), pd.Series(y[30:], dtype="category"))
    y = y[0:30]
    y_hat = clf.predict(pd.DataFrame(X[0:30]))
    print(accuracy(pd.Series(y_hat), pd.Series(y)))

    X = X_original
    y = y_original
    # 2nd chunk (rows 30-59) as test data
    clf = DecisionTree(criterion="a", max_depth=depth)
    clf.fit(pd.DataFrame(np.append(X[0:30], X[60:], axis=0)),
            pd.Series(np.append(y[0:30], y[60:], axis=0), dtype="category"))
    y = y[30:60]
    y_hat = clf.predict(pd.DataFrame(X[30:60]))
    print(accuracy(pd.Series(y_hat), pd.Series(y)))
from tree.base import DecisionTree
from metrics import *

np.random.seed(42)

# Read IRIS data set
# ...
#
iris = pd.read_csv('iris.csv')
iris = iris.sample(frac=1).reset_index(drop=True)
split_at = int(0.7 * iris.shape[0])
X_train = iris.iloc[:split_at, :-1]
y_train = iris.iloc[:split_at, -1]
X_test = iris.iloc[split_at:, :-1]
y_test = iris.iloc[split_at:, -1]

model = DecisionTree()
model.fit(X_train, y_train)
y_out = model.predict(X_test)
print("Accuracy is: ", accuracy(y_out, y_test))
for group in np.unique(y_test):
    print("Precision of {} is: {}".format(group, precision(y_out, y_test, group)))
    print("Recall of {} is: {}".format(group, recall(y_out, y_test, group)))

# Accuracy of all five models (5-fold)
fold = int(0.2 * iris.shape[0])
for i in range(5):
    n_split1 = i * fold
    n_split2 = n_split1 + fold
    X_test1 = iris.iloc[n_split1:n_split2, :-1].reset_index(drop=True)
    y_test1 = pd.Series(list(iris.iloc[n_split1:n_split2, -1]))
np.random.seed(42)

# Read IRIS data set
# ...
#
dataset = load_iris()
X, y = dataset.data, dataset.target
# from sklearn.utils import shuffle
# X, y = shuffle(X, y, random_state=0)

print("fit model for iris dataset for 70-30 division")
clf = DecisionTree(criterion="a", max_depth=5)  # criterion string appears to be a placeholder
clf.fit(pd.DataFrame(X[0:120]), pd.Series(y[0:120], dtype="category"))
y = pd.Series(y[120:])
y_hat = clf.predict(pd.DataFrame(X[120:]))
print("Accuracy", accuracy(pd.Series(y_hat), y))
for cls in y.unique():
    print('Precision for class', cls, ":", precision(y_hat, y, cls))
    print('Recall for class', cls, ":", recall(y_hat, y, cls))

def cross_validation_5_fold(X, y, depth):
    X_original = X
    y_original = y
# ...
# ..other functions

""" Case: RIRO """
learning_time = []
predict_time = []
for Ni in range(1, 7):
    for step in range(6, 42):
        N = Ni
        P = step
        X = pd.DataFrame(np.random.randn(N, P))
        y = pd.Series(np.random.randn(N))
        tree = DecisionTree(criterion="information_gain")
        start_time = time.time()  # time the fit only, not tree construction
        tree.fit(X, y)
        end_time = time.time()
        learning_time.append(end_time - start_time)
        start_time = time.time()
        y_hat = tree.predict(X)
        end_time = time.time()
        predict_time.append(end_time - start_time)

plt.plot(learning_time)
plt.ylabel('RIRO : Fit time', fontsize=16)
plt.show()
plt.plot(predict_time)
plt.ylabel('RIRO : Predict time', fontsize=16)
plt.show()
import numpy as np
import matplotlib.pyplot as plt
from tree.base import DecisionTree
from metrics import *
from sklearn.tree import DecisionTreeRegressor

np.random.seed(42)

# Read real-estate data set
# ...
#
estate = pd.read_csv('Real_estate.csv', index_col='No', dtype=float)
estate = estate.sample(frac=1).reset_index(drop=True)
split_at = int(0.3 * estate.shape[0])  # note: trains on the first 30% only
X_train = estate.iloc[:split_at, :-1]
y_train = estate.iloc[:split_at, -1]
X_test = estate.iloc[split_at:, :-1]
y_test = estate.iloc[split_at:, -1]

model = DecisionTree(max_depth=2)
model.fit(X_train, y_train)
y_out = model.predict(X_test)
print("RMSE is: ", rmse(y_out, y_test))
print("MAE is: ", mae(y_out, y_test))

model2 = DecisionTreeRegressor(max_depth=2)
model2.fit(X_train, y_train)
y_out = model2.predict(X_test)
print("RMSE of sklearn is: ", rmse(y_out, y_test))
print("MAE of sklearn is: ", mae(y_out, y_test))
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tree.base import DecisionTree
from metrics import *

np.random.seed(42)

N = 30
P = 5
X = pd.DataFrame({
    i: pd.Series(np.random.randint(P, size=N), dtype="category")
    for i in range(5)
})
y = pd.Series(np.random.randint(P, size=N), dtype="category")

print('\n\n## Discrete Input and Discrete Output ##')
for criteria in ['information_gain']:
    tree = DecisionTree(criterion=criteria, max_depth=np.inf)  # split based on information gain
    tree.fit(X, y)
    y_hat = tree.predict(X)
    tree.plot()
    print('Criteria :', criteria)
    print('Accuracy: ', accuracy(y_hat, y))
    for cls in y.unique():
        print(cls)
        print('Precision: ', precision(y_hat, y, cls))
        print('Recall: ', recall(y_hat, y, cls))
def experiment(N=[30, ], M=[5, ], exp="sklearn", ios=["dido", ]):
    '''
    Creates a new experiment while varying one parameter, either M or N.
    N: list of sample counts
    M: list of attribute counts
    exp: "sklearn" or "mine"
    ios: list of input/output types; can be rido, dido, riro, diro
    '''
    results = np.zeros((len(ios), len(N), len(M), 4))
    assert len(N) == 1 or len(M) == 1
    pbar = tqdm(total=len(ios) * len(N) * len(M))
    for io in ios:
        for n in N:
            for m in M:
                X, y = datagen(N=n, M=m, io=io)
                if exp == "sklearn":
                    if io == "diro" or io == "riro":
                        tree = DecisionTreeRegressor()
                    else:
                        tree = DecisionTreeClassifier(criterion="entropy")
                else:
                    exp = "MyTree"
                    tree = DecisionTree()
                start = datetime.now()
                tree.fit(X, y)
                end = datetime.now()
                learn = (end - start).total_seconds()
                start = datetime.now()
                tree.predict(X)
                end = datetime.now()
                predict = (end - start).total_seconds()
                results[ios.index(io), N.index(n), M.index(m)] = np.array([n, m, learn, predict])
                pbar.update(1)

    # Plotting for learning tasks
    plt.figure()
    if len(N) > 1 or len(M) > 1:
        if len(N) > 1:
            for io in ios:
                plt.plot(results[ios.index(io), :, 0, 0], results[ios.index(io), :, 0, 2], label=io)
            plt.title("Learning Plot for N vs time for " + exp)
            plt.xlabel("Varying N")
        else:
            for io in ios:
                plt.plot(results[ios.index(io), 0, :, 1], results[ios.index(io), 0, :, 2], label=io)
            plt.title("Learning Plot for M vs time for " + exp)
            plt.xlabel("Varying M")
        plt.legend()
        plt.ylabel("Time taken in seconds")
        plt.savefig(os.path.join(experiments, "learn.png"))

    # Plotting for prediction tasks
    plt.figure()
    if len(N) > 1 or len(M) > 1:
        if len(N) > 1:
            for io in ios:
                plt.plot(results[ios.index(io), :, 0, 0], results[ios.index(io), :, 0, 3], label=io)
            plt.title("Prediction Plot for N vs time for " + exp)
            plt.xlabel("Varying N")
        else:
            for io in ios:
                plt.plot(results[ios.index(io), 0, :, 1], results[ios.index(io), 0, :, 3], label=io)
            plt.title("Prediction Plot for M vs time for " + exp)
            plt.xlabel("Varying M")
        plt.legend()
        plt.ylabel("Time taken in seconds")
        plt.savefig(os.path.join(experiments, "predict.png"))
    return results
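# Hypothetical invocations of experiment (assumes datagen, tqdm, and the
# `experiments` output directory from this file are available): first vary N
# with M fixed using sklearn trees, then vary M with N fixed using the
# custom DecisionTree.
if __name__ == "__main__":
    experiment(N=list(range(10, 60, 10)), M=[5], exp="sklearn",
               ios=["dido", "rido", "riro", "diro"])
    experiment(N=[30], M=list(range(2, 12, 2)), exp="mine",
               ios=["dido", "riro"])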
def train_test_split(df):
    # Shuffle row indices and split 60/40 into train and test
    index = np.arange(df.shape[0])
    np.random.shuffle(index)
    n_rows, _ = df.values.shape
    train_size = int(n_rows * 0.6)
    train_data = df.loc[index][0:train_size]
    test_data = df.loc[index][train_size:]
    return train_data, test_data

df = pd.read_csv("iris.data", names=['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'label'])
df = df[['sepal_width', 'petal_width', 'label']]
df = df.replace({'label': {'Iris-setosa': 0, 'Iris-versicolor': 0, 'Iris-virginica': 1}})
train_data, test_data = train_test_split(df)
X = train_data[['sepal_width', 'petal_width']]
y = train_data[['label']]['label']

tree_iris = DecisionTree(criterion=criteria, max_depth=1)
Classifier_AB_iris = AdaBoostClassifier(base_estimator=tree_iris, n_estimators=n_estimators)
Classifier_AB_iris.fit(X, y)
y_hat = Classifier_AB_iris.predict(X)
# [fig1, fig2] = Classifier_AB.plot()
print('Criteria :', criteria)
print('Accuracy: ', accuracy(y_hat, y))
print()
for cls in y.unique():
    if cls == 1:
        print('Category: Iris-virginica')
    else:
        print('Category: Not Iris-virginica')
    print('Precision: ', precision(y_hat, y, cls))
    print('Recall: ', recall(y_hat, y, cls))
    print()
np.random.seed(42)

########### AdaBoostClassifier on Real Input and Discrete Output ###################
print("-----------------------------------------------------------")
print("Decision stump on random data")
print("-----------------------------------------------------------")
N = 30
P = 2
NUM_OP_CLASSES = 2
n_estimators = 3
X = pd.DataFrame(np.abs(np.random.randn(N, P)))
y = pd.Series(np.random.randint(NUM_OP_CLASSES, size=N), dtype="category")

criteria = 'information_gain'
tree = DecisionTree(criterion=criteria)
n_samples = X.shape[0]
sample_weights = [1 / n_samples] * n_samples  # uniform initial weights
tree.fit(X, y, sample_weights)
yhat = pd.Series(tree.predict(X))
print('Criteria :', criteria)
print('Accuracy: ', accuracy(yhat, y))
for cls in y.unique():
    print("*** Class: " + str(cls) + " ***")
    print('Precision: ', precision(yhat, y, cls))
    print('Recall: ', recall(yhat, y, cls))

print("-----------------------------------------------------------")
print("Adaboost on random data")
print("-----------------------------------------------------------")