def test_run(xTrainFile, yTrainFile): X = np.genfromtxt(xTrainFile,delimiter=",") Y = np.genfromtxt(yTrainFile,delimiter=",") M = X.shape[0] #maxDepth ######################## nBags = 3 YHat = np.zeros((M,nBags)) rforest = [None] * nBags maxDepth = 10 nFeatures = 91 minParent = 8 for l in range(1,nBags): print "bags", l Xi,Yi = ml.bootstrapData(X,Y, M) rforest[l] = dtree.treeRegress() rforest[l].train(Xi,Yi,maxDepth=maxDepth) YHat[:,l] = rforest[l].predict(X)[:,0] write_to_kaggle(YHat)
def train_from_triples(models, triple_file_name, destination_folder): Xdata, Ydata, Xtedata = init() Xs, _ = ml.transforms.rescale(Xdata) Ys = Ydata Xtes, _ = ml.transforms.rescale(Xtedata) print '----Training models------' #Xi, Yi = Xdata[0:10000], Ydata[0:10000] #Xs, Ys = Xs[0:10000], Ys[0:10000] Xi, Yi = Xdata, Ydata Xs, Ys = Xs, Ydata f = open(triple_file_name, 'r') triples = f.readlines() f.close() for triple in triples: nf = int(triple.split(',')[0].strip()) d = int(triple.split(',')[1].strip()) l = int(triple.split(',')[2].strip()) print 'Now Training (nf,d,ml):', nf, d, l #dt = ml.dtree.treeClassify(Xi, Yi, maxDepth=d, nFeatures=nf, minLeaf=l) #models.append(dt) #Ypred = dt.predict(Xi) #print 'Training error with triple on unscaled: ', triple.strip(), 'is', computeError(Ypred[:,np.newaxis], Yi) Xi, Yi = ml.bootstrapData(Xs, Ys, Xs.shape[0]) dt = ml.dtree.treeClassify(Xi, Yi, maxDepth=d, nFeatures=nf, minLeaf=l) Ypred = dt.predict(Xs) print 'Training error with triple on scaled: ', triple.strip( ), 'is', computeError(Ypred[:, np.newaxis], Ys) models.append(dt) #save_models(models, destination_folder) print '-----Predicting the scores------' kaggle_predict(models, True)
def test_run(xTrainFile, yTrainFile): X = np.genfromtxt(xTrainFile,delimiter=",") Y = np.genfromtxt(yTrainFile,delimiter=",") TEST = np.genfromtxt("/home/john/Downloads/kaggle.X1.test.txt",delimiter=",") M = X.shape[0] #maxDepth ######################## nBags = 125 rforest = [None] * nBags maxDepth = 40 nFeatures = 100 minParent = 8 for l in range(nBags): print "bags", l Xi,Yi = ml.bootstrapData(X,Y, M) rforest[l] = dtree.treeRegress() rforest[l].train(Xi,Yi,maxDepth=maxDepth) mTest = TEST.shape[0] predict = np.zeros( (mTest, nBags) ) for i in range(nBags): predict[:,i] = rforest[i].predict(TEST).T[0] predict = predict[:,0] _write_to_kaggle("treebag.csv",predict)
def train(models, lower, upper, destination_folder): Xdata, Ydata, Xtedata = init() #X, Y = Xdata[0:10], Ydata[0:10] X, Y = Xdata, Ydata X, _ = ml.transforms.rescale(X) nFolds = 5 trError = [] testError = [] thresholdError = 0.7 nFolds = 5 #leaves = [5, 7, 10, 13, 15, 18, 21, 24, 27, 30, 33, 36, 40] for nFeatures in range(lower, upper): for depth in [10, 15, 16, 17, 19, 21, 30, 45, 50]: for minLeaf in [ 5, 7, 10, 13, 20, 30, 64, 128, 150, 200, 250, 500, 1000, 1250 ]: #print 'depth', depth print 'Features, Depth, minLeaf, modelIndex: ', (nFeatures, depth, minLeaf, len(models)) start = time.time() Xi, Yi = ml.bootstrapData(X, Y, X.shape[0]) errTr, errTe, m = train_fold(Xi, Yi, nFolds, nFeatures, depth, minLeaf) end = time.time() #models.extend(m) trError.append(errTr) testError.append(errTe) print 'Average training erorr', trError[-1] print 'Average test error', testError[-1] print 'Total time for model', ( end - start), 'Time per split: ', ((end - start) / (1.0 * nFolds)) if testError[-1] < 0.29 or trError[-1] < 0.20: print ':::LOW ERR::: (f,d,ml,len_model,teE, trE', \ (nFeatures, depth, minLeaf, len(models), testError[-1], trError[-1]) # TODO: If if erorr is less than threshold, then add it to the models array # plt.plot(range(0, len(trError)), trError, 'b-') # plt.plot(range(0, len(testError)), testError, 'g-') # plt.show() f = open('training_error' + str(lower) + '_' + str(upper), 'w') p.dump(trError, f) f.close() f = open('test_error' + str(lower) + '_' + str(upper), 'w') p.dump(testError, f) f.close() save_models(models, destination_folder)
def __init__(self, X, Y, nFeatures, maxDepth, minLeaf, number_of_learner):
    """Train ``number_of_learner`` tree classifiers on bootstrap resamples.

    Each learner is a ``ml.dtree.treeClassify`` model fitted on its own
    ``ml.bootstrapData(X, Y)`` draw with the given tree options; the trained
    models are stored in ``self.learners``.
    """
    # The unpacking requires X.shape to have exactly two entries; the values
    # themselves are not used afterwards.
    n_rows, n_cols = X.shape
    self.number_of_learner = number_of_learner
    self.learners = [0] * self.number_of_learner

    tree_options = dict(nFeatures=nFeatures, maxDepth=maxDepth, minLeaf=minLeaf)
    for slot in range(self.number_of_learner):
        boot_x, boot_y = ml.bootstrapData(X, Y)
        self.learners[slot] = ml.dtree.treeClassify(boot_x, boot_y, **tree_options)
def __init__(self, X, Y, Nbags=80, maxDepth=20, nFeatures=20):
    """Build ``Nbags`` bootstrap-trained trees and wrap them in a BaggedTree.

    Every tree is a ``ml.dtree.treeClassify`` model trained on a bootstrap
    draw with as many rows as ``X``. The trees are kept in ``self.bags``; the
    wrapping ``BaggedTree`` is stored in ``self.bt`` and its ``classes``
    attribute is set to the unique values of ``Y``.
    """
    tree_options = {'maxDepth': maxDepth, 'nFeatures': nFeatures}
    self.bags = []
    for _ in range(Nbags):
        sample_x, sample_y = ml.bootstrapData(X, Y, X.shape[0])
        self.bags.append(ml.dtree.treeClassify(sample_x, sample_y, **tree_options))

    self.bt = BaggedTree(self.bags)
    self.bt.classes = np.unique(Y)
def TrainEnsemble(): Xtr, Ytr = X[:10000, :], Y[:10000] Xval, Yval = X[10000:20000, :], Y[10000:20000] nFeatures = [5, 6, 7, 8, 9, 10, 11, 12, 13, 14] for nFeature in nFeatures: print "=" * 50 print 'Training Decision Trees with ', str(nFeature), ' features' bags = [1, 5, 10, 25, 45, 60, 75] bagTrainError = [] bagValidationError = [] ensembles = [] for bag in bags: print 'Training ', bag, ' decision tree(s)' decisionTrees = [None] * bag trainingError = [] for i in range(0, bag, 1): # Drawing a random training sample every single time Xi, Yi = ml.bootstrapData(Xtr, Ytr, n_boot=10000) decisionTrees[i] = DecisionTreeClassifier( max_features=nFeature) decisionTrees[i] = decisionTrees[i].fit(Xi, Yi) # decisionTrees[i] = ml.dtree.treeClassify(Xi, Yi, maxDepth=16, minLeaf=256, nFeatures=nFeature) YHatValidation = np.zeros((Xval.shape[0], bag)) YHatTraining = np.zeros((Xtr.shape[0], bag)) for i in range(0, len(decisionTrees), 1): decisionTree = decisionTrees[i] YHatValidation[:, i] = decisionTree.predict(Xval) YHatTraining[:, i] = decisionTree.predict(Xtr) # YHatValidation = np.sum(YHatValidation, axis=1)/float(bag) YHatValidation = np.mean(YHatValidation, axis=1) YHatValidation[YHatValidation > 0.5] = 1 YHatValidation[YHatValidation <= 0.5] = 0 # YHatTraining = np.sum(YHatTraining, axis=1)/float(bag) YHatTraining = np.mean(YHatTraining, axis=1) YHatTraining[YHatTraining > 0.5] = 1 YHatTraining[YHatTraining <= 0.5] = 0 bagValidationError.append(np.mean(YHatValidation != Yval)) bagTrainError.append(np.mean(YHatTraining != Ytr)) ensembles.append(decisionTrees) index = np.argmin(bagValidationError) print 'Minimum Validation Error = ', bagValidationError[index] print 'Number of learners in Bag = ', bags[index]
def setup_code(xTrainFile, yTrainFile): X1 = np.genfromtxt(xTrainFile,delimiter=",") Y = np.genfromtxt(yTrainFile,delimiter=",") Xtr,Xte,Ytr,Yte = ml.splitData(X1,Y,0.80) M = Xtr.shape[0] Mv= Xte.shape[0] #maxDepth ######################## nBags = 6000 YtHat = np.zeros((M,nBags)) YvHat = np.zeros((Mv,nBags)) rforest = [None] * nBags maxDepth = 40 lowestMaxDepth = LowestMSE() nFeatures = 60 minParent = 8 for l in range(1,nBags): print "bags", l Xi,Yi = ml.bootstrapData(Xtr,Ytr, M) rforest[l] = dtree.treeRegress() rforest[l].train(Xi,Yi,maxDepth=maxDepth) YtHat[:,l] = rforest[l].predict(Xtr)[:,0] # predict on training data YvHat[:,l] = rforest[l].predict(Xte)[:,0] mseT = ((Ytr - YtHat[:,0:l].mean(axis=1))**2).mean() mseV = ((Yte - YvHat[:,0:l].mean(axis=1))**2).mean() lowestMaxDepth.set(mseV, l, maxDepth, minParent, l) print "Lowest" print lowestMaxDepth
# Bag-size sweep: train tree ensembles of increasing size on bootstrap
# resamples and collect their per-tree predictions. X is defined earlier in
# the script (not visible in this snippet).
Y = np.genfromtxt("data/Y_train.txt", delimiter=None)
# First 180000 rows are used for training, the remainder for validation.
Xtr, Ytr = X[:180000, :], Y[:180000]
Xval, Yval = X[180000:, :], Y[180000:]
bags = [1, 5, 10, 25, 45, 60, 75]  # ensemble sizes to try
bagTrainError = []
bagValidationError = []
ensembles = []
for bag in bags:
    print 'Training ', bag, ' decision trees'
    decisionTrees = [None] * bag
    trainingError = []
    for i in range(0, bag, 1):
        # Drawing a random training sample every single time
        Xi, Yi = ml.bootstrapData(Xtr, Ytr, n_boot=180000)
        decisionTrees[i] = ml.dtree.treeClassify(Xi, Yi, maxDepth=16, minLeaf=256, nFeatures=9)
    # One column of predictions per tree.
    YHatValidation = np.zeros((Xval.shape[0], bag))
    YHatTraining = np.zeros((Xtr.shape[0], bag))
    for i in range(0, len(decisionTrees), 1):
        decisionTree = decisionTrees[i]
        YHatValidation[:, i] = decisionTree.predict(Xval)
        YHatTraining[:, i] = decisionTree.predict(Xtr)
    # YHatValidation = np.sum(YHatValidation, axis=1)/float(bag)
    # Collapse the per-tree columns into their row-wise mean.
    YHatValidation = np.mean(YHatValidation, axis=1)
# Note: file is comma-delimited X = np.genfromtxt("data/trainX.txt", delimiter=',') Y = np.genfromtxt("data/trainY.txt", delimiter=',') # also load features of the test data (to be predicted) Xe1 = np.genfromtxt("data/devX.txt", delimiter=',') Ye1 = np.genfromtxt("data/devY.txt", delimiter=',') print X.shape print Y.shape nBag = 10 m, n = X.shape classifiers = [None] * nBag # Allocate space for learners for i in range(nBag): Xi, Yi = ml.bootstrapData(X, Y) classifiers[i] = ml.dtree.treeRegress( Xi, Yi, maxDepth=20, minParent=1024, nFeatures=60) # Train a model on data Xi, Yi #training errors trainingErrors = np.zeros( nBag) # Allocate space for predictions from each model for i in range(nBag): temp = np.sqrt(classifiers[i].mse(X, Y)) # Apply each classifier trainingErrors[i] = temp # Make overall prediction by majority vote #tE = np.mean(trainingErrors, axis=) # test on data Xtest predict = np.zeros(nBag) # Allocate space for predictions from each model
def __init__(self, learners): """Constructs a BaggedTree class with a set of learners. """ self.learners = learners def predictSoft(self, X): """Predicts the probabilities with each bagged learner and average over the results. """ n_bags = len(self.learners) preds = [self.learners[l].predictSoft(X) for l in range(n_bags)] return np.mean(preds, axis=0) n_bags = 7 bags = [] # self.learners for l in range(n_bags): # Each boosted data is the size of the original data. Xi, Yi = ml.bootstrapData(Xtr, Ytr, Xtr.shape[0]) # Train the model on that draw tree = ml.dtree.treeClassify(Xi, Yi, minParent=2**6, maxDepth=100, nFeatures=6) bags.append(tree) bt = BaggedTree(bags) bt.classes = np.unique(Y) print("{0:>15}: {1:.4f}".format('Train AUC', bt.auc(Xtr, Ytr))) print("{0:>15}: {1:.4f}".format('Validation AUC', bt.auc(Xva, Yva)))
# # (g) # learner.train(Xtr, Ytr, minParent=4, maxDepth=14, minLeaf=4) # Ypred = learner.predictSoft(Xte) # print(Ypred.shape) # # Now output a file with two columns, a row ID and a confidence in class 1: # np.savetxt('data/Yhat_dtree.txt', np.vstack((np.arange(len(Ypred)), Ypred[:, 1])).T, '%d, %.2f', header='ID,Prob1', comments='', delimiter=',') # # # problem 3 # # (a) ensemble = [None] * 50 for i in range(0, 50): print(i) Xtri, Ytri = ml.bootstrapData(Xtr, Ytr) ensemble[i] = ml.dtree.treeClassify(Xtri, Ytri, minParent=8, maxDepth=14, minLeaf=4, nFeatures=8) sizeArray = [1, 5, 10, 25, 50] Yhat_va = np.zeros(10000) Yhat_tr = np.zeros(90000) valid_err = [] train_err = [] for size in sizeArray:
# ## produce the actual learner svr_rbf = SVR(kernel="rbf", C=10, gamma=0.1) svr_learner = svr_rbf.fit(Xtr, Ytr) ensemble.add(svr_learner) # In[ ]: ### Andrew Fischer's learners tree_learner = Ensemble() # Find results from bagged tree for i in range(0, 50): time_start = time.time() x,y = ml.bootstrapData(X, Y, len(X)) tl = tree.DecisionTreeRegressor(max_features=50) tl.fit(x, y) tree_learner.add(tl) time_end = time.time() print("Iteration=" + str(i) + ", seconds=" + str(time_end - time_start)) ensemble.add(tree_learner) # In[ ]: #--------------------------------- Ensemble ---------------------------------# # store the learners we have in to a list, index them and then make predictions Yhat = ensemble.predict(Xe1)
dt = ml.dtree.treeClassify(Xt,Yt,minLeaf=8, minParent = 16, maxDepth = d) err_depth_tr[d] = ml.dtree.treeClassify.err(dt,Xt,Yt) err_depth_v[d] = ml.dtree.treeClassify.err(dt,Xv,Yv) xs = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] plt.plot(xs, err_depth_tr, '-r', xs, err_depth_v, '-b') plt.legend(('Training Error Rate', 'Validation Error Rate'),'upper right') plt.xlabel('Maximum Depth') plt.ylabel('error rate') plt.show() rf = [None]*25 Yt_hat = np.zeros((Yt.shape[0],25)) Yv_hat = np.zeros((Yv.shape[0],25)) for i in range(0,24,1): [Xi, Yi] = ml.bootstrapData(Xt, Yt, Xt.shape[0]) rf[i] = ml.dtree.treeClassify(Xi, Yi, minLeaf=8, minParent = 512, maxDepth = 7, nFeatures = 14) Yt_hat[:, i] = rf[i].predict(Xt) Yv_hat[:, i] = rf[i].predict(Xv) err_e_t = [None]*6 err_e_v = [None]*6 Yt_hat_e = Yt_hat[:, 0] err_e_t[0] = np.mean(Yt_hat_e.reshape(Yt.shape) != Yt) Yv_hat_e = Yv_hat[:, 0] err_e_v[0] = np.mean(Yv_hat_e.reshape(Yv.shape) != Yv) j=1 for i in [5, 10, 15, 20, 25]: Yt_hat_e = (np.mean(Yt_hat[:,0:i], axis=1)>0.5)
# RANDOM FORESTS
# 3 (A)

# In[58]:

# Random forest of size 25: every member is a tree classifier trained on its
# own bootstrap resample of the training data.
# Load data set X, Y for training the ensemble...
# (The original comment ended in a non-ASCII ellipsis character, which is a
# SyntaxError in Python 2 source without an encoding declaration.)
m,n = Xtr.shape
ensemble = [ None ] * 25  # Allocate space for learners
# NOTE: n is deliberately re-bound here -- from the column count of Xtr to
# the number of features each tree may consider.
n=7
for i in range(25):
    #ind = np.floor( m * np.random.rand(n) ).astype(int)
    #Xb, Yb = Xtr[ind,:],Ytr[ind]
    Xb,Yb=ml.bootstrapData(Xtr,Ytr)
    ensemble[i]=ml.dtree.treeClassify(Xb,Yb, maxDepth=5,minLeaf=256,nFeatures=n)

# In[15]:

# test on data Xva
mTest = Xva.shape[0]
predictTe = np.zeros( (mTest, 25) )  # Allocate space for predictions from each model
# Fix: this buffer holds predictions on Xtr, so it needs one row per training
# example. It was allocated with mTest rows, which only works when Xtr and
# Xva happen to have the same number of rows.
predictTr = np.zeros( (Xtr.shape[0], 25) )
for i in range(25):
    predictTe[:,i] = ensemble[i].predict(Xva)  # Apply each classifier
    predictTr[:,i] = ensemble[i].predict(Xtr)
# Row-wise mean of the 25 per-tree validation predictions.
predictTest = np.mean(predictTe, axis=1)
#X = VarianceThreshold(threshold=(.8*.2)).fit_transform(X) Xtr,Xte,Ytr,Yte = ml.splitData(X,Y,0.8) #testdat = open('testdat.csv','w') netbags = [] for iter in range(100): for moment in [0.3]: for learnRate in [0.05]: for epochs in [30]: for depth in [3]: for hidw in [8]: Xboot, Yboot = ml.bootstrapData(Xtr,np.array([Ytr]).T,Xtr.shape[0]//50) print Xboot.shape Yboot = Yboot.T print Yboot net = FeedForwardNetwork() w = X.shape[1] hw = hidw#8 inl = TanhLayer(w) net.addInputModule(inl) last = inl for i in range(3):
# Cross-validated bagging experiment: for each of nFolds folds, train nBag
# regression trees on bootstrap resamples of the fold's training part and
# record, for each ensemble size in `learners`, the mean of the individual
# trees' validation RMSEs.
print X.shape
print Y.shape
nBag = 101
learners = np.array([2, 5, 10, 20, 25, 50])  # ensemble sizes that are evaluated
classifiers = [None] * nBag  # Allocate space for learners
errT = np.zeros((len(learners), ))
nFolds = 10
errX = np.zeros((len(learners), nFolds))  # rows: ensemble sizes, columns: folds
for iFold in range(nFolds):
    [Xti, Xvi, Yti, Yvi] = ml.crossValidate(X, Y, nFolds, iFold)
    for i in range(nBag):
        Xi, Yi = ml.bootstrapData(Xti, Yti)
        classifiers[i] = ml.dtree.treeRegress(
            Xi, Yi, maxDepth=20, minParent=1024,
            nFeatures=60)  # Train a model on data Xi, Yi
    for i in range(len(learners)):
        learnerNum = learners[i]
        predict = np.zeros(
            (learnerNum))  # One slot per model for its validation RMSE
        for j in range(learnerNum):
            predict[j] = np.sqrt(classifiers[j].mse(
                Xvi, Yvi))  # Apply each classifier, calculate RMSE
        # NOTE(review): this averages the per-tree RMSEs; it is not the RMSE
        # of the averaged (bagged) prediction -- confirm that is intended.
        errX[i, iFold] = np.mean(predict)
# Average over folds, leaving one value per ensemble size.
errX = np.mean(errX, axis=1)
print errX.shape
Xv = np.genfromtxt("X_train.txt", delimiter=None)[10001:20000] # load the text file Yv = np.genfromtxt("Y_train.txt", delimiter=None)[10001:20000] # load the text file #test data Xte = np.genfromtxt("X_test.txt", delimiter=None) # load the text file ensemble = [] trainError = [] validError = [] predicts = [] aucs = [] for x in range(14, 21): ind = ml.bootstrapData(Xt, Yt, n_boot=x) ensem = ml.dtree.treeClassify(ind[0], ind[1], minLeaf=4) ## 0.55670, ensemble.append(ensem) trainError.append(ensem.err(Xt, Yt)) validError.append(ensem.err(Xv, Yv)) predicts.append(ensem.predict(Xte)) aucs.append(ensem.auc(Xv, Yv)) aucMean = 0 for x in aucs: aucMean = aucMean + x print aucMean / len(aucs) Ypred = [[x] for x in predicts[0]] for x in Ypred:
,minParent=512) Ypred = learnerTR.predictSoft(nXte) np.savetxt('Yhat_dtree.txt', np.vstack( (np.arange(len(Ypred)) , Ypred[:,1]) ).T, '%d, %.2f',header='ID,Prob1',comments='',delimiter=','); # Problem 3: Random Forests # Part A ensemble = [0]*25 Ytrhat = np.zeros((np.size(Ytr),25)) Ytehat = np.zeros((np.size(Yte),25)) # Evaluate for up to 25 learners. for i in range(25): Xb,Yb = ml.bootstrapData(Xtr,Ytr) ensemble[i] = ml.dtree.treeClassify(Xb,Yb,maxDepth=15 ,minLeaf=4,nFeatures=60) Ytrhat[:,i] = ensemble[i].predict(Xtr) Ytehat[:,i] = ensemble[i].predict(Xte) # Write down mseTR and mseTE for learners [1,5,10,25] mseTR = [] mseTE = [] for index, value in enumerate([1,5,10,25]): mseTR.append(np.mean( (Ytr-np.mean(Ytrhat[:,0:value],1))**2 )) mseTE.append(np.mean( (Yte-np.mean(Ytehat[:,0:value],1))**2 )) print(str(value)+" Ensemble Members: mseTR = " +str(mseTR[index])+" | mseTE = "+str(mseTE[index])) _,axis = plt.subplots()