def splitonmaxarg(self, x_tl, y_tl, features, D_t, isSparse=0):
    ret = []
    pplus = sum(D_t * (y_tl > 0))
    for feature_i in range(features):
        (dv, err) = ds.stump_fit(x_tl[:, feature_i], y_tl, D_t, pplus)
        ret.append((feature_i, dv, err))
    a_ret = array(ret)
    arg = argmax(abs(0.5 - a_ret[:, 2]))
    return a_ret[arg]
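# splitonmaxarg picks the feature whose weighted stump error is furthest
# from 0.5.  A stump whose error is well above 0.5 is still informative,
# because the negative alpha computed in the boosting step effectively
# flips its vote.  A tiny illustration of the selection rule with made-up
# errors (assumes the same numpy names as the function above):
example_errs = array([0.45, 0.20, 0.92])
best = argmax(abs(0.5 - example_errs))  # -> 2, since |0.5 - 0.92| = 0.42 is largest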
def adaboost_train(x, y, T):
    cf = x.shape[1]
    n = y.shape[0]
    weights = ones(n) / n
    H = []
    A = []
    I = []
    TE = []
    for t in range(T):
        pplus = sum(weights * (y > 0))
        # Let's train on all the features and find the one that works the best
        decisionVariables = []
        score = []
        we = []
        for idx in range(cf):
            f = x[:, idx]
            # train the stump
            (dv, err) = ds.stump_fit(f, y, weights, pplus)
            we.append(err)
            decisionVariables.append(dv)
            # score the classifiers on all features for this round
            score.append(abs(.5 - err))
        print "Round: ", t, str(datetime.now())
        # choose the one feature we'll use for this round's classifier
        I.append(np.argmax(score))
        H.append(decisionVariables[I[t]])
        eps = we[I[t]]
        # calculate our alpha
        A.append(.5 * math.log((1 - eps) / eps))
        # update the weights
        numerators = weights * np.exp(-A[t] * y * ds.stump_predict(x[:, I[t]], H[t]))
        Z = numerators.sum()
        weights = numerators / Z
        # Calculate the overall training errors
        y_hat = adaboost_predict(A, H, I, x, len(A))
        TE.append((y_hat * y < 0).sum() / float(n))
    return A, H, I, TE
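# The training loop above relies on adaboost_predict, which is not shown in
# this snippet.  A minimal sketch of what it could look like, assuming
# ds.stump_predict(feature_column, dv) returns +/-1 labels and numpy is
# available as np (as in adaboost_train):
def adaboost_predict(A, H, I, x, t):
    # Weighted vote of the first t round classifiers, then take the sign.
    votes = np.zeros(x.shape[0])
    for k in range(t):
        votes += A[k] * ds.stump_predict(x[:, I[k]], H[k])
    return np.sign(votes)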
#!/usr/bin/python2
from scipy import *
import scipy.sparse as sp
import dstump as ds

(f, y) = ds.two_clusters(100)
pr = ones(len(y)) / len(y)

# This quantity only needs to be computed once per AdaBoost step (it is the
# same for every feature), and it helps us take advantage of sparsity.
pplus = sum(pr * (y > 0))

# The decision stump training routine accepts either a dense 1-d array or a
# sparse 1-d CSC matrix.  The resulting decision variable might differ
# between dense and sparse data, but the errors are the same.  See the
# implementation for details.
(dv, err) = ds.stump_fit(f, y, pr, pplus)

# Transposing a CSR matrix gives a CSC matrix.
fs = sp.csr_matrix(f).T
(dvs, errs) = ds.stump_fit(fs, y, pr, pplus)
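# Per the comment above, the dense and the sparse fit should report the same
# weighted error even if their decision variables differ.  A quick sanity
# check (assuming both errors come back as plain floats):
assert abs(err - errs) < 1e-12, (err, errs)
print "dense err:", err, "sparse err:", errs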