def CV_PKB(inputs, sharedK, K_train, Kdims, Lambda, nfold=3, ESTOP=30, ncpu=1,
           parallel=False, gr_sub=False, plot=False):
    """Choose the boosting iteration count by k-fold cross-validation.

    Splits the training data into `nfold` folds, runs PKB boosting on each
    fold's training portion while tracking classification error and loss on
    the held-out portion, and returns the iteration with the lowest mean
    held-out loss across folds.

    Parameters
    ----------
    inputs : project input container; this function reads
        `train_response`, `Ntrain`, `method`, `maxiter`, `nu`, `output_folder`.
    sharedK : shared kernel object consumed by `oneiter_L1`/`oneiter_L2`/`line_search`.
    K_train : ndarray indexed as `K_train[:, :, m]` — per-pathway kernel matrices
        on the training samples.
    Kdims : kernel dimension info passed through to the `oneiter_*` helpers.
    Lambda : penalty parameter for the base-learner fit.
    nfold : int, number of CV folds.
    ESTOP : int, early-stopping patience — stop if no loss improvement for
        this many iterations.
    ncpu, parallel : parallelization controls passed to `oneiter_*`.
    gr_sub : bool, forwarded as `group_subset` to the base-learner solvers.
    plot : bool, if True save CV error/loss curves under `inputs.output_folder`.

    Returns
    -------
    int
        Iteration number attaining the minimum mean test loss.
    """
    ########## split data ###############
    test_inds = subsamp(inputs.train_response, inputs.train_response.columns[0], nfold)
    # map response index labels -> positional indices 0..Ntrain-1
    temp = pd.Series(range(inputs.Ntrain), index=inputs.train_response.index)
    folds = []
    for i in range(nfold):
        # folds[i] = [test positions, train positions] for fold i
        folds.append([temp[test_inds[i]].values,
                      np.setdiff1d(temp.values, temp[test_inds[i]].values)])

    ########## initiate for each fold ###############
    ytrain_ls = [np.squeeze(inputs.train_response.iloc[folds[i][1]].values) for i in range(nfold)]
    ytest_ls = [np.squeeze(inputs.train_response.iloc[folds[i][0]].values) for i in range(nfold)]
    Ftrain_ls = []
    Ftest_ls = []
    test_err_ls = [[] for i in range(nfold)]
    test_loss_ls = [[] for i in range(nfold)]
    for i in range(nfold):
        # F0: initial constant fit = log-odds of +1 vs -1 labels in this fold
        F0 = np.log((ytrain_ls[i] == 1).sum() / (ytrain_ls[i] == -1).sum())
        if F0 == 0:
            F0 += 10 ** (-2)  # nudge off exactly 0 so np.sign() gives a class
        Ftrain_ls.append(np.repeat(F0, len(folds[i][1])))  # F_t(x_i) on training data
        Ftest_ls.append(np.repeat(F0, len(folds[i][0])))   # F_t(x_i) on testing data
        test_err_ls[i].append((np.sign(Ftest_ls[i]) != ytest_ls[i]).sum() / len(ytest_ls[i]))
        test_loss_ls[i].append(loss_fun(Ftest_ls[i], ytest_ls[i]))
    opt_iter = 0
    min_loss = prev_loss = np.mean([x[0] for x in test_loss_ls])
    ave_err = [np.mean([x[0] for x in test_err_ls])]
    ave_loss = [prev_loss]

    ########## boosting for each fold ###############
    print("-------------------- CV -----------------------")
    print("iteration\tMean test err\tMean test loss")
    for t in range(1, inputs.maxiter + 1):
        # one boosting iteration, advanced on every fold
        for k in range(nfold):
            if inputs.method == 'L2':
                # NOTE: group_subset is forwarded here as well, so the L2 CV
                # path honors gr_sub just like the L1 path and the final
                # boosting loop do
                [m, beta, c] = oneiter_L2(sharedK, Ftrain_ls[k], ytrain_ls[k], Kdims,
                                          Lambda=Lambda, ncpu=ncpu, parallel=parallel,
                                          sele_loc=folds[k][1], group_subset=gr_sub)
            if inputs.method == 'L1':
                [m, beta, c] = oneiter_L1(sharedK, Ftrain_ls[k], ytrain_ls[k], Kdims,
                                          Lambda=Lambda, ncpu=ncpu, parallel=parallel,
                                          sele_loc=folds[k][1], group_subset=gr_sub)
            # line search: scale the proposed increment by the optimal step x
            x = line_search(sharedK, Ftrain_ls[k], ytrain_ls[k], Kdims, [m, beta, c],
                            sele_loc=folds[k][1])
            beta *= x
            c *= x
            # update fitted values on the fold's train and test portions
            Ftrain_ls[k] += (K_train[:, :, m][np.ix_(folds[k][1], folds[k][1])].dot(beta) + c) * inputs.nu
            Ftest_ls[k] += (K_train[:, :, m][np.ix_(folds[k][0], folds[k][1])].dot(beta) + c) * inputs.nu
            test_err_ls[k].append((np.sign(Ftest_ls[k]) != ytest_ls[k]).sum() / len(ytest_ls[k]))
            new_loss = loss_fun(Ftest_ls[k], ytest_ls[k])
            test_loss_ls[k].append(new_loss)
        # fold-averaged metrics for this iteration
        cur_err = np.mean([x[-1] for x in test_err_ls])
        cur_loss = np.mean([x[-1] for x in test_loss_ls])
        # update best loss
        if cur_loss < min_loss:
            min_loss = cur_loss
            opt_iter = t
        ave_err.append(cur_err)
        ave_loss.append(cur_loss)
        # print report
        if t % 20 == 0:
            print("%9.0f\t%13.4f\t%14.4f" % (t, cur_err, cur_loss))
        # detect early stop
        if t - opt_iter >= ESTOP:
            print('Early stop criterion satisfied: break CV.')
            print('using iteration number:', opt_iter)
            break
    print("-----------------------------------------------\n")

    # visualization
    if plot:
        folder = inputs.output_folder
        if folder is None:
            print("No CV file name provided.\n")
        else:
            f = plt.figure()
            plt.plot(ave_err)
            plt.xlabel("iterations")
            plt.ylabel("CV error")
            f.savefig(folder + '/CV_err.png')
            f = plt.figure()
            plt.plot(ave_loss)
            plt.xlabel("iterations")
            plt.ylabel("CV loss")
            f.savefig(folder + '/CV_loss.png')
    return opt_iter
def CV_PKB(inputs, K_train, Lambda, nfold=3, ESTOP=50, parallel=False, gr_sub=False, plot=False):
    """Choose the boosting iteration count by k-fold cross-validation.

    Supports classification, survival, and regression problems (dispatched on
    `inputs.problem`). Builds a per-fold PKB model, boosts all folds in
    lockstep while tracking the fold-averaged test loss, and returns the
    iteration attaining the minimum mean test loss.

    Parameters
    ----------
    inputs : project input container; this function reads `train_response`,
        `train_clinical`, `Ntrain`, `problem`, `method`, `maxiter`, `nu`,
        `output_folder`.
    K_train : ndarray of per-pathway kernels on the training samples,
        sliced per fold via `np.ix_` and indexed `[:, :, m]`.
    Lambda : penalty parameter for the base-learner fit.
    nfold : int, number of CV folds.
    ESTOP : int, early-stopping patience — stop when no loss improvement for
        this many iterations.
    parallel : bool, parallelization flag passed to `oneiter_*`.
    gr_sub : bool, forwarded as `group_subset` to the base-learner solvers.
    plot : bool, if True save the CV loss curve under `inputs.output_folder`.

    Returns
    -------
    int
        Iteration number attaining the minimum mean test loss.

    Raises
    ------
    ValueError
        If `inputs.problem` is not one of 'classification', 'survival',
        'regression'.
    """
    ########## split data ###############
    # map response index labels -> positional indices 0..Ntrain-1
    temp = pd.Series(range(inputs.Ntrain), index=inputs.train_response.index)
    if inputs.problem == "classification":
        # stratify on the label column
        test_inds = subsamp(inputs.train_response, inputs.train_response.columns[0], nfold)
    elif inputs.problem == 'survival':
        # stratify on the second column (presumably the event indicator — TODO confirm)
        test_inds = subsamp(inputs.train_response, inputs.train_response.columns[1], nfold)
    elif inputs.problem == "regression":
        test_inds = simple_subsamp(inputs.train_response, nfold)
    else:
        # fail fast instead of hitting a NameError on test_inds below
        raise ValueError("unrecognized problem type: {}".format(inputs.problem))
    folds = []
    for i in range(nfold):
        # folds[i] = [test positions, train positions] for fold i
        folds.append([temp[test_inds[i]].values,
                      np.setdiff1d(temp.values, temp[test_inds[i]].values)])

    ########## initiate model for each fold ###############
    Ztrain_ls = [inputs.train_clinical.values[folds[i][1], :] for i in range(nfold)]
    Ztest_ls = [inputs.train_clinical.values[folds[i][0], :] for i in range(nfold)]
    # kernel restricted to train x train and train x test samples of each fold
    K_train_ls = [K_train[np.ix_(folds[i][1], folds[i][1])] for i in range(nfold)]
    K_test_ls = [K_train[np.ix_(folds[i][1], folds[i][0])] for i in range(nfold)]
    if inputs.problem == "classification":
        ytrain_ls = [np.squeeze(inputs.train_response.iloc[folds[i][1]].values) for i in range(nfold)]
        ytest_ls = [np.squeeze(inputs.train_response.iloc[folds[i][0]].values) for i in range(nfold)]
        inputs_class = [CVinputs(inputs, ytrain_ls[i], ytest_ls[i]) for i in range(nfold)]
        models = [assist.Classification.PKB_Classification(inputs_class[i], ytrain_ls[i], ytest_ls[i])
                  for i in range(nfold)]
    elif inputs.problem == 'survival':
        # survival responses stay 2-D (time + event columns)
        ytrain_ls = [inputs.train_response.iloc[folds[i][1], ].values for i in range(nfold)]
        ytest_ls = [inputs.train_response.iloc[folds[i][0], ].values for i in range(nfold)]
        inputs_class = [CVinputs(inputs, ytrain_ls[i], ytest_ls[i]) for i in range(nfold)]
        models = [assist.Survival.PKB_Survival(inputs_class[i], ytrain_ls[i], ytest_ls[i])
                  for i in range(nfold)]
    elif inputs.problem == "regression":
        ytrain_ls = [np.squeeze(inputs.train_response.iloc[folds[i][1]].values) for i in range(nfold)]
        ytest_ls = [np.squeeze(inputs.train_response.iloc[folds[i][0]].values) for i in range(nfold)]
        inputs_class = [CVinputs(inputs, ytrain_ls[i], ytest_ls[i]) for i in range(nfold)]
        models = [assist.Regression.PKB_Regression(inputs_class[i], ytrain_ls[i], ytest_ls[i])
                  for i in range(nfold)]
    for x in models:
        x.init_F()

    ########## boosting for each fold ###############
    opt_iter = 0
    min_loss = prev_loss = np.mean([x.test_loss[0] for x in models])
    ave_loss = [prev_loss]
    print_section('Cross-Validation')
    print("{:>9}\t{:>14}\t{:>24}".format("iteration", "Mean test loss", "time (if no E-stop)"))
    time0 = time.time()
    for t in range(1, inputs.maxiter + 1):
        # one boosting iteration, advanced on every fold
        for k in range(nfold):
            if inputs.method == 'L2':
                [m, beta, gamma] = oneiter_L2(K_train_ls[k], Ztrain_ls[k], models[k],
                                              Lambda=Lambda, parallel=parallel, group_subset=gr_sub)
            if inputs.method == 'L1':
                [m, beta, gamma] = oneiter_L1(K_train_ls[k], Ztrain_ls[k], models[k],
                                              Lambda=Lambda, parallel=parallel, group_subset=gr_sub)
            # line search: scale the proposed increment by the optimal step x
            x = line_search(K_train_ls[k], Ztrain_ls[k], models[k], [m, beta, gamma])
            beta *= x
            gamma *= x
            # update fold model with the selected pathway's kernel slices
            models[k].update([m, beta, gamma], K_train_ls[k][:, :, m], K_test_ls[k][:, :, m],
                             Ztrain_ls[k], Ztest_ls[k], inputs.nu)
        # fold-averaged loss for this iteration
        cur_loss = np.mean([x.test_loss[-1] for x in models])
        # update best loss
        if cur_loss < min_loss:
            min_loss = cur_loss
            opt_iter = t
        ave_loss.append(cur_loss)
        # print report with an estimate of the remaining run time
        if t % 10 == 0:
            iter_persec = t / (time.time() - time0)      # iterations per second so far
            rem_time = (inputs.maxiter - t) / iter_persec  # remaining time, seconds
            print("{:9.0f}\t{:14.4f}\t{:24.4f}".format(t, cur_loss, rem_time / 60))
        # detect early stop
        if t - opt_iter >= ESTOP:
            print('Early stop criterion satisfied: break CV.')
            break
    # print the number of iterations used
    print('using iteration number:', opt_iter)

    # visualization
    if plot:
        folder = inputs.output_folder
        f = plt.figure()
        plt.plot(ave_loss)
        plt.xlabel("iterations")
        plt.ylabel("CV loss")
        f.savefig(folder + '/CV_loss.pdf')
    return opt_iter
"""--------------------------- CV FOR NUMBER OF ITERATIONS ----------------------------""" opt_iter = CV_PKB(inputs,sharedK,K_train,Kdims,Lambda,nfold=3,ESTOP=ESTOP,\ ncpu=1,parallel=False,gr_sub=False,plot=True) """--------------------------- BOOSTING ITERATIONS ----------------------------""" time0 = time.time() print("--------------------- Boosting -------------------") print("iteration\ttrain err\t time(min)") for t in range(1, opt_iter + 1): # one iteration if inputs.method == 'L2': [m,beta,c] = oneiter_L2(sharedK,F_train,ytrain,Kdims,\ Lambda=Lambda,ncpu = ncpu,parallel = parallel,\ sele_loc=None,group_subset = gr_sub) if inputs.method == 'L1': [m,beta,c] = oneiter_L1(sharedK,F_train,ytrain,Kdims,\ Lambda=Lambda,ncpu = ncpu,parallel = parallel,\ sele_loc=None,group_subset = gr_sub) # line search x = line_search(sharedK, F_train, ytrain, Kdims, [m, beta, c]) beta *= x c *= x # update outputs outputs.trace.append([m, beta, c]) F_train += (K_train[:, :, m].dot(beta) + c) * inputs.nu outputs.train_err.append((np.sign(F_train) != ytrain).sum() / len(ytrain))
opt_iter = CV_PKB(inputs,sharedK,K_train,Kdims,Lambda,nfold=3,ESTOP=ESTOP,\ ncpu=1,parallel=parallel,gr_sub=gr_sub,plot=True) #opt_iter = int(opt_iter*1.5) #opt_iter = 300 """--------------------------- BOOSTING ITERATIONS ----------------------------""" time0 = time.time() assist.util.print_section("BOOSTING") print("iteration\ttrain loss\ttest loss\t time") for t in range(1, opt_iter + 1): # one iteration if inputs.method == 'L2': [m,beta,gamma] = oneiter_L2(sharedK,Z_train,model,Kdims,\ Lambda=Lambda,ncpu = ncpu,parallel = parallel,\ sele_loc=None,group_subset = gr_sub) if inputs.method == 'L1': [m,beta,gamma] = oneiter_L1(sharedK,Z_train,model,Kdims,\ Lambda=Lambda,ncpu = ncpu,parallel = parallel,\ sele_loc=None,group_subset = gr_sub) #print([m,beta,gamma]) #print("\t beta norm: {}; gamma norm: {}".format(np.mean(beta**2), np.mean(gamma**2)) ) # line search x = assist.util.line_search(sharedK, Z_train, model, Kdims, [m, beta, gamma]) beta *= x gamma *= x # update model parameters
CV FOR NUMBER OF ITERATIONS ----------------------------""" opt_iter = CV_PKB(inputs,K_train,Lambda,nfold=3,ESTOP=ESTOP,\ parallel=parallel,gr_sub=gr_sub,plot=True) """--------------------------- BOOSTING ITERATIONS ----------------------------""" time0 = time.time() assist.util.print_section("BOOSTING") print("iteration\ttrain loss\ttest loss\t time") for t in range(1, opt_iter + 1): # one iteration if inputs.method == 'L2': [m,beta,gamma] = oneiter_L2(K_train,Z_train,model,Lambda=Lambda,\ parallel = parallel,group_subset = gr_sub) if inputs.method == 'L1': [m,beta,gamma] = oneiter_L1(K_train,Z_train,model,\ Lambda=Lambda,parallel = parallel,group_subset = gr_sub) # line search x = assist.util.line_search(K_train, Z_train, model, [m, beta, gamma]) beta *= x gamma *= x # update model parameters if model.hasTest: model.update([m, beta, gamma], K_train[:, :, m], K_test[:, :, m], Z_train, Z_test, inputs.nu) else: