Example 1
def CV_PKB(inputs, sharedK, K_train, Kdims, Lambda, nfold=3, ESTOP=30,
           ncpu=1, parallel=False, gr_sub=False, plot=False):
    ########## split data ###############
    test_inds = subsamp(inputs.train_response, inputs.train_response.columns[0], nfold)
    temp = pd.Series(range(inputs.Ntrain), index=inputs.train_response.index)
    folds = []
    for i in range(nfold):
        folds.append([temp[test_inds[i]].values,
                      np.setdiff1d(temp.values, temp[test_inds[i]].values)])
    
    ########## initiate for each fold ###############
    ytrain_ls = [np.squeeze(inputs.train_response.iloc[folds[i][1]].values) for i in range(nfold)]
    ytest_ls = [np.squeeze(inputs.train_response.iloc[folds[i][0]].values) for i in range(nfold)]
    
    Ftrain_ls = []
    Ftest_ls = []
    test_err_ls = [[] for i in range(nfold)]
    test_loss_ls = [[] for i in range(nfold)]
    for i in range(nfold):
        F0 = np.log((ytrain_ls[i] == 1).sum() / (ytrain_ls[i] == -1).sum())
        if F0 == 0:
            F0 += 1e-2  # avoid a degenerate all-zero start when classes are balanced
        Ftrain_ls.append(np.repeat(F0, len(folds[i][1])))  # track F_t(x_i) on training data
        Ftest_ls.append(np.repeat(F0, len(folds[i][0])))   # track F_t(x_i) on testing data
        test_err_ls[i].append((np.sign(Ftest_ls[i]) != ytest_ls[i]).sum() / len(ytest_ls[i]))
        test_loss_ls[i].append(loss_fun(Ftest_ls[i], ytest_ls[i]))
    
    opt_iter = 0
    min_loss = prev_loss = np.mean([x[0] for x in test_loss_ls])
    ave_err = [np.mean([x[0] for x in test_err_ls])]
    ave_loss = [prev_loss]
    ########## boosting for each fold ###############
    print("-------------------- CV -----------------------")
    print("iteration\tMean test err\tMean test loss")
    for t in range(1,inputs.maxiter+1):
        # one iteration
        for k in range(nfold):
            if inputs.method == 'L2':
                [m, beta, c] = oneiter_L2(sharedK, Ftrain_ls[k], ytrain_ls[k], Kdims,
                                          Lambda=Lambda, ncpu=ncpu, parallel=parallel,
                                          sele_loc=folds[k][1])
            elif inputs.method == 'L1':
                [m, beta, c] = oneiter_L1(sharedK, Ftrain_ls[k], ytrain_ls[k], Kdims,
                                          Lambda=Lambda, ncpu=ncpu, parallel=parallel,
                                          sele_loc=folds[k][1], group_subset=gr_sub)
    
            # line search
            x = line_search(sharedK,Ftrain_ls[k],ytrain_ls[k],Kdims,[m,beta,c],sele_loc=folds[k][1])
            beta *= x
            c *= x
    
            # update lists
            Ftrain_ls[k] += (K_train[:, :, m][np.ix_(folds[k][1], folds[k][1])].dot(beta) + c) * inputs.nu
            Ftest_ls[k] += (K_train[:, :, m][np.ix_(folds[k][0], folds[k][1])].dot(beta) + c) * inputs.nu
            test_err_ls[k].append((np.sign(Ftest_ls[k]) != ytest_ls[k]).sum() / len(ytest_ls[k]))
            new_loss = loss_fun(Ftest_ls[k], ytest_ls[k])
            test_loss_ls[k].append(new_loss)
        
        # save iteration
        cur_err = np.mean([x[-1] for x in test_err_ls])
        cur_loss = np.mean([x[-1] for x in test_loss_ls])
        # update best loss
        if cur_loss < min_loss:
            min_loss = cur_loss
            opt_iter = t
        ave_err.append(cur_err)
        ave_loss.append(cur_loss)
        
        # print report
        if t % 20 == 0:
            print("%9.0f\t%13.4f\t%14.4f" % (t, cur_err, cur_loss))
            
        # detect early stop
        if t-opt_iter >= ESTOP: 
            print('Early stop criterion satisfied: break CV.')
            print('using iteration number:',opt_iter)
            break
    print("-----------------------------------------------\n")
    # visualization
    if plot:
        folder = inputs.output_folder
        if folder is None:
            print("No output folder provided; skipping CV plots.\n")
        else:
            f = plt.figure()
            plt.plot(ave_err)
            plt.xlabel("iterations")
            plt.ylabel("CV error")
            f.savefig(folder+'/CV_err.png')
            f = plt.figure()
            plt.plot(ave_loss)
            plt.xlabel("iterations")
            plt.ylabel("CV loss")
            f.savefig(folder+'/CV_loss.png')
    return opt_iter
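
Example 1 relies on two helpers the snippet does not define. The sketch below is an assumption, not the repo's code: the log-odds initialization F0 = log(n_pos / n_neg) suggests loss_fun is a mean logistic loss on labels in {-1, +1}, and subsamp plausibly returns stratified test-fold labels.

import numpy as np
import pandas as pd

def loss_fun(F, y):
    # assumed: mean logistic loss for labels y in {-1, +1}
    return np.mean(np.log1p(np.exp(-y * F)))

def subsamp(response, col, nfold, seed=0):
    # assumed: stratified split -- within each level of response[col],
    # deal shuffled row labels round-robin into nfold test sets
    rng = np.random.default_rng(seed)
    fold_labels = [[] for _ in range(nfold)]
    for _, grp in response.groupby(col):
        for j, lab in enumerate(rng.permutation(grp.index.values)):
            fold_labels[j % nfold].append(lab)
    return [pd.Index(f) for f in fold_labels]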
Example 2
def CV_PKB(inputs,
           K_train,
           Lambda,
           nfold=3,
           ESTOP=50,
           parallel=False,
           gr_sub=False,
           plot=False):
    ########## split data ###############
    temp = pd.Series(range(inputs.Ntrain), index=inputs.train_response.index)
    if inputs.problem == "classification":
        test_inds = subsamp(inputs.train_response,
                            inputs.train_response.columns[0], nfold)
    elif inputs.problem == 'survival':
        test_inds = subsamp(inputs.train_response,
                            inputs.train_response.columns[1], nfold)
    elif inputs.problem == "regression":
        test_inds = simple_subsamp(inputs.train_response, nfold)
    folds = []
    for i in range(nfold):
        folds.append([
            temp[test_inds[i]].values,
            np.setdiff1d(temp.values, temp[test_inds[i]].values)
        ])

    ########## initiate model for each fold ###############
    Ztrain_ls = [
        inputs.train_clinical.values[folds[i][1], :] for i in range(nfold)
    ]
    Ztest_ls = [
        inputs.train_clinical.values[folds[i][0], :] for i in range(nfold)
    ]
    K_train_ls = [
        K_train[np.ix_(folds[i][1], folds[i][1])] for i in range(nfold)
    ]
    K_test_ls = [
        K_train[np.ix_(folds[i][1], folds[i][0])] for i in range(nfold)
    ]

    if inputs.problem == "classification":
        ytrain_ls = [
            np.squeeze(inputs.train_response.iloc[folds[i][1]].values)
            for i in range(nfold)
        ]
        ytest_ls = [
            np.squeeze(inputs.train_response.iloc[folds[i][0]].values)
            for i in range(nfold)
        ]
        inputs_class = [
            CVinputs(inputs, ytrain_ls[i], ytest_ls[i]) for i in range(nfold)
        ]
        models = [
            assist.Classification.PKB_Classification(inputs_class[i],
                                                     ytrain_ls[i], ytest_ls[i])
            for i in range(nfold)
        ]
    elif inputs.problem == 'survival':
        ytrain_ls = [
            inputs.train_response.iloc[folds[i][1], ].values
            for i in range(nfold)
        ]
        ytest_ls = [
            inputs.train_response.iloc[folds[i][0], ].values
            for i in range(nfold)
        ]
        inputs_class = [
            CVinputs(inputs, ytrain_ls[i], ytest_ls[i]) for i in range(nfold)
        ]
        models = [
            assist.Survival.PKB_Survival(inputs_class[i], ytrain_ls[i],
                                         ytest_ls[i]) for i in range(nfold)
        ]
    elif inputs.problem == "regression":
        ytrain_ls = [
            np.squeeze(inputs.train_response.iloc[folds[i][1]].values)
            for i in range(nfold)
        ]
        ytest_ls = [
            np.squeeze(inputs.train_response.iloc[folds[i][0]].values)
            for i in range(nfold)
        ]
        inputs_class = [
            CVinputs(inputs, ytrain_ls[i], ytest_ls[i]) for i in range(nfold)
        ]
        models = [
            assist.Regression.PKB_Regression(inputs_class[i], ytrain_ls[i],
                                             ytest_ls[i]) for i in range(nfold)
        ]

    for x in models:
        x.init_F()

    ########## boosting for each fold ###############
    opt_iter = 0
    min_loss = prev_loss = np.mean([x.test_loss[0] for x in models])
    ave_loss = [prev_loss]

    print_section('Cross-Validation')
    print("{:>9}\t{:>14}\t{:>24}".format("iteration", "Mean test loss",
                                         "time (if no E-stop)"))

    time0 = time.time()
    for t in range(1, inputs.maxiter + 1):
        # one iteration
        for k in range(nfold):
            if inputs.method == 'L2':
                [m, beta, gamma] = oneiter_L2(K_train_ls[k], Ztrain_ls[k], models[k],
                                              Lambda=Lambda, parallel=parallel,
                                              group_subset=gr_sub)
            elif inputs.method == 'L1':
                [m, beta, gamma] = oneiter_L1(K_train_ls[k], Ztrain_ls[k], models[k],
                                              Lambda=Lambda, parallel=parallel,
                                              group_subset=gr_sub)
            # line search
            x = line_search(K_train_ls[k], Ztrain_ls[k], models[k],
                            [m, beta, gamma])
            beta *= x
            gamma *= x

            # update model
            models[k].update([m, beta, gamma], K_train_ls[k][:, :, m],
                             K_test_ls[k][:, :, m], Ztrain_ls[k], Ztest_ls[k],
                             inputs.nu)

        # save iteration
        cur_loss = np.mean([x.test_loss[-1] for x in models])
        # update best loss
        if cur_loss < min_loss:
            min_loss = cur_loss
            opt_iter = t
        ave_loss.append(cur_loss)

        # print report
        if t % 10 == 0:
            iter_persec = t / (time.time() - time0)  # iterations per second
            rem_time = (inputs.maxiter - t) / iter_persec  # projected remaining seconds
            print("{:9.0f}\t{:14.4f}\t{:24.4f}".format(t, cur_loss,
                                                       rem_time / 60))

        # detect early stop
        if t - opt_iter >= ESTOP:
            print('Early stop criterion satisfied: break CV.')
            break

    # print the number of iterations used
    print('using iteration number:', opt_iter)

    # visualization
    if plot:
        folder = inputs.output_folder
        if folder is None:
            print("No output folder provided; skipping CV plot.\n")
        else:
            f = plt.figure()
            plt.plot(ave_loss)
            plt.xlabel("iterations")
            plt.ylabel("CV loss")
            f.savefig(folder + '/CV_loss.pdf')
    return opt_iter
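
The returned opt_iter is the iteration count for the final fit. A usage sketch mirroring the call in Example 5 below (ESTOP and the surrounding variables are assumed to be in scope):

opt_iter = CV_PKB(inputs, K_train, Lambda, nfold=3, ESTOP=50,
                  parallel=False, gr_sub=False, plot=True)
# then re-fit on the full training set for opt_iter boosting iterations,
# as in Examples 3-5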
Example 3
opt_iter = CV_PKB(inputs, sharedK, K_train, Kdims, Lambda, nfold=3, ESTOP=ESTOP,
                  ncpu=1, parallel=False, gr_sub=False, plot=True)
"""---------------------------
BOOSTING ITERATIONS
----------------------------"""
time0 = time.time()
print("--------------------- Boosting -------------------")
print("iteration\ttrain err\t time(min)")
for t in range(1, opt_iter + 1):
    # one iteration
    if inputs.method == 'L2':
        [m, beta, c] = oneiter_L2(sharedK, F_train, ytrain, Kdims,
                                  Lambda=Lambda, ncpu=ncpu, parallel=parallel,
                                  sele_loc=None, group_subset=gr_sub)
    elif inputs.method == 'L1':
        [m, beta, c] = oneiter_L1(sharedK, F_train, ytrain, Kdims,
                                  Lambda=Lambda, ncpu=ncpu, parallel=parallel,
                                  sele_loc=None, group_subset=gr_sub)

    # line search
    x = line_search(sharedK, F_train, ytrain, Kdims, [m, beta, c])
    beta *= x
    c *= x

    # update outputs
    outputs.trace.append([m, beta, c])
    F_train += (K_train[:, :, m].dot(beta) + c) * inputs.nu
    outputs.train_err.append((np.sign(F_train) != ytrain).sum() / len(ytrain))
    outputs.train_loss.append(loss_fun(F_train, ytrain))
    if inputs.Ntest is not None:
        F_test += (K_test[:, :, m].T.dot(beta) + c) * inputs.nu
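
line_search scales the fitted increment before it is added to F_train. Its kernel plumbing through sharedK is not shown here; the following is a generic sketch of the idea under the same logistic loss_fun, with the increment direction h made explicit (a hypothetical helper, not the repo's implementation):

import numpy as np
from scipy.optimize import minimize_scalar

def step_size(F, h, y, loss_fun):
    # pick the multiplier x applied to (beta, c) that minimizes
    # the loss along the new direction h on the training data
    res = minimize_scalar(lambda x: loss_fun(F + x * h, y),
                          bounds=(0.0, 10.0), method='bounded')
    return res.x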
Example 4
    """---------------------------
    BOOSTING ITERATIONS
    ----------------------------"""

    time0 = time.time()
    assist.util.print_section("BOOSTING")
    print("iteration\ttrain loss\ttest loss\t    time")
    for t in range(1, opt_iter + 1):
        # one iteration
        if inputs.method == 'L2':
            [m, beta, gamma] = oneiter_L2(sharedK, Z_train, model, Kdims,
                                          Lambda=Lambda, ncpu=ncpu, parallel=parallel,
                                          sele_loc=None, group_subset=gr_sub)
        elif inputs.method == 'L1':
            [m, beta, gamma] = oneiter_L1(sharedK, Z_train, model, Kdims,
                                          Lambda=Lambda, ncpu=ncpu, parallel=parallel,
                                          sele_loc=None, group_subset=gr_sub)

        # line search
        x = assist.util.line_search(sharedK, Z_train, model, Kdims,
                                    [m, beta, gamma])
        beta *= x
        gamma *= x

        # update model parameters

        if model.hasTest:
            model.update([m, beta, gamma], K_train[:, :, m], K_test[:, :, m],
                         Z_train, Z_test, inputs.nu)
        else:
            model.update([m, beta, gamma], K_train[:, :, m], None, Z_train,
                         None, inputs.nu)
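
model.update is where the boosted scores advance. The stand-in below extrapolates from the explicit updates in Example 1; the class name and attributes are hypothetical, with gamma multiplying the clinical covariates Z where Example 1 used a scalar offset c:

class BoostModelSketch:
    def __init__(self, F_train, F_test, ytrain, ytest, loss_fun):
        self.F_train, self.F_test = F_train, F_test
        self.ytrain, self.ytest, self.loss_fun = ytrain, ytest, loss_fun
        self.hasTest = F_test is not None
        self.trace, self.train_loss, self.test_loss = [], [], []

    def update(self, pars, K_tr, K_te, Z_tr, Z_te, nu):
        m, beta, gamma = pars
        self.trace.append(pars)
        self.F_train += nu * (K_tr.dot(beta) + Z_tr.dot(gamma))
        self.train_loss.append(self.loss_fun(self.F_train, self.ytrain))
        if self.hasTest and K_te is not None:
            self.F_test += nu * (K_te.T.dot(beta) + Z_te.dot(gamma))
            self.test_loss.append(self.loss_fun(self.F_test, self.ytest))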
Example 5
File: PKB2.py Project: zengliX/PKB2
    opt_iter = CV_PKB(inputs,K_train,Lambda,nfold=3,ESTOP=ESTOP,\
                      parallel=parallel,gr_sub=gr_sub,plot=True)
    """---------------------------
    BOOSTING ITERATIONS
    ----------------------------"""

    time0 = time.time()
    assist.util.print_section("BOOSTING")
    print("iteration\ttrain loss\ttest loss\t    time")
    for t in range(1, opt_iter + 1):
        # one iteration
        if inputs.method == 'L2':
            [m, beta, gamma] = oneiter_L2(K_train, Z_train, model, Lambda=Lambda,
                                          parallel=parallel, group_subset=gr_sub)
        elif inputs.method == 'L1':
            [m, beta, gamma] = oneiter_L1(K_train, Z_train, model, Lambda=Lambda,
                                          parallel=parallel, group_subset=gr_sub)

        # line search
        x = assist.util.line_search(K_train, Z_train, model, [m, beta, gamma])
        beta *= x
        gamma *= x

        # update model parameters

        if model.hasTest:
            model.update([m, beta, gamma], K_train[:, :, m], K_test[:, :, m],
                         Z_train, Z_test, inputs.nu)
        else:
            model.update([m, beta, gamma], K_train[:, :, m], None, Z_train,
                         None, inputs.nu)
        # print time report
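
The snippet ends at the time report. A guess at how it continues, modeled on the progress print in Example 2's CV loop (only the comment survives here, so this body is an assumption):

        if t % 10 == 0:
            iter_persec = t / (time.time() - time0)      # iterations per second
            rem_min = (opt_iter - t) / iter_persec / 60  # projected minutes left
            print("{:9.0f}\t{:10.4f}\t{:9.4f}\t{:8.1f}".format(
                t, model.train_loss[-1],
                model.test_loss[-1] if model.hasTest else float('nan'),
                rem_min))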