def ensembleSelectionPrediction(model_folder, best_bagged_model_list, best_bagged_model_weight, cdf, cutoff=None):
    """Blend the saved test predictions of every bag and decode the result.

    For each bag, the per-model predictions read from
    ``<model_folder>/All/test.pred.<model>.csv`` are folded into a running
    weighted average. The per-bag blends are then averaged with equal weight
    and decoded with ``getScore`` or ``getTestScore``.

    Args:
        model_folder: Folder holding the ``All`` sub-folder with the
            per-model test prediction CSV files.
        best_bagged_model_list: One sequence of model names per bag.
        best_bagged_model_weight: One sequence of weights per bag, aligned
            with ``best_bagged_model_list``.
        cdf: Passed to ``getScore`` when ``cutoff`` is ``None``.
        cutoff: When not ``None``, the blend is decoded with
            ``getTestScore(blend, cutoff)`` instead of ``getScore``.

    Returns:
        A ``pandas.DataFrame`` with the columns ``id`` and ``prediction``.
    """
    for bagging_iter, bag_models in enumerate(best_bagged_model_list):
        bag_weights = best_bagged_model_weight[bagging_iter]
        w_ens = 0
        # NOTE(review): if a bag holds no models, the loop below never runs
        # and ``p_ens_valid`` keeps the value of the previous bag (or is
        # unbound for the very first bag). Confirm callers never pass an
        # empty bag.
        for model_pos, (model, this_w) in enumerate(zip(bag_models, bag_weights)):
            pred_file = "%s/All/test.pred.%s.csv" % (model_folder, model)
            # Parse the file once; both columns are taken from this frame.
            pred = pd.read_csv(pred_file, dtype=float)
            this_p_valid = pred["prediction"].values
            if model_pos == 0:
                p_ens_valid = np.zeros((this_p_valid.shape[0]), dtype=float)
                # The whole file is parsed as float, so ids are cast back.
                id_test = np.asarray(pred["id"].values, dtype=int)
            # Incremental weighted mean of the models seen so far in the bag.
            p_ens_valid = (w_ens * p_ens_valid + this_w * this_p_valid) / (w_ens + this_w)
            w_ens += this_w
        # Running, equally weighted mean over the bags.
        if bagging_iter == 0:
            p_ens_valid_bag = p_ens_valid
        else:
            p_ens_valid_bag = (bagging_iter * p_ens_valid_bag + p_ens_valid) / (bagging_iter + 1.)
    if cutoff is None:
        p_ens_score = getScore(p_ens_valid_bag, cdf)
    else:
        p_ens_score = getTestScore(p_ens_valid_bag, cutoff)
    return pd.DataFrame({"id": id_test, "prediction": p_ens_score})
def ensembleSelectionObj(param, p1_list, weight1, p2_list, true_label_list, cdf_list, numValidMatrix):
    """Hyperopt objective: negated mean CV kappa of a two-way weighted blend.

    Args:
        param: Mapping holding the candidate weight under ``'weight2'``.
        p1_list: Predictions of the first member, indexed ``[run, fold, row]``.
        weight1: Weight applied to ``p1_list``.
        p2_list: Predictions of the second member, indexed like ``p1_list``.
        true_label_list: Labels, indexed ``[run, fold, row]``.
        cdf_list: Per-fold cdf handed to ``getScore``, indexed ``[run, fold]``.
        numValidMatrix: Number of used rows for each ``[run][fold]``.

    Returns:
        A dict with ``'loss'`` (minus the mean kappa over all runs and folds)
        and ``'status'`` set to ``STATUS_OK``.
    """
    w2 = param['weight2']
    n_runs, n_folds = config.n_runs, config.n_folds
    kappas = np.zeros((n_runs, n_folds), dtype=float)
    for r in range(n_runs):
        for f in range(n_folds):
            # Only the first ``n_rows`` entries of each fold are used.
            n_rows = numValidMatrix[r][f]
            blend = (weight1 * p1_list[r, f, :n_rows] + w2 * p2_list[r, f, :n_rows]) / (weight1 + w2)
            decoded = getScore(blend, cdf_list[r, f, :])
            kappas[r][f] = quadratic_weighted_kappa(decoded, true_label_list[r, f, :n_rows])
    return {'loss': -np.mean(kappas), 'status': STATUS_OK}
Esempio n. 3
0
def ensembleSelectionObj(param, p1_list, weight1, p2_list, true_label_list,
                         cdf_list, numValidMatrix):
    """Score a candidate blend weight for hyperopt.

    The two prediction sets are mixed as
    ``(weight1 * p1 + weight2 * p2) / (weight1 + weight2)`` for every run and
    fold, decoded with ``getScore`` and rated with
    ``quadratic_weighted_kappa``.

    Args:
        param: Mapping with the candidate weight stored under ``'weight2'``.
        p1_list: First prediction set, indexed ``[run, fold, row]``.
        weight1: Weight of ``p1_list``.
        p2_list: Second prediction set, indexed like ``p1_list``.
        true_label_list: Labels, indexed ``[run, fold, row]``.
        cdf_list: Per-fold cdf passed to ``getScore``.
        numValidMatrix: Row count in use for each ``[run][fold]``.

    Returns:
        ``{'loss': -mean_kappa, 'status': STATUS_OK}``.
    """
    candidate_weight = param['weight2']
    total_weight = weight1 + candidate_weight
    fold_kappa = np.zeros((config.n_runs, config.n_folds), dtype=float)
    # ndindex walks (run, fold) pairs in the same order as two nested loops.
    for run_idx, fold_idx in np.ndindex(fold_kappa.shape):
        used = numValidMatrix[run_idx][fold_idx]
        first = p1_list[run_idx, fold_idx, :used]
        second = p2_list[run_idx, fold_idx, :used]
        labels = true_label_list[run_idx, fold_idx, :used]
        mixed = (weight1 * first + candidate_weight * second) / total_weight
        mixed_score = getScore(mixed, cdf_list[run_idx, fold_idx, :])
        fold_kappa[run_idx][fold_idx] = quadratic_weighted_kappa(
            mixed_score, labels)
    mean_kappa = np.mean(fold_kappa)
    return {'loss': -mean_kappa, 'status': STATUS_OK}
Esempio n. 4
0
def ensembleSelection(feat_folder,
                      model_folder,
                      model_list,
                      cdf,
                      cdf_test,
                      subm_prefix,
                      hypteropt_max_evals=10,
                      w_min=-1.,
                      w_max=1.,
                      bagging_replacement=False,
                      bagging_fraction=0.5,
                      bagging_size=10,
                      init_top_k=5,
                      prunning_fraction=0.2):
    """Run bagged greedy ensemble selection and write submissions per bag.

    Steps:
      1. Load every model's validation predictions for each run/fold and
         rate each model by its mean quadratic weighted kappa.
      2. For each bag, sample a subset of the models, seed the ensemble with
         up to ``init_top_k`` of the best sampled models (weight 1.0 each),
         then repeatedly add the model/weight pair that raises the mean CV
         kappa the most, until no candidate beats the best kappa so far.
      3. Average the bag's blend into the running bagged blend, then write a
         ``*_cdf.csv`` and a ``*_cutoff.csv`` submission built with
         ``ensembleSelectionPrediction``.

    Args:
        feat_folder: Root of the ``Run<r>/Fold<f>/valid.cdf`` files, read
            only when ``cdf`` is ``None``.
        model_folder: Root of the ``Run<r>/Fold<f>/valid.pred.<model>.csv``
            files; also forwarded to ``ensembleSelectionPrediction``.
        model_list: Names of the candidate models.
        cdf: cdf used for every fold, or ``None`` to load one per fold.
        cdf_test: cdf forwarded to ``ensembleSelectionPrediction``.
        subm_prefix: Prefix of the submission file names.
        hypteropt_max_evals: ``max_evals`` of each hyperopt weight search.
        w_min: Lower bound of the searched relative weight.
        w_max: Upper bound of the searched relative weight.
        bagging_replacement: If true, sample model indices with replacement
            (``int(num_model * bagging_fraction)`` draws); otherwise keep
            each model with probability ``bagging_fraction``.
        bagging_fraction: Sampling fraction, see ``bagging_replacement``.
        bagging_size: Number of bags.
        init_top_k: Maximum number of models used to seed each bag.
        prunning_fraction: Unused; kept for interface compatibility.

    Returns:
        ``(best_kappa_mean, best_kappa_std, best_bagged_model_list,
        best_bagged_model_weight)`` where the kappa statistics are those of
        the last bag.
    """
    ## load all the prediction; buffers assume at most maxNumValid rows
    maxNumValid = 12000
    pred_list_valid = np.zeros(
        (len(model_list), config.n_runs, config.n_folds, maxNumValid),
        dtype=float)
    Y_list_valid = np.zeros((config.n_runs, config.n_folds, maxNumValid),
                            dtype=float)
    cdf_list_valid = np.zeros(
        (config.n_runs, config.n_folds, config.n_classes), dtype=float)
    numValidMatrix = np.zeros((config.n_runs, config.n_folds), dtype=int)
    p_ens_list_valid = np.zeros((config.n_runs, config.n_folds, maxNumValid),
                                dtype=float)

    ## model to idx
    model2idx = {model: i for i, model in enumerate(model_list)}
    kappa_list = {model: 0 for model in model_list}
    print("============================================================")
    print("Load model...")
    for model in model_list:
        model_id = model2idx[model]
        print("model: %s" % model)
        kappa_cv = np.zeros((config.n_runs, config.n_folds), dtype=float)
        ## load cv folds
        for run in range(config.n_runs):
            for fold in range(config.n_folds):
                path = "%s/Run%d/Fold%d" % (model_folder, run + 1, fold + 1)
                pred_file = "%s/valid.pred.%s.csv" % (path, model)
                cdf_file = "%s/Run%d/Fold%d/valid.cdf" % (feat_folder, run + 1,
                                                          fold + 1)
                this_p_valid = pd.read_csv(pred_file, dtype=float)
                numValid = this_p_valid.shape[0]
                numValidMatrix[run][fold] = numValid
                pred_list_valid[model_id, run, fold, :numValid] = \
                    this_p_valid["prediction"].values
                Y_list_valid[run, fold, :numValid] = \
                    this_p_valid["target"].values
                ## load cdf
                # `is None` (not `== None`): an array-valued cdf must not be
                # compared element-wise here.
                if cdf is None:
                    cdf_list_valid[run, fold, :] = np.loadtxt(cdf_file,
                                                              dtype=float)
                else:
                    cdf_list_valid[run, fold, :] = cdf
                ##
                score = getScore(
                    pred_list_valid[model_id, run, fold, :numValid],
                    cdf_list_valid[run, fold, :])
                kappa_cv[run][fold] = quadratic_weighted_kappa(
                    score, Y_list_valid[run, fold, :numValid])

        # mean CV kappa of this single model
        mean_kappa = np.mean(kappa_cv)
        print("kappa: %.6f" % mean_kappa)
        kappa_list[model] = mean_kappa

    # Sort by kappa ascending, then reverse: best model first. The explicit
    # reversal (rather than reverse=True) keeps the original tie order.
    sorted_models = sorted(kappa_list.items(), key=lambda x: x[1])[::-1]

    # greedy ensemble
    print("============================================================")
    print("Perform ensemble selection...")
    best_bagged_model_list = [[] for _ in range(bagging_size)]
    best_bagged_model_weight = [[] for _ in range(bagging_size)]
    num_model = len(model_list)
    for bagging_iter in range(bagging_size):
        # Deterministic, bag-specific seed.
        rng = np.random.RandomState(2015 + 100 * bagging_iter)
        if bagging_replacement:
            # random draw of indices, duplicates allowed
            sampleSize = int(num_model * bagging_fraction)
            index_base = rng.randint(num_model, size=sampleSize)
        else:
            # keep each model when its uniform draw is below the fraction
            randnum = rng.uniform(size=num_model)
            index_base = [
                i for i in range(num_model) if randnum[i] < bagging_fraction
            ]
        this_sorted_models = [sorted_models[i] for i in sorted(index_base)]

        best_model_list = []
        best_model_weight = []
        best_kappa = 0
        best_model = None
        p_ens_list_valid_tmp = np.zeros(
            (config.n_runs, config.n_folds, maxNumValid), dtype=float)
        #### initialization
        # Every seed model enters with weight 1.0.
        w_ens, this_w = 0, 1.0
        if init_top_k > 0:
            cnt = 0
            kappa_cv = np.zeros((config.n_runs, config.n_folds), dtype=float)
            for model, kappa in this_sorted_models:
                if cnt >= init_top_k:
                    break
                print("add to the ensembles the following model")
                print("model: %s" % model)
                print("kappa: %.6f" % kappa)
                this_p_list_valid = pred_list_valid[model2idx[model]]
                for run in range(config.n_runs):
                    for fold in range(config.n_folds):
                        numValid = numValidMatrix[run][fold]
                        p_ens_list_valid_tmp[run, fold, :numValid] = (
                            w_ens * p_ens_list_valid_tmp[run, fold, :numValid]
                            + this_w * this_p_list_valid[run, fold, :numValid]
                        ) / (w_ens + this_w)
                        # The seed kappa is only measured once the
                        # init_top_k-th model has been blended in.
                        if cnt == init_top_k - 1:
                            fold_cdf = cdf_list_valid[run, fold, :]
                            true_label = Y_list_valid[run, fold, :numValid]
                            score = getScore(
                                p_ens_list_valid_tmp[run, fold, :numValid],
                                fold_cdf)
                            kappa_cv[run][fold] = quadratic_weighted_kappa(
                                score, true_label)
                best_model_list.append(model)
                best_model_weight.append(this_w)
                w_ens += this_w
                cnt += 1
            print("Init kappa: %.6f (%.6f)" %
                  (np.mean(kappa_cv), np.std(kappa_cv)))
        #### ensemble selection with replacement
        selection_iter = 0
        while True:
            selection_iter += 1
            for model, _ in this_sorted_models:
                this_p_list_valid = pred_list_valid[model2idx[model]]

                ## hyperopt for the best weight
                trials = Trials()
                # relative weight of the candidate against the ensemble
                param_space = {'weight2': hp.uniform('weight2', w_min, w_max)}
                # The closure is consumed by fmin right away, so capturing
                # the loop variable is safe.
                obj = lambda param: ensembleSelectionObj(
                    param, p_ens_list_valid_tmp, 1., this_p_list_valid,
                    Y_list_valid, cdf_list_valid, numValidMatrix)
                best_params = fmin(obj,
                                   param_space,
                                   algo=tpe.suggest,
                                   trials=trials,
                                   max_evals=hypteropt_max_evals)
                # Rescale the relative weight to the ensemble's total weight.
                this_w = best_params['weight2']
                this_w *= w_ens
                # add the current prediction to the ensemble and rate it
                kappa_cv = np.zeros((config.n_runs, config.n_folds),
                                    dtype=float)
                for run in range(config.n_runs):
                    for fold in range(config.n_folds):
                        numValid = numValidMatrix[run][fold]
                        p1 = p_ens_list_valid_tmp[run, fold, :numValid]
                        p2 = this_p_list_valid[run, fold, :numValid]
                        true_label = Y_list_valid[run, fold, :numValid]
                        fold_cdf = cdf_list_valid[run, fold, :]
                        p_ens = (w_ens * p1 + this_w * p2) / (w_ens + this_w)
                        score = getScore(p_ens, fold_cdf)
                        kappa_cv[run][fold] = quadratic_weighted_kappa(
                            score, true_label)
                candidate_kappa = np.mean(kappa_cv)
                if candidate_kappa > best_kappa:
                    best_kappa, best_model, best_weight = (
                        candidate_kappa, model, this_w)
            # No candidate improved on the best kappa: the bag is complete.
            if best_model is None:
                break
            print("Iter: %d" % selection_iter)
            print("    model: %s" % best_model)
            print("    weight: %s" % best_weight)
            print("    kappa: %.6f" % best_kappa)

            best_model_list.append(best_model)
            best_model_weight.append(best_weight)
            # valid: fold the winner into the bag's blend
            this_p_list_valid = pred_list_valid[model2idx[best_model]]
            for run in range(config.n_runs):
                for fold in range(config.n_folds):
                    numValid = numValidMatrix[run][fold]
                    p_ens_list_valid_tmp[run, fold, :numValid] = (
                        w_ens * p_ens_list_valid_tmp[run, fold, :numValid] +
                        best_weight * this_p_list_valid[run, fold, :numValid]
                    ) / (w_ens + best_weight)
            best_model = None
            w_ens += best_weight

        kappa_cv = np.zeros((config.n_runs, config.n_folds), dtype=float)
        # NOTE(review): three cutoffs are hard-coded; presumably
        # config.n_classes - 1 -- confirm.
        cutoff = np.zeros((3), dtype=float)
        for run in range(config.n_runs):
            for fold in range(config.n_folds):
                numValid = numValidMatrix[run][fold]
                true_label = Y_list_valid[run, fold, :numValid]
                fold_cdf = cdf_list_valid[run, fold, :]
                # Running mean of the bag blends seen so far.
                p_ens_list_valid[run, fold, :numValid] = (
                    bagging_iter * p_ens_list_valid[run, fold, :numValid] +
                    p_ens_list_valid_tmp[run, fold, :numValid]) / (
                        bagging_iter + 1.)
                score, cutoff_tmp = getScore(
                    p_ens_list_valid[run, fold, :numValid], fold_cdf, "valid")
                kappa_cv[run][fold] = quadratic_weighted_kappa(
                    score, true_label)
                cutoff += cutoff_tmp
        cutoff /= float(config.n_runs * config.n_folds)
        # NOTE(review): presumably rescales validation-fold cutoffs to the
        # test-set size (22513 test rows vs 2/3 of 10158 rows) -- confirm.
        cutoff *= (22513 / ((2. / 3) * 10158))
        best_kappa_mean = np.mean(kappa_cv)
        best_kappa_std = np.std(kappa_cv)
        print("Bag %d, kappa: %.6f (%.6f)" %
              (bagging_iter + 1, best_kappa_mean, best_kappa_std))
        best_bagged_model_list[bagging_iter] = best_model_list
        best_bagged_model_weight[bagging_iter] = best_model_weight

        ## save the current prediction
        # use cdf
        output = ensembleSelectionPrediction(
            model_folder, best_bagged_model_list[:(bagging_iter + 1)],
            best_bagged_model_weight[:(bagging_iter + 1)], cdf_test)
        sub_file = "%s_[InitTopK%d]_[BaggingSize%d]_[BaggingFraction%s]_[Mean%.6f]_[Std%.6f]_cdf.csv" % (
            subm_prefix, init_top_k, bagging_iter + 1, bagging_fraction,
            best_kappa_mean, best_kappa_std)
        output.to_csv(sub_file, index=False)
        # use cutoff
        output = ensembleSelectionPrediction(
            model_folder, best_bagged_model_list[:(bagging_iter + 1)],
            best_bagged_model_weight[:(bagging_iter + 1)], cdf_test, cutoff)
        sub_file = "%s_[InitTopK%d]_[BaggingSize%d]_[BaggingFraction%s]_[Mean%.6f]_[Std%.6f]_cutoff.csv" % (
            subm_prefix, init_top_k, bagging_iter + 1, bagging_fraction,
            best_kappa_mean, best_kappa_std)
        output.to_csv(sub_file, index=False)
    return best_kappa_mean, best_kappa_std, best_bagged_model_list, best_bagged_model_weight
Esempio n. 5
0
    def run(self):
        """Run every message check, accumulate the spam score, mark the subject.

        Each check either appends a line to ``self.passed_tests`` or, when it
        fails, looks up the rule score with ``utils.getScore``, appends a
        report line to ``self.str_scan`` and adds the score to ``self.score``.
        Afterwards ``self.printResult()`` is called and the subject is marked
        with ``self.getLevelSpam()``; a non-zero result from ``mark_subject``
        is logged.
        """
        msg_filter = FilterMessage(self.msg, self.data)

        def record(passed, passed_message, rule):
            # Book one check result: a pass line, or the failed rule's score.
            if passed:
                self.passed_tests = self.passed_tests + passed_message + "\n"
                return
            rule_score = utils.getScore(rule, self.data)
            self.str_scan = (self.str_scan + "\t" + rule + "\trule score:" +
                             str(rule_score) + "\n")
            # A rule without a configured score must not break the scan;
            # float() also accepts scores that are stored as strings.
            if rule_score is not None:
                self.score = self.score + float(rule_score)

        # `== True` is kept on purpose: the return type of hasSubject/hasFrom
        # is not visible here, and plain truthiness could change the outcome.
        record(self.msg.hasSubject() == True, "Subject exists",
               "SUBJECT_MISSING")
        record(self.msg.hasFrom() == True, "From exists", "FROM_MISSING")

        # Header checks report success with 0.
        record(msg_filter.check_header('Message-Id') == 0,
               "Header Message-Id is valid", "MESSAGEID_INVALID")
        record(msg_filter.check_header('Date') == 0, "Header Date is valid",
               "DATE_INVALID")
        record(msg_filter.check_header('From') == 0, "Header From is valid",
               "FROM_INVALID")
        record(msg_filter.check_forged_received_ip() == 0,
               "Not forged header", "RECEIVED_FORGED")

        # filter_source_ip returns None when the check passes.
        record(msg_filter.filter_source_ip() is None, "Source IP is not Spam",
               "DNSBL_ANSWER")

        self.printResult()
        ret = self.msg.mark_subject(self.getLevelSpam())
        if ret != 0:
            logz.info("Couldn't mark subject")
Esempio n. 6
0
def test_getScore():
    """getScore returns the string '85' for the student 廖妤宸."""
    expected = '85'
    assert getScore('廖妤宸') == expected
Esempio n. 7
0
from utils import (getStudentsIndex, getRankData, getScore, writeScore)

# Script entry point: read the students file and the rank file, compute the
# scores and write them to the output file.
if __name__ == '__main__':
    # You should not modify this part.
    import argparse

    # Command-line options; every option has a default file name.
    parser = argparse.ArgumentParser()
    parser.add_argument('--students',
                        default='studentsData.csv',
                        help='input students data file name')
    parser.add_argument('--rank',
                        default='0424rank.json',
                        help='rank file name')
    parser.add_argument('--output',
                        default='output.csv',
                        help='output file name')
    args = parser.parse_args()

    # Pipeline: students index + rank data -> score -> output file.
    index = getStudentsIndex(args.students)
    rankData = getRankData(args.rank)
    score = getScore(index, rankData)
    writeScore(args.output, score)
def ensembleSelection(feat_folder, model_folder, model_list, cdf, cdf_test, subm_prefix,
                      hypteropt_max_evals=10, w_min=-1., w_max=1.,
                      bagging_replacement=False, bagging_fraction=0.5, bagging_size=10,
                      init_top_k=5, prunning_fraction=0.2):
    """Run bagged greedy ensemble selection and write submissions per bag.

    Steps:
      1. Load every model's validation predictions for each run/fold and
         rate each model by its mean quadratic weighted kappa.
      2. For each bag, sample a subset of the models, seed the ensemble with
         up to ``init_top_k`` of the best sampled models (weight 1.0 each),
         then repeatedly add the model/weight pair that raises the mean CV
         kappa the most, until no candidate beats the best kappa so far.
      3. Average the bag's blend into the running bagged blend, then write a
         ``*_cdf.csv`` and a ``*_cutoff.csv`` submission built with
         ``ensembleSelectionPrediction``.

    Args:
        feat_folder: Root of the ``Run<r>/Fold<f>/valid.cdf`` files, read
            only when ``cdf`` is ``None``.
        model_folder: Root of the ``Run<r>/Fold<f>/valid.pred.<model>.csv``
            files; also forwarded to ``ensembleSelectionPrediction``.
        model_list: Names of the candidate models.
        cdf: cdf used for every fold, or ``None`` to load one per fold.
        cdf_test: cdf forwarded to ``ensembleSelectionPrediction``.
        subm_prefix: Prefix of the submission file names.
        hypteropt_max_evals: ``max_evals`` of each hyperopt weight search.
        w_min: Lower bound of the searched relative weight.
        w_max: Upper bound of the searched relative weight.
        bagging_replacement: If true, sample model indices with replacement
            (``int(num_model * bagging_fraction)`` draws); otherwise keep
            each model with probability ``bagging_fraction``.
        bagging_fraction: Sampling fraction, see ``bagging_replacement``.
        bagging_size: Number of bags.
        init_top_k: Maximum number of models used to seed each bag.
        prunning_fraction: Unused; kept for interface compatibility.

    Returns:
        ``(best_kappa_mean, best_kappa_std, best_bagged_model_list,
        best_bagged_model_weight)`` where the kappa statistics are those of
        the last bag.
    """
    ## load all the prediction; buffers assume at most maxNumValid rows
    maxNumValid = 12000
    pred_list_valid = np.zeros((len(model_list), config.n_runs, config.n_folds, maxNumValid), dtype=float)
    Y_list_valid = np.zeros((config.n_runs, config.n_folds, maxNumValid), dtype=float)
    cdf_list_valid = np.zeros((config.n_runs, config.n_folds, config.n_classes), dtype=float)
    numValidMatrix = np.zeros((config.n_runs, config.n_folds), dtype=int)
    p_ens_list_valid = np.zeros((config.n_runs, config.n_folds, maxNumValid), dtype=float)

    ## model to idx
    model2idx = {model: i for i, model in enumerate(model_list)}
    kappa_list = {model: 0 for model in model_list}
    print("============================================================")
    print("Load model...")
    for model in model_list:
        model_id = model2idx[model]
        print("model: %s" % model)
        kappa_cv = np.zeros((config.n_runs, config.n_folds), dtype=float)
        ## load cv folds
        for run in range(config.n_runs):
            for fold in range(config.n_folds):
                path = "%s/Run%d/Fold%d" % (model_folder, run+1, fold+1)
                pred_file = "%s/valid.pred.%s.csv" % (path, model)
                cdf_file = "%s/Run%d/Fold%d/valid.cdf" % (feat_folder, run+1, fold+1)
                this_p_valid = pd.read_csv(pred_file, dtype=float)
                numValid = this_p_valid.shape[0]
                numValidMatrix[run][fold] = numValid
                pred_list_valid[model_id,run,fold,:numValid] = this_p_valid["prediction"].values
                Y_list_valid[run,fold,:numValid] = this_p_valid["target"].values
                ## load cdf
                # `is None` (not `== None`): an array-valued cdf must not be
                # compared element-wise here.
                if cdf is None:
                    cdf_list_valid[run,fold, :] = np.loadtxt(cdf_file, dtype=float)
                else:
                    cdf_list_valid[run,fold, :] = cdf
                ##
                score = getScore(pred_list_valid[model_id,run,fold,:numValid], cdf_list_valid[run,fold, :])
                kappa_cv[run][fold] = quadratic_weighted_kappa(score, Y_list_valid[run,fold,:numValid])

        # mean CV kappa of this single model
        mean_kappa = np.mean(kappa_cv)
        print("kappa: %.6f" % mean_kappa)
        kappa_list[model] = mean_kappa

    # Sort by kappa ascending, then reverse: best model first. The explicit
    # reversal (rather than reverse=True) keeps the original tie order.
    sorted_models = sorted(kappa_list.items(), key=lambda x: x[1])[::-1]

    # greedy ensemble
    print("============================================================")
    print("Perform ensemble selection...")
    best_bagged_model_list = [[] for _ in range(bagging_size)]
    best_bagged_model_weight = [[] for _ in range(bagging_size)]
    num_model = len(model_list)
    for bagging_iter in range(bagging_size):
        # Deterministic, bag-specific seed.
        rng = np.random.RandomState(2015 + 100 * bagging_iter)
        if bagging_replacement:
            # random draw of indices, duplicates allowed
            sampleSize = int(num_model*bagging_fraction)
            index_base = rng.randint(num_model, size=sampleSize)
        else:
            # keep each model when its uniform draw is below the fraction
            randnum = rng.uniform(size=num_model)
            index_base = [i for i in range(num_model) if randnum[i] < bagging_fraction]
        this_sorted_models = [sorted_models[i] for i in sorted(index_base)]

        best_model_list = []
        best_model_weight = []
        best_kappa = 0
        best_model = None
        p_ens_list_valid_tmp = np.zeros((config.n_runs, config.n_folds, maxNumValid), dtype=float)
        #### initialization
        # Every seed model enters with weight 1.0.
        w_ens, this_w = 0, 1.0
        if init_top_k > 0:
            cnt = 0
            kappa_cv = np.zeros((config.n_runs, config.n_folds), dtype=float)
            for model,kappa in this_sorted_models:
                if cnt >= init_top_k:
                    break
                print("add to the ensembles the following model")
                print("model: %s" % model)
                print("kappa: %.6f" % kappa)
                this_p_list_valid = pred_list_valid[model2idx[model]]
                for run in range(config.n_runs):
                    for fold in range(config.n_folds):
                        numValid = numValidMatrix[run][fold]
                        p_ens_list_valid_tmp[run,fold,:numValid] = (w_ens * p_ens_list_valid_tmp[run,fold,:numValid] + this_w * this_p_list_valid[run,fold,:numValid]) / (w_ens + this_w)
                        # The seed kappa is only measured once the
                        # init_top_k-th model has been blended in.
                        if cnt == init_top_k - 1:
                            fold_cdf = cdf_list_valid[run,fold,:]
                            true_label = Y_list_valid[run,fold,:numValid]
                            score = getScore(p_ens_list_valid_tmp[run,fold,:numValid], fold_cdf)
                            kappa_cv[run][fold] = quadratic_weighted_kappa(score, true_label)
                best_model_list.append(model)
                best_model_weight.append(this_w)
                w_ens += this_w
                cnt += 1
            print("Init kappa: %.6f (%.6f)" % (np.mean(kappa_cv), np.std(kappa_cv)))
        #### ensemble selection with replacement
        selection_iter = 0
        while True:
            selection_iter += 1
            for model,_ in this_sorted_models:
                this_p_list_valid = pred_list_valid[model2idx[model]]

                ## hyperopt for the best weight
                trials = Trials()
                # relative weight of the candidate against the ensemble
                param_space = {
                    'weight2': hp.uniform('weight2', w_min, w_max)
                }
                # The closure is consumed by fmin right away, so capturing
                # the loop variable is safe.
                obj = lambda param: ensembleSelectionObj(param, p_ens_list_valid_tmp, 1., this_p_list_valid, Y_list_valid, cdf_list_valid, numValidMatrix)
                best_params = fmin(obj,
                                   param_space, algo=tpe.suggest,
                                   trials=trials, max_evals=hypteropt_max_evals)
                # Rescale the relative weight to the ensemble's total weight.
                this_w = best_params['weight2']
                this_w *= w_ens
                # add the current prediction to the ensemble and rate it
                kappa_cv = np.zeros((config.n_runs, config.n_folds), dtype=float)
                for run in range(config.n_runs):
                    for fold in range(config.n_folds):
                        numValid = numValidMatrix[run][fold]
                        p1 = p_ens_list_valid_tmp[run,fold,:numValid]
                        p2 = this_p_list_valid[run,fold,:numValid]
                        true_label = Y_list_valid[run,fold,:numValid]
                        fold_cdf = cdf_list_valid[run,fold,:]
                        p_ens = (w_ens * p1 + this_w * p2) / (w_ens + this_w)
                        score = getScore(p_ens, fold_cdf)
                        kappa_cv[run][fold] = quadratic_weighted_kappa(score, true_label)
                candidate_kappa = np.mean(kappa_cv)
                if candidate_kappa > best_kappa:
                    best_kappa, best_model, best_weight = candidate_kappa, model, this_w
            # No candidate improved on the best kappa: the bag is complete.
            if best_model is None:
                break
            print("Iter: %d" % selection_iter)
            print("    model: %s" % best_model)
            print("    weight: %s" % best_weight)
            print("    kappa: %.6f" % best_kappa)

            best_model_list.append(best_model)
            best_model_weight.append(best_weight)
            # valid: fold the winner into the bag's blend
            this_p_list_valid = pred_list_valid[model2idx[best_model]]
            for run in range(config.n_runs):
                for fold in range(config.n_folds):
                    numValid = numValidMatrix[run][fold]
                    p_ens_list_valid_tmp[run,fold,:numValid] = (w_ens * p_ens_list_valid_tmp[run,fold,:numValid] + best_weight * this_p_list_valid[run,fold,:numValid]) / (w_ens + best_weight)
            best_model = None
            w_ens += best_weight

        kappa_cv = np.zeros((config.n_runs, config.n_folds), dtype=float)
        # NOTE(review): three cutoffs are hard-coded; presumably
        # config.n_classes - 1 -- confirm.
        cutoff = np.zeros((3), dtype=float)
        for run in range(config.n_runs):
            for fold in range(config.n_folds):
                numValid = numValidMatrix[run][fold]
                true_label = Y_list_valid[run,fold,:numValid]
                fold_cdf = cdf_list_valid[run,fold,:]
                # Running mean of the bag blends seen so far.
                p_ens_list_valid[run,fold,:numValid] = (bagging_iter * p_ens_list_valid[run,fold,:numValid] + p_ens_list_valid_tmp[run,fold,:numValid]) / (bagging_iter+1.)
                score, cutoff_tmp = getScore(p_ens_list_valid[run,fold,:numValid], fold_cdf, "valid")
                kappa_cv[run][fold] = quadratic_weighted_kappa(score, true_label)
                cutoff += cutoff_tmp
        cutoff /= float(config.n_runs*config.n_folds)
        # NOTE(review): presumably rescales validation-fold cutoffs to the
        # test-set size (22513 test rows vs 2/3 of 10158 rows) -- confirm.
        cutoff *= (22513/((2./3)*10158))
        best_kappa_mean = np.mean(kappa_cv)
        best_kappa_std = np.std(kappa_cv)
        print( "Bag %d, kappa: %.6f (%.6f)" % (bagging_iter+1, best_kappa_mean, best_kappa_std) )
        best_bagged_model_list[bagging_iter] = best_model_list
        best_bagged_model_weight[bagging_iter] = best_model_weight

        ## save the current prediction
        # use cdf
        output = ensembleSelectionPrediction(model_folder, best_bagged_model_list[:(bagging_iter+1)], best_bagged_model_weight[:(bagging_iter+1)], cdf_test)
        sub_file = "%s_[InitTopK%d]_[BaggingSize%d]_[BaggingFraction%s]_[Mean%.6f]_[Std%.6f]_cdf.csv" % (subm_prefix, init_top_k, bagging_iter+1, bagging_fraction, best_kappa_mean, best_kappa_std)
        output.to_csv(sub_file, index=False)
        # use cutoff
        output = ensembleSelectionPrediction(model_folder, best_bagged_model_list[:(bagging_iter+1)], best_bagged_model_weight[:(bagging_iter+1)], cdf_test, cutoff)
        sub_file = "%s_[InitTopK%d]_[BaggingSize%d]_[BaggingFraction%s]_[Mean%.6f]_[Std%.6f]_cutoff.csv" % (subm_prefix, init_top_k, bagging_iter+1, bagging_fraction, best_kappa_mean, best_kappa_std)
        output.to_csv(sub_file, index=False)
    return best_kappa_mean, best_kappa_std, best_bagged_model_list, best_bagged_model_weight