コード例 #1
0
ファイル: predict_ensemble.py プロジェクト: startrekor/jc
 def gen_kappa_cv(self, bagging_iter, y_list_valid, cdf_list_valid,
                  num_valid_matrix, p_ens_list_valid_topk,
                  p_ens_list_valid):
     """
     Fold one bagging round into the running ensemble mean and score it.

     For every run/fold the running mean held in ``p_ens_list_valid`` is
     updated in place with this round's predictions, converted to scores by
     ``getScore`` and evaluated with ``quadratic_weighted_kappa``.

     :param bagging_iter: index of the current bagging round; it is also the
         weight given to the mean accumulated so far (the divisor and the
         printed bag number both use ``bagging_iter + 1``)
     :param y_list_valid: true labels, indexed ``[run, fold, row]``
     :param cdf_list_valid: cdf handed to ``getScore``, indexed
         ``[run, fold, :]``
     :param num_valid_matrix: number of rows to use, indexed ``[run][fold]``
     :param p_ens_list_valid_topk: predictions produced by this round
     :param p_ens_list_valid: running mean over rounds; modified in place
     :return: ``(kappa_cv, cutoff, p_ens_list_valid)``
     """
     n_runs, n_folds = config.n_runs, config.n_folds
     kappa_cv = np.zeros((n_runs, n_folds), dtype=float)
     cutoff = np.zeros((3), dtype=float)
     for run, fold in np.ndindex(n_runs, n_folds):
         n_rows = num_valid_matrix[run][fold]
         labels = y_list_valid[run, fold, :n_rows]
         fold_cdf = cdf_list_valid[run, fold, :]
         # Incremental mean: the previous mean carries weight bagging_iter,
         # the new round carries weight 1.
         previous_mean = p_ens_list_valid[run, fold, :n_rows]
         this_round = p_ens_list_valid_topk[run, fold, :n_rows]
         p_ens_list_valid[run, fold, :n_rows] = (
             bagging_iter * previous_mean + this_round) / (bagging_iter + 1.)
         score, fold_cutoff = getScore(
             p_ens_list_valid[run, fold, :n_rows], fold_cdf, "valid")
         kappa_cv[run][fold] = quadratic_weighted_kappa(score, labels)
         cutoff += fold_cutoff
     # Average the per-fold cutoffs.
     cutoff /= float(n_runs * n_folds)
     # NOTE(review): the original author marked this rescale as "not
     # understood"; presumably it maps validation-sized cutoffs onto a
     # differently sized data set -- confirm where 22513 and 10158 come from.
     cutoff *= (22513 / ((2. / 3) * 10158))
     print("Bag %d, kappa: %.6f (%.6f)" %
           (bagging_iter + 1, np.mean(kappa_cv), np.std(kappa_cv)))
     return kappa_cv, cutoff, p_ens_list_valid
コード例 #2
0
ファイル: predict_ensemble.py プロジェクト: startrekor/jc
 def ensemble_selection_obj(self, param, p1_list, weight1, p2_list,
                            y_list_valid, cdf_list_valid, num_valid_matrix):
     """
     Hyperopt objective: negative mean kappa of a two-way weighted blend.

     The candidate weight ``param['weight_current_model']`` is applied to
     ``p2_list`` and ``weight1`` to ``p1_list``; the blend is scored per
     run/fold and the mean kappa is negated so that minimising the loss
     maximises kappa.

     :param param: dict holding ``'weight_current_model'``
     :param p1_list: predictions of the current ensemble (per the original
         note, the average of the top models), indexed ``[run, fold, row]``
     :param weight1: weight of ``p1_list`` (the original note says 1)
     :param p2_list: predictions of the model being evaluated
     :param y_list_valid: true labels, indexed ``[run, fold, row]``
     :param cdf_list_valid: cdf handed to ``getScore``
     :param num_valid_matrix: number of rows to use, indexed ``[run][fold]``
     :return: ``{'loss': -mean_kappa, 'status': STATUS_OK}``
     """
     w_current = param['weight_current_model']
     weight_sum = weight1 + w_current
     kappa_cv = np.zeros((config.n_runs, config.n_folds), dtype=float)
     for run, fold in np.ndindex(config.n_runs, config.n_folds):
         n_rows = num_valid_matrix[run][fold]
         blended = (weight1 * p1_list[run, fold, :n_rows] +
                    w_current * p2_list[run, fold, :n_rows]) / weight_sum
         blended_score = getScore(blended, cdf_list_valid[run, fold, :])
         kappa_cv[run][fold] = quadratic_weighted_kappa(
             blended_score, y_list_valid[run, fold, :n_rows])
     return {'loss': -np.mean(kappa_cv), 'status': STATUS_OK}
コード例 #3
0
ファイル: ensemble_selection.py プロジェクト: WallaceLiu/jc
def gen_kappa_cv(bagging_iter, Y_list_valid, cdf_list_valid, numValidMatrix,
                 p_ens_list_valid, p_ens_list_valid_tmp):
    """
    Fold one bagging round into the running ensemble mean and score it.

    :param bagging_iter: index of the current bagging round; also the weight
        given to the mean accumulated so far
    :param Y_list_valid: true labels, indexed ``[run, fold, row]``
    :param cdf_list_valid: cdf handed to ``getScore``, indexed
        ``[run, fold, :]``
    :param numValidMatrix: number of rows to use, indexed ``[run][fold]``
    :param p_ens_list_valid: running mean over rounds; modified in place
    :param p_ens_list_valid_tmp: predictions produced by this round
    :return: ``(kappa_cv, cutoff)``
    """
    n_runs, n_folds = config.n_runs, config.n_folds
    kappa_cv = np.zeros((n_runs, n_folds), dtype=float)
    cutoff = np.zeros((3), dtype=float)
    for run, fold in np.ndindex(n_runs, n_folds):
        n_rows = numValidMatrix[run][fold]
        labels = Y_list_valid[run, fold, :n_rows]
        fold_cdf = cdf_list_valid[run, fold, :]
        # Incremental mean: old mean weighted by bagging_iter, new round by 1.
        previous_mean = p_ens_list_valid[run, fold, :n_rows]
        this_round = p_ens_list_valid_tmp[run, fold, :n_rows]
        p_ens_list_valid[run, fold, :n_rows] = (
            bagging_iter * previous_mean + this_round) / (bagging_iter + 1.)
        score, fold_cutoff = getScore(
            p_ens_list_valid[run, fold, :n_rows], fold_cdf, "valid")
        kappa_cv[run][fold] = quadratic_weighted_kappa(score, labels)
        cutoff += fold_cutoff
    # Average the per-fold cutoffs.
    cutoff /= float(n_runs * n_folds)
    # NOTE(review): unexplained rescale constant -- confirm the origin of
    # 22513 and 10158 before changing it.
    cutoff *= (22513 / ((2. / 3) * 10158))
    print("Bag %d, kappa: %.6f (%.6f)" %
          (bagging_iter + 1, np.mean(kappa_cv), np.std(kappa_cv)))
    return kappa_cv, cutoff
コード例 #4
0
ファイル: ensemble_selection.py プロジェクト: WallaceLiu/jc
def ensembleSelectionObj(param, p1_list, weight1, p2_list, true_label_list,
                         cdf_list, numValidMatrix):
    """
    Hyperopt objective: negative mean kappa of a two-way weighted blend.

    ``param['weight2']`` is the weight being tuned; the loss is the negated
    mean kappa over all run/fold pairs so that minimising it maximises kappa.

    :param param: dict holding ``'weight2'``
    :param p1_list: first set of predictions, indexed ``[run, fold, row]``
    :param weight1: weight applied to ``p1_list``
    :param p2_list: second set of predictions, indexed ``[run, fold, row]``
    :param true_label_list: true labels, indexed ``[run, fold, row]``
    :param cdf_list: cdf handed to ``getScore``, indexed ``[run, fold, :]``
    :param numValidMatrix: number of rows to use, indexed ``[run][fold]``
    :return: ``{'loss': -mean_kappa, 'status': STATUS_OK}``
    """
    w2 = param['weight2']
    weight_sum = weight1 + w2
    kappa_cv = np.zeros((config.n_runs, config.n_folds), dtype=float)
    for run, fold in np.ndindex(config.n_runs, config.n_folds):
        n_rows = numValidMatrix[run][fold]
        blended = (weight1 * p1_list[run, fold, :n_rows] +
                   w2 * p2_list[run, fold, :n_rows]) / weight_sum
        blended_score = getScore(blended, cdf_list[run, fold, :])
        kappa_cv[run][fold] = quadratic_weighted_kappa(
            blended_score, true_label_list[run, fold, :n_rows])
    return {'loss': -np.mean(kappa_cv), 'status': STATUS_OK}
コード例 #5
0
def evalerror_softmax_cdf(preds, dtrain, cdf):
    """
    Evaluation metric: negated quadratic weighted kappa for softmax output.

    :param preds: raw predictions, converted to scores by ``getClfScore``
    :param dtrain: object exposing ``get_label()``
    :param cdf: cdf handed to ``getClfScore``
    :return: ``('kappa', value)`` where ``value`` is ``-kappa`` as a float
    """
    # Stored labels are shifted up by one before scoring (the original
    # comment states they are in [0, 1, 2, 3]).
    true_scores = dtrain.get_label() + 1
    predicted_scores = getClfScore(preds, cdf)
    # Negated so that early stopping, which minimises, maximises kappa.
    negated_kappa = -1. * quadratic_weighted_kappa(true_scores,
                                                   predicted_scores)
    return 'kappa', float(negated_kappa)
コード例 #6
0
ファイル: ensemble_selection.py プロジェクト: WallaceLiu/jc
def gen_ens_temp(init_top_k, this_sorted_models, model2idx, pred_list_valid,
                 numValidMatrix, cdf_list_valid, Y_list_valid,
                 p_ens_list_valid_tmp, best_model_list, best_model_weight):
    """
    Seed the ensemble with the first ``init_top_k`` models, equally weighted.

    Each selected model is merged into ``p_ens_list_valid_tmp`` as a running
    weighted mean. Once the last of the ``init_top_k`` models has been
    merged, the ensemble is scored per run/fold and the mean/std kappa is
    printed.

    :param init_top_k: number of leading models to add; nothing is done when
        it is not positive
    :param this_sorted_models: iterable of ``(model, kappa)`` pairs
    :param model2idx: mapping from model name to index in ``pred_list_valid``
    :param pred_list_valid: per-model predictions, indexed
        ``[model, run, fold, row]``
    :param numValidMatrix: number of rows to use, indexed ``[run][fold]``
    :param cdf_list_valid: cdf handed to ``getScore``, indexed
        ``[run, fold, :]``
    :param Y_list_valid: true labels, indexed ``[run, fold, row]``
    :param p_ens_list_valid_tmp: ensemble predictions; modified in place
    :param best_model_list: list the selected model names are appended to
    :param best_model_weight: list the selected weights are appended to
    :return: total weight of the ensemble after seeding
    """
    # Every seed model gets the same weight, so the result is a plain mean.
    w_ens, this_w = 0, 1.0
    if init_top_k > 0:
        kappa_cv = np.zeros((config.n_runs, config.n_folds), dtype=float)
        for cnt, (model, kappa) in enumerate(this_sorted_models):
            if cnt >= init_top_k:
                # Stop instead of walking the remaining models for nothing.
                break
            print("add to the ensembles the following model")
            print("model: %s" % model)
            print("kappa: %.6f" % kappa)
            this_p_list_valid = pred_list_valid[model2idx[model]]
            # Only the fully seeded ensemble is scored.
            is_last = cnt == init_top_k - 1
            for run in range(config.n_runs):
                for fold in range(config.n_folds):
                    numValid = numValidMatrix[run][fold]
                    p_ens_list_valid_tmp[run, fold, :numValid] = (
                        w_ens * p_ens_list_valid_tmp[run, fold, :numValid] +
                        this_w * this_p_list_valid[run, fold, :numValid]) / (
                            w_ens + this_w)
                    if is_last:
                        cdf = cdf_list_valid[run, fold, :]
                        true_label = Y_list_valid[run, fold, :numValid]
                        score = getScore(
                            p_ens_list_valid_tmp[run, fold, :numValid], cdf)
                        kappa_cv[run][fold] = quadratic_weighted_kappa(
                            score, true_label)
            best_model_list.append(model)
            best_model_weight.append(this_w)
            w_ens += this_w
        print("Init kappa: %.6f (%.6f)" %
              (np.mean(kappa_cv), np.std(kappa_cv)))
    return w_ens
コード例 #7
0
def evalerror_cocr_cdf(preds, dtrain, cdf):
    """
    Evaluation metric: negated quadratic weighted kappa for COCR output.

    :param preds: raw predictions; passed through ``applyCOCRRule`` and then
        ``getScore``
    :param dtrain: object exposing ``get_label()``
    :param cdf: cdf handed to ``getScore``
    :return: ``('kappa', value)`` where ``value`` is ``-kappa`` as a float
    """
    # Stored labels are shifted up by one before scoring.
    true_scores = dtrain.get_label() + 1
    predicted_scores = getScore(applyCOCRRule(preds), cdf)
    # Negated so that early stopping, which minimises, maximises kappa.
    negated_kappa = -1. * quadratic_weighted_kappa(true_scores,
                                                   predicted_scores)
    return 'kappa', float(negated_kappa)
コード例 #8
0
ファイル: ensemble_selection.py プロジェクト: WallaceLiu/jc
def gen_kappa_list(model_list, model2idx, model_folder, feat_folder, cdf,
                   pred_list_valid, Y_list_valid, cdf_list_valid,
                   numValidMatrix, kappa_list):
    """
    Load each model's validation predictions and record its mean CV kappa.

    For every model and run/fold the file
    ``<model_folder>/Run<r>/Fold<f>/valid.pred.<model>.csv`` is read; its
    ``prediction`` and ``target`` columns are copied into the output arrays,
    the fold's cdf is stored, and the fold kappa is computed.

    :param model_list: model names to load
    :param model2idx: mapping from model name to index in ``pred_list_valid``
    :param model_folder: root folder of the prediction files
    :param feat_folder: root folder of the per-fold ``valid.cdf`` files
    :param cdf: cdf to use for every fold, or ``None`` to load each fold's
        ``valid.cdf`` from ``feat_folder``
    :param pred_list_valid: output, indexed ``[model, run, fold, row]``;
        modified in place
    :param Y_list_valid: output true labels, indexed ``[run, fold, row]``;
        modified in place
    :param cdf_list_valid: output cdf, indexed ``[run, fold, :]``; modified
        in place
    :param numValidMatrix: output row counts, indexed ``[run][fold]``;
        modified in place
    :param kappa_list: output mapping from model name to mean kappa;
        modified in place
    :return: None
    """
    print("Load model...")
    for model in model_list:
        model_id = model2idx[model]
        print("model: %s" % model)
        kappa_cv = np.zeros((config.n_runs, config.n_folds), dtype=float)
        for run in range(config.n_runs):
            for fold in range(config.n_folds):
                path = "%s/Run%d/Fold%d" % (model_folder, run + 1, fold + 1)
                pred_file = "%s/valid.pred.%s.csv" % (path, model)
                this_p_valid = pd.read_csv(pred_file, dtype=float)
                # Keep the row count as a plain int so slicing does not
                # depend on the dtype of numValidMatrix.
                num_valid = this_p_valid.shape[0]
                numValidMatrix[run][fold] = num_valid
                pred_list_valid[model_id, run, fold, :num_valid] = (
                    this_p_valid["prediction"].values)
                Y_list_valid[run, fold, :num_valid] = (
                    this_p_valid["target"].values)
                # `is None`, not `== None`: comparing an ndarray with `==`
                # is element-wise and its truth value raises ValueError.
                if cdf is None:
                    cdf_file = "%s/Run%d/Fold%d/valid.cdf" % (
                        feat_folder, run + 1, fold + 1)
                    cdf_list_valid[run, fold, :] = np.loadtxt(cdf_file,
                                                              dtype=float)
                else:
                    cdf_list_valid[run, fold, :] = cdf
                score = getScore(
                    pred_list_valid[model_id, run, fold, :num_valid],
                    cdf_list_valid[run, fold, :])
                kappa_cv[run][fold] = quadratic_weighted_kappa(
                    score, Y_list_valid[run, fold, :num_valid])

        # Mean kappa over all run/fold pairs for this model.
        mean_kappa = np.mean(kappa_cv)
        print("kappa: %.6f" % mean_kappa)
        kappa_list[model] = mean_kappa
コード例 #9
0
ファイル: predict_ensemble.py プロジェクト: startrekor/jc
 def init_topk_best_model(self, init_top_k, this_sorted_models,
                          pred_list_valid, y_list_valid, cdf_list_valid,
                          num_valid_matrix):
     """
     Build the initial ensemble from the first ``init_top_k`` models.

     The selected models are merged with equal weight (a plain mean). After
     the last selected model has been merged the ensemble is scored per
     run/fold and the mean/std kappa is printed once.

     Reads ``self.max_num_valid`` and ``self.model2idx``.

     :param init_top_k: number of leading models to take
     :param this_sorted_models: sequence of ``(model, kappa)`` pairs; it is
         sliced, so it must support slicing
     :param pred_list_valid: per-model predictions, indexed
         ``[model, run, fold, row]``
     :param y_list_valid: true labels, indexed ``[run, fold, row]``
     :param cdf_list_valid: cdf handed to ``getScore``, indexed
         ``[run, fold, :]``
     :param num_valid_matrix: number of rows to use, indexed ``[run][fold]``
     :return: ``(best_model_list, best_model_weight, p_ens_list_valid_topk,
         w_ens)``
     """
     best_model_list = []
     best_model_weight = []
     p_ens_list_valid_topk = np.zeros(
         (config.n_runs, config.n_folds, self.max_num_valid), dtype=float)
     # Every seed model gets the same weight.
     w_ens, this_w = 0, 1.0
     kappa_cv = np.zeros((config.n_runs, config.n_folds), dtype=float)
     top_models = this_sorted_models[0:init_top_k]
     # Score on the last model actually selected, so the metric is still
     # produced when fewer than init_top_k models are available.
     last_idx = len(top_models) - 1
     for cnt, (model, kappa) in enumerate(top_models):
         print("add the following model to the ensembles ")
         print("model: %s" % model)
         print("kappa: %.6f" % kappa)
         # Predictions of the model being merged.
         this_p_list_valid = pred_list_valid[self.model2idx[model]]
         for run in range(config.n_runs):
             for fold in range(config.n_folds):
                 num_valid = num_valid_matrix[run][fold]
                 # Running weighted mean of the merged models.
                 p_ens_list_valid_topk[run, fold, :num_valid] = (
                     w_ens * p_ens_list_valid_topk[run, fold, :num_valid] +
                     this_w * this_p_list_valid[run, fold, :num_valid]) / (
                         w_ens + this_w)
                 if cnt == last_idx:
                     cdf = cdf_list_valid[run, fold, :]
                     true_label = y_list_valid[run, fold, :num_valid]
                     score = getScore(
                         p_ens_list_valid_topk[run, fold, :num_valid], cdf)
                     kappa_cv[run][fold] = quadratic_weighted_kappa(
                         score, true_label)
         best_model_list.append(model)
         best_model_weight.append(this_w)
         w_ens += this_w
     # Report once, after the ensemble is complete; printing inside the loop
     # showed an all-zero kappa for every model but the last.
     if last_idx >= 0:
         print("Init kappa: %.6f (%.6f)" %
               (np.mean(kappa_cv), np.std(kappa_cv)))
     return best_model_list, best_model_weight, p_ens_list_valid_topk, w_ens
コード例 #10
0
def evalerror_ebc_cdf(preds, dtrain, cdf, hard_threshold=False):
    """
    Evaluation metric: negated quadratic weighted kappa for EBC output.

    :param preds: raw predictions
    :param dtrain: object exposing ``get_label()``
    :param cdf: cdf handed to ``getScore`` (used only when
        ``hard_threshold`` is falsy)
    :param hard_threshold: forwarded to ``applyEBCRule``; when truthy the
        sigmoid and ``getScore`` steps are skipped
    :return: ``('kappa', value)`` where ``value`` is ``-kappa`` as a float
    """
    labels = dtrain.get_label()
    # Labels spanning exactly -1..1 mean the samples were extended during
    # feature construction; otherwise they were extended inside the
    # objective (see the ebcobj function, per the original comment).
    extended_in_features = np.min(labels) == -1 and np.max(labels) == 1
    if extended_in_features:
        labels = applyEBCRule(labels)
    else:
        # NOTE(review): this shifts the array returned by get_label() in
        # place -- confirm that get_label() hands back a fresh copy.
        labels += 1
    if not hard_threshold:
        preds = sigmoid(preds)
    preds = applyEBCRule(preds, hard_threshold=hard_threshold)
    if not hard_threshold:
        preds = getScore(preds, cdf)
    # Negated so that early stopping, which minimises, maximises kappa.
    negated_kappa = -1. * quadratic_weighted_kappa(labels, preds)
    return 'kappa', float(negated_kappa)
コード例 #11
0
ファイル: predict_ensemble.py プロジェクト: startrekor/jc
 def find_best_model(self, this_sorted_models, pred_list_valid,
                     y_list_valid, cdf_list_valid, num_valid_matrix, w_ens,
                     w_min, w_max, best_kappa, hypteropt_max_evals,
                     p_ens_list_valid_topk):
     """
     Pick the single model whose addition improves the ensemble kappa most.

     For each candidate, hyperopt searches a weight in ``[w_min, w_max]``
     relative to an ensemble weight of 1; that weight is then rescaled by
     ``w_ens`` and the blended ensemble is scored over all run/fold pairs.
     A candidate replaces the current best only when its mean kappa is
     strictly greater than ``best_kappa``.

     Reads ``self.model2idx``.

     :param this_sorted_models: iterable of ``(model, kappa)`` pairs
     :param pred_list_valid: per-model predictions, indexed
         ``[model, run, fold, row]``
     :param y_list_valid: true labels, indexed ``[run, fold, row]``
     :param cdf_list_valid: cdf handed to ``getScore``
     :param num_valid_matrix: number of rows to use, indexed ``[run][fold]``
     :param w_ens: total weight of the current ensemble
     :param w_min: lower bound of the weight search
     :param w_max: upper bound of the weight search
     :param best_kappa: kappa a candidate has to beat
     :param hypteropt_max_evals: ``max_evals`` passed to ``fmin``
     :param p_ens_list_valid_topk: current ensemble predictions
     :return: ``(best_kappa, best_model, best_weight)``; ``best_model`` is
         ``None`` when no candidate beats the incoming ``best_kappa``
     """
     best_model = None
     best_weight = 0
     for model, _kappa in this_sorted_models:
         candidate_preds = pred_list_valid[self.model2idx[model]]
         trials = Trials()
         param_space = {
             'weight_current_model':
             hp.uniform('weight_current_model', w_min, w_max)
         }

         # The ensemble keeps weight 1 while the candidate weight is tuned.
         def objective(param):
             return self.ensemble_selection_obj(
                 param, p_ens_list_valid_topk, 1., candidate_preds,
                 y_list_valid, cdf_list_valid, num_valid_matrix)

         best_params = fmin(objective,
                            param_space,
                            algo=tpe.suggest,
                            trials=trials,
                            max_evals=hypteropt_max_evals)
         # Rescale from "relative to 1" to "relative to w_ens".
         candidate_w = best_params['weight_current_model'] * w_ens
         kappa_cv = np.zeros((config.n_runs, config.n_folds), dtype=float)
         for run, fold in np.ndindex(config.n_runs, config.n_folds):
             n_rows = num_valid_matrix[run][fold]
             blended = (
                 w_ens * p_ens_list_valid_topk[run, fold, :n_rows] +
                 candidate_w * candidate_preds[run, fold, :n_rows]) / (
                     w_ens + candidate_w)
             score = getScore(blended, cdf_list_valid[run, fold, :])
             kappa_cv[run][fold] = quadratic_weighted_kappa(
                 score, y_list_valid[run, fold, :n_rows])
         mean_kappa = np.mean(kappa_cv)
         if mean_kappa > best_kappa:
             best_kappa = mean_kappa
             best_model = model
             best_weight = candidate_w
     return best_kappa, best_model, best_weight
コード例 #12
0
ファイル: predict_ensemble.py プロジェクト: startrekor/jc
    def init_model_metrics_by_run_fold(self, feat_folder, cdf):
        """
        Load every model's CV predictions and compute per-model mean kappa.

        For each model and run/fold the file
        ``<self.model_folder>/Run<r>/Fold<f>/pred/valid.pred.<model>.csv``
        is read. Row counts, true labels and the cdf are the same for every
        model, so they are recorded only while processing the first model;
        predictions and kappa are recorded for every model.

        Reads ``self.model_list``, ``self.model2idx``, ``self.model_folder``
        and ``self.max_num_valid``.

        :param feat_folder: not used by this method; the cdf files are
            looked up under ``config.solution_info``
        :param cdf: cdf to use for every fold, or ``None`` to load each
            fold's ``valid.cdf`` file
        :return: ``(kappa_list, pred_list_valid, y_list_valid,
            cdf_list_valid, num_valid_matrix)`` where ``kappa_list`` maps
            model name to mean kappa, ``pred_list_valid`` is indexed
            ``[model, run, fold, row]``, ``y_list_valid`` is indexed
            ``[run, fold, row]``, ``cdf_list_valid`` is indexed
            ``[run, fold, class]`` and ``num_valid_matrix`` is indexed
            ``[run, fold]``
        """
        kappa_list = dict()
        # model x run x fold x row: predictions on the validation split
        pred_list_valid = np.zeros((len(self.model_list), config.n_runs,
                                    config.n_folds, self.max_num_valid),
                                   dtype=float)
        # run x fold x row: true labels of the validation split
        y_list_valid = np.zeros(
            (config.n_runs, config.n_folds, self.max_num_valid), dtype=float)
        # run x fold x class: cdf of the validation split
        cdf_list_valid = np.zeros(
            (config.n_runs, config.n_folds, config.num_of_class), dtype=float)
        # run x fold: number of rows in the validation split
        num_valid_matrix = np.zeros((config.n_runs, config.n_folds), dtype=int)
        print("Load model...")
        for i, model in enumerate(self.model_list):
            print("model: %s" % model)
            model_idx = self.model2idx[model]
            kappa_cv = np.zeros((config.n_runs, config.n_folds), dtype=float)
            for run in range(config.n_runs):
                for fold in range(config.n_folds):
                    path = "%s/Run%d/Fold%d/pred" % (self.model_folder,
                                                     run + 1, fold + 1)
                    pred_file = "%s/valid.pred.%s.csv" % (path, model)
                    this_p_valid = pd.read_csv(pred_file, dtype=float)
                    predictions = this_p_valid["prediction"].values
                    n_rows = this_p_valid.shape[0]
                    # Model-independent data: fill in on the first model only.
                    if i == 0:
                        num_valid_matrix[run][fold] = n_rows
                        y_list_valid[run, fold, :n_rows] = (
                            this_p_valid["target"].values)
                        # `is None`, not `== None`: comparing an ndarray with
                        # `==` is element-wise and its truth value raises
                        # ValueError.
                        if cdf is None:
                            cdf_file = "%s/Run%d/Fold%d/valid.cdf" % (
                                config.solution_info, run + 1, fold + 1)
                            cdf_list_valid[run, fold, :] = np.loadtxt(
                                cdf_file, dtype=float)
                        else:
                            cdf_list_valid[run, fold, :] = cdf
                    # Kappa differs per model, so it is computed for every
                    # model; doing it only for the first one left all other
                    # entries of kappa_list at 0.0.
                    score = getScore(predictions,
                                     cdf_list_valid[run, fold, :])
                    kappa_cv[run][fold] = quadratic_weighted_kappa(
                        score,
                        y_list_valid[run, fold, :num_valid_matrix[run][fold]])
                    # Predictions of this model for this run/fold.
                    pred_list_valid[model_idx, run, fold, :n_rows] = (
                        predictions)
            # Mean kappa over all run/fold pairs for this model.
            mean_kappa = np.mean(kappa_cv)
            print("kappa: %.6f" % mean_kappa)
            kappa_list[model] = mean_kappa

        return kappa_list, pred_list_valid, y_list_valid, cdf_list_valid, num_valid_matrix
コード例 #13
0
ファイル: ensemble_selection.py プロジェクト: WallaceLiu/jc
def gen_best_weight(this_sorted_models, model2idx, w_min, w_max,
                    pred_list_valid, hypteropt_max_evals, w_ens, Y_list_valid,
                    cdf_list_valid, numValidMatrix, p_ens_list_valid_tmp,
                    best_model_list, best_model_weight, best_kappa=0.0):
    """
    Greedily grow the ensemble, one model per iteration, with replacement.

    Each iteration tunes a weight for every candidate with hyperopt, keeps
    the candidate whose blended mean kappa is strictly greater than the best
    kappa so far, and merges it into ``p_ens_list_valid_tmp``. The loop stops
    as soon as no candidate improves the kappa.

    :param this_sorted_models: iterable of ``(model, kappa)`` pairs; it is
        iterated once per greedy iteration
    :param model2idx: mapping from model name to index in ``pred_list_valid``
    :param w_min: lower bound of the weight search
    :param w_max: upper bound of the weight search
    :param pred_list_valid: per-model predictions, indexed
        ``[model, run, fold, row]``
    :param hypteropt_max_evals: ``max_evals`` passed to ``fmin``
    :param w_ens: total weight of the ensemble on entry
    :param Y_list_valid: true labels, indexed ``[run, fold, row]``
    :param cdf_list_valid: cdf handed to ``getScore``
    :param numValidMatrix: number of rows to use, indexed ``[run][fold]``
    :param p_ens_list_valid_tmp: ensemble predictions; modified in place
    :param best_model_list: list the selected model names are appended to
    :param best_model_weight: list the selected weights are appended to
    :param best_kappa: kappa a candidate has to beat in the first iteration;
        pass the kappa of the initial ensemble when there is one
    :return: None; results are delivered through the mutated arguments
    """
    # These were previously read before being assigned, which raised
    # UnboundLocalError on the very first comparison.
    best_model = None
    best_weight = 0.0
    iteration = 0
    while True:
        iteration += 1
        for model, _ in this_sorted_models:
            this_p_list_valid = pred_list_valid[model2idx[model]]

            # hyperopt search for this candidate's weight, with the current
            # ensemble held at weight 1
            trials = Trials()
            param_space = {'weight2': hp.uniform('weight2', w_min, w_max)}
            obj = lambda param: ensembleSelectionObj(
                param, p_ens_list_valid_tmp, 1., this_p_list_valid,
                Y_list_valid, cdf_list_valid, numValidMatrix)
            best_params = fmin(obj,
                               param_space,
                               algo=tpe.suggest,
                               trials=trials,
                               max_evals=hypteropt_max_evals)
            # Rescale from "relative to 1" to "relative to w_ens".
            this_w = best_params['weight2'] * w_ens
            # Score the ensemble with this candidate blended in.
            kappa_cv = np.zeros((config.n_runs, config.n_folds), dtype=float)
            for run in range(config.n_runs):
                for fold in range(config.n_folds):
                    numValid = numValidMatrix[run][fold]
                    p1 = p_ens_list_valid_tmp[run, fold, :numValid]
                    p2 = this_p_list_valid[run, fold, :numValid]
                    true_label = Y_list_valid[run, fold, :numValid]
                    cdf = cdf_list_valid[run, fold, :]
                    p_ens = (w_ens * p1 + this_w * p2) / (w_ens + this_w)
                    score = getScore(p_ens, cdf)
                    kappa_cv[run][fold] = quadratic_weighted_kappa(
                        score, true_label)
            mean_kappa = np.mean(kappa_cv)
            if mean_kappa > best_kappa:
                best_kappa, best_model, best_weight = (mean_kappa, model,
                                                       this_w)
        # No candidate improved the kappa: the ensemble is final.
        if best_model is None:
            break
        print("Iter: %d" % iteration)
        print("    model: %s" % best_model)
        print("    weight: %s" % best_weight)
        print("    kappa: %.6f" % best_kappa)

        best_model_list.append(best_model)
        best_model_weight.append(best_weight)
        # Merge the winner into the ensemble predictions.
        this_p_list_valid = pred_list_valid[model2idx[best_model]]
        for run in range(config.n_runs):
            for fold in range(config.n_folds):
                numValid = numValidMatrix[run][fold]
                p_ens_list_valid_tmp[run, fold, :numValid] = (
                    w_ens * p_ens_list_valid_tmp[run, fold, :numValid] +
                    best_weight * this_p_list_valid[run, fold, :numValid]) / (
                        w_ens + best_weight)
        best_model = None
        w_ens += best_weight