Beispiel #1
0
 def gen_kappa_cv(self, bagging_iter, y_list_valid, cdf_list_valid,
                  num_valid_matrix, p_ens_list_valid_topk,
                  p_ens_list_valid):
     """
     Fold the current bag's ensemble prediction into the running bagging
     average and score the updated average on every run/fold.

     ``p_ens_list_valid`` is updated IN PLACE (and also returned), so it
     carries state from one bagging iteration to the next.

     :param bagging_iter: number of bags already folded into the running
         average; it is used as the weight of the previous average and is
         printed as ``bagging_iter + 1``
     :param y_list_valid: true labels, indexed ``[run, fold, row]``
     :param cdf_list_valid: per run/fold cdf, indexed ``[run, fold, :]``
     :param num_valid_matrix: number of valid rows, indexed ``[run][fold]``
     :param p_ens_list_valid_topk: ensemble prediction of the current bag
     :param p_ens_list_valid: running average over bags (mutated)
     :return: ``(kappa_cv, cutoff, p_ens_list_valid)`` where ``kappa_cv``
         has shape ``(n_runs, n_folds)`` and ``cutoff`` has three entries
     """
     fold_count = float(config.n_runs * config.n_folds)
     kappa_cv = np.zeros((config.n_runs, config.n_folds), dtype=float)
     cutoff = np.zeros(3, dtype=float)

     for run in range(config.n_runs):
         for fold in range(config.n_folds):
             n_rows = num_valid_matrix[run][fold]

             # Incremental mean: the previous average counts bagging_iter
             # times, the new bag counts once.
             previous_avg = p_ens_list_valid[run, fold, :n_rows]
             current_bag = p_ens_list_valid_topk[run, fold, :n_rows]
             p_ens_list_valid[run, fold, :n_rows] = (
                 bagging_iter * previous_avg + current_bag) / (
                     bagging_iter + 1.)

             # In "valid" mode getScore returns a (score, cutoff) pair.
             score, fold_cutoff = getScore(
                 p_ens_list_valid[run, fold, :n_rows],
                 cdf_list_valid[run, fold, :], "valid")
             kappa_cv[run][fold] = quadratic_weighted_kappa(
                 score, y_list_valid[run, fold, :n_rows])
             cutoff += fold_cutoff

     # Mean cutoff over all run/fold combinations.
     cutoff /= fold_count
     # NOTE(review): unexplained magic numbers (the original author also
     # marked this line as not understood). Presumably rescales the cutoff
     # from the validation-set size to another dataset size -- confirm.
     rescale = 22513 / ((2. / 3) * 10158)
     cutoff *= rescale

     print("Bag %d, kappa: %.6f (%.6f)" %
           (bagging_iter + 1, np.mean(kappa_cv), np.std(kappa_cv)))
     return kappa_cv, cutoff, p_ens_list_valid
Beispiel #2
0
 def ensemble_selection_obj(self, param, p1_list, weight1, p2_list,
                            y_list_valid, cdf_list_valid, num_valid_matrix):
     """
     Hyperopt objective for tuning the weight of a candidate model.

     Blends ``p1_list`` (weight ``weight1``) with ``p2_list`` (weight
     ``param['weight_current_model']``), scores the blend on every
     run/fold and returns the negated mean kappa so that minimising the
     loss maximises kappa.

     :param param: dict holding the key ``'weight_current_model'``
     :param p1_list: predictions of the existing ensemble,
         indexed ``[run, fold, row]``
     :param weight1: weight applied to ``p1_list``
     :param p2_list: predictions of the candidate model
     :param y_list_valid: true labels, indexed ``[run, fold, row]``
     :param cdf_list_valid: per run/fold cdf
     :param num_valid_matrix: number of valid rows, indexed ``[run][fold]``
     :return: ``{'loss': -mean_kappa, 'status': STATUS_OK}``
     """
     candidate_weight = param['weight_current_model']
     total_weight = weight1 + candidate_weight
     fold_kappas = np.zeros((config.n_runs, config.n_folds), dtype=float)

     for run in range(config.n_runs):
         for fold in range(config.n_folds):
             n_rows = num_valid_matrix[run][fold]
             # Weighted average of the ensemble and the candidate.
             blended = (weight1 * p1_list[run, fold, :n_rows] +
                        candidate_weight * p2_list[run, fold, :n_rows]
                        ) / total_weight
             ratings = getScore(blended, cdf_list_valid[run, fold, :])
             fold_kappas[run][fold] = quadratic_weighted_kappa(
                 ratings, y_list_valid[run, fold, :n_rows])

     return {'loss': -np.mean(fold_kappas), 'status': STATUS_OK}
Beispiel #3
0
def evalerror_softmax_cdf(preds, dtrain, cdf):
    """
    Evaluation metric: negated quadratic weighted kappa for softmax output.

    :param preds: raw model predictions, passed to ``getClfScore``
    :param dtrain: data container exposing ``get_label()``
    :param cdf: cdf forwarded to ``getClfScore``
    :return: ``('kappa', value)`` where ``value`` is ``-kappa`` as a float
    """
    # Stored labels are 0..3 (per the original comment); shift them by one
    # before comparing against the decoded predictions.
    true_ratings = dtrain.get_label() + 1
    predicted_ratings = getClfScore(preds, cdf)
    # Negated so that early stopping, which minimises, maximises kappa.
    negated_kappa = -1. * quadratic_weighted_kappa(true_ratings,
                                                   predicted_ratings)
    return 'kappa', float(negated_kappa)
Beispiel #4
0
def evalerror_cocr_cdf(preds, dtrain, cdf):
    """
    Evaluation metric: negated quadratic weighted kappa for COCR output.

    The raw predictions go straight through ``applyCOCRRule`` (no sigmoid
    is applied here) and are then decoded with ``getScore`` and ``cdf``.

    :param preds: raw model predictions
    :param dtrain: data container exposing ``get_label()``
    :param cdf: cdf forwarded to ``getScore``
    :return: ``('kappa', value)`` where ``value`` is ``-kappa`` as a float
    """
    # Labels are shifted by one before scoring.
    true_ratings = dtrain.get_label() + 1
    predicted_ratings = getScore(applyCOCRRule(preds), cdf)
    # Negated so that early stopping, which minimises, maximises kappa.
    negated_kappa = -1. * quadratic_weighted_kappa(true_ratings,
                                                   predicted_ratings)
    return 'kappa', float(negated_kappa)
Beispiel #5
0
 def init_topk_best_model(self, init_top_k, this_sorted_models,
                          pred_list_valid, y_list_valid, cdf_list_valid,
                          num_valid_matrix):
     """
     Seed the ensemble with the first ``init_top_k`` models, equally
     weighted (every weight is 1.0, i.e. a plain average).

     Reads ``self.max_num_valid`` and ``self.model2idx``.

     The initial kappa is computed and printed once, after all selected
     models have been blended. It is based on the models actually
     selected, so it is still reported when fewer than ``init_top_k``
     models are available.

     :param init_top_k: number of leading models to take
     :param this_sorted_models: sequence of ``(model, kappa)`` pairs
     :param pred_list_valid: predictions, indexed
         ``[model_idx, run, fold, row]``
     :param y_list_valid: true labels, indexed ``[run, fold, row]``
     :param cdf_list_valid: per run/fold cdf
     :param num_valid_matrix: number of valid rows, indexed ``[run][fold]``
     :return: ``(best_model_list, best_model_weight,
         p_ens_list_valid_topk, w_ens)``
     """
     best_model_list = []
     best_model_weight = []
     p_ens_list_valid_topk = np.zeros(
         (config.n_runs, config.n_folds, self.max_num_valid), dtype=float)
     w_ens, this_w = 0, 1.0

     for model, kappa in this_sorted_models[0:init_top_k]:
         print("add the following model to the ensembles ")
         print("model: %s" % model)
         print("kappa: %.6f" % kappa)
         # Predictions of the model being added.
         this_p_list_valid = pred_list_valid[self.model2idx[model]]
         for run in range(config.n_runs):
             for fold in range(config.n_folds):
                 num_valid = num_valid_matrix[run][fold]
                 # Running weighted average of the models added so far.
                 p_ens_list_valid_topk[run, fold, :num_valid] = (
                     w_ens * p_ens_list_valid_topk[run, fold, :num_valid] +
                     this_w * this_p_list_valid[run, fold, :num_valid]) / (
                         w_ens + this_w)
         best_model_list.append(model)
         best_model_weight.append(this_w)
         w_ens += this_w

     # Score the finished blend once. Doing this inside the loop printed
     # an all-zero kappa for every model but the last, and never scored
     # at all when fewer than init_top_k models were supplied.
     if best_model_list:
         kappa_cv = np.zeros((config.n_runs, config.n_folds), dtype=float)
         for run in range(config.n_runs):
             for fold in range(config.n_folds):
                 num_valid = num_valid_matrix[run][fold]
                 cdf = cdf_list_valid[run, fold, :]
                 true_label = y_list_valid[run, fold, :num_valid]
                 score = getScore(
                     p_ens_list_valid_topk[run, fold, :num_valid], cdf)
                 kappa_cv[run][fold] = quadratic_weighted_kappa(
                     score, true_label)
         print("Init kappa: %.6f (%.6f)" %
               (np.mean(kappa_cv), np.std(kappa_cv)))
     return best_model_list, best_model_weight, p_ens_list_valid_topk, w_ens
Beispiel #6
0
def evalerror_ebc_cdf(preds, dtrain, cdf, hard_threshold=False):
    """
    Evaluation metric: negated quadratic weighted kappa for EBC output.

    Labels whose minimum is -1 and maximum is 1 are treated as the
    extended (binary) encoding and decoded with ``applyEBCRule``;
    otherwise they are shifted by one. The shift builds a new array, so
    the array returned by ``dtrain.get_label()`` is never modified.

    With ``hard_threshold`` set, the predictions are decoded by
    ``applyEBCRule`` alone. Otherwise they go through ``sigmoid``,
    ``applyEBCRule`` and finally ``getScore`` with ``cdf``.

    :param preds: raw model predictions
    :param dtrain: data container exposing ``get_label()``
    :param cdf: cdf forwarded to ``getScore`` (soft-threshold path only)
    :param hard_threshold: forwarded to ``applyEBCRule``; also selects
        the decoding path described above
    :return: ``('kappa', value)`` where ``value`` is ``-kappa`` as a float
    """
    labels = dtrain.get_label()
    if np.min(labels) == -1 and np.max(labels) == 1:
        # Extended samples were built during feature construction.
        labels = applyEBCRule(labels)
    else:
        # Extended samples are handled in the objective instead; labels
        # are 0..3 here (per the original comment). Not in-place, so the
        # caller's label array is left untouched.
        labels = labels + 1

    if hard_threshold:
        preds = applyEBCRule(preds, hard_threshold=hard_threshold)
    else:
        preds = sigmoid(preds)
        preds = applyEBCRule(preds, hard_threshold=hard_threshold)
        preds = getScore(preds, cdf)

    kappa = quadratic_weighted_kappa(labels, preds)
    # Negated so that early stopping, which minimises, maximises kappa.
    kappa *= -1.
    return 'kappa', float(kappa)
Beispiel #7
0
 def find_best_model(self, this_sorted_models, pred_list_valid,
                     y_list_valid, cdf_list_valid, num_valid_matrix, w_ens,
                     w_min, w_max, best_kappa, hypteropt_max_evals,
                     p_ens_list_valid_topk):
     """
     Pick the single model (and its weight) whose blend with the current
     ensemble gives the highest mean kappa above ``best_kappa``.

     For each candidate, hyperopt searches a weight in ``[w_min, w_max]``
     relative to an ensemble weight of 1. That weight is then scaled by
     ``w_ens`` and the blend is re-scored on every run/fold.

     :param this_sorted_models: iterable of ``(model, kappa)`` pairs
     :param pred_list_valid: predictions, indexed
         ``[model_idx, run, fold, row]``
     :param y_list_valid: true labels, indexed ``[run, fold, row]``
     :param cdf_list_valid: per run/fold cdf
     :param num_valid_matrix: number of valid rows, indexed ``[run][fold]``
     :param w_ens: accumulated weight of the current ensemble
     :param w_min: lower bound of the weight search
     :param w_max: upper bound of the weight search
     :param best_kappa: kappa a candidate has to beat
     :param hypteropt_max_evals: ``max_evals`` passed to ``fmin``
     :param p_ens_list_valid_topk: predictions of the current ensemble
     :return: ``(best_kappa, best_model, best_weight)``; ``best_model`` is
         ``None`` and ``best_weight`` is 0 when no candidate improves
     """
     winner, winner_weight = None, 0

     for candidate, _ in this_sorted_models:
         candidate_preds = pred_list_valid[self.model2idx[candidate]]

         # Hyperopt search for this candidate's weight.
         history = Trials()
         search_space = {
             'weight_current_model':
             hp.uniform('weight_current_model', w_min, w_max)
         }

         def objective(param):
             # The current ensemble is held at weight 1 during the search.
             return self.ensemble_selection_obj(
                 param, p_ens_list_valid_topk, 1., candidate_preds,
                 y_list_valid, cdf_list_valid, num_valid_matrix)

         tuned = fmin(objective,
                      search_space,
                      algo=tpe.suggest,
                      trials=history,
                      max_evals=hypteropt_max_evals)

         # Convert from "relative to 1" to "relative to w_ens".
         scaled_weight = tuned['weight_current_model'] * w_ens

         fold_kappas = np.zeros((config.n_runs, config.n_folds),
                                dtype=float)
         for run in range(config.n_runs):
             for fold in range(config.n_folds):
                 n_rows = num_valid_matrix[run][fold]
                 ensemble_part = p_ens_list_valid_topk[run, fold, :n_rows]
                 candidate_part = candidate_preds[run, fold, :n_rows]
                 blended = (w_ens * ensemble_part +
                            scaled_weight * candidate_part) / (
                                w_ens + scaled_weight)
                 ratings = getScore(blended, cdf_list_valid[run, fold, :])
                 fold_kappas[run][fold] = quadratic_weighted_kappa(
                     ratings, y_list_valid[run, fold, :n_rows])

         # Keep the candidate only if it strictly improves the mean kappa.
         mean_kappa = np.mean(fold_kappas)
         if mean_kappa > best_kappa:
             best_kappa = mean_kappa
             winner = candidate
             winner_weight = scaled_weight

     return best_kappa, winner, winner_weight
Beispiel #8
0
    def init_model_metrics_by_run_fold(self, feat_folder, cdf):
        """
        Load every model's validation predictions and compute per-model
        cross-validation kappa, run by run and fold by fold.

        Prediction files are read from
        ``<self.model_folder>/Run<r>/Fold<f>/pred/valid.pred.<model>.csv``
        and must provide the columns ``target`` and ``prediction``. When
        ``cdf`` is ``None`` the cdf for each run/fold is loaded from
        ``<config.solution_info>/Run<r>/Fold<f>/valid.cdf``.

        Row counts, true labels and the cdf are recorded from the first
        model only (they are taken to be identical across models); the
        kappa is computed for every model.

        :param feat_folder: not used by this method
        :param cdf: cdf to use for every run/fold, or ``None`` to load
            one per run/fold from disk
        :return: ``(kappa_list, pred_list_valid, y_list_valid,
            cdf_list_valid, num_valid_matrix)`` where

            - ``kappa_list`` maps model name to mean kappa,
            - ``pred_list_valid`` is indexed ``[model, run, fold, row]``,
            - ``y_list_valid`` is indexed ``[run, fold, row]``,
            - ``cdf_list_valid`` is indexed ``[run, fold, class]``,
            - ``num_valid_matrix`` holds the row count per ``[run, fold]``
        """
        kappa_list = {}
        # model x run x fold x row: validation predictions
        pred_list_valid = np.zeros((len(self.model_list), config.n_runs,
                                    config.n_folds, self.max_num_valid),
                                   dtype=float)
        # run x fold x row: true validation labels
        y_list_valid = np.zeros(
            (config.n_runs, config.n_folds, self.max_num_valid), dtype=float)
        # run x fold x class: cdf used to decode predictions
        cdf_list_valid = np.zeros(
            (config.n_runs, config.n_folds, config.num_of_class), dtype=float)
        # run x fold: number of validation rows
        num_valid_matrix = np.zeros((config.n_runs, config.n_folds), dtype=int)
        print("Load model...")
        for i, model in enumerate(self.model_list):
            print("model: %s" % model)
            kappa_cv = np.zeros((config.n_runs, config.n_folds), dtype=float)
            for run in range(config.n_runs):
                for fold in range(config.n_folds):
                    path = "%s/Run%d/Fold%d/pred" % (self.model_folder,
                                                     run + 1, fold + 1)
                    pred_file = "%s/valid.pred.%s.csv" % (path, model)
                    cdf_file = "%s/Run%d/Fold%d/valid.cdf" % (
                        config.solution_info, run + 1, fold + 1)
                    this_p_valid = pd.read_csv(pred_file, dtype=float)
                    predictions = this_p_valid["prediction"].values
                    # Model-independent data: record it once only.
                    if i == 0:
                        num_valid_matrix[run][fold] = this_p_valid.shape[0]
                        y_list_valid[run, fold, :num_valid_matrix[run]
                                     [fold]] = this_p_valid["target"].values
                        # Identity test: `cdf == None` on an array compares
                        # element-wise and cannot be used as a condition.
                        if cdf is None:
                            cdf_list_valid[run,
                                           fold, :] = np.loadtxt(cdf_file,
                                                                 dtype=float)
                        else:
                            cdf_list_valid[run, fold, :] = cdf
                    # Store this model's predictions for the run/fold.
                    pred_list_valid[
                        self.model2idx[model], run, fold, :this_p_valid.
                        shape[0]] = predictions
                    # Kappa is model-specific, so score every model; doing
                    # this only for the first left all others at 0.
                    score = getScore(predictions,
                                     cdf_list_valid[run, fold, :])
                    kappa_cv[run][fold] = quadratic_weighted_kappa(
                        score,
                        y_list_valid[run,
                                     fold, :num_valid_matrix[run][fold]])
            print("kappa: %.6f" % np.mean(kappa_cv))
            # Mean cross-validation kappa of this model.
            kappa_list[model] = np.mean(kappa_cv)

        return kappa_list, pred_list_valid, y_list_valid, cdf_list_valid, num_valid_matrix