def gen_kappa_cv(self, bagging_iter, y_list_valid, cdf_list_valid, num_valid_matrix, p_ens_list_valid_topk, p_ens_list_valid):
    """Fold the latest bag into the running bagged ensemble and score it.

    For every (run, fold) cell the running prediction is replaced, in place,
    by the equal-weight average of the ``bagging_iter`` bags seen so far and
    the new bag, then scored with quadratic weighted kappa.

    :param bagging_iter: zero-based index of the current bag; also the weight
        given to the running average accumulated so far.
    :param y_list_valid: true labels, indexed ``[run, fold, row]``.
    :param cdf_list_valid: per-(run, fold) cdf handed to ``getScore``.
    :param num_valid_matrix: number of valid rows per ``[run][fold]``.
    :param p_ens_list_valid_topk: predictions of the current bag.
    :param p_ens_list_valid: running bagged prediction; MUTATED in place.
    :return: ``(kappa_cv, cutoff, p_ens_list_valid)``.
    """
    fold_kappas = np.zeros((config.n_runs, config.n_folds), dtype=float)
    cutoff = np.zeros((3), dtype=float)

    for run, fold in np.ndindex(config.n_runs, config.n_folds):
        n_rows = num_valid_matrix[run][fold]

        # Running mean over bags: previous average carries weight
        # ``bagging_iter``, the new bag carries weight 1.
        blended = (bagging_iter * p_ens_list_valid[run, fold, :n_rows]
                   + p_ens_list_valid_topk[run, fold, :n_rows]) / (bagging_iter + 1.)
        p_ens_list_valid[run, fold, :n_rows] = blended

        # With the extra "valid" argument getScore also returns cutoffs.
        score, fold_cutoff = getScore(
            p_ens_list_valid[run, fold, :n_rows],
            cdf_list_valid[run, fold, :],
            "valid")
        fold_kappas[run, fold] = quadratic_weighted_kappa(
            score, y_list_valid[run, fold, :n_rows])
        cutoff += fold_cutoff

    # Average the cutoffs over all cells ...
    cutoff /= float(config.n_runs * config.n_folds)
    # ... then rescale them. NOTE(review): the constants are undocumented
    # (the original author also flagged them as unclear); presumably a
    # test-set size divided by the training share of a fold -- TODO confirm.
    cutoff *= (22513 / ((2. / 3) * 10158))

    print("Bag %d, kappa: %.6f (%.6f)" %
          (bagging_iter + 1, np.mean(fold_kappas), np.std(fold_kappas)))
    return fold_kappas, cutoff, p_ens_list_valid
def ensemble_selection_obj(self, param, p1_list, weight1, p2_list, y_list_valid, cdf_list_valid, num_valid_matrix):
    """Hyperopt objective for the blend weight of one candidate model.

    Blends the current ensemble prediction with the candidate prediction
    using the weight under optimisation, scores every (run, fold) cell with
    quadratic weighted kappa and reports the negated mean as the loss, so
    minimising the loss maximises the mean kappa.

    :param param: dict holding the sampled ``'weight_current_model'``.
    :param p1_list: current ensemble prediction, ``[run, fold, row]``.
    :param weight1: weight of ``p1_list`` in the blend.
    :param p2_list: candidate model prediction, ``[run, fold, row]``.
    :param y_list_valid: true labels, ``[run, fold, row]``.
    :param cdf_list_valid: per-(run, fold) cdf handed to ``getScore``.
    :param num_valid_matrix: number of valid rows per ``[run][fold]``.
    :return: ``{'loss': -mean_kappa, 'status': STATUS_OK}``.
    """
    candidate_weight = param['weight_current_model']
    fold_kappas = np.zeros((config.n_runs, config.n_folds), dtype=float)

    for run, fold in np.ndindex(config.n_runs, config.n_folds):
        n_rows = num_valid_matrix[run][fold]
        ensemble_pred = p1_list[run, fold, :n_rows]
        candidate_pred = p2_list[run, fold, :n_rows]

        # Weighted average, normalised by the total weight.
        blended = (weight1 * ensemble_pred + candidate_weight * candidate_pred) / (
            weight1 + candidate_weight)

        blended_score = getScore(blended, cdf_list_valid[run, fold, :])
        fold_kappas[run, fold] = quadratic_weighted_kappa(
            blended_score, y_list_valid[run, fold, :n_rows])

    return {'loss': -np.mean(fold_kappas), 'status': STATUS_OK}
def gen_kappa_cv(bagging_iter, Y_list_valid, cdf_list_valid, numValidMatrix, p_ens_list_valid, p_ens_list_valid_tmp):
    """Merge the newest bag into the bagged prediction and compute its kappa.

    :param bagging_iter: zero-based bag index; weight of the running average.
    :param Y_list_valid: true labels, indexed ``[run, fold, row]``.
    :param cdf_list_valid: per-(run, fold) cdf handed to ``getScore``.
    :param numValidMatrix: number of valid rows per ``[run][fold]``.
    :param p_ens_list_valid: running bagged prediction; MUTATED in place.
    :param p_ens_list_valid_tmp: prediction of the bag being merged.
    :return: ``(kappa_cv, cutoff)`` -- the per-cell kappa matrix and the
        averaged, rescaled cutoff vector.
    """
    n_cells = config.n_runs * config.n_folds
    per_cell_kappa = np.zeros((config.n_runs, config.n_folds), dtype=float)
    cutoff = np.zeros((3), dtype=float)

    for run in range(config.n_runs):
        for fold in range(config.n_folds):
            rows = numValidMatrix[run][fold]
            previous = p_ens_list_valid[run, fold, :rows]
            incoming = p_ens_list_valid_tmp[run, fold, :rows]

            # Equal-weight running mean across bags, written back in place.
            p_ens_list_valid[run, fold, :rows] = (
                bagging_iter * previous + incoming) / (bagging_iter + 1.)

            # The "valid" argument makes getScore return (score, cutoffs).
            score, cell_cutoff = getScore(
                p_ens_list_valid[run, fold, :rows],
                cdf_list_valid[run, fold, :],
                "valid")
            per_cell_kappa[run][fold] = quadratic_weighted_kappa(
                score, Y_list_valid[run, fold, :rows])
            cutoff += cell_cutoff

    cutoff /= float(n_cells)
    # NOTE(review): undocumented rescaling constants -- presumably a ratio of
    # data-set sizes; confirm before changing.
    cutoff *= (22513 / ((2. / 3) * 10158))

    print("Bag %d, kappa: %.6f (%.6f)" %
          (bagging_iter + 1, np.mean(per_cell_kappa), np.std(per_cell_kappa)))
    return per_cell_kappa, cutoff
def ensembleSelectionObj(param, p1_list, weight1, p2_list, true_label_list, cdf_list, numValidMatrix):
    """Hyperopt objective: negated mean kappa of a two-way weighted blend.

    :param param: dict holding the sampled ``'weight2'``.
    :param p1_list: first prediction set, indexed ``[run, fold, row]``.
    :param weight1: weight applied to ``p1_list``.
    :param p2_list: second prediction set, indexed ``[run, fold, row]``.
    :param true_label_list: true labels, indexed ``[run, fold, row]``.
    :param cdf_list: per-(run, fold) cdf handed to ``getScore``.
    :param numValidMatrix: number of valid rows per ``[run][fold]``.
    :return: ``{'loss': -mean_kappa, 'status': STATUS_OK}``.
    """
    w2 = param['weight2']
    kappas = np.zeros((config.n_runs, config.n_folds), dtype=float)

    for run in range(config.n_runs):
        for fold in range(config.n_folds):
            rows = numValidMatrix[run][fold]
            # Normalised weighted average of the two prediction sets.
            mix = (weight1 * p1_list[run, fold, :rows]
                   + w2 * p2_list[run, fold, :rows]) / (weight1 + w2)
            mix_score = getScore(mix, cdf_list[run, fold, :])
            kappas[run][fold] = quadratic_weighted_kappa(
                mix_score, true_label_list[run, fold, :rows])

    mean_kappa = np.mean(kappas)
    # Hyperopt minimises, so negate to maximise kappa.
    return {'loss': -mean_kappa, 'status': STATUS_OK}
def ensemble_bagging_models_prediction(self, best_bagged_model_list, best_bagged_model_weight, cdf, cutoff=None):
    """Combine the selected models' "All" predictions into one submission.

    Within each bag the models are blended with their selected weights
    (normalised running weighted mean); the bags are then averaged with
    equal weight. The blended prediction is finally converted to scores.

    :param best_bagged_model_list: one list of model names per bag.
    :param best_bagged_model_weight: one list of weights per bag, aligned
        with ``best_bagged_model_list``.
    :param cdf: cdf passed to ``getScore`` when ``cutoff`` is None.
    :param cutoff: optional cutoffs; when given, ``getTestScore`` is used
        instead of ``getScore``.
    :return: ``pandas.DataFrame`` with columns ``id`` and ``prediction``.
    :raises ValueError: if ``best_bagged_model_list`` is empty (previously
        this surfaced as an opaque UnboundLocalError).
    """
    bagging_size = len(best_bagged_model_list)
    if bagging_size == 0:
        raise ValueError(
            "best_bagged_model_list is empty; there is nothing to ensemble")

    for bagging_iter in range(bagging_size):
        # Accumulated weight of the models already blended in this bag.
        w_ens = 0
        bag = zip(best_bagged_model_list[bagging_iter],
                  best_bagged_model_weight[bagging_iter])
        for position, (model, this_w) in enumerate(bag):
            pred_file = "%s/All/pred/test.pred.%s.csv" % (
                self.model_folder, model)
            # Read the file once; it supplies both predictions and ids.
            pred_frame = pd.read_csv(pred_file, dtype=float)
            this_p_valid = pred_frame["prediction"].values

            if position == 0:
                # First model of the bag: start the blend from zeros and
                # pick up the test ids (parsed as float, so cast to int).
                p_ens_valid = np.zeros((this_p_valid.shape[0]), dtype=float)
                id_test = np.asarray(pred_frame["id"].values, dtype=int)

            # Normalised weighted linear combination.
            p_ens_valid = (w_ens * p_ens_valid + this_w * this_p_valid) / (
                w_ens + this_w)
            w_ens += this_w

        # NOTE(review): a bag with no models leaves p_ens_valid untouched,
        # i.e. the previous bag's blend is counted again (unchanged from the
        # original behaviour) -- confirm bags are never empty.
        if bagging_iter == 0:
            p_ens_valid_bag = p_ens_valid
        else:
            # Every bag has weight 1: plain running mean over bags.
            p_ens_valid_bag = (bagging_iter * p_ens_valid_bag + p_ens_valid) / (
                bagging_iter + 1.)

    if cutoff is None:
        p_ens_score = getScore(p_ens_valid_bag, cdf)
    else:
        p_ens_score = getTestScore(p_ens_valid_bag, cutoff)

    output = pd.DataFrame({"id": id_test, "prediction": p_ens_score})
    return output
def gen_ens_temp(init_top_k, this_sorted_models, model2idx, pred_list_valid, numValidMatrix, cdf_list_valid, Y_list_valid, p_ens_list_valid_tmp, best_model_list, best_model_weight):
    """Seed the ensemble with the first ``init_top_k`` models, equally weighted.

    :param init_top_k: number of leading models to seed with; ``<= 0`` skips
        the initialisation entirely.
    :param this_sorted_models: iterable of ``(model, kappa)`` pairs, consumed
        in order.
    :param model2idx: maps a model name to its index in ``pred_list_valid``.
    :param pred_list_valid: predictions, ``[model, run, fold, row]``.
    :param numValidMatrix: number of valid rows per ``[run][fold]``.
    :param cdf_list_valid: per-(run, fold) cdf handed to ``getScore``.
    :param Y_list_valid: true labels, ``[run, fold, row]``.
    :param p_ens_list_valid_tmp: running ensemble prediction; MUTATED.
    :param best_model_list: list the seeded model names are appended to.
    :param best_model_weight: list the seeded weights are appended to.
    :return: accumulated ensemble weight (0 when nothing was seeded).
    """
    w_ens, this_w = 0, 1.0  # every seed model carries weight 1.0
    if init_top_k <= 0:
        return w_ens

    kappa_cv = np.zeros((config.n_runs, config.n_folds), dtype=float)
    for cnt, (model, kappa) in enumerate(this_sorted_models):
        if cnt >= init_top_k:
            # All seeds consumed: stop instead of scanning the remainder.
            break
        print("add to the ensembles the following model")
        print("model: %s" % model)
        print("kappa: %.6f" % kappa)
        this_p_list_valid = pred_list_valid[model2idx[model]]
        # Kappa is only evaluated once the last seed has been blended in.
        is_last_seed = cnt == init_top_k - 1

        for run in range(config.n_runs):
            for fold in range(config.n_folds):
                numValid = numValidMatrix[run][fold]
                # Normalised running weighted mean of the seed predictions.
                p_ens_list_valid_tmp[run, fold, :numValid] = (
                    w_ens * p_ens_list_valid_tmp[run, fold, :numValid] +
                    this_w * this_p_list_valid[run, fold, :numValid]) / (
                        w_ens + this_w)
                if is_last_seed:
                    cdf = cdf_list_valid[run, fold, :]
                    true_label = Y_list_valid[run, fold, :numValid]
                    score = getScore(
                        p_ens_list_valid_tmp[run, fold, :numValid], cdf)
                    kappa_cv[run][fold] = quadratic_weighted_kappa(
                        score, true_label)

        best_model_list.append(model)
        best_model_weight.append(this_w)
        w_ens += this_w

    print("Init kappa: %.6f (%.6f)" % (np.mean(kappa_cv), np.std(kappa_cv)))
    return w_ens
def gen_kappa_list(model_list, model2idx, model_folder, feat_folder, cdf, pred_list_valid, Y_list_valid, cdf_list_valid, numValidMatrix, kappa_list):
    """Load every model's CV predictions and record its mean kappa.

    All array/dict arguments marked "output" are filled in place.

    :param model_list: model names to load.
    :param model2idx: maps a model name to its index in ``pred_list_valid``.
    :param model_folder: root folder of ``Run%d/Fold%d/valid.pred.<model>.csv``.
    :param feat_folder: root folder of ``Run%d/Fold%d/valid.cdf``.
    :param cdf: cdf to use for every cell, or None to load each cell's
        ``valid.cdf`` file.
    :param pred_list_valid: output, predictions ``[model, run, fold, row]``.
    :param Y_list_valid: output, true labels ``[run, fold, row]``.
    :param cdf_list_valid: output, cdf per ``[run, fold]``.
    :param numValidMatrix: output, row count per ``[run][fold]``.
    :param kappa_list: output dict, model name -> mean kappa over all cells.
    :return: None.
    """
    print("Load model...")
    for model in model_list:
        model_id = model2idx[model]
        print("model: %s" % model)
        kappa_cv = np.zeros((config.n_runs, config.n_folds), dtype=float)

        for run in range(config.n_runs):
            for fold in range(config.n_folds):
                path = "%s/Run%d/Fold%d" % (model_folder, run + 1, fold + 1)
                pred_file = "%s/valid.pred.%s.csv" % (path, model)
                cdf_file = "%s/Run%d/Fold%d/valid.cdf" % (feat_folder, run + 1,
                                                          fold + 1)

                this_p_valid = pd.read_csv(pred_file, dtype=float)
                num_rows = this_p_valid.shape[0]
                numValidMatrix[run][fold] = num_rows
                pred_list_valid[model_id, run, fold, :num_rows] = \
                    this_p_valid["prediction"].values
                Y_list_valid[run, fold, :num_rows] = \
                    this_p_valid["target"].values

                # Identity test: ``cdf == None`` is element-wise for a NumPy
                # array and cannot be used as an ``if`` condition.
                if cdf is None:
                    cdf_list_valid[run, fold, :] = np.loadtxt(cdf_file,
                                                              dtype=float)
                else:
                    cdf_list_valid[run, fold, :] = cdf

                # Slice with the stored count, as the original did.
                stored_rows = numValidMatrix[run][fold]
                score = getScore(
                    pred_list_valid[model_id, run, fold, :stored_rows],
                    cdf_list_valid[run, fold, :])
                kappa_cv[run][fold] = quadratic_weighted_kappa(
                    score, Y_list_valid[run, fold, :stored_rows])

        print("kappa: %.6f" % np.mean(kappa_cv))
        # Mean kappa of this model over all run/fold cells.
        kappa_list[model] = np.mean(kappa_cv)
def out_put_all(self, feat_folder, feat_name, kappa_cv_mean, kappa_cv_std, pred_raw, pred_rank):
    """Write the raw, rank and score predictions for the test set as CSV.

    Output paths, test ids and the test cdf are all read from
    ``self.all_matrix``. ``feat_folder``, ``feat_name``, ``kappa_cv_mean``
    and ``kappa_cv_std`` are accepted but not used here.

    :param pred_raw: raw prediction per test row.
    :param pred_rank: rank-transformed prediction per test row.
    :return: None.
    """
    matrix = self.all_matrix

    def _dump(values, path_key):
        # One "id,prediction" CSV without the index column.
        frame = pd.DataFrame({"id": matrix['id_test'], "prediction": values})
        frame.to_csv(matrix[path_key], index=False)

    _dump(pred_raw, 'raw_pred_test_path')
    _dump(pred_rank, 'rank_pred_test_path')

    # Score prediction. Per the earlier author's note this deliberately uses
    # pred_raw (the bag-averaged prediction), not a single bag's prediction.
    pred_score = utils.getScore(pred_raw, matrix['cdf_test'])
    _dump(pred_score, 'subm_path')
def init_topk_best_model(self, init_top_k, this_sorted_models, pred_list_valid, y_list_valid, cdf_list_valid, num_valid_matrix):
    """Seed the ensemble with the first ``init_top_k`` models, weight 1 each.

    Reads ``self.max_num_valid`` and ``self.model2idx``.

    :param init_top_k: how many leading entries of ``this_sorted_models`` to
        blend.
    :param this_sorted_models: sequence of ``(model, kappa)`` pairs.
    :param pred_list_valid: predictions, ``[model, run, fold, row]``.
    :param y_list_valid: true labels, ``[run, fold, row]``.
    :param cdf_list_valid: per-(run, fold) cdf handed to ``getScore``.
    :param num_valid_matrix: number of valid rows per ``[run][fold]``.
    :return: ``(best_model_list, best_model_weight, p_ens_list_valid_topk,
        w_ens)`` -- seeded names, their weights, the blended prediction and
        the accumulated weight.
    """
    seed_weight = 1.0
    w_ens = 0
    chosen_models, chosen_weights = [], []
    p_ens_list_valid_topk = np.zeros(
        (config.n_runs, config.n_folds, self.max_num_valid), dtype=float)
    kappa_cv = np.zeros((config.n_runs, config.n_folds), dtype=float)

    for position, (model, kappa) in enumerate(this_sorted_models[0:init_top_k]):
        print("add the following model to the ensembles ")
        print("model: %s" % model)
        print("kappa: %.6f" % kappa)

        model_pred = pred_list_valid[self.model2idx[model]]
        # Metrics are produced only while blending the last seed model.
        closing_round = position == init_top_k - 1

        for run, fold in np.ndindex(config.n_runs, config.n_folds):
            n_rows = num_valid_matrix[run][fold]
            # Normalised running weighted mean of the seed predictions.
            p_ens_list_valid_topk[run, fold, :n_rows] = (
                w_ens * p_ens_list_valid_topk[run, fold, :n_rows]
                + seed_weight * model_pred[run, fold, :n_rows]) / (
                    w_ens + seed_weight)

            if closing_round:
                score = getScore(
                    p_ens_list_valid_topk[run, fold, :n_rows],
                    cdf_list_valid[run, fold, :])
                kappa_cv[run, fold] = quadratic_weighted_kappa(
                    score, y_list_valid[run, fold, :n_rows])

        chosen_models.append(model)
        chosen_weights.append(seed_weight)
        w_ens += seed_weight

    print("Init kappa: %.6f (%.6f)" % (np.mean(kappa_cv), np.std(kappa_cv)))
    return chosen_models, chosen_weights, p_ens_list_valid_topk, w_ens
def gen_bagging(self, param, set_obj, all):
    """Train ``bagging_size`` bagged models and average their predictions.

    Each bag draws a bootstrap sample of the training rows, rebuilds the
    training ``DMatrix`` and calls ``self.train_predict`` (implemented by
    subclasses). Predictions are stored one column per bag.

    Fix: the per-bag prediction matrix is now allocated once, before the
    loop. It used to be re-created with zeros inside every iteration, which
    erased the earlier bags so the running/final mean averaged zeros.

    :param param: model parameters forwarded to ``train_predict``.
    :param set_obj: dict of data for this run. Reads ``numTrain``,
        ``X_train``, ``labels_train``, ``weight_train`` and, depending on
        ``all``, the ``*_test`` or ``*_valid`` entries; writes
        ``index_base``, ``dtrain``, ``dtest``/``dvalid`` and ``watchlist``.
    :param all: truthy to predict the test set, falsy to predict (and
        score) the validation set.
    :return: ``(pred_raw, pred_rank)`` when ``all`` is truthy, otherwise
        ``(pred_raw, pred_rank, kappa_valid)``.
    """
    bagging_size = model_param_conf.bagging_size
    num_rows = set_obj['numTest'] if all else set_obj['numValid']
    # One column per bag; allocated ONCE so earlier bags are kept.
    preds_bagging = np.zeros((num_rows, bagging_size), dtype=float)

    for n in range(bagging_size):
        # Bootstrap the training rows. Per the original author's note, with
        # ratio 1 and replacement disabled this is effectively the full
        # training set -- depends on model_param_conf, confirm there.
        index_base, index_meta = utils.bootstrap_all(
            model_param_conf.bootstrap_replacement,
            set_obj['numTrain'],
            model_param_conf.bootstrap_ratio)
        set_obj['index_base'] = index_base
        set_obj['dtrain'] = xgb.DMatrix(
            set_obj['X_train'][index_base],
            label=set_obj['labels_train'][index_base],
            weight=set_obj['weight_train'][index_base])

        if all:
            set_obj['dtest'] = xgb.DMatrix(set_obj['X_test'],
                                           label=set_obj['labels_test'])
            set_obj['watchlist'] = []
            if model_param_conf.verbose_level >= 2:
                set_obj['watchlist'] = [(set_obj['dtrain'], 'train')]
            # Polymorphic call into the subclass implementation.
            preds_bagging[:, n] = self.train_predict(param, set_obj, all)
        else:
            set_obj['dvalid'] = xgb.DMatrix(set_obj['X_valid'],
                                            label=set_obj['labels_valid'])
            set_obj['watchlist'] = []
            if model_param_conf.verbose_level >= 2:
                # NOTE(review): reads 'dvalid_base' although 'dvalid' is the
                # key written just above -- confirm this is intended.
                set_obj['watchlist'] = [(set_obj['dtrain'], 'train'),
                                        (set_obj['dvalid_base'], 'valid')]
            # Polymorphic call into the subclass implementation.
            preds_bagging[:, n] = self.train_predict(param, set_obj, all)

            # Running mean over the bags trained so far.
            pred_raw = np.mean(preds_bagging[:, :(n + 1)], axis=1)
            # argsort of argsort turns values into their 0-based ranks.
            pred_rank = pred_raw.argsort().argsort()
            pred_score, cutoff = utils.getScore(pred_rank,
                                                set_obj['cdf_valid'],
                                                valid=True)
            kappa_valid = utils.quadratic_weighted_kappa(pred_score,
                                                         set_obj['Y_valid'])

    if all:
        pred_raw = np.mean(preds_bagging, axis=1)
        pred_rank = pred_raw.argsort().argsort()
        return pred_raw, pred_rank
    else:
        return pred_raw, pred_rank, kappa_valid
def find_best_model(self, this_sorted_models, pred_list_valid, y_list_valid, cdf_list_valid, num_valid_matrix, w_ens, w_min, w_max, best_kappa, hypteropt_max_evals, p_ens_list_valid_topk):
    """Pick the single model (and weight) that most improves the ensemble.

    For every candidate, hyperopt searches the blend weight against the
    current ensemble (which is given weight 1); the weight is then rescaled
    by ``w_ens`` and the blended mean kappa is compared with the best so far.

    :param this_sorted_models: iterable of ``(model, kappa)`` candidates.
    :param pred_list_valid: predictions, ``[model, run, fold, row]``.
    :param y_list_valid: true labels, ``[run, fold, row]``.
    :param cdf_list_valid: per-(run, fold) cdf handed to ``getScore``.
    :param num_valid_matrix: number of valid rows per ``[run][fold]``.
    :param w_ens: accumulated weight of the current ensemble.
    :param w_min: lower bound of the weight search interval.
    :param w_max: upper bound of the weight search interval.
    :param best_kappa: kappa a candidate must strictly exceed.
    :param hypteropt_max_evals: hyperopt evaluation budget per candidate.
    :param p_ens_list_valid_topk: current ensemble prediction.
    :return: ``(best_kappa, best_model, best_weight)``; the model is None
        and the weight 0 when no candidate beats ``best_kappa``.
    """
    winner, winner_weight = None, 0

    for model, kappa in this_sorted_models:
        candidate_pred = pred_list_valid[self.model2idx[model]]

        def objective(param):
            # Current ensemble has weight 1; search the candidate's weight.
            return self.ensemble_selection_obj(
                param, p_ens_list_valid_topk, 1., candidate_pred,
                y_list_valid, cdf_list_valid, num_valid_matrix)

        trials = Trials()
        param_space = {
            'weight_current_model':
            hp.uniform('weight_current_model', w_min, w_max)
        }
        best_params = fmin(objective,
                           param_space,
                           algo=tpe.suggest,
                           trials=trials,
                           max_evals=hypteropt_max_evals)

        # The optimum was found relative to an ensemble weight of 1, so
        # scale it to the real accumulated ensemble weight.
        this_w = best_params['weight_current_model']
        this_w *= w_ens

        kappa_cv = np.zeros((config.n_runs, config.n_folds), dtype=float)
        for run, fold in np.ndindex(config.n_runs, config.n_folds):
            n_rows = num_valid_matrix[run][fold]
            ensemble_part = p_ens_list_valid_topk[run, fold, :n_rows]
            candidate_part = candidate_pred[run, fold, :n_rows]
            blended = (w_ens * ensemble_part + this_w * candidate_part) / (
                w_ens + this_w)
            score = getScore(blended, cdf_list_valid[run, fold, :])
            kappa_cv[run, fold] = quadratic_weighted_kappa(
                score, y_list_valid[run, fold, :n_rows])

        mean_kappa = np.mean(kappa_cv)
        if mean_kappa > best_kappa:
            best_kappa = mean_kappa
            winner = model
            winner_weight = this_w

    return best_kappa, winner, winner_weight
def init_model_metrics_by_run_fold(self, feat_folder, cdf):
    """Load every model's CV predictions and build the per-run/fold tables.

    Reads ``self.model_list``, ``self.model2idx``, ``self.model_folder`` and
    ``self.max_num_valid``. ``feat_folder`` is accepted but not used; cdf
    files are located under ``config.solution_info``.

    Fix: ``cdf`` is now tested with ``is None``. ``cdf == None`` compares
    element-wise when ``cdf`` is a NumPy array and cannot be used as an
    ``if`` condition.

    :param feat_folder: unused.
    :param cdf: cdf to use for every cell, or None to load each cell's
        ``valid.cdf`` file.
    :return: ``(kappa_list, pred_list_valid, y_list_valid, cdf_list_valid,
        num_valid_matrix)`` where

        - ``kappa_list``: dict, model name -> mean kappa over all cells
        - ``pred_list_valid``: predictions ``[model, run, fold, row]``
        - ``y_list_valid``: true labels ``[run, fold, row]``
        - ``cdf_list_valid``: cdf per ``[run, fold]``
        - ``num_valid_matrix``: row count per ``[run, fold]``
    """
    kappa_list = dict()
    pred_list_valid = np.zeros((len(self.model_list), config.n_runs,
                                config.n_folds, self.max_num_valid),
                               dtype=float)
    y_list_valid = np.zeros(
        (config.n_runs, config.n_folds, self.max_num_valid), dtype=float)
    cdf_list_valid = np.zeros(
        (config.n_runs, config.n_folds, config.num_of_class), dtype=float)
    num_valid_matrix = np.zeros((config.n_runs, config.n_folds), dtype=int)

    print("Load model...")
    for i, model in enumerate(self.model_list):
        print("model: %s" % model)
        kappa_cv = np.zeros((config.n_runs, config.n_folds), dtype=float)
        for run in range(config.n_runs):
            for fold in range(config.n_folds):
                path = "%s/Run%d/Fold%d/pred" % (self.model_folder, run + 1,
                                                 fold + 1)
                pred_file = "%s/valid.pred.%s.csv" % (path, model)
                cdf_file = "%s/Run%d/Fold%d/valid.cdf" % (
                    config.solution_info, run + 1, fold + 1)
                this_p_valid = pd.read_csv(pred_file, dtype=float)
                num_rows = this_p_valid.shape[0]

                # Row counts, labels and cdfs are the same for every model,
                # so they are filled while processing the first model only.
                if i == 0:
                    num_valid_matrix[run][fold] = num_rows
                    y_list_valid[run, fold, :num_rows] = \
                        this_p_valid["target"].values
                    if cdf is None:
                        cdf_list_valid[run, fold, :] = np.loadtxt(
                            cdf_file, dtype=float)
                    else:
                        cdf_list_valid[run, fold, :] = cdf

                score = getScore(this_p_valid["prediction"].values,
                                 cdf_list_valid[run, fold, :])
                kappa_cv[run][fold] = quadratic_weighted_kappa(
                    score,
                    y_list_valid[run, fold, :num_valid_matrix[run][fold]])

                # Store this model's predictions for the cell.
                pred_list_valid[self.model2idx[model], run, fold, :num_rows] = \
                    this_p_valid["prediction"].values

        print("kappa: %.6f" % np.mean(kappa_cv))
        # Mean kappa of this model over all run/fold cells.
        kappa_list[model] = np.mean(kappa_cv)

    return kappa_list, pred_list_valid, y_list_valid, cdf_list_valid, num_valid_matrix
def gen_best_weight(this_sorted_models, model2idx, w_min, w_max, pred_list_valid, hypteropt_max_evals, w_ens, Y_list_valid, cdf_list_valid, numValidMatrix, p_ens_list_valid_tmp, best_model_list, best_model_weight):
    """Greedy ensemble selection: keep adding the best-improving model.

    Each round searches, for every candidate, the blend weight that
    maximises mean kappa (hyperopt), then adds the candidate with the
    highest kappa to the ensemble. The loop stops as soon as no candidate
    strictly improves on the best kappa reached so far.

    Fix: ``best_kappa``, ``best_model`` and ``best_weight`` are now
    initialised. They used to be read before any assignment, so the function
    raised UnboundLocalError on its first comparison.

    :param this_sorted_models: iterable of ``(model, kappa)`` candidates.
    :param model2idx: maps a model name to its index in ``pred_list_valid``.
    :param w_min: lower bound of the weight search interval.
    :param w_max: upper bound of the weight search interval.
    :param pred_list_valid: predictions, ``[model, run, fold, row]``.
    :param hypteropt_max_evals: hyperopt evaluation budget per candidate.
    :param w_ens: accumulated weight of the ensemble on entry.
    :param Y_list_valid: true labels, ``[run, fold, row]``.
    :param cdf_list_valid: per-(run, fold) cdf handed to ``getScore``.
    :param numValidMatrix: number of valid rows per ``[run][fold]``.
    :param p_ens_list_valid_tmp: ensemble prediction; MUTATED in place.
    :param best_model_list: list the selected model names are appended to.
    :param best_model_weight: list the selected weights are appended to.
    :return: None; results are delivered through the mutated arguments.
    """
    # Baseline a blend has to beat. NOTE(review): 0 is assumed as the
    # starting point -- confirm against the caller's intended baseline.
    best_kappa = 0
    best_model = None
    best_weight = 0

    iteration = 0
    while True:
        iteration += 1
        for model, _ in this_sorted_models:
            this_p_list_valid = pred_list_valid[model2idx[model]]

            # Hyperopt search for this candidate's weight, measured against
            # an ensemble weight of 1.
            trials = Trials()
            param_space = {'weight2': hp.uniform('weight2', w_min, w_max)}
            obj = lambda param: ensembleSelectionObj(
                param, p_ens_list_valid_tmp, 1., this_p_list_valid,
                Y_list_valid, cdf_list_valid, numValidMatrix)
            best_params = fmin(obj,
                               param_space,
                               algo=tpe.suggest,
                               trials=trials,
                               max_evals=hypteropt_max_evals)
            this_w = best_params['weight2']
            # Rescale to the real accumulated ensemble weight.
            this_w *= w_ens

            # Kappa of the ensemble with this candidate blended in.
            kappa_cv = np.zeros((config.n_runs, config.n_folds), dtype=float)
            for run in range(config.n_runs):
                for fold in range(config.n_folds):
                    numValid = numValidMatrix[run][fold]
                    p1 = p_ens_list_valid_tmp[run, fold, :numValid]
                    p2 = this_p_list_valid[run, fold, :numValid]
                    true_label = Y_list_valid[run, fold, :numValid]
                    cdf = cdf_list_valid[run, fold, :]
                    p_ens = (w_ens * p1 + this_w * p2) / (w_ens + this_w)
                    score = getScore(p_ens, cdf)
                    kappa_cv[run][fold] = quadratic_weighted_kappa(
                        score, true_label)

            mean_kappa = np.mean(kappa_cv)
            if mean_kappa > best_kappa:
                best_kappa, best_model, best_weight = mean_kappa, model, this_w

        if best_model is None:
            # No candidate improved the ensemble this round: done.
            break

        print("Iter: %d" % iteration)
        print(" model: %s" % best_model)
        print(" weight: %s" % best_weight)
        print(" kappa: %.6f" % best_kappa)
        best_model_list.append(best_model)
        best_model_weight.append(best_weight)

        # Blend the winner into the running ensemble prediction.
        this_p_list_valid = pred_list_valid[model2idx[best_model]]
        for run in range(config.n_runs):
            for fold in range(config.n_folds):
                numValid = numValidMatrix[run][fold]
                p_ens_list_valid_tmp[run, fold, :numValid] = (
                    w_ens * p_ens_list_valid_tmp[run, fold, :numValid] +
                    best_weight * this_p_list_valid[run, fold, :numValid]) / (
                        w_ens + best_weight)

        best_model = None
        w_ens += best_weight