def cv(self):
    """Cross-validate the learner over ``self.n_iter`` pre-defined splits.

    For each split: fit on the train part, predict the validation part,
    record the RMSE, and dump a per-run prediction CSV (plus a class
    probability CSV when the wrapped learner exposes ``predict_proba``)
    under ``config.OUTPUT_DIR``. Stores mean/std RMSE on the instance.

    Returns
    -------
    self, for call chaining.
    """
    t0 = time.time()
    if self.verbose:
        self.logger.info("=" * 50)
        self.logger.info("Task")
        self.logger.info(" %s" % str(self.__str__()))
        self.logger.info("Param")
        self._print_param_dict(self.learner.param_dict)
        self.logger.info("Result")
        self.logger.info(" Run RMSE Shape")
    fold_rmse = np.zeros(self.n_iter)
    for run in range(self.n_iter):
        # fetch this split's data from the feature object
        X_train, y_train, X_valid, y_valid = self.feature._get_train_valid_data(run)
        # fit and score
        self.learner.fit(X_train, y_train)
        y_pred = self.learner.predict(X_valid)
        fold_rmse[run] = dist_utils._rmse(y_valid, y_pred)
        # log this run's RMSE and train shape
        self.logger.info(" {:>3} {:>8} {} x {}".format(
            run + 1, np.round(fold_rmse[run], 6),
            X_train.shape[0], X_train.shape[1]))
        # persist raw validation predictions for this run
        fname = "%s/Run%d/valid.pred.%s.csv" % (config.OUTPUT_DIR, run + 1, self.__str__())
        pd.DataFrame({"target": y_valid, "prediction": y_pred}).to_csv(
            fname, index=False, columns=["target", "prediction"])
        # optionally persist per-class probabilities as well
        if hasattr(self.learner.learner, "predict_proba"):
            y_proba = self.learner.learner.predict_proba(X_valid)
            fname = "%s/Run%d/valid.proba.%s.csv" % (config.OUTPUT_DIR, run + 1, self.__str__())
            proba_cols = ["proba%d" % c for c in range(y_proba.shape[1])]
            proba_df = pd.DataFrame(y_proba, columns=proba_cols)
            proba_df["target"] = y_valid
            proba_df.to_csv(fname, index=False)
    self.rmse_cv_mean = np.mean(fold_rmse)
    self.rmse_cv_std = np.std(fold_rmse)
    elapsed = time.time() - t0
    mins = int(elapsed / 60.)
    if self.verbose:
        self.logger.info("RMSE")
        self.logger.info(" Mean: %.6f" % self.rmse_cv_mean)
        self.logger.info(" Std: %.6f" % self.rmse_cv_std)
        self.logger.info("Time")
        if mins > 0:
            self.logger.info(" %d mins" % mins)
        else:
            self.logger.info(" %d secs" % elapsed)
        self.logger.info("-" * 50)
    return self
def cv(self):
    """Cross-validate the learner over the feature object's split generator.

    Iterates ``self.feature._get_train_valid_data()`` (which yields
    train/valid arrays plus their index arrays), fits and scores each
    fold with ``dist_utils._rmse``, collects out-of-fold predictions
    into ``train_pred``, and writes the last fold's predictions to a
    CSV. Stores mean/std of the fold scores on the instance.

    Returns
    -------
    self, for call chaining.
    """
    start = time.time()
    if self.verbose:
        self.logger.info("=" * 50)
        self.logger.info("Task")
        self.logger.info(" %s" % str(self.__str__()))
        self.logger.info("Param")
        self._print_param_dict(self.learner.param_dict)
        self.logger.info("Result")
        self.logger.info(" Run RMSE Shape")
    # NOTE(review): despite the name, this holds RMSE values (see
    # dist_utils._rmse below), and the summary is logged under "AUC" —
    # the labels look inherited from another task; confirm intent.
    auc_cv = np.zeros(self.n_iter)
    total_train = self.feature.len_train
    # Out-of-fold predictions, indexed by each fold's valid_ind.
    # NOTE(review): filled but never saved/returned here — presumably
    # the per-fold CSV below was meant to dump this instead; confirm.
    train_pred = np.zeros(total_train)
    for i, (X_train, y_train, X_valid, y_valid, train_ind, valid_ind) in \
            enumerate(self.feature._get_train_valid_data()):
        # fit and score this fold
        self.learner.fit(X_train, y_train)
        y_pred = self.learner.predict(X_valid)
        train_pred[valid_ind] = y_pred
        auc_cv[i] = dist_utils._rmse(y_valid, y_pred)
        # log this fold's score and train shape
        self.logger.info(" {:>3} {:>8} {} x {}".format(
            i + 1, np.round(auc_cv[i], 6),
            X_train.shape[0], X_train.shape[1]))
        # save this fold's validation predictions
        # NOTE(review): the path is identical for every fold, so each
        # fold overwrites the previous file and only the last survives.
        fname = "%s/cv_pred.%s.csv" % (config.OUTPUT_DIR, self.__str__())
        df = pd.DataFrame({"click_id": y_valid, "predicted": y_pred})
        df.to_csv(fname, index=False, columns=["click_id", "predicted"])
    self.rmse_cv_mean = np.mean(auc_cv)
    self.rmse_cv_std = np.std(auc_cv)
    end = time.time()
    _sec = end - start
    _min = int(_sec / 60.)
    if self.verbose:
        self.logger.info("AUC")
        self.logger.info(" Mean: %.6f" % self.rmse_cv_mean)
        self.logger.info(" Std: %.6f" % self.rmse_cv_std)
        self.logger.info("Time")
        if _min > 0:
            self.logger.info(" %d mins" % _min)
        else:
            self.logger.info(" %d secs" % _sec)
        self.logger.info("-" * 50)
    return self
def cv(self):
    """Run cross validation over ``self.n_iter`` pre-defined splits.

    Logs the task, sorted parameters, and per-run RMSE; dumps per-run
    prediction and (when available) probability CSVs under
    ``config.OUTPUT_DIR``; stores mean/std RMSE on the instance.

    Returns
    -------
    self, for call chaining.
    """
    began = time.time()
    if self.verbose:
        self.logger.info("=" * 50)
        self.logger.info("Task")
        self.logger.info(" %s" % str(self.__str__()))
        self.logger.info("Param")
        # parameters logged in sorted key order for stable output
        for key, value in sorted(self.learner.param_dict.items()):
            self.logger.info(" %s: %s" % (key, value))
        self.logger.info("Result")
        self.logger.info(" Run RMSE Shape")
    scores = np.zeros(self.n_iter)
    for run_idx in range(self.n_iter):
        # fetch this split's data
        X_train, y_train, X_valid, y_valid = self.feature._get_train_valid_data(run_idx)
        # fit and score
        self.learner.fit(X_train, y_train)
        y_pred = self.learner.predict(X_valid)
        scores[run_idx] = dist_utils._rmse(y_valid, y_pred)
        # log run result
        self.logger.info(" {:>3} {:>8} {} x {}".format(
            run_idx + 1, np.round(scores[run_idx], 6),
            X_train.shape[0], X_train.shape[1]))
        # persist validation predictions
        fname = "%s/Run%d/valid.pred.%s.csv" % (config.OUTPUT_DIR, run_idx + 1, self.__str__())
        pd.DataFrame({"target": y_valid, "prediction": y_pred}).to_csv(
            fname, index=False, columns=["target", "prediction"])
        # persist class probabilities when the wrapped learner has them
        if hasattr(self.learner.learner, "predict_proba"):
            y_proba = self.learner.learner.predict_proba(X_valid)
            fname = "%s/Run%d/valid.proba.%s.csv" % (config.OUTPUT_DIR, run_idx + 1, self.__str__())
            frame = pd.DataFrame(y_proba,
                                 columns=["proba%d" % c for c in range(y_proba.shape[1])])
            frame["target"] = y_valid
            frame.to_csv(fname, index=False)
    self.rmse_cv_mean = np.mean(scores)
    self.rmse_cv_std = np.std(scores)
    _sec = time.time() - began
    _min = int(_sec / 60.)
    if self.verbose:
        self.logger.info("RMSE")
        self.logger.info(" Mean: %.6f" % self.rmse_cv_mean)
        self.logger.info(" Std: %.6f" % self.rmse_cv_std)
        self.logger.info("Time")
        if _min > 0:
            self.logger.info(" %d mins" % _min)
        else:
            self.logger.info(" %d secs" % _sec)
        self.logger.info("-" * 50)
    return self
def _ens_obj_generic(self, weight2, p1_list, weight1, p2_list,
                     true_label_list, numBSTMatrix, bst_inst_idx):
    """Ensembling objective: mean and std RMSE of a weighted blend.

    For every (run, fold) cell, blend the two prediction arrays with
    ``self._merge_pred`` on the bootstrap-selected instances only, score
    the blend against the true labels with ``dist_utils._rmse``, and
    summarize across all cells.

    Note: ``weight2`` comes first in the signature (callers appear to
    optimize over it); the blend itself is merge(weight1, p1, weight2, p2).

    Returns
    -------
    (rmse_mean, rmse_std) over all run x fold cells.
    """
    cell_rmse = np.zeros((config.N_RUNS, config.N_FOLDS), dtype=float)
    for r in range(config.N_RUNS):
        for f in range(config.N_FOLDS):
            # restrict to the bootstrap subset for this cell
            n_bst = numBSTMatrix[r, f]
            sel = bst_inst_idx[r, f, :n_bst].tolist()
            blended = self._merge_pred(weight1, p1_list[r, f, sel],
                                       weight2, p2_list[r, f, sel])
            cell_rmse[r, f] = dist_utils._rmse(blended, true_label_list[r, f, sel])
    return np.mean(cell_rmse), np.std(cell_rmse)
def _ens_obj_generic(self, weight2, p1_list, weight1, p2_list,
                     true_label_list, numBSTMatrix, bst_inst_idx):
    """Compute mean/std RMSE of merging two predictions with the given weights.

    Each (run, fold) cell is scored on its bootstrap instance subset
    (``bst_inst_idx`` truncated to ``numBSTMatrix[run, fold]``), using
    ``self._merge_pred(weight1, p1, weight2, p2)`` as the blend and
    ``dist_utils._rmse`` as the metric.

    Returns
    -------
    (rmse_mean, rmse_std) across all cells.
    """
    scores = np.zeros((config.N_RUNS, config.N_FOLDS), dtype=float)
    for run in range(config.N_RUNS):
        for fold in range(config.N_FOLDS):
            count = numBSTMatrix[run, fold]
            picked = bst_inst_idx[run, fold, :count].tolist()
            first = p1_list[run, fold, picked]
            second = p2_list[run, fold, picked]
            labels = true_label_list[run, fold, picked]
            merged = self._merge_pred(weight1, first, weight2, second)
            scores[run, fold] = dist_utils._rmse(merged, labels)
    rmse_mean = np.mean(scores)
    rmse_std = np.std(scores)
    return rmse_mean, rmse_std
def _get_centroid_rmse(self, text1, text2):
    """Return the RMSE between the centroid vectors of the two texts."""
    return dist_utils._rmse(self._get_centroid_vector(text1),
                            self._get_centroid_vector(text2))
def go(self):
    """Run extreme ensemble selection over per-model CV predictions.

    Phase 1: load every model's per-run/per-fold validation predictions
    (clipped to [1, 3]) and score each model's CV RMSE.
    Phase 2: for each bagging iteration, subsample instances (and,
    unless ``enable_extreme``, models), greedily build a weighted
    ensemble (optional top-k initialization, then selection with
    replacement), track the out-of-bag RMSE of the bagged mean, save the
    current ensemble prediction CSV, and update a progress plot.
    """
    ## initialization — preallocated buffers sized to the largest fold
    pred_list_valid = np.zeros((self.n_models, config.N_RUNS, config.N_FOLDS,
                                config.VALID_SIZE_MAX), dtype=float)
    Y_list_valid = np.zeros(
        (config.N_RUNS, config.N_FOLDS, config.VALID_SIZE_MAX), dtype=float)
    # actual number of valid rows per (run, fold); buffers above are padded
    numValidMatrix = np.zeros((config.N_RUNS, config.N_FOLDS), dtype=int)
    p_ens_list_valid = np.zeros(
        (config.N_RUNS, config.N_FOLDS, config.VALID_SIZE_MAX), dtype=float)
    # bootstrap ("BST") and out-of-bag ("OOB") instance indices + counts
    bst_inst_idx = np.zeros(
        (config.N_RUNS, config.N_FOLDS, config.VALID_SIZE_MAX), dtype=float)
    numBSTMatrix = np.zeros((config.N_RUNS, config.N_FOLDS), dtype=int)
    oob_inst_idx = np.zeros(
        (config.N_RUNS, config.N_FOLDS, config.VALID_SIZE_MAX), dtype=float)
    numOOBMatrix = np.zeros((config.N_RUNS, config.N_FOLDS), dtype=int)
    self.logger.info("Perform Extreme Ensemble Selection...")
    ## model index: name -> slot in pred_list_valid
    model_index_dict = dict(zip(self.model_list, range(self.n_models)))
    model_rmse_dict = dict(zip(self.model_list, [0] * self.n_models))
    self.logger.info("=" * 80)
    self.logger.info("Load model...")
    for model in self.model_list:
        self.logger.info("model: %s" % model)
        model_id = model_index_dict[model]
        rmse_cv = np.zeros((config.N_RUNS, config.N_FOLDS), dtype=float)
        ## load this model's saved CV predictions for every run/fold
        for run in range(config.N_RUNS):
            for fold in range(config.N_FOLDS):
                path = "%s/Run%d" % (self.model_folder, run + 1)
                pred_file = "%s/valid.pred.%s.csv" % (path, model)
                this_p_valid = pd.read_csv(pred_file, dtype=float)
                numValidMatrix[run, fold] = this_p_valid.shape[0]
                numValid = numValidMatrix[run, fold]
                this_target = this_p_valid["target"].values
                this_p_valid = this_p_valid["prediction"].values
                # clip into [1, 3] — presumably the valid label range; confirm
                pred_list_valid[model_id, run, fold, :numValid] = np.clip(
                    this_p_valid, 1., 3.)
                Y_list_valid[run, fold, :numValid] = this_target
                ## per-cell RMSE of this single model
                rmse_cv[run, fold] = dist_utils._rmse(
                    pred_list_valid[model_id, run, fold, :numValid],
                    Y_list_valid[run, fold, :numValid])
        self.logger.info("rmse: %.6f (%.6f)" % (np.mean(rmse_cv), np.std(rmse_cv)))
        model_rmse_dict[model] = (np.mean(rmse_cv), np.std(rmse_cv))
    self.logger.info("%d models in total." % self.n_models)
    # models sorted ascending by mean RMSE (best first)
    sorted_models = sorted(model_rmse_dict.items(), key=lambda x: x[1][0])
    # greedy ensemble
    self.logger.info("=" * 80)
    # overwritten per bagging_iter below, so the shared-[] aliasing is harmless
    best_bagged_model_list = [[]] * self.bagging_size
    best_bagged_model_weight = [[]] * self.bagging_size
    # NaN-initialized store of each bag's OOB predictions
    score_valid_bag_mean = np.nan * np.zeros(
        (config.N_RUNS, config.N_FOLDS, config.VALID_SIZE_MAX,
         self.bagging_size), dtype=float)
    rmse_cv_mean_mean_lst = [0] * self.bagging_size
    rmse_cv_mean_std_lst = [0] * self.bagging_size
    for bagging_iter in range(self.bagging_size):
        seed_model = self.random_seed + 100 * bagging_iter
        if not self.enable_extreme:
            # non-extreme mode: fix the model subsample once per bag
            this_sorted_models = self._pick_random_models(
                sorted_models, seed_model)
        #### instance level subsampling: pick BST/OOB index sets per cell
        for run in range(config.N_RUNS):
            for fold in range(config.N_FOLDS):
                if self.inst_splitter is None:
                    # GENERAL APPROACH: derive the split from a per-cell seed
                    seed_inst = self.random_seed + 1000 * bagging_iter + 100 * run + 10 * fold
                    rng_inst = np.random.RandomState(seed_inst)
                    numValid = numValidMatrix[run, fold]
                    if self.inst_subsample_replacement:
                        # bootstrap with replacement
                        sss = StratifiedShuffleSplitReplacement(
                            Y_list_valid[run, fold, :numValid],
                            n_iter=1,
                            test_size=1. - self.inst_subsample,
                            random_state=seed_inst)
                        iidx, oidx = list(sss)[0]
                    else:
                        if self.inst_subsample < 1:
                            # Stratified ShuffleSplit
                            sss = ShuffleSplit(
                                len(Y_list_valid[run, fold, :numValid]),
                                n_iter=1,
                                test_size=1. - self.inst_subsample,
                                random_state=seed_inst)
                            iidx, oidx = list(sss)[0]
                        elif self.inst_subsample == 1:
                            # set iidx (training) the same as oidx (validation)
                            iidx = np.arange(numValid)
                            oidx = np.arange(numValid)
                else:
                    # caller-provided splitter, keyed by run only
                    iidx, oidx = self.inst_splitter[run]
                numBSTMatrix[run, fold] = len(iidx)
                bst_inst_idx[run, fold, :numBSTMatrix[run, fold]] = iidx
                numOOBMatrix[run, fold] = len(oidx)
                oob_inst_idx[run, fold, :numOOBMatrix[run, fold]] = oidx
        # per-bag greedy-selection state
        best_model_list = []
        best_model_weight = []
        best_model_rmse = []
        best_rmse = 0
        best_rmse_std = 0
        best_model = None
        # running weighted-ensemble prediction for this bag
        p_ens_list_valid_tmp = np.zeros(
            (config.N_RUNS, config.N_FOLDS, config.VALID_SIZE_MAX),
            dtype=float)
        #### Technique: Ensemble Initialization — seed with top-k models
        # NOTE: `iter` shadows the builtin; kept as-is
        iter = 0
        w_ens, this_w = 0.0, 1.0
        if self.init_top_k > 0:
            rmse_cv = np.zeros((config.N_RUNS, config.N_FOLDS), dtype=float)
            for cnt in range(self.init_top_k):
                iter += 1
                start = time.time()
                seed_model = self.random_seed + 100 * bagging_iter + 10 * iter
                if self.enable_extreme:
                    # extreme mode: resample the model pool each iteration
                    this_sorted_models = self._pick_random_models(
                        sorted_models, seed_model)
                    best_model, (rmse, rmse_std) = this_sorted_models[0]
                else:
                    best_model, (rmse, rmse_std) = this_sorted_models[cnt]
                this_p_list_valid = pred_list_valid[
                    model_index_dict[best_model]]
                for run in range(config.N_RUNS):
                    for fold in range(config.N_FOLDS):
                        numValid = numValidMatrix[run, fold]
                        numBST = numBSTMatrix[run, fold]
                        bidx = bst_inst_idx[run, fold, :numBST].tolist()
                        # fold the new model into the running ensemble
                        p_ens_list_valid_tmp[
                            run, fold, :numValid] = self._merge_pred(
                                w_ens,
                                p_ens_list_valid_tmp[run, fold, :numValid],
                                this_w,
                                this_p_list_valid[run, fold, :numValid])
                        # score on the bootstrap subset only
                        true_label = Y_list_valid[run, fold, bidx]
                        rmse_cv[run, fold] = dist_utils._rmse(
                            p_ens_list_valid_tmp[run, fold, bidx], true_label)
                end = time.time()
                best_weight = this_w
                best_rmse = np.mean(rmse_cv)
                best_rmse_std = np.std(rmse_cv)
                self.logger.info("Iter: %d (%.2fs)" % (iter, (end - start)))
                self.logger.info(" model: %s" % best_model)
                self.logger.info(" weight: %s" % best_weight)
                self.logger.info(" rmse: %.6f (%.6f)" % (best_rmse, best_rmse_std))
                best_model_list.append(best_model)
                best_model_weight.append(best_weight)
                w_ens += best_weight
        #### Technique: Ensemble Selection with Replacement
        while True:
            iter += 1
            seed_model = self.random_seed + 100 * bagging_iter + 10 * iter
            if self.enable_extreme:
                this_sorted_models = self._pick_random_models(
                    sorted_models, seed_model)
            if self.multiprocessing:
                # try every candidate's optimal weight in parallel
                start = time.time()
                models_tmp = [
                    model for model, (_, _) in this_sorted_models
                ]
                best_trial_rmse_mean_lst, best_trial_rmse_std_lst, model_lst, this_w_lst = \
                    zip(*Parallel(n_jobs=self.multiprocessing_num_cores)(
                        delayed(self._find_optim_weight_scipy)(
                            p_ens_list_valid_tmp, pred_list_valid, Y_list_valid,
                            numBSTMatrix, bst_inst_idx, w_ens,
                            model_index_dict, m
                        ) for m in models_tmp
                    ))
                ## pick the candidate with the lowest trial RMSE
                ind_best = np.argmin(best_trial_rmse_mean_lst)
                best_trial_rmse_mean = best_trial_rmse_mean_lst[ind_best]
                best_trial_rmse_std = best_trial_rmse_std_lst[ind_best]
                model = model_lst[ind_best]
                this_w = this_w_lst[ind_best]
                if best_trial_rmse_mean < best_rmse:
                    best_rmse, best_rmse_std = best_trial_rmse_mean, best_trial_rmse_std
                    best_model, best_weight = model, this_w
                end = time.time()
            else:
                # sequential scan over candidates
                start = time.time()
                for model, (_, _) in this_sorted_models:
                    best_trial_rmse_mean, best_trial_rmse_std, model, this_w = \
                        self._find_optim_weight_scipy(
                            p_ens_list_valid_tmp, pred_list_valid, Y_list_valid,
                            numBSTMatrix, bst_inst_idx, w_ens,
                            model_index_dict, model)
                    if best_trial_rmse_mean < best_rmse:
                        best_rmse, best_rmse_std = best_trial_rmse_mean, best_trial_rmse_std
                        best_model, best_weight = model, this_w
                end = time.time()
            # stop when no candidate improved, or improvement < epsilon
            if best_model is None:
                break
            if len(best_model_rmse) > 1 and (
                    best_model_rmse[-1] - best_rmse < self.epsilon):
                break
            ##
            self.logger.info("Iter: %d (%.2fs)" % (iter, (end - start)))
            self.logger.info(" model: %s" % best_model)
            self.logger.info(" weight: %s" % best_weight)
            self.logger.info(" rmse: %.6f (%.6f)" % (best_rmse, best_rmse_std))
            # valid: commit the accepted model into the running ensemble
            this_p_list_valid = pred_list_valid[
                model_index_dict[best_model]]
            pred_raw_list = []
            true_label_list = []
            for run in range(config.N_RUNS):
                for fold in range(config.N_FOLDS):
                    numValid = numValidMatrix[run, fold]
                    numBST = numBSTMatrix[run, fold]
                    bidx = bst_inst_idx[run, fold, :numBST].tolist()
                    p_ens_list_valid_tmp[
                        run, fold, :numValid] = self._merge_pred(
                            w_ens,
                            p_ens_list_valid_tmp[run, fold, :numValid],
                            best_weight,
                            this_p_list_valid[run, fold, :numValid])
                    pred_raw_list.append(p_ens_list_valid_tmp[run, fold, bidx])
                    true_label_list.append(Y_list_valid[run, fold, bidx])
            best_model_list.append(best_model)
            best_model_weight.append(best_weight)
            best_model_rmse.append(best_rmse)
            # reset so the next round must find a fresh improvement
            best_model = None
            w_ens += best_weight
        ## compute OOB score of the bagged mean prediction so far
        rmse_cv_mean = np.zeros((config.N_RUNS, config.N_FOLDS), dtype=float)
        for run in range(config.N_RUNS):
            for fold in range(config.N_FOLDS):
                numValid = numValidMatrix[run, fold]
                true_label = Y_list_valid[run, fold, :numValid]
                numOOB = numOOBMatrix[run, fold]
                oidx = oob_inst_idx[run, fold, :numOOB].tolist()
                pred_raw = p_ens_list_valid_tmp[run, fold, oidx]
                ## mean: record this bag's OOB preds, average across bags
                score_valid_bag_mean[run, fold, oidx, bagging_iter] = pred_raw
                pred_mean = np_utils._array_mean(
                    score_valid_bag_mean[run, fold, :numValid, :(bagging_iter + 1)])
                # skip positions no bag has predicted yet
                non_nan_idx = pred_mean != config.MISSING_VALUE_NUMERIC
                rmse_cv_mean[run, fold] = dist_utils._rmse(
                    pred_mean[non_nan_idx], true_label[non_nan_idx])
        self.logger.info("-" * 80)
        self.logger.info("Bag: %d" % (bagging_iter + 1))
        self.logger.info("rmse-mean: %.6f (%.6f)" %
                         (np.mean(rmse_cv_mean), np.std(rmse_cv_mean)))
        self.logger.info("-" * 80)
        best_bagged_model_list[bagging_iter] = best_model_list
        best_bagged_model_weight[bagging_iter] = best_model_weight
        ## save the current prediction; mr/ir encode the replacement flags
        mr = "R" + str(self.model_subsample_replacement).upper()[0]
        ir = "R" + str(self.inst_subsample_replacement).upper()[0]
        ## mean
        best_rmse_mean = np.mean(rmse_cv_mean)
        best_rmse_std = np.std(rmse_cv_mean)
        output = self._ens_predict(
            best_bagged_model_list[:(bagging_iter + 1)],
            best_bagged_model_weight[:(bagging_iter + 1)])
        sub_file = "%s_[MS%.2f_%s]_[IS%.2f_%s]_[Top%d]_[Bag%d]_[Mean%.6f]_[Std%.6f].mean.csv" % (
            self.subm_prefix, self.model_subsample, mr, self.inst_subsample,
            ir, self.init_top_k, bagging_iter + 1, best_rmse_mean,
            best_rmse_std)
        output.to_csv(sub_file, index=False)
        rmse_cv_mean_mean_lst[bagging_iter] = best_rmse_mean
        rmse_cv_mean_std_lst[bagging_iter] = best_rmse_std
        ## plot OOB score trajectory across bags so far
        x = np.arange(1, bagging_iter + 2, 1)
        label = "Mean (Best = %.6f, Bag = %d)" % (
            np.min(rmse_cv_mean_mean_lst[:(bagging_iter + 1)]),
            np.argmin(rmse_cv_mean_mean_lst[:(bagging_iter + 1)]) + 1)
        plt.errorbar(x,
                     rmse_cv_mean_mean_lst[:(bagging_iter + 1)],
                     yerr=rmse_cv_mean_std_lst[:(bagging_iter + 1)],
                     fmt='-o',
                     label=label)
        plt.xlim(1, self.bagging_size)
        plt.title("Extreme Ensemble Selection RMSE")
        plt.xlabel("Bag")
        plt.ylabel("CV/OOB RMSE")
        plt.legend(loc="upper right")
        fig_file = "%s/ensemble_selection_%d.pdf" % (config.FIG_DIR,
                                                     bagging_iter + 1)
        plt.savefig(fig_file)
        plt.clf()
def _get_rmse(self, sent1, sent2):
    """Return the RMSE between the vector representations of two sentences."""
    v1 = self._get_vector(sent1)
    v2 = self._get_vector(sent2)
    return dist_utils._rmse(v1, v2)
def go(self):
    """Run extreme ensemble selection over per-model CV predictions.

    Phase 1: load each model's per-run/per-fold validation predictions
    (clipped to [1, 3]) and compute its CV RMSE.
    Phase 2: for each bagging iteration, subsample instances (and,
    unless ``enable_extreme``, models), greedily assemble a weighted
    ensemble (top-k initialization then selection with replacement),
    score the bagged mean out-of-bag, save the ensemble prediction CSV,
    and update the RMSE-vs-bag plot.
    """
    ## initialization — padded buffers sized by VALID_SIZE_MAX
    pred_list_valid = np.zeros((self.n_models, config.N_RUNS, config.N_FOLDS,
                                config.VALID_SIZE_MAX), dtype=float)
    Y_list_valid = np.zeros((config.N_RUNS, config.N_FOLDS,
                             config.VALID_SIZE_MAX), dtype=float)
    # true number of valid rows per (run, fold)
    numValidMatrix = np.zeros((config.N_RUNS, config.N_FOLDS), dtype=int)
    p_ens_list_valid = np.zeros((config.N_RUNS, config.N_FOLDS,
                                 config.VALID_SIZE_MAX), dtype=float)
    # bootstrap ("BST") and out-of-bag ("OOB") index sets and counts
    bst_inst_idx = np.zeros((config.N_RUNS, config.N_FOLDS,
                             config.VALID_SIZE_MAX), dtype=float)
    numBSTMatrix = np.zeros((config.N_RUNS, config.N_FOLDS), dtype=int)
    oob_inst_idx = np.zeros((config.N_RUNS, config.N_FOLDS,
                             config.VALID_SIZE_MAX), dtype=float)
    numOOBMatrix = np.zeros((config.N_RUNS, config.N_FOLDS), dtype=int)
    self.logger.info("Perform Extreme Ensemble Selection...")
    ## model index: model name -> slot in pred_list_valid
    model_index_dict = dict(zip(self.model_list, range(self.n_models)))
    model_rmse_dict = dict(zip(self.model_list, [0]*self.n_models))
    self.logger.info("="*80)
    self.logger.info("Load model...")
    for model in self.model_list:
        self.logger.info("model: %s" % model)
        model_id = model_index_dict[model]
        rmse_cv = np.zeros((config.N_RUNS, config.N_FOLDS), dtype=float)
        ## load this model's saved CV predictions for each run/fold
        for run in range(config.N_RUNS):
            for fold in range(config.N_FOLDS):
                path = "%s/Run%d" % (self.model_folder, run+1)
                pred_file = "%s/valid.pred.%s.csv" % (path, model)
                this_p_valid = pd.read_csv(pred_file, dtype=float)
                numValidMatrix[run,fold] = this_p_valid.shape[0]
                numValid = numValidMatrix[run,fold]
                this_target = this_p_valid["target"].values
                this_p_valid = this_p_valid["prediction"].values
                # clip to [1, 3] — presumably the valid label range; confirm
                pred_list_valid[model_id,run,fold,:numValid] = np.clip(this_p_valid, 1., 3.)
                Y_list_valid[run,fold,:numValid] = this_target
                ## per-cell RMSE of this single model
                rmse_cv[run,fold] = dist_utils._rmse(
                    pred_list_valid[model_id,run,fold,:numValid],
                    Y_list_valid[run,fold,:numValid])
        self.logger.info("rmse: %.6f (%.6f)" % (np.mean(rmse_cv), np.std(rmse_cv)))
        model_rmse_dict[model] = (np.mean(rmse_cv), np.std(rmse_cv))
    self.logger.info("%d models in total." % self.n_models)
    # models ascending by mean RMSE (best first)
    sorted_models = sorted(model_rmse_dict.items(), key=lambda x: x[1][0])
    # greedy ensemble
    self.logger.info("="*80)
    # reassigned per bag below, so shared-[] aliasing is harmless
    best_bagged_model_list = [[]]*self.bagging_size
    best_bagged_model_weight = [[]]*self.bagging_size
    # NaN-initialized store of each bag's OOB predictions
    score_valid_bag_mean = np.nan * np.zeros(
        (config.N_RUNS, config.N_FOLDS, config.VALID_SIZE_MAX,
         self.bagging_size), dtype=float)
    rmse_cv_mean_mean_lst = [0]*self.bagging_size
    rmse_cv_mean_std_lst = [0]*self.bagging_size
    for bagging_iter in range(self.bagging_size):
        seed_model = self.random_seed + 100 * bagging_iter
        if not self.enable_extreme:
            # non-extreme mode: fix the model subsample once per bag
            this_sorted_models = self._pick_random_models(sorted_models, seed_model)
        #### instance level subsampling: choose BST/OOB sets per cell
        for run in range(config.N_RUNS):
            for fold in range(config.N_FOLDS):
                if self.inst_splitter is None:
                    # GENERAL APPROACH: derive split from a per-cell seed
                    seed_inst = self.random_seed + 1000 * bagging_iter + 100 * run + 10 * fold
                    rng_inst = np.random.RandomState(seed_inst)
                    numValid = numValidMatrix[run,fold]
                    if self.inst_subsample_replacement:
                        # bootstrap with replacement
                        sss = StratifiedShuffleSplitReplacement(
                            Y_list_valid[run,fold,:numValid],
                            n_iter=1,
                            test_size=1.-self.inst_subsample,
                            random_state=seed_inst)
                        iidx, oidx = list(sss)[0]
                    else:
                        if self.inst_subsample < 1:
                            # Stratified ShuffleSplit
                            sss = ShuffleSplit(
                                len(Y_list_valid[run,fold,:numValid]),
                                n_iter=1,
                                test_size=1.-self.inst_subsample,
                                random_state=seed_inst)
                            iidx, oidx = list(sss)[0]
                        elif self.inst_subsample == 1:
                            # set iidx (training) the same as oidx (validation)
                            iidx = np.arange(numValid)
                            oidx = np.arange(numValid)
                else:
                    # caller-provided splitter, keyed by run only
                    iidx, oidx = self.inst_splitter[run]
                numBSTMatrix[run,fold] = len(iidx)
                bst_inst_idx[run,fold,:numBSTMatrix[run,fold]] = iidx
                numOOBMatrix[run,fold] = len(oidx)
                oob_inst_idx[run,fold,:numOOBMatrix[run,fold]] = oidx
        # per-bag greedy-selection state
        best_model_list = []
        best_model_weight = []
        best_model_rmse = []
        best_rmse = 0
        best_rmse_std = 0
        best_model = None
        # running weighted-ensemble prediction for this bag
        p_ens_list_valid_tmp = np.zeros(
            (config.N_RUNS, config.N_FOLDS, config.VALID_SIZE_MAX), dtype=float)
        #### Technique: Ensemble Initialization — seed with top-k models
        # NOTE: `iter` shadows the builtin; kept as-is
        iter = 0
        w_ens, this_w = 0.0, 1.0
        if self.init_top_k > 0:
            rmse_cv = np.zeros((config.N_RUNS, config.N_FOLDS), dtype=float)
            for cnt in range(self.init_top_k):
                iter += 1
                start = time.time()
                seed_model = self.random_seed + 100 * bagging_iter + 10 * iter
                if self.enable_extreme:
                    # extreme mode: resample the model pool each iteration
                    this_sorted_models = self._pick_random_models(sorted_models, seed_model)
                    best_model,(rmse,rmse_std) = this_sorted_models[0]
                else:
                    best_model,(rmse,rmse_std) = this_sorted_models[cnt]
                this_p_list_valid = pred_list_valid[model_index_dict[best_model]]
                for run in range(config.N_RUNS):
                    for fold in range(config.N_FOLDS):
                        numValid = numValidMatrix[run,fold]
                        numBST = numBSTMatrix[run,fold]
                        bidx = bst_inst_idx[run,fold,:numBST].tolist()
                        # fold the new model into the running ensemble
                        p_ens_list_valid_tmp[run,fold,:numValid] = self._merge_pred(
                            w_ens, p_ens_list_valid_tmp[run,fold,:numValid],
                            this_w, this_p_list_valid[run,fold,:numValid])
                        # score on the bootstrap subset only
                        true_label = Y_list_valid[run,fold,bidx]
                        rmse_cv[run,fold] = dist_utils._rmse(
                            p_ens_list_valid_tmp[run,fold,bidx], true_label)
                end = time.time()
                best_weight = this_w
                best_rmse = np.mean(rmse_cv)
                best_rmse_std = np.std(rmse_cv)
                self.logger.info("Iter: %d (%.2fs)" % (iter, (end - start)))
                self.logger.info(" model: %s" % best_model)
                self.logger.info(" weight: %s" % best_weight)
                self.logger.info(" rmse: %.6f (%.6f)" % (best_rmse, best_rmse_std))
                best_model_list.append(best_model)
                best_model_weight.append(best_weight)
                w_ens += best_weight
        #### Technique: Ensemble Selection with Replacement
        while True:
            iter += 1
            seed_model = self.random_seed + 100 * bagging_iter + 10 * iter
            if self.enable_extreme:
                this_sorted_models = self._pick_random_models(sorted_models, seed_model)
            if self.multiprocessing:
                # try every candidate's optimal weight in parallel
                start = time.time()
                models_tmp = [model for model,(_,_) in this_sorted_models]
                best_trial_rmse_mean_lst, best_trial_rmse_std_lst, model_lst, this_w_lst = \
                    zip(*Parallel(n_jobs=self.multiprocessing_num_cores)(
                        delayed(self._find_optim_weight_scipy)(
                            p_ens_list_valid_tmp, pred_list_valid, Y_list_valid,
                            numBSTMatrix, bst_inst_idx, w_ens, model_index_dict, m
                        ) for m in models_tmp
                    ))
                ## pick the candidate with the lowest trial RMSE
                ind_best = np.argmin(best_trial_rmse_mean_lst)
                best_trial_rmse_mean = best_trial_rmse_mean_lst[ind_best]
                best_trial_rmse_std = best_trial_rmse_std_lst[ind_best]
                model = model_lst[ind_best]
                this_w = this_w_lst[ind_best]
                if best_trial_rmse_mean < best_rmse:
                    best_rmse, best_rmse_std = best_trial_rmse_mean, best_trial_rmse_std
                    best_model, best_weight = model, this_w
                end = time.time()
            else:
                # sequential scan over candidates
                start = time.time()
                for model,(_,_) in this_sorted_models:
                    best_trial_rmse_mean, best_trial_rmse_std, model, this_w = \
                        self._find_optim_weight_scipy(
                            p_ens_list_valid_tmp, pred_list_valid, Y_list_valid,
                            numBSTMatrix, bst_inst_idx, w_ens, model_index_dict, model)
                    if best_trial_rmse_mean < best_rmse:
                        best_rmse, best_rmse_std = best_trial_rmse_mean, best_trial_rmse_std
                        best_model, best_weight = model, this_w
                end = time.time()
            # stop when no candidate improved, or improvement < epsilon
            if best_model is None:
                break
            if len(best_model_rmse) > 1 and (best_model_rmse[-1] - best_rmse < self.epsilon):
                break
            ##
            self.logger.info("Iter: %d (%.2fs)" % (iter, (end - start)))
            self.logger.info(" model: %s" % best_model)
            self.logger.info(" weight: %s" % best_weight)
            self.logger.info(" rmse: %.6f (%.6f)" % (best_rmse, best_rmse_std))
            # valid: commit the accepted model into the running ensemble
            this_p_list_valid = pred_list_valid[model_index_dict[best_model]]
            pred_raw_list = []
            true_label_list = []
            for run in range(config.N_RUNS):
                for fold in range(config.N_FOLDS):
                    numValid = numValidMatrix[run,fold]
                    numBST = numBSTMatrix[run,fold]
                    bidx = bst_inst_idx[run,fold,:numBST].tolist()
                    p_ens_list_valid_tmp[run,fold,:numValid] = self._merge_pred(
                        w_ens, p_ens_list_valid_tmp[run,fold,:numValid],
                        best_weight, this_p_list_valid[run,fold,:numValid])
                    pred_raw_list.append( p_ens_list_valid_tmp[run,fold,bidx] )
                    true_label_list.append( Y_list_valid[run,fold,bidx] )
            best_model_list.append(best_model)
            best_model_weight.append(best_weight)
            best_model_rmse.append(best_rmse)
            # reset so the next round must find a fresh improvement
            best_model = None
            w_ens += best_weight
        ## compute OOB score of the bagged mean prediction so far
        rmse_cv_mean = np.zeros((config.N_RUNS, config.N_FOLDS), dtype=float)
        for run in range(config.N_RUNS):
            for fold in range(config.N_FOLDS):
                numValid = numValidMatrix[run,fold]
                true_label = Y_list_valid[run,fold,:numValid]
                numOOB = numOOBMatrix[run,fold]
                oidx = oob_inst_idx[run,fold,:numOOB].tolist()
                pred_raw = p_ens_list_valid_tmp[run,fold,oidx]
                ## mean: record this bag's OOB preds, average across bags
                score_valid_bag_mean[run,fold,oidx,bagging_iter] = pred_raw
                pred_mean = np_utils._array_mean(
                    score_valid_bag_mean[run,fold,:numValid,:(bagging_iter+1)])
                # skip positions no bag has predicted yet
                non_nan_idx = pred_mean != config.MISSING_VALUE_NUMERIC
                rmse_cv_mean[run,fold] = dist_utils._rmse(
                    pred_mean[non_nan_idx], true_label[non_nan_idx])
        self.logger.info("-"*80)
        self.logger.info( "Bag: %d"% (bagging_iter+1))
        self.logger.info( "rmse-mean: %.6f (%.6f)" % (np.mean(rmse_cv_mean), np.std(rmse_cv_mean)))
        self.logger.info("-"*80)
        best_bagged_model_list[bagging_iter] = best_model_list
        best_bagged_model_weight[bagging_iter] = best_model_weight
        ## save the current prediction; mr/ir encode the replacement flags
        mr = "R" + str(self.model_subsample_replacement).upper()[0]
        ir = "R" + str(self.inst_subsample_replacement).upper()[0]
        ## mean
        best_rmse_mean = np.mean(rmse_cv_mean)
        best_rmse_std = np.std(rmse_cv_mean)
        output = self._ens_predict(best_bagged_model_list[:(bagging_iter+1)],
                                   best_bagged_model_weight[:(bagging_iter+1)])
        sub_file = "%s_[MS%.2f_%s]_[IS%.2f_%s]_[Top%d]_[Bag%d]_[Mean%.6f]_[Std%.6f].mean.csv" % (
            self.subm_prefix, self.model_subsample, mr, self.inst_subsample, ir,
            self.init_top_k, bagging_iter+1, best_rmse_mean, best_rmse_std)
        output.to_csv(sub_file, index=False)
        rmse_cv_mean_mean_lst[bagging_iter] = best_rmse_mean
        rmse_cv_mean_std_lst[bagging_iter] = best_rmse_std
        ## plot OOB score trajectory across bags so far
        x = np.arange(1,bagging_iter+2,1)
        label = "Mean (Best = %.6f, Bag = %d)"%(
            np.min(rmse_cv_mean_mean_lst[:(bagging_iter+1)]),
            np.argmin(rmse_cv_mean_mean_lst[:(bagging_iter+1)])+1)
        plt.errorbar(x,
                     rmse_cv_mean_mean_lst[:(bagging_iter+1)],
                     yerr=rmse_cv_mean_std_lst[:(bagging_iter+1)],
                     fmt='-o',
                     label=label)
        plt.xlim(1, self.bagging_size)
        plt.title("Extreme Ensemble Selection RMSE")
        plt.xlabel("Bag")
        plt.ylabel("CV/OOB RMSE")
        plt.legend(loc="upper right")
        fig_file = "%s/ensemble_selection_%d.pdf"%(config.FIG_DIR, bagging_iter+1)
        plt.savefig(fig_file)
        plt.clf()