def evaluate(self, eval_params=None, n_splits=5):
    ''' Return a pessimistic estimate of the cross-validation score.

    One booster is trained per fold with ``xgb.train`` and scored on the
    matching validation fold through ``search_best_score``.  The result is
    ``mean(fold_scores) - 0.25 * std(fold_scores, ddof=1)``.

    eval_params -- dict with the keys 'xgb_params', 'pos_balance_factor',
        'min_child_weight_ratio' and 'cutoff_thresholds'.  When None, the
        previously stored ``self.eval_params`` is used instead.
        'xgb_params' must contain 'num_round'.  A value of -1 for
        'pos_balance_factor' or 'min_child_weight_ratio' disables the
        corresponding manual setting.
    n_splits -- number of folds to evaluate (default 5, the value that
        was previously hard-coded).  With fewer than two folds the sample
        standard deviation is undefined and the score becomes NaN.

    Side effects: ``eval_params['xgb_params']`` is updated in place
    ('scale_pos_weight' and 'min_child_weight' are set or removed per
    fold), ``self.eval_params`` is overwritten, and the ``self.best_*``
    attributes are updated when the new score beats ``self.best_score``.
    '''
    if eval_params is None:
        eval_params = self.eval_params
    elif self.eval_params is None:
        self.eval_params = eval_params

    p = eval_params  # alias
    xgb_params = p['xgb_params']
    pos_balance_factor = p['pos_balance_factor']
    min_child_weight_ratio = p['min_child_weight_ratio']
    cutoff_thresholds = p['cutoff_thresholds']

    rebalance = pos_balance_factor != -1
    limit_child_weight = min_child_weight_ratio != -1

    split_scores = []
    for k in range(n_splits):
        # manual positive example re-weighting
        if rebalance:
            xgb_params['scale_pos_weight'] = \
                pos_balance_factor * self.pos_ratios[k]
        else:
            # Drop a value left over from an earlier fold or call; a
            # missing key is the normal case, not an error.
            xgb_params.pop('scale_pos_weight', None)

        # manual minimum child weight setup
        if limit_child_weight:
            min_child_weight = self.w_10s[k] * min_child_weight_ratio
            if rebalance:
                # Keep the same operand order as before:
                # w_10 * pos_balance_factor * min_child_weight_ratio
                min_child_weight = \
                    self.w_10s[k] * pos_balance_factor * min_child_weight_ratio
            xgb_params['min_child_weight'] = min_child_weight
        else:
            xgb_params.pop('min_child_weight', None)

        # train
        dtrain = self.xgmats_train[k]
        watchlist = [(dtrain, 'train')]
        bst = xgb.train(xgb_params, dtrain, xgb_params['num_round'],
                        watchlist)

        # validate
        y_rank = bst.predict(self.xgmats_valid[k])
        split_score, _ = search_best_score(
            self.ys_valid[k], y_rank, self.ws_valid[k], cutoff_thresholds)
        split_scores.append(split_score)

    # Penalise unstable configurations: subtract a quarter of the sample
    # standard deviation from the mean fold score.
    score = float(np.mean(split_scores) - 0.25 * np.std(split_scores, ddof=1))

    if score > self.best_score:
        self.best_score = score
        self.best_preproc_params = self.preproc_params
        self.best_eval_params = p

    # NOTE(review): the original indentation was lost; this assignment is
    # assumed to be unconditional (outside the "new best" branch) - confirm.
    self.eval_params = p
    return score
subparams["eta"], subparams["max_depth"], ) plst = subparams.items() watchlist = [(xgmat, "train")] bst = xgb.train(plst, xgmat, n_trees, watchlist) """ Validate """ Xcv = X[valid] ycv = y[valid] wcv = w[valid] * float(test_size) / len(ycv) xgmat = xgb.DMatrix(Xcv) y_pred = bst.predict(xgmat) # search best cutoff_threshold and record score cutoff_thresholds = params["cutoff_thresholds"] split_score, split_ct = search_best_score(ycv, y_pred, wcv, cutoff_thresholds) # split_score, split_ct = search_best_score(ycv, y_pred, None, cutoff_thresholds, precision) split_scores.append(split_score) cv_score_mean = np.mean(split_scores) cv_score_std = np.std(split_scores, ddof=1) # unbiased """ Record """ record = dict() record["i_reduced"] = i_reduced record["discrete"] = discrete record["interact_threshold"] = it record["model_params"] = model_params record["cutoff_thresholds"] = params["cutoff_thresholds"] record["cv_score_mean"] = cv_score_mean record["cv_score_std"] = cv_score_std records.append(record)
X = X[:, cols] xgmat = xgb.DMatrix(X, label=y, weight=w) if pos_weight_ratio != 0: # positive example re-weighting sum_wpos = np.sum(w[i] for i in xrange(len(y)) if y[i] == 1) sum_wneg = np.sum(w[i] for i in xrange(len(y)) if y[i] == 0) subparams["scale_pos_weight"] = pos_weight_ratio * sum_wneg / sum_wpos plst = subparams.items() watchlist = [(xgmat, "train")] bst = xgb.train(plst, xgmat, n_trees, watchlist) """ Find the best cutoff_threshold """ xgmat = xgb.DMatrix(X) y_pred = bst.predict(xgmat) cutoff_thresholds = record["cutoff_thresholds"] best_score, best_ct = search_best_score(y, y_pred, w, cutoff_thresholds) # best_score, best_ct = search_best_score(y, y_pred, None, cutoff_thresholds, fbeta) print "%dth model score: %.2f" % (i_reduced, best_score) """ Save model """ bst.save_model(os.path.join(model_directory, "%d.model" % i_reduced)) print "%d.model saved" % i_reduced """ Save cols and cutoff_thresholds """ best_cols.append(cols) best_cts.append(best_ct) # finally, save the best cols & thresholds into a json file for reading later with open(os.path.join(model_directory, "cols_cts.json"), "wb") as fp: obj = [(cols, ct) for (cols, ct) in zip(best_cols, best_cts)] json.dump(obj, fp, indent=4)
print "wpos=%.2f, wneg=%.2f, ratio=%.2f" % \ (sum_wpos, sum_wneg, subparams["scale_pos_weight"]) print "i%d,t%d,p%d,k%d n_trees=%d, eta=%.2f, max_depth=%d" % \ (i_reduced, t, p, k, n_trees, subparams["eta"], subparams["max_depth"]) plst = subparams.items() watchlist = [(xgmat, 'train')] bst = xgb.train(plst, xgmat, n_trees, watchlist) ''' Validate ''' Xcv = X[valid] ycv = y[valid] wcv = w[valid] * float(test_size) / len(ycv) xgmat = xgb.DMatrix(Xcv) y_pred = bst.predict(xgmat) # search best cutoff_threshold and record score cutoff_thresholds = params["cutoff_thresholds"] split_score, split_ct = search_best_score( ycv, y_pred, wcv, cutoff_thresholds) #split_score, split_ct = search_best_score(ycv, y_pred, None, cutoff_thresholds, precision) split_scores.append(split_score) cv_score_mean = np.mean(split_scores) cv_score_std = np.std(split_scores, ddof=1) # unbiased ''' Record ''' record = dict() record["i_reduced"] = i_reduced record["discrete"] = discrete record["interact_threshold"] = it record["model_params"] = model_params record["cutoff_thresholds"] = params["cutoff_thresholds"] record["cv_score_mean"] = cv_score_mean record["cv_score_std"] = cv_score_std records.append(record) reduced_scores.append(records)
assert len(cols) != 0 X = X[:, cols] xgmat = xgb.DMatrix(X, label=y, weight=w) if pos_weight_ratio != 0: # positive example re-weighting sum_wpos = np.sum(w[i] for i in xrange(len(y)) if y[i] == 1) sum_wneg = np.sum(w[i] for i in xrange(len(y)) if y[i] == 0) subparams["scale_pos_weight"] = pos_weight_ratio * sum_wneg / sum_wpos plst = subparams.items() watchlist = [(xgmat, 'train')] bst = xgb.train(plst, xgmat, n_trees, watchlist) ''' Find the best cutoff_threshold ''' xgmat = xgb.DMatrix(X) y_pred = bst.predict(xgmat) cutoff_thresholds = record["cutoff_thresholds"] best_score, best_ct = search_best_score(y, y_pred, w, cutoff_thresholds) #best_score, best_ct = search_best_score(y, y_pred, None, cutoff_thresholds, fbeta) print "%dth model score: %.2f" % (i_reduced, best_score) ''' Save model ''' bst.save_model(os.path.join(model_directory, "%d.model" % i_reduced)) print "%d.model saved" % i_reduced ''' Save cols and cutoff_thresholds ''' best_cols.append(cols) best_cts.append(best_ct) # finally, save the best cols & thresholds into a json file for reading later with open(os.path.join(model_directory, "cols_cts.json"), 'wb') as fp: obj = [(cols, ct) for (cols, ct) in zip(best_cols, best_cts)] json.dump(obj, fp, indent=4) t1 = time.time()