예제 #1
0
	def evaluate(self, eval_params=None):
		''' Return a pessimistic cross-validation score for eval_params.

			One booster is trained and validated per fold (folds 0..4 of
			the per-fold lists stored on self), and the result is
			mean(split_scores) - 0.25 * std(split_scores, ddof=1).

			eval_params -- dict with the keys 'xgb_params' (a dict that
				must contain 'num_round'), 'pos_balance_factor',
				'min_child_weight_ratio' and 'cutoff_thresholds'.
				A value of -1 for pos_balance_factor or
				min_child_weight_ratio removes the corresponding
				entry from xgb_params instead of setting it.
				When None, self.eval_params is used instead.

			Side effects: mutates eval_params['xgb_params'] in place,
			stores eval_params on self.eval_params, and updates
			self.best_score / self.best_preproc_params /
			self.best_eval_params when the new score is strictly higher.

			Returns the score as a float. '''
		if eval_params is None:
			eval_params = self.eval_params
		elif self.eval_params is None:
			self.eval_params = eval_params

		p = eval_params  # alias
		# NOTE(review): xgb_params is the caller's dict and is modified in
		# place below; self.best_eval_params ends up referencing it too.
		xgb_params = p['xgb_params']
		pos_balance_factor = p['pos_balance_factor']
		min_child_weight_ratio = p['min_child_weight_ratio']
		cutoff_thresholds = p['cutoff_thresholds']

		# -1 is the sentinel meaning "do not set this parameter manually"
		use_pos_balance = pos_balance_factor != -1
		use_min_child_weight = min_child_weight_ratio != -1

		split_scores = []
		# NOTE(review): the fold count is hard-coded; presumably it matches
		# the length of the per-fold lists on self -- confirm.
		for k in range(5):
			# manual positive example re-weighting
			pos_ratio = self.pos_ratios[k]
			if use_pos_balance:
				xgb_params['scale_pos_weight'] = pos_balance_factor * pos_ratio
			else:
				# drop any value left over from an earlier evaluation;
				# a missing key is fine
				xgb_params.pop('scale_pos_weight', None)

			# manual minimum child weight setup
			w_10 = self.w_10s[k]
			if use_min_child_weight:
				if use_pos_balance:
					# scale by the same factor applied to the positive weights
					xgb_params['min_child_weight'] = \
						w_10 * pos_balance_factor * min_child_weight_ratio
				else:
					xgb_params['min_child_weight'] = \
						w_10 * min_child_weight_ratio
			else:
				xgb_params.pop('min_child_weight', None)

			# train
			watchlist = [(self.xgmats_train[k], 'train')]
			bst = xgb.train(xgb_params, self.xgmats_train[k],
				xgb_params['num_round'], watchlist)

			# validate; only the best score is kept, the matching cutoff
			# threshold is discarded
			y_rank = bst.predict(self.xgmats_valid[k])
			y_true = self.ys_valid[k]
			w = self.ws_valid[k]
			split_score, _ = search_best_score(y_true, y_rank,
				w, cutoff_thresholds)
			split_scores.append(split_score)

		# penalise the mean by a quarter of the sample standard deviation
		score = float(np.mean(split_scores) - 0.25*np.std(split_scores, ddof=1))
		if score > self.best_score:
			self.best_score = score
			self.best_preproc_params = self.preproc_params
			self.best_eval_params = p

		self.eval_params = p
		return score
예제 #2
0
파일: higgs_cv.py 프로젝트: ulysseses/higgs
                        subparams["eta"],
                        subparams["max_depth"],
                    )
                    plst = subparams.items()
                    watchlist = [(xgmat, "train")]
                    bst = xgb.train(plst, xgmat, n_trees, watchlist)

                    """ Validate """
                    Xcv = X[valid]
                    ycv = y[valid]
                    wcv = w[valid] * float(test_size) / len(ycv)
                    xgmat = xgb.DMatrix(Xcv)
                    y_pred = bst.predict(xgmat)
                    # search best cutoff_threshold and record score
                    cutoff_thresholds = params["cutoff_thresholds"]
                    split_score, split_ct = search_best_score(ycv, y_pred, wcv, cutoff_thresholds)
                    # split_score, split_ct = search_best_score(ycv, y_pred, None, cutoff_thresholds, precision)
                    split_scores.append(split_score)
                cv_score_mean = np.mean(split_scores)
                cv_score_std = np.std(split_scores, ddof=1)  # unbiased

                """ Record """
                record = dict()
                record["i_reduced"] = i_reduced
                record["discrete"] = discrete
                record["interact_threshold"] = it
                record["model_params"] = model_params
                record["cutoff_thresholds"] = params["cutoff_thresholds"]
                record["cv_score_mean"] = cv_score_mean
                record["cv_score_std"] = cv_score_std
                records.append(record)
예제 #3
0
    X = X[:, cols]

    xgmat = xgb.DMatrix(X, label=y, weight=w)
    if pos_weight_ratio != 0:  # positive example re-weighting
        sum_wpos = np.sum(w[i] for i in xrange(len(y)) if y[i] == 1)
        sum_wneg = np.sum(w[i] for i in xrange(len(y)) if y[i] == 0)
        subparams["scale_pos_weight"] = pos_weight_ratio * sum_wneg / sum_wpos
    plst = subparams.items()
    watchlist = [(xgmat, "train")]
    bst = xgb.train(plst, xgmat, n_trees, watchlist)

    """ Find the best cutoff_threshold """
    xgmat = xgb.DMatrix(X)
    y_pred = bst.predict(xgmat)
    cutoff_thresholds = record["cutoff_thresholds"]
    best_score, best_ct = search_best_score(y, y_pred, w, cutoff_thresholds)
    # best_score, best_ct = search_best_score(y, y_pred, None, cutoff_thresholds, fbeta)
    print "%dth model score: %.2f" % (i_reduced, best_score)

    """ Save model """
    bst.save_model(os.path.join(model_directory, "%d.model" % i_reduced))
    print "%d.model saved" % i_reduced

    """ Save cols and cutoff_thresholds """
    best_cols.append(cols)
    best_cts.append(best_ct)

# Finally, save the best cols & thresholds into a JSON file for reading later.
# The file is opened in text mode because json.dump writes str objects, which
# a binary-mode file rejects on Python 3.
with open(os.path.join(model_directory, "cols_cts.json"), "w") as fp:
    # One (cols, cutoff_threshold) pair per entry, in matching order.
    obj = list(zip(best_cols, best_cts))
    json.dump(obj, fp, indent=4)
예제 #4
0
파일: higgs_cv.py 프로젝트: ulysseses/higgs
                     print "wpos=%.2f, wneg=%.2f, ratio=%.2f" % \
                      (sum_wpos, sum_wneg, subparams["scale_pos_weight"])
                 print "i%d,t%d,p%d,k%d n_trees=%d, eta=%.2f, max_depth=%d" % \
                  (i_reduced, t, p, k, n_trees, subparams["eta"], subparams["max_depth"])
                 plst = subparams.items()
                 watchlist = [(xgmat, 'train')]
                 bst = xgb.train(plst, xgmat, n_trees, watchlist)
                 ''' Validate '''
                 Xcv = X[valid]
                 ycv = y[valid]
                 wcv = w[valid] * float(test_size) / len(ycv)
                 xgmat = xgb.DMatrix(Xcv)
                 y_pred = bst.predict(xgmat)
                 # search best cutoff_threshold and record score
                 cutoff_thresholds = params["cutoff_thresholds"]
                 split_score, split_ct = search_best_score(
                     ycv, y_pred, wcv, cutoff_thresholds)
                 #split_score, split_ct = search_best_score(ycv, y_pred, None, cutoff_thresholds, precision)
                 split_scores.append(split_score)
             cv_score_mean = np.mean(split_scores)
             cv_score_std = np.std(split_scores, ddof=1)  # unbiased
             ''' Record '''
             record = dict()
             record["i_reduced"] = i_reduced
             record["discrete"] = discrete
             record["interact_threshold"] = it
             record["model_params"] = model_params
             record["cutoff_thresholds"] = params["cutoff_thresholds"]
             record["cv_score_mean"] = cv_score_mean
             record["cv_score_std"] = cv_score_std
             records.append(record)
 reduced_scores.append(records)
예제 #5
0
    assert len(cols) != 0
    X = X[:, cols]

    xgmat = xgb.DMatrix(X, label=y, weight=w)
    if pos_weight_ratio != 0:  # positive example re-weighting
        sum_wpos = np.sum(w[i] for i in xrange(len(y)) if y[i] == 1)
        sum_wneg = np.sum(w[i] for i in xrange(len(y)) if y[i] == 0)
        subparams["scale_pos_weight"] = pos_weight_ratio * sum_wneg / sum_wpos
    plst = subparams.items()
    watchlist = [(xgmat, 'train')]
    bst = xgb.train(plst, xgmat, n_trees, watchlist)
    ''' Find the best cutoff_threshold '''
    xgmat = xgb.DMatrix(X)
    y_pred = bst.predict(xgmat)
    cutoff_thresholds = record["cutoff_thresholds"]
    best_score, best_ct = search_best_score(y, y_pred, w, cutoff_thresholds)
    #best_score, best_ct = search_best_score(y, y_pred, None, cutoff_thresholds, fbeta)
    print "%dth model score: %.2f" % (i_reduced, best_score)
    ''' Save model '''
    bst.save_model(os.path.join(model_directory, "%d.model" % i_reduced))
    print "%d.model saved" % i_reduced
    ''' Save cols and cutoff_thresholds '''
    best_cols.append(cols)
    best_cts.append(best_ct)

# Finally, save the best cols & thresholds into a JSON file for reading later.
# The file is opened in text mode because json.dump writes str objects, which
# a binary-mode file rejects on Python 3.
with open(os.path.join(model_directory, "cols_cts.json"), 'w') as fp:
    # One (cols, cutoff_threshold) pair per entry, in matching order.
    obj = list(zip(best_cols, best_cts))
    json.dump(obj, fp, indent=4)

t1 = time.time()