def xgbTuning(self, pX, change=3):
    w = self.getWeight(self.y)
    dm = xgb.DMatrix(pX, self.y, weight=w)
    best_auc = 0
    n = pX.shape[0]
    best_params = None
    for i in range(change):
        # sample a random eta, depth and alpha for each trial
        randp = np.random.random_sample(3)
        param = {'bst:eta': randp[0], 'max_depth': int(3 + 6 * randp[1]), 'nthread': 4,
                 'silent': 1, 'alpha': randp[2], 'eval_metric': 'auc',
                 'objective': 'binary:logistic'}
        m = xgb.cv(param, dm, metrics='auc', nfold=3, num_boost_round=50,
                   early_stopping_rounds=5)
        auc = m['test-auc-mean'].max()
        if auc > best_auc:
            print('xgb:' + str(auc))
            best_auc = auc
            best_params = param

    Xtrain, Xtest, ytrain, ytest = train_test_split(pX, self.y, test_size=.33)
    trainw = self.getWeight(ytrain)
    testw = self.getWeight(ytest)
    dtrain = xgb.DMatrix(Xtrain, label=ytrain, feature_names=Xtrain.columns, weight=trainw)
    dtest = xgb.DMatrix(Xtest, label=ytest, feature_names=Xtest.columns, weight=testw)
    evallist = [(dtrain, 'train'), (dtest, 'eval')]
    booster = xgb.train(best_params, dtrain, evals=evallist, num_boost_round=100,
                        early_stopping_rounds=10)
    rounds = booster.attr("best_iteration")
    best_auc = booster.attr("best_score")
    return float(best_auc), xgb.train(best_params, dtrain, num_boost_round=int(rounds))
def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None,
           seed_val=0, num_rounds=1000):
    param = {
        'objective': 'multi:softprob',
        'eta': 0.1,
        'max_depth': 6,
        'silent': 1,
        'num_class': 3,
        'eval_metric': 'mlogloss',
        'min_child_weight': 1,
        'subsample': 0.7,
        'colsample_bytree': 0.7,
        'seed': seed_val,
    }
    plst = list(param.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y)

    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label=test_y)
        watchlist = [(xgtrain, 'train'), (xgtest, 'test')]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=20)
    else:
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(plst, xgtrain, num_rounds)

    pred_test_y = model.predict(xgtest)
    return pred_test_y, model
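# A minimal usage sketch for runXGB above. The array shapes, the train/test split
# and the variable names here are assumptions made purely to illustrate the
# expected call signature; they are not taken from the original code.
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split

X = np.random.rand(600, 10)
y = np.random.randint(0, 3, size=600)          # three classes, matching num_class=3
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=0)
preds, model = runXGB(X_tr, y_tr, X_te, y_te, seed_val=0, num_rounds=100)
print(preds.shape)                             # (n_test_rows, 3) probabilities from multi:softprob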
def hyperopt_obj(self, param, train_X, train_y):
    # 3-fold cross-validation error
    # ret = xgb.cv(param, dtrain, num_boost_round=param['num_round'])
    kf = KFold(n_splits=3)
    errors = []
    r2 = []
    int_params = ['max_depth', 'num_round']
    for item in int_params:
        param[item] = int(param[item])
    for train_ind, test_ind in kf.split(train_X):
        train_valid_x, train_valid_y = train_X[train_ind], train_y[train_ind]
        test_valid_x, test_valid_y = train_X[test_ind], train_y[test_ind]
        dtrain = xgb.DMatrix(train_valid_x, label=train_valid_y)
        dtest = xgb.DMatrix(test_valid_x)
        pred_model = xgb.train(param, dtrain, num_boost_round=int(param['num_round']))
        pred_test = pred_model.predict(dtest)
        errors.append(mean_squared_error(test_valid_y, pred_test))
        r2.append(r2_score(test_valid_y, pred_test))
    all_dtrain = xgb.DMatrix(train_X, label=train_y)
    print('training score:')
    pred_model = xgb.train(param, all_dtrain, num_boost_round=int(param['num_round']))
    all_dtest = xgb.DMatrix(train_X)
    pred_train = pred_model.predict(all_dtest)
    print(str(r2_score(train_y, pred_train)))
    print(np.mean(r2))
    print('\n')
    return {'loss': np.mean(errors), 'status': STATUS_OK}
def fit(self, X, y):
    X = self.build_matrix(X, y)
    param = {
        'silent': 1 if self.silent else 0,
        'use_buffer': int(self.use_buffer),
        'num_round': self.num_round,
        'ntree_limit': self.ntree_limit,
        'nthread': self.nthread,
        'booster': self.booster,
        'eta': self.eta,
        'gamma': self.gamma,
        'max_depth': self.max_depth,
        'min_child_weight': self.min_child_weight,
        'subsample': self.subsample,
        'colsample_bytree': self.colsample_bytree,
        'max_delta_step': self.max_delta_step,
        'l': self.l,
        'alpha': self.alpha,
        'lambda_bias': self.lambda_bias,
        'objective': self.objective,
        'eval_metric': self.eval_metric,
        'seed': self.seed,
    }
    if self.num_class is not None:
        param['num_class'] = self.num_class

    watchlist = [(X, 'train')]
    if self.early_stopping_rounds > 0:
        self.bst = xgb.train(param, X, self.num_round, watchlist,
                             early_stopping_rounds=self.early_stopping_rounds)
    else:
        self.bst = xgb.train(param, X, self.num_round, watchlist)
    return self
def test_predict(self):
    iterations = 10
    np.random.seed(1)
    test_num_rows = [10, 1000, 5000]
    test_num_cols = [10, 50, 500]
    for num_rows in test_num_rows:
        for num_cols in test_num_cols:
            dtrain = xgb.DMatrix(np.random.randn(num_rows, num_cols),
                                 label=[0, 1] * int(num_rows / 2))
            dval = xgb.DMatrix(np.random.randn(num_rows, num_cols),
                               label=[0, 1] * int(num_rows / 2))
            dtest = xgb.DMatrix(np.random.randn(num_rows, num_cols),
                                label=[0, 1] * int(num_rows / 2))
            watchlist = [(dtrain, 'train'), (dval, 'validation')]
            res = {}
            param = {
                "objective": "binary:logistic",
                "predictor": "gpu_predictor",
                'eval_metric': 'auc',
            }
            bst = xgb.train(param, dtrain, iterations, evals=watchlist, evals_result=res)
            assert self.non_decreasing(res["train"]["auc"])

            gpu_pred_train = bst.predict(dtrain, output_margin=True)
            gpu_pred_test = bst.predict(dtest, output_margin=True)
            gpu_pred_val = bst.predict(dval, output_margin=True)

            param["predictor"] = "cpu_predictor"
            bst_cpu = xgb.train(param, dtrain, iterations, evals=watchlist)
            cpu_pred_train = bst_cpu.predict(dtrain, output_margin=True)
            cpu_pred_test = bst_cpu.predict(dtest, output_margin=True)
            cpu_pred_val = bst_cpu.predict(dval, output_margin=True)
            np.testing.assert_allclose(cpu_pred_train, gpu_pred_train, rtol=1e-5)
            np.testing.assert_allclose(cpu_pred_val, gpu_pred_val, rtol=1e-5)
            np.testing.assert_allclose(cpu_pred_test, gpu_pred_test, rtol=1e-5)
def train_predict(self, X_train, y_train, X_test, base_train_prediction, base_test_prediction):
    xgmat_train = xgb.DMatrix(X_train, label=y_train, missing=-999)
    test_size = X_test.shape[0]
    param = {}
    param['objective'] = 'binary:logistic'
    param['bst:eta'] = self.eta
    param['colsample_bytree'] = 1
    param['min_child_weight'] = self.min_child_weight
    param['bst:max_depth'] = self.depth
    param['eval_metric'] = 'auc'
    param['silent'] = 1
    param['nthread'] = self.threads
    plst = list(param.items())
    watchlist = [(xgmat_train, 'train')]
    num_round = self.num_round
    xgmat_test = xgb.DMatrix(X_test, missing=-999)
    if self.boost_from_exist_prediction:
        # train xgb with existing predictions
        # see more at https://github.com/tqchen/xgboost/blob/master/demo/guide-python/boost_from_prediction.py
        xgmat_train.set_base_margin(base_train_prediction)
        xgmat_test.set_base_margin(base_test_prediction)
        bst = xgb.train(param, xgmat_train, self.exist_num_round, watchlist)
    else:
        bst = xgb.train(plst, xgmat_train, num_round, watchlist)
    ypred = bst.predict(xgmat_test)
    return ypred
def test_custom_objective(self):
    param = {'max_depth': 2, 'eta': 1, 'silent': 1}
    watchlist = [(dtest, 'eval'), (dtrain, 'train')]
    num_round = 2

    def logregobj(preds, dtrain):
        labels = dtrain.get_label()
        preds = 1.0 / (1.0 + np.exp(-preds))
        grad = preds - labels
        hess = preds * (1.0 - preds)
        return grad, hess

    def evalerror(preds, dtrain):
        labels = dtrain.get_label()
        return 'error', float(sum(labels != (preds > 0.0))) / len(labels)

    # test custom_objective in training
    bst = xgb.train(param, dtrain, num_round, watchlist, logregobj, evalerror)
    assert isinstance(bst, xgb.core.Booster)
    preds = bst.predict(dtest)
    labels = dtest.get_label()
    err = sum(1 for i in range(len(preds))
              if int(preds[i] > 0.5) != labels[i]) / float(len(preds))
    assert err < 0.1

    # test custom_objective in cross-validation
    xgb.cv(param, dtrain, num_round, nfold=5, seed=0, obj=logregobj, feval=evalerror)

    # test maximize parameter
    def neg_evalerror(preds, dtrain):
        labels = dtrain.get_label()
        return 'error', float(sum(labels == (preds > 0.0))) / len(labels)

    bst2 = xgb.train(param, dtrain, num_round, watchlist, logregobj, neg_evalerror,
                     maximize=True)
    preds2 = bst2.predict(dtest)
    err2 = sum(1 for i in range(len(preds2))
               if int(preds2[i] > 0.5) != labels[i]) / float(len(preds2))
    assert err == err2
def test_fast_histmaker(self):
    variable_param = {'tree_method': ['hist'],
                      'max_depth': [2, 8],
                      'max_bin': [2, 256],
                      'grow_policy': ['depthwise', 'lossguide'],
                      'max_leaves': [64, 0],
                      'verbosity': [0]}
    for param in parameter_combinations(variable_param):
        result = run_suite(param)
        assert_results_non_increasing(result, 1e-2)

    # hist must be the same as exact on all-categorical data
    dpath = 'demo/data/'
    ag_dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
    ag_dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')
    ag_param = {'max_depth': 2,
                'tree_method': 'hist',
                'eta': 1,
                'verbosity': 0,
                'objective': 'binary:logistic',
                'eval_metric': 'auc'}
    hist_res = {}
    exact_res = {}

    xgb.train(ag_param, ag_dtrain, 10,
              [(ag_dtrain, 'train'), (ag_dtest, 'test')],
              evals_result=hist_res)
    ag_param["tree_method"] = "exact"
    xgb.train(ag_param, ag_dtrain, 10,
              [(ag_dtrain, 'train'), (ag_dtest, 'test')],
              evals_result=exact_res)
    assert hist_res['train']['auc'] == exact_res['train']['auc']
    assert hist_res['test']['auc'] == exact_res['test']['auc']
def run_benchmark(args):
    print("Generating dataset: {} rows * {} columns".format(args.rows, args.columns))
    print("{}/{} test/train split".format(args.test_size, 1.0 - args.test_size))
    tmp = time.time()
    X, y = make_classification(args.rows, n_features=args.columns, random_state=7)
    if args.sparsity < 1.0:
        # `rng` is assumed to be a module-level numpy RandomState
        X = np.array([[np.nan if rng.uniform(0, 1) < args.sparsity else x
                       for x in x_row] for x_row in X])

    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=args.test_size,
                                                        random_state=7)
    print("Generate Time: %s seconds" % (str(time.time() - tmp)))

    tmp = time.time()
    print("DMatrix Start")
    dtrain = xgb.DMatrix(X_train, y_train, nthread=-1)
    dtest = xgb.DMatrix(X_test, y_test, nthread=-1)
    print("DMatrix Time: %s seconds" % (str(time.time() - tmp)))

    param = {'objective': 'binary:logistic'}
    if args.params != '':
        param.update(ast.literal_eval(args.params))
    param['tree_method'] = args.tree_method
    print("Training with '%s'" % param['tree_method'])
    tmp = time.time()
    xgb.train(param, dtrain, args.iterations, evals=[(dtest, "test")])
    print("Train Time: %s seconds" % (str(time.time() - tmp)))
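# The benchmark above reads several attributes from an argparse-style namespace.
# A hypothetical parser matching those attributes (names inferred from usage in
# run_benchmark; the defaults are assumptions) might look like this:
import argparse

def parse_args():
    parser = argparse.ArgumentParser(description='xgboost training benchmark')
    parser.add_argument('--rows', type=int, default=1000000)
    parser.add_argument('--columns', type=int, default=50)
    parser.add_argument('--iterations', type=int, default=500)
    parser.add_argument('--test_size', type=float, default=0.25)
    parser.add_argument('--sparsity', type=float, default=0.0)
    parser.add_argument('--tree_method', default='hist')
    parser.add_argument('--params', default='',
                        help='dict literal merged into the xgboost params')
    return parser.parse_args()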
def run_benchmark(args, gpu_algorithm, cpu_algorithm):
    print("Generating dataset: {} rows * {} columns".format(args.rows, args.columns))
    tmp = time.time()
    X, y = make_classification(args.rows, n_features=args.columns, random_state=7)
    print("Generate Time: %s seconds" % (str(time.time() - tmp)))

    tmp = time.time()
    print("DMatrix Start")
    # omp way
    dtrain = xgb.DMatrix(X, y, nthread=-1)
    # non-omp way
    # dtrain = xgb.DMatrix(X, y)
    print("DMatrix Time: %s seconds" % (str(time.time() - tmp)))

    param = {'objective': 'binary:logistic',
             'max_depth': 6,
             'silent': 0,
             'n_gpus': 1,
             'gpu_id': 0,
             'eval_metric': 'auc'}

    param['tree_method'] = gpu_algorithm
    print("Training with '%s'" % param['tree_method'])
    tmp = time.time()
    xgb.train(param, dtrain, args.iterations)
    print("Train Time: %s seconds" % (str(time.time() - tmp)))

    param['silent'] = 1
    param['tree_method'] = cpu_algorithm
    print("Training with '%s'" % param['tree_method'])
    tmp = time.time()
    xgb.train(param, dtrain, args.iterations)
    print("Time: %s seconds" % (str(time.time() - tmp)))
def fit(self, X, y, num_boost_round=None):
    num_boost_round = num_boost_round or self.num_boost_round
    self.label2num = dict((label, i) for i, label in enumerate(sorted(set(y))))
    early_stopping = False
    if early_stopping:
        xg_train, xg_validate, xg_train_y, xg_validate_y = train_test_split(X, y, test_size=0.2)
        print(self.params)
        if self.params["objective"] == "binary:logistic":
            print("binary:logistic")
            dtrain = xgb.DMatrix(xg_train, label=xg_train_y)
            dvalid = xgb.DMatrix(xg_validate, label=xg_validate_y)
        else:
            # encode labels for the multi-class case on the split data
            dtrain = xgb.DMatrix(xg_train, label=[self.label2num[label] for label in xg_train_y])
            dvalid = xgb.DMatrix(xg_validate, label=[self.label2num[label] for label in xg_validate_y])
        # evallist = [(dtrain, 'train')]
        watchlist = [(dtrain, 'train'), (dvalid, 'val')]
        self.clf = xgb.train(self.params, dtrain, num_boost_round, watchlist,
                             early_stopping_rounds=80)
    else:
        xg_train, xg_train_y = X, y
        if self.params["objective"] == "binary:logistic":
            print("binary:logistic")
            dtrain = xgb.DMatrix(xg_train, label=xg_train_y)
            watchlist = [(dtrain, 'train')]
            self.clf = xgb.train(self.params, dtrain, num_boost_round, watchlist)
        else:
            dtrain = xgb.DMatrix(X, label=[self.label2num[label] for label in xg_train_y])
            watchlist = [(dtrain, 'train')]
            self.clf = xgb.train(self.params, dtrain, num_boost_round, watchlist)
def test_multi_predict(self):
    from sklearn.datasets import make_regression
    from sklearn.model_selection import train_test_split

    n = 1000
    X, y = make_regression(n, random_state=rng)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123)
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test)

    params = {}
    params["tree_method"] = "gpu_hist"
    params['predictor'] = "gpu_predictor"
    bst_gpu_predict = xgb.train(params, dtrain)
    params['predictor'] = "cpu_predictor"
    bst_cpu_predict = xgb.train(params, dtrain)

    predict0 = bst_gpu_predict.predict(dtest)
    predict1 = bst_gpu_predict.predict(dtest)
    cpu_predict = bst_cpu_predict.predict(dtest)

    assert np.allclose(predict0, predict1)
    assert np.allclose(predict0, cpu_predict)
def fit(self, X, y):
    # data
    data = X
    label = y
    skf = cross_validation.StratifiedKFold(label, n_folds=10)
    for train_index, val_index in skf:
        dtrain = xgb.DMatrix(data[train_index], label=label[train_index])
        dval = xgb.DMatrix(data[val_index], label=label[val_index])
        break

    # set params
    plst = list(self.param.items())
    plst += [('eval_metric', 'merror')]
    evallist = [(dtrain, 'train'), (dval, 'eval')]

    # train
    if self.bst is None:
        num_round = 100000
        self.bst = xgb.train(plst, dtrain, num_round, evals=evallist,
                             early_stopping_rounds=100)
        self.best_score_ = 1 - self.bst.best_score
        self.best_params_ = self.bst.best_iteration

    # refit
    dtrain = xgb.DMatrix(data, label=label)
    num_round = self.best_params_
    self.bst = xgb.train(plst, dtrain, num_round, evals=evallist)
def XgbTrain(self, submitfile):
    offset = 5000
    X_train, y_train = self.dataMat, self.labelMat
    X_test = self.testData
    xgtest = xgb.DMatrix(X_test)

    xgtrain_train = xgb.DMatrix(X_train[offset:, :], label=y_train[offset:])
    xgtrain_val = xgb.DMatrix(X_train[:offset, :], label=y_train[:offset])
    watchlist = [(xgtrain_train, 'train'), (xgtrain_val, 'val')]
    model = xgb.train(self.params_best, xgtrain_train, self.num_rounds_best, watchlist,
                      early_stopping_rounds=self.early_stopping_rounds_best)
    preds1 = model.predict(xgtest)

    X_train = X_train[::-1, :]
    y_train = y_train[::-1]
    xgtrain_train = xgb.DMatrix(X_train[offset:, :], label=y_train[offset:])
    xgtrain_val = xgb.DMatrix(X_train[:offset, :], label=y_train[:offset])
    watchlist = [(xgtrain_train, 'train'), (xgtrain_val, 'val')]
    model = xgb.train(self.params_best, xgtrain_train, self.num_rounds_best, watchlist,
                      early_stopping_rounds=self.early_stopping_rounds_best)
    preds2 = model.predict(xgtest)

    preds = preds1 + preds2
    # preds = pd.DataFrame({"Id": self.testid, "Hazard": preds})
    if submitfile != '':
        writer = csv.writer(open(submitfile, 'wb'))
        writer.writerow(['ID', 'Hazard'])
        for i in range(len(preds)):
            line = [self.testid[i], preds[i]]
            writer.writerow(line)
def train_predict(self, train_x, train_y, test_x):
    xgmat_train = xgb.DMatrix(train_x, label=train_y, missing=-9999)
    test_size = test_x.shape[0]
    params = {
        'booster': 'gbtree',
        'objective': 'binary:logistic',
        'silent': self.silent,
        'eta': self.eta,
        'gamma': self.gamma,
        'max_depth': self.max_depth,
        'min_child_weight': self.min_chile_weight,
        'subsample': self.subsample,
        'lambda': self.lambda_,
        'scale_pos_weight': self.scale_pos_weight,
        'colsample_bytree': self.colsample_bytree,
        'eval_metric': 'auc',
        'seed': 2014,
        'nthread': self.threads,
    }
    watchlist = [(xgmat_train, 'train')]
    num_round = self.num_boost_round
    bst = xgb.train(params, xgmat_train, num_round, watchlist)
    xgmat_test = xgb.DMatrix(test_x, missing=-9999)
    if self.exist_prediction:
        # boost from the margins of the first model (two-stage training)
        tmp_train = bst.predict(xgmat_train, output_margin=True)
        tmp_test = bst.predict(xgmat_test, output_margin=True)
        xgmat_train.set_base_margin(tmp_train)
        xgmat_test.set_base_margin(tmp_test)
        bst = xgb.train(params, xgmat_train, self.exist_num_boost_round, watchlist)
    ypred = bst.predict(xgmat_test)
    return ypred
def run(train_matrix, test_matrix):
    params = {'booster': 'gbtree',
              # 'objective': 'multi:softmax',
              'objective': 'multi:softprob',
              'eval_metric': 'mlogloss',
              'gamma': 1,
              'min_child_weight': 1.5,
              'max_depth': 5,
              'lambda': 10,
              'subsample': 0.7,
              'colsample_bytree': 0.7,
              'colsample_bylevel': 0.7,
              'eta': 0.03,
              'tree_method': 'exact',
              'seed': 2017,
              'nthread': 12,
              'num_class': 3}
    num_round = 10000
    early_stopping_rounds = 50
    watchlist = [(train_matrix, 'train'), (test_matrix, 'eval')]
    if test_matrix:
        model = xgb.train(params, train_matrix, num_boost_round=num_round, evals=watchlist,
                          early_stopping_rounds=early_stopping_rounds)
        pred_test_y = model.predict(test_matrix, ntree_limit=model.best_iteration)
        return pred_test_y, model
    else:
        model = xgb.train(params, train_matrix, num_boost_round=num_round)
        return model
def run_benchmark(args, gpu_algorithm, cpu_algorithm):
    print("Generating dataset: {} rows * {} columns".format(args.rows, args.columns))
    print("{}/{} test/train split".format(args.test_size, 1.0 - args.test_size))
    tmp = time.time()
    X, y = make_classification(args.rows, n_features=args.columns, random_state=7)
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=args.test_size,
                                                        random_state=7)
    print("Generate Time: %s seconds" % (str(time.time() - tmp)))

    tmp = time.time()
    print("DMatrix Start")
    # omp way
    dtrain = xgb.DMatrix(X_train, y_train, nthread=-1)
    dtest = xgb.DMatrix(X_test, y_test, nthread=-1)
    print("DMatrix Time: %s seconds" % (str(time.time() - tmp)))

    param = {'objective': 'binary:logistic',
             'max_depth': 6,
             'silent': 0,
             'n_gpus': 1,
             'gpu_id': 0,
             'eval_metric': 'error',
             'debug_verbose': 0,
             }

    param['tree_method'] = gpu_algorithm
    print("Training with '%s'" % param['tree_method'])
    tmp = time.time()
    xgb.train(param, dtrain, args.iterations, evals=[(dtest, "test")])
    print("Train Time: %s seconds" % (str(time.time() - tmp)))

    param['silent'] = 1
    param['tree_method'] = cpu_algorithm
    print("Training with '%s'" % param['tree_method'])
    tmp = time.time()
    xgb.train(param, dtrain, args.iterations, evals=[(dtest, "test")])
    print("Time: %s seconds" % (str(time.time() - tmp)))
def evalModelOHE(train_data, eval_data, train_labels, eval_labels):
    params = {}
    # params["objective"] = "reg:linear"
    # params["eta"] = 0.05
    # params["min_child_weight"] = 8
    # params["subsample"] = 0.7
    # params["colsample_bytree"] = 0.7
    # params["scale_pos_weight"] = 1.0
    # params["silent"] = 1
    # params["max_depth"] = 8
    # params["max_delta_step"] = 2

    params["objective"] = "reg:linear"
    params["eta"] = 0.013
    params["min_child_weight"] = 6
    params["subsample"] = 0.51
    params["colsample_bytree"] = 0.6
    params["scale_pos_weight"] = 1.0
    params["silent"] = 1
    params["max_depth"] = 10
    params["max_delta_step"] = 1

    plst = list(params.items())
    xgtrain = xgb.DMatrix(train_data, label=train_labels)
    xgeval = xgb.DMatrix(eval_data, label=eval_labels)
    evallist = [(xgeval, 'eval'), (xgtrain, 'train')]
    xgb.train(plst, xgtrain, num_boost_round=5000, evals=evallist, feval=evalerror)
def train(self, X, Y, getApproxError=False):
    dtrain = xgb.DMatrix(X, label=Y)
    self.bst = xgb.train(self.param, dtrain, self.nRounds)

    if getApproxError:
        e = 0.0
        c = 0.0
        kf = KFold(Y.shape[0], n_folds=4)
        for train_index, test_index in kf:
            XTrain = X[train_index, :]
            XTest = X[test_index, :]
            YTrain = Y[train_index]
            YTest = Y[test_index]

            dtrain2 = xgb.DMatrix(XTrain, label=YTrain)
            bst = xgb.train(self.param, dtrain2, self.nRounds)
            dtest = xgb.DMatrix(XTest)
            probs = bst.predict(dtest)
            ypred = numpy.argmax(probs, axis=1)
            error = float(numpy.sum(ypred != YTest))
            e += error
            c += float(len(YTest))
        e /= c
        return e
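# A rough equivalent of the manual K-fold error estimate above, using xgb.cv on
# the same data. The parameter values are placeholders (assumed, not taken from
# self.param), and X, Y stand for the arrays passed to train().
import xgboost as xgb

cv_param = {'objective': 'multi:softprob', 'num_class': 3, 'max_depth': 4, 'eta': 0.3}
dtrain_cv = xgb.DMatrix(X, label=Y)
cv_res = xgb.cv(cv_param, dtrain_cv, num_boost_round=50, nfold=4, metrics='merror')
print(cv_res['test-merror-mean'].iloc[-1])   # mean multiclass error across the folds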
def xgboost_model(train, test, num_round, params):
    """
    Takes in: training set, test set, number of estimators, params is a list
    Returns: predictions in correct format
    """
    X = train.as_matrix(train.columns[:-1]).astype(float)
    y = train.as_matrix(["cost"])[:, 0].astype(float)
    ylog1p = np.log1p(y)
    X_test = test.as_matrix(test.columns[:-1]).astype(float)
    xgb_train = xgb.DMatrix(X, label=ylog1p)
    xgb_test = xgb.DMatrix(X_test)

    # Round 1
    bst1 = xgb.train(params, xgb_train, num_round)
    y_pred1 = bst1.predict(xgb_test)

    # Round 2
    # num_round2 = 2000
    # bst2 = xgb.train(params, xgb_train, 2000)
    # y_pred2 = bst2.predict(xgb_test)

    # Power Train
    ypower3 = np.power(y, 1 / 47.0)
    xgb_train3 = xgb.DMatrix(X, label=ypower3)
    xst3 = xgb.train(params, xgb_train3, num_round)
    y_predp3 = xst3.predict(xgb_test)

    # blend the two targets back on the original scale
    p = 0.5
    y_pred = p * np.expm1(y_pred1) + (1 - p) * np.power(y_predp3, 47.0)
    return y_pred
def xgb_features(X, y, Xtest, params=None, random_state=0, n_folds=4,
                 early_stop=20, eval_with_gini=False):
    try:
        if params['objective'] == 'reg:logistic':
            yt = MinMaxScaler().fit_transform(y * 1.)
        else:
            yt = y
        skf = StratifiedKFold(yt, n_folds=n_folds, shuffle=True, random_state=random_state)
        ypred_test = np.zeros(Xtest.shape[0])
        ypred_train = np.zeros(X.shape[0])
        seed = random_state
        dtest = xgb.DMatrix(data=Xtest)
        for train_index, test_index in skf:
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = yt[train_index], yt[test_index]
            dtrain = xgb.DMatrix(data=X_train, label=y_train)
            dvalid = xgb.DMatrix(data=X_test, label=y_test)
            evallist = [(dtrain, 'train'), (dvalid, 'valid')]
            num_round = 5000
            params['seed'] = seed + 1
            seed += 1
            plst = list(params.items())
            if eval_with_gini:
                bst = xgb.train(plst, dtrain, num_round, evallist,
                                early_stopping_rounds=early_stop, feval=evalerror)
            else:
                bst = xgb.train(plst, dtrain, num_round, evallist,
                                early_stopping_rounds=early_stop)
            ypred = bst.predict(dtest, ntree_limit=bst.best_iteration)
            ypred_valid = bst.predict(dvalid)
            print("\tcross validation gini score %s: %f"
                  % (params['objective'], gini(y_test, ypred_valid)))
            ypred_test += ypred
            ypred_train[test_index] = ypred_valid
    except KeyboardInterrupt:
        ypred_test = np.zeros(Xtest.shape[0])
        ypred_train = np.zeros(X.shape[0])
        return ypred_train, ypred_test
    return ypred_train, ypred_test * 1. / n_folds
def xgboost_pred(train, labels, test):
    params = {}
    params["objective"] = "reg:linear"
    params["eta"] = 0.01
    params["min_child_weight"] = 25
    params["subsample"] = 0.8
    params["colsample_bytree"] = 0.85
    params["scale_pos_weight"] = 1.0
    params["silent"] = 1
    params["max_depth"] = 10

    plst = list(params.items())

    # Using 8000 rows for early stopping.
    offset = 8000
    num_rounds = 5000
    xgtest = xgb.DMatrix(test)

    # create train and validation dmatrices
    xgtrain = xgb.DMatrix(train[offset:, :], label=labels[offset:])
    xgval = xgb.DMatrix(train[:offset, :], label=labels[:offset])
    # xgtrain = xgb.DMatrix(train, label=labels)
    # xgval = xgb.DMatrix(train, label=labels)

    # train using early stopping and predict
    watchlist = [(xgtrain, 'train'), (xgval, 'val')]
    model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=50)
    preds_valid = model.predict(xgval)
    valid_gini = Gini(np.array(labels[:offset, ]), preds_valid[:, ])
    print(valid_gini)
    # model = xgb.train(plst, xgtrain, 1000)
    preds1 = model.predict(xgtest)

    # reverse train and labels and use a different 5k for early stopping.
    # this adds very little to the score but it is an option if you are
    # concerned about using all the data.
    train = train[::-1, :]
    labels = np.log(labels[::-1])

    xgtrain = xgb.DMatrix(train[offset:, :], label=labels[offset:])
    xgval = xgb.DMatrix(train[:offset, :], label=labels[:offset])
    # xgtrain = xgb.DMatrix(train, label=labels)

    watchlist = [(xgtrain, 'train'), (xgval, 'val')]
    model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=50)
    preds_valid = model.predict(xgval)
    valid_gini = Gini(np.array(labels[:offset, ]), preds_valid[:, ])
    print(valid_gini)
    # model = xgb.train(plst, xgtrain, 1000)
    preds2 = model.predict(xgtest)

    # combine predictions
    # since the metric only cares about relative rank we don't need to average
    preds = preds1 * 0.4 + preds2 * 0.6
    return preds
def model(self):
    x = self.train[self.features].values
    x_test = self.test[self.features].values
    # TODO compute from no. of rows & features
    k_folds = 10
    n_times = 1

    dx = xgb.DMatrix(x, label=self.y, missing=float('nan'))
    dtest = xgb.DMatrix(x_test, missing=float('nan'))

    # setup parameters for xgboost
    param = {}
    param['objective'] = 'multi:softmax'
    param['eval_metric'] = 'mlogloss'
    param['num_class'] = self.noOfClasses
    param['eta'] = 0.3
    param['max_depth'] = 4
    param['silent'] = 1
    # param['subsample'] = 0.5
    # param['colsample_bytree'] = 0.6

    scores = []
    iters = []
    for n in range(n_times):
        print('---------------- ' + str(n + 1))
        skf = StratifiedKFold(self.y, n_folds=k_folds, shuffle=True)
        # for train_index, validation_index in skf:
        for validation_index, train_index in skf:
            x_train, x_validate = x[train_index], x[validation_index]
            y_train, y_validate = self.y[train_index], self.y[validation_index]
            dtrain = xgb.DMatrix(x_train, label=y_train, missing=float('nan'))
            dval = xgb.DMatrix(x_validate, label=y_validate, missing=float('nan'))
            watchlist = [(dtrain, 'train'), (dval, 'test')]
            num_round = 500
            clf = xgb.train(param, dtrain, num_round, watchlist,
                            early_stopping_rounds=30, verbose_eval=False)
            scores.append(np.absolute(clf.best_score))
            iters.append(clf.best_iteration)

    print('\n=========== overall eval metric for ' + str(k_folds) + ' folds ===========')
    print(' '.join(self.features) + '\n')
    print(param)
    print('XGBoost classifier = ' + str(1 - np.mean(scores)) + ' +/- '
          + str(round(np.std(scores) * 100, 2)) + '%')

    n_rounds_max = np.max(iters) + 1
    xgb_clf = xgb.train(param, dx, n_rounds_max)
    self.predictions = xgb_clf.predict(dtest)
    print(self.predictions)

    submission = pd.DataFrame({self.id_col: self.test[self.id_col],
                               self.y.name: self.predictions})
    submission[self.y.name] = submission[self.y.name]
    submission.to_csv(os.path.join(self.directory, 'submission.csv'), index=False)
def learn_and_predict_xgb(self, dataset='train'):
    ''' Use xgboost to do work '''
    # predictors = ["Pclass", "Sex", "Age", "Fare", "Embarked", "FamilySize", "Titles", "FamilyId"]
    predictors = self.PREDICTORS

    if dataset == 'train':
        param_dist = {'max_depth': sp_randint(3, 10),
                      'learning_rate': [0.01, 0.03, 0.1, 0.3, 1.0],
                      'gamma': [0, 0.1, 0.2, 0.3],
                      'subsample': [.1, .2, .3, .4, 0.5],
                      'colsample_bytree': [.4, .5],
                      'objective': ['binary:logistic'],
                      'n_estimators': sp_randint(20, 150),
                      }
        clf = xgb.XGBClassifier()
        # random_search = RandomizedSearchCV(clf, param_distributions=param_dist, n_iter=500, cv=3)
        # random_search.fit(self.train_df[predictors], self.train_df['Survived'])
        # report(random_search.grid_scores_)
        params = {'max_depth': 6, 'learning_rate': 0.1, 'colsample_bytree': 0.5,
                  'n_estimators': 54, 'subsample': .3, 'gamma': 0,
                  'objective': 'binary:logistic', 'eval_metric': 'auc'}  # 0.845, cv=3
        bst = xgb.train(params, self.DMatrix_train)
        predictions = pd.Series(bst.predict(self.DMatrix_train))
        predictions[predictions >= .5] = 1
        predictions[predictions < .5] = 0
        predictions = [int(x) for x in predictions.tolist()]
        train_model = pd.DataFrame({
            'PassengerId': self.train_df['PassengerId'],
            'Survived': predictions,
        })
        train_model.to_csv('./xgb_train.csv', index=False)
    else:
        params = {'max_depth': 6, 'learning_rate': 0.1, 'colsample_bytree': 0.5,
                  'n_estimators': 54, 'subsample': .3, 'gamma': 0,
                  'objective': 'binary:logistic', 'eval_metric': 'auc'}  # 0.845, cv=3
        bst = xgb.train(params, self.DMatrix_train)
        # clf = xgb.XGBClassifier(params)
        # clf.fit(self.train_df[predictors], self.train_df['Survived'], verbose=True)
        # print(self.test_df[predictors])
        predictions = pd.Series(bst.predict(self.DMatrix_test))
        predictions_proba = predictions.copy()
        predictions[predictions >= .5] = 1
        predictions[predictions < .5] = 0
        predictions = [int(x) for x in predictions.tolist()]
        print(predictions)
        submission = pd.DataFrame({
            'PassengerId': self.test_df['PassengerId'],
            'Survived': predictions
        })
        submission.to_csv("xgboost_845.csv", index=False)
        submission_proba = pd.DataFrame({
            'PassengerId': self.test_df['PassengerId'],
            'Survived': predictions_proba,
        })
        submission_proba.to_csv("xgboost_845_soft.csv", index=False)
def predict(self, X, y, X_test, stage):
    np.random.seed(self.seed)
    n_train = X.shape[0]
    kf = KFold(n_train, n_folds=self.n_fold, shuffle=True)

    param = {}
    param['objective'] = self.obj
    param['eval_metric'] = self.eval_metric
    param['num_class'] = self.num_class
    param['nthread'] = self.nthread
    param['silent'] = self.silent
    param['eta'] = self.eta
    param['colsample_bytree'] = self.colsample_bytree
    param['subsample'] = self.subsample
    param['max_depth'] = self.max_depth
    param['max_delta_step'] = self.max_delta_step
    param['gamma'] = self.gamma
    param['alpha'] = self.alpha
    param['lambda'] = self.param_lambda
    num_round = 10000

    best_score = []
    best_iter = []
    y_pred_sum = np.zeros((X_test.shape[0], self.num_class))
    if stage == 'base':
        meta_feat = np.zeros((n_train + X_test.shape[0], self.num_class))
    xg_test = xgb.DMatrix(X_test)

    i = 0
    for train, val in kf:
        i += 1
        print(i)
        X_train, X_val, y_train, y_val = X[train], X[val], y[train], y[val]
        xg_train = xgb.DMatrix(X_train, y_train)
        xg_val = xgb.DMatrix(X_val, y_val)
        evallist = [(xg_train, 'train'), (xg_val, 'eval')]

        ## CV sets
        # train
        bst = xgb.train(param, xg_train, num_round, evallist, early_stopping_rounds=100)
        best_score += [bst.best_score]
        best_iter += [bst.best_iteration]
        # predict
        if stage == 'base':
            meta_feat[val, :] = bst.predict(xg_val, ntree_limit=bst.best_iteration)
        else:
            y_pred = bst.predict(xg_test, ntree_limit=bst.best_iteration)
            y_pred_sum = y_pred_sum + y_pred
    print(np.mean(best_score), np.std(best_score))

    ## test set
    if stage == 'base':
        # train
        xg_train = xgb.DMatrix(X, y)
        evallist = [(xg_train, 'train')]
        bst = xgb.train(param, xg_train, int(np.mean(best_iter)), evallist)
        # predict
        meta_feat[n_train:, :] = bst.predict(xg_test)
        return meta_feat
    else:
        y_pred = y_pred_sum / self.n_fold
        return y_pred
def doclassify(self, type='normal'):
    # Boosting
    if type == 'split':
        dtrainmis = xgb.DMatrix(array(self.misstrain), array(self.misstrain_y), missing=NAN)
        dtest = xgb.DMatrix(array(self.normaltest), missing=NAN)
        dtestmis = xgb.DMatrix(array(self.misstest), missing=NAN)
        param = {'bst:max_depth': 10, 'bst:eta': 0.02, 'silent': 1,
                 'objective': 'binary:logistic', 'subsample': 0.8,
                 'colsample_bytree': 0.68, 'booster': 'gbtree'}
        param['nthread'] = 4
        param['eval_metric'] = 'logloss'
        evallist = [(dtrainmis, 'train')]
        num_round = 320
        bstmis = xgb.train(param, dtrainmis, num_round, evallist)

        dtrain = xgb.DMatrix(array(self.normaltrain), array(self.normaltrain_y))
        num_round = 366
        evallist = [(dtrain, 'train')]
        bst = xgb.train(param, dtrain, num_round, evallist)

        ypredmis = bstmis.predict(dtestmis)
        ypred = bst.predict(dtest)
        result = []
        output1 = list(ypredmis)
        output2 = list(ypred)
        for i in self.test:
            if dp.List_dataProcess().check_missing(i):
                result.append(output1.pop(0))
            else:
                result.append(output2.pop(0))
        print(len(output1))
        print(len(output2))
        writer(self.id, result)

    if type == 'normal':
        dtrain = xgb.DMatrix(array(self.train_x), array(self.train_y))
        dtest = xgb.DMatrix(array(self.test))
        param = {'bst:max_depth': 10, 'bst:eta': 0.02, 'silent': 1,
                 'objective': 'binary:logistic', 'subsample': 0.9,
                 'colsample_bytree': 0.68, 'booster': 'gbtree'}
        param['nthread'] = 4
        param['eval_metric'] = 'logloss'
        evallist = [(dtrain, 'train')]
        num_round = 300
        bst = xgb.train(param, dtrain, num_round, evallist)
        ypred = bst.predict(dtest)
        writer(self.id, ypred)

        acc = 0.0
        for i in range(10000):
            if array(self.train_y)[len(self.train_y) - 10000 + i] == 1 and ypred[i] > 0.35:
                acc += 1
            if array(self.train_y)[len(self.train_y) - 10000 + i] == 0 and ypred[i] <= 0.35:
                acc += 1
        print("Accuracy : ", acc / 10000)

        fpr, tpr, thresholds = metrics.roc_curve(self.train_y[-10000:], ypred, pos_label=1)
        for i in range(len(fpr)):
            plt.plot(fpr[i], tpr[i], "b*")
        plt.plot(fpr, tpr)
        plt.title(val)
        plt.show()
        print("AUC : ", metrics.auc(fpr, tpr))
        print(thresholds)
def train(X, y, available_devices):
    dtrain = xgb.dask.create_worker_dmatrix(X, y)
    local_device = available_devices[xgb.rabit.get_rank()]
    # Specify the GPU algorithm and device for this worker
    params = {"tree_method": "gpu_hist", "gpu_id": local_device}
    print("Worker {} starting training on {} rows".format(xgb.rabit.get_rank(),
                                                          dtrain.num_row()))
    start = time.time()
    xgb.train(params, dtrain, num_boost_round=500)
    end = time.time()
    print("Worker {} finished training in {:0.2f}s".format(xgb.rabit.get_rank(),
                                                           end - start))
def fit(self, X, y, X_valid=None, y_valid=None, sample_weight=None):
    """Fit training data.

    Parameters
    ----------
    X : array-like or sparse matrix, shape=(n_samples, n_features)
        The input samples.
    y : array-like, shape=(n_samples,)
    X_valid : array-like or sparse matrix, shape=(n_valid_samples, n_features)
        The validation samples.
    y_valid : array-like, shape=(n_valid_samples,)
    sample_weight : array-like, shape = [n_samples], optional

    Returns
    -------
    self : object
        Returns self.
    """
    X, y = self._ready_to_fit(X, y)
    xgb_params = self.get_xgb_params()

    # xgboost accepts dense, csc, csr
    if isinstance(X, sp.sparse.coo_matrix):
        X = X.tocsc()

    if sample_weight is not None:
        xg_train = xgb.DMatrix(X, label=y, weight=sample_weight)
    else:
        xg_train = xgb.DMatrix(X, label=y)
    watchlist = [(xg_train, 'train')]

    if not (X_valid is None):
        if isinstance(X_valid, sp.sparse.coo_matrix):
            X_valid = X_valid.tocsc()
        if sample_weight is not None:
            # note: reuses the training sample_weight for the validation DMatrix
            xg_valid = xgb.DMatrix(X_valid, label=y_valid, weight=sample_weight)
        else:
            xg_valid = xgb.DMatrix(X_valid, label=y_valid)
        watchlist = [(xg_train, 'train'), (xg_valid, 'valid')]

    if self.verbose:
        # with watchlist
        self.bst_ = xgb.train(params=xgb_params, dtrain=xg_train,
                              num_boost_round=int(self.n_iter),
                              evals=watchlist,
                              early_stopping_rounds=int(self.early_stopping_rounds))
    else:
        # without watchlist
        # early stopping is not available
        self.bst_ = xgb.train(params=xgb_params, dtrain=xg_train,
                              num_boost_round=int(self.n_iter))
    return self
def train(train_X, train_Y, validation_X, validation_Y, feature_names):
    train_X = array(train_X)
    train_Y = array(train_Y)
    validation_X = array(validation_X)
    validation_Y = array(validation_Y)
    print(type(train_X))

    dtrain = xgb.DMatrix(train_X, label=train_Y, missing=float('NaN'))
    dvalidation = xgb.DMatrix(validation_X, label=validation_Y, missing=float('NaN'))

    # Clean up data
    del train_X, validation_X, train_Y, validation_Y

    # Track metrics on the watchlist
    watchlist = [(dtrain, 'train'), (dvalidation, 'validation')]

    parameters_to_try = generateParams()
    best_params = None
    overall_best_metric = 0
    overall_best_nrounds = 0
    for i in range(0, len(parameters_to_try)):
        param = parameters_to_try[i]
        num_round = 1000
        classifier = xgb.train(param, dtrain, num_round, evals=watchlist,
                               early_stopping_rounds=100)

        metric = classifier.best_score
        itr = classifier.best_iteration
        print("\n Metric : " + str(metric) + " for Params " + str(param)
              + " occurs at " + str(itr))

        if metric > overall_best_metric:
            overall_best_metric = metric
            best_params = copy.copy(param)
            overall_best_nrounds = itr

    print("\n Training the model on the entire training set with the best params")
    bst = xgb.train(best_params, dtrain, 1 + overall_best_nrounds)

    print("\n\n Overall Best AUC : " + str(overall_best_metric) + " for Params "
          + str(best_params) + " occurs at " + str(overall_best_nrounds))

    feature_imp = bst.get_fscore()
    print("Feature Importance ... ")
    for w in sorted(feature_imp, key=feature_imp.get, reverse=True):
        print(str(feature_names[int(w.replace("f", ""))]) + " : " + str(feature_imp[w]))

    return bst
def buildXGB(self):
    '''
    train_shfl = train.iloc[np.random.permutation(len(train))]
    X = train_shfl.as_matrix(train_shfl.columns[:-1]).astype(float)
    y = train_shfl.as_matrix(['cost'])[:,0].astype(float)
    ylog1p = np.log1p(y).astype(float)
    '''
    X = self.train.as_matrix(self.train.columns[:-1]).astype(float)
    y = self.train.as_matrix(['cost'])[:, 0].astype(float)
    ylog1p = np.log1p(y).astype(float)
    # cost is still last column
    X_test = self.test.as_matrix(self.test.columns[:-1]).astype(float)

    xgb_train = xgb.DMatrix(X, label=ylog1p)
    xgb_test = xgb.DMatrix(X_test)

    # Train multiple times
    # Round 1
    num_round1 = 4000
    self.bst1 = xgb.train(self.params, xgb_train, num_round1)
    y_pred1 = self.bst1.predict(xgb_test)

    # Round 2
    num_round2 = 2000
    self.bst2 = xgb.train(self.params, xgb_train, num_round2)
    y_pred2 = self.bst2.predict(xgb_test)

    # Power Train
    # ypower2 = np.power(y, 1/5.0)
    ypower3 = np.power(y, 1 / 20.0)
    # xgb_train2 = xgb.DMatrix(X, label=ypower2)
    xgb_train3 = xgb.DMatrix(X, label=ypower3)
    # self.xst2 = xgb.train(self.params, xgb_train2, self.num_round)
    # y_predp2 = self.xst2.predict(xgb_test)
    self.xst3 = xgb.train(self.params, xgb_train3, self.num_round)
    y_predp3 = self.xst3.predict(xgb_test)

    # y_power = (np.power(y_predp2, 5.0) + np.power(y_predp3, 10.0)) / 2.0
    y_power = np.power(y_predp3, 20.0)

    self.y_pred = (np.expm1(0.75 * y_pred1 + 0.25 * y_pred2) + y_power) / 2.0
    # self.y_pred = 0.35*np.expm1(0.75*y_pred1+0.25*y_pred2) + 0.65*y_power

    print("")
    print("================================================================")
    print("================ Finished with Prediction ===================")
    print("================================================================")
train_data = xgb.DMatrix(result_path + 'train.sparse')
test_data = xgb.DMatrix(result_path + 'test.sparse')

param = {
    'bst:max_depth': 10,
    'bst:min_child_weight': 5,
    'bst:eta': 0.5,
    'silent': 0,
    'objective': 'binary:logistic',
    'eval_metric': 'logloss'
}
param['nthread'] = 4
plst = list(param.items())
evallist = [(train_data, 'train')]

num_round = 30
bst = xgb.train(plst, train_data, num_round, evallist)
bst.dump_model(result_path + 'dump.raw.txt')

ypred = bst.predict(test_data)
output = open(result_path + 'submission.csv', 'w')
output.write('Id,Predicted\n')
for p in ypred:
    output.write('{0},{1}\n'.format('anything', p))
output.close()
    'base_score': y_mean,  # base prediction = mean(target)
    'silent': 1
}
# prepare dict of params for xgboost to run with
# NOTE: Make sure that the class is labeled 'class' in the data file
dtrain = xgb.DMatrix(train.drop('y', axis=1), y_train)
dtest = xgb.DMatrix(test)

num_boost_rounds = 1350
# train model
model = xgb.train(dict(xgb_params, silent=0), dtrain,
                  num_boost_round=num_boost_rounds,
                  # early_stopping_rounds=50,
                  verbose_eval=10)
y_pred = model.predict(dtest)

'''Train the stacked models then predict the test data'''

# Stacking with average
en = make_pipeline(RobustScaler(), PCA(n_components=12),
                   ElasticNet(alpha=0.001, l1_ratio=0.1))

rf = RandomForestRegressor(n_estimators=250, n_jobs=4, min_samples_split=25,
                           min_samples_leaf=25, max_depth=3)
def run_training_continuation(self, xgb_params_01, xgb_params_02, xgb_params_03):
    from sklearn.datasets import load_digits
    from sklearn.metrics import mean_squared_error

    digits_2class = load_digits(n_class=2)
    digits_5class = load_digits(n_class=5)

    X_2class = digits_2class['data']
    y_2class = digits_2class['target']

    X_5class = digits_5class['data']
    y_5class = digits_5class['target']

    dtrain_2class = xgb.DMatrix(X_2class, label=y_2class)
    dtrain_5class = xgb.DMatrix(X_5class, label=y_5class)

    gbdt_01 = xgb.train(xgb_params_01, dtrain_2class, num_boost_round=10)
    ntrees_01 = len(gbdt_01.get_dump())
    assert ntrees_01 == 10

    gbdt_02 = xgb.train(xgb_params_01, dtrain_2class, num_boost_round=0)
    gbdt_02.save_model('xgb_tc.model')

    gbdt_02a = xgb.train(xgb_params_01, dtrain_2class, num_boost_round=10, xgb_model=gbdt_02)
    gbdt_02b = xgb.train(xgb_params_01, dtrain_2class, num_boost_round=10,
                         xgb_model="xgb_tc.model")
    ntrees_02a = len(gbdt_02a.get_dump())
    ntrees_02b = len(gbdt_02b.get_dump())
    assert ntrees_02a == 10
    assert ntrees_02b == 10

    res1 = mean_squared_error(y_2class, gbdt_01.predict(dtrain_2class))
    res2 = mean_squared_error(y_2class, gbdt_02a.predict(dtrain_2class))
    assert res1 == res2

    res1 = mean_squared_error(y_2class, gbdt_01.predict(dtrain_2class))
    res2 = mean_squared_error(y_2class, gbdt_02b.predict(dtrain_2class))
    assert res1 == res2

    gbdt_03 = xgb.train(xgb_params_01, dtrain_2class, num_boost_round=3)
    gbdt_03.save_model('xgb_tc.model')

    gbdt_03a = xgb.train(xgb_params_01, dtrain_2class, num_boost_round=7, xgb_model=gbdt_03)
    gbdt_03b = xgb.train(xgb_params_01, dtrain_2class, num_boost_round=7,
                         xgb_model="xgb_tc.model")
    ntrees_03a = len(gbdt_03a.get_dump())
    ntrees_03b = len(gbdt_03b.get_dump())
    assert ntrees_03a == 10
    assert ntrees_03b == 10
    os.remove('xgb_tc.model')

    res1 = mean_squared_error(y_2class, gbdt_03a.predict(dtrain_2class))
    res2 = mean_squared_error(y_2class, gbdt_03b.predict(dtrain_2class))
    assert res1 == res2

    gbdt_04 = xgb.train(xgb_params_02, dtrain_2class, num_boost_round=3)
    assert gbdt_04.best_ntree_limit == (gbdt_04.best_iteration + 1) * self.num_parallel_tree

    res1 = mean_squared_error(y_2class, gbdt_04.predict(dtrain_2class))
    res2 = mean_squared_error(
        y_2class, gbdt_04.predict(dtrain_2class, ntree_limit=gbdt_04.best_ntree_limit))
    assert res1 == res2

    gbdt_04 = xgb.train(xgb_params_02, dtrain_2class, num_boost_round=7, xgb_model=gbdt_04)
    assert gbdt_04.best_ntree_limit == (gbdt_04.best_iteration + 1) * self.num_parallel_tree

    res1 = mean_squared_error(y_2class, gbdt_04.predict(dtrain_2class))
    res2 = mean_squared_error(
        y_2class, gbdt_04.predict(dtrain_2class, ntree_limit=gbdt_04.best_ntree_limit))
    assert res1 == res2

    gbdt_05 = xgb.train(xgb_params_03, dtrain_5class, num_boost_round=7)
    assert gbdt_05.best_ntree_limit == (gbdt_05.best_iteration + 1) * self.num_parallel_tree
    gbdt_05 = xgb.train(xgb_params_03, dtrain_5class, num_boost_round=3, xgb_model=gbdt_05)
    assert gbdt_05.best_ntree_limit == (gbdt_05.best_iteration + 1) * self.num_parallel_tree

    res1 = gbdt_05.predict(dtrain_5class)
    res2 = gbdt_05.predict(dtrain_5class, ntree_limit=gbdt_05.best_ntree_limit)
    np.testing.assert_almost_equal(res1, res2)
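# Minimal sketch of the continuation pattern the test above exercises: xgb.train
# accepts an existing Booster (or a saved model file) via xgb_model and adds more
# boosting rounds on top of it. The data and parameters here are placeholders.
import numpy as np
import xgboost as xgb

X = np.random.rand(200, 5)
y = np.random.randint(0, 2, 200)
dtrain = xgb.DMatrix(X, label=y)
params = {'objective': 'binary:logistic', 'max_depth': 3}

booster = xgb.train(params, dtrain, num_boost_round=5)                      # 5 trees
booster = xgb.train(params, dtrain, num_boost_round=5, xgb_model=booster)   # 10 trees total
assert len(booster.get_dump()) == 10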
X = format_input_for_network(X_train, N_FEAT)
X_eval = format_input_for_network(X_val, N_FEAT)

callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=PATIENCE,
                                            restore_best_weights=True)

# Create our baseline model: a gradient boosted tree.
dtrain = xgb.DMatrix(X_train, label=y_train, silent=True)
param = {'max_depth': 2, 'eta': 1, 'objective': 'reg:squarederror'}
param['nthread'] = 4
param['eval_metric'] = 'rmse'
dtest = xgb.DMatrix(X_val, label=y_val, silent=True)
evallist = [(dtest, 'eval'), (dtrain, 'train')]
bst = xgb.train(param, dtrain, PATIENCE, evallist, early_stopping_rounds=10,
                verbose_eval=10)
ypred = bst.predict(dtest, ntree_limit=bst.best_ntree_limit)

# For ease of use and reproducibility I have saved the model,
# but here is the code to create it from scratch.
'''
# define model
model = Sequential()
model.add(LSTM(50, activation='relu', input_shape=(N_STEPS, N_FEAT), dropout=0.25))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mae')
# fit model
model.fit(X, y_train, epochs=200, verbose=1, callbacks=[callback])
    'scale_pos_weight': 1,
    'colsample_bytree': 0.8,
    'eval_metric': 'logloss',
    'nthread': 8,
    'sample_type': 'uniform',
    'normalize_type': 'forest',
    'random_state': 1
}

plst = list(params.items())
evallist = [(xgb_train, 'train'), (xgb_val, 'eval')]
num_round = 500
bst = xgb.train(plst, xgb_train, num_round, evals=evallist, early_stopping_rounds=10)
#
# bst = xgb.cv(params=params, dtrain=xgb_train, nfold=5, metrics='logloss',
#              verbose_eval=2, early_stopping_rounds=10)
#
bst.save_model(out_path + 'xgb.model')

feat_importance = bst.get_fscore()
print(feat_importance)
with open(out_path + 'feat_importance.csv', 'w') as fo:
    for k in feat_importance.keys():
        fo.write(str(k) + ',')
    fo.write('\n')
    # 'scale_pos_weight': 10,
    'eval_metric': 'auc',
    'subsample': 0.76,
    'colsample_bytree': 0.95,
    # 'n_estimators': 5000,
    # 'eta_decay': 0.5,
    'seed': 1,
    # 'min_child_weight': 0.8,
}

rounds = 10000
bst = xgb.train(param, dtrain, rounds, early_stopping_rounds=300, evals=evals,
                evals_result=gpu_res, verbose_eval=True)

# pickle.dump(bst, open(_dir + "/models/model.1", "wb"))
# bst = get_pickled(_dir, "models/model.1")

# trained = xgb.train(param, dtrain, 782)
# xgb.train(param, dtrain)

# imp = xgb.plot_importance(bst)
# scores = bst.get_score()
# # scores_sort = sorted(scores.items(), key=lambda kv: kv[1])
# this is log likelihood loss
def logregobj(preds, dtrain):
    labels = dtrain.get_label()
    preds = 1.0 / (1.0 + np.exp(-preds))
    grad = preds - labels
    hess = preds * (1.0 - preds)
    return grad, hess


# user defined evaluation function, return a pair metric_name, result
# NOTE: when you use a customized loss function, the default prediction value is the margin.
# This may make built-in evaluation metrics not function properly:
# for example, with logistic loss the prediction is the score before the logistic transformation,
# while the built-in evaluation error assumes the input is after the logistic transformation.
# Keep this in mind when you use the customization; you may need to write a customized evaluation function.
def evalerror(preds, dtrain):
    labels = dtrain.get_label()
    # return a pair metric_name, result. The metric name must not contain a colon (:)
    # since preds are margins (before logistic transformation, cutoff at 0)
    return 'error', float(sum(labels != (preds > 0.0))) / len(labels)


# training with customized objective; we can also do step-by-step training,
# simply look at xgboost.py's implementation of train
bst = xgb.train(param, dtrain, num_round, watchlist, obj=logregobj, feval=evalerror)
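# Because the custom objective above works on raw margins, Booster.predict returns
# margin scores rather than probabilities for this model. A short sketch of mapping
# them back (dtest is assumed to be a DMatrix defined alongside dtrain/watchlist above).
margin = bst.predict(dtest)
prob = 1.0 / (1.0 + np.exp(-margin))       # apply the logistic transform manually
pred_label = (prob > 0.5).astype(int)      # threshold at 0.5 on the probability scale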
tr_angle_mat.append(0)

tr_input_arr = np.array(tr_input_mat)
tr_angle_arr = np.array(tr_angle_mat)
dtrain = xgb.DMatrix(tr_input_arr, label=tr_angle_arr)

param = {
    'max_depth': 6,
    'eta': 0.2,
    'subsample': 0.5,
    'silent': 1,
    'objective': 'binary:logistic'
}
watchlist = [(dtrain, 'train')]
num_round = 3000  # 10 #3000 # 1000
bst = xgb.train(param, dtrain, num_round, watchlist)

bst.dump_model('./dump.raw.txt')
bst.save_model('./hoge.model')

### training end

# trade
portfolio = 1000000
LONG = 1
SHORT = 2
NOT_HAVE = 3
pos_kind = NOT_HAVE

HALF_SPREAD = 0.0015
SONKIRI_RATE = 0.05
RIKAKU_PIPS = 0.60
def fit(self, x_train, y_train, x_valid=None, y_valid=None):
    # xgb_start = time.time()
    best_model = None
    best_round = None
    best_score = {}
    if self.cv_folds is not None:
        log.logger.info('CrossValidation....')
        d_train = xgb.DMatrix(x_train, label=y_train)
        cv_result = self._kfold(d_train)
        print('cv_result %s' % cv_result)
        print('type_cv_result %s' % type(cv_result))
        # min_rmse = pd.Series(cv_result['test-rmse-mean']).min()
        # self.best_score['min_test-rmse-mean'] = min_rmse
        # self.best_round = cv_result[cv_result['test-rmse-mean'].isin([min_rmse])].index[0]
        best_score['min_test-rmse-mean'] = pd.Series(cv_result['test-rmse-mean']).min()
        best_round = pd.Series(cv_result['test-rmse-mean']).idxmin()
        best_model = xgb.train(self.xgb_params, d_train, best_round)

    elif self.ts_cv_folds is not None:
        log.logger.info('TimeSeriesCrossValidation....')
        # time-series k-fold
        bst_score = 0
        details = []
        scores = []
        tscv = TimeSeriesSplit(n_splits=self.ts_cv_folds)
        if self.xgb_params['objective'] != 'reg:linear':
            log.logger.error('Objective ERROR........')
            exit()
        for n_fold, (tr_idx, val_idx) in enumerate(tscv.split(x_train)):
            print(f'the {n_fold} training start ...')
            tr_x, tr_y, val_x, val_y = (x_train.iloc[tr_idx], y_train[tr_idx],
                                        x_train.iloc[val_idx], y_train[val_idx])
            d_train = xgb.DMatrix(tr_x, label=tr_y)
            d_valid = xgb.DMatrix(val_x, label=val_y)
            watchlist = [(d_train, "train"), (d_valid, "valid")]
            xgb_model = xgb.train(params=self.xgb_params,
                                  dtrain=d_train,
                                  num_boost_round=self.num_boost_round,
                                  evals=watchlist,
                                  early_stopping_rounds=self.early_stop_round)
            details.append((xgb_model.best_score, xgb_model.best_iteration, xgb_model))
            if xgb_model.best_score > bst_score:
                bst_score = xgb_model.best_score
                best_model = xgb_model
                best_round = xgb_model.best_iteration
            else:
                best_model = xgb_model
                best_round = xgb_model.best_iteration
            scores.append(xgb_model.best_score)
        best_score['avg_score'] = np.mean(scores)

    else:
        log.logger.info('NonCrossValidation....')
        if x_valid is None and y_valid is None:
            # note the shift here
            # x_train, x_valid, y_train, y_valid = train_test_sp(x_train, y_train, test_size=0.2, shift=0)
            d_train = xgb.DMatrix(x_train, label=y_train)
            watchlist = [(d_train, "train")]
        else:
            d_train = xgb.DMatrix(x_train, label=y_train)
            d_valid = xgb.DMatrix(x_valid, label=y_valid)
            watchlist = [(d_train, "train"), (d_valid, "valid")]
        best_model = xgb.train(params=self.xgb_params,
                               dtrain=d_train,
                               num_boost_round=self.num_boost_round,
                               evals=watchlist,
                               verbose_eval=5,
                               early_stopping_rounds=self.early_stop_round)
        best_round = best_model.best_iteration
        best_score['best_score'] = best_model.best_score
    # print('spend time :' + str((time.time() - xgb_start)) + '(s)')
    return best_score, best_round, best_model
# training parameters
params = {
    'objective': 'multi:softmax',
    'eta': 0.1,
    'max_depth': 9,
    'eval_metric': 'merror',
    'seed': 0,
    'missing': -999,
    'num_class': num_class,
    'silent': 1
}
# drop columns that are not features
feature = [x for x in train1.columns
           if x not in ['user_id', 'label', 'shop_id', 'time_stamp', 'mall_id', 'wifi_infos']]
df_train_1 = df_train[feature]
# fill null values in the training data with -100
df_train_1 = df_train_1.where(df_train_1.notnull(), -100)
xgbtrain = xgb.DMatrix(df_train_1, df_train['label'])
df_test_1 = df_test[feature]
# fill null values in the test data with -100
df_test_1 = df_test_1.where(df_test_1.notnull(), -100)
xgbtest = xgb.DMatrix(df_test_1)
watchlist = [(xgbtrain, 'train'), (xgbtrain, 'test')]
num_rounds = 100
# train the model
model = xgb.train(params, xgbtrain, num_rounds, watchlist, early_stopping_rounds=15)
# predict
df_test['label'] = model.predict(xgbtest)
# map the 1..N labels back to shop_id
df_test['shop_id'] = df_test['label'].apply(lambda x: lbl.inverse_transform(int(x)))
# keep only the submission columns
r = df_test[['row_id', 'shop_id']]
result = pd.concat([result, r])
j = j + 1
print(j)
result['row_id'] = result['row_id'].astype('int')
result.to_csv('sub.csv', index=False)
def fit(self, data_x, data_y, num_class):
    if self.special_objective is None:
        # get the parameter list
        self.para_dict = {
            'max_depth': self.max_depth,
            'eta': self.eta,
            'silent': self.silent_mode,
            'objective': self.objective_func,
            'num_class': num_class,
            'eval_metric': self.eval_metric,
            'booster': self.booster
        }
    else:
        # get the parameter list, without stating the objective function
        self.para_dict = {
            'max_depth': self.max_depth,
            'eta': self.eta,
            'silent': self.silent_mode,
            'eval_metric': self.eval_metric,
            'booster': self.booster
        }
    # make sure data is in [nData * nSample] format
    assert len(data_x.shape) == 2
    # check if data length is the same
    if data_x.shape[0] != data_y.shape[0]:
        raise ValueError('The number of instances for x and y data should be the same!')
    # data_x is in [nData*nDim]
    nData = data_x.shape[0]
    nDim = data_x.shape[1]
    # split the data into train and validation
    holistic_ind = np.random.permutation(nData)
    train_ind = holistic_ind[0:nData * 3 // 4]
    valid_ind = holistic_ind[nData * 3 // 4:nData]
    # indexing and get the data
    train_data = data_x[train_ind]
    train_label = data_y[train_ind]
    valid_data = data_x[valid_ind]
    valid_label = data_y[valid_ind]
    # build the DMatrices and train the estimator
    dtrain = xgb.DMatrix(train_data, label=train_label)
    dvalid = xgb.DMatrix(valid_data, label=valid_label)
    self.eval_list = [(dvalid, 'valid'), (dtrain, 'train')]
    if self.special_objective is None:
        # fit the classifier
        self.boosting_model = xgb.train(self.para_dict, dtrain, self.num_round,
                                        self.eval_list, verbose_eval=False)
    elif self.special_objective == 'weighted':
        # fit the classifier
        self.boosting_model = xgb.train(self.para_dict, dtrain, self.num_round,
                                        self.eval_list, weighted_binary_cross_entropy,
                                        evalerror, verbose_eval=False)
    elif self.special_objective == 'focal':
        # fit the classifier
        self.boosting_model = xgb.train(self.para_dict, dtrain, self.num_round,
                                        self.eval_list, focal_binary_object,
                                        evalerror, verbose_eval=False)
    else:
        raise ValueError('The input special objective mode not recognized! '
                         'Could only be \'weighted\' or \'focal\', but got '
                         + str(self.special_objective))
    'nthread': 20,
    'seed': 42,
    'silent': 0
}

if online == False:
    Dtrain = xgb.DMatrix(train[features], train[target], feature_names=features)
    Dtest = xgb.DMatrix(test[features], test[target], feature_names=features)
    watchlist = [(Dtrain, 'train'), (Dtest, 'val')]
    clf = xgb.train(xgb_pars, Dtrain, num_boost_round=450, verbose_eval=1, evals=watchlist)
    # xx = clf.predict(Dtrain, output_margin=False, ntree_limit=0, pred_leaf=True)
    train['lgb_predict'] = clf.predict(Dtrain)
    print('train log_loss', log_loss(train[target], train['lgb_predict']))
    test['lgb_predict'] = clf.predict(Dtest)
    print('test log_loss', log_loss(test[target], test['lgb_predict']))
    """
    xgb_feature = clf.predict(Dtrain, pred_leaf=True)
    xgb_feature = pd.DataFrame(xgb_feature,
                               columns=["xgb_{}".format(i + 1) for i in range(xgb_feature.shape[1])])
    xgb_feature['instance_id'] = train['instance_id']
def xgb_train(train_df, test_df, mode, params, num_boost_round, early_stopping):
    if mode == "train":
        train = train_df.values[:, 1:-1]
        train_target = train_df.values[:, -1]
        # 5-fold
        kf = KFold(n_splits=5, shuffle=True)
        trainEorror = 0
        error = 0
        for train_index, valid_index in kf.split(train):
            x_train, x_valid = train[train_index], train[valid_index]
            y_train, y_valid = train_target[train_index], train_target[valid_index]
            dtrain = xgb.DMatrix(x_train, y_train)
            dvalid = xgb.DMatrix(x_valid, y_valid)
            watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
            gbm = xgb.train(params, dtrain, num_boost_round, evals=watchlist,
                            early_stopping_rounds=early_stopping, verbose_eval=True)
            print("validating")
            tranHat = gbm.predict(xgb.DMatrix(x_train))
            trainEorror += rmsep(y_train, tranHat)
            yhat = gbm.predict(xgb.DMatrix(x_valid))
            error += rmsep(y_valid, yhat)
        print('rmse:{:.6f}'.format(error / 5.0))
    else:
        train = train_df.values[:, 1:-1]
        train_target = train_df.values[:, -1]
        kf = KFold(n_splits=5, shuffle=True)
        result = np.zeros(2000)
        dtest = test_df.values[:, 1:]
        dtest = xgb.DMatrix(dtest)
        for train_index, valid_index in kf.split(train):
            x_train, x_valid = train[train_index], train[valid_index]
            y_train, y_valid = train_target[train_index], train_target[valid_index]
            dtrain = xgb.DMatrix(x_train, y_train)
            watchlist = [(dtrain, 'train')]
            gbm = xgb.train(params, dtrain, num_boost_round=num_boost_round,
                            evals=watchlist, early_stopping_rounds=early_stopping)
            result += gbm.predict(dtest)
        result = result / 5.0
        return result
def xgb_model_val():
    # 'is_first_get_coupon','user_hour_count_label','context_timestamp_rank_desc_label' user_item_brand_count user_diff_shop_count
    train_set = pd.read_csv('data/ftrain.csv', sep=",")
    validate_set = pd.read_csv('data/fvalidate.csv', sep=",")
    train_x = train_set.drop([
        'instance_id', 'context_id', 'item_city_id', 'item_id', 'user_id',
        'item_brand_id', 'shop_id', 'user_gender_id', 'user_occupation_id',
        'is_trade', 'context_timestamp', 'context_page_id',
        'context_timestamp_and_dates', 'dates', 'day', 'hour',
        'item_category_list', 'item_property_list', 'predict_category_property',
        'is_first_get_coupon', 'context_timestamp_rank_desc_label', 'category',
        'user_shop_count_istrade_rate', 'user_item_brand_count',
        'user_istrade_rate', 'user_diff_shop_count',
        'user_count_minus_user_count_istrade',
        'user_item_count_minus_user_item_istrade', 'user_count_istrade',
        'user_and_user_occupation_count_label',
        'predict_property_jiaoji_item_property',
        'user_shop_count_minus_user_shop_istrade'
    ], axis=1)
    train_y = train_set['is_trade']
    # # correlation analysis
    # pearson_analysis_feature(train_x, train_y)
    # return
    val_x = validate_set.drop([
        'instance_id', 'context_id', 'item_city_id', 'item_id', 'user_id',
        'item_brand_id', 'shop_id', 'user_gender_id', 'user_occupation_id',
        'is_trade', 'context_timestamp', 'context_page_id',
        'context_timestamp_and_dates', 'dates', 'day', 'hour',
        'item_category_list', 'item_property_list', 'predict_category_property',
        'is_first_get_coupon', 'context_timestamp_rank_desc_label', 'category',
        'user_shop_count_istrade_rate', 'user_item_brand_count',
        'user_istrade_rate', 'user_diff_shop_count',
        'user_count_minus_user_count_istrade',
        'user_item_count_minus_user_item_istrade', 'user_count_istrade',
        'user_and_user_occupation_count_label',
        'predict_property_jiaoji_item_property',
        'user_shop_count_minus_user_shop_istrade'
    ], axis=1)
    val_y = validate_set['is_trade']

    xgb_train = xgb.DMatrix(train_x, label=train_y)
    xgb_val = xgb.DMatrix(val_x, label=val_y)
    params = {
        'booster': 'gbtree',
        'objective': 'binary:logistic',  # binary classification problem
        # 'gamma': 0.1,  # controls post-pruning; larger values are more conservative, typically around 0.1 or 0.2
        'max_depth': 5,  # tree depth; larger values overfit more easily
        # 'lambda': 2,  # L2 regularisation on weights; larger values make the model less prone to overfitting
        'subsample': 0.8,  # random row subsampling of the training instances
        'colsample_bytree': 0.8,  # column subsampling when building each tree
        'min_child_weight': 3,  # minimum sum of hessian in a leaf (default 1); for imbalanced 0-1 problems,
        # with h around 0.01 a value of 1 means a leaf needs roughly 100 samples.
        # This parameter strongly affects the result: smaller values overfit more easily.
        'silent': 0,  # 1 suppresses run-time messages; 0 is best for monitoring
        'eta': 0.03,  # acts like a learning rate
        'nthread': 30,  # number of CPU threads
        'eval_metric': 'logloss'  # evaluation metric
    }
    plst = list(params.items())
    num_rounds = 500  # number of boosting iterations
    watchlist = [(xgb_train, 'train'), (xgb_val, 'val')]
    # with a large number of rounds, early_stopping_rounds can stop training once the
    # validation metric has not improved for the given number of iterations
    model = xgb.train(plst, xgb_train, num_rounds, watchlist)
    importance = model.get_fscore()
    # ----------------------- importance of features -----------------------
    importance = sorted(importance.items(), key=operator.itemgetter(1), reverse=True)
    df = pd.DataFrame(importance, columns=['feature', 'fscore'])
    df['fscore'] = df['fscore'] / df['fscore'].sum()
    print(df)
outfile.close()
features = [x for x in train_data.columns]
ceate_feature_map(features)

import xgboost as xgb
from xgboost import plot_importance

print('start running ....')
dtrain = xgb.DMatrix(x_train, label=y_train)
dval = xgb.DMatrix(x_val, label=y_val)
param = {
    'learning_rate': 0.1,
    'n_estimators': 1000,
    'max_depth': 4,
    'min_child_weight': 7,
    'gamma': 0,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'eta': 0.05,
    'silent': 1,
}
num_round = 100
plst = list(param.items())
plst += [('eval_metric', 'rmse')]
evallist = [(dval, 'eval'), (dtrain, 'train')]
bst = xgb.train(plst, dtrain, num_round, evallist, early_stopping_rounds=50)

dtest = xgb.DMatrix(test_data)
y3 = bst.predict(dtest)
plot_importance(bst)
plt.show()
importance = bst.get_fscore(fmap='xgb.fmap')
importance = sorted(importance.items(), key=operator.itemgetter(1))
params = {
    'booster': 'gbtree',
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'max_depth': 4,
    'lambda': 10,
    'subsample': 0.75,
    'colsample_bytree': 0.75,
    'min_child_weight': 2,
    'eta': 0.025,
    'seed': 0,
    'nthread': 8,
    'silent': 1
}

watchlist = [(dtrain, 'train')]
bst = xgb.train(params, dtrain, num_boost_round=100, evals=watchlist)
ypred = bst.predict(dtest)

# threshold the probabilities and print some evaluation metrics
y_pred = (ypred >= 0.5) * 1

print('AUC: %.4f' % metrics.roc_auc_score(test_y, ypred))
print('Accuracy: %.4f' % metrics.accuracy_score(test_y, y_pred))
print('Recall: %.4f' % metrics.recall_score(test_y, y_pred))
print('F1-score: %.4f' % metrics.f1_score(test_y, y_pred))
print('Precision: %.4f' % metrics.precision_score(test_y, y_pred))
# metrics.confusion_matrix(test_y, y_pred)
def make_one_step(_df_in, _it):
    _df = _df_in.copy()
    _ratio = 4
    # _x_train, _y_train, _x_val, _y_val = _dataframe
    _y_train = _df.pop('notified')
    _sample_set = get_pickled(data_features_prefix, features_storm_set)
    if _sample_set is None:
        _sample_set = set()
    _sample = take_sample_dim(_df, _ratio)
    _sample = persist_sample(_df, _sample, _sample_set, _ratio)
    # remove_columns(_df, _sample)
    # _cat_df = pd.DataFrame({col: _df[col].astype('category').cat.codes for col in _df},
    #                        index=_df.index)
    _dummies_df = pd.get_dummies(_df, drop_first=True)
    _dummies_df['notified'] = _y_train
    # _df_positives = get_rows(_dummies_df, 'notified', 1)
    # _df_negatives = get_rows(_dummies_df, 'notified', 0)
    train_indexes, valid_indexes = get_pickled(_dir, "train_valid_shuffled_indexes")
    _train_df = _dummies_df.iloc[train_indexes]
    _valid_df = _dummies_df.iloc[valid_indexes]
    # _train_df, _valid_df = get_split_train_valid(_df_positives, _df_negatives, 0.8)
    _y_train = _train_df.pop('notified')
    _y_valid = _valid_df.pop('notified')

    _tmp = time.time()
    _gpu_res = {}
    _dtrain = xgb.DMatrix(_train_df.values, label=_y_train.values, missing=-999)
    _dval = xgb.DMatrix(_valid_df.values, label=_y_valid.values, missing=-999)
    _evals = [(_dval, 'valid')]
    _param = {
        'objective': 'binary:logistic',  # binary classification
        'num_class': 1,                  # only meaningful for multi-class objectives
        'tree_method': 'gpu_exact',      # GPU-accelerated exact algorithm
        'scale_pos_weight': 16.32,
        'gpu_id': 1,
        # 'scale_pos_weight': 1,
        # 'scale_pos_weight': 10,
        'eval_metric': 'auc',
        'subsample': 0.8,
        'colsample_bytree': 0.9,
        # 'n_estimators': 5000,
        # 'eta_decay': 0.5,
        'seed': 1,
        # 'min_child_weight': 0.8,
    }
    _rounds = 10000
    _bst = None
    try:
        _bst = xgb.train(_param, _dtrain, _rounds, early_stopping_rounds=300,
                         evals=_evals, evals_result=_gpu_res, verbose_eval=True)
    except Exception:
        # retry once with the same settings
        _bst = xgb.train(_param, _dtrain, _rounds, early_stopping_rounds=300,
                         evals=_evals, evals_result=_gpu_res, verbose_eval=True)
    print("GPU Training Time: %s seconds" % (str(time.time() - _tmp)))

    _predicted = _bst.predict(_dval)
    _auc = metrics.roc_auc_score(_y_valid, _predicted)
    # auc = trainClassifier(clf, xtrain, ytrain, xtest, ytest)
    write_chosen_solution(_sample, _auc)
    print("Iteration: {} AUC: {} sample: {}".format(_it, _auc, _sample))
    i = 1
def xgboost_euro(conn, num, df):
    # repeat this num times
    for j in range(1, num + 1):
        print("iteration {}".format(j))
        df_xg = df
        # add one day; skip weekends
        today = date.today() + relativedelta(days=+j)
        if getDay(today.year, today.month, today.day) in ('Sat', 'Sun'):
            continue
        else:
            df_xg = df_xg.append({"Date": pd.Timestamp(today),
                                  "euro_close": float('nan')},
                                 ignore_index=True)
        # add one day at a time, predict it, then append the next day, so that
        # forecasts one week, two weeks, or one month ahead can be checked

        # XGBoost: extract date features
        df_xg['day'] = df_xg.Date.dt.day
        df_xg['dayofweek'] = df_xg.Date.dt.dayofweek
        df_xg['dayofyear'] = df_xg.Date.dt.dayofyear
        df_xg['week'] = df_xg.Date.dt.week
        df_xg['month'] = df_xg.Date.dt.month
        df_xg['year'] = df_xg.Date.dt.year
        df_xg = df_xg.drop('Date', axis=1)

        # for time series, feeding previous values in alongside the current row
        # usually improves learning; these are the "lag" features built below
        for i in range(1, 6):
            df_xg['lag' + str(i)] = df_xg.euro_close.shift(i).fillna(0)

        X = df_xg.drop('euro_close', axis=1)
        y = df_xg.euro_close
        X_train, X_test = X[:-1], X[-1:]
        y_train, y_test = y[:-1], y[-1:]

        # convert data to xgb matrix form
        dtrain = xgb.DMatrix(X_train, label=y_train)
        dtest = xgb.DMatrix(X_test)

        # Bayesian hyper-parameter tuning: define the objective
        def xgb_evaluate(max_depth, gamma, colsample_bytree):
            params = {'eval_metric': 'rmse',
                      'max_depth': int(max_depth),
                      'subsample': 0.8,
                      'eta': 0.1,
                      'gamma': gamma,
                      'colsample_bytree': colsample_bytree}
            cv_result = xgb.cv(params, dtrain, num_boost_round=250, nfold=3)
            return -1.0 * cv_result['test-rmse-mean'].iloc[-1]

        # run the optimizer
        xgb_bo = BayesianOptimization(xgb_evaluate,
                                      {'max_depth': (3, 7),
                                       'gamma': (0, 1),
                                       'colsample_bytree': (0.3, 0.9)})
        xgb_bo.maximize(init_points=10, n_iter=15, acq='ei')

        # get the best parameters and train
        params = xgb_bo.max['params']
        params['max_depth'] = int(round(params['max_depth']))
        model = xgb.train(params, dtrain, num_boost_round=200)

        # predict the appended day
        predictions = model.predict(dtest)
        lenv_ = len(df_xg)
        df_xg.loc[lenv_ - 1, 'euro_close'] = predictions[0]
        df_xg = df_xg.drop(['day', 'dayofweek', 'dayofyear', 'week', 'month',
                            'year', 'lag1', 'lag2', 'lag3', 'lag4', 'lag5'], axis=1)
        if getDay(today.year, today.month, today.day) in ('Sat', 'Sun'):
            continue
        else:
            df = df.append({"Date": pd.Timestamp(today),
                            "euro_close": predictions[0]}, ignore_index=True)
        euro_close = float(predictions[0])
        print(today)
        print(type(today))
        print(euro_close)
        print(type(euro_close))
        xgboost_EURO(conn, today, euro_close)
    xgboost_EURO_remove(conn)
    return
def xg_train_wrapper(parser):
    xgdata, data_test, params, params_t, params_other = conf_parser(parser.conf)
    # x, y = load_svmlight_file(xgdata)
    # x = x.todense()
    # test_x, test_y = load_svmlight_file(data_test)
    # test_x = test_x.todense()
    df_train = pd.read_csv(xgdata)
    y = df_train['label'].values
    x_columns = [item for item in df_train.columns if item not in ['label', 'id']]
    x = df_train[x_columns].values
    df_test = pd.read_csv(data_test)
    test_y = df_test['label'].values
    x_test_columns = [item for item in df_test.columns if item not in ['label', 'id']]
    test_x = df_test[x_test_columns].values

    x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.3, random_state=42)
    dtrain = xgb.DMatrix(x_train, label=y_train)
    dval = xgb.DMatrix(x_val, y_val)
    dtrain_whole = xgb.DMatrix(x, label=y)
    watchlist = [(dtrain, 'train'), (dval, 'eval')]
    watchlist_whole = [(dtrain_whole, 'eval')]

    scale_pos_weight = get_negative_positive_ratio(y)
    params['scale_pos_weight'] = scale_pos_weight
    custom_feval = set_custom_eval_metirc(params_other['eval_metric'])
    log = log_class.log_class('grid_search_xgb', params_other['log_dir'])
    log.add('scale_pos_weight:' + str(scale_pos_weight))
    log.add('eval_metric:' + params_other['eval_metric'])
    print(params)

    num_round = tune_num_boost_round(params, dtrain, params_other['num_round'], log,
                                     watchlist,
                                     eval_metric=params_other['eval_metric'],
                                     feval=custom_feval,
                                     ascend=params_other['ascend'])
    params_t = [
        dict(max_depth=params_t['max_depth']),
        dict(subsample=params_t['subsample']),
        dict(min_child_weight=params_t['min_child_weight']),
        dict(colsample_bytree=params_t['colsample_bytree']),
        dict(colsample_bylevel=params_t['colsample_bylevel']),
        dict(max_delta_step=params_t['max_delta_step']),
        dict(gamma=params_t['gamma'])
    ]
    for param_t in params_t:
        k = list(param_t.keys())[0]
        values = param_t[k]
        if k == 'num_round':
            continue
        log.add("=====" + str(k) + "=======" + str(values))
        print('========== ', k, ' ========== ', values)
        result = []
        if len(values) == 1:
            params[k] = values[0]
            continue
        for v in values:
            print('**** for : %s ****\n' % (str(v)))
            log.add("**** for :" + str(v) + "****")
            params[k] = v
            if custom_feval is None:
                params['eval_metric'] = params_other['eval_metric']
            result_df = xgb.cv(params=params,
                               dtrain=dtrain_whole,
                               num_boost_round=num_round,
                               nfold=params_other['cv'],
                               # metrics=params_other['eval_metric'],
                               feval=custom_feval,
                               stratified=True,
                               verbose_eval=False,
                               show_stdv=False,
                               shuffle=True,
                               early_stopping_rounds=100)
            result_df = result_df[['test-' + params_other['eval_metric'] + '-mean']]
            assert result_df.columns[0] == 'test-' + params_other['eval_metric'] + '-mean', \
                'choose the correct column\n'
            result_np = result_df.values
            result.append(float(result_np[-1][0]))
        print(list(zip(values, result)))
        if params_other['ascend'] == 1:
            loc = max(enumerate(result), key=lambda t: t[1])[0]
        else:
            loc = min(enumerate(result), key=lambda t: t[1])[0]
        params[k] = values[loc]
        print('%s : %s\n' % (k, params[k]))
        log.add(k)
        log.add(str(params[k]))

    num_round = tune_num_boost_round(params, dtrain_whole, params_other['num_round'], log,
                                     watchlist_whole,
                                     eval_metric=params_other['eval_metric'],
                                     feval=custom_feval,
                                     ascend=params_other['ascend'])
    model = xgb.train(params, dtrain_whole, num_round, watchlist_whole, feval=custom_feval)
    pprint.pprint(params)
    time_str = time.strftime("%Y_%m_%d_%H_%M_%S", time.localtime())
    if not os.path.isdir('./models'):
        os.mkdir('./models')
    dataname_model_path = os.path.join('./models', params_other['log_dir'])
    if not os.path.isdir(dataname_model_path):
        os.mkdir(dataname_model_path)
    model.save_model(dataname_model_path + '/' + time_str + '.xgmodel')
    print('saved : %s' % (dataname_model_path + '/' + time_str + '.xgmodel'))
    predict_test(model, test_x, test_y, log)
y_test = np.array(YY[0:start].reshape(1, -1)[0])
x_test = np.array(XX.iloc[0:start, selected])

dtrain = xgb.DMatrix(x2, label=y2)
dtest = xgb.DMatrix(x_test, label=y_test)
param = {
    'max_depth': 20,
    'eta': 1,
    'silent': 1,
    'objective': 'binary:logistic'
}
evallist = [(dtrain, 'train')]
num_round = 10
feature_names = dict(np.array([range(0, XX.shape[1]), np.array(XX.columns)]).T)
bst = xgb.train(param, dtrain, num_round, evallist)
# bst.save_model('0001.model')
# bst.dump_model('dump.raw.txt')

pred01 = bst.predict(dtrain)
Y = y2
pred = []
# scan cutoffs and keep the one with the best training accuracy
for cutoff in np.linspace(0.001, 0.999, 2000):
    pred0 = pred01.copy()
    pred0[pred0 < cutoff] = 0
    pred0[pred0 != 0] = 1
    pred.append(accuracy_score(np.array(Y), pred0))
thre = np.linspace(0.001, 0.999, 2000)[np.where(pred == np.max(pred))[0][0]]
pred01[pred01 >= thre] = 1
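# Usage note (hedged): the cutoff scan above re-thresholds the predictions two
# thousand times in a Python loop. The same search can be done with one NumPy
# broadcast; this is a sketch assuming pred01 (raw scores) and Y (labels) from
# the snippet above, not the original author's code.
cutoffs = np.linspace(0.001, 0.999, 2000)
# shape (n_cutoffs, n_samples): 1 where the score clears the cutoff
hard_preds = (pred01[None, :] >= cutoffs[:, None]).astype(int)
accs = (hard_preds == np.asarray(Y)[None, :]).mean(axis=1)
thre_alt = cutoffs[accs.argmax()]  # matches thre above up to ties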
params = {
    'objective': 'binary:logistic',
    'eta': 0.1,
    'colsample_bytree': 0.886,
    'min_child_weight': 2,
    'max_depth': 10,
    'subsample': 0.886,
    'alpha': 10,
    'gamma': 30,
    'lambda': 50,
    'verbose_eval': True,
    'nthread': 8,
    'eval_metric': 'auc',
    'scale_pos_weight': 10,
    'seed': 201703,
    'missing': -1
}

xgbtrain = xgb.DMatrix(train_feat[predictors], train_feat['label'])
xgbtest = xgb.DMatrix(test_feat[predictors])
model = xgb.train(params, xgbtrain, num_boost_round=120)
del train_feat, xgbtrain
gc.collect()

test_feat.loc[:, 'pred'] = model.predict(xgbtest)
result = reshape(test_feat)
test = pd.read_csv(test_path)
result = pd.merge(test[['orderid']], result, on='orderid', how='left')
result.fillna('0', inplace=True)
result.to_csv('result.csv', index=False, header=False)
print('total time: {} seconds'.format(time.time() - t0))
params = {
    'min_child_weight': 1,
    'gamma': 0.1,
    'subsample': 0.9,
    'colsample_bytree': 0.9,
    'reg_alpha': 2,
    'reg_lambda': 0.1,
    'objective': 'multi:softmax',
    'nthread': 8,
    'scale_pos_weight': 1
}
plst = list(params.items())
num_rounds = 500
model = xgb.train(plst, dtrain, num_rounds)
ans = model.predict(dtest)

y_test1 = np.asarray(y_test)
cm1 = pd.crosstab(y_test, ans, rownames=['Actual'], colnames=['Predicted'])
print(cm1)

plt.figure(num=1, figsize=(12, 8))
plot_importance(model)
plt.savefig('xgbmodel3.png')  # save before show so the figure is not blank
plt.show()

# fit model on all training data
xgb2.fit(x_train, y_train)
import numpy as np
import matplotlib.pyplot as plt
from scipy.io import loadmat
import pandas as pd
import xgboost as xgb
from sklearn.metrics import mean_squared_error

training = pd.read_csv("Titanic Training.csv")
test = pd.read_csv("Titanic Test.csv")

X_train = training.drop(columns="Survived").to_numpy()
y_train = training["Survived"].to_numpy().reshape(-1, 1)
m = y_train.shape[0]
D = np.ones((m, 1)) / m

X_test = test.drop(columns="PassengerId").to_numpy()
test_ID = test["PassengerId"]

D_train = xgb.DMatrix(X_train, label=y_train)
D_test = xgb.DMatrix(X_test, label=None)

parameters = {"objective": "multi:softmax",  # classification objective implied by num_class
              "max_depth": 10, "num_class": 2}
steps = 100
model = xgb.train(parameters, D_train, steps)

predictions_train = model.predict(D_train).reshape(X_train.shape[0], 1)
predictions_test = model.predict(D_test).reshape(X_test.shape[0], 1)

check_train = np.equal(predictions_train, y_train) * 1
correct_train = np.sum(check_train)
accuracy_train = 100 * (correct_train / y_train.shape[0])
error_train = 100 - accuracy_train

results_df = pd.concat([test_ID, pd.DataFrame(predictions_test)], axis=1)
results_df = results_df.rename(columns={0: "Survived"})
def fit_log(self, records, plan_size):
    tic = time.time()

    # filter data, only pick entries from the same task
    data = []
    for inp, res in records:
        if inp.task.name == self.task.name and \
                inp.config.template_key == self.task.config_space.template_key:
            data.append((inp, res))
    logger.debug("XGB load %d entries from history log file", len(data))

    # extract features
    self._reset_pool(self.space, self.target, self.task)
    pool = self._get_pool()
    if self.fea_type == 'itervar':
        feature_extract_func = _extract_itervar_feature_log
    elif self.fea_type == 'knob':
        feature_extract_func = _extract_knob_feature_log
    elif self.fea_type == 'curve':
        feature_extract_func = _extract_curve_feature_log
    else:
        raise RuntimeError("Invalid feature type: " + self.fea_type)
    res = pool.map(feature_extract_func, data)

    # filter out features with a different shape
    fea_len = len(self._get_feature([0])[0])
    xs, ys = [], []
    for x, y in res:
        if len(x) == fea_len:
            xs.append(x)
            ys.append(y)
    if len(xs) < 500:  # not enough samples
        return False

    xs, ys = np.array(xs), np.array(ys)
    x_train = xs
    y_train = ys
    y_max = np.max(y_train)
    y_train = y_train / max(y_max, 1e-8)

    index = np.random.permutation(len(x_train))
    dtrain = xgb.DMatrix(x_train[index], y_train[index])

    plan_size *= 2
    self.bst = xgb.train(
        self.xgb_params, dtrain,
        num_boost_round=400,
        callbacks=[
            custom_callback(stopping_rounds=100,
                            metric='tr-a-recall@%d' % plan_size,
                            evals=[(dtrain, 'tr')],
                            maximize=True,
                            fevals=[
                                xgb_average_recalln_curve_score(plan_size),
                            ],
                            verbose_eval=self.log_interval)
        ])

    logger.debug("XGB train: %.2f\tobs: %d", time.time() - tic, len(xs))
    return True
xgb_pars = {
    'colsample_bytree': 0.3,
    'max_depth': 10,
    'subsample': 0.8,
    'lambda': 0.5,
    'nthread': -1,
    'booster': 'gbtree',
    'silent': 1,
    # 'eval_metric': 'rmsle',
    'objective': 'reg:linear'
}

# You could try training for more epochs
model = xgb.train(xgb_pars, dtrain, 6000, watchlist,
                  feval=rmsle_eval, early_stopping_rounds=50,
                  maximize=False, verbose_eval=10)
print('Modeling RMSLE %.5f' % model.best_score)
t1 = dt.datetime.now()
print('Training time: %i seconds' % (t1 - t0).seconds)

print('4. ---> Submission ... ')
ytest = model.predict(dtest)
print('Test shape OK.') if test.shape[0] == ytest.shape[0] else print('Oops')
test['trip_duration'] = np.exp(ytest) - 1
# test['trip_duration'] = ytest
subfn = "base2__val_" + str(model.best_score) + "__rnd_" + str(model.best_iteration) + ".csv.gz"
def train_xgboost(df_preds, df_preds2):
    df_preds = df_preds.drop(['Filename'], axis=1)
    df_preds = df_preds[['DenseNet121_Predictions', 'InceptionV3_Predictions',
                         'ResNet50_Predictions', 'Vgg_Predictions',
                         'sex', 'localization', 'age', 'dx']]
    X, y = df_preds.iloc[:, :-1], df_preds.iloc[:, -1]
    print(X.head())
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                        stratify=None,
                                                        random_state=50,
                                                        shuffle=True)
    X_train.to_csv("X_train.csv", index=False)
    X_test.to_csv("X_test.csv", index=False)

    params = {"objective": "multi:softmax", "num_class": 7,
              'colsample_bytree': 0.3, 'learning_rate': 0.001,
              'max_depth': 100, 'alpha': 10, "n_estimators": 100}
    xg_class = xgb.XGBClassifier(objective='multi:softmax', num_class=7,
                                 colsample_bytree=0.03, learning_rate=0.1,
                                 max_depth=1000, alpha=1000, n_estimators=1000)
    xg_class.fit(X_train, y_train)

    df_preds2 = df_preds2.drop(['Filename'], axis=1)
    df_preds2 = df_preds2[['DenseNet121_Predictions', 'InceptionV3_Predictions',
                           'ResNet50_Predictions', 'Vgg_Predictions',
                           'sex', 'localization', 'age', 'dx']]
    X, y = df_preds2.iloc[:, :-1], df_preds2.iloc[:, -1]
    print(X.head())
    data_dmatrix = xgb.DMatrix(data=X, label=y)
    X_test = X
    y_test = y

    preds = xg_class.predict(X_test)
    cm = confusion_matrix(y_test, preds)
    cm_plot_labels = ['akiec', 'bcc', 'bkl', 'df', 'mel', 'nv', 'vasc']
    plot_confusion_matrix(cm, cm_plot_labels, "xgboost")

    print("Balanced Accuracy: " + str(balanced_accuracy_score(y_test, preds)))
    print("Weighted Recall: " + str(recall_score(y_test, preds, average='weighted')))
    print("Class Recall: " + str(recall_score(y_test, preds, average=None)))
    print("Weighted Precision: " + str(precision_score(y_test, preds, average='weighted')))
    print("Class Precision: " + str(precision_score(y_test, preds, average=None)))
    print("Mean f1 score " + str(f1_score(y_test, preds, average='weighted')))
    print("Class f1 score " + str(f1_score(y_test, preds, average=None)))

    file = open("xgboost_results_Original.txt", "w+")
    file.write("Balanced Accuracy: " + str(balanced_accuracy_score(y_test, preds)) + "\n")
    file.write("Weighted Recall: " + str(recall_score(y_test, preds, average='weighted')) + "\n")
    file.write("Class Recall: " + str(recall_score(y_test, preds, average=None)) + "\n")
    file.write("Weighted Precision: " + str(precision_score(y_test, preds, average='weighted')) + "\n")
    file.write("Class Precision: " + str(precision_score(y_test, preds, average=None)) + "\n")
    file.write("Mean f1 score " + str(f1_score(y_test, preds, average='weighted')))
    file.write("Class f1 score " + str(f1_score(y_test, preds, average=None)))
    file.close()

    xg_class = xgb.train(params=params, dtrain=data_dmatrix, num_boost_round=10)
    ax = xgb.plot_importance(xg_class)
    fig = ax.figure
    fig.set_size_inches(20, 20)
                                                    random_state=42, stratify=y)

preds = np.ones(y_test.shape[0])

# parameters obtained from cross-validation
params = {
    'booster': 'gbtree',
    'objective': 'reg:logistic',
    'colsample_bytree': 0.2,
    'min_child_weight': 4,
    'subsample': 1,
    'learning_rate': 0.1,
    'max_depth': 6,
    'gamma': 0.05
}

training_data = xgb.DMatrix(X_train, y_train)
model = xgb.train(params, training_data, 230, feval=mcc_eval, maximize=True)
preds = model.predict(xgb.DMatrix(X_test))

# pick the best threshold out-of-fold
thresholds = np.linspace(0.01, 0.99, 99)
mcc = np.array([matthews_corrcoef(y_test, preds > thr) for thr in thresholds])
plt.plot(thresholds, mcc)
plt.show()
best_threshold = thresholds[mcc.argmax()]
print(mcc.max())
print(best_threshold)

model = xgb.train(params, xgb.DMatrix(X, y), 230,
params = {
    'min_split_loss': 0,
    'max_depth': 6,
    'min_child_weight': 1,
    'max_delta_step': 0,
    'subsample': 1,
    'colsample_bytree': 1,
    'colsample_bylevel': 1,
    'reg_lambda': 1,
    'reg_alpha': 0,
    'grow_policy': 'depthwise',
    'max_leaves': 0,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'seed': 7
}

history = {}  # records the rmse of the training and validation sets
eval_list = [(Train, "Training"), (Validation, "Validation")]
clf = xgb.train(params, Train, num_boost_round=119, evals=eval_list,
                obj=None, feval=None, maximize=False,
                early_stopping_rounds=40, evals_result=history)

prediction = clf.predict(xgb.DMatrix(x_test))
submission = pd.DataFrame({
    "card_id": main_test["card_id"].values,
    "target": np.ravel(prediction)
})
}

dtrain = xgb.DMatrix(x_train, y_train)

# cross-validation
cv_result = xgb.cv(xgb_params, dtrain,
                   nfold=5,
                   num_boost_round=1000,
                   early_stopping_rounds=50,
                   verbose_eval=1,
                   show_stdv=False)
num_boost_rounds = len(cv_result)
print(num_boost_rounds)

# train model
model = xgb.train(dict(xgb_params, silent=1), dtrain, num_boost_round=num_boost_rounds)

res = []
for i in range(3):
    x_test['month_logerror'] = round(traingroupedMonth.ix[9 + int(i)]['logerror'], 6)
    x_test['quarter_logerror'] = round(traingroupedQuarter.ix[3]['logerror'], 6)
    test_set = sc.transform(x_test)
    dtest = xgb.DMatrix(test_set)
    pred = model.predict(dtest)
    res.append(pred)

output = pd.DataFrame({
    'ParcelId': properties['parcelid'].astype(np.int32),
x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train,
                                                      test_size=0.2,
                                                      random_state=4242)

# XGBoost model
params = {}
params['objective'] = 'binary:logistic'
params['eval_metric'] = 'logloss'
params['eta'] = 0.02
params['max_depth'] = 4

d_train = xgb.DMatrix(x_train, label=y_train)
d_valid = xgb.DMatrix(x_valid, label=y_valid)
watch_list = [(d_train, 'train'), (d_valid, 'valid')]

bst = xgb.train(params, d_train, 400, watch_list,
                early_stopping_rounds=50, verbose_eval=10)

d_test = xgb.DMatrix(x_test)
p_test = bst.predict(d_test)
sub = pd.DataFrame()
sub['test_id'] = df_test['test_id']
sub['is_duplicate'] = p_test
sub.to_csv('simple_xgb.csv', index=False)