Example #1
	def xgbTuning(self, pX, change = 3):
		w = self.getWeight(self.y)
		dm = xgb.DMatrix(pX, self.y, weight=w)
		best_auc = 0
		n = pX.shape[0]
		best_params = None
		for i in range(change):
			randp = np.random.random_sample(3)
			param = {
				'bst:eta': randp[0],
				'max_depth': int(3+6*randp[1]) , 
				'nthread':4, 
				'silent':1,
				'alpha':randp[2],
				'eval_metric':'auc',
				'objective': 'binary:logistic' 
			}
			m = xgb.cv(param, dm, metrics='auc', nfold=3, num_boost_round = 50,early_stopping_rounds=5)
			auc = m['test-auc-mean'].max()
			if auc > best_auc :
				print('xgb:' + str(auc))
				best_auc = auc
				best_params = param
		Xtrain, Xtest, ytrain, ytest = train_test_split(pX, self.y, test_size=.33)
		trainw = self.getWeight(ytrain)
		testw = self.getWeight(ytest)
		dtrain = xgb.DMatrix(Xtrain, label = ytrain, feature_names=Xtrain.columns, weight = trainw)
		dtest = xgb.DMatrix(Xtest, label = ytest, feature_names=Xtest.columns, weight = testw)
		evallist = [(dtrain, 'train'), (dtest, 'eval')]
		booster = xgb.train(best_params, dtrain, evals=evallist, num_boost_round=100,early_stopping_rounds=10)
		rounds = booster.attr("best_iteration")
		best_auc = booster.attr("best_score")
		return float(best_auc), xgb.train(best_params, dtrain, num_boost_round=int(rounds))
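The `getWeight` helper used above is not part of the snippet. A minimal sketch, assuming it returns per-sample weights that balance the two classes for the weighted AUC:

import numpy as np

def getWeight(y):
    # Hypothetical helper: weight each sample by the inverse frequency of its
    # class so that positives and negatives contribute equally.
    y = np.asarray(y)
    n_pos = max(int(y.sum()), 1)
    n_neg = max(len(y) - int(y.sum()), 1)
    return np.where(y == 1, len(y) / (2.0 * n_pos), len(y) / (2.0 * n_neg))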
Example #2
def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=0, num_rounds=1000):
    param = {}
    param['objective'] = 'multi:softprob'
    param['eta'] = 0.1
    param['max_depth'] = 6
    param['silent'] = 1
    param['num_class'] = 3
    param['eval_metric'] = "mlogloss"
    param['min_child_weight'] = 1
    param['subsample'] = 0.7
    param['colsample_bytree'] = 0.7
    param['seed'] = seed_val
    num_rounds = num_rounds

    plst = list(param.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y)

    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label=test_y)
        watchlist = [ (xgtrain,'train'), (xgtest, 'test') ]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=20)
    else:
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(plst, xgtrain, num_rounds)

    pred_test_y = model.predict(xgtest)
    return pred_test_y, model
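A minimal usage sketch for `runXGB`, assuming synthetic three-class data (the snippet hard-codes `num_class = 3`); the variable names below are illustrative:

import numpy as np
import xgboost as xgb
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=2000, n_features=20, n_informative=6,
                           n_classes=3, random_state=0)
train_X, test_X, train_y, test_y = train_test_split(X, y, random_state=0)
# With test_y supplied, training uses a watchlist and early stopping.
preds, model = runXGB(train_X, train_y, test_X, test_y, num_rounds=200)
print(preds.shape)  # (n_test, 3) class probabilities from multi:softprob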
Example #3
 def hyperopt_obj(self,param,train_X,train_y):
     # 5-fold crossvalidation error
     #ret = xgb.cv(param,dtrain,num_boost_round=param['num_round'])
     kf = KFold(n_splits = 3)
     errors = []
     r2 = []
     int_params = ['max_depth','num_round']
     for item in int_params:
         param[item] = int(param[item])
     for train_ind,test_ind in kf.split(train_X):
         train_valid_x,train_valid_y = train_X[train_ind],train_y[train_ind]
         test_valid_x,test_valid_y = train_X[test_ind],train_y[test_ind]
         dtrain = xgb.DMatrix(train_valid_x,label = train_valid_y)
         dtest = xgb.DMatrix(test_valid_x)
         pred_model = xgb.train(param,dtrain,num_boost_round=int(param['num_round']))
         pred_test = pred_model.predict(dtest)
         errors.append(mean_squared_error(test_valid_y,pred_test))
         r2.append(r2_score(test_valid_y,pred_test))
     all_dtrain = xgb.DMatrix(train_X,label = train_y)
     print('training score:')
     pred_model = xgb.train(param,all_dtrain,num_boost_round= int(param['num_round']))
     all_dtest = xgb.DMatrix(train_X)
     pred_train = pred_model.predict(all_dtest)
     print(str(r2_score(train_y,pred_train)))
     print(np.mean(r2))
     print('\n')
     return {'loss':np.mean(errors),'status': STATUS_OK}
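A sketch of how this objective might be wired into hyperopt; the search space and the `reg` instance below are assumptions, not part of the original code:

from hyperopt import fmin, tpe, hp

space = {
    'objective': 'reg:squarederror',
    'eta': hp.uniform('eta', 0.01, 0.3),
    'max_depth': hp.quniform('max_depth', 3, 10, 1),
    'subsample': hp.uniform('subsample', 0.5, 1.0),
    'num_round': hp.quniform('num_round', 50, 500, 10),
}
# reg is whatever object defines hyperopt_obj; train_X / train_y are numpy arrays.
best = fmin(fn=lambda p: reg.hyperopt_obj(p, train_X, train_y),
            space=space, algo=tpe.suggest, max_evals=50)
print(best)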
  def fit(self, X, y):    
    X = self.build_matrix(X, y)
    param = {
      'silent': 1 if self.silent else 0, 
      'use_buffer': int(self.use_buffer),
      'num_round': self.num_round,
      'ntree_limit': self.ntree_limit,
      'nthread': self.nthread,
      'booster': self.booster,
      'eta': self.eta,
      'gamma': self.gamma,
      'max_depth': self.max_depth,
      'min_child_weight': self.min_child_weight,
      'subsample': self.subsample,
      'colsample_bytree': self.colsample_bytree,
      'max_delta_step': self.max_delta_step,
      'l': self.l,
      'alpha': self.alpha,
      'lambda_bias': self.lambda_bias,
      'objective': self.objective,
      'eval_metric': self.eval_metric,
      'seed': self.seed          
    }
    if self.num_class is not None:
      param['num_class']= self.num_class

    watchlist  = [(X,'train')]    
    if self.early_stopping_rounds > 0:
      self.bst = xgb.train(param, X, self.num_round, watchlist, early_stopping_rounds=self.early_stopping_rounds)
    else:
      self.bst = xgb.train(param, X, self.num_round, watchlist)

    return self
    def test_predict(self):
        iterations = 10
        np.random.seed(1)
        test_num_rows = [10, 1000, 5000]
        test_num_cols = [10, 50, 500]
        for num_rows in test_num_rows:
            for num_cols in test_num_cols:
                dtrain = xgb.DMatrix(np.random.randn(num_rows, num_cols), label=[0, 1] * int(num_rows / 2))
                dval = xgb.DMatrix(np.random.randn(num_rows, num_cols), label=[0, 1] * int(num_rows / 2))
                dtest = xgb.DMatrix(np.random.randn(num_rows, num_cols), label=[0, 1] * int(num_rows / 2))
                watchlist = [(dtrain, 'train'), (dval, 'validation')]
                res = {}
                param = {
                    "objective": "binary:logistic",
                    "predictor": "gpu_predictor",
                    'eval_metric': 'auc',
                }
                bst = xgb.train(param, dtrain, iterations, evals=watchlist, evals_result=res)
                assert self.non_decreasing(res["train"]["auc"])
                gpu_pred_train = bst.predict(dtrain, output_margin=True)
                gpu_pred_test = bst.predict(dtest, output_margin=True)
                gpu_pred_val = bst.predict(dval, output_margin=True)

                param["predictor"] = "cpu_predictor"
                bst_cpu = xgb.train(param, dtrain, iterations, evals=watchlist)
                cpu_pred_train = bst_cpu.predict(dtrain, output_margin=True)
                cpu_pred_test = bst_cpu.predict(dtest, output_margin=True)
                cpu_pred_val = bst_cpu.predict(dval, output_margin=True)
                np.testing.assert_allclose(cpu_pred_train, gpu_pred_train, rtol=1e-5)
                np.testing.assert_allclose(cpu_pred_val, gpu_pred_val, rtol=1e-5)
                np.testing.assert_allclose(cpu_pred_test, gpu_pred_test, rtol=1e-5)
Example #6
    def train_predict(self,X_train,y_train,X_test,base_train_prediction,base_test_prediction):
        xgmat_train = xgb.DMatrix(X_train, label=y_train,missing=-999)
        test_size = X_test.shape[0]
        param = {}
        param['objective'] = 'binary:logistic'

        param['bst:eta'] = self.eta
        param['colsample_bytree']=1
        param['min_child_weight']=self.min_child_weight
        param['bst:max_depth'] = self.depth
        param['eval_metric'] = 'auc'
        param['silent'] = 1
        param['nthread'] = self.threads
        plst = list(param.items())

        watchlist = [ (xgmat_train,'train') ]
        num_round = self.num_round

        xgmat_test = xgb.DMatrix(X_test,missing=-999)
    
        if self.boost_from_exist_prediction:
        # train xgb with existing predictions
        # see more at https://github.com/tqchen/xgboost/blob/master/demo/guide-python/boost_from_prediction.py
       
            xgmat_train.set_base_margin(base_train_prediction)
            xgmat_test.set_base_margin(base_test_prediction)
            bst = xgb.train(param, xgmat_train, self.exist_num_round, watchlist )
        else:
            bst = xgb.train( plst, xgmat_train, num_round, watchlist )
        ypred = bst.predict(xgmat_test)
        return ypred
	def test_custom_objective(self):
		param = {'max_depth':2, 'eta':1, 'silent':1 }
		watchlist  = [(dtest,'eval'), (dtrain,'train')]
		num_round = 2
		def logregobj(preds, dtrain):
			labels = dtrain.get_label()
			preds = 1.0 / (1.0 + np.exp(-preds))
			grad = preds - labels
			hess = preds * (1.0-preds)
			return grad, hess
		def evalerror(preds, dtrain):
			labels = dtrain.get_label()
			return 'error', float(sum(labels != (preds > 0.0))) / len(labels)
		
		# test custom_objective in training
		bst = xgb.train(param, dtrain, num_round, watchlist, logregobj, evalerror)
		assert isinstance(bst, xgb.core.Booster)
		preds = bst.predict(dtest)
		labels = dtest.get_label()
		err = sum(1 for i in range(len(preds)) if int(preds[i]>0.5)!=labels[i]) / float(len(preds))
		assert err < 0.1

		# test custom_objective in cross-validation
		xgb.cv(param, dtrain, num_round, nfold = 5, seed = 0,
	       obj = logregobj, feval=evalerror)

		# test maximize parameter
		def neg_evalerror(preds, dtrain):
			labels = dtrain.get_label()
			return 'error', float(sum(labels == (preds > 0.0))) / len(labels)
		bst2 = xgb.train(param, dtrain, num_round, watchlist, logregobj, neg_evalerror, maximize=True)
		preds2 = bst2.predict(dtest)
		err2 = sum(1 for i in range(len(preds2)) if int(preds2[i]>0.5)!=labels[i]) / float(len(preds2))
		assert err == err2
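The test above relies on module-level `dtrain` and `dtest` DMatrix objects that are not shown; in xgboost's own tests they are loaded from the bundled agaricus demo files, roughly like this (paths assume the xgboost source tree):

import xgboost as xgb

dpath = 'demo/data/'
dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')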
Example #8
    def test_fast_histmaker(self):
        variable_param = {'tree_method': ['hist'],
                          'max_depth': [2, 8],
                          'max_bin': [2, 256],
                          'grow_policy': ['depthwise', 'lossguide'],
                          'max_leaves': [64, 0],
                          'verbosity': [0]}
        for param in parameter_combinations(variable_param):
            result = run_suite(param)
            assert_results_non_increasing(result, 1e-2)

        # hist must be the same as exact on all-categorical data
        dpath = 'demo/data/'
        ag_dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
        ag_dtest = xgb.DMatrix(dpath + 'agaricus.txt.test')
        ag_param = {'max_depth': 2,
                    'tree_method': 'hist',
                    'eta': 1,
                    'verbosity': 0,
                    'objective': 'binary:logistic',
                    'eval_metric': 'auc'}
        hist_res = {}
        exact_res = {}

        xgb.train(ag_param, ag_dtrain, 10,
                  [(ag_dtrain, 'train'), (ag_dtest, 'test')],
                  evals_result=hist_res)
        ag_param["tree_method"] = "exact"
        xgb.train(ag_param, ag_dtrain, 10,
                  [(ag_dtrain, 'train'), (ag_dtest, 'test')],
                  evals_result=exact_res)
        assert hist_res['train']['auc'] == exact_res['train']['auc']
        assert hist_res['test']['auc'] == exact_res['test']['auc']
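`parameter_combinations` and `run_suite` are helpers that are not shown here. A minimal sketch of the former, assuming it simply expands the dict of value lists into one parameter dict per combination:

import itertools

def parameter_combinations(variable_param):
    # Hypothetical helper: yield one dict per combination of the listed values.
    names = sorted(variable_param)
    for values in itertools.product(*(variable_param[name] for name in names)):
        yield dict(zip(names, values))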
Example #9
def run_benchmark(args):
    print("Generating dataset: {} rows * {} columns".format(args.rows, args.columns))
    print("{}/{} test/train split".format(args.test_size, 1.0 - args.test_size))
    tmp = time.time()
    X, y = make_classification(args.rows, n_features=args.columns, random_state=7)
    if args.sparsity < 1.0:
       X = np.array([[np.nan if rng.uniform(0, 1) < args.sparsity else x for x in x_row] for x_row in X])

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=args.test_size, random_state=7)
    print ("Generate Time: %s seconds" % (str(time.time() - tmp)))
    tmp = time.time()
    print ("DMatrix Start")
    dtrain = xgb.DMatrix(X_train, y_train, nthread=-1)
    dtest = xgb.DMatrix(X_test, y_test, nthread=-1)
    print ("DMatrix Time: %s seconds" % (str(time.time() - tmp)))

    param = {'objective': 'binary:logistic'}
    if args.params != '':
        param.update(ast.literal_eval(args.params))

    param['tree_method'] = args.tree_method
    print("Training with '%s'" % param['tree_method'])
    tmp = time.time()
    xgb.train(param, dtrain, args.iterations, evals=[(dtest, "test")])
    print ("Train Time: %s seconds" % (str(time.time() - tmp)))
def run_benchmark(args, gpu_algorithm, cpu_algorithm):
    print("Generating dataset: {} rows * {} columns".format(args.rows,args.columns))
    tmp = time.time()
    X, y = make_classification(args.rows, n_features=args.columns, random_state=7)
    print ("Generate Time: %s seconds" % (str(time.time() - tmp)))
    tmp = time.time()
    print ("DMatrix Start")
    # omp way
    dtrain = xgb.DMatrix(X, y, nthread=-1)
    # non-omp way
    #dtrain = xgb.DMatrix(X, y)
    print ("DMatrix Time: %s seconds" % (str(time.time() - tmp)))

    param = {'objective': 'binary:logistic',
             'max_depth': 6,
             'silent': 0,
             'n_gpus': 1,
             'gpu_id': 0,
             'eval_metric': 'auc'}

    param['tree_method'] = gpu_algorithm
    print("Training with '%s'" % param['tree_method'])
    tmp = time.time()
    xgb.train(param, dtrain, args.iterations)
    print ("Train Time: %s seconds" % (str(time.time() - tmp)))

    param['silent'] = 1
    param['tree_method'] = cpu_algorithm
    print("Training with '%s'" % param['tree_method'])
    tmp = time.time()
    xgb.train(param, dtrain, args.iterations)
    print ("Time: %s seconds" % (str(time.time() - tmp)))
    def fit(self, X, y,num_boost_round=None):
        num_boost_round = num_boost_round or self.num_boost_round
        self.label2num = dict((label, i) for i, label in enumerate(sorted(set(y))))

        early_stopping = False
        if early_stopping == True:
            xg_train,xg_validate,xg_train_y,xg_validate_y = train_test_split(X,y,test_size=0.2)

            print(self.params)

            if self.params["objective"] == "binary:logistic":
                print "binary:logistic"
                dtrain = xgb.DMatrix(xg_train, label=xg_train_y)
                dvalid = xgb.DMatrix(xg_validate, label=xg_validate_y)
            else:
                dtrain = xgb.DMatrix(xg_train, label=[self.label2num[label] for label in xg_train_y])
                dvalid = xgb.DMatrix(xg_validate, label=[self.label2num[label] for label in xg_validate_y])
            #evallist  = [(dtrain,'train')]

            watchlist = [(dtrain,'train'),(dvalid,'val')]
            self.clf = xgb.train(self.params, dtrain, num_boost_round,watchlist,early_stopping_rounds=80)
        else:
            xg_train,xg_train_y = X,y
            if self.params["objective"] == "binary:logistic":
                print "binary:logistic"
                dtrain = xgb.DMatrix(xg_train, label=xg_train_y)
                watchlist = [(dtrain,'train')]
                self.clf = xgb.train(self.params, dtrain, num_boost_round,watchlist)
            else:
                dtrain = xgb.DMatrix(X, label=[self.label2num[label] for label in xg_train_y])
                watchlist = [(dtrain,'train')]
                self.clf = xgb.train(self.params, dtrain, num_boost_round,watchlist)
Example #12
    def test_multi_predict(self):
        from sklearn.datasets import make_regression
        from sklearn.model_selection import train_test_split

        n = 1000
        X, y = make_regression(n, random_state=rng)
        X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                            random_state=123)
        dtrain = xgb.DMatrix(X_train, label=y_train)
        dtest = xgb.DMatrix(X_test)

        params = {}
        params["tree_method"] = "gpu_hist"

        params['predictor'] = "gpu_predictor"
        bst_gpu_predict = xgb.train(params, dtrain)

        params['predictor'] = "cpu_predictor"
        bst_cpu_predict = xgb.train(params, dtrain)

        predict0 = bst_gpu_predict.predict(dtest)
        predict1 = bst_gpu_predict.predict(dtest)
        cpu_predict = bst_cpu_predict.predict(dtest)

        assert np.allclose(predict0, predict1)
        assert np.allclose(predict0, cpu_predict)
Example #13
    def fit(self, X, y):
        #data
        data = X
        label = y
        skf = cross_validation.StratifiedKFold(label, n_folds=10)
        for train_index, val_index in skf:
            dtrain = xgb.DMatrix(data[train_index], label=label[train_index])
            dval = xgb.DMatrix(data[val_index], label=label[val_index])
            break

        #set params
        plst = list(self.param.items())
        plst += [('eval_metric', 'merror')]
        evallist = [ (dtrain,'train'), (dval,'eval')]

        #train
        if self.bst is None:
            num_round = 100000
            self.bst = xgb.train(plst, dtrain, num_round, evals=evallist, early_stopping_rounds=100)
            self.best_score_ = 1-self.bst.best_score
            self.best_params_= self.bst.best_iteration

        #refit
        dtrain = xgb.DMatrix(data, label=label)
        num_round = self.best_params_
        self.bst = xgb.train(plst, dtrain, num_round, evals=evallist)
Example #14
        def XgbTrain(self, submitfile):
              offset = 5000
              X_train, y_train = self.dataMat, self.labelMat
              X_test = self.testData
              xgtest = xgb.DMatrix(X_test)
              
              xgtrain_train = xgb.DMatrix(X_train[offset:,:], label=y_train[offset:])
              xgtrain_val = xgb.DMatrix(X_train[:offset,:], label=y_train[:offset])
              
                      
              watchlist = [(xgtrain_train, 'train'),(xgtrain_val, 'val')]
              model = xgb.train(self.params_best, xgtrain_train, self.num_rounds_best, watchlist,early_stopping_rounds=self.early_stopping_rounds_best)
              preds1 = model.predict(xgtest)
                      
              X_train = X_train[::-1,:]
              y_train = y_train[::-1]

              xgtrain_train = xgb.DMatrix(X_train[offset:,:], label=y_train[offset:])
              xgtrain_val = xgb.DMatrix(X_train[:offset,:], label=y_train[:offset])

              watchlist = [(xgtrain_train, 'train'),(xgtrain_val, 'val')]
              model = xgb.train(self.params_best, xgtrain_train, self.num_rounds_best, watchlist, early_stopping_rounds=self.early_stopping_rounds_best)
              preds2 = model.predict(xgtest)
                      
              preds = preds1 + preds2
              #preds = pd.DataFrame({"Id": self.testid, "Hazard": preds})
              if submitfile!='':
                writer = csv.writer(open(submitfile, 'w', newline=''))
                writer.writerow(['ID','Hazard'])
                for i in range(len(preds)):
                    line = [self.testid[i], preds[i]]
                    writer.writerow(line)
	def train_predict(self,train_x,train_y,test_x):
		xgmat_train = xgb.DMatrix(train_x, label=train_y, missing=-9999)
		test_size = test_x.shape[0]
		params = {
			'booster':'gbtree',
			'objective':'binary:logistic',
			'silent':self.silent,
			'eta':self.eta,
			'gamma':self.gamma,
			'max_depth':self.max_depth,
			'min_child_weight': self.min_chile_weight,
			'subsample':self.subsample,
			'lambda':self.lambda_,
			'scale_pos_weight':self.scale_pos_weight,
			"colsample_bytree": self.colsample_bytree,
			'eval_metric': 'auc',
			'seed':2014,
			'nthread':self.threads
		}

		watchlist = [ (xgmat_train,'train') ]
		num_round = self.num_boost_round

		bst = xgb.train( params, xgmat_train, num_round, watchlist )
		xgmat_test = xgb.DMatrix(test_x,missing=-9999)

		if self.exist_prediction:
			tmp_train = bst.predict(xgmat_train, output_margin=True)
			tmp_test = bst.predict(xgmat_test, output_margin=True)
			xgmat_train.set_base_margin(tmp_train)
			xgmat_test.set_base_margin(tmp_test)
			bst = xgb.train(params, xgmat_train, self.exist_num_boost_round, watchlist )

		ypred = bst.predict(xgmat_test)
		return ypred
Example #16
def run(train_matrix,test_matrix):
    params = {'booster': 'gbtree',
              #'objective': 'multi:softmax',
              'objective': 'multi:softprob',
              'eval_metric': 'mlogloss',
              'gamma': 1,
              'min_child_weight': 1.5,
              'max_depth': 5,
              'lambda': 10,
              'subsample': 0.7,
              'colsample_bytree': 0.7,
              'colsample_bylevel': 0.7,
              'eta': 0.03,
              'tree_method': 'exact',
              'seed': 2017,
              'nthread': 12,
              "num_class":3
              }
    num_round = 10000
    early_stopping_rounds = 50
    watchlist = [(train_matrix, 'train'),
                 (test_matrix, 'eval')
                 ]
    if test_matrix:
        model = xgb.train(params, train_matrix, num_boost_round=num_round, evals=watchlist,
                      early_stopping_rounds=early_stopping_rounds
                      )
        pred_test_y = model.predict(test_matrix,ntree_limit=model.best_iteration)
        return pred_test_y, model
    else:
        model = xgb.train(params, train_matrix, num_boost_round=num_round
                      )
        return model
Example #17
def run_benchmark(args, gpu_algorithm, cpu_algorithm):
    print("Generating dataset: {} rows * {} columns".format(args.rows, args.columns))
    print("{}/{} test/train split".format(args.test_size, 1.0 - args.test_size))
    tmp = time.time()
    X, y = make_classification(args.rows, n_features=args.columns, random_state=7)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=args.test_size, random_state=7)
    print ("Generate Time: %s seconds" % (str(time.time() - tmp)))
    tmp = time.time()
    print ("DMatrix Start")
    # omp way
    dtrain = xgb.DMatrix(X_train, y_train, nthread=-1)
    dtest = xgb.DMatrix(X_test, y_test, nthread=-1)
    print ("DMatrix Time: %s seconds" % (str(time.time() - tmp)))

    param = {'objective': 'binary:logistic',
             'max_depth': 6,
             'silent': 0,
             'n_gpus': 1,
             'gpu_id': 0,
             'eval_metric': 'error',
             'debug_verbose': 0,
             }

    param['tree_method'] = gpu_algorithm
    print("Training with '%s'" % param['tree_method'])
    tmp = time.time()
    xgb.train(param, dtrain, args.iterations, evals=[(dtest, "test")])
    print ("Train Time: %s seconds" % (str(time.time() - tmp)))

    param['silent'] = 1
    param['tree_method'] = cpu_algorithm
    print("Training with '%s'" % param['tree_method'])
    tmp = time.time()
    xgb.train(param, dtrain, args.iterations, evals=[(dtest, "test")])
    print ("Time: %s seconds" % (str(time.time() - tmp)))
def evalModelOHE(train_data, eval_data, train_labels, eval_labels):
    params = {}
#    params["objective"] = "reg:linear"
#    params["eta"] = 0.05
#    params["min_child_weight"] = 8
#    params["subsample"] = 0.7
#    params["colsample_bytree"] = 0.7
#    params["scale_pos_weight"] = 1.0
#    params["silent"] = 1
#    params["max_depth"] = 8
#    params["max_delta_step"]=2
    params["objective"] = "reg:linear"
    params["eta"] = 0.013
    params["min_child_weight"] = 6
    params["subsample"] = 0.51
    params["colsample_bytree"] = 0.6
    params["scale_pos_weight"] = 1.0
    params["silent"] = 1
    params["max_depth"] = 10
    params["max_delta_step"]=1
    plst = list(params.items())
    
    xgtrain = xgb.DMatrix(train_data,label=train_labels)
    xgeval = xgb.DMatrix(eval_data,label=eval_labels)
    evallist  = [(xgeval,'eval'), (xgtrain,'train')]
    xgb.train(plst, xgtrain, num_boost_round=5000, evals=evallist,feval=evalerror)
Example #19
    def train(self, X, Y, getApproxError=False):
        dtrain = xgb.DMatrix(X, label=Y)
        self.bst = xgb.train(self.param, dtrain, self.nRounds)

        if getApproxError:

            e = 0.0
            c = 0.0

            kf = KFold(Y.shape[0], n_folds=4)
            for train_index, test_index in kf:

                XTrain = X[train_index, :]
                XTest  = X[test_index, :]

                YTrain = Y[train_index]
                YTest  = Y[test_index]

                dtrain2 = xgb.DMatrix(XTrain, label=YTrain)
                bst = xgb.train(self.param, dtrain2, self.nRounds)
              

                dtest = xgb.DMatrix(XTest)
                probs = bst.predict(dtest)
                ypred =numpy.argmax(probs, axis=1)

                

                error = float(numpy.sum(ypred != YTest))
                e += error
                c += float(len(YTest))

            e/=c

            return e
Example #20
def xgboost_model(train, test, num_round, params):
    """
    Takes in: training set, test set, number of estimators, params is a list

    Returns: predictions in correct format
    """
    X = train.as_matrix(train.columns[:-1]).astype(float)
    y = train.as_matrix(["cost"])[:, 0].astype(float)
    ylog1p = np.log1p(y)
    X_test = test.as_matrix(test.columns[:-1]).astype(float)

    xgb_train = xgb.DMatrix(X, label=ylog1p)
    xgb_test = xgb.DMatrix(X_test)

    # Round 1
    bst1 = xgb.train(params, xgb_train, num_round)
    y_pred1 = bst1.predict(xgb_test)

    # Round 2
    # num_round2 = 2000
    # bst2 = xgb.train(params, xgb_train, 2000)
    # y_pred2 = bst2.predict(xgb_test)

    # Power Train
    ypower3 = np.power(y, 1 / 47.0)
    xgb_train3 = xgb.DMatrix(X, label=ypower3)
    xst3 = xgb.train(params, xgb_train3, num_round)
    y_predp3 = xst3.predict(xgb_test)

    p = 0.5
    y_pred = p * np.expm1(y_pred1) + (1 - p) * np.power(y_predp3, 47.0)

    return y_pred
def xgb_features(X,y,Xtest,params=None,random_state=0,n_folds=4,early_stop=20,eval_with_gini=False):
	try:
		if params['objective'] == 'reg:logistic':
			yt = MinMaxScaler().fit_transform(y*1.)		
		else:
			yt = y
		skf = StratifiedKFold(yt, n_folds=n_folds,shuffle=True,random_state=random_state)
		ypred_test = np.zeros(Xtest.shape[0])
		ypred_train =np.zeros(X.shape[0])
		seed = random_state;
		dtest = xgb.DMatrix(data=Xtest)
		for train_index, test_index in skf:
			X_train, X_test = X[train_index], X[test_index]
			y_train, y_test = yt[train_index], yt[test_index]
			dtrain = xgb.DMatrix(data=X_train,label=y_train)
			dvalid = xgb.DMatrix(data=X_test,label=y_test)
			evallist = [(dtrain,'train'),(dvalid,'valid')]
			num_round = 5000
			params['seed'] = seed+1
			seed+=1
			plst = params.items()
			if eval_with_gini:
				bst = xgb.train( plst, dtrain, num_round,evallist,early_stopping_rounds=early_stop,feval=evalerror)
			else :
				bst = xgb.train( plst, dtrain, num_round,evallist,early_stopping_rounds=early_stop)
			ypred = bst.predict(dtest,ntree_limit=bst.best_iteration)
			ypred_valid = bst.predict(dvalid)
			print ("\tcross validation gini score %s: %f"%(params['objective'],gini(y_test,ypred_valid)))
			ypred_test += ypred
			ypred_train[test_index] = ypred_valid
	except KeyboardInterrupt:
		ypred_test = np.zeros(Xtest.shape[0]);
		ypred_train = np.zeros(X.shape[0]);
		return ypred_train, ypred_test		
	return ypred_train, ypred_test*1./n_folds
Example #22
def xgboost_pred(train,labels,test):
    params = {}
    params["objective"] = "reg:linear"
    params["eta"] = 0.01
    params["min_child_weight"] = 25
    params["subsample"] = 0.8
    params["colsample_bytree"] = 0.85
    params["scale_pos_weight"] = 1.0
    params["silent"] = 1
    params["max_depth"] = 10

    plst = list(params.items())

    #Using 8000 rows for early stopping. 
    offset = 8000

    num_rounds = 5000
    xgtest = xgb.DMatrix(test)

    #create a train and validation dmatrices 
    xgtrain = xgb.DMatrix(train[offset:,:], label=labels[offset:])
    xgval = xgb.DMatrix(train[:offset,:], label=labels[:offset])
#   xgtrain = xgb.DMatrix(train, label=labels)
    #xgval = xgb.DMatrix(train, label=labels)

    #train using early stopping and predict
    watchlist = [(xgtrain, 'train'),(xgval, 'val')]
    model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=50)

    preds_valid = model.predict(xgval)
    valid_gini=Gini(np.array(labels[:offset,]), preds_valid[:,])
    print(valid_gini)
    #model = xgb.train(plst, xgtrain, 1000)
    preds1 = model.predict(xgtest)


    #reverse train and labels and use different 5k for early stopping. 
    # this adds very little to the score but it is an option if you are concerned about using all the data. 
    train = train[::-1,:]
    labels = np.log(labels[::-1])

    xgtrain = xgb.DMatrix(train[offset:,:], label=labels[offset:])
    xgval = xgb.DMatrix(train[:offset,:], label=labels[:offset])
#   xgtrain = xgb.DMatrix(train, label=labels)


    watchlist = [(xgtrain, 'train'),(xgval, 'val')]
    model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=50)
    preds_valid = model.predict(xgval)
    valid_gini=Gini(np.array(labels[:offset,]), preds_valid[:,])
    print(valid_gini)
#   model = xgb.train(plst, xgtrain, 1000)
    preds2 = model.predict(xgtest)


    #combine predictions
    #since the metric only cares about relative rank we don't need to average
    preds = preds1*0.4 + preds2*0.6
    return preds
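`Gini` is referenced but not defined in this snippet. A common normalized-Gini implementation that fits the call signature (an assumption, not the original helper):

import numpy as np

def gini(actual, pred):
    # Rank-based Gini: only the ordering of pred matters.
    a = np.asarray(actual, dtype=float)[np.argsort(pred)[::-1]]
    n = len(a)
    cum = np.cumsum(a)
    return (cum.sum() / a.sum() - (n + 1) / 2.0) / n

def Gini(actual, pred):
    # Normalized so that a perfect ordering scores 1.0.
    return gini(actual, pred) / gini(actual, actual)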
Example #23
    def model(self):
        x = self.train[self.features].values
        x_test = self.test[self.features].values        
        
        # TODO compute from no. of rows & features
        k_folds = 10
        n_times = 1
        
        dx = xgb.DMatrix(x, label=self.y, missing = float('nan'))        
        dtest = xgb.DMatrix(x_test, missing = float('nan') )        
        
        # setup parameters for xgboost
        param = {}
        param['objective'] = 'multi:softmax'
        param['eval_metric'] = 'mlogloss'
        param['num_class'] = self.noOfClasses
        param['eta'] = 0.3
        param['max_depth'] = 4
        param['silent'] = 1
#            param['subsample'] = 0.5
#            param['colsample_bytree'] = 0.6        
        
        scores = []
        iters = []
        
        for n in range(n_times):
            print('---------------- ' + str(n+1))
            skf = StratifiedKFold(self.y, n_folds=k_folds, shuffle=True) 
            
    #        for train_index, validation_index in skf:
            for validation_index, train_index in skf:
                x_train, x_validate = x[train_index], x[validation_index]
                y_train, y_validate = self.y[train_index], self.y[validation_index]        
                
                dtrain = xgb.DMatrix(x_train, label=y_train, missing = float('nan') )
                dval = xgb.DMatrix(x_validate, label=y_validate, missing = float('nan') ) # 
                
                watchlist = [ (dtrain,'train'), (dval, 'test') ]
                num_round = 500
                clf = xgb.train(param, dtrain, num_round, watchlist, early_stopping_rounds=30, verbose_eval=False) # 
                scores.append(np.absolute(clf.best_score))
                iters.append(clf.best_iteration)
        
        print('\n=========== overall eval metric for ' + str(k_folds) + ' folds ===========')      

        print(' '.join(self.features) + '\n')
        print(param)
        print('XGBoost classifier = ' + str(1-np.mean(scores)) + ' +/- ' + str(round(np.std(scores)*100, 2)) + '%' )
        n_rounds_max = np.max(iters) + 1
        
        xgb_clf = xgb.train(param, dx, n_rounds_max)
        self.predictions = xgb_clf.predict(dtest)
        print(self.predictions)
        
        submission = pd.DataFrame({ self.id_col: self.test[self.id_col],
                            self.y.name: self.predictions })
        submission[self.y.name] = submission[self.y.name]
        submission.to_csv(os.path.join(self.directory, 'submission.csv' ), index=False)
Example #24
    def learn_and_predict_xgb(self, dataset='train'):
        '''
        Use xgboost to do work
        '''
        # predictors = ["Pclass", "Sex", "Age", "Fare", "Embarked", "FamilySize", "Titles", "FamilyId"]
        predictors = self.PREDICTORS
        if dataset == 'train':
            param_dist = {'max_depth': sp_randint(3, 10),
                          'learning_rate': [0.01, 0.03, 0.1, 0.3, 1.0],
                          'gamma': [0, 0.1, 0.2, 0.3],
                          'subsample': [.1, .2, .3, .4, 0.5],
                          'colsample_bytree': [.4, .5],
                          'objective': ['binary:logistic'],
                          'n_estimators': sp_randint(20, 150),
                          }

            clf = xgb.XGBClassifier()
            #random_search = RandomizedSearchCV(clf, param_distributions=param_dist, n_iter=500, cv=3)
            #random_search.fit(self.train_df[predictors], self.train_df['Survived'])

            #report(random_search.grid_scores_)
            params = {'max_depth': 6, 'learning_rate': 0.1, 'colsample_bytree': 0.5, 'n_estimators': 54, 'subsample': .3, 'gamma': 0, 'objective': 'binary:logistic', 'eval_metric': 'auc'}  # 0.845, cv=3
            bst = xgb.train(params, self.DMatrix_train)
            predictions = pd.Series(bst.predict(self.DMatrix_train))
            predictions[predictions >= .5] = 1
            predictions[predictions < .5] = 0
            predictions = [int(x) for x in predictions.tolist()]

            train_model = pd.DataFrame({
                'PassengerId': self.train_df['PassengerId'],
                'Survived': predictions,
            })
            train_model.to_csv('./xgb_train.csv', index=False)

        else:
            params = {'max_depth': 6, 'learning_rate': 0.1, 'colsample_bytree': 0.5, 'n_estimators': 54, 'subsample': .3, 'gamma': 0, 'objective': 'binary:logistic', 'eval_metric': 'auc'}  # 0.845, cv=3
            bst = xgb.train(params, self.DMatrix_train)
            #clf = xgb.XGBClassifier(params)
            #clf.fit(self.train_df[predictors], self.train_df['Survived'], verbose=True)
            #print(self.test_df[predictors])
            predictions = pd.Series(bst.predict(self.DMatrix_test))
            predictions_proba = predictions.copy()
            predictions[predictions >= .5] = 1
            predictions[predictions < .5] = 0
            predictions = [int(x) for x in predictions.tolist()]
            print(predictions)
            submission = pd.DataFrame({
                'PassengerId': self.test_df['PassengerId'],
                'Survived': predictions
            })
            submission.to_csv("xgboost_845.csv", index=False)

            submission_proba = pd.DataFrame({
                'PassengerId': self.test_df['PassengerId'],
                'Survived': predictions_proba,
            })
            submission_proba.to_csv("xgboost_845_soft.csv", index=False)
Example #25
	def predict(self, X, y, X_test, stage):
		np.random.seed(self.seed)
		n_train = X.shape[0]
		kf = KFold(n_train, n_folds=self.n_fold, shuffle=True)
		param = {}
		param['objective'] = self.obj
		param['eval_metric'] = self.eval_metric
		param['num_class'] = self.num_class
		param['nthread'] = self.nthread
		param['silent'] = self.silent
		param['eta'] = self.eta
		param['colsample_bytree'] = self.colsample_bytree
		param['subsample'] = self.subsample
		param['max_depth'] = self.max_depth
		param['max_delta_step'] = self.max_delta_step
		param['gamma'] = self.gamma
		param['alpha'] = self.alpha
		param['lambda'] = self.param_lambda
		num_round = 10000
		best_score = []
		best_iter = []
		y_pred_sum = np.zeros((X_test.shape[0], self.num_class))
		if stage=='base':
			meta_feat = np.zeros((n_train+X_test.shape[0], self.num_class))
		xg_test = xgb.DMatrix(X_test)
		i = 0
		for train, val in kf:
			i += 1
			print(i)
			X_train, X_val, y_train, y_val = X[train], X[val], y[train], y[val]
			xg_train = xgb.DMatrix(X_train, y_train)
			xg_val = xgb.DMatrix(X_val, y_val)
			evallist  = [(xg_train,'train'), (xg_val,'eval')]
			## CV sets
			# train
			bst = xgb.train(param, xg_train, num_round, evallist, early_stopping_rounds=100)
			best_score += [bst.best_score]
			best_iter += [bst.best_iteration]
			# predict
			if stage=='base':
				meta_feat[val, :] = bst.predict(xg_val, ntree_limit=bst.best_iteration)
			else:
				y_pred = bst.predict(xg_test, ntree_limit=bst.best_iteration)
				y_pred_sum = y_pred_sum+y_pred
		print(np.mean(best_score), np.std(best_score))
		## test set
		if stage=='base':
			# train
			xg_train = xgb.DMatrix(X, y)
			evallist  = [(xg_train,'train')]
			bst = xgb.train(param, xg_train, int(np.mean(best_iter)), evallist)
			# predict
			meta_feat[n_train:, :] = bst.predict(xg_test)
			return meta_feat
		else:
			y_pred = y_pred_sum/self.n_fold
			return y_pred
    def doclassify(self, type='normal'):                # Boosting
        if type == 'split':
            dtrainmis = xgb.DMatrix(array(self.misstrain), array(self.misstrain_y), missing=NAN)
            dtest = xgb.DMatrix(array(self.normaltest), missing=NAN)
            dtestmis = xgb.DMatrix(array(self.misstest), missing=NAN)
            param = {'bst:max_depth':10, 'bst:eta':0.02, 'silent':1, 'objective':'binary:logistic', 'subsample':0.8,"colsample_bytree": 0.68,"booster": "gbtree"}
            param['nthread'] = 4
            param['eval_metric'] = 'logloss'
            evallist  = [(dtrainmis, 'train')]
            num_round = 320
            bstmis = xgb.train(param, dtrainmis, num_round, evallist,)
            dtrain = xgb.DMatrix(array(self.normaltrain), array(self.normaltrain_y))
            num_round = 366
            evallist  = [(dtrain, 'train')]
            bst = xgb.train(param, dtrain, num_round, evallist,)
            ypredmis = bstmis.predict(dtestmis)
            ypred = bst.predict(dtest)
            result = []
            output1 = list(ypredmis)
            output2 = list(ypred)
            for i in self.test:
                if dp.List_dataProcess().check_missing(i):
                    result.append(output1.pop(0))
                else:
                    result.append(output2.pop(0))
            print(len(output1))
            print(len(output2))
            writer(self.id, result)

        if type == 'normal':
            dtrain = xgb.DMatrix(array(self.train_x), array(self.train_y))
            dtest = xgb.DMatrix(array(self.test))
            param = {'bst:max_depth':10, 'bst:eta':0.02, 'silent':1, 'objective':'binary:logistic', 'subsample':0.9,"colsample_bytree": 0.68,"booster": "gbtree"}
            param['nthread'] = 4
            param['eval_metric'] = 'logloss'
            evallist  = [(dtrain, 'train')]
            num_round = 300
            bst = xgb.train(param, dtrain, num_round, evallist,)
            ypred = bst.predict(dtest)
            writer(self.id, ypred)

            acc = 0.0
            for i in range(10000):
                if array(self.train_y)[len(self.train_y)-10000+i] == 1 and ypred[i] > 0.35:
                    acc += 1
                if array(self.train_y)[len(self.train_y)-10000+i] == 0 and ypred[i] <= 0.35:
                    acc += 1
            print "Accuracy : ", acc/10000
            fpr, tpr, thresholds = metrics.roc_curve(self.train_y[-10000:], ypred, pos_label=1)
            for i in range(len(fpr)):
                plt.plot(fpr[i], tpr[i], "b*")
                plt.plot(fpr, tpr)
            plt.title(val)
            plt.show()
            print "AUC : ", metrics.auc(fpr, tpr)
            print thresholds
Example #27
def train(X, y, available_devices):
    dtrain = xgb.dask.create_worker_dmatrix(X, y)
    local_device = available_devices[xgb.rabit.get_rank()]
    # Specify the GPU algorithm and device for this worker
    params = {"tree_method": "gpu_hist", "gpu_id": local_device}
    print("Worker {} starting training on {} rows".format(xgb.rabit.get_rank(), dtrain.num_row()))
    start = time.time()
    xgb.train(params, dtrain, num_boost_round=500)
    end = time.time()
    print("Worker {} finished training in {:0.2f}s".format(xgb.rabit.get_rank(), end - start))
Example #28
    def fit(self, X, y, X_valid=None, y_valid=None, sample_weight=None):
        """Fit training dafa.

        Parameters
        ----------
        X : array-like or sparse matrix, shape=(n_samples, n_features)
            The input samples.

        y : array-like, shape=(n_samples,)

        X_valid : array-like or sparse matrix, shape=(n_valid_samples, n_features)
            The validation samples.

        y_valid : array-like, shape=(n_valid_samples,)

        sample_weight : array-like, shape = [n_samples], optional


        Returns
        -------
        self : object
            Returns self.

        """
        X, y = self._ready_to_fit(X, y)
        xgb_params = self.get_xgb_params()

        # xgboost accepts dense, csc, csr
        if isinstance(X, sp.sparse.coo_matrix):
            X = X.tocsc()

        if sample_weight is not None:
            xg_train = xgb.DMatrix(X, label=y, weight=sample_weight)
        else:
            xg_train = xgb.DMatrix(X, label=y)
        watchlist = [ (xg_train,'train')]

        if not (X_valid is None):
            if isinstance(X_valid, sp.sparse.coo_matrix):
                X_valid = X_valid.tocsc()
            if sample_weight is not None:
                xg_valid = xgb.DMatrix(X_valid, label=y_valid, weight=sample_weight)
            else:
                xg_valid = xgb.DMatrix(X_valid, label=y_valid)
            watchlist = [ (xg_train,'train'), (xg_valid, 'valid') ]

        if self.verbose:
            # with watchlist
            self.bst_ = xgb.train(params=xgb_params, dtrain=xg_train, num_boost_round=int(self.n_iter), evals=watchlist, early_stopping_rounds=int(self.early_stopping_rounds))
        else:
            # without watchlist
            # early stopping is not available
            self.bst_ = xgb.train(params=xgb_params, dtrain=xg_train, num_boost_round=int(self.n_iter))

        return self
Example #29
def train(train_X, train_Y, validation_X, validation_Y, feature_names):

    train_X      = array(train_X);
    train_Y      = array(train_Y);
    validation_X = array(validation_X);
    validation_Y = array(validation_Y);

    print(type(train_X));

    dtrain = xgb.DMatrix( train_X, label=train_Y, missing=float('NaN'));
    dvalidation = xgb.DMatrix( validation_X, label=validation_Y,missing=float('NaN'))

    #Clean up data
    del train_X, validation_X, train_Y, validation_Y;

    #Track metrics on the watchlist
    watchlist = [ (dtrain,'train'), (dvalidation, 'validation') ]

    parameters_to_try = generateParams();

    best_params          = None;
    overall_best_metric  = 0;
    overall_best_nrounds = 0;

    for i in range(0, len(parameters_to_try)):
        param      = parameters_to_try[i];
        num_round  = 1000;

        classifier = xgb.train(param,dtrain,num_round,evals=watchlist,early_stopping_rounds=100);
        
        metric     = classifier.best_score;
        itr        = classifier.best_iteration;

        print("\n Metric : " + str(metric) + " for Params " + str(param) + " occurs at " + str(itr));

        if metric > overall_best_metric:
            overall_best_metric  = metric;
            best_params          = copy.copy(param);
            overall_best_nrounds = itr;

    print("\n Training the model on the entire training set with the best params")

    bst = xgb.train(best_params, dtrain, 1+overall_best_nrounds);
    print("\n\n Overall Best AUC : " + str(overall_best_metric) + " for Params " + str(best_params) + " occurs at " + str(overall_best_nrounds));
    

    feature_imp = bst.get_fscore();

    print("Feature Importance ... ");
    for w in sorted(feature_imp, key=feature_imp.get, reverse=True):
        print( str(feature_names[int(w.replace("f",""))]) + " : "  + str(feature_imp[w]) );

    
    return bst;
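`generateParams` is not shown above. A minimal sketch, assuming it returns a list of candidate parameter dicts for the AUC search; the grid values are illustrative:

import itertools

def generateParams():
    # Hypothetical grid of candidate parameter sets for the AUC search above.
    base = {'objective': 'binary:logistic', 'eval_metric': 'auc', 'silent': 1}
    params = []
    for eta, depth in itertools.product([0.05, 0.1], [4, 6, 8]):
        p = dict(base)
        p.update({'eta': eta, 'max_depth': depth})
        params.append(p)
    return params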
Example #30
    def buildXGB(self):
        '''
        train_shfl = train.iloc[np.random.permutation(len(train))]
        X = train_shfl.as_matrix(train_shfl.columns[:-1]).astype(float)
        y = train_shfl.as_matrix(['cost'])[:,0].astype(float)
        ylog1p = np.log1p(y).astype(float)
        '''

        X = self.train.as_matrix(self.train.columns[:-1]).astype(float)
        y = self.train.as_matrix(['cost'])[:,0].astype(float)
        ylog1p = np.log1p(y).astype(float)
        # cost is still last column
        X_test = self.test.as_matrix(self.test.columns[:-1]).astype(float)


        xgb_train = xgb.DMatrix(X, label = ylog1p)
        xgb_test = xgb.DMatrix(X_test)

        #Train multiple times

        # Round 1
        num_round1 = 4000
        self.bst1 = xgb.train(self.params, xgb_train, num_round1)
        y_pred1 = self.bst1.predict(xgb_test)

        # Round 2
        num_round2 = 2000
        self.bst2 = xgb.train(self.params, xgb_train, num_round2)
        y_pred2 = self.bst2.predict(xgb_test)

        #Power Train
        #ypower2 = np.power(y,1/5.0)
        ypower3 = np.power(y,1/20.0)

        #xgb_train2 = xgb.DMatrix(X, label = ypower2)
        xgb_train3 = xgb.DMatrix(X, label = ypower3)


        #self.xst2 = xgb.train(self.params, xgb_train2, self.num_round)
        #y_predp2 = self.xst2.predict(xgb_test)

        self.xst3 = xgb.train(self.params, xgb_train3, self.num_round)
        y_predp3 = self.xst3.predict(xgb_test)

        #y_power = (np.power(y_predp2,5.0) + np.power(y_predp3,10.0))/2.0
        y_power = np.power(y_predp3,20.0)

        self.y_pred = (np.expm1(0.75*y_pred1+0.25*y_pred2) + y_power)/2.0
        #self.y_pred = 0.35*np.expm1(0.75*y_pred1+0.25*y_pred2) + 0.65*y_power
        print()
        print("================================================================")
        print("================  Finished with Prediction   ===================")
        print("================================================================")
train_data = xgb.DMatrix(result_path + 'train.sparse')
test_data = xgb.DMatrix(result_path + 'test.sparse')

param = {
    'bst:max_depth': 10,
    'bst:min_child_weight': 5,
    'bst:eta': 0.5,
    'silent': 0,
    'objective': 'binary:logistic',
    'eval_metric': 'logloss'
}

param['nthread'] = 4
plst = param.items()
evallist = [(train_data, 'train')]

num_round = 30

bst = xgb.train(plst, train_data, num_round, evallist)

bst.dump_model(result_path + 'dump.raw.txt')

ypred = bst.predict(test_data)

output = open(result_path + 'submission.csv', 'w')
output.write('Id,Predicted\n')
for p in ypred:
    output.write('{0},{1}\n'.format('anything', p))

output.close()
Example #32
    'base_score': y_mean,  # base prediction = mean(target)
    'silent': 1
}

# prepare dict of params for xgboost to run with

# NOTE: Make sure that the class is labeled 'class' in the data file

dtrain = xgb.DMatrix(train.drop('y', axis=1), y_train)
dtest = xgb.DMatrix(test)

num_boost_rounds = 1350
# train model
model = xgb.train(
    dict(xgb_params, silent=0),
    dtrain,
    num_boost_round=num_boost_rounds,
    # early_stopping_rounds=50,
    verbose_eval=10)
y_pred = model.predict(dtest)
'''Train the stacked models then predict the test data'''

# Stacking with average
en = make_pipeline(RobustScaler(), PCA(n_components=12),
                   ElasticNet(alpha=0.001, l1_ratio=0.1))

rf = RandomForestRegressor(n_estimators=250,
                           n_jobs=4,
                           min_samples_split=25,
                           min_samples_leaf=25,
                           max_depth=3)
Example #33
    def run_training_continuation(self, xgb_params_01, xgb_params_02,
                                  xgb_params_03):
        from sklearn.datasets import load_digits
        from sklearn.metrics import mean_squared_error

        digits_2class = load_digits(n_class=2)
        digits_5class = load_digits(n_class=5)

        X_2class = digits_2class['data']
        y_2class = digits_2class['target']

        X_5class = digits_5class['data']
        y_5class = digits_5class['target']

        dtrain_2class = xgb.DMatrix(X_2class, label=y_2class)
        dtrain_5class = xgb.DMatrix(X_5class, label=y_5class)

        gbdt_01 = xgb.train(xgb_params_01, dtrain_2class, num_boost_round=10)
        ntrees_01 = len(gbdt_01.get_dump())
        assert ntrees_01 == 10

        gbdt_02 = xgb.train(xgb_params_01, dtrain_2class, num_boost_round=0)
        gbdt_02.save_model('xgb_tc.model')

        gbdt_02a = xgb.train(xgb_params_01,
                             dtrain_2class,
                             num_boost_round=10,
                             xgb_model=gbdt_02)
        gbdt_02b = xgb.train(xgb_params_01,
                             dtrain_2class,
                             num_boost_round=10,
                             xgb_model="xgb_tc.model")
        ntrees_02a = len(gbdt_02a.get_dump())
        ntrees_02b = len(gbdt_02b.get_dump())
        assert ntrees_02a == 10
        assert ntrees_02b == 10

        res1 = mean_squared_error(y_2class, gbdt_01.predict(dtrain_2class))
        res2 = mean_squared_error(y_2class, gbdt_02a.predict(dtrain_2class))
        assert res1 == res2

        res1 = mean_squared_error(y_2class, gbdt_01.predict(dtrain_2class))
        res2 = mean_squared_error(y_2class, gbdt_02b.predict(dtrain_2class))
        assert res1 == res2

        gbdt_03 = xgb.train(xgb_params_01, dtrain_2class, num_boost_round=3)
        gbdt_03.save_model('xgb_tc.model')

        gbdt_03a = xgb.train(xgb_params_01,
                             dtrain_2class,
                             num_boost_round=7,
                             xgb_model=gbdt_03)
        gbdt_03b = xgb.train(xgb_params_01,
                             dtrain_2class,
                             num_boost_round=7,
                             xgb_model="xgb_tc.model")
        ntrees_03a = len(gbdt_03a.get_dump())
        ntrees_03b = len(gbdt_03b.get_dump())
        assert ntrees_03a == 10
        assert ntrees_03b == 10

        os.remove('xgb_tc.model')

        res1 = mean_squared_error(y_2class, gbdt_03a.predict(dtrain_2class))
        res2 = mean_squared_error(y_2class, gbdt_03b.predict(dtrain_2class))
        assert res1 == res2

        gbdt_04 = xgb.train(xgb_params_02, dtrain_2class, num_boost_round=3)
        assert gbdt_04.best_ntree_limit == (gbdt_04.best_iteration +
                                            1) * self.num_parallel_tree

        res1 = mean_squared_error(y_2class, gbdt_04.predict(dtrain_2class))
        res2 = mean_squared_error(
            y_2class,
            gbdt_04.predict(dtrain_2class,
                            ntree_limit=gbdt_04.best_ntree_limit))
        assert res1 == res2

        gbdt_04 = xgb.train(xgb_params_02,
                            dtrain_2class,
                            num_boost_round=7,
                            xgb_model=gbdt_04)
        assert gbdt_04.best_ntree_limit == (gbdt_04.best_iteration +
                                            1) * self.num_parallel_tree

        res1 = mean_squared_error(y_2class, gbdt_04.predict(dtrain_2class))
        res2 = mean_squared_error(
            y_2class,
            gbdt_04.predict(dtrain_2class,
                            ntree_limit=gbdt_04.best_ntree_limit))
        assert res1 == res2

        gbdt_05 = xgb.train(xgb_params_03, dtrain_5class, num_boost_round=7)
        assert gbdt_05.best_ntree_limit == (gbdt_05.best_iteration +
                                            1) * self.num_parallel_tree
        gbdt_05 = xgb.train(xgb_params_03,
                            dtrain_5class,
                            num_boost_round=3,
                            xgb_model=gbdt_05)
        assert gbdt_05.best_ntree_limit == (gbdt_05.best_iteration +
                                            1) * self.num_parallel_tree

        res1 = gbdt_05.predict(dtrain_5class)
        res2 = gbdt_05.predict(dtrain_5class,
                               ntree_limit=gbdt_05.best_ntree_limit)
        np.testing.assert_almost_equal(res1, res2)
Example #34
X = format_input_for_network(X_train, N_FEAT)
X_eval = format_input_for_network(X_val, N_FEAT)
callback = tf.keras.callbacks.EarlyStopping(monitor='loss',
                                            patience=PATIENCE,
                                            restore_best_weights=True)

# Create our baseline model: a gradient boosted tree.
dtrain = xgb.DMatrix(X_train, label=y_train, silent=True)
param = {'max_depth': 2, 'eta': 1, 'objective': 'reg:squarederror'}
param['nthread'] = 4
param['eval_metric'] = 'rmse'
dtest = xgb.DMatrix(X_val, label=y_val, silent=True)
evallist = [(dtest, 'eval'), (dtrain, 'train')]
bst = xgb.train(param,
                dtrain,
                PATIENCE,
                evallist,
                early_stopping_rounds=10,
                verbose_eval=10)
ypred = bst.predict(dtest, ntree_limit=bst.best_ntree_limit)

# For ease of use and reproducibility I have saved the model,
# but here is the code to create it from scratch.
'''
# define model
model = Sequential()
model.add(LSTM(50, activation='relu', input_shape=(N_STEPS, N_FEAT), dropout=0.25))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mae')

# fit model
model.fit(X, y_train, epochs=200, verbose=1, callbacks=[callback])
Example #35
    'scale_pos_weight': 1,
    'colsample_bytree': 0.8,
    'eval_metric': 'logloss',
    'nthread': 8,
    'sample_type': 'uniform',
    'normalize_type': 'forest',
    'random_state': 1
}

plst = params.items()
evallist = [(xgb_train, 'train'), (xgb_val, 'eval')]
num_round = 500

bst = xgb.train(plst,
                xgb_train,
                num_round,
                evals=evallist,
                early_stopping_rounds=10)

#
# bst = xgb.cv(params=params, dtrain=xgb_train, nfold=5, metrics='logloss', verbose_eval=2, early_stopping_rounds=10)

# bst.save_model(out_path + 'xgb.model')

feat_importance = bst.get_fscore()
print(feat_importance)

with open(out_path + 'feat_importance.csv', 'w') as fo:
    for k in feat_importance.keys():
        fo.write(str(k) + ',')
    fo.write('\n')
Example #36
    # 'scale_pos_weight': 10,
    'eval_metric': 'auc',
    'subsample': 0.76,
    'colsample_bytree': 0.95,
    # 'n_estimators': 5000,
    # 'eta_decay': 0.5,
    'seed': 1,
    # 'min_child_weight': 0.8,
}

rounds = 10000

bst = xgb.train(param,
                dtrain,
                rounds,
                early_stopping_rounds=300,
                evals=evals,
                evals_result=gpu_res,
                verbose_eval=True)

# pickle.dump(bst, open(_dir+"/models/model.1", "wb"))

# bst = get_pickled(_dir, "models/model.1")

# trained = xgb.train(param, dtrain, 782)
# xgb.train(param, dtrain)

# imp = xgb.plot_importance(bst)
# scores = bst.get_score()
#
# scores_sort = sorted(scores.items(), key=lambda kv: kv[1])
Example #37
# this is log likelihood loss
def logregobj(preds, dtrain):
    labels = dtrain.get_label()
    preds = 1.0 / (1.0 + np.exp(-preds))
    grad = preds - labels
    hess = preds * (1.0 - preds)
    return grad, hess


# user defined evaluation function, return a pair metric_name, result
# NOTE: when you do customized loss function, the default prediction value is margin
# this may make builtin evaluation metric not function properly
# for example, we are doing logistic loss, the prediction is score before logistic transformation
# the builtin evaluation error assumes input is after logistic transformation
# Take this in mind when you use the customization, and maybe you need write customized evaluation function
def evalerror(preds, dtrain):
    labels = dtrain.get_label()
    # return a pair metric_name, result. The metric name must not contain a colon (:)
    # since preds are margin(before logistic transformation, cutoff at 0)
    return 'error', float(sum(labels != (preds > 0.0))) / len(labels)


# training with customized objective, we can also do step by step training
# simply look at xgboost.py's implementation of train
bst = xgb.train(param,
                dtrain,
                num_round,
                watchlist,
                obj=logregobj,
                feval=evalerror)
Example #38
            tr_angle_mat.append(0)

    tr_input_arr = np.array(tr_input_mat)
    tr_angle_arr = np.array(tr_angle_mat)
    dtrain = xgb.DMatrix(tr_input_arr, label=tr_angle_arr)
    param = {
        'max_depth': 6,
        'eta': 0.2,
        'subsample': 0.5,
        'silent': 1,
        'objective': 'binary:logistic'
    }

    watchlist = [(dtrain, 'train')]
    num_round = 3000  #10 #3000 # 1000
    bst = xgb.train(param, dtrain, num_round, watchlist)

    bst.dump_model('./dump.raw.txt')
    bst.save_model('./hoge.model')

### training end

# trade
portfolio = 1000000
LONG = 1
SHORT = 2
NOT_HAVE = 3
pos_kind = NOT_HAVE
HALF_SPREAD = 0.0015
SONKIRI_RATE = 0.05
RIKAKU_PIPS = 0.60
Example #39
    def fit(self, x_train, y_train, x_valid=None, y_valid=None):
        # xgb_start = time.time()
        best_model = None
        best_round = None
        best_score = {}
        if self.cv_folds is not None:
            log.logger.info('CrossValidation。。。。')
            d_train = xgb.DMatrix(x_train, label=y_train)
            cv_result = self._kfold(d_train)
            print('cv_result %s' % cv_result)
            print('type_cv_result %s' % type(cv_result))
            # min_rmse = pd.Series(cv_result['test-rmse-mean']).min()
            # self.best_score['min_test-rmse-mean'] = min_rmse
            # self.best_round = cv_result[cv_result['test-rmse-mean'].isin([min_rmse])].index[0]
            best_score['min_test-rmse-mean'] = pd.Series(
                cv_result['test-rmse-mean']).min()
            best_round = pd.Series(cv_result['test-rmse-mean']).idxmin()
            best_model = xgb.train(self.xgb_params, d_train, best_round)

        elif self.ts_cv_folds is not None:
            log.logger.info('TimeSeriesCrossValidation....')
            # time-series k-fold
            bst_score = 0
            details = []
            scores = []
            tscv = TimeSeriesSplit(n_splits=self.ts_cv_folds)
            if self.xgb_params['objective'] != 'reg:linear':
                log.logger.error('Objective ERROR........')
                exit()

            for n_fold, (tr_idx, val_idx) in enumerate(tscv.split(x_train)):
                print(f'the {n_fold} training start ...')
                tr_x, tr_y, val_x, val_y = x_train.iloc[tr_idx], y_train[
                    tr_idx], x_train.iloc[val_idx], y_train[val_idx]
                d_train = xgb.DMatrix(tr_x, label=tr_y)
                d_valid = xgb.DMatrix(val_x, label=val_y)
                watchlist = [(d_train, "train"), (d_valid, "valid")]
                xgb_model = xgb.train(
                    params=self.xgb_params,
                    dtrain=d_train,
                    num_boost_round=self.num_boost_round,
                    evals=watchlist,
                    early_stopping_rounds=self.early_stop_round)
                details.append((xgb_model.best_score, xgb_model.best_iteration,
                                xgb_model))

                # keep the fold whose best_score is the highest seen so far
                if xgb_model.best_score > bst_score:
                    bst_score = xgb_model.best_score
                    best_model = xgb_model
                    best_round = xgb_model.best_iteration
                scores.append(xgb_model.best_score)
            best_score['avg_score'] = np.mean(scores)

        else:
            log.logger.info('NonCrossValidation....')
            if x_valid is None and y_valid is None:
                # note the shift used by train_test_sp here
                # x_train, x_valid, y_train, y_valid = train_test_sp(x_train, y_train, test_size=0.2, shift=0)
                d_train = xgb.DMatrix(x_train, label=y_train)
                watchlist = [(d_train, "train")]
            else:
                d_train = xgb.DMatrix(x_train, label=y_train)
                d_valid = xgb.DMatrix(x_valid, label=y_valid)
                watchlist = [(d_train, "train"), (d_valid, "valid")]

            best_model = xgb.train(params=self.xgb_params,
                                   dtrain=d_train,
                                   num_boost_round=self.num_boost_round,
                                   evals=watchlist,
                                   verbose_eval=5,
                                   early_stopping_rounds=self.early_stop_round)
            best_round = best_model.best_iteration
            best_score['best_score'] = best_model.best_score
        # print('spend time :' + str((time.time() - xgb_start)) + '(s)')
        return best_score, best_round, best_model
    # training parameter settings
    params = {
            'objective': 'multi:softmax',
            'eta': 0.1,
            'max_depth': 9,
            'eval_metric': 'merror',
            'seed': 0,
            'missing': -999,
            'num_class':num_class,
            'silent' : 1
            }
    # drop the extra columns from the data
    feature=[x for x in train1.columns if x not in ['user_id','label','shop_id','time_stamp','mall_id','wifi_infos']]
    df_train_1 = df_train[feature]          # fill nulls in the training data with -100
    df_train_1 = df_train_1.where(df_train_1.notnull(), -100)
    xgbtrain = xgb.DMatrix(df_train_1, df_train['label'])
    df_test_1 = df_test[feature]           # fill nulls in the test data with -100
    df_test_1 = df_test_1.where(df_test_1.notnull(), -100)
    xgbtest = xgb.DMatrix(df_test_1)
    watchlist = [ (xgbtrain,'train'), (xgbtrain, 'test') ]   # xgbtest has no labels, so the training matrix is watched under both names
    num_rounds=100
    model = xgb.train(params, xgbtrain, num_rounds, watchlist, early_stopping_rounds=15)   # train the model
    df_test['label']=model.predict(xgbtest)        # predict
    df_test['shop_id']=df_test['label'].apply(lambda x:lbl.inverse_transform(int(x)))   # map the 1..N integer labels back to shop_id
    r=df_test[['row_id','shop_id']]  # keep the submission columns
    result=pd.concat([result,r])
    j = j + 1
    print(j)
result['row_id']=result['row_id'].astype('int')
result.to_csv('sub.csv',index=False)
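# The loop above assumes lbl is a fitted sklearn LabelEncoder mapping shop_id to integer labels.
# A minimal sketch of how such an encoder could be prepared before training (hypothetical,
# assuming the raw training frame carries a shop_id column):
from sklearn.preprocessing import LabelEncoder

lbl = LabelEncoder()
lbl.fit(df_train['shop_id'].values)
df_train['label'] = lbl.transform(df_train['shop_id'].values)
num_class = df_train['label'].max() + 1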
Beispiel #41
0
 def fit(self, data_x, data_y, num_class):
     if self.special_objective is None:
         # get the parameter list
         self.para_dict = {
             'max_depth': self.max_depth,
             'eta': self.eta,
             'silent': self.silent_mode,
             'objective': self.objective_func,
             'num_class': num_class,
             'eval_metric': self.eval_metric,
             'booster': self.booster
         }
     else:
         # get the parameter list, without stating the objective function
         self.para_dict = {
             'max_depth': self.max_depth,
             'eta': self.eta,
             'silent': self.silent_mode,
             'eval_metric': self.eval_metric,
             'booster': self.booster
         }
     # make sure data is in [nData * nSample] format
     assert len(data_x.shape) == 2
     # check if data length is the same
     if data_x.shape[0] != data_y.shape[0]:
         raise ValueError(
              'The number of instances in the x and y data must be the same!'
         )
     # data_x is in [nData*nDim]
     nData = data_x.shape[0]
     nDim = data_x.shape[1]
     # split the data into train and validation
     holistic_ind = np.random.permutation(nData)
     train_ind = holistic_ind[0:nData * 3 // 4]
     valid_ind = holistic_ind[nData * 3 // 4:nData]
     # indexing and get the data
     train_data = data_x[train_ind]
     train_label = data_y[train_ind]
     valid_data = data_x[valid_ind]
     valid_label = data_y[valid_ind]
      # convert the data to DMatrix form and train the estimator
     dtrain = xgb.DMatrix(train_data, label=train_label)
     dvalid = xgb.DMatrix(valid_data, label=valid_label)
     self.eval_list = [(dvalid, 'valid'), (dtrain, 'train')]
     if self.special_objective is None:
          # fit the classifier
         self.boosting_model = xgb.train(self.para_dict,
                                         dtrain,
                                         self.num_round,
                                         self.eval_list,
                                         verbose_eval=False)
     elif self.special_objective == 'weighted':
          # fit the classifier
         self.boosting_model = xgb.train(self.para_dict,
                                         dtrain,
                                         self.num_round,
                                         self.eval_list,
                                         weighted_binary_cross_entropy,
                                         evalerror,
                                         verbose_eval=False)
     elif self.special_objective == 'focal':
          # fit the classifier
         self.boosting_model = xgb.train(self.para_dict,
                                         dtrain,
                                         self.num_round,
                                         self.eval_list,
                                         focal_binary_object,
                                         evalerror,
                                         verbose_eval=False)
     else:
         raise ValueError(
              'The special objective mode was not recognized! It can only be \'weighted\' or \'focal\', but got '
             + str(self.special_objective))
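# weighted_binary_cross_entropy and focal_binary_object above are custom objectives defined
# elsewhere. A minimal sketch of a weighted binary cross-entropy objective of that shape,
# with imbalance_alpha as an assumed extra weight on the positive class:
import numpy as np

def weighted_binary_cross_entropy(pred, dtrain, imbalance_alpha=2.0):
    # gradient/hessian of -[alpha*y*log(p) + (1-y)*log(1-p)] with respect to the raw margin
    label = dtrain.get_label()
    sigmoid_pred = 1.0 / (1.0 + np.exp(-pred))
    grad = -(imbalance_alpha ** label) * (label - sigmoid_pred)
    hess = (imbalance_alpha ** label) * sigmoid_pred * (1.0 - sigmoid_pred)
    return grad, hess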
Beispiel #42
0
            'nthread': 20,
            'seed': 42,
            'silent': 0
        }
        if not online:
            Dtrain = xgb.DMatrix(train[features],
                                 train[target],
                                 feature_names=features)
            Dtest = xgb.DMatrix(test[features],
                                test[target],
                                feature_names=features)
            watchlist = [(Dtrain, 'train'), (Dtest, 'val')]

            clf = xgb.train(xgb_pars,
                            Dtrain,
                            num_boost_round=450,
                            verbose_eval=1,
                            evals=watchlist)

            #xx = clf.predict(Dtrain, output_margin=False, ntree_limit=0, pred_leaf=True)

            train['lgb_predict'] = clf.predict(Dtrain)
            print('train log_loss',
                  log_loss(train[target], train['lgb_predict']))

            test['lgb_predict'] = clf.predict(Dtest)
            print('test log_loss', log_loss(test[target], test['lgb_predict']))
            """
             xgb_feature = clf.predict(Dtrain,pred_leaf=True)
             xgb_feature = pd.DataFrame(xgb_feature,columns=["xgb_{}".format(i+1) for i in range(xgb_feature.shape[1])])
             xgb_feature['instance_id'] = train['instance_id']
def xgb_train(train_df, test_df, mode, params, num_boost_round,
              early_stopping):

    if mode == "train":

        train = train_df.values[:, 1:-1]
        train_target = train_df.values[:, -1]

        # 5-fold
        kf = KFold(n_splits=5, shuffle=True)
        trainEorror = 0
        error = 0

        for train_index, valid_index in kf.split(train):
            x_train, x_valid = train[train_index], train[valid_index]
            y_train, y_valid = train_target[train_index], train_target[
                valid_index]

            dtrain = xgb.DMatrix(x_train, y_train)
            dvalid = xgb.DMatrix(x_valid, y_valid)
            watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
            gbm = xgb.train(params,
                            dtrain,
                            num_boost_round,
                            evals=watchlist,
                            early_stopping_rounds=early_stopping,
                            verbose_eval=True)

            print("validating")

            tranHat = gbm.predict(xgb.DMatrix(x_train))
            trainEorror += rmsep(y_train, tranHat)
            yhat = gbm.predict(xgb.DMatrix(x_valid))
            error += rmsep(y_valid, yhat)

        print('rmse:{:.6f}'.format(error / 5.0))

    else:

        train = train_df.values[:, 1:-1]
        train_target = train_df.values[:, -1]

        kf = KFold(n_splits=5, shuffle=True)

        result = np.zeros(2000)

        dtest = test_df.values[:, 1:]
        dtest = xgb.DMatrix(dtest)

        for train_index, valid_index in kf.split(train):
            x_train, x_valid = train[train_index], train[valid_index]
            y_train, y_valid = train_target[train_index], train_target[
                valid_index]

            dtrain = xgb.DMatrix(x_train, y_train)
            watchlist = [(dtrain, 'train')]
            gbm = xgb.train(
                params,
                dtrain,
                num_boost_round=num_boost_round,
                evals=watchlist,
                early_stopping_rounds=early_stopping,
            )

            result += gbm.predict(dtest)

        result = result / 5.0
        return result
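# rmsep above is a scoring helper that is not shown; a plausible stand-in, assumed here to be a
# plain root-mean-squared error between target and prediction:
import numpy as np

def rmsep(y_true, y_pred):
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return float(np.sqrt(np.mean((y_true - y_pred) ** 2)))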
Beispiel #44
0
def xgb_model_val():

    #  'is_first_get_coupon','user_hour_count_label','context_timestamp_rank_desc_label'  user_item_brand_count  user_diff_shop_count
    train_set = pd.read_csv('data/ftrain.csv', sep=",")
    validate_set = pd.read_csv('data/fvalidate.csv', sep=",")

    train_x = train_set.drop([
        'instance_id', 'context_id', 'item_city_id', 'item_id', 'user_id',
        'item_brand_id', 'shop_id', 'user_gender_id', 'user_occupation_id',
        'is_trade', 'context_timestamp', 'context_page_id',
        'context_timestamp_and_dates', 'dates', 'day', 'hour',
        'item_category_list', 'item_property_list',
        'predict_category_property', 'is_first_get_coupon',
        'context_timestamp_rank_desc_label', 'category',
        'user_shop_count_istrade_rate', 'user_item_brand_count',
        'user_istrade_rate', 'user_diff_shop_count',
        'user_count_minus_user_count_istrade',
        'user_item_count_minus_user_item_istrade', 'user_count_istrade',
        'user_and_user_occupation_count_label',
        'predict_property_jiaoji_item_property',
        'user_shop_count_minus_user_shop_istrade'
    ],
                             axis=1)
    train_y = train_set['is_trade']

    # # correlation analysis
    # pearson_analysis_feature(train_x,train_y)
    # return

    val_x = validate_set.drop([
        'instance_id', 'context_id', 'item_city_id', 'item_id', 'user_id',
        'item_brand_id', 'shop_id', 'user_gender_id', 'user_occupation_id',
        'is_trade', 'context_timestamp', 'context_page_id',
        'context_timestamp_and_dates', 'dates', 'day', 'hour',
        'item_category_list', 'item_property_list',
        'predict_category_property', 'is_first_get_coupon',
        'context_timestamp_rank_desc_label', 'category',
        'user_shop_count_istrade_rate', 'user_item_brand_count',
        'user_istrade_rate', 'user_diff_shop_count',
        'user_count_minus_user_count_istrade',
        'user_item_count_minus_user_item_istrade', 'user_count_istrade',
        'user_and_user_occupation_count_label',
        'predict_property_jiaoji_item_property',
        'user_shop_count_minus_user_shop_istrade'
    ],
                              axis=1)
    val_y = validate_set['is_trade']

    xgb_train = xgb.DMatrix(train_x, label=train_y)
    xgb_val = xgb.DMatrix(val_x, label=val_y)

    params = {
        'booster': 'gbtree',
        'objective': 'binary:logistic',  # binary classification problem
        # 'gamma':0.1,  # controls post-pruning; larger is more conservative, typically around 0.1 or 0.2
        'max_depth': 5,  # tree depth; deeper trees overfit more easily
        # 'lambda':2,  # L2 regularization on the weights; larger values make overfitting less likely
        'subsample': 0.8,  # random row subsampling of the training data
        'colsample_bytree': 0.8,  # column subsampling when building each tree
        'min_child_weight': 3,
        # defaults to 1; it is the minimum sum of the hessian h in a leaf. For imbalanced 0-1
        # classification, if h is around 0.01, min_child_weight = 1 means a leaf needs roughly
        # 100 samples. This parameter strongly affects the result: it bounds the sum of the
        # second derivatives in a leaf, and smaller values make overfitting easier.
        'silent': 0,  # 1 suppresses run-time output; 0 is usually preferable
        'eta': 0.03,  # acts like a learning rate
        'nthread': 30,  # number of CPU threads
        'eval_metric': 'logloss'  # evaluation metric
    }

    plst = list(params.items())
    num_rounds = 500  # number of boosting rounds
    watchlist = [(xgb_train, 'train'), (xgb_val, 'val')]
    # early_stopping_rounds: when the number of rounds is large, training can stop early if the
    # metric has not improved within the given number of rounds
    model = xgb.train(plst, xgb_train, num_rounds, watchlist)   # train the model
    importance = model.get_fscore()

    # ----------------------- feature importance start -----------------------------------------
    importance = sorted(importance.items(),
                        key=operator.itemgetter(1),
                        reverse=True)
    df = pd.DataFrame(importance, columns=['feature', 'fscore'])
    df['fscore'] = df['fscore'] / df['fscore'].sum()
    print(df)
    outfile.close()
features = [x for x in train_data.columns]
ceate_feature_map(features)

import xgboost as xgb
from xgboost import plot_importance
print('start running ....')
dtrain = xgb.DMatrix(x_train, label=y_train)
dval = xgb.DMatrix(x_val, label=y_val)
param = {'learning_rate' : 0.1,
        'n_estimators': 1000,
        'max_depth': 4,
        'min_child_weight': 7,
        'gamma': 0,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'eta': 0.05,
        'silent': 1,
        }

num_round = 100
plst = list(param.items())
plst += [('eval_metric', 'rmse')]
evallist = [(dval, 'eval'), (dtrain, 'train')]
bst = xgb.train(plst, dtrain, num_round, evallist, early_stopping_rounds=50)
dtest = xgb.DMatrix(test_data)
y3 = bst.predict(dtest)
plot_importance(bst)
plt.show()
importance = bst.get_fscore(fmap='xgb.fmap')
importance = sorted(importance.items(), key=operator.itemgetter(1))
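# ceate_feature_map above is assumed to write the 'xgb.fmap' file later read by get_fscore;
# a minimal sketch of such a helper (one line per feature: index, name, 'q' for quantitative):
def ceate_feature_map(features):
    with open('xgb.fmap', 'w') as f:
        for i, feat in enumerate(features):
            f.write('{0}\t{1}\tq\n'.format(i, feat))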
Beispiel #46
0
params = {
    'booster': 'gbtree',
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'max_depth': 4,
    'lambda': 10,
    'subsample': 0.75,
    'colsample_bytree': 0.75,
    'min_child_weight': 2,
    'eta': 0.025,
    'seed': 0,
    'nthread': 8,
    'silent': 1
}

watchlist = [(dtrain, 'train')]

bst = xgb.train(params, dtrain, num_boost_round=100, evals=watchlist)

ypred = bst.predict(dtest)

# set a threshold and print some evaluation metrics
y_pred = (ypred >= 0.5) * 1

print('AUC: %.4f' % metrics.roc_auc_score(test_y, ypred))
print('Accuracy: %.4f' % metrics.accuracy_score(test_y, y_pred))
print('Recall: %.4f' % metrics.recall_score(test_y, y_pred))
print('F1-score: %.4f' % metrics.f1_score(test_y, y_pred))
print('Precision: %.4f' % metrics.precision_score(test_y, y_pred))
# metrics.confusion_matrix(test_y,y_pred)
Beispiel #47
0
def make_one_step(_df_in, _it):
    _df = _df_in.copy()
    _ratio = 4
    # _x_train, _y_train, _x_val, _y_val = _dataframe
    _y_train = _df.pop('notified')
    _sample_set = get_pickled(data_features_prefix, features_storm_set)
    if _sample_set is None:
        _sample_set = set()

    _sample = take_sample_dim(_df, _ratio)
    _sample = persist_sample(_df, _sample, _sample_set, _ratio)

    # remove_columns(_df, _sample)

    # _cat_df = pd.DataFrame({col: _df[col].astype('category').cat.codes for col in _df},
    #                   index=_df.index)

    _dummies_df = pd.get_dummies(_df, drop_first=True)
    _dummies_df['notified'] = _y_train

    # _df_positives = get_rows(_dummies_df, 'notified', 1)
    # _df_negatives = get_rows(_dummies_df, 'notified', 0)

    train_indexes, valid_indexes = get_pickled(_dir,
                                               "train_valid_shuffled_indexes")

    _train_df = _dummies_df.iloc[train_indexes]
    _valid_df = _dummies_df.iloc[valid_indexes]

    # _train_df, _valid_df = get_split_train_valid(_df_positives, _df_negatives, 0.8)
    _y_train = _train_df.pop('notified')
    _y_valid = _valid_df.pop('notified')

    _tmp = time.time()
    _gpu_res = {}
    _dtrain = xgb.DMatrix(_train_df.values,
                          label=_y_train.values,
                          missing=-999)
    _dval = xgb.DMatrix(_valid_df.values, label=_y_valid.values, missing=-999)
    _evals = [(_dval, 'valid')]

    _param = {
        'objective': 'binary:logistic',  # binary classification
        'num_class': 1,  # number of output classes (only meaningful for multiclass objectives)
        'tree_method': 'gpu_exact',  # Use GPU accelerated algorithm
        'scale_pos_weight': 16.32,
        'gpu_id': 1,
        # 'scale_pos_weight': 1,
        # 'scale_pos_weight': 10,
        'eval_metric': 'auc',
        'subsample': 0.8,
        'colsample_bytree': 0.9,
        # 'n_estimators': 5000,
        # 'eta_decay': 0.5,
        'seed': 1,
        # 'min_child_weight': 0.8,
    }

    _rounds = 10000
    _bst = None
    try:
        _bst = xgb.train(_param,
                         _dtrain,
                         _rounds,
                         early_stopping_rounds=300,
                         evals=_evals,
                         evals_result=_gpu_res,
                         verbose_eval=True)
    except Exception:
        _bst = xgb.train(_param,
                         _dtrain,
                         _rounds,
                         early_stopping_rounds=300,
                         evals=_evals,
                         evals_result=_gpu_res,
                         verbose_eval=True)

    print("GPU Training Time: %s seconds" % (str(time.time() - _tmp)))

    _predicted = _bst.predict(_dval)
    _auc = metrics.roc_auc_score(_y_valid, _predicted)
    # auc = trainClassifier(clf, xtrain, ytrain, xtest, ytest)
    write_chosen_solution(_sample, _auc)
    print("Iteration: {} AUC: {} sample: {}".format(_it, _auc, _sample))
    i = 1
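# get_pickled and write_chosen_solution above are project helpers that are not shown; a minimal
# sketch of get_pickled consistent with how it is used here (returns None when the file is missing):
import os
import pickle

def get_pickled(directory, name):
    path = os.path.join(directory, name)
    if not os.path.exists(path):
        return None
    with open(path, 'rb') as f:
        return pickle.load(f)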
Beispiel #48
0
def xgboost_euro(conn, num, df):
    # repeat this num times
    for j in range(1, num+1):
        print("{}번째".format(j))
        df_xg = df
        # add one day and skip weekends
        today = date.today() + relativedelta(days=+j)
        if getDay(today.year, today.month, today.day) == 'Sat' or getDay(today.year, today.month, today.day) == 'Sun':
            continue
        else:
            df_xg = df_xg.append({"Date": pd.Timestamp(
                today), "euro_close": float('nan')}, ignore_index=True)

        # add one day at a time, predict it, then append the next day; repeat to check 1 week, 2 weeks, and 1 month ahead

        # XGBOOST
        # extract the date feature
        df_xg['day'] = df_xg.Date.dt.day
        df_xg['dayofweek'] = df_xg.Date.dt.dayofweek
        df_xg['dayofyear'] = df_xg.Date.dt.dayofyear
        df_xg['week'] = df_xg.Date.dt.week
        df_xg['month'] = df_xg.Date.dt.month
        df_xg['year'] = df_xg.Date.dt.year
        df_xg = df_xg.drop('Date', axis=1)
        # with time-series data, feeding earlier values in as features can give a more accurate fit
        # these are known as lag features
        # build the lag features below
        for i in range(1, 6):
            df_xg['lag'+str(i)] = df_xg.euro_close.shift(i).fillna(0)

        X = df_xg.drop('euro_close', axis=1)
        y = df_xg.euro_close

        X_train, X_test = X[:-1], X[-1:]
        y_train, y_test = y[:-1], y[-1:]

        # convert data to xgb matrix form
        dtrain = xgb.DMatrix(X_train, label=y_train)
        dtest = xgb.DMatrix(X_test)

        # bayesian hyper parameter tuning
        # define the params
        def xgb_evaluate(max_depth, gamma, colsample_bytree):
            params = {'eval_metric': 'rmse',
                      'max_depth': int(max_depth),
                      'subsample': 0.8,
                      'eta': 0.1,
                      'gamma': gamma,
                      'colsample_bytree': colsample_bytree}

            cv_result = xgb.cv(params, dtrain, num_boost_round=250, nfold=3)
            return -1.0 * cv_result['test-rmse-mean'].iloc[-1]

        # run optimizer
        xgb_bo = BayesianOptimization(xgb_evaluate, {'max_depth': (3, 7),
                                                     'gamma': (0, 1),
                                                     'colsample_bytree': (0.3, 0.9)})
        # define iter points
        xgb_bo.maximize(init_points=10, n_iter=15, acq='ei')

        # get the best parameters
        params = xgb_bo.max['params']
        params['max_depth'] = int(round(params['max_depth']))
        # train the data
        model = xgb.train(params, dtrain, num_boost_round=200)

        # predict the test data
        predictions = model.predict(dtest)

        lenv_ = len(df_xg)
        df_xg.euro_close[lenv_-1] = predictions
        df_xg = df_xg.drop(['day', 'dayofweek', 'dayofyear', 'week', 'month',
                            'year', 'day', 'lag1', 'lag2', 'lag3', 'lag4', 'lag5'], axis=1)
        if getDay(today.year, today.month, today.day) == 'Sat' or getDay(today.year, today.month, today.day) == 'Sun':
            continue
        else:
            df = df.append({"Date": pd.Timestamp(today),
                            "euro_close": predictions[0]}, ignore_index=True)
        euro_close = float(predictions[0])
        print(today)
        print(type(today))
        print(euro_close)
        print(type(euro_close))
        xgboost_EURO(conn, today, euro_close)
        xgboost_EURO_remove(conn)
    return
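# getDay above is assumed to return an English weekday abbreviation ('Mon' ... 'Sun');
# a minimal sketch of such a helper (locale-dependent, gives 'Sat'/'Sun' in an English locale):
from datetime import date

def getDay(year, month, day):
    return date(year, month, day).strftime('%a')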
def xg_train_wrapper(parser):
    xgdata, data_test, params, params_t, params_other = conf_parser(
        parser.conf)
    '''x,y = load_svmlight_file(xgdata)
    x = x.todense()
    test_x,test_y = load_svmlight_file(data_test)
    test_x = test_x.todense()'''
    df_train = pd.read_csv(xgdata)
    y = df_train['label'].values
    x_columns = [
        item for item in df_train.columns if item not in ['label', 'id']
    ]
    x = df_train[x_columns].values

    df_test = pd.read_csv(data_test)
    test_y = df_test['label'].values
    x_test_columns = [
        item for item in df_test.columns if item not in ['label', 'id']
    ]
    test_x = df_test[x_test_columns].values

    x_train, x_val, y_train, y_val = train_test_split(x,
                                                      y,
                                                      test_size=0.3,
                                                      random_state=42)
    dtrain = xgb.DMatrix(x_train, label=y_train)
    dval = xgb.DMatrix(x_val, y_val)
    dtrain_whole = xgb.DMatrix(x, label=y)
    watchlist = [(dtrain, 'train'), (dval, 'eval')]
    watchlist_whole = [(dtrain_whole, 'eval')]
    scale_pos_weight = get_negative_positive_ratio(y)

    params['scale_pos_weight'] = scale_pos_weight
    custom_feval = set_custom_eval_metirc(params_other['eval_metric'])
    log = log_class.log_class('grid_search_xgb', params_other['log_dir'])
    log.add('scale_pos_weight:' + str(scale_pos_weight))
    log.add('eval_metric:' + params_other['eval_metric'])
    print(params)
    num_round = tune_num_boost_round(params,
                                     dtrain,
                                     params_other['num_round'],
                                     log,
                                     watchlist,
                                     eval_metric=params_other['eval_metric'],
                                     feval=custom_feval,
                                     ascend=params_other['ascend'])

    params_t = [
        dict(max_depth=params_t['max_depth']),
        dict(subsample=params_t['subsample']),
        dict(min_child_weight=params_t['min_child_weight']),
        dict(colsample_bytree=params_t['colsample_bytree']),
        dict(colsample_bylevel=params_t['colsample_bylevel']),
        dict(max_delta_step=params_t['max_delta_step']),
        dict(gamma=params_t['gamma'])
    ]
    for param_t in params_t:
        k = list(param_t.keys())[0]
        values = param_t[k]
        if (k == 'num_round'):
            continue
        log.add("=====" + str(k) + "=======" + str(values))
        print('========== ', k, ' ========== ', values)
        result = []
        if (len(values) == 1):
            params[k] = values[0]
            continue
        for v in values:
            print('**** for : %s ****\n' % (str(v)))
            log.add("**** for :" + str(v) + "****")
            params[k] = v
            if custom_feval is None:
                params['eval_metric'] = params_other['eval_metric']
            result_df = xgb.cv(
                params=params,
                dtrain=dtrain_whole,
                num_boost_round=num_round,
                nfold=params_other['cv'],
                # metrics=params_other['eval_metric'],
                feval=custom_feval,
                stratified=True,
                verbose_eval=False,
                show_stdv=False,
                shuffle=True,
                early_stopping_rounds=100)
            result_df = result_df[[
                'test-' + params_other['eval_metric'] + '-mean'
            ]]
            assert result_df.columns[0] == 'test-' + params_other[
                'eval_metric'] + '-mean', 'choose the correct column\n'
            result_np = result_df.values
            result.append(float(result_np[-1][0]))
        print(list(zip(values, result)))
        if (params_other['ascend'] == 1):
            loc = max(enumerate(result), key=lambda x: x[1])[0]
        else:
            loc = min(enumerate(result), key=lambda x: x[1])[0]
        params[k] = values[loc]
        print('%s : %s\n' % (k, params[k]))
        log.add(k)
        log.add(str(params[k]))
    num_round = tune_num_boost_round(params,
                                     dtrain_whole,
                                     params_other['num_round'],
                                     log,
                                     watchlist_whole,
                                     eval_metric=params_other['eval_metric'],
                                     feval=custom_feval,
                                     ascend=params_other['ascend'])
    model = xgb.train(params,
                      dtrain_whole,
                      num_round,
                      watchlist_whole,
                      feval=custom_feval)
    pprint.pprint(params)
    time_str = time.strftime("%Y_%m_%d_%H_%M_%S", time.localtime())
    if not os.path.isdir('./models'):
        os.mkdir('./models')
    dataname_model_path = os.path.join('./models', params_other['log_dir'])
    if not os.path.isdir(dataname_model_path):
        os.mkdir(dataname_model_path)
    model.save_model(dataname_model_path + '/' + time_str + '.xgmodel')
    print('saved : %s' % (dataname_model_path + '/' + time_str + '.xgmodel'))
    predict_test(model, test_x, test_y, log)
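# get_negative_positive_ratio above is assumed to return the value used for scale_pos_weight
# (number of negative samples per positive sample); a minimal sketch:
import numpy as np

def get_negative_positive_ratio(y):
    y = np.asarray(y)
    positives = max(int((y == 1).sum()), 1)
    return float((y == 0).sum()) / positives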
Beispiel #50
0
y_test = np.array(YY[0:start].reshape(1, -1)[0])
x_test = np.array(XX.iloc[0:start, selected])
dtrain = xgb.DMatrix(x2, label=y2)
dtest = xgb.DMatrix(x_test, label=y_test)
param = {
    'max_depth': 20,
    'eta': 1,
    'silent': 1,
    'objective': 'binary:logistic'
}
evallist = [(dtrain, 'train')]
num_round = 10

feature_names = dict(np.array([range(0, XX.shape[1]), np.array(XX.columns)]).T)

bst = xgb.train(param, dtrain, num_round, evallist)
#bst.save_model('0001.model')
#bst.dump_model('dump.raw.txt')

pred01 = bst.predict(dtrain)
Y = y2

pred = []
for cutoff in np.linspace(0.001, 0.999, 2000):
    pred0 = pred01.copy()   # start from the raw training predictions on every iteration
    pred0[pred0 < cutoff] = 0
    pred0[pred0 != 0] = 1
    pred.append(accuracy_score(np.array(Y), pred0))

thre = np.linspace(0.001, 0.999, 2000)[np.where(pred == np.max(pred))[0][0]]
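# An equivalent, more compact form of the cutoff search above (assumes pred01 and Y as defined and
# accuracy_score from sklearn.metrics); it should reproduce the same threshold:
cutoffs = np.linspace(0.001, 0.999, 2000)
accs = np.array([accuracy_score(np.array(Y), (pred01 >= c).astype(int)) for c in cutoffs])
thre_alt = cutoffs[accs.argmax()]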

pred01[pred01 >= thre] = 1
Beispiel #51
0
        'objective': 'binary:logistic',
        'eta': 0.1,
        'colsample_bytree': 0.886,
        'min_child_weight': 2,
        'max_depth': 10,
        'subsample': 0.886,
        'alpha': 10,
        'gamma': 30,
        'lambda': 50,
        'verbose_eval': True,
        'nthread': 8,
        'eval_metric': 'auc',
        'scale_pos_weight': 10,
        'seed': 201703,
        'missing': -1
    }

    xgbtrain = xgb.DMatrix(train_feat[predictors], train_feat['label'])
    xgbtest = xgb.DMatrix(test_feat[predictors])
    model = xgb.train(params, xgbtrain, num_boost_round=120)
    del train_feat, xgbtrain
    gc.collect()

    test_feat.loc[:, 'pred'] = model.predict(xgbtest)
    result = reshape(test_feat)
    test = pd.read_csv(test_path)
    result = pd.merge(test[['orderid']], result, on='orderid', how='left')
    result.fillna('0', inplace=True)
    result.to_csv('result.csv', index=False, header=False)
    print('total time: {} seconds'.format(time.time() - t0))
Beispiel #52
0
    'min_child_weight': 1,
    'gamma': 0.1,
    'subsample': 0.9,
    'colsample_bytree': 0.9,
    'reg_alpha': 2,
    'reg_lambda': 0.1,
    'objective': 'multi:softmax',
    'nthread': 8,
    'scale_pos_weight': 1
}

plst = list(params.items())

num_rounds = 500

model = xgb.train(plst, dtrain, num_rounds)

ans = model.predict(dtest)

y_test1 = np.asarray(y_test)

cm1 = pd.crosstab(y_test, ans, rownames=['Actual'], colnames=['Predicted'])
print(cm1)

plt.figure(num=1, figsize=(12, 8))
plot_importance(model)
plt.savefig('xgbmodel3.png')  # save before show so the written figure is not blank
plt.show()

#fit model on all training data
xgb2.fit(x_train, y_train)
import numpy as np
import matplotlib.pyplot as plt
from scipy.io import loadmat
import pandas as pd
import xgboost as xgb
from sklearn.metrics import mean_squared_error

training = pd.read_csv("Titanic Training.csv")
test = pd.read_csv("Titanic Test.csv")

X_train = training.drop(columns="Survived").to_numpy()
y_train = training["Survived"].to_numpy().reshape(-1, 1)
m = y_train.shape[0]
D = np.ones((m, 1)) / m
X_test = test.drop(columns="PassengerId").to_numpy()
test_ID = test["PassengerId"]

D_train = xgb.DMatrix(X_train, label=y_train)
D_test = xgb.DMatrix(X_test, label=None)
parameters = {"max_depth": 10, "num_class": 2, "objective": "multi:softmax"}  # softmax so predict() returns class labels
steps = 100
model = xgb.train(parameters, D_train, steps)
predictions_train = model.predict(D_train).reshape(X_train.shape[0], 1)
predictions_test = model.predict(D_test).reshape(X_test.shape[0], 1)
check_train = np.equal(predictions_train, y_train) * 1
correct_train = np.sum(check_train)
accuracy_train = 100 * (correct_train / y_train.shape[0])
error_train = 100 - accuracy_train
results_df = pd.concat([test_ID, pd.DataFrame(predictions_test)], axis=1)
results_df = results_df.rename(columns={0: "Survived"})
    def fit_log(self, records, plan_size):
        tic = time.time()

        # filter data, only pick the data with a same task
        data = []
        for inp, res in records:
            if inp.task.name == self.task.name and \
                            inp.config.template_key == self.task.config_space.template_key:
                data.append((inp, res))

        logger.debug("XGB load %d entries from history log file", len(data))

        # extract feature
        self._reset_pool(self.space, self.target, self.task)
        pool = self._get_pool()
        if self.fea_type == 'itervar':
            feature_extract_func = _extract_itervar_feature_log
        elif self.fea_type == 'knob':
            feature_extract_func = _extract_knob_feature_log
        elif self.fea_type == 'curve':
            feature_extract_func = _extract_curve_feature_log
        else:
            raise RuntimeError("Invalid feature type: " + self.fea_type)
        res = pool.map(feature_extract_func, data)

        # filter out feature with different shapes
        fea_len = len(self._get_feature([0])[0])

        xs, ys = [], []
        for x, y in res:
            if len(x) == fea_len:
                xs.append(x)
                ys.append(y)

        if len(xs) < 500:  # not enough samples
            return False

        xs, ys = np.array(xs), np.array(ys)
        x_train = xs
        y_train = ys
        y_max = np.max(y_train)
        y_train = y_train / max(y_max, 1e-8)

        index = np.random.permutation(len(x_train))
        dtrain = xgb.DMatrix(x_train[index], y_train[index])

        plan_size *= 2
        self.bst = xgb.train(
            self.xgb_params,
            dtrain,
            num_boost_round=400,
            callbacks=[
                custom_callback(stopping_rounds=100,
                                metric='tr-a-recall@%d' % plan_size,
                                evals=[(dtrain, 'tr')],
                                maximize=True,
                                fevals=[
                                    xgb_average_recalln_curve_score(plan_size),
                                ],
                                verbose_eval=self.log_interval)
            ])

        logger.debug("XGB train: %.2f\tobs: %d", time.time() - tic, len(xs))

        return True
Beispiel #55
0
    'colsample_bytree': 0.3,
    'max_depth': 10,
    'subsample': 0.8,
    'lambda': 0.5,
    'nthread': -1,
    'booster': 'gbtree',
    'silent': 1,
    #'eval_metric': 'rmsle',
    'objective': 'reg:linear'
}

# You could try training for more epochs
model = xgb.train(xgb_pars,
                  dtrain,
                  6000,
                  watchlist,
                  feval=rmsle_eval,
                  early_stopping_rounds=50,
                  maximize=False,
                  verbose_eval=10)

print('Modeling RMSLE %.5f' % model.best_score)
t1 = dt.datetime.now()
print('Training time: %i seconds' % (t1 - t0).seconds)

print('4. ---> Submission ... ')
ytest = model.predict(dtest)
print('Test shape OK.') if test.shape[0] == ytest.shape[0] else print('Oops')
test['trip_duration'] = np.exp(ytest) - 1
#test['trip_duration'] = ytest
subfn = "base2__val_" + str(model.best_score) + "__rnd_" + str(
    model.best_iteration) + "csv.gz"
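# rmsle_eval above is a custom feval that is not shown. A plausible sketch, assuming the model is
# trained on log1p(trip_duration) (the script undoes this later with np.exp(ytest) - 1), so plain
# RMSE on the transformed labels corresponds to the RMSLE of the raw durations:
import numpy as np

def rmsle_eval(preds, dtrain):
    labels = dtrain.get_label()
    return 'rmsle', float(np.sqrt(np.mean((preds - labels) ** 2)))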
def train_xgboost(df_preds, df_preds2):
    df_preds = df_preds.drop(['Filename'], axis=1)

    df_preds = df_preds[['DenseNet121_Predictions','InceptionV3_Predictions','ResNet50_Predictions','Vgg_Predictions','sex', 'localization', 'age','dx']]


    X,y = df_preds.iloc[:,:-1],df_preds.iloc[:,-1]


    print(X.head())


    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=None, random_state=50 ,shuffle=True)


    X_train.to_csv("X_train.csv", index=False)
    X_test.to_csv("X_test.csv", index=False)



    params = {"objective":"multi:softmax", "num_class":7 ,'colsample_bytree': 0.3,'learning_rate': 0.001,
              'max_depth': 100, 'alpha': 10, "n_estimators":100}


    xg_class = xgb.XGBClassifier(objective='multi:softmax', num_class=7, colsample_bytree = 0.03, learning_rate = 0.1,
                              max_depth = 1000, alpha = 1000, n_estimators = 1000)

    xg_class.fit(X_train,y_train)


    df_preds2 = df_preds2.drop(['Filename'], axis=1)

    df_preds2 = df_preds2[['DenseNet121_Predictions','InceptionV3_Predictions','ResNet50_Predictions','Vgg_Predictions','sex', 'localization', 'age','dx']]


    X,y = df_preds2.iloc[:,:-1],df_preds2.iloc[:,-1]



    print(X.head())

    data_dmatrix = xgb.DMatrix(data=X,label=y)

    X_test = X
    y_test = y

    preds = xg_class.predict(X_test)

    cm = confusion_matrix(y_test, preds)

    cm_plot_labels = ['akiec', 'bcc', 'bkl', 'df', 'mel','nv', 'vasc']

    plot_confusion_matrix(cm, cm_plot_labels, "xgboost")

    print("Balanced Accuracy: " + str(balanced_accuracy_score(y_test, preds)))
    print("Weighted Recall: " + str(recall_score(y_test, preds, average='weighted')))
    print("Class Recall: " + str(recall_score(y_test, preds, average=None)))
    print("Weighted Precision: " + str(precision_score(y_test, preds, average='weighted')))
    print("Class Precision: " + str(precision_score(y_test, preds, average=None)))
    print("Mean f1 score " + str(f1_score(y_test, preds, average='weighted')))
    print("Class f1 score " + str(f1_score(y_test, preds, average=None)))


    file = open("xgboost_results_Original.txt","w+")

    file.write("Balanced Accuracy: " + str(balanced_accuracy_score(y_test, preds)) + "\n")
    file.write("Weighted Recall: " + str(recall_score(y_test, preds, average='weighted')) + "\n")
    file.write("Class Recall: " + str(recall_score(y_test, preds, average=None)) + "\n")
    file.write("Weighted Precision: " + str(precision_score(y_test, preds, average='weighted')) + "\n")
    file.write("Class Precision: " + str(precision_score(y_test, preds, average=None)) + "\n")
    file.write("Mean f1 score " + str(f1_score(y_test, preds, average='weighted')))
    file.write("Class f1 score " + str(f1_score(y_test, preds, average=None)))


    file.close()

    xg_class = xgb.train(params=params, dtrain=data_dmatrix, num_boost_round=10)


    ax = xgb.plot_importance(xg_class)
    fig = ax.figure
    fig.set_size_inches(20,20)
Beispiel #57
0
                                                    random_state=42,
                                                    stratify=y)
preds = np.ones(y_test.shape[0])
# Get from cross validation
params = {
    'booster': 'gbtree',
    'objective': 'reg:logistic',
    'colsample_bytree': 0.2,
    'min_child_weight': 4,
    'subsample': 1,
    'learning_rate': 0.1,
    'max_depth': 6,
    'gamma': 0.05
}
training_data = xgb.DMatrix(X_train, y_train)
model = xgb.train(params, training_data, 230, feval=mcc_eval, maximize=True)

preds = model.predict(xgb.DMatrix(X_test))

# pick the best threshold out-of-fold
thresholds = np.linspace(0.01, 0.99, 99)
mcc = np.array([matthews_corrcoef(y_test, preds > thr) for thr in thresholds])
plt.plot(thresholds, mcc)
plt.show()
best_threshold = thresholds[mcc.argmax()]
print(mcc.max())
print(best_threshold)

model = xgb.train(params,
                  xgb.DMatrix(X, y),
                  230,
    'min_split_loss': 0,
    'max_depth': 6,
    'min_child_weight': 1,
    'max_delta_step': 0,
    'subsample': 1,
    'colsample_bytree': 1,
    'colsample_bylevel': 1,
    'reg_lambda': 1,
    'reg_alpha': 0,
    'grow_policy': 'depthwise',
    'max_leaves': 0,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'seed': 7
}
history = {}  # This will record rmse score of training and test set
eval_list = [(Train, "Training"), (Validation, "Validation")]
clf = xgb.train(params,
                Train,
                num_boost_round=119,
                evals=eval_list,
                obj=None,
                feval=None,
                maximize=False,
                early_stopping_rounds=40,
                evals_result=history)
prediction = clf.predict(xgb.DMatrix(x_test))
submission = pd.DataFrame({
    "card_id": main_test["card_id"].values,
    "target": np.ravel(prediction)
})
Beispiel #59
0
}

dtrain = xgb.DMatrix(x_train, y_train)
# cross-validation
cv_result = xgb.cv(xgb_params,
                   dtrain,
                   nfold=5,
                   num_boost_round=1000,
                   early_stopping_rounds=50,
                   verbose_eval=1,
                   show_stdv=False)
num_boost_rounds = len(cv_result)
print(num_boost_rounds)
# train model
model = xgb.train(dict(xgb_params, silent=1),
                  dtrain,
                  num_boost_round=num_boost_rounds)
res = []

for i in range(3):
    x_test['month_logerror'] = round(
        traingroupedMonth.loc[9 + int(i)]['logerror'], 6)
    x_test['quarter_logerror'] = round(traingroupedQuarter.loc[3]['logerror'],
                                       6)
    test_set = sc.transform(x_test)
    dtest = xgb.DMatrix(test_set)
    pred = model.predict(dtest)
    res.append(pred)

output = pd.DataFrame({
    'ParcelId': properties['parcelid'].astype(np.int32),
Beispiel #60
0
x_train, x_valid, y_train, y_valid = train_test_split(x_train,
                                                      y_train,
                                                      test_size=0.2,
                                                      random_state=4242)

# XGBoost model
params = {}
params['objective'] = 'binary:logistic'
params['eval_metric'] = 'logloss'
params['eta'] = 0.02
params['max_depth'] = 4

d_train = xgb.DMatrix(x_train, label=y_train)
d_valid = xgb.DMatrix(x_valid, label=y_valid)

watch_list = [(d_train, 'train'), (d_valid, 'valid')]

bst = xgb.train(params,
                d_train,
                400,
                watch_list,
                early_stopping_rounds=50,
                verbose_eval=10)

d_test = xgb.DMatrix(x_test)
p_test = bst.predict(d_test)

sub = pd.DataFrame()
sub['test_id'] = df_test['test_id']
sub['is_duplicate'] = p_test
sub.to_csv('simple_xgb.csv', index=False)