def runxgBoostClassifier(self, bDetailReport=False):
    print("m_X_train size", len(self.m_X_train))
    boosters = ['gbtree', 'gblinear']
    for depth in range(3, 4):
        for rate in range(2, 3, 1):
            for estimator in range(220, 240, 20):
                # for bster in boosters:
                clf = XGBClassifier(max_depth=depth,
                                    learning_rate=float(rate) / 10,
                                    n_estimators=estimator,
                                    silent=True,
                                    objective='binary:logistic',
                                    seed=400)
                clf.fit(self.m_X_train, self.m_y_train)
                y = clf.predict(self.m_X_test)
                print("\nxgBoostClassifier depth={} rate={} estimator={}\n".format(
                    depth, float(rate) / 10, estimator))
                print(classification_report(self.m_y_test, y))
                print(clf.feature_importances_)
                # plot
                pyplot.bar(range(len(clf.feature_importances_)),
                           clf.feature_importances_)
                pyplot.show()
                plot_importance(clf)
                if bDetailReport:
                    self.ClassifierDetailReport(self.m_y_test, y)
def eval_fn(params):
    model = XGBClassifier(n_estimators=n_estimators_max,
                          learning_rate=learning_rate,
                          seed=seed)
    score = 0
    n_estimators = 0
    for tr, va in skf:
        X_tr, y_tr = X_train[tr], y_train[tr]
        X_va, y_va = X_train[va], y_train[va]
        model.set_params(**params)
        model.fit(X_tr, y_tr,
                  eval_set=[(X_va, y_va)],
                  eval_metric='logloss',
                  early_stopping_rounds=50,
                  verbose=False)
        score += model.best_score
        n_estimators += model.best_iteration
    score /= n_folds
    n_estimators /= n_folds
    n_estimators_lst.append(n_estimators)
    result_str = "train:%.4f ntree:%5d " % (score, n_estimators)
    if X_valid is not None:
        # the fold average is a float; n_estimators must be an int for refitting
        model.n_estimators = int(round(n_estimators))
        model.fit(X_train, y_train)
        pr = model.predict_proba(X_valid)[:, 1]
        sc_valid = log_loss(y_valid, pr)
        score_valid.append(sc_valid)
        result_str += "valid:%.4f" % sc_valid
    if verbose:
        print(result_str)
    return score
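# --- eval_fn iterates over skf directly, so skf must be a reusable sequence of
# --- (train_idx, valid_idx) pairs. A sketch of building it (an assumption;
# --- n_folds and the data names mirror the globals the function closes over):
from sklearn.model_selection import StratifiedKFold

n_folds = 5
# X_train and y_train are assumed to exist as numpy arrays in the source script:
# skf = list(StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=0).split(X_train, y_train))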
def leaveoneout(dataset, labels):
    """Classifier: xgboost; cross-validation: leave-one-out."""
    leaveoo = LeaveOneOut()
    # Y_true = []
    # Y_pre = []
    # xgboost parameters fall into three groups:
    # 1. general parameters
    # 2. booster parameters: control each individual booster step
    # 3. learning-task parameters: control the training objective
    for train_index, test_index in leaveoo.split(dataset):
        x_train, x_test = dataset[train_index], dataset[test_index]
        y_train = [labels[i] for i in train_index]
        y_test = [labels[i] for i in test_index]
        estimator = XGBClassifier(
            silent=0,  # 0 prints messages while running, 1 suppresses them; 0 is usually preferable
            min_child_weight=1,
            gamma=0,  # minimum loss reduction required to split a leaf; larger is more conservative (typically ~0.1-0.2)
            max_delta_step=1,  # maximum delta step allowed for each tree's weight estimate
            colsample_bytree=0.8,  # column subsampling ratio per tree
            nthread=4,
            objective='binary:logistic',  # loss to minimize; binary logistic regression returns probabilities, not classes
            reg_lambda=1,  # L2 regularization on weights; larger values make overfitting less likely
            scale_pos_weight=1,
            n_estimators=200,  # number of trees
            seed=1000  # random seed
        )
        estimator.fit(x_train, y_train)
        # best_params_ is a GridSearchCV attribute; a plain XGBClassifier
        # exposes get_params() instead
        print(estimator.get_params())
        y_true, y_pre = y_test, list(estimator.predict(x_test))
        print("Accuracy : %.6g" % metrics.accuracy_score(y_true, y_pre))
def do_simple_xgboost_regression(x_train, y_train, x_test, y_test):
    # despite the name, this trains a binary classifier
    xg_reg = XGBClassifier(silent=False,
                           scale_pos_weight=1,
                           learning_rate=0.01,
                           colsample_bytree=0.4,
                           subsample=0.8,
                           objective='binary:logistic',
                           n_estimators=1000,
                           reg_alpha=0.3,
                           max_depth=4,
                           gamma=10)
    eval_set = [(x_train, y_train), (x_test, y_test)]
    eval_metric = ["auc", "error"]
    xg_reg.fit(x_train, y_train, eval_metric=eval_metric, eval_set=eval_set, verbose=True)
    train_accuracy = compute_accuracy(xg_reg, x_train, y_train)
    test_accuracy = compute_accuracy(xg_reg, x_test, y_test)
    print('train set accuracy: {}'.format(train_accuracy))
    print('test set accuracy: {}'.format(test_accuracy))
    # AUC should be computed on probabilities, not on hard label predictions
    y_score = xg_reg.predict_proba(x_test)[:, 1]
    score = metrics.roc_auc_score(y_test, y_score)
    print('score {}'.format(score))
def xgbt_base_rmse_mode(train_input, train_target, test_input, test_target):
    param = {
        'n_estimators': 10,
        'learning_rate': 0.01,
    }
    adj_params = {
        'n_estimators': [10],
        'learning_rate': [0.01],
        # 'n_estimators': [10, 50, 100, 200, 300, 400, 500, 1000],
        # 'learning_rate': [0.01, 0.1, 1]
    }
    xgbt = XGBClassifier(**param)
    cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
    cscv = GridSearchCV(xgbt, adj_params, scoring='neg_mean_absolute_error',
                        cv=cv, n_jobs=-1)
    cscv.fit(train_input, train_target)
    print("cv_results_:", cscv.cv_results_)
    print("best_params_:", cscv.best_params_)
    xgbt = XGBClassifier(**cscv.best_params_)
    xgbt.fit(train_input, train_target.ravel())
    predicted = xgbt.predict(test_input)
    xgbt_base_rmse = np.sqrt(metrics.mean_squared_error(test_target, predicted))
    print("xgbt_base_rmse: ", xgbt_base_rmse)
    # print("RMSE:", np.sqrt(metrics.mean_squared_error(test_target, predicted)))
    return xgbt_base_rmse
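# --- Hypothetical usage of xgbt_base_rmse_mode on synthetic data (the names
# --- X_demo/y_demo are made up). Note the function scores a classifier with a
# --- regression metric, so integer class labels are used as targets here.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X_demo, y_demo = make_classification(n_samples=300, n_features=8, random_state=1)
tr_x, te_x, tr_y, te_y = train_test_split(X_demo, y_demo, random_state=1)
# rmse = xgbt_base_rmse_mode(tr_x, tr_y.reshape(-1, 1), te_x, te_y)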
def get_ntree():
    f1_t_total, f1_v_total = [], []
    for ntree in range(10, 810, 10):
        xgb_base = XGBClassifier(objective='binary:logistic',
                                 n_estimators=ntree,
                                 random_state=1234,
                                 silent=0,
                                 booster='gbtree',
                                 subsample=0.8,
                                 colsample_bytree=0.8,
                                 reg_alpha=1,
                                 reg_lambda=0,
                                 learning_rate=0.1,
                                 max_depth=6)
        print('current ntree = %s' % ntree)
        xgb_base.fit(X_t, y_t)
        y_t_pre = xgb_base.predict(X_t)
        y_v_pre = xgb_base.predict(X_v)
        f1_t_each = f1_score(y_t, y_t_pre, average='micro')
        f1_v_each = f1_score(y_v, y_v_pre, average='micro')
        f1_t_total.append(f1_t_each)
        f1_v_total.append(f1_v_each)
        myfile = open('D:\\workspace python\\contest\\accu_save\\' + 'xgbbase_810_1.txt',
                      'a', encoding='utf-8')
        print(f1_t_each, ',', f1_v_each, file=myfile)
        myfile.close()
    return f1_t_total, f1_v_total
def model_train(xtrain, ytrain):
    X_train, X_test, y_train, y_test = train_test_split(xtrain, ytrain,
                                                        test_size=0.2,
                                                        random_state=0)
    cls = XGBClassifier()
    start_time = time.time()
    cls.fit(X_train, y_train)
    end_time = time.time()
    print('It took %d seconds to train the model!' % (end_time - start_time))
    print()
    y_pred = cls.predict(X_test)
    print("Model and its parameters:")
    print(str(cls))
    print("Model evaluation:")
    print('Accuracy:', accuracy_score(y_test, y_pred))
    print('F1 score:', f1_score(y_test, y_pred))
    print('Recall:', recall_score(y_test, y_pred))
    print('Precision:', precision_score(y_test, y_pred))
    print('\n classification report:\n', classification_report(y_test, y_pred))
    print('\n confusion matrix:\n', confusion_matrix(y_test, y_pred))
    # save the model
    model_name = "./model/" + "xgb_model"
    joblib.dump(cls, model_name)
def fit_model(self, X_train, y_train, X_test, y_test):
    clf = XGBClassifier(learning_rate=self.learning_rate,
                        n_estimators=self.n_estimators,
                        max_depth=self.max_depth,
                        min_child_weight=self.min_child_weight,
                        gamma=self.gamma,
                        subsample=self.subsample,
                        colsample_bytree=self.colsample_bytree,
                        objective=self.objective,
                        nthread=self.nthread,
                        scale_pos_weight=self.scale_pos_weight,
                        reg_alpha=self.reg_alpha,
                        reg_lambda=self.reg_lambda,
                        seed=self.seed)
    clf.fit(X_train, y_train)
    y_pre = clf.predict(X_test)
    y_pro = clf.predict_proba(X_test)[:, 1]
    print("pred_leaf=T AUC Score : %f" % metrics.roc_auc_score(y_test, y_pro))
    print("pred_leaf=T Accuracy : %.4g" % metrics.accuracy_score(y_test, y_pre))
    new_feature = clf.apply(X_train)
    X_train_new = self.mergeToOne(X_train, new_feature)
    new_feature_test = clf.apply(X_test)
    X_test_new = self.mergeToOne(X_test, new_feature_test)
    print("Training set sample number remains the same")
    return X_train_new, y_train, X_test_new, y_test
def train_xgb_classifier(dat, predictors, target_col, params):
    if params['train_frac'] < 1.0:
        if params['seed'] is not None:
            np.random.seed(params['seed'])
        else:
            np.random.seed(123)
        dat.sort_values(['GAME_DATE'], inplace=True)
        samp_size = int(params['train_frac'] * dat.shape[0])
        # sample_ind = np.random.choice(dat.shape[0], size=int(np.floor(params['train_frac']*dat.shape[0])), replace=False)
        sample_ind = list(range(samp_size))
        train_dat = dat.iloc[sample_ind, :].copy()
        calib_ind = list(set(range(dat.shape[0])) - set(sample_ind))
        calib_dat = dat.iloc[calib_ind, :].copy()
        calib_dat_x = calib_dat[predictors]
        calib_dat_x.columns = ['f' + str(i) for i in range(len(predictors))]
    else:
        train_dat = dat.copy()
    # the training design matrix is needed on both paths, so it is built here
    train_dat_x = train_dat[predictors]
    train_dat_x.columns = ['f' + str(i) for i in range(len(predictors))]
    mod = XGBClassifier(**params)
    mod.fit(train_dat_x, train_dat[target_col])
    if params['train_frac'] < 1.0:
        mod_final = CalibratedClassifierCV(mod, method='sigmoid', cv='prefit')
        mod_final.fit(calib_dat_x, calib_dat[target_col])
    else:
        mod_final = mod
    return mod_final
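# --- Standalone sketch (an assumption, not the source's code) of the
# --- prefit-calibration pattern train_xgb_classifier relies on: fit the
# --- booster on one time slice, then calibrate its probabilities on a
# --- held-out slice. cv='prefit' is the sklearn idiom of that era.
from sklearn.calibration import CalibratedClassifierCV
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

X_c, y_c = make_classification(n_samples=500, random_state=123)
X_fit, X_cal, y_fit, y_cal = train_test_split(X_c, y_c, test_size=0.3, random_state=123)
base = XGBClassifier(n_estimators=50).fit(X_fit, y_fit)
calibrated = CalibratedClassifierCV(base, method='sigmoid', cv='prefit')
calibrated.fit(X_cal, y_cal)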
def Create_Model(X_train, X_test, y_train, y_test, learning_rate, n_estimators,
                 max_depth, min_child_weight, gamma, subsample, colsample_bytree,
                 reg_alpha, eval_metric):
    ROCforest = XGBClassifier(learning_rate=learning_rate,
                              n_estimators=n_estimators,
                              max_depth=max_depth,
                              min_child_weight=min_child_weight,
                              gamma=gamma,
                              subsample=subsample,
                              colsample_bytree=colsample_bytree,
                              reg_alpha=reg_alpha,
                              objective='binary:logistic',
                              nthread=4,
                              seed=12)
    cv_folds = 5
    xgb_param = ROCforest.get_xgb_params()
    xgtrain = xgb.DMatrix(X_train.values, label=y_train.values)
    cvresult = xgb.cv(xgb_param, xgtrain,
                      num_boost_round=ROCforest.get_params()['n_estimators'],
                      nfold=cv_folds,
                      metrics=eval_metric)
    ROCforest.set_params(n_estimators=cvresult.shape[0])
    ROCforest.fit(X_train, y_train)
    return ROCforest
class XGBoosting:
    def __init__(self, x_train, y_train, problemtype='regression', cv=5):
        self.x_train = x_train
        self.y_train = y_train
        self.cv = cv
        if problemtype == 'regression':
            self.clf = XGBRegressor()
        elif problemtype == 'classification':
            self.clf = XGBClassifier()

    def classify(self):
        self.clf.fit(self.x_train, self.y_train)

    def regress(self):
        self.clf.fit(self.x_train, self.y_train)

    def show_cross_val_score(self):
        cv_score = cross_val_score(estimator=self.clf,
                                   X=self.x_train,
                                   y=self.y_train,
                                   cv=self.cv,
                                   n_jobs=-1)
        print('XGB Cross Validated Score...')
        print(np.mean(cv_score))
        print('\n')

    def optimise(self):
        pass
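# --- Hypothetical usage of the XGBoosting wrapper above; the demo data and
# --- names are made up for illustration.
from sklearn.datasets import make_classification

X_demo, y_demo = make_classification(n_samples=200, random_state=0)
booster = XGBoosting(X_demo, y_demo, problemtype='classification', cv=5)
booster.classify()
booster.show_cross_val_score()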
def runXGBoost(x_train, y_train, x_test, y_test, p):
    # Here we instantiate the extreme gradient boosting classifier
    clf = XGBClassifier()
    clf.set_params(**p)
    clf.fit(x_train, y_train)
    # now, make the predictions using our classifier
    xgb_predictions = clf.predict(x_test)
    # now we have to compute the classification accuracy
    # think about what two variables we have to compare
    xgb_score = accuracy_score(y_test, xgb_predictions)
    print("XGB classification accuracy on test data is " + str(xgb_score), file=sys.stderr)
    # the second prediction pass in the original was identical to the first,
    # so the predictions are reused here
    dt_score = accuracy_score(y_test, xgb_predictions)
    print("accuracy score on test data: " + str(dt_score), file=sys.stderr)
    train_score = accuracy_score(y_train, clf.predict(x_train))
    print("accuracy score on training data: " + str(train_score), file=sys.stderr)
    return (train_score, dt_score)
def fit_model(self, X_train, y_train):
    print('Model Fitting started: ', datetime.now())
    start_train_date = pd.Timestamp(year=X_train['Year'].iloc[0],
                                    month=X_train['Month'].iloc[0],
                                    day=X_train['DayofMonth'].iloc[0]).date()
    end_train_date = pd.Timestamp(year=X_train['Year'].iloc[-1],
                                  month=X_train['Month'].iloc[-1],
                                  day=X_train['DayofMonth'].iloc[-1]).date()
    print('Fit model with data from {} to {}'.format(start_train_date, end_train_date))
    model_name = '{}_{}_{}_{}_{}'.format(self.strategy_name,
                                         start_train_date.year,
                                         start_train_date.month,
                                         end_train_date.year,
                                         end_train_date.month)
    start_time = time.time()
    classifier = XGBClassifier(n_jobs=8, n_estimators=1000, verbosity=1)
    classifier.fit(X_train, y_train)
    pickle.dump(classifier, open("models/{}.pickle.dat".format(model_name), 'wb'))
    print('Duration Fitting: ', (time.time() - start_time))
    self.prediction_model = classifier
def job_function(params):
    learning_rate = params[0]
    max_depth = params[1]
    ss_cs = params[2]
    gamma = params[3]
    min_child_weight = params[4]
    reg_lambda = params[5]
    reg_alpha = params[6]
    early_stopping_rounds = 25
    if learning_rate >= 0.3:
        early_stopping_rounds = 5
    if learning_rate <= 0.03:
        early_stopping_rounds = 50
    scores = []
    for i in range(iterations_per_job):
        X_train = Xy[i][0]
        X_test = Xy[i][1]
        y_train = Xy[i][2]
        y_test = Xy[i][3]
        y_train2 = le.transform(y_train)
        y_test2 = le.transform(y_test)
        clf = XGBClassifier(max_depth=max_depth,
                            learning_rate=learning_rate,
                            n_estimators=5000,
                            objective='multi:softprob',
                            subsample=ss_cs,
                            colsample_bytree=ss_cs,
                            gamma=gamma,
                            min_child_weight=min_child_weight,
                            seed=0,
                            silent=True,
                            reg_lambda=reg_lambda,
                            reg_alpha=reg_alpha)
        clf.fit(X_train, y_train,
                eval_set=[(X_test, y_test2)],
                eval_metric=calculate_score_2,
                early_stopping_rounds=early_stopping_rounds,
                verbose=False)
        # clf.booster() is the pre-0.7 xgboost spelling; newer releases use clf.get_booster()
        y_predicted = clf.predict_proba(X_test, ntree_limit=clf.booster().best_ntree_limit)
        score = calculate_score(y_predicted, y_test2)
        scores.append(score)
    avg_score = np.array(scores).mean()
    print(avg_score, params)
    return avg_score
def _distributor(self, label, cv, param, eval_metric, early_stopping_rounds=50):
    start = time()
    if self.is_classifier:
        label = 'XGBClassifier'
        rs = XGBClassifier(**param)
    else:
        label = 'XGBRegressor'
        rs = XGBRegressor(**param)
    X_visible, X_blind, y_visible, y_blind = train_test_split(
        self.X_train, self.y_train,
        random_state=1301,
        stratify=self.y_train,
        test_size=0.4)
    rs.fit(X_visible, y_visible,
           eval_metric=eval_metric,
           early_stopping_rounds=early_stopping_rounds,
           eval_set=[(X_visible, y_visible), (X_blind, y_blind)])
    self.result[label] = {}
    self.result[label]['clf'] = rs
    # self.result[label]['score'] = rs.best_score_
    self.result[label]['time'] = time() - start
    # self.result[label]['set'] = ('n_iter: %s cv: %s' % (n_iter, cv))
    pprint.pprint(self.result[label])
    # pprint.pprint(rs.grid_scores_)
    out_result = open(self.result_address, 'wb')
    pickle.dump(self.result, out_result)
    out_result.close()
def compute_cv_metric(split, cross_val_data, bayes_trials_results):
    # Create classifier for cross validation results
    clf = XGBClassifier(random_state=0, n_jobs=-1, **bayes_trials_results[0]['params'])
    train_x = cross_val_data[split][0]
    train_y = cross_val_data[split][1]
    test_x = cross_val_data[split][2]
    test_y = cross_val_data[split][3]
    clf.fit(train_x, train_y)
    y_pred_cv = clf.predict(test_x)
    y_pred_prob_cv = clf.predict_proba(test_x)
    # compute the confusion matrix once instead of four times
    cm = confusion_matrix(test_y, y_pred_cv)
    tn, fp, fn, tp = cm[0, 0], cm[0, 1], cm[1, 0], cm[1, 1]
    npv = tn / (tn + fn)
    specificity = tn / (tn + fp)
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    roc_auc_cv = roc_auc_score(test_y, y_pred_prob_cv[:, 1])
    f1_cv = 2 * (precision * recall) / (precision + recall)
    return npv, specificity, precision, recall, roc_auc_cv, f1_cv, y_pred_prob_cv
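# --- Sketch of the data layout compute_cv_metric assumes (inferred from the
# --- indexing above, not documented in the source): each cross_val_data entry
# --- is a (train_x, train_y, test_x, test_y) tuple, and
# --- bayes_trials_results[0]['params'] holds the best hyperparameters from a
# --- Bayesian search.
from sklearn.datasets import make_classification
from sklearn.model_selection import KFold

X_cv, y_cv = make_classification(n_samples=400, random_state=0)
cross_val_data = [(X_cv[tr], y_cv[tr], X_cv[te], y_cv[te])
                  for tr, te in KFold(n_splits=5, shuffle=True, random_state=0).split(X_cv)]
bayes_trials_results = [{'params': {'n_estimators': 100, 'max_depth': 3}}]
# npv, spec, prec, rec, auc, f1, probs = compute_cv_metric(0, cross_val_data, bayes_trials_results)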
def xxgboost(training, cv, testing):
    xgb = XGBClassifier(max_depth=6,
                        n_estimators=25,
                        objective='multi:softprob',
                        subsample=0.5,
                        colsample_bytree=0.5)
    xgb.fit(training, cv.ravel())
    XGBtrainscore = xgb.score(training, cv.ravel())  # train score
    kf = KFold(len(cv), n_folds=5)  # 5-fold cross validation (pre-0.18 sklearn API)
    scores = cross_val_score(xgb, training, cv.ravel(), cv=kf)
    XGBvalidation = abs(scores.mean())
    XGBy_pred = xgb.predict_proba(testing)
    le = LabelEncoder()
    y = le.fit_transform(labels)
    idlist = []   # id list
    listcty = []  # countries list
    for i in range(len(testid)):
        idi = testid[i]
        idlist += [idi] * 5
        listcty += le.inverse_transform(np.argsort(XGBy_pred[i])[::-1])[:5].tolist()
    XGBsub = pd.DataFrame(np.column_stack((idlist, listcty)), columns=['id', 'country'])
    XGBsub.to_csv('XGsub_%s.csv' % csvname, index=False)
    print("XGBtrainscore", XGBtrainscore)
    print("XGBvalidation", XGBvalidation)
def xgboost_classifier(train_x, train_y):
    from xgboost.sklearn import XGBClassifier
    # model = XGBClassifier()
    model = XGBClassifier(silent=1,
                          learning_rate=0.1,
                          n_estimators=60,
                          max_depth=6,
                          min_child_weight=0.4,
                          gamma=0.5,
                          subsample=0.4,
                          colsample_bytree=1,
                          objective='binary:logistic',
                          nthread=4,
                          scale_pos_weight=1,
                          seed=1000)
    # max_depth = [2, 3, 4, 5, 6, 7]
    # learning_rate = [0.01, 0.05, 0.1, 0.2, 0.4, 0.8, 1]
    # n_estimators = [30, 60, 80, 100, 150, 200]
    # param_grid = dict()
    # kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)
    # grid_search = GridSearchCV(model, param_grid, scoring='neg_log_loss', n_jobs=-1, cv=kfold)
    # grid_result = grid_search.fit(np.array(train_x), np.array(train_y))
    # print(grid_result.best_score_, '***********', grid_result.best_params_)
    model.fit(train_x, train_y)
    # from xgboost import plot_importance
    # from matplotlib import pyplot
    # plot_importance(model)
    # pyplot.show()
    return model
def XGBoost(returns, factRet):
    """
    :param returns: asset return series
    :param factRet: factor return series
    """
    [timeN, factorN] = factRet.shape
    [timeN, assetN] = returns.shape
    f_bar = []
    for i in range(factorN):
        # geometric mean of each factor's returns
        f_bar.append(np.prod(factRet.iloc[:, i] + 1) ** (1 / timeN) - 1)
    colName = list(factRet.columns)
    f_bar = pd.DataFrame(f_bar).T
    f_bar.columns = colName
    # the targets are continuous returns, so a regressor (rather than the
    # original XGBClassifier) is needed for the 'reg:linear' objective
    xgb = XGBRegressor(learning_rate=0.1,
                       n_estimators=10,
                       max_depth=7,
                       min_child_weight=2,
                       gamma=0.2,
                       subsample=0.8,
                       colsample_bytree=0.6,
                       objective='reg:linear',
                       scale_pos_weight=1,
                       seed=10)
    mu = []
    for i in range(assetN):
        xgb.fit(factRet, returns.iloc[:, i])
        mu.append(float(xgb.predict(f_bar)))
    mu = np.array(mu)
    Q = np.array(returns.cov())
    return mu, Q
def modelfit(useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    alg = XGBClassifier(**params)
    df = data.sample(frac=0.3)
    pX = df.drop('LABEL', axis=1)
    py = df['LABEL']
    if useTrainCV:
        print("start use cv")
        xgb_param = alg.get_xgb_params()
        # xgtrain was never built in the original; it is constructed here from
        # the same globals (X, y) the fit below relies on
        xgtrain = xgb.DMatrix(X, label=y)
        cvresult = xgb.cv(xgb_param, xgtrain,
                          num_boost_round=xgb_param['n_estimators'],
                          nfold=cv_folds,
                          metrics='auc',
                          early_stopping_rounds=early_stopping_rounds)
        print(cvresult.shape[0])
        alg.set_params(n_estimators=cvresult.shape[0])
        params['n_estimators'] = cvresult.shape[0]
        print("best tree size is {}".format(cvresult.shape[0]))
    # Fit the algorithm on the data
    alg.fit(X, y, eval_metric='auc')
    y_pred = alg.predict(pX)
    accuracy = metrics.accuracy_score(py, y_pred)
    print("Accuracy: %.2f%%" % (accuracy * 100.0))
    print('auc:', metrics.roc_auc_score(py, y_pred))
    train_report = metrics.classification_report(py, y_pred)
    print(train_report)
    feat_imp = pd.Series(alg.get_booster().get_fscore()).sort_values(ascending=False)
    print(feat_imp)
    return alg
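# --- Standalone sketch of the xgb.cv early-stopping pattern modelfit uses to
# --- pick n_estimators; the data and parameter values are placeholders.
import xgboost as xgb
from sklearn.datasets import make_classification

X_demo, y_demo = make_classification(n_samples=500, random_state=7)
dtrain_demo = xgb.DMatrix(X_demo, label=y_demo)
cv_out = xgb.cv({'objective': 'binary:logistic', 'eta': 0.1, 'max_depth': 3},
                dtrain_demo, num_boost_round=200, nfold=5, metrics='auc',
                early_stopping_rounds=20)
print('best n_estimators:', cv_out.shape[0])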
def xgb_no_feature_select(train: pd.DataFrame, test: pd.DataFrame, y_train, cv=False):
    params = {
        'silent': 1,
        'nthread': 4,
        'eval_metric': 'auc',
        'verbose_eval': True,
        'seed': 918,
        'alpha': 9.6523,
        'colsample_bytree': 0.9604,  # was misspelled 'cosample_bytree'
        'eta': 0.1171,
        'gamma': 0.179,
        'max_depth': 7,
        'min_child_weight': 13,
        'subsample': 0.9609
    }
    xgtrain = xgb.DMatrix(train, label=y_train)
    if cv:
        cv_res = xgb_k_folder_cv(params, xgtrain)
        print(cv_res)
    model = XGBClassifier(**params)
    model.fit(train, y_train)
    y_predict = model.predict_proba(test)
    return model, y_predict
def fit_model_split(self, X_train, y_train, X_test, y_test, x_pre):
    # X_train_1 trains the tree model; X_train_2 is combined with the new
    # leaf features to form the new training set
    X_train_1, X_train_2, y_train_1, y_train_2 = train_test_split(
        X_train, y_train, test_size=0.6, random_state=0)
    clf = XGBClassifier(learning_rate=self.learning_rate,
                        n_estimators=self.n_estimators,
                        max_depth=self.max_depth,
                        min_child_weight=self.min_child_weight,
                        gamma=self.gamma,
                        subsample=self.subsample,
                        colsample_bytree=self.colsample_bytree,
                        objective=self.objective,
                        nthread=self.nthread,
                        scale_pos_weight=self.scale_pos_weight,
                        reg_alpha=self.reg_alpha,
                        reg_lambda=self.reg_lambda,
                        seed=self.seed)
    clf.fit(X_train_1, y_train_1)
    y_pre = clf.predict(X_train_2)
    y_pro = clf.predict_proba(X_train_2)[:, 1]
    print("pred_leaf=T AUC Score : %f" % metrics.roc_auc_score(y_train_2, y_pro))
    print("pred_leaf=T Accuracy : %.4g" % metrics.accuracy_score(y_train_2, y_pre))
    new_feature = clf.apply(X_train_2)
    X_train_new2 = self.mergeToOne(X_train_2, new_feature)
    new_feature_test = clf.apply(X_test)
    X_test_new = self.mergeToOne(X_test, new_feature_test)
    new_feature_pre = clf.apply(x_pre)
    X_pre_new = self.mergeToOne(x_pre, new_feature_pre)
    print("Training set of sample size 0.4 fewer than before")
    return X_train_new2, y_train_2, X_test_new, y_test, X_pre_new
class Classifier(object):
    def __init__(self, conf, task, train=None, test=None):
        self.conf = conf
        self.task = task
        self.train_ = train
        self.test_ = test
        self.features = [
            "hasWith", "hasIn", "simiBucket", "textPos", "hasOf", "hasAnd",
            "startEntity", "distance", "hasFrom", "endEntity", "similarity",
            "hasThan", "hasVerb"
        ]
        self.labels = ["relation"]
        self.num_round = 500
        self.eval_set = list()
        self.early_stopping_rounds = 20
        self.classifier = XGBClassifier(max_depth=4,
                                        learning_rate=0.1,
                                        n_estimators=1000,
                                        gamma=4,
                                        verbosity=1,
                                        objective='multi:softmax',
                                        num_class=6,
                                        booster='gbtree',
                                        n_jobs=4,
                                        seed=27)

    def train(self):
        train_X, test_X, train_y, test_y = train_test_split(
            self.train_[self.features], self.train_[self.labels],
            test_size=0.4, random_state=42)
        self.eval_set = [(train_X.values, train_y.values),
                         (test_X.values, test_y.values)]
        self.classifier.fit(train_X.values, train_y.values,
                            eval_metric='merror',
                            eval_set=self.eval_set,
                            early_stopping_rounds=self.early_stopping_rounds,
                            verbose=True)
        self.classifier.save_model(self.conf.model_path.format(self.task))
        return 'Model has been saved!'

    def test(self):
        test_set = self.test_[self.features].values
        self.classifier.load_model(self.conf.model_path.format(self.task))
        # save_model/load_model do not persist the sklearn wrapper's label
        # encoder, so it is rebuilt by hand on the known label set
        self.classifier._le = LabelEncoder().fit([
            'USAGE', 'TOPIC', 'MODEL-FEATURE', 'PART_WHOLE', 'RESULT', 'COMPARE'
        ])
        pred = self.classifier.predict(test_set)
        predictions = pd.concat([
            self.test_[self.features],
            pd.DataFrame(pred, columns=["relation"])
        ], axis=1)
        return predictions
def train():
    trainDf = pd.read_csv("data_train.csv")
    testDf = pd.read_csv("data_test.csv")
    goal = "interested"
    predictors = [
        "invited", "user_reco", "evt_p_reco", "evt_c_reco", "user_pop",
        "frnd_infl", "evt_pop"
    ]
    clf = XGBClassifier(learning_rate=0.1,
                        n_estimators=1000,
                        max_depth=5,
                        min_child_weight=1,
                        gamma=0,
                        subsample=0.8,
                        colsample_bytree=0.8,
                        objective='binary:logistic',
                        nthread=4,
                        scale_pos_weight=1,
                        seed=27)
    X_train, X_test, y_train, y_test = train_test_split(trainDf[predictors],
                                                        trainDf[goal],
                                                        random_state=0)
    clf.fit(X_train, y_train,
            early_stopping_rounds=10,
            eval_metric="auc",
            eval_set=[(X_test, y_test)])
    return clf
def XGB_class_evaluation(individual):
    N_SPLITS = N_splits
    kf = KFold(n_splits=N_SPLITS)
    fc = XGBClassifier(learning_rate=individual[0],
                       n_estimators=individual[5],
                       silent=True,
                       nthread=-1,
                       gamma=0,
                       min_child_weight=individual[1],
                       max_depth=individual[2],
                       subsample=individual[3],
                       colsample_bylevel=individual[4],
                       seed=0)
    M_pos = 0
    M_mid = 0
    M_neg = 0
    for train, test in kf.split(trainX):
        fc.fit(trainX[train, :], trainY[train])
        testY_pre = fc.predict(trainX[test, :])
        Ind_pos = (trainY[test] == 1)
        Ind_mid = (trainY[test] == 0)
        Ind_neg = (trainY[test] == -1)
        # per-class recall, accumulated across folds
        M_pos += len(np.where(np.array(testY_pre[Ind_pos]) == 1)[0]) / len(np.where(Ind_pos)[0])
        M_mid += len(np.where(np.array(testY_pre[Ind_mid]) == 0)[0]) / len(np.where(Ind_mid)[0])
        M_neg += len(np.where(np.array(testY_pre[Ind_neg]) == -1)[0]) / len(np.where(Ind_neg)[0])
    correct = map(lambda x: x / N_SPLITS, [M_pos, M_mid, M_neg])
    return tuple(correct)
def train_model(mall_id):
    # start training the model
    random_state = 10
    metrix, tar = utils.get_data(mall_id)
    x_train, x_test, y_train, y_test = train_test_split(
        metrix, tar, test_size=0.1, random_state=random_state)
    # xgboost, based on boosted trees; training is slow with these settings
    clf_name = "xgboost"
    save_dir = "./model/" + clf_name + "_" + mall_id + "_model.m"
    n_est = 50
    clf = XGBClassifier(
        learning_rate=0.1,  # typical values are 0.01-0.2
        n_estimators=n_est,
        max_depth=5,  # maximum tree depth, usually 3-10
        min_child_weight=1,  # minimum sum of instance weight in a child; larger avoids overfitting, too large underfits
        gamma=0,  # minimum loss reduction required to split; larger is more conservative
        subsample=0.8,  # row subsampling ratio per tree; smaller is more conservative, too small underfits (typical 0.5-1)
        colsample_bytree=0.8,  # column subsampling ratio per tree
        objective='binary:logistic',  # binary classification
        nthread=4,  # number of threads
        scale_pos_weight=1,  # set to a positive value when classes are heavily imbalanced to speed convergence
        seed=0)  # random seed, for reproducible results
    print(utils.get_time(), ' ', mall_id, ' starts...')
    train_time = time.time()
    clf.fit(x_train, y_train)
    train_time = time.time() - train_time
    score = clf.score(x_test, y_test)
    joblib.dump(clf, save_dir)
    print(utils.get_time(), ' saved a model for ', mall_id, ' score: ', score,
          ' train time : ', train_time)
    train_time = int(train_time)
    return (score, n_est, train_time)
def xgb_result(x, y, testx, testy, para):
    print("----- Working on 'xgb' method...")
    # dtrain = xgb.DMatrix(x, label=y)
    # dtest = xgb.DMatrix(testx, label=testy)
    xgb0 = XGBClassifier(**para)
    # with open('xgb.pickle', 'rb') as f:
    #     xgb0 = pickle.load(f)
    time0 = time.time()
    # bst = xgb.train(dtrain=dtrain, **para)
    xgb0.fit(x, y)
    train_time = time.time() - time0
    # booster() is the pre-0.7 xgboost spelling of get_booster()
    confusion, test_time = Errmodel(xgb0, x, y, testx, testy,
                                    ntree_limit=xgb0.booster().best_iteration)
    print(confusion, '\n', train_time, '\n', test_time)
    importance = sorted(xgb0.booster().get_score().items(),
                        key=lambda item: item[1])
    result = {
        'model': xgb0,
        'confusion': confusion,
        'train_time': train_time,
        'test_time': test_time,
        'importance': importance,
        'best_iter': xgb0.booster().best_iteration
    }
    print("best_iter", xgb0.booster().best_iteration)
    return result
def train_classify(X_train, y_train):
    """
    Train an XGBoostClassifier.
    :param X_train:
    :param y_train:
    :return:
    """
    print("Training with XGBoostClassifier")
    model = XGBClassifier(
        learning_rate=0.1,
        n_estimators=80,  # number of trees
        max_depth=6,  # tree depth
        min_child_weight=1,  # minimum leaf weight
        gamma=0.,  # penalty coefficient on the number of leaves
        subsample=0.8,  # build each tree on a random 80% of the samples
        colsample_bytree=0.8,  # build each tree on a random 80% of the features (was misspelled colsample_btree)
        objective='multi:softmax',  # loss function
        scale_pos_weight=1,  # compensates for class imbalance
        random_state=27  # random seed
    )
    model.fit(X_train, y_train,
              eval_set=[(X_train, y_train)],
              eval_metric="mlogloss",
              early_stopping_rounds=10,
              verbose=True)
    return model
def sampleTrain():
    LABEL = 'LABEL'
    dpath = 'dan_train_{}.csv'
    data4 = pd.read_csv(dpath.format(201804), index_col=ID_COLUMN)
    data5 = pd.read_csv(dpath.format(201805), index_col=ID_COLUMN)
    a1 = data4[data4.LABEL == 1]
    del data4
    b1 = data5[data5.LABEL == 1]
    # downsample negatives to 35x the positive count; the original filtered on
    # LABEL == 1 here, which would have sampled positives instead
    b0 = data5[data5.LABEL == 0].sample(n=(a1.shape[0] + b1.shape[0]) * 35)
    del data5
    data = b0.append(a1).append(b1).sort_index()
    X = data.drop(columns=LABEL)
    y = data[LABEL]
    params = {'learning_rate': 0.01, 'n_estimators': 1000, 'max_depth': 8,
              'min_child_weight': 0, 'gamma': 0.4, 'subsample': 0.9,
              'colsample_bytree': 0.6, 'scale_pos_weight': 10, 'n_jobs': 50,
              'objective': 'binary:logistic', 'reg_alpha': 1, 'reg_lambda': 1}
    model = XGBClassifier(**params)
    # the original passed eval_metric=metrics.f1_score, but an eval_metric
    # without an eval_set has no effect, so it is dropped
    model.fit(X, y)
    del X
    del y
    del data
    joblib.dump(model, 'CDanCdmaModel_{}.pkl'.format(datetime.now().strftime('%d%H%M')))
    data, X_test, y_test = get_transformed_data(month='201806', frac=1)
    print_evaluate(model, X_test, y_test)
def modeling_RF():
    train_score = None
    try:
        df1 = pd.read_csv('last_total.csv', encoding='cp949')
        df_dummy = pd.get_dummies(df1)
        train, test = train_test_split(df_dummy, test_size=0.2, random_state=1234)
        train_x = train.drop('target_bool', axis=1)
        train_y = train['target_bool']
        test_x = test.drop('target_bool', axis=1)
        test_y = test['target_bool']
        # despite the function name, this fits an XGBoost classifier
        xgb = XGBClassifier(random_state=1234,
                            learning_rate=0.6000000000000001,
                            max_depth=9,
                            n_estimators=200)
        xgb.fit(train_x, train_y)
        train_score = xgb.score(train_x, train_y)
    except Exception as e:
        print(e)
    return train_score
def pred(self, X):
    """
    Computes the Xgboost and gradient boost predictions for given data.
    :param X: pre-processed data
    :return: None
    """
    Y = X['isFraud']
    X = X.drop(['nameOrig', 'nameDest', 'isFlaggedFraud', 'isFraud'], axis=1)
    # hot-encoding of transaction type
    X.loc[X.type == 'TRANSFER', 'type'] = 0
    X.loc[X.type == 'CASH_OUT', 'type'] = 1
    X.type = X.type.astype(int)
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=1)
    # ratio of negatives to positives, used to reweight the positive class
    wts = sum(Y == 0) / sum(1.0 * (Y == 1))
    # Grid search -- checking of best params; uncomment to compute params
    # print("Grid searching....")
    # self.param_tuning(x_train, y_train, wts)
    clf = XGBClassifier(max_depth=1, gamma=0.1, scale_pos_weight=wts, n_jobs=4)
    print("----------------------------- TRAINING XGBOOST -----------------------------")
    probs = clf.fit(x_train, y_train).predict_proba(x_test)
    probY = probs[:, 1]
    self.plot_roc(y_test, probY)
    print("----------------------------- TRAINING Gradient Boosting -----------------------------")
    clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
                                     max_depth=1, random_state=0)
    probs = clf.fit(x_train, y_train).predict(x_test)
    self.plot_roc(y_test, probs)
def train_model(train, test):
    x_train, y_train = get_fea_lab(train)
    x_test, y_test = get_fea_lab(test)
    xgb = XGBClassifier()
    print(xgb)
    paras = {
        'max_depth': range(1, 3),
        'min_child_weight': [i / 10 for i in range(0, 10)],
        'scale_pos_weight': range(10, 100, 10)
    }
    gscv = GridSearchCV(estimator=xgb, param_grid=paras, cv=5, scoring='roc_auc')
    gscv.fit(x_train, y_train)
    print(gscv.best_params_)
    print(gscv.best_score_)
    print(gscv.score(x_test, y_test))
    result = gscv.predict(x_test)
    print(confusion_matrix(y_test, result))
    print(classification_report(y_test, result))
    # baseline: default parameters, for comparison with the tuned model
    xgb.fit(x_train, y_train)
    test_result = xgb.predict(x_test)
    print(confusion_matrix(y_test, test_result))
    print(classification_report(y_test, test_result))
def get_leaf(self):
    self.get_data("oneHot")
    n_estimators = 300
    clf_xgb = XGBClassifier(max_depth=4,
                            learning_rate=0.0125,
                            n_estimators=300,
                            subsample=0.6,
                            colsample_bytree=0.7,
                            seed=4)
    # clf_xgb = XGBClassifier(max_depth=4, n_estimators=300)
    clf_xgb.fit(self.x_train, self.y_train)
    leafes_train = list(clf_xgb.apply(self.x_train))
    leafes_test = list(clf_xgb.apply(self.x_test))
    # pad with every leaf index between the min and max so that train and test
    # share a single one-hot encoding
    max_train = np.array(leafes_train).max()
    min_train = np.array(leafes_train).min()
    max_test = np.array(leafes_test).max()
    min_test = np.array(leafes_test).min()
    max_value = max(max_train, max_test)
    min_value = min(min_train, min_test)
    for i in range(min_value, max_value + 1):
        leafes_train.append([i] * n_estimators)
    enc = OneHotEncoder()
    enc.fit(leafes_train)
    # drop the padded rows again
    leafes_train_feature = enc.transform(leafes_train).toarray()[:-(max_value - min_value + 1), :]
    print(leafes_train_feature.shape, len(leafes_train))
    return leafes_train_feature, self.y_train, enc.transform(leafes_test).toarray(), self.y_test
def extract_leaf_feature(features, targets, train_indexes, params):
    model = XGBClassifier(**params)
    model.fit(features[train_indexes], targets[train_indexes])
    # booster() is the pre-0.7 xgboost spelling of get_booster()
    booster = model.booster()
    dmatrix = xgb.DMatrix(features)
    # pred_leaf=True returns, per sample, the index of the leaf reached in each tree
    leaf = booster.predict(dmatrix, pred_leaf=True)
    encoder = sklearn.preprocessing.OneHotEncoder()
    leaf_feature = encoder.fit_transform(leaf)
    return leaf_feature
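# --- Downstream sketch (an assumption, not in the source) of how the one-hot
# --- leaf features from extract_leaf_feature are typically consumed: as input
# --- to a linear model, GBDT+LR style. get_booster() is the modern spelling
# --- of the booster() call used above; all names here are placeholders.
import numpy as np
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_classification

X_lf, y_lf = make_classification(n_samples=300, random_state=3)
train_idx = np.arange(200)  # hypothetical train indexes
gbdt = XGBClassifier(n_estimators=30).fit(X_lf[train_idx], y_lf[train_idx])
leaves = gbdt.get_booster().predict(xgb.DMatrix(X_lf), pred_leaf=True)
leaf_ohe = OneHotEncoder().fit_transform(leaves)
lr = LogisticRegression(max_iter=1000).fit(leaf_ohe[train_idx], y_lf[train_idx])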
def main(training_data, test_data):
    # Merging data to ensure consistent cleaning. Putting marker variable to separate later.
    training_data['source'] = 'training'
    test_data['source'] = 'test'
    merged_data = pd.concat([training_data, test_data])

    # Cleaning data
    cleaned_data = data_cleaner(merged_data)

    # Separating data, removing marker
    pred_df = cleaned_data[cleaned_data['source'] == 'training'].copy()
    test_pred = cleaned_data[cleaned_data['source'] == 'test'].copy()
    pred_df.drop('source', axis=1, inplace=True)
    test_pred.drop('source', axis=1, inplace=True)

    # Transforming target into ints, saving the key for later transformation
    labels = LabelEncoder().fit(training_data['country_destination'])
    target_df = pd.Series(labels.transform(training_data['country_destination']),
                          index=training_data.index)

    # Training model
    xgb_model = XGBClassifier(max_depth=6, learning_rate=0.3, n_estimators=25,
                              objective='multi:softprob', subsample=0.5,
                              colsample_bytree=0.5, seed=0)
    xgb_model.fit(pred_df.as_matrix(), target_df.tolist())

    # Running the model
    preds = xgb_model.predict_proba(test_pred.as_matrix())

    # Selecting the top 5 most likely for each respondent and stacking.
    # This section is VERY slow and could use being optimized
    model_probs = pd.DataFrame(preds, index=test_pred.index, columns=labels.classes_)
    # accumulator must be a DataFrame (the original started from an empty
    # Series), since columns are selected from it at the end
    stacked_probs = pd.DataFrame()
    for i in model_probs.index:
        temp = model_probs.loc[i, :]
        temp_sort = pd.DataFrame(temp.sort_values(ascending=False)[:5].index)
        temp_sort['id'] = i
        temp_sort.columns = ['country', 'id']
        stacked_probs = pd.concat([stacked_probs, temp_sort])

    # # Selecting classes with highest probabilities, compiling into list
    # ids = []
    # cts = []
    # test_ids = pd.Series(test_data.index)
    # for i in range(len(test_ids)):
    #     idx = test_data.index[i]
    #     ids += [idx] * 5
    #     cts += labels.inverse_transform(np.argsort(model_probs[i])[::-1])[:5].tolist()
    # predictions = pd.DataFrame(np.column_stack((ids, cts)), columns=['id', 'country'])

    # Cleaning output and returning it
    output = stacked_probs[['id', 'country']]
    return output
def main():
    data_train = pd.read_csv(args.train_dataset)
    X_train = data_train.drop(['Id', 'Class'], axis=1)
    y_train = data_train.loc[:, 'Class']
    data_test = pd.read_csv(args.test_dataset)
    X_test = data_test.drop(['Id'], axis=1)
    Id = data_test.loc[:, 'Id']
    clf = XGBClassifier()
    clf.set_params(**best_dicts)
    clf.fit(X_train, y_train)
    prediction = clf.predict_proba(X_test)
    columns = ['Prediction' + str(i) for i in range(1, 10)]
    prediction = pd.DataFrame(prediction, columns=columns)
    results = pd.concat([Id, prediction], axis=1)
    return (clf, results)
def objective(space):
    clf = XGBClassifier(n_estimators=int(space['n_estimators']),
                        objective='binary:logistic',
                        seed=37,
                        learning_rate=space['learning_rate'],
                        max_depth=space['max_depth'],
                        min_child_weight=space['min_child_weight'],
                        colsample_bytree=space['colsample_bytree'],
                        subsample=space['subsample'])
    clf.fit(xTrain, yTrain, eval_metric="logloss")
    pred = clf.predict_proba(xValid)[:, 1]
    loss = log_loss(yValid, pred)
    return {'loss': loss, 'status': STATUS_OK}
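# --- Sketch of wiring objective() into hyperopt's fmin; the bounds below are
# --- placeholders, and xTrain/yTrain/xValid/yValid are assumed globals of the
# --- source script.
from hyperopt import fmin, tpe, hp, Trials

space = {
    'n_estimators': hp.quniform('n_estimators', 100, 1000, 50),
    'learning_rate': hp.loguniform('learning_rate', -5, -1),
    'max_depth': hp.choice('max_depth', [3, 5, 7]),
    'min_child_weight': hp.quniform('min_child_weight', 1, 10, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1.0),
    'subsample': hp.uniform('subsample', 0.5, 1.0),
}
# best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=100, trials=Trials())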
def myThreadFunc(ThreadID):
    X_train = Xy[ThreadID][0]
    X_test = Xy[ThreadID][1]
    y_train = Xy[ThreadID][2]
    y_test = Xy[ThreadID][3]
    y_train2 = le.transform(y_train)
    y_test2 = le.transform(y_test)
    clf = XGBClassifier(max_depth=max_depth,
                        learning_rate=learning_rate,
                        n_estimators=5000,
                        objective='multi:softprob',
                        subsample=ss_cs,
                        colsample_bytree=ss_cs,
                        gamma=gamma,
                        min_child_weight=min_child_weight,
                        seed=0,
                        silent=True,
                        reg_lambda=reg_lambda,
                        reg_alpha=reg_alpha)
    clf.fit(X_train, y_train,
            eval_set=[(X_test, y_test2)],
            eval_metric=calculate_score_2,
            early_stopping_rounds=early_stopping_rounds,
            verbose=False)
    y_predicted = clf.predict_proba(X_test, ntree_limit=clf.booster().best_ntree_limit)
    score = calculate_score(y_predicted, y_test2)
    print(score, clf.booster().best_ntree_limit)
    train_and_test_scores[ThreadID] = score
def apply_xgb_ens(y_valid, valid_folder='Valid', test_folder='Test'):
    """
    Ensembler based on xgboost gradient boosting.
    """
    # Loading data
    X, X_test, n_preds, n_class = get_X_X_Test(valid_folder, test_folder)
    y = y_valid
    # Defining classifier
    xgb = XGBClassifier(max_depth=4,
                        learning_rate=0.05,
                        n_estimators=200,
                        objective='multi:softprob',
                        gamma=0.,
                        max_delta_step=0.,
                        subsample=0.9,
                        colsample_bytree=0.9,
                        seed=0)
    xgb.fit(X, y)
    y_pred = xgb.predict_proba(X_test)
    return y_pred
def perform_prediction(training, labels, testing, xgb_votes, rf_votes):
    """ Perform prediction using a combination of XGB and RandomForests. """
    predictions = np.zeros((len(testing), len(set(labels))))
    # Predictions using xgboost.
    for i in range(xgb_votes):
        print('XGB vote %d' % i)
        xgb = XGBClassifier(max_depth=DEPTH_XGB,
                            learning_rate=LEARNING_XGB,
                            n_estimators=ESTIMATORS_XGB,
                            objective='multi:softprob',
                            subsample=SUBSAMPLE_XGB,
                            colsample_bytree=COLSAMPLE_XGB)
        xgb.fit(training, labels)
        predictions += xgb.predict_proba(testing)
    # Predictions using RandomForestClassifier.
    for i in range(rf_votes):
        print('RandomForest vote %d' % i)
        rand_forest = RandomForestClassifier(n_estimators=ESTIMATORS_RF,
                                             criterion=CRITERION_RF,
                                             n_jobs=JOBS_RF,
                                             max_depth=DEPTH_RF,
                                             min_samples_leaf=MIN_LEAF_RF,
                                             bootstrap=True)
        rand_forest.fit(training, labels)
        predictions += rand_forest.predict_proba(testing)
    return predictions
def xgboostinitial_predictor(train_path, test_path, eval_path):
    # Loading the data
    print('Loading the data...')
    train = pd.read_csv(train_path, index_col=0)
    test = pd.read_csv(test_path, index_col=0)
    eval_df = pd.read_csv(eval_path, index_col=0)
    target = train['target'].copy()
    train.drop('target', axis=1, inplace=True)

    # Training model
    print('Model training begins...')
    # xgtrain = xgb.DMatrix(train.values, target.values, missing=np.nan)
    # xgboost_params = {'objective': 'binary:logistic', 'booster': 'gbtree', 'eval_metric': 'logloss', 'eta': 0.01,
    #                   'subsample': 0.5, 'colsample_bytree': 0.5, 'max_depth': 10, 'silent': 0}
    # xgb_model = xgb.train(xgboost_params, xgtrain, learning_rates=0.3)
    xgb_model = XGBClassifier(max_depth=6, learning_rate=0.3, n_estimators=25,
                              objective='binary:logistic', subsample=0.5,
                              colsample_bytree=0.5, seed=0)
    xgb_model.fit(train.as_matrix(), target.tolist())

    # Running the model
    print('Making predictions....')
    # xgtest = xgb.DMatrix(test.values)
    # xgeval = xgb.DMatrix(eval_df)
    test_preds = xgb_model.predict_proba(test.as_matrix())
    eval_preds = xgb_model.predict_proba(eval_df.as_matrix())

    print('Cleaning predictions to match expected format....')
    test_output = pd.DataFrame(test_preds, index=test.index)
    print(test_output.columns)
    test_output = test_output[1]
    test_output.columns = ['PredictedProb']
    eval_output = pd.DataFrame(eval_preds, index=eval_df.index)
    eval_output = eval_output[1]
    eval_output.columns = ['PredictedProb']
    return test_output, eval_output
def train_classifier(X, y, clf_name='xgb'):
    if clf_name == 'xgb':
        clf = XGBClassifier(
            n_estimators=ESTIMATORS_XG,
            objective=OBJECTIVE_XG,
            max_depth=DEPTH_XG,
            learning_rate=LEARNING_RATE_XG,
            subsample=SUBSAMPLE_XG,
            colsample_bytree=COLSAMPLE_BYTREE_XG,
            seed=0,
        )
    else:
        clf = RandomForestClassifier(
            n_estimators=ESTIMATORS_RF,
            criterion=CRITERION_RF,
            n_jobs=JOBS_RF,
            max_depth=DEPTH_RF,
            min_samples_leaf=MIN_LEAF_RF,
            min_samples_split=MIN_SPLIT_RF,
            max_features=MAX_FEATURES_RF,
            bootstrap=True,
        )
    clf.fit(X, y)
    return clf
def get_xgboost_classifier(X_train, y_train, X_val, y_val, params=None, tag=""):
    param_grid = {'max_depth': [3, 5, 7],
                  'min_child_weight': [1, 3, 5],
                  'n_estimators': [50]}
    if params is None:
        xgb = XGBClassifier(learning_rate=0.2, objective='binary:logistic', seed=27)
        t = start("training xgboost ")
        # pre-0.18 sklearn API (cross_validation / grid_search modules)
        cv = cross_validation.ShuffleSplit(X_train.shape[0], n_iter=10,
                                           test_size=0.2, random_state=123)
        clf = grid_search.GridSearchCV(xgb, param_grid, cv=cv, n_jobs=1, scoring='roc_auc')
        clf = clf.fit(X_train, y_train)
        report(t, nitems=10 * len(param_grid))
        print("Best score:{} with scorer {}".format(clf.best_score_, clf.scorer_))
        print("With parameters:")
        best_parameters = clf.best_estimator_.get_params()
        for param_name in sorted(param_grid.keys()):
            print('\t%s: %r' % (param_name, best_parameters[param_name]))
    else:
        clf = XGBClassifier(**params)
        clf.fit(X_train, y_train,
                eval_set=[(X_train, y_train), (X_val, y_val)],
                eval_metric='auc',
                verbose=False)
        if plot_cv_curves:
            train = clf.evals_result()['validation_0']['auc']
            val = clf.evals_result()['validation_1']['auc']
            plot_cv_curve(train, val, tag)
        if plot_feature_importance:
            plot_feature_importance(clf, tag)
    return clf
plt.xlabel('Predicted label')

# define X, y
X, y = (data.loc[:, data.columns != 'state'].values,
        data.loc[:, data.columns == 'state'].values)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# ClusterCentroids undersampling
cc = ClusterCentroids(random_state=0)
os_X, os_y = cc.fit_sample(X_train, y_train)

# XGBoost
clf_XG = XGBClassifier(learning_rate=0.3, min_child_weight=1, max_depth=6,
                       gamma=0, subsample=1, max_delta_step=0, colsample_bytree=1,
                       reg_lambda=1, n_estimators=100, seed=1000,
                       scale_pos_weight=1000)
clf_XG.fit(os_X, os_y,
           eval_set=[(os_X, os_y), (X_test, y_test)],
           eval_metric='auc', verbose=False)
evals_result = clf_XG.evals_result()
y_true, y_pred = y_test, clf_XG.predict(X_test)

# F1 score, precision, recall, specificity, G score
print("F1_score : %.4g" % metrics.f1_score(y_true, y_pred))
print("Recall : %.4g" % metrics.recall_score(y_true, y_pred))
recall = metrics.recall_score(y_true, y_pred)
print("Precision : %.4g" % metrics.precision_score(y_true, y_pred))

# Compute confusion matrix
cnf_matrix = confusion_matrix(y_test, y_pred)
np.set_printoptions(precision=2)
specificity = float(cnf_matrix[0, 0]) / (cnf_matrix[0, 0] + cnf_matrix[0, 1])
print("Specificity: ", specificity)
# note: the conventional G-mean is sqrt(recall * specificity); this keeps the
# original ratio-based computation
print("G score: ", math.sqrt(recall / specificity))
# Splitting train and test
vals = df_all.values
X = vals[:piv_train]
le = LabelEncoder()
y = le.fit_transform(labels)
X_test = vals[piv_train:]

# In[ ]:

# Classifier
xgb = XGBClassifier(max_depth=6, learning_rate=0.3, n_estimators=25,
                    objective='multi:softprob', subsample=0.5,
                    colsample_bytree=0.5, seed=0)
xgb.fit(X, y)
y_pred = xgb.predict_proba(X_test)

# In[ ]:

ids = []  # list of ids
cts = []  # list of countries
for i in range(len(id_test)):
    idx = id_test[i]
    ids += [idx] * 5
    cts += le.inverse_transform(np.argsort(y_pred[i])[::-1])[:5].tolist()

# Generate submission
sub = pd.DataFrame(np.column_stack((ids, cts)), columns=['id', 'country'])
sub.to_csv('sub0.csv', index=False)
# reg_alpha=0.1,
# seed=27)
# modelfit(xgb1, df_train, predictors, targetname, early_stopping_rounds=50)
xgb1 = XGBClassifier(learning_rate=0.01,
                     n_estimators=700,
                     max_depth=5,
                     min_child_weight=8,
                     gamma=0.3,
                     subsample=0.8,
                     colsample_bytree=0.8,
                     objective='binary:logistic',
                     scale_pos_weight=1,
                     seed=27)
xgb1.fit(df_train[predictors], df_train[targetname])
df_test['target'] = xgb1.predict(df_test[predictors])
df_test['target'] = df_test['target'].apply(lambda x: 'Y' if x == 1 else 'N')
submission = pd.DataFrame()
submission['Loan_ID'] = df_test['Loan_ID']
submission['Loan_Status'] = df_test['target']
submission.to_csv('submission_XGB_retunned.csv', index=False)
"signup_app", "first_device_type", "first_browser", ] X = split_categorical_variables(train, categorical_variables) y = X.pop("country_destination") label_table = LabelEncoder() y = label_table.fit_transform(y.values) # # Let's try a gradiant boost classifier # In[56]: xgb_model = XGBClassifier(max_depth=3, n_estimators=10, learning_rate=0.1) xgb_model.fit(X, y) # ## How did we do? # # * To start, let's look at how well we did just predicting the final outcome pred = xgb_model.predict_proba(X) # Find the most probable country best_country = [] # Not used for now bestId = [] for i in range(len(pred)): bestId.append(np.argsort(pred[i])[::-1]) best_country.append(label_table.inverse_transform(bestId[-1]))
for iter in range(iterations):
    # if iter < 5:
    #     continue
    X_train = Xy[iter][0]
    X_test = Xy[iter][1]
    y_train = Xy[iter][2]
    y_test = Xy[iter][3]
    y_train2 = le.transform(y_train)
    y_test2 = le.transform(y_test)
    print('fit start', datetime.now())
    clf = XGBClassifier(max_depth=max_depth,
                        learning_rate=learning_rate,
                        n_estimators=n_estimators,
                        objective='multi:softprob',
                        subsample=ss_cs,
                        colsample_bytree=ss_cs,
                        gamma=gamma,
                        min_child_weight=min_child_weight,
                        seed=0,
                        silent=True,
                        reg_lambda=reg_lambda,
                        reg_alpha=reg_alpha,
                        nthread=nthread)
    clf.fit(X_train, y_train, eval_set=[(X_test, y_test2)], eval_metric=calculate_score_2)

submit = 0
if submit == 1:
    # n_estimators = 395
    n_estimators = 349
    # n_estimators = clf.booster().best_ntree_limit
    print(n_estimators)
    print('fit start', datetime.now())
    clf2 = XGBClassifier(max_depth=max_depth,
                         learning_rate=learning_rate,
                         n_estimators=n_estimators,
                         objective='multi:softprob',
                         subsample=ss_cs,
                         colsample_bytree=ss_cs,
                         gamma=gamma,
                         min_child_weight=min_child_weight,
                         seed=0,
                         silent=True,
                         reg_lambda=reg_lambda,
                         reg_alpha=reg_alpha,
                         nthread=nthread)
    clf2.fit(X, y)
    # clf2.fit(X, y, eval_set=[(X, y2)], eval_metric=calculate_score_dummy, early_stopping_rounds=n_estimators)
    y_predicted = clf2.predict_proba(X_predict)
X_va = data_valid.values
y_va = y_valid
model = XGBClassifier(n_estimators=1,
                      learning_rate=0.1,
                      max_depth=1000,
                      min_child_weight=1000,
                      reg_lambda=0,
                      seed=12)
for cb in [0.1, 1.]:
    # the original printed "colsample_bytree" while actually setting
    # colsample_bylevel; the label is corrected to match the behavior
    print('\ncolsample_bylevel: %.1f' % cb)
    model.colsample_bylevel = cb
    model.fit(X_tr, y_tr,
              eval_set=[(X_tr, y_tr), (X_va, y_va)],
              eval_metric='auc',
              verbose=True)

y_train = y_train.astype(int)
n = data_train.shape[0]
n = 327690
dtrain = xgb.DMatrix(data_train.values[:n], label=y_train[:n])
param2 = {'objective': 'binary:logistic', 'tree_method': 'approx',
          'sketch_eps': 0.00392, 'eta': .1, 'min_child_weight': 10,
          'max_depth': 10, 'lambda': 0, 'eval_metric': ['logloss', 'auc'],
          'nthread': 2, 'seed': 123, 'silent': 1}
# a second, simpler parameter set overrides the first; it was truncated in the
# original and is closed minimally here
param2 = {'objective': 'binary:logistic', 'eta': .1,
          'max_depth': 10,  # 'lambda': 0,
          'eval_metric': ['logloss', 'auc']}
num_rounds = 206
z = []
dtrain = xgb.DMatrix(train[features], label=y)
clf = xgb.train(params, dtrain, num_rounds)
importance = clf.get_fscore(fmap='xgb.fmap')
importance = sorted(importance.items(), key=operator.itemgetter(1))
df = pd.DataFrame(importance, columns=['feature', 'fscore'])
df['fscore'] = df['fscore'] / df['fscore'].sum()
bst = list(df['feature'][df.fscore > 0.001])
# df.to_csv('select.csv', index=False)
X_train, X_valid, y_train, y_valid = train_test_split(train[bst], y,
                                                      test_size=0.6,
                                                      random_state=10)
print('start xgboost learning...')
alg = XGBClassifier(max_depth=6,
                    learning_rate=0.05,
                    n_estimators=1210,
                    objective='multi:softprob',
                    subsample=0.8,
                    colsample_bytree=1,
                    min_child_weight=1)
alg.fit(X_train, y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        eval_metric='mlogloss',
        early_stopping_rounds=10,
        verbose=True)
# plt.figure()
# df.plot()
# df.plot(kind='barh', x='feature', y='fscore', legend=False, figsize=(6, 10))
# plt.title('XGBoost Feature Importance')
# plt.xlabel('relative importance')
# plt.gcf().savefig('feature_importance_xgb.png')
y_pred = alg.predict_proba(test[bst])
result = pd.DataFrame(y_pred, columns=['predict_0', 'predict_1', 'predict_2'])
result['id'] = test.id.values.copy()
# result.to_csv('xgb10.csv', index=False)
def build_model(X, y):
    print("Fitting classifier")
    xgb = XGBClassifier(max_depth=4,
                        learning_rate=0.25,
                        n_estimators=25,
                        objective='multi:softprob',
                        subsample=0.6,
                        colsample_bytree=0.6)
    xgb.fit(X, y)
    return xgb
data.lon.unique().shape
data_x = pd.get_dummies(data.action_type, prefix="action_type")
cols = ["combined_shot_type", "game_event_id", "period", "playoffs",
        "shot_type", "shot_zone_area", "shot_zone_basic", "shot_zone_range",
        "matchup", "opponent", "game_date", "shot_distance",
        "minutes_remaining", "seconds_remaining", "loc_x", "loc_y"]
for col in cols:
    data_x = pd.concat([data_x, pd.get_dummies(data[col], prefix=col)], axis=1)
train_x = data_x[-pd.isnull(data.shot_made_flag)]
test_x = data_x[pd.isnull(data.shot_made_flag)]
train_y = data.shot_made_flag[-pd.isnull(data.shot_made_flag)]
clf = XGBClassifier(max_depth=6, learning_rate=0.01, n_estimators=550,
                    subsample=0.5, colsample_bytree=0.5, seed=0)
clf.fit(train_x, train_y)
y_pred = clf.predict(train_x)
print("Number of mislabeled points out of a total %d points : %d"
      % (train_x.shape[0], (train_y != y_pred).sum()))


def logloss(act, pred):
    epsilon = 1e-15
    pred = sp.maximum(epsilon, pred)
    pred = sp.minimum(1 - epsilon, pred)
    ll = sum(act * sp.log(pred) + sp.subtract(1, act) * sp.log(sp.subtract(1, pred)))
    ll = ll * -1.0 / len(act)
    print(ll)
    return ll


logloss(train_y, clf.predict_proba(train_x)[:, 1])
test_y = clf.predict_proba(test_x)[:, 1]
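# --- The hand-rolled logloss above matches sklearn's log_loss up to the
# --- epsilon clipping; a quick check on toy values (an illustration, not
# --- from the source):
import numpy as np
from sklearn.metrics import log_loss

act = np.array([0, 1, 1, 0])
prob = np.array([0.1, 0.8, 0.65, 0.3])
print(log_loss(act, prob))  # ~0.279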
def xgbost(x, y, targetx):
    clf_xgb = XGBClassifier(n_estimators=1000,
                            max_depth=6,
                            learning_rate=0.0075,
                            subsample=0.7,
                            colsample_bytree=0.7,
                            seed=4)
    clf_xgb.fit(x, y)
    return clf_xgb.predict_proba(targetx)[:, 1]
# subsample : float, subsample ratio of the training instances
# colsample_bytree : float, subsample ratio of columns when constructing each tree
# seed : int, random number seed
xgb = XGBClassifier(max_depth=6, learning_rate=0.1, n_estimators=30,
                    objective='multi:softprob', subsample=0.8,
                    colsample_bytree=0.8, min_child_weight=1, seed=0)
# fit on the features and encoded labels
eval_set = [(X, y)]
xgb.fit(X, y, eval_set=eval_set, eval_metric='mlogloss')
# predict_proba gives the probability of a user belonging to each class
# (country); it outputs a numpy array of shape (n_samples, n_classes)
Ypred = xgb.predict_proba(X_test)
# Taking the 5 classes with highest probabilities
IDS = []  # list of ids
cts = []  # list of countries
for i in range(len(id_test)):
    idx = id_test[i]
    IDS += [idx] * 5
    # inverse_transform maps encoded labels back to country names
    cts += le.inverse_transform(np.argsort(Ypred[i])[::-1])[:5].tolist()
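# --- The top-5 selection idiom used above, in isolation: argsort gives
# --- ascending order, so it is reversed and sliced (toy probabilities;
# --- the class names are placeholders):
import numpy as np

proba_row = np.array([0.05, 0.4, 0.1, 0.3, 0.15])
classes = np.array(['AU', 'US', 'FR', 'IT', 'DE'])
print(classes[np.argsort(proba_row)[::-1]][:5])  # ['US' 'IT' 'DE' 'FR' 'AU']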
print(data_train.shape)
print(data_test.shape)
print('Started computing train set labels')
label_set = np.sign(label_set['Click'])
label_set[label_set == -1] = 0
print('Finished computing train set labels')

# fit estimator
print("start XGBClassifier")
n_samples = data_train.shape[0]
est = XGBClassifier(n_estimators=200, learning_rate=0.1, silent=False)
print("start fitting")
est.fit(data_train, label_set)

# predict class probabilities
probs = est.predict_proba(data_test)

print("cross validation start")
cv = cross_validation.ShuffleSplit(n_samples, n_iter=10, random_state=0)
scores = cross_validation.cross_val_score(est, data_train, label_set, cv=cv)
mean = np.mean(probs[:, 1])
std = np.std(probs[:, 1])
print("Test predicted Mean:", mean)
print("Test predicted STD:", std)
df = pd.DataFrame(probs[:, 1])
df.columns = ["Prediction"]
df.index += 1
df.to_csv("output_prediction.csv", index_label="Id")
def xgboost_algorithm(XTrain, YTrain, XTest):
    xgb = XGBClassifier(max_depth=6,
                        learning_rate=0.3,
                        n_estimators=25,
                        objective='multi:softprob',
                        subsample=0.5,
                        colsample_bytree=0.5,
                        seed=0)
    xgb.fit(XTrain, YTrain)
    y_pred_xgboost = xgb.predict_proba(XTest)
    return y_pred_xgboost
def model1(df_train, df_test):
    print('model1')
    print('rows', df_train.shape[0])

    # remove rows with no sessions data
    hassessions = df_train['HasSessions']
    df_train = df_train.drop(hassessions[hassessions == 0].index)

    # remove rows older than 1/1/2014
    # dac2 = df_train.date_account_created.apply(lambda x: datetime.strptime(x, '%Y-%m-%d'))
    # print('removing rows', len(dac2[dac2 < datetime.strptime('20140101', '%Y%m%d')].index))
    # df_train = df_train.drop(dac2[dac2 < datetime.strptime('20140101', '%Y%m%d')].index)
    print('rows', df_train.shape[0])

    labels = df_train['country_destination'].values
    df_train = df_train.drop(['country_destination'], axis=1)
    piv_train = df_train.shape[0]

    # Creating a DataFrame with train+test data
    df_all = pd.concat((df_train, df_test), axis=0, ignore_index=True)
    # Removing id and date_first_booking
    df_all = df_all.drop(['id', 'date_first_booking', 'sessions_count', 'HasSessions'], axis=1)
    # Filling nan
    df_all = df_all.fillna(-1)

    ##### Feature engineering #####
    print('features in the csv', df_all.shape[1])

    # date_account_created
    print('dac', datetime.now())
    dac = np.vstack(df_all.date_account_created.astype(str)
                    .apply(lambda x: list(map(int, x.split('-')))).values)
    df_all['dac_year'] = dac[:, 0]
    df_all['dac_month'] = dac[:, 1]
    df_all['dac_day'] = dac[:, 2]

    # day of week, season
    print('dac2', datetime.now())
    dac2 = df_all.date_account_created.apply(lambda x: datetime.strptime(x, '%Y-%m-%d'))
    df_all['dac_weekday'] = dac2.apply(lambda x: x.weekday())
    df_all['dac_season'] = dac2.apply(calculate_season)
    df_all = df_all.drop(['date_account_created'], axis=1)

    # timestamp_first_active
    print('tfa', datetime.now())
    tfa = np.vstack(df_all.timestamp_first_active.astype(str)
                    .apply(lambda x: list(map(int, [x[:4], x[4:6], x[6:8], x[8:10], x[10:12], x[12:14]]))).values)
    df_all['tfa_year'] = tfa[:, 0]
    df_all['tfa_month'] = tfa[:, 1]
    df_all['tfa_day'] = tfa[:, 2]
    df_all = df_all.drop(['timestamp_first_active'], axis=1)

    # Age
    print('age', datetime.now())
    av = df_all.age.values
    df_all['age'] = np.where(np.logical_or(av < 14, av > 100), -1, av)

    # remove features
    print('remove features', datetime.now())
    df_all = df_all.drop(['Sessions' + str(i) for i in [0]], axis=1)
    df_all = df_all.drop(['SessionsD' + str(i) for i in range(456)], axis=1)
    print('features in the model', df_all.shape[1])

    # One-hot-encoding features
    print('one-hot', datetime.now())
    ohe_feats = ['gender', 'signup_method', 'signup_flow', 'language',
                 'affiliate_channel', 'affiliate_provider',
                 'first_affiliate_tracked', 'signup_app', 'first_device_type',
                 'first_browser', 'dac_season', 'sessions_preferred_device']
    for f in ohe_feats:
        df_all_dummy = pd.get_dummies(df_all[f], prefix=f)
        df_all = df_all.drop([f], axis=1)
        df_all = pd.concat((df_all, df_all_dummy), axis=1)

    # Splitting train and test
    vals = df_all.values
    X = vals[:piv_train]
    y = labels
    X_predict = vals[piv_train:]

    # learning_rate, max_depth, ss_cs, gamma, min_child_weight, reg_lambda, reg_alpha = 0.03, 6, 0.5, 2, 2, 2, 1
    learning_rate, max_depth, ss_cs, gamma, min_child_weight, reg_lambda, reg_alpha = 0.03, 8, 0.5, 2, 1, 2, 0
    early_stopping_rounds = 25
    if learning_rate <= 0.03:
        early_stopping_rounds = 50
    print(learning_rate, max_depth, ss_cs, gamma, min_child_weight, reg_lambda, reg_alpha)

    # n_estimators = 455
    n_estimators = 350
    # n_estimators = 1
    print(n_estimators)
    print('fit start', datetime.now())
    clf2 = XGBClassifier(max_depth=max_depth,
                         learning_rate=learning_rate,
                         n_estimators=n_estimators,
                         objective='multi:softprob',
                         subsample=ss_cs,
                         colsample_bytree=ss_cs,
                         gamma=gamma,
                         min_child_weight=min_child_weight,
                         seed=0,
                         silent=True,
                         reg_lambda=reg_lambda,
                         reg_alpha=reg_alpha,
                         nthread=-1)
    clf2.fit(X, y)
    y_predicted2 = clf2.predict_proba(X_predict)
    return y_predicted2
train.drop(x, axis=1, inplace=True)
test.drop(x, axis=1, inplace=True)
y_train = train['TARGET'].values
X_train = train.drop(['ID', 'TARGET'], axis=1).values
y_test = test['ID']
X_test = test.drop(['ID'], axis=1).values
xgb1 = XGBClassifier(learning_rate=0.1,
                     n_estimators=600,
                     max_depth=5,
                     min_child_weight=1,
                     gamma=0,
                     subsample=0.6815,
                     colsample_bytree=0.701,
                     objective='binary:logistic',
                     nthread=4,
                     scale_pos_weight=1,
                     seed=27)
xgtrain = xgb.DMatrix(X_train, label=y_train)
cvresult = xgb.cv(xgb1.get_xgb_params(), xgtrain,
                  num_boost_round=xgb1.get_params()['n_estimators'],
                  nfold=5,
                  metrics=['auc'],
                  early_stopping_rounds=50,
                  show_progress=False)
xgb1.set_params(n_estimators=cvresult.shape[0])
xgb1.fit(X_train, y_train, eval_metric='auc')
output = xgb1.predict_proba(X_test)[:, 1]
submission = pd.DataFrame({"ID": y_test, "TARGET": output})
submission.to_csv("submission.csv", index=False)
del device_freq
del action_freq

# Splitting train and test
vals = df_all.values
X = vals[:piv_train]
le = LabelEncoder()
y = le.fit_transform(labels)
X_test = vals[piv_train:]

# Classifier
xgb = XGBClassifier(max_depth=6, learning_rate=0.25, n_estimators=43,
                    objective='multi:softprob', subsample=0.6,
                    colsample_bytree=0.6, seed=0)
print('scores:', NDCG.cross_validation_score(X, labels, xgb, 5))
'''
xgb.fit(X, y)
y_pred = xgb.predict_proba(X_test)

#Taking the 5 classes with highest probabilities
ids = []  #list of ids
cts = []  #list of countries
for i in range(len(id_test)):
    idx = id_test[i]
    ids += [idx] * 5
    cts += le.inverse_transform(np.argsort(y_pred[i])[::-1])[:5].tolist()

#Generate submission
sub = pd.DataFrame(np.column_stack((ids, cts)), columns=['id', 'country'])
sub.to_csv('sub.csv', index=False)
'''
def do_cell(task):
    df_train, df_test, x_start, y_start = task[0], task[1], task[2], task[3]
    # print('do_cell', df_train.shape, df_test.shape, x_start, y_start)

    # train
    n_places_th_local = n_places_th
    n_places_local = n_places
    if n_places != 0:
        tmp = df_train.shape[0]
        value_counts = df_train.place_id.value_counts()[0:n_places]
        df_train = pd.merge(df_train, pd.DataFrame(value_counts),
                            left_on='place_id', right_index=True)[df_train.columns]
        n_places_th_local = value_counts.values[n_places - 1]
        percentage = df_train.shape[0] / tmp
    elif n_places_th != 0:
        value_counts = df_train.place_id.value_counts()
        n_places_local = value_counts[value_counts >= n_places_th_local].count()
        mask = value_counts[df_train.place_id.values] >= n_places_th_local
        percentage = mask.value_counts()[True] / df_train.shape[0]
        df_train = df_train.loc[mask.values]
    else:
        n_places_th_local = 2
        value_counts = df_train.place_id.value_counts()
        n_places_local = value_counts[value_counts >= n_places_th_local].count()
        mask = value_counts[df_train.place_id.values] >= n_places_th_local
        percentage = mask.value_counts()[True] / df_train.shape[0]
        while percentage > n_places_percentage:
            n_places_th_local += 1
            n_places_local = value_counts[value_counts >= n_places_th_local].count()
            mask = value_counts[df_train.place_id.values] >= n_places_th_local
            percentage = mask.value_counts()[True] / df_train.shape[0]
        n_places_th_local -= 1
        n_places_local = value_counts[value_counts >= n_places_th_local].count()
        mask = value_counts[df_train.place_id.values] >= n_places_th_local
        percentage = mask.value_counts()[True] / df_train.shape[0]
        df_train = df_train.loc[mask.values]

    # print(x_start, y_start, n_places_local, n_places_th_local, percentage)

    # test
    row_ids = df_test.index
    if 'place_id' in df_test.columns:
        df_test = df_test.drop(['place_id'], axis=1)

    le = LabelEncoder()
    y = le.fit_transform(df_train.place_id.values)
    X = df_train.drop(['place_id'], axis=1).values
    X_predict = df_test.values

    score = 0
    n_estimators = 0
    if xgb == 1:
        if xgb_calculate_n_estimators == True:
            clf = XGBClassifier(max_depth=max_depth, learning_rate=learning_rate,
                                n_estimators=5000, objective='multi:softprob',
                                subsample=ss, colsample_bytree=cs, gamma=gamma,
                                min_child_weight=min_child_weight,
                                reg_lambda=reg_lambda, reg_alpha=reg_alpha)
            if train_test == 1:
                X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
                clf.fit(X_train, y_train, eval_set=[(X_test, y_test)],
                        eval_metric=calculate_score,
                        early_stopping_rounds=early_stopping_rounds,
                        verbose=10 if one_cell == 1 else False)
                score = round(1 - clf.booster().best_score, 6)
                n_estimators = clf.booster().best_ntree_limit
            else:
                abc += 1  # undefined name: this deliberately aborts the unused cv path below
                xgb_options = clf.get_xgb_params()
                xgb_options['num_class'] = n_places + 1
                train_dmatrix = DMatrix(X, label=y)
                # some of the classes have less than n_folds, cannot use stratified KFold
                # folds = StratifiedKFold(y, n_folds=n_folds, shuffle=True)
                folds = KFold(len(y), n_folds=n_folds, shuffle=True)
                cv_results = cv(xgb_options, train_dmatrix, clf.n_estimators,
                                early_stopping_rounds=early_stopping_rounds,
                                verbose_eval=10 if one_cell == 1 else False,
                                show_stdv=False, folds=folds, feval=calculate_score)
                n_estimators = cv_results.shape[0]
                score = round(1 - cv_results.values[-1][0], 6)
                std = round(cv_results.values[-1][1], 6)
        else:
            n_estimators = n_estimators_fixed
            clf = XGBClassifier(max_depth=max_depth, learning_rate=learning_rate,
                                n_estimators=n_estimators, objective='multi:softprob',
                                subsample=ss, colsample_bytree=cs, gamma=gamma,
                                min_child_weight=min_child_weight,
                                reg_lambda=reg_lambda, reg_alpha=reg_alpha)
    else:
        clf = RandomForestClassifier(n_estimators=300, n_jobs=-1)
        if rf_calculate_score == True:
            if train_test == 1:
                X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
                y_train2 = le.transform(y_train)
                y_test2 = le.transform(y_test)
                clf.fit(X_train, y_train2)
                y_predict = clf.predict_proba(X_test)
                scores_local = []
                for i in range(X_test.shape[0]):
                    score = calculate_score_per_row(y_predict[i], y_test2[i])
                    scores_local.append(score)
                score = np.array(scores_local).mean()
            else:
                # some of the classes have less than n_folds, cannot use stratified KFold
                # folds = StratifiedKFold(y, n_folds=n_folds, shuffle=True)
                folds = KFold(len(y), n_folds=n_folds, shuffle=True)
                scores_cv = []
                for train, test in folds:
                    X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]
                    y_train2 = le.transform(y_train)
                    y_test2 = le.transform(y_test)
                    clf.fit(X_train, y_train2)
                    y_predict = clf.predict_proba(X_test)
                    scores_local = []
                    for i in range(X_test.shape[0]):
                        score = calculate_score_per_row(y_predict[i], y_test2[i])
                        scores_local.append(score)
                    score = np.array(scores_local).mean()
                    print(' ', x_start, y_start, score)
                    scores_cv.append(score)
                score = np.array(scores_cv).mean()

    # if few_cells == 1 or grid_search == 1:
    #     return [score, None, None]

    clf.fit(X, y)
    y_predict = clf.predict_proba(X_predict)  ##1
    labels_predict = le.inverse_transform(np.argsort(y_predict, axis=1)[:, ::-1][:, :n_topx])
    print(x_start, y_start, score, n_estimators, n_places_local, n_places_th_local, percentage)
    return [score, row_ids, labels_predict]