def gbdt(train, target, test, n):
    """Fit a gradient-boosting regressor via ``ROC`` and export test predictions.

    A ``GradientBoostingRegressor`` with fixed hyper-parameters is handed to
    ``ROC`` (from the project's ``ROC`` module) together with the training
    data. ``ROC`` returns a ``(model, ks)`` pair; ``model`` predicts on
    ``test`` and the predictions are passed to ``writeDatas`` under the tag
    ``"bn<ks>"``.

    Args:
        train: Training features, forwarded unchanged to ``ROC``.
        target: Training labels, forwarded unchanged to ``ROC``.
        test: Data to predict on; also forwarded to ``writeDatas``.
        n: Only used in the progress message, where ``-1 * n`` is printed.

    Returns:
        None.
    """
    # DecisionTreeRegressor and ROC2 are not used below; the imports are kept
    # so that import-time behaviour stays exactly as before.
    from sklearn.tree import DecisionTreeRegressor
    from sklearn.ensemble import GradientBoostingRegressor
    from ROC import ROC, ROC2

    estimator = GradientBoostingRegressor(
        n_estimators=150,
        learning_rate=0.1,
        subsample=0.4,
        max_depth=5,
        random_state=0,
        loss='ls',
    )

    # Joined with single spaces, matching what the print statement emitted.
    print(" ".join(("delete ", str(-1 * n), " feature")))

    # presumably ROC fits the estimator and reports a K-S score -- confirm in ROC.py
    fitted, ks = ROC(estimator, train, target)
    predictions = fitted.predict(test)
    writeDatas(predictions, test, "bn{}".format(ks))
def gbdt_a(n_estimators=300, rate=0.1, max_depth=5, rand_state=0, name='train_data_5'):
    """Load a named dataset, fit a gradient-boosting regressor and export predictions.

    The dataset is obtained from ``getDatas(name)`` as ``(train, target,
    test)``. The configured estimator is passed to ``ROC``, which returns a
    ``(model, ks)`` pair; ``model`` predicts on ``test`` and the result goes
    to ``writeDatas`` tagged with ``ks``.

    Args:
        n_estimators: Number of boosting stages for the regressor.
        rate: Passed as the regressor's ``learning_rate``.
        max_depth: Passed as the regressor's ``max_depth``.
        rand_state: Passed as the regressor's ``random_state``.
        name: Dataset identifier handed to ``getDatas``.

    Returns:
        None.
    """
    train, target, test = getDatas(name)
    print(" ".join(("data :", str(name))))
    print(train.shape)

    # DecisionTreeRegressor, ROC2 and ROC3 are unused here; the imports are
    # retained so import-time behaviour is unchanged.
    from sklearn.tree import DecisionTreeRegressor
    from sklearn.ensemble import GradientBoostingRegressor
    from ROC import ROC, ROC2, ROC3

    settings = dict(
        n_estimators=n_estimators,
        learning_rate=rate,
        max_depth=max_depth,
        random_state=rand_state,
        loss='ls',
    )
    estimator = GradientBoostingRegressor(**settings)

    logger.info("Datas name: %s", name)
    logger.info("n_estimators= %s rate= %s max_depth= %s rand_state= %s",
                n_estimators, rate, max_depth, rand_state)

    # presumably ROC fits the estimator and reports a K-S score -- confirm in ROC.py
    fitted, ks = ROC(estimator, train, target)
    predictions = fitted.predict(test)
    writeDatas(predictions, test, "{}".format(ks))
def XGBoost_part(dtrain=None, test=None, dtest_X=None, test_y=None, k=0,
                 gamma=0.02, min_child_weight=1.1, max_depth=5, lamda=100,
                 subsamp=0.7, col_bytree=0.7, col_bylevel=0.7, eta=0.01):
    """Cross-validate, train and score an XGBoost binary classifier.

    Steps, in order:
      1. ``xgb.cv`` runs on ``dtrain`` (5 folds, at most 3500 rounds, early
         stopping after 50). The CV log is saved under ``./featurescore/``
         and its row count becomes the number of boosting rounds.
      2. ``xgb.train`` fits the final booster on ``dtrain``.
      3. Predictions for ``test`` are handed to ``writeDatas``.
      4. K-S and AUC are computed from predictions on ``dtrain`` against
         ``test_y``, printed, and appended to ``./featurescore/a.txt``.
      5. Feature scores are written to a CSV named after the returned value.

    Args:
        dtrain: Training matrix passed to ``xgb.cv`` / ``xgb.train``.
        test: Raw test data; wrapped in ``xgb.DMatrix`` with ``missing=-9999``.
        dtest_X: Accepted but not used by this function.
        test_y: Labels compared against the ``dtrain`` predictions.
        k: Accepted but not used by this function.
        gamma, min_child_weight, max_depth, eta: Forwarded under the same
            names in the XGBoost parameter dict.
        lamda: Forwarded as ``lambda``.
        subsamp: Forwarded as ``subsample``.
        col_bytree: Forwarded as ``colsample_bytree``.
        col_bylevel: Forwarded as ``colsample_bylevel``.

    Returns:
        int: ``int(ks * 10000000000) % 1000``, which is also used to name the
        feature-score file.
    """
    booster_params = {
        'booster': 'gbtree',
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'gamma': gamma,
        'min_child_weight': min_child_weight,
        'max_depth': max_depth,
        'lambda': lamda,
        'subsample': subsamp,
        'colsample_bytree': col_bytree,
        'colsample_bylevel': col_bylevel,
        'eta': eta,
        'tree_method': 'exact',
        'seed': 0,
        'nthread': 12,
    }

    # The number of rows in the CV log decides how many rounds to train for.
    cv_history = xgb.cv(booster_params, dtrain, num_boost_round=3500, nfold=5,
                        early_stopping_rounds=50, seed=0)
    num_round = cv_history.shape[0]
    cv_history.to_csv('./featurescore/cvg{}.csv'.format(str(num_round)))

    bst = xgb.train(booster_params, dtrain, num_round,
                    evals=[(dtrain, 'train')], early_stopping_rounds=50)

    # Predictions for the submission data.
    preds = bst.predict(xgb.DMatrix(test, missing=-9999))

    # NOTE(review): scores come from dtrain, so test_y has to line up with the
    # rows of dtrain; dtest_X is never consulted -- confirm this is intended.
    scores = bst.predict(dtrain, ntree_limit=bst.best_ntree_limit)
    fp, tp, _ = metrics.roc_curve(test_y, scores, pos_label=1)
    ks = KS(y=test_y, score=scores)
    kk = int(ks * 10000000000) % 1000

    print("K-S:{}".format(ks))
    auc = metrics.auc(fp, tp)
    print("AUC:{}".format(auc))

    with open('./featurescore/a.txt', 'a') as log_file:
        summary = "".join([
            "gamma= ", str(gamma),
            " min_child_weight= ", str(min_child_weight),
            " max_depth= ", str(max_depth),
            " lamda= ", str(lamda),
            "\n",
            "subsamp= ", str(subsamp),
            " col_bytree= ", str(col_bytree),
            " col_bylevel= ", str(col_bylevel),
            " eta= ", str(eta),
            " ntree= ", str(bst.best_ntree_limit),
            "\nfeatures scores: ", str(kk),
        ])
        log_file.write("{}\n".format(summary))
        log_file.write("K-S:{}\n".format(ks))
        log_file.write("AUC:{}\n\n".format(auc))

    # Write the predictions to file.
    writeDatas(preds, test, "xgk{}".format(str(ks)))

    # Feature scores, highest first.
    ranked = sorted(bst.get_fscore().items(),
                    key=lambda item: item[1], reverse=True)
    rows = ["{0},{1}\n".format(feature, score) for feature, score in ranked]

    print(" ".join(("features scores:", str(kk))))
    score_path = './featurescore/feature_score_{0}.csv'.format(kk)
    with open(score_path, 'w') as score_file:
        score_file.write("feature,score\n")
        score_file.writelines(rows)
    return kk
def XGBoost_(dtrain=None, test=None, dtest_X=None, test_y=None, k=0,
             gamma=0.1, min_child_weight=1.1, max_depth=5, lamda=10,
             subsamp=0.7, col_bytree=0.7, col_bylevel=0.7, eta=0.01):
    """Train an XGBoost binary classifier for a fixed 1500 rounds and score it.

    The booster is trained on ``dtrain``, predictions for ``test`` are handed
    to ``writeDatas``, and K-S / AUC are computed from predictions on
    ``dtest_X`` against ``test_y``. Metrics are printed and appended to
    ``./featurescore/a.txt``; feature scores are written to
    ``./featurescore/feature_score5_<k>.csv``.

    NOTE(review): a later ``def XGBoost_`` in this file rebinds this name, so
    this version is not reachable as ``XGBoost_`` once the module has loaded.

    Args:
        dtrain: Training matrix passed to ``xgb.train``.
        test: Raw test data; wrapped in ``xgb.DMatrix``.
        dtest_X: Evaluation data passed straight to ``bst.predict``
            (presumably already an ``xgb.DMatrix`` -- confirm with callers).
        test_y: Labels matching ``dtest_X``.
        k: Suffix for the feature-score file name.
        gamma, min_child_weight, max_depth, eta: Forwarded under the same
            names in the XGBoost parameter dict.
        lamda: Forwarded as ``lambda``.
        subsamp: Forwarded as ``subsample``.
        col_bytree: Forwarded as ``colsample_bytree``.
        col_bylevel: Forwarded as ``colsample_bylevel``.

    Returns:
        list: Names of the features whose score is at least 10, ordered by
        descending score.
    """
    booster_params = {
        'booster': 'gbtree',
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'gamma': gamma,
        'min_child_weight': min_child_weight,
        'max_depth': max_depth,
        'lambda': lamda,
        'subsample': subsamp,
        'colsample_bytree': col_bytree,
        'colsample_bylevel': col_bylevel,
        'eta': eta,
        'tree_method': 'exact',
        'seed': 0,
        'nthread': 12,
    }
    num_round = 1500
    bst = xgb.train(booster_params, dtrain, num_round,
                    evals=[(dtrain, 'train')])

    # Predictions for the submission data.
    # NOTE(review): no missing= sentinel is passed here, unlike the other
    # XGBoost helpers in this file -- confirm that is deliberate.
    preds = bst.predict(xgb.DMatrix(test))

    scores = bst.predict(dtest_X)
    fp, tp, _ = metrics.roc_curve(test_y, scores, pos_label=1)
    # K-S statistic: largest gap between the TPR and FPR curves.
    ks = np.max(tp - fp)

    print("K-S:{}".format(ks))
    auc = metrics.auc(fp, tp)
    print("AUC:{}".format(auc))

    with open('./featurescore/a.txt', 'a') as log_file:
        summary = "".join([
            "gamma= ", str(gamma),
            " min_child_weight= ", str(min_child_weight),
            " max_depth= ", str(max_depth),
            " lamda= ", str(lamda),
            " subsamp= ", str(subsamp),
            " col_bytree= ", str(col_bytree),
            " col_bylevel= ", str(col_bylevel),
            " eta= ", str(eta),
        ])
        log_file.write("{}\n".format(summary))
        log_file.write("K-S:{}\n".format(ks))
        log_file.write("AUC:{}\n\n".format(auc))

    # Write the predictions to file.
    writeDatas(preds, test, "xg{}".format(str(min_child_weight) + str(ks)))

    # Feature scores, highest first.
    ranked = sorted(bst.get_fscore().items(),
                    key=lambda item: item[1], reverse=True)
    rows = ["{0},{1}\n".format(feature, score) for feature, score in ranked]
    strong_features = [feature for feature, score in ranked if score >= 10]

    with open('./featurescore/feature_score5_{0}.csv'.format(k), 'w') as score_file:
        score_file.write("feature,score\n")
        score_file.writelines(rows)
    return strong_features
def XGBoost_(train=None, y=None, test=None, dtest_X=None, test_y=None, k=0,
             num_round=3500, gamma=0.02, min_child_weight=1.1, max_depth=5,
             lamda=10, scale_pos_weight=3, subsamp=0.7, col_bytree=0.7,
             col_bylevel=0.7, eta=0.01, file="aac"):
    """Train an XGBoost binary classifier with early stopping and log the run.

    The booster is trained on ``train``/``y`` with ``dtest_X`` as the early
    stopping set (custom metric ``evalerror``, maximised). Afterwards the
    function:

      * appends hyper-parameters, K-S and AUC to ``./test/a<file>.txt``;
      * hands the predictions for ``test`` to ``writeDatas``;
      * stacks train and test predictions into one frame, merges it on
        ``userid`` with any earlier ``./test/y/a<file>.csv`` and rewrites
        that file;
      * writes the feature scores to ``./test/feature_score_<kk>.csv``.

    This definition replaces any earlier ``XGBoost_`` defined above it in
    the file.

    Args:
        train: Training features; wrapped in ``xgb.DMatrix`` (missing=-9999).
        y: Training labels as a DataFrame with a ``label`` column. It is no
            longer modified in place; a copy receives the prediction column.
        test: Raw test data; wrapped in ``xgb.DMatrix`` (missing=-9999).
        dtest_X: Evaluation matrix used in the watchlist and for scoring.
        test_y: Evaluation labels; ``test_y.label`` is passed to ``KS``.
        k: Accepted but not used by this function.
        num_round: Upper bound on boosting rounds.
        gamma, min_child_weight, max_depth, eta: Forwarded under the same
            names in the XGBoost parameter dict.
        lamda: Forwarded as ``lambda``.
        scale_pos_weight: Written to the log only (see the note in the body).
        subsamp: Forwarded as ``subsample``.
        col_bytree: Forwarded as ``colsample_bytree``.
        col_bylevel: Forwarded as ``colsample_bylevel``.
        file: Suffix for the log and history file names (the name shadows
            the Python 2 builtin but is part of the public signature).

    Returns:
        None.
    """
    # No 'eval_metric' entry: the metric is supplied through feval=evalerror.
    # NOTE(review): scale_pos_weight is logged below but is not part of this
    # dict, so it does not influence training. Adding it would change model
    # output for every caller -- confirm the intent before wiring it in.
    param = {
        'booster': 'gbtree',
        'objective': 'binary:logistic',
        'gamma': gamma,
        'min_child_weight': min_child_weight,
        'max_depth': max_depth,
        'lambda': lamda,
        'subsample': subsamp,
        'colsample_bytree': col_bytree,
        'colsample_bylevel': col_bylevel,
        'eta': eta,
        'tree_method': 'exact',
        'seed': 0,
        'nthread': 12,
    }
    log_path = './test/a{}.txt'.format(file)
    history_path = "./test/y/a{}.csv".format(file)

    # Record the hyper-parameters before training, so that a run which fails
    # later still leaves a trace in the log.
    with open(log_path, 'a') as f:
        S = "".join([
            "gamma= ", str(gamma),
            " scale_pos_weight= ", str(scale_pos_weight),
            " min_child_weight= ", str(min_child_weight),
            " max_depth= ", str(max_depth),
            " lamda= ", str(lamda),
            "\n",
            "subsamp= ", str(subsamp),
            " col_bytree= ", str(col_bytree),
            " col_bylevel= ", str(col_bylevel),
            " eta= ", str(eta),
        ])
        f.write("{}\n".format(S))

    dtrain = xgb.DMatrix(train, label=y, missing=-9999)
    watchlist = [(dtrain, 'train'), (dtest_X, 'eval')]
    bst = xgb.train(param, dtrain, num_round, watchlist, maximize=True,
                    feval=evalerror, early_stopping_rounds=50)

    # Every prediction uses the early-stopped tree count.
    best_limit = bst.best_ntree_limit
    dtest = xgb.DMatrix(test, missing=-9999)
    preds = bst.predict(dtest, ntree_limit=best_limit)
    train_preds = bst.predict(dtrain, ntree_limit=best_limit)
    scores = bst.predict(dtest_X, ntree_limit=best_limit)

    fp, tp, _ = metrics.roc_curve(test_y, scores, pos_label=1)
    auc = metrics.auc(fp, tp)
    ks = KS(y=test_y.label, pred=scores)
    # Short integer tag derived from the K-S value; used in file/column names.
    kk = int(ks * 10000000000) % 10000
    print("K-S:{}".format(ks))
    print("AUC:{}".format(auc))

    with open(log_path, 'a') as f:
        S = "".join([
            " best_ntree_limit:", str(best_limit),
            " best_iteration= ", str(bst.best_iteration),
            "\nfeatures scores: ", str(kk),
        ])
        f.write("{}\n".format(S))
        f.write("K-S:{}\n".format(ks))
        f.write("AUC:{}\n\n".format(auc))

    score_column = 'label' + str(kk)
    res = writeDatas(preds, test, "xgk_{}".format(str(kk)))
    res.columns = [score_column]

    # Work on a copy: adding the prediction column to the caller's frame
    # would leave an extra column in it, corrupting `label=y` on the next call.
    train_scores = y.copy()
    train_scores[score_column] = train_preds
    combined = pd.concat([train_scores, res])
    combined = combined.drop('label', axis=1).reset_index()

    # Best-effort merge with earlier runs: the history file does not exist on
    # the first run. Failures are reported instead of silently discarded, and
    # KeyboardInterrupt / SystemExit are no longer swallowed.
    try:
        previous = pd.read_csv(history_path)
        combined = pd.merge(combined, previous, on='userid')
    except Exception as exc:
        print("history merge skipped for {}: {!r}".format(history_path, exc))
    combined.to_csv(history_path, index=None)

    # Feature scores, highest first.
    feature_score = sorted(bst.get_fscore().items(),
                           key=lambda item: item[1], reverse=True)
    fs = ["{0},{1}\n".format(key, value) for key, value in feature_score]
    print(" ".join(("features scores:", str(kk))))
    ff = './test/feature_score_{0}.csv'.format(kk)
    with open(ff, 'w') as f:
        f.write("feature,score\n")
        f.writelines(fs)
random_state=0) rf.fit(train_X, train_y) score = rf.predict_proba(test_X)[:, 1] fp, tp, thresholds = metrics.roc_curve(test_y.values, score, pos_label=1) ks = KS(y=test_y, score=score) print "K-S:{}".format(ks) print "AUC:{}".format(metrics.auc(fp, tp)) ans = rf.predict_proba(test)[:, 1] with open('./featurescore/a.txt', 'a') as f: S = "criterion= " + str(c) + \ " n_estimators= " + str(n) + \ " max_depth= " + str(md) f.writelines("{}\n".format(S)) f.writelines("K-S:{}\n".format(ks)) f.writelines("AUC:{}\n\n".format(metrics.auc(fp, tp))) writeDatas(ans, test, "rf{}".format(str(ks))) except: S = "criterion= " + str(c) + \ " n_estimators= " + str(n) + \ " max_depth= " + str(md) print "Eorr", S pass