def select_model(train_x, train_y, test_x, test_y):
    """Grid-search `n_estimators` for an XGBoost classifier and report KS.

    Runs a 5-fold cross-validated grid search over n_estimators, prints the
    best parameter set and CV score, then prints the KS statistic on both
    the training and test sets.

    Args:
        train_x, train_y: training features and labels.
        test_x, test_y: held-out features and labels.
    """
    cv_params = {'n_estimators': [50, 100, 200, 300, 400]}
    other_params = {'learning_rate': 0.1, 'n_estimators': 50, 'max_depth': 3,
                    'min_child_weight': 20, 'gamma': 0, 'subsample': 0.8,
                    'colsample_bytree': 1.0, 'reg_alpha': 0, 'reg_lambda': 1,
                    'silent': 1, 'scale_pos_weight': 1, 'seed': 0}
    model = xgb.XGBClassifier(**other_params)
    # BUG FIX: 'r2' is a regression metric and is meaningless for selecting
    # a classifier; score the grid search with ROC AUC instead.
    optimized_XGB = GridSearchCV(estimator=model, param_grid=cv_params,
                                 scoring='roc_auc', cv=5, verbose=1, n_jobs=4)
    optimized_XGB.fit(train_x, train_y)
    print("参数的最佳取值:{0}".format(optimized_XGB.best_params_))
    print("最佳模型得分:{0}".format(optimized_XGB.best_score_))
    # NOTE(review): run_ks is fed hard class predictions here, not
    # probabilities — KS is normally computed on scores; verify upstream.
    predict_train_y = optimized_XGB.predict(train_x)
    train_ks = run_ks(train_y, predict_train_y)
    predict_test_y = optimized_XGB.predict(test_x)
    test_ks = run_ks(test_y, predict_test_y)
    # BUG FIX: test_ks was computed but never reported (only train_ks was
    # printed); report both as the commented-out line intended.
    print("训练集ks: %s,测试集ks: %s" % (train_ks, test_ks))
def test_xgb_noin(data, fea_list_path, in_fea, model_path,
                  filter_name=('name', 'idcard', 'phone', 'loan_dt', 'label')):
    """Score a TSV table with a saved booster, zero-filling excluded features.

    Features listed in `fea_list_path` but absent from `in_fea` are forced
    to 0 so the model can still be applied. Prints the KS statistic.

    Args:
        data: path to a tab-separated file containing a 'label' column.
        fea_list_path: file with one feature name per line (full model list).
        in_fea: file with one feature name per line (features to keep as-is).
        model_path: path of a saved xgboost Booster model.
        filter_name: identifier/label columns dropped before scoring
            (tuple default avoids the shared-mutable-default pitfall).
    """
    fea_list = []
    in_fea_list = []
    df = pd.read_csv(data, sep='\t')
    df_label = df['label']
    # Drop identifier/label columns so only features remain.
    for key in filter_name:
        if key in df.keys():
            del df[key]
    if fea_list_path != "":
        # FIX: close the feature-list files deterministically (they were
        # opened inline and never closed).
        with open(fea_list_path) as fh:
            fea_list = [x.strip() for x in fh if len(x.strip()) > 0]
        print(len(fea_list))
    if in_fea != "":
        with open(in_fea) as fh:
            in_fea_list = [x.strip() for x in fh if len(x.strip()) > 0]
        print(len(in_fea_list))
    # Features the model expects but that are not allowed as inputs.
    ss = list(set(fea_list) - set(in_fea_list))
    print(len(ss))
    df = df[fea_list]
    for col in ss:
        df[col] = 0
    # Drop rows whose features are all NaN, keeping the label aligned.
    # (Removed dead locals df_out / f_list that were never used.)
    keep = ~df.isnull().all(axis=1)
    df_label = df_label[keep]
    df = df[keep]
    test_Y = df_label
    clf_xgb = xgb.Booster()
    clf_xgb.load_model(model_path)
    print("Read test done")
    # BUG FIX: a labelled DMatrix was built and immediately discarded, then
    # rebuilt without labels; build the prediction matrix once.
    dTest = xgb.DMatrix(df)
    y_proba = clf_xgb.predict(dTest)
    ks_ = run_ks(test_Y, y_proba)
    print("%f" % (ks_['ks']))
def test_xgb(test_tbl, xgb_model, train_list):
    """Evaluate a fitted sklearn-API XGBoost model on a preprocessed table.

    Prints "AUC<TAB>KS" and writes a KS detail file next to `test_tbl`.

    NOTE(review): a second `test_xgb` defined later in this file shadows
    this definition at import time — consider renaming one of them.

    Args:
        test_tbl: table identifier passed to data_preprocess.
        xgb_model: a fitted classifier exposing predict_proba.
        train_list: feature columns, in the order used at training time.
    """
    df_test_x, df_test_y, f_list_test, df_median = data_preprocess(test_tbl)
    # Keep only the training-time columns, in training order.
    df_test = pd.DataFrame()
    for col in train_list:
        df_test[col] = df_test_x[col]
    print('Read test done')
    test_y = np.array(df_test_y)
    test_x = np.array(df_test)
    # FIX: the local variable was named `xgb`, shadowing the xgboost module;
    # renamed. (Also removed tmp_df/y_score/y_good, which were built but
    # never read.)
    model = xgb_model
    y_proba = model.predict_proba(test_x)
    ks_dict = run_ks(test_y, y_proba[:, 1])
    auc = roc_auc_score(test_y, y_proba[:, 1])
    print("%f\t%f" % (auc, ks_dict['ks']))
    print_ks(ks_dict, test_tbl + '_score_ks_detail')
def test_xgb(data, fea_list_path, model_path,
             filter_name=('name', 'idcard', 'phone', 'loan_dt', 'label')):
    """Score a TSV table with a saved booster and write per-row probabilities.

    Prints AUC and KS, writes a KS detail file (`<data>_score_ks_detail`)
    and a copy of the table with a 'prob' column (`<data>_prob_`).

    Args:
        data: path to a tab-separated file containing a 'label' column.
        fea_list_path: file with one feature name per line; "" keeps all.
        model_path: path of a saved xgboost Booster model.
        filter_name: identifier/label columns dropped before scoring
            (tuple default avoids the shared-mutable-default pitfall).
    """
    fea_list = []
    df = pd.read_csv(data, sep='\t')
    df_label = df['label']
    # NOTE: df_out aliases df at this point, so the column deletions below
    # also apply to the frame that is written out at the end (original
    # behavior, preserved).
    df_out = df
    for key in filter_name:
        if key in df.keys():
            del df[key]
    if fea_list_path != "":
        # FIX: close the feature-list file deterministically (it was opened
        # inline and never closed).
        with open(fea_list_path) as fh:
            fea_list = [x.strip() for x in fh if len(x.strip()) > 0]
        df = df[fea_list]
    # Drop rows whose features are all NaN, keeping label/output aligned.
    keep = ~df.isnull().all(axis=1)
    df_label = df_label[keep]
    df_out = df_out[keep]
    df = df[keep]
    test_Y = df_label
    clf_xgb = xgb.Booster()
    clf_xgb.load_model(model_path)
    print("Read test done")
    # BUG FIX: a labelled DMatrix was constructed and immediately replaced
    # by an unlabelled one; build the prediction matrix once.
    dTest = xgb.DMatrix(df)
    y_proba = clf_xgb.predict(dTest)
    print(y_proba)
    ks_ = run_ks(test_Y, y_proba)
    auc_ = roc_auc_score(test_Y, y_proba)
    print("%f\t%f" % (auc_, ks_['ks']))
    df_out['prob'] = y_proba
    print_ks(ks_, data + r'_score_ks_detail')
    df_out.to_csv(data + r'_prob_', index=False, sep='\t')
def xgb_model(sample_tbl, model_path, xgb_params={'nthread': 4, 'n_estimators': 80, 'max_depth': 3, 'min_child_weight': 2, 'gamma': 0.1, 'subsample': 0.4, 'learning_rate': 0.06, 'colsample_bytree': 0.5, 'scale_pos_weight': 1, 'seed': 100}):
    # 5-fold cross-validated XGBoost training: saves a model, dump, and
    # feature-importance table per fold, then prints mean/std of train and
    # validation KS/AUC.
    # NOTE(review): xgb_params is a mutable default argument; it is only
    # read here (items()/['n_estimators']), so no corruption occurs, but a
    # None-sentinel default would be safer.
    # os.path.join(model_res_path, 'score'), os.path.join(model_res_path, 'fip')
    plst = xgb_params.items()
    f_score = os.path.join(model_path, "score")
    f_write = os.path.join(model_path, "fip")
    # sample_tbl is expected to be a (features, labels, names) triple.
    df_x, df_y, df_name = sample_tbl
    x = np.array(df_x)
    y = np.array(df_y)
    # With tree depth n: node count is 2**(n+1)-1, leaf count is 2**n.
    kf = StratifiedKFold(n_splits=5, shuffle=True)
    auc_list = []
    auc_list_train = []
    ks_list = []
    ks_list_train = []
    times = 0  # 1-based fold counter, used to suffix per-fold artifacts
    for train_index, dev_index in kf.split(x, y):
        # print "KFold: %d\nauc\tks" % times
        times += 1
        x_train, x_dev = x[train_index], x[dev_index]
        y_train, y_dev = y[train_index], y[dev_index]
        dtrain = xgb.DMatrix(x_train, label=y_train)
        dvalid = xgb.DMatrix(x_dev, label=y_dev)
        # evallist = [(dtrain, 'train'), (dvalid, 'eval')]
        #clf_xgb = xgb.train(plst, dtrain, num_boost_round=xgb_params['n_estimators'], evals=evallist)
        clf_xgb = xgb.train(plst, dtrain, num_boost_round=xgb_params['n_estimators'])
        # Persist this fold's model and a human-readable dump.
        clf_xgb.save_model(os.path.join(model_path, 'clf_'+str(times)))
        clf_xgb.dump_model(os.path.join(model_path, 'dump_raw_'+str(times)))
        # Per-feature importances under the three xgboost importance types.
        f_weights_dict = clf_xgb.get_score(importance_type='weight')
        f_gains_dict = clf_xgb.get_score(importance_type='gain')
        f_covers_dict = clf_xgb.get_score(importance_type='cover')
        fea_analysis = []
        for f_key in f_weights_dict:
            # f_key is xgboost's generated name like "f12"; f_key[1:] is the
            # column index into the feature-name list.
            # NOTE(review): f_list is not defined in this function (df_name
            # is unpacked above but unused) — presumably f_list exists at
            # module level; verify, otherwise this raises NameError.
            fea_analysis.append(
                {'feature': f_list[int(f_key[1:])], 'weight': f_weights_dict[f_key],
                 'gain': f_gains_dict[f_key], 'cover': f_covers_dict[f_key]})
        fea_analysis_df = pd.DataFrame(fea_analysis, columns=['feature', 'weight', 'gain', 'cover'])
        fea_analysis_df.sort_values(['gain'], ascending=False, inplace=True)
        fea_analysis_df.to_csv(f_write+"_"+str(times), index=None, sep='\t')
        # Fold metrics on both splits.
        y_proba = clf_xgb.predict(dvalid)
        y_proba_train = clf_xgb.predict(dtrain)
        ks_dict = run_ks(y_dev, y_proba)
        ks_dict_train = run_ks(y_train, y_proba_train)
        auc = roc_auc_score(y_dev, y_proba)
        auc_train = roc_auc_score(y_train, y_proba_train)
        # print "%f\t%f" % (auc, ks)
        ks_list.append(ks_dict['ks'])
        auc_list.append(auc)
        ks_list_train.append(ks_dict_train['ks'])
        auc_list_train.append(auc_train)
    # The following save the LAST fold's artifacts under unsuffixed names.
    # NOTE(review): original indentation was lost in this file; these are
    # assumed to sit after the loop (placing them inside would merely
    # overwrite the same files each fold) — confirm against history.
    print_ks(ks_dict, f_score)
    fea_analysis_df.to_csv(f_write, index=None, sep='\t')
    clf_xgb.save_model(os.path.join(model_path, "clf"))
    clf_xgb.dump_model(os.path.join(model_path, "dump_raw"))
    # Persist per-fold KS/AUC for train and validation splits.
    dd = dict()
    dd['train_ks'] = ks_list_train
    dd['valida_ks'] = ks_list
    dd['train_auc'] = auc_list_train
    dd['valida_auc'] = auc_list
    train_ks_df = pd.DataFrame(dd)
    train_ks_df.to_csv(os.path.join(model_path, "ks_auc"),index=False, sep='\t')
    #params = clf_xgb.get_params()
    #print params
    # Summary statistics across folds.
    ks_mean = np.mean(ks_list)
    ks_var = np.std(ks_list)
    auc_mean = np.mean(auc_list)
    auc_var = np.std(auc_list)
    ks_mean_train = np.mean(ks_list_train)
    ks_var_train = np.std(ks_list_train)
    auc_mean_train = np.mean(auc_list_train)
    auc_var_train = np.std(auc_list_train)
    print('train: ')
    print "ks mean: %f, ks var: %f" % (ks_mean_train, ks_var_train)
    print "auc mean: %f, auc var: %f" % (auc_mean_train, auc_var_train)
    print('validation:')
    print "ks mean: %f, ks var: %f" % (ks_mean, ks_var)
    print "auc mean: %f, auc var: %f" % (auc_mean, auc_var)
def train_xgb(sample_data, model_path, params=None):
    """Train an XGBoost booster and record feature importances and metrics.

    Saves the model and a feature-importance table under `model_path`,
    prints train/validation KS and AUC, and appends one tab-separated
    result line (metrics + params JSON) to `<model_path>/result`.

    Args:
        sample_data: (train_x, train_Y, valid_x, valid_Y, f_list) tuple.
        model_path: directory where model/fip/result files are written.
        params: xgboost parameter dict; defaults to the original fixed
            configuration. (FIX: was a mutable dict default argument;
            replaced with a None sentinel — backward compatible.)
    """
    if params is None:
        params = {'learning_rate': 0.1, 'n_estimators': 50, 'max_depth': 3,
                  'min_child_weight': 20, 'gamma': 0, 'subsample': 0.8,
                  'colsample_bytree': 1.0, 'reg_alpha': 0, 'reg_lambda': 1,
                  'silent': 1, 'scale_pos_weight': 1, 'seed': 0}
    train_x, train_Y, valid_x, valid_Y, f_list = sample_data
    plst = params.items()
    dTrain = xgb.DMatrix(train_x, label=train_Y)
    dValid = xgb.DMatrix(valid_x, label=valid_Y)
    num_round = params['n_estimators']
    model_clf = xgb.train(plst, dTrain, num_round)
    model_clf.save_model(os.path.join(model_path, "model_clf"))
    # Per-feature importances under the three xgboost importance types.
    f_weights_dict = model_clf.get_score(importance_type='weight')
    f_gains_dict = model_clf.get_score(importance_type='gain')
    f_cover_dict = model_clf.get_score(importance_type='cover')
    fea_analysis = [{'feature': f_key,
                     'weight': f_weights_dict[f_key],
                     'gain': f_gains_dict[f_key],
                     'cover': f_cover_dict[f_key]}
                    for f_key in f_weights_dict]
    fea_analysis_df = pd.DataFrame(
        fea_analysis, columns=['feature', 'weight', 'gain', 'cover'])
    fea_analysis_df.to_csv(model_path + r'/fip')
    # Score both splits and report KS / AUC.
    Y_train_prob = model_clf.predict(dTrain)
    Y_valid_prob = model_clf.predict(dValid)
    train_ks = run_ks(train_Y, Y_train_prob)['ks']
    print("train_ks")
    print(train_ks)
    valid_ks = run_ks(valid_Y, Y_valid_prob)['ks']
    print("valid_ks")
    print(valid_ks)
    train_auc = roc_auc_score(train_Y, Y_train_prob)
    valid_auc = roc_auc_score(valid_Y, Y_valid_prob)
    print("train_auc")
    print(train_auc)
    print("valid_auc")
    print(valid_auc)
    # (Removed ks_list/auc_list, which were appended to but never read.)
    str_ = str(train_ks) + '\t' + str(valid_ks) + '\t' + str(
        train_auc) + '\t' + str(valid_auc) + '\t' + json.dumps(params) + '\n'
    print(str_)
    # BUG FIX: the result file was opened and never closed; use a context
    # manager so the appended line is flushed deterministically.
    with open(model_path + r'/result', 'a') as fw:
        fw.write(str_)
    print("run success!")