def xgb_result(x, y, testx, testy, para):
    print("----- Working on 'xgb' method...")
    #dtrain = xgb.DMatrix(x, label=y)
    #dtest = xgb.DMatrix(testx, label=testy)
    xgb0 = XGBClassifier(**para)
    # with open('xgb.pickle', 'rb') as f:
    #     xgb0 = pickle.load(f)
    time0 = time.time()
    #bst = xgb.train(dtrain=dtrain, **para)
    xgb0.fit(x, y)
    train_time = time.time() - time0
    confusion, test_time = Errmodel(xgb0, x, y, testx, testy,
                                    ntree_limit=xgb0.booster().best_iteration)
    print(confusion, '\n', train_time, '\n', test_time)
    importance = sorted(xgb0.booster().get_score().items(), key=lambda x: x[1])
    result = {
        'model': xgb0,
        'confusion': confusion,
        'train_time': train_time,
        'test_time': test_time,
        'importance': importance,
        'best_iter': xgb0.booster().best_iteration
    }
    print("best_iter", xgb0.booster().best_iteration)
    return result
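# The Errmodel helper referenced above is not shown in this snippet. Below is a
# minimal hypothetical sketch of what it might look like, inferred purely from
# the call site (it must return a confusion matrix and the prediction wall-clock
# time); the name and signature are assumptions, not the original implementation.
import time
from sklearn.metrics import confusion_matrix

def Errmodel(model, x, y, testx, testy, ntree_limit=None):
    # Time the test-set prediction; ntree_limit is accepted for call-site
    # compatibility but ignored in this sketch.
    time0 = time.time()
    pred = model.predict(testx)
    test_time = time.time() - time0
    return confusion_matrix(testy, pred), test_time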
def job_function(params):
    learning_rate = params[0]
    max_depth = params[1]
    ss_cs = params[2]
    gamma = params[3]
    min_child_weight = params[4]
    reg_lambda = params[5]
    reg_alpha = params[6]

    early_stopping_rounds = 25
    if learning_rate >= 0.3:
        early_stopping_rounds = 5
    if learning_rate <= 0.03:
        early_stopping_rounds = 50

    scores = []
    for i in range(iterations_per_job):
        X_train = Xy[i][0]
        X_test = Xy[i][1]
        y_train = Xy[i][2]
        y_test = Xy[i][3]
        y_train2 = le.transform(y_train)
        y_test2 = le.transform(y_test)
        clf = XGBClassifier(max_depth=max_depth, learning_rate=learning_rate,
                            n_estimators=5000, objective='multi:softprob',
                            subsample=ss_cs, colsample_bytree=ss_cs, gamma=gamma,
                            min_child_weight=min_child_weight, seed=0, silent=True,
                            reg_lambda=reg_lambda, reg_alpha=reg_alpha)
        clf.fit(X_train, y_train, eval_set=[(X_test, y_test2)],
                eval_metric=calculate_score_2,
                early_stopping_rounds=early_stopping_rounds, verbose=False)
        y_predicted = clf.predict_proba(X_test, ntree_limit=clf.booster().best_ntree_limit)
        score = calculate_score(y_predicted, y_test2)
        scores.append(score)
    avg_score = np.array(scores).mean()
    print(avg_score, params)
    return avg_score
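# job_function depends on module-level globals (Xy, le, iterations_per_job,
# calculate_score, calculate_score_2). A minimal sketch of a driver that an
# optimizer might use; the candidate vectors below are illustrative values,
# not from the source, and the sketch assumes a higher score is better.
candidates = [
    # [learning_rate, max_depth, ss_cs, gamma, min_child_weight, reg_lambda, reg_alpha]
    [0.10, 6, 0.8, 0.0, 1.0, 1.0, 0.0],
    [0.03, 8, 0.7, 0.5, 5.0, 2.0, 0.1],
]
best_params = max(candidates, key=job_function)
print('best params:', best_params)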
def myThreadFunc(ThreadID):
    X_train = Xy[ThreadID][0]
    X_test = Xy[ThreadID][1]
    y_train = Xy[ThreadID][2]
    y_test = Xy[ThreadID][3]
    y_train2 = le.transform(y_train)
    y_test2 = le.transform(y_test)

    clf = XGBClassifier(max_depth=max_depth, learning_rate=learning_rate,
                        n_estimators=5000, objective='multi:softprob',
                        subsample=ss_cs, colsample_bytree=ss_cs, gamma=gamma,
                        min_child_weight=min_child_weight, seed=0, silent=True,
                        reg_lambda=reg_lambda, reg_alpha=reg_alpha)
    clf.fit(X_train, y_train, eval_set=[(X_test, y_test2)],
            eval_metric=calculate_score_2,
            early_stopping_rounds=early_stopping_rounds, verbose=False)
    y_predicted = clf.predict_proba(X_test, ntree_limit=clf.booster().best_ntree_limit)
    score = calculate_score(y_predicted, y_test2)
    print(score, clf.booster().best_ntree_limit)
    train_and_test_scores[ThreadID] = score
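# myThreadFunc writes its result into a shared list indexed by thread ID. A
# minimal driver sketch, assuming one thread per prepared fold in Xy; note that
# CPython's GIL limits the parallelism here to XGBoost's native code.
import threading

n_threads = len(Xy)
train_and_test_scores = [None] * n_threads
threads = [threading.Thread(target=myThreadFunc, args=(i,)) for i in range(n_threads)]
for t in threads:
    t.start()
for t in threads:
    t.join()
print('mean score:', sum(train_and_test_scores) / n_threads)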
def modelfit(params, x, y):
    # Fit the algorithm on the data
    print("fit")
    alg = XGBClassifier(**params)
    alg.fit(x, y, verbose=True)
    feat_imp = pd.Series(alg.booster().get_fscore()).sort_values(ascending=False)
    print(feat_imp)
def extract_leaf_feature(features, targets, train_indexes, params):
    model = XGBClassifier(**params)
    model.fit(features[train_indexes], targets[train_indexes])
    booster = model.booster()
    dmatrix = xgb.DMatrix(features)
    leaf = booster.predict(dmatrix, pred_leaf=True)
    encoder = sklearn.preprocessing.OneHotEncoder()
    leaf_feature = encoder.fit_transform(leaf)
    return leaf_feature
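# The one-hot leaf indices produced above are typically fed to a linear model
# (the classic GBDT-plus-LR encoding trick). A minimal usage sketch, assuming
# numpy arrays `features`/`targets` and an illustrative random split; none of
# this appears in the source.
import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.RandomState(0)
train_idx = rng.rand(len(targets)) < 0.8
leaf_feature = extract_leaf_feature(features, targets, train_idx, params)

lr = LogisticRegression(max_iter=1000)
lr.fit(leaf_feature[train_idx], targets[train_idx])
print('held-out accuracy:', lr.score(leaf_feature[~train_idx], targets[~train_idx]))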
## feature importance
feature_importances = pd.DataFrame(xgb1.feature_importances_,
                                   index=x_train.columns,
                                   columns=['importance']).sort_values('importance', ascending=False)
pt = feature_importances.plot.bar()

r = xgb1.predict(x_test)
l = test_df["loan_id"]
results = pd.DataFrame({"loan_id": l, "m13": r})
results.to_csv(r"D:results_xgb1_200.csv", index=False)

feat_imp = pd.Series(xgb1.booster().get_fscore()).sort_values(ascending=False)
feat_imp.plot(kind='bar', title='Feature Importances')
plt.ylabel('Feature Importance Score')
print(xgb1.feature_importances_)

# plot
from matplotlib import pyplot
pyplot.bar(range(len(xgb1.feature_importances_)), xgb1.feature_importances_)
pyplot.show()

from xgboost import plot_importance
plot_importance(xgb1)
pyplot.show()
def modelfit(train, labels, test, features, useTrainCV=True,
             cv_folds=5, early_stopping_rounds=50):
    model = XGBClassifier(learning_rate=0.2, n_estimators=1000, max_depth=5,
                          min_child_weight=1, gamma=0, subsample=0.8,
                          colsample_bytree=0.8, objective='binary:logistic',
                          scale_pos_weight=1, seed=27)
    test_percent = 0.2
    X_train, X_test, y_train, y_test = train_test_split(train, labels,
                                                        test_size=test_percent,
                                                        random_state=23)
    xgb_param = model.get_xgb_params()
    xgtrain = xgb.DMatrix(X_train[features], y_train)
    xgcv = xgb.DMatrix(X_test[features])
    xgtest = xgb.DMatrix(test[features])
    cvresult = xgb.cv(xgb_param, xgtrain,
                      num_boost_round=model.get_params()['n_estimators'],
                      nfold=cv_folds, metrics='auc',
                      early_stopping_rounds=early_stopping_rounds)
    print("n_estimators=")
    print(cvresult.shape[0])
    model.set_params(n_estimators=cvresult.shape[0])

    # Fit the algorithm on the data
    model.fit(X_train, y_train)

    ## training predictions
    proba = model.predict_proba(X_test)
    preds = proba[:, 1]
    score = roc_auc_score(y_test, preds)
    print("Area under ROC {0}".format(score))

    # Print model report:
    # print("\nModel Report")
    # print("Accuracy : %.4g" % accuracy_score(y_train, preds))
    # print("AUC Score (Train): %f" % roc_auc_score(y_train, preds))

    feat_imp = pd.Series(model.booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')
    # plt.show()

    ## test predictions
    test_proba = model.predict_proba(test)
    test_preds = test_proba[:, 1]
    return test_preds
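# With early stopping, xgb.cv returns one row per surviving boosting round,
# which is why cvresult.shape[0] serves as the tuned n_estimators above. A
# small sketch of inspecting the result, assuming the standard
# 'test-auc-mean' column naming that xgb.cv produces for metrics='auc'.
best_rounds = cvresult.shape[0]
best_auc = cvresult['test-auc-mean'].iloc[-1]
print('best rounds: {0}, cv AUC: {1:.4f}'.format(best_rounds, best_auc))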
def do_cell(task):
    df_train, df_test, x_start, y_start = task[0], task[1], task[2], task[3]
    #print('do_cell', df_train.shape, df_test.shape, x_start, y_start)

    # train
    n_places_th_local = n_places_th
    n_places_local = n_places

    if n_places != 0:
        tmp = df_train.shape[0]
        value_counts = df_train.place_id.value_counts()[0:n_places]
        df_train = pd.merge(df_train, pd.DataFrame(value_counts),
                            left_on='place_id', right_index=True)[df_train.columns]
        n_places_th_local = value_counts.values[n_places - 1]
        percentage = df_train.shape[0] / tmp
    elif n_places_th != 0:
        value_counts = df_train.place_id.value_counts()
        n_places_local = value_counts[value_counts >= n_places_th_local].count()
        mask = value_counts[df_train.place_id.values] >= n_places_th_local
        percentage = mask.value_counts()[True] / df_train.shape[0]
        df_train = df_train.loc[mask.values]
    else:
        n_places_th_local = 2
        value_counts = df_train.place_id.value_counts()
        n_places_local = value_counts[value_counts >= n_places_th_local].count()
        mask = value_counts[df_train.place_id.values] >= n_places_th_local
        percentage = mask.value_counts()[True] / df_train.shape[0]

        while percentage > n_places_percentage:
            n_places_th_local += 1
            n_places_local = value_counts[value_counts >= n_places_th_local].count()
            mask = value_counts[df_train.place_id.values] >= n_places_th_local
            percentage = mask.value_counts()[True] / df_train.shape[0]

        n_places_th_local -= 1
        n_places_local = value_counts[value_counts >= n_places_th_local].count()
        mask = value_counts[df_train.place_id.values] >= n_places_th_local
        percentage = mask.value_counts()[True] / df_train.shape[0]

        df_train = df_train.loc[mask.values]

    #print(x_start, y_start, n_places_local, n_places_th_local, percentage)

    # test
    row_ids = df_test.index
    if 'place_id' in df_test.columns:
        df_test = df_test.drop(['place_id'], axis=1)

    le = LabelEncoder()
    y = le.fit_transform(df_train.place_id.values)
    X = df_train.drop(['place_id'], axis=1).values
    X_predict = df_test.values

    score = 0
    n_estimators = 0
    if xgb == 1:
        if xgb_calculate_n_estimators == True:
            clf = XGBClassifier(max_depth=max_depth, learning_rate=learning_rate,
                                n_estimators=5000, objective='multi:softprob',
                                subsample=ss, colsample_bytree=cs, gamma=gamma,
                                min_child_weight=min_child_weight,
                                reg_lambda=reg_lambda, reg_alpha=reg_alpha)
            if train_test == 1:
                X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
                clf.fit(X_train, y_train, eval_set=[(X_test, y_test)],
                        eval_metric=calculate_score,
                        early_stopping_rounds=early_stopping_rounds,
                        verbose=10 if one_cell == 1 else False)
                score = round(1 - clf.booster().best_score, 6)
                n_estimators = clf.booster().best_ntree_limit
            else:
                abc += 1  # NOTE: 'abc' is undefined here; this line raises NameError
                xgb_options = clf.get_xgb_params()
                xgb_options['num_class'] = n_places + 1
                train_dmatrix = DMatrix(X, label=y)

                # some of the classes have less than n_folds, cannot use stratified KFold
                #folds = StratifiedKFold(y, n_folds=n_folds, shuffle=True)
                folds = KFold(len(y), n_folds=n_folds, shuffle=True)
                cv_results = cv(xgb_options, train_dmatrix, clf.n_estimators,
                                early_stopping_rounds=early_stopping_rounds,
                                verbose_eval=10 if one_cell == 1 else False,
                                show_stdv=False, folds=folds, feval=calculate_score)

                n_estimators = cv_results.shape[0]
                score = round(1 - cv_results.values[-1][0], 6)
                std = round(cv_results.values[-1][1], 6)
        else:
            n_estimators = n_estimators_fixed

        clf = XGBClassifier(max_depth=max_depth, learning_rate=learning_rate,
                            n_estimators=n_estimators, objective='multi:softprob',
                            subsample=ss, colsample_bytree=cs, gamma=gamma,
                            min_child_weight=min_child_weight,
                            reg_lambda=reg_lambda, reg_alpha=reg_alpha)
    else:
        clf = RandomForestClassifier(n_estimators=300, n_jobs=-1)
        if rf_calculate_score == True:
            if train_test == 1:
                X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
                y_train2 = le.transform(y_train)
                y_test2 = le.transform(y_test)
                clf.fit(X_train, y_train2)
                y_predict = clf.predict_proba(X_test)
                scores_local = []
                for i in range(X_test.shape[0]):
                    score = calculate_score_per_row(y_predict[i], y_test2[i])
                    scores_local.append(score)
                score = np.array(scores_local).mean()
            else:
                # some of the classes have less than n_folds, cannot use stratified KFold
                #folds = StratifiedKFold(y, n_folds=n_folds, shuffle=True)
                folds = KFold(len(y), n_folds=n_folds, shuffle=True)
                scores_cv = []
                for train, test in folds:
                    X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]
                    y_train2 = le.transform(y_train)
                    y_test2 = le.transform(y_test)
                    clf.fit(X_train, y_train2)
                    y_predict = clf.predict_proba(X_test)
                    scores_local = []
                    for i in range(X_test.shape[0]):
                        score = calculate_score_per_row(y_predict[i], y_test2[i])
                        scores_local.append(score)
                    score = np.array(scores_local).mean()
                    print(' ', x_start, y_start, score)
                    scores_cv.append(score)
                score = np.array(scores_cv).mean()

    #if few_cells == 1 or grid_search == 1:
    #    return [score, None, None]

    clf.fit(X, y)
    y_predict = clf.predict_proba(X_predict)  ##1
    labels_predict = le.inverse_transform(np.argsort(y_predict, axis=1)[:, ::-1][:, :n_topx])
    print(x_start, y_start, score, n_estimators, n_places_local, n_places_th_local, percentage)
    return [score, row_ids, labels_predict]
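# do_cell consumes a (df_train, df_test, x_start, y_start) task tuple and
# returns [score, row_ids, labels_predict], so it maps naturally onto a
# process pool. A hedged driver sketch; the task list construction is
# illustrative, and do_cell's reliance on module-level globals assumes a
# fork-based multiprocessing start method.
from multiprocessing import Pool

tasks = [(df_train_cell, df_test_cell, 0.0, 0.0)]  # illustrative single-cell task
with Pool(processes=4) as pool:
    results = pool.map(do_cell, tasks)
for score, row_ids, labels_predict in results:
    print(score)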
nullEmbarkeds = combine[combine.Embarked.isnull()].index.values
combine['Embarked'].iloc[nullEmbarkeds] = 'C'

# Build the classification model
#### offline model ###
## Use cross-validation to estimate the offline training accuracy
'''
x_train=trainData['Fare','Age','Family','Embarked',
                  'Sex','Pclass','AgeClass','SibSp','PSM',
                  'Parch','FamilyBins']
'''
'''
x_train=np.concat(trainData['Fare'],trainData['Age'],trainData['Family'],
                  trainData['Embarked'],trainData['Sex'],trainData['Pclass'],
                  trainData[''])
'''
y_train = trainData['Survived']
x_train = trainData['Fare']
model = XGBClassifier(max_depth=6, n_estimators=1000, learning_rate=0.01)
scores = cross_val_score(model, x_train, y_train, cv=3)
print('accuracy:{0:.5f}'.format(np.mean(scores)))

# Use xgboost's get_fscore to get the feature importances and sort them
model.fit(x_train, y_train)
importance = model.booster().get_fscore()
sort_importance = sorted(importance.items(), key=operator.itemgetter(1), reverse=True)
df = pd.DataFrame(sort_importance, columns=['feature', 'fscore'])
print(df)
                      reg_lambda=2,
                      subsample=1.0,
                      colsample_bytree=1.0,
                      max_delta_step=1,
                      scale_pos_weight=1,
                      objective='multi:softprob',
                      nthread=8,
                      seed=0
                      # ,
                      # silent=False
                      )

print('training...')
xgb_model.fit(training, label)

print('predicting...')
predicted = xgb_model.predict_proba(testing)
predicted = pandas.DataFrame(predicted)
predicted.columns = xgb_model.classes_

# Name index column.
predicted.index.name = 'Id'

# Write csv.
print('Saving prediction...')
predicted.to_csv('Prediction.csv')

# feature importance
feat_imp = pandas.Series(xgb_model.booster().get_fscore()).sort_values(ascending=False)
feat_imp.plot(kind='bar', title='Feature Importances')
matplotlib.pyplot.show()

plot_importance(xgb_model, title='Feature importance')
matplotlib.pyplot.show()

plot_tree(xgb_model, num_trees=0)
matplotlib.pyplot.show()
                      subsample=0.7930,
                      colsample_bytree=0.4679)
# 0.43251 for 7 models stacking
# 'colsample_bytree': (0.4679)
# 'gamma': (4.2599),
# 'learning_rate': (0.0685),
# 'max_depth': (3),
# 'min_child_weight': (21.8363),
# 'n_estimators': (449),
# 'subsample': (0.7930),

bclf.fit(cat_blend, cy)
cprob = bclf.predict_proba(cat_btest)

bimportances = bclf.booster().get_fscore()
bsorted_imp = sorted(bimportances.items(), key=operator.itemgetter(1))
bsorted_imp.reverse()

cclf = XGBClassifier(max_depth=1, learning_rate=0.0607, n_estimators=303,
                     objective='multi:softprob', nthread=8, gamma=3.4764,
                     min_child_weight=10.8559, subsample=0.5598,
                     colsample_bytree=0.6374)
# 7 models: 0.87344
# 'colsample_bytree': (0.6374)
for i in range(10):
    folds = StratifiedKFold(y_train, n_folds=5, shuffle=True)
    scores = []
    iterations = []
    for train_index, test_index in folds:
        X_train2, X_test2 = X_train.loc[train_index], X_train.loc[test_index]
        y_train2, y_test2 = y_train[train_index], y_train[test_index]
        X_train2, X_test2 = feature_engineering_extra(X_train2, X_test2, y_train2)
        X_train2 = csr_matrix(X_train2.values)
        X_test2 = csr_matrix(X_test2.values)
        clf.fit(X_train2, y_train2, eval_set=[(X_test2, y_test2)],
                eval_metric='mlogloss',
                early_stopping_rounds=early_stopping_rounds, verbose=False)
        #print(round(clf.booster().best_score, 6), int(clf.booster().best_ntree_limit))
        scores.append(round(clf.booster().best_score, 6))
        iterations.append(int(clf.booster().best_ntree_limit))
    scores = np.array(scores)
    iterations = np.array(iterations)
    score = scores.mean()
    scores2.append(score)
    print('score, std, iterations', score, scores.std(), iterations.mean())

scores = np.array(scores2)
scores = np.delete(scores, [scores.argmax(), scores.argmin()])
print('score, std', scores.mean(), scores.std())

if is_tt_rf == 1:
    X_train, X_test = feature_engineering(df_train, df_test, y_train)
train = np.loadtxt("train_stage2.csv")
test = np.loadtxt("pred_stage2.csv")
target = pd.read_csv('target.csv', index_col=0)
submission = pd.read_csv('SubmissionFormat.csv')

est = XGBClassifier(max_depth=7, learning_rate=0.02358, n_estimators=189,
                    gamma=0.07479, min_child_weight=3.0666, subsample=0.4970,
                    colsample_bytree=0.9517, reg_alpha=0.2065,
                    objective='multi:softmax')
est.fit(train, target['status_group'])

path = 'save/est.pickle'
with open(path, 'wb') as f:
    pickle.dump(est, f)

pred = est.predict(test)
importances = est.booster().get_fscore()
sorted_imp = sorted(importances.items(), key=operator.itemgetter(1))

output = np.chararray(len(pred), itemsize=30)
output[pred == 0] = 'functional'
output[pred == 1] = 'functional needs repair'
output[pred == 2] = 'non functional'
submission['status_group'] = output
submission.to_csv('output.csv', index=False)
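# For reuse, the pickled estimator can be loaded back the same way; a minimal
# sketch using the path saved above.
import pickle

with open('save/est.pickle', 'rb') as f:
    est_loaded = pickle.load(f)
pred_again = est_loaded.predict(test)  # predicts identically to the original fit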
                      reg_alpha=0.05,
                      reg_lambda=2,
                      subsample=1.0,
                      colsample_bytree=1.0,
                      max_delta_step=1,
                      scale_pos_weight=1,
                      objective='multi:softprob',
                      nthread=8,
                      seed=0
                      # ,
                      # silent=False
                      )

print('training...')
xgb_model.fit(training, label)

print('predicting...')
predicted = xgb_model.predict_proba(testing)
predicted = pandas.DataFrame(predicted)
predicted.columns = xgb_model.classes_

# Name index column.
predicted.index.name = 'Id'

# Write csv.
print('Saving prediction...')
predicted.to_csv('Prediction.csv')

# feature importance
feat_imp = pandas.Series(xgb_model.booster().get_fscore()).sort_values(ascending=False)
feat_imp.plot(kind='bar', title='Feature Importances')
matplotlib.pyplot.show()

plot_importance(xgb_model, title='Feature importance')
matplotlib.pyplot.show()

plot_tree(xgb_model, num_trees=0)
matplotlib.pyplot.show()
X_val = Xfold1
y_val = fold1.loc[:, 'Category']

# Now comes the time-consuming step of training xgb.

# In[3]:

xgb = XGBClassifier(**HYPER_PARAMS)
xgb.fit(X_train, y_train, eval_set=[(X_val, y_val)],
        eval_metric=SCORING, verbose=10)

# Now, we can gaze at the important features.

# In[4]:

gbdt = xgb.booster()
importance = gbdt.get_fscore()
importance = sorted(importance.items(), key=operator.itemgetter(1), reverse=True)
df = pd.DataFrame(importance, columns=['feature', 'fscore'])
print(df)

# This provides us with a good idea as to which features are particularly relevant:
#
# - clearly, the timing in terms of minute, hour and year is critical
# - the collocated-crime feature scores surprisingly high
# - the spatial coordinates are useful
# - the total number of crimes in a street is an important indicator, as are some of the log-ratios
# - the month is not particularly essential, presumably because seasonal information can be recovered from the week
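# The snippets in this section call clf.booster(); that accessor was renamed
# get_booster() in later xgboost releases. A small compatibility shim (the
# exact version boundary is not pinned down here):
def get_booster_compat(model):
    # Return the underlying Booster across old and new xgboost versions.
    if hasattr(model, 'get_booster'):
        return model.get_booster()  # newer releases
    return model.booster()          # older releases, as used above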
# Vectorize
transformer = TfidfVectorizer()
sparse_featureset = transformer.fit_transform(train_set)
df_features = pd.DataFrame(sparse_featureset.todense(),
                           columns=transformer.get_feature_names())

# Add another feature
contains_7 = pd.Series([int("7" in s) for s in train_set])
df_features["Contains7"] = contains_7

# SKLearn API
cls = XGBClassifier(silent=True)
cls.fit(X=df_features, y=train_targets)
print(cls.booster().get_fscore())

df_features = df_features.drop(df_features.columns[1], axis=1)
train_data = xgb.DMatrix(df_features.values, label=train_targets)

# Generic parameters
param = {
    'max_depth': 5,
    'objective': 'reg:linear',
    #'objective': 'multi:softprob', 'num_class': 2,
    'eta': .3,
    'silent': 0,
    'colsample_bytree': .2,
    'nround': 100
}
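# The snippet builds train_data and param but stops before training. A minimal
# completion sketch for the native API; 'nround' is not a Booster parameter,
# so this assumes the author intended it as num_boost_round.
num_round = param.pop('nround')
bst = xgb.train(param, train_data, num_boost_round=num_round)
print(bst.get_fscore())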