def runXGBoost(x_train, y_train, x_test, y_test, p):
    # Instantiate the extreme gradient boosting classifier
    clf = XGBClassifier()
    clf.set_params(**p)
    clf.fit(x_train, y_train)
    # Make the predictions using our classifier
    xgb_predictions = clf.predict(x_test)
    # Compute the classification accuracy by comparing the
    # predictions against the test labels
    xgb_score = accuracy_score(y_test, xgb_predictions)
    print("XGB classification accuracy on test data is " + str(xgb_score), file=sys.stderr)
    train_score = accuracy_score(y_train, clf.predict(x_train))
    print("accuracy score on training data: " + str(train_score), file=sys.stderr)
    return (train_score, xgb_score)
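# A minimal usage sketch for runXGBoost above, on synthetic data; the
# imports, the parameter dict `p`, and the 80/20 split are illustrative
# assumptions, not part of the original snippet.
import sys
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

X, y = make_classification(n_samples=500, n_features=20, random_state=0)
x_tr, x_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=0)
p = {"max_depth": 3, "n_estimators": 100, "learning_rate": 0.1}
train_score, test_score = runXGBoost(x_tr, y_tr, x_te, y_te, p)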
def train_model(train_data, train_label, test_data, test_label):
    model = XGBClassifier(learning_rate=0.1,
                          n_estimators=160,
                          max_depth=6,
                          min_child_weight=3,
                          gamma=0,
                          subsample=0.8,
                          colsample_bytree=0.8,
                          objective='multi:softmax',
                          num_class=2,
                          nthread=4,
                          scale_pos_weight=1,
                          seed=0)
    # dtrain = xgb.DMatrix(data=train_data, label=train_label)
    # Train the model
    print('XGBoost start training')
    start_time = time.time()
    model.fit(train_data, train_label)
    print('XGBoost finish training')
    print('training time: %d' % (time.time() - start_time))
    # Persist the model
    joblib.dump(model, 'model/XGBoost_model_80w.pkl')
    print('model write finish')
    # Evaluate on train and test data
    print('train data result')
    train_result = model.predict(train_data)
    print(metrics.classification_report(train_label, train_result))
    print('test data result')
    test_result = model.predict(test_data)
    print(metrics.classification_report(test_label, test_result))
def model_na_train(data_offline_filter, data_online, col, predictors):
    data_offline_filter_col_nona = data_offline_filter.loc[pd.notnull(data_offline_filter.loc[:, col]), :]
    data_offline_filter_col_na = data_offline_filter.loc[pd.isnull(data_offline_filter.loc[:, col]), :]
    data_online_col_na = data_online.loc[pd.isnull(data_online.loc[:, col]), :]
    data_online_col_nona = data_online.loc[pd.notnull(data_online.loc[:, col]), :]
    k = pd.qcut(data_offline_filter_col_nona[col].tolist() + [data_offline_filter_col_nona[col].min() - 1],
                10, retbins=True, labels=False, duplicates='drop')
    cutoffs = k[1]
    data_offline_filter_col_nona[col + '_dis'] = np.digitize(data_offline_filter_col_nona[col], cutoffs, right=True)
    data_online_col_nona[col + '_dis'] = np.digitize(data_online_col_nona[col], cutoffs, right=True)
    dep = col + '_dis'
    train_col = data_offline_filter_col_nona.loc[data_offline_filter_col_nona.loc[:, 'date'].isin(date_sorted[:40]), :]
    valid_col = data_offline_filter_col_nona.loc[data_offline_filter_col_nona.loc[:, 'date'].isin(date_sorted[40:46]), :]
    xgb1 = XGBClassifier(learning_rate=0.05,
                         n_estimators=3000,
                         max_depth=6,
                         min_child_weight=1,
                         gamma=0.1,
                         subsample=0.8,
                         colsample_bytree=0.8,
                         # the binned target has up to 10 classes, matching the
                         # 'mlogloss' eval metric below; the original snippet's
                         # 'binary:logistic' objective would reject such labels
                         objective='multi:softmax',
                         n_jobs=16,
                         scale_pos_weight=1,
                         seed=1,
                         reg_alpha=0.5,
                         reg_lambda=10,
                         silent=False)
    xgb1.fit(train_col[predictors], train_col[dep],
             eval_set=[(train_col[predictors], train_col[dep]),
                       (valid_col[predictors], valid_col[dep])],
             eval_metric='mlogloss', early_stopping_rounds=10)
    save_obj(xgb1, col + '_xgb')      # save the model
    save_obj(cutoffs, col + '_cut')   # save the cut points
    data_offline_filter_col_na_pred = xgb1.predict(data_offline_filter_col_na[predictors])
    data_online_col_na_pred = xgb1.predict(data_online_col_na[predictors])
    data_offline_filter_col_na[col + '_dis'] = data_offline_filter_col_na_pred
    data_online_col_na[col + '_dis'] = data_online_col_na_pred
    return (pd.concat([data_offline_filter_col_nona.loc[:, ['id', col + '_dis']],
                       data_offline_filter_col_na.loc[:, ['id', col + '_dis']]]),
            pd.concat([data_online_col_nona.loc[:, ['id', col + '_dis']],
                       data_online_col_na.loc[:, ['id', col + '_dis']]]))
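# model_na_train above assumes a save_obj helper and a global date_sorted
# list that are not shown; a plausible pickle-based sketch of save_obj
# (the obj/ directory and file naming are assumptions):
import os
import pickle

def save_obj(obj, name):
    # Persist obj under obj/<name>.pkl
    os.makedirs('obj', exist_ok=True)
    with open(os.path.join('obj', name + '.pkl'), 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)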
def xgboost_submission(data):
    # Unpack the training and test matrices from the data dict
    train_X = data['training_data'].toarray()
    # the original reshaped against an undefined `X`; flatten the labels instead
    train_y = data['training_labels'].ravel()
    test_X = data['test_data'].toarray()
    X_train, X_val, y_train, y_val = train_test_split(train_X, train_y, test_size=0.20, random_state=42)
    clf = XGBClassifier(max_depth=5, min_child_weight=1, gamma=0,
                        subsample=0.8, colsample_bytree=0.8).fit(X_train, y_train)
    predicted_test = clf.predict(test_X)
    predicted_val = clf.predict(X_val)
    print('Accuracy:', accuracy_score(y_val, predicted_val))
    print('F1 score:', f1_score(y_val, predicted_val))
    print('Recall:', recall_score(y_val, predicted_val))
    print('Precision:', precision_score(y_val, predicted_val))
    print('\n classification report:\n', classification_report(y_val, predicted_val))
    print('\n confusion matrix:\n', confusion_matrix(y_val, predicted_val))
    return predicted_test
def train():
    """
    Train and save the model:
        feature: tf-idf
        Classifier: XGBClassifier
        Model path: './model'
    :return:
    """
    print('read data...')
    data = pd.read_csv('./data/intend_data_1.csv')
    # shuffle the data; replace=False so rows are not duplicated across the split
    data = data.sample(frac=1.0, replace=False, random_state=42)
    print('clean data...')
    data['sentence'] = data['sentence'].apply(clean)
    label_subject = dict(zip(range(0, len(set(data['label']))), sorted(list(set(data['label'])))))
    subject_label = dict(zip(sorted(list(set(data['label']))), range(0, len(set(data['label'])))))
    data['label'] = data['label'].map(subject_label)
    X_train, X_test, y_train, y_test = train_test_split(list(data['sentence']), list(data['label']),
                                                        test_size=0.1, random_state=42)
    print('extract tfidf feature')
    vec = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b", ngram_range=(1, 2))
    tfidf_model = vec.fit(data['sentence'].tolist())
    trn_term_doc = tfidf_model.transform(X_train)
    test_term_doc = tfidf_model.transform(X_test)
    print('train topic model')
    classifier_model = XGBClassifier(learning_rate=0.30, n_estimators=300, max_depth=5,
                                     objective='multi:softmax', seed=42)
    classifier_model.fit(trn_term_doc, y_train, eval_metric='mlogloss')
    train_preds = classifier_model.predict(trn_term_doc)
    print('result in train:')
    print(metrics.classification_report(y_train, train_preds))
    test_preds = classifier_model.predict(test_term_doc)
    print('result in test:')
    print(metrics.classification_report(y_test, test_preds))
    print('train semantic model end')
    with open('./model/model.pk', 'wb') as file:
        save = {
            'label_subject': label_subject,
            'tfidfVectorizer': tfidf_model,
            'classifier_model': classifier_model
        }
        pickle.dump(save, file)
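# A minimal sketch of consuming the pickle written by train(); it assumes
# the same clean() helper used above is in scope, and the query string is
# illustrative.
import pickle

with open('./model/model.pk', 'rb') as f:
    saved = pickle.load(f)
query = clean('example user query')
pred = saved['classifier_model'].predict(saved['tfidfVectorizer'].transform([query]))[0]
print(saved['label_subject'][pred])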
def ranking_borda_xgboost(self):
    a = 0
    rankings = np.zeros(len(self.X.columns),)
    std = np.zeros(len(self.X.columns),)
    for x in range(self.loops):
        seed = randint(0, 10000)
        # Split the train/validation set with a seed generated randomly on each loop.
        X_train, X_fr, y_train, y_fr = train_test_split(self.X, self.y, test_size=0.30, random_state=seed)
        # Initialize an XGBoost classifier (the `rf` name is kept from the original).
        rf = XGBClassifier()
        # Fit the classifier and compute the baseline Matthews correlation.
        rf.fit(X_train, y_train)
        mattheworiginal = matthews_corrcoef(y_fr, rf.predict(X_fr))
        # Initialize two lists to collect values from the inner loop.
        matthewscores = []
        columnsrf = []
        for col in self.X.columns:
            X_train, X_fr, y_train, y_fr = train_test_split(self.X, self.y, test_size=0.30, random_state=seed)
            # Drop a different column on each iteration.
            X_train = X_train.drop([col], axis=1)
            X_fr = X_fr.drop([col], axis=1)
            # Refit, this time with the training set lacking one feature.
            rf.fit(X_train, y_train)
            matthew = matthews_corrcoef(y_fr, rf.predict(X_fr))
            # Record the dropped column...
            columnsrf.append(col)
            # ...and the drop (or gain) in Matthews correlation when the feature was missing.
            matthewscores.append(mattheworiginal - matthew)
        a += 1
        outcome = np.array(list(zip(columnsrf, matthewscores)))
        outcomepd = pd.DataFrame(data=outcome, columns=['Variables', 'r2-punish'])
        # zip() above turned the scores into strings; convert back before ranking,
        # otherwise .rank() orders them lexicographically.
        outcomepd['r2-punish'] = pd.to_numeric(outcomepd['r2-punish'])
        outcomepd['ranking'] = outcomepd['r2-punish'].rank(ascending=False)
        rankings = np.add(outcomepd['ranking'].to_numpy(), rankings)
        # Stack each loop's rankings vertically to build a 2D numpy array.
        std = np.vstack((outcomepd['ranking'].to_numpy(), std))
    std = np.delete(std, -1, axis=0)
    std = np.std(std, axis=0)
    std = np.dstack((columnsrf, std))
    featuresranks = np.dstack((columnsrf, rankings))
    std = pd.DataFrame(data=np.squeeze(std, axis=0), columns=['Categories', 'STD'])
    borda = pd.DataFrame(data=np.squeeze(featuresranks, axis=0), columns=['Categories', 'Borda-Score'])
    borda = borda.merge(std, on='Categories')
    borda['Borda-Score'] = pd.to_numeric(borda['Borda-Score'])
    borda['Borda-Average'] = borda['Borda-Score'] / self.loops
    borda['ranking'] = borda['Borda-Score'].rank(ascending=True)
    borda.sort_values(by='Borda-Score', inplace=True)
    return borda
def runXGBoost(x_train, y_train, x_test, y_test):
    parameter_grid = {
        'reg_lambda': np.linspace(27, 28, 2),          # for overfitting control
        'reg_alpha': np.linspace(27, 28, 2),
        'learning_rate': np.linspace(.0001, .001, 3),  # usually between .05 and .3
        'max_depth': [2],
        # was 'num_boosting_rounds', which XGBClassifier does not accept
        'n_estimators': [1000],
        'nthread': [10]
    }
    # Instantiate the extreme gradient boosting classifier
    clf = XGBClassifier()
    grid_search = GridSearchCV(clf, n_jobs=40, return_train_score=True,
                               param_grid=parameter_grid, cv=StratifiedKFold(n_splits=10))
    grid_search.fit(x_train, y_train)
    print('Best score: {}'.format(grid_search.best_score_), file=sys.stderr)
    print('Best parameters: {}'.format(grid_search.best_params_), file=sys.stderr)
    # Keep the model refit on the best parameters and the full training data
    clf = grid_search.best_estimator_
    importances = clf.feature_importances_
    print(importances, file=sys.stderr)
    # Print the feature ranking
    print("Feature ranking:", file=sys.stderr)
    importanceDict = {'names': [], 'imp': []}
    for name, importance in zip(x_train.columns, clf.feature_importances_):
        importanceDict['names'] += [name]
        importanceDict['imp'] += [importance]
    fRank = pd.DataFrame.from_dict(importanceDict)
    fRank = fRank.sort_values(by='imp', ascending=False)
    i = 0
    for index, row in fRank.iterrows():
        print("%d. %s %f" % (i, row['names'], row['imp']), file=sys.stderr)
        i += 1
    cv_results = pd.DataFrame(grid_search.cv_results_)[['rank_test_score', 'params',
                                                        'mean_test_score', 'mean_train_score']]
    sorted_results = cv_results.sort_values(by='rank_test_score').head(5)
    print("\nTop 5 best parameters: ", file=sys.stderr)
    for index, row in sorted_results.iterrows():
        print("%d. %s train: %s test: %s" % (row['rank_test_score'], str(row['params']),
                                             str(row['mean_train_score']), str(row['mean_test_score'])),
              file=sys.stderr)
    # Compute the classification accuracy by comparing the
    # predictions against the test labels
    etc_predictions = clf.predict(x_test)
    dt_score = accuracy_score(y_test, etc_predictions)
    print("accuracy score on test data: " + str(dt_score), file=sys.stderr)
    train_score = accuracy_score(y_train, clf.predict(x_train))
    print("accuracy score on training data: " + str(train_score), file=sys.stderr)
    return (train_score, dt_score)
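# Usage sketch for the grid-search variant above; x_train must be a
# DataFrame because the feature-ranking loop reads x_train.columns. The
# synthetic data and split are assumptions, and the remaining names the
# snippet uses (GridSearchCV, StratifiedKFold, np, sys, accuracy_score)
# are assumed imported as in the sketch after the first snippet.
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=400, n_features=10, random_state=0)
X = pd.DataFrame(X, columns=["f%d" % i for i in range(10)])
x_tr, x_te, y_tr, y_te = train_test_split(X, y, test_size=0.25, random_state=0)
train_score, test_score = runXGBoost(x_tr, y_tr, x_te, y_te)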
def ranking_by_matthew_punishment_xgb(self):
    std = np.zeros(len(self.X.columns),)
    rankings = np.zeros(len(self.X.columns),)
    for x in range(self.loops):
        seed = randint(0, 10000)
        # Split the train/validation set with a seed generated randomly on each loop.
        X_train, X_fr, y_train, y_fr = train_test_split(self.X, self.y, test_size=0.30, random_state=seed)
        # Initialize an XGBoost classifier (the `rf` name is kept from the original).
        rf = XGBClassifier()
        # Fit the classifier and compute the baseline Matthews correlation.
        rf.fit(X_train, y_train)
        r2original = matthews_corrcoef(y_fr, rf.predict(X_fr))
        # Initialize two lists to collect values from the inner loop.
        r2fr = []
        columnsrf = []
        for col in self.X.columns:
            X_train, X_fr, y_train, y_fr = train_test_split(self.X, self.y, test_size=0.30, random_state=seed)
            # Drop a different column on each iteration.
            X_train = X_train.drop([col], axis=1)
            X_fr = X_fr.drop([col], axis=1)
            # Refit, this time with the training set lacking one feature.
            rf.fit(X_train, y_train)
            r2 = matthews_corrcoef(y_fr, rf.predict(X_fr))
            # Record the dropped column...
            columnsrf.append(col)
            # ...and the drop (or gain) in Matthews correlation when the feature was missing.
            r2fr.append(r2original - r2)
        outcome = np.array(r2fr)
        rankings = np.add(outcome, rankings)
        std = np.vstack((outcome, std))
    rankings = np.true_divide(rankings, self.loops)
    std = np.delete(std, -1, axis=0)
    std = np.std(std, axis=0)
    std = np.dstack((columnsrf, std))
    std = pd.DataFrame(data=np.squeeze(std, axis=0), columns=['Categories', 'SD_of_mtt_punishment'])
    featuresranks = np.dstack((columnsrf, rankings))
    borda = pd.DataFrame(data=np.squeeze(featuresranks, axis=0), columns=['Categories', 'average-mtt-punishment'])
    # dstack() above turned the averages into strings; convert back before ranking.
    borda['average-mtt-punishment'] = pd.to_numeric(borda['average-mtt-punishment'])
    borda['ranking'] = borda['average-mtt-punishment'].rank(ascending=False)
    borda = borda.merge(std, on='Categories')
    borda.sort_values(by='average-mtt-punishment', inplace=True, ascending=False)
    return borda
def fit_model(self, X_train, y_train, X_test, y_test):
    clf = XGBClassifier(learning_rate=self.learning_rate,
                        n_estimators=self.n_estimators,
                        max_depth=self.max_depth,
                        min_child_weight=self.min_child_weight,
                        gamma=self.gamma,
                        subsample=self.subsample,
                        colsample_bytree=self.colsample_bytree,
                        objective=self.objective,
                        nthread=self.nthread,
                        scale_pos_weight=self.scale_pos_weight,
                        reg_alpha=self.reg_alpha,
                        reg_lambda=self.reg_lambda,
                        seed=self.seed)
    clf.fit(X_train, y_train)
    y_pre = clf.predict(X_test)
    y_pro = clf.predict_proba(X_test)[:, 1]
    print("pred_leaf=T AUC Score : %f" % metrics.roc_auc_score(y_test, y_pro))
    print("pred_leaf=T Accuracy : %.4g" % metrics.accuracy_score(y_test, y_pre))
    new_feature = clf.apply(X_train)
    X_train_new = self.mergeToOne(X_train, new_feature)
    new_feature_test = clf.apply(X_test)
    X_test_new = self.mergeToOne(X_test, new_feature_test)
    print("Training set sample number remains the same")
    return X_train_new, y_train, X_test_new, y_test
def fit_model_split(self, X_train, y_train, X_test, y_test):
    # X_train_1 is used to fit the model; X_train_2 is combined with the
    # new leaf features to form the new training set.
    X_train_1, X_train_2, y_train_1, y_train_2 = train_test_split(
        X_train, y_train, test_size=0.6, random_state=0)
    clf = XGBClassifier(learning_rate=self.learning_rate,
                        n_estimators=self.n_estimators,
                        max_depth=self.max_depth,
                        min_child_weight=self.min_child_weight,
                        gamma=self.gamma,
                        subsample=self.subsample,
                        colsample_bytree=self.colsample_bytree,
                        objective=self.objective,
                        nthread=self.nthread,
                        scale_pos_weight=self.scale_pos_weight,
                        reg_alpha=self.reg_alpha,
                        reg_lambda=self.reg_lambda,
                        seed=self.seed)
    clf.fit(X_train_1, y_train_1)
    y_pre = clf.predict(X_train_2)
    y_pro = clf.predict_proba(X_train_2)[:, 1]
    print("pred_leaf=T AUC Score : %f" % metrics.roc_auc_score(y_train_2, y_pro))
    print("pred_leaf=T Accuracy : %.4g" % metrics.accuracy_score(y_train_2, y_pre))
    new_feature = clf.apply(X_train_2)
    X_train_new2 = self.mergeToOne(X_train_2, new_feature)
    new_feature_test = clf.apply(X_test)
    X_test_new = self.mergeToOne(X_test, new_feature_test)
    print("Training set of sample size 0.4 fewer than before")
    return X_train_new2, y_train_2, X_test_new, y_test
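# Both fit methods above rely on a self.mergeToOne helper that is not shown;
# a plausible sketch of that method, assuming it simply appends the per-tree
# leaf indices returned by clf.apply() as extra feature columns:
import numpy as np

def mergeToOne(self, X, new_feature):
    # Concatenate the original features with the leaf-index features
    return np.hstack((np.asarray(X), np.asarray(new_feature)))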
def XGBoost(returns, factRet):
    [timeN, factorN] = factRet.shape
    [timeN, assetN] = returns.shape
    # Prepare training and predicting data
    colName = list(factRet.columns)
    f_bar = factRet.tail(2).mean()
    f_bar = pd.DataFrame(f_bar).T
    f_bar.columns = colName
    factRet = factRet.head(len(factRet) - 1)
    # The targets here are continuous returns, so a regressor (rather than
    # the classifier the original instantiated with a 'reg:linear' objective)
    # is the appropriate estimator.
    xgb = XGBRegressor(learning_rate=0.1,
                       n_estimators=10,
                       max_depth=7,
                       min_child_weight=2,
                       gamma=0.2,
                       subsample=0.8,
                       colsample_bytree=0.6,
                       objective='reg:squarederror',
                       scale_pos_weight=1,
                       seed=10)
    mu = []
    for i in range(assetN):
        xgb.fit(factRet, returns.iloc[:, i])
        mu.append(float(xgb.predict(f_bar)))
    mu = np.array(mu)
    Q = np.array(returns.cov())
    return mu, Q
def xgbt_base_rmse_mode(train_input, train_target, test_input, test_target):
    param = {
        'n_estimators': 10,
        'learning_rate': 0.01,
    }
    adj_params = {
        'n_estimators': [10],
        'learning_rate': [0.01],
        # 'n_estimators': [10, 50, 100, 200, 300, 400, 500, 1000],
        # 'learning_rate': [0.01, 0.1, 1]
    }
    xgbt = XGBClassifier(**param)
    cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
    cscv = GridSearchCV(xgbt, adj_params, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
    cscv.fit(train_input, train_target)
    print("cv_results_:", cscv.cv_results_)
    print("best_params_: ", cscv.best_params_)
    xgbt = XGBClassifier(**cscv.best_params_)
    xgbt.fit(train_input, train_target.ravel())
    predicted = xgbt.predict(test_input)
    xgbt_base_rmse = np.sqrt(metrics.mean_squared_error(test_target, predicted))
    print("xgbt_base_rmse: ", xgbt_base_rmse)
    # print("RMSE:", np.sqrt(metrics.mean_squared_error(test_target, predicted)))
    return xgbt_base_rmse
def modelfit(useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    alg = XGBClassifier(**params)
    df = data.sample(frac=0.3)
    pX = df.drop('LABEL', axis=1)
    py = df['LABEL']
    if useTrainCV:
        print("start use cv")
        xgb_param = alg.get_xgb_params()
        cvresult = xgb.cv(xgb_param, xgtrain,
                          num_boost_round=xgb_param['n_estimators'],
                          nfold=cv_folds, metrics='auc',
                          early_stopping_rounds=early_stopping_rounds)
        print(cvresult.shape[0])
        alg.set_params(n_estimators=cvresult.shape[0])
        params['n_estimators'] = cvresult.shape[0]
        print("best tree size is {}".format(cvresult.shape[0]))
    # Fit the algorithm on the data
    alg.fit(X, y, eval_metric='auc')
    y_pred = alg.predict(pX)
    accuracy = metrics.accuracy_score(py, y_pred)
    print("Accuracy: %.2f%%" % (accuracy * 100.0))
    print('auc:', metrics.roc_auc_score(py, y_pred))
    train_report = metrics.classification_report(py, y_pred)
    print(train_report)
    feat_imp = pd.Series(alg.get_booster().get_fscore()).sort_values(ascending=False)
    print(feat_imp)
    return alg
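# modelfit above reads several module-level names (params, data, X, y,
# xgtrain); a minimal assumed setup, where the CSV path and hyperparameters
# are illustrative:
import pandas as pd
import xgboost as xgb

data = pd.read_csv('train.csv')      # assumed input with a LABEL column
X = data.drop('LABEL', axis=1)
y = data['LABEL']
xgtrain = xgb.DMatrix(X, label=y)    # DMatrix consumed by xgb.cv
params = {'n_estimators': 100, 'max_depth': 5, 'learning_rate': 0.1}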
def FraudXGB(trainX, trainY, testX, testY):
    # Long computation in this cell (~1.8 minutes)
    clf_xgb = XGBClassifier(max_depth=7, learning_rate=0.05, n_estimators=400,
                            objective="binary:hinge", booster='gbtree',
                            n_jobs=-1, nthread=None, gamma=0, min_child_weight=1,
                            max_delta_step=0, subsample=1, colsample_bytree=1,
                            colsample_bylevel=1, reg_alpha=0, reg_lambda=1,
                            scale_pos_weight=1, base_score=0.5, random_state=42)
    # Note: with the "binary:hinge" objective the model emits hard 0/1
    # outputs, so these "probabilities" are degenerate.
    pred_prob = clf_xgb.fit(trainX, trainY).predict_proba(testX)
    predY_xgb = clf_xgb.predict(testX)
    modelName = 'XGBoostClassifier'
    model_perf = createOutParam(modelName, testX, testY, predY_xgb, pred_prob)
    XGBoostClassifier_pkl_filename = obj.model_path + '/XGBoostClassifier_20200202.pkl'
    # Save the model as a pickle file; the context manager closes it
    with open(XGBoostClassifier_pkl_filename, 'wb') as XGBoostClassifier_model_pkl:
        pickle.dump(clf_xgb, XGBoostClassifier_model_pkl)
    return model_perf
def do_simple_xgboost_regression(x_train, y_train, x_test, y_test):
    # Despite the function name, this trains a binary classifier.
    xg_reg = XGBClassifier(silent=False,
                           scale_pos_weight=1,
                           learning_rate=0.01,
                           colsample_bytree=0.4,
                           subsample=0.8,
                           objective='binary:logistic',
                           n_estimators=1000,
                           reg_alpha=0.3,
                           max_depth=4,
                           gamma=10)
    eval_set = [(x_train, y_train), (x_test, y_test)]
    eval_metric = ["auc", "error"]
    xg_reg.fit(x_train, y_train, eval_metric=eval_metric, eval_set=eval_set, verbose=True)
    train_accuracy = compute_accuracy(xg_reg, x_train, y_train)
    test_accuracy = compute_accuracy(xg_reg, x_test, y_test)
    print('train set accuracy: {}'.format(train_accuracy))
    print('test set accuracy: {}'.format(test_accuracy))
    y_score = xg_reg.predict(x_test)
    score = metrics.roc_auc_score(y_test, y_score)
    print('score {}'.format(score))
def runxgBoostClassifier(self, bDetailReport=False):
    print("m_X_train size", len(self.m_X_train))
    boosters = ['gbtree', 'gblinear']
    for depth in range(3, 4):
        for rate in range(2, 3, 1):
            for estimator in range(220, 240, 20):
                # for bster in boosters:
                clf = XGBClassifier(max_depth=depth,
                                    learning_rate=(float(rate) / 10),
                                    n_estimators=estimator,
                                    silent=True,
                                    objective='binary:logistic',
                                    seed=400)
                clf.fit(self.m_X_train, self.m_y_train)
                y = clf.predict(self.m_X_test)
                print("\nxgBoostClassifier depth={} rate={} estimator={}\n".format(
                    depth, (float(rate) / 10), estimator))
                print(classification_report(self.m_y_test, y))
                print(clf.feature_importances_)
                # plot
                pyplot.bar(range(len(clf.feature_importances_)), clf.feature_importances_)
                pyplot.show()
                plot_importance(clf)
                if bDetailReport:
                    self.ClassifierDetailReport(self.m_y_test, y)
def train_model(train, test):
    x_train, y_train = get_fea_lab(train)
    x_test, y_test = get_fea_lab(test)
    xgb = XGBClassifier()
    print(xgb)
    paras = {
        'max_depth': range(1, 3),
        'min_child_weight': [i / 10 for i in range(0, 10)],
        'scale_pos_weight': range(10, 100, 10)
    }
    gscv = GridSearchCV(estimator=xgb, param_grid=paras, cv=5, scoring='roc_auc')
    gscv.fit(x_train, y_train)
    print(gscv.best_params_)
    print(gscv.best_score_)
    print(gscv.score(x_test, y_test))
    result = gscv.predict(x_test)
    print(confusion_matrix(y_test, result))
    print(classification_report(y_test, result))
    # For comparison, also fit an XGBClassifier with default parameters
    xgb.fit(x_train, y_train)
    test_result = xgb.predict(x_test)
    print(confusion_matrix(y_test, test_result))
    print(classification_report(y_test, test_result))
def XGBoost(returns, factRet):
    """
    :param returns: asset return series
    :param factRet: factor return series
    """
    [timeN, factorN] = factRet.shape
    [timeN, assetN] = returns.shape
    f_bar = []
    for i in range(factorN):
        f_bar.append(np.prod(factRet.iloc[:, i] + 1) ** (1 / timeN) - 1)
    colName = list(factRet.columns)
    f_bar = pd.DataFrame(f_bar).T
    f_bar.columns = colName
    # The targets here are continuous returns, so a regressor (rather than
    # the classifier the original instantiated with a 'reg:linear' objective)
    # is the appropriate estimator.
    xgb = XGBRegressor(learning_rate=0.1,
                       n_estimators=10,
                       max_depth=7,
                       min_child_weight=2,
                       gamma=0.2,
                       subsample=0.8,
                       colsample_bytree=0.6,
                       objective='reg:squarederror',
                       scale_pos_weight=1,
                       seed=10)
    mu = []
    for i in range(assetN):
        xgb.fit(factRet, returns.iloc[:, i])
        mu.append(float(xgb.predict(f_bar)))
    mu = np.array(mu)
    Q = np.array(returns.cov())
    return mu, Q
def xgb(X_train, y_train, X_test, y_test, lime_flag=False, max_depth=3, learning_rate=0.1,
        n_estimators=100, silent=True, objective='binary:logistic', booster='gbtree',
        n_jobs=-1, nthread=None, gamma=0, min_child_weight=1, max_delta_step=0, subsample=1,
        colsample_bytree=1, colsample_bylevel=1, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
        base_score=0.5, random_state=42, seed=None, missing=0):
    '''
    Parameters:
        X_train, y_train, X_test, y_test - learning set
        lime_flag - enable or disable LIME
    '''
    start_time = time.time()
    # Create the classifier instance
    xgb = XGBClassifier(max_depth=max_depth, learning_rate=learning_rate,
                        n_estimators=n_estimators, silent=silent, objective=objective,
                        booster=booster, n_jobs=n_jobs, nthread=nthread, gamma=gamma,
                        min_child_weight=min_child_weight, max_delta_step=max_delta_step,
                        subsample=subsample, colsample_bytree=colsample_bytree,
                        colsample_bylevel=colsample_bylevel, reg_alpha=reg_alpha,
                        reg_lambda=reg_lambda, scale_pos_weight=scale_pos_weight,
                        base_score=base_score, random_state=random_state, seed=seed,
                        missing=missing)
    xgb.fit(X_train, y_train)
    # Predict on the test set
    y_pred = xgb.predict(X_test)
    # Understand the model through LIME
    # if lime_flag:
    #     lime_explainer(X_train, y_train, X_test, y_test, df_row=2, model_predictor=xgb, alogorithm_name="XGB")
    time_end = time.time() - start_time
    # Scores
    model_evaluation(X_train, y_train, X_test, y_test, y_pred, xgb, time_end, alg_name='XGB')
    # Return the model object
    return xgb
def XgbTrain(X, y):
    # test_size is the fraction of the data held out for testing
    train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.3, random_state=0)
    test_preds = pd.DataFrame({"label": test_y})
    clf = XGBClassifier(
        learning_rate=0.3,       # default 0.3
        n_estimators=50,         # number of trees
        max_depth=10,            # maximum tree depth
        objective='multi:softmax',
        min_child_weight=3,
        gamma=0.3,               # gamma parameter
        eta=0.1,
        subsample=0.7,           # fraction of the training set sampled per tree
        colsample_bytree=0.6,
        nthread=4,               # number of CPU threads
        scale_pos_weight=1,
        reg_alpha=1e-05,
        reg_lambda=1,
        num_class=10,
        seed=10
    )
    clf.fit(train_x, train_y)
    test_preds['y_pred'] = clf.predict(test_x)
    test_preds['cha'] = test_preds['y_pred'] - test_preds['label']
    test_preds.to_csv('E:/xinyong/xgbmodelfile/result191-501.csv', index=None)
    stdm = metrics.accuracy_score(test_preds['label'], test_preds['y_pred'])
    import matplotlib.pyplot as plt
    # Plot the predictions against the labels
    p = test_preds[['label', 'y_pred']].plot(subplots=True, style=['b-o', 'r-*'])
    plt.show()
    return stdm, clf
def model_train(xtrain, ytrain):
    X_train, X_test, y_train, y_test = train_test_split(xtrain, ytrain, test_size=0.2, random_state=0)
    cls = XGBClassifier()
    start_time = time.time()
    cls.fit(X_train, y_train)
    end_time = time.time()
    print('It took %d seconds to train the model!' % (end_time - start_time))
    print()
    y_pred = cls.predict(X_test)
    print("Model and model parameters:")
    print(str(cls))
    print("Model evaluation:")
    print('Accuracy:', accuracy_score(y_test, y_pred))
    print('F1 score:', f1_score(y_test, y_pred))
    print('Recall:', recall_score(y_test, y_pred))
    print('Precision:', precision_score(y_test, y_pred))
    print('\n classification report:\n', classification_report(y_test, y_pred))
    print('\n confusion matrix:\n', confusion_matrix(y_test, y_pred))
    # Save the model
    model_name = "./model/" + "xgb_model"
    joblib.dump(cls, model_name)
def featureimportance(model, X_train, X_test, y_train, y_test):
    thresholds = np.sort(model.feature_importances_)
    bestthresh = 0
    bestN = 0
    bestaccuracy = 0
    for thresh in thresholds:
        # Select features using the threshold
        selection = SelectFromModel(model, threshold=thresh, prefit=True)
        select_X_train = selection.transform(X_train)
        select_X_test = selection.transform(X_test)
        # Train the model (the params dict must be unpacked, not passed positionally)
        selection_model = XGBClassifier(**model.get_xgb_params())
        selection_model = modelfit(selection_model, select_X_train, select_X_test,
                                   y_train, y_test, featureimportance=True)
        # Evaluate the model
        y_pred = selection_model.predict(select_X_test)
        predictions = [round(value) for value in y_pred]
        accuracy = metrics.accuracy_score(y_test, predictions)
        print(f"Thresh={thresh}, n={select_X_train.shape[1]}, Accuracy: {accuracy*100}%")
        if accuracy > bestaccuracy:
            bestthresh = thresh
            bestN = select_X_train.shape[1]
            bestaccuracy = accuracy
    print(f"Best Run: Thresh={bestthresh}, n={bestN}, Accuracy: {bestaccuracy*100}%")
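# featureimportance above calls a modelfit helper with a different signature
# from the one earlier in this collection; a hypothetical minimal stand-in
# (the original presumably also tunes the estimator and reports metrics):
def modelfit(model, X_train, X_test, y_train, y_test, featureimportance=False):
    # Fit and return the estimator unchanged
    model.fit(X_train, y_train)
    return model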
def pvXBOOST(trainX, testX, trainY, testY):
    train = np.append(trainX, trainY, axis=1)
    # test = np.append(testX, testY, axis=1)
    X = train[:, 0:-1]
    Y = train[:, -1]
    # sklearn interface
    clf = XGBClassifier(
        n_estimators=100,  # number of trees
        learning_rate=0.2,
        max_depth=3,
        min_child_weight=1,
        gamma=0.3,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        nthread=12,
        scale_pos_weight=1,
        reg_lambda=1,
        seed=27)
    model_sklearn = clf.fit(X, Y)
    preds = clf.predict(testX)
    conMar = confusion_matrix(testY, preds)
    # Feature importance
    print(clf.feature_importances_)
    # Plot
    pyplot.bar(range(len(clf.feature_importances_)), clf.feature_importances_)
    pyplot.show()
    FeatureImportance(clf.feature_importances_)
    print(conMar)
    cnm.writelines('\n**XGBoosting-confusion matrix\n')
    cnm.write(np.array2string(conMar))
    return classification_report(testY, preds)
class Classifier(object):
    def __init__(self, conf, task, train=None, test=None):
        self.conf = conf
        self.task = task
        self.train_ = train
        self.test_ = test
        self.features = [
            "hasWith", "hasIn", "simiBucket", "textPos", "hasOf", "hasAnd",
            "startEntity", "distance", "hasFrom", "endEntity", "similarity",
            "hasThan", "hasVerb"
        ]
        self.labels = ["relation"]
        self.num_round = 500
        self.eval_set = list()
        self.early_stopping_rounds = 20
        self.classifier = XGBClassifier(max_depth=4,
                                        learning_rate=0.1,
                                        n_estimators=1000,
                                        gamma=4,
                                        verbosity=1,
                                        objective='multi:softmax',
                                        num_class=6,
                                        booster='gbtree',
                                        n_jobs=4,
                                        seed=27)

    def train(self):
        train_X, test_X, train_y, test_y = train_test_split(
            self.train_[self.features], self.train_[self.labels],
            test_size=0.4, random_state=42)
        self.eval_set = [(train_X.values, train_y.values), (test_X.values, test_y.values)]
        self.classifier.fit(train_X.values, train_y.values,
                            eval_metric='merror',
                            eval_set=self.eval_set,
                            early_stopping_rounds=self.early_stopping_rounds,
                            verbose=True)
        self.classifier.save_model(self.conf.model_path.format(self.task))
        return 'Model has been saved!'

    def test(self):
        test_set = self.test_[self.features].values
        self.classifier.load_model(self.conf.model_path.format(self.task))
        # save_model/load_model do not persist the sklearn label encoder,
        # so restore it by hand before calling predict.
        self.classifier._le = LabelEncoder().fit([
            'USAGE', 'TOPIC', 'MODEL-FEATURE', 'PART_WHOLE', 'RESULT', 'COMPARE'
        ])
        pred = self.classifier.predict(test_set)
        predictions = pd.concat([
            self.test_[self.features],
            pd.DataFrame(pred, columns=["relation"])
        ], axis=1)
        return predictions
def XGB_class_evaluation(individual):
    N_SPLITS = N_splits
    kf = KFold(n_splits=N_SPLITS)
    fc = XGBClassifier(learning_rate=individual[0],
                       n_estimators=100,
                       silent=True,
                       nthread=-1,
                       gamma=0,
                       min_child_weight=individual[1],
                       max_depth=individual[2],
                       subsample=individual[3],
                       colsample_bylevel=individual[4],
                       seed=0)
    M_pos = 0
    M_mid = 0
    M_neg = 0
    for train, test in kf.split(trainX):
        fc.fit(trainX[train, :], trainY[train])
        testY_pre = fc.predict(trainX[test, :])
        Ind_pos = (trainY[test] == 1)
        Ind_mid = (trainY[test] == 0)
        Ind_neg = (trainY[test] == -1)
        # Accumulate per-class recall for this fold
        M_pos += len(np.where(np.array(testY_pre[Ind_pos]) == 1)[0]) / len(np.where(Ind_pos)[0])
        M_mid += len(np.where(np.array(testY_pre[Ind_mid]) == 0)[0]) / len(np.where(Ind_mid)[0])
        M_neg += len(np.where(np.array(testY_pre[Ind_neg]) == -1)[0]) / len(np.where(Ind_neg)[0])
    # Average each class's recall over the folds
    correct = map(lambda x: x / N_SPLITS, [M_pos, M_mid, M_neg])
    return tuple(correct)
def get_ntree():
    f1_t_total, f1_v_total = [], []
    for ntree in range(10, 810, 10):
        xgb_base = XGBClassifier(objective='binary:logistic',
                                 n_estimators=ntree,
                                 random_state=1234,
                                 silent=0,
                                 booster='gbtree',
                                 subsample=0.8,
                                 colsample_bytree=0.8,
                                 reg_alpha=1,
                                 reg_lambda=0,
                                 learning_rate=0.1,
                                 max_depth=6)
        print('current ntree = %s' % ntree)
        xgb_base.fit(X_t, y_t)
        y_t_pre = xgb_base.predict(X_t)
        y_v_pre = xgb_base.predict(X_v)
        f1_t_each = f1_score(y_t, y_t_pre, average='micro')
        f1_v_each = f1_score(y_v, y_v_pre, average='micro')
        f1_t_total.append(f1_t_each)
        f1_v_total.append(f1_v_each)
        with open('D:\\workspace python\\contest\\accu_save\\' + 'xgbbase_810_1.txt',
                  'a', encoding='utf-8') as myfile:
            print(f1_t_each, ',', f1_v_each, file=myfile)
    return f1_t_total, f1_v_total
def xgboost(train_features, train_labels, test_features, feature_list=None, hfo_type_name=None):
    clf = XGBClassifier(nthread=-1)
    # A previously tuned configuration, kept for reference:
    # clf = XGBClassifier(learning_rate=0.05,
    #                     n_estimators=1000,  # 100
    #                     max_depth=6,
    #                     min_child_weight=3,
    #                     gamma=0.05,
    #                     subsample=0.8,
    #                     colsample_bytree=0.8,
    #                     reg_alpha=0.005,
    #                     objective='binary:logistic',
    #                     nthread=-1,
    #                     scale_pos_weight=1,
    #                     seed=10,
    #                     eval_metric='aucpr')
    clf.fit(train_features, train_labels)
    # Predict over test
    clf_predictions = clf.predict(test_features)
    clf_probs = clf.predict_proba(test_features)[:, 1]
    # graphics.feature_importances(feature_list, clf.feature_importances_, hfo_type_name, fig_id)
    return clf_predictions, clf_probs, clf
def leaveoneout(dataset, labels):
    '''Classifier: XGBoost; cross-validation: leave-one-out.'''
    leaveoo = LeaveOneOut()
    # Y_true = []
    # Y_pre = []
    # XGBoost parameters fall into three groups:
    # 1. general parameters
    # 2. booster parameters, controlling each boosting step
    # 3. learning-task parameters, controlling the training objective
    for train_index, test_index in leaveoo.split(dataset):
        # plain fancy indexing; the original double brackets added a spurious dimension
        x_train, x_test = dataset[train_index], dataset[test_index]
        y_train, y_test = [labels[i] for i in train_index], [labels[i] for i in test_index]
        estimator = XGBClassifier(
            silent=0,                     # 1 suppresses runtime messages; 0 is usually best
            min_child_weight=1,
            gamma=0,                      # minimum loss reduction for a further split; larger is more conservative (typically ~0.1-0.2)
            max_delta_step=1,             # maximum step size allowed for each tree's weight estimate
            colsample_bytree=0.8,         # column subsampling when building each tree
            nthread=4,
            objective='binary:logistic',  # loss to minimise; binary logistic regression returns probabilities, not classes
            reg_lambda=1,                 # L2 regularisation on weights; larger values reduce overfitting
            scale_pos_weight=1,
            n_estimators=200,             # number of trees
            seed=1000                     # random seed
        )
        estimator.fit(x_train, y_train)
        # print(estimator.best_params_)  # best_params_ only exists on a search object such as GridSearchCV
        y_true, y_pre = y_test, list(estimator.predict(x_test))
        print("Accuracy : %.6g" % metrics.accuracy_score(y_true, y_pre))
def gradient_boosted_trees(train_features: np.array, train_labels: np.array,
                           test_features: np.array, **kwargs):
    """Gradient Boosted Trees classifier.

    Parameters
    ----------
    train_features : np.array
        Training sample features.
    train_labels : np.array
        Training sample classes.
    test_features : np.array
        Test sample features.
    kwargs : extra parameters
        All parameters allowed by sklearn.XGBClassifier.

    Returns
    -------
    predictions : np.array
        Predicted classes.
    prob : np.array
        Classification probability for all objects, [pIa, pnon-Ia].
    """
    # Create the classifier instance
    clf = XGBClassifier(**kwargs)
    clf.fit(train_features, train_labels)      # train
    predictions = clf.predict(test_features)   # predict
    prob = clf.predict_proba(test_features)    # get probabilities
    return predictions, prob, clf
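# Usage sketch for gradient_boosted_trees on synthetic data; the dataset and
# the hyperparameters passed through **kwargs are assumptions, and the
# XGBClassifier import the snippet relies on is assumed to be in scope.
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=300, n_features=12, random_state=42)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=42)
predictions, prob, clf = gradient_boosted_trees(X_tr, y_tr, X_te, n_estimators=50, max_depth=3)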
def compute_cv_metric(split, cross_val_data, bayes_trials_results):
    # Create the classifier for the cross-validation results
    clf = XGBClassifier(random_state=0, n_jobs=-1, **bayes_trials_results[0]['params'])
    train_x = cross_val_data[split][0]
    train_y = cross_val_data[split][1]
    test_x = cross_val_data[split][2]
    test_y = cross_val_data[split][3]
    clf.fit(train_x, train_y)
    y_pred_cv = clf.predict(test_x)
    y_pred_prob_cv = clf.predict_proba(test_x)
    # Compute the confusion matrix once and unpack its cells
    tn, fp, fn, tp = confusion_matrix(test_y, y_pred_cv).ravel()
    npv = tn / (tn + fn)
    specificity = tn / (tn + fp)
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    roc_auc_cv = roc_auc_score(test_y, y_pred_prob_cv[:, 1])
    f1_cv = 2 * (precision * recall) / (precision + recall)
    return npv, specificity, precision, recall, roc_auc_cv, f1_cv, y_pred_prob_cv
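# compute_cv_metric above assumes two pre-built structures: cross_val_data,
# a list of (train_x, train_y, test_x, test_y) folds, and
# bayes_trials_results, whose first entry carries the best hyperparameters
# from a Bayesian search. A single-fold sketch with assumed shapes and
# values (confusion_matrix and roc_auc_score are assumed imported):
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=400, weights=[0.8], random_state=0)
tr_x, te_x, tr_y, te_y = train_test_split(X, y, test_size=0.25, random_state=0)
cross_val_data = [(tr_x, tr_y, te_x, te_y)]
bayes_trials_results = [{'params': {'max_depth': 4, 'n_estimators': 200}}]
print(compute_cv_metric(0, cross_val_data, bayes_trials_results))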
# define X, y
X, y = data.loc[:, data.columns != 'state'].values, data.loc[:, data.columns == 'state'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# ClusterCentroids under-sampling
cc = ClusterCentroids(random_state=0)
os_X, os_y = cc.fit_sample(X_train, y_train)  # newer imbalanced-learn versions name this fit_resample

# XGBoost
clf_XG = XGBClassifier(learning_rate=0.3, min_child_weight=1,
                       max_depth=6, gamma=0, subsample=1, max_delta_step=0,
                       colsample_bytree=1, reg_lambda=1, n_estimators=100,
                       seed=1000, scale_pos_weight=1000)
clf_XG.fit(os_X, os_y, eval_set=[(os_X, os_y), (X_test, y_test)],
           eval_metric='auc', verbose=False)
evals_result = clf_XG.evals_result()
y_true, y_pred = y_test, clf_XG.predict(X_test)

# F1 score, precision, recall, specificity, G score
print("F1_score : %.4g" % metrics.f1_score(y_true, y_pred))
print("Recall : %.4g" % metrics.recall_score(y_true, y_pred))
recall = metrics.recall_score(y_true, y_pred)
print("Precision : %.4g" % metrics.precision_score(y_true, y_pred))

# Compute confusion matrix
cnf_matrix = confusion_matrix(y_test, y_pred)
np.set_printoptions(precision=2)
print("Specificity: ", float(cnf_matrix[0, 0]) / (cnf_matrix[0, 0] + cnf_matrix[0, 1]))
specifity = float(cnf_matrix[0, 0]) / (cnf_matrix[0, 0] + cnf_matrix[0, 1])
print("G score: ", math.sqrt(recall / specifity))
# Plot non-normalized confusion matrix
# reg_alpha=0.1,
# seed=27)
# modelfit(xgb1, df_train, predictors, targetname, early_stopping_rounds=50)
xgb1 = XGBClassifier(learning_rate=0.01,
                     n_estimators=700,
                     max_depth=5,
                     min_child_weight=8,
                     gamma=0.3,
                     subsample=0.8,
                     colsample_bytree=0.8,
                     objective='binary:logistic',
                     scale_pos_weight=1,
                     seed=27)
xgb1.fit(df_train[predictors], df_train[targetname])
df_test['target'] = xgb1.predict(df_test[predictors])
df_test['target'] = df_test['target'].apply(lambda x: 'Y' if x == 1 else 'N')
submission = pd.DataFrame()
submission['Loan_ID'] = df_test['Loan_ID']
submission['Loan_Status'] = df_test['target']
submission.to_csv('submission_XGB_retunned.csv', index=False)
from sklearn.preprocessing import LabelEncoder
from xgboost.sklearn import XGBClassifier
import dataGatherer as dg

isTest = int(sys.argv[1])

if isTest == 1:
    train, test, feature_train, feature_test, label_train, label_test = dg.test_data(.8)
else:
    train, test, feature_train, feature_test, label_train = dg.prod_data()

f_train = pd.concat([train, feature_train], axis=1)
f_test = pd.concat([test, feature_test], axis=1)

xgb = XGBClassifier(max_depth=6, learning_rate=0.1, n_estimators=50,
                    objective='multi:softprob', subsample=1.0, colsample_bytree=1, seed=0)

le = LabelEncoder()
y = le.fit_transform(label_train.values)
xgb.fit(f_train.values, y)
y_pred = xgb.predict(f_test.values)
y_pred = le.inverse_transform(y_pred)

if isTest == 1:
    y_f = y_pred == label_test.values
    print("misclassified = " + str(len(y_f[y_f == False])))
    print("correct class = " + str(len(y_f[y_f == True])))
    print("score = " + str(len(y_f[y_f == True]) / len(y_f)))
    t = test[~y_f]
    l = label_test[~y_f]
    l_p = y_pred[~y_f]
    for i in range(0, len(l)):
        di.draw(t[i:i + 1].values[0, ],
                "images/prob_" + str(i) + "_" + str(l.values[i]) + "_" + str(l_p[i]))
else:
    index = list(range(1, len(y_pred) + 1))
    index = pd.DataFrame(index, columns=['ImageId'])
    y_pred = pd.DataFrame(y_pred, columns=['Label'])
class TrollClassifier:

    def set_train_path(self, path):
        self.train_path = path

    def pre_process(self, json, istrain):
        mecab = Mecab()
        data = []
        for cnt, article in enumerate(json):
            if cnt % 10000 == 0:
                print(cnt)
            text = bs(article["text"], "html.parser").text
            # title_pos = ["%s_%s" % (word, pos) for word, pos in mecab.pos(article["title"])]
            # author_pos = ["%s_%s" % (word, pos) for word, pos in mecab.pos(article["author"])]
            text_pos = ["%s_%s" % (first, second) for first, second in mecab.pos(text)]
            data.append({
                # "title_pos": title_pos,
                # "title_pos_sentences": " ".join(title_pos),
                # "author_pos": author_pos,
                # "author_pos_sentences": " ".join(author_pos),
                "text": article["text"],
                "text_pos": text_pos,
                "text_pos_sentences": " ".join(text_pos),
                # "forumid": article["forumid"],
                "pk": article["pk"]
            })
            if istrain == True:
                data[cnt]["istroll"] = article["is_troll"]
        data = pd.DataFrame.from_dict(data)
        data = data.set_index('pk')
        return data

    def fit(self, json_train, n_estimators=10, is_xgb=True):
        train = self.pre_process(json_train, istrain=True)
        bow_vectorizer = BagOfWordsVectorizer()
        word2vec_model = Word2VecModel()
        tag_counter_model = TagCounterModel()
        # word2vec_model.fit(train["author_pos_sentences"], 500)
        # author_features = word2vec_model.transform(train["author_pos_sentences"], "author")
        # self.author_model = word2vec_model.get_model()
        # bow_vectorizer.fit(train["title_pos_sentences"], 1000)
        # title_features = bow_vectorizer.transform(train["title_pos_sentences"], "title")
        # self.title_model = bow_vectorizer.get_vectorizer()
        bow_vectorizer.fit(train["text_pos_sentences"], 1000)
        text_features = bow_vectorizer.transform(train["text_pos_sentences"], "text")
        self.text_model = bow_vectorizer.get_vectorizer()
        # tag_features = tag_counter_model.fit_transform(train["text"])
        # self.tag_model = tag_counter_model.get_col()
        train = pd.concat([train, text_features], axis=1)
        # le = preprocessing.LabelEncoder()
        # train["forumid"] = le.fit_transform(train["forumid"])
        label = train['istroll']
        train = train.drop('istroll', axis=1)
        train = train.drop(['text', 'text_pos', 'text_pos_sentences'], axis=1)
        print(train.columns)
        train.columns = [str(x) for x in range(len(train.columns))]
        if is_xgb == False:
            self.model = RandomForestClassifier(n_estimators, n_jobs=-1)
        else:
            self.model = XGBClassifier(n_estimators=n_estimators, max_depth=10)
        print(train.shape)
        self.model.fit(train, label)

    def save_model(self, save_path="predict_model"):
        if not os.path.exists(save_path):
            os.makedirs(save_path)
        # pickle.dump(self.author_model, open("%s/author_model.p" % save_path, "wb"), protocol=pickle.HIGHEST_PROTOCOL)
        # pickle.dump(self.title_model, open("%s/title_model.p" % save_path, "wb"), protocol=pickle.HIGHEST_PROTOCOL)
        pickle.dump(self.text_model, open("%s/text_model.p" % save_path, "wb"), protocol=pickle.HIGHEST_PROTOCOL)
        # pickle.dump(self.tag_model, open("%s/tag_model.p" % save_path, "wb"), protocol=pickle.HIGHEST_PROTOCOL)
        pickle.dump(self.model, open("%s/predict_model.p" % save_path, "wb"), protocol=pickle.HIGHEST_PROTOCOL)

    def load_model(self, save_path="predict_model"):
        # self.author_model = pickle.load(open("%s/author_model.p" % save_path, "rb"))
        # self.title_model = pickle.load(open("%s/title_model.p" % save_path, "rb"))
        self.text_model = pickle.load(open("%s/text_model.p" % save_path, "rb"))
        # self.tag_model = pickle.load(open("%s/tag_model.p" % save_path, "rb"))
        self.model = pickle.load(open("%s/predict_model.p" % save_path, "rb"))

    def _predict(self, json_test):
        test = self.pre_process(json_test, istrain=False)
        bow_vectorizer = BagOfWordsVectorizer()
        word2vec_model = Word2VecModel()
        tag_counter_model = TagCounterModel()
        # word2vec_model.set_model(self.author_model)
        # author_features = word2vec_model.transform(test["author_pos_sentences"], "author")
        # bow_vectorizer.set_vectorizer(self.title_model)
        # title_features = bow_vectorizer.transform(test["title_pos_sentences"], "title")
        bow_vectorizer.set_vectorizer(self.text_model)
        text_features = bow_vectorizer.transform(test["text_pos_sentences"], "text")
        # tag_counter_model.set_col(self.tag_model)
        # tag_features = tag_counter_model.transform(test["text"])
        test = pd.concat([test, text_features], axis=1)
        # le = preprocessing.LabelEncoder()
        # test["forumid"] = le.fit_transform(test["forumid"])
        test = test.drop(['text', 'text_pos', 'text_pos_sentences'], axis=1)
        test.columns = [str(x) for x in range(len(test.columns))]
        return test

    def predict(self, json_test):
        result = self.model.predict(self._predict(json_test))
        return result

    def predict_proba(self, json_test):
        result = self.model.predict_proba(self._predict(json_test)).T
        # If the results are all False the array is not 2-dimensional, so return only the first column
        if result.shape[0] < 2:
            return result[0]
        else:
            return result[1]
data_x=pd.get_dummies(data.action_type,prefix="action_type") cols=["combined_shot_type","game_event_id","period","playoffs", "shot_type","shot_zone_area","shot_zone_basic","shot_zone_range", "matchup","opponent","game_date","shot_distance","minutes_remaining","seconds_remaining", "loc_x","loc_y"] for col in cols: data_x=pd.concat([data_x,pd.get_dummies(data[col],prefix=col),],axis=1) train_x=data_x[-pd.isnull(data.shot_made_flag)] test_x=data_x[pd.isnull(data.shot_made_flag)] train_y=data.shot_made_flag[-pd.isnull(data.shot_made_flag)] clf = XGBClassifier(max_depth=6, learning_rate=0.01, n_estimators=550, subsample=0.5, colsample_bytree=0.5, seed=0) clf.fit(train_x, train_y) y_pred = clf.predict(train_x) print("Number of mislabeled points out of a total %d points : %d" % (train_x.shape[0],(train_y != y_pred).sum())) def logloss(act, pred): epsilon = 1e-15 pred = sp.maximum(epsilon, pred) pred = sp.minimum(1-epsilon, pred) ll = sum(act*sp.log(pred) + sp.subtract(1,act)*sp.log(sp.subtract(1,pred))) ll = ll * -1.0/len(act) print(ll) return ll logloss(train_y,clf.predict_proba(train_x)[:,1]) test_y=clf.predict_proba(test_x)[:,1] test_id=data[pd.isnull(data.shot_made_flag)]["shot_id"]