def ModelPredict(para):  # e.g. para = [7.16, 0.4, 0.31, 0.9, 2.7, 1.48, 0.78, 0.86]
    data = pd.DataFrame(columns=('R', 'angle', 'occusion', 'score'))
    print("Prediction starting:")
    # every 4 consecutive values form one sample: R, angle, occusion, score
    for i in range(int(len(para) / 4)):
        print(i * 4)
        data.loc[i] = para[i * 4:i * 4 + 4]
    print(data)
    y_test = data.pop('score')
    x_test = data
    print(x_test)
    print(y_test)
    cab, lgb, xgb, gbdt, stack_lr = LoadModel()
    print("Models loaded:")
    y_pred_cab_test = cab.predict(x_test)
    y_pred_lgb_test = lgb.predict(x_test)
    y_pred_xgb_test = xgb.predict(x_test)
    y_pred_gbdt_test = gbdt.predict(x_test)
    print("stack")
    # second-level features: each base model's prediction becomes one column
    stack_x_test = pd.DataFrame()
    stack_x_test['Method_1'] = y_pred_cab_test
    stack_x_test['Method_2'] = y_pred_lgb_test
    stack_x_test['Method_3'] = y_pred_xgb_test
    stack_x_test['Method_4'] = y_pred_gbdt_test
    stack_pred = stack_lr.predict(stack_x_test)
    print("stack_mae:", mean_absolute_error(y_test, stack_pred))  # mae: 2.1501818709279975
    print(stack_pred.tolist())
    return stack_pred.tolist()
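# ModelPredict above calls a LoadModel() helper that is not shown; a minimal
# sketch, assuming the four base models and the stacking regressor were saved
# with joblib under these hypothetical file names:
def LoadModel():
    import joblib
    cab = joblib.load('catboost.model')       # CatBoost regressor
    lgb = joblib.load('lightgbm.model')       # LightGBM regressor
    xgb = joblib.load('xgboost.model')        # XGBoost regressor
    gbdt = joblib.load('gbdt.model')          # sklearn GBDT regressor
    stack_lr = joblib.load('stack_lr.model')  # second-level linear model
    return cab, lgb, xgb, gbdt, stack_lr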
def prediction():
    xgb = xgboost.XGBRegressor(n_estimators=100, learning_rate=0.08, gamma=0,
                               subsample=0.75, colsample_bytree=1, max_depth=7)
    traindf, testdf = train_test_split(X_train, test_size=0.3)  # note: this split is never used below
    xgb.fit(X_train, y_train)
    predictions = xgb.predict(X_test)
    # explained_variance_score expects (y_true, y_pred); the original had the arguments reversed
    print(explained_variance_score(y_test, predictions))
def inference(self, spec):
    # Preprocess the data ('詳細情報' is the "detail info" column)
    X = spec.drop(['price', '詳細情報'], axis=1).values  # .as_matrix() is deprecated in pandas
    y = spec['price']
    indices = spec.index
    # Load the model
    xgb = self.load_model()
    # Run inference
    prediction = xgb.predict(X)
    # Compare the results
    error = y - prediction
    error_per = abs(error) / y * 100
    result = pd.DataFrame(
        {
            'prediction': prediction,
            'error': error,
            'error_percent': error_per,
        },
        index=indices)
    result_all = pd.concat([spec, result], axis=1)
    treasure = result_all[(result_all['error_percent'] > 5) & (result_all['error'] < 0)]
    # # Print the results
    # print(result_all)
    return treasure
def XGBoost_classifier(X_train, train_target, X_test):
    X_test = X_test.values
    xgb = XGBClassifier()
    xgb.fit(X_train, train_target)
    hyp = xgb.predict(X_test)
    return hyp
def xgboost(train_x, train_y, test_x, test_y):
    import xgboost as xgb
    # min_samples_leaf is a scikit-learn parameter, not an XGBoost one;
    # min_child_weight is the closest XGBoost equivalent
    xgb = xgb.XGBClassifier(n_estimators=150, min_child_weight=3, max_depth=6)
    xgb.fit(train_x, train_y)
    y_pred = xgb.predict(test_x)
    print(classification_report(test_y, y_pred))
    print(confusion_matrix(test_y, y_pred))
    print('xgboost accuracy is', accuracy_score(test_y, y_pred))
def XGBscore(self):
    X_train_leaves = self.x_train
    y_train = self.y_train
    X_test_leaves = self.x_test
    y_test = self.y_test
    xgb = XGBClassifier()
    xgb.fit(X_train_leaves, y_train)
    Y_pred_xgb = xgb.predict(X_test_leaves)
    xgb_auc = roc_auc_score(y_test, Y_pred_xgb)
    print('GBDT + XGB auc: %.5f' % xgb_auc)
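# A hedged variant of XGBscore: AUC is usually computed from predicted
# probabilities rather than the hard labels used above, which makes the score
# threshold-free. Same attributes as XGBscore assumed.
def XGBscore_proba(self):
    xgb = XGBClassifier()
    xgb.fit(self.x_train, self.y_train)
    y_score = xgb.predict_proba(self.x_test)[:, 1]  # positive-class probability
    print('GBDT + XGB auc (probabilities): %.5f' % roc_auc_score(self.y_test, y_score))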
def test_xgboost_sklearn_gressor():
    l1 = []
    # Note: load_boston was removed in scikit-learn 1.2; this requires an older
    # version (or a replacement dataset such as fetch_california_housing)
    from sklearn.datasets import load_boston
    boston = load_boston()
    xgb = XGBRegressor()
    xgb.fit(boston.data, boston.target)
    predictions = xgb.predict(boston.data)
    l1 += predictions.tolist()
    print(predictions)
    print(type(predictions))
def train_xgb(data):
    X = data.drop(['cause'], axis=1).values
    Y = data['cause'].values
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
    xgb = XGBClassifier(n_estimators=300)
    xgb.fit(X_train, y_train)
    preds = xgb.predict(X_test)
    acc_xgb = (preds == y_test).sum().astype(float) / len(preds) * 100
    print("XGBoost's prediction accuracy is: %3.2f" % acc_xgb)
def check_performance(g_z, train_data, test_data, data_cols, label_cols=[],
                      seed=0, with_class=False, data_dim=2):
    # train_data, test_data = load_preprocess_aps_data()
    if len(label_cols) > 0:
        gen_df = pd.DataFrame(g_z[:, :-1], columns=data_cols)
    else:
        gen_df = pd.DataFrame(g_z, columns=data_cols)
    gen_df['failure'] = np.ones((g_z.shape[0], 1))
    combined_train_df = pd.concat([train_data, gen_df])
    print(train_data.shape, gen_df.shape, combined_train_df.shape)
    xgb_params = {
        # 'tree_method': 'hist',  # for faster evaluation
        'max_depth': 3,  # for faster evaluation
        'n_estimators': 18,
        # 'objective': 'binary:logistic',
        'random_state': 0,
        # 'eval_metric': 'auc',
        # allows for balanced or unbalanced classes
        'scale_pos_weight': 40,
        'min_child_weight': 44,
        'silent': 1
    }  # note: this dict is unused; the classifier below repeats its values
    X_train = combined_train_df[combined_train_df.columns.drop('failure')].values
    y_train = combined_train_df.failure
    X_test = test_data[test_data.columns.drop('failure')].values
    y_test = test_data.failure
    xgb = XGBClassifier(max_depth=3, n_estimators=18, n_jobs=-1,
                        scale_pos_weight=40, min_child_weight=44)
    xgb.fit(X_train, y_train)
    y_pred = xgb.predict(X_test)
    # dtrain = xgb.DMatrix(X_train, y_train, feature_names=data_cols + label_cols)
    # dtest = xgb.DMatrix(X_test, feature_names=data_cols + label_cols)
    # xgb_test = xgb.train(xgb_params, dtrain, num_boost_round=10)  # limit to ten rounds for faster evaluation
    # y_pred = np.round(xgb_test.predict(dtest))
    print('Test performance confusion',
          confusion_matrix(y_test, y_pred))  # assumes balanced real and generated datasets
    return aps_cost(y_pred, y_test)  # assumes balanced real and generated datasets
def fonction_model_xgb(data):
    df = fonction_select_xgb(data)
    X = df.drop("tx_rec_marg_Bin", axis=1)
    y = df["tx_rec_marg_Bin"]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
    kf = KFold(n_splits=3)
    kf.get_n_splits(X)
    # Scale only the quantitative columns (more than 3 distinct values)
    Quant = df[[col for col in df.columns.to_list() if df[col].nunique() > 3]]
    num = list(Quant.columns)
    scaler = StandardScaler().fit(X_train[num])
    X_train[num] = scaler.transform(X_train[num])
    X_test[num] = scaler.transform(X_test[num])
    # criterion and max_features are scikit-learn parameters, not XGBoost ones,
    # so they are dropped here
    xgb = XGBClassifier(max_depth=7, n_estimators=500)
    # grid_xgb = GridSearchCV(estimator=xgb, param_grid=param_grid, scoring="accuracy")
    # print(grid_rf.best_params_)
    xgb.fit(X_train, y_train)
    y_pred = xgb.predict(X_test)
    print(classification_report(y_test, y_pred))
    # print(grid_xgb.best_params_)
    xgb_shap = XGBClassifier(max_depth=7, n_estimators=500)
    xgb_shap.fit(X_train, y_train)
    shap_values = shap.TreeExplainer(xgb_shap).shap_values(X_train)
    shap.summary_plot(shap_values, X_train, plot_type="bar")  # summary_plot returns None, no point printing it
    print(confusion_matrix(y_test, y_pred))
    print(f1_score(y_test, y_pred, average='micro'))  # (y_true, y_pred) order
    return data
def runXGBoost(train_data_mix_n, train_Y, test_data_mix_n, test_Y):
    xgb = XGBClassifier(max_depth=10, min_child_weight=6, gamma=0.5,
                        colsample_bytree=0.7, subsample=0.7, reg_alpha=1)
    xgb.fit(train_data_mix_n, train_Y)
    predicted_label = xgb.predict(test_data_mix_n)
    print("Test accuracy using XGBoost Classifier")
    print(accuracy_score(test_Y, predicted_label))
    print("Confusion Matrix for XGBoost Classifier..")
    cnf_matrix = confusion_matrix(test_Y, predicted_label)
    print(cnf_matrix)
def xgb_model_1(X_train, y_train, X_test, params=None):
    # train with the scikit-learn API
    xgb = XGBRegressor(n_estimators=1000, max_depth=13, min_child_weight=150,
                       subsample=0.7, colsample_bytree=0.3)
    y_test = np.zeros(len(X_test))
    for i, (train_ind, val_ind) in enumerate(
            KFold(n_splits=2, shuffle=True, random_state=1989).split(X_train)):
        print("----------------------")
        print("Training model #%d" % i)
        print("----------------------")
        # XGBRegressor.fit with early stopping on the held-out fold
        xgb.fit(X_train[train_ind], y_train[train_ind],
                eval_set=[(X_train[val_ind], y_train[val_ind])],
                early_stopping_rounds=10, verbose=25)
        # ntree_limit/best_ntree_limit are deprecated in xgboost >= 1.4
        # (iteration_range is the newer equivalent)
        y_test += xgb.predict(X_test, ntree_limit=xgb.best_ntree_limit)
    y_test /= 2  # average the two fold models
    return y_test
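# Usage sketch for xgb_model_1 with hypothetical data: the fold indexing
# X_train[train_ind] assumes numpy arrays, so DataFrames should be converted
# (e.g. via .values) before calling it.
import numpy as np
X_tr = np.random.rand(100, 5)  # hypothetical features
y_tr = np.random.rand(100)     # hypothetical targets
X_te = np.random.rand(20, 5)
preds = xgb_model_1(X_tr, y_tr, X_te)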
def predict(data):
    """Predict."""
    X_all = data.drop(['FTR'], axis=1)
    X_all = change_type(X_all)
    X_all = one_hot_encode(X_all)
    y_all = data['FTR']
    y_all = y_all.map({'NH': 0, 'H': 1})  # map the labels to 0 and 1
    # Load the model
    xgb = joblib.load('xgboost_model_demo.model')
    # Randomly draw 10 rows for prediction; sample X and y together so the rows
    # stay aligned (the original sampled them independently)
    random_x = X_all.sample(n=10)
    random_y = y_all.loc[random_x.index]
    # Run the prediction
    predict_result = xgb.predict(random_x)
    print("Actual: %s \nPredicted: %s" % (random_y.values, predict_result))
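# predict() above assumes 'xgboost_model_demo.model' already exists; a minimal
# sketch of how such a file could be produced with joblib, assuming the same
# preprocessed X_all / y_all as inside predict() (hypothetical training step):
import joblib
clf = XGBClassifier()
clf.fit(X_all, y_all)
joblib.dump(clf, 'xgboost_model_demo.model')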
def pred():
    xgb = XGBClassifier(booster='gbtree', gamma=0.0, max_depth=8, min_child_weight=3,
                        learning_rate=0.03, n_jobs=-1, scale_pos_weight=1,
                        reg_alpha=0.1, reg_lambda=1, colsample_bytree=0.9,
                        subsample=0.8, n_estimators=370, objective="binary:hinge",
                        tree_method='gpu_hist', gpu_id=0, random_state=5477113)
    xgb.fit(x_combined, y_combined)
    y_pred = xgb.predict(X_test)
    team_name = 'TeamFOSAI'
    submission_index = 2
    label_file = '/media/jose/hk-data/PycharmProjects/the_speech/data/mask/labels/labels.csv'
    df_labels = pd.read_csv(label_file)
    # Write out predictions to csv file (official submission format)
    pred_file_name = task + '.' + feat_type + '.test.' + team_name + '_' + str(submission_index) + '.csv'
    print('Writing file ' + pred_file_name + '\n')
    df = pd.DataFrame(
        data={'file_name': df_labels['file_name'][df_labels['file_name'].str.startswith('test')].values,
              'prediction': le.inverse_transform(y_pred).flatten()},
        columns=['file_name', 'prediction'])
    df.to_csv(pred_file_name, index=False)
    print('Done.\n')
def predictor():
    data = request.get_json(force=True)
    a = str(data.get("rain"))
    b = str(data.get("temperature"))
    c = str(data.get("season"))
    # c = data.get("humidity")
    d = str(data.get("state"))
    e = str(data.get("year"))
    f = str(data.get("P_r"))
    array = []
    array.append([getIndex(state_enc, d), getIndex(enc_year, e), getIndex(enc_season, c),
                  float(b), float(a), float(6)])
    array = np.array(array)
    df = pd.DataFrame(array, columns=['State_Name', 'Crop_Year', 'Season',
                                      'temperature', 'Rainfall', 'P_r'])
    prediction = xgb.predict(df)
    prediction = int(np.round(prediction)[0])  # single-row input: take the first element
    ans = enc_crop[prediction]
    ans = np.array([ans])
    df = pd.DataFrame(ans, columns=['crop'])
    return df.to_json(orient='records')
def train_xgb():
    import xgboost as xgb
    train_set, evaluation_set = split_train_set(encoding())
    # train_set, evaluation_set = split_train_set()
    train_set.fillna(0, inplace=True)
    print(train_set.head())
    print('prepare for the training...')
    features = [x for x in train_set.columns if x not in ['label']]
    y_train = train_set['label']
    X_train = train_set[features]
    y_test = evaluation_set['label']
    X_test = evaluation_set[features]
    xgb_train = xgb.DMatrix(X_train, y_train)
    xgb_test = xgb.DMatrix(X_test)
    print('X_train shape')
    print(X_train.shape)
    print('y_train shape')
    print(y_train.shape)
    params = {
        'objective': 'binary:logistic',
        'eta': 0.1,
        'colsample_bytree': 0.886,
        'min_child_weight': 2,
        'max_depth': 10,
        'subsample': 0.886,
        # 'alpha': 10,
        # 'gamma': 30,
        # 'lambda': 50,
        'eval_metric': 'auc',
        'scale_pos_weight': 10,
        'seed': 201703,
        # 'verbose_eval' and 'missing' are not booster parameters:
        # verbose_eval belongs to xgb.train, missing to xgb.DMatrix
    }
    # early_stopping_rounds requires a non-empty evals watchlist; with no
    # labeled evaluation DMatrix here, watch the training set itself
    bst = xgb.train(params, xgb_train, evals=[(xgb_train, 'train')],
                    early_stopping_rounds=20)
    pre = bst.predict(xgb_test)
    print(type(pre))
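# A sketch of the alternative: since train_xgb has the held-out labels (y_test),
# a labeled DMatrix lets early stopping watch held-out AUC instead of training
# AUC. Variable names reuse those inside train_xgb.
xgb_eval = xgb.DMatrix(X_test, y_test)
bst = xgb.train(params, xgb_train, evals=[(xgb_eval, 'eval')],
                early_stopping_rounds=20)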
# --------------- XGBoost algorithm
import xgboost as xgb
from xgboost.sklearn import XGBClassifier

tic7 = time.time()
# n_fold is not an XGBClassifier parameter, so it is dropped here
xgb = XGBClassifier(objective='multi:softmax', num_class=4,
                    colsample_bytree=1, learning_rate=0.15,
                    max_depth=5, n_estimators=600, subsample=0.3)
xgb.fit(X_train, y_train)
y_pred = xgb.predict(X_test)
# Get the accuracy score
acc = accuracy_score(y_test, y_pred)
# Get the weighted f1 score
f1_xgb = f1_score(y_test, y_pred, average='weighted')
# Append to the accuracy list
# accuracy_lst.append(acc)
# f1_lst.append(f1_xgb)
print("[XGBoost algorithm] accuracy_score: {:.3f}.".format(acc))
print("[XGBoost algorithm] f1_score: {:.3f}.".format(f1_xgb))
toc7 = time.time()
# the original print was truncated and mislabeled the model as logistic regression
print('Elapsed time for XGBoost is %f seconds \n' % (toc7 - tic7))
cv_output[['train-rmse-mean', 'test-rmse-mean']].plot()
num_boost_rounds = len(cv_output)
model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round=num_boost_rounds)
xgb.plot_importance(model, height=0.5)
num_boost_round = model.best_iteration
xgb.plot_importance(model, height=0.5)

from xgboost.sklearn import XGBRegressor
# note: this rebinds the name `xgb` from the xgboost module to the regressor instance
xgb = XGBRegressor(nthread=-1, missing=-1, n_estimators=300, learning_rate=0.02,
                   max_depth=17, subsample=0.9, min_child_weight=3,
                   colsample_bytree=0.7, reg_alpha=100, reg_lambda=100,
                   silent=False)
xgb.fit(x_train, y_train)
# print(x_train)
pred = xgb.predict(x_test)
predictions = [round(value) for value in pred]
"""x_test['result']=pred
x_test['crop']=y_test
x_test.to_csv('pred.csv')"""
# print(accuracy_score(y_test, pred))
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

@app.route('/predictor', methods=['POST', 'GET'])
def predictor():
    data = request.get_json(force=True)
    a = str(data.get("rain"))
    b = str(data.get("temperature"))
    c = str(data.get("season"))
    # c = data.get("humidity")
# (opening of this mapping truncated in the original; the name `gender` is
# inferred from its use below)
gender = {
    'Supermarket Type3': 2,
    'Supermarket Type2': 1
}
datatest.Outlet_Type = [gender[item] for item in datatest.Outlet_Type]
datatest.head()

# using Random Forest Regressor
regr = RandomForestRegressor(max_depth=2, random_state=0, n_estimators=500)
regr.fit(data[data.columns[1:6]], data["Item_Outlet_Sales"])
datat.dtypes
pred = regr.predict(datatest[datatest.columns[1:6]])

# using XGBoost
xgb = xgb.XGBRegressor(n_estimators=50, learning_rate=0.09, gamma=0,
                       subsample=0.85, colsample_bytree=1, max_depth=7)
xgb.fit(data[data.columns[1:6]], data["Item_Outlet_Sales"])
pred = xgb.predict(datatest[datatest.columns[1:6]])
datat["Item_Outlet_Sales"] = pred
newdf = datat[['Item_Identifier', 'Outlet_Identifier', 'Item_Outlet_Sales']]
newdf.to_csv("D://24projects//Project 3//output.csv", encoding='utf-8', index=False)
datat["Item_Outlet_Sales"].isnull().sum()
print('Time Taken: {:.2f} seconds'.format(time_taken))

# In[ ]:

# sklearn metrics expect (y_true, y_pred); the original passed them reversed
print(accuracy_score(y_test, preds1))
print(classification_report(y_test, preds1))
print(confusion_matrix(y_test, preds1))

# In[ ]:

import xgboost as xgb

time1 = time.time()
xgb = xgb.XGBClassifier(n_jobs=-1)
xgb.fit(X_train, y_train)
preds2 = xgb.predict(X_test)
time_taken = time.time() - time1
print('Time Taken: {:.2f} seconds'.format(time_taken))

# In[ ]:

# manual method to check accuracy, see first 100 predictions, around 70% correct prediction
for i in range(100):
    if preds2[i] == np.array(y_test)[i]:
        print('1', end=', ')  # correct prediction
    else:
        print('0', end=', ')  # wrong prediction

# In[ ]:

preds2[0:100:5]
def run_benchmark(args):
    try:
        dtest = xgb.DMatrix('dtest.dm')
        dtrain = xgb.DMatrix('dtrain.dm')
        if not (dtest.num_col() == args.columns
                and dtrain.num_col() == args.columns):
            raise ValueError("Wrong cols")
        if not (dtest.num_row() == args.rows * args.test_size
                and dtrain.num_row() == args.rows * (1 - args.test_size)):
            raise ValueError("Wrong rows")
    except:
        print("Generating dataset: {} rows * {} columns".format(args.rows, args.columns))
        print("{}/{} test/train split".format(args.test_size, 1.0 - args.test_size))
        tmp = time.time()
        X, y = make_classification(args.rows, n_features=args.columns,
                                   n_redundant=0, n_informative=args.columns,
                                   n_repeated=0, random_state=7)
        if args.sparsity < 1.0:
            X = np.array([[np.nan if rng.uniform(0, 1) < args.sparsity else x
                           for x in x_row] for x_row in X])
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=args.test_size, random_state=7)
        print("Generate Time: %s seconds" % (str(time.time() - tmp)))
        # save to .csv file
        np.savetxt('train.csv',
                   np.concatenate((X_train, y_train.reshape((y_train.shape[0], 1))), axis=1),
                   fmt='%.8f', delimiter=',')
        np.savetxt('test.csv',
                   np.concatenate((X_test, y_test.reshape((y_test.shape[0], 1))), axis=1),
                   fmt='%.8f', delimiter=',')
        tmp = time.time()
        print("DMatrix Start")
        dtrain = xgb.DMatrix(X_train, y_train)
        dtest = xgb.DMatrix(X_test, y_test, nthread=-1)
        print("DMatrix Time: %s seconds" % (str(time.time() - tmp)))
        dtest.save_binary('dtest.dm')
        dtrain.save_binary('dtrain.dm')

    param = {'max_depth': 6, 'eta': 0.1, 'silent': 1, 'objective': 'binary:logistic'}
    if args.params != '':  # `is not ''` compares identity, not equality
        param.update(ast.literal_eval(args.params))
    param['tree_method'] = args.tree_method
    print("Training with '%s'" % param['tree_method'])
    tmp = time.time()
    param['nthread'] = 24
    param['eval_metric'] = 'auc'
    # bst = xgb.train(param, dtrain, args.iterations, evals=[(dtest, "test")])
    # keep the returned Booster: prediction must go through the trained model,
    # not the xgb module
    bst = xgb.train(param, dtrain, args.iterations)
    print("Train Time: %s seconds" % (str(time.time() - tmp)))

    # prediction
    preds = bst.predict(dtest)
    y_test = dtest.get_label()
    auc_score = metrics.roc_auc_score(y_test, preds)
    logger.info('auc = %f', auc_score)
per = fs_results['per'][0]

# re-run the model with the best parameters and features selected
fs = feature_selection.SelectPercentile(feature_selection.f_classif, percentile=per)
feature_model = fs.fit(features_train, target_train)
features_train_new = feature_model.transform(features_train)
features_test_new = feature_model.transform(features_test)

# Create the model
xgb = xgboost.XGBClassifier(n_estimators=est, learning_rate=lr, gamma=0,
                            subsample=subsample, colsample_bytree=colsample_bytree,
                            max_depth=depth)
# Fit the model
xgb.fit(features_train_new, target_train)
pred_test = xgb.predict(features_test_new)
pred_train = xgb.predict(features_train_new)

# predict the games
predictions_train = [round(value) for value in pred_train]
predictions_test = [round(value) for value in pred_test]

# calculate the accuracy
train_accuracy = accuracy_score(target_train, predictions_train)
test_accuracy = accuracy_score(target_test, predictions_test)
print(train_accuracy)
print(test_accuracy)

# store the predictions
pred_df1 = pd.DataFrame(predictions_test)
def c(x):
    if x == '0':
        x = 0
    elif x == 'a':
        x = 3
    elif x == 'b':
        x = 2
    elif x == 'c':
        x = 1
    return x  # the original was missing this return, so .apply(c) produced None

StateHoliday = df_all.StateHoliday.astype(str).apply(c).values
df_all = df_all.drop(["StateHoliday"], axis=1)  # drop returns a copy; assign it back
df_all["StateHoliday"] = StateHoliday
vals = df_all.values
print(df_all.columns)  # Python 3 print
X = vals[:piv_train]
y = Sales
X_test = vals[piv_train:]
xgb = XGBRegressor(max_depth=6, learning_rate=0.3, n_estimators=25,
                   subsample=0.5, colsample_bytree=0.5, seed=0)
xgb.fit(X, y)
y_pred = xgb.predict(X_test)
sub = pd.DataFrame(np.column_stack((ids, y_pred)), columns=['Id', 'Sales'])
sub.to_csv('sub.csv', index=False)
        ('txt2', pipeline.Pipeline([('s2', cust_txt_col(key='product_title')),
                                    ('tfidf2', tfidf), ('tsvd2', tsvd)])),
        ('txt3', pipeline.Pipeline([('s3', cust_txt_col(key='product_description')),
                                    ('tfidf3', tfidf), ('tsvd3', tsvd)])),
        ('txt4', pipeline.Pipeline([('s4', cust_txt_col(key='brand')),
                                    ('tfidf4', tfidf), ('tsvd4', tsvd)]))
        ],
        transformer_weights={'cst': 1.0, 'txt1': 0.5, 'txt2': 0.25, 'txt3': 0.0, 'txt4': 0.5},
        n_jobs=-1
    )),
    ('rfr', rfr)])

param_grid = {'rfr__max_features': [10], 'rfr__max_depth': [20]}
model = grid_search.GridSearchCV(estimator=clf, param_grid=param_grid, n_jobs=-1,
                                 cv=2, verbose=20, scoring=RMSE)
model.fit(X_train, y_train)
XGBmodel = grid_search.GridSearchCV(estimator=clfXGB, param_grid=param_grid, n_jobs=-1,
                                    cv=2, verbose=20, scoring=RMSE)
XGBmodel.fit(X_train, y_train)

print("Best parameters found by grid search:")
print(model.best_params_)
print("Best CV score:")
print(model.best_score_)

# the original referenced an undefined `xgb` here; XGBmodel is the fitted
# XGBoost grid search above
y_pred = 0.25 * model.predict(X_test) + 0.75 * XGBmodel.predict(X_test)
pd.DataFrame({"id": id_test, "relevance": y_pred}).to_csv('submission.csv', index=False)
print("--- Training & Testing: %s minutes ---" % round(((time.time() - start_time) / 60), 2))
def measure_others(X_train, y_train, X_test, y_test):
    # Logistic Regression
    log_reg_params = {
        "penalty": ['none', 'l2'],
        'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]
    }
    grid_log_reg = GridSearchCV(LogisticRegression(), log_reg_params)
    grid_log_reg.fit(X_train, y_train)
    # We automatically get the logistic regression with the best parameters.
    log_reg = grid_log_reg.best_estimator_

    knears_params = {
        "n_neighbors": list(range(1, 10, 1)),
        'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
    }
    grid_knears = GridSearchCV(KNeighborsClassifier(), knears_params)
    grid_knears.fit(X_train, y_train)
    # KNears best estimator
    knears_neighbors = grid_knears.best_estimator_
    k = grid_knears.best_estimator_.n_neighbors  # k nearest

    # Support Vector Classifier
    svc_params = {
        'C': [0.5, 0.7, 0.9, 1],
        'kernel': ['rbf', 'poly', 'sigmoid', 'linear']
    }
    grid_svc = GridSearchCV(SVC(), svc_params)
    grid_svc.fit(X_train, y_train)
    # SVC best estimator
    svc = grid_svc.best_estimator_

    # DecisionTree Classifier
    tree_params = {
        "criterion": ["gini", "entropy"],
        "max_depth": list(range(2, 4, 1)),
        "min_samples_leaf": list(range(5, 7, 1))
    }
    grid_tree = GridSearchCV(DecisionTreeClassifier(), tree_params)
    grid_tree.fit(X_train, y_train)
    # tree best estimator
    tree_clf = grid_tree.best_estimator_

    ########################################################################################
    y_pred_log_reg = log_reg.predict(X_test)
    y_pred_knear = knears_neighbors.predict(X_test)
    y_pred_svc = svc.predict(X_test)
    y_pred_tree = tree_clf.predict(X_test)
    ###############################################################################################
    gnb = GaussianNB()
    y_pred_gnb = gnb.fit(X_train, y_train).predict(X_test)
    ########################################################################################
    import xgboost as xgb
    xgb_model = xgb.XGBClassifier(objective="binary:logistic")
    # uniform and randint are assumed imported from scipy.stats
    xgb_params = {
        "colsample_bytree": uniform(0.7, 0.3),
        "gamma": uniform(0, 0.5),
        "learning_rate": uniform(0.03, 0.3),  # default 0.1
        "max_depth": randint(2, 6),  # default 3
        "n_estimators": randint(100, 150),  # default 100
        "subsample": uniform(0.6, 0.4)
    }
    rand_xgb = RandomizedSearchCV(xgb_model, param_distributions=xgb_params,
                                  random_state=42, n_iter=200, cv=3, verbose=1,
                                  n_jobs=1, return_train_score=True)
    rand_xgb.fit(X_train, y_train)
    # XGB best estimator
    xgb = rand_xgb.best_estimator_
    y_pred_xgb = xgb.predict(X_test)

    return k, y_pred_log_reg, y_pred_knear, y_pred_svc, y_pred_tree, y_pred_gnb, y_pred_xgb
    # (continuation of a fold loop truncated in the original)
    x_test_stacking.iloc[i, 3] = x_test_stacking.iloc[i, 3] + pred_rf[i]
    x_test_stacking.iloc[i, 4] = x_test_stacking.iloc[i, 4] + pred_knn[i]
# ------------------------------------------------------------------
# Average the accumulated test-set predictions over the three folds
print(x_test_stacking)
for i in range(len(x_test)):
    for j in range(5):
        x_test_stacking.iloc[i, j] = x_test_stacking.iloc[i, j] / 3
print(x_test_stacking)

# Second layer: fit xgboost on the stacked features
xgb = XGBRegressor(max_depth=4, learning_rate=0.005, n_estimators=500,
                   silent=True, objective='reg:linear', subsample=0.93,
                   base_score=y_mean, seed=0, missing=None)
xgb.fit(x_train_stacking, y_train)
pred = xgb.predict(x_test_stacking)
print(pred)
output = pd.DataFrame({'id': test['ID'].astype(np.int32), 'y': pred})
output.to_csv('C:\\Users\\Administrator\\Desktop\\benz\\new\\test_stacking.csv', index=False)
    colsample_bytree=0.3)  # (opening of this constructor call truncated in the original)
y_test = np.zeros(len(X_test))
for i, (train_ind, val_ind) in enumerate(
        KFold(n_splits=2, shuffle=True, random_state=1989).split(X_train)):
    print('----------------------')
    print('Training model #%d' % i)
    print('----------------------')
    xgb.fit(X_train[train_ind], y_train[train_ind],
            eval_set=[(X_train[val_ind], y_train[val_ind])],
            early_stopping_rounds=10, verbose=25)
    y_test += xgb.predict(X_test, ntree_limit=xgb.best_ntree_limit)
y_test /= 2

### ================================================ ###
df_sub = pd.DataFrame({
    'id': df_all[df_all['trip_duration'].isnull()]['id'].values,
    'trip_duration': np.exp(y_test)
}).set_index('id')
print(df_sub)
df_sub.to_csv('~/NYC_Taxi_Trip_Duration/output/0824_xgb_387.csv')
"""
# print(auto_classifier.config_dict)
"colsample_bytree": 0.95, "alpha": 2e-05, "lambda": 10 } ROUNDS = 151 print('xgboost train:') xgboost = xgboost.train(params=xgb_params, dtrain=d_train, num_boost_round=ROUNDS, verbose_eval=10) #lgb.plot_importance(lightGBM, figsize=(9,40)) #plt.show() # save model to file joblib.dump(xgboost, "xgboost.model") df_test = load_from_hdfs('df_test') final_preds = xgboost.predict(df_test[features_to_use]) df_final_pred = pd.DataFrame(df_test[['order_id', 'product_id']]) df_final_pred['prediction'] = final_preds df_final_pred = df_final_pred.loc[df_final_pred.prediction > 0.01, ['order_id', 'prediction', 'product_id']] df_order = applyParallel(df_final_pred.groupby(df_final_pred.order_id), create_products).reset_index() df_order[['order_id', 'products']].to_csv('../submission/submission.csv', index=False)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(features1, target, test_size=0.1, random_state=42)

from sklearn import svm
svm1 = svm.SVC()
svm1.fit(X_train, y_train)
predictionssvm = svm1.predict(X_test)

dtc = DecisionTreeClassifier()
modeldtc = dtc.fit(X_train, y_train)
predictionsdtc = dtc.predict(X_test)

adb = AdaBoostClassifier()
modeladb = adb.fit(X_train, y_train)
predictionsadb = adb.predict(X_test)

from sklearn.ensemble import GradientBoostingClassifier
# note: despite the name, this is scikit-learn's gradient boosting, not XGBoost
xgb = GradientBoostingClassifier()
modelxgb = xgb.fit(X_train, y_train)
predictionsxgb = xgb.predict(X_test)

import operator
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(solver='adam', activation='tanh', random_state=0)
modelmlp = mlp.fit(X_train, y_train)
predictionmlp = mlp.predict(X_test)

# 4. Stacked Classifier
X = features1
y = target
clf1 = adb
clf2 = dtc
clf3 = svm1
meta = LogisticRegression()
sclf = StackingClassifier(classifiers=[meta, clf1, clf3],
train_new.drop([ID], axis=1, inplace=True)
test_new.drop([ID], axis=1, inplace=True)

pca = PCA(.95)
pca.fit(train_new)
train_new2 = pca.transform(train_new)
test_new2 = pca.transform(test_new)
train_new3 = pd.DataFrame(train_new2)
test_new3 = pd.DataFrame(test_new2)

train_new4 = train_new
test_new4 = test_new
train_new4[ID] = train_ids
test_new4[ID] = test_ids
train_new4[target] = targets

predictors = [x for x in train_new4.columns if x not in [target, ID]]
xgb = xgb.XGBRegressor(n_estimators=1000, learning_rate=0.08, gamma=0,
                       subsample=0.75, colsample_bytree=1, max_depth=7)
x_train, x_cv, y_train, y_cv = train_test_split(train_new4.loc[:, predictors],
                                                train_new4.loc[:, target], test_size=0.2)
xgb.fit(x_train, y_train)
print(metrics.r2_score(y_train, xgb.predict(x_train.loc[:, predictors])))
print(metrics.r2_score(y_cv, xgb.predict(x_cv.loc[:, predictors])))

predicted_xgb = xgb.predict(test_new4.loc[:, predictors])
print(len(predicted_xgb))
for i in range(len(predicted_xgb)):
    if predicted_xgb[i] < 0:
        predicted_xgb[i] = 0

predicted = pd.DataFrame()
predicted['Id'] = test[ID]
predicted['Yield'] = predicted_xgb
predicted.to_csv("/home/ubuntu/Hackathons/Yield Prediction/submission.csv", index=False)
price = dataset['COST']
Data = dataset.drop(['COST'], axis=1)
x = np.array(price).reshape(-1, 1)

from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(Data, x, test_size=0.33, random_state=0)

import xgboost as xgb
# keep the model on its own name so the xgb module stays usable for plot_tree
# below; the original also misspelled the parameter as `objectives`
model = xgb.XGBRegressor(n_estimators=10, max_depth=5,
                         objective='reg:linear', learning_rate=0.3)

import time
dep = time.time()
model.fit(X_train, Y_train)
fin = time.time() - dep

predictions = model.predict(X_test)

from sklearn.metrics import mean_squared_error
rmse = np.sqrt(mean_squared_error(Y_test, predictions))
print("RMSE: %f" % rmse)

from sklearn.metrics import explained_variance_score
EV = explained_variance_score(Y_test, predictions)
print("EV : %f" % EV)

import matplotlib.pyplot as plt
import os
os.getcwd()
os.chdir('C:/Program Files (x86)/Graphviz2.38/bin')
xgb.plot_tree(model, num_trees=9)
plt.rcParams['figure.figsize'] = [50, 10]
plt.show()
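# If chdir-ing into the Graphviz bin directory is undesirable, the tree can
# also be exported through xgboost's to_graphviz helper; a sketch reusing the
# `model` trained above:
graph = xgb.to_graphviz(model, num_trees=9)
graph.render('tree9')  # writes tree9.pdf via the graphviz package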
over_samples_X, over_samples_y = over_samples.fit_sample(X_train, y_train)
# over_samples_X, over_samples_y = over_samples.fit_sample(X_train.values, y_train.values.ravel())
# Class ratio before resampling
print(y_train.value_counts() / len(y_train))
# Class ratio after resampling
print(pd.Series(over_samples_y).value_counts() / len(over_samples_y))

# Import third-party packages
import xgboost
import numpy as np

# Build the XGBoost classifier (note: this rebinds the module name `xgboost`)
xgboost = xgboost.XGBClassifier()
# Fit it on the resampled data
xgboost.fit(over_samples_X, over_samples_y)
# Apply the model to the test set
resample_pred = xgboost.predict(np.array(X_test))
# Report the model's performance
print('Model accuracy:\n', metrics.accuracy_score(y_test, resample_pred))
print('Model evaluation report:\n', metrics.classification_report(y_test, resample_pred))

# Probability of a fraudulent transaction, used to build the ROC curve
y_score = xgboost.predict_proba(np.array(X_test))[:, 1]
fpr, tpr, threshold = metrics.roc_curve(y_test, y_score)
# Compute the AUC
roc_auc = metrics.auc(fpr, tpr)
# Draw the filled area
plt.stackplot(fpr, tpr, color='steelblue', alpha=0.5, edgecolor='black')
# Add the boundary line
plt.plot(fpr, tpr, color='black', lw=1)
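# The snippet above uses `over_samples` without showing how it was built; a
# minimal sketch assuming imbalanced-learn's SMOTE was intended (fit_sample
# was renamed fit_resample in newer imbalanced-learn releases):
from imblearn.over_sampling import SMOTE
over_samples = SMOTE(random_state=1234)  # random_state is a hypothetical choice
over_samples_X, over_samples_y = over_samples.fit_resample(X_train, y_train)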