def fit(self, X, y):
    dtrain = xgb.DMatrix(X, label=y, missing=np.nan)  # np.NaN was removed in NumPy 2.0
    # Passing the previous booster via xgb_model continues training incrementally.
    self.model = xgb.train(self.learner_params, dtrain, self.boosting_rounds, xgb_model=self.model)
if x_test[c].dtype == 'object':
    lbl = preprocessing.LabelEncoder()
    lbl.fit(list(x_test[c].values))
    x_test[c] = lbl.transform(list(x_test[c].values))

xgb_params = {
    'eta': 0.05,
    'max_depth': 5,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'objective': 'reg:linear',  # renamed 'reg:squarederror' in newer XGBoost
    'eval_metric': 'rmse',
    'silent': 1
}

dtrain = xgb.DMatrix(x_train, y_train)
dtest = xgb.DMatrix(x_test)

# cv_output = xgb.cv(xgb_params, dtrain, num_boost_round=1000, early_stopping_rounds=20,
#                    verbose_eval=25, show_stdv=False)
# print('best num_boost_rounds = ', len(cv_output))
# num_boost_rounds = len(cv_output)  # 382
num_boost_rounds = 385

model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round=num_boost_rounds)
y_predict = model.predict(dtest)

output = pd.DataFrame({'id': id_test, 'price_doc': y_predict})
output.to_csv('output.csv', index=False)
print('ok')  # original used the Python 2 print statement
        print('Train score is:', scr[i])

    print(log_loss(y, oob_pred))
    print(oob_pred[1:10])  # original used the Python 2 print statement
    sub_pred = sub_pred.mean(axis=1)

    oob_pred_filename = '../output/oob_pred_rfentropy_' + str(np.mean(scr))
    sub_pred_filename = '../output/sub_pred_rfentropy_' + str(np.mean(scr))
    pkl.dump(oob_pred, open(oob_pred_filename + '.p', 'wb'))
    pkl.dump(sub_pred, open(sub_pred_filename + '.p', 'wb'))

    preds = pd.DataFrame({"ID": ids, "PredictedProb": sub_pred})
    preds.to_csv(sub_pred_filename + '.csv', index=False)

    '''
    fraction_of_positives, mean_predicted_value = calibration_curve(y, oob_pred, n_bins=10)
    plt.plot(mean_predicted_value, fraction_of_positives, "s-", label="ssafaasf")
    plt.show()
    '''
else:
    # Train on full data
    dtrain = xgb.DMatrix(X, y)
    dtest = xgb.DMatrix(X_sub)
    clf = xgb.train(xgb_param, dtrain, m_params['n_rounds'])

    pred = clf.predict(dtrain)
    print('Train score is:', log_loss(y, np.array(pred)))
    # clf.save_model(model_name + '.model')

    pred = clf.predict(dtest)
    print("Saving Results.")
    preds = pd.DataFrame({"ID": ids, "target": pred})
    preds.to_csv(model_name + '.csv', index=False)
def xgb_model(X_train, X_valid, y_train, y_valid, X_test_id, X_test):
    """
    XGBoost model with step-wise hyperparameter tuning via xgb.cv.
    :param X_train: training features
    :param X_valid: validation features
    :param y_train: training target
    :param y_valid: validation target
    :param X_test_id: test-set ids
    :param X_test: test features
    :return: None (writes a submission file)
    """
    import pandas as pd
    import numpy as np
    import xgboost as xgb

    dtrain = xgb.DMatrix(X_train, label=y_train.values)
    dvalid = xgb.DMatrix(X_valid, label=y_valid.values)

    # ########################################## Tuning Parameters ##########################################
    xgb_best_params = {}
    params = {'booster': 'gbtree',
              'objective': 'reg:squarederror',
              'max_depth': 6,
              'learning_rate': 1,
              'gamma': 0,
              'min_child_weight': 1,
              'subsample': 1,
              'colsample_bytree': 1,
              'reg_alpha': 0,
              'reg_lambda': 1,  # original key had a stray trailing space: 'reg_lambda '
              'random_state': 23,
              'gpu_id': 0,
              'max_bin': 16,
              'tree_method': 'gpu_exact'  # removed in recent XGBoost; 'gpu_hist' is the replacement there
              }

    # ########################################### n_estimators ############################################
    # NOTE: 'n_estimators' is a scikit-learn wrapper parameter; xgb.cv is governed
    # by num_boost_round, so this loop largely re-runs the same cross-validation.
    min_merror = np.inf
    for n_estimators in range(10, 1000, 10):
        params['n_estimators'] = n_estimators
        cv_results = xgb.cv(params, dtrain, nfold=3, num_boost_round=1000,
                            early_stopping_rounds=30, feval=rmspe_xg, seed=23)
        mean_error = min(cv_results['test-rmspe-mean'])
        if mean_error < min_merror:
            min_merror = mean_error
            xgb_best_params["n_estimators"] = n_estimators
    params["n_estimators"] = xgb_best_params["n_estimators"]

    # ########################################### max_depth & min_child_weight #############################
    min_merror = np.inf
    for max_depth in range(6, 11, 1):
        for min_child_weight in range(4, 10, 1):
            params['max_depth'] = max_depth
            params['min_child_weight'] = min_child_weight
            cv_results = xgb.cv(params, dtrain, nfold=3, num_boost_round=1000,
                                early_stopping_rounds=50, feval=rmspe_xg, seed=23)
            # original used np.argmin here, which returns an index, not the error value
            mean_error = min(cv_results['test-rmspe-mean'])
            if mean_error < min_merror:
                min_merror = mean_error
                xgb_best_params["max_depth"] = max_depth
                xgb_best_params["min_child_weight"] = min_child_weight
    params['max_depth'] = xgb_best_params['max_depth']
    params["min_child_weight"] = xgb_best_params["min_child_weight"]

    # ########################################### gamma #####################################################
    min_merror = np.inf  # original forgot to reset this before the gamma search
    for gamma in [i / 10.0 for i in range(0, 1)]:  # note: this range only tests gamma=0.0
        params['gamma'] = gamma
        cv_results = xgb.cv(params, dtrain, nfold=3, early_stopping_rounds=50,
                            feval=rmspe_xg, seed=23)
        mean_error = min(cv_results['test-rmspe-mean'])
        if mean_error < min_merror:
            min_merror = mean_error
            xgb_best_params["gamma"] = gamma
    params["gamma"] = xgb_best_params["gamma"]

    # ############################################# subsample & colsample_bytree ############################
    min_merror = np.inf
    for subsample in [i / 10.0 for i in range(6, 10)]:
        for colsample_bytree in [i / 10.0 for i in range(6, 10)]:
            params['subsample'] = subsample
            params['colsample_bytree'] = colsample_bytree
            cv_results = xgb.cv(params, dtrain, nfold=3, early_stopping_rounds=50,
                                feval=rmspe_xg, seed=23)
            mean_error = min(cv_results['test-rmspe-mean'])
            if mean_error < min_merror:
                min_merror = mean_error
                xgb_best_params["subsample"] = subsample
                xgb_best_params["colsample_bytree"] = colsample_bytree
    params["subsample"] = xgb_best_params["subsample"]
    params["colsample_bytree"] = xgb_best_params["colsample_bytree"]

    # ############################################# reg_alpha ################################################
    min_merror = np.inf
    for reg_alpha in [0.8, 0.9, 1, 1.1, 1.2]:
        params['reg_alpha'] = reg_alpha
        cv_results = xgb.cv(params, dtrain, nfold=3, early_stopping_rounds=50,
                            feval=rmspe_xg, seed=23)
        mean_error = min(cv_results['test-rmspe-mean'])
        if mean_error < min_merror:
            min_merror = mean_error
            xgb_best_params["reg_alpha"] = reg_alpha
    params["reg_alpha"] = xgb_best_params["reg_alpha"]

    # ############################################# reg_lambda ###############################################
    min_merror = np.inf
    for reg_lambda in [0.8, 0.9, 1, 1.1, 1.2]:
        params['reg_lambda'] = reg_lambda
        cv_results = xgb.cv(params, dtrain, nfold=3, early_stopping_rounds=50,
                            feval=rmspe_xg, seed=23)
        mean_error = min(cv_results['test-rmspe-mean'])
        if mean_error < min_merror:
            min_merror = mean_error
            xgb_best_params["reg_lambda"] = reg_lambda
    params["reg_lambda"] = xgb_best_params["reg_lambda"]

    # ############################################# learning_rate ############################################
    min_merror = np.inf
    for learning_rate in [0.001, 0.005, 0.01, 0.03, 0.05]:
        params['learning_rate'] = learning_rate
        cv_results = xgb.cv(params, dtrain, nfold=3, early_stopping_rounds=50,
                            feval=rmspe_xg, seed=23)
        mean_error = min(cv_results['test-rmspe-mean'])
        if mean_error < min_merror:
            min_merror = mean_error
            xgb_best_params["learning_rate"] = learning_rate
    params["learning_rate"] = xgb_best_params["learning_rate"]

    print(params)

    bst_params = {
        "eta": 0.3,
        "alpha": 1,
        "silent": 1,
        "seed": 42,
        "objective": params['objective'],
        "booster": params['booster'],
        "max_depth": params['max_depth'],
        'min_child_weight': params['min_child_weight'],
        "subsample": params['subsample'],
        "colsample_bytree": params['colsample_bytree'],
        "reg_alpha": params['reg_alpha'],
        "gpu_id": params['gpu_id'],
        "max_bin": params['max_bin'],
        "tree_method": params['tree_method'],
        "n_estimators": params['n_estimators']
    }

    watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
    xgb_model = xgb.train(bst_params, dtrain, num_boost_round=1000, evals=watchlist,
                          early_stopping_rounds=100, feval=rmspe_xg, verbose_eval=True)

    print("Validating")
    yhat = xgb_model.predict(xgb.DMatrix(X_valid))
    error = rmspe(np.expm1(y_valid.values), np.expm1(yhat))
    print('RMSPE: {:.6f}'.format(error))

    xgb_test_prod = xgb_model.predict(xgb.DMatrix(X_test))
    xgb_test_prod = np.expm1(xgb_test_prod)

    sub_df = pd.DataFrame(data=list(X_test_id), columns=['id'])
    sub_df["forecastVolum"] = [int(i) for i in xgb_test_prod]
    sub_df.to_csv(DefaultConfig.project_path + "/data/submit/" + DefaultConfig.select_model +
                  "_submission.csv", index=False, encoding='utf-8')
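# The tuning function above references `rmspe` and `rmspe_xg` without defining
# them. A minimal sketch of what they usually look like in Rossmann-style
# pipelines (an assumption, not the author's code): RMSPE is the root mean
# square percentage error, and rmspe_xg adapts it to XGBoost's feval interface,
# undoing the log1p transform applied to the target.
import numpy as np

def rmspe(y, yhat):
    # ignore zero-target rows to avoid division by zero
    mask = y != 0
    return np.sqrt(np.mean(((y[mask] - yhat[mask]) / y[mask]) ** 2))

def rmspe_xg(yhat, dtrain):
    y = np.expm1(dtrain.get_label())  # labels were stored as log1p(target)
    return 'rmspe', rmspe(y, np.expm1(yhat))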
line = ser.readline()
total_byte = total_byte + len(line.decode('utf-8'))
# print(" byte:", len(line.decode('utf-8')), " total_byte:", total_byte)
line_str = (line.decode('utf-8')).replace('\n', '')  # decode bytes to str, then strip the newline
lines = line_str.split(',')
print(lines)

with open(test_data, 'a', newline="") as f:
    writer = csv.writer(f)
    writer.writerow(lines)

### Predict immediately
# Build a pandas DataFrame
df = pd.read_csv(test_data)
data = pd.DataFrame(df, columns=['bothfoot_L', 'swing_L', 'bothfoot_R', 'swing_R',
                                 'stand_L', 'stand_R'])  # assumes the CSV has a header row
# Predict on the last row only
dtest = xgb.DMatrix(data.tail(1))
pred = ypred = bst.predict(dtest, ntree_limit=bst.best_ntree_limit)
# print(pred[0])

# Cluster number; the number-to-label mapping must be redefined for each run
cpred = int(pred[0])
if cpred == 4:
    print("Normal")
elif cpred == 0:
    print("Tired")
elif cpred == 8:
    print("RUN")
else:
    print("Stop")

except KeyboardInterrupt:  # for manual confirmation
df_train = train1[train1.shop_id.notnull()]
df_test = train1[train1.shop_id.isnull()]

lbl = preprocessing.LabelEncoder()
lbl.fit(list(df_train['shop_id'].values))
df_train['label'] = lbl.transform(list(df_train['shop_id'].values))
num_class = df_train['label'].max() + 1

params = {
    'objective': 'multi:softmax',
    'eta': 0.1,
    'max_depth': 9,
    'eval_metric': 'merror',
    'seed': 0,
    'missing': -999,
    'num_class': num_class,
    'silent': 1
}

feature = [x for x in train1.columns if x not in
           ['user_id', 'label', 'shop_id', 'time_stamp', 'mall_id', 'wifi_infos', 'hours', 'weekday']]
xgbtrain = xgb.DMatrix(df_train[feature], df_train['label'])
xgbtest = xgb.DMatrix(df_test[feature])
# Note: both watchlist entries point at the training set, so early stopping
# monitors training error here rather than a held-out set.
watchlist = [(xgbtrain, 'train'), (xgbtrain, 'test')]
num_rounds = 60
model = xgb.train(params, xgbtrain, num_rounds, watchlist, early_stopping_rounds=15)

df_test['label'] = model.predict(xgbtest)
# inverse_transform expects an array-like, so wrap the value and unwrap the result
df_test['shop_id'] = df_test['label'].apply(lambda x: lbl.inverse_transform([int(x)])[0])
r = df_test[['row_id', 'shop_id']]
result = pd.concat([result, r])
result['row_id'] = result['row_id'].astype('int')
result.to_csv(path + 'sub.csv', index=False)
def run(self):
    """
    Run the whole training and predicting process.

    Returns:
    """
    logger.info('Predicting week 10...')
    logger.info('PCA with week 10...')
    reg_train_10 = pd.read_csv('processed/reg_train_10.csv', dtype=DTYPES)
    reg_train_10 = self.set_clusters(reg_train_10)

    for cluster_no in range(0, self._n_clusters):
        tmp = reg_train_10[reg_train_10.cluster == cluster_no]
        tmp_feature = tmp[FEATURES].values
        dtest = xgb.DMatrix(tmp_feature, label=None, feature_names=FEATURES)
        del tmp_feature
        y_test = self._xgb_boosters[cluster_no].predict(dtest)
        tmp['Demanda_uni_equil'] = np.exp(y_test) - 1
        if cluster_no == 0:
            sub_10 = tmp[['id', 'Demanda_uni_equil']]
            train_10 = tmp[['Semana', 'Agencia_ID', 'Canal_ID', 'Ruta_SAK',
                            'Cliente_ID', 'Producto_ID', 'Demanda_uni_equil']]
        else:
            sub_10 = pd.concat([sub_10, tmp[['id', 'Demanda_uni_equil']]])
            train_10 = pd.concat([train_10,
                                  tmp[['Semana', 'Agencia_ID', 'Canal_ID', 'Ruta_SAK',
                                       'Cliente_ID', 'Producto_ID', 'Demanda_uni_equil']]])

    logger.info('Predicting week 11...')
    raw_train = pd.concat([RAW_TRAIN, train_10], axis=0)
    reg_train_11 = pp.extract_lag_features(data_frame=raw_train, test_set=RAW_TEST, week=11)
    logger.info('PCA with week 11...')
    reg_train_11 = self.set_clusters(reg_train_11)

    for cluster_no in range(0, self._n_clusters):
        # The original filtered on `.label` and built features from the whole
        # frame; both look like copy-paste slips from the week-10 loop (the
        # prediction length would not match the cluster subset).
        tmp = reg_train_11[reg_train_11.cluster == cluster_no]
        tmp_feature = tmp[FEATURES].values
        dtest = xgb.DMatrix(tmp_feature, label=None, feature_names=FEATURES)
        del tmp_feature
        y_test = self._xgb_boosters[cluster_no].predict(dtest)
        tmp['Demanda_uni_equil'] = np.exp(y_test) - 1
        if cluster_no == 0:
            sub_11 = tmp[['id', 'Demanda_uni_equil']]
            train_11 = tmp[['Semana', 'Agencia_ID', 'Canal_ID', 'Ruta_SAK',
                            'Cliente_ID', 'Producto_ID', 'Demanda_uni_equil']]
        else:
            sub_11 = pd.concat([sub_11, tmp[['id', 'Demanda_uni_equil']]])
            train_11 = pd.concat([train_11,
                                  tmp[['Semana', 'Agencia_ID', 'Canal_ID', 'Ruta_SAK',
                                       'Cliente_ID', 'Producto_ID', 'Demanda_uni_equil']]])

    sub = pd.concat([sub_10, sub_11])
    sub = sub.sort_values(by='id')
    sub.to_csv('submission/submission_cluster_xgb.csv', index=False)
    logger.info('Done with submission.')
# random_state has no effect when shuffle=False (and newer scikit-learn rejects the combination)
folds = StratifiedKFold(n_splits=5, shuffle=False)
oof = np.zeros(len(train_df))
predictions = np.zeros(len(test_df))

for i, (trn, val) in enumerate(folds.split(train_df.values, target.values)):
    print(i + 1, "fold. AUC")
    trn_x = train_df.iloc[trn][features]
    trn_y = target.iloc[trn]
    val_x = train_df.iloc[val][features]
    val_y = target.iloc[val]

    model = xgb.train(params,
                      xgb.DMatrix(trn_x, trn_y),
                      100000,
                      [(xgb.DMatrix(trn_x, trn_y), 'train'), (xgb.DMatrix(val_x, val_y), 'valid')],
                      verbose_eval=5000,
                      early_stopping_rounds=3000)
    oof[val] = model.predict(xgb.DMatrix(val_x), ntree_limit=model.best_ntree_limit)
    predictions += model.predict(xgb.DMatrix(test_df[features]),
                                 ntree_limit=model.best_ntree_limit) / folds.n_splits

print("CV score: {:<8.5f}".format(roc_auc_score(target, oof)))
cv_auc = roc_auc_score(target, oof)
cv_auc = cv_auc.round(6)
OUTPUT_FILE = 'xgb_result_50w/' + output_name + '-' + str(cv_auc) + '.csv'
def _model_predict(all_feature, predict_feature, predict_col, num_boost_round=1000):
    k_v = {}
    if predict_col in enum_col or predict_col in ext_enum_col:
        # Drop classes with too few samples
        def func_count(df):
            df['value_count'] = df[predict_col].count()
            return df

        if predict_col in large_limit_col.keys():
            number_limit = large_limit_col[predict_col]
        else:
            number_limit = 10
        all_feature = all_feature.groupby(predict_col).apply(func_count)
        del_test_size = len(all_feature[(all_feature[test_label_col] == 1) &
                                        (all_feature["value_count"] < number_limit)])
        print(predict_col, "del_test_size:", del_test_size)

        # The full test set, before rare-class filtering
        test_feature_org = all_feature[all_feature[test_label_col] == 1]
        test_feature_org.drop(["value_count"], axis=1, inplace=True)
        test_y_org = np.array(test_feature_org[predict_col])
        test_x_org = np.array(test_feature_org.drop([predict_col, test_label_col], axis=1))
        print("test_x_org", test_x_org.shape)

        all_feature = all_feature[all_feature["value_count"] >= number_limit]
        all_feature.drop(["value_count"], axis=1, inplace=True)

        # Map raw values to class indices
        label = all_feature[predict_col]
        all_y = sorted(list(set(label)))
        if len(all_y) == 1:
            # Only one distinct value: return it directly
            print("only one value!")
            return np.array([all_y[0]] * len(predict_feature)), 1
        v_k = {}
        for k, v in enumerate(all_y):
            v_k[v] = k
            k_v[k] = v
        label = np.array([v_k[i] for i in label])
        all_feature[predict_col] = label

    train_feature = all_feature[all_feature[test_label_col] == 0]
    train_y = np.array(train_feature[predict_col])
    train_x = np.array(train_feature.drop([predict_col, test_label_col], axis=1))
    test_feature = all_feature[all_feature[test_label_col] == 1]
    test_y = np.array(test_feature[predict_col])
    test_x = np.array(test_feature.drop([predict_col, test_label_col], axis=1))
    predict_x = np.array(predict_feature.drop([predict_col, test_label_col], axis=1))
    print("train_x:", train_x.shape, "test_x:", test_x.shape, "predict_x", predict_x.shape)

    params = {'booster': 'gbtree',
              'eta': 0.02,
              'max_depth': 8,  # 5 4 3
              'colsample_bytree': 0.9,  # 0.8 0.7
              'subsample': 0.8,
              'min_child_weight': 40,  # 2 3
              'silent': 1,
              'nthread': 4,
              'tree_method': 'gpu_hist',
              "gpu_id": 0,
              "seed": 0}
    if predict_col in bool_col:
        params["objective"] = "binary:logistic"
        params["eval_metric"] = "error"
        params["is_unbalance"] = True  # LightGBM parameter; XGBoost ignores it (scale_pos_weight is the equivalent)
        eval_metric = None
    elif predict_col in enum_col or predict_col in ext_enum_col:
        params["objective"] = "multi:softmax"
        params["eval_metric"] = "merror"
        params["num_class"] = max(label) + 1
        eval_metric = None
    else:
        params["objective"] = "reg:linear"
        eval_metric = tool.xgb_metric

    train_set = xgb.DMatrix(train_x, label=train_y)
    valid_set = xgb.DMatrix(test_x, label=test_y)
    # The original passed maximize=True unconditionally; that only fits the custom
    # feval, since the built-in 'error'/'merror' metrics must be minimized.
    temp_model = xgb.train(params, train_set, num_boost_round=num_boost_round,
                           evals=[(valid_set, "validate")], feval=eval_metric,
                           maximize=eval_metric is not None,
                           early_stopping_rounds=200, verbose_eval=False)
    test_pred = temp_model.predict(valid_set)

    # Convert probabilities to labels
    if predict_col in bool_col:
        test_pred = np.where(test_pred > 0.5, 1, 0)
    elif predict_col in enum_col or predict_col in ext_enum_col:
        # Score on the original, unfiltered test set
        if del_test_size > 0:
            valid_set = xgb.DMatrix(test_x_org)
            test_pred = temp_model.predict(valid_set)
            test_y = test_y_org
        # Map class indices back to the original values
        test_pred = np.array([k_v[i] for i in test_pred])

    if_round = False
    if predict_col in category_col:
        test_s = tool.label_score(test_y, test_pred)
    else:
        test_s = tool.regression_score(test_y, test_pred)
        # Rounding to 2, 1, or 0 decimals sometimes scores better
        test_pred2 = np.round(test_pred, 2)
        test_s2 = tool.regression_score(test_y, test_pred2)
        if test_s < test_s2 - threshold:
            if_round = 2
            test_s = test_s2
        test_pred2 = np.round(test_pred, 1)
        test_s2 = tool.regression_score(test_y, test_pred2)
        if test_s < test_s2 - threshold:
            if_round = 1
            test_s = test_s2
        test_pred2 = np.round(test_pred, 0)
        test_s2 = tool.regression_score(test_y, test_pred2)
        if test_s < test_s2 - threshold:
            if_round = 0
            test_s = test_s2

    print("best iteration: ", temp_model.best_iteration)
    print("test score: ", test_s)

    predict_set = xgb.DMatrix(predict_x)
    predict_target = temp_model.predict(predict_set)
    predict_target = np.array(predict_target)
    if predict_col in enum_col or predict_col in ext_enum_col:
        predict_target = np.array([k_v[i] for i in predict_target])
    elif predict_col in bool_col:
        predict_target = np.where(predict_target > 0.5, 1, 0)
    # The original tested `if if_round:`, which silently skipped the
    # round-to-integer case (if_round == 0 is falsy).
    if if_round is not False:
        predict_target = np.round(predict_target, if_round)
    return predict_target, test_s
                     colsample_bytree=0.8,
                     objective='binary:logistic',
                     nthread=4,
                     scale_pos_weight=1,
                     seed=27)
modelfit(xgb1, train, predictors)

# In[79]:

df = test
df = df.drop('id', axis=1)           # positional axis argument is deprecated in pandas
df = df.drop('scan_folder', axis=1)

# In[80]:

xgtest = xgb.DMatrix(df[predictors].values)

# In[81]:

preds = xgb1.predict_proba(df)

# In[82]:

preds

# In[110]:

data = []
cols = ['id', 'cancer']
df = test
for i, row in tqdm(df.iterrows(), total=len(df)):
# =============================================================================
# wait
# =============================================================================
while True:
    if os.path.isfile('SUCCESS_803'):
        break
    else:
        sleep(60 * 1)

utils.send_line('{} start'.format(__file__))

# =============================================================================
# load train
# =============================================================================
dtrain = xgb.DMatrix('../data/dtrain.mt')
gc.collect()

# =============================================================================
# xgboost
# =============================================================================
param = {
    'colsample_bylevel': 0.8,  # original had a typo: 'colsample_bylebel'
    'subsample': 0.1,
    'eta': 0.1,
    'eval_metric': 'auc',
    'max_depth': 4,
    'objective': 'binary:logistic',
    'silent': 1,
    'tree_method': 'hist',
train_columns = x_train.columns
for c in x_train.dtypes[x_train.dtypes == object].index.values:
    x_train[c] = (x_train[c] == True)

del df_train
gc.collect()

# split = 80000
split = int(len(x_train) * 0.88)
print("split: " + str(split))
x_train, y_train, x_valid, y_valid = \
    x_train[:split], y_train[:split], x_train[split:], y_train[split:]

print('Building DMatrix...')
d_train = xgb.DMatrix(x_train, label=y_train)
d_valid = xgb.DMatrix(x_valid, label=y_valid)

del x_train, x_valid
gc.collect()

print('Training ...')
params = {}
params['seed'] = 21
params['eta'] = 0.02
params['objective'] = 'reg:linear'
params['eval_metric'] = 'mae'
params['silent'] = 1
params['max_depth'] = 4
params['min_child_weight'] = 1
params['gamma'] = 0.0
print(feature_names)

from sklearn.model_selection import KFold

del train['filename']
del train['classification']

k = 3
kfold = KFold(n_splits=k)
features_T = train.drop(['pathologie'], axis=1).values
pathologies_T = train.pathologie
print("Data ready for Kfold")
print("Kfold test starting")

for i, (train_index, test_index) in enumerate(kfold.split(features_T, pathologies_T)):
    print('[Fold %d/%d]' % (i + 1, k))  # original hard-coded 3 instead of k
    X_train, X_test = features_T[train_index], features_T[test_index]
    y_train, y_test = pathologies_T[train_index], pathologies_T[test_index]
    d_train = xgb.DMatrix(X_train, label=y_train, feature_names=feature_names)
    d_test = xgb.DMatrix(X_test, label=y_test, feature_names=feature_names)
    watchlist = [(d_test, 'eval'), (d_train, 'train')]
    print("Training")
    evals_result = {}
    model = xgb.train(params, d_train, num_round, watchlist, evals_result=evals_result)
    print("Trained")
    xgb.plot_importance(model, max_num_features=25)
    from matplotlib import pyplot
    pyplot.show()
    print()
def predict(self, X):
    if self.model is None:
        raise XgbLearnerException("Xgboost model is None")
    dtrain = xgb.DMatrix(X, missing=np.nan)  # np.NaN was removed in NumPy 2.0
    return self.model.predict(dtrain)
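# A minimal end-to-end sketch of this learner pattern (the class and parameter
# names are assumptions pieced together from the fit/predict methods shown in
# this excerpt, not the library's actual code):
import numpy as np
import xgboost as xgb

class XgbLearnerException(Exception):
    pass

class XgbLearner:
    def __init__(self, learner_params, boosting_rounds):
        self.learner_params = learner_params
        self.boosting_rounds = boosting_rounds
        self.model = None  # xgb_model=None means training starts fresh

    def fit(self, X, y):
        dtrain = xgb.DMatrix(X, label=y, missing=np.nan)
        # passing the previous booster via xgb_model continues training
        self.model = xgb.train(self.learner_params, dtrain,
                               self.boosting_rounds, xgb_model=self.model)

    def predict(self, X):
        if self.model is None:
            raise XgbLearnerException("Xgboost model is None")
        return self.model.predict(xgb.DMatrix(X, missing=np.nan))

learner = XgbLearner({"objective": "reg:squarederror", "max_depth": 3}, 10)
X, y = np.random.rand(200, 5), np.random.rand(200)
learner.fit(X, y)              # first 10 rounds
learner.fit(X, y)              # 10 more rounds appended to the same booster
print(learner.predict(X[:3]))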
import xgboost as xgb

# read in data
dtrain = xgb.DMatrix('../../data/data_20170722_01/train_data.txt')
dtest = xgb.DMatrix('../../data/data_20170722_01/test_data.txt')

# specify parameters via map; definitions are the same as in the C++ version
param = {'max_depth': 22, 'eta': 0.1, 'silent': 0, 'objective': 'binary:logistic',
         'min_child_weight': 3, 'gamma': 14}

# specify a validation set to watch performance
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
num_round = 60
bst = xgb.train(param, dtrain, num_round, watchlist)

# this is prediction
preds = bst.predict(dtest)
labels = dtest.get_label()

positive_threshold_list = [0.50, 0.67, 0.80, 0.90, 0.95]
for positive_threshold in positive_threshold_list:
    print('positive_threshold: ' + str(positive_threshold))
    num_correct = sum(1 for i in range(len(preds))
                      if int(preds[i] > positive_threshold) == labels[i])
    num_pred = len(preds)
    num_error = num_pred - num_correct
    print('error=%d/%d=%f' % (num_error, num_pred, num_error / float(num_pred)))
    print('accuracy=%d/%d=%f' % (num_correct, num_pred, num_correct / float(num_pred)))
    num_true_positive = sum(1 for i in range(len(preds))
                            if int(preds[i] > positive_threshold) == labels[i] and labels[i] == 1)
    num_positive_pred = sum(1 for i in range(len(preds)) if preds[i] > positive_threshold)
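    # The counts above stop just short of the usual ratios; a natural completion
    # (an addition, not from the original) is precision and recall at each threshold:
    precision = num_true_positive / float(num_positive_pred) if num_positive_pred else 0.0
    recall = num_true_positive / float(sum(1 for lab in labels if lab == 1))
    print('precision=%f recall=%f' % (precision, recall))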
def get_dmat(self):
    return xgb.DMatrix(self.X, self.y)
def runModel(modelName, num):
    train_data_X, train_data_Y, test_data_X, test_data_Y = getData_old()
    dtrain = xgb.DMatrix(train_data_X, train_data_Y)
    dtest = xgb.DMatrix(test_data_X, test_data_Y)
    # print(dtrain.get_label())
    # print(ca)

    # specify parameters via map
    # params = {'max_depth': 10, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic'}
    watchlist = [(dtrain, 'train'), (dtest, 'test')]
    params = {
        'booster': 'gbtree',
        'objective': 'binary:logistic',
        'gamma': 0.8,            # minimum loss reduction for a further split; larger is more conservative [0:]
        'max_depth': 6,          # depth of the trees [1:]
        'lambda': 100,           # L2 regularization weight
        'subsample': 0.5,        # sample half of the training instances at random (0:1]
        'colsample_bytree': 1,   # column subsample ratio per tree (0:1]
        'min_child_weight': 12,  # minimum sum of instance weight (hessian) in a child
        'silent': 1,
        'eta': 0.1,              # acts as the learning rate
        'seed': 30,
        'nthread': 4,            # CPU threads; tune to your core count
    }

    # number of boosting rounds
    num_round = num
    bst = xgb.train(params, dtrain, num_round, watchlist)

    # dtrain is the training set
    train_preds = bst.predict(dtrain)
    # print("train_preds", train_preds)
    train_predictions = [round(value) for value in train_preds]
    # print("train_predictions", train_predictions)
    y_train = dtrain.get_label()
    # print("y_train", y_train)
    train_accuracy = accuracy_score(y_train, train_predictions)
    # log.info("Train Accuracy: %.2f%%" % (train_accuracy * 100.0))

    # make prediction
    preds = bst.predict(dtest)
    predictions = [round(value) for value in preds]
    # log.info("preds:" + str(preds))
    # log.info("predictions:" + str(predictions))
    y_test = dtest.get_label()
    # log.info("y_test:" + str(y_test))
    test_accuracy = accuracy_score(y_test, predictions)
    # log.info("Test Accuracy: " + str(test_accuracy * 100.0) + "%")

    # save model
    with open(PATH_CURR + '/modelSave/' + modelName + '.pik', 'wb') as f:
        pickle.dump(bst, f, -1)
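# A minimal sketch (not in the original) of reloading the booster pickled by
# runModel() above; PATH_CURR and the model name are the same assumptions:
def loadModel(modelName):
    with open(PATH_CURR + '/modelSave/' + modelName + '.pik', 'rb') as f:
        return pickle.load(f)

# bst = loadModel('my_model')
# preds = bst.predict(xgb.DMatrix(test_data_X))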
def get_test_dmat(self, num_rows):
    rs = np.random.RandomState(432)
    return xgb.DMatrix(self.X[rs.randint(0, self.X.shape[0], size=num_rows), :])
def train(self, data=None, log_demand=True, normalize=False):
    """
    Train.

    Args:
        data: a DataFrame object, default None. If None, 'processed/reg_train.csv'
            will be loaded.
        log_demand: a boolean object, default True. If True, the target demand
            will be transformed by a log operation, e.g. y => log(y+1).
        normalize: a boolean object, default False. If True, the train data
            will be normalized.

    Returns:
        A xgb booster.
    """
    # load data
    logger.info('Loading train data...')
    if data is None:
        data = pd.read_csv('processed/reg_train.csv', dtype=DTYPES)
    logger.info('Setting features for training...')

    # sampling
    logger.info('Sampling...')
    data = data.take(np.random.permutation(len(data))[:int(self._batch_size * len(data))])

    # prepare training set
    logger.info("Preparing training set...")
    if log_demand:
        data.loc[:, 'Demanda_uni_equil'] = np.log(data.Demanda_uni_equil.values + 1)
    x_data = data[FEATURES]
    feature_names = x_data.columns.tolist()
    x_data = x_data.values
    if normalize:
        x_data = x_data / x_data.max(axis=0)
    y_data = data['Demanda_uni_equil'].values
    # free memory
    del data

    # make cross-validation set
    logger.info('Cross-validation...')
    x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=self._cv_size)
    dtrain = xgb.DMatrix(data=x_train, label=y_train, feature_names=feature_names)
    dtest = xgb.DMatrix(data=x_test, label=y_test, feature_names=feature_names)
    # free memory
    del x_data, x_train, x_test, y_train, y_test

    print('parameters: \n', self._xgb_params)
    watchlist = [(dtrain, 'train'), (dtest, 'eval')]
    logger.info('Training...')
    xgb_reg = xgb.train(params=self._xgb_params, dtrain=dtrain, num_boost_round=40,
                        evals=watchlist, early_stopping_rounds=5)
    logger.info('Done with training.')
    return xgb_reg
                              verbose=0, warm_start=False)
logistic.fit(features, labels)
print(cross_val_score(logistic, features, labels, cv=5))  # original used the Python 2 print statement

if __name__ == "__main__":
    # read sample
    sample = pd.read_csv('sampleSubmission.csv')

    # Import Data
    tests = pd.read_csv('test.csv')
    tests = tests.drop('id', axis=1)
    scaler = StandardScaler(copy=False, with_mean=True, with_std=True)
    tests = scaler.fit_transform(tests)
    tests_xgb = xgb.DMatrix(tests)

    # features = pd.read_csv('../input/train.csv')
    features = pd.read_csv('train.csv')
    features = features.drop('id', axis=1)

    # Extract target and encode it to make it manageable by ML algorithms
    labels = features.target.values
    labels = LabelEncoder().fit_transform(labels)
    # print(labels)

    # Remove target from train, else it's too easy ...
    features = features.drop('target', axis=1)
    # features = preprocessing.normalize(features)
    scaler = StandardScaler(copy=False, with_mean=True, with_std=True)
params = {
    'booster': 'gbtree',
    'objective': 'multi:softmax',  # with num_class=2 this mirrors binary classification
    'num_class': 2,
    'gamma': 0.2,
    'max_depth': 15,
    'lambda': 2,
    'subsample': 0.8,
    'colsample_bytree': 0.5,
    'min_child_weight': 2,
    'silent': 1,
    'eta': 0.1,
    'seed': 1000
}
d_train = xgb.DMatrix(x_train, y_train)
num_rounds = 100
model = xgb.train(params, d_train, num_rounds)

dtest = xgb.DMatrix(x_test)
y_pred = model.predict(dtest)
compute_score(y_pred, y_test)

fig, ax = plt.subplots(figsize=(15, 15))
plot_importance(model, height=0.5, ax=ax, max_num_features=64)
plt.show()

"""
3. AdaboostClassifier
"""
print("************ AdaboostClassifier ************")
clf = AdaBoostClassifier(n_estimators=120, learning_rate=0.9)
clf.fit(x_train, y_train)
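# `compute_score` is not defined in this excerpt; a plausible stand-in (an
# assumption, not the author's helper) that reports accuracy for the hard
# class labels multi:softmax returns:
from sklearn.metrics import accuracy_score

def compute_score(y_pred, y_true):
    print("accuracy: {:.4f}".format(accuracy_score(y_true, y_pred)))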
# -*- coding: utf-8 -*-
import xgboost as xgb
import matplotlib.pyplot as plt
import os
import sklearn as sk
import common

work_dir = 'E:/krzys/informatyka-studia/sem-16-2017L/msi2/proj/SmogDetector/SmogDetector.Python/'
os.chdir(work_dir)
f1 = open('./res/res_7_cv.txt', 'w+')

# read in data
dtrain = xgb.DMatrix(work_dir + 'data/' + 'data_reg2_train.txt')
dtest = xgb.DMatrix(work_dir + 'data/' + 'data_reg2_test.txt')

# specify parameters via map
param = [
    ('max_depth', 6),       # depth of tree
    ('booster', 'dart'),
    ('eta', 0.1),           # learning rate, prevents overfitting
    ('silent', 1),          # suppresses messages
    ('gamma', 1.0),         # bigger -> more conservative
    # ('min_child_weight', 1),
    ('objective', 'reg:linear'),
    ('eval_metric', 'rmse'),
    # ('eval_metric', 'merror')  # two metrics
]

watchlist = [(dtrain, 'train'), (dtest, 'eval')]
num_round = 1000
evals_result = {}

f1.write("Regresja_dart-idx,Regresja_dart-precyzja,Regresja_dart-std\n")
for i_max_depth in range(3, 4):
# missing values
data_set.isnull().sum()  # no missing values
# missingno.matrix(data_set)

# distribution of labels
sns.countplot(target)  # balanced labels

# correlation analysis
sns.heatmap(data_set.corr())  # some features are highly correlated

# split dataset into train and test set
ind = np.random.rand(len(data_set)) < 0.8
train_set_origin = data_set[ind]
test_set_origin = data_set[~ind]

# reset index
train_set = train_set_origin.reset_index(drop=True)
test_set = test_set_origin.reset_index(drop=True)

# train label and test label
train_label = target[ind]
test_label = target[~ind]

# Train model
xgb_train = xgb.DMatrix(train_set, label=train_label)
# the test DMatrix must exist before the watchlist references it
# (the original built it after the watchlist, a NameError)
xgb_test = xgb.DMatrix(test_set, label=test_label)
params = {
    "objective": "multi:softmax",
    "eta": 0.1,
    "max_depth": 5,
    "num_class": 3
}
num_round = 50
watchlist = [(xgb_train, 'train'), (xgb_test, 'test')]
xgb_model = xgb.train(params, xgb_train, num_round, watchlist)
        self.param['seed'] = seed
        self.nrounds = params.pop('nrounds', 1000)

    def train(self, x_train, y_train, x_valid=None, y_valid=None, sample_weights=None):
        dtrain = xgb.DMatrix(x_train, label=y_train)
        self.gbdt = xgb.train(self.param, dtrain, self.nrounds)

    # pred_leaf=True => getting leaf indices
    def predict(self, x):
        return self.gbdt.predict(xgb.DMatrix(x), pred_leaf=True).astype(int)


x_train, y_train, x_test = get_data()
dtrain = xgb.DMatrix(x_train, label=y_train)

xg = XgbWrapper(seed=SEED, params=xgb_params)
xg_cat_embedding_train, xg_cat_embedding_test = get_oof(xg, x_train, y_train, x_test)
xg_cat_embedding_ohe_train, xg_cat_embedding_ohe_test = get_sparse_ohe(
    xg_cat_embedding_train, xg_cat_embedding_test)
print("OneHotEncoded XG-Embeddings: {},{}".format(
    xg_cat_embedding_ohe_train.shape, xg_cat_embedding_ohe_test.shape))
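# `get_sparse_ohe` is not shown in this excerpt. A minimal sketch of the idea
# (an assumption about the helper, not its actual code): one-hot encode the
# per-tree leaf indices returned by predict(pred_leaf=True), fitting the
# encoder on train and test together so unseen leaf ids don't break transform.
import numpy as np
from sklearn.preprocessing import OneHotEncoder

def get_sparse_ohe(train_leaves, test_leaves):
    enc = OneHotEncoder(handle_unknown='ignore')
    enc.fit(np.vstack([train_leaves, test_leaves]))
    return enc.transform(train_leaves), enc.transform(test_leaves)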
import xgboost as xgb
from xgboost import XGBRegressor

hit = pd.read_csv("Hitters.csv")
df = hit.copy()
df = df.dropna()
dms = pd.get_dummies(df[['League', 'Division', 'NewLeague']])
y = df["Salary"]
X_ = df.drop(["Salary", "League", "Division", "NewLeague"], axis=1).astype("float64")
X = pd.concat([X_, dms[["League_N", "Division_W", "NewLeague_N"]]], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

DM_train = xgb.DMatrix(data=X_train, label=y_train)
DM_test = xgb.DMatrix(data=X_test, label=y_test)

xgb_model = XGBRegressor().fit(X_train, y_train)
y_pred = xgb_model.predict(X_test)
print(np.sqrt(mean_squared_error(y_test, y_pred)))

xgb_grid = {
    'colsample_bytree': [0.4, 0.5, 0.6, 0.9, 1],
    'n_estimators': [100, 200, 500, 1000],
    'max_depth': [2, 3, 4, 5, 6],
    'learning_rate': [0.1, 0.01, 0.5]
}
# named xgb_reg rather than xgb so the xgboost module import isn't shadowed
xgb_reg = XGBRegressor()
xgb_cv = GridSearchCV(xgb_reg, param_grid=xgb_grid, cv=10, verbose=2)  # original had a typo: versobe
xgb_cv.fit(X_train, y_train)
print(xgb_cv.best_params_)
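# DM_train and DM_test above are built but never used; a sketch (an assumption,
# not from the original) of the native-API training they would support, with
# early stopping against the held-out DMatrix:
bst = xgb.train({'objective': 'reg:squarederror', 'eta': 0.1, 'max_depth': 3},
                DM_train, num_boost_round=1000,
                evals=[(DM_train, 'train'), (DM_test, 'eval')],
                early_stopping_rounds=20, verbose_eval=False)
print(np.sqrt(mean_squared_error(y_test, bst.predict(DM_test))))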
def predict(self, x):
    return self.gbdt.predict(xgb.DMatrix(x), pred_leaf=True).astype(int)
def arlines_test():
    if sys.version.startswith("2"):
        print("native XGBoost tests only supported on python3")
        return
    import xgboost as xgb
    assert H2OXGBoostEstimator.available() is True

    # Artificial data to be used throughout the test, very simple
    raw_data = {'wealthy': [1, 1, 1, 0, 0],
                'ownsTesla': [False, False, False, True, True]}
    train_frame = pd.DataFrame(data=raw_data)
    data = train_frame[['wealthy']].values    # as_matrix() was removed from pandas
    label = train_frame[['ownsTesla']].values

    # Native XGBoost model trained first
    dtrain = xgb.DMatrix(data=data, label=label)
    watchlist = [(dtrain, 'train')]
    param = {'eta': 0.7, 'silent': 1, 'objective': 'binary:logistic', 'booster': 'gbtree',
             'max_depth': 2, 'seed': 1, 'max_delta_step': 0, 'alpha': 0, 'nround': 5}
    bst = xgb.train(params=param, dtrain=dtrain, num_boost_round=2, evals=watchlist)
    native_prediction = bst.predict(data=dtrain)
    print(native_prediction)
    assert len(native_prediction) == 5

    # H2O XGBoost model trained
    frame = h2o.H2OFrame(train_frame)
    # Force factor variables, even if recognized correctly
    frame['ownsTesla'] = frame['ownsTesla'].asfactor()
    frame['wealthy'] = frame['wealthy'].asfactor()
    # The ntrees parameter in H2O corresponds to num_boost_round above
    # (the original comment claimed it maps to max_depth)
    h2o_model = H2OXGBoostEstimator(training_frame=frame, learn_rate=0.7,
                                    booster='gbtree', seed=1, ntrees=2)
    h2o_model.train(x=['ownsTesla'], y='wealthy', training_frame=frame)
    h2o_prediction = h2o_model.predict(frame['ownsTesla'])
    print(h2o_prediction)

    assert len(h2o_prediction['p0']) == 5
    for idx in range(5):
        assert round(h2o_prediction['p0'][idx, 0], 5) == round(native_prediction[idx].item(), 5)
train_X = combined_data.iloc[:train_length, 1:]
train_Y = train_data['SalePrice']
train_Id = combined_data.iloc[:train_length, 0]
test_X = combined_data.iloc[train_length:, 1:]
test_Id = combined_data.iloc[train_length:, 0]

# Price comparison between the original sale price and the log of the sale price
fig, (axis1, axis2) = plt.subplots(1, 2, figsize=(10, 5))
axis1.hist(train_Y)
train_Y = np.log1p(train_Y)
axis2.hist(train_Y)

# format a DMatrix to train xgb
dtrain = xgb.DMatrix(train_X, label=train_Y)

# The error metric: RMSE on the log of the sale prices.
from sklearn.metrics import mean_squared_error
import math

# UDF: a range function that supports decimal steps
def common_num_range(start, stop, step):
    startlen = stoplen = steplen = 0
    if '.' in str(start):
        startlen = len(str(start)) - str(start).index('.') - 1
    if '.' in str(stop):
        stoplen = len(str(stop)) - str(stop).index('.') - 1
    if '.' in str(step):
        steplen = len(str(step)) - str(step).index('.') - 1
with open(root_folder + model_object_folder + features_xgboost_file, "rb") as pickle_features_file:
    features = pickle.load(pickle_features_file)

# load the model from disk (the original comment said "save", but this reads)
with open(root_folder + model_object_folder + xgboost_reference_price_model, "rb") as pickle_output_file:
    xgb_model = pickle.load(pickle_output_file)

features_importance = pd.read_csv(root_folder + model_validation_folder + features_importance_xgboost_csv,
                                  sep=';', decimal=',')
features_importance = features_importance.sort_values(by=['Importance'], ascending=False)

predictions = xgb_model.predict(xgb.DMatrix(X_test, label=y_test))

df_measures_knn = pd.DataFrame(columns=[
    'k', 'n_components', 'mean_dist_25_75', 'median_dist_25_75',
    'total_dist_25_75', 'mean_variance', 'median_variance',
    'total_variance', 'mean_ratio_variance', 'median_ratio_variance',
    'total_ratio_variance', 'mean_ratio_interquartile', 'median_ratio_interquartile',
    'total_ratio_interquartile', 'df_coverage_real_spread', 'df_coverage_predicted_spread',
    'df_dist_prediction_quartiles_to_mean', 'df_dist_real_quartiles_to_mean',
    'mean_dist_q_prediction_to_mean', 'median_dist_q_prediction_to_mean',
    'total_dist_q_prediction_to_mean', 'mean_dist_q_real_to_mean',
    'median_dist_q_real_to_mean', 'total_dist_q_real_to_mean',
    'mean_dist_prediction_to_mean', 'median_dist_prediction_to_mean',
    'total_dist_prediction_to_mean', 'mean_dist_real_to_mean',
    'median_dist_real_to_mean', 'total_dist_real_to_mean',
    'mean_dist_25_75_including_pred', 'median_dist_25_75_including_pred',
    'total_dist_25_75_including_pred', 'mean_increase_including_pred'])
def get_tranformer_score(tranformer):  # sic: 'tranformer' spelling kept, as callers may rely on it
    xrf = tranformer
    dpredict = xgb.DMatrix(X_test)
    prediction = xrf.predict(dpredict, ntree_limit=xrf.best_ntree_limit)
    return mean_squared_error(y_test, prediction)
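# A sketch of how a scorer like this is typically used (a hypothetical
# hyperopt-style objective; X_train, y_train, and the fixed params are
# assumptions, not from the original):
def objective(params):
    booster = xgb.train(params, xgb.DMatrix(X_train, label=y_train),
                        num_boost_round=500,
                        evals=[(xgb.DMatrix(X_test, label=y_test), 'eval')],
                        early_stopping_rounds=50, verbose_eval=False)
    return get_tranformer_score(booster)  # lower MSE is better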