class XGB(BaseModel):
    def __init__(self):
        self.clf = XGBClassifier(
            n_estimators=200,
            max_depth=20,
            learning_rate=0.1,
            random_state=0,
            booster="gbtree",
            use_label_encoder=False,
        )

    def train(self, X_train, Y_train):
        X_train, Y_train = do_rebalance(X_train, Y_train)
        self.clf.fit(X_train, Y_train)

    def test(self, X_test, Y_test):
        Y_prob = self.clf.predict_proba(X_test)
        auc = metrics.roc_auc_score(Y_test, Y_prob[:, 1])
        return auc

    def predict(self, X):
        Y_prob = self.clf.predict_proba(X)
        return Y_prob

    def load_model(self, model_path):
        self.clf.load_model(model_path)
        # with open(model_path, "rb+") as file:
        #     self.clf = pickle.load(file)

    def save_model(self, model_path):
        self.clf.save_model(model_path)
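# A minimal usage sketch for the wrapper above. It assumes BaseModel,
# do_rebalance, and the train/test splits come from the surrounding project
# (none of them are defined in this snippet); the file name is hypothetical.
model = XGB()
model.train(X_train, Y_train)           # rebalance, then fit
auc = model.test(X_test, Y_test)        # held-out ROC AUC
model.save_model("xgb_model.json")      # JSON is a supported save format
restored = XGB()
restored.load_model("xgb_model.json")   # weights round-trip via the file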
def train(file):
    y_data = makeRawDataset(file)    # raw labels
    X = pd.read_csv(file).Question
    X_without = removePunc(X)        # strip punctuation
    X_correct = fuzzy(X_without)     # fuzzy spelling correction
    X_enc = encode(X_correct)
    labels = [">", "<", "<=", ">=", "==", "NULL", "LIKE"]
    encoder = LabelEncoder()
    codes = encoder.fit_transform(labels)
    codeMap = {labels[i]: codes[i] for i in range(len(labels))}
    inverseMap = {codes[i]: labels[i] for i in range(len(labels))}
    maps = {"codeMap": codeMap, "inverseMap": inverseMap}
    np.save("Map.npy", maps)
    y = [codeMap[i] for i in y_data]
    X_train, X_test, y_train, y_test = train_test_split(X_enc, y,
                                                        test_size=0.2,
                                                        random_state=42)
    model = XGBClassifier()
    model.fit(X_train, y_train)
    model.save_model("WhereCond.model")
    y_hat = model.predict(X_test)
    print(y_hat[:10])
    # decode predictions and labels back to operator strings for the report
    y_pred = [inverseMap[i] for i in y_hat]
    y_true = [inverseMap[i] for i in y_test]
    sk_report = classification_report(y_true=y_true, y_pred=y_pred, digits=6)
    print(sk_report)
def tree_optimization(learning_rate, gamma, max_depth, subsample, reg_lambda,
                      num_parallel_tree, min_child_weight):
    global tree_no, best_score
    X_train, X_eval, y_train, y_eval = train_test_split(
        features, labels, test_size=0.2, shuffle=True, stratify=labels)
    X_train, X_test, y_train, y_test = train_test_split(
        X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train)
    booster_params = {
        'n_estimators': 500,
        'learning_rate': learning_rate,
        'gamma': gamma,
        'max_depth': int(np.around(max_depth)),
        'subsample': subsample,
        'sampling_method': 'gradient_based',
        'reg_lambda': reg_lambda,
        'min_child_weight': int(np.around(min_child_weight)),
        'num_parallel_tree': int(np.around(num_parallel_tree)),
        'objective': 'binary:logistic',
        'verbosity': 1,
        'max_delta_step': 1
    }
    print("generating model")
    trip_model = XGBClassifier(**booster_params)
    trip_model.fit(X=X_train, y=y_train, eval_set=[(X_test, y_test)],
                   eval_metric=f1_eval, early_stopping_rounds=25)
    print("Eval Accuracy: %.2f" % (trip_model.score(X_eval, y_eval) * 100), "%")
    preds = trip_model.predict(X_eval)
    current_score = f1_score(y_eval, preds)
    if current_score > best_score:
        print(f"Score increased to {current_score} from {best_score}")
        best_score = current_score
        sub_pred = trip_model.predict(sub)
        trip_model.save_model(
            f'logs/f1_{run_num}_{best_score}_{tree_no}_thresh.model')
        savetxt(f'logs/{run_num}_{tree_no}_preds.txt', sub_pred, delimiter=',')
    booster_params['f1'] = current_score
    booster_params['tree_no'] = tree_no
    record_history(booster_params)
    tree_no += 1
    return current_score
def xgboost(self, x_train, y_train, x_valid, y_valid):
    xgb_model = XGBClassifier(objective='multi:softmax', verbosity=2)
    eval_set = [(x_valid, y_valid)]
    xgb_model.fit(x_train, y_train, eval_set=eval_set, eval_metric='merror',
                  verbose=True, early_stopping_rounds=10)
    plot_importance(xgb_model)
    pyplot.show()
    xgb_model.save_model('./modfile/tf_idf_XGBoost.model')
    return xgb_model, x_valid, y_valid
def create_tree():
    X_train, X_eval, y_train, y_eval = train_test_split(
        features, labels, test_size=0.2, shuffle=True, stratify=labels,
        random_state=RANDOM_SEED)
    X_train, X_test, y_train, y_test = train_test_split(
        X_train, y_train, test_size=0.2, shuffle=True, stratify=y_train,
        random_state=RANDOM_SEED)
    booster_params = {
        'n_estimators': 500,
        'objective': 'binary:logistic',
        'verbosity': 1
    }
    print("generating model")
    trip_model = XGBClassifier(**booster_params)
    trip_model.fit(X=X_train, y=y_train, eval_set=[(X_test, y_test)],
                   eval_metric=f1_eval, early_stopping_rounds=50)
    # refit with the early-stopped iteration count plus a 30-round margin
    iter_num = trip_model.best_iteration + 30
    booster_params = {
        'n_estimators': iter_num,
        'objective': 'binary:logistic',
        'verbosity': 1
    }
    trip_model = XGBClassifier(**booster_params)
    trip_model.fit(X=X_train, y=y_train, eval_set=[(X_test, y_test)],
                   eval_metric=f1_eval)
    print("Eval Accuracy: %.2f" % (trip_model.score(X_eval, y_eval) * 100), "%")
    preds = trip_model.predict(X_eval)
    current_score = f1_score(y_eval, preds)
    trip_model.save_model(f'logs/f1_{current_score}_thresh.model')
    sub_pred = trip_model.predict(sub)
    savetxt(f'logs/{current_score}_preds.txt', sub_pred, delimiter=',')
    return trip_model
def train_xgboost(self):
    """
    Train an XGBoost classifier on the dataset and save model as artifact
    """
    from xgboost import XGBClassifier
    xgb = XGBClassifier()
    xgb.fit(self.X, self.y)
    xgb.save_model(script_path("model.bst"))
    with open(script_path("model.bst"), "rb") as fh:
        self.buffered_xgb_model = fh.read()
    self.next(self.join)
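# Sketch of restoring a classifier from the buffered bytes in a later step.
# This is an assumption about the surrounding flow (the join step is not
# shown); writing the bytes back to a temporary file keeps us on the
# documented load_model(path) API.
import tempfile
from xgboost import XGBClassifier

def restore_from_buffer(buffered_xgb_model):
    with tempfile.NamedTemporaryFile(suffix=".bst") as tmp:
        tmp.write(buffered_xgb_model)
        tmp.flush()
        clf = XGBClassifier()
        clf.load_model(tmp.name)
    return clf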
def scikitAPI(trainFile):
    X, Y, X_train, X_test, Y_train, Y_test = inputData(trainFile)
    '''
    Y = transLabel(Y)[1]
    Y_train = transLabel(Y_train)[1]
    Y_test = transLabel(Y_test)[1]
    '''
    start = time.time()
    print("class: {0}".format(np.unique(Y_train)))
    print("train data shape: %r, train target shape: %r" %
          (X_train.shape, Y_train.shape))
    print("test data shape: %r, test target shape: %r" %
          (X_test.shape, Y_test.shape))
    # params = {"n_estimators": 300, "num_leaves": 128, "learning_rate": 0.1}
    params = {"n_estimators": 50, "learning_rate": 0.1}
    print("{:-<50}".format(""))
    print("params", params)
    # clf_test = LGBMClassifier(n_estimators=200)
    # clf_test = XGBClassifier(**params)
    # clf_test.fit(X_train, Y_train)
    # print('Training time: {0}'.format(time.time() - start))
    # print("clf class: ", clf_test.classes_)
    # pred = clf_test.predict(X_test)
    # print(accuracy_score(pred, y_test))
    # print("Training Acc: ", clf_test.score(X_train, Y_train), np.shape(X_train))
    # print("Test Acc: ", clf_test.score(X_test, Y_test), np.shape(X_test))
    # print("Total Acc: ", clf_test.score(X, Y), np.shape(X))
    xgtrain = xgb.DMatrix(X, Y)  # built but unused below
    clf = XGBClassifier(**params).fit(X, Y)
    model_path = 'xgb/model.joblib'
    # joblib.dump(clf, model_path, compress=1)
    # clf.save_model("lgbm_model.ml")
    model_path = 'xgb/model.bst'
    clf.save_model(model_path)
    # clf.dump_model('xgb/dump.raw.txt', 'xgb/featmap.txt')
    # print("ACC: ", load_model(model_path).score(X_test, Y_test))
    print("model saved to {}".format(model_path))
    print("model size: {:.3f} KB".format(os.path.getsize(model_path) / 1024))
    clf = load_model(model_path, Y)
    print(clf.predict([np.arange(51)]))
def train_lazy():
    # Load the dataset
    X, y = load_data()
    # Split the data
    X_train, X_val, y_train, y_val = split_dataset(X, y)
    # Normalize
    X_train = normalize(X_train)
    X_val = normalize(X_val)

    # uncomment to check the performance of the 25 models
    # clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
    # scores, _ = clf.fit(X_train, X_val, y_train, y_val)
    # print(scores)

    # Final model: reuse a saved model if one exists
    if os.path.isfile(config.MODEL_PATH):
        model = XGBClassifier()
        model.load_model(config.MODEL_PATH)
    else:
        model = XGBClassifier()
        model.fit(X_train, y_train, eval_metric="error",
                  eval_set=[(X_train, y_train), (X_val, y_val)], verbose=True)
        # save model
        model.save_model(config.MODEL_PATH)

    # evaluate predictions on the train set
    y_pred = model.predict(X_train)
    print_performance(y_train, y_pred, 'train')

    # evaluate predictions on the val set
    y_pred = model.predict(X_val)
    print_performance(y_val, y_pred, 'val')

    # Load and normalize the test dataset
    X_test, y_test = load_test_data()
    X_test = normalize(X_test)
    y_pred = model.predict(X_test)
    print_performance(y_test, y_pred, 'test')

    # plot
    plot_performance(model)
def build():
    data = load_kickstarter(DATA_PATH)
    encoder, data = preprocess(data)
    x_train, x_test, y_train, y_test = train_test_split(
        data.drop(['state'], axis=1), data.state, test_size=0.2,
        random_state=SEED)
    clf = XGBClassifier(n_jobs=6, random_state=SEED)
    clf.fit(x_train, y_train, verbose=True)
    dump(encoder, 'onehot-150k.joblib')
    clf.save_model('xgb-150k-v1.model')
def train_model(self):
    """Trains the model

    Returns
    -------
    testing and training f1 scores
    """
    df = Crunchbase().format_crunchbase()
    subset = df[['type_Group B', 'money_raised_at_ipo',
                 'number_of_acquisitions', 'valuation_at_ipo', 'employee_cat',
                 'number_of_lead_investors', 'number_of_lead_investments',
                 'number_of_investments', 'type_Group A', 'industries_type_0',
                 'number_of_employee_profiles', 'number_of_events',
                 'number_of_investors', 'total_products_active',
                 'type_For Profit', 'ipo_status']]
    x = subset[subset.columns.drop('ipo_status')]
    y = subset[['ipo_status']]
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2,
                                                        random_state=2)
    # scale the data and save the fitted scaler
    scaler = StandardScaler()
    x_train_scaled = scaler.fit_transform(x_train)
    pickle.dump(scaler, open('./Model/scaler.pkl', 'wb'))
    x_test_scaled = scaler.transform(x_test)
    # oversample the minority class to balance the training set
    smote = SMOTE(random_state=42, sampling_strategy=0.5)
    x_train_scaled_smote, y_train_smote = smote.fit_resample(x_train_scaled,
                                                             y_train)
    xgb = XGBClassifier()
    xgb.fit(x_train_scaled_smote, y_train_smote)
    # save the trained model
    xgb.save_model('./Model/final_model.json')
    y_train_pred_xgb = xgb.predict(x_train_scaled_smote)
    y_test_pred_xgb = xgb.predict(x_test_scaled)
    training_f1_score = f1_score(y_train_smote, y_train_pred_xgb)
    testing_f1_score = f1_score(y_test, y_test_pred_xgb)
    return training_f1_score, testing_f1_score
def train_model():
    x, y = load_data()
    model = create_dl_model()
    model.fit(x, y, epochs=20, batch_size=32, verbose=0,
              callbacks=[keras.callbacks.EarlyStopping(monitor='loss',
                                                       patience=3)])
    # use the activations of the third layer as features for the booster
    layermodel_1 = keras.models.Model(inputs=model.input,
                                      outputs=model.layers[2].output)
    x = layermodel_1.predict(x)
    xgb = XGBClassifier(n_estimators=110, learning_rate=0.12)
    xgb.fit(x, y.ravel())
    layermodel_1.save("../files/DNN.h5")
    xgb.save_model("../files/xgboost.model")
def train(X_train, y_train, X_test, y_test, i):
    if i == 0:
        name = 'citi'
    elif i == 1:
        name = 'jpm'
    else:
        name = 'boa'
    # weight positives by the class-imbalance ratio
    weights = (y_train == 0).sum() / (1.0 * (y_train == 1).sum())
    model = XGBClassifier(tree_method='gpu_hist', objective='binary:logistic',
                          n_estimators=300, scale_pos_weight=weights, n_jobs=6)
    model.fit(X_train, y_train, eval_metric=['auc'])
    y_pred = model.predict(X_test)
    get_results(y_pred, y_test)
    model.save_model(f'{name}.model')
    return model
def xgbmodel():
    data1 = pd.read_csv('/home/msmal/object_type/X_enc_object.csv', header=0)
    print("data1 read over!")
    data2 = pd.read_csv('/home/msmal/float_type/train.csv', header=0)
    # replace infinities and NaNs with sentinel values
    data2[np.isinf(data2)] = -1
    data2[np.isnan(data2)] = -2
    print("data2 read over!")
    X = pd.concat([data1.iloc[:, 1:], data2], axis=1)
    y = pd.read_csv('/home/msmal/label.csv', header=0)
    scaler = preprocessing.StandardScaler()
    X_scaled = scaler.fit_transform(X)
    encoder = LabelEncoder()
    y = encoder.fit_transform(y.values.ravel())
    X_train, X_test, Y_train, Y_test = train_test_split(X_scaled, y,
                                                        test_size=0.005,
                                                        random_state=49)
    # xgb = XGBClassifier(n_jobs=-1)
    xgb = XGBClassifier(n_jobs=-1, n_estimators=500, learning_rate=0.05,
                        subsample=0.8)
    xgb.fit(X_train, Y_train)
    xgb.save_model('xgb1.model')
    print("model saved!")
    xgbpre = xgb.predict(X_test)
    xgb_report = metrics.classification_report(Y_test, xgbpre)
    print(xgb_report)
thresholds = np.sort(model.feature_importances_)
print(thresholds)
for thresh in thresholds:
    selection = SelectFromModel(model, threshold=thresh, prefit=True)
    parameter = {
        'n_estimators': [100, 200, 400],
        'learning_rate': [0.03, 0.05, 0.07, 0.1],
        'colsample_bytree': [0.6, 0.7, 0.8, 0.9],
        'colsample_bylevel': [0.6, 0.7, 0.8, 0.9],
        'max_depth': [4, 5, 6]
    }
    search = GridSearchCV(XGBRegressor(), parameter, cv=5, n_jobs=-1)
    select_x_train = selection.transform(x_train)
    search.fit(select_x_train, y_train)
    select_x_test = selection.transform(x_test)
    x_pred = search.predict(select_x_test)
    score = r2_score(y_test, x_pred)
    print('R2 is', score)
    print("Thresh=%.3f, n=%d, R2: %.2f%%" % (thresh, select_x_train.shape[1],
                                             score * 100.0))
    model.save_model('./model/xgb_save/m34sfm_cancer/cancer.xgb'
                     + str(select_x_train.shape[1]) + '.model')
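# A hedged sketch of persisting and reusing the tuned booster for one
# threshold, so the saved file matches the selected feature set; `x_new`
# is hypothetical new data with the original feature layout:
best = search.best_estimator_              # XGBRegressor from the grid search
best.save_model('./model/xgb_save/m34sfm_cancer/cancer_best.xgb'
                + str(select_x_train.shape[1]) + '.model')
x_new_sel = selection.transform(x_new)     # same feature mask as at train time
print(best.predict(x_new_sel))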
def train_xgboost(data: IrisData):
    clf = XGBClassifier()
    clf.fit(data.X, data.y)
    model_path = os.path.join(XGBoostFolder, "model.json")
    clf.save_model(model_path)
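# A matching load step, a minimal sketch assuming the same feature layout at
# inference time; load_model(path) restores the fitted booster:
def load_xgboost():
    clf = XGBClassifier()
    clf.load_model(os.path.join(XGBoostFolder, "model.json"))
    return clf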
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer


def get_data():
    url = 'simple_data.csv'
    return pd.read_csv(url)


columnTransformer = ColumnTransformer([('encoder', OneHotEncoder(), [1])],
                                      remainder='passthrough')
df = get_data()
# keep only the first listed position, e.g. "C,F" -> "C"
df['position'] = df['position'].str.extract(r'(\w+),?')
X = df.drop(columns=['all_star'])
X = columnTransformer.fit_transform(X)
y = df.all_star
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=99)
model = XGBClassifier(objective='binary:logistic', use_label_encoder=False)
model.fit(X_train, y_train)
model.save_model('xgb.model')
predictions = model.predict(X_test)
print("XGBoost Training Accuracy")
print(f'Accuracy: {round(accuracy_score(y_test, predictions) * 100, 3)}%')
print(classification_report(y_test, predictions))
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8,
                                                    random_state=77,
                                                    shuffle=True)

# 2. Model
model = XGBClassifier(n_estimators=1000, learning_rate=0.1)
# model = XGBRegressor(learning_rate=0.01)

# 3. Train
model.fit(x_train, y_train,
          verbose=1,              # show all output (0/1 or False/True)
          eval_metric='logloss',  # same idea as Keras metrics
          eval_set=[(x_train, y_train), (x_test, y_test)])

# 4. Evaluate, predict
result = model.evals_result()
# print("eval's results : ", result)

acc = model.score(x_test, y_test)
print('acc:', acc)

y_pred = model.predict(x_test)
print('final accuracy:', accuracy_score(y_test, y_pred))

model.save_model('./save/xgb_save/cancer2.xgb.model')
print('save complete')
selection_x_train = selection.transform(x_train)
selection_x_test = selection.transform(x_test)
print(selection_x_train.shape)
selection_model = XGBClassifier(n_estimators=1000, max_depth=4,
                                learning_rate=0.5, n_jobs=-1)
selection_model.fit(selection_x_train, y_train, verbose=False,
                    eval_metric=["merror", "mlogloss"],
                    eval_set=[(selection_x_train, y_train),
                              (selection_x_test, y_test)],
                    early_stopping_rounds=100)
y_pred = selection_model.predict(selection_x_test)
# results = selection_model.evals_result()
# print("evals_result : \n", results)
score = accuracy_score(y_test, y_pred)
print("Thresh=%.3f, n=%d, acc: %.2f%%" % (thresh, selection_x_train.shape[1],
                                          score * 100.0))
# (120, 1)
# Thresh=0.621, n=1, acc: 96.67%
model.save_model("./model/xgb_save/iris_acc_96.67_model")
# Without loss visualization:
# model = XGBClassifier()
# model.fit(X_train, y_train)

## Visualize loss on the held-out test set
model = XGBClassifier()
eval_set = [(X_test, y_test)]
model.fit(X_train, y_train, early_stopping_rounds=100, eval_metric="logloss",
          eval_set=eval_set, verbose=True)  # set verbose=True to print the loss
model.save_model("00001.model")

# refit on the full data for the importance plot; note this includes the test rows
model.fit(X, Y)
plot_importance(model)
pyplot.show()

y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))  # Accuracy: 77.56%
test_auc2 = roc_auc_score(y_test, y_pred)  # AUC on the validation set
print("xgb_multiclass_auc:", test_auc2)
X = df_2018.drop("job_title", axis=1)
y = df_2018["job_title"]

# Splitting data into training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.80,
                                                    test_size=0.20)

# save out the split training data to use with Cloud AutoML
with open("train_data_2018.csv", "w+") as file:
    pd.concat([X_train, y_train], axis=1).to_csv(file, index=False)

with open("test_data_2018.csv", "w+") as file:
    pd.concat([X_test, y_test], axis=1).to_csv(file, index=False)

# encode all features using ordinal encoding
encoder_x = ce.OrdinalEncoder()
X_encoded = encoder_x.fit_transform(X)

# you'll need to use a different encoder for each dataframe
encoder_y = ce.OrdinalEncoder()
y_encoded = encoder_y.fit_transform(y)

# split encoded dataset
X_train_encoded, X_test_encoded, y_train_encoded, y_test_encoded = \
    train_test_split(X_encoded, y_encoded, train_size=0.80, test_size=0.20)

from xgboost import XGBClassifier

# train XGBoost model with default parameters
my_model = XGBClassifier()
my_model.fit(X_train_encoded, y_train_encoded, verbose=False)

# and save our model
my_model.save_model("xgboost_baseline.model")
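# The saved file holds only the booster, not the ordinal encoders, so new data
# must pass through the same fitted encoder. A minimal sketch, assuming
# `new_df` is hypothetical data with the 2018 feature columns:
restored = XGBClassifier()
restored.load_model("xgboost_baseline.model")
new_encoded = encoder_x.transform(new_df)   # reuse the fitted encoder
pred_codes = restored.predict(new_encoded)  # predictions in encoded label space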
for thres in threshold:
    selection = SelectFromModel(model, threshold=thres, prefit=True)
    select_x_train = selection.transform(x_train)
    select_x_test = selection.transform(x_test)
    selection_model = LGBMClassifier(n_estimators=100, learning_rate=0.05,
                                     n_jobs=-1)
    selection_model.fit(select_x_train, y_train, verbose=False,
                        eval_metric=['logloss', 'error'],
                        eval_set=[(select_x_train, y_train),
                                  (select_x_test, y_test)],
                        early_stopping_rounds=20)
    y_pred = selection_model.predict(select_x_test)
    acc = accuracy_score(y_test, y_pred)
    print("Thresh=%.3f, n = %d, ACC : %.2f%%" % (thres,
                                                 select_x_train.shape[1],
                                                 acc * 100.0))
    # result = selection_model.evals_result()
    # print("eval's result : ", result)
    # record the threshold and accuracy of this iteration in the filename
    model.save_model("./model/sample/cancer/cancer_thresh=%.3f-acc=%.2f.model"
                     % (thres, acc))
class XGBoost(BaseAlgorithm):
    def __init__(self, algorithm_settings, problem_type):
        super().__init__(algorithm_settings)
        self.problem_type = problem_type

    def build(self):
        if self.problem_type == SupervisedTask.regression:
            self.build_regression_model()
        elif self.problem_type == SupervisedTask.classification:
            self.build_classification_model()
        else:
            raise TypeError('Unknown problem_type')

    def build_regression_model(self):
        from xgboost import XGBRegressor
        self.model = XGBRegressor(
            max_depth=self.algorithm_settings.max_depth,
            learning_rate=self.algorithm_settings.learning_rate,
            n_estimators=self.algorithm_settings.n_estimators,
            objective=self.algorithm_settings.objective,
            booster=self.algorithm_settings.booster,
            n_jobs=self.algorithm_settings.n_jobs,
            gamma=self.algorithm_settings.gamma,
            min_child_weight=self.algorithm_settings.min_child_weight,
            max_delta_step=self.algorithm_settings.max_delta_step,
            subsample=self.algorithm_settings.subsample,
            reg_alpha=self.algorithm_settings.reg_alpha,
            reg_lambda=self.algorithm_settings.reg_lambda,
            random_state=self.algorithm_settings.random_state)

    def build_classification_model(self):
        from xgboost import XGBClassifier
        self.model = XGBClassifier(
            max_depth=self.algorithm_settings.max_depth,
            learning_rate=self.algorithm_settings.learning_rate,
            n_estimators=self.algorithm_settings.n_estimators,
            objective=self.algorithm_settings.objective,
            booster=self.algorithm_settings.booster,
            n_jobs=self.algorithm_settings.n_jobs,
            gamma=self.algorithm_settings.gamma,
            min_child_weight=self.algorithm_settings.min_child_weight,
            max_delta_step=self.algorithm_settings.max_delta_step,
            subsample=self.algorithm_settings.subsample,
            reg_alpha=self.algorithm_settings.reg_alpha,
            reg_lambda=self.algorithm_settings.reg_lambda,
            random_state=self.algorithm_settings.random_state)

    def train(self, train_x, train_y, settings):
        self.model.fit(train_x, train_y,
                       eval_metric=self.algorithm_settings.eval_metric)
        self.save(settings)

    def evaluate(self, test_x):
        prediction = self.model.predict(test_x)
        prediction = prediction.reshape(-1, 1)
        return prediction

    def load(self, model_path):
        self.model.load_model(fname=model_path)

    def save(self, settings):
        model_save_dir = os.path.join(settings.models_path, 'xgboost_models')
        os.makedirs(model_save_dir, exist_ok=True)
        model_name = self.get_model_name(settings)
        save_path = os.path.join(model_save_dir, model_name)
        self.model.save_model(fname=save_path)
        print(f"Model saved to: {save_path}")

    def get_model_name(self, settings):
        if settings.problem_type == SupervisedTask.regression:
            return 'regression_model.xgb'
        else:
            return 'classification_model.xgb'
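# A usage sketch for the wrapper above. SupervisedTask and the two settings
# objects are project-specific; the SimpleNamespace stand-ins below are
# assumptions that only carry the attributes the class actually reads.
from types import SimpleNamespace

algo = SimpleNamespace(max_depth=6, learning_rate=0.3, n_estimators=100,
                       objective='binary:logistic', booster='gbtree', n_jobs=4,
                       gamma=0, min_child_weight=1, max_delta_step=0,
                       subsample=1.0, reg_alpha=0, reg_lambda=1,
                       random_state=0, eval_metric='logloss')
settings = SimpleNamespace(models_path='models',
                           problem_type=SupervisedTask.classification)
clf = XGBoost(algo, SupervisedTask.classification)
clf.build()
clf.train(train_x, train_y, settings)  # fits, then saves under models/xgboost_models
preds = clf.evaluate(test_x)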
def train_and_generate_model():
    #global log_fd
    global log_fd_opt
    global tr_input_arr
    global tr_angle_arr
    global val_input_arr
    global val_angle_arr
    data_len = len(exchange_rates)
    log_fd_tr = open("./train_progress_log_" +
                     dt.now().strftime("%Y-%m-%d_%H-%M-%S") + ".txt", mode="w")

    # inner logger function for backtest
    def logfile_writeln_tr(log_str):
        nonlocal log_fd_tr
        log_fd_tr.write(log_str + "\n")
        log_fd_tr.flush()

    print("data size of rates: " + str(data_len))
    print("num of rate datas for train: " +
          str(COMPETITION_TRAIN_DATA_NUM_AT_RATE_ARR))
    print("input feature sets for train: " + str(COMPETITION_TRAIN_DATA_NUM))
    logfile_writeln_tr("data size of rates: " + str(data_len))
    logfile_writeln_tr("num of rate datas for train: " +
                       str(COMPETITION_TRAIN_DATA_NUM_AT_RATE_ARR))
    tr_input_mat = []
    tr_angle_mat = []
    is_loaded_input_mat = False
    if os.path.exists("./tr_input_mat.pickle"):
        with open('./tr_input_mat.pickle', 'rb') as f:
            tr_input_mat = pickle.load(f)
        with open('./tr_angle_mat.pickle', 'rb') as f:
            tr_angle_mat = pickle.load(f)
        is_loaded_input_mat = True
    else:
        for i in range(DATA_HEAD_ASOBI,
                       len(exchange_rates) - DATA_HEAD_ASOBI - OUTPUT_LEN,
                       SLIDE_IDX_NUM_AT_GEN_INPUTS_AND_COLLECT_LABELS):
            tr_input_mat.append([
                exchange_rates[i],
                (exchange_rates[i] - exchange_rates[i - 1]) / exchange_rates[i - 1],
                get_rsi(exchange_rates, i),
                get_ma(exchange_rates, i),
                get_ma_kairi(exchange_rates, i),
                get_bb_1(exchange_rates, i),
                get_bb_2(exchange_rates, i),
                get_ema(exchange_rates, i),
                get_ema_rsi(exchange_rates, i),
                get_cci(exchange_rates, i),
                get_mo(exchange_rates, i),
                get_lw(exchange_rates, i),
                get_ss(exchange_rates, i),
                get_dmi(exchange_rates, i),
                get_vorarity(exchange_rates, i),
                get_macd(exchange_rates, i),
                str(judge_chart_type(exchange_rates[i - CHART_TYPE_JDG_LEN:i]))
            ])
            tr_input_mat.append([
                reverse_exchange_rates[i],
                (reverse_exchange_rates[i] - reverse_exchange_rates[i - 1]) /
                reverse_exchange_rates[i - 1],
                get_rsi(reverse_exchange_rates, i),
                get_ma(reverse_exchange_rates, i),
                get_ma_kairi(reverse_exchange_rates, i),
                get_bb_1(reverse_exchange_rates, i),
                get_bb_2(reverse_exchange_rates, i),
                get_ema(reverse_exchange_rates, i),
                get_ema_rsi(reverse_exchange_rates, i),
                get_cci(reverse_exchange_rates, i),
                get_mo(reverse_exchange_rates, i),
                get_lw(reverse_exchange_rates, i),
                get_ss(reverse_exchange_rates, i),
                get_dmi(reverse_exchange_rates, i),
                get_vorarity(reverse_exchange_rates, i),
                get_macd(reverse_exchange_rates, i),
                str(judge_chart_type(
                    reverse_exchange_rates[i - CHART_TYPE_JDG_LEN:i]))
            ])
            # label 1 if the rate rose over the horizon, else 0
            tmp = exchange_rates[i + OUTPUT_LEN] - exchange_rates[i]
            if tmp >= 0:
                tr_angle_mat.append(1)
            else:
                tr_angle_mat.append(0)
            tmp = reverse_exchange_rates[i + OUTPUT_LEN] - \
                reverse_exchange_rates[i]
            if tmp >= 0:
                tr_angle_mat.append(1)
            else:
                tr_angle_mat.append(0)
    if not is_loaded_input_mat:
        with open('tr_input_mat.pickle', 'wb') as f:
            pickle.dump(tr_input_mat, f)
        with open('tr_angle_mat.pickle', 'wb') as f:
            pickle.dump(tr_angle_mat, f)

    # log output for tensorboard
    # configure("logs/xgboost_trade_cpu_1")
    tr_input_arr = np.array(tr_input_mat[0:COMPETITION_TRAIN_DATA_NUM])
    tr_angle_arr = np.array(tr_angle_mat[0:COMPETITION_TRAIN_DATA_NUM])
    watchlist = None
    split_idx = COMPETITION_TRAIN_DATA_NUM + int(
        (len(tr_input_mat) - COMPETITION_TRAIN_DATA_NUM) * VALIDATION_DATA_RATIO)
    if VALIDATION_DATA_RATIO != 0.0:
        val_input_arr = np.array(
            tr_input_mat[COMPETITION_TRAIN_DATA_NUM:split_idx])
        val_angle_arr = np.array(
            tr_angle_mat[COMPETITION_TRAIN_DATA_NUM:split_idx])
        watchlist = [(tr_input_arr, tr_angle_arr),
                     (val_input_arr, val_angle_arr)]
    else:
        watchlist = [(tr_input_arr, tr_angle_arr)]
    start = time.time()
    if is_param_tune_with_optuna:
        log_fd_opt = open("./tune_progress_log_" +
                          dt.now().strftime("%Y-%m-%d_%H-%M-%S") + ".txt",
                          mode="w")
        study = None
        if is_use_db_at_tune:
            study = optuna.Study(study_name='fxsystrade',
                                 storage='sqlite:///../fxsystrade.db')
        else:
            study = optuna.create_study()
        parallel_num = RAPTOP_THREAD_NUM * 2
        if is_colab_cpu or is_exec_at_mba:
            parallel_num = COLAB_CPU_AND_MBA_THREAD_NUM * 2
        if special_optuna_parallel_num != -1:
            parallel_num = special_optuna_parallel_num
        study.optimize(opt, n_trials=OPTUNA_TRIAL_NUM, n_jobs=parallel_num)
        process_time = time.time() - start
        logfile_writeln_opt("best_params: " + str(study.best_params))
        logfile_writeln_opt("best_value: " + str(study.best_value))
        logfile_writeln_opt("best_trial: " + str(study.best_trial))
        logfile_writeln_opt("execution time of tuning: " + str(process_time))
        log_fd_opt.flush()
        log_fd_opt.close()
        exit()
    param = {}
    n_thread = RAPTOP_THREAD_NUM
    if is_use_gpu:
        param['tree_method'] = 'gpu_hist'
        param['max_bin'] = 16
        param['gpu_id'] = 0
        n_thread = COLAB_CPU_AND_MBA_THREAD_NUM
    if is_colab_cpu or is_exec_at_mba:
        n_thread = COLAB_CPU_AND_MBA_THREAD_NUM
    logfile_writeln_tr("training parameters are below...")
    logfile_writeln_tr(str(param))
    eval_result_dic = {}
    logfile_writeln_tr("num_round: " + str(NUM_ROUND))
    clf = XGBClassifier(max_depth=MAX_DEPTH,
                        random_state=42,
                        n_estimators=NUM_ROUND,
                        min_child_weight=18,
                        subsample=0.9,
                        colsample_bytree=0.6,
                        eta=ETA,
                        objective='binary:logistic',
                        verbosity=0,
                        n_jobs=n_thread,  # thread count
                        **param)
    verbosity = True
    if is_use_gpu or is_colab_cpu:
        verbosity = False
    clf.fit(tr_input_arr, tr_angle_arr, eval_set=watchlist, verbose=verbosity)
    process_time = time.time() - start
    logfile_writeln_tr("execution time of training: " + str(process_time))
    clf.save_model('./xgb.model')
    booster = clf.get_booster()
    booster.dump_model('./xgb_model.raw.txt')
    eval_result_dic = clf.evals_result()
    for ii in range(len(eval_result_dic['validation_0']['error'])):
        if VALIDATION_DATA_RATIO != 0.0:
            logfile_writeln_tr(
                str(ii) + "," +
                str(eval_result_dic['validation_0']['error'][ii]) + "," +
                str(eval_result_dic['validation_1']['error'][ii]))
        else:
            logfile_writeln_tr(
                str(ii) + "," +
                str(eval_result_dic['validation_0']['error'][ii]))
    # Feature Importance
    fti = clf.feature_importances_
    logfile_writeln_tr('Feature Importances:')
    for i, feat in enumerate(FEATURE_NAMES):
        logfile_writeln_tr('\t{0:20s} : {1:>.6f}'.format(feat, fti[i]))
    log_fd_tr.flush()
    log_fd_tr.close()
    print("finished training and saved model.")
print("최종 정답률 : ", r2_score(y_test, y_predict)) score = accuracy_score(y_test, y_predict) model = model.best_estimator_ thresholds = np.sort(model.feature_importances_) print(thresholds) n = 0 score = 0 for thresh in thresholds: selection = SelectFromModel(model, threshold=thresh, prefit=True) select_x_train = selection.transform(x_train) selection_model = XGBClassifier(n_jobs=-1) selection_model.fit(select_x_train, y_train) select_x_test = selection.transform(x_test) y_predict = selection_model.predict(select_x_test) acc = accuracy_score(y_test, y_predict) if acc * 100.0 > score: n = select_x_train.shape[1] score = acc * 100.0 L_selection = selection selection_model.save_model("./save/xgb_save/ml37_3_cancer.xgb.model") print("Thresh=%.3f, n=%d, acc: %.2f%%" % (thresh, select_x_train.shape[1], score * 100.0))
def tree_optimization(self, learning_rate, gamma, max_depth, subsample,
                      reg_lambda, num_parallel_tree, min_child_weight):
    x_train, x_eval, y_train, y_eval = train_test_split(
        self.features, self.labels, test_size=0.2, shuffle=True,
        stratify=self.labels, random_state=self.seed)
    x_train, x_test, y_train, y_test = train_test_split(
        x_train, y_train, test_size=0.2, shuffle=True, stratify=y_train,
        random_state=self.seed)
    booster_params = {
        'n_estimators': 500,
        'learning_rate': learning_rate,
        'gamma': gamma,
        'max_depth': int(np.around(max_depth)),
        'subsample': subsample,
        'sampling_method': 'gradient_based',
        'reg_lambda': reg_lambda,
        'min_child_weight': int(np.around(min_child_weight)),
        'num_parallel_tree': int(np.around(num_parallel_tree)),
        'objective': 'binary:logistic',
        'verbosity': 1,
        'max_delta_step': 1
    }
    print("generating model")
    model = XGBClassifier(**booster_params)
    model.fit(X=x_train, y=y_train, eval_set=[(x_test, y_test)],
              eval_metric=f1_eval, early_stopping_rounds=25)
    preds = model.predict(x_eval)
    current_score = f1_score(y_eval, preds)
    print(f"eval F1 score: {current_score}")
    if current_score > self.best_score:
        print(f"Score increased to {current_score} from {self.best_score}")
        self.best_score = current_score
        sub_pred = model.predict(self.pred)
        model.save_model(
            f'logs/f1_{self.run_num}_{self.best_score}_{self.tree_no}_'
            f'{self.model_type}_thresh.model')
        savetxt(
            f'logs/{self.run_num}_{self.tree_no}_{self.model_type}_preds.txt',
            sub_pred, delimiter=',')
    booster_params['f1'] = current_score
    booster_params['tree_no'] = self.tree_no
    record_history(booster_params)
    self.tree_no += 1
    return current_score
x_train, x_valid, y_train, y_valid = train_test_split(x, y,
                                                      test_size=0.2,
                                                      random_state=24)
# convert DataFrames to numpy arrays
x_train = x_train.to_numpy()
y_train = y_train.to_numpy()
x_valid = x_valid.to_numpy()
y_valid = y_valid.to_numpy()
clf = XGBClassifier(booster='gbtree',
                    nthread=4,
                    learning_rate=0.1,
                    min_child_weight=1,
                    max_depth=5,
                    subsample=0.8,
                    colsample_bytree=0.8,
                    scale_pos_weight=1,
                    objective='binary:logistic',
                    seed=24)
eval_set = [(x_valid, y_valid)]
clf.fit(x_train, y_train, early_stopping_rounds=100, eval_metric='logloss',
        eval_set=eval_set, verbose=True)
clf.save_model("./model/xgboost.model")
pre = clf.predict(x_valid)
pro = clf.predict_proba(x_valid)[:, 1]
print("Accuracy: %f" % accuracy_score(y_valid, pre))
print("AUC Score: %f" % roc_auc_score(y_valid, pro))
def train_xgboost(data: IrisData, artifacts_folder: str):
    clf = XGBClassifier()
    clf.fit(data.X, data.y)
    clf.save_model(f"{artifacts_folder}/{XGBoostFolder}/model.bst")
# rmse, mae, logloss, error (classification error, i.e. 1 - accuracy),
# auc (a close relative of accuracy)
results = model.evals_result()
# print("eval's results : ", results)
# print("r2 Score : %.2f%%:" % (r2 * 100.0))

y_pred = model.predict(x_test)
acc = accuracy_score(y_pred, y_test)
print("acc : ", acc)

#####################################################################
# import pickle  # ships with the Python standard library
# from joblib import dump, load
# import joblib
# pickle.dump(model, open("./model/xgb_save/cancer.pickle.dat", "wb"))  # save in binary mode
# joblib.dump(model, "./model/xgb_save/cancer.joblib.dat")
model.save_model("./model/xgb_save/cancer.xgb.model")
print("Saved.")

# model2 = pickle.load(open("./model/xgb_save/cancer.pickle.dat", "rb"))
# model2 = joblib.load("./model/xgb_save/cancer.joblib.dat")
model2 = XGBClassifier()
model2.load_model("./model/xgb_save/cancer.xgb.model")
print("Loaded.")

y_pred = model2.predict(x_test)
acc = accuracy_score(y_pred, y_test)
print("acc : ", acc)
class Classifier:
    # for initializing train and test sets, classifier and accuracy score
    # Change method to gpu_hist if you want xgboost to run on a GPU
    def __init__(self, params=None):
        if params is None:
            params = {'objective': 'reg:squarederror', 'verbosity': 0}
        self.X_train = []
        self.X_labels = []
        self.test = []
        self.test_labels = []
        self.model = XGBClassifier(**params)
        self.prediction = 0
        self.error = 0

    def size(self):
        if isinstance(self.X_train, np.ndarray):
            return self.X_train.size
        return len(self.X_train)

    # adding the data points
    def input_train(self, features, feature):
        if isinstance(self.X_train, np.ndarray) and self.X_train.size > 0:
            self.X_train = self.X_train.tolist()
            self.X_labels = self.X_labels.tolist()
        self.X_train.append(features)
        self.X_labels.append(feature)

    # train the data
    def train(self):
        self.X_train = np.asarray(self.X_train)
        self.X_labels = np.asarray(self.X_labels)
        self.model.fit(self.X_train, self.X_labels)

    def train_eval(self, metric='error'):
        self.X_train = np.asarray(self.X_train)
        self.X_labels = np.asarray(self.X_labels)
        X_train, X_test, y_train, y_test = train_test_split(
            self.X_train, self.X_labels, test_size=0.33)
        self.model.fit(X_train, y_train,
                       eval_set=[(X_train, y_train), (X_test, y_test)],
                       eval_metric=metric)
        evals_result = self.model.evals_result()
        if metric == 'error':
            validations = []
            for val in evals_result.values():
                lst = val.get("error")
                validations.append(sum(lst) / len(lst))
            return 1 - (sum(validations) / len(validations))
        else:
            validations = []
            for val in evals_result.values():
                lst = val.get(metric)
                validations.append(lst[-1])
            return validations

    # input test labels if you want to check accuracy
    def label(self, label):
        self.test_labels.append(label)

    def input_test(self, features):
        if isinstance(self.test, np.ndarray) and self.test.size > 0:
            self.test = self.test.tolist()
        self.test.append(features)

    # test data
    def predict(self):
        if not isinstance(self.test, np.ndarray):
            self.test = np.asarray(self.test)
        self.prediction = self.model.predict(self.test)
        return self.prediction

    def predict_proba(self):
        if not isinstance(self.test, np.ndarray):
            self.test = np.asarray(self.test)
        self.prediction = self.model.predict_proba(self.test)
        return self.prediction

    # if you have the test labels you can check the error rate
    # (you want error close to 0)
    def check_error(self):
        self.test_labels = np.asarray(self.test_labels)
        self.error = metrics.mean_absolute_error(self.test_labels,
                                                 self.prediction)
        return self.error

    # save classifier
    def save_classifier(self, file):
        self.model.save_model(file)

    # open saved classifier
    def open_classifier(self, file):
        self.model.load_model(file)

    # removes all training data
    def clean_train(self):
        self.X_train = []
        self.X_labels = []

    # removes all testing data
    def clean_test(self):
        self.test = []
        self.test_labels = []
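# A usage sketch for the Classifier above, exercising only methods the class
# defines; the feature rows and labels are made-up placeholder data, and the
# explicit binary objective is an assumption for a classification run.
clf = Classifier(params={'objective': 'binary:logistic', 'verbosity': 0})
clf.input_train([5.1, 3.5, 1.4], 0)
clf.input_train([6.2, 2.9, 4.3], 1)
clf.train()
clf.input_test([5.9, 3.0, 4.2])
clf.label(1)
print(clf.predict())        # predicted class for the test row
print(clf.check_error())    # MAE against the supplied test labels
clf.save_classifier("clf.json")
clf.open_classifier("clf.json")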
class Xgboost(object):
    def __init__(self, task="cla", module_type="performance",
                 compute_task="cpu", **params):
        """
        :param task: "cla" for classification, "reg" for regression
        :param module_type: "debug", "performance", or "balance"
        :param compute_task: "cpu" or "gpu"
        :param params: extra booster parameters
        """
        assert task in ["cla", "reg"]
        assert module_type in ["debug", "performance", "balance"]
        assert compute_task in ["cpu", "gpu"]
        self.task = task
        self.module_type = module_type
        # thread budget per module type
        if self.module_type == "debug":
            params["n_jobs"] = 1
        elif self.module_type == "performance":
            params["n_jobs"] = cpu_count()       # all CPU cores
        else:                                    # balance mode
            params["n_jobs"] = cpu_count() // 2
        self.compute_task = compute_task
        if self.compute_task == "gpu":           # train on the GPU
            params["tree_method"] = "gpu_hist"
        else:                                    # default: CPU histogram method
            params["tree_method"] = "hist"
        if self.task == "reg":  # regression task
            self.model = XGBRegressor(
                learning_rate=params.get("learning_rate", 0.3),
                n_estimators=params.get("n_estimators", 100),  # number of trees / rounds
                max_depth=params.get("max_depth", 6),          # tree depth
                min_child_weight=params.get("min_child_weight", 1),  # min leaf weight
                n_jobs=params.get("n_jobs", None),             # thread count
                gamma=params.get("gamma", 0),      # penalty on the number of leaves
                reg_lambda=params.get("lambda", 1),
                reg_alpha=params.get("alpha", 0),
                tree_method=params.get("tree_method", "auto"),
                subsample=params.get("subsample", 1),  # fraction of rows per tree
                colsample_bytree=1,                    # fraction of features per tree (here 100%)
                objective=params.get("objective", "reg:squarederror"),  # loss function
                # num_class=params.get("num_class", 2),  # unset means binary
                booster=params.get("booster", "gbtree"),   # booster type
                scale_pos_weight=1,   # rebalance positive/negative samples
                random_state=27,
            )
        else:  # classification task
            self.model = XGBClassifier(
                learning_rate=params.get("learning_rate", 0.3),
                n_estimators=params.get("n_estimators", 100),  # number of trees / rounds
                max_depth=params.get("max_depth", 6),          # tree depth
                min_child_weight=params.get("min_child_weight", 1),  # min leaf weight
                n_jobs=params.get("n_jobs", None),             # thread count
                gamma=params.get("gamma", 0),      # penalty on the number of leaves
                reg_lambda=params.get("lambda", 1),
                reg_alpha=params.get("alpha", 0),
                tree_method=params.get("tree_method", "auto"),
                subsample=params.get("subsample", 1),  # fraction of rows per tree
                colsample_bytree=1,                    # fraction of features per tree (here 100%)
                objective=params.get("objective", "multi:softmax"),  # loss function
                # 'binary:logistic' is binary cross-entropy
                # num_class=params.get("num_class", 2),  # unset means binary
                booster=params.get("booster", "gbtree"),   # booster type
                scale_pos_weight=1,   # rebalance positive/negative samples
                random_state=27,
            )
        """
        Objective types (see https://xgboost.readthedocs.io/en/latest/parameter.html):
        objective: default reg:squarederror
            reg:squarederror    squared-error regression
            reg:squaredlogerror squared log error
            reg:logistic        logistic regression
            binary:logistic     binary classification, outputs probabilities
            binary:logitraw     binary classification, outputs scores before the logistic transform
            binary:hinge        hinge loss for binary classification; predicts 0 or 1
                                rather than probabilities
            multi:softmax       multi-class classification; requires num_class
            multi:softprob      outputs an ndata * nclass matrix of per-class probabilities
        """

    def train(self, x_train, y_train=None, sample_weight=None,
              base_margin=None, eval_set=None, eval_metric=None,
              early_stopping_rounds=None, verbose=True,
              sample_weight_eval_set=None):
        """
        :param x_train: feature matrix (array)
        :param y_train: labels (array)
        :param eval_metric: a string or a list of strings from
            ["rmse", "rmsle", "mae", "logloss", "error", "error@t", "merror",
             "mlogloss", "auc", "aucpr", "ndcg", "map", "ndcg@n", "map@n",
             "ndcg-", "map-", "ndcg@n-", "map@n-", "poisson-nloglik",
             "gamma-nloglik", "cox-nloglik", "gamma-deviance",
             "tweedie-nloglik", "aft-nloglik"]
        """
        if eval_metric:     # evaluation mode requested,
            assert eval_set  # so an eval set must exist
        self.model.fit(X=x_train, y=y_train, sample_weight=sample_weight,
                       base_margin=base_margin, eval_set=eval_set,
                       eval_metric=eval_metric,
                       early_stopping_rounds=early_stopping_rounds,  # e.g. 10
                       verbose=verbose,  # toggles logging
                       sample_weight_eval_set=sample_weight_eval_set)

    def plot_loss(self):
        # plot the train/test loss curves
        result = self.model.evals_result()  # fetch eval results
        epochs = len(result["validation_0"]["rmse"])
        x_axis = range(0, epochs)
        figure, ax = plt.subplots()
        ax.plot(x_axis, result["validation_0"]["rmse"], label="Train")
        ax.plot(x_axis, result["validation_1"]["rmse"], label="Test")
        ax.legend()
        plt.ylabel("loss")
        plt.title("Xgboost Log Loss")
        plt.show()

    def predict(self, x_test):
        """
        :param x_test: np.array or scipy.sparse input to predict on
        """
        my_pred = self.model.predict(data=x_test, output_margin=False,
                                     validate_features=True, base_margin=None)
        return my_pred

    def plt_importance(self, figure_path=None, ifsave=True):
        """Plot feature importances.

        :param figure_path: path to save the figure
        :param ifsave: whether to save the figure
        """
        fig, ax = plt.subplots(figsize=(15, 15))
        plot_importance(self.model, height=0.5, ax=ax,
                        max_num_features=64)  # plot at most 64 features
        if ifsave:
            if not figure_path:
                plt.savefig("../model/XGBboost_model/"
                            "Xgboost_featute_importance_before.png")
            else:
                plt.savefig(figure_path)
        plt.show()  # show the figure

    def _plt_importance_v1(self, columns_name, figure_path=None, ifsave=True):
        # plot feature importances with the real column names substituted in
        fig, ax = plt.subplots(figsize=(15, 15))
        plot_importance_v1(self.model, model_name="xgb",
                           columns_name=columns_name, height=0.5, ax=ax,
                           max_num_features=64)  # plot at most 64 features
        if ifsave:
            if not figure_path:
                plt.savefig("../model/XGBboost_model/"
                            "Xgboost_featute_importance_after.png")
            else:
                plt.savefig(figure_path)
        plt.show()  # show the figure

    def plt_tree(self, num_tree):
        """Plot one tree.

        :param num_tree: index of the target tree
        """
        plot_tree(booster=self.model, num_trees=num_tree)

    def plot_graphviz(self, num_tree):
        # render a tree with graphviz
        to_graphviz(self.model, num_trees=num_tree)

    def get_importance(self):
        # feature importances of the fitted model
        return self.model.feature_importances_

    def evaluate(self, y_test, my_pred, evalue_fun="mse"):
        # metric dispatcher: classification and regression metrics
        if evalue_fun == "acc":  # accuracy (classification)
            result = accuracy_score(y_true=y_test, y_pred=my_pred)
            print("accuracy:%.2f" % (result * 100.0))
        elif evalue_fun == "auc":  # AUC (classification)
            result = roc_auc_score(y_true=y_test, y_score=my_pred)
            print("auc:%.2f" % (result))
        elif evalue_fun == "mae":  # mean absolute error (regression)
            result = mean_absolute_error(y_true=y_test, y_pred=my_pred)
            print("mae:%.2f" % (result))
        elif evalue_fun == "median_ae":  # median absolute error (regression)
            result = median_absolute_error(y_true=y_test, y_pred=my_pred)
            print("median_ae:%.2f" % (result))
        elif evalue_fun == "r2_score":  # R-squared (regression)
            result = r2_score(y_true=y_test, y_pred=my_pred)
            print("r2_score:%.2f" % (result))
        elif evalue_fun == "evs":  # explained variance (regression)
            result = explained_variance_score(y_true=y_test, y_pred=my_pred)
            print("explained_variance_score:%.2f" % (result))
        elif evalue_fun == "aps":  # average precision from scores (classification)
            result = average_precision_score(y_true=y_test, y_score=my_pred,
                                             average="macro",
                                             sample_weight=None)
            print("average_precision_score:%.2f" % (result))
        elif evalue_fun == "bsl":  # Brier score loss (classification)
            result = brier_score_loss(y_true=y_test, y_prob=my_pred,
                                      sample_weight=None, pos_label=None)
            print("brier_score_loss:%.2f" % (result))
        elif evalue_fun == "cmt":  # confusion matrix (classification)
            result = confusion_matrix(y_true=y_test, y_pred=my_pred,
                                      labels=None, sample_weight=None)
            print("confusion_matrix:", result)
        elif evalue_fun == "f1_score":  # F1 score (classification)
            result = f1_score(y_true=y_test, y_pred=my_pred, labels=None,
                              pos_label=1, average="binary",
                              sample_weight=None)
            print("f1_score:%.2f" % (result))
        elif evalue_fun == "log_loss":  # cross-entropy loss (classification)
            result = log_loss(y_true=y_test, y_pred=my_pred, eps=1e-15,
                              normalize=True, sample_weight=None, labels=None)
            print("log_loss:%.2f" % (result))
        elif evalue_fun == "precision_score":  # precision (classification)
            result = precision_score(y_true=y_test, y_pred=my_pred,
                                     labels=None, pos_label=1,
                                     average="binary")
            print("precision_score:%.2f" % (result))
        elif evalue_fun == "recall_score":  # recall (classification)
            result = recall_score(y_true=y_test, y_pred=my_pred, labels=None,
                                  pos_label=1, average="binary",
                                  sample_weight=None)
            print("recall_score:%.2f" % (result))
        elif evalue_fun == "roc_auc_score":  # area under the ROC curve (classification)
            result = roc_auc_score(y_true=y_test, y_score=my_pred,
                                   average="macro", sample_weight=None)
            print("roc_auc_score:%.2f" % (result))
        elif evalue_fun == "roc_curve":  # points of the ROC curve (classification)
            fpr, tpr, thresholds = roc_curve(y_true=y_test, y_score=my_pred,
                                             pos_label=None,
                                             sample_weight=None,
                                             drop_intermediate=True)
            result = (fpr, tpr, thresholds)
        else:  # default: mean squared error (regression)
            result = mean_squared_error(y_true=y_test, y_pred=my_pred)
            print("mse:%.2f" % (result))
        return result

    def save_model(self, save_params):
        # persist the model
        self.model.save_model(
            fname=save_params.get(
                "fname", "../model/XGBboost_model/XGboostmodel.model")  # output path
            # format=save_params.get("format", "cbm"),  # serialization format
            # pool=save_params.get("pool", None)  # training data; JSON export needs no pool
        )
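# A usage sketch for the Xgboost wrapper above, using placeholder data; the
# binary objective passed through **params and the save path are assumptions:
import numpy as np

x = np.random.rand(100, 4)
y = (x[:, 0] > 0.5).astype(int)
booster = Xgboost(task="cla", module_type="debug", compute_task="cpu",
                  objective="binary:logistic")
booster.train(x[:80], y[:80])
pred = booster.predict(x[80:])
booster.evaluate(y[80:], pred, evalue_fun="acc")
booster.save_model({"fname": "XGboostmodel.model"})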