def train_all(self, data):
    """Train XGBoost models for all clusters.

    Args:
        data: a DataFrame of training samples; it must contain a
            'cluster' column assigning each row to a cluster.

    Returns:
        None. Trained boosters are appended to self._xgb_boosters.
    """
    if 'cluster' not in data.columns.tolist():
        raise AttributeError("data has no column 'cluster'.")
    for cluster in range(self._n_clusters):
        model_path = 'models/xgb_cluster_{}.model'.format(cluster)
        logger.info('Loading xgb_cluster_{}.model'.format(cluster))
        if os.path.exists(model_path):
            # load_model is a Booster method; xgb.load_model() does not exist.
            xgb_reg = xgb.Booster()
            xgb_reg.load_model(model_path)
            self._xgb_boosters.append(xgb_reg)
        else:
            logger.info('Model does not exist, training model...')
            _train = data[data.cluster == cluster]
            logger.info('There are {} training samples.'.format(len(_train)))
            xgb_reg = self.train(data=_train)
            xgb_reg.save_model(model_path)
            self._xgb_boosters.append(xgb_reg)
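For reference, a minimal sketch of the save/load round trip that train_all depends on; the data here is synthetic and the path is illustrative, not taken from the codebase above.

import os
import numpy as np
import xgboost as xgb

os.makedirs('models', exist_ok=True)
X, y = np.random.rand(100, 4), np.random.rand(100)
booster = xgb.train({'objective': 'reg:squarederror'},
                    xgb.DMatrix(X, label=y), num_boost_round=10)
booster.save_model('models/xgb_cluster_0.model')

# Loading goes through a Booster instance, matching the fix above.
restored = xgb.Booster()
restored.load_model('models/xgb_cluster_0.model')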
def predict_xgboost(X: pd.DataFrame, config: Config) -> List:
    preds = np.zeros((config["n_split_xgb"], X.shape[0]))
    for i, mdl_fname in enumerate(config["xgb_models"]):
        mdl = xgb.Booster({'nthread': 4})
        # load_model is a Booster method, not a module-level function.
        mdl.load_model(mdl_fname)
        preds[i, :] = mdl.predict(xgb.DMatrix(X), ntree_limit=mdl.best_ntree_limit)
    # Average predictions across the per-fold models.
    return list(np.mean(preds, 0))
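Hypothetical usage, assuming config is dict-like with the two keys the function reads and that the listed model files exist on disk:

config = {
    "n_split_xgb": 2,
    "xgb_models": ["xgb_fold0.model", "xgb_fold1.model"],  # illustrative paths
}
preds = predict_xgboost(X_test, config)  # X_test: a pd.DataFrame of features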
def predict(model_file):
    # Load the saved model into a Booster and return it so callers can predict;
    # the bare xgb.load_model(model_file) call does not exist and returned nothing.
    booster = xgb.Booster()
    booster.load_model(model_file)
    return booster
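As a sketch with illustrative paths (not from this codebase), recent xgboost releases offer equivalent loading idioms:

bst = xgb.Booster(model_file='model.bin')   # load at construction time
reg = xgb.XGBRegressor()                    # or via the sklearn wrapper
reg.load_model('model.bin')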
dtest = xgb.DMatrix(test_data)
y_pred = booster.predict(dtest)
y_pred = y_pred > 0.5
y_pred = y_pred.astype(int)
# accuracy_score expects (y_true, y_pred) in that order.
accuracy = accuracy_score(test_labels, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))  # prediction accuracy

import pickle
pickle.dump(booster, open("pima.pickle.dat", "wb"))
loaded_model = pickle.load(open("pima.pickle.dat", "rb"))
y_pred = loaded_model.predict(dtest)

# dump_model() writes a human-readable dump that cannot be loaded back;
# use save_model()/load_model() for a round-trippable file.
booster.save_model(doc2vec_model_name + '.xgboost')
booster_restored = xgb.Booster()
booster_restored.load_model(doc2vec_model_name + '.xgboost')

# scale_pos_weight = len(software_arrays) / len(malware_arrays)  # ratio of 0/1

# XGBoost parameter notes:
'''
params = {
    # general parameters
    'booster': 'gbtree',   # gbtree: tree-based models; gblinear: linear models; dart
    'n_jobs': 'default',   # number of threads; the default is the maximum
    'silent': 0,           # print run messages: 0; set to 1 to suppress
    # booster parameters
    'eta': 0.02,             # learning rate; 0.01-0.2 is typical; tuning it is recommended to prevent overfitting
    'min_child_weight': 1,   # default is 1
    'gamma': 0,              # information-gain (minimum split loss) threshold; default 0
    'max_depth': 6,          # maximum tree depth; default is 6
'''
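The commented-out params block above is truncated in the source; a minimal sketch of how such a dict is consumed by xgb.train, with synthetic data and illustrative values:

import numpy as np
import xgboost as xgb

params = {'booster': 'gbtree', 'eta': 0.02, 'max_depth': 6,
          'objective': 'binary:logistic'}
dtrain = xgb.DMatrix(np.random.rand(50, 8), label=np.random.randint(0, 2, 50))
bst = xgb.train(params, dtrain, num_boost_round=10)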
# This program is for testing a trained BTD
# By Zach Shelton
# 9/9/2021
# Running this will test on a f
import awkward as ak
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Data is stored in pandas -> Each
from sklearn.model_selection import train_test_split
import xgboost as xgb
from numpy.random import choice
import argparse

parser = argparse.ArgumentParser(
    description='run boosted decision tree on data; note this file grabs only '
                'the data, not validation; this is an experimental set')
parser.add_argument('file', metavar='f', type=str)
parser.add_argument('BTD', metavar='d', type=str)
parser.add_argument('result', metavar='r', type=str)  # was a duplicate metavar 'd'
args = parser.parse_args()

# load_model is a Booster method, not a module-level function.
xg_reg = xgb.Booster()
xg_reg.load_model(args.BTD)
rawdata = pd.read_csv(args.file)  # pandas is imported as pd, not pandas
etruth = rawdata[["event", "truth"]]
cleandata = rawdata.drop(["event", "truth"], axis=1)
Dexp = xgb.DMatrix(data=cleandata)
predictions = xg_reg.predict(Dexp)
preddf = pd.Series(predictions)
preddf.to_csv("ExperimentalPred/%s.csv" % args.result)
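A hypothetical invocation, with an assumed script name and illustrative file paths:

# python test_btd.py experiment.csv trained_btd.model run1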
def dumpBin(model_path, feature_map='', output='', ftype='text'):
    # Load via a Booster instance (xgb.load_model() does not exist),
    # then write a readable dump of the trees.
    bst = xgb.Booster()
    bst.load_model(model_path)
    bst.dump_model(output, fmap=feature_map, with_stats=True, dump_format=ftype)
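A hypothetical call with illustrative paths; Booster.dump_model accepts dump_format values 'text' or 'json':

dumpBin('models/xgb.model', feature_map='fmap.txt',
        output='model_dump.json', ftype='json')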
import pandas as pd
import xgboost as xgb

from data import load_data

_, test_data = load_data.load_data_with_header()
user_id = test_data.pop('user_id')  # pop() takes a single column label, not a list

label_vocabulary = {
    0: '99999825', 1: '90063345', 2: '90109916', 3: '89950166',
    4: '89950168', 5: '99104722', 6: '89950167', 7: '89016252',
    8: '90155946', 9: '99999828', 10: '99999826', 11: '99999827',
    12: '89016259', 13: '99999830', 14: '89016253'
}

# Predict
model = xgb.Booster()
model.load_model('./xgb.model')
xgb_test = xgb.DMatrix(test_data)
preds = model.predict(xgb_test, ntree_limit=model.best_ntree_limit)
result = pd.DataFrame(preds, columns=['predict'])
result['predict'] = result['predict'].map(label_vocabulary)
xgb_submission = pd.concat([user_id, result], axis=1)
xgb_submission.to_csv('./xgb_submission.csv', index=False)
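One caveat: best_ntree_limit is only set when the model was trained with early stopping, and newer xgboost releases (1.4+) replace ntree_limit with iteration_range. An equivalent call under that assumption:

# preds = model.predict(xgb_test, iteration_range=(0, model.best_iteration + 1))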