Example #1
class XGB(BaseModel):
    def __init__(self):
        self.clf = XGBClassifier(
            n_estimators=200,
            max_depth=20,
            learning_rate=0.1,
            random_state=0,
            booster="gbtree",
            use_label_encoder=False,
        )

    def train(self, X_train, Y_train):
        X_train, Y_train = do_rebalance(X_train, Y_train)
        self.clf.fit(X_train, Y_train)

    def test(self, X_test, Y_test):
        Y_prob = self.clf.predict_proba(X_test)
        auc = metrics.roc_auc_score(Y_test, Y_prob[:, 1])
        return auc

    def predict(self, X):
        Y_prob = self.clf.predict_proba(X)
        return Y_prob

    def load_model(self, model_path):
        self.clf.load_model(model_path)
        # with open(model_path, "rb+") as file:
        #     self.clf = pickle.load(file)

    def save_model(self, model_path):
        self.clf.save_model(model_path)
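A minimal usage sketch for the wrapper above (the data arrays and file path are illustrative assumptions, not part of the original snippet):

# Hypothetical round trip with the XGB wrapper; X_train/Y_train/X_test/Y_test are assumed.
model = XGB()
model.train(X_train, Y_train)         # rebalances via do_rebalance, then fits
auc = model.test(X_test, Y_test)      # ROC AUC on the held-out split
model.save_model("xgb_model.json")    # XGBoost-native format

restored = XGB()
restored.load_model("xgb_model.json")
Y_prob = restored.predict(X_test)     # predicted probabilities, shape (n_samples, 2)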
Example #2
def train(file):
    y_data = makeRawDataset(file)  # provides the raw labels
    X = pd.read_csv(file).Question
    X_without = removePunc(X)  # without punctuation
    X_correct = fuzzy(X_without)  # with fuzzy spelling correction
    X_enc = encode(X_correct)
    labels = [">", "<", "<=", ">=", "==", "NULL", "LIKE"]
    encoder = LabelEncoder()
    codes = encoder.fit_transform(labels)
    codeMap = {labels[i]: codes[i] for i in range(len(labels))}
    inverseMap = {codes[i]: labels[i] for i in range(len(labels))}
    maps = {"codeMap": codeMap, "inverseMap": inverseMap}
    np.save("Map.npy", maps)
    y = [codeMap[i] for i in y_data]
    X_train, X_test, y_train, y_test = train_test_split(X_enc, y, test_size=0.2, random_state=42)
    model = XGBClassifier()
    model.fit(X_train,y_train)
    model.save_model("WhereCond.model")
    y_hat = model.predict(X_test)
    print(y_hat[:10])
    y_pred = [inverseMap[i] for i in y_hat]
    y_true = [inverseMap[i] for i in y_test]
    sk_report = classification_report(y_true=y_true, y_pred=y_pred, digits=6)
    print(sk_report)
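For completeness, a hedged sketch of the matching inference path, assuming the same removePunc, fuzzy, and encode helpers are in scope; allow_pickle=True is needed because Map.npy stores a Python dict:

# Inference-side sketch; nothing here is from the original snippet.
maps = np.load("Map.npy", allow_pickle=True).item()
inverseMap = maps["inverseMap"]

clf = XGBClassifier()
clf.load_model("WhereCond.model")

def predict_operator(questions):
    X_enc = encode(fuzzy(removePunc(questions)))
    return [inverseMap[c] for c in clf.predict(X_enc)]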
Example #3
def tree_optimization(learning_rate, gamma, max_depth, subsample, reg_lambda,
                      num_parallel_tree, min_child_weight):
    global tree_no, best_score

    X_train, X_eval, y_train, y_eval = train_test_split(features,
                                                        labels,
                                                        test_size=0.2,
                                                        shuffle=True,
                                                        stratify=labels)

    X_train, X_test, y_train, y_test = train_test_split(X_train,
                                                        y_train,
                                                        test_size=0.2,
                                                        shuffle=True,
                                                        stratify=y_train)
    booster_params = {
        'n_estimators': 500,
        'learning_rate': learning_rate,
        'gamma': gamma,
        'max_depth': int(np.around(max_depth)),
        'subsample': subsample,
        'sampling_method': 'gradient_based',
        'reg_lambda': reg_lambda,
        'min_child_weight': int(np.around(min_child_weight)),
        'num_parallel_tree': int(np.around(num_parallel_tree)),
        'objective': 'binary:logistic',
        'verbosity': 1,
        'max_delta_step': 1
    }

    print("generating model")
    trip_model = XGBClassifier(**booster_params)
    trip_model.fit(X=X_train,
                   y=y_train,
                   eval_set=[(X_test, y_test)],
                   eval_metric=f1_eval,
                   early_stopping_rounds=25)

    print("Training Accuracy: %.2f" % (trip_model.score(X_eval, y_eval) * 100),
          "%")
    preds = trip_model.predict(X_eval)
    current_score = f1_score(y_eval, preds)

    if current_score > best_score:
        print(f"Score increased to {current_score} from {best_score}")
        best_score = current_score
        sub_pred = trip_model.predict(sub)
        trip_model.save_model(
            f'logs/f1_{run_num}_{best_score}_{tree_no}_thresh.model')
        savetxt(f'logs/{run_num}_{tree_no}_preds.txt', sub_pred, delimiter=',')

    booster_params['f1'] = current_score
    booster_params['tree_no'] = tree_no
    record_history(booster_params)
    tree_no += 1
    return current_score
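The signature and the global best_score bookkeeping suggest this function is an objective for a Bayesian optimizer. A sketch of how it could be driven with the bayes_opt package; the bounds below are illustrative assumptions, not values from the source:

# Hypothetical driver for tree_optimization; pbounds are made up for illustration.
from bayes_opt import BayesianOptimization

optimizer = BayesianOptimization(
    f=tree_optimization,
    pbounds={
        'learning_rate': (0.01, 0.3),
        'gamma': (0.0, 5.0),
        'max_depth': (3, 12),
        'subsample': (0.5, 1.0),
        'reg_lambda': (0.0, 10.0),
        'num_parallel_tree': (1, 5),
        'min_child_weight': (1, 20),
    },
)
optimizer.maximize(init_points=5, n_iter=50)
print(optimizer.max)  # best F1 seen and the parameters that produced it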
Example #4
    def xgboost(self, x_train, y_train, x_valid, y_valid):

        xgb_model = XGBClassifier(objective='multi:softmax',
                                  verbosity=2)

        eval_set = [(x_valid, y_valid)]
        xgb_model.fit(x_train, y_train, eval_set=eval_set, eval_metric='merror', verbose=True, early_stopping_rounds=10)
        plot_importance(xgb_model)
        pyplot.show()
        xgb_model.save_model('./modfile/tf_idf_XGBoost.model')
        return xgb_model, x_valid, y_valid
Example #5
def create_tree():
    X_train, X_eval, y_train, y_eval = train_test_split(
        features,
        labels,
        test_size=0.2,
        shuffle=True,
        stratify=labels,
        random_state=RANDOM_SEED)

    X_train, X_test, y_train, y_test = train_test_split(
        X_train,
        y_train,
        test_size=0.2,
        shuffle=True,
        stratify=y_train,
        random_state=RANDOM_SEED)

    booster_params = {
        'n_estimators': 500,
        'objective': 'binary:logistic',
        'verbosity': 1
    }

    print("generating model")

    trip_model = XGBClassifier(**booster_params)
    trip_model.fit(X=X_train,
                   y=y_train,
                   eval_set=[(X_test, y_test)],
                   eval_metric=f1_eval,
                   early_stopping_rounds=50)

    iter_num = trip_model.best_iteration + 30
    booster_params = {
        'n_estimators': iter_num,
        'objective': 'binary:logistic',
        'verbosity': 1
    }
    trip_model = XGBClassifier(**booster_params)
    trip_model.fit(X=X_train,
                   y=y_train,
                   eval_set=[(X_test, y_test)],
                   eval_metric=f1_eval)

    print("Training Accuracy: %.2f" % (trip_model.score(X_eval, y_eval) * 100),
          "%")
    preds = trip_model.predict(X_eval)
    current_score = f1_score(y_eval, preds)
    trip_model.save_model(f'logs/f1_{current_score}_thresh.model')
    sub_pred = trip_model.predict(sub)
    savetxt(f'logs/{current_score}_preds.txt', sub_pred, delimiter=',')

    return trip_model
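f1_eval is never shown in these snippets. One plausible definition, written for the legacy sklearn-wrapper callback signature where a custom metric receives raw predictions plus a DMatrix and returns a (name, value) pair that XGBoost minimizes:

# Assumed helper; this exact implementation is a guess consistent with the calls above.
import numpy as np
from sklearn.metrics import f1_score

def f1_eval(y_pred, dtrain):
    y_true = dtrain.get_label()
    # binary:logistic emits probabilities, so threshold at 0.5
    err = 1.0 - f1_score(y_true, np.round(y_pred))
    return 'f1_err', err  # an error (lower is better), so early stopping works unchanged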
Example #6
    def train_xgboost(self):
        """
        Train an XGBoost classifier on the dataset and save model as artifact
        """
        from xgboost import XGBClassifier

        xgb = XGBClassifier()
        xgb.fit(self.X, self.y)
        xgb.save_model(script_path("model.bst"))
        with open(script_path("model.bst"), "rb") as fh:
            self.buffered_xgb_model = fh.read()
        self.next(self.join)
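A later step could rebuild the classifier from buffered_xgb_model; recent XGBoost releases let load_model read from an in-memory bytearray, so a hedged restore sketch looks like:

# Restore-side sketch (assumes self.buffered_xgb_model from the step above).
from xgboost import XGBClassifier

restored = XGBClassifier()
restored.load_model(bytearray(self.buffered_xgb_model))
predictions = restored.predict(self.X)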
Example #7
def scikitAPI(trainFile):
    X, Y, X_train, X_test, Y_train, Y_test = inputData(trainFile)
    '''
    Y = transLabel(Y)[1]
    Y_train = transLabel(Y_train)[1]
    Y_test = transLabel(Y_test)[1]
    '''

    start = time.time()
    print("class: {0}".format(np.unique(Y_train)))
    print("train data shape: %r, train target shape: %r" %
          (X_train.shape, Y_train.shape))
    print("test data shape: %r, test target shape: %r" %
          (X_test.shape, Y_test.shape))

    #params = {"n_estimators":300, "num_leaves":128, "learning_rate":0.1}
    params = {"n_estimators": 50, "learning_rate": 0.1}

    print("{:-<50}".format(""))
    print("params", params)
    #clf_test = LGBMClassifier(n_estimators=200)

    # clf_test = XGBClassifier(**params)
    # clf_test.fit(X_train, Y_train)

    # print('Training time:{0}'.format(time.time()-start))

    # print("clf class: ",clf_test.classes_)
    # #pred = clf_test.predict(X_test)
    # #print(accuracy_score(pred, y_test))
    # print("Traing Acc: ", clf_test.score(X_train, Y_train), np.shape(X_train))
    # print("Test Acc: ", clf_test.score(X_test, Y_test), np.shape(X_test))
    # print("Total Acc: :", clf_test.score(X, Y), np.shape(X))

    xgtrain = xgb.DMatrix(X, Y)
    clf = XGBClassifier(**params).fit(X, Y)
    model_path = 'xgb/model.joblib'
    # joblib.dump(clf, model_path, compress=1)
    #clf.save_model("lgbm_model.ml")

    model_path = 'xgb/model.bst'
    clf.save_model(model_path)

    # clf.dump_model('xgb/dump.raw.txt', 'xgb/featmap.txt')
    # print("ACC: ", load_model(model_path).score(X_test, Y_test))
    print("model save to {}".format(model_path))
    print("model.ml size: {:.3f} KB".format(
        os.path.getsize(model_path) / 1024))
    clf = load_model(model_path, Y)
    print(clf.predict([np.arange(51)]))
Example #8
def train_lazy():
    # Load the dataset
    X, y = load_data()
    # Split the data
    X_train, X_val, y_train, y_val = split_dataset(X, y)
    # Normalize
    X_train = normalize(X_train)
    X_val = normalize(X_val)

    # uncomment to check the performance of the 25 models
    # clf = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
    # # fit
    # scores,_ = clf.fit(X_train, X_val, y_train, y_val)
    # # print
    # print(scores)

    # Final model
    # check if model exist
    if os.path.isfile(config.MODEL_PATH):
        model = XGBClassifier()
        model.load_model(config.MODEL_PATH)
    else:
        model = XGBClassifier()
        model.fit(X_train,
                  y_train,
                  eval_metric="error",
                  eval_set=[(X_train, y_train), (X_val, y_val)],
                  verbose=True)
        # save model
        model.save_model(config.MODEL_PATH)
    # performance on train set
    y_pred = model.predict(X_train)
    # evaluate predictions
    print_performance(y_train, y_pred, 'train')

    # performance on val set
    y_pred = model.predict(X_val)
    # evaluate predictions
    print_performance(y_val, y_pred, 'val')

    # Load the test dataset
    X_test, y_test = load_test_data()
    # Normalize
    X_test = normalize(X_test)
    # get prediction
    y_pred = model.predict(X_test)
    # evaluate predictions
    print_performance(y_test, y_pred, 'test')
    # print
    plot_performance(model)
Example #9
def build():
    
    data = load_kickstarter(DATA_PATH)
    encoder, data = preprocess(data)
    
    x_train, x_test, y_train, y_test = train_test_split(data.drop(['state'], axis=1), data.state, test_size=0.2, random_state=SEED)
    
    clf = XGBClassifier(n_jobs=6, random_state=SEED)
    clf.fit(x_train, y_train, verbose=True)
    
    dump(encoder, 'onehot-150k.joblib')
    clf.save_model('xgb-150k-v1.model')
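Restoring both artifacts for inference is symmetric; a minimal sketch, assuming joblib's load counterpart is imported:

# Load-side sketch for the artifacts saved above; new_data is hypothetical.
from joblib import load
from xgboost import XGBClassifier

encoder = load('onehot-150k.joblib')
clf = XGBClassifier()
clf.load_model('xgb-150k-v1.model')
# preds = clf.predict(encoder.transform(new_data))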
    
    
Example #10
    def train_model(self):
      
        """Trains the model
        Returns
        -------
        testing and training f1 scores
        """

        df = Crunchbase().format_crunchbase()
        subset = df[['type_Group B', 'money_raised_at_ipo', 'number_of_acquisitions', 'valuation_at_ipo',
                     'employee_cat', 'number_of_lead_investors', 'number_of_lead_investments', 'number_of_investments',
                     'type_Group A', 'industries_type_0', 'number_of_employee_profiles',
                     'number_of_events',
                     'number_of_investors', 'total_products_active', 'type_For Profit', 'ipo_status']]
        
        x = subset[subset.columns.drop('ipo_status')]
        y = subset[['ipo_status']]
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=2)

        # scale data
        scaler = StandardScaler()
        x_train_scaled = scaler.fit_transform(x_train)
        # save the scaler
        pickle.dump(scaler, open('./Model/scaler.pkl', 'wb'))
        x_test_scaled = scaler.transform(x_test)

        # transform data to balance it
        smote = SMOTE(random_state=42, sampling_strategy=0.5)
        x_train_scaled_smote, y_train_smote = smote.fit_resample(x_train_scaled, y_train)

        xgb = XGBClassifier()
        xgb.fit(x_train_scaled_smote, y_train_smote)
        # save the trained model
        xgb.save_model('./Model/final_model.json')

        y_train_pred_xgb = xgb.predict(x_train_scaled_smote)
        y_test_pred_xgb = xgb.predict(x_test_scaled)
        training_f1_score = f1_score(y_train_smote, y_train_pred_xgb)
        testing_f1_score = f1_score(y_test, y_test_pred_xgb)

        return training_f1_score, testing_f1_score
Example #11
def train_model():
    x, y = load_data()
    model = create_dl_model()
    model.fit(x,
              y,
              epochs=20,
              batch_size=32,
              verbose=0,
              callbacks=[keras.callbacks.EarlyStopping(monitor='loss',
                                                       patience=3)])
    layermodel_1 = keras.models.Model(inputs=model.input,
                                      outputs=model.layers[2].output)
    x = layermodel_1.predict(x)
    xgb = XGBClassifier(n_estimators=110, learning_rate=0.12)
    xgb.fit(x, y.ravel())
    layermodel_1.save("../files/DNN.h5")
    xgb.save_model("../files/xgboost.model")
Example #12
def train(X_train, y_train, X_test, y_test, i):
    if i == 0:
        name = 'citi'
    elif i == 1:
        name = 'jpm'
    else:
        name = 'boa'
    weights = (y_train == 0).sum()/(1.0 * (y_train == 1).sum())
    model = XGBClassifier(tree_method='gpu_hist', objective='binary:logistic',
                          n_estimators=300, scale_pos_weight=weights, n_jobs=6)
    # the per-bank branches were identical, so a single fit call suffices
    model.fit(X_train, y_train, eval_metric=['auc'])
    y_pred = model.predict(X_test)
    get_results(y_pred, y_test)
    model.save_model(f'{name}.model')
    return model
Example #13
def xgbmodel():
    data1 = pd.read_csv('/home/msmal/object_type/X_enc_object.csv', header=0)
    print("data1 loaded!")
    data2 = pd.read_csv('/home/msmal/float_type/train.csv', header=0)
    data2[np.isinf(data2)] = -1
    data2[np.isnan(data2)] = -2
    print("data2 loaded!")
    X = pd.concat([data1.iloc[:, 1:], data2], axis=1)
    y = pd.read_csv('/home/msmal/label.csv', header=0)
    scaler = preprocessing.StandardScaler()
    X_scaled = scaler.fit_transform(X)
    encoder = LabelEncoder()
    y = encoder.fit_transform(y)
    X_train, X_test, Y_train, Y_test = train_test_split(X_scaled, y, test_size=0.005, random_state=49)
    #xgbmodel:xgb = XGBClassifier(n_jobs=-1)
    xgb = XGBClassifier(n_jobs=-1,n_estimators=500,learning_rate=0.05,subsample=0.8)
    xgb.fit(X_train, Y_train)
    xgb.save_model('xgb1.model')
    print("model saved!")
    xgbpre = xgb.predict(X_test)
    xgb_report = metrics.classification_report(Y_test, xgbpre)
    print(xgb_report)
Example #14
thresholds = np.sort(model.feature_importances_)
print(thresholds)

for thresh in thresholds:
        selection = SelectFromModel(model, threshold=thresh, prefit=True)

        parameter = {
            'n_estimators': [100, 200, 400],
            'learning_rate' : [0.03, 0.05, 0.07, 0.1],
            'colsample_bytree': [0.6, 0.7, 0.8, 0.9],
            'colsample_bylevel':[0.6, 0.7, 0.8, 0.9],
            'max_depth': [4, 5, 6]
        }

        search = GridSearchCV(XGBRegressor(), parameter, cv=5, n_jobs=-1)

        select_x_train = selection.transform(x_train)

        search.fit(select_x_train, y_train)

        select_x_test = selection.transform(x_test)
        x_pred = search.predict(select_x_test)

        score = r2_score(y_test, x_pred)
        print('R2:', score)

        print("Thresh=%.3f, n=%d, R2: %.2f%%" % (thresh, select_x_train.shape[1], score * 100.0))
        # save the estimator tuned for this feature subset
        search.best_estimator_.save_model('./model/xgb_save/m34sfm_cancer/cancer.xgb' + str(select_x_train.shape[1]) + '.model')

Example #15
def train_xgboost(data: IrisData):
    clf = XGBClassifier()
    clf.fit(data.X, data.y)

    model_path = os.path.join(XGBoostFolder, "model.json")
    clf.save_model(model_path)
Example #16
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer 



def get_data():
    url = 'simple_data.csv'
    return pd.read_csv(url)
columnTransformer = ColumnTransformer([('encoder', OneHotEncoder(), [1])], remainder='passthrough')
df = get_data()
df['position'] = df['position'].str.extract(r'(\w+),?')

X = df.drop(columns=['all_star'])

X = columnTransformer.fit_transform(X)
y = df.all_star

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=99)

model = XGBClassifier(objective='binary:logistic', use_label_encoder=False)
model.fit(X_train, y_train)

model.save_model('xgb.model')

predictions = model.predict(X_test)

print("XGBoost Training Accuracy")
print(f'Accuracy: {round(accuracy_score(y_test, predictions) * 100, 3)}%')
print(classification_report(y_test, predictions))
Example #17
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    train_size=0.8,
                                                    random_state=77,
                                                    shuffle=True)

# 2. Model
model = XGBClassifier(n_estimators=1000, learning_rate=0.1)
# model = XGBRegressor(learning_rate=0.01)

# 3. Train
model.fit(
    x_train,
    y_train,
    verbose=1,  # show all output (0 / 1, False / True)
    eval_metric='logloss',  # analogous to Keras metrics
    eval_set=[(x_train, y_train), (x_test, y_test)])

# 4. Evaluate and predict
result = model.evals_result()
# print("eval's results : ", result)

acc = model.score(x_test, y_test)
print('acc:', acc)

y_pred = model.predict(x_test)
print('final accuracy:', accuracy_score(y_test, y_pred))

model.save_model('./save/xgb_save/cancer2.xgb.model')
print('save complete')
Example #18
    selection_x_train = selection.transform(x_train)
    selection_x_test = selection.transform(x_test)
    print(selection_x_train.shape)

    selection_model = XGBClassifier(n_estimators=1000,
                                    max_depth=4,
                                    learning_rate=0.5,
                                    n_jobs=-1)

    selection_model.fit(selection_x_train,
                        y_train,
                        verbose=False,
                        eval_metric=["merror", "mlogloss"],
                        eval_set=[(selection_x_train, y_train),
                                  (selection_x_test, y_test)],
                        early_stopping_rounds=100)

    y_pred = selection_model.predict(selection_x_test)

    # results = selection_model.evals_result()
    # print("evals_result : \n", results)

    score = accuracy_score(y_test, y_pred)
    print("Thresh=%.3f, n=%d, acc: %.2f%%" %
          (thresh, selection_x_train.shape[1], score * 100.0))
# (120, 1)
# Thresh=0.621, n=1, acc: 96.67%

model.save_model("./model/xgb_save/iris_acc_96.67_model")
Example #19
# without visualizing the training loss
# model = XGBClassifier()
# model.fit(X_train, y_train)

## visualize the loss on the test set
model = XGBClassifier()
eval_set = [(X_test, y_test)]
model.fit(X_train,
          y_train,
          early_stopping_rounds=100,
          eval_metric="logloss",
          eval_set=eval_set,
          verbose=True)
# set verbose=True to visualize the loss
model.save_model("00001.model")

model.fit(X, Y)
plot_importance(model)
pyplot.show()

y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]

accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
##Accuracy: 77.56%

test_auc2 = roc_auc_score(y_test, y_pred)  # AUC on the validation set
print("xgb_multiclass_auc:", test_auc2)
Example #20
X = df_2018.drop("job_title", axis=1)
y = df_2018["job_title"]

# Splitting data into training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.80, test_size=0.20)
# save out the split training data to use with Cloud AutoML
with open("train_data_2018.csv", "+w") as file:
    pd.concat([X_train, y_train], axis=1).to_csv(file, index=False)
with open("test_data_2018.csv", "+w") as file:
    pd.concat([X_test, y_test], axis=1).to_csv(file, index=False)

# encode all features using ordinal encoding
encoder_x = ce.OrdinalEncoder()
X_encoded = encoder_x.fit_transform(X)
# you'll need to use a different encoder for each dataframe
encoder_y = ce.OrdinalEncoder()
y_encoded = encoder_y.fit_transform(y)
# split encoded dataset
X_train_encoded, X_test_encoded, y_train_encoded, y_test_encoded = train_test_split(X_encoded, y_encoded,
                                                    train_size=0.80, test_size=0.20)

from xgboost import XGBClassifier

# train XGBoost model with default parameters
my_model = XGBClassifier()
my_model.fit(X_train_encoded, y_train_encoded, verbose=False)

# and save our model
my_model.save_model("xgboost_baseline.model")
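If the baseline is reloaded later, the fitted encoders have to be kept around as well, since ordinal codes are only meaningful relative to that fit. A hedged sketch of the load side:

# Sketch: restore the baseline (assumes encoder_x/encoder_y were persisted separately,
# e.g. with pickle; nothing below is from the original source).
reloaded_model = XGBClassifier()
reloaded_model.load_model("xgboost_baseline.model")
pred_codes = reloaded_model.predict(X_test_encoded)
# map the codes back to job titles via the fitted encoder_y's category mapping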
Example #21
for thres in threshold:
    selection = SelectFromModel(model, threshold=thres, prefit=True)

    select_x_train = selection.transform(x_train)
    select_x_test = selection.transform(x_test)

    selection_model = LGBMClassifier(n_estimators=100,
                                     learning_rate=0.05,
                                     n_jobs=-1)

    selection_model.fit(select_x_train,
                        y_train,
                        verbose=False,
                        eval_metric=['logloss', 'error'],
                        eval_set=[(select_x_train, y_train),
                                  (select_x_test, y_test)],
                        early_stopping_rounds=20)

    y_pred = selection_model.predict(select_x_test)
    acc = accuracy_score(y_test, y_pred)

    print("Thresh=%.3f, n = %d, ACC : %.2f%%" %
          (thres, select_x_train.shape[1], acc * 100.0))

    # result = selection_model.evals_result()
    # print("eval's result : ", result)

    model.save_model("./model/sample/cancer/cancer_rmse=%.3f-r2=%.2f.model" %
                     (thres, acc))
Example #22
class XGBoost(BaseAlgorithm):
    def __init__(self, algorithm_settings, problem_type):
        super().__init__(algorithm_settings)
        self.problem_type = problem_type

    def build(self):
        if self.problem_type == SupervisedTask.regression:
            self.build_regression_model()

        elif self.problem_type == SupervisedTask.classification:
            self.build_classification_model()

        else:
            raise TypeError('Unknown problem_type')

    def build_regression_model(self):
        from xgboost import XGBRegressor
        self.model = XGBRegressor(
            max_depth=self.algorithm_settings.max_depth,
            learning_rate=self.algorithm_settings.learning_rate,
            n_estimators=self.algorithm_settings.n_estimators,
            objective=self.algorithm_settings.objective,
            booster=self.algorithm_settings.booster,
            n_jobs=self.algorithm_settings.n_jobs,
            gamma=self.algorithm_settings.gamma,
            min_child_weight=self.algorithm_settings.min_child_weight,
            max_delta_step=self.algorithm_settings.max_delta_step,
            subsample=self.algorithm_settings.subsample,
            reg_alpha=self.algorithm_settings.reg_alpha,
            reg_lambda=self.algorithm_settings.reg_lambda,
            random_state=self.algorithm_settings.random_state)

    def build_classification_model(self):
        from xgboost import XGBClassifier
        self.model = XGBClassifier(
            max_depth=self.algorithm_settings.max_depth,
            learning_rate=self.algorithm_settings.learning_rate,
            n_estimators=self.algorithm_settings.n_estimators,
            objective=self.algorithm_settings.objective,
            booster=self.algorithm_settings.booster,
            n_jobs=self.algorithm_settings.n_jobs,
            gamma=self.algorithm_settings.gamma,
            min_child_weight=self.algorithm_settings.min_child_weight,
            max_delta_step=self.algorithm_settings.max_delta_step,
            subsample=self.algorithm_settings.subsample,
            reg_alpha=self.algorithm_settings.reg_alpha,
            reg_lambda=self.algorithm_settings.reg_lambda,
            random_state=self.algorithm_settings.random_state)

    def train(self, train_x, train_y, settings):
        self.model.fit(train_x,
                       train_y,
                       eval_metric=self.algorithm_settings.eval_metric)
        self.save(settings)

    def evaluate(self, test_x):
        prediction = self.model.predict(test_x)
        prediction = prediction.reshape(-1, 1)
        return prediction

    def load(self, model_path):
        self.model.load_model(fname=model_path)

    def save(self, settings):
        model_save_dir = os.path.join(settings.models_path, 'xgboost_models')
        os.makedirs(model_save_dir, exist_ok=True)
        model_name = self.get_model_name(settings)
        save_path = os.path.join(model_save_dir, model_name)
        self.model.save_model(fname=save_path)
        print(f"Model saved to: {save_path}")

    def get_model_name(self, settings):
        if settings.problem_type == SupervisedTask.regression:
            return 'regression_model.xgb'

        else:
            return 'classification_model.xgb'
Example #23
def train_and_generate_model():

    #global log_fd
    global log_fd_opt
    global tr_input_arr
    global tr_angle_arr
    global val_input_arr
    global val_angle_arr

    data_len = len(exchange_rates)
    log_fd_tr = open("./train_progress_log_" +
                     dt.now().strftime("%Y-%m-%d_%H-%M-%S") + ".txt",
                     mode="w")

    # inner logger function for backtest
    def logfile_writeln_tr(log_str):
        nonlocal log_fd_tr
        log_fd_tr.write(log_str + "\n")
        log_fd_tr.flush()

    print("data size of rates: " + str(data_len))

    print("num of rate datas for tarin: " +
          str(COMPETITION_TRAIN_DATA_NUM_AT_RATE_ARR))

    print("input features sets for tarin: " + str(COMPETITION_TRAIN_DATA_NUM))

    logfile_writeln_tr("data size of rates: " + str(data_len))

    logfile_writeln_tr("num of rate datas for tarin: " +
                       str(COMPETITION_TRAIN_DATA_NUM_AT_RATE_ARR))

    tr_input_mat = []
    tr_angle_mat = []
    is_loaded_input_mat = False

    if os.path.exists("./tr_input_mat.pickle"):
        with open('./tr_input_mat.pickle', 'rb') as f:
            tr_input_mat = pickle.load(f)
        with open('./tr_angle_mat.pickle', 'rb') as f:
            tr_angle_mat = pickle.load(f)
        is_loaded_input_mat = True
    else:
        for i in range(DATA_HEAD_ASOBI,
                       len(exchange_rates) - DATA_HEAD_ASOBI - OUTPUT_LEN,
                       SLIDE_IDX_NUM_AT_GEN_INPUTS_AND_COLLECT_LABELS):
            tr_input_mat.append([
                exchange_rates[i],
                (exchange_rates[i] - exchange_rates[i - 1]) /
                exchange_rates[i - 1],
                get_rsi(exchange_rates, i),
                get_ma(exchange_rates, i),
                get_ma_kairi(exchange_rates, i),
                get_bb_1(exchange_rates, i),
                get_bb_2(exchange_rates, i),
                get_ema(exchange_rates, i),
                get_ema_rsi(exchange_rates, i),
                get_cci(exchange_rates, i),
                get_mo(exchange_rates, i),
                get_lw(exchange_rates, i),
                get_ss(exchange_rates, i),
                get_dmi(exchange_rates, i),
                get_vorarity(exchange_rates, i),
                get_macd(exchange_rates, i),
                str(judge_chart_type(exchange_rates[i - CHART_TYPE_JDG_LEN:i]))
            ])
            tr_input_mat.append([
                reverse_exchange_rates[i],
                (reverse_exchange_rates[i] - reverse_exchange_rates[i - 1]) /
                reverse_exchange_rates[i - 1],
                get_rsi(reverse_exchange_rates, i),
                get_ma(reverse_exchange_rates, i),
                get_ma_kairi(reverse_exchange_rates, i),
                get_bb_1(reverse_exchange_rates, i),
                get_bb_2(reverse_exchange_rates, i),
                get_ema(reverse_exchange_rates, i),
                get_ema_rsi(reverse_exchange_rates, i),
                get_cci(reverse_exchange_rates, i),
                get_mo(reverse_exchange_rates, i),
                get_lw(reverse_exchange_rates, i),
                get_ss(reverse_exchange_rates, i),
                get_dmi(reverse_exchange_rates, i),
                get_vorarity(reverse_exchange_rates, i),
                get_macd(reverse_exchange_rates, i),
                str(
                    judge_chart_type(
                        reverse_exchange_rates[i - CHART_TYPE_JDG_LEN:i]))
            ])

            tmp = exchange_rates[i + OUTPUT_LEN] - exchange_rates[i]
            if tmp >= 0:
                tr_angle_mat.append(1)
            else:
                tr_angle_mat.append(0)

            tmp = reverse_exchange_rates[
                i + OUTPUT_LEN] - reverse_exchange_rates[i]
            if tmp >= 0:
                tr_angle_mat.append(1)
            else:
                tr_angle_mat.append(0)

        if not is_loaded_input_mat:
            with open('tr_input_mat.pickle', 'wb') as f:
                pickle.dump(tr_input_mat, f)
            with open('tr_angle_mat.pickle', 'wb') as f:
                pickle.dump(tr_angle_mat, f)

    #log output for tensorboard
    #configure("logs/xgboost_trade_cpu_1")

    tr_input_arr = np.array(tr_input_mat[0:COMPETITION_TRAIN_DATA_NUM])
    tr_angle_arr = np.array(tr_angle_mat[0:COMPETITION_TRAIN_DATA_NUM])
    watchlist = None
    split_idx = COMPETITION_TRAIN_DATA_NUM + int(
        (len(tr_input_mat) - COMPETITION_TRAIN_DATA_NUM) *
        VALIDATION_DATA_RATIO)

    if VALIDATION_DATA_RATIO != 0.0:
        val_input_arr = np.array(
            tr_input_mat[COMPETITION_TRAIN_DATA_NUM:split_idx])
        val_angle_arr = np.array(
            tr_angle_mat[COMPETITION_TRAIN_DATA_NUM:split_idx])
        watchlist = [(tr_input_arr, tr_angle_arr),
                     (val_input_arr, val_angle_arr)]
    else:
        watchlist = [(tr_input_arr, tr_angle_arr)]

    start = time.time()

    if is_param_tune_with_optuna:
        log_fd_opt = open("./tune_progress_log_" +
                          dt.now().strftime("%Y-%m-%d_%H-%M-%S") + ".txt",
                          mode="w")
        study = None
        if is_use_db_at_tune:
            study = optuna.Study(study_name='fxsystrade',
                                 storage='sqlite:///../fxsystrade.db')
        else:
            study = optuna.create_study()

        parallel_num = RAPTOP_THREAD_NUM * 2
        if is_colab_cpu or is_exec_at_mba:
            parallel_num = COLAB_CPU_AND_MBA_THREAD_NUM * 2
        if special_optuna_parallel_num != -1:
            parallel_num = special_optuna_parallel_num

        study.optimize(opt, n_trials=OPTUNA_TRIAL_NUM, n_jobs=parallel_num)

        process_time = time.time() - start
        logfile_writeln_opt("best_params: " + str(study.best_params))
        logfile_writeln_opt("best_value: " + str(study.best_value))
        logfile_writeln_opt("best_trial: " + str(study.best_trial))
        logfile_writeln_opt("execution time of tune: " + str(process_time))
        log_fd_opt.flush()
        log_fd_opt.close()
        exit()

    param = {}
    n_thread = RAPTOP_THREAD_NUM

    if is_use_gpu:
        param['tree_method'] = 'gpu_hist'
        param['max_bin'] = 16
        param['gpu_id'] = 0
        n_thread = COLAB_CPU_AND_MBA_THREAD_NUM
    if is_colab_cpu or is_exec_at_mba:
        n_thread = COLAB_CPU_AND_MBA_THREAD_NUM

    logfile_writeln_tr("training parameters are below...")
    logfile_writeln_tr(str(param))
    eval_result_dic = {}
    logfile_writeln_tr("num_round: " + str(NUM_ROUND))

    clf = XGBClassifier(max_depth=MAX_DEPTH,
                        random_state=42,
                        n_estimators=NUM_ROUND,
                        min_child_weight=18,
                        subsample=0.9,
                        colsample_bytree=0.6,
                        eta=ETA,
                        objective='binary:logistic',
                        verbosity=0,
                        n_jobs=n_thread,
                        **param)

    verbose = True
    if is_use_gpu or is_colab_cpu:
        verbose = False

    clf.fit(tr_input_arr, tr_angle_arr, eval_set=watchlist, verbose=verbose)

    process_time = time.time() - start
    logfile_writeln_tr("execution time of training: " + str(process_time))

    clf.save_model('./xgb.model')
    booster = clf.get_booster()
    booster.dump_model('./xgb_model.raw.txt')

    eval_result_dic = clf.evals_result()
    for ii in range(len(eval_result_dic['validation_0']['error'])):
        if VALIDATION_DATA_RATIO != 0.0:
            logfile_writeln_tr(
                str(ii) + "," +
                str(eval_result_dic['validation_0']['error'][ii]) + "," +
                str(eval_result_dic['validation_1']['error'][ii]))
        else:
            logfile_writeln_tr(
                str(ii) + "," +
                str(eval_result_dic['validation_0']['error'][ii]))

    # Feature Importance
    fti = clf.feature_importances_
    logfile_writeln_tr('Feature Importances:')
    for i, feat in enumerate(FEATURE_NAMES):
        logfile_writeln_tr('\t{0:20s} : {1:>.6f}'.format(feat, fti[i]))

    log_fd_tr.flush()
    log_fd_tr.close()
    print("finished training and saved model.")
print("최종 정답률     : ", r2_score(y_test, y_predict))

score = accuracy_score(y_test, y_predict)

model = model.best_estimator_

thresholds = np.sort(model.feature_importances_)
print(thresholds)

n = 0
score = 0

for thresh in thresholds:
    selection = SelectFromModel(model, threshold=thresh, prefit=True)

    select_x_train = selection.transform(x_train)
    selection_model = XGBClassifier(n_jobs=-1)
    selection_model.fit(select_x_train, y_train)

    select_x_test = selection.transform(x_test)
    y_predict = selection_model.predict(select_x_test)
    acc = accuracy_score(y_test, y_predict)

    if acc * 100.0 > score:
        n = select_x_train.shape[1]
        score = acc * 100.0
        L_selection = selection
        selection_model.save_model("./save/xgb_save/ml37_3_cancer.xgb.model")
        print("Thresh=%.3f, n=%d, acc: %.2f%%" %
              (thresh, select_x_train.shape[1], score))
Example #25
    def tree_optimization(self, learning_rate, gamma, max_depth, subsample,
                          reg_lambda, num_parallel_tree, min_child_weight):

        x_train, x_eval, y_train, y_eval = train_test_split(
            self.features,
            self.labels,
            test_size=0.2,
            shuffle=True,
            stratify=self.labels,
            random_state=self.seed)

        x_train, x_test, y_train, y_test = train_test_split(
            x_train,
            y_train,
            test_size=0.2,
            shuffle=True,
            stratify=y_train,
            random_state=self.seed)
        booster_params = {
            'n_estimators': 500,
            'learning_rate': learning_rate,
            'gamma': gamma,
            'max_depth': int(np.around(max_depth)),
            'subsample': subsample,
            'sampling_method': 'gradient_based',
            'reg_lambda': reg_lambda,
            'min_child_weight': int(np.around(min_child_weight)),
            'num_parallel_tree': int(np.around(num_parallel_tree)),
            'objective': 'binary:logistic',
            'verbosity': 1,
            'max_delta_step': 1
        }

        print("generating model")
        model = XGBClassifier(**booster_params)
        model.fit(X=x_train,
                  y=y_train,
                  eval_set=[(x_test, y_test)],
                  eval_metric=f1_eval,
                  early_stopping_rounds=25)

        preds = model.predict(x_eval)
        current_score = f1_score(y_eval, preds)

        print(f"eval ROC score: {current_score}")

        if current_score > self.best_score:
            print(f"Score increased to {current_score} from {self.best_score}")
            self.best_score = current_score
            sub_pred = model.predict(self.pred)
            model.save_model(
                f'logs/f1_{self.run_num}_{self.best_score}_{self.tree_no}_{self.model_type}_thresh.model'
            )
            savetxt(
                f'logs/{self.run_num}_{self.tree_no}_{self.model_type}_preds.txt',
                sub_pred,
                delimiter=',')

        booster_params['f1'] = current_score
        booster_params['tree_no'] = self.tree_no
        record_history(booster_params)
        self.tree_no += 1
        return current_score
Example #26
                                                      test_size=0.2,
                                                      random_state=24)

# .as_matrix() was removed in pandas 1.0; .to_numpy() is the replacement
x_train = x_train.to_numpy()
y_train = y_train.to_numpy()
x_valid = x_valid.to_numpy()
y_valid = y_valid.to_numpy()

clf = XGBClassifier(booster='gbtree',
                    nthread=4,
                    learning_rate=0.1,
                    min_child_weight=1,
                    max_depth=5,
                    subsample=0.8,
                    colsample_bytree=0.8,
                    scale_pos_weight=1,
                    objective='binary:logistic',
                    seed=24)
eval_set = [(x_valid, y_valid)]
clf.fit(x_train,
        y_train,
        early_stopping_rounds=100,
        eval_metric='logloss',
        eval_set=eval_set,
        verbose=True)
clf.save_model("./model/xgboost.model")
pre = clf.predict(x_valid)
pro = clf.predict_proba(x_valid)[:, 1]

print("Accuracy: %f" % accuracy_score(y_valid, pre))
print("AUC Score: %f" % roc_auc_score(y_valid, pro))
Example #27
def train_xgboost(data: IrisData, artifacts_folder: str):
    clf = XGBClassifier()
    clf.fit(data.X, data.y)
    clf.save_model(f"{artifacts_folder}/{XGBoostFolder}/model.bst")
Example #28
# rmse, mae, logloss, error (error is essentially 1 - accuracy), auc (a companion to accuracy)

results = model.evals_result()
# print("eval's results : ", results)

# print("r2 Score : %.2f%%:" %(r2*100.0))

y_pred = model.predict(x_test)
acc = accuracy_score(y_test, y_pred)
print("acc : ", acc)

#####################################################################################################
# import pickle   # provided by the Python standard library

# from joblib import dump, load
# import joblib
# pickle.dump(model, open("./model/xgb_save/cancer.pickle.dat", "wb"))  # save in binary-write mode
# joblib.dump(model, "./model/xgb_save/cancer.joblib.dat")
model.save_model("./model/xgb_save/cancer.xgb.model")
print("saved.")

# model2 = pickle.load(open("./model/xgb_save/cancer.pickle.dat", "rb"))
# model2 = joblib.load("./model/xgb_save/cancer.joblib.dat")
model2 = XGBClassifier()
model2.load_model("./model/xgb_save/cancer.xgb.model")
print('loaded.')

y_pred = model2.predict(x_test)
acc = accuracy_score(y_test, y_pred)
print("acc : ", acc)
Example #29
class Classifier:

    # for initializing train and test sets, classifier and accuracy score
    # Change method to gpu_hist if you want xgboost to run on a GPU
    def __init__(self,
                 params={
                     'objective': 'reg:squarederror',
                     'verbosity': 0
                 }):
        self.X_train = []
        self.X_labels = []
        self.test = []
        self.test_labels = []
        self.model = XGBClassifier(**params)
        self.prediction = 0
        self.error = 0

    def size(self):
        if isinstance(self.X_train, np.ndarray):
            return self.X_train.size
        return len(self.X_train)

    # adding the data points
    def input_train(self, features, feature):
        if isinstance(self.X_train, np.ndarray) and self.X_train.size > 0:
            self.X_train = self.X_train.tolist()
            self.X_labels = self.X_labels.tolist()
        self.X_train.append(features)
        self.X_labels.append(feature)

    # train the data
    def train(self):
        self.X_train = np.asarray(self.X_train)
        self.X_labels = np.asarray(self.X_labels)
        self.model.fit(self.X_train, self.X_labels)

    def train_eval(self, metric='error'):
        self.X_train = np.asarray(self.X_train)
        self.X_labels = np.asarray(self.X_labels)
        X_train, X_test, y_train, y_test = train_test_split(self.X_train,
                                                            self.X_labels,
                                                            test_size=0.33)
        self.model.fit(X_train,
                       y_train,
                       eval_set=[(X_train, y_train), (X_test, y_test)],
                       eval_metric=metric)
        evals_result = self.model.evals_result()
        if metric == 'error':
            validations = []
            for val in evals_result.values():
                lst = val.get("error")
                validations.append(sum(lst) / len(lst))
            return 1 - (sum(validations) / len(validations))
        else:
            validations = []
            for val in evals_result.values():
                lst = val.get(metric)
                validations.append(lst[-1])
            return validations

    # input test labels if you want to check accuracy
    def label(self, label):
        self.test_labels.append(label)

    def input_test(self, features):
        if isinstance(self.test, np.ndarray) and self.test.size > 0:
            self.test = self.test.tolist()
        self.test.append(features)

    # test data
    def predict(self):
        if not isinstance(self.test, np.ndarray):
            self.test = np.asarray(self.test)
        self.prediction = self.model.predict(self.test)
        return self.prediction

    def predict_proba(self):
        if not isinstance(self.test, np.ndarray):
            self.test = np.asarray(self.test)
        self.prediction = self.model.predict_proba(self.test)
        return self.prediction

    # if you have the test labels you can check the error rate (you want error close to 0)
    def check_error(self):
        self.test_labels = np.asarray(self.test_labels)
        self.error = metrics.mean_absolute_error(self.test_labels,
                                                 self.prediction)
        return self.error

    # save classifier
    def save_classifier(self, file):
        self.model.save_model(file)

    # open saved classifier
    def open_classifier(self, file):
        self.model.load_model(file)

    # removes all training data
    def clean_train(self):
        self.X_train = []
        self.X_labels = []

    # removes all testing data
    def clean_test(self):
        self.test = []
        self.test_labels = []
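A short usage sketch of this Classifier on fabricated toy data (the params override is an assumption to get a conventional binary objective; the class otherwise defaults to reg:squarederror):

# Toy round trip with the Classifier above; data and file name are made up.
clf = Classifier(params={'objective': 'binary:logistic', 'verbosity': 0})
for features, label in [([0.1, 1.2], 0), ([0.9, 0.3], 1),
                        ([0.2, 1.1], 0), ([1.0, 0.2], 1)]:
    clf.input_train(features, label)
clf.train()

clf.input_test([0.15, 1.15])
clf.label(0)
print(clf.predict())        # predicted class for the queued test point
print(clf.check_error())    # MAE against the supplied test labels
clf.save_classifier("toy.model")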
Example #30
class Xgboost(object):
    def __init__(self,
                 task="cla",
                 module_type="performance",
                 compute_task="cpu",
                 **params):
        """
        :param task: "cla" (classification) or "reg" (regression)
        :param module_type: "debug", "performance", or "balance"
        :param compute_task: "cpu" or "gpu"
        :param params: extra booster parameters
        """
        assert task in ["cla", "reg"]
        assert module_type in ["debug", "performance", "balance"]
        assert compute_task in ["cpu", "gpu"]

        self.task = task
        self.module_type = module_type  # module type
        if self.module_type == "debug":
            params["n_jobs"] = 1
        elif self.module_type == "performance":
            params["n_jobs"] = cpu_count()  # number of CPU cores
        else:  # balance mode
            params["n_jobs"] = cpu_count() // 2
        self.compute_task = compute_task

        if self.compute_task == "gpu":  # use the GPU
            params["tree_method"] = "gpu_hist"
        else:  # default: CPU
            params["tree_method"] = "hist"  # CPU histogram method

        if self.task == "reg":  # 做回归任务
            self.model = XGBRegressor(
                learning_rate=params.get("learning_rate", 0.3),
                n_estimators=params.get("n_estimators", 100),  # 树的个数100,即代数
                max_depth=params.get("max_depth", 6),  # 树的深度
                min_child_weight=params.get("min_child_weight", 1),  # 叶子节点最小权重
                n_jobs=params.get("n_jos", None),  # 线程数
                gamma=params.get("gamma", 0),  # 惩罚项中叶子节点个数前的参数
                reg_lambda=params.get("lambda", 1),  # lambda
                reg_alpha=params.get("alpha", 0),
                tree_method=params.get("tree_method", "auto"),
                subsample=params.get("subsample", 1),  # 随机选择100%样本建立决策树
                colsample_bytree=1,  # 随机选择80%特征建立决策树
                objective=params.get("objective",
                                     "reg:squarederror"),  # 指定损失函数
                # num_class=params.get("num_class", 2),  # 不指定即为2分类
                booster=params.get("booster", "gbtree"),  # 使用的提升器
                scale_pos_weight=1,  # 解决样本不平衡问题
                random_state=27,  # 随机数
            )

        else:  # classification task
            self.model = XGBClassifier(
                learning_rate=params.get("learning_rate", 0.3),
                n_estimators=params.get("n_estimators", 100),  # number of boosting rounds (trees)
                max_depth=params.get("max_depth", 6),  # tree depth
                min_child_weight=params.get("min_child_weight", 1),  # minimum child (leaf) weight
                n_jobs=params.get("n_jobs", None),  # number of threads
                gamma=params.get("gamma", 0),  # penalty on the number of leaves
                reg_lambda=params.get("lambda", 1),  # L2 regularization (lambda)
                reg_alpha=params.get("alpha", 0),
                tree_method=params.get("tree_method", "auto"),  # tree method, default "auto"
                subsample=params.get("subsample", 1),  # fraction of samples per tree
                colsample_bytree=1,  # fraction of features per tree (here: all of them)
                objective=params.get("objective", "multi:softmax"),
                # loss function; 'binary:logistic' is binary cross-entropy

                # num_class=params.get("num_class", 2),  # defaults to binary if unset
                booster=params.get("booster", "gbtree"),  # booster type
                scale_pos_weight=1,  # for imbalanced samples
                random_state=27,  # random seed
            )
        """
        Objective function types
        (see https://xgboost.readthedocs.io/en/latest/parameter.html)
        objective: default reg:squarederror
        reg:squarederror: squared error regression
        reg:squaredlogerror: squared error on the log of the target
        reg:logistic: logistic regression
        binary:logistic: binary classification via logistic regression; outputs probabilities
        binary:logitraw: binary classification; outputs the raw score before the logistic transform
        binary:hinge: hinge loss for binary classification; predicts 0 or 1 rather than probabilities
        multi:softmax: multi-class classification; requires num_class
        multi:softprob: outputs an ndata * nclass matrix of per-class probabilities
        """

    def train(self,
              x_train,
              y_train=None,
              sample_weight=None,
              base_margin=None,
              eval_set=None,
              eval_metric=None,
              early_stopping_rounds=None,
              verbose=True,
              sample_weight_eval_set=None):
        # print(self.model)
        """
        :param x_train: feature matrix (array)
        :param y_train: labels (array)
        :param eval_metric: evaluation metric(s)
        :return:
        """
        # early stopping is enabled by default

        # eval_metric in ["rmse","rmsle","mae","logloss","error","error@t", "merror","mlogloss","auc","aucpr",
        #                 "ndcg","map","ndcg@n", "map@n","ndcg-", "map-", "ndcg@n-", "map@n-","poisson-nloglik",
        #                 "gamma-nloglik","cox-nloglik","gamma-deviance","tweedie-nloglik","aft-nloglik"]
        # eval_metric may be a single string or a list of strings

        if eval_metric:  # when evaluation is requested,
            assert eval_set  # make sure an eval set was provided

        self.model.fit(X=x_train,
                       y=y_train,
                       sample_weight=sample_weight,
                       base_margin=base_margin,
                       eval_set=eval_set,
                       eval_metric=eval_metric,
                       early_stopping_rounds=early_stopping_rounds,
                       verbose=verbose,
                       sample_weight_eval_set=sample_weight_eval_set)

        # early_stopping_rounds=10  # default early-stopping patience
        # verbose=True  # whether to print progress

    def plot_loss(self):  # plot the loss curve
        result = self.model.evals_result()  # fetch evaluation results
        epochs = len(result["validation_0"]["rmse"])
        x_axis = range(0, epochs)
        # draw the loss curves
        figure, ax = plt.subplots()
        ax.plot(x_axis, result["validation_0"]["rmse"], label="Train")
        ax.plot(x_axis, result["validation_1"]["rmse"], label="Test")
        ax.legend()
        plt.ylabel("loss")
        plt.title("Xgboost RMSE")
        plt.show()

    def predict(self, x_test):
        """
        :param x_test: np.array or scipy.sparse input to predict on
        :return: predictions
        """
        my_pred = self.model.predict(data=x_test,
                                     output_margin=False,
                                     validate_features=True,
                                     base_margin=None)
        return my_pred

    def plt_importance(self, figure_path=None, ifsave=True):  # plot feature importances
        """
        :param figure_path: path for saving the figure
        :param ifsave: whether to save the figure
        :return:
        """
        # plot feature importances
        fig, ax = plt.subplots(figsize=(15, 15))
        plot_importance(self.model, height=0.5, ax=ax,
                        max_num_features=64)  # plot at most 64 features
        if ifsave:
            if not figure_path:
                plt.savefig(
                    "../model/XGBboost_model/Xgboost_featute_importance_before.png"
                )
            else:
                plt.savefig(figure_path)
        plt.show()  # display the figure

    def _plt_importance_v1(self,
                           columns_name,
                           figure_path=None,
                           ifsave=True):  # plot importances with the real column names
        fig, ax = plt.subplots(figsize=(15, 15))
        plot_importance_v1(self.model,
                           model_name="xgb",
                           columns_name=columns_name,
                           height=0.5,
                           ax=ax,
                           max_num_features=64)  # plot at most 64 features
        if ifsave:
            if not figure_path:
                plt.savefig(
                    "../model/XGBboost_model/Xgboost_featute_importance_after.png"
                )
            else:
                plt.savefig(figure_path)
        plt.show()  # display the figure

    def plt_tree(self, num_tree):  # plot a single tree
        """
        :param num_tree: index of the target tree
        :return:
        """
        plot_tree(booster=self.model, num_trees=num_tree)

    def plot_graphviz(self, num_tree):  # render a tree with graphviz
        to_graphviz(self.model, num_trees=num_tree)

    # get feature importances
    def get_importance(self):
        return self.model.feature_importances_

    # evaluation helper
    def evaluate(self, y_test, my_pred, evalue_fun="mse"):
        if evalue_fun == "acc":  # accuracy (classification metric)
            result = accuracy_score(y_true=y_test, y_pred=my_pred)
            print("accuracy:%.2f" % (result * 100.0))
        elif evalue_fun == "auc":  # AUC (classification metric)
            result = roc_auc_score(y_true=y_test, y_score=my_pred)
            print("auc:%.2f" % (result))
        elif evalue_fun == "mae":  # mean absolute error (regression metric)
            result = mean_absolute_error(y_true=y_test, y_pred=my_pred)
            print("mae:%.2f" % (result))
        elif evalue_fun == "median_ae":  # median absolute error (regression metric)
            result = median_absolute_error(y_true=y_test, y_pred=my_pred)
            print("median_ae:%.2f" % (result))
        elif evalue_fun == "r2_score":  # R-squared (regression metric)
            result = r2_score(y_true=y_test, y_pred=my_pred)
            print("r2_score:%.2f" % (result))
        elif evalue_fun == "evs":  # explained variance (regression metric)
            result = explained_variance_score(y_true=y_test, y_pred=my_pred)
            print("explained_variance_score:%.2f" % (result))
        elif evalue_fun == "aps":  # average precision (AP) from scores (classification metric)
            result = average_precision_score(y_true=y_test,
                                             y_score=my_pred,
                                             average="macro",
                                             sample_weight=None)
            print("average_precision_score:%.2f" % (result))
        elif evalue_fun == "bsl":
            result = brier_score_loss(y_true=y_test,
                                      y_prob=my_pred,
                                      sample_weight=None,
                                      pos_label=None)
            print("brier_score_loss:%.2f" % (result))
        elif evalue_fun == "cmt":  # confusion matrix (classification metric)
            result = confusion_matrix(y_true=y_test,
                                      y_pred=my_pred,
                                      labels=None,
                                      sample_weight=None)
            print("confusion_matrix:\n%s" % (result,))
        elif evalue_fun == "f1_score":  # F1 score (classification metric)
            result = f1_score(y_true=y_test,
                              y_pred=my_pred,
                              labels=None,
                              pos_label=1,
                              average="binary",
                              sample_weight=None)
            print("f1_score:%.2f" % (result))
        elif evalue_fun == "log_loss":  # cross-entropy loss (classification metric)
            result = log_loss(y_true=y_test,
                              y_pred=my_pred,
                              eps=1e-15,
                              normalize=True,
                              sample_weight=None,
                              labels=None)
            print("log_loss:%.2f" % (result))
        elif evalue_fun == "precision_score":  # precision (classification metric)
            result = precision_score(y_true=y_test,
                                     y_pred=my_pred,
                                     labels=None,
                                     pos_label=1,
                                     average="binary")
            print("precision_score:%.2f" % (result))
        elif evalue_fun == "recall_score":  # recall (classification metric)
            result = recall_score(y_true=y_test,
                                  y_pred=my_pred,
                                  labels=None,
                                  pos_label=1,
                                  average="binary",
                                  sample_weight=None)
            print("recall_score:%.2f" % (result))
        elif evalue_fun == "roc_auc_score":  # area under the ROC curve (classification metric)
            result = roc_auc_score(y_true=y_test,
                                   y_score=my_pred,
                                   average="macro",
                                   sample_weight=None)
            print("roc_auc_score:%.2f" % (result))
        elif evalue_fun == "roc_curve":  # points of the ROC curve (classification metric)
            fpr, tpr, thresholds = roc_curve(y_true=y_test,
                                             y_score=my_pred,
                                             pos_label=None,
                                             sample_weight=None,
                                             drop_intermediate=True)
            result = (fpr, tpr, thresholds)
        else:  # default: mean squared error (regression metric)
            result = mean_squared_error(y_true=y_test, y_pred=my_pred)
            print("mse:%.2f" % (result))
        return result

    def save_model(self, save_params):  # save the model
        self.model.save_model(
            fname=save_params.get(
                "fname",
                "../model/XGBboost_model/XGboostmodel.model")  # output file path
            # format=save_params.get("format", "cbm"),  # saved data format
            # pool=save_params.get("pool", None)  # training pool; not needed when saving to json
        )
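Finally, a hedged usage sketch of this wrapper on synthetic arrays; the data, metric, and output path are illustrative assumptions:

# Toy usage of the Xgboost wrapper above; everything below is fabricated for illustration.
import numpy as np

x = np.random.rand(200, 5)
y = (x[:, 0] > 0.5).astype(int)

booster = Xgboost(task="cla", module_type="performance", compute_task="cpu",
                  objective="binary:logistic")
booster.train(x[:150], y[:150],
              eval_set=[(x[:150], y[:150]), (x[150:], y[150:])],
              eval_metric="logloss",
              early_stopping_rounds=10,
              verbose=False)
pred = booster.predict(x[150:])
booster.evaluate(y[150:], pred, evalue_fun="acc")
booster.save_model({"fname": "XGboostmodel.model"})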