class Classifier(object):
    """Multi-class XGBoost wrapper that trains on and predicts a ``relation`` column.

    The constructor stores the configuration, the task name and optional
    train/test frames, fixes the list of feature columns, and builds an
    ``XGBClassifier`` configured for six classes (``multi:softmax``).
    """

    def __init__(self, conf, task, train=None, test=None):
        """Store inputs and build the underlying estimator.

        Args:
            conf: Object exposing ``model_path``, a format string that is
                filled with ``task`` to obtain the model file location.
            task: Value substituted into ``conf.model_path``.
            train: Frame indexed by column name that holds the feature
                columns and the ``relation`` label; read by ``train()``.
            test: Frame holding the feature columns; read by ``test()``.
        """
        self.conf = conf
        self.task = task
        self.train_ = train
        self.test_ = test
        # Column order here is the feature order fed to the model.
        self.features = [
            "hasWith", "hasIn", "simiBucket", "textPos", "hasOf", "hasAnd",
            "startEntity", "distance", "hasFrom", "endEntity", "similarity",
            "hasThan", "hasVerb",
        ]
        self.labels = ["relation"]
        # Not referenced by any method of this class.
        self.num_round = 500
        self.eval_set = []
        self.early_stopping_rounds = 20
        self.classifier = XGBClassifier(
            max_depth=4,
            learning_rate=0.1,
            n_estimators=1000,
            gamma=4,
            verbosity=1,
            objective='multi:softmax',
            num_class=6,
            booster='gbtree',
            n_jobs=4,
            seed=27,
        )

    def train(self):
        """Fit the classifier on a 60/40 split of the training frame and save it.

        The held-out 40% is used only as the second evaluation set for early
        stopping on the ``merror`` metric. The fitted model is written to
        ``conf.model_path.format(task)``.

        Returns:
            The fixed status string ``'Model has been saved!'``.
        """
        train_X, test_X, train_y, test_y = train_test_split(
            self.train_[self.features],
            self.train_[self.labels],
            test_size=0.4,
            random_state=42,
        )
        self.eval_set = [
            (train_X.values, train_y.values),
            (test_X.values, test_y.values),
        ]
        # NOTE(review): passing eval_metric / early_stopping_rounds to fit()
        # depends on the installed xgboost version -- confirm before upgrading.
        self.classifier.fit(
            train_X.values,
            train_y.values,
            eval_metric='merror',
            eval_set=self.eval_set,
            early_stopping_rounds=self.early_stopping_rounds,
            verbose=True,
        )
        self.classifier.save_model(self.conf.model_path.format(self.task))
        return 'Model has been saved!'

    def test(self):
        """Load the saved model and predict a relation for every test row.

        Returns:
            A DataFrame with the feature columns of the test frame plus a
            ``relation`` column of predictions, indexed like the test frame.
        """
        test_features = self.test_[self.features]
        self.classifier.load_model(self.conf.model_path.format(self.task))
        # A fitted encoder is attached to the private ``_le`` attribute before
        # predicting; presumably this lets predict() map class indices back to
        # relation names after load_model -- version dependent, TODO confirm.
        self.classifier._le = LabelEncoder().fit([
            'USAGE', 'TOPIC', 'MODEL-FEATURE', 'PART_WHOLE', 'RESULT',
            'COMPARE',
        ])
        pred = self.classifier.predict(test_features.values)
        # concat(axis=1) aligns on index labels, so the prediction frame must
        # reuse the test frame's index; a default RangeIndex would misalign
        # rows (and introduce NaNs) whenever the test frame is not 0..n-1.
        relation = pd.DataFrame(
            pred, columns=["relation"], index=self.test_.index)
        predictions = pd.concat([test_features, relation], axis=1)
        return predictions
def xgboost_train(X_train, X_test, y_train, y_test, model_path):
    """Fit a default XGBClassifier, print hold-out metrics, and save the model.

    Args:
        X_train: Training features passed to ``fit``.
        X_test: Features the fitted model predicts on.
        y_train: Training labels passed to ``fit``.
        y_test: True labels compared against the predictions.
        model_path: Destination handed to ``save_model``.

    Returns:
        The accuracy on the test data as a percentage
        (``accuracy_score`` multiplied by 100).
    """
    print('Training by xgb')
    # default is binary:logistic
    classifier = XGBClassifier(use_label_encoder=False)
    classifier.fit(X_train, y_train)

    predicted = classifier.predict(X_test)
    report = classification_report(y_test, predicted)
    print('Classification report:\n', report)

    # Despite the metric being accuracy, the value is reported as a percentage.
    accuracy_pct = accuracy_score(y_test, predicted) * 100
    print(f'Accuracy score: {accuracy_pct}')

    print('Save model to ', model_path)
    classifier.save_model(model_path)
    return accuracy_pct
GS_XGB.best_score_)) print("The best params found in epoch {} is:\n{}".format(e, ind_params)) if (old_params == ind_params) and (e >= 3): best_params_['GDM'] = ind_params break old_params = ind_params.copy() random.shuffle(cv_params_list_copy) print('----{}_{} TRAINING FINISHED----'.format('GDM', '1')) best_params_file = pd.DataFrame.from_dict(best_params_) clf = XGBClassifier(random_state=1, scale_pos_weight=12.026280323450134, n_estimators=200, max_depth=2, min_child_weight=29, colsample_bytree=0.7, subsample=1, gamma=0, reg_alpha=5, reg_lambda=5, learning_rate=0.1, n_jobs=-1).fit(X_train, y_train) pred_train, pred_test = clf.predict_proba(X_train), clf.predict_proba(X_test) score_train, score_test = roc_auc_score(y_train, pred_train[:, 1]), roc_auc_score( y_test, pred_test[:, 1]) print(score_train, score_test) clf.save_model("G:/Thesis/code/xgboost.model")
gc.collect() print('Training by xgb') # default is binary:logistic train_model = XGBClassifier(use_label_encoder=False).fit(X_train, y_train) pred = train_model.predict(X_test) print('Classification report:\n', classification_report(y_test, pred)) print(f'Accuracy score: {accuracy_score(y_test, pred) * 100}') del train_df gc.collect() print('Save model to ', MODEL_PATH) train_model.save_model(MODEL_PATH) print('Prepare online serving') print('Deploy sql') connection.execute("SET @@execute_mode='online';") connection.execute(f'USE {DB_NAME}') nothrow_execute(f'DROP DEPLOYMENT {DEPLOY_NAME}') deploy_sql = f"""DEPLOY {DEPLOY_NAME} {sql_part}""" print(deploy_sql) connection.execute(deploy_sql) print('Import data to online') # online feature extraction needs history data # set job_timeout bigger if the `LOAD DATA` job timeout connection.execute( f"LOAD DATA INFILE 'file://{os.path.abspath('train_sample.csv')}' "
thresh = 0.01702188141644001 selection = SelectFromModel(model, threshold=thresh, prefit=True) select_X = selection.transform(X) model = XGBClassifier( max_depth=3, learning_rate=0.01, verbosity=None, # might throw an error, just put it as 0 if it does objective= 'binary:logistic', booster='gbtree', tree_method='exact', n_jobs=0, gamma=0.0, min_child_weight=4, max_delta_step=0, subsample=.85, colsample_bytree=0.6, colsample_bynode=1, reg_alpha=0.1, reg_lambda=100, scale_pos_weight=1, base_score=0.5, random_state=812) model = model.fit(select_X, y, eval_metric='auc') ''' model.save_model('FULLNBAMODEL2010-2020test.model') print("model complete")