Example #1
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier


class Classifier(object):
    def __init__(self, conf, task, train=None, test=None):
        self.conf = conf
        self.task = task
        self.train_ = train
        self.test_ = test
        self.features = [
            "hasWith", "hasIn", "simiBucket", "textPos", "hasOf", "hasAnd",
            "startEntity", "distance", "hasFrom", "endEntity", "similarity",
            "hasThan", "hasVerb"
        ]
        self.labels = ["relation"]
        self.num_round = 500
        self.eval_set = list()
        self.early_stopping_rounds = 20
        self.classifier = XGBClassifier(max_depth=4,
                                        learning_rate=0.1,
                                        n_estimators=1000,
                                        gamma=4,
                                        verbosity=1,
                                        objective='multi:softmax',
                                        num_class=6,
                                        booster='gbtree',
                                        n_jobs=4,
                                        seed=27)

    def train(self):
        train_X, test_X, train_y, test_y = train_test_split(
            self.train_[self.features],
            self.train_[self.labels],
            test_size=0.4,
            random_state=42)

        self.eval_set = [(train_X.values, train_y.values),
                         (test_X.values, test_y.values)]
        self.classifier.fit(train_X.values,
                            train_y.values,
                            eval_metric='merror',
                            eval_set=self.eval_set,
                            early_stopping_rounds=self.early_stopping_rounds,
                            verbose=True)

        self.classifier.save_model(self.conf.model_path.format(self.task))
        return 'Model has been saved!'

    def test(self):
        test_set = self.test_[self.features].values
        self.classifier.load_model(self.conf.model_path.format(self.task))
        # load_model does not restore the sklearn label encoder, so rebuild
        # it by hand so that predict() returns the original string labels.
        self.classifier._le = LabelEncoder().fit([
            'USAGE', 'TOPIC', 'MODEL-FEATURE', 'PART_WHOLE', 'RESULT',
            'COMPARE'
        ])
        pred = self.classifier.predict(test_set)
        predictions = pd.concat(
            [self.test_[self.features],
             pd.DataFrame(pred, columns=["relation"])],
            axis=1)

        return predictions
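
For orientation, here is a minimal, hypothetical usage sketch of the class above. It assumes an xgboost 1.x-era build whose fit() still accepts eval_metric and early_stopping_rounds (as the class itself does); the conf object, the "semeval" task name, and the synthetic DataFrame are stand-ins, not part of the original code:

import numpy as np
import pandas as pd
from types import SimpleNamespace

feats = ["hasWith", "hasIn", "simiBucket", "textPos", "hasOf", "hasAnd",
         "startEntity", "distance", "hasFrom", "endEntity", "similarity",
         "hasThan", "hasVerb"]
rng = np.random.default_rng(0)
df = pd.DataFrame(rng.random((300, len(feats))), columns=feats)
df["relation"] = rng.integers(0, 6, size=300)   # six classes, matching num_class=6

conf = SimpleNamespace(model_path="relation_model_{}.json")  # hypothetical config
clf = Classifier(conf, task="semeval", train=df, test=df[feats])
clf.train()                   # fits with early stopping, then saves the model
predictions = clf.test()      # reloads the model and predicts string labels
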
Example #2
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier


def xgboost_train(X_train, X_test, y_train, y_test, model_path):
    print('Training by xgb')
    # default is binary:logistic
    train_model = XGBClassifier(use_label_encoder=False).fit(X_train, y_train)
    pred = train_model.predict(X_test)
    print('Classification report:\n', classification_report(y_test, pred))
    acc = accuracy_score(y_test, pred) * 100  # percentage accuracy, not AUC
    print(f'Accuracy score: {acc}')

    print('Save model to ', model_path)
    train_model.save_model(model_path)
    return acc
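
A hedged usage sketch for the helper above, on synthetic data; the file name is a placeholder, and use_label_encoder=False assumes an xgboost 1.x build where that flag is still recognized:

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=1000, n_features=20, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)
acc = xgboost_train(X_train, X_test, y_train, y_test, 'xgb_binary.json')
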
Example #3
    # (The grid-search setup before this fragment is truncated in the source;
    # it apparently loops over epochs `e`, refitting a GridSearchCV named
    # GS_XGB on shuffled parameter groups and folding each winner into
    # ind_params.)
    print("The best score found in epoch {} is:\n{}".format(
        e, GS_XGB.best_score_))
    print("The best params found in epoch {} is:\n{}".format(e, ind_params))
    if (old_params == ind_params) and (e >= 3):
        best_params_['GDM'] = ind_params
        break
    old_params = ind_params.copy()
    random.shuffle(cv_params_list_copy)
print('----{}_{} TRAINING FINISHED----'.format('GDM', '1'))
best_params_file = pd.DataFrame.from_dict(best_params_)

clf = XGBClassifier(random_state=1,
                    scale_pos_weight=12.026280323450134,
                    n_estimators=200,
                    max_depth=2,
                    min_child_weight=29,
                    colsample_bytree=0.7,
                    subsample=1,
                    gamma=0,
                    reg_alpha=5,
                    reg_lambda=5,
                    learning_rate=0.1,
                    n_jobs=-1).fit(X_train, y_train)
pred_train, pred_test = clf.predict_proba(X_train), clf.predict_proba(X_test)

score_train = roc_auc_score(y_train, pred_train[:, 1])
score_test = roc_auc_score(y_test, pred_test[:, 1])
print(score_train, score_test)

clf.save_model("G:/Thesis/code/xgboost.model")
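
To reuse the saved booster later, a fresh wrapper can load it back. A minimal sketch, assuming the X_test and y_test arrays from above are still in scope:

clf_restored = XGBClassifier()
clf_restored.load_model("G:/Thesis/code/xgboost.model")
print(roc_auc_score(y_test, clf_restored.predict_proba(X_test)[:, 1]))
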
Example #4
import gc
import os

from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier

gc.collect()

print('Training by xgb')

# default is binary:logistic
train_model = XGBClassifier(use_label_encoder=False).fit(X_train, y_train)
pred = train_model.predict(X_test)
print('Classification report:\n', classification_report(y_test, pred))
print(f'Accuracy score: {accuracy_score(y_test, pred) * 100}')

del train_df
gc.collect()

print('Save model to ', MODEL_PATH)
train_model.save_model(MODEL_PATH)

print('Prepare online serving')

print('Deploy sql')
connection.execute("SET @@execute_mode='online';")
connection.execute(f'USE {DB_NAME}')
nothrow_execute(f'DROP DEPLOYMENT {DEPLOY_NAME}')
deploy_sql = f"""DEPLOY {DEPLOY_NAME} {sql_part}"""
print(deploy_sql)
connection.execute(deploy_sql)
print('Import data to online')
# online feature extraction needs history data
# set job_timeout bigger if the `LOAD DATA` job timeout
connection.execute(
    f"LOAD DATA INFILE 'file://{os.path.abspath('train_sample.csv')}' "
Example #5
from sklearn.feature_selection import SelectFromModel
from xgboost import XGBClassifier

thresh = 0.01702188141644001

# `model` here is an earlier, already-fitted XGBClassifier (its definition
# is not shown); SelectFromModel keeps the features whose importance meets
# the threshold.
selection = SelectFromModel(model, threshold=thresh, prefit=True)
select_X = selection.transform(X)


model = XGBClassifier(max_depth=3,
                      learning_rate=0.01,
                      verbosity=None,  # some versions reject None; use 0 instead
                      objective='binary:logistic',
                      booster='gbtree',
                      tree_method='exact',
                      n_jobs=0,
                      gamma=0.0,
                      min_child_weight=4,
                      max_delta_step=0,
                      subsample=0.85,
                      colsample_bytree=0.6,
                      colsample_bynode=1,
                      reg_alpha=0.1,
                      reg_lambda=100,
                      scale_pos_weight=1,
                      base_score=0.5,
                      random_state=812)

model = model.fit(select_X, y, eval_metric='auc')

model.save_model('FULLNBAMODEL2010-2020test.model')
print("model complete")