def train_for_threshold(self, features, target='label', num=35000):
    """Train LightGBM on an ID-based split and search the decision threshold.

    Rows of ``self.train_`` with ``ID < num`` form the training part and rows
    with ``ID >= num`` form the validation part. The model is trained with
    ``self.params``, the validation part is scored, and the scores are passed
    through ``sort_val`` and ``find_threshold`` to obtain the threshold.

    Args:
        features: Column labels of ``self.train_`` used as model inputs.
        target: Column holding the label; its values are cast to ``uint8``.
        num: ID cut-off separating training rows from validation rows.

    Returns:
        The threshold reported by ``find_threshold``; it is also stored on
        ``self.threshold``.

    Raises:
        ValueError: If ``num`` leaves the training or validation part empty.
    """
    train_df = self.train_[self.train_.ID < num]
    val_df = self.train_[self.train_.ID >= num]

    # Fail early with a readable message instead of an obscure error from the
    # booster (or the threshold search) when the cut-off does not split the data.
    if len(train_df) == 0 or len(val_df) == 0:
        raise ValueError(
            'num={!r} leaves an empty split (train rows: {}, validation rows: {})'
            .format(num, len(train_df), len(val_df))
        )

    X_train = train_df[features].values
    y_train = train_df[target].values.astype('uint8')
    X_eval = val_df[features].values
    y_eval = val_df[target].values.astype('uint8')

    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_eval, y_eval, reference=lgb_train)

    # NOTE(review): the early_stopping_rounds / verbose_eval keywords were
    # dropped from lgb.train in LightGBM 4.0 in favour of callbacks - confirm
    # against the installed version before upgrading.
    lgb_model = lgb.train(
        self.params,
        lgb_train,
        num_boost_round=10000,
        valid_sets=[lgb_train, lgb_eval],
        valid_names=['train', 'valid'],
        early_stopping_rounds=100,
        verbose_eval=1000,
    )
    y_pred = lgb_model.predict(X_eval)

    # Ground-truth entities of the validation set, plus the predicted
    # probabilities and their corresponding words in sorted order.
    gt_ent, pred_words, pred_proba = sort_val(val_df, y_pred)

    # Keep the threshold found by the search; the second result is unused.
    self.threshold, _ = find_threshold(gt_ent, pred_words, pred_proba)
    return self.threshold
def train_for_threshold(self, features, target='label', num=35000):
    """Train XGBoost on an ID-based split and search the decision threshold.

    Rows of ``self.train_`` with ``ID < num`` form the training part and rows
    with ``ID >= num`` form the validation part. The booster is trained with
    ``self.params``, the validation part is scored, and the scores are passed
    through ``sort_val`` and ``find_threshold`` to obtain the threshold.

    Args:
        features: Column labels of ``self.train_`` used as model inputs.
        target: Column holding the label; its values are cast to ``uint8``.
        num: ID cut-off separating training rows from validation rows.

    Returns:
        The threshold reported by ``find_threshold``; it is also stored on
        ``self.threshold``.

    Raises:
        ValueError: If ``num`` leaves the training or validation part empty.
    """
    train_df = self.train_[self.train_.ID < num]
    val_df = self.train_[self.train_.ID >= num]

    # Fail early with a readable message instead of training or evaluating on
    # an empty matrix when the cut-off does not actually split the data.
    if len(train_df) == 0 or len(val_df) == 0:
        raise ValueError(
            'num={!r} leaves an empty split (train rows: {}, validation rows: {})'
            .format(num, len(train_df), len(val_df))
        )

    X_train = train_df[features].values
    y_train = train_df[target].values.astype('uint8')
    X_eval = val_df[features].values
    y_eval = val_df[target].values.astype('uint8')

    xgb_train = xgb.DMatrix(X_train, y_train)
    xgb_eval = xgb.DMatrix(X_eval, y_eval)

    xgb_model = xgb.train(
        self.params,
        xgb_train,
        num_boost_round=1000,
        evals=[(xgb_train, 'train'), (xgb_eval, 'eval')],
        early_stopping_rounds=100,
        verbose_eval=100,
    )
    # NOTE(review): depending on the XGBoost version, Booster.predict may use
    # every trained tree rather than stopping at best_iteration - confirm
    # against the installed version.
    y_pred = xgb_model.predict(xgb_eval)

    # Ground-truth entities of the validation set, plus the predicted
    # probabilities and their corresponding words in sorted order.
    gt_ent, pred_words, pred_proba = sort_val(val_df, y_pred)

    # Keep the threshold found by the search; the second result is unused.
    self.threshold, _ = find_threshold(gt_ent, pred_words, pred_proba)
    return self.threshold
def save_result(self, filename='lgb_result.txt'):
    """Convert the stored test predictions into entities and save them.

    Reads ``self.test_``, ``self.pred``, ``self.threshold`` and ``self.opt``;
    the actual writing is delegated to ``save_as_order``.

    Args:
        filename: Output file name handed to ``save_as_order``. Defaults to
            ``'lgb_result.txt'``, the value that used to be hard-coded, so
            existing callers are unaffected.
    """
    # With predict=True the first item returned by sort_val is unpacked as
    # ids rather than ground-truth entities.
    pred_id, pred_words, pred_proba = sort_val(self.test_, self.pred, predict=True)

    # Apply the previously searched threshold to obtain the entities.
    entities = return_entity(pred_words, pred_proba, self.threshold)

    save_as_order(pred_id, entities, self.opt, filename)
def train_for_threshold(self, features, target='label', num=35000):
    """Train CatBoost on an ID-based split and search the decision threshold.

    Rows of ``self.train_`` with ``ID < num`` form the training part and rows
    with ``ID >= num`` form the validation part. The model is trained with
    ``self.params``, the validation part is scored, and the scores are passed
    through ``sort_val`` and ``find_threshold`` to obtain the threshold.

    Args:
        features: Column labels of ``self.train_`` used as model inputs.
        target: Column holding the label; its values are cast to ``uint8``.
        num: ID cut-off separating training rows from validation rows.

    Returns:
        The threshold reported by ``find_threshold``; it is also stored on
        ``self.threshold``.

    Raises:
        ValueError: If ``num`` leaves the training or validation part empty.
    """
    train_df = self.train_[self.train_.ID < num]
    val_df = self.train_[self.train_.ID >= num]

    # Fail early with a readable message instead of an obscure error from the
    # library (or the threshold search) when the cut-off does not split the data.
    if len(train_df) == 0 or len(val_df) == 0:
        raise ValueError(
            'num={!r} leaves an empty split (train rows: {}, validation rows: {})'
            .format(num, len(train_df), len(val_df))
        )

    X_train = train_df[features].values
    y_train = train_df[target].values.astype('uint8')
    X_eval = val_df[features].values
    y_eval = val_df[target].values.astype('uint8')

    cat_train = Pool(X_train, y_train)
    cat_eval = Pool(X_eval, y_eval)

    cat_model = catboost.train(
        cat_train,
        self.params,
        iterations=10000,
        eval_set=cat_eval,
        early_stopping_rounds=200,
        verbose=500,
    )
    # Keep column 1 of the probability output (presumably the positive class
    # for binary labels - confirm against the label encoding).
    y_pred = cat_model.predict(cat_eval, prediction_type='Probability')[:, 1]

    # Ground-truth entities of the validation set, plus the predicted
    # probabilities and their corresponding words in sorted order.
    gt_ent, pred_words, pred_proba = sort_val(val_df, y_pred)

    # Keep the threshold found by the search; the second result is unused.
    self.threshold, _ = find_threshold(gt_ent, pred_words, pred_proba)
    return self.threshold