def save_features(df, data_type='train'): save_path = Path('../configs/feature/all.yml') dh = DataHandler() if not save_path.exists(): save_path.touch() feature_dict = {'features': []} else: feature_dict = dh.load(save_path) new_feature = sorted(set(feature_dict['features'] + df.columns.tolist())) feature_dict['features'] = new_feature dh.save(save_path, feature_dict) for col in df.columns: df[[col]].reset_index(drop=True).to_feather(f'../features/{col}_{data_type}.feather')
class Trainer: def __init__(self, cfg): self.cfg = cfg self.cat_features = self.cfg.data.features.cat_features self.oof = None self.raw_preds = None self.weights = [] self.models = [] self.scores = [] self.feature_importance_df = pd.DataFrame( columns=['feature', 'importance']) self.dh = DataHandler() def train(self, train_df: pd.DataFrame, target_df: pd.DataFrame, fold_df: pd.DataFrame): self.oof = np.zeros(len(train_df)) for fold_, col in enumerate(fold_df.columns): print( f'\n========================== FOLD {fold_} ... ==========================\n' ) logging.debug( f'\n========================== FOLD {fold_} ... ==========================\n' ) self._train_fold(train_df, target_df, fold_df[col]) print('\n\n===================================\n') print(f'CV: {np.mean(self.scores):.4f}') print('\n===================================\n\n') logging.debug('\n\n===================================\n') logging.debug(f'CV: {np.mean(self.scores):.4f}') logging.debug('\n===================================\n\n') return np.mean(self.scores) def _train_fold(self, train_df, target_df, fold): tr_x, va_x = train_df[fold == 0], train_df[fold > 0] tr_y, va_y = target_df[fold == 0], target_df[fold > 0] weight = fold.max() self.weights.append(weight) model = factory.get_model(self.cfg.model) model.fit(tr_x, tr_y, va_x, va_y, self.cat_features) va_pred = model.predict(va_x, self.cat_features) if self.cfg.data.target.reconvert_type: va_y = getattr(np, self.cfg.data.target.reconvert_type)(va_y) va_pred = getattr(np, self.cfg.data.target.reconvert_type)(va_pred) va_pred = np.where(va_pred >= 0, va_pred, 0) self.models.append(model) self.oof[va_x.index] = va_pred.copy() score = factory.get_metrics(self.cfg.common.metrics.name)(va_y, va_pred) self.scores.append(score) if self.cfg.model.name in ['lightgbm', 'catboost', 'xgboost']: importance_fold_df = pd.DataFrame() fold_importance = model.extract_importances() importance_fold_df['feature'] = train_df.columns importance_fold_df['importance'] = fold_importance self.feature_importance_df = pd.concat( [self.feature_importance_df, importance_fold_df], axis=0) def predict(self, test_df): preds = np.zeros(len(test_df)) for fold_, model in enumerate(self.models): pred = model.predict(test_df, self.cat_features) if self.cfg.data.target.reconvert_type: pred = getattr(np, self.cfg.data.target.reconvert_type)(pred) pred = np.where(pred >= 0, pred, 0) preds += pred.copy() * self.weights[fold_] self.raw_preds = preds.copy() return preds def save(self, run_name): log_dir = Path(f'../logs/{run_name}') self.dh.save(log_dir / 'oof.npy', self.oof) self.dh.save(log_dir / 'raw_preds.npy', self.raw_preds) self.dh.save(log_dir / 'importance.csv', self.feature_importance_df) self.dh.save(log_dir / 'model_weight.pkl', self.models)