def main(c_runtime, c_transformer, c_model, c_trainer, c_log):
    """Run the pipeline end-to-end: preprocess, tune & train, predict.

    Args:
        c_runtime: runtime config; provides ``out_sub_path`` for the submission.
        c_transformer: transformer config, unpacked into ``Transformer.run``.
        c_model: model config handed to ``ModelFactory``.
        c_trainer: trainer config; provides ``n_splits`` for tuning.
        c_log: logging config (not referenced in this body).
    """
    with blocktimer('Preprocess'):
        train, test = Transformer.run(**c_transformer.__dict__)
        X_train, y_train, X_test = split_X_y(train, test)

    with blocktimer('Tune & Train'):
        factory = ModelFactory()
        # First pass: cross-validated search for the best hyper parameters.
        tuned_config = tune_gbdt_params(
            factory.create(c_model), X_train, y_train, c_trainer.n_splits)
        # Second pass: retrain on the full training data with the tuned config.
        model = factory.create(tuned_config).train(X_train, y_train)

    with blocktimer('Predict'):
        submission = pd.DataFrame(columns=['TransactionID', 'isFraud'])
        submission['TransactionID'] = test['TransactionID']
        submission['isFraud'] = model.predict(X_test)
        submission.to_csv(c_runtime.out_sub_path, index=False)
        logger.info(f'Saved {c_runtime.out_sub_path}')
def main(c):
    """Run the pipeline end-to-end: preprocess, tune & train, save model
    artifacts and feature importance, then write a submission file.

    Args:
        c: config namespace with .transformer, .trainer, .model and
           .runtime sub-configs.
    """
    with blocktimer('Preprocess'):
        train, test = Transformer.run(**c.transformer.__dict__)
        # BUGFIX: sort test BEFORE splitting, so the row order of X_test
        # (and therefore the order of model.predict's output) matches the
        # order of test['TransactionID'] used for the submission below.
        # The original sorted after split_X_y, which could misalign IDs
        # and predictions.
        test = test.sort_values('TransactionDT')
        X_train, y_train, X_test = split_X_y(train, test)
    with blocktimer('Tune & Train'):
        modelfactory = ModelFactory()
        # tune the model params
        model = modelfactory.create(c.model)
        optimal_c_model = tune_gbdt_params(model, X_train, y_train,
                                           c.trainer.n_splits)
        # train with best params, full data
        model = modelfactory.create(optimal_c_model)
        model = model.train(X_train, y_train)
        # save model and per-feature importance
        model.save(c.model.dir / f'model_{c.runtime.VERSION}_{c.model.TYPE}.pkl')
        importance = pd.DataFrame(model.feature_importance,
                                  index=X_train.columns,
                                  columns=['importance'])
        importance_path = (c.runtime.ROOTDIR / 'feature/importance'
                           / f'importance_{c.runtime.VERSION}.csv')
        importance.to_csv(importance_path)
        logger.info(f'Saved {str(importance_path)}')
    with blocktimer('Predict'):
        sub = pd.DataFrame(columns=['TransactionID', 'isFraud'])
        sub['TransactionID'] = test['TransactionID']
        y_test = model.predict(X_test)
        sub['isFraud'] = y_test
        sub.to_csv(c.runtime.out_sub_path, index=False)
        logger.debug(f'Saved {c.runtime.out_sub_path}')
def tune_gbdt_params(model, X, y, n_splits) -> dict:
    '''
    Tune parameters by training with TimeSeriesSplit cross-validation.

    Trains ``model`` on each fold (logging per-fold AUC to the 'train'
    logger), then returns the model config updated with the best
    ``num_boost_round`` found via early stopping; ``early_stopping_rounds``
    is removed from the returned params since it is no longer needed.

    Args:
        model: wrapper exposing train_and_validate / config /
               best_iteration / feature_importance / validation_auc.
        X, y: full training features and target (pandas objects).
        n_splits: number of TimeSeriesSplit folds.

    Returns:
        The tuned model config (read from ``model.config``).
    '''
    # start tuning train log
    create_logger('train', **c.log)
    logger_train = getLogger('train')
    logger_train.debug('{}\t{}\t{}\t{}'.format('fold', 'iteration',
                                               'train_auc', 'val_auc'))
    aucs = list()
    feature_importances = pd.DataFrame()
    feature_importances['feature'] = X.columns
    # split data into train, validation
    folds = TimeSeriesSplit(n_splits=n_splits)
    for i, (idx_train, idx_val) in enumerate(folds.split(X, y)):
        fold = i + 1
        with blocktimer(f'Fold {fold}'):
            # prepare
            logger.info(f'Training on fold {fold}')
            X_train = X.iloc[idx_train]
            y_train = y.iloc[idx_train]
            X_val = X.iloc[idx_val]
            y_val = y.iloc[idx_val]
            # train
            model = model.train_and_validate(X_train, y_train, X_val, y_val,
                                             logger_train, fold)
            # record result
            feature_importances[f'fold_{fold}'] = model.feature_importance
            aucs.append(model.validation_auc)
            # TODO: save models at each steps
            logger.debug(f'Fold {fold} finished')
    logger.info('Training has finished.')
    logger.debug(f'Mean AUC: {np.mean(aucs)}')
    # TODO: save feature importance and other

    # make optimal config from result (config of the last fold's model)
    optimal_c_model = model.config
    if model.best_iteration is not None:
        # new param
        optimal_c_model.params['num_boost_round'] = model.best_iteration
    else:
        # FIX: logger.warn is a deprecated alias; use logger.warning
        logger.warning(
            'Did not meet early stopping. Try larger num_boost_rounds.')
    # no need after optimized num_boost_round
    del optimal_c_model.params['early_stopping_rounds']
    return optimal_c_model
def tune_gbdt_params(model, X, y, n_splits) -> dict:
    '''
    Tune parameter num_boost_round.

    Trains ``model`` on TimeSeriesSplit folds with early stopping, saving
    each fold's model to disk, then returns the model config updated with
    the best ``num_boost_round``; ``early_stopping_rounds`` is removed
    from the returned params since it is no longer needed.

    Args:
        model: wrapper exposing train_and_validate / save / config /
               best_iteration.
        X, y: full training features and target (pandas objects).
        n_splits: number of TimeSeriesSplit folds.

    Returns:
        The tuned model config (read from ``model.config``).
    '''
    # start tuning train log
    create_logger('train', **c.log)
    logger_train = getLogger('train')
    logger_train.debug('{}\t{}\t{}\t{}'.format('fold', 'iteration',
                                               'train_auc', 'val_auc'))
    # aucs = list()
    # split data into train, validation
    folds = TimeSeriesSplit(n_splits=n_splits)
    for i, (idx_train, idx_val) in enumerate(folds.split(X, y)):
        fold = i + 1
        with blocktimer(f'Training on Fold {fold}'):
            X_train = X.iloc[idx_train]
            y_train = y.iloc[idx_train]
            X_val = X.iloc[idx_val]
            y_val = y.iloc[idx_val]
            # train
            model = model.train_and_validate(X_train, y_train, X_val, y_val,
                                             logger_train, fold)
            model.save(
                c.model.dir /
                f'model_{c.runtime.VERSION}_{c.model.TYPE}_fold{fold}.pkl')
            # record result
            # aucs.append(model.val_auc)
            # logger.info(f'train_auc: {model.train_auc} val_auc: {model.val_auc}')
    # logger.info(f'Mean AUC: {np.mean(aucs)}')

    # make optimal config from result (config of the last fold's model)
    optimal_c_model = model.config
    if model.best_iteration is not None:
        optimal_c_model.params['num_boost_round'] = model.best_iteration
    else:
        # FIX: logger.warn is a deprecated alias; use logger.warning
        logger.warning(
            'Did not meet early stopping. Try larger num_boost_rounds.')
    # no need after optimized num_boost_round
    del optimal_c_model.params['early_stopping_rounds']
    return optimal_c_model
def optimize_num_boost_round(model, X, y, n_splits, dsize, paths, scores) -> int:
    '''
    Estimate best num_boost_round by early stopping.

    Trains ``model`` on TimeSeriesSplit folds with early stopping and saves
    each fold's model (recording its path into ``paths``). Mutates
    ``scores.best_iteration`` as a side effect.

    Args:
        model: wrapper exposing train / save / best_iteration.
        X, y: training features and target (pandas objects).
        n_splits: number of TimeSeriesSplit folds.
        dsize: filename suffix ('' or '.small') marking down-sampled runs.
        paths: EasyDict collecting output file paths (mutated).
        scores: EasyDict collecting metrics (mutated).

    Returns:
        The best iteration count if early stopping triggered, otherwise
        the configured ``c.train.num_boost_round``.
        (FIX: annotation corrected from ``-> dict`` — both branches
        return an int.)
    '''
    # split data into train, validation
    folds = TimeSeriesSplit(n_splits=n_splits)
    for i, (idx_train, idx_val) in enumerate(folds.split(X, y)):
        fold = i + 1
        with blocktimer(f'Training on Fold {fold}', level=INFO):
            X_train = X.iloc[idx_train]
            y_train = y.iloc[idx_train]
            X_val = X.iloc[idx_val]
            y_val = y.iloc[idx_val]
            # train
            model = model.train(
                X_train, y_train, X_val, y_val,
                num_boost_round=c.train.num_boost_round,
                early_stopping_rounds=c.train.early_stopping_rounds,
                fold=fold)
            model_fold_path = f'data/model/model_{c.runtime.version}_{c.model.type}_fold{fold}{dsize}.pkl'
            paths.update({f'model_fold_{fold}_path': model_fold_path})
            model.save(paths[f'model_fold_{fold}_path'])

    # make optimal config from result
    # NOTE(review): only the LAST fold's model is inspected here — earlier
    # folds' best iterations are ignored; confirm this is intended.
    if model.best_iteration > 0:
        logger.info(
            f'Early stopping. Best iteration is: {model.best_iteration}')
        scores.best_iteration = model.best_iteration
        return model.best_iteration
    else:
        # FIX: logger.warn is a deprecated alias; use logger.warning
        logger.warning(
            'Did not meet early stopping. Try larger num_boost_rounds.')
        scores.best_iteration = None
        return c.train.num_boost_round
def main(c, r):
    """Tune model hyper parameters with optuna and record the results.

    Loads the configured feature set(s), splits them into X/y, runs an
    optuna study over ``objective``, then stores the best trial's number,
    score and params on ``r`` and logs them to mlflow.

    Args:
        c: experiment config.
        r: result namespace (EasyDict-like), mutated and returned.
    """
    r.scores = {}
    with blocktimer('Preprocess', level=INFO):
        # unpack feature set list. set[i]={name: cols}
        # NOTE: with several configured sets, only the last one loaded
        # survives this loop (train/test/cols are overwritten each pass).
        for feat_name, feat_cols in c.feature.set.items():
            cols = feat_cols
            train = pd.read_pickle(f'data/feature/{feat_name}_train.pkl')
            test = pd.read_pickle(f'data/feature/{feat_name}_test.pkl')
            logger.debug(f'Loaded feature {feat_name}')
        if c.runtime.use_small_data:
            # down-sample to 0.1% for a quick smoke run
            train = train.sample(frac=0.001, random_state=42)
            test = test.sample(frac=0.001, random_state=42)
        logger.debug(f'train.shape: {train.shape}, test.shape: {test.shape}')
        # Split into X, y
        X_train = train.drop(c.feature.target, axis=1)
        y_train = train[c.feature.target].copy(deep=True)
        X_test = test
        del train, test
    with blocktimer('Tune hyper params', level=INFO):
        # Run optimization
        mlflow.log_param('type', c.model.type)
        mlflow.log_param('num_boost_round', c.train.num_boost_round)
        mlflow.log_param('early_stopping_rounds', c.train.early_stopping_rounds)
        objective_fn = partial(objective,
                               X_train=X_train,
                               y_train=y_train,
                               X_test=X_test,
                               cols=cols,
                               c=c)
        study_id = f'{experiment_type}_{c.runtime.version}{c.runtime.dsize}'
        study = optuna.create_study(
            direction='maximize',
            study_name=study_id,
            storage=f'sqlite:///data/optimization/{study_id}.db',
            load_if_exists=True)
        study.optimize(objective_fn, n_trials=c.optimize.n_trials)

        best = study.best_trial
        r.optimize = {}
        r.scores.best_trial = best.number
        r.scores.best_score = best.value
        r.optimize.best_params = best.params
        # merge the tuned params over the configured model params
        tuned_params = c.model.params.copy()
        tuned_params.update(best.params)
        r.model.tuned_params = tuned_params
        logger.debug(f'Best trial: {best.number}')
        logger.debug(f'Best score: {best.value}')
        logger.debug(f'Best params: {best.params}')
        mlflow.log_metric('best_trial', best.number)
        mlflow.log_metric('best_score', best.value)
        mlflow.log_params(best.params)
        return r
def main(c):
    """Train GBDT models with month-grouped GroupKFold CV and write the
    out-of-fold CV score plus a submission file.

    Args:
        c: experiment config (.runtime, .model, .train sub-configs, plus
           .features and .cols).

    Returns:
        EasyDict copy of the config extended with .scores and .paths.
    """
    # '.small' suffix marks artifacts produced from the down-sampled run
    dsize = '.small' if c.runtime.use_small_data is True else ''
    paths = EasyDict()
    scores = EasyDict()
    modelfactory = ModelFactory()
    with blocktimer('Preprocess', level=INFO):
        # load the first configured feature set from pickled frames
        paths.in_train_path = f'data/feature/{c.features[0]}_train.pkl'
        paths.in_test_path = f'data/feature/{c.features[0]}_test.pkl'
        train = pd.read_pickle(paths.in_train_path)
        test = pd.read_pickle(paths.in_test_path)
        logger.debug(f'Loaded feature {c.features[0]}')
        if c.runtime.use_small_data:
            # 0.1% sample for a quick smoke run
            frac = 0.001
            train = train.sample(frac=frac, random_state=42)
            test = test.sample(frac=frac, random_state=42)
        logger.debug(f'train.shape: {train.shape}, test.shape: {test.shape}')
        # Split into X, y
        X_train = train.drop('isFraud', axis=1)
        X_test = test
        y_train = train['isFraud'].copy(deep=True)
        del train, test
    with blocktimer('Optimize', level=INFO):
        if c.train.optimize_num_boost_round is True:
            # tune the model params
            model = modelfactory.create(c.model)
            best_iteration = optimize_num_boost_round(
                model, X_train[c.cols], y_train, c.train.n_splits, dsize,
                paths, scores)
        else:
            logger.debug('Skip optimization')
            best_iteration = c.train.num_boost_round
    with blocktimer('Train', level=INFO):
        logger.debug(f'Now using the following {len(c.cols)} features.')
        logger.debug(f'{np.array(c.cols)}')
        # CHRIS - TRAIN 75% PREDICT 25%
        # NOTE(review): idxT/idxV computed here feed only the commented-out
        # holdout variant below; the CV loop shadows them immediately.
        idxT = X_train.index[:3 * len(X_train) // 4]
        idxV = X_train.index[3 * len(X_train) // 4:]
        '''
        model = modelfactory.create(c.model)
        model = model.train(X_train.loc[idxT, :], y_train[idxT],
                            X_train.loc[idxV, :], y_train[idxV],
                            num_boost_round=best_iteration)
        importance = pd.DataFrame(model.feature_importance,
                                  index=X_train.columns,
                                  columns=['importance'])
        # save results
        paths.out_model_dir = f'data/model/model_{c.runtime.version}_{c.model.type}{dsize}.pkl'
        paths.importance_path = f'feature/importance/importance_{c.runtime.version}{dsize}.csv'
        model.save(paths.out_model_dir)
        importance.to_csv(paths.importance_path)
        '''
        from sklearn.model_selection import GroupKFold
        from sklearn.metrics import roc_auc_score
        oof = np.zeros(len(X_train))
        preds = np.zeros(len(X_test))
        skf = GroupKFold(n_splits=6)
        for i, (idxT, idxV) in enumerate(
                skf.split(X_train, y_train, groups=X_train['DT_M'])):
            # each fold withholds one calendar month (DT_M) as validation
            month = X_train.iloc[idxV]['DT_M'].iloc[0]
            logger.info(f'Fold {i+1} withholding month {month}')
            logger.info(
                f'rows of train ={len(idxT)}, rows of holdout ={len(idxV)}')
            # NOTE(review): defined but only referenced by the commented-out
            # kwarg in model.train below
            categorical_features = [
                'ProductCD',
                'M4',
                'card1',
                'card2',
                'card3',
                'card5',
                'card6',
                'addr1',
                'addr2',
                'dist1',
                'dist2',
                'P_emaildomain',
                'R_emaildomain',
            ]
            model = modelfactory.create(c.model)
            model = model.train(
                X_train[c.cols].iloc[idxT],
                y_train.iloc[idxT],
                X_train[c.cols].iloc[idxV],
                y_train.iloc[idxV],
                num_boost_round=best_iteration,
                early_stopping_rounds=c.train.early_stopping_rounds,
                # categorical_features=categorical_features,
                fold=i + 1)
            # GroupKFold validation folds are disjoint, so '+=' writes each
            # out-of-fold position exactly once
            oof[idxV] += model.predict(X_train[c.cols].iloc[idxV])
            # average the test predictions over the folds
            preds += model.predict(X_test[c.cols]) / skf.n_splits
            del model
        logger.info(f'OOF cv= {roc_auc_score(y_train, oof)}')
        paths.importance_path = f'feature/importance/importance_{c.runtime.version}{dsize}.csv'
        # model.save(paths.out_model_dir)
        '''
        importance = pd.DataFrame(model.feature_importance,
                                  index=X_train.columns,
                                  columns=['importance'])
        importance.to_csv(paths.importance_path)
        '''
    with blocktimer('Predict', level=INFO):
        # y_test = model.predict(X_test)
        sub = pd.DataFrame(columns=['TransactionID', 'isFraud'])
        sub['TransactionID'] = X_test.reset_index()['TransactionID']
        # sub['isFraud'] = y_test
        sub['isFraud'] = preds
        paths.out_sub_path = f'data/submission/submission_{c.runtime.version}{dsize}.csv'
        sub.to_csv(paths.out_sub_path, index=False)
    result = EasyDict()
    result.update(c)
    result.scores = scores
    result.paths = paths
    return result
def main(c):
    """Optionally optimize num_boost_round and model params (optuna),
    train month-grouped GroupKFold CV models, and write a submission.
    Each stage is switched by flags under ``c.train``.

    Args:
        c: experiment config (.runtime, .model, .train sub-configs, plus
           .features and .cols).

    Returns:
        EasyDict copy of the config extended with .scores and .paths.
    """
    # '.small' suffix marks artifacts produced from the down-sampled run
    dsize = '.small' if c.runtime.use_small_data is True else ''
    paths = EasyDict()
    scores = EasyDict()
    result = EasyDict()
    result.update(c)
    modelfactory = ModelFactory()
    with blocktimer('Preprocess', level=INFO):
        # load the first configured feature set from pickled frames
        paths.in_train_path = f'data/feature/{c.features[0]}_train.pkl'
        paths.in_test_path = f'data/feature/{c.features[0]}_test.pkl'
        train = pd.read_pickle(paths.in_train_path)
        test = pd.read_pickle(paths.in_test_path)
        logger.debug(f'Loaded feature {c.features[0]}')
        if c.runtime.use_small_data:
            # 0.1% sample for a quick smoke run
            frac = 0.001
            train = train.sample(frac=frac, random_state=42)
            test = test.sample(frac=frac, random_state=42)
        logger.debug(f'train.shape: {train.shape}, test.shape: {test.shape}')
        # Split into X, y
        X_train = train.drop('isFraud', axis=1)
        X_test = test
        y_train = train['isFraud'].copy(deep=True)
        del train, test
    with blocktimer('Optimize num_boost_round', level=INFO):
        if c.train.optimize_num_boost_round is True:
            # tune the model params
            model = modelfactory.create(c.model)
            best_iteration = optimize_num_boost_round(
                model, X_train[c.cols], y_train, c.train.n_splits, dsize,
                paths, scores)
        else:
            logger.debug('Skip optimization')
            best_iteration = c.train.num_boost_round
    with blocktimer('Optimize model params', level=INFO):
        if c.train.optimize_model_params is True:

            # define objective for optuna
            def objectives(trial):
                # objective value = OOF AUC of a 6-fold CV run with the
                # params sampled for this trial
                max_depth = trial.suggest_int('max_depth', 3, 12)
                params = {
                    'boosting_type': 'gbdt',
                    # num_leaves should be smaller than approximately 2^max_depth*0.75
                    'num_leaves': 2**max_depth * 3 // 4,
                    'max_depth': max_depth,
                    'learning_rate': 0.05,
                    'objective': 'binary',
                    'min_child_weight':
                    trial.suggest_loguniform('min_child_weight', 1e-3,
                                             1e0),  # 0.03454472573214212,
                    'reg_alpha':
                    trial.suggest_loguniform('reg_alpha', 1e-2,
                                             1e0),  # 0.3899927210061127,
                    'reg_lambda':
                    trial.suggest_loguniform('reg_lambda', 1e-2,
                                             1e0),  # 0.6485237330340494,
                    'random_state': 42,
                    'min_data_in_leaf':
                    trial.suggest_int('min_data_in_leaf', 50, 200),  # 106,
                    'metric': 'auc',
                    'max_bin': 255
                }
                # NOTE(review): mutates the shared config in place, so the
                # Train stage below sees the LAST trial's params, not the
                # best trial's — confirm this is intended
                c.model.params = params
                # Train by 6-fold CV
                oof = np.zeros(len(X_train))
                preds = np.zeros(len(X_test))
                skf = GroupKFold(n_splits=6)
                for i, (idxT, idxV) in enumerate(
                        skf.split(X_train, y_train, groups=X_train['DT_M'])):
                    fold = i + 1
                    # each fold withholds one calendar month (DT_M)
                    month = X_train.iloc[idxV]['DT_M'].iloc[0]
                    model_fold_path = f'data/model/model_{c.runtime.version}_{c.model.type}_opt_fold{fold}{dsize}.pkl'
                    model = modelfactory.create(c.model)
                    logger.info(f'Fold {fold} withholding month {month}')
                    logger.info(
                        f'rows of train= {len(idxT)}, rows of holdout= {len(idxV)}'
                    )
                    model = model.train(
                        X_train[c.cols].iloc[idxT],
                        y_train.iloc[idxT],
                        X_train[c.cols].iloc[idxV],
                        y_train.iloc[idxV],
                        num_boost_round=best_iteration,
                        early_stopping_rounds=c.train.early_stopping_rounds,
                        # categorical_features=categorical_features,
                        fold=i + 1)
                    oof[idxV] = model.predict(X_train[c.cols].iloc[idxV])
                    # average the test predictions over the folds
                    preds += model.predict(X_test[c.cols]) / skf.n_splits
                    paths.update({f'model_fold_{fold}_path': model_fold_path})
                    model.save(paths[f'model_fold_{fold}_path'])
                    del model
                score = roc_auc_score(y_train, oof)
                # NOTE(review): 'Fold {fold}' prints the LAST fold number,
                # but the score is the full OOF AUC
                logger.info(f'Fold {fold} OOF cv= {score}')
                return score

            # run optimization
            opt = optuna.create_study(
                direction='maximize',
                study_name=f'parameter_study_0016{dsize}',
                storage=
                f'sqlite:///data/optimization/parameter_study_0016{dsize}.db',
                load_if_exists=True)
            opt.optimize(objectives, n_trials=20)
            trial = opt.best_trial
            logger.debug(f'Best trial: {trial.value}')
            logger.debug(f'Best params: {trial.params}')
            # NOTE(review): stores the best VALUE under the key 'best_trial'
            # (elsewhere best_trial holds the trial number) — confirm
            scores.best_trial = trial.value
            result.optimize = {}
            result.optimize.best_params = trial.params
        else:
            logger.debug('Skip optimization')
    with blocktimer('Train', level=INFO):
        if c.train.train_model:
            logger.debug(f'Now using the following {len(c.cols)} features.')
            logger.debug(f'{np.array(c.cols)}')
            oof = np.zeros(len(X_train))
            preds = np.zeros(len(X_test))
            skf = GroupKFold(n_splits=6)
            for i, (idxT, idxV) in enumerate(
                    skf.split(X_train, y_train, groups=X_train['DT_M'])):
                # each fold withholds one calendar month (DT_M)
                month = X_train.iloc[idxV]['DT_M'].iloc[0]
                logger.info(f'Fold {i+1} withholding month {month}')
                logger.info(
                    f'rows of train ={len(idxT)}, rows of holdout ={len(idxV)}'
                )
                '''
                categorical_features = ['ProductCD', 'M4',
                                        'card1', 'card2', 'card3',
                                        'card5', 'card6',
                                        'addr1', 'addr2',
                                        'dist1', 'dist2',
                                        'P_emaildomain', 'R_emaildomain',
                                        ]
                '''
                model = modelfactory.create(c.model)
                model = model.train(
                    X_train[c.cols].iloc[idxT],
                    y_train.iloc[idxT],
                    X_train[c.cols].iloc[idxV],
                    y_train.iloc[idxV],
                    num_boost_round=best_iteration,
                    early_stopping_rounds=c.train.early_stopping_rounds,
                    # categorical_features=categorical_features,
                    fold=i + 1)
                oof[idxV] = model.predict(X_train[c.cols].iloc[idxV])
                # average the test predictions over the folds
                preds += model.predict(X_test[c.cols]) / skf.n_splits
                del model
            logger.info(f'OOF cv= {roc_auc_score(y_train, oof)}')
            paths.importance_path = f'feature/importance/importance_{c.runtime.version}{dsize}.csv'
            # model.save(paths.out_model_dir)
            '''
            importance = pd.DataFrame(model.feature_importance,
                                      index=X_train.columns,
                                      columns=['importance'])
            importance.to_csv(paths.importance_path)
            '''
    with blocktimer('Predict', level=INFO):
        if c.train.predict:
            # NOTE(review): preds is only bound when the Train branch above
            # ran; predict=True with train_model=False would raise
            # NameError — confirm config invariants guarantee this
            sub = pd.DataFrame(columns=['TransactionID', 'isFraud'])
            sub['TransactionID'] = X_test.reset_index()['TransactionID']
            sub['isFraud'] = preds
            paths.out_sub_path = f'data/submission/submission_{c.runtime.version}{dsize}.csv'
            sub.to_csv(paths.out_sub_path, index=False)
    result.scores = scores
    result.paths = paths
    return result