if model.best_iteration is not None: optimal_c_model.params['num_boost_round'] = model.best_iteration else: logger.warn( 'Did not meet early stopping. Try larger num_boost_rounds.') # no need after optimized num_boost_round del optimal_c_model.params['early_stopping_rounds'] return optimal_c_model if __name__ == "__main__": gc.enable() warnings.filterwarnings('ignore') # read config & apply option c = EasyDict(config) opt = parse_option() c.transformer.USE_SMALL_DATA = opt.small c.log.slackauth.NO_SEND_MESSAGE = opt.nomsg seed_everything(c.runtime.RANDOM_SEED) create_logger('main', **c.log) logger = getLogger('main') logger.info( f':thinking_face: Starting experiment {c.runtime.VERSION}_{c.runtime.DESCRIPTION}' ) try: main(c) logger.info(
mlflow.log_metric('best_trial', trial.number) mlflow.log_metric('best_score', trial.value) mlflow.log_params(trial.params) return r if __name__ == '__main__': gc.enable() warnings.filterwarnings('ignore') # get option opt = parse_option() # c is for config c = json.load(open(f'config/config_{opt.version}.json')) c = EasyDict(c) c.runtime = {} c.runtime.version = opt.version c.runtime.use_small_data = opt.small c.runtime.no_send_message = opt.nomsg c.runtime.random_seed = opt.seed c.runtime.dsize = '.small' if c.runtime.use_small_data is True else '' # dict to save results r = EasyDict() r.update(c) r.paths = {} r.paths.result = f'config/result_{c.runtime.version}{c.runtime.dsize}.json' seed_everything(c.runtime.random_seed)
f'Early stopping. Best iteration is: {model.best_iteration}') scores.best_iteration = model.best_iteration return model.best_iteration else: logger.warn( 'Did not meet early stopping. Try larger num_boost_rounds.') scores.best_iteration = None return c.train.num_boost_round if __name__ == "__main__": gc.enable() warnings.filterwarnings('ignore') # slack config slackauth = EasyDict(json.load(open('./slackauth.json', 'r'))) slackauth.token_path = Path().home() / slackauth.token_file # get option opt = parse_option() c = json.load(open(f'config/config_{opt.version}.json')) c = EasyDict(c) c.runtime = {} c.runtime.version = opt.version c.runtime.use_small_data = opt.small c.runtime.no_send_message = opt.nomsg c.runtime.random_seed = opt.seed seed_everything(c.runtime.random_seed) dsize = '.small' if c.runtime.use_small_data is True else ''
def main(c):
    """Run the preprocess / optimize / train / predict pipeline for one config.

    Stages (each wrapped in a ``blocktimer``):

    1. Preprocess: load the train/test feature pickles named by
       ``c.features[0]``, optionally down-sample both to 0.1% when
       ``c.runtime.use_small_data`` is truthy, and split off the ``isFraud``
       target.
    2. Optimize: if ``c.train.optimize_num_boost_round is True`` call
       ``optimize_num_boost_round``; otherwise use ``c.train.num_boost_round``.
    3. Train: 6-fold ``GroupKFold`` grouped by the ``DT_M`` column, collecting
       out-of-fold predictions and the fold-averaged test predictions.
    4. Predict: write the averaged test predictions as a submission CSV.

    Args:
        c: config object read via attribute access. This function reads
            ``c.runtime.use_small_data``, ``c.runtime.version``,
            ``c.features``, ``c.cols``, ``c.model``,
            ``c.train.optimize_num_boost_round``, ``c.train.n_splits``,
            ``c.train.num_boost_round`` and ``c.train.early_stopping_rounds``.

    Returns:
        EasyDict holding a copy of the config entries plus ``scores`` and
        ``paths`` (the input/output paths recorded during the run).
    """
    # Kept function-local, as in the original block.
    from sklearn.metrics import roc_auc_score
    from sklearn.model_selection import GroupKFold

    dsize = '.small' if c.runtime.use_small_data is True else ''
    paths = EasyDict()
    scores = EasyDict()
    modelfactory = ModelFactory()

    with blocktimer('Preprocess', level=INFO):
        feature_name = c.features[0]
        paths.in_train_path = f'data/feature/{feature_name}_train.pkl'
        paths.in_test_path = f'data/feature/{feature_name}_test.pkl'
        train = pd.read_pickle(paths.in_train_path)
        test = pd.read_pickle(paths.in_test_path)
        logger.debug(f'Loaded feature {feature_name}')

        if c.runtime.use_small_data:
            # Fixed seed so the small sample is reproducible between runs.
            frac = 0.001
            train = train.sample(frac=frac, random_state=42)
            test = test.sample(frac=frac, random_state=42)
        logger.debug(f'train.shape: {train.shape}, test.shape: {test.shape}')

        # Split into X, y
        X_train = train.drop('isFraud', axis=1)
        X_test = test
        y_train = train['isFraud'].copy(deep=True)
        del train, test

    with blocktimer('Optimize', level=INFO):
        if c.train.optimize_num_boost_round is True:
            # tune the number of boosting rounds
            model = modelfactory.create(c.model)
            best_iteration = optimize_num_boost_round(
                model, X_train[c.cols], y_train, c.train.n_splits, dsize,
                paths, scores)
        else:
            logger.debug('Skip optimization')
            best_iteration = c.train.num_boost_round

    with blocktimer('Train', level=INFO):
        logger.debug(f'Now using the following {len(c.cols)} features.')
        logger.debug(f'{np.array(c.cols)}')

        # The column selection does not depend on the fold, so build the
        # feature frames once instead of re-selecting them several times per
        # fold.
        X_fit = X_train[c.cols]
        X_score = X_test[c.cols]
        groups = X_train['DT_M']

        oof = np.zeros(len(X_train))
        preds = np.zeros(len(X_test))
        skf = GroupKFold(n_splits=6)
        for fold, (train_idx, valid_idx) in enumerate(
                skf.split(X_train, y_train, groups=groups), start=1):
            month = groups.iloc[valid_idx].iloc[0]
            logger.info(f'Fold {fold} withholding month {month}')
            logger.info(
                f'rows of train ={len(train_idx)}, rows of holdout ={len(valid_idx)}')

            # The categorical_features argument of train() stays disabled,
            # exactly as in the original call.
            model = modelfactory.create(c.model)
            model = model.train(
                X_fit.iloc[train_idx],
                y_train.iloc[train_idx],
                X_fit.iloc[valid_idx],
                y_train.iloc[valid_idx],
                num_boost_round=best_iteration,
                early_stopping_rounds=c.train.early_stopping_rounds,
                fold=fold)

            # Validation folds are disjoint, so every OOF slot is written
            # exactly once.
            oof[valid_idx] = model.predict(X_fit.iloc[valid_idx])
            # Test prediction is the plain average over the folds.
            preds += model.predict(X_score) / skf.n_splits
            del model

        logger.info(f'OOF cv= {roc_auc_score(y_train, oof)}')
        # Only the path is recorded; no importance file is written here.
        paths.importance_path = f'feature/importance/importance_{c.runtime.version}{dsize}.csv'

    with blocktimer('Predict', level=INFO):
        sub = pd.DataFrame(columns=['TransactionID', 'isFraud'])
        # reset_index() exposes the TransactionID index level as a column.
        sub['TransactionID'] = X_test.reset_index()['TransactionID']
        sub['isFraud'] = preds

        paths.out_sub_path = f'data/submission/submission_{c.runtime.version}{dsize}.csv'
        sub.to_csv(paths.out_sub_path, index=False)

    result = EasyDict()
    result.update(c)
    result.scores = scores
    result.paths = paths
    return result
def main(c):
    """Run the pipeline with optional Optuna tuning, training and prediction.

    Stages (each wrapped in a ``blocktimer``):

    1. Preprocess: load the train/test feature pickles named by
       ``c.features[0]``, optionally down-sample both to 0.1% when
       ``c.runtime.use_small_data`` is truthy, and split off the ``isFraud``
       target.
    2. Optimize num_boost_round: if ``c.train.optimize_num_boost_round is
       True`` call ``optimize_num_boost_round``; otherwise use
       ``c.train.num_boost_round``.
    3. Optimize model params: if ``c.train.optimize_model_params is True`` run
       a 20-trial Optuna study that maximises the out-of-fold ROC AUC of a
       6-fold ``GroupKFold`` grouped by ``DT_M``.
    4. Train: if ``c.train.train_model`` is truthy, run the same 6-fold CV and
       collect out-of-fold and fold-averaged test predictions.
    5. Predict: if ``c.train.predict`` is truthy, write the averaged test
       predictions as a submission CSV.

    Args:
        c: config object read via attribute access. This function reads
            ``c.runtime.use_small_data``, ``c.runtime.version``,
            ``c.features``, ``c.cols``, ``c.model`` and the ``c.train`` flags
            and settings named above. ``c.model.params`` is overwritten by
            the Optuna objective.

    Returns:
        EasyDict holding the config entries plus ``scores``, ``paths`` and,
        when the parameter study ran, ``optimize.best_params``.
    """
    dsize = '.small' if c.runtime.use_small_data is True else ''
    paths = EasyDict()
    scores = EasyDict()
    result = EasyDict()
    result.update(c)
    modelfactory = ModelFactory()

    with blocktimer('Preprocess', level=INFO):
        feature_name = c.features[0]
        paths.in_train_path = f'data/feature/{feature_name}_train.pkl'
        paths.in_test_path = f'data/feature/{feature_name}_test.pkl'
        train = pd.read_pickle(paths.in_train_path)
        test = pd.read_pickle(paths.in_test_path)
        logger.debug(f'Loaded feature {feature_name}')

        if c.runtime.use_small_data:
            # Fixed seed so the small sample is reproducible between runs.
            frac = 0.001
            train = train.sample(frac=frac, random_state=42)
            test = test.sample(frac=frac, random_state=42)
        logger.debug(f'train.shape: {train.shape}, test.shape: {test.shape}')

        # Split into X, y
        X_train = train.drop('isFraud', axis=1)
        X_test = test
        y_train = train['isFraud'].copy(deep=True)
        del train, test

    with blocktimer('Optimize num_boost_round', level=INFO):
        if c.train.optimize_num_boost_round is True:
            # tune the number of boosting rounds
            model = modelfactory.create(c.model)
            best_iteration = optimize_num_boost_round(
                model, X_train[c.cols], y_train, c.train.n_splits, dsize,
                paths, scores)
        else:
            logger.debug('Skip optimization')
            best_iteration = c.train.num_boost_round

    with blocktimer('Optimize model params', level=INFO):
        if c.train.optimize_model_params is True:

            def objectives(trial):
                """Optuna objective: OOF ROC AUC of a 6-fold GroupKFold run."""
                max_depth = trial.suggest_int('max_depth', 3, 12)
                params = {
                    'boosting_type': 'gbdt',
                    # num_leaves should be smaller than approximately
                    # 2^max_depth*0.75
                    'num_leaves': 2**max_depth * 3 // 4,
                    'max_depth': max_depth,
                    'learning_rate': 0.05,
                    'objective': 'binary',
                    'min_child_weight':
                    trial.suggest_loguniform('min_child_weight', 1e-3, 1e0),
                    'reg_alpha':
                    trial.suggest_loguniform('reg_alpha', 1e-2, 1e0),
                    'reg_lambda':
                    trial.suggest_loguniform('reg_lambda', 1e-2, 1e0),
                    'random_state': 42,
                    'min_data_in_leaf':
                    trial.suggest_int('min_data_in_leaf', 50, 200),
                    'metric': 'auc',
                    'max_bin': 255
                }
                # NOTE(review): this mutates the shared config in place, so
                # once the study finishes c.model.params holds the params of
                # the last evaluated trial (not necessarily the best one) and
                # the Train stage below builds its models from that.
                c.model.params = params

                # Column selection is fold-invariant; do it once per trial.
                X_fit = X_train[c.cols]
                groups = X_train['DT_M']

                # Train by 6-fold CV. Only the out-of-fold predictions feed
                # the score, so the test set is not predicted here.
                oof = np.zeros(len(X_train))
                skf = GroupKFold(n_splits=6)
                for fold, (train_idx, valid_idx) in enumerate(
                        skf.split(X_train, y_train, groups=groups), start=1):
                    month = groups.iloc[valid_idx].iloc[0]
                    # The path does not contain the trial number, so each
                    # trial overwrites the fold models of the previous one.
                    model_fold_path = f'data/model/model_{c.runtime.version}_{c.model.type}_opt_fold{fold}{dsize}.pkl'
                    model = modelfactory.create(c.model)
                    logger.info(f'Fold {fold} withholding month {month}')
                    logger.info(
                        f'rows of train= {len(train_idx)}, rows of holdout= {len(valid_idx)}'
                    )
                    model = model.train(
                        X_fit.iloc[train_idx],
                        y_train.iloc[train_idx],
                        X_fit.iloc[valid_idx],
                        y_train.iloc[valid_idx],
                        num_boost_round=best_iteration,
                        early_stopping_rounds=c.train.early_stopping_rounds,
                        fold=fold)
                    oof[valid_idx] = model.predict(X_fit.iloc[valid_idx])
                    paths.update({f'model_fold_{fold}_path': model_fold_path})
                    model.save(paths[f'model_fold_{fold}_path'])
                    del model

                score = roc_auc_score(y_train, oof)
                logger.info(f'Fold {fold} OOF cv= {score}')
                return score

            # run optimization; the study is persisted in SQLite and resumed
            # when a study with the same name already exists.
            opt = optuna.create_study(
                direction='maximize',
                study_name=f'parameter_study_0016{dsize}',
                storage=
                f'sqlite:///data/optimization/parameter_study_0016{dsize}.db',
                load_if_exists=True)
            opt.optimize(objectives, n_trials=20)
            trial = opt.best_trial
            logger.debug(f'Best trial: {trial.value}')
            logger.debug(f'Best params: {trial.params}')
            scores.best_trial = trial.value
            result.optimize = {}
            result.optimize.best_params = trial.params
        else:
            logger.debug('Skip optimization')

    # Stays None unless the Train stage runs, so the Predict stage can tell
    # whether there is anything to write.
    preds = None

    with blocktimer('Train', level=INFO):
        if c.train.train_model:
            logger.debug(f'Now using the following {len(c.cols)} features.')
            logger.debug(f'{np.array(c.cols)}')

            # Fold-invariant selections, built once.
            X_fit = X_train[c.cols]
            X_score = X_test[c.cols]
            groups = X_train['DT_M']

            oof = np.zeros(len(X_train))
            preds = np.zeros(len(X_test))
            skf = GroupKFold(n_splits=6)
            for fold, (train_idx, valid_idx) in enumerate(
                    skf.split(X_train, y_train, groups=groups), start=1):
                month = groups.iloc[valid_idx].iloc[0]
                logger.info(f'Fold {fold} withholding month {month}')
                logger.info(
                    f'rows of train ={len(train_idx)}, rows of holdout ={len(valid_idx)}'
                )

                # The categorical_features argument of train() stays
                # disabled, exactly as in the original call.
                model = modelfactory.create(c.model)
                model = model.train(
                    X_fit.iloc[train_idx],
                    y_train.iloc[train_idx],
                    X_fit.iloc[valid_idx],
                    y_train.iloc[valid_idx],
                    num_boost_round=best_iteration,
                    early_stopping_rounds=c.train.early_stopping_rounds,
                    fold=fold)
                oof[valid_idx] = model.predict(X_fit.iloc[valid_idx])
                # Test prediction is the plain average over the folds.
                preds += model.predict(X_score) / skf.n_splits
                del model

            logger.info(f'OOF cv= {roc_auc_score(y_train, oof)}')
            # Only the path is recorded; no importance file is written here.
            paths.importance_path = f'feature/importance/importance_{c.runtime.version}{dsize}.csv'

    with blocktimer('Predict', level=INFO):
        if c.train.predict:
            if preds is None:
                # Previously this combination died with a NameError after all
                # earlier stages had already run; skip instead so the
                # collected results are still returned.
                logger.warning(
                    'Skip prediction: no test predictions available '
                    '(c.train.train_model is disabled).')
            else:
                sub = pd.DataFrame(columns=['TransactionID', 'isFraud'])
                # reset_index() exposes the TransactionID index level as a
                # column.
                sub['TransactionID'] = X_test.reset_index()['TransactionID']
                sub['isFraud'] = preds

                paths.out_sub_path = f'data/submission/submission_{c.runtime.version}{dsize}.csv'
                sub.to_csv(paths.out_sub_path, index=False)

    result.scores = scores
    result.paths = paths
    return result