def train_cross_val(p):
    data_ = load_data(root_dir='./data/', mode='train')
    data_, target_, features, date = preprocess_data(data_, nn=True)
    gts = PurgedGroupTimeSeriesSplit(n_splits=5, group_gap=5)
    input_size = data_.shape[-1]
    output_size = 1
    tb_logger = pl_loggers.TensorBoardLogger('logs/')
    models = []
    for i, (train_idx, val_idx) in enumerate(gts.split(data_, groups=date)):
        idx = np.concatenate([train_idx, val_idx])
        data = copy.deepcopy(data_[idx])
        target = copy.deepcopy(target_[idx])
        checkpoint_callback = pl.callbacks.ModelCheckpoint(
            os.path.join('models/', "fold_{}".format(i)),
            monitor="val_auc", mode='max', save_top_k=1, period=10)
        model = Classifier(input_size=input_size, output_size=output_size, params=p)
        if p['activation'] == nn.ReLU:
            model.apply(lambda m: init_weights(m, 'relu'))
        elif p['activation'] == nn.LeakyReLU:
            model.apply(lambda m: init_weights(m, 'leaky_relu'))
        # The split returns contiguous indices starting at 0, so re-index positions
        # within the fold subset: training rows first, validation rows after.
        train_idx = [j for j in range(0, max(train_idx) + 1)]
        val_idx = [j for j in range(len(train_idx), len(idx))]
        data[train_idx] = calc_data_mean(data[train_idx], './cache', train=True, mode='mean')
        data[val_idx] = calc_data_mean(data[val_idx], './cache', train=False, mode='mean')
        dataset = FinData(data=data, target=target, date=date)
        dataloaders = create_dataloaders(dataset,
                                         indexes={'train': train_idx, 'val': val_idx},
                                         batch_size=p['batch_size'])
        es = EarlyStopping(monitor='val_auc', patience=10, min_delta=0.0005, mode='max')
        trainer = pl.Trainer(logger=tb_logger,
                             max_epochs=500,
                             gpus=1,
                             callbacks=[checkpoint_callback, es],
                             precision=16)
        trainer.fit(model,
                    train_dataloader=dataloaders['train'],
                    val_dataloaders=dataloaders['val'])
        torch.save(model.state_dict(), f'models/fold_{i}_state_dict.pth')
        models.append(model)
    return models, features
def loptimize(trial, data_dict: dict):
    # LightGBM search space; 'max_leaves' is a LightGBM alias for 'num_leaves'.
    p = {'learning_rate': trial.suggest_uniform('learning_rate', 1e-4, 1e-1),
         'max_leaves': trial.suggest_int('max_leaves', 5, 100),
         'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.3, 0.99),
         'bagging_freq': trial.suggest_int('bagging_freq', 1, 10),
         'feature_fraction': trial.suggest_uniform('feature_fraction', 0.3, 0.99),
         'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 50, 1000),
         'lambda_l1': trial.suggest_uniform('lambda_l1', 0.005, 0.05),
         'lambda_l2': trial.suggest_uniform('lambda_l2', 0.005, 0.05),
         'boosting': trial.suggest_categorical('boosting', ['gbdt', 'goss', 'rf']),
         'objective': 'binary',
         'verbose': 1,
         'n_jobs': 10,
         'metric': 'auc'}
    # GOSS does not support bagging, so disable it for that boosting type.
    if p['boosting'] == 'goss':
        p['bagging_freq'] = 0
        p['bagging_fraction'] = 1.0
    scores = []
    sizes = []
    data = data_dict['data']
    target = data_dict['target']
    date = data_dict['date']
    # gts = GroupTimeSeriesSplit()
    gts = PurgedGroupTimeSeriesSplit(n_splits=5, group_gap=10)
    for i, (tr_idx, val_idx) in enumerate(gts.split(data, groups=date)):
        sizes.append(len(tr_idx))
        x_tr, x_val = data.iloc[tr_idx], data.iloc[val_idx]
        y_tr, y_val = target[tr_idx], target[val_idx]
        # Per-fold mean preprocessing, cached on disk.
        x_tr, x_val = calc_data_mean(x_tr, cache_dir='cache/', fold=i, train=True), \
            calc_data_mean(x_val, cache_dir='cache/', fold=i, train=False)
        train = lgb.Dataset(x_tr, label=y_tr)
        val = lgb.Dataset(x_val, label=y_val)
        clf = lgb.train(p, train, 500, valid_sets=[val],
                        early_stopping_rounds=50, verbose_eval=True)
        preds = clf.predict(x_val)
        score = roc_auc_score(y_val, preds)
        print(f'Fold {i} ROC AUC:\t', score)
        scores.append(score)
        del clf, preds, train, val, x_tr, x_val, y_tr, y_val, score
        rubbish = gc.collect()
    print(scores)
    avg_score = weighted_mean(scores, sizes)
    print('Avg Score:', avg_score)
    return avg_score
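# Usage sketch (assumption, not part of the original file): how the `loptimize`
# objective above would typically be driven by an Optuna study. The `data_dict`
# layout ({'data', 'target', 'date'}) is taken from the function itself; the
# function name `run_lgb_hpo` and the trial count are illustrative only.
def run_lgb_hpo(data_dict: dict, n_trials: int = 100):
    # The objective returns a fold-weighted ROC AUC, so it is maximised.
    study = optuna.create_study(direction='maximize')
    study.optimize(lambda trial: loptimize(trial, data_dict), n_trials=n_trials)
    print('Best LightGBM params:', study.best_params, 'AUC:', study.best_value)
    return study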
def optimize(trial: optuna.trial.Trial, data_dict: dict):
    # XGBoost search space; training runs on GPU via 'gpu_hist'.
    p = {'learning_rate': trial.suggest_uniform('learning_rate', 1e-4, 1e-1),
         'max_depth': trial.suggest_int('max_depth', 5, 30),
         'max_leaves': trial.suggest_int('max_leaves', 5, 50),
         'subsample': trial.suggest_uniform('subsample', 0.3, 1.0),
         'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.3, 1.0),
         'min_child_weight': trial.suggest_int('min_child_weight', 5, 100),
         'lambda': trial.suggest_uniform('lambda', 0.05, 0.2),
         'alpha': trial.suggest_uniform('alpha', 0.05, 0.2),
         'objective': 'binary:logistic',
         'booster': 'gbtree',
         'tree_method': 'gpu_hist',
         'verbosity': 1,
         'n_jobs': 10,
         'eval_metric': 'auc'}
    print('Chosen parameters:', p)
    scores = []
    sizes = []
    # gts = GroupTimeSeriesSplit()
    data = data_dict['data']
    target = data_dict['target']
    date = data_dict['date']
    gts = PurgedGroupTimeSeriesSplit(n_splits=5, group_gap=10)
    for i, (tr_idx, val_idx) in enumerate(gts.split(data, groups=date)):
        sizes.append(len(tr_idx))
        x_tr, x_val = copy.deepcopy(data.iloc[tr_idx]), copy.deepcopy(data.iloc[val_idx])
        y_tr, y_val = copy.deepcopy(target[tr_idx]), copy.deepcopy(target[val_idx])
        # Per-fold mean preprocessing, cached on disk.
        x_tr, x_val = calc_data_mean(x_tr, cache_dir='cache/', fold=i, train=True), \
            calc_data_mean(x_val, cache_dir='cache/', fold=i, train=False)
        d_tr = xgb.DMatrix(x_tr, label=y_tr)
        d_val = xgb.DMatrix(x_val, label=y_val)
        clf = xgb.train(p, d_tr, 500, [(d_val, 'eval')],
                        early_stopping_rounds=50, verbose_eval=True)
        val_pred = clf.predict(d_val)
        score = roc_auc_score(y_val, val_pred)
        print(f'Fold {i} ROC AUC:\t', score)
        scores.append(score)
        del clf, val_pred, d_tr, d_val, x_tr, x_val, y_tr, y_val, score
        rubbish = gc.collect()
    print(scores)
    avg_score = weighted_mean(scores, sizes)
    print('Avg Score:', avg_score)
    return avg_score
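# Note: `weighted_mean` is a helper defined elsewhere in the repo. A minimal sketch
# of the assumed behaviour (each fold's score weighted by its training-set size) is
# given below; the name `weighted_mean_sketch` is used so as not to shadow the real
# helper.
def weighted_mean_sketch(values, weights):
    # sum(w_i * v_i) / sum(w_i): larger folds contribute proportionally more.
    return sum(v * w for v, w in zip(values, weights)) / sum(weights)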
def main(train=True):
    # Hyperparameter configuration for the NN classifier.
    p = {'batch_size': 4986,
         'dim_1': 248,
         'dim_2': 487,
         'dim_3': 269,
         'dim_4': 218,
         'dim_5': 113,
         'activation': nn.ReLU,
         'dropout': 0.01563457578202565,
         'lr': 0.00026372556533974916,
         'label_smoothing': 0.06834918091900156,
         'weight_decay': 0.005270589494631074,
         'amsgrad': False}
    if train:
        models, features = train_cross_val(p)
        # models, features = final_train(p, load=False)
    else:
        # Inference path: load pre-trained fold models and evaluate them.
        data_ = load_data(root_dir='./data/', mode='train')
        data_, target_, features, date = preprocess_data(data_, nn=True)
        model_path = '/kaggle/input/model-files'
        f_mean = calc_data_mean(data_, 'cache')
        models = load_model(model_path, data_.shape[-1], 1, p, False)
        # model, checkpoint = final_train(p)
        # best_model_path = checkpoint.best_model_path
        # model, features = final_train(load=best_model_path)
        test_model(models, features)
    return models
def final_train(p, load=False):
    data_ = load_data(root_dir='./data/', mode='train')
    data, target, features, date = preprocess_data(data_, nn=True)
    input_size = data.shape[-1]
    output_size = 1
    # Single holdout split: train on the first 450 days, validate on the rest.
    train_idx = date[date <= 450].index.values.tolist()
    val_idx = date[date > 450].index.values.tolist()
    data[train_idx] = calc_data_mean(data[train_idx], './cache', train=True)
    data[val_idx] = calc_data_mean(data[val_idx], './cache', train=False)
    checkpoint_callback = pl.callbacks.ModelCheckpoint(
        filepath='models/full_train', monitor="val_auc", mode='max',
        save_top_k=1, period=10)
    model = Classifier(input_size=input_size, output_size=output_size, params=p)
    if p['activation'] == nn.ReLU:
        model.apply(lambda m: init_weights(m, 'relu'))
    elif p['activation'] == nn.LeakyReLU:
        model.apply(lambda m: init_weights(m, 'leaky_relu'))
    dataset = FinData(data, target, date)
    dataloaders = create_dataloaders(dataset,
                                     indexes={'train': train_idx, 'val': val_idx},
                                     batch_size=p['batch_size'])
    es = EarlyStopping(monitor='val_auc', patience=10, min_delta=0.0005, mode='max')
    trainer = pl.Trainer(max_epochs=500,
                         gpus=1,
                         callbacks=[checkpoint_callback, es],
                         precision=16)
    trainer.fit(model,
                train_dataloader=dataloaders['train'],
                val_dataloaders=dataloaders['val'])
    torch.save(model.state_dict(), 'models/final_train.pth')
    return model, features
def optimize(trial: optuna.Trial, data_dict):
    gts = PurgedGroupTimeSeriesSplit(n_splits=5, group_gap=10)
    input_size = data_dict['data'].shape[-1]
    output_size = 5
    checkpoint_callback = pl.callbacks.ModelCheckpoint(
        os.path.join('models/', "trial_resnet_{}".format(trial.number)),
        monitor="val_auc", mode='max')
    logger = MetricsCallback()
    metrics = []
    sizes = []
    # trial_file = 'HPO/nn_hpo_2021-01-05.pkl'
    trial_file = None
    p = create_param_dict(trial, trial_file)
    p['batch_size'] = trial.suggest_int('batch_size', 8000, 15000)
    for i, (train_idx, val_idx) in enumerate(
            gts.split(data_dict['data'], groups=data_dict['date'])):
        idx = np.concatenate([train_idx, val_idx])
        data = copy.deepcopy(data_dict['data'][idx])
        target = copy.deepcopy(data_dict['target'][idx])
        date = copy.deepcopy(data_dict['date'][idx])
        # The split returns contiguous indices starting at 0, so re-index positions
        # within the fold subset: training rows first, validation rows after.
        train_idx = [j for j in range(0, max(train_idx) + 1)]
        val_idx = [j for j in range(len(train_idx), len(idx))]
        data[train_idx] = calc_data_mean(data[train_idx], './cache', train=True, mode='mean')
        data[val_idx] = calc_data_mean(data[val_idx], './cache', train=False, mode='mean')
        model = Classifier(input_size, output_size, params=p)
        # model.apply(init_weights)
        dataset = FinData(data=data, target=target, date=date, multi=True)
        dataloaders = create_dataloaders(dataset,
                                         indexes={'train': train_idx, 'val': val_idx},
                                         batch_size=p['batch_size'])
        es = EarlyStopping(monitor='val_loss', patience=10, min_delta=0.0005, mode='min')
        trainer = pl.Trainer(logger=False,
                             max_epochs=500,
                             gpus=1,
                             callbacks=[checkpoint_callback,
                                        logger,
                                        PyTorchLightningPruningCallback(trial, monitor='val_loss'),
                                        es],
                             precision=16)
        trainer.fit(model,
                    train_dataloader=dataloaders['train'],
                    val_dataloaders=dataloaders['val'])
        val_loss = logger.metrics[-1]['val_loss'].item()
        metrics.append(val_loss)
        sizes.append(len(train_idx))
    metrics_mean = weighted_mean(metrics, sizes)
    return metrics_mean
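# Usage sketch (assumption, not part of the original file): the NN objective above
# returns a fold-weighted validation loss and reports intermediate values through
# PyTorchLightningPruningCallback, so it would be minimised with a pruning-enabled
# study. The function name `run_nn_hpo`, trial count, and pruner settings are
# illustrative only.
def run_nn_hpo(data_dict: dict, n_trials: int = 50):
    study = optuna.create_study(direction='minimize',
                                pruner=optuna.pruners.MedianPruner(n_warmup_steps=10))
    study.optimize(lambda trial: optimize(trial, data_dict), n_trials=n_trials)
    return study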