def run_trials():
    """Run one TPE meta-optimisation step and save its results."""
    max_evals = nb_evals = 15  # number of new TPE evaluations added per call

    logger = get_logger(save_path + '/trials.log', name='trials')
    logger.info("Attempting to resume a past training run if it exists:")

    try:
        # Resume pattern, see https://github.com/hyperopt/hyperopt/issues/267
        with open(save_path + "/results.pkl", "rb") as f:
            trials = pickle.load(f)
        logger.info("Found saved Trials! Loading...")
        max_evals = len(trials.trials) + nb_evals
        logger.info("Resuming from {} trials to run {} more.".format(
            len(trials.trials), nb_evals))
    except (FileNotFoundError, EOFError):
        # No (readable) checkpoint of previous trials: start a new study.
        trials = Trials()
        logger.info("Starting from scratch: new trials.")

    best = fmin(
        train_challenge2020,
        space,
        algo=tpe.suggest,
        trials=trials,
        max_evals=max_evals,
    )
    logger.info("Best: {}".format(best))

    # Checkpoint the Trials object so the next call can resume the search.
    with open(save_path + "/results.pkl", "wb") as f:
        pickle.dump(trials, f)

    logger.info("\nOPTIMIZATION STEP COMPLETE.\n")
    logger.info("Trials:")
    for trial in trials:
        logger.info(trial)
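
# --- Illustrative sketch only --------------------------------------------
# `space` (passed to fmin above) is assumed to be defined elsewhere in this
# module as a hyperopt search space whose keys match what
# train_challenge2020() reads: data_split, data_normalization, augmentation,
# arch, loss, only_scored, optimizer, lr_scheduler and trainer. The shape
# below is inferred from those accesses; every concrete name, file, range and
# monitoring key here is an assumption, not the configuration actually used.
from hyperopt import hp

example_space = {
    'data_split': hp.choice('data_split', ['split_1.mat', 'split_2.mat']),
    'data_normalization': hp.choice('data_normalization', [True, False]),
    'only_scored': hp.choice('only_scored', [True, False]),
    'augmentation': {
        'method': hp.choice('aug_method', ['none', 'scaling', 'shift']),
        'prob': hp.uniform('aug_prob', 0.0, 0.5),
    },
    'arch': {
        'type': hp.choice('arch_type', ['resnet', 'se_resnet']),
        'args': {},
    },
    'loss': {'type': 'bce_with_logits_loss'},
    'optimizer': {
        'type': 'Adam',
        'args': {'lr': hp.loguniform('lr', -9, -4)},  # roughly 1e-4 .. 2e-2
    },
    'lr_scheduler': {
        'type': 'ReduceLROnPlateau',
        'args': {'mode': 'min', 'factor': 0.5, 'patience': 3},
    },
    'trainer': {
        'batch_size': hp.choice('batch_size', [32, 64, 128]),
        'epochs': 50,
        # Assumed monitoring keys consumed by get_mnt_mode().
        'monitor': 'max val_metric',
        'early_stop': 10,
    },
}
# --------------------------------------------------------------------------
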
def train_challenge2020(hype_space):
    # Paths to save log, checkpoint, tensorboard logs and results
    run_id = datetime.now().strftime(r'%m%d_%H%M%S')
    base_path = save_path + '/' + run_id
    os.makedirs(base_path)

    write_json(hype_space, base_path + '/hype_space.json')

    checkpoint_dir = base_path + '/checkpoints'
    log_dir = base_path + '/log'
    tb_dir = base_path + '/tb_log'
    result_dir = base_path + '/results'

    os.makedirs(result_dir)
    os.makedirs(log_dir)
    os.makedirs(checkpoint_dir)
    os.makedirs(tb_dir)

    # Logger for train
    logger = get_logger(log_dir + '/info.log', name='train' + run_id)
    logger.info(hype_space)

    # Tensorboard
    train_writer = SummaryWriter(tb_dir + '/train')
    val_writer = SummaryWriter(tb_dir + '/valid')

    try:
        # Hyper parameters
        split_index = "../process/data_split/" + hype_space['data_split']

        # Setup CUDA
        use_cuda = torch.cuda.is_available()
        device = torch.device("cuda" if use_cuda else "cpu")

        # Data loader
        train_loader = ChallengeDataLoader2(
            label_dir,
            data_dir,
            split_index,
            batch_size=hype_space['trainer']['batch_size'],
            normalization=hype_space['data_normalization'],
            augmentations=hype_space['augmentation']['method'],
            p=hype_space['augmentation']['prob'])
        valid_loader = train_loader.valid_data_loader
        test_loader = train_loader.test_data_loader

        # Build model architecture
        global model
        for file, types in files_models.items():
            for type in types:
                if hype_space["arch"]["type"] == type:
                    model = init_obj(hype_space, 'arch',
                                     eval("module_arch_" + file))

        dummy_input = Variable(torch.rand(16, 12, 3000))
        train_writer.add_graph(model, (dummy_input, ))
        model.to(device)

        # Get function handle of the loss
        criterion = getattr(module_loss, hype_space['loss']['type'])

        # Get function handles of the metrics
        challenge_metrics = ChallengeMetric(label_dir)
        metric = challenge_metrics.challenge_metric

        # Get indices of the scored labels
        if hype_space['only_scored']:
            indices = challenge_metrics.indices
        else:
            indices = None

        # Build optimizer and learning rate scheduler
        trainable_params = filter(lambda p: p.requires_grad,
                                  model.parameters())
        optimizer = init_obj(hype_space, 'optimizer', torch.optim,
                             trainable_params)

        if hype_space['lr_scheduler']['type'] == 'GradualWarmupScheduler':
            params = hype_space["lr_scheduler"]["args"]
            scheduler_steplr_args = dict(params["after_scheduler"]["args"])
            scheduler_steplr = getattr(
                torch.optim.lr_scheduler,
                params["after_scheduler"]["type"])(optimizer,
                                                   **scheduler_steplr_args)
            lr_scheduler = GradualWarmupScheduler(
                optimizer,
                multiplier=params["multiplier"],
                total_epoch=params["total_epoch"],
                after_scheduler=scheduler_steplr)
        else:
            lr_scheduler = init_obj(hype_space, 'lr_scheduler',
                                    torch.optim.lr_scheduler, optimizer)

        # Begin training process
        trainer = hype_space['trainer']
        epochs = trainer['epochs']

        # Full train and valid logic
        mnt_metric_name, mnt_mode, mnt_best, early_stop = get_mnt_mode(trainer)
        not_improved_count = 0

        for epoch in range(epochs):
            best = False
            train_loss, train_metric = train(model,
                                             optimizer,
                                             train_loader,
                                             criterion,
                                             metric,
                                             indices,
                                             epoch,
                                             device=device)
            val_loss, val_metric = valid(model,
                                         valid_loader,
                                         criterion,
                                         metric,
                                         indices,
                                         device=device)

            if hype_space['lr_scheduler']['type'] == 'ReduceLROnPlateau':
                # if hype_space['lr_scheduler']['args']['mode'] == 'min':
                #     lr_scheduler.step(train_loss)
                # else:
                #     lr_scheduler.step(train_metric)
                lr_scheduler.step(val_loss)
            elif hype_space['lr_scheduler']['type'] == 'GradualWarmupScheduler':
                lr_scheduler.step(epoch, val_loss)
            else:
                lr_scheduler.step()

            logger.info(
                'Epoch:[{}/{}]\t {:10s}: {:.5f}\t {:10s}: {:.5f}'.format(
                    epoch, epochs, 'loss', train_loss, 'metric', train_metric))
            logger.info(' \t {:10s}: {:.5f}\t {:10s}: {:.5f}'.format(
                'val_loss', val_loss, 'val_metric', val_metric))
            logger.info(' \t learning_rate: {}'.format(
                optimizer.param_groups[0]['lr']))

            # Check whether model performance improved or not,
            # according to the monitored metric (mnt_metric)
            if mnt_mode != 'off':
                mnt_metric = val_loss if mnt_metric_name == 'val_loss' else val_metric
                improved = (mnt_mode == 'min' and mnt_metric <= mnt_best) or \
                           (mnt_mode == 'max' and mnt_metric >= mnt_best)
                if improved:
                    mnt_best = mnt_metric
                    not_improved_count = 0
                    best = True
                else:
                    not_improved_count += 1

                if not_improved_count > early_stop:
                    logger.info(
                        "Validation performance didn't improve for {} epochs. "
                        "Training stops.".format(early_stop))
                    break

            if best:
                save_checkpoint(model,
                                epoch,
                                optimizer,
                                mnt_best,
                                hype_space,
                                checkpoint_dir,
                                save_best=True)
                logger.info("Saving current best: model_best.pth ...")

            # Tensorboard log
            train_writer.add_scalar('loss', train_loss, epoch)
            train_writer.add_scalar('metric', train_metric, epoch)
            train_writer.add_scalar('learning_rate',
                                    optimizer.param_groups[0]['lr'], epoch)
            val_writer.add_scalar('loss', val_loss, epoch)
            val_writer.add_scalar('metric', val_metric, epoch)

        # Logger for test
        logger = get_logger(result_dir + '/info.log', name='test' + run_id)
        logger.propagate = False

        # Load model_best checkpoint
        model = load_checkpoint(model, checkpoint_dir + '/model_best.pth',
                                logger)

        # Testing
        test_loss, test_metric = test(model,
                                      test_loader,
                                      criterion,
                                      metric,
                                      device=device)
        logger.info(' {:10s}: {:.5f}\t {:10s}: {:.5f}'.format(
            'loss', test_loss, 'metric', test_metric))

        challenge_metrics.return_metric_list()
        analyze(model,
                test_loader,
                criterion,
                challenge_metrics,
                logger,
                result_dir,
                device=device)

        write_json(hype_space,
                   '{}/{}_{:.5f}.json'.format(save_path, run_id, test_metric))
    except Exception:
        # A failed trial (e.g. CUDA OOM or an invalid hyper-parameter
        # combination) is penalised with a poor score instead of aborting
        # the whole search.
        logger.exception("Trial failed:")
        test_metric = -10

    # hyperopt minimises the returned value, so negate the challenge metric.
    return -test_metric
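

# --- Illustrative sketch only --------------------------------------------
# run_trials() performs a single optimisation "step" (nb_evals TPE trials)
# and checkpoints the Trials object, following the resume pattern in the
# hyperopt issue linked inside run_trials(). A driver like the one below is
# an assumed entry point (not necessarily how this module is actually
# launched): calling run_trials() in a loop lets the search be interrupted
# and resumed from the pickled results at any time.
if __name__ == '__main__':
    while True:
        run_trials()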