# Scale the test images into the [0, 2] range expected by the model.
X_test = numpy.float32(X_test)
X_test /= 255.0
X_test *= 2.0

train_dataset = supervised_dataset.SupervisedDataset(X_train, y_train)
val_dataset = supervised_dataset.SupervisedDataset(X_val, y_val)
train_iterator = train_dataset.iterator(
    mode='random_uniform', batch_size=64, num_batches=31000)
val_iterator = val_dataset.iterator(
    mode='random_uniform', batch_size=64, num_batches=31000)

# Create object to local contrast normalize a batch.
# Note: Every batch must be normalized before use.
normer = util.Normer3(filter_size=5, num_channels=1)
module_list = [normer]
preprocessor = util.Preprocessor(module_list)

print('Training Model')
for x_batch, y_batch in train_iterator:
    x_batch = preprocessor.run(x_batch)
    monitor.start()
    log_prob, accuracy = model.train(x_batch, y_batch)
    monitor.stop(1 - accuracy)  # monitor takes error instead of accuracy

    # Periodically evaluate on a validation batch.
    if monitor.test:
        monitor.start()
        x_val_batch, y_val_batch = next(val_iterator)
        x_val_batch = preprocessor.run(x_val_batch)
        val_accuracy = model.eval(x_val_batch, y_val_batch)
        monitor.stop_test(1 - val_accuracy)
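# Normer3 above applies local contrast normalization with a 5x5 filter to
# single-channel images. As a rough illustration of that kind of preprocessing
# (this is NOT the library's actual implementation; local_contrast_normalize
# below is a hypothetical helper), a minimal numpy/scipy sketch:
import numpy as np
from scipy.ndimage import uniform_filter

def local_contrast_normalize(image, filter_size=5, eps=1e-4):
    """Normalize a 2-D, single-channel image by its local mean and std."""
    image = image.astype(np.float32)
    local_mean = uniform_filter(image, size=filter_size)
    centered = image - local_mean
    local_var = uniform_filter(centered ** 2, size=filter_size)
    local_std = np.sqrt(local_var)
    # Avoid amplifying noise in nearly flat regions.
    divisor = np.maximum(local_std, local_std.mean() + eps)
    return centered / divisor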
val_iterator = val_dataset.iterator(
    mode='random_uniform', batch_size=64, num_batches=31000)

# Create object to local contrast normalize a batch.
# Note: Every batch must be normalized before use.
normer = util.Normer3(filter_size=5, num_channels=1)
module_list = [normer]
preprocessor = util.Preprocessor(module_list)

print('Training Model')
for x_batch, y_batch in train_iterator:
    #x_batch = preprocessor.run(x_batch)
    x_batch = (x_batch - mean) / std
    # loop over the batch
    for i in range(len(x_batch)):
        # hide random patches in this image
        x_batch[i] = hide_patch(x_batch[i])
    monitor.start()
    log_prob, accuracy = model.train(x_batch, y_batch)
    monitor.stop(1 - accuracy)  # monitor takes error instead of accuracy

    if monitor.test:
        monitor.start()
        x_val_batch, y_val_batch = next(val_iterator)
        #x_val_batch = preprocessor.run(x_val_batch)
        x_val_batch = (x_val_batch - mean) / std
        val_accuracy = model.eval(x_val_batch, y_val_batch)
        monitor.stop_test(1 - val_accuracy)
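# hide_patch is not defined in this snippet. Below is a hypothetical sketch in
# the Hide-and-Seek style of augmentation: divide the image into a grid and
# zero out each cell with some probability. The grid size, hide probability,
# and the (channels, height, width) layout are assumptions, not taken from the
# original code.
import numpy as np

def hide_patch(image, grid_size=4, hide_prob=0.5):
    """Randomly zero out grid cells of a single (channels, height, width) image."""
    _, height, width = image.shape
    patch_h = height // grid_size
    patch_w = width // grid_size
    for row in range(grid_size):
        for col in range(grid_size):
            if np.random.rand() < hide_prob:
                top, left = row * patch_h, col * patch_w
                # Zero corresponds to the dataset mean after standardization.
                image[:, top:top + patch_h, left:left + patch_w] = 0.0
    return image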
def learning(
    cfg: OmegaConf,
    training_data_loader: torch.utils.data.DataLoader,
    validation_data_loader: torch.utils.data.DataLoader,
    model: SupervisedModel,
) -> None:
    """
    Learning function including evaluation

    :param cfg: Hydra's config instance
    :param training_data_loader: Training data loader
    :param validation_data_loader: Validation data loader
    :param model: Model
    :return: None
    """

    local_rank = cfg["distributed"]["local_rank"]
    num_gpus = cfg["distributed"]["world_size"]
    epochs = cfg["parameter"]["epochs"]

    num_training_samples = len(training_data_loader.dataset.data)
    # drop_last=True in the training data loader, so partial batches are dropped
    steps_per_epoch = int(
        num_training_samples / (cfg["experiment"]["batches"] * num_gpus))
    total_steps = cfg["parameter"]["epochs"] * steps_per_epoch
    warmup_steps = cfg["parameter"]["warmup_epochs"] * steps_per_epoch
    current_step = 0
    best_metric = np.finfo(np.float64).max

    optimizer = torch.optim.SGD(params=model.parameters(),
                                lr=calculate_initial_lr(cfg),
                                momentum=cfg["parameter"]["momentum"],
                                nesterov=False,
                                weight_decay=cfg["experiment"]["decay"])

    # https://github.com/google-research/simclr/blob/master/lars_optimizer.py#L26
    optimizer = LARC(optimizer=optimizer, trust_coefficient=0.001, clip=False)

    cos_lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer.optim,
        T_max=total_steps - warmup_steps,
    )

    for epoch in range(1, epochs + 1):
        # training
        model.train()
        training_data_loader.sampler.set_epoch(epoch)

        for data, targets in training_data_loader:
            # adjust learning rate by applying linear warmup
            if current_step <= warmup_steps:
                lr = calculate_lr(cfg, warmup_steps, current_step)
                for param_group in optimizer.param_groups:
                    param_group["lr"] = lr

            optimizer.zero_grad()
            data, targets = data.to(local_rank), targets.to(local_rank)

            unnormalized_features = model(data)
            loss = torch.nn.functional.cross_entropy(unnormalized_features, targets)

            loss.backward()
            optimizer.step()

            # adjust learning rate by applying cosine annealing after warmup
            if current_step > warmup_steps:
                cos_lr_scheduler.step()

            current_step += 1

        if local_rank == 0:
            logger_line = "Epoch:{}/{} progress:{:.3f} loss:{:.3f}, lr:{:.7f}".format(
                epoch, epochs, epoch / epochs, loss.item(),
                optimizer.param_groups[0]["lr"])

        # validation
        sum_val_loss, num_val_corrects = validation(validation_data_loader, model,
                                                    local_rank)

        torch.distributed.barrier()
        torch.distributed.reduce(sum_val_loss, dst=0)
        torch.distributed.reduce(num_val_corrects, dst=0)

        num_val_samples = len(validation_data_loader.dataset)

        # logging and saving a checkpoint on the main process
        if local_rank == 0:
            validation_loss = sum_val_loss.item() / num_val_samples
            validation_acc = num_val_corrects.item() / num_val_samples

            logging.info(logger_line + " val loss:{:.3f}, val acc:{:.2f}%".format(
                validation_loss, validation_acc * 100.))

            if cfg["parameter"]["metric"] == "loss":
                metric = validation_loss
            else:
                metric = 1. - validation_acc

            if metric <= best_metric:
                best_metric = metric
                # keep only the latest best checkpoint
                if "save_fname" in locals():
                    if os.path.exists(save_fname):
                        os.remove(save_fname)

                save_fname = "epoch={}-{}".format(
                    epoch, cfg["experiment"]["output_model_name"])
                torch.save(model.state_dict(), save_fname)
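# The helpers referenced above (calculate_initial_lr, calculate_lr, validation)
# are defined elsewhere in the project. The sketches below are hypothetical
# reconstructions, not the project's actual code: they assume a SimCLR-style
# linear scaling of the base learning rate with the global batch size, and a
# cfg["parameter"]["base_lr"] entry that may not exist under that name.
def calculate_initial_lr(cfg) -> float:
    """Scale the base learning rate by the global batch size (assumed rule)."""
    global_batch_size = cfg["experiment"]["batches"] * cfg["distributed"]["world_size"]
    return cfg["parameter"]["base_lr"] * global_batch_size / 256.


def calculate_lr(cfg, warmup_steps: int, current_step: int) -> float:
    """Linearly ramp the learning rate from 0 to its initial value during warmup."""
    return calculate_initial_lr(cfg) * current_step / max(warmup_steps, 1)


@torch.no_grad()
def validation(validation_data_loader, model, local_rank):
    """Assumed shape of the validation helper: return the summed cross-entropy
    loss and the number of correct predictions as tensors on the local device,
    so they can be reduced across ranks with torch.distributed.reduce."""
    model.eval()
    sum_loss = torch.zeros(1, device=local_rank)
    num_corrects = torch.zeros(1, device=local_rank)
    for data, targets in validation_data_loader:
        data, targets = data.to(local_rank), targets.to(local_rank)
        logits = model(data)
        sum_loss += torch.nn.functional.cross_entropy(logits, targets, reduction="sum")
        num_corrects += (logits.argmax(dim=1) == targets).sum()
    model.train()
    return sum_loss, num_corrects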