def main():
    config = load_config()

    if config.test.output_dir is None:
        output_dir = pathlib.Path(config.test.checkpoint).parent
    else:
        output_dir = pathlib.Path(config.test.output_dir)
        output_dir.mkdir(exist_ok=True, parents=True)

    logger = create_logger(name=__name__, distributed_rank=get_rank())

    model = create_model(config)
    model = apply_data_parallel_wrapper(config, model)
    checkpointer = Checkpointer(model,
                                checkpoint_dir=output_dir,
                                logger=logger,
                                distributed_rank=get_rank())
    checkpointer.load(config.test.checkpoint)

    test_loader = create_dataloader(config, is_train=False)
    _, test_loss = create_loss(config)

    preds, probs, labels, loss, acc = evaluate(config, model, test_loader,
                                               test_loss, logger)

    output_path = output_dir / 'predictions.npz'
    np.savez(output_path,
             preds=preds,
             probs=probs,
             labels=labels,
             loss=loss,
             acc=acc)
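
# The block below is an illustrative usage sketch, not part of the original
# script: it shows how the `predictions.npz` file written above could be
# inspected with plain NumPy. The path is a placeholder.
import numpy as np

results = np.load('path/to/predictions.npz')
print(results['preds'].shape)    # per-sample predictions, as saved by evaluate()
print(results['probs'].shape)    # per-sample class probabilities
print(float(results['loss']), float(results['acc']))  # scalar test loss / accuracy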
def main():
    global global_step

    config = load_config()

    set_seed(config)
    setup_cudnn(config)

    epoch_seeds = np.random.randint(np.iinfo(np.int32).max // 2,
                                    size=config.scheduler.epochs)

    if config.train.distributed:
        dist.init_process_group(backend=config.train.dist.backend,
                                init_method=config.train.dist.init_method,
                                rank=config.train.dist.node_rank,
                                world_size=config.train.dist.world_size)
        torch.cuda.set_device(config.train.dist.local_rank)

    output_dir = pathlib.Path(config.train.output_dir)
    if get_rank() == 0:
        if not config.train.resume and output_dir.exists():
            raise RuntimeError(
                f'Output directory `{output_dir.as_posix()}` already exists')
        output_dir.mkdir(exist_ok=True, parents=True)
        if not config.train.resume:
            save_config(config, output_dir / 'config.yaml')
            save_config(get_env_info(config), output_dir / 'env.yaml')
            diff = find_config_diff(config)
            if diff is not None:
                save_config(diff, output_dir / 'config_min.yaml')

    logger = create_logger(name=__name__,
                           distributed_rank=get_rank(),
                           output_dir=output_dir,
                           filename='log.txt')
    logger.info(config)
    logger.info(get_env_info(config))

    train_loader, val_loader = create_dataloader(config, is_train=True)

    model = create_model(config)
    macs, n_params = count_op(config, model)
    logger.info(f'MACs : {macs}')
    logger.info(f'#params: {n_params}')

    optimizer = create_optimizer(config, model)
    model, optimizer = apex.amp.initialize(model,
                                           optimizer,
                                           opt_level=config.train.precision)
    model = apply_data_parallel_wrapper(config, model)

    scheduler = create_scheduler(config,
                                 optimizer,
                                 steps_per_epoch=len(train_loader))
    checkpointer = Checkpointer(model,
                                optimizer=optimizer,
                                scheduler=scheduler,
                                save_dir=output_dir,
                                save_to_disk=get_rank() == 0)

    start_epoch = config.train.start_epoch
    scheduler.last_epoch = start_epoch
    if config.train.resume:
        checkpoint_config = checkpointer.resume_or_load('', resume=True)
        global_step = checkpoint_config['global_step']
        start_epoch = checkpoint_config['epoch']
        config.defrost()
        config.merge_from_other_cfg(ConfigNode(checkpoint_config['config']))
        config.freeze()
    elif config.train.checkpoint != '':
        checkpoint = torch.load(config.train.checkpoint, map_location='cpu')
        if isinstance(model,
                      (nn.DataParallel, nn.parallel.DistributedDataParallel)):
            model.module.load_state_dict(checkpoint['model'])
        else:
            model.load_state_dict(checkpoint['model'])

    if get_rank() == 0 and config.train.use_tensorboard:
        tensorboard_writer = create_tensorboard_writer(
            config, output_dir, purge_step=config.train.start_epoch + 1)
        tensorboard_writer2 = create_tensorboard_writer(
            config, output_dir / 'running', purge_step=global_step + 1)
    else:
        tensorboard_writer = DummyWriter()
        tensorboard_writer2 = DummyWriter()

    train_loss, val_loss = create_loss(config)

    if (config.train.val_period > 0 and start_epoch == 0
            and config.train.val_first):
        validate(0, config, model, val_loss, val_loader, logger,
                 tensorboard_writer)

    for epoch, seed in enumerate(epoch_seeds[start_epoch:], start_epoch):
        epoch += 1

        np.random.seed(seed)
        train(epoch, config, model, optimizer, scheduler, train_loss,
              train_loader, logger, tensorboard_writer, tensorboard_writer2)

        if config.train.val_period > 0 and (epoch %
                                            config.train.val_period == 0):
            validate(epoch, config, model, val_loss, val_loader, logger,
                     tensorboard_writer)

        tensorboard_writer.flush()
        tensorboard_writer2.flush()

        if (epoch % config.train.checkpoint_period == 0) or (
                epoch == config.scheduler.epochs):
            checkpoint_config = {
                'epoch': epoch,
                'global_step': global_step,
                'config': config.as_dict(),
            }
            checkpointer.save(f'checkpoint_{epoch:05d}', **checkpoint_config)

    tensorboard_writer.close()
    tensorboard_writer2.close()
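
# Note: main() above and train() below declare `global global_step`, but the
# module-level definition is not shown in this listing. The code assumes
# something like the following near the imports (otherwise a fresh,
# non-resumed run would hit a NameError on the first `global_step + 1`):
global_step = 0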
def validate(epoch, config, model, loss_func, val_loader, logger,
             tensorboard_writer):
    logger.info(f'Val {epoch}')

    device = torch.device(config.device)

    model.eval()

    loss_meter = AverageMeter()
    acc1_meter = AverageMeter()
    acc5_meter = AverageMeter()
    start = time.time()
    with torch.no_grad():
        for step, (data, targets) in enumerate(val_loader):
            if get_rank() == 0:
                if config.tensorboard.val_images:
                    if epoch == 0 and step == 0:
                        image = torchvision.utils.make_grid(data,
                                                            normalize=True,
                                                            scale_each=True)
                        tensorboard_writer.add_image('Val/Image', image, epoch)

            data = data.to(
                device,
                non_blocking=config.validation.dataloader.non_blocking)
            targets = targets.to(device)

            outputs = model(data)
            loss = loss_func(outputs, targets)

            acc1, acc5 = compute_accuracy(config,
                                          outputs,
                                          targets,
                                          augmentation=False,
                                          topk=(1, 5))

            if config.train.distributed:
                loss_all_reduce = dist.all_reduce(loss,
                                                  op=dist.ReduceOp.SUM,
                                                  async_op=True)
                acc1_all_reduce = dist.all_reduce(acc1,
                                                  op=dist.ReduceOp.SUM,
                                                  async_op=True)
                acc5_all_reduce = dist.all_reduce(acc5,
                                                  op=dist.ReduceOp.SUM,
                                                  async_op=True)
                loss_all_reduce.wait()
                acc1_all_reduce.wait()
                acc5_all_reduce.wait()
                loss.div_(dist.get_world_size())
                acc1.div_(dist.get_world_size())
                acc5.div_(dist.get_world_size())
            loss = loss.item()
            acc1 = acc1.item()
            acc5 = acc5.item()

            num = data.size(0)
            loss_meter.update(loss, num)
            acc1_meter.update(acc1, num)
            acc5_meter.update(acc5, num)

            torch.cuda.synchronize()

    logger.info(f'Epoch {epoch} '
                f'loss {loss_meter.avg:.4f} '
                f'acc@1 {acc1_meter.avg:.4f} '
                f'acc@5 {acc5_meter.avg:.4f}')

    elapsed = time.time() - start
    logger.info(f'Elapsed {elapsed:.2f}')

    if get_rank() == 0:
        if epoch > 0:
            tensorboard_writer.add_scalar('Val/Loss', loss_meter.avg, epoch)
            tensorboard_writer.add_scalar('Val/Acc1', acc1_meter.avg, epoch)
            tensorboard_writer.add_scalar('Val/Acc5', acc5_meter.avg, epoch)
            tensorboard_writer.add_scalar('Val/Time', elapsed, epoch)
        if config.tensorboard.model_params:
            for name, param in model.named_parameters():
                tensorboard_writer.add_histogram(name, param, epoch)
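
# AverageMeter is referenced above and below but not defined in this listing.
# A minimal sketch of what the meters are assumed to provide (`val`, `avg`,
# and `update(value, n)`) is shown here; the project's actual class may differ.
class AverageMeter:
    def __init__(self):
        self.val = 0.0   # most recent value
        self.sum = 0.0   # weighted sum of values
        self.count = 0   # total weight (number of samples)
        self.avg = 0.0   # running average

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count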
def train(epoch, config, model, optimizer, scheduler, loss_func, train_loader,
          logger, tensorboard_writer, tensorboard_writer2):
    global global_step

    logger.info(f'Train {epoch} {global_step}')

    device = torch.device(config.device)

    model.train()

    loss_meter = AverageMeter()
    acc1_meter = AverageMeter()
    acc5_meter = AverageMeter()
    start = time.time()
    for step, (data, targets) in enumerate(train_loader):
        step += 1
        global_step += 1

        if get_rank() == 0 and step == 1:
            if config.tensorboard.train_images:
                image = torchvision.utils.make_grid(data,
                                                    normalize=True,
                                                    scale_each=True)
                tensorboard_writer.add_image('Train/Image', image, epoch)

        data = data.to(device,
                       non_blocking=config.train.dataloader.non_blocking)
        targets = send_targets_to_device(config, targets, device)

        data_chunks, target_chunks = subdivide_batch(config, data, targets)
        optimizer.zero_grad()
        outputs = []
        losses = []
        for data_chunk, target_chunk in zip(data_chunks, target_chunks):
            if config.augmentation.use_dual_cutout:
                w = data_chunk.size(3) // 2
                data1 = data_chunk[:, :, :, :w]
                data2 = data_chunk[:, :, :, w:]
                outputs1 = model(data1)
                outputs2 = model(data2)
                output_chunk = torch.cat(
                    (outputs1.unsqueeze(1), outputs2.unsqueeze(1)), dim=1)
            else:
                output_chunk = model(data_chunk)
            outputs.append(output_chunk)

            loss = loss_func(output_chunk, target_chunk)
            if "CIFAR10_CM" in config.dataset.name:
                # Added by W210 Team
                loss = loss.mean()
            losses.append(loss)
            with apex.amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()

        outputs = torch.cat(outputs)

        if config.train.gradient_clip > 0:
            torch.nn.utils.clip_grad_norm_(apex.amp.master_params(optimizer),
                                           config.train.gradient_clip)
        if config.train.subdivision > 1:
            for param in model.parameters():
                param.grad.data.div_(config.train.subdivision)
        optimizer.step()

        acc1, acc5 = compute_accuracy(config,
                                      outputs,
                                      targets,
                                      augmentation=True,
                                      topk=(1, 5))

        loss = sum(losses)
        if config.train.distributed:
            loss_all_reduce = dist.all_reduce(loss,
                                              op=dist.ReduceOp.SUM,
                                              async_op=True)
            acc1_all_reduce = dist.all_reduce(acc1,
                                              op=dist.ReduceOp.SUM,
                                              async_op=True)
            acc5_all_reduce = dist.all_reduce(acc5,
                                              op=dist.ReduceOp.SUM,
                                              async_op=True)
            loss_all_reduce.wait()
            acc1_all_reduce.wait()
            acc5_all_reduce.wait()
            loss.div_(dist.get_world_size())
            acc1.div_(dist.get_world_size())
            acc5.div_(dist.get_world_size())
        loss = loss.item()
        acc1 = acc1.item()
        acc5 = acc5.item()

        num = data.size(0)
        loss_meter.update(loss, num)
        acc1_meter.update(acc1, num)
        acc5_meter.update(acc5, num)

        torch.cuda.synchronize()

        if get_rank() == 0:
            if step % config.train.log_period == 0 or step == len(
                    train_loader):
                logger.info(
                    f'Epoch {epoch} '
                    f'Step {step}/{len(train_loader)} '
                    f'lr {scheduler.get_last_lr()[0]:.6f} '
                    f'loss {loss_meter.val:.4f} ({loss_meter.avg:.4f}) '
                    f'acc@1 {acc1_meter.val:.4f} ({acc1_meter.avg:.4f}) '
                    f'acc@5 {acc5_meter.val:.4f} ({acc5_meter.avg:.4f})')

                tensorboard_writer2.add_scalar('Train/RunningLoss',
                                               loss_meter.avg, global_step)
                tensorboard_writer2.add_scalar('Train/RunningAcc1',
                                               acc1_meter.avg, global_step)
                tensorboard_writer2.add_scalar('Train/RunningAcc5',
                                               acc5_meter.avg, global_step)
                tensorboard_writer2.add_scalar('Train/RunningLearningRate',
                                               scheduler.get_last_lr()[0],
                                               global_step)

        scheduler.step()

    if get_rank() == 0:
        elapsed = time.time() - start
        logger.info(f'Elapsed {elapsed:.2f}')
        tensorboard_writer.add_scalar('Train/Loss', loss_meter.avg, epoch)
        tensorboard_writer.add_scalar('Train/Acc1', acc1_meter.avg, epoch)
        tensorboard_writer.add_scalar('Train/Acc5', acc5_meter.avg, epoch)
        tensorboard_writer.add_scalar('Train/Time', elapsed, epoch)
        tensorboard_writer.add_scalar('Train/LearningRate',
                                      scheduler.get_last_lr()[0], epoch)
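
# subdivide_batch() is used above to split a batch into smaller chunks so that
# gradients can be accumulated over `config.train.subdivision` forward/backward
# passes. A minimal sketch under the assumption that `targets` is a single
# tensor is shown below; the project's real helper also has to handle the
# tuple targets produced by mixup-style augmentations. The name
# subdivide_batch_sketch is hypothetical.
import torch

def subdivide_batch_sketch(config, data, targets):
    n = config.train.subdivision
    if n <= 1:
        return [data], [targets]
    # torch.chunk splits along the batch dimension; the last chunk may be smaller.
    return list(torch.chunk(data, n)), list(torch.chunk(targets, n))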
def main():
    global global_step

    config = load_config()

    set_seed(config)
    setup_cudnn(config)

    # np.iinfo(np.int32).max is the machine limit (upper bound) of int32;
    # each epoch gets its own random seed drawn from [0, max // 2).
    epoch_seeds = np.random.randint(np.iinfo(np.int32).max // 2,
                                    size=config.scheduler.epochs)

    if config.train.distributed:
        dist.init_process_group(backend=config.train.dist.backend,
                                init_method=config.train.dist.init_method,
                                rank=config.train.dist.node_rank,
                                world_size=config.train.dist.world_size)
        torch.cuda.set_device(config.train.dist.local_rank)

    output_dir = pathlib.Path(config.train.output_dir)
    if get_rank() == 0:
        if not config.train.resume and output_dir.exists():
            raise RuntimeError(
                f'Output directory `{output_dir.as_posix()}` already exists')
        output_dir.mkdir(exist_ok=True, parents=True)
        if not config.train.resume:
            # When starting a fresh run (not resuming), save the current
            # config, the environment info, and the difference between the
            # current and default config.
            save_config(config, output_dir / 'config.yaml')
            save_config(get_env_info(config), output_dir / 'env.yaml')
            diff = find_config_diff(config)
            if diff is not None:
                save_config(diff, output_dir / 'config_min.yaml')

    logger = create_logger(name=__name__,
                           distributed_rank=get_rank(),
                           output_dir=output_dir,
                           filename='log.txt')
    logger.info(config)
    logger.info(get_env_info(config))

    train_loader, val_loader = create_dataloader(config, is_train=True)

    model = create_model(config)
    # MACs: multiply-accumulate operations.
    macs, n_params = count_op(config, model)
    logger.info(f'MACs : {macs}')
    logger.info(f'#params: {n_params}')

    # Create the optimizer: SGD with Nesterov momentum, Adam, AMSGrad,
    # AdaBound, AdaBoundW, or LARS.
    optimizer = create_optimizer(config, model)

    # AMP (automatic mixed precision) setup; skipped on CPU.
    if config.device != 'cpu':
        model, optimizer = apex.amp.initialize(
            model, optimizer, opt_level=config.train.precision)

    # Wrap the model for data-parallel or distributed training.
    model = apply_data_parallel_wrapper(config, model)

    # Set up the scheduler and warmup scheduler;
    # steps_per_epoch is the number of batches in one epoch.
    scheduler = create_scheduler(config,
                                 optimizer,
                                 steps_per_epoch=len(train_loader))

    # Create the checkpointer; torch's default checkpoint saving is not used
    # because it cannot save the scheduler.
    checkpointer = Checkpointer(model,
                                optimizer=optimizer,
                                scheduler=scheduler,
                                save_dir=output_dir,
                                save_to_disk=get_rank() == 0)

    start_epoch = config.train.start_epoch
    # last_epoch is used when resuming; normally training starts from
    # config.train.start_epoch.
    scheduler.last_epoch = start_epoch

    # Resuming supports two modes:
    # 1. resume = True: load the last training checkpoint and pick up the
    #    global step and config from it.
    # 2. resume = False with config.train.checkpoint set: load that checkpoint
    #    to CPU and restore only the model weights.
    if config.train.resume:
        checkpoint_config = checkpointer.resume_or_load('', resume=True)
        global_step = checkpoint_config['global_step']
        start_epoch = checkpoint_config['epoch']
        config.defrost()
        config.merge_from_other_cfg(ConfigNode(checkpoint_config['config']))
        config.freeze()
    elif config.train.checkpoint != '':
        checkpoint = torch.load(config.train.checkpoint, map_location='cpu')
        if isinstance(model,
                      (nn.DataParallel, nn.parallel.DistributedDataParallel)):
            model.module.load_state_dict(checkpoint['model'])
        else:
            model.load_state_dict(checkpoint['model'])

    # Two TensorBoard writers:
    # the first logs per-epoch metrics for this run (possibly a resumed one),
    # the second follows the global step and records running metrics.
    if get_rank() == 0 and config.train.use_tensorboard:
        tensorboard_writer = create_tensorboard_writer(
            config, output_dir, purge_step=config.train.start_epoch + 1)
        tensorboard_writer2 = create_tensorboard_writer(
            config, output_dir / 'running', purge_step=global_step + 1)
    else:
        tensorboard_writer = DummyWriter()
        tensorboard_writer2 = DummyWriter()

    train_loss, val_loss = create_loss(config)

    if (config.train.val_period > 0 and start_epoch == 0
            and config.train.val_first):
        # Validate the model at epoch 0, before any training.
        validate(0, config, model, val_loss, val_loader, logger,
                 tensorboard_writer)

    for epoch, seed in enumerate(epoch_seeds[start_epoch:], start_epoch):
        epoch += 1

        np.random.seed(seed)
        train(epoch, config, model, optimizer, scheduler, train_loss,
              train_loader, logger, tensorboard_writer, tensorboard_writer2)

        if config.train.val_period > 0 and (epoch %
                                            config.train.val_period == 0):
            validate(epoch, config, model, val_loss, val_loader, logger,
                     tensorboard_writer)

        tensorboard_writer.flush()
        tensorboard_writer2.flush()

        if (epoch % config.train.checkpoint_period == 0) or (
                epoch == config.scheduler.epochs):
            checkpoint_config = {
                'epoch': epoch,
                'global_step': global_step,
                'config': config.as_dict(),
            }
            checkpointer.save(f'checkpoint_{epoch:05d}', **checkpoint_config)

    tensorboard_writer.close()
    tensorboard_writer2.close()
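
# On non-zero ranks (or when TensorBoard is disabled) the code above swaps in
# DummyWriter objects. A plausible no-op sketch is shown below; the actual
# class is defined elsewhere in the project and only needs to accept the same
# calls that the SummaryWriter-like writers receive here.
class DummyWriter:
    def add_scalar(self, *args, **kwargs): pass
    def add_image(self, *args, **kwargs): pass
    def add_histogram(self, *args, **kwargs): pass
    def flush(self): pass
    def close(self): pass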
def train(epoch, config, model, optimizer, scheduler, loss_func, train_loader,
          logger, tensorboard_writer, tensorboard_writer2):
    global global_step

    logger.info(f'Train {epoch} {global_step}')

    device = torch.device(config.device)

    model.train()

    loss_meter = AverageMeter()
    acc1_meter = AverageMeter()
    acc5_meter = AverageMeter()
    start = time.time()
    for step, (data, targets) in enumerate(train_loader):
        # Every step is one iteration (one batch).
        step += 1
        global_step += 1

        if get_rank() == 0 and step == 1:
            if config.tensorboard.train_images:
                image = torchvision.utils.make_grid(data,
                                                    normalize=True,
                                                    scale_each=True)
                tensorboard_writer.add_image('Train/Image', image, epoch)

        data = data.to(device,
                       non_blocking=config.train.dataloader.non_blocking)
        # The targets are not always a single label tensor: with augmentations
        # such as mixup, multiple label tensors can occur and each has to be
        # moved to the device separately.
        targets = send_targets_to_device(config, targets, device)

        data_chunks, target_chunks = subdivide_batch(config, data, targets)
        optimizer.zero_grad()
        outputs = []
        losses = []
        for data_chunk, target_chunk in zip(data_chunks, target_chunks):
            if config.augmentation.use_dual_cutout:
                w = data_chunk.size(3) // 2
                data1 = data_chunk[:, :, :, :w]
                data2 = data_chunk[:, :, :, w:]
                outputs1 = model(data1)
                outputs2 = model(data2)
                output_chunk = torch.cat(
                    (outputs1.unsqueeze(1), outputs2.unsqueeze(1)), dim=1)
            else:
                output_chunk = model(data_chunk)
            outputs.append(output_chunk)

            loss = loss_func(output_chunk, target_chunk)
            # `loss` is used for computing and accumulating the gradients;
            # `losses` only collects the per-chunk losses for logging and is
            # not involved in backpropagation.
            losses.append(loss)
            if config.device != 'cpu':
                with apex.amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

        # Concatenate the per-chunk outputs back into one batch.
        outputs = torch.cat(outputs)

        if config.train.gradient_clip > 0:
            # If the gradient norm exceeds the value specified in the config,
            # rescale the gradients to that norm.
            if config.device != 'cpu':
                torch.nn.utils.clip_grad_norm_(
                    apex.amp.master_params(optimizer),
                    config.train.gradient_clip)
            else:
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               config.train.gradient_clip)
        if config.train.subdivision > 1:
            for param in model.parameters():
                # The accumulated gradients must be divided (averaged) by the
                # number of subdivisions.
                param.grad.data.div_(config.train.subdivision)

        # Apply the accumulated gradients.
        optimizer.step()

        acc1, acc5 = compute_accuracy(config,
                                      outputs,
                                      targets,
                                      augmentation=True,
                                      topk=(1, 5))

        loss = sum(losses)
        if config.train.distributed:
            loss_all_reduce = dist.all_reduce(loss,
                                              op=dist.ReduceOp.SUM,
                                              async_op=True)
            acc1_all_reduce = dist.all_reduce(acc1,
                                              op=dist.ReduceOp.SUM,
                                              async_op=True)
            acc5_all_reduce = dist.all_reduce(acc5,
                                              op=dist.ReduceOp.SUM,
                                              async_op=True)
            loss_all_reduce.wait()
            acc1_all_reduce.wait()
            acc5_all_reduce.wait()
            loss.div_(dist.get_world_size())
            acc1.div_(dist.get_world_size())
            acc5.div_(dist.get_world_size())
        loss = loss.item()
        acc1 = acc1.item()
        acc5 = acc5.item()

        num = data.size(0)
        loss_meter.update(loss, num)
        acc1_meter.update(acc1, num)
        acc5_meter.update(acc5, num)

        torch.cuda.synchronize()

        if get_rank() == 0:
            if step % config.train.log_period == 0 or step == len(
                    train_loader):
                logger.info(
                    f'Epoch {epoch} '
                    f'Step {step}/{len(train_loader)} '
                    f'lr {scheduler.get_last_lr()[0]:.6f} '
                    f'loss {loss_meter.val:.4f} ({loss_meter.avg:.4f}) '
                    f'acc@1 {acc1_meter.val:.4f} ({acc1_meter.avg:.4f}) '
                    f'acc@5 {acc5_meter.val:.4f} ({acc5_meter.avg:.4f})')

                tensorboard_writer2.add_scalar('Train/RunningLoss',
                                               loss_meter.avg, global_step)
                tensorboard_writer2.add_scalar('Train/RunningAcc1',
                                               acc1_meter.avg, global_step)
                tensorboard_writer2.add_scalar('Train/RunningAcc5',
                                               acc5_meter.avg, global_step)
                tensorboard_writer2.add_scalar('Train/RunningLearningRate',
                                               scheduler.get_last_lr()[0],
                                               global_step)

        scheduler.step()

    if get_rank() == 0:
        elapsed = time.time() - start
        logger.info(f'Elapsed {elapsed:.2f}')
        tensorboard_writer.add_scalar('Train/Loss', loss_meter.avg, epoch)
        tensorboard_writer.add_scalar('Train/Acc1', acc1_meter.avg, epoch)
        tensorboard_writer.add_scalar('Train/Acc5', acc5_meter.avg, epoch)
        tensorboard_writer.add_scalar('Train/Time', elapsed, epoch)
        tensorboard_writer.add_scalar('Train/LearningRate',
                                      scheduler.get_last_lr()[0], epoch)
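
# compute_accuracy() is called above with topk=(1, 5). For plain (non-augmented)
# integer targets this is standard top-k accuracy; a self-contained sketch is
# given below. The project's version additionally handles augmented targets
# (e.g. mixup pairs and dual-cutout outputs), which is not reproduced here.
# The name topk_accuracy_sketch is hypothetical.
import torch

def topk_accuracy_sketch(outputs, targets, topk=(1, 5)):
    maxk = max(topk)
    # Indices of the maxk highest-scoring classes per sample: (batch, maxk).
    _, pred = outputs.topk(maxk, dim=1)
    # Boolean matrix marking where a top-k prediction matches the target.
    correct = pred.eq(targets.unsqueeze(1))
    accs = []
    for k in topk:
        # A sample counts as correct at k if any of its top-k predictions match.
        accs.append(correct[:, :k].any(dim=1).float().mean())
    return accs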