def reduce_buffers(self, bs):
    # Synchronize the running statistics of every randomly wired stage across
    # workers. `bs` is the local batch size, used as the reduction weight;
    # the buffers only change during training, so eval skips the reduction.
    for stage in self.stages:
        if isinstance(stage, RandomlyWiredStage) and self.training:
            bs, r_mean, r_var, r_usage = batch_reduce(
                bs, stage.nn_running_mean, stage.nn_running_var,
                stage.node_running_usage)
            stage.nn_running_mean.copy_(r_mean)
            stage.nn_running_var.copy_(r_var)
            stage.node_running_usage.copy_(r_usage)
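# --- A minimal sketch of the `batch_reduce` helper assumed above. Its body is
# not shown in this file, so everything below is an assumption inferred from
# the call sites: it appears to sum the batch size across workers and return
# the batch-size-weighted mean of every other tensor.
import torch
import torch.distributed as dist

def batch_reduce(bs, *tensors):
    """All-reduce: sum `bs` across ranks; return bs-weighted means of `tensors`."""
    local_bs = bs.float()
    # Weight each local statistic by the local batch size before summing.
    weighted = [t * local_bs for t in tensors]
    dist.all_reduce(bs, op=dist.ReduceOp.SUM)
    for w in weighted:
        dist.all_reduce(w, op=dist.ReduceOp.SUM)
    # Divide by the global batch size to recover the weighted means.
    return (bs, *(w / bs.float() for w in weighted))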
def validate(cur_gpu, val_loader, model, criterion, epoch, hparams):
    """Run one full validation pass; return epoch loss and top-1/top-5 accuracy."""
    logger = get_logger()
    model.eval()
    if logger:
        loss_meter = AverageMeter('val_loss')
        acc1_meter = AverageMeter('val_acc1')
        acc5_meter = AverageMeter('val_acc5')
    model_module = model.module if hparams.distributed_mode == 'gpus' else model
    for i, (image, target) in enumerate(val_loader):
        with torch.no_grad():
            if cur_gpu >= 0:
                image = image.cuda(cur_gpu, non_blocking=True)
                target = target.cuda(cur_gpu, non_blocking=True)
            if hparams.fp16:
                image = image.half()
            output = model(image)
            if hparams.fp16:
                output = output.float()
            loss = criterion(output, target,
                             label_smoothing=hparams.label_smoothing)
            acc1, acc5 = accuracy(output, target, topk=(1, 5))
            bs = torch.tensor(image.size(0),
                              device=('cuda:%d' % cur_gpu) if cur_gpu >= 0 else None)
            if hparams.distributed_mode == 'gpus':
                # Average loss and accuracies across workers, weighted by
                # each worker's batch size.
                bs, loss, acc1, acc5 = batch_reduce(bs, loss, acc1, acc5)
            if logger:
                loss_meter.update(loss.item(), bs.item())
                acc1_meter.update(acc1.item(), bs.item())
                acc5_meter.update(acc5.item(), bs.item())
    loss, acc1, acc5 = None, None, None
    if logger:
        metrics = [('val_loss', loss_meter.result),
                   ('val_acc1', acc1_meter.result),
                   ('val_acc5', acc5_meter.result)]
        logger.log_metrics(metrics, epoch + 1, 0, 'val')
        logger.log_summaries(model_module.get_summaries(), epoch + 1, 0, 'val')
        loss, acc1, acc5 = loss_meter.result, acc1_meter.result, acc5_meter.result
    return loss, acc1, acc5
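# --- Minimal sketches of two helpers assumed by train()/validate():
# `AverageMeter` (a batch-size-weighted running mean exposing `.result`) and
# `accuracy` (the standard top-k recipe from the PyTorch ImageNet example).
# Neither implementation appears in this file; both bodies below are
# assumptions inferred from the call sites.
import torch

class AverageMeter:
    def __init__(self, name):
        self.name = name
        self.sum = 0.0
        self.count = 0

    def update(self, value, n=1):
        # `value` is a per-batch mean, so weight it by the batch size `n`.
        self.sum += value * n
        self.count += n

    @property
    def result(self):
        return self.sum / max(self.count, 1)

    def __str__(self):
        return '%s: %.4f' % (self.name, self.result)

def accuracy(output, target, topk=(1,)):
    """Top-k accuracy in percent, one tensor per requested k."""
    with torch.no_grad():
        maxk = max(topk)
        _, pred = output.topk(maxk, dim=1, largest=True, sorted=True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))
        res = []
        for k in topk:
            correct_k = correct[:k].reshape(-1).float().sum(0)
            res.append(correct_k.mul_(100.0 / target.size(0)))
        return res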
def train(cur_gpu, train_loader, model, criterion, optimizer, lr_scheduler,
          params, params_clone, epoch, hparams):
    """Train for one epoch, logging reduced metrics every `print_freq` steps."""
    logger = get_logger()
    monitor = get_monitor()
    model.train()
    if logger:
        loss_meter = AverageMeter('train_loss')
        acc1_meter = AverageMeter('train_acc1')
        acc5_meter = AverageMeter('train_acc5')
    model_module = model.module if hparams.distributed_mode == 'gpus' else model
    for i, (image, target) in enumerate(train_loader):
        if monitor:
            monitor.before_step()
        if cur_gpu >= 0:
            image = image.cuda(cur_gpu, non_blocking=True)
            target = target.cuda(cur_gpu, non_blocking=True)
        if hparams.fp16:
            image = image.half()
        output = model(image)
        if hparams.fp16:
            output = output.float()
        loss = criterion(output, target,
                         label_smoothing=hparams.label_smoothing)
        # Detached copy of the loss for cross-worker reduction and logging,
        # so the autograd graph is left untouched.
        loss_ = loss.data.clone()
        acc1, acc5 = accuracy(output, target, topk=(1, 5))
        bs = torch.tensor(image.size(0),
                          device=('cuda:%d' % cur_gpu) if cur_gpu >= 0 else None)
        if hparams.distributed_mode == 'gpus':
            model_module.reduce_buffers(bs)
            bs, loss_, acc1, acc5 = batch_reduce(bs, loss_, acc1, acc5)
        if logger:
            # Log the reduced copy, not the local, still-attached loss.
            loss_meter.update(loss_.item(), bs.item())
            acc1_meter.update(acc1.item(), bs.item())
            acc5_meter.update(acc5.item(), bs.item())
        if hparams.fp16:
            # Static loss scaling: scale up before backward, then unscale the
            # fp32 master gradients before the optimizer step.
            loss = loss * hparams.loss_scale
            model.zero_grad()
            loss.backward()
            copy_grads(params, params_clone)
            for p in params_clone:
                p.grad.data.div_(hparams.loss_scale)
            optimizer.step()
            copy_params(params_clone, params)
            torch.cuda.synchronize()
        else:
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        step = i + 1
        lr_scheduler.step(epoch=epoch + step / hparams.steps_per_epoch)
        if logger and step % hparams.print_freq == 0:
            metrics = [('train_loss', loss_meter.result),
                       ('train_acc1', acc1_meter.result),
                       ('train_acc5', acc5_meter.result),
                       ('lr', optimizer.param_groups[0]['lr'])]
            # epoch: zero-indexed for code, one-indexed for human reader
            logger.log_metrics(metrics, epoch + 1, step, 'train')
            logger.log_summaries(model_module.get_summaries(), epoch + 1, step, 'train')
        if monitor:
            monitor.after_step(str(loss_meter))
    if logger:
        # Final end-of-epoch log, in case the last step fell between print_freq ticks.
        metrics = [('train_loss', loss_meter.result),
                   ('train_acc1', acc1_meter.result),
                   ('train_acc5', acc5_meter.result),
                   ('lr', optimizer.param_groups[0]['lr'])]
        logger.log_metrics(metrics, epoch + 1, step, 'train')
        logger.log_summaries(model_module.get_summaries(), epoch + 1, step, 'train')
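# --- Minimal sketches of the fp16 master-weight helpers assumed by train():
# `params` are taken to be the model's half-precision parameters and
# `params_clone` their fp32 master copies. These bodies are assumptions
# inferred from the call sites, not the original implementations.
import torch

def copy_grads(params, params_clone):
    # fp16 gradients -> fp32 master gradients (the caller unscales them).
    for p, p32 in zip(params, params_clone):
        if p32.grad is None:
            p32.grad = torch.empty_like(p32.data)
        p32.grad.data.copy_(p.grad.data)

def copy_params(params_clone, params):
    # Updated fp32 master weights -> fp16 model weights, after optimizer.step().
    for p32, p in zip(params_clone, params):
        p.data.copy_(p32.data)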