def prepare(args, e_ix_ln, r_ix_ln, t_ix_ln):
    mdl = _model(args, e_ix_ln, r_ix_ln, t_ix_ln)
    lr_ml = (hvd.local_size() if hvd.nccl_built() else 1) \
        if not args.tpu and args.adasum else _size(args)
    opt = torch.optim.Adam(mdl.parameters(),
                           lr=lr_ml * args.learning_rate,
                           weight_decay=args.weight_decay)
    st_e, bst_ls = _resume(args, mdl, opt) if args.resume != '' else (1, None)
    if not args.tpu:
        opt = hvd.DistributedOptimizer(
            opt,
            named_parameters=mdl.named_parameters(),
            compression=hvd.Compression.fp16 if args.fp16 else hvd.Compression.none,
            op=hvd.Adasum if args.adasum else hvd.Average)
        hvd.broadcast_parameters(mdl.state_dict(), root_rank=0)
    lr_sc = torch.optim.lr_scheduler.StepLR(opt,
                                            step_size=args.learning_rate_step,
                                            gamma=args.learning_rate_gamma)
    if not args.tpu:
        hvd.broadcast_optimizer_state(opt, root_rank=0)
    ls_f = _loss_f(args).to(args.dvc)
    return mdl, opt, lr_sc, ls_f, st_e, bst_ls
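# The nested conditional above that picks lr_ml is dense; an equivalent,
# unpacked form reads roughly as below (a sketch only, reusing the snippet's
# own `args` fields and `_size` helper):
if not args.tpu and args.adasum:
    # GPU Adasum with NCCL first averages within each node, so scale the
    # learning rate by the local size; otherwise leave the multiplier at 1.
    lr_ml = hvd.local_size() if hvd.nccl_built() else 1
else:
    # Plain sum/average (or TPU): scale by the world size.
    lr_ml = _size(args)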
def verify_communication(use_horovod, world_size):
    """Verifies that the communication between workers works as expected.

    It all-reduces a tensor of [1] and checks that the reduced value equals
    the world size.

    Args:
        use_horovod (bool): Use horovod for communication
        world_size (int): Distributed world size

    Raises:
        AssertionError: if the communication doesn't work as expected
    """
    if use_horovod:
        hvd.init()
        logger.info("Using horovod, rank = {}".format(hvd.rank()))
        # torch.distributed may not be initialized on this path, so pick the
        # device from CUDA availability instead of dist.get_backend().
        tensor = torch.tensor(
            [1],
            device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
        )
        res = hvd.allreduce(tensor, op=hvd.Sum)
        assert res[0] == world_size, "Communication is not working"
    else:
        logger.info("Using torch, rank={}".format(dist.get_rank()))
        tensor = torch.tensor(
            [1],
            device=torch.device("cuda" if dist.get_backend() == dist.Backend.NCCL else "cpu"),
        )
        dist.all_reduce(tensor, op=dist.ReduceOp.SUM)
        assert tensor[0] == world_size, "Communication is not working"
    if hvd:
        logger.info("NCCL Built={}, MPI Built={}, GLOO Built={}".format(
            hvd.nccl_built(), hvd.mpi_built(), hvd.gloo_built()))
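# A minimal way to exercise the check above (assumed usage, not from the
# snippet; the module-level hvd/dist imports are taken as given and the world
# size must match the launch command, e.g. `horovodrun -np 4 python check.py`):
EXPECTED_WORLD_SIZE = 4   # must match the -np value used at launch
verify_communication(use_horovod=True, world_size=EXPECTED_WORLD_SIZE)
# torch.distributed path (assumes the launcher set MASTER_ADDR/PORT/RANK/WORLD_SIZE):
# dist.init_process_group(backend="gloo")
# verify_communication(use_horovod=False, world_size=dist.get_world_size())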
def test_orthogonal(self):
    hvd.init()
    # TODO support non-MPI Adasum operation
    # Only do this test if there are GPUs available.
    if not hvd.mpi_enabled() or not torch.cuda.is_available():
        self.skipTest("No GPUs available")

    device = torch.device('cuda:{}'.format(hvd.local_rank()))
    np.random.seed(2)
    torch.manual_seed(2)
    size = hvd.size()
    local_size = hvd.local_size()
    rank = hvd.rank()

    for data_type in self.data_types:
        # With NCCL built, Horovod averages within each node before running
        # Adasum across nodes, so the expected result is the column sum
        # divided by the local size.
        denominator = local_size if hvd.nccl_built() else 1
        all_Ns = [size * 20 - 17, size * 2 + 1, size + 2, 2**19]
        tensors = []
        all_qs = []
        for N in all_Ns:
            a = np.random.normal(0, 1, (N, size)).astype(np.float64)
            q, r = np.linalg.qr(a)
            q = q.astype(data_type)
            all_qs.append(q.astype(np.float64))
            tensors.append(q[:, hvd.rank()])

        tensors = list(map(lambda x: torch.from_numpy(x).to(device), tensors))

        handles = [
            hvd.allreduce_async(tensor, op=hvd.Adasum) for tensor in tensors
        ]
        reduced_tensors = [synchronize(h) for h in handles]

        expected = [np.sum(q, axis=1) / denominator for q in all_qs]
        all_comp = [
            self.are_close(data_type, e, rt.cpu().numpy())
            for e, rt in zip(expected, reduced_tensors)
        ]
        if np.alltrue(all_comp):
            print('Orthogonal test passed')
        else:
            for c, e, rt in zip(all_comp, expected, reduced_tensors):
                if c == False:
                    print('computed: ', rt)
                    print('expected: ', e)
                    print('off by: ', self.diff_ratio(e, rt.cpu().numpy()))
        assert np.alltrue(all_comp)
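# Why orthogonal columns reduce to their plain sum: for two operands the Adasum
# rule shrinks each input by its projection onto the other, so the correction
# terms vanish when the inputs are orthogonal. A rough numpy sketch of that
# two-operand rule (my paraphrase of the published formula, independent of Horovod):
import numpy as np

def adasum_pair(g1, g2):
    # result = (1 - <g1,g2> / (2*||g1||^2)) * g1 + (1 - <g1,g2> / (2*||g2||^2)) * g2
    dot = np.dot(g1, g2)
    return ((1 - dot / (2 * np.dot(g1, g1))) * g1
            + (1 - dot / (2 * np.dot(g2, g2))) * g2)

print(adasum_pair(np.array([1.0, 0.0]), np.array([0.0, 3.0])))  # orthogonal inputs -> exact sum [1. 3.]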
val_loader = DataLoader(val_dataset,
                        batch_size=val_batch_size,
                        collate_fn=val_dataset.collate,
                        sampler=val_sampler,
                        **kwargs)

# Horovod: print logs on the first worker.
verbose = 1 if hvd.rank() == 0 else 0

# ------------ preparation ------------
net = SCNN(resize_shape, pretrained=True)
lr_scaler = 1
if torch.cuda.is_available():
    net.cuda()
    # Horovod: Scale learning rate as per number of devices
    if hvd.nccl_built():
        lr_scaler = hvd.local_size()
    net = torch.nn.DataParallel(net)

lr = exp_cfg['optim']['lr']
momentum = exp_cfg['optim']['momentum']
weight_decay = exp_cfg['optim']['weight_decay']
nesterov = exp_cfg['optim']['nesterov']

# Horovod: scale learning rate by lr_scaler.
optimizer = optim.SGD(net.parameters(),
                      lr=lr * lr_scaler,
                      momentum=momentum,
                      weight_decay=weight_decay,
                      nesterov=nesterov)
torch.backends.cudnn.deterministic = True
torch.cuda.set_device(hvd.local_rank())
config.rank = hvd.rank()
config.world = hvd.size()

# Only the first process on each node downloads the model; the broadcast
# below acts as a barrier so the other ranks wait for it to finish.
if hvd.local_rank() == 0:
    utils.download_model(config)
hvd.broadcast_object(0, root_rank=0)

model = x.Model(config)

start_time = time.time()
print('Loading dataset')
train_data, dev_data, test_data = utils.build_dataset(config)
train_iter = utils.build_dataloader(train_data, config)
dev_iter = utils.build_dataloader(dev_data, config)
test_iter = utils.build_dataloader(test_data, config)
time_dif = utils.get_time_dif(start_time)
print("Prepare data time: ", time_dif)

# Train, eval, test
model = model.to(config.device)
if not hvd.nccl_built():
    raise Exception("NCCL was not compiled in Horovod!")
train.train(config, model, train_iter, dev_iter, test_iter)
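# The download_model / broadcast_object pair above is a general pattern: run
# one-time setup on a single rank and use a cheap collective as a barrier so
# the other ranks wait. A minimal sketch with an illustrative helper name:
import horovod.torch as hvd

def prepare_local_cache():
    # hypothetical one-time, per-node setup (e.g. downloading pretrained weights)
    pass

hvd.init()
if hvd.local_rank() == 0:
    prepare_local_cache()
# Horovod only executes a collective once every rank has submitted it, so this
# broadcast also serves as a barrier for the ranks that skipped the setup.
hvd.broadcast_object(0, root_rank=0)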
def main(): args = parse_args() args.cuda = not args.no_cuda and torch.cuda.is_available() # Horovod: initialize library. hvd.init() torch.manual_seed(args.seed) local_rank = hvd.local_rank() world_size = hvd.size() if args.cuda: device = torch.device(f'cuda:{local_rank}') # Horovod: pin GPU to local rank. torch.cuda.set_device(device) torch.cuda.manual_seed(args.seed) # Horovod: limit # of CPU threads to be used per worker. torch.set_num_threads(1) kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {} # When supported, use 'forkserver' to spawn dataloader workers instead of 'fork' to prevent # issues with Infiniband implementations that are not fork-safe if (kwargs.get('num_workers', 0) > 0 and hasattr(mp, '_supports_context') and mp._supports_context and 'forkserver' in mp.get_all_start_methods()): kwargs['multiprocessing_context'] = 'forkserver' # Horovod: use DistributedSampler to partition the training data. data = prepare_datasets(args, rank=local_rank, num_workers=world_size, data='mnist') model = Net() # By default, Adasum doesn't need scaling up learning rate. lr_scaler = hvd.size() if not args.use_adasum else 1 if args.cuda: # Move model to GPU. model.cuda() # If using GPU Adasum allreduce, scale learning rate by local_size. if args.use_adasum and hvd.nccl_built(): lr_scaler = hvd.local_size() # Horovod: scale learning rate by lr_scaler. optimizer = optim.SGD(model.parameters(), lr=args.lr * lr_scaler, momentum=args.momentum) # Horovod: (optional) compression algorithm. compression = (hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none) # Horovod: wrap optimizer with DistributedOptimizer. optimizer = hvd.DistributedOptimizer( optimizer, named_parameters=model.named_parameters(), compression=compression, op=hvd.Adasum if args.use_adasum else hvd.Average, gradient_predivide_factor=args.gradient_predivide_factor) # Horovod: broadcast parameters & optimizer state. hvd.broadcast_parameters(model.state_dict(), root_rank=0) hvd.broadcast_optimizer_state(optimizer, root_rank=0) loss_fn = nn.CrossEntropyLoss() epoch_times = [] for epoch in range(1, args.epochs + 1): t0 = time.time() train(epoch, data['training'], rank=local_rank, model=model, loss_fn=loss_fn, optimizer=optimizer, args=args, scaler=None) if epoch > 2: epoch_times.append(time.time() - t0) if epoch % 10 == 0: if hvd.local_rank() == 0: accuracy = evaluate(model=model, test_loader=data['testing'].loader) logger.log('-' * 75) logger.log(f'Epoch: {epoch}, Accuracy: {accuracy}') logger.log('-' * 75) if local_rank == 0: epoch_times_str = ', '.join(str(x) for x in epoch_times) logger.log('Epoch times:') logger.log(epoch_times_str) outdir = os.path.join(os.getcwd(), 'results_mnist', f'size{world_size}') if not os.path.isdir(outdir): os.makedirs(outdir) modeldir = os.path.join(outdir, 'saved_models') modelfile = os.path.join(modeldir, 'hvd_model_mnist.pth') if not os.path.isdir(modeldir): os.makedirs(modeldir) logger.log(f'Saving model to: {modelfile}') torch.save(model.state_dict(), modelfile) args_file = os.path.join(outdir, f'args_size{world_size}.json') logger.log(f'Saving args to: {args_file}.') with open(args_file, 'at') as f: json.dump(args.__dict__, f, indent=4) times_file = os.path.join(outdir, f'epoch_times_size{world_size}.csv') logger.log(f'Saving epoch times to: {times_file}') with open(times_file, 'a') as f: f.write(epoch_times_str + '\n')
def main_worker(args_): args_.cuda = not args_.no_cuda and torch.cuda.is_available() allreduce_batch_size = args_.batch_size * args_.batches_per_allreduce hvd.init() torch.distributed.init_process_group('nccl', rank=4) if args_.cuda: # Horovod: pin GPU to local rank. torch.cuda.set_device(hvd.local_rank()) print(f"this process's hvd rank = {hvd.local_rank()}") # torch.cuda.manual_seed(args_.seed) # cudnn.benchmark = True # # If set > 0, will resume training from a given checkpoint. # resume_from_epoch = 0 # for try_epoch in range(args_.epochs, 0, -1): # if os.path.exists(args_.checkpoint_format.format(epoch=try_epoch)): # resume_from_epoch = try_epoch # break # # # Horovod: broadcast resume_from_epoch from rank 0 (which will have # # checkpoints) to other ranks. # resume_from_epoch = hvd.broadcast(torch.tensor(resume_from_epoch), root_rank=0, # name='resume_from_epoch').item() # # Horovod: print logs on the first worker. # verbose = 1 if hvd.rank() == 0 else 0 # # # Horovod: write TensorBoard logs on first worker. # try: # if LooseVersion(torch.__version__) >= LooseVersion('1.2.0'): # from torch.utils.tensorboard import SummaryWriter # else: # from tensorboardX import SummaryWriter # os.makedirs(os.path.join(args_.model_output_dir, 'logs'), exist_ok=True) # log_writer = SummaryWriter(os.path.join(args_.model_output_dir, 'logs')) if hvd.rank() == 0 else None # except ImportError: # log_writer = None ### MODEL CREATION ### # create model model1 = VQ_VAE(num_inputs=1, weight_matching=0., channel_var=np.ones((1,))) model2 = VQ_VAE(num_inputs=1, weight_matching=0.0005, channel_var=np.ones((1,))) model1.cuda() model2.cuda() model1 = torch.nn.parallel.DistributedDataParallel(model1) model2 = torch.nn.parallel.DistributedDataParallel(model2) # By default, Adasum doesn't need scaling up learning rate. # For sum/average with gradient Accumulation: scale learning rate by batches_per_allreduce if args_.cuda and args_.use_adasum and hvd.nccl_built(): # If using GPU Adasum allreduce, scale learning rate by local_size. lr_scaler = args_.batches_per_allreduce * hvd.local_size() elif not args_.use_adasum: lr_scaler = args_.batches_per_allreduce * hvd.size() else: lr_scaler = 1 # Horovod: scale learning rate by the number of GPUs. optimizer1 = t.optim.Adam(model1.parameters(), lr=(args_.base_lr * lr_scaler), betas=(.9, .999)) optimizer2 = t.optim.Adam(model2.parameters(), lr=(args_.base_lr * lr_scaler), betas=(.9, .999)) # Horovod: (optional) compression algorithm. compression = hvd.Compression.fp16 if args_.fp16_allreduce else hvd.Compression.none # Horovod: wrap optimizer with DistributedOptimizer. optimizer1 = hvd.DistributedOptimizer( optimizer1, named_parameters=model1.named_parameters(), compression=compression, backward_passes_per_step=args_.batches_per_allreduce, op=hvd.Adasum if args_.use_adasum else hvd.Average) optimizer2 = hvd.DistributedOptimizer( optimizer2, named_parameters=model2.named_parameters(), compression=compression, backward_passes_per_step=args_.batches_per_allreduce, op=hvd.Adasum if args_.use_adasum else hvd.Average) # # Restore from a previous checkpoint, if initial_epoch is specified. # # Horovod: restore on the first worker which will broadcast weights to other workers. 
# if resume_from_epoch > 0 and hvd.rank() == 0: # filepath = args.checkpoint_format.format(epoch=resume_from_epoch) # checkpoint = torch.load(filepath) # model.load_state_dict(checkpoint['model']) # optimizer.load_state_dict(checkpoint['optimizer']) ### Settings ### model_output_dir = args_.model_output_dir project_dir = args_.project_dir ### Prepare Data ### log.info("LOADING FILES") # ======= load data using pytorch systems ======== torch.set_num_threads(4) dataset = DatasetFolderWithPaths( root=project_dir+"/JUNE"+"/raw_patches", loader=npy_loader, extensions='.npy' ) dataset_mask = DatasetFolderWithPaths( root=project_dir+"/JUNE"+"/raw_masks", loader=npy_loader, extensions='.npy' ) relation_mat = np.load(os.path.join(project_dir, "JUNE", "raw_patches", "relation_mat.npy"), allow_pickle=True) # Horovod: use DistributedSampler to partition data among workers. Manually specify # `num_replicas=hvd.size()` and `rank=hvd.rank()`. train_sampler = torch.utils.data.distributed.DistributedSampler( dataset, num_replicas=hvd.size(), rank=hvd.rank()) train_sampler_mask = torch.utils.data.distributed.DistributedSampler( dataset_mask, num_replicas=hvd.size(), rank=hvd.rank()) os.makedirs(os.path.join(model_output_dir, "stage1"), exist_ok=True) os.makedirs(os.path.join(model_output_dir, "stage2"), exist_ok=True) # ========================================================= # ========================================================= log.info("TRAINING: STARTING STAGE 1") kwargs = {'num_workers': 4, 'pin_memory': True} if args_.cuda else {} train_loader = torch.utils.data.DataLoader( dataset, batch_size=allreduce_batch_size, sampler=train_sampler, **kwargs) train_mask_loader = torch.utils.data.DataLoader( dataset_mask, batch_size=allreduce_batch_size, sampler=train_sampler_mask, **kwargs) # Horovod: broadcast parameters & optimizer state. hvd.broadcast_parameters(model1.state_dict(), root_rank=0) hvd.broadcast_optimizer_state(optimizer1, root_rank=0) output_dir = os.path.join(model_output_dir, "stage1") writer = SummaryWriter(output_dir) log.info(f"\ttensorboard logs written to {output_dir}") for epoch in range(args_.stage1_epochs): model1.train() train_sampler.set_epoch(epoch) mean_loss = train(model1, train_loader, optimizer1, # relation_mat=relation_mat, mask_loader=train_mask_loader, args_=args_ ) for key, loss in mean_loss.items(): mean_loss[key] = sum(loss) / len(loss) if len(loss) > 0 else -1. writer.add_scalar('Loss/' + key, mean_loss[key], epoch) writer.flush() log.info('\tepoch %d' % epoch) log.info('\t'.join(['{}:{:0.4f} '.format(key, loss) for key, loss in mean_loss.items()])) # only master process should save checkpoints. if torch.distributed.get_rank() == 0: log.info(f'\t saving epoch {epoch}') t.save(model1.state_dict(), os.path.join(output_dir, 'model_epoch%d.pt' % epoch)) writer.close() # ========================================================= # ========================================================= log.info("TRAINING: STARTING STAGE 2") # get the last saved epoch. on IBM, use max(). 
# on OSX use min()
# s1_epochs = glob.glob(os.path.join(model_output_dir, "stage1", "/*"))
s1_epochs = glob.glob(os.path.join(model_output_dir, "stage1") + '/*.pt')
last_epoch = max(s1_epochs, key=os.path.getctime)
log.info(f"\tloading last epoch = {last_epoch}")

train_loader = torch.utils.data.DataLoader(dataset,
                                           batch_size=allreduce_batch_size,
                                           sampler=train_sampler)
train_mask_loader = torch.utils.data.DataLoader(dataset_mask,
                                                batch_size=allreduce_batch_size,
                                                sampler=train_sampler_mask)

# Horovod: broadcast parameters & optimizer state.
hvd.broadcast_parameters(model2.state_dict(), root_rank=0)
hvd.broadcast_optimizer_state(optimizer2, root_rank=0)

output_dir = os.path.join(model_output_dir, "stage2")
writer = SummaryWriter(output_dir)
log.info(f"\ttensorboard logs written to {output_dir}")

model2.load_state_dict(t.load(last_epoch))

for epoch in range(args_.stage2_epochs):
    model2.train()
    train_sampler.set_epoch(epoch)
    mean_loss = train(model2,
                      train_loader,
                      optimizer2,
                      # relation_mat=relation_mat,
                      mask_loader=train_mask_loader
                      )

    # shuffle samples ids at the end of the epoch
    # if shuffle_data:
    #     np.random.shuffle(sample_ids)

    for key, loss in mean_loss.items():
        mean_loss[key] = sum(loss) / len(loss) if len(loss) > 0 else -1.
        writer.add_scalar('Loss/' + key, mean_loss[key], epoch)
    writer.flush()
    log.info('\tepoch %d' % epoch)
    log.info('\t'.join(['{}:{:0.4f} '.format(key, loss) for key, loss in mean_loss.items()]))

    # only master process should save checkpoints.
    if torch.distributed.get_rank() == 0:
        log.info(f'\t saving epoch {epoch}')
        t.save(model2.state_dict(),
               os.path.join(output_dir, 'model_epoch%d.pt' % epoch))

writer.close()
def main(): global args, best_prec1, best_prec5 args = parser.parse_args() args.cuda = not args.no_cuda and torch.cuda.is_available() #horovod initialize hvd.init() log = None if hvd.rank() == 0: log = SummaryWriter(log_dir=args.log_dir) print('The Training Model is %s' % args.arch) # Check the save_dir exists or not if not os.path.exists(args.save_dir): os.makedirs(args.save_dir) if args.cuda: torch.cuda.set_device(hvd.local_rank()) normalize = transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)) # Horovod: limit # of CPU threads to be used per worker. torch.set_num_threads(1) kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {} # When supported, use 'forkserver' to spawn dataloader workers instead of 'fork' to prevent # issues with Infiniband implementations that are not fork-safe if (kwargs.get('num_workers', 0) > 0 and hasattr(mp, '_supports_context') and mp._supports_context and 'forkserver' in mp.get_all_start_methods()): kwargs['multiprocessing_context'] = 'forkserver' train_dataset = datasets.CIFAR10('data-%d'%hvd.local_rank(), train=True, transform=transforms.Compose([ transforms.RandomCrop(32, padding=4), transforms.RandomHorizontalFlip(), transforms.ToTensor(), normalize, ]), download=True) val_dataset = datasets.CIFAR10('data-%d'%hvd.local_rank(), train=False,transform=transforms.Compose([ transforms.ToTensor(), normalize, ])) #Horovod Partition the training data train_sampler = torch.utils.data.distributed.DistributedSampler( train_dataset, num_replicas=hvd.size(), rank=hvd.rank()) val_sampler = torch.utils.data.distributed.DistributedSampler( val_dataset, num_replicas=hvd.size(), rank=hvd.rank()) train_loader = torch.utils.data.DataLoader( train_dataset,batch_size=args.batch_size,sampler=train_sampler,**kwargs) val_loader = torch.utils.data.DataLoader( val_dataset,batch_size=args.batch_size,sampler=val_sampler,**kwargs) # model = torch.nn.DataParallel(resnet.__dict__[args.arch]()) if args.arch in resnet.__dict__: model = resnet.__dict__[args.arch]() elif args.arch == 'alexnet': model = models.AlexNet() elif args.arch == 'vgg16': model = models.VGG16() if hvd.rank() == 0: numel = sum(p.numel() for p in model.parameters()) print('Total params: {:d}'.format(numel)) lr_scaler = hvd.size() if args.cuda: model.cuda() if args.use_adasum and hvd.nccl_built(): lr_scaler = hvd.local_size() # optionally resume from a checkpoint if args.resume: if os.path.isfile(args.resume): print("=> loading checkpoint '{}'".format(args.resume)) checkpoint = torch.load(args.resume) args.start_epoch = checkpoint['epoch'] best_prec1 = checkpoint['best_prec1'] model.load_state_dict(checkpoint['state_dict']) print("=> loaded checkpoint '{}' (epoch {})" .format(args.evaluate, checkpoint['epoch'])) else: print("=> no checkpoint found at '{}'".format(args.resume)) cudnn.benchmark = True # define loss function (criterion) and optimizer criterion = nn.CrossEntropyLoss().cuda() if args.half: model.half() criterion.half() base_optimizer = torch.optim.SGD(model.parameters(), args.lr * lr_scaler, momentum=args.momentum, weight_decay=args.weight_decay) # lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(base_optimizer, # milestones=[100, 150], last_epoch=args.start_epoch - 1) # Horovod: broadcast parameters & optimizer state. 
hvd.broadcast_parameters(model.state_dict(), root_rank=0) hvd.broadcast_optimizer_state(base_optimizer, root_rank=0) #Compression # compression = Allgather(MGCCompressor(0.05), ResidualMemory(), hvd.size()) # compression = Allgather(TernGradCompressor(), ResidualMemory(), hvd.size()) compression = Allreduce(NoneCompressor(), NoneMemory()) # compression = Allgather(DgcCompressor(0.01), ResidualMemory(), hvd.size()) # compression = Allgather(LowQSGDCompressor(), ResidualMemory(), hvd.size()) # Horovod: wrap optimizer with DistributedOptimizer. optimizer = hvd.DistributedOptimizer(base_optimizer, compression, named_parameters=model.named_parameters()) if hvd.rank() == 0: log.add_scalar('train/accuracy', 0., 0) log.add_scalar('test/accuracy', 0., 0) for epoch in range(args.start_epoch + 1, args.epochs + 1): adjust_learning_rate(optimizer, epoch, size=lr_scaler) if hvd.rank() == 0: print('current lr {:.5e}'.format(optimizer.param_groups[0]['lr'])) # train for one epoch train(train_loader, model, criterion, optimizer, epoch, log=log) # evaluate on validation set prec1, prec5 = validate(val_loader, model, criterion, epoch, log=log) # remember best prec@1 and save checkpoint best_prec1 = max(prec1, best_prec1) best_prec5 = max(prec5, best_prec5) if hvd.rank() == 0: print('Best Pred@1:{:.2f}%, Prec@5:{:.2f}%\n'.format(best_prec1, best_prec5)) # if epoch > 0 and epoch % args.save_every == 0: # save_checkpoint({ # 'epoch': epoch + 1, # 'state_dict': model.state_dict(), # 'best_prec1': best_prec1, # }, is_best, filename=os.path.join(args.save_dir, 'checkpoint.th')) # # save_checkpoint({ # 'state_dict': model.state_dict(), # 'best_prec1': best_prec1, # }, is_best, filename=os.path.join(args.save_dir, 'model.th')) if hvd.rank() == 0: log.close()
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=args.val_batch_size, sampler=val_sampler, **kwargs) # Set up standard VGG16 model. model = models.vgg16() # By default, Adasum doesn't need scaling up learning rate. # For sum/average with gradient Accumulation: scale learning rate by batches_per_allreduce lr_scaler = args.batches_per_allreduce * hvd.size() if not args.use_adasum else 1 if args.cuda: # Move model to GPU. model.cuda() # If using GPU Adasum allreduce, scale learning rate by local_size. if args.use_adasum and hvd.nccl_built(): lr_scaler = args.batches_per_allreduce * hvd.local_size() # Horovod: scale learning rate by the number of GPUs. optimizer = optim.SGD(model.parameters(), lr=(args.base_lr * lr_scaler), momentum=args.momentum, weight_decay=args.wd) # Horovod: (optional) compression algorithm. compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none # Horovod: wrap optimizer with DistributedOptimizer. optimizer = hvd.DistributedOptimizer( optimizer, named_parameters=model.named_parameters(), compression=compression,
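# The snippet above is cut off inside the DistributedOptimizer call; in
# comparable Horovod ImageNet examples the call usually continues with the
# gradient-accumulation and reduction-op arguments, roughly as sketched here
# (argument names follow the stock example, not necessarily this script):
optimizer = hvd.DistributedOptimizer(
    optimizer,
    named_parameters=model.named_parameters(),
    compression=compression,
    backward_passes_per_step=args.batches_per_allreduce,
    op=hvd.Adasum if args.use_adasum else hvd.Average)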
batch_size=test_batch_size, sampler=test_sampler, **kwargs)

model = Net()

##### HOROVOD #####
# By default, Adasum doesn't need scaling up learning rate.
lr_scaler = hvd.size() if not use_adasum else 1

if cuda:
    # Move model to GPU.
    model.cuda()
    # If using GPU Adasum allreduce, scale learning rate by local_size.
    ##### TODO:Need argument #####
    if use_adasum and hvd.nccl_built():
        lr_scaler = hvd.local_size()

# Horovod: scale learning rate by lr_scaler.
##### TODO:Need argument #####
'''
optimizer = optim.SGD(model.parameters(), lr=args.lr * lr_scaler,
                      momentum=args.momentum)'''
optimizer = optim.SGD(model.parameters(), lr=lr * lr_scaler,
                      momentum=momentum)

##### HOROVOD #####
# Horovod: broadcast parameters & optimizer state.
hvd.broadcast_parameters(model.state_dict(), root_rank=0)
hvd.broadcast_optimizer_state(optimizer, root_rank=0)
def train(args): hvd.init() print("Hello from local_rank {}/{}, rank {}/{}".format( hvd.local_rank(), hvd.local_size(), hvd.rank(), hvd.size())) verbose = hvd.rank() == 0 if verbose: print('Using PyTorch version:', torch.__version__) print('Horovod version: {}, CUDA: {}, ROCM: {}, NCCL: {}, MPI: {}'.format( hvd_version, hvd.cuda_built(), hvd.rocm_built(), hvd.nccl_built(), hvd.mpi_built())) print(torch.__config__.show()) cudnn.benchmark = True torch.cuda.set_device(hvd.local_rank()) world_size = hvd.size() # Set up standard model. if verbose: print('Using {} model'.format(args.model)) model = getattr(models, args.model)() model = model.cuda() # import torch.multiprocessing as mp # # # assert "forkserver" in mp.get_all_start_methods() # mp.set_start_method("forkserver") lr_scaler = hvd.size() criterion = nn.CrossEntropyLoss().cuda() optimizer = torch.optim.SGD(model.parameters(), 1e-4 * lr_scaler) optimizer = hvd.DistributedOptimizer(optimizer, named_parameters=model.named_parameters()) train_dataset = dataset_from_datadir(args.datadir, verbose) train_sampler = DistributedSampler(train_dataset, num_replicas=hvd.size(), rank=hvd.rank()) train_loader = DataLoader(dataset=train_dataset, batch_size=args.batchsize, shuffle=False, num_workers=args.workers, pin_memory=False, sampler=train_sampler, multiprocessing_context='forkserver') hvd.broadcast_parameters(model.state_dict(), root_rank=0) hvd.broadcast_optimizer_state(optimizer, root_rank=0) total_step = args.steps if args.steps is not None else len(train_loader) # For each block of printed steps last_start = datetime.now() last_images = 0 # For final average avg_images = 0 avg_start = None tot_steps = 0 for epoch in range(args.epochs): for i, (images, labels) in enumerate(train_loader): images = images.cuda(non_blocking=True) labels = labels.cuda(non_blocking=True) outputs = model(images) loss = criterion(outputs, labels) optimizer.zero_grad() loss.backward() optimizer.step() li = len(images) last_images += li tot_steps += 1 if tot_steps == args.warmup_steps: avg_start = datetime.now() elif tot_steps > args.warmup_steps: avg_images += li if (i + 1) % args.print_steps == 0 and verbose: now = datetime.now() last_secs = (now-last_start).total_seconds() print(f'Epoch [{epoch+1}/{args.epochs}], Step [{i+1}/{total_step}], ' f'Loss: {loss.item():.4f}, ' f'Images/sec: {last_images*world_size/last_secs:.2f} ' f'(last {args.print_steps} steps)') last_start = now last_images = 0 if args.steps is not None and i >= args.steps: break if verbose: dur = datetime.now() - avg_start print(f"Training completed in: {dur}") print(f"Images/sec: {avg_images*world_size/dur.total_seconds():.2f} " f"(average, skipping {args.warmup_steps} warmup steps)")
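# The throughput reported at the end of train() multiplies the local image
# count by the world size, which assumes every worker processed the same
# number of images. A hedged variant of that final block, all-reducing the
# actual per-worker counts instead (every rank must reach the allreduce, so it
# sits outside the `if verbose:` guard; hvd.allreduce averages by default):
dur = datetime.now() - avg_start
global_images = hvd.allreduce(torch.tensor(float(avg_images)),
                              name='avg_images').item() * hvd.size()
if verbose:
    print(f"Aggregated images/sec: {global_images / dur.total_seconds():.2f} "
          f"(average, skipping {args.warmup_steps} warmup steps)")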
def fit(self, input_data=None, input_labels=None, loss="", opt=""):
    if self.use_model:  # use_model
        # Check Input Data
        if input_data is None or input_labels is None:
            return
        if self.model_onnx:
            print("Cannot use onnx type to fit model")
            return

        # Make TensorDataset and DataLoader for PyTorch
        train_dataset = TensorDataset(input_data, input_labels)

        # Handling Input of Loss Function
        loss_func = F.nll_loss
        if loss == "nll_loss":
            loss_func = F.nll_loss
        elif loss == "mse_loss":
            loss_func = F.mse_loss
        elif loss == "cross_entropy":
            loss_func = F.cross_entropy
        elif loss == "l1_loss":
            loss_func = F.l1_loss

        if self.cuda:
            ##### HOROVOD #####
            train_sampler = torch.utils.data.distributed.DistributedSampler(
                train_dataset, num_replicas=hvd.size(), rank=hvd.rank())
            kwargs = {'num_workers': 1, 'pin_memory': True}
            # When supported, use 'forkserver' to spawn dataloader workers instead of 'fork' to prevent
            # issues with Infiniband implementations that are not fork-safe
            if (kwargs.get('num_workers', 0) > 0 and hasattr(mp, '_supports_context')
                    and mp._supports_context
                    and 'forkserver' in mp.get_all_start_methods()):
                kwargs['multiprocessing_context'] = 'forkserver'
            train_loader = torch.utils.data.DataLoader(train_dataset,
                                                       batch_size=self.batch_size,
                                                       sampler=train_sampler,
                                                       **kwargs)

            # Set Optimizer
            if self.use_optimizer:
                optimizer = self.optimizer
            else:
                lr_scaler = 1  # no scaling unless GPU Adasum with NCCL bumps it below
                if args.use_adasum and hvd.nccl_built():
                    lr_scaler = hvd.local_size()
                if opt == "SGD":
                    optimizer = optim.SGD(self.model.parameters(),
                                          lr=self.lr * lr_scaler,
                                          momentum=self.momentum)
                else:
                    optimizer = optim.SGD(self.model.parameters(),
                                          lr=self.lr * lr_scaler,
                                          momentum=self.momentum)

            # Horovod: broadcast parameters & optimizer state.
            hvd.broadcast_parameters(self.model.state_dict(), root_rank=0)
            hvd.broadcast_optimizer_state(optimizer, root_rank=0)

            # Horovod: (optional) compression algorithm.
            #compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none
            compression = hvd.Compression.none

            # Horovod: wrap optimizer with DistributedOptimizer.
            optimizer = hvd.DistributedOptimizer(optimizer,
                                                 named_parameters=self.model.named_parameters(),
                                                 compression=compression,
                                                 op=hvd.Average)
                                                 #op=hvd.Adasum if args.use_adasum else hvd.Average)
        else:
            train_loader = DataLoader(train_dataset, batch_size=self.batch_size)
            if self.use_optimizer:
                optimizer = self.optimizer
            else:
                if opt == "SGD":
                    optimizer = optim.SGD(self.model.parameters(), lr=self.lr, momentum=self.momentum)
                else:
                    optimizer = optim.SGD(self.model.parameters(), lr=self.lr, momentum=self.momentum)

        if self.debug:
            # Print model's state_dict
            print("Model's state_dict:")
            for param_tensor in self.model.state_dict():
                print(param_tensor, "\t", self.model.state_dict()[param_tensor].size())
            # Print optimizer's state_dict
            print("Optimizer's state_dict:")
            for var_name in optimizer.state_dict():
                print(var_name, "\t", optimizer.state_dict()[var_name])

        losses = []
        nums = []
        accs = []
        for epoch in range(self.epochs):
            self.model.train()
            # Horovod: set epoch to sampler for shuffling.
            if self.cuda:
                train_sampler.set_epoch(epoch)
            for batch_idx, (data, target) in enumerate(train_loader):
                if self.cuda:
                    data, target = data.cuda(), target.cuda()
                optimizer.zero_grad()
                output = self.model(data)
                loss = loss_func(output, target)
                acc = self.accuracy(output, target)
                loss.backward()
                optimizer.step()
                if batch_idx % self.log_interval == 0:
                    if self.cuda:
                        if hvd.rank() == 0:
                            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}\tAccuracy: {}'.format(
                                epoch + 1, batch_idx * len(data), len(train_sampler),
                                100. * batch_idx / len(train_loader), loss.item(), acc * 100))
                    else:
                        print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}\tAccuracy: {}'.format(
                            epoch + 1, batch_idx * len(data), len(train_loader.dataset),
                            100. * batch_idx / len(train_loader), loss.item(), acc * 100))
def pretrain( run_name: str, # # Data train_filepath: str = DEFAULT_CSNJS_TRAIN_FILEPATH, spm_filepath: str = DEFAULT_SPM_UNIGRAM_FILEPATH, num_workers=1, limit_dataset_size=-1, max_length=1024, subword_regularization_alpha: float = 0, program_mode="contrastive", loss_mode="infonce", # infonce, mlm, or hybrid min_alternatives=1, # # Model resume_path: str = "", encoder_type: str = "transformer", lstm_project_mode: str = "hidden", n_encoder_layers: int = 6, d_model: int = 512, n_head: int = 8, # # Optimization num_epochs: int = 100, save_every: int = 1, batch_size: int = 256, lr: float = 8e-4, weight_decay: float = 0, adam_betas=(0.9, 0.98), warmup_steps: int = 5000, num_steps: int = 600000, # # Horovod use_adasum: bool = False, fp16_allreduce: bool = False, gradient_predivide_factor: float = 1.0, # # Computational use_cuda: bool = True, seed: int = 0, ): hvd.init() logger.info("L:", n_encoder_layers, type(n_encoder_layers)) logger.info("H:", d_model, type(d_model)) logger.info("A:", n_head, type(n_head)) run_name = str(run_name) # support numerical run ids slurm_job_id = os.environ.get("SLURM_JOB_ID") slurm_job_hostname = os.environ.get("SLURM_JOB_NODELIST") config = locals() logger.info(f"Config = \n{config}") logger.info("Training configuration: {}".format(config)) logger.info( f"CUDA_VISIBLE_DEVICES = '{os.environ.get('CUDA_VISIBLE_DEVICES')}'") logger.info(f"CUDA_DEVICE_ORDER = '{os.environ.get('CUDA_DEVICE_ORDER')}'") assert program_mode in ["contrastive", "identity", "augmentation"] assert loss_mode == "infonce" or loss_mode == "mlm" or loss_mode == "hybrid" assert not (program_mode == "contrastive" and loss_mode == "mlm") assert not (program_mode != "contrastive" and (loss_mode == "hybrid" or loss_mode == "infonce")) assert not use_cuda or torch.cuda.is_available() torch.manual_seed(seed) np.random.seed(seed) random.seed(seed) run_dir = RUN_DIR / "{}_{}".format(run_name, int(time.time())) run_dir.mkdir(exist_ok=True, parents=True) config["run_dir"] = str(run_dir.resolve()) logger.add(str((run_dir / "train.log").resolve())) logger.info(f"Saving logs, model checkpoints to {run_dir}") # Create training dataset and dataloader assert train_filepath.endswith(".pickle") or train_filepath.endswith(".gz") # Setup distributed gpu = hvd.local_rank() ngpus_per_node = 1 chief_node = gpu == 0 assert gpu is not None if chief_node: if config["loss_mode"] == "mlm": project = "bert-pretrain" elif config["loss_mode"] == "infonce": project = "moco-pretrain" elif config["loss_mode"] == "hybrid": project = "hybrid" wandb.init(name=config["run_name"], config=config, job_type="training", project=project, entity="ml4code") logger.info("Use GPU: {} for training".format(gpu)) torch.cuda.set_device(gpu) # Horovod: limit # of CPU threads to be used per worker. 
torch.set_num_threads(1) kwargs = {"num_workers": 1, "pin_memory": True} # When supported, use 'forkserver' to spawn dataloader workers instead of 'fork' to prevent # issues with Infiniband implementations that are not fork-safe if (kwargs.get("num_workers", 0) > 0 and hasattr(mp, "_supports_context") and mp._supports_context and "forkserver" in mp.get_all_start_methods()): kwargs["multiprocessing_context"] = "forkserver" sp = spm.SentencePieceProcessor() sp.Load(config["spm_filepath"]) pad_id = sp.PieceToId("[PAD]") logger.info("pad_id {}", pad_id) assert pad_id == 0 # hard coded in pad_collate mask_id = sp.PieceToId("[MASK]") # Create model if config["loss_mode"] == "infonce": # TODO(ajay): Support n_head argument, check how d_model is being used (why not in encoder config dict?) model = CodeMoCo( sp.GetPieceSize(), pad_id=pad_id, d_model=config["d_model"], encoder_config=dict( encoder_type=config["encoder_type"], lstm_project_mode=config["lstm_project_mode"], n_encoder_layers=config["n_encoder_layers"], ), ) logger.info( f"Created CodeMoCo model with {count_parameters(model)} params") elif config["loss_mode"] == "mlm": model = CodeMLM( sp.GetPieceSize(), pad_id=pad_id, encoder_type=config["encoder_type"], n_encoder_layers=config["n_encoder_layers"], d_model=config["d_model"], n_head=config["n_head"], d_ff=4 * config["d_model"], ) logger.info( f"Created CodeMLM model with {count_parameters(model)} params") elif config["loss_mode"] == "hybrid": model = CodeContrastiveMLM( sp.GetPieceSize(), pad_id=pad_id, n_encoder_layers=config["n_encoder_layers"], d_model=config["d_model"], n_head=config["n_head"], d_ff=4 * config["d_model"], use_horovod=True, ) logger.info( f"Created CodeContrastiveMLM model with {count_parameters(model)} params" ) else: raise ValueError(f"Bad loss mode {config['loss_mode']}") assert config["use_cuda"] model.cuda() # When using a single GPU per process and per # DistributedDataParallel, we need to divide the batch size # ourselves based on the total number of GPUs we have # config["batch_size"] = int(config["batch_size"] / ngpus_per_node) # config["num_workers"] = int((config["num_workers"] + ngpus_per_node - 1) / ngpus_per_node) # model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[gpu]) # define optimizer # By default, Adasum doesn't need scaling up learning rate. lr_scaler = hvd.size() if not config["use_adasum"] else 1 # If using GPU Adasum allreduce, scale learning rate by local_size. if config["use_adasum"] and hvd.nccl_built(): lr_scaler = hvd.local_size() # Horovod: scale learning rate by lr_scaler. optimizer = torch.optim.Adam(model.parameters(), lr=config["lr"] * lr_scaler, betas=config["adam_betas"], eps=1e-6, weight_decay=config["weight_decay"]) sched = get_linear_schedule_with_warmup(optimizer, config["warmup_steps"], config["num_steps"]) # Horovod: broadcast parameters & optimizer state. hvd.broadcast_parameters(model.state_dict(), root_rank=0) hvd.broadcast_optimizer_state(optimizer, root_rank=0) # Horovod: (optional) compression algorithm. compression = hvd.Compression.fp16 if config[ "fp16_allreduce"] else hvd.Compression.none # Horovod: wrap optimizer with DistributedOptimizer. 
optimizer = hvd.DistributedOptimizer( optimizer, named_parameters=model.named_parameters(), compression=compression, op=hvd.Adasum if config["use_adasum"] else hvd.Average, gradient_predivide_factor=config["gradient_predivide_factor"], ) # Load checkpoint if config["resume_path"]: logger.info(f"Loading parameters from {config['resume_path']}") # configure map_location properly map_location = {"cuda:%d" % 0: "cuda:%d" % hvd.rank()} checkpoint = torch.load(config["resume_path"], map_location=map_location) model.load_state_dict(checkpoint["model_state_dict"]) optimizer.load_state_dict(checkpoint["optimizer_state_dict"]) start_epoch = checkpoint["epoch"] + 1 start_global_step = checkpoint["global_step"] else: start_epoch = 1 start_global_step = 0 # Setup data train_dataset = PrecomputedDataset( config["train_filepath"], min_alternatives=config["min_alternatives"], program_mode=config["program_mode"], limit_size=config["limit_dataset_size"], sp=sp, subword_regularization_alpha=config["subword_regularization_alpha"], max_length=config["max_length"], ) train_sampler = torch.utils.data.distributed.DistributedSampler( train_dataset, num_replicas=hvd.size(), rank=hvd.rank()) train_loader = torch.utils.data.DataLoader( train_dataset, batch_size=config["batch_size"], shuffle=False, collate_fn=pad_collate_contrastive if config["program_mode"] == "contrastive" else pad_collate, drop_last=True, sampler=train_sampler, **kwargs, ) # Train global_step = 0 while global_step < start_global_step: sched.step() global_step += 1 for epoch in tqdm.trange(start_epoch, config["num_epochs"] + 1, desc="training", unit="epoch", leave=False): logger.info(f"Starting epoch {epoch}\n") train_sampler.set_epoch(epoch) model.train() pbar = tqdm.tqdm(train_loader, desc=f"epoch {epoch}") for batch in pbar: optimizer.zero_grad() if config["loss_mode"] == "infonce": train_metrics = training_step(model, batch, use_cuda=config["use_cuda"]) elif config["loss_mode"] == "mlm": # replace tokens randomly with tokens from _ (8) train_metrics = training_step_mlm(sp, model, batch, pad_id=pad_id, mask_id=mask_id, vocab_start_idx=8, vocab_end_idx=7999, use_cuda=config["use_cuda"]) elif config["loss_mode"] == "hybrid": train_metrics = training_step_hybrid( sp, model, batch, mask_id=mask_id, pad_id=pad_id, vocab_start_idx=0, vocab_end_idx=7999, use_cuda=config["use_cuda"]) else: raise ValueError("Bad loss type") loss = train_metrics["loss"] loss.backward() optimizer.step() sched.step() global_step += 1 pbar.set_description( f"epoch {epoch} gpu {gpu} step {global_step} loss {loss.item():.4f}" ) if chief_node: wandb.log(dict(lr=sched.get_last_lr()[0])) wandb.log(dict(epoch=epoch, **train_metrics["log"]), step=global_step) # Save checkpoint if config["save_every"] and global_step % config[ "save_every"] == 0: checkpoint = { "model_state_dict": model.state_dict(), "optimizer_state_dict": optimizer.state_dict(), "epoch": epoch, "global_step": global_step, "config": config, } model_file = os.path.join( config["run_dir"], f"ckpt_pretrain_ep{epoch:04d}_step{global_step:07d}.pth" ) logger.info(f"Saving checkpoint to {model_file}...") torch.save(checkpoint, model_file) wandb.save(str(model_file)) logger.info("Done.")
def hvd_param_scaling(self):
    if hvd.nccl_built():
        self.lr_scaler = hvd.local_size()
        print('Rescale lr = {} * lr'.format(self.lr_scaler))
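# Presumably the scaler computed above is consumed when the optimizer is built;
# a sketch of that follow-up method (every attribute here other than lr_scaler
# is an assumption about the surrounding class):
def build_optimizer(self):
    self.lr_scaler = 1                     # default when Horovod lacks NCCL
    self.hvd_param_scaling()
    return torch.optim.SGD(self.model.parameters(),
                           lr=self.base_lr * self.lr_scaler,
                           momentum=0.9)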
def main(_): """ Basic Configurations """ ssl_set_unverified_context() FLAGS.CUDA = FLAGS.CUDA and torch.cuda.is_available() allreduce_batch_size = FLAGS.BATCH_SIZE * FLAGS.BATCHES_PER_ALLREDUCE hvd.init() np.random.seed(FLAGS.SEED) torch.manual_seed(FLAGS.SEED) if FLAGS.CUDA: # Horovod: pin GPU to local rank. torch.cuda.set_device(hvd.local_rank()) torch.cuda.manual_seed(FLAGS.SEED) cudnn.benchmark = True # Horovod: print logs on the first worker. verbose = 1 if hvd.rank() == 0 else 0 # Select subdirectory as datetime if flagfile is not specified subdir = datetime.datetime.now().strftime('%Y%m%d-%H%M%S') # If sys.argv has flagfile argument, set subdir as filename of flagfile parser = argparse.ArgumentParser() parser.add_argument('--flagfile') for flag in FLAGS.flag_values_dict().keys(): if flag.isupper(): parser.add_argument('--' + flag) args = parser.parse_args() if args.flagfile is not None: flagfile = args.flagfile subdir = os.path.splitext(os.path.basename(flagfile))[0] subdir = os.path.join(subdir, '-'.join(FLAGS.BLOCK_ARGS)) script_name = [os.path.splitext(os.path.basename(arg))[0] for arg in sys.argv if arg.endswith('.py')] if len(script_name) > 0: subdir = subdir.replace('train', script_name[0]) # Horovod: write TensorBoard logs on first worker. if hvd.rank() == 0: fileroot = get_real_path(FLAGS.TENSORBOARD_DIR) train_tensorboard_dir = os.path.join(fileroot, subdir, 'train') valid_tensorboard_dir = os.path.join(fileroot, subdir, 'valid') train_summary_writer = tensorboard.SummaryWriter(train_tensorboard_dir) valid_summary_writer = tensorboard.SummaryWriter(valid_tensorboard_dir) """ Prepare Dataset """ # Horovod: limit # of CPU threads to be used per worker. torch.set_num_threads(FLAGS.NUM_THREADS) kwargs = {'num_workers': FLAGS.NUM_WORKERS, 'pin_memory': True} if FLAGS.CUDA else {} dataset_module = 'lib.data.datasets.' + FLAGS.DATASET_NAME.lower() dataset = importlib.import_module(dataset_module).__getattribute__(FLAGS.DATASET_NAME) train_dataset = dataset('train', data_dir=FLAGS.DATASET_DIR, mean=FLAGS.DATA_MEAN, std=FLAGS.DATA_STD) valid_dataset = dataset('valid', data_dir=FLAGS.DATASET_DIR, mean=FLAGS.DATA_MEAN, std=FLAGS.DATA_STD) # Horovod: use DistributedSampler to partition data among workers. Manually specify # `num_replicas=hvd.size()` and `rank=hvd.rank()`. train_sampler = distributed.DistributedSampler(train_dataset, num_replicas=hvd.size(), rank=hvd.rank()) train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=allreduce_batch_size, sampler=train_sampler, **kwargs) valid_sampler = distributed.DistributedSampler(valid_dataset, num_replicas=hvd.size(), rank=hvd.rank()) valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=FLAGS.VALID_BATCH_SIZE, sampler=valid_sampler, **kwargs) """ Build Model """ # Set up a model. model_module = 'models.' + FLAGS.MODEL_NAME.lower() net = importlib.import_module(model_module).__getattribute__(FLAGS.MODEL_NAME) model = net(num_classes=len(train_dataset.classes), block_args=FLAGS.BLOCK_ARGS) # By default, Adasum doesn't need scaling up learning rate. # For sum/average with gradient Accumulation: scale learning rate by batches_per_allreduce lr_scaler = FLAGS.BATCHES_PER_ALLREDUCE * hvd.size() if not FLAGS.USE_ADASUM else 1 if FLAGS.CUDA: # Move model to GPU. model.cuda() # If using GPU Adasum allreduce, scale learning rate by local_size. if FLAGS.USE_ADASUM and hvd.nccl_built(): lr_scaler = FLAGS.BATCHES_PER_ALLREDUCE * hvd.local_size() """ Optimizer """ # Horovod: scale learning rate by the number of GPUs. 
optimizer = optim.SGD(model.parameters(), lr=(FLAGS.BASE_LR * lr_scaler), momentum=FLAGS.MOMENTUM, weight_decay=FLAGS.WEIGHT_DECAY, nesterov=FLAGS.NESTEROV) # Horovod: (optional) compression algorithm. compression = hvd.Compression.fp16 if FLAGS.FP16_ALLREDUCE else hvd.Compression.none # Horovod: wrap optimizer with DistributedOptimizer. optimizer = hvd.DistributedOptimizer( optimizer, named_parameters=model.named_parameters(), compression=compression, backward_passes_per_step=FLAGS.BATCHES_PER_ALLREDUCE) # TODO: hvd.Adasum is not supported yet(0.18.2) #backward_passes_per_step = FLAGS.BATCHES_PER_ALLREDUCE, #op = hvd.Adasum if FLAGS.USE_ADASUM else hvd.Average) """ Restore & Broadcast """ # If set > 0, will resume training from a given checkpoint. resume_from_epoch = 0 fileroot = get_real_path(FLAGS.CHECKPOINT_DIR) for try_epoch in range(FLAGS.EPOCHS, 0, -1): filename = FLAGS.CHECKPOINT_FORMAT.format(epoch=try_epoch) filepath = os.path.join(fileroot, subdir, filename) if os.path.exists(filepath): resume_from_epoch = try_epoch break # Horovod: broadcast resume_from_epoch from rank 0 (which will have # checkpoints) to other ranks. resume_from_epoch = hvd.broadcast(torch.tensor(resume_from_epoch), root_rank=0, name='resume_from_epoch').item() # Restore from a previous checkpoint, if initial_epoch is specified. # Horovod: restore on the first worker which will broadcast weights to other workers. if resume_from_epoch > 0 and hvd.rank() == 0: fileroot = get_real_path(FLAGS.CHECKPOINT_DIR) filename = FLAGS.CHECKPOINT_FORMAT.format(epoch=resume_from_epoch) filepath = os.path.join(fileroot, subdir, filename) checkpoint = torch.load(filepath) model.load_state_dict(checkpoint['model']) optimizer.load_state_dict(checkpoint['optimizer']) # Horovod: broadcast parameters & optimizer state. hvd.broadcast_parameters(model.state_dict(), root_rank=0) hvd.broadcast_optimizer_state(optimizer, root_rank=0) """ Training Operations """ def train(epoch): model.train() lr = adjust_learning_rate(FLAGS, optimizer, epoch) train_sampler.set_epoch(epoch) train_loss = Metric('train_loss') train_accuracy = Metric('train_accuracy') with tqdm.tqdm(total=len(train_loader), desc='Train Epoch #{}'.format(epoch + 1), disable=not verbose) as t: for batch_idx, (data, target) in enumerate(train_loader): if FLAGS.CUDA: data, target = data.cuda(), target.cuda() optimizer.zero_grad() # Split data into sub-batches of size batch_size for i in range(0, len(data), FLAGS.BATCH_SIZE): data_batch = data[i:i + FLAGS.BATCH_SIZE] target_batch = target[i:i + FLAGS.BATCH_SIZE] output = model(data_batch) train_accuracy.update(accuracy(output, target_batch)) loss = F.cross_entropy(output, target_batch) train_loss.update(loss) # Average gradients among sub-batches loss.div_(math.ceil(float(len(data)) / FLAGS.BATCH_SIZE)) loss.backward() if i == 0 and hvd.rank() == 0: train_summary_writer.add_image("input", transforms.denormalize(data[0], mean=FLAGS.DATA_MEAN, std=FLAGS.DATA_STD), epoch) # Gradient is applied across all ranks optimizer.step() t.set_postfix({'loss': train_loss.avg.item(), 'accuracy': 100. 
* train_accuracy.avg.item(), 'lr': lr}) t.update(1) if hvd.rank() == 0: train_summary_writer.add_scalar('info/lr', lr, epoch) train_summary_writer.add_scalar('info/loss', train_loss.avg, epoch) train_summary_writer.add_scalar('metric/accuracy', train_accuracy.avg, epoch) def validate(epoch): model.eval() valid_loss = Metric('valid_loss') valid_accuracy = Metric('valid_accuracy') with tqdm.tqdm(total=len(valid_loader), desc='Validate Epoch #{}'.format(epoch + 1), disable=not verbose) as t: with torch.no_grad(): for data, target in valid_loader: if FLAGS.CUDA: data, target = data.cuda(), target.cuda() output = model(data) valid_loss.update(F.cross_entropy(output, target)) valid_accuracy.update(accuracy(output, target)) t.set_postfix({'loss': valid_loss.avg.item(), 'accuracy': 100. * valid_accuracy.avg.item()}) t.update(1) if hvd.rank() == 0: valid_summary_writer.add_scalar('info/loss', valid_loss.avg, epoch) valid_summary_writer.add_scalar('metric/accuracy', valid_accuracy.avg, epoch) def save_checkpoint(epoch): if hvd.rank() == 0: fileroot = get_real_path(FLAGS.CHECKPOINT_DIR) filename = FLAGS.CHECKPOINT_FORMAT.format(epoch=epoch + 1) filepath = os.path.join(fileroot, subdir, filename) if not os.path.exists(os.path.dirname(filepath)): os.makedirs(os.path.dirname(filepath)) state = { 'model': model.state_dict(), 'optimizer': optimizer.state_dict(), } torch.save(state, filepath) """ Training Loop """ if hvd.rank() == 0: print(model) print(flags_to_string(FLAGS)) for epoch in range(resume_from_epoch, FLAGS.EPOCHS): train(epoch) validate(epoch) save_checkpoint(epoch)
def train_fn(args): # Horovod: initialize library. hvd.init() torch.manual_seed(args.seed) args.cuda = not args.no_cuda and torch.cuda.is_available() print("hvd rank:", hvd.rank(), " hvd local rank:", hvd.local_rank(), " using cuda: ", args.cuda) if args.cuda: # Horovod: pin GPU to local rank. torch.cuda.set_device(hvd.local_rank()) torch.cuda.manual_seed(args.seed) # Horovod: limit # of CPU threads to be used per worker. torch.set_num_threads(1) kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {} data_dir = args.data_dir or './data' with FileLock(os.path.expanduser("~/.horovod_lock")): train_dataset = \ datasets.MNIST(data_dir, train=True, download=True, transform=transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,)) ])) # Horovod: use DistributedSampler to partition the training data. train_sampler = torch.utils.data.distributed.DistributedSampler( train_dataset, num_replicas=hvd.size(), rank=hvd.rank()) train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=args.batch_size, sampler=train_sampler, **kwargs) transformations = transforms.Compose( [transforms.ToTensor(), transforms.Normalize((0.1307, ), (0.3081, ))]) test_dataset = datasets.MNIST(data_dir, train=False, transform=transformations) # Horovod: use DistributedSampler to partition the test data. test_sampler = torch.utils.data.distributed.DistributedSampler( test_dataset, num_replicas=hvd.size(), rank=hvd.rank()) test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=args.test_batch_size, sampler=test_sampler, **kwargs) model = Net() # By default, Adasum doesn't need scaling up learning rate. lr_scaler = hvd.size() if not args.use_adasum else 1 if args.cuda: # Move model to GPU. model.cuda() # If using GPU Adasum allreduce, scale learning rate by local_size. if args.use_adasum and hvd.nccl_built(): lr_scaler = hvd.local_size() # Horovod: scale learning rate by lr_scaler. optimizer = optim.SGD(model.parameters(), lr=args.lr * lr_scaler, momentum=args.momentum) # Horovod: (optional) compression algorithm. compression = (hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none) def train(epoch): model.train() train_sampler.set_epoch(epoch) for batch, (data, target) in enumerate(train_loader): if args.cuda: data, target = data.cuda(), target.cuda() optimizer.zero_grad() output = model(data) loss = F.nll_loss(output, target) loss.backward() optimizer.step() if batch % args.log_interval == 0: # Horovod: use train_sampler to determine # the number of examples in this worker's partition. print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format( epoch, batch * len(data), len(train_sampler), 100.0 * batch / len(train_loader), loss.item())) def test(): model.eval() test_loss = 0. test_accuracy = 0. for data, target in test_loader: if args.cuda: data, target = data.cuda(), target.cuda() output = model(data) # sum up batch loss test_loss += F.nll_loss(output, target, size_average=False).item() # get the index of the max log-probability pred = output.data.max(1, keepdim=True)[1] test_accuracy += pred.eq( target.data.view_as(pred)).cpu().float().sum() # Horovod: use test_sampler to determine the number of examples in # this worker's partition. test_loss /= len(test_sampler) test_accuracy /= len(test_sampler) # Horovod: average metric values across workers. test_loss = metric_average(test_loss, 'avg_loss') test_accuracy = metric_average(test_accuracy, 'avg_accuracy') # Horovod: print output only on first rank. 
if hvd.rank() == 0: print( '\nTest set: Average loss: {:.4f}, Accuracy: {:.2f}%\n'.format( test_loss, 100. * test_accuracy)) # Horovod: wrap optimizer with DistributedOptimizer. optimizer = hvd.DistributedOptimizer( optimizer, named_parameters=model.named_parameters(), compression=compression, op=hvd.Adasum if args.use_adasum else hvd.Average) for epoch in range(1, args.epochs + 1): train(epoch) test()
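# metric_average is called by test() above but not defined in this snippet; in
# the stock Horovod MNIST example (and the other MNIST snippets in this
# collection) it is a thin wrapper over hvd.allreduce, roughly:
def metric_average(val, name):
    tensor = torch.tensor(val)
    avg_tensor = hvd.allreduce(tensor, name=name)  # averages across workers by default
    return avg_tensor.item()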
def hvd_param_scaling(self):
    if hvd.nccl_built():
        # self.batch_size = int(self.batch_size/hvd.local_size())
        self.lr_scaler = 1.0
def pytorch_mnist_example(): class Net(torch.nn.Module): def __init__(self): super(Net, self).__init__() self.conv1 = torch.nn.Conv2d(1, 10, kernel_size=5) self.conv2 = torch.nn.Conv2d(10, 20, kernel_size=5) self.conv2_drop = torch.nn.Dropout2d() self.fc1 = torch.nn.Linear(320, 50) self.fc2 = torch.nn.Linear(50, 10) def forward(self, x): x = torch.nn.functional.relu( torch.nn.functional.max_pool2d(self.conv1(x), 2)) x = torch.nn.functional.relu( torch.nn.functional.max_pool2d(self.conv2_drop(self.conv2(x)), 2)) x = x.view(-1, 320) x = torch.nn.functional.relu(self.fc1(x)) x = torch.nn.functional.dropout(x, training=self.training) x = self.fc2(x) return torch.nn.functional.log_softmax(x) def train(epoch, is_cuda, log_interval): model.train() # Horovod: set epoch to sampler for shuffling. train_sampler.set_epoch(epoch) for batch_idx, (data, target) in enumerate(train_loader): if is_cuda: data, target = data.cuda(), target.cuda() optimizer.zero_grad() output = model(data) loss = torch.nn.functional.nll_loss(output, target) loss.backward() optimizer.step() if batch_idx % log_interval == 0: # Horovod: use train_sampler to determine the number of examples in # this worker's partition. print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format( epoch, batch_idx * len(data), len(train_sampler), 100. * batch_idx / len(train_loader), loss.item())) def metric_average(val, name): tensor = torch.tensor(val) avg_tensor = hvd.allreduce(tensor, name=name) return avg_tensor.item() def test(is_cuda): model.eval() test_loss = 0. test_accuracy = 0. for data, target in test_loader: if is_cuda: data, target = data.cuda(), target.cuda() output = model(data) # sum up batch loss test_loss += torch.nn.functional.nll_loss( output, target, size_average=False).item() # get the index of the max log-probability pred = output.data.max(1, keepdim=True)[1] test_accuracy += pred.eq( target.data.view_as(pred)).cpu().float().sum() # Horovod: use test_sampler to determine the number of examples in # this worker's partition. test_loss /= len(test_sampler) test_accuracy /= len(test_sampler) # Horovod: average metric values across workers. test_loss = metric_average(test_loss, 'avg_loss') test_accuracy = metric_average(test_accuracy, 'avg_accuracy') # Horovod: print output only on first rank. if hvd.rank() == 0: print( '\nTest set: Average loss: {:.4f}, Accuracy: {:.2f}%\n'.format( test_loss, 100. * test_accuracy)) batch_size = 64 test_batch_size = 1000 epochs = 10 lr = 0.01 momentum = 0.5 random_seed = 42 log_interval = 10 fp16_allreduce = False use_adasum = False gradient_predivide_factor = 1.0 data_dir = './data' is_cuda = torch.cuda.is_available() # Horovod: initialize library. hvd.init() torch.manual_seed(random_seed) if is_cuda: # Horovod: pin GPU to local rank. torch.cuda.set_device(hvd.local_rank()) torch.cuda.manual_seed(random_seed) # Horovod: limit # of CPU threads to be used per worker. torch.set_num_threads(1) kwargs = {'num_workers': 1, 'pin_memory': True} if is_cuda else {} # When supported, use 'forkserver' to spawn dataloader workers instead of 'fork' to prevent issues with Infiniband implementations that are not fork-safe. 
if (kwargs.get('num_workers', 0) > 0 and hasattr(torch.multiprocessing, '_supports_context') and torch.multiprocessing._supports_context and 'forkserver' in torch.multiprocessing.get_all_start_methods()): kwargs['multiprocessing_context'] = 'forkserver' with FileLock(os.path.expanduser('~/.horovod_lock')): train_dataset = torchvision.datasets.MNIST( data_dir, train=True, download=True, transform=torchvision.transforms.Compose([ torchvision.transforms.ToTensor(), torchvision.transforms.Normalize((0.1307, ), (0.3081, )) ])) # Horovod: use DistributedSampler to partition the training data. train_sampler = torch.utils.data.distributed.DistributedSampler( train_dataset, num_replicas=hvd.size(), rank=hvd.rank()) train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, sampler=train_sampler, **kwargs) test_dataset = torchvision.datasets.MNIST( data_dir, train=False, download=True, transform=torchvision.transforms.Compose([ torchvision.transforms.ToTensor(), torchvision.transforms.Normalize((0.1307, ), (0.3081, )) ])) # Horovod: use DistributedSampler to partition the test data. test_sampler = torch.utils.data.distributed.DistributedSampler( test_dataset, num_replicas=hvd.size(), rank=hvd.rank()) test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=test_batch_size, sampler=test_sampler, **kwargs) model = Net() # By default, Adasum doesn't need scaling up learning rate. lr_scaler = hvd.size() if not use_adasum else 1 if is_cuda: # Move model to GPU. model.cuda() # If using GPU Adasum allreduce, scale learning rate by local_size. if use_adasum and hvd.nccl_built(): lr_scaler = hvd.local_size() # Horovod: scale learning rate by lr_scaler. optimizer = torch.optim.SGD(model.parameters(), lr=lr * lr_scaler, momentum=momentum) # Horovod: broadcast parameters & optimizer state. hvd.broadcast_parameters(model.state_dict(), root_rank=0) hvd.broadcast_optimizer_state(optimizer, root_rank=0) # Horovod: (optional) compression algorithm. compression = hvd.Compression.fp16 if fp16_allreduce else hvd.Compression.none # Horovod: wrap optimizer with DistributedOptimizer. optimizer = hvd.DistributedOptimizer( optimizer, named_parameters=model.named_parameters(), compression=compression, op=hvd.Adasum if use_adasum else hvd.Average, gradient_predivide_factor=gradient_predivide_factor) for epoch in range(1, epochs + 1): train(epoch, is_cuda, log_interval) test(is_cuda)
def main(args): def train_mixed_precision(epoch, scaler): model.train() # Horovod: set epoch to sampler for shuffling. train_sampler.set_epoch(epoch) for batch_idx, (data, target) in enumerate(train_loader): if args.cuda: data, target = data.cuda(), target.cuda() optimizer.zero_grad() with torch.cuda.amp.autocast(): output = model(data) loss = F.nll_loss(output, target) scaler.scale(loss).backward() # Make sure all async allreduces are done optimizer.synchronize() # In-place unscaling of all gradients before weights update scaler.unscale_(optimizer) with optimizer.skip_synchronize(): scaler.step(optimizer) # Update scaler in case of overflow/underflow scaler.update() if batch_idx % args.log_interval == 0: # Horovod: use train_sampler to determine the number of examples in # this worker's partition. print( 'Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}\tLoss Scale: {}' .format(epoch, batch_idx * len(data), len(train_sampler), 100. * batch_idx / len(train_loader), loss.item(), scaler.get_scale())) def train_epoch(epoch): model.train() # Horovod: set epoch to sampler for shuffling. train_sampler.set_epoch(epoch) for batch_idx, (data, target) in enumerate(train_loader): if args.cuda: data, target = data.cuda(), target.cuda() optimizer.zero_grad() output = model(data) loss = F.nll_loss(output, target) loss.backward() optimizer.step() if batch_idx % args.log_interval == 0: # Horovod: use train_sampler to determine the number of examples in # this worker's partition. print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format( epoch, batch_idx * len(data), len(train_sampler), 100. * batch_idx / len(train_loader), loss.item())) def metric_average(val, name): tensor = torch.tensor(val) avg_tensor = hvd.allreduce(tensor, name=name) return avg_tensor.item() def test(): model.eval() test_loss = 0. test_accuracy = 0. for data, target in test_loader: if args.cuda: data, target = data.cuda(), target.cuda() output = model(data) # sum up batch loss test_loss += F.nll_loss(output, target, size_average=False).item() # get the index of the max log-probability pred = output.data.max(1, keepdim=True)[1] test_accuracy += pred.eq( target.data.view_as(pred)).cpu().float().sum() # Horovod: use test_sampler to determine the number of examples in # this worker's partition. test_loss /= len(test_sampler) test_accuracy /= len(test_sampler) # Horovod: average metric values across workers. test_loss = metric_average(test_loss, 'avg_loss') test_accuracy = metric_average(test_accuracy, 'avg_accuracy') # Horovod: print output only on first rank. if hvd.rank() == 0: print( '\nTest set: Average loss: {:.4f}, Accuracy: {:.2f}%\n'.format( test_loss, 100. * test_accuracy)) # Horovod: initialize library. hvd.init() torch.manual_seed(args.seed) if args.cuda: # Horovod: pin GPU to local rank. torch.cuda.set_device(hvd.local_rank()) torch.cuda.manual_seed(args.seed) else: if args.use_mixed_precision: raise ValueError( "Mixed precision is only supported with cuda enabled.") if (args.use_mixed_precision and LooseVersion(torch.__version__) < LooseVersion('1.6.0')): raise ValueError("""Mixed precision is using torch.cuda.amp.autocast(), which requires torch >= 1.6.0""") # Horovod: limit # of CPU threads to be used per worker. 
torch.set_num_threads(1) kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {} # When supported, use 'forkserver' to spawn dataloader workers instead of 'fork' to prevent # issues with Infiniband implementations that are not fork-safe if (kwargs.get('num_workers', 0) > 0 and hasattr(mp, '_supports_context') and mp._supports_context and 'forkserver' in mp.get_all_start_methods()): kwargs['multiprocessing_context'] = 'forkserver' data_dir = args.data_dir or './data' with FileLock(os.path.expanduser("~/.horovod_lock")): train_dataset = \ datasets.MNIST(data_dir, train=True, download=True, transform=transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,)) ])) # Horovod: use DistributedSampler to partition the training data. train_sampler = torch.utils.data.distributed.DistributedSampler( train_dataset, num_replicas=hvd.size(), rank=hvd.rank()) train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=args.batch_size, sampler=train_sampler, **kwargs) test_dataset = \ datasets.MNIST(data_dir, train=False, transform=transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,)) ])) # Horovod: use DistributedSampler to partition the test data. test_sampler = torch.utils.data.distributed.DistributedSampler( test_dataset, num_replicas=hvd.size(), rank=hvd.rank()) test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=args.test_batch_size, sampler=test_sampler, **kwargs) model = Net() # By default, Adasum doesn't need scaling up learning rate. lr_scaler = hvd.size() if not args.use_adasum else 1 if args.cuda: # Move model to GPU. model.cuda() # If using GPU Adasum allreduce, scale learning rate by local_size. if args.use_adasum and hvd.nccl_built(): lr_scaler = hvd.local_size() # Horovod: scale learning rate by lr_scaler. optimizer = optim.SGD(model.parameters(), lr=args.lr * lr_scaler, momentum=args.momentum) # Horovod: broadcast parameters & optimizer state. hvd.broadcast_parameters(model.state_dict(), root_rank=0) hvd.broadcast_optimizer_state(optimizer, root_rank=0) # Horovod: (optional) compression algorithm. compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none # Horovod: wrap optimizer with DistributedOptimizer. optimizer = hvd.DistributedOptimizer( optimizer, named_parameters=model.named_parameters(), compression=compression, op=hvd.Adasum if args.use_adasum else hvd.Average, gradient_predivide_factor=args.gradient_predivide_factor) if args.use_mixed_precision: # Initialize scaler in global scale scaler = torch.cuda.amp.GradScaler() for epoch in range(1, args.epochs + 1): if args.use_mixed_precision: train_mixed_precision(epoch, scaler) else: train_epoch(epoch) # Keep test in full precision since computation is relatively light. test()
def train_main(args, filenames): # Horovod: initialize library. hvd.init() torch.manual_seed(args.seed) if torch.cuda.is_available() and not args.no_cuda: # Horovod: pin GPU to local rank. torch.cuda.set_device(hvd.local_rank()) torch.cuda.manual_seed(args.seed) # Horovod: limit # of CPU threads to be used per worker. torch.set_num_threads(1) rank = hvd.rank() train_dataset = create_dataset( filenames, batch_size=args.batch_size, rank=rank, num_epochs=args.epochs, world_size=hvd.size(), num_reducers=args.num_reducers, max_concurrent_epochs=args.max_concurrent_epochs) model = Net() # By default, Adasum doesn"t need scaling up learning rate. lr_scaler = hvd.size() if not args.use_adasum else 1 if torch.cuda.is_available() and not args.no_cuda: # Move model to GPU. model.cuda() # If using GPU Adasum allreduce, scale learning rate by local_size. if args.use_adasum and hvd.nccl_built(): lr_scaler = hvd.local_size() # Horovod: scale learning rate by lr_scaler. optimizer = optim.SGD(model.parameters(), lr=args.lr * lr_scaler, momentum=args.momentum) # Horovod: broadcast parameters & optimizer state. hvd.broadcast_parameters(model.state_dict(), root_rank=0) hvd.broadcast_optimizer_state(optimizer, root_rank=0) # Horovod: (optional) compression algorithm. compression = (hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none) # Horovod: wrap optimizer with DistributedOptimizer. optimizer = hvd.DistributedOptimizer( optimizer, named_parameters=model.named_parameters(), compression=compression, op=hvd.Adasum if args.use_adasum else hvd.Average, gradient_predivide_factor=args.gradient_predivide_factor) def _train(epoch): model.train() # Horovod: set epoch to sampler for shuffling. train_dataset.set_epoch(epoch) start_epoch = timeit.default_timer() last_batch_time = start_epoch batch_wait_times = [] for batch_idx, (data, target) in enumerate(train_dataset): batch_wait_times.append(timeit.default_timer() - last_batch_time) if torch.cuda.is_available() and not args.no_cuda: if isinstance(data, list): data = [t.cuda() for t in data] target = target.cuda() optimizer.zero_grad() # output = model(data) if batch_idx % args.log_interval == 0: print( f"Processing batch {batch_idx} in epoch {epoch} on worker " f"{rank}.") time.sleep(args.mock_train_step_time) # TODO(Clark): Add worker synchronization barrier here. # loss = F.nll_loss(output, target) # loss.backward() # optimizer.step() last_batch_time = timeit.default_timer() epoch_duration = timeit.default_timer() - start_epoch avg_batch_wait_time = np.mean(batch_wait_times) std_batch_wait_time = np.std(batch_wait_times) max_batch_wait_time = np.max(batch_wait_times) min_batch_wait_time = np.min(batch_wait_times) print(f"\nEpoch {epoch}, worker {rank} stats over " f"{len(batch_wait_times)} steps: {epoch_duration:.3f}") print(f"Mean batch wait time: {avg_batch_wait_time:.3f}s +- " f"{std_batch_wait_time}") print(f"Max batch wait time: {max_batch_wait_time:.3f}s") print(f"Min batch wait time: {min_batch_wait_time:.3f}s") return batch_wait_times print(f"Starting training on worker {rank}.") batch_wait_times = [] for epoch in range(args.epochs): # TODO(Clark): Don't include stats from first epoch since we already # expect that epoch to be cold? 
batch_wait_times.extend(_train(epoch)) print(f"Done training on worker {rank}.") avg_batch_wait_time = np.mean(batch_wait_times) std_batch_wait_time = np.std(batch_wait_times) max_batch_wait_time = np.max(batch_wait_times) min_batch_wait_time = np.min(batch_wait_times) print(f"\nWorker {rank} training stats over {args.epochs} epochs:") print(f"Mean batch wait time: {avg_batch_wait_time:.3f}s +- " f"{std_batch_wait_time}") print(f"Max batch wait time: {max_batch_wait_time:.3f}s") print(f"Min batch wait time: {min_batch_wait_time:.3f}s") # TODO(Clark): Add logic to the dataset abstraction so we don't have to do # this. if rank == 0: print("Waiting in rank 0 worker to let other workers consume queue...") time.sleep(10) print("Done waiting in rank 0 worker.")
def main(): args = parser.parse_args() # Set-up tensorboard # Horovod: initialize library. seed = 42 hvd.init() torch.manual_seed(seed) # Horovod: pin GPU to local rank. torch.cuda.set_device(hvd.local_rank()) torch.cuda.manual_seed(seed) # Horovod: limit # of CPU threads to be used per worker. torch.set_num_threads(1) kwargs = {'num_workers': 1, 'pin_memory': True} # When supported, use 'forkserver' to spawn dataloader workers instead of 'fork' to prevent # issues with Infiniband implementations that are not fork-safe if (hasattr(mp, '_supports_context') and mp._supports_context and 'forkserver' in mp.get_all_start_methods()): kwargs['multiprocessing_context'] = 'forkserver' data_dir = args.data_dir with FileLock(os.path.expanduser("~/.horovod_lock")): train_dataset = \ datasets.MNIST(data_dir, train=True, download=True, transform=transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,)) ])) # Horovod: use DistributedSampler to partition the training data. train_sampler = torch.utils.data.distributed.DistributedSampler( train_dataset, num_replicas=hvd.size(), rank=hvd.rank()) train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=args.batch_size, sampler=train_sampler, **kwargs) test_dataset = \ datasets.MNIST(data_dir, train=False, transform=transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,)) ])) # Horovod: use DistributedSampler to partition the test data. test_sampler = torch.utils.data.distributed.DistributedSampler( test_dataset, num_replicas=hvd.size(), rank=hvd.rank()) test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=args.test_batch_size, sampler=test_sampler, **kwargs) model = Net() loss_function = nn.CrossEntropyLoss() running_loss = 0.0 # By default, Adasum doesn't need scaling up learning rate. lr_scaler = hvd.size() if not args.use_adasum else 1 # Move model to GPU. model.cuda() # If using GPU Adasum allreduce, scale learning rate by local_size. if args.use_adasum and hvd.nccl_built(): lr_scaler = hvd.local_size() # Horovod: scale learning rate by lr_scaler. optimizer = optim.Adam(model.parameters(), lr=args.base_lr * lr_scaler) # Horovod: broadcast parameters & optimizer state. hvd.broadcast_parameters(model.state_dict(), root_rank=0) hvd.broadcast_optimizer_state(optimizer, root_rank=0) # Horovod: (optional) compression algorithm. compression = hvd.Compression.none # Horovod: wrap optimizer with DistributedOptimizer. optimizer = hvd.DistributedOptimizer( optimizer, named_parameters=model.named_parameters(), compression=compression, op=hvd.Adasum if args.use_adasum else hvd.Average) # Profile training logs = "logs/pytorch-" + datetime.now().strftime("%Y%m%d-%H%M%S") writer = SummaryWriter(log_dir=logs) for epoch in range(1, args.epochs + 1): train(epoch, model, train_sampler, train_loader, optimizer, loss_function, args) test_loss, test_accuracy = test(model, test_loader, test_sampler) if hvd.rank() == 0: writer.add_scalars("Test", { "loss": test_loss, "acc.": test_accuracy }) writer.close()
def __init__(self, opt): """Initialize the pix2pix class. Parameters: opt (Option class)-- stores all the experiment flags; needs to be a subclass of BaseOptions """ BaseModel.__init__(self, opt) # specify the training losses you want to print out. The training/test scripts will call <BaseModel.get_current_losses> self.loss_names = ['G_GAN', 'G_L1', 'D_real', 'D_fake'] # specify the images you want to save/display. The training/test scripts will call <BaseModel.get_current_visuals> self.visual_names = ['real_A', 'fake_B', 'real_B'] # specify the models you want to save to the disk. The training/test scripts will call <BaseModel.save_networks> and <BaseModel.load_networks> if self.isTrain: self.model_names = ['G', 'D'] else: # during test time, only load G self.model_names = ['G'] # define networks (both generator and discriminator) self.netG = networks.define_G(opt.input_nc, opt.output_nc, opt.ngf, opt.netG, opt.norm, not opt.no_dropout, opt.init_type, opt.init_gain, self.gpu_ids) # Horovod hvd.broadcast_parameters(self.netG.state_dict(), root_rank=0) if self.isTrain: # define a discriminator; conditional GANs need to take both input and output images; Therefore, #channels for D is input_nc + output_nc self.netD = networks.define_D(opt.input_nc + opt.output_nc, opt.ndf, opt.netD, opt.n_layers_D, opt.norm, opt.init_type, opt.init_gain, self.gpu_ids) # Horovod hvd.broadcast_parameters(self.netD.state_dict(), root_rank=0) if self.isTrain: # Horovod compression = hvd.Compression.fp16 if opt.fp16_allreduce else hvd.Compression.none lr_scaler = hvd.size() if not opt.use_adasum else 1 if opt.use_adasum and hvd.nccl_built(): lr_scaler = hvd.local_size() # define loss functions self.criterionGAN = networks.GANLoss(opt.gan_mode).to(self.device) self.criterionL1 = torch.nn.L1Loss() # initialize optimizers; schedulers will be automatically created by function <BaseModel.setup>. optimizer_G = torch.optim.Adam(self.netG.parameters(), lr=opt.lr * lr_scaler, betas=(opt.beta1, 0.999)) # Horovod hvd.broadcast_optimizer_state(optimizer_G, root_rank=0) self.optimizer_G = hvd.DistributedOptimizer( optimizer_G, named_parameters=self.netG.named_parameters()) optimizer_D = torch.optim.Adam(self.netD.parameters(), lr=opt.lr * lr_scaler, betas=(opt.beta1, 0.999)) # Horovod hvd.broadcast_optimizer_state(optimizer_D, root_rank=0) self.optimizer_D = hvd.DistributedOptimizer( optimizer_D, named_parameters=self.netD.named_parameters()) self.optimizers.append(self.optimizer_G) self.optimizers.append(self.optimizer_D)
def hvd_param_scaling(self):
    if hvd.nccl_built():
        self.batch_size = int(self.batch_size / hvd.local_size())
        self.iters_per_epoch = int(self.max_iterations / self.epochs / hvd.local_size())
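A worked illustration of the division performed above, using made-up numbers (the concrete values are assumptions, not taken from the original code): with an NCCL build, both the per-worker batch size and the per-worker iteration count shrink by hvd.local_size().

# Hypothetical values illustrating hvd_param_scaling() with 4 workers per node.
local_size = 4                                               # stand-in for hvd.local_size()
batch_size = 512
max_iterations, epochs = 100_000, 10
per_worker_batch = int(batch_size / local_size)              # 512 / 4 -> 128
iters_per_epoch = int(max_iterations / epochs / local_size)  # 100000 / 10 / 4 -> 2500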
def main(args): hvd.init() args.cuda = not args.no_cuda and torch.cuda.is_available() if args.cuda: device = torch.device('cuda') # Horovod: pin GPU to local rank. torch.cuda.set_device(hvd.local_rank()) device = 'GPU' if args.cuda else 'CPU' if hvd.rank() == 0: log('Using PyTorch version: %s, Device: %s' % (torch.__version__, device)) log('Horovod version: %s, CUDA: %s, ROCM: %s, NCCL: %s, MPI: %s' % (horovod.__version__, hvd.cuda_built(), hvd.rocm_built(), hvd.nccl_built(), hvd.mpi_built())) log(torch.__config__.show()) cudnn.benchmark = True # Set up standard model. log('Initializing %s model...' % args.model) model = getattr(models, args.model)() # By default, Adasum doesn't need scaling up learning rate. lr_scaler = hvd.size() if not args.use_adasum else 1 if args.cuda: # Move model to GPU. model.cuda() # If using GPU Adasum allreduce, scale learning rate by local_size. if args.use_adasum and hvd.nccl_built(): lr_scaler = hvd.local_size() optimizer = optim.SGD(model.parameters(), lr=0.01 * lr_scaler) # Horovod: (optional) compression algorithm. compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none # Horovod: wrap optimizer with DistributedOptimizer. optimizer = hvd.DistributedOptimizer( optimizer, named_parameters=model.named_parameters(), compression=compression, op=hvd.Adasum if args.use_adasum else hvd.Average) # Horovod: broadcast parameters & optimizer state. hvd.broadcast_parameters(model.state_dict(), root_rank=0) hvd.broadcast_optimizer_state(optimizer, root_rank=0) if args.fixed_data: data, target = generate_data(args) def benchmark_step(): nonlocal data, target if not args.fixed_data: data, target = generate_data(args) optimizer.zero_grad() output = model(data) loss = F.cross_entropy(output, target) loss.backward() optimizer.step() log('Model: %s' % args.model) log('Batch size: %d' % args.batch_size) log('Number of %ss: %d' % (device, hvd.size())) # Warm-up log('Running warmup...') timeit.timeit(benchmark_step, number=args.num_warmup_batches) # Benchmark log('Running benchmark...') img_secs = [] for x in range(args.num_iters): time = timeit.timeit(benchmark_step, number=args.num_batches_per_iter) img_sec = args.batch_size * args.num_batches_per_iter / time log('Iter #%d: %.1f img/sec per %s' % (x, img_sec, device)) img_secs.append(img_sec) # Results img_sec_mean = np.mean(img_secs) img_sec_conf = 1.96 * np.std(img_secs) log('Img/sec per %s: %.1f +-%.1f' % (device, img_sec_mean, img_sec_conf)) log('Total img/sec on %d %s(s): %.1f +-%.1f' % (hvd.size(), device, hvd.size() * img_sec_mean, hvd.size() * img_sec_conf))
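The summary lines above report img_sec_mean +- 1.96 * np.std(img_secs), i.e. roughly a 95% interval over the timed iterations. A small illustration with invented throughput numbers (assumptions, not measurements from the benchmark):

import numpy as np

# Hypothetical per-iteration throughputs in img/sec; real values come from the timeit loop above.
img_secs = [410.0, 395.0, 402.0, 398.0]
img_sec_mean = np.mean(img_secs)        # 401.25
img_sec_conf = 1.96 * np.std(img_secs)  # ~11.0, printed as "+-11.0"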
def train_fn(): # Horovod: initialize library. hvd.init() torch.manual_seed(args.seed) args.cuda = not args.no_cuda and torch.cuda.is_available() if args.cuda: # Horovod: pin GPU to local rank. torch.cuda.set_device(hvd.local_rank()) torch.cuda.manual_seed(args.seed) # Horovod: limit # of CPU threads to be used per worker. torch.set_num_threads(1) kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {} train_dataset = \ datasets.MNIST('data-%d' % hvd.rank(), train=True, download=True, transform=transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,)) ])) # Horovod: use DistributedSampler to partition the training data. train_sampler = torch.utils.data.distributed.DistributedSampler( train_dataset, num_replicas=hvd.size(), rank=hvd.rank()) train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=args.batch_size, sampler=train_sampler, **kwargs) transformations = transforms.Compose( [transforms.ToTensor(), transforms.Normalize((0.1307, ), (0.3081, ))]) test_dataset = datasets.MNIST('data-%d' % hvd.rank(), train=False, transform=transformations) # Horovod: use DistributedSampler to partition the test data. test_sampler = torch.utils.data.distributed.DistributedSampler( test_dataset, num_replicas=hvd.size(), rank=hvd.rank()) test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=args.test_batch_size, sampler=test_sampler, **kwargs) model = Net() # By default, Adasum doesn't need scaling up learning rate. lr_scaler = hvd.size() if not args.use_adasum else 1 if args.cuda: # Move model to GPU. model.cuda() # If using GPU Adasum allreduce, scale learning rate by local_size. if args.use_adasum and hvd.nccl_built(): lr_scaler = hvd.local_size() # Horovod: scale learning rate by lr_scaler. optimizer = optim.SGD(model.parameters(), lr=args.lr * lr_scaler, momentum=args.momentum) # Horovod: (optional) compression algorithm. compression = (hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none) @hvd.elastic.run def train(state): # post synchronization event (worker added, worker removed) init ... for state.epoch in range(state.epoch, args.epochs + 1): state.model.train() train_sampler.set_epoch(state.epoch) steps_remaining = len(train_loader) - state.batch for state.batch, (data, target) in enumerate(train_loader): if state.batch >= steps_remaining: break if args.cuda: data, target = data.cuda(), target.cuda() state.optimizer.zero_grad() output = state.model(data) loss = F.nll_loss(output, target) loss.backward() state.optimizer.step() if state.batch % args.log_interval == 0: # Horovod: use train_sampler to determine # the number of examples in this worker's partition. print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'. format(state.epoch, state.batch * len(data), len(train_sampler), 100.0 * state.batch / len(train_loader), loss.item())) if (state.batch + 1) % args.num_batches_per_commit == 0: state.commit() state.batch = 0 def test(): model.eval() test_loss = 0. test_accuracy = 0. for data, target in test_loader: if args.cuda: data, target = data.cuda(), target.cuda() output = model(data) # sum up batch loss test_loss += F.nll_loss(output, target, size_average=False).item() # get the index of the max log-probability pred = output.data.max(1, keepdim=True)[1] test_accuracy += pred.eq( target.data.view_as(pred)).cpu().float().sum() # Horovod: use test_sampler to determine the number of examples in # this worker's partition. 
test_loss /= len(test_sampler) test_accuracy /= len(test_sampler) # Horovod: average metric values across workers. test_loss = metric_average(test_loss, 'avg_loss') test_accuracy = metric_average(test_accuracy, 'avg_accuracy') # Horovod: print output only on first rank. if hvd.rank() == 0: print( '\nTest set: Average loss: {:.4f}, Accuracy: {:.2f}%\n'.format( test_loss, 100. * test_accuracy)) # Horovod: wrap optimizer with DistributedOptimizer. optimizer = hvd.DistributedOptimizer( optimizer, named_parameters=model.named_parameters(), compression=compression, op=hvd.Adasum if args.use_adasum else hvd.Average) # adjust learning rate on reset def on_state_reset(): for param_group in optimizer.param_groups: param_group['lr'] = args.lr * hvd.size() state = hvd.elastic.TorchState(model, optimizer, epoch=1, batch=0) state.register_reset_callbacks([on_state_reset]) train(state) test()
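The test() closure above calls metric_average, which is not defined inside train_fn; in the full script it is presumably a module-level helper like the one used in the earlier MNIST examples. A minimal sketch of such a helper, assuming the usual horovod.torch import alias:

import torch
import horovod.torch as hvd

def metric_average(val, name):
    # Average a scalar metric across all Horovod workers via allreduce.
    tensor = torch.tensor(val)
    avg_tensor = hvd.allreduce(tensor, name=name)
    return avg_tensor.item()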
def train_func(config): data_dir = config.get("data_dir", None) seed = config.get("seed", 42) use_cuda = config.get("use_cuda", False) batch_size = config.get("batch_size", 64) use_adasum = config.get("use_adasum", False) lr = config.get("lr", 0.01) momentum = config.get("momentum", 0.5) num_epochs = config.get("num_epochs", 10) log_interval = config.get("log_interval", 10) # Horovod: initialize library. hvd.init() torch.manual_seed(seed) if use_cuda: # Horovod: pin GPU to local rank. torch.cuda.set_device(hvd.local_rank()) torch.cuda.manual_seed(seed) # Horovod: limit # of CPU threads to be used per worker. torch.set_num_threads(1) kwargs = {"num_workers": 1, "pin_memory": True} if use_cuda else {} data_dir = data_dir or "~/data" with FileLock(os.path.expanduser("~/.horovod_lock")): train_dataset = \ datasets.MNIST(data_dir, train=True, download=True, transform=transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,)) ])) # Horovod: use DistributedSampler to partition the training data. train_sampler = torch.utils.data.distributed.DistributedSampler( train_dataset, num_replicas=hvd.size(), rank=hvd.rank()) train_loader = torch.utils.data.DataLoader( train_dataset, batch_size=batch_size, sampler=train_sampler, **kwargs) model = Net() # By default, Adasum doesn't need scaling up learning rate. lr_scaler = hvd.size() if not use_adasum else 1 if use_cuda: # Move model to GPU. model.cuda() # If using GPU Adasum allreduce, scale learning rate by local_size. if use_adasum and hvd.nccl_built(): lr_scaler = hvd.local_size() # Horovod: scale learning rate by lr_scaler. optimizer = optim.SGD( model.parameters(), lr=lr * lr_scaler, momentum=momentum) # Horovod: wrap optimizer with DistributedOptimizer. optimizer = hvd.DistributedOptimizer( optimizer, named_parameters=model.named_parameters(), op=hvd.Adasum if use_adasum else hvd.Average) results = [] for epoch in range(1, num_epochs + 1): model.train() # Horovod: set epoch to sampler for shuffling. train_sampler.set_epoch(epoch) num_batches = len(train_loader) for batch_idx, (data, target) in enumerate(train_loader): if use_cuda: data, target = data.cuda(), target.cuda() optimizer.zero_grad() output = model(data) loss = F.nll_loss(output, target) loss.backward() optimizer.step() if batch_idx % log_interval == 0: # Horovod: use train_sampler to determine the number of # examples in this worker's partition. print("Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format( epoch, batch_idx * len(data), len(train_sampler), 100. * batch_idx / len(train_loader), loss.item())) if batch_idx == num_batches - 1: results.append(loss.item()) return results
def setup(config): data_dir = config.get("data_dir", None) seed = config.get("seed", 42) batch_size = config.get("batch_size", 64) use_adasum = config.get("use_adasum", False) lr = config.get("lr", 0.01) momentum = config.get("momentum", 0.5) use_cuda = config.get("use_cuda", False) # Horovod: initialize library. hvd.init() torch.manual_seed(seed) if use_cuda: # Horovod: pin GPU to local rank. torch.cuda.set_device(hvd.local_rank()) torch.cuda.manual_seed(seed) # Horovod: limit # of CPU threads to be used per worker. torch.set_num_threads(1) kwargs = {"num_workers": 1, "pin_memory": True} if use_cuda else {} data_dir = data_dir or "~/data" with FileLock(os.path.expanduser("~/.horovod_lock")): train_dataset = datasets.MNIST( data_dir, train=True, download=True, transform=transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.1307, ), (0.3081, )) ]), ) # Horovod: use DistributedSampler to partition the training data. train_sampler = torch.utils.data.distributed.DistributedSampler( train_dataset, num_replicas=hvd.size(), rank=hvd.rank()) train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, sampler=train_sampler, **kwargs) model = Net() # By default, Adasum doesn't need scaling up learning rate. lr_scaler = hvd.size() if not use_adasum else 1 if use_cuda: # Move model to GPU. model.cuda() # If using GPU Adasum allreduce, scale learning rate by local_size. if use_adasum and hvd.nccl_built(): lr_scaler = hvd.local_size() # Horovod: scale learning rate by lr_scaler. optimizer = optim.SGD(model.parameters(), lr=lr * lr_scaler, momentum=momentum) # Horovod: wrap optimizer with DistributedOptimizer. optimizer = hvd.DistributedOptimizer( optimizer, named_parameters=model.named_parameters(), op=hvd.Adasum if use_adasum else hvd.Average, ) return model, optimizer, train_loader, train_sampler
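setup(config) above only builds and returns the model, optimizer, loader, and sampler; a hypothetical driver loop (every name other than the returned tuple is an assumption) could consume them the same way train_func earlier in this section does:

import torch.nn.functional as F

def run_training(config):
    # Hypothetical driver for setup(); the epoch count and cuda flag are assumptions.
    model, optimizer, train_loader, train_sampler = setup(config)
    use_cuda = config.get("use_cuda", False)
    for epoch in range(1, config.get("num_epochs", 10) + 1):
        model.train()
        train_sampler.set_epoch(epoch)  # Horovod: re-shuffle each epoch
        for data, target in train_loader:
            if use_cuda:
                data, target = data.cuda(), target.cuda()
            optimizer.zero_grad()
            loss = F.nll_loss(model(data), target)
            loss.backward()
            optimizer.step()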
def run(): args.cuda = not args.no_cuda and torch.cuda.is_available() # Horovod: initialize library. hvd.init() torch.manual_seed(args.seed) if args.cuda: # Horovod: pin GPU to local rank. torch.cuda.set_device(hvd.local_rank()) torch.cuda.manual_seed(args.seed) # Horovod: limit # of CPU threads to be used per worker. torch.set_num_threads(1) """model_init""" model = FFN_no_norm(in_channels=4, out_channels=1, input_size=args.input_size, delta=args.delta, depth=args.depth) #hvd ddl # By default, Adasum doesn't need scaling up learning rate. lr_scaler = hvd.size() if not args.use_adasum else 1 if args.cuda: # Move model to GPU. model.cuda() # If using GPU Adasum allreduce, scale learning rate by local_size. if args.use_adasum and hvd.nccl_built(): lr_scaler = hvd.local_size() # Horovod: scale learning rate by lr_scaler. optimizer = optim.SGD(model.parameters(), lr=args.lr * lr_scaler) # Horovod: broadcast parameters & optimizer state. hvd.broadcast_parameters(model.state_dict(), root_rank=0) hvd.broadcast_optimizer_state(optimizer, root_rank=0) # Horovod: (optional) compression algorithm. compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none # Horovod: wrap optimizer with DistributedOptimizer. optimizer = hvd.DistributedOptimizer(optimizer, named_parameters=model.named_parameters(), compression=compression, op=hvd.Adasum if args.use_adasum else hvd.Average) """resume""" if args.resume is not None: model.load_state_dict(torch.load(args.resume)) if os.path.exists(args.save_path + 'resume_step.pkl'): resume = load_obj(args.save_path + 'resume_step.pkl') else: resume = {'resume_step': args.resume_step} args.resume_step = resume['resume_step'] print('resume_step', args.resume_step) if args.tb == None: tb = SummaryWriter('./tensorboard/'+args.tag+'tb_train_log_fov:{}_delta:{}_depth:{}.pth' .format(list(args.input_size)[0], list(args.delta)[0], args.depth)) else: tb = SummaryWriter(args.tb) """data_load""" train_dataset= BatchCreator(args.train_data_dir, args.input_size, delta=args.delta,train=True) kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {} # When supported, use 'forkserver' to spawn dataloader workers instead of 'fork' to prevent # issues with Infiniband implementations that are not fork-safe if (kwargs.get('num_workers', 0) > 0 and hasattr(mp, '_supports_context') and mp._supports_context and 'forkserver' in mp.get_all_start_methods()): kwargs['multiprocessing_context'] = 'forkserver' # Horovod: use DistributedSampler to partition the training data. 
train_sampler = torch.utils.data.distributed.DistributedSampler( train_dataset, num_replicas=hvd.size(), rank=hvd.rank()) train_loader = torch.utils.data.DataLoader( train_dataset, sampler=train_sampler, **kwargs) batch_it = get_batch(train_loader, args.batch_size, args.input_size, partial(fixed_offsets, fov_moves=train_dataset.shifts)) """ for index in range(files_total): input_h5data_dict = [(abs_path_training_data + sorted_files_train_data)] print(input_h5data_dict) train_dataset_dict = BatchCreator(input_h5data_dict, args.input_size, delta=args.delta, train=True) train_sampler_dict = torch.utils.data.distributed.DistributedSampler(train_dataset_dict, num_replicas=world_size, rank=rank, shuffle=True) train_loader_dict = DataLoader(train_dataset_dict, num_workers=0, sampler=train_sampler_dict, pin_memory=True) batch_it_dict = get_batch(train_loader_dict, args.batch_size, args.input_size, partial(fixed_offsets, fov_moves=train_dataset_dict.shifts)) """ """optimizer""" """ if args.opt == 'Adam': optimizer = optim.Adam(model.parameters(), lr=args.lr) else: optimizer = optim.SGD(model.parameters(), lr=1e-3) """ # optimizer = adabound.AdaBound(model.parameters(), lr=1e-3, final_lr=0.1) # scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.step, gamma=args.gamma, last_epoch=-1) """train_loop""" t_last = time.time() cnt = 0 tp = fp = tn = fn = 0 best_loss = np.inf model.train() while cnt < args.iter: cnt += 1 # resume_tb if cnt % 1000 == 0: resume['resume_step'] = cnt + args.resume_step pickle_obj(resume, 'resume_step', args.save_path) """ index_batch = (cnt % train_num) train_sampler_dict[index_batch].set_epoch(cnt) seeds, images, labels, offsets = next(batch_it_dict[index_batch]) print(input_h5data_dict[index_batch]) """ train_sampler.set_epoch(cnt) seeds, images, labels, offsets = next(batch_it) # train t_curr = time.time() labels = labels.cuda() torch_seed = torch.from_numpy(seeds) input_data = torch.cat([images, torch_seed], dim=1) input_data = Variable(input_data.cuda()) logits = model(input_data) updated = torch_seed.cuda() + logits optimizer.zero_grad() loss = F.binary_cross_entropy_with_logits(updated, labels) loss.backward() # torch.nn.utils.clip_grad_value_(model.parameters(), args.clip_grad_thr) optimizer.step() seeds[...] = updated.detach().cpu().numpy()
pred_mask = (updated >= logit(0.8)).detach().cpu().numpy() true_mask = (labels > 0.5).cpu().numpy() true_bg = np.logical_not(true_mask) pred_bg = np.logical_not(pred_mask) tp += (true_mask & pred_mask).sum() fp += (true_bg & pred_mask).sum() fn += (true_mask & pred_bg).sum() tn += (true_bg & pred_bg).sum() precision = 1.0 * tp / max(tp + fp, 1) recall = 1.0 * tp / max(tp + fn, 1) accuracy = 1.0 * (tp + tn) / (tp + tn + fp + fn) print('[rank_{}:, Iter_{}:, loss: {:.4}, Precision: {:.2f}%, Recall: {:.2f}%, Accuracy: {:.2f}%]\r'.format(hvd.rank(), cnt, loss.item(), precision * 100, recall * 100, accuracy * 100)) # scheduler.step() """model_saving_(iter)""" if (cnt % args.save_interval) == 0 and hvd.rank() == 0: tp = fp = tn = fn = 0 # t_last = t_curr # best_loss = loss.item() input_size_r = list(args.input_size) delta_r = list(args.delta) torch.save(model.state_dict(), os.path.join(args.save_path, ( str(args.tag) + 'ffn_model_fov:{}_delta:{}_depth:{}.pth'.format(input_size_r[0], delta_r[0], args.depth)))) torch.save(model.state_dict(), os.path.join(args.save_path, ( str(args.tag) + 'ffn_model_fov:{}_delta:{}_depth:{}_recall{}_.pth'.format(input_size_r[0], delta_r[0], args.depth, recall * 100)))) print('Precision: {:.2f}%, Recall: {:.2f}%, Accuracy: {:.2f}%, Model saved!'.format( precision * 100, recall * 100, accuracy * 100)) buffer_step = 3000 resume_step = args.resume_step - buffer_step if cnt > buffer_step: tb.add_scalar("Loss", loss.item(), cnt + resume_step) tb.add_scalar("Precision", precision * 100, cnt + resume_step) tb.add_scalar("Recall", recall * 100, cnt + resume_step) tb.add_scalar("Accuracy", accuracy * 100, cnt + resume_step)