def train_with_single(dnn, dataset, data_dir, nworkers, lr, batch_size, nsteps_update, max_epochs, num_steps=1):
    torch.cuda.set_device(0)
    trainer = DLTrainer(0, nworkers, dist=False, batch_size=batch_size, is_weak_scaling=True,
            ngpus=1, data_dir=data_dir, dataset=dataset, dnn=dnn, lr=lr, nworkers=nworkers,
            prefix='singlegpu', num_steps=num_steps)
    iters_per_epoch = trainer.get_num_of_training_samples() // (nworkers * batch_size * nsteps_update)

    seq_layernames, layerwise_times, layerwise_sizes = benchmark(trainer)
    logger.info('Benchmarked backward time: %f', np.sum(layerwise_times))
    logger.info('Model size: %d', np.sum(layerwise_sizes))

    times = []
    display = 100 if iters_per_epoch > 100 else iters_per_epoch - 1
    for epoch in range(max_epochs):
        if dnn == 'lstm':
            hidden = trainer.net.init_hidden()
        for i in range(iters_per_epoch):
            s = time.time()
            trainer.optimizer.zero_grad()
            for j in range(nsteps_update):
                if dnn == 'lstm':
                    _, hidden = trainer.train(1, hidden=hidden)
                else:
                    trainer.train(1)
            trainer.update_model()
            times.append(time.time() - s)
            if i % display == 0 and i > 0:
                time_per_iter = np.mean(times)
                logger.info('Time per iteration including communication: %f. Speed: %f images/s',
                        time_per_iter, batch_size * nsteps_update / time_per_iter)
                times = []
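# A hypothetical single-process entry point for train_with_single(); the model, dataset and
# hyper-parameter values below are illustrative assumptions, not the repo's actual CLI.
def _run_single_gpu_example():
    train_with_single(dnn='resnet20', dataset='cifar10', data_dir='./data', nworkers=1,
                      lr=0.1, batch_size=32, nsteps_update=1, max_epochs=1)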
def ssgd(dnn, dataset, data_dir, nworkers, lr, batch_size, nsteps_update, max_epochs, nwpernode,
         pretrain, num_steps, compressor, density, threshold, gradient_path=None):
    rank = hvd.rank()
    torch.cuda.set_device(rank % nwpernode)
    if rank != 0:
        pretrain = None
    trainer = DLTrainer(rank, nworkers, dist=False, batch_size=batch_size, is_weak_scaling=True,
            ngpus=1, data_dir=data_dir, dataset=dataset, dnn=dnn, lr=lr, nworkers=nworkers,
            prefix='allreduce', pretrain=pretrain, num_steps=num_steps, tb_writer=writer)

    init_epoch = torch.ones(1) * trainer.get_train_epoch()
    init_iter = torch.ones(1) * trainer.get_train_iter()
    trainer.set_train_epoch(int(hvd.broadcast(init_epoch, root_rank=0)[0]))
    trainer.set_train_iter(int(hvd.broadcast(init_iter, root_rank=0)[0]))

    is_sparse = density < 1
    #if not is_sparse:
    #    compressor = None

    if settings.ADAPTIVE_MERGE or settings.ADAPTIVE_SPARSE:
        seq_layernames, layerwise_times, layerwise_sizes = benchmark(trainer)
        layerwise_times = comm.bcast(layerwise_times, root=0)
        if rank == 0:
            logger.info('layerwise backward times: %s', list(layerwise_times))
            logger.info('layerwise backward sizes: %s', list(layerwise_sizes))
            logger.info('Benchmarked backward time: %f', np.sum(layerwise_times))
            logger.info('Model size: %d', np.sum(layerwise_sizes))
    else:
        seq_layernames, layerwise_times, layerwise_sizes = None, None, None

    norm_clip = None
    if dnn == 'lstm':
        norm_clip = 0.25
    elif dnn == 'lstman4':
        norm_clip = 400

    optimizer = hvd.DistributedOptimizer(trainer.optimizer,
            named_parameters=trainer.net.named_parameters(),
            compression=compressors[compressor],
            is_sparse=is_sparse,
            density=density,
            seq_layernames=seq_layernames,
            layerwise_times=layerwise_times,
            norm_clip=norm_clip,
            threshold=threshold,
            writer=writer,
            gradient_path=gradient_path)
    hvd.broadcast_parameters(trainer.net.state_dict(), root_rank=0)
    trainer.update_optimizer(optimizer)

    iters_per_epoch = trainer.get_num_of_training_samples() // (nworkers * batch_size * nsteps_update)

    times = []
    logger.info('max_epochs: %d', max_epochs)
    display = 40 if iters_per_epoch > 40 else iters_per_epoch - 1
    for epoch in range(max_epochs):
        hidden = None
        if dnn == 'lstm':
            hidden = trainer.net.init_hidden()
        for i in range(iters_per_epoch):
            s = time.time()
            optimizer.zero_grad()
            for j in range(nsteps_update):
                if j < nsteps_update - 1 and nsteps_update > 1:
                    # Intermediate micro-step: accumulate gradients locally, no communication.
                    optimizer.local = True
                else:
                    # Last micro-step: perform the (compressed) all-reduce.
                    optimizer.local = False
                if dnn == 'lstm':
                    _, hidden = trainer.train(1, hidden=hidden)
                else:
                    trainer.train(1)
            if dnn == 'lstm':
                optimizer.synchronize()
                torch.nn.utils.clip_grad_norm_(trainer.net.parameters(), 0.25)
            elif dnn == 'lstman4':
                optimizer.synchronize()
                torch.nn.utils.clip_grad_norm_(trainer.net.parameters(), 400)
            trainer.update_model()
            times.append(time.time() - s)
            if i % display == 0 and i > 0:
                time_per_iter = np.mean(times)
                logger.warning('Time per iteration including communication: %f, Speed: %f images/s',
                        time_per_iter, batch_size * nsteps_update / time_per_iter)
                times = []
        optimizer.increase_one_epoch()
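# How ssgd() is typically launched (a sketch, not the repo's actual driver): Horovod must be
# initialized before any hvd.* call, and one process is started per worker, e.g. with
# `horovodrun -np 4 python <trainer_script>.py ...`. All argument values below, including the
# compressor name and density, are illustrative assumptions.
def _launch_ssgd_example():
    hvd.init()
    ssgd(dnn='resnet20', dataset='cifar10', data_dir='./data', nworkers=hvd.size(), lr=0.1,
         batch_size=32, nsteps_update=1, max_epochs=90, nwpernode=4, pretrain=None,
         num_steps=1, compressor='topk', density=0.01, threshold=0)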
# If using GPU Adasum allreduce, scale learning rate by local_size.
if args.use_adasum and hvd.nccl_built():
    lr_scaler = hvd.local_size()

optimizer = optim.SGD(model.parameters(), lr=0.01 * lr_scaler)

# Set up fixed fake data
data = torch.randn(args.batch_size, 3, 224, 224)
target = torch.LongTensor(args.batch_size).random_() % 1000
if args.cuda:
    data, target = data.cuda(), target.cuda()

if args.mgwfbp:
    seq_layernames, layerwise_times, _ = benchmark(model, (data, target), F.cross_entropy, task='imagenet')
    layerwise_times = comm.bcast(layerwise_times, root=0)
else:
    seq_layernames, layerwise_times = None, None

optimizer = hvd.DistributedOptimizer(optimizer,
        named_parameters=model.named_parameters(),
        compression=compressors[args.compressor](),
        is_sparse=args.density < 1,
        density=args.density,
        seq_layernames=seq_layernames,
        layerwise_times=layerwise_times,
        norm_clip=None,
        threshold=args.threshold,
        writer=None,
        gradient_path='./',
        momentum_correction=False,
        fp16=args.fp16,
        mgwfbp=args.mgwfbp,
        rdma=args.rdma)

# Horovod: broadcast parameters & optimizer state.
hvd.broadcast_parameters(model.state_dict(), root_rank=0)
hvd.broadcast_optimizer_state(optimizer, root_rank=0)

def benchmark_step():
    optimizer.zero_grad()
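# benchmark_step() is cut off above. In the Horovod synthetic-benchmark pattern it continues
# with a forward pass, F.cross_entropy loss, backward pass and optimizer.step() on the fixed
# fake batch; that continuation is an assumption, not shown in this excerpt. A hedged timing
# harness for such a step might look like this sketch (iteration counts are illustrative):
def _run_throughput_benchmark(num_iters=5, num_batches_per_iter=10):
    import timeit
    img_secs = []
    for _ in range(num_iters):
        elapsed = timeit.timeit(benchmark_step, number=num_batches_per_iter)
        img_secs.append(args.batch_size * num_batches_per_iter / elapsed)
    avg = sum(img_secs) / len(img_secs)
    print('Img/sec per %s: %.1f' % ('GPU' if args.cuda else 'CPU', avg))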
def ssgd(dnn, dataset, data_dir, nworkers, lr, batch_size, nsteps_update, max_epochs, nwpernode,
         pretrain, num_steps, compressor, density, threshold, gradient_path=None, tb=None,
         iratio=0.1, stages=1, partitions=0, ec_gradw=1.0, ec_memw=0.0, optimizer='nesterov', totaltime=0):
    global SPEED
    if not settings.USE_CPU:
        if nworkers > 1:
            rank = hvd.rank()
            torch.cuda.set_device(hvd.local_rank())  # rank % nwpernode
        else:
            rank = 0
            torch.cuda.set_device(rank)
    else:
        # CPU-only run: rank is still needed for the trainer below.
        rank = hvd.rank() if nworkers > 1 else 0
    if rank != 0:
        pretrain = None

    #### CHECK whether to use GPU or CPU
    if settings.USE_CPU:
        trainer = DLTrainer(rank, nworkers, dist=False, batch_size=batch_size, is_weak_scaling=True,
                ngpus=0, data_dir=data_dir, dataset=dataset, dnn=dnn, lr=lr, nworkers=nworkers,
                prefix='allreduce', pretrain=pretrain, num_steps=num_steps, tb_writer=writer,
                tb=tb, optimizer_str=optimizer)
    else:
        trainer = DLTrainer(rank, nworkers, dist=False, batch_size=batch_size, is_weak_scaling=True,
                ngpus=1, data_dir=data_dir, dataset=dataset, dnn=dnn, lr=lr, nworkers=nworkers,
                prefix='allreduce', pretrain=pretrain, num_steps=num_steps, tb_writer=writer,
                tb=tb, optimizer_str=optimizer)

    init_epoch = torch.ones(1) * trainer.get_train_epoch()
    init_iter = torch.ones(1) * trainer.get_train_iter()
    trainer.set_train_epoch(int(hvd.broadcast(init_epoch, root_rank=0)[0]))
    trainer.set_train_iter(int(hvd.broadcast(init_iter, root_rank=0)[0]))

    is_sparse = True  # density < 1
    if not is_sparse:
        compressor = None

    if settings.ADAPTIVE_MERGE or settings.ADAPTIVE_SPARSE:
        seq_layernames, layerwise_times, layerwise_sizes = benchmark(trainer)
        layerwise_times = comm.bcast(layerwise_times, root=0)
        if rank == 0:
            logger.info('layerwise backward times: %s', list(layerwise_times))
            logger.info('layerwise backward sizes: %s', list(layerwise_sizes))
            logger.info('Benchmarked backward time: %f', np.sum(layerwise_times))
            logger.info('Model size: %d', np.sum(layerwise_sizes))
    else:
        seq_layernames, layerwise_times, layerwise_sizes = None, None, None

    norm_clip = None
    if dnn == 'lstm':
        norm_clip = 0.25
    elif dnn == 'lstman4':
        norm_clip = 400

    optimizer = hvd.DistributedOptimizer(trainer.optimizer,
            named_parameters=trainer.net.named_parameters(),
            compression=compressors[compressor],
            is_sparse=is_sparse,
            density=density,
            seq_layernames=seq_layernames,
            layerwise_times=layerwise_times,
            norm_clip=norm_clip,
            threshold=threshold,
            writer=writer,
            gradient_path=gradient_path,
            tb=tb,
            iratio=iratio,
            stages=stages,
            partitions=partitions,
            ec_gradw=ec_gradw,
            ec_memw=ec_memw)
    hvd.broadcast_parameters(trainer.net.state_dict(), root_rank=0)
    trainer.update_optimizer(optimizer)

    iters_per_epoch = trainer.get_num_of_training_samples() // (nworkers * batch_size * nsteps_update)

    start = time.time()
    times = []
    noupdate_times = []
    logger.info('max_epochs: %d', max_epochs)
    display = settings.DISPLAY if iters_per_epoch > settings.DISPLAY else iters_per_epoch - 1
    for epoch in range(max_epochs):
        hidden = None
        if dnn == 'lstm':
            hidden = trainer.net.init_hidden()
        for i in range(iters_per_epoch):
            s = time.time()
            optimizer.zero_grad()
            for j in range(nsteps_update):
                if j < nsteps_update - 1 and nsteps_update > 1:
                    # Intermediate micro-step: accumulate gradients locally, no communication.
                    optimizer.local = True
                else:
                    # Last micro-step: perform the (compressed) all-reduce.
                    optimizer.local = False
                if dnn == 'lstm':
                    _, hidden = trainer.train(1, hidden=hidden)
                else:
                    trainer.train(1)
            if dnn == 'lstm':
                optimizer.synchronize()
                torch.nn.utils.clip_grad_norm_(trainer.net.parameters(), 0.25)
            elif dnn == 'lstman4':
                optimizer.synchronize()
                torch.nn.utils.clip_grad_norm_(trainer.net.parameters(), 400)
            noupdate_times.append(time.time() - s)
            trainer.update_model()
            torch.cuda.synchronize()
            times.append(time.time() - s)
            if i % display == 0 and i > 0:
                time_per_iter = np.mean(times)
                update_per_iter = time_per_iter - np.mean(noupdate_times)
                throughput = batch_size * nsteps_update / time_per_iter
                trainer.log_info(time_per_iter, throughput, update_per_iter)
                logger.warning('Time per iteration: %f, communication: %f, Speed: %f images/s',
                        time_per_iter, update_per_iter, throughput)
                times = []
                noupdate_times = []
        optimizer.increase_one_epoch()
        if totaltime > 0 and time.time() - start > totaltime:
            trainer.test(trainer.get_train_epoch())
            break
    if dataset != 'cifar10':
        trainer.test(trainer.get_train_epoch())
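# The nsteps_update loop above accumulates gradients over several micro-batches and applies a
# single model update, with optimizer.local suppressing communication until the last micro-step.
# A self-contained plain-PyTorch sketch of the same accumulation pattern (toy model and random
# data, purely illustrative):
import torch
import torch.nn as nn

def _accumulate_then_step(nsteps_update=4, micro_batch=8):
    model = nn.Linear(16, 10)
    opt = torch.optim.SGD(model.parameters(), lr=0.1)
    loss_fn = nn.CrossEntropyLoss()
    opt.zero_grad()
    for j in range(nsteps_update):
        x = torch.randn(micro_batch, 16)
        y = torch.randint(0, 10, (micro_batch,))
        # Scale each micro-loss so the accumulated gradient matches one large-batch gradient.
        loss = loss_fn(model(x), y) / nsteps_update
        loss.backward()
    opt.step()  # one parameter update covering all nsteps_update micro-batches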
class BertPretrainingCriterion(torch.nn.Module):
    def __init__(self, vocab_size):
        super(BertPretrainingCriterion, self).__init__()
        self.loss_fn = torch.nn.CrossEntropyLoss(ignore_index=-1)
        self.vocab_size = vocab_size

    def forward(self, prediction_scores, seq_relationship_score, masked_lm_labels, next_sentence_labels):
        masked_lm_loss = self.loss_fn(prediction_scores.view(-1, self.vocab_size), masked_lm_labels.view(-1))
        next_sentence_loss = self.loss_fn(seq_relationship_score.view(-1, 2), next_sentence_labels.view(-1))
        total_loss = masked_lm_loss + next_sentence_loss
        return total_loss

criterion = BertPretrainingCriterion(vocab_size)

if args.mgwfbp:
    seq_layernames, layerwise_times, _ = benchmark(model, batch, criterion, task='bert')
    layerwise_times = comm.bcast(layerwise_times, root=0)
else:
    seq_layernames, layerwise_times = None, None

optimizer = AdamW(model.parameters(),
                  lr=2e-5,   # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps=1e-8   # args.adam_epsilon - default is 1e-8.
                  )
#optimizer = optim.SGD(model.parameters(), lr=2e-5)

#compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none
# Horovod: wrap optimizer with DistributedOptimizer.
#optimizer = hvd.DistributedOptimizer(optimizer,
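# A minimal, self-contained smoke test for BertPretrainingCriterion (not part of the original
# script); the batch size, sequence length, and vocabulary size below are illustrative.
def _smoke_test_bert_criterion(batch=2, seq_len=8, vocab=30522):
    crit = BertPretrainingCriterion(vocab)
    prediction_scores = torch.randn(batch, seq_len, vocab)     # masked-LM logits per token
    seq_relationship_score = torch.randn(batch, 2)             # next-sentence logits per pair
    masked_lm_labels = torch.randint(0, vocab, (batch, seq_len))
    masked_lm_labels[:, seq_len // 2:] = -1                    # ignore_index=-1 skips unmasked positions
    next_sentence_labels = torch.randint(0, 2, (batch,))
    return crit(prediction_scores, seq_relationship_score, masked_lm_labels, next_sentence_labels)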