def ssgd_with_horovod(dnn, dataset, data_dir, nworkers, lr, batch_size, nsteps_update, max_epochs, nwpernode, pretrain, num_steps=1):
    """Dense synchronous SGD (S-SGD) with Horovod allreduce."""
    rank = hvd.rank()
    torch.cuda.set_device(rank % nwpernode)
    if rank != 0:
        pretrain = None
    trainer = DLTrainer(rank, nworkers, dist=False, batch_size=batch_size,
                        is_weak_scaling=True, ngpus=1, data_dir=data_dir,
                        dataset=dataset, dnn=dnn, lr=lr, nworkers=nworkers,
                        prefix='allreduce', pretrain=pretrain,
                        num_steps=num_steps, tb_writer=writer)

    # Make sure all ranks resume from the same epoch/iteration as rank 0.
    init_epoch = torch.ones(1) * trainer.get_train_epoch()
    init_iter = torch.ones(1) * trainer.get_train_iter()
    trainer.set_train_epoch(int(hvd.broadcast(init_epoch, root_rank=0)[0]))
    trainer.set_train_iter(int(hvd.broadcast(init_iter, root_rank=0)[0]))

    optimizer = hvd.DistributedOptimizer(trainer.optimizer,
                                         named_parameters=trainer.net.named_parameters())
    hvd.broadcast_parameters(trainer.net.state_dict(), root_rank=0)
    trainer.update_optimizer(optimizer)

    iters_per_epoch = trainer.get_num_of_training_samples() // (nworkers * batch_size * nsteps_update)
    times = []
    display = 20 if iters_per_epoch > 20 else iters_per_epoch - 1
    for epoch in range(max_epochs):
        hidden = None
        if dnn == 'lstm':
            hidden = trainer.net.init_hidden()
        for i in range(iters_per_epoch):
            s = time.time()
            optimizer.zero_grad()
            for j in range(nsteps_update):
                # Accumulate gradients locally; only the last micro-step synchronizes.
                if j < nsteps_update - 1 and nsteps_update > 1:
                    optimizer.local = True
                else:
                    optimizer.local = False
                if dnn == 'lstm':
                    _, hidden = trainer.train(1, hidden=hidden)
                else:
                    trainer.train(1)
            if dnn == 'lstm':
                optimizer.synchronize()
                torch.nn.utils.clip_grad_norm_(trainer.net.parameters(), 0.25)
            elif dnn == 'lstman4':
                optimizer.synchronize()
                torch.nn.utils.clip_grad_norm_(trainer.net.parameters(), 400)
            trainer.update_model()
            times.append(time.time() - s)
            if i % display == 0 and i > 0:
                time_per_iter = np.mean(times)
                logger.info('Time per iteration including communication: %f. Speed: %f images/s',
                            time_per_iter, batch_size * nsteps_update / time_per_iter)
                times = []
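# ---------------------------------------------------------------------------
# Hedged usage sketch (illustration only, not part of the original module).
# It shows how ssgd_with_horovod() above is typically driven: Horovod is
# initialised once per process (e.g. launched via `horovodrun -np 4 ...`) and
# every rank calls the routine with identical arguments.  All argument values
# below are hypothetical placeholders.  The `train_fn` default binds to the
# dense variant defined just above at definition time, so a later re-use of
# the name in this file does not affect the sketch.
def _example_launch_dense_ssgd(train_fn=ssgd_with_horovod):
    hvd.init()
    train_fn(dnn='resnet20', dataset='cifar10', data_dir='./data',
             nworkers=hvd.size(), lr=0.1, batch_size=32,
             nsteps_update=1, max_epochs=90, nwpernode=4,
             pretrain=None, num_steps=1)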
def ssgd(dnn, dataset, data_dir, nworkers, lr, batch_size, nsteps_update, max_epochs, nwpernode, pretrain, num_steps, compressor, density, threshold, gradient_path=None):
    """S-SGD with Horovod and an optional sparse gradient compressor."""
    rank = hvd.rank()
    torch.cuda.set_device(rank % nwpernode)
    if rank != 0:
        pretrain = None
    trainer = DLTrainer(rank, nworkers, dist=False, batch_size=batch_size,
                        is_weak_scaling=True, ngpus=1, data_dir=data_dir,
                        dataset=dataset, dnn=dnn, lr=lr, nworkers=nworkers,
                        prefix='allreduce', pretrain=pretrain,
                        num_steps=num_steps, tb_writer=writer)

    init_epoch = torch.ones(1) * trainer.get_train_epoch()
    init_iter = torch.ones(1) * trainer.get_train_iter()
    trainer.set_train_epoch(int(hvd.broadcast(init_epoch, root_rank=0)[0]))
    trainer.set_train_iter(int(hvd.broadcast(init_iter, root_rank=0)[0]))

    is_sparse = density < 1
    #if not is_sparse:
    #    compressor = None

    if settings.ADAPTIVE_MERGE or settings.ADAPTIVE_SPARSE:
        seq_layernames, layerwise_times, layerwise_sizes = benchmark(trainer)
        layerwise_times = comm.bcast(layerwise_times, root=0)
        if rank == 0:
            logger.info('layerwise backward times: %s', list(layerwise_times))
            logger.info('layerwise backward sizes: %s', list(layerwise_sizes))
            logger.info('Benchmarked backward time: %f', np.sum(layerwise_times))
            logger.info('Model size: %d', np.sum(layerwise_sizes))
    else:
        seq_layernames, layerwise_times, layerwise_sizes = None, None, None

    norm_clip = None
    if dnn == 'lstm':
        norm_clip = 0.25
    elif dnn == 'lstman4':
        norm_clip = 400

    optimizer = hvd.DistributedOptimizer(trainer.optimizer,
                                         named_parameters=trainer.net.named_parameters(),
                                         compression=compressors[compressor],
                                         is_sparse=is_sparse,
                                         density=density,
                                         seq_layernames=seq_layernames,
                                         layerwise_times=layerwise_times,
                                         norm_clip=norm_clip,
                                         threshold=threshold,
                                         writer=writer,
                                         gradient_path=gradient_path)
    hvd.broadcast_parameters(trainer.net.state_dict(), root_rank=0)
    trainer.update_optimizer(optimizer)

    iters_per_epoch = trainer.get_num_of_training_samples() // (nworkers * batch_size * nsteps_update)
    times = []
    logger.info('max_epochs: %d', max_epochs)
    display = 40 if iters_per_epoch > 40 else iters_per_epoch - 1
    for epoch in range(max_epochs):
        hidden = None
        if dnn == 'lstm':
            hidden = trainer.net.init_hidden()
        for i in range(iters_per_epoch):
            s = time.time()
            optimizer.zero_grad()
            for j in range(nsteps_update):
                if j < nsteps_update - 1 and nsteps_update > 1:
                    optimizer.local = True
                else:
                    optimizer.local = False
                if dnn == 'lstm':
                    _, hidden = trainer.train(1, hidden=hidden)
                else:
                    trainer.train(1)
            if dnn == 'lstm':
                optimizer.synchronize()
                torch.nn.utils.clip_grad_norm_(trainer.net.parameters(), 0.25)
            elif dnn == 'lstman4':
                optimizer.synchronize()
                torch.nn.utils.clip_grad_norm_(trainer.net.parameters(), 400)
            trainer.update_model()
            times.append(time.time() - s)
            if i % display == 0 and i > 0:
                time_per_iter = np.mean(times)
                logger.warning('Time per iteration including communication: %f, Speed: %f images/s',
                               time_per_iter, batch_size * nsteps_update / time_per_iter)
                times = []
        optimizer.increase_one_epoch()
def ssgd(dnn, dataset, data_dir, nworkers, lr, batch_size, nsteps_update, max_epochs, nwpernode, pretrain, num_steps, compressor, density, threshold, gradient_path=None, tb=None, iratio=0.1, stages=1, partitions=0, ec_gradw=1.0, ec_memw=0.0, optimizer='nesterov', totaltime=0):
    """S-SGD with Horovod and a sparse gradient compressor; supports CPU or GPU
    training and an optional wall-clock time budget (totaltime, in seconds)."""
    global SPEED
    if not settings.USE_CPU:
        if nworkers > 1:
            rank = hvd.rank()
            torch.cuda.set_device(hvd.local_rank())  # previously: rank % nwpernode
        else:
            rank = 0
            torch.cuda.set_device(rank)
    else:
        # Assumed fallback so that `rank` is always defined on the CPU path.
        rank = hvd.rank() if nworkers > 1 else 0
    if rank != 0:
        pretrain = None

    #### CHECK whether to use GPU or CPU
    if settings.USE_CPU:
        trainer = DLTrainer(rank, nworkers, dist=False, batch_size=batch_size,
                            is_weak_scaling=True, ngpus=0, data_dir=data_dir,
                            dataset=dataset, dnn=dnn, lr=lr, nworkers=nworkers,
                            prefix='allreduce', pretrain=pretrain,
                            num_steps=num_steps, tb_writer=writer, tb=tb,
                            optimizer_str=optimizer)
    else:
        trainer = DLTrainer(rank, nworkers, dist=False, batch_size=batch_size,
                            is_weak_scaling=True, ngpus=1, data_dir=data_dir,
                            dataset=dataset, dnn=dnn, lr=lr, nworkers=nworkers,
                            prefix='allreduce', pretrain=pretrain,
                            num_steps=num_steps, tb_writer=writer, tb=tb,
                            optimizer_str=optimizer)

    init_epoch = torch.ones(1) * trainer.get_train_epoch()
    init_iter = torch.ones(1) * trainer.get_train_iter()
    trainer.set_train_epoch(int(hvd.broadcast(init_epoch, root_rank=0)[0]))
    trainer.set_train_iter(int(hvd.broadcast(init_iter, root_rank=0)[0]))

    is_sparse = True  # density < 1
    if not is_sparse:
        compressor = None

    if settings.ADAPTIVE_MERGE or settings.ADAPTIVE_SPARSE:
        seq_layernames, layerwise_times, layerwise_sizes = benchmark(trainer)
        layerwise_times = comm.bcast(layerwise_times, root=0)
        if rank == 0:
            logger.info('layerwise backward times: %s', list(layerwise_times))
            logger.info('layerwise backward sizes: %s', list(layerwise_sizes))
            logger.info('Benchmarked backward time: %f', np.sum(layerwise_times))
            logger.info('Model size: %d', np.sum(layerwise_sizes))
    else:
        seq_layernames, layerwise_times, layerwise_sizes = None, None, None

    norm_clip = None
    if dnn == 'lstm':
        norm_clip = 0.25
    elif dnn == 'lstman4':
        norm_clip = 400

    optimizer = hvd.DistributedOptimizer(trainer.optimizer,
                                         named_parameters=trainer.net.named_parameters(),
                                         compression=compressors[compressor],
                                         is_sparse=is_sparse,
                                         density=density,
                                         seq_layernames=seq_layernames,
                                         layerwise_times=layerwise_times,
                                         norm_clip=norm_clip,
                                         threshold=threshold,
                                         writer=writer,
                                         gradient_path=gradient_path,
                                         tb=tb,
                                         iratio=iratio,
                                         stages=stages,
                                         partitions=partitions,
                                         ec_gradw=ec_gradw,
                                         ec_memw=ec_memw)
    hvd.SPEED
    hvd.broadcast_parameters(trainer.net.state_dict(), root_rank=0)
    trainer.update_optimizer(optimizer)

    iters_per_epoch = trainer.get_num_of_training_samples() // (nworkers * batch_size * nsteps_update)
    start = time.time()
    times = []
    noupdate_times = []
    logger.info('max_epochs: %d', max_epochs)
    display = settings.DISPLAY if iters_per_epoch > settings.DISPLAY else iters_per_epoch - 1
    for epoch in range(max_epochs):
        hidden = None
        if dnn == 'lstm':
            hidden = trainer.net.init_hidden()
        for i in range(iters_per_epoch):
            s = time.time()
            optimizer.zero_grad()
            for j in range(nsteps_update):
                if j < nsteps_update - 1 and nsteps_update > 1:
                    optimizer.local = True
                else:
                    optimizer.local = False
                if dnn == 'lstm':
                    _, hidden = trainer.train(1, hidden=hidden)
                else:
                    trainer.train(1)
            if dnn == 'lstm':
                optimizer.synchronize()
                torch.nn.utils.clip_grad_norm_(trainer.net.parameters(), 0.25)
            elif dnn == 'lstman4':
                optimizer.synchronize()
                torch.nn.utils.clip_grad_norm_(trainer.net.parameters(), 400)
            noupdate_times.append(time.time() - s)   # forward + backward only
            trainer.update_model()
            torch.cuda.synchronize()
            times.append(time.time() - s)            # full iteration incl. update
            if i % display == 0 and i > 0:
                time_per_iter = np.mean(times)
                update_per_iter = time_per_iter - np.mean(noupdate_times)
                throughput = batch_size * nsteps_update / time_per_iter
                trainer.log_info(time_per_iter, throughput, update_per_iter)
                logger.warning('Time per iteration: %f, communication: %f, Speed: %f images/s',
                               time_per_iter, update_per_iter, throughput)
                times = []
                noupdate_times = []
        optimizer.increase_one_epoch()
        if totaltime > 0 and time.time() - start > totaltime:
            trainer.test(trainer.get_train_epoch())
            break
    if dataset != 'cifar10':
        trainer.test(trainer.get_train_epoch())
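# ---------------------------------------------------------------------------
# Hedged illustration (assumption, not this repository's compressor code).
# The `density` argument of the two ssgd() variants above controls what
# fraction of gradient elements survives sparsification before allreduce.
# A minimal top-k sketch of that idea on a single tensor: keep the `density`
# fraction with the largest magnitude and zero out the rest.  The real
# compressors[compressor] implementation may differ (residual accumulation,
# per-layer thresholds, index encoding, etc.).
def _topk_sparsify_sketch(grad, density=0.01):
    flat = grad.flatten()
    k = max(1, int(flat.numel() * density))
    _, idx = torch.topk(flat.abs(), k)   # indices of the k largest magnitudes
    sparse = torch.zeros_like(flat)
    sparse[idx] = flat[idx]              # keep only the selected elements
    return sparse.view_as(grad)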
def robust_ssgd(dnn, dataset, data_dir, nworkers, lr, batch_size, nsteps_update, max_epochs, compression=False, compressor='topk', nwpernode=1, sigma_scale=2.5, pretrain=None, density=0.01, prefix=None):
    """S-SGD with the fault-tolerant distributed optimizer (dopt) and optional
    gradient compression; can also dump selected-index and gradient-norm profiles."""
    global relative_path
    torch.cuda.set_device(dopt.rank() % nwpernode)
    rank = dopt.rank()
    if rank != 0:
        pretrain = None
    trainer = DLTrainer(rank, nworkers, dist=False, batch_size=batch_size,
                        is_weak_scaling=True, ngpus=1, data_dir=data_dir,
                        dataset=dataset, dnn=dnn, lr=lr, nworkers=nworkers,
                        prefix=prefix + '-ds%s' % str(density), pretrain=pretrain,
                        tb_writer=writer)

    init_epoch = trainer.get_train_epoch()
    init_iter = trainer.get_train_iter()
    trainer.set_train_epoch(comm.bcast(init_epoch))
    trainer.set_train_iter(comm.bcast(init_iter))

    def _error_handler(new_num_workers, new_rank):
        # Called when the worker set changes; update the trainer's view of the cluster.
        logger.info('Error info caught by trainer')
        trainer.update_nworker(new_num_workers, new_rank)

    compressor = compressor if compression else 'none'
    compressor = compressors[compressor]
    is_sparse = compression

    logger.info('Broadcast parameters....')
    hvd.broadcast_parameters(trainer.net.state_dict(), root_rank=0)
    logger.info('Broadcast parameters finished....')

    norm_clip = None
    optimizer = dopt.DistributedOptimizer(trainer.optimizer, trainer.net.named_parameters(),
                                          compression=compressor, is_sparse=is_sparse,
                                          err_handler=_error_handler, layerwise_times=None,
                                          sigma_scale=sigma_scale, density=density,
                                          norm_clip=norm_clip, writer=writer)
    trainer.update_optimizer(optimizer)

    iters_per_epoch = trainer.get_num_of_training_samples() // (nworkers * batch_size * nsteps_update)
    times = []
    NUM_OF_DISPLAY = 40
    display = NUM_OF_DISPLAY if iters_per_epoch > NUM_OF_DISPLAY else iters_per_epoch - 1
    logger.info('Start training ....')
    for epoch in range(max_epochs):
        hidden = None
        if dnn == 'lstm':
            hidden = trainer.net.init_hidden()
        for i in range(iters_per_epoch):
            s = time.time()
            optimizer.zero_grad()
            for j in range(nsteps_update):
                if j < nsteps_update - 1 and nsteps_update > 1:
                    optimizer.local = True
                else:
                    optimizer.local = False
                if dnn == 'lstm':
                    _, hidden = trainer.train(1, hidden=hidden)
                else:
                    trainer.train(1)
            if dnn == 'lstm':
                optimizer.synchronize()
                torch.nn.utils.clip_grad_norm_(trainer.net.parameters(), 0.25)
            elif dnn == 'lstman4':
                optimizer.synchronize()
                torch.nn.utils.clip_grad_norm_(trainer.net.parameters(), 400)
            trainer.update_model()
            times.append(time.time() - s)
            if i % display == 0 and i > 0:
                time_per_iter = np.mean(times)
                logger.info('Time per iteration including communication: %f. Speed: %f images/s, current density: %f',
                            time_per_iter, batch_size * nsteps_update / time_per_iter,
                            optimizer.get_current_density())
                times = []
        optimizer.add_train_epoch()

        if settings.PROFILING_INDEX and rank == 0 and epoch % 10 == 0:
            fn = os.path.join(relative_path, 'index-rank%d-epoch%d.npy' % (rank, epoch))
            key = list(optimizer._allreducer._update_index_counter.keys())[0]
            np.save(fn, optimizer._allreducer._update_index_counter[key].int().cpu().numpy())

        if settings.PROFILING_NORM:
            # For comparison purpose ===>
            fn = os.path.join(relative_path, 'gtopknorm-rank%d-epoch%d.npy' % (rank, epoch))
            fn2 = os.path.join(relative_path, 'randknorm-rank%d-epoch%d.npy' % (rank, epoch))
            fn3 = os.path.join(relative_path, 'upbound-rank%d-epoch%d.npy' % (rank, epoch))
            fn5 = os.path.join(relative_path, 'densestd-rank%d-epoch%d.npy' % (rank, epoch))
            arr = []
            arr2 = []
            arr3 = []
            arr4 = []
            arr5 = []
            for gtopk_norm, randk_norm, upbound, xnorm, dense_std in optimizer._allreducer._profiling_norms:
                arr.append(gtopk_norm)
                arr2.append(randk_norm)
                arr3.append(upbound)
                arr4.append(xnorm)
                arr5.append(dense_std)
            arr = np.array(arr)
            arr2 = np.array(arr2)
            arr3 = np.array(arr3)
            arr4 = np.array(arr4)
            arr5 = np.array(arr5)
            logger.info('[rank:%d][%d] gtopk norm mean: %f, std: %f', rank, epoch, np.mean(arr), np.std(arr))
            logger.info('[rank:%d][%d] randk norm mean: %f, std: %f', rank, epoch, np.mean(arr2), np.std(arr2))
            logger.info('[rank:%d][%d] upbound norm mean: %f, std: %f', rank, epoch, np.mean(arr3), np.std(arr3))
            logger.info('[rank:%d][%d] x norm mean: %f, std: %f', rank, epoch, np.mean(arr4), np.std(arr4))
            logger.info('[rank:%d][%d] dense std mean: %f, std: %f', rank, epoch, np.mean(arr5), np.std(arr5))
            np.save(fn, arr)
            np.save(fn2, arr2)
            np.save(fn3, arr3)
            np.save(fn5, arr5)
            # For comparison purpose <=== End
            optimizer._allreducer._profiling_norms = []
    optimizer.stop()
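# ---------------------------------------------------------------------------
# Hedged illustration (assumption about the role of the `optimizer.local`
# flag used in every training loop above).  Gradients are accumulated over
# `nsteps_update` micro-batches and only the last micro-step triggers the
# synchronised update.  The sketch below reproduces that pattern with a plain
# PyTorch optimizer, where "local" simply means "no optimizer.step() yet";
# `micro_batches` is assumed to be a list of (input, target) pairs.
def _accumulate_then_update_sketch(model, optimizer, loss_fn, micro_batches):
    optimizer.zero_grad()
    for j, (x, y) in enumerate(micro_batches):
        loss = loss_fn(model(x), y)
        loss.backward()                       # gradients accumulate in .grad
        if j == len(micro_batches) - 1:       # last micro-step: apply the update
            optimizer.step()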
def ssgd_with_horovod(job, mode, start_dt):
    """Run one Horovod job following per-iteration forward/backward/communication
    schedules; in 'simulate' mode, computation is replaced by sleeps and
    communication by an allreduce on a synthetic model tensor."""
    rank = hvd.rank()
    gpu_id = -1
    if job.cuda:
        ngpus = 1
        gpu_id = job.device_ids[rank]
        torch.cuda.set_device(gpu_id)
    else:
        ngpus = -1
    if rank != 0:
        pretrain = None
    trainer = DLTrainer(rank, dist=False, batch_size=job.batch_size,
                        is_weak_scaling=True, ngpus=ngpus, data_dir=job.data_dir,
                        dataset=job.dataset, dnn=job.dnn, lr=job.lr,
                        nworkers=job.nworkers, prefix='allreduce')
    if mode == 'simulate':
        # Synthetic gradient buffer; the sizing assumes job.model_size is given
        # in MiB with 4-byte floats.
        synt_model = torch.rand(4, int(job.model_size * (2**20) / 4 / 4))

    init_epoch = torch.ones(1) * trainer.get_train_epoch()
    init_iter = torch.ones(1) * trainer.get_train_iter()
    trainer.set_train_epoch(int(hvd.broadcast(init_epoch, root_rank=0)[0]))
    trainer.set_train_iter(int(hvd.broadcast(init_iter, root_rank=0)[0]))

    optimizer = hvd.DistributedOptimizer(trainer.optimizer,
                                         named_parameters=trainer.net.named_parameters())
    hvd.broadcast_parameters(trainer.net.state_dict(), root_rank=0)
    trainer.update_optimizer(optimizer)

    #iters_per_epoch = trainer.get_num_of_training_samples() // (nworkers * batch_size * nsteps_update)
    times = []
    #display = 20 if iters_per_epoch > 20 else iters_per_epoch-1
    display = 1
    nsteps_update = job.nsteps_update

    # to integrate in LSTM code
    hidden = None
    dnn = job.dnn
    if dnn == 'lstm':
        hidden = trainer.net.init_hidden()

    logger.info("(Server %s, Job %d, Rank %d, GPU %d) Wait until start time: %s."
                % (settings.hostname, job.job_id, rank, gpu_id, start_dt))
    pause.until(start_dt)
    start_slot = time.time()

    for i in range(job.iters):
        s = time.time()
        optimizer.zero_grad()
        optimizer.local = False

        # forward
        time.sleep(job.get_forward_schedule(rank, i) * 0.001)
        fw_start_slot = int((time.time() - start_slot) * 1000)
        if mode == 'simulate':
            time.sleep(job.fw_time * 0.001)
        else:
            #optimizer.local = True
            if dnn == 'lstm':
                #print(hidden)
                #print(" =========== j : %d ===========", j)
                _, hidden = trainer.train_forward(1, hidden=hidden)
            else:
                trainer.train_forward(1)
            #trainer.train(1)
        fw_end_slot = int((time.time() - start_slot) * 1000)
        logger.info("(Server %s, Job %d, Rank %d, GPU %d) Forward task %d started at slot=%d, ended at slot=%d, duration=%d."
                    % (settings.hostname, job.job_id, rank, gpu_id, i, fw_start_slot, fw_end_slot, fw_end_slot - fw_start_slot))

        # backward
        time.sleep(job.get_backward_schedule(rank, i) * 0.001)
        bw_start_slot = int((time.time() - start_slot) * 1000)
        if mode == 'simulate':
            time.sleep(job.bw_time * 0.001)
        else:
            trainer.train_backward(1)
            #trainer.train(1)
        bw_end_slot = int((time.time() - start_slot) * 1000)
        logger.info("(Server %s, Job %d, Rank %d, GPU %d) Backward task %d started at slot=%d, ended at slot=%d, duration=%d."
                    % (settings.hostname, job.job_id, rank, gpu_id, i, bw_start_slot, bw_end_slot, bw_end_slot - bw_start_slot))

        # communication
        time.sleep(job.get_communication_schedule(rank, i) * 0.001)
        comm_start_slot = int((time.time() - start_slot) * 1000)
        if mode == 'simulate':
            hvd.allreduce(synt_model)
        else:
            trainer.update_model()
        comm_end_slot = int((time.time() - start_slot) * 1000)
        logger.info("(Server %s, Job %d, Rank %d, GPU %d) Comm task %d started at slot=%d, ended at slot=%d, duration=%d."
                    % (settings.hostname, job.job_id, rank, gpu_id, i, comm_start_slot, comm_end_slot, comm_end_slot - comm_start_slot))

        times.append(time.time() - s)
        #if i % display == 0 and i > 0:
        #    time_per_iter = np.mean(times)
        #    logger.info('Time per iteration including communication: %f. Speed: %f images/s',
        #                time_per_iter, job.batch_size * nsteps_update / time_per_iter)
        #    times = []

    end_slot = time.time()
    logger.info("(Server %s, Job %d, Rank %d, GPU %d) Job ended. Total time is %s ms."
                % (settings.hostname, job.job_id, rank, gpu_id, int((end_slot - start_slot) * 1000)))