def ssgd_with_horovod(dnn, dataset, data_dir, nworkers, lr, batch_size, nsteps_update, max_epochs, nwpernode, pretrain, num_steps=1):
    rank = hvd.rank()
    torch.cuda.set_device(rank % nwpernode)
    if rank != 0:
        pretrain = None
    trainer = DLTrainer(rank, nworkers, dist=False, batch_size=batch_size,
            is_weak_scaling=True, ngpus=1, data_dir=data_dir, dataset=dataset,
            dnn=dnn, lr=lr, nworkers=nworkers, prefix='allreduce',
            pretrain=pretrain, num_steps=num_steps, tb_writer=writer)

    # Synchronize the epoch/iteration counters from rank 0 so that resumed
    # (pretrained) runs start from a consistent position on all workers.
    init_epoch = torch.ones(1) * trainer.get_train_epoch()
    init_iter = torch.ones(1) * trainer.get_train_iter()
    trainer.set_train_epoch(int(hvd.broadcast(init_epoch, root_rank=0)[0]))
    trainer.set_train_iter(int(hvd.broadcast(init_iter, root_rank=0)[0]))

    optimizer = hvd.DistributedOptimizer(trainer.optimizer, named_parameters=trainer.net.named_parameters())
    hvd.broadcast_parameters(trainer.net.state_dict(), root_rank=0)
    trainer.update_optimizer(optimizer)

    iters_per_epoch = trainer.get_num_of_training_samples() // (nworkers * batch_size * nsteps_update)

    times = []
    display = 20 if iters_per_epoch > 20 else iters_per_epoch - 1
    for epoch in range(max_epochs):
        hidden = None
        if dnn == 'lstm':
            hidden = trainer.net.init_hidden()
        for i in range(iters_per_epoch):
            s = time.time()
            optimizer.zero_grad()
            # Accumulate gradients locally for nsteps_update mini-batches; only
            # the last step communicates (optimizer.local is False there).
            for j in range(nsteps_update):
                if j < nsteps_update - 1 and nsteps_update > 1:
                    optimizer.local = True
                else:
                    optimizer.local = False
                if dnn == 'lstm':
                    _, hidden = trainer.train(1, hidden=hidden)
                else:
                    trainer.train(1)
            # The recurrent models clip the synchronized (allreduced) gradients,
            # hence the explicit synchronize() before clipping.
            if dnn == 'lstm':
                optimizer.synchronize()
                torch.nn.utils.clip_grad_norm_(trainer.net.parameters(), 0.25)
            elif dnn == 'lstman4':
                optimizer.synchronize()
                torch.nn.utils.clip_grad_norm_(trainer.net.parameters(), 400)
            trainer.update_model()
            times.append(time.time() - s)
            if i % display == 0 and i > 0:
                time_per_iter = np.mean(times)
                logger.info('Time per iteration including communication: %f. Speed: %f images/s',
                        time_per_iter, batch_size * nsteps_update / time_per_iter)
                times = []
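# A minimal (hypothetical) launch sketch for ssgd_with_horovod. It assumes the
# module-level `writer` (a TensorBoard writer or None) exists and that hvd.init()
# has been called; the model/dataset names and hyperparameter values below are
# illustrative only, following the DLTrainer conventions used above.
#
#   $ horovodrun -np 4 python this_module.py
#
def _example_launch_ssgd_with_horovod():
    hvd.init()
    ssgd_with_horovod(dnn='resnet20', dataset='cifar10', data_dir='./data',
                      nworkers=hvd.size(), lr=0.1, batch_size=32,
                      nsteps_update=1, max_epochs=90,
                      nwpernode=4, pretrain=None)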
def ssgd(dnn, dataset, data_dir, nworkers, lr, batch_size, nsteps_update, max_epochs, nwpernode, pretrain, num_steps, compressor, density, threshold, gradient_path=None):
    rank = hvd.rank()
    torch.cuda.set_device(rank % nwpernode)
    if rank != 0:
        pretrain = None
    trainer = DLTrainer(rank, nworkers, dist=False, batch_size=batch_size,
            is_weak_scaling=True, ngpus=1, data_dir=data_dir, dataset=dataset,
            dnn=dnn, lr=lr, nworkers=nworkers, prefix='allreduce',
            pretrain=pretrain, num_steps=num_steps, tb_writer=writer)
    init_epoch = torch.ones(1) * trainer.get_train_epoch()
    init_iter = torch.ones(1) * trainer.get_train_iter()
    trainer.set_train_epoch(int(hvd.broadcast(init_epoch, root_rank=0)[0]))
    trainer.set_train_iter(int(hvd.broadcast(init_iter, root_rank=0)[0]))

    is_sparse = density < 1
    #if not is_sparse:
    #    compressor = None

    # Optionally benchmark per-layer backward times so the optimizer can
    # schedule (merge) gradient communication adaptively.
    if settings.ADAPTIVE_MERGE or settings.ADAPTIVE_SPARSE:
        seq_layernames, layerwise_times, layerwise_sizes = benchmark(trainer)
        layerwise_times = comm.bcast(layerwise_times, root=0)
        if rank == 0:
            logger.info('layerwise backward times: %s', list(layerwise_times))
            logger.info('layerwise backward sizes: %s', list(layerwise_sizes))
            logger.info('Benchmarked backward time: %f', np.sum(layerwise_times))
            logger.info('Model size: %d', np.sum(layerwise_sizes))
    else:
        seq_layernames, layerwise_times, layerwise_sizes = None, None, None

    norm_clip = None
    if dnn == 'lstm':
        norm_clip = 0.25
    elif dnn == 'lstman4':
        norm_clip = 400

    optimizer = hvd.DistributedOptimizer(
            trainer.optimizer, named_parameters=trainer.net.named_parameters(),
            compression=compressors[compressor], is_sparse=is_sparse,
            density=density, seq_layernames=seq_layernames,
            layerwise_times=layerwise_times, norm_clip=norm_clip,
            threshold=threshold, writer=writer, gradient_path=gradient_path)
    hvd.broadcast_parameters(trainer.net.state_dict(), root_rank=0)
    trainer.update_optimizer(optimizer)

    iters_per_epoch = trainer.get_num_of_training_samples() // (nworkers * batch_size * nsteps_update)

    times = []
    logger.info('max_epochs: %d', max_epochs)
    display = 40 if iters_per_epoch > 40 else iters_per_epoch - 1
    for epoch in range(max_epochs):
        hidden = None
        if dnn == 'lstm':
            hidden = trainer.net.init_hidden()
        for i in range(iters_per_epoch):
            s = time.time()
            optimizer.zero_grad()
            for j in range(nsteps_update):
                if j < nsteps_update - 1 and nsteps_update > 1:
                    optimizer.local = True
                else:
                    optimizer.local = False
                if dnn == 'lstm':
                    _, hidden = trainer.train(1, hidden=hidden)
                else:
                    trainer.train(1)
            if dnn == 'lstm':
                optimizer.synchronize()
                torch.nn.utils.clip_grad_norm_(trainer.net.parameters(), 0.25)
            elif dnn == 'lstman4':
                optimizer.synchronize()
                torch.nn.utils.clip_grad_norm_(trainer.net.parameters(), 400)
            trainer.update_model()
            times.append(time.time() - s)
            if i % display == 0 and i > 0:
                time_per_iter = np.mean(times)
                logger.warning('Time per iteration including communication: %f, Speed: %f images/s',
                        time_per_iter, batch_size * nsteps_update / time_per_iter)
                times = []
        optimizer.increase_one_epoch()
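# The dense-to-sparse step behind a 'topk' entry in `compressors` is implemented
# elsewhere in this repo; the sketch below only illustrates the idea (keep the
# `density` fraction of largest-magnitude gradient entries) and is not the
# actual compressor class used by the DistributedOptimizer above.
def _topk_sketch(tensor, density=0.01):
    """Illustrative top-k sparsification: return (values, indices) of the
    k = density * numel largest-magnitude entries of a flattened tensor."""
    flat = tensor.view(-1)
    k = max(1, int(flat.numel() * density))
    _, indices = torch.topk(flat.abs(), k)
    return flat[indices], indices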
def robust_ssgd(dnn, dataset, data_dir, nworkers, lr, batch_size, nsteps_update, max_epochs, compression=False, compressor='topk', nwpernode=1, sigma_scale=2.5, pretrain=None, density=0.01, prefix=None):
    global relative_path
    torch.cuda.set_device(dopt.rank() % nwpernode)
    rank = dopt.rank()
    if rank != 0:
        pretrain = None
    trainer = DLTrainer(rank, nworkers, dist=False, batch_size=batch_size,
            is_weak_scaling=True, ngpus=1, data_dir=data_dir, dataset=dataset,
            dnn=dnn, lr=lr, nworkers=nworkers,
            prefix=prefix + '-ds%s' % str(density),
            pretrain=pretrain, tb_writer=writer)
    init_epoch = trainer.get_train_epoch()
    init_iter = trainer.get_train_iter()
    trainer.set_train_epoch(comm.bcast(init_epoch))
    trainer.set_train_iter(comm.bcast(init_iter))

    def _error_handler(new_num_workers, new_rank):
        # Callback invoked by the distributed optimizer when the worker set
        # changes; the trainer re-partitions its workload accordingly.
        logger.info('Error info caught by trainer')
        trainer.update_nworker(new_num_workers, new_rank)

    compressor = compressor if compression else 'none'
    compressor = compressors[compressor]
    is_sparse = compression

    logger.info('Broadcast parameters....')
    hvd.broadcast_parameters(trainer.net.state_dict(), root_rank=0)
    logger.info('Broadcast parameters finished....')

    norm_clip = None
    optimizer = dopt.DistributedOptimizer(trainer.optimizer, trainer.net.named_parameters(),
            compression=compressor, is_sparse=is_sparse, err_handler=_error_handler,
            layerwise_times=None, sigma_scale=sigma_scale, density=density,
            norm_clip=norm_clip, writer=writer)
    trainer.update_optimizer(optimizer)

    iters_per_epoch = trainer.get_num_of_training_samples() // (nworkers * batch_size * nsteps_update)

    times = []
    NUM_OF_DISPLAY = 40
    display = NUM_OF_DISPLAY if iters_per_epoch > NUM_OF_DISPLAY else iters_per_epoch - 1
    logger.info('Start training ....')
    for epoch in range(max_epochs):
        hidden = None
        if dnn == 'lstm':
            hidden = trainer.net.init_hidden()
        for i in range(iters_per_epoch):
            s = time.time()
            optimizer.zero_grad()
            for j in range(nsteps_update):
                if j < nsteps_update - 1 and nsteps_update > 1:
                    optimizer.local = True
                else:
                    optimizer.local = False
                if dnn == 'lstm':
                    _, hidden = trainer.train(1, hidden=hidden)
                else:
                    trainer.train(1)
            if dnn == 'lstm':
                optimizer.synchronize()
                torch.nn.utils.clip_grad_norm_(trainer.net.parameters(), 0.25)
            elif dnn == 'lstman4':
                optimizer.synchronize()
                torch.nn.utils.clip_grad_norm_(trainer.net.parameters(), 400)
            trainer.update_model()
            times.append(time.time() - s)
            if i % display == 0 and i > 0:
                time_per_iter = np.mean(times)
                logger.info('Time per iteration including communication: %f. Speed: %f images/s, current density: %f',
                        time_per_iter, batch_size * nsteps_update / time_per_iter,
                        optimizer.get_current_density())
                times = []
        optimizer.add_train_epoch()

        if settings.PROFILING_INDEX and rank == 0 and epoch % 10 == 0:
            fn = os.path.join(relative_path, 'index-rank%d-epoch%d.npy' % (rank, epoch))
            key = list(optimizer._allreducer._update_index_counter.keys())[0]
            np.save(fn, optimizer._allreducer._update_index_counter[key].int().cpu().numpy())
        if settings.PROFILING_NORM:
            # For comparison purpose ===>
            fn = os.path.join(relative_path, 'gtopknorm-rank%d-epoch%d.npy' % (rank, epoch))
            fn2 = os.path.join(relative_path, 'randknorm-rank%d-epoch%d.npy' % (rank, epoch))
            fn3 = os.path.join(relative_path, 'upbound-rank%d-epoch%d.npy' % (rank, epoch))
            fn5 = os.path.join(relative_path, 'densestd-rank%d-epoch%d.npy' % (rank, epoch))
            arr = []
            arr2 = []
            arr3 = []
            arr4 = []
            arr5 = []
            for gtopk_norm, randk_norm, upbound, xnorm, dense_std in optimizer._allreducer._profiling_norms:
                arr.append(gtopk_norm)
                arr2.append(randk_norm)
                arr3.append(upbound)
                arr4.append(xnorm)
                arr5.append(dense_std)
            arr = np.array(arr)
            arr2 = np.array(arr2)
            arr3 = np.array(arr3)
            arr4 = np.array(arr4)
            arr5 = np.array(arr5)
            logger.info('[rank:%d][%d] gtopk norm mean: %f, std: %f', rank, epoch, np.mean(arr), np.std(arr))
            logger.info('[rank:%d][%d] randk norm mean: %f, std: %f', rank, epoch, np.mean(arr2), np.std(arr2))
            logger.info('[rank:%d][%d] upbound norm mean: %f, std: %f', rank, epoch, np.mean(arr3), np.std(arr3))
            logger.info('[rank:%d][%d] x norm mean: %f, std: %f', rank, epoch, np.mean(arr4), np.std(arr4))
            logger.info('[rank:%d][%d] dense std mean: %f, std: %f', rank, epoch, np.mean(arr5), np.std(arr5))
            np.save(fn, arr)
            np.save(fn2, arr2)
            np.save(fn3, arr3)
            np.save(fn5, arr5)  # note: the x-norm array (arr4) is only logged, not saved
            # For comparison purpose <=== End
            optimizer._allreducer._profiling_norms = []
    optimizer.stop()
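# Offline analysis sketch (not part of the training path): the PROFILING_NORM
# block above writes per-epoch norm arrays as .npy files, which can be reloaded
# for comparison as shown below. The helper name is hypothetical; the file-name
# pattern mirrors the one used in robust_ssgd.
def _load_profiled_norms(path, rank, epoch):
    """Reload the gtopk/randk norm arrays saved by robust_ssgd profiling."""
    gtopk = np.load(os.path.join(path, 'gtopknorm-rank%d-epoch%d.npy' % (rank, epoch)))
    randk = np.load(os.path.join(path, 'randknorm-rank%d-epoch%d.npy' % (rank, epoch)))
    return gtopk, randk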
def ssgd(dnn, dataset, data_dir, nworkers, lr, batch_size, nsteps_update, max_epochs, nwpernode, pretrain, num_steps, compressor, density, threshold, gradient_path=None, tb=None, iratio=0.1, stages=1, partitions=0, ec_gradw=1.0, ec_memw=0.0, optimizer='nesterov', totaltime=0):
    global SPEED
    # Default rank to 0 so CPU-only runs are well defined.
    rank = 0
    if not settings.USE_CPU:
        if nworkers > 1:
            rank = hvd.rank()
            torch.cuda.set_device(hvd.local_rank())
        else:
            torch.cuda.set_device(rank)
    if rank != 0:
        pretrain = None

    # Check whether to use GPU or CPU.
    if settings.USE_CPU:
        trainer = DLTrainer(rank, nworkers, dist=False, batch_size=batch_size,
                is_weak_scaling=True, ngpus=0, data_dir=data_dir, dataset=dataset,
                dnn=dnn, lr=lr, nworkers=nworkers, prefix='allreduce',
                pretrain=pretrain, num_steps=num_steps, tb_writer=writer, tb=tb,
                optimizer_str=optimizer)
    else:
        trainer = DLTrainer(rank, nworkers, dist=False, batch_size=batch_size,
                is_weak_scaling=True, ngpus=1, data_dir=data_dir, dataset=dataset,
                dnn=dnn, lr=lr, nworkers=nworkers, prefix='allreduce',
                pretrain=pretrain, num_steps=num_steps, tb_writer=writer, tb=tb,
                optimizer_str=optimizer)

    init_epoch = torch.ones(1) * trainer.get_train_epoch()
    init_iter = torch.ones(1) * trainer.get_train_iter()
    trainer.set_train_epoch(int(hvd.broadcast(init_epoch, root_rank=0)[0]))
    trainer.set_train_iter(int(hvd.broadcast(init_iter, root_rank=0)[0]))

    is_sparse = True  # density < 1
    if not is_sparse:
        compressor = None

    if settings.ADAPTIVE_MERGE or settings.ADAPTIVE_SPARSE:
        seq_layernames, layerwise_times, layerwise_sizes = benchmark(trainer)
        layerwise_times = comm.bcast(layerwise_times, root=0)
        if rank == 0:
            logger.info('layerwise backward times: %s', list(layerwise_times))
            logger.info('layerwise backward sizes: %s', list(layerwise_sizes))
            logger.info('Benchmarked backward time: %f', np.sum(layerwise_times))
            logger.info('Model size: %d', np.sum(layerwise_sizes))
    else:
        seq_layernames, layerwise_times, layerwise_sizes = None, None, None

    norm_clip = None
    if dnn == 'lstm':
        norm_clip = 0.25
    elif dnn == 'lstman4':
        norm_clip = 400

    # Note: `optimizer` (the optimizer-name string passed in) is shadowed here
    # by the distributed optimizer object.
    optimizer = hvd.DistributedOptimizer(
            trainer.optimizer, named_parameters=trainer.net.named_parameters(),
            compression=compressors[compressor], is_sparse=is_sparse,
            density=density, seq_layernames=seq_layernames,
            layerwise_times=layerwise_times, norm_clip=norm_clip,
            threshold=threshold, writer=writer, gradient_path=gradient_path,
            tb=tb, iratio=iratio, stages=stages, partitions=partitions,
            ec_gradw=ec_gradw, ec_memw=ec_memw)
    hvd.broadcast_parameters(trainer.net.state_dict(), root_rank=0)
    trainer.update_optimizer(optimizer)

    iters_per_epoch = trainer.get_num_of_training_samples() // (nworkers * batch_size * nsteps_update)

    start = time.time()
    times = []
    noupdate_times = []
    logger.info('max_epochs: %d', max_epochs)
    display = settings.DISPLAY if iters_per_epoch > settings.DISPLAY else iters_per_epoch - 1
    for epoch in range(max_epochs):
        hidden = None
        if dnn == 'lstm':
            hidden = trainer.net.init_hidden()
        for i in range(iters_per_epoch):
            s = time.time()
            optimizer.zero_grad()
            for j in range(nsteps_update):
                if j < nsteps_update - 1 and nsteps_update > 1:
                    optimizer.local = True
                else:
                    optimizer.local = False
                if dnn == 'lstm':
                    _, hidden = trainer.train(1, hidden=hidden)
                else:
                    trainer.train(1)
            if dnn == 'lstm':
                optimizer.synchronize()
                torch.nn.utils.clip_grad_norm_(trainer.net.parameters(), 0.25)
            elif dnn == 'lstman4':
                optimizer.synchronize()
                torch.nn.utils.clip_grad_norm_(trainer.net.parameters(), 400)
            # Time the iteration both before and after the model update so the
            # communication/update cost can be reported separately.
            noupdate_times.append(time.time() - s)
            trainer.update_model()
            torch.cuda.synchronize()
            times.append(time.time() - s)
            if i % display == 0 and i > 0:
                time_per_iter = np.mean(times)
                update_per_iter = time_per_iter - np.mean(noupdate_times)
                throughput = batch_size * nsteps_update / time_per_iter
                trainer.log_info(time_per_iter, throughput, update_per_iter)
                logger.warning('Time per iteration: %f, communication: %f, Speed: %f images/s',
                        time_per_iter, update_per_iter, throughput)
                times = []
                noupdate_times = []
        optimizer.increase_one_epoch()
        # Stop early once the wall-clock budget (if any) is exhausted.
        if totaltime > 0 and time.time() - start > totaltime:
            trainer.test(trainer.get_train_epoch())
            break
    if dataset != 'cifar10':
        trainer.test(trainer.get_train_epoch())
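# Throughput accounting sketch: the per-worker speed logged above is
# batch_size * nsteps_update / time_per_iter; under weak scaling each of the
# nworkers processes its own batch, so aggregate throughput is nworkers times
# that. The helper below is illustrative only and is not used by the code above.
def _aggregate_throughput(nworkers, batch_size, nsteps_update, time_per_iter):
    """E.g. 4 workers, batch 32, 1 local step, 0.2 s/iter -> 640 images/s."""
    return nworkers * batch_size * nsteps_update / time_per_iter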