def reduce():
    rank = hvd.rank()
    local_rank = hvd.local_rank()
    size = hvd.size()
    torch.cuda.set_device(local_rank)
    nstreams = 1
    communicator = tcmm.Communicator(rank, size, nstreams)
    n_elements = 32 * 1024
    iterations = 100
    tensor = torch.rand(n_elements).cuda()
    if rank == 0:
        print('before rank: %d' % rank, time.time())
    # warm-up iterations, excluded from timing
    for i in range(nstreams):
        communicator.reduce(tensor, 0)
        #communicator.allReduce(tensor)
        #hvd.allreduce(tensor)
    communicator.synchronize()
    start = time.time()
    previous = start
    # timed iterations
    for i in range(iterations):
        communicator.reduce(tensor, 0)
        #communicator.allReduce(tensor)
        #hvd.allreduce(tensor)
        current = time.time()
        if rank == 0:
            print('i: ', i, current - previous)
        previous = current
    communicator.synchronize()
    end = time.time()
    if rank == 0:
        print('after rank: %d' % rank, time.time(), (end - start) / iterations)
        print('throughput: ', n_elements * 4 * 1e-9 / ((end - start) / iterations), 'GB/s')
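A minimal driver sketch for the reduce() benchmark above, assuming the snippet lives in a script that already imports torch, time, and tcmm; the hvd.init() call and the launch command are assumptions, since they are not shown in this snippet.

# Hypothetical driver (not part of the original snippet): Horovod must be initialized
# before tcmm.Communicator is constructed, and the script would typically be launched
# with one process per GPU, e.g. `horovodrun -np 4 python this_script.py`.
if __name__ == '__main__':
    import horovod.torch as hvd
    hvd.init()
    reduce()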
def __init__(self, symmetric=False, fp16=False):
    self.handles = []
    self.symmetric = symmetric
    self.fp16 = fp16  # fp16 is not supported at the current stage
    self.merged_tensors = {}
    nstreams = 1
    self.merged_comm = tcmm.Communicator(hvd.rank(), hvd.size(), nstreams)
def allreduce():
    rank = hvd.rank()
    local_rank = hvd.local_rank()
    size = hvd.size()
    torch.cuda.set_device(local_rank)
    communicator = tcmm.Communicator(rank, size)
    tensor = torch.rand(2).cuda()
    print('before rank: %d' % rank, tensor)
    communicator.allReduce(tensor)
    print('after rank: %d' % rank, tensor)
def __init__(self, tensor_names=None, prefix='flag', merge=False, single_layer=False,
             symmetric=False, fp16=False):
    self._tensor_names = tensor_names
    self.merge = merge
    self.single_layer = single_layer
    self.symmetric = symmetric
    self.prefix = prefix
    self.fp16 = fp16
    self.tensor_group_names = None
    if tensor_names is not None:
        self.init_tensor_group(tensor_names)
    nstreams = 1
    self.merged_comm = tcmm.Communicator(hvd.rank(), hvd.size(), nstreams)
    self._name_tensors = {}
    self.handles = []
def benchmark_custom_comm():
    torch.cuda.set_device(hvd.local_rank())
    merged_comm = tcmm.Communicator(hvd.rank(), hvd.size(), 1)
    comm_op = merged_comm.reduce
    sync_op = merged_comm.synchronize
    sizes = [2**i for i in range(10, 11)]
    #sizes = []  #[1024*i for i in range(1, 1024)]
    large_sizes = []  #[1024*1024*i for i in range(1, 513)]  # 1M to 512M
    sizes += large_sizes
    profiler = CommunicationProfiler(comm_op, sync_op, sizes)
    for root in range(hvd.size()):
        sizes, times = profiler.benchmark(root, num_iters=50)
        if hvd.rank() == 0:
            print('root: %d' % root)
            for s, t in zip(sizes, times):
                print(s, t, str(s * 4 / t * 1e-6) + ' MB/s')
            print()
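CommunicationProfiler is referenced above but not defined in this snippet. The sketch below is a hypothetical stand-in inferred only from the call site (constructor takes comm_op, sync_op, sizes; benchmark(root, num_iters) returns per-size average times), not the actual implementation.

class CommunicationProfiler(object):
    """Hypothetical stand-in inferred from the call site above."""
    def __init__(self, comm_op, sync_op, sizes):
        self.comm_op = comm_op    # e.g. merged_comm.reduce, called as comm_op(tensor, root)
        self.sync_op = sync_op    # e.g. merged_comm.synchronize
        self.sizes = sizes

    def benchmark(self, root, num_iters=50):
        times = []
        for n in self.sizes:
            tensor = torch.rand(n).cuda()
            # one warm-up call so allocation and setup are excluded from timing
            self.comm_op(tensor, root)
            self.sync_op()
            start = time.time()
            for _ in range(num_iters):
                self.comm_op(tensor, root)
            self.sync_op()
            times.append((time.time() - start) / num_iters)
        return self.sizes, times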
def multi_bcast():
    rank = hvd.rank()
    local_rank = hvd.local_rank()
    size = hvd.size()
    torch.cuda.set_device(local_rank)
    communicator = tcmm.Communicator(rank, size)
    ntensors = 2
    tensors = []
    for i in range(ntensors):
        t = torch.rand(2).cuda()
        tensors.append(t)

    def _op(tensor):
        tensor.mul_(2)
        return None

    print('before rank: %d' % rank, tensors)
    communicator.multiBcast(tensors, _op)
    print('after rank: %d' % rank, tensors)
def bench_customize_comm():
    import horovod.torch as hvd
    torch.random.manual_seed(10)
    hvd.init()
    rank = hvd.rank()
    local_rank = hvd.local_rank()
    size = hvd.size()
    torch.cuda.set_device(local_rank)
    logfile = './logs/resnet50-matrixsize-A.log'
    workloads = reader.read_tensor_sizes(logfile)
    tensors = []
    outputs = []
    # build one symmetric matrix A = a^T a per recorded workload size
    for w in workloads:
        n = w[0]
        a = torch.rand(n).float().cuda()
        a = a.view(-1, a.size(-1))
        A = a.t() @ (a)
        tensors.append(A)
        outputs.append(A.new_zeros(A.shape))
    communicator = tcmm.Communicator(rank, size)
    warmup = 5
    niters = 10
    # warm-up iterations, excluded from timing
    for i in range(warmup):
        communicator.multiBcast(tensors, outputs, compute_eigen)
        communicator.synchronize()
    torch.cuda.synchronize()
    stime = time.time()
    for i in range(niters):
        communicator.multiBcast(tensors, outputs, compute_eigen)
        communicator.synchronize()
    torch.cuda.synchronize()
    etime = time.time()
    print('Avg time: ', (etime - stime) / niters)
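compute_eigen is not defined in this snippet. Below is a hypothetical stand-in that would fit the multiBcast call above (one symmetric input matrix, one same-shaped output buffer); both its signature and what it computes are assumptions, and the real callback may differ.

def compute_eigen(A, out):
    # Hypothetical callback: eigen-decompose the symmetric matrix A and store the
    # eigenvectors in the pre-allocated, same-shaped output buffer. The signature
    # expected by tcmm's multiBcast is assumed, not taken from the library.
    eigenvalues, eigenvectors = torch.linalg.eigh(A)
    out.copy_(eigenvectors)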
def __init__(self,
             model,
             lr=0.1,
             hook_enabled=True,
             factor_decay=0.95,
             damping=0.001,
             kl_clip=0.001,
             fac_update_freq=10,
             kfac_update_freq=100,
             batch_averaged=True,
             diag_blocks=1,
             diag_warmup=0,
             distribute_layer_factors=None,
             sparse=False,
             sparse_ratio=0.01,
             exclude_parts=''):
             #exclude_parts='CommunicateInverse,ComputeInverse,CommunicateFactor,ComputeFactor'):

    if not 0.0 <= lr:
        raise ValueError("Invalid learning rate: {}".format(lr))
    if not 0.0 < factor_decay <= 1:
        raise ValueError("Invalid factor decay rate: {}".format(factor_decay))
    if not 0.0 < damping:
        raise ValueError("Invalid damping: {}".format(damping))
    if not 0.0 < kl_clip:
        raise ValueError("Invalid clipping value: {}".format(kl_clip))
    if not 0 < fac_update_freq:
        raise ValueError("Invalid factor update frequency: {}".format(fac_update_freq))
    if not 0 < kfac_update_freq:
        raise ValueError("Invalid K-FAC update frequency: {}".format(kfac_update_freq))
    if not 0 == kfac_update_freq % fac_update_freq:
        print("WARNING: it is suggested that kfac_update_freq be a multiple of fac_update_freq")
    if not 0 < diag_blocks:
        raise ValueError("Invalid diagonal block approx count: {}".format(diag_blocks))
    if not 0 <= diag_blocks:
        raise ValueError("Invalid diagonal block approx count: {}".format(diag_blocks))
    if not 1 == diag_blocks:
        print("WARNING: diag_blocks > 1 is experimental and may give poor results.")

    # For compatibility with `KFACParamScheduler`
    defaults = dict(lr=lr,
                    damping=damping,
                    fac_update_freq=fac_update_freq,
                    kfac_update_freq=kfac_update_freq)
    super(KFAC, self).__init__(model.parameters(), defaults)

    self.computeA = ComputeA()
    self.computeG = ComputeG()
    self.known_modules = {'Linear', 'Conv2d'}
    self.modules = []
    self.module_names = []
    # register hooks for known modules
    self.hook_enabled = hook_enabled
    self._register_modules(model)

    # tcmm communicator
    self.communicator = tcmm.Communicator(hvd.rank(), hvd.size(), 1)

    self.steps = 0

    # Dictionaries keyed by `module` for storing the factors and inverse factors
    self.m_a, self.m_g = {}, {}
    self.m_A, self.m_G = {}, {}
    self.m_inv_A, self.m_inv_G = {}, {}
    self.module_ranks = None

    self.sparse = sparse
    self.sparse_ratio = sparse_ratio
    self.residualsA, self.residualsG = {}, {}

    self.factor_decay = factor_decay
    self.kl_clip = kl_clip
    self.fac_update_freq = fac_update_freq
    self.kfac_update_freq = kfac_update_freq
    self.diag_blocks = diag_blocks
    self.diag_warmup = diag_warmup
    self.batch_averaged = batch_averaged

    self.exclude_communicate_inverse = 'CommunicateInverse' in exclude_parts
    self.exclude_compute_inverse = 'ComputeInverse' in exclude_parts
    self.exclude_communicate_factor = 'CommunicateFactor' in exclude_parts
    self.exclude_compute_factor = 'ComputeFactor' in exclude_parts

    # Compute ideal value for `distribute_layer_factors` based on
    # registered module count
    if distribute_layer_factors is None:
        self.distribute_layer_factors = True if hvd.size() > len(self.modules) else False
    else:
        self.distribute_layer_factors = distribute_layer_factors

    self.eps = 1e-10  # for numerical stability
    self.rank_iter = cycle(list(range(hvd.size())))
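A hedged usage sketch for the constructor above, assuming the enclosing class is named KFAC, exposes the usual optimizer step() entry point, and is used as a K-FAC preconditioner next to a standard optimizer; none of this integration is shown in the snippet itself.

# Usage sketch (assumptions: class name KFAC, a step() method, pairing with SGD,
# and a placeholder DataLoader named `loader`).
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

model = nn.Sequential(nn.Linear(784, 256), nn.ReLU(), nn.Linear(256, 10)).cuda()
optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
preconditioner = KFAC(model, lr=0.1, fac_update_freq=10, kfac_update_freq=100)

for x, y in loader:
    optimizer.zero_grad()
    loss = F.cross_entropy(model(x.cuda()), y.cuda())
    loss.backward()
    preconditioner.step()  # assumed to precondition the gradients in place
    optimizer.step()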