def __init__(self, model, lr=0.1, factor_decay=0.95, damping=0.001, kl_clip=0.001, fac_update_freq=10, kfac_update_freq=100, batch_averaged=True, diag_blocks=1, diag_warmup=0, distribute_layer_factors=None, gradient_clip="agc"): if not 0.0 <= lr: raise ValueError("Invalid learning rate: {}".format(lr)) if not 0.0 <= factor_decay <= 1: raise ValueError( "Invalid factor decay rate: {}".format(factor_decay)) if not 0.0 < damping: raise ValueError("Invalid damping: {}".format(damping)) if not 0.0 < kl_clip: raise ValueError("Invalid clipping value: {}".format(kl_clip)) if not 0 < fac_update_freq: raise ValueError( "Invalid factor update frequency: {}".format(fac_update_freq)) if not 0 < kfac_update_freq: raise ValueError( "Invalid K-FAC update frequency: {}".format(kfac_update_freq)) if not 0 == kfac_update_freq % fac_update_freq: print( "WARNING: it is suggested that kfac_update_freq be a multiple of fac_update_freq" ) if not 0 < diag_blocks: raise ValueError( "Invalid diagonal block approx count: {}".format(diag_blocks)) if not 0 <= diag_blocks: raise ValueError( "Invalid diagonal block approx count: {}".format(diag_blocks)) if not 1 == diag_blocks: print( "WARNING: diag_blocks > 1 is experimental and may give poor results." ) # For compatibility with `KFACParamScheduler` # defaults – (dict): a dict containing default values of optimization options (used when a parameter group doesn’t specify them). defaults = dict(lr=lr, damping=damping, fac_update_freq=fac_update_freq, kfac_update_freq=kfac_update_freq, gradient_clip=gradient_clip) super(KFAC, self).__init__(model.parameters(), defaults) self.computeA = ComputeA() self.computeG = ComputeG() self.known_modules = {'Linear', 'Conv2d', 'BertLayerNorm0'} self.modules = [] self._register_modules(model) self.steps = 0 self.gradient_clip = gradient_clip #"agc" # Dictionaries keyed by `module` to storing the factors and # eigendecompositions self.m_a, self.m_g = {}, {} self.m_A, self.m_G = {}, {} self.m_QA, self.m_QG = {}, {} self.m_dA, self.m_dG = {}, {} self.factor_decay = factor_decay self.kl_clip = kl_clip self.fac_update_freq = fac_update_freq self.kfac_update_freq = kfac_update_freq self.diag_blocks = diag_blocks self.diag_warmup = diag_warmup self.batch_averaged = batch_averaged self.hvd_size = 1 #hvd.size() # Compute ideal value for `distribute_layer_factors` based on # registered module count if distribute_layer_factors is None: self.distribute_layer_factors = True \ if hvd.size() > len(self.modules) else False else: self.distribute_layer_factors = distribute_layer_factors self.have_cleared_Q = True if self.diag_warmup == 0 else False self.eps = 1e-10 # for numerical stability self.rank_iter = cycle(list(range(self.hvd_size))) self.T_all = 0
def __init__(self, model, lr=0.1, factor_decay=0.95, damping=0.001, kl_clip=0.001, fac_update_freq=10, kfac_update_freq=100, batch_averaged=True, diag_blocks=1, diag_warmup=0, distribute_layer_factors=None, sparse=False, sparse_ratio=0.01, exclude_parts=''): if not 0.0 <= lr: raise ValueError("Invalid learning rate: {}".format(lr)) if not 0.0 < factor_decay <= 1: raise ValueError( "Invalid factor decay rate: {}".format(factor_decay)) if not 0.0 < damping: raise ValueError("Invalid damping: {}".format(damping)) if not 0.0 < kl_clip: raise ValueError("Invalid clipping value: {}".format(kl_clip)) if not 0 < fac_update_freq: raise ValueError( "Invalid factor update frequency: {}".format(fac_update_freq)) if not 0 < kfac_update_freq: raise ValueError( "Invalid K-FAC update frequency: {}".format(kfac_update_freq)) if not 0 == kfac_update_freq % fac_update_freq: print( "WARNING: it is suggested that kfac_update_freq be a multiple of fac_update_freq" ) if not 0 < diag_blocks: raise ValueError( "Invalid diagonal block approx count: {}".format(diag_blocks)) if not 0 <= diag_blocks: raise ValueError( "Invalid diagonal block approx count: {}".format(diag_blocks)) if not 1 == diag_blocks: print( "WARNING: diag_blocks > 1 is experimental and may give poor results." ) # For compatibility with `KFACParamScheduler` defaults = dict(lr=lr, damping=damping, fac_update_freq=fac_update_freq, kfac_update_freq=kfac_update_freq) super(KFAC, self).__init__(model.parameters(), defaults) self.computeA = ComputeA() self.computeG = ComputeG() self.known_modules = {'Linear', 'Conv2d'} self.modules = [] self.module_names = [] self.name_module_map = {} self.module_name_map = {} #self.fw_factor_handles = [] #self.bw_factor_handles = [] self._register_modules(model) self.fw_merged_comm = MergedCommAllReduce(self.module_names, prefix='forward', merge=True, single_layer=False) self.bw_merged_comm = MergedCommAllReduce(self.module_names, prefix='backward', merge=True, single_layer=False) self.steps = 0 # Dictionaries keyed by `module` to storing the factors and # eigendecompositions self.m_a, self.m_g = {}, {} self.m_A, self.m_G = {}, {} self.m_QA, self.m_QG = {}, {} self.m_dA, self.m_dG = {}, {} self.m_dA_ranks = {} self.m_dG_ranks = {} self.module_ranks = None self.sparse = sparse self.sparse_ratio = sparse_ratio self.residualsA, self.residualsG = {}, {} self.factor_decay = factor_decay self.kl_clip = kl_clip self.fac_update_freq = fac_update_freq self.kfac_update_freq = kfac_update_freq self.diag_blocks = diag_blocks self.diag_warmup = diag_warmup self.batch_averaged = batch_averaged # Compute ideal value for `distribute_layer_factors` based on # registered module count if distribute_layer_factors is None: self.distribute_layer_factors = True \ if hvd.size() > len(self.modules) else False else: self.distribute_layer_factors = distribute_layer_factors self.have_cleared_Q = True if self.diag_warmup == 0 else False self.eps = 1e-10 # for numerical stability self.rank_iter = cycle(list(range(hvd.size())))
def __init__(self, model, lr=0.1, hook_enabled=True, factor_decay=0.95, damping=0.001, kl_clip=0.001, fac_update_freq=10, kfac_update_freq=100, batch_averaged=True, diag_blocks=1, diag_warmup=0, distribute_layer_factors=None, sparse=False, sparse_ratio=0.01, exclude_parts=''): #exclude_parts='CommunicateInverse,ComputeInverse,CommunicateFactor,ComputeFactor'): if not 0.0 <= lr: raise ValueError("Invalid learning rate: {}".format(lr)) if not 0.0 < factor_decay <= 1: raise ValueError( "Invalid factor decay rate: {}".format(factor_decay)) if not 0.0 < damping: raise ValueError("Invalid damping: {}".format(damping)) if not 0.0 < kl_clip: raise ValueError("Invalid clipping value: {}".format(kl_clip)) if not 0 < fac_update_freq: raise ValueError( "Invalid factor update frequency: {}".format(fac_update_freq)) if not 0 < kfac_update_freq: raise ValueError( "Invalid K-FAC update frequency: {}".format(kfac_update_freq)) if not 0 == kfac_update_freq % fac_update_freq: print( "WARNING: it is suggested that kfac_update_freq be a multiple of fac_update_freq" ) if not 0 < diag_blocks: raise ValueError( "Invalid diagonal block approx count: {}".format(diag_blocks)) if not 0 <= diag_blocks: raise ValueError( "Invalid diagonal block approx count: {}".format(diag_blocks)) if not 1 == diag_blocks: print( "WARNING: diag_blocks > 1 is experimental and may give poor results." ) # For compatibility with `KFACParamScheduler` defaults = dict(lr=lr, damping=damping, fac_update_freq=fac_update_freq, kfac_update_freq=kfac_update_freq) super(KFAC, self).__init__(model.parameters(), defaults) self.computeA = ComputeA() self.computeG = ComputeG() self.known_modules = {'Linear', 'Conv2d'} self.modules = [] self.module_names = [] # register hooks for known modules self.hook_enabled = hook_enabled self._register_modules(model) # tcmm communicator self.communicator = tcmm.Communicator(hvd.rank(), hvd.size(), 1) self.steps = 0 # Dictionaries keyed by `module` to storing the factors and inverse factors self.m_a, self.m_g = {}, {} self.m_A, self.m_G = {}, {} self.m_inv_A, self.m_inv_G = {}, {} self.module_ranks = None self.sparse = sparse self.sparse_ratio = sparse_ratio self.residualsA, self.residualsG = {}, {} self.factor_decay = factor_decay self.kl_clip = kl_clip self.fac_update_freq = fac_update_freq self.kfac_update_freq = kfac_update_freq self.diag_blocks = diag_blocks self.diag_warmup = diag_warmup self.batch_averaged = batch_averaged self.exclude_communicate_inverse = True if exclude_parts.find( 'CommunicateInverse') >= 0 else False self.exclude_compute_inverse = True if exclude_parts.find( 'ComputeInverse') >= 0 else False self.exclude_communicate_factor = True if exclude_parts.find( 'CommunicateFactor') >= 0 else False self.exclude_compute_factor = True if exclude_parts.find( 'ComputeFactor') >= 0 else False # Compute ideal value for `distribute_layer_factors` based on # registered module count if distribute_layer_factors is None: self.distribute_layer_factors = True \ if hvd.size() > len(self.modules) else False else: self.distribute_layer_factors = distribute_layer_factors self.eps = 1e-10 # for numerical stability self.rank_iter = cycle(list(range(hvd.size())))
class KFAC(optim.Optimizer): """KFAC Distributed Gradient Preconditioner Computes the natural gradient of a model in place with a layer-wise FIM approximation. Layer computations are distributed across workers using Horovod. Usage: optimizer = optim.SGD(model.parameters(), ...) optimizer = hvd.DistributedOptimizer(optimizer, ...) preconditioner = KFAC(model, ...) ... for i, (data, target) in enumerate(train_loader): optimizer.zero_grad() output = model(data) loss = criterion(output, target) loss.backward() optimizer.synchronize() preconditioner.step() with optimizer.skip_synchronize(): optimizer.step() Args: model (nn): Torch model to precondition lr (float, optional): learning rate (default: 0.1) factor_decay (float, optional): running average coefficient for Kronecker factors (default: 0.95) damping (float, optional): Tikhonov damping parameter (default: 0.001) kl_clip (float, optional): clipping parameter for gradient scaling (default: 0.001) fac_update_freq (int, optional): iterations between calculating and updating the running average of the Kronecker factors (default: 10) kfac_update_freq (int, optional): iterations between applying gradient preconditioning (default: 100) batch_averaged (bool, optional): boolean representing if the gradient is alrady averaged across the batches (default: True) diag_blocks (int, optional): Experimental: number of diagonal blocks to approximate the Kronecker factor eigendecomposition with. `diag_blocks=1` computes the eigendecomposition of the entire factor (default: 1) diag_warmup (int, optional): number of epochs to wait before starting the block diagonal factor approximation (default: 0) distribute_layer_factors (bool, optional): if `True`, computes factors A and G on different workers else computes A and G for a single layer on the same worker. If `None`, determines best value based on layer count (default: None) """ def __init__(self, model, lr=0.1, factor_decay=0.95, damping=0.001, kl_clip=0.001, fac_update_freq=10, kfac_update_freq=100, batch_averaged=True, diag_blocks=1, diag_warmup=0, distribute_layer_factors=None, sparse=False, sparse_ratio=0.01, exclude_parts=''): if not 0.0 <= lr: raise ValueError("Invalid learning rate: {}".format(lr)) if not 0.0 < factor_decay <= 1: raise ValueError( "Invalid factor decay rate: {}".format(factor_decay)) if not 0.0 < damping: raise ValueError("Invalid damping: {}".format(damping)) if not 0.0 < kl_clip: raise ValueError("Invalid clipping value: {}".format(kl_clip)) if not 0 < fac_update_freq: raise ValueError( "Invalid factor update frequency: {}".format(fac_update_freq)) if not 0 < kfac_update_freq: raise ValueError( "Invalid K-FAC update frequency: {}".format(kfac_update_freq)) if not 0 == kfac_update_freq % fac_update_freq: print( "WARNING: it is suggested that kfac_update_freq be a multiple of fac_update_freq" ) if not 0 < diag_blocks: raise ValueError( "Invalid diagonal block approx count: {}".format(diag_blocks)) if not 0 <= diag_blocks: raise ValueError( "Invalid diagonal block approx count: {}".format(diag_blocks)) if not 1 == diag_blocks: print( "WARNING: diag_blocks > 1 is experimental and may give poor results." ) # For compatibility with `KFACParamScheduler` defaults = dict(lr=lr, damping=damping, fac_update_freq=fac_update_freq, kfac_update_freq=kfac_update_freq) super(KFAC, self).__init__(model.parameters(), defaults) self.computeA = ComputeA() self.computeG = ComputeG() self.known_modules = {'Linear', 'Conv2d'} self.modules = [] self.module_names = [] self.name_module_map = {} self.module_name_map = {} self._register_modules(model) self.fw_merged_comm = MergedCommAllReduce(self.module_names, prefix='forward', merge=True, single_layer=False) self.bw_merged_comm = MergedCommAllReduce(self.module_names, prefix='backward', merge=False, single_layer=False) self.inverseA_merged_comm = MergedCommBcast(self.module_names, prefix='inverseA') self.inverseG_merged_comm = MergedCommBcast(self.module_names, prefix='inverseG') self.multi_comm = MultiTensorComm() self.steps = 0 # Dictionaries keyed by `module` to storing the factors and # eigendecompositions self.m_a, self.m_g = {}, {} self.m_A, self.m_G = {}, {} self.m_QA, self.m_QG = {}, {} self.m_dA_ranks = {} self.m_dG_ranks = {} self.module_ranks = None self.sparse = sparse self.sparse_ratio = sparse_ratio self.residualsA, self.residualsG = {}, {} self.factor_decay = factor_decay self.kl_clip = kl_clip self.fac_update_freq = fac_update_freq self.kfac_update_freq = kfac_update_freq self.diag_blocks = diag_blocks self.diag_warmup = diag_warmup self.batch_averaged = batch_averaged # Compute ideal value for `distribute_layer_factors` based on # registered module count if distribute_layer_factors is None: self.distribute_layer_factors = True \ if hvd.size() > len(self.modules) else False else: self.distribute_layer_factors = distribute_layer_factors self.have_cleared_Q = True if self.diag_warmup == 0 else False self.eps = 1e-10 # for numerical stability self.rank_iter = cycle(list(range(hvd.size()))) def _compute_forward_factor(self, module, input): if torch.is_grad_enabled() and self.steps % self.fac_update_freq == 0: input_data = input[0].data self._update_module_A(input_data, module) if hvd.size() > 1: name = self.module_name_map[module] self.fw_merged_comm.allreduce_async_(name, self.m_a[module].data) def _compute_backward_factor(self, module, grad_input, grad_output): if self.steps % self.fac_update_freq == 0: input_data = grad_output[0].data self._update_module_G(input_data, module) if hvd.size() > 1: name = self.module_name_map[module] self.bw_merged_comm.allreduce_async_(name, self.m_g[module].data) def _register_modules(self, model): """Register hooks to all supported layers in the model""" name_idx = 0 for module in model.modules(): classname = module.__class__.__name__ if classname in self.known_modules: self.modules.append(module) module.register_forward_pre_hook(self._compute_forward_factor) module.register_backward_hook(self._compute_backward_factor) module_name = 'module_name_%s_%d' % (classname, name_idx) self.module_names.append(module_name) self.name_module_map[module_name] = module self.module_name_map[module] = module_name name_idx += 1 def _init_A(self, factor, module): """Initialize memory for factor A and its eigendecomp""" shape = (factor.shape[1], factor.shape[1]) self.m_A[module] = torch.diag(factor.new(shape[0]).fill_(1)) self.m_QA[module] = factor.new_zeros(shape) def _init_G(self, factor, module): """Initialize memory for factor G and its eigendecomp""" shape = (factor.shape[1], factor.shape[1]) self.m_G[module] = torch.diag(factor.new(shape[0]).fill_(1)) self.m_QG[module] = factor.new_zeros(shape) def _clear_eigen(self): """Clear eigendecompositions Useful for when switching between `diag_blocks=1` and `diag-blocks>1` because eigendecompositions saved in place and the off-diagonals must be cleared. """ for module in self.modules: self.m_QA[module].fill_(0) self.m_QG[module].fill_(0) def _update_module_A(self, input_data, module): a = self.computeA.get_data(input_data, module) if module in self.m_a: self.m_a[module].copy_(a) else: self.m_a[module] = a if self.steps == 0: self._init_A(a, module) #update_running_avg(a, self.m_A[module], self.factor_decay) if self.sparse: sparsification(self.m_A[module], module, ratio=self.sparse_ratio, residuals=self.residualsA) def _update_module_G(self, input_data, module): g = self.computeG.get_data(input_data, module, self.batch_averaged) if module in self.m_g: self.m_g[module].copy_(g) else: self.m_g[module] = g if self.steps == 0: self._init_G(g, module) #update_running_avg(g, self.m_G[module], self.factor_decay) if self.sparse: sparsification(self.m_G[module], module, ratio=self.sparse_ratio, residuals=self.residualsG) def _update_inverse_A(self, module, ranks): """Compute eigendecomposition of A for module on specified workers Note: all ranks will enter this function but only the ranks specified in `ranks` will continue to actually compute the eigendecomposition. All other ranks will simply zero out their buffer for the eigendecomposition for the current module. This is done so we can sum the eigendecompositions across all ranks to communicate the results of locally computed eigendecompositions. Args: module: module to compute eigendecomposition of A on ranks: list of horovod ranks (i.e. workers) to use when computing the eigendecomposition. """ if hvd.rank() in ranks: self._distributed_compute_inverse(self.m_A[module], self.m_QA[module], ranks) else: if ranks[0] == -1: self._local_computer_inverse(self.m_A[module], self.m_QA[module]) else: self.m_QA[module].fill_(0) def _update_inverse_G(self, module, ranks): """Compute eigendecomposition of A for module on specified workers See `_update_eigen_A` for more info` """ if hvd.rank() in ranks: self._distributed_compute_inverse(self.m_G[module], self.m_QG[module], ranks) else: if ranks[0] == -1: self._local_computer_inverse(self.m_G[module], self.m_QG[module]) else: self.m_QG[module].fill_(0) def _distributed_compute_inverse(self, factor, inverse, ranks): """Computes the eigendecomposition of a factor across ranks Assigns each rank in `ranks` to enter this function to compute a diagonal block of `factor`. Results are written to `evectors` and `evalues`. If `len(ranks)==1`, then that rank computes the eigendecomposition of the entire `factor`. Args: factor (tensor): tensor to eigendecompose inverse (tensor): tensor to save inverse of `factor` to ranks (list): list of ranks that will enter this function """ i = ranks.index(hvd.rank()) n = len(ranks) if n > min(factor.shape): n = min(factor.shape) if i < n: start, end = get_block_boundary(i, n, factor.shape) block = factor[start[0]:end[0], start[1]:end[1]] block = add_value_to_diagonal(block, self.damping) inv = torchsso.utils.inv(block) inverse.data[start[0]:end[0], start[1]:end[1]].copy_(inv) def _local_computer_inverse(self, factor, inverse): block = factor[0:, 0:] block = add_value_to_diagonal(block, self.damping) inv = torchsso.utils.inv(block) inverse.data[0:, 0:].copy_(inv) def _get_diag_blocks(self, module, diag_blocks): """Helper method for determining number of diag_blocks to use Overrides `diag_blocks` if the `module` does not support `diag_blocks>1`. I.e. for a Linear layer, we do not want to use a `diag_blocks>1`. Args: module: module diag_blocks (int): default number of diag blocks to use """ return diag_blocks if module.__class__.__name__ == 'Conv2d' else 1 def _get_grad(self, module): """Get formated gradient of module Args: module: module/layer to get gradient of Returns: Formatted gradient with shape [output_dim, input_dim] for module """ if module.__class__.__name__ == 'Conv2d': # n_filters * (in_c * kw * kh) grad = module.weight.grad.data.view( module.weight.grad.data.size(0), -1) else: grad = module.weight.grad.data if module.bias is not None: grad = torch.cat([grad, module.bias.grad.data.view(-1, 1)], 1) return grad def _get_preconditioned_grad(self, module, grad): """Precondition gradient of module Args: module: module to compute preconditioned gradient for grad: formatted gradient from `_get_grad()` Returns: preconditioned gradient with same shape as `grad` """ v = self.m_QG[module] @ grad @ self.m_QA[module] if module.bias is not None: v = [v[:, :-1], v[:, -1:]] v[0] = v[0].view(module.weight.grad.data.size()) # weight v[1] = v[1].view(module.bias.grad.data.size()) # bias else: v = [v.view(module.weight.grad.data.size())] return v def _update_scale_grad(self, updates): """Update the gradients in place and scale Updates the gradients in-place for all modules using the preconditioned gradients and scales the gradients. Args: updates (dict): dict of {module: precon_grad} """ vg_sum = 0 for module in self.modules: v = updates[module] vg_sum += (v[0] * module.weight.grad.data * self.lr**2).sum().item() if module.bias is not None: vg_sum += (v[1] * module.bias.grad.data * self.lr**2).sum().item() nu = min(1.0, math.sqrt(self.kl_clip / abs(vg_sum))) for module in self.modules: v = updates[module] module.weight.grad.data.copy_(v[0]) module.weight.grad.data.mul_(nu) if module.bias is not None: module.bias.grad.data.copy_(v[1]) module.bias.grad.data.mul_(nu) def step(self, closure=None, epoch=None): """Perform one K-FAC step Note: - this function should always be called before `optimizer.step()` - gradients must be averaged across ranks before calling `step()` Args: closure: for compatibility with the base optimizer class. `closure` is ignored by KFAC epoch (int, optional): epoch to use for determining when to end the `diag_warmup` period. `epoch` is not necessary if not using `diag_warmup` """ # Update params, used for compatibilty with `KFACParamScheduler` group = self.param_groups[0] self.lr = group['lr'] self.damping = group['damping'] self.fac_update_freq = group['fac_update_freq'] self.kfac_update_freq = group['kfac_update_freq'] #print('fac_update_freq: ', self.fac_update_freq) #print('kfac_update_freq: ', self.kfac_update_freq) updates = {} handles = [] if epoch is None: if self.diag_warmup > 0: print("WARNING: diag_warmup > 0 but epoch was not passed to " "KFAC.step(). Defaulting to no diag_warmup") diag_blocks = self.diag_blocks else: diag_blocks = self.diag_blocks if epoch >= self.diag_warmup else 1 if hvd.size() > 1 and self.steps % self.fac_update_freq == 0: self.fw_merged_comm.synchronize() self.bw_merged_comm.synchronize() # Compute A and G after aggregation of a and g for module in self.modules: a = self.m_a[module] g = self.m_g[module] if hvd.rank() == 0: logger.info('a Name: %s, shape %s', module, a.shape) logger.info('g Name: %s, shape %s', module, g.shape) A = torch.einsum('ki,kj->ij', a, a / a.size(0)) G = torch.einsum('ki,kj->ij', g, g / g.size(0)) update_running_avg(A, self.m_A[module], self.factor_decay) update_running_avg(G, self.m_G[module], self.factor_decay) raise # if we are switching from no diag approx to approx, we need to clear # off-block-diagonal elements if not self.have_cleared_Q and \ epoch == self.diag_warmup and \ self.steps % self.kfac_update_freq == 0: self._clear_eigen() self.have_cleared_Q = True if self.steps % self.kfac_update_freq == 0: # reset rank iter so device get the same layers # to compute to take advantage of caching self.rank_iter.reset() handles = [] #eigen_ranks = self._generate_eigen_ranks(epoch) eigen_ranks = self._generate_eigen_ranks_uniform(epoch) #eigen_ranks = self._generate_eigen_ranks_naive(epoch) #inverse_As = [] #A_ranks = [] #inverse_Gs = [] #G_ranks = [] rank_to_tensors = {} for module in self.modules: ranks_a, ranks_g = eigen_ranks[module] self.m_dA_ranks[module] = ranks_a[0] self.m_dG_ranks[module] = ranks_g[0] rank_a = ranks_a[0] rank_g = ranks_g[0] name = self.module_name_map[module] self._update_inverse_A(module, ranks_a) #if hvd.size() > 1 and rank_a >= 0: # self.inverseA_merged_comm.bcast_async_(name, self.m_QA[module], rank_a) self._update_inverse_G(module, ranks_g) #if hvd.size() > 1 and rank_g >= 0: # self.inverseG_merged_comm.bcast_async_(name, self.m_QG[module], rank_g) #if rank_a not in rank_to_tensors: # rank_to_tensors[rank_a] = [] #rank_to_tensors[rank_a].append((name, self.m_QA[module], self.m_QG[module])) if hvd.size() > 1 and rank_g >= 0: self.multi_comm.bcast_async_( [name], [self.m_QA[module], self.m_QG[module]], rank_g) #if hvd.size() > 1: # for rank in rank_to_tensors.keys(): # names = [] # tensors = [] # for name, ta, tb in rank_to_tensors[rank]: # names.append(name) # tensors.append(ta) # tensors.append(tb) # self.multi_comm.bcast_async_(names, tensors, rank) if hvd.size() > 1 and self.steps % self.kfac_update_freq == 0: #self.inverseA_merged_comm.synchronize() #self.inverseG_merged_comm.synchronize() self.multi_comm.synchronize() for i, module in enumerate(self.modules): grad = self._get_grad(module) precon_grad = self._get_preconditioned_grad(module, grad) updates[module] = precon_grad self._update_scale_grad(updates) self.steps += 1 def _generate_eigen_ranks_naive(self, epoch): if self.module_ranks is not None: return self.module_ranks module_ranks = {} diag_blocks = self.diag_blocks if epoch >= self.diag_warmup else 1 buckets = [0] * hvd.size() for module in self.modules: # Get ranks to compute this layer on n = self._get_diag_blocks(module, diag_blocks) ranks_a = self.rank_iter.next(n) ranks_g = ranks_a #ranks_g = self.rank_iter.next(n) if self.distribute_layer_factors \ # else ranks_a module_ranks[module] = (ranks_a, ranks_g) buckets[ranks_a[0]] += self.m_A[module].shape[1] buckets[ranks_g[0]] += self.m_G[module].shape[1] self.module_ranks = module_ranks if hvd.rank() == 0: logger.info('buckets: %s', buckets) logger.info('module_ranks: %s', module_ranks.values()) return module_ranks def _generate_eigen_ranks_uniform(self, epoch): if self.module_ranks is not None: return self.module_ranks module_ranks = {} diag_blocks = self.diag_blocks if epoch >= self.diag_warmup else 1 buckets = [0] * hvd.size() dimensions = [] module_factors = [] for i, m in enumerate(self.modules): name = self.module_names[i] a_dimension = self.m_A[m].shape[1] g_dimension = self.m_G[m].shape[1] dimensions.append(a_dimension) module_factors.append(name + '-A') dimensions.append(g_dimension) module_factors.append(name + '-G') descending_sorted_idx = np.argsort(dimensions)[::-1] A_ranks = {} G_ranks = {} bi = 0 for i in descending_sorted_idx: factor = module_factors[i] if factor[-1] == 'G': continue dimension = dimensions[i] m_i = self.module_names.index(factor[0:-2]) m = self.modules[m_i] if dimension < 1024: bi = -1 else: bi = np.argmin(buckets) buckets[bi] += dimension if factor[-1] == 'A': A_ranks[m] = (bi, ) G_ranks[m] = (bi, ) else: G_ranks[m] = (bi, ) for m in self.modules: module_ranks[m] = (A_ranks[m], G_ranks[m]) self.module_ranks = module_ranks if hvd.rank() == 0: logger.info('buckets: %s', buckets) logger.info('module_ranks: %s', module_ranks.values()) return module_ranks def _generate_eigen_ranks(self, epoch): if self.module_ranks is not None: return self.module_ranks module_ranks = {} diag_blocks = self.diag_blocks if epoch >= self.diag_warmup else 1 buckets = [0] * hvd.size() for module in self.modules: i = np.argmin(buckets) if hvd.rank() == 0: logger.info('A Name: %s, shape: %s', module, self.m_A[module].shape) logger.info('G Name: %s, shape: %s', module, self.m_G[module].shape) a_dimension = self.m_A[module].shape[1] g_dimension = self.m_G[module].shape[1] #buckets[i] += (a_dimension) + g_dimension) buckets[i] += a_dimension ranks_a = (i, ) i = np.argmin(buckets) ranks_g = (i, ) buckets[i] += g_dimension module_ranks[module] = (ranks_a, ranks_g) self.module_ranks = module_ranks if hvd.rank() == 0: logger.info('buckets: %s', buckets) logger.info('module_ranks: %s', module_ranks.values()) return module_ranks def _allreduce_factors(self): """Allreduce the factors for all layers""" handles = [] for m in self.modules: handles.append( hvd.allreduce_async_(self.m_A[m].data, op=hvd.Average)) handles.append( hvd.allreduce_async_(self.m_G[m].data, op=hvd.Average)) for handle in handles: hvd.synchronize(handle) def _allgather_factors(self): """Allgather the factors for all layers""" handles = [] def _get_value_and_idx(sparse_tensor): tensor = sparse_tensor.data.view(-1) one_indexes = tensor != 0 indexes = one_indexes.nonzero().data.squeeze().view(-1) values = tensor.data[indexes] return values, indexes.int() for i, m in enumerate(self.modules): module_name = self.module_names[i] A_values, A_indexes = _get_value_and_idx(self.m_A[m].data) A_value_name = module_name + '_A_value' A_idx_name = module_name + '_A_idx' h_value = allgather_async(A_values, A_value_name) h_idx = allgather_async(A_indexes, A_idx_name) G_values, G_indexes = _get_value_and_idx(self.m_G[m].data) G_value_name = module_name + '_G_value' G_idx_name = module_name + '_G_idx' h_value_G = allgather_async(G_values, G_value_name) h_idx_G = allgather_async(G_indexes, G_idx_name) handles.append((h_value, h_idx, h_value_G, h_idx_G)) for i, handle in enumerate(handles): module_name = self.module_names[i] module = self.modules[i] m_A = self.m_A[module].view(-1) m_A.fill_(0.0) m_G = self.m_G[module].view(-1) m_G.fill_(0.0) h_value_A, h_idx_A, h_value_G, h_idx_G = handle A_values = hvd.synchronize(h_value_A) A_indexes = hvd.synchronize(h_idx_A).long() m_A.scatter_add_(0, A_indexes, A_values) m_A.div_(hvd.size()) G_values = hvd.synchronize(h_value_G) G_indexes = hvd.synchronize(h_idx_G).long() m_G.scatter_add_(0, G_indexes, G_values) m_G.div_(hvd.size()) def _allreduce_eigendecomp(self): """Allreduce the eigendecompositions for all layers Note: we use `op=hvd.Sum` to simulate an allgather`. Each rank will either compute the eigendecomposition for a factor or just return zeros so we sum instead of averaging. """ handles = [] for m in self.modules: handles.append(hvd.allreduce_async_(self.m_QA[m].data, op=hvd.Sum)) handles.append(hvd.allreduce_async_(self.m_QG[m].data, op=hvd.Sum)) for handle in handles: hvd.synchronize(handle) def _broadcast_eigendecomp(self): """Broadcasts the eigendecompositions for all layers Note: we use `op=hvd.Sum` to simulate an allgather`. Each rank will either compute the eigendecomposition for a factor or just return zeros so we sum instead of averaging. """ handles = [] rank = hvd.rank() for i, m in enumerate(self.modules): rank_a = self.m_dA_ranks[m] rank_g = self.m_dG_ranks[m] name = self.module_names[i] h = hvd.broadcast_async_(self.m_QA[m], rank_a, name=name + 'mQA') handles.append(h) h = hvd.broadcast_async_(self.m_QG[m], rank_g, name=name + 'mQG') handles.append(h) for handle in handles: hvd.synchronize(handle)