def _SetNewLr(self, cur_lr, new_lr):
    """Write ``new_lr`` into each GPU's lr blob and rescale momentum if needed.

    For every GPU index in ``range(cfg.NUM_GPUS)`` the blob ``gpu_<i>/lr`` is
    fed a one-element float32 array holding ``new_lr``. Afterwards the update
    history is corrected by ``new_lr / cur_lr`` when all of the following hold:
    ``cfg.SOLVER.SCALE_MOMENTUM`` is truthy, ``cur_lr`` exceeds ``1e-7`` and
    the value returned by ``_get_lr_change_ratio`` exceeds
    ``cfg.SOLVER.SCALE_MOMENTUM_THRESHOLD``.

    Args:
        cur_lr: Learning rate in effect before this call.
        new_lr: Learning rate to install.
    """
    for gpu_id in range(cfg.NUM_GPUS):
        with c2_utils.CudaScope(gpu_id):
            lr_blob_name = 'gpu_{}/lr'.format(gpu_id)
            lr_value = np.array([new_lr], dtype=np.float32)
            workspace.FeedBlob(lr_blob_name, lr_value)

    # The ratio is computed unconditionally, before any config flag is read.
    ratio = _get_lr_change_ratio(cur_lr, new_lr)
    needs_momentum_fix = (
        cfg.SOLVER.SCALE_MOMENTUM
        # The tiny-LR guard also keeps the division below away from zero.
        and cur_lr > 1e-7
        and ratio > cfg.SOLVER.SCALE_MOMENTUM_THRESHOLD
    )
    if needs_momentum_fix:
        self._CorrectMomentum(new_lr / cur_lr)
def _do_broadcast(all_blobs):
    """Copy the first replica of each blob onto the remaining replicas.

    ``all_blobs`` is split into ``cfg.NUM_GPUS`` equally sized consecutive
    groups; the k-th entries of all groups are treated as replicas of one
    blob. The first replica is fetched from the workspace and fed into each
    of the others under ``CudaScope(1)``, ``CudaScope(2)``, and so on.

    NOTE(review): this presumes ``all_blobs`` is ordered group-by-group in
    ascending GPU order, starting at GPU 0 -- confirm against callers.

    Args:
        all_blobs: Sequence of blob names/references whose length is a
            multiple of ``cfg.NUM_GPUS``.
    """
    assert len(all_blobs) % cfg.NUM_GPUS == 0, (
        'Unexpected value for NUM_GPUS. Make sure you are not '
        'running single-GPU inference with NUM_GPUS > 1.'
    )
    blobs_per_gpu = len(all_blobs) // cfg.NUM_GPUS
    for offset in range(blobs_per_gpu):
        # Every blobs_per_gpu-th entry starting at `offset` is one replica set.
        replicas = list(all_blobs[offset::blobs_per_gpu])
        source_blob = replicas[0]
        data = workspace.FetchBlob(source_blob)
        logger.debug('Broadcasting {} to'.format(str(source_blob)))
        # Replica k (k >= 1) is fed under the scope of GPU k.
        for gpu_id, target_blob in enumerate(replicas[1:], start=1):
            logger.debug(' |-> {}'.format(str(target_blob)))
            with c2_utils.CudaScope(gpu_id):
                workspace.FeedBlob(target_blob, data)
def scale_momentum(scale, model):
    """Multiply each parameter's ``<param>_momentum`` blob by ``scale``.

    Used for LR warm-up in distributed training: when the learning rate
    changes after warm-up, the momentum has to be updated to match.

    The scaling is applied in place (the ``Scale`` operator reads and writes
    the same blob) once per device in the range
    ``[cfg.ROOT_DEVICE_ID, cfg.ROOT_DEVICE_ID + cfg.NUM_DEVICES)``.

    Args:
        scale: Multiplicative factor passed to the ``Scale`` operator.
        model: Object exposing ``GetParams()``; each returned parameter is
            expected to have a matching ``<param>_momentum`` blob.
    """
    logger.info('Scaling momentum: {}'.format(scale))
    first_device = cfg.ROOT_DEVICE_ID
    device_count = cfg.NUM_DEVICES
    for device_id in range(first_device, first_device + device_count):
        with c2_utils.CudaScope(device_id):
            # Parameters are queried inside the device scope on every pass.
            for param in model.GetParams():
                momentum_blob = param + '_momentum'
                scale_op = core.CreateOperator(
                    'Scale',
                    [momentum_blob],
                    [momentum_blob],
                    scale=scale,
                )
                workspace.RunOperatorOnce(scale_op)
def add_variable_stepsize_lr(
    curr_iter,
    num_devices,
    lr_iters,
    start_model_iter,
    model=None,
    prev_checkpointed_lr=None,
):
    """Compute the step-schedule learning rate and feed it to every device.

    The rate is ``cfg.SOLVER.BASE_LR * cfg.SOLVER.GAMMA ** k``, where ``k`` is
    the number of leading entries of ``lr_iters`` that are strictly smaller
    than ``curr_iter``; the result is then passed through
    ``check_and_apply_warmup``. If momentum scaling is enabled and the rate
    differs from the previous one, ``scale_momentum`` is called with
    ``new / previous``. Finally the blobs ``gpu_<idx>/lr`` and
    ``gpu_<idx>/lr_x`` (the rate times ``cfg.SOLVER.LR_FACTOR``) are fed for
    each device, and the module-level ``CURRENT_LR`` is updated.

    Args:
        curr_iter: Current training iteration.
        num_devices: Number of devices to feed, counted from
            ``cfg.ROOT_DEVICE_ID``.
        lr_iters: Non-empty sequence of iteration thresholds at which the
            rate is decayed.
        start_model_iter: Iteration at which this run started.
        model: Forwarded to ``scale_momentum`` when momentum is rescaled.
        prev_checkpointed_lr: Learning rate restored from a checkpoint, or
            ``None``.

    Returns:
        The learning rate now stored in ``CURRENT_LR``.
    """
    global CURRENT_LR

    # When resuming from a checkpoint, adopt the checkpointed LR, but only on
    # the very first iteration of this run.
    if prev_checkpointed_lr is not None and (curr_iter == start_model_iter):
        CURRENT_LR = prev_checkpointed_lr

    # Count how many decay boundaries have already been passed.
    if curr_iter <= lr_iters[0]:
        gamma_pow = 0
    else:
        gamma_pow = 0
        for boundary in lr_iters:
            if not boundary < curr_iter:
                break
            gamma_pow += 1

    decay = math.pow(cfg.SOLVER.GAMMA, gamma_pow)
    learning_rate = cfg.SOLVER.BASE_LR * decay
    learning_rate = check_and_apply_warmup(curr_iter, learning_rate)
    first_device = cfg.ROOT_DEVICE_ID
    new_lr = learning_rate

    # On iteration 1 there is no earlier rate, so compare against the new one.
    prev_lr = new_lr if curr_iter == 1 else CURRENT_LR

    if cfg.SOLVER.SCALE_MOMENTUM and (not new_lr == prev_lr):
        scale_momentum(new_lr / float(prev_lr), model)

    CURRENT_LR = new_lr
    for device_id in range(first_device, first_device + num_devices):
        with c2_utils.CudaScope(device_id):
            workspace.FeedBlob(
                'gpu_{}/lr'.format(device_id),
                np.array(learning_rate, dtype=np.float32),
            )
            scaled_lr = learning_rate * cfg.SOLVER.LR_FACTOR
            workspace.FeedBlob(
                'gpu_{}/lr_x'.format(device_id),
                np.array(scaled_lr, dtype=np.float32),
            )
    return CURRENT_LR
def _CorrectMomentum(self, correction):
    """Rescale every trainable parameter's momentum blob by ``correction``.

    The MomentumSGDUpdate op computes the update V as

        V := mu * V + lr * grad,

    with mu the momentum factor, lr the learning rate and grad the
    stochastic gradient. Because V is therefore not independent of the
    learning rate (as it ideally would be), a change of learning rate should
    be accompanied by scaling the update history V so that it stays
    comparable in scale with ``lr * grad``.

    Args:
        correction: Multiplicative factor (new lr / old lr) applied in place
            to each ``<param>_momentum`` blob on every GPU.
    """
    logger.info(
        'Scaling update history by {:.6f} (new lr / old lr)'.format(
            correction
        )
    )
    for gpu_id in range(cfg.NUM_GPUS):
        with c2_utils.CudaScope(gpu_id):
            for param in self.TrainableParams(gpu_id=gpu_id):
                momentum_blob = param + '_momentum'
                # Input and output are the same blob: scale in place.
                scale_op = core.CreateOperator(
                    'Scale',
                    [momentum_blob],
                    [momentum_blob],
                    scale=correction,
                )
                workspace.RunOperatorOnce(scale_op)
def _add_allreduce_graph(model):
    """Add Allreduce ops over the per-GPU gradients to ``model.net``.

    Needed when training with more than one GPU: the gradients of each
    replicated parameter are all-reduced across GPUs, using
    ``NCCLAllreduce`` when ``cfg.USE_NCCL`` is set and ``muji.Allreduce``
    otherwise. The ops are added under ``CudaScope(0)``.

    Args:
        model: Model exposing ``TrainableParams()``, ``param_to_grad`` and
            ``net``. The number of trainable parameters must be a multiple
            of ``cfg.NUM_GPUS``.
    """
    all_params = model.TrainableParams()
    assert len(all_params) % cfg.NUM_GPUS == 0
    # Parameters are replicated on each GPU, so this is the number of
    # distinct parameter blobs (i.e. the count on a single GPU).
    params_per_gpu = len(all_params) // cfg.NUM_GPUS
    with c2_utils.CudaScope(0):
        for offset in range(params_per_gpu):
            # All replicas of the parameter at this offset, one per GPU.
            replicas = all_params[offset::params_per_gpu]
            gradients = [model.param_to_grad[param] for param in replicas]
            if not gradients:
                continue
            if cfg.USE_NCCL:
                model.net.NCCLAllreduce(gradients, gradients)
            else:
                muji.Allreduce(model.net, gradients, reduced_affix='')