def update(self):
    """
    Update the Exponential Moving Average of the parameters.

    Appends the EMA update ops to the default main program, so it
    should only be called while building the train program.
    """
    # Collect (ema, master_ema) pairs for fp16 params; their cast ops are
    # appended after this loop (see the second loop below).
    param_master_emas = []
    for param, tmp in self._params_tmps:
        with param.block.program._optimized_guard(
            [param, tmp]), name_scope('moving_average'):
            param_ema = self._ema_vars[param.name]
            if param.name + '.master' in self._ema_vars:
                # fp16 param: its EMA is kept in an fp32 "master" var.
                # NOTE(review): the decay update of master_ema is not
                # appended here — presumably done elsewhere; confirm.
                master_ema = self._ema_vars[param.name + '.master']
                param_master_emas.append([param_ema, master_ema])
            else:
                # ema = decay * ema + (1 - decay) * param
                ema_t = param_ema * self._decay_var + param * (
                    1 - self._decay_var)
                layers.assign(input=ema_t, output=param_ema)

    # for fp16 params: cast the fp32 master EMA back into the fp16 EMA var
    for param_ema, master_ema in param_master_emas:
        default_main_program().global_block().append_op(
            type="cast",
            inputs={"X": master_ema},
            outputs={"Out": param_ema},
            attrs={
                "in_dtype": master_ema.dtype,
                "out_dtype": param_ema.dtype
            })
def apply_gradients(self, params_grads):
    """
    Run the underlying gradient application inside a single
    ``_optimized_guard`` covering every parameter and gradient.

    Args:
        params_grads: list of (parameter, gradient) pairs.
    """
    # Interleave every pair into one flat [p0, g0, p1, g1, ...] list so
    # the guard spans all of them at once.
    guarded_vars = [var for pair in params_grads for var in pair]
    with guarded_vars[0].block.program._optimized_guard(
            guarded_vars), name_scope("optimizer"):
        self._apply_gradients_impl(params_grads)
def _append_decoupled_weight_decay(self, block, param_and_grad):
    """
    Add decoupled weight decay op.
        parameter = parameter - parameter * coeff * lr

    Args:
        block: block in which variable is to be created
        param_and_grad: (parameters, gradients) pairs,
            the parameters need to decay.
    Raises:
        Exception: The type of coeff and parameter is not consistent.
    """
    if isinstance(param_and_grad, dict):
        param_and_grad = self._update_param_group(param_and_grad)
    param, grad = param_and_grad

    # Honor the user-supplied filter: skip params it rejects.
    if self._apply_decay_param_fun is not None \
            and not self._apply_decay_param_fun(param.name):
        return

    if isinstance(self._learning_rate, float):
        learning_rate = self._learning_rate
    else:
        # NOTE. We add this function to the _append_optimize_op(),
        # for we must make sure _create_param_lr() be called after
        # optimizer._create_global_learning_rate().
        learning_rate = self._create_param_lr(param_and_grad)

    with block.program._optimized_guard(
            [param, grad]), framework.name_scope('weight decay'):
        self._params_name.add(param.name)

        # If it has been calculated, the result will be reused.
        # NOTE(wangxi): In dygraph mode, apply_gradient will be executed
        # every step, so need clear _lr_to_coeff every step,
        # we do this in _create_optimization_pass
        decay_coeff = self._lr_to_coeff.get(learning_rate, None)
        if decay_coeff is None:
            # NOTE(wangxi): for pipeline to set device:all
            with paddle.static.device_guard(None):
                # param * (1 - lr * coeff) == param - param * coeff * lr
                decay_coeff = 1.0 - learning_rate * self._coeff
                self._lr_to_coeff[learning_rate] = decay_coeff

        # For multi-precision fp16 params, decay the fp32 master weight
        # instead of the fp16 param itself.
        find_master = (self._multi_precision and
                       param.dtype == core.VarDesc.VarType.FP16)
        if find_master:
            master_weight = self._master_weights[param.name]
            scaled_param = master_weight * decay_coeff
            paddle.fluid.layers.assign(
                input=scaled_param, output=master_weight)
        else:
            scaled_param = param * decay_coeff
            paddle.fluid.layers.assign(input=scaled_param, output=param)
def __init__(self,
             decay=0.999,
             thres_steps=None,
             zero_debias=False,
             name=None):
    """
    Build the EMA bookkeeping: EMA vars for every trainable parameter in
    the default main program, plus two standalone programs — one that
    swaps EMA values into the parameters (``apply_program``) and one that
    restores the originals (``restore_program``).

    Args:
        decay: EMA decay rate.
        thres_steps: optional step threshold used by ``_get_ema_decay``.
        zero_debias: if True, bias-correct the EMA by ``1 - decay^t``
            when applying it.
        name: optional prefix for the created temp variable names.
    """
    self._decay = decay
    self._thres_steps = thres_steps
    self._name = name if name is not None else ''
    self._decay_var = self._get_ema_decay()

    # One temp var per averaged parameter, used to stash the original
    # parameter value while the EMA value is applied.
    self._params_tmps = []
    for param in default_main_program().global_block().all_parameters():
        if param.do_model_average != False:
            tmp = param.block.create_var(
                name=unique_name.generate(".".join(
                    [self._name + param.name, 'ema_tmp'])),
                dtype=param.dtype,
                persistable=False,
                stop_gradient=True)
            self._params_tmps.append((param, tmp))

    self._ema_vars = {}
    for param, tmp in self._params_tmps:
        with param.block.program._optimized_guard(
            [param, tmp]), name_scope('moving_average'):
            self._ema_vars[param.name] = self._create_ema_vars(param)

    # Program that copies param -> tmp, then EMA -> param.
    self.apply_program = Program()
    block = self.apply_program.global_block()
    with program_guard(main_program=self.apply_program):
        decay_pow = self._get_decay_pow(block)
        for param, tmp in self._params_tmps:
            param = block._clone_variable(param)
            tmp = block._clone_variable(tmp)
            ema = block._clone_variable(self._ema_vars[param.name])
            layers.assign(input=param, output=tmp)
            # bias correction
            if zero_debias:
                ema = ema / (1.0 - decay_pow)
            layers.assign(input=ema, output=param)

    # Program that copies tmp -> param, undoing apply_program.
    self.restore_program = Program()
    block = self.restore_program.global_block()
    with program_guard(main_program=self.restore_program):
        for param, tmp in self._params_tmps:
            tmp = block._clone_variable(tmp)
            param = block._clone_variable(param)
            layers.assign(input=tmp, output=param)
def _create_param_lr(self, param_and_grad):
    """
    Create the learning rate for one parameter.

    Args:
        param_and_grad: (parameter, gradient) pair; only the parameter is
            used. Its ``optimize_attr['learning_rate']`` may be a float
            multiplier or a Variable.

    Returns:
        The learning rate to use for this parameter: the attribute itself
        if it is already a Variable, the global learning rate when the
        multiplier is 1.0, otherwise ``global_lr * multiplier`` built
        under the LR-schedule guard.
    """
    # create learning rate variable for every parameter
    param = param_and_grad[0]
    param_lr = param.optimize_attr['learning_rate']
    # isinstance (not an exact type() comparison) so Variable subclasses
    # are recognized as already being a learning-rate tensor.
    if isinstance(param_lr, Variable):
        return param_lr
    # A multiplier of 1.0 means "use the global learning rate as-is".
    if param_lr == 1.0:
        return self._global_learning_rate()
    with fluid.default_main_program()._lr_schedule_guard(
            is_with_opt=True), framework.name_scope(
                'scale_with_param_lr'):
        return self._global_learning_rate() * param_lr
def minimize(self,
             loss,
             startup_program=None,
             parameter_list=None,
             no_grad_set=None):
    """
    Compute gradients, apply decoupled weight decay to each parameter
    (``param <- param - param * coeff``), then run the optimizer update.

    Args:
        loss: the loss tensor to minimize.
        startup_program: program for variable initialization.
        parameter_list: optional subset of parameters to update.
        no_grad_set: optional set of variables excluded from backprop.

    Returns:
        (optimize_ops, params_grads) as produced by ``apply_optimize``
        and ``backward``.
    """
    params_grads = self.backward(
        loss=loss,
        startup_program=startup_program,
        parameter_list=parameter_list,
        no_grad_set=no_grad_set)

    # Subtract the pre-scaled decay term from each parameter in place.
    for param, grad, scaled in self._scale_parameters(params_grads):
        with param.block.program._optimized_guard(
                [param, grad]), framework.name_scope('weight decay'):
            decayed = paddle.fluid.layers.elementwise_sub(
                x=param, y=scaled)
            paddle.fluid.layers.assign(input=decayed, output=param)

    optimize_ops = self.apply_optimize(
        loss=loss,
        params_grads=params_grads,
        startup_program=startup_program)
    return optimize_ops, params_grads
def _scale_parameters(self, params_and_grads):
    """
    Adds weight decay ops.
        scaled_parameter = parameter * coeff

    Args:
        params_and_grads: A list of (parameters, gradients) pairs,
            the parameters need to decay.
    Returns:
        A list of (parameter, gradient, parameter * coeff) triples for
        the parameters that should decay; empty when coeff is 0.0.
    Raises:
        Exception: The type of coeff and parameter is not consistent.
    """
    # coeff == 0.0 disables decay entirely. Return an empty list (not
    # None) so callers such as minimize() can always iterate the result.
    if isinstance(self._coeff, float) and self._coeff == 0.0:
        return []

    scaled_params = []
    for param, grad in params_and_grads:
        # If no gradient then we don't need to do anything
        if grad is None:
            continue
        if self._apply_decay_param_fun is not None \
                and not self._apply_decay_param_fun(param.name):
            continue

        if isinstance(self._coeff, float):
            # The message must format param.dtype: self._coeff is a float
            # here and has no .dtype (the old code raised AttributeError
            # instead of the intended AssertionError).
            # NOTE(review): rejecting FP32 params for a float coeff looks
            # inverted — confirm which dtypes are actually intended.
            assert param.dtype is not paddle.fluid.core.VarDesc.VarType.FP32, \
                "the type of coeff(float) and parameter(%s) is not consistent."%(param.dtype)
        else:
            assert self._coeff.dtype == param.dtype, \
                "the type of coeff(%s) and parameter(%s) is not consistent."%(self._coeff.dtype, param.dtype)

        with param.block.program._optimized_guard(
                [param, grad]), framework.name_scope('weight decay'):
            # Each parameter may be scaled at most once per pass.
            assert param.name not in self._params_name
            scaled_params.append((param, grad, param * self._coeff))
            self._params_name.add(param.name)
    return scaled_params
def fp16_compression(param_and_grads):
    """
    Compress fp32 gradients to fp16 during allreduce.

    Inserts a fp32->fp16 cast after each eligible gradient (so the
    allreduce runs on fp16 data) and a fp16->fp32 cast afterwards,
    rewriting the op-role-var attributes so the passes pick up the
    casted gradient instead of the original.

    Args:
        param_and_grads: list of (parameter, gradient) pairs.

    Returns:
        A list of (parameter, gradient) pairs where compressed gradients
        have been replaced by their fp16->fp32 round-trip vars.
    """
    op_maker = core.op_proto_and_checker_maker

    new_param_and_grads = []  # param, grad, is_cast
    # cast grad from fp32->fp16 before allreduce,
    for param, grad in param_and_grads:
        # Only fp32 gradients are compressed; pass others through.
        if grad is None or grad.dtype != core.VarDesc.VarType.FP32:
            new_param_and_grads.append((param, grad, False))
            continue

        op = grad.op
        block = grad.block
        var_attr = op.all_attrs()[op_maker.kOpRoleVarAttrName()]
        if param.name not in var_attr:
            new_param_and_grads.append((param, grad, False))
            continue

        # remove (param, grad) from op_role_var
        var_attr.remove(param.name)
        var_attr.remove(grad.name)
        if len(var_attr) > 1:
            op._set_attr(op_maker.kOpRoleVarAttrName(), var_attr)
        else:
            op._remove_attr(op_maker.kOpRoleVarAttrName())

        new_grad = block.create_var(
            name=unique_name.generate(grad.name + ".cast_fp16"),
            dtype=core.VarDesc.VarType.FP16,
            persistable=False,
            stop_gradient=True)

        with block.program._backward_role_guard():
            cast_op = block.append_op(type="cast",
                                      inputs={"X": grad},
                                      outputs={"Out": new_grad},
                                      attrs={
                                          "in_dtype":
                                          core.VarDesc.VarType.FP32,
                                          "out_dtype":
                                          core.VarDesc.VarType.FP16
                                      },
                                      stop_gradient=True)

            # Mark the cast as a backward op carrying the (param, grad)
            # role so allreduce passes treat new_grad as the gradient.
            backward = op_maker.OpRole.Backward
            cast_op._set_attr(op_maker.kOpRoleAttrName(), backward)
            cast_op._set_attr(op_maker.kOpRoleVarAttrName(),
                              [param.name, new_grad.name])
            new_grad.op = cast_op

        new_param_and_grads.append((param, new_grad, True))

    ret_param_and_grads = []
    # cast grad from fp16->fp32 after allreduce.
    # NOTE. Now we split fp16 compression into two for loops,
    # if we do not separate them, fuse allreduce will wrong.
    # This must be the problem of fuse allreduce pass, need
    # fixed in future.
    for param, grad, cast in new_param_and_grads:
        if not cast:
            ret_param_and_grads.append((param, grad))
            continue

        block = grad.block
        new_grad = block.create_var(
            name=unique_name.generate(grad.name + ".cast_fp32"),
            dtype=core.VarDesc.VarType.FP32,
            persistable=False,
            stop_gradient=True)

        with block.program._optimized_guard(
                [param, grad]), framework.name_scope('fp16_allreduce'):
            cast_op = block.append_op(type="cast",
                                      inputs={"X": grad},
                                      outputs={"Out": new_grad},
                                      attrs={
                                          "in_dtype":
                                          core.VarDesc.VarType.FP16,
                                          "out_dtype":
                                          core.VarDesc.VarType.FP32
                                      },
                                      stop_gradient=True)
        ret_param_and_grads.append((param, new_grad))

    return ret_param_and_grads
def _create_optimization_pass(self, parameters_and_grads):
    """Add optimization operators to update gradients to tensors.

    Args:
      parameters_and_grads(list(tuple(Tensor, Tensor))):
        a list of (tensor, gradient) pair to update.

    Returns:
      return_op_list: a list of operators that will complete one step of
        optimization. This will include parameter update ops, global step
        update ops and any other custom ops required by subclasses to manage
        their internal state.
    """
    # This is a default implementation of create_optimization_pass that
    # can be shared by most optimizers. This implementation assumes that
    # the subclass will implement the _append_optimize_op method and the
    # _initialize_tensors method. The subclass can extend the
    # _create_accumulators method if it needs to create accumulators
    # for parameters and extend _finish_update method to add custom ops.

    # Allways called under program_guard use global block as loss block
    # But if current block is in control flow, append optimize op in the
    # grad block of current block

    global_block = framework.default_main_program().global_block()
    target_block = global_block
    current_block = framework.default_main_program().current_block()
    if current_block.idx != global_block.idx:
        assert current_block.backward_block_idx != -1, \
            "current block is not global_block, but it doesn't have backward block."
        target_block = framework.default_main_program().blocks[
            current_block.backward_block_idx]

    # Remember where we start so only the ops appended below are returned.
    start = len(target_block.ops)
    self.helper = LayerHelper(self.__class__.__name__)

    self._update_param_device_map(parameters_and_grads, target_block)
    self._create_accumulators(
        target_block,
        [p[0] for p in parameters_and_grads if p[0].trainable])
    self._create_global_learning_rate()

    if framework.in_dygraph_mode():
        # Dygraph: execute the optimize op eagerly, no guards needed.
        for param_and_grad in parameters_and_grads:
            if param_and_grad[1] is None:
                continue
            if param_and_grad[0].trainable is True:
                self._append_optimize_op(target_block, param_and_grad)
    else:
        # Static graph: append each op under the optimized guard, pinned
        # to the device recorded for the parameter.
        for param_and_grad in parameters_and_grads:
            if param_and_grad[1] is None:
                continue
            with param_and_grad[0].block.program._optimized_guard(
                    param_and_grad), name_scope("optimizer"):
                if param_and_grad[0].trainable is True:
                    device = self._get_device_for_param(param_and_grad[0]
                                                        .name)
                    with device_guard(device):
                        optimize_op = self._append_optimize_op(
                            target_block, param_and_grad)

    # Get custom finish ops for subclasses
    # FIXME: Need to fix this once we figure out how to handle dependencies
    self._finish_update(target_block, parameters_and_grads)

    end = len(target_block.ops)
    return target_block._slice_ops(start, end)