def get_static_triple_grad(x,
                           y,
                           x_init=None,
                           dy_init=None,
                           place=None,
                           program=None):
    """
    Get Triple Grad result of static graph.

    The first-order backward pass of ``y`` w.r.t. ``x`` is appended here; the
    resulting gradients are then handed to ``get_static_double_grad`` (with
    all-ones initial values) which appends the two remaining orders and runs
    the program.

    Args:
        x (list[Variable]): input variables to the program. Extended with the
            created gradient variables via ``+=``.
        y (list[Variable]): output variables to the program.
        x_init (list[numpy.array]): the init value for input x. Extended with
            ``dy_init`` via ``+=``.
        dy_init (list[numpy.array]): the init value for the gradient of each
            output in ``y``; indexed per output, so it must provide one entry
            per element of ``y`` even though the default is None.
        place (fluid.CPUPlace or fluid.CUDAPlace): the device. If None,
            ``fluid.CPUPlace()`` is used.
        program (Program|None): a Program with forward pass.
            If None, use fluid.default_main_program().

    Returns:
        A list of numpy array that stores third derivative result calculated
        by static graph.
    """
    if program is None:
        program = fluid.default_main_program()
    # Resolve the default device before the first tensor is written, so the
    # scope helper below never receives None. get_static_double_grad applies
    # the same default.
    if place is None:
        place = fluid.CPUPlace()
    scope = fluid.executor.global_scope()

    y_grads = []
    for i, yi in enumerate(y):
        dyi_name = _append_grad_suffix_(yi.name)
        np_type = dtype_to_np_dtype(yi.dtype)
        dy = program.global_block().create_var(
            name=dyi_name, shape=yi.shape, dtype=np_type, persistable=True)
        dy.stop_gradient = False
        set_var_in_scope(scope, place, dyi_name, dy_init[i])
        y_grads.append(dy)

    # append first order grads
    dx = fluid.gradients(y, x, y_grads)

    # y_grads are the input of first-order backward,
    # so, they are also the input of second-order backward.
    x += y_grads
    x_init += dy_init

    # Seed every first-order gradient with ones for the next differentiation.
    x_grads_grads_init = [
        np.ones(dxi.shape, dtype=dtype_to_np_dtype(dxi.dtype)) for dxi in dx
    ]

    return get_static_double_grad(
        x,
        dx,
        x_init,
        dy_init=x_grads_grads_init,
        place=place,
        program=program)
def _create_loss_op_desc_(loss):
    """
    Build the op desc of the ``fill_constant`` op that writes the gradient
    variable of ``loss``.

    The op fills a ``[2, 1]`` tensor with ``mdu.mpc_one_share`` and is tagged
    with the combined Backward | Loss op role.
    """
    maker = core.op_proto_and_checker_maker
    loss_grad_name = backward._append_grad_suffix_(loss.name)
    role = int(maker.OpRole.Backward) | int(maker.OpRole.Loss)

    attrs = {
        "shape": [2, 1],
        "value": mdu.mpc_one_share,
        "dtype": loss.dtype,
        "force_cpu": False,
        maker.kOpRoleAttrName(): role,
    }
    outputs = {"Out": [loss_grad_name]}
    return backward._create_op_desc_("fill_constant", {}, outputs, attrs)
def _get_stop_gradients(program, no_grad_set):
    """
    Collect the gradient names that must not be computed.

    Args:
        program: the program whose variables are scanned.
        no_grad_set: extra entries to exclude, or None. Non-None values are
            converted with ``_get_no_grad_set_name`` first.

    Returns:
        set: grad-suffixed names of every ``stop_gradient`` variable seen
        before the first variable whose name contains ``"@GRAD"``, plus the
        grad-suffixed names derived from ``no_grad_set``.
    """
    if no_grad_set is None:
        extra_names = set()
    else:
        extra_names = _get_no_grad_set_name(no_grad_set)

    stop_names = set()
    for var in program.list_vars():
        assert isinstance(var, Variable)
        if "@GRAD" in var.name:
            # The scan ends at the first gradient variable; later variables
            # are not inspected.
            break
        if var.stop_gradient:
            stop_names.add(_append_grad_suffix_(var.name))

    stop_names.update([_append_grad_suffix_(name) for name in extra_names])
    return stop_names
def _create_loss_op_desc_(loss):
    """
    Build the op desc of the ``fill_constant`` op that writes the gradient
    variable of ``loss``.

    The fill shape and value depend on the protocol stored in the global
    scope variable ``"mpc_protocol_index"``: ``[1]`` / ``mdu.privc_one_share``
    for ``MpcProtocols.PRIVC``, otherwise ``[2, 1]`` / ``mdu.aby3_one_share``.
    """
    fill_shape = [2, 1]
    fill_value = mdu.aby3_one_share

    protocol_var = fluid.global_scope().find_var("mpc_protocol_index")
    mpc_protocol_index = np.array(protocol_var.get_tensor())
    if MpcProtocols(mpc_protocol_index) is MpcProtocols.PRIVC:
        fill_shape = [1]
        fill_value = mdu.privc_one_share

    maker = core.op_proto_and_checker_maker
    loss_grad_name = backward._append_grad_suffix_(loss.name)
    attrs = {
        "shape": fill_shape,
        "value": fill_value,
        "dtype": loss.dtype,
        "force_cpu": False,
        maker.kOpRoleAttrName():
        int(maker.OpRole.Backward) | int(maker.OpRole.Loss),
    }
    return backward._create_op_desc_("fill_constant", {},
                                     {"Out": [loss_grad_name]}, attrs)
def append_backward(loss,
                    parameter_list=None,
                    no_grad_set=None,
                    callbacks=None,
                    checkpoints=None):
    """
    This function appends backward part to main_program.

    A complete neural network training is made up of forward and backward
    propagation. However, when we configure a network, we only need to
    specify its forward part. This function uses the chain rule to
    automatically generate the backward part according to the forward part.

    In most cases, users do not need to invoke this function manually.
    It will be automatically invoked by the optimizer's `minimize` function.

    Parameters:
        loss( :ref:`api_guide_Variable_en` ): The loss variable of the network.
        parameter_list(list of str, optional): Names of parameters that need
            to be updated by optimizers. If it is None, all parameters will be
            updated. Default: None.
        no_grad_set(set of str, optional): Variable names in the
            :ref:`api_guide_Block_en` 0 whose gradients should be ignored. All
            variables with `stop_gradient=True` from all blocks will be
            automatically added into this set. If this parameter is not None,
            the names in this set will be added to the default set.
            Default: None.
        callbacks(list of callable object, optional): List of callback
            functions. The callbacks are used for doing some custom jobs
            during backward part building. All callable objects in it will be
            invoked once each time a new gradient operator is added into the
            program. The callable object must has two input parameters:
            'block' and 'context'. The 'block' is the
            :ref:`api_guide_Block_en` which the new gradient operator will be
            added to. The 'context' is a map, whose keys are gradient variable
            names and values are corresponding original
            :ref:`api_guide_Variable_en` . In addition to this, the 'context'
            has another special key-value pair: the key is string
            '__current_op_desc__' and the value is the op_desc of the gradient
            operator who has just triggered the callable object.
            Default: None.
        checkpoints(optional): Accepted for signature compatibility; this
            implementation does not read it. Default: None.

    Returns:
        list of tuple ( :ref:`api_guide_Variable_en` , :ref:`api_guide_Variable_en` ):
        Pairs of parameter and its corresponding gradients. The first item is
        the parameter and the second is the gradient variable, or None when
        the gradient variable is not present in the block of `loss`.

    Raises:
        AssertionError: If `loss` is not an instance of Variable.
        TypeError: If `callbacks` is neither None nor a list/tuple.
        ValueError: If a recorded gradient variable is missing from its
            gradient block, or no operator producing a gradient is found.

    Examples:
        .. code-block:: python

            import paddle.fluid as fluid
            x = fluid.data(name='x', shape=[None, 13], dtype='float32')
            y = fluid.data(name='y', shape=[None, 1], dtype='float32')

            y_predict = fluid.layers.fc(input=x, size=1, act=None)
            loss = fluid.layers.square_error_cost(input=y_predict, label=y)

            avg_loss = fluid.layers.mean(loss)
            param_grad_list = fluid.backward.append_backward(loss=avg_loss)
            p_g_list1 = fluid.backward.append_backward(loss=avg_loss)  # len(p_g_list1) == 2
            p_g_list2 = fluid.backward.append_backward(loss=avg_loss, parameter_list=[p_g_list1[0][0].name])  # len(p_g_list2) == 1
            p_g_list3 = fluid.backward.append_backward(loss=avg_loss, no_grad_set=set([p_g_list1[0][0].name]))  # len(p_g_list3) == 1
            p_g_list4 = fluid.backward.append_backward(loss=avg_loss, parameter_list=[p_g_list1[0][0].name], no_grad_set=set([p_g_list1[0][0].name]))  # len(p_g_list4) == 0
    """
    assert isinstance(loss, framework.Variable)

    # Validate before the program is touched. The previous bare
    # `isinstance(callbacks, list)` expression discarded its result and
    # therefore never checked anything.
    if callbacks is not None and not isinstance(callbacks, (list, tuple)):
        raise TypeError(
            "callbacks should be a list or tuple of callables, but received "
            "{}".format(type(callbacks)))

    if loss.op is None:
        # the loss is from a cloned program. Find loss op manually.
        backward._find_loss_op_(loss)

    op_maker = core.op_proto_and_checker_maker
    loss.op._set_attr(op_maker.kOpRoleAttrName(),
                      int(op_maker.OpRole.Forward) |
                      int(op_maker.OpRole.Loss))

    program = loss.block.program
    program._appending_grad_times += 1

    # Work on a shallow copy so the caller's set object is never rebound to
    # internal state.
    if no_grad_set is None:
        no_grad_set = set()
    no_grad_set = copy.copy(no_grad_set)
    no_grad_dict = backward._get_stop_gradients_(program)
    no_grad_dict[0].update(
        list(map(backward._append_grad_suffix_, no_grad_set)))

    grad_info_map = dict()
    root_block = program.block(0)

    # Number of forward ops; everything appended after this index belongs to
    # the backward part.
    fwd_op_num = root_block.desc.op_size()
    current_block_idx = program.current_block_idx
    grad_to_var = dict()

    # Seed the backward pass with the op that fills the loss gradient.
    op_desc = _create_loss_op_desc_(loss)
    root_block.desc.append_op().copy_from(op_desc)

    block_no_grad_set = set(
        map(backward._strip_grad_suffix_, no_grad_dict[0]))
    op_path = backward._find_op_path_(root_block, [loss], [],
                                      block_no_grad_set)
    no_grad_vars = backward._find_no_grad_vars(root_block, op_path, [loss],
                                               block_no_grad_set)
    block_no_grad_set.update(no_grad_vars)
    no_grad_dict[0].update(
        list(map(backward._append_grad_suffix_, block_no_grad_set)))

    input_grad_names_set = None
    # For double backward, input_grad_names is used for filter
    # some non-used gradients op.
    if program._appending_grad_times > 1:
        input_grad_names_set = {backward._append_grad_suffix_(loss.name)}

    backward._append_backward_ops_(
        root_block,
        op_path,
        root_block,
        no_grad_dict,
        grad_to_var,
        callbacks,
        input_grad_names_set=input_grad_names_set)

    # Because calc_gradient may be called multiple times,
    # we need rename the internal gradient variables so that they have
    # different names.
    backward._rename_grad_(root_block, fwd_op_num, grad_to_var, {})

    backward._append_backward_vars_(root_block, fwd_op_num, grad_to_var,
                                    grad_info_map)

    program.current_block_idx = current_block_idx
    program._sync_with_cpp()

    if parameter_list is not None:
        parameters = parameter_list
    else:
        params = list(filter(is_mpc_parameter, program.list_vars()))
        parameters = [param.name for param in params if param.trainable]

    params_and_grads = []
    for param in parameters:
        # Use one key for both the membership test and the lookup; the
        # original tested `cpt.to_text(param)` but indexed with `param`.
        param_key = cpt.to_text(param)
        if param_key not in grad_info_map:
            continue
        grad_info = grad_info_map[param_key]
        grad_name, grad_block = grad_info[0], grad_info[1]
        if not grad_block.has_var(grad_name):
            raise ValueError(
                "grad block[{0}] did not have grad var {1}".format(
                    grad_info[1], grad_info[0]))
        # Get the param var from the global block
        param_var = program.global_block().var(param_key)
        grad_var = grad_block.var(grad_name)
        if loss.block.has_var(grad_name):
            params_and_grads.append((param_var, grad_var))
        else:
            params_and_grads.append((param_var, None))

    op_role_var_attr_name = op_maker.kOpRoleVarAttrName()
    for p, g in params_and_grads:
        if g is None:
            continue
        # Bind the gradient to the last op in the global block that writes it.
        for op in reversed(program.global_block().ops):
            assert isinstance(op, framework.Operator)
            if g.name in op.output_arg_names:
                g.op = op
                break

        if g.op is None:
            raise ValueError("Unexpected branch")
        # Record the (param, grad) pair on the producing op, keeping any
        # pairs already stored there.
        attr_val = [p.name, g.name]
        if g.op.has_attr(op_role_var_attr_name):
            attr_val.extend(g.op.attr(op_role_var_attr_name))
        g.op._set_attr(op_role_var_attr_name, attr_val)

    return params_and_grads
def get_static_double_grad(x,
                           y,
                           x_init=None,
                           dy_init=None,
                           place=None,
                           program=None):
    """
    Get Double Grad result of static graph.

    Args:
        x (list[Variable]): input variables to the program. Extended with the
            created gradient variables via ``+=`` before it is normalized
            with ``_as_list``.
            NOTE(review): the ``+=`` happens before normalization, so a bare
            Variable is presumably not usable here - confirm.
        y (list[Variable]): output variables to the program.
        x_init (list[numpy.array]): the init value for input x. Extended with
            ``dy_init`` via ``+=``.
        dy_init (list[numpy.array]): the init value for the gradient of each
            output in ``y``; indexed per output, so it must provide one entry
            per element of ``y`` even though the default is None.
        place (fluid.CPUPlace or fluid.CUDAPlace): the device. If None,
            ``fluid.CPUPlace()`` is used.
        program (Program|None): a Program with forward pass.
            If None, use fluid.default_main_program().

    Returns:
        A list of numpy array that stores second derivative result calculated
        by static graph.

    Raises:
        ValueError: If ``x_init`` is non-empty and its length differs from
            ``x``, or if every second-order gradient is None.
    """
    if program is None:
        program = fluid.default_main_program()
    # Resolve the default device before the first tensor is written. The
    # default used to be applied only after set_var_in_scope had already been
    # called with `place`.
    if place is None:
        place = fluid.CPUPlace()
    scope = fluid.executor.global_scope()

    y_grads = []
    for i, yi in enumerate(y):
        dyi_name = _append_grad_suffix_(yi.name)
        np_type = dtype_to_np_dtype(yi.dtype)
        dy = program.global_block().create_var(
            name=dyi_name, shape=yi.shape, dtype=np_type, persistable=True)
        dy.stop_gradient = False
        set_var_in_scope(scope, place, dyi_name, dy_init[i])
        y_grads.append(dy)

    # append first order grads
    dx = fluid.gradients(y, x, y_grads)

    # y_grads are the input of first-order backward,
    # so, they are also the input of second-order backward.
    x += y_grads
    x_init += dy_init

    # filter None in dx for DX/DY may be None in kernel
    y = [dxi for dxi in dx if dxi is not None]

    # check input arguments
    x = _as_list(x)
    y = _as_list(y)

    for v in x:
        v.stop_gradient = False
        v.persistable = True

    # init variable in startup program
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    x_init = _as_list(x_init)
    # init inputs if x_init is not None
    if x_init:
        if len(x_init) != len(x):
            raise ValueError('len(x_init) (=%d) is not the same'
                             ' as len(x) (= %d)' % (len(x_init), len(x)))
        # init variable in main program
        for var, arr in zip(x, x_init):
            assert var.shape == arr.shape
        feeds = {k.name: v for k, v in zip(x, x_init)}
        exe.run(program, feed=feeds, scope=scope)

    dys = []
    for yi in y:
        np_type = dtype_to_np_dtype(yi.dtype)
        dy_name = _append_grad_suffix_(yi.name)
        # create dy Variable in Program
        dy = program.global_block().create_var(
            name=dy_name, shape=yi.shape, dtype=np_type, persistable=True)
        # init dy tensor in scope
        value = np.ones(yi.shape, dtype=np_type)
        set_var_in_scope(scope, place, dy_name, value)
        dys.append(dy)

    # append second order backward
    ddx = fluid.gradients(y, x, dys)
    exe = fluid.Executor(place)

    # filter None in dx for DX/DY may be None in kernel
    # only fetch not None dx in exe.run
    filted_ddx = tuple(ddxi for ddxi in ddx if ddxi is not None)
    if not filted_ddx:
        # Previously surfaced as an opaque tuple-unpacking ValueError.
        raise ValueError(
            'all second-order gradients are None; nothing to fetch')
    ddx_res = exe.run(program, scope=scope, fetch_list=filted_ddx)

    return ddx_res
def triple_grad_check(x,
                      y,
                      x_init=None,
                      y_grads=None,
                      x_grads_grads=None,
                      place=None,
                      program=None,
                      eps=1e-6,
                      atol=1e-5,
                      rtol=1e-3,
                      raise_exception=True):
    """
    Check triple gradients. This function will append backward to the
    program before third order gradient check.

    Args:
        x (Variable|list[Variable]): input variables to the program.
        y (Variable|list[Variable]): output variables to the program.
        x_init (numpy.array|list[numpy.array]|None): the init value for input x.
        y_grads (Variable|list[Variable]|None): the gradient variables with
            respect to y. If None, they are created and filled with random
            values; otherwise their current values are read from the scope.
        x_grads_grads (Variable|list[Variable]|None): the gradient variables
            with respect to the first-order gradients. If None, they are
            created and filled with random values; otherwise their current
            values are read from the scope.
        place (fluid.CPUPlace or fluid.CUDAPlace): the device.
            NOTE(review): forwarded unchanged, including None, to the scope
            helpers and to ``grad_check`` - confirm they accept None.
        program (Program|None): a Program with forward pass.
            If None, use fluid.default_main_program().
        eps (float): perturbation for finite differences.
        atol (float): absolute tolerance.
        rtol (float): relative tolerance.
        raise_exception (bool): whether to raise an exception if
            the check fails. Default is True.
            NOTE(review): not forwarded to ``grad_check`` by this function.

    Returns:
        The value returned by ``grad_check`` for the second-order gradients
        (documented as True if all differences satisfy numpy.allclose
        condition).

    Raises:
        ValueError: If every second-order gradient is None.
    """
    # check input arguments
    x = _as_list(x)
    for v in x:
        v.stop_gradient = False
        v.persistable = True
    y = _as_list(y)

    if program is None:
        program = fluid.default_main_program()

    # Fetch the scope once, up front. It used to be bound only inside the
    # `is None` branches, so passing `y_grads` explicitly hit an
    # UnboundLocalError in the `else` branch.
    scope = fluid.executor.global_scope()

    if y_grads is None:
        y_grads = []
        y_grads_init = []
        for yi in y:
            dyi_name = _append_grad_suffix_(yi.name)
            np_type = dtype_to_np_dtype(yi.dtype)
            dy = program.global_block().create_var(
                name=dyi_name, shape=yi.shape, dtype=np_type, persistable=True)
            dy.stop_gradient = False
            v = np.random.random(size=yi.shape).astype(np_type)
            set_var_in_scope(scope, place, dyi_name, v)
            y_grads.append(dy)
            y_grads_init.append(v)
    else:
        y_grads = _as_list(y_grads)
        y_grads_init = [
            var_to_np_array_in_scope(scope, place, v.name) for v in y_grads
        ]

    # append first order grads
    target_grads = fluid.gradients(y, x, y_grads)

    if x_grads_grads is None:
        x_grads_grads = []
        x_grads_grads_init = []
        for dxi in target_grads:
            ddxi_name = _append_grad_suffix_(dxi.name)
            np_type = dtype_to_np_dtype(dxi.dtype)
            ddx = program.global_block().create_var(
                name=ddxi_name,
                shape=dxi.shape,
                dtype=np_type,
                persistable=True)
            ddx.stop_gradient = False
            v = np.random.random(size=dxi.shape).astype(np_type)
            set_var_in_scope(scope, place, ddxi_name, v)
            x_grads_grads.append(ddx)
            x_grads_grads_init.append(v)
    else:
        x_grads_grads = _as_list(x_grads_grads)
        x_grads_grads_init = [
            var_to_np_array_in_scope(scope, place, v.name)
            for v in x_grads_grads
        ]

    # y_grads are inputs of the first-order backward, so they are inputs of
    # the higher-order passes as well.
    x += y_grads
    x_init = _as_list(x_init)
    x_init += y_grads_init

    # append second order grads
    target_grads_grads = fluid.gradients(target_grads, x, x_grads_grads)

    # filter None in target_grads_grads for Dy/Dx may be None in kernel
    filted_target_grads_grads = tuple(
        dyi for dyi in target_grads_grads if dyi is not None)
    if not filted_target_grads_grads:
        # Previously surfaced as an opaque tuple-unpacking ValueError.
        raise ValueError(
            'all second-order gradients are None; nothing to check')

    x += x_grads_grads
    x_init += x_grads_grads_init

    # x <=> [x, dout, ddx]
    return grad_check(
        x=x,
        y=filted_target_grads_grads,
        x_init=x_init,
        place=place,
        program=program,
        eps=eps,
        atol=atol,
        rtol=rtol)
def double_grad_check(x,
                      y,
                      x_init=None,
                      y_grads=None,
                      place=None,
                      program=None,
                      eps=1e-6,
                      atol=1e-5,
                      rtol=1e-3,
                      raise_exception=True):
    """
    Check gradients of gradients. This function will append backward to the
    program before second order gradient check.

    Args:
        x (Variable|list[Variable]): input variables to the program.
        y (Variable|list[Variable]): output variables to the program.
        x_init (numpy.array|list[numpy.array]|None): the init value for input x.
        y_grads (Variable|list[Variable]|None): the gradient variables with
            respect to y. If None, they are created and filled with random
            values; otherwise their current values are read from the scope.
        place (fluid.CPUPlace or fluid.CUDAPlace): the device.
            NOTE(review): forwarded unchanged, including None, to the scope
            helpers and to ``grad_check`` - confirm they accept None.
        program (Program|None): a Program with forward pass.
            If None, use fluid.default_main_program().
        eps (float): perturbation for finite differences.
        atol (float): absolute tolerance.
        rtol (float): relative tolerance.
        raise_exception (bool): whether to raise an exception if
            the check fails. Default is True.
            NOTE(review): not forwarded to ``grad_check`` by this function.

    Returns:
        The value returned by ``grad_check`` for the first-order gradients
        (documented as True if all differences satisfy numpy.allclose
        condition).
    """
    # check input arguments
    x = _as_list(x)
    for v in x:
        v.stop_gradient = False
        v.persistable = True
    y = _as_list(y)

    if program is None:
        program = fluid.default_main_program()

    # Fetch the scope once, up front. It used to be bound only inside the
    # `y_grads is None` branch, so passing `y_grads` explicitly hit an
    # UnboundLocalError in the `else` branch.
    scope = fluid.executor.global_scope()

    if y_grads is None:
        y_grads = []
        y_grads_init = []
        for yi in y:
            dyi_name = _append_grad_suffix_(yi.name)
            np_type = dtype_to_np_dtype(yi.dtype)
            dy = program.global_block().create_var(
                name=dyi_name, shape=yi.shape, dtype=np_type, persistable=True)
            dy.stop_gradient = False
            v = np.random.random(size=yi.shape).astype(np_type)
            set_var_in_scope(scope, place, dyi_name, v)
            y_grads.append(dy)
            y_grads_init.append(v)
    else:
        y_grads = _as_list(y_grads)
        y_grads_init = [
            var_to_np_array_in_scope(scope, place, v.name) for v in y_grads
        ]

    # append first order grads
    target_grads = fluid.gradients(y, x, y_grads)

    # y_grads are the input of first-order backward,
    # so, they are also the input of second-order backward.
    x += y_grads
    x_init = _as_list(x_init)
    x_init += y_grads_init

    return grad_check(x, target_grads, x_init, place, program, eps, atol,
                      rtol)
def _compute_analytical_jacobian(program, x, y, place, scope):
    """Computes the analytical Jacobian for dy/dx.

    The gradient variable of ``y`` is created in the program and initialised
    to zeros. For each element of ``y`` in turn, that element of the gradient
    tensor is set to 1, the program is run to fetch the non-None gradients of
    ``x``, the flattened results are stored as one Jacobian column, and the
    element is reset to 0.

    Args:
        program (Program): a Program with forward pass.
        x (Variable|list[Variable]): a variable or list of variable
        y (Variable): the target variable.
        place (fluid.CPUPlace or fluid.CUDAPlace): the device.
        scope (Scope): the scope used to run program.

    Returns:
        A list of 2-D numpy array. The list length is len(x).
        Each 2-D numpy array represents the Jacobian for dy/dx_i.
        It has "xi_size" rows and "dy_size" columns
        where "x_size" is the number of elements in x_i and
        "dy_size" is the number of elements in y.

    Raises:
        TypeError: If ``y`` is not a Variable.
    """
    if not isinstance(y, fluid.framework.Variable):
        raise TypeError('y is not Variable')

    grad_name = _append_grad_suffix_(y.name)
    np_type = dtype_to_np_dtype(y.dtype)

    # Declare the output gradient in the program, then append backward ops.
    grad_var = program.global_block().create_var(
        name=grad_name, shape=y.shape, dtype=np_type, persistable=True)
    dx = fluid.gradients(y, x, grad_var)

    # The output gradient starts as all zeros in the scope.
    zeros = np.zeros(y.shape, dtype=np_type)
    grad_tensor = set_var_in_scope(scope, place, grad_name, zeros)

    exe = fluid.Executor(place)

    y_size = _product(y.shape)
    x = _as_list(x)
    jacobian = make_jacobian(x, y_size, np_type)

    # Gradients may be None for some inputs; only the present ones are
    # fetched, and their original positions are kept alongside.
    present = [(pos, grad) for pos, grad in enumerate(dx) if grad is not None]
    present_idx, present_dx = zip(*present)

    for col in six.moves.xrange(y_size):
        _set_item(grad_tensor, col, 1, np_type)

        fetched = exe.run(program, scope=scope, fetch_list=present_dx)

        for slot, dx_idx in enumerate(present_idx):
            result = fetched[slot]
            if result is None:
                column = np.zeros(dx[dx_idx].shape, dtype=np_type).flatten()
            else:
                column = result.flatten()
            jacobian[dx_idx][:, col] = column

        _set_item(grad_tensor, col, 0, np_type)

    return jacobian
def build_net(self, cond_i):
    """
    Build the network, run the optimizer on it and return the fetch names.

    pseudo code:
        sum_xy = x + y
        sub_yz = y - z
        if i > 1:
            internal = y + z
            sum_cond = internal + z
        else:
            sum_cond = y + z
        sum_all = sum_xy + sub_yz + sum_cond
        mean_out = mean(sum_all)
        optimizer.minimize(mean_out)

    Returns:
        list of str: the parameter names ("param_y" is left out when
        ``self.y_no_grad`` is truthy) followed by the grad-suffixed form of
        each of those names.
    """

    def _make_param(name, init_value):
        # float32 parameter of shape self.shape, initialised from a numpy array
        return fluid.layers.create_parameter(
            dtype="float32",
            shape=self.shape,
            attr=fluid.ParamAttr(learning_rate=self.param_lr, name=name),
            default_initializer=fluid.initializer.NumpyArrayInitializer(
                init_value))

    param_x = _make_param("param_x", self.x)
    param_y = _make_param("param_y", self.y)
    param_z = _make_param("param_z", self.z)

    sum_xy = fluid.layers.elementwise_add(param_x, param_y, name='sum_xy')
    sub_yz = fluid.layers.elementwise_sub(param_y, param_z, name='sub_yz')
    # Result is never consumed; presumably present to add an op that does not
    # contribute to the loss - confirm.
    fluid.layers.fc(param_x, size=1, name='fc_useless')

    def cond_true():
        inner_sum = fluid.layers.elementwise_add(
            param_y, param_z, name='sum_cond_yz')
        # param_y will not be updated
        param_y.stop_gradient = self.y_no_grad
        branch_out = fluid.layers.elementwise_add(
            inner_sum, param_z, name='sum_cond_true')
        # unused product inside the branch
        fluid.layers.elementwise_mul(param_x, param_y)
        return branch_out

    def cond_false():
        branch_out = fluid.layers.elementwise_add(
            param_y, param_z, name='sum_cond_false')
        # unused product inside the branch
        fluid.layers.elementwise_mul(param_z, param_z)
        return branch_out

    cond_i = fluid.layers.assign(np.array([cond_i], dtype='float32'))
    sum_cond = fluid.layers.cond(cond_i > 1.0, cond_true, cond_false)
    sum_all = fluid.layers.sum([sum_xy, sub_yz, sum_cond])
    mean_out = fluid.layers.mean(sum_all)
    self.optimizer.minimize(mean_out)

    if self.y_no_grad:
        param_names = ["param_x", "param_z"]
    else:
        param_names = ["param_x", "param_y", "param_z"]
    grad_names = [_append_grad_suffix_(name) for name in param_names]
    return param_names + grad_names