def grad(outputs, inputs, grad_outputs=None, grad_inputs=None, set_grad=False,
         retain_grad=False, enable_double_backprop=False):
    """Computes the gradient of output variables w.r.t.\\ the input variables.

    This function implements the backpropagation algorithm. While
    :meth:`Variable.backward` also implements backprop, this function selects the
    smallest paths in the computational graph needed to compute the gradients
    w.r.t. inputs. The error is backpropagated only through these selected paths,
    which may reduce the overall computational cost.

    This function also differs from :meth:`Variable.backward` in the way to return
    the gradients; it directly returns the gradient variables as a list instead of
    setting gradients to the :attr:`Variable.grad_var` attribute of the original
    variable. It means users do not need to clear the gradient w.r.t. each variable
    before computing the gradient using this function. If the ``set_grad`` option is
    set to ``True``, the computed gradient is also stored in the
    :attr:`Variable.grad_var` attribute of each variable, in which case any original
    value of :attr:`Variable.grad_var` will be updated even if it had already been
    set.

    Args:
        outputs: A sequence of output variables from which backprop starts.
        inputs: A sequence of input variables each of which this function computes
            the gradient w.r.t.
        grad_outputs: A sequence of variables that gives the initial value of each
            output gradient. If an element is set to ``None``, an array filled with
            1 is used. If this argument itself is ``None``, it is treated as a
            sequence of ``None``\\ s.
        grad_inputs: A sequence of variables that gives the initial value of each
            input gradient. The gradients computed by the backprop algorithm are
            accumulated to them (not in-place). If an element is set to ``None``,
            the gradient is not accumulated to this value. If this argument itself
            is ``None``, it is treated as a sequence of ``None``\\ s.
        set_grad (bool): If it is ``True``, the :attr:`Variable.grad_var` attribute
            of each input variable is set to the corresponding computed gradient
            variable.
        retain_grad (bool): If it is ``True``, the gradients w.r.t. all the
            intermediate variables are stored in the :attr:`Variable.grad_var`
            attribute. In this case, the ``set_grad`` option is ignored.
        enable_double_backprop (bool): If it is ``True``, the computed gradients can
            be further backpropagated. Enabling it may increase the memory
            consumption (and possibly the computational time) to remember the
            intermediate gradient values for the second backpropagation.

    Returns:
        A list of gradient variables w.r.t. the inputs.

    """
    # The implementation consists of three steps.

    # 1. Backward enumeration: all the nodes reachable backward from the output
    #    nodes are enumerated. The forward direction links are collected in this
    #    step. Note that the variable nodes whose requires_grad is false are
    #    ignored and their creators are not searched.
    candidate_funcs = [v.creator_node for v in outputs
                       if v.creator_node is not None]
    visited_funcs = set()
    forward_graph = collections.defaultdict(list)
    while candidate_funcs:
        func = candidate_funcs.pop()
        if func in visited_funcs:
            continue
        visited_funcs.add(func)
        for x in func.inputs:
            if not x.requires_grad:
                continue
            forward_graph[x].append(func)
            creator = x.creator_node
            if creator is not None and creator not in visited_funcs:
                candidate_funcs.append(creator)

    # 2. Forward enumeration: all the nodes in the subgraph reachable from the
    #    input nodes are enumerated. The extracted (sub-)subgraph is the union of
    #    all paths that backpropagation will visit.
    candidate_vars = [x.node for x in inputs]
    visited_funcs = set()
    grad_required = set()
    while candidate_vars:
        x = candidate_vars.pop()
        grad_required.add(x)
        for func in forward_graph[x]:
            if func in visited_funcs:
                continue
            visited_funcs.add(func)
            for y_ref in func.outputs:
                y = y_ref()
                if y is not None and y in forward_graph:
                    candidate_vars.append(y)

    # 3. Backpropagation: the backpropagation is executed along the
    #    (sub-)subgraph. It uses the topological order of the subgraph which is
    #    induced by the reversed order of function applications ("rank").
    grads = {}  # mapping from variable nodes to their gradients

    # Initialize the gradient mapping.
    if grad_outputs is None:
        grad_outputs = (None,) * len(outputs)
    for y, gy in zip(outputs, grad_outputs):
        if gy is None:
            with cuda.get_device_from_array(y.data) as device:
                if device is cuda.DummyDevice:
                    gy_data = numpy.ones_like(y.data)
                else:
                    gy_data = cuda.cupy.ones_like(y.data)
            gy = variable.Variable(gy_data, requires_grad=False)
        grads[y.node] = gy

    if grad_inputs is not None:
        for x, gx in zip(inputs, grad_inputs):
            if gx is not None:
                grads[x.node] = gx

    # Backprop implementation. It edits grads which will only contain the
    # gradients w.r.t. the inputs.
    with chainer.using_config('enable_backprop', enable_double_backprop):
        _backprop(outputs, inputs, grad_required, retain_grad, grads)

    # Extract the gradients w.r.t. the inputs and return them.
    ret = [grads.get(x.node, None) for x in inputs]
    if set_grad:
        for x, gx in zip(inputs, ret):
            x.grad_var = gx
    return ret
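
# Hedged usage sketch (added for illustration; not part of the original source).
# It exercises grad() as documented above on a tiny graph. The variable names and
# the toy computation are assumptions, not code from this file.
import numpy as np
import chainer
import chainer.functions as F

x = chainer.Variable(np.array([1.0, 2.0, 3.0], dtype=np.float32))
w = chainer.Variable(np.array([0.5, 0.5, 0.5], dtype=np.float32))
y = F.sum(x * w)  # scalar output

# grad() returns the gradients as a list instead of writing them to x.grad_var.
gx, gw = chainer.grad([y], [x, w])
print(gx.data)  # dy/dx == w -> [0.5, 0.5, 0.5]
print(gw.data)  # dy/dw == x -> [1.0, 2.0, 3.0]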
def apply(self, inputs): """Computes output variables and grows the computational graph. Basic behavior is expressed in the documentation of :class:`FunctionNode`. .. note:: If the :data:`~Variable.data` attribute of input variables exist on a GPU device, that device is made current before calling :meth:`forward`, so implementors do not need to take care of device selection in most cases. Args: inputs: Tuple of input variables. Each element can be either :class:`~chainer.Variable`, :class:`numpy.ndarray`, or :class:`cupy.ndarray`. If the element is an ndarray, it is automatically wrapped with :class:`~chainer.Variable`. Returns: A tuple of output :class:`~chainer.Variable` objects. """ chainerx_in_data = None chainerx_device = None is_chainerx, in_data = _extract_apply_in_data(inputs) if is_chainerx: # Try ChainerX C++ implementation. # If it's supported, the output arrays are wrapped with Variables # and returned. # If not supported, FunctionNode.forward_chainerx should return # Fallback. # In that case the input arrays are converted to numpy.ndarray # or cupy.ndarray (depending on the ChainerX backend) and # forward computation falls back to the conventional # FunctionNode.forward() implementaion. outputs = self.forward_chainerx(in_data) if outputs is not chainer.Fallback: # Supported. Wrap with variables and return assert isinstance(outputs, tuple) return tuple([ variable.Variable._init_unchecked( y, requires_grad=y.is_backprop_required(), is_chainerx_array=True) for y in outputs ]) # Fall back to FunctionNode.forward() chainerx_in_data, in_data, chainerx_device = ( self._chainerx_apply_fallback_preprocess(in_data, inputs)) self._is_chainerx_fallback_mode = True self.chainerx_device = chainerx_device utils._check_arrays_forward_compatible(in_data, self.label) is_debug = chainer.is_debug() if is_debug: # Keep stack trace for debug self.stack = traceback.extract_stack() if configuration.config.type_check: self._check_data_type_forward(in_data) hooks = chainer.get_function_hooks() if self._n_local_function_hooks > 0: hooks = collections.OrderedDict(hooks) hooks.update(self.local_function_hooks) hooks = hooks.values() # avoid six for performance for hook in hooks: hook.forward_preprocess(self, in_data) # Forward propagation with cuda.get_device_from_array(*in_data): self._input_indexes_to_retain = None self._output_indexes_to_retain = None if chainer.config.schedule_func is not None: outputs = static_forward_optimizations(self, in_data) elif self._is_chainerx_fallback_mode: # In ChainerX fallback, __class__ is temporarily replaced with # the fabricated one with automatic attirbute fallback. with _chainerx_attribute_fallback(self, chainerx_device): outputs = self.forward(in_data) else: # In normal case, simply run the forward method. 
outputs = self.forward(in_data) # Check for output array types if not isinstance(outputs, tuple): raise TypeError('forward output must be a tuple ({})\n' 'Actual: {}'.format(self.label, type(outputs))) if not chainer.is_arrays_compatible(outputs): raise TypeError( 'incompatible array types are mixed in the forward output ' '({}).\n' 'Actual: {}'.format(self.label, ', '.join(str(type(x)) for x in outputs))) for hook in hooks: hook.forward_postprocess(self, in_data) # NaN check of output values if is_debug: if any(chainer.backend._contains_nan(out) for out in outputs): msg = ('NaN is detected on forward computation of ' '{}'.format(self.label)) raise RuntimeError(msg) self._output_count = len(outputs) if self._is_chainerx_fallback_mode: ret = self._chainerx_apply_fallback_postprocess( chainerx_in_data, inputs, outputs) else: input_vars = [chainer.as_variable(x) for x in inputs] requires_grad = any([x.requires_grad for x in input_vars]) ret = tuple([ variable.Variable(y, requires_grad=requires_grad) for y in outputs ]) if configuration.config.enable_backprop: # Topological ordering self.rank = max([x.rank for x in input_vars]) if input_vars else 0 # Add backward edges for y in ret: y.creator_node = self self.inputs = tuple([x.node for x in input_vars]) # Add forward edges (must be weak references) self.outputs = tuple([weakref.ref(y.node) for y in ret]) if self._input_indexes_to_retain is not None: for index in self._input_indexes_to_retain: input_vars[index].retain_data() if self._output_indexes_to_retain is not None: retained_data = [] for index in self._output_indexes_to_retain: ret[index].retain_data() retained_data.append(outputs[index]) self._retained_output_data = tuple(retained_data) self.lazy_grad_sum = configuration.config.lazy_grad_sum return ret
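
# Hedged sketch (added for illustration; not in the original source): a minimal
# FunctionNode subclass exercised through apply() as described above. The class name
# MulConstant and the constant 2.0 are arbitrary assumptions.
import numpy as np
from chainer import FunctionNode, Variable

class MulConstant(FunctionNode):
    """Multiplies the input array by a fixed constant."""

    def __init__(self, k):
        self.k = k

    def forward(self, inputs):
        x, = inputs
        return x * self.k,  # forward() must return a tuple of arrays

    def backward(self, indexes, grad_outputs):
        gy, = grad_outputs
        return gy * self.k,  # gradients are Variables in FunctionNode

x = Variable(np.array([1.0, 2.0], dtype=np.float32))
y, = MulConstant(2.0).apply((x,))  # apply() wraps the outputs in Variables
y.grad = np.ones_like(y.data)
y.backward()
print(x.grad)  # -> [2., 2.]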
def check_backward(func, x_data, y_grad, params=(),
                   eps=1e-3, atol=1e-5, rtol=1e-4):
    """Test backward procedure of a given function.

    This function automatically checks the backward process of a given function.
    For example, when you have a :class:`~chainer.Function` class ``MyFunc``,
    that gets two arguments and returns one value, you can make its test like
    this::

    >> def test_my_func(self):
    >>   func = MyFunc()
    >>   x1_data = xp.array(...)
    >>   x2_data = xp.array(...)
    >>   gy_data = xp.array(...)
    >>   check_backward(func, (x1_data, x2_data), gy_data)

    This method creates :class:`~chainer.Variable` objects with ``x_data`` and
    calls ``func`` with the :class:`~chainer.Variable`\\ s to get its result as
    :class:`~chainer.Variable`. Then, it sets ``y_grad`` array to ``grad``
    attribute of the result and calls ``backward`` method to get gradients of
    the inputs. To check correctness of the gradients, the function calls
    :func:`numerical_grad` to calculate numerically the gradients and compares
    the types of gradients with :func:`assert_allclose`. If input objects
    (``x1_data`` or/and ``x2_data`` in this example) represent integer
    variables, their gradients are ignored.

    You can simplify a test when ``MyFunc`` gets only one argument::

    >> check_backward(func, x1_data, gy_data)

    If ``MyFunc`` is a loss function which returns a zero-dimensional array,
    pass ``None`` to ``gy_data``. In this case, it sets ``1`` to ``grad``
    attribute of the result::

    >> check_backward(my_loss_func, (x1_data, x2_data), None)

    If ``MyFunc`` returns multiple outputs, pass all gradients for outputs as a
    tuple::

    >> gy1_data = xp.array(...)
    >> gy2_data = xp.array(...)
    >> check_backward(func, x1_data, (gy1_data, gy2_data))

    You can also test a :class:`~chainer.Link`. To check gradients of parameters
    of the link, set a tuple of the parameters to ``params`` arguments::

    >> check_backward(my_link, (x1_data, x2_data), gy_data,
    >>                (my_link.W, my_link.b))

    Note that ``params`` are not ``ndarray``\\ s, but
    :class:`~chainer.Variable`\\ s.

    Function objects are acceptable as ``func`` argument::

    >> check_backward(lambda x1, x2: f(x1, x2),
    >>                (x1_data, x2_data), gy_data)

    .. note::

       ``func`` is called many times to get numerical gradients for all inputs.
       This function doesn't work correctly when ``func`` behaves randomly as it
       gets different gradients.

    Args:
        func (callable): A function which gets :class:`~chainer.Variable`\\ s and
            returns :class:`~chainer.Variable`\\ s. ``func`` must return a tuple
            of :class:`~chainer.Variable`\\ s or one :class:`~chainer.Variable`.
            You can use a :class:`~chainer.Function` object, a
            :class:`~chainer.Link` object or a function satisfying the
            condition.
        x_data (ndarray or tuple of ndarrays): A set of ``ndarray``\\ s to be
            passed to ``func``. If ``x_data`` is one ``ndarray`` object, it is
            treated as ``(x_data,)``.
        y_grad (ndarray or tuple of ndarrays or None): A set of ``ndarray``\\ s
            representing gradients of return-values of ``func``. If ``y_grad``
            is one ``ndarray`` object, it is treated as ``(y_grad,)``. If
            ``func`` is a loss-function, ``y_grad`` should be set to ``None``.
        params (~chainer.Variable or tuple of ~chainer.Variable): A set of
            :class:`~chainer.Variable`\\ s whose gradients are checked. When
            ``func`` is a :class:`~chainer.Link` object, set its parameters as
            ``params``. If ``params`` is one :class:`~chainer.Variable` object,
            it is treated as ``(params,)``.
        eps (float): Epsilon value to be passed to :func:`numerical_grad`.
        atol (float): Absolute tolerance to be passed to :func:`assert_allclose`.
        rtol (float): Relative tolerance to be passed to :func:`assert_allclose`.

    See: :func:`numerical_grad`
    """
    x_data = _as_tuple(x_data)
    if y_grad is not None:
        y_grad = _as_tuple(y_grad)
    params = _as_tuple(params)

    xs = [variable.Variable(x) for x in x_data]
    y = func(*xs)
    y = _as_tuple(y)

    if y_grad is not None:
        if len(y) != len(y_grad):
            raise ValueError(
                '`y_grad` must have the same length of output values')
        for iy, igy in zip(y, y_grad):
            iy.grad = igy
    else:
        if len(y) != 1:
            raise ValueError(
                'When `y_grad` is `None`, the function must return a '
                'zero-dimensional array')
        y_grad = (1,)

    # We only need to call `backward` for one result `Variable`.
    # `Variable.backward` method calls `Function.backward` of its creator.
    y[0].backward()

    def f():
        ys = func(*xs)
        ys = _as_tuple(ys)
        return tuple(y.data for y in ys)

    for x in xs:
        if x.data.dtype.kind == 'f':
            gx, = numerical_grad(f, (x.data,), y_grad, eps=eps)
            assert_allclose(gx, x.grad, atol=atol, rtol=rtol)
            assert gx.dtype is x.grad.dtype
        else:
            assert x.grad is None

    for p in params:
        gp, = numerical_grad(f, (p.data,), y_grad, eps=eps)
        assert_allclose(gp, p.grad, atol=atol, rtol=rtol)
        assert gp.dtype is p.grad.dtype
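
# Hedged usage sketch (added; not from the original file): checking a simple
# function with check_backward as described in the docstring above. The toy
# function, the array shapes and the loose float32 tolerances are assumptions.
import numpy as np
import chainer.functions as F
from chainer import gradient_check

def my_func(x1, x2):
    return F.exp(x1) + x2 * x2

x1_data = np.random.uniform(-1, 1, (3,)).astype(np.float32)
x2_data = np.random.uniform(-1, 1, (3,)).astype(np.float32)
gy_data = np.random.uniform(-1, 1, (3,)).astype(np.float32)

# Compares the backpropagated gradients against numerical gradients.
gradient_check.check_backward(my_func, (x1_data, x2_data), gy_data,
                              eps=1e-3, atol=1e-3, rtol=1e-3)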
def check_forward(self, x_data):
    x = variable.Variable(x_data)
    y = func(x)
    self.assertEqual(y.data.dtype, x_data.dtype)
    y_expected = func_expected(cuda.to_cpu(x_data), dtype=x_data.dtype)
    testing.assert_allclose(y_expected, y.data, atol=1e-4, rtol=1e-4)
def __call__(self, x, Whx, Wmx, Wmh, Whm):
    """Updates the internal state and returns the LSTM outputs.

    Args:
        x (~chainer.Variable): A new batch from the input sequence.
        Whx, Wmx, Wmh, Whm (~chainer.Variable): Weight matrices passed in
            explicitly (instead of being read from the link's own parameters)
            for the input-to-hidden, input-to-multiplicative,
            hidden-to-multiplicative and multiplicative-to-hidden projections,
            respectively.

    Returns:
        ~chainer.Variable: Outputs of updated LSTM units.

    """
    # if self.upward.has_uninitialized_params:
    #     in_size = x.size // x.shape[0]
    #     self.upward._initialize_params(in_size)
    #     self._initialize_params()
    # if self.upward2.has_uninitialized_params:
    #     in_size = x.size // x.shape[0]
    #     self.upward2._initialize_params(in_size)
    #     self._initialize_params()

    batch = x.shape[0]
    # Whx = self.upward()
    # Wmx = self.upward2()
    factor_in = F.linear(x, Wmx)
    lstm_in = F.linear(x, Whx, self.b)
    h_rest = None
    if self.h is not None:
        h_size = self.h.shape[0]
        if batch == 0:
            h_rest = self.h
        elif h_size < batch:
            msg = ('The batch size of x must be equal to or less than the '
                   'size of the previous state h.')
            raise TypeError(msg)
        elif h_size > batch:
            h_update, h_rest = split_axis.split_axis(self.h, [batch], axis=0)
            # Wmh = self.lateral1()
            mult_in = F.linear(h_update, Wmh)
            mult_out = mult_in * factor_in
            # Whm = self.lateral2()
            lstm_in += F.linear(mult_out, Whm)
        else:
            # Wmh = self.lateral1()
            mult_in = F.linear(self.h, Wmh)
            mult_out = mult_in * factor_in
            # Whm = self.lateral2()
            lstm_in += F.linear(mult_out, Whm)
    if self.c is None:
        xp = self.xp
        self.c = variable.Variable(
            xp.zeros((batch, self.state_size), dtype=x.dtype),
            volatile='auto')
    self.c, y = lstm.lstm(self.c, lstm_in)
    if h_rest is None:
        self.h = y
    elif len(y.data) == 0:
        self.h = h_rest
    else:
        self.h = concat.concat([y, h_rest], axis=0)
    return y
def setUp(self):
    self.x1 = variable.Variable(np.zeros((1, 2)).astype(np.float32))
    self.x2 = variable.Variable(np.zeros((1, 2)).astype(np.float32))
    self.y = self.x1 + self.x2
def setUp(self):
    self.x = variable.Variable(np.zeros((1, 2)).astype(np.float32))
    self.y1 = mock_function((self.x, ), 1)
    self.y2 = mock_function((self.x, ), 1)
def __call__(self, *inputs): """Applies forward propagation with chaining backward references. Basic behavior is expressed in documentation of :class:`Function` class. .. note:: If the :data:`~Variable.data` attribute of input variables exist on GPU device, then, before it calls :meth:`forward` method, the appropriate device is selected, so in most cases implementers do not need to take care of device selection. Args: inputs: Tuple of input :class:`Variable`, :class:`numpy.ndarray` or :class:`cupy.ndarray` objects. If the input is an :class:`numpy.ndarray` or a :class:`cupy.ndarray`, it is automatically wrapped with :class:`Variable`. Returns: One :class:`Variable` object or a tuple of multiple :class:`Variable` objects. """ inputs = [ x if isinstance(x, variable.Variable) else variable.Variable( x, requires_grad=False) for x in inputs ] in_data = tuple([x.data for x in inputs]) requires_grad = any([x.requires_grad for x in inputs]) if chainer.is_debug(): self._stack = traceback.extract_stack() if configuration.config.type_check: self._check_data_type_forward(in_data) hooks = chainer.get_function_hooks() if self._n_local_function_hooks != 0: hooks = collections.OrderedDict(hooks) hooks.update(self.local_function_hooks) for hook in six.itervalues(hooks): hook.forward_preprocess(self, in_data) # Forward prop with cuda.get_device_from_array(*in_data): self._input_indexes_to_retain = None self._output_indexes_to_retain = None outputs = self.forward(in_data) assert type(outputs) == tuple for hook in six.itervalues(hooks): hook.forward_postprocess(self, in_data) if chainer.is_debug(): if any(out.dtype.kind == 'f' and cuda.get_array_module(out).isnan(out).any() for out in outputs): msg = 'NaN is detected on forward computation' raise RuntimeError(msg) ret = tuple([ variable.Variable(y, requires_grad=requires_grad) for y in outputs ]) if configuration.config.enable_backprop: # Topological ordering self.rank = max([x.rank for x in inputs]) if inputs else 0 # Backward edges for y in ret: y.set_creator(self) self.inputs = tuple([x.node for x in inputs]) # Forward edges (must be weak references) self.outputs = tuple([weakref.ref(y.node) for y in ret]) input_indexes_to_retain = self._input_indexes_to_retain if input_indexes_to_retain is None: # input arrays are retained by default input_indexes_to_retain = six.moves.range(len(inputs)) for index in input_indexes_to_retain: inputs[index].retain_data() del self._input_indexes_to_retain output_indexes_to_retain = self._output_indexes_to_retain if output_indexes_to_retain is not None: for index in output_indexes_to_retain: ret[index].retain_data() del self._output_indexes_to_retain if len(ret) == 1: return ret[0] else: return ret
def forward(self, inputs):
    with function.no_backprop_mode():
        xs = [variable.Variable(x) for x in inputs]
        outs = self._call_func(xs)
    return tuple(out.data for out in outs)
def connectionist_temporal_classification(x, t, blank_symbol, input_length=None,
                                          label_length=None, reduce='mean'):
    """Connectionist Temporal Classification loss function.

    Connectionist Temporal Classification (CTC) [Graves2006]_ is a loss function
    of sequence labeling where the alignment between the inputs and target is
    unknown. See also [Graves2012]_.

    The output is a variable whose value depends on the value of the option
    ``reduce``. If it is ``'no'``, it holds the samplewise loss values. If it is
    ``'mean'``, it takes the mean of loss values.

    Args:
        x (sequence of Variable): RNN output at each time. ``x`` must be a list
            of :class:`~chainer.Variable`\\ s. Each element of ``x``, ``x[i]``,
            is a :class:`~chainer.Variable` representing the output of the RNN
            at time ``i``.
        t (Variable): Expected label sequence.
        blank_symbol (int): Index of blank_symbol. This value must be
            non-negative.
        input_length (Variable): Length of valid sequence for each sample of the
            mini batch ``x`` (optional). If ``input_length`` is omitted, all of
            ``x`` is treated as a valid input.
        label_length (Variable): Length of valid sequence for each sample of the
            mini batch ``t`` (optional). If ``label_length`` is omitted, all of
            ``t`` is treated as a valid label.
        reduce (str): Reduction option. Its value must be either ``'mean'`` or
            ``'no'``. Otherwise, :class:`ValueError` is raised.

    Returns:
        ~chainer.Variable: A variable holding a scalar value of the CTC loss.
            If ``reduce`` is ``'no'``, the output variable holds an array whose
            shape is `(B,)` where `B` is the number of samples. If it is
            ``'mean'``, it holds a scalar.

    .. note::
       You need to input ``x`` without applying an activation function (e.g.
       softmax), because this function applies softmax to ``x`` before
       calculating the CTC loss to avoid numerical limitations. You also need to
       apply softmax to the forwarded values before decoding them.

    .. note::
       This function is differentiable only by ``x``.

    .. note::
       This function supports (batch, sequence, 1-dimensional input)-data.

    .. [Graves2006] Alex Graves, Santiago Fernandez, Faustino Gomez, Jurgen
        Schmidhuber, `Connectionist Temporal Classification: Labelling
        Unsegmented Sequence Data with Recurrent Neural Networks
        <ftp://ftp.idsia.ch/pub/juergen/icml2006.pdf>`_

    .. [Graves2012] Alex Graves, `Supervised Sequence Labelling with Recurrent
        Neural Networks <http://www.cs.toronto.edu/~graves/preprint.pdf>`_

    """
    if not isinstance(x, collections.Sequence):
        raise TypeError('x must be a list of Variables')
    if not isinstance(blank_symbol, int):
        raise TypeError('blank_symbol must be non-negative integer.')
    assert blank_symbol >= 0
    assert blank_symbol < x[0].shape[1]
    # This implementation only supports 1-dimensional data.
    # TODO(jnishi): Support d(>1)-dimensional inputs.
    assert len(x[0].shape) == 2

    if input_length is None:
        xp = cuda.get_array_module(x[0].data)
        input_length = variable.Variable(
            xp.full((len(x[0].data),), len(x), dtype=numpy.int32))
    if label_length is None:
        xp = cuda.get_array_module(t.data)
        label_length = variable.Variable(
            xp.full((len(t.data),), len(t.data[0]), dtype=numpy.int32))

    return ConnectionistTemporalClassification(
        blank_symbol, reduce)(input_length, label_length, t, *x)
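
# Hedged usage sketch (added; not part of the original source). Shapes follow the
# docstring above: x is a length-T list of (B, C) RNN outputs and t is a (B, L)
# label array; the concrete sizes and label values are assumptions.
import numpy as np
import chainer
import chainer.functions as F

T, B, C = 4, 2, 5  # time steps, batch size, classes (including the blank)
blank = 0
xs = [chainer.Variable(np.random.randn(B, C).astype(np.float32))
      for _ in range(T)]
t = chainer.Variable(np.array([[1, 2], [3, 4]], dtype=np.int32))

# Pre-softmax activations go in; the function applies softmax internally.
loss = F.connectionist_temporal_classification(xs, t, blank)
print(loss.shape)  # () -- a scalar, since reduce='mean' by default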
def init_hx(self, xs):
    shape = (self.n_layers * self.direction, len(xs), self.out_size)
    with cuda.get_device_from_id(self._device_id):
        hx = variable.Variable(self.xp.zeros(shape, dtype=xs[0].dtype))
    return hx
def g(): # This functions is called twice in `numerical_grad`. # `delta` is `epsilon` or `-epsilon` in these calls. # See the document of `numerical_grad`. def perturb(data, direction): if data is None: assert direction is None return data data = (data.astype(numpy.float64) + delta * direction).astype( data.dtype) if numpy.isscalar(data): data = xp.array(data) return data # Input arrays g_x_vars = [] j = 0 for x_var, no_gx in six.moves.zip(x_vars, no_gxs): if no_gx: g_x_vars.append(x_var) else: data = perturb(casted_data[j], directions[j]) g_x_vars.append( None if data is None else variable.Variable(data)) j += 1 # Parameters for i in range(len(params)): data = perturb(casted_data[j + i], directions[j + i]) if self.is_immutable_params: # Update the parameter array since it is converted into # a Parameter just before calling the func. params_data[i] = data else: # Update the given Parameter in-place since the object is # held by the caller. params[i].array = data # Clear gradients to support func that calls backward inside of # itself. self._clear_grads(g_x_vars) if not self.is_immutable_params: self._clear_grads(params) if self.is_immutable_params: ps = tuple([chainer.Parameter(p) for p in params_data]) ys = func(g_x_vars, ps) else: ys = func(*g_x_vars) ys = _as_tuple(ys) ys_data = tuple([None if y is None else y.array for y in ys]) if xp is chainerx: ys_data = tuple([ None if y is None else y.as_grad_stopped() for y in ys_data ]) if not self.is_immutable_params: for i, param in enumerate(params): param.array = casted_data[j + i] return ys_data
def _directional_numeric_gradients(self, directions, y0_data): device = self.device func = self.func xs = self.xs gys = self.gys params = self.params eps = self.eps no_gxs = self.no_gxs dtype = self.dtype detect_nondifferentiable = self.detect_nondifferentiable params_data = [ p if self.is_immutable_params else p.array for p in params ] xp = device.xp x_vars = [variable.Variable(x, requires_grad=False) for x in xs] xs_filtered = [ x.array for x, no_gx in six.moves.zip(x_vars, no_gxs) if not no_gx ] if dtype is None: casted_data = [x for x in xs_filtered + params_data] else: if numpy.dtype(dtype).kind != 'f': raise ValueError('`dtype` is allowed only float type') # Even skipped variable must have the same dtype. for x, no_gx in six.moves.zip(x_vars, no_gxs): if no_gx and x.array.dtype.kind == 'f': x.array = x.array.astype(dtype, copy=False) casted_data = [ None if x is None else x.astype(dtype, copy=False) for x in xs_filtered + params_data ] delta = xp.array(0., numpy.float64) def g(): # This functions is called twice in `numerical_grad`. # `delta` is `epsilon` or `-epsilon` in these calls. # See the document of `numerical_grad`. def perturb(data, direction): if data is None: assert direction is None return data data = (data.astype(numpy.float64) + delta * direction).astype( data.dtype) if numpy.isscalar(data): data = xp.array(data) return data # Input arrays g_x_vars = [] j = 0 for x_var, no_gx in six.moves.zip(x_vars, no_gxs): if no_gx: g_x_vars.append(x_var) else: data = perturb(casted_data[j], directions[j]) g_x_vars.append( None if data is None else variable.Variable(data)) j += 1 # Parameters for i in range(len(params)): data = perturb(casted_data[j + i], directions[j + i]) if self.is_immutable_params: # Update the parameter array since it is converted into # a Parameter just before calling the func. params_data[i] = data else: # Update the given Parameter in-place since the object is # held by the caller. params[i].array = data # Clear gradients to support func that calls backward inside of # itself. self._clear_grads(g_x_vars) if not self.is_immutable_params: self._clear_grads(params) if self.is_immutable_params: ps = tuple([chainer.Parameter(p) for p in params_data]) ys = func(g_x_vars, ps) else: ys = func(*g_x_vars) ys = _as_tuple(ys) ys_data = tuple([None if y is None else y.array for y in ys]) if xp is chainerx: ys_data = tuple([ None if y is None else y.as_grad_stopped() for y in ys_data ]) if not self.is_immutable_params: for i, param in enumerate(params): param.array = casted_data[j + i] return ys_data gx, = numerical_grad(g, (delta, ), gys, eps=eps, detect_nondifferentiable=detect_nondifferentiable, center_outputs=y0_data, diff_atol=0, diff_rtol=self.rtol) return gx
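
# Hedged sketch (added for illustration): the directional trick used above, in plain
# NumPy. g(delta) = f(x + delta * r) is differentiated by central differences, which
# should match the analytic gradient dotted with r. The quadratic f is an assumption.
import numpy as np

def f(x):
    return 0.5 * (x ** 2).sum()  # analytic gradient is x

x = np.random.randn(5)
r = np.random.randn(5)
r /= np.sqrt((r ** 2).sum())  # unit-norm direction, as in the code above

eps = 1e-3
numeric = (f(x + eps * r) - f(x - eps * r)) / (2 * eps)  # g'(0), numerically
analytic = (x * r).sum()                                 # f'(x) . r, by hand
assert np.allclose(numeric, analytic, atol=1e-6)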
def init_hx_1(self, xs):
    first = xs[0]
    shape = (1 * self.direction, first.shape[0], self.out_size)
    with cuda.get_device_from_id(self._device_id):
        hx = variable.Variable(self.xp.zeros(shape, dtype=first.dtype))
    return hx
def setUp(self):
    self.x1 = variable.Variable(np.zeros((1, 2)).astype(np.float32))
    self.x2 = variable.Variable(np.zeros((1, 2)).astype(np.float32))
    self.y = mock_function((self.x1, self.x2), 1)
def backward(self, inputs, grads):
    with function.force_backprop_mode():
        xs = [variable.Variable(x) for x in inputs]
        outs = self._call_func(xs)
        _DummyFunction(grads)(*outs).backward()
    return tuple(x.grad for x in xs)
def setUp(self):
    self.x1 = variable.Variable(np.zeros((1, 2)).astype(np.float32))
    self.x2 = variable.Variable(np.zeros((1, 2)).astype(np.float32))
    self.y = self.x1 + self.x2
    self.f = self.y.creator
    self.g = c.build_computational_graph((self.y, ))
def apply(self, inputs): """Computes output variables and grows the computational graph. Basic behavior is expressed in the documentation of :class:`FunctionNode`. .. note:: If the :data:`~Variable.data` attribute of input variables exist on a GPU device, that device is made current before calling :meth:`forward`, so implementors do not need to take care of device selection in most cases. Args: inputs: Tuple of input variables. Each element can be either :class:`Variable`, :class:`numpy.ndarray`, or :class:`cupy.ndarray`. If the element is an ndarray, it is automatically wrapped with :class:`Variable`. Returns: A tuple of output :class:`Variable` objects. """ input_vars = [chainer.as_variable(x) for x in inputs] in_data = tuple([x.data for x in input_vars]) requires_grad = any([x.requires_grad for x in input_vars]) if chainer.is_debug(): self.stack = traceback.extract_stack() if configuration.config.type_check: self._check_data_type_forward(in_data) hooks = chainer.get_function_hooks() if self._n_local_function_hooks > 0: hooks = collections.OrderedDict(hooks) hooks.update(self.local_function_hooks) hooks = hooks.values() # avoid six for performance for hook in hooks: hook.forward_preprocess(self, in_data) # Forward propagation with cuda.get_device_from_array(*in_data): self._input_indexes_to_retain = None self._output_indexes_to_retain = None outputs = self.forward(in_data) assert type(outputs) is tuple for hook in hooks: hook.forward_postprocess(self, in_data) # NaN check of output values if chainer.is_debug(): if any(out.dtype.kind == 'f' and cuda.get_array_module(out).isnan(out).any() for out in outputs): msg = ('NaN is detected on forward computation of ' '{}'.format(self.label)) raise RuntimeError(msg) ret = tuple([ variable.Variable(y, requires_grad=requires_grad) for y in outputs ]) if configuration.config.enable_backprop: # Topological ordering self.rank = max([x.rank for x in input_vars]) if input_vars else 0 # Add backward edges for i, y in enumerate(ret): y.creator_node = self self.inputs = tuple([x.node for x in input_vars]) # Add forward edges (must be weak references) self.outputs = tuple([weakref.ref(y.node) for y in ret]) if self._input_indexes_to_retain is not None: for index in self._input_indexes_to_retain: input_vars[index].retain_data() if self._output_indexes_to_retain is not None: retained_data = [] for index in self._output_indexes_to_retain: ret[index].retain_data() retained_data.append(outputs[index]) self._retained_output_data = tuple(retained_data) return ret
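
# Hedged sketch (illustrative addition): retaining an input for use in backward(),
# which is what triggers the retain_data() calls at the end of apply() above. The
# Square example is an assumption, not code from this file.
import numpy as np
from chainer import FunctionNode, Variable

class Square(FunctionNode):
    def forward(self, inputs):
        self.retain_inputs((0,))  # recorded via _input_indexes_to_retain
        x, = inputs
        return x * x,

    def backward(self, indexes, grad_outputs):
        x, = self.get_retained_inputs()
        gy, = grad_outputs
        return 2.0 * x * gy,

x = Variable(np.array([1.0, 3.0], dtype=np.float32))
y, = Square().apply((x,))
y.grad = np.ones(2, dtype=np.float32)
y.backward()
print(x.grad)  # -> [2., 6.]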
def setUp(self):
    self.x1 = variable.Variable(np.zeros((1, 2)).astype('f'))
    self.x2 = variable.Variable(np.zeros((1, 2)).astype('f'))
    self.y = self.x1 + self.x2
    self.f = self.y.creator
    self.g = c.build_computational_graph((self.y, ), remove_variable=True)
def forget(func, *xs): """Calls a function without storing intermediate results. On a forward propagation, Chainer normally stores all intermediate results of :class:`~chainer.variable.VariableNode`\\ s on a computational graph as they are required on backward propagation. Sometimes these results consume too much memory. ``F.forget`` *forgets* such intermediate results on forward propagation, and still supports backpropagation with recalculation. On a forward propagation, ``F.forget`` calls a given function with given variables without creating a computational graph. That means, no intermediate results are stored. On a backward propagation, ``F.forget`` calls the given function again to create a computational graph for backpropagation. ``F.forget`` reduces internal memory usage, whereas it requires more calculation time as it calls the function twice. .. admonition:: Example Let ``f`` be a function defined as: >>> def f(a, b): ... return a + b + a and, ``x`` and ``y`` be :class:`~chainer.Variable`\\ s: >>> x = chainer.Variable(np.random.uniform(-1, 1, 5).astype(np.float32)) >>> y = chainer.Variable(np.random.uniform(-1, 1, 5).astype(np.float32)) When ``z`` is calculated as ``z = f(x, y)``, its intermediate result ``x + y`` is stored in memory. Instead, if you call ``f`` with ``F.forget``: >>> z = F.forget(f, x, y) intermediate ``x + y`` is forgotten. .. note:: ``F.forget`` does not support functions which behave differently in multiple calls with the same inputs, such as :meth:`F.dropout() <chainer.functions.dropout>` and :meth:`F.negative_sampling() <chainer.functions.negative_sampling>`. .. note:: In case input argument variables are of class :class:`numpy.ndarray` or :class:`cupy.ndarray` objects, arguments will automatically be converted to :class:`~chainer.Variable`\\ s. This conversion takes place to ensure that this function is included in the computational graph to enable backward computations. Args: func (callable): A function to call. It needs to be called with :class:`~chainer.Variable` object(s) and to return a :class:`~chainer.Variable` object or a tuple of :class:`~chainer.Variable` objects. xs (~chainer.Variable): Argument variables of the function. Returns: ~chainer.Variable: A variable ``func`` returns. If it returns a tuple, the method returns a tuple too. """ xs = tuple(x if isinstance(x, variable.Variable) else variable.Variable(x, requires_grad=True) for x in xs) y = Forget(func).apply(xs) if len(y) == 1: y, = y return y
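
# Hedged sketch (added for illustration): checking that F.forget yields the same
# gradients as a direct call, at the cost of recomputing the forward pass during
# backward. The toy function f and the array sizes are assumptions.
import numpy as np
import chainer
import chainer.functions as F

def f(a, b):
    return a * b + a

a = chainer.Variable(np.random.randn(5).astype(np.float32))
b = chainer.Variable(np.random.randn(5).astype(np.float32))

z1 = F.sum(f(a, b))
z1.backward()
g_direct = a.grad.copy()
a.cleargrad()
b.cleargrad()

z2 = F.sum(F.forget(f, a, b))  # intermediate results of f are not kept
z2.backward()
assert np.allclose(g_direct, a.grad)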
def _directional_numeric_gradients(self, directions, y0_data): device = self.device func = self.func x_data = self.x_data y_grad = self.y_grad params = self.params eps = self.eps no_grads = self.no_grads dtype = self.dtype detect_nondifferentiable = self.detect_nondifferentiable xp = device.xp x_vars = [variable.Variable(x, requires_grad=False) for x in x_data] variables = ( [x for x, no_grad in six.moves.zip(x_vars, no_grads) if not no_grad] + list(params)) if dtype is None: casted_data = [x.array for x in variables] else: if numpy.dtype(dtype).kind != 'f': raise ValueError('`dtype` is allowed only float type') casted_data = [ x.array.astype(dtype, copy=False) for x in variables] # Even skipped variable must have the same dtype. for x, skip in six.moves.zip(x_vars, no_grads): if skip and x.array.dtype.kind == 'f': x.array = x.array.astype(dtype, copy=False) delta = xp.array(0., numpy.float64) def g(): # This functions is called twice in `numerical_grad`. # `delta` is `epsilon` or `-epsilon` in these calls. # See the document of `numerical_grad`. def perturb(data, direction): data = (data.astype(numpy.float64) + delta * direction).astype(data.dtype) if numpy.isscalar(data): data = xp.array(data) return data # Input arrays g_x_vars = [] j = 0 for i in range(len(x_vars)): if no_grads[i]: g_x_vars.append(x_vars[i]) else: data = perturb(casted_data[j], directions[j]) g_x_vars.append(variable.Variable(data)) j += 1 # Parameters for i in range(len(params)): params[i].data = perturb( casted_data[j + i], directions[j + i]) # Clear gradients to support func that calls backward inside of # itself. self._clear_grads(g_x_vars) self._clear_grads(params) ys = func(*g_x_vars) ys = _as_tuple(ys) ys_data = tuple([None if y is None else y.array for y in ys]) if xp is chainerx: ys_data = tuple([ None if y is None else y.as_grad_stopped() for y in ys_data]) for param, data in six.moves.zip(params, casted_data): param.data = data return ys_data gx, = numerical_grad( g, (delta,), y_grad, eps=eps, detect_nondifferentiable=detect_nondifferentiable, center_outputs=y0_data, diff_atol=0, diff_rtol=self.rtol) return gx
def __call__(self, x, active_len, mask, **kwargs): """__call__(self, x, finetune=False) Invokes the forward propagation of BatchNormalization. In training mode, the BatchNormalization computes moving averages of mean and variance for evaluation during training, and normalizes the input using batch statistics. .. warning:: ``test`` argument is not supported anymore since v2. Instead, use ``chainer.using_config('train', False)``. See :func:`chainer.using_config`. Args: x (Variable): Input variable. finetune (bool): If it is in the training mode and ``finetune`` is ``True``, BatchNormalization runs in fine-tuning mode; it accumulates the input array to compute population statistics for normalization, and normalizes the input using batch statistics. """ argument.check_unexpected_kwargs( kwargs, test='test argument is not supported anymore. ' 'Use chainer.using_config') finetune, = argument.parse_kwargs(kwargs, ('finetune', False)) if hasattr(self, 'gamma'): gamma = self.gamma else: with cuda.get_device_from_id(self._device_id): gamma = variable.Variable( self.xp.ones(self.avg_mean.shape, dtype=x.dtype)) if hasattr(self, 'beta'): beta = self.beta else: with cuda.get_device_from_id(self._device_id): beta = variable.Variable( self.xp.zeros(self.avg_mean.shape, dtype=x.dtype)) if configuration.config.train: if finetune: self.N += 1 decay = 1. - 1. / self.N else: decay = self.decay ret = func_active_bn.batch_normalization( x, gamma, beta, eps=self.eps, running_mean=self.avg_mean, running_var=self.avg_var, decay=decay, active_len=active_len, mask=mask) else: # Use running average statistics or fine-tuned statistics. mean = variable.Variable(self.avg_mean) var = variable.Variable(self.avg_var) ret = func_active_bn.fixed_batch_normalization( x, gamma, beta, mean, var, self.eps) return ret
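
# Hedged sketch (illustrative; it uses the standard chainer.links.BatchNormalization
# rather than the custom variant above, whose active_len/mask arguments are
# project-specific). The channel count and batch size are assumptions.
import numpy as np
import chainer
import chainer.links as L

bn = L.BatchNormalization(3)
x = np.random.randn(8, 3).astype(np.float32)

y_train = bn(x)  # uses batch statistics and updates the running averages
with chainer.using_config('train', False):
    y_test = bn(x)  # uses the accumulated running statistics instead
print(y_train.shape, y_test.shape)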
def forward(self, inputs):
    self.retain_inputs(tuple(range(len(inputs))))
    with function.no_backprop_mode():
        xs = [variable.Variable(x) for x in inputs]
        outs = _call_func(self.func, xs)
    return tuple(out.data for out in outs)
def __call__(self, *inputs): """Applies forward propagation with chaining backward references. Basic behavior is expressed in documentation of :class:`Function` class. .. note:: If the :data:`~Variable.data` attribute of input variables exist on GPU device, then, before it calls :meth:`forward` method, the appropriate device is selected, so in most cases implementers do not need to take care of device selection. Args: inputs: Tuple of input :class:`Variable`, :class:`numpy.ndarray` or :class:`cupy.ndarray` objects. The volatile flags of all input variables must agree. If the input is an :class:`numpy.ndarray` or a :class:`cupy.ndarray`, it is automatically wrapped with :class:`Variable`. Returns: One :class:`Variable` object or a tuple of multiple :class:`Variable` objects. """ inputs = [ x if isinstance(x, chainer.Variable) else chainer.Variable( x, volatile=flag.AUTO) for x in inputs ] in_data = tuple([x.data for x in inputs]) if chainer.is_debug(): self._stack = traceback.extract_stack() if self.type_check_enable: self._check_data_type_forward(in_data) hooks = collections.OrderedDict(chainer.get_function_hooks()) hooks.update(self.local_function_hooks) for hook in six.itervalues(hooks): hook.forward_preprocess(self, in_data) # Forward prop with cuda.get_device(*in_data): outputs = self.forward(in_data) assert type(outputs) == tuple for hook in six.itervalues(hooks): hook.forward_postprocess(self, in_data) if chainer.is_debug(): if any(out.dtype.kind == 'f' and cuda.get_array_module(out).isnan(out).any() for out in outputs): msg = 'NaN is detected on forward computation' raise RuntimeError(msg) out_v = flag.aggregate_flags([x.volatile for x in inputs]) ret = tuple([variable.Variable(y, volatile=out_v) for y in outputs]) if out_v != 'on': # Topological ordering self.rank = max([x.rank for x in inputs]) if inputs else 0 # Backward edges for y in ret: y.set_creator(self) self.inputs = inputs # Forward edges (must be weak references) self.outputs = tuple([weakref.ref(y) for y in ret]) if len(ret) == 1: return ret[0] else: return ret
def check_backward(func, x_data, y_grad, params=(), eps=1e-3, atol=1e-5, rtol=1e-4, no_grads=None, dtype=None, detect_nondifferentiable=False): """Test backward procedure of a given function. This function automatically checks the backward-process of a given function to ensure that the computed gradients are approximately correct. For example, assuming you've defined a :class:`~chainer.FunctionNode` class ``MyFunc``, that takes two arguments and returns one value, you can wrap it in a ordinary function and check its gradient computations as follows:: >> def test_my_func(self): >> >> def func(xs): >> y, = MyFunc().apply(xs) >> return y >> >> x1_data = xp.array(...) >> x2_data = xp.array(...) >> gy_data = xp.array(...) >> check_backward(func, (x1_data, x2_data), gy_data) This method creates :class:`~chainer.Variable` objects with ``x_data`` and calls ``func`` with the :class:`~chainer.Variable`\\ s to get its result as :class:`~chainer.Variable`. Then, it sets ``y_grad`` array to ``grad`` attribute of the result and calls ``backward`` method to get gradients of the inputs. To check correctness of the gradients, the function calls :func:`numerical_grad` to calculate numerically the gradients and compares the types of gradients with :func:`chainer.testing.assert_allclose`. To reduce computational time, it uses directional derivative along a random vector. A function :math:`g: \\mathbb{R} \\rightarrow \\mathbb{R}^n` is defined as :math:`g(\\delta) = f(x + \\delta r)`, where :math:`\\delta \\in \\mathbb{R}`, :math:`r \\in \\mathbb{R}^n` is a random vector and :math:`f` is a function which you want to test. Its gradient is .. math:: g'(\\delta) = f'(x + \\delta r) \\cdot r. Therefore, :math:`g'(0) = f'(x) \\cdot r`. So we can check the correctness of back propagation of :math:`f` indirectly by comparing this equation with the gradient of :math:`g` numerically calculated and that of :math:`f` computed by backprop. If :math:`r` is chosen from uniform distribution, we can conclude with high probability that the gradient of :math:`f` itself is correct. If input objects (``x1_data`` or/and ``x2_data`` in this example) represent integer variables, their gradients are ignored. You can simplify a test when ``MyFunc`` gets only one argument:: >> check_backward(func, x1_data, gy_data) If ``MyFunc`` is a loss function which returns a zero-dimensional array, pass ``None`` to ``gy_data``. In this case, it sets ``1`` to ``grad`` attribute of the result:: >> check_backward(my_loss_func, (x1_data, x2_data), None) If ``MyFunc`` returns multiple outputs, pass all gradients for outputs as a tuple:: >> gy1_data = xp.array(...) >> gy2_data = xp.array(...) >> check_backward(func, x1_data, (gy1_data, gy2_data)) You can also test a :class:`~chainer.Link`. To check gradients of parameters of the link, set a tuple of the parameters to ``params`` arguments:: >> check_backward(my_link, (x1_data, x2_data), gy_data, >> (my_link.W, my_link.b)) Note that ``params`` are not ``ndarray``\\ s, but :class:`~chainer.Variables`\\ s. Function objects are acceptable as ``func`` argument:: >> check_backward(lambda x1, x2: f(x1, x2), >> (x1_data, x2_data), gy_data) .. note:: ``func`` is called many times to get numerical gradients for all inputs. This function doesn't work correctly when ``func`` behaves randomly as it gets different gradients. Args: func (callable): A function which gets :class:`~chainer.Variable`\\ s and returns :class:`~chainer.Variable`\\ s. 
``func`` must returns a tuple of :class:`~chainer.Variable`\\ s or one :class:`~chainer.Variable`. You can use a :class:`~chainer.Function`, :class:`~chainer.FunctionNode` or a :class:`~chainer.Link` object or any other function satisfying the condition. x_data (ndarray or tuple of ndarrays): A set of ``ndarray``\\ s to be passed to ``func``. If ``x_data`` is one ``ndarray`` object, it is treated as ``(x_data,)``. y_grad (ndarray or tuple of ndarrays or None): A set of ``ndarray``\\ s representing gradients of return-values of ``func``. If ``y_grad`` is one ``ndarray`` object, it is treated as ``(y_grad,)``. If ``func`` is a loss-function, ``y_grad`` should be set to ``None``. params (~chainer.Variable or tuple of ~chainder.Variable): A set of :class:`~chainer.Variable`\\ s whose gradients are checked. When ``func`` is a :class:`~chainer.Link` object, set its parameters as ``params``. If ``params`` is one :class:`~chainer.Variable` object, it is treated as ``(params,)``. eps (float): Epsilon value to be passed to :func:`numerical_grad`. atol (float): Absolute tolerance to be passed to :func:`chainer.testing.assert_allclose`. rtol (float): Relative tolerance to be passed to :func:`chainer.testing.assert_allclose`. no_grads (list of bool): Flag to skip variable for gradient assertion. It should be same length as ``x_data``. dtype (~numpy.dtype): ``x_data``, ``y_grad`` and ``params`` are casted to this dtype when calculating numerical gradients. Only float types and ``None`` are allowed. detect_nondifferentiable (bool): If ``True``, check for non-differentiable inputs is enabled. If ``func`` is non-differentiable at ``x_data``, ``check_backward`` raises :class:`~chainer.gradient_check.NondifferentiableError`. .. seealso:: :func:`numerical_grad` """ if dtype is not None and numpy.dtype(dtype).kind != 'f': raise ValueError('`dtype` is allowed only float type') x_data = _as_tuple(x_data) if y_grad is not None: y_grad = _as_tuple(y_grad) params = _as_tuple(params) xs = [variable.Variable(x) for x in x_data] y = func(*xs) y = _as_tuple(y) y0_data = [_.data for _ in y] # All creators of `y` need to be the same because we only call # `y[0].backward` to call `backward` method of the creator. # To do so we need to insert a dummy function `Ident` to the # computational graph. # Note that `func` may not be a `Function` object. y = identity.Identity().apply(y) y_grad = _set_y_grad(y, y_grad) # Clear gradients which may exist if func calls backward inside of itself. _clear_grads(xs) _clear_grads(params) # We only need to call `backward` for one result `Variable`. # `Variable.backward` method calls `Function.backward` of its creator. 
y[0].backward() if no_grads is None: no_grads = [x.dtype.kind != 'f' for x in xs] else: if len(no_grads) != len(xs): raise ValueError( 'Length of no_grads param and xs should be same.\n' 'Actual: {0} != {1}'.format(len(no_grads), len(xs))) for skip, x in six.moves.zip(no_grads, xs): if skip: if x.grad is not None: raise RuntimeError('gradient of int variable must be None') else: if x.grad is None: raise RuntimeError( 'gradients of some arguments are not calculated') if len(xs) - no_grads.count(True) + len(params) == 0: # When there is no float variables, we need not to check gradient # values return variables = _filter_list(xs, no_grads) + list(params) # Keep the gradient arrays of params which may be overwritten by func grads = [x.grad for x in variables] if dtype is None: casted_data = [x.data for x in variables] else: if numpy.dtype(dtype).kind != 'f': raise ValueError('`dtype` is allowed only float type') casted_data = [x.data.astype(dtype, copy=False) for x in variables] # Even skipped variable must have the same dtype. for x, skip in six.moves.zip(xs, no_grads): if skip and x.data.dtype.kind == 'f': x.data = x.data.astype(dtype, copy=False) xp = cuda.get_array_module(*xs) directions = [xp.random.normal(size=x.shape) for x in variables] # Use unit vector norm = math.sqrt(sum([xp.square(d).sum() for d in directions])) if norm != 0: # norm could be zero if input arrays are 0-sized. scale = 1. / norm directions = [d * scale for d in directions] delta = xp.array(0., 'd') def g(): # This functions is called twice in `numerical_grad`. # `delta` is `epsilon` or `-epsilon` in these calls. # See the document of `numerical_grad`. for x, data, direction in six.moves.zip(variables, casted_data, directions): # astype is require to store data with the given type data = (data.astype('d') + delta * direction).astype(data.dtype) if numpy.isscalar(data): data = xp.array(data) x.data = data # Clear gradients to support func that calls backward inside of itself. _clear_grads(xs) _clear_grads(params) ys = func(*xs) ys = _as_tuple(ys) ys_data = tuple(y.data for y in ys) for x, data in six.moves.zip(variables, casted_data): x.data = data return ys_data gx, = numerical_grad(g, (delta, ), y_grad, eps=eps, detect_nondifferentiable=detect_nondifferentiable, center_outputs=y0_data) gx_accum = 0 for g, direction in six.moves.zip(grads, directions): gx_accum += (g.astype('d') * direction).sum() try: testing.assert_allclose(gx, gx_accum, atol=atol, rtol=rtol) except AssertionError as e: f = six.StringIO() f.write('check_backward failed (eps={} atol={} rtol={})\n'.format( eps, atol, rtol)) for i, x_ in enumerate(xs): f.write('inputs[{}]:\n'.format(i)) f.write('{}\n'.format(x_)) for i, gy_ in enumerate(y_grad): f.write('grad_outputs[{}]:\n'.format(i)) f.write('{}\n'.format(gy_)) f.write('gradients (numeric): {}\n'.format(gx)) f.write('gradients (backward): {}\n'.format(gx_accum)) f.write('\n') f.write(str(e)) raise AssertionError(f.getvalue())
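
# Hedged usage sketch (added; not from the original source): the no_grads and dtype
# options documented above. The embedding-like toy function and the shapes are
# assumptions.
import numpy as np
import chainer.functions as F
from chainer import gradient_check

W_data = np.random.randn(6, 4).astype(np.float32)
idx_data = np.array([0, 2, 5], dtype=np.int32)
gy_data = np.random.randn(3, 4).astype(np.float32)

def func(W, idx):
    return F.embed_id(idx, W)

# The integer index input gets no gradient check; numerical differentiation is
# carried out in float64 to reduce rounding error.
gradient_check.check_backward(func, (W_data, idx_data), gy_data,
                              no_grads=[False, True], dtype=np.float64)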
def __call__(self, x, **kwargs): """__call__(self, x, finetune=False) Invokes the forward propagation of BatchNormalization. In training mode, the BatchNormalization computes moving averages of mean and variance for evaluation during training, and normalizes the input using batch statistics. .. warning:: ``test`` argument is not supported anymore since v2. Instead, use ``chainer.using_config('train', False)``. See :func:`chainer.using_config`. Args: x (Variable): Input variable. finetune (bool): If it is in the training mode and ``finetune`` is ``True``, BatchNormalization runs in fine-tuning mode; it accumulates the input array to compute population statistics for normalization, and normalizes the input using batch statistics. """ # check argument argument.check_unexpected_kwargs( kwargs, test='test argument is not supported anymore. ' 'Use chainer.using_config') finetune, = argument.parse_kwargs(kwargs, ('finetune', False)) original_shape = x.shape batch_size = original_shape[0] # reshape input x if batchsize > 1 if batch_size > 1: reshaped_x = functions.expand_dims(x, axis=0) else: reshaped_x = x if hasattr(self, 'gamma'): gamma = self.gamma if self.norm_grad: # gamma.add_batch(batch_size) gamma.n_batch = batch_size else: with cuda.get_device_from_id(self._device_id): gamma = variable.Variable(self.xp.ones( self.avg_mean.shape, dtype=x.dtype)) if hasattr(self, 'beta'): beta = self.beta if self.norm_grad: # beta.add_batch(batch_size) beta.n_batch = batch_size else: with cuda.get_device_from_id(self._device_id): beta = variable.Variable(self.xp.zeros( self.avg_mean.shape, dtype=x.dtype)) #align shapes if x was reshaped if batch_size > 1: mean = self.xp.stack((self.avg_mean,) * batch_size) var = self.xp.stack((self.avg_var,) * batch_size) gamma = functions.stack((gamma,) * batch_size) beta = functions.stack((beta,) * batch_size) else: mean = self.xp.asarray(self.avg_mean) var = self.xp.asarray(self.avg_var) if configuration.config.train: if finetune: self.N += 1 decay = 1. - 1. / self.N else: decay = self.decay func = batch_normalization.BatchNormalizationFunction( self.eps, mean, var, decay) ret = func(reshaped_x, gamma, beta) else: head_ndim = gamma.ndim + 1 axis = (0,) + tuple(range(head_ndim, reshaped_x.ndim)) mean = reshaped_x.data.mean(axis=axis) var = reshaped_x.data.var(axis=axis) ret = functions.fixed_batch_normalization( reshaped_x, gamma, beta, mean, var, self.eps) # ret is normalized input x if batch_size > 1: ret = functions.reshape(ret, original_shape) return ret
def grad(outputs, inputs, grad_outputs=None, grad_inputs=None, set_grad=False, retain_grad=False, enable_double_backprop=False, loss_scale=None): """Computes the gradient of output variables w.r.t.\\ the input variables. This function implements the backpropagation algorithm. While :meth:`Variable.backward` also implements backprop, this function selects the smallest paths in the computational graph needed to compute the gradients w.r.t. inputs. The error is backpropagated only through these selected paths, which may reduce the overall computational cost. This function also differs from :meth:`Variable.backward` in the way to return the gradients; it directly returns the gradient variables as a list instead of setting gradients to the :attr:`Variable.grad_var` attribute of the original variable. It means users do not need to clear the gradient w.r.t. each variable before computing the gradient using this function. If ``set_grad`` option is set to ``True``, the computed gradient is also stored in the :attr:`Variable.grad_var` attribute of each variable, in which case any original value of :attr:`Variable.grad_var` will be updated even if it had already been set. Args: outputs (tuple or list of :class:`~chainer.Variable`): A sequence of output variables from which backprop starts. inputs (tuple or list of :class:`~chainer.Variable`): A sequence of input variables each of which this function computes the gradient w.r.t. grad_outputs (tuple or list of :class:`~chainer.Variable` or None): A sequence of variables that gives the initial value of each output gradient. If an element is set to ``None``, an array filled with 1 is used. If this argument itself is ``None``, it is treated as a sequence of ``None``\\ s. grad_inputs (tuple or list of :class:`~chainer.Variable` or None): A sequence of variables that gives the initial value of each input gradient. The gradients computed by the backprop algorithm are accumulated to them (not in-place). If an element is set to ``None``, the gradient is not accumulated to this value. If this argument itself is ``None``, it is treated as a sequence of ``None``\\ s. set_grad (bool): If it is ``True``, the :attr:`Variable.grad_var` attribute of each input variable is set to the corresponding computed gradient variable. retain_grad (bool): If it is ``True``, the gradients w.r.t. all the intermediate variables are stored in the :attr:`Variable.grad_var` attribute. In this case, the ``set_grad`` option is ignored. enable_double_backprop (bool): If it is ``True``, the computed gradients can be further backpropagated. Enabling it may increase the memory consumption (and possibly the computational time) to remember the intermediate gradient values for the second backpropagation. loss_scale (float): Loss scaling factor. Loss scaling is a usefull technique to mitigate vanishing gradient issue that tends to happen when low precision data type like float16 is used during training. If you set loss scaling factor, gradients of loss values are to be multiplied by the factor before backprop starts. The factor is propagated to whole gradients in a computational graph along the backprop. The gradients of parameters are divided by the factor just before the parameters are to be updated. Returns: A list of gradient variables w.r.t. the inputs. 
""" if not isinstance(outputs, (tuple, list)): raise TypeError('outputs must be a tuple or a list, not {}.'.format( type(outputs))) if not isinstance(inputs, (tuple, list)): raise TypeError('inputs must be a tuple or a list, not {}.'.format( type(inputs))) if not (grad_outputs is None or isinstance(grad_outputs, (tuple, list))): raise TypeError( 'grad_outputs must be a tuple or a list or None, not {}.'.format( type(grad_outputs))) if not (grad_inputs is None or isinstance(grad_inputs, (tuple, list))): raise TypeError( 'grad_inputs must be a tuple or a list or None, not {}.'.format( type(grad_inputs))) for v in outputs: # Raise error here if v is created by Function.backward. # In such case, we don't know exact inputs of the creator. v.node._check_old_style_gradient() # The implementation consists of three steps. # 1. Backward enumeration: all the nodes reachable backward from the output # nodes are enumerated. The forward direction links are collected in # this step. Note that the variable nodes whose requires_grad is false # are ignored and their creators are not searched. candidate_funcs = [ v.creator_node for v in outputs if v.creator_node is not None ] visited_funcs = set() forward_graph = collections.defaultdict(list) while candidate_funcs: func = candidate_funcs.pop() if func in visited_funcs: continue visited_funcs.add(func) for x in func.inputs: # Raise error here if x is created by Function.backward. # In such case, we don't know exact inputs of the creator. x._check_old_style_gradient() if not x.requires_grad: continue forward_graph[x].append(func) creator = x.creator_node if creator is not None and creator not in visited_funcs: candidate_funcs.append(creator) # 2. Forward enumeration: all the nodes in the subgraph reachable from the # input nodes are enumerated. The extracted (sub-)subgraph is the union # of all paths that backpropagation will visit. candidate_vars = [x.node for x in inputs] visited_funcs = set() grad_required = set() while candidate_vars: x = candidate_vars.pop() grad_required.add(x) for func in forward_graph[x]: if func in visited_funcs: continue visited_funcs.add(func) for y_ref in func.outputs: y = y_ref() if y is not None and y in forward_graph: candidate_vars.append(y) # 3. Backpropagation: the backpropagation is executed along the # (sub-)subgraph. It uses the topological order of the subgraph which is # induced by the reversed order of function applications ("rank"). grads = _backprop_utils.GradTable() # Initialize the gradient mapping. if grad_outputs is None: grad_outputs = (None, ) * len(outputs) for y, gy in zip(outputs, grad_outputs): if gy is None: with cuda.get_device_from_array(y.data) as device: if device is cuda.DummyDevice: gy_data = numpy.ones_like(y.data) else: gy_data = cuda.cupy.ones_like(y.data) gy = variable.Variable(gy_data, requires_grad=False) if loss_scale is not None: gy.data *= loss_scale grads[y.node] = gy if grad_inputs is not None: for x, gx in zip(inputs, grad_inputs): if gx is not None: grads[x.node] = gx # Backprop implementation. It edits grads which will only contain the # gradients w.r.t. the inputs. with chainer.using_config('enable_backprop', enable_double_backprop): ret_dict = _backprop(outputs, inputs, grad_required, retain_grad, grads, loss_scale) # Extract the gradients w.r.t. the inputs and return them. ret = [ret_dict[x.node] for x in inputs] if set_grad: for x, gx in zip(inputs, ret): x.grad_var = gx return ret
def check_backward(func, x_data, y_grad, params=(), eps=1e-3, atol=1e-5, rtol=1e-4, no_grads=None, dtype=None): """Test backward procedure of a given function. This function automatically checks the backward process of a given function. For example, when you have a :class:`~chainer.Function` class ``MyFunc`` that gets two arguments and returns one value, you can make its test like this:: >> def test_my_func(self): >> func = MyFunc() >> x1_data = xp.array(...) >> x2_data = xp.array(...) >> gy_data = xp.array(...) >> check_backward(func, (x1_data, x2_data), gy_data) This method creates :class:`~chainer.Variable` objects with ``x_data`` and calls ``func`` with the :class:`~chainer.Variable` s to get its result as :class:`~chainer.Variable`. Then, it sets the ``y_grad`` array to the ``grad`` attribute of the result and calls the ``backward`` method to get gradients of the inputs. To check correctness of the gradients, the function calls :func:`numerical_grad` to numerically calculate the gradients and compares them with the backpropagated gradients using :func:`chainer.testing.assert_allclose`. If input objects (``x1_data`` and/or ``x2_data`` in this example) represent integer variables, their gradients are ignored. You can simplify a test when ``MyFunc`` gets only one argument:: >> check_backward(func, x1_data, gy_data) If ``MyFunc`` is a loss function which returns a zero-dimensional array, pass ``None`` to ``gy_data``. In this case, it sets ``1`` to the ``grad`` attribute of the result:: >> check_backward(my_loss_func, (x1_data, x2_data), None) If ``MyFunc`` returns multiple outputs, pass all gradients for outputs as a tuple:: >> gy1_data = xp.array(...) >> gy2_data = xp.array(...) >> check_backward(func, x1_data, (gy1_data, gy2_data)) You can also test a :class:`~chainer.Link`. To check gradients of parameters of the link, set a tuple of the parameters to the ``params`` argument:: >> check_backward(my_link, (x1_data, x2_data), gy_data, >> (my_link.W, my_link.b)) Note that ``params`` are not ``ndarray`` s, but :class:`~chainer.Variable` s. Function objects are acceptable as the ``func`` argument:: >> check_backward(lambda x1, x2: f(x1, x2), >> (x1_data, x2_data), gy_data) .. note:: ``func`` is called many times to get numerical gradients for all inputs. This function doesn't work correctly when ``func`` behaves randomly as it gets different gradients. Args: func (callable): A function which gets :class:`~chainer.Variable` s and returns :class:`~chainer.Variable` s. ``func`` must return a tuple of :class:`~chainer.Variable` s or one :class:`~chainer.Variable`. You can use a :class:`~chainer.Function` object, a :class:`~chainer.Link` object, or a function satisfying the condition. x_data (ndarray or tuple of ndarrays): A set of ``ndarray`` s to be passed to ``func``. If ``x_data`` is one ``ndarray`` object, it is treated as ``(x_data,)``. y_grad (ndarray or tuple of ndarrays or None): A set of ``ndarray`` s representing gradients of return-values of ``func``. If ``y_grad`` is one ``ndarray`` object, it is treated as ``(y_grad,)``. If ``func`` is a loss function, ``y_grad`` should be set to ``None``. params (~chainer.Variable or tuple of ~chainer.Variable): A set of :class:`~chainer.Variable` s whose gradients are checked. When ``func`` is a :class:`~chainer.Link` object, set its parameters as ``params``. If ``params`` is one :class:`~chainer.Variable` object, it is treated as ``(params,)``. eps (float): Epsilon value to be passed to :func:`numerical_grad`. 
atol (float): Absolute tolerance to be passed to :func:`chainer.testing.assert_allclose`. rtol (float): Relative tolerance to be passed to :func:`chainer.testing.assert_allclose`. no_grads (list of bool): Flags to skip variables for gradient assertion. It should have the same length as ``x_data``. dtype (~numpy.dtype): ``x_data`` and ``y_grad`` are cast to this dtype when calculating numerical gradients. Only float types and ``None`` are allowed. See: :func:`numerical_grad` """ x_data = _as_tuple(x_data) if y_grad is not None: y_grad = _as_tuple(y_grad) params = _as_tuple(params) xs = [variable.Variable(x) for x in x_data] y = func(*xs) y = _as_tuple(y) # All creators of `y` need to be the same because we only call # `y[0].backward` to call the `backward` method of the creator. # To do so we need to insert a dummy function `Identity` into the # computational graph. # Note that `func` may not be a `Function` object. y = identity.Identity()(*y) y = _as_tuple(y) if y_grad is not None: if len(y) != len(y_grad): raise ValueError( '`y_grad` must have the same length as the output values') for iy, igy in six.moves.zip(y, y_grad): iy.grad = igy else: if len(y) != 1: raise ValueError( 'When `y_grad` is `None`, the function must return a ' 'zero-dimensional array') y_grad = (1, ) # We only need to call `backward` for one result `Variable`. # The `Variable.backward` method calls `Function.backward` of its creator. y[0].backward() if dtype is None: casted_xs = [variable.Variable(x) for x in x_data] else: if numpy.dtype(dtype).kind != 'f': raise ValueError('`dtype` must be a float type') if len(params) > 0: raise ValueError('`dtype` is available only if `params` is empty') casted_xs = [ variable.Variable( x.astype(dtype, copy=False) if x.dtype.kind == 'f' else x) for x in x_data ] def f(): ys = func(*casted_xs) ys = _as_tuple(ys) return tuple(y.data for y in ys) if no_grads is None: no_grads = [x.dtype.kind != 'f' for x in xs] else: if len(no_grads) != len(xs): raise ValueError('Length of no_grads param and xs should be the same.') for skip, x, cx in six.moves.zip(no_grads, xs, casted_xs): if skip: assert x.grad is None continue gx, = numerical_grad(f, (cx.data, ), y_grad, eps=eps) testing.assert_allclose(gx, x.grad, atol=atol, rtol=rtol) if dtype is None: assert gx.dtype == x.grad.dtype else: assert gx.dtype.kind == 'f' and gx.dtype == dtype for p in params: gp, = numerical_grad(f, (p.data, ), y_grad, eps=eps) testing.assert_allclose(gp, p.grad, atol=atol, rtol=rtol) assert gp.dtype is p.grad.dtype
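For intuition, the check above boils down to comparing each backpropagated gradient against a finite-difference estimate produced by ``numerical_grad``. A standalone sketch of that comparison, using a hypothetical ``my_func`` and its analytic gradient (both made up for illustration, not taken from the code above):

import numpy

def my_func(x):
    return x ** 3                    # hypothetical function under test

def my_func_grad(x):
    return 3 * x ** 2                # its analytic gradient

x = numpy.float64(1.7)
eps = 1e-3
# Central-difference estimate of the derivative at x.
numerical = (my_func(x + eps) - my_func(x - eps)) / (2 * eps)
assert numpy.allclose(numerical, my_func_grad(x), atol=1e-5, rtol=1e-4)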
def check_backward(func, x_data, y_grad, params=(), eps=1e-3, atol=1e-5, rtol=1e-4, no_grads=None, dtype=None): """Test backward procedure of a given function. This function automatically checks the backward process of a given function. For example, when you have a :class:`~chainer.Function` class ``MyFunc`` that gets two arguments and returns one value, you can make its test like this:: >> def test_my_func(self): >> func = MyFunc() >> x1_data = xp.array(...) >> x2_data = xp.array(...) >> gy_data = xp.array(...) >> check_backward(func, (x1_data, x2_data), gy_data) This method creates :class:`~chainer.Variable` objects with ``x_data`` and calls ``func`` with the :class:`~chainer.Variable` s to get its result as :class:`~chainer.Variable`. Then, it sets the ``y_grad`` array to the ``grad`` attribute of the result and calls the ``backward`` method to get gradients of the inputs. To check correctness of the gradients, the function calls :func:`numerical_grad` to numerically calculate the gradients and compares them with the backpropagated gradients using :func:`chainer.testing.assert_allclose`. To reduce computational time, it uses a function :math:`g: \\mathbb{R} \\rightarrow \\mathbb{R}^n` defined as :math:`g(\\alpha) = f(\\alpha x)`, where :math:`\\alpha \\in \\mathbb{R}` and :math:`f` is the function which you actually want to test. Its gradient is .. math:: g'(\\alpha) = f'(\\alpha x) \\cdot x. When :math:`\\alpha = 1`, :math:`g'(1) = f'(x) \\cdot x`. So :math:`g'(1)` is calculated with :func:`numerical_grad` and compared with the dot product of the gradient of :math:`f` and :math:`x`. If input objects (``x1_data`` and/or ``x2_data`` in this example) represent integer variables, their gradients are ignored. You can simplify a test when ``MyFunc`` gets only one argument:: >> check_backward(func, x1_data, gy_data) If ``MyFunc`` is a loss function which returns a zero-dimensional array, pass ``None`` to ``gy_data``. In this case, it sets ``1`` to the ``grad`` attribute of the result:: >> check_backward(my_loss_func, (x1_data, x2_data), None) If ``MyFunc`` returns multiple outputs, pass all gradients for outputs as a tuple:: >> gy1_data = xp.array(...) >> gy2_data = xp.array(...) >> check_backward(func, x1_data, (gy1_data, gy2_data)) You can also test a :class:`~chainer.Link`. To check gradients of parameters of the link, set a tuple of the parameters to the ``params`` argument:: >> check_backward(my_link, (x1_data, x2_data), gy_data, >> (my_link.W, my_link.b)) Note that ``params`` are not ``ndarray`` s, but :class:`~chainer.Variable` s. Function objects are acceptable as the ``func`` argument:: >> check_backward(lambda x1, x2: f(x1, x2), >> (x1_data, x2_data), gy_data) .. note:: ``func`` is called many times to get numerical gradients for all inputs. This function doesn't work correctly when ``func`` behaves randomly as it gets different gradients. Args: func (callable): A function which gets :class:`~chainer.Variable` s and returns :class:`~chainer.Variable` s. ``func`` must return a tuple of :class:`~chainer.Variable` s or one :class:`~chainer.Variable`. You can use a :class:`~chainer.Function` object, a :class:`~chainer.Link` object, or a function satisfying the condition. x_data (ndarray or tuple of ndarrays): A set of ``ndarray`` s to be passed to ``func``. If ``x_data`` is one ``ndarray`` object, it is treated as ``(x_data,)``. y_grad (ndarray or tuple of ndarrays or None): A set of ``ndarray`` s representing gradients of return-values of ``func``. If ``y_grad`` is one ``ndarray`` object, it is treated as ``(y_grad,)``. 
If ``func`` is a loss function, ``y_grad`` should be set to ``None``. params (~chainer.Variable or tuple of ~chainer.Variable): A set of :class:`~chainer.Variable` s whose gradients are checked. When ``func`` is a :class:`~chainer.Link` object, set its parameters as ``params``. If ``params`` is one :class:`~chainer.Variable` object, it is treated as ``(params,)``. eps (float): Epsilon value to be passed to :func:`numerical_grad`. atol (float): Absolute tolerance to be passed to :func:`chainer.testing.assert_allclose`. rtol (float): Relative tolerance to be passed to :func:`chainer.testing.assert_allclose`. no_grads (list of bool): Flags to skip variables for gradient assertion. It should have the same length as ``x_data``. dtype (~numpy.dtype): ``x_data``, ``y_grad`` and ``params`` are cast to this dtype when calculating numerical gradients. Only float types and ``None`` are allowed. See: :func:`numerical_grad` """ x_data = _as_tuple(x_data) if y_grad is not None: y_grad = _as_tuple(y_grad) params = _as_tuple(params) xs = [variable.Variable(x) for x in x_data] y = func(*xs) y = _as_tuple(y) # All creators of `y` need to be the same because we only call # `y[0].backward` to call the `backward` method of the creator. # To do so we need to insert a dummy function `Identity` into the # computational graph. # Note that `func` may not be a `Function` object. y = identity.Identity().apply(y) y_grad = _set_y_grad(y, y_grad) # Clear gradients which may exist if func calls backward inside of itself. _clear_grads(xs) _clear_grads(params) # We only need to call `backward` for one result `Variable`. # The `Variable.backward` method calls `Function.backward` of its creator. y[0].backward() param_data = [p.data for p in params] if dtype is None: casted_xs = [variable.Variable(x) for x in x_data] else: if numpy.dtype(dtype).kind != 'f': raise ValueError('`dtype` must be a float type') casted_xs = [variable.Variable(x.astype(dtype, copy=False) if x.dtype.kind == 'f' else x) for x in x_data] if no_grads is None: no_grads = [x.dtype.kind != 'f' for x in xs] else: if len(no_grads) != len(xs): raise ValueError( 'Length of no_grads param and xs should be the same.') casted_data = [x.data.copy() for x in casted_xs] for skip, x in six.moves.zip(no_grads, xs): if skip: assert x.grad is None else: if x.grad is None: raise RuntimeError( 'gradients of some arguments are not calculated') # Keep the gradient arrays of params, which may be overwritten by func params_grad = [param.grad for param in params] xp = cuda.get_array_module(*xs) one = xp.array(1., dtype) def g(): # This function is called twice in `numerical_grad`. # `one` is `1 + epsilon` or `1 - epsilon` in these calls. # See the document of `numerical_grad`. for skip, cx, data in six.moves.zip(no_grads, casted_xs, casted_data): if skip: continue # astype is required to store the data with the given type data = (one * data).astype(data.dtype) if numpy.isscalar(data): data = xp.array(data) cx.data = data for param, data in six.moves.zip(params, param_data): if dtype is not None: param_dtype = dtype else: param_dtype = param.dtype # The inner astype is required to calculate __mul__ in # `param_dtype` when the data is a low-accuracy float. # The outer one is required to store the data with the given type. param.data = (one * data.astype(param_dtype)).astype(param_dtype) # Clear gradients to support func that calls backward inside of itself. 
_clear_grads(casted_xs) _clear_grads(params) ys = func(*casted_xs) ys = _as_tuple(ys) ys_data = tuple(y.data for y in ys) for skip, cx, data in six.moves.zip(no_grads, casted_xs, casted_data): if skip: continue cx.data = data for param, data in six.moves.zip(params, param_data): param.data = data return ys_data gx, = numerical_grad(g, (one,), y_grad, eps=eps) gx_accum = 0 for skip, x, cx in six.moves.zip(no_grads, xs, casted_xs): if skip: continue gxi = x.grad.ravel() cxi = cx.data.ravel() if dtype is not None: gxi = gxi.astype(dtype) cxi = cxi.astype(dtype) gx_accum += gxi.dot(cxi) for p, gpi in six.moves.zip(params, params_grad): gpi = gpi.ravel() pi = p.data.ravel() if dtype is not None: gpi = gpi.astype(dtype) pi = pi.astype(dtype) gx_accum += gpi.dot(pi) testing.assert_allclose(gx, gx_accum, atol=atol, rtol=rtol)
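The directional-derivative trick described in the docstring can be checked in isolation: perturb a single scalar alpha in g(alpha) = f(alpha * x) instead of perturbing every element of ``x``, and compare the numerical g'(1) with the analytic dot product f'(x) . x. A numpy-only sketch in which ``f`` and its gradient are illustrative stand-ins, not anything from the code above:

import numpy

def f(x):
    return numpy.sum(numpy.sin(x))       # stand-in for the function under test

def f_grad(x):
    return numpy.cos(x)                  # its analytic gradient

x = numpy.linspace(0.1, 1.0, 5)
eps = 1e-3

# Numerical g'(1) via central differences on the scalar alpha.
numerical = (f((1 + eps) * x) - f((1 - eps) * x)) / (2 * eps)

# Analytic g'(1) = f'(x) . x.
analytic = f_grad(x).dot(x)
assert numpy.allclose(numerical, analytic, atol=1e-5, rtol=1e-4)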
def add_branch(self):
    # Create an extra output variable sharing the data of the first input,
    # register this function as its creator, and keep a weak reference to it
    # in self.outputs so that backprop can traverse the additional branch.
    x = self.inputs[0]
    output = variable.Variable(x.data)
    output.set_creator(self)
    self.outputs.append(weakref.ref(output))
    return output