import math

import numpy
import six

from chainer import cuda
from chainer import testing
from chainer import variable
from chainer.functions.math import identity

# Helpers such as `_as_tuple`, `_set_y_grad`, `_clear_grads`,
# `_filter_list` and `numerical_grad` are defined elsewhere in this module.


def first_order_grad(*inputs):
    # `n_x`, `func` and `params` are free variables; this function is
    # meant to be defined as a closure inside a double-backward check.
    xs = inputs[:n_x]
    gys = inputs[n_x:]

    y = _as_tuple(func(*xs))
    # Let all elements of y share the same creator.
    # See the comment in check_backward.
    y = identity.Identity().apply(y)

    _set_y_grad(y, gys)
    # Retain the graph of this backward pass so it can be
    # differentiated again.
    y[0].backward(enable_double_backprop=True)

    return tuple([x.grad_var for x in xs] + [p.grad_var for p in params])
def first_order_grad(*inputs):
    xs = inputs[:n_x]
    gys = inputs[n_x:]

    y = _as_tuple(func(*xs))
    # Let all elements of y share the same creator.
    # See the comment in check_backward.
    y = identity.Identity().apply(y)

    _set_y_grad(y, gys)
    y[0].backward()

    ret = tuple([x.grad_var for x in xs])
    for x in xs:
        x.grad_var = None
    return ret
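# The two closures above assume they are defined inside a second-order
# gradient check that binds `func`, `n_x` and `params` from its enclosing
# scope. A minimal sketch of how the first variant might be wired up,
# assuming a `check_double_backward(func, x_data, y_grad, x_grad_grad, ...)`
# style signature (the name and exact signature here are illustrative, not
# a confirmed API):


def check_double_backward(func, x_data, y_grad, x_grad_grad, params=(),
                          eps=1e-3, atol=1e-4, rtol=1e-3):
    x_data = _as_tuple(x_data)
    y_grad = _as_tuple(y_grad)
    params = _as_tuple(params)
    n_x = len(x_data)

    def first_order_grad(*inputs):
        xs = inputs[:n_x]
        gys = inputs[n_x:]
        y = _as_tuple(func(*xs))
        y = identity.Identity().apply(y)
        _set_y_grad(y, gys)
        # Retain the graph of this backward pass so that check_backward
        # can differentiate through it.
        y[0].backward(enable_double_backprop=True)
        return tuple([x.grad_var for x in xs] +
                     [p.grad_var for p in params])

    # Treat the first-order gradient itself as the function under test:
    # its inputs are (x_data..., y_grad...) and the expected gradients of
    # its outputs are x_grad_grad.
    check_backward(first_order_grad, x_data + y_grad, x_grad_grad,
                   params=params, eps=eps, atol=atol, rtol=rtol)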
def check_backward(func, x_data, y_grad, params=(),
                   eps=1e-3, atol=1e-5, rtol=1e-4, no_grads=None,
                   dtype=None):
    """Test backward procedure of a given function.

    This function automatically checks the backward process of a given
    function. For example, when you have a :class:`~chainer.Function`
    class ``MyFunc``, that takes two arguments and returns one value,
    you can write its test like this::

    >> def test_my_func(self):
    >>     func = MyFunc()
    >>     x1_data = xp.array(...)
    >>     x2_data = xp.array(...)
    >>     gy_data = xp.array(...)
    >>     check_backward(func, (x1_data, x2_data), gy_data)

    This method creates :class:`~chainer.Variable` objects with ``x_data``
    and calls ``func`` with the :class:`~chainer.Variable` s to get its
    result as :class:`~chainer.Variable`. Then, it sets the ``y_grad``
    array to the ``grad`` attribute of the result and calls the
    ``backward`` method to get gradients of the inputs. To check
    correctness of the gradients, the function calls
    :func:`numerical_grad` to calculate the gradients numerically and
    compares them with the gradients computed by backprop using
    :func:`chainer.testing.assert_allclose`.

    To reduce computational time, it uses a directional derivative along
    a random vector. A function
    :math:`g: \\mathbb{R} \\rightarrow \\mathbb{R}^n` is defined as
    :math:`g(\\delta) = f(x + \\delta r)`, where
    :math:`\\delta \\in \\mathbb{R}`, :math:`r \\in \\mathbb{R}^n` is a
    random vector and :math:`f` is the function you want to test.
    Its gradient is

    .. math::
       g'(\\delta) = f'(x + \\delta r) \\cdot r.

    Therefore, :math:`g'(0) = f'(x) \\cdot r`. So we can check the
    correctness of back propagation of :math:`f` indirectly by comparing
    this equation with the gradient of :math:`g` numerically calculated
    and that of :math:`f` computed by backprop. If :math:`r` is chosen
    from a uniform distribution, we can conclude with high probability
    that the gradient of :math:`f` itself is correct.

    If input objects (``x1_data`` and/or ``x2_data`` in this example)
    represent integer variables, their gradients are ignored.

    You can simplify the test when ``MyFunc`` takes only one argument::

    >> check_backward(func, x1_data, gy_data)

    If ``MyFunc`` is a loss function which returns a zero-dimensional
    array, pass ``None`` to ``gy_data``. In this case, it sets ``1`` to
    the ``grad`` attribute of the result::

    >> check_backward(my_loss_func, (x1_data, x2_data), None)

    If ``MyFunc`` returns multiple outputs, pass all gradients for the
    outputs as a tuple::

    >> gy1_data = xp.array(...)
    >> gy2_data = xp.array(...)
    >> check_backward(func, x1_data, (gy1_data, gy2_data))

    You can also test a :class:`~chainer.Link`. To check gradients of
    parameters of the link, set a tuple of the parameters to the
    ``params`` argument::

    >> check_backward(my_link, (x1_data, x2_data), gy_data,
    >>                (my_link.W, my_link.b))

    Note that ``params`` are not ``ndarray`` s,
    but :class:`~chainer.Variable` s.

    Function objects are acceptable as the ``func`` argument::

    >> check_backward(lambda x1, x2: f(x1, x2),
    >>                (x1_data, x2_data), gy_data)

    .. note::

       ``func`` is called many times to get numerical gradients for all
       inputs. This function doesn't work correctly when ``func`` behaves
       randomly as it gets different gradients.

    Args:
        func (callable): A function which takes
            :class:`~chainer.Variable` s and returns
            :class:`~chainer.Variable` s. ``func`` must return a tuple of
            :class:`~chainer.Variable` s or one
            :class:`~chainer.Variable`. You can use a
            :class:`~chainer.Function` object, a :class:`~chainer.Link`
            object or a plain function satisfying the condition.
        x_data (ndarray or tuple of ndarrays): A set of ``ndarray`` s to
            be passed to ``func``. If ``x_data`` is one ``ndarray``
            object, it is treated as ``(x_data,)``.
        y_grad (ndarray or tuple of ndarrays or None): A set of
            ``ndarray`` s representing gradients of return-values of
            ``func``. If ``y_grad`` is one ``ndarray`` object, it is
            treated as ``(y_grad,)``. If ``func`` is a loss function,
            ``y_grad`` should be set to ``None``.
        params (~chainer.Variable or tuple of ~chainer.Variable): A set
            of :class:`~chainer.Variable` s whose gradients are checked.
            When ``func`` is a :class:`~chainer.Link` object, set its
            parameters as ``params``. If ``params`` is one
            :class:`~chainer.Variable` object, it is treated as
            ``(params,)``.
        eps (float): Epsilon value to be passed to :func:`numerical_grad`.
        atol (float): Absolute tolerance to be passed to
            :func:`chainer.testing.assert_allclose`.
        rtol (float): Relative tolerance to be passed to
            :func:`chainer.testing.assert_allclose`.
        no_grads (list of bool): Flags to skip variables for gradient
            assertion. It should have the same length as ``x_data``.
        dtype (~numpy.dtype): ``x_data``, ``y_grad`` and ``params`` are
            casted to this dtype when calculating numerical gradients.
            Only float types and ``None`` are allowed.

    .. seealso::
       :func:`numerical_grad`
    """
    if dtype is not None and numpy.dtype(dtype).kind != 'f':
        raise ValueError('`dtype` is allowed only float type')

    x_data = _as_tuple(x_data)
    if y_grad is not None:
        y_grad = _as_tuple(y_grad)
    params = _as_tuple(params)

    xs = [variable.Variable(x) for x in x_data]
    y = func(*xs)
    y = _as_tuple(y)

    # All creators of `y` need to be the same because we only call
    # `y[0].backward` to call the `backward` method of the creator.
    # To do so we need to insert a dummy function `Identity` into the
    # computational graph.
    # Note that `func` may not be a `Function` object.
    y = identity.Identity().apply(y)

    y_grad = _set_y_grad(y, y_grad)

    # Clear gradients which may exist if func calls backward inside of
    # itself.
    _clear_grads(xs)
    _clear_grads(params)

    # We only need to call `backward` for one result `Variable`.
    # `Variable.backward` method calls `Function.backward` of its creator.
    y[0].backward()

    if no_grads is None:
        no_grads = [x.dtype.kind != 'f' for x in xs]
    else:
        if len(no_grads) != len(xs):
            raise ValueError(
                'Length of no_grads param and xs should be same.\n'
                'Actual: {0} != {1}'.format(len(no_grads), len(xs)))

    for skip, x in six.moves.zip(no_grads, xs):
        if skip:
            if x.grad is not None:
                raise RuntimeError('gradient of int variable must be None')
        else:
            if x.grad is None:
                raise RuntimeError(
                    'gradients of some arguments are not calculated')

    if len(xs) - no_grads.count(True) + len(params) == 0:
        # When there is no float variable, we do not need to check
        # gradient values.
        return

    variables = _filter_list(xs, no_grads) + list(params)
    # Keep the gradient arrays of params which may be overwritten by func
    grads = [x.grad for x in variables]

    if dtype is None:
        casted_data = [x.data for x in variables]
    else:
        if numpy.dtype(dtype).kind != 'f':
            raise ValueError('`dtype` is allowed only float type')
        casted_data = [x.data.astype(dtype, copy=False) for x in variables]

        # Even skipped variables must have the same dtype.
        for x, skip in six.moves.zip(xs, no_grads):
            if skip and x.data.dtype.kind == 'f':
                x.data = x.data.astype(dtype, copy=False)

    xp = cuda.get_array_module(*xs)
    directions = [xp.random.normal(size=x.shape) for x in variables]
    # Use a unit vector.
    norm = math.sqrt(sum([xp.square(d).sum() for d in directions]))
    if norm != 0:
        # norm could be zero if input arrays are 0-sized.
        scale = 1. / norm
        directions = [d * scale for d in directions]

    delta = xp.array(0., 'd')

    def g():
        # This function is called twice in `numerical_grad`.
        # `delta` is `epsilon` or `-epsilon` in these calls.
        # See the document of `numerical_grad`.
        for x, data, direction in six.moves.zip(
                variables, casted_data, directions):
            # astype is required to store the data with the given type.
            data = (data.astype('d') +
                    delta * direction).astype(data.dtype)
            if numpy.isscalar(data):
                data = xp.array(data)
            x.data = data

        # Clear gradients to support func that calls backward inside of
        # itself.
        _clear_grads(xs)
        _clear_grads(params)

        ys = func(*xs)
        ys = _as_tuple(ys)
        ys_data = tuple(y.data for y in ys)
        for x, data in six.moves.zip(variables, casted_data):
            x.data = data
        return ys_data

    gx, = numerical_grad(g, (delta,), y_grad, eps=eps)

    gx_accum = 0
    for g, direction in six.moves.zip(grads, directions):
        gx_accum += (g.astype('d') * direction).sum()

    try:
        testing.assert_allclose(gx, gx_accum, atol=atol, rtol=rtol)
    except AssertionError as e:
        f = six.StringIO()
        f.write('check_backward failed (eps={} atol={} rtol={})\n'.format(
            eps, atol, rtol))
        for i, x_ in enumerate(xs):
            f.write('inputs[{}]:\n'.format(i))
            f.write('{}\n'.format(x_))
        for i, gy_ in enumerate(y_grad):
            f.write('grad_outputs[{}]:\n'.format(i))
            f.write('{}\n'.format(gy_))
        f.write('gradients (numeric):  {}\n'.format(gx))
        f.write('gradients (backward): {}\n'.format(gx_accum))
        f.write('\n')
        f.write(str(e))
        raise AssertionError(f.getvalue())
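# The directional-derivative identity used above, g'(0) = f'(x) . r, can be
# checked with plain NumPy. A minimal sketch for f(x) = sum(x**2), whose
# analytic gradient is 2*x (all names here are illustrative):


def _directional_derivative_demo():
    import numpy as np

    def f(x):
        return (x ** 2).sum()

    def analytic_grad(x):
        return 2 * x  # what a correct backward implementation yields

    x = np.random.randn(5)
    r = np.random.normal(size=5)
    r /= np.sqrt((r ** 2).sum())  # unit direction, as in check_backward

    eps = 1e-3
    # Central difference of g(delta) = f(x + delta * r) at delta = 0.
    numeric = (f(x + eps * r) - f(x - eps * r)) / (2 * eps)
    # Backprop side: f'(x) . r
    backprop = (analytic_grad(x) * r).sum()

    assert np.allclose(numeric, backprop, atol=1e-5, rtol=1e-4)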
def check_backward(func, x_data, y_grad, params=(),
                   eps=1e-3, atol=1e-5, rtol=1e-4, no_grads=None,
                   dtype=None):
    """Test backward procedure of a given function.

    This function automatically checks the backward process of a given
    function. For example, when you have a :class:`~chainer.Function`
    class ``MyFunc``, that takes two arguments and returns one value,
    you can write its test like this::

    >> def test_my_func(self):
    >>     func = MyFunc()
    >>     x1_data = xp.array(...)
    >>     x2_data = xp.array(...)
    >>     gy_data = xp.array(...)
    >>     check_backward(func, (x1_data, x2_data), gy_data)

    This method creates :class:`~chainer.Variable` objects with ``x_data``
    and calls ``func`` with the :class:`~chainer.Variable` s to get its
    result as :class:`~chainer.Variable`. Then, it sets the ``y_grad``
    array to the ``grad`` attribute of the result and calls the
    ``backward`` method to get gradients of the inputs. To check
    correctness of the gradients, the function calls
    :func:`numerical_grad` to calculate the gradients numerically and
    compares them with the gradients computed by backprop using
    :func:`chainer.testing.assert_allclose`.

    To reduce computational time, it uses a function
    :math:`g: \\mathbb{R} \\rightarrow \\mathbb{R}^n` defined as
    :math:`g(\\alpha) = f(\\alpha x)`, where
    :math:`\\alpha \\in \\mathbb{R}` and :math:`f` is the function you
    actually want to test. Its gradient is

    .. math::
       g'(\\alpha) = f'(\\alpha x) \\cdot x.

    When :math:`\\alpha = 1`, :math:`g'(1) = f'(x) \\cdot x`. So
    :math:`g'(1)` is calculated with :func:`numerical_grad` and compared
    with the dot product of the gradient of :math:`f` and :math:`x`.

    If input objects (``x1_data`` and/or ``x2_data`` in this example)
    represent integer variables, their gradients are ignored.

    You can simplify the test when ``MyFunc`` takes only one argument::

    >> check_backward(func, x1_data, gy_data)

    If ``MyFunc`` is a loss function which returns a zero-dimensional
    array, pass ``None`` to ``gy_data``. In this case, it sets ``1`` to
    the ``grad`` attribute of the result::

    >> check_backward(my_loss_func, (x1_data, x2_data), None)

    If ``MyFunc`` returns multiple outputs, pass all gradients for the
    outputs as a tuple::

    >> gy1_data = xp.array(...)
    >> gy2_data = xp.array(...)
    >> check_backward(func, x1_data, (gy1_data, gy2_data))

    You can also test a :class:`~chainer.Link`. To check gradients of
    parameters of the link, set a tuple of the parameters to the
    ``params`` argument::

    >> check_backward(my_link, (x1_data, x2_data), gy_data,
    >>                (my_link.W, my_link.b))

    Note that ``params`` are not ``ndarray`` s,
    but :class:`~chainer.Variable` s.

    Function objects are acceptable as the ``func`` argument::

    >> check_backward(lambda x1, x2: f(x1, x2),
    >>                (x1_data, x2_data), gy_data)

    .. note::

       ``func`` is called many times to get numerical gradients for all
       inputs. This function doesn't work correctly when ``func`` behaves
       randomly as it gets different gradients.

    Args:
        func (callable): A function which takes
            :class:`~chainer.Variable` s and returns
            :class:`~chainer.Variable` s. ``func`` must return a tuple of
            :class:`~chainer.Variable` s or one
            :class:`~chainer.Variable`. You can use a
            :class:`~chainer.Function` object, a :class:`~chainer.Link`
            object or a plain function satisfying the condition.
        x_data (ndarray or tuple of ndarrays): A set of ``ndarray`` s to
            be passed to ``func``. If ``x_data`` is one ``ndarray``
            object, it is treated as ``(x_data,)``.
        y_grad (ndarray or tuple of ndarrays or None): A set of
            ``ndarray`` s representing gradients of return-values of
            ``func``. If ``y_grad`` is one ``ndarray`` object, it is
            treated as ``(y_grad,)``. If ``func`` is a loss function,
            ``y_grad`` should be set to ``None``.
        params (~chainer.Variable or tuple of ~chainer.Variable): A set
            of :class:`~chainer.Variable` s whose gradients are checked.
            When ``func`` is a :class:`~chainer.Link` object, set its
            parameters as ``params``. If ``params`` is one
            :class:`~chainer.Variable` object, it is treated as
            ``(params,)``.
        eps (float): Epsilon value to be passed to :func:`numerical_grad`.
        atol (float): Absolute tolerance to be passed to
            :func:`chainer.testing.assert_allclose`.
        rtol (float): Relative tolerance to be passed to
            :func:`chainer.testing.assert_allclose`.
        no_grads (list of bool): Flags to skip variables for gradient
            assertion. It should have the same length as ``x_data``.
        dtype (~numpy.dtype): ``x_data``, ``y_grad`` and ``params`` are
            casted to this dtype when calculating numerical gradients.
            Only float types and ``None`` are allowed.

    See: :func:`numerical_grad`
    """
    x_data = _as_tuple(x_data)
    if y_grad is not None:
        y_grad = _as_tuple(y_grad)
    params = _as_tuple(params)

    xs = [variable.Variable(x) for x in x_data]
    y = func(*xs)
    y = _as_tuple(y)

    # All creators of `y` need to be the same because we only call
    # `y[0].backward` to call the `backward` method of the creator.
    # To do so we need to insert a dummy function `Identity` into the
    # computational graph.
    # Note that `func` may not be a `Function` object.
    y = identity.Identity().apply(y)

    y_grad = _set_y_grad(y, y_grad)

    # Clear gradients which may exist if func calls backward inside of
    # itself.
    _clear_grads(xs)
    _clear_grads(params)

    # We only need to call `backward` for one result `Variable`.
    # `Variable.backward` method calls `Function.backward` of its creator.
    y[0].backward()

    param_data = [p.data for p in params]
    if dtype is None:
        casted_xs = [variable.Variable(x) for x in x_data]
    else:
        if numpy.dtype(dtype).kind != 'f':
            raise ValueError('`dtype` is allowed only float type')
        casted_xs = [variable.Variable(x.astype(dtype, copy=False)
                                       if x.dtype.kind == 'f' else x)
                     for x in x_data]

    if no_grads is None:
        no_grads = [x.dtype.kind != 'f' for x in xs]
    else:
        if len(no_grads) != len(xs):
            raise ValueError(
                'Length of no_grads param and xs should be same.')

    casted_data = [x.data.copy() for x in casted_xs]

    for skip, x in six.moves.zip(no_grads, xs):
        if skip:
            assert x.grad is None
        else:
            if x.grad is None:
                raise RuntimeError(
                    'gradients of some arguments are not calculated')

    # Keep the gradient arrays of params which may be overwritten by func
    params_grad = [param.grad for param in params]

    xp = cuda.get_array_module(*xs)
    one = xp.array(1., dtype)

    def g():
        # This function is called twice in `numerical_grad`.
        # `one` is `1 + epsilon` or `1 - epsilon` in these calls.
        # See the document of `numerical_grad`.
        for skip, cx, data in six.moves.zip(
                no_grads, casted_xs, casted_data):
            if skip:
                continue
            # astype is required to store the data with the given type.
            data = (one * data).astype(data.dtype)
            if numpy.isscalar(data):
                data = xp.array(data)
            cx.data = data
        for param, data in six.moves.zip(params, param_data):
            if dtype is not None:
                param_dtype = dtype
            else:
                param_dtype = param.dtype
            # The inner astype is required to calculate __mul__ in
            # `param_dtype` when data is a low-accuracy float.
            # The outer one is required to store the data with the given
            # type.
            param.data = (one * data.astype(param_dtype)).astype(param_dtype)

        # Clear gradients to support func that calls backward inside of
        # itself.
        _clear_grads(casted_xs)
        _clear_grads(params)

        ys = func(*casted_xs)
        ys = _as_tuple(ys)
        ys_data = tuple(y.data for y in ys)
        for skip, cx, data in six.moves.zip(
                no_grads, casted_xs, casted_data):
            if skip:
                continue
            cx.data = data
        for param, data in six.moves.zip(params, param_data):
            param.data = data
        return ys_data

    gx, = numerical_grad(g, (one,), y_grad, eps=eps)

    gx_accum = 0
    for skip, x, cx in six.moves.zip(no_grads, xs, casted_xs):
        if skip:
            continue
        gxi = x.grad.ravel()
        cxi = cx.data.ravel()
        if dtype is not None:
            gxi = gxi.astype(dtype, copy=False)
            cxi = cxi.astype(dtype, copy=False)
        gx_accum += gxi.dot(cxi)

    for p, gpi in six.moves.zip(params, params_grad):
        gpi = gpi.ravel()
        pi = p.data.ravel()
        if dtype is not None:
            gpi = gpi.astype(dtype, copy=False)
            pi = pi.astype(dtype, copy=False)
        gx_accum += gpi.dot(pi)

    testing.assert_allclose(gx, gx_accum, atol=atol, rtol=rtol)
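# The alpha-scaling identity used above, g'(1) = f'(x) . x, admits the same
# kind of standalone NumPy check. A minimal sketch for f(x) = sum(x**3),
# whose analytic gradient is 3*x**2 (names are illustrative). Note that
# scaling every input by the same alpha only probes the gradient along the
# direction of x itself, whereas the random-direction variant above probes
# an independent direction on every run.


def _alpha_scaling_demo():
    import numpy as np

    def f(x):
        return (x ** 3).sum()

    def analytic_grad(x):
        return 3 * x ** 2

    x = np.random.randn(4)

    eps = 1e-3
    # Central difference of g(alpha) = f(alpha * x) at alpha = 1.
    numeric = (f((1 + eps) * x) - f((1 - eps) * x)) / (2 * eps)
    # Backprop side: f'(x) . x
    backprop = analytic_grad(x).dot(x)

    assert np.allclose(numeric, backprop, atol=1e-5, rtol=1e-4)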
def check_backward(func, x_data, y_grad, params=(),
                   eps=1e-3, atol=1e-5, rtol=1e-4, no_grads=None,
                   dtype=None):
    """Test backward procedure of a given function.

    This function automatically checks the backward process of a given
    function. For example, when you have a :class:`~chainer.Function`
    class ``MyFunc``, that takes two arguments and returns one value,
    you can write its test like this::

    >> def test_my_func(self):
    >>     func = MyFunc()
    >>     x1_data = xp.array(...)
    >>     x2_data = xp.array(...)
    >>     gy_data = xp.array(...)
    >>     check_backward(func, (x1_data, x2_data), gy_data)

    This method creates :class:`~chainer.Variable` objects with ``x_data``
    and calls ``func`` with the :class:`~chainer.Variable` s to get its
    result as :class:`~chainer.Variable`. Then, it sets the ``y_grad``
    array to the ``grad`` attribute of the result and calls the
    ``backward`` method to get gradients of the inputs. To check
    correctness of the gradients, the function calls
    :func:`numerical_grad` to calculate the gradients numerically and
    compares them with the gradients computed by backprop using
    :func:`chainer.testing.assert_allclose`.

    If input objects (``x1_data`` and/or ``x2_data`` in this example)
    represent integer variables, their gradients are ignored.

    You can simplify the test when ``MyFunc`` takes only one argument::

    >> check_backward(func, x1_data, gy_data)

    If ``MyFunc`` is a loss function which returns a zero-dimensional
    array, pass ``None`` to ``gy_data``. In this case, it sets ``1`` to
    the ``grad`` attribute of the result::

    >> check_backward(my_loss_func, (x1_data, x2_data), None)

    If ``MyFunc`` returns multiple outputs, pass all gradients for the
    outputs as a tuple::

    >> gy1_data = xp.array(...)
    >> gy2_data = xp.array(...)
    >> check_backward(func, x1_data, (gy1_data, gy2_data))

    You can also test a :class:`~chainer.Link`. To check gradients of
    parameters of the link, set a tuple of the parameters to the
    ``params`` argument::

    >> check_backward(my_link, (x1_data, x2_data), gy_data,
    >>                (my_link.W, my_link.b))

    Note that ``params`` are not ``ndarray`` s,
    but :class:`~chainer.Variable` s.

    Function objects are acceptable as the ``func`` argument::

    >> check_backward(lambda x1, x2: f(x1, x2),
    >>                (x1_data, x2_data), gy_data)

    .. note::

       ``func`` is called many times to get numerical gradients for all
       inputs. This function doesn't work correctly when ``func`` behaves
       randomly as it gets different gradients.

    Args:
        func (callable): A function which takes
            :class:`~chainer.Variable` s and returns
            :class:`~chainer.Variable` s. ``func`` must return a tuple of
            :class:`~chainer.Variable` s or one
            :class:`~chainer.Variable`. You can use a
            :class:`~chainer.Function` object, a :class:`~chainer.Link`
            object or a plain function satisfying the condition.
        x_data (ndarray or tuple of ndarrays): A set of ``ndarray`` s to
            be passed to ``func``. If ``x_data`` is one ``ndarray``
            object, it is treated as ``(x_data,)``.
        y_grad (ndarray or tuple of ndarrays or None): A set of
            ``ndarray`` s representing gradients of return-values of
            ``func``. If ``y_grad`` is one ``ndarray`` object, it is
            treated as ``(y_grad,)``. If ``func`` is a loss function,
            ``y_grad`` should be set to ``None``.
        params (~chainer.Variable or tuple of ~chainer.Variable): A set
            of :class:`~chainer.Variable` s whose gradients are checked.
            When ``func`` is a :class:`~chainer.Link` object, set its
            parameters as ``params``. If ``params`` is one
            :class:`~chainer.Variable` object, it is treated as
            ``(params,)``.
        eps (float): Epsilon value to be passed to :func:`numerical_grad`.
        atol (float): Absolute tolerance to be passed to
            :func:`chainer.testing.assert_allclose`.
        rtol (float): Relative tolerance to be passed to
            :func:`chainer.testing.assert_allclose`.
        no_grads (list of bool): Flags to skip variables for gradient
            assertion. It should have the same length as ``x_data``.
        dtype (~numpy.dtype): ``x_data`` and ``y_grad`` are casted to this
            dtype when calculating numerical gradients. Only float types
            and ``None`` are allowed.

    See: :func:`numerical_grad`
    """
    x_data = _as_tuple(x_data)
    if y_grad is not None:
        y_grad = _as_tuple(y_grad)
    params = _as_tuple(params)

    xs = [variable.Variable(x) for x in x_data]
    y = func(*xs)
    y = _as_tuple(y)

    # All creators of `y` need to be the same because we only call
    # `y[0].backward` to call the `backward` method of the creator.
    # To do so we need to insert a dummy function `Identity` into the
    # computational graph.
    # Note that `func` may not be a `Function` object.
    y = identity.Identity()(*y)
    y = _as_tuple(y)

    if y_grad is not None:
        if len(y) != len(y_grad):
            raise ValueError(
                '`y_grad` must have the same length as the output values')
        for iy, igy in six.moves.zip(y, y_grad):
            iy.grad = igy
    else:
        if len(y) != 1:
            raise ValueError(
                'When `y_grad` is `None`, the function must return a '
                'zero-dimensional array')
        y_grad = (1,)

    # We only need to call `backward` for one result `Variable`.
    # `Variable.backward` method calls `Function.backward` of its creator.
    y[0].backward()

    if dtype is None:
        casted_xs = [variable.Variable(x) for x in x_data]
    else:
        if numpy.dtype(dtype).kind != 'f':
            raise ValueError('`dtype` is allowed only float type')
        if len(params) > 0:
            raise ValueError('`dtype` is available only if `params` is empty')
        casted_xs = [
            variable.Variable(
                x.astype(dtype, copy=False) if x.dtype.kind == 'f' else x)
            for x in x_data]

    def f():
        ys = func(*casted_xs)
        ys = _as_tuple(ys)
        return tuple(y.data for y in ys)

    if no_grads is None:
        no_grads = [x.dtype.kind != 'f' for x in xs]
    else:
        if len(no_grads) != len(xs):
            raise ValueError(
                'Length of no_grads param and xs should be same.')
    for skip, x, cx in six.moves.zip(no_grads, xs, casted_xs):
        if skip:
            assert x.grad is None
            continue
        gx, = numerical_grad(f, (cx.data,), y_grad, eps=eps)
        testing.assert_allclose(gx, x.grad, atol=atol, rtol=rtol)
        if dtype is None:
            assert gx.dtype == x.grad.dtype
        else:
            assert gx.dtype.kind == 'f' and gx.dtype == dtype

    for p in params:
        gp, = numerical_grad(f, (p.data,), y_grad, eps=eps)
        testing.assert_allclose(gp, p.grad, atol=atol, rtol=rtol)
        assert gp.dtype == p.grad.dtype
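# An end-to-end usage sketch against an existing Chainer function, assuming
# this module is exposed as `chainer.gradient_check` (float32 inputs usually
# need looser tolerances than the float64 defaults):


def _check_backward_usage_demo():
    import numpy
    import chainer.functions as F
    from chainer import gradient_check

    x = numpy.random.randn(3, 4).astype(numpy.float32)
    gy = numpy.random.randn(3, 4).astype(numpy.float32)
    gradient_check.check_backward(F.tanh, x, gy, atol=1e-4, rtol=1e-4)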