Example #1
def grad(outputs,
         inputs,
         grad_outputs=None,
         grad_inputs=None,
         set_grad=False,
         retain_grad=False,
         enable_double_backprop=False):
    """Computes the gradient of output variables w.r.t.\\ the input variables.

    This function implements the backpropagation algorithm. While
    :meth:`Variable.backward` also implements backprop, this function selects
    the smallest paths in the computational graph needed to compute the
    gradients w.r.t. inputs. The error is backpropagated only through these
    selected paths, which may reduce the overall computational cost.

    This function also differs from :meth:`Variable.backward` in the way it
    returns the gradients: it directly returns the gradient variables as a list
    instead of setting them to the :attr:`Variable.grad_var` attribute of the
    original variables. This means users do not need to clear the gradient
    w.r.t. each variable before computing the gradient with this function.
    If the ``set_grad`` option is ``True``, the computed gradient is also
    stored in the :attr:`Variable.grad_var` attribute of each input variable,
    overwriting any value that was previously set there.

    Args:
        outputs: A sequence of output variables from which backprop starts.
        inputs: A sequence of input variables each of which this function
            computes the gradient w.r.t.
        grad_outputs: A sequence of variables that gives the initial value of
            each output gradient. If an element is set to ``None``, an array
            filled with 1 is used. If this argument itself is ``None``, it is
            treated as a sequence of ``None`` s.
        grad_inputs: A sequence of variables that gives the initial value of
            each input gradient. The gradients computed by the backprop
            algorithm are accumulated to them (not in-place). If an element
            is set to ``None``, the gradient is not accumulated to this value.
            If this argument itself is ``None``, it is treated as a sequence of
            ``None`` s.
        set_grad (bool): If it is ``True``, the :attr:`Variable.grad_var`
            attribute of each input variable is set to the corresponding
            computed gradient variable.
        retain_grad (bool): If it is ``True``, the gradients w.r.t. all the
            intermediate variables are stored in the :attr:`Variable.grad_var`
            attribute. In this case, the ``set_grad`` option is ignored.
        enable_double_backprop (bool): If it is ``True``, the computed
            gradients can be further backpropagated. Enabling it may increase
            the memory consumption (and possibly the computational time) to
            remember the intermediate gradient values for the second
            backpropagation.

    Returns:
        A list of gradient variables w.r.t. the inputs.

    """
    # The implementation consists of three steps.

    # 1. Backward enumeration: all the nodes reachable backward from the output
    #    nodes are enumerated. The forward direction links are collected in
    #    this step. Note that the variable nodes whose requires_grad is false
    #    are ignored and their creators are not searched.
    candidate_funcs = [
        v.creator_node for v in outputs if v.creator_node is not None
    ]
    visited_funcs = set()
    forward_graph = collections.defaultdict(list)
    while candidate_funcs:
        func = candidate_funcs.pop()
        if func in visited_funcs:
            continue
        visited_funcs.add(func)
        for x in func.inputs:
            if not x.requires_grad:
                continue
            forward_graph[x].append(func)
            creator = x.creator_node
            if creator is not None and creator not in visited_funcs:
                candidate_funcs.append(creator)

    # 2. Forward enumeration: all the nodes in the subgraph reachable from the
    #    input nodes are enumerated. The extracted (sub-)subgraph is the union
    #    of all paths that backpropagation will visit.
    candidate_vars = [x.node for x in inputs]
    visited_funcs = set()
    grad_required = set()
    while candidate_vars:
        x = candidate_vars.pop()
        grad_required.add(x)
        for func in forward_graph[x]:
            if func in visited_funcs:
                continue
            visited_funcs.add(func)
            for y_ref in func.outputs:
                y = y_ref()
                if y is not None and y in forward_graph:
                    candidate_vars.append(y)

    # 3. Backpropagation: the backpropagation is executed along the
    #    (sub-)subgraph. It uses the topological order of the subgraph which is
    #    induced by the reversed order of function applications ("rank").
    grads = {}  # mapping from variable nodes to their gradients

    # Initialize the gradient mapping.
    if grad_outputs is None:
        grad_outputs = (None, ) * len(outputs)
    for y, gy in zip(outputs, grad_outputs):
        if gy is None:
            with cuda.get_device_from_array(y.data) as device:
                if device is cuda.DummyDevice:
                    gy_data = numpy.ones_like(y.data)
                else:
                    gy_data = cuda.cupy.ones_like(y.data)
                gy = variable.Variable(gy_data, requires_grad=False)
        grads[y.node] = gy

    if grad_inputs is not None:
        for x, gx in zip(inputs, grad_inputs):
            if gx is not None:
                grads[x.node] = gx

    # Backprop implementation. It edits grads which will only contain the
    # gradients w.r.t. the inputs.
    with chainer.using_config('enable_backprop', enable_double_backprop):
        _backprop(outputs, inputs, grad_required, retain_grad, grads)

    # Extract the gradients w.r.t. the inputs and return them.
    ret = [grads.get(x.node, None) for x in inputs]
    if set_grad:
        for x, gx in zip(inputs, ret):
            x.grad_var = gx

    return ret
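A minimal usage sketch of the function above (not from the source; the variable names are illustrative and it assumes ``numpy``, ``chainer``, and ``chainer.functions`` are importable):

# Hedged usage sketch for chainer.grad: compute dy/dx for y = sum(x ** 2)
# without touching x.grad_var.
import numpy as np
import chainer
import chainer.functions as F

x = chainer.Variable(np.array([1.0, 2.0, 3.0], dtype=np.float32))
y = F.sum(x ** 2)
gx, = chainer.grad([y], [x])          # returns a list of gradient Variables
# gx.array is now [2., 4., 6.]; x.grad_var stays untouched unless set_grad=True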
Example #2
    def apply(self, inputs):
        """Computes output variables and grows the computational graph.

        Basic behavior is expressed in the documentation of
        :class:`FunctionNode`.

        .. note::

           If the :data:`~Variable.data` attributes of the input variables
           exist on a GPU device, that device is made current before calling
           :meth:`forward`, so implementors do not need to take care of device
           selection in most cases.

        Args:
            inputs: Tuple of input variables. Each element can be either
                :class:`~chainer.Variable`, :class:`numpy.ndarray`,
                or :class:`cupy.ndarray`. If the element is an ndarray, it is
                automatically wrapped with :class:`~chainer.Variable`.

        Returns:
            A tuple of output :class:`~chainer.Variable` objects.

        """
        chainerx_in_data = None
        chainerx_device = None
        is_chainerx, in_data = _extract_apply_in_data(inputs)

        if is_chainerx:
            # Try ChainerX C++ implementation.
            # If it's supported, the output arrays are wrapped with Variables
            # and returned.
            # If not supported, FunctionNode.forward_chainerx should return
            # Fallback.
            # In that case the input arrays are converted to numpy.ndarray
            # or cupy.ndarray (depending on the ChainerX backend) and
            # forward computation falls back to the conventional
            # FunctionNode.forward() implementation.
            outputs = self.forward_chainerx(in_data)

            if outputs is not chainer.Fallback:
                # Supported. Wrap with variables and return
                assert isinstance(outputs, tuple)
                return tuple([
                    variable.Variable._init_unchecked(
                        y,
                        requires_grad=y.is_backprop_required(),
                        is_chainerx_array=True) for y in outputs
                ])

            # Fall back to FunctionNode.forward()
            chainerx_in_data, in_data, chainerx_device = (
                self._chainerx_apply_fallback_preprocess(in_data, inputs))
            self._is_chainerx_fallback_mode = True
            self.chainerx_device = chainerx_device

        utils._check_arrays_forward_compatible(in_data, self.label)

        is_debug = chainer.is_debug()
        if is_debug:
            # Keep stack trace for debug
            self.stack = traceback.extract_stack()

        if configuration.config.type_check:
            self._check_data_type_forward(in_data)

        hooks = chainer.get_function_hooks()
        if self._n_local_function_hooks > 0:
            hooks = collections.OrderedDict(hooks)
            hooks.update(self.local_function_hooks)
        hooks = hooks.values()  # avoid six for performance

        for hook in hooks:
            hook.forward_preprocess(self, in_data)

        # Forward propagation
        with cuda.get_device_from_array(*in_data):
            self._input_indexes_to_retain = None
            self._output_indexes_to_retain = None
            if chainer.config.schedule_func is not None:
                outputs = static_forward_optimizations(self, in_data)
            elif self._is_chainerx_fallback_mode:
                # In ChainerX fallback, __class__ is temporarily replaced with
                # the fabricated one with automatic attribute fallback.
                with _chainerx_attribute_fallback(self, chainerx_device):
                    outputs = self.forward(in_data)
            else:
                # In normal case, simply run the forward method.
                outputs = self.forward(in_data)

        # Check for output array types
        if not isinstance(outputs, tuple):
            raise TypeError('forward output must be a tuple ({})\n'
                            'Actual: {}'.format(self.label, type(outputs)))

        if not chainer.is_arrays_compatible(outputs):
            raise TypeError(
                'incompatible array types are mixed in the forward output '
                '({}).\n'
                'Actual: {}'.format(self.label,
                                    ', '.join(str(type(x)) for x in outputs)))

        for hook in hooks:
            hook.forward_postprocess(self, in_data)

        # NaN check of output values
        if is_debug:
            if any(chainer.backend._contains_nan(out) for out in outputs):
                msg = ('NaN is detected on forward computation of '
                       '{}'.format(self.label))
                raise RuntimeError(msg)

        self._output_count = len(outputs)

        if self._is_chainerx_fallback_mode:
            ret = self._chainerx_apply_fallback_postprocess(
                chainerx_in_data, inputs, outputs)

        else:
            input_vars = [chainer.as_variable(x) for x in inputs]
            requires_grad = any([x.requires_grad for x in input_vars])

            ret = tuple([
                variable.Variable(y, requires_grad=requires_grad)
                for y in outputs
            ])

            if configuration.config.enable_backprop:
                # Topological ordering
                self.rank = max([x.rank
                                 for x in input_vars]) if input_vars else 0
                # Add backward edges
                for y in ret:
                    y.creator_node = self
                self.inputs = tuple([x.node for x in input_vars])
                # Add forward edges (must be weak references)
                self.outputs = tuple([weakref.ref(y.node) for y in ret])

                if self._input_indexes_to_retain is not None:
                    for index in self._input_indexes_to_retain:
                        input_vars[index].retain_data()

                if self._output_indexes_to_retain is not None:
                    retained_data = []
                    for index in self._output_indexes_to_retain:
                        ret[index].retain_data()
                        retained_data.append(outputs[index])
                    self._retained_output_data = tuple(retained_data)

                self.lazy_grad_sum = configuration.config.lazy_grad_sum

        return ret
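A hedged sketch (not from the source) of a minimal :class:`FunctionNode` subclass whose outputs are produced through ``apply()``, which wires the backward references exactly as the method above describes. The class name ``SquareNode`` is illustrative only:

# Hedged sketch: a minimal FunctionNode used through apply().
import numpy as np
import chainer
from chainer import function_node

class SquareNode(function_node.FunctionNode):
    def forward(self, inputs):
        x, = inputs
        self.retain_inputs((0,))      # keep x for the backward pass
        return x * x,

    def backward(self, indexes, grad_outputs):
        x, = self.get_retained_inputs()
        gy, = grad_outputs
        return 2 * x * gy,

x = chainer.Variable(np.array([1.0, 2.0], dtype=np.float32))
y, = SquareNode().apply((x,))         # y.creator_node is the SquareNode instance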
Example #3
def check_backward(func,
                   x_data,
                   y_grad,
                   params=(),
                   eps=1e-3,
                   atol=1e-5,
                   rtol=1e-4):
    """Test backward procedure of a given function.

    This function automatically checks the backward process of a given
    function. For example, when you have a :class:`~chainer.Function` class
    ``MyFunc`` that takes two arguments and returns one value, you can write
    its test like this::

    >> def test_my_func(self):
    >>   func = MyFunc()
    >>   x1_data = xp.array(...)
    >>   x2_data = xp.array(...)
    >>   gy_data = xp.array(...)
    >>   check_backward(func, (x1_data, x2_data), gy_data)

    This method creates :class:`~chainer.Variable` objects with ``x_data``
    and calls ``func`` with the :class:`~chainer.Variable` s to get its result
    as :class:`~chainer.Variable`.
    Then, it sets ``y_grad`` array to ``grad`` attribute of the result and
    calls ``backward`` method to get gradients of the inputs.
    To check the correctness of the gradients, the function calls
    :func:`numerical_grad` to compute the gradients numerically and compares
    them with the backprop gradients using :func:`assert_allclose`.
    If input objects (``x1_data`` and/or ``x2_data`` in this example) represent
    integer variables, their gradients are ignored.

    You can simplify a test when ``MyFunc`` gets only one argument::

    >>   check_backward(func, x1_data, gy_data)

    If ``MyFunc`` is a loss function which returns a zero-dimensional
    array, pass ``None`` to ``gy_data``. In this case, it sets ``1`` to
    ``grad`` attribute of the result::

    >>   check_backward(my_loss_func, (x1_data, x2_data), None)

    If ``MyFunc`` returns multiple outputs, pass all gradients for outputs
    as a tuple::

    >>   gy1_data = xp.array(...)
    >>   gy2_data = xp.array(...)
    >>   check_backward(func, x1_data, (gy1_data, gy2_data))

    You can also test a :class:`~chainer.Link`.
    To check gradients of parameters of the link, set a tuple of the parameters
    to ``params`` arguments::

    >>   check_backward(my_link, (x1_data, x2_data), gy_data,
    >>                  (my_link.W, my_link.b))

    Note that ``params`` are not ``ndarray`` s,
    but :class:`~chainer.Variable` s.

    Function objects are acceptable as ``func`` argument::

    >>   check_backward(lambda x1, x2: f(x1, x2),
    >>                  (x1_data, x2_data), gy_data)

    .. note::

       ``func`` is called many times to get numerical gradients for all inputs.
       This function doesn't work correctly when ``func`` behaves randomly as
       it gets different gradients.


    Args:
        func (callable): A function which gets :class:`~chainer.Variable` s
            and returns :class:`~chainer.Variable` s. ``func`` must return
            a tuple of :class:`~chainer.Variable` s or one
            :class:`~chainer.Variable`. You can use :class:`~chainer.Function`
            object, :class:`~chainer.Link` object or a function satisfying the
            condition.
        x_data (ndarray or tuple of ndarrays): A set of ``ndarray`` s to be
            passed to ``func``. If ``x_data`` is one ``ndarray`` object, it is
            treated as ``(x_data,)``.
        y_grad (ndarray or tuple of ndarrays or None):
            A set of ``ndarray`` s representing gradients of the return values
            of
            ``func``. If ``y_grad`` is one ``ndarray`` object, it is
            treated as ``(y_grad,)``. If ``func`` is a loss-function,
            ``y_grad`` should be set to ``None``.
        params (~chainer.Variable or tuple of ~chainer.Variable):
            A set of :class:`~chainer.Variable` s whose gradients are checked.
            When ``func`` is a :class:`~chainer.Link` object,
            set its parameters as ``params``.
            If ``params`` is one :class:`~chainer.Variable` object,
            it is treated as ``(params,)``.
        eps (float): Epsilon value to be passed to :func:`numerical_grad`.
        atol (float): Absolute tolerance to be passed to
            :func:`assert_allclose`.
        rtol (float): Relative tolerance to be passed to
            :func:`assert_allclose`.

    See:
       :func:`numerical_grad`
    """
    x_data = _as_tuple(x_data)
    if y_grad is not None:
        y_grad = _as_tuple(y_grad)
    params = _as_tuple(params)

    xs = [variable.Variable(x) for x in x_data]
    y = func(*xs)
    y = _as_tuple(y)

    if y_grad is not None:
        if len(y) != len(y_grad):
            raise ValueError(
                '`y_grad` must have the same length of output values')
        for iy, igy in zip(y, y_grad):
            iy.grad = igy
    else:
        if len(y) != 1:
            raise ValueError(
                'When `y_grad` is `None`, the function must return a '
                'zero-dimensional array')
        y_grad = (1, )

    # We only need to call `backward` for one result `Variable`.
    # `Variable.backward` method calls `Function.backward` of its creator.
    y[0].backward()

    def f():
        ys = func(*xs)
        ys = _as_tuple(ys)
        return tuple(y.data for y in ys)

    for x in xs:
        if x.data.dtype.kind == 'f':
            gx, = numerical_grad(f, (x.data, ), y_grad, eps=eps)
            assert_allclose(gx, x.grad, atol=atol, rtol=rtol)
            assert gx.dtype is x.grad.dtype
        else:
            assert x.grad is None

    for p in params:
        gp, = numerical_grad(f, (p.data, ), y_grad, eps=eps)
        assert_allclose(gp, p.grad, atol=atol, rtol=rtol)
        assert gp.dtype is p.grad.dtype
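A hedged usage sketch of ``check_backward``, exercising a built-in element-wise function with random data; the shapes and tolerances are illustrative, not from the source:

# Hedged usage sketch for check_backward.
import numpy as np
import chainer.functions as F
from chainer import gradient_check

x_data = np.random.uniform(-1, 1, (3, 2)).astype(np.float32)
gy_data = np.random.uniform(-1, 1, (3, 2)).astype(np.float32)
gradient_check.check_backward(F.tanh, x_data, gy_data, atol=1e-4, rtol=1e-4)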
Example #4
 def check_forward(self, x_data):
     x = variable.Variable(x_data)
     y = func(x)
     self.assertEqual(y.data.dtype, x_data.dtype)
     y_expected = func_expected(cuda.to_cpu(x_data), dtype=x_data.dtype)
     testing.assert_allclose(y_expected, y.data, atol=1e-4, rtol=1e-4)
Example #5
    def __call__(self, x, Whx, Wmx, Wmh, Whm):
        """Updates the internal state and returns the LSTM outputs.
        Args:
            x (~chainer.Variable): A new batch from the input sequence.
        Returns:
            ~chainer.Variable: Outputs of updated LSTM units.
        """
    #    if self.upward.has_uninitialized_params:
    #        in_size = x.size // x.shape[0]
    #        self.upward._initialize_params(in_size)
    #        self._initialize_params()
    #    if self.upward2.has_uninitialized_params:
    #        in_size = x.size // x.shape[0]
    #        self.upward2._initialize_params(in_size)
    #        self._initialize_params()

        batch = x.shape[0]
    #    Whx = self.upward()

    #    Wmx = self.upward2()

        factor_in = F.linear(x, Wmx)
        lstm_in = F.linear(x, Whx, self.b)

        h_rest = None
        if self.h is not None:
            h_size = self.h.shape[0]
            if batch == 0:
                h_rest = self.h
            elif h_size < batch:
                msg = ('The batch size of x must be equal to or less than the '
                       'size of the previous state h.')
                raise TypeError(msg)
            elif h_size > batch:
                h_update, h_rest = split_axis.split_axis(
                    self.h, [batch], axis=0)
    #            Wmh = self.lateral1()

                mult_in = F.linear(h_update, Wmh)

                mult_out = mult_in * factor_in
        #        Whm = self.lateral2()
                lstm_in += F.linear(mult_out, Whm)

            else:
    #            Wmh = self.lateral1()

                mult_in = F.linear(self.h, Wmh)

                mult_out = mult_in * factor_in
        #        Whm = self.lateral2()
                lstm_in += F.linear(mult_out, Whm)

        if self.c is None:
            xp = self.xp
            self.c = variable.Variable(
                xp.zeros((batch, self.state_size), dtype=x.dtype),
                volatile='auto')
        self.c, y = lstm.lstm(self.c, lstm_in)

        if h_rest is None:
            self.h = y
        elif len(y.data) == 0:
            self.h = h_rest
        else:
            self.h = concat.concat([y, h_rest], axis=0)

        return y
 def setUp(self):
     self.x1 = variable.Variable(np.zeros((1, 2)).astype(np.float32))
     self.x2 = variable.Variable(np.zeros((1, 2)).astype(np.float32))
     self.y = self.x1 + self.x2
 def setUp(self):
     self.x = variable.Variable(np.zeros((1, 2)).astype(np.float32))
     self.y1 = mock_function((self.x, ), 1)
     self.y2 = mock_function((self.x, ), 1)
Example #8
    def __call__(self, *inputs):
        """Applies forward propagation with chaining backward references.

        Basic behavior is expressed in documentation of :class:`Function`
        class.

        .. note::

           If the :data:`~Variable.data` attributes of the input variables
           exist on a GPU device, that device is made current before
           :meth:`forward` is called, so in most cases implementers do not
           need to take care of device selection.

        Args:
            inputs: Tuple of input :class:`Variable`, :class:`numpy.ndarray` or
                :class:`cupy.ndarray` objects.
                If the input is an :class:`numpy.ndarray` or a
                :class:`cupy.ndarray`, it is automatically wrapped with
                :class:`Variable`.

        Returns:
            One :class:`Variable` object or a tuple of multiple
            :class:`Variable` objects.

        """

        inputs = [
            x if isinstance(x, variable.Variable) else variable.Variable(
                x, requires_grad=False) for x in inputs
        ]
        in_data = tuple([x.data for x in inputs])
        requires_grad = any([x.requires_grad for x in inputs])

        if chainer.is_debug():
            self._stack = traceback.extract_stack()

        if configuration.config.type_check:
            self._check_data_type_forward(in_data)

        hooks = chainer.get_function_hooks()
        if self._n_local_function_hooks != 0:
            hooks = collections.OrderedDict(hooks)
            hooks.update(self.local_function_hooks)
        for hook in six.itervalues(hooks):
            hook.forward_preprocess(self, in_data)

        # Forward prop
        with cuda.get_device_from_array(*in_data):
            self._input_indexes_to_retain = None
            self._output_indexes_to_retain = None
            outputs = self.forward(in_data)
            assert type(outputs) == tuple
        for hook in six.itervalues(hooks):
            hook.forward_postprocess(self, in_data)

        if chainer.is_debug():
            if any(out.dtype.kind == 'f'
                   and cuda.get_array_module(out).isnan(out).any()
                   for out in outputs):
                msg = 'NaN is detected on forward computation'
                raise RuntimeError(msg)

        ret = tuple([
            variable.Variable(y, requires_grad=requires_grad) for y in outputs
        ])

        if configuration.config.enable_backprop:
            # Topological ordering
            self.rank = max([x.rank for x in inputs]) if inputs else 0
            # Backward edges
            for y in ret:
                y.set_creator(self)
            self.inputs = tuple([x.node for x in inputs])
            # Forward edges (must be weak references)
            self.outputs = tuple([weakref.ref(y.node) for y in ret])

            input_indexes_to_retain = self._input_indexes_to_retain
            if input_indexes_to_retain is None:
                # input arrays are retained by default
                input_indexes_to_retain = six.moves.range(len(inputs))
            for index in input_indexes_to_retain:
                inputs[index].retain_data()
            del self._input_indexes_to_retain

            output_indexes_to_retain = self._output_indexes_to_retain
            if output_indexes_to_retain is not None:
                for index in output_indexes_to_retain:
                    ret[index].retain_data()
            del self._output_indexes_to_retain

        if len(ret) == 1:
            return ret[0]
        else:
            return ret
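A hedged sketch (illustrative, not from the source) of a minimal old-style :class:`Function` whose graph edges are built by the ``__call__`` shown above:

# Hedged sketch: a minimal old-style Function used through __call__().
import numpy as np
import chainer
from chainer import function

class Square(function.Function):
    def forward(self, inputs):
        x, = inputs
        return x * x,

    def backward(self, inputs, grad_outputs):
        x, = inputs
        gy, = grad_outputs
        return 2 * x * gy,

x = chainer.Variable(np.array([3.0], dtype=np.float32))
y = Square()(x)                       # a single Variable, since there is one output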
Example #9
 def forward(self, inputs):
     with function.no_backprop_mode():
         xs = [variable.Variable(x) for x in inputs]
         outs = self._call_func(xs)
     return tuple(out.data for out in outs)
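A hedged sketch of ``chainer.no_backprop_mode``, which the ``forward`` above relies on: computations inside the context build no graph, so no intermediate results are retained (names and shapes are illustrative):

# Hedged sketch of no_backprop_mode.
import numpy as np
import chainer
import chainer.functions as F

x = chainer.Variable(np.ones(3, dtype=np.float32))
with chainer.no_backprop_mode():
    y = F.relu(x)
assert y.creator is None              # no backward edge was recorded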
Example #10
def connectionist_temporal_classification(x,
                                          t,
                                          blank_symbol,
                                          input_length=None,
                                          label_length=None,
                                          reduce='mean'):
    """Connectionist Temporal Classification loss function.

    Connectionist Temporal Classification (CTC) [Graves2006]_ is a loss
    function for sequence labeling in which the alignment between the inputs
    and the targets is unknown. See also [Graves2012]_.

    The output is a variable whose value depends on the value of
    the option ``reduce``. If it is ``'no'``, it holds the samplewise
    loss values. If it is ``'mean'``, it takes the mean of loss values.


    Args:
        x (sequence of Variable): RNN output at each time. ``x`` must be a list
            of :class:`~chainer.Variable` s. Each element of ``x``, ``x[i]``
            is a :class:`~chainer.Variable` representing output of RNN at time
            ``i``.
        t (Variable): Expected label sequence.
        blank_symbol (int): Index of blank_symbol.
            This value must be non-negative.
        input_length (Variable): Length of the valid sequence in ``x`` for
            each sample of the mini-batch (optional). If ``input_length`` is
            omitted, all of ``x`` is regarded as valid input.
        label_length (Variable): Length of the valid sequence in ``t`` for
            each sample of the mini-batch (optional). If ``label_length`` is
            omitted, all of ``t`` is regarded as valid input.
        reduce (str): Reduction option. Its value must be either
            ``'mean'`` or ``'no'``. Otherwise,
            :class:`ValueError` is raised.

    Returns:
       ~chainer.Variable:
           A variable holding a scalar value of the CTC loss.
           If ``reduce`` is ``'no'``, the output variable holds an array
           whose shape is ``(B,)`` where ``B`` is the number of samples.
           If it is ``'mean'``, it holds a scalar.

    .. note::
       You need to input ``x`` without applying an activation function (e.g.
       the softmax function), because this function applies softmax to ``x``
       internally before computing the CTC loss to avoid numerical
       limitations. You also need to apply softmax to the forwarded values
       yourself before decoding them.

    .. note::
       This function is differentiable only by ``x``.

    .. note::
       This function supports (batch, sequence, 1-dimensional input)-data.

    .. [Graves2006] Alex Graves, Santiago Fernandez,\
    Faustino Gomez, Jurgen Schmidhuber,\
    `Connectionist Temporal Classification: Labelling Unsegmented\
    Sequence Data with Recurrent Neural Networks\
    <ftp://ftp.idsia.ch/pub/juergen/icml2006.pdf>`_

    .. [Graves2012] Alex Graves,\
    `Supervised Sequence Labelling with Recurrent Neural Networks\
    <http://www.cs.toronto.edu/~graves/preprint.pdf>`_

    """
    if not isinstance(x, collections.Sequence):
        raise TypeError('x must be a list of Variables')
    if not isinstance(blank_symbol, int):
        raise TypeError('blank_symbol must be non-negative integer.')
    assert blank_symbol >= 0
    assert blank_symbol < x[0].shape[1]
    # This implementation only supports 1-dimensional data.
    # TODO(jnishi): Support d(>1)-dimensional inputs.
    assert (len(x[0].shape) == 2)

    if input_length is None:
        xp = cuda.get_array_module(x[0].data)
        input_length = variable.Variable(
            xp.full((len(x[0].data), ), len(x), dtype=numpy.int32))
    if label_length is None:
        xp = cuda.get_array_module(t.data)
        label_length = variable.Variable(
            xp.full((len(t.data), ), len(t.data[0]), dtype=numpy.int32))

    return ConnectionistTemporalClassification(blank_symbol,
                                               reduce)(input_length,
                                                       label_length, t, *x)
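A hedged usage sketch of the CTC loss on toy data. The shapes are illustrative: ``x`` is a length-``T`` list of ``(B, V)`` Variables, ``t`` is a ``(B, L)`` int32 label array, and label ``0`` is reserved for the blank symbol here:

# Hedged usage sketch for the CTC loss.
import numpy as np
import chainer
import chainer.functions as F

T, B, V, L = 6, 2, 5, 2               # time steps, batch, vocab (incl. blank), labels
x = [chainer.Variable(np.random.randn(B, V).astype(np.float32))
     for _ in range(T)]
t = chainer.Variable(np.random.randint(1, V, (B, L)).astype(np.int32))
loss = F.connectionist_temporal_classification(x, t, blank_symbol=0)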
Example #11
 def init_hx(self, xs):
     shape = (self.n_layers * self.direction, len(xs), self.out_size)
     with cuda.get_device_from_id(self._device_id):
         hx = variable.Variable(self.xp.zeros(shape, dtype=xs[0].dtype))
     return hx
Example #12
        def g():
            # This function is called twice in `numerical_grad`.
            # `delta` is `epsilon` or `-epsilon` in these calls.
            # See the document of `numerical_grad`.

            def perturb(data, direction):
                if data is None:
                    assert direction is None
                    return data

                data = (data.astype(numpy.float64) + delta * direction).astype(
                    data.dtype)
                if numpy.isscalar(data):
                    data = xp.array(data)
                return data

            # Input arrays
            g_x_vars = []
            j = 0
            for x_var, no_gx in six.moves.zip(x_vars, no_gxs):
                if no_gx:
                    g_x_vars.append(x_var)
                else:
                    data = perturb(casted_data[j], directions[j])
                    g_x_vars.append(
                        None if data is None else variable.Variable(data))
                    j += 1

            # Parameters
            for i in range(len(params)):
                data = perturb(casted_data[j + i], directions[j + i])

                if self.is_immutable_params:
                    # Update the parameter array since it is converted into
                    # a Parameter just before calling the func.
                    params_data[i] = data
                else:
                    # Update the given Parameter in-place since the object is
                    # held by the caller.
                    params[i].array = data

            # Clear gradients to support func that calls backward inside of
            # itself.
            self._clear_grads(g_x_vars)
            if not self.is_immutable_params:
                self._clear_grads(params)

            if self.is_immutable_params:
                ps = tuple([chainer.Parameter(p) for p in params_data])
                ys = func(g_x_vars, ps)
            else:
                ys = func(*g_x_vars)
            ys = _as_tuple(ys)
            ys_data = tuple([None if y is None else y.array for y in ys])
            if xp is chainerx:
                ys_data = tuple([
                    None if y is None else y.as_grad_stopped() for y in ys_data
                ])

            if not self.is_immutable_params:
                for i, param in enumerate(params):
                    param.array = casted_data[j + i]

            return ys_data
Example #13
    def _directional_numeric_gradients(self, directions, y0_data):
        device = self.device
        func = self.func
        xs = self.xs
        gys = self.gys
        params = self.params
        eps = self.eps
        no_gxs = self.no_gxs
        dtype = self.dtype
        detect_nondifferentiable = self.detect_nondifferentiable
        params_data = [
            p if self.is_immutable_params else p.array for p in params
        ]

        xp = device.xp

        x_vars = [variable.Variable(x, requires_grad=False) for x in xs]

        xs_filtered = [
            x.array for x, no_gx in six.moves.zip(x_vars, no_gxs) if not no_gx
        ]

        if dtype is None:
            casted_data = [x for x in xs_filtered + params_data]
        else:
            if numpy.dtype(dtype).kind != 'f':
                raise ValueError('`dtype` is allowed only float type')

            # Even skipped variable must have the same dtype.
            for x, no_gx in six.moves.zip(x_vars, no_gxs):
                if no_gx and x.array.dtype.kind == 'f':
                    x.array = x.array.astype(dtype, copy=False)

            casted_data = [
                None if x is None else x.astype(dtype, copy=False)
                for x in xs_filtered + params_data
            ]

        delta = xp.array(0., numpy.float64)

        def g():
            # This function is called twice in `numerical_grad`.
            # `delta` is `epsilon` or `-epsilon` in these calls.
            # See the document of `numerical_grad`.

            def perturb(data, direction):
                if data is None:
                    assert direction is None
                    return data

                data = (data.astype(numpy.float64) + delta * direction).astype(
                    data.dtype)
                if numpy.isscalar(data):
                    data = xp.array(data)
                return data

            # Input arrays
            g_x_vars = []
            j = 0
            for x_var, no_gx in six.moves.zip(x_vars, no_gxs):
                if no_gx:
                    g_x_vars.append(x_var)
                else:
                    data = perturb(casted_data[j], directions[j])
                    g_x_vars.append(
                        None if data is None else variable.Variable(data))
                    j += 1

            # Parameters
            for i in range(len(params)):
                data = perturb(casted_data[j + i], directions[j + i])

                if self.is_immutable_params:
                    # Update the parameter array since it is converted into
                    # a Parameter just before calling the func.
                    params_data[i] = data
                else:
                    # Update the given Parameter in-place since the object is
                    # held by the caller.
                    params[i].array = data

            # Clear gradients to support func that calls backward inside of
            # itself.
            self._clear_grads(g_x_vars)
            if not self.is_immutable_params:
                self._clear_grads(params)

            if self.is_immutable_params:
                ps = tuple([chainer.Parameter(p) for p in params_data])
                ys = func(g_x_vars, ps)
            else:
                ys = func(*g_x_vars)
            ys = _as_tuple(ys)
            ys_data = tuple([None if y is None else y.array for y in ys])
            if xp is chainerx:
                ys_data = tuple([
                    None if y is None else y.as_grad_stopped() for y in ys_data
                ])

            if not self.is_immutable_params:
                for i, param in enumerate(params):
                    param.array = casted_data[j + i]

            return ys_data

        gx, = numerical_grad(g, (delta, ),
                             gys,
                             eps=eps,
                             detect_nondifferentiable=detect_nondifferentiable,
                             center_outputs=y0_data,
                             diff_atol=0,
                             diff_rtol=self.rtol)

        return gx
Example #14
 def init_hx_1(self, xs):
     first = xs[0]
     shape = (1 * self.direction, first.shape[0], self.out_size)
     with cuda.get_device_from_id(self._device_id):
         hx = variable.Variable(self.xp.zeros(shape, dtype=first.dtype))
     return hx
 def setUp(self):
     self.x1 = variable.Variable(np.zeros((1, 2)).astype(np.float32))
     self.x2 = variable.Variable(np.zeros((1, 2)).astype(np.float32))
     self.y = mock_function((self.x1, self.x2), 1)
Example #16
 def backward(self, inputs, grads):
     with function.force_backprop_mode():
         xs = [variable.Variable(x) for x in inputs]
         outs = self._call_func(xs)
         _DummyFunction(grads)(*outs).backward()
     return tuple(x.grad for x in xs)
 def setUp(self):
     self.x1 = variable.Variable(np.zeros((1, 2)).astype(np.float32))
     self.x2 = variable.Variable(np.zeros((1, 2)).astype(np.float32))
     self.y = self.x1 + self.x2
     self.f = self.y.creator
     self.g = c.build_computational_graph((self.y, ))
Example #18
    def apply(self, inputs):
        """Computes output variables and grows the computational graph.

        Basic behavior is expressed in the documentation of
        :class:`FunctionNode`.

        .. note::

           If the :data:`~Variable.data` attributes of the input variables
           exist on a GPU device, that device is made current before calling
           :meth:`forward`, so implementors do not need to take care of device
           selection in most cases.

        Args:
            inputs: Tuple of input variables. Each element can be either
                :class:`Variable`, :class:`numpy.ndarray`,
                or :class:`cupy.ndarray`. If the element is an ndarray, it is
                automatically wrapped with :class:`Variable`.

        Returns:
            A tuple of output :class:`Variable` objects.

        """
        input_vars = [chainer.as_variable(x) for x in inputs]
        in_data = tuple([x.data for x in input_vars])
        requires_grad = any([x.requires_grad for x in input_vars])

        if chainer.is_debug():
            self.stack = traceback.extract_stack()

        if configuration.config.type_check:
            self._check_data_type_forward(in_data)

        hooks = chainer.get_function_hooks()
        if self._n_local_function_hooks > 0:
            hooks = collections.OrderedDict(hooks)
            hooks.update(self.local_function_hooks)
        hooks = hooks.values()  # avoid six for performance

        for hook in hooks:
            hook.forward_preprocess(self, in_data)

        # Forward propagation
        with cuda.get_device_from_array(*in_data):
            self._input_indexes_to_retain = None
            self._output_indexes_to_retain = None
            outputs = self.forward(in_data)
            assert type(outputs) is tuple

        for hook in hooks:
            hook.forward_postprocess(self, in_data)

        # NaN check of output values
        if chainer.is_debug():
            if any(out.dtype.kind == 'f'
                   and cuda.get_array_module(out).isnan(out).any()
                   for out in outputs):
                msg = ('NaN is detected on forward computation of '
                       '{}'.format(self.label))
                raise RuntimeError(msg)

        ret = tuple([
            variable.Variable(y, requires_grad=requires_grad) for y in outputs
        ])

        if configuration.config.enable_backprop:
            # Topological ordering
            self.rank = max([x.rank for x in input_vars]) if input_vars else 0
            # Add backward edges
            for i, y in enumerate(ret):
                y.creator_node = self
            self.inputs = tuple([x.node for x in input_vars])
            # Add forward edges (must be weak references)
            self.outputs = tuple([weakref.ref(y.node) for y in ret])

            if self._input_indexes_to_retain is not None:
                for index in self._input_indexes_to_retain:
                    input_vars[index].retain_data()

            if self._output_indexes_to_retain is not None:
                retained_data = []
                for index in self._output_indexes_to_retain:
                    ret[index].retain_data()
                    retained_data.append(outputs[index])
                self._retained_output_data = tuple(retained_data)

        return ret
 def setUp(self):
     self.x1 = variable.Variable(np.zeros((1, 2)).astype('f'))
     self.x2 = variable.Variable(np.zeros((1, 2)).astype('f'))
     self.y = self.x1 + self.x2
     self.f = self.y.creator
     self.g = c.build_computational_graph((self.y, ), remove_variable=True)
Example #20
def forget(func, *xs):
    """Calls a function without storing intermediate results.

    On a forward propagation, Chainer normally stores all intermediate results
    of :class:`~chainer.variable.VariableNode`\\ s on a computational graph as
    they are required on backward propagation.
    Sometimes these results consume too much memory.
    ``F.forget`` *forgets* such intermediate results on forward propagation,
    and still supports backpropagation with recalculation.

    On a forward propagation, ``F.forget`` calls a given function with given
    variables without creating a computational graph. That means no
    intermediate results are stored.
    On a backward propagation, ``F.forget`` calls the given function again to
    create a computational graph for backpropagation.

    ``F.forget`` reduces internal memory usage, whereas it requires more
    calculation time as it calls the function twice.

    .. admonition:: Example

       Let ``f`` be a function defined as:

       >>> def f(a, b):
       ...   return a + b + a

       and, ``x`` and ``y`` be :class:`~chainer.Variable`\\ s:

       >>> x = chainer.Variable(np.random.uniform(-1, 1, 5).astype(np.float32))
       >>> y = chainer.Variable(np.random.uniform(-1, 1, 5).astype(np.float32))

       When ``z`` is calculated as ``z = f(x, y)``, its intermediate result
       ``x + y`` is stored in memory. Instead, if you call ``f`` with
       ``F.forget``:

       >>> z = F.forget(f, x, y)

       intermediate ``x + y`` is forgotten.

    .. note::

        ``F.forget`` does not support functions which behave differently in
        multiple calls with the same inputs, such as
        :meth:`F.dropout() <chainer.functions.dropout>` and
        :meth:`F.negative_sampling() <chainer.functions.negative_sampling>`.

    .. note::

        If the input arguments are :class:`numpy.ndarray` or
        :class:`cupy.ndarray` objects, they are automatically converted to
        :class:`~chainer.Variable`\\ s. This conversion ensures that this
        function is included in the computational graph so that backward
        computation is possible.

    Args:
        func (callable): A function to call. It needs to be called with
            :class:`~chainer.Variable` object(s) and to return a
            :class:`~chainer.Variable` object or a tuple of
            :class:`~chainer.Variable` objects.
        xs (~chainer.Variable): Argument variables of the function.

    Returns:
        ~chainer.Variable: A variable ``func`` returns. If it returns a tuple,
        the method returns a tuple too.

    """
    xs = tuple(x if isinstance(x, variable.Variable) else
               variable.Variable(x, requires_grad=True) for x in xs)
    y = Forget(func).apply(xs)
    if len(y) == 1:
        y, = y
    return y
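A hedged usage sketch of ``F.forget`` with the ``f(a, b) = a + b + a`` example from the docstring; the intermediate ``a + b`` is recomputed during backward instead of being stored:

# Hedged usage sketch for F.forget.
import numpy as np
import chainer
import chainer.functions as F

def f(a, b):
    return a + b + a

x = chainer.Variable(np.random.uniform(-1, 1, 5).astype(np.float32))
y = chainer.Variable(np.random.uniform(-1, 1, 5).astype(np.float32))
z = F.forget(f, x, y)
z.grad = np.ones(5, dtype=np.float32)
z.backward()                          # x.grad is 2, y.grad is 1 (elementwise)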
    def _directional_numeric_gradients(self, directions, y0_data):
        device = self.device
        func = self.func
        x_data = self.x_data
        y_grad = self.y_grad
        params = self.params
        eps = self.eps
        no_grads = self.no_grads
        dtype = self.dtype
        detect_nondifferentiable = self.detect_nondifferentiable

        xp = device.xp

        x_vars = [variable.Variable(x, requires_grad=False) for x in x_data]
        variables = (
            [x for x, no_grad in six.moves.zip(x_vars, no_grads)
             if not no_grad]
            + list(params))

        if dtype is None:
            casted_data = [x.array for x in variables]
        else:
            if numpy.dtype(dtype).kind != 'f':
                raise ValueError('`dtype` is allowed only float type')
            casted_data = [
                x.array.astype(dtype, copy=False) for x in variables]

            # Even skipped variable must have the same dtype.
            for x, skip in six.moves.zip(x_vars, no_grads):
                if skip and x.array.dtype.kind == 'f':
                    x.array = x.array.astype(dtype, copy=False)

        delta = xp.array(0., numpy.float64)

        def g():
            # This function is called twice in `numerical_grad`.
            # `delta` is `epsilon` or `-epsilon` in these calls.
            # See the document of `numerical_grad`.

            def perturb(data, direction):
                data = (data.astype(numpy.float64)
                        + delta * direction).astype(data.dtype)
                if numpy.isscalar(data):
                    data = xp.array(data)
                return data

            # Input arrays
            g_x_vars = []
            j = 0
            for i in range(len(x_vars)):
                if no_grads[i]:
                    g_x_vars.append(x_vars[i])
                else:
                    data = perturb(casted_data[j], directions[j])
                    g_x_vars.append(variable.Variable(data))
                    j += 1
            # Parameters
            for i in range(len(params)):
                params[i].data = perturb(
                    casted_data[j + i], directions[j + i])

            # Clear gradients to support func that calls backward inside of
            # itself.
            self._clear_grads(g_x_vars)
            self._clear_grads(params)

            ys = func(*g_x_vars)
            ys = _as_tuple(ys)
            ys_data = tuple([None if y is None else y.array for y in ys])
            if xp is chainerx:
                ys_data = tuple([
                    None if y is None else y.as_grad_stopped()
                    for y in ys_data])
            for param, data in six.moves.zip(params, casted_data):
                param.data = data
            return ys_data

        gx, = numerical_grad(
            g, (delta,), y_grad, eps=eps,
            detect_nondifferentiable=detect_nondifferentiable,
            center_outputs=y0_data, diff_atol=0, diff_rtol=self.rtol)

        return gx
Example #22
    def __call__(self, x, active_len, mask, **kwargs):
        """__call__(self, x, active_len, mask, finetune=False)

        Invokes the forward propagation of BatchNormalization.

        In training mode, the BatchNormalization computes moving averages of
        mean and variance for evaluation during training, and normalizes the
        input using batch statistics.

        .. warning::

           ``test`` argument is not supported anymore since v2.
           Instead, use ``chainer.using_config('train', False)``.
           See :func:`chainer.using_config`.

        Args:
            x (Variable): Input variable.
            finetune (bool): If it is in the training mode and ``finetune`` is
                ``True``, BatchNormalization runs in fine-tuning mode; it
                accumulates the input array to compute population statistics
                for normalization, and normalizes the input using batch
                statistics.

        """
        argument.check_unexpected_kwargs(
            kwargs,
            test='test argument is not supported anymore. '
            'Use chainer.using_config')
        finetune, = argument.parse_kwargs(kwargs, ('finetune', False))

        if hasattr(self, 'gamma'):
            gamma = self.gamma
        else:
            with cuda.get_device_from_id(self._device_id):
                gamma = variable.Variable(
                    self.xp.ones(self.avg_mean.shape, dtype=x.dtype))

        if hasattr(self, 'beta'):
            beta = self.beta
        else:
            with cuda.get_device_from_id(self._device_id):
                beta = variable.Variable(
                    self.xp.zeros(self.avg_mean.shape, dtype=x.dtype))

        if configuration.config.train:
            if finetune:
                self.N += 1
                decay = 1. - 1. / self.N
            else:
                decay = self.decay

            ret = func_active_bn.batch_normalization(
                x,
                gamma,
                beta,
                eps=self.eps,
                running_mean=self.avg_mean,
                running_var=self.avg_var,
                decay=decay,
                active_len=active_len,
                mask=mask)
        else:
            # Use running average statistics or fine-tuned statistics.
            mean = variable.Variable(self.avg_mean)
            var = variable.Variable(self.avg_var)
            ret = func_active_bn.fixed_batch_normalization(
                x, gamma, beta, mean, var, self.eps)
        return ret
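A hedged sketch of the stock ``L.BatchNormalization`` call pattern that the custom ``__call__`` above mirrors; note that the ``active_len`` and ``mask`` arguments are specific to the snippet above and are not part of the standard link:

# Hedged sketch: standard BatchNormalization usage in train/eval modes.
import numpy as np
import chainer
import chainer.links as L

bn = L.BatchNormalization(3)
x = chainer.Variable(np.random.randn(8, 3).astype(np.float32))
y_train = bn(x)                       # training mode: uses batch statistics
with chainer.using_config('train', False):
    y_eval = bn(x)                    # evaluation mode: uses moving averages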
Example #23
 def forward(self, inputs):
     self.retain_inputs(tuple(range(len(inputs))))
     with function.no_backprop_mode():
         xs = [variable.Variable(x) for x in inputs]
         outs = _call_func(self.func, xs)
     return tuple(out.data for out in outs)
Example #24
    def __call__(self, *inputs):
        """Applies forward propagation with chaining backward references.

        Basic behavior is expressed in documentation of :class:`Function`
        class.

        .. note::

           If the :data:`~Variable.data` attributes of the input variables
           exist on a GPU device, that device is made current before
           :meth:`forward` is called, so in most cases implementers do not
           need to take care of device selection.

        Args:
            inputs: Tuple of input :class:`Variable`, :class:`numpy.ndarray` or
                :class:`cupy.ndarray` objects. The volatile flags of all input
                variables must agree. If the input is an :class:`numpy.ndarray`
                or a :class:`cupy.ndarray`, it is automatically wrapped with
                :class:`Variable`.

        Returns:
            One :class:`Variable` object or a tuple of multiple
            :class:`Variable` objects.

        """

        inputs = [
            x if isinstance(x, chainer.Variable) else chainer.Variable(
                x, volatile=flag.AUTO) for x in inputs
        ]

        in_data = tuple([x.data for x in inputs])
        if chainer.is_debug():
            self._stack = traceback.extract_stack()

        if self.type_check_enable:
            self._check_data_type_forward(in_data)

        hooks = collections.OrderedDict(chainer.get_function_hooks())
        hooks.update(self.local_function_hooks)
        for hook in six.itervalues(hooks):
            hook.forward_preprocess(self, in_data)
        # Forward prop
        with cuda.get_device(*in_data):
            outputs = self.forward(in_data)
            assert type(outputs) == tuple
        for hook in six.itervalues(hooks):
            hook.forward_postprocess(self, in_data)

        if chainer.is_debug():
            if any(out.dtype.kind == 'f'
                   and cuda.get_array_module(out).isnan(out).any()
                   for out in outputs):
                msg = 'NaN is detected on forward computation'
                raise RuntimeError(msg)

        out_v = flag.aggregate_flags([x.volatile for x in inputs])
        ret = tuple([variable.Variable(y, volatile=out_v) for y in outputs])

        if out_v != 'on':
            # Topological ordering
            self.rank = max([x.rank for x in inputs]) if inputs else 0
            # Backward edges
            for y in ret:
                y.set_creator(self)
            self.inputs = inputs
            # Forward edges (must be weak references)
            self.outputs = tuple([weakref.ref(y) for y in ret])

        if len(ret) == 1:
            return ret[0]
        else:
            return ret
Example #25
def check_backward(func,
                   x_data,
                   y_grad,
                   params=(),
                   eps=1e-3,
                   atol=1e-5,
                   rtol=1e-4,
                   no_grads=None,
                   dtype=None,
                   detect_nondifferentiable=False):
    """Test backward procedure of a given function.

    This function automatically checks the backward process of a given
    function to ensure that the computed gradients are approximately correct.
    For example, assuming you've defined a :class:`~chainer.FunctionNode`
    class ``MyFunc`` that takes two arguments and returns one value, you can
    wrap it in an ordinary function and check its gradient computations as
    follows::

    >> def test_my_func(self):
    >>
    >>     def func(xs):
    >>         y, = MyFunc().apply(xs)
    >>         return y
    >>
    >>     x1_data = xp.array(...)
    >>     x2_data = xp.array(...)
    >>     gy_data = xp.array(...)
    >>     check_backward(func, (x1_data, x2_data), gy_data)

    This method creates :class:`~chainer.Variable` objects with ``x_data``
    and calls ``func`` with the :class:`~chainer.Variable`\\ s to get its
    result as :class:`~chainer.Variable`.
    Then, it sets ``y_grad`` array to ``grad`` attribute of the result and
    calls ``backward`` method to get gradients of the inputs.
    To check the correctness of the gradients, the function calls
    :func:`numerical_grad` to compute the gradients numerically and compares
    them with the backprop gradients using
    :func:`chainer.testing.assert_allclose`.

    To reduce computational time, it uses directional derivative along a
    random vector. A function
    :math:`g: \\mathbb{R} \\rightarrow \\mathbb{R}^n` is defined as
    :math:`g(\\delta) = f(x + \\delta r)`, where
    :math:`\\delta \\in \\mathbb{R}`, :math:`r \\in \\mathbb{R}^n`
    is a random vector
    and :math:`f` is a function which you want to test.
    Its gradient is

    .. math::
       g'(\\delta) = f'(x + \\delta r) \\cdot r.

    Therefore, :math:`g'(0) = f'(x) \\cdot r`.
    So we can check the correctness of back propagation of :math:`f` indirectly
    by comparing this equation with the gradient of :math:`g` numerically
    calculated and that of :math:`f` computed by backprop.
    If :math:`r` is chosen from uniform distribution, we can conclude with
    high probability that the gradient of :math:`f` itself is correct.

    If input objects (``x1_data`` and/or ``x2_data`` in this example) represent
    integer variables, their gradients are ignored.

    You can simplify a test when ``MyFunc`` gets only one argument::

    >>   check_backward(func, x1_data, gy_data)

    If ``MyFunc`` is a loss function which returns a zero-dimensional
    array, pass ``None`` to ``gy_data``. In this case, it sets ``1`` to
    ``grad`` attribute of the result::

    >>   check_backward(my_loss_func, (x1_data, x2_data), None)

    If ``MyFunc`` returns multiple outputs, pass all gradients for outputs
    as a tuple::

    >>   gy1_data = xp.array(...)
    >>   gy2_data = xp.array(...)
    >>   check_backward(func, x1_data, (gy1_data, gy2_data))

    You can also test a :class:`~chainer.Link`.
    To check gradients of parameters of the link, set a tuple of the parameters
    to ``params`` arguments::

    >>   check_backward(my_link, (x1_data, x2_data), gy_data,
    >>                  (my_link.W, my_link.b))

    Note that ``params`` are not ``ndarray``\\ s,
    but :class:`~chainer.Variable`\\ s.

    Function objects are acceptable as ``func`` argument::

    >>   check_backward(lambda x1, x2: f(x1, x2),
    >>                  (x1_data, x2_data), gy_data)

    .. note::

       ``func`` is called many times to get numerical gradients for all inputs.
       This function doesn't work correctly when ``func`` behaves randomly as
       it gets different gradients.


    Args:
        func (callable): A function which gets :class:`~chainer.Variable`\\ s
            and returns :class:`~chainer.Variable`\\ s. ``func`` must return
            a tuple of :class:`~chainer.Variable`\\ s or one
            :class:`~chainer.Variable`. You can use a
            :class:`~chainer.Function`, :class:`~chainer.FunctionNode` or a
            :class:`~chainer.Link` object or any other function satisfying the
            condition.
        x_data (ndarray or tuple of ndarrays): A set of ``ndarray``\\ s to be
            passed to ``func``. If ``x_data`` is one ``ndarray`` object, it is
            treated as ``(x_data,)``.
        y_grad (ndarray or tuple of ndarrays or None):
            A set of ``ndarray``\\ s representing gradients of return-values of
            ``func``. If ``y_grad`` is one ``ndarray`` object, it is
            treated as ``(y_grad,)``. If ``func`` is a loss-function,
            ``y_grad`` should be set to ``None``.
        params (~chainer.Variable or tuple of ~chainer.Variable):
            A set of :class:`~chainer.Variable`\\ s whose gradients are
            checked. When ``func`` is a :class:`~chainer.Link` object,
            set its parameters as ``params``.
            If ``params`` is one :class:`~chainer.Variable` object,
            it is treated as ``(params,)``.
        eps (float): Epsilon value to be passed to :func:`numerical_grad`.
        atol (float): Absolute tolerance to be passed to
            :func:`chainer.testing.assert_allclose`.
        rtol (float): Relative tolerance to be passed to
            :func:`chainer.testing.assert_allclose`.
        no_grads (list of bool): Flags to skip variables in the gradient
            assertion. It should have the same length as ``x_data``.
        dtype (~numpy.dtype): ``x_data``, ``y_grad`` and ``params`` are casted
            to this dtype when calculating numerical gradients. Only float
            types and ``None`` are allowed.
        detect_nondifferentiable (bool):
            If ``True``, a check for non-differentiable inputs is enabled.
            If ``func`` is non-differentiable at ``x_data``, ``check_backward``
            raises :class:`~chainer.gradient_check.NondifferentiableError`.

    .. seealso::
       :func:`numerical_grad`
    """
    if dtype is not None and numpy.dtype(dtype).kind != 'f':
        raise ValueError('`dtype` must be a float type')

    x_data = _as_tuple(x_data)
    if y_grad is not None:
        y_grad = _as_tuple(y_grad)
    params = _as_tuple(params)

    xs = [variable.Variable(x) for x in x_data]
    y = func(*xs)
    y = _as_tuple(y)
    y0_data = [_.data for _ in y]

    # All creators of `y` need to be the same because we only call
    # `y[0].backward` to call `backward` method of the creator.
    # To do so we need to insert a dummy function `Ident` to the
    # computational graph.
    # Note that `func` may not be a `Function` object.
    y = identity.Identity().apply(y)

    y_grad = _set_y_grad(y, y_grad)

    # Clear gradients which may exist if func calls backward inside of itself.
    _clear_grads(xs)
    _clear_grads(params)

    # We only need to call `backward` for one result `Variable`.
    # `Variable.backward` method calls `Function.backward` of its creator.
    y[0].backward()

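    # By default, gradients of non-float (e.g. integer) inputs are not checked.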
    if no_grads is None:
        no_grads = [x.dtype.kind != 'f' for x in xs]
    else:
        if len(no_grads) != len(xs):
            raise ValueError(
                'Length of no_grads param and xs should be the same.\n'
                'Actual: {0} != {1}'.format(len(no_grads), len(xs)))

    for skip, x in six.moves.zip(no_grads, xs):
        if skip:
            if x.grad is not None:
                raise RuntimeError('gradient of int variable must be None')
        else:
            if x.grad is None:
                raise RuntimeError(
                    'gradients of some arguments are not calculated')

    if len(xs) - no_grads.count(True) + len(params) == 0:
        # When there are no float variables, we do not need to check the
        # gradient values.
        return

    variables = _filter_list(xs, no_grads) + list(params)
    # Keep the gradient arrays of params which may be overwritten by func
    grads = [x.grad for x in variables]

    if dtype is None:
        casted_data = [x.data for x in variables]
    else:
        if numpy.dtype(dtype).kind != 'f':
            raise ValueError('`dtype` must be a float type')
        casted_data = [x.data.astype(dtype, copy=False) for x in variables]

        # Even skipped variable must have the same dtype.
        for x, skip in six.moves.zip(xs, no_grads):
            if skip and x.data.dtype.kind == 'f':
                x.data = x.data.astype(dtype, copy=False)

    xp = cuda.get_array_module(*xs)
    directions = [xp.random.normal(size=x.shape) for x in variables]
    # Normalize so that the concatenation of all direction arrays forms a
    # unit vector; `delta` then measures the step length along this direction.
    norm = math.sqrt(sum([xp.square(d).sum() for d in directions]))
    if norm != 0:
        # norm could be zero if input arrays are 0-sized.
        scale = 1. / norm
        directions = [d * scale for d in directions]

    delta = xp.array(0., 'd')

    def g():
        # This function is called twice in `numerical_grad`.
        # `delta` is `epsilon` or `-epsilon` in these calls.
        # See the document of `numerical_grad`.
        for x, data, direction in six.moves.zip(variables, casted_data,
                                                directions):
            # astype is required to store the data with the given type
            data = (data.astype('d') + delta * direction).astype(data.dtype)
            if numpy.isscalar(data):
                data = xp.array(data)
            x.data = data

        # Clear gradients to support func that calls backward inside of itself.
        _clear_grads(xs)
        _clear_grads(params)

        ys = func(*xs)
        ys = _as_tuple(ys)
        ys_data = tuple(y.data for y in ys)
        for x, data in six.moves.zip(variables, casted_data):
            x.data = data
        return ys_data

    gx, = numerical_grad(g, (delta, ),
                         y_grad,
                         eps=eps,
                         detect_nondifferentiable=detect_nondifferentiable,
                         center_outputs=y0_data)
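    # Accumulate the analytic directional derivative f'(x) . r from the
    # backprop gradients; it should match the numeric g'(0) computed above.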
    gx_accum = 0
    for g, direction in six.moves.zip(grads, directions):
        gx_accum += (g.astype('d') * direction).sum()

    try:
        testing.assert_allclose(gx, gx_accum, atol=atol, rtol=rtol)
    except AssertionError as e:
        f = six.StringIO()
        f.write('check_backward failed (eps={} atol={} rtol={})\n'.format(
            eps, atol, rtol))
        for i, x_ in enumerate(xs):
            f.write('inputs[{}]:\n'.format(i))
            f.write('{}\n'.format(x_))
        for i, gy_ in enumerate(y_grad):
            f.write('grad_outputs[{}]:\n'.format(i))
            f.write('{}\n'.format(gy_))
        f.write('gradients (numeric):  {}\n'.format(gx))
        f.write('gradients (backward): {}\n'.format(gx_accum))
        f.write('\n')
        f.write(str(e))
        raise AssertionError(f.getvalue())
Exemple #26
0
    def __call__(self, x, **kwargs):
        """__call__(self, x, finetune=False)

        Invokes the forward propagation of BatchNormalization.

        In training mode, the BatchNormalization computes moving averages of
        mean and variance for evaluation during training, and normalizes the
        input using batch statistics.

        .. warning::

           ``test`` argument is not supported anymore since v2.
           Instead, use ``chainer.using_config('train', False)``.
           See :func:`chainer.using_config`.
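
        For example (a hedged sketch; ``bn`` and ``x`` are hypothetical
        placeholders for a ``BatchNormalization`` link and an input array)::

        >>   with chainer.using_config('train', False):
        >>       y = bn(x)
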
        Args:
            x (Variable): Input variable.
            finetune (bool): If it is in the training mode and ``finetune`` is
                ``True``, BatchNormalization runs in fine-tuning mode; it
                accumulates the input array to compute population statistics
                for normalization, and normalizes the input using batch
                statistics.
        """
        # check argument
        argument.check_unexpected_kwargs(
            kwargs, test='test argument is not supported anymore. '
                         'Use chainer.using_config')
        finetune, = argument.parse_kwargs(kwargs, ('finetune', False))

        original_shape = x.shape
        batch_size = original_shape[0]
        # If the batch contains more than one sample, prepend a singleton
        # batch axis; the statistics and parameters are tiled per sample
        # below to match the reshaped input.
        if batch_size > 1:
            reshaped_x = functions.expand_dims(x, axis=0)
        else:
            reshaped_x = x

        if hasattr(self, 'gamma'):
            gamma = self.gamma
            if self.norm_grad:
                # gamma.add_batch(batch_size)
                gamma.n_batch = batch_size
        else:
            with cuda.get_device_from_id(self._device_id):
                gamma = variable.Variable(self.xp.ones(
                    self.avg_mean.shape, dtype=x.dtype))
        if hasattr(self, 'beta'):
            beta = self.beta
            if self.norm_grad:
                # beta.add_batch(batch_size)
                beta.n_batch = batch_size
        else:
            with cuda.get_device_from_id(self._device_id):
                beta = variable.Variable(self.xp.zeros(
                    self.avg_mean.shape, dtype=x.dtype))

        # Align the shapes of the statistics and parameters if x was reshaped.
        if batch_size > 1:
            mean = self.xp.stack((self.avg_mean,) * batch_size)
            var = self.xp.stack((self.avg_var,) * batch_size)
            gamma = functions.stack((gamma,) * batch_size)
            beta = functions.stack((beta,) * batch_size)
        else:
            mean = self.xp.asarray(self.avg_mean)
            var = self.xp.asarray(self.avg_var)

        if configuration.config.train:
            if finetune:
                self.N += 1
                decay = 1. - 1. / self.N
            else:
                decay = self.decay

            func = batch_normalization.BatchNormalizationFunction(
                self.eps, mean, var, decay)
            ret = func(reshaped_x, gamma, beta)

        else:
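            # Evaluation mode: compute the normalization statistics from the
            # current (reshaped) input and apply fixed batch normalization.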
            head_ndim = gamma.ndim + 1
            axis = (0,) + tuple(range(head_ndim, reshaped_x.ndim))
            mean = reshaped_x.data.mean(axis=axis)
            var = reshaped_x.data.var(axis=axis)
            ret = functions.fixed_batch_normalization(
                reshaped_x, gamma, beta, mean, var, self.eps)

        # ret is normalized input x
        if batch_size > 1:
            ret = functions.reshape(ret, original_shape)
        return ret
Exemple #27
0
def grad(outputs,
         inputs,
         grad_outputs=None,
         grad_inputs=None,
         set_grad=False,
         retain_grad=False,
         enable_double_backprop=False,
         loss_scale=None):
    """Computes the gradient of output variables w.r.t.\\  the input variables.

    This function implements the backpropagation algorithm. While
    :meth:`Variable.backward` also implements backprop, this function selects
    the smallest paths in the computational graph needed to compute the
    gradients w.r.t. inputs. The error is backpropagated only through these
    selected paths, which may reduce the overall computational cost.

    This function also differs from :meth:`Variable.backward` in the way to
    return the gradients; it directly returns the gradient variables as a list
    instead of setting gradients to the :attr:`Variable.grad_var` attribute of
    the original variable. It means users do not need to clear the gradient
    w.r.t. each variable before computing the gradient using this function.
    If ``set_grad`` option is set to ``True``, the computed gradient is also
    stored in the :attr:`Variable.grad_var` attribute of each variable, in
    which case any original value of :attr:`Variable.grad_var` will be updated
    even if it had already been set.

    Args:
        outputs (tuple or list of :class:`~chainer.Variable`):
            A sequence of output variables from which backprop starts.
        inputs (tuple or list of :class:`~chainer.Variable`):
            A sequence of input variables each of which this function computes
            the gradient w.r.t.
        grad_outputs (tuple or list of :class:`~chainer.Variable` or None):
            A sequence of variables that gives the initial value of each output
            gradient.
            If an element is set to ``None``, an array filled with 1 is used.
            If this argument itself is ``None``, it is treated as a sequence of
            ``None``\\ s.
        grad_inputs (tuple or list of :class:`~chainer.Variable` or None):
            A sequence of variables that gives the initial value of each input
            gradient. The gradients computed by the backprop
            algorithm are accumulated to them (not in-place). If an element
            is set to ``None``, the gradient is not accumulated to this value.
            If this argument itself is ``None``, it is treated as a sequence of
            ``None``\\ s.
        set_grad (bool): If it is ``True``, the :attr:`Variable.grad_var`
            attribute of each input variable is set to the corresponding
            computed gradient variable.
        retain_grad (bool): If it is ``True``, the gradients w.r.t. all the
            intermediate variables are stored in the :attr:`Variable.grad_var`
            attribute. In this case, the ``set_grad`` option is ignored.
        enable_double_backprop (bool): If it is ``True``, the computed
            gradients can be further backpropagated. Enabling it may increase
            the memory consumption (and possibly the computational time) to
            remember the intermediate gradient values for the second
            backpropagation.
        loss_scale (float): Loss scaling factor. Loss scaling is a useful
            technique to mitigate the vanishing gradient issue that tends to
            happen when a low-precision data type such as float16 is used
            during training. If a loss scaling factor is set, the gradients of
            loss values are multiplied by the factor before backprop starts.
            The factor is propagated to all gradients in the computational
            graph along the backprop. The gradients of parameters are divided
            by the factor just before the parameters are updated.

    Returns:
        A list of gradient variables w.r.t. the inputs.

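    A minimal usage sketch (``model``, ``x`` and ``t`` are hypothetical
    placeholders, and ``F`` stands for :mod:`chainer.functions`)::

    >>   y = model(x)
    >>   loss = F.softmax_cross_entropy(y, t)
    >>   gx, = grad([loss], [x], enable_double_backprop=True)
    >>   # gx can be backpropagated again because double backprop is enabled.
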
    """
    if not isinstance(outputs, (tuple, list)):
        raise TypeError('outputs must be a tuple or a list, not {}.'.format(
            type(outputs)))
    if not isinstance(inputs, (tuple, list)):
        raise TypeError('inputs must be a tuple or a list, not {}.'.format(
            type(inputs)))
    if not (grad_outputs is None or isinstance(grad_outputs, (tuple, list))):
        raise TypeError(
            'grad_outputs must be a tuple or a list or None, not {}.'.format(
                type(grad_outputs)))
    if not (grad_inputs is None or isinstance(grad_inputs, (tuple, list))):
        raise TypeError(
            'grad_inputs must be a tuple or a list or None, not {}.'.format(
                type(grad_inputs)))

    for v in outputs:
        # Raise error here if v is created by Function.backward.
        # In such case, we don't know exact inputs of the creator.
        v.node._check_old_style_gradient()

    # The implementation consists of three steps.

    # 1. Backward enumeration: all the nodes reachable backward from the output
    #    nodes are enumerated. The forward direction links are collected in
    #    this step. Note that the variable nodes whose requires_grad is false
    #    are ignored and their creators are not searched.
    candidate_funcs = [
        v.creator_node for v in outputs if v.creator_node is not None
    ]
    visited_funcs = set()
    forward_graph = collections.defaultdict(list)
    while candidate_funcs:
        func = candidate_funcs.pop()
        if func in visited_funcs:
            continue
        visited_funcs.add(func)
        for x in func.inputs:
            # Raise error here if x is created by Function.backward.
            # In such case, we don't know exact inputs of the creator.
            x._check_old_style_gradient()

            if not x.requires_grad:
                continue
            forward_graph[x].append(func)
            creator = x.creator_node
            if creator is not None and creator not in visited_funcs:
                candidate_funcs.append(creator)

    # 2. Forward enumeration: all the nodes in the subgraph reachable from the
    #    input nodes are enumerated. The extracted (sub-)subgraph is the union
    #    of all paths that backpropagation will visit.
    candidate_vars = [x.node for x in inputs]
    visited_funcs = set()
    grad_required = set()
    while candidate_vars:
        x = candidate_vars.pop()
        grad_required.add(x)
        for func in forward_graph[x]:
            if func in visited_funcs:
                continue
            visited_funcs.add(func)
            for y_ref in func.outputs:
                y = y_ref()
                if y is not None and y in forward_graph:
                    candidate_vars.append(y)

    # 3. Backpropagation: the backpropagation is executed along the
    #    (sub-)subgraph. It uses the topological order of the subgraph which is
    #    induced by the reversed order of function applications ("rank").
    grads = _backprop_utils.GradTable()

    # Initialize the gradient mapping.
    if grad_outputs is None:
        grad_outputs = (None, ) * len(outputs)
    for y, gy in zip(outputs, grad_outputs):
        if gy is None:
            with cuda.get_device_from_array(y.data) as device:
                if device is cuda.DummyDevice:
                    gy_data = numpy.ones_like(y.data)
                else:
                    gy_data = cuda.cupy.ones_like(y.data)
                gy = variable.Variable(gy_data, requires_grad=False)
            if loss_scale is not None:
                gy.data *= loss_scale
        grads[y.node] = gy

    if grad_inputs is not None:
        for x, gx in zip(inputs, grad_inputs):
            if gx is not None:
                grads[x.node] = gx

    # Backprop implementation. It edits grads which will only contain the
    # gradients w.r.t. the inputs.
    with chainer.using_config('enable_backprop', enable_double_backprop):
        ret_dict = _backprop(outputs, inputs, grad_required, retain_grad,
                             grads, loss_scale)

    # Extract the gradients w.r.t. the inputs and return them.
    ret = [ret_dict[x.node] for x in inputs]
    if set_grad:
        for x, gx in zip(inputs, ret):
            x.grad_var = gx

    return ret
Exemple #28
0
def check_backward(func,
                   x_data,
                   y_grad,
                   params=(),
                   eps=1e-3,
                   atol=1e-5,
                   rtol=1e-4,
                   no_grads=None,
                   dtype=None):
    """Test backward procedure of a given function.

    This function automatically checks the backward process of a given
    function.
    For example, when you have a :class:`~chainer.Function` class ``MyFunc``,
    that gets two arguments and returns one value, you can make its test like
    this::

    >> def test_my_func(self):
    >>   func = MyFunc()
    >>   x1_data = xp.array(...)
    >>   x2_data = xp.array(...)
    >>   gy_data = xp.array(...)
    >>   check_backward(func, (x1_data, x2_data), gy_data)

    This method creates :class:`~chainer.Variable` objects with ``x_data``
    and calls ``func`` with the :class:`~chainer.Variable` s to get its result
    as :class:`~chainer.Variable`.
    Then, it sets ``y_grad`` array to ``grad`` attribute of the result and
    calls ``backward`` method to get gradients of the inputs.
    To check correctness of the gradients, the function calls
    :func:`numerical_grad` to calculate the gradients numerically and compares
    the two kinds of gradients with :func:`chainer.testing.assert_allclose`.
    If input objects (``x1_data`` and/or ``x2_data`` in this example) represent
    integer variables, their gradients are ignored.

    You can simplify a test when ``MyFunc`` gets only one argument::

    >>   check_backward(func, x1_data, gy_data)

    If ``MyFunc`` is a loss function which returns a zero-dimensional
    array, pass ``None`` to ``gy_data``. In this case, the ``grad`` attribute
    of the result is set to ``1``::

    >>   check_backward(my_loss_func, (x1_data, x2_data), None)

    If ``MyFunc`` returns multiple outputs, pass all gradients for outputs
    as a tuple::

    >>   gy1_data = xp.array(...)
    >>   gy2_data = xp.array(...)
    >>   check_backward(func, x1_data, (gy1_data, gy2_data))

    You can also test a :class:`~chainer.Link`.
    To check gradients of parameters of the link, pass a tuple of the
    parameters as the ``params`` argument::

    >>   check_backward(my_link, (x1_data, x2_data), gy_data,
    >>                  (my_link.W, my_link.b))

    Note that ``params`` are not ``ndarray`` s,
    but :class:`~chainer.Variable` s.

    Function objects are acceptable as the ``func`` argument::

    >>   check_backward(lambda x1, x2: f(x1, x2),
    >>                  (x1_data, x2_data), gy_data)

    .. note::

       ``func`` is called many times to get numerical gradients for all inputs.
       This function does not work correctly when ``func`` behaves randomly,
       since it would get different gradients on each call.


    Args:
        func (callable): A function which gets :class:`~chainer.Variable` s
            and returns :class:`~chainer.Variable` s. ``func`` must return
            a tuple of :class:`~chainer.Variable` s or one
            :class:`~chainer.Variable`. You can use :class:`~chainer.Function`
            object, :class:`~chainer.Link` object or a function satisfying the
            condition.
        x_data (ndarray or tuple of ndarrays): A set of ``ndarray`` s to be
            passed to ``func``. If ``x_data`` is one ``ndarray`` object, it is
            treated as ``(x_data,)``.
        y_grad (ndarray or tuple of ndarrays or None):
            A set of ``ndarray`` s representing gradients of return-values of
            ``func``. If ``y_grad`` is one ``ndarray`` object, it is
            treated as ``(y_grad,)``. If ``func`` is a loss-function,
            ``y_grad`` should be set to ``None``.
        params (~chainer.Variable or tuple of ~chainer.Variable):
            A set of :class:`~chainer.Variable` s whose gradients are checked.
            When ``func`` is a :class:`~chainer.Link` object,
            set its parameters as ``params``.
            If ``params`` is one :class:`~chainer.Variable` object,
            it is treated as ``(params,)``.
        eps (float): Epsilon value to be passed to :func:`numerical_grad`.
        atol (float): Absolute tolerance to be passed to
            :func:`chainer.testing.assert_allclose`.
        rtol (float): Relative tolerance to be passed to
            :func:`chainer.testing.assert_allclose`.
        no_grads (list of bool): Flags to skip variables in the gradient
            assertion. It should have the same length as ``x_data``.
        dtype (~numpy.dtype): ``x_data`` and ``y_grad`` are casted to this
            dtype when calculating numerical gradients. Only float types and
            ``None`` are allowed.

    See:
       :func:`numerical_grad`
    """
    x_data = _as_tuple(x_data)
    if y_grad is not None:
        y_grad = _as_tuple(y_grad)
    params = _as_tuple(params)

    xs = [variable.Variable(x) for x in x_data]
    y = func(*xs)
    y = _as_tuple(y)

    # All creators of `y` need to be the same because we only call
    # `y[0].backward` to call `backward` method of the creator.
    # To do so we need to insert a dummy function `Ident` to the
    # computational graph.
    # Note that `func` may not be a `Function` object.
    y = identity.Identity()(*y)
    y = _as_tuple(y)

    if y_grad is not None:
        if len(y) != len(y_grad):
            raise ValueError(
                '`y_grad` must have the same length of output values')
        for iy, igy in six.moves.zip(y, y_grad):
            iy.grad = igy
    else:
        if len(y) != 1:
            raise ValueError(
                'When `y_grad` is `None`, the function must return a '
                'zero-dimensional array')
        y_grad = (1, )

    # We only need to call `backward` for one result `Variable`.
    # `Variable.backward` method calls `Function.backward` of its creator.
    y[0].backward()

    if dtype is None:
        casted_xs = [variable.Variable(x) for x in x_data]
    else:
        if numpy.dtype(dtype).kind != 'f':
            raise ValueError('`dtype` must be a float type')
        if len(params) > 0:
            raise ValueError('`dtype` is available only if `params` is empty')
        casted_xs = [
            variable.Variable(
                x.astype(dtype, copy=False) if x.dtype.kind == 'f' else x)
            for x in x_data
        ]

    def f():
        ys = func(*casted_xs)
        ys = _as_tuple(ys)
        return tuple(y.data for y in ys)

    if no_grads is None:
        no_grads = [x.dtype.kind != 'f' for x in xs]
    else:
        if len(no_grads) != len(xs):
            raise ValueError(
                'Length of no_grads param and xs should be the same.')
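    # Compare the numerical gradient of each float input with the gradient
    # computed by backprop.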
    for skip, x, cx in six.moves.zip(no_grads, xs, casted_xs):
        if skip:
            assert x.grad is None
            continue
        gx, = numerical_grad(f, (cx.data, ), y_grad, eps=eps)
        testing.assert_allclose(gx, x.grad, atol=atol, rtol=rtol)
        if dtype is None:
            assert gx.dtype == x.grad.dtype
        else:
            assert gx.dtype.kind == 'f' and gx.dtype == dtype

    for p in params:
        gp, = numerical_grad(f, (p.data, ), y_grad, eps=eps)
        testing.assert_allclose(gp, p.grad, atol=atol, rtol=rtol)
        assert gp.dtype == p.grad.dtype
Exemple #29
0
def check_backward(func, x_data, y_grad, params=(),
                   eps=1e-3, atol=1e-5, rtol=1e-4, no_grads=None, dtype=None):
    """Test backward procedure of a given function.

    This function automatically checks the backward process of a given
    function.
    For example, when you have a :class:`~chainer.Function` class ``MyFunc``,
    that gets two arguments and returns one value, you can make its test like
    this::

    >> def test_my_func(self):
    >>   func = MyFunc()
    >>   x1_data = xp.array(...)
    >>   x2_data = xp.array(...)
    >>   gy_data = xp.array(...)
    >>   check_backward(func, (x1_data, x2_data), gy_data)

    This method creates :class:`~chainer.Variable` objects with ``x_data``
    and calls ``func`` with the :class:`~chainer.Variable` s to get its result
    as :class:`~chainer.Variable`.
    Then, it sets ``y_grad`` array to ``grad`` attribute of the result and
    calls ``backward`` method to get gradients of the inputs.
    To check correctness of the gradients, the function calls
    :func:`numerical_grad` to calculate the gradients numerically and compares
    the two kinds of gradients with :func:`chainer.testing.assert_allclose`.

    To reduce the computational time, it uses a function
    :math:`g: \\mathbb{R} \\rightarrow \\mathbb{R}^n` defined as
    :math:`g(\\alpha) = f(\\alpha x)`, where :math:`\\alpha \\in \\mathbb{R}`
    and :math:`f` is the function you actually want to test.
    Its gradient is

    .. math::
       g'(\\alpha) = f'(\\alpha x) \\cdot x.

    When :math:`\\alpha = 1`, :math:`g'(1) = f'(x) \\cdot x`.
    So :math:`g'(1)` is calculated with :func:`numerical_grad` and
    compared with the dot product of the gradient of :math:`f` and
    :math:`x`.

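    For intuition, a minimal NumPy sketch of this check (``f``, ``grad_f`` and
    ``x`` are hypothetical placeholders, and a scalar-valued ``f`` is assumed
    for simplicity)::

    >>   import numpy
    >>   g = lambda alpha: f(alpha * x)                   # g(alpha) = f(alpha * x)
    >>   eps = 1e-3
    >>   numeric = (g(1 + eps) - g(1 - eps)) / (2 * eps)  # central difference: g'(1)
    >>   analytic = (grad_f(x) * x).sum()                 # f'(x) . x from backprop
    >>   numpy.testing.assert_allclose(numeric, analytic, atol=1e-5, rtol=1e-4)
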
    If input objects (``x1_data`` and/or ``x2_data`` in this example) represent
    integer variables, their gradients are ignored.

    You can simplify a test when ``MyFunc`` gets only one argument::

    >>   check_backward(func, x1_data, gy_data)

    If ``MyFunc`` is a loss function which returns a zero-dimensional
    array, pass ``None`` to ``gy_data``. In this case, the ``grad`` attribute
    of the result is set to ``1``::

    >>   check_backward(my_loss_func, (x1_data, x2_data), None)

    If ``MyFunc`` returns multiple outputs, pass all gradients for outputs
    as a tuple::

    >>   gy1_data = xp.array(...)
    >>   gy2_data = xp.array(...)
    >>   check_backward(func, x1_data, (gy1_data, gy2_data))

    You can also test a :class:`~chainer.Link`.
    To check gradients of parameters of the link, pass a tuple of the
    parameters as the ``params`` argument::

    >>   check_backward(my_link, (x1_data, x2_data), gy_data,
    >>                  (my_link.W, my_link.b))

    Note that ``params`` are not ``ndarray`` s,
    but :class:`~chainer.Variable` s.

    Function objects are acceptable as the ``func`` argument::

    >>   check_backward(lambda x1, x2: f(x1, x2),
    >>                  (x1_data, x2_data), gy_data)

    .. note::

       ``func`` is called many times to get numerical gradients for all inputs.
       This function does not work correctly when ``func`` behaves randomly,
       since it would get different gradients on each call.


    Args:
        func (callable): A function which gets :class:`~chainer.Variable` s
            and returns :class:`~chainer.Variable` s. ``func`` must return
            a tuple of :class:`~chainer.Variable` s or one
            :class:`~chainer.Variable`. You can use :class:`~chainer.Function`
            object, :class:`~chainer.Link` object or a function satisfying the
            condition.
        x_data (ndarray or tuple of ndarrays): A set of ``ndarray`` s to be
            passed to ``func``. If ``x_data`` is one ``ndarray`` object, it is
            treated as ``(x_data,)``.
        y_grad (ndarray or tuple of ndarrays or None):
            A set of ``ndarray`` s representing gradients of return-values of
            ``func``. If ``y_grad`` is one ``ndarray`` object, it is
            treated as ``(y_grad,)``. If ``func`` is a loss-function,
            ``y_grad`` should be set to ``None``.
        params (~chainer.Variable or tuple of ~chainer.Variable):
            A set of :class:`~chainer.Variable` s whose gradients are checked.
            When ``func`` is a :class:`~chainer.Link` object,
            set its parameters as ``params``.
            If ``params`` is one :class:`~chainer.Variable` object,
            it is treated as ``(params,)``.
        eps (float): Epsilon value to be passed to :func:`numerical_grad`.
        atol (float): Absolute tolerance to be passed to
            :func:`chainer.testing.assert_allclose`.
        rtol (float): Relative tolerance to be passed to
            :func:`chainer.testing.assert_allclose`.
        no_grads (list of bool): Flags to skip variables in the gradient
            assertion. It should have the same length as ``x_data``.
        dtype (~numpy.dtype): ``x_data``, ``y_grad`` and ``params`` are casted
            to this dtype when calculating numerical gradients. Only float
            types and ``None`` are allowed.

    See:
       :func:`numerical_grad`
    """
    x_data = _as_tuple(x_data)
    if y_grad is not None:
        y_grad = _as_tuple(y_grad)
    params = _as_tuple(params)

    xs = [variable.Variable(x) for x in x_data]
    y = func(*xs)
    y = _as_tuple(y)

    # All creators of `y` need to be the same because we only call
    # `y[0].backward` to call `backward` method of the creator.
    # To do so we need to insert a dummy function `Ident` to the
    # computational graph.
    # Note that `func` may not be a `Function` object.
    y = identity.Identity().apply(y)

    y_grad = _set_y_grad(y, y_grad)

    # Clear gradients which may exist if func calls backward inside of itself.
    _clear_grads(xs)
    _clear_grads(params)

    # We only need to call `backward` for one result `Variable`.
    # `Variable.backward` method calls `Function.backward` of its creator.
    y[0].backward()

    param_data = [p.data for p in params]
    if dtype is None:
        casted_xs = [variable.Variable(x) for x in x_data]
    else:
        if numpy.dtype(dtype).kind != 'f':
            raise ValueError('`dtype` must be a float type')
        casted_xs = [variable.Variable(x.astype(dtype, copy=False)
                                       if x.dtype.kind == 'f' else x)
                     for x in x_data]

    if no_grads is None:
        no_grads = [x.dtype.kind != 'f' for x in xs]
    else:
        if len(no_grads) != len(xs):
            raise ValueError(
                'Length of no_grads param and xs should be the same.')
    casted_data = [x.data.copy() for x in casted_xs]
    for skip, x in six.moves.zip(no_grads, xs):
        if skip:
            assert x.grad is None
        else:
            if x.grad is None:
                raise RuntimeError(
                    'gradients of some arguments are not calculated')

    # Keep the gradient arrays of params which may be overwritten by func
    params_grad = [param.grad for param in params]

    xp = cuda.get_array_module(*xs)
    one = xp.array(1., dtype)

    def g():
        # This function is called twice in `numerical_grad`.
        # `one` is `1 + epsilon` or `1 - epsilon` in these calls.
        # See the document of `numerical_grad`.
        for skip, cx, data in six.moves.zip(no_grads, casted_xs, casted_data):
            if skip:
                continue
            # astype is required to store the data with the given type
            data = (one * data).astype(data.dtype)
            if numpy.isscalar(data):
                data = xp.array(data)
            cx.data = data
        for param, data in six.moves.zip(params, param_data):
            if dtype is not None:
                param_dtype = dtype
            else:
                param_dtype = param.dtype
            # The inner astype is required to calculate __mul__ in
            # `param_dtype` when the data is a low-precision float.
            # The outer one is required to store the data with the given type.
            param.data = (one * data.astype(param_dtype)).astype(param_dtype)

        # Clear gradients to support func that calls backward inside of itself.
        _clear_grads(casted_xs)
        _clear_grads(params)

        ys = func(*casted_xs)
        ys = _as_tuple(ys)
        ys_data = tuple(y.data for y in ys)
        for skip, cx, data in six.moves.zip(no_grads, casted_xs, casted_data):
            if skip:
                continue
            cx.data = data
        for param, data in six.moves.zip(params, param_data):
            param.data = data
        return ys_data

    gx, = numerical_grad(g, (one,), y_grad, eps=eps)
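    # Accumulate the analytic g'(1), i.e. the sum of f'(x) . x over the
    # checked inputs and parameters, to compare with the numeric gx above.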
    gx_accum = 0
    for skip, x, cx in six.moves.zip(no_grads, xs, casted_xs):
        if skip:
            continue
        gxi = x.grad.ravel()
        cxi = cx.data.ravel()
        if dtype is not None:
            gxi = gxi.astype(dtype)
            cxi = cxi.astype(dtype)
        gx_accum += gxi.dot(cxi)

    for p, gpi in six.moves.zip(params, params_grad):
        gpi = gpi.ravel()
        pi = p.data.ravel()
        if dtype is not None:
            gpi = gpi.astype(dtype)
            pi = pi.astype(dtype)
        gx_accum += gpi.dot(pi)

    testing.assert_allclose(gx, gx_accum, atol=atol, rtol=rtol)
Exemple #30
0
    def add_branch(self):
        x = self.inputs[0]
        output = variable.Variable(x.data)
        output.set_creator(self)
        self.outputs.append(weakref.ref(output))
        return output