Exemple #1
0
def _backprop(outputs, inputs, grad_required, retain_grad, grads, loss_scale):
    """Backpropagate from ``outputs`` and accumulate gradients into ``grads``.

    Function nodes are popped one at a time from the heap returned by
    ``_get_ordered_func_heap()`` until it is empty.  For each node the
    gradients of its outputs are looked up in ``grads``, its
    ``backward_accumulate`` is called, and the returned input gradients are
    written back to ``grads``.  The creators of the updated inputs are then
    pushed so that the traversal continues towards the leaves.

    Args:
        outputs: Iterable of objects exposing ``creator_node``; their
            non-``None`` creators seed the traversal.
        inputs: Iterable of objects exposing ``node``.  Gradients of these
            nodes are never deleted from ``grads``.
        grad_required: Container of input nodes.  Only function inputs found
            in it are differentiated.
        retain_grad: If true, each computed gradient is also stored in
            ``grad_var`` of the variable attached to the node (when such a
            variable exists), together with ``loss_scale``.
        grads: Mapping from node to gradient.  It is updated in place and is
            the only channel through which results are delivered; the
            function itself returns nothing.
        loss_scale: Value recorded in ``_loss_scale`` of every variable whose
            gradient is retained.
    """
    candidate_funcs, push_candidate, pop_candidate = _get_ordered_func_heap()

    for y in outputs:
        creator = y.creator_node
        if creator is not None:
            push_candidate(creator)

    input_nodes = set(x.node for x in inputs)

    while candidate_funcs:
        func = pop_candidate()

        # Collect the gradients w.r.t. the outputs
        gys = []
        for y_ref in func.outputs:
            y = y_ref()
            if y is None:
                # output is not a part of the selected subgraph and has already
                # been released.
                gys.append(None)
                continue
            gys.append(grads.get(y, None))
        gys = tuple(gys)

        # Collect the gradients w.r.t. the inputs
        #
        # Note (Tokui): when the same variable is passed multiple times as
        # inputs in the same function (e.g. an expression like f(x, x)), the
        # current implementation passes None as the current gradient w.r.t.
        # such an input except for the first one (i.e., it builds gxs like
        # (gx, None) where gx is the current gradient w.r.t. x).
        gxs = []
        input_indexes = []
        selected_inputs = set()
        for i, x in enumerate(func.inputs):
            if x not in grad_required:
                continue
            input_indexes.append(i)
            if x in selected_inputs:
                gxs.append(None)
            else:
                gxs.append(grads.get(x, None))
                selected_inputs.add(x)
        gxs = tuple(gxs)
        input_indexes = tuple(input_indexes)

        # No input of this function needs a gradient: nothing to compute.
        if not input_indexes:
            continue

        # Do backward.  An output gradient stored as a tuple holds addends
        # that have not been summed yet; sum them before use.
        gys = tuple([
            gy if not isinstance(gy, tuple) else chainer.functions.add(*gy)
            for gy in gys
        ])
        new_gxs = func.backward_accumulate(input_indexes, gys, gxs)

        # Delete output gradients that are not required to return
        for y_ref in func.outputs:
            y = y_ref()
            if y is not None and y in grads and y not in input_nodes:
                del grads[y]

        # Update grads
        selected_inputs = set()
        for i, g in zip(input_indexes, new_gxs):
            if g is None:
                continue

            node = func.inputs[i]
            if node in selected_inputs:
                # Accumulate the duplicated gradients here
                cur_gx = grads.get(node, None)
                if cur_gx is not None:
                    if func.lazy_grad_sum:
                        # The check must look at the node being updated,
                        # not at a variable left over from an earlier loop.
                        # Nodes without a creator are summed right away;
                        # for the others the addends are only concatenated
                        # (presumably to be summed later, like ``gys``
                        # above).
                        if node.creator is None:
                            g = _backprop_utils.add(g, cur_gx)
                        else:
                            g = _backprop_utils.concat_variable(g, cur_gx)
                    # cur_gx can't be tuple, the lazy_grad_sum can't
                    # be enabled in its sibling node.
                    else:
                        g = g + cur_gx
            else:
                selected_inputs.add(node)

            grads[node] = g

            if retain_grad:
                v = node.get_variable_or_none()
                if v is not None:
                    v.grad_var = g
                    v._loss_scale = loss_scale

            # Continue the traversal through the function that produced
            # this input, if any.
            creator = node.creator_node
            if creator is not None:
                push_candidate(creator)
Exemple #2
0
    def backward_accumulate(self, target_input_indexes, grad_outputs,
                            grad_inputs):
        """Computes the requested input gradients and accumulates them.

        This default implementation calls :meth:`backward` and merges every
        gradient it returns with the matching entry of ``grad_inputs``.
        Overriding it allows a function to fuse the backward computation
        with the gradient accumulation that is needed when several functions
        are applied to the same variable.

        Either this method or :meth:`backward` has to be overridden.
        Implementing :meth:`backward` is usually simpler and is the
        recommended choice unless efficient accumulation matters.

        Args:
            target_input_indexes (tuple of int): Indices of the input
                variables w.r.t. which the gradients are required. It is
                guaranteed that this tuple contains at least one element.
            grad_outputs (tuple of Variable): Gradients w.r.t. the output
                variables. An element is ``None`` when no gradient is given
                for the corresponding output.
            grad_inputs (tuple of Variable): Gradients w.r.t. the input
                variables selected by ``target_input_indexes``, as computed
                by other computation paths. An element is ``None`` when no
                gradient exists yet for the corresponding input. See also
                the note below.

        Returns:
            Tuple of variables that represent the gradients w.r.t. the
            selected input variables. Unlike :meth:`backward`, the length of
            the tuple **must** be same as that of ``target_input_indexes``.

        Raises:
            ValueError: If the number of gradients returned by
                :meth:`backward` matches neither the number of inputs nor
                the number of target indexes.

        .. note::

           When the same variable is passed to multiple input arguments of a
           function, only the first of the corresponding positions in
           ``grad_inputs`` may hold the gradient variable of that input; the
           remaining positions are ``None``. This is an
           implementation-detail convention that avoids the complication of
           accumulating gradients correctly in such a case, and it might be
           changed in a future version.

        """
        assert isinstance(target_input_indexes, tuple)
        assert isinstance(grad_outputs, tuple)
        assert isinstance(grad_inputs, tuple)

        # Delegate the actual differentiation to backward(); subclasses may
        # override this whole method instead.
        computed = self.backward(target_input_indexes, grad_outputs)

        n_computed = len(computed)
        if n_computed == len(self.inputs):
            # One gradient per input was returned: keep the requested ones.
            computed = tuple([computed[k] for k in target_input_indexes])
        elif n_computed != len(target_input_indexes):
            raise ValueError(
                'number of gradients returned by %s (%s) is incorrect.' %
                (self._impl_name, self.label))

        if not self.lazy_grad_sum:
            # Eager accumulation: add the new and the existing gradient,
            # treating None as "no contribution".
            accumulated = []
            for new_g, prev_g in six.moves.zip(computed, grad_inputs):
                if prev_g is None:
                    accumulated.append(new_g)
                elif new_g is None:
                    accumulated.append(prev_g)
                else:
                    accumulated.append(new_g + prev_g)
            return tuple(accumulated)

        # Lazy accumulation: concatenate the addends, and sum them right
        # away only for inputs that have no creator.
        accumulated = []
        pairs = six.moves.zip(computed, grad_inputs)
        for pos, (new_g, prev_g) in enumerate(pairs):
            merged = _backprop_utils.concat_variable(new_g, prev_g)
            input_node = self.inputs[target_input_indexes[pos]]
            if input_node.creator is None and isinstance(merged, tuple):
                merged = chainer.functions.add(*merged)
            accumulated.append(merged)
        return tuple(accumulated)
Exemple #3
0
    def _backward_main(self, retain_grad):
        """Runs backpropagation starting from this variable.

        Function nodes reachable from ``self.creator_node`` are processed in
        descending order of ``rank`` (ties are broken by the order in which
        the nodes were discovered).  For each node the gradients of its
        outputs are gathered, ``backward_accumulate`` is called, and the
        resulting input gradients are stored both in a local ``grads`` dict
        and on the variables attached to the input nodes.

        The method returns immediately when this variable has no creator.

        Args:
            retain_grad: When false, the gradients of the outputs of each
                processed function (other than this variable's own node) are
                reset to ``None`` once that function has been processed.

        Raises:
            RuntimeError: In debug mode, if a floating-point gradient
                produced by a function contains NaN.
        """
        self._node._check_old_style_gradient()
        if self.creator_node is None:
            return
        # Keep a device object (only when the data is a CuPy array) so that
        # it can be re-activated at the end of every loop iteration.
        initial_device = None
        if cuda.available and isinstance(self.data, cuda.cupy.ndarray):
            try:
                initial_device = cuda.Device()
            except cuda.cupy.cuda.runtime.CUDARuntimeError as e:
                if e.status != 38:  # cudaErrorNoDevice
                    raise

        is_debug = chainer.is_debug()

        cand_funcs = []
        seen_set = set()
        grads = {}

        # Initialize error by 1, if this is a loss variable
        if self.data.size == 1 and self._grad_var is None:
            with cuda.get_device_from_array(self.data) as device:
                if device is cuda.DummyDevice:
                    self.grad = numpy.ones_like(self.data)
                else:
                    self.grad = cuda.cupy.ones_like(self.data)
        grads[self._node] = self._grad_var

        def add_cand(cand):
            # ``seen_set`` ensures that each function node is pushed at most
            # once; its size at insertion time serves as the tie-breaker.
            if cand not in seen_set:
                # Negate since heapq is min-heap
                heapq.heappush(cand_funcs, (-cand.rank, len(seen_set), cand))
                seen_set.add(cand)

        add_cand(self.creator_node)

        def get_grad(node):
            # Prefer the gradient tracked in ``grads``; otherwise fall back
            # to the gradient held by the node itself.
            if node is None:
                return None
            if node in grads:
                return grads[node]
            return node.grad_var

        def set_grad(node, value):
            # Overwrites an existing ``grads`` entry (never creates one) and
            # mirrors the value onto the variable of the node.
            if node is None:
                return None
            if node in grads:
                grads[node] = value
            var = node.get_variable()
            if var is not None:
                var._grad_var = value

        while cand_funcs:
            _, _, func = heapq.heappop(cand_funcs)
            inputs = func.inputs
            # backward_accumulate() is given tuples: its default
            # implementation asserts that its arguments are tuples.
            target_input_indexes = tuple([
                i for i, x in enumerate(inputs) if x.requires_grad
            ])
            if not target_input_indexes:
                continue
            outputs = [y() for y in func.outputs]  # access via weak ref

            in_data = tuple([x.data for x in inputs])
            # An output gradient that is still a tuple of addends has to be
            # summed now, because its value is about to be used by the
            # backward computation.
            if func.lazy_grad_sum:
                for y in outputs:
                    grad = get_grad(y)
                    if isinstance(grad, tuple):
                        grad = chainer.functions.accumulate_add(grad)
                        set_grad(y, grad)
            out_grad = tuple([get_grad(y) for y in outputs])
            out_grad_data = tuple(
                [None if g is None else g.data for g in out_grad])
            # Local hooks are merged into a copy so that the dict returned by
            # get_function_hooks() is not modified.
            hooks = chainer.get_function_hooks()
            if func._n_local_function_hooks != 0:
                hooks = collections.OrderedDict(hooks)
                hooks.update(func.local_function_hooks)
            hooks = hooks.values()  # avoid six for performance

            cuda.get_device_from_array(*in_data).use()
            for hook in hooks:
                hook.backward_preprocess(func, in_data, out_grad_data)

            # Collect the current input gradients.
            #
            # Note (Tokui): When the same variable is passed to multiple input
            # slots (e.g. an expression like ``f(x, x)``), it makes the
            # gradient accumulation complicated since the back-propagated
            # gradients w.r.t. the first and second argument should be
            # accumulated to the current gradient w.r.t. the same variable.
            # In this case, the current implementation passes the current
            # gradient only to the first occurrence of the variable in the
            # input tuple and passes ``None`` to the rest of the occurrences.
            # For example, when the input variables are ``(x, x)``, the
            # input gradient passed to the ``backward_accumulate`` method is
            # ``(gx, None)`` where ``gx`` is the current gradient of ``x``.
            # See also the docstring of ``FunctionNode.backward_accumulate``.
            target_inputs = [inputs[i] for i in target_input_indexes]
            in_grad = []
            for i, index_i in enumerate(target_input_indexes):
                x = inputs[index_i]
                if x in target_inputs[:i]:
                    # Pass ``None`` for duplicated input variables except for
                    # the first occurrence (see the comment above).
                    gx = None
                elif x in grads:
                    gx = grads[x]
                elif x.creator_node is None:
                    x._check_old_style_gradient()
                    # accumulate the gradient only if the node is a leaf
                    gx = x.grad_var
                else:
                    gx = None
                in_grad.append(gx)
            in_grad = tuple(in_grad)

            gxs = func.backward_accumulate(target_input_indexes, out_grad,
                                           in_grad)

            assert len(gxs) == len(in_grad)
            for hook in hooks:
                hook.backward_postprocess(func, in_data, out_grad_data)

            if is_debug:
                # Fail fast when a floating-point gradient contains NaN.
                for gx in gxs:
                    if gx is None:
                        continue
                    gx_data = gx.data
                    if gx_data.dtype.kind == 'f':
                        cuda.get_device_from_array(gx_data).use()
                        if cuda.get_array_module(gx_data).isnan(gx_data).any():
                            raise RuntimeError(
                                'NaN is detected on backward computation of '
                                '{}'.format(func.label))

            if not retain_grad:
                # Drop the gradients of this function's outputs, except the
                # one that belongs to the starting variable.
                for y in outputs:
                    if y is not None and y is not self.node:
                        grads[y] = None
                        y_var = y.get_variable_or_none()
                        if y_var is not None:
                            y_var._grad_var = None

            for i, gx in enumerate(gxs):
                if gx is None:
                    continue

                x = target_inputs[i]
                if not x.requires_grad:
                    continue

                if isinstance(gx, tuple):
                    # No need to check each element of the tuple; checking
                    # the new gradient concatenated in backward_accumulate()
                    # is enough.
                    _check_grad_type(func, x, gx[0].data)
                else:
                    _check_grad_type(func, x, gx.data)

                if x in target_inputs[:i]:
                    # Accumulate the duplicated gradients here. See the comment
                    # above the code that builds ``in_grad``.
                    cur_gx = grads[x]
                    if func.lazy_grad_sum:
                        if x.creator is None:
                            gx = _backprop_utils.concat_variable(gx, cur_gx)
                            gx = chainer.functions.accumulate_add(gx)
                            grads[x] = gx
                        else:
                            grads[x] = _backprop_utils.concat_variable(
                                gx, cur_gx)
                    else:
                        grads[x] = gx if cur_gx is None else gx + cur_gx

                else:
                    grads[x] = gx

                x_var = x.get_variable_or_none()
                if x_var is not None:
                    x_var._grad_var = grads[x]

                # Continue through the function that produced this input.
                if x.creator_node is not None:
                    add_cand(x.creator_node)

            del gxs  # to reduce memory usage
            if initial_device is not None:
                initial_device.use()
Exemple #4
0
def _backprop(outputs, inputs, grad_required, retain_grad, grads, loss_scale):
    """Backpropagate from ``outputs`` and accumulate gradients into ``grads``.

    Function nodes are popped one at a time from the heap returned by
    ``_get_ordered_func_heap()`` until it is empty.  For each node the
    gradients of its outputs are looked up in ``grads``, the backward hooks
    are invoked around a call to its ``backward_accumulate``, and the
    returned input gradients are written back to ``grads``.  The creators of
    the updated inputs are then pushed so that the traversal continues
    towards the leaves.

    Args:
        outputs: Iterable of objects exposing ``creator_node``; their
            non-``None`` creators seed the traversal.
        inputs: Iterable of objects exposing ``node``.  Gradients of these
            nodes are never deleted from ``grads``.
        grad_required: Container of input nodes.  Only function inputs found
            in it are differentiated.
        retain_grad: If true, each computed gradient is also stored in
            ``grad_var`` of the variable attached to the node (when such a
            variable exists), together with ``loss_scale``.
        grads: Mapping from node to gradient.  It is updated in place and is
            the only channel through which results are delivered; the
            function itself returns nothing.
        loss_scale: Value recorded in ``_loss_scale`` of every variable whose
            gradient is retained.
    """
    candidate_funcs, push_candidate, pop_candidate = _get_ordered_func_heap()

    for y in outputs:
        creator = y.creator_node
        if creator is not None:
            push_candidate(creator)

    input_nodes = set(x.node for x in inputs)

    while candidate_funcs:
        func = pop_candidate()

        # Collect the gradients w.r.t. the outputs
        gys = []
        for y_ref in func.outputs:
            y = y_ref()
            if y is None:
                # output is not a part of the selected subgraph and has already
                # been released.
                gys.append(None)
                continue
            gys.append(grads.get(y, None))
        gys = tuple(gys)

        # Collect the gradients w.r.t. the inputs
        #
        # Note (Tokui): when the same variable is passed multiple times as
        # inputs in the same function (e.g. an expression like f(x, x)), the
        # current implementation passes None as the current gradient w.r.t.
        # such an input except for the first one (i.e., it builds gxs like
        # (gx, None) where gx is the current gradient w.r.t. x).
        gxs = []
        input_indexes = []
        selected_inputs = set()
        for i, x in enumerate(func.inputs):
            if x not in grad_required:
                continue
            input_indexes.append(i)
            if x in selected_inputs:
                gxs.append(None)
            else:
                gxs.append(grads.get(x, None))
                selected_inputs.add(x)
        gxs = tuple(gxs)
        input_indexes = tuple(input_indexes)

        # No input of this function needs a gradient: nothing to compute.
        if not input_indexes:
            continue

        # Do backward.  An output gradient stored as a tuple holds addends
        # that have not been summed yet; sum them before use.
        gys = tuple([gy if not isinstance(gy, tuple) else
                     chainer.functions.add(*gy)
                     for gy in gys])

        # Call pre-backward hooks.  Local hooks are merged into a copy so
        # that the dict returned by get_function_hooks() is not modified.
        hooks = chainer.get_function_hooks()
        if func._n_local_function_hooks != 0:
            hooks = collections.OrderedDict(hooks)
            hooks.update(func.local_function_hooks)
        hooks = hooks.values()  # avoid six for performance

        in_data = tuple([x.data for x in func.inputs])
        out_grad_data = tuple(
            [None if g is None else g.data for g in gys])
        cuda.get_device_from_array(*in_data).use()

        for hook in hooks:
            hook.backward_preprocess(func, in_data, out_grad_data)

        new_gxs = func.backward_accumulate(input_indexes, gys, gxs)

        # Call post-backward hooks
        for hook in hooks:
            hook.backward_postprocess(func, in_data, out_grad_data)

        # Delete output gradients that are not required to return
        for y_ref in func.outputs:
            y = y_ref()
            if y is not None and y in grads and y not in input_nodes:
                del grads[y]

        # Update grads
        selected_inputs = set()
        for i, g in zip(input_indexes, new_gxs):
            if g is None:
                continue

            node = func.inputs[i]
            if node in selected_inputs:
                # Accumulate the duplicated gradients here
                cur_gx = grads.get(node, None)
                if cur_gx is not None:
                    if func.lazy_grad_sum:
                        # The check must look at the node being updated,
                        # not at a variable left over from an earlier loop.
                        # Nodes without a creator are summed right away;
                        # for the others the addends are only concatenated
                        # (presumably to be summed later, like ``gys``
                        # above).
                        if node.creator is None:
                            g = _backprop_utils.add(g, cur_gx)
                        else:
                            g = _backprop_utils.concat_variable(g, cur_gx)
                    # cur_gx can't be tuple, the lazy_grad_sum can't
                    # be enabled in its sibling node.
                    else:
                        g = g + cur_gx
            else:
                selected_inputs.add(node)

            grads[node] = g

            if retain_grad:
                v = node.get_variable_or_none()
                if v is not None:
                    v.grad_var = g
                    v._loss_scale = loss_scale

            # Continue the traversal through the function that produced
            # this input, if any.
            creator = node.creator_node
            if creator is not None:
                push_candidate(creator)
Exemple #5
0
    def backward_accumulate(self, target_input_indexes, grad_outputs,
                            grad_inputs):
        """Computes the requested input gradients and accumulates them.

        This default implementation calls :meth:`backward` and merges every
        gradient it returns with the matching entry of ``grad_inputs``.
        Overriding it allows a function to fuse the backward computation
        with the gradient accumulation that is needed when several functions
        are applied to the same variable.

        Either this method or :meth:`backward` has to be overridden.
        Implementing :meth:`backward` is usually simpler and is the
        recommended choice unless efficient accumulation matters.

        Args:
            target_input_indexes (tuple of int): Indices of the input
                variables w.r.t. which the gradients are required. It is
                guaranteed that this tuple contains at least one element.
            grad_outputs (tuple of Variable): Gradients w.r.t. the output
                variables. An element is ``None`` when no gradient is given
                for the corresponding output.
            grad_inputs (tuple of Variable): Gradients w.r.t. the input
                variables selected by ``target_input_indexes``, as computed
                by other computation paths. An element is ``None`` when no
                gradient exists yet for the corresponding input. See also
                the note below.

        Returns:
            Tuple of variables that represent the gradients w.r.t. the
            selected input variables. Unlike :meth:`backward`, the length of
            the tuple **must** be same as that of ``target_input_indexes``.

        Raises:
            ValueError: If the number of gradients returned by
                :meth:`backward` matches neither the number of inputs nor
                the number of target indexes.

        .. note::

           When the same variable is passed to multiple input arguments of a
           function, only the first of the corresponding positions in
           ``grad_inputs`` may hold the gradient variable of that input; the
           remaining positions are ``None``. This is an
           implementation-detail convention that avoids the complication of
           accumulating gradients correctly in such a case, and it might be
           changed in a future version.

        """
        assert isinstance(target_input_indexes, tuple)
        assert isinstance(grad_outputs, tuple)
        assert isinstance(grad_inputs, tuple)

        # Delegate the actual differentiation to backward(); subclasses may
        # override this whole method instead.
        raw_grads = self.backward(target_input_indexes, grad_outputs)

        returned = len(raw_grads)
        if returned == len(self.inputs):
            # One gradient per input was returned: keep the requested ones.
            raw_grads = tuple([raw_grads[idx] for idx in target_input_indexes])
        elif returned != len(target_input_indexes):
            raise ValueError(
                'number of gradients returned by %s (%s) is incorrect.'
                % (self._impl_name, self.label))

        if not self.lazy_grad_sum:
            # Eager accumulation: add the new and the existing gradient,
            # treating None as "no contribution".
            summed = []
            for fresh, existing in six.moves.zip(raw_grads, grad_inputs):
                if existing is None:
                    summed.append(fresh)
                elif fresh is None:
                    summed.append(existing)
                else:
                    summed.append(fresh + existing)
            return tuple(summed)

        # Lazy accumulation: concatenate the addends, and sum them right
        # away only for inputs that have no creator.
        collected = []
        paired = six.moves.zip(raw_grads, grad_inputs)
        for position, (fresh, existing) in enumerate(paired):
            combined = _backprop_utils.concat_variable(fresh, existing)
            source = self.inputs[target_input_indexes[position]]
            if source.creator is None and isinstance(combined, tuple):
                combined = chainer.functions.add(*combined)
            collected.append(combined)
        return tuple(collected)
Exemple #6
0
    def _backward_main(self, retain_grad, loss_scale):
        """Runs backpropagation starting from this variable.

        Function nodes reachable from ``self.creator_node`` are processed in
        descending order of ``rank`` (ties are broken by the order in which
        the nodes were discovered).  For each node the gradients of its
        outputs are gathered, ``backward_accumulate`` is called, and the
        resulting input gradients are stored both in a local ``grads`` dict
        and on the variables attached to the input nodes.

        The method returns immediately when this variable has no creator.

        Args:
            retain_grad: When false, the gradients of the outputs of each
                processed function (other than this variable's own node) are
                reset to ``None`` once that function has been processed.
            loss_scale: If not ``None`` and the initial gradient is created
                here, that gradient is multiplied by this value.  It is also
                recorded in ``_loss_scale`` of every variable that receives a
                gradient.

        Raises:
            RuntimeError: In debug mode, if a floating-point gradient
                produced by a function contains NaN.
        """
        self._node._check_old_style_gradient()
        if self.creator_node is None:
            return
        # Keep a device object (only when the data is a ``cuda.ndarray``) so
        # that it can be re-activated at the end of every loop iteration.
        initial_device = None
        if cuda.available and isinstance(self.data, cuda.ndarray):
            try:
                initial_device = cuda.Device()
            except cuda.cupy.cuda.runtime.CUDARuntimeError as e:
                if e.status != 38:  # cudaErrorNoDevice
                    raise

        is_debug = chainer.is_debug()

        cand_funcs = []
        seen_set = set()
        grads = {}

        # Initialize error by 1, if this is a loss variable
        if self.data.size == 1 and self._grad_var is None:
            if self.data.ndim != 0:
                warnings.warn(
                    'Treating a scalar as a variable with only one element'
                    ' in Variable.backward is deprecated. A scalar variable'
                    ' must be a 0-dimensional array. Apply'
                    ' chainer.functions.squeeze to obtain a scalar variable.'
                    ' If the size of this variable accidentally becomes one,'
                    ' set zero to grad.',
                    DeprecationWarning)
            with cuda.get_device_from_array(self.data) as device:
                if device is cuda.DummyDevice:
                    self.grad = numpy.ones_like(self.data)
                else:
                    self.grad = cuda.cupy.ones_like(self.data)
            # Scale the freshly created initial gradient when requested.
            if loss_scale is not None:
                self.grad *= loss_scale
        grads[self._node] = self._grad_var

        def add_cand(cand):
            # ``seen_set`` ensures that each function node is pushed at most
            # once; its size at insertion time serves as the tie-breaker.
            if cand not in seen_set:
                # Negate since heapq is min-heap
                heapq.heappush(cand_funcs, (-cand.rank, len(seen_set), cand))
                seen_set.add(cand)

        add_cand(self.creator_node)

        def get_grad(node):
            # Prefer the gradient tracked in ``grads``; otherwise fall back
            # to the gradient held by the node itself.
            if node is None:
                return None
            if node in grads:
                return grads[node]
            return node.grad_var

        def set_grad(node, value):
            # Overwrites an existing ``grads`` entry (never creates one) and
            # mirrors the value onto the variable of the node.
            if node is None:
                return
            if node in grads:
                grads[node] = value
            var = node.get_variable()
            if var is not None:
                var._grad_var = value

        while cand_funcs:
            _, _, func = heapq.heappop(cand_funcs)
            inputs = func.inputs
            # Positions of the inputs that require a gradient; a function
            # with none is skipped entirely.
            target_input_indexes = tuple([
                i for i, x in enumerate(inputs) if x.requires_grad
            ])
            if not target_input_indexes:
                continue
            outputs = [y() for y in func.outputs]  # access via weak ref

            in_data = tuple([x.data for x in inputs])
            # An output gradient that is still a tuple of addends has to be
            # summed now, because its value is about to be used by the
            # backward computation.
            for y in outputs:
                grad = get_grad(y)
                if isinstance(grad, tuple):
                    grad = chainer.functions.add(*grad)
                    set_grad(y, grad)
            out_grad = tuple([get_grad(y) for y in outputs])
            out_grad_data = tuple(
                [None if g is None else g.data for g in out_grad])
            # Local hooks are merged into a copy so that the dict returned by
            # get_function_hooks() is not modified.
            hooks = chainer.get_function_hooks()
            if func._n_local_function_hooks != 0:
                hooks = collections.OrderedDict(hooks)
                hooks.update(func.local_function_hooks)
            hooks = hooks.values()  # avoid six for performance

            cuda.get_device_from_array(*in_data).use()
            for hook in hooks:
                hook.backward_preprocess(func, in_data, out_grad_data)

            # Collect the current input gradients.
            #
            # Note (Tokui): When the same variable is passed to multiple input
            # slots (e.g. an expression like ``f(x, x)``), it makes the
            # gradient accumulation complicated since the back-propagated
            # gradients w.r.t. the first and second argument should be
            # accumulated to the current gradient w.r.t. the same variable.
            # In this case, the current implementation passes the current
            # gradient only to the first occurrence of the variable in the
            # input tuple and passes ``None`` to the rest of the occurrences.
            # For example, when the input variables are ``(x, x)``, the
            # input gradient passed to the ``backward_accumulate`` method is
            # ``(gx, None)`` where ``gx`` is the current gradient of ``x``.
            # See also the docstring of ``FunctionNode.backward_accumulate``.
            target_inputs = [inputs[i] for i in target_input_indexes]
            in_grad = []
            for i, index_i in enumerate(target_input_indexes):
                x = inputs[index_i]
                if x in target_inputs[:i]:
                    # Pass ``None`` for duplicated input variables except for
                    # the first occurrence (see the comment above).
                    gx = None
                elif x in grads:
                    gx = grads[x]
                elif x.creator_node is None:
                    x._check_old_style_gradient()
                    # accumulate the gradient only if the node is a leaf
                    gx = x.grad_var
                else:
                    gx = None
                in_grad.append(gx)
            in_grad = tuple(in_grad)

            gxs = func.backward_accumulate(
                target_input_indexes, out_grad, in_grad)

            assert len(gxs) == len(in_grad)
            for hook in hooks:
                hook.backward_postprocess(func, in_data, out_grad_data)

            if is_debug:
                # Fail fast when a floating-point gradient contains NaN.
                for gx in gxs:
                    if gx is None:
                        continue
                    gx_data = gx.data
                    if gx_data.dtype.kind == 'f':
                        cuda.get_device_from_array(gx_data).use()
                        if cuda.get_array_module(gx_data).isnan(gx_data).any():
                            raise RuntimeError(
                                'NaN is detected on backward computation of '
                                '{}'.format(func.label))

            if not retain_grad:
                # Drop the gradients of this function's outputs, except the
                # one that belongs to the starting variable.
                for y in outputs:
                    if y is not None and y is not self.node:
                        grads[y] = None
                        y_var = y.get_variable_or_none()
                        if y_var is not None:
                            y_var._grad_var = None

            for i, gx in enumerate(gxs):
                if gx is None:
                    continue

                x = target_inputs[i]
                if not x.requires_grad:
                    continue

                if isinstance(gx, tuple):
                    # No need to check each element of the tuple; checking
                    # the new gradient concatenated in backward_accumulate()
                    # is enough.
                    _check_grad_type(func, x, gx[0].data)
                else:
                    _check_grad_type(func, x, gx.data)

                if x in target_inputs[:i]:
                    # Accumulate the duplicated gradients here. See the comment
                    # above the code that builds ``in_grad``.
                    cur_gx = grads[x]
                    if func.lazy_grad_sum:
                        # Inputs without a creator are summed right away;
                        # for the others the addends are only concatenated.
                        if x.creator is None:
                            gx = _backprop_utils.add(gx, cur_gx)
                            grads[x] = gx
                        else:
                            grads[x] = _backprop_utils.concat_variable(
                                gx, cur_gx)
                    else:
                        grads[x] = gx if cur_gx is None else gx + cur_gx

                else:
                    grads[x] = gx

                # Mirror the gradient (and the loss scale) onto the variable
                # attached to the node, when one exists.
                x_var = x.get_variable_or_none()
                if x_var is not None:
                    x_var._grad_var = grads[x]
                    x_var._loss_scale = loss_scale

                # Continue through the function that produced this input.
                if x.creator_node is not None:
                    add_cand(x.creator_node)

            del gxs  # to reduce memory usage
            if initial_device is not None:
                initial_device.use()