Example 1
def grad(outputs,
         inputs,
         grad_outputs=None,
         grad_inputs=None,
         set_grad=False,
         retain_grad=False,
         enable_double_backprop=False,
         loss_scale=None):
    """Computes the gradient of output variables w.r.t.\\  the input variables.

    This function implements the backpropagation algorithm. While
    :meth:`Variable.backward` also implements backprop, this function selects
    the smallest paths in the computational graph needed to compute the
    gradients w.r.t. inputs. The error is backpropagated only through these
    selected paths, which may reduce the overall computational cost.

    This function also differs from :meth:`Variable.backward` in the way it
    returns the gradients: it directly returns the gradient variables as a
    list instead of setting gradients to the :attr:`Variable.grad_var`
    attribute of the original variables. This means users do not need to
    clear the gradient w.r.t. each variable before computing the gradient
    with this function. If the ``set_grad`` option is set to ``True``, the
    computed gradient is also stored in the :attr:`Variable.grad_var`
    attribute of each variable, in which case any original value of
    :attr:`Variable.grad_var` will be updated even if it had already been set.

    Args:
        outputs (tuple or list of :class:`~chainer.Variable`):
            A sequence of output variables from which backprop starts.
        inputs (tuple or list of :class:`~chainer.Variable`):
            A sequence of input variables with respect to which this function
            computes the gradients.
        grad_outputs (tuple or list of :class:`~chainer.Variable` or None):
            A sequence of variables that gives the initial value of each output
            gradient.
            If an element is set to ``None``, an array filled with 1 is used.
            If this argument itself is ``None``, it is treated as a sequence of
            ``None``\\ s.
        grad_inputs (tuple or list of :class:`~chainer.Variable` or None):
            A sequence of variables that gives the initial value of each input
            gradient. The gradients computed by the backprop
            algorithm are accumulated to them (not in-place). If an element
            is set to ``None``, the gradient is not accumulated to this value.
            If this argument itself is ``None``, it is treated as a sequence of
            ``None``\\ s.
        set_grad (bool): If it is ``True``, the :attr:`Variable.grad_var`
            attribute of each input variable is set to the corresponding
            computed gradient variable.
        retain_grad (bool): If it is ``True``, the gradients w.r.t. all the
            intermediate variables are stored in the :attr:`Variable.grad_var`
            attribute. In this case, the ``set_grad`` option is ignored.
        enable_double_backprop (bool): If it is ``True``, the computed
            gradients can be further backpropagated. Enabling it may increase
            the memory consumption (and possibly the computational time) to
            remember the intermediate gradient values for the second
            backpropagation.
        loss_scale (float): Loss scaling factor. Loss scaling is a useful
            technique to mitigate the vanishing gradient issue that tends to
            happen when a low precision data type like float16 is used during
            training. If you set a loss scaling factor, gradients of loss
            values are multiplied by the factor before backprop starts. The
            factor is propagated to all gradients in the computational graph
            along the backprop. The gradients of parameters are divided by
            the factor just before the parameters are updated.

    Returns:
        A list of gradient variables w.r.t. the inputs.

    """
    if not isinstance(outputs, (tuple, list)):
        raise TypeError('outputs must be a tuple or a list, not {}.'.format(
            type(outputs)))
    if not isinstance(inputs, (tuple, list)):
        raise TypeError('inputs must be a tuple or a list, not {}.'.format(
            type(inputs)))
    if not (grad_outputs is None or isinstance(grad_outputs, (tuple, list))):
        raise TypeError(
            'grad_outputs must be a tuple or a list or None, not {}.'.format(
                type(grad_outputs)))
    if not (grad_inputs is None or isinstance(grad_inputs, (tuple, list))):
        raise TypeError(
            'grad_inputs must be a tuple or a list or None, not {}.'.format(
                type(grad_inputs)))

    for v in outputs:
        # Raise an error here if v was created by Function.backward.
        # In such a case, we don't know the exact inputs of the creator.
        v.node._check_old_style_gradient()

    # The implementation consists of three steps.

    # 1. Backward enumeration: all the nodes reachable backward from the output
    #    nodes are enumerated. The forward direction links are collected in
    #    this step. Note that the variable nodes whose requires_grad is false
    #    are ignored and their creators are not searched.
    candidate_funcs = [
        v.creator_node for v in outputs if v.creator_node is not None
    ]
    visited_funcs = set()
    forward_graph = collections.defaultdict(list)
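    # forward_graph maps each variable node to the functions that consume it,
    # i.e. the forward edges followed by the forward enumeration in step 2.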
    while candidate_funcs:
        func = candidate_funcs.pop()
        if func in visited_funcs:
            continue
        visited_funcs.add(func)
        for x in func.inputs:
            # Raise an error here if x was created by Function.backward.
            # In such a case, we don't know the exact inputs of the creator.
            x._check_old_style_gradient()

            if not x.requires_grad:
                continue
            forward_graph[x].append(func)
            creator = x.creator_node
            if creator is not None and creator not in visited_funcs:
                candidate_funcs.append(creator)

    # 2. Forward enumeration: all the nodes in the subgraph reachable from the
    #    input nodes are enumerated. The extracted (sub-)subgraph is the union
    #    of all paths that backpropagation will visit.
    candidate_vars = [x.node for x in inputs]
    visited_funcs = set()
    grad_required = set()
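    # grad_required collects the variable nodes reachable forward from the
    # inputs; these are the nodes whose gradients backprop has to compute.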
    while candidate_vars:
        x = candidate_vars.pop()
        grad_required.add(x)
        for func in forward_graph[x]:
            if func in visited_funcs:
                continue
            visited_funcs.add(func)
            for y_ref in func.outputs:
                y = y_ref()
                if y is not None and y in forward_graph:
                    candidate_vars.append(y)

    # 3. Backpropagation: the backpropagation is executed along the
    #    (sub-)subgraph. It uses the topological order of the subgraph which is
    #    induced by the reversed order of function applications ("rank").
    grads = _backprop_utils.GradTable()

    # Initialize the gradient mapping.
    if grad_outputs is None:
        grad_outputs = (None, ) * len(outputs)
    for y, gy in zip(outputs, grad_outputs):
        if gy is None:
            with cuda.get_device_from_array(y.data) as device:
                if device is cuda.DummyDevice:
                    gy_data = numpy.ones_like(y.data)
                else:
                    gy_data = cuda.cupy.ones_like(y.data)
                gy = variable.Variable(gy_data, requires_grad=False)
            if loss_scale is not None:
                gy.data *= loss_scale
        grads[y.node] = gy

    if grad_inputs is not None:
        for x, gx in zip(inputs, grad_inputs):
            if gx is not None:
                grads[x.node] = gx

    # Backprop implementation. It edits grads, which will end up holding only
    # the gradients w.r.t. the inputs.
    with chainer.using_config('enable_backprop', enable_double_backprop):
        ret_dict = _backprop(outputs, inputs, grad_required, retain_grad,
                             grads, loss_scale)

    # Extract the gradients w.r.t. the inputs and return them.
    ret = [ret_dict[x.node] for x in inputs]
    if set_grad:
        for x, gx in zip(inputs, ret):
            x.grad_var = gx

    return ret
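
A minimal usage sketch of the grad() function defined above. The toy graph, the use of chainer.functions.sum, and the printed values are illustrative assumptions, not part of the original source:

import numpy
import chainer
import chainer.functions as F

# Toy data (illustrative only): y = sum(x ** 2), so dy/dx = 2 * x.
x = chainer.Variable(numpy.array([1., 2., 3.], dtype=numpy.float32))
y = F.sum(x * x)

# grad() returns the gradients as a list instead of writing them into
# x.grad_var; pass set_grad=True to also store them on the variables.
gx, = chainer.grad([y], [x])
print(gx.array)  # [2. 4. 6.]
print(x.grad)    # None, because set_grad defaults to False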
Example 2
    def _backward_main(self, retain_grad, loss_scale):
        self._node._check_old_style_gradient()
        if self.creator_node is None:
            return
        initial_device = None
        if cuda.available and isinstance(self.data, cuda.ndarray):
            try:
                initial_device = cuda.Device()
            except cuda.cupy.cuda.runtime.CUDARuntimeError as e:
                if e.status != 38:  # cudaErrorNoDevice
                    raise

        is_debug = chainer.is_debug()

        cand_funcs = []
        seen_set = set()
        grads = _backprop_utils.GradTable(load_if_new=True)

        # Initialize the error with ones if this is a loss variable
        if self.data.size == 1 and self._grad_var is None:
            if self.data.ndim != 0:
                warnings.warn(
                    'Treating a variable with only one element as a scalar'
                    ' in Variable.backward is deprecated. A scalar variable'
                    ' must be a 0-dimensional array. Apply'
                    ' chainer.functions.squeeze to obtain a scalar variable.'
                    ' If the size of this variable accidentally becomes one,'
                    ' set zero to grad.', DeprecationWarning)
            with cuda.get_device_from_array(self.data) as device:
                if device is cuda.DummyDevice:
                    self.grad = numpy.ones_like(self.data)
                else:
                    self.grad = cuda.cupy.ones_like(self.data)
            if loss_scale is not None:
                self.grad *= loss_scale
        grads[self._node] = self._grad_var

        def add_cand(cand):
            if cand not in seen_set:
                # Negate since heapq is min-heap
                heapq.heappush(cand_funcs, (-cand.rank, len(seen_set), cand))
                seen_set.add(cand)

        add_cand(self.creator_node)
        leaf_nodes = set()

        while cand_funcs:
            _, _, func = heapq.heappop(cand_funcs)
            inputs = func.inputs
            target_input_indexes = tuple(
                [i for i, x in enumerate(inputs) if x.requires_grad])
            outputs = [y() for y in func.outputs]  # access via weak ref
            out_grad = tuple([grads.pop(y) for y in outputs])
            if not target_input_indexes:
                continue

            in_data = tuple([x.data for x in inputs])
            out_grad_data = tuple(
                [None if g is None else g.data for g in out_grad])
            hooks = chainer.get_function_hooks()
            if func._n_local_function_hooks != 0:
                hooks = collections.OrderedDict(hooks)
                hooks.update(func.local_function_hooks)
            hooks = hooks.values()  # avoid six for performance

            cuda.get_device_from_array(*(in_data + out_grad_data)).use()
            for hook in hooks:
                hook.backward_preprocess(func, in_data, out_grad_data)

            # Collect the current input gradients.
            target_inputs = [inputs[i] for i in target_input_indexes]
            # Keep the order for portability, rather than
            # in_grad = {x: grads.get_as_list(x) for x in set(target_inputs)}
            in_grad = collections.OrderedDict()
            for x in target_inputs:
                if x not in in_grad:
                    in_grad[x] = grads.get_as_list(x)

            _backprop_utils.backprop_step(func, target_input_indexes, out_grad,
                                          in_grad)

            for hook in hooks:
                hook.backward_postprocess(func, in_data, out_grad_data)

            if is_debug:
                # each grad is a list of variables
                # iter_gxs expands it as a sequence of variables.
                def iter_gxs(gxs):
                    for gx in gxs:
                        for gx_elem in gx:
                            yield gx_elem

                for gx in iter_gxs(in_grad.values()):
                    gx_data = gx.data
                    if gx_data.dtype.kind == 'f':
                        cuda.get_device_from_array(gx_data).use()
                        if cuda.get_array_module(gx_data).isnan(gx_data).any():
                            raise RuntimeError(
                                'NaN is detected on backward computation of '
                                '{}'.format(func.label))

            for y, gy in six.moves.zip(outputs, out_grad):
                if y is not None and y is not self.node:
                    y_var = y.get_variable_or_none()
                    if y_var is not None:
                        y_var._grad_var = gy if retain_grad else None

            for x, gx in in_grad.items():
                if not gx:  # gradient == None
                    continue

                for gx_elem in gx:
                    _check_grad_type(func, x, gx_elem.data)

                if x.creator_node is None:  # leaf
                    leaf_nodes.add(x)
                else:
                    add_cand(x.creator_node)

            del in_grad  # to reduce memory usage
            if initial_device is not None:
                initial_device.use()

        for x in leaf_nodes:
            x_var = x.get_variable_or_none()
            gx = grads.pop(x)
            if x_var is not None:
                x_var._grad_var = gx
                x_var._loss_scale = loss_scale
        grads.assert_no_grads()
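
_backward_main() is a private helper; it is normally reached through the public Variable.backward() call. A hedged sketch of that path follows, where the toy graph and the expected values are illustrative assumptions:

import numpy
import chainer
import chainer.functions as F

# Toy data (illustrative only).
x = chainer.Variable(numpy.array([1., 2.], dtype=numpy.float32))
h = x * 3.0
loss = F.sum(h)

# backward() fills loss.grad with ones and then processes creator nodes in
# decreasing rank order, which is what _backward_main implements above.
loss.backward(retain_grad=True)
print(x.grad)  # [3. 3.]
print(h.grad)  # [1. 1.], kept only because retain_grad=True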
Example 3
    def _backward_main(self, retain_grad, loss_scale):
        self._node._check_old_style_gradient()
        if self.creator_node is None:
            return

        # fix py2 memory leak
        OrderedDict = chainer.utils._collections.OrderedDict

        cand_funcs = []
        seen_set = set()
        grads = _backprop_utils.GradTable(load_if_new=True)

        # Initialize the error with ones if this is a loss variable
        if self.array.size == 1 and self._grad_var is None:
            if self.array.ndim != 0:
                warnings.warn(
                    'Treating a variable with only one element as a scalar'
                    ' in Variable.backward is deprecated. A scalar variable'
                    ' must be a 0-dimensional array. Apply'
                    ' chainer.functions.squeeze to obtain a scalar variable.'
                    ' If the size of this variable accidentally becomes one,'
                    ' set zero to grad.', DeprecationWarning)
            with cuda.get_device_from_array(self.array) as device:
                if device is cuda.DummyDevice:
                    self.grad = numpy.ones_like(self.array)
                else:
                    self.grad = cuda.cupy.ones_like(self.array)
            if loss_scale is not None:
                self.grad *= loss_scale
        grads[self._node] = self._grad_var

        def add_cand(cand):
            if cand not in seen_set:
                # Negate since heapq is min-heap
                heapq.heappush(cand_funcs, (-cand.rank, len(seen_set), cand))
                seen_set.add(cand)

        add_cand(self.creator_node)
        leaf_nodes = set()

        while cand_funcs:
            _, _, func = heapq.heappop(cand_funcs)
            inputs = func.inputs
            target_input_indexes = tuple(
                [i for i, x in enumerate(inputs) if x.requires_grad])
            outputs = [y() for y in func.outputs]  # access via weak ref
            out_grad = tuple([grads.pop(y) for y in outputs])
            if not target_input_indexes:
                continue

            in_data = tuple([x.data for x in inputs])
            out_grad_array = tuple(
                [None if g is None else g.array for g in out_grad])
            hooks = chainer.get_function_hooks()
            if func._n_local_function_hooks != 0:
                hooks = collections.OrderedDict(hooks)
                hooks.update(func.local_function_hooks)
            hooks = hooks.values()  # avoid six for performance

            with cuda.get_device_from_array(*(in_data + out_grad_array)):
                for hook in hooks:
                    hook.backward_preprocess(func, in_data, out_grad_array)

                # Collect the current input gradients.
                target_inputs = [inputs[i] for i in target_input_indexes]
                # Keep the order for portability, rather than
                # in_grad = {x: grads.get_as_list(x)
                #            for x in set(target_inputs)}
                in_grad = OrderedDict()
                for x in target_inputs:
                    if x not in in_grad:
                        in_grad[x] = grads.get_as_list(x)
                        # to reduce memory usage
                        x._set_grad_var_if_available(None)

                _backprop_utils.backprop_step(func, target_input_indexes,
                                              out_grad, in_grad)

                for hook in hooks:
                    hook.backward_postprocess(func, in_data, out_grad_array)

            for y, gy in six.moves.zip(outputs, out_grad):
                if y is not None and y is not self.node:
                    y._set_grad_var_if_available(gy if retain_grad else None)
            del gy, out_grad  # to reduce memory usage

            for x, gx in in_grad.items():
                if not gx:  # gradient == None
                    continue

                for gx_elem in gx:
                    _check_grad_type(func, x, True, gx_elem, True)
                del gx_elem  # to reduce memory usage

                if x.creator_node is None:  # leaf
                    leaf_nodes.add(x)
                else:
                    add_cand(x.creator_node)
            del gx, in_grad  # to reduce memory usage

        for x in leaf_nodes:
            x_var = x.get_variable_or_none()
            gx = grads.pop(x)
            if x_var is not None:
                x_var._grad_var = gx
                x_var._loss_scale = loss_scale
        grads.assert_no_grads()
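
A hedged sketch of the loss_scale path handled by the version above, again driven through the public Variable.backward(); the factor 1024 and the float16 toy data are illustrative assumptions:

import numpy
import chainer
import chainer.functions as F

# Toy data (illustrative only).
x = chainer.Variable(numpy.array([1., 2.], dtype=numpy.float16))
loss = F.sum(x * x)

# The initial ones gradient is multiplied by loss_scale before backprop, so
# the factor propagates to every gradient in the graph.
loss.backward(loss_scale=1024.0)
print(x.grad)  # scaled gradients: 1024 * [2. 4.]

# Unscaling (division by loss_scale) happens later, just before parameters
# are updated, as described in the docstring of grad() above.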
Example 4
def _backprop_to_all(outputs, retain_grad, loss_scale):
    """Backprop to all input variables

    Args:
        outputs (list of tuple): each tuple is (y_node, y_grad_var).
            y_grad_var should not be None.
        retain_grad (bool): see docstring of Variable.backward
        loss_scale (float): see docstring of Variable.backward

    """
    OrderedDict = chainer.utils._collections.OrderedDict  # fix py2 memory leak

    cand_funcs = []
    seen_set = set()

    def add_cand(cand):
        if cand not in seen_set:
            # Negate since heapq is min-heap
            heapq.heappush(cand_funcs, (-cand.rank, len(seen_set), cand))
            seen_set.add(cand)

    grads = _backprop_utils.GradTable(accumulate_grad_inputs=True)

    leaf_nodes = set()

    for y, gy in outputs:
        grads.accumulate(y, gy)

        func = y.creator_node
        if func is None:  # leaf
            leaf_nodes.add(y)
        else:
            add_cand(func)

    # Unbind the loop variable to silence flake8 F812 on Python 2.
    y = None
    del y

    is_debug = chainer.is_debug()
    base_hooks = chainer.get_function_hooks().values()
    while cand_funcs:
        _, _, func = heapq.heappop(cand_funcs)
        inputs = func.inputs
        target_input_indexes = tuple([
            i for i, x in enumerate(inputs) if x.requires_grad
        ])
        outputs = [y() for y in func.outputs]  # access via weak ref
        out_grad = tuple([grads.pop(y)
                          if y is not None and y.creator_node is not None
                          else None
                          for y in outputs])
        if not target_input_indexes:
            continue

        in_data = [x.data for x in inputs]
        out_grad_array = [None if g is None else g.raw_array for g in out_grad]
        if func._n_local_function_hooks != 0:
            local_hooks = collections.OrderedDict(chainer.get_function_hooks())
            local_hooks.update(func.local_function_hooks)
            hooks = local_hooks.values()  # avoid six for performance
        else:
            hooks = base_hooks

        with chainer.using_device(
                backend.get_device_from_array(*(in_data + out_grad_array))):
            for hook in hooks:
                hook.backward_preprocess(
                    func, tuple(in_data), tuple(out_grad_array))

            # Collect the current input gradients.
            target_inputs = [inputs[i] for i in target_input_indexes]
            # Keep the order for portability, rather than
            # in_grad = {x: grads.get_as_list(x)
            #            for x in set(target_inputs)}
            in_grad = OrderedDict()
            for x in target_inputs:
                if x not in in_grad:
                    in_grad[x] = grads.get_as_list(x)

            _backprop_utils.backprop_step(
                func, target_input_indexes, out_grad, in_grad, is_debug)

            for hook in hooks:
                hook.backward_postprocess(
                    func, tuple(in_data), tuple(out_grad_array))

        if retain_grad:
            # The gradients of the outputs of `func` are final. Store them if
            # retain_grad=True.
            for y, gy in six.moves.zip(outputs, out_grad):
                if y is not None:
                    y._set_grad_var_if_available(gy)
            del gy  # to reduce memory usage
        del out_grad  # to reduce memory usage

        for x, gx in in_grad.items():
            if not gx:  # gradient == None
                continue

            for gx_elem in gx:
                if gx_elem is not None:
                    chainer.variable._check_grad_type(
                        func, x, True, gx_elem.raw_array)
            del gx_elem  # to reduce memory usage

            if x.creator_node is None:  # leaf
                leaf_nodes.add(x)
            else:
                add_cand(x.creator_node)
        del gx, in_grad  # to reduce memory usage

    for x in leaf_nodes:
        x_var = x.get_variable_or_none()
        gx = grads.pop(x)
        if x_var is not None:
            x_var._set_grad_var_without_check(gx)
            x_var._loss_scale = loss_scale
    grads.assert_no_grads()
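
A hedged sketch of driving _backprop_to_all() directly, assuming the call is made from the module that defines it. Pairing each output node with an explicit ones gradient mirrors what Variable.backward() sets up, but the toy graph and the direct call are illustrative assumptions:

import numpy
import chainer

# Toy data (illustrative only).
x = chainer.Variable(numpy.array([1., 2.], dtype=numpy.float32))
y = x * x

# Each entry pairs a variable node with a non-None gradient variable, as
# required by the docstring above.
gy = chainer.Variable(numpy.ones_like(y.array), requires_grad=False)
_backprop_to_all([(y.node, gy)], retain_grad=False, loss_scale=None)
print(x.grad)  # [2. 4.]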