def __enter__(self):
    function_hooks = chainer.get_function_hooks()
    if self.name in function_hooks:
        raise KeyError("hook %s already exists" % self.name)
    function_hooks[self.name] = self
    return self
def __enter__(self):
    function_hooks = chainer.get_function_hooks()
    if self.name in function_hooks:
        raise KeyError('hook %s already exists' % self.name)
    function_hooks[self.name] = self
    self.added(None)
    return self
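
# --- Illustrative usage (not part of the original source) ---
# A minimal sketch of how the __enter__/__exit__ pair above is exercised: a
# FunctionHook such as the built-in TimerHook is used as a context manager,
# which registers it in chainer.get_function_hooks() and removes it on exit.
# Shapes and values are arbitrary.
import numpy as np
import chainer
import chainer.functions as F

x = chainer.Variable(np.random.rand(3, 4).astype(np.float32))
with chainer.function_hooks.TimerHook() as hook:
    y = F.relu(x)                 # forward_preprocess/postprocess fire here
    y.grad = np.ones_like(y.data)
    y.backward()                  # backward_preprocess/postprocess fire here
print(hook.total_time())          # the hook is already unregistered here
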
def _backward_main(self, retain_grad, loss_scale): self._node._check_old_style_gradient() if self.creator_node is None: return initial_device = None if cuda.available and isinstance(self.data, cuda.ndarray): try: initial_device = cuda.Device() except cuda.cupy.cuda.runtime.CUDARuntimeError as e: if e.status != 38: # cudaErrorNoDevice raise is_debug = chainer.is_debug() cand_funcs = [] seen_set = set() grads = _backprop_utils.GradTable(load_if_new=True) # Initialize error by 1, if this is a loss variable if self.data.size == 1 and self._grad_var is None: if self.data.ndim != 0: warnings.warn( 'Treating a scalar as a variable with only one element' ' in Variable.backward is deprecated. A scalar variable' ' must be a 0-dimensional array. Apply' ' chainer.functions.squeeze to obtain a scalar variable.' ' If the size of this variable accidentally becomes one,' ' set zero to grad.', DeprecationWarning) with cuda.get_device_from_array(self.data) as device: if device is cuda.DummyDevice: self.grad = numpy.ones_like(self.data) else: self.grad = cuda.cupy.ones_like(self.data) if loss_scale is not None: self.grad *= loss_scale grads[self._node] = self._grad_var def add_cand(cand): if cand not in seen_set: # Negate since heapq is min-heap heapq.heappush(cand_funcs, (-cand.rank, len(seen_set), cand)) seen_set.add(cand) add_cand(self.creator_node) leaf_nodes = set() while cand_funcs: _, _, func = heapq.heappop(cand_funcs) inputs = func.inputs target_input_indexes = tuple([ i for i, x in enumerate(inputs) if x.requires_grad ]) outputs = [y() for y in func.outputs] # access via weak ref out_grad = tuple([grads.pop(y) for y in outputs]) if not target_input_indexes: continue in_data = tuple([x.data for x in inputs]) out_grad_data = tuple( [None if g is None else g.data for g in out_grad]) hooks = chainer.get_function_hooks() if func._n_local_function_hooks != 0: hooks = collections.OrderedDict(hooks) hooks.update(func.local_function_hooks) hooks = hooks.values() # avoid six for performance cuda.get_device_from_array(*(in_data + out_grad_data)).use() for hook in hooks: hook.backward_preprocess(func, in_data, out_grad_data) # Collect the current input gradients. target_inputs = [inputs[i] for i in target_input_indexes] # Keep the order for the portability, rather than # in_grad = {x: grads.get_as_list(x) for x in set(target_inputs)} in_grad = collections.OrderedDict() for x in target_inputs: if x not in in_grad: in_grad[x] = grads.get_as_list(x) _backprop_utils.backprop_step( func, target_input_indexes, out_grad, in_grad) for hook in hooks: hook.backward_postprocess(func, in_data, out_grad_data) if is_debug: # each grad is a list of variables # iter_gxs expands it as a sequence of variables. 
def iter_gxs(gxs): for gx in gxs: for gx_elem in gx: yield gx_elem for gx in iter_gxs(in_grad.values()): gx_data = gx.data if gx_data.dtype.kind == 'f': cuda.get_device_from_array(gx_data).use() if cuda.get_array_module(gx_data).isnan(gx_data).any(): raise RuntimeError( 'NaN is detected on backward computation of ' '{}'.format(func.label)) for y, gy in six.moves.zip(outputs, out_grad): if y is not None and y is not self.node: y_var = y.get_variable_or_none() if y_var is not None: y_var._grad_var = gy if retain_grad else None for x, gx in in_grad.items(): if not gx: # gradient == None continue for gx_elem in gx: _check_grad_type(func, x, gx_elem.data) if x.creator_node is None: # leaf leaf_nodes.add(x) else: add_cand(x.creator_node) del in_grad # to reduce memory usage if initial_device is not None: initial_device.use() for x in leaf_nodes: x_var = x.get_variable_or_none() gx = grads.pop(x) if x_var is not None: x_var._grad_var = gx x_var._loss_scale = loss_scale grads.assert_no_grads()
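
# --- Illustrative usage (not part of the original source) ---
# Sketch of the is_debug()/NaN checks performed in the backward loop above:
# enabling the 'debug' config (or chainer.set_debug(True)) makes forward and
# backward computations raise RuntimeError when a NaN appears. Feeding NaN
# into F.exp below is just one way to trigger the check.
import numpy as np
import chainer
import chainer.functions as F

x = chainer.Variable(np.array([1.0, float('nan')], dtype=np.float32))
with chainer.using_config('debug', True):
    try:
        y = F.exp(x)              # output contains NaN -> RuntimeError
    except RuntimeError as err:
        print(err)
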
def __call__(self, *inputs):
    """Applies forward propagation with chaining backward references.

    Basic behavior is expressed in documentation of :class:`Function`
    class.

    .. note::

       If the :data:`~Variable.data` attribute of input variables exist on
       GPU device, then, before it calls :meth:`forward` method, the
       appropriate device is selected, so in most cases implementers do not
       need to take care of device selection.

    Args:
        inputs: Tuple of input :class:`Variable` objects. The volatile
            flags of all input variables must agree.

    Returns:
        One :class:`Variable` object or a tuple of multiple
        :class:`Variable` objects.

    """
    in_data = tuple([x.data for x in inputs])
    if chainer.is_debug():
        self._stack = traceback.extract_stack()

    if self.type_check_enable:
        self._check_data_type_forward(in_data)

    hooks = collections.OrderedDict(chainer.get_function_hooks())
    hooks.update(self.local_function_hooks)
    for hook in six.itervalues(hooks):
        hook.forward_preprocess(self, in_data)

    # Forward prop
    with cuda.get_device(*in_data):
        outputs = self.forward(in_data)
        assert type(outputs) == tuple

    for hook in six.itervalues(hooks):
        hook.forward_postprocess(self, in_data)

    if chainer.is_debug():
        if any(out.dtype.kind == "f" and
               cuda.get_array_module(out).isnan(out).any()
               for out in outputs):
            msg = "NaN is detected on forward computation"
            raise RuntimeError(msg)

    out_v = flag.aggregate_flags([x.volatile for x in inputs])
    ret = tuple([variable.Variable(y, volatile=out_v) for y in outputs])

    if out_v != "on":
        # Topological ordering
        self.rank = max([x.rank for x in inputs]) if inputs else 0
        # Backward edges
        for y in ret:
            y.set_creator(self)
        self.inputs = inputs
        # Forward edges (must be weak references)
        self.outputs = tuple([weakref.ref(y) for y in ret])

    if len(ret) == 1:
        return ret[0]
    else:
        return ret
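
# --- Illustrative usage (not part of the original source) ---
# Hedged sketch of the calling convention that __call__ above implements: an
# old-style chainer.Function whose forward/backward receive and return tuples
# of raw arrays. The ScaleByConstant class is defined here only for
# illustration; it is not part of Chainer.
import numpy as np
import chainer

class ScaleByConstant(chainer.Function):

    def __init__(self, c):
        self.c = c

    def forward(self, inputs):
        x, = inputs
        return self.c * x,

    def backward(self, inputs, grad_outputs):
        gy, = grad_outputs
        return self.c * gy,

x = chainer.Variable(np.arange(3, dtype=np.float32))
y = ScaleByConstant(2.0)(x)       # dispatched through Function.__call__
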
def __exit__(self, exc_type, exc_value, traceback):
    del chainer.get_function_hooks()[self.name]
def backward(self, retain_grad=False): """Runs error backpropagation (a.k.a. backprop) from this variable. On backprop, :meth:`Function.backward` is called on each :class:`Function` object appearing in the backward graph starting from this variable. The backward graph is represented by backward references from variables to their creators, and from functions to their inputs. The backprop stops at all root variables. Some functions set ``None`` as gradients of some inputs, where further backprop does not take place at such input variables. This method uses :data:`grad` as the initial error array. User can manually set a gradient array before calling this method. If :data:`data` contains only one element (i.e., it is scalar) and :data:`grad` is None, then this method automatically complements 1.0 as the initial error. This is useful on starting backprop from some scalar loss value. Args: retain_grad (bool): If True, the gradient arrays of all intermediate variables are kept. Otherwise, :data:`grad` of the intermediate variables are set to ``None`` on appropriate timing, which may reduce the maximum memory consumption. In most cases of training some model, the purpose of backprop is to compute gradients of parameters, not of variables, so it is recommended to set this flag False. """ if self.creator is None: return cand_funcs = [] seen_set = set() seen_vars = set() need_copy = set() # Initialize error by 1, if this is a loss variable if self.data.size == 1 and self.grad is None: with cuda.get_device(self.data) as device: if device is cuda.DummyDevice: self.grad = numpy.ones_like(self.data) else: self.grad = cuda.cupy.ones_like(self.data) def add_cand(cand): if cand not in seen_set: # Negate since heapq is min-heap heapq.heappush(cand_funcs, (-cand.rank, len(seen_set), cand)) seen_set.add(cand) if self.creator is not None: add_cand(self.creator) while cand_funcs: _, _, func = heapq.heappop(cand_funcs) outputs = tuple(y() for y in func.outputs) # access via weak ref in_data = tuple(x.data for x in func.inputs) out_grad = tuple(None if y is None else y.grad for y in outputs) hooks = collections.OrderedDict(chainer.get_function_hooks()) hooks.update(func.local_function_hooks) for hook in six.itervalues(hooks): hook.backward_preprocess(func, in_data, out_grad) with cuda.get_device(*(in_data + out_grad)): gxs = func.backward(in_data, out_grad) assert len(gxs) == len(in_data) for hook in six.itervalues(hooks): hook.backward_postprocess(func, in_data, out_grad) if chainer.is_debug(): if any(gx is not None and cuda.get_array_module(gx).isnan(gx).any() for gx in gxs): msg = 'NaN is detected on backward computation' raise RuntimeError(msg) if not retain_grad: for y in outputs: if y is not None and y is not self: y.grad = None for x, gx in zip(func.inputs, gxs): if gx is None: continue _check_grad_type(func, x, gx) # Accumulate the gradient to x. It is a bit tricky to handle # branches and parameter gradient accumulation correctly. with cuda.get_device(gx): id_x = id(x) if x.creator is None: # leaf if x._grad is None: x.grad = gx need_copy.add(id_x) elif id_x in need_copy: x.grad = x.grad + gx # copy need_copy.remove(id_x) else: x._grad += gx else: # not a leaf add_cand(x.creator) if id_x not in seen_vars: # 1st visit x.grad = gx seen_vars.add(id_x) need_copy.add(id_x) elif id_x in need_copy: # 2nd visit x._grad = gx + x._grad # copied need_copy.remove(id_x) else: # 3rd or later visit x._grad += gx del gxs # to reduce memory usage
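
# --- Illustrative usage (not part of the original source) ---
# Sketch of the scalar-loss seeding described in the docstring above: when
# the variable holds a single element and grad is None, backward() fills the
# initial error with ones before traversing the graph.
import numpy as np
import chainer
import chainer.functions as F

x = chainer.Variable(np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32))
loss = F.sum(x * x)               # single-element loss variable
loss.backward()                   # initial grad is seeded with ones
print(x.grad)                     # -> 2 * x
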
def __call__(self, *inputs): """Applies forward propagation with chaining backward references. Basic behavior is expressed in documentation of :class:`Function` class. .. note:: If the :data:`~Variable.data` attribute of input variables exist on GPU device, then, before it calls :meth:`forward` method, the appropriate device is selected, so in most cases implementers do not need to take care of device selection. Args: inputs: Tuple of input :class:`Variable`, :class:`numpy.ndarray` or :class:`cupy.ndarray` objects. The volatile flags of all input variables must agree. If the input is an :class:`numpy.ndarray` or a :class:`cupy.ndarray`, it is automatically wrapped with :class:`Variable`. Returns: One :class:`Variable` object or a tuple of multiple :class:`Variable` objects. """ inputs = [ x if isinstance(x, chainer.Variable) else chainer.Variable( x, volatile=flag.AUTO) for x in inputs ] in_data = tuple([x.data for x in inputs]) if chainer.is_debug(): self._stack = traceback.extract_stack() if self.type_check_enable: self._check_data_type_forward(in_data) hooks = collections.OrderedDict(chainer.get_function_hooks()) hooks.update(self.local_function_hooks) for hook in six.itervalues(hooks): hook.forward_preprocess(self, in_data) # Forward prop with cuda.get_device(*in_data): outputs = self.forward(in_data) assert type(outputs) == tuple for hook in six.itervalues(hooks): hook.forward_postprocess(self, in_data) if chainer.is_debug(): if any(out.dtype.kind == 'f' and cuda.get_array_module(out).isnan(out).any() for out in outputs): msg = 'NaN is detected on forward computation' raise RuntimeError(msg) out_v = flag.aggregate_flags([x.volatile for x in inputs]) ret = tuple([variable.Variable(y, volatile=out_v) for y in outputs]) if out_v != 'on': # Topological ordering self.rank = max([x.rank for x in inputs]) if inputs else 0 # Backward edges for y in ret: y.set_creator(self) self.inputs = inputs # Forward edges (must be weak references) self.outputs = tuple([weakref.ref(y) for y in ret]) if len(ret) == 1: return ret[0] else: return ret
def _backprop(outputs, inputs, grad_required, retain_grad, grads, loss_scale): candidate_funcs, push_candidate, pop_candidate = _get_ordered_func_heap() for y in outputs: creator = y.creator_node if creator is not None: push_candidate(creator) input_nodes = set(x.node for x in inputs) ret_dict = {} while candidate_funcs: func = pop_candidate() # Collect the gradients w.r.t. the outputs ys = [y() for y in func.outputs] # access via weak ref gys = tuple([grads.pop(y) for y in ys]) for node, gy in six.moves.zip(ys, gys): if node is not None: if node in input_nodes: ret_dict[node] = gy if retain_grad: y = node.get_variable_or_none() if y is not None: y.grad_var = gy y._loss_scale = loss_scale # Collect the gradients w.r.t. the inputs input_indexes = [] x_grads = collections.OrderedDict() for i, x in enumerate(func.inputs): if x not in grad_required: continue input_indexes.append(i) if x not in x_grads: x_grads[x] = grads.get_as_list(x) if not input_indexes: continue input_indexes = tuple(input_indexes) # Do backward # Call pre-backward hooks hooks = chainer.get_function_hooks() if func._n_local_function_hooks != 0: hooks = collections.OrderedDict(hooks) hooks.update(func.local_function_hooks) hooks = hooks.values() # avoid six for performance in_data = tuple([x.data for x in func.inputs]) out_grad_data = tuple( [None if g is None else g.data for g in gys]) with cuda.get_device_from_array(*in_data): for hook in hooks: hook.backward_preprocess(func, in_data, out_grad_data) _backprop_utils.backprop_step(func, input_indexes, gys, x_grads) # Call post-backward hooks for hook in hooks: hook.backward_postprocess(func, in_data, out_grad_data) # Update grads for node, g in x_grads.items(): if not g: # gradient == None continue creator = node.creator_node if creator is not None: push_candidate(creator) for x in input_nodes: if x not in ret_dict: ret_dict[x] = grads.pop(x) return ret_dict
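
# --- Illustrative usage (not part of the original source) ---
# Sketch of the public entry point that drives a _backprop routine like the
# one above: chainer.grad() returns the gradients of `outputs` with respect
# to `inputs` as new variables instead of writing them into .grad.
import numpy as np
import chainer
import chainer.functions as F

x = chainer.Variable(np.array([3.0], dtype=np.float32))
y = F.sum(x ** 2)
gx, = chainer.grad([y], [x])
print(gx.data)                    # -> [6.]
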
def __exit__(self, *_):
    chainer.get_function_hooks()[self.name].deleted(None)
    del chainer.get_function_hooks()[self.name]
def apply(self, inputs): """Computes output variables and grows the computational graph. Basic behavior is expressed in the documentation of :class:`FunctionNode`. .. note:: If the :data:`~Variable.data` attribute of input variables exist on a GPU device, that device is made current before calling :meth:`forward`, so implementors do not need to take care of device selection in most cases. Args: inputs: Tuple of input variables. Each element can be either :class:`~chainer.Variable`, :class:`numpy.ndarray`, or :class:`cupy.ndarray`. If the element is an ndarray, it is automatically wrapped with :class:`~chainer.Variable`. Returns: A tuple of output :class:`~chainer.Variable` objects. """ chainerx_in_data = None chainerx_device = None is_chainerx, in_data = _extract_apply_in_data(inputs) if is_chainerx: # Try ChainerX C++ implementation. # If it's supported, the output arrays are wrapped with Variables # and returned. # If not supported, FunctionNode.forward_chainerx should return # Fallback. # In that case the input arrays are converted to numpy.ndarray # or cupy.ndarray (depending on the ChainerX backend) and # forward computation falls back to the conventional # FunctionNode.forward() implementaion. outputs = self.forward_chainerx(in_data) if outputs is not chainer.Fallback: # Supported. Wrap with variables and return assert isinstance(outputs, tuple) return tuple([ variable.Variable( y, requires_grad=y.is_backprop_required()) for y in outputs]) # Fall back to FunctionNode.forward() chainerx_in_data, in_data, chainerx_device = ( self._chainerx_apply_fallback_preprocess(in_data, inputs)) self._is_chainex_fallback_mode = True self.chainerx_device = chainerx_device utils._check_arrays_forward_compatible(in_data, self.label) is_debug = chainer.is_debug() if is_debug: # Keep stack trace for debug self.stack = traceback.extract_stack() if configuration.config.type_check: self._check_data_type_forward(in_data) hooks = chainer.get_function_hooks() if self._n_local_function_hooks > 0: hooks = collections.OrderedDict(hooks) hooks.update(self.local_function_hooks) hooks = hooks.values() # avoid six for performance for hook in hooks: hook.forward_preprocess(self, in_data) # Forward propagation with cuda.get_device_from_array(*in_data): self._input_indexes_to_retain = None self._output_indexes_to_retain = None if chainer.config.schedule_func is not None: outputs = static_forward_optimizations(self, in_data) elif self._is_chainex_fallback_mode: # In ChainerX fallback, __class__ is temporarily replaced with # the fabricated one with automatic attirbute fallback. with _chainerx_attribute_fallback(self, chainerx_device): outputs = self.forward(in_data) else: # In normal case, simply run the forward method. 
outputs = self.forward(in_data) # Check for output array types if not isinstance(outputs, tuple): raise TypeError( 'forward output must be a tuple ({})\n' 'Actual: {}'.format(self.label, type(outputs))) if not chainer.is_arrays_compatible(outputs): raise TypeError( 'incompatible array types are mixed in the forward output ' '({}).\n' 'Actual: {}'.format( self.label, ', '.join(str(type(x)) for x in outputs))) for hook in hooks: hook.forward_postprocess(self, in_data) # NaN check of output values if is_debug: if any(chainer.backend._contains_nan(out) for out in outputs): msg = ('NaN is detected on forward computation of ' '{}'.format(self.label)) raise RuntimeError(msg) self._output_count = len(outputs) if self._is_chainex_fallback_mode: ret = self._chainerx_apply_fallback_postprocess( chainerx_in_data, inputs, outputs) else: input_vars = [chainer.as_variable(x) for x in inputs] requires_grad = any([x.requires_grad for x in input_vars]) ret = tuple( [variable.Variable(y, requires_grad=requires_grad) for y in outputs]) if configuration.config.enable_backprop: # Topological ordering self.rank = max( [x.rank for x in input_vars]) if input_vars else 0 # Add backward edges for y in ret: y.creator_node = self self.inputs = tuple([x.node for x in input_vars]) # Add forward edges (must be weak references) self.outputs = tuple([weakref.ref(y.node) for y in ret]) if self._input_indexes_to_retain is not None: for index in self._input_indexes_to_retain: input_vars[index].retain_data() if self._output_indexes_to_retain is not None: retained_data = [] for index in self._output_indexes_to_retain: ret[index].retain_data() retained_data.append(outputs[index]) self._retained_output_data = tuple(retained_data) self.lazy_grad_sum = configuration.config.lazy_grad_sum return ret
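
# --- Illustrative usage (not part of the original source) ---
# Hedged sketch of a new-style FunctionNode wired into the graph by apply()
# above. retain_inputs() is what populates _input_indexes_to_retain so that
# apply() calls retain_data() on the corresponding input variables. The
# Square class is defined here only for illustration.
import numpy as np
import chainer
from chainer import FunctionNode

class Square(FunctionNode):

    def forward(self, inputs):
        x, = inputs
        self.retain_inputs((0,))  # recorded in _input_indexes_to_retain
        return x * x,

    def backward(self, target_input_indexes, grad_outputs):
        x, = self.get_retained_inputs()
        gy, = grad_outputs
        return 2.0 * x * gy,

x = chainer.Variable(np.arange(3, dtype=np.float32))
y, = Square().apply((x,))
y.grad = np.ones_like(y.data)
y.backward()
print(x.grad)                     # -> [0., 2., 4.]
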
def _backprop(outputs, inputs, grad_required, retain_grad, grads, loss_scale): candidate_funcs, push_candidate, pop_candidate = _get_ordered_func_heap() for y in outputs: creator = y.creator_node if creator is not None: push_candidate(creator) input_nodes = set(x.node for x in inputs) while candidate_funcs: func = pop_candidate() # Collect the gradients w.r.t. the outputs gys = [] for y_ref in func.outputs: y = y_ref() if y is None: # output is not a part of the selected subgraph and has already # been released. gys.append(None) continue gys.append(grads.get(y, None)) gys = tuple(gys) # Collect the gradients w.r.t. the inputs # # Note (Tokui): when the same variable is passed multiple times as # inputs in the same function (e.g. an expression like f(x, x)), the # current implementation passes None as the current gradient w.r.t. # such an input except for the first one (i.e., it builds gxs like # (gx, None) where gx is the current gradient w.r.t. x). gxs = [] input_indexes = [] selected_inputs = set() for i, x in enumerate(func.inputs): if x not in grad_required: continue input_indexes.append(i) if x in selected_inputs: gxs.append(None) else: gxs.append(grads.get(x, None)) selected_inputs.add(x) gxs = tuple(gxs) input_indexes = tuple(input_indexes) if not input_indexes: continue # Do backward gys = tuple([ gy if not isinstance(gy, tuple) else chainer.functions.add(*gy) for gy in gys ]) # Call pre-backward hooks hooks = chainer.get_function_hooks() if func._n_local_function_hooks != 0: hooks = collections.OrderedDict(hooks) hooks.update(func.local_function_hooks) hooks = hooks.values() # avoid six for performance in_data = tuple([x.data for x in func.inputs]) out_grad_data = tuple([None if g is None else g.data for g in gys]) cuda.get_device_from_array(*in_data).use() for hook in hooks: hook.backward_preprocess(func, in_data, out_grad_data) new_gxs = func.backward_accumulate(input_indexes, gys, gxs) # Call post-backward hooks for hook in hooks: hook.backward_postprocess(func, in_data, out_grad_data) # Delete output gradients that are not required to return for y_ref in func.outputs: y = y_ref() if y is not None and y in grads and y not in input_nodes: del grads[y] # Update grads selected_inputs = set() for i, g in zip(input_indexes, new_gxs): if g is None: continue node = func.inputs[i] if node in selected_inputs: # Accumulate the duplicated gradients here cur_gx = grads.get(node, None) if cur_gx is not None: if func.lazy_grad_sum: if x.creator is None: g = _backprop_utils.add(g, cur_gx) else: g = _backprop_utils.concat_variable(g, cur_gx) # cur_gx can't be tuple, the lazy_grad_sum can't # be enabled in its sibling node. else: g = g + cur_gx else: selected_inputs.add(node) grads[node] = g if retain_grad: v = node.get_variable_or_none() if v is not None: v.grad_var = g v._loss_scale = loss_scale creator = node.creator_node if creator is not None: push_candidate(creator)
def apply(self, inputs): """Computes output variables and grows the computational graph. Basic behavior is expressed in the documentation of :class:`FunctionNode`. .. note:: If the :data:`~Variable.data` attribute of input variables exist on a GPU device, that device is made current before calling :meth:`forward`, so implementors do not need to take care of device selection in most cases. Args: inputs: Tuple of input variables. Each element can be either :class:`~chainer.Variable` or :ref:`ndarray`. If the element is an ndarray, it is automatically wrapped with :class:`~chainer.Variable`. Returns: A tuple of output :class:`~chainer.Variable` objects. """ input_vars = [chainer.as_variable(x) for x in inputs] in_data = tuple([x.data for x in input_vars]) requires_grad = any([x.requires_grad for x in input_vars]) # Check for input array types if not chainer.is_arrays_compatible(in_data): raise TypeError( 'incompatible array types are mixed in the forward input ' '({}).\n' 'Actual: {}'.format(self.label, ', '.join(str(type(x)) for x in in_data))) is_debug = chainer.is_debug() if is_debug: # Keep stack trace for debug self.stack = traceback.extract_stack() if configuration.config.type_check: self._check_data_type_forward(in_data) hooks = chainer.get_function_hooks() if self._n_local_function_hooks > 0: hooks = collections.OrderedDict(hooks) hooks.update(self.local_function_hooks) hooks = hooks.values() # avoid six for performance for hook in hooks: hook.forward_preprocess(self, in_data) # Forward propagation with cuda.get_device_from_array(*in_data): self._input_indexes_to_retain = None self._output_indexes_to_retain = None if chainer.config.schedule_func is not None: outputs = static_forward_optimizations(self, in_data) else: outputs = self.forward(in_data) # Check for output array types if not isinstance(outputs, tuple): raise TypeError('forward output must be a tuple ({})\n' 'Actual: {}'.format(self.label, type(outputs))) if not chainer.is_arrays_compatible(outputs): raise TypeError( 'incompatible array types are mixed in the forward output ' '({}).\n' 'Actual: {}'.format(self.label, ', '.join(str(type(x)) for x in outputs))) for hook in hooks: hook.forward_postprocess(self, in_data) # NaN check of output values if is_debug: if any(chainer.backend._contains_nan(out) for out in outputs): msg = ('NaN is detected on forward computation of ' '{}'.format(self.label)) raise RuntimeError(msg) ret = tuple([ variable.Variable(y, requires_grad=requires_grad) for y in outputs ]) if configuration.config.enable_backprop: # Topological ordering self.rank = max([x.rank for x in input_vars]) if input_vars else 0 # Add backward edges for y in ret: y.creator_node = self self.inputs = tuple([x.node for x in input_vars]) # Add forward edges (must be weak references) self.outputs = tuple([weakref.ref(y.node) for y in ret]) if self._input_indexes_to_retain is not None: for index in self._input_indexes_to_retain: input_vars[index].retain_data() if self._output_indexes_to_retain is not None: retained_data = [] for index in self._output_indexes_to_retain: ret[index].retain_data() retained_data.append(outputs[index]) self._retained_output_data = tuple(retained_data) self.lazy_grad_sum = configuration.config.lazy_grad_sum return ret
def _backprop_to_all(outputs, retain_grad, loss_scale): """Backprop to all input variables Args: outputs (list of tuple): each tuple is (y_node, y_grad_var). y_grad_var should not be None. retain_grad (bool): see docstring of Variable.backward loss_scale (float): see docstring of Variable.backward """ OrderedDict = chainer.utils._collections.OrderedDict # fix py2 memory leak cand_funcs = [] seen_set = set() def add_cand(cand): if cand not in seen_set: # Negate since heapq is min-heap heapq.heappush(cand_funcs, (-cand.rank, len(seen_set), cand)) seen_set.add(cand) grads = _backprop_utils.GradTable(accumulate_grad_inputs=True) leaf_nodes = set() for y, gy in outputs: grads.accumulate(y, gy) func = y.creator_node if func is None: # leaf leaf_nodes.add(y) else: add_cand(func) # Fix F812 (Python 2) y = None del y is_debug = chainer.is_debug() base_hooks = chainer.get_function_hooks().values() while cand_funcs: _, _, func = heapq.heappop(cand_funcs) inputs = func.inputs target_input_indexes = tuple([ i for i, x in enumerate(inputs) if x.requires_grad ]) outputs = [y() for y in func.outputs] # access via weak ref out_grad = tuple([grads.pop(y) if y is not None and y.creator_node is not None else None for y in outputs]) if not target_input_indexes: continue in_data = [x.data for x in inputs] out_grad_array = [None if g is None else g.raw_array for g in out_grad] if func._n_local_function_hooks != 0: local_hooks = collections.OrderedDict(chainer.get_function_hooks()) local_hooks.update(func.local_function_hooks) hooks = local_hooks.values() # avoid six for performance else: hooks = base_hooks with chainer.using_device( backend.get_device_from_array(*(in_data + out_grad_array))): for hook in hooks: hook.backward_preprocess( func, tuple(in_data), tuple(out_grad_array)) # Collect the current input gradients. target_inputs = [inputs[i] for i in target_input_indexes] # Keep the order for the portability, rather than # in_grad = {x: grads.get_as_list(x) # for x in set(target_inputs)} in_grad = OrderedDict() for x in target_inputs: if x not in in_grad: in_grad[x] = grads.get_as_list(x) _backprop_utils.backprop_step( func, target_input_indexes, out_grad, in_grad, is_debug) for hook in hooks: hook.backward_postprocess( func, tuple(in_data), tuple(out_grad_array)) if retain_grad: # The gradients of the outputs of `func` are final. Store them if # retain_grad=True. for y, gy in six.moves.zip(outputs, out_grad): if y is not None: y._set_grad_var_if_available(gy) del gy # to reduce memory usage del out_grad # to reduce memory usage for x, gx in in_grad.items(): if not gx: # gradient == None continue for gx_elem in gx: if gx_elem is not None: chainer.variable._check_grad_type( func, x, True, gx_elem.raw_array) del gx_elem # to reduce memory usage if x.creator_node is None: # leaf leaf_nodes.add(x) else: add_cand(x.creator_node) del gx, in_grad # to reduce memory usage for x in leaf_nodes: x_var = x.get_variable_or_none() gx = grads.pop(x) if x_var is not None: x_var._set_grad_var_without_check(gx) x_var._loss_scale = loss_scale grads.assert_no_grads()
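
# --- Illustrative usage (not part of the original source) ---
# Sketch of the loss_scale argument threaded through _backprop_to_all above:
# the seed gradient is multiplied by loss_scale (useful for FP16 training),
# and _loss_scale is remembered on leaf variables so that optimizers can
# unscale the gradients later.
import numpy as np
import chainer
import chainer.functions as F

x = chainer.Variable(np.ones((2, 2), dtype=np.float32))
loss = F.sum(x * x)
loss.backward(loss_scale=128.0)
print(x.grad)                     # -> 2 * x * 128
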
def backward(self, retain_grad=False): """Runs error backpropagation (a.k.a. backprop) from this variable. On backprop, :meth:`FunctionNode.backward` is called on each :class:`FunctionNode` object appearing in the backward graph starting from this variable. The backward graph is represented by backward references from variable nodes to their creators, and from function nodes to their input variable nodes. The backprop stops at all root nodes. Some function nodes set ``None`` as gradients of some inputs, where further backprop does not take place at such inputs. This method uses :data:`grad` as the initial error array. User can manually set a gradient array before calling this method. If :data:`data` contains only one element (i.e., it is scalar) and :data:`grad` is ``None``, then this method automatically complements 1.0 as the initial error. This is useful on starting backprop from some scalar loss value. Note that this method does not support *differentiable backprop*. Use :func:`grad` to compute the gradient of gradients. Args: retain_grad (bool): If ``True``, the gradient arrays of all intermediate variables are kept. Otherwise, :data:`grad` of the intermediate variables are set to ``None`` on appropriate timing, which may reduce the maximum memory consumption. In most cases of training some models, the purpose of backprop is to compute gradients of parameters, not of all variables, and therefore it is recommended to set this flag ``False``. """ self._node._check_old_style_gradient() if self.creator_node is None: return initial_device = None if cuda.available and isinstance(self.data, cuda.cupy.ndarray): try: initial_device = cuda.Device() except cuda.cupy.cuda.runtime.CUDARuntimeError as e: if e.status != 38: # cudaErrorNoDevice raise is_debug = chainer.is_debug() cand_funcs = [] seen_set = set() grads = {} # Initialize error by 1, if this is a loss variable if self.data.size == 1 and self._grad_var is None: with cuda.get_device_from_array(self.data) as device: if device is cuda.DummyDevice: self.grad = numpy.ones_like(self.data) else: self.grad = cuda.cupy.ones_like(self.data) grads[self._node] = self._grad_var def add_cand(cand): if cand not in seen_set: # Negate since heapq is min-heap heapq.heappush(cand_funcs, (-cand.rank, len(seen_set), cand)) seen_set.add(cand) add_cand(self.creator_node) def get_grad(node): if node is None: return None if node in grads: return grads[node] return node.grad_var while cand_funcs: _, _, func = heapq.heappop(cand_funcs) inputs = func.inputs outputs = [y() for y in func.outputs] # access via weak ref in_data = tuple([x.data for x in inputs]) out_grad = tuple([get_grad(y) for y in outputs]) out_grad_data = tuple( [None if g is None else g.data for g in out_grad]) hooks = chainer.get_function_hooks() if func._n_local_function_hooks != 0: hooks = collections.OrderedDict(hooks) hooks.update(func.local_function_hooks) hooks = hooks.values() # avoid six for performance cuda.get_device_from_array(*in_data).use() for hook in hooks: hook.backward_preprocess(func, in_data, out_grad_data) # Collect the current input gradients. # # Note (Tokui): When the same variable is passed to multiple input # slots (e.g. an expression like ``f(x, x)``), it makes the # gradient accumulation complicated since the back-propagated # gradients w.r.t. the first and second argument should be # accumulated to the current gradient w.r.t. the same variable. 
# In this case, the current implementation passes the current # gradient only to the first occurrence of the variable in the # input tuple and passes ``None`` to the rest of the occurrences. # For example, when the input variables are ``(x, x)``, the # input gradient passed to the ``backward_accumulate`` method is # ``(gx, None)`` where ``gx`` is the current gradient of ``x``. # See also the docstring of ``FunctionNode.backward_accumulate``. target_input_indexes = [ i for i, x in enumerate(inputs) if x.requires_grad ] target_inputs = [inputs[i] for i in target_input_indexes] in_grad = [] for i, index_i in enumerate(target_input_indexes): x = inputs[index_i] if x in target_inputs[:i]: # Pass ``None`` for duplicated input variables except for # the first occurrence (see the comment above). gx = None elif x in grads: gx = grads[x] elif x.creator_node is None: x._check_old_style_gradient() # accumulate the gradient only if the node is a leaf gx = x.grad_var else: gx = None in_grad.append(gx) gxs = func.backward_accumulate(target_input_indexes, out_grad, in_grad) assert len(gxs) == len(in_grad) for hook in hooks: hook.backward_postprocess(func, in_data, out_grad_data) if is_debug: for gx in gxs: if gx is None: continue gx_data = gx.data cuda.get_device_from_array(gx_data).use() if cuda.get_array_module(gx_data).isnan(gx_data).any(): msg = 'NaN is detected on backward computation' raise RuntimeError(msg) if not retain_grad: for y in outputs: if y is not None and y is not self.node: grads[y] = None y_var = y.get_variable() if y_var is not None: y_var._grad_var = None for i, gx in enumerate(gxs): if gx is None: continue x = target_inputs[i] if not x.requires_grad: continue _check_grad_type(func, x, gx.data) if x in target_inputs[:i]: # Accumulate the duplicated gradients here. See the comment # above the code that builds ``in_grad``. cur_gx = grads[x] grads[x] = gx if cur_gx is None else gx + cur_gx else: grads[x] = gx x_var = x.get_variable() if x_var is not None: x_var._grad_var = grads[x] if x.creator_node is not None: add_cand(x.creator_node) del gxs # to reduce memory usage if initial_device is not None: initial_device.use()
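
# --- Illustrative usage (not part of the original source) ---
# Sketch of the duplicated-input case described in the long comment above:
# when the same variable occupies several input slots (x + x below), the
# gradients flowing into each slot are accumulated into a single grad.
import numpy as np
import chainer

x = chainer.Variable(np.array([3.0], dtype=np.float32))
y = x + x                         # the same node appears twice in func.inputs
y.grad = np.ones_like(y.data)
y.backward()
print(x.grad)                     # -> [2.]
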
def apply(self, inputs): """Computes output variables and grows the computational graph. Basic behavior is expressed in the documentation of :class:`FunctionNode`. .. note:: If the :data:`~Variable.data` attribute of input variables exist on a GPU device, that device is made current before calling :meth:`forward`, so implementors do not need to take care of device selection in most cases. Args: inputs: Tuple of input variables. Each element can be either :class:`~chainer.Variable`, :class:`numpy.ndarray`, or :class:`cupy.ndarray`. If the element is an ndarray, it is automatically wrapped with :class:`~chainer.Variable`. Returns: A tuple of output :class:`~chainer.Variable` objects. """ input_vars = [chainer.as_variable(x) for x in inputs] in_data = tuple([x.data for x in input_vars]) requires_grad = any([x.requires_grad for x in input_vars]) # Check for input array types if not chainer.is_arrays_compatible(in_data): raise TypeError( 'incompatible array types are mixed in the forward input ' '({}).\n' 'Actual: {}'.format( self.label, ', '.join(str(type(x)) for x in in_data))) is_debug = chainer.is_debug() if is_debug: # Keep stack trace for debug self.stack = traceback.extract_stack() if configuration.config.type_check: self._check_data_type_forward(in_data) hooks = chainer.get_function_hooks() if self._n_local_function_hooks > 0: hooks = collections.OrderedDict(hooks) hooks.update(self.local_function_hooks) hooks = hooks.values() # avoid six for performance for hook in hooks: hook.forward_preprocess(self, in_data) # Forward propagation with cuda.get_device_from_array(*in_data): self._input_indexes_to_retain = None self._output_indexes_to_retain = None if chainer.config.schedule_func is not None: outputs = static_forward_optimizations(self, in_data) else: outputs = self.forward(in_data) # Check for output array types if not isinstance(outputs, tuple): raise TypeError( 'forward output must be a tuple ({})\n' 'Actual: {}'.format(self.label, type(outputs))) if not chainer.is_arrays_compatible(outputs): raise TypeError( 'incompatible array types are mixed in the forward output ' '({}).\n' 'Actual: {}'.format( self.label, ', '.join(str(type(x)) for x in outputs))) for hook in hooks: hook.forward_postprocess(self, in_data) # NaN check of output values if is_debug: if any(chainer.backend._contains_nan(out) for out in outputs): msg = ('NaN is detected on forward computation of ' '{}'.format(self.label)) raise RuntimeError(msg) ret = tuple([variable.Variable(y, requires_grad=requires_grad) for y in outputs]) if configuration.config.enable_backprop: # Topological ordering self.rank = max([x.rank for x in input_vars]) if input_vars else 0 # Add backward edges for y in ret: y.creator_node = self self.inputs = tuple([x.node for x in input_vars]) # Add forward edges (must be weak references) self.outputs = tuple([weakref.ref(y.node) for y in ret]) if self._input_indexes_to_retain is not None: for index in self._input_indexes_to_retain: input_vars[index].retain_data() if self._output_indexes_to_retain is not None: retained_data = [] for index in self._output_indexes_to_retain: ret[index].retain_data() retained_data.append(outputs[index]) self._retained_output_data = tuple(retained_data) self.lazy_grad_sum = configuration.config.lazy_grad_sum return ret
def backward(self, retain_grad=False): """Runs error backpropagation (a.k.a. backprop) from this variable. On backprop, :meth:`Function.backward` is called on each :class:`Function` object appearing in the backward graph starting from this variable. The backward graph is represented by backward references from variables to their creators, and from functions to their inputs. The backprop stops at all root variables. Some functions set ``None`` as gradients of some inputs, where further backprop does not take place at such input variables. This method uses :data:`grad` as the initial error array. User can manually set a gradient array before calling this method. If :data:`data` contains only one element (i.e., it is scalar) and :data:`grad` is ``None``, then this method automatically complements 1.0 as the initial error. This is useful on starting backprop from some scalar loss value. Args: retain_grad (bool): If ``True``, the gradient arrays of all intermediate variables are kept. Otherwise, :data:`grad` of the intermediate variables are set to ``None`` on appropriate timing, which may reduce the maximum memory consumption. In most cases of training some models, the purpose of backprop is to compute gradients of parameters, not of variables, so it is recommended to set this flag ``False``. """ if self.creator is None: return initial_device = None if cuda.available: try: initial_device = cuda.Device() except cuda.cupy.cuda.runtime.CUDARuntimeError as e: if e.status != 38: # cudaErrorNoDevice raise is_debug = chainer.is_debug() cand_funcs = [] seen_set = set() seen_vars = set() need_copy = set() # Initialize error by 1, if this is a loss variable if self.data.size == 1 and self.grad is None: with cuda.get_device(self.data) as device: if device is cuda.DummyDevice: self.grad = numpy.ones_like(self.data) else: self.grad = cuda.cupy.ones_like(self.data) def add_cand(cand): if cand not in seen_set: # Negate since heapq is min-heap heapq.heappush(cand_funcs, (-cand.rank, len(seen_set), cand)) seen_set.add(cand) add_cand(self.creator) while cand_funcs: _, _, func = heapq.heappop(cand_funcs) outputs = [y() for y in func.outputs] # access via weak ref in_data = tuple([x.data for x in func.inputs]) out_grad = () # if enable grad accumulate if mkld.enable_acc_gradF((in_data,)) and in_data[0].ndim == 4 and all(isinstance(xi, numpy.ndarray) for xi in in_data): out_grad_tmp = tuple([None if y is None else y.grad for y in outputs]) acc_grad_tuple = tuple([None if y is None else y.acc_grad for y in outputs]) for grad_tmp, acc_grad in zip(out_grad_tmp, acc_grad_tuple): if len(acc_grad) == 0: # no need accumulate, just return grad out_grad += (grad_tmp,) else: """ acc_grad's length is not 0, means need to do grad accumulate call native MKLDNN sum primitive """ y = numpy.empty((grad_tmp.shape), dtype=grad_tmp.dtype) acc_grad += (grad_tmp,) mkldnn_sum = mkldnn.Sum_F32() mkldnn_sum.sum4d_gx(acc_grad, y) out_grad += (y,) else: out_grad = tuple([None if y is None else y.grad for y in outputs]) hooks = chainer.get_function_hooks() if func._n_local_function_hooks != 0: hooks = collections.OrderedDict(hooks) hooks.update(func.local_function_hooks) cuda.get_device(*(in_data + out_grad)).use() for hook in six.itervalues(hooks): hook.backward_preprocess(func, in_data, out_grad) if isinstance(func, chainer.functions.connection.convolution_2d.Convolution2DFunction): _x = func.inputs[0] if _x.creator is None and func.in_chain is True: func.mkldnn_opt = True cosim_output = func.backward_cpu_cosim(in_data, out_grad) gxs = 
func.backward(in_data, out_grad) assert len(gxs) == len(in_data) func.cpu_cosim_verify_result(gxs, cosim_output) for hook in six.itervalues(hooks): hook.backward_postprocess(func, in_data, out_grad) if is_debug: for gx in gxs: if gx is None: continue cuda.get_device(gx).use() if cuda.get_array_module(gx).isnan(gx).any(): msg = 'NaN is detected on backward computation' raise RuntimeError(msg) if not retain_grad: for y in outputs: if y is not None and y is not self: y.grad = None for x, gx in zip(func.inputs, gxs): if gx is None: continue _check_grad_type(func, x, gx) # Accumulate the gradient to x. It is a bit tricky to handle # branches and parameter gradient accumulation correctly. id_x = id(x) if x.creator is None: # leaf if x._grad is None: # 1st visit x.grad = gx need_copy.add(id_x) else: cuda.get_device(gx).use() if id_x in need_copy: # 2nd visit if mkld.enable_acc_gradF((in_data,)) and in_data[0].ndim == 4 and all(isinstance(xi, numpy.ndarray) for xi in in_data): # if enable_acc_grad,will deply to do grad accumulate,only record grad x.acc_grad += (gx,) else: x.grad = utils.force_array(x.grad + gx) # copy need_copy.remove(id_x) # remove from list in 2nd visit else: if mkld.enable_acc_gradF((in_data,)) and in_data[0].ndim == 4 and all(isinstance(xi, numpy.ndarray) for xi in in_data): # if enable_acc_grad, will deply to do grad accumulate, only record grad if len(x.acc_grad) > 0: # means 3rd or later visit for variable x x.acc_grad += (gx,) else: # means this variable is W or b x._grad += gx else: x._grad += gx # 3rd or later visit else: # not a leaf add_cand(x.creator) if id_x not in seen_vars: # 1st visit x.grad = gx seen_vars.add(id_x) need_copy.add(id_x) else: cuda.get_device(gx).use() if id_x in need_copy: # 2nd visit if mkld.enable_acc_gradF((in_data,)) and in_data[0].ndim == 4 and all(isinstance(xi, numpy.ndarray) for xi in in_data): # if enable_acc_grad, will deply to do grad accumulate, only record grad x.acc_grad += (gx,) else: x._grad = utils.force_array(gx + x._grad) # copied need_copy.remove(id_x) else: # 3rd or later visit if mkld.enable_acc_gradF((in_data,)) and in_data[0].ndim == 4 and all(isinstance(xi, numpy.ndarray) for xi in in_data): # if enable_acc_grad, will deply to do grad accumulate, only record grad x.acc_grad += (gx,) else: x._grad += gx del gxs # to reduce memory usage if initial_device is not None: initial_device.use()
def __exit__(self, *_):
    del chainer.get_function_hooks()[self.name]
def _backprop(outputs, inputs, grad_required, retain_grad, grads, loss_scale): candidate_funcs, push_candidate, pop_candidate = _get_ordered_func_heap() for y in outputs: creator = y.creator_node if creator is not None: push_candidate(creator) input_nodes = set(x.node for x in inputs) while candidate_funcs: func = pop_candidate() # Collect the gradients w.r.t. the outputs gys = [] for y_ref in func.outputs: y = y_ref() if y is None: # output is not a part of the selected subgraph and has already # been released. gys.append(None) continue gys.append(grads.get(y, None)) gys = tuple(gys) # Collect the gradients w.r.t. the inputs # # Note (Tokui): when the same variable is passed multiple times as # inputs in the same function (e.g. an expression like f(x, x)), the # current implementation passes None as the current gradient w.r.t. # such an input except for the first one (i.e., it builds gxs like # (gx, None) where gx is the current gradient w.r.t. x). gxs = [] input_indexes = [] selected_inputs = set() for i, x in enumerate(func.inputs): if x not in grad_required: continue input_indexes.append(i) if x in selected_inputs: gxs.append(None) else: gxs.append(grads.get(x, None)) selected_inputs.add(x) gxs = tuple(gxs) input_indexes = tuple(input_indexes) if not input_indexes: continue # Do backward gys = tuple([gy if not isinstance(gy, tuple) else chainer.functions.add(*gy) for gy in gys]) # Call pre-backward hooks hooks = chainer.get_function_hooks() if func._n_local_function_hooks != 0: hooks = collections.OrderedDict(hooks) hooks.update(func.local_function_hooks) hooks = hooks.values() # avoid six for performance in_data = tuple([x.data for x in func.inputs]) out_grad_data = tuple( [None if g is None else g.data for g in gys]) cuda.get_device_from_array(*in_data).use() for hook in hooks: hook.backward_preprocess(func, in_data, out_grad_data) new_gxs = func.backward_accumulate(input_indexes, gys, gxs) # Call post-backward hooks for hook in hooks: hook.backward_postprocess(func, in_data, out_grad_data) # Delete output gradients that are not required to return for y_ref in func.outputs: y = y_ref() if y is not None and y in grads and y not in input_nodes: del grads[y] # Update grads selected_inputs = set() for i, g in zip(input_indexes, new_gxs): if g is None: continue node = func.inputs[i] if node in selected_inputs: # Accumulate the duplicated gradients here cur_gx = grads.get(node, None) if cur_gx is not None: if func.lazy_grad_sum: if x.creator is None: g = _backprop_utils.add(g, cur_gx) else: g = _backprop_utils.concat_variable(g, cur_gx) # cur_gx can't be tuple, the lazy_grad_sum can't # be enabled in its sibling node. else: g = g + cur_gx else: selected_inputs.add(node) grads[node] = g if retain_grad: v = node.get_variable_or_none() if v is not None: v.grad_var = g v._loss_scale = loss_scale creator = node.creator_node if creator is not None: push_candidate(creator)
def _backward_main(self, retain_grad): self._node._check_old_style_gradient() if self.creator_node is None: return initial_device = None if cuda.available and isinstance(self.data, cuda.cupy.ndarray): try: initial_device = cuda.Device() except cuda.cupy.cuda.runtime.CUDARuntimeError as e: if e.status != 38: # cudaErrorNoDevice raise is_debug = chainer.is_debug() cand_funcs = [] seen_set = set() grads = {} # Initialize error by 1, if this is a loss variable if self.data.size == 1 and self._grad_var is None: with cuda.get_device_from_array(self.data) as device: if device is cuda.DummyDevice: self.grad = numpy.ones_like(self.data) else: self.grad = cuda.cupy.ones_like(self.data) grads[self._node] = self._grad_var def add_cand(cand): if cand not in seen_set: # Negate since heapq is min-heap heapq.heappush(cand_funcs, (-cand.rank, len(seen_set), cand)) seen_set.add(cand) add_cand(self.creator_node) def get_grad(node): if node is None: return None if node in grads: return grads[node] return node.grad_var while cand_funcs: _, _, func = heapq.heappop(cand_funcs) inputs = func.inputs target_input_indexes = [ i for i, x in enumerate(inputs) if x.requires_grad ] if not target_input_indexes: continue outputs = [y() for y in func.outputs] # access via weak ref in_data = tuple([x.data for x in inputs]) out_grad = tuple([get_grad(y) for y in outputs]) out_grad_data = tuple( [None if g is None else g.data for g in out_grad]) hooks = chainer.get_function_hooks() if func._n_local_function_hooks != 0: hooks = collections.OrderedDict(hooks) hooks.update(func.local_function_hooks) hooks = hooks.values() # avoid six for performance cuda.get_device_from_array(*in_data).use() for hook in hooks: hook.backward_preprocess(func, in_data, out_grad_data) # Collect the current input gradients. # # Note (Tokui): When the same variable is passed to multiple input # slots (e.g. an expression like ``f(x, x)``), it makes the # gradient accumulation complicated since the back-propagated # gradients w.r.t. the first and second argument should be # accumulated to the current gradient w.r.t. the same variable. # In this case, the current implementation passes the current # gradient only to the first occurrence of the variable in the # input tuple and passes ``None`` to the rest of the occurrences. # For example, when the input variables are ``(x, x)``, the # input gradient passed to the ``backward_accumulate`` method is # ``(gx, None)`` where ``gx`` is the current gradient of ``x``. # See also the docstring of ``FunctionNode.backward_accumulate``. target_inputs = [inputs[i] for i in target_input_indexes] in_grad = [] for i, index_i in enumerate(target_input_indexes): x = inputs[index_i] if x in target_inputs[:i]: # Pass ``None`` for duplicated input variables except for # the first occurrence (see the comment above). 
gx = None elif x in grads: gx = grads[x] elif x.creator_node is None: x._check_old_style_gradient() # accumulate the gradient only if the node is a leaf gx = x.grad_var else: gx = None in_grad.append(gx) gxs = func.backward_accumulate(target_input_indexes, out_grad, in_grad) assert len(gxs) == len(in_grad) for hook in hooks: hook.backward_postprocess(func, in_data, out_grad_data) if is_debug: for gx in gxs: if gx is None: continue gx_data = gx.data if gx_data.dtype.kind == 'f': cuda.get_device_from_array(gx_data).use() if cuda.get_array_module(gx_data).isnan(gx_data).any(): raise RuntimeError( 'NaN is detected on backward computation of ' '{}'.format(func.label)) if not retain_grad: for y in outputs: if y is not None and y is not self.node: grads[y] = None y_var = y.get_variable_or_none() if y_var is not None: y_var._grad_var = None for i, gx in enumerate(gxs): if gx is None: continue x = target_inputs[i] if not x.requires_grad: continue _check_grad_type(func, x, gx.data) if x in target_inputs[:i]: # Accumulate the duplicated gradients here. See the comment # above the code that builds ``in_grad``. cur_gx = grads[x] grads[x] = gx if cur_gx is None else gx + cur_gx else: grads[x] = gx x_var = x.get_variable_or_none() if x_var is not None: x_var._grad_var = grads[x] if x.creator_node is not None: add_cand(x.creator_node) del gxs # to reduce memory usage if initial_device is not None: initial_device.use()
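
# --- Illustrative usage (not part of the original source) ---
# Sketch of the retain_grad flag handled above: by default the gradients of
# intermediate variables are dropped once they have been propagated; with
# retain_grad=True they remain accessible after backward().
import numpy as np
import chainer
import chainer.functions as F

x = chainer.Variable(np.array([1.0, 2.0], dtype=np.float32))
h = x * x                         # intermediate variable
loss = F.sum(h)
loss.backward(retain_grad=True)
print(h.grad)                     # kept: [1., 1.]
print(x.grad)                     # -> [2., 4.]
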
def _backward_main(self, retain_grad, loss_scale): self._node._check_old_style_gradient() if self.creator_node is None: return cand_funcs = [] seen_set = set() grads = _backprop_utils.GradTable(load_if_new=True) # Initialize error by 1, if this is a loss variable if self.data.size == 1 and self._grad_var is None: if self.data.ndim != 0: warnings.warn( 'Treating a scalar as a variable with only one element' ' in Variable.backward is deprecated. A scalar variable' ' must be a 0-dimensional array. Apply' ' chainer.functions.squeeze to obtain a scalar variable.' ' If the size of this variable accidentally becomes one,' ' set zero to grad.', DeprecationWarning) with cuda.get_device_from_array(self.data) as device: if device is cuda.DummyDevice: self.grad = numpy.ones_like(self.data) else: self.grad = cuda.cupy.ones_like(self.data) if loss_scale is not None: self.grad *= loss_scale grads[self._node] = self._grad_var def add_cand(cand): if cand not in seen_set: # Negate since heapq is min-heap heapq.heappush(cand_funcs, (-cand.rank, len(seen_set), cand)) seen_set.add(cand) add_cand(self.creator_node) leaf_nodes = set() while cand_funcs: _, _, func = heapq.heappop(cand_funcs) inputs = func.inputs target_input_indexes = tuple([ i for i, x in enumerate(inputs) if x.requires_grad ]) outputs = [y() for y in func.outputs] # access via weak ref out_grad = tuple([grads.pop(y) for y in outputs]) if not target_input_indexes: continue in_data = tuple([x.data for x in inputs]) out_grad_data = tuple( [None if g is None else g.data for g in out_grad]) hooks = chainer.get_function_hooks() if func._n_local_function_hooks != 0: hooks = collections.OrderedDict(hooks) hooks.update(func.local_function_hooks) hooks = hooks.values() # avoid six for performance with cuda.get_device_from_array(*(in_data + out_grad_data)): for hook in hooks: hook.backward_preprocess(func, in_data, out_grad_data) # Collect the current input gradients. target_inputs = [inputs[i] for i in target_input_indexes] # Keep the order for the portability, rather than # in_grad = {x: grads.get_as_list(x) # for x in set(target_inputs)} in_grad = collections.OrderedDict() for x in target_inputs: if x not in in_grad: in_grad[x] = grads.get_as_list(x) # to reduce memory usage x._set_grad_var_if_available(None) _backprop_utils.backprop_step( func, target_input_indexes, out_grad, in_grad) for hook in hooks: hook.backward_postprocess(func, in_data, out_grad_data) for y, gy in six.moves.zip(outputs, out_grad): if y is not None and y is not self.node: y._set_grad_var_if_available( gy if retain_grad else None) del gy, out_grad # to reduce memory usage for x, gx in in_grad.items(): if not gx: # gradient == None continue for gx_elem in gx: _check_grad_type(func, x, gx_elem.data) del gx_elem # to reduce memory usage if x.creator_node is None: # leaf leaf_nodes.add(x) else: add_cand(x.creator_node) del gx, in_grad # to reduce memory usage for x in leaf_nodes: x_var = x.get_variable_or_none() gx = grads.pop(x) if x_var is not None: x_var._grad_var = gx x_var._loss_scale = loss_scale grads.assert_no_grads()
def __exit__(self, *_):
    chainer.get_function_hooks()[self.name].deleted()
    del chainer.get_function_hooks()[self.name]
def __call__(self, *inputs): """Applies forward propagation with chaining backward references. Basic behavior is expressed in documentation of :class:`Function` class. .. note:: If the :data:`~Variable.data` attribute of input variables exist on GPU device, then, before it calls :meth:`forward` method, the appropriate device is selected, so in most cases implementers do not need to take care of device selection. Args: inputs: Tuple of input :class:`Variable`, :class:`numpy.ndarray` or :class:`cupy.ndarray` objects. If the input is an :class:`numpy.ndarray` or a :class:`cupy.ndarray`, it is automatically wrapped with :class:`Variable`. Returns: One :class:`Variable` object or a tuple of multiple :class:`Variable` objects. """ inputs = [ x if isinstance(x, variable.Variable) else variable.Variable( x, requires_grad=False) for x in inputs ] in_data = tuple([x.data for x in inputs]) requires_grad = any([x.requires_grad for x in inputs]) if chainer.is_debug(): self._stack = traceback.extract_stack() if configuration.config.type_check: self._check_data_type_forward(in_data) hooks = chainer.get_function_hooks() if self._n_local_function_hooks != 0: hooks = collections.OrderedDict(hooks) hooks.update(self.local_function_hooks) for hook in six.itervalues(hooks): hook.forward_preprocess(self, in_data) # Forward prop with cuda.get_device_from_array(*in_data): self._input_indexes_to_retain = None self._output_indexes_to_retain = None outputs = self.forward(in_data) assert type(outputs) == tuple for hook in six.itervalues(hooks): hook.forward_postprocess(self, in_data) if chainer.is_debug(): if any(out.dtype.kind == 'f' and cuda.get_array_module(out).isnan(out).any() for out in outputs): msg = 'NaN is detected on forward computation' raise RuntimeError(msg) ret = tuple([ variable.Variable(y, requires_grad=requires_grad) for y in outputs ]) if configuration.config.enable_backprop: # Topological ordering self.rank = max([x.rank for x in inputs]) if inputs else 0 # Backward edges for y in ret: y.set_creator(self) self.inputs = tuple([x.node for x in inputs]) # Forward edges (must be weak references) self.outputs = tuple([weakref.ref(y.node) for y in ret]) input_indexes_to_retain = self._input_indexes_to_retain if input_indexes_to_retain is None: # input arrays are retained by default input_indexes_to_retain = six.moves.range(len(inputs)) for index in input_indexes_to_retain: inputs[index].retain_data() del self._input_indexes_to_retain output_indexes_to_retain = self._output_indexes_to_retain if output_indexes_to_retain is not None: for index in output_indexes_to_retain: ret[index].retain_data() del self._output_indexes_to_retain if len(ret) == 1: return ret[0] else: return ret
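
# --- Illustrative usage (not part of the original source) ---
# Sketch of the automatic wrapping performed at the top of __call__ above:
# raw arrays passed in are converted to Variables (with requires_grad=False
# in this version), so both calls below are equivalent apart from gradient
# tracking of the input. Which code path F.relu takes depends on the Chainer
# version.
import numpy as np
import chainer
import chainer.functions as F

a = np.arange(6, dtype=np.float32).reshape(2, 3)
y1 = F.relu(a)                    # ndarray is wrapped internally
y2 = F.relu(chainer.Variable(a))  # explicit Variable
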
def _backward_main(self, retain_grad):
    self._node._check_old_style_gradient()
    if self.creator_node is None:
        return
    initial_device = None
    if cuda.available and isinstance(self.data, cuda.cupy.ndarray):
        try:
            initial_device = cuda.Device()
        except cuda.cupy.cuda.runtime.CUDARuntimeError as e:
            if e.status != 38:  # cudaErrorNoDevice
                raise

    is_debug = chainer.is_debug()

    cand_funcs = []
    seen_set = set()
    grads = {}

    # Initialize error by 1, if this is a loss variable
    if self.data.size == 1 and self._grad_var is None:
        with cuda.get_device_from_array(self.data) as device:
            if device is cuda.DummyDevice:
                self.grad = numpy.ones_like(self.data)
            else:
                self.grad = cuda.cupy.ones_like(self.data)
    grads[self._node] = self._grad_var

    def add_cand(cand):
        if cand not in seen_set:
            # Negate since heapq is min-heap
            heapq.heappush(cand_funcs, (-cand.rank, len(seen_set), cand))
            seen_set.add(cand)

    add_cand(self.creator_node)

    def get_grad(node):
        if node is None:
            return None
        if node in grads:
            return grads[node]
        return node.grad_var

    while cand_funcs:
        _, _, func = heapq.heappop(cand_funcs)
        inputs = func.inputs
        target_input_indexes = [
            i for i, x in enumerate(inputs) if x.requires_grad
        ]
        if not target_input_indexes:
            continue
        outputs = [y() for y in func.outputs]  # access via weak ref

        in_data = tuple([x.data for x in inputs])
        out_grad = tuple([get_grad(y) for y in outputs])
        out_grad_data = tuple(
            [None if g is None else g.data for g in out_grad])
        hooks = chainer.get_function_hooks()
        if func._n_local_function_hooks != 0:
            hooks = collections.OrderedDict(hooks)
            hooks.update(func.local_function_hooks)
        hooks = hooks.values()  # avoid six for performance

        cuda.get_device_from_array(*in_data).use()
        for hook in hooks:
            hook.backward_preprocess(func, in_data, out_grad_data)

        # Collect the current input gradients.
        #
        # Note (Tokui): When the same variable is passed to multiple input
        # slots (e.g. an expression like ``f(x, x)``), it makes the
        # gradient accumulation complicated since the back-propagated
        # gradients w.r.t. the first and second argument should be
        # accumulated to the current gradient w.r.t. the same variable.
        # In this case, the current implementation passes the current
        # gradient only to the first occurrence of the variable in the
        # input tuple and passes ``None`` to the rest of the occurrences.
        # For example, when the input variables are ``(x, x)``, the
        # input gradient passed to the ``backward_accumulate`` method is
        # ``(gx, None)`` where ``gx`` is the current gradient of ``x``.
        # See also the docstring of ``FunctionNode.backward_accumulate``.
        target_inputs = [inputs[i] for i in target_input_indexes]
        in_grad = []
        for i, index_i in enumerate(target_input_indexes):
            x = inputs[index_i]
            if x in target_inputs[:i]:
                # Pass ``None`` for duplicated input variables except for
                # the first occurrence (see the comment above).
                gx = None
            elif x in grads:
                gx = grads[x]
            elif x.creator_node is None:
                x._check_old_style_gradient()
                # accumulate the gradient only if the node is a leaf
                gx = x.grad_var
            else:
                gx = None
            in_grad.append(gx)

        gxs = func.backward_accumulate(
            target_input_indexes, out_grad, in_grad)

        assert len(gxs) == len(in_grad)
        for hook in hooks:
            hook.backward_postprocess(func, in_data, out_grad_data)

        if is_debug:
            for gx in gxs:
                if gx is None:
                    continue
                gx_data = gx.data
                if gx_data.dtype.kind == 'f':
                    cuda.get_device_from_array(gx_data).use()
                    if cuda.get_array_module(gx_data).isnan(gx_data).any():
                        raise RuntimeError(
                            'NaN is detected on backward computation of '
                            '{}'.format(func.label))

        if not retain_grad:
            for y in outputs:
                if y is not None and y is not self.node:
                    grads[y] = None
                    y_var = y.get_variable_or_none()
                    if y_var is not None:
                        y_var._grad_var = None

        for i, gx in enumerate(gxs):
            if gx is None:
                continue

            x = target_inputs[i]
            if not x.requires_grad:
                continue

            _check_grad_type(func, x, gx.data)

            if x in target_inputs[:i]:
                # Accumulate the duplicated gradients here. See the comment
                # above the code that builds ``in_grad``.
                cur_gx = grads[x]
                grads[x] = gx if cur_gx is None else gx + cur_gx
            else:
                grads[x] = gx

            x_var = x.get_variable_or_none()
            if x_var is not None:
                x_var._grad_var = grads[x]

            if x.creator_node is not None:
                add_cand(x.creator_node)

        del gxs  # to reduce memory usage

    if initial_device is not None:
        initial_device.use()
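# Illustrative sketch (hypothetical example, not Chainer source): the
# ``f(x, x)`` case described in the note above. When the same variable fills
# two input slots, the two back-propagated gradients are accumulated into a
# single ``x.grad``.
import numpy as np
import chainer

x = chainer.Variable(np.array([3.0], dtype=np.float32))
y = x + x                       # both input slots refer to the same node
y.grad = np.ones_like(y.data)   # set the initial error explicitly
y.backward()
# x.grad is [2.]: the gradients w.r.t. both occurrences were summed.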
def apply(self, inputs):
    """Computes output variables and grows the computational graph.

    Basic behavior is expressed in the documentation of
    :class:`FunctionNode`.

    .. note::

       If the :data:`~Variable.data` attribute of input variables exists
       on a GPU device, that device is made current before calling
       :meth:`forward`, so implementors do not need to take care of
       device selection in most cases.

    Args:
        inputs: Tuple of input variables. Each element can be either
            :class:`Variable`, :class:`numpy.ndarray`, or
            :class:`cupy.ndarray`. If the element is an ndarray, it is
            automatically wrapped with :class:`Variable`.

    Returns:
        A tuple of output :class:`Variable` objects.

    """
    input_vars = [chainer.as_variable(x) for x in inputs]
    in_data = tuple([x.data for x in input_vars])
    requires_grad = any([x.requires_grad for x in input_vars])

    if chainer.is_debug():
        self.stack = traceback.extract_stack()

    if configuration.config.type_check:
        self._check_data_type_forward(in_data)

    hooks = chainer.get_function_hooks()
    if self._n_local_function_hooks > 0:
        hooks = collections.OrderedDict(hooks)
        hooks.update(self.local_function_hooks)
    hooks = hooks.values()  # avoid six for performance

    for hook in hooks:
        hook.forward_preprocess(self, in_data)

    # Forward propagation
    with cuda.get_device_from_array(*in_data):
        self._input_indexes_to_retain = None
        self._output_indexes_to_retain = None
        outputs = self.forward(in_data)
        assert type(outputs) is tuple

    for hook in hooks:
        hook.forward_postprocess(self, in_data)

    # NaN check of output values
    if chainer.is_debug():
        if any(out.dtype.kind == 'f' and
               cuda.get_array_module(out).isnan(out).any()
               for out in outputs):
            msg = ('NaN is detected on forward computation of '
                   '{}'.format(self.label))
            raise RuntimeError(msg)

    ret = tuple([
        variable.Variable(y, requires_grad=requires_grad)
        for y in outputs
    ])

    if configuration.config.enable_backprop:
        # Topological ordering
        self.rank = max([x.rank for x in input_vars]) if input_vars else 0
        # Add backward edges
        for i, y in enumerate(ret):
            y.creator_node = self
        self.inputs = tuple([x.node for x in input_vars])
        # Add forward edges (must be weak references)
        self.outputs = tuple([weakref.ref(y.node) for y in ret])

        if self._input_indexes_to_retain is not None:
            for index in self._input_indexes_to_retain:
                input_vars[index].retain_data()

        if self._output_indexes_to_retain is not None:
            retained_data = []
            for index in self._output_indexes_to_retain:
                ret[index].retain_data()
                retained_data.append(outputs[index])
            self._retained_output_data = tuple(retained_data)

    return ret
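# Illustrative sketch (hypothetical example, not Chainer source): the
# new-style ``FunctionNode`` API that ``apply`` above serves. ``MulByTwo``
# is an invented class name used only for illustration.
import numpy as np
import chainer
from chainer import FunctionNode

class MulByTwo(FunctionNode):
    def forward(self, inputs):
        x, = inputs
        return 2 * x,                     # a tuple of output arrays

    def backward(self, target_input_indexes, grad_outputs):
        gy, = grad_outputs
        return 2 * gy,                    # a tuple of Variable gradients

x = chainer.Variable(np.arange(3, dtype=np.float32))
y, = MulByTwo().apply((x,))               # apply always returns a tuple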
def _backprop(outputs, inputs, grad_required, retain_grad, grads,
              loss_scale):
    candidate_funcs, push_candidate, pop_candidate = _get_ordered_func_heap()

    for y in outputs:
        creator = y.creator_node
        if creator is not None:
            push_candidate(creator)

    input_nodes = set(x.node for x in inputs)
    ret_dict = {}

    while candidate_funcs:
        func = pop_candidate()

        # Collect the gradients w.r.t. the outputs
        ys = [y() for y in func.outputs]  # access via weak ref
        gys = tuple([grads.pop(y) for y in ys])

        for node, gy in six.moves.zip(ys, gys):
            if node is not None:
                if node in input_nodes:
                    ret_dict[node] = gy

                if retain_grad:
                    y = node.get_variable_or_none()
                    if y is not None:
                        y.grad_var = gy
                        y._loss_scale = loss_scale

        # Collect the gradients w.r.t. the inputs
        input_indexes = []
        x_grads = collections.OrderedDict()
        for i, x in enumerate(func.inputs):
            if x not in grad_required:
                continue
            input_indexes.append(i)
            if x not in x_grads:
                x_grads[x] = grads.get_as_list(x)
        if not input_indexes:
            continue
        input_indexes = tuple(input_indexes)

        # Do backward

        # Call pre-backward hooks
        hooks = chainer.get_function_hooks()
        if func._n_local_function_hooks != 0:
            hooks = collections.OrderedDict(hooks)
            hooks.update(func.local_function_hooks)
        hooks = hooks.values()  # avoid six for performance

        in_data = tuple([x.data for x in func.inputs])
        out_grad_data = tuple([None if g is None else g.data for g in gys])

        with cuda.get_device_from_array(*in_data):
            for hook in hooks:
                hook.backward_preprocess(func, in_data, out_grad_data)

            _backprop_utils.backprop_step(func, input_indexes, gys, x_grads)

            # Call post-backward hooks
            for hook in hooks:
                hook.backward_postprocess(func, in_data, out_grad_data)

        # Update grads
        for node, g in x_grads.items():
            if not g:  # gradient == None
                continue

            creator = node.creator_node
            if creator is not None:
                push_candidate(creator)

    for x in input_nodes:
        if x not in ret_dict:
            ret_dict[x] = grads.pop(x)
    return ret_dict
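# Illustrative sketch (hypothetical example, not Chainer source): the
# GradTable-based backprop above is the kind of machinery behind functional
# gradient computation such as ``chainer.grad``. Rough usage sketch; the
# initial output gradient is passed explicitly here.
import numpy as np
import chainer
import chainer.functions as F

x = chainer.Variable(np.array([2.0], dtype=np.float32))
y = F.sum(x ** 2)
gy = chainer.Variable(np.array(1.0, dtype=np.float32))
gx, = chainer.grad([y], [x], grad_outputs=[gy])
# gx.data is [4.]; x.grad itself is left untouched unless set_grad=True.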
def _backward_main(self, retain_grad, loss_scale):
    self._node._check_old_style_gradient()
    if self.creator_node is None:
        return

    cand_funcs = []
    seen_set = set()
    grads = _backprop_utils.GradTable(load_if_new=True)

    # Initialize error by 1, if this is a loss variable
    if self.array.size == 1 and self._grad_var is None:
        if self.array.ndim != 0:
            warnings.warn(
                'Treating a scalar as a variable with only one element'
                ' in Variable.backward is deprecated. A scalar variable'
                ' must be a 0-dimensional array. Apply'
                ' chainer.functions.squeeze to obtain a scalar variable.'
                ' If the size of this variable accidentally becomes one,'
                ' set zero to grad.',
                DeprecationWarning)
        with cuda.get_device_from_array(self.array) as device:
            if device is cuda.DummyDevice:
                self.grad = numpy.ones_like(self.array)
            else:
                self.grad = cuda.cupy.ones_like(self.array)
        if loss_scale is not None:
            self.grad *= loss_scale
    grads[self._node] = self._grad_var

    def add_cand(cand):
        if cand not in seen_set:
            # Negate since heapq is min-heap
            heapq.heappush(cand_funcs, (-cand.rank, len(seen_set), cand))
            seen_set.add(cand)

    add_cand(self.creator_node)
    leaf_nodes = set()

    while cand_funcs:
        _, _, func = heapq.heappop(cand_funcs)
        inputs = func.inputs
        target_input_indexes = tuple([
            i for i, x in enumerate(inputs) if x.requires_grad
        ])
        outputs = [y() for y in func.outputs]  # access via weak ref
        out_grad = tuple([grads.pop(y) for y in outputs])
        if not target_input_indexes:
            continue

        in_data = tuple([x.data for x in inputs])
        out_grad_array = tuple(
            [None if g is None else g.array for g in out_grad])
        hooks = chainer.get_function_hooks()
        if func._n_local_function_hooks != 0:
            hooks = collections.OrderedDict(hooks)
            hooks.update(func.local_function_hooks)
        hooks = hooks.values()  # avoid six for performance

        with cuda.get_device_from_array(*(in_data + out_grad_array)):
            for hook in hooks:
                hook.backward_preprocess(func, in_data, out_grad_array)

            # Collect the current input gradients.
            target_inputs = [inputs[i] for i in target_input_indexes]
            # Keep the order for the portability, rather than
            # in_grad = {x: grads.get_as_list(x)
            #            for x in set(target_inputs)}
            in_grad = collections.OrderedDict()
            for x in target_inputs:
                if x not in in_grad:
                    in_grad[x] = grads.get_as_list(x)
                    # to reduce memory usage
                    x._set_grad_var_if_available(None)

            _backprop_utils.backprop_step(
                func, target_input_indexes, out_grad, in_grad)

            for hook in hooks:
                hook.backward_postprocess(func, in_data, out_grad_array)

        for y, gy in six.moves.zip(outputs, out_grad):
            if y is not None and y is not self.node:
                y._set_grad_var_if_available(
                    gy if retain_grad else None)
        del gy, out_grad  # to reduce memory usage

        for x, gx in in_grad.items():
            if not gx:  # gradient == None
                continue

            for gx_elem in gx:
                _check_grad_type(func, x, gx_elem.array)
            del gx_elem  # to reduce memory usage

            if x.creator_node is None:  # leaf
                leaf_nodes.add(x)
            else:
                add_cand(x.creator_node)
        del gx, in_grad  # to reduce memory usage

    for x in leaf_nodes:
        x_var = x.get_variable_or_none()
        gx = grads.pop(x)
        if x_var is not None:
            x_var._grad_var = gx
            x_var._loss_scale = loss_scale
    grads.assert_no_grads()
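# Illustrative sketch (hypothetical example, not Chainer source): how
# ``loss_scale`` enters the backward pass above. The initial gradient is
# multiplied by the scale, and the scale is remembered on leaf variables
# (``_loss_scale``) so an optimizer can divide it back out during the
# update, e.g. for mixed-precision training.
import numpy as np
import chainer
import chainer.functions as F

x = chainer.Variable(np.array(3.0, dtype=np.float32))
loss = F.square(x)
loss.backward(loss_scale=128.0)
# x.grad is 2 * 3 * 128 = 768; the optimizer is expected to unscale it.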
def apply(self, inputs):
    """Computes output variables and grows the computational graph.

    Basic behavior is expressed in the documentation of
    :class:`FunctionNode`.

    .. note::

       If the :data:`~Variable.data` attribute of input variables exists
       on a GPU device, that device is made current before calling
       :meth:`forward`, so implementors do not need to take care of
       device selection in most cases.

    Args:
        inputs: Tuple of input variables. Each element can be either
            :class:`~chainer.Variable`, :class:`numpy.ndarray`, or
            :class:`cupy.ndarray`. If the element is an ndarray, it is
            automatically wrapped with :class:`~chainer.Variable`.

    Returns:
        A tuple of output :class:`~chainer.Variable` objects.

    """
    chainerx_in_data = None
    chainerx_device = None
    is_chainerx, in_data = _extract_apply_in_data(inputs)

    if is_chainerx:
        # Try ChainerX C++ implementation.
        # If it's supported, the output arrays are wrapped with Variables
        # and returned.
        # If not supported, FunctionNode.forward_chainerx should return
        # Fallback.
        # In that case the input arrays are converted to numpy.ndarray
        # or cupy.ndarray (depending on the ChainerX backend) and
        # forward computation falls back to the conventional
        # FunctionNode.forward() implementation.
        outputs = self.forward_chainerx(in_data)

        if outputs is not chainer.Fallback:
            # Supported. Wrap with variables and return
            assert isinstance(outputs, tuple)
            return tuple([
                variable.Variable._init_unchecked(
                    y, requires_grad=y.is_backprop_required(),
                    is_chainerx_array=True)
                for y in outputs
            ])

        # Fall back to FunctionNode.forward()
        chainerx_in_data, in_data, chainerx_device = (
            self._chainerx_apply_fallback_preprocess(in_data, inputs))
        self._is_chainerx_fallback_mode = True
        self.chainerx_device = chainerx_device

    utils._check_arrays_forward_compatible(in_data, self.label)

    is_debug = chainer.is_debug()
    if is_debug:
        # Keep stack trace for debug
        self.stack = traceback.extract_stack()

    if configuration.config.type_check:
        self._check_data_type_forward(in_data)

    hooks = chainer.get_function_hooks()
    if self._n_local_function_hooks > 0:
        hooks = collections.OrderedDict(hooks)
        hooks.update(self.local_function_hooks)
    hooks = hooks.values()  # avoid six for performance

    for hook in hooks:
        hook.forward_preprocess(self, in_data)

    # Forward propagation
    with cuda.get_device_from_array(*in_data):
        self._input_indexes_to_retain = None
        self._output_indexes_to_retain = None
        if chainer.config.schedule_func is not None:
            outputs = static_forward_optimizations(self, in_data)
        elif self._is_chainerx_fallback_mode:
            # In ChainerX fallback, __class__ is temporarily replaced with
            # the fabricated one with automatic attribute fallback.
            with _chainerx_attribute_fallback(self, chainerx_device):
                outputs = self.forward(in_data)
        else:
            # In the normal case, simply run the forward method.
            outputs = self.forward(in_data)

    # Check for output array types
    if not isinstance(outputs, tuple):
        raise TypeError(
            'forward output must be a tuple ({})\n'
            'Actual: {}'.format(self.label, type(outputs)))

    if not chainer.is_arrays_compatible(outputs):
        raise TypeError(
            'incompatible array types are mixed in the forward output '
            '({}).\n'
            'Actual: {}'.format(
                self.label,
                ', '.join(str(type(x)) for x in outputs)))

    for hook in hooks:
        hook.forward_postprocess(self, in_data)

    # NaN check of output values
    if is_debug:
        if any(chainer.backend._contains_nan(out) for out in outputs):
            msg = ('NaN is detected on forward computation of '
                   '{}'.format(self.label))
            raise RuntimeError(msg)

    self._output_count = len(outputs)

    if self._is_chainerx_fallback_mode:
        ret = self._chainerx_apply_fallback_postprocess(
            chainerx_in_data, inputs, outputs)
    else:
        input_vars = [chainer.as_variable(x) for x in inputs]
        requires_grad = any([x.requires_grad for x in input_vars])

        ret = tuple([
            variable.Variable(y, requires_grad=requires_grad)
            for y in outputs
        ])

        if configuration.config.enable_backprop:
            # Topological ordering
            self.rank = max(
                [x.rank for x in input_vars]) if input_vars else 0
            # Add backward edges
            for y in ret:
                y.creator_node = self
            self.inputs = tuple([x.node for x in input_vars])
            # Add forward edges (must be weak references)
            self.outputs = tuple([weakref.ref(y.node) for y in ret])

            if self._input_indexes_to_retain is not None:
                for index in self._input_indexes_to_retain:
                    input_vars[index].retain_data()

            if self._output_indexes_to_retain is not None:
                retained_data = []
                for index in self._output_indexes_to_retain:
                    ret[index].retain_data()
                    retained_data.append(outputs[index])
                self._retained_output_data = tuple(retained_data)

            self.lazy_grad_sum = configuration.config.lazy_grad_sum

    return ret
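# Illustrative sketch (hypothetical example, not Chainer source): the
# ChainerX path handled above. A node may implement ``forward_chainerx`` to
# compute natively on chainerx arrays, or return ``chainer.Fallback`` so
# that ``apply`` converts the inputs to numpy/cupy and calls the regular
# ``forward``. ``ScaleExample`` is an invented class; actually taking the
# ChainerX branch requires a ChainerX-enabled build.
import chainer
from chainer import FunctionNode

class ScaleExample(FunctionNode):
    def forward_chainerx(self, inputs):
        # No native ChainerX kernel here; request the fallback path.
        return chainer.Fallback

    def forward(self, inputs):
        x, = inputs
        return 2 * x,

    def backward(self, target_input_indexes, grad_outputs):
        gy, = grad_outputs
        return 2 * gy,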
def backward(self, retain_grad=False):
    """Runs error backpropagation (a.k.a. backprop) from this variable.

    On backprop, :meth:`Function.backward` is called on each
    :class:`Function` object appearing in the backward graph starting
    from this variable. The backward graph is represented by backward
    references from variables to their creators, and from functions to
    their inputs. The backprop stops at all root variables. Some
    functions set ``None`` as gradients of some inputs, in which case
    further backprop does not take place at such input variables.

    This method uses :data:`grad` as the initial error array. Users can
    manually set a gradient array before calling this method. If
    :data:`data` contains only one element (i.e., it is scalar) and
    :data:`grad` is ``None``, then this method automatically supplies
    1.0 as the initial error. This is useful when starting backprop
    from a scalar loss value.

    Args:
        retain_grad (bool): If ``True``, the gradient arrays of all
            intermediate variables are kept. Otherwise, the :data:`grad`
            attributes of the intermediate variables are set to ``None``
            at an appropriate time, which may reduce the maximum memory
            consumption. In most model-training cases the purpose of
            backprop is to compute gradients of parameters, not of all
            variables, so it is recommended to leave this flag set to
            ``False``.

    """
    if self.creator is None:
        return
    initial_device = None
    if cuda.available and isinstance(self.data, cuda.cupy.ndarray):
        try:
            initial_device = cuda.Device()
        except cuda.cupy.cuda.runtime.CUDARuntimeError as e:
            if e.status != 38:  # cudaErrorNoDevice
                raise

    is_debug = chainer.is_debug()

    cand_funcs = []
    seen_set = set()
    seen_vars = set()
    need_copy = set()

    # Initialize error by 1, if this is a loss variable
    if self.data.size == 1 and self.grad is None:
        with cuda.get_device_from_array(self.data) as device:
            if device is cuda.DummyDevice:
                self.grad = numpy.ones_like(self.data)
            else:
                self.grad = cuda.cupy.ones_like(self.data)

    def add_cand(cand):
        if cand not in seen_set:
            # Negate since heapq is min-heap
            heapq.heappush(cand_funcs, (-cand.rank, len(seen_set), cand))
            seen_set.add(cand)

    add_cand(self.creator)

    while cand_funcs:
        _, _, func = heapq.heappop(cand_funcs)
        outputs = [y() for y in func.outputs]  # access via weak ref
        in_data = tuple([x.data for x in func.inputs])
        out_grad = tuple([None if y is None else y.grad for y in outputs])
        hooks = chainer.get_function_hooks()
        if func._n_local_function_hooks != 0:
            hooks = collections.OrderedDict(hooks)
            hooks.update(func.local_function_hooks)

        cuda.get_device_from_array(*(in_data + out_grad)).use()
        for hook in six.itervalues(hooks):
            hook.backward_preprocess(func, in_data, out_grad)
        gxs = func.backward(in_data, out_grad)
        assert len(gxs) == len(in_data)
        for hook in six.itervalues(hooks):
            hook.backward_postprocess(func, in_data, out_grad)

        if is_debug:
            for gx in gxs:
                if gx is None:
                    continue
                cuda.get_device_from_array(gx).use()
                if cuda.get_array_module(gx).isnan(gx).any():
                    msg = 'NaN is detected on backward computation'
                    raise RuntimeError(msg)

        if not retain_grad:
            for y in outputs:
                if y is not None and y is not self:
                    y.grad = None
        for x, gx in zip(func.inputs, gxs):
            if gx is None:
                continue

            _check_grad_type(func, x, gx)

            # Accumulate the gradient to x. It is a bit tricky to handle
            # branches and parameter gradient accumulation correctly.
            id_x = id(x)
            if x.creator is None:  # leaf
                if x._grad is None:
                    x.grad = gx
                    need_copy.add(id_x)
                else:
                    cuda.get_device_from_array(gx).use()
                    if id_x in need_copy:
                        x.grad = utils.force_array(x.grad + gx)  # copy
                        need_copy.remove(id_x)
                    else:
                        x._grad += gx
            else:  # not a leaf
                add_cand(x.creator)
                if id_x not in seen_vars:  # 1st visit
                    x.grad = gx
                    seen_vars.add(id_x)
                    need_copy.add(id_x)
                else:
                    cuda.get_device_from_array(gx).use()
                    if id_x in need_copy:  # 2nd visit
                        x._grad = utils.force_array(gx + x._grad)  # copied
                        need_copy.remove(id_x)
                    else:  # 3rd or later visit
                        x._grad += gx
        del gxs  # to reduce memory usage

    if initial_device is not None:
        initial_device.use()
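# Illustrative sketch (hypothetical example, not Chainer source): the
# ``retain_grad`` flag documented above. With the default ``False`` the
# gradients of intermediate variables are released during backprop; with
# ``True`` they are kept.
import numpy as np
import chainer

x = chainer.Variable(np.array(2.0, dtype=np.float32))
h = x * 3                    # intermediate variable
loss = h * h
loss.backward(retain_grad=True)
# x.grad is 36.0 and h.grad is 12.0; with the default retain_grad=False,
# h.grad would have been set back to None.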
def apply(self, inputs):
    """Computes output variables and grows the computational graph.

    Basic behavior is expressed in the documentation of
    :class:`FunctionNode`.

    .. note::

       If the :data:`~Variable.data` attribute of input variables exists
       on a GPU device, that device is made current before calling
       :meth:`forward`, so implementors do not need to take care of
       device selection in most cases.

    Args:
        inputs: Tuple of input variables. Each element can be either
            :class:`Variable`, :class:`numpy.ndarray`, or
            :class:`cupy.ndarray`. If the element is an ndarray, it is
            automatically wrapped with :class:`Variable`.

    Returns:
        A tuple of output :class:`Variable` objects.

    """
    input_vars = [x if isinstance(x, variable.Variable)
                  else variable.Variable(x, requires_grad=False)
                  for x in inputs]
    in_data = tuple([x.data for x in input_vars])
    requires_grad = any([x.requires_grad for x in input_vars])

    if chainer.is_debug():
        self.stack = traceback.extract_stack()

    if configuration.config.type_check:
        self._check_data_type_forward(in_data)

    hooks = chainer.get_function_hooks()
    if self._n_local_function_hooks > 0:
        hooks = collections.OrderedDict(hooks)
        hooks.update(self.local_function_hooks)
    hooks = hooks.values()  # avoid six for performance

    for hook in hooks:
        hook.forward_preprocess(self, in_data)

    # Forward propagation
    with cuda.get_device_from_array(*in_data):
        self._input_indexes_to_retain = None
        self._output_indexes_to_retain = None
        outputs = self.forward(in_data)
        assert type(outputs) is tuple

    for hook in hooks:
        hook.forward_postprocess(self, in_data)

    # NaN check of output values
    if chainer.is_debug():
        if any(out.dtype.kind == 'f' and
               cuda.get_array_module(out).isnan(out).any()
               for out in outputs):
            msg = 'NaN is detected on forward computation'
            raise RuntimeError(msg)

    ret = tuple([variable.Variable(y, requires_grad=requires_grad)
                 for y in outputs])

    if configuration.config.enable_backprop:
        # Topological ordering
        self.rank = max([x.rank for x in input_vars]) if input_vars else 0
        # Add backward edges
        for i, y in enumerate(ret):
            y.creator_node = self
        self.inputs = tuple([x.node for x in input_vars])
        # Add forward edges (must be weak references)
        self.outputs = tuple([weakref.ref(y.node) for y in ret])

        if self._input_indexes_to_retain is not None:
            for index in self._input_indexes_to_retain:
                input_vars[index].retain_data()

        if self._output_indexes_to_retain is not None:
            retained_data = []
            for index in self._output_indexes_to_retain:
                ret[index].retain_data()
                retained_data.append(outputs[index])
            self._retained_output_data = tuple(retained_data)

    return ret
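# Illustrative sketch (hypothetical example, not Chainer source): the
# retention mechanism handled at the end of ``apply``. Indexes recorded by
# ``retain_inputs``/``retain_outputs`` inside ``forward`` are the ones that
# ``apply`` later materializes via ``retain_data``. ``ExpExample`` is an
# invented class name.
import numpy as np
import chainer
from chainer import FunctionNode

class ExpExample(FunctionNode):
    def forward(self, inputs):
        x, = inputs
        xp = chainer.backend.get_array_module(x)
        y = xp.exp(x)
        self.retain_outputs((0,))          # keep y for the backward pass
        return y,

    def backward(self, target_input_indexes, grad_outputs):
        gy, = grad_outputs
        y, = self.get_retained_outputs()   # the retained output variable
        return y * gy,

x = chainer.Variable(np.array([0.0, 1.0], dtype=np.float32))
y, = ExpExample().apply((x,))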