def check_backprop_step(self, gxs):
    flag_none = gxs[0] is None

    x1 = chainer.Variable(self.x1)
    x2 = chainer.Variable(self.x2)
    self.f.inputs = (x1.node, x2.node)
    gxrefs = [[gx] if gx is not None else [] for gx in gxs]
    grad_outputs = (self.gy1, self.gy2)
    grad_inputs = dict(zip(self.f.inputs, gxrefs))
    _backprop_utils.backprop_step(
        self.f, (0, 1), grad_outputs, grad_inputs, True)
    if not chainer.configuration.config.lazy_grad_sum:
        # assert eager grad sum
        for gxref in gxrefs:
            self.assertLessEqual(len(gxref), 1)
    gx1 = _backprop_utils._reduce(gxrefs[0])
    gx2 = _backprop_utils._reduce(gxrefs[1])
    if flag_none:
        numpy.testing.assert_array_equal(cuda.to_cpu(gx1.data),
                                         cuda.to_cpu(self.gx1.data))
        self.assertIsNone(gx2)
    else:
        numpy.testing.assert_array_equal(cuda.to_cpu(gx1.data),
                                         cuda.to_cpu(self.gx1_accum.data))
        numpy.testing.assert_array_equal(cuda.to_cpu(gx2.data),
                                         cuda.to_cpu(self.gx2_orig.data))
def _backward_main(self, retain_grad, loss_scale):
    self._node._check_old_style_gradient()
    if self.creator_node is None:
        return
    initial_device = None
    if cuda.available and isinstance(self.data, cuda.ndarray):
        try:
            initial_device = cuda.Device()
        except cuda.cupy.cuda.runtime.CUDARuntimeError as e:
            if e.status != 38:  # cudaErrorNoDevice
                raise

    is_debug = chainer.is_debug()

    cand_funcs = []
    seen_set = set()
    grads = _backprop_utils.GradTable(load_if_new=True)

    # Initialize error by 1, if this is a loss variable
    if self.data.size == 1 and self._grad_var is None:
        if self.data.ndim != 0:
            warnings.warn(
                'Treating a scalar as a variable with only one element'
                ' in Variable.backward is deprecated. A scalar variable'
                ' must be a 0-dimensional array. Apply'
                ' chainer.functions.squeeze to obtain a scalar variable.'
                ' If the size of this variable accidentally becomes one,'
                ' set zero to grad.',
                DeprecationWarning)
        with cuda.get_device_from_array(self.data) as device:
            if device is cuda.DummyDevice:
                self.grad = numpy.ones_like(self.data)
            else:
                self.grad = cuda.cupy.ones_like(self.data)
        if loss_scale is not None:
            self.grad *= loss_scale
    grads[self._node] = self._grad_var

    def add_cand(cand):
        if cand not in seen_set:
            # Negate since heapq is min-heap
            heapq.heappush(cand_funcs, (-cand.rank, len(seen_set), cand))
            seen_set.add(cand)

    add_cand(self.creator_node)
    leaf_nodes = set()

    while cand_funcs:
        _, _, func = heapq.heappop(cand_funcs)
        inputs = func.inputs
        target_input_indexes = tuple([
            i for i, x in enumerate(inputs) if x.requires_grad
        ])
        outputs = [y() for y in func.outputs]  # access via weak ref
        out_grad = tuple([grads.pop(y) for y in outputs])
        if not target_input_indexes:
            continue

        in_data = tuple([x.data for x in inputs])
        out_grad_data = tuple(
            [None if g is None else g.data for g in out_grad])
        hooks = chainer.get_function_hooks()
        if func._n_local_function_hooks != 0:
            hooks = collections.OrderedDict(hooks)
            hooks.update(func.local_function_hooks)
        hooks = hooks.values()  # avoid six for performance

        cuda.get_device_from_array(*(in_data + out_grad_data)).use()
        for hook in hooks:
            hook.backward_preprocess(func, in_data, out_grad_data)

        # Collect the current input gradients.
        target_inputs = [inputs[i] for i in target_input_indexes]
        # Keep the order for the portability, rather than
        # in_grad = {x: grads.get_as_list(x) for x in set(target_inputs)}
        in_grad = collections.OrderedDict()
        for x in target_inputs:
            if x not in in_grad:
                in_grad[x] = grads.get_as_list(x)

        _backprop_utils.backprop_step(
            func, target_input_indexes, out_grad, in_grad)

        for hook in hooks:
            hook.backward_postprocess(func, in_data, out_grad_data)

        if is_debug:
            # each grad is a list of variables
            # iter_gxs expands it as a sequence of variables.
            def iter_gxs(gxs):
                for gx in gxs:
                    for gx_elem in gx:
                        yield gx_elem

            for gx in iter_gxs(in_grad.values()):
                gx_data = gx.data
                if gx_data.dtype.kind == 'f':
                    cuda.get_device_from_array(gx_data).use()
                    if cuda.get_array_module(gx_data).isnan(gx_data).any():
                        raise RuntimeError(
                            'NaN is detected on backward computation of '
                            '{}'.format(func.label))

        for y, gy in six.moves.zip(outputs, out_grad):
            if y is not None and y is not self.node:
                y_var = y.get_variable_or_none()
                if y_var is not None:
                    y_var._grad_var = gy if retain_grad else None

        for x, gx in in_grad.items():
            if not gx:  # gradient == None
                continue
            for gx_elem in gx:
                _check_grad_type(func, x, gx_elem.data)
            if x.creator_node is None:  # leaf
                leaf_nodes.add(x)
            else:
                add_cand(x.creator_node)

        del in_grad  # to reduce memory usage

    if initial_device is not None:
        initial_device.use()

    for x in leaf_nodes:
        x_var = x.get_variable_or_none()
        gx = grads.pop(x)
        if x_var is not None:
            x_var._grad_var = gx
            x_var._loss_scale = loss_scale
    grads.assert_no_grads()
def _backward_main(self, retain_grad, loss_scale):
    self._node._check_old_style_gradient()
    if self.creator_node is None:
        return

    cand_funcs = []
    seen_set = set()
    grads = _backprop_utils.GradTable(load_if_new=True)

    # Initialize error by 1, if this is a loss variable
    if self.array.size == 1 and self._grad_var is None:
        if self.array.ndim != 0:
            warnings.warn(
                'Treating a scalar as a variable with only one element'
                ' in Variable.backward is deprecated. A scalar variable'
                ' must be a 0-dimensional array. Apply'
                ' chainer.functions.squeeze to obtain a scalar variable.'
                ' If the size of this variable accidentally becomes one,'
                ' set zero to grad.',
                DeprecationWarning)
        with cuda.get_device_from_array(self.array) as device:
            if device is cuda.DummyDevice:
                self.grad = numpy.ones_like(self.array)
            else:
                self.grad = cuda.cupy.ones_like(self.array)
        if loss_scale is not None:
            self.grad *= loss_scale
    grads[self._node] = self._grad_var

    def add_cand(cand):
        if cand not in seen_set:
            # Negate since heapq is min-heap
            heapq.heappush(cand_funcs, (-cand.rank, len(seen_set), cand))
            seen_set.add(cand)

    add_cand(self.creator_node)
    leaf_nodes = set()

    while cand_funcs:
        _, _, func = heapq.heappop(cand_funcs)
        inputs = func.inputs
        target_input_indexes = tuple([
            i for i, x in enumerate(inputs) if x.requires_grad
        ])
        outputs = [y() for y in func.outputs]  # access via weak ref
        out_grad = tuple([grads.pop(y) for y in outputs])
        if not target_input_indexes:
            continue

        in_data = tuple([x.data for x in inputs])
        out_grad_array = tuple(
            [None if g is None else g.array for g in out_grad])
        hooks = chainer.get_function_hooks()
        if func._n_local_function_hooks != 0:
            hooks = collections.OrderedDict(hooks)
            hooks.update(func.local_function_hooks)
        hooks = hooks.values()  # avoid six for performance

        with cuda.get_device_from_array(*(in_data + out_grad_array)):
            for hook in hooks:
                hook.backward_preprocess(func, in_data, out_grad_array)

            # Collect the current input gradients.
            target_inputs = [inputs[i] for i in target_input_indexes]
            # Keep the order for the portability, rather than
            # in_grad = {x: grads.get_as_list(x)
            #            for x in set(target_inputs)}
            in_grad = collections.OrderedDict()
            for x in target_inputs:
                if x not in in_grad:
                    in_grad[x] = grads.get_as_list(x)
                    # to reduce memory usage
                    x._set_grad_var_if_available(None)

            _backprop_utils.backprop_step(
                func, target_input_indexes, out_grad, in_grad)

            for hook in hooks:
                hook.backward_postprocess(func, in_data, out_grad_array)

        for y, gy in six.moves.zip(outputs, out_grad):
            if y is not None and y is not self.node:
                y._set_grad_var_if_available(
                    gy if retain_grad else None)
        del gy, out_grad  # to reduce memory usage

        for x, gx in in_grad.items():
            if not gx:  # gradient == None
                continue

            for gx_elem in gx:
                _check_grad_type(func, x, gx_elem.array)
            del gx_elem  # to reduce memory usage

            if x.creator_node is None:  # leaf
                leaf_nodes.add(x)
            else:
                add_cand(x.creator_node)
        del gx, in_grad  # to reduce memory usage

    for x in leaf_nodes:
        x_var = x.get_variable_or_none()
        gx = grads.pop(x)
        if x_var is not None:
            x_var._grad_var = gx
            x_var._loss_scale = loss_scale
    grads.assert_no_grads()
def _backprop(outputs, inputs, grad_required, retain_grad, grads, loss_scale):
    candidate_funcs, push_candidate, pop_candidate = _get_ordered_func_heap()

    for y in outputs:
        creator = y.creator_node
        if creator is not None:
            push_candidate(creator)

    input_nodes = set(x.node for x in inputs)
    ret_dict = {}

    while candidate_funcs:
        func = pop_candidate()

        # Collect the gradients w.r.t. the outputs
        ys = [y() for y in func.outputs]  # access via weak ref
        gys = tuple([grads.pop(y) for y in ys])

        for node, gy in six.moves.zip(ys, gys):
            if node is not None:
                if node in input_nodes:
                    ret_dict[node] = gy

                if retain_grad:
                    y = node.get_variable_or_none()
                    if y is not None:
                        y.grad_var = gy
                        y._loss_scale = loss_scale

        # Collect the gradients w.r.t. the inputs
        input_indexes = []
        x_grads = collections.OrderedDict()
        for i, x in enumerate(func.inputs):
            if x not in grad_required:
                continue
            input_indexes.append(i)
            if x not in x_grads:
                x_grads[x] = grads.get_as_list(x)
        if not input_indexes:
            continue
        input_indexes = tuple(input_indexes)

        # Do backward

        # Call pre-backward hooks
        hooks = chainer.get_function_hooks()
        if func._n_local_function_hooks != 0:
            hooks = collections.OrderedDict(hooks)
            hooks.update(func.local_function_hooks)
        hooks = hooks.values()  # avoid six for performance

        in_data = tuple([x.data for x in func.inputs])
        out_grad_data = tuple(
            [None if g is None else g.data for g in gys])

        with cuda.get_device_from_array(*in_data):
            for hook in hooks:
                hook.backward_preprocess(func, in_data, out_grad_data)

            _backprop_utils.backprop_step(func, input_indexes, gys, x_grads)

            # Call post-backward hooks
            for hook in hooks:
                hook.backward_postprocess(func, in_data, out_grad_data)

        # Update grads
        for node, g in x_grads.items():
            if not g:  # gradient == None
                continue

            creator = node.creator_node
            if creator is not None:
                push_candidate(creator)

    for x in input_nodes:
        if x not in ret_dict:
            ret_dict[x] = grads.pop(x)
    return ret_dict
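# Hedged usage sketch (not part of the original source): the _backprop helper
# above is internal machinery behind the public chainer.grad() API. The example
# below uses only public Chainer/NumPy calls and assumes a standard CPU-only
# Chainer installation; it shows the kind of call that ultimately drives this
# traversal. grad_outputs is passed explicitly to keep the semantics unambiguous.
import numpy as np
import chainer

x = chainer.Variable(np.array([3.0], dtype=np.float32))
y = x * x
gy = chainer.Variable(np.ones_like(y.array))
gx, = chainer.grad([y], [x], grad_outputs=[gy])  # d(x*x)/dx = 2 * x
print(gx.array)  # -> [6.]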
def _backward_main(self, retain_grad, loss_scale):
    self._node._check_old_style_gradient()
    if self.creator_node is None:
        return

    # fix py2 memory leak
    OrderedDict = chainer.utils._collections.OrderedDict

    cand_funcs = []
    seen_set = set()
    grads = _backprop_utils.GradTable(load_if_new=True)

    # Initialize error by 1, if this is a loss variable
    if self.array.size == 1 and self._grad_var is None:
        if self.array.ndim != 0:
            warnings.warn(
                'Treating a variable with only one element as a scalar'
                ' in Variable.backward is deprecated. A scalar variable'
                ' must be a 0-dimensional array. Apply'
                ' chainer.functions.squeeze to obtain a scalar variable.'
                ' If the size of this variable accidentally becomes one,'
                ' set zero to grad.',
                DeprecationWarning)
        with cuda.get_device_from_array(self.array) as device:
            if device is cuda.DummyDevice:
                self.grad = numpy.ones_like(self.array)
            else:
                self.grad = cuda.cupy.ones_like(self.array)
        if loss_scale is not None:
            self.grad *= loss_scale
    grads[self._node] = self._grad_var

    def add_cand(cand):
        if cand not in seen_set:
            # Negate since heapq is min-heap
            heapq.heappush(cand_funcs, (-cand.rank, len(seen_set), cand))
            seen_set.add(cand)

    add_cand(self.creator_node)
    leaf_nodes = set()

    while cand_funcs:
        _, _, func = heapq.heappop(cand_funcs)
        inputs = func.inputs
        target_input_indexes = tuple(
            [i for i, x in enumerate(inputs) if x.requires_grad])
        outputs = [y() for y in func.outputs]  # access via weak ref
        out_grad = tuple([grads.pop(y) for y in outputs])
        if not target_input_indexes:
            continue

        in_data = tuple([x.data for x in inputs])
        out_grad_array = tuple(
            [None if g is None else g.array for g in out_grad])
        hooks = chainer.get_function_hooks()
        if func._n_local_function_hooks != 0:
            hooks = collections.OrderedDict(hooks)
            hooks.update(func.local_function_hooks)
        hooks = hooks.values()  # avoid six for performance

        with cuda.get_device_from_array(*(in_data + out_grad_array)):
            for hook in hooks:
                hook.backward_preprocess(func, in_data, out_grad_array)

            # Collect the current input gradients.
            target_inputs = [inputs[i] for i in target_input_indexes]
            # Keep the order for the portability, rather than
            # in_grad = {x: grads.get_as_list(x)
            #            for x in set(target_inputs)}
            in_grad = OrderedDict()
            for x in target_inputs:
                if x not in in_grad:
                    in_grad[x] = grads.get_as_list(x)
                    # to reduce memory usage
                    x._set_grad_var_if_available(None)

            _backprop_utils.backprop_step(func, target_input_indexes,
                                          out_grad, in_grad)

            for hook in hooks:
                hook.backward_postprocess(func, in_data, out_grad_array)

        for y, gy in six.moves.zip(outputs, out_grad):
            if y is not None and y is not self.node:
                y._set_grad_var_if_available(gy if retain_grad else None)
        del gy, out_grad  # to reduce memory usage

        for x, gx in in_grad.items():
            if not gx:  # gradient == None
                continue

            for gx_elem in gx:
                _check_grad_type(func, x, True, gx_elem, True)
            del gx_elem  # to reduce memory usage

            if x.creator_node is None:  # leaf
                leaf_nodes.add(x)
            else:
                add_cand(x.creator_node)
        del gx, in_grad  # to reduce memory usage

    for x in leaf_nodes:
        x_var = x.get_variable_or_none()
        gx = grads.pop(x)
        if x_var is not None:
            x_var._grad_var = gx
            x_var._loss_scale = loss_scale
    grads.assert_no_grads()
def _backprop_to_all(outputs, retain_grad, loss_scale):
    """Backprop to all input variables

    Args:
        outputs (list of tuple): each tuple is (y_node, y_grad_var).
            y_grad_var should not be None.
        retain_grad (bool): see docstring of Variable.backward
        loss_scale (float): see docstring of Variable.backward

    """
    OrderedDict = chainer.utils._collections.OrderedDict  # fix py2 memory leak

    cand_funcs = []
    seen_set = set()

    def add_cand(cand):
        if cand not in seen_set:
            # Negate since heapq is min-heap
            heapq.heappush(cand_funcs, (-cand.rank, len(seen_set), cand))
            seen_set.add(cand)

    grads = _backprop_utils.GradTable(accumulate_grad_inputs=True)

    leaf_nodes = set()

    for y, gy in outputs:
        grads.accumulate(y, gy)

        func = y.creator_node
        if func is None:  # leaf
            leaf_nodes.add(y)
        else:
            add_cand(func)

    # Fix F812 (Python 2)
    y = None
    del y

    is_debug = chainer.is_debug()
    base_hooks = chainer.get_function_hooks().values()
    while cand_funcs:
        _, _, func = heapq.heappop(cand_funcs)
        inputs = func.inputs
        target_input_indexes = tuple([
            i for i, x in enumerate(inputs) if x.requires_grad
        ])
        outputs = [y() for y in func.outputs]  # access via weak ref
        out_grad = tuple([grads.pop(y)
                          if y is not None and y.creator_node is not None
                          else None
                          for y in outputs])
        if not target_input_indexes:
            continue

        in_data = [x.data for x in inputs]
        out_grad_array = [None if g is None else g.raw_array
                          for g in out_grad]
        if func._n_local_function_hooks != 0:
            local_hooks = collections.OrderedDict(chainer.get_function_hooks())
            local_hooks.update(func.local_function_hooks)
            hooks = local_hooks.values()  # avoid six for performance
        else:
            hooks = base_hooks

        with chainer.using_device(
                backend.get_device_from_array(*(in_data + out_grad_array))):
            for hook in hooks:
                hook.backward_preprocess(
                    func, tuple(in_data), tuple(out_grad_array))

            # Collect the current input gradients.
            target_inputs = [inputs[i] for i in target_input_indexes]
            # Keep the order for the portability, rather than
            # in_grad = {x: grads.get_as_list(x)
            #            for x in set(target_inputs)}
            in_grad = OrderedDict()
            for x in target_inputs:
                if x not in in_grad:
                    in_grad[x] = grads.get_as_list(x)

            _backprop_utils.backprop_step(
                func, target_input_indexes, out_grad, in_grad, is_debug)

            for hook in hooks:
                hook.backward_postprocess(
                    func, tuple(in_data), tuple(out_grad_array))

        if retain_grad:
            # The gradients of the outputs of `func` are final. Store them if
            # retain_grad=True.
            for y, gy in six.moves.zip(outputs, out_grad):
                if y is not None:
                    y._set_grad_var_if_available(gy)
            del gy  # to reduce memory usage
        del out_grad  # to reduce memory usage

        for x, gx in in_grad.items():
            if not gx:  # gradient == None
                continue

            for gx_elem in gx:
                if gx_elem is not None:
                    chainer.variable._check_grad_type(
                        func, x, True, gx_elem.raw_array)
            del gx_elem  # to reduce memory usage

            if x.creator_node is None:  # leaf
                leaf_nodes.add(x)
            else:
                add_cand(x.creator_node)
        del gx, in_grad  # to reduce memory usage

    for x in leaf_nodes:
        x_var = x.get_variable_or_none()
        gx = grads.pop(x)
        if x_var is not None:
            x_var._set_grad_var_without_check(gx)
            x_var._loss_scale = loss_scale
    grads.assert_no_grads()
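# Hedged usage sketch (not part of the original source): the _backward_main and
# _backprop_to_all routines above are the machinery driven by the public
# Variable.backward() call. This example uses only public Chainer/NumPy APIs and
# assumes a standard CPU-only Chainer installation. Because the loss is a
# 0-dimensional variable, its gradient is initialized to ones internally, as in
# the "Initialize error by 1" branch shown above.
import numpy as np
import chainer
import chainer.functions as F

x = chainer.Variable(np.array([1.0, 2.0], dtype=np.float32))
loss = F.sum(x * x)  # scalar (0-dimensional) loss variable
loss.backward()      # runs the backward graph traversal shown above
print(x.grad)        # -> [2. 4.]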