def _check_grad_type(func, x, gx):
    # NOTE: two near-identical copies of this function were listed; the first
    # formatted the grad/data values in the wrong order relative to the
    # 'grad: %s != data: %s' template. They are merged here with the argument
    # order corrected, and the no-op ``.format()`` call on the
    # placeholder-less report string is dropped.
    if x.data is None or gx is None:
        # ``x.data is None`` implies that the data array is not retained
        return
    if not chainer.is_arrays_compatible((gx, x.data)):
        msg = ('Type of data and grad mismatch\ngrad: %s != data: %s' %
               (type(gx), type(x.data)))
        typ = TypeError
    elif gx.dtype != x.data.dtype:
        msg = ('Dtype of data and grad mismatch\ngrad: %s != data: %s' %
               (gx.dtype, x.data.dtype))
        typ = TypeError
    elif gx.shape != x.data.shape:
        msg = ('Shape of data and grad mismatch\ngrad: %s != data: %s' %
               (gx.shape, x.data.shape))
        typ = ValueError
    else:
        return

    detail = ''
    if func:
        detail = 'Function `{0}` ({1}) has a bug.\n'.format(
            type(func)._impl_name, func.label)
        stack = func.stack
        if stack:
            detail += 'Stacktrace of the function is below:\n'
            for line in traceback.format_list(func.stack):
                detail += line
        detail += '''
Please report this error to the issue tracker with the stack trace,
the information of your environment, and your script:
https://github.com/chainer/chainer/issues/new.
'''

    raise typ(detail + msg)
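
# Example (a minimal sketch, assuming the public chainer/numpy APIs; this
# helper itself is internal and runs when a gradient is assigned to a
# Variable): the dtype branch above is what fires when a float64 grad is
# assigned to a float32 Variable.
#
#     import numpy
#     import chainer
#
#     v = chainer.Variable(numpy.zeros(3, dtype=numpy.float32))
#     try:
#         v.grad = numpy.zeros(3, dtype=numpy.float64)
#     except TypeError as e:
#         print(e)  # 'Dtype of data and grad mismatch ...'
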
def _check_arrays_forward_compatible(arrays, label=None):
    if not chainer.is_arrays_compatible(arrays):
        raise TypeError(
            'incompatible array types are mixed in the forward input{}.\n'
            'Actual: {}'.format(
                ' ({})'.format(label) if label is not None else '',
                ', '.join(str(type(a)) for a in arrays)))
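
# Example (a minimal sketch, numpy-only): arrays from the same backend are
# compatible; mixing numpy.ndarray with cupy.ndarray or chainerx.ndarray
# would make is_arrays_compatible return False, and the helper above would
# raise TypeError.
#
#     import numpy
#     import chainer
#
#     assert chainer.is_arrays_compatible((numpy.zeros(2), numpy.ones(2)))
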
def __init__(self, in_size, out_size, pool_size,
             initialW=None, initial_bias=0):
    super(Maxout, self).__init__()

    linear_out_size = out_size * pool_size

    if initialW is None or \
            numpy.isscalar(initialW) or \
            isinstance(initialW, initializer.Initializer):
        pass
    elif chainer.is_arrays_compatible([initialW]):
        if initialW.ndim != 3:
            raise ValueError('initialW.ndim should be 3')
        initialW = initialW.reshape(linear_out_size, in_size)
    elif callable(initialW):
        initialW_orig = initialW

        def initialW(array):
            array.shape = (out_size, pool_size, in_size)
            initialW_orig(array)
            array.shape = (linear_out_size, in_size)

    if initial_bias is None or \
            numpy.isscalar(initial_bias) or \
            isinstance(initial_bias, initializer.Initializer):
        pass
    elif chainer.is_arrays_compatible([initial_bias]):
        if initial_bias.ndim != 2:
            raise ValueError('initial_bias.ndim should be 2')
        initial_bias = initial_bias.reshape(linear_out_size)
    elif callable(initial_bias):
        initial_bias_orig = initial_bias

        def initial_bias(array):
            array.shape = (out_size, pool_size)
            initial_bias_orig(array)
            array.shape = linear_out_size,

    with self.init_scope():
        self.linear = linear.Linear(
            in_size, linear_out_size,
            nobias=initial_bias is None, initialW=initialW,
            initial_bias=initial_bias)

    self.out_size = out_size
    self.pool_size = pool_size
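
# Usage example (a minimal sketch, assuming chainer.links.Maxout, whose
# constructor this is): each of the `out_size` units takes the maximum over
# `pool_size` linear pre-activations, which is why the underlying Linear
# layer has out_size * pool_size outputs.
#
#     import numpy
#     from chainer import links as L
#
#     link = L.Maxout(in_size=4, out_size=3, pool_size=2)
#     x = numpy.random.rand(5, 4).astype(numpy.float32)
#     y = link(x)
#     assert y.shape == (5, 3)
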
def _check_grad_type(func, x, is_node_x, gx, is_var_gx):
    if gx is None:
        return
    x_grad = gx.array if is_var_gx else gx
    x_data = x.data

    # TODO(kataoka): Make _update_data_info store the array module.
    # ``is_node_x and x_data is None`` implies that the data array is not
    # retained.
    # ``not is_node_x and x_data is None`` implies that grad of uninitialized
    # variable is checked here.

    if x_grad is None:
        # TODO(kataoka): This should be an error.
        return
    elif x_data is None and not is_node_x:
        # TODO(kataoka): This should be an error.
        return
    elif not chainer.is_arrays_compatible((x_grad, x_data)):
        msg = ('Type of data and grad mismatch\ngrad: %s != data: %s' %
               (type(x_grad), type(x_data)))
        typ = TypeError
    elif x.dtype is None or x.shape is None:
        # unretained Variable(None)
        # TODO(kataoka): This should be an error.
        return
    elif gx.dtype != x.dtype:
        msg = ('Dtype of data and grad mismatch\ngrad: %s != data: %s' %
               (gx.dtype, x.dtype))
        typ = TypeError
    elif gx.shape != x.shape:
        msg = ('Shape of data and grad mismatch\ngrad: %s != data: %s' %
               (gx.shape, x.shape))
        typ = ValueError
    else:
        return

    detail = ''
    if func:
        detail = 'Function `{0}` ({1}) has a bug.\n'.format(
            type(func)._impl_name, func.label)
        stack = func.stack
        if stack:
            detail += 'Stacktrace of the function is below:\n'
            for line in traceback.format_list(func.stack):
                detail += line
        detail += '''
Please report this error to the issue tracker with the stack trace,
the information of your environment, and your script:
https://github.com/chainer/chainer/issues/new.
'''

    raise typ(detail + msg)
def forward(self, xs):
    self.xs = xs
    results = self.func(*self.args, **self.kwargs)
    if isinstance(results, (tuple, list)):
        dummy_results = tuple(_unwrap_var(ret) for ret in results)
    elif isinstance(results, dict):
        dummy_results = tuple(_unwrap_var(ret) for ret in results.values())
    else:
        dummy_results = _unwrap_var(results)
        dummy_results = dummy_results,
    if not chainer.is_arrays_compatible(dummy_results):
        raise ValueError(
            'returned values from the function wrapped by \'as_funcnode\' '
            'must consist only of arrays, function name: {}'.format(
                self.name))
    return dummy_results
def forward(self, xs):
    assert len(xs) == len(self.arg_vars)
    self.xs = xs
    results = self.func(*self.args, **self.kwargs)
    self.skeleton, flattened_results = self._flatten_return_value(results)
    dummy_results = tuple(_unwrap_var(ret) for ret in flattened_results)
    if all([_is_var(ret) for ret in flattened_results]):
        self.internal_results = flattened_results
    if not chainer.is_arrays_compatible(dummy_results):
        raise ValueError(
            'returned values from the function wrapped by \'as_funcnode\' '
            'must consist only of arrays, function name: {}'.format(
                self.custom_function_node_name))
    return dummy_results
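
# Hedged sketch of the constraint enforced above (assumes onnx-chainer's
# `as_funcnode` decorator; the exact import path may differ by version, and
# `add_one`/'AddConstant' are hypothetical names): the wrapped function must
# return arrays or Variables only, since its outputs become FunctionNode
# outputs.
#
#     from onnx_chainer.replace_func import as_funcnode
#
#     @as_funcnode('AddConstant')
#     def add_one(x):
#         return x + 1  # an ndarray: passes is_arrays_compatible
#
# Returning a Python scalar or a list of scalars instead would trigger the
# ValueError raised in forward() above.
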
def numerical_grad(f, inputs, grad_outputs, eps=1e-3,
                   detect_nondifferentiable=False, diff_atol=0,
                   diff_rtol=1e-2, center_outputs=None):
    """Computes numerical gradient by finite differences.

    This function is used to implement gradient check. For usage example,
    see unit tests of :mod:`chainer.functions`.

    By default, ``numerical_grad`` computes the gradient to the first order
    of ``eps``.

    Args:
        f (callable): Python function with no arguments that runs forward
            computation and returns the result.
        inputs (tuple of arrays): Tuple of arrays that should be treated as
            inputs. Each element of them is slightly modified to realize
            numerical gradient by finite differences.
        grad_outputs (tuple of arrays or scalars): Tuple of arrays or scalars
            that are treated as output gradients.
        eps (float): Epsilon value of finite differences.
        detect_nondifferentiable (bool):
            ``False`` by default.
            If ``True``, ``numerical_grad`` checks whether ``f`` is
            differentiable at ``inputs``.
            It requires evaluation of ``f`` at 5 points instead of 2.
            As a side effect, the accuracy of numerical gradient will be
            increased to the third order of ``eps``.
            If it turns out that ``f`` is non-differentiable at ``inputs``,
            ``numerical_grad`` raises
            :class:`~chainer.gradient_check.NondifferentiableError`.
        diff_atol (float): Absolute tolerance of fitting error of
            non-differentiable point detection.
        diff_rtol (float): Tolerance of fitting error of non-differentiable
            point detection relative to the output values of ``f``.
        center_outputs (tuple of arrays or None): Only used if
            ``detect_nondifferentiable`` is ``True``. If specified, these
            arrays are used as the outputs of ``f`` at ``inputs``. Otherwise,
            it is calculated. It can be used to reduce the computation if
            these arrays are already calculated before calling
            ``numerical_grad``.

    Returns:
        tuple: Numerical gradient arrays corresponding to ``inputs``.

    """
    # TODO(niboshi): Deprecate `center_outputs` argument.
    # If dtype of this argument is not float64, often the resolution is
    # insufficient for numerical gradient calculation. We might use it only
    # when its dtype is float64, but it would be better to simply remove it.
    center_outputs = None

    assert eps > 0
    assert isinstance(inputs, (tuple, list))
    for x in inputs:
        if x.dtype.kind != 'f':
            raise RuntimeError(
                'The dtype of input arrays must be kind of float')

    inputs = tuple(inputs)
    # Cast grad_outputs to float64
    grad_outputs = tuple([
        None if g is None
        else numpy.float64(g) if numpy.isscalar(g)
        else g.astype(numpy.float64)
        for g in grad_outputs])

    if not chainer.is_arrays_compatible(
            [a for a in inputs + grad_outputs if not numpy.isscalar(a)]):
        raise RuntimeError('Do not mix GPU and CPU arrays in '
                           '`numerical_grad`')

    device = backend.get_device_from_array(*(inputs + grad_outputs))
    xp = device.xp

    if xp is cuda.cupy:
        numerical_grad_kernel_1 = cuda.reduce(
            'T y1, T y2, U gy, T eps', 'V gxi',
            '(y1 - y2) * gy', 'a + b', 'gxi += a / (eps * 2)', '0',
            'numerical_grad_kernel_1')
        numerical_grad_kernel_3 = cuda.reduce(
            'T y1, T y2, T y3, T y4, U gy, T eps', 'V gxi',
            '(-y1 + 8 * y2 - 8 * y3 + y4) * gy', 'a + b',
            'gxi += a / (eps * 6)', '0',
            'numerical_grad_kernel_3')

    if xp is chainerx:
        grads = [
            xp.zeros(x.shape, numpy.float64, device=x.device)
            for x in inputs]
    else:
        grads = [xp.zeros(x.shape, numpy.float64) for x in inputs]

    if detect_nondifferentiable:
        if center_outputs is None:
            ys0 = _copy_arrays(f())
        else:
            ys0 = center_outputs
        nout = len(ys0)
        shapes = [y0.shape for y0 in ys0]
        sizes = numpy.array([y0.size for y0 in ys0])
        cumsizes = numpy.cumsum(sizes)

    # Evaluate func at a single input
    def eval_func(x, x_ind, delta, orig):
        x[x_ind] = orig + delta
        ys = _copy_arrays(f())
        assert len(ys) == len(grad_outputs)
        assert all(
            [gy is None for y, gy in zip(ys, grad_outputs) if y is None])
        assert all([
            gy is None or numpy.isscalar(gy) or y.shape == gy.shape
            for y, gy in zip(ys, grad_outputs)])
        x[x_ind] = orig
        return ys

    # An iteration on a single input displacement
    def iterate_single_input(i_in, x, orig_x, x_ind):
        orig = orig_x[x_ind]
        # `yss` holds a list of output arrays for each of 2 or 5 sampling
        # points.
        if detect_nondifferentiable:
            yss = [
                eval_func(x, x_ind, -eps * 1., orig),
                eval_func(x, x_ind, -eps * .5, orig),
                ys0,
                eval_func(x, x_ind, +eps * .5, orig),
                eval_func(x, x_ind, +eps * 1., orig),
            ]
        else:
            yss = [
                eval_func(x, x_ind, -eps * 1, orig),
                eval_func(x, x_ind, +eps * 1, orig),
            ]

        assert all([
            y is None
            or (y.shape == yss[0][i].shape and y.dtype == yss[0][i].dtype)
            for ys in yss
            for i, y in enumerate(ys)])

        # If all the outputs are 0-size, skip non-differentiable check.
        if all([y is None or y.size == 0 for y in yss[0]]):
            detect_nondifferentiable_ = False
        else:
            detect_nondifferentiable_ = detect_nondifferentiable

        if detect_nondifferentiable_:
            # Detect non-differentiable point by quadratic fitting

            # Check for non-finite output.
            # If any single element in the output arrays has different
            # finiteness among sampled points, that means this is a
            # non-differentiable point.
            # If the function consistently generates non-finite values
            # around the point, we do not treat the point as
            # non-differentiable.
            # (Example: x<0 region for the logarithm function)
            any_nonfinite = False
            for i_out in range(nout):
                isfinites = [xp.isfinite(ys[i_out]) for ys in yss]
                if any((isfinites[0] != isfinites[i]).any()
                       for i in range(1, len(yss))):
                    s = six.StringIO()
                    s.write(
                        'Tried to compute the numeric gradient on a '
                        'non-differentiable point.\n\n')
                    s.write('i_in: {}\n'.format(i_in))
                    s.write('i_out: {}\n'.format(i_out))
                    s.write('x: {}\n'.format(inputs[i_in]))
                    s.write('index on x: {}\n'.format(x_ind))
                    s.write('eps: {}\n'.format(eps))
                    s.write('y[x-eps  ]: {}\n'.format(yss[0][i_out]))
                    s.write('y[x-eps/2]: {}\n'.format(yss[1][i_out]))
                    s.write('y[x      ]: {}\n'.format(yss[2][i_out]))
                    s.write('y[x+eps/2]: {}\n'.format(yss[3][i_out]))
                    s.write('y[x+eps  ]: {}\n'.format(yss[4][i_out]))
                    raise NondifferentiableError(s.getvalue())

                any_nonfinite |= not all(
                    isfinite.all() for isfinite in isfinites)

            if not any_nonfinite:
                # Stack flattened outputs to make (5, *)-shaped 2D array
                ystack = xp.vstack(
                    [xp.hstack([y.ravel() for y in ys]) for ys in yss])
                assert ystack.ndim == 2 and ystack.shape[0] == len(yss)
                # Fit to quadratic
                if xp is not numpy:
                    ystack = _cpu._to_cpu(ystack)
                polyfit = numpy.polynomial.polynomial.polyfit
                _, (residuals, _, _, _) = polyfit(
                    range(len(yss)), ystack, deg=2, full=True)
                if xp is not numpy:
                    residuals = device.send(residuals)
                residuals = xp.sqrt(residuals / len(yss))

                # Check for error for each output array
                for i_out in range(nout):
                    size = sizes[i_out]
                    cumsize = cumsizes[i_out]
                    shape = shapes[i_out]
                    # TODO(niboshi): The following two lines could be
                    # rewritten using xp.stack, which is supported in
                    # NumPy>=1.10
                    ymax = xp.concatenate(
                        [ys[i_out][None] for ys in yss]).max(axis=0)
                    ymin = xp.concatenate(
                        [ys[i_out][None] for ys in yss]).min(axis=0)
                    # Restore the shape of flattened residual
                    res = residuals[cumsize - size:cumsize]
                    res = res.reshape(shape)
                    det = utils.force_array(
                        diff_atol + diff_rtol * (ymax - ymin) < res)
                    # Constant output = not nondifferentiable
                    det[ymax == ymin] = False
                    if det.any():
                        s = six.StringIO()
                        s.write(
                            'Tried to compute the numeric gradient on a '
                            'non-differentiable point.\n\n')
                        s.write('i_in: {}\n'.format(i_in))
                        s.write('i_out: {}\n'.format(i_out))
                        s.write('x: {}\n'.format(inputs[i_in]))
                        s.write('index on x: {}\n'.format(x_ind))
                        s.write('eps: {}\n'.format(eps))
                        s.write('diff_rtol: {}\n'.format(diff_rtol))
                        s.write('diff_atol: {}\n'.format(diff_atol))
                        s.write('ymax: {}\n'.format(ymax))
                        s.write('ymin: {}\n'.format(ymin))
                        s.write(
                            'diff_atol + diff_rtol * (ymax-ymin): {}\n'
                            .format(diff_atol + diff_rtol * (ymax - ymin)))
                        s.write('fitting errors: {}\n'.format(res))
                        s.write('y[x-eps  ]: {}\n'.format(yss[0][i_out]))
                        s.write('y[x-eps/2]: {}\n'.format(yss[1][i_out]))
                        s.write('y[x      ]: {}\n'.format(yss[2][i_out]))
                        s.write('y[x+eps/2]: {}\n'.format(yss[3][i_out]))
                        s.write('y[x+eps  ]: {}\n'.format(yss[4][i_out]))
                        raise NondifferentiableError(s.getvalue())

        # Calculate numerical gradient
        for i_out, gy in enumerate(grad_outputs):
            if gy is None:
                continue
            if not numpy.isscalar(gy):
                gy = gy.astype(numpy.float64, copy=False)
            gpu_ = (xp is cuda.cupy and
                    all(isinstance(ys[i_out], cuda.ndarray) for ys in yss))
            # If any output sample is None, all others must be.
            assert all([
                (yss[0][i_out] is None) == (yss[j][i_out] is None)
                for j in range(len(yss))])
            # If output samples are None, the part of numeric gradient for
            # this output is considered as zero: skip the accumulation.
            if yss[0][i_out] is None:
                continue

            if len(yss) == 2:  # 1st order
                y0 = yss[0][i_out]
                y1 = yss[1][i_out]
                if gpu_:
                    numerical_grad_kernel_1(
                        y1, y0, xp.asarray(gy), eps, gx[x_ind])
                else:
                    dot = ((y1 - y0) * gy).sum()
                    gx[x_ind] = gx[x_ind] + dot / (2 * eps)
            elif len(yss) == 5:  # 3rd order
                y0 = yss[0][i_out]
                y1 = yss[1][i_out]
                y2 = yss[3][i_out]
                y3 = yss[4][i_out]
                if gpu_:
                    numerical_grad_kernel_3(
                        y3, y2, y1, y0, gy, eps, gx[x_ind])
                else:
                    num = -y3 + 8 * y2 - 8 * y1 + y0
                    dot = (num * gy).sum()
                    gx[x_ind] = gx[x_ind] + dot / (6 * eps)
            else:
                assert False

    # Calculate numeric gradient
    with configuration.using_config('type_check', False):
        for i_in, (x, gx) in enumerate(six.moves.zip(inputs, grads)):
            orig_x = x.copy()  # hold original value
            for x_ind in numpy.ndindex(x.shape):
                iterate_single_input(i_in, x, orig_x, x_ind)

    return [
        g.astype(x.dtype, copy=False)
        for g, x in six.moves.zip(grads, inputs)]
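
# Usage example (a minimal sketch): check an analytic gradient against the
# central difference computed by numerical_grad. `f` takes no arguments and
# reads the input array in place, which numerical_grad perturbs element by
# element as shown above.
#
#     import numpy
#     from chainer import gradient_check
#
#     x = numpy.random.randn(3).astype(numpy.float64)
#     gy = numpy.ones_like(x)
#
#     def f():
#         return x ** 2,
#
#     gx, = gradient_check.numerical_grad(f, (x,), (gy,))
#     assert numpy.allclose(gx, 2 * x, atol=1e-3)  # d(x^2)/dx == 2x
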
def apply(self, inputs):
    """Computes output variables and grows the computational graph.

    Basic behavior is expressed in the documentation of
    :class:`FunctionNode`.

    .. note::

       If the :data:`~Variable.data` attributes of the input variables
       exist on a GPU device, that device is made current before calling
       :meth:`forward`, so implementors do not need to take care of device
       selection in most cases.

    Args:
        inputs: Tuple of input variables. Each element can be either
            :class:`~chainer.Variable`, :class:`numpy.ndarray`, or
            :class:`cupy.ndarray`. If the element is an ndarray, it is
            automatically wrapped with :class:`~chainer.Variable`.

    Returns:
        A tuple of output :class:`~chainer.Variable` objects.

    """
    input_vars = [chainer.as_variable(x) for x in inputs]
    in_data = tuple([x.data for x in input_vars])
    requires_grad = any([x.requires_grad for x in input_vars])

    # Check for input array types
    if not chainer.is_arrays_compatible(in_data):
        raise TypeError(
            'incompatible array types are mixed in the forward input '
            '({}).\n'
            'Actual: {}'.format(
                self.label,
                ', '.join(str(type(x)) for x in in_data)))

    is_debug = chainer.is_debug()
    if is_debug:
        # Keep stack trace for debug
        self.stack = traceback.extract_stack()

    if configuration.config.type_check:
        self._check_data_type_forward(in_data)

    hooks = chainer.get_function_hooks()
    if self._n_local_function_hooks > 0:
        hooks = collections.OrderedDict(hooks)
        hooks.update(self.local_function_hooks)
    hooks = hooks.values()  # avoid six for performance

    for hook in hooks:
        hook.forward_preprocess(self, in_data)

    # Forward propagation
    with cuda.get_device_from_array(*in_data):
        self._input_indexes_to_retain = None
        self._output_indexes_to_retain = None
        outputs = self.forward(in_data)

    # Check for output array types
    if not isinstance(outputs, tuple):
        raise TypeError(
            'forward output must be a tuple ({})\n'
            'Actual: {}'.format(self.label, type(outputs)))

    if not chainer.is_arrays_compatible(outputs):
        raise TypeError(
            'incompatible array types are mixed in the forward output '
            '({}).\n'
            'Actual: {}'.format(
                self.label,
                ', '.join(str(type(x)) for x in outputs)))

    for hook in hooks:
        hook.forward_postprocess(self, in_data)

    # NaN check of output values
    if is_debug:
        if any(chainer.backend._contains_nan(out) for out in outputs):
            msg = ('NaN is detected on forward computation of '
                   '{}'.format(self.label))
            raise RuntimeError(msg)

    ret = tuple([variable.Variable(y, requires_grad=requires_grad)
                 for y in outputs])

    if configuration.config.enable_backprop:
        # Topological ordering
        self.rank = max([x.rank for x in input_vars]) if input_vars else 0
        # Add backward edges
        for y in ret:
            y.creator_node = self
        self.inputs = tuple([x.node for x in input_vars])
        # Add forward edges (must be weak references)
        self.outputs = tuple([weakref.ref(y.node) for y in ret])

        if self._input_indexes_to_retain is not None:
            for index in self._input_indexes_to_retain:
                input_vars[index].retain_data()

        if self._output_indexes_to_retain is not None:
            retained_data = []
            for index in self._output_indexes_to_retain:
                ret[index].retain_data()
                retained_data.append(outputs[index])
            self._retained_output_data = tuple(retained_data)

        self.lazy_grad_sum = configuration.config.lazy_grad_sum

    return ret
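
# Usage example (a minimal sketch, assuming the public FunctionNode API;
# `Square` is a hypothetical name): apply() wraps raw arrays into Variables,
# runs forward(), and wires the backward graph when enable_backprop is on.
#
#     import numpy
#     import chainer
#
#     class Square(chainer.FunctionNode):
#
#         def forward(self, inputs):
#             x, = inputs
#             self.retain_inputs((0,))
#             return x * x,
#
#         def backward(self, indexes, grad_outputs):
#             x, = self.get_retained_inputs()
#             gy, = grad_outputs
#             return 2.0 * x * gy,
#
#     x = chainer.Variable(numpy.array([1.0, 2.0], dtype=numpy.float32))
#     y, = Square().apply((x,))
#     y.grad = numpy.ones(2, dtype=numpy.float32)
#     y.backward()
#     assert numpy.allclose(x.grad, 2.0 * x.array)
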
def apply(self, inputs):
    """Computes output variables and grows the computational graph.

    Basic behavior is expressed in the documentation of
    :class:`FunctionNode`.

    .. note::

       If the :data:`~Variable.data` attributes of the input variables
       exist on a GPU device, that device is made current before calling
       :meth:`forward`, so implementors do not need to take care of device
       selection in most cases.

    Args:
        inputs: Tuple of input variables. Each element can be either
            :class:`~chainer.Variable` or :ref:`ndarray`. If the element
            is an ndarray, it is automatically wrapped with
            :class:`~chainer.Variable`.

    Returns:
        A tuple of output :class:`~chainer.Variable` objects.

    """
    chainerx_in_data = None
    chainerx_device = None
    is_chainerx, in_data = _extract_apply_in_data(inputs)

    if is_chainerx:
        # Try ChainerX C++ implementation.
        # If it's supported, the output arrays are wrapped with Variables
        # and returned.
        # If not supported, FunctionNode.forward_chainerx should return
        # Fallback.
        # In that case the input arrays are converted to numpy.ndarray
        # or cupy.ndarray (depending on the ChainerX backend) and
        # forward computation falls back to the conventional
        # FunctionNode.forward() implementation.
        outputs = self.forward_chainerx(in_data)

        if outputs is not chainer.Fallback:
            # Supported. Wrap with variables and return
            assert isinstance(outputs, tuple)
            return tuple([
                variable.Variable._init_unchecked(
                    y, requires_grad=y.is_backprop_required(),
                    is_chainerx_array=True)
                for y in outputs])

        # Fall back to FunctionNode.forward()
        chainerx_in_data, in_data, chainerx_device = (
            self._chainerx_apply_fallback_preprocess(in_data, inputs))
        self._is_chainerx_fallback_mode = True
        self.chainerx_device = chainerx_device

    utils._check_arrays_forward_compatible(in_data, self.label)

    is_debug = chainer.is_debug()
    if is_debug:
        # Keep stack trace for debug
        self.stack = traceback.extract_stack()

    if configuration.config.type_check:
        self._check_data_type_forward(in_data)

    hooks = chainer.get_function_hooks()
    if self._n_local_function_hooks > 0:
        hooks = collections.OrderedDict(hooks)
        hooks.update(self.local_function_hooks)
    hooks = hooks.values()  # avoid six for performance

    for hook in hooks:
        hook.forward_preprocess(self, in_data)

    # Forward propagation
    with cuda.get_device_from_array(*in_data):
        self._input_indexes_to_retain = None
        self._output_indexes_to_retain = None
        if chainer.config.schedule_func is not None:
            outputs = static_forward_optimizations(self, in_data)
        elif self._is_chainerx_fallback_mode:
            # In ChainerX fallback, __class__ is temporarily replaced with
            # the fabricated one with automatic attribute fallback.
            with _chainerx_attribute_fallback(self, chainerx_device):
                outputs = self.forward(in_data)
        else:
            # In normal case, simply run the forward method.
            outputs = self.forward(in_data)

    # Check for output array types
    if not isinstance(outputs, tuple):
        raise TypeError(
            'forward output must be a tuple ({})\n'
            'Actual: {}'.format(self.label, type(outputs)))

    if not chainer.is_arrays_compatible(outputs):
        raise TypeError(
            'incompatible array types are mixed in the forward output '
            '({}).\n'
            'Actual: {}'.format(
                self.label,
                ', '.join(str(type(x)) for x in outputs)))

    for hook in hooks:
        hook.forward_postprocess(self, in_data)

    # NaN check of output values
    if is_debug:
        if any(chainer.backend._contains_nan(out) for out in outputs):
            msg = ('NaN is detected on forward computation of '
                   '{}'.format(self.label))
            raise RuntimeError(msg)

    self._output_count = len(outputs)

    if self._is_chainerx_fallback_mode:
        ret = self._chainerx_apply_fallback_postprocess(
            chainerx_in_data, inputs, outputs)
    else:
        input_vars = [chainer.as_variable(x) for x in inputs]
        requires_grad = any([x.requires_grad for x in input_vars])

        ret = tuple(
            [variable.Variable(y, requires_grad=requires_grad)
             for y in outputs])

        if configuration.config.enable_backprop:
            # Topological ordering
            self.rank = max(
                [x.rank for x in input_vars]) if input_vars else 0
            # Add backward edges
            for y in ret:
                y.creator_node = self
            self.inputs = tuple([x.node for x in input_vars])
            # Add forward edges (must be weak references)
            self.outputs = tuple([weakref.ref(y.node) for y in ret])

            if self._input_indexes_to_retain is not None:
                for index in self._input_indexes_to_retain:
                    input_vars[index].retain_data()

            if self._output_indexes_to_retain is not None:
                retained_data = []
                for index in self._output_indexes_to_retain:
                    ret[index].retain_data()
                    retained_data.append(outputs[index])
                self._retained_output_data = tuple(retained_data)

            self.lazy_grad_sum = configuration.config.lazy_grad_sum

    return ret
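
# Hedged sketch of the ChainerX fallback path above (assumes
# chainer.Fallback is available, i.e. Chainer v6+; `AddOne` is a
# hypothetical name): a FunctionNode with no native ChainerX implementation
# returns chainer.Fallback from forward_chainerx, and apply() then converts
# the inputs and routes them through the regular forward().
#
#     import chainer
#
#     class AddOne(chainer.FunctionNode):
#
#         def forward_chainerx(self, inputs):
#             return chainer.Fallback  # no native ChainerX kernel
#
#         def forward(self, inputs):
#             x, = inputs
#             return x + 1,
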
def numerical_grad(
        f, inputs, grad_outputs, eps=1e-3,
        detect_nondifferentiable=False, diff_atol=0, diff_rtol=1e-2,
        center_outputs=None):
    """Computes numerical gradient by finite differences.

    This function is used to implement gradient check. For usage example,
    see unit tests of :mod:`chainer.functions`.

    By default, ``numerical_grad`` computes the gradient to the first order
    of ``eps``.

    Args:
        f (callable): Python function with no arguments that runs forward
            computation and returns the result.
        inputs (tuple of arrays): Tuple of arrays that should be treated as
            inputs. Each element of them is slightly modified to realize
            numerical gradient by finite differences.
        grad_outputs (tuple of arrays or scalars): Tuple of arrays or scalars
            that are treated as output gradients.
        eps (float): Epsilon value of finite differences.
        detect_nondifferentiable (bool):
            ``False`` by default.
            If ``True``, ``numerical_grad`` checks whether ``f`` is
            differentiable at ``inputs``.
            It requires evaluation of ``f`` at 5 points instead of 2.
            As a side effect, the accuracy of numerical gradient will be
            increased to the third order of ``eps``.
            If it turns out that ``f`` is non-differentiable at ``inputs``,
            ``numerical_grad`` raises
            :class:`~chainer.gradient_check.NondifferentiableError`.
        diff_atol (float): Absolute tolerance of fitting error of
            non-differentiable point detection.
        diff_rtol (float): Tolerance of fitting error of non-differentiable
            point detection relative to the output values of ``f``.
        center_outputs (tuple of arrays or None): Only used if
            ``detect_nondifferentiable`` is ``True``. If specified, these
            arrays are used as the outputs of ``f`` at ``inputs``. Otherwise,
            it is calculated. It can be used to reduce the computation if
            these arrays are already calculated before calling
            ``numerical_grad``.

    Returns:
        tuple: Numerical gradient arrays corresponding to ``inputs``.

    """
    # TODO(niboshi): Deprecate `center_outputs` argument.
    # If dtype of this argument is not float64, often the resolution is
    # insufficient for numerical gradient calculation. We might use it only
    # when its dtype is float64, but it would be better to simply remove it.
    center_outputs = None

    assert eps > 0
    assert isinstance(inputs, (tuple, list))
    for x in inputs:
        if x.dtype.kind != 'f':
            raise RuntimeError(
                'The dtype of input arrays must be kind of float')

    inputs = tuple(inputs)
    # Cast grad_outputs to float64
    grad_outputs = tuple([
        None if g is None
        else numpy.float64(g) if numpy.isscalar(g)
        else g.astype(numpy.float64)
        for g in grad_outputs])

    if not chainer.is_arrays_compatible(
            [a for a in inputs + grad_outputs if not numpy.isscalar(a)]):
        raise RuntimeError('Do not mix GPU and CPU arrays in '
                           '`numerical_grad`')

    device = backend.get_device_from_array(*(inputs + grad_outputs))
    xp = device.xp

    if xp is cuda.cupy:
        numerical_grad_kernel_1 = cuda.reduce(
            'T y1, T y2, U gy, T eps', 'V gxi',
            '(y1 - y2) * gy', 'a + b', 'gxi += a / (eps * 2)', '0',
            'numerical_grad_kernel_1'
        )
        numerical_grad_kernel_3 = cuda.reduce(
            'T y1, T y2, T y3, T y4, U gy, T eps', 'V gxi',
            '(-y1 + 8 * y2 - 8 * y3 + y4) * gy', 'a + b',
            'gxi += a / (eps * 6)', '0',
            'numerical_grad_kernel_3'
        )

    if xp is chainerx:
        grads = [
            xp.zeros(x.shape, numpy.float64, device=x.device)
            for x in inputs]
    else:
        grads = [xp.zeros(x.shape, numpy.float64) for x in inputs]

    if detect_nondifferentiable:
        if center_outputs is None:
            ys0 = _copy_arrays(f())
        else:
            ys0 = center_outputs
        nout = len(ys0)
        shapes = [y0.shape for y0 in ys0]
        sizes = numpy.array([y0.size for y0 in ys0])
        cumsizes = numpy.cumsum(sizes)

    # Evaluate func at a single input
    def eval_func(x, i, delta, orig):
        x[i] = orig + delta
        y = _copy_arrays(f())
        assert len(y) == len(grad_outputs)
        assert all([
            gy is None for y_, gy in zip(y, grad_outputs) if y_ is None])
        assert all([
            gy is None or numpy.isscalar(gy) or y_.shape == gy.shape
            for y_, gy in zip(y, grad_outputs)])
        x[i] = orig
        return y

    # An iteration on a single input displacement
    def iterate_single_input(i_in, x, orig_x, i):
        orig = orig_x[i]
        # `yss` holds a list of output arrays for each of 2 or 5 sampling
        # points.
        if detect_nondifferentiable:
            yss = [
                eval_func(x, i, -eps * 1., orig),
                eval_func(x, i, -eps * .5, orig),
                ys0,
                eval_func(x, i, +eps * .5, orig),
                eval_func(x, i, +eps * 1., orig),
            ]
        else:
            yss = [
                eval_func(x, i, -eps * 1, orig),
                eval_func(x, i, +eps * 1, orig),
            ]

        if detect_nondifferentiable:
            # Detect non-differentiable point by quadratic fitting

            # Check for non-finite output.
            # If any single element in the output arrays has different
            # finiteness among sampled points, that means this is a
            # non-differentiable point.
            # If the function consistently generates non-finite values
            # around the point, we do not treat the point as
            # non-differentiable.
            # (Example: x<0 region for the logarithm function)
            any_nonfinite = False
            for i_out in range(nout):
                isfinites = [xp.isfinite(ys[i_out]) for ys in yss]
                if any((isfinites[0] != isfinites[i]).any()
                       for i in range(1, len(yss))):
                    s = six.StringIO()
                    s.write(
                        'Tried to compute the numeric gradient on a '
                        'non-differentiable point.\n\n')
                    s.write('i_in: {}\n'.format(i_in))
                    s.write('i_out: {}\n'.format(i_out))
                    s.write('x: {}\n'.format(inputs[i_in]))
                    s.write('index on x: {}\n'.format(i))
                    s.write('eps: {}\n'.format(eps))
                    s.write('y[x-eps  ]: {}\n'.format(yss[0][i_out]))
                    s.write('y[x-eps/2]: {}\n'.format(yss[1][i_out]))
                    s.write('y[x      ]: {}\n'.format(yss[2][i_out]))
                    s.write('y[x+eps/2]: {}\n'.format(yss[3][i_out]))
                    s.write('y[x+eps  ]: {}\n'.format(yss[4][i_out]))
                    raise NondifferentiableError(s.getvalue())

                any_nonfinite |= not all(
                    isfinite.all() for isfinite in isfinites)

            if not any_nonfinite:
                # Stack flattened outputs to make (5, *)-shaped 2D array
                ystack = xp.vstack(
                    [xp.hstack([y.ravel() for y in ys]) for ys in yss])
                assert ystack.ndim == 2 and ystack.shape[0] == len(yss)
                # Fit to quadratic
                if xp is not numpy:
                    ystack = _cpu._to_cpu(ystack)
                polyfit = numpy.polynomial.polynomial.polyfit
                _, (residuals, _, _, _) = polyfit(
                    range(len(yss)), ystack, deg=2, full=True)
                if xp is not numpy:
                    residuals = device.send(residuals)
                residuals = xp.sqrt(residuals / len(yss))

                # Check for error for each output array
                for i_out in range(nout):
                    size = sizes[i_out]
                    cumsize = cumsizes[i_out]
                    shape = shapes[i_out]
                    # TODO(niboshi): The following two lines could be
                    # rewritten using xp.stack, which is supported in
                    # NumPy>=1.10
                    ymax = xp.concatenate(
                        [ys[i_out][None] for ys in yss]).max(axis=0)
                    ymin = xp.concatenate(
                        [ys[i_out][None] for ys in yss]).min(axis=0)
                    # Restore the shape of flattened residual
                    res = residuals[cumsize - size:cumsize]
                    res = res.reshape(shape)
                    det = utils.force_array(
                        diff_atol + diff_rtol * (ymax - ymin) < res)
                    # Constant output = not nondifferentiable
                    det[ymax == ymin] = False
                    if det.any():
                        s = six.StringIO()
                        s.write(
                            'Tried to compute the numeric gradient on a '
                            'non-differentiable point.\n\n')
                        s.write('i_in: {}\n'.format(i_in))
                        s.write('i_out: {}\n'.format(i_out))
                        s.write('x: {}\n'.format(inputs[i_in]))
                        s.write('index on x: {}\n'.format(i))
                        s.write('eps: {}\n'.format(eps))
                        s.write('diff_rtol: {}\n'.format(diff_rtol))
                        s.write('diff_atol: {}\n'.format(diff_atol))
                        s.write('ymax: {}\n'.format(ymax))
                        s.write('ymin: {}\n'.format(ymin))
                        s.write(
                            'diff_atol + diff_rtol * (ymax-ymin): {}\n'
                            .format(diff_atol + diff_rtol * (ymax - ymin)))
                        s.write('fitting errors: {}\n'.format(res))
                        s.write('y[x-eps  ]: {}\n'.format(yss[0][i_out]))
                        s.write('y[x-eps/2]: {}\n'.format(yss[1][i_out]))
                        s.write('y[x      ]: {}\n'.format(yss[2][i_out]))
                        s.write('y[x+eps/2]: {}\n'.format(yss[3][i_out]))
                        s.write('y[x+eps  ]: {}\n'.format(yss[4][i_out]))
                        raise NondifferentiableError(s.getvalue())

        # Calculate numerical gradient
        for i_out, gy in enumerate(grad_outputs):
            if gy is None:
                continue
            if not numpy.isscalar(gy):
                gy = gy.astype(numpy.float64, copy=False)
            gpu_ = (xp is cuda.cupy and
                    all(isinstance(ys[i_out], cuda.ndarray) for ys in yss))
            # If any output sample is None, all others must be.
            assert all([
                (yss[0][i_out] is None) == (yss[j][i_out] is None)
                for j in range(len(yss))])
            # If output samples are None, the part of numeric gradient for
            # this output is considered as zero: skip the accumulation.
            if yss[0][i_out] is None:
                continue

            if len(yss) == 2:  # 1st order
                y0 = yss[0][i_out]
                y1 = yss[1][i_out]
                if gpu_:
                    numerical_grad_kernel_1(
                        y1, y0, xp.asarray(gy), eps, gx[i])
                else:
                    dot = ((y1 - y0) * gy).sum()
                    gx[i] = gx[i] + dot / (2 * eps)
            elif len(yss) == 5:  # 3rd order
                y0 = yss[0][i_out]
                y1 = yss[1][i_out]
                y2 = yss[3][i_out]
                y3 = yss[4][i_out]
                if gpu_:
                    numerical_grad_kernel_3(
                        y3, y2, y1, y0, gy, eps, gx[i])
                else:
                    num = -y3 + 8 * y2 - 8 * y1 + y0
                    dot = (num * gy).sum()
                    gx[i] = gx[i] + dot / (6 * eps)
            else:
                assert False

    # Calculate numeric gradient
    with configuration.using_config('type_check', False):
        for i_in, (x, gx) in enumerate(six.moves.zip(inputs, grads)):
            orig_x = x.copy()  # hold original value
            for i in numpy.ndindex(x.shape):
                iterate_single_input(i_in, x, orig_x, i)

    return [g.astype(x.dtype, copy=False)
            for g, x in six.moves.zip(grads, inputs)]
def apply(self, inputs):
    """Computes output variables and grows the computational graph.

    Basic behavior is expressed in the documentation of
    :class:`FunctionNode`.

    .. note::

       If the :data:`~Variable.data` attributes of the input variables
       exist on a GPU device, that device is made current before calling
       :meth:`forward`, so implementors do not need to take care of device
       selection in most cases.

    Args:
        inputs: Tuple of input variables. Each element can be either
            :class:`~chainer.Variable`, :class:`numpy.ndarray`, or
            :class:`cupy.ndarray`. If the element is an ndarray, it is
            automatically wrapped with :class:`~chainer.Variable`.

    Returns:
        A tuple of output :class:`~chainer.Variable` objects.

    """
    input_vars = [chainer.as_variable(x) for x in inputs]
    in_data = tuple([x.data for x in input_vars])
    requires_grad = any([x.requires_grad for x in input_vars])

    # Check for input array types
    if not chainer.is_arrays_compatible(in_data):
        raise TypeError(
            'incompatible array types are mixed in the forward input '
            '({}).\n'
            'Actual: {}'.format(
                self.label,
                ', '.join(str(type(x)) for x in in_data)))

    is_debug = chainer.is_debug()
    if is_debug:
        # Keep stack trace for debug
        self.stack = traceback.extract_stack()

    if configuration.config.type_check:
        self._check_data_type_forward(in_data)

    hooks = chainer.get_function_hooks()
    if self._n_local_function_hooks > 0:
        hooks = collections.OrderedDict(hooks)
        hooks.update(self.local_function_hooks)
    hooks = hooks.values()  # avoid six for performance

    for hook in hooks:
        hook.forward_preprocess(self, in_data)

    # Forward propagation
    with cuda.get_device_from_array(*in_data):
        self._input_indexes_to_retain = None
        self._output_indexes_to_retain = None
        if chainer.config.schedule_func is not None:
            outputs = static_forward_optimizations(self, in_data)
        else:
            outputs = self.forward(in_data)

    # Check for output array types
    if not isinstance(outputs, tuple):
        raise TypeError(
            'forward output must be a tuple ({})\n'
            'Actual: {}'.format(self.label, type(outputs)))

    if not chainer.is_arrays_compatible(outputs):
        raise TypeError(
            'incompatible array types are mixed in the forward output '
            '({}).\n'
            'Actual: {}'.format(
                self.label,
                ', '.join(str(type(x)) for x in outputs)))

    for hook in hooks:
        hook.forward_postprocess(self, in_data)

    # NaN check of output values
    if is_debug:
        if any(chainer.backend._contains_nan(out) for out in outputs):
            msg = ('NaN is detected on forward computation of '
                   '{}'.format(self.label))
            raise RuntimeError(msg)

    ret = tuple([variable.Variable(y, requires_grad=requires_grad)
                 for y in outputs])

    if configuration.config.enable_backprop:
        # Topological ordering
        self.rank = max([x.rank for x in input_vars]) if input_vars else 0
        # Add backward edges
        for y in ret:
            y.creator_node = self
        self.inputs = tuple([x.node for x in input_vars])
        # Add forward edges (must be weak references)
        self.outputs = tuple([weakref.ref(y.node) for y in ret])

        if self._input_indexes_to_retain is not None:
            for index in self._input_indexes_to_retain:
                input_vars[index].retain_data()

        if self._output_indexes_to_retain is not None:
            retained_data = []
            for index in self._output_indexes_to_retain:
                ret[index].retain_data()
                retained_data.append(outputs[index])
            self._retained_output_data = tuple(retained_data)

        self.lazy_grad_sum = configuration.config.lazy_grad_sum

    return ret
def apply(self, inputs):
    """Computes output variables and grows the computational graph.

    Basic behavior is expressed in the documentation of
    :class:`FunctionNode`.

    .. note::

       If the :data:`~Variable.data` attributes of the input variables
       exist on a GPU device, that device is made current before calling
       :meth:`forward`, so implementors do not need to take care of device
       selection in most cases.

    Args:
        inputs: Tuple of input variables. Each element can be either
            :class:`~chainer.Variable`, :class:`numpy.ndarray`, or
            :class:`cupy.ndarray`. If the element is an ndarray, it is
            automatically wrapped with :class:`~chainer.Variable`.

    Returns:
        A tuple of output :class:`~chainer.Variable` objects.

    """
    chainerx_in_data = None
    chainerx_device = None
    is_chainerx, in_data = _extract_apply_in_data(inputs)

    if is_chainerx:
        # Try ChainerX C++ implementation.
        # If it's supported, the output arrays are wrapped with Variables
        # and returned.
        # If not supported, FunctionNode.forward_chainerx should return
        # Fallback.
        # In that case the input arrays are converted to numpy.ndarray
        # or cupy.ndarray (depending on the ChainerX backend) and
        # forward computation falls back to the conventional
        # FunctionNode.forward() implementation.
        outputs = self.forward_chainerx(in_data)

        if outputs is not chainer.Fallback:
            # Supported. Wrap with variables and return
            assert isinstance(outputs, tuple)
            return tuple([
                variable.Variable(
                    y, requires_grad=y.is_backprop_required())
                for y in outputs])

        # Fall back to FunctionNode.forward()
        chainerx_in_data, in_data, chainerx_device = (
            self._chainerx_apply_fallback_preprocess(in_data, inputs))
        self._is_chainerx_fallback_mode = True
        self.chainerx_device = chainerx_device

    utils._check_arrays_forward_compatible(in_data, self.label)

    is_debug = chainer.is_debug()
    if is_debug:
        # Keep stack trace for debug
        self.stack = traceback.extract_stack()

    if configuration.config.type_check:
        self._check_data_type_forward(in_data)

    hooks = chainer.get_function_hooks()
    if self._n_local_function_hooks > 0:
        hooks = collections.OrderedDict(hooks)
        hooks.update(self.local_function_hooks)
    hooks = hooks.values()  # avoid six for performance

    for hook in hooks:
        hook.forward_preprocess(self, in_data)

    # Forward propagation
    with cuda.get_device_from_array(*in_data):
        self._input_indexes_to_retain = None
        self._output_indexes_to_retain = None
        if chainer.config.schedule_func is not None:
            outputs = static_forward_optimizations(self, in_data)
        elif self._is_chainerx_fallback_mode:
            # In ChainerX fallback, __class__ is temporarily replaced with
            # the fabricated one with automatic attribute fallback.
            with _chainerx_attribute_fallback(self, chainerx_device):
                outputs = self.forward(in_data)
        else:
            # In normal case, simply run the forward method.
            outputs = self.forward(in_data)

    # Check for output array types
    if not isinstance(outputs, tuple):
        raise TypeError(
            'forward output must be a tuple ({})\n'
            'Actual: {}'.format(self.label, type(outputs)))

    if not chainer.is_arrays_compatible(outputs):
        raise TypeError(
            'incompatible array types are mixed in the forward output '
            '({}).\n'
            'Actual: {}'.format(
                self.label,
                ', '.join(str(type(x)) for x in outputs)))

    for hook in hooks:
        hook.forward_postprocess(self, in_data)

    # NaN check of output values
    if is_debug:
        if any(chainer.backend._contains_nan(out) for out in outputs):
            msg = ('NaN is detected on forward computation of '
                   '{}'.format(self.label))
            raise RuntimeError(msg)

    self._output_count = len(outputs)

    if self._is_chainerx_fallback_mode:
        ret = self._chainerx_apply_fallback_postprocess(
            chainerx_in_data, inputs, outputs)
    else:
        input_vars = [chainer.as_variable(x) for x in inputs]
        requires_grad = any([x.requires_grad for x in input_vars])

        ret = tuple(
            [variable.Variable(y, requires_grad=requires_grad)
             for y in outputs])

        if configuration.config.enable_backprop:
            # Topological ordering
            self.rank = max(
                [x.rank for x in input_vars]) if input_vars else 0
            # Add backward edges
            for y in ret:
                y.creator_node = self
            self.inputs = tuple([x.node for x in input_vars])
            # Add forward edges (must be weak references)
            self.outputs = tuple([weakref.ref(y.node) for y in ret])

            if self._input_indexes_to_retain is not None:
                for index in self._input_indexes_to_retain:
                    input_vars[index].retain_data()

            if self._output_indexes_to_retain is not None:
                retained_data = []
                for index in self._output_indexes_to_retain:
                    ret[index].retain_data()
                    retained_data.append(outputs[index])
                self._retained_output_data = tuple(retained_data)

            self.lazy_grad_sum = configuration.config.lazy_grad_sum

    return ret