def check_forward(self, x1_data, x2_data, x3_data):
    """Feed three inputs to the link and compare with ``functions.lstm``.

    The link is called on ``x1``, ``x2`` and ``x3`` in turn.  After each
    call the returned output and the link's ``h``/``c`` state are compared
    against values computed directly with ``functions.lstm`` from the
    link's ``upward`` and ``lateral`` sub-links.

    Args:
        x1_data: First input array.
        x2_data: Second input array; ``len(x2_data)`` is used as the number
            of leading rows of the hidden state that get updated.
        x3_data: Third input array.
    """
    xp = self.link.xp

    # Step 1: the expectation starts from an all-zero cell state.
    x1 = chainer.Variable(x1_data) if self.input_variable else x1_data
    h1 = self.link(x1)
    device = backend.get_device_from_array(x1_data)
    with chainer.using_device(device):
        c0 = chainer.Variable(
            xp.zeros((len(self.x1), self.out_size), dtype=self.x1.dtype))
        c1_expect, h1_expect = functions.lstm(c0, self.link.upward(x1))
    testing.assert_allclose(h1.data, h1_expect.data)
    testing.assert_allclose(self.link.h.data, h1_expect.data)
    testing.assert_allclose(self.link.c.data, c1_expect.data)

    # Step 2: only the first `batch` rows of the state take part; the
    # remaining rows of h must be left as they were.
    batch = len(x2_data)
    x2 = chainer.Variable(x2_data) if self.input_variable else x2_data
    h1_in, h1_rest = functions.split_axis(
        self.link.h.data, [batch], axis=0)
    y2 = self.link(x2)
    # Resolve the device from the raw array, as the first lookup does.
    # `x1`/`x2` are `chainer.Variable` objects when `self.input_variable`
    # is set, so they must not be passed to an array-based lookup.
    device = backend.get_device_from_array(x2_data)
    with chainer.using_device(device):
        c2_expect, y2_expect = functions.lstm(
            c1_expect,
            self.link.upward(x2) + self.link.lateral(h1_in))
    testing.assert_allclose(y2.data, y2_expect.data)
    testing.assert_allclose(self.link.h.data[:batch], y2_expect.data)
    testing.assert_allclose(self.link.h.data[batch:], h1_rest.data)

    # Step 3: the hidden state captured before the call is expected to
    # match the state after it.
    x3 = chainer.Variable(x3_data) if self.input_variable else x3_data
    h2_rest = self.link.h
    y3 = self.link(x3)
    c3_expect, y3_expect = functions.lstm(c2_expect, self.link.upward(x3))
    testing.assert_allclose(y3.data, y3_expect.data)
    testing.assert_allclose(self.link.h.data, h2_rest.data)
def check_equal_memory_shared(self, arr1, arr2):
    """Assert that ``arr1`` and ``arr2`` share their internal memory.

    The arrays are compared, ``arr1`` is incremented in place by 2 and the
    arrays are compared again, so a change made through ``arr1`` must be
    visible through ``arr2``.  The increment is undone afterwards.
    """
    def on_cpu(arr):
        # Bring an array to the CPU so numpy.testing can compare it.
        return backend.CpuDevice().send(arr)

    numpy.testing.assert_array_equal(on_cpu(arr1), on_cpu(arr2))

    with chainer.using_device(backend.get_device_from_array(arr1)):
        arr1 += 2
    numpy.testing.assert_array_equal(on_cpu(arr1), on_cpu(arr2))

    # Restore the original contents.
    with chainer.using_device(backend.get_device_from_array(arr1)):
        arr1 -= 2
def check_equal_memory_shared(self, arr1, arr2):
    """Check that two arrays are views of the same memory.

    Both arrays must compare equal before and after ``arr1`` is shifted in
    place by 2; the shift is reverted at the end.
    """
    def assert_same_contents():
        lhs = backend.CpuDevice().send(arr1)
        rhs = backend.CpuDevice().send(arr2)
        numpy.testing.assert_array_equal(lhs, rhs)

    assert_same_contents()
    with chainer.using_device(backend.get_device_from_array(arr1)):
        arr1 += 2
    # If memory is shared, arr2 has observed the shift as well.
    assert_same_contents()
    with chainer.using_device(backend.get_device_from_array(arr1)):
        arr1 -= 2
def __init__(self, func, x_data, y_grad, params, eps, atol, rtol, no_grads,
             dtype, detect_nondifferentiable, is_immutable_params):
    """Validate the arguments of a backward check and store them.

    If ``is_immutable_params`` is ``False``, ``params`` are expected to be
    of type ``chainer.Parameter`` and are updated in-place.  To run the
    check with ChainerX ndarrays, which cannot be updated in-place when
    wrapped in ``chainer.Parameter``, the flag should be ``True`` and the
    parameters should be given as ndarrays.  ``func`` in the former case
    must take inputs as arguments only; in the latter it must take the
    parameters in addition.

    Args:
        func: Callable under test.
        x_data: Input array or sequence of input arrays.
        y_grad: Output gradient array(s), or ``None``.
        params: Parameter or sequence of parameters.
        eps: Stored as an option for the numeric gradient.
        atol: Stored absolute tolerance.
        rtol: Stored relative tolerance.
        no_grads: Sequence with one flag per input, or ``None`` to flag
            every input whose dtype kind is not ``'f'``.
        dtype: ``None`` or a floating point dtype.
        detect_nondifferentiable: Stored as an option for the numeric
            gradient.
        is_immutable_params: See above.

    Raises:
        ValueError: If ``dtype`` is not a float dtype, if a parameter is
            not an ndarray while ``is_immutable_params`` is ``True``, or
            if ``no_grads`` and the inputs differ in length.
        NotImplementedError: For ChainerX inputs combined with mutable
            parameters or with any set ``no_grads`` flag.
    """
    if dtype is not None and numpy.dtype(dtype).kind != 'f':
        raise ValueError('`dtype` is allowed only float type')

    x_data = _as_tuple(x_data)
    if y_grad is not None:
        y_grad = _as_tuple(y_grad)
    # Normalize before validating: iterating a bare ndarray would inspect
    # its elements rather than the array itself and reject a valid single
    # parameter.  (Relies on `_as_tuple` wrapping a lone object, as the
    # handling of `x_data` already does.)
    params = _as_tuple(params)

    if is_immutable_params:
        if not all(
                isinstance(p, chainer.get_array_types()) for p in params):
            raise ValueError(
                'All parameters in `params` must be ndarrays if '
                '`is_immutable_params` is `True`. Actual: {}.'.format(
                    ', '.join(str(type(p)) for p in params)))

    if no_grads is None:
        no_grads = [x.dtype.kind != 'f' for x in x_data]
    else:
        if len(no_grads) != len(x_data):
            raise ValueError(
                'Length of no_grads param and xs should be same.\n'
                'Actual: {0} != {1}'.format(len(no_grads), len(x_data)))

    device = backend.get_device_from_array(*x_data)
    if device.xp is chainerx:
        if len(params) > 0 and not is_immutable_params:
            raise NotImplementedError(
                'gradient_check must be called with '
                'is_immutable_params=True to test parameters with '
                'ChainerX.')
        if any(no_grads):
            raise NotImplementedError(
                'gradient_check does not support no_grads argument for '
                'ChainerX arrays')

    self.device = device

    self.func = func
    self.x_data = x_data
    self.y_grad = y_grad
    self.params = params
    self.no_grads = no_grads
    self.atol = atol
    self.rtol = rtol
    self.is_immutable_params = is_immutable_params
    # options for numeric gradients
    self.eps = eps
    self.dtype = dtype
    self.detect_nondifferentiable = detect_nondifferentiable
def test_device(self, model_initial_backend_config, model_backend_config,
                input_backend_config):
    """Check the devices used by ``StandardUpdater``.

    The model starts on one device, the updater is given a model device
    and an input device, and the test asserts that the model ends up on
    the model device while the batch handed to the optimizer is on the
    input device.
    """
    initial_device = model_initial_backend_config.device
    model_device = model_backend_config.device
    batch_device = input_backend_config.device

    link = chainer.Link()
    link.to_device(initial_device)
    opt = DummyOptimizer()
    opt.setup(link)
    dataset_iter = DummyIterator([numpy.array(1), numpy.array(2)])

    updater = training.updaters.StandardUpdater(
        dataset_iter, opt, device=model_device, input_device=batch_device)

    assert updater.device is model_device
    assert updater.input_device is batch_device
    # Constructing the updater must have placed the model on its device.
    assert link.device == model_device

    updater.update_core()

    # Exactly one optimizer update with two positional arguments.
    assert opt.update.call_count == 1
    positional, keyword = opt.update.call_args
    assert len(positional) == 2
    assert len(keyword) == 0

    # The second argument is the input array; check where it lives.
    _, batch = positional
    assert backend.get_device_from_array(batch) == batch_device
def test_double_backward(self, src_backend_config, dst_backend_config):
    """Check the second-order gradient of ``functions.copy``.

    The test is skipped when exactly one of the two devices is a ChainerX
    device.  Otherwise the gradient that reaches ``gy`` after a double
    backward pass must live on the destination device and equal
    ``self.ggx``.
    """
    src_device = src_backend_config.device
    dst_device = dst_backend_config.device
    src_is_chainerx = src_device.xp is chainerx
    dst_is_chainerx = dst_device.xp is chainerx
    if src_is_chainerx != dst_is_chainerx:
        raise unittest.SkipTest(
            'ChainerX to non-ChainerX does not support backward.')

    x_var = chainer.Variable(
        src_backend_config.get_array(self.x), requires_grad=True)
    gy = dst_backend_config.get_array(self.gy)
    ggx = src_backend_config.get_array(self.ggx)

    # First-order backward, keeping the graph for a second pass.
    y_var = functions.copy(x_var, dst_device)
    y_var.grad = gy
    gy_var = y_var.grad_var
    y_var.backward(enable_double_backprop=True)

    # Second-order backward through the gradient of x.
    assert x_var.grad_var.requires_grad is True
    x_var.grad_var.grad = ggx
    x_var.grad_var.backward()

    ggy_var = gy_var.grad_var
    assert ggy_var.device == dst_device
    assert backend.get_device_from_array(ggy_var.array) == dst_device
    numpy.testing.assert_array_equal(
        _numpy_device.send(ggy_var.array), self.ggx)
def test_double_backward(self, src_backend_config, dst_backend_config):
    """Check the second-order gradient of ``functions.copy``.

    After a double backward pass the gradient accumulated on ``gy`` must
    be on the destination device and equal ``self.ggx``.
    """
    dst_device = dst_backend_config.device
    x_var = chainer.Variable(
        src_backend_config.get_array(self.x), requires_grad=True)
    gy = dst_backend_config.get_array(self.gy)
    ggx = src_backend_config.get_array(self.ggx)

    # First-order backward, keeping the graph for a second pass.
    y_var = functions.copy(x_var, dst_device)
    y_var.grad = gy
    gy_var = y_var.grad_var
    y_var.backward(enable_double_backprop=True)

    # Second-order backward through the gradient of x.
    assert x_var.grad_var.requires_grad is True
    x_var.grad_var.grad = ggx
    x_var.grad_var.backward()

    ggy_var = gy_var.grad_var
    assert ggy_var.device == dst_device
    ggy_device = backend.get_device_from_array(ggy_var.array)
    assert (ggy_device == dst_device)
    numpy.testing.assert_array_equal(
        _numpy_device.send(ggy_var.array), self.ggx)
def __call__(self, array):
    """Fill ``array`` in place with a scaled orthogonal matrix.

    A 0-dim array receives ``+scale`` or ``-scale`` at random.  Otherwise
    the array is interpreted as ``len(array)`` outputs by the product of
    the remaining dimensions as inputs, a Gaussian matrix of that size is
    QR-decomposed on the CPU with NumPy, and the sign-corrected, scaled
    ``q`` factor is written to ``array``.

    Raises:
        ValueError: If ``array`` is empty, or if the input/output
            dimensions are rejected by ``self._checks`` (index 0 applies
            when there are more inputs than outputs, index 1 when there
            are fewer).
    """
    if self.dtype is not None:
        assert array.dtype == self.dtype
    device = backend.get_device_from_array(array)

    if not array.shape:  # 0-dim case
        sign = 2 * numpy.random.randint(2) - 1
        array[...] = self.scale * sign
        return
    if not array.size:
        raise ValueError('Array to be initialized must be non-empty.')

    # size_of_shape is used rather than numpy.prod because the latter
    # returns a float value when the argument is empty.
    out_dim = len(array)
    in_dim = utils.size_of_shape(array.shape[1:])
    wide = in_dim > out_dim
    tall = in_dim < out_dim
    if (wide and self._checks[0]) or (tall and self._checks[1]):
        raise ValueError('Cannot make orthogonal {}.'
                         'shape = {}, interpreted as '
                         '{}-dim input and {}-dim output.'.format(
                             self.mode, array.shape, in_dim, out_dim))

    gaussian = numpy.random.normal(size=(out_dim, in_dim))
    if wide:
        gaussian = gaussian.T
    # cupy.linalg.qr requires cusolver in CUDA 8+, hence NumPy here.
    q, r = numpy.linalg.qr(gaussian)
    q *= numpy.copysign(self.scale, numpy.diag(r))
    if wide:
        q = q.T
    array[...] = device.xp.asarray(q.reshape(array.shape))
def visit_array(self, arr):
    """Return ``arr`` sent to ``self._device`` unless it is to be skipped.

    When ``self._skip_visiting`` accepts the array's current device, a
    warning is issued through ``self._warn_to_gpu`` and the array is
    returned unchanged.
    """
    assert isinstance(arr, chainer.get_array_types())
    current_device = backend.get_device_from_array(arr)
    if not self._skip_visiting(current_device):
        return self._device.send(arr)
    self._warn_to_gpu(current_device, self._device)
    return arr
def __call__(self, array):
    """Fill ``array`` in place with samples from U(-scale, scale).

    The samples are drawn with the random module of the array module that
    matches the device ``array`` lives on.
    """
    if self.dtype is not None:
        assert array.dtype == self.dtype
    xp = backend.get_device_from_array(array).xp
    samples = xp.random.uniform(
        low=-self.scale, high=self.scale, size=array.shape)
    array[...] = samples
def test_double_backward(self, src_backend_config, dst_backend_config):
    """Check the second-order gradient of ``functions.copy``.

    After a double backward pass the gradient accumulated on ``gy`` must
    be on the destination device and equal ``self.ggx``.
    """
    dst_device = dst_backend_config.device
    x_var = chainer.Variable(
        src_backend_config.get_array(self.x), requires_grad=True)
    gy = dst_backend_config.get_array(self.gy)
    ggx = src_backend_config.get_array(self.ggx)

    y_var = functions.copy(x_var, dst_device)

    # TODO(niboshi): Remove this workaround after Variable.grad.setter is
    # fixed so that it calls gy.require_grad() internally.
    if dst_backend_config.xp is chainerx:
        gy.require_grad()

    # First-order backward, keeping the graph for a second pass.
    y_var.grad = gy
    gy_var = y_var.grad_var
    y_var.backward(enable_double_backprop=True)

    # Second-order backward through the gradient of x.
    assert x_var.grad_var.requires_grad is True
    x_var.grad_var.grad = ggx
    x_var.grad_var.backward()

    ggy_var = gy_var.grad_var
    assert ggy_var.device == dst_device
    assert backend.get_device_from_array(ggy_var.array) == dst_device
    numpy.testing.assert_array_equal(
        _numpy_device.send(ggy_var.array), self.ggx)
def _check_forward_internal(self, dst_device_spec, src_device, dst_device,
                            x_mode):
    """Check the forward result of ``functions.copy`` for one input mode.

    Args:
        dst_device_spec: Device specifier passed to ``functions.copy``.
        src_device: Device the input is sent to before copying.
        dst_device: Device the output is expected to be on.
        x_mode: ``'array'`` to pass a raw array, ``'non_requires_grad'``
            or ``'requires_grad'`` to wrap it in a ``chainer.Variable``
            with the corresponding ``requires_grad`` flag.
    """
    x = src_device.send(self.x)
    if x_mode == 'requires_grad':
        x = chainer.Variable(x, requires_grad=True)
    elif x_mode == 'non_requires_grad':
        x = chainer.Variable(x, requires_grad=False)
    elif x_mode != 'array':
        assert False, x_mode

    # A copy that crosses the ChainerX boundary with a variable that
    # requires grad is expected to fail.
    crosses_chainerx = (
        (src_device.xp is chainerx) != (dst_device.xp is chainerx))
    if crosses_chainerx and x_mode == 'requires_grad':
        with pytest.raises(RuntimeError):
            functions.copy(x, dst_device_spec)
        return

    y = functions.copy(x, dst_device_spec)

    assert y.device == dst_device
    assert backend.get_device_from_array(y.array) == dst_device
    assert y.dtype == self.dtype
    y_on_cpu = _numpy_device.send(y.array)
    numpy.testing.assert_array_equal(y_on_cpu, self.x)
def test_from_array(self):
    """A NumPy array must resolve to ``CpuDevice`` through both lookups."""
    arr = numpy.ndarray((2, ), numpy.float32)
    expected_device = backend.CpuDevice()

    lookups = (
        backend.CpuDevice.from_array,
        backend.get_device_from_array,
    )
    for lookup in lookups:
        assert lookup(arr) == expected_device
def check_concat_arrays(self, arrays, device, expected_device):
    """Check ``self.converter`` on a list of equally shaped arrays.

    The result must have a new leading axis of length ``len(arrays)``,
    live on ``expected_device`` and contain the inputs row by row.
    """
    batch = self.converter(arrays, device)
    expected_shape = (len(arrays),) + arrays[0].shape
    self.assertEqual(batch.shape, expected_shape)

    assert backend.get_device_from_array(batch) == expected_device

    rows = backend.CpuDevice().send(batch)
    for row, source in zip(rows, arrays):
        source_on_cpu = backend.CpuDevice().send(source)
        numpy.testing.assert_array_equal(row, source_on_cpu)
def test_from_array(self):
    """Both device lookups must map a NumPy array to ``CpuDevice``."""
    arr = numpy.ndarray((2,), numpy.float32)
    cpu = backend.CpuDevice()

    # Class-level lookup.
    assert backend.CpuDevice.from_array(arr) == cpu
    # Generic lookup.
    assert backend.get_device_from_array(arr) == cpu
def check_concat_arrays(self, arrays, device, expected_device):
    """Verify shape, device and contents of converted ``arrays``.

    ``self.converter(arrays, device)`` must stack the inputs along a new
    leading axis and place the result on ``expected_device``.
    """
    converted = self.converter(arrays, device)

    self.assertEqual(
        converted.shape, (len(arrays),) + arrays[0].shape)
    converted_device = backend.get_device_from_array(converted)
    assert converted_device == expected_device

    # Compare on the CPU, one stacked row per input array.
    for stacked, original in zip(
            backend.CpuDevice().send(converted), arrays):
        numpy.testing.assert_array_equal(
            stacked, backend.CpuDevice().send(original))
def __init__(
        self, func, xs, gys, params, eps, atol, rtol, no_gxs, dtype,
        detect_nondifferentiable, is_immutable_params):
    """Validate the arguments of a backward check and store them.

    If ``is_immutable_params`` is ``False``, ``params`` are expected to be
    of type ``chainer.Parameter`` and are updated in-place.  To run the
    check with ChainerX ndarrays, which cannot be updated in-place when
    wrapped in ``chainer.Parameter``, the flag should be ``True`` and the
    parameters should be given as ndarrays.  ``func`` in the former case
    must take inputs as arguments only; in the latter it must take the
    parameters in addition.

    Args:
        func: Callable under test.
        xs: Input array or sequence of input arrays.
        gys: Output gradient array(s), or ``None``.
        params: Parameter or sequence of parameters.
        eps: Stored as an option for the numeric gradient.
        atol: Stored absolute tolerance.
        rtol: Stored relative tolerance.
        no_gxs: Sequence with one flag per input, or ``None`` to flag
            every input whose dtype kind is not ``'f'``.
        dtype: ``None`` or a floating point dtype.
        detect_nondifferentiable: Stored as an option for the numeric
            gradient.
        is_immutable_params: See above.

    Raises:
        ValueError: If ``dtype`` is not a float dtype, if a parameter is
            not an ndarray while ``is_immutable_params`` is ``True``, or
            if ``no_gxs`` and ``xs`` differ in length.
        NotImplementedError: For ChainerX inputs combined with mutable
            parameters.
    """
    if dtype is not None and numpy.dtype(dtype).kind != 'f':
        raise ValueError('`dtype` is allowed only float type')

    xs = _as_tuple(xs)
    if gys is not None:
        gys = _as_tuple(gys)
    # Normalize before validating: iterating a bare ndarray would inspect
    # its elements rather than the array itself and reject a valid single
    # parameter.  (Relies on `_as_tuple` wrapping a lone object, as the
    # handling of `xs` already does.)
    params = _as_tuple(params)

    if is_immutable_params:
        if not all(
                isinstance(p, chainer.get_array_types()) for p in params):
            raise ValueError(
                'All parameters in `params` must be ndarrays if '
                '`is_immutable_params` is `True`. Actual: {}.'.format(
                    ', '.join(str(type(p)) for p in params)))

    if no_gxs is None:
        no_gxs = [x.dtype.kind != 'f' for x in xs]
    else:
        if len(no_gxs) != len(xs):
            raise ValueError(
                'Length of no_grads param and xs should be same.\n'
                'Actual: {0} != {1}'.format(len(no_gxs), len(xs)))

    device = backend.get_device_from_array(*xs)
    if device.xp is chainerx:
        if params and not is_immutable_params:
            raise NotImplementedError(
                'gradient_check does not support params argument for '
                'ChainerX arrays')

    self.device = device

    self.func = func
    self.xs = xs
    self.gys = gys
    self.params = params
    self.no_gxs = no_gxs
    self.atol = atol
    self.rtol = rtol
    self.is_immutable_params = is_immutable_params
    # options for numeric gradients
    self.eps = eps
    self.dtype = dtype
    self.detect_nondifferentiable = detect_nondifferentiable
def __call__(self, array):
    """Fill ``array`` in place with ``scale`` times the identity matrix.

    Raises:
        ValueError: If ``array`` is not a square two-dimensional array.
    """
    if self.dtype is not None:
        assert array.dtype == self.dtype

    shape = array.shape
    is_square_matrix = len(shape) == 2 and shape[0] == shape[1]
    if not is_square_matrix:
        raise ValueError('Identity matrix initialization can only be used '
                         'for 2D squared matrices.')

    xp = backend.get_device_from_array(array).xp
    array[...] = xp.identity(shape[0]) * self.scale
def __call__(self, array):
    """Fill ``array`` in place with samples from N(0, scale).

    The samples are drawn with the random module of the array module that
    matches the device ``array`` lives on.
    """
    device = backend.get_device_from_array(array)
    sampler_kwargs = {'loc': 0.0, 'scale': self.scale, 'size': array.shape}

    # Only CuPy supports the dtype option.
    if device.xp is cuda.cupy:
        if self.dtype in (numpy.float32, numpy.float16):
            # float16 is not supported in cuRAND, so sample as float32.
            sampler_kwargs['dtype'] = numpy.float32

    array[...] = device.xp.random.normal(**sampler_kwargs)
def check_forward(self, dst_device_spec, src_device, dst_device):
    """Check device, dtype and contents of a ``functions.copy`` result.

    Args:
        dst_device_spec: Device specifier passed to ``functions.copy``.
        src_device: Device the input is sent to before copying.
        dst_device: Device the output is expected to be on.
    """
    x_var = chainer.Variable(src_device.send(self.x))

    y = functions.copy(x_var, dst_device_spec)

    assert y.device == dst_device
    array_device = backend.get_device_from_array(y.array)
    assert array_device == dst_device
    assert y.dtype == self.dtype
    numpy.testing.assert_array_equal(
        _numpy_device.send(y.array), self.x)
def __call__(self, array):
    """Fill ``array`` in place with ``self.fill_value``.

    An array-typed fill value is copied with ``backend.copyto``; any
    other value is converted with the ``asarray`` of the array module
    that matches the device ``array`` lives on.
    """
    if self.dtype is not None:
        assert array.dtype == self.dtype

    fill_value = self.fill_value
    if not isinstance(fill_value, chainer.get_array_types()):
        xp = backend.get_device_from_array(array).xp
        array[...] = xp.asarray(fill_value)
        return

    # Calling copyto ensures that the fill_value array is moved to the
    # device where array resides.
    backend.copyto(array, fill_value)
def test_from_array(self, backend_config):
    """A ChainerX array must resolve to the configured device."""
    arr = backend_config.get_array(numpy.ndarray((2, ), numpy.float32))
    # Test precondition check
    assert arr.device.name == backend_config.chainerx_device

    expected_device = backend_config.device
    lookups = (
        backend.ChainerxDevice.from_array,
        backend.get_device_from_array,
    )
    for lookup in lookups:
        assert lookup(arr) == expected_device
def backward(self, target_input_indexes, grad_outputs):
    """Run the wrapped function's ``backward`` and wrap its gradients.

    Args:
        target_input_indexes: Indexes of the inputs whose gradients are
            returned.
        grad_outputs: Output gradient variables; entries may be ``None``.

    Returns:
        tuple: One entry per index in ``target_input_indexes``; either
        ``None`` or a ``variable.Variable`` wrapping the gradient array.
    """
    retained_inputs = self.get_retained_inputs()
    # Rebuild full-length input lists; positions that were not retained
    # stay None.
    # NOTE(review): `inputs` is filled here but not read again in this
    # method.
    inputs = [None] * len(self.inputs)
    in_data = [None] * len(self.inputs)
    for retained, i_in in six.moves.zip(
            retained_inputs, self._input_indexes_to_retain):
        inputs[i_in] = retained
        in_data[i_in] = None if retained is None else retained.array
    in_data = tuple(in_data)

    # Unwrap the output gradients to raw arrays.
    grad_out_data = tuple(
        [None if grad is None else grad.array for grad in grad_outputs])

    is_chainerx_fallback_mode = self._is_chainerx_fallback_mode
    if is_chainerx_fallback_mode:
        # Convert input and output gradients to numpy/cupy
        in_data = backend.from_chx(in_data)
        grad_out_data = backend.from_chx(grad_out_data)

    # Call Function.backward
    with chainer.using_device(
            backend.get_device_from_array(*(in_data + grad_out_data))):
        if is_chainerx_fallback_mode:
            # Enable attribute fallback
            with function_node._chainerx_attribute_fallback(
                    self._function, self.chainerx_device):
                gxs = self._function.backward(in_data, grad_out_data)
        else:
            gxs = self._function.backward(in_data, grad_out_data)

    # Check gradients
    for x, gx in six.moves.zip(self.inputs, gxs):
        if gx is not None:
            variable._check_grad_type(self, x, True, gx)

    # Convert input gradients back to ChainerX
    if is_chainerx_fallback_mode:
        gxs = backend.to_chx(gxs)

    ret = []
    for i in target_input_indexes:
        if gxs[i] is None:
            g = None
        else:
            # Intentionally not passing requires_grad=False so that
            # backprop routines can raise an error when a further backprop
            # is attempted against this gradient variable.
            g = variable.Variable(gxs[i])
            if g.xp is not chainerx:
                # Record which function produced this gradient.
                g.node._old_style_grad_generator = self._function.label
        ret.append(g)

    return tuple(ret)
def test_from_array(self, backend_config):
    """An ``intel64.mdarray`` must resolve to ``Intel64Device``."""
    arr = backend_config.get_array(numpy.ndarray((2, ), numpy.float32))
    # Test precondition check
    assert isinstance(arr, intel64.mdarray)

    expected_device = backend.Intel64Device()
    lookups = (
        backend.Intel64Device.from_array,
        backend.get_device_from_array,
    )
    for lookup in lookups:
        assert lookup(arr) == expected_device
def test_from_array(self, backend_config):
    """Both device lookups must map a ChainerX array to its device."""
    host_array = numpy.ndarray((2,), numpy.float32)
    arr = backend_config.get_array(host_array)
    # Test precondition check
    assert arr.device.name == backend_config.chainerx_device

    expected = backend_config.device
    # Class-level lookup.
    assert backend.ChainerxDevice.from_array(arr) == expected
    # Generic lookup.
    assert backend.get_device_from_array(arr) == expected
def test_from_array(self, backend_config):
    """Both device lookups must map an mdarray to ``Intel64Device``."""
    host_array = numpy.ndarray((2,), numpy.float32)
    arr = backend_config.get_array(host_array)
    # Test precondition check
    assert isinstance(arr, intel64.mdarray)

    expected = backend.Intel64Device()
    # Class-level lookup.
    assert backend.Intel64Device.from_array(arr) == expected
    # Generic lookup.
    assert backend.get_device_from_array(arr) == expected
def copy(x, dst):
    """Copies the input variable onto the specified device.

    If the input ``x`` already resides on the device specified by ``dst``,
    no copy will actually take place and the returned variable will hold a
    view of the input. In other cases, the input will be copied to ``dst``.
    When ``dst == -1``, the array is copied to the host memory.
    This function supports copies from host to host, from host to device,
    from device to device and from device to host.

    Args:
        x (:class:`~chainer.Variable` or :ref:`ndarray`):
            Variable to be copied.
        dst: Target device specifier.

    Returns:
        ~chainer.Variable: Output variable.

    Raises:
        RuntimeError: If exactly one of the source and the destination is
            a ChainerX device.

    .. admonition:: Example

        >>> import chainer.backends.cuda as cuda
        >>> x_arr = np.random.uniform(-1, 1, (5, 10))
        >>> x = chainer.Variable(x_arr)
        >>> x.device
        <CpuDevice (numpy)>
        >>> y = F.copy(x, '@cupy:0') # from CPU (NumPy) to GPU 0 (CuPy)
        >>> y.device
        <GpuDevice (cupy):0>

    .. note::
        Copies between non-ChainerX devices and ChainerX devices are not
        supported.

    """
    # For backward compatibility: the dummy CUDA device means NumPy.
    if dst is cuda.DummyDevice:
        dst = chainer.get_device('@numpy')

    source_array = x.array if isinstance(x, chainer.Variable) else x
    in_device = backend.get_device_from_array(source_array)
    out_device = chainer.get_device(dst)

    src_is_chainerx = in_device.xp is chainerx
    dst_is_chainerx = out_device.xp is chainerx
    if src_is_chainerx != dst_is_chainerx:
        message = (
            'F.copy does not support copies between non-ChainerX devices and '
            'ChainerX devices.\n'
            'From: {}\n'
            'To: {}').format(in_device, out_device)
        raise RuntimeError(message)

    outputs = Copy(in_device, out_device).apply((x,))
    y, = outputs
    return y
def test_get_device_from_array(self, backend_config):
    """A CuPy array must resolve to the configured GPU device."""
    with cuda.Device(backend_config.cuda_device):
        arr = cuda.ndarray((), numpy.float32)
    # Test precondition check
    assert arr.device.id == backend_config.cuda_device

    expected_device = backend_config.device
    lookups = (
        backend.GpuDevice.from_array,
        backend.get_device_from_array,
    )
    for lookup in lookups:
        assert lookup(arr) == expected_device
def check_concat_tuples(self, tuples, device, expected_device):
    """Check ``self.converter`` on a list of tuples of arrays.

    One stacked array is expected per tuple position; each must have a
    new leading axis of length ``len(tuples)``, live on
    ``expected_device`` and contain the matching elements row by row.
    """
    arrays = self.converter(tuples, device)
    self.assertEqual(len(arrays), len(tuples[0]))

    for i, stacked in enumerate(arrays):
        expected_shape = (len(tuples),) + tuples[0][i].shape
        self.assertEqual(stacked.shape, expected_shape)
        assert backend.get_device_from_array(stacked) == expected_device

        rows = backend.CpuDevice().send(stacked)
        for row, example in zip(rows, tuples):
            numpy.testing.assert_array_equal(
                row, backend.CpuDevice().send(example[i]))
def check_concat_dicts(self, dicts, device, expected_device):
    """Check ``self.converter`` on a list of dicts of arrays.

    The result must have the keys of the first dict; each value must have
    a new leading axis of length ``len(dicts)``, live on
    ``expected_device`` and contain the matching entries row by row.
    """
    arrays = self.converter(dicts, device)
    self.assertEqual(frozenset(arrays.keys()), frozenset(dicts[0].keys()))

    for key in arrays:
        stacked = arrays[key]
        expected_shape = (len(dicts),) + dicts[0][key].shape
        self.assertEqual(stacked.shape, expected_shape)
        self.assertEqual(
            backend.get_device_from_array(stacked), expected_device)

        rows = backend.CpuDevice().send(stacked)
        for row, example in zip(rows, dicts):
            numpy.testing.assert_array_equal(
                row, backend.CpuDevice().send(example[key]))
def check_concat_dicts(self, dicts, device, expected_device):
    """Verify keys, shapes, devices and contents of converted ``dicts``.

    ``self.converter(dicts, device)`` must return one stacked array per
    key of the first dict, placed on ``expected_device``.
    """
    converted = self.converter(dicts, device)
    expected_keys = frozenset(dicts[0].keys())
    self.assertEqual(frozenset(converted.keys()), expected_keys)

    batch_size = len(dicts)
    for key in converted:
        self.assertEqual(
            converted[key].shape, (batch_size,) + dicts[0][key].shape)
        actual_device = backend.get_device_from_array(converted[key])
        self.assertEqual(actual_device, expected_device)

        # Compare on the CPU, one stacked row per input dict.
        for stacked, original in zip(
                backend.CpuDevice().send(converted[key]), dicts):
            numpy.testing.assert_array_equal(
                stacked, backend.CpuDevice().send(original[key]))
def check_concat_tuples(self, tuples, device, expected_device):
    """Verify shapes, devices and contents of converted ``tuples``.

    ``self.converter(tuples, device)`` must return one stacked array per
    tuple position, placed on ``expected_device``.
    """
    converted = self.converter(tuples, device)
    self.assertEqual(len(converted), len(tuples[0]))

    batch_size = len(tuples)
    for position, stacked in enumerate(converted):
        self.assertEqual(
            stacked.shape, (batch_size,) + tuples[0][position].shape)
        actual_device = backend.get_device_from_array(stacked)
        assert actual_device == expected_device

        # Compare on the CPU, one stacked row per input tuple.
        for row, original in zip(
                backend.CpuDevice().send(stacked), tuples):
            numpy.testing.assert_array_equal(
                row, backend.CpuDevice().send(original[position]))
def test_get_device_from_array(self, backend_config):
    """Both device lookups must map a CuPy array to its GPU device."""
    with cuda.Device(backend_config.cuda_device):
        arr = cuda.ndarray((), numpy.float32)
    # Test precondition check
    assert arr.device.id == backend_config.cuda_device

    expected = backend_config.device
    # Class-level lookup.
    assert backend.GpuDevice.from_array(arr) == expected
    # Generic lookup.
    assert backend.get_device_from_array(arr) == expected
def make_statistics(self):
    """Computes and returns the mean and standard deviation values.

    The mean is ``self._x / self._n`` and the standard deviation is the
    square root of ``self._x2 / self._n`` minus the squared mean.

    Returns:
        tuple: Mean and standard deviation values.

    """
    total = self._x
    count = self._n
    xp = backend.get_array_module(total)
    device = backend.get_device_from_array(total)
    with chainer.using_device(device):
        mean = total / count
        variance = self._x2 / count - mean * mean
        std = xp.sqrt(variance)
    return mean, std
def _concat_arrays(arrays, padding):
    """Stack ``arrays`` along a new leading axis.

    Args:
        arrays: Sequence of arrays, or of built-in values such as int,
            float or list.
        padding: ``None`` to concatenate directly, otherwise the value
            forwarded to ``_concat_arrays_with_padding``.

    Returns:
        The stacked array.
    """
    # Built-in types such as int, float or list are converted to a
    # numpy.ndarray first.
    if not isinstance(arrays[0], chainer.get_array_types()):
        arrays = numpy.asarray(arrays)

    if padding is not None:
        return _concat_arrays_with_padding(arrays, padding)

    device = backend.get_device_from_array(arrays[0])
    with chainer.using_device(device):
        expanded = [array[None] for array in arrays]
        return device.xp.concatenate(expanded)
def __call__(self, array):
    """Fill ``array`` in place with samples from U(-scale, scale).

    With ``self.rng`` set, the samples come from that generator and are
    copied into ``array`` with ``backend.copyto``; otherwise they are
    drawn with the random module of the array module that matches the
    device ``array`` lives on.
    """
    if self.dtype is not None:
        assert array.dtype == self.dtype,\
            '{} != {}'.format(array.dtype, self.dtype)

    if self.rng is not None:
        samples = self.rng.uniform(
            low=-self.scale, high=self.scale, size=array.shape)
        backend.copyto(array, samples.astype(array.dtype, copy=False))
        return

    xp = backend.get_device_from_array(array).xp
    array[...] = xp.random.uniform(
        low=-self.scale, high=self.scale, size=array.shape)
def _concat_arrays_with_padding(arrays, padding):
    """Stack arrays of differing shapes, filling gaps with ``padding``.

    The result has shape ``(len(arrays),) + max_shape`` where
    ``max_shape`` is the element-wise maximum of the input shapes.  Each
    input is written to the leading corner of its slot.

    Args:
        arrays: Sequence of arrays.
        padding: Fill value for positions not covered by an input.

    Returns:
        The padded, stacked array, created with the array module of the
        first input's device and with the first input's dtype.
    """
    max_shape = numpy.array(arrays[0].shape, dtype=int)
    for other in arrays[1:]:
        if numpy.any(max_shape != other.shape):
            # Update max_shape in place with the element-wise maximum.
            numpy.maximum(max_shape, other.shape, max_shape)
    batch_shape = tuple(numpy.insert(max_shape, 0, len(arrays)))

    device = backend.get_device_from_array(arrays[0])
    with chainer.using_device(device):
        padded = device.xp.full(
            batch_shape, padding, dtype=arrays[0].dtype)
        for i, src in enumerate(arrays):
            region = (i,) + tuple(slice(dim) for dim in src.shape)
            padded[region] = src

    return padded
def test_backward(self, src_backend_config, dst_backend_config):
    """Check the first-order gradient of ``functions.copy``.

    The gradient that reaches the input must live on the source device
    and equal ``self.gy``.
    """
    src_device = src_backend_config.device
    dst_device = dst_backend_config.device
    x_var = chainer.Variable(
        src_backend_config.get_array(self.x), requires_grad=True)
    gy = dst_backend_config.get_array(self.gy)

    y_var = functions.copy(x_var, dst_device)
    y_var.grad = gy
    y_var.backward()

    x_grad = x_var.grad
    assert x_var.grad_var.device == src_device
    grad_device = backend.get_device_from_array(x_grad)
    assert grad_device == src_device
    numpy.testing.assert_array_equal(
        _numpy_device.send(x_grad), self.gy)
def __call__(self, opt):
    """Scale all gradients of ``opt.target`` by ``threshold / norm``.

    The norm is the square root of ``_sum_sqnorm`` over the gradients of
    the target's parameters.  The scaling factor is capped at 1.
    """
    grads = [p.grad for p in opt.target.params(False)]
    sqnorm = _sum_sqnorm(grads)
    device = backend.get_device_from_array(sqnorm)
    with chainer.using_device(device):
        rate = self.threshold / device.xp.sqrt(sqnorm)
        # When no clipping is needed, skip the clipping on CPU and
        # multiply 1.0 on the device otherwise.
        if device.xp is not numpy:
            rate = rate.clip(None, 1)
        elif rate >= 1:
            return

    for param in opt.target.params(False):
        grad = param.grad
        with cuda.get_device_from_array(grad):
            grad *= rate
def __init__(
        self, func, x_data, y_grad, params, eps, atol, rtol, no_grads,
        dtype, detect_nondifferentiable):
    """Validate the arguments of a backward check and store them.

    Args:
        func: Callable under test.
        x_data: Input array or sequence of input arrays.
        y_grad: Output gradient array(s), or ``None``.
        params: Parameter or sequence of parameters.
        eps: Stored as an option for the numeric gradient.
        atol: Stored absolute tolerance.
        rtol: Stored relative tolerance.
        no_grads: Sequence with one flag per input, or ``None`` to flag
            every input whose dtype kind is not ``'f'``.
        dtype: ``None`` or a floating point dtype.
        detect_nondifferentiable: Stored as an option for the numeric
            gradient.

    Raises:
        ValueError: If ``dtype`` is not a float dtype or if ``no_grads``
            and the inputs differ in length.
        NotImplementedError: For ChainerX inputs combined with parameters
            or with any set ``no_grads`` flag.
    """
    dtype_is_float = dtype is None or numpy.dtype(dtype).kind == 'f'
    if not dtype_is_float:
        raise ValueError('`dtype` is allowed only float type')

    x_data = _as_tuple(x_data)
    y_grad = None if y_grad is None else _as_tuple(y_grad)
    params = _as_tuple(params)

    if no_grads is None:
        # By default only floating point inputs get a gradient.
        no_grads = [x.dtype.kind != 'f' for x in x_data]
    elif len(no_grads) != len(x_data):
        raise ValueError(
            'Length of no_grads param and xs should be same.\n'
            'Actual: {0} != {1}'.format(len(no_grads), len(x_data)))

    device = backend.get_device_from_array(*x_data)
    if device.xp is chainerx:
        if len(params) > 0:
            raise NotImplementedError(
                'gradient_check does not support params argument for '
                'ChainerX arrays')
        if any(no_grads):
            raise NotImplementedError(
                'gradient_check does not support no_grads argument for '
                'ChainerX arrays')

    self.device = device
    self.func = func
    self.x_data = x_data
    self.y_grad = y_grad
    self.params = params
    self.no_grads = no_grads
    self.atol = atol
    self.rtol = rtol

    # options for numeric gradients
    self.eps = eps
    self.dtype = dtype
    self.detect_nondifferentiable = detect_nondifferentiable
def _backward_chainerx(self, target_input_indexes, grad_outputs,
                       retained_inputs, retained_outputs):
    """Run backward on ChainerX arrays and return raw gradient arrays.

    Args:
        target_input_indexes: Non-empty sequence of input indexes whose
            gradients are computed.
        grad_outputs: Output gradients; each is ``None`` or a
            ``chainerx.ndarray``.
        retained_inputs: Retained input arrays.
        retained_outputs: Retained output arrays; entries may be ``None``.

    Returns:
        list: The first element of ``_data`` of each gradient variable
        returned by ``self._backward_target_inputs``.
    """
    # Backward wrapper that is called from C++ via a Python binding in case
    # self.apply was called with chainerx.ndarrays.
    # NOTE(review): the attribute below is spelled `_is_chainex_...`
    # (no "r"); confirm against the class definition whether this is the
    # intended name or a typo of `_is_chainerx_fallback_mode`.
    assert self._is_chainex_fallback_mode
    assert len(target_input_indexes) > 0
    # The retained arrays must match the indexes registered for retention.
    assert (
        (self._input_indexes_to_retain is None
         and len(retained_inputs) == 0)
        or (len(self._input_indexes_to_retain) == len(retained_inputs)))
    assert (
        (self._output_indexes_to_retain is None
         and len(retained_outputs) == 0)
        or (len(self._output_indexes_to_retain) == len(retained_outputs)))
    assert all([
        a is None or isinstance(a, chainerx.ndarray)
        for a in grad_outputs])

    # Wrap the retained arrays in variables and keep them on the node.
    self._chainerx_retained_inputs = tuple([
        variable.Variable(
            array, requires_grad=array.is_backprop_required())
        for array in retained_inputs])
    # Unlike the inputs, a retained output may be None here.
    self._chainerx_retained_outputs = tuple([
        variable.Variable(
            array, requires_grad=(
                False if array is None else array.is_backprop_required()))
        for array in retained_outputs])

    # NOTE(review): the concatenation assumes all three arguments are of
    # the same sequence type (e.g. tuples) -- confirm against the caller.
    device = backend.get_device_from_array(
        *(retained_inputs + retained_outputs + grad_outputs))
    with chainer.using_device(device):
        gxs = self._backward_target_inputs(
            tuple(target_input_indexes),
            tuple([
                None
                if gy is None
                else chainer.Variable(
                    gy, requires_grad=gy.is_backprop_required())
                for gy in grad_outputs]))

    # Unwrap the gradient variables back to arrays.
    gx_arrs = [gx._data[0] for gx in gxs]
    assert all([isinstance(gx, chainerx.ndarray) for gx in gx_arrs])
    return gx_arrs
def as_noncontiguous_array(a):
    """Return a non-contiguous array with the contents of ``a``.

    ``None`` and arrays with at most one element are returned as they
    are.  Otherwise a buffer twice as large along every axis is allocated
    on the device of ``a``, ``a`` is written to every other position and
    that strided view is returned.
    """
    if a is None or a.size <= 1:
        return a

    device = backend.get_device_from_array(a)
    every_other = (slice(None, None, 2),) * a.ndim
    doubled_shape = tuple(2 * extent for extent in a.shape)
    with chainer.using_device(device):
        buf = device.xp.empty(doubled_shape, dtype=a.dtype)
        buf[every_other] = a
        strided = buf[every_other]

    # ChainerX and NumPy-like arrays expose contiguity differently.
    if device.xp is chainerx:
        assert not strided.is_contiguous
    else:
        assert not strided.flags.c_contiguous
    return strided
def as_noncontiguous_array(a):
    """Return a non-contiguous array with the contents of ``a``.

    ``None`` and arrays with at most one element are returned as they
    are.  Otherwise a buffer twice as large along every axis is allocated
    on the device of ``a``, ``a`` is written to every other position and
    that strided view is returned.

    Args:
        a: Array to convert, or ``None``.

    Returns:
        ``a`` itself in the trivial cases, otherwise a strided view whose
        shape, dtype and values equal those of ``a``.
    """
    if a is None:
        return None
    if a.size <= 1:
        return a

    device = backend.get_device_from_array(a)
    xp = device.xp
    # Stride over every axis, not only the first one: with a leading axis
    # of length 1 (e.g. shape (1, 5)) a view strided along axis 0 alone is
    # still reported as C-contiguous by NumPy, which would trip the
    # assertion below.
    slices = (slice(None, None, 2),) * a.ndim
    with chainer.using_device(device):
        ret = xp.empty(tuple([s * 2 for s in a.shape]), dtype=a.dtype)
        ret[slices] = a
        ret = ret[slices]

    # ChainerX and NumPy-like arrays expose contiguity differently.
    if device.xp is chainerx:
        assert not ret.is_contiguous
    else:
        assert not ret.flags.c_contiguous

    return ret
def check_device(self, array, device, expected_device):
    """Assert that ``array`` has the type and device of ``expected_device``.

    ``device`` is accepted but not used by this check.
    """
    expected_type = expected_device.xp.ndarray
    self.assertIsInstance(array, expected_type)
    actual_device = backend.get_device_from_array(array)
    self.assertEqual(actual_device, expected_device)
def numerical_grad(
        f, inputs, grad_outputs, eps=1e-3,
        detect_nondifferentiable=False, diff_atol=0, diff_rtol=1e-2,
        center_outputs=None):
    """Computes numerical gradient by finite differences.

    This function is used to implement gradient check. For usage example, see
    unit tests of :mod:`chainer.functions`.

    By default, ``numerical_grad`` computes the gradient to the first order of
    ``eps``.

    Every element of every input array is perturbed in turn, ``f`` is
    re-evaluated, and the difference of the outputs weighted by
    ``grad_outputs`` is accumulated in float64. The input arrays are modified
    in place during the evaluation and each element is written back to its
    original value after ``f`` returns.

    Args:
        f (callable): Python function with no arguments that runs forward
            computation and returns the result.
        inputs (tuple of arrays): Tuple (or list) of arrays that should be
            treated as inputs. Each element of them is slightly modified to
            realize numerical gradient by finite differences. Their dtype
            must be of floating-point kind.
        grad_outputs (tuple of arrays or scalars): Tuple of arrays or scalars
            that are treated as output gradients. ``None`` entries are
            allowed; the corresponding outputs are skipped.
        eps (float): Epsilon value of finite differences. Must be positive.
        detect_nondifferentiable (bool):
            ``False`` by default.
            If ``True``, ``numerical_grad`` checks whether ``f`` is
            differentiable at ``inputs``.
            It requires evaluation of ``f`` at 5 points instead of 2.
            As a side effect, the accuracy of numerical gradient will be
            increased to the third order of ``eps``.
            If it turns out that ``f`` is non-differentiable at ``inputs``,
            ``numerical_grad`` raises
            :class:`~chainer.gradient_check.NondifferentiableError`.
        diff_atol (float):
            Absolute tolerance of fitting error of non-differentiable point
            detection.
        diff_rtol (float):
            Tolerance of fitting error of non-differentiable point detection
            relative to the output values of ``f``.
        center_outputs (tuple of arrays or None):
            Intended to supply the outputs of ``f`` at ``inputs`` when
            ``detect_nondifferentiable`` is ``True``, to save one evaluation.
            The current implementation discards this argument (it is
            unconditionally reset to ``None``), so the center outputs are
            always recomputed by calling ``f``.

    Returns:
        list: Numerical gradient arrays corresponding to ``inputs``, each
        cast to the dtype of the matching input.

    Raises:
        RuntimeError: If an input array is not of floating-point kind, or if
            ``chainer.is_arrays_compatible`` rejects the combination of
            ``inputs`` and non-scalar ``grad_outputs``.
        NondifferentiableError: If ``detect_nondifferentiable`` is ``True``
            and a non-differentiable point is detected.

    """
    # TODO(niboshi): Deprecate `center_outputs` argument.
    # If dtype of this argument is not float64, often the resolution is
    # insufficient for numerical gradient calculation. We might use it only
    # when its dtype is float64, but it would be better to simply remove it.
    # Until then the caller-supplied value is deliberately thrown away.
    center_outputs = None

    assert eps > 0
    assert isinstance(inputs, (tuple, list))
    for x in inputs:
        if x.dtype.kind != 'f':
            raise RuntimeError(
                'The dtype of input arrays must be kind of float')

    inputs = tuple(inputs)
    # Cast grad_outputs to float64
    # (None entries are kept as None; scalars become numpy.float64.)
    grad_outputs = tuple([
        None if g is None
        else numpy.float64(g) if numpy.isscalar(g)
        else g.astype(numpy.float64)
        for g in grad_outputs])

    # Scalars carry no device information, so they are excluded from the
    # compatibility check.
    if not chainer.is_arrays_compatible(
            [a for a in inputs + grad_outputs if not numpy.isscalar(a)]):
        raise RuntimeError('Do not mix GPU and CPU arrays in `numerical_grad`')

    device = backend.get_device_from_array(*(inputs + grad_outputs))
    xp = device.xp

    if xp is cuda.cupy:
        # Reduction kernels used on the CuPy path. Per the expression strings,
        # each one sums the gy-weighted finite-difference numerator over all
        # output elements and adds the sum, divided by the stencil's
        # denominator, to the output element `gxi`.
        numerical_grad_kernel_1 = cuda.reduce(
            'T y1, T y2, U gy, T eps', 'V gxi',
            '(y1 - y2) * gy', 'a + b', 'gxi += a / (eps * 2)', '0',
            'numerical_grad_kernel_1'
        )
        numerical_grad_kernel_3 = cuda.reduce(
            'T y1, T y2, T y3, T y4, U gy, T eps', 'V gxi',
            '(-y1 + 8 * y2 - 8 * y3 + y4) * gy',
            'a + b', 'gxi += a / (eps * 6)', '0',
            'numerical_grad_kernel_3'
        )

    # Gradient accumulators are always float64, regardless of the input dtype;
    # they are cast back to the input dtype only at the very end.
    if xp is chainerx:
        grads = [
            xp.zeros(x.shape, numpy.float64, device=x.device) for x in inputs]
    else:
        grads = [xp.zeros(x.shape, numpy.float64) for x in inputs]

    if detect_nondifferentiable:
        # Outputs at the unperturbed inputs; used as the middle sample of the
        # 5-point stencil. `center_outputs` is always None at this point (see
        # above), so the first branch is the one that runs.
        if center_outputs is None:
            ys0 = _copy_arrays(f())
        else:
            ys0 = center_outputs
        nout = len(ys0)
        shapes = [_.shape for _ in ys0]
        sizes = numpy.array([_.size for _ in ys0])
        # Cumulative sizes locate each output inside the flattened,
        # concatenated residual vector built in `iterate_single_input`.
        cumsizes = numpy.cumsum(sizes)

    # Evaluate func at a single input
    # Writes `orig + delta` into x[i], evaluates `f`, checks that the outputs
    # are consistent with `grad_outputs`, then writes `orig` back into x[i].
    # NOTE(review): x[i] is not restored if `f` or an assertion raises.
    def eval_func(x, i, delta, orig):
        x[i] = orig + delta
        y = _copy_arrays(f())
        assert len(y) == len(grad_outputs)
        # A None output must be paired with a None output gradient.
        assert all([
            gy is None
            for y_, gy in zip(y, grad_outputs)
            if y_ is None])
        # Array output gradients must match the output shape.
        assert all([
            gy is None or numpy.isscalar(gy) or y_.shape == gy.shape
            for y_, gy in zip(y, grad_outputs)])
        x[i] = orig
        return y

    # An iteration on a single input displacement
    # `i_in` is the position of `x` in `inputs` (used only in error messages),
    # `orig_x` is a copy of `x` holding the unperturbed values and `i` is the
    # index of the element being perturbed. The result is accumulated into
    # `gx[i]`, where `gx` is the loop variable of the driver loop at the
    # bottom of the enclosing function (captured by closure).
    def iterate_single_input(i_in, x, orig_x, i):
        orig = orig_x[i]
        # `yss` holds a list of output arrays for each of 2 or 5 sampling
        # points.
        if detect_nondifferentiable:
            yss = [
                eval_func(x, i, -eps * 1., orig),
                eval_func(x, i, -eps * .5, orig),
                ys0,
                eval_func(x, i, +eps * .5, orig),
                eval_func(x, i, +eps * 1., orig),
            ]
        else:
            yss = [
                eval_func(x, i, -eps * 1, orig),
                eval_func(x, i, +eps * 1, orig),
            ]

        if detect_nondifferentiable:
            # Detect non-differentiable point by quadratic fitting

            # Check for non-finite output.
            # If any single element in the output arrays has different
            # finiteness among sampled points, that means this is a
            # non-differentiable point.
            # If the function consistently generates non-finite values
            # around the point, we do not treat the point as
            # non-differentiable.
            # (Example: x<0 region for the logarithm function)
            any_nonfinite = False
            for i_out in range(nout):
                isfinites = [xp.isfinite(ys[i_out]) for ys in yss]
                # The `i` inside this generator expression is local to it and
                # does not rebind the element index `i` of the enclosing
                # function, which is what the message below reports.
                if any((isfinites[0] != isfinites[i]).any()
                       for i in range(1, len(yss))):
                    s = six.StringIO()
                    s.write(
                        'Tried to compute the numeric gradient on a '
                        'non-differentiable point.\n\n')
                    s.write('i_in: {}\n'.format(i_in))
                    s.write('i_out: {}\n'.format(i_out))
                    s.write('x: {}\n'.format(inputs[i_in]))
                    s.write('index on x: {}\n'.format(i))
                    s.write('eps: {}\n'.format(eps))
                    s.write('y[x-eps ]: {}\n'.format(yss[0][i_out]))
                    s.write('y[x-eps/2]: {}\n'.format(yss[1][i_out]))
                    s.write('y[x ]: {}\n'.format(yss[2][i_out]))
                    s.write('y[x+eps/2]: {}\n'.format(yss[3][i_out]))
                    s.write('y[x+eps ]: {}\n'.format(yss[4][i_out]))
                    raise NondifferentiableError(s.getvalue())

                any_nonfinite |= not all((_).all() for _ in isfinites)

            # The quadratic fit is skipped entirely when any sample contains
            # a non-finite value.
            if not any_nonfinite:
                # Stack flattened outputs to make (5, *)-shaped 2D array
                ystack = xp.vstack(
                    [xp.hstack([y.ravel() for y in ys]) for ys in yss])
                assert ystack.ndim == 2 and ystack.shape[0] == len(yss)
                # Fit to quadratic
                # The fit itself always runs with NumPy on the host; the
                # samples are equally spaced, so 0..len(yss)-1 serve as the
                # abscissae.
                if xp is not numpy:
                    ystack = _cpu._to_cpu(ystack)
                polyfit = numpy.polynomial.polynomial.polyfit
                # With full=True, polyfit also returns diagnostic values, the
                # first of which is the sum of squared residuals per column.
                _, (residuals, _, _, _) = polyfit(
                    range(len(yss)), ystack, deg=2, full=True)
                if xp is not numpy:
                    residuals = device.send(residuals)
                # Root-mean-square fitting error per flattened output element.
                residuals = xp.sqrt(residuals / len(yss))

                # Check for error for each output array
                for i_out in range(nout):
                    size = sizes[i_out]
                    cumsize = cumsizes[i_out]
                    shape = shapes[i_out]
                    # TODO(niboshi): The following two lines could be
                    # rewritten using xp.stack, which is supported in
                    # NumPy>=1.10
                    ymax = xp.concatenate(
                        [ys[i_out][None] for ys in yss]).max(axis=0)
                    ymin = xp.concatenate(
                        [ys[i_out][None] for ys in yss]).min(axis=0)
                    # Restore the shape of flattened residual
                    res = residuals[cumsize - size:cumsize]
                    res = res.reshape(shape)
                    # An element is flagged when its fitting error exceeds
                    # the tolerance scaled by the spread of sampled values.
                    det = utils.force_array(
                        diff_atol + diff_rtol * (ymax - ymin) < res)
                    # Constant output = not nondifferentiable
                    det[ymax == ymin] = False
                    if det.any():
                        s = six.StringIO()
                        s.write(
                            'Tried to compute the numeric gradient on a '
                            'non-differentiable point.\n\n')
                        s.write('i_in: {}\n'.format(i_in))
                        s.write('i_out: {}\n'.format(i_out))
                        s.write('x: {}\n'.format(inputs[i_in]))
                        s.write('index on x: {}\n'.format(i))
                        s.write('eps: {}\n'.format(eps))
                        s.write('diff_rtol: {}\n'.format(diff_rtol))
                        s.write('diff_atol: {}\n'.format(diff_atol))
                        s.write('ymax: {}\n'.format(ymax))
                        s.write('ymin: {}\n'.format(ymin))
                        s.write(
                            'diff_atol + diff_rtol * (ymax-ymin): {}\n'.format(
                                diff_atol + diff_rtol * (ymax - ymin)))
                        s.write('fitting errors: {}\n'.format(res))
                        s.write('y[x-eps ]: {}\n'.format(yss[0][i_out]))
                        s.write('y[x-eps/2]: {}\n'.format(yss[1][i_out]))
                        s.write('y[x ]: {}\n'.format(yss[2][i_out]))
                        s.write('y[x+eps/2]: {}\n'.format(yss[3][i_out]))
                        s.write('y[x+eps ]: {}\n'.format(yss[4][i_out]))
                        raise NondifferentiableError(s.getvalue())

        # Calculate numerical gradient
        for i_out, gy in enumerate(grad_outputs):
            if gy is None:
                continue
            if not numpy.isscalar(gy):
                gy = gy.astype(numpy.float64, copy=False)
            # The reduction kernels are used only when running on CuPy and
            # every sample of this output is a cuda.ndarray; otherwise the
            # plain array expression below is used.
            gpu_ = (xp is cuda.cupy and
                    all(isinstance(ys[i_out], cuda.ndarray)
                        for ys in yss))
            # If any output sample is None, all others must be.
            assert all([
                (yss[0][i_out] is None) == (yss[j][i_out] is None)
                for j in range(len(yss))])
            # If outputs samples are None, the part of numeric gradient for
            # this output is considered as zero: skip the accumulation.
            if yss[0][i_out] is None:
                continue

            if len(yss) == 2:  # 1st order
                # Central difference: (y(+eps) - y(-eps)) / (2 * eps).
                y0 = yss[0][i_out]
                y1 = yss[1][i_out]
                if gpu_:
                    # NOTE(review): gy is wrapped with xp.asarray here but is
                    # passed as-is to the 3rd-order kernel below - confirm
                    # whether the difference is intentional.
                    numerical_grad_kernel_1(
                        y1, y0, xp.asarray(gy), eps, gx[i])
                else:
                    dot = ((y1 - y0) * gy).sum()
                    gx[i] = gx[i] + dot / (2 * eps)
            elif len(yss) == 5:  # 3rd order
                # Stencil over the samples at -eps, -eps/2, +eps/2 and +eps
                # (the center sample yss[2] is not used):
                # (-y(+eps) + 8 y(+eps/2) - 8 y(-eps/2) + y(-eps)) / (6 eps).
                y0 = yss[0][i_out]
                y1 = yss[1][i_out]
                y2 = yss[3][i_out]
                y3 = yss[4][i_out]
                if gpu_:
                    numerical_grad_kernel_3(
                        y3, y2, y1, y0, gy, eps, gx[i])
                else:
                    num = -y3 + 8 * y2 - 8 * y1 + y0
                    dot = (num * gy).sum()
                    gx[i] = gx[i] + dot / (6 * eps)
            else:
                assert False

    # Calculate numeric gradient
    # The 'type_check' configuration is switched off for the duration of the
    # repeated evaluations of `f`.
    with configuration.using_config('type_check', False):
        for i_in, (x, gx) in enumerate(six.moves.zip(inputs, grads)):
            orig_x = x.copy()  # hold original value
            for i in numpy.ndindex(x.shape):
                iterate_single_input(i_in, x, orig_x, i)

    return [g.astype(x.dtype, copy=False)
            for g, x in six.moves.zip(grads, inputs)]
def forward(self, x, y):
    """Record the call and report the summed inputs as ``'loss'``.

    The ``(x, y)`` pair is appended to ``self.args``. Then, with the device
    resolved from ``x`` and ``y`` made current, ``x.sum() + y.sum()`` is
    reported under the key ``'loss'`` with ``self`` as the observer.
    """
    call_record = (x, y)
    self.args.append(call_record)

    device = backend.get_device_from_array(x, y)
    with chainer.using_device(device):
        # The sums are evaluated inside the device context on purpose.
        total = x.sum() + y.sum()
        chainer.report({'loss': total}, self)
def check_unrecognized(self, arg):
    """Assert that the device resolved for ``arg`` equals a ``CpuDevice``."""
    resolved = backend.get_device_from_array(arg)
    cpu_device = backend.CpuDevice()
    assert resolved == cpu_device
def backward(self, indexes, grad_outputs):
    """Computes gradients with respect to the retained inputs.

    The retained inputs are unpacked as ``(x, W, gy)`` and ``grad_outputs``
    as ``(ggx, _, ggW)``; the middle entry of ``grad_outputs`` is not used.
    NOTE(review): judging by the names (``samples``, ``sample_size``,
    ``pos_neg_mask``) this looks like the second-order backward of a
    negative-sampling loss - confirm against the enclosing class.

    Args:
        indexes: Collection of input indexes for which a gradient is
            requested. ``0`` selects ``x``, ``1`` selects ``W`` and ``2``
            selects ``gy``.
        grad_outputs: Sequence of three items, the gradients with respect to
            the outputs of this function.

    Returns:
        list: The gradients for the requested indexes, in the fixed order
        ``gx`` (if ``0`` requested), ``gW`` (if ``1``), ``ggy`` (if ``2``).

    """
    x, W, gy = self.get_retained_inputs()

    device = backend.get_device_from_array(x.data)
    xp = device.xp

    # Zero-initialized accumulators, created only for requested gradients.
    if 0 in indexes:
        gx = chainer.Variable(xp.zeros_like(x.data))
    if 1 in indexes:
        gW = chainer.Variable(xp.zeros_like(W.data))
    if 2 in indexes:
        ggy = chainer.Variable(xp.zeros_like(gy.data))

    ggx, _, ggW = grad_outputs

    # Sign mask of length sample_size + 1: -1 in the first slot, +1 elsewhere.
    # Presumably slot 0 is the positive sample and the rest are negative
    # samples - TODO confirm.
    pos_neg_mask = xp.ones(self.sample_size + 1)
    pos_neg_mask[0] *= -1

    # NOTE(review): the loop is assumed to sit outside this `with` block;
    # the indentation was lost in the source, so confirm.
    with chainer.using_device(device):
        arange = xp.arange(len(self.ignore_mask))
    # Only the positions selected by `ignore_mask` are visited (assumes
    # `ignore_mask` is a boolean mask - TODO confirm).
    for i in arange[self.ignore_mask]:
        # Partial forward pass to obtain intermediate `Variable`s
        ix = x[i]
        k = self.samples[i]

        # With reduce == 'sum' the whole `gy` is used for every position;
        # otherwise the i-th entry is used.
        if self.reduce == 'sum':
            igy = gy
        else:
            igy = gy[i]

        w = W[k]
        f = chainer.functions.flatten(
            chainer.functions.matmul(w, ix[:, None])) * pos_neg_mask
        sigf = chainer.functions.sigmoid(f)
        g = chainer.functions.broadcast_to(igy, f.shape) * sigf \
            * pos_neg_mask

        # Terms driven by ggW (dgW_*) and by ggx (dgx_*). Note that dgW_dg is
        # sign-masked right away, whereas dgx_dg is masked only later, in the
        # `2 in indexes` branch, after dgx_df has been derived from it.
        dgW_dg = chainer.functions.flatten(
            chainer.functions.matmul(ggW[k], ix[:, None])) * pos_neg_mask
        dgW_df = chainer.functions.broadcast_to(igy, f.shape) \
            * _sigmoid_grad(f, sigf, dgW_dg) * pos_neg_mask
        dgx_dg = chainer.functions.flatten(
            chainer.functions.matmul(ggx[i][None, :], w, transb=True))
        dgx_df = chainer.functions.broadcast_to(igy, f.shape) \
            * _sigmoid_grad(f, sigf, dgx_dg)

        if 0 in indexes:
            # derivative of gx
            dgx = chainer.functions.matmul(w, dgx_df[:, None], transa=True)

            # derivative of gW
            dgx += chainer.functions.matmul(g[None, :], ggW[k]).T
            dgx += chainer.functions.matmul(
                w, dgW_df[:, None], transa=True)
            # Accumulate the contribution of this position into row i of gx.
            gx = chainer.functions.scatter_add(
                gx, i, chainer.functions.flatten(dgx))

        if 1 in indexes:
            # derivative of gx
            shape = ggx[i].shape
            # One scatter_add per sampled row index `ik` of W.
            for ik, ig, idgx_df in six.moves.zip(k, g, dgx_df):
                ig = chainer.functions.broadcast_to(ig, shape)
                idgx_df = chainer.functions.broadcast_to(idgx_df, shape)
                gW = chainer.functions.scatter_add(
                    gW, ik, ig * ggx[i] + idgx_df * ix)

            # derivative of gW
            gW = chainer.functions.scatter_add(
                gW, k,
                chainer.functions.matmul(dgW_df[:, None], ix[None, :]))

        if 2 in indexes:
            # Apply the sign mask to dgx_dg only now (see the note above).
            dgx_dg *= pos_neg_mask
            dggy = chainer.functions.sum((dgx_dg + dgW_dg) * sigf)
            if self.reduce == 'sum':
                ggy += dggy
            else:
                ggy = chainer.functions.scatter_add(ggy, i, dggy)

    # Collect the requested gradients in index order.
    ret = []
    if 0 in indexes:
        ret.append(gx)
    if 1 in indexes:
        ret.append(gW)
    if 2 in indexes:
        ret.append(ggy)
    return ret