def test_cupy_array_async1(self):
    x = cuda.to_gpu(self.x)
    if not self.c_contiguous:
        x = cuda.cupy.asfortranarray(x)
    y = cuda.to_cpu(x, stream=cuda.Stream(null=True))
    self.assertIsInstance(y, numpy.ndarray)
    cuda.cupy.testing.assert_array_equal(self.x, y)
def __call__(self, batch, device=None, padding=None):
    """Concatenate data and transfer them to GPU asynchronously.

    See also :func:`chainer.dataset.concat_examples`.

    Args:
        batch (list): A list of examples.
        device (int): Device ID to which each array is sent.
        padding: Scalar value for extra elements.

    Returns:
        Array, a tuple of arrays, or a dictionary of arrays.
        The type depends on the type of each example in the batch.

    """
    if len(batch) == 0:
        raise ValueError('batch is empty')
    first_elem = batch[0]

    if len(self._conveyor) == 0:
        self._device = device  # device is set at first call
        if device is not None and device >= 0 and self._stream is None:
            with cuda.get_device_from_id(device):
                self._stream = cuda.Stream(non_blocking=True)
    if device is not self._device:
        raise ValueError('device is different')

    with cuda.get_device_from_id(device):
        if isinstance(first_elem, tuple):
            result = []
            if not isinstance(padding, tuple):
                padding = [padding] * len(first_elem)

            for i in six.moves.range(len(first_elem)):
                self._conveyor[i].put(_concat_arrays(
                    [example[i] for example in batch], padding[i]))

            for i in six.moves.range(len(first_elem)):
                result.append(self._conveyor[i].get())

            return tuple(result)

        elif isinstance(first_elem, dict):
            result = {}
            if not isinstance(padding, dict):
                padding = {key: padding for key in first_elem}

            for key in first_elem:
                self._conveyor[key].put(_concat_arrays(
                    [example[key] for example in batch], padding[key]))

            for key in first_elem:
                result[key] = self._conveyor[key].get()

            return result

        else:
            return to_device(device, _concat_arrays(batch, padding))
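# Usage sketch (an assumption, not part of the snippet above): because this
# __call__ follows the converter protocol, an instance of its class can be
# used wherever chainer.dataset.concat_examples is accepted, e.g. as the
# `converter` argument of a trainer updater. The batch shapes and the
# `converter` placeholder below are illustrative only.
import numpy as np
from chainer.dataset import concat_examples

batch = [(np.random.rand(3, 4).astype(np.float32), np.int32(i))
         for i in range(8)]
xs, ts = concat_examples(batch, device=-1)  # same layout, synchronous, on CPU
# converter = ...  # instance of the class that defines the __call__ above
# xs_gpu, ts_gpu = converter(batch, device=0)  # concatenates, then copies to
#                                              # GPU 0 on a non-blocking stream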
def test_cupy_array_async1(self):
    x = cuda.to_gpu(self.x)
    if not self.c_contiguous:
        x = cuda.cupy.asfortranarray(x)
    y = cuda.to_gpu(x, stream=cuda.Stream())
    self.assertIsInstance(y, cuda.ndarray)
    self.assertIs(x, y)  # Do not copy
    cuda.cupy.testing.assert_array_equal(x, y)
def test_cupy_array_async2(self):
    x = cuda.to_gpu(self.x, device=0)
    with x.device:
        if not self.c_contiguous:
            x = cuda.cupy.asfortranarray(x)
    y = cuda.to_gpu(x, device=1, stream=cuda.Stream(null=True))
    self.assertIsInstance(y, cuda.ndarray)
    self.assertIsNot(x, y)  # Do copy
    cuda.cupy.testing.assert_array_equal(x, y)
def forward(self, inputs):
    x = inputs[0].copy()  # make sure data is aligned
    xp = cuda.get_array_module(x)
    alphabet_size = x.shape[2]
    label_lengths = np.asarray(
        [len(l.flatten()) for l in self.labels], dtype=np.intc)
    seq_lengths = np.asarray(self.seq_lengths, dtype=np.intc)
    ws_size = np.zeros(1, dtype=np.intc)
    if xp is np:
        # CPU path: query the workspace size, then compute the CTC loss and
        # gradients with the warp-ctc CPU kernels.
        warp_ctc.ctc_get_workspace_size_cpu(
            label_lengths.ctypes.data, seq_lengths.ctypes.data,
            alphabet_size, x.shape[1], ws_size.ctypes.data)
        self.gradients = np.zeros_like(x)
        ws = np.empty(ws_size // 4, dtype=np.float32)
        loss = np.zeros(len(self.seq_lengths), dtype=np.float32)
        labels = np.concatenate([l.flatten() for l in self.labels])
        warp_ctc.ctc_compute_ctc_loss_cpu(
            x.ctypes.data, self.gradients.ctypes.data,
            labels.ctypes.data, label_lengths.ctypes.data,
            seq_lengths.ctypes.data, alphabet_size, x.shape[1],
            loss.ctypes.data, ws.ctypes.data, 1)
    else:
        # GPU path: the workspace and gradients live on the device, and the
        # warp-ctc kernels are launched on a dedicated stream.
        stream = cuda.Stream(null=True)
        warp_ctc.ctc_get_workspace_size_gpu(
            label_lengths.ctypes.data, seq_lengths.ctypes.data,
            alphabet_size, x.shape[1], ws_size.ctypes.data, stream.ptr)
        self.gradients = cuda.cupy.zeros_like(x)
        ws = cuda.cupy.empty(ws_size // 4, dtype=np.float32)
        loss = np.zeros(len(self.seq_lengths), dtype=np.float32)
        labels = np.concatenate([l.flatten() for l in self.labels])

        def _ctc():
            warp_ctc.ctc_compute_ctc_loss_gpu(
                x.data.ptr, self.gradients.data.ptr,
                labels.ctypes.data, label_lengths.ctypes.data,
                seq_lengths.ctypes.data, alphabet_size, x.shape[1],
                loss.ctypes.data, ws.data.ptr, stream.ptr)

        try:
            _ctc()
        except Exception as e:
            # Retry once after releasing cached GPU memory; if the retry also
            # fails, re-raise the original error.
            cuda.memory_pool.free_all_free()
            try:
                _ctc()
            except Exception:
                raise e
    if np.any(np.isnan(loss)):
        raise ValueError
    score = xp.full((1,), xp.mean(loss), dtype=np.float32)
    return score,
def test_cupy_array_async3(self):
    with cuda.Device(0):
        x = cuda.to_gpu(self.x)
        if not self.c_contiguous:
            x = cuda.cupy.asfortranarray(x)
    with cuda.Device(1):
        y = cuda.to_gpu(x, stream=cuda.Stream(null=True))
    self.assertIsInstance(y, cuda.ndarray)
    self.assertIsNot(x, y)  # Do copy
    cuda.cupy.testing.assert_array_equal(x, y)
def test_get_and_add_callback(self):
    N = 100
    cupy_arrays = [testing.shaped_random((2, 3)) for _ in range(N)]

    stream = cuda.Stream(null=True)
    out = []
    for i in range(N):
        numpy_array = cupy_arrays[i].get(stream=stream)
        stream.add_callback(
            lambda _, __, t: out.append(t[0]),
            (i, numpy_array))

    stream.synchronize()
    self.assertEqual(out, list(range(N)))
def __call__(self, batch, device=None):
    assert len(batch) != 0, 'batch is empty'
    first_elem = batch[0]

    if len(self._conveyor) == 0:
        self._device = device
        if device is not None and device >= 0 and self._stream is None:
            with cuda.get_device_from_id(device):
                self._stream = cuda.Stream(non_blocking=True)
    assert device is self._device, 'device is different'

    with cuda.get_device_from_id(device):
        if isinstance(first_elem, tuple):
            I, J = len(first_elem), len(batch)
            result = [[] for i in range(I)]
            for i in range(I):
                for j in range(J):
                    self._conveyor[j * J + i].put(batch[j][i])
            for i in range(I):
                for j in range(J):
                    result[i].append(self._conveyor[j * J + i].get())
            return tuple(result)

    assert False, 'Not supported'
def test_numpy_array_async(self):
    y = cuda.to_cpu(self.x, stream=cuda.Stream())
    self.assertIsInstance(y, numpy.ndarray)
    self.assertIs(self.x, y)  # Do not copy
def test_numpy_array_async3(self):
    with cuda.Device(1):
        y = cuda.to_gpu(self.x, stream=cuda.Stream(null=True))
    self.assertIsInstance(y, cuda.ndarray)
    cuda.cupy.testing.assert_array_equal(self.x, y)
    self.assertEqual(int(y.device), 1)
def test_numpy_array_async(self):
    y = cuda.to_gpu(self.x, stream=cuda.Stream(null=True))
    self.assertIsInstance(y, cuda.ndarray)
    cuda.cupy.testing.assert_array_equal(self.x, y)