def setUp(self):
    """Prepare a fresh async memory pool, a dedicated stream, and clean device state."""
    pool = memory.MemoryAsyncPool()
    strm = stream_module.Stream()
    self.pool = pool
    # Allocation granularity used by size-rounding assertions in the tests.
    self.unit = memory._allocation_unit_size
    self.stream = strm
    # Raw stream pointer; presumably used as the pool's per-stream key — TODO confirm.
    self.stream_ident = strm.ptr
    # Start from a clean slate so earlier tests cannot influence this one:
    # drop cached blocks from the default pool and drain pending device work.
    cupy.get_default_memory_pool().free_all_blocks()
    cupy.cuda.Device().synchronize()
def setUp(self):
    """Skip when device 0 lacks CUDA memory-pool support, then build the fixture."""
    supported = cupy.cuda.runtime.deviceGetAttribute(
        cupy.cuda.runtime.cudaDevAttrMemoryPoolsSupported, 0)
    if not supported:
        pytest.skip('malloc_async is not supported on device 0')
    pool = memory.MemoryAsyncPool()
    strm = stream_module.Stream()
    self.pool = pool
    # Allocation granularity used by size-rounding assertions in the tests.
    self.unit = memory._allocation_unit_size
    self.stream = strm
    # Raw stream pointer; presumably used as the pool's per-stream key — TODO confirm.
    self.stream_ident = strm.ptr
    # Clean slate: release cached blocks and wait for outstanding device work.
    cupy.get_default_memory_pool().free_all_blocks()
    cupy.cuda.Device().synchronize()
# NOTE(review): fragment of a cuTENSOR reduction benchmark — `a`, `extent`,
# `mode_a`, `mode_c` and `dtype` are defined before this span (not visible here).
c = cupy.random.random([extent[i] for i in mode_c])
a = a.astype(dtype)
c = c.astype(dtype)
desc_a = cutensor.create_tensor_descriptor(a)
desc_c = cutensor.create_tensor_descriptor(c)
alpha = 1.0
beta = 0.1
# Rehearsal run: warms up kernels/plans so the timed run below is not
# skewed by one-time compilation or setup cost.
c = cutensor.reduction(alpha, a, desc_a, mode_a, beta, c, desc_c, mode_c)
ev_start = stream.Event()
ev_end = stream.Event()
st = stream.Stream()
with st:
    # Measurement: bracket the reduction with events on a dedicated stream.
    ev_start.record()
    c = cutensor.reduction(alpha, a, desc_a, mode_a, beta, c, desc_c, mode_c)
    ev_end.record()
st.synchronize()
elapsed_ms = stream.get_elapsed_time(ev_start, ev_end)
# Bytes moved: read `a` and write `c`; a nonzero beta also reads `c`
# (the output is accumulated into), adding one more pass over `c`.
transfer_byte = a.size * a.itemsize + c.size * c.itemsize
if beta != 0.0:
    transfer_byte += c.size * c.itemsize
# Effective bandwidth in GB/s (bytes / ms / 1e6); presumably printed past
# this span — `gbs` is unused in the visible code.
gbs = transfer_byte / elapsed_ms / 1e6
print('dtype: {}'.format(numpy.dtype(dtype).name))
print('time (ms): {}'.format(elapsed_ms))
def setUp(self):
    """Build a single-device pool backed by the mock allocator, plus a stream."""
    strm = stream_module.Stream()
    # Pool under test uses the mock allocator so no real device memory is taken.
    self.pool = memory.SingleDeviceMemoryPool(allocator=mock_alloc)
    # Allocation granularity used by size-rounding assertions in the tests.
    self.unit = memory._allocation_unit_size
    self.stream = strm
    self.stream_ptr = strm.ptr
def setUp(self):
    """Create a stream only when this test case exercises streams; else None."""
    if self.use_streams:
        self.stream = stream_module.Stream()
    else:
        self.stream = None
def _get_stream(strm): if strm is None: return stream.Stream(null=True) else: return strm