def test_record_stream_cuda(self, cuda_sleep):
    """Check that a block recorded on a stream is not reallocated too early.

    The tensor's address must not be handed out again while the recording
    stream still has queued work, and must be handed out again once that
    stream has been waited on.
    """
    device = torch.device('cuda')

    # Allocate on a dedicated stream so the check is reliable: per the
    # original note, the allocator does not reuse free blocks that were
    # allocated from another stream.
    alloc_stream = new_stream(device)
    with torch.cuda.stream(alloc_stream):
        tensor = torch.rand(1, device=device)

    # Record the tensor on a second stream and queue work on that stream
    # through the 'cuda_sleep' fixture.
    busy_stream = new_stream(device)
    record_stream(tensor, busy_stream)
    with use_stream(busy_stream):
        cuda_sleep(0.5)

    # Remember the address, then drop the only Python reference. The block
    # is still needed by 'busy_stream', so the next allocation on
    # 'alloc_stream' must land somewhere else.
    original_ptr = tensor.data_ptr()
    del tensor
    alloc_stream.synchronize()
    with torch.cuda.stream(alloc_stream):
        early = torch.rand(1, device=device)
    assert early.data_ptr() != original_ptr

    # Block Python until 'busy_stream' has finished its queued tasks. After
    # that the original block is free and is expected to be reused.
    wait_stream(CPUStream, busy_stream)
    with torch.cuda.stream(alloc_stream):
        late = torch.rand(1, device=device)
    assert late.data_ptr() == original_ptr
def test_record_stream_shifted_view(self, cuda_sleep):
    """Check record_stream on a view whose data pointer is offset.

    Regression test for https://github.com/pytorch/pytorch/issues/27366.
    """
    device = torch.device('cuda')

    alloc_stream = new_stream(device)
    with torch.cuda.stream(alloc_stream):
        base = torch.rand(2, device=device)

    # The slice starts one element in, so its data pointer lies past the
    # start of the base tensor.
    shifted = base[1:]
    assert shifted.data_ptr() > base.data_ptr()

    # Queue work on a second stream, then record only the shifted view on it.
    busy_stream = new_stream(device)
    with use_stream(busy_stream):
        cuda_sleep(0.5)
    record_stream(shifted, busy_stream)

    # Drop both Python references while keeping the base address around.
    base_ptr = base.data_ptr()
    del base, shifted

    # A fresh allocation on the allocating stream must not land on the
    # address the base tensor occupied.
    alloc_stream.synchronize()
    with torch.cuda.stream(alloc_stream):
        fresh = torch.rand(2, device=device)
    assert fresh.data_ptr() != base_ptr
def _ensure_copy_streams(self) -> List[List[AbstractStream]]:
    """Return the cached copy streams, creating them on the first call.

    When ``self._copy_streams`` is empty, one list of ``self.chunks`` new
    streams is appended for each device in ``self.devices``. Later calls
    return the same list without creating anything.

    Caching is worthwhile even though PyTorch already manages a pool of
    pre-allocated CUDA streams, because it may reduce GPU memory
    fragmentation when the number of micro-batches is small.
    """
    # Already populated: hand back the cache untouched.
    if self._copy_streams:
        return self._copy_streams

    for device in self.devices:
        per_device = [new_stream(device) for _ in range(self.chunks)]
        self._copy_streams.append(per_device)

    return self._copy_streams
def __init__(
    self,
    batches: List[Batch],
    partitions: List[nn.Sequential],
    devices: Optional[List[torch.device]] = None,
    checkpoint_stop: int = 0,
) -> None:
    """Store the inputs and create one copy stream per (device, batch).

    Args:
        batches: batches to keep on the instance; one stream per device is
            created for each of them.
        partitions: partitions to keep on the instance.
        devices: devices to create streams for. When ``None``, one CPU
            device is used for each partition.
        checkpoint_stop: stored unchanged on the instance.
    """
    self.batches = batches
    self.partitions = partitions

    # Default to a CPU device per partition when no devices are supplied.
    self.devices = (
        devices
        if devices is not None
        else [torch.device('cpu') for _ in partitions]
    )

    # NOTE(sublee): We don't need to manage a pool of CUDA streams because
    # PyTorch already manages it.
    # See https://github.com/pytorch/pytorch/pull/9938
    stream_grid = []
    for device in self.devices:
        stream_grid.append([new_stream(device) for _ in self.batches])
    self.copy_streams = stream_grid

    self.checkpoint_stop = checkpoint_stop
def test_copy_wait_cuda_cuda(cuda_sleep):
    """Run the shared copy/wait check from the current CUDA stream to a new one."""
    device = torch.device('cuda')
    upstream = current_stream(device)
    downstream = new_stream(device)
    _test_copy_wait(upstream, downstream, cuda_sleep)
def test_wait_stream_cpu_cuda(self, cuda_sleep):
    """Run the shared wait_stream check with CPUStream as source and a new CUDA stream as target."""
    cpu_source = CPUStream
    cuda_target = new_stream(torch.device('cuda'))
    self._test_wait_stream(cpu_source, cuda_target, cuda_sleep)
def test_use_stream_cuda(self):
    """Inside use_stream, the current CUDA stream must equal the given stream."""
    device = torch.device('cuda')
    fresh = new_stream(device)
    with use_stream(fresh):
        assert current_stream(device) == fresh
def test_new_stream_cuda(self):
    """new_stream on a CUDA device yields a torch.cuda.Stream other than the default stream."""
    created = new_stream(torch.device('cuda'))
    assert isinstance(created, torch.cuda.Stream)
    assert created != torch.cuda.default_stream()
def test_new_stream_cpu(self):
    """new_stream on the CPU device yields the CPUStream object itself."""
    created = new_stream(torch.device('cpu'))
    assert created is CPUStream
def test_wait_stream_cuda_cuda(self, cuda_sleep):
    """Run the shared wait_stream check from the current CUDA stream to a new one."""
    device = torch.device('cuda')
    cuda_source = current_stream(device)
    cuda_target = new_stream(device)
    self._test_wait_stream(cuda_source, cuda_target, cuda_sleep)