def _test_wait_stream(self, source, target, cuda_sleep=None):
    """Check that work on ``source`` observes a tensor produced on ``target``.

    A 100x100 tensor of ones is created while ``target`` is the active
    stream, ``source`` is then made to wait for ``target``, and finally the
    tensor is summed while ``source`` is the active stream.

    Args:
        source: stream that waits and then reads the tensor.
        target: stream on which the tensor is created.
        cuda_sleep: callable invoked as ``cuda_sleep(0.5)`` when ``target``
            is a CUDA stream. It is called unconditionally in that case, so
            it must not be left as ``None`` for CUDA targets.
    """
    with use_stream(target):
        if is_cuda(target):
            # Queue a delay ahead of the allocation so that a missing wait
            # would be observable (presumably a busy kernel -- confirm
            # against the ``cuda_sleep`` fixture).
            cuda_sleep(0.5)
        target_device = get_device(target)
        ones = torch.ones(100, 100, device=target_device)

    wait_stream(source, target)

    with use_stream(source):
        total = ones.sum().item()
        # 100 * 100 ones.
        assert total == 10000
def _test_copy_wait(prev_stream, next_stream, cuda_sleep=None):
    """Check the forward and backward pass of ``Copy`` chained with ``Wait``.

    A tensor of 100 ones is created on ``prev_stream``, passed through
    ``Copy`` and then ``Wait``, and its sum is verified on ``next_stream``.
    The norm of the result is then back-propagated and the accumulated
    gradient of the source tensor is verified on ``prev_stream``.

    Args:
        prev_stream: stream on which the source tensor is created.
        next_stream: stream on which the copied tensor is consumed.
        cuda_sleep: callable invoked as ``cuda_sleep(0.5)`` when
            ``prev_stream`` is a CUDA stream. It is called unconditionally
            in that case, so it must not be left as ``None`` for CUDA
            streams.
    """
    device = get_device(prev_stream)

    with use_stream(prev_stream):
        if is_cuda(prev_stream):
            cuda_sleep(0.5)
        x = torch.ones(100, device=device, requires_grad=True)

    # Wait must consume the output of Copy. Feeding 'x' again would discard
    # the copied tensor, so neither the copy nor Copy's backward would be
    # covered by the assertions below.
    y, = Copy.apply(prev_stream, next_stream, x)
    y, = Wait.apply(prev_stream, next_stream, y)

    with use_stream(next_stream):
        # Build the expected value on y's own device: the copied tensor is
        # not guaranteed to live on the device of 'prev_stream'.
        assert torch.allclose(y.sum(), torch.tensor(100.0, device=y.device))
        y.norm().backward()
    with use_stream(prev_stream):
        # d(norm)/dy is 1/10 per element for 100 ones, which sums to 10.
        assert torch.allclose(x.grad.sum(), torch.tensor(10.0, device=device))
def get_phony(device: torch.device, *, requires_grad: bool) -> Tensor:
    """Return the cached phony for ``device`` and ``requires_grad``.

    A phony is a tensor that occupies no space. It is handy for introducing
    an arbitrary dependency into an autograd graph, because no gradient
    accumulation is needed for it.

    .. note::

        Phonies are cached per ``(device, requires_grad)`` pair. When an
        autograd function obtains a phony internally, it must detach the
        phony before returning it. Otherwise, the autograd engine will
        mutate the cached phony in-place::

            class Phonify(torch.autograd.Function):
                @staticmethod
                def forward(ctx, input):
                    phony = get_phony(input.device, requires_grad=False)
                    return phony.detach()  # detach() is necessary.

    """
    cache_key = (device, requires_grad)

    try:
        return _phonies[cache_key]
    except KeyError:
        # First request for this combination: fall through and create it.
        pass

    # The empty tensor is created while the device's default stream is the
    # active one.
    with use_stream(default_stream(device)):
        fresh = torch.empty(0, device=device, requires_grad=requires_grad)

    _phonies[cache_key] = fresh
    return fresh
def test_record_stream_cuda(self, cuda_sleep):
    """Check that a block recorded on a busy stream is not reused too early."""
    # The purpose of this test is to catch unexpected block reallocation.
    # To keep it reliable, tensors are allocated from a dedicated stream:
    # the allocator will not reuse free blocks that were allocated from a
    # different stream.
    cuda = torch.device('cuda')

    alloc_stream = new_stream(cuda)
    with torch.cuda.stream(alloc_stream):
        first = torch.rand(1, device=cuda)

    busy_stream = new_stream(cuda)
    record_stream(first, busy_stream)
    with use_stream(busy_stream):
        cuda_sleep(0.5)

    # From Python's point of view the first tensor is gone after 'del', but
    # its block is still needed by 'busy_stream'. The second tensor must not
    # land in that block.
    first_ptr = first.data_ptr()
    del first
    alloc_stream.synchronize()
    with torch.cuda.stream(alloc_stream):
        second = torch.rand(1, device=cuda)
    assert second.data_ptr() != first_ptr

    # Block Python until 'busy_stream' has drained its queue. After that the
    # block of the first tensor may be handed out again.
    wait_stream(CPUStream, busy_stream)
    with torch.cuda.stream(alloc_stream):
        third = torch.rand(1, device=cuda)
    assert third.data_ptr() == first_ptr
def __getitem__(self, device: torch.device) -> Tensor:
    """Return the phony cached for ``device``, creating it on first access.

    A missing entry is filled with an empty tensor that requires grad,
    created while the device's default stream is active, and stored in
    ``self.phonies`` before being returned.
    """
    try:
        cached = self.phonies[device]
    except KeyError:
        default = default_stream(device)
        with use_stream(default):
            cached = torch.empty(0, device=device, requires_grad=True)
        self.phonies[device] = cached
    return cached
def backward(ctx: Context, *grad_output: Tensor, ) -> Tuple[Optional[Tensor], ...]:
    """Move the incoming gradients to the device of ``ctx.prev_stream``.

    Returns:
        A tuple starting with two ``None`` entries, followed by the moved
        gradients in the same order as ``grad_output``.
    """
    src_stream = ctx.prev_stream
    dst_stream = ctx.next_stream

    # Capture the stream that is current on the previous device *before*
    # the streams are switched below.
    caller_stream = current_stream(get_device(src_stream))

    moved = []
    with use_stream(src_stream), use_stream(dst_stream):
        # The gradients are processed last-to-first.
        for grad in reversed(grad_output):
            grad_on_prev = grad.to(get_device(src_stream))
            moved.append(grad_on_prev)

            # 'next_stream' is not the stream 'grad' was allocated on.
            record_stream(grad, dst_stream)
            # 'grad_on_prev' was allocated on 'prev_stream', but it may be
            # used on the stream captured above as 'caller_stream'.
            record_stream(grad_on_prev, caller_stream)

    # Restore the original argument order.
    moved.reverse()

    stream_grads: Tuple[Optional[Tensor], ...] = (None, None)
    return stream_grads + tuple(moved)
def forward(ctx: Context,  # type: ignore
            prev_stream: AbstractStream,
            next_stream: AbstractStream,
            *input: Tensor,
            ) -> Tensors:
    """Move every input tensor to the device of ``next_stream``.

    Both streams are stored on ``ctx`` as ``ctx.prev_stream`` and
    ``ctx.next_stream``.

    Returns:
        A tuple with one moved tensor per input, in input order.
    """
    ctx.prev_stream = prev_stream
    ctx.next_stream = next_stream

    # Capture the stream that is current on the next device *before* the
    # streams are switched below.
    caller_stream = current_stream(get_device(next_stream))

    copies = []
    with use_stream(prev_stream), use_stream(next_stream):
        for source in input:
            moved = source.to(get_device(next_stream))
            copies.append(moved)

            # 'prev_stream' is not the stream 'source' was allocated on.
            record_stream(source, prev_stream)
            # 'moved' was allocated on 'next_stream', but it may be used on
            # the stream captured above as 'caller_stream'.
            record_stream(moved, caller_stream)

    return tuple(copies)
def test_record_stream_shifted_view(self, cuda_sleep):
    """Check that recording a shifted view protects the underlying block."""
    # Issue: https://github.com/pytorch/pytorch/issues/27366
    cuda = torch.device('cuda')

    alloc_stream = new_stream(cuda)
    with torch.cuda.stream(alloc_stream):
        base = torch.rand(2, device=cuda)

    # A view that starts past the beginning of the base tensor's storage.
    shifted = base[1:]
    assert shifted.data_ptr() > base.data_ptr()

    busy_stream = new_stream(cuda)
    with use_stream(busy_stream):
        cuda_sleep(0.5)
    record_stream(shifted, busy_stream)

    base_ptr = base.data_ptr()
    del base, shifted

    # A new allocation of the same size from the allocating stream must not
    # receive the block of the deleted base tensor.
    alloc_stream.synchronize()
    with torch.cuda.stream(alloc_stream):
        fresh = torch.rand(2, device=cuda)
    assert fresh.data_ptr() != base_ptr
def finalize(self, batch: Batch) -> None:
    """Run the optional finalizer on ``batch`` with ``self.stream`` active.

    Does nothing when ``self._finalize`` is ``None``.
    """
    if self._finalize is not None:
        with use_stream(self.stream):
            self._finalize(batch)
def compute(self) -> Batch:
    """Call ``self._compute()`` with ``self.stream`` active and return its result."""
    with use_stream(self.stream):
        result = self._compute()
    return result
def test_use_stream_cuda(self):
    """Check that ``use_stream`` makes a new CUDA stream the current one."""
    cuda = torch.device('cuda')
    fresh_stream = new_stream(cuda)

    with use_stream(fresh_stream):
        active = current_stream(cuda)
        assert active == fresh_stream
def test_use_stream_cpu(self):
    """Check that ``use_stream`` can be entered and exited with ``CPUStream``."""
    with use_stream(CPUStream):
        # Nothing to verify inside: entering and leaving without an
        # exception is the whole test.
        pass