Example no. 1
    def test_constant_pad_nd_memory_format(self, device, dtype):
        # Test memory format is preserved in unambiguous cases
        for mf, ndim in (
            (torch.channels_last, 4),
            (torch.contiguous_format, 4),
            (torch.channels_last_3d, 5),
            (torch.contiguous_format, 5),
        ):
            a = torch.zeros([2] * ndim).to(memory_format=mf)
            res = refs.constant_pad_nd(a, pad=[1] * (2 * ndim))
            self.assertTrue(res.is_contiguous(memory_format=mf))

        # Ambiguous cases

        # is_channels_last_ and is_contiguous_, results in channels_last output
        a = torch.empty_strided((2, 1, 2, 2), stride=(4, 1, 2, 1))
        self.assertTrue(a.is_contiguous(memory_format=torch.channels_last))
        self.assertTrue(a.is_contiguous())
        actual = refs.constant_pad_nd(a, pad=[1] * 8)
        expect = torch.constant_pad_nd(a, pad=[1] * 8)
        self.assertEqual(actual.stride(), expect.stride())
        self.assertTrue(
            actual.is_contiguous(memory_format=torch.channels_last))

        # is_channels_last_contiguous_ but not is_channels_last_, results in
        # contiguous output
        a = torch.empty_strided((2, 1, 2, 2), stride=(4, 4, 2, 1))
        self.assertTrue(a.is_contiguous(memory_format=torch.channels_last))
        self.assertTrue(a.is_contiguous())
        actual = refs.constant_pad_nd(a, pad=[1] * 8)
        expect = torch.constant_pad_nd(a, pad=[1] * 8)
        self.assertEqual(actual.stride(), expect.stride())
        self.assertTrue(actual.is_contiguous())
Example no. 2
def run_gemm(n, repeat=3, dtype=torch.float32):
    torch.random.manual_seed(10)
    device = torch.device("cuda:0")
    matrix = torch.randn((n, n), dtype=dtype)
    matrix = matrix.T

    gpu_in = torch.empty_strided((n, n),
                                 stride=matrix.stride(),
                                 dtype=matrix.dtype,
                                 device=device,
                                 requires_grad=False)
    gpu_out = torch.empty_strided((n, n),
                                  stride=matrix.stride(),
                                  dtype=matrix.dtype,
                                  device=device,
                                  requires_grad=False)
    gpu_in.copy_(matrix)
    torch.cuda.synchronize()

    gpu_times = []
    for i in range(repeat):
        gpu_out.fill_(0.0)
        start_time = time.time()
        torch.mm(gpu_in, gpu_in, out=gpu_out)
        torch.cuda.synchronize()
        gpu_times.append(time.time() - start_time)
    gpu_time = min(gpu_times)
    flop = n**3 * 2
    flops = flop / gpu_time
    print(
        f"GEMM Exp. of size {n} - GPU time {gpu_time:.2f}s - GFlops {flops / 1e9}"
    )
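A minimal driver for the benchmark above, as a sketch: it assumes a CUDA device is present and that time and torch are imported at module scope (the excerpt relies on both).

import time
import torch

if __name__ == "__main__":
    # hypothetical size sweep; run_gemm prints its own timing line per size
    for n in (1024, 2048, 4096):
        run_gemm(n, repeat=3, dtype=torch.float32)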
Example no. 3
 def _allocate_torch_weightset(self, weightset="slow"):
     H = self.Hlr if weightset in {"slow"} else self.Hgt
     Hu8 = self._np_Hlru8 if weightset in {"slow"} else self._np_Hgtu8
     M = self.M
     N = self.Nm
     Ns = self.Ns
     W0shape, W0stride = (M, Ns, H), (Ns * Hu8, Hu8, 1)
     B0shape, B0stride = (M, H), (Hu8, 1)
     W1shape, W1stride = (Ns, H), (Hu8, 1)
     B1shape, B1stride = (Ns, ), (1, )
     W0 = torch.empty_strided(W0shape,
                              W0stride,
                              dtype=torch.float32,
                              device="cpu")
     B0 = torch.empty_strided(B0shape,
                              B0stride,
                              dtype=torch.float32,
                              device="cpu")
     W1 = torch.empty_strided(W1shape,
                              W1stride,
                              dtype=torch.float32,
                              device="cpu")
     B1 = torch.empty_strided(B1shape,
                              B1stride,
                              dtype=torch.float32,
                              device="cpu")
     W0s = W0.storage()
     B0s = B0.storage()
     W1s = W1.storage()
     B1s = B1.storage()
     W0s.resize_((W0s.size() + 7) & ~7).fill_(0)
     B0s.resize_((B0s.size() + 7) & ~7).fill_(0)
     W1s.resize_((W1s.size() + 7) & ~7).fill_(0)
     B1s.resize_((B1s.size() + 7) & ~7).fill_(0)
     return W0, B0, W1, B1
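The (size + 7) & ~7 expression used when resizing the storages rounds an element count up to the next multiple of 8; a quick standalone check of that identity:

# (n + 7) & ~7 equals ceil(n / 8) * 8 for non-negative n
for n in (0, 1, 7, 8, 9, 15, 16, 1000):
    assert ((n + 7) & ~7) == -(-n // 8) * 8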
Example no. 4
        def run_test(n, k, upper, unitriangular, transpose):
            triangle_function = torch.triu if upper else torch.tril
            A = make_tensor((n, n), dtype=dtype, device=device)
            A = triangle_function(A)
            A_sparse = A.to_sparse_csr()
            B = make_tensor((n, k), dtype=dtype, device=device)

            expected = torch.triangular_solve(B, A, upper=upper, unitriangular=unitriangular, transpose=transpose)
            expected_X = expected.solution

            actual = torch.triangular_solve(B, A_sparse, upper=upper, unitriangular=unitriangular, transpose=transpose)
            actual_X = actual.solution
            actual_A_clone = actual.cloned_coefficient
            self.assertTrue(actual_A_clone.numel() == 0)
            self.assertEqual(actual_X, expected_X)

            # test out with C contiguous strides
            out = torch.empty_strided((n, k), (k, 1), dtype=dtype, device=device)
            torch.triangular_solve(
                B, A_sparse,
                upper=upper, unitriangular=unitriangular, transpose=transpose, out=(out, actual_A_clone)
            )
            self.assertEqual(out, expected_X)

            # test out with F contiguous strides
            # TODO (@ivanyashchuk): mixed memory format doesn't work yet for cuda
            # out is F contiguous but B is C contiguous
            if self.device_type == 'cuda' and (n > 0 and k > 1):
                with self.assertRaisesRegex(RuntimeError, "INTERNAL ASSERT FAILED"):
                    out = torch.empty_strided((n, k), (1, n), dtype=dtype, device=device)
                    torch.triangular_solve(
                        B, A_sparse,
                        upper=upper, unitriangular=unitriangular, transpose=transpose, out=(out, actual_A_clone)
                    )
            else:
                out = torch.empty_strided((n, k), (1, n), dtype=dtype, device=device)
                torch.triangular_solve(
                    B, A_sparse,
                    upper=upper, unitriangular=unitriangular, transpose=transpose, out=(out, actual_A_clone)
                )
                self.assertEqual(out, expected_X)
                self.assertEqual(out.stride(), (1, n))

            # test out with discontiguous strides
            out = torch.empty_strided((2 * n, k), (1, 2 * n), dtype=dtype, device=device)[::2]
            if n > 0 and k > 0:
                self.assertFalse(out.is_contiguous())
                self.assertFalse(out.t().is_contiguous())
            before_stride = out.stride()
            torch.triangular_solve(
                B, A_sparse,
                upper=upper, unitriangular=unitriangular, transpose=transpose, out=(out, actual_A_clone)
            )
            self.assertEqual(out, expected_X)
            self.assertEqual(out.stride(), before_stride)
Example no. 5
def TensorMeta(
    tensorlike: Optional[Union[NumberType, torch.Tensor]] = None,
    *,
    shape: Optional[ShapeType] = None,
    strides: Optional[StrideType] = None,
    dtype: Optional[torch.dtype] = None,
    device: Optional[Union[torch.device, str]] = None,
):
    if isinstance(tensorlike, Number):
        assert not shape and (shape is None or isinstance(shape, Sequence))
        assert not strides and (strides is None
                                or isinstance(strides, Sequence))
        inferred_shape: Tuple[int, ...] = ()
        inferred_strides: Tuple[int, ...] = ()
        inferred_dtype = type_to_dtype(type(tensorlike))
        inferred_device = torch.device("cpu")
        # TODO: This looks wrong, a number that is wrapped into a tensor
        # needs to behave differently than a scalar tensor for type
        # promotion purposes
    elif tensorlike is not None:
        assert isinstance(tensorlike, torch.Tensor)
        inferred_shape = tuple(tensorlike.shape)
        inferred_strides = tuple(tensorlike.stride())
        inferred_dtype = tensorlike.dtype
        inferred_device = tensorlike.device
    else:
        # If no tensorlike "example" is given then all metadata
        # must be provided explicitly
        assert shape is not None
        assert strides is not None
        assert dtype is not None
        assert device is not None

    shape = inferred_shape if shape is None else tuple(shape)
    strides = inferred_strides if strides is None else tuple(strides)
    dtype = inferred_dtype if dtype is None else dtype
    device = inferred_device if device is None else device

    if isinstance(device, str):
        device = torch.device(device)

    if isinstance(tensorlike, FakeTensor):
        mode = tensorlike.fake_mode
    else:
        mode = get_prim_fake_mode()

    if device.type == "meta":
        return torch.empty_strided(shape, strides, dtype=dtype, device="meta")
    else:
        return FakeTensor(
            mode,
            torch.empty_strided(shape, strides, dtype=dtype, device="meta"),
            device,
        )
Example no. 6
 def test_zero_stride(self):
     device = self.get_device()
     cpu_tensor = torch.empty_strided(size=(6, 1024, 512), stride=(0, 0, 0))
     assert cpu_tensor.storage().size() == 1
     ort_tensor_copied = cpu_tensor.to(device)
     assert torch.allclose(cpu_tensor, ort_tensor_copied.cpu())
     ort_tensor = torch.empty_strided(size=(6, 1024, 512),
                                      stride=(0, 0, 0),
                                      device=device)
     assert ort_tensor.is_ort
     assert ort_tensor.stride() == (0, 0, 0)
     cpu_tensor_copied = ort_tensor.cpu()
     assert cpu_tensor_copied.stride() == (0, 0, 0)
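A small read-only sketch of what zero strides mean, separate from the ORT-specific checks above: every index maps to the same storage element, so one value is broadcast across the whole shape.

import torch

base = torch.tensor([42.0])              # a single real element
view = base.as_strided((6, 4), (0, 0))   # zero strides: every index reads element 0
assert view.shape == (6, 4)
assert bool((view == 42.0).all())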
Example no. 7
        def __new__(cls, elem, *args, **kwargs):
            assert type(elem) is not cls, \
                "Wrapping a CompositeCompliantTensor in a CompositeCompliantTensor is not supported"

            # The storage of CompositeCompliantTensor should never be used directly
            # by a Composite operation; if the Composite
            # operator attempts to read from the storage without dispatching then it'll
            # raise a RuntimeError due to it being a meta storage.
            r = torch.Tensor._make_wrapper_subclass(  # type: ignore[attr-defined]
                cls, elem.size(),
                dtype=elem.dtype, layout=elem.layout,
                device=elem.device, requires_grad=elem.requires_grad,
                strides=elem.stride(), storage_offset=elem.storage_offset())

            if elem.requires_grad:
                # CompositeCompliantTensor steals the "requires_grad"-ness.
                # Why a new copy of `elem`? Because sometimes OpInfo shares inputs between tests...
                tmp = torch.empty_strided(elem.shape, elem.stride(), dtype=elem.dtype,
                                          device=elem.device, layout=elem.layout,
                                          requires_grad=False)
                tmp.copy_(elem.detach())
                r.elem = tmp
            else:
                r.elem = elem

            assert r.stride() == r.elem.stride()

            # Propagate conjugate bits to the wrapper tensor
            # Ref: https://github.com/albanD/subclass_zoo/issues/24
            # Ref: https://github.com/albanD/subclass_zoo/issues/21
            torch._C._set_conj(r, r.elem.is_conj())
            torch._C._set_neg(r, r.elem.is_neg())
            return r
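The empty_strided + copy_ pattern in the requires_grad branch yields a detached copy with exactly the source's strides; a minimal standalone sketch of just that step:

import torch

src = torch.randn(4, 6, requires_grad=True).t()   # non-contiguous view that requires grad
tmp = torch.empty_strided(src.shape, src.stride(), dtype=src.dtype,
                          device=src.device, requires_grad=False)
tmp.copy_(src.detach())
assert tmp.stride() == src.stride() and not tmp.requires_grad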
Example no. 8
def cutlass_matmul(a, b):
    if _cutlass is None:
        raise RuntimeError("Cannot find cutlass library")
    M, N = a.shape[0], b.shape[1]
    c = torch.empty_strided((M, N), (1, M), dtype=a.dtype, device=a.device)
    _cutlass.matmul(a, b, c)
    return c
Example no. 9
        def run_test(n, k, upper, unitriangular, transpose):
            triangle_function = torch.triu if upper else torch.tril
            A = make_tensor((n, n), dtype=dtype, device=device)
            A = triangle_function(A)
            A_sparse = A.to_sparse_csr()
            B = make_tensor((n, k), dtype=dtype, device=device)

            expected = torch.triangular_solve(B, A, upper=upper, unitriangular=unitriangular, transpose=transpose)
            expected_X = expected.solution

            actual = torch.triangular_solve(B, A_sparse, upper=upper, unitriangular=unitriangular, transpose=transpose)
            actual_X = actual.solution
            actual_A_clone = actual.cloned_coefficient
            self.assertTrue(actual_A_clone.numel() == 0)
            self.assertEqual(actual_X, expected_X)

            # test out with C contiguous strides
            out = torch.empty_strided((n, k), (k, 1), dtype=dtype, device=device)
            torch.triangular_solve(
                B, A_sparse,
                upper=upper, unitriangular=unitriangular, transpose=transpose, out=(out, actual_A_clone)
            )
            self.assertEqual(out, expected_X)

            # test out with F contiguous strides
            out = torch.empty_strided((n, k), (1, n), dtype=dtype, device=device)
            torch.triangular_solve(
                B, A_sparse,
                upper=upper, unitriangular=unitriangular, transpose=transpose, out=(out, actual_A_clone)
            )
            self.assertEqual(out, expected_X)
            self.assertEqual(out.stride(), (1, n))

            # test out with discontiguous strides
            out = torch.empty_strided((2 * n, k), (1, 2 * n), dtype=dtype, device=device)[::2]
            if n > 0 and k > 0:
                self.assertFalse(out.is_contiguous())
                self.assertFalse(out.t().is_contiguous())
            before_stride = out.stride()
            torch.triangular_solve(
                B, A_sparse,
                upper=upper, unitriangular=unitriangular, transpose=transpose, out=(out, actual_A_clone)
            )
            self.assertEqual(out, expected_X)
            self.assertEqual(out.stride(), before_stride)
Example no. 10
def init_inputs(N, C, H, W, in_order, out_order, dtype):
    shape = (N, C, H, W)
    stride_x = torch_blocksparse._permute.strides(N, C, H, W, in_order)
    stride_y = torch_blocksparse._permute.strides(N, C, H, W, out_order)
    x = torch.rand(N * C * H * W,
                   requires_grad=True).as_strided(shape,
                                                  stride_x).cuda().type(dtype)
    y = torch.empty_strided(shape, stride_y, device=x.device, dtype=dtype)
    return x, y
Example no. 11
 def backward(ctx, dy):
   # load from context
   x, y = ctx.saved_tensors
   # get kernel
   if x.dtype not in _relu.bwd_kernel:
     defines = {'TYPE': x.dtype, 'TN': [128]}
     _relu.bwd_kernel[x.dtype] = triton.kernel(_relu.bwd_src, defines=defines, num_warps=[4])
   kernel = _relu.bwd_kernel[x.dtype]
   # allocate output
   dx = torch.empty_strided(x.shape, x.stride(), device=x.device, dtype=x.dtype)
   dres = torch.empty_strided(x.shape, x.stride(), device=x.device, dtype=x.dtype)
   dscale = torch.zeros((1,), device=dy.device, dtype=torch.float32)
   dbias = torch.zeros_like(dscale)
   # launch kernel
   N = x.numel()
   grid = lambda opt: [triton.cdiv(N, opt.d('TN'))]
   kernel(x, y, ctx.scale.item(), dx, dy, dscale, dbias, dres, N, grid=grid)
   return dx, dscale.type(x.dtype), dbias.type(x.dtype), dres
Example no. 12
 def unwrap(t):
     # TODO: doesn't setup aliasing relation on views correctly
     if isinstance(t, TensorMeta):
         return torch.empty_strided(t.shape,
                                    t.stride(),
                                    dtype=t.dtype,
                                    device="meta")
     else:
         return t
Example no. 13
def make_tensor_from_type(inp_type: torch._C.TensorType):
    size = inp_type.sizes()
    stride = inp_type.strides()
    device = inp_type.device()
    dtype = inp_type.dtype()
    assert size is not None
    assert stride is not None
    assert device is not None
    assert dtype is not None
    return torch.empty_strided(size=size,
                               stride=stride,
                               device=device,
                               dtype=dtype)
Example no. 14
 def forward(ctx, x, scale, bias, res):
   if x.dtype not in _relu.fwd_kernel:
     defines = {'TYPE': x.dtype, 'TN': [128]}
     _relu.fwd_kernel[x.dtype] = triton.kernel(_relu.fwd_src, defines=defines, num_warps=[4])
   kernel = _relu.fwd_kernel[x.dtype]
   # launch kernel
   y = torch.empty_strided(x.shape, x.stride(), device=x.device, dtype=x.dtype)
   N = x.numel()
   grid = lambda opt: [triton.cdiv(N, opt.d('TN'))]
   kernel(x, y, scale.item(), bias.item(), res, N, grid=grid)
   # update context
   ctx.save_for_backward(x, y)
   ctx.scale = scale
   return y
Example no. 15
 def tensor_creation_ops(self):
     i = torch.tensor([[0, 1, 1], [2, 0, 2]])
     v = torch.tensor([3, 4, 5], dtype=torch.float32)
     real = torch.tensor([1, 2], dtype=torch.float32)
     imag = torch.tensor([3, 4], dtype=torch.float32)
     inp = torch.tensor([-1.5, 0.0, 2.0])
     values = torch.tensor([0.5])
     quantized = torch.quantize_per_channel(
         torch.tensor([[-1.0, 0.0], [1.0, 2.0]]),
         torch.tensor([0.1, 0.01]),
         torch.tensor([10, 0]),
         0,
         torch.quint8,
     )
     return (
         torch.tensor([[0.1, 1.2], [2.2, 3.1], [4.9, 5.2]]),
         # torch.sparse_coo_tensor(i, v, [2, 3]), # not work for iOS
         torch.as_tensor([1, 2, 3]),
         torch.as_strided(torch.randn(3, 3), (2, 2), (1, 2)),
         torch.zeros(2, 3),
         torch.zeros((2, 3)),
         torch.zeros([2, 3], out=i),
         torch.zeros(5),
         torch.zeros_like(torch.empty(2, 3)),
         torch.ones(2, 3),
         torch.ones((2, 3)),
         torch.ones([2, 3]),
         torch.ones(5),
         torch.ones_like(torch.empty(2, 3)),
         torch.arange(5),
         torch.arange(1, 4),
         torch.arange(1, 2.5, 0.5),
         torch.range(1, 4),
         torch.range(1, 4, 0.5),
         torch.linspace(3.0, 3.0, steps=1),
         torch.logspace(start=2, end=2, steps=1, base=2.0),
         torch.eye(3),
         torch.empty(2, 3),
         torch.empty_like(torch.empty(2, 3), dtype=torch.int64),
         torch.empty_strided((2, 3), (1, 2)),
         torch.full((2, 3), 3.141592),
         torch.full_like(torch.full((2, 3), 3.141592), 2.71828),
         torch.quantize_per_tensor(
             torch.tensor([-1.0, 0.0, 1.0, 2.0]), 0.1, 10, torch.quint8
         ),
         torch.dequantize(quantized),
         torch.complex(real, imag),
         torch.polar(real, imag),
         torch.heaviside(inp, values),
     )
Example no. 16
def _new_strided_tensor(size: Tuple[int], stride: Tuple[int],
                        dtype: torch.dtype, device: Union[str, torch.device],
                        pin_memory: bool) -> torch.Tensor:
    if isinstance(device, torch.device):
        pin_memory &= device.type == 'cpu'
    else:
        pin_memory &= device.lower() == 'cpu'

    return torch.empty_strided(size=size,
                               stride=stride,
                               dtype=dtype,
                               device=device,
                               requires_grad=False,
                               pin_memory=pin_memory)
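A hypothetical call to the helper above, allocating a C-contiguous float32 CPU buffer; pinning is left off here because pinned memory requires a CUDA-enabled build.

buf = _new_strided_tensor(size=(4, 8), stride=(8, 1),
                          dtype=torch.float32, device="cpu",
                          pin_memory=False)
assert buf.stride() == (8, 1) and not buf.requires_grad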
Example no. 17
def cutlass_matmul(a, b):
    if _cutlass is None:
        raise RuntimeError("Cannot find cutlass library")
    M, N = a.shape[0], b.shape[1]
    Ka, Kb = a.shape[1], b.shape[0]
    assert Ka == Kb
    assert a.dtype == b.dtype
    assert a.device == b.device
    # allocate output
    c = torch.empty_strided((M, N), (1, M), dtype=a.dtype, device=a.device)
    # run function
    dtype = str(a.dtype).split('.')[-1]
    _cutlass.matmul(a.data_ptr(), b.data_ptr(), c.data_ptr(),
                    M, N, Ka,
                    a.stride(0), a.stride(1),
                    b.stride(0), b.stride(1),
                    c.stride(0), c.stride(1),
                    dtype, dtype, dtype,
                    a.device.index, torch.cuda.current_stream(a.device).cuda_stream)

    return c
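A hypothetical call, assuming the _cutlass extension is loaded and a CUDA device is available; the result is column-major because c is allocated with stride (1, M) above.

a = torch.randn(256, 128, device="cuda", dtype=torch.float32)
b = torch.randn(128, 64, device="cuda", dtype=torch.float32)
c = cutlass_matmul(a, b)
assert c.shape == (256, 64)
assert c.stride() == (1, 256)   # (1, M): Fortran/column-major output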
Example no. 18
 def forward(ctx, x, running_mean, running_var, gamma, beta, training,
             momentum, eps):
     N, C, H, W = x.shape
     # lazy compilation of kernel
     key = (training, x.dtype)
     if key not in _batchnorm.fwd_kernel:
         defines = {'TM': 256, 'TYPE': x.dtype}
         if training:
             defines['TRAINING'] = True
         _batchnorm.fwd_kernel[key] = triton.kernel(_batchnorm.fwd_src,
                                                    defines=defines,
                                                    num_warps=[4])
     kernel = _batchnorm.fwd_kernel[key]
     # allocate outputs
     y = torch.empty_strided(x.shape,
                             x.stride(),
                             layout=x.layout,
                             dtype=x.dtype,
                             device=x.device)
     mean = torch.empty(C, dtype=torch.float32, device=x.device)
     var = torch.empty(C, dtype=torch.float32, device=x.device)
     # execute kernels
     grid = lambda opt: [C]
     kernel(y,
            mean,
            var,
            running_mean,
            running_var,
            x,
            gamma,
            beta,
            H * W * N,
            momentum,
            eps,
            grid=grid)
     # save
     ctx.save_for_backward(x, gamma, beta, mean, var)
     ctx.eps = eps
     return y
Example no. 19
 def backward(ctx, dy):
     # lazy compilation of kernel
     key = (dy.dtype, )
     if key not in _batchnorm.bwd_kernel:
         _batchnorm.bwd_kernel[key] = triton.kernel(_batchnorm.bwd_src,
                                                    defines={
                                                        'TM': 256,
                                                        'TYPE': dy.dtype
                                                    },
                                                    num_warps=[4])
     kernel = _batchnorm.bwd_kernel[key]
     # retrieve info
     x, gamma, beta, mean, var = ctx.saved_tensors
     eps = ctx.eps
     # allocate result
     dx = torch.empty_strided(x.shape,
                              x.stride(),
                              layout=x.layout,
                              dtype=x.dtype,
                              device=x.device)
     dgamma = torch.empty_like(gamma)
     dbeta = torch.empty_like(beta)
     # execute
     N, C, H, W = x.shape
     kernel(dx,
            dgamma,
            dbeta,
            dy,
            x,
            gamma,
            mean,
            var,
            H * W * N,
            eps,
            grid=lambda opt: [C])
     return dx, None, None, dgamma, dbeta, None, None, None
Example no. 20
def _rebuild_meta_tensor_no_storage(dtype, size, stride, requires_grad):
    return torch.empty_strided(size,
                               stride,
                               dtype=dtype,
                               device='meta',
                               requires_grad=requires_grad)
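A quick sketch of what the rebuild helper returns: a tensor on the meta device that carries only shape, stride, and dtype, with no real data.

t = _rebuild_meta_tensor_no_storage(torch.float32, (2, 3), (3, 1), requires_grad=False)
assert t.device.type == "meta"
assert t.shape == (2, 3) and t.stride() == (3, 1) and t.dtype == torch.float32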
Example no. 21
# torch.logspace
torch.logspace(start=-10, end=10, steps=5)
torch.logspace(start=0.1, end=1.0, steps=5)
torch.logspace(start=0.1, end=1.0, steps=1)
torch.logspace(start=2, end=2, steps=1, base=2)

# torch.eye
torch.eye(3)

# torch.empty/empty_like/empty_strided
torch.empty(2, 3)
torch.empty((2, 3))
torch.empty([2, 3])
torch.empty_like(torch.empty(2, 3), dtype=torch.int64)
torch.empty_strided((2, 3), (1, 2))

# torch.full/full_like
torch.full((2, 3), 3.141592)
torch.full_like(torch.full((2, 3), 3.141592), 2.71828)

# torch.quantize_per_tensor
torch.quantize_per_tensor(torch.tensor([-1.0, 0.0, 1.0, 2.0]), 0.1, 10,
                          torch.quint8)

# torch.quantize_per_channel
x = torch.tensor([[-1.0, 0.0], [1.0, 2.0]])
quant = torch.quantize_per_channel(x, torch.tensor([0.1, 0.01]),
                                   torch.tensor([10, 0]), 0, torch.quint8)

# torch.dequantize
torch.dequantize(quant)
Example no. 22
def run(n,
        repeat=3,
        compare_results=True,
        dtype=torch.float32,
        fn=cuda_lauum_lower,
        lower=True):
    torch.random.manual_seed(10)
    device = torch.device("cuda:0")

    # Generate random matrix
    matrix = torch.randn((n, n), dtype=dtype)
    # Fill 'ignored' side of the matrix with zeros.
    # matrix = torch.tril(matrix)
    # Make it in F-order
    matrix = matrix.T

    # Create GPU buffers for input and output matrices
    gpu_in = torch.empty_strided((n, n),
                                 stride=matrix.stride(),
                                 dtype=matrix.dtype,
                                 device=device,
                                 requires_grad=False)
    gpu_out = torch.empty_strided((n, n),
                                  stride=matrix.stride(),
                                  dtype=matrix.dtype,
                                  device=device,
                                  requires_grad=False)
    # Copy matrix to the GPU
    gpu_in.copy_(matrix)
    torch.cuda.synchronize(device)

    # Run on the CPU
    if compare_results:
        print("\tRunning CPU Exp...", flush=True)
        # Generate the expected output using LAPACK
        cpu_times = []
        for i in range(repeat):
            start_time = time.time()
            expected = scll.dlauum(matrix.numpy(),
                                   lower=lower,
                                   overwrite_c=False)[0]
            cpu_times.append(time.time() - start_time)
        cpu_time = min(cpu_times)
    else:
        cpu_time = 0

    # Run on the GPU
    gpu_times = []
    for i in range(repeat):
        gpu_out.fill_(0.0)
        start_time = time.time()
        fn(gpu_in.shape[0], gpu_in, gpu_in.stride(1), gpu_out,
           gpu_out.stride(1))
        torch.cuda.synchronize(device)
        gpu_times.append(time.time() - start_time)
    gpu_time = min(gpu_times)
    flop = (2 * n * (n + 1) * (n + 2)) / 6
    flops = flop / gpu_time

    if False:
        with np.printoptions(precision=3, linewidth=160):
            print("INPUT")
            print(matrix)
            print("EXPECTED")
            print(torch.from_numpy(expected))
            print("ACTUAL")
            print(gpu_out)

    # Compare outputs and print timing info
    if compare_results:
        if lower:
            np.testing.assert_allclose(np.tril(expected),
                                       gpu_out.cpu().numpy())
        else:
            v_cpu = np.triu(expected)
            v_gpu = np.triu(gpu_out.cpu().numpy())
            diff = np.abs(v_cpu - v_gpu)
            if False:
                with np.printoptions(precision=1, linewidth=160):
                    print(diff)
            np.testing.assert_allclose(v_cpu, v_gpu)
    print(
        f"Exp. of size {n} - CPU time {cpu_time:.2f}s - GPU time {gpu_time:.2f}s  ({fn.__name__}) - GFlops {flops/1e9:.2f}"
    )
Example no. 23
 def do_work(x, in_order, out_order):
     x_inner_mul = _permute.multiple_of(x.shape['NCHW'.index(in_order[-1])])
     y_inner_mul = _permute.multiple_of(x.shape['NCHW'.index(
         out_order[-1])])
     key = (x.dtype, in_order, out_order, x_inner_mul, y_inner_mul)
     if key not in _permute.kernels:
         TN = [32] if in_order[-1] == 'N' or out_order[-1] == 'N' else 1
         TC = [32] if in_order[-1] == 'C' or out_order[-1] == 'C' else 1
         THW = [32] if in_order[-1] == 'W' or out_order[-1] == 'W' else 1
         defines = {
             'NAME':
             f'permute_{in_order}_{out_order}_{x_inner_mul}_{y_inner_mul}',
             'TYPE': x.dtype,
             # stride multiple for X
             'M_STRIDE_XN': 1 if in_order[-1] == 'N' else x_inner_mul,
             'M_STRIDE_XC': 1 if in_order[-1] == 'N' else x_inner_mul,
             'M_STRIDE_XHW': 1 if in_order[-1] == 'N' else x_inner_mul,
             # stride multiple for Y
             'M_STRIDE_YN': 1 if out_order[-1] == 'N' else y_inner_mul,
             'M_STRIDE_YC': 1 if out_order[-1] == 'N' else y_inner_mul,
             'M_STRIDE_YHW': 1 if out_order[-1] == 'N' else y_inner_mul,
             # strides for X
             'STRIDE_XN': 1 if in_order[-1] == 'N' else 'stride_xn',
             'STRIDE_XC': 1 if in_order[-1] == 'C' else 'stride_xc',
             'STRIDE_XHW': 1 if in_order[-1] == 'W' else 'stride_xhw',
             # strides for Y
             'STRIDE_YN': 1 if out_order[-1] == 'N' else 'stride_yn',
             'STRIDE_YC': 1 if out_order[-1] == 'C' else 'stride_yc',
             'STRIDE_YHW': 1 if out_order[-1] == 'W' else 'stride_yhw',
             # tile parameters
             'TN': TN,
             'TC': TC,
             'THW': THW
         }
         _permute.kernels[key] = triton.kernel(src,
                                               defines=defines,
                                               num_warps=[4])
     kernel = _permute.kernels[key]
     N, C, H, W = x.shape
     y = torch.empty_strided(x.shape,
                             _permute.strides(N, C, H, W, out_order),
                             device=x.device,
                             dtype=x.dtype)
     stride_xn, stride_xc, _, stride_xhw = x.stride()
     stride_yn, stride_yc, _, stride_yhw = y.stride()
     grid = lambda opt: (triton.cdiv(N, opt.d('TN')),
                         triton.cdiv(C, opt.d('TC')),
                         triton.cdiv(H * W, opt.d('THW')))
     kernel(x,
            y,
            N,
            C,
            H * W,
            stride_xn,
            stride_xc,
            stride_xhw,
            stride_yn,
            stride_yc,
            stride_yhw,
            grid=grid)
     return y
Example no. 24
reveal_type(torch.linspace(start=-10, end=10, steps=1))  # E: {Tensor}

# torch.logspace
reveal_type(torch.logspace(start=-10, end=10, steps=5))  # E: {Tensor}
reveal_type(torch.logspace(start=0.1, end=1.0, steps=5))  # E: {Tensor}
reveal_type(torch.logspace(start=0.1, end=1.0, steps=1))  # E: {Tensor}
reveal_type(torch.logspace(start=2, end=2, steps=1, base=2))  # E: {Tensor}

# torch.eye
reveal_type(torch.eye(3))  # E: {Tensor}

# torch.empty/empty_like/empty_strided
reveal_type(torch.empty(2, 3))  # E: {Tensor}
reveal_type(torch.empty_like(torch.empty(2, 3),
                             dtype=torch.int64))  # E: {Tensor}
reveal_type(torch.empty_strided((2, 3), (1, 2)))  # E: {Tensor}

# torch.full/full_like
reveal_type(torch.full((2, 3), 3.141592))  # E: {Tensor}
reveal_type(torch.full_like(torch.full((2, 3), 3.141592),
                            2.71828))  # E: {Tensor}

# torch.quantize_per_tensor
reveal_type(
    torch.quantize_per_tensor(torch.tensor([-1.0, 0.0, 1.0, 2.0]), 0.1, 10,
                              torch.quint8))  # E: {Tensor}

# torch.quantize_per_channel
x = torch.tensor([[-1.0, 0.0], [1.0, 2.0]])
quant = torch.quantize_per_channel(x, torch.tensor([0.1, 0.01]),
                                   torch.tensor([10, 0]), 0, torch.quint8)