Example #1
    def backward(self, grad_cost):
        xyz1       = self.xyz1
        xyz2       = self.xyz2
        match      = self.match
        cost       = self.cost
        m          = self.m
        numel      = self.numel
        num_pts    = self.num_pts
        batch_size = self.batch_size

        grad1 = torch.zeros_like(xyz1).cuda()
        grad2 = torch.zeros_like(xyz2).cuda()

        with torch.cuda.device_of(grad_cost):
            if xyz1.requires_grad:
                f = load_kernel('matchcostgrad1', matchcostgrad1_kernel)
                f(
                    block=(512, 1, 1),  # (CUDA_NUM_THREADS, 1, 1),
                    grid=(32, 1, 1),  # (GET_BLOCKS(xyz1.numel()), 1, 1),
                    args=[
                        batch_size, num_pts, m,
                        xyz1.data_ptr(),
                        xyz2.data_ptr(),
                        match.data_ptr(),
                        grad1.data_ptr()
                    ],
                    stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))

            if xyz2.requires_grad:
                g = load_kernel('matchcostgrad2', matchcostgrad2_kernel)
                g(
                    block=(256, 1, 1),  # (CUDA_NUM_THREADS, 1, 1),
                    grid=(32, 32, 1),  # (GET_BLOCKS(xyz2.numel()), 1, 1),
                    args=[
                        batch_size, num_pts, m,
                        xyz1.data_ptr(),
                        xyz2.data_ptr(),
                        match.data_ptr(),
                        grad2.data_ptr()
                    ],
                    stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))

        return grad1 * grad_cost.view(-1, 1, 1), grad2 * grad_cost.view(-1, 1, 1)
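All of these snippets rely on a handful of helpers that are not shown: load_kernel, Stream, CUDA_NUM_THREADS and GET_BLOCKS (the latter two appear only in comments here and directly in later examples). A minimal sketch of how such helpers are typically defined for CuPy-compiled PyTorch kernels of this era; it assumes an older CuPy release that still provides cupy.cuda.compile_with_cache, and the originals may differ in detail:

import torch
import cupy
from collections import namedtuple
from string import Template

CUDA_NUM_THREADS = 1024  # threads per block

def GET_BLOCKS(N, threads=CUDA_NUM_THREADS):
    # enough 1-D blocks to give every one of the N elements its own thread
    return (N + threads - 1) // threads

# the kernel launches below only need an object exposing a raw stream pointer
Stream = namedtuple('Stream', ['ptr'])

def load_kernel(kernel_name, code, **kwargs):
    # bake compile-time constants (Dtype, sizes, ...) into the source,
    # compile it, and return the named __global__ function
    code = Template(code).substitute(**kwargs)
    module = cupy.cuda.compile_with_cache(code)
    return module.get_function(kernel_name)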
Example #2
    def forward(self, xyz1, xyz2):
        assert xyz1.dim() == 3 and xyz1.is_cuda and xyz2.is_cuda
        assert xyz1.shape[-1] == 3  # as done by Panos
        batch_size, num_pts, pt_dim = xyz1.size()
        _, m, _ = xyz2.size()

        match = torch.zeros(batch_size, m, num_pts).cuda()
        cost = torch.zeros(batch_size, ).cuda()
        temp = torch.zeros(batch_size, 2 * (m + num_pts)).cuda()

        n = xyz1.numel()

        with torch.cuda.device_of(xyz1):
            # 1) get matching
            f = load_kernel('approxmatch', approxmatch_kernel)
            f(
                block=(512, 1, 1),  # (CUDA_NUM_THREADS,1,1),
                grid=(32, 1, 1),  # GET_BLOCKS(n),1,1),
                args=[
                    batch_size, num_pts, m,
                    xyz1.data_ptr(),
                    xyz2.data_ptr(),
                    match.data_ptr(),
                    temp.data_ptr()
                ],
                stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))

            # 2) calculate matching cost
            g = load_kernel('matchcost', matchcost_kernel)
            g(
                block=(512, 1, 1),  # (CUDA_NUM_THREADS, 1, 1),
                grid=(32, 1, 1),  # (GET_BLOCKS(n), 1, 1),
                args=[
                    batch_size, num_pts, m,
                    xyz1.data_ptr(),
                    xyz2.data_ptr(),
                    match.data_ptr(),
                    cost.data_ptr()
                ],
                stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))

        self.xyz1 = xyz1
        self.xyz2 = xyz2
        self.match = match
        self.cost = cost
        self.num_pts = num_pts
        self.m = m
        self.numel = n
        self.batch_size = batch_size
        del temp

        return cost
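Examples #1 and #2 are the backward and forward halves of an old-style torch.autograd.Function (intermediate results are stashed on self rather than on a ctx object); the approxmatch/matchcost kernel names suggest an approximate Earth Mover's Distance used as a point-cloud loss. A minimal usage sketch, with ApproxMatchCost as a hypothetical name for the wrapping class:

import torch

xyz1 = torch.rand(4, 1024, 3, device='cuda', requires_grad=True)
xyz2 = torch.rand(4, 1024, 3, device='cuda', requires_grad=True)

# old-style Functions carry state on self, so use a fresh instance per call
emd = ApproxMatchCost()   # hypothetical class name, not shown in the snippets
cost = emd(xyz1, xyz2)    # shape (4,): one matching cost per batch element
cost.sum().backward()     # invokes the backward from Example #1
print(xyz1.grad.shape)    # torch.Size([4, 1024, 3])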
Example #3
def swap(x):
    assert x.size(-1) == 2
    total = x.numel() // 2
    with torch.cuda.device_of(x):
        f = load_kernel('swap', kernel)
        f(args=[x.data_ptr(), total],
          block=(CUDA_NUM_THREADS, 1, 1),
          grid=(GET_BLOCKS(total), 1, 1),
          stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))
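A minimal usage sketch for swap, assuming the 'swap' kernel (whose source is not shown) exchanges the two entries along the last dimension in place:

import torch

x = torch.arange(8, dtype=torch.float32, device='cuda').view(4, 2)
swap(x)   # kernel runs in place on x
# under the assumption above, each row (a, b) now reads (b, a)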
Example #4
    def backward(ctx, grad_cost):
        xyz1, xyz2, match = ctx.saved_tensors

        batch_size, num_pts, _ = xyz1.size()
        _, m, _ = xyz2.size()

        grad1 = torch.zeros_like(xyz1).cuda()
        grad2 = torch.zeros_like(xyz2).cuda()

        with torch.cuda.device_of(grad_cost):
            if xyz1.requires_grad:
                f = load_kernel('matchcostgrad1', matchcostgrad1_kernel)
                f(
                    block=(512, 1, 1),  # (CUDA_NUM_THREADS, 1, 1),
                    grid=(32, 1, 1),  # (GET_BLOCKS(xyz1.numel()), 1, 1),
                    args=[
                        batch_size, num_pts, m,
                        xyz1.data_ptr(),
                        xyz2.data_ptr(),
                        match.data_ptr(),
                        grad1.data_ptr()
                    ],
                    stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))

            if xyz2.requires_grad:
                g = load_kernel('matchcostgrad2', matchcostgrad2_kernel)
                g(
                    block=(256, 1, 1),  # (CUDA_NUM_THREADS, 1, 1),
                    grid=(32, 32, 1),  # (GET_BLOCKS(xyz2.numel()), 1, 1),
                    args=[
                        batch_size, num_pts, m,
                        xyz1.data_ptr(),
                        xyz2.data_ptr(),
                        match.data_ptr(),
                        grad2.data_ptr()
                    ],
                    stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))

        return grad1 * grad_cost.view(-1, 1, 1), grad2 * grad_cost.view(
            -1, 1, 1)
Example #5
def cuda_correlate(ip, weight, padding, stride, kernel_size, corner_case=False, op_channels=None, dilation=1):
    if corner_case:
        w_transform = torch.flip(weight, torch.arange(weight.dim()).tolist())
    o_channels = weight.size(0)
    ip_unfold = torch.nn.functional.unfold(ip, (kernel_size, kernel_size), padding=padding, stride=stride)
    w_unfold = weight.view(o_channels, -1) 
    
    # setup output
    batch_size, channels, height, width = ip.size()

    kernel_h, kernel_w = weight.size()[2:]
    output_h = int((height + 2 * padding - (dilation * (kernel_h - 1) + 1)) / stride + 1)
    output_w = int((width + 2 * padding - (dilation * (kernel_w - 1) + 1)) / stride + 1)
    output = ip.new(batch_size, weight.size(0), output_h, output_w)
    op_unfold = output.view(ip.size(0), weight.size(0), output_h*output_w).byte()

    # cupy conversion 
    ip_cupy = cp.fromDlpack(to_dlpack(ip_unfold)).astype('int8')
    w_cupy = cp.fromDlpack(to_dlpack(w_unfold)).astype('int8')
    op_cupy = cp.fromDlpack(to_dlpack(op_unfold)).astype('int8')

    # need to pretend like these are transposed, so swapping the values for m, n and k
    # should be: m = ip(1), n = ip(2) k = w(1)
    m = ip_unfold.size(2)
    n = ip_unfold.size(1)
    k = w_unfold.size(0)

    # set these up properly to make this work
    blockSize = 16
    batchNum = ip_unfold.size(0)
    dimBlock = (blockSize, blockSize, 1)
    dimGrid = (int((k + blockSize - 1)/ blockSize), int((m + blockSize - 1)/ blockSize), 1)
    # print("batchNum: ", batchNum, "dimBlock:", dimBlock, "dimGrid:", dimGrid)

    f = load_kernel('gpu_matrix_mult', matmul_kernel)
    f(block=dimBlock, 
      grid=dimGrid,
      args=[ip_cupy.data.ptr, w_cupy.data.ptr, op_cupy.data.ptr, m, n, k],
      stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))

    # multiplication in cupy 
    op = cp.matmul(cp.transpose(ip_cupy, (0,2,1)), cp.transpose(w_cupy))
    op = cp.transpose(op, (0,2,1))

    return op 
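The unfold/matmul structure of cuda_correlate can be reproduced in plain PyTorch, which is convenient for checking the int8 kernel path against a float reference. A sketch (ignoring the corner_case branch):

import torch
import torch.nn.functional as F

def correlate_reference(ip, weight, padding, stride, kernel_size):
    o_channels = weight.size(0)
    ip_unfold = F.unfold(ip, (kernel_size, kernel_size), padding=padding, stride=stride)
    w_unfold = weight.view(o_channels, -1)
    # (B, L, C*k*k) @ (C*k*k, O) -> (B, L, O), then back to (B, O, L)
    out = ip_unfold.transpose(1, 2).matmul(w_unfold.t()).transpose(1, 2)
    return out  # same (B, out_channels, output_h*output_w) layout as op above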
Example #6
def ncrelu_backward(grad_output, mask):
    assert grad_output.get_device() == mask.get_device()
    assert grad_output.is_contiguous()
    n, c, h, w = mask.size()

    with torch.cuda.device_of(grad_output):
        grad_input = grad_output.new(mask.size())
        f = load_kernel('ncrelu_backward', kernels, Dtype=Dtype(grad_output))
        f(args=[
            grad_input.data_ptr(),
            mask.data_ptr(),
            grad_output.data_ptr(), c * h * w,
            mask.numel()
        ],
          block=(CUDA_NUM_THREADS, 1, 1),
          grid=(GET_BLOCKS(mask.numel()), 1, 1),
          stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))
    return grad_input
Example #7
def ncrelu_forward(input):
    assert input.dim() == 4 and input.is_contiguous()
    n, c, h, w = input.size()

    with torch.cuda.device_of(input):
        output = input.new(n, 2 * c, h, w)
        mask = torch.cuda.ByteTensor(input.size())
        f = load_kernel('ncrelu_forward', kernels, Dtype=Dtype(input))
        f(args=[
            output.data_ptr(),
            mask.data_ptr(),
            input.data_ptr(), c * h * w,
            input.numel()
        ],
          block=(CUDA_NUM_THREADS, 1, 1),
          grid=(GET_BLOCKS(input.numel()), 1, 1),
          stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))
    return output, mask
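For reference, NCReLU (negative concatenated ReLU) doubles the channel count by stacking the positive and negative parts of the input, which is what the 2 * c output channels above suggest. A plain PyTorch sketch of that definition, useful for checking the kernel (assuming it follows the standard formulation):

import torch

def ncrelu_reference(x):
    # channels [0, c): positive part; channels [c, 2c): negative part
    return torch.cat([x.clamp(min=0), x.clamp(max=0)], dim=1)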
Example #8
    def forward(self, input, weight):
        assert input.dim() == 4 and input.is_cuda and weight.is_cuda
        batch_size, in_channels, bottom_height, bottom_width = input.size()
        out_channels, _, kernel_h, kernel_w = weight.size()
        print(in_channels, out_channels, batch_size)
        output_h = int((bottom_height + 2 * self.padding[0] -
                        (self.dilation[0] *
                         (kernel_h - 1) + 1)) / self.stride[0] + 1)
        output_w = int((bottom_width + 2 * self.padding[1] -
                        (self.dilation[1] *
                         (kernel_w - 1) + 1)) / self.stride[1] + 1)

        output = input.new(batch_size, out_channels, output_h, output_w)
        n = output.numel()

        with torch.cuda.device_of(input):
            f = load_kernel('conv2d_naive_forward_kernel',
                            _conv2d_naive_kernel,
                            Dtype=Dtype(input),
                            nthreads=n,
                            batch_size=batch_size,
                            in_channels=in_channels,
                            out_channels=out_channels,
                            bottom_height=bottom_height,
                            bottom_width=bottom_width,
                            top_height=output_h,
                            top_width=output_w,
                            kernel_h=kernel_h,
                            kernel_w=kernel_w,
                            stride_h=self.stride[0],
                            stride_w=self.stride[1],
                            dilation_h=self.dilation[0],
                            dilation_w=self.dilation[1],
                            pad_h=self.padding[0],
                            pad_w=self.padding[1])
            f(block=(CUDA_NUM_THREADS, 1, 1),
              grid=(GET_BLOCKS(n), 1, 1),
              args=[input.data_ptr(),
                    weight.data_ptr(),
                    output.data_ptr()],
              stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))

        self.save_for_backward(input, weight)
        return output
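A quick sanity check for such a forward pass is to compare it against torch.nn.functional.conv2d with the same hyperparameters (conv2d computes cross-correlation, which is what a "naive forward" kernel normally implements). A sketch, with Conv2dNaive as a hypothetical name for the wrapping Function:

import torch
import torch.nn.functional as F

x = torch.randn(2, 3, 16, 16, device='cuda')
w = torch.randn(8, 3, 3, 3, device='cuda')

conv = Conv2dNaive(stride=(1, 1), padding=(1, 1), dilation=(1, 1))  # hypothetical
out_kernel = conv(x, w)
out_ref = F.conv2d(x, w, stride=1, padding=1, dilation=1)
print((out_kernel - out_ref).abs().max())  # should be ~0 if the kernel matches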
Example #9
def _col2im(data_col, kernel_size, stride, padding, out=None, input_size=None):
    assert data_col.dim() == 5
    ksize_h, ksize_w = _pair(kernel_size)
    stride_h, stride_w = _pair(stride)
    pad_h, pad_w = _pair(padding)
    n_input_plane, ksize_h, ksize_w, height_col, width_col = data_col.size()
    if input_size is not None:
        height, width = input_size
    else:
        height = (height_col - 1) * stride_h - 2 * pad_h + ksize_h
        width = (width_col - 1) * stride_w - 2 * pad_w + ksize_w
    n = n_input_plane * height * width

    if out is not None:
        assert tuple(out.size()) == (n_input_plane, height, width)
        data = out
    else:
        data = data_col.new(n_input_plane, height, width)

    with torch.cuda.device_of(data_col):
        f = load_kernel('col2im_kernel',
                        _col2im_kernel,
                        Dtype=Dtype(data),
                        n=n,
                        height_col=height_col,
                        width_col=width_col,
                        height=height,
                        width=width,
                        ksize_h=ksize_h,
                        ksize_w=ksize_w,
                        pad_h=pad_h,
                        pad_w=pad_w,
                        stride_h=stride_h,
                        stride_w=stride_w,
                        channels=n_input_plane)
        f(block=(CUDA_NUM_THREADS, 1, 1),
          grid=(GET_BLOCKS(n), 1, 1),
          args=[data_col.data_ptr(), data.data_ptr()],
          stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))
    return data
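_col2im sums overlapping patches back into an image, which is exactly what torch.nn.functional.fold does; a reference sketch for checking it (the snippet works on a single unbatched image, so a batch dimension is added for fold):

import torch
import torch.nn.functional as F

def col2im_reference(data_col, output_size, stride, padding):
    c, kh, kw, hc, wc = data_col.size()
    cols = data_col.reshape(c * kh * kw, hc * wc).unsqueeze(0)  # (1, C*kh*kw, L)
    out = F.fold(cols, output_size, (kh, kw), padding=padding, stride=stride)
    return out[0]  # back to (C, H, W)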
Example #10
def _im2col(data, kernel_size, stride, padding, out=None):
    assert data.dim() == 3 and data.is_cuda
    ksize_h, ksize_w = _pair(kernel_size)
    stride_h, stride_w = _pair(stride)
    pad_h, pad_w = _pair(padding)
    n_input_plane, height, width = data.size()
    height_col = (height + 2 * pad_h - ksize_h) // stride_h + 1
    width_col = (width + 2 * pad_w - ksize_w) // stride_w + 1
    n = n_input_plane * height_col * width_col

    shape = torch.Size(
        (n_input_plane, ksize_h, ksize_w, height_col, width_col))
    if out is not None:
        assert out.size() == shape
        data_col = out
    else:
        data_col = data.new(*shape)

    with torch.cuda.device_of(data):
        f = load_kernel('im2col_kernel',
                        _im2col_kernel,
                        Dtype=Dtype(data),
                        n=n,
                        height_col=height_col,
                        width_col=width_col,
                        height=height,
                        width=width,
                        ksize_h=ksize_h,
                        ksize_w=ksize_w,
                        pad_h=pad_h,
                        pad_w=pad_w,
                        stride_h=stride_h,
                        stride_w=stride_w,
                        channels=n_input_plane)
        f(block=(CUDA_NUM_THREADS, 1, 1),
          grid=(GET_BLOCKS(n), 1, 1),
          args=[data.data_ptr(), data_col.data_ptr()],
          stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))
    return data_col
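Likewise, _im2col matches torch.nn.functional.unfold up to a reshape; a reference sketch (integer stride and padding assumed, for brevity):

import torch
import torch.nn.functional as F

def im2col_reference(data, kernel_size, stride, padding):
    c, height, width = data.size()
    kh, kw = kernel_size
    hc = (height + 2 * padding - kh) // stride + 1
    wc = (width + 2 * padding - kw) // stride + 1
    cols = F.unfold(data.unsqueeze(0), (kh, kw), padding=padding, stride=stride)
    return cols[0].view(c, kh, kw, hc, wc)  # same layout as data_col above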
Example #11
    def forward(self, a, b):

        # same preprocessing as chamfer
        if a.dim() == 4:
            if a.size(1) == 2:
                a = from_polar(a)

            assert a.size(1) == 3
            a = a.permute(0, 2, 3, 1).contiguous().reshape(a.size(0), -1, 3)

        if b.dim() == 4:
            if b.size(1) == 2:
                b = from_polar(b)

            assert b.size(1) == 3
            b = b.permute(0, 2, 3, 1).contiguous().reshape(b.size(0), -1, 3)

        assert a.dim() == b.dim() == 3
        if a.size(-1) != 3:
            assert a.size(-2) == 3
            a = a.transpose(-2, -1).contiguous()

        if b.size(-1) != 3:
            assert b.size(-2) == 3
            b = b.transpose(-2, -1).contiguous()

        xyz1, xyz2 = a, b
        batch_size, num_pts, pt_dim = xyz1.size()
        _, m, _ = xyz2.size()

        match = torch.zeros(batch_size, m, num_pts).cuda()
        cost = torch.zeros(batch_size, ).cuda()
        temp = torch.zeros(batch_size, 2 * (m + num_pts)).cuda()

        n = xyz1.numel()

        with torch.cuda.device_of(xyz1):
            # 1) get matching
            f = load_kernel('approxmatch', approxmatch_kernel)
            f(
                block=(512, 1, 1),  # (CUDA_NUM_THREADS,1,1),
                grid=(32, 1, 1),  # GET_BLOCKS(n),1,1),
                args=[
                    batch_size, num_pts, m,
                    xyz1.data_ptr(),
                    xyz2.data_ptr(),
                    match.data_ptr(),
                    temp.data_ptr()
                ],
                stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))

            # 2) calculate matching cost
            g = load_kernel('matchcost', matchcost_kernel)
            g(
                block=(512, 1, 1),  # (CUDA_NUM_THREADS, 1, 1),
                grid=(32, 1, 1),  # (GET_BLOCKS(n), 1, 1),
                args=[
                    batch_size, num_pts, m,
                    xyz1.data_ptr(),
                    xyz2.data_ptr(),
                    match.data_ptr(),
                    cost.data_ptr()
                ],
                stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))

        self.xyz1 = xyz1
        self.xyz2 = xyz2
        self.match = match
        self.cost = cost
        self.num_pts = num_pts
        self.m = m
        self.numel = n
        self.batch_size = batch_size
        del temp

        return cost
Example #12
    def backward(self, grad_output):
        assert grad_output.is_cuda and grad_output.is_contiguous()
        input, weight = self.saved_tensors

        batch_size, channels, height, width = input.size()
        kernel_h, kernel_w = weight.size()[2:]
        output_h, output_w = grad_output.size()[2:]

        grad_input, grad_weight = None, None

        opt = dict(Dtype=Dtype(grad_output),
                   num=batch_size,
                   channels=channels,
                   bottom_height=height,
                   bottom_width=width,
                   top_height=output_h,
                   top_width=output_w,
                   kernel_h=kernel_h,
                   kernel_w=kernel_w,
                   stride_h=self.stride[0],
                   stride_w=self.stride[1],
                   dilation_h=self.dilation[0],
                   dilation_w=self.dilation[1],
                   pad_h=self.padding[0],
                   pad_w=self.padding[1])

        with torch.cuda.device_of(input):
            if self.needs_input_grad[0]:
                grad_input = input.new(input.size())

                n = grad_input.numel()
                opt['nthreads'] = n

                f = load_kernel('conv2d_dw_backward_grad_input_kernel',
                                _conv2d_depthwise_kernel_backward_grad_input,
                                **opt)
                f(block=(CUDA_NUM_THREADS, 1, 1),
                  grid=(GET_BLOCKS(n), 1, 1),
                  args=[
                      grad_output.data_ptr(),
                      weight.data_ptr(),
                      grad_input.data_ptr()
                  ],
                  stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))

            if self.needs_input_grad[1]:
                weight_buffer = weight.new(channels, kernel_h, kernel_w,
                                           batch_size, output_h, output_w)

                n = weight_buffer.numel()
                opt['nthreads'] = n

                f = load_kernel('conv2d_dw_backward_grad_weight_kernel',
                                _conv2d_depthwise_kernel_backward_grad_weight,
                                **opt)
                f(block=(CUDA_NUM_THREADS, 1, 1),
                  grid=(GET_BLOCKS(n), 1, 1),
                  args=[
                      grad_output.data_ptr(),
                      input.data_ptr(),
                      weight_buffer.data_ptr()
                  ],
                  stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))
                grad_weight = weight_buffer.view(weight.size() +
                                                 (-1, )).sum(-1)

        return grad_input, grad_weight
Example #13
    def backward(self, grad_output):
        assert grad_output.is_cuda and grad_output.is_contiguous()
        input, weight = self.saved_tensors

        batch_size, in_channels, bottom_height, bottom_width = input.size()
        out_channels, _, kernel_h, kernel_w = weight.size()
        top_height, top_width = grad_output.size()[2:]

        grad_input, grad_weight = None, None

        opt = dict(Dtype=Dtype(grad_output),
                   batch_size=batch_size,
                   in_channels=in_channels,
                   out_channels=out_channels,
                   bottom_height=bottom_height,
                   bottom_width=bottom_width,
                   top_height=top_height,
                   top_width=top_width,
                   kernel_h=kernel_h,
                   kernel_w=kernel_w,
                   stride_h=self.stride[0],
                   stride_w=self.stride[1],
                   dilation_h=self.dilation[0],
                   dilation_w=self.dilation[1],
                   pad_h=self.padding[0],
                   pad_w=self.padding[1])

        with torch.cuda.device_of(input):
            if self.needs_input_grad[0]:
                grad_input = input.new(input.size())
                n = grad_input.numel()
                opt['nthreads'] = n
                weight_transposed = weight.permute(1, 0, 2, 3).contiguous()
                f = load_kernel('conv2d_naive_backward_grad_input_kernel',
                                _conv2d_naive_kernel_backward_grad_input,
                                **opt)
                f(block=(CUDA_NUM_THREADS, 1, 1),
                  grid=(GET_BLOCKS(n), 1, 1),
                  args=[
                      grad_output.data_ptr(),
                      weight_transposed.data_ptr(),
                      grad_input.data_ptr()
                  ],
                  stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))
            else:
                grad_input = None

            if self.needs_input_grad[1]:
                weight_buffer = weight.new(out_channels, in_channels, kernel_h,
                                           kernel_w, batch_size, top_height,
                                           top_width)

                n = weight_buffer.numel()
                opt['nthreads'] = n

                f = load_kernel('conv2d_naive_backward_grad_weight_kernel',
                                _conv2d_naive_kernel_backward_grad_weight,
                                **opt)
                f(block=(CUDA_NUM_THREADS, 1, 1),
                  grid=(GET_BLOCKS(n), 1, 1),
                  args=[
                      grad_output.data_ptr(),
                      input.data_ptr(),
                      weight_buffer.data_ptr()
                  ],
                  stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))
                grad_weight = weight_buffer.view(weight.size() +
                                                 (-1, )).sum(-1)
            else:
                grad_weight = None

        return grad_input, grad_weight
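Once both backward passes are in place, torch.autograd.gradcheck is the standard way to validate the hand-written gradients against numerical ones. A sketch in double precision (assuming the templated Dtype supports it), again using the hypothetical Conv2dNaive wrapper:

import torch
from torch.autograd import gradcheck

x = torch.randn(1, 2, 6, 6, device='cuda', dtype=torch.float64, requires_grad=True)
w = torch.randn(3, 2, 3, 3, device='cuda', dtype=torch.float64, requires_grad=True)

func = Conv2dNaive(stride=(1, 1), padding=(1, 1), dilation=(1, 1))  # hypothetical
print(gradcheck(func, (x, w), eps=1e-4, atol=1e-3))  # True if gradients agree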