def backward(self, grad_cost):
    xyz1 = self.xyz1
    xyz2 = self.xyz2
    match = self.match
    cost = self.cost
    m = self.m
    numel = self.numel
    num_pts = self.num_pts
    batch_size = self.batch_size

    grad1 = torch.zeros_like(xyz1).cuda()
    grad2 = torch.zeros_like(xyz2).cuda()

    with torch.cuda.device_of(grad_cost):
        if xyz1.requires_grad:
            f = load_kernel('matchcostgrad1', matchcostgrad1_kernel)
            f(block=(512, 1, 1),  # (CUDA_NUM_THREADS, 1, 1),
              grid=(32, 1, 1),  # (GET_BLOCKS(xyz1.numel()), 1, 1),
              args=[batch_size, num_pts, m,
                    xyz1.data_ptr(), xyz2.data_ptr(),
                    match.data_ptr(), grad1.data_ptr()],
              stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))

        if xyz2.requires_grad:
            g = load_kernel('matchcostgrad2', matchcostgrad2_kernel)
            g(block=(256, 1, 1),  # (CUDA_NUM_THREADS, 1, 1),
              grid=(32, 32, 1),  # (GET_BLOCKS(xyz2.numel()), 1, 1),
              args=[batch_size, num_pts, m,
                    xyz1.data_ptr(), xyz2.data_ptr(),
                    match.data_ptr(), grad2.data_ptr()],
              stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))

    return (grad1 * grad_cost.view(-1, 1, 1),
            grad2 * grad_cost.view(-1, 1, 1))
def forward(self, xyz1, xyz2):
    assert xyz1.dim() == 3 and xyz1.is_cuda and xyz2.is_cuda
    assert xyz1.shape[-1] == 3  # as done by Panos

    batch_size, num_pts, pt_dim = xyz1.size()
    _, m, _ = xyz2.size()

    match = torch.zeros(batch_size, m, num_pts).cuda()
    cost = torch.zeros(batch_size, ).cuda()
    temp = torch.zeros(batch_size, 2 * (m + num_pts)).cuda()
    n = xyz1.numel()

    with torch.cuda.device_of(xyz1):
        # 1) get matching
        f = load_kernel('approxmatch', approxmatch_kernel)
        f(block=(512, 1, 1),  # (CUDA_NUM_THREADS, 1, 1),
          grid=(32, 1, 1),  # (GET_BLOCKS(n), 1, 1),
          args=[batch_size, num_pts, m,
                xyz1.data_ptr(), xyz2.data_ptr(),
                match.data_ptr(), temp.data_ptr()],
          stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))

        # 2) calculate matching cost
        g = load_kernel('matchcost', matchcost_kernel)
        g(block=(512, 1, 1),  # (CUDA_NUM_THREADS, 1, 1),
          grid=(32, 1, 1),  # (GET_BLOCKS(n), 1, 1),
          args=[batch_size, num_pts, m,
                xyz1.data_ptr(), xyz2.data_ptr(),
                match.data_ptr(), cost.data_ptr()],
          stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))

    # stash everything the legacy-style backward needs
    self.xyz1 = xyz1
    self.xyz2 = xyz2
    self.match = match
    self.cost = cost
    self.num_pts = num_pts
    self.m = m
    self.numel = n
    self.batch_size = batch_size

    del temp
    return cost
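# A dense PyTorch reference for step 2) is handy when sanity-checking the
# matchcost kernel on small inputs.  This is a sketch, assuming the kernel
# accumulates match-weighted Euclidean distances between the two clouds; if it
# actually works with squared distances, square the cdist result instead.
def matchcost_reference(xyz1, xyz2, match):
    # xyz1: (B, N, 3), xyz2: (B, M, 3), match: (B, M, N)
    d = torch.cdist(xyz2, xyz1)          # pairwise distances, (B, M, N)
    return (match * d).sum(dim=(1, 2))   # one cost value per batch element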
def swap(x):
    # launches the `swap` kernel with one thread per (..., 2) pair, in place
    assert x.size(-1) == 2
    total = x.numel() // 2
    with torch.cuda.device_of(x):
        f = load_kernel('swap', kernel)
        f(args=[x.data_ptr(), total],
          block=(CUDA_NUM_THREADS, 1, 1),
          grid=(GET_BLOCKS(total), 1, 1),
          stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))
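# The launchers in this file rely on a few helpers that are defined elsewhere
# in the repository: CUDA_NUM_THREADS, GET_BLOCKS, load_kernel, Stream and
# Dtype.  Below is a minimal sketch of the usual CuPy-based versions, kept as
# a comment so it does not shadow the real definitions; treat it as an
# assumption (newer CuPy releases replace compile_with_cache with
# cupy.RawModule).
#
# from collections import namedtuple
# from string import Template
# import cupy
# import torch
#
# Stream = namedtuple('Stream', ['ptr'])
# CUDA_NUM_THREADS = 1024
#
# def GET_BLOCKS(N):
#     # 1-D grid just large enough to cover N threads
#     return (N + CUDA_NUM_THREADS - 1) // CUDA_NUM_THREADS
#
# def Dtype(t):
#     # C type string substituted into the kernel source template
#     return 'double' if t.dtype == torch.float64 else 'float'
#
# @cupy.memoize(for_each_device=True)
# def load_kernel(kernel_name, code, **kwargs):
#     # fill in template parameters, compile, and fetch the __global__ function
#     code = Template(code).substitute(**kwargs)
#     return cupy.cuda.compile_with_cache(code).get_function(kernel_name)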
def backward(ctx, grad_cost):
    xyz1, xyz2, match = ctx.saved_tensors
    batch_size, num_pts, _ = xyz1.size()
    _, m, _ = xyz2.size()

    grad1 = torch.zeros_like(xyz1).cuda()
    grad2 = torch.zeros_like(xyz2).cuda()

    with torch.cuda.device_of(grad_cost):
        if xyz1.requires_grad:
            f = load_kernel('matchcostgrad1', matchcostgrad1_kernel)
            f(block=(512, 1, 1),  # (CUDA_NUM_THREADS, 1, 1),
              grid=(32, 1, 1),  # (GET_BLOCKS(xyz1.numel()), 1, 1),
              args=[batch_size, num_pts, m,
                    xyz1.data_ptr(), xyz2.data_ptr(),
                    match.data_ptr(), grad1.data_ptr()],
              stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))

        if xyz2.requires_grad:
            g = load_kernel('matchcostgrad2', matchcostgrad2_kernel)
            g(block=(256, 1, 1),  # (CUDA_NUM_THREADS, 1, 1),
              grid=(32, 32, 1),  # (GET_BLOCKS(xyz2.numel()), 1, 1),
              args=[batch_size, num_pts, m,
                    xyz1.data_ptr(), xyz2.data_ptr(),
                    match.data_ptr(), grad2.data_ptr()],
              stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))

    return (grad1 * grad_cost.view(-1, 1, 1),
            grad2 * grad_cost.view(-1, 1, 1))
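# The backward above uses the ctx-based autograd.Function interface: the
# forward stores tensors with ctx.save_for_backward, and the backward reads
# them back from ctx.saved_tensors and broadcasts the per-batch incoming
# gradient with .view(-1, 1, 1).  A self-contained miniature of the same
# pattern, using a toy squared-distance cost rather than the EMD kernels:
class _ToyCost(torch.autograd.Function):

    @staticmethod
    def forward(ctx, xyz1, xyz2):
        ctx.save_for_backward(xyz1, xyz2)
        return ((xyz1 - xyz2) ** 2).sum(dim=(1, 2))  # one value per batch item

    @staticmethod
    def backward(ctx, grad_cost):
        xyz1, xyz2 = ctx.saved_tensors
        g = 2 * (xyz1 - xyz2) * grad_cost.view(-1, 1, 1)
        return g, -g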
def cuda_correlate(ip, weight, padding, stride, kernel_size,
                   corner_case=False, op_channels=None, dilation=1):
    if corner_case:
        # flipped weights for the convolution (rather than correlation) case;
        # note that w_transform is currently not used below
        w_transform = torch.flip(weight, torch.arange(weight.dim()).tolist())

    o_channels = weight.size(0)
    ip_unfold = torch.nn.functional.unfold(ip, (kernel_size, kernel_size),
                                           padding=padding, stride=stride)
    w_unfold = weight.view(o_channels, -1)

    # set up the output tensor
    batch_size, channels, height, width = ip.size()
    kernel_h, kernel_w = weight.size()[2:]
    output_h = int((height + 2 * padding - (dilation * (kernel_h - 1) + 1)) / stride + 1)
    output_w = int((width + 2 * padding - (dilation * (kernel_w - 1) + 1)) / stride + 1)
    output = ip.new(batch_size, weight.size(0), output_h, output_w)
    op_unfold = output.view(ip.size(0), weight.size(0), output_h * output_w).byte()

    # convert to CuPy arrays via DLPack
    ip_cupy = cp.fromDlpack(to_dlpack(ip_unfold)).astype('int8')
    w_cupy = cp.fromDlpack(to_dlpack(w_unfold)).astype('int8')
    op_cupy = cp.fromDlpack(to_dlpack(op_unfold)).astype('int8')

    # need to pretend these are transposed, so swap the values for m, n and k;
    # should be: m = ip(1), n = ip(2), k = w(1)
    m = ip_unfold.size(2)
    n = ip_unfold.size(1)
    k = w_unfold.size(0)

    # launch configuration for the custom matmul kernel
    blockSize = 16
    batchNum = ip_unfold.size(0)
    dimBlock = (blockSize, blockSize, 1)
    dimGrid = (int((k + blockSize - 1) / blockSize),
               int((m + blockSize - 1) / blockSize), 1)
    # print("batchNum: ", batchNum, "dimBlock:", dimBlock, "dimGrid:", dimGrid)

    f = load_kernel('gpu_matrix_mult', matmul_kernel)
    f(block=dimBlock, grid=dimGrid,
      args=[ip_cupy.data.ptr, w_cupy.data.ptr, op_cupy.data.ptr, m, n, k],
      stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))

    # multiplication in CuPy; the returned result comes from this matmul,
    # while the kernel's output in op_cupy is left unused
    op = cp.matmul(cp.transpose(ip_cupy, (0, 2, 1)), cp.transpose(w_cupy))
    op = cp.transpose(op, (0, 2, 1))
    return op
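# For small inputs, cuda_correlate can be checked against the standard
# unfold-then-matmul formulation in plain PyTorch.  This is a sketch and not
# part of the original module: it works in floating point rather than int8
# and passes dilation through to unfold, but produces the same
# (batch, out_channels, out_h * out_w) layout as `op` above.
def correlate_reference(ip, weight, padding, stride, kernel_size, dilation=1):
    cols = torch.nn.functional.unfold(ip, (kernel_size, kernel_size),
                                      dilation=dilation, padding=padding,
                                      stride=stride)       # (B, C*k*k, L)
    return weight.view(weight.size(0), -1) @ cols           # (B, O, L)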
def ncrelu_backward(grad_output, mask):
    assert grad_output.get_device() == mask.get_device()
    assert grad_output.is_contiguous()
    n, c, h, w = mask.size()
    with torch.cuda.device_of(grad_output):
        grad_input = grad_output.new(mask.size())
        f = load_kernel('ncrelu_backward', kernels, Dtype=Dtype(grad_output))
        f(args=[grad_input.data_ptr(), mask.data_ptr(), grad_output.data_ptr(),
                c * h * w, mask.numel()],
          block=(CUDA_NUM_THREADS, 1, 1),
          grid=(GET_BLOCKS(mask.numel()), 1, 1),
          stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))
    return grad_input
def ncrelu_forward(input):
    assert input.dim() == 4 and input.is_contiguous()
    n, c, h, w = input.size()
    with torch.cuda.device_of(input):
        output = input.new(n, 2 * c, h, w)
        mask = torch.cuda.ByteTensor(input.size())
        f = load_kernel('ncrelu_forward', kernels, Dtype=Dtype(input))
        f(args=[output.data_ptr(), mask.data_ptr(), input.data_ptr(),
                c * h * w, input.numel()],
          block=(CUDA_NUM_THREADS, 1, 1),
          grid=(GET_BLOCKS(input.numel()), 1, 1),
          stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))
    return output, mask
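# A quick reference for checking ncrelu_forward on small tensors, assuming the
# kernel implements the usual "negative concatenated ReLU": the first C output
# channels hold the positive part of the input and the last C channels hold
# the negative part.  That semantics is an assumption, not something taken
# from the kernel source.
def ncrelu_reference(x):
    return torch.cat([x.clamp(min=0), x.clamp(max=0)], dim=1)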
def forward(self, input, weight):
    assert input.dim() == 4 and input.is_cuda and weight.is_cuda
    batch_size, in_channels, bottom_height, bottom_width = input.size()
    out_channels, _, kernel_h, kernel_w = weight.size()
    print(in_channels, out_channels, batch_size)  # debug output

    output_h = int((bottom_height + 2 * self.padding[0] -
                    (self.dilation[0] * (kernel_h - 1) + 1)) / self.stride[0] + 1)
    output_w = int((bottom_width + 2 * self.padding[1] -
                    (self.dilation[1] * (kernel_w - 1) + 1)) / self.stride[1] + 1)
    output = input.new(batch_size, out_channels, output_h, output_w)
    n = output.numel()

    with torch.cuda.device_of(input):
        f = load_kernel('conv2d_naive_forward_kernel', _conv2d_naive_kernel,
                        Dtype=Dtype(input), nthreads=n,
                        batch_size=batch_size,
                        in_channels=in_channels, out_channels=out_channels,
                        bottom_height=bottom_height, bottom_width=bottom_width,
                        top_height=output_h, top_width=output_w,
                        kernel_h=kernel_h, kernel_w=kernel_w,
                        stride_h=self.stride[0], stride_w=self.stride[1],
                        dilation_h=self.dilation[0], dilation_w=self.dilation[1],
                        pad_h=self.padding[0], pad_w=self.padding[1])
        f(block=(CUDA_NUM_THREADS, 1, 1),
          grid=(GET_BLOCKS(n), 1, 1),
          args=[input.data_ptr(), weight.data_ptr(), output.data_ptr()],
          stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))

    self.save_for_backward(input, weight)
    return output
def _col2im(data_col, kernel_size, stride, padding, out=None, input_size=None):
    assert data_col.dim() == 5
    ksize_h, ksize_w = _pair(kernel_size)
    stride_h, stride_w = _pair(stride)
    pad_h, pad_w = _pair(padding)
    n_input_plane, ksize_h, ksize_w, height_col, width_col = data_col.size()

    if input_size is not None:
        height, width = input_size
    else:
        height = (height_col - 1) * stride_h - 2 * pad_h + ksize_h
        width = (width_col - 1) * stride_w - 2 * pad_w + ksize_w
    n = n_input_plane * height * width

    if out is not None:
        assert tuple(out.size()) == (n_input_plane, height, width)
        data = out
    else:
        data = data_col.new(n_input_plane, height, width)

    with torch.cuda.device_of(data_col):
        f = load_kernel('col2im_kernel', _col2im_kernel, Dtype=Dtype(data),
                        n=n, height_col=height_col, width_col=width_col,
                        height=height, width=width,
                        ksize_h=ksize_h, ksize_w=ksize_w,
                        pad_h=pad_h, pad_w=pad_w,
                        stride_h=stride_h, stride_w=stride_w,
                        channels=n_input_plane)
        f(block=(CUDA_NUM_THREADS, 1, 1),
          grid=(GET_BLOCKS(n), 1, 1),
          args=[data_col.data_ptr(), data.data_ptr()],
          stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))
    return data
def _im2col(data, kernel_size, stride, padding, out=None):
    assert data.dim() == 3 and data.is_cuda
    ksize_h, ksize_w = _pair(kernel_size)
    stride_h, stride_w = _pair(stride)
    pad_h, pad_w = _pair(padding)
    n_input_plane, height, width = data.size()

    height_col = (height + 2 * pad_h - ksize_h) // stride_h + 1
    width_col = (width + 2 * pad_w - ksize_w) // stride_w + 1
    n = n_input_plane * height_col * width_col
    shape = torch.Size(
        (n_input_plane, ksize_h, ksize_w, height_col, width_col))

    if out is not None:
        assert out.size() == shape
        data_col = out
    else:
        data_col = data.new(*shape)

    with torch.cuda.device_of(data):
        f = load_kernel('im2col_kernel', _im2col_kernel, Dtype=Dtype(data),
                        n=n, height_col=height_col, width_col=width_col,
                        height=height, width=width,
                        ksize_h=ksize_h, ksize_w=ksize_w,
                        pad_h=pad_h, pad_w=pad_w,
                        stride_h=stride_h, stride_w=stride_w,
                        channels=n_input_plane)
        f(block=(CUDA_NUM_THREADS, 1, 1),
          grid=(GET_BLOCKS(n), 1, 1),
          args=[data.data_ptr(), data_col.data_ptr()],
          stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))
    return data_col
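# _im2col and _col2im above mirror torch.nn.functional.unfold / fold applied
# to a single (C, H, W) image, with the columns laid out as a
# (C, kh, kw, out_h, out_w) tensor.  The two sketches below are reference
# implementations for checking the kernels on small inputs; they are not part
# of the original module.
def _im2col_reference(data, kernel_size, stride, padding):
    c, h, w = data.size()
    kh, kw = _pair(kernel_size)
    sh, sw = _pair(stride)
    ph, pw = _pair(padding)
    out_h = (h + 2 * ph - kh) // sh + 1
    out_w = (w + 2 * pw - kw) // sw + 1
    cols = torch.nn.functional.unfold(data.unsqueeze(0), (kh, kw),
                                      padding=(ph, pw), stride=(sh, sw))
    return cols.view(c, kh, kw, out_h, out_w)


def _col2im_reference(data_col, stride, padding, input_size):
    c, kh, kw, out_h, out_w = data_col.size()
    cols = data_col.reshape(1, c * kh * kw, out_h * out_w)
    # fold sums overlapping patches, matching col2im's accumulation
    return torch.nn.functional.fold(cols, input_size, (kh, kw),
                                    padding=_pair(padding),
                                    stride=_pair(stride))[0]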
def forward(self, a, b):
    # same preprocessing as chamfer: accept 4-D (B, C, H, W) inputs and
    # flatten them to (B, N, 3) point clouds
    if a.dim() == 4:
        if a.size(1) == 2:
            a = from_polar(a)
        assert a.size(1) == 3
        a = a.permute(0, 2, 3, 1).contiguous().reshape(a.size(0), -1, 3)
    if b.dim() == 4:
        if b.size(1) == 2:
            b = from_polar(b)
        assert b.size(1) == 3
        b = b.permute(0, 2, 3, 1).contiguous().reshape(b.size(0), -1, 3)

    assert a.dim() == b.dim() == 3
    if a.size(-1) != 3:
        assert a.size(-2) == 3
        a = a.transpose(-2, -1).contiguous()
    if b.size(-1) != 3:
        assert b.size(-2) == 3
        b = b.transpose(-2, -1).contiguous()

    xyz1, xyz2 = a, b
    batch_size, num_pts, pt_dim = xyz1.size()
    _, m, _ = xyz2.size()

    match = torch.zeros(batch_size, m, num_pts).cuda()
    cost = torch.zeros(batch_size, ).cuda()
    temp = torch.zeros(batch_size, 2 * (m + num_pts)).cuda()
    n = xyz1.numel()

    with torch.cuda.device_of(xyz1):
        # 1) get matching
        f = load_kernel('approxmatch', approxmatch_kernel)
        f(block=(512, 1, 1),  # (CUDA_NUM_THREADS, 1, 1),
          grid=(32, 1, 1),  # (GET_BLOCKS(n), 1, 1),
          args=[batch_size, num_pts, m,
                xyz1.data_ptr(), xyz2.data_ptr(),
                match.data_ptr(), temp.data_ptr()],
          stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))

        # 2) calculate matching cost
        g = load_kernel('matchcost', matchcost_kernel)
        g(block=(512, 1, 1),  # (CUDA_NUM_THREADS, 1, 1),
          grid=(32, 1, 1),  # (GET_BLOCKS(n), 1, 1),
          args=[batch_size, num_pts, m,
                xyz1.data_ptr(), xyz2.data_ptr(),
                match.data_ptr(), cost.data_ptr()],
          stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))

    # stash everything the legacy-style backward needs
    self.xyz1 = xyz1
    self.xyz2 = xyz2
    self.match = match
    self.cost = cost
    self.num_pts = num_pts
    self.m = m
    self.numel = n
    self.batch_size = batch_size

    del temp
    return cost
def backward(self, grad_output):
    assert grad_output.is_cuda and grad_output.is_contiguous()
    input, weight = self.saved_tensors
    batch_size, channels, height, width = input.size()
    kernel_h, kernel_w = weight.size()[2:]
    output_h, output_w = grad_output.size()[2:]
    grad_input, grad_weight = None, None

    opt = dict(Dtype=Dtype(grad_output),
               num=batch_size, channels=channels,
               bottom_height=height, bottom_width=width,
               top_height=output_h, top_width=output_w,
               kernel_h=kernel_h, kernel_w=kernel_w,
               stride_h=self.stride[0], stride_w=self.stride[1],
               dilation_h=self.dilation[0], dilation_w=self.dilation[1],
               pad_h=self.padding[0], pad_w=self.padding[1])

    with torch.cuda.device_of(input):
        if self.needs_input_grad[0]:
            grad_input = input.new(input.size())
            n = grad_input.numel()
            opt['nthreads'] = n
            f = load_kernel('conv2d_dw_backward_grad_input_kernel',
                            _conv2d_depthwise_kernel_backward_grad_input, **opt)
            f(block=(CUDA_NUM_THREADS, 1, 1),
              grid=(GET_BLOCKS(n), 1, 1),
              args=[grad_output.data_ptr(), weight.data_ptr(),
                    grad_input.data_ptr()],
              stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))

        if self.needs_input_grad[1]:
            weight_buffer = weight.new(channels, kernel_h, kernel_w,
                                       batch_size, output_h, output_w)
            n = weight_buffer.numel()
            opt['nthreads'] = n
            f = load_kernel('conv2d_dw_backward_grad_weight_kernel',
                            _conv2d_depthwise_kernel_backward_grad_weight, **opt)
            f(block=(CUDA_NUM_THREADS, 1, 1),
              grid=(GET_BLOCKS(n), 1, 1),
              args=[grad_output.data_ptr(), input.data_ptr(),
                    weight_buffer.data_ptr()],
              stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))
            # the kernel writes one partial product per (batch, output location);
            # summing over the trailing dimension yields grad_weight
            grad_weight = weight_buffer.view(weight.size() + (-1, )).sum(-1)

    return grad_input, grad_weight
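# Depthwise convolution is equivalent to torch.nn.functional.conv2d with
# groups equal to the number of channels, which gives a convenient reference
# for the forward result and, via autograd, for the gradients computed above.
# A sketch, not part of the original module:
def _depthwise_reference(input, weight, stride, padding, dilation):
    return torch.nn.functional.conv2d(input, weight, stride=stride,
                                      padding=padding, dilation=dilation,
                                      groups=input.size(1))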
def backward(self, grad_output):
    assert grad_output.is_cuda and grad_output.is_contiguous()
    input, weight = self.saved_tensors
    batch_size, in_channels, bottom_height, bottom_width = input.size()
    out_channels, _, kernel_h, kernel_w = weight.size()
    top_height, top_width = grad_output.size()[2:]
    grad_input, grad_weight = None, None

    opt = dict(Dtype=Dtype(grad_output),
               batch_size=batch_size,
               in_channels=in_channels, out_channels=out_channels,
               bottom_height=bottom_height, bottom_width=bottom_width,
               top_height=top_height, top_width=top_width,
               kernel_h=kernel_h, kernel_w=kernel_w,
               stride_h=self.stride[0], stride_w=self.stride[1],
               dilation_h=self.dilation[0], dilation_w=self.dilation[1],
               pad_h=self.padding[0], pad_w=self.padding[1])

    with torch.cuda.device_of(input):
        if self.needs_input_grad[0]:
            grad_input = input.new(input.size())
            n = grad_input.numel()
            opt['nthreads'] = n
            weight_transposed = weight.permute(1, 0, 2, 3).contiguous()
            f = load_kernel('conv2d_naive_backward_grad_input_kernel',
                            _conv2d_naive_kernel_backward_grad_input, **opt)
            f(block=(CUDA_NUM_THREADS, 1, 1),
              grid=(GET_BLOCKS(n), 1, 1),
              args=[grad_output.data_ptr(), weight_transposed.data_ptr(),
                    grad_input.data_ptr()],
              stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))
        else:
            grad_input = None

        if self.needs_input_grad[1]:
            weight_buffer = weight.new(out_channels, in_channels,
                                       kernel_h, kernel_w,
                                       batch_size, top_height, top_width)
            n = weight_buffer.numel()
            opt['nthreads'] = n
            f = load_kernel('conv2d_naive_backward_grad_weight_kernel',
                            _conv2d_naive_kernel_backward_grad_weight, **opt)
            f(block=(CUDA_NUM_THREADS, 1, 1),
              grid=(GET_BLOCKS(n), 1, 1),
              args=[grad_output.data_ptr(), input.data_ptr(),
                    weight_buffer.data_ptr()],
              stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))
            grad_weight = weight_buffer.view(weight.size() + (-1, )).sum(-1)
        else:
            grad_weight = None

    return grad_input, grad_weight
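# The naive kernels implement a plain dense cross-correlation, so the
# gradients produced above can be compared on small tensors against autograd
# run through torch.nn.functional.conv2d.  Sketch of the reference side of
# such a check; this helper is illustrative and not part of the original
# module:
def _conv2d_reference_grads(input, weight, stride, padding, dilation):
    x = input.detach().clone().requires_grad_(True)
    w = weight.detach().clone().requires_grad_(True)
    torch.nn.functional.conv2d(x, w, stride=stride, padding=padding,
                               dilation=dilation).sum().backward()
    return x.grad, w.grad  # compare with grad_input / grad_weight above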