def backward_gpu(self, inputs, grad_outputs):
    """Backward pass of batched GPU max pooling.

    Scatters the incoming output gradients back to the input pixels that
    produced each per-class maximum (recorded in ``self.max_indices`` by the
    forward pass).

    Args:
        inputs: ``(img, labels)`` pair as received by the forward pass.
        grad_outputs: gradients w.r.t. the forward outputs; only
            ``grad_outputs[0]`` is used.

    Returns:
        ``(grad_in, zeros_like(labels))`` — the second element exists only to
        match the arity/shapes of the forward arguments.
    """
    img, labels = inputs
    grad_in = cp.zeros_like(img)
    K = int(labels.max() + 1)
    dimension = len(img.shape)
    batch_size, n_classes, _ = self.initialize_arrays(img, dimension, K)

    blockSizeX = 32
    # BUGFIX: use floor division — ``/`` is true division in Python 3 and
    # would produce a float block dimension, breaking the kernel launch.
    blockSizeY = min(CUDA_MAX_THREADS // 32, n_classes)
    nbBlocksX = int(math.ceil(K / float(blockSizeX)))
    nbBlocksY = int(math.ceil(n_classes / float(blockSizeY)))

    kern = load_kernel('bw_max_pooling', self.code)
    args = (grad_in, img, self.max_indices, n_classes, grad_outputs[0], K,
            batch_size)
    # block size = size of one row in the labels volume
    block = (blockSizeX, blockSizeY)
    grid = (nbBlocksX, nbBlocksY, batch_size)
    kern(grid, block, args=args)

    # Second argument needs to be returned to match shapes of arguments in
    # forward and backward passes.
    return grad_in, cp.zeros_like(labels)
def forward_gpu(self, inputs):
    """Forward pass: per-class max pooling over each sample on the GPU (v1).

    Computes, for each sample and each label class ``k`` in ``[0, K)``, the
    maximum image value among the voxels labelled ``k``, and records the flat
    index of each maximum in ``self.max_indices`` for the backward pass.

    Args:
        inputs: ``(img, labels)`` — image batch and integer label volume.

    Returns:
        1-tuple with a ``(batch, K)`` array of per-class maxima.
    """
    img, labels = inputs
    volumeSize = np.prod(img.shape[1:])
    # Cast to a plain int: CUDA block dimensions must be Python ints, and
    # np.min returns a numpy scalar.
    blockSizeX = int(np.min((64, volumeSize)))
    blockSizeY = 1
    blockSizeZ = 1
    nbBlocksX = int(math.ceil(volumeSize / float(blockSizeX)))
    K = int(labels.max() + 1)

    # NOTE(review): casting -inf into an integer dtype is undefined — confirm
    # img.dtype is floating point when this path is taken.
    outputs = (-np.inf * cp.ones((img.shape[0], K))).astype(img.dtype)
    # Initialize as -1 so negative values can be ignored in backward pass.
    # (A previous cp.zeros(img.size) initialization was dead code — it was
    # immediately overwritten here — and has been removed.)
    # This is a bit wasteful, only saving the ones that matter is better,
    # TODO: look at this later
    self.max_indices = -cp.ones(outputs.shape, dtype=cp.int32)

    # TODO: Should be able to be moved outside this function. But it needs
    # the information in config ...
    self.code = read_code(GPU_KERNEL)
    kern = load_kernel('max_pooling', self.code)
    args = (img, labels, self.max_indices, volumeSize, img.shape[0], K)
    # block size = size of one volume (one block per class)
    block = (blockSizeX, blockSizeY, blockSizeZ)
    grid = (nbBlocksX, img.shape[0], K)
    # NOTE(review): shared_mem is specified in bytes; one byte per thread
    # looks suspicious if the kernel stages 4-byte values — confirm against
    # the kernel source.
    kern(grid, block, shared_mem=blockSizeX, args=args)

    # Second launch: gather the values at the recorded argmax indices into
    # the (batch, K) output array.
    fill_vals = load_kernel('fill_values', self.code)
    blockSizeX = 16
    blockSizeY = 16
    nbBlocksX = int(math.ceil(img.shape[0] / float(blockSizeX)))
    nbBlocksY = int(math.ceil(K / float(blockSizeY)))
    block = (blockSizeX, blockSizeY)
    grid = (nbBlocksX, nbBlocksY)
    args = (img, self.max_indices, K, img.shape[0], outputs)
    fill_vals(grid, block, args=args)
    return outputs,
def forward_gpu(self, inputs): img, labels = inputs # ------------------ # INPUT VERIFICATION # ------------------ assert img.flags["C_CONTIGUOUS"] assert len(labels.shape) >= 4 assert img.dtype == cp.float32 or img.dtype == cp.int32 assert (labels.flags["C_CONTIGUOUS"]) # ---------- # INITIALIZE # ---------- volumeSize = np.prod(img.shape[-3:]) blockSize = np.min((CUDA_MAX_THREADS, volumeSize)) nbPixPerThread = int(math.ceil(volumeSize / float(blockSize))) K = int(labels.max() + 1) # ------------------------------- # FIGURE OUT MEANING OF EACH AXIS # ------------------------------- dimension = len(img.shape) batch_size, n_classes, outputs, counts, expand_axis = self.initialize_arrays( img, dimension, K) self.counts = counts self.code = read_code( GPU_KERNEL ) # TODO: Should be able to be moved outside this function. # # --- # # PERFORM AVERAGE ON GPU # # --- summation = load_kernel('avg_pooling', self.code) # print("labels: ", cp.ravel(labels)) args = (img, labels.astype(cp.int32), outputs, self.counts, volumeSize, n_classes, batch_size, nbPixPerThread, K) block = (blockSize, ) # block size = size of one volume (one block per class) grid = (np.prod(img.shape[:-3]), batch_size) # 1 block for each class summation(grid, block, args) if self.divide: if expand_axis is not None: outputs /= cp.repeat(cp.expand_dims(self.counts, expand_axis), n_classes, expand_axis) else: outputs /= self.counts # TODO maybe write kernel for this if it seems that cupy doesn't parallellize this. # If it does, a new call to kernel might cause too much overhead. return outputs,
def backward_gpu(self, inputs, grad_outputs):
    """Backward pass of GPU average pooling.

    Distributes each class's output gradient back over the input voxels that
    carry that label, scaled by the per-class counts computed in the forward
    pass (``self.counts``).

    Args:
        inputs: ``(img, labels)`` pair as received by the forward pass.
        grad_outputs: gradients w.r.t. the forward outputs; only
            ``grad_outputs[0]`` (must be float32) is used.

    Returns:
        ``(grad_in, zeros_like(labels))`` — the second element exists only to
        match the arity/shapes of the forward arguments.
    """
    img, labels = inputs
    assert grad_outputs[0].dtype == cp.float32
    grad_in = cp.zeros_like(img)
    K = int(labels.max() + 1)
    volumeSize = np.prod(img.shape[-3:])
    dimension = len(img.shape)
    batch_size, n_classes, _, _, _ = self.initialize_arrays(
        img, dimension, K)

    blockSizeX = 32
    # BUGFIX: use floor division — ``/`` is true division in Python 3 and
    # would produce a float block dimension, breaking the kernel launch.
    blockSizeY = min(CUDA_MAX_THREADS // 32, n_classes)
    blockSizeZ = 1
    nbBlocksX = int(math.ceil(volumeSize / float(blockSizeX)))
    nbBlocksY = int(math.ceil(n_classes / float(blockSizeY)))
    nbBlocksZ = int(math.ceil(batch_size / float(blockSizeZ)))

    kern = load_kernel('bw_avg_pooling', self.code)
    # NOTE(review): chainer.config.train is a Python bool passed straight to
    # the kernel — confirm the kernel's parameter type matches.
    args = (grad_in, self.counts, labels.astype(cp.int32), grad_outputs[0],
            K, volumeSize, n_classes, batch_size, chainer.config.train)
    block = (blockSizeX, blockSizeY, blockSizeZ)
    grid = (nbBlocksX, nbBlocksY, nbBlocksZ)
    kern(grid, block, args=args)

    # Second argument needs to be returned to match shapes of arguments in
    # forward and backward passes.
    return grad_in, cp.zeros_like(labels)
def backward_gpu(self, inputs, grad_outputs):
    """Backward pass of GPU max pooling (non-batched v1 variant).

    Routes each class's output gradient to the single input pixel recorded
    as the argmax in ``self.max_indices`` during the forward pass.

    Args:
        inputs: ``(img, labels)`` pair as received by the forward pass.
        grad_outputs: gradients w.r.t. the forward outputs; only
            ``grad_outputs[0]`` is used.

    Returns:
        ``(grad_in, zeros_like(labels))`` — the second element exists only to
        match the arity/shapes of the forward arguments.
    """
    img, labels = inputs
    grad_in = cp.zeros_like(img)
    K = int(labels.max() + 1)

    blockSizeX = 32
    # BUGFIX: use floor division — ``/`` is true division in Python 3 and
    # would produce a float block dimension, breaking the kernel launch.
    blockSizeY = min(CUDA_MAX_THREADS // 32, img.shape[0])
    nbBlocksX = int(math.ceil(K / float(blockSizeX)))
    nbBlocksY = int(math.ceil(img.shape[0] / float(blockSizeY)))

    kern = load_kernel('bw_max_pooling', self.code)
    args = (grad_in, img, self.max_indices, K * img.shape[0],
            grad_outputs[0], K)
    # block size = size of one row in the labels volume
    block = (blockSizeX, blockSizeY)
    grid = (nbBlocksX, nbBlocksY)
    kern(grid, block, args=args)

    # Second argument needs to be returned to match shapes of arguments in
    # forward and backward passes.
    return grad_in, cp.zeros_like(labels)
def forward_gpu(self, inputs):
    """Forward pass: batched per-class max pooling on the GPU (v2 kernel).

    Runs an argmax kernel per (batch sample, class), recording the winning
    flat index of each class maximum in ``self.max_indices``, then gathers
    the corresponding image values into the output array.

    Args:
        inputs: ``(img, labels)`` — image batch (last three axes are the
            spatial volume) and an int32/int64 label volume of rank >= 4.

    Returns:
        1-tuple with the pooled output array from ``initialize_arrays``.
    """
    img, labels = inputs
    # ------------------
    # INPUT VERIFICATION
    # ------------------
    assert img.dtype == cp.float32 or img.dtype == cp.int32
    assert labels.dtype == cp.int32 or labels.dtype == cp.int64
    assert len(labels.shape) >= 4
    labels = labels.astype(cp.int32)

    # ----------
    # INITIALIZE
    # ----------
    volumeSize = np.prod(img.shape[-3:])
    # Plain int: CUDA block dims must be Python ints (np.min gives a numpy scalar).
    blockSize = int(np.min((CUDA_MAX_THREADS, volumeSize)))
    nbPixPerThread = int(math.ceil(volumeSize / float(blockSize)))
    K = int(labels.max() + 1)

    # -------------------------------
    # FIGURE OUT MEANING OF EACH AXIS
    # -------------------------------
    dimension = len(img.shape)
    batch_size, n_classes, outputs = self.initialize_arrays(
        img, dimension, K)
    # Initialize as -1 so negative values can be ignored in backward pass.
    # This is a bit wasteful, only saving the ones that matter is better,
    # TODO: look at this later
    self.max_indices = -cp.ones(outputs.shape, dtype=cp.int32)
    # TODO: Should be able to be moved outside this function. But it needs
    # the information in config ...
    self.code = read_code(GPU_KERNEL)

    # ---
    # PERFORM ARG MAX ON GPU
    # ---
    kern = load_kernel('max_pooling_v2', self.code)
    # labels was already converted to int32 above; the redundant astype that
    # used to sit here was dropped.
    args = (img, labels, self.max_indices, volumeSize, n_classes,
            batch_size, nbPixPerThread, K)
    # block size = size of one volume (one block per class)
    block = (blockSize, )
    # 1 block for each class
    grid = (int(np.prod(img.shape[:-3])), batch_size)
    kern(grid, block, args)

    # ---
    # FILL IN CORRESPONDING VALUES
    # ---
    fill_vals = load_kernel('fill_values', self.code)
    blockSizeX = 16
    # BUGFIX: use floor division — ``/`` is true division in Python 3 and
    # would produce a float block dimension, breaking the kernel launch.
    blockSizeY = CUDA_MAX_THREADS // blockSizeX
    nbBlocksX = int(math.ceil(n_classes / float(blockSizeX)))
    nbBlocksY = int(math.ceil(K / float(blockSizeY)))
    block = (blockSizeX, blockSizeY)
    grid = (nbBlocksX, nbBlocksY, batch_size)
    args = (img, self.max_indices, K, n_classes, batch_size, outputs)
    fill_vals(grid, block, args=args)
    return outputs,