Example #1
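The methods in these examples are `forward_gpu`/`backward_gpu` implementations of custom `chainer.Function` subclasses that pool feature volumes per label region using hand-written CUDA kernels. They assume module-level imports and two small helpers; the sketch below shows plausible definitions based on how they are called, using CuPy's `RawModule` API. `CUDA_MAX_THREADS`, the `GPU_KERNEL` path, and the helper bodies are assumptions, not the original source.

import math

import numpy as np
import cupy as cp
import chainer

CUDA_MAX_THREADS = 1024  # assumed per-block thread limit for the target GPU
GPU_KERNEL = "pooling_kernels.cu"  # hypothetical path to the CUDA source used below


def read_code(path):
    # Assumed helper: return the raw CUDA source as a string.
    with open(path) as f:
        return f.read()


def load_kernel(kernel_name, code):
    # Assumed helper: compile the source and fetch one __global__ kernel by name.
    return cp.RawModule(code=code).get_function(kernel_name)
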
    def backward_gpu(self, inputs, grad_outputs):

        img, labels = inputs
        grad_in = cp.zeros_like(img)

        K = int(labels.max() + 1)
        dimension = len(img.shape)
        batch_size, n_classes, _ = self.initialize_arrays(img, dimension, K)

        blockSizeX = 32
        blockSizeY = min(CUDA_MAX_THREADS // 32, n_classes)
        nbBlocksX = int(math.ceil(K / float(blockSizeX)))
        nbBlocksY = int(math.ceil(n_classes / float(blockSizeY)))

        kern = load_kernel('bw_max_pooling', self.code)
        args = (grad_in, img, self.max_indices, n_classes, grad_outputs[0], K,
                batch_size)
        # block size = size of one row in the labels volume
        block = (blockSizeX, blockSizeY)
        grid = (nbBlocksX, nbBlocksY, batch_size)
        kern(grid, block, args=args)
        # A gradient must be returned for every forward input, so a dummy
        # zero gradient is returned for the labels.
        return grad_in, cp.zeros_like(labels)
Example #2
    def forward_gpu(self, inputs):
        img, labels = inputs

        volumeSize = np.prod(img.shape[1:])
        blockSizeX = np.min((64, volumeSize))
        blockSizeY = 1
        blockSizeZ = 1

        nbBlocksX = int(math.ceil(volumeSize / float(blockSizeX)))

        K = int(labels.max() + 1)

        outputs = (-np.inf * cp.ones((img.shape[0], K))).astype(img.dtype)
        # Initialize as -1 so negative indices can be ignored in the backward pass.
        # This is a bit wasteful; saving only the ones that matter would be better.
        # TODO: look at this later.
        self.max_indices = -cp.ones(outputs.shape, dtype=cp.int32)

        # TODO: should be able to be moved outside this function, but it needs
        # the information in config.
        self.code = read_code(GPU_KERNEL)
        kern = load_kernel('max_pooling', self.code)

        args = (img, labels, self.max_indices, volumeSize, img.shape[0], K)

        # block size = size of one volume (one block per class)
        block = (blockSizeX, blockSizeY, blockSizeZ)
        grid = (nbBlocksX, img.shape[0], K)

        # print("indices before: ", self.max_indices)
        kern(grid, block, shared_mem=blockSizeX, args=args)
        fill_vals = load_kernel('fill_values', self.code)
        blockSizeX = 16
        blockSizeY = 16
        nbBlocksX = int(math.ceil(img.shape[0] / float(blockSizeX)))
        nbBlocksY = int(math.ceil(K / float(blockSizeY)))
        block = (blockSizeX, blockSizeY)
        grid = (nbBlocksX, nbBlocksY)

        args = (img, self.max_indices, K, img.shape[0], outputs)
        fill_vals(grid, block, args=args)
        # print("indices after: ", self.max_indices)
        return outputs,
Example #3
    def forward_gpu(self, inputs):
        img, labels = inputs

        # ------------------
        # INPUT VERIFICATION
        # ------------------
        assert img.flags["C_CONTIGUOUS"]
        assert len(labels.shape) >= 4
        assert img.dtype == cp.float32 or img.dtype == cp.int32
        assert (labels.flags["C_CONTIGUOUS"])

        # ----------
        # INITIALIZE
        # ----------

        volumeSize = np.prod(img.shape[-3:])
        blockSize = np.min((CUDA_MAX_THREADS, volumeSize))
        nbPixPerThread = int(math.ceil(volumeSize / float(blockSize)))

        K = int(labels.max() + 1)

        # -------------------------------
        # FIGURE OUT MEANING OF EACH AXIS
        # -------------------------------
        dimension = len(img.shape)
        batch_size, n_classes, outputs, counts, expand_axis = self.initialize_arrays(
            img, dimension, K)
        self.counts = counts

        # TODO: should be able to be moved outside this function.
        self.code = read_code(GPU_KERNEL)

        # ---
        # PERFORM AVERAGE ON GPU
        # ---

        summation = load_kernel('avg_pooling', self.code)

        # print("labels: ", cp.ravel(labels))
        args = (img, labels.astype(cp.int32), outputs, self.counts, volumeSize,
                n_classes, batch_size, nbPixPerThread, K)

        # block size = size of one volume (one block per class)
        block = (blockSize,)
        grid = (np.prod(img.shape[:-3]), batch_size)  # 1 block for each class
        summation(grid, block, args)

        if self.divide:
            if expand_axis is not None:
                outputs /= cp.repeat(cp.expand_dims(self.counts, expand_axis),
                                     n_classes, expand_axis)
            else:
                # TODO: maybe write a kernel for this if CuPy does not parallelize
                # the division; if it does, an extra kernel call might add too
                # much overhead.
                outputs /= self.counts
        return outputs,
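Per (batch element, channel), the avg_pooling kernel plus the final division amounts to a per-label mean over the volume. Assuming that is the intended semantics, a slow but equivalent pure-CuPy reference for a single channel of a single sample could be:

def label_average_reference(vol, lab, K):
    # vol: (D, H, W) float32 volume; lab: (D, H, W) int32 labels in [0, K).
    sums = cp.bincount(lab.ravel(), weights=vol.ravel(), minlength=K)
    counts = cp.bincount(lab.ravel(), minlength=K)
    return sums / cp.maximum(counts, 1)  # avoid division by zero for empty labels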
Example #4
    def backward_gpu(self, inputs, grad_outputs):

        # print("backprop running")
        # print("number of gradients: ", len(grad_outputs))
        # for gr in grad_outputs:
        #     print("output grads to propagate: ", cp.where(gr != 0))

        img, labels = inputs
        assert grad_outputs[0].dtype == cp.float32
        grad_in = cp.zeros_like(img)

        K = int(labels.max() + 1)
        volumeSize = np.prod(img.shape[-3:])

        dimension = len(img.shape)
        batch_size, n_classes, _, _, _ = self.initialize_arrays(
            img, dimension, K)

        # print("forward pass -------------")
        # print("batch_size: ", batch_size)
        # print("n_classes: ", n_classes)
        # print("outputs: \n", outputs)
        # print("tileCounts: ", tileCounts)
        # print("counts: ", self.counts)

        blockSizeX = 32
        blockSizeY = min(CUDA_MAX_THREADS // 32, n_classes)
        blockSizeZ = 1

        nbBlocksX = int(math.ceil(volumeSize / float(blockSizeX)))
        nbBlocksY = int(math.ceil(n_classes / float(blockSizeY)))
        nbBlocksZ = int(math.ceil(batch_size / float(blockSizeZ)))

        kern = load_kernel('bw_avg_pooling', self.code)

        args = (grad_in, self.counts, labels.astype(cp.int32), grad_outputs[0],
                K, volumeSize, n_classes, batch_size, chainer.config.train)
        block = (blockSizeX, blockSizeY, blockSizeZ)
        grid = (nbBlocksX, nbBlocksY, nbBlocksZ)
        kern(grid, block, args=args)
        # A gradient must be returned for every forward input, so a dummy
        # zero gradient is returned for the labels.
        return grad_in, cp.zeros_like(labels)
Example #5
    def backward_gpu(self, inputs, grad_outputs):

        img, labels = inputs
        grad_in = cp.zeros_like(img)

        K = int(labels.max() + 1)
        blockSizeX = 32
        blockSizeY = min(CUDA_MAX_THREADS // 32, img.shape[0])
        nbBlocksX = int(math.ceil(K / float(blockSizeX)))
        nbBlocksY = int(math.ceil(img.shape[0] / float(blockSizeY)))

        kern = load_kernel('bw_max_pooling', self.code)
        # print("before bw: ", self.max_indices)
        args = (grad_in, img, self.max_indices, K * img.shape[0],
                grad_outputs[0], K)
        # block size = size of one row in the labels volume
        block = (blockSizeX, blockSizeY)
        grid = (nbBlocksX, nbBlocksY)
        kern(grid, block, args=args)
        # A gradient must be returned for every forward input, so a dummy
        # zero gradient is returned for the labels.
        return grad_in, cp.zeros_like(labels)
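For max pooling, only the voxel that won the forward argmax receives gradient. Assuming self.max_indices stores flat offsets into img (with -1 marking labels that had no winning pixel), an equivalent pure-CuPy sketch is:

def label_max_backward_reference(grad_out, max_indices, img_shape):
    # grad_out / max_indices: same shape, one entry per (sample, label);
    # max_indices holds flat offsets into img, -1 where a label had no winner.
    grad_in = cp.zeros(int(np.prod(img_shape)), dtype=cp.float32)
    valid = max_indices >= 0
    grad_in[max_indices[valid]] = grad_out[valid]
    return grad_in.reshape(img_shape)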
Example #6
    def forward_gpu(self, inputs):

        img, labels = inputs

        # ------------------
        # INPUT VERIFICATION
        # ------------------
        assert img.dtype == cp.float32 or img.dtype == cp.int32
        assert labels.dtype == cp.int32 or labels.dtype == cp.int64
        assert len(labels.shape) >= 4

        labels = labels.astype(cp.int32)
        # ----------
        # INITIALIZE
        # ----------

        volumeSize = np.prod(img.shape[-3:])
        blockSize = np.min((CUDA_MAX_THREADS, volumeSize))
        nbPixPerThread = int(math.ceil(volumeSize / float(blockSize)))

        K = int(labels.max() + 1)

        # -------------------------------
        # FIGURE OUT MEANING OF EACH AXIS
        # -------------------------------

        dimension = len(img.shape)

        batch_size, n_classes, outputs = self.initialize_arrays(
            img, dimension, K)
        # Initialize as -1 so negative indices can be ignored in the backward pass.
        # This is a bit wasteful; saving only the ones that matter would be better.
        # TODO: look at this later.
        self.max_indices = -cp.ones(outputs.shape, dtype=cp.int32)
        # TODO: should be able to be moved outside this function, but it needs
        # the information in config.
        self.code = read_code(GPU_KERNEL)

        # ---
        # PERFORM ARG MAX ON GPU
        # ---
        kern = load_kernel('max_pooling_v2', self.code)
        # labels were already cast to int32 above, so no extra copy is needed here.
        args = (img, labels, self.max_indices, volumeSize,
                n_classes, batch_size, nbPixPerThread, K)

        # block size = size of one volume (one block per class)
        block = (blockSize,)
        grid = (np.prod(img.shape[:-3]), batch_size)  # 1 block for each class

        kern(grid, block, args)

        # print("max_indices: ", self.max_indices)
        # print("corresponding labels: ", cp.ravel(labels)[self.max_indices])
        # ---
        # FILL IN CORRESPONDING VALUES
        # ---

        fill_vals = load_kernel('fill_values', self.code)
        blockSizeX = 16
        blockSizeY = CUDA_MAX_THREADS // blockSizeX
        nbBlocksX = int(math.ceil(n_classes / float(blockSizeX)))
        nbBlocksY = int(math.ceil(K / float(blockSizeY)))
        block = (blockSizeX, blockSizeY)
        grid = (nbBlocksX, nbBlocksY, batch_size)

        args = (img, self.max_indices, K, n_classes, batch_size, outputs)
        fill_vals(grid, block, args=args)

        return outputs,
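As a rough usage sketch, a Function built from these methods is applied like any old-style chainer.Function. The class name LabelMaxPooling and the tensor shapes below are placeholders, not taken from the original code:

x = chainer.Variable(cp.random.rand(2, 3, 8, 8, 8).astype(cp.float32))  # (N, C, D, H, W)
labels = cp.random.randint(0, 5, size=(2, 1, 8, 8, 8), dtype=cp.int32)  # per-voxel region ids
y = LabelMaxPooling()(x, labels)   # hypothetical Function subclass holding forward_gpu/backward_gpu
y.grad = cp.ones_like(y.array)
y.backward()                       # runs backward_gpu and fills x.grad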