Example #1
    def forward_gpu(self, inputs):
        img, labels = inputs

        # ------------------
        # INPUT VERIFICATION
        # ------------------
        assert img.flags["C_CONTIGUOUS"]
        assert len(labels.shape) >= 4
        assert img.dtype == cp.float32 or img.dtype == cp.int32
        assert labels.flags["C_CONTIGUOUS"]

        # ----------
        # INITIALIZE
        # ----------

        volumeSize = np.prod(img.shape[-3:])
        blockSize = np.min((CUDA_MAX_THREADS, volumeSize))
        nbPixPerThread = int(math.ceil(volumeSize / float(blockSize)))

        K = int(labels.max() + 1)

        # -------------------------------
        # FIGURE OUT MEANING OF EACH AXIS
        # -------------------------------
        dimension = len(img.shape)
        batch_size, n_classes, outputs, counts, expand_axis = self.initialize_arrays(
            img, dimension, K)
        self.counts = counts

        # TODO: this could be moved outside this function.
        self.code = read_code(GPU_KERNEL)

        # ---
        # PERFORM AVERAGE ON GPU
        # ---

        summation = load_kernel('avg_pooling', self.code)

        # print("labels: ", cp.ravel(labels))
        args = (img, labels.astype(cp.int32), outputs, self.counts, volumeSize,
                n_classes, batch_size, nbPixPerThread, K)

        block = (blockSize,)  # block size = size of one volume (one block per class)
        grid = (np.prod(img.shape[:-3]), batch_size)  # one block per class and per batch element
        summation(grid, block, args)

        if self.divide:
            if expand_axis is not None:
                outputs /= cp.repeat(cp.expand_dims(self.counts, expand_axis),
                                     n_classes, expand_axis)
            else:
                outputs /= self.counts  # TODO: maybe write a kernel for this if cupy doesn't parallelize it;
                # if it does, a new kernel call might cause too much overhead.
        return outputs,
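
For reference, here is a minimal NumPy sketch of what the avg_pooling kernel is assumed to compute: per-label sums over each volume divided by per-label voxel counts. The shapes (batch, channels, D, H, W) for img and (batch, 1, D, H, W) for labels, and the helper name, are illustrative assumptions, not the module's actual layout.

import numpy as np

def label_average_pool_reference(img, labels, K):
    # img: (batch, channels, D, H, W) float volume (assumed layout)
    # labels: (batch, 1, D, H, W) integer labels with values in [0, K)
    batch, channels = img.shape[:2]
    sums = np.zeros((batch, channels, K), dtype=np.float64)
    counts = np.zeros((batch, K), dtype=np.int64)
    for b in range(batch):
        flat_labels = labels[b].reshape(-1)
        counts[b] = np.bincount(flat_labels, minlength=K)
        for c in range(channels):
            # Sum the voxel values that share each label.
            sums[b, c] = np.bincount(flat_labels,
                                     weights=img[b, c].reshape(-1),
                                     minlength=K)
    # Divide by the per-label counts (guarding empty labels), mirroring the
    # `if self.divide:` branch above.
    return sums / np.maximum(counts[:, None, :], 1), counts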
Example #2
    def forward_gpu(self, inputs):
        img, labels = inputs

        volumeSize = np.prod(img.shape[1:])
        blockSizeX = np.min((64, volumeSize))
        blockSizeY = 1
        blockSizeZ = 1

        nbBlocksX = int(math.ceil(volumeSize / float(blockSizeX)))

        K = int(labels.max() + 1)

        outputs = (-np.inf * cp.ones((img.shape[0], K))).astype(img.dtype)
        # Initialize max_indices to -1 so untouched entries can be ignored in the backward pass.
        # This is a bit wasteful; only saving the indices that matter would be better. TODO: look at this later.
        self.max_indices = -cp.ones(outputs.shape, dtype=cp.int32)

        # TODO: this could be moved outside this function, but it needs the information in config ...
        self.code = read_code(GPU_KERNEL)
        kern = load_kernel('max_pooling', self.code)

        args = (img, labels, self.max_indices, volumeSize, img.shape[0], K)

        block = (blockSizeX, blockSizeY, blockSizeZ)  # up to 64 threads per block tile one volume
        grid = (nbBlocksX, img.shape[0], K)  # blocks span the volume, the batch and the classes

        # print("indices before: ", self.max_indices)
        kern(grid, block, shared_mem=blockSizeX, args=args)  # shared_mem is given in bytes
        fill_vals = load_kernel('fill_values', self.code)
        blockSizeX = 16
        blockSizeY = 16
        nbBlocksX = int(math.ceil(img.shape[0] / float(blockSizeX)))
        nbBlocksY = int(math.ceil(K / float(blockSizeY)))
        block = (blockSizeX, blockSizeY)
        grid = (nbBlocksX, nbBlocksY)

        args = (img, self.max_indices, K, img.shape[0], outputs)
        fill_vals(grid, block, args=args)
        # print("indices after: ", self.max_indices)
        return outputs,
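
A plain NumPy analogue of the two-stage max_pooling / fill_values pass may make the intent clearer: first record, per batch element and label, the flat index of the maximum voxel, then gather the corresponding values. The shapes and the helper name are assumptions; the kernels' exact indexing convention may differ.

import numpy as np

def label_max_pool_reference(img, labels, K):
    # img: (batch, D, H, W) float volume; labels: matching integer volume (assumed layout).
    batch = img.shape[0]
    outputs = np.full((batch, K), -np.inf, dtype=img.dtype)
    max_indices = np.full((batch, K), -1, dtype=np.int32)  # -1 marks labels that never occur
    for b in range(batch):
        flat_img = img[b].reshape(-1)
        flat_labels = labels[b].reshape(-1)
        for i, (value, k) in enumerate(zip(flat_img, flat_labels)):
            if value > outputs[b, k]:
                outputs[b, k] = value
                max_indices[b, k] = i  # flat index of the maximum within this volume
    return outputs, max_indices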
Example #3
    def forward_gpu(self, inputs):

        img, labels = inputs

        # ------------------
        # INPUT VERIFICATION
        # ------------------
        assert img.dtype == cp.float32 or img.dtype == cp.int32
        assert labels.dtype == cp.int32 or labels.dtype == cp.int64
        assert len(labels.shape) >= 4

        labels = labels.astype(cp.int32)
        # ----------
        # INITIALIZE
        # ----------

        volumeSize = np.prod(img.shape[-3:])
        blockSize = np.min((CUDA_MAX_THREADS, volumeSize))
        nbPixPerThread = int(math.ceil(volumeSize / float(blockSize)))

        K = int(labels.max() + 1)

        # -------------------------------
        # FIGURE OUT MEANING OF EACH AXIS
        # -------------------------------

        dimension = len(img.shape)

        batch_size, n_classes, outputs = self.initialize_arrays(
            img, dimension, K)
        # Initialize max_indices to -1 so untouched entries can be ignored in the backward pass.
        # This is a bit wasteful; only saving the indices that matter would be better. TODO: look at this later.
        self.max_indices = -cp.ones(outputs.shape, dtype=cp.int32)
        # TODO: this could be moved outside this function, but it needs the information in config ...
        self.code = read_code(GPU_KERNEL)

        # ---
        # PERFORM ARG MAX ON GPU
        # ---
        kern = load_kernel('max_pooling_v2', self.code)
        args = (img, labels, self.max_indices, volumeSize,
                n_classes, batch_size, nbPixPerThread, K)

        block = (blockSize,)  # block size = size of one volume (one block per class)
        grid = (np.prod(img.shape[:-3]), batch_size)  # one block per class and per batch element

        kern(grid, block, args)

        # print("max_indices: ", self.max_indices)
        # print("corresponding labels: ", cp.ravel(labels)[self.max_indices])
        # ---
        # FILL IN CORRESPONDING VALUES
        # ---

        fill_vals = load_kernel('fill_values', self.code)
        blockSizeX = 16
        blockSizeY = CUDA_MAX_THREADS // blockSizeX  # integer division: block dimensions must be ints
        nbBlocksX = int(math.ceil(n_classes / float(blockSizeX)))
        nbBlocksY = int(math.ceil(K / float(blockSizeY)))
        block = (blockSizeX, blockSizeY)
        grid = (nbBlocksX, nbBlocksY, batch_size)

        args = (img, self.max_indices, K, n_classes, batch_size, outputs)
        fill_vals(grid, block, args=args)

        return outputs,
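
The launch arithmetic in these examples is easy to sanity-check by hand. The snippet below assumes CUDA_MAX_THREADS is 1024 (the real constant comes from this module's config) and a 96^3 input volume, purely as an illustration.

import math

CUDA_MAX_THREADS = 1024                 # assumed value, for illustration only
volumeSize = 96 * 96 * 96               # 884736 voxels in a 96^3 volume

blockSize = min(CUDA_MAX_THREADS, volumeSize)         # 1024 threads per block
nbPixPerThread = math.ceil(volumeSize / blockSize)    # 864 voxels handled by each thread

blockSizeX = 16
blockSizeY = CUDA_MAX_THREADS // blockSizeX           # 64; block dimensions must be ints
print(blockSize, nbPixPerThread, blockSizeY)          # -> 1024 864 64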