Example 1
    def axnpby(self, *arr, subdims=None):
        if any(arr[0].traits != x.traits for x in arr[1:]):
            raise ValueError('Incompatible matrix types')

        nv = len(arr)
        nrow, ncol, ldim, dtype = arr[0].traits[1:]
        ncola, ncolb = arr[0].ioshape[1:]

        # Determine the grid/block
        block = (128, 1, 1)
        grid = get_grid_for_block(block, ncolb, nrow)

        # Render the kernel template
        src = self.backend.lookup.get_template('axnpby').render(
            block=block, subdims=subdims or range(ncola), ncola=ncola, nv=nv)

        # Build the kernel
        kern = self._build_kernel('axnpby', src, [np.int32] * 3 +
                                  [np.intp] * nv + [dtype] * nv)

        class AxnpbyKernel(ComputeKernel):
            def run(self, queue, *consts):
                kern.exec_async(grid, block, queue.stream_comp, nrow, ncolb,
                                ldim, *arr, *consts)

        return AxnpbyKernel()
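In NumPy terms, the rendered kernel applies a pointwise linear combination of the input matrices, restricted to the field variables named in subdims along the middle (ncola) axis of the ioshape. The sketch below is an assumed reference for those semantics, not the generated device code:

import numpy as np

# Assumed reference semantics of axnpby (not the generated device code):
# x0 <- c0*x0 + c1*x1 + ... restricted to the subdims field variables.
def axnpby_reference(consts, arrs, subdims):
    sd = list(subdims)
    acc = sum(c*a[:, sd, :] for c, a in zip(consts, arrs))
    arrs[0][:, sd, :] = acc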
Example 2
    def mul(self, a, b, out, alpha=1.0, beta=0.0):
        # Ensure the matrices are compatible
        if a.nrow != out.nrow or a.ncol != b.nrow or b.ncol != out.ncol:
            raise ValueError('Incompatible matrices for out = a*b')

        # Check that A is constant
        if 'const' not in a.tags:
            raise NotSuitableError('GiMMiK requires a constant a matrix')

        # Fetch the matrix and tally up the number of non-zeros
        arr = a.get()
        nnz, nuq = np.count_nonzero(arr), len(np.unique(np.abs(arr)))

        # Check that A is suitable
        if nuq > 28 and nnz / arr.size > 0.15:
            raise NotSuitableError('Matrix inappropriate for GiMMiK')

        # Determine the grid/block
        block = (128, 1, 1)
        grid = get_grid_for_block(block, b.ncol)

        # Generate
        src = generate_mm(a.get(), dtype=a.dtype, platform='hip',
                          alpha=alpha, beta=beta)

        # Build
        fun = self._build_kernel('gimmik_mm', src,
                                 [np.int32, np.intp]*2 + [np.int32])

        class MulKernel(Kernel):
            def run(self, queue):
                fun.exec_async(grid, block, queue.stream, b.ncol, b, b.leaddim,
                               out, out.leaddim)

        return MulKernel()
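GiMMiK emits a bespoke kernel with the entries of the constant operator matrix baked into the generated source, which is why a must be tagged const and be sparse with few unique values. A minimal NumPy sketch of the assumed GEMM semantics:

import numpy as np

# Assumed reference semantics of the generated GiMMiK kernel:
# out = alpha*(a @ b) + beta*out, with `a` embedded as compile-time constants.
def gimmik_reference(a, b, out, alpha=1.0, beta=0.0):
    out[:] = alpha*(a @ b) + beta*out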
Example 3
    def reduction(self, *rs, method, norm, dt_mat=None):
        if any(r.traits != rs[0].traits for r in rs[1:]):
            raise ValueError('Incompatible matrix types')

        hip = self.backend.hip
        nrow, ncol, ldim, dtype = rs[0].traits[1:]
        ncola, ncolb = rs[0].ioshape[1:]

        # Reduction block dimensions
        block = (128, 1, 1)

        # Determine the grid size
        grid = get_grid_for_block(block, ncolb, ncola)

        # Empty result buffer on the device
        reduced_dev = hip.mem_alloc(ncola * grid[0] * rs[0].itemsize)

        # Empty result buffer on the host
        reduced_host = hip.pagelocked_empty((ncola, grid[0]), dtype)

        tplargs = dict(norm=norm, blocksz=block[0], method=method)

        if method == 'resid':
            tplargs['dt_type'] = 'matrix' if dt_mat else 'scalar'

        # Get the kernel template
        src = self.backend.lookup.get_template('reduction').render(**tplargs)

        regs = (list(rs) + [dt_mat]) if dt_mat else rs

        # Argument types for reduction kernel
        if method == 'errest':
            argt = [np.int32] * 3 + [np.intp] * 4 + [dtype] * 2
        elif method == 'resid' and dt_mat:
            argt = [np.int32] * 3 + [np.intp] * 4 + [dtype]
        else:
            argt = [np.int32] * 3 + [np.intp] * 3 + [dtype]

        # Build the reduction kernel
        rkern = self._build_kernel('reduction', src, argt)

        # Norm type
        reducer = np.max if norm == 'uniform' else np.sum

        class ReductionKernel(ComputeKernel):
            @property
            def retval(self):
                return reducer(reduced_host, axis=1)

            def run(self, queue, *facs):
                rkern.exec_async(grid, block, queue.stream_comp, nrow, ncolb,
                                 ldim, reduced_dev, *regs, *facs)
                hip.memcpy_async(reduced_host, reduced_dev, reduced_dev.nbytes,
                                 queue.stream_comp)

        return ReductionKernel()
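The reduction is two-stage: the device kernel writes one partial value per thread block and field variable into reduced_dev, and retval collapses the block axis on the host once the asynchronous copy has completed. A small sketch with stand-in partials, following the (ncola, grid[0]) layout allocated above:

import numpy as np

# Host-side completion of the two-stage reduction with stand-in partials.
ncola, nblocks = 4, 16
reduced_host = np.random.rand(ncola, nblocks)  # one partial per block/variable

uniform = np.max(reduced_host, axis=1)  # norm == 'uniform': max over blocks
summed = np.sum(reduced_host, axis=1)   # otherwise: sum over blocks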
Example 4
    def pack(self, mv):
        hip = self.backend.hip

        # An exchange view is simply a regular view plus an exchange matrix
        m, v = mv.xchgmat, mv.view

        # Render the kernel template
        src = self.backend.lookup.get_template('pack').render()

        # Build
        kern = self._build_kernel('pack_view', src, [np.int32]*3 + [np.intp]*4)

        # Compute the grid and thread-block size
        block = (128, 1, 1)
        grid = get_grid_for_block(block, v.n)

        # If MPI is HIP aware then we just need to pack the buffer
        if self.backend.mpitype == 'hip-aware':
            class PackXchgViewKernel(ComputeKernel):
                def run(self, queue):
                    scomp = queue.hip_stream_comp

                    # Pack
                    kern.exec_async(
                        grid, block, scomp, v.n, v.nvrow, v.nvcol, v.basedata,
                        v.mapping, v.rstrides or 0, m
                    )
        # Otherwise, we need to both pack the buffer and copy it back
        else:
            # Create a HIP event
            event = hip.create_event()

            class PackXchgViewKernel(ComputeKernel):
                def run(self, queue):
                    scomp = queue.hip_stream_comp
                    scopy = queue.hip_stream_copy

                    # Pack
                    kern.exec_async(
                        grid, block, scomp, v.n, v.nvrow, v.nvcol, v.basedata,
                        v.mapping, v.rstrides or 0, m
                    )

                    # Copy the packed buffer to the host
                    event.record(scomp)
                    scopy.wait_for_event(event)
                    hip.memcpy_async(m.hdata, m.data, m.nbytes, scopy)

        return PackXchgViewKernel()
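In the non-HIP-aware branch the event orders work across the two streams: the copy stream must not start the device-to-host transfer until the pack kernel on the compute stream has finished. A minimal sketch of that pattern, treating the stream and event handles as assumptions:

# Stream-ordering sketch (hypothetical handles, not the PyFR API):
def pack_then_copy(launch_pack, hip, scomp, scopy, event, m):
    launch_pack(scomp)                                  # pack on compute stream
    event.record(scomp)                                 # mark pack completion
    scopy.wait_for_event(event)                         # copy stream waits on it
    hip.memcpy_async(m.hdata, m.data, m.nbytes, scopy)  # then device-to-host copy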
Example 5
    def errest(self, x, y, z, *, norm):
        if not (x.traits == y.traits == z.traits):
            raise ValueError('Incompatible matrix types')

        hip = self.backend.hip
        nrow, ncol, ldim, dtype = x.traits
        ncola, ncolb = x.ioshape[1:]

        # Reduction block dimensions
        block = (128, 1, 1)

        # Determine the grid size
        grid = get_grid_for_block(block, ncolb, ncola)

        # Empty result buffer on the device
        err_dev = hip.mem_alloc(ncola*grid[0]*x.itemsize)

        # Empty result buffer on the host
        err_host = hip.pagelocked_empty((ncola, grid[0]), dtype)

        # Get the kernel template
        src = self.backend.lookup.get_template('errest').render(
            norm=norm, sharesz=block[0]
        )

        # Build the reduction kernel
        rkern = self._build_kernel(
            'errest', src, [np.int32]*3 + [np.intp]*4 + [dtype]*2
        )

        # Norm type
        reducer = np.max if norm == 'uniform' else np.sum

        class ErrestKernel(ComputeKernel):
            @property
            def retval(self):
                return reducer(err_host, axis=1)

            def run(self, queue, atol, rtol):
                rkern.exec_async(grid, block, queue.hip_stream_comp, nrow,
                                 ncolb, ldim, err_dev, x, y, z, atol, rtol)
                hip.memcpy_async(err_host, err_dev, err_dev.nbytes,
                                 queue.hip_stream_comp)

        return ErrestKernel()
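A hypothetical usage sketch follows; the provider and queue objects are assumptions, and in practice run is invoked by the backend's queue machinery rather than called directly:

# Hypothetical usage (`provider`, `queue`, `x`, `y`, `z` are assumed names):
kern = provider.errest(x, y, z, norm='uniform')
kern.run(queue, 1e-6, 1e-6)  # launch the kernel and the async copy of partials
# ...after synchronising the compute stream...
err = kern.retval            # one error value per field variable, shape (ncola,)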