Example #1
    def _from_device(self):
        # Remember the host array's WRITEABLE flag so it can be restored below.
        flag = self._data.flags['WRITEABLE']
        maybe_setflags(self._data, write=True)
        if self.state is DeviceDataMixin.DEVICE:
            # The device copy is authoritative: read it back into the host
            # array and convert to array-of-structs layout if required.
            self._device_data.get(_queue, self._data)
            self._data = self._maybe_to_aos(self._data)
            self.state = DeviceDataMixin.BOTH
        maybe_setflags(self._data, write=flag)
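
Note: _from_device relies on a small host/device coherence protocol: the host
array is normally read-only, and the data lives on the host, on the device, or
on both. A minimal self-contained sketch of that protocol, assuming stand-in
names (CoherentData, copy_back and the HOST/DEVICE/BOTH sentinels are
illustrative, not part of the code above):

    import numpy as np

    # Illustrative state sentinels; identity comparison mirrors the
    # `self.state is DeviceDataMixin.DEVICE` check above.
    HOST, DEVICE, BOTH = object(), object(), object()

    def maybe_setflags(array, write):
        # Toggle WRITEABLE only on arrays that own their data; a view's
        # flag cannot be changed directly, so skip it silently.
        if array.flags['OWNDATA']:
            array.setflags(write=write)

    class CoherentData(object):
        def __init__(self, data):
            self._data = data
            self.state = HOST   # the host copy starts out authoritative

        def from_device(self, copy_back):
            # Same shape as _from_device above: unlock the host array,
            # refresh it if the device holds the newer copy, then restore
            # the original WRITEABLE flag.
            flag = self._data.flags['WRITEABLE']
            maybe_setflags(self._data, write=True)
            if self.state is DEVICE:
                copy_back(self._data)   # e.g. a device-to-host buffer read
                self.state = BOTH
            maybe_setflags(self._data, write=flag)

    d = CoherentData(np.zeros(10))
    d.state = DEVICE                    # pretend the device copy is newer
    d.from_device(lambda host: host.fill(1.0))
    assert d.state is BOTH and d._data[0] == 1.0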
Example #2
    def _compute(self, part):
        conf = self.launch_configuration()

        if self._is_indirect:
            _plan = Plan(
                part,
                *self._unwound_args,
                partition_size=conf["partition_size"],
                matrix_coloring=self._requires_matrix_coloring
            )
            conf["local_memory_size"] = _plan.nshared
            conf["ninds"] = _plan.ninds
            conf["work_group_size"] = min(_max_work_group_size, conf["partition_size"])
            conf["work_group_count"] = _plan.nblocks
        conf["warpsize"] = _warpsize
        conf["op2stride"] = self._it_space.size

        fun = JITModule(self.kernel, self.it_space, *self.args, parloop=self, conf=conf)

        args = []
        for arg in self._unique_args:
            arg.data._allocate_device()
            if arg.access is not device.WRITE:
                arg.data._to_device()

        for a in self._unique_dat_args:
            args.append(a.data.array.data)

        for a in self._all_global_non_reduction_args:
            args.append(a.data._array.data)

        for a in self._all_global_reduction_args:
            a.data._allocate_reduction_array(conf["work_group_count"])
            args.append(a.data._d_reduc_array.data)

        for cst in Const._definitions():
            args.append(cst._array.data)

        for m in self._unique_matrix:
            args.append(m._dev_array.data)
            m._to_device()
            args.append(m._rowptr.data)
            args.append(m._colidx.data)

        for m in self._matrix_entry_maps:
            m._to_device()
            args.append(m._device_values.data)

        if self._is_direct:
            args.append(np.int32(part.size))
            args.append(np.int32(part.offset))
            fun(conf["thread_count"], conf["work_group_size"], *args)
        else:
            args.append(np.int32(part.size))
            args.append(np.int32(part.offset))
            args.append(_plan.ind_map.data)
            args.append(_plan.loc_map.data)
            args.append(_plan.ind_sizes.data)
            args.append(_plan.ind_offs.data)
            args.append(_plan.blkmap.data)
            args.append(_plan.offset.data)
            args.append(_plan.nelems.data)
            args.append(_plan.nthrcol.data)
            args.append(_plan.thrcol.data)

            block_offset = 0
            # Placeholder for the per-colour block offset, patched via
            # args[-1] in the loop below.
            args.append(0)
            for i in range(_plan.ncolors):
                blocks_per_grid = int(_plan.ncolblk[i])
                threads_per_block = min(_max_work_group_size, conf["partition_size"])
                thread_count = threads_per_block * blocks_per_grid

                args[-1] = np.int32(block_offset)
                fun(int(thread_count), int(threads_per_block), *args)
                block_offset += blocks_per_grid

        # Mark data that was not accessed READ as dirty: the device now
        # holds the authoritative copy.
        for arg in self.args:
            if arg.access is not READ:
                arg.data.state = DeviceDataMixin.DEVICE
            if arg._is_dat:
                maybe_setflags(arg.data._data, write=False)

        for a in self._all_global_reduction_args:
            a.data._post_kernel_reduction_task(conf["work_group_count"], a.access)
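
Note: the indirect branch above launches the kernel once per colour, sizing
each launch from the plan's per-colour block counts. A standalone sketch of
just that arithmetic (launch and ncolblk stand in for the JITModule call and
the plan attribute used above):

    def launch_by_colour(launch, ncolblk, partition_size, max_wg_size):
        # Blocks of one colour touch disjoint data and may run
        # concurrently; distinct colours must run back to back, hence one
        # kernel launch per colour.
        threads_per_block = min(max_wg_size, partition_size)
        block_offset = 0
        for blocks_per_grid in ncolblk:
            thread_count = threads_per_block * int(blocks_per_grid)
            launch(int(thread_count), int(threads_per_block), block_offset)
            block_offset += int(blocks_per_grid)

    # Three colours with 3, 2 and 1 blocks each: the offsets passed to the
    # kernel are the running totals 0, 3 and 5.
    launched = []
    launch_by_colour(lambda *a: launched.append(a), [3, 2, 1], 128, 256)
    assert [a[2] for a in launched] == [0, 3, 5]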
Example #3
    def compute(self):
        if self._has_soa:
            op2stride = Const(1, self._it_space.size, name='op2stride',
                              dtype='int32')
        arglist = [np.int32(self._it_space.size)]
        config = self.launch_configuration()
        fun = JITModule(self.kernel, self.it_space.extents, *self.args, parloop=self, config=config)

        if self._is_direct:
            _args = self.args
            block_size = config['block_size']
            max_grid_size = config['grid_size']
            shared_size = config['required_smem']
        else:
            _args = self._unique_args
            maxbytes = sum([a.dtype.itemsize * a.data.cdim
                            for a in self._unwound_args if a._is_indirect])
            # shared memory as reported by the device, divided by some
            # factor.  This is the same calculation as done inside
            # op_plan_core, but without assuming 48K shared memory.
            # It would be much nicer if we could tell op_plan_core "I
            # have X bytes shared memory"
            part_size = (_AVAILABLE_SHARED_MEMORY // (64 * maxbytes)) * 64
            self._plan = Plan(self.kernel, self._it_space.iterset,
                              *self._unwound_args,
                              partition_size=part_size)
            max_grid_size = self._plan.ncolblk.max()

        for arg in _args:
            if arg._is_mat:
                d = arg.data._lmadata.gpudata
                offset = arg.data._lmaoffset(self._it_space.iterset)
                arglist.append(np.intp(d))
                arglist.append(np.int32(offset))
            else:
                arg.data._allocate_device()
                if arg.access is not op2.WRITE:
                    arg.data._to_device()
                karg = arg.data._device_data
                if arg._is_global_reduction:
                    arg.data._allocate_reduction_buffer(max_grid_size,
                                                        arg.access)
                    karg = arg.data._reduction_buffer
                arglist.append(np.intp(karg.gpudata))

        if self._is_direct:
            _stream.synchronize()
            fun(max_grid_size, block_size, _stream, *arglist,
                shared_size=shared_size)
            for arg in self.args:
                if arg._is_global_reduction:
                    arg.data._finalise_reduction_begin(max_grid_size, arg.access)
                    arg.data._finalise_reduction_end(max_grid_size, arg.access)
                else:
                    # Set write state to False
                    maybe_setflags(arg.data._data, write=False)
                    # Data state is updated in finalise_reduction for Global
                    if arg.access is not op2.READ:
                        arg.data.state = DeviceDataMixin.DEVICE
        else:
            arglist.append(self._plan.ind_map.gpudata)
            arglist.append(self._plan.loc_map.gpudata)
            arglist.append(self._plan.ind_sizes.gpudata)
            arglist.append(self._plan.ind_offs.gpudata)
            arglist.append(None) # Block offset
            arglist.append(self._plan.blkmap.gpudata)
            arglist.append(self._plan.offset.gpudata)
            arglist.append(self._plan.nelems.gpudata)
            arglist.append(self._plan.nthrcol.gpudata)
            arglist.append(self._plan.thrcol.gpudata)
            arglist.append(None) # Number of colours in this block
            block_offset = 0
            for col in xrange(self._plan.ncolors):
                # At this point, before we can continue processing in
                # the MPI case, we'll need to wait for halo swaps to
                # complete, but at the moment we don't support that
                # use case, so we just pass through for now.
                if col == self._plan.ncolors_core:
                    pass

                blocks = self._plan.ncolblk[col]
                if blocks > 0:
                    arglist[-1] = np.int32(blocks)
                    arglist[-7] = np.int32(block_offset)
                    blocks = np.asscalar(blocks)
                    # Compute capability < 3 can handle at most 2**16 - 1
                    # blocks in any one dimension of the grid.
                    if blocks >= 2**16:
                        grid_size = (2**16 - 1, (blocks - 1) // (2**16 - 1) + 1, 1)
                    else:
                        grid_size = (blocks, 1, 1)

                    block_size = (128, 1, 1)
                    shared_size = np.asscalar(self._plan.nsharedCol[col])
                    # Global reductions require shared memory of at least block
                    # size * sizeof(double) for the reduction buffer
                    if any(arg._is_global_reduction for arg in self.args):
                        shared_size = max(128 * 8, shared_size)

                    _stream.synchronize()
                    fun(grid_size, block_size, _stream, *arglist,
                        shared_size=shared_size)

                # We've reached the end of elements that should
                # contribute to a reduction (this is only different
                # from the total number of elements in the MPI case).
                # So copy the reduction array back to the host now (so
                # that we don't double count halo elements).  We'll
                # finalise the reduction a little later.
                if col == self._plan.ncolors_owned - 1:
                    for arg in self.args:
                        if arg._is_global_reduction:
                            arg.data._finalise_reduction_begin(max_grid_size,
                                                               arg.access)
                block_offset += blocks
            for arg in self.args:
                if arg._is_global_reduction:
                    arg.data._finalise_reduction_end(max_grid_size,
                                                     arg.access)
                elif not arg._is_mat:
                    # Data state is updated in finalise_reduction for Global
                    if arg.access is not op2.READ:
                        arg.data.state = DeviceDataMixin.DEVICE
                else:
                    # Mat, assemble from lma->csr
                    arg.data._assemble(rowmap=arg.map[0], colmap=arg.map[1])
        if self._has_soa:
            op2stride.remove_from_namespace()
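
Note: the grid-size computation in the coloured loop above folds a 1-D block
count into a 2-D grid because devices of compute capability < 3 cap each grid
dimension at 2**16 - 1 blocks. A minimal, self-contained version of that
computation (grid_dims is an illustrative name):

    def grid_dims(blocks, limit=2 ** 16 - 1):
        # Fold `blocks` into an (x, y, 1) grid with x, y <= limit; y is
        # ceil(blocks / limit), so x * y >= blocks and the kernel is
        # presumably expected to ignore the excess blocks.
        if blocks > limit:
            return (limit, (blocks - 1) // limit + 1, 1)
        return (blocks, 1, 1)

    assert grid_dims(100) == (100, 1, 1)
    assert grid_dims(2 ** 16) == (2 ** 16 - 1, 2, 1)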
Example #4
    def compute(self):
        if self._has_soa:
            op2stride = Const(1, self._it_space.size, name='op2stride',
                              dtype='int32')

        conf = self.launch_configuration()

        if self._is_indirect:
            self._plan = Plan(self.kernel, self._it_space.iterset,
                              *self._unwound_args,
                              partition_size=conf['partition_size'],
                              matrix_coloring=self._requires_matrix_coloring)
            conf['local_memory_size'] = self._plan.nshared
            conf['ninds'] = self._plan.ninds
            conf['work_group_size'] = min(_max_work_group_size,
                                          conf['partition_size'])
            conf['work_group_count'] = self._plan.nblocks
        conf['warpsize'] = _warpsize

        fun = JITModule(self.kernel, self.it_space.extents, *self.args, parloop=self, conf=conf)

        args = []
        for arg in self._unique_args:
            arg.data._allocate_device()
            if arg.access is not device.WRITE:
                arg.data._to_device()

        for a in self._unique_dat_args:
            args.append(a.data.array.data)

        for a in self._all_global_non_reduction_args:
            args.append(a.data._array.data)

        for a in self._all_global_reduction_args:
            a.data._allocate_reduction_array(conf['work_group_count'])
            args.append(a.data._d_reduc_array.data)

        for cst in Const._definitions():
            args.append(cst._array.data)

        for m in self._unique_matrix:
            args.append(m._dev_array.data)
            m._upload_array()
            args.append(m._rowptr.data)
            args.append(m._colidx.data)

        for m in self._matrix_entry_maps:
            m._to_device()
            args.append(m._device_values.data)

        if self._is_direct:
            args.append(np.int32(self._it_space.size))
            fun(conf['thread_count'], conf['work_group_size'], *args)
        else:
            args.append(np.int32(self._it_space.size))
            args.append(self._plan.ind_map.data)
            args.append(self._plan.loc_map.data)
            args.append(self._plan.ind_sizes.data)
            args.append(self._plan.ind_offs.data)
            args.append(self._plan.blkmap.data)
            args.append(self._plan.offset.data)
            args.append(self._plan.nelems.data)
            args.append(self._plan.nthrcol.data)
            args.append(self._plan.thrcol.data)

            block_offset = 0
            # Placeholder for the per-colour block offset, patched via
            # args[-1] in the loop below.
            args.append(0)
            for i in range(self._plan.ncolors):
                blocks_per_grid = int(self._plan.ncolblk[i])
                threads_per_block = min(_max_work_group_size, conf['partition_size'])
                thread_count = threads_per_block * blocks_per_grid

                args[-1] = np.int32(block_offset)
                fun(int(thread_count), int(threads_per_block), *args)
                block_offset += blocks_per_grid

        # Mark data that was not accessed READ as dirty: the device now
        # holds the authoritative copy.
        for arg in self.args:
            if arg.access is not READ:
                arg.data.state = DeviceDataMixin.DEVICE
            if arg._is_dat:
                maybe_setflags(arg.data._data, write=False)

        for mat in [arg.data for arg in self._matrix_args]:
            mat.assemble()

        for a in self._all_global_reduction_args:
            a.data._post_kernel_reduction_task(conf['work_group_count'], a.access)

        if self._has_soa:
            op2stride.remove_from_namespace()
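
Note: the op2stride Const exists only for the duration of this call, and
forgetting remove_from_namespace would leave a stale constant visible to later
kernels. One way to make that create/remove pairing exception-safe is a
context manager; this is a suggestion, not what the code above does, and
stride_constant/make_const are illustrative names:

    from contextlib import contextmanager

    @contextmanager
    def stride_constant(make_const, has_soa):
        # Create the stride constant only for SoA data and guarantee its
        # removal even if the computation inside the `with` block raises.
        const = make_const() if has_soa else None
        try:
            yield const
        finally:
            if const is not None:
                const.remove_from_namespace()

    # Usage sketch inside compute():
    #   with stride_constant(lambda: Const(1, self._it_space.size,
    #                                      name='op2stride', dtype='int32'),
    #                        self._has_soa):
    #       ... set up arguments and launch kernels ...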
Example #5
    def _compute(self, part):
        arglist = [np.int32(part.size), np.int32(part.offset)]
        config = self.launch_configuration(part)
        fun = JITModule(self.kernel, self.it_space, *self.args, parloop=self, config=config)

        if self._is_direct:
            _args = self.args
            block_size = config['block_size']
            max_grid_size = config['grid_size']
            shared_size = config['required_smem']
        else:
            _args = self._unique_args
            maxbytes = sum([a.dtype.itemsize * a.data.cdim
                            for a in self._unwound_args if a._is_indirect])
            # shared memory as reported by the device, divided by some
            # factor.  This is the same calculation as done inside
            # op_plan_core, but without assuming 48K shared memory.
            # It would be much nicer if we could tell op_plan_core "I
            # have X bytes shared memory"
            part_size = (_AVAILABLE_SHARED_MEMORY // (64 * maxbytes)) * 64
            _plan = Plan(part,
                         *self._unwound_args,
                         partition_size=part_size)
            max_grid_size = _plan.ncolblk.max()

        for arg in _args:
            if arg._is_mat:
                d = arg.data._lmadata.gpudata
                offset = arg.data._lmaoffset(self._it_space.iterset)
                arglist.append(np.intp(d))
                arglist.append(np.int32(offset))
            else:
                arg.data._allocate_device()
                if arg.access is not op2.WRITE:
                    arg.data._to_device()
                karg = arg.data._device_data
                if arg._is_global_reduction:
                    arg.data._allocate_reduction_buffer(max_grid_size,
                                                        arg.access)
                    karg = arg.data._reduction_buffer
                arglist.append(np.intp(karg.gpudata))

        if self._is_direct:
            _stream.synchronize()
            fun(max_grid_size, block_size, _stream, *arglist,
                shared_size=shared_size)
        else:
            arglist.append(_plan.ind_map.gpudata)
            arglist.append(_plan.loc_map.gpudata)
            arglist.append(_plan.ind_sizes.gpudata)
            arglist.append(_plan.ind_offs.gpudata)
            arglist.append(None)  # Block offset
            arglist.append(_plan.blkmap.gpudata)
            arglist.append(_plan.offset.gpudata)
            arglist.append(_plan.nelems.gpudata)
            arglist.append(_plan.nthrcol.gpudata)
            arglist.append(_plan.thrcol.gpudata)
            arglist.append(None)  # Number of colours in this block
            block_offset = 0

            for col in xrange(_plan.ncolors):
                blocks = _plan.ncolblk[col]
                if blocks > 0:
                    arglist[-1] = np.int32(blocks)
                    arglist[-7] = np.int32(block_offset)
                    blocks = np.asscalar(blocks)
                    # Compute capability < 3 can handle at most 2**16 - 1
                    # blocks in any one dimension of the grid.
                    if blocks >= 2 ** 16:
                        grid_size = (2 ** 16 - 1, (blocks - 1) // (2 ** 16 - 1) + 1, 1)
                    else:
                        grid_size = (blocks, 1, 1)

                    block_size = (128, 1, 1)
                    shared_size = np.asscalar(_plan.nsharedCol[col])
                    # Global reductions require shared memory of at least block
                    # size * sizeof(double) for the reduction buffer
                    if any(arg._is_global_reduction for arg in self.args):
                        shared_size = max(128 * 8, shared_size)

                    _stream.synchronize()
                    fun(grid_size, block_size, _stream, *arglist,
                        shared_size=shared_size)

                block_offset += blocks

        _stream.synchronize()
        for arg in self.args:
            if arg._is_global_reduction:
                arg.data._finalise_reduction_begin(max_grid_size, arg.access)
                arg.data._finalise_reduction_end(max_grid_size, arg.access)
            elif not arg._is_mat:
                # Set write state to False
                maybe_setflags(arg.data._data, write=False)
                # Data state is updated in finalise_reduction for Global
                if arg.access is not op2.READ:
                    arg.data.state = DeviceDataMixin.DEVICE
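
Note: the partition-size calculation in the indirect branch divides the
device's shared memory by the per-element footprint of the indirect arguments
(maxbytes) and rounds the result down to a multiple of 64, the plan's
partitioning granularity. A small worked version of that formula
(partition_size is an illustrative name and the numbers are made up):

    def partition_size(shared_bytes, maxbytes, granularity=64):
        # Largest multiple of `granularity` elements whose staged
        # indirect data still fits in `shared_bytes` of shared memory.
        return (shared_bytes // (granularity * maxbytes)) * granularity

    # 48K of shared memory and 24 staged bytes per element give
    # partitions of 2048 elements (2048 * 24 = 49152 bytes exactly).
    assert partition_size(48 * 1024, 24) == 2048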