Example #1
    def _do_stage(self, method):
        from pycuda.gpuarray import splay
        import pycuda.driver as drv
        # Call the appropriate kernels for either the initialize or the stage computation.
        call_info = self.helper.calls[method]
        py_call_info = self.helper.py_calls['py_' + method]
        dtype = np.float64 if self._use_double else np.float32
        extra_args = [
            np.asarray(self.t, dtype=dtype),
            np.asarray(self.dt, dtype=dtype)
        ]
        # Call the py_{method} for each destination.
        for name, (py_meth, dest) in py_call_info.items():
            py_meth(dest, *extra_args)

        # Call the stage* method for each destination.
        for name, (call, args, dest) in call_info.items():
            n = dest.get_number_of_particles(real=True)

            gs, ls = splay(n)
            gs, ls = int(gs[0]), int(ls[0])

            num_blocks = (n + ls - 1) // ls
            num_tpb = ls

            # Skip the first three args (the queue and two placeholders);
            # only the actual kernel arguments are needed on CUDA.
            args = [x() for x in args[3:]]
            call(*(args + extra_args),
                 block=(num_tpb, 1, 1),
                 grid=(num_blocks, 1))
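Example #1 unpacks the (grid, block) pair returned by splay into explicit launch parameters. A minimal sketch of that pattern on its own, assuming PyCUDA is installed and a context is active (e.g. via pycuda.autoinit):

import pycuda.autoinit  # noqa: F401  (creates a CUDA context)
from pycuda.gpuarray import splay

n = 1_000_000
# splay(n) picks a 1-D launch configuration for n work items, e.g.
# ((block_count, 1), (threads_per_block, 1, 1)).
grid, block = splay(n)
threads_per_block = int(block[0])
# Cover all n items even when splay caps the block count, as Example #1 does.
num_blocks = (n + threads_per_block - 1) // threads_per_block
print(grid, block, num_blocks)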
Example #2
    def _get_workgroup_size(self, global_size):
        if self.backend == 'opencl':
            gs, ls = splay_cl(self.queue, global_size,
                              self._max_work_group_size)
        elif self.backend == 'cuda':
            from pycuda.gpuarray import splay
            gs, ls = splay(global_size)
        return gs, ls
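Example #2 dispatches on the backend: the CUDA branch uses pycuda.gpuarray.splay, while the OpenCL branch relies on a splay_cl helper that is presumably imported elsewhere in the module (pyopencl.array provides splay with the signature splay(queue, n, kernel_specific_max_wg_size=None)). A standalone sketch of the same dispatch, with the backend, queue and size cap passed in explicitly; the function name and parameters here are illustrative, not part of the original class:

def get_workgroup_size(backend, global_size, queue=None, max_wg_size=None):
    if backend == 'opencl':
        # OpenCL analogue of pycuda.gpuarray.splay.
        from pyopencl.array import splay as splay_cl
        gs, ls = splay_cl(queue, global_size, max_wg_size)
    elif backend == 'cuda':
        from pycuda.gpuarray import splay
        gs, ls = splay(global_size)
    else:
        raise ValueError("unsupported backend: %r" % backend)
    return gs, ls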
Example #3
    def __call__(self, *args, **kwargs):
        vectors = []

        range_ = kwargs.pop("range", None)
        slice_ = kwargs.pop("slice", None)
        stream = kwargs.pop("stream", None)

        if kwargs:
            raise TypeError(
                "invalid keyword arguments specified: "
                + ", ".join(kwargs.keys())
            )

        invocation_args = []
        mod, func, arguments = self.generate_stride_kernel_and_types(
            range_ is not None or slice_ is not None
        )

        for arg, arg_descr in zip(args, arguments):
            if isinstance(arg_descr, VectorArg):
                if not arg.flags.forc:
                    raise RuntimeError(
                        "elementwise kernel cannot " "deal with non-contiguous arrays"
                    )

                vectors.append(arg)
                invocation_args.append(arg.gpudata)
            else:
                invocation_args.append(arg)

        repr_vec = vectors[0]

        if slice_ is not None:
            if range_ is not None:
                raise TypeError(
                    "may not specify both range and slice " "keyword arguments"
                )

            range_ = slice(*slice_.indices(repr_vec.size))

        if range_ is not None:
            invocation_args.append(range_.start)
            invocation_args.append(range_.stop)
            if range_.step is None:
                invocation_args.append(1)
            else:
                invocation_args.append(range_.step)

            from pycuda.gpuarray import splay

            grid, block = splay(abs(range_.stop - range_.start) // range_.step)
        else:
            block = repr_vec._block
            grid = repr_vec._grid
            invocation_args.append(repr_vec.mem_size)

        func.prepared_async_call(grid, block, stream, *invocation_args)
Example #4
    def _call_kernel(self, info, extra_args):
        from pycuda.gpuarray import splay
        import pycuda.driver as drv
        nnps = self.nnps
        call = info.get('method')
        args = list(info.get('args'))
        dest = info['dest']
        n = dest.get_number_of_particles(info.get('real', True))
        # args is actually [queue, None, None, actual_meaningful_args]
        # we do not need the first 3 args on CUDA.
        args = [x() for x in args[3:]]

        # Argument for NP_MAX
        extra_args[-1][...] = n - 1

        gs, ls = splay(n)
        gs, ls = int(gs[0]), int(ls[0])
        num_blocks = (n + ls - 1) // ls

        #num_blocks = int((gs + ls - 1) / ls)
        num_tpb = ls

        if info.get('loop'):
            if self._use_local_memory:
                # FIXME: Fix local memory for CUDA
                nnps.set_context(info['src_idx'], info['dst_idx'])

                nnps_args, gs_ls = self.nnps.get_kernel_args('float')
                args[1] = gs_ls[0]
                args[2] = gs_ls[1]

                # No need for the guard variable for the local memory code.
                args = args + extra_args[:-1] + nnps_args

                call(*args)
            else:
                # find block sizes
                nnps.set_context(info['src_idx'], info['dst_idx'])
                cache = nnps.current_cache
                cache.get_neighbors_gpu()
                args = args + [
                    cache._nbr_lengths_gpu.dev,
                    cache._start_idx_gpu.dev,
                    cache._neighbors_gpu.dev
                ] + extra_args
                event = drv.Event()
                call(*args, block=(num_tpb, 1, 1), grid=(num_blocks, 1))
                event.record()
                event.synchronize()
        else:
            event = drv.Event()
            call(*(args + extra_args),
                 block=(num_tpb, 1, 1),
                 grid=(num_blocks, 1))
            event.record()
            event.synchronize()
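Example #4 records a pycuda.driver.Event right after each launch and synchronizes on it, so the host waits for the kernel to finish before continuing. The same event API can also time the launch; a minimal sketch, assuming an active context and an already-configured kernel launch in place of the comment:

import pycuda.autoinit  # noqa: F401
import pycuda.driver as drv

start, stop = drv.Event(), drv.Event()
start.record()
# ... launch kernels here, e.g. call(*args, block=(num_tpb, 1, 1), grid=(num_blocks, 1)) ...
stop.record()
stop.synchronize()  # block the host until the recorded work has finished
print("elapsed: %.3f ms" % stop.time_since(start))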
Example #5
    def __call__(self, *args, **kwargs):
        vectors = []

        range_ = kwargs.pop("range", None)
        slice_ = kwargs.pop("slice", None)
        stream = kwargs.pop("stream", None)

        if kwargs:
            raise TypeError("invalid keyword arguments specified: "
                    + ", ".join(kwargs.iterkeys()))

        invocation_args = []
        func, arguments = self.generate_stride_kernel_and_types(
                range_ is not None or slice_ is not None)

        for arg, arg_descr in zip(args, arguments):
            if isinstance(arg_descr, VectorArg):
                if not arg.flags.forc:
                    raise RuntimeError("elementwise kernel cannot "
                            "deal with non-contiguous arrays")

                vectors.append(arg)
                invocation_args.append(arg.gpudata)
            else:
                invocation_args.append(arg)

        repr_vec = vectors[0]

        if slice_ is not None:
            if range_ is not None:
                raise TypeError("may not specify both range and slice "
                        "keyword arguments")

            range_ = slice(*slice_.indices(repr_vec.size))

        if range_ is not None:
            invocation_args.append(range_.start)
            invocation_args.append(range_.stop)
            if range_.step is None:
                invocation_args.append(1)
            else:
                invocation_args.append(range_.step)

            from pycuda.gpuarray import splay
            grid, block = splay(abs(range_.stop - range_.start)//range_.step)
        else:
            block = repr_vec._block
            grid = repr_vec._grid
            invocation_args.append(repr_vec.mem_size)

        func.prepared_async_call(grid, block, stream, *invocation_args)
Example #6
    def __call__(self, *args, **kwargs):
        range_ = kwargs.pop("range", None)
        slice_ = kwargs.pop("slice", None)
        stream = kwargs.pop("stream", None)

        if kwargs:
            raise TypeError("invalid keyword arguments specified: "
                            + ", ".join(elementwise.six.iterkeys(kwargs)))

        mod, func, arguments, call_info = self.generate_stride_kernel_and_types(
            range_ is not None or slice_ is not None, args)

        vectors, invocation_args = call_info
        repr_vec = vectors[0]

        if slice_ is not None:
            if range_ is not None:
                raise TypeError("may not specify both range and slice "
                                "keyword arguments")

            range_ = slice(*slice_.indices(repr_vec.size))

        if range_ is not None:
            invocation_args.append(range_.start)
            invocation_args.append(range_.stop)
            if range_.step is None:
                invocation_args.append(1)
            else:
                invocation_args.append(range_.step)

            from pycuda.gpuarray import splay
            grid, block = splay(abs(range_.stop - range_.start) // range_.step)
        else:
            block = repr_vec._block
            grid = repr_vec._grid
            invocation_args.append(repr_vec.mem_size)

        func.prepared_async_call(grid, block, stream, *invocation_args)
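Examples #3, #5 and #6 are variants of ElementwiseKernel.__call__: when a range or slice keyword is supplied, the launch is sized with splay over just that sub-range instead of reusing the representative array's cached block and grid. A minimal usage sketch with pycuda's ElementwiseKernel, assuming an active context:

import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.gpuarray as gpuarray
from pycuda.elementwise import ElementwiseKernel

double_inplace = ElementwiseKernel(
    "float *x",
    "x[i] = 2.0f * x[i]",
    "double_inplace")

x = gpuarray.to_gpu(np.arange(16, dtype=np.float32))
double_inplace(x, slice=slice(4, 12))  # only elements 4..11 are doubled
print(x.get())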
Example #7
    def axnpby(self, y, *xn):
        if any(y.traits != x.traits for x in xn):
            raise ValueError('Incompatible matrix types')

        nv, cnt = len(xn), y.leaddim * y.nrow

        # Render the kernel template
        src = self.backend.lookup.get_template('axnpby').render(n=nv)

        # Build
        kern = self._build_kernel('axnpby', src,
                                  [np.int32] + [np.intp, y.dtype] * (1 + nv))

        # Compute a suitable block and grid
        grid, block = splay(cnt)

        class AxnpbyKernel(ComputeKernel):
            def run(self, queue, beta, *alphan):
                args = [i for axn in zip(xn, alphan) for i in axn]
                kern.prepared_async_call(grid, block, queue.cuda_stream_comp,
                                         cnt, y, beta, *args)

        return AxnpbyKernel()
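Example #7 passes the (grid, block) pair from splay straight to a prepared asynchronous kernel call; splay itself does not appear in the snippet and is presumably imported at module level in that backend. A minimal sketch of the same prepared-call pattern with a hand-written kernel, assuming an active context (the kernel and names are illustrative):

import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.gpuarray as gpuarray
from pycuda.compiler import SourceModule
from pycuda.gpuarray import splay

mod = SourceModule("""
__global__ void scale(float *x, float a, int n)
{
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    if (i < n)
        x[i] *= a;
}
""")
scale = mod.get_function("scale")
scale.prepare("Pfi")  # pointer, float, int

x = gpuarray.to_gpu(np.ones(100000, dtype=np.float32))
grid, block = splay(x.size)  # e.g. ((block_count, 1), (threads_per_block, 1, 1))
scale.prepared_call(grid, block, x.gpudata, 3.0, x.size)
print(x.get()[:5])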