Beispiel #1
0
def get_if_positive_kernel(crit_dtype, dtype):
    """Build the elementwise "if_positive" selection kernel.

    For every index ``i`` the generated operation stores ``then_[i]`` in
    ``result[i]`` when ``crit[i] > 0`` and ``else_[i]`` otherwise.

    :arg crit_dtype: element type of the ``crit`` vector.
    :arg dtype: element type shared by ``then_``, ``else_`` and ``result``.
    :returns: whatever :func:`get_elwise_kernel` returns for this operation.
    """
    arguments = [VectorArg(crit_dtype, "crit")]
    # The two candidates and the output all share the same element type.
    arguments.extend(
        VectorArg(dtype, name) for name in ("then_", "else_", "result")
    )
    operation = "result[i] = crit[i] > 0 ? then_[i] : else_[i]"
    return get_elwise_kernel(arguments, operation, "if_positive")
Beispiel #2
0
def get_take_kernel(dtype, idx_dtype, vec_count=1):
    """Build and prepare the "take" kernel.

    The generated body reads ``src_idx = idx[i]`` and then, for each
    ``k`` in ``range(vec_count)``, assigns
    ``fp_tex1Dfetch(tex_src<k>, src_idx)`` to ``dest<k>[i]``.

    :arg dtype: element type of the ``dest<k>`` vectors; its texture
        variant (``with_fp_tex_hack=True``) is used to declare the
        ``tex_src<k>`` textures.
    :arg idx_dtype: element type of the ``idx`` vector.
    :arg vec_count: number of texture/destination pairs handled per call.
    :returns: a tuple ``(func, tex_src)`` of the prepared function and the
        list of texture references ``tex_src0 .. tex_src<vec_count-1>``.
    """
    c_types = {
        "idx_tp": dtype_to_ctype(idx_dtype),
        "tp": dtype_to_ctype(dtype),
        "tex_tp": dtype_to_ctype(dtype, with_fp_tex_hack=True),
    }
    vec_ids = range(vec_count)

    # Kernel signature: idx, dest0..destN-1, n.
    args = [VectorArg(idx_dtype, "idx")]
    args.extend(VectorArg(dtype, "dest" + str(k)) for k in vec_ids)
    args.append(ScalarArg(np.intp, "n"))

    # One texture declaration per source vector.
    tex_decls = [
        "texture <%s, 1, cudaReadModeElementType> tex_src%d;"
        % (c_types["tex_tp"], k)
        for k in vec_ids
    ]
    preamble = "#include <pycuda-helpers.hpp>\n\n" + "\n".join(tex_decls)

    # The index is loaded once and reused for every texture fetch.
    fetch_stmts = [
        "dest%d[i] = fp_tex1Dfetch(tex_src%d, src_idx);" % (k, k)
        for k in vec_ids
    ]
    body = ("%(idx_tp)s src_idx = idx[i];\n" % c_types) + "\n".join(fetch_stmts)

    mod = get_elwise_module(args, body, "take", preamble=preamble)
    func = mod.get_function("take")
    tex_src = [mod.get_texref("tex_src%d" % k) for k in vec_ids]

    # Argument format: one pointer for idx, one per dest vector, then n.
    arg_format = "P" + (vec_count * "P") + np.dtype(np.uintp).char
    func.prepare(arg_format, texrefs=tex_src)
    return func, tex_src
Beispiel #3
0
def get_take_put_kernel(dtype, idx_dtype, with_offsets, vec_count=1):
    """Build and prepare the combined "take_put" kernel.

    The generated body loads ``src_idx = gmem_src_idx[i]`` and
    ``dest_idx = gmem_dest_idx[i]`` and then, for each ``k`` in
    ``range(vec_count)``, assigns a texture fetch from ``tex_src<k>`` to
    ``dest<k>[dest_idx]``.  With *with_offsets* set, the fetch position is
    ``src_idx+offset<k>`` and the kernel takes one extra ``offset<k>``
    scalar per vector.

    :arg dtype: element type of the ``dest<k>`` vectors; its texture
        variant (``with_fp_tex_hack=True``) is used to declare the
        ``tex_src<k>`` textures.
    :arg idx_dtype: element type of both index vectors and of the
        ``offset<k>`` scalars.  Anything accepted by :class:`numpy.dtype`
        works for building the argument format string.
    :arg with_offsets: truthy to add the per-vector ``offset<k>`` arguments.
    :arg vec_count: number of texture/destination pairs handled per call.
    :returns: a tuple ``(func, tex_src)`` of the prepared function and the
        list of texture references ``tex_src0 .. tex_src<vec_count-1>``.
    """
    ctx = {
        "idx_tp": dtype_to_ctype(idx_dtype),
        "tp": dtype_to_ctype(dtype),
        "tex_tp": dtype_to_ctype(dtype, with_fp_tex_hack=True),
    }
    vec_ids = range(vec_count)

    # Kernel signature:
    #   gmem_dest_idx, gmem_src_idx, dest0..destN-1, [offset0..offsetN-1], n
    args = [
        VectorArg(idx_dtype, "gmem_dest_idx"),
        VectorArg(idx_dtype, "gmem_src_idx"),
    ]
    args.extend(VectorArg(dtype, "dest%d" % i) for i in vec_ids)
    if with_offsets:
        args.extend(ScalarArg(idx_dtype, "offset%d" % i) for i in vec_ids)
    args.append(ScalarArg(np.intp, "n"))

    preamble = "#include <pycuda-helpers.hpp>\n\n" + "\n".join(
        "texture <%s, 1, cudaReadModeElementType> tex_src%d;" % (ctx["tex_tp"], i)
        for i in vec_ids
    )

    # Choose the per-vector copy statement once instead of per iteration.
    if with_offsets:
        copy_insns = [
            "dest%d[dest_idx] = "
            "fp_tex1Dfetch(tex_src%d, src_idx+offset%d);" % (i, i, i)
            for i in vec_ids
        ]
    else:
        copy_insns = [
            "dest%d[dest_idx] = fp_tex1Dfetch(tex_src%d, src_idx);" % (i, i)
            for i in vec_ids
        ]

    body = (
        "%(idx_tp)s src_idx = gmem_src_idx[i];\n"
        "%(idx_tp)s dest_idx = gmem_dest_idx[i];\n" % ctx
    ) + "\n".join(copy_insns)

    mod = get_elwise_module(args, body, "take_put", preamble=preamble)
    func = mod.get_function("take_put")
    tex_src = [mod.get_texref("tex_src%d" % i) for i in vec_ids]

    # Normalize through np.dtype() so that scalar types (np.int32) and dtype
    # strings ("int32") work here too; a bare ``idx_dtype.char`` only works
    # when the caller passes an actual np.dtype instance.
    offsets_format = vec_count * np.dtype(idx_dtype).char if with_offsets else ""

    # Two index pointers, one pointer per dest vector, the optional offsets,
    # then n (packed with the np.uintp struct character).
    func.prepare(
        "PP" + (vec_count * "P") + offsets_format + np.dtype(np.uintp).char,
        texrefs=tex_src,
    )
    return func, tex_src
Beispiel #4
0
def get_linear_combination_kernel(summand_descriptors, dtype_z):
    """Build and prepare the "linear_combination" kernel.

    The generated operation is ``z[i] = a0*x0[i] + a1*x1[i] + ...`` with one
    term per entry of *summand_descriptors*.

    :arg summand_descriptors: iterable of
        ``(is_gpu_scalar, scalar_dtype, vector_dtype)`` tuples.  When
        ``is_gpu_scalar`` is truthy, the coefficient ``a<i>`` is read in the
        loop preparation code via ``fp_tex1Dfetch(tex_a<i>, 0)`` and only
        ``x<i>`` becomes a kernel argument; otherwise ``a<i>`` is passed as a
        scalar argument directly before ``x<i>``.
    :arg dtype_z: element type of the output vector ``z``.
    :returns: a tuple ``(func, tex_src)`` of the prepared function and the
        texture references of all texture-backed coefficients, in
        descriptor order.
    """
    from pycuda.tools import dtype_to_ctype
    from pycuda.elementwise import VectorArg, ScalarArg, get_elwise_module

    args = []
    tex_names = []
    preamble_lines = ["#include <pycuda-helpers.hpp>\n\n"]
    prep_stmts = []
    terms = []

    for idx, descriptor in enumerate(summand_descriptors):
        is_gpu_scalar, scalar_dtype, vector_dtype = descriptor

        if not is_gpu_scalar:
            # Host-side coefficient: plain scalar argument before its vector.
            args.append(ScalarArg(scalar_dtype, "a%d" % idx))
            args.append(VectorArg(vector_dtype, "x%d" % idx))
        else:
            # Texture-backed coefficient: declare the texture and fetch its
            # element 0 once in the loop preparation code.
            preamble_lines.append(
                "texture <%s, 1, cudaReadModeElementType> tex_a%d;"
                % (dtype_to_ctype(scalar_dtype, with_fp_tex_hack=True), idx)
            )
            args.append(VectorArg(vector_dtype, "x%d" % idx))
            tex_names.append("tex_a%d" % idx)
            prep_stmts.append(
                "%s a%d = fp_tex1Dfetch(tex_a%d, 0)"
                % (dtype_to_ctype(scalar_dtype), idx, idx)
            )

        terms.append("a%d*x%d[i]" % (idx, idx))

    # Output vector and element count close the argument list.
    args += [VectorArg(dtype_z, "z"), ScalarArg(np.uintp, "n")]

    operation = "z[i] = " + " + ".join(terms)
    mod = get_elwise_module(
        args,
        operation,
        "linear_combination",
        preamble="\n".join(preamble_lines),
        loop_prep=";\n".join(prep_stmts),
    )

    func = mod.get_function("linear_combination")
    tex_src = [mod.get_texref(name) for name in tex_names]

    # The argument format is derived from the argument objects themselves.
    arg_format = "".join(arg.struct_char for arg in args)
    func.prepare(arg_format, texrefs=tex_src)

    return func, tex_src
Beispiel #5
0
def get_put_kernel(dtype, idx_dtype, vec_count=1):
    """Build and prepare the "put" kernel.

    The generated body reads ``dest_idx = gmem_dest_idx[i]`` and then, for
    each ``k`` in ``range(vec_count)``, assigns ``src<k>[i]`` to
    ``dest<k>[dest_idx]``.

    :arg dtype: element type of the ``dest<k>`` and ``src<k>`` vectors.
    :arg idx_dtype: element type of the ``gmem_dest_idx`` vector.
    :arg vec_count: number of source/destination pairs handled per call.
    :returns: the prepared function.
    """
    c_types = {
        "idx_tp": dtype_to_ctype(idx_dtype),
        "tp": dtype_to_ctype(dtype),
    }
    vec_ids = range(vec_count)

    # Kernel signature: gmem_dest_idx, dest0..destN-1, src0..srcN-1, n.
    args = [VectorArg(idx_dtype, "gmem_dest_idx")]
    args.extend(VectorArg(dtype, "dest%d" % k) for k in vec_ids)
    args.extend(VectorArg(dtype, "src%d" % k) for k in vec_ids)
    args.append(ScalarArg(np.intp, "n"))

    store_stmts = ["dest%d[dest_idx] = src%d[i];" % (k, k) for k in vec_ids]
    body = ("%(idx_tp)s dest_idx = gmem_dest_idx[i];\n" % c_types) + "\n".join(
        store_stmts
    )

    mod = get_elwise_module(args, body, "put")
    func = mod.get_function("put")

    # One pointer for the index vector, two per source/destination pair,
    # then n.
    arg_format = "P" + (2 * vec_count * "P") + np.dtype(np.uintp).char
    func.prepare(arg_format)
    return func