Esempio n. 1
0
def get_take_kernel(dtype, idx_dtype, vec_count=1):
    """Build and prepare the ``take`` elementwise kernel.

    For every element ``i`` the generated code loads ``idx[i]`` into
    ``src_idx`` and then, for each ``k`` in ``range(vec_count)``, stores
    ``fp_tex1Dfetch(tex_src<k>, src_idx)`` into ``dest<k>[i]``.

    :arg dtype: element type of the ``dest<k>`` vectors and of the textures.
    :arg idx_dtype: element type of the ``idx`` vector.
    :arg vec_count: number of destination vector / source texture pairs.
    :returns: ``(func, tex_src)`` -- the prepared kernel function and the
        list of its ``tex_src<k>`` texture references.
    """
    c_types = {
        "idx_tp": dtype_to_ctype(idx_dtype),
        "tp": dtype_to_ctype(dtype),
        "tex_tp": dtype_to_ctype(dtype, with_fp_tex_hack=True),
    }

    # Signature: the index vector, one destination per vector, then the length.
    args = [VectorArg(idx_dtype, "idx")]
    args.extend(VectorArg(dtype, "dest"+str(k)) for k in range(vec_count))
    args.append(ScalarArg(np.intp, "n"))

    # One texture declaration per source vector.
    texture_decls = [
        "texture <%s, 1, cudaReadModeElementType> tex_src%d;"
        % (c_types["tex_tp"], k)
        for k in range(vec_count)
    ]
    preamble = "#include <pycuda-helpers.hpp>\n\n" + "\n".join(texture_decls)

    fetch_stmts = [
        "dest%d[i] = fp_tex1Dfetch(tex_src%d, src_idx);" % (k, k)
        for k in range(vec_count)
    ]
    body = ("%(idx_tp)s src_idx = idx[i];\n" % c_types) + "\n".join(fetch_stmts)

    mod = get_elwise_module(args, body, "take", preamble=preamble)
    func = mod.get_function("take")

    tex_src = []
    for k in range(vec_count):
        tex_src.append(mod.get_texref("tex_src%d" % k))

    # Format string: the idx pointer, vec_count dest pointers, then the length.
    arg_format = "P" + "P" * vec_count + np.dtype(np.uintp).char
    func.prepare(arg_format, texrefs=tex_src)
    return func, tex_src
Esempio n. 2
0
def get_lin_comb_kernel_no_tex(summand_descriptors, dtype_z):
    """Build and prepare a ``linear_combination`` kernel that uses no textures.

    The generated operation is ``z[i] = a0*x0[i] + a1*x1[i] + ...`` with one
    ``a<i>*x<i>[i]`` term per entry of *summand_descriptors*.

    :arg summand_descriptors: iterable of
        ``(is_gpu_scalar, scalar_dtype, vector_dtype)`` tuples.  When
        *is_gpu_scalar* is true the coefficient is passed as the pointer
        argument ``global_a<i>`` and copied into ``a<i>`` by the ``loop_prep``
        code; otherwise it is passed by value as the scalar argument ``a<i>``.
    :arg dtype_z: element type of the result vector ``z``.
    :returns: the prepared kernel function.
    """
    from pycuda.tools import dtype_to_ctype
    from pycuda.elementwise import \
            VectorArg, ScalarArg, get_elwise_module

    args = []
    loop_prep = []
    summands = []

    for i, (is_gpu_scalar, scalar_dtype, vector_dtype) in \
            enumerate(summand_descriptors):
        if is_gpu_scalar:
            # The coefficient is dereferenced into a scalar_dtype variable
            # below, so its pointer argument must be declared with
            # scalar_dtype as well.  Declaring it with vector_dtype would read
            # the value through the wrong pointer type whenever the two
            # dtypes differ.
            args.append(VectorArg(scalar_dtype, "global_a%d" % i))
            loop_prep.append("%s a%d = *global_a%d" %
                             (dtype_to_ctype(scalar_dtype), i, i))
        else:
            args.append(ScalarArg(scalar_dtype, "a%d" % i))

        # Both kinds of summand take their vector operand the same way.
        args.append(VectorArg(vector_dtype, "x%d" % i))
        summands.append("a%d*x%d[i]" % (i, i))

    args.append(VectorArg(dtype_z, "z"))
    args.append(ScalarArg(numpy.uintp, "n"))

    mod = get_elwise_module(args,
                            "z[i] = " + " + ".join(summands),
                            "linear_combination",
                            loop_prep=";\n".join(loop_prep))

    func = mod.get_function("linear_combination")
    # The prepare format is derived from the argument descriptors themselves.
    func.prepare("".join(arg.struct_char for arg in args))

    return func
Esempio n. 3
0
def get_take_kernel(dtype, idx_dtype, vec_count=1):
    """Create the prepared ``take`` kernel together with its texture references.

    The kernel body reads ``idx[i]`` into ``src_idx`` and, for each of the
    ``vec_count`` vectors, assigns ``fp_tex1Dfetch(tex_src<k>, src_idx)`` to
    ``dest<k>[i]``.

    Returns the tuple ``(func, tex_src)``: the prepared function and the list
    of texture references named ``tex_src0``, ``tex_src1``, ...
    """
    type_names = {
        "idx_tp": dtype_to_ctype(idx_dtype),
        "tp": dtype_to_ctype(dtype),
        "tex_tp": dtype_to_ctype(dtype, with_fp_tex_hack=True),
    }

    # Arguments in kernel order: idx, dest0..dest<vec_count-1>, n.
    args = [VectorArg(idx_dtype, "idx")]
    for vec_no in range(vec_count):
        args.append(VectorArg(dtype, "dest" + str(vec_no)))
    args.append(ScalarArg(np.intp, "n"))

    # Collect the texture declarations and the per-vector fetch statements.
    texture_lines = []
    for vec_no in range(vec_count):
        texture_lines.append(
            "texture <%s, 1, cudaReadModeElementType> tex_src%d;"
            % (type_names["tex_tp"], vec_no)
        )
    fetch_lines = []
    for vec_no in range(vec_count):
        fetch_lines.append(
            "dest%d[i] = fp_tex1Dfetch(tex_src%d, src_idx);" % (vec_no, vec_no)
        )

    preamble = "#include <pycuda-helpers.hpp>\n\n" + "\n".join(texture_lines)
    index_load = "%(idx_tp)s src_idx = idx[i];\n" % type_names
    body = index_load + "\n".join(fetch_lines)

    mod = get_elwise_module(args, body, "take", preamble=preamble)
    func = mod.get_function("take")
    tex_src = [
        mod.get_texref("tex_src%d" % vec_no) for vec_no in range(vec_count)
    ]

    size_char = np.dtype(np.uintp).char
    func.prepare("P" + (vec_count * "P") + size_char, texrefs=tex_src)
    return func, tex_src
Esempio n. 4
0
def get_lin_comb_kernel_no_tex(summand_descriptors,
        dtype_z):
    """Build and prepare the texture-free ``linear_combination`` kernel.

    The operation handed to ``get_elwise_module`` is
    ``z[i] = a0*x0[i] + a1*x1[i] + ...``, one term per summand descriptor.

    :arg summand_descriptors: iterable of
        ``(is_gpu_scalar, scalar_dtype, vector_dtype)`` tuples.  A true
        *is_gpu_scalar* means the coefficient arrives as the pointer argument
        ``global_a<i>`` and is copied into ``a<i>`` in the ``loop_prep``
        code; otherwise ``a<i>`` is a by-value scalar argument.
    :arg dtype_z: element type of the output vector ``z``.
    :returns: the prepared kernel function.
    """
    from pycuda.tools import dtype_to_ctype
    from pycuda.elementwise import \
            VectorArg, ScalarArg, get_elwise_module

    args = []
    loop_prep = []
    summands = []

    for i, (is_gpu_scalar, scalar_dtype, vector_dtype) in \
            enumerate(summand_descriptors):
        if is_gpu_scalar:
            # Declare the coefficient pointer with scalar_dtype: the generated
            # "<scalar ctype> a<i> = *global_a<i>" dereference below expects
            # that type.  Using vector_dtype here would misread the value when
            # scalar and vector dtypes differ.
            args.append(VectorArg(scalar_dtype, "global_a%d" % i))
            loop_prep.append("%s a%d = *global_a%d"
                    % (dtype_to_ctype(scalar_dtype), i, i))
        else:
            args.append(ScalarArg(scalar_dtype, "a%d" % i))

        # The vector operand is passed identically for both coefficient kinds.
        args.append(VectorArg(vector_dtype, "x%d" % i))
        summands.append("a%d*x%d[i]" % (i, i))

    args.append(VectorArg(dtype_z, "z"))
    args.append(ScalarArg(numpy.uintp, "n"))

    mod = get_elwise_module(args,
            "z[i] = " + " + ".join(summands),
            "linear_combination",
            loop_prep=";\n".join(loop_prep))

    func = mod.get_function("linear_combination")
    # Build the prepare format from each argument descriptor's struct_char.
    func.prepare("".join(arg.struct_char for arg in args))

    return func
Esempio n. 5
0
def get_take_put_kernel(dtype, idx_dtype, with_offsets, vec_count=1):
    """Build and prepare the ``take_put`` elementwise kernel.

    For every element ``i`` the generated code reads ``gmem_src_idx[i]`` and
    ``gmem_dest_idx[i]`` and, for each ``k`` in ``range(vec_count)``, stores
    a texture fetch from ``tex_src<k>`` into ``dest<k>[dest_idx]``.  With
    *with_offsets* true, the fetch position is ``src_idx+offset<k>`` and one
    ``offset<k>`` scalar argument per vector is added to the signature.

    :returns: ``(func, tex_src)`` -- the prepared kernel function and the
        list of ``tex_src<k>`` texture references.
    """
    c_types = {
        "idx_tp": dtype_to_ctype(idx_dtype),
        "tp": dtype_to_ctype(dtype),
        "tex_tp": dtype_to_ctype(dtype, with_fp_tex_hack=True),
    }

    # Signature: both index vectors, the destinations, optional offsets, n.
    args = [
        VectorArg(idx_dtype, "gmem_dest_idx"),
        VectorArg(idx_dtype, "gmem_src_idx"),
    ]
    for k in range(vec_count):
        args.append(VectorArg(dtype, "dest%d" % k))
    if with_offsets:
        for k in range(vec_count):
            args.append(ScalarArg(idx_dtype, "offset%d" % k))
    args.append(ScalarArg(np.intp, "n"))

    texture_decls = [
        "texture <%s, 1, cudaReadModeElementType> tex_src%d;"
        % (c_types["tex_tp"], k)
        for k in range(vec_count)
    ]
    preamble = "#include <pycuda-helpers.hpp>\n\n" + "\n".join(texture_decls)

    def copy_stmt(k):
        # Statement that copies one element for vector number k.
        if with_offsets:
            return (
                "dest%d[dest_idx] = "
                "fp_tex1Dfetch(tex_src%d, src_idx+offset%d);" % (k, k, k)
            )
        return "dest%d[dest_idx] = " "fp_tex1Dfetch(tex_src%d, src_idx);" % (k, k)

    index_loads = (
        "%(idx_tp)s src_idx = gmem_src_idx[i];\n"
        "%(idx_tp)s dest_idx = gmem_dest_idx[i];\n" % c_types
    )
    body = index_loads + "\n".join(copy_stmt(k) for k in range(vec_count))

    mod = get_elwise_module(args, body, "take_put", preamble=preamble)
    func = mod.get_function("take_put")
    tex_src = []
    for k in range(vec_count):
        tex_src.append(mod.get_texref("tex_src%d" % k))

    # Format string pieces, in the same order as the argument list above.
    pointer_chars = "PP" + (vec_count * "P")
    offset_chars = bool(with_offsets) * vec_count * idx_dtype.char
    size_char = np.dtype(np.uintp).char
    func.prepare(pointer_chars + offset_chars + size_char, texrefs=tex_src)
    return func, tex_src
Esempio n. 6
0
def get_elwise_kernel_and_types(arguments, operation,
        name="kernel", keep=False, options=None, **kwargs):
    """Compile an elementwise kernel and return it with its full argument list.

    :arg arguments: either a comma-separated string of C argument
        declarations (each piece is parsed with ``pycuda.tools.parse_c_arg``)
        or a sequence of argument descriptors.  A passed-in sequence is
        copied, so the caller's object is never modified.
    :arg operation: forwarded to ``get_elwise_module``.
    :arg name: kernel name, also used to look the function up in the module.
    :arg keep: forwarded to ``get_elwise_module``.
    :arg options: forwarded to ``get_elwise_module``; ``None`` (the default)
        means an empty list.
    :arg kwargs: forwarded to ``get_elwise_module``.
    :returns: ``(func, arguments)`` where *arguments* is a new list holding
        the given arguments followed by the implicit ``n`` size argument.
    """
    if isinstance(arguments, str):
        from pycuda.tools import parse_c_arg
        arguments = [parse_c_arg(arg) for arg in arguments.split(",")]
    else:
        # Work on a copy: appending "n" to the caller's list would make a
        # second call with the same list see a stray extra argument.
        arguments = list(arguments)

    # A fresh list per call avoids sharing one mutable default between calls.
    if options is None:
        options = []

    arguments.append(ScalarArg(numpy.uintp, "n"))

    mod = get_elwise_module(arguments, operation, name,
            keep, options, **kwargs)

    func = mod.get_function(name)
    func.prepare("".join(arg.struct_char for arg in arguments), (1,1,1))

    return func, arguments
Esempio n. 7
0
def get_linear_combination_kernel(summand_descriptors,
        dtype_z):
    """Placeholder for the texture-based ``linear_combination`` kernel builder.

    This function currently always raises :class:`NotImplementedError`.
    Everything after the ``raise`` is unreachable and is kept only as the
    reference implementation that still has to be ported.
    """
    # TODO: Port this!
    raise NotImplementedError

    from pycuda.elementwise import \
            VectorArg, ScalarArg, get_elwise_module
    from pycuda.tools import dtype_to_ctype

    args = []
    preamble_lines = [ "#include <pycuda-helpers.hpp>\n\n" ]
    loop_prep = []
    terms = []
    tex_names = []

    for pos, descriptor in enumerate(summand_descriptors):
        is_gpu_scalar, scalar_dtype, vector_dtype = descriptor

        if not is_gpu_scalar:
            # Coefficient is passed by value.
            args.append(ScalarArg(scalar_dtype, "a%d" % pos))
        else:
            # Coefficient is fetched from a one-element texture.
            preamble_lines.append(
                    "texture <%s, 1, cudaReadModeElementType> tex_a%d;"
                    % (dtype_to_ctype(scalar_dtype, with_fp_tex_hack=True),
                        pos))
            tex_names.append("tex_a%d" % pos)
            loop_prep.append(
                    "%s a%d = fp_tex1Dfetch(tex_a%d, 0)"
                    % (dtype_to_ctype(scalar_dtype), pos, pos))

        args.append(VectorArg(vector_dtype, "x%d" % pos))
        terms.append("a%d*x%d[i]" % (pos, pos))

    args.append(VectorArg(dtype_z, "z"))
    args.append(ScalarArg(np.uintp, "n"))

    operation = "z[i] = " + " + ".join(terms)
    mod = get_elwise_module(args,
            operation,
            "linear_combination",
            preamble="\n".join(preamble_lines),
            loop_prep=";\n".join(loop_prep))

    func = mod.get_function("linear_combination")
    tex_src = []
    for tex_name in tex_names:
        tex_src.append(mod.get_texref(tex_name))
    arg_format = "".join(arg.struct_char for arg in args)
    func.prepare(arg_format, (1,1,1), texrefs=tex_src)

    return func, tex_src
Esempio n. 8
0
def get_take_put_kernel(dtype, idx_dtype, with_offsets, vec_count=1):
    """Create the prepared ``take_put`` kernel and its texture references.

    Per element ``i`` the kernel body loads ``src_idx = gmem_src_idx[i]`` and
    ``dest_idx = gmem_dest_idx[i]``, then for each of the ``vec_count``
    vectors writes a fetch from ``tex_src<k>`` to ``dest<k>[dest_idx]``.
    If *with_offsets* is true the fetch uses ``src_idx+offset<k>`` and the
    kernel takes an additional ``offset<k>`` scalar per vector.

    Returns the tuple ``(func, tex_src)``.
    """
    type_names = {
            "idx_tp": dtype_to_ctype(idx_dtype),
            "tp": dtype_to_ctype(dtype),
            "tex_tp": dtype_to_ctype(dtype, with_fp_tex_hack=True),
            }

    # Assemble the signature piece by piece, in kernel argument order.
    index_args = [
            VectorArg(idx_dtype, "gmem_dest_idx"),
            VectorArg(idx_dtype, "gmem_src_idx"),
            ]
    dest_args = [VectorArg(dtype, "dest%d" % vec_no)
            for vec_no in range(vec_count)]
    offset_args = []
    if with_offsets:
        offset_args = [ScalarArg(idx_dtype, "offset%d" % vec_no)
                for vec_no in range(vec_count)]
    args = index_args + dest_args + offset_args + [ScalarArg(np.intp, "n")]

    texture_lines = []
    for vec_no in range(vec_count):
        texture_lines.append(
            "texture <%s, 1, cudaReadModeElementType> tex_src%d;"
            % (type_names["tex_tp"], vec_no))
    preamble = "#include <pycuda-helpers.hpp>\n\n" + "\n".join(texture_lines)

    def get_copy_insn(vec_no):
        # One copy statement for the given vector; the offset form adds
        # offset<vec_no> to the fetch position.
        if with_offsets:
            return ("dest%d[dest_idx] = "
                    "fp_tex1Dfetch(tex_src%d, src_idx+offset%d);"
                    % (vec_no, vec_no, vec_no))
        return ("dest%d[dest_idx] = "
                "fp_tex1Dfetch(tex_src%d, src_idx);" % (vec_no, vec_no))

    copy_lines = [get_copy_insn(vec_no) for vec_no in range(vec_count)]
    body = (("%(idx_tp)s src_idx = gmem_src_idx[i];\n"
                "%(idx_tp)s dest_idx = gmem_dest_idx[i];\n" % type_names)
            + "\n".join(copy_lines))

    mod = get_elwise_module(args, body, "take_put", preamble=preamble)
    func = mod.get_function("take_put")
    tex_src = [mod.get_texref("tex_src%d" % vec_no)
            for vec_no in range(vec_count)]

    # Two index pointers, the dest pointers, optional offsets, then the size.
    arg_format = "PP"+(vec_count*"P")
    arg_format += bool(with_offsets)*vec_count*idx_dtype.char
    arg_format += np.dtype(np.uintp).char
    func.prepare(arg_format, texrefs=tex_src)
    return func, tex_src
Esempio n. 9
0
def get_linear_combination_kernel(summand_descriptors, dtype_z):
    """Build and prepare the texture-based ``linear_combination`` kernel.

    The operation is ``z[i] = a0*x0[i] + a1*x1[i] + ...`` with one term per
    entry of *summand_descriptors*.

    :arg summand_descriptors: iterable of
        ``(is_gpu_scalar, scalar_dtype, vector_dtype)`` tuples.  When
        *is_gpu_scalar* is true, ``a<i>`` is set in the ``loop_prep`` code
        from ``fp_tex1Dfetch(tex_a<i>, 0)``; otherwise ``a<i>`` is a by-value
        scalar argument.
    :arg dtype_z: element type of the result vector ``z``.
    :returns: ``(func, tex_src)`` -- the prepared kernel function and the
        texture references for the ``tex_a<i>`` textures, in summand order.
    """
    from pycuda.tools import dtype_to_ctype
    from pycuda.elementwise import VectorArg, ScalarArg, get_elwise_module

    args = []
    preamble_lines = ["#include <pycuda-helpers.hpp>\n\n"]
    loop_prep = []
    terms = []
    tex_names = []

    for pos, descriptor in enumerate(summand_descriptors):
        is_gpu_scalar, scalar_dtype, vector_dtype = descriptor

        if not is_gpu_scalar:
            # Coefficient is an ordinary by-value kernel argument.
            args.append(ScalarArg(scalar_dtype, "a%d" % pos))
        else:
            # Coefficient is read from element 0 of its own texture.
            tex_ctype = dtype_to_ctype(scalar_dtype, with_fp_tex_hack=True)
            preamble_lines.append(
                "texture <%s, 1, cudaReadModeElementType> tex_a%d;"
                % (tex_ctype, pos)
            )
            tex_names.append("tex_a%d" % pos)
            loop_prep.append(
                "%s a%d = fp_tex1Dfetch(tex_a%d, 0)"
                % (dtype_to_ctype(scalar_dtype), pos, pos)
            )

        args.append(VectorArg(vector_dtype, "x%d" % pos))
        terms.append("a%d*x%d[i]" % (pos, pos))

    args.append(VectorArg(dtype_z, "z"))
    args.append(ScalarArg(np.uintp, "n"))

    operation = "z[i] = " + " + ".join(terms)
    mod = get_elwise_module(
        args,
        operation,
        "linear_combination",
        preamble="\n".join(preamble_lines),
        loop_prep=";\n".join(loop_prep),
    )

    func = mod.get_function("linear_combination")
    tex_src = []
    for tex_name in tex_names:
        tex_src.append(mod.get_texref(tex_name))

    arg_format = "".join(arg.struct_char for arg in args)
    func.prepare(arg_format, texrefs=tex_src)

    return func, tex_src
Esempio n. 10
0
def get_put_kernel(dtype, idx_dtype, vec_count=1):
    """Build and prepare the ``put`` elementwise kernel.

    For every element ``i`` the generated code loads ``gmem_dest_idx[i]``
    into ``dest_idx`` and, for each ``k`` in ``range(vec_count)``, executes
    ``dest<k>[dest_idx] = src<k>[i]``.

    :arg dtype: element type of the ``dest<k>`` and ``src<k>`` vectors.
    :arg idx_dtype: element type of the ``gmem_dest_idx`` vector.
    :arg vec_count: number of destination/source vector pairs.
    :returns: the prepared kernel function.
    """
    c_types = {
        "idx_tp": dtype_to_ctype(idx_dtype),
        "tp": dtype_to_ctype(dtype),
    }

    # Signature: index vector, all destinations, all sources, then the length.
    args = [VectorArg(idx_dtype, "gmem_dest_idx")]
    for k in range(vec_count):
        args.append(VectorArg(dtype, "dest%d" % k))
    for k in range(vec_count):
        args.append(VectorArg(dtype, "src%d" % k))
    args.append(ScalarArg(np.intp, "n"))

    store_stmts = [
        "dest%d[dest_idx] = src%d[i];" % (k, k) for k in range(vec_count)
    ]
    index_load = "%(idx_tp)s dest_idx = gmem_dest_idx[i];\n" % c_types
    body = index_load + "\n".join(store_stmts)

    mod = get_elwise_module(args, body, "put")
    func = mod.get_function("put")

    # One pointer for the index vector plus two per vector pair, then the size.
    pointer_chars = "P" + (2 * vec_count * "P")
    func.prepare(pointer_chars + np.dtype(np.uintp).char)
    return func
Esempio n. 11
0
def get_elwise_kernel_and_types(arguments,
                                operation,
                                name="kernel",
                                keep=False,
                                options=None,
                                **kwargs):
    """Compile an elementwise kernel; return it and its complete argument list.

    :arg arguments: a comma-separated string of C argument declarations
        (split on ``","`` and parsed with ``pycuda.tools.parse_c_arg``) or a
        sequence of argument descriptors.  The sequence is copied before the
        implicit size argument is added, leaving the caller's object intact.
    :arg operation: forwarded to ``get_elwise_module``.
    :arg name: kernel name; also the name used to fetch the function.
    :arg keep: forwarded to ``get_elwise_module``.
    :arg options: forwarded to ``get_elwise_module``; defaults to a new
        empty list when ``None``.
    :arg kwargs: forwarded to ``get_elwise_module``.
    :returns: ``(func, arguments)``; *arguments* is a new list made of the
        supplied arguments plus the trailing ``n`` scalar argument.
    """
    if isinstance(arguments, str):
        from pycuda.tools import parse_c_arg
        arguments = [parse_c_arg(arg) for arg in arguments.split(",")]
    else:
        # Copy first so that "n" is not appended to the caller's own list,
        # which would otherwise grow on every call.
        arguments = list(arguments)

    # Never share a single mutable default list across calls.
    if options is None:
        options = []

    arguments.append(ScalarArg(numpy.uintp, "n"))

    mod = get_elwise_module(arguments, operation, name, keep, options,
                            **kwargs)

    func = mod.get_function(name)
    func.prepare("".join(arg.struct_char for arg in arguments), (1, 1, 1))

    return func, arguments
Esempio n. 12
0
def get_put_kernel(dtype, idx_dtype, vec_count=1):
    """Create the prepared ``put`` kernel.

    The kernel body loads ``dest_idx = gmem_dest_idx[i]`` and then, for each
    of the ``vec_count`` vector pairs, assigns ``src<k>[i]`` to
    ``dest<k>[dest_idx]``.

    Returns the prepared kernel function.
    """
    type_names = {
            "idx_tp": dtype_to_ctype(idx_dtype),
            "tp": dtype_to_ctype(dtype),
            }

    # Kernel arguments in order: gmem_dest_idx, dest0.., src0.., n.
    index_args = [VectorArg(idx_dtype, "gmem_dest_idx")]
    dest_args = [VectorArg(dtype, "dest%d" % vec_no)
            for vec_no in range(vec_count)]
    src_args = [VectorArg(dtype, "src%d" % vec_no)
            for vec_no in range(vec_count)]
    args = index_args + dest_args + src_args + [ScalarArg(np.intp, "n")]

    store_lines = []
    for vec_no in range(vec_count):
        store_lines.append("dest%d[dest_idx] = src%d[i];" % (vec_no, vec_no))

    body = (
            "%(idx_tp)s dest_idx = gmem_dest_idx[i];\n" % type_names
            + "\n".join(store_lines))

    mod = get_elwise_module(args, body, "put")
    func = mod.get_function("put")

    # Pointer for the index vector, 2*vec_count data pointers, then the size.
    arg_format = "P"+(2*vec_count*"P")+np.dtype(np.uintp).char
    func.prepare(arg_format)
    return func