def get_take_put_kernel(context, dtype, idx_dtype, with_offsets, vec_count=1):
    """Build the elementwise "take_put" kernel.

    The generated body loads ``gmem_src_idx[i]`` and ``gmem_dest_idx[i]`` and,
    for each of the *vec_count* array pairs, stores ``src<k>[src_idx]``
    (shifted by the scalar ``offset<k>`` when *with_offsets* is true) into
    ``dest<k>[dest_idx]``.

    Returns whatever ``get_elwise_kernel`` returns for that body.
    """
    ctx = {
        "idx_tp": dtype_to_ctype(idx_dtype),
        "tp": dtype_to_ctype(dtype),
    }
    vec_ids = range(vec_count)

    # Argument order: destinations, both index arrays, sources, then the
    # optional per-vector scalar offsets.
    args = [VectorArg(dtype, "dest%d" % k) for k in vec_ids]
    args.append(VectorArg(idx_dtype, "gmem_dest_idx", with_offset=True))
    args.append(VectorArg(idx_dtype, "gmem_src_idx", with_offset=True))
    args.extend(VectorArg(dtype, "src%d" % k, with_offset=True) for k in vec_ids)
    if with_offsets:
        args.extend(ScalarArg(idx_dtype, "offset%d" % k) for k in vec_ids)

    if with_offsets:
        copy_template = "dest%(k)d[dest_idx] = src%(k)d[src_idx+offset%(k)d];"
    else:
        copy_template = "dest%(k)d[dest_idx] = src%(k)d[src_idx];"

    index_loads = (
        "%(idx_tp)s src_idx = gmem_src_idx[i];\n"
        "%(idx_tp)s dest_idx = gmem_dest_idx[i];\n" % ctx
    )
    body = index_loads + "\n".join(copy_template % {"k": k} for k in vec_ids)

    return get_elwise_kernel(
        context,
        args,
        body,
        preamble=dtype_to_c_struct(context.devices[0], dtype),
        name="take_put",
    )
def get_axpbyz_kernel(context, dtype_x, dtype_y, dtype_z):
    """Build the "axpbyz" kernel computing ``z[i] = a*x[i] + b*y[i]``.

    When either input dtype is complex, the products and the sum are spelled
    with the ``<name>_mul`` / ``<name>_fromreal`` / ``<name>_cast`` /
    ``<name>_add`` helper names obtained from ``complex_dtype_to_name``.
    """
    x_cplx = dtype_x.kind == "c"
    y_cplx = dtype_y.kind == "c"

    if x_cplx:
        ax = "%s_mul(a, x[i])" % complex_dtype_to_name(dtype_x)
    else:
        ax = "a*x[i]"

    if y_cplx:
        by = "%s_mul(b, y[i])" % complex_dtype_to_name(dtype_y)
    else:
        by = "b*y[i]"

    # Exactly one complex operand: promote the real product to the complex
    # type of the other operand.
    if x_cplx and not y_cplx:
        by = "%s_fromreal(%s)" % (complex_dtype_to_name(dtype_x), by)
    elif y_cplx and not x_cplx:
        ax = "%s_fromreal(%s)" % (complex_dtype_to_name(dtype_y), ax)

    if not (x_cplx or y_cplx):
        result = "%s + %s" % (ax, by)
    else:
        result = "{root}_add({root}_cast({ax}), {root}_cast({by}))".format(
            ax=ax, by=by, root=complex_dtype_to_name(dtype_z)
        )

    tp_x = dtype_to_ctype(dtype_x)
    tp_y = dtype_to_ctype(dtype_y)
    tp_z = dtype_to_ctype(dtype_z)
    arguments = "%s *z, %s a, %s *x, %s b, %s *y" % (tp_z, tp_x, tp_x, tp_y, tp_y)

    return get_elwise_kernel(context, arguments, "z[i] = %s" % result, name="axpbyz")
def get_weighted_inner_kernel(dtype_x, dtype_y, dtype_w, dtype_out):
    """Build the "weighted_inner" reduction kernel.

    The map expression is ``x[i]*y[i]`` (with ``x`` conjugated when
    *dtype_x* is complex64/complex128) divided by ``w[i]``; the reduction is
    a plain sum starting from ``"0"``.
    """
    x_is_complex = (dtype_x == np.complex64) or (dtype_x == np.complex128)
    y_is_real = (dtype_y == np.float64) or (dtype_y == np.float32)
    w_is_real = (dtype_w == np.float64) or (dtype_w == np.float32)

    if not x_is_complex:
        product = "x[i]*y[i]"
    else:
        # A real y is promoted before the complex multiply.
        if y_is_real:
            y_operand = "%s_fromreal(y[i])" % complex_dtype_to_name(dtype_x)
        else:
            y_operand = "y[i]"
        product = "%s_mul(%s_conj(x[i]), %s)" % (
            complex_dtype_to_name(dtype_x),
            complex_dtype_to_name(dtype_x),
            y_operand,
        )

    if w_is_real:
        weighted = product + "/w[i]"
    else:
        weighted = "%s_divide(%s, %s)" % (
            complex_dtype_to_name(dtype_x), product, "w[i]")

    tp_x = dtype_to_ctype(dtype_x)
    tp_y = dtype_to_ctype(dtype_y)
    tp_w = dtype_to_ctype(dtype_w)
    argument_decl = (
        "__global const %s *x, __global const %s *y, __global const %s *w"
        % (tp_x, tp_y, tp_w)
    )

    return ReductionKernel(
        mgr.state.context,
        dtype_out,
        neutral="0",
        arguments=argument_decl,
        reduce_expr="a+b",
        map_expr=weighted,
        name="weighted_inner",
    )
def _get_dot_expr(dtype_out, dtype_a, dtype_b, conjugate_first,
        has_double_support, index_expr="i"):
    """Build the map expression for a dot-product reduction.

    :arg dtype_out: result dtype, or *None* to derive it from the inputs
        via ``get_common_dtype``.
    :arg dtype_a: dtype of operand ``a``. It is dereferenced
        unconditionally below, so passing *None* raises ``AttributeError``.
    :arg dtype_b: dtype of operand ``b``; when *None* it falls back to
        *dtype_a*, or to *dtype_out* if *dtype_a* is *None* as well.
    :arg conjugate_first: if true and ``a`` is complex, ``a`` is wrapped in
        ``<name>_conj(...)``.
    :arg has_double_support: forwarded to ``get_common_dtype``.
    :arg index_expr: text used as the subscript of ``a`` and ``b``.
    :returns: a tuple ``(map_expr, dtype_out, dtype_b)``.
    """
    if dtype_b is None:
        if dtype_a is None:
            dtype_b = dtype_out
        else:
            dtype_b = dtype_a

    if dtype_out is None:
        from pyopencl.compyte.array import get_common_dtype
        dtype_out = get_common_dtype(
                dtype_a.type(0), dtype_b.type(0),
                has_double_support)

    # Real-valued counterparts of each dtype, used to decide whether a
    # plain C cast is needed for real operands.
    a_real_dtype = dtype_a.type(0).real.dtype
    b_real_dtype = dtype_b.type(0).real.dtype
    out_real_dtype = dtype_out.type(0).real.dtype

    a_is_complex = dtype_a.kind == "c"
    b_is_complex = dtype_b.kind == "c"
    out_is_complex = dtype_out.kind == "c"

    from pyopencl.elementwise import complex_dtype_to_name

    if a_is_complex and b_is_complex:
        # Both operands complex: cast each to the output type if needed and
        # multiply with the complex helper.
        a = "a[%s]" % index_expr
        b = "b[%s]" % index_expr
        if dtype_a != dtype_out:
            a = "%s_cast(%s)" % (complex_dtype_to_name(dtype_out), a)
        if dtype_b != dtype_out:
            b = "%s_cast(%s)" % (complex_dtype_to_name(dtype_out), b)

        if conjugate_first and a_is_complex:
            a = "%s_conj(%s)" % (
                    complex_dtype_to_name(dtype_out), a)

        map_expr = "%s_mul(%s, %s)" % (
                complex_dtype_to_name(dtype_out), a, b)
    else:
        # At most one operand is complex: the product is written with the
        # plain ``*`` operator.
        a = "a[%s]" % index_expr
        b = "b[%s]" % index_expr

        if out_is_complex:
            if a_is_complex and dtype_a != dtype_out:
                a = "%s_cast(%s)" % (complex_dtype_to_name(dtype_out), a)
            if b_is_complex and dtype_b != dtype_out:
                b = "%s_cast(%s)" % (complex_dtype_to_name(dtype_out), b)

            # Real operands are cast to the real component type of the
            # output.
            if not a_is_complex and a_real_dtype != out_real_dtype:
                a = "(%s) (%s)" % (dtype_to_ctype(out_real_dtype), a)
            if not b_is_complex and b_real_dtype != out_real_dtype:
                b = "(%s) (%s)" % (dtype_to_ctype(out_real_dtype), b)

        if conjugate_first and a_is_complex:
            a = "%s_conj(%s)" % (
                    complex_dtype_to_name(dtype_out), a)

        map_expr = "%s*%s" % (a, b)

    return map_expr, dtype_out, dtype_b
def get_put_kernel(context, dtype, idx_dtype, vec_count=1):
    """Build the elementwise "put" kernel with fill/wrap-around support.

    For each array pair ``k`` the generated body stores into
    ``dest<k>[gmem_dest_idx[i]]`` either ``src<k>[0]`` (when ``use_fill[k]``
    is set) or ``src<k>[i % val_ary_lengths[k]]``.
    """
    ctx = {
        "idx_tp": dtype_to_ctype(idx_dtype),
        "tp": dtype_to_ctype(dtype),
    }
    vec_ids = range(vec_count)

    args = [VectorArg(dtype, "dest%d" % k, with_offset=True) for k in vec_ids]
    args.append(VectorArg(idx_dtype, "gmem_dest_idx", with_offset=True))
    args.extend(VectorArg(dtype, "src%d" % k, with_offset=True) for k in vec_ids)
    args.append(VectorArg(np.uint8, "use_fill", with_offset=True))
    args.append(VectorArg(np.int64, "val_ary_lengths", with_offset=True))

    store_template = (
        "dest{i}[dest_idx] = (use_fill[{i}] ? src{i}[0] : "
        "src{i}[i % val_ary_lengths[{i}]]);"
    )
    stores = [store_template.format(i=k) for k in vec_ids]
    body = "%(idx_tp)s dest_idx = gmem_dest_idx[i];\n" % ctx + "\n".join(stores)

    return get_elwise_kernel(
        context,
        args,
        body,
        preamble=dtype_to_c_struct(context.devices[0], dtype),
        name="put",
    )
def get_multiply_kernel(context, dtype_x, dtype_y, dtype_z):
    """Build the "multiply" kernel computing ``z[i] = x[i] * y[i]``.

    Complex operands are cast to *dtype_z* when their dtype differs, and the
    product uses ``<name>_mul`` / ``<name>_mulr`` / ``<name>_rmul``
    depending on which operands are complex.
    """
    x_cplx = dtype_x.kind == "c"
    y_cplx = dtype_y.kind == "c"

    def z_func(suffix, *operands):
        # Spell a call to the complex helper named after dtype_z.
        return "%s_%s(%s)" % (
            complex_dtype_to_name(dtype_z), suffix, ", ".join(operands))

    lhs = "x[i]"
    rhs = "y[i]"
    if x_cplx and dtype_x != dtype_z:
        lhs = z_func("cast", lhs)
    if y_cplx and dtype_y != dtype_z:
        rhs = z_func("cast", rhs)

    if x_cplx:
        product = z_func("mul" if y_cplx else "mulr", lhs, rhs)
    elif y_cplx:
        product = z_func("rmul", lhs, rhs)
    else:
        product = "%s * %s" % (lhs, rhs)

    tp_x = dtype_to_ctype(dtype_x)
    tp_y = dtype_to_ctype(dtype_y)
    tp_z = dtype_to_ctype(dtype_z)

    return get_elwise_kernel(
        context,
        "%s *z, %s *x, %s *y" % (tp_z, tp_x, tp_y),
        "z[i] = %s" % product,
        name="multiply",
    )
def get_divide_kernel(context, dtype_x, dtype_y, dtype_z):
    """Build the "divide" kernel computing ``z[i] = x[i] / y[i]``.

    Complex operands are cast to *dtype_z* when the result is complex and
    the input dtypes differ; the quotient uses ``<name>_divide`` /
    ``<name>_rdivide`` / ``<name>_divider`` depending on which operands are
    complex, and is finally cast to *dtype_z* when that is complex.
    """
    x_cplx = dtype_x.kind == "c"
    y_cplx = dtype_y.kind == "c"
    z_cplx = dtype_z.kind == "c"

    def z_func(suffix, *operands):
        # Spell a call to the complex helper named after dtype_z.
        return "%s_%s(%s)" % (
            complex_dtype_to_name(dtype_z), suffix, ", ".join(operands))

    numer = "x[i]"
    denom = "y[i]"
    if z_cplx and dtype_x != dtype_y:
        if x_cplx and dtype_x != dtype_z:
            numer = z_func("cast", numer)
        if y_cplx and dtype_y != dtype_z:
            denom = z_func("cast", denom)

    if x_cplx and y_cplx:
        quotient = z_func("divide", numer, denom)
    elif y_cplx:
        quotient = z_func("rdivide", numer, denom)
    elif x_cplx:
        quotient = z_func("divider", numer, denom)
    else:
        quotient = "%s / %s" % (numer, denom)

    if z_cplx:
        quotient = z_func("cast", quotient)

    tp_x = dtype_to_ctype(dtype_x)
    tp_y = dtype_to_ctype(dtype_y)
    tp_z = dtype_to_ctype(dtype_z)

    return get_elwise_kernel(
        context,
        "%s *z, %s *x, %s *y" % (tp_z, tp_x, tp_y),
        "z[i] = %s" % quotient,
        name="divide",
    )
def get_take_put_kernel(context, dtype, idx_dtype, with_offsets, vec_count=1):
    """Build the elementwise "take_put" kernel (no buffer offsets, no preamble).

    The generated body loads ``gmem_src_idx[i]`` and ``gmem_dest_idx[i]`` and
    copies ``src<k>[src_idx]`` (plus scalar ``offset<k>`` when *with_offsets*
    is true) into ``dest<k>[dest_idx]`` for each of the *vec_count* pairs.
    """
    ctx = {
        "idx_tp": dtype_to_ctype(idx_dtype),
        "tp": dtype_to_ctype(dtype),
    }
    vec_ids = range(vec_count)

    args = [VectorArg(dtype, "dest%d" % k) for k in vec_ids]
    args.append(VectorArg(idx_dtype, "gmem_dest_idx"))
    args.append(VectorArg(idx_dtype, "gmem_src_idx"))
    args.extend(VectorArg(dtype, "src%d" % k) for k in vec_ids)
    if with_offsets:
        args.extend(ScalarArg(idx_dtype, "offset%d" % k) for k in vec_ids)

    if with_offsets:
        copy_template = "dest%(k)d[dest_idx] = src%(k)d[src_idx+offset%(k)d];"
    else:
        copy_template = "dest%(k)d[dest_idx] = src%(k)d[src_idx];"

    index_loads = (
        "%(idx_tp)s src_idx = gmem_src_idx[i];\n"
        "%(idx_tp)s dest_idx = gmem_dest_idx[i];\n" % ctx
    )
    body = index_loads + "\n".join(copy_template % {"k": k} for k in vec_ids)

    return get_elwise_kernel(context, args, body, name="take_put")
def get_rdivide_elwise_kernel(context, dtype_x, dtype_y, dtype_z):
    """Build the "divide_r" kernel computing ``z[i] = y / x[i]``.

    Note the operand order: the scalar ``y`` is the numerator and the array
    ``x`` the denominator.
    """
    # implements y / x!
    x_cplx = dtype_x.kind == "c"
    y_cplx = dtype_y.kind == "c"
    z_cplx = dtype_z.kind == "c"

    def z_func(suffix, *operands):
        # Spell a call to the complex helper named after dtype_z.
        return "%s_%s(%s)" % (
            complex_dtype_to_name(dtype_z), suffix, ", ".join(operands))

    denom = "x[i]"
    numer = "y"
    if z_cplx and dtype_x != dtype_y:
        if x_cplx and dtype_x != dtype_z:
            denom = z_func("cast", denom)
        if y_cplx and dtype_y != dtype_z:
            numer = z_func("cast", numer)

    if x_cplx and y_cplx:
        quotient = z_func("divide", numer, denom)
    elif x_cplx:
        # real numerator, complex denominator
        quotient = z_func("rdivide", numer, denom)
    elif y_cplx:
        # complex numerator, real denominator
        quotient = z_func("divider", numer, denom)
    else:
        quotient = "%s / %s" % (numer, denom)

    tp_x = dtype_to_ctype(dtype_x)
    tp_y = dtype_to_ctype(dtype_y)
    tp_z = dtype_to_ctype(dtype_z)

    return get_elwise_kernel(
        context,
        "%s *z, %s *x, %s y" % (tp_z, tp_x, tp_y),
        "z[i] = %s" % quotient,
        name="divide_r",
    )
def get_axpbyz_kernel(context, dtype_x, dtype_y, dtype_z):
    """Build the "axpbyz" kernel computing ``z[i] = a*x[i] + b*y[i]``.

    Complex products use ``<name>_mul``; a real product paired with a
    complex one is promoted with ``<name>_fromreal``. The sum is written
    with ``+`` and wrapped in ``<name>_cast`` when *dtype_z* is complex.
    """
    x_cplx = dtype_x.kind == "c"
    y_cplx = dtype_y.kind == "c"
    z_cplx = dtype_z.kind == "c"

    if x_cplx:
        ax = "%s_mul(a, x[i])" % complex_dtype_to_name(dtype_x)
    else:
        ax = "a*x[i]"

    if y_cplx:
        by = "%s_mul(b, y[i])" % complex_dtype_to_name(dtype_y)
    else:
        by = "b*y[i]"

    # Exactly one complex operand: promote the real product.
    if x_cplx and not y_cplx:
        by = "%s_fromreal(%s)" % (complex_dtype_to_name(dtype_x), by)
    elif y_cplx and not x_cplx:
        ax = "%s_fromreal(%s)" % (complex_dtype_to_name(dtype_y), ax)

    total = "%s + %s" % (ax, by)
    if z_cplx:
        total = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), total)

    tp_x = dtype_to_ctype(dtype_x)
    tp_y = dtype_to_ctype(dtype_y)
    tp_z = dtype_to_ctype(dtype_z)
    arguments = "%s *z, %s a, %s *x, %s b, %s *y" % (tp_z, tp_x, tp_x, tp_y, tp_y)

    return get_elwise_kernel(context, arguments, "z[i] = %s" % total, name="axpbyz")
def get_copy_kernel(context, dtype_dest, dtype_src):
    """Build the "copy" kernel performing ``dest[i] = src[i]``."""
    tp_dest = dtype_to_ctype(dtype_dest)
    tp_src = dtype_to_ctype(dtype_src)
    arguments = "%s *dest, %s *src" % (tp_dest, tp_src)
    return get_elwise_kernel(context, arguments, "dest[i] = src[i]", name="copy")
def get_axpbz_kernel(context, dtype_a, dtype_x, dtype_b, dtype_z):
    """Build the "axpb" kernel computing ``z[i] = a*x[i] + b``.

    ``a`` and ``b`` are scalars, ``x`` and ``z`` are arrays. The expression
    text is assembled step by step; each step may wrap the previous text in
    a complex helper call named after *dtype_z*.
    """
    a_is_complex = dtype_a.kind == "c"
    x_is_complex = dtype_x.kind == "c"
    b_is_complex = dtype_b.kind == "c"

    z_is_complex = dtype_z.kind == "c"

    # Default: plain real product; replaced below if a or x is complex.
    ax = "a*x[i]"
    if x_is_complex:
        a = "a"
        x = "x[i]"

        if dtype_x != dtype_z:
            x = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), x)

        if a_is_complex:
            if dtype_a != dtype_z:
                a = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), a)

            ax = "%s_mul(%s, %s)" % (complex_dtype_to_name(dtype_z), a, x)
        else:
            # real scalar times complex array element
            ax = "%s_rmul(%s, %s)" % (complex_dtype_to_name(dtype_z), a, x)
    elif a_is_complex:
        # complex scalar times real array element
        a = "a"
        x = "x[i]"

        if dtype_a != dtype_z:
            a = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), a)
        ax = "%s_mulr(%s, %s)" % (complex_dtype_to_name(dtype_z), a, x)

    b = "b"
    # Promote real pieces when the result is complex.
    if z_is_complex and not b_is_complex:
        b = "%s_fromreal(%s)" % (complex_dtype_to_name(dtype_z), b)

    if z_is_complex and not (a_is_complex or x_is_complex):
        ax = "%s_fromreal(%s)" % (complex_dtype_to_name(dtype_z), ax)

    if z_is_complex:
        ax = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), ax)
        b = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), b)

    if a_is_complex or x_is_complex or b_is_complex:
        expr = "{root}_add({ax}, {b})".format(
                ax=ax,
                b=b,
                root=complex_dtype_to_name(dtype_z))
    else:
        expr = "%s + %s" % (ax, b)

    return get_elwise_kernel(context,
            "%(tp_z)s *z, %(tp_a)s a, %(tp_x)s *x,%(tp_b)s b" % {
                "tp_a": dtype_to_ctype(dtype_a),
                "tp_x": dtype_to_ctype(dtype_x),
                "tp_b": dtype_to_ctype(dtype_b),
                "tp_z": dtype_to_ctype(dtype_z),
                },
            "z[i] = " + expr,
            name="axpb")
def get_dot_kernel(ctx, dtype_out, dtype_a=None, dtype_b=None):
    """Build a sum-of-products ``ReductionKernel`` over arrays ``a`` and ``b``.

    :arg dtype_out: result dtype, or *None* to derive it from the inputs
        via ``get_common_dtype``.
    :arg dtype_a: dtype of ``a``. It is dereferenced unconditionally below,
        so leaving it at *None* raises ``AttributeError``.
    :arg dtype_b: dtype of ``b``; when *None* it falls back to *dtype_a*,
        or to *dtype_out* if *dtype_a* is *None* as well.
    """
    if dtype_b is None:
        if dtype_a is None:
            dtype_b = dtype_out
        else:
            dtype_b = dtype_a

    if dtype_out is None:
        from pyopencl.compyte.array import get_common_dtype
        from pyopencl.characterize import has_double_support
        dtype_out = get_common_dtype(
                dtype_a.type(0), dtype_b.type(0),
                has_double_support(ctx.devices[0]))

    # Real-valued counterparts, used to decide on plain C casts below.
    a_real_dtype = dtype_a.type(0).real.dtype
    b_real_dtype = dtype_b.type(0).real.dtype
    out_real_dtype = dtype_out.type(0).real.dtype

    a_is_complex = dtype_a.kind == "c"
    b_is_complex = dtype_b.kind == "c"
    out_is_complex = dtype_out.kind == "c"

    from pyopencl.elementwise import complex_dtype_to_name

    if a_is_complex and b_is_complex:
        # Both complex: cast to the output type if needed, then use the
        # complex multiply helper.
        a = "a[i]"
        b = "b[i]"
        if dtype_a != dtype_out:
            a = "%s_cast(%s)" % (complex_dtype_to_name(dtype_out), a)
        if dtype_b != dtype_out:
            b = "%s_cast(%s)" % (complex_dtype_to_name(dtype_out), b)

        map_expr = "%s_mul(%s, %s)" % (
                complex_dtype_to_name(dtype_out), a, b)
    else:
        # At most one complex operand: plain ``*`` product.
        a = "a[i]"
        b = "b[i]"

        if out_is_complex:
            if a_is_complex and dtype_a != dtype_out:
                a = "%s_cast(%s)" % (complex_dtype_to_name(dtype_out), a)
            if b_is_complex and dtype_b != dtype_out:
                b = "%s_cast(%s)" % (complex_dtype_to_name(dtype_out), b)

            if not a_is_complex and a_real_dtype != out_real_dtype:
                a = "(%s) (%s)" % (dtype_to_ctype(out_real_dtype), a)
            if not b_is_complex and b_real_dtype != out_real_dtype:
                b = "(%s) (%s)" % (dtype_to_ctype(out_real_dtype), b)

        map_expr = "%s*%s" % (a, b)

    return ReductionKernel(ctx, dtype_out, neutral="0",
            reduce_expr="a+b", map_expr=map_expr,
            arguments=
            "__global const %(tp_a)s *a, "
            "__global const %(tp_b)s *b" % {
                "tp_a": dtype_to_ctype(dtype_a),
                "tp_b": dtype_to_ctype(dtype_b),
                })
def get_multiply_kernel(context, dtype_x, dtype_y, dtype_z):
    """Build the "multiply" kernel performing ``z[i] = x[i] * y[i]``."""
    tp_x = dtype_to_ctype(dtype_x)
    tp_y = dtype_to_ctype(dtype_y)
    tp_z = dtype_to_ctype(dtype_z)
    arguments = "%s *z, %s *x, %s *y" % (tp_z, tp_x, tp_y)
    return get_elwise_kernel(
        context, arguments, "z[i] = x[i] * y[i]", name="multiply")
def get_divide_kernel(context, dtype_x, dtype_y, dtype_z):
    """Build the "divide" kernel performing ``z[i] = x[i] / y[i]``."""
    tp_x = dtype_to_ctype(dtype_x)
    tp_y = dtype_to_ctype(dtype_y)
    tp_z = dtype_to_ctype(dtype_z)
    arguments = "%s *z, %s *x, %s *y" % (tp_z, tp_x, tp_y)
    return get_elwise_kernel(
        context, arguments, "z[i] = x[i] / y[i]", name="divide")
def get_axpbyz_kernel(context, dtype_x, dtype_y, dtype_z):
    """Build the "axpbyz" kernel performing ``z[i] = a*x[i] + b*y[i]``.

    The scalars ``a`` and ``b`` are declared with the C types of ``x`` and
    ``y`` respectively.
    """
    tp_x = dtype_to_ctype(dtype_x)
    tp_y = dtype_to_ctype(dtype_y)
    tp_z = dtype_to_ctype(dtype_z)
    arguments = "%s *z, %s a, %s *x, %s b, %s *y" % (tp_z, tp_x, tp_x, tp_y, tp_y)
    return get_elwise_kernel(
        context, arguments, "z[i] = a*x[i] + b*y[i]", name="axpbyz")
def get_pow_array_kernel(context, dtype_x, dtype_y, dtype_z):
    """Build the "pow_method" kernel performing ``z[i] = pow(x[i], y[i])``."""
    tp_x = dtype_to_ctype(dtype_x)
    tp_y = dtype_to_ctype(dtype_y)
    tp_z = dtype_to_ctype(dtype_z)
    arguments = "%s *z, %s *x, %s *y" % (tp_z, tp_x, tp_y)
    return get_elwise_kernel(
        context, arguments, "z[i] = pow(x[i], y[i])", name="pow_method")
def get_unary_func_kernel(context, func_name, in_dtype, out_dtype=None):
    """Build a kernel applying *func_name* elementwise: ``z[i] = f(y[i])``.

    *out_dtype* defaults to *in_dtype*. The kernel is named
    ``"<func_name>_kernel"``.
    """
    result_dtype = in_dtype if out_dtype is None else out_dtype

    tp_in = dtype_to_ctype(in_dtype)
    tp_out = dtype_to_ctype(result_dtype)
    arguments = "%s *z, %s *y" % (tp_out, tp_in)
    operation = "z[i] = %s(y[i])" % func_name
    kernel_name = "%s_kernel" % func_name

    return get_elwise_kernel(context, arguments, operation, name=kernel_name)
def get_copy_kernel(context, dtype_dest, dtype_src):
    """Build the "copy" kernel performing ``dest[i] = src[i]``.

    When the destination is complex and the source is not, the source
    element is wrapped in ``<name>_fromreal(...)``.
    """
    dest_is_complex = dtype_dest.kind == "c"
    src_is_complex = dtype_src.kind == "c"

    if dest_is_complex and not src_is_complex:
        source_expr = "%s_fromreal(%s)" % (
            complex_dtype_to_name(dtype_dest), "src[i]")
    else:
        source_expr = "src[i]"

    tp_dest = dtype_to_ctype(dtype_dest)
    tp_src = dtype_to_ctype(dtype_src)

    return get_elwise_kernel(
        context,
        "%s *dest, %s *src" % (tp_dest, tp_src),
        "dest[i] = %s" % source_expr,
        name="copy",
    )
def get_norm_kernel(dtype_x, dtype_out):
    """Build the "normsq" elementwise kernel (squared magnitude of ``x``).

    For float32/float64 input the operation is ``x[i] * x[i]``; for
    complex64/complex128 it is the sum of squares of the ``.x`` and ``.y``
    components.

    :raises TypeError: if *dtype_x* is none of the four supported dtypes
        (previously this surfaced as an ``UnboundLocalError`` on ``op``).
    """
    if dtype_x == np.float32 or dtype_x == np.float64:
        op = "z[i] = x[i] * x[i]"
    elif dtype_x == np.complex64 or dtype_x == np.complex128:
        op = "z[i] = x[i].x*x[i].x + x[i].y*x[i].y"
    else:
        # Fail early and explicitly, before touching the context.
        raise TypeError("unsupported dtype for norm kernel: %s" % (dtype_x,))

    return ElementwiseKernel(mgr.state.context,
            "%(tp_x)s *x, %(tp_z)s *z" % {
                "tp_x": dtype_to_ctype(dtype_x),
                "tp_z": dtype_to_ctype(dtype_out),
                },
            op,
            "normsq")
def get_take_kernel(context, dtype, idx_dtype, vec_count=1):
    """Build the elementwise "take" kernel.

    The generated body reads ``idx[i]`` and copies ``src<k>[src_idx]`` into
    ``dest<k>[i]`` for each of the *vec_count* array pairs.
    """
    ctx = {
        "idx_tp": dtype_to_ctype(idx_dtype),
        "tp": dtype_to_ctype(dtype),
    }
    vec_ids = range(vec_count)

    args = [VectorArg(dtype, "dest" + str(k), with_offset=True) for k in vec_ids]
    args.extend(VectorArg(dtype, "src" + str(k), with_offset=True) for k in vec_ids)
    args.append(VectorArg(idx_dtype, "idx", with_offset=True))

    gathers = ["dest%d[i] = src%d[src_idx];" % (k, k) for k in vec_ids]
    body = ("%(idx_tp)s src_idx = idx[i];\n" % ctx) + "\n".join(gathers)

    return get_elwise_kernel(
        context,
        args,
        body,
        preamble=dtype_to_c_struct(context.devices[0], dtype),
        name="take",
    )
def get_correlate_kernel(dtype_x, dtype_y,dtype_out):
    """Build the "correlate" elementwise kernel: ``z[i] = conj(x[i]) * y[i]``.

    Only complex64 and complex128 *dtype_x* are supported; the matching
    ``cfloat_*`` / ``cdouble_*`` helper names are used in the operation.

    :raises TypeError: if *dtype_x* is neither complex64 nor complex128
        (previously this surfaced as an ``UnboundLocalError`` on ``op``).
    """
    if dtype_x == numpy.complex64:
        op = "z[i] = cfloat_mul(cfloat_conj(x[i]), y[i])"
    elif dtype_x == numpy.complex128:
        op = "z[i] = cdouble_mul(cdouble_conj(x[i]), y[i])"
    else:
        # Fail early and explicitly, before touching the context.
        raise TypeError(
            "unsupported dtype for correlate kernel: %s" % (dtype_x,))

    return ElementwiseKernel(mgr.state.context,
            "%(tp_x)s *x, %(tp_y)s *y, %(tp_z)s *z" % {
                "tp_x": dtype_to_ctype(dtype_x),
                "tp_y": dtype_to_ctype(dtype_y),
                "tp_z": dtype_to_ctype(dtype_out),
                },
            op,
            "correlate")
def get_put_kernel(context, dtype, idx_dtype, vec_count=1):
    """Build the elementwise "put" kernel.

    The generated body reads ``gmem_dest_idx[i]`` and stores ``src<k>[i]``
    into ``dest<k>[dest_idx]`` for each of the *vec_count* array pairs.
    """
    ctx = {
        "idx_tp": dtype_to_ctype(idx_dtype),
        "tp": dtype_to_ctype(dtype),
    }
    vec_ids = range(vec_count)

    args = [VectorArg(dtype, "dest%d" % k, with_offset=True) for k in vec_ids]
    args.append(VectorArg(idx_dtype, "gmem_dest_idx", with_offset=True))
    args.extend(VectorArg(dtype, "src%d" % k, with_offset=True) for k in vec_ids)

    scatters = ["dest%d[dest_idx] = src%d[i];" % (k, k) for k in vec_ids]
    body = "%(idx_tp)s dest_idx = gmem_dest_idx[i];\n" % ctx + "\n".join(scatters)

    return get_elwise_kernel(
        context,
        args,
        body,
        preamble=dtype_to_c_struct(context.devices[0], dtype),
        name="put",
    )
def get_dot_kernel(ctx, dtype_out, dtype_a=None, dtype_b=None,
        conjugate_first=False):
    """Build a sum-of-products ``ReductionKernel`` over arrays ``a`` and ``b``.

    The map expression and the resolved output / ``b`` dtypes come from
    ``_get_dot_expr``; double support is queried on ``ctx.devices[0]``.
    """
    from pyopencl.characterize import has_double_support

    device_has_double = has_double_support(ctx.devices[0])
    map_expr, dtype_out, dtype_b = _get_dot_expr(
        dtype_out, dtype_a, dtype_b, conjugate_first,
        has_double_support=device_has_double)

    tp_a = dtype_to_ctype(dtype_a)
    tp_b = dtype_to_ctype(dtype_b)
    argument_decl = "const %s *a, const %s *b" % (tp_a, tp_b)

    return ReductionKernel(
        ctx,
        dtype_out,
        neutral="0",
        reduce_expr="a+b",
        map_expr=map_expr,
        arguments=argument_decl,
    )
def get_pow_kernel(context, dtype_x, dtype_y, dtype_z,
        is_base_array, is_exp_array):
    """Build the "pow_method" kernel computing ``z[i] = x ** y``.

    :arg is_base_array: if true the base is the array element ``x[i]``,
        otherwise the scalar ``x``.
    :arg is_exp_array: if true the exponent is the array element ``y[i]``,
        otherwise the scalar ``y``.
    """
    # Pick both the expression text and the matching argument declaration
    # template for base and exponent.
    if is_base_array:
        x = "x[i]"
        x_ctype = "%(tp_x)s *x"
    else:
        x = "x"
        x_ctype = "%(tp_x)s x"

    if is_exp_array:
        y = "y[i]"
        y_ctype = "%(tp_y)s *y"
    else:
        y = "y"
        y_ctype = "%(tp_y)s y"

    x_is_complex = dtype_x.kind == "c"
    y_is_complex = dtype_y.kind == "c"
    z_is_complex = dtype_z.kind == "c"

    if z_is_complex and dtype_x != dtype_y:
        # Complex result with mixed input dtypes: cast complex operands to
        # the result type.
        if x_is_complex and dtype_x != dtype_z:
            x = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), x)
        if y_is_complex and dtype_y != dtype_z:
            y = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), y)
    elif dtype_x != dtype_y:
        # Real result with mixed input dtypes: plain C casts.
        if dtype_x != dtype_z:
            x = "(%s) (%s)" % (dtype_to_ctype(dtype_z), x)
        if dtype_y != dtype_z:
            y = "(%s) (%s)" % (dtype_to_ctype(dtype_z), y)

    if x_is_complex and y_is_complex:
        result = "%s_pow(%s, %s)" % (complex_dtype_to_name(dtype_z), x, y)
    elif x_is_complex and not y_is_complex:
        result = "%s_powr(%s, %s)" % (complex_dtype_to_name(dtype_z), x, y)
    elif not x_is_complex and y_is_complex:
        result = "%s_rpow(%s, %s)" % (complex_dtype_to_name(dtype_z), x, y)
    else:
        result = "pow(%s, %s)" % (x, y)

    return get_elwise_kernel(context,
            ("%(tp_z)s *z, " + x_ctype + ", "+y_ctype) % {
                "tp_x": dtype_to_ctype(dtype_x),
                "tp_y": dtype_to_ctype(dtype_y),
                "tp_z": dtype_to_ctype(dtype_z),
                },
            "z[i] = %s" % result,
            name="pow_method")
def _fill_array_with_index_knl(self, context, idx_dtype, array_dtype):
    """Build the "fill_array_with_index" kernel: ``array[index[i]] = val``."""
    argument_template = Template(r""" ${idx_t} *index, ${array_t} *array, ${array_t} val """)
    rendered_arguments = argument_template.render(
        idx_t=dtype_to_ctype(idx_dtype),
        array_t=dtype_to_ctype(array_dtype),
    )

    # The operation has no placeholders; it is still passed through the
    # template engine.
    rendered_operation = Template(r""" array[index[i]] = val; """).render()

    return ElementwiseKernel(
        context,
        rendered_arguments,
        rendered_operation,
        name="fill_array_with_index",
    )
def get_fill_kernel(context, dtype):
    """Build the "fill" kernel performing ``z[i] = a``.

    The struct declaration for *dtype* (as produced by
    ``dtype_to_c_struct`` for ``context.devices[0]``) is passed as preamble.
    """
    element_ctype = dtype_to_ctype(dtype)
    arguments = "{tp} *z, {tp} a".format(tp=element_ctype)
    struct_preamble = dtype_to_c_struct(context.devices[0], dtype)

    return get_elwise_kernel(
        context,
        arguments,
        "z[i] = a",
        preamble=struct_preamble,
        name="fill",
    )
def get_sum_kernel(ctx, dtype_out, dtype_in):
    """Build a ``ReductionKernel`` that sums the array ``in``.

    *dtype_out* defaults to *dtype_in* when given as *None*.
    """
    result_dtype = dtype_in if dtype_out is None else dtype_out
    argument_decl = "const %s *in" % dtype_to_ctype(dtype_in)
    return ReductionKernel(ctx, result_dtype, "0", "a+b", arguments=argument_decl)
def get_reverse_kernel(context, dtype):
    """Build the "reverse" kernel performing ``z[i] = y[n-1-i]``."""
    element_ctype = dtype_to_ctype(dtype)
    arguments = "%s *z, %s *y" % (element_ctype, element_ctype)
    return get_elwise_kernel(context, arguments, "z[i] = y[n-1-i]", name="reverse")
def get_compress_kernel(self, index_dtype):
    """Build the ``GenericScanKernel`` used to compress list counts.

    The scan input is 1 for every non-zero ``count[i]``; the output
    statement records the running item in ``compressed_indices``, stores the
    position and count of each non-empty entry, and writes the final total
    to ``*num_non_empty_list``.
    """
    from sys import version_info
    from pyopencl.scan import GenericScanKernel

    argument_source = (
        " __global ${index_t} *count,"
        " __global ${index_t} *compressed_counts,"
        " __global ${index_t} *nonempty_indices,"
        " __global ${index_t} *compressed_indices,"
        " __global ${index_t} *num_non_empty_list "
    )

    # Only pre-3 interpreters need the template's unicode handling disabled.
    template_options = {}
    if not version_info > (3, 0):
        template_options["disable_unicode"] = True
    argument_template = Template(argument_source, **template_options)

    output_statement = (
        " if (i + 1 < N) compressed_indices[i + 1] = item;"
        " if (prev_item != item) {"
        " nonempty_indices[item - 1] = i;"
        " compressed_counts[item - 1] = count[i];"
        " }"
        " if (i + 1 == N) *num_non_empty_list = item; "
    )

    return GenericScanKernel(
        self.context,
        index_dtype,
        arguments=argument_template.render(index_t=dtype_to_ctype(index_dtype)),
        input_expr="count[i] == 0 ? 0 : 1",
        scan_expr="a+b",
        neutral="0",
        output_statement=output_statement,
        devices=self.devices,
    )
def get_arange_kernel(context, dtype):
    """Build the "arange" kernel performing ``z[i] = start + i*step``."""
    element_ctype = dtype_to_ctype(dtype)
    arguments = "%s *z, %s start, %s step" % (
        element_ctype, element_ctype, element_ctype)
    return get_elwise_kernel(
        context, arguments, "z[i] = start + i*step", name="arange")
def python_dtype_str(dtype):
    """Return Python source text that evaluates to *dtype*.

    Builtin numpy dtypes become ``_lpy_np.<name>``; anything else becomes a
    ``_lpy_cl_tools.get_or_register_dtype("<ctype>")`` call.

    The boolean dtype is spelled ``bool_`` rather than its ``dtype.name``
    (``bool``), because ``numpy.bool`` is not available in every NumPy
    release while ``numpy.bool_`` is.
    """
    if dtype.isbuiltin:
        name = dtype.name
        if name == "bool":
            name = "bool_"
        return "_lpy_np." + name

    # Imported lazily: only the non-builtin path needs pyopencl.
    import pyopencl.tools as cl_tools
    return ("_lpy_cl_tools.get_or_register_dtype(\"%s\")"
            % cl_tools.dtype_to_ctype(dtype))
def maxpool2d(q, A, f, stride, out=None, indices=None):
    """Run the ``max_pool`` kernel over a 4-d array.

    :arg q: command queue used for allocation and kernel launch.
    :arg A: input array; its shape is unpacked as ``(n, c, h, w)``.
    :arg f: pooling window size (used for both dimensions).
    :arg stride: pooling stride (used for both dimensions).
    :arg out: optional preallocated output of shape
        ``(n, c, out_h, out_w)``; allocated when *None*.
    :arg indices: optional preallocated int32 index array of the same
        shape; allocated when *None*.
    :returns: the tuple ``(out, indices)``.
    """
    dtype = dtype_to_ctype(A.dtype)

    n, c, h, w = A.shape
    # Floor division keeps the output sizes integral; true division would
    # yield floats, which are invalid as array shapes and global sizes.
    out_h = (h - f) // stride + 1
    out_w = (w - f) // stride + 1

    if out is None:
        out = clarray.empty(q, (n, c, out_h, out_w), dtype=A.dtype)
    if indices is None:
        indices = clarray.empty(q, (n, c, out_h, out_w), dtype=np.int32)

    # NOTE(review): the cache key does not include the element type, so the
    # program built for the first dtype seen is reused for every later
    # call -- confirm that only one dtype is ever used.
    if 'max_pool' not in _kernel_cache:
        prg = cl.Program(clplatf.ctx, _maxpool_template % {
            'dtype': dtype
        }).build()
        _kernel_cache['max_pool'] = prg.max_pool
    krnl = _kernel_cache['max_pool']

    # TODO better global and local dimensions (make divisible by 64 etc.)
    ev = krnl(q, (n * c * out_h * out_w, ), None,
              A.data, out.data, indices.data,
              np.int32(h), np.int32(w),
              np.int32(out_h), np.int32(out_w),
              np.int32(f), np.int32(f),
              np.int32(stride), np.int32(stride))
    ev.wait()

    return out, indices
def get_fill_kernel(context, dtype):
    """Build the "fill" kernel performing ``z[i] = a``."""
    element_ctype = dtype_to_ctype(dtype)
    arguments = "%s *z, %s a" % (element_ctype, element_ctype)
    return get_elwise_kernel(context, arguments, "z[i] = a", name="fill")
def __init__(self, ctx, queue, data, symmetry_modes):
    """Store the inputs, build the ``sandpile.cl`` program and a diff kernel.

    The program is built with options derived from ``_gen_macros`` /
    ``_macros_to_options``. ``self._diff_krnl`` is a reduction that sums
    ``grid[i]!=new_grid[i]`` over two arrays of *data*'s element type.
    """
    self._ctx = ctx
    self._queue = queue
    self.symmetry_modes = symmetry_modes
    self.data = data

    cell_ctype = dtype_to_ctype(data.dtype)

    with open('sandpile.cl') as source_file:
        kernel_source = source_file.read()
    program = cl.Program(self._ctx, kernel_source)

    build_options = _macros_to_options(_gen_macros(data, symmetry_modes))
    self._program = program.build(options=build_options)

    from pyopencl.reduction import ReductionKernel
    diff_arguments = (
        'const __global %s *grid, const __global %s *new_grid'
        % (cell_ctype, cell_ctype)
    )
    self._diff_krnl = ReductionKernel(
        self._ctx,
        numpy.uint32,
        neutral='0',
        reduce_expr='a+b',
        map_expr='grid[i]!=new_grid[i]',
        arguments=diff_arguments,
    )
def get_write_kernel(self, index_dtype):
    """Render, build and return the ``<name_prefix>_write`` kernel.

    The source is rendered from ``_LIST_BUILDER_TEMPLATE`` with
    ``is_count_stage=False``. Scalar argument dtypes are set on the kernel
    before it is returned.
    """
    index_ctype = dtype_to_ctype(index_dtype)
    from pyopencl.tools import VectorArg, OtherArg
    # Arguments of the generated kernel vs. arguments forwarded to the
    # user-provided code, plus the call-site text joining the two.
    kernel_list_args = []
    kernel_list_arg_values = ""
    user_list_args = []

    for name, dtype in self.list_names_and_dtypes:
        list_name = "plb_%s_list" % name
        list_arg = VectorArg(dtype, list_name)

        kernel_list_args.append(list_arg)
        user_list_args.append(list_arg)

        # Lists listed in count_sharing get no start-index / index
        # arguments of their own.
        if name in self.count_sharing:
            kernel_list_arg_values += "%s, " % list_name
            continue

        kernel_list_args.append(
                VectorArg(index_dtype, "plb_%s_start_index" % name))

        index_name = "plb_%s_index" % name
        user_list_args.append(OtherArg("%s *%s" % (
            index_ctype, index_name), index_name))

        kernel_list_arg_values += "%s, &%s, " % (list_name, index_name)

    kernel_name = self.name_prefix + "_write"

    from pyopencl.characterize import has_double_support
    src = _LIST_BUILDER_TEMPLATE.render(
            is_count_stage=False,
            kernel_name=kernel_name,
            # Double support is only claimed if every device has it.
            double_support=all(has_double_support(dev)
                for dev in self.context.devices),
            debug=self.debug,
            do_not_vectorize=self.do_not_vectorize(),

            kernel_list_arg_decl=_get_arg_decl(kernel_list_args),
            kernel_list_arg_values=kernel_list_arg_values,
            user_list_arg_decl=_get_arg_decl(user_list_args),
            user_list_args=_get_arg_list(user_list_args),
            user_arg_decl=_get_arg_decl(self.arg_decls),
            user_args=_get_arg_list(self.arg_decls),

            list_names_and_dtypes=self.list_names_and_dtypes,
            count_sharing=self.count_sharing,
            name_prefix=self.name_prefix,
            generate_template=self.generate_template,
            preamble=self.preamble,

            index_type=index_ctype,
            )

    src = str(src)

    prg = cl.Program(self.context, src).build(self.options)
    knl = getattr(prg, kernel_name)

    from pyopencl.tools import get_arg_list_scalar_arg_dtypes
    # One trailing scalar of index_dtype follows the list and user arguments.
    knl.set_scalar_arg_dtypes(
            get_arg_list_scalar_arg_dtypes(kernel_list_args + self.arg_decls)
            + [index_dtype])

    return knl
def get_linear_combination_kernel(summand_descriptors,
        dtype_z):
    """Not implemented: always raises ``NotImplementedError``.

    Everything after the ``raise`` is unreachable. It builds a
    ``z[i] = a0*x0[i] + a1*x1[i] + ...`` kernel using texture references
    and is presumably kept as reference material for the pending port.
    """
    # TODO: Port this!
    raise NotImplementedError

    from pyopencl.tools import dtype_to_ctype
    from pyopencl.elementwise import \
            VectorArg, ScalarArg, get_elwise_module

    args = []
    preamble = []
    loop_prep = []
    summands = []
    tex_names = []

    for i, (is_gpu_scalar, scalar_dtype, vector_dtype) in \
            enumerate(summand_descriptors):
        if is_gpu_scalar:
            # Scalar coefficient lives on the device: fetch it through a
            # texture once before the loop.
            preamble.append(
                    "texture <%s, 1, cudaReadModeElementType> tex_a%d;"
                    % (dtype_to_ctype(scalar_dtype, with_fp_tex_hack=True), i))
            args.append(VectorArg(vector_dtype, "x%d" % i, with_offset=True))
            tex_names.append("tex_a%d" % i)
            loop_prep.append(
                    "%s a%d = fp_tex1Dfetch(tex_a%d, 0)"
                    % (dtype_to_ctype(scalar_dtype), i, i))
        else:
            # Scalar coefficient is passed as a plain kernel argument.
            args.append(ScalarArg(scalar_dtype, "a%d" % i))
            args.append(VectorArg(vector_dtype, "x%d" % i, with_offset=True))

        summands.append("a%d*x%d[i]" % (i, i))

    args.append(VectorArg(dtype_z, "z", with_offset=True))
    args.append(ScalarArg(np.uintp, "n"))

    mod = get_elwise_module(args,
            "z[i] = " + " + ".join(summands),
            "linear_combination",
            preamble="\n".join(preamble),
            loop_prep=";\n".join(loop_prep))

    func = mod.get_function("linear_combination")
    tex_src = [mod.get_texref(tn) for tn in tex_names]
    func.prepare("".join(arg.struct_char for arg in args),
            (1, 1, 1), texrefs=tex_src)

    return func, tex_src
def get_axpbyz_kernel(context, dtype_x, dtype_y, dtype_z,
        x_is_scalar=False, y_is_scalar=False):
    """Build the "axpbyz" kernel computing ``z = a*x + b*y``.

    ``a`` and ``b`` are declared with the C type of *dtype_z*. An operand
    flagged as scalar is read from element 0 of its buffer instead of
    element ``i``.
    """
    result_t = dtype_to_ctype(dtype_z)

    x_cplx = dtype_x.kind == "c"
    y_cplx = dtype_y.kind == "c"

    x_ref = "x[i]"
    if x_is_scalar:
        x_ref = "x[0]"
    y_ref = "y[i]"
    if y_is_scalar:
        y_ref = "y[0]"

    if dtype_z.kind != "c":
        # real-only
        scaled_x = f"a*(({result_t}) {x_ref})"
        scaled_y = f"b*(({result_t}) {y_ref})"
        result = f"{scaled_x} + {scaled_y}"
    else:
        # a and b will always be complex here.
        z_ct = complex_dtype_to_name(dtype_z)

        def scaled(coeff, operand, operand_is_complex):
            # Complex operands are cast to z's type and multiplied with
            # _mul; real ones use the complex-times-real helper.
            if operand_is_complex:
                return f"{z_ct}_mul({coeff}, {z_ct}_cast({operand}))"
            return f"{z_ct}_mulr({coeff}, {operand})"

        scaled_x = scaled("a", x_ref, x_cplx)
        scaled_y = scaled("b", y_ref, y_cplx)
        result = f"{z_ct}_add({scaled_x}, {scaled_y})"

    arguments = "{tp_z} *z, {tp_z} a, {tp_x} *x, {tp_z} b, {tp_y} *y".format(
        tp_x=dtype_to_ctype(dtype_x),
        tp_y=dtype_to_ctype(dtype_y),
        tp_z=dtype_to_ctype(dtype_z),
    )

    return get_elwise_kernel(
        context, arguments, "z[i] = %s" % result, name="axpbyz")
def get_fill_kernel(context, dtype):
    """Build the "fill" kernel performing ``z[i] = a``.

    The struct declaration for *dtype* (as produced by
    ``dtype_to_c_struct`` for ``context.devices[0]``) is passed as preamble.
    """
    element_ctype = dtype_to_ctype(dtype)
    arguments = "%s *z, %s a" % (element_ctype, element_ctype)
    struct_preamble = dtype_to_c_struct(context.devices[0], dtype)

    return get_elwise_kernel(
        context,
        arguments,
        "z[i] = a",
        preamble=struct_preamble,
        name="fill",
    )
def get_take_kernel(context, dtype, idx_dtype, vec_count=1):
    """Build the elementwise "take" kernel (no preamble).

    The generated body reads ``idx[i]`` and copies ``src<k>[src_idx]`` into
    ``dest<k>[i]`` for each of the *vec_count* array pairs.
    """
    ctx = {
        "idx_tp": dtype_to_ctype(idx_dtype),
        "tp": dtype_to_ctype(dtype),
    }
    vec_ids = range(vec_count)

    args = [VectorArg(dtype, "dest" + str(k), with_offset=True) for k in vec_ids]
    args.extend(VectorArg(dtype, "src" + str(k), with_offset=True) for k in vec_ids)
    args.append(VectorArg(idx_dtype, "idx", with_offset=True))

    gathers = ["dest%d[i] = src%d[src_idx];" % (k, k) for k in vec_ids]
    body = ("%(idx_tp)s src_idx = idx[i];\n" % ctx) + "\n".join(gathers)

    return get_elwise_kernel(context, args, body, name="take")
def get_count_kernel(self, index_dtype):
    """Render, build and return the ``<name_prefix>_count`` kernel.

    The source is rendered from ``_LIST_BUILDER_TEMPLATE`` with
    ``is_count_stage=True``. Scalar argument dtypes are set on the kernel
    before it is returned.
    """
    index_ctype = dtype_to_ctype(index_dtype)
    from pyopencl.tools import VectorArg, OtherArg

    # One count array per list that is not listed in count_sharing.
    kernel_list_args = [
            VectorArg(index_dtype, "plb_%s_count" % name)
            for name, dtype in self.list_names_and_dtypes
            if name not in self.count_sharing]

    user_list_args = []
    for name, dtype in self.list_names_and_dtypes:
        if name in self.count_sharing:
            continue

        # Rebinds the loop variable to the local-count argument name.
        name = "plb_loc_%s_count" % name
        user_list_args.append(OtherArg("%s *%s" % (
            index_ctype, name), name))

    kernel_name = self.name_prefix+"_count"

    from pyopencl.characterize import has_double_support
    src = _LIST_BUILDER_TEMPLATE.render(
            is_count_stage=True,
            kernel_name=kernel_name,
            # Double support is only claimed if every device has it.
            double_support=all(has_double_support(dev)
                for dev in self.context.devices),
            debug=self.debug,
            do_not_vectorize=self.do_not_vectorize(),
            eliminate_empty_output_lists=self.eliminate_empty_output_lists,

            kernel_list_arg_decl=_get_arg_decl(kernel_list_args),
            kernel_list_arg_values=_get_arg_list(user_list_args, prefix="&"),
            user_list_arg_decl=_get_arg_decl(user_list_args),
            user_list_args=_get_arg_list(user_list_args),
            user_arg_decl_with_offset=_get_arg_decl(self.arg_decls),
            user_arg_decl_no_offset=_get_arg_decl(self.arg_decls_no_offset),
            user_args_no_offset=_get_arg_list(self.arg_decls_no_offset),
            arg_offset_adjustment=get_arg_offset_adjuster_code(self.arg_decls),

            list_names_and_dtypes=self.list_names_and_dtypes,
            count_sharing=self.count_sharing,
            name_prefix=self.name_prefix,
            generate_template=self.generate_template,
            preamble=self.preamble,

            index_type=index_ctype,
            )

    src = str(src)

    prg = cl.Program(self.context, src).build(self.options)
    knl = getattr(prg, kernel_name)

    from pyopencl.tools import get_arg_list_scalar_arg_dtypes
    # One trailing scalar of index_dtype follows the list and user arguments.
    knl.set_scalar_arg_dtypes(get_arg_list_scalar_arg_dtypes(
        kernel_list_args+self.arg_decls) + [index_dtype])

    return knl
def get_scan_kernel(self, index_dtype):
    """Build a sum ``GenericScanKernel`` over ``ary``.

    Each scanned item is written one slot further along
    (``ary[i+1] = item``).
    """
    from pyopencl.scan import GenericScanKernel

    argument_decl = "__global %s *ary" % dtype_to_ctype(index_dtype)

    return GenericScanKernel(
        self.context,
        index_dtype,
        arguments=argument_decl,
        input_expr="ary[i]",
        scan_expr="a+b",
        neutral="0",
        output_statement="ary[i+1] = item;",
        devices=self.devices,
    )
def get_subset_minmax_kernel(ctx, what, dtype, dtype_subset):
    """Build a min/max ``ReductionKernel`` over ``in[lookup_tbl[i]]``.

    *what* is spliced into the reduce expression: ``f<what>(a,b)`` for
    floating-point *dtype*, ``<what>(a,b)`` for integer kinds.

    :raises TypeError: for any other dtype kind.
    """
    kind = dtype.kind
    if kind == "f":
        reduce_expr = "f%s(a,b)" % what
    elif kind in "iu":
        reduce_expr = "%s(a,b)" % what
    else:
        raise TypeError("unsupported dtype specified")

    neutral = get_minmax_neutral(what, dtype)

    tp = dtype_to_ctype(dtype)
    tp_lut = dtype_to_ctype(dtype_subset)
    argument_decl = "const %s *lookup_tbl, const %s *in" % (tp_lut, tp)

    return ReductionKernel(
        ctx,
        dtype,
        neutral=neutral,
        reduce_expr=reduce_expr,
        map_expr="in[lookup_tbl[i]]",
        arguments=argument_decl,
        preamble="#define MY_INFINITY (1./0)",
    )
def get_divide_kernel(context, dtype_x, dtype_y, dtype_z,
        x_is_scalar=False, y_is_scalar=False):
    """Build the "divide" kernel computing ``z[i] = x / y``.

    An operand flagged as scalar is read from element 0 of its buffer
    instead of element ``i``. Operands are cast towards *dtype_z* before the
    division, and a complex result is finally wrapped in ``<name>_cast``.
    """
    x_cplx = dtype_x.kind == "c"
    y_cplx = dtype_y.kind == "c"
    z_cplx = dtype_z.kind == "c"

    def z_func(suffix, *operands):
        # Spell a call to the complex helper named after dtype_z.
        return "{}_{}({})".format(
            complex_dtype_to_name(dtype_z), suffix, ", ".join(operands))

    numer = "x[i]"
    if x_is_scalar:
        numer = "x[0]"
    denom = "y[i]"
    if y_is_scalar:
        denom = "y[0]"

    if z_cplx and dtype_x != dtype_y:
        if x_cplx and dtype_x != dtype_z:
            numer = z_func("cast", numer)
        if y_cplx and dtype_y != dtype_z:
            denom = z_func("cast", denom)
    else:
        # Plain C casts to the result type.
        if dtype_x != dtype_z:
            numer = f"({dtype_to_ctype(dtype_z)}) ({numer})"
        if dtype_y != dtype_z:
            denom = f"({dtype_to_ctype(dtype_z)}) ({denom})"

    if x_cplx and y_cplx:
        quotient = z_func("divide", numer, denom)
    elif y_cplx:
        quotient = z_func("rdivide", numer, denom)
    elif x_cplx:
        quotient = z_func("divider", numer, denom)
    else:
        quotient = f"{numer} / {denom}"

    if z_cplx:
        quotient = z_func("cast", quotient)

    arguments = "{tp_z} *z, {tp_x} *x, {tp_y} *y".format(
        tp_x=dtype_to_ctype(dtype_x),
        tp_y=dtype_to_ctype(dtype_y),
        tp_z=dtype_to_ctype(dtype_z),
    )

    return get_elwise_kernel(
        context, arguments, "z[i] = %s" % quotient, name="divide")
def get_reduction_kernel(stage,
         ctx, dtype_out,
         neutral, reduce_expr, arguments=None,
         name="reduce_kernel", preamble="",
         map_exprs=None,
         device=None, options=[], max_group_size=None):
    """Generate, build and return one stage of a reduction kernel.

    :arg stage: 1 or 2; selects the default map expression and how
        *arguments* is parsed.
    :arg map_exprs: list of map expressions. *None* entries are replaced
        **in place** with a stage-specific default, so the caller's list is
        modified.
    :arg options: passed to ``Program.build``; the default list is shared
        between calls but is not mutated here.
    :returns: the object produced by ``_get_reduction_source``, with
        ``program``, ``kernel`` and ``arg_types`` attributes set.
    :raises ValueError: if *map_exprs* is *None*.
    """

    if map_exprs is None:
        raise ValueError("map_exprs has to be given!")

    for i, m in enumerate(map_exprs):
        if m is None:
            if stage == 2:
                # Stage 2 reads the intermediate buffer prepended below.
                map_exprs[i] = "pyopencl_reduction_inp_%i[i]" % i
            else:
                map_exprs[i] = "in[i]"

    from pyopencl.tools import (
            parse_arg_list, get_arg_list_scalar_arg_dtypes,
            get_arg_offset_adjuster_code, VectorArg)

    arg_prep = ""
    if stage == 1 and arguments is not None:
        arguments = parse_arg_list(arguments, with_offset=True)
        arg_prep = get_arg_offset_adjuster_code(arguments)

    if stage == 2 and arguments is not None:
        arguments = parse_arg_list(arguments)
        # One intermediate input buffer per map expression comes first.
        arguments = (
                [VectorArg(dtype_out, "pyopencl_reduction_inp_%i" % i)
                    for i in range(len(map_exprs))]
                + arguments)

    inf = _get_reduction_source(
            ctx, dtype_to_ctype(dtype_out), dtype_out.itemsize,
            neutral, reduce_expr, map_exprs, arguments,
            name, preamble, arg_prep, device, max_group_size)

    inf.program = cl.Program(ctx, inf.source)
    inf.program.build(options)
    inf.kernel = getattr(inf.program, name)

    inf.arg_types = arguments

    # Leading entries are None (one per map expression), then an int64,
    # the user arguments, and two trailing uint32 scalars.
    inf.kernel.set_scalar_arg_dtypes(
            [None, ]*len(map_exprs)+[np.int64]
            + get_arg_list_scalar_arg_dtypes(inf.arg_types)
            + [np.uint32]*2)

    return inf
def get_subset_dot_kernel(ctx, dtype_out, dtype_subset, dtype_a=None,
                          dtype_b=None, conjugate_first=False):
    """Build a reduction kernel for a dot product over a subset of indices.

    The elements taking part are selected through ``lookup_tbl``: the map
    expression is generated by ``_get_dot_expr`` with
    ``index_expr="lookup_tbl[i]"``.

    Args:
        ctx: context; ``ctx.devices[0]`` is queried for double support.
        dtype_out: requested result dtype. ``_get_dot_expr`` may replace
            it, and the replaced value is what the kernel uses.
        dtype_subset: numpy dtype of the ``lookup_tbl`` entries.
        dtype_a: numpy dtype of ``a``.
        dtype_b: numpy dtype of ``b``. ``_get_dot_expr`` may replace it.
        conjugate_first: forwarded to ``_get_dot_expr``.

    Returns:
        A ``ReductionKernel`` taking ``lookup_tbl``, ``a`` and ``b``.
    """
    from pyopencl.characterize import has_double_support

    map_expr, dtype_out, dtype_b = _get_dot_expr(
        dtype_out, dtype_a, dtype_b, conjugate_first,
        has_double_support=has_double_support(ctx.devices[0]),
        index_expr="lookup_tbl[i]")

    reduce_expr = "a+b"
    neutral_expr = "0"
    if dtype_out.kind == "c":
        # Complex results are combined through the complex helper
        # functions, exactly as get_dot_kernel does, instead of the
        # scalar "+" operator and a bare "0" literal.
        from pyopencl.elementwise import complex_dtype_to_name
        dtname = complex_dtype_to_name(dtype_out)
        reduce_expr = "%s_add(a, b)" % dtname
        neutral_expr = "%s_new(0, 0)" % dtname

    # important: lookup_tbl must be first--it controls the length
    return ReductionKernel(
        ctx, dtype_out, neutral=neutral_expr,
        reduce_expr=reduce_expr, map_expr=map_expr,
        arguments=(
            "const %(tp_lut)s *lookup_tbl, "
            "const %(tp_a)s *a, "
            "const %(tp_b)s *b" % {
                "tp_lut": dtype_to_ctype(dtype_subset),
                "tp_a": dtype_to_ctype(dtype_a),
                "tp_b": dtype_to_ctype(dtype_b),
            }))
def python_dtype_str_inner(self, dtype):
    """Return Python source text that evaluates to *dtype*.

    Args:
        dtype: a numpy dtype.

    Returns:
        str: for built-in dtypes, an expression of the form
        ``_lpy_np.dtype(_lpy_np.<name>)``; for all others, a
        ``_lpy_cl_tools.get_or_register_dtype("<ctype>")`` call where
        ``<ctype>`` comes from ``pyopencl.tools.dtype_to_ctype``.
    """
    if dtype.isbuiltin:
        # The boolean dtype is named "bool", but ``np.bool`` is missing on
        # NumPy 1.24-1.26 and the former ``np.bool8`` alias was removed in
        # NumPy 2.0. ``np.bool_`` exists on every release.
        name = "bool_" if dtype.name == "bool" else dtype.name
        return f"_lpy_np.dtype(_lpy_np.{name})"

    # Only the non-built-in path needs pyopencl, so import it here.
    import pyopencl.tools as cl_tools
    return ('_lpy_cl_tools.get_or_register_dtype("%s")'
            % cl_tools.dtype_to_ctype(dtype))
def get_copy_kernel(context, dtype_dest, dtype_src):
    """Build the elementwise kernel that copies ``src[i]`` into ``dest[i]``.

    Args:
        context: passed to ``get_elwise_kernel``; ``context.devices[0]``
            is used when generating the struct preamble.
        dtype_dest: numpy dtype of the destination array.
        dtype_src: numpy dtype of the source array.

    Returns:
        Whatever ``get_elwise_kernel`` returns for the generated source.

    Raises:
        TypeError: if the dtypes differ and either one is a struct
            (kind ``"V"``) type.
    """
    # Reject incompatible struct copies before generating any code.
    if dtype_dest != dtype_src and "V" in (dtype_dest.kind, dtype_src.kind):
        raise TypeError("copying between non-identical struct types")

    src = "src[i]"
    if dtype_dest.kind == "c":
        if dtype_src.kind != "c":
            # Real source, complex destination: promote the value first.
            src = "%s_fromreal(%s)" % (complex_dtype_to_name(dtype_dest), src)
        if dtype_src != dtype_dest:
            # Kept as a plain string. A trailing comma here used to make
            # this a 1-tuple that only worked because "%" unpacks tuples.
            src = "%s_cast(%s)" % (complex_dtype_to_name(dtype_dest), src)

    return get_elwise_kernel(
        context,
        "%(tp_dest)s *dest, %(tp_src)s *src" % {
            "tp_dest": dtype_to_ctype(dtype_dest),
            "tp_src": dtype_to_ctype(dtype_src),
        },
        "dest[i] = %s" % src,
        preamble=dtype_to_c_struct(context.devices[0], dtype_dest),
        name="copy")
def dtype_to_ctype(dtype):
    """Translate a numpy data type into its CL type string.

    Args:
        dtype (np.dtype): the numpy data type to translate

    Returns:
        str: the CL type string that corresponds to the given type
    """
    # The import is aliased so the pyopencl helper is not confused with
    # this wrapper, which carries the same name.
    from pyopencl.tools import dtype_to_ctype as _cl_dtype_to_ctype

    return _cl_dtype_to_ctype(dtype)
def get_copy_kernel(context, dtype_dest, dtype_src):
    """Build the elementwise kernel that copies ``src[i]`` into ``dest[i]``.

    Args:
        context: passed to ``get_elwise_kernel``; ``context.devices[0]``
            is used when generating the struct preamble.
        dtype_dest: numpy dtype of the destination array.
        dtype_src: numpy dtype of the source array.

    Returns:
        Whatever ``get_elwise_kernel`` returns for the generated source.

    Raises:
        TypeError: if the dtypes differ and either one is a struct
            (kind ``"V"``) type.
    """
    # Reject incompatible struct copies before generating any code.
    if dtype_dest != dtype_src and "V" in (dtype_dest.kind, dtype_src.kind):
        raise TypeError("copying between non-identical struct types")

    src = "src[i]"
    if dtype_dest.kind == "c":
        if dtype_src.kind != "c":
            # Real source, complex destination: promote the value first.
            src = "{}_fromreal({})".format(
                complex_dtype_to_name(dtype_dest), src)
        if dtype_src != dtype_dest:
            # Kept as a plain string. A trailing comma here used to make
            # this a 1-tuple that only worked because "%" unpacks tuples.
            src = "{}_cast({})".format(
                complex_dtype_to_name(dtype_dest), src)

    return get_elwise_kernel(
        context,
        "{tp_dest} *dest, {tp_src} *src".format(
            tp_dest=dtype_to_ctype(dtype_dest),
            tp_src=dtype_to_ctype(dtype_src),
        ),
        "dest[i] = %s" % src,
        preamble=dtype_to_c_struct(context.devices[0], dtype_dest),
        name="copy")
def get_axpbz_kernel(context, dtype_a, dtype_x, dtype_b, dtype_z):
    """Build the elementwise kernel that stores ``a*x[i] + b`` into ``z[i]``.

    Args:
        context: passed through unchanged to ``get_elwise_kernel``.
        dtype_a: numpy dtype of the scalar ``a``.
        dtype_x: numpy dtype of the array ``x``.
        dtype_b: numpy dtype of the scalar ``b``.
        dtype_z: numpy dtype of the result array ``z``.

    Returns:
        Whatever ``get_elwise_kernel`` returns for the generated source.
    """
    def z_call(suffix, *operands):
        # Looked up lazily: the complex type name of z is only requested
        # on paths that actually emit a complex helper call.
        return "%s_%s(%s)" % (
            complex_dtype_to_name(dtype_z), suffix, ", ".join(operands))

    complex_a, complex_x, complex_b, complex_z = (
        dt.kind == "c" for dt in (dtype_a, dtype_x, dtype_b, dtype_z))

    if complex_a and complex_x:
        # Both factors complex: bring each to z's type where it differs,
        # then multiply through the complex helper.
        factor = "a" if dtype_a == dtype_z else z_call("cast", "a")
        element = "x[i]" if dtype_x == dtype_z else z_call("cast", "x[i]")
        product = z_call("mul", factor, element)
    else:
        product = "a*x[i]"

    offset = "b"
    if complex_z:
        if not complex_b:
            offset = z_call("fromreal", offset)
        if not (complex_a or complex_x):
            product = z_call("fromreal", product)
        # With a complex result, both summands always get a final cast.
        product = z_call("cast", product)
        offset = z_call("cast", offset)

    signature = "%(tp_z)s *z, %(tp_a)s a, %(tp_x)s *x,%(tp_b)s b" % {
        "tp_a": dtype_to_ctype(dtype_a),
        "tp_x": dtype_to_ctype(dtype_x),
        "tp_b": dtype_to_ctype(dtype_b),
        "tp_z": dtype_to_ctype(dtype_z),
    }
    return get_elwise_kernel(
        context, signature, "z[i] = %s + %s" % (product, offset),
        name="axpb")
def get_reduction_kernel(stage, ctx, dtype_out, neutral, reduce_expr,
                         map_expr=None, arguments=None,
                         name="reduce_kernel", preamble="", device=None,
                         options=None, max_group_size=None):
    """Generate, build and configure one stage of a reduction kernel.

    Args:
        stage: reduction stage; ``2`` selects the stage-2 default map
            expression and prepends the intermediate input vector.
        ctx: context used to create the ``cl.Program``.
        dtype_out: numpy dtype of the reduction result.
        neutral: neutral-element expression, forwarded to
            ``_get_reduction_source``.
        reduce_expr: reduction expression, forwarded likewise.
        map_expr: map expression; ``None`` selects a stage-dependent
            default.
        arguments: kernel argument description accepted by
            ``pyopencl.tools.parse_arg_list``. Required.
        name: name of the generated kernel; also the attribute looked up
            on the built program.
        preamble: forwarded to ``_get_reduction_source``.
        device: forwarded to ``_get_reduction_source``.
        options: build options handed to ``Program.build``.
        max_group_size: forwarded to ``_get_reduction_source``.

    Returns:
        A ``_ReductionInfo`` bundling context, source, group size,
        program, kernel and the parsed argument list.

    Raises:
        ValueError: if *arguments* is ``None``.
    """
    from pyopencl.tools import (parse_arg_list,
                                get_arg_list_scalar_arg_dtypes,
                                get_arg_offset_adjuster_code, VectorArg)

    if map_expr is None:
        map_expr = "pyopencl_reduction_inp[i]" if stage == 2 else "in[i]"

    if arguments is None:
        raise ValueError("arguments must not be None")

    parsed_args = parse_arg_list(arguments, with_offset=True)
    # The offset-adjuster code is derived from the user arguments only,
    # i.e. before the stage-2 input vector is prepended below.
    offset_prep = get_arg_offset_adjuster_code(parsed_args)

    if stage == 2:
        parsed_args = [
            VectorArg(dtype_out, "pyopencl_reduction_inp"),
            *parsed_args,
        ]

    source, group_size = _get_reduction_source(
        ctx, dtype_to_ctype(dtype_out), dtype_out.itemsize, neutral,
        reduce_expr, map_expr, parsed_args, name, preamble, offset_prep,
        device, max_group_size)

    program = cl.Program(ctx, source)
    program.build(options)
    kernel = getattr(program, name)

    # Slot layout: one non-scalar slot, an int64, the dtypes of the parsed
    # arguments, three more int64 values, then a uint32 and an int64.
    kernel.set_scalar_arg_dtypes([
        None,
        np.int64,
        *get_arg_list_scalar_arg_dtypes(parsed_args),
        np.int64,
        np.int64,
        np.int64,
        np.uint32,
        np.int64,
    ])

    return _ReductionInfo(
        context=ctx,
        source=source,
        group_size=group_size,
        program=program,
        kernel=kernel,
        arg_types=parsed_args)
def get_put_kernel(context, dtype, idx_dtype, vec_count=1):
    """Build the elementwise "put" kernel: ``dest[gmem_dest_idx[i]] = src[i]``.

    One assignment is generated for each of the *vec_count*
    destination/source pairs; all pairs share the same index vector.

    Args:
        context: passed through unchanged to ``get_elwise_kernel``.
        dtype: numpy dtype of the destination and source vectors.
        idx_dtype: numpy dtype of the index vector.
        vec_count: number of destination/source vector pairs.

    Returns:
        Whatever ``get_elwise_kernel`` returns for the generated source.
    """
    type_names = {
        "idx_tp": dtype_to_ctype(idx_dtype),
        "tp": dtype_to_ctype(dtype),
    }

    # Argument order: all destinations, the index vector, all sources.
    kernel_args = []
    for k in range(vec_count):
        kernel_args.append(VectorArg(dtype, "dest%d" % k, with_offset=True))
    kernel_args.append(
        VectorArg(idx_dtype, "gmem_dest_idx", with_offset=True))
    for k in range(vec_count):
        kernel_args.append(VectorArg(dtype, "src%d" % k, with_offset=True))

    index_load = "%(idx_tp)s dest_idx = gmem_dest_idx[i];\n" % type_names
    stores = ["dest%d[dest_idx] = src%d[i];" % (k, k)
              for k in range(vec_count)]
    body = index_load + "\n".join(stores)

    return get_elwise_kernel(context, kernel_args, body, name="put")
def get_arange_kernel(context, dtype):
    """Build the elementwise kernel that stores ``start + i*step`` into ``z[i]``.

    Args:
        context: passed through unchanged to ``get_elwise_kernel``.
        dtype: numpy dtype of ``z``, ``start`` and ``step``.

    Returns:
        Whatever ``get_elwise_kernel`` returns for the generated source.
    """
    if dtype.kind == "c":
        # Complex values are combined through the complex helper
        # functions rather than the C "+" and "*" operators.
        # NOTE(review): assumes the complex header in use provides
        # <name>_rmul(real, complex) and <name>_add -- confirm.
        root = complex_dtype_to_name(dtype)
        expr = f"{root}_add(start, {root}_rmul(i, step))"
    else:
        # The generated text for real dtypes is unchanged.
        expr = "start + (%s) i*step" % dtype_to_ctype(dtype)

    return get_elwise_kernel(
        context,
        [
            VectorArg(dtype, "z", with_offset=True),
            ScalarArg(dtype, "start"),
            ScalarArg(dtype, "step"),
        ],
        "z[i] = " + expr,
        name="arange")
def get_arange_kernel(context, dtype):
    """Build the elementwise kernel that stores ``start + i*step`` into ``z[i]``.

    Args:
        context: passed through unchanged to ``get_elwise_kernel``.
        dtype: numpy dtype of ``z``, ``start`` and ``step``.

    Returns:
        Whatever ``get_elwise_kernel`` returns for the generated source.
    """
    if dtype.kind != "c":
        # Non-complex dtypes: cast the index and use the C operators.
        value = "start + ((%s) i)*step" % dtype_to_ctype(dtype)
    else:
        # Complex dtypes go through the <name>_add / <name>_rmul helpers.
        prefix = complex_dtype_to_name(dtype)
        value = f"{prefix}_add(start, {prefix}_rmul(i, step))"

    kernel_args = [
        VectorArg(dtype, "z", with_offset=True),
        ScalarArg(dtype, "start"),
        ScalarArg(dtype, "step"),
    ]
    return get_elwise_kernel(
        context, kernel_args, "z[i] = " + value, name="arange")
def get_dot_kernel(ctx, dtype_out, dtype_a=None, dtype_b=None,
                   conjugate_first=False):
    """Build a reduction kernel for the dot product of ``a`` and ``b``.

    Args:
        ctx: context; ``ctx.devices[0]`` is queried for double support.
        dtype_out: requested result dtype. ``_get_dot_expr`` may replace
            it, and the replaced value is what the kernel uses.
        dtype_a: numpy dtype of ``a``.
        dtype_b: numpy dtype of ``b``. ``_get_dot_expr`` may replace it.
        conjugate_first: forwarded to ``_get_dot_expr``.

    Returns:
        A ``ReductionKernel`` taking ``a`` and ``b``.
    """
    from pyopencl.characterize import has_double_support

    doubles_available = has_double_support(ctx.devices[0])
    map_expr, dtype_out, dtype_b = _get_dot_expr(
        dtype_out, dtype_a, dtype_b, conjugate_first,
        has_double_support=doubles_available)

    if dtype_out.kind == "c":
        # Complex results are summed and initialised through the complex
        # helper functions instead of "+" and a bare "0".
        from pyopencl.elementwise import complex_dtype_to_name
        prefix = complex_dtype_to_name(dtype_out)
        combine = "%s_add(a, b)" % prefix
        identity = "%s_new(0, 0)" % prefix
    else:
        combine = "a+b"
        identity = "0"

    signature = "const %(tp_a)s *a, const %(tp_b)s *b" % {
        "tp_a": dtype_to_ctype(dtype_a),
        "tp_b": dtype_to_ctype(dtype_b),
    }
    return ReductionKernel(
        ctx, dtype_out, neutral=identity, reduce_expr=combine,
        map_expr=map_expr, arguments=signature)