def __init__(self, dtype_out, neutral, reduce_expr, map_expr=None, arguments=None, name="reduce_kernel", keep=False, options=None, preamble=""): self.dtype_out = dtype_out self.block_size = 512 s1_func, self.stage1_arg_types = get_reduction_kernel_and_types( dtype_to_ctype(dtype_out), self.block_size, neutral, reduce_expr, map_expr, arguments, name=name+"_stage1", keep=keep, options=options, preamble=preamble) self.stage1_func = s1_func.prepared_async_call # stage 2 has only one input and no map expression s2_func, self.stage2_arg_types = get_reduction_kernel_and_types( dtype_to_ctype(dtype_out), self.block_size, neutral, reduce_expr, name=name+"_stage2", keep=keep, options=options, preamble=preamble) self.stage2_func = s2_func.prepared_async_call assert [i for i, arg_tp in enumerate(self.stage1_arg_types) if arg_tp == "P"], \ "ReductionKernel can only be used with functions that have at least one " \ "vector argument"
def get_divscalar_function(src_type, dest_type, pitch = True): type_src = dtype_to_ctype(src_type) type_dest = dtype_to_ctype(dest_type) name = "divscalar" operation = "/" if pitch: func = SourceModule( pitch_left_scalar_op_template % { "name": name, "src_type": type_src, "dest_type": type_dest, "operation": operation, }, options=["--ptxas-options=-v"]).get_function(name) func.prepare([np.int32, np.int32, np.intp, np.int32, np.intp, np.int32, _get_type(dest_type)]) else: func = SourceModule( non_pitch_left_scalar_op_template % { "name": name, "src_type": type_src, "dest_type": type_dest, "operation": operation, }, options=["--ptxas-options=-v"]).get_function(name) func.prepare([np.intp, np.intp, _get_type(dest_type), np.int32]) return func
def get_dot_kernel(dtype_out, dtype_a, dtype_b): return ReductionKernel(dtype_out, neutral="0", reduce_expr="a+b", map_expr="a[i]*b[i]", arguments="const %(tp_a)s *a, const %(tp_b)s *b" % { "tp_a": dtype_to_ctype(dtype_a), "tp_b": dtype_to_ctype(dtype_b), }, keep=True)
def get_complex_from_amp_function(in_type, result_type, pitch = True): type_in = dtype_to_ctype(in_type) type_result = dtype_to_ctype(result_type) name = "makecomplex_amp_phase" if pitch: func = SourceModule( pitch_complex_amp_template % { "name": name, "in_type": type_in, "result_type": type_result, "fletter": 'f' if in_type == np.float32 else '' }, options=["--ptxas-options=-v"]).get_function(name) func.prepare('iiPiPPi')#[np.int32, np.int32, np.intp, np.int32, # np.intp, np.intp, np.int32]) else: func = SourceModule( non_pitch_complex_amp_template % { "name": name, "in_type": type_in, "result_type": type_result, "fletter": 'f' if in_type == np.float32 else '' }, options=["--ptxas-options=-v"]).get_function(name) func.prepare('PPPi')#[np.intp, np.intp, np.intp, np.int32]) return func
def get_astype_function(dtype_dest, dtype_src, pitch = True): type_dest = dtype_to_ctype(dtype_dest) type_src = dtype_to_ctype(dtype_src) name = "astype" operation = "" if pitch: func = SourceModule( pitch_template % { "name": name, "dest_type": type_dest, "src_type": type_src, "operation": operation, }, options=["--ptxas-options=-v"]).get_function(name) func.prepare('iiPiPi') # [np.int32, np.int32, np.intp, np.int32, np.intp, np.int32]) else: func = SourceModule( non_pitch_template % { "name": name, "dest_type": type_dest, "src_type": type_src, "operation": operation, }, options=["--ptxas-options=-v"]).get_function(name) func.prepare('PPi')#[np.intp, np.intp, np.int32]) return func
def get_divarray_function(left_dtype, right_dtype, rslt_dtype, pitch = True): type_left = dtype_to_ctype(left_dtype) type_right = dtype_to_ctype(right_dtype) type_rslt = dtype_to_ctype(rslt_dtype) name = "divarray" operation = "/" if pitch: func = SourceModule( pitch_array_op_template % { "name": name, "dest_type": type_rslt, "left_type": type_left, "right_type": type_right, "operation": operation, }, options=["--ptxas-options=-v"]).get_function(name) func.prepare('iiPiPiPi')#[np.int32, np.int32, np.intp, np.int32, # np.intp, np.int32, np.intp, np.int32]) else: func = SourceModule( non_pitch_array_op_template % { "name": name, "dest_type": type_rslt, "left_type": type_left, "right_type": type_right, "operation": operation, }, options=["--ptxas-options=-v"]).get_function(name) func.prepare('PPPi')#[np.intp, np.intp, np.intp, np.int32]) return func
def get_take_kernel(dtype, idx_dtype, vec_count=1): ctx = { "idx_tp": dtype_to_ctype(idx_dtype), "tp": dtype_to_ctype(dtype), "tex_tp": dtype_to_ctype(dtype, with_fp_tex_hack=True), } args = [VectorArg(idx_dtype, "idx")] + [ VectorArg(dtype, "dest"+str(i))for i in range(vec_count)] + [ ScalarArg(np.intp, "n") ] preamble = "#include <pycuda-helpers.hpp>\n\n" + "\n".join( "texture <%s, 1, cudaReadModeElementType> tex_src%d;" % (ctx["tex_tp"], i) for i in range(vec_count)) body = ( ("%(idx_tp)s src_idx = idx[i];\n" % ctx) + "\n".join( "dest%d[i] = fp_tex1Dfetch(tex_src%d, src_idx);" % (i, i) for i in range(vec_count))) mod = get_elwise_module(args, body, "take", preamble=preamble) func = mod.get_function("take") tex_src = [mod.get_texref("tex_src%d" % i) for i in range(vec_count)] func.prepare("P"+(vec_count*"P")+np.dtype(np.uintp).char, texrefs=tex_src) return func, tex_src
def get_angle_function(dtypein, dtypeout, pitch = True): type_src = dtype_to_ctype(dtypein) type_dest = dtype_to_ctype(dtypeout) name = "angle_function" if dtypeout == np.float32: fletter = "f" else: fletter = "" if pitch: func = SourceModule( pitch_angle_template % { "name": name, "dest_type": type_dest, "src_type": type_src, "fletter": fletter, }, options=["--ptxas-options=-v"]).get_function(name) func.prepare('iiPiPi') # [np.int32, np.int32, np.intp, np.int32, np.intp, np.int32]) else: func = SourceModule( non_pitch_angle_template % { "name": name, "dest_type": type_dest, "src_type": type_src, "fletter": fletter, }, options=["--ptxas-options=-v"]).get_function(name) func.prepare('PPi')#[np.intp, np.intp, np.int32]) return func
def get_divarray_function(left_dtype, right_dtype, rslt_dtype, pitch = True): type_left = dtype_to_ctype(left_dtype) type_right = dtype_to_ctype(right_dtype) type_rslt = dtype_to_ctype(rslt_dtype) name = "divarray" operation = "/" if pitch: func = func_compile(name, pitch_array_op_template % {"name": name, "dest_type": type_rslt, "left_type": type_left, "right_type": type_right, "operation": operation, }) func.prepare([np.int32, np.int32, np.intp, np.int32, np.intp, np.int32, np.intp, np.int32]) else: func = func_compile(name, non_pitch_array_op_template % {"name": name, "dest_type": type_rslt, "left_type": type_left, "right_type": type_right, "operation": operation, }) func.prepare([np.intp, np.intp, np.intp, np.int32]) return func
def get_powscalar_function(src_type, dest_type, pitch = True): type_src = dtype_to_ctype(src_type) type_dest = dtype_to_ctype(dest_type) name = "powscalar" operation = "pow" if pitch: func = SourceModule( pitch_left_scalar_func_template % { "name": name, "src_type": type_src, "dest_type": type_dest, "operation": operation, "fletter": 'f' if src_type == np.float32 else '', }, options=["--ptxas-options=-v"]).get_function(name) func.prepare('iiPiPi'+np.dtype(dest_type).char)#[np.int32, np.int32, np.intp, np.int32, # np.intp, np.int32, _get_type(dest_type)]) else: func = SourceModule( non_pitch_left_scalar_func_template % { "name": name, "src_type": type_src, "dest_type": type_dest, "operation": operation, "fletter": 'f' if src_type == np.float32 else '', }, options=["--ptxas-options=-v"]).get_function(name) func.prepare('PP'+np.dtype(dest_type).char+'i')#[np.intp, np.intp, _get_type(dest_type), np.int32]) return func
def _get_eigsq_kernel(dtype_s, dtype_q): template = """ #include <pycuda/pycuda-complex.hpp> __global__ void eigsq_Kernel(%(types)s* d_S, %(typeq)s* d_q, %(types)s thres, int size) { int tid = threadIdx.x + blockIdx.x * blockDim.x; int total = blockDim.x * gridDim.x; for(int i = tid; i < size; i += total) { %(types)s s = d_S[i]; %(typeq)s q = d_q[i]; if(fabs%(iff)s(s) > thres) { d_q[i] = q / s; }else { d_q[i] = 0.0; } } } """ mod = SourceModule(template % { "types": dtype_to_ctype(dtype_s), "typeq": dtype_to_ctype(dtype_q), "iff": "f" if dtype_q == np.float32 else ""}) func = mod.get_function("eigsq_Kernel") func.prepare([np.intp, np.intp, np.double if dtype_s == np.double else np.float32, np.int32]) return func
def get_by_index(src_gpu, ind): """ Get values in a GPUArray by index. Parameters ---------- src_gpu : pycuda.gpuarray.GPUArray GPUArray instance from which to extract values. ind : pycuda.gpuarray.GPUArray or numpy.ndarray Array of element indices to set. Must have an integer dtype. Returns ------- res_gpu : pycuda.gpuarray.GPUArray GPUArray with length of `ind` and dtype of `src_gpu` containing selected values. Examples -------- >>> import pycuda.gpuarray as gpuarray >>> import pycuda.autoinit >>> import numpy as np >>> import misc >>> src = np.random.rand(5).astype(np.float32) >>> src_gpu = gpuarray.to_gpu(src) >>> ind = gpuarray.to_gpu(np.array([0, 2, 4])) >>> res_gpu = misc.get_by_index(src_gpu, ind) >>> np.allclose(res_gpu.get(), src[[0, 2, 4]]) True Notes ----- Only supports 1D index arrays. May not be efficient for certain index patterns because of lack of inability to coalesce memory operations. """ # Only support 1D index arrays: assert len(np.shape(ind)) == 1 assert issubclass(ind.dtype.type, numbers.Integral) N = len(ind) if not isinstance(ind, gpuarray.GPUArray): ind = gpuarray.to_gpu(ind) dest_gpu = gpuarray.empty(N, dtype=src_gpu.dtype) # Manually handle empty index array because it will cause the kernel to # fail if processed: if N == 0: return dest_gpu try: func = get_by_index.cache[(src_gpu.dtype, ind.dtype)] except KeyError: data_ctype = tools.dtype_to_ctype(src_gpu.dtype) ind_ctype = tools.dtype_to_ctype(ind.dtype) v = "{data_ctype} *dest, {ind_ctype} *ind, {data_ctype} *src".format(data_ctype=data_ctype, ind_ctype=ind_ctype) func = elementwise.ElementwiseKernel(v, "dest[i] = src[ind[i]]") get_by_index.cache[(src_gpu.dtype, ind.dtype)] = func func(dest_gpu, ind, src_gpu, range=slice(0, N, 1)) return dest_gpu
def get_scalardiv_function(src_type, dest_type, pitch = True): type_src = dtype_to_ctype(src_type) type_dest = dtype_to_ctype(dest_type) name = "scalardiv" operation = "/" if pitch: func = SourceModule( pitch_right_scalar_op_template % { "name": name, "src_type": type_src, "dest_type": type_dest, "operation": operation, }, options=["--ptxas-options=-v"]).get_function(name) func.prepare('iiPiPi'+np.dtype(dest_type).char)#[np.int32, np.int32, np.intp, np.int32, # np.intp, np.int32, _get_type(dest_type)]) else: func = SourceModule( non_pitch_right_scalar_op_template % { "name": name, "src_type": type_src, "dest_type": type_dest, "operation": operation, }, options=["--ptxas-options=-v"]).get_function(name) func.prepare('PP'+np.dtype(dest_type).char+'i')#[np.intp, np.intp, _get_type(dest_type), np.int32]) return func
def get_complex_function(real_type, imag_type, result_type, pitch = True): type_real = dtype_to_ctype(real_type) type_imag = dtype_to_ctype(imag_type) type_result = dtype_to_ctype(result_type) name = "makecomplex" if pitch: func = SourceModule( pitch_complex_template % { "name": name, "real_type": type_real, "imag_type": type_imag, "result_type": type_result }, options=["--ptxas-options=-v"]).get_function(name) func.prepare('iiPiPPi')#[np.int32, np.int32, np.intp, np.int32, # np.intp, np.intp, np.int32]) else: func = SourceModule( non_pitch_complex_template % { "name": name, "real_type": type_real, "imag_type": type_imag, "result_type": type_result }, options=["--ptxas-options=-v"]).get_function(name) func.prepare('PPPi')#[np.intp, np.intp, np.intp, np.int32]) return func
def get_norm_kernel(dtype_x, dtype_out): return ElementwiseKernel( "%(tp_x)s *x, %(tp_z)s *z" % { "tp_x": dtype_to_ctype(dtype_x), "tp_z": dtype_to_ctype(dtype_out), }, "z[i] = norm(x[i])", "normalize")
def get_accum_diff_sq_kernel(dtype_x, dtype_z): return ElementwiseKernel( "%(tp_a)s *x, %(tp_c)s *z" % { "tp_a": dtype_to_ctype(dtype_x), "tp_c": dtype_to_ctype(dtype_z), }, "x[i] += norm(z[i]) ", "chisq_accum")
def get_imag_kernel(dtype, real_dtype): return get_elwise_kernel( "%(tp)s *y, %(real_tp)s *z" % { "tp": dtype_to_ctype(dtype), "real_tp": dtype_to_ctype(real_dtype), }, "z[i] = imag(y[i])", "imag")
def get_axpbz_kernel(dtype_x, dtype_z): return get_elwise_kernel( "%(tp_z)s a, %(tp_x)s *x,%(tp_z)s b, %(tp_z)s *z" % { "tp_x": dtype_to_ctype(dtype_x), "tp_z": dtype_to_ctype(dtype_z) }, "z[i] = a * x[i] + b", "axpb")
def get_rdivide_elwise_kernel(dtype_x, dtype_z): return get_elwise_kernel( "%(tp_x)s *x, %(tp_z)s y, %(tp_z)s *z" % { "tp_x": dtype_to_ctype(dtype_x), "tp_z": dtype_to_ctype(dtype_z), }, "z[i] = y / x[i]", "divide_r")
def get_copy_kernel(dtype_dest, dtype_src): return get_elwise_kernel( "%(tp_dest)s *dest, %(tp_src)s *src" % { "tp_dest": dtype_to_ctype(dtype_dest), "tp_src": dtype_to_ctype(dtype_src), }, "dest[i] = src[i]", "copy")
def get_scalar_op_kernel(dtype_x, dtype_y, operator): return get_elwise_kernel( "%(tp_x)s *x, %(tp_a)s a, %(tp_y)s *y" % { "tp_x": dtype_to_ctype(dtype_x), "tp_y": dtype_to_ctype(dtype_y), "tp_a": dtype_to_ctype(dtype_x), }, "y[i] = x[i] %s a" % operator, "scalarop_kernel")
def get_binary_func_kernel(func, dtype_x, dtype_y, dtype_z): return get_elwise_kernel( "%(tp_x)s *x, %(tp_y)s *y, %(tp_z)s *z" % { "tp_x": dtype_to_ctype(dtype_x), "tp_y": dtype_to_ctype(dtype_y), "tp_z": dtype_to_ctype(dtype_z), }, "z[i] = %s(x[i], y[i])" % func, func+"_kernel")
def get_binary_op_kernel(dtype_x, dtype_y, dtype_z, operator): return get_elwise_kernel( "%(tp_x)s *x, %(tp_y)s *y, %(tp_z)s *z" % { "tp_x": dtype_to_ctype(dtype_x), "tp_y": dtype_to_ctype(dtype_y), "tp_z": dtype_to_ctype(dtype_z), }, "z[i] = x[i] %s y[i]" % operator, "multiply")
def get_multiply_kernel(dtype_x, dtype_y, dtype_z): return get_elwise_kernel( "%(tp_x)s *x, %(tp_y)s *y, %(tp_z)s *z" % { "tp_x": dtype_to_ctype(dtype_x), "tp_y": dtype_to_ctype(dtype_y), "tp_z": dtype_to_ctype(dtype_z), }, "z[i] = x[i] * y[i]", "multiply")
def get_axpbyz_kernel(dtype_x, dtype_y, dtype_z): return get_elwise_kernel( "%(tp_x)s a, %(tp_x)s *x, %(tp_y)s b, %(tp_y)s *y, %(tp_z)s *z" % { "tp_x": dtype_to_ctype(dtype_x), "tp_y": dtype_to_ctype(dtype_y), "tp_z": dtype_to_ctype(dtype_z), }, "z[i] = a*x[i] + b*y[i]", "axpbyz")
def get_gt_kernel(dtype_x, dtype_y, dtype_z): return get_elwise_kernel( "%(tp_x)s *x, %(tp_y)s *y, %(tp_z)s *z" % { "tp_x": dtype_to_ctype(dtype_x), "tp_y": dtype_to_ctype(dtype_y), "tp_z": dtype_to_ctype(dtype_z), }, "z[i] = x[i] > y[i]", "gt")
def get_correlate_kernel(dtype_x, dtype_y,dtype_out): return ElementwiseKernel( "%(tp_x)s *x, %(tp_y)s *y, %(tp_z)s *z" % { "tp_x": dtype_to_ctype(dtype_x), "tp_y": dtype_to_ctype(dtype_y), "tp_z": dtype_to_ctype(dtype_out), }, "z[i] = conj(x[i]) * y[i]", "correlate")
def get_inds(self, src, dest, inds, src_shift=0): """ Set `dest[i] = src[src_shift+inds[i]] for i in range(len(inds))` """ assert src.dtype == dest.dtype inds_ctype = dtype_to_ctype(inds.dtype) data_ctype = dtype_to_ctype(src.dtype) func = get_inds_kernel(inds_ctype, data_ctype) func(dest, int(src_shift), inds, src, range=slice(0, len(inds), 1) )
def get_unary_func_kernel(func_name, in_dtype, out_dtype=None): if out_dtype is None: out_dtype = in_dtype return get_elwise_kernel( "%(tp_in)s *y, %(tp_out)s *z" % { "tp_in": dtype_to_ctype(in_dtype), "tp_out": dtype_to_ctype(out_dtype), }, "z[i] = %s(y[i])" % func_name, "%s_kernel" % func_name)
def get_subset_sum_kernel(dtype_out, dtype_subset, dtype_in): if dtype_out is None: dtype_out = dtype_in return ReductionKernel(dtype_out, "0", "a+b", map_expr="in[lookup_tbl[i]]", arguments="const %(tp_lut)s *lookup_tbl, const %(tp)s *in" % { "tp": dtype_to_ctype(dtype_in), "tp_lut": dtype_to_ctype(dtype_subset), })
def get_linear_combination_kernel(summand_descriptors, dtype_z): from pycuda.tools import dtype_to_ctype from pycuda.elementwise import \ VectorArg, ScalarArg, get_elwise_module args = [] preamble = ["#include <pycuda-helpers.hpp>\n\n"] loop_prep = [] summands = [] tex_names = [] for i, (is_gpu_scalar, scalar_dtype, vector_dtype) in \ enumerate(summand_descriptors): if is_gpu_scalar: preamble.append( "texture <%s, 1, cudaReadModeElementType> tex_a%d;" % (dtype_to_ctype(scalar_dtype, with_fp_tex_hack=True), i)) args.append(VectorArg(vector_dtype, "x%d" % i)) tex_names.append("tex_a%d" % i) loop_prep.append("%s a%d = fp_tex1Dfetch(tex_a%d, 0)" % (dtype_to_ctype(scalar_dtype), i, i)) else: args.append(ScalarArg(scalar_dtype, "a%d" % i)) args.append(VectorArg(vector_dtype, "x%d" % i)) summands.append("a%d*x%d[i]" % (i, i)) args.append(VectorArg(dtype_z, "z")) args.append(ScalarArg(np.uintp, "n")) mod = get_elwise_module(args, "z[i] = " + " + ".join(summands), "linear_combination", preamble="\n".join(preamble), loop_prep=";\n".join(loop_prep)) func = mod.get_function("linear_combination") tex_src = [mod.get_texref(tn) for tn in tex_names] func.prepare("".join(arg.struct_char for arg in args), texrefs=tex_src) return func, tex_src
def get_subset_minmax_kernel(what, dtype, dtype_subset): if dtype == np.float64: reduce_expr = "f%s(a,b)" % what elif dtype == np.float32: reduce_expr = "f%sf(a,b)" % what elif dtype.kind in "iu": reduce_expr = "%s(a,b)" % what else: raise TypeError("unsupported dtype specified") return ReductionKernel(dtype, neutral=get_minmax_neutral(what, dtype), reduce_expr="%(reduce_expr)s" % {"reduce_expr": reduce_expr}, map_expr="in[lookup_tbl[i]]", arguments="const %(tp_lut)s *lookup_tbl, " "const %(tp)s *in" % { "tp": dtype_to_ctype(dtype), "tp_lut": dtype_to_ctype(dtype_subset), }, preamble="#define MY_INFINITY (1./0)")
def get_update_func(self, dtypes): type_dict = {k: dtype_to_ctype(dtypes[k]) for k in dtypes} type_dict.update({'fletter': 'f' if type_dict['n'] == 'float' else ''}) mod = SourceModule(self.get_update_template() % type_dict, options=self.compile_options) func = mod.get_function("update") func.prepare('i' + np.dtype(dtypes['dt']).char + 'i' + 'P' * (len(type_dict) - 2)) func.block = (128, 1, 1) func.grid = (min(6 * cuda.Context.get_device().MULTIPROCESSOR_COUNT, (self.num_comps - 1) / 128 + 1), 1) return func
def __init__( self, dtype, scan_expr, neutral=None, name_prefix="scan", options=None, preamble="", devices=None, ): if isinstance(self, ExclusiveScanKernel) and neutral is None: raise ValueError("neutral element is required for exclusive scan") dtype = self.dtype = np.dtype(dtype) self.neutral = neutral # Thrust says these are good for GT200 self.scan_wg_size = 128 self.update_wg_size = 256 self.scan_wg_seq_batches = 6 kw_values = dict( preamble=preamble, name_prefix=name_prefix, scan_type=dtype_to_ctype(dtype), scan_expr=scan_expr, neutral=neutral, ) scan_intervals_src = str( SCAN_INTERVALS_SOURCE.render( wg_size=self.scan_wg_size, wg_seq_batches=self.scan_wg_seq_batches, **kw_values)) scan_intervals_prg = SourceModule(scan_intervals_src, options=options, no_extern_c=True) self.scan_intervals_knl = scan_intervals_prg.get_function( name_prefix + "_scan_intervals") self.scan_intervals_knl.prepare("PIIPP") final_update_src = str( self.final_update_tp.render(wg_size=self.update_wg_size, **kw_values)) final_update_prg = SourceModule(final_update_src, options=options, no_extern_c=True) self.final_update_knl = final_update_prg.get_function(name_prefix + "_final_update") self.final_update_knl.prepare("PIIP")
def test_3d_fp_textures(self): orden = "C" npoints = 32 for prec in [np.int16, np.float32, np.float64, np.complex64, np.complex128]: prec_str = dtype_to_ctype(prec) if prec == np.complex64: fpName_str = "fp_tex_cfloat" elif prec == np.complex128: fpName_str = "fp_tex_cdouble" elif prec == np.float64: fpName_str = "fp_tex_double" else: fpName_str = prec_str A_cpu = np.zeros([npoints, npoints, npoints], order=orden, dtype=prec) A_cpu[:] = np.random.rand(npoints, npoints, npoints)[:] A_gpu = gpuarray.zeros(A_cpu.shape, dtype=prec, order=orden) myKern = """ #include <pycuda-helpers.hpp> texture<fpName, 3, cudaReadModeElementType> mtx_tex; __global__ void copy_texture(cuPres *dest) { int row = blockIdx.x*blockDim.x + threadIdx.x; int col = blockIdx.y*blockDim.y + threadIdx.y; int slice = blockIdx.z*blockDim.z + threadIdx.z; dest[row + col*blockDim.x*gridDim.x + slice*blockDim.x*gridDim.x*blockDim.y*gridDim.y] = fp_tex3D(mtx_tex, slice, col, row); } """ myKern = myKern.replace("fpName", fpName_str) myKern = myKern.replace("cuPres", prec_str) mod = SourceModule(myKern) copy_texture = mod.get_function("copy_texture") mtx_tex = mod.get_texref("mtx_tex") cuBlock = (8, 8, 8) if cuBlock[0] > npoints: cuBlock = (npoints, npoints, npoints) cuGrid = ( npoints // cuBlock[0] + 1 * (npoints % cuBlock[0] != 0), npoints // cuBlock[1] + 1 * (npoints % cuBlock[1] != 0), npoints // cuBlock[2] + 1 * (npoints % cuBlock[1] != 0), ) copy_texture.prepare("P", texrefs=[mtx_tex]) cudaArray = drv.np_to_array(A_cpu, orden, allowSurfaceBind=False) mtx_tex.set_array(cudaArray) copy_texture.prepared_call(cuGrid, cuBlock, A_gpu.gpudata) assert np.sum(np.abs(A_gpu.get() - np.transpose(A_cpu))) == np.array( 0, dtype=prec ) A_gpu.gpudata.free()
def get_dot_kernel(dtype_out, dtype_a=None, dtype_b=None): if dtype_out is None: dtype_out = dtype_a if dtype_b is None: if dtype_a is None: dtype_b = dtype_out else: dtype_b = dtype_a if dtype_a is None: dtype_a = dtype_out return ReductionKernel(dtype_out, neutral="0", reduce_expr="a+b", map_expr="a[i]*b[i]", arguments="const %(tp_a)s *a, const %(tp_b)s *b" % { "tp_a": dtype_to_ctype(dtype_a), "tp_b": dtype_to_ctype(dtype_b), }, keep=True)
def get_fill_zeros_kernel(data_dtype): template = """ __global__ void update(%(data_ctype)s* dest, %(inds_ctype)s* inds, int N) { int tid = threadIdx.x + blockIdx.x * blockDim.x; int total_threads = gridDim.x * blockDim.x; for(int i = tid; i < N; i += total_threads) { dest[inds[i]] = 0; } } """ mod = SourceModule(template % {"data_ctype": dtype_to_ctype(data_dtype), "inds_ctype": dtype_to_ctype(np.int32)}) func = mod.get_function("update") func.prepare('PPi') func.block = (128,1,1) func.grid = (16 * cuda.Context.get_device().MULTIPROCESSOR_COUNT, 1) return func
def get_pow_kernel(dtype): if dtype == np.float32: func = "powf" else: func = "pow" return get_elwise_kernel( "%(tp)s value, %(tp)s *y, %(tp)s *z" % { "tp": dtype_to_ctype(dtype), }, "z[i] = %s(y[i], value)" % func, "pow_method", )
def get_abs_function(dtype, pitch=True): type_src = dtype_to_ctype(dtype) if dtype == np.complex128: operation = "pycuda::abs" type_dest = "double" elif dtype == np.complex64: operation = "pycuda::abs" type_dest = "float" elif dtype == np.float64: operation = "fabs" type_dest = "double" elif dtype == np.float32: operation = "fabsf" type_dest = "float" else: operation = "abs" type_dest = dtype_to_ctype(dtype) name = "abs_function" if pitch: func = SourceModule(pitch_template % { "name": name, "dest_type": type_dest, "src_type": type_src, "operation": operation, }, options=["--ptxas-options=-v"]).get_function(name) func.prepare('iiPiPi') # [np.int32, np.int32, np.intp, np.int32, np.intp, np.int32]) else: func = SourceModule(non_pitch_template % { "name": name, "dest_type": type_dest, "src_type": type_src, "operation": operation, }, options=["--ptxas-options=-v"]).get_function(name) func.prepare('PPi') #[np.intp, np.intp, np.int32]) return func
def get_resize_function(dtype): type_src = dtype_to_ctype(dtype) name = "resize" func = SourceModule(reshape_template % { "name": name, "dest_type": type_src, "src_type": type_src, "operation": "", }, options=["--ptxas-options=-v"]).get_function(name) func.prepare('iiiiPiPi') #[np.int32, np.int32, np.int32, np.int32, # np.intp, np.int32, np.intp, np.int32]) return func
def get_subset_dot_kernel(dtype_out, dtype_subset, dtype_a=None, dtype_b=None): if dtype_out is None: dtype_out = dtype_a if dtype_b is None: if dtype_a is None: dtype_b = dtype_out else: dtype_b = dtype_a if dtype_a is None: dtype_a = dtype_out # important: lookup_tbl must be first--it controls the length return ReductionKernel(dtype_out, neutral="0", reduce_expr="a+b", map_expr="a[lookup_tbl[i]]*b[lookup_tbl[i]]", arguments="const %(tp_lut)s *lookup_tbl, " "const %(tp_a)s *a, const %(tp_b)s *b" % { "tp_a": dtype_to_ctype(dtype_a), "tp_b": dtype_to_ctype(dtype_b), "tp_lut": dtype_to_ctype(dtype_subset), })
def get_put_kernel(dtype, idx_dtype, vec_count=1): ctx = { "idx_tp": dtype_to_ctype(idx_dtype), "tp": dtype_to_ctype(dtype), } args = ( [ VectorArg(idx_dtype, "gmem_dest_idx"), ] + [VectorArg(dtype, "dest%d" % i) for i in range(vec_count)] + [VectorArg(dtype, "src%d" % i) for i in range(vec_count)] + [ScalarArg(np.intp, "n")] ) body = "%(idx_tp)s dest_idx = gmem_dest_idx[i];\n" % ctx + "\n".join( "dest%d[dest_idx] = src%d[i];" % (i, i) for i in range(vec_count) ) func = get_elwise_module(args, body, "put").get_function("put") func.prepare("P" + (2 * vec_count * "P") + np.dtype(np.uintp).char) return func
def get_pow_array_kernel(dtype_x, dtype_y, dtype_z, is_base_array, is_exp_array): """ Returns the kernel for the operation: ``z = x ** y`` """ if dtype_z == np.float32: func = "powf" else: # FIXME: Casting args to double-precision not # ideal for all cases (ex. int args) func = "pow" if not is_base_array and is_exp_array: x_ctype = "%(tp_x)s x" y_ctype = "%(tp_y)s *y" func = "%s(x,y[i])" % func elif is_base_array and is_exp_array: x_ctype = "%(tp_x)s *x" y_ctype = "%(tp_y)s *y" func = "%s(x[i],y[i])" % func elif is_base_array and not is_exp_array: x_ctype = "%(tp_x)s *x" y_ctype = "%(tp_y)s y" func = "%s(x[i],y)" % func else: raise AssertionError return get_elwise_kernel( (x_ctype + ", " + y_ctype + ", " + "%(tp_z)s *z") % { "tp_x": dtype_to_ctype(dtype_x), "tp_y": dtype_to_ctype(dtype_y), "tp_z": dtype_to_ctype(dtype_z), }, "z[i] = %s" % func, name="pow_method")
def _get_eigsq_kernel(dtype_s, dtype_q): template = """ #include <pycuda/pycuda-complex.hpp> __global__ void eigsq_Kernel(%(types)s* d_S, %(typeq)s* d_q, %(types)s thres, int size) { int tid = threadIdx.x + blockIdx.x * blockDim.x; int total = blockDim.x * gridDim.x; for(int i = tid; i < size; i += total) { %(types)s s = d_S[i]; %(typeq)s q = d_q[i]; if(fabs%(iff)s(s) > thres) { d_q[i] = q / s; }else { d_q[i] = 0.0; } } } """ mod = SourceModule( template % { "types": dtype_to_ctype(dtype_s), "typeq": dtype_to_ctype(dtype_q), "iff": "f" if dtype_q == np.float32 else "" }) func = mod.get_function("eigsq_Kernel") func.prepare([ np.intp, np.intp, np.double if dtype_s == np.double else np.float32, np.int32 ]) return func
def get_take_kernel(dtype, idx_dtype, vec_count=1): ctx = { "idx_tp": dtype_to_ctype(idx_dtype), "tp": dtype_to_ctype(dtype), "tex_tp": dtype_to_ctype(dtype, with_fp_tex_hack=True), } args = [VectorArg(idx_dtype, "idx") ] + [VectorArg(dtype, "dest" + str(i)) for i in range(vec_count)] + [ScalarArg(np.intp, "n")] preamble = "#include <pycuda-helpers.hpp>\n\n" + "\n".join( "texture <%s, 1, cudaReadModeElementType> tex_src%d;" % (ctx["tex_tp"], i) for i in range(vec_count)) body = (("%(idx_tp)s src_idx = idx[i];\n" % ctx) + "\n".join("dest%d[i] = fp_tex1Dfetch(tex_src%d, src_idx);" % (i, i) for i in range(vec_count))) mod = get_elwise_module(args, body, "take", preamble=preamble) func = mod.get_function("take") tex_src = [mod.get_texref("tex_src%d" % i) for i in range(vec_count)] func.prepare("P" + (vec_count * "P") + np.dtype(np.uintp).char, texrefs=tex_src) return func, tex_src
def eye(N, dtype=np.float32): """ Construct a 2D matrix with ones on the diagonal and zeros elsewhere. Constructs a matrix in device memory whose diagonal elements are set to 1 and non-diagonal elements are set to 0. Parameters ---------- N : int Number of rows or columns in the output matrix. dtype : type Matrix data type. Returns ------- e_gpu : pycuda.gpuarray.GPUArray Diagonal matrix of dimensions `[N, N]` with diagonal values set to 1. Examples -------- >>> import pycuda.driver as drv >>> import pycuda.gpuarray as gpuarray >>> import pycuda.autoinit >>> import numpy as np >>> import linalg >>> linalg.init() >>> N = 5 >>> e_gpu = linalg.eye(N) >>> np.all(e_gpu.get() == np.eye(N)) True >>> e_gpu = linalg.eye(N, np.complex64) >>> np.all(e_gpu.get() == np.eye(N, dtype=np.complex64)) True """ if dtype not in [np.float32, np.float64, np.complex64, np.complex128]: raise ValueError('unrecognized type') if N <= 0: raise ValueError('N must be greater than 0') alloc = misc._global_cublas_allocator e_gpu = misc.zeros((N, N), dtype, allocator=alloc) func = el.ElementwiseKernel("{ctype} *e".format(ctype=tools.dtype_to_ctype(dtype)), "e[i] = 1") func(e_gpu, slice=slice(0, N*N, N+1)) return e_gpu
def __init__(self, model, **kwargs): self.dtype = dtype_to_ctype(kwargs.pop("dtype", np.float32)) self.model = model self.solver = model.solver.__name__ self.float_char = "f" if self.dtype == "float" else "" self.params_gdata = kwargs.pop("params_gdata", []) dct = kwargs.pop("inputs_gdata", dict()) self.inputs = {k: {"value": v, "used": False} for k, v in dct.items()} cls = self.model.__class__ self.has_post = np.all([cls.post != base.post for base in cls.__bases__]) self.generate()
def _scale_inplace(a, x_gpu): """ Scale an array by a specified value in-place. """ # Cache the kernel to avoid invoking the compiler if the # specified scale factor and array type have already been encountered: try: func = _scale_inplace.cache[(a, x_gpu.dtype)] except KeyError: ctype = tools.dtype_to_ctype(x_gpu.dtype) func = el.ElementwiseKernel( "{ctype} a, {ctype} *x".format(ctype=ctype), "x[i] /= a") _scale_inplace.cache[(a, x_gpu.dtype)] = func func(x_gpu.dtype.type(a), x_gpu)
def test_2d_fp_texturesLayered(self): orden = "F" npoints = 32 for prec in [ np.int16, np.float32, np.float64, np.complex64, np.complex128 ]: prec_str = dtype_to_ctype(prec) if prec == np.complex64: fpName_str = 'fp_tex_cfloat' elif prec == np.complex128: fpName_str = 'fp_tex_cdouble' elif prec == np.float64: fpName_str = 'fp_tex_double' else: fpName_str = prec_str A_cpu = np.zeros([npoints, npoints], order=orden, dtype=prec) A_cpu[:] = np.random.rand(npoints, npoints)[:] A_gpu = gpuarray.zeros(A_cpu.shape, dtype=prec, order=orden) myKern = ''' #include <pycuda-helpers.hpp> texture<fpName, cudaTextureType2DLayered, cudaReadModeElementType> mtx_tex; __global__ void copy_texture(cuPres *dest) { int row = blockIdx.x*blockDim.x + threadIdx.x; int col = blockIdx.y*blockDim.y + threadIdx.y; dest[row + col*blockDim.x*gridDim.x] = fp_tex2DLayered(mtx_tex, col, row, 1); } ''' myKern = myKern.replace('fpName', fpName_str) myKern = myKern.replace('cuPres', prec_str) mod = SourceModule(myKern) copy_texture = mod.get_function("copy_texture") mtx_tex = mod.get_texref("mtx_tex") cuBlock = (16, 16, 1) if cuBlock[0] > npoints: cuBlock = (npoints, npoints, 1) cuGrid = (npoints // cuBlock[0] + 1 * (npoints % cuBlock[0] != 0), npoints // cuBlock[1] + 1 * (npoints % cuBlock[1] != 0), 1) copy_texture.prepare('P', texrefs=[mtx_tex]) cudaArray = drv.np_to_array(A_cpu, orden, allowSurfaceBind=True) mtx_tex.set_array(cudaArray) copy_texture.prepared_call(cuGrid, cuBlock, A_gpu.gpudata) assert np.sum(np.abs(A_gpu.get() - np.transpose(A_cpu))) == np.array(0, dtype=prec) A_gpu.gpudata.free()
def get_resize_function(dtype): type_src = dtype_to_ctype(dtype) name = "resize" func = func_compile( name, reshape_template % { "name": name, "dest_type": type_src, "src_type": type_src, "operation": "", }) func.prepare([ np.int32, np.int32, np.int32, np.int32, np.intp, np.int32, np.intp, np.int32 ]) return func
def process_signature(self): new_signature = [] for key in self.signature: val = self.inputs.get(key, None) if val is None: isArray = True dtype = self.dtype else: val['used'] = True isArray = hasattr(val['value'], '__len__') if isArray: dtype = dtype_to_ctype(val['value'].dtype) else: dtype = self.dtype new_signature.append((key, dtype, isArray)) return new_signature
def __init__(self, model, **kwargs): self.dtype = dtype_to_ctype(kwargs.pop('dtype', np.float32)) self.model = model self.solver = model.solver.__name__ self.float_char = 'f' if self.dtype == 'float' else '' self.params_gdata = kwargs.pop('params_gdata', []) dct = kwargs.pop('inputs_gdata', dict()) self.inputs = {k: {'value': v, 'used': False} for k, v in dct.items()} cls = self.model.__class__ self.has_post = np.all( [cls.post != base.post for base in cls.__bases__]) self.generate()
def _get_binaryop_vecmat_kernel(dtype, binary_op): template = Template(""" #include <pycuda-complex.hpp> __global__ void opColVecToMat(const ${type} *mat, const ${type} *vec, ${type} *out, const int n, const int m){ const int tx = threadIdx.x; const int ty = threadIdx.y; const int tidx = blockIdx.x * blockDim.x + threadIdx.x; const int tidy = blockIdx.y * blockDim.y + threadIdx.y; extern __shared__ ${type} shared_vec[]; if ((ty == 0) & (tidx < n)) shared_vec[tx] = vec[tidx]; __syncthreads(); if ((tidy < m) & (tidx < n)) { out[tidx*m+tidy] = mat[tidx*m+tidy] ${binary_op} shared_vec[tx]; } } __global__ void opRowVecToMat(const ${type}* mat, const ${type}* vec, ${type}* out, const int n, const int m){ const int tx = threadIdx.x; const int ty = threadIdx.y; const int tidx = blockIdx.x * blockDim.x + threadIdx.x; const int tidy = blockIdx.y * blockDim.y + threadIdx.y; extern __shared__ ${type} shared_vec[]; if ((tx == 0) & (tidy < m)) shared_vec[ty] = vec[tidy]; __syncthreads(); if ((tidy < m) & (tidx < n)) { out[tidx*m+tidy] = mat[tidx*m+tidy] ${binary_op} shared_vec[ty]; } }""") cache_dir=None ctype = dtype_to_ctype(dtype) tmpl = template.substitute(type=ctype, binary_op=binary_op) mod = SourceModule(tmpl) add_row_vec_kernel = mod.get_function('opRowVecToMat') add_col_vec_kernel = mod.get_function('opColVecToMat') return add_row_vec_kernel, add_col_vec_kernel
def get_gpu_kernel(self): self.gpu_block = (128,1,1) self.gpu_grid = (min( 6*cuda.Context.get_device().MULTIPROCESSOR_COUNT,\ (self.num-1)/self.gpu_block[0] + 1), 1) mod = SourceModule( \ cuda_src % {"type": dtype_to_ctype(np.float64)},\ options=self.compile_options) func = mod.get_function("dummy_synapse") func.prepare('PiiiiPPP')# [ np.intp, # neuron state buffer # np.int32, # buffer width # np.int32, # buffer position # np.int32, # buffer delay steps # np.int32, # syn_num # np.intp, # pre-synaptic neuron list # np.intp, # delay step # np.intp ] ) # cond array return func
def set_by_inds_array(self, inds, data): """ Set mapped data with array by integer indices. Parameters ---------- inds : array-like Integer indices of data elements to update. data : numpy.ndarray Data to assign. """ if np.isscalar(data): raise ValueError('data must be array-like') if len(np.shape(inds)) > 1: raise ValueError('index array must be 1D') N = len(inds) if N == 0: return if not isinstance(inds, gpuarray.GPUArray): inds = gpuarray.to_gpu(inds) if not issubclass(inds.dtype.type, numbers.Integral): raise ValueError('index array must contain integers') if N != len(data): raise ValueError('len(inds) = %s != %s = len(data)' % (N, len(data))) if not isinstance(data, gpuarray.GPUArray): data = gpuarray.to_gpu(data) # Allocate data array if it doesn't exist: if not self.data: self.data = gpuarray.empty(N, data.dtype) else: assert self.data.dtype == data.dtype try: func = self.set_by_inds_array.cache[(inds.dtype, self.data.dtype)] except KeyError: inds_ctype = tools.dtype_to_ctype(inds.dtype) v = "{data_ctype} *dest, {inds_ctype} *inds, {data_ctype} *src".format( data_ctype=self.data_ctype, inds_ctype=inds_ctype) func = elementwise.ElementwiseKernel(v, "dest[inds[i]] = src[i]") self.set_by_inds_array.cache[(inds.dtype, self.data.dtype)] = func func(self.data, inds, data, range=slice(0, N, 1))
def get_diag_add_kernel(dtype): template = """ __global__ void diag_add_Kernel(%(type)s* d_G, int ld, int size, %(type)s addin) { int tid = threadIdx.x + blockIdx.x * blockDim.x; int total = gridDim.x * blockDim.x; for(int i = tid; i < size; i+=total) { d_G[i * ld + i] += addin; } } """ func = func_compile("diag_add_Kernel", template % {"type": dtype_to_ctype(dtype)}) return func
def get_realimag_function(dtype, real=True, pitch=True): type_src = dtype_to_ctype(dtype) if dtype == np.complex64: type_dest = "float" if real: operation = "pycuda::real" name = "real" else: operation = "pycuda::imag" name = "imag" elif dtype == np.complex128: type_dest = "double" if real: operation = "pycuda::real" name = "real" else: operation = "pycuda::imag" name = "imag" else: raise TypeError( "only support complex inputs as numpy.complex64 or numpy.complex128" ) if pitch: func = func_compile( name, pitch_template % { "name": name, "dest_type": type_dest, "src_type": type_src, "operation": operation, }) func.prepare( [np.int32, np.int32, np.intp, np.int32, np.intp, np.int32]) else: func = func_compile( name, non_pitch_template % { "name": name, "dest_type": type_dest, "src_type": type_src, "operation": operation, }) func.prepare([np.intp, np.intp, np.int32]) return func
def get_fill_function(dtype, pitch=True): type_dst = dtype_to_ctype(dtype) name = "fill" if pitch: func = func_compile( name, fill_pitch_template % { "name": name, "type_dst": type_dst }) func.prepare([np.int32, np.int32, np.intp, np.int32, dtype.type]) else: func = func_compile( name, fill_nonpitch_template % { "name": name, "type_dst": type_dst }) func.prepare([np.int32, np.intp, dtype.type]) return func
def get_transpose_function(dtype, conj=False): src_type = dtype_to_ctype(dtype) name = "trans" operation = "" if conj: if dtype == np.complex128: operation = "pycuda::conj" elif dtype == np.complex64: operation = "pycuda::conj" func = func_compile( name, transpose_template % { "name": name, "type": src_type, "operation": operation }) func.prepare([np.int32, np.int32, np.intp, np.int32, np.intp, np.int32]) return func
def get_minmax_kernel(what, dtype): if dtype == np.float64: reduce_expr = "f%s(a,b)" % what elif dtype == np.float32: reduce_expr = "f%sf(a,b)" % what elif dtype.kind in "iu": reduce_expr = "%s(a,b)" % what else: raise TypeError("unsupported dtype specified") return ReductionKernel( dtype, neutral=get_minmax_neutral(what, dtype), reduce_expr=f"{reduce_expr}", arguments="const %(tp)s *in" % { "tp": dtype_to_ctype(dtype), }, preamble="#define MY_INFINITY (1./0)", )