def _create_vector_types():
    """Register CUDA-style vector types (char2, int4, float3, ...) on `vec`.

    For every supported base type and vector length this builds a structured
    numpy dtype whose fields are drawn from (x, y, z, w), registers it with
    pycuda's dtype registry, and attaches both the dtype and a
    ``make_<name>`` factory function to the ``vec`` class.
    """
    from pycuda.characterize import platform_bits

    # C 'long'/'unsigned long' track the platform width, unlike the
    # fixed-width types below.
    if platform_bits() == 32:
        long_dtype, ulong_dtype = np.int32, np.uint32
    else:
        long_dtype, ulong_dtype = np.int64, np.uint64

    field_names = ["x", "y", "z", "w"]

    from pycuda.tools import get_or_register_dtype

    # (CUDA base name, numpy scalar type, supported vector lengths)
    type_specs = [
        ("char", np.int8, [1, 2, 3, 4]),
        ("uchar", np.uint8, [1, 2, 3, 4]),
        ("short", np.int16, [1, 2, 3, 4]),
        ("ushort", np.uint16, [1, 2, 3, 4]),
        ("int", np.int32, [1, 2, 3, 4]),
        ("uint", np.uint32, [1, 2, 3, 4]),
        ("long", long_dtype, [1, 2, 3, 4]),
        ("ulong", ulong_dtype, [1, 2, 3, 4]),
        ("longlong", np.int64, [1, 2]),
        ("ulonglong", np.uint64, [1, 2]),
        ("float", np.float32, [1, 2, 3, 4]),
        ("double", np.float64, [1, 2]),
    ]

    for base_name, base_type, counts in type_specs:
        for count in counts:
            name = "%s%d" % (base_name, count)

            # Structured dtype matching the vector layout, e.g. float3 has
            # fields x, y, z, each float32.
            dtype = np.dtype(
                [(fname, base_type) for fname in field_names[:count]])

            get_or_register_dtype(name, dtype)
            setattr(vec, name, dtype)

            # Build e.g. make_float3(x, y, z).  eval is used (on a string we
            # construct ourselves, not on external input) so the factory gets
            # real named parameters instead of *args.
            arg_list = ",".join(field_names[:count])
            factory = eval(
                "lambda %s: array((%s), dtype=my_dtype)"
                % (arg_list, arg_list),
                dict(array=np.array, my_dtype=dtype))
            setattr(vec, "make_" + name, staticmethod(factory))
def _create_vector_types():
    """Populate `vec` with numpy dtypes mirroring CUDA's built-in vector
    types (char2, int4, float3, ...) and matching make_* factory functions.
    """
    from pycuda.characterize import platform_bits

    # 'long'/'ulong' follow the platform's native long width; the other
    # entries below are fixed-width.
    if platform_bits() == 32:
        long_dtype = np.int32
        ulong_dtype = np.uint32
    else:
        long_dtype = np.int64
        ulong_dtype = np.uint64

    field_names = ["x", "y", "z", "w"]

    from pycuda.tools import get_or_register_dtype

    # (CUDA base name, numpy scalar type, supported vector lengths)
    for base_name, base_type, counts in [
            ('char', np.int8, [1, 2, 3, 4]),
            ('uchar', np.uint8, [1, 2, 3, 4]),
            ('short', np.int16, [1, 2, 3, 4]),
            ('ushort', np.uint16, [1, 2, 3, 4]),
            ('int', np.int32, [1, 2, 3, 4]),
            ('uint', np.uint32, [1, 2, 3, 4]),
            ('long', long_dtype, [1, 2, 3, 4]),
            ('ulong', ulong_dtype, [1, 2, 3, 4]),
            ('longlong', np.int64, [1, 2]),
            ('ulonglong', np.uint64, [1, 2]),
            ('float', np.float32, [1, 2, 3, 4]),
            ('double', np.float64, [1, 2]),
            ]:
        for count in counts:
            name = "%s%d" % (base_name, count)

            # Structured dtype with one field per component (x, y, z, w).
            dtype = np.dtype([
                (field_names[i], base_type)
                for i in range(count)])

            get_or_register_dtype(name, dtype)

            setattr(vec, name, dtype)

            my_field_names = ",".join(field_names[:count])
            # eval gives the factory real named parameters (x, y, ...); the
            # globals dict binds `array` and `my_dtype` for the lambda body.
            setattr(vec, "make_"+name,
                    staticmethod(eval(
                        "lambda %s: array((%s), dtype=my_dtype)"
                        % (my_field_names, my_field_names),
                        dict(array=np.array, my_dtype=dtype))))
+ maxloc_preamble ) maxloc_preamble_double = ( """ #define MAXLOCN maxlocd #define TTYPE double #define LTYPE long """ + maxloc_preamble ) maxloc_dtype_double = np.dtype([("max", np.float64), ("loc", np.int64)]) maxloc_dtype_single = np.dtype([("max", np.float32), ("loc", np.int32)]) maxloc_dtype_single = get_or_register_dtype("maxlocs", dtype=maxloc_dtype_single) maxloc_dtype_double = get_or_register_dtype("maxlocd", dtype=maxloc_dtype_double) mls = LowerLatencyReductionKernel( maxloc_dtype_single, neutral="maxloc_start()", reduce_expr="maxloc_red(a, b)", map_expr="maxloc_map(x[i], i)", arguments="float *x", preamble=maxloc_preamble_single, ) mld = LowerLatencyReductionKernel( maxloc_dtype_double, neutral="maxloc_start()", reduce_expr="maxloc_red(a, b)",
# Specialize the shared maxloc preamble (defined elsewhere in this file) for
# single precision: value type float, location type int, struct name maxlocs.
maxloc_preamble_single = """
#define MAXLOCN maxlocs
#define TTYPE float
#define LTYPE int
""" + maxloc_preamble

# Double-precision specialization: value type double, location type long,
# struct name maxlocd.
maxloc_preamble_double = """
#define MAXLOCN maxlocd
#define TTYPE double
#define LTYPE long
""" + maxloc_preamble

# Host-side structured dtypes matching the (max value, location) pairs the
# kernels reduce over.
maxloc_dtype_double = np.dtype([("max", np.float64), ("loc", np.int64)])
maxloc_dtype_single = np.dtype([("max", np.float32), ("loc", np.int32)])

# Register the dtypes under the same names the CUDA preambles use so the
# host and device layouts agree.
maxloc_dtype_single = get_or_register_dtype("maxlocs", dtype=maxloc_dtype_single)
maxloc_dtype_double = get_or_register_dtype("maxlocd", dtype=maxloc_dtype_double)

# Single-precision max-and-argmax reduction over a float array.
mls = LowerLatencyReductionKernel(maxloc_dtype_single, neutral="maxloc_start()",
        reduce_expr="maxloc_red(a, b)", map_expr="maxloc_map(x[i], i)",
        arguments="float *x", preamble=maxloc_preamble_single)

# Double-precision counterpart of mls.
# NOTE(review): this statement is cut off by the chunk boundary — the
# remaining keyword arguments and closing paren are not visible here.
mld = LowerLatencyReductionKernel(maxloc_dtype_double, neutral="maxloc_start()",
        reduce_expr="maxloc_red(a, b)", map_expr="maxloc_map(x[i], i)",
        arguments="double *x",
def get_shift_kernel(num_shifts):
    """Build a reduction kernel that applies `num_shifts` phase shifts to a
    complex float series and sums each phase-shifted copy.

    Returns a LowerLatencyReductionKernel whose result dtype packs
    `num_shifts` complex64 accumulators (one per requested shift).
    """
    # Mako template for the device-side code: shift_t${n} carries one
    # (real, imag) accumulator pair per shift, plus the reduction helpers
    # (shift_red / shift_start / shift_map) the kernel expressions call.
    shift_preamble = Template("""
    struct shift_t${n}{
        % for i in range(n):
            float vr${i};
            float vi${i};
        % endfor

        __device__ shift_t${n}(){}

        __device__ shift_t${n}(shift_t${n} const &src):
        % for i in range(n-1):
            vr${i}(src.vr${i}), vi${i}(src.vi${i}),
        % endfor
            vr${n-1}(src.vr${n-1}), vi${n-1}(src.vi${n-1})
        {}

        __device__ shift_t${n}(shift_t${n} const volatile &src):
        % for i in range(n-1):
            vr${i}(src.vr${i}), vi${i}(src.vi${i}),
        % endfor
            vr${n-1}(src.vr${n-1}), vi${n-1}(src.vi${n-1})
        {}

        __device__ shift_t${n} volatile &operator=(
                shift_t${n} const &src) volatile{
            % for i in range(n):
                vr${i} = src.vr${i};
                vi${i} = src.vi${i};
            % endfor
            return *this;
        }
    };

    __device__ shift_t${n} shift_red(shift_t${n} a, shift_t${n} b){
        % for i in range(n):
            a.vr${i} += b.vr${i};
            a.vi${i} += b.vi${i};
        % endfor
        return a;
    }

    __device__ shift_t${n} shift_start(){
        shift_t${n} t;
        % for i in range(n):
            t.vr${i}=0;
            t.vi${i}=0;
        % endfor
        return t;
    }

    __device__ shift_t${n} shift_map(pycuda::complex<float> x,
        % for i in range(n):
            float shift${i},
        % endfor
            float offset, float slen){
        shift_t${n} t;
        float pphase = offset * 2 * 3.141592653 / slen;
        float pr, pi;
        % for i in range(n):
            __sincosf(pphase * shift${i}, &pi, &pr);

            // Phase shift the input data (x) to correspond to a time shift
            t.vr${i} = x._M_re * pr - x._M_im * pi;
            t.vi${i} = x._M_re * pi + x._M_im * pr;
        % endfor
        return t;
    }
    """).render(n = num_shifts)

    # Per-shift argument fragments spliced into the map expression and the
    # kernel signature below.
    shift_map_args = ""
    shift_krnl_args = ""
    for i in range(num_shifts):
        shift_map_args += " shift%s," % i
        shift_krnl_args += " float shift%s, " % i

    # Host-side result dtype: num_shifts complex64 accumulators, registered
    # under the same name as the device struct so the layouts line up.
    sd = np.dtype([("v1", np.complex64, num_shifts)])
    shift_t = get_or_register_dtype('shift_t%s' % num_shifts, sd)

    shift_krnl = LowerLatencyReductionKernel(shift_t,
                neutral="shift_start()",
                reduce_expr="shift_red(a, b)",
                map_expr="shift_map(x[i], " + shift_map_args + " offset+i, slen)",
                arguments="pycuda::complex<float> *x," + shift_krnl_args + "float offset, float slen ",
                preamble=shift_preamble)

    return shift_krnl