def exe(self, idata, odata, dir):
    """Run the FFT plan on ``idata`` and write the result to ``odata``.

    The entry point is looked up on ``self._api`` by name, using the
    suffix that ``cufft_dtype_to_name`` maps ``self._dtype`` to.

    * ``self._ngpu <= 1``: calls ``cufftExec<suffix>`` with device pointers.
      If ``idata`` is a ``PlanDataHelper``, the pointers are taken from the
      ``_d_data`` attribute of both ``idata`` and ``odata``.
    * otherwise: calls ``cufftXtExecDescriptor<suffix>`` with the
      ``_d_data`` attributes directly; the direction is only passed for
      the ``C2C`` and ``Z2Z`` suffixes.

    Returns whatever the selected API entry returns.
    """
    suffix = cufft_dtype_to_name[self._dtype]
    plan = self._handle

    if self._ngpu > 1:
        launch = getattr(self._api, 'cufftXtExecDescriptor' + suffix)
        if suffix in ('C2C', 'Z2Z'):
            return launch(plan, idata._d_data, odata._d_data, int(dir))
        return launch(plan, idata._d_data, odata._d_data)

    launch = getattr(self._api, 'cufftExec' + suffix)
    if isinstance(idata, PlanDataHelper):
        # Only idata is type-checked; odata is assumed to match it.
        src, dst = idata._d_data, odata._d_data
    else:
        src, dst = idata, odata
    return launch(plan, device_pointer(src), device_pointer(dst), int(dir))
def runsort(temp, keys, vals, begin_bit=0, end_bit=None):
    """Invoke the bound double radix sort on ``keys``.

    ``dtype`` and ``maxcount`` are not parameters; they are resolved from
    the enclosing scope. A scratch key buffer of
    ``maxcount * itemsize`` bytes is allocated from the current CUDA
    context on every call.

    NOTE(review): the ``begin_bit`` / ``end_bit`` arguments are overwritten
    below (0 and the full bit width of ``dtype``), and ``vals`` is never
    used -- ``None`` is passed in both value slots. Confirm whether that is
    intentional.

    Returns the result of the bound sort function.
    """
    key_dtype = np.dtype(dtype)
    item_bytes = key_dtype.itemsize

    # Sort over every bit of the key type, whatever the caller passed.
    begin_bit = 0
    end_bit = item_bytes * 8

    # NOTE(review): this relies on the truthiness of ``keys``.
    n_elems = keys.size if keys else maxcount

    scratch_bytes = int(maxcount * item_bytes)
    radix_sort = _bind_radixsort_double()
    scratch_keys = cuda.current_context().memalloc(scratch_bytes)

    stream_id = 0
    sort_descending = 0
    return radix_sort(
        temp,
        ctypes.c_uint(n_elems),
        device_pointer(keys),
        device_pointer(scratch_keys),
        None,
        None,
        stream_id,
        sort_descending,
        begin_bit,
        end_bit,
    )
def __float_or_double(self, devary, floatfn, doublefn):
    """Pick the float or double variant of a function for ``devary``.

    Returns ``(fn, ptr)`` where ``fn`` is ``floatfn`` for a float32 array
    or ``doublefn`` for a float64 array, and ``ptr`` is the array's device
    pointer cast to a ctypes pointer of the matching C type.

    Raises ``ValueError`` for any other dtype.
    """
    kind = devary.dtype
    if kind == np.float32:
        chosen, c_elem = floatfn, c_float
    elif kind == np.float64:
        chosen, c_elem = doublefn, c_double
    else:
        raise ValueError("Only accept float or double arrays.")

    address = device_pointer(devary)
    typed_ptr = cast(c_void_p(address), POINTER(c_elem))
    return chosen, typed_ptr
def __uint32_or_uint64(self, devary, fn32, fn64):
    """Pick the 32-bit or 64-bit generator function for ``devary``.

    Returns ``(fn, ptr)`` where ``fn`` is ``self._api.curandGenerate`` for
    int32/uint32 arrays or ``self._api.curandGenerateLongLong`` for
    int64/uint64 arrays, and ``ptr`` is the array's device pointer cast to
    a ctypes pointer of the matching unsigned C type.

    NOTE(review): ``fn32`` and ``fn64`` are accepted but never used; the
    functions are always taken from ``self._api``. Confirm against callers.

    Raises ``ValueError`` for any other dtype.
    """
    kind = devary.dtype
    narrow = (np.dtype(np.uint32), np.dtype(np.int32))
    if kind in narrow:
        chosen, c_elem = self._api.curandGenerate, c_uint
    else:
        wide = (np.dtype(np.uint64), np.dtype(np.int64))
        if kind in wide:
            chosen, c_elem = self._api.curandGenerateLongLong, c_ulonglong
        else:
            raise ValueError("Only accept int32, int64, "
                             "uint32 or uint64 arrays")

    address = device_pointer(devary)
    typed_ptr = cast(c_void_p(address), POINTER(c_elem))
    return chosen, typed_ptr
def make_array_args(arr):
    """Flatten ``arr`` into a list of ctypes values.

    The list holds, in order: two null ``c_void_p`` placeholders (meminfo
    and parent), the element count, the item size, the device data pointer,
    then one ``c_ssize_t`` per dimension for the shape followed by one per
    dimension for the strides.
    """
    intp = ctypes.c_ssize_t
    n_dims = range(arr.ndim)

    packed = [
        ctypes.c_void_p(0),                    # meminfo placeholder
        ctypes.c_void_p(0),                    # parent placeholder
        intp(arr.size),
        intp(arr.dtype.itemsize),
        ctypes.c_void_p(device_pointer(arr)),
    ]
    packed.extend(intp(arr.shape[axis]) for axis in n_dims)
    packed.extend(intp(arr.strides[axis]) for axis in n_dims)
    return packed
def _prepare_array(self, val):
    """Return ``(device_pointer(val), val)`` as a 2-tuple."""
    address = device_pointer(val)
    return address, val
def _segmentedsort(d_keys, d_vals, d_segments, stream):
    """Call the segmented-sort overload registered for ``d_keys.dtype``.

    The overload receives the device pointers of the keys and values, the
    key count, the device pointer of the segments array and its size, and
    ``stream.handle`` (or ``0`` when ``stream`` is falsy).

    Nothing is returned.
    """
    sorter = _overloads[d_keys.dtype]
    keys_ptr = device_pointer(d_keys)
    vals_ptr = device_pointer(d_vals)
    n_keys = d_keys.size
    seg_ptr = device_pointer(d_segments)
    n_segments = d_segments.size
    if stream:
        stream_handle = stream.handle
    else:
        stream_handle = 0
    sorter(keys_ptr, vals_ptr, n_keys, seg_ptr, n_segments, stream_handle)
def _prepare_args(self, ty, val, stream, retr, kernelargs):
    """
    Convert one argument to ctypes value(s) and append them to kernelargs.

    ``ty`` selects the conversion and ``val`` is the value to convert.
    Arrays and records are first moved to the device with
    ``wrap_arg(val).to_device(retr, stream)``. Tuples are flattened by
    recursing on each (type, value) pair. Nothing is returned;
    ``kernelargs`` is extended in place.

    Raises ``NotImplementedError(ty, val)`` for an unhandled type.
    """
    # Let every registered extension remap (ty, val); extensions are
    # visited in reverse order of self.extensions.
    for ext in reversed(self.extensions):
        ty, val = ext.prepare_args(ty, val, stream=stream, retr=retr)

    if isinstance(ty, types.Array):
        d_arr = wrap_arg(val).to_device(retr, stream)
        intp = ctypes.c_ssize_t

        null_meminfo = ctypes.c_void_p(0)
        null_parent = ctypes.c_void_p(0)
        n_items = intp(d_arr.size)
        item_bytes = intp(d_arr.dtype.itemsize)

        address = driver.device_pointer(d_arr)
        if driver.USE_NV_BINDING:
            address = int(address)
        data_ptr = ctypes.c_void_p(address)

        # Header fields first, then every shape entry, then every stride.
        for field in (null_meminfo, null_parent, n_items, item_bytes,
                      data_ptr):
            kernelargs.append(field)
        for axis in range(d_arr.ndim):
            kernelargs.append(intp(d_arr.shape[axis]))
        for axis in range(d_arr.ndim):
            kernelargs.append(intp(d_arr.strides[axis]))

    elif isinstance(ty, types.Integer):
        # The ctypes constructor is looked up by the type's string form.
        ctor = getattr(ctypes, "c_%s" % ty)
        kernelargs.append(ctor(val))

    elif ty == types.float16:
        # Pass the half-precision value as its 16-bit pattern.
        bits = np.float16(val).view(np.uint16)
        kernelargs.append(ctypes.c_uint16(bits))

    elif ty == types.float64:
        kernelargs.append(ctypes.c_double(val))

    elif ty == types.float32:
        kernelargs.append(ctypes.c_float(val))

    elif ty == types.boolean:
        kernelargs.append(ctypes.c_uint8(int(val)))

    elif ty == types.complex64:
        # Complex values are passed as two consecutive floats: real, imag.
        for part in (val.real, val.imag):
            kernelargs.append(ctypes.c_float(part))

    elif ty == types.complex128:
        for part in (val.real, val.imag):
            kernelargs.append(ctypes.c_double(part))

    elif isinstance(ty, (types.NPDatetime, types.NPTimedelta)):
        as_int64 = val.view(np.int64)
        kernelargs.append(ctypes.c_int64(as_int64))

    elif isinstance(ty, types.Record):
        d_rec = wrap_arg(val).to_device(retr, stream)
        rec_ptr = d_rec.device_ctypes_pointer
        if driver.USE_NV_BINDING:
            rec_ptr = ctypes.c_void_p(int(rec_ptr))
        kernelargs.append(rec_ptr)

    elif isinstance(ty, types.BaseTuple):
        assert len(ty) == len(val)
        for elem_ty, elem_val in zip(ty, val):
            self._prepare_args(elem_ty, elem_val, stream, retr, kernelargs)

    else:
        raise NotImplementedError(ty, val)
def generate_poisson(self, devout, num, lmbd):
    """Call ``curandGeneratePoisson`` for the output array ``devout``.

    ``devout`` must have dtype int32 or uint32; its device pointer is cast
    to a ``c_uint`` pointer and passed along with ``self._handle``,
    ``num`` and ``lmbd``.

    Returns whatever ``self._api.curandGeneratePoisson`` returns.
    Raises ``ValueError`` for any other dtype.
    """
    accepted = (np.dtype(np.uint32), np.dtype(np.int32))
    if devout.dtype in accepted:
        address = device_pointer(devout)
        out_ptr = cast(c_void_p(address), POINTER(c_uint))
        return self._api.curandGeneratePoisson(self._handle, out_ptr, num,
                                               lmbd)
    raise ValueError("Only accept int32 or uint32 arrays")
def _devptr(p):
    """Return ``device_pointer(p)``, or ``None`` when ``p`` is ``None``."""
    return None if p is None else device_pointer(p)
def exe(self, idata, odata, dir):
    """Run the FFT plan on ``idata`` and write the result to ``odata``.

    Calls the ``cufftExec<suffix>`` entry of ``self._api``, where the
    suffix is what ``cufft_dtype_to_name`` maps ``self.dtype`` to. It is
    passed ``self._handle``, the device pointers of both arrays and
    ``int(dir)``.

    Returns whatever the API entry returns.
    """
    entry_name = 'cufftExec' + cufft_dtype_to_name[self.dtype]
    launch = getattr(self._api, entry_name)
    plan = self._handle
    src_ptr = device_pointer(idata)
    dst_ptr = device_pointer(odata)
    return launch(plan, src_ptr, dst_ptr, int(dir))
def runsort(d_keys, d_vals, d_seg):
    """Invoke the bound double segmented sort on the given device arrays.

    The sort function is obtained from ``_bind_segsort_double()`` and is
    passed the device pointers of keys and values, the key count, the
    device pointer of the segments array and its size, and a trailing
    literal ``0``.

    Nothing is returned.
    """
    seg_sort = _bind_segsort_double()
    keys_ptr = device_pointer(d_keys)
    vals_ptr = device_pointer(d_vals)
    seg_ptr = device_pointer(d_seg)
    seg_sort(keys_ptr, vals_ptr, d_keys.size, seg_ptr, d_seg.size, 0)