def __init__(self):
    """Record the bitcode directory and target agent; load libHLC if needed.

    The library handle is stored on ``type(self)`` as ``hlc``, so the
    ``CDLL`` load, the ctypes signature setup and the ``ROC_Initialize``
    call are only performed while ``self.hlc`` is still ``None``.

    Raises
    ------
    AssertionError
        If ``<sys.prefix>/share/rocmtools`` is not an existing directory.
    ImportError
        If ``librocmlite.so`` cannot be loaded from ``<sys.prefix>/lib``.
    """
    rocmtools_dir = os.path.join(sys.prefix, 'share', 'rocmtools')
    assert os.path.exists(rocmtools_dir) and os.path.isdir(rocmtools_dir)
    self.bitcode_path = rocmtools_dir

    # The agent name of the current device context is kept as-is.
    self.target_cpu = devices.get_context().agent.name

    if self.hlc is not None:
        # A library handle is already available; nothing left to load.
        return

    lib_file = os.path.join(sys.prefix, 'lib', 'librocmlite.so')
    try:
        lib = CDLL(lib_file)
    except OSError:
        raise ImportError("librocmlite.so cannot be found. Please "
                          "install the roctools package by: "
                          "conda install -c numba roctools")

    # Declare the non-default return types before any call is made.
    lib.ROC_ParseModule.restype = moduleref_ptr
    lib.ROC_ParseBitcode.restype = moduleref_ptr
    lib.ROC_ModuleEmitBRIG.restype = c_size_t

    lib.ROC_Initialize()
    # Pair the initialisation above with ROC_Finalize through a finalizer.
    weakref.finalize(lib, lib.ROC_Finalize)

    lib.ROC_SetCommandLineOption.argtypes = [c_int, c_void_p]

    # Publish the handle on the class so later instances reuse it.
    type(self).hlc = lib
def _initialize(self):
    """Resolve the target CPU flag and build the ``CMD_*`` command templates.

    While ``self.initialized`` is false, the agent name of the current
    device context is decoded from UTF-8 and stored in ``self.target_cpu``
    as a ``-mcpu`` flag.  The ``CMD_*`` attributes are then assembled as
    space-separated strings; the ``{fout}``, ``{fin}`` and ``{lib}``
    placeholders are left unexpanded here.

    NOTE(review): this method does not itself set ``self.initialized``.
    """
    if not self.initialized:
        agent_name = devices.get_context().agent.name
        self.target_cpu = "-mcpu %s" % agent_name.decode('UTF-8')

    def compose(*words):
        # Join the given words into one space-separated command template.
        return ' '.join(words)

    self.CMD_OPT = compose(
        self.opt, "-O3", self.triple_flag, self.target_cpu,
        "-disable-simplify-libcalls", "-verify", "-S",
        "-o {fout}", "{fin}",
    )
    self.CMD_VERIFY = compose(
        self.opt, "-verify", self.triple_flag, self.target_cpu,
        "-S", "-o {fout}", "{fin}",
    )
    self.CMD_GEN_HSAIL = compose(
        self.llc, "-O2", self.triple_flag, self.target_cpu,
        "-filetype=asm", "-o {fout}", "{fin}",
    )
    self.CMD_GEN_BRIG = compose(
        self.llc, "-O2", self.triple_flag, self.target_cpu,
        "-filetype=obj", "-o {fout}", "{fin}",
    )
    self.CMD_LINK_BUILTINS = compose(
        self.llvm_link, "-S", "-o {fout}", "{fin}", "{lib}",
    )
    self.CMD_LINK_LIBS = compose(
        self.llvm_link, "-S", "-o {fout}", "{fin}",
    )
    self.CMD_LINK_BRIG = compose(
        self.ld_lld, "-shared", "-o {fout}", "{fin}",
    )
def to_device(obj, stream=None, context=None, copy=True, to=None):
    """to_device(obj, stream=None, context=None, copy=True, to=None)

    Allocate and transfer a numpy ndarray or structured scalar to the device.

    ``stream`` and ``context`` are forwarded to the destination's
    ``copy_to_device``; a falsy ``context`` is replaced by ``get_context()``.
    When ``to`` is None a destination is created with
    ``devicearray.from_array_like(obj)``; otherwise ``to`` is used as the
    destination.  With ``copy=False`` no data is transferred.  The
    destination array is returned.

    To copy host->device a numpy array::

        ary = numpy.arange(10)
        d_ary = roc.to_device(ary)

    The resulting ``d_ary`` is a ``DeviceNDArray``.

    To copy device->host::

        hary = d_ary.copy_to_host()

    To copy device->host to an existing array::

        ary = numpy.empty(shape=d_ary.shape, dtype=d_ary.dtype)
        d_ary.copy_to_host(ary)
    """
    if not context:
        context = get_context()

    # Reuse the caller's destination when given, otherwise allocate one.
    destination = devicearray.from_array_like(obj) if to is None else to

    if copy:
        destination.copy_to_device(obj, stream=stream, context=context)
    return destination
def _initialize(self):
    """Resolve the target CPU flag and build the ``CMD_*`` command templates.

    While ``self.initialized`` is false, the agent name of the current
    device context is decoded from UTF-8 and stored in ``self.target_cpu``
    as a ``-mcpu`` flag.  Each ``CMD_*`` attribute is then set to a
    space-separated string that still contains the ``{fout}`` / ``{fin}``
    (and, for ``CMD_LINK_BUILTINS``, ``{lib}``) placeholders.

    NOTE(review): this method does not itself set ``self.initialized``.
    """
    if not self.initialized:
        context = devices.get_context()
        cpu_name = context.agent.name.decode('UTF-8')
        self.target_cpu = "-mcpu %s" % cpu_name

    # Every template ends with the same output/input placeholder pair
    # (CMD_LINK_BUILTINS appends one more placeholder after it).
    io_tail = ["-o {fout}", "{fin}"]

    self.CMD_OPT = ' '.join(
        [self.opt, "-O3", self.triple_flag, self.target_cpu,
         "-disable-simplify-libcalls", "-verify", "-S"] + io_tail)

    self.CMD_VERIFY = ' '.join(
        [self.opt, "-verify", self.triple_flag, self.target_cpu,
         "-S"] + io_tail)

    self.CMD_GEN_HSAIL = ' '.join(
        [self.llc, "-O2", self.triple_flag, self.target_cpu,
         "-filetype=asm"] + io_tail)

    self.CMD_GEN_BRIG = ' '.join(
        [self.llc, "-O2", self.triple_flag, self.target_cpu,
         "-filetype=obj"] + io_tail)

    self.CMD_LINK_BUILTINS = ' '.join(
        [self.llvm_link, "-S"] + io_tail + ["{lib}"])

    self.CMD_LINK_LIBS = ' '.join([self.llvm_link, "-S"] + io_tail)

    self.CMD_LINK_BRIG = ' '.join([self.ld_lld, "-shared"] + io_tail)
def __init__(self, shape, strides, dtype, dgpu_data=None):
    """Set up the array metadata and bind (or allocate) its device buffer.

    Args
    ----
    shape
        array shape; a bare integer is treated as a one-element tuple.
    strides
        array strides; a bare integer is treated as a one-element tuple.
    dtype
        data type as numpy.dtype, or anything ``numpy.dtype()`` accepts.
    dgpu_data
        user provided device memory for the ndarray data buffer.  It must
        carry a ``_hsa_memsize_`` attribute, whose value is recorded as
        ``alloc_size``.  When omitted and the array is non-empty, memory
        is requested from the current context's ``mempoolalloc``.  It is
        ignored (replaced by None) when the array has zero elements.

    Raises
    ------
    ValueError
        If ``shape`` and ``strides`` differ in length, or if a supplied
        ``dgpu_data`` has no ``_hsa_memsize_`` attribute.
    """
    if isinstance(shape, (int, long)):
        shape = (shape,)
    if isinstance(strides, (int, long)):
        strides = (strides,)
    self.ndim = len(shape)
    if len(strides) != self.ndim:
        raise ValueError("strides not match ndim")

    # Normalise before the first ``.itemsize`` access so that dtype-like
    # inputs (e.g. ``'float32'`` or ``np.float32``) work as well as
    # ready-made ``np.dtype`` instances.
    dtype = np.dtype(dtype)

    self._dummy = dummyarray.Array.from_desc(0, shape, strides,
                                             dtype.itemsize)
    self.shape = tuple(shape)
    self.strides = tuple(strides)
    self.dtype = dtype
    self.size = int(np.prod(self.shape))

    # prepare dgpu memory
    if self.size > 0:
        if dgpu_data is None:
            # Imported here rather than at module level.
            from numba.roc.api import _memory_size_from_info
            self.alloc_size = _memory_size_from_info(
                self.shape, self.strides, self.dtype.itemsize
            )
            # find a coarse region on the dGPU
            dgpu_data = devices.get_context().mempoolalloc(self.alloc_size)
        else:
            # we have some preallocated dgpu_memory
            sz = getattr(dgpu_data, "_hsa_memsize_", None)
            if sz is None:
                raise ValueError("dgpu_data has no _hsa_memsize_ attribute")
            assert sz >= 0
            self.alloc_size = sz
    else:
        # Zero-element arrays own no device buffer.
        dgpu_data = None
        self.alloc_size = 0

    self.dgpu_data = dgpu_data