def TestParams():
    import time

    #SIZE = 1024

    kernel = """
    .version 1.4
    .target sm_10, map_f64_to_f32
    .entry _main (
        .param .u64 __cudaparm__Z16addArrayOnDevicePfff_c,
        .param .f32 __cudaparm__Z16addArrayOnDevicePfff_a,
        .param .f32 __cudaparm__Z16addArrayOnDevicePfff_b)
    {
    .reg .u64 %rd<3>;
    .reg .f32 %f<6>;
    ld.param.f32    %f1, [__cudaparm__Z16addArrayOnDevicePfff_a];
    ld.param.f32    %f2, [__cudaparm__Z16addArrayOnDevicePfff_b];
    add.f32         %f3, %f1, %f2;
    mov.f32         %f4, %f3;
    ld.param.u64    %rd1, [__cudaparm__Z16addArrayOnDevicePfff_c];
    st.global.f32   [%rd1+0], %f4;
    exit;
    } // _Z16addArrayOnDevicePfff
    """

    t1 = time.time()
    module = ptx_exec.compile(kernel)
    t2 = time.time()
    print "compile time", t2 - t1

    a = 1.0
    b = 2.0

    ptx_mem_addr = ptx_exec.alloc_device(4)
    mem = extarray.extarray('f', 1)
    #mem.set_memory(ptx_mem_addr, 4)
    mem[0] = 5.0

    print ptx_mem_addr, type(ptx_mem_addr)
    print mem.buffer_info()[0], type(mem.buffer_info()[0])

    param_list = [ptx_mem_addr, a, b]

    # image, dev num, (x, y, w, h)
    ptx_exec.copy_htod(ptx_mem_addr, mem.buffer_info()[0], 4)

    t1 = time.time()
    ptx_exec.run_stream(module, (1, 1, 1, 1, 1),
                        (ptx_exec.u64, ptx_exec.f32, ptx_exec.f32), param_list)
    t2 = time.time()
    print "run time", t2 - t1

    print "X", mem.buffer_info()[0], ptx_mem_addr
    ptx_exec.copy_dtoh(mem.buffer_info()[0], ptx_mem_addr, 4)

    print param_list
    print mem

    #ptx_exec.free(input)
    #ptx_exec.free(output)
    ##ptx_exec.free(glob)
    #ptx_exec.unload_module(image)
    return
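
# A minimal helper sketch (not part of the original tests): the same
# compile/alloc/copy/run/copy sequence that TestParams walks through, wrapped
# so the two scalar operands are arguments and the device-computed sum is
# returned.  run_scalar_add is a hypothetical name; all ptx_exec calls are the
# ones exercised in TestParams above, and kernel is expected to be the same
# three-parameter PTX text.
def run_scalar_add(kernel, a, b):
    module = ptx_exec.compile(kernel)

    ptx_mem_addr = ptx_exec.alloc_device(4)      # one f32 result slot
    mem = extarray.extarray('f', 1)
    mem[0] = 0.0

    ptx_exec.copy_htod(ptx_mem_addr, mem.buffer_info()[0], 4)
    ptx_exec.run_stream(module, (1, 1, 1, 1, 1),
                        (ptx_exec.u64, ptx_exec.f32, ptx_exec.f32),
                        [ptx_mem_addr, a, b])
    ptx_exec.copy_dtoh(mem.buffer_info()[0], ptx_mem_addr, 4)
    return mem[0]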
def TestSimpleKernel():
    import corepy.arch.ptx.isa as isa
    import corepy.arch.ptx.types.registers as regs
    import time

    SIZE = 128

    proc = Processor(0)

    # Build and run the kernel
    prgm = Program()
    code = prgm.get_stream()

    _mem = prgm.add_parameter('u64', name='_mem')
    _a = prgm.add_parameter('f32', name='_a')
    _b = prgm.add_parameter('f32', name='_b')

    # rd1 = regs.ptxVariable('reg', 'u64', 'rd1')
    # r1 = regs.ptxVariable('reg', 'f32', 'f1')
    # r2 = regs.ptxVariable('reg', 'f32', 'f2')
    # r3 = regs.ptxVariable('reg', 'f32', 'f3')
    # r4 = regs.ptxVariable('reg', 'f32', 'f4')
    # code.add('  .reg .u64 rd1;')
    # code.add('  .reg .f32 f1;')
    # code.add('  .reg .f32 f2;')
    # code.add('  .reg .f32 f3;')
    # code.add('  .reg .f32 f4;')
    rd1 = prgm.acquire_register('u64')
    r1 = prgm.acquire_register('f32')
    r2 = prgm.acquire_register('f32')
    r3 = prgm.acquire_register('f32')
    r4 = prgm.acquire_register('f32')

    # Not needed by this kernel, but exercises add_variable
    v1 = prgm.add_variable('shared', 'f32')

    #code.add(isa.add(r3, r2, r1))
    #code.add('add.f32 r3, r2, r1;')
    code.add(isa.ld('param', r1, regs.ptxAddress(_a)))
    code.add(isa.ld('param', r2, regs.ptxAddress(_b)))
    code.add(isa.add(r3, r2, r1))
    code.add(isa.add(r3, r3, 1.0))
    code.add(isa.mov(r4, r3))

    #temp = prgm.acquire_register('u32')
    #code.add(isa.cvt(temp, regs.tid.x))
    #code.add(isa.cvt(r4, temp, rnd='rn'))

    # Exercise mul with the 'lo' width qualifier; the operands are never
    # initialized and the product is unused.
    temp1 = prgm.acquire_register('u32')
    temp2 = prgm.acquire_register('u32')
    temp3 = prgm.acquire_register('u32')
    code.add(isa.mul(temp2, temp1, temp3, hlw='lo'))

    # Store the result through the pointer parameter
    code.add(isa.ld('param', rd1, regs.ptxAddress(_mem)))
    code.add(isa.st('global', regs.ptxAddress(rd1), r4))

    prgm.add(code)
    prgm.cache_code()

    # For debugging, the hand-written PTX from TestParams can be substituted
    # for the generated code:
    #   prgm.render_string = '...'  # same kernel text as in TestParams
    #   prgm.render_code = ptx_exec.compile(prgm.render_string)

    #ptx_mem_addr = proc.alloc_device('f32', 1)
    ptx_mem_addr = ptx_exec.alloc_device(4)
    mem = extarray.extarray('f', 1)
    mem[0] = 5.0

    a = 1.0
    b = 2.0

    print mem.buffer_info()[0]
    param_list = [ptx_mem_addr, a, b]
    print map(type, param_list)

    ptx_exec.copy_htod(ptx_mem_addr, mem.buffer_info()[0], 4)

    #kernel = prgm.render_string
    #module = ptx_exec.compile(kernel)

    t1 = time.time()
    #ptx_exec.run_stream(module, (1, 1, 1, 1, 1), (ptx_exec.u64, ptx_exec.f32, ptx_exec.f32), param_list)
    proc.execute(prgm, (1, 1, 1, 1, 1), param_list)
    t2 = time.time()

    print "run time", t2 - t1
    print "YY", mem.buffer_info()[0], ptx_mem_addr, type(mem.buffer_info()[0]), type(ptx_mem_addr)
    print int(ptx_mem_addr)
    print int(mem.buffer_info()[0])
    ptx_exec.copy_dtoh(mem.buffer_info()[0], ptx_mem_addr, 4)

    print param_list
    print mem
    return
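
# A small hedged sketch (not in the original tests): build the same
# three-parameter add kernel as TestSimpleKernel, but stop after cache_code()
# and print the synthesized PTX instead of launching it.  This is handy when
# comparing generated code against the hand-written kernel in TestParams.
# It assumes cache_code() populates the prgm.render_string attribute that the
# commented-out override in TestSimpleKernel assigns to; BuildAddKernelSketch
# is a hypothetical name.
def BuildAddKernelSketch():
    import corepy.arch.ptx.isa as isa
    import corepy.arch.ptx.types.registers as regs

    prgm = Program()
    code = prgm.get_stream()

    _mem = prgm.add_parameter('u64', name='_mem')
    _a = prgm.add_parameter('f32', name='_a')
    _b = prgm.add_parameter('f32', name='_b')

    rd1 = prgm.acquire_register('u64')
    r1 = prgm.acquire_register('f32')
    r2 = prgm.acquire_register('f32')
    r3 = prgm.acquire_register('f32')

    # c = a + b, stored through the pointer parameter
    code.add(isa.ld('param', r1, regs.ptxAddress(_a)))
    code.add(isa.ld('param', r2, regs.ptxAddress(_b)))
    code.add(isa.add(r3, r2, r1))
    code.add(isa.ld('param', rd1, regs.ptxAddress(_mem)))
    code.add(isa.st('global', regs.ptxAddress(rd1), r3))

    prgm.add(code)
    prgm.cache_code()

    # Assumption: render_string holds the generated PTX text after cache_code()
    print prgm.render_string
    return prgm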
class Processor(spe.Processor):
    exec_module = ptx_exec

    def __init__(self, device=0):
        """Create a new Processor representing a particular GPU in the
           system, indexed by device."""
        spe.Processor.__init__(self)

        if device < 0 or device >= N_GPUS:
            raise Exception("Invalid device number %d" % device)

        print "Creating ctx"
        self.ctx = ptx_exec.alloc_ctx(device)
        self.device = device
        return

    def __del__(self):
        print "Destroying ctx"
        ptx_exec.free_ctx(self.ctx)
        return

    # ------------------------------
    # Memory Management
    # ------------------------------

    # def _get_fmt(self, typecode, comps = 1):
    #     if typecode == 'f':
    #         if comps == 1:
    #             fmt = ptx_exec.FMT_FLOAT32_1
    #         elif comps == 2:
    #             fmt = ptx_exec.FMT_FLOAT32_2
    #         elif comps == 4:
    #             fmt = ptx_exec.FMT_FLOAT32_4
    #         else:
    #             raise Exception("Number of components must be 1, 2, or 4")
    #     elif typecode == 'i':
    #         if comps == 1:
    #             fmt = ptx_exec.FMT_SIGNED_INT32_1
    #         elif comps == 2:
    #             fmt = ptx_exec.FMT_SIGNED_INT32_2
    #         elif comps == 4:
    #             fmt = ptx_exec.FMT_SIGNED_INT32_4
    #         else:
    #             raise Exception("Number of components must be 1, 2, or 4")
    #     elif typecode == 'I':
    #         if comps == 1:
    #             fmt = ptx_exec.FMT_UNSIGNED_INT32_1
    #         elif comps == 2:
    #             fmt = ptx_exec.FMT_UNSIGNED_INT32_2
    #         elif comps == 4:
    #             fmt = ptx_exec.FMT_UNSIGNED_INT32_4
    #         else:
    #             raise Exception("Number of components must be 1, 2, or 4")
    #     else:
    #         raise Exception("Unsupported data type: " + str(typecode))
    #     return fmt

    def alloc_device(self, typecode, length, comps = 1):
        """Allocate device (GPU) memory and return a DeviceMemory handle for
           copying/binding.  typecode is a PTX typecode (u32, s32, f32, u64, ...)."""
        #fmt = self._get_fmt(typecode, comps)
        scalar_byte_width = int(typecode[1:]) / 8

        # Allocate GPU memory and create a DeviceMemory handle
        address = ptx_exec.alloc_device(length * scalar_byte_width * comps)
        return DeviceMemory(address, typecode, length)

    def alloc_host(self, typecode, length, comps = 1):
        """Allocate page-locked host memory and return an extarray backed by it.
           typecode is a PTX typecode (u32, s32, f32, u64, ...)."""
        #fmt = self._get_fmt(typecode, comps)
        array_typecode = ''

        # Map the PTX typecode to an array typecode.
        # This might be clearer, but not very efficient...
        #type_conversion_table = {}
        #type_conversion_table['32'] = {'f': 'f', 'u': 'I', 's': 'i'}
        #type_conversion_table['64'] = {'f': 'd', 'u': 'L', 's': 'l'}
        #type_conversion_table['16'] = {'u': 'H', 's': 'h'}
        #type_conversion_table['8'] = {'u': 'B', 's': 'b'}
        #
        #if typecode == 'b':
        #    typecode = 'u'
        #array_typecode = type_conversion_table[typecode[1:]][typecode[0]]

        scalar_width = int(typecode[1:])
        scalar_byte_width = scalar_width / 8

        if typecode[0] == 'f':
            if scalar_width == 32:
                array_typecode = 'f'
            elif scalar_width == 64:
                array_typecode = 'd'
        elif typecode[0] == 'u':
            if scalar_width == 32:
                array_typecode = 'I'
            elif scalar_width == 64:
                array_typecode = 'L'
            elif scalar_width == 16:
                array_typecode = 'H'
            elif scalar_width == 8:
                array_typecode = 'B'
        elif typecode[0] == 's':
            if scalar_width == 32:
                array_typecode = 'i'
            elif scalar_width == 64:
                array_typecode = 'l'
            elif scalar_width == 16:
                array_typecode = 'h'
            elif scalar_width == 8:
                array_typecode = 'b'

        if array_typecode == '':
            raise Exception('Unable to convert type ' + str(typecode))

        mem = ptx_exec.alloc_host(length * scalar_byte_width * comps)

        # Wrap the host allocation in an extarray; data_len is the element
        # count and set_memory takes the backing size in bytes.
        arr = extarray.extarray(array_typecode, 0)
        arr.data_len = length * comps
        arr.set_memory(mem, arr.data_len * scalar_byte_width)
        arr.gpu_mem_handle = mem
        # arr.gpu_device = self.device
        arr.gpu_width = length
        # arr.gpu_pitch = mem[2]
        # arr.gpu_height = height
        return arr

    # def alloc_remote(self, typecode, comps, width, height = 1, globl = False):
    #     """Allocate an ExtArray backed by remote (main) memory."""
    #     fmt = self._get_fmt(typecode, comps)
    #     if globl:
    #         globl = ptx_exec.GLOBAL_BUFFER
    #
    #     # Allocate and initialize the memory
    #     # TODO - more operand error checking
    #     mem = ptx_exec.alloc_remote(self.device, fmt, width, height, globl)

    # def alloc_remote_npy(self, typecode, comps, width, height = 1, globl = False):
    #     """Allocate a NumPy ndarray backed by remote (main) memory."""
    #     if not HAS_NUMPY:
    #         raise ImportError("NumPy array support requires NumPy installation")
    #
    #     fmt = self._get_fmt(typecode, comps)
    #     if typecode == 'f':
    #         dtype = numpy.float32
    #     elif typecode == 'i':
    #         dtype = numpy.int32
    #     elif typecode == 'I':
    #         dtype = numpy.uint32
    #     else:
    #         raise Exception("Unsupported data type: " + str(typecode))
    #
    #     if globl:
    #         globl = ptx_exec.GLOBAL_BUFFER
    #
    #     buf = ptx_exec.calmembuffer(self.device, fmt, width, height, globl)
    #     arr = numpy.frombuffer(buf, dtype=dtype)
    #
    #     if height == 1:
    #         arr.shape = (width, comps)
    #     else:
    #         arr.shape = (buf.pitch, height, comps)
    #
    #     return arr

    def free_device(self, hdl):
        ptx_exec.free_device(hdl.address)

    def free_host(self, arr):
        ptx_exec.free_host(arr.buffer_info()[0])

    def free(self, hdl):
        #if not (isinstance(arr, extarray.extarray) and hasattr(arr, "gpu_mem_handle")):
        #    raise Exception("Not a register or extarray with a GPU memory handle")
        if isinstance(hdl, extarray.extarray):
            if not hasattr(hdl, "gpu_mem_handle"):
                raise TypeError("Not an extarray with a GPU memory handle")

            ptx_exec.free_remote(hdl.gpu_mem_handle)
            del hdl.gpu_mem_handle
            del hdl.gpu_device
            del hdl.gpu_width
            del hdl.gpu_pitch

            hdl.set_memory(0, 0)
            hdl.data_len = 0
        elif isinstance(hdl, LocalMemory):
            ptx_exec.free_local(hdl.binding)
            hdl.res = None
        else:
            raise TypeError("Unknown handle type %s" % (type(hdl)))
        return

    # ------------------------------
    # Kernel Execution
    # ------------------------------

    def copy(self, dst, src, async = False):
        """Copy memory from src to dst, using this GPU."""
        # Figure out what dst and src are and dispatch the right direction
        if isinstance(dst, extarray.extarray):
            # device -> host
            ptx_exec.copy_dtoh(dst.buffer_info()[0], src.address,
                               src.length * src.itemsize)
        elif isinstance(dst, DeviceMemory):
            # host -> device
            ptx_exec.copy_htod(dst.address, src.buffer_info()[0],
                               src.buffer_info()[1] * src.itemsize)

        #elif isinstance(dst, numpy.ndarray):
        #    # NumPy array.. do we support it, and does it use a CAL buffer?
        #    if not HAS_NUMPY:
        #        raise ImportError("NumPy array support requires NumPy installation")
        #    if not isinstance(arr.base, ptx_exec.calmembuffer):
        #        raise TypeError("Not NumPy with a GPU memory buffer")

        ## Start the copy
        #hdl = ptx_exec.copy_async(self.ctx, dst_binding, src_binding)
        #
        #if async:
        #    return hdl
        #
        ## Not async, complete the copy here.
        #ptx_exec.join_copy(self.ctx, hdl)
        return
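
# A minimal sketch (not part of the original tests) showing how the Processor
# memory helpers above are meant to fit together: allocate one f32 on the
# device and one page-locked f32 on the host, then round-trip the value
# through Processor.copy.  It assumes DeviceMemory exposes the .address,
# .length, and .itemsize attributes that Processor.copy() relies on;
# TestProcessorMemSketch is a hypothetical name.
def TestProcessorMemSketch():
    proc = Processor(0)

    d_mem = proc.alloc_device('f32', 1)    # one f32 element on the device
    h_mem = proc.alloc_host('f32', 1)      # one page-locked f32 on the host
    h_mem[0] = 5.0

    # Host -> device, then device -> host
    proc.copy(d_mem, h_mem)
    proc.copy(h_mem, d_mem)
    print h_mem

    proc.free_device(d_mem)
    proc.free_host(h_mem)
    return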