def TestParams(): import time # SIZE = 1024 kernel = """ .version 1.4 .target sm_10, map_f64_to_f32 .entry _main ( .param .u64 __cudaparm__Z16addArrayOnDevicePfff_c, .param .f32 __cudaparm__Z16addArrayOnDevicePfff_a, .param .f32 __cudaparm__Z16addArrayOnDevicePfff_b) { .reg .u64 %rd<3>; .reg .f32 %f<6>; ld.param.f32 %f1, [__cudaparm__Z16addArrayOnDevicePfff_a]; ld.param.f32 %f2, [__cudaparm__Z16addArrayOnDevicePfff_b]; add.f32 %f3, %f1, %f2; mov.f32 %f4, %f3; ld.param.u64 %rd1, [__cudaparm__Z16addArrayOnDevicePfff_c]; st.global.f32 [%rd1+0], %f4; exit; } // _Z16addArrayOnDevicePfff """ t1 = time.time() module = ptx_exec.compile(kernel) t2 = time.time() print "compile time", t2 - t1 a = 1.0 b = 2.0 ptx_mem_addr = ptx_exec.alloc_device(4) mem = extarray.extarray("f", 1) # mem.set_memory(ptx_mem_addr, 4) mem[0] = 5.0 print ptx_mem_addr, type(ptx_mem_addr) print mem.buffer_info()[0], type(mem.buffer_info()[0]) param_list = [ptx_mem_addr, a, b] # image, dev num, (x, y, w, h) ptx_exec.copy_htod(ptx_mem_addr, mem.buffer_info()[0], 4) t1 = time.time() ptx_exec.run_stream(module, (1, 1, 1, 1, 1), (ptx_exec.u64, ptx_exec.f32, ptx_exec.f32), param_list) t2 = time.time() print "run time", t2 - t1 print "X", mem.buffer_info()[0], ptx_mem_addr ptx_exec.copy_dtoh(mem.buffer_info()[0], ptx_mem_addr, 4) print param_list print mem # ptx_exec.free(input) # ptx_exec.free(output) ##ptx_exec.free(glob) # ptx_exec.unload_module(image) return
def alloc_device(self, typecode, length, comps=1):
    """
    Allocate local GPU memory and return a handle for copying/binding.

    Arguments:
      typecode -- PTX typecode string (u32, s32, f32, u64, etc.).  The text
                  after the first character is parsed as the scalar width
                  in bits.
      length   -- number of elements to allocate.
      comps    -- number of scalar components per element (default 1).

    Returns DeviceMemory(address, typecode, length), where address is the
    value returned by ptx_exec.alloc_device.

    Raises ValueError if typecode[1:] is not an integer literal.
    """
    # Floor division keeps the byte width an integer even when true
    # division is in effect (Python 3 or `from __future__ import division`);
    # under classic Python 2 division the result is unchanged.
    scalar_byte_width = int(typecode[1:]) // 8

    # Allocate GPU memory and create a DeviceMemory handle
    address = ptx_exec.alloc_device(length * scalar_byte_width * comps)
    return DeviceMemory(address, typecode, length)
def alloc_device(self, typecode, length, comps=1):
    """
    Allocate local GPU memory and return a handle for copying/binding.

    typecode is a PTX typecode (u32, s32, f32, u64, etc.); the digits after
    its first character give the scalar width in bits.  length is the
    element count and comps the number of scalar components per element.

    The allocation size passed to ptx_exec.alloc_device is
    length * (bits // 8) * comps bytes.  Returns a
    DeviceMemory(address, typecode, length) handle.

    Raises ValueError if typecode[1:] cannot be parsed as an integer.
    """
    bits = int(typecode[1:])

    # Use explicit floor division: plain `/` produces a float byte count
    # once true division is enabled (Python 3 / __future__.division), while
    # `//` gives the same integer result under classic Python 2 division.
    scalar_byte_width = bits // 8

    # Allocate GPU memory and create a DeviceMemory handle
    nbytes = length * scalar_byte_width * comps
    address = ptx_exec.alloc_device(nbytes)
    return DeviceMemory(address, typecode, length)
def TestSimpleKernel(): import corepy.arch.ptx.isa as isa import corepy.arch.ptx.types.registers as regs import time SIZE = 128 proc = Processor(0) # build and run the kernel prgm = Program() code = prgm.get_stream() _mem = prgm.add_parameter('u64', name='_mem') _a = prgm.add_parameter('f32', name='_a') _b = prgm.add_parameter('f32', name='_b') # rd1 = regs.ptxVariable('reg', 'u64', 'rd1') # r1 = regs.ptxVariable('reg', 'f32', 'f1') # r2 = regs.ptxVariable('reg', 'f32', 'f2') # r3 = regs.ptxVariable('reg', 'f32', 'f3') # r4 = regs.ptxVariable('reg', 'f32', 'f4') # code.add(' .reg .u64 rd1;') # code.add(' .reg .f32 f1;') # code.add(' .reg .f32 f2;') # code.add(' .reg .f32 f3;') # code.add(' .reg .f32 f4;') rd1 = prgm.acquire_register('u64') r1 = prgm.acquire_register('f32') r2 = prgm.acquire_register('f32') r3 = prgm.acquire_register('f32') r4 = prgm.acquire_register('f32') v1 = prgm.add_variable('shared', 'f32') # don't need this, but let's test add_variable # import pdb # pdb.set_trace() #code.add(isa.add(r3, r2, r1)) #code.add('add.f32 r3, r2, r1;') code.add(isa.ld('param', r1, regs.ptxAddress(_a))) code.add(isa.ld('param', r2, regs.ptxAddress(_b))) code.add(isa.add(r3, r2, r1)) code.add(isa.add(r3, r3, 1.0)) code.add(isa.mov(r4, r3)) #temp = prgm.acquire_register('u32') #code.add(isa.cvt(temp, regs.tid.x)) #code.add(isa.cvt(r4, temp, rnd='rn')) temp1 = prgm.acquire_register('u32') temp2 = prgm.acquire_register('u32') temp3 = prgm.acquire_register('u32') code.add(isa.mul(temp2, temp1, temp3, hlw='lo')) code.add(isa.ld('param', rd1, regs.ptxAddress(_mem))) code.add(isa.st('global', regs.ptxAddress(rd1), r4)) prgm.add(code) prgm.cache_code() # prgm.render_string = ( # ''' # .version 1.4 # .target sm_10, map_f64_to_f32 # .entry _main ( # .param .u64 __cudaparm__Z16addArrayOnDevicePfff_c, # .param .f32 __cudaparm__Z16addArrayOnDevicePfff_a, # .param .f32 __cudaparm__Z16addArrayOnDevicePfff_b) # { # .reg .u64 %rd<3>; # .reg .f32 %f<6>; # ld.param.f32 %f1, 
[__cudaparm__Z16addArrayOnDevicePfff_a]; # ld.param.f32 %f2, [__cudaparm__Z16addArrayOnDevicePfff_b]; # add.f32 %f3, %f1, %f2; # mov.f32 %f4, %f3; # ld.param.u64 %rd1, [__cudaparm__Z16addArrayOnDevicePfff_c]; # st.global.f32 [%rd1+0], %f4; # exit; # } // _Z16addArrayOnDevicePfff # ''' # ) # prgm.render_code = ptx_exec.compile(prgm.render_string) #### #ptx_mem_addr = proc.alloc_device('f32', 1) ptx_mem_addr = ptx_exec.alloc_device(4) mem = extarray.extarray('f', 1) mem[0] = 5.0 a = 1.0 b = 2.0 print mem.buffer_info()[0] param_list = [ptx_mem_addr, a, b] print map(type, param_list) # # image, dev num, (x, y, w, h) #import pdb ptx_exec.copy_htod(ptx_mem_addr, mem.buffer_info()[0], 4) #kernel = prgm.render_string #module = ptx_exec.compile(kernel) t1 = time.time() #ptx_exec.run_stream(module, (1, 1, 1, 1, 1), (ptx_exec.u64, ptx_exec.f32, ptx_exec.f32), param_list) proc.execute(prgm, (1,1,1,1,1), param_list) t2 = time.time() # pdb.set_trace() print "run time", t2 - t1 print "YY", mem.buffer_info()[0], ptx_mem_addr, type(mem.buffer_info()[0]), type(ptx_mem_addr) print int(ptx_mem_addr) print int(mem.buffer_info()[0]) ptx_exec.copy_dtoh(mem.buffer_info()[0], ptx_mem_addr, 4) print param_list print mem #### return
def TestParams(): import time #SIZE = 1024 kernel = ( ''' .version 1.4 .target sm_10, map_f64_to_f32 .entry _main ( .param .u64 __cudaparm__Z16addArrayOnDevicePfff_c, .param .f32 __cudaparm__Z16addArrayOnDevicePfff_a, .param .f32 __cudaparm__Z16addArrayOnDevicePfff_b) { .reg .u64 %rd<3>; .reg .f32 %f<6>; ld.param.f32 %f1, [__cudaparm__Z16addArrayOnDevicePfff_a]; ld.param.f32 %f2, [__cudaparm__Z16addArrayOnDevicePfff_b]; add.f32 %f3, %f1, %f2; mov.f32 %f4, %f3; ld.param.u64 %rd1, [__cudaparm__Z16addArrayOnDevicePfff_c]; st.global.f32 [%rd1+0], %f4; exit; } // _Z16addArrayOnDevicePfff ''' ) t1 = time.time() module = ptx_exec.compile(kernel) t2 = time.time() print "compile time", t2 - t1 a = 1.0 b = 2.0 ptx_mem_addr = ptx_exec.alloc_device(4) mem = extarray.extarray('f', 1) #mem.set_memory(ptx_mem_addr, 4) mem[0] = 5.0 print ptx_mem_addr, type(ptx_mem_addr) print mem.buffer_info()[0], type(mem.buffer_info()[0]) param_list = [ptx_mem_addr, a, b] # image, dev num, (x, y, w, h) ptx_exec.copy_htod(ptx_mem_addr, mem.buffer_info()[0], 4) t1 = time.time() ptx_exec.run_stream(module, (1, 1, 1, 1, 1), (ptx_exec.u64, ptx_exec.f32, ptx_exec.f32), param_list) t2 = time.time() print "run time", t2 - t1 print "X", mem.buffer_info()[0], ptx_mem_addr ptx_exec.copy_dtoh(mem.buffer_info()[0], ptx_mem_addr, 4) print param_list print mem #ptx_exec.free(input) #ptx_exec.free(output) ##ptx_exec.free(glob) #ptx_exec.unload_module(image) return
def TestSimpleKernel(): import corepy.arch.ptx.isa as isa import corepy.arch.ptx.types.registers as regs import time SIZE = 128 proc = Processor(0) # build and run the kernel prgm = Program() code = prgm.get_stream() _mem = prgm.add_parameter("u64", name="_mem") _a = prgm.add_parameter("f32", name="_a") _b = prgm.add_parameter("f32", name="_b") # rd1 = regs.ptxVariable('reg', 'u64', 'rd1') # r1 = regs.ptxVariable('reg', 'f32', 'f1') # r2 = regs.ptxVariable('reg', 'f32', 'f2') # r3 = regs.ptxVariable('reg', 'f32', 'f3') # r4 = regs.ptxVariable('reg', 'f32', 'f4') # code.add(' .reg .u64 rd1;') # code.add(' .reg .f32 f1;') # code.add(' .reg .f32 f2;') # code.add(' .reg .f32 f3;') # code.add(' .reg .f32 f4;') rd1 = prgm.acquire_register("u64") r1 = prgm.acquire_register("f32") r2 = prgm.acquire_register("f32") r3 = prgm.acquire_register("f32") r4 = prgm.acquire_register("f32") v1 = prgm.add_variable("shared", "f32") # don't need this, but let's test add_variable # import pdb # pdb.set_trace() # code.add(isa.add(r3, r2, r1)) # code.add('add.f32 r3, r2, r1;') code.add(isa.ld("param", r1, regs.ptxAddress(_a))) code.add(isa.ld("param", r2, regs.ptxAddress(_b))) code.add(isa.add(r3, r2, r1)) code.add(isa.add(r3, r3, 1.0)) code.add(isa.mov(r4, r3)) # temp = prgm.acquire_register('u32') # code.add(isa.cvt(temp, regs.tid.x)) # code.add(isa.cvt(r4, temp, rnd='rn')) temp1 = prgm.acquire_register("u32") temp2 = prgm.acquire_register("u32") temp3 = prgm.acquire_register("u32") code.add(isa.mul(temp2, temp1, temp3, hlw="lo")) code.add(isa.ld("param", rd1, regs.ptxAddress(_mem))) code.add(isa.st("global", regs.ptxAddress(rd1), r4)) prgm.add(code) prgm.cache_code() # prgm.render_string = ( # ''' # .version 1.4 # .target sm_10, map_f64_to_f32 # .entry _main ( # .param .u64 __cudaparm__Z16addArrayOnDevicePfff_c, # .param .f32 __cudaparm__Z16addArrayOnDevicePfff_a, # .param .f32 __cudaparm__Z16addArrayOnDevicePfff_b) # { # .reg .u64 %rd<3>; # .reg .f32 %f<6>; # ld.param.f32 %f1, 
[__cudaparm__Z16addArrayOnDevicePfff_a]; # ld.param.f32 %f2, [__cudaparm__Z16addArrayOnDevicePfff_b]; # add.f32 %f3, %f1, %f2; # mov.f32 %f4, %f3; # ld.param.u64 %rd1, [__cudaparm__Z16addArrayOnDevicePfff_c]; # st.global.f32 [%rd1+0], %f4; # exit; # } // _Z16addArrayOnDevicePfff # ''' # ) # prgm.render_code = ptx_exec.compile(prgm.render_string) #### # ptx_mem_addr = proc.alloc_device('f32', 1) ptx_mem_addr = ptx_exec.alloc_device(4) mem = extarray.extarray("f", 1) mem[0] = 5.0 a = 1.0 b = 2.0 print mem.buffer_info()[0] param_list = [ptx_mem_addr, a, b] print map(type, param_list) # # image, dev num, (x, y, w, h) # import pdb ptx_exec.copy_htod(ptx_mem_addr, mem.buffer_info()[0], 4) # kernel = prgm.render_string # module = ptx_exec.compile(kernel) t1 = time.time() # ptx_exec.run_stream(module, (1, 1, 1, 1, 1), (ptx_exec.u64, ptx_exec.f32, ptx_exec.f32), param_list) proc.execute(prgm, (1, 1, 1, 1, 1), param_list) t2 = time.time() # pdb.set_trace() print "run time", t2 - t1 print "YY", mem.buffer_info()[0], ptx_mem_addr, type(mem.buffer_info()[0]), type(ptx_mem_addr) print int(ptx_mem_addr) print int(mem.buffer_info()[0]) ptx_exec.copy_dtoh(mem.buffer_info()[0], ptx_mem_addr, 4) print param_list print mem #### return