def TestParamsFull():
    """Build a small parameterised PTX kernel and launch it 20 times.

    The kernel loads the two f32 parameters, adds them, adds 1.0 to the
    sum, and stores the result through the u64 pointer parameter.  Every
    launch is timed and the elapsed time printed.  After the loop the
    device buffer is copied back into the host array, and the parameter
    list and the host array are printed.
    """
    import time
    import corepy.arch.ptx.isa as isa
    import corepy.arch.ptx.types.registers as regs

    proc = Processor(0)

    # build and run the kernel
    prgm = Program()
    code = prgm.get_stream()
    emit = code.add

    # Kernel parameters: destination pointer, then the two addends.
    _mem = prgm.add_parameter('u64', name='_mem')
    _a = prgm.add_parameter('f32', name='_a')
    _b = prgm.add_parameter('f32', name='_b')

    # Registers are acquired in the same order as the instructions below
    # expect: one u64 for the pointer, then four f32 work registers.
    ptr_reg = prgm.acquire_register('u64')
    lhs = prgm.acquire_register('f32')
    rhs = prgm.acquire_register('f32')
    total = prgm.acquire_register('f32')
    result = prgm.acquire_register('f32')

    # The kernel never touches this variable; it only exercises add_variable.
    _shared_var = prgm.add_variable('shared', 'f32')

    # Each instruction is constructed and emitted immediately, one at a time.
    emit(isa.ld('param', lhs, regs.ptxAddress(_a)))
    emit(isa.ld('param', rhs, regs.ptxAddress(_b)))
    emit(isa.add(total, rhs, lhs))
    emit(isa.add(total, total, 1.0))
    emit(isa.mov(result, total))
    emit(isa.ld('param', ptr_reg, regs.ptxAddress(_mem)))
    emit(isa.st('global', regs.ptxAddress(ptr_reg), result))

    prgm.add(code)
    prgm.cache_code()

    a = 1.0
    b = 2.0

    # One f32 on the device, seeded from a one-element host array.
    ptx_mem_addr = proc.alloc_device('f32', 1)
    mem = extarray.extarray('f', 1)
    mem[0] = 5.0
    param_list = [ptx_mem_addr.address, a, b]
    proc.copy(ptx_mem_addr, mem)

    prgm.cache_code()

    for _ in range(20):
        t_start = time.time()
        proc.execute(prgm, (1, 1, 1, 1, 1), param_list)
        t_end = time.time()
        print("%s %s" % ("run time", t_end - t_start))

    print("#####")
    print("%s %s %s" % ("X", mem.buffer_info()[0], ptx_mem_addr.address))

    # Fetch the kernel's output back into the host array.
    proc.copy(mem, ptx_mem_addr)
    print(param_list)
    print(mem)
def TestParamsFull():
    """Build a small parameterised PTX kernel and launch it 20 times.

    The kernel loads the two f32 parameters, adds them, adds 1.0 to the
    sum, and stores the result through the u64 pointer parameter.  Every
    launch is timed and the elapsed time printed.  After the loop the
    device buffer is copied back into the host array, and the parameter
    list and the host array are printed.
    """
    import time
    import corepy.arch.ptx.isa as isa
    import corepy.arch.ptx.types.registers as regs

    proc = Processor(0)

    # build and run the kernel
    prgm = Program()
    code = prgm.get_stream()
    emit = code.add

    # Kernel parameters: destination pointer, then the two addends.
    _mem = prgm.add_parameter("u64", name="_mem")
    _a = prgm.add_parameter("f32", name="_a")
    _b = prgm.add_parameter("f32", name="_b")

    # Registers are acquired in the same order as the instructions below
    # expect: one u64 for the pointer, then four f32 work registers.
    ptr_reg = prgm.acquire_register("u64")
    lhs = prgm.acquire_register("f32")
    rhs = prgm.acquire_register("f32")
    total = prgm.acquire_register("f32")
    result = prgm.acquire_register("f32")

    # The kernel never touches this variable; it only exercises add_variable.
    _shared_var = prgm.add_variable("shared", "f32")

    # Each instruction is constructed and emitted immediately, one at a time.
    emit(isa.ld("param", lhs, regs.ptxAddress(_a)))
    emit(isa.ld("param", rhs, regs.ptxAddress(_b)))
    emit(isa.add(total, rhs, lhs))
    emit(isa.add(total, total, 1.0))
    emit(isa.mov(result, total))
    emit(isa.ld("param", ptr_reg, regs.ptxAddress(_mem)))
    emit(isa.st("global", regs.ptxAddress(ptr_reg), result))

    prgm.add(code)
    prgm.cache_code()

    a = 1.0
    b = 2.0

    # One f32 on the device, seeded from a one-element host array.
    ptx_mem_addr = proc.alloc_device("f32", 1)
    mem = extarray.extarray("f", 1)
    mem[0] = 5.0
    param_list = [ptx_mem_addr.address, a, b]
    proc.copy(ptx_mem_addr, mem)

    prgm.cache_code()

    for _ in range(20):
        t_start = time.time()
        proc.execute(prgm, (1, 1, 1, 1, 1), param_list)
        t_end = time.time()
        print("%s %s" % ("run time", t_end - t_start))

    print("#####")
    print("%s %s %s" % ("X", mem.buffer_info()[0], ptx_mem_addr.address))

    # Fetch the kernel's output back into the host array.
    proc.copy(mem, ptx_mem_addr)
    print(param_list)
    print(mem)
def cleanup(self):
    """Do end-of-loop iterator code.

    Emits a single instruction that updates the loop counter register in
    place: a subtract when the mode is DEC, an add when it is INC.  Any
    other mode emits nothing.
    """
    # Pick the opcode from the iteration direction.
    if self.mode == DEC:
        update = ptx.sub
    elif self.mode == INC:
        update = ptx.add
    else:
        return

    # The step is either held in a register (external step) or is the
    # plain value stored on the iterator.
    if self._external_step:
        step = self.r_step
    else:
        step = self.step

    self.code.add(update(self.r_count, self.r_count, step))
    return
def load(self, addr, offset = 0, space='global'):
    """Emit code that loads this variable from ``addr`` plus ``offset``.

    addr   -- base address operand.  It is handed to ``regs.ptxAddress``
              when ``offset`` is an integer literal, and to ``ptx.add``
              otherwise.
    offset -- either an ``int``/``long`` literal, which is folded into the
              address operand, or any other operand, which is added to
              ``addr`` by an emitted instruction using a temporary u64
              register.
    space  -- passed through as the first argument of ``ptx.ld``
              (defaults to 'global').

    Returns None.
    """
    # Literal offsets need no extra instruction or register.
    if isinstance(offset, (int, long)):
        self.code.add(ptx.ld(space, self, regs.ptxAddress(addr, offset)))
        return

    prgm = self.code.prgm
    temp = prgm.acquire_register('u64')
    try:
        # Compute the effective address into the temporary, then load
        # through it.
        self.code.add(ptx.add(temp, addr, offset))
        self.code.add(ptx.ld(space, self, regs.ptxAddress(temp)))
    finally:
        # Hand the temporary back even if building or adding an
        # instruction raised, so a failed load does not leak a register.
        prgm.release_register(temp)
def _set_literal_value(self, value):
    """Emit an instruction that adds the literal ``value`` to ``self.reg``.

    NOTE(review): the emitted instruction is ``add reg, reg, value``, so
    the register ends up holding ``value`` only if it was zero beforehand.
    Confirm against the callers whether that is guaranteed or whether a
    move was intended.
    """
    # An earlier two-step sequence (addi for the lower 16 bits followed by
    # addis for the upper 16 bits) is disabled; only the single add below
    # is emitted.
    target = self.reg
    instruction = ptx.add(target, target, value)
    self.code.add(instruction)
    return
def TestSynIterDec():
    """Exercise syn_iter in DEC mode with a kernel that counts iterations.

    The kernel zeroes a u32 counter, increments it inside a syn_iter loop
    created with a count of 5, step 1 and mode DEC, and stores the counter
    through the u64 pointer parameter.  The rendered program text and the
    host array (after copying the result back) are printed.
    """
    import corepy.arch.ptx.isa as ptx
    import corepy.arch.ptx.types.registers as regs

    SIZE = 64  # only referred to by the disabled pass/fail check (see end)

    proc = env.Processor(0)

    # build and run the kernel
    prgm = env.Program()
    code = prgm.get_stream()
    emit = code.add

    _mem = prgm.add_parameter('u64', name='_mem')

    memp = prgm.acquire_register('u64')
    counter = prgm.acquire_register('u32')

    emit(ptx.ld('param', memp, regs.ptxAddress(_mem)))
    emit(ptx.mov(counter, 0))

    # The loop body below is what gets emitted inside the synthetic loop.
    for _ in syn_iter(code, 5, step=1, mode=DEC):
        emit(ptx.add(counter, counter, 1))

    emit(ptx.st('global', regs.ptxAddress(memp), counter))

    prgm.add(code)

    # One u32 on the device, seeded from a one-element host array.
    ptx_mem_addr = proc.alloc_device('u32', 1)
    mem = extarray.extarray('I', 1)
    mem[0] = 5

    param_list = [ptx_mem_addr.address]

    proc.copy(ptx_mem_addr, mem)

    prgm.cache_code()
    print(prgm.render_string)

    proc.execute(prgm, (1, 1, 1, 1, 1), param_list)

    proc.copy(mem, ptx_mem_addr)
    print(mem)

    # A pass/fail check over SIZE outputs used to follow here.  It is
    # disabled, and it reads an `ext_output` array that this function
    # never creates.
def TestSynIterDec():
    """Exercise syn_iter in DEC mode with a kernel that counts iterations.

    The kernel zeroes a u32 counter, increments it inside a syn_iter loop
    created with a count of 5, step 1 and mode DEC, and stores the counter
    through the u64 pointer parameter.  The rendered program text and the
    host array (after copying the result back) are printed.
    """
    import corepy.arch.ptx.isa as ptx
    import corepy.arch.ptx.types.registers as regs

    SIZE = 64  # only referred to by the disabled pass/fail check (see end)

    proc = env.Processor(0)

    # build and run the kernel
    prgm = env.Program()
    code = prgm.get_stream()
    emit = code.add

    _mem = prgm.add_parameter('u64', name='_mem')

    memp = prgm.acquire_register('u64')
    counter = prgm.acquire_register('u32')

    emit(ptx.ld('param', memp, regs.ptxAddress(_mem)))
    emit(ptx.mov(counter, 0))

    # The loop body below is what gets emitted inside the synthetic loop.
    for _ in syn_iter(code, 5, step=1, mode=DEC):
        emit(ptx.add(counter, counter, 1))

    emit(ptx.st('global', regs.ptxAddress(memp), counter))

    prgm.add(code)

    # One u32 on the device, seeded from a one-element host array.
    ptx_mem_addr = proc.alloc_device('u32', 1)
    mem = extarray.extarray('I', 1)
    mem[0] = 5

    param_list = [ptx_mem_addr.address]

    proc.copy(ptx_mem_addr, mem)

    prgm.cache_code()
    print(prgm.render_string)

    proc.execute(prgm, (1, 1, 1, 1, 1), param_list)

    proc.copy(mem, ptx_mem_addr)
    print(mem)

    # A pass/fail check over SIZE outputs used to follow here.  It is
    # disabled, and it reads an `ext_output` array that this function
    # never creates.
def TestSimpleKernel():
    """Assemble a tiny PTX kernel, run it once, and print debug output.

    The kernel loads the f32 parameters _a and _b, adds them, adds 1.0 to
    the sum, and stores the result through the u64 pointer parameter _mem.
    Device memory is handled directly through ptx_exec (alloc_device,
    copy_htod, copy_dtoh) rather than through the Processor object; only
    the launch itself goes through proc.execute.
    """
    import time
    import corepy.arch.ptx.isa as isa
    import corepy.arch.ptx.types.registers as regs

    SIZE = 128  # not referenced anywhere else in this function
    NBYTES = 4  # byte count given to alloc_device / copy_htod / copy_dtoh

    proc = Processor(0)

    # build and run the kernel
    prgm = Program()
    code = prgm.get_stream()
    emit = code.add

    _mem = prgm.add_parameter('u64', name='_mem')
    _a = prgm.add_parameter('f32', name='_a')
    _b = prgm.add_parameter('f32', name='_b')

    # Registers come from the program's allocator.  (A disabled variant
    # built them by hand with regs.ptxVariable and emitted the matching
    # '.reg' declarations as raw strings.)
    ptr_reg = prgm.acquire_register('u64')
    lhs = prgm.acquire_register('f32')
    rhs = prgm.acquire_register('f32')
    total = prgm.acquire_register('f32')
    result = prgm.acquire_register('f32')

    # The kernel never touches this variable; it only exercises add_variable.
    _shared_var = prgm.add_variable('shared', 'f32')

    emit(isa.ld('param', lhs, regs.ptxAddress(_a)))
    emit(isa.ld('param', rhs, regs.ptxAddress(_b)))
    emit(isa.add(total, rhs, lhs))
    emit(isa.add(total, total, 1.0))
    emit(isa.mov(result, total))

    # NOTE(review): the two source registers of this mul are never written
    # and its destination is never read; it appears to be here only to
    # exercise isa.mul with hlw='lo'.
    mul_src_a = prgm.acquire_register('u32')
    mul_dst = prgm.acquire_register('u32')
    mul_src_b = prgm.acquire_register('u32')
    emit(isa.mul(mul_dst, mul_src_a, mul_src_b, hlw='lo'))

    emit(isa.ld('param', ptr_reg, regs.ptxAddress(_mem)))
    emit(isa.st('global', regs.ptxAddress(ptr_reg), result))

    prgm.add(code)
    prgm.cache_code()

    # Disabled alternative: a hand-written PTX listing that was assigned
    # to prgm.render_string and compiled with ptx_exec.compile:
    #
    #   .version 1.4
    #   .target sm_10, map_f64_to_f32
    #   .entry _main (
    #     .param .u64 __cudaparm__Z16addArrayOnDevicePfff_c,
    #     .param .f32 __cudaparm__Z16addArrayOnDevicePfff_a,
    #     .param .f32 __cudaparm__Z16addArrayOnDevicePfff_b)
    #   {
    #     .reg .u64 %rd<3>;
    #     .reg .f32 %f<6>;
    #     ld.param.f32 %f1, [__cudaparm__Z16addArrayOnDevicePfff_a];
    #     ld.param.f32 %f2, [__cudaparm__Z16addArrayOnDevicePfff_b];
    #     add.f32 %f3, %f1, %f2;
    #     mov.f32 %f4, %f3;
    #     ld.param.u64 %rd1, [__cudaparm__Z16addArrayOnDevicePfff_c];
    #     st.global.f32 [%rd1+0], %f4;
    #     exit;
    #   } // _Z16addArrayOnDevicePfff

    ####
    # Raw device allocation (proc.alloc_device('f32', 1) is the disabled
    # alternative).
    ptx_mem_addr = ptx_exec.alloc_device(NBYTES)
    mem = extarray.extarray('f', 1)
    mem[0] = 5.0

    a = 1.0
    b = 2.0

    print(mem.buffer_info()[0])
    param_list = [ptx_mem_addr, a, b]
    print([type(param) for param in param_list])

    # Host -> device copy of the single float.
    ptx_exec.copy_htod(ptx_mem_addr, mem.buffer_info()[0], NBYTES)

    t_start = time.time()
    proc.execute(prgm, (1, 1, 1, 1, 1), param_list)
    t_end = time.time()

    print("%s %s" % ("run time", t_end - t_start))
    print("%s %s %s %s %s" % ("YY",
                              mem.buffer_info()[0],
                              ptx_mem_addr,
                              type(mem.buffer_info()[0]),
                              type(ptx_mem_addr)))
    print(int(ptx_mem_addr))
    print(int(mem.buffer_info()[0]))

    # Device -> host copy of the result.
    ptx_exec.copy_dtoh(mem.buffer_info()[0], ptx_mem_addr, NBYTES)

    print(param_list)
    print(mem)
    ####
def TestSimpleKernel():
    """Assemble a tiny PTX kernel, run it once, and print debug output.

    The kernel loads the f32 parameters _a and _b, adds them, adds 1.0 to
    the sum, and stores the result through the u64 pointer parameter _mem.
    Device memory is handled directly through ptx_exec (alloc_device,
    copy_htod, copy_dtoh) rather than through the Processor object; only
    the launch itself goes through proc.execute.
    """
    import time
    import corepy.arch.ptx.isa as isa
    import corepy.arch.ptx.types.registers as regs

    SIZE = 128  # not referenced anywhere else in this function
    NBYTES = 4  # byte count given to alloc_device / copy_htod / copy_dtoh

    proc = Processor(0)

    # build and run the kernel
    prgm = Program()
    code = prgm.get_stream()
    emit = code.add

    _mem = prgm.add_parameter("u64", name="_mem")
    _a = prgm.add_parameter("f32", name="_a")
    _b = prgm.add_parameter("f32", name="_b")

    # Registers come from the program's allocator.  (A disabled variant
    # built them by hand with regs.ptxVariable and emitted the matching
    # '.reg' declarations as raw strings.)
    ptr_reg = prgm.acquire_register("u64")
    lhs = prgm.acquire_register("f32")
    rhs = prgm.acquire_register("f32")
    total = prgm.acquire_register("f32")
    result = prgm.acquire_register("f32")

    # The kernel never touches this variable; it only exercises add_variable.
    _shared_var = prgm.add_variable("shared", "f32")

    emit(isa.ld("param", lhs, regs.ptxAddress(_a)))
    emit(isa.ld("param", rhs, regs.ptxAddress(_b)))
    emit(isa.add(total, rhs, lhs))
    emit(isa.add(total, total, 1.0))
    emit(isa.mov(result, total))

    # NOTE(review): the two source registers of this mul are never written
    # and its destination is never read; it appears to be here only to
    # exercise isa.mul with hlw="lo".
    mul_src_a = prgm.acquire_register("u32")
    mul_dst = prgm.acquire_register("u32")
    mul_src_b = prgm.acquire_register("u32")
    emit(isa.mul(mul_dst, mul_src_a, mul_src_b, hlw="lo"))

    emit(isa.ld("param", ptr_reg, regs.ptxAddress(_mem)))
    emit(isa.st("global", regs.ptxAddress(ptr_reg), result))

    prgm.add(code)
    prgm.cache_code()

    # Disabled alternative: a hand-written PTX listing that was assigned
    # to prgm.render_string and compiled with ptx_exec.compile:
    #
    #   .version 1.4
    #   .target sm_10, map_f64_to_f32
    #   .entry _main (
    #     .param .u64 __cudaparm__Z16addArrayOnDevicePfff_c,
    #     .param .f32 __cudaparm__Z16addArrayOnDevicePfff_a,
    #     .param .f32 __cudaparm__Z16addArrayOnDevicePfff_b)
    #   {
    #     .reg .u64 %rd<3>;
    #     .reg .f32 %f<6>;
    #     ld.param.f32 %f1, [__cudaparm__Z16addArrayOnDevicePfff_a];
    #     ld.param.f32 %f2, [__cudaparm__Z16addArrayOnDevicePfff_b];
    #     add.f32 %f3, %f1, %f2;
    #     mov.f32 %f4, %f3;
    #     ld.param.u64 %rd1, [__cudaparm__Z16addArrayOnDevicePfff_c];
    #     st.global.f32 [%rd1+0], %f4;
    #     exit;
    #   } // _Z16addArrayOnDevicePfff

    ####
    # Raw device allocation (proc.alloc_device('f32', 1) is the disabled
    # alternative).
    ptx_mem_addr = ptx_exec.alloc_device(NBYTES)
    mem = extarray.extarray("f", 1)
    mem[0] = 5.0

    a = 1.0
    b = 2.0

    print(mem.buffer_info()[0])
    param_list = [ptx_mem_addr, a, b]
    print([type(param) for param in param_list])

    # Host -> device copy of the single float.
    ptx_exec.copy_htod(ptx_mem_addr, mem.buffer_info()[0], NBYTES)

    t_start = time.time()
    proc.execute(prgm, (1, 1, 1, 1, 1), param_list)
    t_end = time.time()

    print("%s %s" % ("run time", t_end - t_start))
    print("%s %s %s %s %s" % ("YY",
                              mem.buffer_info()[0],
                              ptx_mem_addr,
                              type(mem.buffer_info()[0]),
                              type(ptx_mem_addr)))
    print(int(ptx_mem_addr))
    print(int(mem.buffer_info()[0]))

    # Device -> host copy of the result.
    ptx_exec.copy_dtoh(mem.buffer_info()[0], ptx_mem_addr, NBYTES)

    print(param_list)
    print(mem)
    ####
# return True

if __name__ == '__main__':
    # Smoke test: build a few instructions by hand and print how each one
    # renders.  (A variant that created an InstructionStream from
    # corepy.arch.ptx.platform, made it the active code, and passed
    # ignore_active=True to add is disabled.)
    import corepy.arch.ptx.isa as isa

    r1 = regs.ptxVariable('reg', 'u32', 'r1')
    r2 = regs.ptxVariable('reg', 'u32', 'r2')
    r3 = regs.ptxVariable('reg', 'u32', 'r3')
    r4 = regs.ptxVariable('reg', 'u32', 'r4')

    # Integer add: r3 <- r2, r1
    int_add = isa.add(r3, r2, r1)
    print(int_add.render())

    # Register move: r2 <- r1
    move = isa.mov(r2, r1)
    print(move.render())

    # Load from the 'param' space through an address built from r4.
    address = regs.ptxAddress(r4)
    param_load = isa.ld('param', r1, address)
    print(param_load.render())

    # The same add, this time on f32 registers.
    f1 = regs.ptxVariable('reg', 'f32', 'f1')
    f2 = regs.ptxVariable('reg', 'f32', 'f2')
    f3 = regs.ptxVariable('reg', 'f32', 'f3')
    float_add = isa.add(f3, f2, f1)
    print(float_add.render())
# self.params['rnd'] = koperands['rnd']
# return True

if __name__ == '__main__':
    # Smoke test: build a few instructions by hand and print how each one
    # renders.  (A variant that created an InstructionStream from
    # corepy.arch.ptx.platform, made it the active code, and passed
    # ignore_active=True to add is disabled.)
    import corepy.arch.ptx.isa as isa

    r1 = regs.ptxVariable('reg', 'u32', 'r1')
    r2 = regs.ptxVariable('reg', 'u32', 'r2')
    r3 = regs.ptxVariable('reg', 'u32', 'r3')
    r4 = regs.ptxVariable('reg', 'u32', 'r4')

    # Integer add: r3 <- r2, r1
    int_add = isa.add(r3, r2, r1)
    print(int_add.render())

    # Register move: r2 <- r1
    move = isa.mov(r2, r1)
    print(move.render())

    # Load from the 'param' space through an address built from r4.
    address = regs.ptxAddress(r4)
    param_load = isa.ld('param', r1, address)
    print(param_load.render())

    # The same add, this time on f32 registers.
    f1 = regs.ptxVariable('reg', 'f32', 'f1')
    f2 = regs.ptxVariable('reg', 'f32', 'f2')
    f3 = regs.ptxVariable('reg', 'f32', 'f3')
    float_add = isa.add(f3, f2, f1)
    print(float_add.render())