def synthesize(self, code):
    """
    Emit the population-count sequence into `code`.

    The input vector is taken from register 5, counted with self.popc,
    reduced with self.reduce_word, and the result is written to the SPU
    outbound mailbox.  The instruction stream that was active on entry is
    made active again before returning.
    """
    old_code = spu.get_active_code()
    spu.set_active_code(code)

    # Working registers: the counts, the reduced result and the input
    count = code.acquire_register()
    result = code.acquire_register()
    x = code.acquire_register()

    # 'Load' the input vector x from register 5
    spu.ai(x, 5, 0)

    # Zero count and result
    spu.xor(count, count, count)
    spu.xor(result, result, result)

    # Inline the popc and reduce operations
    self.popc(count, x)
    self.reduce_word(result, count)

    # Send the result to the caller
    spu.wrch(result, dma.SPU_WrOutMbox)

    # Hand back every register acquired above.  Previously only x was
    # released, so count and result stayed reserved after each call.
    code.release_register(x)
    code.release_register(count)
    code.release_register(result)

    spu.set_active_code(old_code)
    return
def synthesize(self, code):
    """
    Emit a streaming population count into `code`.

    A stream_buffer walks the configured stream; every vector of each
    buffer is passed to syn_popc_var.popc, the counts are reduced to one
    word, and that word is written to the SPU outbound mailbox.
    """
    prev_code = spu.get_active_code()
    spu.set_active_code(code)

    buffers = spuiter.stream_buffer(code, self.stream_addr,
                                    self.stream_size * 4,
                                    self.buffer_size, self.lsa)
    ls_desc = spuiter.memory_desc('I', self.lsa, self.buffer_size / 4)
    counter = syn_popc_var()

    # This Word is created even though the name x is rebound by the inner
    # loop below; it is kept so that the emitted code is unchanged.
    x = var.Word(0)
    count = var.Word(0)
    total = var.Word(0)

    for buf_addr in buffers:
        for x in spuiter.spu_vec_iter(code, ls_desc, addr_reg = buf_addr):
            counter.popc(count, x)

    counter.reduce_word(total, count)

    # Send the result to the caller
    spu.wrch(total, dma.SPU_WrOutMbox)

    spu.set_active_code(prev_code)
    return
def TestInt():
    """
    Run a short program that adds 13 to r20 once and to r13 five times,
    stops with code 0x200D, and check the pair returned by execute().
    """
    prgm = Program()
    code = prgm.get_stream()
    proc = Processor()

    spu.set_active_code(code)

    r13 = prgm.acquire_register(reg_name = 13)
    r20 = prgm.acquire_register(reg_name = 20)

    spu.ai(r20, r20, 13)
    for _ in range(5):
        spu.ai(r13, r13, 13)
    spu.stop(0x200D)

    prgm += code
    status = proc.execute(prgm, stop = True)  # , debug = True)
    #print 'int result:', status

    assert(status[0] == 0)
    assert(status[1] == 0x200D)
    return
def TestInt():
    """
    Run a short program that adds 13 to r20 once and to r13 five times,
    stops with code 0x200D, and check the pair returned by execute().
    """
    prgm = Program()
    code = prgm.get_stream()
    proc = Processor()

    spu.set_active_code(code)

    r13 = prgm.acquire_register(reg_name = 13)
    r20 = prgm.acquire_register(reg_name = 20)

    spu.ai(r20, r20, 13)
    for _ in range(5):
        spu.ai(r13, r13, 13)
    spu.stop(0x200D)

    prgm += code
    status = proc.execute(prgm, stop = True)  # , debug = True)
    #print 'int result:', status

    assert(status[0] == 0)
    assert(status[1] == 0x200D)
    return
def synthesize(self, code):
    """
    Render a vector with 4 pixels.

    Raises Exception if x_offset (set by setup), result or one has not
    been provided.  The checks run before the active instruction stream
    is switched, so a failed call leaves the active stream untouched.
    """
    if self.x_offset is None:
        raise Exception('Please call setup')
    if self.result is None:
        raise Exception('Please set result')
    if self.one is None:
        raise Exception('Please set one')

    old_code = spu.get_active_code()
    spu.set_active_code(code)

    # Make the part of the result positive and subtract 1
    # to transform (-1,-oo) into (0,oo)
    self.result.v = spu.fs.ex(0, self.result)
    self.result.v = spu.fs.ex(self.result, self.one)

    # Convert the result to an unsigned int, scaling by 2^4 to put
    # values between 0 and 16 in the gradient.  Values outside [0,16]
    # are 0 or FF
    self.result.v = spu.cfltu.ex(self.result, 169)  # 173 - 169 == 4
    # self.result.v = spu.sfi.ex(self.result, 255) # 173 - 169 == 4

    # Extract the first two bytes from the result into the RGB positions
    # and set alpha to 0xFF
    self.result.v = spu.shufb.ex(self.result, self.ff, self.uint2rgba)

    # Save the result and increment the offset
    spu.stqd(self.result, self.x_offset, self.lsa >> 4)
    spu.ai(self.x_offset, self.x_offset, 16)

    spu.set_active_code(old_code)
    return
def TestAll():
    """
    Emit one call to each helper into a fresh InstructionStream, print
    the generated code and execute it.
    """
    import corepy.arch.spu.platform as env

    code = env.InstructionStream()
    spu.set_active_code(code)

    a = code.acquire_register()
    b = code.acquire_register()
    c = code.acquire_register()

    # (helper, third operand) pairs, emitted in this order with c as the
    # destination and a as the first source
    steps = (
        (shr, b),
        (cneq, b),
        (cge, b),
        (cgei, 10),
        (lt, b),
        (lti, 10),
        (a_immediate, 10),
        (a_immediate, 10000),
        (sf_immediate, 10000),
    )
    for emit, operand in steps:
        emit(c, a, operand)

    code.print_code()

    proc = env.Processor()
    proc.execute(code)
    return
def TestSetSlotValue(): import corepy.arch.spu.platform as synspu import corepy.arch.spu.types.spu_types as var import corepy.arch.spu.lib.dma as dma prgm = synspu.Program() code = prgm.get_stream() proc = synspu.Processor() spu.set_active_code(code) a = var.SignedWord(0x11) b = var.SignedWord(0x13) r = var.SignedWord(0xFFFFFFFF) set_slot_value(code, r, 0, 0x10) set_slot_value(code, r, 1, a) set_slot_value(code, r, 2, 0x12) set_slot_value(code, r, 3, b) for i in range(4): spu.wrch(r, dma.SPU_WrOutMbox) spu.rotqbyi(r, r, 4) prgm.add(code) spe_id = proc.execute(prgm, async = True) for i in range(4): while synspu.spu_exec.stat_out_mbox(spe_id) == 0: pass result = synspu.spu_exec.read_out_mbox(spe_id) assert(result == (i + 0x10)) proc.join(spe_id) return
def TestSetSlotValue(): import corepy.arch.spu.platform as synspu import corepy.arch.spu.types.spu_types as var import corepy.arch.spu.lib.dma as dma prgm = synspu.Program() code = prgm.get_stream() proc = synspu.Processor() spu.set_active_code(code) a = var.SignedWord(0x11) b = var.SignedWord(0x13) r = var.SignedWord(0xFFFFFFFF) set_slot_value(code, r, 0, 0x10) set_slot_value(code, r, 1, a) set_slot_value(code, r, 2, 0x12) set_slot_value(code, r, 3, b) for i in range(4): spu.wrch(r, dma.SPU_WrOutMbox) spu.rotqbyi(r, r, 4) prgm.add(code) spe_id = proc.execute(prgm, async=True) for i in range(4): while synspu.spu_exec.stat_out_mbox(spe_id) == 0: pass result = synspu.spu_exec.read_out_mbox(spe_id) assert (result == (i + 0x10)) proc.join(spe_id) return
def TestFloatArray():
    """
    Add two 4-element SingleFloat vectors on the SPU, accumulate the four
    elements of the sum into the fp return register, and compare the
    returned value with the same sum computed on the host.
    """
    from corepy.arch.spu.platform import InstructionStream, Processor
    import corepy.arch.spu.lib.dma as dma
    import corepy.arch.spu.platform as env

    prgm = env.Program()
    code = prgm.get_stream()
    spu.set_active_code(code)

    x = SingleFloat([1.0, 2.0, 3.0, 4.0])
    y = SingleFloat([0.5, 1.5, 2.5, 3.5])

    vec_sum = SingleFloat(0.0)
    vec_sum.v = spu.fa.ex(x, y)

    r = SingleFloat([0.0, 0.0, 0.0, 0.0], reg=code.fp_return)

    # Add the sum into r, then rotate the sum by 4 bytes; four rounds
    for _ in range(4):
        r.v = spu.fa.ex(vec_sum, r)
        spu.rotqbyi(vec_sum, vec_sum, 4)

    prgm.add(code)
    proc = env.Processor()
    result = proc.execute(prgm, mode='fp')

    # Host-side reference value
    x_test = array.array('f', [1.0, 2.0, 3.0, 4.0])
    y_test = array.array('f', [0.5, 1.5, 2.5, 3.5])
    expected = 0.0
    for x_val, y_val in zip(x_test, y_test):
        expected += x_val + y_val

    assert (result == expected)
    return
def TestFloatArray():
    """
    Add two 4-element SingleFloat vectors on the SPU, accumulate the four
    elements of the sum into the fp return register, and compare the
    returned value with the same sum computed on the host.
    """
    from corepy.arch.spu.platform import InstructionStream, Processor
    import corepy.arch.spu.lib.dma as dma
    import corepy.arch.spu.platform as env

    prgm = env.Program()
    code = prgm.get_stream()
    spu.set_active_code(code)

    x = SingleFloat([1.0, 2.0, 3.0, 4.0])
    y = SingleFloat([0.5, 1.5, 2.5, 3.5])

    vec_sum = SingleFloat(0.0)
    vec_sum.v = spu.fa.ex(x, y)

    r = SingleFloat([0.0, 0.0, 0.0, 0.0], reg=code.fp_return)

    # Add the sum into r, then rotate the sum by 4 bytes; four rounds
    for _ in range(4):
        r.v = spu.fa.ex(vec_sum, r)
        spu.rotqbyi(vec_sum, vec_sum, 4)

    prgm.add(code)
    proc = env.Processor()
    result = proc.execute(prgm, mode="fp")

    # Host-side reference value
    x_test = array.array("f", [1.0, 2.0, 3.0, 4.0])
    y_test = array.array("f", [0.5, 1.5, 2.5, 3.5])
    expected = 0.0
    for x_val, y_val in zip(x_test, y_test):
        expected += x_val + y_val

    assert result == expected
    return
def TestAll():
    """
    Emit one call to each helper into a stream obtained from a Program,
    add the stream to the program, print it and execute it.
    """
    import corepy.arch.spu.platform as env

    prgm = env.Program()
    code = prgm.get_stream()
    spu.set_active_code(code)

    a = code.prgm.acquire_register()
    b = code.prgm.acquire_register()
    c = code.prgm.acquire_register()

    # (helper, third operand) pairs, emitted in this order with c as the
    # destination and a as the first source
    steps = (
        (shr, b),
        (cneq, b),
        (cge, b),
        (cgei, 10),
        (lt, b),
        (lti, 10),
        (a_immediate, 10),
        (a_immediate, 10000),
        (sf_immediate, 10000),
    )
    for emit, operand in steps:
        emit(c, a, operand)

    prgm.add(code)
    prgm.print_code()

    proc = env.Processor()
    proc.execute(prgm)
    return
def synthesize(self, code):
    """
    Emit a streaming population count into `code`.

    A stream_buffer walks the configured stream; every vector of each
    buffer is passed to syn_popc_var.popc, the counts are reduced to one
    word, and that word is written to the SPU outbound mailbox.
    """
    prev_code = spu.get_active_code()
    spu.set_active_code(code)

    buffers = spuiter.stream_buffer(code, self.stream_addr,
                                    self.stream_size * 4,
                                    self.buffer_size, self.lsa)
    ls_desc = spuiter.memory_desc('I', self.lsa, self.buffer_size / 4)
    counter = syn_popc_var()

    # This Word is created even though the name x is rebound by the inner
    # loop below; it is kept so that the emitted code is unchanged.
    x = var.Word(0)
    count = var.Word(0)
    total = var.Word(0)

    for buf_addr in buffers:
        for x in spuiter.spu_vec_iter(code, ls_desc, addr_reg=buf_addr):
            counter.popc(count, x)

    counter.reduce_word(total, count)

    # Send the result to the caller
    spu.wrch(total, dma.SPU_WrOutMbox)

    spu.set_active_code(prev_code)
    return
def synthesize(self, code):
    """
    Emit code that fills one line with a constant color and copies it to
    the first frame buffer 128 times, advancing by `stride` each time.

    Raises Exception if buffers or stride has not been set.  The checks
    run before the active instruction stream is switched, so a failed
    call leaves the active stream untouched.
    """
    if self.buffers is None:
        raise Exception('Please set buffers')
    if self.stride is None:
        raise Exception('Please set stride')

    old_code = spu.get_active_code()
    spu.set_active_code(code)

    # Draw a square
    color = var.SignedWord(0x0F0F0FFF)

    fb0 = var.Word(self.buffers[0])
    # fb1 is not referenced below; it is still created so the emitted
    # code and register usage match the previous behavior.
    fb1 = var.Word(self.buffers[1])
    stride = var.Word(self.stride)

    addr = var.Word(0)

    # Draw one line
    line_pixels = 256
    for i in spuiter.syn_iter(code, line_pixels*4, step = 16):
        spu.stqx(color, addr, i)

    # Transfer the line to the frame buffer
    md_fb = spuiter.memory_desc('I', size = line_pixels)
    md_fb.set_addr_reg(addr.reg)

    addr.v = fb0

    for i in spuiter.syn_iter(code, 128):
        md_fb.put(code, 0)
        addr.v = addr + stride

    spu.set_active_code(old_code)
    return
def setup(self, code):
    """
    Create one var.Word per entry of the module-level `constants` mapping
    and store them in self.consts under the same keys.
    """
    prev_code = spu.get_active_code()
    spu.set_active_code(code)

    loaded = {}
    self.consts = loaded
    for name, value in constants.items():
        loaded[name] = var.Word(value)

    spu.set_active_code(prev_code)
    return
def synthesize(self, code):
    """
    Emit a write of self.result to the SPU outbound mailbox.

    Raises Exception if result has not been set.  The check runs before
    the active instruction stream is switched, so a failed call leaves
    the active stream untouched.
    """
    if self.result is None:
        raise Exception('Please set result')

    old_code = spu.get_active_code()
    spu.set_active_code(code)

    spu.wrch(self.result, dma.SPU_WrOutMbox)

    spu.set_active_code(old_code)
    return
def RunTest(test):
    """
    Make a fresh InstructionStream active, call `test` so it can emit
    into it, then print the stream and execute it.
    """
    from corepy.arch.spu.platform import InstructionStream, Processor

    stream = InstructionStream()
    spu.set_active_code(stream)

    test()

    stream.print_code()
    Processor().execute(stream)
    return
def synthesize(self, code):
    """
    Emit the row/column evaluation loop into `code`.

    For each of self.h rows and self.w / 4 steps per row, self.ly_point is
    synthesized and, when a renderer is set, the renderer is synthesized
    with the point result.  The instruction stream that was active on
    entry is made active again before returning.
    """
    old_code = spu.get_active_code()
    spu.set_active_code(code)

    self._load_parameters(code)

    log = spu_log.SPULog()
    log.setup(code)

    # The renderer is optional; it borrows the ONE constant from the log
    if self.renderer is not None:
        self.renderer.setup(code)
        self.renderer.set_one(log.consts['ONE'])

    r1_inc = var.SingleFloat()
    r2_inc = var.SingleFloat()
    r1 = var.SingleFloat()
    r2 = var.SingleFloat()
    result = var.SingleFloat()
    pattern = var.Word(0)

    # Wire the point evaluator to the registers created above
    self.ly_point.set_pattern_reg(pattern)
    self.ly_point.set_result_reg(result)
    self.ly_point.set_r_regs(r1, r2)
    self.ly_point.set_log(log)
    self.ly_point.setup(code)

    # Load the starting values, increments and pattern from absolute
    # local-store addresses (presumably written by _load_parameters --
    # TODO confirm)
    spu.lqa(r1, 0)
    spu.lqa(r2, 4)
    spu.lqa(r1_inc, 8)
    spu.lqa(r2_inc, 12)
    spu.lqa(pattern, 16)

    for y in spuiter.syn_iter(code, self.h):
        # Reload r1 at the start of every row
        spu.lqa(r1, 0)

        for x in spuiter.syn_iter(code, self.w / 4):
            self.ly_point.synthesize(code)
            r1.v = spu.fa.ex(r1, r1_inc)

            if self.renderer is not None:
                # result.v = spu.fm.ex(r1, r2)
                self.renderer.set_result_reg(result)
                self.renderer.synthesize(code)

        if self.renderer is not None:
            self.renderer.row_complete(code)
        r2.v = spu.fa.ex(r2, r2_inc)

    # return Numeric.where(Numeric.less(results, 0), results, 0)

    spu.set_active_code(old_code)
    return
def synthesize_constants(self, code):
    """
    Acquire a register for self._one and emit the instructions that build
    its value: clear it, add 1, then convert with cuflt (scale 155).
    """
    prev_code = spu.get_active_code()
    spu.set_active_code(code)

    self._one = code.acquire_register()
    spu.xor(self._one, self._one, self._one)
    spu.ai(self._one, self._one, 1)
    spu.cuflt(self._one, self._one, 155)

    # Only restore when a stream was active on entry
    if prev_code is not None:
        spu.set_active_code(prev_code)
    return
def TestFloats(): import math code = synspu.InstructionStream() proc = synspu.Processor() spu.set_active_code(code) code.set_debug(True) # Create a simple SPU program that computes log for all values bettween # .01 and 10.0 with .01 increments start = .65 stop = .75 inc = .01 sp_step = 0x3C23D70A # r_current = var.Word(0x3C23D70A) # .01 in single precision r_current = var.Word(0x3F266666) r_step = var.Word(sp_step) # .01 in single precision result = var.Word(0) log = SPULog() log.setup(code) log.set_result(result) log.set_x(r_current) log_iter = syn_iter(code, int((stop - start) / inc)) for i in log_iter: log.synthesize(code) spu.fa(r_current, r_current, r_step) spu.wrch(result, dma.SPU_WrOutMbox) # code.print_code() spe_id = proc.execute(code, mode='async') x = start for i in range(int((stop - start) / inc)): while synspu.spu_exec.stat_out_mbox(spe_id) == 0: pass slog = synspu.spu_exec.read_out_mbox(spe_id) print '%.3f 0x%08X %.08f %.08f ' % (x, slog, _sp_to_float(slog), math.log(x, 2)) x += inc proc.join(spe_id) return
def TestFloats(): import math code = synspu.InstructionStream() proc = synspu.Processor() spu.set_active_code(code) code.set_debug(True) # Create a simple SPU program that computes log for all values bettween # .01 and 10.0 with .01 increments start = .65 stop = .75 inc = .01 sp_step = 0x3C23D70A # r_current = var.Word(0x3C23D70A) # .01 in single precision r_current = var.Word(0x3F266666) r_step = var.Word(sp_step) # .01 in single precision result = var.Word(0) log = SPULog() log.setup(code) log.set_result(result) log.set_x(r_current) log_iter = syn_iter(code, int((stop - start) / inc)) for i in log_iter: log.synthesize(code) spu.fa(r_current, r_current, r_step) spu.wrch(result, dma.SPU_WrOutMbox) # code.print_code() spe_id = proc.execute(code, mode = 'async') x = start for i in range(int((stop - start) / inc)): while synspu.spu_exec.stat_out_mbox(spe_id) == 0: pass slog = synspu.spu_exec.read_out_mbox(spe_id) print '%.3f 0x%08X %.08f %.08f ' % (x, slog, _sp_to_float(slog), math.log(x, 2)) x += inc proc.join(spe_id) return
def RunTest(test):
    """
    Make a stream from a fresh Program active, call `test` so it can emit
    into it, then add the stream to the program, print it and execute it.
    """
    import corepy.arch.spu.platform as env
    #from corepy.arch.spu.platform import InstructionStream, Processor

    program = env.Program()
    stream = program.get_stream()
    spu.set_active_code(stream)

    test()

    program.add(stream)
    program.print_code()

    env.Processor().execute(program)
    return
def TestDebug(): prgm = Program() code = prgm.get_stream() proc = DebugProcessor() spu.set_active_code(code) ra = code.acquire_register() rb = code.acquire_register() rc = code.acquire_register() rd = code.acquire_register() re = code.acquire_register() rf = code.acquire_register() rg = code.acquire_register() rh = code.acquire_register() spu.ai(ra, 0, 14) spu.ai(rb, 0, 13) spu.ai(rc, 0, 14) spu.brnz(14, 3) spu.ai(rd, 0, 15) spu.ai(re, 0, 16) spu.ai(rf, 0, 17) spu.ai(rg, 0, 18) spu.ai(rh, 0, 19) spu.nop(0) spu.stop(0x200A) prgm += code r = proc.execute(prgm) # , debug = True) r = proc.nexti() r = proc.nexti() r = proc.nexti() r = proc.nexti() while r != None: r = proc.nexti() if r is not None: regs = proc.dump_regs() print '******', regs[122:] assert (r == None) print 'int result:', r # while True: # pass return
def TestDebug(): prgm = Program() code = prgm.get_stream() proc = DebugProcessor() spu.set_active_code(code) ra = code.acquire_register() rb = code.acquire_register() rc = code.acquire_register() rd = code.acquire_register() re = code.acquire_register() rf = code.acquire_register() rg = code.acquire_register() rh = code.acquire_register() spu.ai(ra, 0, 14) spu.ai(rb, 0, 13) spu.ai(rc, 0, 14) spu.brnz(14, 3) spu.ai(rd, 0, 15) spu.ai(re, 0, 16) spu.ai(rf, 0, 17) spu.ai(rg, 0, 18) spu.ai(rh, 0, 19) spu.nop(0) spu.stop(0x200A) prgm += code r = proc.execute(prgm) # , debug = True) r = proc.nexti() r = proc.nexti() r = proc.nexti() r = proc.nexti() while r != None: r = proc.nexti() if r is not None: regs = proc.dump_regs() print '******', regs[122:] assert(r == None) print 'int result:', r # while True: # pass return
def RunTest(test):
    """
    Make a stream from a fresh Program active, call `test` so it can emit
    into it, then add the stream to the program, print it and execute it.
    """
    import corepy.arch.spu.platform as env
    # from corepy.arch.spu.platform import InstructionStream, Processor

    program = env.Program()
    stream = program.get_stream()
    spu.set_active_code(stream)

    test()

    program.add(stream)
    program.print_code()

    env.Processor().execute(program)
    return
def TestSaveBuffer1(): import array code = synspu.InstructionStream() proc = synspu.Processor() code.set_debug(True) spu.set_active_code(code) n = 2**14 data = array.array('I', range(n)) #data = synspu.aligned_memory(n, typecode = 'I') #data.copy_to(data_array.buffer_info()[0], len(data_array)) save_buffer = SaveBuffer() save_buffer.setup() save_buffer.init_ls_buffer(0, 128) save_buffer.init_mm_buffer(data.buffer_info()[0], n) value = var.SignedWord(0xCAFEBABE) for i in spuiter.syn_iter(code, n / 4): save_buffer.save_register(value) code.print_code() spe_id = proc.execute(code, mode='async') for i in range(n / 4): while synspu.spu_exec.stat_out_mbox(spe_id) == 0: pass print 'size: 0x%X' % (synspu.spu_exec.read_out_mbox(spe_id)) while synspu.spu_exec.stat_out_mbox(spe_id) == 0: pass print 'offset: 0x%X' % (synspu.spu_exec.read_out_mbox(spe_id)) while synspu.spu_exec.stat_out_mbox(spe_id) == 0: pass print 'test: 0x%X' % (synspu.spu_exec.read_out_mbox(spe_id)) proc.join(spe_id) #data.copy_from(data_array.buffer_info()[0], len(data_array)) print data[:10] return
def synthesize(self): # Okay. This code is not going to exceed 256 instructions (1kb). Knowing that, # the register contents can be safely placed at 0x3F400 in localstore, 3kb from # the top. The SPRE will place the instruction stream as close to the top as # possible. But since it is not going to be more than 1kb worth of instructions, # it will not overlap with the register contents. code = self.code spu.set_active_code(code) # Reload the instructions spu.sync(1) # Next instruction to execute lbl_op = code.size() spu.nop(0) # Placeholders for register store instructions for i in range(128): spu.stqa(i, 0xFD00 + (i * 4)) # spu.stqa(i, 0xFE00 + (i * 4)) # Stop for next command spu.stop(0x0FFF) lbl_regs = code.size() # Create space for the saved registers #for i in range(128): # # 16 bytes/register # spu.nop(0) # spu.lnop() # spu.nop(0) # spu.lnop() # Clearing active code here is important! spu.set_active_code(None) code.cache_code() code_size = len(code._prologue._code) * 4 self.xfer_size = code_size + (16 - (code_size) % 16); print 'xfer_size:', self.xfer_size self.code_lsa = (0x3FFFF - code_size) & 0xFFF80; self.lbl_op = lbl_op return
def synthesize(self): # Okay. This code is not going to exceed 256 instructions (1kb). Knowing that, # the register contents can be safely placed at 0x3F400 in localstore, 3kb from # the top. The SPRE will place the instruction stream as close to the top as # possible. But since it is not going to be more than 1kb worth of instructions, # it will not overlap with the register contents. code = self.code spu.set_active_code(code) # Reload the instructions spu.sync(1) # Next instruction to execute lbl_op = code.size() spu.nop(0) # Placeholders for register store instructions for i in range(128): spu.stqa(i, 0xFD00 + (i * 4)) # spu.stqa(i, 0xFE00 + (i * 4)) # Stop for next command spu.stop(0x0FFF) lbl_regs = code.size() # Create space for the saved registers #for i in range(128): # # 16 bytes/register # spu.nop(0) # spu.lnop() # spu.nop(0) # spu.lnop() # Clearing active code here is important! spu.set_active_code(None) code.cache_code() code_size = len(code._prologue._code) * 4 self.xfer_size = code_size + (16 - (code_size) % 16) print 'xfer_size:', self.xfer_size self.code_lsa = (0x3FFFF - code_size) & 0xFFF80 self.lbl_op = lbl_op return
def TestLog(): code = synspu.InstructionStream() proc = synspu.Processor() spu.set_active_code(code) # Create a simple SPU program that computes log for 10 values and # sends the result back using the mailbox log = SPULog() values = [] result = code.acquire_register() N = 10 x = 1 for i in range(N): val = var.Word(x) spu.cuflt(val, val, 155) values.append(val) x = x * 10 log.setup(code) log.set_result(result) for i in range(N): log.set_x(values[i]) log.synthesize(code) spu.wrch(result, dma.SPU_WrOutMbox) spe_id = proc.execute(code, mode='async') x = 1 for i in range(N): while synspu.spu_exec.stat_out_mbox(spe_id) == 0: pass print 'log said: 0x%08X (%d)' % ( synspu.spu_exec.read_out_mbox(spe_id), x) x = x * 10 proc.join(spe_id) return
def TestSaveBuffer1(): import array code = synspu.InstructionStream() proc = synspu.Processor() code.set_debug(True) spu.set_active_code(code) n = 2**14 data = array.array('I', range(n)) #data = synspu.aligned_memory(n, typecode = 'I') #data.copy_to(data_array.buffer_info()[0], len(data_array)) save_buffer = SaveBuffer() save_buffer.setup() save_buffer.init_ls_buffer(0, 128) save_buffer.init_mm_buffer(data.buffer_info()[0], n) value = var.SignedWord(0xCAFEBABE) for i in spuiter.syn_iter(code, n / 4): save_buffer.save_register(value) code.print_code() spe_id = proc.execute(code, mode='async') for i in range(n/4): while synspu.spu_exec.stat_out_mbox(spe_id) == 0: pass print 'size: 0x%X' % (synspu.spu_exec.read_out_mbox(spe_id)) while synspu.spu_exec.stat_out_mbox(spe_id) == 0: pass print 'offset: 0x%X' % (synspu.spu_exec.read_out_mbox(spe_id)) while synspu.spu_exec.stat_out_mbox(spe_id) == 0: pass print 'test: 0x%X' % (synspu.spu_exec.read_out_mbox(spe_id)) proc.join(spe_id) #data.copy_from(data_array.buffer_info()[0], len(data_array)) print data[:10] return
def TestFloatScalar():
    """
    Add SingleFloat 1.0 and 2.0 into the fp return register on the SPU
    and check that execute(mode='fp') returns 1.0 + 2.0.
    """
    from corepy.arch.spu.platform import InstructionStream, Processor
    import corepy.arch.spu.lib.dma as dma

    stream = InstructionStream()
    spu.set_active_code(stream)

    lhs = SingleFloat(1.0)
    rhs = SingleFloat(2.0)
    ret = SingleFloat(0.0, reg = stream.fp_return)

    ret.v = spu.fa.ex(lhs, rhs)

    result = Processor().execute(stream, mode='fp')
    assert(result == (1.0 + 2.0))
    return
def SimpleSPU(): """ A very simple SPU that computes 11 + 31 and returns 0xA on success. """ prgm = env.Program() code = prgm.get_stream() proc = env.Processor() spu.set_active_code(code) # Acquire two registers #x = code.acquire_register() x = code.gp_return test = prgm.acquire_register(reg_name = 55) spu.xor(x, x, x) # zero x spu.ai(x, x, 11) # x = x + 11 spu.ai(x, x, 31) # x = x + 31 spu.ceqi(test, x, 42) # test = (x == 42) # If test is false (all 0s), skip the stop(0x100A) instruction spu.brz(test, 2) spu.stop(0x100A) spu.stop(0x100B) prgm.add(code) prgm.print_code(hex = True) r = proc.execute(prgm, mode = 'int', stop = True, debug = True) assert(r[0] == 42) assert(r[1] == 0x100A) prgm = env.Program() code = prgm.get_stream() spu.set_active_code(code) util.load_float(code, code.fp_return, 3.14) prgm.add(code) prgm.print_code(hex = True) r = proc.execute(prgm, mode = 'fp') print r return
def TestLog(): code = synspu.InstructionStream() proc = synspu.Processor() spu.set_active_code(code) # Create a simple SPU program that computes log for 10 values and # sends the result back using the mailbox log = SPULog() values = [] result = code.acquire_register() N = 10 x = 1 for i in range(N): val = var.Word(x) spu.cuflt(val, val, 155) values.append(val) x = x * 10 log.setup(code) log.set_result(result) for i in range(N): log.set_x(values[i]) log.synthesize(code) spu.wrch(result, dma.SPU_WrOutMbox) spe_id = proc.execute(code, mode = 'async') x = 1 for i in range(N): while synspu.spu_exec.stat_out_mbox(spe_id) == 0: pass print 'log said: 0x%08X (%d)' %(synspu.spu_exec.read_out_mbox(spe_id), x) x = x * 10 proc.join(spe_id) return
def SimpleSPU(): """ A very simple SPU that computes 11 + 31 and returns 0xA on success. """ prgm = env.Program() code = prgm.get_stream() proc = env.Processor() spu.set_active_code(code) # Acquire two registers #x = code.acquire_register() x = code.gp_return test = prgm.acquire_register(reg_name=55) spu.xor(x, x, x) # zero x spu.ai(x, x, 11) # x = x + 11 spu.ai(x, x, 31) # x = x + 31 spu.ceqi(test, x, 42) # test = (x == 42) # If test is false (all 0s), skip the stop(0x100A) instruction spu.brz(test, 2) spu.stop(0x100A) spu.stop(0x100B) prgm.add(code) prgm.print_code(hex=True) r = proc.execute(prgm, mode='int', stop=True, debug=True) assert (r[0] == 42) assert (r[1] == 0x100A) prgm = env.Program() code = prgm.get_stream() spu.set_active_code(code) util.load_float(code, code.fp_return, 3.14) prgm.add(code) prgm.print_code(hex=True) r = proc.execute(prgm, mode='fp') print r return
def synthesize(self, code): if self._x_regs is None: raise Exception("Please set x_regs") if self._y_regs is None: raise Exception("Please set y_regs") if self._result is None: raise Exception("Please set result register") old_code = spu.get_active_code() spu.set_active_code(code) regs = [] if self._one is None: self.synthesize_constants(code) regs.append(self._one) ab = code.acquire_register() c = code.acquire_register() ab_temp = code.acquire_register() c_temp = code.acquire_register() result = code.acquire_register() regs = regs + [ab, c, ab_temp, c_temp] nregs = self._n_bits / 128 for i in range(nregs): # self._ab(self._x_regs[i], self._y_regs[i], ab, ab_temp) # self._c( self._x_regs[i], self._y_regs[i], c, c_temp) self._ab_c(self._x_regs[i], self._y_regs[i], ab, c, ab_temp, c_temp) self._reduce_word(ab, ab_temp) self._reduce_word(c, c_temp) self._compute_ratio(ab_temp, c_temp, result) print '%d registers,' % (len(regs) + len(self._x_regs) + len(self._y_regs)), code.release_registers(regs) if old_code is not None: spu.set_active_code(old_code) return
def synthesize(self, code):
    """
    Emit the population-count sequence into `code` using Word variables.

    The input vector is taken from register 5, counted with self.popc,
    reduced with self.reduce_word, and the result is written to the SPU
    outbound mailbox.
    """
    prev_code = spu.get_active_code()
    spu.set_active_code(code)

    # Create and initialize the variables
    bit_counts = var.Word(0)
    total = var.Word(0)
    vec = var.Word(0)

    # 'Load' the input vector from register 5
    vec.v = spu.ai.ex(5, 0)

    # Inline the popc and reduce operations
    self.popc(bit_counts, vec)
    self.reduce_word(total, bit_counts)

    # Send the result to the caller
    spu.wrch(total, dma.SPU_WrOutMbox)

    spu.set_active_code(prev_code)
    return
def TestFloatScalar():
    """
    Add SingleFloat 1.0 and 2.0 into the fp return register on the SPU
    and check that execute(mode="fp") returns 1.0 + 2.0.
    """
    from corepy.arch.spu.platform import InstructionStream, Processor
    import corepy.arch.spu.lib.dma as dma
    import corepy.arch.spu.platform as env

    program = env.Program()
    stream = program.get_stream()
    spu.set_active_code(stream)

    lhs = SingleFloat(1.0)
    rhs = SingleFloat(2.0)
    ret = SingleFloat(0.0, reg=stream.fp_return)

    ret.v = spu.fa.ex(lhs, rhs)

    program.add(stream)
    result = env.Processor().execute(program, mode="fp")
    assert result == (1.0 + 2.0)
    return
def bi_bug():
    """
    Store the word 0x200D at local-store address 0, branch indirect to
    the address held in a second variable (0), and expect execute() to
    return 0xD.  A stop(0x200A) follows the branch in the stream.
    """
    stream = InstructionStream()
    proc = Processor()

    spu.set_active_code(stream)

    # The word to plant and the address to branch to
    stop_inst = SignedWord(0x200D)
    stop_addr = SignedWord(0x0)

    spu.stqa(stop_inst, 0x0)
    spu.bi(stop_addr)
    spu.stop(0x200A)

    outcome = proc.execute(stream)
    assert outcome == 0xD
    return
def bi_bug():
    """
    Store the word 0x200D at local-store address 0, branch indirect to
    the address held in a second variable (0), and expect execute() to
    return 0xD.  A stop(0x200A) follows the branch in the stream.
    """
    stream = InstructionStream()
    proc = Processor()

    spu.set_active_code(stream)

    # The word to plant and the address to branch to
    stop_inst = SignedWord(0x200D)
    stop_addr = SignedWord(0x0)

    spu.stqa(stop_inst, 0x0)
    spu.bi(stop_addr)
    spu.stop(0x200A)

    outcome = proc.execute(stream)
    assert (outcome == 0xD)
    return
def synthesize(self, code): if self._x_regs is None: raise Exception("Please set x_regs") if self._y_regs is None: raise Exception("Please set y_regs") if self._result is None: raise Exception("Please set result register") old_code = spu.get_active_code() spu.set_active_code(code) regs = [] if self._one is None: self.synthesize_constants(code) regs.append(self._one) ab = code.acquire_register() c = code.acquire_register() ab_temp = code.acquire_register() c_temp = code.acquire_register() result = code.acquire_register() regs = regs + [ab, c, ab_temp, c_temp] nregs = self._n_bits / 128 for i in range(nregs): # self._ab(self._x_regs[i], self._y_regs[i], ab, ab_temp) # self._c( self._x_regs[i], self._y_regs[i], c, c_temp) self._ab_c(self._x_regs[i], self._y_regs[i], ab, c, ab_temp, c_temp) self._reduce_word(ab, ab_temp) self._reduce_word( c, c_temp) self._compute_ratio(ab_temp, c_temp, result) print '%d registers,' % (len(regs) + len(self._x_regs) + len(self._y_regs)), code.release_registers(regs) if old_code is not None: spu.set_active_code(old_code) return
def TestInt():
    """
    Run a short stream that adds 13 to r20 once and to r13 five times,
    stops with code 0x200D, and check the pair returned by execute().
    """
    code = InstructionStream()
    proc = Processor()

    spu.set_active_code(code)

    r13 = code.acquire_register(reg=13)
    r20 = code.acquire_register(reg=20)

    spu.ai(r20, r20, 13)
    for _ in range(5):
        spu.ai(r13, r13, 13)
    spu.stop(0x200D)

    status = proc.execute(code, stop=True)  # , debug = True)
    #print 'int result:', status

    assert (status[0] == 0)
    assert (status[1] == 0x200D)
    return
def TestInt(): code = InstructionStream() proc = Processor() spu.set_active_code(code) r13 = code.acquire_register(reg = 13) r20 = code.acquire_register(reg = 20) spu.ai(r20, r20, 13) spu.ai(r13, r13, 13) spu.ai(r13, r13, 13) spu.ai(r13, r13, 13) spu.ai(r13, r13, 13) spu.ai(r13, r13, 13) spu.stop(0x200D) code.print_code() r = proc.execute(code) # , debug = True) print 'int result:', r # while True: # pass return
def TestInt(): code = InstructionStream() proc = Processor() spu.set_active_code(code) r13 = code.acquire_register(reg=13) r20 = code.acquire_register(reg=20) spu.ai(r20, r20, 13) spu.ai(r13, r13, 13) spu.ai(r13, r13, 13) spu.ai(r13, r13, 13) spu.ai(r13, r13, 13) spu.ai(r13, r13, 13) spu.stop(0x200D) code.print_code() r = proc.execute(code) # , debug = True) print 'int result:', r # while True: # pass return
import corepy.lib.extarray as extarray
import corepy.arch.spu.isa as spu
import corepy.arch.spu.platform as env
import corepy.arch.spu.lib.dma as dma
from corepy.arch.spu.lib.util import load_word

import time

# Script entry point: starts building an SPU program that reads the
# inbound mailbox inside a labelled loop and counts in r_sum.
# NOTE(review): the visible fragment ends right after the increment; the
# loop-counter update, the branch back to lbl_loop and the execution step
# are not visible here.
if __name__ == '__main__':
    ITERS = 500000
    #ITERS = 15

    prgm = env.Program()
    code = prgm.get_stream()
    proc = env.Processor()

    spu.set_active_code(code)

    # Host-side arrays; neither is referenced again in the visible code
    psmap = extarray.extarray('I', 131072 / 4)
    data = extarray.extarray('I', range(0, 16))

    # r_sum is the general-purpose return register; r_cnt holds ITERS
    r_sum = prgm.gp_return
    r_cnt = prgm.acquire_register()

    spu.xor(r_sum, r_sum, r_sum)
    load_word(code, r_cnt, ITERS)

    lbl_loop = prgm.get_label("loop")
    code.add(lbl_loop)

    # Read one word from the inbound mailbox, then bump the counter
    reg = dma.spu_read_in_mbox(code)

    spu.ai(r_sum, r_sum, 1)
def SpeedTest(n_spus = 6, n_floats = 6):
    """
    Get a rough estimate of the maximum flop count.
    On a PS3 using all 6 spus, this is 152 GFlops.

    n_spus selects a ParallelProgram (more than one SPU) or a plain
    Program, and is passed to execute().  n_floats is the number of
    independent fma chains emitted per unrolled step.  The elapsed time
    and the derived GFlops figure are printed.
    """
    if n_spus > 1:
        prgm = env.ParallelProgram()
    else:
        prgm = env.Program()

    code = prgm.get_stream()
    spu.set_active_code(code)

    f_range = range(n_floats)
    a = [SingleFloat(0.0) for i in f_range]
    b = [SingleFloat(0.0) for i in f_range]
    c = [SingleFloat(0.0) for i in f_range]
    t = [SingleFloat(0.0) for i in f_range]

    # outer/inner are synthesized loop counts; unroll is a Python-level
    # repetition of the loop body.  fuse and simd only enter the n_ops
    # formula below (presumably 2 flops per fma and 4 lanes per vector --
    # TODO confirm).
    outer = 2**12
    inner = 2**16
    unroll = 128
    fuse = 2
    simd = 4

    for x in syn_iter(code, outer):
        for y in syn_iter(code, inner):
            for u in xrange(unroll):
                for i in f_range:
                    t[i].v = spu.fma.ex(a[i], b[i], c[i])

    # Run the synthetic program and copy the results back to the array
    # TODO - AWF - use the SPU decrementers to time this
    proc = env.Processor()
    prgm += code

    start = time.time()
    r = proc.execute(prgm, n_spus = n_spus)
    stop = time.time()
    total = stop - start
    n_ops = long(outer) * inner * long(unroll) * long(n_floats) * long(fuse) * long(simd) * long(n_spus)
    print '%.6f sec, %.2f GFlops' % (total, n_ops / total / 1e9)

    # # Run the native program and copy the results back to the array
    # outer = 2**14
    # inner = 2**16
    # unroll = 1
    # fuse = 1
    # simd = 1
    # proc = Processor()
    # # ncode = NativeInstructionStream("a.out")
    # start = time.time()
    # r = proc.execute(ncode, n_spus = n_spus)
    # stop = time.time()
    # total = stop - start
    # n_ops = long(outer) * inner * long(unroll) * long(n_floats) * long(fuse) * long(simd) * long(n_spus)
    # print '%.6f sec, %.2f GFlops' % (total, n_ops / total / 1e9)

    # Recorded timings from earlier runs; this string is not used by the code
    results = """
  --> No optimizations
  Executing native code: a.out
  14.805322 sec, 20.89 GFlops

  --> Synthetic
  Platform: linux.spre_linux_spu
  no raw data
  65.023350 sec, 152.19 GFlops

  --> -O3 (fuse: 2, simd: 4)
  Executing native code: a.out
  7.407939 sec, 41.74 GFlops

  --> -O3 (fuse: 1, simd: 1)
  Executing native code: a.out
  7.403702 sec, 5.22 GFlops
  """
    return
def MemoryDescExample(data_size = 20000): """ This example uses a memory descriptor to move 20k integers back and forth between main memory and the SPU local store. Each value is incremented by 1 while on the SPU. Memory descriptors are a general purpose method for describing a region of memory. Memory is described by a typecode, address, and size. Memory descriptors can be initialized by hand or from an array or buffer object. For main memory, memory descriptors are useful for transfering data between main memory and an SPU's local store. The get/put methods on a memory descriptor generate the SPU code to move data of any size between main memory and local store. Memory descriptors can also be used with spu_vec_iters to describe the region of memory to iterate over. The typecode in the memory descriptor is used to determine the type for the loop induction variable. Note that there is currently no difference between memory descriptors for main memory and local store. It's up to the user to make sure the memory descriptor settings make sense in the current context. (this will probably change in the near future) Note: get/put currently use loops rather than display lists for transferring data over 16k. 
""" code = env.InstructionStream() proc = env.Processor() code.debug = True spu.set_active_code(code) # Create a python array data = extarray.extarray('I', range(data_size)) # Align the data in the array #a_data = aligned_memory(data_size, typecode = 'I') #a_data.copy_to(data.buffer_info()[0], data_size) # Create memory descriptor for the data in main memory data_desc = memory_desc('I') #data_desc.from_array(a_data) data_desc.from_array(data) # Transfer the data to 0x0 in the local store data_desc.get(code, 0) # Create memory descriptor for the data in the local store for use # in the iterator lsa_data = memory_desc('i', 0, data_size) # Add one to each value for x in spu_vec_iter(code, lsa_data): x.v = x + 1 # Transfer the data back to main memory data_desc.put(code, 0) dma.spu_write_out_mbox(code, 0xCAFE) # Execute the synthetic program # code.print_code() spe_id = proc.execute(code, async=True) proc.join(spe_id) # Copy it back to the Python array #a_data.copy_from(data.buffer_info()[0], data_size) for i in xrange(data_size): assert(data[i] == i + 1) return
def SimpleSPU(): """ A very simple SPU that computes 11 + 31 and returns 0xA on success. """ prgm = env.Program() code = prgm.get_stream() proc = env.Processor() spu.set_active_code(code) # Acquire two registers #x = code.acquire_register() x = prgm.gp_return test = prgm.acquire_register() lbl_brz = prgm.get_label("BRZ") lbl_skip = prgm.get_label("SKIP") spu.hbrr(lbl_brz, lbl_skip) spu.xor(x, x, x) # zero x spu.ai(x, x, 11) # x = x + 11 spu.ai(x, x, 31) # x = x + 31 spu.ceqi(test, x, 42) # test = (x == 42) # If test is false (all 0s), skip the stop(0x100A) instruction code.add(lbl_brz) spu.brz(test, lbl_skip) spu.stop(0x100A) code.add(lbl_skip) spu.stop(0x100B) prgm.add(code) prgm.print_code() r = proc.execute(prgm, mode = 'int', stop = True) print "ret", r assert(r[0] == 42) assert(r[1] == 0x100A) prgm = env.Program() code = prgm.get_stream() spu.set_active_code(code) lbl_loop = prgm.get_label("LOOP") lbl_break = prgm.get_label("BREAK") r_cnt = prgm.acquire_register() r_stop = prgm.acquire_register() r_cmp = prgm.acquire_register() r_foo = prgm.gp_return spu.ori(r_foo, prgm.r_zero, 0) spu.ori(r_cnt, prgm.r_zero, 0) util.load_word(code, r_stop, 10) code.add(lbl_loop) spu.ceq(r_cmp, r_cnt, r_stop) spu.brnz(r_cmp, lbl_break) spu.ai(r_cnt, r_cnt, 1) spu.a(r_foo, r_foo, r_cnt) spu.br(lbl_loop) code.add(lbl_break) prgm.add(code) prgm.print_code() r = proc.execute(prgm, mode = 'int', stop = True) print "ret", r assert(r[0] == 55) return
def _transfer_data(self, code, kernel, lsa, tag):
    """
    Synthesize the SPU code that moves this descriptor's data to or from
    the local store, split into chunks of at most 16384 bytes.

    Arguments:
      code   -- spe.InstructionStream that receives the generated code.
      kernel -- callable invoked once, inside the synthesized loop body, as
                kernel(code, ls_addr, ea_addr, transfer_size, tag_var);
                presumably it emits the actual DMA command (confirm
                against the get/put callers).
      lsa    -- local store address: an int constant, or a spe.Register /
                spe.Variable holding the address.  A register or variable
                is copied first, so the caller's value is not advanced.
      tag    -- integer tag; the generated code writes the tag mask
                1 << tag and reads the tag status before finishing.

    The effective address and size come from self.r_addr / self.r_size
    when those registers were supplied, otherwise from the constants
    self.addr and self.size (scaled by the element size of
    self.typecode).

    Raises:
      Exception -- if code is not an InstructionStream, or lsa is not an
                   int, Register or Variable.
    """
    # Check the types
    if not isinstance(code, spe.InstructionStream):
        raise Exception('Code must be an InstructionStream')

    if not isinstance(lsa, (int, spe.Register, spe.Variable)):
        raise Exception('lsa must be an integer, Register, or Variable')

    old_code = spu.get_active_code()
    spu.set_active_code(code)

    # Acquire a register for the address if the user did not supply one.
    # No register is acquired for the size: a constant size is loaded
    # straight into aligned_size below, and a user-supplied size register
    # is wrapped as-is.
    if self.r_addr is None:
        r_ea_data = code.prgm.acquire_register()
    else:
        r_ea_data = self.r_addr

    # Create variables
    ea_addr = var.SignedWord(reg = r_ea_data)
    aligned_size = var.SignedWord(0)
    mod_16 = var.SignedWord(0xF)

    # Initialize the local store address variable
    if isinstance(lsa, int):
        # From a constant
        ls_addr = var.SignedWord(lsa)
    else:
        # From a register/variable: work on a copy of the caller's value
        ls_addr = var.SignedWord()
        ls_addr.v = lsa

    tag_var = var.SignedWord(tag)
    cmp_mask = var.SignedWord(0)

    # Load the effective address
    if self.r_addr is None:
        if self.addr % 16 != 0:
            print('[get_memory] Misaligned data')

        util.load_word(code, ea_addr, self.addr)

    # Load the size, rounding up as required to be 16-byte aligned
    if self.r_size is None:
        rnd_size = self.size * var.INT_SIZES[self.typecode]
        if rnd_size < 16:
            rnd_size = 16
        elif (rnd_size % 16) != 0:
            rnd_size += (16 - (rnd_size % 16))
        util.load_word(code, aligned_size, rnd_size)
    else:
        # TODO: !!! UNIT TEST THIS !!!
        # Same as above, but using SPU arithmetic to round.
        # NOTE(review): unlike the host-side branch this path does not
        # scale by the element size, and the test (size & 0xF) == size
        # appears to hold only for sizes below 16 -- confirm that the
        # select below really rounds every size up to a multiple of 16.
        size = var.SignedWord(reg = self.r_size)
        sixteen = var.SignedWord(16)
        cmp_mask.v = ((size & mod_16) == size)
        aligned_size.v = size + (sixteen - (size & mod_16))
        spu.selb(aligned_size.reg, size.reg, aligned_size.reg, cmp_mask.reg,
                 order = _mi(spu.selb))
        code.release_register(sixteen.reg)

    # Use an auxiliary register for the moving ea value if the
    # caller supplied the address register
    if self.r_addr is not None:
        ea_load = var.SignedWord(0)
        ea_load.v = ea_addr
    else:
        ea_load = ea_addr  # note that this is reference, not .v assignment

    # Transfer parameters
    buffer_size = var.SignedWord(16384)
    remaining = var.SignedWord(0)
    transfer_size = var.SignedWord(0)

    remaining.v = aligned_size

    # Set up the iterator to transfer at most 16k at a time
    xfer_iter = syn_iter(code, 0, 16384)
    xfer_iter.set_stop_reg(aligned_size.reg)

    for offset in xfer_iter:
        # Intended as transfer_size = min(buffer_size, remaining)
        cmp_mask.v = buffer_size > remaining
        spu.selb(transfer_size, buffer_size, remaining, cmp_mask)

        # Transfer the data
        kernel(code, ls_addr, ea_load, transfer_size, tag_var)

        # Advance both addresses and shrink the remaining byte count
        ls_addr.v = ls_addr + buffer_size
        ea_load.v = ea_load + buffer_size
        remaining.v = remaining - buffer_size

    # Set the tag bit to tag
    dma.mfc_write_tag_mask(code, 1 << tag)

    # Wait for the transfer to complete
    dma.mfc_read_tag_status_all(code)

    # Release the registers.  ea_load is either the locally acquired
    # address register or the auxiliary copy; registers supplied by the
    # caller through self.r_addr / self.r_size are never released here.
    code.release_register(buffer_size.reg)
    code.release_register(remaining.reg)
    code.release_register(aligned_size.reg)
    code.release_register(transfer_size.reg)
    code.release_register(cmp_mask.reg)
    code.release_register(mod_16.reg)
    code.release_register(ls_addr.reg)
    code.release_register(tag_var.reg)
    code.release_register(ea_load.reg)

    if old_code is not None:
        spu.set_active_code(old_code)

    return
def synthesize(self, code):
    """
    Synthesize the block comparison loops into `code`.

    Two nested synthetic loops (self._m outer iterations over the x
    vectors, self._n inner iterations over the y vectors) load one bit
    vector each and run the Tanimoto kernel on the pair.  When a save op
    is configured, each result is compared against the threshold
    (0.0 when self._threshold is None) and handed to save_op.test
    together with the comparison mask and both loop offsets.

    Arguments:
      code -- instruction stream to emit into; it is made the active code
              for the duration of the call and the previous active code
              is restored afterwards (if there was one).

    Raises:
      Exception -- if x_addr, y_addr, n_bits, m or n has not been set.
                   Validation runs before the active code is switched, so
                   a configuration error leaves the global state alone.
    """
    # Sanity checks -- done first so a failure has no side effects
    required = (
        ('_x_addr', 'x_addr'),
        ('_y_addr', 'y_addr'),
        ('_n_bits', 'n_bits'),
        ('_m', 'm'),
        ('_n', 'n'),
    )
    for attr, label in required:
        if getattr(self, attr) is None:
            raise Exception("Please set " + label)

    old_code = spu.get_active_code()
    spu.set_active_code(code)

    # Acquire registers for the bit vectors and the result.  Each bit
    # vector occupies n_bits / 128 registers; floor division keeps the
    # count an integer for range().
    n_vecs = self._n_bits // 128
    x_regs = [code.acquire_register() for i in range(n_vecs)]
    y_regs = [code.acquire_register() for i in range(n_vecs)]
    result = code.acquire_register()

    x_addr = var.Word()
    y_addr = var.Word()

    # The threshold and comparison mask are only needed with a save op
    if self._save_op is not None:
        if self._threshold is not None:
            threshold = var.SingleFloat(self._threshold)
        else:
            threshold = var.SingleFloat(0.0)
        bcmp = var.Word(0)

    # Setup the Tanimoto kernel
    tan = Tanimoto()
    tan.set_n_bits(self._n_bits)
    tan.set_x_regs(x_regs)
    tan.set_y_regs(y_regs)
    tan.set_result(result)
    tan.synthesize_constants(code)

    # Setup the save op
    save_op = self._save_op
    if save_op is not None:
        save_op.setup()

    # Create the iterators
    xiter = spuiter.syn_iter(code, self._m)
    yiter = spuiter.syn_iter(code, self._n)

    # Synthesize the block comparison loops.
    # NOTE(review): both addresses are advanced by one vector
    # (16 * n_vecs) *before* the load, so the first load appears to read
    # one vector past the configured start address -- confirm against
    # _load_bit_vector that this is intended.
    x_addr.v = self._x_addr

    for x_off in xiter:
        x_addr.v = x_addr + 16 * n_vecs
        y_addr.v = self._y_addr

        self._load_bit_vector(x_addr, x_regs)

        for y_off in yiter:
            y_addr.v = y_addr + 16 * n_vecs

            self._load_bit_vector(y_addr, y_regs)
            tan.synthesize(code)

            if save_op is not None:
                # bcmp = (result > threshold); the save op decides what
                # to do with the result based on this mask
                spu.fcgt(bcmp, result, threshold)
                save_op.test(bcmp, result, x_off, y_off)
    # /x_off

    if old_code is not None:
        spu.set_active_code(old_code)

    return
def SimpleSPU(): """ A very simple SPU that computes 11 + 31 and returns 0xA on success. """ code = InstructionStream() proc = Processor() spu.set_active_code(code) # Acquire two registers #x = code.acquire_register() x = code.gp_return test = code.acquire_register() lbl_brz = code.get_label("BRZ") lbl_skip = code.get_label("SKIP") spu.hbrr(lbl_brz, lbl_skip) spu.xor(x, x, x) # zero x spu.ai(x, x, 11) # x = x + 11 spu.ai(x, x, 31) # x = x + 31 spu.ceqi(test, x, 42) # test = (x == 42) # If test is false (all 0s), skip the stop(0x100A) instruction code.add(lbl_brz) spu.brz(test, lbl_skip) spu.stop(0x100A) code.add(lbl_skip) spu.stop(0x100B) code.print_code(hex=True, pro=True, epi=True) r = proc.execute(code, mode='int', stop=True) print "ret", r assert (r[0] == 42) assert (r[1] == 0x100A) code = InstructionStream() spu.set_active_code(code) lbl_loop = code.get_label("LOOP") lbl_break = code.get_label("BREAK") r_cnt = code.acquire_register() r_stop = code.acquire_register() r_cmp = code.acquire_register() r_foo = code.gp_return spu.ori(r_foo, code.r_zero, 0) spu.ori(r_cnt, code.r_zero, 0) util.load_word(code, r_stop, 10) code.add(lbl_loop) spu.ceq(r_cmp, r_cnt, r_stop) spu.brnz(r_cmp, lbl_break) spu.ai(r_cnt, r_cnt, 1) spu.a(r_foo, r_foo, r_cnt) spu.br(lbl_loop) code.add(lbl_break) code.print_code() r = proc.execute(code, mode='int', stop=True) print "ret", r assert (r[0] == 55) return
def TestTanimotoBlock(n_vecs = 4): code = synspu.InstructionStream() proc = synspu.Processor() code.set_debug(True) spu.set_active_code(code) tb = TanimotoBlock() ls_save = LocalSave() mm_save = MemorySave() code.set_debug(True) # Input block parameters m = 128 n = 64 # n_vecs = 9 n_bits = 128 * n_vecs # Main memory results buffer # max_results = 2**16 max_results = 16384 words_per_result = 4 mm_results_data = array.array('I', [12 for i in range(max_results * words_per_result)]) #mm_results_buffer = synspu.aligned_memory(max_results * words_per_result, typecode = 'I') # mm_results_buffer.copy_to(mm_results_data.buffer_info()[0], len(mm_results_data)) mm_results = spuiter.memory_desc('I') #mm_results.from_array(mm_results_buffer) mm_results.from_array(mm_results_data) mm_save.set_md_save_buffer(mm_results) # Local Results buffer buffer_size = var.SignedWord(16384) buffer_addr = var.SignedWord(m * n * n_vecs * 4) ls_results = spuiter.memory_desc('B') ls_results.set_size_reg(buffer_size) ls_results.set_addr_reg(buffer_addr) ls_save.set_md_results(ls_results) ls_save.set_mm_save_op(mm_save) # Setup the TanimotoBlock class tb.set_n_bits(n_bits) tb.set_block_size(m, n) tb.set_x_addr(0) tb.set_y_addr(m * n_vecs * 16) tb.set_save_op(ls_save) # Main test loop n_samples = 10000 for samples in spuiter.syn_iter(code, n_samples): tb.synthesize(code) spu.wrch(buffer_size, dma.SPU_WrOutMbox) spu.stop(0x2000) # "Function" Calls ls_save.block() mm_save.block() # code.print_code() start = time.time() spe_id = proc.execute(code, async=True) while synspu.spu_exec.stat_out_mbox(spe_id) == 0: pass # print 'tb said: 0x%X' % (synspu.spu_exec.read_out_mbox(spe_id)) stop = time.time() # mm_results_buffer.copy_from(mm_results_data.buffer_info()[0], len(mm_results_data)) proc.join(spe_id) total = stop - start bits_sec = (m * n * n_bits * n_samples) / total / 1e9 ops_per_compare = 48 * 4 + 8 # 48 SIMD instructions, 8 scalar insts_per_compare = 56 gops = (m * n * n_vecs * n_samples * 
ops_per_compare ) / total / 1e9 ginsts = (m * n * n_vecs * n_samples * insts_per_compare ) / total / 1e9 print '%.6f sec, %.2f Gbits/sec, %.2f GOps, %.2f GInsts, %d insts' % ( total, bits_sec, gops, ginsts, code.size()) return