def synthesize(self, code):
    """
    Emit instructions into `code` that copy the value in register 5,
    run self.popc and self.reduce_word over it, and write the reduced
    result to the dma.SPU_WrOutMbox channel.

    `code` is installed as the active instruction stream while the
    instructions are emitted; the previously active stream is restored
    before returning.
    """
    old_code = spu.get_active_code()
    spu.set_active_code(code)

    # Working registers: the per-word counts, the reduced result, and a
    # copy of the input vector.
    count = code.acquire_register()
    result = code.acquire_register()
    x = code.acquire_register()

    # 'Load' the input vector x from register 5 (add-immediate of 0).
    spu.ai(x, 5, 0)

    # Zero count and result
    spu.xor(count, count, count)
    spu.xor(result, result, result)

    # Inline the popc and reduce operations
    self.popc(count, x)
    self.reduce_word(result, count)

    # Send the result to the caller
    spu.wrch(result, dma.SPU_WrOutMbox)

    # Hand back every register acquired above.  Previously only x was
    # released, so count and result leaked on every call.
    code.release_register(x)
    code.release_register(count)
    code.release_register(result)

    spu.set_active_code(old_code)
    return
def TestSetSlotValue(): import corepy.arch.spu.platform as synspu import corepy.arch.spu.types.spu_types as var import corepy.arch.spu.lib.dma as dma prgm = synspu.Program() code = prgm.get_stream() proc = synspu.Processor() spu.set_active_code(code) a = var.SignedWord(0x11) b = var.SignedWord(0x13) r = var.SignedWord(0xFFFFFFFF) set_slot_value(code, r, 0, 0x10) set_slot_value(code, r, 1, a) set_slot_value(code, r, 2, 0x12) set_slot_value(code, r, 3, b) for i in range(4): spu.wrch(r, dma.SPU_WrOutMbox) spu.rotqbyi(r, r, 4) prgm.add(code) spe_id = proc.execute(prgm, async = True) for i in range(4): while synspu.spu_exec.stat_out_mbox(spe_id) == 0: pass result = synspu.spu_exec.read_out_mbox(spe_id) assert(result == (i + 0x10)) proc.join(spe_id) return
def synthesize(self, code):
    """
    Emit a streaming variant of the popc program into `code`.

    Data is iterated buffer-by-buffer through spuiter.stream_buffer; each
    vector yielded by spuiter.spu_vec_iter is passed to
    syn_popc_var.popc together with `count`.  After the loops the counts
    are reduced into `total` with reduce_word and written to
    dma.SPU_WrOutMbox.  The previously active instruction stream is
    restored before returning.
    """
    previous_code = spu.get_active_code()
    spu.set_active_code(code)

    stream = spuiter.stream_buffer(
        code,
        self.stream_addr,
        self.stream_size * 4,
        self.buffer_size,
        self.lsa,
    )
    ls_data = spuiter.memory_desc('I', self.lsa, self.buffer_size / 4)
    counter = syn_popc_var()

    # The initial binding of `vec` is replaced by the inner loop variable.
    vec = var.Word(0)
    count = var.Word(0)
    total = var.Word(0)

    for buffer in stream:
        for vec in spuiter.spu_vec_iter(code, ls_data, addr_reg=buffer):
            counter.popc(count, vec)

    counter.reduce_word(total, count)

    # Report the final value to the caller via the mailbox channel.
    spu.wrch(total, dma.SPU_WrOutMbox)

    spu.set_active_code(previous_code)
    return
def TestSetSlotValue(): import corepy.arch.spu.platform as synspu import corepy.arch.spu.types.spu_types as var import corepy.arch.spu.lib.dma as dma prgm = synspu.Program() code = prgm.get_stream() proc = synspu.Processor() spu.set_active_code(code) a = var.SignedWord(0x11) b = var.SignedWord(0x13) r = var.SignedWord(0xFFFFFFFF) set_slot_value(code, r, 0, 0x10) set_slot_value(code, r, 1, a) set_slot_value(code, r, 2, 0x12) set_slot_value(code, r, 3, b) for i in range(4): spu.wrch(r, dma.SPU_WrOutMbox) spu.rotqbyi(r, r, 4) prgm.add(code) spe_id = proc.execute(prgm, async=True) for i in range(4): while synspu.spu_exec.stat_out_mbox(spe_id) == 0: pass result = synspu.spu_exec.read_out_mbox(spe_id) assert (result == (i + 0x10)) proc.join(spe_id) return
def synthesize(self, code):
    """
    Emit the streamed popc program into `code`.

    Buffers come from spuiter.stream_buffer and the vectors inside each
    buffer from spuiter.spu_vec_iter; every vector is fed to
    syn_popc_var.popc with the `count` variable.  Once the loops are
    closed, reduce_word folds `count` into `total`, which is then written
    to dma.SPU_WrOutMbox.  The prior active stream is reinstated on exit.
    """
    saved_code = spu.get_active_code()
    spu.set_active_code(code)

    stream = spuiter.stream_buffer(
        code,
        self.stream_addr,
        self.stream_size * 4,
        self.buffer_size,
        self.lsa,
    )
    ls_data = spuiter.memory_desc('I', self.lsa, self.buffer_size / 4)
    popc_ops = syn_popc_var()

    # `element` is rebound by the inner loop below.
    element = var.Word(0)
    count = var.Word(0)
    total = var.Word(0)

    for buffer in stream:
        for element in spuiter.spu_vec_iter(code, ls_data, addr_reg=buffer):
            popc_ops.popc(count, element)

    popc_ops.reduce_word(total, count)

    # Deliver the total to the caller through the mailbox channel.
    spu.wrch(total, dma.SPU_WrOutMbox)

    spu.set_active_code(saved_code)
    return
def synthesize(self, code):
    """
    Emit a single write of self.result to the dma.SPU_WrOutMbox channel
    into `code`.

    Raises:
        Exception: if self.result has not been set (is None).

    The check runs before the active instruction stream is switched, so a
    failure leaves the global active-code state untouched.  Previously
    the exception was raised after spu.set_active_code(code) and the
    prior stream was never restored.
    """
    if self.result is None:
        raise Exception('Please set result')

    old_code = spu.get_active_code()
    spu.set_active_code(code)

    spu.wrch(self.result, dma.SPU_WrOutMbox)

    spu.set_active_code(old_code)
    return
def spu_writech(code, ch, msg):
    """
    Append a write of `msg` to channel `ch` to the instruction stream.

    `msg` is either a register/variable already holding the value, or a
    literal.  A literal is first loaded into a temporary register taken
    from code.prgm, which is released again after the write is added.

    Returns whatever code.add returns for the wrch instruction.
    """
    if isinstance(msg, (spe.Register, spe.Variable)):
        # Value already lives in a register: write it directly.
        return code.add(spu.wrch(msg, ch))

    scratch = code.prgm.acquire_register()
    util.load_word(code, scratch, msg)
    added = code.add(spu.wrch(scratch, ch))
    code.prgm.release_register(scratch)
    return added
def spu_writech(code, ch, msg):
    """
    Append a write of `msg` to channel `ch` to the instruction stream.

    `msg` is either a register/variable already holding the value, or a
    literal.  A literal is first loaded into a temporary register taken
    from `code`, which is released again after the write is added.

    Returns whatever code.add returns for the wrch instruction.
    """
    if isinstance(msg, (spe.Register, spe.Variable)):
        # Value already lives in a register: write it directly.
        return code.add(spu.wrch(msg, ch))

    scratch = code.acquire_register()
    util.load_word(code, scratch, msg)
    added = code.add(spu.wrch(scratch, ch))
    code.release_register(scratch)
    return added
def spu_mfcdma32(code, r_ls, r_ea, r_size, r_tagid, cmd):
    """
    Append the channel writes for an MFC DMA command to `code`.

    The four register arguments are written to MFC_LSA, MFC_EAL, MFC_Size
    and MFC_TagID in that order.  `cmd` is loaded into a temporary
    register, written to MFC_Cmd last, and the register is released.

    Returns the result of code.add for the final MFC_Cmd write.
    """
    cmd_reg = code.acquire_register()
    util.load_word(code, cmd_reg, cmd)

    # Parameter channels first; the command channel write goes last.
    parameter_writes = (
        (r_ls, MFC_LSA),
        (r_ea, MFC_EAL),
        (r_size, MFC_Size),
        (r_tagid, MFC_TagID),
    )
    for reg, channel in parameter_writes:
        code.add(spu.wrch(reg, channel))

    issued = code.add(spu.wrch(cmd_reg, MFC_Cmd))
    code.release_register(cmd_reg)
    return issued
def TestFloats(): import math code = synspu.InstructionStream() proc = synspu.Processor() spu.set_active_code(code) code.set_debug(True) # Create a simple SPU program that computes log for all values bettween # .01 and 10.0 with .01 increments start = .65 stop = .75 inc = .01 sp_step = 0x3C23D70A # r_current = var.Word(0x3C23D70A) # .01 in single precision r_current = var.Word(0x3F266666) r_step = var.Word(sp_step) # .01 in single precision result = var.Word(0) log = SPULog() log.setup(code) log.set_result(result) log.set_x(r_current) log_iter = syn_iter(code, int((stop - start) / inc)) for i in log_iter: log.synthesize(code) spu.fa(r_current, r_current, r_step) spu.wrch(result, dma.SPU_WrOutMbox) # code.print_code() spe_id = proc.execute(code, mode='async') x = start for i in range(int((stop - start) / inc)): while synspu.spu_exec.stat_out_mbox(spe_id) == 0: pass slog = synspu.spu_exec.read_out_mbox(spe_id) print '%.3f 0x%08X %.08f %.08f ' % (x, slog, _sp_to_float(slog), math.log(x, 2)) x += inc proc.join(spe_id) return
def TestFloats(): import math code = synspu.InstructionStream() proc = synspu.Processor() spu.set_active_code(code) code.set_debug(True) # Create a simple SPU program that computes log for all values bettween # .01 and 10.0 with .01 increments start = .65 stop = .75 inc = .01 sp_step = 0x3C23D70A # r_current = var.Word(0x3C23D70A) # .01 in single precision r_current = var.Word(0x3F266666) r_step = var.Word(sp_step) # .01 in single precision result = var.Word(0) log = SPULog() log.setup(code) log.set_result(result) log.set_x(r_current) log_iter = syn_iter(code, int((stop - start) / inc)) for i in log_iter: log.synthesize(code) spu.fa(r_current, r_current, r_step) spu.wrch(result, dma.SPU_WrOutMbox) # code.print_code() spe_id = proc.execute(code, mode = 'async') x = start for i in range(int((stop - start) / inc)): while synspu.spu_exec.stat_out_mbox(spe_id) == 0: pass slog = synspu.spu_exec.read_out_mbox(spe_id) print '%.3f 0x%08X %.08f %.08f ' % (x, slog, _sp_to_float(slog), math.log(x, 2)) x += inc proc.join(spe_id) return
def TestMbox(): import corepy.arch.spu.platform as synspu code = synspu.InstructionStream() # Send a message to the PPU spu_write_out_mbox(code, 0xDEADBEEFl) # Get a message from the PPU reg = spu_read_in_mbox(code) # And send it back code.add(spu.wrch(reg, SPU_WrOutMbox)) proc = synspu.Processor() spe_id = proc.execute(code, async=True) synspu.spu_exec.write_in_mbox(spe_id, 0x88CAFE) while synspu.spu_exec.stat_out_mbox(spe_id) == 0: pass print 'spe said: 0x%X' % (synspu.spu_exec.read_out_mbox(spe_id)) while synspu.spu_exec.stat_out_mbox(spe_id) == 0: pass print 'spe said: 0x%X' % (synspu.spu_exec.read_out_mbox(spe_id)) proc.join(spe_id) return
def TestMbox(): code = synspu.InstructionStream() # Send a message to the PPU spu_write_out_mbox(code, 0xDEADBEEFl) # Get a message from the PPU reg = spu_read_in_mbox(code) # And send it back code.add(spu.wrch(reg, SPU_WrOutMbox)) proc = synspu.Processor() spe_id = proc.execute(code, async=True) synspu.spu_exec.write_in_mbox(spe_id, 0x88CAFE) while synspu.spu_exec.stat_out_mbox(spe_id) == 0: pass print 'spe said: 0x%X' % (synspu.spu_exec.read_out_mbox(spe_id)) while synspu.spu_exec.stat_out_mbox(spe_id) == 0: pass print 'spe said: 0x%X' % (synspu.spu_exec.read_out_mbox(spe_id)) proc.join(spe_id) return
def TestLog(): code = synspu.InstructionStream() proc = synspu.Processor() spu.set_active_code(code) # Create a simple SPU program that computes log for 10 values and # sends the result back using the mailbox log = SPULog() values = [] result = code.acquire_register() N = 10 x = 1 for i in range(N): val = var.Word(x) spu.cuflt(val, val, 155) values.append(val) x = x * 10 log.setup(code) log.set_result(result) for i in range(N): log.set_x(values[i]) log.synthesize(code) spu.wrch(result, dma.SPU_WrOutMbox) spe_id = proc.execute(code, mode='async') x = 1 for i in range(N): while synspu.spu_exec.stat_out_mbox(spe_id) == 0: pass print 'log said: 0x%08X (%d)' % ( synspu.spu_exec.read_out_mbox(spe_id), x) x = x * 10 proc.join(spe_id) return
def TestLog(): code = synspu.InstructionStream() proc = synspu.Processor() spu.set_active_code(code) # Create a simple SPU program that computes log for 10 values and # sends the result back using the mailbox log = SPULog() values = [] result = code.acquire_register() N = 10 x = 1 for i in range(N): val = var.Word(x) spu.cuflt(val, val, 155) values.append(val) x = x * 10 log.setup(code) log.set_result(result) for i in range(N): log.set_x(values[i]) log.synthesize(code) spu.wrch(result, dma.SPU_WrOutMbox) spe_id = proc.execute(code, mode = 'async') x = 1 for i in range(N): while synspu.spu_exec.stat_out_mbox(spe_id) == 0: pass print 'log said: 0x%08X (%d)' %(synspu.spu_exec.read_out_mbox(spe_id), x) x = x * 10 proc.join(spe_id) return
def spu_mfcdma32(code, r_ls, r_ea, r_size, r_tagid, cmd):
    """
    Append the channel writes for an MFC DMA command to `code`.

    The four register arguments are written to MFC_LSA, MFC_EAL, MFC_Size
    and MFC_TagID in that order.  `cmd` is loaded into a temporary
    register obtained from code.prgm, written to MFC_Cmd last, and the
    register is returned to code.prgm.

    Returns the result of code.add for the final MFC_Cmd write.
    """
    cmd_reg = code.prgm.acquire_register()
    util.load_word(code, cmd_reg, cmd)

    # Parameter channels first; the command channel write goes last.
    parameter_writes = (
        (r_ls, MFC_LSA),
        (r_ea, MFC_EAL),
        (r_size, MFC_Size),
        (r_tagid, MFC_TagID),
    )
    for reg, channel in parameter_writes:
        code.add(spu.wrch(reg, channel))

    issued = code.add(spu.wrch(cmd_reg, MFC_Cmd))
    code.prgm.release_register(cmd_reg)
    return issued
def synthesize(self, code):
    """
    Emit the variable-based popc program into `code`.

    Three var.Word variables are created, the input is taken from
    register 5 via spu.ai.ex(5, 0), self.popc and self.reduce_word are
    applied, and the reduced value is written to dma.SPU_WrOutMbox.  The
    previously active instruction stream is restored afterwards.
    """
    saved_code = spu.get_active_code()
    spu.set_active_code(code)

    # Working variables, all initialized to zero.
    bit_counts = var.Word(0)
    total = var.Word(0)
    vec = var.Word(0)

    # 'Load' the input vector from register 5.
    vec.v = spu.ai.ex(5, 0)

    # Count, then reduce into a single result.
    self.popc(bit_counts, vec)
    self.reduce_word(total, bit_counts)

    # Hand the result to the caller through the mailbox channel.
    spu.wrch(total, dma.SPU_WrOutMbox)

    spu.set_active_code(saved_code)
    return
def save_register(self, reg):
    """
    Emit code into the active instruction stream that stores `reg` into
    the buffer addressed through self.ls_buffer, then conditionally runs
    self.save_ls_buffer.

    Two values are rotated out of self.ls_buffer (by 4 and 8 bytes) and
    used as the store offset and the comparison size.  After the store
    the offset is advanced by 16 and compared with the size.

    NOTE: unfinished in the original source -- the author marked the
    three values written to the mailbox below as wrong.
    """
    code = spu.get_active_code()

    r_offset = code.acquire_register()
    r_size = code.acquire_register()
    r_equal = code.acquire_register()

    spu.rotqbyi(r_offset, self.ls_buffer, 4)
    spu.rotqbyi(r_size, self.ls_buffer, 8)

    spu.stqx(reg, self.ls_buffer, r_offset)
    spu.ai(r_offset, r_offset, 16)
    spu.ceq(r_equal, r_offset, r_size)

    # Debug output of the bookkeeping values.
    for debug_reg in (r_size, r_offset, r_equal):
        spu.wrch(debug_reg, dma.SPU_WrOutMbox)
    # !!! STOPPED HERE !!! THESE VALUES ARE WRONG !!!

    # Remember where the branch belongs; the stop emitted here is
    # overwritten by the brz once the length of the save code is known.
    branch_index = code.size()
    spu.stop(0xB)
    self.save_ls_buffer(ls_size=r_size)
    spu.nop(0)

    skip = code.size() - branch_index
    code[branch_index] = spu.brz(r_equal, skip, ignore_active=True)

    code.release_registers([r_offset, r_size, r_equal])
    return
def save_register(self, reg):
    """
    Emit code into the active instruction stream that stores `reg` at the
    current offset of the buffer referenced by self.ls_buffer and guards
    a call to self.save_ls_buffer with a branch.

    The offset and size are obtained by rotating self.ls_buffer by 4 and
    8 bytes.  Following the store, the offset is incremented by 16 and
    compared against the size with spu.ceq.

    NOTE: unfinished in the original source -- the author marked the
    three values written to the mailbox below as wrong.
    """
    code = spu.get_active_code()

    cur_offset = code.acquire_register()
    buf_size = code.acquire_register()
    at_end = code.acquire_register()
    scratch = [cur_offset, buf_size, at_end]

    spu.rotqbyi(cur_offset, self.ls_buffer, 4)
    spu.rotqbyi(buf_size, self.ls_buffer, 8)

    spu.stqx(reg, self.ls_buffer, cur_offset)
    spu.ai(cur_offset, cur_offset, 16)
    spu.ceq(at_end, cur_offset, buf_size)

    # Debug output of the bookkeeping values.
    spu.wrch(buf_size, dma.SPU_WrOutMbox)
    spu.wrch(cur_offset, dma.SPU_WrOutMbox)
    spu.wrch(at_end, dma.SPU_WrOutMbox)
    # !!! STOPPED HERE !!! THESE VALUES ARE WRONG !!!

    # The stop is a stand-in; its slot is replaced by the brz below once
    # the distance to the end of the save code can be computed.
    patch_at = code.size()
    spu.stop(0xB)
    self.save_ls_buffer(ls_size=buf_size)
    spu.nop(0)

    distance = code.size() - patch_at
    code[patch_at] = spu.brz(at_end, distance, ignore_active=True)

    code.release_registers(scratch)
    return
def spu_mfcdma64(code, r_ls, r_eah, r_eal, r_size, r_tagid, cmd):
    """
    Append the channel writes for an MFC DMA command with separate
    high/low effective-address registers to `code`.

    The register arguments are written to MFC_LSA, MFC_EAH, MFC_EAL,
    MFC_Size and MFC_TagID in that order.  `cmd` is loaded into a
    temporary register and written to MFC_Cmd last.

    Returns the result of code.add for the final MFC_Cmd write.
    """
    r_cmd = code.prgm.acquire_register()
    util.load_word(code, r_cmd, cmd)

    code.add(spu.wrch(r_ls, MFC_LSA))
    code.add(spu.wrch(r_eah, MFC_EAH))
    code.add(spu.wrch(r_eal, MFC_EAL))
    code.add(spu.wrch(r_size, MFC_Size))
    code.add(spu.wrch(r_tagid, MFC_TagID))
    last = code.add(spu.wrch(r_cmd, MFC_Cmd))

    # Release through the same allocator the register came from.  The
    # original released via `code` after acquiring via `code.prgm`.
    code.prgm.release_register(r_cmd)
    return last
def TestSignal(): import corepy.arch.spu.platform as synspu code = synspu.InstructionStream() # Get a signal from the PPU reg = spu_read_signal1(code) # And send it back code.add(spu.wrch(reg, SPU_WrOutMbox)) proc = synspu.Processor() spe_id = proc.execute(code, async=True) synspu.spu_exec.write_signal(spe_id, 1, 0xCAFEBABEl) while synspu.spu_exec.stat_out_mbox(spe_id) == 0: pass print 'sig said: 0x%X' % (synspu.spu_exec.read_out_mbox(spe_id)) proc.join(spe_id) return
def TestSignal(): code = synspu.InstructionStream() # Get a signal from the PPU reg = spu_read_signal1(code) # And send it back code.add(spu.wrch(reg, SPU_WrOutMbox)) proc = synspu.Processor() spe_id = proc.execute(code, async=True) synspu.spu_exec.write_signal(spe_id, 1, 0xCAFEBABEl) while synspu.spu_exec.stat_out_mbox(spe_id) == 0: pass print 'sig said: 0x%X' % (synspu.spu_exec.read_out_mbox(spe_id)) proc.join(spe_id) return
def dump_regs(self): mbox = 28 # write out mbox channel # Pseudo-code: # 1) Save code is: (do this as an array, not an instruction stream) save_size = 128 * 2 + 4 save_code = extarray.extarray('I', range(save_size)) for i in range(0, 128 * 2, 2): save_code[i] = spu.wrch(i / 2, mbox, ignore_active = True).render() save_code[i + 1] = spu.stop(0x6, ignore_active = True).render() # branch back to the debug stop save_code[128 * 2] = spu.stop(0x7, ignore_active = True).render() ret = spu.bra(self.debug_lsa, ignore_active = True) save_code[128 * 2 + 1] = ret.render() #aligned_save_code = aligned_memory(save_size, typecode = 'I') #aligned_save_code.copy_to(save_code.buffer_info()[0], len(save_code)) # 2) Save lsa[0:len(save_code)] # TODO: do this with putb # 3) Push save code to lsa[0:] tag = 2 spu_exec.spu_getb(self.spe_id, 0, save_code.buffer_info()[0], save_size * 4, tag, 0, 0) spu_exec.read_tag_status_all(self.spe_id, 1 << tag); # 3) Replace the debug branch with a branch to 0 self.replace(self.debug_branch, spu.bra(0, ignore_active = True)) self.get_instructions() # 4) Resume self.resume(self.spe_id) # 5) Read the register values and send the ok signal regs = [] for i in range(128): while spu_exec.stat_out_mbox(self.spe_id) == 0: pass value = spu_exec.read_out_mbox(self.spe_id) regs.append(value) r = spu_exec.wait_stop_event(self.spe_id) self.resume(self.spe_id) r = spu_exec.wait_stop_event(self.spe_id) print 'next stop', r # 6) Restore code at original pc self.restore(self.debug_branch) self.get_instructions() # 7) Restore lsa[0:len(save_code)] # TODO: do this with putb # 8) Resume # self.resume(self.spe_id) # r = spu_exec.wait_stop_event(self.spe_id) self.resume(self.spe_id) r = self.wait_debug() return regs
def TestTanimotoBlock(n_vecs = 4): code = synspu.InstructionStream() proc = synspu.Processor() code.set_debug(True) spu.set_active_code(code) tb = TanimotoBlock() ls_save = LocalSave() mm_save = MemorySave() code.set_debug(True) # Input block parameters m = 128 n = 64 # n_vecs = 9 n_bits = 128 * n_vecs # Main memory results buffer # max_results = 2**16 max_results = 16384 words_per_result = 4 mm_results_data = array.array('I', [12 for i in range(max_results * words_per_result)]) #mm_results_buffer = synspu.aligned_memory(max_results * words_per_result, typecode = 'I') # mm_results_buffer.copy_to(mm_results_data.buffer_info()[0], len(mm_results_data)) mm_results = spuiter.memory_desc('I') #mm_results.from_array(mm_results_buffer) mm_results.from_array(mm_results_data) mm_save.set_md_save_buffer(mm_results) # Local Results buffer buffer_size = var.SignedWord(16384) buffer_addr = var.SignedWord(m * n * n_vecs * 4) ls_results = spuiter.memory_desc('B') ls_results.set_size_reg(buffer_size) ls_results.set_addr_reg(buffer_addr) ls_save.set_md_results(ls_results) ls_save.set_mm_save_op(mm_save) # Setup the TanimotoBlock class tb.set_n_bits(n_bits) tb.set_block_size(m, n) tb.set_x_addr(0) tb.set_y_addr(m * n_vecs * 16) tb.set_save_op(ls_save) # Main test loop n_samples = 10000 for samples in spuiter.syn_iter(code, n_samples): tb.synthesize(code) spu.wrch(buffer_size, dma.SPU_WrOutMbox) spu.stop(0x2000) # "Function" Calls ls_save.block() mm_save.block() # code.print_code() start = time.time() spe_id = proc.execute(code, async=True) while synspu.spu_exec.stat_out_mbox(spe_id) == 0: pass # print 'tb said: 0x%X' % (synspu.spu_exec.read_out_mbox(spe_id)) stop = time.time() # mm_results_buffer.copy_from(mm_results_data.buffer_info()[0], len(mm_results_data)) proc.join(spe_id) total = stop - start bits_sec = (m * n * n_bits * n_samples) / total / 1e9 ops_per_compare = 48 * 4 + 8 # 48 SIMD instructions, 8 scalar insts_per_compare = 56 gops = (m * n * n_vecs * n_samples * 
ops_per_compare ) / total / 1e9 ginsts = (m * n * n_vecs * n_samples * insts_per_compare ) / total / 1e9 print '%.6f sec, %.2f Gbits/sec, %.2f GOps, %.2f GInsts, %d insts' % ( total, bits_sec, gops, ginsts, code.size()) return
def post_cleanup(self, code):
    """
    Emit a write of self._count to the dma.SPU_WrOutMbox channel.

    The `code` argument is not used; the instruction is emitted through
    the spu module's wrch call.
    """
    out_channel = dma.SPU_WrOutMbox
    spu.wrch(self._count, out_channel)
    return
def dump_regs(self): mbox = 28 # write out mbox channel # Pseudo-code: # 1) Save code is: (do this as an array, not an instruction stream) save_size = 128 * 2 + 4 save_code = extarray.extarray('I', range(save_size)) for i in range(0, 128 * 2, 2): save_code[i] = spu.wrch(i / 2, mbox, ignore_active=True).render() save_code[i + 1] = spu.stop(0x6, ignore_active=True).render() # branch back to the debug stop save_code[128 * 2] = spu.stop(0x7, ignore_active=True).render() ret = spu.bra(self.debug_lsa, ignore_active=True) save_code[128 * 2 + 1] = ret.render() #aligned_save_code = aligned_memory(save_size, typecode = 'I') #aligned_save_code.copy_to(save_code.buffer_info()[0], len(save_code)) # 2) Save lsa[0:len(save_code)] # TODO: do this with putb # 3) Push save code to lsa[0:] tag = 2 spu_exec.spu_getb(self.spe_id, 0, save_code.buffer_info()[0], save_size * 4, tag, 0, 0) spu_exec.read_tag_status_all(self.spe_id, 1 << tag) # 3) Replace the debug branch with a branch to 0 self.replace(self.debug_branch, spu.bra(0, ignore_active=True)) self.get_instructions() # 4) Resume self.resume(self.spe_id) # 5) Read the register values and send the ok signal regs = [] for i in range(128): while spu_exec.stat_out_mbox(self.spe_id) == 0: pass value = spu_exec.read_out_mbox(self.spe_id) regs.append(value) r = spu_exec.wait_stop_event(self.spe_id) self.resume(self.spe_id) r = spu_exec.wait_stop_event(self.spe_id) print 'next stop', r # 6) Restore code at original pc self.restore(self.debug_branch) self.get_instructions() # 7) Restore lsa[0:len(save_code)] # TODO: do this with putb # 8) Resume # self.resume(self.spe_id) # r = spu_exec.wait_stop_event(self.spe_id) self.resume(self.spe_id) r = self.wait_debug() return regs