def _align_stream(self, length, align):
    """
    Return the nop/lnop instructions needed to pad a stream to an alignment.

    length -- current size of the stream in bytes.  It is converted to a
              count of 4-byte instructions with floor division.
    align  -- required alignment in bytes; must be a multiple of 4.

    Returns a list of instructions.  The list is empty when the stream is
    already aligned; otherwise it holds exactly as many instructions as are
    needed to reach the next `align` boundary.  The same nop and lnop
    instruction objects are repeated in the list.

    Raises Exception if align is not a multiple of 4 bytes.
    """
    if align % 4 != 0:
        raise Exception("SPU alignment must be a multiple of 4 bytes")

    # Work in units of 4-byte instructions.  Floor division keeps the counts
    # integral even when true division is in effect.
    length //= 4
    align //= 4

    # Instructions needed to reach the next boundary.  The outer modulo maps
    # the already-aligned case to 0 rather than to a whole block of padding.
    mod = (align - (length % align)) % align
    if mod == 0:
        return []

    ret = []
    if mod % 2 == 0:
        # issue mod / 2 nop/lnop pairs
        nop_pair = (spu.nop(self.r_zero, ignore_active=True),
                    spu.lnop(ignore_active=True))
        for i in xrange(0, mod // 2):
            ret.extend(nop_pair)
    else:
        # issue (mod - 1) / 2 lnop/nop pairs, then one trailing lnop; the
        # resulting sequence is lnop, nop, lnop, ..., lnop
        nop_pair = (spu.lnop(ignore_active=True),
                    spu.nop(self.r_zero, ignore_active=True))
        for i in xrange(0, mod // 2):
            ret.extend(nop_pair)
        ret.append(spu.lnop(ignore_active=True))

    return ret
def _align_stream(self, length, align):
    """
    Build the nop/lnop padding that brings a stream up to an alignment.

    length -- current size of the stream in bytes.  It is converted to a
              count of 4-byte instructions with floor division.
    align  -- required alignment in bytes; must be a multiple of 4.

    Returns a list of instructions.  An already-aligned stream yields an
    empty list; otherwise the list holds exactly the number of instructions
    needed to reach the next `align` boundary.  The same nop and lnop
    instruction objects are repeated in the list.

    Raises Exception if align is not a multiple of 4 bytes.
    """
    if align % 4 != 0:
        raise Exception("SPU alignment must be a multiple of 4 bytes")

    # Count in 4-byte instructions.  Explicit floor division does not depend
    # on the division mode in effect for the module.
    length //= 4
    align //= 4

    # Distance to the next boundary; the outer modulo turns "already
    # aligned" into 0 instead of a full `align` worth of padding.
    mod = (align - (length % align)) % align
    if mod == 0:
        return []

    ret = []
    if mod % 2 == 0:
        # issue mod / 2 nop/lnop pairs
        nop_pair = (spu.nop(self.r_zero, ignore_active=True),
                    spu.lnop(ignore_active=True))
        for i in xrange(0, mod // 2):
            ret.extend(nop_pair)
    else:
        # issue (mod - 1) / 2 lnop/nop pairs, then one trailing lnop; the
        # resulting sequence is lnop, nop, lnop, ..., lnop
        nop_pair = (spu.lnop(ignore_active=True),
                    spu.nop(self.r_zero, ignore_active=True))
        for i in xrange(0, mod // 2):
            ret.extend(nop_pair)
        ret.append(spu.lnop(ignore_active=True))

    return ret
def save_register(self, reg): # , branch_to_save = False):
    """
    Emit SPU code that stores `reg` into the local-store buffer.

    The instructions are added to the currently active code stream.  Three
    temporary registers are acquired for the sequence and released again
    before returning.

    NOTE: this routine is unfinished -- see the "STOPPED HERE" marker below.
    The three mailbox writes look like debugging output left in place.
    """
    code = spu.get_active_code()

    # Temporaries: running offset, buffer size, and comparison result.
    offset = code.acquire_register()
    size = code.acquire_register()
    test = code.acquire_register()
    regs = [offset, size, test]

    # Rotate self.ls_buffer by 4 and by 8 bytes to obtain the offset and the
    # size -- presumably both are packed in the same quadword as the buffer
    # address; TODO confirm the layout of ls_buffer.
    spu.rotqbyi(offset, self.ls_buffer, 4)
    spu.rotqbyi(size, self.ls_buffer, 8)

    # Store the register, indexed by ls_buffer and offset, then advance the
    # offset by 16.
    # NOTE(review): the advanced offset is not written back into
    # self.ls_buffer anywhere in this function.
    spu.stqx(reg, self.ls_buffer, offset)
    spu.ai(offset, offset, 16)

    # test <- (offset == size)
    spu.ceq(test, offset, size)

    # Write the three working values to the outbound mailbox channel.
    spu.wrch(size, dma.SPU_WrOutMbox)
    spu.wrch(offset, dma.SPU_WrOutMbox)
    spu.wrch(test, dma.SPU_WrOutMbox)
    # !!! STOPPED HERE !!! THESE VALUES ARE WRONG !!!

    # Remember where the next instruction lands.  The stop emitted there is
    # only a placeholder: it is overwritten with a branch further down.
    lbl_ls_full = code.size()
    spu.stop(0xB)

    self.save_ls_buffer(ls_size = size)

    spu.nop(0)

    # Replace the placeholder with a brz on `test` whose offset is the
    # distance from the placeholder to the current end of the stream, i.e.
    # past the save_ls_buffer code and the nop emitted above.
    code[lbl_ls_full] = spu.brz(test, (code.size() - lbl_ls_full), ignore_active = True)

    code.release_registers(regs)
    return
def save_register(self, reg): # , branch_to_save = False):
    """
    Emit SPU code that stores `reg` into the local-store buffer.

    The instructions are added to the currently active code stream.  Three
    temporary registers are acquired for the sequence and released again
    before returning.

    NOTE: this routine is unfinished -- see the "STOPPED HERE" marker below.
    The three mailbox writes look like debugging output left in place.
    """
    code = spu.get_active_code()

    # Temporaries: running offset, buffer size, and comparison result.
    offset = code.acquire_register()
    size = code.acquire_register()
    test = code.acquire_register()
    regs = [offset, size, test]

    # Rotate self.ls_buffer by 4 and by 8 bytes to obtain the offset and the
    # size -- presumably both are packed in the same quadword as the buffer
    # address; TODO confirm the layout of ls_buffer.
    spu.rotqbyi(offset, self.ls_buffer, 4)
    spu.rotqbyi(size, self.ls_buffer, 8)

    # Store the register, indexed by ls_buffer and offset, then advance the
    # offset by 16.
    # NOTE(review): the advanced offset is not written back into
    # self.ls_buffer anywhere in this function.
    spu.stqx(reg, self.ls_buffer, offset)
    spu.ai(offset, offset, 16)

    # test <- (offset == size)
    spu.ceq(test, offset, size)

    # Write the three working values to the outbound mailbox channel.
    spu.wrch(size, dma.SPU_WrOutMbox)
    spu.wrch(offset, dma.SPU_WrOutMbox)
    spu.wrch(test, dma.SPU_WrOutMbox)
    # !!! STOPPED HERE !!! THESE VALUES ARE WRONG !!!

    # Remember where the next instruction lands.  The stop emitted there is
    # only a placeholder: it is overwritten with a branch further down.
    lbl_ls_full = code.size()
    spu.stop(0xB)

    self.save_ls_buffer(ls_size=size)

    spu.nop(0)

    # Replace the placeholder with a brz on `test` whose offset is the
    # distance from the placeholder to the current end of the stream, i.e.
    # past the save_ls_buffer code and the nop emitted above.
    code[lbl_ls_full] = spu.brz(test, (code.size() - lbl_ls_full), ignore_active=True)

    code.release_registers(regs)
    return
def TestDebug(): prgm = Program() code = prgm.get_stream() proc = DebugProcessor() spu.set_active_code(code) ra = code.acquire_register() rb = code.acquire_register() rc = code.acquire_register() rd = code.acquire_register() re = code.acquire_register() rf = code.acquire_register() rg = code.acquire_register() rh = code.acquire_register() spu.ai(ra, 0, 14) spu.ai(rb, 0, 13) spu.ai(rc, 0, 14) spu.brnz(14, 3) spu.ai(rd, 0, 15) spu.ai(re, 0, 16) spu.ai(rf, 0, 17) spu.ai(rg, 0, 18) spu.ai(rh, 0, 19) spu.nop(0) spu.stop(0x200A) prgm += code r = proc.execute(prgm) # , debug = True) r = proc.nexti() r = proc.nexti() r = proc.nexti() r = proc.nexti() while r != None: r = proc.nexti() if r is not None: regs = proc.dump_regs() print '******', regs[122:] assert(r == None) print 'int result:', r # while True: # pass return
def TestDebug(): prgm = Program() code = prgm.get_stream() proc = DebugProcessor() spu.set_active_code(code) ra = code.acquire_register() rb = code.acquire_register() rc = code.acquire_register() rd = code.acquire_register() re = code.acquire_register() rf = code.acquire_register() rg = code.acquire_register() rh = code.acquire_register() spu.ai(ra, 0, 14) spu.ai(rb, 0, 13) spu.ai(rc, 0, 14) spu.brnz(14, 3) spu.ai(rd, 0, 15) spu.ai(re, 0, 16) spu.ai(rf, 0, 17) spu.ai(rg, 0, 18) spu.ai(rh, 0, 19) spu.nop(0) spu.stop(0x200A) prgm += code r = proc.execute(prgm) # , debug = True) r = proc.nexti() r = proc.nexti() r = proc.nexti() r = proc.nexti() while r != None: r = proc.nexti() if r is not None: regs = proc.dump_regs() print '******', regs[122:] assert (r == None) print 'int result:', r # while True: # pass return
def synthesize(self): # Okay. This code is not going to exceed 256 instructions (1kb). Knowing that, # the register contents can be safely placed at 0x3F400 in localstore, 3kb from # the top. The SPRE will place the instruction stream as close to the top as # possible. But since it is not going to be more than 1kb worth of instructions, # it will not overlap with the register contents. code = self.code spu.set_active_code(code) # Reload the instructions spu.sync(1) # Next instruction to execute lbl_op = code.size() spu.nop(0) # Placeholders for register store instructions for i in range(128): spu.stqa(i, 0xFD00 + (i * 4)) # spu.stqa(i, 0xFE00 + (i * 4)) # Stop for next command spu.stop(0x0FFF) lbl_regs = code.size() # Create space for the saved registers #for i in range(128): # # 16 bytes/register # spu.nop(0) # spu.lnop() # spu.nop(0) # spu.lnop() # Clearing active code here is important! spu.set_active_code(None) code.cache_code() code_size = len(code._prologue._code) * 4 self.xfer_size = code_size + (16 - (code_size) % 16) print 'xfer_size:', self.xfer_size self.code_lsa = (0x3FFFF - code_size) & 0xFFF80 self.lbl_op = lbl_op return
def synthesize(self): # Okay. This code is not going to exceed 256 instructions (1kb). Knowing that, # the register contents can be safely placed at 0x3F400 in localstore, 3kb from # the top. The SPRE will place the instruction stream as close to the top as # possible. But since it is not going to be more than 1kb worth of instructions, # it will not overlap with the register contents. code = self.code spu.set_active_code(code) # Reload the instructions spu.sync(1) # Next instruction to execute lbl_op = code.size() spu.nop(0) # Placeholders for register store instructions for i in range(128): spu.stqa(i, 0xFD00 + (i * 4)) # spu.stqa(i, 0xFE00 + (i * 4)) # Stop for next command spu.stop(0x0FFF) lbl_regs = code.size() # Create space for the saved registers #for i in range(128): # # 16 bytes/register # spu.nop(0) # spu.lnop() # spu.nop(0) # spu.lnop() # Clearing active code here is important! spu.set_active_code(None) code.cache_code() code_size = len(code._prologue._code) * 4 self.xfer_size = code_size + (16 - (code_size) % 16); print 'xfer_size:', self.xfer_size self.code_lsa = (0x3FFFF - code_size) & 0xFFF80; self.lbl_op = lbl_op return
def align_code(self, boundary):
    """
    Insert the appropriate nop/lnops to align the next instruction on the
    byte boundary.  boundary must be a multiple of four.

    Padding is added one instruction at a time with the optimizer
    overridden: a nop when the current stream length is even, an lnop when
    it is odd.  Nothing is added when the stream is already aligned.

    A boundary smaller than 4 gives a word alignment of zero and raises
    ZeroDivisionError.
    """
    # Alignment in 4-byte instruction words.  Explicit floor division keeps
    # the value integral whatever division mode is in effect.
    word_align = boundary // 4

    while len(self._code) % word_align:
        if len(self._code) % 2 == 0:
            self.add(spu.nop(0), True)
        else:
            self.add(spu.lnop(0), True)

    return
def block(self):
    """
    Emit the block that records one result and jumps back to the loop.

    The slot at self._branch_idx is patched (currently with a nop).  The
    score and the y and x offsets are then packed into self._save_value and
    stored indexed by self._count and the results address.  The count is
    advanced by 16, compared with the results size, and reset through a
    select.  Finally a branch back to the instruction after
    self._branch_idx is emitted.
    """
    stream = spu.get_active_code()
    self._block_idx = len(stream)

    # --> add the branch instruction (use brz (?) to always branch, nop to never branch)
    stream[self._branch_idx] = spu.nop(0, ignore_active=True)
    # stream[self._branch_idx] = spu.brnz(self._cmp, self._block_idx - self._branch_idx, ignore_active = True)
    # stream[self._branch_idx] = spu.brz(self._cmp, self._block_idx - self._branch_idx, ignore_active = True)

    # Pack result into vector
    #   [x][y][score][--]
    packed = self._save_value
    mask = self._word_mask

    # Start from zero and select in the score ...
    spu.xor(packed, packed, packed)
    spu.selb(packed, packed, self._score, mask)

    # ... then rotate by 12 bytes and select in y, and again for x.
    for field in (self._y_off, self._x_off):
        spu.rotqbyi(packed, packed, 12)
        spu.selb(packed, packed, field, mask)

    # Save value to local store
    spu.stqx(packed, self._count, self._md_results.r_addr)
    self._count.v = self._count.v + 16

    # --> MemorySave test
    is_full = packed  # the save register is reused for the comparison
    spu.ceq.ex(is_full, self._count, self._md_results.r_size)

    if self._save_op is not None:
        self._save_op.test(is_full, self._count)

    # Just reset for now
    spu.selb(self._count, self._count, 0, is_full)

    # Return to the loop
    here = len(stream)
    spu.br(-(here - self._branch_idx - 1))
    return
def block(self):
    """
    Template for the block reached through the conditional branch.

    The slot at self._branch_idx is patched with a brnz on self._cmp that
    jumps forward to the start of this block.  After the (still empty) body
    a branch back to the instruction after self._branch_idx is emitted.
    """
    stream = spu.get_active_code()
    self._block_idx = len(stream)

    # --> add the branch instruction
    patch_at = self._branch_idx
    stream[patch_at] = spu.nop(0, ignore_active=True)
    stream[patch_at] = spu.brnz(self._cmp,
                                self._block_idx - patch_at,
                                ignore_active=True)

    # FILL IN HERE

    # Return to the loop
    here = len(stream)
    spu.br(-(here - patch_at - 1))
    return
def block(self):
    """
    Emit the result-recording block and the jump back into the loop.

    The slot at self._branch_idx is patched (currently with a nop).  The
    score and the y and x offsets are then packed into self._save_value and
    stored indexed by self._count and the results address.  The count is
    advanced by 16, compared with the results size, and reset through a
    select.  Finally a branch back to the instruction after
    self._branch_idx is emitted.
    """
    stream = spu.get_active_code()
    self._block_idx = len(stream)

    # --> add the branch instruction (use brz (?) to always branch, nop to never branch)
    stream[self._branch_idx] = spu.nop(0, ignore_active=True)
    # stream[self._branch_idx] = spu.brnz(self._cmp, self._block_idx - self._branch_idx, ignore_active = True)
    # stream[self._branch_idx] = spu.brz(self._cmp, self._block_idx - self._branch_idx, ignore_active = True)

    # Pack result into vector
    #   [x][y][score][--]
    packed = self._save_value
    mask = self._word_mask

    # Start from zero and select in the score ...
    spu.xor(packed, packed, packed)
    spu.selb(packed, packed, self._score, mask)

    # ... then rotate by 12 bytes and select in y, and again for x.
    for field in (self._y_off, self._x_off):
        spu.rotqbyi(packed, packed, 12)
        spu.selb(packed, packed, field, mask)

    # Save value to local store
    spu.stqx(packed, self._count, self._md_results.r_addr)
    self._count.v = self._count.v + 16

    # --> MemorySave test
    is_full = packed  # the save register is reused for the comparison
    spu.ceq.ex(is_full, self._count, self._md_results.r_size)

    if self._save_op is not None:
        self._save_op.test(is_full, self._count)

    # Just reset for now
    spu.selb(self._count, self._count, 0, is_full)

    # Return to the loop
    here = len(stream)
    spu.br(-(here - self._branch_idx - 1))
    return
def _startSPU(self):
    """
    Allocate an SPU context, run a one-nop stream on it, and map local store.

    Sets self.ctx to the new context and self.localstore to an extarray of
    unsigned ints pointed at the context's local-store memory.
    """
    ls_bytes = 262144  # same value as 0x40000

    ctx = env.spu_exec.alloc_context()
    self.ctx = ctx

    # Execute a no-op instruction stream so the prolog is executed
    stream = env.InstructionStream()
    stream.add(spu.nop(stream.r_zero))
    stream.cache_code()

    unit = stream.render_code.itemsize
    nbytes = len(stream.render_code) * unit

    # Pad the size up to a multiple of 16 bytes.
    leftover = nbytes % 16
    if leftover != 0:
        nbytes += 16 - leftover

    # Load address: the stream is placed so that it ends at ls_bytes.
    lsa = ls_bytes - nbytes

    env.spu_exec.run_stream(ctx, stream.inst_addr(), nbytes, lsa, lsa)

    self.localstore = extarray.extarray('I', ls_bytes / 4)
    self.localstore.set_memory(ctx.spuls)
    return
def _startSPU(self): self.ctx = ctx = env.spu_exec.alloc_context() # Execute a no-op instruction stream so the prolog is executed prgm = env.Program() code = prgm.get_stream() code.add(spu.nop(code.r_zero)) prgm.cache_code() itemsize = prgm.render_code.itemsize code_len = len(prgm.render_code) * itemsize if code_len % 16 != 0: code_len += 16 - (code_len % 16) code_lsa = 0x40000 - code_len env.spu_exec.run_stream(ctx, prgm.inst_addr(), code_len, code_lsa, code_lsa) self.localstore = extarray.extarray('I', 262144 / 4) print "spuls %x" % (ctx.spuls), ctx.spuls, type(ctx.spuls) self.localstore.set_memory(ctx.spuls, 262144) return
def add(self, inst, optimize_override=False):
    """
    Append `inst` to the stream, padding for slot parity when optimizing.

    With optimization active (self._optimize set and optimize_override
    false), inst.cycles[0] selects the slot parity: 0 wants an even stream
    position, 1 an odd one.  If the next position has the wrong parity, a
    nop (at an even position) or an lnop (at an odd position) is added
    first.  An instruction whose cycles[0] is neither 0 nor 1 is not added
    on this path.

    Without optimization the instruction is passed straight to the base
    class.  The cache is invalidated in every case.

    Returns the stream length after the call.
    """
    if optimize_override or not self._optimize:
        spe.InstructionStream.add(self, inst)
    else:
        # NOTE(review): this path calls InstructionStream.add while the
        # other calls spe.InstructionStream.add -- confirm both names refer
        # to the intended base class.
        pipeline = inst.cycles[0]
        if pipeline == 0 or pipeline == 1:
            slot = len(self._code) % 2
            if slot != pipeline:
                # Wrong parity: fill the current slot first.
                if slot == 0:
                    filler = spu.nop(0)
                else:
                    filler = spu.lnop(0)
                InstructionStream.add(self, filler)
            InstructionStream.add(self, inst)

    # Invalidate the cache
    self._cached = False
    return len(self._code)
def add(self, inst, optimize_override = False):
    """
    Add `inst` to the stream, inserting a filler when slot parity is wrong.

    With optimization active (self._optimize set and optimize_override
    false), inst.cycles[0] selects the slot parity: 0 wants an even stream
    position, 1 an odd one.  If the next position has the wrong parity, a
    nop (at an even position) or an lnop (at an odd position) is added
    first.  An instruction whose cycles[0] is neither 0 nor 1 is not added
    on this path.

    Without optimization the instruction is passed straight to the base
    class.  The cache is invalidated in every case.

    Returns the stream length after the call.
    """
    if optimize_override or not self._optimize:
        spe.InstructionStream.add(self, inst)
    else:
        # NOTE(review): this path calls InstructionStream.add while the
        # other calls spe.InstructionStream.add -- confirm both names refer
        # to the intended base class.
        pipeline = inst.cycles[0]
        if pipeline == 0 or pipeline == 1:
            slot = len(self._code) % 2
            if slot != pipeline:
                # Wrong parity: fill the current slot first.
                if slot == 0:
                    filler = spu.nop(0)
                else:
                    filler = spu.lnop(0)
                InstructionStream.add(self, filler)
            InstructionStream.add(self, inst)

    # Invalidate the cache
    self._cached = False
    return len(self._code)