def set_slot_value(code, reg, slot, value):
    """
    Set the value in reg[slot] with value.

    If value is a register, use the value from the preferred slot
    (value[0]).  If value is a constant, load it into reg[slot],
    preserving the values in the other slots.

    Parameters:
      code  -- instruction stream the generated instructions are added to;
               temporary registers are acquired from code.prgm
      reg   -- register whose word slot is overwritten in place
      slot  -- word slot index; must be 0, 1, 2 or 3
      value -- an spe.Register / spe.Variable, or a constant that is loaded
               into a temporary register with load_word

    Raises:
      Exception -- if slot is not one of 0, 1, 2, 3
    """
    # Validate first so an invalid slot is reported before anything else
    # is touched.
    if slot not in (0, 1, 2, 3):
        # str() is required here: slot is normally an int, and
        # "str + int" raises TypeError instead of the intended message.
        raise Exception("Invalid SIMD slot: " + str(slot))

    prgm = code.prgm

    # Select mask: all ones in word 0 (the preferred slot), zero elsewhere.
    mask = prgm.acquire_register()
    vector_from_array(code, mask, [0xFFFFFFFF, 0, 0, 0])

    # Constants need a temporary register; registers/variables are used
    # directly.  The check is computed once so acquire and release cannot
    # disagree.
    value_is_constant = not issubclass(type(value), (spe.Register, spe.Variable))
    if value_is_constant:
        r_value = prgm.acquire_register()
        load_word(code, r_value, value)
    else:
        r_value = value

    # Rotate the target slot into word 0, merge word 0 of r_value into it
    # through the mask, then rotate by the complementary byte count so the
    # words return to their original positions.
    code.add(spu.rotqbyi(reg, reg, slot * 4))
    code.add(spu.selb(reg, reg, r_value, mask))
    code.add(spu.rotqbyi(reg, reg, (4 - slot) * 4))

    prgm.release_register(mask)
    if value_is_constant:
        prgm.release_register(r_value)

    return
def set_slot_value(code, reg, slot, value):
    """
    Set the value in reg[slot] with value.

    If value is a register, use the value from the preferred slot
    (value[0]).  If value is a constant, load it into reg[slot],
    preserving the values in the other slots.

    Parameters:
      code  -- instruction stream the generated instructions are added to;
               temporary registers are acquired from code.prgm
      reg   -- register whose word slot is overwritten in place
      slot  -- word slot index; must be 0, 1, 2 or 3
      value -- an spe.Register / spe.Variable, or a constant that is loaded
               into a temporary register with load_word

    Raises:
      Exception -- if slot is not one of 0, 1, 2, 3
    """
    # Validate first so an invalid slot is reported before anything else
    # is touched.
    if slot not in (0, 1, 2, 3):
        # str() is required here: slot is normally an int, and
        # "str + int" raises TypeError instead of the intended message.
        raise Exception("Invalid SIMD slot: " + str(slot))

    prgm = code.prgm

    # Select mask: all ones in word 0 (the preferred slot), zero elsewhere.
    mask = prgm.acquire_register()
    vector_from_array(code, mask, [0xFFFFFFFF, 0, 0, 0])

    # Constants need a temporary register; registers/variables are used
    # directly.  The check is computed once so acquire and release cannot
    # disagree.
    value_is_constant = not issubclass(type(value), (spe.Register, spe.Variable))
    if value_is_constant:
        r_value = prgm.acquire_register()
        load_word(code, r_value, value)
    else:
        r_value = value

    # Rotate the target slot into word 0, merge word 0 of r_value into it
    # through the mask, then rotate by the complementary byte count so the
    # words return to their original positions.
    code.add(spu.rotqbyi(reg, reg, slot * 4))
    code.add(spu.selb(reg, reg, r_value, mask))
    code.add(spu.rotqbyi(reg, reg, (4 - slot) * 4))

    prgm.release_register(mask)
    if value_is_constant:
        prgm.release_register(r_value)

    return
def block(self):
    """
    Emit the result-saving block into the active instruction stream.

    Records the current stream length in self._block_idx, overwrites the
    instruction at self._branch_idx, packs x offset / y offset / score into
    self._save_value, stores it at self._md_results.r_addr indexed by
    self._count, advances the count by 16 and finally emits a branch back
    towards self._branch_idx.
    """
    stream = spu.get_active_code()
    self._block_idx = len(stream)

    # --> add the branch instruction
    #     (use brz (?) to always branch, nop to never branch)
    stream[self._branch_idx] = spu.nop(0, ignore_active=True)
    # stream[self._branch_idx] = spu.brnz(self._cmp, self._block_idx - self._branch_idx, ignore_active = True)
    # stream[self._branch_idx] = spu.brz(self._cmp, self._block_idx - self._branch_idx, ignore_active = True)

    packed = self._save_value
    word_mask = self._word_mask

    # Pack result into vector
    #   [x][y][score][--]

    # Zero the save value
    spu.xor(packed, packed, packed)

    # Copy the score
    spu.selb(packed, packed, self._score, word_mask)
    spu.rotqbyi(packed, packed, 12)

    # Copy the y value
    spu.selb(packed, packed, self._y_off, word_mask)
    spu.rotqbyi(packed, packed, 12)

    # Copy the x value
    spu.selb(packed, packed, self._x_off, word_mask)

    # Save value to local store
    spu.stqx(packed, self._count, self._md_results.r_addr)
    self._count.v = self._count.v + 16

    # --> MemorySave test
    # The save register is reused to hold the comparison result.
    full_flag = packed
    spu.ceq.ex(full_flag, self._count, self._md_results.r_size)

    save_op = self._save_op
    if save_op is not None:
        save_op.test(full_flag, self._count)

    # Just reset for now
    spu.selb(self._count, self._count, 0, full_flag)

    # Return to the loop
    here = len(stream)
    back_offset = -(here - self._branch_idx - 1)
    spu.br(back_offset)

    return
def block(self):
    """
    Emit the result-saving block into the active instruction stream.

    Records the current stream length in self._block_idx, overwrites the
    instruction at self._branch_idx, packs x offset / y offset / score into
    self._save_value, stores it at self._md_results.r_addr indexed by
    self._count, advances the count by 16 and finally emits a branch back
    towards self._branch_idx.
    """
    stream = spu.get_active_code()
    self._block_idx = len(stream)

    # --> add the branch instruction
    #     (use brz (?) to always branch, nop to never branch)
    stream[self._branch_idx] = spu.nop(0, ignore_active=True)
    # stream[self._branch_idx] = spu.brnz(self._cmp, self._block_idx - self._branch_idx, ignore_active = True)
    # stream[self._branch_idx] = spu.brz(self._cmp, self._block_idx - self._branch_idx, ignore_active = True)

    packed = self._save_value
    word_mask = self._word_mask

    # Pack result into vector
    #   [x][y][score][--]

    # Zero the save value
    spu.xor(packed, packed, packed)

    # Copy the score
    spu.selb(packed, packed, self._score, word_mask)
    spu.rotqbyi(packed, packed, 12)

    # Copy the y value
    spu.selb(packed, packed, self._y_off, word_mask)
    spu.rotqbyi(packed, packed, 12)

    # Copy the x value
    spu.selb(packed, packed, self._x_off, word_mask)

    # Save value to local store
    spu.stqx(packed, self._count, self._md_results.r_addr)
    self._count.v = self._count.v + 16

    # --> MemorySave test
    # The save register is reused to hold the comparison result.
    full_flag = packed
    spu.ceq.ex(full_flag, self._count, self._md_results.r_size)

    save_op = self._save_op
    if save_op is not None:
        save_op.test(full_flag, self._count)

    # Just reset for now
    spu.selb(self._count, self._count, 0, full_flag)

    # Return to the loop
    here = len(stream)
    back_offset = -(here - self._branch_idx - 1)
    spu.br(back_offset)

    return
def _transfer_data(self, code, kernel, lsa, tag):
    """
    Load the data into the SPU memory

    Emits, into `code`, a loop that calls `kernel` once per chunk of at
    most 16384 bytes, followed by a tag-mask write and a tag-status read.

    Parameters:
      code   -- spe.InstructionStream that receives the instructions; it is
                made the active code for the duration of the call
      kernel -- callable invoked as
                kernel(code, ls_addr, ea_load, transfer_size, tag_var)
                to emit the transfer for one chunk
      lsa    -- local store address: an int constant, or an
                spe.Register / spe.Variable holding it
      tag    -- tag number; loaded into a variable for the kernel and used
                as the bit index (1 << tag) for the tag mask

    Raises:
      Exception -- if code is not an InstructionStream, or lsa is not an
                   int, Register or Variable
    """

    # Check the types
    if not isinstance(code, spe.InstructionStream):
        raise Exception('Code must be an InstructionStream')
    if not (isinstance(lsa, int) or issubclass(type(lsa), (spe.Register, spe.Variable))):
        raise Exception('lsa must be an integer, Register, or Variable')

    old_code = spu.get_active_code()
    spu.set_active_code(code)

    # Acquire a register for the address if it was not supplied by the user.
    # (No register is acquired for the size: when self.r_size is None the
    # size is a compile-time constant loaded straight into aligned_size, so
    # a register acquired here would never be used or released.)
    if self.r_addr is None:
        r_ea_data = code.prgm.acquire_register()
    else:
        r_ea_data = self.r_addr

    # Create variables
    ea_addr = var.SignedWord(reg=r_ea_data)
    aligned_size = var.SignedWord(0)
    mod_16 = var.SignedWord(0xF)

    # Initialize the lsa_addr variable.
    if isinstance(lsa, int):
        # From a constant
        ls_addr = var.SignedWord(lsa)
    elif issubclass(type(lsa), (spe.Register, spe.Variable)):
        # From a variable
        ls_addr = var.SignedWord()
        ls_addr.v = lsa

    tag_var = var.SignedWord(tag)
    cmp = var.SignedWord(0)

    # Load the effective address
    if self.r_addr is None:
        if self.addr % 16 != 0:
            # Single-argument call form prints the same text on
            # Python 2 and Python 3.
            print('[get_memory] Misaligned data')

        util.load_word(code, ea_addr, self.addr)

    # Load the size, rounding up as required to be 16-byte aligned
    if self.r_size is None:
        rnd_size = self.size * var.INT_SIZES[self.typecode]
        if rnd_size < 16:
            rnd_size = 16
        elif (rnd_size % 16) != 0:
            rnd_size += (16 - (rnd_size % 16))
        util.load_word(code, aligned_size, rnd_size)
    else:
        # TODO: !!! UNIT TEST THIS !!!
        # Same as above, but using SPU arithmetic to round
        # NOTE(review): the comparison tests (size & 0xF) == size, which is
        # not the same condition as "size is a multiple of 16" used in the
        # constant path above -- confirm the result for sizes >= 16.
        size = var.SignedWord(reg=self.r_size)
        sixteen = var.SignedWord(16)
        cmp.v = ((size & mod_16) == size)
        aligned_size.v = size + (sixteen - (size & mod_16))
        spu.selb(aligned_size.reg, size.reg, aligned_size.reg, cmp.reg, order=_mi(spu.selb))
        code.release_register(sixteen.reg)

    # Use an auxillary register for the moving ea value if the
    # caller supplied the address register
    if self.r_addr is not None:
        ea_load = var.SignedWord(0)
        ea_load.v = ea_addr
    else:
        ea_load = ea_addr  # note that this is reference, not .v assignment

    # Transfer parameters
    buffer_size = var.SignedWord(16384)
    remaining = var.SignedWord(0)
    transfer_size = var.SignedWord(0)
    remaining.v = aligned_size

    # Set up the iterators to transfer at most 16k at a time
    xfer_iter = syn_iter(code, 0, 16384)
    xfer_iter.set_stop_reg(aligned_size.reg)

    for offset in xfer_iter:
        # transfer_size is selected between buffer_size and remaining
        # based on the buffer_size > remaining comparison.
        cmp.v = buffer_size > remaining
        spu.selb(transfer_size, buffer_size, remaining, cmp)

        # Transfer the data
        kernel(code, ls_addr, ea_load, transfer_size, tag_var)
        ls_addr.v = ls_addr + buffer_size
        ea_load.v = ea_load + buffer_size

        remaining.v = remaining - buffer_size

    # Set the tag bit to tag
    dma.mfc_write_tag_mask(code, 1 << tag)

    # Wait for the transfer to complete
    dma.mfc_read_tag_status_all(code)

    # Release the registers
    code.release_register(buffer_size.reg)
    code.release_register(remaining.reg)
    code.release_register(aligned_size.reg)
    code.release_register(transfer_size.reg)
    code.release_register(cmp.reg)
    code.release_register(ls_addr.reg)
    code.release_register(tag_var.reg)
    code.release_register(ea_load.reg)
    # mod_16 is only read while computing the aligned size above; release
    # it with the other temporaries so it is not leaked.
    code.release_register(mod_16.reg)

    if old_code is not None:
        spu.set_active_code(old_code)
    return