def fb_draw():
    """Alternate two synthesized FBDraw kernels between the framebuffers.

    Two instruction streams are synthesized, one per FBDraw instance, with
    the buffer addresses passed to set_buffers() swapped between them.  The
    streams are then executed in turn forever, each execution followed by a
    vsync wait and a flip.  The loop only ends when an exception (for
    example KeyboardInterrupt) propagates; the framebuffer is closed on the
    way out.
    """
    code0 = synspu.InstructionStream()
    code1 = synspu.InstructionStream()
    proc = synspu.Processor()

    fb = cell_fb.framebuffer()
    cell_fb.fb_open(fb)

    try:
        draw0 = FBDraw()
        draw0.set_buffers(cell_fb.fb_addr(fb, 0), cell_fb.fb_addr(fb, 1))
        draw0.set_stride(fb.stride)
        draw0.synthesize(code0)

        # Same kernel with the two buffer addresses swapped.
        draw1 = FBDraw()
        draw1.set_buffers(cell_fb.fb_addr(fb, 1), cell_fb.fb_addr(fb, 0))
        draw1.set_stride(fb.stride)
        draw1.synthesize(code1)

        while True:
            proc.execute(code0)
            cell_fb.fb_wait_vsync(fb)
            cell_fb.fb_flip(fb, 0)

            proc.execute(code1)
            cell_fb.fb_wait_vsync(fb)
            cell_fb.fb_flip(fb, 1)
    finally:
        # The loop above never exits normally, so this is the only place
        # the framebuffer can be released.
        cell_fb.fb_close(fb)

    return
def TestAll():
    """Emit each extended instruction once, print the program and run it.

    Synthesis-only smoke test: nothing is checked beyond the program being
    built, printed and executed without error.
    """
    import corepy.arch.spu.platform as env

    prgm = env.Program()
    code = prgm.get_stream()
    spu.set_active_code(code)

    acquire = code.prgm.acquire_register
    a = acquire()
    b = acquire()
    c = acquire()

    # (instruction, second operand) pairs, emitted in this order with
    # destination c and first operand a.
    sequence = [
        (shr, b),
        (cneq, b),
        (cge, b),
        (cgei, 10),
        (lt, b),
        (lti, 10),
        (a_immediate, 10),
        (a_immediate, 10000),
        (sf_immediate, 10000),
    ]
    for inst, operand in sequence:
        inst(c, a, operand)

    prgm.add(code)
    prgm.print_code()

    proc = env.Processor()
    proc.execute(prgm)
    return
def generate(self, results, pattern, r1_range, r2_range, max_init, max_n, size): # Setup the range parameter array r1_inc = (r1_range[1] - r1_range[0]) / size[0] r2_inc = (r2_range[1] - r2_range[0]) / size[1] ranges = extarray.extarray('f', [0.0] * 16) for i in range(4): ranges[i] = r1_range[0] ranges[4 + i] = r2_range[0] ranges[8 + i] = r1_inc ranges[12 + i] = r2_inc # Setup the pattern vector bits = _pattern2vector(pattern) # Copy the paramters to aligned buffers #a_ranges = synspu.aligned_memory(len(ranges), typecode='I') #a_ranges.copy_to(ranges.buffer_info()[0], len(ranges)) #a_pattern = synspu.aligned_memory(len(bits), typecode='I') #a_pattern.copy_to(bits.buffer_info()[0], len(bits)) renderer = MailboxRenderer() ly_block = LyapunovBlock() ly_block.set_size(size[0], size[1]) #ly_block.set_range(a_ranges) #ly_block.set_pattern(a_pattern) ly_block.set_range(ranges) ly_block.set_pattern(bits) ly_block.set_max_init(max_init) ly_block.set_max_n(max_n) ly_block.set_renderer(renderer) code = synspu.InstructionStream() ly_block.synthesize(code) proc = synspu.Processor() spe_id = proc.execute(code, async=True) for i in range(size[0] * size[1]): while synspu.spu_exec.stat_out_mbox(spe_id) == 0: pass print 'ly said: 0x%X' % (synspu.spu_exec.read_out_mbox(spe_id)) proc.join(spe_id) # for x in range(size[0]): # r2 = r2_range[0] + r2_inc # print 'col:', x, r1, r2 # for y in range(size[1]): # results[y, x] = lyapunov_point(pattern, r1, r2, max_init, max_n) # r2 += r2_inc # r1 += r1_inc return
def TestSetSlotValue(): import corepy.arch.spu.platform as synspu import corepy.arch.spu.types.spu_types as var import corepy.arch.spu.lib.dma as dma prgm = synspu.Program() code = prgm.get_stream() proc = synspu.Processor() spu.set_active_code(code) a = var.SignedWord(0x11) b = var.SignedWord(0x13) r = var.SignedWord(0xFFFFFFFF) set_slot_value(code, r, 0, 0x10) set_slot_value(code, r, 1, a) set_slot_value(code, r, 2, 0x12) set_slot_value(code, r, 3, b) for i in range(4): spu.wrch(r, dma.SPU_WrOutMbox) spu.rotqbyi(r, r, 4) prgm.add(code) spe_id = proc.execute(prgm, async=True) for i in range(4): while synspu.spu_exec.stat_out_mbox(spe_id) == 0: pass result = synspu.spu_exec.read_out_mbox(spe_id) assert (result == (i + 0x10)) proc.join(spe_id) return
def TestFloatArray():
    """Add two 4-element float vectors on the SPU and check the total.

    The element-wise sum is accumulated into the fp return register four
    times, rotating the sum vector by 4 bytes between additions.  The value
    returned by execute(mode='fp') must equal the same total computed on the
    host from single-precision arrays.
    """
    from corepy.arch.spu.platform import InstructionStream, Processor
    import corepy.arch.spu.lib.dma as dma
    import corepy.arch.spu.platform as env

    prgm = env.Program()
    code = prgm.get_stream()
    spu.set_active_code(code)

    x = SingleFloat([1.0, 2.0, 3.0, 4.0])
    y = SingleFloat([0.5, 1.5, 2.5, 3.5])
    total = SingleFloat(0.0)

    total.v = spu.fa.ex(x, y)

    r = SingleFloat([0.0, 0.0, 0.0, 0.0], reg=code.fp_return)

    for _ in range(4):
        r.v = spu.fa.ex(total, r)
        spu.rotqbyi(total, total, 4)

    prgm.add(code)

    proc = env.Processor()
    result = proc.execute(prgm, mode='fp')

    # Host-side reference, using single-precision storage for the inputs.
    x_test = array.array('f', [1.0, 2.0, 3.0, 4.0])
    y_test = array.array('f', [0.5, 1.5, 2.5, 3.5])
    r_test = 0.0
    for x_val, y_val in zip(x_test, y_test):
        r_test += x_val + y_val

    assert result == r_test
    return
def TestTanimoto():
    """Synthesize a 256-bit Tanimoto kernel, print it and execute it.

    Only synthesis and execution are exercised; no result is checked.
    """
    code = synspu.InstructionStream()
    proc = synspu.Processor()
    code.set_debug(True)

    # Two registers per operand, plus one for the result.
    regs_x = code.acquire_registers(2)
    regs_y = code.acquire_registers(2)
    reg_out = code.acquire_register()

    kernel = Tanimoto()
    kernel.set_n_bits(256)
    kernel.set_x_regs(regs_x)
    kernel.set_y_regs(regs_y)
    kernel.set_result_reg(reg_out)
    kernel.synthesize(code)

    code.print_code()
    proc.execute(code)

    # TODO: Do a real test, not just a synthesis test
    return
def TestDecrementer(): code = synspu.InstructionStream() spu_write_decr(code, 0x7FFFFFFFl) spu_start_decr(code) # Get a message from the PPU spu_read_in_mbox(code) reg = spu_read_decr(code) spu_write_out_mbox(code, reg) spu_stop_decr(code) proc = synspu.Processor() spe_id = proc.execute(code, async=True) print 'test is sleeping for 1 second' time.sleep(1) synspu.spu_exec.write_in_mbox(spe_id, 0x44CAFE) while synspu.spu_exec.stat_out_mbox(spe_id) == 0: pass print 'spu said: 0x%X' % (synspu.spu_exec.read_out_mbox(spe_id)) proc.join(spe_id) return
def TestVecIter(n_spus=1):
    """Double a 1024-word array using stream_buffer plus spu_vec_iter.

    n_spus -- number of SPUs; values above 1 select ParallelProgram and wrap
              the stream in parallel().

    After execution every element a[i] must equal i + i.
    """
    n = 1024
    buffer_size = 16
    a = extarray.extarray('I', range(n))

    use_parallel = n_spus > 1
    if use_parallel:
        prgm = env.ParallelProgram()
    else:
        prgm = env.Program()
    code = prgm.get_stream()

    current = var.SignedWord(0, code)

    stream = stream_buffer(code, a.buffer_info()[0], n * 4, buffer_size, 0,
                           save=True)
    if use_parallel:
        stream = parallel(stream)

    md = memory_desc('i', 0, buffer_size)

    for buffer in stream:
        for current in spu_vec_iter(code, md):
            current.v = current + current

    prgm.add(code)
    proc = env.Processor()
    proc.execute(prgm, n_spus=n_spus)

    for i in range(n):
        assert a[i] == 2 * i

    return
def TestMbox(): code = synspu.InstructionStream() # Send a message to the PPU spu_write_out_mbox(code, 0xDEADBEEFl) # Get a message from the PPU reg = spu_read_in_mbox(code) # And send it back code.add(spu.wrch(reg, SPU_WrOutMbox)) proc = synspu.Processor() spe_id = proc.execute(code, async=True) synspu.spu_exec.write_in_mbox(spe_id, 0x88CAFE) while synspu.spu_exec.stat_out_mbox(spe_id) == 0: pass print 'spe said: 0x%X' % (synspu.spu_exec.read_out_mbox(spe_id)) while synspu.spu_exec.stat_out_mbox(spe_id) == 0: pass print 'spe said: 0x%X' % (synspu.spu_exec.read_out_mbox(spe_id)) proc.join(spe_id) return
def TestStreamBufferDouble(n_spus=1):
    """Double a 2048-word array through a stream_buffer in 'double' mode.

    n_spus -- number of SPUs; values above 1 select ParallelProgram and wrap
              the stream in parallel().

    After execution every element a[i] must equal i + i.
    """
    n = 2048
    buffer_size = 32
    a = extarray.extarray('I', range(n))

    use_parallel = n_spus > 1
    if use_parallel:
        prgm = env.ParallelProgram()
    else:
        prgm = env.Program()
    code = prgm.get_stream()

    current = var.SignedWord(0, code)

    addr = a.buffer_info()[0]
    n_bytes = n * 4

    stream = stream_buffer(code, addr, n_bytes, buffer_size, 0,
                           buffer_mode='double', save=True)
    if use_parallel:
        stream = parallel(stream)

    for buffer in stream:
        # Step through the buffer 16 bytes at a time: load, double, store.
        for lsa in syn_iter(code, buffer_size, 16):
            code.add(spu.lqx(current, lsa, buffer))
            current.v = current + current
            code.add(spu.stqx(current, lsa, buffer))

    prgm.add(code)
    proc = env.Processor()
    proc.execute(prgm, n_spus=n_spus)

    for i in range(len(a)):
        assert a[i] == 2 * i

    return
def test_stream_popc(): code = synspu.InstructionStream() proc = synspu.Processor() bits = array.array('I', range(1024)) for i in range(0, 1024, 4): bits[i] = 0x01010101 # 4 bits bits[i + 1] = 0xFFFFFFFF # 32 bits bits[i + 2] = 0x10101010 # 4 bits bits[i + 3] = 0xFF0FF0F0 # 20 bits = 60 bits total # bits[i] = 1 # bits[i+1] = 2 # bits[i+2] = 3 # bits[i+3] = 4 #abits = synspu.aligned_memory(len(bits), typecode = 'I') #abits.copy_to(bits.buffer_info()[0], len(bits)) popc = syn_popc_stream() popc.set_stream_addr(bits.buffer_info()[0]) popc.set_stream_size(len(bits)) popc.synthesize(code) count = proc.execute(code, mode='mbox') print '-->', count assert (count == 60 * 1024 / 4) return
def TestAll():
    """Emit each extended instruction once into a bare InstructionStream.

    Synthesis-only smoke test: the stream is printed and executed, nothing
    else is checked.
    """
    import corepy.arch.spu.platform as env

    code = env.InstructionStream()
    spu.set_active_code(code)

    acquire = code.acquire_register
    a = acquire()
    b = acquire()
    c = acquire()

    # (instruction, second operand) pairs, emitted in this order with
    # destination c and first operand a.
    sequence = [
        (shr, b),
        (cneq, b),
        (cge, b),
        (cgei, 10),
        (lt, b),
        (lti, 10),
        (a_immediate, 10),
        (a_immediate, 10000),
        (sf_immediate, 10000),
    ]
    for inst, operand in sequence:
        inst(c, a, operand)

    code.print_code()

    proc = env.Processor()
    proc.execute(code)
    return
def __init__(self):
    """Create the SPU bank and processor; no SPUs counted, no program set."""
    self.prog = None
    self.n_spus = 0

    self.spus = spu_bank()
    self.proc = synspu.Processor()
def DoubleBufferExample(n_spus=6): """ stream_buffer is an iterator that streams data from main memory to SPU local store in blocked buffers. The buffers can be managed using single or double buffering semantics. The induction variable returned by the buffer returns the address of the current buffer. Note: stream_buffer was designed before memory descriptors and has not been updated to support them yet. The interface will change slightly when the memory classes are finalized. """ n = 30000 buffer_size = 16 # Create an array and align the data a = extarray.extarray('I', range(n)) addr = a.buffer_info()[0] n_bytes = n * 4 if n_spus > 1: code = env.ParallelInstructionStream() else: code = env.InstructionStream() current = SignedWord(0, code) two = SignedWord(2, code) # Create the stream buffer, parallelizing it if using more than 1 SPU stream = stream_buffer(code, addr, n_bytes, buffer_size, 0, buffer_mode='double', save=True) if n_spus > 1: stream = parallel(stream) # Loop over the buffers for buffer in stream: # Create an iterators that computes the address offsets within the # buffer. Note: this will be supported by var/vec iters soon. for lsa in syn_iter(code, buffer_size, 16): code.add(spu.lqx(current, lsa, buffer)) current.v = current - two code.add(spu.stqx(current, lsa, buffer)) # Run the synthetic program and copy the results back to the array proc = env.Processor() r = proc.execute(code, n_spus=n_spus) for i in range(2, len(a)): try: assert (a[i] == i - 2) except: print 'DoubleBuffer error:', a[i], i - 2 return
def TestFloats(): import math code = synspu.InstructionStream() proc = synspu.Processor() spu.set_active_code(code) code.set_debug(True) # Create a simple SPU program that computes log for all values bettween # .01 and 10.0 with .01 increments start = .65 stop = .75 inc = .01 sp_step = 0x3C23D70A # r_current = var.Word(0x3C23D70A) # .01 in single precision r_current = var.Word(0x3F266666) r_step = var.Word(sp_step) # .01 in single precision result = var.Word(0) log = SPULog() log.setup(code) log.set_result(result) log.set_x(r_current) log_iter = syn_iter(code, int((stop - start) / inc)) for i in log_iter: log.synthesize(code) spu.fa(r_current, r_current, r_step) spu.wrch(result, dma.SPU_WrOutMbox) # code.print_code() spe_id = proc.execute(code, mode='async') x = start for i in range(int((stop - start) / inc)): while synspu.spu_exec.stat_out_mbox(spe_id) == 0: pass slog = synspu.spu_exec.read_out_mbox(spe_id) print '%.3f 0x%08X %.08f %.08f ' % (x, slog, _sp_to_float(slog), math.log(x, 2)) x += inc proc.join(spe_id) return
def RunTest(test):
    """Run a synthesis callback inside a fresh program and execute it.

    test -- zero-argument callable; it is invoked after a new stream has
            been made the active code.

    The resulting program is printed before it is executed.
    """
    import corepy.arch.spu.platform as env

    prgm = env.Program()
    code = prgm.get_stream()
    spu.set_active_code(code)

    test()

    prgm.add(code)
    prgm.print_code()

    env.Processor().execute(prgm)
    return
def test_c_popc(): code = synspu.NativeInstructionStream("spu_popc") proc = synspu.Processor() params = synspu.spu_exec.ExecParams() params.p7 = 0x01010101 # 4 bits params.p8 = 0xFFFFFFFF # 32 bits params.p9 = 0x10101010 # 4 bits params.p10 = 0xFF0FF0F0 # 20 bits = 60 bits total count = proc.execute(code, mode='mbox', params=params) assert (count == 60) print 'test_syn_c passed' return
def TestStreamBufferSingle(n_spus=1):
    """Double a 1024-word array through a default-mode stream_buffer.

    n_spus -- number of SPUs; values above 1 select ParallelProgram and wrap
              the stream in parallel().

    After execution every element a[i] must equal i + i.
    """
    n = 1024
    buffer_size = 128
    a = extarray.extarray('I', range(n))

    use_parallel = n_spus > 1
    if use_parallel:
        prgm = env.ParallelProgram()
    else:
        prgm = env.Program()
    code = prgm.get_stream()

    current = var.SignedWord(0, code)

    addr = a.buffer_info()[0]
    stream = stream_buffer(code, addr, n * 4, buffer_size, 0, save=True)
    if use_parallel:
        stream = parallel(stream)

    for buffer in stream:
        # Step through the buffer 16 bytes at a time: load, double, store.
        for lsa in syn_iter(code, buffer_size, 16):
            code.add(spu.lqx(current, lsa, buffer))
            current.v = current + current
            code.add(spu.stqx(current, lsa, buffer))

    prgm.add(code)
    proc = env.Processor()
    proc.execute(prgm, n_spus=n_spus)

    for i in range(n):
        assert a[i] == 2 * i

    return
def TestSaveBuffer1(): import array code = synspu.InstructionStream() proc = synspu.Processor() code.set_debug(True) spu.set_active_code(code) n = 2**14 data = array.array('I', range(n)) #data = synspu.aligned_memory(n, typecode = 'I') #data.copy_to(data_array.buffer_info()[0], len(data_array)) save_buffer = SaveBuffer() save_buffer.setup() save_buffer.init_ls_buffer(0, 128) save_buffer.init_mm_buffer(data.buffer_info()[0], n) value = var.SignedWord(0xCAFEBABE) for i in spuiter.syn_iter(code, n / 4): save_buffer.save_register(value) code.print_code() spe_id = proc.execute(code, mode='async') for i in range(n / 4): while synspu.spu_exec.stat_out_mbox(spe_id) == 0: pass print 'size: 0x%X' % (synspu.spu_exec.read_out_mbox(spe_id)) while synspu.spu_exec.stat_out_mbox(spe_id) == 0: pass print 'offset: 0x%X' % (synspu.spu_exec.read_out_mbox(spe_id)) while synspu.spu_exec.stat_out_mbox(spe_id) == 0: pass print 'test: 0x%X' % (synspu.spu_exec.read_out_mbox(spe_id)) proc.join(spe_id) #data.copy_from(data_array.buffer_info()[0], len(data_array)) print data[:10] return
def TestContinueLabel(n_spus=1):
    """Exercise the continue label of spu_vec_iter.

    Every vector is doubled, compared against 4 and, depending on the
    gathered comparison bits, either branches to the iterator's continue
    label or is doubled a second time.  The host check expects a[i] == i * 4
    for the first four elements and a[i] == i + i for the rest.

    n_spus -- number of SPUs; values above 1 select
              ParallelInstructionStream and wrap the stream in parallel().
    """
    n = 1024
    buffer_size = 16
    a = extarray.extarray('I', range(n))

    use_parallel = n_spus > 1
    if use_parallel:
        code = env.ParallelInstructionStream()
    else:
        code = env.InstructionStream()

    current = var.SignedWord(0, code)
    test = var.SignedWord(0, code)
    four = var.SignedWord(4, code)

    stream = stream_buffer(code, a.buffer_info()[0], n * 4, buffer_size, 0,
                           save=True)
    if use_parallel:
        stream = parallel(stream)

    md = memory_desc('i', 0, buffer_size)
    lsa_iter = spu_vec_iter(code, md)

    for buffer in stream:
        for current in lsa_iter:
            current.v = current + current

            test.v = (current == four)
            code.add(spu.gbb(test, test))
            # Jump to the next iteration when no comparison bit is set
            code.add(spu.brz(test.reg, lsa_iter.continue_label))

            current.v = current + current

    proc = env.Processor()
    proc.execute(code, n_spus=n_spus)

    for i in range(n):
        if i < 4:
            expected = i * 4
        else:
            expected = i + i
        assert a[i] == expected

    return
def test_syn(kernel):
    """Synthesize a popcount kernel and check it counts 60 bits.

    kernel -- zero-argument callable returning an object with a
              synthesize(code) method.

    The parameters p7..p10 carry 4 + 32 + 4 + 20 = 60 set bits; the value
    returned through the mailbox must be 60.
    """
    code = synspu.InstructionStream()
    proc = synspu.Processor()

    kernel().synthesize(code)

    params = synspu.spu_exec.ExecParams()
    words = (
        ('p7', 0x01010101),   # 4 bits
        ('p8', 0xFFFFFFFF),   # 32 bits
        ('p9', 0x10101010),   # 4 bits
        ('p10', 0xFF0FF0F0),  # 20 bits = 60 bits total
    )
    for field, word in words:
        setattr(params, field, word)

    count = proc.execute(code, mode='mbox', params=params)
    assert count == 60
    return
def TestLog(): code = synspu.InstructionStream() proc = synspu.Processor() spu.set_active_code(code) # Create a simple SPU program that computes log for 10 values and # sends the result back using the mailbox log = SPULog() values = [] result = code.acquire_register() N = 10 x = 1 for i in range(N): val = var.Word(x) spu.cuflt(val, val, 155) values.append(val) x = x * 10 log.setup(code) log.set_result(result) for i in range(N): log.set_x(values[i]) log.synthesize(code) spu.wrch(result, dma.SPU_WrOutMbox) spe_id = proc.execute(code, mode='async') x = 1 for i in range(N): while synspu.spu_exec.stat_out_mbox(spe_id) == 0: pass print 'log said: 0x%08X (%d)' % ( synspu.spu_exec.read_out_mbox(spe_id), x) x = x * 10 proc.join(spe_id) return
def SimpleSPU(): """ A very simple SPU that computes 11 + 31 and returns 0xA on success. """ prgm = env.Program() code = prgm.get_stream() proc = env.Processor() spu.set_active_code(code) # Acquire two registers #x = code.acquire_register() x = code.gp_return test = prgm.acquire_register(reg_name=55) spu.xor(x, x, x) # zero x spu.ai(x, x, 11) # x = x + 11 spu.ai(x, x, 31) # x = x + 31 spu.ceqi(test, x, 42) # test = (x == 42) # If test is false (all 0s), skip the stop(0x100A) instruction spu.brz(test, 2) spu.stop(0x100A) spu.stop(0x100B) prgm.add(code) prgm.print_code(hex=True) r = proc.execute(prgm, mode='int', stop=True, debug=True) assert (r[0] == 42) assert (r[1] == 0x100A) prgm = env.Program() code = prgm.get_stream() spu.set_active_code(code) util.load_float(code, code.fp_return, 3.14) prgm.add(code) prgm.print_code(hex=True) r = proc.execute(prgm, mode='fp') print r return
def TestFloatScalar():
    """Add 1.0 and 2.0 on the SPU and check the fp return value is 3.0."""
    from corepy.arch.spu.platform import InstructionStream, Processor
    import corepy.arch.spu.lib.dma as dma
    import corepy.arch.spu.platform as env

    prgm = env.Program()
    code = prgm.get_stream()
    spu.set_active_code(code)

    x = SingleFloat(1.0)
    y = SingleFloat(2.0)
    # The result variable lives in the fp return register
    r = SingleFloat(0.0, reg=code.fp_return)

    r.v = spu.fa.ex(x, y)

    prgm.add(code)

    proc = env.Processor()
    result = proc.execute(prgm, mode='fp')

    expected = 1.0 + 2.0
    assert result == expected
    return
def TestSignal(): code = synspu.InstructionStream() # Get a signal from the PPU reg = spu_read_signal1(code) # And send it back code.add(spu.wrch(reg, SPU_WrOutMbox)) proc = synspu.Processor() spe_id = proc.execute(code, async=True) synspu.spu_exec.write_signal(spe_id, 1, 0xCAFEBABEl) while synspu.spu_exec.stat_out_mbox(spe_id) == 0: pass print 'sig said: 0x%X' % (synspu.spu_exec.read_out_mbox(spe_id)) proc.join(spe_id) return
import corepy.lib.extarray as extarray
import corepy.arch.spu.isa as spu
import corepy.arch.spu.platform as env
import corepy.arch.spu.lib.dma as dma
from corepy.arch.spu.lib.util import load_word
import time

# Script prologue: builds the start of an SPU program that zeroes a sum
# register, loads an iteration count, opens a labelled loop and reads one
# word from the inbound mailbox.
# NOTE(review): only the setup is visible here -- no loop back-edge, no use
# of psmap/data and no execute() call -- so the rest of the script is
# presumably elsewhere; confirm before relying on this description.
if __name__ == '__main__':
    # Value loaded into the counter register r_cnt below.
    ITERS = 500000
    #ITERS = 15

    prgm = env.Program()
    code = prgm.get_stream()
    proc = env.Processor()
    # Instructions issued through the spu module go to this stream.
    spu.set_active_code(code)

    # Two host-side arrays; neither is referenced again in the visible code.
    psmap = extarray.extarray('I', 131072 / 4)
    data = extarray.extarray('I', range(0, 16))

    # Accumulator in the integer return register, plus a counter register.
    r_sum = prgm.gp_return
    r_cnt = prgm.acquire_register()

    # r_sum = 0 (xor with itself), r_cnt = ITERS
    spu.xor(r_sum, r_sum, r_sum)
    load_word(code, r_cnt, ITERS)

    # Mark the loop head.
    lbl_loop = prgm.get_label("loop")
    code.add(lbl_loop)

    # Read a word from the SPU inbound mailbox into a register.
    reg = dma.spu_read_in_mbox(code)
def TestSPUIter():
    """DMA an array into local store, double it with syn_iter, DMA it back.

    32 words are fetched to local-store address 0 under tag 12, each
    16-byte vector is doubled in place, and the data is written back under
    tag 13.  Afterwards every element data[i] must equal i + i.
    """
    size = 32
    n_bytes = size * 4
    get_tag = 12
    put_tag = 13

    data = extarray.extarray('I', range(size))
    prgm = env.Program()
    code = prgm.get_stream()

    r_ea_data = prgm.acquire_register()
    r_ls_data = prgm.acquire_register()
    r_size = prgm.acquire_register()
    r_tag = prgm.acquire_register()

    # Effective address, transfer size, tag and local-store address (0)
    util.load_word(code, r_ea_data, data.buffer_info()[0])
    util.load_word(code, r_size, n_bytes)
    code.add(spu.ai(r_tag, code.r_zero, get_tag))
    code.add(spu.ai(r_ls_data, code.r_zero, 0))

    # Fetch the data, then wait on the get tag
    dma.mfc_get(code, r_ls_data, r_ea_data, r_size, r_tag)
    dma.mfc_write_tag_mask(code, 1 << get_tag)
    dma.mfc_read_tag_status_all(code)

    # Double every vector in place, one quadword per iteration
    current = var.SignedWord(0, code)
    for lsa in syn_iter(code, n_bytes, 16):
        code.add(spu.lqx(current, code.r_zero, lsa))
        current.v = current + current
        code.add(spu.stqx(current, code.r_zero, lsa))

    # Write the data back to main memory, then wait on the put tag
    code.add(spu.ai(r_tag, code.r_zero, put_tag))
    dma.mfc_put(code, r_ls_data, r_ea_data, r_size, r_tag)
    dma.mfc_write_tag_mask(code, 1 << put_tag)
    dma.mfc_read_tag_status_all(code)

    # Cleanup
    for reg in (r_ea_data, r_ls_data, r_size, r_tag):
        prgm.release_register(reg)

    # Execute the code
    prgm.add(code)
    proc = env.Processor()
    proc.execute(prgm)

    for i in range(size):
        assert data[i] == 2 * i

    return
def TestSPUParallelIter(data, size, n_spus = 6, buffer_size = 16, run_code = True): import time # n_spus = 8 # buffer_size = 16 # 16 ints/buffer # n_buffers = 4 # 4 buffers/spu # n_buffers = size / buffer_size # size = buffer_size * n_buffers * n_spus # data = array.array('I', range(size + 2)) #data = env.aligned_memory(n, typecode = 'I') #data.copy_to(data_array.buffer_info()[0], len(data_array)) # print 'Data align: 0x%X, %d' % (data.buffer_info()[0], data.buffer_info()[0] % 16) code = env.ParallelInstructionStream() # code = env.InstructionStream() r_zero = code.acquire_register() r_ea_data = code.acquire_register() r_ls_data = code.acquire_register() r_size = code.acquire_register() r_tag = code.acquire_register() # Load zero util.load_word(code, r_zero, 0) # print 'array ea: 0x%X 0x%X' % (data.buffer_info()[0], long(data.buffer_info()[0])) # print 'r_zero = %d, ea_data = %d, ls_data = %d, r_size = %d, r_tag = %d' % ( # r_zero, r_ea_data, r_ls_data, r_size, r_tag) # Load the effective address if data.buffer_info()[0] % 16 == 0: util.load_word(code, r_ea_data, data.buffer_info()[0]) else: util.load_word(code, r_ea_data, data.buffer_info()[0] + 8) ea_start = data.buffer_info()[0] # Iterate over each buffer for ea in parallel(syn_range(code, ea_start, ea_start + size * 4 , buffer_size * 4)): # ea = var.SignedWord(code = code, reg = r_ea_data) # print 'n_iters:', size / buffer_size # for i in syn_range(code, size / buffer_size): # code.add(spu.stop(0xB)) # Load the size util.load_word(code, r_size, buffer_size * 4) # Load the tag code.add(spu.ai(r_tag, r_zero, 12)) # Load the lsa code.add(spu.ai(r_ls_data, r_zero, 0)) # Load the data into address 0 dma.mfc_get(code, r_ls_data, ea, r_size, r_tag) # Set the tag bit to 12 dma.mfc_write_tag_mask(code, 1<<12); # Wait for the transfer to complete dma.mfc_read_tag_status_all(code); # Increment the data values by 1 using an unrolled loop (no branches) # r_current = code.acquire_register() current = var.SignedWord(0, code) 
count = var.SignedWord(0, code) # Use an SPU iter for lsa in syn_iter(code, buffer_size * 4, 16): code.add(spu.lqx(current, r_zero, lsa)) # code.add(spu.ai(1, r_current, r_current)) current.v = current + current code.add(spu.stqx(current, r_zero, lsa)) count.v = count + 1 code.add(spu.stqx(count, r_zero, 0)) # code.release_register(r_current) current.release_registers(code) # Store the values back to main memory # Load the tag code.add(spu.ai(r_tag, r_zero, 13)) # Load the data into address 0 dma.mfc_put(code, r_ls_data, ea.reg, r_size, r_tag) # Set the tag bit to 13 dma.mfc_write_tag_mask(code, 1<<13); # Wait for the transfer to complete dma.mfc_read_tag_status_all(code); # code.add(spu.stop(0xB)) # Update ea # ea.v = ea + (buffer_size * 4) # /for ea address # Cleanup code.release_register(r_zero) code.release_register(r_ea_data) code.release_register(r_ls_data) code.release_register(r_size) code.release_register(r_tag) if not run_code: return code # Stop for debugging # code.add(spu.stop(0xA)) # Execute the code proc = env.Processor() #data.copy_from(data_array.buffer_info()[0], len(data_array)) def print_blocks(): for i in range(0, size, buffer_size): # print data[i:(i + buffer_size)] print data[i + buffer_size], print '' # print_blocks() s = time.time() r = proc.execute(code, n_spus = n_spus) # r = proc.execute(code) t = time.time() - s # print_blocks() return t
def TestTanimotoBlock(n_vecs = 4): code = synspu.InstructionStream() proc = synspu.Processor() code.set_debug(True) spu.set_active_code(code) tb = TanimotoBlock() ls_save = LocalSave() mm_save = MemorySave() code.set_debug(True) # Input block parameters m = 128 n = 64 # n_vecs = 9 n_bits = 128 * n_vecs # Main memory results buffer # max_results = 2**16 max_results = 16384 words_per_result = 4 mm_results_data = array.array('I', [12 for i in range(max_results * words_per_result)]) #mm_results_buffer = synspu.aligned_memory(max_results * words_per_result, typecode = 'I') # mm_results_buffer.copy_to(mm_results_data.buffer_info()[0], len(mm_results_data)) mm_results = spuiter.memory_desc('I') #mm_results.from_array(mm_results_buffer) mm_results.from_array(mm_results_data) mm_save.set_md_save_buffer(mm_results) # Local Results buffer buffer_size = var.SignedWord(16384) buffer_addr = var.SignedWord(m * n * n_vecs * 4) ls_results = spuiter.memory_desc('B') ls_results.set_size_reg(buffer_size) ls_results.set_addr_reg(buffer_addr) ls_save.set_md_results(ls_results) ls_save.set_mm_save_op(mm_save) # Setup the TanimotoBlock class tb.set_n_bits(n_bits) tb.set_block_size(m, n) tb.set_x_addr(0) tb.set_y_addr(m * n_vecs * 16) tb.set_save_op(ls_save) # Main test loop n_samples = 10000 for samples in spuiter.syn_iter(code, n_samples): tb.synthesize(code) spu.wrch(buffer_size, dma.SPU_WrOutMbox) spu.stop(0x2000) # "Function" Calls ls_save.block() mm_save.block() # code.print_code() start = time.time() spe_id = proc.execute(code, async=True) while synspu.spu_exec.stat_out_mbox(spe_id) == 0: pass # print 'tb said: 0x%X' % (synspu.spu_exec.read_out_mbox(spe_id)) stop = time.time() # mm_results_buffer.copy_from(mm_results_data.buffer_info()[0], len(mm_results_data)) proc.join(spe_id) total = stop - start bits_sec = (m * n * n_bits * n_samples) / total / 1e9 ops_per_compare = 48 * 4 + 8 # 48 SIMD instructions, 8 scalar insts_per_compare = 56 gops = (m * n * n_vecs * n_samples * 
ops_per_compare ) / total / 1e9 ginsts = (m * n * n_vecs * n_samples * insts_per_compare ) / total / 1e9 print '%.6f sec, %.2f Gbits/sec, %.2f GOps, %.2f GInsts, %d insts' % ( total, bits_sec, gops, ginsts, code.size()) return
def generate(self, results, patterns, r1_range, r2_range, max_init, max_n, size, n_spus=6): # Connect to the framebuffer #fb = cell_fb.framebuffer() #cell_fb.fb_open(fb) buffer = extarray.extarray('B', size[0] * size[1] * 4) buffer.clear() # Setup the range parameter array r1_inc = (r1_range[1] - r1_range[0]) / size[0] r2_inc = (r2_range[1] - r2_range[0]) / size[1] ranges = [0 for i in range(n_spus)] #a_ranges = [0 for i in range(n_spus)] # Slice and dice for parallel execution spu_slices = [[size[0], size[1] / n_spus] for ispu in range(n_spus)] spu_slices[-1][1] += size[1] % n_spus offset = 0.0 for ispu in range(n_spus): ranges[ispu] = extarray.extarray('f', [0.0] * 16) for i in range(4): ranges[ispu][ i] = r1_range[0] + float(i) * r1_inc # horizontal is simd ranges[ispu][4 + i] = r2_range[0] + offset ranges[ispu][8 + i] = r1_inc * 4.0 ranges[ispu][12 + i] = r2_inc # print ranges # Copy the paramters to aligned buffers #a_ranges[ispu] = synspu.aligned_memory(len(ranges[ispu]), typecode='I') #a_ranges[ispu].copy_to(ranges[ispu].buffer_info()[0], len(ranges[ispu])) offset += r2_inc * spu_slices[ispu][1] # Setup the pattern vector for pattern in patterns: if len(pattern) != len(patterns[0]): raise Exception('All patterns must be the same length') bits = [_pattern2vector(pattern) for pattern in patterns] #a_pattern = synspu.aligned_memory(len(bits[0]), typecode='I') pattern = extarray.extarray('I', len(bits[0])) # Create the instruction streams codes = [] n = len(patterns) * 10 offset = 0 for ispu in range(n_spus): renderer = FBRenderer() renderer.set_lsa(0x100) #renderer.set_addr(cell_fb.fb_addr(fb, 0) + offset) renderer.set_addr(buffer.buffer_info()[0] + offset) renderer.set_width(size[0]) #renderer.set_stride(fb.stride) renderer.set_stride(size[0]) ly_block = LyapunovBlock() ly_block.set_size(*spu_slices[i]) #ly_block.set_range(a_ranges[ispu]) ly_block.set_range(ranges[ispu]) #ly_block.set_pattern(a_pattern) ly_block.set_pattern(pattern) 
ly_block.set_max_init(max_init) ly_block.set_max_n(max_n) ly_block.set_renderer(renderer) code = synspu.InstructionStream() # code.set_debug(True) codes.append(code) #offset += spu_slices[i][1] * fb.stride * 4 offset += spu_slices[i][1] * size[0] * 4 # for i in spuiter.syn_range(code, n): ly_block.synthesize(code) # code.print_code() proc = synspu.Processor() #cell_fb.fb_clear(fb, 0) buffer.clear() import time ids = [0 for i in range(n_spus)] start = time.time() ipattern = 0 n_patterns = len(patterns) len_bits = len(bits[0]) pattern_inc = 1 for i in range(n): #a_pattern.copy_to(bits[ipattern].buffer_info()[0], len_bits) # TODO - better/faster for j in xrange(0, len_bits): pattern[j] = bits[ipattern][j] for ispu in range(n_spus): ids[ispu] = proc.execute(codes[ispu], async=True) for ispu in range(n_spus): proc.join(ids[ispu]) #cell_fb.fb_wait_vsync(fb) #cell_fb.fb_flip(fb, 0) # TODO - write buffer to image file #im = Image.frombuffer("RGBA", size, buffer.tostring(), "raw", "RGBA", 0, 1) imgbuf = Image.new("RGBA", size) arr = [(buffer[i + 3], buffer[i + 2], buffer[i + 1], 0xFF) for i in xrange(0, len(buffer), 4)] imgbuf.putdata(arr) imgbuf.save("lyapunov_%d.png" % ipattern) ipattern += pattern_inc if (ipattern == (n_patterns - 1)) or (ipattern == 0): pattern_inc *= -1 print ipattern stop = time.time() print '%.2f fps (%.6f)' % (float(n) / (stop - start), (stop - start)) #cell_fb.fb_close(fb) return