def TestFloatArray(): from corepy.arch.spu.platform import InstructionStream, Processor import corepy.arch.spu.lib.dma as dma code = InstructionStream() spu.set_active_code(code) x = SingleFloat([1.0, 2.0, 3.0, 4.0]) y = SingleFloat([0.5, 1.5, 2.5, 3.5]) sum = SingleFloat(0.0) sum.v = spu.fa.ex(x, y) r = SingleFloat([0.0, 0.0, 0.0, 0.0], reg = code.fp_return) for i in range(4): r.v = spu.fa.ex(sum, r) spu.rotqbyi(sum, sum, 4) proc = Processor() result = proc.execute(code, mode='fp') x_test = array.array('f', [1.0, 2.0, 3.0, 4.0]) y_test = array.array('f', [0.5, 1.5, 2.5, 3.5]) r_test = 0.0 for i in range(4): r_test += x_test[i] + y_test[i] assert(result == r_test) return
def DoubleBufferExample(n_spus=6): """ stream_buffer is an iterator that streams data from main memory to SPU local store in blocked buffers. The buffers can be managed using single or double buffering semantics. The induction variable returned by the buffer returns the address of the current buffer. Note: stream_buffer was designed before memory descriptors and has not been updated to support them yet. The interface will change slightly when the memory classes are finalized. """ n = 30000 buffer_size = 16 # Create an array and align the data a = array.array('I', range(n)) addr = a.buffer_info()[0] n_bytes = n * 4 if n_spus > 1: code = ParallelInstructionStream() else: code = InstructionStream() current = SignedWord(0, code) two = SignedWord(2, code) # Create the stream buffer, parallelizing it if using more than 1 SPU stream = stream_buffer(code, addr, n_bytes, buffer_size, 0, buffer_mode='double', save=True) if n_spus > 1: stream = parallel(stream) # Loop over the buffers for buffer in stream: # Create an iterators that computes the address offsets within the # buffer. Note: this will be supported by var/vec iters soon. for lsa in syn_iter(code, buffer_size, 16): code.add(spu.lqx(current, lsa, buffer)) current.v = current - two code.add(spu.stqx(current, lsa, buffer)) # Run the synthetic program and copy the results back to the array proc = Processor() r = proc.execute(code, n_spus=n_spus) for i in range(2, len(a)): try: assert (a[i] == i - 2) except: print 'DoubleBuffer error:', a[i], i - 2 return
def RunTest(test): from corepy.arch.spu.platform import InstructionStream, Processor code = InstructionStream() spu.set_active_code(code) test() code.print_code() proc = Processor() proc.execute(code) return
def TestFloatScalar(): from corepy.arch.spu.platform import InstructionStream, Processor import corepy.arch.spu.lib.dma as dma code = InstructionStream() spu.set_active_code(code) x = SingleFloat(1.0) y = SingleFloat(2.0) r = SingleFloat(0.0, reg = code.fp_return) r.v = spu.fa.ex(x, y) proc = Processor() result = proc.execute(code, mode='fp') assert(result == (1.0 + 2.0)) return
def bi_bug(): """ A very simple SPU that computes 11 + 31 and returns 0xA on success. """ code = InstructionStream() proc = Processor() spu.set_active_code(code) # Acquire two registers stop_inst = SignedWord(0x200D) stop_addr = SignedWord(0x0) spu.stqa(stop_inst, 0x0) spu.bi(stop_addr) spu.stop(0x200A) r = proc.execute(code) assert (r == 0xD) return
def bi_bug(): """ A very simple SPU that computes 11 + 31 and returns 0xA on success. """ code = InstructionStream() proc = Processor() spu.set_active_code(code) # Acquire two registers stop_inst = SignedWord(0x200D) stop_addr = SignedWord(0x0) spu.stqa(stop_inst, 0x0) spu.bi(stop_addr) spu.stop(0x200A) r = proc.execute(code) assert r == 0xD return
def SimpleSPU(): """ A very simple SPU that computes 11 + 31 and returns 0xA on success. """ code = InstructionStream() proc = Processor() spu.set_active_code(code) # Acquire two registers #x = code.acquire_register() x = code.gp_return test = code.acquire_register() spu.xor(x, x, x) # zero x spu.ai(x, x, 11) # x = x + 11 spu.ai(x, x, 31) # x = x + 31 spu.ceqi(test, x, 42) # test = (x == 42) # If test is false (all 0s), skip the stop(0x100A) instruction spu.brz(test, 2) spu.stop(0x100A) spu.stop(0x100B) code.print_code(hex=True) r = proc.execute(code, mode='int', stop=True, debug=True) assert (r[0] == 42) assert (r[1] == 0x100A) code = InstructionStream() spu.set_active_code(code) util.load_float(code, code.fp_return, 3.14) code.print_code(hex=True) r = proc.execute(code, mode='fp') print r return
def SimpleSPU(): """ A very simple SPU that computes 11 + 31 and returns 0xA on success. """ code = InstructionStream() proc = Processor() spu.set_active_code(code) # Acquire two registers #x = code.acquire_register() x = code.gp_return test = code.acquire_register() lbl_brz = code.get_label("BRZ") lbl_skip = code.get_label("SKIP") spu.hbrr(lbl_brz, lbl_skip) spu.xor(x, x, x) # zero x spu.ai(x, x, 11) # x = x + 11 spu.ai(x, x, 31) # x = x + 31 spu.ceqi(test, x, 42) # test = (x == 42) # If test is false (all 0s), skip the stop(0x100A) instruction code.add(lbl_brz) spu.brz(test, lbl_skip) spu.stop(0x100A) code.add(lbl_skip) spu.stop(0x100B) code.print_code(hex=True, pro=True, epi=True) r = proc.execute(code, mode='int', stop=True) print "ret", r assert (r[0] == 42) assert (r[1] == 0x100A) code = InstructionStream() spu.set_active_code(code) lbl_loop = code.get_label("LOOP") lbl_break = code.get_label("BREAK") r_cnt = code.acquire_register() r_stop = code.acquire_register() r_cmp = code.acquire_register() r_foo = code.gp_return spu.ori(r_foo, code.r_zero, 0) spu.ori(r_cnt, code.r_zero, 0) util.load_word(code, r_stop, 10) code.add(lbl_loop) spu.ceq(r_cmp, r_cnt, r_stop) spu.brnz(r_cmp, lbl_break) spu.ai(r_cnt, r_cnt, 1) spu.a(r_foo, r_foo, r_cnt) spu.br(lbl_loop) code.add(lbl_break) code.print_code() r = proc.execute(code, mode='int', stop=True) print "ret", r assert (r[0] == 55) return
def MemoryDescExample(data_size=20000): """ This example uses a memory descriptor to move 20k integers back and forth between main memory and the SPU local store. Each value is incremented by 1 while on the SPU. Memory descriptors are a general purpose method for describing a region of memory. Memory is described by a typecode, address, and size. Memory descriptors can be initialized by hand or from an array or buffer object. For main memory, memory descriptors are useful for transfering data between main memory and an SPU's local store. The get/put methods on a memory descriptor generate the SPU code to move data of any size between main memory and local store. Memory descriptors can also be used with spu_vec_iters to describe the region of memory to iterate over. The typecode in the memory descriptor is used to determine the type for the loop induction variable. Note that there is currently no difference between memory descriptors for main memory and local store. It's up to the user to make sure the memory descriptor settings make sense in the current context. (this will probably change in the near future) Note: get/put currently use loops rather than display lists for transferring data over 16k. """ code = InstructionStream() proc = Processor() code.debug = True spu.set_active_code(code) # Create a python array data = extarray.extarray('I', range(data_size)) # Align the data in the array #a_data = aligned_memory(data_size, typecode = 'I') #a_data.copy_to(data.buffer_info()[0], data_size) # Create memory descriptor for the data in main memory data_desc = memory_desc('I') #data_desc.from_array(a_data) data_desc.from_array(data) # Transfer the data to 0x0 in the local store data_desc.get(code, 0) # Create memory descriptor for the data in the local store for use # in the iterator lsa_data = memory_desc('i', 0, data_size) # Add one to each value for x in spu_vec_iter(code, lsa_data): x.v = x + 1 # Transfer the data back to main memory data_desc.put(code, 0) dma.spu_write_out_mbox(code, 0xCAFE) # Execute the synthetic program # code.print_code() spe_id = proc.execute(code, async=True) proc.join(spe_id) # Copy it back to the Python array #a_data.copy_from(data.buffer_info()[0], data_size) for i in xrange(data_size): assert (data[i] == i + 1) return
def SpeedTest(n_spus=6, n_floats=6): """ Get a rough estimate of the maximum flop count. On a PS3 using all 6 spus, this is 152 GFlops. """ if n_spus > 1: code = ParallelInstructionStream() else: code = InstructionStream() spu.set_active_code(code) f_range = range(n_floats) a = [SingleFloat(0.0) for i in f_range] b = [SingleFloat(0.0) for i in f_range] c = [SingleFloat(0.0) for i in f_range] t = [SingleFloat(0.0) for i in f_range] outer = 2**12 inner = 2**16 unroll = 128 fuse = 2 simd = 4 for x in syn_iter(code, outer): for y in syn_iter(code, inner): for u in range(unroll): for i in f_range: t[i].v = spu.fma.ex(a[i], b[i], c[i]) # Run the synthetic program and copy the results back to the array # TODO - AWF - use the SPU decrementers to time this proc = Processor() start = time.time() r = proc.execute(code, n_spus=n_spus) stop = time.time() total = stop - start n_ops = long(outer) * inner * long(unroll) * long(n_floats) * long( fuse) * long(simd) * long(n_spus) print '%.6f sec, %.2f GFlops' % (total, n_ops / total / 1e9) # # Run the native program and copy the results back to the array # outer = 2**14 # inner = 2**16 # unroll = 1 # fuse = 1 # simd = 1 # proc = Processor() # # ncode = NativeInstructionStream("a.out") # start = time.time() # r = proc.execute(ncode, n_spus = n_spus) # stop = time.time() # total = stop - start # n_ops = long(outer) * inner * long(unroll) * long(n_floats) * long(fuse) * long(simd) * long(n_spus) # print '%.6f sec, %.2f GFlops' % (total, n_ops / total / 1e9) results = """ --> No optimizations Executing native code: a.out 14.805322 sec, 20.89 GFlops --> Synthetic Platform: linux.spre_linux_spu no raw data 65.023350 sec, 152.19 GFlops --> -O3 (fuse: 2, simd: 4) Executing native code: a.out 7.407939 sec, 41.74 GFlops --> -O3 (fuse: 1, simd: 1) Executing native code: a.out 7.403702 sec, 5.22 GFlops """ return