def ParMD5Final(pardigest, parcontext): num = parcontext.number #print map(hex, [parPADDING[k*64] for k in range(num)]) parbits = extarray.extarray('B', 8*num) ParEncode(num, parbits, parcontext.count, 8) index = (parcontext.count[0] // 8) % 64 #import pdb #pdb.set_trace() if index < 56: padLen = 56 - index else: padLen = 120 - index parPADDING = extarray.extarray('B', padLen*num) for k in range(num): for i in range(padLen): parPADDING[k*padLen + i] = 0 parPADDING[k*padLen] = 128 ParMD5Update(parcontext, parPADDING, padLen) ParMD5Update(parcontext, parbits, 8) state = extarray.extarray('I', 4*num) for k in range(num): state[k*4 + 0] = parcontext.statea[k] state[k*4 + 1] = parcontext.stateb[k] state[k*4 + 2] = parcontext.statec[k] state[k*4 + 3] = parcontext.stated[k] #print map(hex, state) ParEncode(num, pardigest, state, 16)
def MD5Final(digest, context):
    """Finalize a single MD5 stream (RFC 1321).

    Appends the padding (0x80 then zeros) and the 64-bit message length,
    then encodes the four state words into the 16-byte digest.

    digest  -- 'B' extarray of 16 bytes receiving the digest
    context -- MD5 context with state/count/buffer
    """
    PADDING = extarray.extarray('B', 64)
    for i in range(64):
        PADDING[i] = 0
    PADDING[0] = 128
    # Capture the message bit-length before padding mutates the count.
    bits = extarray.extarray('B', 8)
    Encode(bits, context.count, 8)
    index = (context.count[0] // 8) % 64
    if index < 56:
        padLen = 56 - index
    else:
        padLen = 120 - index
    MD5Update(context, PADDING, padLen)
    MD5Update(context, bits, 8)
    # BUGFIX: removed leftover debug output (print map(hex, context.state))
    # so finalization no longer spams stdout.
    Encode(digest, context.state, 16)
def alloc_host(self, typecode, length, comps=1):
    """ Allocate local GPU memory and return a handle for copying/binding.
    Typecode is ptx typecode (u32, s32, f32, u64, etc.)

    length -- number of elements; comps -- components per element.
    Returns an extarray view over the allocated host memory with GPU
    bookkeeping attributes attached.
    """
    # Map a ptx typecode (kind letter + bit width) to an array typecode.
    # BUGFIX: the original branch chain mapped u8 -> 'b' (signed) and
    # s8 -> 'B' (unsigned); the two were swapped.
    type_conversion_table = {
        'f': {32: 'f', 64: 'd'},
        'u': {32: 'I', 64: 'L', 16: 'H', 8: 'B'},
        's': {32: 'i', 64: 'l', 16: 'h', 8: 'b'},
    }
    scalar_width = int(typecode[1:])
    try:
        array_typecode = type_conversion_table[typecode[0]][scalar_width]
    except KeyError:
        raise Exception("Unable to convert type")
    # BUGFIX: scalar_byte_width was referenced but never defined (NameError);
    # it is the element size in bytes.
    scalar_byte_width = scalar_width // 8
    mem = ptx_exec.alloc_host(length * scalar_byte_width * comps)
    arr = extarray.extarray(array_typecode, 0)
    # data_len is the element count; the byte size passed to set_memory now
    # matches the allocation above (the original multiplied inconsistently).
    arr.data_len = length * comps
    arr.set_memory(mem, arr.data_len * scalar_byte_width)
    arr.gpu_mem_handle = mem
    # arr.gpu_device = self.device
    arr.gpu_width = length
    # arr.gpu_pitch = mem[2]
    # arr.gpu_height = height
    return arr
def ParallelTests():
    """Sweep data size, SPU count, and buffer size through
    TestSPUParallelIter, printing a 'try'/'test' timing line per combination.
    """
    max_exp = 16
    max_size = pow(2, max_exp)
    print 'Creating data...'
    data = extarray.extarray('I', range(max_size))
    print 'Executing Tests...'
    # t = TestSPUParallelIter(data, 8192, n_spus = 1, buffer_size = 128)
    # return
    i = 0
    for exponent in range(13, max_exp + 1):
        size = pow(2, exponent)
        for n_spus in [1, 2, 4]:
            # Increase the buffer size until to the largest possible factor for the
            # number of SPUs or 4096 (*4=16k), whichever is smaller
            for buffer_exp in range(2, min(exponent - LOG[n_spus] - 2, 12)):
                buffer_size = pow(2, buffer_exp)
                # for buffer_size in [4]:
                t = 0.0
                print 'try\t%d\t%d\t%d\t-.-' % (size, n_spus, buffer_size)
                # for i in range(10):
                t += TestSPUParallelIter(data, size, n_spus = n_spus, buffer_size = buffer_size)
                # NOTE(review): the 10-iteration averaging loop above is
                # commented out, yet t is still divided by 10 -- the reported
                # time is 1/10th of a single run. Confirm intent.
                print 'test\t%d\t%d\t%d\t%.8f' % (size, n_spus, buffer_size, t / 10.0)
                # print 'count:', i
                i += 1
    return
def TestParams(): import time # SIZE = 1024 kernel = """ .version 1.4 .target sm_10, map_f64_to_f32 .entry _main ( .param .u64 __cudaparm__Z16addArrayOnDevicePfff_c, .param .f32 __cudaparm__Z16addArrayOnDevicePfff_a, .param .f32 __cudaparm__Z16addArrayOnDevicePfff_b) { .reg .u64 %rd<3>; .reg .f32 %f<6>; ld.param.f32 %f1, [__cudaparm__Z16addArrayOnDevicePfff_a]; ld.param.f32 %f2, [__cudaparm__Z16addArrayOnDevicePfff_b]; add.f32 %f3, %f1, %f2; mov.f32 %f4, %f3; ld.param.u64 %rd1, [__cudaparm__Z16addArrayOnDevicePfff_c]; st.global.f32 [%rd1+0], %f4; exit; } // _Z16addArrayOnDevicePfff """ t1 = time.time() module = ptx_exec.compile(kernel) t2 = time.time() print "compile time", t2 - t1 a = 1.0 b = 2.0 ptx_mem_addr = ptx_exec.alloc_device(4) mem = extarray.extarray("f", 1) # mem.set_memory(ptx_mem_addr, 4) mem[0] = 5.0 print ptx_mem_addr, type(ptx_mem_addr) print mem.buffer_info()[0], type(mem.buffer_info()[0]) param_list = [ptx_mem_addr, a, b] # image, dev num, (x, y, w, h) ptx_exec.copy_htod(ptx_mem_addr, mem.buffer_info()[0], 4) t1 = time.time() ptx_exec.run_stream(module, (1, 1, 1, 1, 1), (ptx_exec.u64, ptx_exec.f32, ptx_exec.f32), param_list) t2 = time.time() print "run time", t2 - t1 print "X", mem.buffer_info()[0], ptx_mem_addr ptx_exec.copy_dtoh(mem.buffer_info()[0], ptx_mem_addr, 4) print param_list print mem # ptx_exec.free(input) # ptx_exec.free(output) ##ptx_exec.free(glob) # ptx_exec.unload_module(image) return
def TestVecIter(n_spus = 1):
    """Double every 32-bit element of an array on the SPU(s) using
    spu_vec_iter over a streamed buffer, then verify the results."""
    count = 1024
    data = extarray.extarray('I', range(count))
    buffer_size = 16

    prgm = env.ParallelProgram() if n_spus > 1 else env.Program()
    code = prgm.get_stream()

    # Initial variable allocation; rebound by the vector iterator below.
    value = var.SignedWord(0, code)

    buf_stream = stream_buffer(code, data.buffer_info()[0], count * 4,
                               buffer_size, 0, save = True)
    if n_spus > 1:
        buf_stream = parallel(buf_stream)

    desc = memory_desc('i', 0, buffer_size)
    for buf in buf_stream:
        for value in spu_vec_iter(code, desc):
            value.v = value + value

    prgm.add(code)
    env.Processor().execute(prgm, n_spus = n_spus)

    for i in range(count):
        assert(data[i] == i + i)
    return
def TestStreamBufferDouble(n_spus = 1):
    """Double every element of an array via a double-buffered SPU stream,
    then verify the results in main memory."""
    count = 2048
    data = extarray.extarray('I', range(count))
    buffer_size = 32

    prgm = env.ParallelProgram() if n_spus > 1 else env.Program()
    code = prgm.get_stream()
    value = var.SignedWord(0, code)

    #print 'addr 0x%(addr)x %(addr)d' % {'addr':a.buffer_info()[0]}, n_bytes, buffer_size
    buf_stream = stream_buffer(code, data.buffer_info()[0], count * 4,
                               buffer_size, 0, buffer_mode='double', save = True)
    if n_spus > 1:
        buf_stream = parallel(buf_stream)

    for buf in buf_stream:
        # Walk the local-store buffer one quadword (16 bytes) at a time.
        for lsa in syn_iter(code, buffer_size, 16):
            code.add(spu.lqx(value, lsa, buf))
            value.v = value + value
            code.add(spu.stqx(value, lsa, buf))

    prgm.add(code)
    env.Processor().execute(prgm, n_spus = n_spus)

    for i in range(len(data)):
        assert(data[i] == i + i)
    return
def load_double(code, reg, val):
    """Synthesize x86-64 code that loads the 64-bit float val into reg.

    The double's raw bits are pushed on the stack, broadcast into the
    register with pshufd (imm 0x44 replicates the low qword into both
    halves), then rsp is restored.
    """
    data = extarray.extarray('d', (val,))
    data.change_type('L')  # reinterpret the double's bits as a 64-bit integer
    # reg better be an mmx or xmm, should we check?
    code.add(x86.push(data[0]))
    code.add(x86.pshufd(reg, mem.MemRef(regs.rsp, data_size = 128), 0x44))
    return code.add(x86.add(regs.rsp, 8))  # pop the temporary without a register
def get_asm_glue(dest):
    """Build a tiny executable x86-64 thunk that invokes the code at dest.

    The literal bytes below encode (NOTE(review): decoded by inspection --
    verify against a disassembler): mov rax, <dest>; push r12;
    mov r12, rsp; and rsp, -16 (align stack); call rax; mov rsp, r12;
    pop r12; then a computed jump back past the thunk.
    Returns the byte array, made executable, holding a reference to dest
    so it stays alive as long as the glue does.
    """
    l = [72, 184]  # opcode prefix: mov rax, imm64
    # Append the 8 little-endian bytes of the destination address.
    l.extend(struct.unpack("8B", struct.pack("l", ctypes.cast(dest, ctypes.c_void_p).value)))
    l.extend([73, 84, 73, 137, 228, 72, 131, 228, 240, 72, 255, 208,
              76, 137, 228, 73, 92, 72, 88, 72, 131, 232, 13, 72, 255, 224])
    l = extarray('B', l)
    make_executable(*l.buffer_info())
    # Keep the target callable alive for the lifetime of the glue code.
    l.references.append(dest)
    return l
def load_float(code, reg, val, clear = False):
    """Synthesize x86-64 code that loads the 32-bit float val into reg.

    The float's raw bits are pushed on the stack, broadcast into the
    register with pshufd (imm 0 replicates the low dword into all four
    lanes), then rsp is restored.

    NOTE(review): the `clear` parameter is accepted but never used here --
    confirm whether callers rely on it.
    """
    data = extarray.extarray('f', (val,))
    data.change_type('I')  # reinterpret the float's bits as a 32-bit integer
    # reg better be an mmx or xmm, should we check?
    code.add(x86.push(data[0]))
    code.add(x86.pshufd(reg, mem.MemRef(regs.rsp, data_size = 128), 0))
    return code.add(x86.add(regs.rsp, 8))  # pop the temporary without a register
def generate(self, results, pattern, r1_range, r2_range, max_init, max_n, size):
    """Render a Lyapunov-fractal block on the SPU (InstructionStream API).

    results  -- output array (unused in the active code path; see the
                commented reference implementation at the end)
    pattern  -- string of '1'/'0' selecting which r drives each iteration
    r1_range, r2_range -- (min, max) parameter ranges
    max_init, max_n    -- warm-up and measurement iteration counts
    size     -- (width, height) of the block in points
    """
    # Setup the range parameter array
    r1_inc = (r1_range[1] - r1_range[0]) / size[0]
    r2_inc = (r2_range[1] - r2_range[0]) / size[1]

    # 16 floats: 4x r1 start, 4x r2 start, 4x r1 step, 4x r2 step
    # (one value per SIMD lane).
    ranges = extarray.extarray('f', [0.0] * 16)
    for i in range(4):
        ranges[i] = r1_range[0]
        ranges[4 + i] = r2_range[0]
        ranges[8 + i] = r1_inc
        ranges[12 + i] = r2_inc

    # Setup the pattern vector
    bits = _pattern2vector(pattern)

    # Copy the paramters to aligned buffers
    #a_ranges = synspu.aligned_memory(len(ranges), typecode='I')
    #a_ranges.copy_to(ranges.buffer_info()[0], len(ranges))
    #a_pattern = synspu.aligned_memory(len(bits), typecode='I')
    #a_pattern.copy_to(bits.buffer_info()[0], len(bits))

    renderer = MailboxRenderer()
    ly_block = LyapunovBlock()

    ly_block.set_size(size[0], size[1])
    #ly_block.set_range(a_ranges)
    #ly_block.set_pattern(a_pattern)
    ly_block.set_range(ranges)
    ly_block.set_pattern(bits)
    ly_block.set_max_init(max_init)
    ly_block.set_max_n(max_n)
    ly_block.set_renderer(renderer)

    code = synspu.InstructionStream()
    ly_block.synthesize(code)

    proc = synspu.Processor()
    spe_id = proc.execute(code, async=True)

    # Drain one outbound-mailbox message per rendered point.
    for i in range(size[0] * size[1]):
        while synspu.spu_exec.stat_out_mbox(spe_id) == 0:
            pass
        print 'ly said: 0x%X' % (synspu.spu_exec.read_out_mbox(spe_id))

    proc.join(spe_id)

    # for x in range(size[0]):
    #   r2 = r2_range[0] + r2_inc
    #   print 'col:', x, r1, r2
    #   for y in range(size[1]):
    #     results[y, x] = lyapunov_point(pattern, r1, r2, max_init, max_n)
    #     r2 += r2_inc
    #   r1 += r1_inc
    return
def _set_literal_value(self, value): if type(value) is _array_type: if value.typecode not in self.array_typecodes: raise Exception("Array typecode '%s' is not supported" % (value.typecode,)) if len(value) < INT_ARRAY_SIZES[self.array_typecode]: print 'Warning: Variable array initializer has fewer elements than the corresponding vector: %d < %d' % ( len(value), INT_ARRAY_SIZES[self.array_typecode]) util.load_vector(self.code, self.reg, value.buffer_info()[0]) self.storage = value elif type(value) in (list, tuple): if len(value) < INT_ARRAY_SIZES[self.array_typecode]: print 'Warning: Variable %s initializer has fewer elements than the corresponding vector: %d < %d' % ( type(value), len(value), INT_ARRAY_SIZES[self.array_typecode]) storage = extarray.extarray(self.array_typecode, value) util.load_vector(self.code, self.reg, storage.buffer_info()[0]) self.storage = storage elif type(value) in self.literal_types: if (value & 0x1F) == value and isinstance(self, (SignedByteType, SignedHalfwordType, SignedWordType)): # Use the splat instructions if isinstance(self, SignedByteType): self.code.add(vmx.vspltisb(self.reg, value)) elif isinstance(self, SignedHalfwordType): self.code.add(vmx.vspltish(self.reg, value)) elif isinstance(self, SignedWordType): self.code.add(vmx.vspltisw(self.reg, value)) else: raise Exception('Unsupported typecode for vector literal splat: ' + str(type(self))) else: splat = [self.value for i in xrange(INT_ARRAY_SIZES[self.array_typecode])] vsplat = extarray.extarray(self.array_typecode, splat) util.load_vector(self.code, self.reg, vsplat.buffer_info()[0]) self.code.prgm.add_storage(vsplat) self.storage = vsplat self.value = value if self.storage is not None: self.code.prgm.add_storage(self.storage) return
def TestParamsFull():
    """Parameter-passing test using the high-level PTX Program API:
    the generated kernel stores (a + b + 1.0) to the device pointer, and
    the launch is timed over 20 runs.
    """
    import time
    import corepy.arch.ptx.isa as isa
    import corepy.arch.ptx.types.registers as regs

    proc = Processor(0)

    # build and run the kernel
    prgm = Program()
    code = prgm.get_stream()

    _mem = prgm.add_parameter("u64", name="_mem")
    _a = prgm.add_parameter("f32", name="_a")
    _b = prgm.add_parameter("f32", name="_b")

    rd1 = prgm.acquire_register("u64")
    r1 = prgm.acquire_register("f32")
    r2 = prgm.acquire_register("f32")
    r3 = prgm.acquire_register("f32")
    r4 = prgm.acquire_register("f32")
    v1 = prgm.add_variable("shared", "f32")  # don't need this, but let's test add_variable

    code.add(isa.ld("param", r1, regs.ptxAddress(_a)))
    code.add(isa.ld("param", r2, regs.ptxAddress(_b)))
    code.add(isa.add(r3, r2, r1))
    code.add(isa.add(r3, r3, 1.0))
    code.add(isa.mov(r4, r3))
    code.add(isa.ld("param", rd1, regs.ptxAddress(_mem)))
    code.add(isa.st("global", regs.ptxAddress(rd1), r4))

    prgm.add(code)
    prgm.cache_code()

    a = 1.0
    b = 2.0
    # Device output buffer and host staging array.
    ptx_mem_addr = proc.alloc_device("f32", 1)
    mem = extarray.extarray("f", 1)
    mem[0] = 5.0

    param_list = [ptx_mem_addr.address, a, b]
    proc.copy(ptx_mem_addr, mem)
    prgm.cache_code()

    # Time 20 launches of a 1x1x1 grid/block.
    for i in range(20):
        t1 = time.time()
        proc.execute(prgm, (1, 1, 1, 1, 1), param_list)
        t2 = time.time()
        print "run time", t2 - t1

    print "#####"
    print "X", mem.buffer_info()[0], ptx_mem_addr.address
    proc.copy(mem, ptx_mem_addr)
    print param_list
    print mem
    return
def MD5(s): digest = extarray.extarray('B', 16) length = len(s) context = Context() MD5Init(context) MD5Update(context, s, length) MD5Final(digest, context) print map(hex, map(int, digest))
def py_nbody():
    """Allocate the global n-body arrays and randomize positions/masses.

    Positions are uniform in [-1, 1], velocities start at rest, and masses
    are uniform in [1e9, 1e10].
    """
    global x, y, vx, vy, m

    x = extarray.extarray('f', N_BODIES)
    y = extarray.extarray('f', N_BODIES)
    vx = extarray.extarray('f', N_BODIES)
    vy = extarray.extarray('f', N_BODIES)
    m = extarray.extarray('f', N_BODIES)

    for i in xrange(N_BODIES):
        x[i] = random.uniform(-1.0, 1.0)
        y[i] = random.uniform(-1.0, 1.0)
        vx[i] = 0.0
        vy[i] = 0.0
        m[i] = random.uniform(1e9, 1e10)
    return
def join(self, ti):
    """Block until SPU thread ti completes and return its result.

    The result is register 4 interpreted per the thread's mode (int or
    float), paired with the stop code payload when ti.stop is nonzero.
    The thread's context is freed before returning.
    """
    if not isinstance(ti, spu_exec.ThreadInfo):
        raise Exception('Invalid thread handle: ' + str(ti))

    spu_exec.wait_stream(ti)

    retval = None
    if ti.mode in (self.MODE_INT, self.MODE_FP):
        # Pick the array typecode and Python conversion for the mode.
        typecode, convert = (('I', int) if ti.mode == self.MODE_INT
                             else ('f', float))
        regs = extarray.extarray(typecode, 128 * 4)
        spu_exec.get_spu_registers(ti, regs.buffer_info()[0])
        retval = convert(regs[4])

    if ti.stop != 0:
        retval = (retval, spu_exec.get_result(ti))

    spu_exec.free_context(ti)
    return retval
def join(self, ti):
    """Block until SPU context ti completes and return its result.

    Register 4 is converted per the context's mode (int or float); when
    ti.stop is nonzero the stop payload is attached as a tuple.  The
    context is freed before returning.
    """
    if not isinstance(ti, spu_exec.Context):
        raise TypeError("Invalid context: " + str(ti))

    spu_exec.wait_stream(ti)

    # Dispatch table: mode -> (array typecode, converter).
    mode_map = {self.MODE_INT: ("I", int), self.MODE_FP: ("f", float)}
    if ti.mode in mode_map:
        typecode, convert = mode_map[ti.mode]
        regs = extarray.extarray(typecode, 128 * 4)
        spu_exec.get_spu_registers(ti, regs.buffer_info()[0])
        retval = convert(regs[4])
    else:
        retval = None

    if ti.stop != 0:
        retval = (retval, spu_exec.get_result(ti))

    spu_exec.free_context(ti)
    return retval
def DoubleBufferExample(n_spus = 6):
    """
    stream_buffer is an iterator that streams data from main memory to SPU
    local store in blocked buffers.  The buffers can be managed using
    single or double buffering semantics.  The induction variable returned
    by the buffer returns the address of the current buffer.

    Note: stream_buffer was designed before memory descriptors and has not
          been updated to support them yet.  The interface will change
          slightly when the memory classes are finalized.
    """
    n = 30000
    buffer_size = 16

    # Create an array and align the data
    a = extarray.extarray('I', range(n))
    addr = a.buffer_info()[0]
    n_bytes = n * 4

    if n_spus > 1:
        code = env.ParallelInstructionStream()
    else:
        code = env.InstructionStream()

    current = SignedWord(0, code)
    two = SignedWord(2, code)

    # Create the stream buffer, parallelizing it if using more than 1 SPU
    stream = stream_buffer(code, addr, n_bytes, buffer_size, 0,
                           buffer_mode='double', save = True)
    if n_spus > 1:
        stream = parallel(stream)

    # Loop over the buffers
    for buffer in stream:
        # Create an iterators that computes the address offsets within the
        # buffer.  Note: this will be supported by var/vec iters soon.
        for lsa in syn_iter(code, buffer_size, 16):
            # Load a quadword, subtract two from each word, store it back.
            code.add(spu.lqx(current, lsa, buffer))
            current.v = current - two
            code.add(spu.stqx(current, lsa, buffer))

    # Run the synthetic program and copy the results back to the array
    proc = env.Processor()
    r = proc.execute(code, n_spus = n_spus)

    for i in range(2, len(a)):
        try:
            assert(a[i] == i - 2)
        except:
            print 'DoubleBuffer error:', a[i], i - 2
    return
def __init__(self, number):
    """Per-stream MD5 state for `number` parallel streams.

    Each stream gets one slot in the A/B/C/D state arrays, two count
    words, and a 64-byte block buffer.
    """
    self.number = number
    for attr in ('statea', 'stateb', 'statec', 'stated'):
        setattr(self, attr, extarray.extarray('I', number))
    self.count = extarray.extarray('I', 2 * number)
    self.buffer = extarray.extarray('B', 64 * number)
def setup(self, code):
    """Initialize the addressing variables and channel-packing constants.

    Requires self.addr and self._stride to be set beforehand; raises
    otherwise.
    """
    if self.addr is None:
        raise Exception('Please set addr')
    if self._stride is None:
        raise Exception('Please set stride')

    self.x_offset = var.Word(0)
    self.y_offset = var.Word(self.addr)
    self.stride = var.Word(self._stride * 4)  # stride in bytes (4 bytes/word)

    # Mask to extract the lowest 2 bytes from each word in the first vector
    # into RGB and the first byte from the second vector into A
    self.uint2rgba = var.Word(extarray.extarray('I',
        [0x01030303, 0x10070707, 0x100B0B0B, 0x100F0F0F]))
    self.ff = var.Word(0xFF000000)  # alpha byte mask
    return
def __init__(self):
    """Set up the instruction stream and register snapshot buffer, then
    synthesize the program body."""
    # Code and memory buffers: 128 quadword registers, 4 words each.
    self.code = env.InstructionStream()
    self.regs = extarray.extarray('I', 128 * 4)
    self.regs.clear()

    # Runtime parameters, populated when the program is executed.
    self.speid = self.reg_lsa = self.proc = None

    self.synthesize()
    return
def TestStreamBufferSingle(n_spus = 1):
    """Double every element of an array via a single-buffered SPU stream,
    then verify the results in main memory."""
    count = 1024
    data = extarray.extarray('I', range(count))
    buffer_size = 128

    prgm = env.ParallelProgram() if n_spus > 1 else env.Program()
    code = prgm.get_stream()
    value = var.SignedWord(0, code)

    buf_stream = stream_buffer(code, data.buffer_info()[0], count * 4,
                               buffer_size, 0, save = True)
    if n_spus > 1:
        buf_stream = parallel(buf_stream)

    for buf in buf_stream:
        # Walk the local-store buffer one quadword (16 bytes) at a time.
        for lsa in syn_iter(code, buffer_size, 16):
            code.add(spu.lqx(value, lsa, buf))
            value.v = value + value
            code.add(spu.stqx(value, lsa, buf))

    prgm.add(code)
    env.Processor().execute(prgm, n_spus = n_spus)

    for i in range(count):
        assert(data[i] == i + i)
    return
def TestVecIter():
    """Exercise vector_iter over every integer typecode (and floats):
    each element is added to itself with the matching VMX saturating add,
    and all arrays are checked against the doubled sequence."""
    prgm = synppc.Program()
    code = prgm.get_stream()
    prgm.add(code)
    ppc.set_active_code(code)

    a = extarray.extarray('I', range(16))
    for i in vector_iter(code, a):
        i.v = vmx.vadduws.ex(i, i)

    ai = extarray.extarray('i', range(16))
    for i in vector_iter(code, ai):
        i.v = vmx.vaddsws.ex(i, i)

    b = extarray.extarray('H', range(16))
    for i in vector_iter(code, b):
        i.v = vmx.vadduhs.ex(i, i)

    bi = extarray.extarray('h', range(16))
    for i in vector_iter(code, bi):
        i.v = vmx.vaddshs.ex(i, i)

    c = extarray.extarray('B', range(16))
    for i in vector_iter(code, c):
        i.v = vmx.vaddubs.ex(i, i)

    ci = extarray.extarray('b', range(16))
    for i in vector_iter(code, ci):
        i.v = vmx.vaddsbs.ex(i, i)

    # NOTE(review): ften is never read afterwards, but constructing it under
    # the active code stream may emit load instructions -- left in place.
    ften = vmx_vars.BitType(10.0)
    f = extarray.extarray('f', range(16))
    for i in vector_iter(code, f):
        i.v = vmx.vaddfp.ex(i, i)

    proc = synppc.Processor()
    r = proc.execute(prgm)

    expected = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
    _array_check(a, expected)
    _array_check(ai, expected)
    _array_check(b, expected)
    _array_check(bi, expected)
    _array_check(c, expected)
    _array_check(ci, expected)
    _array_check(f, expected)
    return
def alloc_remote(self, typecode, comps, width, height = 1, globl = False):
    """Allocate remote (main) memory as a CAL resource and wrap it in an
    extarray with GPU bookkeeping attributes.

    typecode -- 'f', 'i', or 'I' (32-bit float / signed / unsigned)
    comps    -- components per element: 1, 2, or 4
    """
    # Format lookup replaces the original if/elif ladder; the raised
    # messages are unchanged.
    format_table = {
        'f': {1: cal_exec.FMT_FLOAT32_1,
              2: cal_exec.FMT_FLOAT32_2,
              4: cal_exec.FMT_FLOAT32_4},
        'i': {1: cal_exec.FMT_SIGNED_INT32_1,
              2: cal_exec.FMT_SIGNED_INT32_2,
              4: cal_exec.FMT_SIGNED_INT32_4},
        'I': {1: cal_exec.FMT_UNSIGNED_INT32_1,
              2: cal_exec.FMT_UNSIGNED_INT32_2,
              4: cal_exec.FMT_UNSIGNED_INT32_4},
    }

    if typecode not in format_table:
        raise Exception("Unsupported data type: " + str(typecode))
    if comps not in format_table[typecode]:
        raise Exception("Number of components must be 1, 2, or 4")
    fmt = format_table[typecode][comps]

    if globl:
        globl = cal_exec.GLOBAL_BUFFER

    # Allocate and initialize the memory
    # TODO - more operand error checking
    mem = cal_exec.alloc_remote(self.device, fmt, width, height, globl)

    arr = extarray.extarray(typecode, 0)
    arr.data_len = mem[1] * height * comps
    arr.set_memory(mem[0], arr.data_len * 4)
    arr.gpu_mem_handle = mem
    arr.gpu_device = self.device
    arr.gpu_width = width
    arr.gpu_pitch = mem[1]
    return arr
def _pattern2vector(pattern):
    """Tile a string of '1'/'0' characters out to 128 bits and pack it
    into four 32-bit words (most significant bit first in each word)."""
    if 128 % len(pattern) != 0:
        raise Exception('Pattern length must be a factor of 128')

    tiled = pattern * (128 / len(pattern))
    words = extarray.extarray('I', [0, 0, 0, 0])
    bits_per_word = 128 / 4

    for j in range(4):
        for i in range(bits_per_word):
            if int(tiled[bits_per_word * j + i]) == 1:
                words[j] |= 1 << (bits_per_word - i - 1)
    return words
def TestContinueLabel(n_spus = 1):
    """Exercise the iterator's continue_label: every vector is doubled once,
    and doubled a second time only when it contains the value 4 -- so the
    first four elements end up quadrupled and the rest doubled."""
    n = 1024
    a = extarray.extarray('I', range(n))
    buffer_size = 16

    if n_spus > 1:
        prgm = env.ParallelProgram()
    else:
        prgm = env.Program()
    code = prgm.get_stream()

    current = var.SignedWord(0, code)
    test = var.SignedWord(0, code)
    four = var.SignedWord(4, code)

    stream = stream_buffer(code, a.buffer_info()[0], n * 4, buffer_size, 0, save = True)
    if n_spus > 1:
        stream = parallel(stream)

    md = memory_desc('i', 0, buffer_size)
    lsa_iter = spu_vec_iter(code, md)

    for buffer in stream:
        for current in lsa_iter:
            current.v = current + current
            # Compare against 4 and gather the per-lane results into bits.
            test.v = (current == four)
            code.add(spu.gbb(test, test))
            #lbl_continue = code.add(spu.stop(0xC)) - 1 # Place holder for the continue
            #lsa_iter.add_continue(code, 0, lambda lbl, reg = test.reg: spu.brz(reg, lbl))
            # Skip the second doubling unless some lane matched.
            code.add(spu.brz(test.reg, lsa_iter.continue_label))
            current.v = current + current
            #lsa_iter.add_continue(code, lbl_continue, lambda next, reg = test.reg: spu.brz(reg, next))

    prgm.add(code)
    proc = env.Processor()
    r = proc.execute(prgm, n_spus = n_spus)

    for i in range(0, n):
        if i >= 4:
            assert(a[i] == i + i)
        else:
            #print a[i]
            assert(a[i] == i * 4)
    return
def TestSynIterDec():
    """Exercise syn_iter in DEC (decrementing) mode on the PTX backend:
    a counter is incremented 5 times in the synthesized loop and stored to
    device memory, which is then printed for inspection."""
    import corepy.arch.ptx.isa as ptx
    import corepy.arch.ptx.types.registers as regs

    SIZE = 64
    proc = env.Processor(0)

    # build and run the kernel
    prgm = env.Program()
    code = prgm.get_stream()

    _mem = prgm.add_parameter('u64', name='_mem')
    memp = prgm.acquire_register('u64')
    counter = prgm.acquire_register('u32')

    code.add(ptx.ld('param', memp, regs.ptxAddress(_mem)))
    code.add(ptx.mov(counter, 0))
    for i in syn_iter(code, 5, step=1, mode=DEC):
        code.add(ptx.add(counter, counter, 1))
    code.add(ptx.st('global', regs.ptxAddress(memp), counter))

    prgm.add(code)

    # Device output buffer and host staging array.
    ptx_mem_addr = proc.alloc_device('u32', 1)
    mem = extarray.extarray('I', 1)
    mem[0] = 5

    param_list = [ptx_mem_addr.address,]
    proc.copy(ptx_mem_addr, mem)
    prgm.cache_code()
    print prgm.render_string

    proc.execute(prgm, (1, 1, 1, 1, 1), param_list)
    proc.copy(mem, ptx_mem_addr)
    print mem

    #passed = True
    #for i in xrange(0, SIZE):
    #  if ext_output[i] != 5:
    #    passed = False
    #print "Passed == ", passed
    return
def __init__(self, app, parent, id, style, size = (-1, -1)):
    """Editable list control showing SPU register values.

    app -- owning application object, kept for callbacks
    parent/id/style/size -- forwarded to wx.ListCtrl
    """
    wx.ListCtrl.__init__(self, parent, id, size = size, style = style)
    listmix.TextEditMixin.__init__(self)

    # Monospace font so register hex values line up.
    self.attr = wx.ListItemAttr()
    self.attr.SetFont(wx.Font(11,
        wx.FONTFAMILY_TELETYPE, wx.FONTSTYLE_NORMAL, wx.FONTWEIGHT_NORMAL))

    #self.attr_red = wx.ListItemAttr()
    #self.attr_red.SetFont(wx.Font(11,
    #    wx.FONTFAMILY_TELETYPE, wx.FONTSTYLE_NORMAL, wx.FONTWEIGHT_NORMAL))
    #self.attr_red.SetTextColour(wx.RED)

    self.Bind(wx.EVT_LIST_BEGIN_LABEL_EDIT, self.OnBeginEdit)

    self.app = app
    # Snapshot of the 128 quadword (4-word) SPU registers.
    self._cur_regs = extarray.extarray('I', 128 * 4)
    #self._prev_regs = extarray.extarray('I', 128 * 4)
    #self._prev_regs.clear()
    return
def alloc_remote(self, typecode, comps, width, height = 1, globl = False):
    """Allocate an ExtArray backed by remote (main) memory."""
    fmt = self._get_fmt(typecode, comps)

    if globl:
        globl = cal_exec.GLOBAL_BUFFER

    # Allocate and initialize the memory
    # TODO - more operand error checking
    mem = cal_exec.alloc_remote(self.device, fmt, width, height, globl)

    arr = extarray.extarray(typecode, 0)
    # NOTE(review): mem appears to be a tuple with the pointer at [1] and the
    # pitch (elements per row) at [2] -- confirm against cal_exec.
    arr.data_len = mem[2] * height * comps
    arr.set_memory(mem[1], arr.data_len * 4)

    # GPU bookkeeping consumed by binding/copy helpers.
    arr.gpu_mem_handle = mem
    arr.gpu_device = self.device
    arr.gpu_width = width
    arr.gpu_pitch = mem[2]
    arr.gpu_height = height
    return arr
def _startSPU(self): self.ctx = ctx = env.spu_exec.alloc_context() # Execute a no-op instruction stream so the prolog is executed prgm = env.Program() code = prgm.get_stream() code.add(spu.nop(code.r_zero)) prgm.cache_code() itemsize = prgm.render_code.itemsize code_len = len(prgm.render_code) * itemsize if code_len % 16 != 0: code_len += 16 - (code_len % 16) code_lsa = 0x40000 - code_len env.spu_exec.run_stream(ctx, prgm.inst_addr(), code_len, code_lsa, code_lsa) self.localstore = extarray.extarray('I', 262144 / 4) print "spuls %x" % (ctx.spuls), ctx.spuls, type(ctx.spuls) self.localstore.set_memory(ctx.spuls, 262144) return
def generate(self, results, pattern, r1_range, r2_range, max_init, max_n, size):
    """Render a Lyapunov-fractal block on the SPU (Program API variant).

    results  -- output array (unused in the active code path; see the
                commented reference implementation at the end)
    pattern  -- string of '1'/'0' selecting which r drives each iteration
    r1_range, r2_range -- (min, max) parameter ranges
    max_init, max_n    -- warm-up and measurement iteration counts
    size     -- (width, height) of the block in points
    """
    # Setup the range parameter array
    r1_inc = (r1_range[1] - r1_range[0]) / size[0]
    r2_inc = (r2_range[1] - r2_range[0]) / size[1]

    # 16 floats: 4x r1 start, 4x r2 start, 4x r1 step, 4x r2 step
    # (one value per SIMD lane).
    ranges = extarray.extarray('f', [0.0] * 16)
    for i in range(4):
        ranges[i] = r1_range[0]
        ranges[4 + i] = r2_range[0]
        ranges[8 + i] = r1_inc
        ranges[12 + i] = r2_inc

    # Setup the pattern vector
    bits = _pattern2vector(pattern)

    # Copy the paramters to aligned buffers
    #a_ranges = synspu.aligned_memory(len(ranges), typecode='I')
    #a_ranges.copy_to(ranges.buffer_info()[0], len(ranges))
    #a_pattern = synspu.aligned_memory(len(bits), typecode='I')
    #a_pattern.copy_to(bits.buffer_info()[0], len(bits))

    renderer = MailboxRenderer()
    ly_block = LyapunovBlock()

    ly_block.set_size(size[0], size[1])
    #ly_block.set_range(a_ranges)
    #ly_block.set_pattern(a_pattern)
    ly_block.set_range(ranges)
    ly_block.set_pattern(bits)
    ly_block.set_max_init(max_init)
    ly_block.set_max_n(max_n)
    ly_block.set_renderer(renderer)

    prgm = synspu.Program()
    code = prgm.get_stream()
    prgm += code
    ly_block.synthesize(code)

    proc = synspu.Processor()
    spe_id = proc.execute(prgm, async=True)

    # Drain one outbound-mailbox message per rendered point.
    for i in range(size[0] * size[1]):
        while synspu.spu_exec.stat_out_mbox(spe_id) == 0:
            pass
        print 'ly said: 0x%X' % (synspu.spu_exec.read_out_mbox(spe_id))

    proc.join(spe_id)

    # for x in range(size[0]):
    #   r2 = r2_range[0] + r2_inc
    #   print 'col:', x, r1, r2
    #   for y in range(size[1]):
    #     results[y, x] = lyapunov_point(pattern, r1, r2, max_init, max_n)
    #     r2 += r2_inc
    #   r1 += r1_inc
    return
def dump_regs(self): mbox = 28 # write out mbox channel # Pseudo-code: # 1) Save code is: (do this as an array, not an instruction stream) save_size = 128 * 2 + 4 save_code = extarray.extarray('I', range(save_size)) for i in range(0, 128 * 2, 2): save_code[i] = spu.wrch(i / 2, mbox, ignore_active=True).render() save_code[i + 1] = spu.stop(0x6, ignore_active=True).render() # branch back to the debug stop save_code[128 * 2] = spu.stop(0x7, ignore_active=True).render() ret = spu.bra(self.debug_lsa, ignore_active=True) save_code[128 * 2 + 1] = ret.render() #aligned_save_code = aligned_memory(save_size, typecode = 'I') #aligned_save_code.copy_to(save_code.buffer_info()[0], len(save_code)) # 2) Save lsa[0:len(save_code)] # TODO: do this with putb # 3) Push save code to lsa[0:] tag = 2 spu_exec.spu_getb(self.spe_id, 0, save_code.buffer_info()[0], save_size * 4, tag, 0, 0) spu_exec.read_tag_status_all(self.spe_id, 1 << tag) # 3) Replace the debug branch with a branch to 0 self.replace(self.debug_branch, spu.bra(0, ignore_active=True)) self.get_instructions() # 4) Resume self.resume(self.spe_id) # 5) Read the register values and send the ok signal regs = [] for i in range(128): while spu_exec.stat_out_mbox(self.spe_id) == 0: pass value = spu_exec.read_out_mbox(self.spe_id) regs.append(value) r = spu_exec.wait_stop_event(self.spe_id) self.resume(self.spe_id) r = spu_exec.wait_stop_event(self.spe_id) print 'next stop', r # 6) Restore code at original pc self.restore(self.debug_branch) self.get_instructions() # 7) Restore lsa[0:len(save_code)] # TODO: do this with putb # 8) Resume # self.resume(self.spe_id) # r = spu_exec.wait_stop_event(self.spe_id) self.resume(self.spe_id) r = self.wait_debug() return regs
def _synthesize_prologue(self):
    """ Create the prologue. (see PPC ABI p41)

    This manages the register preservation requirements from the ABI.

    TODO: CR2-4 need to be preserved.
    """
    # Reset the prologue
    self._prologue = [self.lbl_prologue]

    # Add the instructions to save the registers
    r_addr = GPRegister(13)  # Only available volatile register
    r_idx = GPRegister(14)   # Non-volatile; safe to use before restoring

    # TODO - AWF - don't want to push things on the stack, that changes the
    # relative location of the passed-in arguments
    # However, we could just use the stack to save all the registers, and use
    # a frame pointer to give access to the arguments

    # Registers are saved into heap-allocated arrays rather than the stack;
    # the arrays are kept on self so the addresses stay valid.
    save_gp = [r for r in self._used_registers[GPRegister] if r in gp_save]
    if len(save_gp) > 0:
        # Save GP registers
        self._saved_gp_registers = array.array('I', xrange(len(save_gp)))
        self._load_word(self._prologue, r_addr, self._saved_gp_registers.buffer_info()[0])
        for i, reg in enumerate(save_gp):
            self._prologue.append(
                ppc.stw(reg, r_addr, i * WORD_SIZE, ignore_active=True))

    save_fp = [r for r in self._used_registers[FPRegister] if r in fp_save]
    if len(save_fp) > 0:
        # Save FP registers (doubles: two words per register)
        self._saved_fp_registers = array.array('d', xrange(len(save_fp)))
        self._load_word(self._prologue, r_addr, self._saved_fp_registers.buffer_info()[0])
        for i, reg in enumerate(save_fp):
            self._prologue.append(
                ppc.stfd(reg, r_addr, i * WORD_SIZE * 2, ignore_active=True))

    if self._enable_vmx:
        save_vx = [r for r in self._used_registers[VMXRegister] if r in vx_save]
        if len(save_vx) > 0:
            # Save VMX registers (four words per register, indexed stores)
            self._saved_vx_registers = extarray.extarray(
                'I', range(len(save_vx) * 4))
            self._load_word(self._prologue, r_addr, self._saved_vx_registers.buffer_info()[0])
            for i, reg in enumerate(save_vx):
                self._load_word(self._prologue, r_idx, i * WORD_SIZE * 4)
                self._prologue.append(
                    vmx.stvx(reg, r_idx, r_addr, ignore_active=True))

        # Set up VRSAVE
        # Currently, we save the old value of VRSAVE in r31.
        # On the G4, someone stomps on registers < 20 ... save them all for now.

        # Save vrsave and put our value in it
        self._prologue.append(ppc.mfvrsave(self._vrsave, ignore_active=True))
        self._load_word(self._prologue, r_addr, 0xFFFFFFFF)
        self._prologue.append(ppc.mtvrsave(r_addr, ignore_active=True))
    return
def TestLiterals():
    """Exercise scalar (splat) and array literal initializers for VMX
    vector types: every signed/unsigned pair cancels out, so the combined
    result vector must be all zeros."""
    import corepy.arch.ppc.platform as env

    prgm = env.Program()
    code = prgm.get_stream()
    prgm += code
    proc = env.Processor()

    ppc.set_active_code(code)
    vmx.set_active_code(code)

    # NOTE(review): zero is never used after construction; constructing it
    # under the active code stream may still emit instructions.
    zero = Bits.cast(SignedByte(0))
    target = Bits()

    # Signed versions use splat, unsigned arrays
    b = Byte(2)
    sb = SignedByte(-2)
    vmx.vaddsbs(b, b, sb)

    h = Halfword(9999)
    sh = SignedHalfword(-9999)
    vmx.vaddshs(h, h, sh)

    w = Word(99999)
    sw = SignedWord(-99999)
    vmx.vaddsws(w, w, sw)

    # Combine the results (should be [0,0,0,0])
    vmx.vor(target, b, h)
    vmx.vor(target, target, w)

    # Array initializers
    b = Byte(range(16))
    sb = SignedByte(range(16))
    vmx.vsubsbs(b, b, sb)
    vmx.vor(target, target, b)

    h = Halfword([9999, 9998, 9997, 9996, 9995, 9994, 9993, 9992])
    sh = SignedHalfword([9999, 9998, 9997, 9996, 9995, 9994, 9993, 9992])
    vmx.vsubshs(h, h, sh)
    vmx.vor(target, target, h)

    w = Word([99999, 99998, 99997, 99996])
    sw = SignedWord([99999, 99998, 99997, 99996])
    vmx.vsubsws(w, w, sw)
    target.v = vmx.vor.ex(target, w)

    # Store the accumulated result over a sentinel array and check it.
    result = extarray.extarray('I', [42, 42, 42, 42])
    r_addr = prgm.acquire_register()
    util.load_word(code, r_addr, result.buffer_info()[0])
    vmx.stvx(target, 0, r_addr)

    ppc.set_active_code(None)
    vmx.set_active_code(None)

    r = proc.execute(prgm)
    print result
    for i in result:
        assert (i == 0)
    # for i in result: print '%08X' % i,
    # print
    return
# Demonstration of concurrent increments on shared memory; this first
# program increments data[0] ITERS times WITHOUT a LOCK prefix, so running
# it from multiple threads loses updates.
import corepy.arch.x86_64.isa as x86
from corepy.arch.x86_64.types.registers import *
import corepy.arch.x86_64.platform as env
from corepy.arch.x86_64.lib.memory import MemRef
import corepy.lib.extarray as extarray
import corepy.arch.x86_64.lib.util as util
import time

ITERS = 1000000
THREADS = 4

# Shared 64-bit counter; dbi[0] is its address.
data = extarray.extarray('l', 1)
dbi = data.buffer_info()

# This first case is intentionally wrong to show what happens w/o locking.
data[0] = 0

prgm = env.Program()
code = prgm.get_stream()
x86.set_active_code(code)

# rax = increment, rcx = loop counter, rdi = address of data[0]
x86.mov(rax, 1)
x86.mov(rcx, ITERS)
x86.mov(rdi, dbi[0])

lbl_loop = prgm.get_unique_label("loop")
code.add(lbl_loop)
# Non-atomic read-modify-write: add [rdi], rax
x86.add(MemRef(rdi), rax)
x86.dec(rcx)
x86.jnz(lbl_loop)
def ParMD5Transform(parcontext, parblock, blocki):
    """Run one MD5 transform step for `num` independent streams on the GPU.

    Decodes 64-byte blocks for all streams, lays the per-stream MD5 state
    (a, b, c, d) and the 16 message words out as NxN 4-component images,
    executes a cached CAL pixel shader that performs the 64 MD5 rounds,
    and accumulates the shader outputs back into parcontext's state
    arrays (the standard "state += transformed" MD5 step).
    """
    num = parcontext.number
    temp_block = extarray.extarray('I', 16 * num)
    ParDecode(num, temp_block, parblock, blocki, 64)

    proc = env.Processor(0)
    # Streams are packed 4 per pixel into an NxN domain.
    N = int(math.sqrt(num / 4))
    #print "N = ", N

    def address_4_1d(i, pitch=64):
        # Currently an identity mapping; the 2D remap is commented out.
        x = i % N
        y = i // 64 * 4
        #return x*4 + y*pitch*4*4
        return i

    def address_4_2d(x, y, pitch=64):
        # Linear address of pixel (x, y) in a 4-component, `pitch`-wide image.
        return x * 4 + y * pitch * 4

    # GPU-visible buffers: four state planes, 16 message-word planes,
    # and four output planes.
    input_statea = proc.alloc_remote('I', 4, N, N)
    input_stateb = proc.alloc_remote('I', 4, N, N)
    input_statec = proc.alloc_remote('I', 4, N, N)
    input_stated = proc.alloc_remote('I', 4, N, N)
    input_block = [proc.alloc_remote('I', 4, N, N) for i in range(16)]
    outputa = proc.alloc_remote('I', 4, N, N)
    outputb = proc.alloc_remote('I', 4, N, N)
    outputc = proc.alloc_remote('I', 4, N, N)
    outputd = proc.alloc_remote('I', 4, N, N)

    # Scatter the per-stream state into the input images.
    for j in range(N):
        for i in range(N):
            for k in range(4):
                input_statea[address_4_2d(i, j) + k] = parcontext.statea[k + (i + j * N) * 4]
                input_stateb[address_4_2d(i, j) + k] = parcontext.stateb[k + (i + j * N) * 4]
                input_statec[address_4_2d(i, j) + k] = parcontext.statec[k + (i + j * N) * 4]
                input_stated[address_4_2d(i, j) + k] = parcontext.stated[k + (i + j * N) * 4]

    # Scatter message word i of every stream into input_block[i].
    for k in range(N):
        for j in range(N):
            for l in range(4):
                for i in range(16):
                    input_block[i][address_4_2d(j, k) + l] = temp_block[i + (j + k * N) * 4 * 16 + l * 16]

    # The shader is generated once and cached in the module-global `xcode`.
    # NOTE(review): the original file was collapsed to one line, so the
    # exact extent of this `if` block is reconstructed here; generating the
    # instruction stream only on first call is the behavior implied by the
    # global cache — confirm against the original layout.
    global xcode
    if xcode == None:
        xcode = env.InstructionStream()
        cal.set_active_code(xcode)

        # Per-round shift amounts (splatted across the 4 components).
        S11 = xcode.acquire_register((7, 7, 7, 7))
        S12 = xcode.acquire_register((12, 12, 12, 12))
        S13 = xcode.acquire_register((17, 17, 17, 17))
        S14 = xcode.acquire_register((22, 22, 22, 22))
        S21 = xcode.acquire_register((5, 5, 5, 5))
        S22 = xcode.acquire_register((9, 9, 9, 9))
        S23 = xcode.acquire_register((14, 14, 14, 14))
        S24 = xcode.acquire_register((20, 20, 20, 20))
        S31 = xcode.acquire_register((4, 4, 4, 4))
        S32 = xcode.acquire_register((11, 11, 11, 11))
        S33 = xcode.acquire_register((16, 16, 16, 16))
        S34 = xcode.acquire_register((23, 23, 23, 23))
        S41 = xcode.acquire_register((6, 6, 6, 6))
        S42 = xcode.acquire_register((10, 10, 10, 10))
        S43 = xcode.acquire_register((15, 15, 15, 15))
        S44 = xcode.acquire_register((21, 21, 21, 21))

        a = xcode.acquire_register()
        b = xcode.acquire_register()
        c = xcode.acquire_register()
        d = xcode.acquire_register()
        x = [xcode.acquire_register() for i in range(16)]
        # Scratch register used by the FF/GG/HH/II round macros.
        r = xcode.acquire_register()

        # Inputs: 4 state images + 16 message-word images.
        cal.dcl_resource(0, cal.pixtex_type.twod, cal.fmt.uint, UNNORM=True)  # statea
        cal.dcl_resource(1, cal.pixtex_type.twod, cal.fmt.uint, UNNORM=True)  # stateb
        cal.dcl_resource(2, cal.pixtex_type.twod, cal.fmt.uint, UNNORM=True)  # statec
        cal.dcl_resource(3, cal.pixtex_type.twod, cal.fmt.uint, UNNORM=True)  # stated
        for i in range(16):
            cal.dcl_resource(i + 4, cal.pixtex_type.twod, cal.fmt.uint, UNNORM=True)
        cal.dcl_output(reg.o0, USAGE=cal.usage.generic)
        cal.dcl_output(reg.o1, USAGE=cal.usage.generic)
        cal.dcl_output(reg.o2, USAGE=cal.usage.generic)
        cal.dcl_output(reg.o3, USAGE=cal.usage.generic)

        cal.sample(0, 0, a, reg.v0.xy)
        cal.sample(1, 0, b, reg.v0.xy)
        cal.sample(2, 0, c, reg.v0.xy)
        cal.sample(3, 0, d, reg.v0.xy)
        for i in range(16):
            cal.sample(i + 4, 0, x[i], reg.v0.xy)

        # The 64 MD5 rounds; message order, shifts, and sine-table
        # constants follow RFC 1321.
        # Round 1
        FF(a, b, c, d, x[0], S11, 0xd76aa478)   # 1
        FF(d, a, b, c, x[1], S12, 0xe8c7b756)   # 2
        FF(c, d, a, b, x[2], S13, 0x242070db)   # 3
        FF(b, c, d, a, x[3], S14, 0xc1bdceee)   # 4
        FF(a, b, c, d, x[4], S11, 0xf57c0faf)   # 5
        FF(d, a, b, c, x[5], S12, 0x4787c62a)   # 6
        FF(c, d, a, b, x[6], S13, 0xa8304613)   # 7
        FF(b, c, d, a, x[7], S14, 0xfd469501)   # 8
        FF(a, b, c, d, x[8], S11, 0x698098d8)   # 9
        FF(d, a, b, c, x[9], S12, 0x8b44f7af)   # 10
        FF(c, d, a, b, x[10], S13, 0xffff5bb1)  # 11
        FF(b, c, d, a, x[11], S14, 0x895cd7be)  # 12
        FF(a, b, c, d, x[12], S11, 0x6b901122)  # 13
        FF(d, a, b, c, x[13], S12, 0xfd987193)  # 14
        FF(c, d, a, b, x[14], S13, 0xa679438e)  # 15
        FF(b, c, d, a, x[15], S14, 0x49b40821)  # 16

        # Round 2
        GG(a, b, c, d, x[1], S21, 0xf61e2562)   # 17
        GG(d, a, b, c, x[6], S22, 0xc040b340)   # 18
        GG(c, d, a, b, x[11], S23, 0x265e5a51)  # 19
        GG(b, c, d, a, x[0], S24, 0xe9b6c7aa)   # 20
        GG(a, b, c, d, x[5], S21, 0xd62f105d)   # 21
        GG(d, a, b, c, x[10], S22, 0x2441453)   # 22
        GG(c, d, a, b, x[15], S23, 0xd8a1e681)  # 23
        GG(b, c, d, a, x[4], S24, 0xe7d3fbc8)   # 24
        GG(a, b, c, d, x[9], S21, 0x21e1cde6)   # 25
        GG(d, a, b, c, x[14], S22, 0xc33707d6)  # 26
        GG(c, d, a, b, x[3], S23, 0xf4d50d87)   # 27
        GG(b, c, d, a, x[8], S24, 0x455a14ed)   # 28
        GG(a, b, c, d, x[13], S21, 0xa9e3e905)  # 29
        GG(d, a, b, c, x[2], S22, 0xfcefa3f8)   # 30
        GG(c, d, a, b, x[7], S23, 0x676f02d9)   # 31
        GG(b, c, d, a, x[12], S24, 0x8d2a4c8a)  # 32

        # Round 3
        HH(a, b, c, d, x[5], S31, 0xfffa3942)   # 33
        HH(d, a, b, c, x[8], S32, 0x8771f681)   # 34
        HH(c, d, a, b, x[11], S33, 0x6d9d6122)  # 35
        HH(b, c, d, a, x[14], S34, 0xfde5380c)  # 36
        HH(a, b, c, d, x[1], S31, 0xa4beea44)   # 37
        HH(d, a, b, c, x[4], S32, 0x4bdecfa9)   # 38
        HH(c, d, a, b, x[7], S33, 0xf6bb4b60)   # 39
        HH(b, c, d, a, x[10], S34, 0xbebfbc70)  # 40
        HH(a, b, c, d, x[13], S31, 0x289b7ec6)  # 41
        HH(d, a, b, c, x[0], S32, 0xeaa127fa)   # 42
        HH(c, d, a, b, x[3], S33, 0xd4ef3085)   # 43
        HH(b, c, d, a, x[6], S34, 0x4881d05)    # 44
        HH(a, b, c, d, x[9], S31, 0xd9d4d039)   # 45
        HH(d, a, b, c, x[12], S32, 0xe6db99e5)  # 46
        HH(c, d, a, b, x[15], S33, 0x1fa27cf8)  # 47
        HH(b, c, d, a, x[2], S34, 0xc4ac5665)   # 48

        # Round 4
        II(a, b, c, d, x[0], S41, 0xf4292244)   # 49
        II(d, a, b, c, x[7], S42, 0x432aff97)   # 50
        II(c, d, a, b, x[14], S43, 0xab9423a7)  # 51
        II(b, c, d, a, x[5], S44, 0xfc93a039)   # 52
        II(a, b, c, d, x[12], S41, 0x655b59c3)  # 53
        II(d, a, b, c, x[3], S42, 0x8f0ccc92)   # 54
        II(c, d, a, b, x[10], S43, 0xffeff47d)  # 55
        II(b, c, d, a, x[1], S44, 0x85845dd1)   # 56
        II(a, b, c, d, x[8], S41, 0x6fa87e4f)   # 57
        II(d, a, b, c, x[15], S42, 0xfe2ce6e0)  # 58
        II(c, d, a, b, x[6], S43, 0xa3014314)   # 59
        II(b, c, d, a, x[13], S44, 0x4e0811a1)  # 60
        II(a, b, c, d, x[4], S41, 0xf7537e82)   # 61
        II(d, a, b, c, x[11], S42, 0xbd3af235)  # 62
        II(c, d, a, b, x[2], S43, 0x2ad7d2bb)   # 63
        II(b, c, d, a, x[9], S44, 0xeb86d391)   # 64

        # Write the transformed state to the four output planes.
        cal.mov('o0', a)
        cal.mov('o1', b)
        cal.mov('o2', c)
        cal.mov('o3', d)

        xcode.release_register(a)
        xcode.release_register(b)
        xcode.release_register(c)
        xcode.release_register(d)
        for xi in x:
            xcode.release_register(xi)

    # (Re)bind this call's buffers to the cached shader.
    xcode.set_remote_binding('i0', input_statea)
    xcode.set_remote_binding('i1', input_stateb)
    xcode.set_remote_binding('i2', input_statec)
    xcode.set_remote_binding('i3', input_stated)
    for i in range(16):  #range(len(input_block)):
        xcode.set_remote_binding('i' + str(i + 4), input_block[i])
    xcode.set_remote_binding('o0', outputa)
    xcode.set_remote_binding('o1', outputb)
    xcode.set_remote_binding('o2', outputc)
    xcode.set_remote_binding('o3', outputd)

    domain = (0, 0, N, N)

    # Time only the GPU execution, accumulated in the module-global TIME.
    global TIME
    start_time = time.time()
    proc.execute(xcode, domain)
    end_time = time.time()
    TIME += (end_time - start_time)

    # MD5 chaining step: state += transform output, per stream.
    for j in range(N):
        for i in range(N):
            for k in range(4):
                parcontext.statea[k + (i + j * N) * 4] += outputa[address_4_2d(i, j) + k]
                parcontext.stateb[k + (i + j * N) * 4] += outputb[address_4_2d(i, j) + k]
                parcontext.statec[k + (i + j * N) * 4] += outputc[address_4_2d(i, j) + k]
                parcontext.stated[k + (i + j * N) * 4] += outputd[address_4_2d(i, j) + k]

    proc.free_remote(input_statea)
    proc.free_remote(input_stateb)
    proc.free_remote(input_statec)
    proc.free_remote(input_stated)
    for block in input_block:
        proc.free_remote(block)
    proc.free_remote(outputa)
    proc.free_remote(outputb)
    proc.free_remote(outputc)
    proc.free_remote(outputd)
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import array import corepy.lib.extarray as extarray import corepy.arch.ppc.isa as ppc import corepy.arch.vmx.isa as vmx import corepy.spre.spe as spe import corepy.arch.ppc.lib.util as util #from corepy.arch.ppc.types.ppc_types import make_user_type from corepy.spre.syn_util import most_specific _array_type = type(extarray.extarray('I', [1])) INT_ARRAY_SIZES = {'b': 16, 'h': 8, 'i': 4, 'B': 16, 'H': 8, 'I': 4} class VMXType(spe.Type): def __init__(self, *args, **kargs): super(VMXType, self).__init__(*args, **kargs) self.storage = None return def _get_active_code(self): return ppc.get_active_code() def _set_active_code(self, code): return ppc.set_active_code(code)
def TestMFC(): size = 32 #data_array = array.array('I', range(size)) #data = synspu.aligned_memory(size, typecode = 'I') #data.copy_to(data_array.buffer_info()[0], len(data_array)) data = extarray.extarray('I', range(size)) code = synspu.InstructionStream() r_zero = code.acquire_register() r_ea_data = code.acquire_register() r_ls_data = code.acquire_register() r_size = code.acquire_register() r_tag = code.acquire_register() # Load zero util.load_word(code, r_zero, 0) print 'array ea: %X' % (data.buffer_info()[0]) print 'r_zero = %s, ea_data = %s, ls_data = %s, r_size = %s, r_tag = %s' % ( str(r_zero), str(r_ea_data), str(r_ls_data), str(r_size), str(r_tag)) # Load the effective address print 'test ea: %X' % data.buffer_info()[0] util.load_word(code, r_ea_data, data.buffer_info()[0]) # Load the size code.add(spu.ai(r_size, r_zero, size * 4)) # Load the tag code.add(spu.ai(r_tag, r_zero, 2)) # Load the lsa code.add(spu.ai(r_ls_data, r_zero, 0)) # Load the data into address 0 mfc_get(code, r_ls_data, r_ea_data, r_size, r_tag) # Set the tag bit to 2 mfc_write_tag_mask(code, 1 << 2) # Wait for the transfer to complete mfc_read_tag_status_all(code) # Increment the data values by 1 using an unrolled loop (no branches) r_current = code.acquire_register() for lsa in range(0, size * 4, 16): code.add(spu.lqa(r_current, (lsa >> 2))) code.add(spu.ai(r_current, r_current, 1)) code.add(spu.stqa(r_current, (lsa >> 2))) code.release_register(r_current) # Store the values back to main memory # Load the data into address 0 mfc_put(code, r_ls_data, r_ea_data, r_size, r_tag) # Set the tag bit to 2 mfc_write_tag_mask(code, 1 << 2) # Wait for the transfer to complete mfc_read_tag_status_all(code) # Cleanup code.release_register(r_zero) code.release_register(r_ea_data) code.release_register(r_ls_data) code.release_register(r_size) code.release_register(r_tag) # Stop for debugging # code.add(spu.stop(0xA)) # Execute the code proc = synspu.Processor() # code.print_code() #print data_array 
proc.execute(code) #data.copy_from(data_array.buffer_info()[0], len(data_array)) for i in range(size): assert (data[i] == i + 1) return
def MemoryDescExample(data_size=20000):
    """
    This example uses a memory descriptor to move 20k integers back and
    forth between main memory and the SPU local store.  Each value is
    incremented by 1 while on the SPU.

    Memory descriptors are a general purpose method for describing a
    region of memory.  Memory is described by a typecode, address, and
    size.  Memory descriptors can be initialized by hand or from an
    array or buffer object.

    For main memory, memory descriptors are useful for transfering data
    between main memory and an SPU's local store.  The get/put methods
    on a memory descriptor generate the SPU code to move data of any
    size between main memory and local store.

    Memory descriptors can also be used with spu_vec_iters to describe
    the region of memory to iterate over.  The typecode in the memory
    descriptor is used to determine the type for the loop induction
    variable.

    Note that there is currently no difference between memory
    descriptors for main memory and local store.  It's up to the user to
    make sure the memory descriptor settings make sense in the current
    context.  (this will probably change in the near future)

    Note: get/put currently use loops rather than display lists for
          transferring data over 16k.
    """
    code = InstructionStream()
    proc = Processor()
    code.debug = True
    spu.set_active_code(code)

    # Create a python array
    data = extarray.extarray('I', range(data_size))

    # Align the data in the array
    #a_data = aligned_memory(data_size, typecode = 'I')
    #a_data.copy_to(data.buffer_info()[0], data_size)

    # Create memory descriptor for the data in main memory
    data_desc = memory_desc('I')
    #data_desc.from_array(a_data)
    data_desc.from_array(data)

    # Transfer the data to 0x0 in the local store
    data_desc.get(code, 0)

    # Create memory descriptor for the data in the local store for use
    # in the iterator
    lsa_data = memory_desc('i', 0, data_size)

    # Add one to each value
    for x in spu_vec_iter(code, lsa_data):
        x.v = x + 1

    # Transfer the data back to main memory
    data_desc.put(code, 0)

    # Signal completion through the outbound mailbox.
    dma.spu_write_out_mbox(code, 0xCAFE)

    # Execute the synthetic program
    # code.print_code()
    spe_id = proc.execute(code, async=True)
    proc.join(spe_id)

    # Copy it back to the Python array
    #a_data.copy_from(data.buffer_info()[0], data_size)

    for i in xrange(data_size):
        assert (data[i] == i + 1)
    return
def Test():
    """Exercise x86_64 InstructionStream codegen: labels (forward and
    backward), relative-branch re-rendering across size changes, calls,
    conditional moves, x87, MMX/SSE moves, LOCK prefix, immediate size
    encodings, and the loop instruction.  Each section builds a small
    program, executes it, and asserts the return value.
    """
    code = env.InstructionStream()
    proc = env.Processor()
    params = env.ExecParams()
    params.p1 = 3

    # Stack-parameter memory references (param area at rbp+16).
    mr32 = MemRef(rbp, 16, data_size=32)
    mr8 = MemRef(rbp, 16, data_size=8)

    # Forward/backward label branches; the ud2's are skipped by the jne.
    lbl1 = code.get_label("lbl1")
    lbl2 = code.get_label("lbl2")
    code.add(x86.xor(rax, rax))
    code.add(x86.cmp(rax, 1))
    code.add(x86.jne(lbl1))
    code.add(x86.ud2())
    code.add(x86.ud2())
    code.add(lbl1)
    code.add(x86.cmp(rax, 1))
    code.add(x86.je(lbl2))
    code.add(x86.add(rax, 12))
    code.add(lbl2)

    # printer.PrintInstructionStream(code, printer.x86_64_Nasm(function_name="foobar"))
    ret = proc.execute(code)
    print "ret", ret
    assert (ret == 12)
    print "W00T"

    # Same program with raw numeric branch offsets instead of labels.
    code.reset()
    code.add(x86.xor(rax, rax))
    code.add(x86.cmp(rax, 1))
    code.add(x86.jne(28))
    code.add(x86.ud2())
    code.add(x86.ud2())
    code.add(x86.cmp(eax, 1))
    code.add(x86.je(37))
    code.add(x86.add(rax, 12))
    code.print_code(hex=True, pro=True, epi=True)
    print "a"
    ret = proc.execute(code)
    print "b"
    print "ret", ret
    assert (ret == 12)
    print "w00t 2"

    # call/ret to a label past a jmp to the epilogue; the movs after the
    # jmp are dead code, so the called block's mov(rax, 15) wins.
    code.reset()
    call_lbl = code.get_label("call_fn")
    code.add(x86.xor(rax, rax))
    code.add(x86.call(call_lbl))
    code.add(x86.jmp(code.lbl_epilogue))
    code.add(x86.mov(rax, 75))
    code.add(x86.mov(rax, 42))
    code.add(call_lbl)
    code.add(x86.mov(rax, 15))
    code.add(x86.ret())
    code.print_code()
    ret = proc.execute(code)
    print "ret", ret
    assert (ret == 15)

    # Forward jump over 65 pops -- large enough that the branch needs a
    # 32-bit displacement encoding.
    code.reset()
    fwd_lbl = code.get_label("FORWARD")
    bck_lbl = code.get_label("BACKWARD")
    code.add(x86.xor(rax, rax))
    code.add(bck_lbl)
    code.add(x86.cmp(rax, 1))
    code.add(x86.jne(fwd_lbl))
    for i in xrange(0, 65):
        code.add(x86.pop(r15))
    code.add(fwd_lbl)
    ret = proc.execute(code, mode='int')
    assert (ret == 0)

    # Loop that counts rax up by 4, exiting via je at 20; the padding
    # adds force long branch displacements.
    code.reset()
    loop_lbl = code.get_label("LOOP")
    out_lbl = code.get_label("OUT")
    skip_lbl = code.get_label("SKIP")
    code.add(x86.xor(rax, rax))
    code.add(loop_lbl)
    for i in range(0, 1):
        for i in xrange(0, 24):
            code.add(x86.add(r15, MemRef(rsp, 4)))
        code.add(x86.add(rax, 4))
        code.add(x86.cmp(rax, 20))
        code.add(x86.je(out_lbl))
        for i in xrange(0, 24):
            code.add(x86.add(r15, MemRef(rsp, 4)))
    code.add(x86.cmp(rax, 32))
    code.add(x86.jne(loop_lbl))
    code.add(out_lbl)
    code.add(x86.jmp(skip_lbl))
    for i in xrange(0, 2):
        code.add(x86.add(r15, MemRef(rsp, 4)))
    code.add(skip_lbl)
    ret = proc.execute(code, mode='int')
    print "ret", ret
    assert (ret == 20)

    # Odd/even accumulator loop: sum 1..15 plus one extra per even
    # partial sum; expected total 135.
    code.reset()
    loop_lbl = code.get_label("LOOP")
    else_lbl = code.get_label("ELSE")
    finish_lbl = code.get_label("finish")
    code.add(x86.mov(rax, 0))
    code.add(x86.mov(rdx, 0))
    code.add(loop_lbl)
    code.add(x86.add(rax, 1))
    code.add(x86.cmp(rax, 16))
    code.add(x86.jge(finish_lbl))
    code.add(x86.add(rdx, rax))
    code.add(x86.mov(r8, rdx))
    code.add(x86.and_(r8, 0x1))
    code.add(x86.jnz(else_lbl))
    code.add(x86.add(rdx, 1))
    code.add(x86.jmp(loop_lbl))
    code.add(else_lbl)
    code.add(x86.add(rdx, r8))
    code.add(x86.jmp(loop_lbl))
    code.add(finish_lbl)
    code.add(x86.mov(rax, rdx))
    ret = proc.execute(code, mode='int')
    print "ret", ret
    assert (ret == 135)

    # cmove + jrcxz: loop until rax hits 7, at which point rcx becomes
    # nonzero and jrcxz falls through.
    code.reset()
    loop_lbl = code.get_label("LOOP")
    code.add(x86.xor(rax, rax))
    code.add(x86.xor(rcx, rcx))
    code.add(x86.mov(rdx, 1))
    code.add(loop_lbl)
    code.add(x86.inc(rax))
    code.add(x86.cmp(rax, 7))
    code.add(x86.cmove(rcx, rdx))
    code.add(x86.jrcxz(loop_lbl))
    code.print_code(hex=True)
    ret = proc.execute(code, mode='int')
    print "ret", ret
    assert (ret == 7)

    # shld with a cl shift count on the p1=3 parameter: (3 << 1) | 1 == 7.
    code.reset()
    code.add(x86.mov(rax, MemRef(rbp, 16)))
    code.add(x86.xor(rbx, rbx))
    code.add(x86.mov(rbx, -1))
    code.add(x86.mov(cl, 1))
    code.add(x86.shld(rax, rbx, cl))
    code.print_code(hex=True)
    ret = proc.execute(code, params=params, mode='int')
    print "ret", ret
    assert (ret == 7)

    # (A long block of commented-out 8/16/32-bit arithmetic, rcl/rcr and
    # async-thread experiments from the original file was removed here
    # for clarity; it was dead code.)

    # x87 stack: compute via fldpi/fsin/fcos/fyl2xp1, then return the
    # result through rax.  x86_64 now uses xmm0 to return floats, not
    # st0.  So here, just make room on the stack, convert the FP result
    # to an int and store it on the stack, then pop it into rax, the int
    # return register.
    code.reset()
    code.add(x86.fldpi())
    code.add(x86.pxor(xmm0, xmm0))
    code.add(x86.fld1())
    code.add(x86.fadd(st0, st0))
    code.add(x86.fmulp())
    code.add(x86.fsin())
    code.add(x86.fcos())
    code.add(x86.fld1())
    code.add(x86.fyl2xp1())
    code.add(x86.push(rax))
    code.add(x86.fistp(MemRef(rsp)))
    code.add(x86.pop(rax))
    code.print_code(hex=True)
    ret = proc.execute(code, params=params, mode='int')
    assert (ret == 1)
    print "Return main thread: %d" % (ret)

    # MMX/SSE data movement: shuffle the p1=3 parameter through xmm and
    # mm registers; 3 + 3 = 6 comes back in eax.
    code.reset()
    lbl_ok = code.get_label("OK")
    code.add(x86.emms())
    code.add(x86.movd(xmm0, mr32))
    code.add(x86.mov(ebx, mr32))
    code.add(x86.cmp(ebx, 3))
    code.add(x86.je(lbl_ok))
    code.add(x86.movd(eax, xmm0))
    code.add(x86.cmp(eax, 3))
    code.add(x86.je(lbl_ok))
    code.add(x86.ud2())
    code.add(lbl_ok)
    code.add(x86.xor(eax, eax))
    code.add(x86.movd(xmm1, ebx))
    code.add(x86.paddq(xmm0, xmm1))
    code.add(x86.pextrw(ecx, xmm0, 0))
    code.add(x86.pinsrw(mm1, ecx, 0))
    code.add(x86.movq2dq(xmm0, mm1))
    code.add(x86.movdq2q(mm2, xmm0))
    code.add(x86.movd(edx, mm2))
    code.add(x86.movd(xmm5, edx))
    code.add(x86.movd(ecx, xmm5))
    code.add(x86.pinsrw(xmm6, ecx, 0))
    code.add(x86.movd(eax, xmm6))
    code.print_code(hex=True)
    ret = proc.execute(code, params=params, mode='int')
    print "Return main thread: %d" % (ret)
    assert (ret == 6)

    code.reset()
    # Test immediate size encodings
    code.add(x86.add(eax, 300))
    code.add(x86.add(ax, 300))
    code.add(x86.add(ax, 30))
    code.add(x86.mov(eax, 16))
    code.add(x86.mov(eax, 300))

    code.reset()
    code.add(x86.add(eax, 0xDEADBEEF))
    code.add(x86.add(ebx, 0xDEADBEEF))
    code.print_code(hex=True)

    # Try the LOCK prefix
    code.reset()
    code.add(x86.xor(eax, eax))
    code.add(x86.add(mr32, eax))
    code.add(x86.add(mr32, eax, lock=True))
    #code.print_code(hex = True)
    proc.execute(code, params=params)

    # xchg swaps edx/eax; return value comes from eax.
    code.reset()
    code.add(x86.mov(edx, 0x1234))
    code.add(x86.mov(eax, 0xFFFF))
    code.add(x86.xchg(edx, eax))
    code.print_code(hex=True)
    ret = proc.execute(code, params=params)
    print "ret:", ret
    assert (ret == 0x1234)

    # pushfq moves rsp by 8; measure the delta and restore the stack.
    code.reset()
    code.add(x86.mov(rax, rsp))
    code.add(x86.pushfq())
    code.add(x86.sub(rax, rsp))
    code.add(x86.add(rsp, rax))
    code.print_code(hex=True)
    ret = proc.execute(code, params=params)
    print "ret:", ret
    assert (ret == 8)

    # movaps + pextrw: reassemble the first four 16-bit elements of the
    # array into one 64-bit value.
    code.reset()
    data = extarray.extarray('H', xrange(0, 16))

    # code.add(x86.push(rdi))
    code.add(x86.mov(rdi, data.buffer_info()[0]))
    code.add(x86.movaps(xmm1, MemRef(rdi, data_size=128)))
    code.add(x86.pextrw(rax, xmm1, 0))
    code.add(x86.pextrw(rbx, xmm1, 1))
    code.add(x86.pextrw(rcx, xmm1, 2))
    code.add(x86.pextrw(rdx, xmm1, 3))
    code.add(x86.shl(rbx, 16))
    code.add(x86.shl(rcx, 32))
    code.add(x86.shl(rdx, 48))
    code.add(x86.or_(rax, rbx))
    code.add(x86.or_(rax, rcx))
    code.add(x86.or_(rax, rdx))
    # code.add(x86.pop(rdi))
    code.print_code(hex=True)
    ret = proc.execute(code, mode='int')
    print "ret %x" % ret
    assert (ret == 0x0003000200010000)

    # loop instruction: decrements rcx and branches while nonzero.
    code.reset()
    L1 = code.get_label("L1")
    code.add(x86.xor(rax, rax))
    code.add(x86.mov(rcx, 3))
    code.add(L1)
    code.add(x86.add(rax, 1))
    code.add(x86.loop(L1))
    code.print_code(hex=True)
    ret = proc.execute(code, mode='int')
    print "ret %x" % ret
    assert (ret == 0x03)
    return
def TestSPUIter():
    """Test syn_iter-driven SPU code: DMA data in, double each value in
    local store using a synthetic loop, DMA it back, and assert
    data[i] == 2*i.
    """
    size = 32
    data = extarray.extarray('I', range(size))
    code = env.InstructionStream()

    r_zero = code.acquire_register()
    r_ea_data = code.acquire_register()
    r_ls_data = code.acquire_register()
    r_size = code.acquire_register()
    r_tag = code.acquire_register()

    # Load zero
    util.load_word(code, r_zero, 0)

    #print 'array ea: %X' % (data.buffer_info()[0])
    #print 'r_zero = %s, ea_data = %s, ls_data = %s, r_size = %s, r_tag = %s' % (
    #  str(r_zero), str(r_ea_data), str(r_ls_data), str(r_size), str(r_tag))

    # Load the effective address
    util.load_word(code, r_ea_data, data.buffer_info()[0])

    # Load the size (in bytes)
    util.load_word(code, r_size, size * 4)

    # Load the tag
    code.add(spu.ai(r_tag, r_zero, 12))

    # Load the lsa (local store address 0)
    code.add(spu.ai(r_ls_data, r_zero, 0))

    # Load the data into address 0
    dma.mfc_get(code, r_ls_data, r_ea_data, r_size, r_tag)

    # Set the tag bit to 12
    dma.mfc_write_tag_mask(code, 1 << 12)

    # Wait for the transfer to complete
    dma.mfc_read_tag_status_all(code)

    # Double each value (current + current) one quadword at a time,
    # using a syn_iter loop over local store instead of manual unrolling.
    # r_current = code.acquire_register()
    current = var.SignedWord(0, code)

    # Use an SPU iter
    for lsa in syn_iter(code, size * 4, 16):
        code.add(spu.lqx(current, r_zero, lsa))
        # code.add(spu.ai(1, r_current, r_current))
        current.v = current + current
        code.add(spu.stqx(current, r_zero, lsa))

    # code.release_register(r_current)
    #current.release_register(code)

    # Store the values back to main memory

    # Load the tag
    code.add(spu.ai(r_tag, r_zero, 13))

    # Load the data into address 0
    dma.mfc_put(code, r_ls_data, r_ea_data, r_size, r_tag)

    # Set the tag bit to 13
    dma.mfc_write_tag_mask(code, 1 << 13)

    # Wait for the transfer to complete
    dma.mfc_read_tag_status_all(code)

    # Cleanup
    code.release_register(r_zero)
    code.release_register(r_ea_data)
    code.release_register(r_ls_data)
    code.release_register(r_size)
    code.release_register(r_tag)

    # Stop for debugging
    # code.add(spu.stop(0xA))

    # Execute the code
    proc = env.Processor()
    r = proc.execute(code)

    for i in range(0, size):
        assert (data[i] == i + i)
    return
def alloc_host(self, typecode, length, comps = 1):
    """Allocate page-locked host memory and wrap it in an extarray.

    typecode is a ptx typecode string: a kind character ('f' float,
    'u' unsigned int, 's' signed int) followed by the scalar bit
    width (8, 16, 32 or 64), e.g. 'u32', 'f64'.
    length is the element count; comps is components per element.

    Returns an extarray viewing the allocated memory.
    Raises Exception if the typecode cannot be mapped.
    """
    # Map (kind, bit width) -> array/extarray typecode.
    # BUG FIX: the original if/elif ladder had the two 8-bit codes
    # swapped -- unsigned 'u8' mapped to signed 'b' and signed 's8'
    # mapped to unsigned 'B'.
    type_conversion_table = {
        ('f', 32): 'f', ('f', 64): 'd',
        ('u', 8):  'B', ('u', 16): 'H', ('u', 32): 'I', ('u', 64): 'L',
        ('s', 8):  'b', ('s', 16): 'h', ('s', 32): 'i', ('s', 64): 'l',
    }

    scalar_width = int(typecode[1:])
    array_typecode = type_conversion_table.get((typecode[0], scalar_width), '')

    if array_typecode == '':
        raise Exception('Unable to convert type')

    # Bytes per scalar, derived from the bit width.  (The original body
    # referenced an undefined name `scalar_byte_width`; define it here.)
    scalar_byte_width = scalar_width // 8
    mem = ptx_exec.alloc_host(length * scalar_byte_width * comps)

    arr = extarray.extarray(array_typecode, 0)
    # NOTE(review): data_len is preserved exactly as originally computed
    # (scalar_width/4 * length * comps); it looks suspicious for widths
    # other than 32 bits -- confirm against extarray.set_memory semantics.
    arr.data_len = scalar_width / 4 * length * comps
    arr.set_memory(mem, arr.data_len * 4)
    arr.gpu_mem_handle = mem
    # arr.gpu_device = self.device
    arr.gpu_width = length
    # arr.gpu_pitch = mem[2]
    # arr.gpu_height = height
    return arr
def TestParams(): import time #SIZE = 1024 kernel = ( ''' .version 1.4 .target sm_10, map_f64_to_f32 .entry _main ( .param .u64 __cudaparm__Z16addArrayOnDevicePfff_c, .param .f32 __cudaparm__Z16addArrayOnDevicePfff_a, .param .f32 __cudaparm__Z16addArrayOnDevicePfff_b) { .reg .u64 %rd<3>; .reg .f32 %f<6>; ld.param.f32 %f1, [__cudaparm__Z16addArrayOnDevicePfff_a]; ld.param.f32 %f2, [__cudaparm__Z16addArrayOnDevicePfff_b]; add.f32 %f3, %f1, %f2; mov.f32 %f4, %f3; ld.param.u64 %rd1, [__cudaparm__Z16addArrayOnDevicePfff_c]; st.global.f32 [%rd1+0], %f4; exit; } // _Z16addArrayOnDevicePfff ''' ) t1 = time.time() module = ptx_exec.compile(kernel) t2 = time.time() print "compile time", t2 - t1 a = 1.0 b = 2.0 ptx_mem_addr = ptx_exec.alloc_device(4) mem = extarray.extarray('f', 1) #mem.set_memory(ptx_mem_addr, 4) mem[0] = 5.0 print ptx_mem_addr, type(ptx_mem_addr) print mem.buffer_info()[0], type(mem.buffer_info()[0]) param_list = [ptx_mem_addr, a, b] # image, dev num, (x, y, w, h) ptx_exec.copy_htod(ptx_mem_addr, mem.buffer_info()[0], 4) t1 = time.time() ptx_exec.run_stream(module, (1, 1, 1, 1, 1), (ptx_exec.u64, ptx_exec.f32, ptx_exec.f32), param_list) t2 = time.time() print "run time", t2 - t1 print "X", mem.buffer_info()[0], ptx_mem_addr ptx_exec.copy_dtoh(mem.buffer_info()[0], ptx_mem_addr, 4) print param_list print mem #ptx_exec.free(input) #ptx_exec.free(output) ##ptx_exec.free(glob) #ptx_exec.unload_module(image) return
def cache_code(self):
    """ Fill in the epilogue and prologue.  This call freezes the code and
        any subsequent calls to acquire_register() or add() will unfreeze
        it.  Also perform alignment checks.  Once the checks are
        preformed, the code should not be modified.

        Renders prologue + instructions + epilogue into self.render_code,
        resolving label references:
        - 'I' streams (fixed 4-byte instructions): one pass, with forward
          references patched afterward.
        - 'B' streams (variable-length byte instructions): one pass plus
          an adjustment pass (_adjust_pass) that re-renders instructions
          whose relative offsets change as sizes settle.
    """
    if self._cached == True:
        return

    # HACK: Disable the current active code
    # NOTE: This may not work in the presence of multiple ISAs...
    active_callback = None
    if self._active_callback is not None:
        active_callback = self._active_callback
        active_callback(None)

    self._synthesize_prologue()
    self._synthesize_epilogue()

    render_code = extarray.extarray(self.instruction_type)

    # Note - TRAC ticket #19 has some background info and reference links on
    # the algorithms used here.  https://svn.osl.iu.edu/trac/corepy/ticket/19
    if self.instruction_type == 'I':
        # Fixed-width path: forward label references get a placeholder
        # word now and are re-rendered once all label positions are known.
        fwd_ref_list = []

        # Assumed below that 'I' type is 4 bytes
        for arr in (self._prologue, self._instructions, self._epilogue):
            for val in arr:
                if isinstance(val, (Instruction, ExtendedInstruction)):
                    # Does this instruction reference any labels?
                    lbl = None
                    for k in val._operands.keys():
                        if isinstance(val._operands[k], Label):
                            lbl = val._operands[k]
                            break

                    if lbl == None:
                        # No label reference, render the inst
                        render_code.append(val.render())
                    else:
                        # Label reference
                        assert (lbl.code == self)
                        val.set_position(len(render_code) * 4)

                        if lbl.position != None:
                            # Back reference, render the inst
                            render_code.append(val.render())
                        else:
                            # Fill in a dummy instruction and save info to
                            # render later
                            fwd_ref_list.append((val, len(render_code)))
                            render_code.append(0xFFFFFFFF)
                elif isinstance(val, Label):
                    # Label, fill in a zero-length slot
                    val.set_position(len(render_code) * 4)

        # Render the instructions with forward label references
        for rec in fwd_ref_list:
            render_code[rec[1]] = rec[0].render()
    elif self.instruction_type == 'B':
        # inst_list is a list of tuples.  Each tuple contains a bool
        # indicating presence of a label reference, rendered code ([] if
        # label), and a label or instruction object.
        inst_list = []
        inst_len = 0

        for arr in (self._prologue, self._instructions, self._epilogue):
            for val in arr:
                if isinstance(val, (Instruction, ExtendedInstruction)):
                    # Does this instruction reference any labels?
                    lbl = None
                    relref = False

                    sig = val.machine_inst.signature
                    for iop in xrange(0, len(sig)):
                        opsig = sig[iop]
                        if hasattr(opsig, "relative_op") and opsig.relative_op == True:
                            op = val._operands[iop]
                            if isinstance(op, Label):
                                lbl = op
                            # This is a hack, but it works.  Some
                            # instructions can have a relative offset that
                            # is not a label.  These insts need to be
                            # re-rendered if instruction sizes change.
                            relref = True

                    if lbl == None:
                        # No label references
                        val.set_position(inst_len)
                        r = val.render()
                        inst_list.append([relref, r, val])
                        inst_len += len(r)
                    else:
                        # Instruction referencing a label.
                        assert (lbl.code == self)
                        val.set_position(inst_len)

                        if lbl.position != None:
                            # Back-reference, render the instruction
                            r = val.render()
                            inst_list.append([True, r, val])
                            inst_len += len(r)
                        else:
                            # Fill in a dummy instruction, assuming 2-byte
                            # best case
                            inst_list.append([True, [-1, -1], val])
                            inst_len += 2
                elif isinstance(val, Label):
                    # Label, fill in a zero-length slot
                    val.set_position(inst_len)
                    inst_list.append([False, [], val])

        # Re-render until all sizes/positions are stable.
        inst_list = self._adjust_pass(inst_list)

        # Final loop, bring everything together into render_code
        for rec in inst_list:
            if isinstance(rec[2], (Instruction, ExtendedInstruction)):
                render_code.fromlist(rec[1])

    self.render_code = render_code
    self.make_executable()

    # Restore the active code disabled above.
    if active_callback is not None:
        active_callback(self)

    self._cached = True
    return
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import corepy.lib.extarray as extarray import corepy.arch.spu.isa as spu import corepy.arch.spu.platform as env import corepy.arch.spu.lib.dma as dma from corepy.arch.spu.lib.util import load_word if __name__ == '__main__': a = extarray.extarray('i', range(0, 32)) b = extarray.extarray('i', [0 for i in range(0, 32)]) code = env.InstructionStream() proc = env.Processor() spu.set_active_code(code) abi = a.buffer_info() print "abi", abi, a.itemsize dma.mem_get(code, 0x1000, abi[0], abi[1] * a.itemsize, 2) dma.mem_complete(code, 2) bbi = b.buffer_info() print "bbi", bbi, b.itemsize dma.mem_put(code, 0x1000, bbi[0], bbi[1] * b.itemsize, 2) dma.mem_complete(code, 2)
def Test():
    """Exercise x86_64 code generation end to end.

    Builds and executes a series of small programs covering: label-based and
    numeric relative branches, call/ret through a label, forward/backward
    branch distances that force wide (non 8-bit) offsets, conditional-move
    loops, partial-register (8/16/32-bit) arithmetic, async execution,
    x87 float ops, MMX/SSE moves, xchg, stack-pointer math, 128-bit loads,
    and float loading.  Each section asserts the exact return value.
    """
    prgm = env.Program()
    code = prgm.get_stream()
    proc = env.Processor()
    params = env.ExecParams()
    params.p1 = 3  # first integer parameter (lands in rdi); used by the shld test

    # --- Forward/backward branches to labels; ud2 would fault if taken wrong ---
    lbl1 = prgm.get_label("lbl1")
    lbl2 = prgm.get_label("lbl2")
    code.add(x86.xor(prgm.gp_return, prgm.gp_return))
    code.add(x86.cmp(prgm.gp_return, 1))
    code.add(x86.jne(lbl1))
    code.add(x86.ud2())
    code.add(x86.ud2())
    code.add(lbl1)
    code.add(x86.cmp(prgm.gp_return, 1))
    code.add(x86.je(lbl2))
    code.add(x86.add(prgm.gp_return, 12))
    code.add(lbl2)
    prgm.add(code)
    #prgm.print_code(pro = True, epi = True, hex = True)
    ret = proc.execute(prgm, mode='int')
    print "ret", ret
    assert (ret == 12)

    # --- Same program with raw numeric branch offsets instead of labels ---
    prgm.reset()
    code.reset()
    code.add(x86.xor(prgm.gp_return, prgm.gp_return))
    code.add(x86.cmp(prgm.gp_return, 1))
    code.add(x86.jne(28))
    code.add(x86.ud2())
    code.add(x86.ud2())
    code.add(x86.cmp(prgm.gp_return, 1))
    code.add(x86.je(37))
    code.add(x86.add(prgm.gp_return, 12))
    prgm.add(code)
    prgm.print_code(hex=True, pro=True, epi=True)
    ret = proc.execute(prgm)
    print "ret", ret
    assert (ret == 12)

    # --- call through a label; jmp to epilogue skips the dead movs ---
    prgm.reset()
    code.reset()
    call_lbl = prgm.get_label("call_fn")
    code.add(x86.xor(prgm.gp_return, prgm.gp_return))
    code.add(x86.call(call_lbl))
    code.add(x86.jmp(prgm.lbl_epilogue))
    code.add(x86.mov(prgm.gp_return, 75))  # skipped
    code.add(x86.mov(prgm.gp_return, 42))  # skipped
    code.add(call_lbl)
    code.add(x86.mov(prgm.gp_return, 15))
    code.add(x86.ret())
    prgm.add(code)
    prgm.print_code()
    ret = proc.execute(prgm, mode='int')
    print "ret", ret
    assert (ret == 15)

    # --- Forward branch over 65 pops: distance exceeds an 8-bit offset,
    #     forcing the assembler to re-render with a wider branch ---
    prgm.reset()
    code.reset()
    fwd_lbl = prgm.get_label("FORWARD")
    bck_lbl = prgm.get_label("BACKWARD")
    code.add(x86.xor(prgm.gp_return, prgm.gp_return))
    code.add(bck_lbl)
    code.add(x86.cmp(prgm.gp_return, 1))
    code.add(x86.jne(fwd_lbl))
    r_foo = prgm.acquire_register()
    for i in xrange(0, 65):
        code.add(x86.pop(r_foo))  # never executed; only pads the distance
    prgm.release_register(r_foo)
    code.add(fwd_lbl)
    prgm.add(code)
    ret = proc.execute(prgm, mode='int')
    print "ret", ret
    assert (ret == 0)

    # --- Loop with padded bodies: counts by 4 up to 20, then jumps out and
    #     over a skipped block ---
    prgm.reset()
    code.reset()
    loop_lbl = prgm.get_label("LOOP")
    out_lbl = prgm.get_label("OUT")
    skip_lbl = prgm.get_label("SKIP")
    code.add(x86.xor(prgm.gp_return, prgm.gp_return))
    code.add(loop_lbl)
    r_foo = prgm.acquire_register()
    for i in range(0, 1):
        for i in xrange(0, 24):
            code.add(x86.add(r_foo, MemRef(rsp, 4)))  # padding instructions
        code.add(x86.add(prgm.gp_return, 4))
        code.add(x86.cmp(prgm.gp_return, 20))
        code.add(x86.je(out_lbl))
        for i in xrange(0, 24):
            code.add(x86.add(r_foo, MemRef(rsp, 4)))  # more padding
        code.add(x86.cmp(prgm.gp_return, 32))
        code.add(x86.jne(loop_lbl))
    code.add(out_lbl)
    code.add(x86.jmp(skip_lbl))
    for i in xrange(0, 2):
        code.add(x86.add(r_foo, MemRef(rsp, 4)))  # jumped over
    code.add(skip_lbl)
    prgm.release_register(r_foo)
    prgm.add(code)
    ret = proc.execute(prgm, mode='int')
    print "ret", ret
    assert (ret == 20)

    # --- Loop with an if/else body (jnz to else-branch); sums into r_tmp[0] ---
    prgm.reset()
    code.reset()
    r_tmp = prgm.acquire_registers(2)
    loop_lbl = prgm.get_label("LOOP")
    else_lbl = prgm.get_label("ELSE")
    finish_lbl = prgm.get_label("finish")
    code.add(x86.mov(prgm.gp_return, 0))
    code.add(x86.mov(r_tmp[0], 0))
    code.add(loop_lbl)
    code.add(x86.add(prgm.gp_return, 1))
    code.add(x86.cmp(prgm.gp_return, 16))
    code.add(x86.jge(finish_lbl))
    code.add(x86.add(r_tmp[0], prgm.gp_return))
    code.add(x86.mov(r_tmp[1], r_tmp[0]))
    code.add(x86.and_(r_tmp[1], 0x1))  # parity of the running sum
    code.add(x86.jnz(else_lbl))
    code.add(x86.add(r_tmp[0], 1))
    code.add(x86.jmp(loop_lbl))
    code.add(else_lbl)
    code.add(x86.add(r_tmp[0], r_tmp[1]))
    code.add(x86.jmp(loop_lbl))
    code.add(finish_lbl)
    code.add(x86.mov(prgm.gp_return, r_tmp[0]))
    prgm.release_registers(r_tmp)
    prgm.add(code)
    ret = proc.execute(prgm, mode='int')
    print "ret", ret
    assert (ret == 135)

    # --- cmove + jrcxz: loop until gp_return hits 7, then rcx becomes
    #     nonzero and jrcxz falls through ---
    prgm.reset()
    code.reset()
    loop_lbl = prgm.get_label("LOOP")
    r_one = prgm.acquire_register()
    code.add(x86.xor(prgm.gp_return, prgm.gp_return))
    code.add(x86.xor(rcx, rcx))
    code.add(x86.mov(r_one, 1))
    code.add(loop_lbl)
    code.add(x86.inc(prgm.gp_return))
    code.add(x86.cmp(prgm.gp_return, 7))
    code.add(x86.cmove(rcx, r_one))
    code.add(x86.jrcxz(loop_lbl))
    prgm.release_register(r_one)
    prgm.add(code)
    prgm.print_code(hex=True)
    ret = proc.execute(prgm, mode='int')
    print "ret", ret
    assert (ret == 7)

    # --- shld: shift params.p1 (3, in rdi) left one bit, filling from an
    #     all-ones register -> 7 ---
    prgm.reset()
    code.reset()
    r_tmp = prgm.acquire_register()
    code.add(x86.mov(prgm.gp_return, rdi))
    code.add(x86.xor(r_tmp, r_tmp))
    code.add(x86.mov(r_tmp, -1))
    code.add(x86.mov(cl, 1))
    code.add(x86.shld(prgm.gp_return, r_tmp, cl))
    prgm.release_register(r_tmp)
    prgm.add(code)
    ret = proc.execute(prgm, params=params, mode='int')
    print "ret", ret
    assert (ret == 7)

    # --- Partial-register (8/16/32-bit) arithmetic; also run twice async
    #     and once synchronously, all three must return 1280 ---
    prgm.reset()
    code.reset()
    code.add(x86.add(eax, 200))
    code.add(x86.xor(eax, eax))
    code.add(x86.add(al, 32))
    code.add(x86.add(bl, 32))
    code.add(x86.xor(bl, bl))
    code.add(x86.mov(dil, al))
    code.add(x86.add(rdi, 0))
    code.add(x86.mov(eax, edi))
    code.add(x86.mov(al, dil))
    code.add(x86.imul(ax, ax, 4))
    code.add(x86.imul(eax, ebx, 10))
    code.add(x86.mov(cx, 1232))
    code.add(x86.sub(ax, cx))
    code.add(x86.xor(eax, eax))
    code.add(x86.mov(eax, ebx))
    code.add(x86.clc())
    code.add(x86.rcl(eax, 1))
    code.add(x86.rcr(eax, 1))
    prgm.add(code)
    #ret = proc.execute(prgm, debug = True, params = params)
    id1 = proc.execute(prgm, params=params, mode='int', async=True)
    id2 = proc.execute(prgm, params=params, mode='int', async=True)
    ret = proc.execute(prgm, params=params, mode='int')
    print "Return main thread: %d" % (ret)
    assert (ret == 1280)
    ret = proc.join(id1)
    print "Return thread 1: %d" % (ret)
    assert (ret == 1280)
    ret = proc.join(id2)
    print "Return thread 2: %d" % (ret)
    assert (ret == 1280)

    # --- x87 stack ops: pi, 2*pi, sin, cos, y*log2(x+1) -> 1 ---
    prgm.reset()
    code.reset()
    code.add(x86.fldpi())
    code.add(x86.pxor(xmm0, xmm0))
    code.add(x86.fld1())
    code.add(x86.fadd(st0, st0))
    code.add(x86.fmulp())
    code.add(x86.fsin())
    code.add(x86.fcos())
    code.add(x86.fld1())
    code.add(x86.fyl2xp1())
    # x86_64 now uses xmm0 to return floats, not st0.  So here, just make room
    # on the stack, convert the FP result to an int and store it on the stack,
    # then pop it into rax, the int return register.
    code.add(x86.push(prgm.gp_return))
    code.add(x86.fistp(MemRef(rsp)))
    code.add(x86.pop(prgm.gp_return))
    prgm.add(code)
    prgm.print_code(hex=True)
    ret = proc.execute(prgm, params=params, mode='int')
    assert (ret == 1)
    print "Return main thread: %d" % (ret)

    # --- MMX/SSE data movement: shuttle params.p1 through xmm/mm registers
    #     (3 + 3 = 6 at the end) ---
    prgm.reset()
    code.reset()
    lbl_ok = prgm.get_label("OK")
    code.add(x86.emms())
    code.add(x86.movd(xmm0, edi))
    code.add(x86.mov(ebx, edi))
    code.add(x86.cmp(ebx, 3))
    code.add(x86.je(lbl_ok))
    code.add(x86.movd(eax, xmm0))
    code.add(x86.cmp(eax, 3))
    code.add(x86.je(lbl_ok))
    code.add(x86.ud2())  # reached only if the parameter wasn't 3
    code.add(lbl_ok)
    code.add(x86.xor(eax, eax))
    code.add(x86.movd(xmm1, ebx))
    code.add(x86.paddq(xmm0, xmm1))
    code.add(x86.pextrw(ecx, xmm0, 0))
    code.add(x86.pxor(mm1, mm1))
    code.add(x86.pinsrw(mm1, ecx, 0))
    code.add(x86.movq2dq(xmm0, mm1))
    code.add(x86.movdq2q(mm2, xmm0))
    code.add(x86.movd(edx, mm2))
    code.add(x86.movd(xmm5, edx))
    code.add(x86.movd(ecx, xmm5))
    code.add(x86.pxor(xmm6, xmm6))
    code.add(x86.pinsrw(xmm6, ecx, 0))
    code.add(x86.movd(eax, xmm6))
    prgm.add(code)
    prgm.print_code(hex=True)
    ret = proc.execute(prgm, params=params, mode='int')
    print "Return main thread: %d" % (ret)
    assert (ret == 6)

    # --- xchg swaps eax/edx ---
    prgm.reset()
    code.reset()
    code.add(x86.mov(edx, 0x1234))
    code.add(x86.mov(eax, 0xFFFF))
    code.add(x86.xchg(edx, eax))
    prgm.add(code)
    prgm.print_code(hex=True)
    ret = proc.execute(prgm, params=params)
    print "ret:", ret
    assert (ret == 0x1234)

    # --- pushfq moves rsp by 8; measure the delta and restore rsp ---
    prgm.reset()
    code.reset()
    code.add(x86.mov(prgm.gp_return, rsp))
    code.add(x86.pushfq())
    code.add(x86.sub(prgm.gp_return, rsp))
    code.add(x86.add(rsp, prgm.gp_return))
    prgm.add(code)
    prgm.print_code(hex=True)
    ret = proc.execute(prgm, params=params)
    print "ret:", ret
    assert (ret == 8)

    # --- 128-bit load + pextrw/shl/or to reassemble the first four 16-bit
    #     words into one 64-bit value ---
    prgm.reset()
    code.reset()
    data = extarray.extarray('H', xrange(0, 16))
    r_128 = prgm.acquire_register(reg_type=XMMRegister)
    regs = prgm.acquire_registers(4)
    code.add(x86.mov(regs[0], data.buffer_info()[0]))
    code.add(x86.movaps(r_128, MemRef(regs[0], data_size=128)))
    code.add(x86.pextrw(prgm.gp_return, r_128, 0))
    code.add(x86.pextrw(regs[1], r_128, 1))
    code.add(x86.pextrw(regs[2], r_128, 2))
    code.add(x86.pextrw(regs[3], r_128, 3))
    code.add(x86.shl(regs[1], 16))
    code.add(x86.shl(regs[2], 32))
    code.add(x86.shl(regs[3], 48))
    code.add(x86.or_(prgm.gp_return, regs[1]))
    code.add(x86.or_(prgm.gp_return, regs[2]))
    code.add(x86.or_(prgm.gp_return, regs[3]))
    prgm.release_register(r_128)
    prgm.release_registers(regs)
    prgm.add(code)
    prgm.print_code()
    ret = proc.execute(prgm, mode='int')
    print "ret %x" % ret
    assert (ret == 0x0003000200010000)

    # --- util.load_float into xmm0, returned in fp mode ---
    prgm.reset()
    code.reset()
    util.load_float(code, xmm0, 3.14159)
    prgm.add(code)
    ret = proc.execute(prgm, mode='fp')
    print "ret", ret
    assert (ret - 3.14159 < 0.00001)
    return
def __init__(self):
    """Fresh MD5 context: zeroed chaining state, bit count, and block buffer."""
    # (attribute, extarray typecode, element count) for each context field:
    # state  - four 32-bit chaining words (A, B, C, D)
    # count  - two 32-bit words holding the 64-bit message bit count
    # buffer - 64-byte input block accumulator
    for attr, typecode, nelems in (('state', 'I', 4),
                                   ('count', 'I', 2),
                                   ('buffer', 'B', 64)):
        setattr(self, attr, extarray.extarray(typecode, nelems))
def TestSimpleKernel():
    """Build a tiny PTX kernel (out = a + b + 1), run it on device 0, and
    print the result copied back from GPU memory.

    Exercises Program parameters, register/variable allocation, the PTX ISA
    wrappers, cache_code, and host<->device copies via ptx_exec.
    """
    import corepy.arch.ptx.isa as isa
    import corepy.arch.ptx.types.registers as regs
    import time

    SIZE = 128  # NOTE(review): unused in this test
    proc = Processor(0)

    # build and run the kernel
    prgm = Program()
    code = prgm.get_stream()

    # Kernel parameters: output pointer and the two float addends.
    _mem = prgm.add_parameter('u64', name='_mem')
    _a = prgm.add_parameter('f32', name='_a')
    _b = prgm.add_parameter('f32', name='_b')

    # Manual-register alternative, kept for reference:
    # rd1 = regs.ptxVariable('reg', 'u64', 'rd1')
    # r1 = regs.ptxVariable('reg', 'f32', 'f1')
    # r2 = regs.ptxVariable('reg', 'f32', 'f2')
    # r3 = regs.ptxVariable('reg', 'f32', 'f3')
    # r4 = regs.ptxVariable('reg', 'f32', 'f4')
    # code.add('  .reg .u64 rd1;')
    # code.add('  .reg .f32 f1;')
    # code.add('  .reg .f32 f2;')
    # code.add('  .reg .f32 f3;')
    # code.add('  .reg .f32 f4;')
    rd1 = prgm.acquire_register('u64')
    r1 = prgm.acquire_register('f32')
    r2 = prgm.acquire_register('f32')
    r3 = prgm.acquire_register('f32')
    r4 = prgm.acquire_register('f32')
    v1 = prgm.add_variable('shared', 'f32')  # don't need this, but let's test add_variable

    #code.add(isa.add(r3, r2, r1))
    #code.add('add.f32 r3, r2, r1;')
    code.add(isa.ld('param', r1, regs.ptxAddress(_a)))
    code.add(isa.ld('param', r2, regs.ptxAddress(_b)))
    code.add(isa.add(r3, r2, r1))
    code.add(isa.add(r3, r3, 1.0))
    code.add(isa.mov(r4, r3))
    #temp = prgm.acquire_register('u32')
    #code.add(isa.cvt(temp, regs.tid.x))
    #code.add(isa.cvt(r4, temp, rnd='rn'))
    temp1 = prgm.acquire_register('u32')
    temp2 = prgm.acquire_register('u32')
    temp3 = prgm.acquire_register('u32')
    # NOTE(review): temp1/temp3 are never written before this multiply --
    # presumably only here to exercise isa.mul's encoding; verify.
    code.add(isa.mul(temp2, temp1, temp3, hlw='lo'))
    # Store the result through the output pointer parameter.
    code.add(isa.ld('param', rd1, regs.ptxAddress(_mem)))
    code.add(isa.st('global', regs.ptxAddress(rd1), r4))

    prgm.add(code)
    prgm.cache_code()

    # Hand-written PTX equivalent, kept for debugging:
    # prgm.render_string = (
    # '''
    # .version 1.4
    # .target sm_10, map_f64_to_f32
    # .entry _main (
    # .param .u64 __cudaparm__Z16addArrayOnDevicePfff_c,
    # .param .f32 __cudaparm__Z16addArrayOnDevicePfff_a,
    # .param .f32 __cudaparm__Z16addArrayOnDevicePfff_b)
    # {
    # .reg .u64 %rd<3>;
    # .reg .f32 %f<6>;
    # ld.param.f32 %f1, [__cudaparm__Z16addArrayOnDevicePfff_a];
    # ld.param.f32 %f2, [__cudaparm__Z16addArrayOnDevicePfff_b];
    # add.f32 %f3, %f1, %f2;
    # mov.f32 %f4, %f3;
    # ld.param.u64 %rd1, [__cudaparm__Z16addArrayOnDevicePfff_c];
    # st.global.f32 [%rd1+0], %f4;
    # exit;
    # } // _Z16addArrayOnDevicePfff
    # '''
    # )
    # prgm.render_code = ptx_exec.compile(prgm.render_string)

    ####
    # Allocate 4 bytes of device memory and a 1-float host array.
    #ptx_mem_addr = proc.alloc_device('f32', 1)
    ptx_mem_addr = ptx_exec.alloc_device(4)
    mem = extarray.extarray('f', 1)
    mem[0] = 5.0  # sentinel; overwritten by the kernel result

    a = 1.0
    b = 2.0
    print mem.buffer_info()[0]
    param_list = [ptx_mem_addr, a, b]
    print map(type, param_list)

    # # image, dev num, (x, y, w, h)
    #import pdb
    ptx_exec.copy_htod(ptx_mem_addr, mem.buffer_info()[0], 4)

    #kernel = prgm.render_string
    #module = ptx_exec.compile(kernel)
    t1 = time.time()
    # Launch with a 1x1x1 grid/block.
    #ptx_exec.run_stream(module, (1, 1, 1, 1, 1), (ptx_exec.u64, ptx_exec.f32, ptx_exec.f32), param_list)
    proc.execute(prgm, (1,1,1,1,1), param_list)
    t2 = time.time()
    # pdb.set_trace()
    print "run time", t2 - t1

    print "YY", mem.buffer_info()[0], ptx_mem_addr, type(mem.buffer_info()[0]), type(ptx_mem_addr)
    print int(ptx_mem_addr)
    print int(mem.buffer_info()[0])
    # Copy the kernel's output back to the host and show it.
    ptx_exec.copy_dtoh(mem.buffer_info()[0], ptx_mem_addr, 4)
    print param_list
    print mem
    ####
    return
def generate(self, results, patterns, r1_range, r2_range, max_init, max_n, size, n_spus=6): # Connect to the framebuffer #fb = cell_fb.framebuffer() #cell_fb.fb_open(fb) buffer = extarray.extarray('B', size[0] * size[1] * 4) buffer.clear() # Setup the range parameter array r1_inc = (r1_range[1] - r1_range[0]) / size[0] r2_inc = (r2_range[1] - r2_range[0]) / size[1] ranges = [0 for i in range(n_spus)] #a_ranges = [0 for i in range(n_spus)] # Slice and dice for parallel execution spu_slices = [[size[0], size[1] / n_spus] for ispu in range(n_spus)] spu_slices[-1][1] += size[1] % n_spus offset = 0.0 for ispu in range(n_spus): ranges[ispu] = extarray.extarray('f', [0.0] * 16) for i in range(4): ranges[ispu][ i] = r1_range[0] + float(i) * r1_inc # horizontal is simd ranges[ispu][4 + i] = r2_range[0] + offset ranges[ispu][8 + i] = r1_inc * 4.0 ranges[ispu][12 + i] = r2_inc # print ranges # Copy the paramters to aligned buffers #a_ranges[ispu] = synspu.aligned_memory(len(ranges[ispu]), typecode='I') #a_ranges[ispu].copy_to(ranges[ispu].buffer_info()[0], len(ranges[ispu])) offset += r2_inc * spu_slices[ispu][1] # Setup the pattern vector for pattern in patterns: if len(pattern) != len(patterns[0]): raise Exception('All patterns must be the same length') bits = [_pattern2vector(pattern) for pattern in patterns] #a_pattern = synspu.aligned_memory(len(bits[0]), typecode='I') pattern = extarray.extarray('I', len(bits[0])) # Create the instruction streams codes = [] n = len(patterns) * 10 offset = 0 for ispu in range(n_spus): renderer = FBRenderer() renderer.set_lsa(0x100) #renderer.set_addr(cell_fb.fb_addr(fb, 0) + offset) renderer.set_addr(buffer.buffer_info()[0] + offset) renderer.set_width(size[0]) #renderer.set_stride(fb.stride) renderer.set_stride(size[0]) ly_block = LyapunovBlock() ly_block.set_size(*spu_slices[i]) #ly_block.set_range(a_ranges[ispu]) ly_block.set_range(ranges[ispu]) #ly_block.set_pattern(a_pattern) ly_block.set_pattern(pattern) 
ly_block.set_max_init(max_init) ly_block.set_max_n(max_n) ly_block.set_renderer(renderer) code = synspu.InstructionStream() # code.set_debug(True) codes.append(code) #offset += spu_slices[i][1] * fb.stride * 4 offset += spu_slices[i][1] * size[0] * 4 # for i in spuiter.syn_range(code, n): ly_block.synthesize(code) # code.print_code() proc = synspu.Processor() #cell_fb.fb_clear(fb, 0) buffer.clear() import time ids = [0 for i in range(n_spus)] start = time.time() ipattern = 0 n_patterns = len(patterns) len_bits = len(bits[0]) pattern_inc = 1 for i in range(n): #a_pattern.copy_to(bits[ipattern].buffer_info()[0], len_bits) # TODO - better/faster for j in xrange(0, len_bits): pattern[j] = bits[ipattern][j] for ispu in range(n_spus): ids[ispu] = proc.execute(codes[ispu], async=True) for ispu in range(n_spus): proc.join(ids[ispu]) #cell_fb.fb_wait_vsync(fb) #cell_fb.fb_flip(fb, 0) # TODO - write buffer to image file #im = Image.frombuffer("RGBA", size, buffer.tostring(), "raw", "RGBA", 0, 1) imgbuf = Image.new("RGBA", size) arr = [(buffer[i + 3], buffer[i + 2], buffer[i + 1], 0xFF) for i in xrange(0, len(buffer), 4)] imgbuf.putdata(arr) imgbuf.save("lyapunov_%d.png" % ipattern) ipattern += pattern_inc if (ipattern == (n_patterns - 1)) or (ipattern == 0): pattern_inc *= -1 print ipattern stop = time.time() print '%.2f fps (%.6f)' % (float(n) / (stop - start), (stop - start)) #cell_fb.fb_close(fb) return
def load_float(code, reg, val):
    """Emit code loading the 32-bit IEEE-754 bit pattern of val into reg.

    The float is written into a one-element 'f' extarray, the array is
    reinterpreted in place as unsigned 32-bit words, and the resulting
    word is handed to load_word for emission into the stream.
    """
    bits = extarray.extarray('f', (val, ))
    bits.change_type('I')
    return load_word(code, reg, bits[0])