Beispiel #1
0
def fb_draw():
  code0 = synspu.InstructionStream()
  code1 = synspu.InstructionStream()  
  proc = synspu.Processor()

  fb = cell_fb.framebuffer()
  cell_fb.fb_open(fb)

  draw0 = FBDraw()
  draw0.set_buffers(cell_fb.fb_addr(fb, 0), cell_fb.fb_addr(fb, 1))
  draw0.set_stride(fb.stride)

  draw0.synthesize(code0)

  draw1 = FBDraw()
  draw1.set_buffers(cell_fb.fb_addr(fb, 1), cell_fb.fb_addr(fb, 0))cell_fb.fb_addr(fb, 0))
  draw1.set_stride(fb.stride)

  draw1.synthesize(code1)

  while True:

    # cell_fb.fb_clear(fb, 0)
    proc.execute(code0)
    cell_fb.fb_wait_vsync(fb)
    cell_fb.fb_flip(fb, 0)

    # cell_fb.fb_clear(fb, 1)
    proc.execute(code1)
    cell_fb.fb_wait_vsync(fb)
    cell_fb.fb_flip(fb, 1)
    

  cell_fb.fb_close(fb)
  return
Beispiel #2
0
def TestAll():
  import corepy.arch.spu.platform as env

  prgm = env.Program()
  code = prgm.get_stream()
  spu.set_active_code(code)

  a = code.prgm.acquire_register()
  b = code.prgm.acquire_register()
  c = code.prgm.acquire_register()
  
  shr(c, a, b)
  cneq(c, a, b)
  cge(c, a, b)
  cgei(c, a, 10)
  lt(c, a, b)
  lti(c, a, 10)  

  a_immediate(c, a, 10)
  a_immediate(c, a, 10000)  
  sf_immediate(c, a, 10000)
  

  prgm.add(code)
  prgm.print_code()

  proc = env.Processor()
  proc.execute(prgm)
  return
Beispiel #3
0
    def generate(self, results, pattern, r1_range, r2_range, max_init, max_n,
                 size):

        # Setup the range parameter array
        r1_inc = (r1_range[1] - r1_range[0]) / size[0]
        r2_inc = (r2_range[1] - r2_range[0]) / size[1]

        ranges = extarray.extarray('f', [0.0] * 16)
        for i in range(4):
            ranges[i] = r1_range[0]
            ranges[4 + i] = r2_range[0]
            ranges[8 + i] = r1_inc
            ranges[12 + i] = r2_inc

        # Setup the pattern vector
        bits = _pattern2vector(pattern)

        # Copy the paramters to aligned buffers
        #a_ranges = synspu.aligned_memory(len(ranges), typecode='I')
        #a_ranges.copy_to(ranges.buffer_info()[0], len(ranges))

        #a_pattern = synspu.aligned_memory(len(bits), typecode='I')
        #a_pattern.copy_to(bits.buffer_info()[0], len(bits))

        renderer = MailboxRenderer()
        ly_block = LyapunovBlock()

        ly_block.set_size(size[0], size[1])
        #ly_block.set_range(a_ranges)
        #ly_block.set_pattern(a_pattern)
        ly_block.set_range(ranges)
        ly_block.set_pattern(bits)
        ly_block.set_max_init(max_init)
        ly_block.set_max_n(max_n)
        ly_block.set_renderer(renderer)

        code = synspu.InstructionStream()
        ly_block.synthesize(code)

        proc = synspu.Processor()

        spe_id = proc.execute(code, async=True)

        for i in range(size[0] * size[1]):
            while synspu.spu_exec.stat_out_mbox(spe_id) == 0:
                pass
            print 'ly said: 0x%X' % (synspu.spu_exec.read_out_mbox(spe_id))

        proc.join(spe_id)

        # for x in range(size[0]):
        #   r2 = r2_range[0] + r2_inc
        #   print 'col:', x, r1, r2

        #   for y in range(size[1]):
        #     results[y, x] = lyapunov_point(pattern, r1, r2, max_init, max_n)
        #     r2 += r2_inc
        #   r1 += r1_inc

        return
Beispiel #4
0
def TestSetSlotValue():
    import corepy.arch.spu.platform as synspu
    import corepy.arch.spu.types.spu_types as var
    import corepy.arch.spu.lib.dma as dma

    prgm = synspu.Program()
    code = prgm.get_stream()
    proc = synspu.Processor()
    spu.set_active_code(code)
    a = var.SignedWord(0x11)
    b = var.SignedWord(0x13)
    r = var.SignedWord(0xFFFFFFFF)

    set_slot_value(code, r, 0, 0x10)
    set_slot_value(code, r, 1, a)
    set_slot_value(code, r, 2, 0x12)
    set_slot_value(code, r, 3, b)

    for i in range(4):
        spu.wrch(r, dma.SPU_WrOutMbox)
        spu.rotqbyi(r, r, 4)

    prgm.add(code)
    spe_id = proc.execute(prgm, async=True)

    for i in range(4):
        while synspu.spu_exec.stat_out_mbox(spe_id) == 0:
            pass
        result = synspu.spu_exec.read_out_mbox(spe_id)
        assert (result == (i + 0x10))

    proc.join(spe_id)

    return
Beispiel #5
0
def TestFloatArray():
    from corepy.arch.spu.platform import InstructionStream, Processor
    import corepy.arch.spu.lib.dma as dma
    import corepy.arch.spu.platform as env

    prgm = env.Program()
    code = prgm.get_stream()
    spu.set_active_code(code)

    x = SingleFloat([1.0, 2.0, 3.0, 4.0])
    y = SingleFloat([0.5, 1.5, 2.5, 3.5])
    sum = SingleFloat(0.0)

    sum.v = spu.fa.ex(x, y)

    r = SingleFloat([0.0, 0.0, 0.0, 0.0], reg=code.fp_return)

    for i in range(4):
        r.v = spu.fa.ex(sum, r)
        spu.rotqbyi(sum, sum, 4)

    prgm.add(code)
    proc = env.Processor()
    result = proc.execute(prgm, mode='fp')

    x_test = array.array('f', [1.0, 2.0, 3.0, 4.0])
    y_test = array.array('f', [0.5, 1.5, 2.5, 3.5])
    r_test = 0.0
    for i in range(4):
        r_test += x_test[i] + y_test[i]

    assert (result == r_test)

    return
Beispiel #6
0
def TestTanimoto():
  code = synspu.InstructionStream()
  proc = synspu.Processor()

  code.set_debug(True)
  
  x_regs = code.acquire_registers(2)
  y_regs = code.acquire_registers(2)
  result = code.acquire_register()

  tan = Tanimoto()

  tan.set_n_bits(256)
  tan.set_x_regs(x_regs)
  tan.set_y_regs(y_regs)
  tan.set_result_reg(result)
  
  tan.synthesize(code)

  code.print_code()

  proc.execute(code)

  # TODO: Do a real test, not just a synthesis test
  return
Beispiel #7
0
def TestDecrementer():

    code = synspu.InstructionStream()

    spu_write_decr(code, 0x7FFFFFFFl)
    spu_start_decr(code)

    # Get a message from the PPU
    spu_read_in_mbox(code)

    reg = spu_read_decr(code)
    spu_write_out_mbox(code, reg)
    spu_stop_decr(code)

    proc = synspu.Processor()

    spe_id = proc.execute(code, async=True)

    print 'test is sleeping for 1 second'
    time.sleep(1)
    synspu.spu_exec.write_in_mbox(spe_id, 0x44CAFE)

    while synspu.spu_exec.stat_out_mbox(spe_id) == 0:
        pass

    print 'spu said: 0x%X' % (synspu.spu_exec.read_out_mbox(spe_id))

    proc.join(spe_id)

    return
Beispiel #8
0
def TestVecIter(n_spus = 1):
  n = 1024
  a = extarray.extarray('I', range(n))
  
  buffer_size = 16

  if n_spus > 1:  prgm = env.ParallelProgram()
  else:           prgm = env.Program()
  code = prgm.get_stream()

  current = var.SignedWord(0, code)

  stream = stream_buffer(code, a.buffer_info()[0], n * 4, buffer_size, 0, save = True)  
  if n_spus > 1:  stream = parallel(stream)

  md = memory_desc('i', 0, buffer_size)

  for buffer in stream:
    for current in spu_vec_iter(code, md):
      current.v = current + current

  prgm.add(code)
  proc = env.Processor()
  r = proc.execute(prgm, n_spus = n_spus)

  for i in range(0, n):
    assert(a[i] == i + i)

  return
Beispiel #9
0
def TestMbox():

    code = synspu.InstructionStream()

    # Send a message to the PPU
    spu_write_out_mbox(code, 0xDEADBEEFl)

    # Get a message from the PPU
    reg = spu_read_in_mbox(code)

    # And send it back
    code.add(spu.wrch(reg, SPU_WrOutMbox))

    proc = synspu.Processor()

    spe_id = proc.execute(code, async=True)
    synspu.spu_exec.write_in_mbox(spe_id, 0x88CAFE)

    while synspu.spu_exec.stat_out_mbox(spe_id) == 0:
        pass
    print 'spe said: 0x%X' % (synspu.spu_exec.read_out_mbox(spe_id))
    while synspu.spu_exec.stat_out_mbox(spe_id) == 0:
        pass
    print 'spe said: 0x%X' % (synspu.spu_exec.read_out_mbox(spe_id))

    proc.join(spe_id)

    return
Beispiel #10
0
def TestStreamBufferDouble(n_spus = 1):
  n = 2048
  a = extarray.extarray('I', range(n))
  
  buffer_size = 32

  if n_spus > 1:  prgm = env.ParallelProgram()
  else:           prgm = env.Program()
  code = prgm.get_stream()

  current = var.SignedWord(0, code)

  addr = a.buffer_info()[0]
  n_bytes = n * 4
  #print 'addr 0x%(addr)x %(addr)d' % {'addr':a.buffer_info()[0]}, n_bytes, buffer_size

  stream = stream_buffer(code, addr, n_bytes, buffer_size, 0, buffer_mode='double', save = True)
  if n_spus > 1:  stream = parallel(stream)

  for buffer in stream:
    for lsa in syn_iter(code, buffer_size, 16):
      code.add(spu.lqx(current, lsa, buffer))
      current.v = current + current
      code.add(spu.stqx(current, lsa, buffer))

  prgm.add(code)
  proc = env.Processor()
  r = proc.execute(prgm, n_spus = n_spus)

  for i in range(0, len(a)):
    assert(a[i] == i + i)
  
  return
Beispiel #11
0
def test_stream_popc():
    code = synspu.InstructionStream()
    proc = synspu.Processor()

    bits = array.array('I', range(1024))
    for i in range(0, 1024, 4):
        bits[i] = 0x01010101  # 4 bits
        bits[i + 1] = 0xFFFFFFFF  # 32 bits
        bits[i + 2] = 0x10101010  # 4 bits
        bits[i + 3] = 0xFF0FF0F0  # 20 bits = 60 bits total


#    bits[i]   = 1
#    bits[i+1] = 2
#    bits[i+2] = 3
#    bits[i+3] = 4

#abits = synspu.aligned_memory(len(bits), typecode = 'I')
#abits.copy_to(bits.buffer_info()[0], len(bits))

    popc = syn_popc_stream()
    popc.set_stream_addr(bits.buffer_info()[0])
    popc.set_stream_size(len(bits))

    popc.synthesize(code)

    count = proc.execute(code, mode='mbox')
    print '-->', count
    assert (count == 60 * 1024 / 4)

    return
Beispiel #12
0
def TestAll():
    import corepy.arch.spu.platform as env

    code = env.InstructionStream()
    spu.set_active_code(code)

    a = code.acquire_register()
    b = code.acquire_register()
    c = code.acquire_register()

    shr(c, a, b)
    cneq(c, a, b)
    cge(c, a, b)
    cgei(c, a, 10)
    lt(c, a, b)
    lti(c, a, 10)

    a_immediate(c, a, 10)
    a_immediate(c, a, 10000)
    sf_immediate(c, a, 10000)

    code.print_code()
    proc = env.Processor()
    proc.execute(code)

    return
Beispiel #13
0
    def __init__(self):
        self.spus = spu_bank()
        self.n_spus = 0

        self.proc = synspu.Processor()
        self.prog = None

        return
Beispiel #14
0
def DoubleBufferExample(n_spus=6):
    """
  stream_buffer is an iterator that streams data from main memory to
  SPU local store in blocked buffers.  The buffers can be managed
  using single or double buffering semantics.  The induction variable
  returned by the buffer returns the address of the current buffer.

  Note: stream_buffer was designed before memory descriptors and has
        not been updated to support them yet.  The interface will
        change slightly when the memory classes are finalized.
  """
    n = 30000
    buffer_size = 16

    # Create an array and align the data
    a = extarray.extarray('I', range(n))

    addr = a.buffer_info()[0]
    n_bytes = n * 4

    if n_spus > 1: code = env.ParallelInstructionStream()
    else: code = env.InstructionStream()

    current = SignedWord(0, code)
    two = SignedWord(2, code)

    # Create the stream buffer, parallelizing it if using more than 1 SPU
    stream = stream_buffer(code,
                           addr,
                           n_bytes,
                           buffer_size,
                           0,
                           buffer_mode='double',
                           save=True)
    if n_spus > 1: stream = parallel(stream)

    # Loop over the buffers
    for buffer in stream:

        # Create an iterators that computes the address offsets within the
        # buffer.  Note: this will be supported by var/vec iters soon.
        for lsa in syn_iter(code, buffer_size, 16):
            code.add(spu.lqx(current, lsa, buffer))
            current.v = current - two
            code.add(spu.stqx(current, lsa, buffer))

    # Run the synthetic program and copy the results back to the array
    proc = env.Processor()
    r = proc.execute(code, n_spus=n_spus)

    for i in range(2, len(a)):
        try:
            assert (a[i] == i - 2)
        except:
            print 'DoubleBuffer error:', a[i], i - 2

    return
Beispiel #15
0
def TestFloats():
    import math

    code = synspu.InstructionStream()
    proc = synspu.Processor()

    spu.set_active_code(code)

    code.set_debug(True)

    # Create a simple SPU program that computes log for all values bettween
    # .01 and 10.0 with .01 increments

    start = .65
    stop = .75
    inc = .01

    sp_step = 0x3C23D70A
    # r_current = var.Word(0x3C23D70A) # .01 in single precision
    r_current = var.Word(0x3F266666)
    r_step = var.Word(sp_step)  # .01 in single precision
    result = var.Word(0)
    log = SPULog()

    log.setup(code)
    log.set_result(result)
    log.set_x(r_current)

    log_iter = syn_iter(code, int((stop - start) / inc))

    for i in log_iter:

        log.synthesize(code)
        spu.fa(r_current, r_current, r_step)
        spu.wrch(result, dma.SPU_WrOutMbox)

    # code.print_code()
    spe_id = proc.execute(code, mode='async')

    x = start
    for i in range(int((stop - start) / inc)):
        while synspu.spu_exec.stat_out_mbox(spe_id) == 0:
            pass
        slog = synspu.spu_exec.read_out_mbox(spe_id)
        print '%.3f 0x%08X  %.08f %.08f ' % (x, slog, _sp_to_float(slog),
                                             math.log(x, 2))
        x += inc

    proc.join(spe_id)

    return
Beispiel #16
0
def RunTest(test):
    import corepy.arch.spu.platform as env
    #from corepy.arch.spu.platform import InstructionStream, Processor

    prgm = env.Program()
    code = prgm.get_stream()
    spu.set_active_code(code)

    test()

    prgm.add(code)
    prgm.print_code()
    proc = env.Processor()
    proc.execute(prgm)
    return
Beispiel #17
0
def test_c_popc():
    code = synspu.NativeInstructionStream("spu_popc")
    proc = synspu.Processor()

    params = synspu.spu_exec.ExecParams()
    params.p7 = 0x01010101  # 4 bits
    params.p8 = 0xFFFFFFFF  # 32 bits
    params.p9 = 0x10101010  # 4 bits
    params.p10 = 0xFF0FF0F0  # 20 bits = 60 bits total

    count = proc.execute(code, mode='mbox', params=params)

    assert (count == 60)
    print 'test_syn_c passed'
    return
Beispiel #18
0
def TestStreamBufferSingle(n_spus = 1):
  n = 1024
  a = extarray.extarray('I', range(n))
  buffer_size = 128

  if n_spus > 1:  prgm = env.ParallelProgram()
  else:           prgm = env.Program()
  code = prgm.get_stream()
  
  current = var.SignedWord(0, code)

  addr = a.buffer_info()[0]
  stream = stream_buffer(code, addr, n * 4, buffer_size, 0, save = True)  
  if n_spus > 1:  stream = parallel(stream)

  #r_bufsize = code.acquire_register()
  #r_lsa = code.acquire_register()
  #r_current = code.acquire_register()
  
  for buffer in stream:
    #util.load_word(code, r_bufsize, buffer_size)
    #code.add(spu.il(r_lsa, 0))

    #loop = code.size()
    
    #code.add(spu.lqx(r_current, buffer, r_lsa))
    #code.add(spu.a(r_current, r_current, r_current))
    #code.add(spu.stqx(r_current, buffer, r_lsa))

    #code.add(spu.ai(r_bufsize, r_bufsize, -16))
    #code.add(spu.ai(r_lsa, r_lsa, 16))
    #code.add(spu.brnz(r_bufsize, loop - code.size()))

    for lsa in syn_iter(code, buffer_size, 16):
      code.add(spu.lqx(current, lsa, buffer))
      current.v = current + current
      #current.v = 5
      code.add(spu.stqx(current, lsa, buffer))
      

  prgm.add(code)
  proc = env.Processor()
  r = proc.execute(prgm, n_spus = n_spus)

  for i in range(0, n):
    assert(a[i] == i + i)
  
  return
Beispiel #19
0
def TestSaveBuffer1():
    import array

    code = synspu.InstructionStream()
    proc = synspu.Processor()

    code.set_debug(True)
    spu.set_active_code(code)

    n = 2**14
    data = array.array('I', range(n))
    #data = synspu.aligned_memory(n, typecode = 'I')
    #data.copy_to(data_array.buffer_info()[0], len(data_array))

    save_buffer = SaveBuffer()

    save_buffer.setup()
    save_buffer.init_ls_buffer(0, 128)
    save_buffer.init_mm_buffer(data.buffer_info()[0], n)

    value = var.SignedWord(0xCAFEBABE)

    for i in spuiter.syn_iter(code, n / 4):
        save_buffer.save_register(value)

    code.print_code()
    spe_id = proc.execute(code, mode='async')

    for i in range(n / 4):
        while synspu.spu_exec.stat_out_mbox(spe_id) == 0:
            pass
        print 'size: 0x%X' % (synspu.spu_exec.read_out_mbox(spe_id))

        while synspu.spu_exec.stat_out_mbox(spe_id) == 0:
            pass
        print 'offset: 0x%X' % (synspu.spu_exec.read_out_mbox(spe_id))

        while synspu.spu_exec.stat_out_mbox(spe_id) == 0:
            pass
        print 'test: 0x%X' % (synspu.spu_exec.read_out_mbox(spe_id))

    proc.join(spe_id)

    #data.copy_from(data_array.buffer_info()[0], len(data_array))

    print data[:10]
    return
Beispiel #20
0
def TestContinueLabel(n_spus=1):
    n = 1024
    a = extarray.extarray('I', range(n))

    buffer_size = 16

    if n_spus > 1: code = env.ParallelInstructionStream()
    else: code = env.InstructionStream()

    current = var.SignedWord(0, code)
    test = var.SignedWord(0, code)
    four = var.SignedWord(4, code)

    stream = stream_buffer(code,
                           a.buffer_info()[0],
                           n * 4,
                           buffer_size,
                           0,
                           save=True)
    if n_spus > 1: stream = parallel(stream)

    md = memory_desc('i', 0, buffer_size)
    lsa_iter = spu_vec_iter(code, md)

    for buffer in stream:
        for current in lsa_iter:
            current.v = current + current

            test.v = (current == four)
            code.add(spu.gbb(test, test))
            #lbl_continue = code.add(spu.stop(0xC)) - 1 # Place holder for the continue
            #lsa_iter.add_continue(code, 0, lambda lbl, reg = test.reg: spu.brz(reg, lbl))
            code.add(spu.brz(test.reg, lsa_iter.continue_label))
            current.v = current + current

        #lsa_iter.add_continue(code, lbl_continue, lambda next, reg = test.reg: spu.brz(reg, next))

    proc = env.Processor()
    r = proc.execute(code, n_spus=n_spus)

    for i in range(0, n):
        if i >= 4:
            assert (a[i] == i + i)
        else:
            #print a[i]
            assert (a[i] == i * 4)
    return
Beispiel #21
0
def test_syn(kernel):
    code = synspu.InstructionStream()
    proc = synspu.Processor()

    popc = kernel()
    popc.synthesize(code)

    params = synspu.spu_exec.ExecParams()
    params.p7 = 0x01010101  # 4 bits
    params.p8 = 0xFFFFFFFF  # 32 bits
    params.p9 = 0x10101010  # 4 bits
    params.p10 = 0xFF0FF0F0  # 20 bits = 60 bits total

    count = proc.execute(code, mode='mbox', params=params)
    assert (count == 60)

    return
Beispiel #22
0
def TestLog():
    code = synspu.InstructionStream()
    proc = synspu.Processor()

    spu.set_active_code(code)
    # Create a simple SPU program that computes log for 10 values and
    # sends the result back using the mailbox

    log = SPULog()

    values = []
    result = code.acquire_register()

    N = 10

    x = 1
    for i in range(N):
        val = var.Word(x)
        spu.cuflt(val, val, 155)
        values.append(val)
        x = x * 10

    log.setup(code)
    log.set_result(result)

    for i in range(N):

        log.set_x(values[i])
        log.synthesize(code)

        spu.wrch(result, dma.SPU_WrOutMbox)

    spe_id = proc.execute(code, mode='async')

    x = 1
    for i in range(N):
        while synspu.spu_exec.stat_out_mbox(spe_id) == 0:
            pass
        print 'log said: 0x%08X  (%d)' % (
            synspu.spu_exec.read_out_mbox(spe_id), x)
        x = x * 10

    proc.join(spe_id)

    return
Beispiel #23
0
def SimpleSPU():
    """
  A very simple SPU that computes 11 + 31 and returns 0xA on success.
  """
    prgm = env.Program()
    code = prgm.get_stream()
    proc = env.Processor()

    spu.set_active_code(code)

    # Acquire two registers
    #x    = code.acquire_register()
    x = code.gp_return
    test = prgm.acquire_register(reg_name=55)

    spu.xor(x, x, x)  # zero x
    spu.ai(x, x, 11)  # x = x + 11
    spu.ai(x, x, 31)  # x = x + 31

    spu.ceqi(test, x, 42)  # test = (x == 42)

    # If test is false (all 0s), skip the stop(0x100A) instruction
    spu.brz(test, 2)
    spu.stop(0x100A)
    spu.stop(0x100B)

    prgm.add(code)
    prgm.print_code(hex=True)
    r = proc.execute(prgm, mode='int', stop=True, debug=True)
    assert (r[0] == 42)
    assert (r[1] == 0x100A)

    prgm = env.Program()
    code = prgm.get_stream()
    spu.set_active_code(code)

    util.load_float(code, code.fp_return, 3.14)

    prgm.add(code)
    prgm.print_code(hex=True)
    r = proc.execute(prgm, mode='fp')
    print r
    return
Beispiel #24
0
def TestFloatScalar():
    from corepy.arch.spu.platform import InstructionStream, Processor
    import corepy.arch.spu.lib.dma as dma
    import corepy.arch.spu.platform as env

    prgm = env.Program()
    code = prgm.get_stream()
    spu.set_active_code(code)

    x = SingleFloat(1.0)
    y = SingleFloat(2.0)
    r = SingleFloat(0.0, reg=code.fp_return)

    r.v = spu.fa.ex(x, y)

    prgm.add(code)
    proc = env.Processor()
    result = proc.execute(prgm, mode='fp')
    assert (result == (1.0 + 2.0))

    return
Beispiel #25
0
def TestSignal():

    code = synspu.InstructionStream()

    # Get a signal from the PPU
    reg = spu_read_signal1(code)

    # And send it back
    code.add(spu.wrch(reg, SPU_WrOutMbox))

    proc = synspu.Processor()

    spe_id = proc.execute(code, async=True)
    synspu.spu_exec.write_signal(spe_id, 1, 0xCAFEBABEl)

    while synspu.spu_exec.stat_out_mbox(spe_id) == 0:
        pass

    print 'sig said: 0x%X' % (synspu.spu_exec.read_out_mbox(spe_id))

    proc.join(spe_id)

    return
Beispiel #26
0
import corepy.lib.extarray as extarray
import corepy.arch.spu.isa as spu
import corepy.arch.spu.platform as env
import corepy.arch.spu.lib.dma as dma
from corepy.arch.spu.lib.util import load_word

import time

if __name__ == '__main__':
    ITERS = 500000
    #ITERS = 15

    prgm = env.Program()
    code = prgm.get_stream()
    proc = env.Processor()
    spu.set_active_code(code)
    psmap = extarray.extarray('I', 131072 / 4)
    data = extarray.extarray('I', range(0, 16))

    r_sum = prgm.gp_return
    r_cnt = prgm.acquire_register()

    spu.xor(r_sum, r_sum, r_sum)
    load_word(code, r_cnt, ITERS)

    lbl_loop = prgm.get_label("loop")
    code.add(lbl_loop)

    reg = dma.spu_read_in_mbox(code)
Beispiel #27
0
def TestSPUIter():
  size = 32
  data = extarray.extarray('I', range(size))
  prgm = env.Program()
  code = prgm.get_stream()

  r_ea_data = prgm.acquire_register()
  r_ls_data = prgm.acquire_register()
  r_size    = prgm.acquire_register()
  r_tag     = prgm.acquire_register()  

  #print 'array ea: %X' % (data.buffer_info()[0])
  #print 'r_zero = %s, ea_data = %s, ls_data = %s, r_size = %s, r_tag = %s' % (
  #  str(code.r_zero), str(r_ea_data), str(r_ls_data), str(r_size), str(r_tag))
  
  # Load the effective address
  util.load_word(code, r_ea_data, data.buffer_info()[0])

  # Load the size
  util.load_word(code, r_size, size * 4)

  # Load the tag
  code.add(spu.ai(r_tag, code.r_zero, 12))

  # Load the lsa
  code.add(spu.ai(r_ls_data, code.r_zero, 0))

  # Load the data into address 0
  dma.mfc_get(code, r_ls_data, r_ea_data, r_size, r_tag)

  # Set the tag bit to 12
  dma.mfc_write_tag_mask(code, 1<<12);

  # Wait for the transfer to complete
  dma.mfc_read_tag_status_all(code);

  # Increment the data values by 1 using an unrolled loop (no branches)
  # r_current = code.acquire_register()
  current = var.SignedWord(0, code)
  
  # Use an SPU iter
  for lsa in syn_iter(code, size * 4, 16):
    code.add(spu.lqx(current, code.r_zero, lsa))
    # code.add(spu.ai(1, r_current, r_current))
    current.v = current + current
    code.add(spu.stqx(current, code.r_zero, lsa))    

  # code.prgm.release_register(r_current)
  #current.release_register(code)
  
  # Store the values back to main memory

  # Load the tag
  code.add(spu.ai(r_tag, code.r_zero, 13))

  # Load the data into address 0
  dma.mfc_put(code, r_ls_data, r_ea_data, r_size, r_tag)

  # Set the tag bit to 12
  dma.mfc_write_tag_mask(code, 1<<13);

  # Wait for the transfer to complete
  dma.mfc_read_tag_status_all(code);

  # Cleanup
  prgm.release_register(r_ea_data)
  prgm.release_register(r_ls_data)  
  prgm.release_register(r_size)
  prgm.release_register(r_tag)  

  # Stop for debugging
  # code.add(spu.stop(0xA))

  # Execute the code
  prgm.add(code)
  proc = env.Processor()
  r = proc.execute(prgm)

  for i in range(0, size):
    assert(data[i] == i + i)

  return
Beispiel #28
0
def TestSPUParallelIter(data, size, n_spus = 6, buffer_size = 16, run_code = True):
  import time
  # n_spus = 8
  # buffer_size = 16 # 16 ints/buffer
  # n_buffers   = 4  # 4 buffers/spu
  # n_buffers = size / buffer_size
  # size = buffer_size * n_buffers * n_spus
  # data = array.array('I', range(size + 2))

  #data = env.aligned_memory(n, typecode = 'I')
  #data.copy_to(data_array.buffer_info()[0], len(data_array))


  # print 'Data align: 0x%X, %d' % (data.buffer_info()[0], data.buffer_info()[0] % 16)

  code = env.ParallelInstructionStream()
  # code = env.InstructionStream()

  r_zero    = code.acquire_register()
  r_ea_data = code.acquire_register()
  r_ls_data = code.acquire_register()
  r_size    = code.acquire_register()
  r_tag     = code.acquire_register()  

  # Load zero
  util.load_word(code, r_zero, 0)

  # print 'array ea: 0x%X 0x%X' % (data.buffer_info()[0], long(data.buffer_info()[0]))
  # print 'r_zero = %d, ea_data = %d, ls_data = %d, r_size = %d, r_tag = %d' % (
  #   r_zero, r_ea_data, r_ls_data, r_size, r_tag)

  # Load the effective address
  if data.buffer_info()[0] % 16 == 0:
    util.load_word(code, r_ea_data, data.buffer_info()[0])
  else: 
    util.load_word(code, r_ea_data, data.buffer_info()[0] + 8)

  ea_start = data.buffer_info()[0]
  # Iterate over each buffer
  for ea in parallel(syn_range(code, ea_start, ea_start + size * 4 , buffer_size * 4)):
    # ea = var.SignedWord(code = code, reg = r_ea_data)
  
    # print 'n_iters:', size / buffer_size
    # for i in syn_range(code, size / buffer_size):

    # code.add(spu.stop(0xB))
  
    # Load the size
    util.load_word(code, r_size, buffer_size * 4)

    # Load the tag
    code.add(spu.ai(r_tag, r_zero, 12))

    # Load the lsa
    code.add(spu.ai(r_ls_data, r_zero, 0))

    # Load the data into address 0
    dma.mfc_get(code, r_ls_data, ea, r_size, r_tag)

    # Set the tag bit to 12
    dma.mfc_write_tag_mask(code, 1<<12);

    # Wait for the transfer to complete
    dma.mfc_read_tag_status_all(code);

    # Increment the data values by 1 using an unrolled loop (no branches)
    # r_current = code.acquire_register()
    current = var.SignedWord(0, code)

    count = var.SignedWord(0, code)
    # Use an SPU iter
    for lsa in syn_iter(code, buffer_size * 4, 16):
      code.add(spu.lqx(current, r_zero, lsa))
      # code.add(spu.ai(1, r_current, r_current))
      current.v = current + current
      code.add(spu.stqx(current, r_zero, lsa))    
      count.v = count + 1

    code.add(spu.stqx(count, r_zero, 0))
  
    # code.release_register(r_current)
    current.release_registers(code)

    # Store the values back to main memory

    # Load the tag
    code.add(spu.ai(r_tag, r_zero, 13))

    # Load the data into address 0
    dma.mfc_put(code, r_ls_data, ea.reg, r_size, r_tag)

    # Set the tag bit to 13
    dma.mfc_write_tag_mask(code, 1<<13);

    # Wait for the transfer to complete
    dma.mfc_read_tag_status_all(code);


    # code.add(spu.stop(0xB))

    # Update ea
    # ea.v = ea + (buffer_size * 4)
  # /for ea address 


  # Cleanup
  code.release_register(r_zero)
  code.release_register(r_ea_data)
  code.release_register(r_ls_data)  
  code.release_register(r_size)
  code.release_register(r_tag)  

  if not run_code:
    return code

  # Stop for debugging
  # code.add(spu.stop(0xA))

  # Execute the code
  proc = env.Processor()
  #data.copy_from(data_array.buffer_info()[0], len(data_array))  
  def print_blocks():
    for i in range(0, size, buffer_size):
      # print data[i:(i + buffer_size)]
      print data[i + buffer_size],
    print '' 
  
  # print_blocks()
  s = time.time()
  r = proc.execute(code, n_spus = n_spus)
  # r = proc.execute(code)
  t = time.time() - s
  # print_blocks()

  return t
Beispiel #29
0
def TestTanimotoBlock(n_vecs = 4):
  code = synspu.InstructionStream()
  proc = synspu.Processor()

  code.set_debug(True)
  spu.set_active_code(code)
  
  tb = TanimotoBlock()
  ls_save = LocalSave()
  mm_save = MemorySave()

  code.set_debug(True)

  # Input block parameters
  m = 128
  n = 64
  # n_vecs = 9
  n_bits = 128 * n_vecs

  # Main memory results buffer
  # max_results = 2**16
  max_results = 16384
  words_per_result = 4

  mm_results_data = array.array('I', [12 for i in range(max_results * words_per_result)])
  #mm_results_buffer = synspu.aligned_memory(max_results * words_per_result, typecode = 'I')
  # mm_results_buffer.copy_to(mm_results_data.buffer_info()[0], len(mm_results_data))

  mm_results = spuiter.memory_desc('I')
  #mm_results.from_array(mm_results_buffer)
  mm_results.from_array(mm_results_data)

  mm_save.set_md_save_buffer(mm_results)
    
  # Local Results buffer
  buffer_size = var.SignedWord(16384)
  buffer_addr = var.SignedWord(m * n * n_vecs * 4)
  ls_results = spuiter.memory_desc('B')
  ls_results.set_size_reg(buffer_size)
  ls_results.set_addr_reg(buffer_addr)

  ls_save.set_md_results(ls_results)
  ls_save.set_mm_save_op(mm_save)

  # Setup the TanimotoBlock class
  tb.set_n_bits(n_bits)
  tb.set_block_size(m, n)

  tb.set_x_addr(0)
  tb.set_y_addr(m * n_vecs * 16)
  tb.set_save_op(ls_save)

  # Main test loop
  n_samples = 10000
  for samples in spuiter.syn_iter(code, n_samples):
    tb.synthesize(code)

  spu.wrch(buffer_size, dma.SPU_WrOutMbox)
  
  spu.stop(0x2000) 

  # "Function" Calls
  ls_save.block()
  mm_save.block()

  # code.print_code()
  start = time.time()
  spe_id = proc.execute(code, async=True)
  
  while synspu.spu_exec.stat_out_mbox(spe_id) == 0: pass
  # print 'tb said: 0x%X' % (synspu.spu_exec.read_out_mbox(spe_id))
  stop = time.time()

  # mm_results_buffer.copy_from(mm_results_data.buffer_info()[0], len(mm_results_data))
  
  proc.join(spe_id)
  total = stop - start
  bits_sec = (m * n * n_bits * n_samples) / total / 1e9
  ops_per_compare = 48 * 4 + 8  # 48 SIMD instructions, 8 scalar
  insts_per_compare = 56
  gops = (m * n * n_vecs * n_samples * ops_per_compare ) / total / 1e9
  ginsts = (m * n * n_vecs * n_samples * insts_per_compare ) / total / 1e9  
  print '%.6f sec, %.2f Gbits/sec, %.2f GOps, %.2f GInsts, %d insts' % (
    total, bits_sec, gops, ginsts, code.size())
  return
Beispiel #30
0
    def generate(self,
                 results,
                 patterns,
                 r1_range,
                 r2_range,
                 max_init,
                 max_n,
                 size,
                 n_spus=6):
        # Connect to the framebuffer
        #fb = cell_fb.framebuffer()
        #cell_fb.fb_open(fb)
        buffer = extarray.extarray('B', size[0] * size[1] * 4)
        buffer.clear()

        # Setup the range parameter array
        r1_inc = (r1_range[1] - r1_range[0]) / size[0]
        r2_inc = (r2_range[1] - r2_range[0]) / size[1]

        ranges = [0 for i in range(n_spus)]
        #a_ranges = [0 for i in range(n_spus)]

        # Slice and dice for parallel execution
        spu_slices = [[size[0], size[1] / n_spus] for ispu in range(n_spus)]
        spu_slices[-1][1] += size[1] % n_spus

        offset = 0.0
        for ispu in range(n_spus):
            ranges[ispu] = extarray.extarray('f', [0.0] * 16)

            for i in range(4):
                ranges[ispu][
                    i] = r1_range[0] + float(i) * r1_inc  # horizontal is simd
                ranges[ispu][4 + i] = r2_range[0] + offset
                ranges[ispu][8 + i] = r1_inc * 4.0
                ranges[ispu][12 + i] = r2_inc
            # print ranges

            # Copy the paramters to aligned buffers
            #a_ranges[ispu] = synspu.aligned_memory(len(ranges[ispu]), typecode='I')
            #a_ranges[ispu].copy_to(ranges[ispu].buffer_info()[0], len(ranges[ispu]))

            offset += r2_inc * spu_slices[ispu][1]

        # Setup the pattern vector
        for pattern in patterns:
            if len(pattern) != len(patterns[0]):
                raise Exception('All patterns must be the same length')

        bits = [_pattern2vector(pattern) for pattern in patterns]
        #a_pattern = synspu.aligned_memory(len(bits[0]), typecode='I')
        pattern = extarray.extarray('I', len(bits[0]))

        # Create the instruction streams
        codes = []

        n = len(patterns) * 10
        offset = 0
        for ispu in range(n_spus):
            renderer = FBRenderer()
            renderer.set_lsa(0x100)
            #renderer.set_addr(cell_fb.fb_addr(fb, 0) + offset)
            renderer.set_addr(buffer.buffer_info()[0] + offset)
            renderer.set_width(size[0])
            #renderer.set_stride(fb.stride)
            renderer.set_stride(size[0])

            ly_block = LyapunovBlock()

            ly_block.set_size(*spu_slices[i])
            #ly_block.set_range(a_ranges[ispu])
            ly_block.set_range(ranges[ispu])
            #ly_block.set_pattern(a_pattern)
            ly_block.set_pattern(pattern)
            ly_block.set_max_init(max_init)
            ly_block.set_max_n(max_n)
            ly_block.set_renderer(renderer)

            code = synspu.InstructionStream()
            # code.set_debug(True)
            codes.append(code)
            #offset += spu_slices[i][1] * fb.stride * 4
            offset += spu_slices[i][1] * size[0] * 4

            # for i in spuiter.syn_range(code, n):
            ly_block.synthesize(code)

        # code.print_code()
        proc = synspu.Processor()

        #cell_fb.fb_clear(fb, 0)
        buffer.clear()

        import time
        ids = [0 for i in range(n_spus)]
        start = time.time()

        ipattern = 0
        n_patterns = len(patterns)
        len_bits = len(bits[0])
        pattern_inc = 1

        for i in range(n):
            #a_pattern.copy_to(bits[ipattern].buffer_info()[0], len_bits)
            # TODO - better/faster
            for j in xrange(0, len_bits):
                pattern[j] = bits[ipattern][j]

            for ispu in range(n_spus):
                ids[ispu] = proc.execute(codes[ispu], async=True)

            for ispu in range(n_spus):
                proc.join(ids[ispu])

            #cell_fb.fb_wait_vsync(fb)
            #cell_fb.fb_flip(fb, 0)
            # TODO - write buffer to image file
            #im = Image.frombuffer("RGBA", size, buffer.tostring(), "raw", "RGBA", 0, 1)
            imgbuf = Image.new("RGBA", size)

            arr = [(buffer[i + 3], buffer[i + 2], buffer[i + 1], 0xFF)
                   for i in xrange(0, len(buffer), 4)]
            imgbuf.putdata(arr)
            imgbuf.save("lyapunov_%d.png" % ipattern)

            ipattern += pattern_inc
            if (ipattern == (n_patterns - 1)) or (ipattern == 0):
                pattern_inc *= -1

            print ipattern

        stop = time.time()

        print '%.2f fps (%.6f)' % (float(n) / (stop - start), (stop - start))
        #cell_fb.fb_close(fb)

        return