Example #1
0
  def synthesize(self, code):
    """
    Emit a population-count program for the vector held in register 5.

    The input is copied out of register 5, run through self.popc and
    self.reduce_word, and the reduced value is written to the SPU outbound
    mailbox.  Instructions are emitted into `code`.  All registers acquired
    here are released and the previously active instruction stream is
    restored before returning, even if synthesis raises.
    """
    old_code = spu.get_active_code()
    spu.set_active_code(code)

    # Working registers: running count, reduced result, copy of the input
    count  = code.acquire_register()
    result = code.acquire_register()
    x      = code.acquire_register()

    try:
      # 'Load' the input vector x from register 5
      spu.ai(x, 5, 0)

      # Zero count and result
      spu.xor(count, count, count)
      spu.xor(result, result, result)

      # Inline the popc and reduce operations
      self.popc(count, x)
      self.reduce_word(result, count)

      # Send the result to the caller
      spu.wrch(result, dma.SPU_WrOutMbox)
    finally:
      # Release every register acquired above (previously only x was
      # released, leaking count and result) and restore the caller's stream.
      code.release_register(x)
      code.release_register(result)
      code.release_register(count)
      spu.set_active_code(old_code)
    return
Example #2
0
  def synthesize(self, code):
    """
    Emit a streaming population count.

    Walks the stream described by self.stream_addr / self.stream_size
    buffer by buffer, applies syn_popc_var.popc to each vector yielded for
    the buffer, reduces the accumulated count and writes it to the SPU
    outbound mailbox.  The previously active stream is restored on exit.
    """
    saved_code = spu.get_active_code()
    spu.set_active_code(code)

    chunks = spuiter.stream_buffer(
      code, self.stream_addr, self.stream_size * 4, self.buffer_size, self.lsa)
    local_words = spuiter.memory_desc('I', self.lsa, self.buffer_size / 4)
    counter = syn_popc_var()

    # Variables are created in this order: vector, running count, total
    vec = var.Word(0)
    running = var.Word(0)
    grand_total = var.Word(0)

    for chunk_addr in chunks:
      for vec in spuiter.spu_vec_iter(code, local_words, addr_reg = chunk_addr):
        counter.popc(running, vec)

    counter.reduce_word(grand_total, running)

    # Report the total through the outbound mailbox
    spu.wrch(grand_total, dma.SPU_WrOutMbox)

    spu.set_active_code(saved_code)
    return
Example #3
0
def TestInt():
    """
    Run a short program and check the pair returned by execute().

    Registers 13 and 20 are acquired by name and updated with `ai`
    instructions; the program ends with stop(0x200D).  The test asserts that
    the first returned element is 0 and the second is 0x200D.
    """
    program = Program()
    stream = program.get_stream()
    processor = Processor()

    spu.set_active_code(stream)

    reg13 = program.acquire_register(reg_name=13)
    reg20 = program.acquire_register(reg_name=20)

    spu.ai(reg20, reg20, 13)
    # Five identical ai instructions on register 13
    for _ in range(5):
        spu.ai(reg13, reg13, 13)

    spu.stop(0x200D)

    program += stream
    outcome = processor.execute(program, stop=True)

    assert outcome[0] == 0
    assert outcome[1] == 0x200D
    return
Example #4
0
def TestInt():
  """
  Run a short program and check the pair returned by execute().

  Registers 13 and 20 are acquired by name and updated with `ai`
  instructions; the program ends with stop(0x200D).  The test asserts that
  the first returned element is 0 and the second is 0x200D.
  """
  program = Program()
  stream = program.get_stream()
  processor = Processor()

  spu.set_active_code(stream)

  reg13 = program.acquire_register(reg_name = 13)
  reg20 = program.acquire_register(reg_name = 20)

  spu.ai(reg20, reg20, 13)
  # Five identical ai instructions on register 13
  for _ in range(5):
    spu.ai(reg13, reg13, 13)

  spu.stop(0x200D)

  program += stream
  outcome = processor.execute(program, stop = True)

  assert outcome[0] == 0
  assert outcome[1] == 0x200D
  return
Example #5
0
    def synthesize(self, code):
        """
        Render a vector with 4 pixels.

        Transforms self.result in place, stores it using self.x_offset and
        self.lsa, and advances self.x_offset by 16.  The previously active
        instruction stream is restored on exit.

        Raises Exception when x_offset, result or one has not been set.
        """
        saved_code = spu.get_active_code()
        spu.set_active_code(code)

        if self.x_offset is None:
            raise Exception('Please call setup')
        if self.result is None:
            raise Exception('Please set result')
        if self.one is None:
            raise Exception('Please set one')

        px = self.result

        # Negate the result (0 - result), then subtract one, to transform
        # (-1,-oo) into (0,oo)
        px.v = spu.fs.ex(0, px)
        px.v = spu.fs.ex(px, self.one)

        # Convert to an unsigned int, scaling by 2^4 (173 - 169 == 4) so
        # that values between 0 and 16 land in the gradient; values outside
        # [0,16] are 0 or FF
        px.v = spu.cfltu.ex(px, 169)

        # Shuffle the first two bytes into the RGB positions and set alpha
        # to 0xFF
        px.v = spu.shufb.ex(px, self.ff, self.uint2rgba)

        # Store the pixels, then move the offset on by one quadword
        spu.stqd(px, self.x_offset, self.lsa >> 4)
        spu.ai(self.x_offset, self.x_offset, 16)

        spu.set_active_code(saved_code)
        return
Example #6
0
def TestAll():
    """
    Smoke-test the helper instructions by synthesizing each of them.

    Emits shr, cneq, cge, cgei, lt, lti, a_immediate and sf_immediate into a
    fresh instruction stream, prints the generated code and executes it.
    No results are checked.
    """
    import corepy.arch.spu.platform as env

    stream = env.InstructionStream()
    spu.set_active_code(stream)

    lhs, rhs, dst = [stream.acquire_register() for _ in range(3)]

    # Register/register and register/immediate helpers
    shr(dst, lhs, rhs)
    cneq(dst, lhs, rhs)
    cge(dst, lhs, rhs)
    cgei(dst, lhs, 10)
    lt(dst, lhs, rhs)
    lti(dst, lhs, 10)

    # Immediate add/subtract helpers with a small and a large constant
    a_immediate(dst, lhs, 10)
    a_immediate(dst, lhs, 10000)
    sf_immediate(dst, lhs, 10000)

    stream.print_code()
    env.Processor().execute(stream)

    return
Example #7
0
def TestSetSlotValue():
  import corepy.arch.spu.platform as synspu
  import corepy.arch.spu.types.spu_types as var
  import corepy.arch.spu.lib.dma as dma

  prgm = synspu.Program()
  code = prgm.get_stream()
  proc = synspu.Processor()
  spu.set_active_code(code)
  a = var.SignedWord(0x11)
  b = var.SignedWord(0x13)
  r = var.SignedWord(0xFFFFFFFF)

  set_slot_value(code, r, 0, 0x10)
  set_slot_value(code, r, 1, a)
  set_slot_value(code, r, 2, 0x12)
  set_slot_value(code, r, 3, b)

  for i in range(4):
    spu.wrch(r, dma.SPU_WrOutMbox)
    spu.rotqbyi(r, r, 4)

  prgm.add(code)
  spe_id = proc.execute(prgm, async = True)

  for i in range(4):
    while synspu.spu_exec.stat_out_mbox(spe_id) == 0: pass
    result = synspu.spu_exec.read_out_mbox(spe_id)
    assert(result == (i + 0x10))

  proc.join(spe_id)

  return
Example #8
0
    def synthesize(self, code):
        """
        Emit a population-count program for the vector held in register 5.

        The input is copied out of register 5, run through self.popc and
        self.reduce_word, and the reduced value is written to the SPU
        outbound mailbox.  Instructions are emitted into `code`.  All
        registers acquired here are released and the previously active
        instruction stream is restored before returning, even if synthesis
        raises.
        """
        old_code = spu.get_active_code()
        spu.set_active_code(code)

        # Working registers: running count, reduced result, copy of the input
        count = code.acquire_register()
        result = code.acquire_register()
        x = code.acquire_register()

        try:
            # 'Load' the input vector x from register 5
            spu.ai(x, 5, 0)

            # Zero count and result
            spu.xor(count, count, count)
            spu.xor(result, result, result)

            # Inline the popc and reduce operations
            self.popc(count, x)
            self.reduce_word(result, count)

            # Send the result to the caller
            spu.wrch(result, dma.SPU_WrOutMbox)
        finally:
            # Release every register acquired above (previously only x was
            # released, leaking count and result) and restore the caller's
            # stream.
            code.release_register(x)
            code.release_register(result)
            code.release_register(count)
            spu.set_active_code(old_code)
        return
Example #9
0
def TestSetSlotValue():
    import corepy.arch.spu.platform as synspu
    import corepy.arch.spu.types.spu_types as var
    import corepy.arch.spu.lib.dma as dma

    prgm = synspu.Program()
    code = prgm.get_stream()
    proc = synspu.Processor()
    spu.set_active_code(code)
    a = var.SignedWord(0x11)
    b = var.SignedWord(0x13)
    r = var.SignedWord(0xFFFFFFFF)

    set_slot_value(code, r, 0, 0x10)
    set_slot_value(code, r, 1, a)
    set_slot_value(code, r, 2, 0x12)
    set_slot_value(code, r, 3, b)

    for i in range(4):
        spu.wrch(r, dma.SPU_WrOutMbox)
        spu.rotqbyi(r, r, 4)

    prgm.add(code)
    spe_id = proc.execute(prgm, async=True)

    for i in range(4):
        while synspu.spu_exec.stat_out_mbox(spe_id) == 0:
            pass
        result = synspu.spu_exec.read_out_mbox(spe_id)
        assert (result == (i + 0x10))

    proc.join(spe_id)

    return
Example #10
0
def TestFloatArray():
    """
    Add two 4-element float vectors on the SPU and check the summed result.

    The element-wise sum of x and y is accumulated into the register
    code.fp_return over four add-then-rotate steps, and the value returned
    by execute(mode='fp') is compared with the same sum computed on the
    host.
    """
    from corepy.arch.spu.platform import InstructionStream, Processor
    import corepy.arch.spu.lib.dma as dma
    import corepy.arch.spu.platform as env

    program = env.Program()
    stream = program.get_stream()
    spu.set_active_code(stream)

    x = SingleFloat([1.0, 2.0, 3.0, 4.0])
    y = SingleFloat([0.5, 1.5, 2.5, 3.5])
    pair_sum = SingleFloat(0.0)

    pair_sum.v = spu.fa.ex(x, y)

    acc = SingleFloat([0.0, 0.0, 0.0, 0.0], reg=stream.fp_return)

    # Add, then rotate the sum vector by 4 bytes, four times
    for _ in range(4):
        acc.v = spu.fa.ex(pair_sum, acc)
        spu.rotqbyi(pair_sum, pair_sum, 4)

    program.add(stream)
    processor = env.Processor()
    result = processor.execute(program, mode='fp')

    # Reference value computed on the host with single-precision arrays
    x_host = array.array('f', [1.0, 2.0, 3.0, 4.0])
    y_host = array.array('f', [0.5, 1.5, 2.5, 3.5])
    expected = 0.0
    for x_val, y_val in zip(x_host, y_host):
        expected += x_val + y_val

    assert result == expected

    return
Example #11
0
def TestFloatArray():
    """
    Add two 4-element float vectors on the SPU and check the summed result.

    The element-wise sum of x and y is accumulated into the register
    code.fp_return over four add-then-rotate steps, and the value returned
    by execute(mode="fp") is compared with the same sum computed on the
    host.
    """
    from corepy.arch.spu.platform import InstructionStream, Processor
    import corepy.arch.spu.lib.dma as dma
    import corepy.arch.spu.platform as env

    program = env.Program()
    stream = program.get_stream()
    spu.set_active_code(stream)

    x = SingleFloat([1.0, 2.0, 3.0, 4.0])
    y = SingleFloat([0.5, 1.5, 2.5, 3.5])
    pair_sum = SingleFloat(0.0)

    pair_sum.v = spu.fa.ex(x, y)

    acc = SingleFloat([0.0, 0.0, 0.0, 0.0], reg=stream.fp_return)

    # Add, then rotate the sum vector by 4 bytes, four times
    for _ in range(4):
        acc.v = spu.fa.ex(pair_sum, acc)
        spu.rotqbyi(pair_sum, pair_sum, 4)

    program.add(stream)
    processor = env.Processor()
    result = processor.execute(program, mode="fp")

    # Reference value computed on the host with single-precision arrays
    x_host = array.array("f", [1.0, 2.0, 3.0, 4.0])
    y_host = array.array("f", [0.5, 1.5, 2.5, 3.5])
    expected = 0.0
    for x_val, y_val in zip(x_host, y_host):
        expected += x_val + y_val

    assert result == expected

    return
Example #12
0
  def synthesize(self, code):
    """
    Render a vector with 4 pixels.

    Transforms self.result in place, stores it using self.x_offset and
    self.lsa, and advances self.x_offset by 16.  The previously active
    instruction stream is restored on exit.

    Raises Exception when x_offset, result or one has not been set.
    """
    saved_code = spu.get_active_code()
    spu.set_active_code(code)

    if self.x_offset is None:
      raise Exception('Please call setup')
    if self.result is None:
      raise Exception('Please set result')
    if self.one is None:
      raise Exception('Please set one')

    px = self.result

    # Negate the result (0 - result), then subtract one, to transform
    # (-1,-oo) into (0,oo)
    px.v = spu.fs.ex(0, px)
    px.v = spu.fs.ex(px, self.one)

    # Convert to an unsigned int, scaling by 2^4 (173 - 169 == 4) so that
    # values between 0 and 16 land in the gradient; values outside [0,16]
    # are 0 or FF
    px.v = spu.cfltu.ex(px, 169)

    # Shuffle the first two bytes into the RGB positions and set alpha
    # to 0xFF
    px.v = spu.shufb.ex(px, self.ff, self.uint2rgba)

    # Store the pixels, then move the offset on by one quadword
    spu.stqd(px, self.x_offset, self.lsa >> 4)
    spu.ai(self.x_offset, self.x_offset, 16)

    spu.set_active_code(saved_code)
    return
Example #13
0
def TestAll():
  """
  Smoke-test the helper instructions by synthesizing each of them.

  Emits shr, cneq, cge, cgei, lt, lti, a_immediate and sf_immediate into a
  stream obtained from a new Program, adds the stream to the program,
  prints the generated code and executes it.  No results are checked.
  """
  import corepy.arch.spu.platform as env

  program = env.Program()
  stream = program.get_stream()
  spu.set_active_code(stream)

  lhs, rhs, dst = [stream.prgm.acquire_register() for _ in range(3)]

  # Register/register and register/immediate helpers
  shr(dst, lhs, rhs)
  cneq(dst, lhs, rhs)
  cge(dst, lhs, rhs)
  cgei(dst, lhs, 10)
  lt(dst, lhs, rhs)
  lti(dst, lhs, 10)

  # Immediate add/subtract helpers with a small and a large constant
  a_immediate(dst, lhs, 10)
  a_immediate(dst, lhs, 10000)
  sf_immediate(dst, lhs, 10000)

  program.add(stream)
  program.print_code()

  env.Processor().execute(program)
  return
Example #14
0
    def synthesize(self, code):
        """
        Emit a streaming population count.

        Walks the stream described by self.stream_addr / self.stream_size
        buffer by buffer, applies syn_popc_var.popc to each vector yielded
        for the buffer, reduces the accumulated count and writes it to the
        SPU outbound mailbox.  The previously active stream is restored on
        exit.
        """
        saved_code = spu.get_active_code()
        spu.set_active_code(code)

        chunks = spuiter.stream_buffer(
            code, self.stream_addr, self.stream_size * 4, self.buffer_size,
            self.lsa)
        local_words = spuiter.memory_desc('I', self.lsa, self.buffer_size / 4)
        counter = syn_popc_var()

        # Variables are created in this order: vector, running count, total
        vec = var.Word(0)
        running = var.Word(0)
        grand_total = var.Word(0)

        for chunk_addr in chunks:
            for vec in spuiter.spu_vec_iter(code, local_words,
                                            addr_reg=chunk_addr):
                counter.popc(running, vec)

        counter.reduce_word(grand_total, running)

        # Report the total through the outbound mailbox
        spu.wrch(grand_total, dma.SPU_WrOutMbox)

        spu.set_active_code(saved_code)
        return
Example #15
0
  def synthesize(self, code):
    """
    Emit code that fills one 256-pixel line with a constant color and then
    issues 128 put operations through a memory descriptor, adding the stride
    to the address register after each one.

    Raises Exception when buffers or stride has not been set.
    """
    saved_code = spu.get_active_code()
    spu.set_active_code(code)

    if self.buffers is None:
      raise Exception('Please set buffers')
    if self.stride is None:
      raise Exception('Please set stride')

    # Variables, created in a fixed order
    pixel      = var.SignedWord(0x0F0F0FFF)
    fb0        = var.Word(self.buffers[0])
    fb1        = var.Word(self.buffers[1])
    row_stride = var.Word(self.stride)
    cursor     = var.Word(0)

    # Fill one line, 16 bytes per store
    pixels_per_line = 256
    for offset in spuiter.syn_iter(code, pixels_per_line*4, step = 16):
      spu.stqx(pixel, cursor, offset)

    # Descriptor for the line; its address comes from the cursor register
    line_desc = spuiter.memory_desc('I', size = pixels_per_line)
    line_desc.set_addr_reg(cursor.reg)

    cursor.v = fb0

    # Transfer the line to the frame buffer, one row per iteration
    for _ in spuiter.syn_iter(code, 128):
      line_desc.put(code, 0)
      cursor.v = cursor + row_stride

    spu.set_active_code(saved_code)
    return
Example #16
0
  def setup(self, code):
    """
    Create a var.Word for every entry of `constants` and keep them in
    self.consts under the same keys.

    `code` is made the active stream while the words are created; the
    previously active stream is restored afterwards.
    """
    saved_code = spu.get_active_code()
    spu.set_active_code(code)

    self.consts = {}
    for name, value in constants.items():
      self.consts[name] = var.Word(value)

    spu.set_active_code(saved_code)
    return
Example #17
0
  def synthesize(self, code):
    """
    Emit a single write of self.result to the SPU outbound mailbox channel.

    Raises Exception when result has not been set.
    """
    saved_code = spu.get_active_code()
    spu.set_active_code(code)

    if self.result is None:
      raise Exception('Please set result')

    spu.wrch(self.result, dma.SPU_WrOutMbox)

    spu.set_active_code(saved_code)
    return
Example #18
0
    def setup(self, code):
        """
        Create a var.Word for every entry of `constants` and keep them in
        self.consts under the same keys.

        `code` is made the active stream while the words are created; the
        previously active stream is restored afterwards.
        """
        saved_code = spu.get_active_code()
        spu.set_active_code(code)

        self.consts = {}
        for name, value in constants.items():
            self.consts[name] = var.Word(value)

        spu.set_active_code(saved_code)
        return
Example #19
0
    def synthesize(self, code):
        """
        Emit a single write of self.result to the SPU outbound mailbox
        channel.

        Raises Exception when result has not been set.
        """
        saved_code = spu.get_active_code()
        spu.set_active_code(code)

        if self.result is None:
            raise Exception('Please set result')

        spu.wrch(self.result, dma.SPU_WrOutMbox)

        spu.set_active_code(saved_code)
        return
Example #20
0
def RunTest(test):
  """
  Run `test` against a fresh instruction stream, then print and execute it.

  test -- a callable taking no arguments; it is invoked after the new
          stream has been made the active code.
  """
  from corepy.arch.spu.platform import InstructionStream, Processor

  stream = InstructionStream()
  spu.set_active_code(stream)

  # The test emits its instructions into the active stream
  test()

  stream.print_code()
  Processor().execute(stream)
  return
Example #21
0
    def synthesize(self, code):
        """
        Emit the nested loops that evaluate self.ly_point over a grid.

        The outer loop runs self.h times and the inner loop self.w / 4
        times.  Each inner iteration synthesizes self.ly_point, advances r1
        by r1_inc and, when a renderer is configured, hands it the result
        register and synthesizes it.  After each row the renderer (if any)
        is told the row is complete and r2 is advanced by r2_inc.

        Instructions are emitted into `code`; the previously active stream
        is restored before returning.
        """
        old_code = spu.get_active_code()
        spu.set_active_code(code)

        self._load_parameters(code)

        log = spu_log.SPULog()
        log.setup(code)

        # The renderer is optional; when present it shares the log's 'ONE'
        # constant.
        if self.renderer is not None:
            self.renderer.setup(code)
            self.renderer.set_one(log.consts['ONE'])

        r1_inc = var.SingleFloat()
        r2_inc = var.SingleFloat()
        r1 = var.SingleFloat()
        r2 = var.SingleFloat()
        result = var.SingleFloat()
        pattern = var.Word(0)

        # Wire the point evaluator to the variables created above
        self.ly_point.set_pattern_reg(pattern)
        self.ly_point.set_result_reg(result)
        self.ly_point.set_r_regs(r1, r2)
        self.ly_point.set_log(log)
        self.ly_point.setup(code)

        # Load the start values, increments and pattern from fixed
        # local-store addresses.
        # NOTE(review): presumably written there by _load_parameters -- confirm.
        spu.lqa(r1, 0)
        spu.lqa(r2, 4)
        spu.lqa(r1_inc, 8)
        spu.lqa(r2_inc, 12)
        spu.lqa(pattern, 16)

        for y in spuiter.syn_iter(code, self.h):
            # Reload r1 from address 0 at the start of every row
            spu.lqa(r1, 0)

            for x in spuiter.syn_iter(code, self.w / 4):
                self.ly_point.synthesize(code)
                r1.v = spu.fa.ex(r1, r1_inc)

                if self.renderer is not None:
                    # result.v = spu.fm.ex(r1, r2)
                    self.renderer.set_result_reg(result)
                    self.renderer.synthesize(code)

            if self.renderer is not None:
                self.renderer.row_complete(code)
            # Advance r2 once per row
            r2.v = spu.fa.ex(r2, r2_inc)

        # return Numeric.where(Numeric.less(results, 0), results, 0)

        spu.set_active_code(old_code)
        return
Example #22
0
  def synthesize(self, code):
    """
    Emit the nested loops that evaluate self.ly_point over a grid.

    The outer loop runs self.h times and the inner loop self.w / 4 times.
    Each inner iteration synthesizes self.ly_point, advances r1 by r1_inc
    and, when a renderer is configured, hands it the result register and
    synthesizes it.  After each row the renderer (if any) is told the row
    is complete and r2 is advanced by r2_inc.

    Instructions are emitted into `code`; the previously active stream is
    restored before returning.
    """
    old_code = spu.get_active_code()
    spu.set_active_code(code)

    self._load_parameters(code)

    log = spu_log.SPULog()
    log.setup(code)

    # The renderer is optional; when present it shares the log's 'ONE'
    # constant.
    if self.renderer is not None:
      self.renderer.setup(code)
      self.renderer.set_one(log.consts['ONE'])

    r1_inc = var.SingleFloat()
    r2_inc = var.SingleFloat()
    r1 = var.SingleFloat()
    r2 = var.SingleFloat()
    result = var.SingleFloat()
    pattern = var.Word(0)

    # Wire the point evaluator to the variables created above
    self.ly_point.set_pattern_reg(pattern)
    self.ly_point.set_result_reg(result)
    self.ly_point.set_r_regs(r1, r2)
    self.ly_point.set_log(log)
    self.ly_point.setup(code)

    # Load the start values, increments and pattern from fixed local-store
    # addresses.
    # NOTE(review): presumably written there by _load_parameters -- confirm.
    spu.lqa(r1, 0)
    spu.lqa(r2, 4)    
    spu.lqa(r1_inc, 8)
    spu.lqa(r2_inc, 12)
    spu.lqa(pattern, 16)

    for y in spuiter.syn_iter(code, self.h):
      # Reload r1 from address 0 at the start of every row
      spu.lqa(r1, 0)

      for x in spuiter.syn_iter(code, self.w / 4):
        self.ly_point.synthesize(code)
        r1.v = spu.fa.ex(r1, r1_inc)

        if self.renderer is not None:
          # result.v = spu.fm.ex(r1, r2)
          self.renderer.set_result_reg(result)
          self.renderer.synthesize(code)
          
      if self.renderer is not None:
        self.renderer.row_complete(code)
      # Advance r2 once per row
      r2.v = spu.fa.ex(r2, r2_inc)
      
    # return Numeric.where(Numeric.less(results, 0), results, 0)
    
    spu.set_active_code(old_code)
    return 
Example #23
0
  def synthesize_constants(self, code):
    """
    Acquire a register for self._one and emit the instructions that
    initialize it: clear it, add 1, then convert with cuflt.

    The previously active stream is restored afterwards when there was one.
    """
    previous = spu.get_active_code()
    spu.set_active_code(code)

    one = code.acquire_register()
    self._one = one

    spu.xor(one, one, one)    # one = 0
    spu.ai(one, one, 1)       # one = 1 (integer)
    spu.cuflt(one, one, 155)  # integer -> float conversion

    if previous is not None:
      spu.set_active_code(previous)

    return
Example #24
0
def TestFloats():
    import math

    code = synspu.InstructionStream()
    proc = synspu.Processor()

    spu.set_active_code(code)

    code.set_debug(True)

    # Create a simple SPU program that computes log for all values bettween
    # .01 and 10.0 with .01 increments

    start = .65
    stop = .75
    inc = .01

    sp_step = 0x3C23D70A
    # r_current = var.Word(0x3C23D70A) # .01 in single precision
    r_current = var.Word(0x3F266666)
    r_step = var.Word(sp_step)  # .01 in single precision
    result = var.Word(0)
    log = SPULog()

    log.setup(code)
    log.set_result(result)
    log.set_x(r_current)

    log_iter = syn_iter(code, int((stop - start) / inc))

    for i in log_iter:

        log.synthesize(code)
        spu.fa(r_current, r_current, r_step)
        spu.wrch(result, dma.SPU_WrOutMbox)

    # code.print_code()
    spe_id = proc.execute(code, mode='async')

    x = start
    for i in range(int((stop - start) / inc)):
        while synspu.spu_exec.stat_out_mbox(spe_id) == 0:
            pass
        slog = synspu.spu_exec.read_out_mbox(spe_id)
        print '%.3f 0x%08X  %.08f %.08f ' % (x, slog, _sp_to_float(slog),
                                             math.log(x, 2))
        x += inc

    proc.join(spe_id)

    return
Example #25
0
def TestFloats():
  import math
  
  code = synspu.InstructionStream()
  proc = synspu.Processor()

  spu.set_active_code(code)

  code.set_debug(True)
  
  # Create a simple SPU program that computes log for all values bettween
  # .01 and 10.0 with .01 increments

  start = .65
  stop  = .75
  inc   = .01

  sp_step = 0x3C23D70A
  # r_current = var.Word(0x3C23D70A) # .01 in single precision
  r_current = var.Word(0x3F266666)
  r_step  = var.Word(sp_step)    # .01 in single precision
  result  = var.Word(0)
  log = SPULog()

  log.setup(code)
  log.set_result(result)
  log.set_x(r_current)
  
  log_iter = syn_iter(code, int((stop - start) / inc))

  for i in log_iter:
    
    log.synthesize(code)
    spu.fa(r_current, r_current, r_step)
    spu.wrch(result, dma.SPU_WrOutMbox)

  # code.print_code()
  spe_id = proc.execute(code, mode = 'async')

  x = start
  for i in range(int((stop - start) / inc)):
    while synspu.spu_exec.stat_out_mbox(spe_id) == 0: pass
    slog = synspu.spu_exec.read_out_mbox(spe_id)
    print '%.3f 0x%08X  %.08f %.08f ' % (x, slog, _sp_to_float(slog), math.log(x, 2))
    x += inc

  proc.join(spe_id)

  return
Example #26
0
def RunTest(test):
    """
    Run `test` against a stream of a new Program, then print and execute
    the program.

    test -- a callable taking no arguments; it is invoked after the new
            stream has been made the active code.
    """
    import corepy.arch.spu.platform as env

    program = env.Program()
    stream = program.get_stream()
    spu.set_active_code(stream)

    # The test emits its instructions into the active stream
    test()

    program.add(stream)
    program.print_code()
    env.Processor().execute(program)
    return
Example #27
0
def TestDebug():
    prgm = Program()
    code = prgm.get_stream()
    proc = DebugProcessor()

    spu.set_active_code(code)

    ra = code.acquire_register()
    rb = code.acquire_register()
    rc = code.acquire_register()
    rd = code.acquire_register()
    re = code.acquire_register()
    rf = code.acquire_register()
    rg = code.acquire_register()
    rh = code.acquire_register()

    spu.ai(ra, 0, 14)
    spu.ai(rb, 0, 13)
    spu.ai(rc, 0, 14)
    spu.brnz(14, 3)
    spu.ai(rd, 0, 15)
    spu.ai(re, 0, 16)
    spu.ai(rf, 0, 17)
    spu.ai(rg, 0, 18)
    spu.ai(rh, 0, 19)
    spu.nop(0)

    spu.stop(0x200A)

    prgm += code
    r = proc.execute(prgm)  # , debug = True)

    r = proc.nexti()
    r = proc.nexti()
    r = proc.nexti()
    r = proc.nexti()

    while r != None:
        r = proc.nexti()
        if r is not None:
            regs = proc.dump_regs()
            print '******', regs[122:]

    assert (r == None)
    print 'int result:', r
    # while True:
    #   pass
    return
Example #28
0
def TestDebug():
  prgm = Program()
  code = prgm.get_stream()
  proc = DebugProcessor()

  spu.set_active_code(code)

  ra = code.acquire_register()
  rb = code.acquire_register()
  rc = code.acquire_register()
  rd = code.acquire_register()
  re = code.acquire_register()
  rf = code.acquire_register()
  rg = code.acquire_register()
  rh = code.acquire_register()  

  spu.ai(ra, 0, 14)
  spu.ai(rb, 0, 13)
  spu.ai(rc, 0, 14)
  spu.brnz(14, 3)
  spu.ai(rd, 0, 15)
  spu.ai(re, 0, 16)
  spu.ai(rf, 0, 17)
  spu.ai(rg, 0, 18)
  spu.ai(rh, 0, 19)    
  spu.nop(0)

  spu.stop(0x200A)

  prgm += code
  r = proc.execute(prgm) # , debug = True)

  r = proc.nexti()
  r = proc.nexti()
  r = proc.nexti()
  r = proc.nexti()
    
  while r != None:
    r = proc.nexti()
    if r is not None:
      regs = proc.dump_regs()
      print '******', regs[122:]
    
  assert(r == None)
  print 'int result:', r
  # while True:
  #   pass
  return
Example #29
0
def RunTest(test):
    """
    Run `test` against a stream of a new Program, then print and execute
    the program.

    test -- a callable taking no arguments; it is invoked after the new
            stream has been made the active code.
    """
    import corepy.arch.spu.platform as env

    program = env.Program()
    stream = program.get_stream()
    spu.set_active_code(stream)

    # The test emits its instructions into the active stream
    test()

    program.add(stream)
    program.print_code()
    env.Processor().execute(program)
    return
Example #30
0
def TestSaveBuffer1():
    import array

    code = synspu.InstructionStream()
    proc = synspu.Processor()

    code.set_debug(True)
    spu.set_active_code(code)

    n = 2**14
    data = array.array('I', range(n))
    #data = synspu.aligned_memory(n, typecode = 'I')
    #data.copy_to(data_array.buffer_info()[0], len(data_array))

    save_buffer = SaveBuffer()

    save_buffer.setup()
    save_buffer.init_ls_buffer(0, 128)
    save_buffer.init_mm_buffer(data.buffer_info()[0], n)

    value = var.SignedWord(0xCAFEBABE)

    for i in spuiter.syn_iter(code, n / 4):
        save_buffer.save_register(value)

    code.print_code()
    spe_id = proc.execute(code, mode='async')

    for i in range(n / 4):
        while synspu.spu_exec.stat_out_mbox(spe_id) == 0:
            pass
        print 'size: 0x%X' % (synspu.spu_exec.read_out_mbox(spe_id))

        while synspu.spu_exec.stat_out_mbox(spe_id) == 0:
            pass
        print 'offset: 0x%X' % (synspu.spu_exec.read_out_mbox(spe_id))

        while synspu.spu_exec.stat_out_mbox(spe_id) == 0:
            pass
        print 'test: 0x%X' % (synspu.spu_exec.read_out_mbox(spe_id))

    proc.join(spe_id)

    #data.copy_from(data_array.buffer_info()[0], len(data_array))

    print data[:10]
    return
Example #31
0
  def synthesize(self):
    """
    Build the fixed instruction stream in self.code and record its layout.

    The stream consists of a sync, a one-instruction placeholder (whose
    index is kept in self.lbl_op), 128 stqa register stores and a stop.
    After caching the code, self.xfer_size and self.code_lsa are computed
    from the size of the cached code.  The active code is cleared (set to
    None) before returning.
    """
    # Okay.  This code is not going to exceed 256 instructions (1kb).  Knowing that,
    # the register contents can be safely placed at 0x3F400 in localstore, 3kb from
    # the top.  The SPRE will place the instruction stream as close to the top as
    # possible.  But since it is not going to be more than 1kb worth of instructions,
    # it will not overlap with the register contents.

    code = self.code
    spu.set_active_code(code)
    
    # Reload the instructions
    spu.sync(1)

    # Next instruction to execute
    lbl_op = code.size()
    spu.nop(0)    

    # Placeholders for register store instructions
    # NOTE(review): 0xFD00 * 4 == 0x3F400, which matches the address in the
    # comment above if stqa takes a word address -- confirm.
    for i in range(128):
       spu.stqa(i, 0xFD00 + (i * 4))
    #  spu.stqa(i, 0xFE00 + (i * 4))

    # Stop for next command
    spu.stop(0x0FFF) 

    # NOTE(review): lbl_regs is assigned but not used in this method.
    lbl_regs = code.size()
    
    # Create space for the saved registers
    #for i in range(128):
    #  # 16 bytes/register
    #  spu.nop(0)
    #  spu.lnop()
    #  spu.nop(0)
    #  spu.lnop()

    # Clearing active code here is important!
    spu.set_active_code(None)
    code.cache_code()

    # Size in bytes, 4 bytes per instruction.
    # NOTE(review): the length is taken from code._prologue._code -- confirm
    # that this is the complete cached stream.
    code_size = len(code._prologue._code) * 4
    # Pad up to a multiple of 16.
    # NOTE(review): when code_size is already a multiple of 16 this adds a
    # full extra 16 bytes -- confirm that is intended.
    self.xfer_size = code_size  + (16 - (code_size) % 16);
    print 'xfer_size:', self.xfer_size

    # Local-store address for the code: code_size below 0x3FFFF, with the
    # low 7 bits cleared by the mask.
    self.code_lsa = (0x3FFFF - code_size) & 0xFFF80;
    self.lbl_op = lbl_op
    return
Example #32
0
File: ispu.py Project: tmaone/efi
    def synthesize(self):
        """
        Build the fixed instruction stream in self.code and record its layout.

        The stream consists of a sync, a one-instruction placeholder (whose
        index is kept in self.lbl_op), 128 stqa register stores and a stop.
        After caching the code, self.xfer_size and self.code_lsa are
        computed from the size of the cached code.  The active code is
        cleared (set to None) before returning.
        """
        # Okay.  This code is not going to exceed 256 instructions (1kb).  Knowing that,
        # the register contents can be safely placed at 0x3F400 in localstore, 3kb from
        # the top.  The SPRE will place the instruction stream as close to the top as
        # possible.  But since it is not going to be more than 1kb worth of instructions,
        # it will not overlap with the register contents.

        code = self.code
        spu.set_active_code(code)

        # Reload the instructions
        spu.sync(1)

        # Next instruction to execute
        lbl_op = code.size()
        spu.nop(0)

        # Placeholders for register store instructions
        # NOTE(review): 0xFD00 * 4 == 0x3F400, which matches the address in
        # the comment above if stqa takes a word address -- confirm.
        for i in range(128):
            spu.stqa(i, 0xFD00 + (i * 4))
        #  spu.stqa(i, 0xFE00 + (i * 4))

        # Stop for next command
        spu.stop(0x0FFF)

        # NOTE(review): lbl_regs is assigned but not used in this method.
        lbl_regs = code.size()

        # Create space for the saved registers
        #for i in range(128):
        #  # 16 bytes/register
        #  spu.nop(0)
        #  spu.lnop()
        #  spu.nop(0)
        #  spu.lnop()

        # Clearing active code here is important!
        spu.set_active_code(None)
        code.cache_code()

        # Size in bytes, 4 bytes per instruction.
        # NOTE(review): the length is taken from code._prologue._code --
        # confirm that this is the complete cached stream.
        code_size = len(code._prologue._code) * 4
        # Pad up to a multiple of 16.
        # NOTE(review): when code_size is already a multiple of 16 this adds
        # a full extra 16 bytes -- confirm that is intended.
        self.xfer_size = code_size + (16 - (code_size) % 16)
        print 'xfer_size:', self.xfer_size

        # Local-store address for the code: code_size below 0x3FFFF, with
        # the low 7 bits cleared by the mask.
        self.code_lsa = (0x3FFFF - code_size) & 0xFFF80
        self.lbl_op = lbl_op
        return
Example #33
0
def TestLog():
    code = synspu.InstructionStream()
    proc = synspu.Processor()

    spu.set_active_code(code)
    # Create a simple SPU program that computes log for 10 values and
    # sends the result back using the mailbox

    log = SPULog()

    values = []
    result = code.acquire_register()

    N = 10

    x = 1
    for i in range(N):
        val = var.Word(x)
        spu.cuflt(val, val, 155)
        values.append(val)
        x = x * 10

    log.setup(code)
    log.set_result(result)

    for i in range(N):

        log.set_x(values[i])
        log.synthesize(code)

        spu.wrch(result, dma.SPU_WrOutMbox)

    spe_id = proc.execute(code, mode='async')

    x = 1
    for i in range(N):
        while synspu.spu_exec.stat_out_mbox(spe_id) == 0:
            pass
        print 'log said: 0x%08X  (%d)' % (
            synspu.spu_exec.read_out_mbox(spe_id), x)
        x = x * 10

    proc.join(spe_id)

    return
Example #34
0
def TestSaveBuffer1():
  import array

  code = synspu.InstructionStream()
  proc = synspu.Processor()

  code.set_debug(True)
  spu.set_active_code(code)
  
  n = 2**14
  data = array.array('I', range(n))
  #data = synspu.aligned_memory(n, typecode = 'I')
  #data.copy_to(data_array.buffer_info()[0], len(data_array))


  save_buffer = SaveBuffer()
  
  save_buffer.setup()
  save_buffer.init_ls_buffer(0, 128)
  save_buffer.init_mm_buffer(data.buffer_info()[0], n)

  value = var.SignedWord(0xCAFEBABE)
  
  for i in spuiter.syn_iter(code, n / 4):
    save_buffer.save_register(value)

  code.print_code()
  spe_id = proc.execute(code, mode='async')

  for i in range(n/4):
    while synspu.spu_exec.stat_out_mbox(spe_id) == 0: pass
    print 'size: 0x%X' % (synspu.spu_exec.read_out_mbox(spe_id))

    while synspu.spu_exec.stat_out_mbox(spe_id) == 0: pass
    print 'offset: 0x%X' % (synspu.spu_exec.read_out_mbox(spe_id))

    while synspu.spu_exec.stat_out_mbox(spe_id) == 0: pass
    print 'test: 0x%X' % (synspu.spu_exec.read_out_mbox(spe_id))

  proc.join(spe_id)

  #data.copy_from(data_array.buffer_info()[0], len(data_array))  

  print data[:10]
  return
Example #35
0
def TestFloatScalar():
  """
  Add 1.0 and 2.0 on the SPU and check the value returned in 'fp' mode.

  The sum is written to a SingleFloat bound to code.fp_return, and the
  result of execute(code, mode='fp') must equal 1.0 + 2.0.
  """
  from corepy.arch.spu.platform import InstructionStream, Processor
  import corepy.arch.spu.lib.dma as dma

  stream = InstructionStream()
  spu.set_active_code(stream)

  lhs = SingleFloat(1.0)
  rhs = SingleFloat(2.0)
  total = SingleFloat(0.0, reg = stream.fp_return)

  total.v = spu.fa.ex(lhs, rhs)

  processor = Processor()
  outcome = processor.execute(stream, mode='fp')
  assert outcome == 1.0 + 2.0

  return
Example #36
0
def SimpleSPU():
  """
  A very simple SPU that computes 11 + 31 and returns 0xA on success.
  """
  prgm = env.Program()
  code = prgm.get_stream()
  proc = env.Processor()

  spu.set_active_code(code)

  # Acquire two registers
  #x    = code.acquire_register()
  x = code.gp_return
  test = prgm.acquire_register(reg_name = 55)

  spu.xor(x, x, x) # zero x
  spu.ai(x, x, 11) # x = x + 11
  spu.ai(x, x, 31) # x = x + 31

  spu.ceqi(test, x, 42) # test = (x == 42)

  # If test is false (all 0s), skip the stop(0x100A) instruction
  spu.brz(test, 2)
  spu.stop(0x100A)
  spu.stop(0x100B)

  prgm.add(code) 
  prgm.print_code(hex = True) 
  r = proc.execute(prgm, mode = 'int', stop = True, debug = True) 
  assert(r[0] == 42)
  assert(r[1] == 0x100A)

  prgm = env.Program()
  code = prgm.get_stream()
  spu.set_active_code(code)

  util.load_float(code, code.fp_return, 3.14)

  prgm.add(code)
  prgm.print_code(hex = True)
  r = proc.execute(prgm, mode = 'fp')
  print r
  return
Example #37
0
def TestLog():
  code = synspu.InstructionStream()
  proc = synspu.Processor()

  spu.set_active_code(code)
  # Create a simple SPU program that computes log for 10 values and
  # sends the result back using the mailbox

  log = SPULog()
  
  values = []
  result = code.acquire_register()

  N = 10
  
  x = 1
  for i in range(N):
    val = var.Word(x)
    spu.cuflt(val, val, 155)
    values.append(val)
    x = x * 10
    
  log.setup(code)
  log.set_result(result)

  for i in range(N):

    log.set_x(values[i])
    log.synthesize(code)

    spu.wrch(result, dma.SPU_WrOutMbox)
    
  spe_id = proc.execute(code, mode = 'async')

  x = 1
  for i in range(N):
    while synspu.spu_exec.stat_out_mbox(spe_id) == 0: pass
    print 'log said: 0x%08X  (%d)' %(synspu.spu_exec.read_out_mbox(spe_id), x)
    x = x * 10

  proc.join(spe_id)

  return
Example #38
0
def SimpleSPU():
    """
  A very simple SPU that computes 11 + 31 and returns 0xA on success.
  """
    prgm = env.Program()
    code = prgm.get_stream()
    proc = env.Processor()

    spu.set_active_code(code)

    # Acquire two registers
    #x    = code.acquire_register()
    x = code.gp_return
    test = prgm.acquire_register(reg_name=55)

    spu.xor(x, x, x)  # zero x
    spu.ai(x, x, 11)  # x = x + 11
    spu.ai(x, x, 31)  # x = x + 31

    spu.ceqi(test, x, 42)  # test = (x == 42)

    # If test is false (all 0s), skip the stop(0x100A) instruction
    spu.brz(test, 2)
    spu.stop(0x100A)
    spu.stop(0x100B)

    prgm.add(code)
    prgm.print_code(hex=True)
    r = proc.execute(prgm, mode='int', stop=True, debug=True)
    assert (r[0] == 42)
    assert (r[1] == 0x100A)

    prgm = env.Program()
    code = prgm.get_stream()
    spu.set_active_code(code)

    util.load_float(code, code.fp_return, 3.14)

    prgm.add(code)
    prgm.print_code(hex=True)
    r = proc.execute(prgm, mode='fp')
    print r
    return
Example #39
0
    def synthesize(self, code):
        """
        Emit the comparison of the x and y register sets.

        For each of the self._n_bits / 128 register pairs, _ab_c accumulates
        into ab and c; both accumulators are then reduced with _reduce_word
        and passed to _compute_ratio.  Temporary registers are released
        before returning and the previously active stream is restored when
        there was one.

        Raises Exception when x_regs, y_regs or the result register has not
        been set.
        """
        if self._x_regs is None: raise Exception("Please set x_regs")
        if self._y_regs is None: raise Exception("Please set y_regs")
        if self._result is None: raise Exception("Please set result register")

        old_code = spu.get_active_code()
        spu.set_active_code(code)

        # Registers to release at the end of synthesis
        regs = []

        # Create the constant on demand.
        # NOTE(review): self._one is added to the release list but the
        # attribute is left set, so a later call would skip this branch and
        # reuse a released register -- confirm.
        if self._one is None:
            self.synthesize_constants(code)
            regs.append(self._one)

        ab = code.acquire_register()
        c = code.acquire_register()
        ab_temp = code.acquire_register()
        c_temp = code.acquire_register()
        result = code.acquire_register()
        # NOTE(review): `result` is acquired above but is not in this list,
        # so it is never released; self._result is checked at the top but
        # the ratio is written to this local register instead -- confirm.
        regs = regs + [ab, c, ab_temp, c_temp]

        # One iteration per 128 bits of input
        nregs = self._n_bits / 128

        for i in range(nregs):
            # self._ab(self._x_regs[i], self._y_regs[i], ab, ab_temp)
            # self._c( self._x_regs[i], self._y_regs[i],  c,  c_temp)
            self._ab_c(self._x_regs[i], self._y_regs[i], ab, c, ab_temp,
                       c_temp)

        self._reduce_word(ab, ab_temp)
        self._reduce_word(c, c_temp)

        self._compute_ratio(ab_temp, c_temp, result)

        # Report register usage; the trailing comma suppresses the newline
        print '%d registers,' % (len(regs) + len(self._x_regs) +
                                 len(self._y_regs)),
        code.release_registers(regs)
        if old_code is not None:
            spu.set_active_code(old_code)

        return
Example #40
0
  def synthesize(self, code):
    """
    Emit a population-count program for the vector held in register 5,
    using variables rather than explicitly acquired registers.

    The input is copied out of register 5, run through self.popc and
    self.reduce_word, and the reduced value is written to the SPU outbound
    mailbox.  The previously active stream is restored on exit.
    """
    saved_code = spu.get_active_code()
    spu.set_active_code(code)

    # Variables, all initialized to zero: running count, result, input
    ones = var.Word(0)
    reduced = var.Word(0)
    vec = var.Word(0)

    # 'Load' the input vector from register 5
    vec.v = spu.ai.ex(5, 0)

    # Inline the popc and reduce operations
    self.popc(ones, vec)
    self.reduce_word(reduced, ones)

    # Send the result to the caller
    spu.wrch(reduced, dma.SPU_WrOutMbox)

    spu.set_active_code(saved_code)
    return
Example #41
0
def TestFloatScalar():
    """
    Add 1.0 and 2.0 as SingleFloat values on the SPU and check that the
    processor, run in "fp" mode, returns their sum.
    """
    from corepy.arch.spu.platform import InstructionStream, Processor
    import corepy.arch.spu.lib.dma as dma
    import corepy.arch.spu.platform as env

    program = env.Program()
    stream = program.get_stream()
    spu.set_active_code(stream)

    lhs = SingleFloat(1.0)
    rhs = SingleFloat(2.0)
    # The sum is bound to the stream's floating point return register
    total = SingleFloat(0.0, reg=stream.fp_return)
    total.v = spu.fa.ex(lhs, rhs)

    program.add(stream)
    processor = env.Processor()
    outcome = processor.execute(program, mode="fp")
    assert outcome == (1.0 + 2.0)
Example #42
0
def bi_bug():
    """
    Exercise the `bi` (branch indirect) instruction.

    A word with the value 0x200D is stored at local store address 0x0 and
    execution then branches to the address held in a register (0x0).  The
    run is expected to return 0xD rather than reach the stop(0x200A) that
    follows the branch.
    """
    code = InstructionStream()
    proc = Processor()

    spu.set_active_code(code)

    # Two values: the word to store and the address to branch to.
    # NOTE(review): 0x200D presumably encodes a `stop 0x200D` instruction
    # -- confirm against the SPU ISA.
    stop_inst = SignedWord(0x200D)
    stop_addr = SignedWord(0x0)

    # Store the word at address 0x0, then branch to the address in stop_addr
    spu.stqa(stop_inst, 0x0)
    spu.bi(stop_addr)
    # Only reached if the branch does not transfer control
    spu.stop(0x200A)

    r = proc.execute(code)
    assert r == 0xD

    return
Example #43
0
    def synthesize(self, code):
        """
        Emit the popc/reduce sequence into `code`.

        The input comes from register 5; the reduced result is written to
        the dma.SPU_WrOutMbox channel.  The stream that was active on entry
        is made active again before returning.
        """
        previous = spu.get_active_code()
        spu.set_active_code(code)

        # Working variables, all starting at zero
        bit_counts = var.Word(0)
        total = var.Word(0)
        vec = var.Word(0)

        # The input vector is 'loaded' from register 5
        vec.v = spu.ai.ex(5, 0)

        # popc followed by the word reduction, both inlined
        self.popc(bit_counts, vec)
        self.reduce_word(total, bit_counts)

        # Hand the result to the caller
        spu.wrch(total, dma.SPU_WrOutMbox)

        spu.set_active_code(previous)
Example #44
0
File: bi.py Project: tmaone/efi
def bi_bug():
    """
    Exercise the `bi` (branch indirect) instruction.

    A word with the value 0x200D is stored at local store address 0x0 and
    execution branches to the address held in a register (0x0).  The run
    is expected to return 0xD rather than reach the trailing stop(0x200A).
    """
    stream = InstructionStream()
    processor = Processor()

    spu.set_active_code(stream)

    # The word written to address 0x0 and the address branched to
    stored_word = SignedWord(0x200D)
    branch_target = SignedWord(0x0)

    spu.stqa(stored_word, 0x0)
    spu.bi(branch_target)
    # Only reached if the branch does not transfer control
    spu.stop(0x200A)

    status = processor.execute(stream)
    assert status == 0xD
Example #45
0
  def synthesize(self, code):
    """
    Emit the Tanimoto kernel into `code`.

    Raises Exception when the x registers, y registers or result register
    have not been set.  Temporaries acquired here are released before
    returning, and the stream that was active on entry is restored when
    there was one.
    """
    if self._x_regs is None:  raise Exception("Please set x_regs")        
    if self._y_regs is None:  raise Exception("Please set y_regs")
    if self._result is None:  raise Exception("Please set result register")    

    old_code = spu.get_active_code()
    spu.set_active_code(code)    

    # Registers to hand back to `code` at the end
    regs = []

    # Synthesize the constants on demand; self._one is then released with
    # the other temporaries.
    if self._one is None:
      self.synthesize_constants(code)
      regs.append(self._one)      


    ab = code.acquire_register()
    c  = code.acquire_register()
    ab_temp = code.acquire_register()
    c_temp  = code.acquire_register()
    result  = code.acquire_register()
    # NOTE(review): `result` is not added to `regs`, so it is never
    # released here, and self._result is only checked for None -- confirm
    # whether that is intended.
    regs = regs + [ab, c, ab_temp, c_temp]

    # One x/y register pair per 128 bits (integer division under Python 2)
    nregs = self._n_bits / 128

    for i in range(nregs):
      # self._ab(self._x_regs[i], self._y_regs[i], ab, ab_temp)
      # self._c( self._x_regs[i], self._y_regs[i],  c,  c_temp)
      self._ab_c(self._x_regs[i], self._y_regs[i], ab, c, ab_temp, c_temp)
      
    self._reduce_word(ab, ab_temp)
    self._reduce_word( c,  c_temp)

    self._compute_ratio(ab_temp, c_temp, result)

    # Report the register count; the trailing comma keeps the cursor on the
    # same output line.
    print '%d registers,' % (len(regs) + len(self._x_regs) + len(self._y_regs)),
    code.release_registers(regs)
    if old_code is not None:
      spu.set_active_code(old_code)
      
    return
Example #46
0
def TestInt():
    """
    Run a handful of add-immediate instructions on registers 13 and 20,
    stop with 0x200D, and check the (value, stop code) pair returned by the
    processor.
    """
    stream = InstructionStream()
    processor = Processor()

    spu.set_active_code(stream)

    # Request the two registers by number
    r13 = stream.acquire_register(reg=13)
    r20 = stream.acquire_register(reg=20)

    spu.ai(r20, r20, 13)
    for _ in range(5):
        spu.ai(r13, r13, 13)

    spu.stop(0x200D)

    outcome = processor.execute(stream, stop=True)

    assert outcome[0] == 0
    assert outcome[1] == 0x200D
Example #47
0
def TestInt():
  """
  Emit a handful of add-immediate instructions on registers 13 and 20,
  stop with 0x200D, then print the generated code and the value returned
  by the processor.  Nothing is asserted.
  """
  code = InstructionStream()
  proc = Processor()

  spu.set_active_code(code)
  
  # Request the two registers by number
  r13 = code.acquire_register(reg = 13)
  r20 = code.acquire_register(reg = 20)
  # Add 13 to r20 once and to r13 five times; neither register is
  # initialized first.
  spu.ai(r20, r20, 13)
  spu.ai(r13, r13, 13)
  spu.ai(r13, r13, 13)
  spu.ai(r13, r13, 13)
  spu.ai(r13, r13, 13)
  spu.ai(r13, r13, 13)
  
  spu.stop(0x200D)

  code.print_code()
  r = proc.execute(code) # , debug = True)
  print 'int result:', r
  # while True:
  #   pass
  return
Example #48
0
def TestInt():
    code = InstructionStream()
    proc = Processor()

    spu.set_active_code(code)

    r13 = code.acquire_register(reg=13)
    r20 = code.acquire_register(reg=20)
    spu.ai(r20, r20, 13)
    spu.ai(r13, r13, 13)
    spu.ai(r13, r13, 13)
    spu.ai(r13, r13, 13)
    spu.ai(r13, r13, 13)
    spu.ai(r13, r13, 13)

    spu.stop(0x200D)

    code.print_code()
    r = proc.execute(code)  # , debug = True)
    print 'int result:', r
    # while True:
    #   pass
    return
Example #49
0
import corepy.lib.extarray as extarray
import corepy.arch.spu.isa as spu
import corepy.arch.spu.platform as env
import corepy.arch.spu.lib.dma as dma
from corepy.arch.spu.lib.util import load_word

import time

if __name__ == '__main__':
  # Value loaded into the counter register below
  ITERS = 500000
  #ITERS = 15

  prgm = env.Program()
  code = prgm.get_stream()
  proc = env.Processor()
  spu.set_active_code(code)
  # 131072 / 4 == 32768 entries of typecode 'I' (integer division under
  # Python 2).  NOTE(review): presumably sized to cover a 128 KB region
  # -- confirm against the code that uses psmap.
  psmap = extarray.extarray('I', 131072 / 4)
  data = extarray.extarray('I', range(0, 16))

  # The running sum is kept in the general purpose return register
  r_sum = prgm.gp_return
  r_cnt = prgm.acquire_register()

  # r_sum = 0, r_cnt = ITERS
  spu.xor(r_sum, r_sum, r_sum)
  load_word(code, r_cnt, ITERS)

  # Mark the top of the loop in the instruction stream
  lbl_loop = prgm.get_label("loop")
  code.add(lbl_loop)

  # NOTE(review): presumably emits a read of the SPU inbound mailbox and
  # returns the register holding the value -- confirm in corepy's dma module.
  reg = dma.spu_read_in_mbox(code)

  # r_sum += 1
  spu.ai(r_sum, r_sum, 1)
Example #50
0
import corepy.lib.extarray as extarray
import corepy.arch.spu.isa as spu
import corepy.arch.spu.platform as env
import corepy.arch.spu.lib.dma as dma
from corepy.arch.spu.lib.util import load_word

import time

if __name__ == '__main__':
    # Value loaded into the counter register below
    ITERS = 500000
    #ITERS = 15

    prgm = env.Program()
    code = prgm.get_stream()
    proc = env.Processor()
    spu.set_active_code(code)
    # 131072 / 4 == 32768 entries of typecode 'I' (integer division under
    # Python 2).  NOTE(review): presumably sized to cover a 128 KB region
    # -- confirm against the code that uses psmap.
    psmap = extarray.extarray('I', 131072 / 4)
    data = extarray.extarray('I', range(0, 16))

    # The running sum is kept in the general purpose return register
    r_sum = prgm.gp_return
    r_cnt = prgm.acquire_register()

    # r_sum = 0, r_cnt = ITERS
    spu.xor(r_sum, r_sum, r_sum)
    load_word(code, r_cnt, ITERS)

    # Mark the top of the loop in the instruction stream
    lbl_loop = prgm.get_label("loop")
    code.add(lbl_loop)

    # NOTE(review): presumably emits a read of the SPU inbound mailbox and
    # returns the register holding the value -- confirm in corepy's dma
    # module.
    reg = dma.spu_read_in_mbox(code)

    # r_sum += 1
    spu.ai(r_sum, r_sum, 1)
Example #51
0
def SpeedTest(n_spus = 6, n_floats = 6):
  """
  Get a rough estimate of the maximum flop count.
  On a PS3 using all 6 spus, this is 152 GFlops.
  """

  if n_spus > 1:  prgm = env.ParallelProgram()
  else:           prgm = env.Program()
  code = prgm.get_stream()

  spu.set_active_code(code)
  
  f_range = range(n_floats)
  a = [SingleFloat(0.0) for i in f_range]
  b = [SingleFloat(0.0) for i in f_range]
  c = [SingleFloat(0.0) for i in f_range]  
  t = [SingleFloat(0.0) for i in f_range]

  outer = 2**12
  inner = 2**16
  unroll = 128
  fuse = 2
  simd = 4
  for x in syn_iter(code, outer):
    for y in syn_iter(code, inner):
      for u in xrange(unroll):
        for i in f_range:
          t[i].v = spu.fma.ex(a[i], b[i], c[i])
    

  # Run the synthetic program and copy the results back to the array 
  # TODO - AWF - use the SPU decrementers to time this
  proc = env.Processor()
  prgm += code

  start = time.time()
  r = proc.execute(prgm, n_spus = n_spus)
  stop = time.time()
  total = stop - start
  n_ops = long(outer) * inner * long(unroll) * long(n_floats) * long(fuse) * long(simd) * long(n_spus)
  print '%.6f sec, %.2f GFlops' % (total, n_ops / total / 1e9)

#   # Run the native program and copy the results back to the array
#   outer = 2**14
#   inner = 2**16
#   unroll = 1
#   fuse = 1
#   simd = 1

#   proc = Processor()
#   # ncode = NativeInstructionStream("a.out")
#   start = time.time()
#   r = proc.execute(ncode, n_spus = n_spus)
#   stop = time.time()
#   total = stop - start
#   n_ops = long(outer) * inner * long(unroll) * long(n_floats) * long(fuse) * long(simd) * long(n_spus)
#   print '%.6f sec, %.2f GFlops' % (total, n_ops / total / 1e9)

  results = """
  --> No optimizations
  Executing native code: a.out
  14.805322 sec, 20.89 GFlops

  --> Synthetic
  Platform: linux.spre_linux_spu
  no raw data
  65.023350 sec, 152.19 GFlops

  --> -O3 (fuse: 2, simd: 4)
  Executing native code: a.out
  7.407939 sec, 41.74 GFlops

  --> -O3 (fuse: 1, simd: 1)
  Executing native code: a.out
  7.403702 sec, 5.22 GFlops
  """
  return
Example #52
0
def MemoryDescExample(data_size = 20000):
  """
  This example uses a memory descriptor to move `data_size` integers
  (20000 by default) back and forth between main memory and the SPU
  local store. Each value is incremented by 1 while on the SPU.
  
  Memory descriptors are a general purpose method for describing a
  region of memory.  Memory is described by a typecode, address, and
  size.  Memory descriptors can be initialized by hand or from an
  array or buffer object.

  For main memory, memory descriptors are useful for transferring data
  between main memory and an SPU's local store.  The get/put methods
  on a memory descriptor generate the SPU code to move data of any
  size between main memory and local store.

  Memory descriptors can also be used with spu_vec_iters to describe
  the region of memory to iterate over.  The typecode in the memory
  descriptor is used to determine the type for the loop induction
  variable.

  Note that there is currently no difference between memory
  descriptors for main memory and local store.  It's up to the user to
  make sure the memory descriptor settings make sense in the current
  context.  (This will probably change in the near future.)

  Note: get/put currently use loops rather than display lists for
        transferring data over 16k.
  """
  
  code = env.InstructionStream()
  proc = env.Processor()

  code.debug = True
  spu.set_active_code(code)

  # Create a python array holding 0 .. data_size - 1
  data = extarray.extarray('I', range(data_size))

  # Align the data in the array
  #a_data = aligned_memory(data_size, typecode = 'I')
  #a_data.copy_to(data.buffer_info()[0], data_size)
  
  # Create memory descriptor for the data in main memory
  data_desc = memory_desc('I')
  #data_desc.from_array(a_data)
  data_desc.from_array(data)

  # Transfer the data to 0x0 in the local store
  data_desc.get(code, 0)

  # Create memory descriptor for the data in the local store for use
  # in the iterator.
  # NOTE(review): this descriptor uses typecode 'i' while the main memory
  # descriptor uses 'I' -- confirm the difference is intended.
  lsa_data = memory_desc('i', 0, data_size)

  # Add one to each value
  for x in spu_vec_iter(code, lsa_data):
    x.v = x + 1

  # Transfer the data back to main memory
  data_desc.put(code, 0)

  # Write 0xCAFE to the outbound mailbox once the data has been put back
  dma.spu_write_out_mbox(code, 0xCAFE)
  
  # Execute the synthetic program
  # code.print_code()
  
  # Start the program asynchronously, then wait for it to finish
  spe_id = proc.execute(code, async=True)
  proc.join(spe_id)

  # Copy it back to the Python array
  #a_data.copy_from(data.buffer_info()[0], data_size)

  # Every element must have been incremented exactly once
  for i in xrange(data_size):
    assert(data[i] == i + 1)
  return
Example #53
0
def SimpleSPU():
  """
  A very simple SPU that computes 11 + 31 and returns 0xA on success.
  """
  prgm = env.Program()
  code = prgm.get_stream()
  proc = env.Processor()

  spu.set_active_code(code)
  

  # Acquire two registers
  #x    = code.acquire_register()
  x = prgm.gp_return
  test = prgm.acquire_register()

  lbl_brz = prgm.get_label("BRZ")
  lbl_skip = prgm.get_label("SKIP")

  spu.hbrr(lbl_brz, lbl_skip)
  spu.xor(x, x, x) # zero x
  spu.ai(x, x, 11) # x = x + 11
  spu.ai(x, x, 31) # x = x + 31

  spu.ceqi(test, x, 42) # test = (x == 42)

  # If test is false (all 0s), skip the stop(0x100A) instruction
  code.add(lbl_brz)
  spu.brz(test, lbl_skip)
  spu.stop(0x100A)
  code.add(lbl_skip)
  spu.stop(0x100B)

  prgm.add(code) 
  prgm.print_code() 
  r = proc.execute(prgm, mode = 'int', stop = True) 
  print "ret", r
  assert(r[0] == 42)
  assert(r[1] == 0x100A)


  prgm = env.Program()
  code = prgm.get_stream()
  spu.set_active_code(code)

  lbl_loop = prgm.get_label("LOOP")
  lbl_break = prgm.get_label("BREAK")

  r_cnt = prgm.acquire_register()
  r_stop = prgm.acquire_register()
  r_cmp = prgm.acquire_register()
  r_foo = prgm.gp_return

  spu.ori(r_foo, prgm.r_zero, 0)
  spu.ori(r_cnt, prgm.r_zero, 0)
  util.load_word(code, r_stop, 10)

  code.add(lbl_loop)

  spu.ceq(r_cmp, r_cnt, r_stop)
  spu.brnz(r_cmp, lbl_break)
  spu.ai(r_cnt, r_cnt, 1)

  spu.a(r_foo, r_foo, r_cnt)

  spu.br(lbl_loop)
  code.add(lbl_break)

  prgm.add(code)
  prgm.print_code()
  r = proc.execute(prgm, mode = 'int', stop = True)
  print "ret", r
  assert(r[0] == 55)

  return
Example #54
0
  def _transfer_data(self, code, kernel, lsa, tag):
    """
    Load the data into the SPU memory
    """

    # Check the types
    if not isinstance(code, spe.InstructionStream):
      raise Exception('Code must be an InstructionStream')
    if not (isinstance(lsa, int) or issubclass(type(lsa), (spe.Register, spe.Variable))):
      raise Exception('lsa must be an integer, Register, or Variable')
    
    old_code = spu.get_active_code()
    spu.set_active_code(code)

    # Acquire registers for address and size, if they were not supplied by the user
    if self.r_addr is None: r_ea_data = code.prgm.acquire_register()
    else:                   r_ea_data = self.r_addr
      
    if self.r_size is None: r_size = code.prgm.acquire_register()
    else:                   r_size = self.r_size

    # Create variables 
    ea_addr      = var.SignedWord(reg = r_ea_data)
    aligned_size = var.SignedWord(0)
    mod_16       = var.SignedWord(0xF)

    # Initialize the lsa_addr variable. 
    if isinstance(lsa, int):
      # From a constant
      ls_addr   = var.SignedWord(lsa)
    elif issubclass(type(lsa), (spe.Register, spe.Variable)):
      # From a variable
      ls_addr   = var.SignedWord()      
      ls_addr.v = lsa
      
      
    tag_var = var.SignedWord(tag)
    cmp = var.SignedWord(0)

    # Load the effective address
    if self.r_addr is None:
      if self.addr % 16 != 0:
        print '[get_memory] Misaligned data'

      util.load_word(code, ea_addr, self.addr)

    # Load the size, rounding up as required to be 16-byte aligned
    if self.r_size is None:
      rnd_size = self.size * var.INT_SIZES[self.typecode]
      if rnd_size < 16:
        rnd_size = 16
      elif (rnd_size % 16) != 0:
        rnd_size += (16 - (rnd_size % 16))
      util.load_word(code, aligned_size, rnd_size)
    else:
      # TODO: !!! UNIT TEST THIS !!!
      # Same as above, but using SPU arithemtic to round
      size  = var.SignedWord(reg = r_size)
      sixteen  = var.SignedWord(16)
      cmp.v = ((size & mod_16) == size)
      aligned_size.v = size + (sixteen - (size & mod_16))
      spu.selb(aligned_size.reg, size.reg, aligned_size.reg, cmp.reg, order = _mi(spu.selb))
      code.release_register(sixteen.reg)

    # Use an auxillary register for the moving ea value if the
    # caller supplied the address register
    if self.r_addr is not None:
      ea_load   = var.SignedWord(0)
      ea_load.v = ea_addr
    else:
      ea_load = ea_addr # note that this is reference, not .v assignment

    # Transfer parameters
    buffer_size   = var.SignedWord(16384)
    remaining     = var.SignedWord(0)
    transfer_size = var.SignedWord(0)
    remaining.v   = aligned_size

    # Set up the iterators to transfer at most 16k at a time
    xfer_iter = syn_iter(code, 0, 16384)
    xfer_iter.set_stop_reg(aligned_size.reg)

    for offset in xfer_iter:
      cmp.v = buffer_size > remaining
      spu.selb(transfer_size, buffer_size, remaining, cmp)

      # Transfer the data
      kernel(code, ls_addr, ea_load, transfer_size, tag_var)
      ls_addr.v = ls_addr + buffer_size
      ea_load.v = ea_load + buffer_size

      remaining.v = remaining - buffer_size

    # Set the tag bit to tag
    dma.mfc_write_tag_mask(code, 1<<tag);

    # Wait for the transfer to complete
    dma.mfc_read_tag_status_all(code);

    # Release the registers
    code.release_register(buffer_size.reg)
    code.release_register(remaining.reg)
    code.release_register(aligned_size.reg)    
    code.release_register(transfer_size.reg)
    code.release_register(cmp.reg)
    code.release_register(ls_addr.reg)
    code.release_register(tag_var.reg)
    code.release_register(ea_load.reg)

    if old_code is not None:
      spu.set_active_code(old_code)
    return 
Example #55
0
  def synthesize(self, code):
    """
    Emit the blocked Tanimoto comparison loops into `code`.

    An outer loop of self._m iterations loads an x bit vector and an
    inner loop of self._n iterations loads a y bit vector and inlines the
    Tanimoto kernel for the pair.  When a save op has been set, each
    result is compared against the threshold (0.0 when none was set) and
    passed to the save op's test method.

    Raises Exception when x_addr, y_addr, n_bits, m or n has not been
    set.  The checks run before the active instruction stream is
    switched, so a configuration error leaves the active stream untouched.
    """
    # Sanity checks
    if self._x_addr is None: raise Exception("Please set x_addr")
    if self._y_addr is None: raise Exception("Please set y_addr")
    if self._n_bits is None: raise Exception("Please set n_bits")
    if self._m is None: raise Exception("Please set m")
    if self._n is None: raise Exception("Please set n")

    old_code = spu.get_active_code()
    spu.set_active_code(code)

    # Acquire registers for the bit vectors and result
    # NOTE(review): none of the registers and variables acquired in this
    # method are released here -- confirm whether the caller owns them.
    n_vecs = self._n_bits / 128
    x_regs = [code.acquire_register() for i in range(n_vecs)]
    y_regs = [code.acquire_register() for i in range(n_vecs)]
    result = code.acquire_register()

    x_addr = var.Word()
    y_addr = var.Word()

    if self._save_op is not None:
      if self._threshold is not None:
        threshold = var.SingleFloat(self._threshold)
      else:
        threshold = var.SingleFloat(0.0)
      bcmp = var.Word(0)
    
    # Setup the Tanimoto kernel
    tan = Tanimoto()

    tan.set_n_bits(self._n_bits)
    tan.set_x_regs(x_regs)
    tan.set_y_regs(y_regs)
    tan.set_result(result)

    tan.synthesize_constants(code)

    # Setup the save op
    save_op = self._save_op
    if save_op is not None:
      save_op.setup()
      
    # Create the iterators
    xiter = spuiter.syn_iter(code, self._m)
    yiter = spuiter.syn_iter(code, self._n)

    # Synthesize the block comparison loops
    x_addr.v = self._x_addr

    for x_off in xiter:
      # NOTE(review): the address is advanced before the load, so the
      # first vector read is one stride past self._x_addr (likewise for
      # y below) -- confirm against _load_bit_vector.
      x_addr.v = x_addr + 16 * n_vecs
      y_addr.v = self._y_addr

      self._load_bit_vector(x_addr, x_regs)

      for y_off in yiter:
        y_addr.v = y_addr + 16 * n_vecs

        self._load_bit_vector(y_addr, y_regs)
        tan.synthesize(code)

        if save_op is not None:
          spu.fcgt(bcmp, result, threshold)
          save_op.test(bcmp, result, x_off, y_off)

    # /x_off

    if old_code is not None:
      spu.set_active_code(old_code)
    
    return
Example #56
0
def SimpleSPU():
    """
    Two small SPU programs.

    The first adds 11 and 31 and stops with 0x100A when the sum equals 42
    (0x100B otherwise).  The second adds the integers 1 through 10 in a
    loop and is expected to return 55.
    """
    code = InstructionStream()
    proc = Processor()

    spu.set_active_code(code)

    # Acquire two registers
    #x    = code.acquire_register()
    x = code.gp_return
    test = code.acquire_register()

    lbl_brz = code.get_label("BRZ")
    lbl_skip = code.get_label("SKIP")

    spu.hbrr(lbl_brz, lbl_skip)
    spu.xor(x, x, x)  # zero x
    spu.ai(x, x, 11)  # x = x + 11
    spu.ai(x, x, 31)  # x = x + 31

    spu.ceqi(test, x, 42)  # test = (x == 42)

    # If test is false (all 0s), skip the stop(0x100A) instruction
    code.add(lbl_brz)
    spu.brz(test, lbl_skip)
    spu.stop(0x100A)
    code.add(lbl_skip)
    spu.stop(0x100B)

    code.print_code(hex=True, pro=True, epi=True)
    r = proc.execute(code, mode='int', stop=True)
    print "ret", r
    # r[0] is checked against the computed sum, r[1] against the stop code
    assert (r[0] == 42)
    assert (r[1] == 0x100A)

    # Second program: a counted loop on a fresh instruction stream
    code = InstructionStream()
    spu.set_active_code(code)

    lbl_loop = code.get_label("LOOP")
    lbl_break = code.get_label("BREAK")

    r_cnt = code.acquire_register()
    r_stop = code.acquire_register()
    r_cmp = code.acquire_register()
    r_foo = code.gp_return

    # r_foo = 0, r_cnt = 0, r_stop = 10
    spu.ori(r_foo, code.r_zero, 0)
    spu.ori(r_cnt, code.r_zero, 0)
    util.load_word(code, r_stop, 10)

    code.add(lbl_loop)

    # Leave the loop once r_cnt equals r_stop; otherwise bump the counter
    spu.ceq(r_cmp, r_cnt, r_stop)
    spu.brnz(r_cmp, lbl_break)
    spu.ai(r_cnt, r_cnt, 1)

    # r_foo = r_foo + r_cnt
    spu.a(r_foo, r_foo, r_cnt)

    spu.br(lbl_loop)
    code.add(lbl_break)

    code.print_code()
    r = proc.execute(code, mode='int', stop=True)
    print "ret", r
    # 1 + 2 + ... + 10
    assert (r[0] == 55)

    return
Example #57
0
def TestTanimotoBlock(n_vecs = 4):
  """
  Synthesize a TanimotoBlock comparison of an m x n block inside a loop
  of n_samples iterations, run it on one SPU and print timing and
  throughput figures.

  n_vecs -- number of 128-bit vectors per bit vector (n_bits = 128 * n_vecs)
  """
  code = synspu.InstructionStream()
  proc = synspu.Processor()

  code.set_debug(True)
  spu.set_active_code(code)
  
  tb = TanimotoBlock()
  ls_save = LocalSave()
  mm_save = MemorySave()

  # NOTE(review): set_debug(True) was already called above; this second
  # call looks redundant -- confirm.
  code.set_debug(True)

  # Input block parameters
  m = 128
  n = 64
  # n_vecs = 9
  n_bits = 128 * n_vecs

  # Main memory results buffer
  # max_results = 2**16
  max_results = 16384
  words_per_result = 4

  # Every word of the buffer starts out as 12
  mm_results_data = array.array('I', [12 for i in range(max_results * words_per_result)])
  #mm_results_buffer = synspu.aligned_memory(max_results * words_per_result, typecode = 'I')
  # mm_results_buffer.copy_to(mm_results_data.buffer_info()[0], len(mm_results_data))

  mm_results = spuiter.memory_desc('I')
  #mm_results.from_array(mm_results_buffer)
  mm_results.from_array(mm_results_data)

  mm_save.set_md_save_buffer(mm_results)
    
  # Local Results buffer
  buffer_size = var.SignedWord(16384)
  buffer_addr = var.SignedWord(m * n * n_vecs * 4)
  ls_results = spuiter.memory_desc('B')
  ls_results.set_size_reg(buffer_size)
  ls_results.set_addr_reg(buffer_addr)

  # The local save op writes into ls_results and is chained to the main
  # memory save op
  ls_save.set_md_results(ls_results)
  ls_save.set_mm_save_op(mm_save)

  # Setup the TanimotoBlock class
  tb.set_n_bits(n_bits)
  tb.set_block_size(m, n)

  # x vectors start at 0; y vectors follow the m x vectors
  tb.set_x_addr(0)
  tb.set_y_addr(m * n_vecs * 16)
  tb.set_save_op(ls_save)

  # Main test loop
  n_samples = 10000
  for samples in spuiter.syn_iter(code, n_samples):
    tb.synthesize(code)

  # The mailbox write is what the host polls for below
  spu.wrch(buffer_size, dma.SPU_WrOutMbox)
  
  spu.stop(0x2000) 

  # "Function" Calls
  ls_save.block()
  mm_save.block()

  # code.print_code()
  start = time.time()
  spe_id = proc.execute(code, async=True)
  
  # Busy-wait until the SPU has written to its outbound mailbox
  while synspu.spu_exec.stat_out_mbox(spe_id) == 0: pass
  # print 'tb said: 0x%X' % (synspu.spu_exec.read_out_mbox(spe_id))
  stop = time.time()

  # mm_results_buffer.copy_from(mm_results_data.buffer_info()[0], len(mm_results_data))
  
  proc.join(spe_id)
  total = stop - start
  # All rates are scaled by 1e9 (giga units per second)
  bits_sec = (m * n * n_bits * n_samples) / total / 1e9
  ops_per_compare = 48 * 4 + 8  # 48 SIMD instructions, 8 scalar
  insts_per_compare = 56
  gops = (m * n * n_vecs * n_samples * ops_per_compare ) / total / 1e9
  ginsts = (m * n * n_vecs * n_samples * insts_per_compare ) / total / 1e9  
  print '%.6f sec, %.2f Gbits/sec, %.2f GOps, %.2f GInsts, %d insts' % (
    total, bits_sec, gops, ginsts, code.size())
  return