Example #1
0
  def k_loop_prefetch_simple(self, code):
    """
    Emit the k loop of the block kernel, unrolled by two, using two operand
    register sets (a/b and a_pre/b_pre) alternately.

    One operand set is loaded before the loop.  Each pass of the
    synthetic loop then performs two k steps:

      1. load the next operands into a_pre/b_pre, accumulate a*b into c
      2. load the next operands into a/b, accumulate a_pre*b_pre into c

    so the loads for one step are always issued ahead of the
    multiply-adds that consume them.

    code -- the instruction stream handed to syn_iter.

    Reads self.block (kc, mr, nr), self.a/b/c, self.a_pre/b_pre,
    self.p_tA/p_tB and self.A_strides/B_strides.  Returns None.

    Fix relative to the earlier version: the second step reloaded
    a_pre/b_pre instead of a/b, which discarded every other set of
    operands before it was used and left a/b stale after the first pass.

    NOTE(review): the loop count is kc / 2, so kc is assumed to be an
    even integer (under Python 3 '/' yields a float) -- confirm against
    the callers before changing it.
    NOTE(review): the last pass still loads one operand set that is never
    consumed, i.e. it reads one k step past the kc steps accumulated.
    """
    kc, mr, nr = self.block.kc, self.block.mr, self.block.nr
    a, b, c = self.a, self.b, self.c
    a_pre, b_pre = self.a_pre, self.b_pre
    p_tA, p_tB = self.p_tA, self.p_tB

    def load_operands(a_regs, b_regs):
      # Load mr values of tA and nr values of tB for the current k step,
      # then advance both pointers to the next k step.
      for ai in range(mr):
        a_regs[ai].load(p_tA, ai * self.A_strides.row)

      for bj in range(nr):
        b_regs[bj].load(p_tB, bj * self.B_strides.col)

      p_tA.v = p_tA + self.A_strides.col
      p_tB.v = p_tB + self.B_strides.row

    def accumulate(a_regs, b_regs):
      # c[i][j] += a_regs[i] * b_regs[j] for the whole mr x nr tile
      for ci in range(mr):
        for cj in range(nr):
          c[ci][cj].v = ppcvar.fmadd(a_regs[ci], b_regs[cj], c[ci][cj])

    # Prime the pipeline: operands for the first k step
    load_operands(a, b)

    # Inner loop over k, two k steps per pass
    for _ in syn_iter(code, kc / 2, mode = CTR):
      # Step 1 -- load [a,b]_pre, compute with [a,b]
      load_operands(a_pre, b_pre)
      accumulate(a, b)

      # Step 2 -- load [a,b], compute with [a,b]_pre
      load_operands(a, b)
      accumulate(a_pre, b_pre)
    # /end for k

    return
Example #2
0
File: gemm.py Project: tmaone/efi
  def k_loop_prefetch_simple(self, code):
    """
    Emit the k loop of the block kernel, unrolled by two, using two operand
    register sets (a/b and a_pre/b_pre) alternately.

    One operand set is loaded before the loop.  Each pass of the
    synthetic loop then performs two k steps:

      1. load the next operands into a_pre/b_pre, accumulate a*b into c
      2. load the next operands into a/b, accumulate a_pre*b_pre into c

    so the loads for one step are always issued ahead of the
    multiply-adds that consume them.

    code -- the instruction stream handed to syn_iter.

    Reads self.block (kc, mr, nr), self.a/b/c, self.a_pre/b_pre,
    self.p_tA/p_tB and self.A_strides/B_strides.  Returns None.

    Fix relative to the earlier version: the second step reloaded
    a_pre/b_pre instead of a/b, which discarded every other set of
    operands before it was used and left a/b stale after the first pass.

    NOTE(review): the loop count is kc / 2, so kc is assumed to be an
    even integer (under Python 3 '/' yields a float) -- confirm against
    the callers before changing it.
    NOTE(review): the last pass still loads one operand set that is never
    consumed, i.e. it reads one k step past the kc steps accumulated.
    """
    kc, mr, nr = self.block.kc, self.block.mr, self.block.nr
    a, b, c = self.a, self.b, self.c
    a_pre, b_pre = self.a_pre, self.b_pre
    p_tA, p_tB = self.p_tA, self.p_tB

    def load_operands(a_regs, b_regs):
      # Load mr values of tA and nr values of tB for the current k step,
      # then advance both pointers to the next k step.
      for ai in range(mr):
        a_regs[ai].load(p_tA, ai * self.A_strides.row)

      for bj in range(nr):
        b_regs[bj].load(p_tB, bj * self.B_strides.col)

      p_tA.v = p_tA + self.A_strides.col
      p_tB.v = p_tB + self.B_strides.row

    def accumulate(a_regs, b_regs):
      # c[i][j] += a_regs[i] * b_regs[j] for the whole mr x nr tile
      for ci in range(mr):
        for cj in range(nr):
          c[ci][cj].v = ppcvar.fmadd(a_regs[ci], b_regs[cj], c[ci][cj])

    # Prime the pipeline: operands for the first k step
    load_operands(a, b)

    # Inner loop over k, two k steps per pass
    for _ in syn_iter(code, kc / 2, mode = CTR):
      # Step 1 -- load [a,b]_pre, compute with [a,b]
      load_operands(a_pre, b_pre)
      accumulate(a, b)

      # Step 2 -- load [a,b], compute with [a,b]_pre
      load_operands(a, b)
      accumulate(a_pre, b_pre)
    # /end for k

    return
Example #3
0
    def k_loop_simple(self, code):
        """
        Emit the straightforward (non-prefetching) k loop of the block kernel.

        Every pass of the synthetic loop loads mr values through p_tA and
        nr values through p_tB, accumulates all mr x nr products into c
        with fmadd, and finally advances both pointers by one k step.

        code -- the instruction stream handed to syn_iter.

        Reads self.block (kc, mr, nr), self.a/b/c, self.p_tA/p_tB and
        self.A_strides/B_strides.  Returns None.
        """
        depth, rows, cols = self.block.kc, self.block.mr, self.block.nr
        a_regs, b_regs, acc = self.a, self.b, self.c
        ptr_a, ptr_b = self.p_tA, self.p_tB
        a_strides, b_strides = self.A_strides, self.B_strides

        # Synthetic loop over k; the Python loops inside are the
        # "generating loops" that run once while the body is built.
        for _ in syn_iter(code, depth, mode=CTR):
            # Operand loads for this k step
            for i in range(rows):
                a_regs[i].load(ptr_a, i * a_strides.row)

            for j in range(cols):
                b_regs[j].load(ptr_b, j * b_strides.col)

            # c[i][j] += a[i] * b[j] over the whole tile
            for i in range(rows):
                for j in range(cols):
                    cell = acc[i][j]
                    cell.v = ppcvar.fmadd(a_regs[i], b_regs[j], cell)

            # Step both pointers to the next k position
            ptr_a.v = ptr_a + a_strides.col
            ptr_b.v = ptr_b + b_strides.row
        # /end for k

        return
Example #4
0
 def reg_loop_simple(self, a, b, c, mr, nr, p_tA, p_tB, A_row_stride, B_col_stride):
   """
   Emit one k step of the register kernel: load the operands, then
   accumulate every product into the c tile.

   a, b         -- operand registers; a[i].load / b[j].load are called
   c            -- mr x nr accumulators, updated through their .v attribute
   mr, nr       -- tile dimensions
   p_tA, p_tB   -- base pointers passed to the loads (not advanced here)
   A_row_stride -- offset between consecutive a operands
   B_col_stride -- offset between consecutive b operands

   Returns None.
   """
   # Operand loads -- generating loops
   for row in range(mr):
     a_offset = row * A_row_stride
     a[row].load(p_tA, a_offset)

   for col in range(nr):
     b_offset = col * B_col_stride
     b[col].load(p_tB, b_offset)

   # c[row][col] += a[row] * b[col] -- generating loop
   for row in range(mr):
     for col in range(nr):
       cell = c[row][col]
       cell.v = ppcvar.fmadd(a[row], b[col], cell)

   return
Example #5
0
File: gemm.py Project: tmaone/efi
 def reg_loop_simple(self, a, b, c, mr, nr, p_tA, p_tB, A_row_stride, B_col_stride):
   """
   Emit one k step of the register kernel: load the operands, then
   accumulate every product into the c tile.

   a, b         -- operand registers; a[i].load / b[j].load are called
   c            -- mr x nr accumulators, updated through their .v attribute
   mr, nr       -- tile dimensions
   p_tA, p_tB   -- base pointers passed to the loads (not advanced here)
   A_row_stride -- offset between consecutive a operands
   B_col_stride -- offset between consecutive b operands

   Returns None.
   """
   # Operand loads -- generating loops
   for row in range(mr):
     a_offset = row * A_row_stride
     a[row].load(p_tA, a_offset)

   for col in range(nr):
     b_offset = col * B_col_stride
     b[col].load(p_tB, b_offset)

   # c[row][col] += a[row] * b[col] -- generating loop
   for row in range(mr):
     for col in range(nr):
       cell = c[row][col]
       cell.v = ppcvar.fmadd(a[row], b[col], cell)

   return
Example #6
0
def TestExternalStop():
  """
  Exercise syn_iter with externally supplied stop values.

  Builds a program that walks a 5x5 array of doubles holding 0..24 with
  two nested synthetic loops (an INC-mode outer loop and a CTR-mode
  inner loop whose stops are set through set_external_stop) and
  accumulates the sum of squares.  The result must be 4900.0, which is
  0**2 + 1**2 + ... + 24**2.
  """
  program = synppc.Program()
  stream = program.get_stream()
  program.add(stream)
  ppc.set_active_code(stream)

  # Matrix contents: 0.0 .. 24.0
  matrix = array.array('d', range(5*5))

  # Read-only values
  row_count = vars.SignedWord(5)
  col_count = vars.SignedWord(5)
  base_addr = vars.SignedWord(matrix.buffer_info()[0])
  # presumably the byte size of one double (two words) -- verify
  elem_bytes = vars.SignedWord(synppc.WORD_SIZE * 2)
  # five such elements per row
  stride_bytes = vars.SignedWord(synppc.WORD_SIZE * 5 * 2)

  # Read/write values
  acc = vars.DoubleFloat(0.0)
  elem = vars.DoubleFloat(0.0)

  byte_offset = vars.SignedWord(0)

  # Loops whose stop values come from registers rather than constants
  row_iter = syn_iter(stream, 0, mode = INC)
  row_iter.set_external_stop(row_count.reg)

  col_ctr = syn_iter(stream, 0, mode = CTR)
  col_ctr.set_external_stop(col_count.reg)

  for row in row_iter:
    byte_offset.v = vars.SignedWord.cast(row) * stride_bytes

    # The inner loop variable lives in the ctr register and cannot be read
    for _ in col_ctr:
      # Fetch the next matrix element and fold its square into acc
      ppc.lfdx(elem, base_addr, byte_offset)
      acc.v = vars.fmadd(elem, elem, acc) # acc += elem*elem
      byte_offset.v = byte_offset + elem_bytes

  util.return_var(acc)

  cpu = synppc.Processor()
  result = cpu.execute(program, mode = 'fp')
  assert result == 4900.0

  return
Example #7
0
  def k_loop_simple(self, code):
    """
    Emit the straightforward (non-prefetching) k loop of the block kernel.

    Every pass of the synthetic loop loads mr values through p_tA and
    nr values through p_tB, accumulates all mr x nr products into c
    with fmadd, and finally advances both pointers by one k step.

    code -- the instruction stream handed to syn_iter.

    Reads self.block (kc, mr, nr), self.a/b/c, self.p_tA/p_tB and
    self.A_strides/B_strides.  Returns None.
    """
    depth, rows, cols = self.block.kc, self.block.mr, self.block.nr
    a_regs, b_regs, acc = self.a, self.b, self.c
    ptr_a, ptr_b = self.p_tA, self.p_tB
    a_strides, b_strides = self.A_strides, self.B_strides

    # Synthetic loop over k; the Python loops inside are the
    # "generating loops" that run once while the body is built.
    for _ in syn_iter(code, depth, mode = CTR):
      # Operand loads for this k step
      for i in range(rows):
        a_regs[i].load(ptr_a, i * a_strides.row)

      for j in range(cols):
        b_regs[j].load(ptr_b, j * b_strides.col)

      # c[i][j] += a[i] * b[j] over the whole tile
      for i in range(rows):
        for j in range(cols):
          cell = acc[i][j]
          cell.v = ppcvar.fmadd(a_regs[i], b_regs[j], cell)

      # Step both pointers to the next k position
      ptr_a.v = ptr_a + a_strides.col
      ptr_b.v = ptr_b + b_strides.row
    # /end for k

    return
Example #8
0
  def k_loop_prefetch(self, code):
    """
    Emit the k loop of the block kernel for a fixed 4x4 tile, unrolled by
    two, alternating between two operand register sets.

    a[0..3]/b[0..3] are loaded once before the loop.  Each pass of the
    synthetic loop then
      1. loads a_pre/b_pre for the next k step and accumulates the 16
         products a[i]*b[j] into c[i][j];
      2. loads a/b for the following k step and accumulates the 16
         products a_pre[i]*b_pre[j] into c[i][j].

    code -- the instruction stream handed to syn_iter.

    Reads self.A_strides, self.B_strides, self.block, self.a/b/c,
    self.a_pre/b_pre and self.p_tA/p_tB.  Returns None.

    The tile size is hard-coded: indices 0..3 are used regardless of
    self.block.mr / self.block.nr, which are unpacked but never read.
    The statement order is hand-arranged (loads and multiply-adds are
    deliberately not in index order) -- presumably for instruction
    scheduling; do not reorder without measuring.

    NOTE(review): the loop count is kc / 2, so kc is assumed to be an even
    integer -- confirm.  The last pass loads one a/b set that is never
    consumed.
    """
    A_row_stride = self.A_strides.row
    A_col_stride = self.A_strides.col

    B_row_stride = self.B_strides.row
    B_col_stride = self.B_strides.col

    kc, mr, nr = self.block.kc, self.block.mr, self.block.nr
    a, b, c = self.a, self.b, self.c
    a_pre, b_pre = self.a_pre, self.b_pre
    p_tA, p_tB = self.p_tA, self.p_tB

    # Initial load of a[0..3] and b[0..3] for the first k step
    a[0].load(p_tA, 0 * A_row_stride)
    b[0].load(p_tB, 0 * B_col_stride)
    b[1].load(p_tB, 1 * B_col_stride)                    
    a[1].load(p_tA, 1 * A_row_stride)
    
    b[2].load(p_tB, 2 * B_col_stride)          
    b[3].load(p_tB, 3 * B_col_stride)
    
    a[2].load(p_tA, 2 * A_row_stride)
    a[3].load(p_tA, 3 * A_row_stride)
    
    # Advance p_tA, p_tB to the next k step
    p_tA.v = p_tA + A_col_stride
    p_tB.v = p_tB + B_row_stride
    
    # Two k steps per pass of the synthetic loop
    for k in syn_iter(code, kc / 2 , mode = CTR): # syn_range(code, 0, kc * 8, 8):
      # self.reg_loop_simple(a, b, c, mr, nr, p_tA, p_tB, A_row_stride, B_row_stride)
      
      # Step 1: load the next operands into a_pre/b_pre ...
      a_pre[0].load(p_tA, 0 * A_row_stride)
      b_pre[1].load(p_tB, 1 * B_col_stride)
      b_pre[0].load(p_tB, 0 * B_col_stride)          
      a_pre[1].load(p_tA, 1 * A_row_stride)
      
      b_pre[2].load(p_tB, 2 * B_col_stride)
      a_pre[2].load(p_tA, 2 * A_row_stride)

      b_pre[3].load(p_tB, 3 * B_col_stride)          
      a_pre[3].load(p_tA, 3 * A_row_stride)
      
      p_tA.v = p_tA + A_col_stride
      p_tB.v = p_tB + B_row_stride
      
      # ... while accumulating c[i][j] += a[i] * b[j] (all 16 pairs)
      c[0][0].v = ppcvar.fmadd(a[0], b[0], c[0][0]); 
      c[0][1].v = ppcvar.fmadd(a[0], b[1], c[0][1]);
      c[1][0].v = ppcvar.fmadd(a[1], b[0], c[1][0])
      c[1][1].v = ppcvar.fmadd(a[1], b[1], c[1][1])
      
      c[1][2].v = ppcvar.fmadd(a[1], b[2], c[1][2])
      c[0][2].v = ppcvar.fmadd(a[0], b[2], c[0][2]); 
      c[2][0].v = ppcvar.fmadd(a[2], b[0], c[2][0])
      c[2][1].v = ppcvar.fmadd(a[2], b[1], c[2][1])
      
      c[2][2].v = ppcvar.fmadd(a[2], b[2], c[2][2])
      c[2][3].v = ppcvar.fmadd(a[2], b[3], c[2][3])
      c[0][3].v = ppcvar.fmadd(a[0], b[3], c[0][3]);
      c[1][3].v = ppcvar.fmadd(a[1], b[3], c[1][3])
      
      c[3][0].v = ppcvar.fmadd(a[3], b[0], c[3][0])
      c[3][1].v = ppcvar.fmadd(a[3], b[1], c[3][1])
      c[3][2].v = ppcvar.fmadd(a[3], b[2], c[3][2])
      c[3][3].v = ppcvar.fmadd(a[3], b[3], c[3][3])
      
      # Step 2: load the following operands back into a/b ...
      a[0].load(p_tA, 0 * A_row_stride)
      b[1].load(p_tB, 1 * B_col_stride)
      b[0].load(p_tB, 0 * B_col_stride)
      a[1].load(p_tA, 1 * A_row_stride)
      
      b[2].load(p_tB, 2 * B_col_stride)
      b[3].load(p_tB, 3 * B_col_stride)
      
      a[2].load(p_tA, 2 * A_row_stride)
      a[3].load(p_tA, 3 * A_row_stride)
      
      p_tA.v = p_tA + A_col_stride
      p_tB.v = p_tB + B_row_stride
      
      # ... while accumulating c[i][j] += a_pre[i] * b_pre[j] (all 16 pairs)
      c[0][0].v = ppcvar.fmadd(a_pre[0], b_pre[0], c[0][0]); 
      c[0][1].v = ppcvar.fmadd(a_pre[0], b_pre[1], c[0][1]);
      c[1][0].v = ppcvar.fmadd(a_pre[1], b_pre[0], c[1][0])
      c[1][1].v = ppcvar.fmadd(a_pre[1], b_pre[1], c[1][1])
      
      c[1][2].v = ppcvar.fmadd(a_pre[1], b_pre[2], c[1][2])
      c[0][2].v = ppcvar.fmadd(a_pre[0], b_pre[2], c[0][2]); 
      c[2][0].v = ppcvar.fmadd(a_pre[2], b_pre[0], c[2][0])
      c[2][1].v = ppcvar.fmadd(a_pre[2], b_pre[1], c[2][1])
      
      c[2][2].v = ppcvar.fmadd(a_pre[2], b_pre[2], c[2][2])
      c[2][3].v = ppcvar.fmadd(a_pre[2], b_pre[3], c[2][3])
      c[0][3].v = ppcvar.fmadd(a_pre[0], b_pre[3], c[0][3]);
      c[1][3].v = ppcvar.fmadd(a_pre[1], b_pre[3], c[1][3])
      
      c[3][0].v = ppcvar.fmadd(a_pre[3], b_pre[0], c[3][0])
      c[3][1].v = ppcvar.fmadd(a_pre[3], b_pre[1], c[3][1])
      c[3][2].v = ppcvar.fmadd(a_pre[3], b_pre[2], c[3][2])
      c[3][3].v = ppcvar.fmadd(a_pre[3], b_pre[3], c[3][3])
      
      # /end for k
    return
Example #9
0
 def compute(i, j):
   # Accumulate a[i] * b[j] into c[i][j]; a, b, c and ppcvar are taken
   # from the enclosing scope.
   c[i][j].v = ppcvar.fmadd(a[i], b[j], c[i][j])
 
 load_a(0)
Example #10
0
  def reg_loop_4x4(self, a, b, c, mr, nr, p_tA, p_tB, A_row_stride, B_col_stride):
    """
    Emit one k step of a fixed 4x4 register kernel with the operand loads
    interleaved between the multiply-adds.

    a, b         -- operand registers; a[0..3] and b[0..3] are loaded
    c            -- 4x4 accumulators; each c[i][j] is updated exactly once
                    with fmadd(a[i], b[j], c[i][j])
    mr, nr       -- accepted but not read; the tile is always 4x4
    p_tA, p_tB   -- base pointers passed to the loads (not advanced here)
    A_row_stride -- offset between consecutive a operands
    B_col_stride -- offset between consecutive b operands

    Returns None.

    Each multiply-add appears only after the loads of both of its
    operands.  The exact order is hand-arranged -- presumably to overlap
    load latency with arithmetic; do not reorder without measuring.
    """
    a[0].load(p_tA, 0 * A_row_stride)
    b[0].load(p_tB, 0 * B_col_stride)

    # Only a[0], b[0] are available so far
    c[0][0].v = ppcvar.fmadd(a[0], b[0], c[0][0]); 
    
    b[1].load(p_tB, 1 * B_col_stride)          
    a[1].load(p_tA, 1 * A_row_stride)
    
    c[0][1].v = ppcvar.fmadd(a[0], b[1], c[0][1]); 
    c[1][0].v = ppcvar.fmadd(a[1], b[0], c[1][0])
    
    a[2].load(p_tA, 2 * A_row_stride)
    b[2].load(p_tB, 2 * B_col_stride)          
    
    c[1][1].v = ppcvar.fmadd(a[1], b[1], c[1][1])
    c[2][0].v = ppcvar.fmadd(a[2], b[0], c[2][0])          
    
    c[0][2].v = ppcvar.fmadd(a[0], b[2], c[0][2]); 
    c[1][2].v = ppcvar.fmadd(a[1], b[2], c[1][2])
    
    c[2][1].v = ppcvar.fmadd(a[2], b[1], c[2][1])
    
    # Last operand pair
    a[3].load(p_tA, 3 * A_row_stride)
    b[3].load(p_tB, 3 * B_col_stride)
    
    
    c[2][2].v = ppcvar.fmadd(a[2], b[2], c[2][2])
    
    # Row 3 of c
    c[3][0].v = ppcvar.fmadd(a[3], b[0], c[3][0])
    c[3][1].v = ppcvar.fmadd(a[3], b[1], c[3][1])
    c[3][2].v = ppcvar.fmadd(a[3], b[2], c[3][2])
    c[3][3].v = ppcvar.fmadd(a[3], b[3], c[3][3])
    
    # Remainder of column 3 of c
    c[0][3].v = ppcvar.fmadd(a[0], b[3], c[0][3]); 
    c[1][3].v = ppcvar.fmadd(a[1], b[3], c[1][3])
    c[2][3].v = ppcvar.fmadd(a[2], b[3], c[2][3])

    return
Example #11
0
File: gemm.py Project: tmaone/efi
  def k_loop_prefetch(self, code):
    """
    Emit the k loop of the block kernel for a fixed 4x4 tile, unrolled by
    two, alternating between two operand register sets.

    a[0..3]/b[0..3] are loaded once before the loop.  Each pass of the
    synthetic loop then
      1. loads a_pre/b_pre for the next k step and accumulates the 16
         products a[i]*b[j] into c[i][j];
      2. loads a/b for the following k step and accumulates the 16
         products a_pre[i]*b_pre[j] into c[i][j].

    code -- the instruction stream handed to syn_iter.

    Reads self.A_strides, self.B_strides, self.block, self.a/b/c,
    self.a_pre/b_pre and self.p_tA/p_tB.  Returns None.

    The tile size is hard-coded: indices 0..3 are used regardless of
    self.block.mr / self.block.nr, which are unpacked but never read.
    The statement order is hand-arranged (loads and multiply-adds are
    deliberately not in index order) -- presumably for instruction
    scheduling; do not reorder without measuring.

    NOTE(review): the loop count is kc / 2, so kc is assumed to be an even
    integer -- confirm.  The last pass loads one a/b set that is never
    consumed.
    """
    A_row_stride = self.A_strides.row
    A_col_stride = self.A_strides.col

    B_row_stride = self.B_strides.row
    B_col_stride = self.B_strides.col

    kc, mr, nr = self.block.kc, self.block.mr, self.block.nr
    a, b, c = self.a, self.b, self.c
    a_pre, b_pre = self.a_pre, self.b_pre
    p_tA, p_tB = self.p_tA, self.p_tB

    # Initial load of a[0..3] and b[0..3] for the first k step
    a[0].load(p_tA, 0 * A_row_stride)
    b[0].load(p_tB, 0 * B_col_stride)
    b[1].load(p_tB, 1 * B_col_stride)                    
    a[1].load(p_tA, 1 * A_row_stride)
    
    b[2].load(p_tB, 2 * B_col_stride)          
    b[3].load(p_tB, 3 * B_col_stride)
    
    a[2].load(p_tA, 2 * A_row_stride)
    a[3].load(p_tA, 3 * A_row_stride)
    
    # Advance p_tA, p_tB to the next k step
    p_tA.v = p_tA + A_col_stride
    p_tB.v = p_tB + B_row_stride
    
    # Two k steps per pass of the synthetic loop
    for k in syn_iter(code, kc / 2 , mode = CTR): # syn_range(code, 0, kc * 8, 8):
      # self.reg_loop_simple(a, b, c, mr, nr, p_tA, p_tB, A_row_stride, B_row_stride)
      
      # Step 1: load the next operands into a_pre/b_pre ...
      a_pre[0].load(p_tA, 0 * A_row_stride)
      b_pre[1].load(p_tB, 1 * B_col_stride)
      b_pre[0].load(p_tB, 0 * B_col_stride)          
      a_pre[1].load(p_tA, 1 * A_row_stride)
      
      b_pre[2].load(p_tB, 2 * B_col_stride)
      a_pre[2].load(p_tA, 2 * A_row_stride)

      b_pre[3].load(p_tB, 3 * B_col_stride)          
      a_pre[3].load(p_tA, 3 * A_row_stride)
      
      p_tA.v = p_tA + A_col_stride
      p_tB.v = p_tB + B_row_stride
      
      # ... while accumulating c[i][j] += a[i] * b[j] (all 16 pairs)
      c[0][0].v = ppcvar.fmadd(a[0], b[0], c[0][0]); 
      c[0][1].v = ppcvar.fmadd(a[0], b[1], c[0][1]);
      c[1][0].v = ppcvar.fmadd(a[1], b[0], c[1][0])
      c[1][1].v = ppcvar.fmadd(a[1], b[1], c[1][1])
      
      c[1][2].v = ppcvar.fmadd(a[1], b[2], c[1][2])
      c[0][2].v = ppcvar.fmadd(a[0], b[2], c[0][2]); 
      c[2][0].v = ppcvar.fmadd(a[2], b[0], c[2][0])
      c[2][1].v = ppcvar.fmadd(a[2], b[1], c[2][1])
      
      c[2][2].v = ppcvar.fmadd(a[2], b[2], c[2][2])
      c[2][3].v = ppcvar.fmadd(a[2], b[3], c[2][3])
      c[0][3].v = ppcvar.fmadd(a[0], b[3], c[0][3]);
      c[1][3].v = ppcvar.fmadd(a[1], b[3], c[1][3])
      
      c[3][0].v = ppcvar.fmadd(a[3], b[0], c[3][0])
      c[3][1].v = ppcvar.fmadd(a[3], b[1], c[3][1])
      c[3][2].v = ppcvar.fmadd(a[3], b[2], c[3][2])
      c[3][3].v = ppcvar.fmadd(a[3], b[3], c[3][3])
      
      # Step 2: load the following operands back into a/b ...
      a[0].load(p_tA, 0 * A_row_stride)
      b[1].load(p_tB, 1 * B_col_stride)
      b[0].load(p_tB, 0 * B_col_stride)
      a[1].load(p_tA, 1 * A_row_stride)
      
      b[2].load(p_tB, 2 * B_col_stride)
      b[3].load(p_tB, 3 * B_col_stride)
      
      a[2].load(p_tA, 2 * A_row_stride)
      a[3].load(p_tA, 3 * A_row_stride)
      
      p_tA.v = p_tA + A_col_stride
      p_tB.v = p_tB + B_row_stride
      
      # ... while accumulating c[i][j] += a_pre[i] * b_pre[j] (all 16 pairs)
      c[0][0].v = ppcvar.fmadd(a_pre[0], b_pre[0], c[0][0]); 
      c[0][1].v = ppcvar.fmadd(a_pre[0], b_pre[1], c[0][1]);
      c[1][0].v = ppcvar.fmadd(a_pre[1], b_pre[0], c[1][0])
      c[1][1].v = ppcvar.fmadd(a_pre[1], b_pre[1], c[1][1])
      
      c[1][2].v = ppcvar.fmadd(a_pre[1], b_pre[2], c[1][2])
      c[0][2].v = ppcvar.fmadd(a_pre[0], b_pre[2], c[0][2]); 
      c[2][0].v = ppcvar.fmadd(a_pre[2], b_pre[0], c[2][0])
      c[2][1].v = ppcvar.fmadd(a_pre[2], b_pre[1], c[2][1])
      
      c[2][2].v = ppcvar.fmadd(a_pre[2], b_pre[2], c[2][2])
      c[2][3].v = ppcvar.fmadd(a_pre[2], b_pre[3], c[2][3])
      c[0][3].v = ppcvar.fmadd(a_pre[0], b_pre[3], c[0][3]);
      c[1][3].v = ppcvar.fmadd(a_pre[1], b_pre[3], c[1][3])
      
      c[3][0].v = ppcvar.fmadd(a_pre[3], b_pre[0], c[3][0])
      c[3][1].v = ppcvar.fmadd(a_pre[3], b_pre[1], c[3][1])
      c[3][2].v = ppcvar.fmadd(a_pre[3], b_pre[2], c[3][2])
      c[3][3].v = ppcvar.fmadd(a_pre[3], b_pre[3], c[3][3])
      
      # /end for k
    return
Example #12
0
File: gemm.py Project: tmaone/efi
 def compute(i, j):
   # Accumulate a[i] * b[j] into c[i][j]; a, b, c and ppcvar are taken
   # from the enclosing scope.
   c[i][j].v = ppcvar.fmadd(a[i], b[j], c[i][j])
 
 load_a(0)
Example #13
0
File: gemm.py Project: tmaone/efi
  def reg_loop_4x4(self, a, b, c, mr, nr, p_tA, p_tB, A_row_stride, B_col_stride):
    """
    Emit one k step of a fixed 4x4 register kernel with the operand loads
    interleaved between the multiply-adds.

    a, b         -- operand registers; a[0..3] and b[0..3] are loaded
    c            -- 4x4 accumulators; each c[i][j] is updated exactly once
                    with fmadd(a[i], b[j], c[i][j])
    mr, nr       -- accepted but not read; the tile is always 4x4
    p_tA, p_tB   -- base pointers passed to the loads (not advanced here)
    A_row_stride -- offset between consecutive a operands
    B_col_stride -- offset between consecutive b operands

    Returns None.

    Each multiply-add appears only after the loads of both of its
    operands.  The exact order is hand-arranged -- presumably to overlap
    load latency with arithmetic; do not reorder without measuring.
    """
    a[0].load(p_tA, 0 * A_row_stride)
    b[0].load(p_tB, 0 * B_col_stride)

    # Only a[0], b[0] are available so far
    c[0][0].v = ppcvar.fmadd(a[0], b[0], c[0][0]); 
    
    b[1].load(p_tB, 1 * B_col_stride)          
    a[1].load(p_tA, 1 * A_row_stride)
    
    c[0][1].v = ppcvar.fmadd(a[0], b[1], c[0][1]); 
    c[1][0].v = ppcvar.fmadd(a[1], b[0], c[1][0])
    
    a[2].load(p_tA, 2 * A_row_stride)
    b[2].load(p_tB, 2 * B_col_stride)          
    
    c[1][1].v = ppcvar.fmadd(a[1], b[1], c[1][1])
    c[2][0].v = ppcvar.fmadd(a[2], b[0], c[2][0])          
    
    c[0][2].v = ppcvar.fmadd(a[0], b[2], c[0][2]); 
    c[1][2].v = ppcvar.fmadd(a[1], b[2], c[1][2])
    
    c[2][1].v = ppcvar.fmadd(a[2], b[1], c[2][1])
    
    # Last operand pair
    a[3].load(p_tA, 3 * A_row_stride)
    b[3].load(p_tB, 3 * B_col_stride)
    
    
    c[2][2].v = ppcvar.fmadd(a[2], b[2], c[2][2])
    
    # Row 3 of c
    c[3][0].v = ppcvar.fmadd(a[3], b[0], c[3][0])
    c[3][1].v = ppcvar.fmadd(a[3], b[1], c[3][1])
    c[3][2].v = ppcvar.fmadd(a[3], b[2], c[3][2])
    c[3][3].v = ppcvar.fmadd(a[3], b[3], c[3][3])
    
    # Remainder of column 3 of c
    c[0][3].v = ppcvar.fmadd(a[0], b[3], c[0][3]); 
    c[1][3].v = ppcvar.fmadd(a[1], b[3], c[1][3])
    c[2][3].v = ppcvar.fmadd(a[2], b[3], c[2][3])

    return