コード例 #1
0
ファイル: gemm.py プロジェクト: maxim-tyutyunnikov/corepy
    def c_aux_save_hand(self, code):
        """Add C_aux into C using the hand-scheduled load/add/store order.

        Inside a ``syn_range`` loop called with bounds ``0`` and
        ``nc * C_strides.col`` and step ``C_strides.col``, each of the first
        ``min(mr, 4)`` rows is processed as follows: one operand is loaded
        through ``p_C[row]`` and one through ``p_C_aux[row]`` at the current
        loop offset, the two are added, and the sum is stored back through
        ``p_C[row]``.

        Rows are issued in pairs (0-1, then 2-3): within a pair all loads
        come first, then the adds; the stores for every row come last.
        This is exactly the call order of the original hand-unrolled
        version.  Original author's note: ~650 Mflops, and one extra unroll
        does not affect the result.

        NOTE(review): the load/store/``.v`` calls presumably append
        instructions to ``code`` (CorePy synthetic programming) -- confirm
        against the helper definitions, which are not visible here.

        :param code: passed through unchanged to ``syn_range``.
        """
        block = self.block
        row_count = block.mr
        dst_ptrs = self.p_C
        aux_ptrs = self.p_C_aux

        # Destination operands come from ``a`` followed by ``b``; the C_aux
        # operands come from the first one or two rows of ``c``.
        sum_ops = self.a + self.b
        aux_ops = (self.c[0] + self.c[1]) if row_count > 1 else self.c[0]

        # Only the first four rows are ever touched, whatever ``mr`` is.
        active = min(row_count, 4)

        col_stride = self.C_strides.col
        for offset in syn_range(code, 0, block.nc * col_stride, col_stride):
            for first in (0, 2):
                pair = range(first, min(first + 2, active))

                # Loads for the whole pair go out before any add.
                for row in pair:
                    sum_ops[row].load(dst_ptrs[row], offset)
                    aux_ops[row].load(aux_ptrs[row], offset)

                for row in pair:
                    sum_ops[row].v = sum_ops[row] + aux_ops[row]

            # All stores are issued after every add.
            for row in range(active):
                sum_ops[row].store(dst_ptrs[row], offset)
コード例 #2
0
ファイル: gemm.py プロジェクト: KapilRijhwani/corepy
  def c_aux_save_hand(self, code):
    """Add C_aux into C using the hand-scheduled load/add/store order.

    Inside a ``syn_range`` loop called with bounds ``0`` and
    ``nc * C_strides.col`` and step ``C_strides.col``, each of the first
    ``min(mr, 4)`` rows is processed as follows: one operand is loaded
    through ``p_C[row]`` and one through ``p_C_aux[row]`` at the current
    loop offset, the two are added, and the sum is stored back through
    ``p_C[row]``.

    Rows are issued in pairs (0-1, then 2-3): within a pair all loads come
    first, then the adds; the stores for every row come last.  This is
    exactly the call order of the original hand-unrolled version.
    Original author's note: ~650 Mflops, and one extra unroll does not
    affect the result.

    NOTE(review): the load/store/``.v`` calls presumably append
    instructions to ``code`` (CorePy synthetic programming) -- confirm
    against the helper definitions, which are not visible here.

    :param code: passed through unchanged to ``syn_range``.
    """
    block = self.block
    row_count = block.mr
    dst_ptrs = self.p_C
    aux_ptrs = self.p_C_aux

    # Destination operands come from ``a`` followed by ``b``; the C_aux
    # operands come from the first one or two rows of ``c``.
    sum_ops = self.a + self.b
    aux_ops = (self.c[0] + self.c[1]) if row_count > 1 else self.c[0]

    # Only the first four rows are ever touched, whatever ``mr`` is.
    active = min(row_count, 4)

    col_stride = self.C_strides.col
    for offset in syn_range(code, 0, block.nc * col_stride, col_stride):
      for first in (0, 2):
        pair = range(first, min(first + 2, active))

        # Loads for the whole pair go out before any add.
        for row in pair:
          sum_ops[row].load(dst_ptrs[row], offset)
          aux_ops[row].load(aux_ptrs[row], offset)

        for row in pair:
          sum_ops[row].v = sum_ops[row] + aux_ops[row]

      # All stores are issued after every add.
      for row in range(active):
        sum_ops[row].store(dst_ptrs[row], offset)
コード例 #3
0
ファイル: gemm.py プロジェクト: KapilRijhwani/corepy
  def c_aux_save_simple(self, code):
    """Add C_aux into C one row at a time (straightforward variant).

    Inside a ``syn_range`` loop called with bounds ``0`` and
    ``nc * C_strides.col`` and step ``C_strides.col``, every row in
    ``range(mr)`` is handled in turn: ``a[row]`` is loaded through
    ``p_C[row]``, ``b[row]`` through ``p_C_aux[row]``, their sum is
    assigned to ``a[row]`` and then stored back through ``p_C[row]``.

    Original author's note: ~620 MFlops.

    :param code: passed through unchanged to ``syn_range``.
    """
    col_stride = self.C_strides.col
    dst_ptrs, aux_ptrs = self.p_C, self.p_C_aux
    sum_ops, aux_ops = self.a, self.b
    rows = range(self.block.mr)

    for offset in syn_range(code, 0, self.block.nc * col_stride, col_stride):
      for row in rows:
        total, addend = sum_ops[row], aux_ops[row]

        total.load(dst_ptrs[row], offset)
        addend.load(aux_ptrs[row], offset)

        # Accumulate in place, then write the sum back to C.
        total.v = total + addend
        total.store(dst_ptrs[row], offset)
コード例 #4
0
ファイル: gemm.py プロジェクト: tmaone/efi
  def c_aux_save_simple(self, code):
    """Add C_aux into C one row at a time (straightforward variant).

    Inside a ``syn_range`` loop called with bounds ``0`` and
    ``nc * C_strides.col`` and step ``C_strides.col``, every row in
    ``range(mr)`` is handled in turn: ``a[row]`` is loaded through
    ``p_C[row]``, ``b[row]`` through ``p_C_aux[row]``, their sum is
    assigned to ``a[row]`` and then stored back through ``p_C[row]``.

    Original author's note: ~620 MFlops.

    :param code: passed through unchanged to ``syn_range``.
    """
    col_stride = self.C_strides.col
    dst_ptrs, aux_ptrs = self.p_C, self.p_C_aux
    sum_ops, aux_ops = self.a, self.b
    rows = range(self.block.mr)

    for offset in syn_range(code, 0, self.block.nc * col_stride, col_stride):
      for row in rows:
        total, addend = sum_ops[row], aux_ops[row]

        total.load(dst_ptrs[row], offset)
        addend.load(aux_ptrs[row], offset)

        # Accumulate in place, then write the sum back to C.
        total.v = total + addend
        total.store(dst_ptrs[row], offset)
コード例 #5
0
ファイル: gemm.py プロジェクト: KapilRijhwani/corepy
  def _gepb(self, code):
    """Drive the two nested ``syn_range`` loops of the GEPB kernel.

    The outer loop runs over ``0 .. dims.M`` in steps of ``mr`` and the
    inner loop over ``0 .. nc`` in steps of ``nr``.  For each inner step
    the method:

    1. points ``p_tA`` / ``p_tB`` at the current row of tA / column of tB,
    2. clears the ``mr`` x ``nr`` accumulator block ``c``,
    3. runs the k-loop variant selected by ``self.mode``,
    4. stores the accumulators through ``p_C_aux`` and advances those
       pointers by ``C_strides.col * nr``.

    After the inner loop the ``p_C_aux`` pointers are reset,
    ``c_aux_save_hand`` folds C_aux into C, and every ``p_C`` pointer is
    advanced by ``vC_row_stride * mr``.

    :param code: passed to ``syn_range`` and to the k-loop / save helpers.
    :raises Exception: if ``self.mode`` is not one of ``gepb_simple``,
        ``gepb_prefetch`` or ``gepb_prefetch_hand``.
    """
    acc = self.c
    ptr_tA, ptr_tB = self.p_tA, self.p_tB
    blk = self.block
    mr, nr, nc = blk.mr, blk.nr, blk.nc
    total_rows = self.dims.M
    rows, cols = range(mr), range(nr)

    # One pass per block of rows in C.
    for row_base in syn_range(code, 0, total_rows, mr):

      # Start each row block from the base of tB.
      ptr_tB.v = self.r_tB_addr

      # One pass per block of columns in C.
      for col_base in syn_range(code, 0, nc, nr):

        # Current row of tA and current column of tB.
        ptr_tA.v = self.r_tA_addr + row_base * self.A_strides.row
        ptr_tB.v = self.r_tB_addr + col_base * self.B_strides.col

        # Clear the accumulator block: zero one register by subtracting it
        # from itself, then copy it into every accumulator.
        # NOTE(review): x - x is NaN rather than 0 when x is NaN or
        # infinite -- confirm c[0][0] can never hold such a value here.
        zero = acc[0][0]
        ppc.fsubx(zero, zero, zero)
        for r in rows:
          for k in cols:
            acc[r][k].copy_register(zero)

        # Pick the inner (k) loop implementation for the configured mode.
        mode = self.mode
        if mode == gepb_simple:
          run_k_loop = self.k_loop_simple
        elif mode == gepb_prefetch:
          run_k_loop = self.k_loop_prefetch_simple
        elif mode == gepb_prefetch_hand:
          run_k_loop = self.k_loop_prefetch
        else:
          raise Exception("Unknown inner loop mode: %s" % (str(mode)))
        run_k_loop(code)

        # Spill the finished block into C_aux (original author's note:
        # this is OK performance-wise).
        for r in rows:
          for k in cols:
            acc[r][k].store(self.p_C_aux[r], k * self.C_strides.col)

        # Step the C_aux pointers to the next sub-matrix.
        for r in rows:
          self.p_C_aux[r].v = self.p_C_aux[r] + self.C_strides.col * nr

      # /end inner (column) loop

      # Rewind the C_aux pointers to the start of their rows.
      for r in rows:
        self.p_C_aux[r].v = self.r_C_aux_addr + r * (nc * self.C_strides.col)

      # Fold C_aux into C (c_aux_save_simple is the alternative variant).
      self.c_aux_save_hand(code)

      # Advance the C pointers to the next block of rows.
      for r in rows:
        self.p_C[r].v = self.p_C[r] + self.vC_row_stride * mr

    # /end outer (row) loop
コード例 #6
0
ファイル: gemm.py プロジェクト: tmaone/efi
  def _gepb(self, code):
    """Drive the two nested ``syn_range`` loops of the GEPB kernel.

    The outer loop runs over ``0 .. dims.M`` in steps of ``mr`` and the
    inner loop over ``0 .. nc`` in steps of ``nr``.  For each inner step
    the method:

    1. points ``p_tA`` / ``p_tB`` at the current row of tA / column of tB,
    2. clears the ``mr`` x ``nr`` accumulator block ``c``,
    3. runs the k-loop variant selected by ``self.mode``,
    4. stores the accumulators through ``p_C_aux`` and advances those
       pointers by ``C_strides.col * nr``.

    After the inner loop the ``p_C_aux`` pointers are reset,
    ``c_aux_save_hand`` folds C_aux into C, and every ``p_C`` pointer is
    advanced by ``vC_row_stride * mr``.

    :param code: passed to ``syn_range`` and to the k-loop / save helpers.
    :raises Exception: if ``self.mode`` is not one of ``gepb_simple``,
        ``gepb_prefetch`` or ``gepb_prefetch_hand``.
    """
    acc = self.c
    ptr_tA, ptr_tB = self.p_tA, self.p_tB
    blk = self.block
    mr, nr, nc = blk.mr, blk.nr, blk.nc
    total_rows = self.dims.M
    rows, cols = range(mr), range(nr)

    # One pass per block of rows in C.
    for row_base in syn_range(code, 0, total_rows, mr):

      # Start each row block from the base of tB.
      ptr_tB.v = self.r_tB_addr

      # One pass per block of columns in C.
      for col_base in syn_range(code, 0, nc, nr):

        # Current row of tA and current column of tB.
        ptr_tA.v = self.r_tA_addr + row_base * self.A_strides.row
        ptr_tB.v = self.r_tB_addr + col_base * self.B_strides.col

        # Clear the accumulator block: zero one register by subtracting it
        # from itself, then copy it into every accumulator.
        # NOTE(review): x - x is NaN rather than 0 when x is NaN or
        # infinite -- confirm c[0][0] can never hold such a value here.
        zero = acc[0][0]
        ppc.fsubx(zero, zero, zero)
        for r in rows:
          for k in cols:
            acc[r][k].copy_register(zero)

        # Pick the inner (k) loop implementation for the configured mode.
        mode = self.mode
        if mode == gepb_simple:
          run_k_loop = self.k_loop_simple
        elif mode == gepb_prefetch:
          run_k_loop = self.k_loop_prefetch_simple
        elif mode == gepb_prefetch_hand:
          run_k_loop = self.k_loop_prefetch
        else:
          raise Exception("Unknown inner loop mode: %s" % (str(mode)))
        run_k_loop(code)

        # Spill the finished block into C_aux (original author's note:
        # this is OK performance-wise).
        for r in rows:
          for k in cols:
            acc[r][k].store(self.p_C_aux[r], k * self.C_strides.col)

        # Step the C_aux pointers to the next sub-matrix.
        for r in rows:
          self.p_C_aux[r].v = self.p_C_aux[r] + self.C_strides.col * nr

      # /end inner (column) loop

      # Rewind the C_aux pointers to the start of their rows.
      for r in rows:
        self.p_C_aux[r].v = self.r_C_aux_addr + r * (nc * self.C_strides.col)

      # Fold C_aux into C (c_aux_save_simple is the alternative variant).
      self.c_aux_save_hand(code)

      # Advance the C pointers to the next block of rows.
      for r in rows:
        self.p_C[r].v = self.p_C[r] + self.vC_row_stride * mr

    # /end outer (row) loop