def c_aux_save_hand(self, code):
    """Add the C_aux scratch rows into C (hand-scheduled variant).

    For every column offset produced by ``syn_range`` the value at
    ``p_C[r]`` and the value at ``p_C_aux[r]`` are loaded, summed, and the
    sum is stored back through ``p_C[r]``.

    Rows are processed in pairs (0/1, then 2/3): both rows of a pair are
    loaded before either sum is formed, and all stores are issued last.
    Only the first four rows are handled; a block with ``mr`` greater
    than 4 leaves the remaining rows untouched.

    code -- passed straight through to ``syn_range``; presumably the
            instruction stream being generated (confirm against caller).
    """
    rows = self.block.mr
    width = self.block.nc
    c_ptrs = self.p_C
    aux_ptrs = self.p_C_aux

    # The a/b registers are reused as scratch for values read from C; the
    # c block rows are reused as scratch for values read from C_aux.
    c_regs = self.a + self.b
    aux_regs = self.c[0] + self.c[1] if rows > 1 else self.c[0]

    col_stride = self.C_strides.col

    for offset in syn_range(code, 0, width * col_stride, col_stride):
        # Original author's note: ~650 Mflops with this schedule (one
        # extra unroll does not affect the result).
        for pair in ((0, 1), (2, 3)):
            # Issue every load of the pair first ...
            for r in pair:
                if rows >= r + 1:
                    c_regs[r].load(c_ptrs[r], offset)
                    aux_regs[r].load(aux_ptrs[r], offset)
            # ... then the sums, so a sum never directly follows its loads
            # when the pair has two live rows.
            for r in pair:
                if rows >= r + 1:
                    c_regs[r].v = c_regs[r] + aux_regs[r]

        # Stores for all handled rows come after every sum.
        for r in range(4):
            if rows >= r + 1:
                c_regs[r].store(c_ptrs[r], offset)
def c_aux_save_simple(self, code):
    """Add the C_aux scratch rows into C (straightforward variant).

    For every column offset produced by ``syn_range`` and every row in
    ``range(mr)``: load from ``p_C[row]`` and ``p_C_aux[row]``, add the two
    values, and store the sum back through ``p_C[row]``. Each row is fully
    loaded, summed and stored before the next row is started.

    Original author's note: ~620 MFlops.

    code -- passed straight through to ``syn_range``; presumably the
            instruction stream being generated (confirm against caller).
    """
    width = self.block.nc
    rows = self.block.mr
    c_regs = self.a          # scratch for values read from C
    aux_regs = self.b        # scratch for values read from C_aux
    c_ptrs = self.p_C
    aux_ptrs = self.p_C_aux
    col_stride = self.C_strides.col

    for offset in syn_range(code, 0, width * col_stride, col_stride):
        for row in range(rows):
            c_reg = c_regs[row]
            c_ptr = c_ptrs[row]
            c_reg.load(c_ptr, offset)

            aux_reg = aux_regs[row]
            aux_reg.load(aux_ptrs[row], offset)

            c_reg.v = c_reg + aux_reg
            c_reg.store(c_ptr, offset)
def _gepb(self, code):
    """Generate the GEPB driver loops around the selected inner k-loop.

    Outer loop: steps ``ii`` from 0 to ``dims.M`` in strides of ``mr``.
    Inner loop: steps ``jj`` from 0 to ``block.nc`` in strides of ``nr``.

    For each (ii, jj) position the ``c`` register block is zeroed, the
    k-loop chosen by ``self.mode`` is run, and the block is stored through
    the ``p_C_aux`` pointers, which are then advanced by ``nr`` columns.
    After the inner loop the ``p_C_aux`` pointers are reset,
    ``c_aux_save_hand`` folds C_aux into C, and the ``p_C`` pointers are
    advanced by ``vC_row_stride * mr``.

    code -- passed to ``syn_range`` and to the k-loop / save helpers;
            presumably the instruction stream being generated (confirm
            against caller).

    Raises Exception if ``self.mode`` is not one of the known modes.
    """
    acc = self.c
    tA_ptr, tB_ptr = self.p_tA, self.p_tB
    total_rows = self.dims.M
    mr, nr, nc = self.block.mr, self.block.nr, self.block.nc

    # One pass per block of rows in C.
    for ii in syn_range(code, 0, total_rows, mr):
        # Start the row block with p_tB back at the beginning of tB.
        tB_ptr.v = self.r_tB_addr

        # One pass per block of columns in C.
        for jj in syn_range(code, 0, nc, nr):
            # Point p_tA at the current row of tA. ii already advances in
            # steps of mr, so no additional "* mr" factor is applied.
            tA_ptr.v = self.r_tA_addr + ii * self.A_strides.row

            # Point p_tB at the current column of tB.
            tB_ptr.v = self.r_tB_addr + jj * self.B_strides.col

            # Clear the accumulator block: zero one register via x - x,
            # then copy it into every register of the block.
            # NOTE(review): assuming ppc.fsubx is a floating-point
            # subtract, x - x is zero only for finite x -- confirm the
            # register cannot hold NaN/Inf at this point.
            zero_reg = acc[0][0]
            ppc.fsubx(zero_reg, zero_reg, zero_reg)
            for row in range(mr):
                for col in range(nr):
                    acc[row][col].copy_register(zero_reg)

            # Pick the inner k-loop generator for the configured mode.
            mode = self.mode
            if mode == gepb_simple:
                k_loop = self.k_loop_simple
            elif mode == gepb_prefetch:
                k_loop = self.k_loop_prefetch_simple
            elif mode == gepb_prefetch_hand:
                k_loop = self.k_loop_prefetch
            else:
                raise Exception("Unknown inner loop mode: %s" % (str(mode)))
            k_loop(code)

            # Write the accumulator block out to C_aux. (Original author's
            # note: this is OK performance-wise.)
            col_stride = self.C_strides.col
            for row in range(mr):
                for col in range(nr):
                    acc[row][col].store(self.p_C_aux[row], col * col_stride)

            # Step every C_aux row pointer past the nr columns just saved.
            for row in range(mr):
                aux_ptr = self.p_C_aux[row]
                aux_ptr.v = aux_ptr + col_stride * nr

        # Rewind the C_aux row pointers to the start of their rows.
        for row in range(mr):
            self.p_C_aux[row].v = (
                self.r_C_aux_addr + row * (nc * self.C_strides.col))

        # Fold C_aux into C using the hand-scheduled variant
        # (c_aux_save_simple is the straightforward alternative).
        self.c_aux_save_hand(code)

        # Move every C row pointer on by vC_row_stride * mr.
        for row in range(mr):
            c_ptr = self.p_C[row]
            c_ptr.v = c_ptr + self.vC_row_stride * mr