def k_loop_simple(self, code):
    """
    Emit the plain (non-prefetching) inner loop over k.

    The synthesized loop runs kc times.  Each pass loads mr values
    through p_tA and nr values through p_tB, folds every a[i], b[j]
    pair into c[i][j] with ppcvar.fmadd, and then advances p_tA by the
    A column stride and p_tB by the B row stride.

    code -- instruction stream handed to syn_iter for the loop code.
    """
    blk = self.block
    depth, rows, cols = blk.kc, blk.mr, blk.nr
    a_regs, b_regs, acc = self.a, self.b, self.c
    ptr_a, ptr_b = self.p_tA, self.p_tB

    # Only the k loop is synthesized; the range() loops below are
    # generating loops that emit one instruction per index.
    for _ in syn_iter(code, depth, mode=CTR):
        # Fetch the current operands from tA and tB.
        for i in range(rows):
            a_regs[i].load(ptr_a, i * self.A_strides.row)
        for j in range(cols):
            b_regs[j].load(ptr_b, j * self.B_strides.col)

        # Accumulate into every cell of the mr x nr block of c.
        for i in range(rows):
            for j in range(cols):
                acc[i][j].v = ppcvar.fmadd(a_regs[i], b_regs[j], acc[i][j])

        # Move both pointers on to the next k position.
        ptr_a.v = ptr_a + self.A_strides.col
        ptr_b.v = ptr_b + self.B_strides.row

    return
def _pack_b(self, code):
    """
    Emit the loops that copy a kc x nc block of B into tB, transposed.

    For every i in [0, kc) and j in [0, nc) the generated code loads
    the value at byte offset (i * vN + j) * 8 relative to vB and stores
    it at byte offset (j * kc + i) * 8 relative to vtB.

    code -- instruction stream handed to syn_iter for the loop code.
    """
    depth, width = self.block.kc, self.block.nc
    elem, src, dst = self.vb, self.vB, self.vtB
    row_base, src_off, dst_off = self.vBi, self.bij, self.tbji
    row_pitch = self.vN

    for i in syn_iter(code, depth):
        # Element index at which row i of the source starts.
        row_base.v = i * row_pitch

        for j in syn_iter(code, width):
            # Offsets are scaled by 8 -- presumably the size in bytes of
            # one double-precision element.
            src_off.v = (row_base + j) * 8
            dst_off.v = (j * depth + i) * 8

            # Move one element through the scratch variable.
            elem.load(src, src_off)
            elem.store(dst, dst_off)

    return
def k_loop_prefetch_simple(self, code):
    """
    Emit the inner loop over k using two alternating operand sets.

    The operands for the first k step are loaded into a/b before the
    loop.  Each synthesized iteration then covers two k steps:

      1. load the next operands into a_pre/b_pre, accumulate with a/b
      2. load the next operands into a/b, accumulate with a_pre/b_pre

    The loop runs kc // 2 times, so an odd kc leaves the final k step
    unprocessed by this loop.  The last in-loop load fetches one k step
    beyond those that are accumulated.

    code -- instruction stream handed to syn_iter for the loop code.
    """
    kc, mr, nr = self.block.kc, self.block.mr, self.block.nr
    a, b, c = self.a, self.b, self.c
    a_pre, b_pre = self.a_pre, self.b_pre
    p_tA, p_tB = self.p_tA, self.p_tB

    def load_operands(a_regs, b_regs):
        # Load one k step of tA/tB into the given registers and advance
        # both pointers to the next step (generating loops).
        for ai in range(mr):
            a_regs[ai].load(p_tA, ai * self.A_strides.row)
        for bj in range(nr):
            b_regs[bj].load(p_tB, bj * self.B_strides.col)

        p_tA.v = p_tA + self.A_strides.col
        p_tB.v = p_tB + self.B_strides.row

    def accumulate(a_regs, b_regs):
        # Fold every a_regs[i], b_regs[j] pair into c[i][j]
        # (generating loops).
        for ci in range(mr):
            for cj in range(nr):
                c[ci][cj].v = ppcvar.fmadd(a_regs[ci], b_regs[cj], c[ci][cj])

    # Prime a/b with the operands for the first k step.
    load_operands(a, b)

    # Floor division keeps the trip count an integer on Python 3 as well.
    for _ in syn_iter(code, kc // 2, mode=CTR):
        # Step 1 -- load [a,b]_pre, compute with [a,b].
        load_operands(a_pre, b_pre)
        accumulate(a, b)

        # Step 2 -- load [a,b], compute with [a,b]_pre.  The load must
        # target a/b: reloading a_pre/b_pre here would overwrite the
        # values fetched in step 1 before they are used and leave a/b
        # stale for every following iteration.
        load_operands(a, b)
        accumulate(a_pre, b_pre)

    return
def synthesize(self, prgm, tB, M, K, N, kc, nc, mr=1, nr=1):
    """
    Synthesize the pack-B + GEPB code into prgm's instruction stream.

    The stream returned by prgm.get_stream() is made the active ppc code
    while instructions are generated.  The previously active code is
    always restored afterwards, even if synthesis raises.

    prgm   -- program object providing get_stream().
    tB     -- forwarded to SynPackB._init_constants.
    M, K, N, kc, nc, mr, nr
           -- forwarded to SynGEPB._init_constants; N and nc also drive
              the outer loop and the per-iteration pointer increments.
    """
    code = prgm.get_stream()
    old_code = ppc.get_active_code()
    ppc.set_active_code(code)

    try:
        gepb = SynGEPB(self.gepb_mode)
        packb = SynPackB()

        gepb._init_constants(M, K, N, kc, nc, mr, nr, True)
        packb._init_constants(prgm, tB, N)

        gepb._init_vars()
        # Reuse the C/C_aux registers for B.  They are set in init
        # pointers.
        packb._init_vars2(gepb.p_C, gepb.c[0][0], gepb.r_tB_addr)

        gepb._load_params()
        packb._load_params(pvB=7)

        # One pass per nc-wide column panel; presumably equivalent to
        #   for j in range(0, N * 8, nc * 8)
        for _ in syn_iter(code, N, nc):
            # Pack B into tB -- tB1.transpose(B[k:k+kc, j:j+nc])
            packb.vN.v = N
            packb._pack_b(code)

            # Run GEPB on the freshly packed panel.
            gepb._init_pointers()
            gepb._gepb(code)

            # Advance C and B by nc elements (nc * 8 bytes).
            gepb.r_C_addr.v = gepb.r_C_addr + nc * 8
            packb.vB.v = packb.vB + nc * 8
    finally:
        # Never leave this stream installed as the global active code.
        ppc.set_active_code(old_code)

    return
def synthesize(self, prgm, tB, M, K, N, kc, nc, mr=1, nr=1):
    """
    Synthesize the pack-B + GEPB code into prgm's instruction stream.

    The stream returned by prgm.get_stream() is made the active ppc code
    while instructions are generated.  The previously active code is
    always restored afterwards, even if synthesis raises.

    prgm   -- program object providing get_stream().
    tB     -- forwarded to SynPackB._init_constants.
    M, K, N, kc, nc, mr, nr
           -- forwarded to SynGEPB._init_constants; N and nc also drive
              the outer loop and the per-iteration pointer increments.
    """
    code = prgm.get_stream()
    old_code = ppc.get_active_code()
    ppc.set_active_code(code)

    try:
        gepb = SynGEPB(self.gepb_mode)
        packb = SynPackB()

        gepb._init_constants(M, K, N, kc, nc, mr, nr, True)
        packb._init_constants(prgm, tB, N)

        gepb._init_vars()
        # Reuse the C/C_aux registers for B.  They are set in init
        # pointers.
        packb._init_vars2(gepb.p_C, gepb.c[0][0], gepb.r_tB_addr)

        gepb._load_params()
        packb._load_params(pvB=7)

        # One pass per nc-wide column panel; presumably equivalent to
        #   for j in range(0, N * 8, nc * 8)
        for _ in syn_iter(code, N, nc):
            # Pack B into tB -- tB1.transpose(B[k:k+kc, j:j+nc])
            packb.vN.v = N
            packb._pack_b(code)

            # Run GEPB on the freshly packed panel.
            gepb._init_pointers()
            gepb._gepb(code)

            # Advance C and B by nc elements (nc * 8 bytes).
            gepb.r_C_addr.v = gepb.r_C_addr + nc * 8
            packb.vB.v = packb.vB + nc * 8
    finally:
        # Never leave this stream installed as the global active code.
        ppc.set_active_code(old_code)

    return
def k_loop_simple(self, code):
    """
    Emit the plain (non-prefetching) inner loop over k.

    On each of the kc synthesized passes the generated code loads mr
    values via p_tA and nr values via p_tB, updates every c[i][j] with
    ppcvar.fmadd(a[i], b[j], c[i][j]), and advances p_tA by the A column
    stride and p_tB by the B row stride.

    code -- instruction stream handed to syn_iter for the loop code.
    """
    depth = self.block.kc
    n_rows = self.block.mr
    n_cols = self.block.nr
    a_vals, b_vals, c_vals = self.a, self.b, self.c
    a_ptr, b_ptr = self.p_tA, self.p_tB

    # Row-major list of the accumulator cells to update on each pass.
    cells = [(r, s) for r in range(n_rows) for s in range(n_cols)]

    for _ in syn_iter(code, depth, mode=CTR):
        # Generating loops: one load is emitted per operand.
        for r in range(n_rows):
            a_vals[r].load(a_ptr, r * self.A_strides.row)
        for s in range(n_cols):
            b_vals[s].load(b_ptr, s * self.B_strides.col)

        # Generating loop: one fmadd is emitted per accumulator cell.
        for r, s in cells:
            c_vals[r][s].v = ppcvar.fmadd(a_vals[r], b_vals[s], c_vals[r][s])

        # Advance to the next k position in tA and tB.
        a_ptr.v = a_ptr + self.A_strides.col
        b_ptr.v = b_ptr + self.B_strides.row

    return
def k_loop_prefetch(self, code):
    """
    Emit the inner loop over k for a hard-wired 4x4 register block.

    This is written out by hand for indices 0..3 of a, b, a_pre and
    b_pre and a 4x4 c; block.mr and block.nr are not consulted.  The
    order of the loads and fmadds is hand-written and kept exactly as
    is -- presumably it was chosen for instruction scheduling.

    The operands for the first k step are loaded into a/b before the
    loop.  Each synthesized iteration covers two k steps: it loads
    a_pre/b_pre and accumulates with a/b, then loads a/b and accumulates
    with a_pre/b_pre.  The loop runs kc // 2 times, so an odd kc leaves
    the final k step unprocessed by this loop, and the last in-loop load
    fetches one k step beyond those that are accumulated.

    code -- instruction stream handed to syn_iter for the loop code.
    """
    A_row_stride = self.A_strides.row
    A_col_stride = self.A_strides.col
    B_row_stride = self.B_strides.row
    B_col_stride = self.B_strides.col

    kc = self.block.kc
    a, b, c = self.a, self.b, self.c
    a_pre, b_pre = self.a_pre, self.b_pre
    p_tA, p_tB = self.p_tA, self.p_tB

    # Order in which the 16 accumulators are updated in each half of the
    # loop body.
    fmadd_order = (
        (0, 0), (0, 1), (1, 0), (1, 1),
        (1, 2), (0, 2), (2, 0), (2, 1),
        (2, 2), (2, 3), (0, 3), (1, 3),
        (3, 0), (3, 1), (3, 2), (3, 3),
    )

    def accumulate(a_regs, b_regs):
        # Emit c[i][j] = fmadd(a_regs[i], b_regs[j], c[i][j]) in
        # fmadd_order.
        for ci, cj in fmadd_order:
            c[ci][cj].v = ppcvar.fmadd(a_regs[ci], b_regs[cj], c[ci][cj])

    # Prime a/b with the operands for the first k step, then advance
    # p_tA and p_tB.
    a[0].load(p_tA, 0 * A_row_stride)
    b[0].load(p_tB, 0 * B_col_stride)
    b[1].load(p_tB, 1 * B_col_stride)
    a[1].load(p_tA, 1 * A_row_stride)
    b[2].load(p_tB, 2 * B_col_stride)
    b[3].load(p_tB, 3 * B_col_stride)
    a[2].load(p_tA, 2 * A_row_stride)
    a[3].load(p_tA, 3 * A_row_stride)

    p_tA.v = p_tA + A_col_stride
    p_tB.v = p_tB + B_row_stride

    # Floor division keeps the trip count an integer on Python 3 as well.
    for _ in syn_iter(code, kc // 2, mode=CTR):
        # First half -- load [a,b]_pre, compute with [a,b].
        a_pre[0].load(p_tA, 0 * A_row_stride)
        b_pre[1].load(p_tB, 1 * B_col_stride)
        b_pre[0].load(p_tB, 0 * B_col_stride)
        a_pre[1].load(p_tA, 1 * A_row_stride)
        b_pre[2].load(p_tB, 2 * B_col_stride)
        a_pre[2].load(p_tA, 2 * A_row_stride)
        b_pre[3].load(p_tB, 3 * B_col_stride)
        a_pre[3].load(p_tA, 3 * A_row_stride)

        p_tA.v = p_tA + A_col_stride
        p_tB.v = p_tB + B_row_stride

        accumulate(a, b)

        # Second half -- load [a,b], compute with [a,b]_pre.
        a[0].load(p_tA, 0 * A_row_stride)
        b[1].load(p_tB, 1 * B_col_stride)
        b[0].load(p_tB, 0 * B_col_stride)
        a[1].load(p_tA, 1 * A_row_stride)
        b[2].load(p_tB, 2 * B_col_stride)
        b[3].load(p_tB, 3 * B_col_stride)
        a[2].load(p_tA, 2 * A_row_stride)
        a[3].load(p_tA, 3 * A_row_stride)

        p_tA.v = p_tA + A_col_stride
        p_tB.v = p_tB + B_row_stride

        accumulate(a_pre, b_pre)

    return