def k_loop_prefetch_simple(self, code):
    """Emit the k loop with loads issued one k step ahead of the multiply-adds.

    Two register sets are alternated: while ``c`` is updated from one set
    (``a``/``b`` or ``a_pre``/``b_pre``), the other set is loaded with the
    values for the following k step.  Each synthetic loop iteration covers
    two k steps, so the loop count is ``kc / 2``.

    code -- instruction stream passed to ``syn_iter``.

    Reads ``self.block`` (kc, mr, nr), the register collections ``self.a``,
    ``self.b``, ``self.c``, ``self.a_pre``, ``self.b_pre``, the pointers
    ``self.p_tA`` / ``self.p_tB`` and ``self.A_strides`` / ``self.B_strides``.
    Returns None.
    """
    kc, mr, nr = self.block.kc, self.block.mr, self.block.nr
    a, b, c = self.a, self.b, self.c
    a_pre, b_pre = self.a_pre, self.b_pre
    p_tA, p_tB = self.p_tA, self.p_tB

    def load_and_advance(a_regs, b_regs):
        # Load one k step from tA and tB into the given register sets,
        # then move both pointers on to the next k step.
        for ai in range(mr):
            a_regs[ai].load(p_tA, ai * self.A_strides.row)
        for bj in range(nr):
            b_regs[bj].load(p_tB, bj * self.B_strides.col)
        p_tA.v = p_tA + self.A_strides.col
        p_tB.v = p_tB + self.B_strides.row

    def update_c(a_regs, b_regs):
        # c[i][j] = fmadd(a_regs[i], b_regs[j], c[i][j]) for the mr x nr block.
        for ci in range(mr):
            for cj in range(nr):
                c[ci][cj].v = ppcvar.fmadd(a_regs[ci], b_regs[cj], c[ci][cj])

    # Prime the pipeline: the first k step goes into [a, b].
    load_and_advance(a, b)

    # Inner loop over k, two k steps per synthetic iteration.
    # NOTE(review): presumably kc is an even integer; with an odd kc the last
    # k step would not be covered by kc / 2 iterations -- confirm with callers.
    for k in syn_iter(code, kc / 2, mode = CTR):
        # Iteration 1 -- load [a,b]_pre, compute [a,b]
        load_and_advance(a_pre, b_pre)
        update_c(a, b)

        # Iteration 2 -- load [a,b], compute [a,b]_pre
        # The loads must target [a, b] here (not [a, b]_pre): loading the
        # _pre set again would overwrite the values loaded in iteration 1
        # before they are used, and [a, b] would never be refreshed for the
        # next pass through the loop.
        # NOTE(review): on the final pass this load reads one k step beyond
        # the steps consumed by this method -- confirm tA/tB tolerate that.
        load_and_advance(a, b)
        update_c(a_pre, b_pre)
    # /end for k

    return
def k_loop_simple(self, code):
    """Emit the plain k loop: load, multiply-accumulate, advance pointers.

    For each of the ``kc`` synthetic iterations the method loads ``mr``
    values through ``p_tA`` and ``nr`` values through ``p_tB``, updates
    every ``c[i][j]`` with ``fmadd(a[i], b[j], c[i][j])``, then advances
    both pointers by one k step.

    code -- instruction stream passed to ``syn_iter``.
    """
    block = self.block
    depth, rows, cols = block.kc, block.mr, block.nr
    a_regs, b_regs, acc = self.a, self.b, self.c
    ptr_a, ptr_b = self.p_tA, self.p_tB

    for _ in syn_iter(code, depth, mode=CTR):
        # Fetch the operands for this k step.
        for i in range(rows):
            a_regs[i].load(ptr_a, i * self.A_strides.row)
        for j in range(cols):
            b_regs[j].load(ptr_b, j * self.B_strides.col)

        # Multiply-accumulate into the whole register block.
        for i, j in [(r, q) for r in range(rows) for q in range(cols)]:
            acc[i][j].v = ppcvar.fmadd(a_regs[i], b_regs[j], acc[i][j])

        # Step both pointers on to the next k.
        ptr_a.v = ptr_a + self.A_strides.col
        ptr_b.v = ptr_b + self.B_strides.row
def reg_loop_simple(self, a, b, c, mr, nr, p_tA, p_tB, A_row_stride, B_col_stride):
    """Emit one k step for an ``mr`` x ``nr`` register block.

    Loads ``a[0..mr-1]`` through ``p_tA`` (offsets ``i * A_row_stride``) and
    ``b[0..nr-1]`` through ``p_tB`` (offsets ``j * B_col_stride``), then sets
    every ``c[i][j]`` to ``fmadd(a[i], b[j], c[i][j])``.  The pointers are
    not advanced here.  Returns None.
    """
    # Operand loads: all of a first, then all of b.
    for row in range(mr):
        a[row].load(p_tA, row * A_row_stride)
    for col in range(nr):
        b[col].load(p_tB, col * B_col_stride)

    # Row-major sweep over the accumulator block.
    cells = [(row, col) for row in range(mr) for col in range(nr)]
    for row, col in cells:
        c[row][col].v = ppcvar.fmadd(a[row], b[col], c[row][col])
def TestExternalStop():
    """Check syn_iter loops whose stop values come from registers.

    Builds a program that walks a 5x5 array of doubles holding 0..24 with an
    INC-mode outer iterator and a CTR-mode inner iterator, both given their
    stop value through ``set_external_stop``, and accumulates the sum of
    squares.  The executed result must equal 4900.0.
    """
    prgm = synppc.Program()
    code = prgm.get_stream()
    prgm.add(code)
    ppc.set_active_code(code)

    # Data
    side = 5
    matrix = array.array('d', range(side * side))
    base_address = matrix.buffer_info()[0]

    # Constants - read only
    n_rows = vars.SignedWord(side)
    n_cols = vars.SignedWord(side)
    addr = vars.SignedWord(base_address)
    dbl_size = vars.SignedWord(synppc.WORD_SIZE * 2)
    row_bytes = vars.SignedWord(synppc.WORD_SIZE * side * 2)

    # Variables - read/write
    total = vars.DoubleFloat(0.0)
    x = vars.DoubleFloat(0.0)
    offset = vars.SignedWord(0)

    # Iterators: stop values are supplied from registers, not constants.
    row_iter = syn_iter(code, 0, mode=INC)
    row_iter.set_external_stop(n_rows.reg)

    col_ctr = syn_iter(code, 0, mode=CTR)
    col_ctr.set_external_stop(n_cols.reg)

    for row in row_iter:
        offset.v = vars.SignedWord.cast(row) * row_bytes

        # The inner loop variable is unreadable since it lives in the ctr
        # register, so the byte offset is tracked separately.
        for _ in col_ctr:
            # Load the next value in the matrix
            ppc.lfdx(x, addr, offset)
            total.v = vars.fmadd(x, x, total)  # total += x*x
            offset.v = offset + dbl_size

    util.return_var(total)

    proc = synppc.Processor()
    result = proc.execute(prgm, mode='fp')
    assert result == 4900.0
def k_loop_simple(self, code):
    """Emit the plain k loop: load, multiply-accumulate, advance pointers.

    For each of the ``kc`` synthetic iterations the method loads ``mr``
    values through ``p_tA`` and ``nr`` values through ``p_tB``, updates
    every ``c[i][j]`` with ``fmadd(a[i], b[j], c[i][j])``, then advances
    both pointers by one k step.

    code -- instruction stream passed to ``syn_iter``.
    """
    block = self.block
    depth, rows, cols = block.kc, block.mr, block.nr
    a_regs, b_regs, acc = self.a, self.b, self.c
    ptr_a, ptr_b = self.p_tA, self.p_tB

    for _ in syn_iter(code, depth, mode=CTR):
        # Fetch the operands for this k step.
        for i in range(rows):
            a_regs[i].load(ptr_a, i * self.A_strides.row)
        for j in range(cols):
            b_regs[j].load(ptr_b, j * self.B_strides.col)

        # Multiply-accumulate into the whole register block.
        for i, j in [(r, q) for r in range(rows) for q in range(cols)]:
            acc[i][j].v = ppcvar.fmadd(a_regs[i], b_regs[j], acc[i][j])

        # Step both pointers on to the next k.
        ptr_a.v = ptr_a + self.A_strides.col
        ptr_b.v = ptr_b + self.B_strides.row
def k_loop_prefetch(self, code):
    """Emit the k loop for a fixed 4x4 register block using two register sets.

    The first k step is loaded into ``a``/``b`` before the loop.  Each of the
    ``kc / 2`` synthetic iterations then covers two k steps: it loads
    ``a_pre``/``b_pre`` and accumulates from ``a``/``b``, then reloads
    ``a``/``b`` and accumulates from ``a_pre``/``b_pre``.  Both pointers are
    advanced after every group of loads.

    The order of the individual loads and multiply-adds is fixed by the
    schedule tables below and is emitted exactly as listed.

    code -- instruction stream passed to ``syn_iter``.
    """
    a_row_step = self.A_strides.row
    a_col_step = self.A_strides.col
    b_row_step = self.B_strides.row
    b_col_step = self.B_strides.col

    depth = self.block.kc
    cur_a, cur_b, acc = self.a, self.b, self.c
    nxt_a, nxt_b = self.a_pre, self.b_pre
    ptr_a, ptr_b = self.p_tA, self.p_tB

    # Load schedules: (operand, register index), in emission order.
    A, B = 'a', 'b'
    initial_loads = ((A, 0), (B, 0), (B, 1), (A, 1),
                     (B, 2), (B, 3), (A, 2), (A, 3))
    prefetch_loads = ((A, 0), (B, 1), (B, 0), (A, 1),
                      (B, 2), (A, 2), (B, 3), (A, 3))
    refill_loads = ((A, 0), (B, 1), (B, 0), (A, 1),
                    (B, 2), (B, 3), (A, 2), (A, 3))

    # Multiply-add schedule: (row, column) of c, in emission order.
    fmadd_order = ((0, 0), (0, 1), (1, 0), (1, 1),
                   (1, 2), (0, 2), (2, 0), (2, 1),
                   (2, 2), (2, 3), (0, 3), (1, 3),
                   (3, 0), (3, 1), (3, 2), (3, 3))

    def issue_loads(a_regs, b_regs, schedule):
        # Emit the loads of one k step in the order given by `schedule`.
        for operand, idx in schedule:
            if operand == A:
                a_regs[idx].load(ptr_a, idx * a_row_step)
            else:
                b_regs[idx].load(ptr_b, idx * b_col_step)

    def advance_pointers():
        # Move both pointers on to the next k step.
        ptr_a.v = ptr_a + a_col_step
        ptr_b.v = ptr_b + b_row_step

    def accumulate(a_regs, b_regs):
        # Emit the sixteen multiply-adds from the given register sets.
        for i, j in fmadd_order:
            acc[i][j].v = ppcvar.fmadd(a_regs[i], b_regs[j], acc[i][j])

    # Load the first k step into [a, b].
    issue_loads(cur_a, cur_b, initial_loads)
    advance_pointers()

    for _ in syn_iter(code, depth / 2, mode=CTR):
        # First half: load [a, b]_pre, compute with [a, b].
        issue_loads(nxt_a, nxt_b, prefetch_loads)
        advance_pointers()
        accumulate(cur_a, cur_b)

        # Second half: reload [a, b], compute with [a, b]_pre.
        issue_loads(cur_a, cur_b, refill_loads)
        advance_pointers()
        accumulate(nxt_a, nxt_b)
    # /end for k
# Update one accumulator: c[i][j] = fmadd(a[i], b[j], c[i][j]).
# NOTE(review): this is a fragment -- `a`, `b`, `c` and `load_a` are not
# defined in the visible span; presumably they come from an enclosing scope.
# NOTE(review): as written on a single line, `load_a(0)` is part of compute's
# body; confirm it was not meant to be a separate statement after the def.
def compute(i,j): c[i][j].v = ppcvar.fmadd(a[i], b[j], c[i][j]); load_a(0)
def reg_loop_4x4(self, a, b, c, mr, nr, p_tA, p_tB, A_row_stride, B_col_stride):
    """Emit one k step for a fixed 4x4 register block, loads mixed with fmadds.

    Loads ``a[0..3]`` through ``p_tA`` and ``b[0..3]`` through ``p_tB`` and
    sets each ``c[i][j]`` to ``fmadd(a[i], b[j], c[i][j])``.  Loads and
    multiply-adds are emitted in the fixed interleaved order listed below;
    every multiply-add comes after the loads of both of its operands.
    ``mr`` and ``nr`` are accepted but not used.  The pointers are not
    advanced here.  Returns None.
    """

    def load_a(i):
        a[i].load(p_tA, i * A_row_stride)

    def load_b(j):
        b[j].load(p_tB, j * B_col_stride)

    def fmadd_cells(*cells):
        # Emit the multiply-adds for the given (row, column) pairs, in order.
        for i, j in cells:
            c[i][j].v = ppcvar.fmadd(a[i], b[j], c[i][j])

    # Stage 0: first operand pair.
    load_a(0)
    load_b(0)
    fmadd_cells((0, 0))

    # Stage 1: bring in index 1 of both operands.
    load_b(1)
    load_a(1)
    fmadd_cells((0, 1), (1, 0))

    # Stage 2: bring in index 2 of both operands.
    load_a(2)
    load_b(2)
    fmadd_cells((1, 1), (2, 0), (0, 2), (1, 2), (2, 1))

    # Stage 3: bring in index 3 and finish the block.
    load_a(3)
    load_b(3)
    fmadd_cells((2, 2), (3, 0), (3, 1), (3, 2), (3, 3),
                (0, 3), (1, 3), (2, 3))