def declare_gridref(self, grid: Grid, block: CodeBlock):
    """Emit a base pointer aliasing grid's origin element; return the pointer name."""
    ref = self.gridref_name(grid)
    zero_off = [0] * len(self.codegen.TILE_DIM)
    # OpenCL requires the address-space qualifier on the pointer.
    qualifier = "__global " if self.ocl else ""
    block.append(qualifier + "bElem *{} = &{};".format(ref, self.layout.elem(grid, zero_off)))
    return ref
def merge(self, rego, regl, regr, dim, shift, block: CodeBlock):
    """Emit an SSE byte-align merge of regl/regr into rego; only dim 0 is supported."""
    block.append("// merge{} {} ,{}, {} -> {}".format(dim, regl, regr, shift, rego))
    if dim > 0:
        raise RuntimeError("Cannot merge on dimension {} for SSE".format(dim))
    # alignr counts bytes: shift elements * element size in 32-bit words * 4 bytes.
    byte_shift = shift * self.prec * 4
    block.append("{} = _mm_alignr_epi8({}, {}, {});".format(rego, regr, regl, byte_shift))
def merge(self, rego, regl, regr, dim, shift, block: CodeBlock):
    """Emit an SVE svext-based merge of regl/regr into rego; only dim 0 is supported."""
    block.append("// merge{} {} ,{}, {} -> {}".format(dim, regl, regr, shift, rego))
    if dim > 0:
        raise RuntimeError("Cannot merge on dimension {} for SVE".format(dim))
    # svext counts 32-bit lanes, so scale the element shift by prec.
    lane_shift = shift * self.prec
    block.append("{} = svext_u32({}, {}, {});".format(rego, regl, regr, lane_shift))
def declare_buf(self, buf: Buffer, block: CodeBlock):
    """Emit an aligned scalar array covering buf's iteration space; return its name."""
    total = 1
    for lo, hi in buf.iteration:
        total *= hi - lo
    # Request natural alignment for the whole buffer, capped at 64 bytes (cache line).
    align = min(self.layout.prec * 4 * total, 64)
    block.append("bElem {}[{}] __attribute__((aligned({})));".format(buf.name, total, align))
    return buf.name
def read_aligned(self, grid: Grid, offset, name: str, block: CodeBlock, rel=None):
    """Emit a plain scalar assignment reading grid[offset] into name."""
    # The layout helper expects a list of relative refs (or None).
    wrapped = None if rel is None else [rel]
    block.append("{} = {};".format(name, self.layout.elem(grid, offset, wrapped)))
def read_aligned(self, grid: Grid, offset, name: str, block: CodeBlock, rel=None):
    """Emit an aligned 256-bit AVX2 load of grid[offset] into register name."""
    block.append("// read {} -> {}".format(str(offset), name))
    # Wrap a single relative ref into the list form the layout helper expects.
    wrapped = [rel] if rel is not None else None
    src = self.layout.elem(grid, offset, wrapped)
    block.append("{} = _mm256_load_si256((__m256i *) & {});".format(name, src))
def read_aligned(self, grid: Grid, offset, name: str, block: CodeBlock, rel=None):
    """Emit a NEON 128-bit load of grid[offset] into register name."""
    block.append("// read {} -> {}".format(str(offset), name))
    wrapped = [rel] if rel is not None else None
    src = self.layout.elem(grid, offset, wrapped)
    block.append("{} = vld1q_u32((uint32_t *) & {});".format(name, src))
def merge(self, rego, regl, regr, dim, shift, block: CodeBlock):
    """Emit a dev_shl call merging regl/regr into rego along dimension dim."""
    block.append("// merge{} {} ,{}, {} -> {}".format(dim, regl, regr, shift, rego))
    # l = product of fold factors below `dim` (elements per inner sub-group);
    # ll = sub-group extent once FOLD[dim] itself is included.
    l = 1
    for i in range(dim):
        l *= self.codegen.FOLD[i]
    ll = l * self.codegen.FOLD[dim]
    if ll == self.VECLEN:
        # Sub-group spans the whole vector: the raw lane id can be used directly.
        lid = self.LID
    else:
        # Otherwise reduce the lane id modulo the sub-group extent (ll is a
        # power-of-two product of fold factors, so `& (ll - 1)` is the modulo).
        lid = "{} & {}".format(self.LID, ll - 1)
    # dev_shl(out, low, high, shift-in-elements, group-extent, lane-id) —
    # presumably a device-side funnel shift helper; confirm against its definition.
    block.append("dev_shl({}, {}, {}, {}, {}, {});".format(
        rego, regl, regr, (self.codegen.FOLD[dim] - shift) * l, l * self.codegen.FOLD[dim], lid))
def genStoreTileLoop(self, group: CodeBlock, dims):
    """Emit nested per-tile store loops and return the innermost code block.

    Outer loops walk dimensions dims-1 .. 1 one element at a time; the
    innermost loop strides dimension 0 by the vector length while `rel`
    counts flattened iterations.
    """
    cur = CodeBlock()
    group.append(cur)
    cur.append("long rel = 0;")
    for d in range(dims - 1, 0, -1):
        idx = self.index_name(d)
        cur.append("for (long {} = {}; {} < {}; {} += {})".format(
            idx, 0, idx, self.codegen.TILE_DIM[d], idx, 1))
        nested = CodeBlock()
        cur.append(nested)
        cur = nested
    lane_idx = self.index_name(0)
    cur.append("for (long {} = {}; {} < {}; {} += {}, ++rel)".format(
        lane_idx, self.LID, lane_idx, self.codegen.TILE_DIM[0], lane_idx, self.VECLEN))
    return cur
def read_aligned(self, grid: Grid, offset, name: str, block: CodeBlock, rel=None):
    """Emit a per-lane read of grid[offset] into name, indexed by the lane id."""
    import st.expr
    # Build one relative ref per dimension; only the innermost is used and it
    # starts as the lane id (LID) wrapped as an expression node.
    ref = [None] * len(offset)
    ref[-1] = st.expr.ConstRef(self.LID)
    if isinstance(rel, list):
        # Per-dimension refs supplied: add the lane id into the innermost slot
        # (copy first so the caller's list is not mutated).
        nrel = rel[:]
        if nrel[-1]:
            nrel[-1] += ref[-1]
        else:
            nrel[-1] = ref[-1]
        ref = nrel
    elif rel:
        # A scalar rel counts whole vectors — advance by rel * VECLEN lanes.
        ref[-1] = ref[-1] + rel * self.VECLEN
    block.append("{} = {};".format(name, self.layout.elem(grid, offset, ref)))
def merge(self, rego, regl, regr, dim, shift, block: CodeBlock):
    """Emit an AVX2 merge of regl/regr into rego along dimension 0 or 1."""
    block.append("// merge{} {} ,{}, {} -> {}".format(dim, regl, regr, shift, rego))
    if dim > 1:
        raise RuntimeError("Cannot merge on dimension {} for AVX2".format(dim))
    if dim == 1:
        # Cross 128-bit-lane merge; the permute control fixes the shift,
        # so `shift` itself is not consulted here.
        block.append("{} = _mm256_permute2x128_si256({}, {}, 3;".replace(";", ");").format(rego, regr, regl))
    elif dim == 0:
        byte_shift = shift * self.prec * 4
        block.append("{} = _mm256_alignr_epi8({}, {}, {});".format(rego, regr, regl, byte_shift))
def merge(self, rego, regl, regr, dim, shift, block: CodeBlock):
    """Emit AVX-512 code merging regl/regr into rego along dimension dim.

    When the fold prefix including `dim` spans the whole vector a single
    alignr suffices; otherwise a second, masked alignr patches in the lanes
    that must come from regl.
    """
    block.append("// merge{} {} ,{}, {} -> {}".format(dim, regl, regr, shift, rego))
    # l = product of fold factors below `dim` (elements per inner sub-group).
    l = 1
    for i in range(dim):
        l *= self.codegen.FOLD[i]
    if l * self.codegen.FOLD[dim] == self.VECLEN:
        # this only requires a shift
        sh = shift * l * self.prec
        block.append("{} = _mm512_alignr_epi32({}, {}, {});".format(rego, regr, regl, sh))
    else:
        # this requires masking
        # First rotate regr by the complement so its lanes land in place.
        sh = (self.VECLEN - (self.codegen.FOLD[dim] - shift) * l) * self.prec
        block.append("{} = _mm512_alignr_epi32({}, {}, {});".format(
            rego, regr, regr, sh))
        sh = shift * l * self.prec
        # genmask presumably selects the lanes that must be sourced from regl
        # — confirm against genmask's definition.
        mask = genmask(self.codegen.FOLD, dim, shift, self.prec, self.VECLEN)
        block.append("{} = _mm512_mask_alignr_epi32({}, {}, {}, {}, {});".format(
            rego, rego, mask, regl, regl, sh))
def declare_buf(self, buf: Buffer, block: CodeBlock):
    """Emit a vfloat declaration sized to buf's iteration space; return its name."""
    extent = 1
    for lo, hi in buf.iteration:
        extent *= hi - lo
    block.append("vfloat{} {};".format(extent, buf.name))
    return buf.name
def store_vecbuf(self, vecbuf_name, reg_name, block: CodeBlock):
    """Emit an aligned 256-bit store of vecbuf_name into the reg_name array."""
    dst = "(__m256i *) & {}[0]".format(reg_name)
    block.append("_mm256_store_si256({}, {});".format(dst, vecbuf_name))
def declare_vec(self, name, block: CodeBlock):
    """Emit an SVE 32-bit unsigned vector declaration for name."""
    block.append(f"svuint32_t {name};")
def declare_reg(self, name, block: CodeBlock):
    """Emit a vfloat register declaration of VECLEN lanes for name."""
    block.append(f"vfloat{self.VECLEN} {name};")
def genStoreLoop(self, group: CodeBlock):
    """Emit a SIMD-annotated scalar store loop over the whole tile."""
    header = "for (long sti = 0; sti < {}; ++sti)".format(self.codegen.TILE_SIZE)
    for line in ("#pragma omp simd", header):
        group.append(line)
def declare_gridref(self, grid: Grid, block: CodeBlock):
    """Emit a grid base pointer asserted 64-byte aligned; return the pointer name."""
    ptr = self.gridref_name(grid)
    origin = self.layout.elem(grid, [0] * len(self.codegen.TILE_DIM))
    block.append("bElem *{} = &{};".format(ptr, origin))
    # Tell the compiler the base is cache-line aligned so it can emit aligned ops.
    block.append("{} = (bElem *)__builtin_assume_aligned({}, 64);".format(ptr, ptr))
    return ptr
def declare_vec(self, name, block: CodeBlock):
    """Emit an AVX-512 integer vector declaration for name."""
    block.append(f"__m512i {name};")
def store(self, buf: Buffer, group: CodeBlock):
    """Emit one strided scalar store from buf into its grid at lane LID."""
    dst = self.gridref_name(buf.grid)
    group.append("{}[sti * {} + {}] = {}[sti];".format(dst, self.VECLEN, self.LID, buf.name))
def storeTile(self, buf: Buffer, group: CodeBlock):
    """Emit a store of buf[rel] into the grid element addressed by the loop indices."""
    ndims = buf.grid.dims
    # Relative refs are the loop index names, outermost dimension first.
    rels = [self.index_name(ndims - 1 - i) for i in range(ndims)]
    lhs = self.layout.elem(buf.grid, [0] * ndims, rels)
    group.append("{} = {}[rel];".format(lhs, buf.name))
def genStoreLoop(self, group: CodeBlock):
    """Emit a store loop iterating once per vector in the tile."""
    trip_count = self.codegen.TILE_SIZE // self.VECLEN
    group.append("for (long sti = 0; sti < {}; ++sti)".format(trip_count))
def store_vecbuf(self, vecbuf_name, reg_name, block: CodeBlock):
    """Emit an aligned 512-bit store of vecbuf_name into the reg_name array."""
    stmt = "_mm512_store_epi32( & {}[0], {});".format(reg_name, vecbuf_name)
    block.append(stmt)
def declare_vec(self, name, block: CodeBlock):
    """Emit a scalar bElem declaration standing in for a vector register."""
    block.append(f"bElem {name};")
def store_vecbuf(self, vecbuf_name, reg_name, block: CodeBlock):
    """Emit a NEON 128-bit store of vecbuf_name into the reg_name array."""
    dst = "(uint32_t *) & {}[0]".format(reg_name)
    block.append("vst1q_u32({}, {});".format(dst, vecbuf_name))
def store_vecbuf(self, vecbuf_name, reg_name, block: CodeBlock):
    """Emit a plain scalar assignment of the vector buffer into reg_name."""
    block.append(f"{reg_name} = {vecbuf_name};")
def genVectorLoop(self, group: CodeBlock):
    """Emit a SIMD loop over the vector lanes and return its body block."""
    group.append("#pragma omp simd")
    group.append("for (long vit = 0; vit < {}; ++vit)".format(self.VECLEN))
    body = CodeBlock()
    group.append(body)
    return body
def declare_buf(self, buf: Buffer, block: CodeBlock):
    """Emit a buffer declaration holding the iteration space split across lanes."""
    extent = 1
    for lo, hi in buf.iteration:
        extent *= hi - lo
    # NOTE(review): the array is sized extent // VECLEN — presumably each
    # lane/thread owns one slice of the buffer; confirm against the callers.
    block.append("bElem {}[{}];".format(buf.name, extent // self.VECLEN))
    return buf.name
def declare_reg(self, name, block: CodeBlock):
    """Emit an aligned scalar array of VECLEN elements emulating one register."""
    # Natural alignment for the register, capped at 64 bytes (cache line).
    align = min(self.layout.prec * 4 * self.VECLEN, 64)
    block.append("bElem {}[{}] __attribute__((aligned({})));".format(name, self.VECLEN, align))
def store_vecbuf(self, vecbuf_name, reg_name, block: CodeBlock):
    """Emit a fully-predicated SVE store of vecbuf_name into the reg_name array."""
    dst = "(uint32_t *) & {}[0]".format(reg_name)
    block.append("svst1_u32(svptrue_b32(), {}, {});".format(dst, vecbuf_name))