def to_ops_stencil(param, accesses):
    """Build the OPS stencil declaration for ``param`` from its accesses.

    Returns ``(ops_stencil, initializers)``, where ``initializers`` holds
    the expression filling the flat stencil-point array and the
    ``ops_decl_stencil`` call binding it to the OpsStencil symbol.
    """
    ndims = len(accesses[0])
    npoints = len(accesses)
    name = namespace['ops_stencil_name'](ndims, param.name, npoints)

    # Flat array holding every stencil offset, one access after another.
    points_array = Array(
        name=name,
        dimensions=(DefaultDimension(name='len', default_value=ndims * npoints),),
        dtype=np.int32,
    )
    ops_stencil = OpsStencil(name.upper())

    flat_offsets = [offset for access in accesses for offset in access]
    fill_points = Expression(
        ClusterizedEq(Eq(points_array, ListInitializer(flat_offsets))))
    declare = Expression(
        ClusterizedEq(
            Eq(ops_stencil,
               namespace['ops_decl_stencil'](ndims, npoints,
                                             Symbol(points_array.name),
                                             Literal('"%s"' % name.upper())))))

    return ops_stencil, [fill_points, declare]
def create_ops_par_loop(trees, ops_kernel, parameters, block, name_to_ops_dat,
                        accessible_origin, par_to_ops_stencil, dims):
    """Assemble the ``ops_par_loop`` call and its range-array initializer."""
    # Flat (min, max+1) bounds for every iteration of every affine tree;
    # the +1 converts Devito's inclusive max to OPS's exclusive upper bound.
    bounds = []
    for tree in trees:
        if not isinstance(tree, IterationTree):
            continue
        for iteration in tree:
            bounds.append(iteration.symbolic_min)
            bounds.append(iteration.symbolic_max + 1)

    range_array = Array(name='%s_range' % ops_kernel.name,
                        dimensions=(DefaultDimension(name='range',
                                                     default_value=len(bounds)),),
                        dtype=np.int32,
                        scope='stack')
    init_range = Expression(
        ClusterizedEq(Eq(range_array, ListInitializer(bounds))))

    def _as_arg(p):
        # Translate one kernel parameter into its ops_arg constructor call.
        a = create_ops_arg(p, accessible_origin, name_to_ops_dat,
                           par_to_ops_stencil)
        return a.ops_type(a.ops_name, a.elements_per_point, a.dtype, a.rw_flag)

    call = Call(namespace['ops_par_loop'],
                [Literal(ops_kernel.name), Literal('"%s"' % ops_kernel.name),
                 block, dims, range_array] + [_as_arg(p) for p in parameters])

    return [init_range], call
def __init__(self, exprs, ispace, dspace, guards=None, properties=None):
    """Initialize with clusterized expressions plus iteration/data spaces."""
    wrapped = [ClusterizedEq(e, ispace=ispace, dspace=dspace)
               for e in as_tuple(exprs)]
    self._exprs = tuple(wrapped)
    self._ispace = ispace
    self._dspace = dspace
    # Frozen mappings keep the object safely shareable.
    self._guards = frozendict(guards or {})
    self._properties = frozendict(properties or {})
def create_ops_par_loop(trees, ops_kernel, parameters, block, name_to_ops_dat,
                        accessible_origin, par_to_ops_stencil, dims):
    """Build the ops_par_loop call plus the initializer of its range array."""
    # Flatten the (min, max) bounds of every iteration in every affine tree.
    it_range = []
    for tree in trees:
        if isinstance(tree, IterationTree):
            for it in tree:
                it_range.extend(it.bounds())

    range_array = Array(name='%s_range' % ops_kernel.name,
                        dimensions=(DefaultDimension(name='range',
                                                     default_value=len(it_range)),),
                        dtype=np.int32,
                        scope='stack')
    range_init = Expression(
        ClusterizedEq(Eq(range_array, ListInitializer(it_range))))

    ops_args = [create_ops_arg(p, accessible_origin, name_to_ops_dat,
                               par_to_ops_stencil) for p in parameters]
    par_loop = Call(namespace['ops_par_loop'],
                    [Literal(ops_kernel.name),
                     Literal('"%s"' % ops_kernel.name),
                     block, dims, range_array] + ops_args)

    return [range_init], par_loop
def __init__(self, exprs, ispace, dspace, atomics=None, guards=None):
    """Initialize the cluster state from raw expressions and spaces."""
    # Wrap each expression with its iteration/data space.
    self._exprs = [ClusterizedEq(e, ispace=ispace, dspace=dspace)
                   for e in exprs]
    self._ispace = ispace
    self._dspace = dspace
    self._atomics = set(atomics or [])
    self._guards = guards or {}
def __init__(self, exprs, ispace, dspace, atomics=None, guards=None):
    """Initialize the cluster, reordering expressions by information flow."""
    # Provisional assignment: ``self.trace`` below is derived from it.
    self._exprs = exprs
    # Keep expressions ordered based on information flow
    ordered = [ClusterizedEq(e, ispace, dspace) for e in self.trace.values()]
    self._exprs = tuple(ordered)
    self._ispace = ispace
    self._dspace = dspace
    # Immutable collections make the cluster safely shareable.
    self._atomics = frozenset(atomics or ())
    self._guards = frozendict(guards or {})
def __init__(self, exprs, ispace, dspace, guards=None, properties=None):
    """Initialize the cluster; every interval dimension gets a property set."""
    self._exprs = tuple(ClusterizedEq(e, ispace=ispace, dspace=dspace)
                        for e in as_tuple(exprs))
    self._ispace = ispace
    self._dspace = dspace
    self._guards = frozendict(guards or {})

    # Guarantee an entry (possibly empty) for each dimension in the space,
    # preserving any caller-provided property sets.
    props = dict(properties or {})
    for interval in ispace.intervals:
        props.setdefault(interval.dim, set())
    self._properties = frozendict(props)
def _specialize_iet(self, iet, **kwargs):
    """Transform the IET into its OPS-offloaded counterpart.

    Wraps the input IET with OPS initialization/teardown calls, declares an
    OPS block plus the ops_dat datasets for every symbol used by the
    offloadable (affine) loop nests, and generates one OPS kernel per
    affine tree (collected in ``self._ops_kernels``).
    """
    warning("The OPS backend is still work-in-progress")

    # Extract all symbols that need to be converted to ops_dat
    dims = []
    to_dat = set()
    for section, trees in find_affine_trees(iet).items():
        dims.append(len(trees[0].dimensions))
        symbols = set(FindSymbols('symbolics').visit(trees[0].root))
        symbols -= set(FindSymbols('defines').visit(trees[0].root))
        to_dat |= symbols

    # FIX: without this guard, ``dims[0]`` below raised IndexError when no
    # affine tree exists; nothing to offload in that case.
    if not dims:
        return iet

    # FIX: the original asserted a bare generator expression, which is
    # always truthy; ``all`` performs the intended check.
    assert all(d == dims[0] for d in dims), \
        "The OPS backend currently assumes that all kernels " \
        "have the same number of dimensions"

    ops_init = Call(namespace['ops_init'], [0, 0, 2])
    ops_partition = Call(namespace['ops_partition'], Literal('""'))
    ops_exit = Call(namespace['ops_exit'])

    ops_block = OpsBlock('block')

    # To ensure deterministic code generation we order the datasets to
    # be generated (since a set is an unordered collection)
    to_dat = filter_sorted(to_dat)

    name_to_ops_dat = {}
    pre_time_loop = []
    for f in to_dat:
        if f.is_Constant:
            continue
        pre_time_loop.extend(create_ops_dat(f, name_to_ops_dat, ops_block))

    # One OPS kernel per offloadable iteration tree.
    for n, (section, trees) in enumerate(find_affine_trees(iet).items()):
        pre_loop, ops_kernel = opsit(trees, n)
        pre_time_loop.extend(pre_loop)
        self._ops_kernels.append(ops_kernel)

    ops_block_init = Expression(
        ClusterizedEq(
            Eq(ops_block,
               namespace['ops_decl_block'](dims[0], Literal('"block"')))))

    self._headers.append(namespace['ops_define_dimension'](dims[0]))
    self._includes.append('stdio.h')

    body = [ops_init, ops_block_init, *pre_time_loop, ops_partition, iet,
            ops_exit]

    return List(body=body)
def generate_ops_stencils(accesses):
    """Create OPS stencil declarations from a function->accesses mapping.

    Returns ``(initializers, symbols)``: the expressions initializing each
    stencil-point array plus its ``ops_decl_stencil`` call, and a mapping
    from function name to the declared OPSStencil symbol.
    """
    function_to_stencil = defaultdict(list)
    function_to_dims = {}
    ops_stencils_initializers = []
    ops_stencils_symbols = {}

    for k, v in accesses.items():
        if k.is_TimeFunction:
            time_pos = k._time_position
            # Group accesses by their time index: one stencil per time slot.
            stencils = [(k1, list(v1))
                        for k1, v1 in groupby(v, lambda s: s[time_pos][0])]
            for k1, v1 in stencils:
                name = "%s%s" % (k.name, k1)
                function_to_dims[name] = k.ndim - 1
                # Drop the time dimension from each access.
                # FIX: the original used ``i is not to_skip``, an identity
                # comparison that only works via CPython small-int interning;
                # ``!=`` is the correct value comparison.
                function_to_stencil[name].extend([
                    offset for stencil in v1
                    for i, (_, offset) in enumerate(stencil)
                    if i != time_pos
                ])
        else:
            function_to_dims[k.name] = k.ndim
            for s in v:
                function_to_stencil[k.name].extend(offset for _, offset in s)

    for f, stencil in function_to_stencil.items():
        ndims = function_to_dims[f]
        # FIX: integer division; ``/`` produced a float point count.
        npoints = len(stencil) // ndims
        stencil_name = "s%sd_%s_%dpt" % (ndims, f, npoints)

        ops_stencil_arr = SymbolicArray(name=stencil_name,
                                        dimensions=(len(stencil),),
                                        dtype=np.int32)
        ops_stencil = OPSStencil(stencil_name.upper())

        arr_assign = Eq(ops_stencil_arr, ListInitializer(stencil))
        ops_stencils_initializers.append(Expression(ClusterizedEq(arr_assign)))

        decl_call = Call("ops_decl_stencil", [
            ndims, npoints, ops_stencil_arr, String(ops_stencil.name)
        ])
        ops_stencils_symbols[f] = ops_stencil
        ops_stencils_initializers.append(
            Element(cgen.InlineInitializer(ops_stencil, decl_call)))

    return ops_stencils_initializers, ops_stencils_symbols
def __init__(self, exprs, ispace=None, guards=None, properties=None,
             syncs=None):
    """Initialize the cluster; defaults to an empty iteration space."""
    ispace = ispace or IterationSpace([])

    self._exprs = tuple(ClusterizedEq(e, ispace=ispace)
                        for e in as_tuple(exprs))
    self._ispace = ispace
    self._guards = frozendict(guards or {})
    self._syncs = frozendict(syncs or {})

    # Guarantee an entry (possibly empty) for each dimension in the space,
    # preserving any caller-provided property sets.
    props = dict(properties or {})
    for interval in ispace.intervals:
        props.setdefault(interval.dim, set())
    self._properties = frozendict(props)
def create_ops_dat(f, name_to_ops_dat, block):
    """Declare the OPS dataset(s) (``ops_dat``) backing function `f`.

    Builds the stack arrays describing the dataset geometry (``dim``,
    ``base``, ``d_p``, ``d_m``) and the ``ops_decl_dat`` call(s), recording
    the resulting symbol in `name_to_ops_dat`. Returns an ``OpsDatDecl``
    bundling every initializer expression.
    """
    # The time dimension does not contribute to the dataset geometry.
    ndim = f.ndim - (1 if f.is_TimeFunction else 0)

    # Geometry descriptor arrays, one entry per (non-time) dimension.
    dim = Array(name=namespace['ops_dat_dim'](f.name),
                dimensions=(DefaultDimension(name='dim', default_value=ndim),),
                dtype=np.int32,
                scope='stack')
    base = Array(name=namespace['ops_dat_base'](f.name),
                 dimensions=(DefaultDimension(name='base', default_value=ndim),),
                 dtype=np.int32,
                 scope='stack')
    d_p = Array(name=namespace['ops_dat_d_p'](f.name),
                dimensions=(DefaultDimension(name='d_p', default_value=ndim),),
                dtype=np.int32,
                scope='stack')
    d_m = Array(name=namespace['ops_dat_d_m'](f.name),
                dimensions=(DefaultDimension(name='d_m', default_value=ndim),),
                dtype=np.int32,
                scope='stack')

    # Datasets start at offset zero in every dimension.
    base_val = [Zero() for i in range(ndim)]

    # If f is a TimeFunction we need to create a ops_dat for each time stepping
    # variable (eg: t1, t2)
    if f.is_TimeFunction:
        time_pos = f._time_position
        time_index = f.indices[time_pos]
        time_dims = f.shape[time_pos]

        # Shape with the time dimension stripped out.
        dim_val = f.shape[:time_pos] + f.shape[time_pos + 1:]
        # NOTE(review): these slices start at time_pos + 1, which removes
        # the time entry only when it sits at position 0 — confirm for
        # other time-dimension layouts.
        d_p_val = f._size_nodomain.left[time_pos + 1:]
        d_m_val = [-i for i in f._size_nodomain.right[time_pos + 1:]]

        ops_dat_array = Array(name=namespace['ops_dat_name'](f.name),
                              dimensions=(DefaultDimension(name='dat',
                                                           default_value=time_dims),),
                              dtype=namespace['ops_dat_type'],
                              scope='stack')

        # One ops_decl_dat per time buffer (e.g. u_t0, u_t1, ...).
        dat_decls = []
        for i in range(time_dims):
            name = '%s%s%s' % (f.name, time_index, i)
            dat_decls.append(namespace['ops_decl_dat'](block, 1,
                                                       Symbol(dim.name),
                                                       Symbol(base.name),
                                                       Symbol(d_m.name),
                                                       Symbol(d_p.name),
                                                       Byref(f.indexify([i])),
                                                       Literal('"%s"' % f._C_typedata),
                                                       Literal('"%s"' % name)))

        ops_decl_dat = Expression(
            ClusterizedEq(Eq(ops_dat_array, ListInitializer(dat_decls))))

        # Inserting the ops_dat array in case of TimeFunction.
        name_to_ops_dat[f.name] = ops_dat_array
    else:
        ops_dat = OpsDat("%s_dat" % f.name)
        name_to_ops_dat[f.name] = ops_dat

        dim_val = f.shape
        d_p_val = f._size_nodomain.left
        d_m_val = [-i for i in f._size_nodomain.right]

        ops_decl_dat = Expression(
            ClusterizedEq(
                Eq(ops_dat,
                   namespace['ops_decl_dat'](block, 1,
                                             Symbol(dim.name),
                                             Symbol(base.name),
                                             Symbol(d_m.name),
                                             Symbol(d_p.name),
                                             Byref(f.indexify([0])),
                                             Literal('"%s"' % f._C_typedata),
                                             Literal('"%s"' % f.name)))))

    # Initializer expressions for the geometry arrays (the *_val names are
    # rebound from raw values to the Expressions that assign them).
    dim_val = Expression(ClusterizedEq(Eq(dim, ListInitializer(dim_val))))
    base_val = Expression(ClusterizedEq(Eq(base, ListInitializer(base_val))))
    d_p_val = Expression(ClusterizedEq(Eq(d_p, ListInitializer(d_p_val))))
    d_m_val = Expression(ClusterizedEq(Eq(d_m, ListInitializer(d_m_val))))

    return OpsDatDecl(dim_val=dim_val,
                      base_val=base_val,
                      d_p_val=d_p_val,
                      d_m_val=d_m_val,
                      ops_decl_dat=ops_decl_dat)
def _specialize_iet(self, iet, **kwargs):
    """Specialize `iet` for the OPS backend.

    Declares the OPS runtime calls, the OPS block, the ops_dat datasets and
    the fetch-back copies, generates one OPS kernel per affine tree
    (registered in ``self._func_table``), and replaces each offloaded tree
    with the corresponding ``ops_par_loop`` call.
    """
    warning("The OPS backend is still work-in-progress")

    affine_trees = find_affine_trees(iet).items()

    # If there is no affine trees, then there is no loop to be optimized using OPS.
    if not affine_trees:
        return iet

    ops_init = Call(namespace['ops_init'], [0, 0, 2])
    ops_partition = Call(namespace['ops_partition'], Literal('""'))
    ops_exit = Call(namespace['ops_exit'])

    # Extract all symbols that need to be converted to ops_dat
    dims = []
    to_dat = set()
    for _, tree in affine_trees:
        dims.append(len(tree[0].dimensions))
        symbols = set(FindSymbols('symbolics').visit(tree[0].root))
        symbols -= set(FindSymbols('defines').visit(tree[0].root))
        to_dat |= symbols

    # FIX: the original asserted a bare generator expression, which is
    # always truthy; ``all`` performs the intended check. Verified before
    # ``dims[0]`` is relied upon below.
    assert all(d == dims[0] for d in dims), \
        "The OPS backend currently assumes that all kernels " \
        "have the same number of dimensions"

    # Create the OPS block for this problem
    ops_block = OpsBlock('block')
    ops_block_init = Expression(
        ClusterizedEq(
            Eq(ops_block,
               namespace['ops_decl_block'](dims[0], Literal('"block"')))))

    # To ensure deterministic code generation we order the datasets to
    # be generated (since a set is an unordered collection)
    to_dat = filter_sorted(to_dat)

    name_to_ops_dat = {}
    pre_time_loop = []
    after_time_loop = []
    for f in to_dat:
        if f.is_Constant:
            continue
        pre_time_loop.extend(
            list(create_ops_dat(f, name_to_ops_dat, ops_block)))
        # To return the result to Devito, it is necessary to copy the data
        # from the dat object back to the CPU memory.
        after_time_loop.extend(
            create_ops_fetch(f, name_to_ops_dat,
                             self.time_dimension.extreme_max))

    # Generate ops kernels for each offloadable iteration tree
    mapper = {}
    for n, (_, tree) in enumerate(affine_trees):
        pre_loop, ops_kernel, ops_par_loop_call = opsit(
            tree, n, name_to_ops_dat, ops_block, dims[0])
        pre_time_loop.extend(pre_loop)
        self._func_table[namespace['ops_kernel_file'](ops_kernel.name)] = \
            MetaCall(ops_kernel, False)
        mapper[tree[0].root] = ops_par_loop_call
        # Drop the remaining trees; only the root is replaced by the call.
        mapper.update({i.root: mapper.get(i.root) for i in tree})

    iet = Transformer(mapper).visit(iet)

    self._headers.append(namespace['ops_define_dimension'](dims[0]))
    self._includes.extend(['stdio.h', 'ops_seq.h'])

    body = [ops_init, ops_block_init, *pre_time_loop, ops_partition, iet,
            *after_time_loop, ops_exit]

    return List(body=body)
def opsit(trees, count):
    """Generate an OPS kernel from the iteration trees of one section.

    Returns ``(callable_kernel, setup, par_loop_body, it_dims)``: the kernel
    Callable, the declarations/initializers that must precede the loop, the
    ``ops_par_loop`` invocation wrapped in a List, and the loop
    dimensionality.
    """
    node_factory = OPSNodeFactory()

    expressions = []
    for tree in trees:
        expressions.extend(FindNodes(Expression).visit(tree.inner))

    # Iteration bounds and dimensionality (taken from the IterationTrees;
    # if several are present, the last one wins, as in the original).
    it_range = []
    it_dims = 0
    for tree in trees:
        if isinstance(tree, IterationTree):
            it_range = [it.bounds() for it in tree]
            it_dims = len(tree)

    block = OPSBlock(namespace['ops_block'](count))
    block_init = Element(
        cgen.Initializer(
            block,
            Call("ops_decl_block", [it_dims, String(block.name)], False)))

    # Translate expressions to OPS ASTs while gathering their accesses.
    # NOTE: iterated in reverse on purpose — the node factory is stateful,
    # so the call order must be preserved.
    ops_expressions = []
    accesses = defaultdict(set)
    for i in reversed(expressions):
        extend_accesses(accesses, get_accesses(i.expr))
        ops_expressions.insert(0,
                               Expression(make_ops_ast(i.expr, node_factory)))

    ops_stencils_initializers, ops_stencils = generate_ops_stencils(accesses)

    to_remove = [
        f.name for f in FindSymbols('defines').visit(List(body=expressions))
    ]

    # Kernel parameters: symbols of the OPS ASTs, minus locally-defined ones.
    parameters = FindSymbols('symbolics').visit(List(body=ops_expressions))
    parameters = [
        p for p in parameters
        if p.name != 'OPS_ACC_size' and p.name not in to_remove
    ]
    parameters = sorted(parameters, key=lambda i: (i.is_Constant, i.name))

    # Call-site arguments: symbols of the original expressions.
    arguments = FindSymbols('symbolics').visit(List(body=expressions))
    arguments = [a for a in arguments if a.name not in to_remove]
    arguments = sorted(arguments, key=lambda i: (i.is_Constant, i.name))

    ops_expressions = [
        Expression(fix_ops_acc(e.expr, [p.name for p in parameters]))
        for e in ops_expressions
    ]

    callable_kernel = Callable(namespace['ops_kernel'](count),
                               ops_expressions, "void", parameters)

    # ops_dat declarations for every non-constant argument.
    dat_declarations = []
    argname_to_dat = {}
    for a in arguments:
        if a.is_Constant:
            continue
        dat_dec, dat_sym = to_ops_dat(a, block)
        dat_declarations.extend(dat_dec)
        argname_to_dat.update(dat_sym)

    par_loop_range_arr = SymbolicArray(name=namespace['ops_range'](count),
                                       dimensions=(len(it_range) * 2,),
                                       dtype=np.int32)
    # Interleave the (min, max) pairs into the flat range array.
    range_vals = []
    for mn, mx in it_range:
        range_vals.append(mn)
        range_vals.append(mx)
    par_loop_range_init = Expression(
        ClusterizedEq(Eq(par_loop_range_arr, ListInitializer(range_vals))))

    # FIX: dropped the pointless `[p for p in parameters]` copy.
    ops_args = get_ops_args(parameters, ops_stencils, argname_to_dat)

    par_loop = Call("ops_par_loop", [
        FunctionPointer(callable_kernel.name),
        String(callable_kernel.name), block, it_dims, par_loop_range_arr,
        *ops_args
    ])

    return (callable_kernel,
            [par_loop_range_init, block_init] + ops_stencils_initializers +
            dat_declarations + [Call("ops_partition", [String("")])],
            List(body=[par_loop]),
            it_dims)
def to_ops_dat(function, block):
    """Create the ``ops_dat`` declaration(s) for `function` within `block`.

    Returns ``(res, dats)``: the nodes declaring/initializing the geometry
    arrays and ops_dat handle(s), and a mapping from access name to the
    ops_dat symbol (one entry per time buffer for TimeFunctions).
    """
    # The time dimension does not contribute to the dataset geometry.
    ndim = function.ndim - (1 if function.is_TimeFunction else 0)

    # Per-dimension geometry descriptors.
    dim = SymbolicArray(name="%s_dim" % function.name,
                        dimensions=(ndim,),
                        dtype=np.int32)
    base = SymbolicArray(name="%s_base" % function.name,
                         dimensions=(ndim,),
                         dtype=np.int32)
    d_p = SymbolicArray(name="%s_d_p" % function.name,
                        dimensions=(ndim,),
                        dtype=np.int32)
    d_m = SymbolicArray(name="%s_d_m" % function.name,
                        dimensions=(ndim,),
                        dtype=np.int32)

    res = []
    dats = {}
    ops_decl_dat_call = []

    if function.is_TimeFunction:
        time_pos = function._time_position
        time_index = function.indices[time_pos]
        time_dims = function.shape[time_pos]

        # Shape/padding/halo with the time dimension stripped out.
        dim_shape = function.shape[:time_pos] + function.shape[time_pos + 1:]
        padding = function.padding[:time_pos] + function.padding[time_pos + 1:]
        halo = function.halo[:time_pos] + function.halo[time_pos + 1:]
        base_val = [0 for i in range(ndim)]
        # Positive/negative halo+padding depths per dimension (d_m negated).
        d_p_val = tuple([p[0] + h[0] for p, h in zip(padding, halo)])
        d_m_val = tuple([-(p[1] + h[1]) for p, h in zip(padding, halo)])

        ops_dat_array = SymbolicArray(
            name="%s_dat" % function.name,
            dimensions=[time_dims],
            dtype="ops_dat",
        )

        # Raw C declaration of the per-time-buffer ops_dat array.
        ops_decl_dat_call.append(
            Element(
                cgen.Statement(
                    "%s %s[%s]" % (ops_dat_array.dtype, ops_dat_array.name,
                                   time_dims))))

        # One ops_decl_dat per time buffer; `dats` maps each buffer name to
        # a symbolically-indexed slot of the ops_dat array.
        for i in range(time_dims):
            access = FunctionTimeAccess(function, i)
            ops_dat_access = ArrayAccess(ops_dat_array, i)
            call = Call("ops_decl_dat", [
                block, 1, dim, base, d_m, d_p, access,
                String(function._C_typedata),
                String("%s%s%s" % (function.name, time_index, i))
            ], False)
            dats["%s%s%s" % (function.name, time_index, i)] = ArrayAccess(
                ops_dat_array, Symbol("%s%s" % (time_index, i)))
            ops_decl_dat_call.append(Element(cgen.Assign(ops_dat_access,
                                                         call)))
    else:
        ops_dat = OPSDat("%s_dat" % function.name)
        dats[function.name] = ops_dat

        # Positive/negative halo+padding depths per dimension (d_m negated).
        d_p_val = tuple(
            [p[0] + h[0] for p, h in zip(function.padding, function.halo)])
        d_m_val = tuple(
            [-(p[1] + h[1]) for p, h in zip(function.padding, function.halo)])
        dim_shape = function.shape
        base_val = [0 for i in function.shape]

        ops_decl_dat_call.append(
            Element(
                cgen.Initializer(
                    ops_dat,
                    Call("ops_decl_dat", [
                        block, 1, dim, base, d_m, d_p,
                        FunctionTimeAccess(function, 0),
                        String(function._C_typedata),
                        String(function.name)
                    ], False))))

    # Geometry-array initializers precede the ops_decl_dat call(s).
    res.append(Expression(ClusterizedEq(Eq(dim, ListInitializer(dim_shape)))))
    res.append(Expression(ClusterizedEq(Eq(base, ListInitializer(base_val)))))
    res.append(Expression(ClusterizedEq(Eq(d_p, ListInitializer(d_p_val)))))
    res.append(Expression(ClusterizedEq(Eq(d_m, ListInitializer(d_m_val)))))
    res.extend(ops_decl_dat_call)

    return res, dats
def make_ops_kernels(iet):
    """Offload the affine loop nests of `iet` to OPS kernels.

    Returns the transformed IET plus the metadata (includes, headers and
    generated kernel callables) required downstream.
    """
    warning("The OPS backend is still work-in-progress")

    affine_trees = find_affine_trees(iet).items()

    # If there is no affine trees, then there is no loop to be optimized using OPS.
    if not affine_trees:
        return iet, {}

    ops_init = Call(namespace['ops_init'], [0, 0, 2])
    ops_partition = Call(namespace['ops_partition'], Literal('""'))
    ops_exit = Call(namespace['ops_exit'])

    # Extract all symbols that need to be converted to ops_dat
    dims = []
    to_dat = set()
    for _, tree in affine_trees:
        dims.append(len(tree[0].dimensions))
        symbols = set(FindSymbols('symbolics').visit(tree[0].root))
        symbols -= set(FindSymbols('defines').visit(tree[0].root))
        to_dat |= symbols

    # FIX: the original asserted a bare generator expression, which is
    # always truthy; ``all`` performs the intended check. Verified before
    # ``dims[0]`` is relied upon below.
    assert all(d == dims[0] for d in dims), \
        "The OPS backend currently assumes that all kernels " \
        "have the same number of dimensions"

    # Create the OPS block for this problem
    ops_block = OpsBlock('block')
    ops_block_init = Expression(
        ClusterizedEq(
            Eq(ops_block,
               namespace['ops_decl_block'](dims[0], Literal('"block"')))))

    # To ensure deterministic code generation we order the datasets to
    # be generated (since a set is an unordered collection)
    to_dat = filter_sorted(to_dat)

    name_to_ops_dat = {}
    pre_time_loop = []
    after_time_loop = []
    for f in to_dat:
        if f.is_Constant:
            continue
        pre_time_loop.extend(
            list(create_ops_dat(f, name_to_ops_dat, ops_block)))
        # Copy data from device to host
        after_time_loop.extend(
            create_ops_fetch(f, name_to_ops_dat,
                             f.grid.time_dim.extreme_max))

    # Generate ops kernels for each offloadable iteration tree
    mapper = {}
    ffuncs = []
    for n, (_, tree) in enumerate(affine_trees):
        pre_loop, ops_kernel, ops_par_loop_call = opsit(
            tree, n, name_to_ops_dat, ops_block, dims[0])
        pre_time_loop.extend(pre_loop)
        ffuncs.append(ops_kernel)
        mapper[tree[0].root] = ops_par_loop_call
        # Drop the remaining trees; only the root is replaced by the call.
        mapper.update({i.root: mapper.get(i.root) for i in tree})

    iet = Transformer(mapper).visit(iet)

    iet = iet._rebuild(body=flatten([
        ops_init, ops_block_init, pre_time_loop, ops_partition, iet.body,
        after_time_loop, ops_exit
    ]))

    return iet, {
        'includes': ['stdio.h', 'ops_seq.h'],
        'ffuncs': ffuncs,
        'headers': [namespace['ops_define_dimension'](dims[0])]
    }
def create_ops_dat(f, name_to_ops_dat, block):
    """Declare the OPS dataset(s) backing function `f` within `block`.

    Builds the stack arrays describing the dataset geometry (``dim``,
    ``base``, ``d_p``, ``d_m``) and the ``ops_decl_dat`` call(s); the
    resulting ops_dat symbol (or indexed array slot, for TimeFunctions) is
    recorded in `name_to_ops_dat`. Returns the list of initializer
    expressions.
    """
    # The time dimension does not contribute to the dataset geometry.
    ndim = f.ndim - (1 if f.is_TimeFunction else 0)

    # Geometry descriptor arrays, one entry per (non-time) dimension.
    dim = Array(name=namespace['ops_dat_dim'](f.name),
                dimensions=(DefaultDimension(name='dim', default_value=ndim),),
                dtype=np.int32,
                scope='stack')
    base = Array(name=namespace['ops_dat_base'](f.name),
                 dimensions=(DefaultDimension(name='base', default_value=ndim),),
                 dtype=np.int32,
                 scope='stack')
    d_p = Array(name=namespace['ops_dat_d_p'](f.name),
                dimensions=(DefaultDimension(name='d_p', default_value=ndim),),
                dtype=np.int32,
                scope='stack')
    d_m = Array(name=namespace['ops_dat_d_m'](f.name),
                dimensions=(DefaultDimension(name='d_m', default_value=ndim),),
                dtype=np.int32,
                scope='stack')

    res = []
    # Datasets start at offset zero in every dimension.
    base_val = [Zero() for i in range(ndim)]

    # If f is a TimeFunction we need to create a ops_dat for each time stepping
    # variable (eg: t1, t2)
    if f.is_TimeFunction:
        time_pos = f._time_position
        time_index = f.indices[time_pos]
        time_dims = f.shape[time_pos]

        # Shape/padding/halo with the time dimension stripped out.
        dim_shape = sympify(f.shape[:time_pos] + f.shape[time_pos + 1:])
        padding = f.padding[:time_pos] + f.padding[time_pos + 1:]
        halo = f.halo[:time_pos] + f.halo[time_pos + 1:]
        # Positive/negative halo+padding depths per dimension (d_m negated).
        d_p_val = tuple(sympify([p[0] + h[0] for p, h in zip(padding, halo)]))
        d_m_val = tuple(
            sympify([-(p[1] + h[1]) for p, h in zip(padding, halo)]))

        ops_dat_array = Array(name=namespace['ops_dat_name'](f.name),
                              dimensions=(DefaultDimension(name='dat',
                                                           default_value=time_dims),),
                              dtype='ops_dat',
                              scope='stack')

        # One ops_decl_dat per time buffer; each buffer name maps to a
        # symbolically-indexed slot of the ops_dat array.
        dat_decls = []
        for i in range(time_dims):
            name = '%s%s%s' % (f.name, time_index, i)
            name_to_ops_dat[name] = ops_dat_array.indexify(
                [Symbol('%s%s' % (time_index, i))])
            dat_decls.append(namespace['ops_decl_dat'](block, 1,
                                                       Symbol(dim.name),
                                                       Symbol(base.name),
                                                       Symbol(d_m.name),
                                                       Symbol(d_p.name),
                                                       Byref(f.indexify([i])),
                                                       Literal('"%s"' % f._C_typedata),
                                                       Literal('"%s"' % name)))

        ops_decl_dat = Expression(
            ClusterizedEq(Eq(ops_dat_array, ListInitializer(dat_decls))))
    else:
        ops_dat = OpsDat("%s_dat" % f.name)
        name_to_ops_dat[f.name] = ops_dat

        # Positive/negative halo+padding depths per dimension (d_m negated).
        d_p_val = tuple(
            sympify([p[0] + h[0] for p, h in zip(f.padding, f.halo)]))
        d_m_val = tuple(
            sympify([-(p[1] + h[1]) for p, h in zip(f.padding, f.halo)]))
        dim_shape = sympify(f.shape)

        ops_decl_dat = Expression(
            ClusterizedEq(
                Eq(ops_dat,
                   namespace['ops_decl_dat'](block, 1,
                                             Symbol(dim.name),
                                             Symbol(base.name),
                                             Symbol(d_m.name),
                                             Symbol(d_p.name),
                                             Byref(f.indexify([0])),
                                             Literal('"%s"' % f._C_typedata),
                                             Literal('"%s"' % f.name)))))

    # Geometry-array initializers precede the ops_decl_dat call(s).
    res.append(Expression(ClusterizedEq(Eq(dim, ListInitializer(dim_shape)))))
    res.append(Expression(ClusterizedEq(Eq(base, ListInitializer(base_val)))))
    res.append(Expression(ClusterizedEq(Eq(d_p, ListInitializer(d_p_val)))))
    res.append(Expression(ClusterizedEq(Eq(d_m, ListInitializer(d_m_val)))))
    res.append(ops_decl_dat)

    return res