def test_symbol_renaming():
    """Hoisted assignments to one symbol with different values must be renamed.

    Both loops assign to the symbol ``c``, but with different right-hand
    sides. Once ``move_constants_before_loop`` has pulled both definitions in
    front of the loops, they share one enclosing block and therefore need
    distinct names.
    """
    src, dst = ps.fields("f, g : double[2D]")
    sym_a, sym_b, sym_c = (TypedSymbol(name, np.float64) for name in ('a', 'b', 'c'))

    def make_loop(rhs):
        # Loop over coordinate 0 from 0 to 10: define c, then use it in a field update.
        body = Block([SympyAssignment(sym_c, rhs),
                      SympyAssignment(dst[0, 0], src[0, 0] + sym_c)])
        return LoopOverCoordinate(body, 0, 0, 10)

    first_loop = make_loop(sym_a + sym_b)
    second_loop = make_loop(sym_a**2 + sym_b**2)
    block = Block([first_loop, second_loop])

    move_constants_before_loop(block)

    found_loops = block.atoms(LoopOverCoordinate)
    assert len(found_loops) == 2
    for current in found_loops:
        # Only the field update is left inside each loop.
        assert len(current.body.args) == 1
        siblings = current.parent.args
        assert len(siblings) == 4  # 2 loops + 2 subexpressions
        assert siblings[0].lhs.name != siblings[1].lhs.name
def read(self, field, stencil):
    """Return one field access per stencil direction, wrapped at periodic borders.

    For every stencil entry the inverse direction is used as access offset.
    Along coordinates flagged in ``self._periodicity`` a non-zero offset
    component is replaced by a ``Piecewise`` expression in the loop counter
    that jumps to the opposite side of the domain at the border.

    Args:
        field: field to read from; ``spatial_shape`` is used for the wrap-around
        stencil: sequence of directions with components in {-1, 0, 1}

    Returns:
        list of field accesses, entry ``i`` accessing index ``i`` of the field

    Raises:
        NotImplementedError: if a component along a periodic coordinate is
            not -1, 0 or 1
    """
    accesses = []
    for index, direction in enumerate(stencil):
        pull_dir = inverse_direction(direction)
        offsets = []
        for axis, component in enumerate(pull_dir):
            if self._periodicity[axis]:
                first_cell = self._ghostLayers
                last_cell = field.spatial_shape[axis] - 1 - self._ghostLayers
                span = last_cell - first_cell
                counter = LoopOverCoordinate.get_loop_counter_symbol(axis)

                if component == 0:
                    offsets.append(0)
                elif component == 1:
                    # Step forward, except in the last cell where we jump back to the first one.
                    offsets.append(sp.Piecewise((component, counter < last_cell),
                                                (-span, True)))
                elif component == -1:
                    # Step backward, except in the first cell where we jump to the last one.
                    offsets.append(sp.Piecewise((component, counter > first_cell),
                                                (span, True)))
                else:
                    raise NotImplementedError(
                        "This accessor supports only nearest neighbor stencils")
            else:
                # Non-periodic coordinate: plain offset.
                offsets.append(component)
        accesses.append(field[tuple(offsets)](index))
    return accesses
def border_conditions(direction, field, ghost_layers=1):
    """Build the symbolic condition that is true on one border of the domain.

    Args:
        direction: offset tuple with exactly one non-zero entry of magnitude 1;
            its sign selects the lower (negative) or upper (positive) border
        field: field whose ``shape`` defines the domain extent
        ghost_layers: number of ghost layers to skip at every border

    Returns:
        the condition with all numbers typed like the loop counter. With
        ``ghost_layers == 0`` it is just the equality for the border
        coordinate; otherwise the remaining loop counters are additionally
        restricted to ``[ghost_layers, shape - ghost_layers)``.
    """
    magnitudes = tuple(-c if c < 0 else c for c in direction)
    assert sum(magnitudes) == 1
    axis = magnitudes.index(1)

    counters = [LoopOverCoordinate.get_loop_counter_symbol(d) for d in range(len(direction))]
    normal_counter = counters[axis]

    if direction[axis] < 0:
        border_index = ghost_layers
    else:
        border_index = field.shape[axis] - ghost_layers - 1
    condition = sp.Eq(normal_counter, border_index)

    if ghost_layers != 0:
        lower_bounds = [sp.Ge(ctr, ghost_layers) for ctr in counters if ctr != normal_counter]
        upper_bounds = [sp.Lt(ctr, field.shape[d] - ghost_layers)
                        for d, ctr in enumerate(counters) if ctr != normal_counter]
        condition = sp.And(condition, *lower_bounds, *upper_bounds)

    return type_all_numbers(condition, normal_counter.dtype)
def __init__(self, dim, time_step=TypedSymbol("time_step", np.uint32), offsets=None, keys=None):
    """Set up the argument list and result symbols of a random number node.

    Args:
        dim: number of spatial dimensions of the kernel
        time_step: symbol (or value) used as first counter entry
        offsets: per-dimension offsets added to the loop counters;
            defaults to ``dim`` zeros
        keys: key values; defaults to ``self._num_keys`` zeros

    Raises:
        ValueError: if ``keys`` does not have ``self._num_keys`` entries or
            ``offsets`` does not have ``dim`` entries
    """
    if keys is None:
        keys = (0, ) * self._num_keys
    if offsets is None:
        offsets = (0, ) * dim
    if len(keys) != self._num_keys:
        raise ValueError(f"Provided {len(keys)} keys but need {self._num_keys}")
    if len(offsets) != dim:
        raise ValueError(f"Provided {len(offsets)} offsets but need {dim}")

    coordinates = [LoopOverCoordinate.get_loop_counter_symbol(i) + offsets[i] for i in range(dim)]
    # Always pad to three coordinate entries so the argument list has the same
    # length for every dim. Appending a single zero (the previous behavior)
    # left 1D kernels one argument short; for dim >= 2 nothing changes.
    # NOTE(review): presumably the generated RNG call expects four counter
    # words (time step + 3 coordinates) - confirm against the *_rand.h header.
    coordinates.extend([0] * (3 - dim))

    self._args = sp.sympify([time_step, *coordinates, *keys])
    # The running class-level id keeps result names unique across nodes.
    self.result_symbols = tuple(TypedSymbol(f'random_{self.id}_{i}', self._data_type)
                                for i in range(self._num_vars))
    symbols_read = set.union(*[s.atoms(sp.Symbol) for s in self.args])
    super().__init__("", symbols_read=symbols_read, symbols_defined=self.result_symbols)

    # Header name is derived from the part of self._name before the first underscore.
    self.headers = [f'"{self._name.split("_")[0]}_rand.h"']

    RNGBase.id += 1
def test_loop_over_coordinate():
    """Check body exchange and bound replacement of ``LoopOverCoordinate``."""
    assignments = [Assignment(dst[0, 0](0), s[0]),
                   Assignment(x, dst[0, 0](2))]
    original_body = Block(assignments)
    loop = LoopOverCoordinate(original_body, coordinate_to_loop_over=0, start=0, stop=10, step=1)
    assert loop.body == original_body

    # Exchanging the body must keep the loop bounds untouched.
    reduced_body = Block([assignments[0]])
    loop = loop.new_loop_with_different_body(reduced_body)
    assert loop.body == reduced_body
    assert loop.start == 0
    assert loop.stop == 10
    assert loop.step == 1

    # Replace start, stop and step one after the other (order matters, since
    # each replacement looks up the current value of the attribute).
    for attribute, new_value in (('start', 2), ('stop', 20), ('step', 2)):
        loop.replace(getattr(loop, attribute), new_value)

    assert loop.start == 2
    assert loop.stop == 20
    assert loop.step == 2
def undefined_symbols(self):
    """Return the symbols this node reads but does not define itself.

    These are the time step, offsets and keys as far as they are sympy
    symbols, plus the loop counter symbols of all ``self._dim`` coordinates.
    """
    candidates = [self._time_step]
    candidates.extend(self._offsets)
    candidates.extend(self.keys)

    symbols = set()
    for candidate in candidates:
        # Plain numbers need no definition, only symbolic entries count.
        if isinstance(candidate, sp.Symbol):
            symbols.add(candidate)

    for axis in range(self._dim):
        symbols.add(LoopOverCoordinate.get_loop_counter_symbol(axis))
    return symbols
def test_staggered_iteration_manual():
    """Loop cutting must remove all conditionals without changing the result.

    A staggered update guarded by conditionals is run once as generated, then
    its loops are cut, the conditionals simplified away, and the optimized
    kernel has to produce the same array.
    """
    dim = 2
    grid = [5] * dim
    f_arr = np.arange(5**dim).reshape(grid)
    s_arr = np.ones(grid + [dim]) * 1234
    s_arr_ref = s_arr.copy()

    f_field = Field.create_from_numpy_array('f', f_arr)
    s_field = Field.create_from_numpy_array('s', s_arr, index_dimensions=1)

    counters = [LoopOverCoordinate.get_loop_counter_symbol(axis) for axis in range(dim)]
    conditions = [counters[axis] < f_field.shape[axis] - 1 for axis in range(dim)]

    def plane_sum(axis, offset):
        return sum(f_field[o] for o in offsets_in_plane(axis, offset, dim))

    eqs = []
    for axis in range(dim):
        update = SympyAssignment(s_field(axis), plane_sum(axis, 0) - plane_sum(axis, -1))
        # The update for one axis is guarded by the conditions of all other axes.
        guard = sp.And(*[conditions[other] for other in range(dim) if other != axis])
        eqs.append(Conditional(guard, update))

    kernel_ast = create_kernel(eqs, ghost_layers=[(1, 0), (1, 0), (1, 0)])

    # Reference result from the unoptimized kernel.
    reference_kernel = make_python_function(kernel_ast)
    reference_kernel(f=f_arr, s=s_arr_ref)

    def pick_loop(flag_name):
        return [node for node in kernel_ast.atoms(ast.LoopOverCoordinate)
                if getattr(node, flag_name)][0]

    cut_loop(pick_loop('is_innermost_loop'), [4])
    cut_loop(pick_loop('is_outermost_loop'), [4])

    simplify_conditionals(kernel_ast.body, loop_counter_simplification=True)
    cleanup_blocks(kernel_ast.body)
    move_constants_before_loop(kernel_ast.body)
    cleanup_blocks(kernel_ast.body)

    assert not kernel_ast.atoms(Conditional), "Loop cutting optimization did not work"

    optimized_kernel = make_python_function(kernel_ast)
    optimized_kernel(f=f_arr, s=s_arr)
    np.testing.assert_almost_equal(s_arr_ref, s_arr)
def _get_rng_code(template, dialect, vector_instruction_set, time_step, offsets, keys, dim, result_symbols):
    """Fill a code template with the argument list of a random number call.

    The argument list consists of the time step, the loop counters of the
    ``dim`` coordinates shifted by ``offsets``, zeros padding the coordinates
    up to three entries, and finally the keys.

    Args:
        template: format string with ``{parameters}`` and ``{result_symbols}``
        dialect: target dialect; only 'cuda' and non-vectorized 'c' are supported
        vector_instruction_set: must be ``None`` for the 'c' dialect
        time_step: first entry of the argument list
        offsets: per-coordinate offsets, indexed up to ``dim - 1``
        keys: iterable of key values appended at the end
        dim: number of spatial coordinates
        result_symbols: passed through to the template unchanged

    Returns:
        the formatted template

    Raises:
        NotImplementedError: for every other dialect / vectorization combination
    """
    parameters = [time_step]
    for axis in range(dim):
        parameters.append(LoopOverCoordinate.get_loop_counter_symbol(axis) + offsets[axis])
    parameters.extend([0] * (3 - dim))
    parameters.extend(list(keys))

    if dialect != 'cuda' and not (dialect == 'c' and vector_instruction_set is None):
        raise NotImplementedError("Not yet implemented for this backend")

    printed_parameters = ', '.join(map(str, parameters))
    return template.format(parameters=printed_parameters, result_symbols=result_symbols)
def test_staggered_combined():
    """Staggered discretization of a flux that contains loop counters.

    The expression mixes a derivative along the staggered direction with a
    derivative across it, both multiplied by loop counter symbols, and is
    compared against a hand-written reference.
    """
    from pystencils.fd import diff

    f = ps.fields("f : double[2D]")
    x, y = (LoopOverCoordinate.get_loop_counter_symbol(axis) for axis in range(2))
    dx = sp.symbols("dx")
    half = sp.Rational(1, 2)

    expr = diff(x * diff(f, 0) + y * diff(f, 1), 0)

    # Flux through the right and left cell face, counter shifted by half a cell.
    right_normal = (x + half) * (f[1, 0] - f[0, 0])
    right_tangential = y * (f[1, 1] - f[1, -1] + f[0, 1] - f[0, -1]) / 4
    right = right_normal + right_tangential

    left_normal = (x - half) * (f[0, 0] - f[-1, 0])
    left_tangential = y * (f[-1, 1] - f[-1, -1] + f[0, 1] - f[0, -1]) / 4
    left = left_normal + left_tangential

    reference = (right - left) / (dx**2)

    to_test = ps.fd.discretize_spatial_staggered(expr, dx)

    assert sp.expand(reference - to_test) == 0
def test_staggered_iteration():
    """``create_staggered_kernel`` must match a hand-written conditional kernel.

    The comparison is done for fields with symbolic shape as well as for
    fields with fixed shape, and the built-in kernel must not contain any
    conditionals.
    """
    dim = 2
    grid = [5] * dim
    f_arr = np.arange(5**dim).reshape(grid).astype(np.float64)
    s_arr = np.ones(grid + [dim]) * 1234
    s_arr_ref = s_arr.copy()

    fields_fixed = (Field.create_from_numpy_array('f', f_arr),
                    Field.create_from_numpy_array('s', s_arr, index_dimensions=1))
    fields_var = (Field.create_generic('f', 2),
                  Field.create_generic('s', 2, index_dimensions=1))

    for f_field, s_field in (fields_var, fields_fixed):

        def staggered_difference(axis):
            return (sum(f_field[o] for o in offsets_in_plane(axis, 0, dim))
                    - sum(f_field[o] for o in offsets_in_plane(axis, -1, dim)))

        # --- Manual
        counters = [LoopOverCoordinate.get_loop_counter_symbol(axis) for axis in range(dim)]
        conditions = [counters[axis] < f_field.shape[axis] - 1 for axis in range(dim)]
        eqs = []
        for axis in range(dim):
            update = SympyAssignment(s_field(axis), staggered_difference(axis))
            guard = sp.And(*[conditions[other] for other in range(dim) if other != axis])
            eqs.append(Conditional(guard, update))
        func = create_kernel(eqs, ghost_layers=[(1, 0), (1, 0), (1, 0)]).compile()

        # --- Built-in optimized
        expressions = [staggered_difference(axis) for axis in range(dim)]
        func_optimized = create_staggered_kernel(s_field, expressions).compile()
        assert not func_optimized.ast.atoms(Conditional), "Loop cutting optimization did not work"

        func(f=f_arr, s=s_arr_ref)
        func_optimized(f=f_arr, s=s_arr)
        np.testing.assert_almost_equal(s_arr_ref, s_arr)
def staggered_visitor(e, coordinate, sign):
    """Recursively evaluate an expression at a staggered position.

    Args:
        e: expression to transform
        coordinate: coordinate along which the staggered position is shifted
        sign: -1 or 1, side of the cell on which the staggered position lies

    Returns:
        the transformed expression: derivatives of field accesses become
        differences, field accesses become two-point averages, the loop
        counter of ``coordinate`` is shifted by ``sign / 2``, and all other
        expressions are rebuilt from their transformed arguments

    Raises:
        ValueError: if a derivative has not exactly one index or its argument
            is not a field access
    """
    if isinstance(e, Diff):
        arg, *indices = diff_args(e)
        if len(indices) != 1:
            raise ValueError("Function supports only up to second derivatives")
        if not isinstance(arg, Field.Access):
            raise ValueError("Argument of inner derivative has to be field access")

        if indices[0] != coordinate:
            # Derivative across the staggered direction: average the stencil
            # applied at the neighbor and at the cell itself.
            at_neighbor = stencil(indices, dx, arg.neighbor(coordinate, sign))
            at_cell = stencil(indices, dx, arg)
            return (at_neighbor + at_cell) / 2

        # Derivative along the staggered direction: one-sided difference.
        assert sign in (-1, 1)
        return (arg.neighbor(coordinate, sign) - arg) / dx * sign

    if isinstance(e, Field.Access):
        return (e.neighbor(coordinate, sign) + e) / 2

    if isinstance(e, sp.Symbol):
        if LoopOverCoordinate.is_loop_counter_symbol(e) == coordinate:
            return e + sign / 2
        return e

    transformed = [staggered_visitor(child, coordinate, sign) for child in e.args]
    if transformed:
        return e.func(*transformed)
    return e
def _print_SympyAssignment(self, node):
    """Print an assignment node as C code.

    Three cases are distinguished:

    * declarations are printed as ``<type> <lhs> = <rhs>;``
    * assignments whose left-hand side has a ``VectorType`` and is a
      ``cast_func`` are printed as a store instruction looked up in
      ``self._vector_instruction_set``; for non-temporal stores additional
      cache line zeroing / flushing code may be emitted around it
    * every other assignment is printed as ``<lhs> = <rhs>;``

    Args:
        node: assignment node; ``is_declaration``, ``use_auto``, ``is_const``,
            ``lhs`` and ``rhs`` are read from it

    Returns:
        the generated code as string
    """
    if node.is_declaration:
        if node.use_auto:
            data_type = 'auto '
        else:
            if node.is_const:
                prefix = 'const '
            else:
                prefix = ''
            # Any ' const' inside the printed type is dropped; constness is expressed by the prefix only.
            data_type = prefix + self._print(node.lhs.dtype).replace(' const', '') + " "

        return "%s%s = %s;" % (data_type,
                               self.sympy_printer.doprint(node.lhs),
                               self.sympy_printer.doprint(node.rhs))
    else:
        lhs_type = get_type_of_expression(node.lhs)
        printed_mask = ""
        if type(lhs_type) is VectorType and isinstance(node.lhs, cast_func):
            # The vector store is described by the six arguments of the cast on the left-hand side.
            arg, data_type, aligned, nontemporal, mask, stride = node.lhs.args
            # Pick the store instruction: unaligned by default, aligned or streaming if possible.
            instr = 'storeU'
            if aligned:
                instr = 'stream' if nontemporal and 'stream' in self._vector_instruction_set else 'storeA'
            if mask != True:  # NOQA
                instr = 'maskStoreA' if aligned else 'maskStoreU'
                if instr not in self._vector_instruction_set:
                    # No masked store available: compose one from load, blendv and store
                    # and cache it in the instruction set under the masked-store name.
                    self._vector_instruction_set[instr] = self._vector_instruction_set['store' + instr[-1]].format(
                        '{0}', self._vector_instruction_set['blendv'].format(
                            self._vector_instruction_set['load' + instr[-1]].format('{0}', **self._kwargs),
                            '{1}', '{2}', **self._kwargs), **self._kwargs)
                printed_mask = self.sympy_printer.doprint(mask)
                # For the 128/256 bit double/float register types listed below
                # the printed mask is wrapped in a cast to the integer register type.
                if data_type.base_type.base_name == 'double':
                    if self._vector_instruction_set['double'] == '__m256d':
                        printed_mask = f"_mm256_castpd_si256({printed_mask})"
                    elif self._vector_instruction_set['double'] == '__m128d':
                        printed_mask = f"_mm_castpd_si128({printed_mask})"
                elif data_type.base_type.base_name == 'float':
                    if self._vector_instruction_set['float'] == '__m256':
                        printed_mask = f"_mm256_castps_si256({printed_mask})"
                    elif self._vector_instruction_set['float'] == '__m128':
                        printed_mask = f"_mm_castps_si128({printed_mask})"

            # A scalar right-hand side is cast to a vector type before it is stored.
            rhs_type = get_type_of_expression(node.rhs)
            if type(rhs_type) is not VectorType:
                rhs = cast_func(node.rhs, VectorType(rhs_type))
            else:
                rhs = node.rhs

            ptr = "&" + self.sympy_printer.doprint(node.lhs.args[0])

            if stride != 1:
                # Strided store: uses its own instruction, none of the cache line handling below applies.
                instr = 'maskStoreS' if mask != True else 'storeS'  # NOQA
                return self._vector_instruction_set[instr].format(ptr, self.sympy_printer.doprint(rhs),
                                                                  stride, printed_mask, **self._kwargs) + ';'

            pre_code = ''
            if nontemporal and 'cachelineZero' in self._vector_instruction_set:
                # Emit a guarded cache line zeroing instruction in front of the store. The guard
                # requires the pointer to have no bits of the cache line mask set and the
                # linearized loop counter offset plus one cache line to be below the field size.
                first_cond = f"((uintptr_t) {ptr} & {CachelineSize.mask_symbol}) == 0"
                offset = sp.Add(*[sp.Symbol(LoopOverCoordinate.get_loop_counter_name(i))
                                  * node.lhs.args[0].field.spatial_strides[i] for i in
                                  range(len(node.lhs.args[0].field.spatial_strides))])
                # Always true at this point, since stride != 1 has returned above.
                if stride == 1:
                    offset = offset.subs({node.lhs.args[0].field.spatial_strides[0]: 1})
                size = sp.Mul(*node.lhs.args[0].field.spatial_shape)
                element_size = 8 if data_type.base_type.base_name == 'double' else 4
                size_cond = f"({offset} + {CachelineSize.symbol/element_size}) < {size}"
                pre_code = f"if ({first_cond} && {size_cond}) " + "{\n\t" + \
                    self._vector_instruction_set['cachelineZero'].format(ptr, **self._kwargs) + ';\n}\n'

            code = self._vector_instruction_set[instr].format(ptr, self.sympy_printer.doprint(rhs),
                                                              printed_mask, **self._kwargs) + ';'
            # Condition used to decide in the generated code whether a cache line flush is emitted.
            flushcond = f"((uintptr_t) {ptr} & {CachelineSize.mask_symbol}) == {CachelineSize.last_symbol}"
            if nontemporal and 'flushCacheline' in self._vector_instruction_set:
                # Separate flush instruction, executed after the store if the condition holds.
                code2 = self._vector_instruction_set['flushCacheline'].format(
                    ptr, self.sympy_printer.doprint(rhs), **self._kwargs) + ';'
                code = f"{code}\nif ({flushcond}) {{\n\t{code2}\n}}"
            elif nontemporal and 'storeAAndFlushCacheline' in self._vector_instruction_set:
                # Combined store-and-flush instruction: the right-hand side is evaluated once into
                # a temporary whose name is derived from a hash of the printed expression, then
                # either the flushing or the regular store is executed.
                tmpvar = '_tmp_' + hashlib.sha1(self.sympy_printer.doprint(rhs).encode('ascii')).hexdigest()[:8]
                code = 'const ' + self._print(node.lhs.dtype).replace(' const', '') + ' ' + tmpvar + ' = ' \
                    + self.sympy_printer.doprint(rhs) + ';'
                code1 = self._vector_instruction_set[instr].format(ptr, tmpvar, printed_mask, **self._kwargs) + ';'
                code2 = self._vector_instruction_set['storeAAndFlushCacheline'].format(ptr, tmpvar, printed_mask,
                                                                                       **self._kwargs) + ';'
                code += f"\nif ({flushcond}) {{\n\t{code2}\n}} else {{\n\t{code1}\n}}"
            return pre_code + code
        else:
            # Scalar assignment to an already declared symbol or memory location.
            return f"{self.sympy_printer.doprint(node.lhs)} = {self.sympy_printer.doprint(node.rhs)};"
def create_staggered_kernel(staggered_field, expressions, subexpressions=(), target='cpu',
                            gpu_exclusive_conditions=False, **kwargs):
    """Kernel that updates a staggered field.

    .. image:: /img/staggered_grid.svg

    Args:
        staggered_field: field where the first index coordinate defines the location of the staggered value
                can have 1 or 2 index coordinates, in case of two index coordinates at every staggered location
                a vector is stored, expressions parameter has to be a sequence of sequences then
                where e.g. ``f[0,0](0)`` is interpreted as value at the left cell boundary, ``f[1,0](0)`` the right cell
                boundary and ``f[0,0](1)`` the southern cell boundary etc.
        expressions: sequence of expressions of length dim, defining how the west, southern, (bottom) cell boundary
                     should be updated.
        subexpressions: optional sequence of Assignments, that define subexpressions used in the main expressions
        target: 'cpu' or 'gpu'
        gpu_exclusive_conditions: if/else construct to have only one code block for each of 2**dim code paths
        kwargs: passed directly to create_kernel, iteration slice and ghost_layers parameters are not allowed.
                For the cpu target the options ``cpu_blocking``, ``cpu_vectorize_info`` and ``cpu_openmp``
                are, when truthy, taken out of kwargs and applied here after the staggered-specific
                optimizations instead of inside create_kernel.

    Returns:
        AST, see `create_kernel`
    """
    assert 'iteration_slice' not in kwargs and 'ghost_layers' not in kwargs
    assert staggered_field.index_dimensions in (1, 2), 'Staggered field must have one or two index dimensions'
    dim = staggered_field.spatial_dimensions

    # One condition per coordinate: loop counter is not in the last cell of that coordinate.
    counters = [LoopOverCoordinate.get_loop_counter_symbol(i) for i in range(dim)]
    conditions = [counters[i] < staggered_field.shape[i] - 1 for i in range(dim)]
    assert len(expressions) == dim
    if staggered_field.index_dimensions == 2:
        assert all(len(sublist) == len(expressions[0]) for sublist in expressions), \
            "If staggered field has two index dimensions expressions has to be a sequence of sequences of all the " \
            "same length."

    final_assignments = []
    # Most recently created Conditional; used by add() to chain else-blocks.
    last_conditional = None

    def add(condition, dimensions, as_else_block=False):
        """Append a Conditional that updates the staggered values of the given dimensions.

        With ``as_else_block`` the new Conditional is attached as false-block of the previously
        created one (if there is one) instead of being appended to ``final_assignments``.
        """
        nonlocal last_conditional
        if staggered_field.index_dimensions == 1:
            assignments = [Assignment(staggered_field(d), expressions[d]) for d in dimensions]
            a_coll = AssignmentCollection(assignments, list(subexpressions))
            # Restrict the collection to the staggered values written here.
            a_coll = a_coll.new_filtered([staggered_field(d) for d in dimensions])
        elif staggered_field.index_dimensions == 2:
            assert staggered_field.has_fixed_index_shape
            assignments = [Assignment(staggered_field(d, i), expr)
                           for d in dimensions
                           for i, expr in enumerate(expressions[d])]
            a_coll = AssignmentCollection(assignments, list(subexpressions))
            a_coll = a_coll.new_filtered([staggered_field(d, i) for i in range(staggered_field.index_shape[1])
                                          for d in dimensions])
        sp_assignments = [SympyAssignment(a.lhs, a.rhs) for a in a_coll.all_assignments]
        if as_else_block and last_conditional:
            new_cond = Conditional(condition, Block(sp_assignments))
            last_conditional.false_block = Block([new_cond])
            last_conditional = new_cond
        else:
            last_conditional = Conditional(condition, Block(sp_assignments))
            final_assignments.append(last_conditional)

    if target == 'cpu' or not gpu_exclusive_conditions:
        # One independent conditional per dimension, guarded by the conditions of all other dimensions.
        for d in range(dim):
            cond = sp.And(*[conditions[i] for i in range(dim) if d != i])
            add(cond, [d])
    elif target == 'gpu':
        full_conditions = [sp.And(*[conditions[i] for i in range(dim) if d != i]) for d in range(dim)]
        # Enumerate every combination of fulfilled / not fulfilled conditions and chain them
        # as if/else, so that exactly one block is executed; the combination that includes
        # no dimension at all is skipped.
        for include in itertools.product(*[[1, 0]] * dim):
            case_conditions = sp.And(*[c if value else sp.Not(c) for c, value in zip(full_conditions, include)])
            dimensions_to_include = [i for i in range(dim) if include[i]]
            if dimensions_to_include:
                add(case_conditions, dimensions_to_include, True)

    ghost_layers = [(1, 0)] * dim

    # The following options are only removed from kwargs when they are truthy;
    # a falsy value is forwarded to create_kernel unchanged.
    blocking = kwargs.get('cpu_blocking', None)
    if blocking:
        del kwargs['cpu_blocking']

    cpu_vectorize_info = kwargs.get('cpu_vectorize_info', None)
    if cpu_vectorize_info:
        del kwargs['cpu_vectorize_info']
    openmp = kwargs.get('cpu_openmp', None)
    if openmp:
        del kwargs['cpu_openmp']

    ast = create_kernel(final_assignments, ghost_layers=ghost_layers, target=target, **kwargs)

    if target == 'cpu':
        # Staggered-specific optimizations first, then the options withheld from create_kernel.
        remove_conditionals_in_staggered_kernel(ast)
        move_constants_before_loop(ast)
        omp_collapse = None
        if blocking:
            omp_collapse = loop_blocking(ast, blocking)
        if openmp:
            from pystencils.cpu import add_openmp
            add_openmp(ast, num_threads=openmp, collapse=omp_collapse, assume_single_outer_loop=False)
        if cpu_vectorize_info is True:
            vectorize(ast)
        elif isinstance(cpu_vectorize_info, dict):
            vectorize(ast, **cpu_vectorize_info)
    return ast
def create_indexed_kernel(assignments: AssignmentOrAstNodeList, index_fields, function_name="kernel",
                          type_info=None, coordinate_names=('x', 'y', 'z')) -> KernelFunction:
    """Create a kernel that only visits the cells listed in an index field.

    Similar to :func:`create_kernel`, but instead of iterating over all cells
    of a field, the kernel loops over a one dimensional index field with
    struct data type and updates only the cells whose coordinates are stored
    there. This traversal can e.g. be used for boundary handling.

    The struct has to contain elements named 'x', 'y' and, for 3D fields,
    'z'; the names can be changed with ``coordinate_names``. Further struct
    elements may be read and written in the kernel, for example boundary
    parameters.

    Args:
        assignments: list of assignments
        index_fields: list of index fields, i.e. 1D fields with struct data type
        function_name: see documentation of :func:`create_kernel`
        type_info: see documentation of :func:`create_kernel`
        coordinate_names: name of the coordinate fields in the struct data type

    Returns:
        the kernel AST

    Raises:
        ValueError: if a required coordinate name is in none of the index fields
    """
    fields_read, fields_written, assignments = add_types(assignments, type_info,
                                                         check_independence_condition=False)
    all_fields = fields_read.union(fields_written)

    for idx_field in index_fields:
        idx_field.field_type = FieldType.INDEXED
        assert FieldType.is_indexed(idx_field)
        assert idx_field.spatial_dimensions == 1, "Index fields have to be 1D"

    regular_fields = [fld for fld in all_fields if fld not in index_fields]
    dims_found = {fld.spatial_dimensions for fld in regular_fields}
    assert len(dims_found) == 1, "Non-index fields do not have the same number of spatial coordinates"
    num_spatial_coords = list(dims_found)[0]

    def coordinate_assignment(name):
        # The first index field whose struct has an element `name` provides the coordinate.
        for candidate in index_fields:
            struct_type = candidate.dtype
            assert isinstance(struct_type, StructType), "Index fields have to have a struct data type"
            if not struct_type.has_element(name):
                continue
            rhs = candidate[0](name)
            lhs = TypedSymbol(name, BasicType(struct_type.get_element_type(name)))
            return SympyAssignment(lhs, rhs)
        raise ValueError("Index %s not found in any of the passed index fields" % (name, ))

    coordinate_assignments = [coordinate_assignment(name)
                              for name in coordinate_names[:num_spatial_coords]]
    coordinate_symbols = [assignment.lhs for assignment in coordinate_assignments]
    # Coordinates have to be read from the index field before the actual update.
    loop_content = coordinate_assignments + assignments

    # make 1D loop over index fields
    loop_body = Block([])
    loop_node = LoopOverCoordinate(loop_body, coordinate_to_loop_over=0, start=0,
                                   stop=index_fields[0].shape[0])
    for node in loop_content:
        loop_body.append(node)

    ast_node = KernelFunction(Block([loop_node]), "cpu", "c", make_python_function,
                              ghost_layers=None, function_name=function_name)

    # All regular fields are accessed at the coordinates read from the index field.
    coordinates_per_field = {fld.name: coordinate_symbols for fld in regular_fields}
    read_only_fields = {fld.name for fld in fields_read - fields_written}
    resolve_field_accesses(ast_node, read_only_fields,
                           field_to_fixed_coordinates=coordinates_per_field)
    move_constants_before_loop(ast_node)
    return ast_node
def create_staggered_kernel(assignments, target: Target = Target.CPU, gpu_exclusive_conditions=False, **kwargs):
    """Kernel that updates a staggered field.

    .. image:: /img/staggered_grid.svg

    For a staggered field, the first index coordinate defines the location of the staggered value.
    Further index coordinates can be used to store vectors/tensors at each point.

    Args:
        assignments: a sequence of assignments or an AssignmentCollection.
                     Assignments to staggered field are processed specially, while subexpressions and assignments to
                     regular fields are passed through to `create_kernel`. Multiple different staggered fields can be
                     used, but they all need to use the same stencil (i.e. the same number of staggered points) and
                     shape.
        target: 'CPU' or 'GPU'
        gpu_exclusive_conditions: disable the use of multiple conditionals inside the loop. The outer layers are then
                                  handled in an else branch.
        kwargs: passed directly to create_kernel, iteration_slice and ghost_layers parameters are not allowed

    Returns:
        AST, see `create_kernel`

    Raises:
        ValueError: if the assignments to staggered fields do not all share one stencil and one shape
                    (this includes the case that there is no such assignment at all)
    """
    # These options are determined here; they may only be passed with their neutral value.
    if 'ghost_layers' in kwargs:
        assert kwargs['ghost_layers'] is None
        del kwargs['ghost_layers']
    if 'iteration_slice' in kwargs:
        assert kwargs['iteration_slice'] is None
        del kwargs['iteration_slice']
    if 'omp_single_loop' in kwargs:
        assert kwargs['omp_single_loop'] is False
        del kwargs['omp_single_loop']

    # Split the input: assignments whose lhs is an access to a staggered field are treated
    # specially, everything else (including nodes without lhs) is handled as subexpression.
    if isinstance(assignments, AssignmentCollection):
        subexpressions = assignments.subexpressions + [a for a in assignments.main_assignments
                                                       if not hasattr(a, 'lhs')
                                                       or type(a.lhs) is not Field.Access
                                                       or not FieldType.is_staggered(a.lhs.field)]
        assignments = [a for a in assignments.main_assignments if hasattr(a, 'lhs')
                       and type(a.lhs) is Field.Access
                       and FieldType.is_staggered(a.lhs.field)]
    else:
        subexpressions = [a for a in assignments if not hasattr(a, 'lhs')
                          or type(a.lhs) is not Field.Access
                          or not FieldType.is_staggered(a.lhs.field)]
        assignments = [a for a in assignments if hasattr(a, 'lhs')
                       and type(a.lhs) is Field.Access
                       and FieldType.is_staggered(a.lhs.field)]
    if len(set([tuple(a.lhs.field.staggered_stencil) for a in assignments])) != 1:
        raise ValueError("All assignments need to be made to staggered fields with the same stencil")
    if len(set([a.lhs.field.shape for a in assignments])) != 1:
        raise ValueError("All assignments need to be made to staggered fields with the same shape")

    staggered_field = assignments[0].lhs.field
    stencil = staggered_field.staggered_stencil
    dim = staggered_field.spatial_dimensions
    shape = staggered_field.shape

    counters = [LoopOverCoordinate.get_loop_counter_symbol(i) for i in range(dim)]

    final_assignments = []

    # find out whether any of the ghost layers is not needed
    # common_exclusions ends up holding those elementary directions that are not the inverse
    # of any elementary direction occurring in the stencil
    common_exclusions = set(["E", "W", "N", "S", "T", "B"][:2 * dim])
    for direction in stencil:
        exclusions = set(["E", "W", "N", "S", "T", "B"][:2 * dim])
        for elementary_direction in direction:
            exclusions.remove(inverse_direction_string(elementary_direction))
        common_exclusions.intersection_update(exclusions)
    # ghost_layers[d] is [lower, upper]; a layer is set to 1 on the side a common exclusion points to
    ghost_layers = [[0, 0] for d in range(dim)]
    for direction in common_exclusions:
        direction = direction_string_to_offset(direction)
        for d, s in enumerate(direction):
            if s == 1:
                ghost_layers[d][1] = 1
            elif s == -1:
                ghost_layers[d][0] = 1

    def condition(direction):
        """exclude those staggered points that correspond to fluxes between ghost cells"""
        exclusions = set(["E", "W", "N", "S", "T", "B"][:2 * dim])

        for elementary_direction in direction:
            exclusions.remove(inverse_direction_string(elementary_direction))
        conditions = []
        for e in exclusions:
            # borders that are common to all directions are already covered by the ghost layers
            if e in common_exclusions:
                continue
            offset = direction_string_to_offset(e)
            for i, o in enumerate(offset):
                if o == 1:
                    conditions.append(counters[i] < shape[i] - 1)
                elif o == -1:
                    conditions.append(counters[i] > 0)
        return sp.And(*conditions)

    if gpu_exclusive_conditions:
        # Build one chain of nested if/else blocks: the innermost conditionals are created
        # first and each new Conditional gets the chain built so far as its else-branch.
        outer_assignment = None
        conditions = {direction: condition(direction) for direction in stencil}
        for num_conditions in range(len(stencil)):
            for combination in itertools.combinations(conditions.values(), num_conditions):
                for assignment in assignments:
                    direction = stencil[assignment.lhs.index[0]]
                    if conditions[direction] in combination:
                        assignment = SympyAssignment(assignment.lhs, assignment.rhs)
                        outer_assignment = Conditional(sp.And(*combination), Block([assignment]), outer_assignment)

        # Top of the chain: all conditions hold, so every staggered assignment is executed.
        inner_assignment = []
        for assignment in assignments:
            inner_assignment.append(SympyAssignment(assignment.lhs, assignment.rhs))
        last_conditional = Conditional(sp.And(*[condition(d) for d in stencil]),
                                       Block(inner_assignment), outer_assignment)
        # Nodes without lhs are passed on unchanged and placed first, all other
        # subexpressions are converted to SympyAssignment.
        final_assignments = [s for s in subexpressions if not hasattr(s, 'lhs')] + \
                            [SympyAssignment(s.lhs, s.rhs) for s in subexpressions if hasattr(s, 'lhs')] + \
                            [last_conditional]

        if target == Target.CPU:
            from pystencils.cpu import create_kernel as create_kernel_cpu
            ast = create_kernel_cpu(final_assignments, ghost_layers=ghost_layers, omp_single_loop=False, **kwargs)
        else:
            ast = create_kernel(final_assignments, ghost_layers=ghost_layers, target=target, **kwargs)
        return ast

    # Default path: one Conditional per staggered assignment, each containing
    # all subexpressions followed by the assignment itself.
    for assignment in assignments:
        direction = stencil[assignment.lhs.index[0]]
        sp_assignments = [s for s in subexpressions if not hasattr(s, 'lhs')] + \
                         [SympyAssignment(s.lhs, s.rhs) for s in subexpressions if hasattr(s, 'lhs')] + \
                         [SympyAssignment(assignment.lhs, assignment.rhs)]
        last_conditional = Conditional(condition(direction), Block(sp_assignments))
        final_assignments.append(last_conditional)

    remove_start_conditional = any([gl[0] == 0 for gl in ghost_layers])
    # The staggered-specific optimizations run first; optimizations passed by the caller are appended.
    prepend_optimizations = [lambda ast: remove_conditionals_in_staggered_kernel(ast, remove_start_conditional),
                             move_constants_before_loop]
    if 'cpu_prepend_optimizations' in kwargs:
        prepend_optimizations += kwargs['cpu_prepend_optimizations']
        del kwargs['cpu_prepend_optimizations']
    ast = create_kernel(final_assignments, ghost_layers=ghost_layers, target=target, omp_single_loop=False,
                        cpu_prepend_optimizations=prepend_optimizations, **kwargs)
    return ast
def test_lees_edwards():
    """Run a sheared D2Q9 simulation on a 64x64 periodic domain and check the velocities.

    The test has two parts:

    1. 500 time steps with a growing shear offset; the resulting velocity profile is compared
       with the result of ``get_solution_navier_stokes``.
    2. Two runs of 20 time steps with a localized force, one with shear offset 0 and one with
       offset 10; the velocities of the second run, rolled back by the offset, have to match
       those of the first run.
    """
    domain_size = (64, 64)
    omega = 1.0  # relaxation rate of first component
    shear_velocity = 0.1  # shear velocity
    shear_dir = 0  # direction of shear flow
    shear_dir_normal = 1  # direction normal to shear plane, for interpolation

    stencil = LBStencil(Stencil.D2Q9)

    dh = ps.create_data_handling(domain_size, periodicity=True, default_target=ps.Target.CPU)

    src = dh.add_array('src', values_per_cell=stencil.Q)
    dh.fill('src', 1.0, ghost_layers=True)

    dst = dh.add_array_like('dst', 'src')
    dh.fill('dst', 0.0, ghost_layers=True)

    force = dh.add_array('force', values_per_cell=stencil.D)
    dh.fill('force', 0.0, ghost_layers=True)

    rho = dh.add_array('rho', values_per_cell=1)
    dh.fill('rho', 1.0, ghost_layers=True)
    u = dh.add_array('u', values_per_cell=stencil.D)
    dh.fill('u', 0.0, ghost_layers=True)

    counters = [LoopOverCoordinate.get_loop_counter_symbol(i) for i in range(stencil.D)]
    # Placeholders; they are replaced per stencil direction by True/False further below.
    points_up = sp.Symbol('points_up')
    points_down = sp.Symbol('points_down')

    # Velocity contribution that is non-zero only where counters[1] <= 1 (with points_down) or
    # counters[1] >= src.shape[1] - 2 (with points_up).
    u_p = sp.Piecewise((1, sp.And(ps.data_types.type_all_numbers(counters[1] <= 1, 'int'), points_down)),
                       (-1, sp.And(ps.data_types.type_all_numbers(counters[1] >= src.shape[1] - 2, 'int'),
                                   points_up)),
                       (0, True)) * shear_velocity

    lbm_config = LBMConfig(stencil=stencil, relaxation_rate=omega, compressible=True,
                           velocity_input=u.center_vector + sp.Matrix([u_p, 0]), density_input=rho,
                           force_model=ForceModel.LUO, force=force.center_vector,
                           kernel_type='collide_only')
    lbm_opt = LBMOptimisation(symbolic_field=src)
    collision = create_lb_update_rule(lbm_config=lbm_config, lbm_optimisation=lbm_opt)

    # Inline all subexpressions that depend on the equilibrium moment symbol of the shear
    # direction, so that the placeholders end up in the main assignments.
    to_insert = [s.lhs for s in collision.subexpressions
                 if collision.method.first_order_equilibrium_moment_symbols[shear_dir]
                 in s.free_symbols]
    for s in to_insert:
        collision = collision.new_with_inserted_subexpression(s)
    ma = []
    for a, c in zip(collision.main_assignments, collision.method.stencil):
        # b = (value for points_down, value for points_up) of this stencil direction
        if c[shear_dir_normal] == -1:
            b = (True, False)
        elif c[shear_dir_normal] == 1:
            b = (False, True)
        else:
            b = (False, False)
        a = ps.Assignment(a.lhs, a.rhs.replace(points_down, b[0]))
        a = ps.Assignment(a.lhs, a.rhs.replace(points_up, b[1]))
        ma.append(a)
    collision.main_assignments = ma

    stream = create_stream_pull_with_output_kernel(collision.method, src, dst,
                                                   {'density': rho, 'velocity': u})

    config = ps.CreateKernelConfig(target=dh.default_target)
    stream_kernel = ps.create_kernel(stream, config=config).compile()
    collision_kernel = ps.create_kernel(collision, config=config).compile()

    init = macroscopic_values_setter(collision.method, velocity=(0, 0),
                                     pdfs=src.center_vector, density=rho.center)
    init_kernel = ps.create_kernel(init, ghost_layers=0).compile()

    # One-element list, so the boundary functor sees the updates made below.
    offset = [0.0]

    sync_pdfs = dh.synchronization_function([src.name],
                                            functor=partial(get_le_boundary_functor, shear_offset=offset))

    dh.run_kernel(init_kernel)

    time = 500

    # --- Part 1: sheared flow, compared against get_solution_navier_stokes
    dh.all_to_gpu()
    for i in range(time):
        dh.run_kernel(collision_kernel)

        sync_pdfs()
        dh.run_kernel(stream_kernel)

        dh.swap(src.name, dst.name)
        offset[0] += shear_velocity
    dh.all_to_cpu()

    nu = lattice_viscosity_from_relaxation_rate(omega)
    h = domain_size[0]
    k_max = 100

    analytical_solution = get_solution_navier_stokes(np.linspace(0.5, h - 0.5, h), time, nu, shear_velocity, h, k_max)
    np.testing.assert_array_almost_equal(analytical_solution,
                                         dh.gather_array(u.name)[0, :, 0],
                                         decimal=5)

    # --- Part 2a: reset, apply a localized force, run with a fixed shear offset of 0
    dh.fill(rho.name, 1.0, ghost_layers=True)
    dh.run_kernel(init_kernel)
    dh.fill(u.name, 0.0, ghost_layers=True)
    dh.fill('force', 0.0, ghost_layers=True)
    dh.cpu_arrays[force.name][64 // 3, 1, :] = [1e-2, -1e-1]

    offset[0] = 0

    time = 20

    dh.all_to_gpu()
    for i in range(time):
        dh.run_kernel(collision_kernel)

        sync_pdfs()
        dh.run_kernel(stream_kernel)

        dh.swap(src.name, dst.name)
    dh.all_to_cpu()

    vel_unshifted = np.array(dh.gather_array(u.name)[:, -3:-1, :])

    # --- Part 2b: same setup, but with a fixed shear offset of 10
    dh.fill(rho.name, 1.0, ghost_layers=True)
    dh.run_kernel(init_kernel)
    dh.fill(u.name, 0.0, ghost_layers=True)
    dh.fill('force', 0.0, ghost_layers=True)
    dh.cpu_arrays[force.name][64 // 3, 1, :] = [1e-2, -1e-1]

    offset[0] = 10

    time = 20

    dh.all_to_gpu()
    for i in range(time):
        dh.run_kernel(collision_kernel)

        sync_pdfs()
        dh.run_kernel(stream_kernel)

        dh.swap(src.name, dst.name)
    dh.all_to_cpu()

    vel_shifted = np.array(dh.gather_array(u.name)[:, -3:-1, :])

    # Undo the shift along axis 0 before comparing with the unshifted run.
    vel_rolled = np.roll(vel_shifted, -offset[0], axis=0)

    np.testing.assert_array_almost_equal(vel_unshifted, vel_rolled)
def __init__(self, ast: KernelFunction, machine: Optional[MachineModel] = None,
             assumed_layout='SoA', debug_print=False, filename=None):
    """Create a kerncraft kernel using a pystencils AST

    Args:
        ast: pystencils ast
        machine: kerncraft machine model - specify this if kernel needs to be compiled
        assumed_layout: either 'SoA' or 'AoS' - if fields have symbolic sizes the layout of the index
                coordinates is not known. In this case either a structures of array (SoA) or
                array of structures (AoS) layout is assumed
        debug_print: if true, the collected loop stack, sources, destinations and flop counts are
                printed at the end of the constructor; the flag is also stored as
                ``self._keep_intermediates``
        filename: stored as ``self._filename``

    Raises:
        ValueError: if the AST contains no innermost loop
    """
    kerncraft.kernel.Kernel.__init__(self, machine)

    # Initialize state
    self.asm_block = None
    self._filename = filename

    self.kernel_ast = ast
    self.temporary_dir = TemporaryDirectory()
    self._keep_intermediates = debug_print

    # Loops
    inner_loops = [l for l in filtered_tree_iteration(ast, LoopOverCoordinate, stop_type=SympyAssignment)
                   if l.is_innermost_loop]
    if len(inner_loops) == 0:
        raise ValueError("No loop found in pystencils AST")
    else:
        if len(inner_loops) > 1:
            warnings.warn("pystencils AST contains multiple inner loops. "
                          "Only one can be analyzed - choosing first one")
        inner_loop = inner_loops[0]

    # Walk from the innermost loop up to the root and record every loop on the way;
    # the stack is reversed afterwards, so that it is ordered from outermost to innermost.
    self._loop_stack = []
    cur_node = inner_loop
    while cur_node is not None:
        if isinstance(cur_node, LoopOverCoordinate):
            loop_counter_sym = cur_node.loop_counter_symbol
            loop_info = (loop_counter_sym.name, cur_node.start, cur_node.stop, 1)
            # If the correct step were to be provided, all access within that step length will
            # also need to be passed to kerncraft: cur_node.step)
            self._loop_stack.append(loop_info)
        cur_node = cur_node.parent
    self._loop_stack = list(reversed(self._loop_stack))

    # Data sources & destinations
    self.sources = defaultdict(list)
    self.destinations = defaultdict(list)

    def get_layout_tuple(f):
        """Return the coordinate order of a field, including its index coordinates."""
        if f.has_fixed_shape:
            return get_layout_from_strides(f.strides)
        else:
            # Symbolic shape: extend the spatial layout by one entry per index dimension,
            # placed according to the assumed layout.
            # NOTE(review): list.insert(-1, x) places x before the last element, not at the
            # end - confirm that this is the intended position for the 'AoS' case.
            layout_list = list(f.layout)
            for _ in range(f.index_dimensions):
                layout_list.insert(0 if assumed_layout == 'SoA' else -1, max(layout_list) + 1)
            return layout_list

    reads, writes = search_resolved_field_accesses_in_ast(inner_loop)
    for accesses, target_dict in [(reads, self.sources), (writes, self.destinations)]:
        for fa in accesses:
            # Access coordinates: loop counter plus offset for every offset entry,
            # followed by the index coordinate values; then permuted into layout order.
            coord = [sp.Symbol(LoopOverCoordinate.get_loop_counter_name(i), positive=True, integer=True) + off
                     for i, off in enumerate(fa.offsets)]
            coord += list(fa.idx_coordinate_values)
            layout = get_layout_tuple(fa.field)
            permuted_coord = [sp.sympify(coord[i]) for i in layout]
            target_dict[fa.field.name].append(permuted_coord)

    # Variables (arrays)
    fields_accessed = ast.fields_accessed
    for field in fields_accessed:
        layout = get_layout_tuple(field)
        permuted_shape = list(field.shape[i] for i in layout)
        self.set_variable(field.name, str(field.dtype), tuple(permuted_shape))

    # Scalars may be safely ignored
    # for param in ast.get_parameters():
    #     if not param.is_field_parameter:
    #         # self.set_variable(param.symbol.name, str(param.symbol.dtype), None)
    #         self.sources[param.symbol.name] = [None]

    # data type
    # taken from the first registered variable
    self.datatype = list(self.variables.values())[0][0]

    # flops
    operation_count = count_operations_in_ast(inner_loop)
    self._flops = {
        '+': operation_count['adds'],
        '*': operation_count['muls'],
        '/': operation_count['divs'],
    }
    # operations that do not occur are removed from the dictionary
    for k in [k for k, v in self._flops.items() if v == 0]:
        del self._flops[k]
    self.check()

    if debug_print:
        from pprint import pprint
        print("-----------------------------  Loop Stack --------------------------")
        pprint(self._loop_stack)
        print("-----------------------------  Sources -----------------------------")
        pprint(self.sources)
        print("-----------------------------  Destinations ------------------------")
        pprint(self.destinations)
        print("-----------------------------  FLOPS -------------------------------")
        pprint(self._flops)
def create_cuda_kernel(assignments,
                       function_name="kernel",
                       type_info=None,
                       indexing_creator=BlockIndexing,
                       iteration_slice=None,
                       ghost_layers=None,
                       skip_independence_check=False):
    """Build a ``KernelFunction`` AST (``Target.GPU`` / ``Backend.CUDA``) from assignments.

    Args:
        assignments: non-empty list of assignments; typed via ``add_types``.
        function_name: name forwarded to the ``KernelFunction`` node.
        type_info: type information forwarded to ``add_types``.
        indexing_creator: callable invoked as
            ``indexing_creator(field=..., iteration_slice=...)``. The returned object
            must provide ``coordinates``, ``index_variables`` and ``guard(block, shape)``.
        iteration_slice: explicit iteration region. If ``None`` it is derived from
            ``ghost_layers``.
        ghost_layers: only used to derive the iteration region when ``iteration_slice``
            is ``None``; it is forwarded to ``KernelFunction`` in any case. Accepts
            ``None`` (use the maximum ``required_ghost_layers`` of all non-absolute field
            accesses, or 0 if there are none), an ``int`` (same width on both sides of
            every coordinate) or a sequence of ``(lower, upper)`` pairs, one per coordinate.
        skip_independence_check: if true, ``add_types`` is called with its check
            flag disabled.

    Returns:
        The ``KernelFunction`` node. The indexing object is attached as ``ast.indexing``.

    Raises:
        AssertionError: if ``assignments`` is empty.
    """
    assert assignments, "Assignments must not be empty!"
    fields_read, fields_written, assignments = add_types(assignments, type_info, not skip_independence_check)
    all_fields = fields_read.union(fields_written)
    read_only_fields = {f.name for f in fields_read - fields_written}

    # Buffer and custom fields do not take part in shape / iteration-space determination.
    buffers = {f for f in all_fields if FieldType.is_buffer(f) or FieldType.is_custom(f)}
    fields_without_buffers = all_fields - buffers

    # Collect all accesses first, then drop the absolute ones in a single pass.
    field_accesses = set()
    for eq in assignments:
        field_accesses.update(eq.atoms(Field.Access))
    field_accesses = {fa for fa in field_accesses if not fa.is_absolute_access}

    common_shape = get_common_shape(fields_without_buffers)
    dim = len(common_shape)

    if iteration_slice is None:
        # determine iteration slice from ghost layers
        if ghost_layers is None:
            # determine required number of ghost layers from field access;
            # without any relative access no ghost layer is required
            required_ghost_layers = max((fa.required_ghost_layers for fa in field_accesses), default=0)
            ghost_layers = [(required_ghost_layers, required_ghost_layers)] * dim

        if isinstance(ghost_layers, int):
            # A zero-width upper ghost layer must map to an open slice end, since ``-0 == 0``.
            upper_end = -ghost_layers if ghost_layers > 0 else None
            iteration_slice = [slice(ghost_layers, upper_end) for _ in range(dim)]
            ghost_layers = [(ghost_layers, ghost_layers)] * dim
        else:
            iteration_slice = [slice(ghost_layers[d][0],
                                     -ghost_layers[d][1] if ghost_layers[d][1] > 0 else None)
                               for d in range(dim)]

    # Any non-buffer field serves as reference for the thread indexing and the loop strides.
    reference_field = list(fields_without_buffers)[0]
    indexing = indexing_creator(field=reference_field, iteration_slice=iteration_slice)
    coordinates = indexing.coordinates

    # Define the loop counter symbols in terms of the coordinates supplied by the indexing.
    cell_idx_symbols = [LoopOverCoordinate.get_loop_counter_symbol(i) for i, _ in enumerate(coordinates)]
    cell_idx_assignments = [SympyAssignment(symbol, value)
                            for symbol, value in zip(cell_idx_symbols, coordinates)]
    assignments = cell_idx_assignments + assignments

    block = Block(assignments)
    block = indexing.guard(block, common_shape)
    unify_shape_symbols(block, common_shape=common_shape, fields=fields_without_buffers)

    ast = KernelFunction(block,
                         Target.GPU,
                         Backend.CUDA,
                         make_python_function,
                         ghost_layers,
                         function_name,
                         assignments=assignments)
    ast.global_variables.update(indexing.index_variables)

    base_pointer_spec = [['spatialInner0']]
    base_pointer_info = {f.name: parse_base_pointer_info(base_pointer_spec, [2, 1, 0],
                                                         f.spatial_dimensions, f.index_dimensions)
                         for f in all_fields}

    # Every field is accessed at the same fixed coordinates: the loop counter symbols.
    fixed_coordinates = {f.name: cell_idx_symbols for f in all_fields}

    loop_strides = reference_field.shape

    if any(FieldType.is_buffer(f) for f in all_fields):
        resolve_buffer_accesses(ast, get_base_buffer_index(ast, indexing.coordinates, loop_strides), read_only_fields)

    resolve_field_accesses(ast, read_only_fields, field_to_base_pointer_info=base_pointer_info,
                           field_to_fixed_coordinates=fixed_coordinates)

    # If loop counter symbols have been explicitly used in the update equations (e.g. for built in periodicity),
    # they are defined here
    undefined_loop_counters = {}
    for symbol in ast.body.undefined_symbols:
        coordinate = LoopOverCoordinate.is_loop_counter_symbol(symbol)
        if coordinate is not None:
            undefined_loop_counters[coordinate] = symbol
    for coordinate, loop_counter in undefined_loop_counters.items():
        ast.body.insert_front(SympyAssignment(loop_counter, indexing.coordinates[coordinate]))

    # add the function which determines #blocks and #threads as additional member to KernelFunction node
    # this is used by the jit
    ast.indexing = indexing
    return ast