def test_vectorization_fixed_size():
    """Vectorize a 5-point stencil kernel on fixed-size arrays.

    Covers an inner size that is a multiple of four as well as two
    different non-multiples, to exercise the vectorizer's remainder
    handling on fixed-shape fields.
    """
    # Inner sizes (excluding the one-cell ghost layer on each side):
    # multiple of four, then two different remainders.
    inner_shapes = [(20, 24), (21, 25), (23, 17)]
    for rows, cols in inner_shapes:
        arr = np.ones((rows + 2, cols + 2)) * 5.0
        f, g = ps.fields(f=arr, g=arr)
        update_rule = [ps.Assignment(g[0, 0],
                                     f[0, 0] + f[-1, 0] + f[1, 0] + f[0, 1] + f[0, -1] + 42.0)]
        ast = ps.create_kernel(update_rule)
        vectorize(ast)
        func = ast.compile()
        dst = np.zeros_like(arr)
        func(g=dst, f=arr)
        # Every interior cell sums five neighbours holding 5.0 plus the constant.
        np.testing.assert_equal(dst[1:-1, 1:-1], 5 * 5.0 + 42.0)
def test_piecewise3(instruction_set=instruction_set):
    """A conditional assignment written as an inline if/else must vectorize and compile."""
    domain = np.zeros((22, 22))

    @ps.kernel
    def test_kernel(s):
        f, g = ps.fields(f=domain, g=domain)
        s.b @= f[0, 1]
        g[0, 0] @= 1.0 / (s.b + s.k) if f[0, 0] > 0.0 else 1.0

    ast = ps.create_kernel(test_kernel)
    vectorize(ast, instruction_set=instruction_set)
    # Successful compilation is the assertion here.
    ast.compile()
def test_logical_operators():
    """An sp.And condition feeding a Piecewise must vectorize and compile.

    NOTE(review): this definition shares its name with the later
    ``test_logical_operators(instruction_set=...)`` in this file, which
    shadows it at module level — this version is never executed by pytest.
    """
    data = np.zeros((22, 22))

    @ps.kernel
    def test_kernel(s):
        f, g = ps.fields(f=data, g=data)
        s.c @= sp.And(f[0, 1] < 0.0, f[1, 0] < 0.0)
        g[0, 0] @= sp.Piecewise([1.0 / f[1, 0], s.c], [1.0, True])

    ast = ps.create_kernel(test_kernel)
    vectorize(ast)
    ast.compile()
def test_vectorization_variable_size():
    """Vectorization must also work when the field shape is only known at call time."""
    f, g = ps.fields("f, g : double[2D]")
    update_rule = [ps.Assignment(g[0, 0],
                                 f[0, 0] + f[-1, 0] + f[1, 0] + f[0, 1] + f[0, -1] + 42.0)]
    ast = ps.create_kernel(update_rule)
    # Fix the innermost stride to one so the vectorizer can be applied.
    replace_inner_stride_with_one(ast)
    vectorize(ast)
    kernel = ast.compile()

    src = np.ones((23 + 2, 17 + 2)) * 5.0
    dst = np.zeros_like(src)
    kernel(g=dst, f=src)
    np.testing.assert_equal(dst[1:-1, 1:-1], 5 * 5.0 + 42.0)
def test_piecewise1():
    """A Piecewise whose condition is a field comparison must vectorize correctly."""
    a, b, c, d, e = sp.symbols("a b c d e")
    src = np.ones((2 ** 3 + 2, 2 ** 4 + 2)) * 5.0
    f, g = ps.fields(f=src, g=src)

    update_rule = [
        ps.Assignment(a, f[1, 0]),
        ps.Assignment(b, a),
        ps.Assignment(c, f[0, 0] > 0.0),
        ps.Assignment(g[0, 0], sp.Piecewise((b + 3 + f[0, 1], c), (0.0, True))),
    ]
    ast = ps.create_kernel(update_rule)
    vectorize(ast)
    kernel = ast.compile()

    dst = np.zeros_like(src)
    kernel(g=dst, f=src)
    # The condition holds everywhere (field is 5.0 > 0), so each interior
    # cell is neighbour(5) + 3 + neighbour(5).
    np.testing.assert_equal(dst[1:-1, 1:-1], 5 + 3 + 5.0)
def test_piecewise2(instruction_set=instruction_set):
    """Vectorized if/else on a subexpression symbol must produce correct values."""
    data = np.zeros((20, 20))

    @ps.kernel
    def test_kernel(s):
        f, g = ps.fields(f=data, g=data)
        s.condition @= f[0, 0] > 1
        s.result @= 0.0 if s.condition else 1.0
        g[0, 0] @= s.result

    ast = ps.create_kernel(test_kernel)
    vectorize(ast, instruction_set=instruction_set)
    kernel = ast.compile()

    kernel(f=data, g=data)
    # Input starts at zero, so the condition is false and every cell becomes 1.0.
    np.testing.assert_equal(data, np.ones_like(data))
def test_vector_type_propagation():
    """Vector types must propagate through chained subexpression assignments."""
    a, b, c, d, e = sp.symbols("a b c d e")
    src = np.ones((2 ** 2 + 2, 2 ** 3 + 2))
    src *= 10.0
    f, g = ps.fields(f=src, g=src)

    update_rule = [
        ps.Assignment(a, f[1, 0]),
        ps.Assignment(b, a),
        ps.Assignment(g[0, 0], b + 3 + f[0, 1]),
    ]
    ast = ps.create_kernel(update_rule)
    vectorize(ast)
    kernel = ast.compile()

    dst = np.zeros_like(src)
    kernel(g=dst, f=src)
    # Two neighbours at 10.0 plus the constant 3.
    np.testing.assert_equal(dst[1:-1, 1:-1], 2 * 10.0 + 3)
def test_logical_operators(instruction_set=instruction_set):
    """And / Or / Eq conditions inside a Piecewise must survive vectorization.

    Each kernel is created, vectorized for the given instruction set and
    compiled; successful compilation is the assertion.
    """
    arr = np.zeros((22, 22))

    def _build_and_compile(kernel_func):
        # Shared create -> vectorize -> compile pipeline for all three kernels.
        ast = ps.create_kernel(kernel_func)
        vectorize(ast, instruction_set=instruction_set)
        ast.compile()

    @ps.kernel
    def kernel_and(s):
        f, g = ps.fields(f=arr, g=arr)
        s.c @= sp.And(f[0, 1] < 0.0, f[1, 0] < 0.0)
        g[0, 0] @= sp.Piecewise([1.0 / f[1, 0], s.c], [1.0, True])

    _build_and_compile(kernel_and)

    @ps.kernel
    def kernel_or(s):
        f, g = ps.fields(f=arr, g=arr)
        s.c @= sp.Or(f[0, 1] < 0.0, f[1, 0] < 0.0)
        g[0, 0] @= sp.Piecewise([1.0 / f[1, 0], s.c], [1.0, True])

    _build_and_compile(kernel_or)

    @ps.kernel
    def kernel_equal(s):
        f, g = ps.fields(f=arr, g=arr)
        s.c @= sp.Eq(f[0, 1], 2.0)
        g[0, 0] @= sp.Piecewise([1.0 / f[1, 0], s.c], [1.0, True])

    _build_and_compile(kernel_equal)
def test_vectorised_fast_approximations(instruction_set=instruction_set):
    """Fast sqrt and fast division approximations must be vectorizable."""
    arr = np.zeros((24, 24))
    f, g = ps.fields(f=arr, g=arr)

    def _compile_vectorized(assignment):
        # Shared create -> vectorize -> compile pipeline; successful
        # compilation is the assertion.
        ast = ps.create_kernel(assignment)
        vectorize(ast, instruction_set=instruction_set)
        ast.compile()

    # Fast sqrt inserted into an expression on the rhs.
    expr = sp.sqrt(f[0, 0] + f[1, 0])
    _compile_vectorized(ps.Assignment(g[0, 0], insert_fast_sqrts(expr)))

    # Fast division.
    expr = f[0, 0] / f[1, 0]
    _compile_vectorized(ps.Assignment(g[0, 0], insert_fast_divisions(expr)))

    # Fast sqrt applied to a whole assignment with a plain symbol lhs.
    assignment = ps.Assignment(sp.Symbol("tmp"), 3 / sp.sqrt(f[0, 0] + f[1, 0]))
    _compile_vectorized(insert_fast_sqrts(assignment))
def create_staggered_kernel(staggered_field, expressions, subexpressions=(), target='cpu',
                            gpu_exclusive_conditions=False, **kwargs):
    """Kernel that updates a staggered field.

    .. image:: /img/staggered_grid.svg

    Args:
        staggered_field: field where the first index coordinate defines the location of the staggered value
                can have 1 or 2 index coordinates, in case of two index coordinates at every staggered location
                a vector is stored, expressions parameter has to be a sequence of sequences then
                where e.g. ``f[0,0](0)`` is interpreted as value at the left cell boundary, ``f[1,0](0)`` the right
                cell boundary and ``f[0,0](1)`` the southern cell boundary etc.
        expressions: sequence of expressions of length dim, defining how the west, southern, (bottom) cell boundary
                     should be updated.
        subexpressions: optional sequence of Assignments, that define subexpressions used in the main expressions
        target: 'cpu' or 'gpu'
        gpu_exclusive_conditions: if/else construct to have only one code block for each of 2**dim code paths
        kwargs: passed directly to create_kernel, iteration slice and ghost_layers parameters are not allowed

    Returns:
        AST, see `create_kernel`
    """
    # iteration_slice / ghost_layers are computed here, so callers must not supply them.
    assert 'iteration_slice' not in kwargs and 'ghost_layers' not in kwargs
    assert staggered_field.index_dimensions in (1, 2), 'Staggered field must have one or two index dimensions'
    dim = staggered_field.spatial_dimensions

    # One loop counter per spatial dimension; the conditions guard the last
    # (upper) staggered layer, which only exists in the direction being updated.
    counters = [LoopOverCoordinate.get_loop_counter_symbol(i) for i in range(dim)]
    conditions = [counters[i] < staggered_field.shape[i] - 1 for i in range(dim)]
    assert len(expressions) == dim
    if staggered_field.index_dimensions == 2:
        assert all(len(sublist) == len(expressions[0]) for sublist in expressions), \
            "If staggered field has two index dimensions expressions has to be a sequence of sequences of all the " \
            "same length."

    final_assignments = []
    last_conditional = None

    def add(condition, dimensions, as_else_block=False):
        # Build the assignments for the given staggered directions and wrap them
        # in a Conditional. With as_else_block=True the new conditional is chained
        # into the false-branch of the previously added one (GPU exclusive mode).
        nonlocal last_conditional
        if staggered_field.index_dimensions == 1:
            # Scalar per staggered location: one assignment per direction.
            assignments = [Assignment(staggered_field(d), expressions[d]) for d in dimensions]
            a_coll = AssignmentCollection(assignments, list(subexpressions))
            a_coll = a_coll.new_filtered([staggered_field(d) for d in dimensions])
        elif staggered_field.index_dimensions == 2:
            # Vector per staggered location: one assignment per direction and component.
            assert staggered_field.has_fixed_index_shape
            assignments = [Assignment(staggered_field(d, i), expr)
                           for d in dimensions
                           for i, expr in enumerate(expressions[d])]
            a_coll = AssignmentCollection(assignments, list(subexpressions))
            a_coll = a_coll.new_filtered([staggered_field(d, i)
                                          for i in range(staggered_field.index_shape[1])
                                          for d in dimensions])
        sp_assignments = [SympyAssignment(a.lhs, a.rhs) for a in a_coll.all_assignments]
        if as_else_block and last_conditional:
            new_cond = Conditional(condition, Block(sp_assignments))
            last_conditional.false_block = Block([new_cond])
            last_conditional = new_cond
        else:
            last_conditional = Conditional(condition, Block(sp_assignments))
            final_assignments.append(last_conditional)

    if target == 'cpu' or not gpu_exclusive_conditions:
        # One guarded block per direction; each is conditioned on all other dims.
        for d in range(dim):
            cond = sp.And(*[conditions[i] for i in range(dim) if d != i])
            add(cond, [d])
    elif target == 'gpu':
        # Exclusive mode: enumerate all 2**dim combinations of the per-direction
        # conditions so every thread takes exactly one branch.
        full_conditions = [sp.And(*[conditions[i] for i in range(dim) if d != i]) for d in range(dim)]
        for include in itertools.product(*[[1, 0]] * dim):
            case_conditions = sp.And(*[c if value else sp.Not(c) for c, value in zip(full_conditions, include)])
            dimensions_to_include = [i for i in range(dim) if include[i]]
            if dimensions_to_include:
                add(case_conditions, dimensions_to_include, True)

    # One extra lower ghost layer per dimension holds the boundary staggered values.
    ghost_layers = [(1, 0)] * dim

    # Pop the CPU-only options from kwargs; they are applied manually below
    # because the conditionals have to be removed before vectorization/OpenMP.
    blocking = kwargs.get('cpu_blocking', None)
    if blocking:
        del kwargs['cpu_blocking']
    cpu_vectorize_info = kwargs.get('cpu_vectorize_info', None)
    if cpu_vectorize_info:
        del kwargs['cpu_vectorize_info']
    openmp = kwargs.get('cpu_openmp', None)
    if openmp:
        del kwargs['cpu_openmp']

    ast = create_kernel(final_assignments, ghost_layers=ghost_layers, target=target, **kwargs)

    if target == 'cpu':
        # Conditionals are replaced by adapted loop bounds, then the deferred
        # blocking / OpenMP / vectorization options are applied.
        remove_conditionals_in_staggered_kernel(ast)
        move_constants_before_loop(ast)
        omp_collapse = None
        if blocking:
            omp_collapse = loop_blocking(ast, blocking)
        if openmp:
            from pystencils.cpu import add_openmp
            add_openmp(ast, num_threads=openmp, collapse=omp_collapse, assume_single_outer_loop=False)
        if cpu_vectorize_info is True:
            vectorize(ast)
        elif isinstance(cpu_vectorize_info, dict):
            vectorize(ast, **cpu_vectorize_info)
    return ast
def create_kernel(assignments, target='cpu', data_type="double", iteration_slice=None, ghost_layers=None,
                  skip_independence_check=False, cpu_openmp=False, cpu_vectorize_info=None, cpu_blocking=None,
                  gpu_indexing='block', gpu_indexing_params=MappingProxyType({})):
    """
    Creates abstract syntax tree (AST) of kernel, using a list of update equations.

    Args:
        assignments: can be a single assignment, sequence of assignments or an `AssignmentCollection`
        target: 'cpu', 'llvm' or 'gpu'
        data_type: data type used for all untyped symbols (i.e. non-fields), can also be a dict from symbol name
                  to type
        iteration_slice: rectangular subset to iterate over, if not specified the complete non-ghost layer \
                         part of the field is iterated over
        ghost_layers: if left to default, the number of necessary ghost layers is determined automatically
                     a single integer specifies the ghost layer count at all borders, can also be a sequence of
                     pairs ``[(x_lower_gl, x_upper_gl), .... ]``
        skip_independence_check: don't check that loop iterations are independent. This is needed e.g. for
                                 periodicity kernel, that access the field outside the iteration bounds. Use with care!
        cpu_openmp: True or number of threads for OpenMP parallelization, False for no OpenMP
        cpu_vectorize_info: a dictionary with keys, 'vector_instruction_set', 'assume_aligned' and 'nontemporal'
                            for documentation of these parameters see vectorize function. Example:
                            '{'instruction_set': 'avx512', 'assume_aligned': True, 'nontemporal':True}'
        cpu_blocking: a tuple of block sizes or None if no blocking should be applied
        gpu_indexing: either 'block' or 'line' , or custom indexing class, see `AbstractIndexing`
        gpu_indexing_params: dict with indexing parameters (constructor parameters of indexing class)
                             e.g. for 'block' one can specify '{'block_size': (20, 20, 10) }'

    Returns:
        abstract syntax tree (AST) object, that can either be printed as source code with `show_code` or
        can be compiled with through its 'compile()' member

    Example:
        >>> import pystencils as ps
        >>> import numpy as np
        >>> s, d = ps.fields('s, d: [2D]')
        >>> assignment = ps.Assignment(d[0,0], s[0, 1] + s[0, -1] + s[1, 0] + s[-1, 0])
        >>> ast = ps.create_kernel(assignment, target='cpu', cpu_openmp=True)
        >>> kernel = ast.compile()
        >>> d_arr = np.zeros([5, 5])
        >>> kernel(d=d_arr, s=np.ones([5, 5]))
        >>> d_arr
        array([[0., 0., 0., 0., 0.],
               [0., 4., 4., 4., 0.],
               [0., 4., 4., 4., 0.],
               [0., 4., 4., 4., 0.],
               [0., 0., 0., 0., 0.]])
    """
    # ---- Normalizing parameters
    # Unwrap an AssignmentCollection into a plain list and carry along any
    # split-group simplification hints for the CPU loop splitter.
    split_groups = ()
    if isinstance(assignments, AssignmentCollection):
        if 'split_groups' in assignments.simplification_hints:
            split_groups = assignments.simplification_hints['split_groups']
        assignments = assignments.all_assignments
    if isinstance(assignments, Assignment):
        assignments = [assignments]

    # ---- Creating ast
    if target == 'cpu':
        # Local import shadows this function's name with the CPU backend's create_kernel.
        from pystencils.cpu import create_kernel
        from pystencils.cpu import add_openmp
        ast = create_kernel(assignments, type_info=data_type, split_groups=split_groups,
                            iteration_slice=iteration_slice, ghost_layers=ghost_layers,
                            skip_independence_check=skip_independence_check)
        # Blocking must run before OpenMP so the parallel pragma can collapse
        # the newly introduced outer block loops.
        omp_collapse = None
        if cpu_blocking:
            omp_collapse = loop_blocking(ast, cpu_blocking)
        if cpu_openmp:
            add_openmp(ast, num_threads=cpu_openmp, collapse=omp_collapse)
        if cpu_vectorize_info:
            if cpu_vectorize_info is True:
                vectorize(ast)
            elif isinstance(cpu_vectorize_info, dict):
                vectorize(ast, **cpu_vectorize_info)
            else:
                raise ValueError("Invalid value for cpu_vectorize_info")
        return ast
    elif target == 'llvm':
        from pystencils.llvm import create_kernel
        ast = create_kernel(assignments, type_info=data_type, split_groups=split_groups,
                            iteration_slice=iteration_slice, ghost_layers=ghost_layers)
        return ast
    elif target == 'gpu':
        from pystencils.gpucuda import create_cuda_kernel
        ast = create_cuda_kernel(assignments, type_info=data_type,
                                 indexing_creator=indexing_creator_from_params(gpu_indexing, gpu_indexing_params),
                                 iteration_slice=iteration_slice, ghost_layers=ghost_layers,
                                 skip_independence_check=skip_independence_check)
        return ast
    else:
        raise ValueError("Unknown target %s. Has to be one of 'cpu', 'gpu' or 'llvm' " % (target,))
def create_domain_kernel(assignments: List[Assignment], *, config: CreateKernelConfig):
    """
    Creates abstract syntax tree (AST) of kernel, using a list of update equations.

    Args:
        assignments: can be a single assignment, sequence of assignments or an `AssignmentCollection`
        config: CreateKernelConfig which includes the needed configuration

    Returns:
        abstract syntax tree (AST) object, that can either be printed as source code with `show_code` or
        can be compiled with through its 'compile()' member

    Example:
        >>> import pystencils as ps
        >>> import numpy as np
        >>> s, d = ps.fields('s, d: [2D]')
        >>> assignment = ps.Assignment(d[0,0], s[0, 1] + s[0, -1] + s[1, 0] + s[-1, 0])
        >>> kernel_config = ps.CreateKernelConfig(cpu_openmp=True)
        >>> kernel_ast = ps.kernelcreation.create_domain_kernel([assignment], config=kernel_config)
        >>> kernel = kernel_ast.compile()
        >>> d_arr = np.zeros([5, 5])
        >>> kernel(d=d_arr, s=np.ones([5, 5]))
        >>> d_arr
        array([[0., 0., 0., 0., 0.],
               [0., 4., 4., 4., 0.],
               [0., 4., 4., 4., 0.],
               [0., 4., 4., 4., 0.],
               [0., 0., 0., 0., 0.]])
    """
    # --- applying first default simplifications
    # Simplification is best-effort: a failure only warns and the original
    # assignments are used unchanged.
    try:
        if config.default_assignment_simplifications and isinstance(assignments, AssignmentCollection):
            simplification = create_simplification_strategy()
            assignments = simplification(assignments)
    except Exception as e:
        warnings.warn(f"It was not possible to apply the default pystencils optimisations to the "
                      f"AssignmentCollection due to the following problem :{e}")

    # ---- Normalizing parameters
    # Unwrap an AssignmentCollection and keep any split-group hints for the
    # CPU backend's loop splitter.
    split_groups = ()
    if isinstance(assignments, AssignmentCollection):
        if 'split_groups' in assignments.simplification_hints:
            split_groups = assignments.simplification_hints['split_groups']
        assignments = assignments.all_assignments

    # SymPy-level optimisations are likewise best-effort.
    try:
        if config.default_assignment_simplifications:
            assignments = apply_sympy_optimisations(assignments)
    except Exception as e:
        warnings.warn(f"It was not possible to apply the default SymPy optimisations to the "
                      f"Assignments due to the following problem :{e}")

    # ---- Creating ast
    ast = None
    if config.target == Target.CPU:
        if config.backend == Backend.C:
            from pystencils.cpu import add_openmp, create_kernel
            ast = create_kernel(assignments, function_name=config.function_name, type_info=config.data_type,
                                split_groups=split_groups, iteration_slice=config.iteration_slice,
                                ghost_layers=config.ghost_layers,
                                skip_independence_check=config.skip_independence_check)
            for optimization in config.cpu_prepend_optimizations:
                optimization(ast)
            # Blocking must run before OpenMP so the pragma can collapse the
            # new outer block loops.
            omp_collapse = None
            if config.cpu_blocking:
                omp_collapse = loop_blocking(ast, config.cpu_blocking)
            if config.cpu_openmp:
                add_openmp(ast, num_threads=config.cpu_openmp, collapse=omp_collapse,
                           assume_single_outer_loop=config.omp_single_loop)
            if config.cpu_vectorize_info:
                if config.cpu_vectorize_info is True:
                    vectorize(ast)
                elif isinstance(config.cpu_vectorize_info, dict):
                    vectorize(ast, **config.cpu_vectorize_info)
                    if config.cpu_openmp and config.cpu_blocking and 'nontemporal' in config.cpu_vectorize_info and \
                            config.cpu_vectorize_info['nontemporal'] and 'cachelineZero' in ast.instruction_set:
                        # This condition is stricter than it needs to be: if blocks along the fastest axis start
                        # on a cache line boundary, it's okay. But we cannot determine that here.
                        # We don't need to disallow OpenMP collapsing because it is never applied to the inner loop.
                        raise ValueError("Blocking cannot be combined with cacheline-zeroing")
                else:
                    raise ValueError("Invalid value for cpu_vectorize_info")
    elif config.target == Target.GPU:
        if config.backend == Backend.CUDA:
            from pystencils.gpucuda import create_cuda_kernel
            ast = create_cuda_kernel(assignments, function_name=config.function_name, type_info=config.data_type,
                                     indexing_creator=indexing_creator_from_params(config.gpu_indexing,
                                                                                   config.gpu_indexing_params),
                                     iteration_slice=config.iteration_slice, ghost_layers=config.ghost_layers,
                                     skip_independence_check=config.skip_independence_check)

    # ast is still None for any unsupported target/backend combination.
    if not ast:
        raise NotImplementedError(
            f'{config.target} together with {config.backend} is not supported by `create_domain_kernel`')

    if config.use_auto_for_assignments:
        for a in ast.atoms(SympyAssignment):
            a.use_auto = True

    return ast
def test_vectorised_pow(instruction_set=instruction_set):
    """sp.Pow with assorted exponents must vectorize and compile.

    Exponents cover square, square root, reciprocal square root, higher
    integer powers and reciprocals, which may each lower to a different
    vectorized expansion.
    """
    arr = np.zeros((24, 24))
    f, g = ps.fields(f=arr, g=arr)

    for exponent in (2, 0.5, -0.5, 4, -4, -1):
        assignment = ps.Assignment(g[0, 0], sp.Pow(f[0, 0], exponent))
        ast = ps.create_kernel(assignment)
        vectorize(ast, instruction_set=instruction_set)
        # Successful compilation is the assertion.
        ast.compile()