def test_padding(simple_function_with_paddable_arrays):
    """Check the C code produced by the DLE in ``padding`` mode.

    The expected text is searched for inside ``str(handle.nodes)``: a loop
    copying ``a_dense`` into ``pa_dense``, the ``foo`` kernel operating on
    ``pa_dense``, and a loop copying ``pa_dense`` back into ``a_dense``.
    """
    handle = transform(simple_function_with_paddable_arrays, mode='padding')

    expected = """\
for (int i = 0; i < 3; i += 1)
{
  pa_dense[i] = a_dense[i];
}
void foo(float *restrict a_dense_vec, float *restrict b_dense_vec)
{
  float (*restrict a_dense) __attribute__((aligned(64))) = (float (*)) a_dense_vec;
  float (*restrict b_dense) __attribute__((aligned(64))) = (float (*)) b_dense_vec;
  for (int i = 0; i < 3; i += 1)
  {
    for (int j = 0; j < 5; j += 1)
    {
      for (int k = 0; k < 7; k += 1)
      {
        pa_dense[i] = b_dense[i] + pa_dense[i] + 5.0F;
      }
    }
  }
}
for (int i = 0; i < 3; i += 1)
{
  a_dense[i] = pa_dense[i];
}"""

    generated = str(handle.nodes)
    assert expected in generated
def test_create_elemental_functions_simple(simple_function):
    """Check the C code produced by the DLE ``split`` mode on a simple kernel.

    The last Iteration of every iteration tree is re-tagged and marked
    ELEMENTAL, then the transformed kernel plus its elemental functions are
    rendered and compared with the expected source text.
    """
    innermost = [tree[-1] for tree in retrieve_iteration_tree(simple_function)]
    tagged = [node._rebuild(properties=tagger(0)) for node in innermost]

    mapper = {}
    for before, after in zip(innermost, tagged):
        mapper[before] = after._rebuild(properties=(after.properties + (ELEMENTAL, )))
    function = Transformer(mapper).visit(simple_function)

    handle = transform(function, mode='split')
    block = List(body=[handle.nodes] + handle.elemental_functions)
    generated = str(block.ccode)

    # Make output compiler independent
    ignored = ('#pragma', '/*')
    kept = [line for line in generated.split('\n')
            if not any(marker in line for marker in ignored)]

    expected = (
        """void foo(float *restrict a_vec, float *restrict b_vec,"""
        """ float *restrict c_vec, float *restrict d_vec)
{
  float (*restrict a) __attribute__((aligned(64))) = (float (*)) a_vec;
  float (*restrict b) __attribute__((aligned(64))) = (float (*)) b_vec;
  float (*restrict c)[j_size] __attribute__((aligned(64))) = (float (*)[j_size]) c_vec;
  float (*restrict d)[j_size][k_size] __attribute__((aligned(64))) ="""
        """ (float (*)[j_size][k_size]) d_vec;
  for (int i = 0; i < 3; i += 1)
  {
    for (int j = 0; j < 5; j += 1)
    {
      f_0(0,7,(float*)a,(float*)b,(float*)c,(float*)d,i,i_size,j,j_size,k_size);
    }
  }
}
void f_0(const int k_start, const int k_finish,"""
        """ float *restrict a_vec, float *restrict b_vec,"""
        """ float *restrict c_vec, float *restrict d_vec,"""
        """ const int i, const int i_size, const int j, const int j_size, const int k_size)
{
  float (*restrict a) __attribute__((aligned(64))) = (float (*)) a_vec;
  float (*restrict b) __attribute__((aligned(64))) = (float (*)) b_vec;
  float (*restrict c)[j_size] __attribute__((aligned(64))) = (float (*)[j_size]) c_vec;
  float (*restrict d)[j_size][k_size] __attribute__((aligned(64))) ="""
        """ (float (*)[j_size][k_size]) d_vec;
  for (int k = k_start; k < k_finish; k += 1)
  {
    a[i] = a[i] + b[i] + 5.0F;
    a[i] = -a[i]*c[i][j] + b[i]*d[i][j][k];
  }
}""")
    assert '\n'.join(kept) == expected
def _specialize_iet(self, iet, **kwargs):
    """
    Run the Devito Loop Engine (DLE) over ``iet`` and record its by-products.

    The DLE mode is read from the ``dle`` keyword, falling back to
    ``configuration['dle']``. Every elemental function reported by the DLE
    is registered in ``self._func_table`` under its name, and the reported
    dimensions and includes are appended to ``self._dimensions`` and
    ``self._includes``. The transformed IET is returned.
    """
    mode = kwargs.get("dle", configuration['dle'])

    # Apply the Devito Loop Engine (DLE) for loop optimization
    optimized, dle_state = transform(iet, *set_dle_mode(mode))

    efunc_calls = OrderedDict()
    for efunc in dle_state.efuncs:
        efunc_calls[efunc.name] = MetaCall(efunc, True)
    self._func_table.update(efunc_calls)

    self._dimensions.extend(dle_state.dimensions)
    self._includes.extend(dle_state.includes)

    return optimized
def _specialize_iet(self, iet, **kwargs):
    """
    Optimize ``iet`` with the Devito Loop Engine (DLE) and update the
    operator state from the result.

    ``kwargs['dle']`` selects the DLE mode (default: ``configuration['dle']``).
    Elemental functions found in the DLE state are added to
    ``self._func_table`` keyed by name; its dimensions and includes extend
    ``self._dimensions`` and ``self._includes``. Returns the transformed IET.
    """
    dle_mode = kwargs.get("dle", configuration['dle'])

    # Apply the Devito Loop Engine (DLE) for loop optimization
    transformed, state = transform(iet, *set_dle_mode(dle_mode))

    named_calls = [(efunc.name, MetaCall(efunc, True)) for efunc in state.efuncs]
    self._func_table.update(OrderedDict(named_calls))
    self._dimensions.extend(state.dimensions)
    self._includes.extend(state.includes)

    return transformed
def test_loop_nofission(simple_function):
    """Check that ``fission`` mode leaves a non-fissionable loop nest intact.

    The fission thresholds on ``Rewriter`` are overridden for the duration
    of the test. They live in shared state, so they are restored in a
    ``finally`` clause: a failing assertion must not leak the overridden
    values into other tests.
    """
    old = Rewriter.thresholds['min_fission'], Rewriter.thresholds['max_fission']
    Rewriter.thresholds['max_fission'], Rewriter.thresholds['min_fission'] = 0, 1
    try:
        handle = transform(simple_function, mode='fission')
        assert """\
for (int i = 0; i < 3; i += 1)
{
  for (int j = 0; j < 5; j += 1)
  {
    for (int k = 0; k < 7; k += 1)
    {
      a[i] = a[i] + b[i] + 5.0F;
      a[i] = -a[i]*c[i][j] + b[i]*d[i][j][k];
    }
  }
}""" in str(handle.nodes[0].ccode)
    finally:
        # Always put the original thresholds back, even on failure
        Rewriter.thresholds['min_fission'], Rewriter.thresholds['max_fission'] = old
def _specialize_iet(self, iet, **kwargs):
    """Optimize the Iteration/Expression tree with the Devito Loop Engine
    (DLE) and update the operator from the resulting state.

    The DLE mode comes from ``kwargs['dle']`` (default:
    ``configuration['dle']``). The DLE arguments and flags are stored on the
    instance, elemental functions are added to ``self.func_table`` by name,
    DLE arguments whose ``argument`` is a Dimension extend
    ``self.dimensions``, and the DLE includes extend ``self._includes``.
    Returns the nodes held by the DLE state."""
    mode = kwargs.get("dle", configuration['dle'])

    # Apply the Devito Loop Engine (DLE) for loop optimization
    state = transform(iet, *set_dle_mode(mode))

    self._dle_args = state.arguments
    self._dle_flags = state.flags

    efunc_calls = OrderedDict()
    for efunc in state.elemental_functions:
        efunc_calls[efunc.name] = MetaCall(efunc, True)
    self.func_table.update(efunc_calls)

    extra_dimensions = []
    for dle_arg in self._dle_args:
        if isinstance(dle_arg.argument, Dimension):
            extra_dimensions.append(dle_arg.argument)
    self.dimensions.extend(extra_dimensions)

    self._includes.extend(list(state.includes))

    return state.nodes
def _make_copy(self, f, fixed, swap=False):
    """
    Build a Callable that copies data between ``f`` and a contiguous Array.

    * With ``swap=False`` (default) a convex region of ``f`` is gathered
      into the contiguous Array ``buf``.
    * Otherwise the contiguous Array ``buf`` is scattered into a convex
      region of ``f``.

    Dimensions of ``f`` listed in ``fixed`` are not iterated over; each
    dimension of ``f`` gets an offset Symbol that becomes a parameter of
    the Callable.

    Returns a pair: the Callable and the ``input`` of the DLE state.
    """
    # Dimensions of `f` that are actually spanned by the copy
    free = [d for d in f.dimensions if d not in fixed]
    buf_dims = [Dimension(name='buf_%s' % d.root) for d in free]
    buf_indices = [d.root for d in free]
    buf = Array(name='buf', dimensions=buf_dims, dtype=f.dtype)

    f_offsets = []
    f_indices = []
    for d in f.dimensions:
        offset = Symbol(name='o%s' % d.root)
        f_offsets.append(offset)
        if d in fixed:
            f_indices.append(offset + 0)
        else:
            f_indices.append(offset + d.root)

    if swap is not False:
        eq = DummyEq(f[f_indices], buf[buf_indices])
        name = 'scatter%dd' % f.ndim
    else:
        eq = DummyEq(buf[buf_indices], f[f_indices])
        name = 'gather%dd' % f.ndim

    # Wrap the copy expression in one Iteration per buffer dimension,
    # innermost first
    iet = Expression(eq)
    for index, dim in zip(reversed(buf_indices), reversed(buf_dims)):
        # The -1 below is because an Iteration, by default, generates <=
        upper = dim.symbolic_size - 1
        iet = Iteration(iet, index, upper, properties=PARALLEL)
    iet = List(body=[ArrayCast(f), ArrayCast(buf), iet])

    # Optimize the memory copy with the DLE
    from devito.dle import transform
    state = transform(iet, 'simd', {'openmp': self._threaded})

    parameters = [buf] + list(buf.shape) + [f] + f_offsets + state.input
    copy = Callable(name, state.nodes, 'void', parameters, ('static', ))
    return copy, state.input
def test_padding(simple_function_with_paddable_arrays): handle = transform(simple_function_with_paddable_arrays, mode='padding') assert str(handle.nodes[0].ccode) == """\ for (int i = 0; i < 3; i += 1) { pa_dense[i] = a_dense[i]; }""" assert """\ for (int i = 0; i < 3; i += 1) { for (int j = 0; j < 5; j += 1) { for (int k = 0; k < 7; k += 1) { pa_dense[i] = b_dense[i] + pa_dense[i] + 5.0F; } } }""" in str(handle.nodes[1].ccode) assert str(handle.nodes[2].ccode) == """\
def test_loop_fission(simple_function_fissionable):
    """Check that ``fission`` mode splits the innermost loop in two.

    The fission thresholds on ``Rewriter`` are overridden for the duration
    of the test. They live in shared state, so they are restored in a
    ``finally`` clause: a failing assertion must not leak the overridden
    values into other tests.
    """
    old = Rewriter.thresholds['min_fission'], Rewriter.thresholds['max_fission']
    Rewriter.thresholds['max_fission'], Rewriter.thresholds['min_fission'] = 0, 1
    try:
        handle = transform(simple_function_fissionable, mode='fission')
        assert """\
for (int i = 0; i < 3; i += 1)
{
  for (int j = 0; j < 5; j += 1)
  {
    for (int k = 0; k < 7; k += 1)
    {
      a[i] = a[i] + b[i] + 5.0F;
    }
    for (int k = 0; k < 7; k += 1)
    {
      b[i] = a[i] + pow(b[i], 2) + 3;
    }
  }
}""" in str(handle.nodes[0].ccode)
    finally:
        # Always put the original thresholds back, even on failure
        Rewriter.thresholds['min_fission'], Rewriter.thresholds['max_fission'] = old
def test_create_elemental_functions_simple(simple_function):
    """Check the C code produced by the DLE ``split`` mode on a simple kernel.

    ``Rewriter.thresholds['elemental']`` is set to 0 for the duration of the
    test. It lives in shared state, so it is restored in a ``finally``
    clause: a failing assertion must not leak the overridden value into
    other tests.
    """
    old = Rewriter.thresholds['elemental']
    Rewriter.thresholds['elemental'] = 0
    try:
        handle = transform(simple_function, mode='split')
        block = List(body=handle.nodes + handle.elemental_functions)
        output = str(block.ccode)
        # Make output compiler independent
        output = [i for i in output.split('\n')
                  if all([j not in i for j in ('#pragma', '/*')])]
        assert '\n'.join(output) == \
            ("""void foo(float *restrict a_vec, float *restrict b_vec,"""
             """ float *restrict c_vec, float *restrict d_vec)
{
  float (*restrict a) __attribute__((aligned(64))) = (float (*)) a_vec;
  float (*restrict b) __attribute__((aligned(64))) = (float (*)) b_vec;
  float (*restrict c)[5] __attribute__((aligned(64))) = (float (*)[5]) c_vec;
  float (*restrict d)[5][7] __attribute__((aligned(64))) = (float (*)[5][7]) d_vec;
  for (int i = 0; i < 3; i += 1)
  {
    for (int j = 0; j < 5; j += 1)
    {
      f_0_0((float*) a,(float*) b,(float*) c,(float*) d,i,j);
    }
  }
}
void f_0_0(float *restrict a_vec, float *restrict b_vec,"""
             """ float *restrict c_vec, float *restrict d_vec, const int i, const int j)
{
  float (*restrict a) __attribute__((aligned(64))) = (float (*)) a_vec;
  float (*restrict b) __attribute__((aligned(64))) = (float (*)) b_vec;
  float (*restrict c)[5] __attribute__((aligned(64))) = (float (*)[5]) c_vec;
  float (*restrict d)[5][7] __attribute__((aligned(64))) = (float (*)[5][7]) d_vec;
  for (int k = 0; k < 7; k += 1)
  {
    a[i] = a[i] + b[i] + 5.0F;
    a[i] = -a[i]*c[i][j] + b[i]*d[i][j][k];
  }
}""")
    finally:
        # Always put the original threshold back, even on failure
        Rewriter.thresholds['elemental'] = old
def test_loops_collapsed(fe, t0, t1, t2, t3, exprs, expected, iters):
    """Check which Iterations receive an ``omp for collapse`` pragma.

    ``expected`` is matched positionally against the Iterations found after
    the ``openmp`` transformation: where it is ``True`` the Iteration must
    carry exactly one pragma containing ``omp for collapse``; elsewhere no
    pragma may contain that text.
    """
    symbols = [fe, t0, t1, t2, t3]
    leaves = []
    for expr in exprs:
        leaves.append(Expression(DummyEq(EVAL(expr, *symbols))))

    outer, middle, inner = iters[6], iters[7], iters[8]
    tree = iet_analyze(outer(middle(inner(leaves))))

    transformed = transform(tree, mode='openmp').nodes
    iterations = FindNodes(Iteration).visit(transformed)
    assert len(iterations) == len(expected)

    # Check for presence of pragma omp
    for iteration, collapsed in zip(iterations, expected):
        pragmas = iteration.pragmas
        if collapsed is True:
            assert len(pragmas) == 1
            assert 'omp for collapse' in pragmas[0].value
        else:
            assert all('omp for collapse' not in p.value for p in pragmas)
def test_loops_ompized(fa, fb, fc, fd, t0, t1, t2, t3, exprs, expected, iters):
    """Check which Iterations receive an ``omp for`` pragma.

    The ``openmp`` transformation must yield exactly one node. ``expected``
    is matched positionally against the Iterations found in it: where it is
    ``True`` the Iteration must carry exactly one pragma containing
    ``omp for``; elsewhere no pragma may contain that text.
    """
    symbols = [fa, fb, fc, fd, t0, t1, t2, t3]
    leaves = []
    for expr in exprs:
        leaves.append(Expression(EVAL(expr, *symbols)))

    outer, inner = iters[6], iters[7]
    tree = outer(inner(leaves))

    transformed = transform(tree, mode='openmp').nodes
    assert len(transformed) == 1

    iterations = FindNodes(Iteration).visit(transformed[0])
    assert len(iterations) == len(expected)

    # Check for presence of pragma omp
    for iteration, parallel in zip(iterations, expected):
        pragmas = iteration.pragmas
        if parallel is True:
            assert len(pragmas) == 1
            assert 'omp for' in pragmas[0].value
        else:
            assert all('omp for' not in p.value for p in pragmas)
def test_iterations_ompized(self, fa, fb, fc, fd, t0, t1, t2, t3, exprs, expected, iters):
    """Check which Iterations receive an ``omp for`` pragma.

    ``expected`` is matched positionally against the Iterations found after
    the ``openmp`` transformation: where it is ``True`` the Iteration must
    carry exactly one pragma containing ``omp for``; elsewhere no pragma may
    contain that text.
    """
    symbols = [fa, fb, fc, fd, t0, t1, t2, t3]
    leaves = []
    for expr in exprs:
        leaves.append(Expression(DummyEq(EVAL(expr, *symbols))))

    outer, inner = iters[6], iters[7]
    tree = iet_analyze(outer(inner(leaves)))

    iet, _ = transform(tree, mode='openmp')
    iterations = FindNodes(Iteration).visit(iet)
    assert len(iterations) == len(expected)

    # Check for presence of pragma omp
    for iteration, parallel in zip(iterations, expected):
        pragmas = iteration.pragmas
        if parallel is True:
            assert len(pragmas) == 1
            assert 'omp for' in pragmas[0].value
        else:
            assert all('omp for' not in p.value for p in pragmas)
def __init__(self, expressions, **kwargs):
    """
    Build the operator from one or more SymPy equations.

    The expressions are lowered, clustered, optimized by the DSE, scheduled
    into a tree of Iterations, instrumented for profiling, optimized by the
    DLE and finally completed with C declarations before being handed to
    the parent constructor.

    :param expressions: a single ``sympy.Eq`` or a sequence of them.
    :param kwargs: optional settings read here: ``name`` (default
                   ``"Kernel"``), ``subs`` (default ``{}``), ``time_axis``
                   (default ``Forward``), ``dse`` and ``dle`` (defaults
                   taken from ``configuration``).
    :raises InvalidOperator: if any expression is not a ``sympy.Eq``.
    """
    expressions = as_tuple(expressions)

    # Input check
    if any(not isinstance(i, sympy.Eq) for i in expressions):
        raise InvalidOperator("Only SymPy expressions are allowed.")

    self.name = kwargs.get("name", "Kernel")
    subs = kwargs.get("subs", {})
    time_axis = kwargs.get("time_axis", Forward)
    dse = kwargs.get("dse", configuration['dse'])
    dle = kwargs.get("dle", configuration['dle'])

    # Default attributes required for compilation
    self._headers = list(self._default_headers)
    self._includes = list(self._default_includes)
    self._lib = None
    self._cfunction = None

    # Set the direction of time according to the given TimeAxis
    # NOTE(review): `time` is not bound in this method; presumably a
    # module-level time dimension shared by all operators -- confirm
    time.reverse = time_axis == Backward

    # Expression lowering: indexification, then user-provided substitutions
    expressions = [indexify(s) for s in expressions]
    expressions = [s.xreplace(subs) for s in expressions]

    # Analysis 1 - required *also after* the Operator construction
    self.dtype = self._retrieve_dtype(expressions)
    self.output = self._retrieve_output_fields(expressions)

    # Analysis 2 - required *for* the Operator construction
    ordering = self._retrieve_loop_ordering(expressions)
    stencils = self._retrieve_stencils(expressions)

    # Group expressions based on their Stencil
    clusters = clusterize(expressions, stencils)

    # Apply the Devito Symbolic Engine for symbolic optimization
    clusters = rewrite(clusters, mode=dse)

    # Wrap expressions with Iterations according to dimensions
    nodes = self._schedule_expressions(clusters, ordering)

    # Introduce C-level profiling infrastructure
    self.sections = OrderedDict()
    nodes = self._profile_sections(nodes)

    # Parameters of the Operator (Dimensions necessary for data casts)
    parameters = FindSymbols('kernel-data').visit(nodes)
    dimensions = FindSymbols('dimensions').visit(nodes)
    # Buffered dimensions also contribute their parent dimension
    dimensions += [d.parent for d in dimensions if d.is_Buffered]
    # Only dimensions without a known size become parameters
    parameters += filter_ordered([d for d in dimensions if d.size is None],
                                 key=operator.attrgetter('name'))

    # Resolve and substitute dimensions for loop index variables
    # (`subs` is rebound here; the user-provided substitutions were
    # already applied above)
    subs = {}
    nodes = ResolveIterationVariable().visit(nodes, subs=subs)
    nodes = SubstituteExpression(subs=subs).visit(nodes)

    # Apply the Devito Loop Engine for loop optimization
    dle_state = transform(nodes, *set_dle_mode(dle))
    parameters += [i.argument for i in dle_state.arguments]
    self._includes.extend(list(dle_state.includes))

    # Introduce all required C declarations
    nodes, elemental_functions = self._insert_declarations(
        dle_state, parameters)
    self.elemental_functions = elemental_functions

    # Track the DLE output, as it might be useful at execution time
    self._dle_state = dle_state

    # Finish instantiation
    super(OperatorBasic, self).__init__(self.name, nodes, 'int', parameters, ())
def __init__(self, expressions, **kwargs):
    """
    Build the operator from one or more SymPy equations.

    The expressions are lowered and analysed, clustered and optimized by
    the DSE, lowered to an Iteration/Expression tree (IET), instrumented
    for profiling, specialized for the backend, optimized by the DLE and
    completed with C declarations before being handed to the parent
    constructor.

    :param expressions: a single ``sympy.Eq`` or a sequence of them.
    :param kwargs: optional settings read here: ``name`` (default
                   ``"Kernel"``), ``subs`` (default ``{}``), ``time_axis``
                   (default ``Forward``), ``dse`` and ``dle`` (defaults
                   taken from ``configuration``).
    :raises InvalidOperator: if any expression is not a ``sympy.Eq``.
    """
    expressions = as_tuple(expressions)

    # Input check
    if any(not isinstance(i, sympy.Eq) for i in expressions):
        raise InvalidOperator("Only SymPy expressions are allowed.")

    self.name = kwargs.get("name", "Kernel")
    subs = kwargs.get("subs", {})
    time_axis = kwargs.get("time_axis", Forward)
    dse = kwargs.get("dse", configuration['dse'])
    dle = kwargs.get("dle", configuration['dle'])

    # Header files, etc.
    self._headers = list(self._default_headers)
    self._includes = list(self._default_includes)
    self._globals = list(self._default_globals)

    # Required for compilation
    self._compiler = configuration['compiler']
    self._lib = None
    self._cfunction = None

    # References to local or external routines
    self.func_table = OrderedDict()

    # Expression lowering and analysis
    expressions = [LoweredEq(e, subs=subs) for e in expressions]
    self.dtype = retrieve_dtype(expressions)
    self.input, self.output, self.dimensions = retrieve_symbols(
        expressions)

    # Set the direction of time according to the given TimeAxis
    # (only time dimensions that are not stepping are touched)
    for time in [d for d in self.dimensions if d.is_Time]:
        if not time.is_Stepping:
            time.reverse = time_axis == Backward

    # Parameters of the Operator (Dimensions necessary for data casts)
    parameters = self.input + self.dimensions

    # Group expressions based on their iteration space and data dependences,
    # and apply the Devito Symbolic Engine (DSE) for flop optimization
    clusters = clusterize(expressions)
    clusters = rewrite(clusters, mode=set_dse_mode(dse))

    # Lower Clusters to an Iteration/Expression tree (IET)
    nodes = iet_build(clusters, self.dtype)

    # Introduce C-level profiling infrastructure
    nodes, self.profiler = self._profile_sections(nodes, parameters)

    # Translate into backend-specific representation (e.g., GPU, Yask)
    nodes = self._specialize(nodes, parameters)

    # Apply the Devito Loop Engine (DLE) for loop optimization
    dle_state = transform(nodes, *set_dle_mode(dle))

    # Update the Operator state based on the DLE
    self.dle_arguments = dle_state.arguments
    self.dle_flags = dle_state.flags
    # Register each elemental function under its name
    self.func_table.update(
        OrderedDict([(i.name, MetaCall(i, True))
                     for i in dle_state.elemental_functions]))
    parameters.extend([i.argument for i in self.dle_arguments])
    # Only DLE arguments that are Dimensions extend the dimension list
    self.dimensions.extend([
        i.argument for i in self.dle_arguments
        if isinstance(i.argument, Dimension)
    ])
    self._includes.extend(list(dle_state.includes))

    # Introduce the required symbol declarations
    nodes = iet_insert_C_decls(dle_state.nodes, self.func_table)

    # Initialise ArgumentEngine; from here on the parameters are the ones
    # it exposes
    self.argument_engine = ArgumentEngine(clusters.ispace, parameters,
                                          self.dle_arguments)
    parameters = self.argument_engine.arguments

    # Finish instantiation
    super(Operator, self).__init__(self.name, nodes, 'int', parameters, ())
def __init__(self, expressions, **kwargs):
    """
    Build the operator from one or more SymPy equations.

    The expressions are lowered and analysed, clustered by Stencil,
    optimized by the DSE, scheduled into a tree of Iterations, instrumented
    for profiling, optimized by the DLE, specialized for the backend and
    completed with C declarations before being handed to the parent
    constructor.

    :param expressions: a single ``sympy.Eq`` or a sequence of them.
    :param kwargs: optional settings read here: ``name`` (default
                   ``"Kernel"``), ``subs`` (default ``{}``), ``time_axis``
                   (default ``Forward``), ``dse`` and ``dle`` (defaults
                   taken from ``configuration``).
    :raises InvalidOperator: if any expression is not a ``sympy.Eq``.
    """
    expressions = as_tuple(expressions)

    # Input check
    if any(not isinstance(i, sympy.Eq) for i in expressions):
        raise InvalidOperator("Only SymPy expressions are allowed.")

    self.name = kwargs.get("name", "Kernel")
    subs = kwargs.get("subs", {})
    time_axis = kwargs.get("time_axis", Forward)
    dse = kwargs.get("dse", configuration['dse'])
    dle = kwargs.get("dle", configuration['dle'])

    # Header files, etc.
    self._headers = list(self._default_headers)
    self._includes = list(self._default_includes)
    self._globals = list(self._default_globals)

    # Required for compilation
    self._compiler = configuration['compiler']
    self._lib = None
    self._cfunction = None

    # Set the direction of time according to the given TimeAxis
    # NOTE(review): `time` is not bound in this method; presumably a
    # module-level time dimension shared by all operators -- confirm
    time.reverse = time_axis == Backward

    # Expression lowering: indexification, then user-provided substitutions
    expressions = [indexify(s) for s in expressions]
    expressions = [s.xreplace(subs) for s in expressions]

    # Analysis
    self.dtype = self._retrieve_dtype(expressions)
    self.input, self.output, self.dimensions = self._retrieve_symbols(expressions)
    stencils = self._retrieve_stencils(expressions)

    # Parameters of the Operator (Dimensions necessary for data casts);
    # only dimensions without a known size are included
    parameters = self.input + [i for i in self.dimensions if i.size is None]

    # Group expressions based on their Stencil
    clusters = clusterize(expressions, stencils)

    # Apply the Devito Symbolic Engine (DSE) for symbolic optimization
    clusters = rewrite(clusters, mode=set_dse_mode(dse))

    # Wrap expressions with Iterations according to dimensions
    nodes = self._schedule_expressions(clusters)

    # Introduce C-level profiling infrastructure
    nodes, self.profiler = self._profile_sections(nodes, parameters)

    # Resolve and substitute dimensions for loop index variables
    # (`subs` is rebound here; the user-provided substitutions were
    # already applied above)
    subs = {}
    nodes = ResolveIterationVariable().visit(nodes, subs=subs)
    nodes = SubstituteExpression(subs=subs).visit(nodes)

    # Apply the Devito Loop Engine (DLE) for loop optimization
    dle_state = transform(nodes, *set_dle_mode(dle))

    # Update the Operator state based on the DLE
    self.dle_arguments = dle_state.arguments
    self.dle_flags = dle_state.flags
    # Map each elemental function name to its metadata
    self.func_table = OrderedDict([(i.name, FunMeta(i, True))
                                   for i in dle_state.elemental_functions])
    parameters.extend([i.argument for i in self.dle_arguments])
    # Only DLE arguments that are Dimensions extend the dimension list
    self.dimensions.extend([i.argument for i in self.dle_arguments
                            if isinstance(i.argument, Dimension)])
    self._includes.extend(list(dle_state.includes))

    # Translate into backend-specific representation (e.g., GPU, Yask)
    nodes = self._specialize(dle_state.nodes, parameters)

    # Introduce all required C declarations
    nodes = self._insert_declarations(nodes)

    # Finish instantiation
    super(Operator, self).__init__(self.name, nodes, 'int', parameters, ())
def test_create_efuncs_complex(complex_function):
    """Check the C code produced by the DLE ``split`` mode on a complex kernel.

    The last Iteration of every iteration tree is re-tagged with its own
    index and marked ELEMENTAL, then the transformed kernel plus its
    elemental functions are rendered and compared with the expected text.
    """
    innermost = [tree[-1] for tree in retrieve_iteration_tree(complex_function)]
    tagged = [node._rebuild(properties=tagger(n)) for n, node in enumerate(innermost)]

    mapper = {}
    for before, after in zip(innermost, tagged):
        mapper[before] = after._rebuild(properties=(after.properties + (ELEMENTAL, )))
    function = Transformer(mapper).visit(complex_function)

    handle = transform(function, mode='split')
    block = List(body=[handle.nodes] + handle.efuncs)
    generated = str(block.ccode)

    # Make output compiler independent
    ignored = ('#pragma', '/*')
    kept = [line for line in generated.split('\n')
            if not any(marker in line for marker in ignored)]

    expected = (
        """void foo(float *restrict a_vec, float *restrict b_vec,"""
        """ float *restrict c_vec, float *restrict d_vec)
{
  for (int i = 0; i <= 3; i += 1)
  {
    f_0((float *)a,(float *)b,i_size,i,4,0);
    for (int j = 0; j <= 5; j += 1)
    {
      f_1((float *)a,(float *)b,(float *)c,(float *)d,i_size,j_size,k_size,i,j,7,0);
    }
    f_2((float *)a,(float *)b,i_size,i,4,0);
  }
}
void f_0(float *restrict a_vec, float *restrict b_vec,"""
        """ const int i_size, const int i, const int sf_M, const int sf_m)
{
  float (*restrict a) __attribute__ ((aligned (64))) = (float (*)) a_vec;
  float (*restrict b) __attribute__ ((aligned (64))) = (float (*)) b_vec;
  for (int s = sf_m; s <= sf_M; s += 1)
  {
    b[i] = a[i] + pow(b[i], 2) + 3;
  }
}
void f_1(float *restrict a_vec, float *restrict b_vec,"""
        """ float *restrict c_vec, float *restrict d_vec,"""
        """ const int i_size, const int j_size, const int k_size,"""
        """ const int i, const int j, const int kf_M, const int kf_m)
{
  float (*restrict a) __attribute__ ((aligned (64))) = (float (*)) a_vec;
  float (*restrict b) __attribute__ ((aligned (64))) = (float (*)) b_vec;
  float (*restrict c)[j_size] __attribute__ ((aligned (64))) = (float (*)[j_size]) c_vec;
  float (*restrict d)[j_size][k_size] __attribute__ ((aligned (64))) ="""
        """ (float (*)[j_size][k_size]) d_vec;
  for (int k = kf_m; k <= kf_M; k += 1)
  {
    a[i] = a[i]*b[i]*c[i][j]*d[i][j][k];
    a[i] = 4*(a[i] + c[i][j])*(b[i] + d[i][j][k]);
  }
}
void f_2(float *restrict a_vec, float *restrict b_vec,"""
        """ const int i_size, const int i, const int qf_M, const int qf_m)
{
  float (*restrict a) __attribute__ ((aligned (64))) = (float (*)) a_vec;
  float (*restrict b) __attribute__ ((aligned (64))) = (float (*)) b_vec;
  for (int q = qf_m; q <= qf_M; q += 1)
  {
    a[i] = 8.0F*a[i] + 6.0F/b[i];
  }
}""")
    assert '\n'.join(kept) == expected
def __init__(self, expressions, **kwargs):
    """
    Build the operator from one or more SymPy equations.

    The expressions are lowered, specialized and analysed, clustered and
    optimized by the DSE, lowered to an Iteration/Expression tree (IET),
    instrumented for profiling, specialized for the backend, optimized by
    the DLE and completed with declarations and casts. The parameters are
    derived from the final tree before it is handed to the parent
    constructor.

    :param expressions: a single ``sympy.Eq`` or a sequence of them.
    :param kwargs: optional settings read here: ``name`` (default
                   ``"Kernel"``), ``subs`` (default ``{}``), ``dse`` and
                   ``dle`` (defaults taken from ``configuration``).
    :raises InvalidOperator: if any expression is not a ``sympy.Eq``.
    """
    expressions = as_tuple(expressions)

    # Input check
    if any(not isinstance(i, sympy.Eq) for i in expressions):
        raise InvalidOperator("Only SymPy expressions are allowed.")

    self.name = kwargs.get("name", "Kernel")
    subs = kwargs.get("subs", {})
    dse = kwargs.get("dse", configuration['dse'])
    dle = kwargs.get("dle", configuration['dle'])

    # Header files, etc.
    self._headers = list(self._default_headers)
    self._includes = list(self._default_includes)
    self._globals = list(self._default_globals)

    # Required for compilation
    self._compiler = configuration['compiler']
    self._lib = None
    self._cfunction = None

    # References to local or external routines
    self.func_table = OrderedDict()

    # Expression lowering: indexification, substitution rules, specialization
    expressions = [indexify(i) for i in expressions]
    expressions = [i.xreplace(subs) for i in expressions]
    expressions = self._specialize_exprs(expressions)

    # Expression analysis
    self.input = filter_sorted(flatten(e.reads for e in expressions))
    self.output = filter_sorted(flatten(e.writes for e in expressions))
    self.dimensions = filter_sorted(flatten(e.dimensions for e in expressions))

    # Group expressions based on their iteration space and data dependences,
    # and apply the Devito Symbolic Engine (DSE) for flop optimization
    clusters = clusterize(expressions)
    clusters = rewrite(clusters, mode=set_dse_mode(dse))
    self._dtype, self._dspace = clusters.meta

    # Lower Clusters to an Iteration/Expression tree (IET)
    nodes = iet_build(clusters)

    # Introduce C-level profiling infrastructure
    nodes, self.profiler = self._profile_sections(nodes)

    # Translate into backend-specific representation (e.g., GPU, Yask)
    nodes = self._specialize_iet(nodes)

    # Apply the Devito Loop Engine (DLE) for loop optimization
    dle_state = transform(nodes, *set_dle_mode(dle))

    # Update the Operator state based on the DLE
    self.dle_args = dle_state.arguments
    self.dle_flags = dle_state.flags
    # Register each elemental function under its name
    self.func_table.update(OrderedDict([(i.name, MetaCall(i, True))
                                        for i in dle_state.elemental_functions]))
    # Only DLE arguments that are Dimensions extend the dimension list
    self.dimensions.extend([i.argument for i in self.dle_args
                            if isinstance(i.argument, Dimension)])
    self._includes.extend(list(dle_state.includes))

    # Introduce the required symbol declarations
    nodes = iet_insert_C_decls(dle_state.nodes, self.func_table)

    # Insert data and pointer casts for array parameters and profiling structs
    nodes = self._build_casts(nodes)

    # Derive parameters as symbols not defined in the kernel itself
    parameters = self._build_parameters(nodes)

    # Finish instantiation
    super(Operator, self).__init__(self.name, nodes, 'int', parameters, ())
def test_create_elemental_functions_complex(complex_function):
    """Check the C code produced by the DLE ``split`` mode on a complex kernel.

    The last Iteration of every iteration tree is re-tagged with its own
    index and marked ELEMENTAL, then the transformed kernel plus its
    elemental functions are rendered and compared with the expected text.
    """
    innermost = [tree[-1] for tree in retrieve_iteration_tree(complex_function)]
    tagged = [node._rebuild(properties=tagger(n)) for n, node in enumerate(innermost)]

    mapper = {}
    for before, after in zip(innermost, tagged):
        mapper[before] = after._rebuild(properties=(after.properties + (ELEMENTAL,)))
    function = Transformer(mapper).visit(complex_function)

    handle = transform(function, mode='split')
    block = List(body=handle.nodes + handle.elemental_functions)
    generated = str(block.ccode)

    # Make output compiler independent
    ignored = ('#pragma', '/*')
    kept = [line for line in generated.split('\n')
            if not any(marker in line for marker in ignored)]

    expected = (
        """void foo(float *restrict a_vec, float *restrict b_vec,"""
        """ float *restrict c_vec, float *restrict d_vec)
{
  float (*restrict a) __attribute__((aligned(64))) = (float (*)) a_vec;
  float (*restrict b) __attribute__((aligned(64))) = (float (*)) b_vec;
  float (*restrict c)[5] __attribute__((aligned(64))) = (float (*)[5]) c_vec;
  float (*restrict d)[5][7] __attribute__((aligned(64))) = (float (*)[5][7]) d_vec;
  for (int i = 0; i < 3; i += 1)
  {
    f_0(0,4,(float*)a,(float*)b,i);
    for (int j = 0; j < 5; j += 1)
    {
      f_1(0,7,(float*)a,(float*)b,(float*)c,(float*)d,i,j);
    }
    f_2(0,4,(float*)a,(float*)b,i);
  }
}
void f_0(const int s_start, const int s_finish,"""
        """ float *restrict a_vec, float *restrict b_vec, const int i)
{
  float (*restrict a) __attribute__((aligned(64))) = (float (*)) a_vec;
  float (*restrict b) __attribute__((aligned(64))) = (float (*)) b_vec;
  for (int s = s_start; s < s_finish; s += 1)
  {
    b[i] = a[i] + pow(b[i], 2) + 3;
  }
}
void f_1(const int k_start, const int k_finish,"""
        """ float *restrict a_vec, float *restrict b_vec,"""
        """ float *restrict c_vec, float *restrict d_vec, const int i, const int j)
{
  float (*restrict a) __attribute__((aligned(64))) = (float (*)) a_vec;
  float (*restrict b) __attribute__((aligned(64))) = (float (*)) b_vec;
  float (*restrict c)[5] __attribute__((aligned(64))) = (float (*)[5]) c_vec;
  float (*restrict d)[5][7] __attribute__((aligned(64))) = (float (*)[5][7]) d_vec;
  for (int k = k_start; k < k_finish; k += 1)
  {
    a[i] = a[i]*b[i]*c[i][j]*d[i][j][k];
    a[i] = 4*(a[i] + c[i][j])*(b[i] + d[i][j][k]);
  }
}
void f_2(const int q_start, const int q_finish,"""
        """ float *restrict a_vec, float *restrict b_vec, const int i)
{
  float (*restrict a) __attribute__((aligned(64))) = (float (*)) a_vec;
  float (*restrict b) __attribute__((aligned(64))) = (float (*)) b_vec;
  for (int q = q_start; q < q_finish; q += 1)
  {
    a[i] = 8.0F*a[i] + 6.0F/b[i];
  }
}""")
    assert '\n'.join(kept) == expected