def _specialize_iet(self, iet, **kwargs):
    """
    Rewrite `iet` for the OPS backend: collect the symbols accessed by the
    affine iteration trees and turn them into ops_dat datasets, generate one
    OPS kernel per offloadable tree, and wrap the IET with the OPS runtime
    init/partition/exit calls.

    Returns a List node containing the OPS-augmented body.
    """
    warning("The OPS backend is still work-in-progress")

    # `find_affine_trees` is pure w.r.t. `iet`; compute it once instead of
    # twice (it was previously re-evaluated inside the kernel loop below)
    affine_trees = find_affine_trees(iet).items()

    # No affine trees -> nothing can be offloaded to OPS; also avoids an
    # IndexError on `dims[0]` further down
    if not affine_trees:
        return iet

    ops_init = Call(namespace['ops_init'], [0, 0, 2])
    ops_partition = Call(namespace['ops_partition'], Literal('""'))
    ops_exit = Call(namespace['ops_exit'])

    ops_block = OpsBlock('block')

    # Extract all symbols that need to be converted to ops_dat
    dims = []
    to_dat = set()
    for section, trees in affine_trees:
        dims.append(len(trees[0].dimensions))
        symbols = set(FindSymbols('symbolics').visit(trees[0].root))
        symbols -= set(FindSymbols('defines').visit(trees[0].root))
        to_dat |= symbols

    # FIX: the previous `assert (d == dims[0] for d in dims)` asserted a
    # generator object, which is always truthy, so the check never fired.
    # `all(...)` enforces the intended invariant.
    assert all(d == dims[0] for d in dims), \
        ("The OPS backend currently assumes that all kernels "
         "have the same number of dimensions")

    # To ensure deterministic code generation we order the datasets to
    # be generated (since a set is an unordered collection)
    to_dat = filter_sorted(to_dat)

    name_to_ops_dat = {}
    pre_time_loop = []
    for f in to_dat:
        if f.is_Constant:
            # Constants are passed as-is; no ops_dat wrapper needed
            continue
        pre_time_loop.extend(create_ops_dat(f, name_to_ops_dat, ops_block))

    # Generate one OPS kernel per offloadable iteration tree
    for n, (section, trees) in enumerate(affine_trees):
        pre_loop, ops_kernel = opsit(trees, n)
        pre_time_loop.extend(pre_loop)
        self._ops_kernels.append(ops_kernel)

    ops_block_init = Expression(ClusterizedEq(Eq(
        ops_block,
        namespace['ops_decl_block'](dims[0], Literal('"block"')))))

    self._headers.append(namespace['ops_define_dimension'](dims[0]))
    self._includes.append('stdio.h')

    body = [ops_init, ops_block_init, *pre_time_loop, ops_partition, iet,
            ops_exit]

    return List(body=body)
def make_efunc(name, iet, dynamic_parameters=None, retval='void', prefix='static'):
    """
    Create an ElementalFunction from (a sequence of) perfectly nested Iterations.
    """
    # Arrays written within `iet` are by definition (vector) temporaries, so
    # they can be declared and allocated inside the generated efunc itself
    written_arrays = []
    for expr in FindNodes(Expression).visit(iet):
        if expr.write.is_Array:
            written_arrays.append(expr.write)

    # Any other tensorial symbol must be supplied by the caller
    externals = [s for s in FindSymbols().visit(iet)
                 if s.is_Tensor and s not in written_arrays]

    # Prepend array casts for the externally-supplied tensors
    iet = List(body=[ArrayCast(s) for s in externals] + [iet])

    # Insert the C declarations for the locally-allocated temporaries
    iet = iet_insert_C_decls(iet, externals)

    # The Callable parameters exclude anything allocated locally
    params = [p for p in derive_parameters(iet) if p not in written_arrays]

    return ElementalFunction(name, iet, retval, params, prefix, dynamic_parameters)
def _specialize_iet(self, iet, **kwargs):
    """
    Rewrite `iet` for the OPS backend: offload the affine iteration trees to
    generated OPS kernels (registered in `self._func_table`), declare the OPS
    block and datasets, and surround the IET with the OPS runtime
    init/partition/fetch/exit calls.

    Returns a List node containing the OPS-augmented body, or `iet` unchanged
    if there is nothing to offload.
    """
    warning("The OPS backend is still work-in-progress")

    affine_trees = find_affine_trees(iet).items()

    # If there are no affine trees, there is no loop to be optimized using OPS
    if not affine_trees:
        return iet

    ops_init = Call(namespace['ops_init'], [0, 0, 2])
    ops_partition = Call(namespace['ops_partition'], Literal('""'))
    ops_exit = Call(namespace['ops_exit'])

    # Extract all symbols that need to be converted to ops_dat
    dims = []
    to_dat = set()
    for _, tree in affine_trees:
        dims.append(len(tree[0].dimensions))
        symbols = set(FindSymbols('symbolics').visit(tree[0].root))
        symbols -= set(FindSymbols('defines').visit(tree[0].root))
        to_dat |= symbols

    # FIX: the previous `assert (d == dims[0] for d in dims)` asserted a
    # generator object, which is always truthy, so the check never fired.
    # Also moved before the first use of `dims[0]`.
    assert all(d == dims[0] for d in dims), \
        ("The OPS backend currently assumes that all kernels "
         "have the same number of dimensions")

    # Create the OPS block for this problem
    ops_block = OpsBlock('block')
    ops_block_init = Expression(ClusterizedEq(Eq(
        ops_block,
        namespace['ops_decl_block'](dims[0], Literal('"block"')))))

    # To ensure deterministic code generation we order the datasets to
    # be generated (since a set is an unordered collection)
    to_dat = filter_sorted(to_dat)

    name_to_ops_dat = {}
    pre_time_loop = []
    after_time_loop = []
    for f in to_dat:
        if f.is_Constant:
            continue
        # `.extend()` accepts any iterable; the previous `list()` wrapper
        # was redundant
        pre_time_loop.extend(create_ops_dat(f, name_to_ops_dat, ops_block))
        # To return the result to Devito, it is necessary to copy the data
        # from the dat object back to the CPU memory
        after_time_loop.extend(create_ops_fetch(
            f, name_to_ops_dat, self.time_dimension.extreme_max))

    # Generate ops kernels for each offloadable iteration tree
    mapper = {}
    for n, (_, tree) in enumerate(affine_trees):
        pre_loop, ops_kernel, ops_par_loop_call = opsit(
            tree, n, name_to_ops_dat, ops_block, dims[0])
        pre_time_loop.extend(pre_loop)
        self._func_table[namespace['ops_kernel_file'](ops_kernel.name)] = \
            MetaCall(ops_kernel, False)
        mapper[tree[0].root] = ops_par_loop_call
        mapper.update({i.root: mapper.get(i.root) for i in tree})  # Drop trees

    iet = Transformer(mapper).visit(iet)

    self._headers.append(namespace['ops_define_dimension'](dims[0]))
    self._includes.extend(['stdio.h', 'ops_seq.h'])

    body = [ops_init, ops_block_init, *pre_time_loop, ops_partition, iet,
            *after_time_loop, ops_exit]

    return List(body=body)
def make_ops_kernels(iet):
    """
    Offload the affine iteration trees of `iet` to generated OPS kernels.

    Returns a 2-tuple:
        * the transformed IET, rebuilt with the OPS runtime
          init/partition/fetch/exit calls around the original body;
        * a metadata dict with keys 'includes', 'ffuncs' (the generated
          kernel Callables) and 'headers' — empty if nothing was offloaded.
    """
    warning("The OPS backend is still work-in-progress")

    affine_trees = find_affine_trees(iet).items()

    # If there are no affine trees, there is no loop to be optimized using OPS
    if not affine_trees:
        return iet, {}

    ops_init = Call(namespace['ops_init'], [0, 0, 2])
    ops_partition = Call(namespace['ops_partition'], Literal('""'))
    ops_exit = Call(namespace['ops_exit'])

    # Extract all symbols that need to be converted to ops_dat
    dims = []
    to_dat = set()
    for _, tree in affine_trees:
        dims.append(len(tree[0].dimensions))
        symbols = set(FindSymbols('symbolics').visit(tree[0].root))
        symbols -= set(FindSymbols('defines').visit(tree[0].root))
        to_dat |= symbols

    # FIX: the previous `assert (d == dims[0] for d in dims)` asserted a
    # generator object, which is always truthy, so the check never fired.
    # Also moved before the first use of `dims[0]`.
    assert all(d == dims[0] for d in dims), \
        ("The OPS backend currently assumes that all kernels "
         "have the same number of dimensions")

    # Create the OPS block for this problem
    ops_block = OpsBlock('block')
    ops_block_init = Expression(ClusterizedEq(Eq(
        ops_block,
        namespace['ops_decl_block'](dims[0], Literal('"block"')))))

    # To ensure deterministic code generation we order the datasets to
    # be generated (since a set is an unordered collection)
    to_dat = filter_sorted(to_dat)

    name_to_ops_dat = {}
    pre_time_loop = []
    after_time_loop = []
    for f in to_dat:
        if f.is_Constant:
            continue
        # `.extend()` accepts any iterable; the previous `list()` wrapper
        # was redundant
        pre_time_loop.extend(create_ops_dat(f, name_to_ops_dat, ops_block))
        # Copy data from device to host
        after_time_loop.extend(create_ops_fetch(
            f, name_to_ops_dat, f.grid.time_dim.extreme_max))

    # Generate ops kernels for each offloadable iteration tree
    mapper = {}
    ffuncs = []
    for n, (_, tree) in enumerate(affine_trees):
        pre_loop, ops_kernel, ops_par_loop_call = opsit(
            tree, n, name_to_ops_dat, ops_block, dims[0])
        pre_time_loop.extend(pre_loop)
        ffuncs.append(ops_kernel)
        mapper[tree[0].root] = ops_par_loop_call
        mapper.update({i.root: mapper.get(i.root) for i in tree})  # Drop trees

    iet = Transformer(mapper).visit(iet)

    iet = iet._rebuild(body=flatten([
        ops_init, ops_block_init, pre_time_loop, ops_partition, iet.body,
        after_time_loop, ops_exit
    ]))

    return iet, {
        'includes': ['stdio.h', 'ops_seq.h'],
        'ffuncs': ffuncs,
        'headers': [namespace['ops_define_dimension'](dims[0])]
    }
def _make_thread_func(name, iet, root, threads, sregistry):
    """
    Lower `iet` into the body of a child thread function.

    Returns a 3-tuple:
        * tfunc: a ThreadFunction with a pthread-compatible signature
          (exactly one `void*` argument, `void*` return) that spins on the
          SharedData flag and executes `iet` when activated;
        * isdata: a Callable initializing the SharedData struct with the
          values known at thread-creation time;
        * sdata: the SharedData object itself.
    """
    sid = SharedData._symbolic_id
    sdeviceid = SharedData._symbolic_deviceid

    # Create the SharedData, that is the data structure that will be used by the
    # main thread to pass information down to the child thread(s)
    required, parameters, dynamic_parameters = diff_parameters(iet, root, [sid])
    parameters = sorted(parameters, key=lambda i: i.is_Function)  # Allow casting
    sdata = SharedData(name=sregistry.make_name(prefix='sdata'),
                       npthreads=threads.size, fields=required,
                       dynamic_fields=dynamic_parameters)

    # Create a Callable to initialize `sdata` with the known const values
    sbase = sdata.symbolic_base
    iname = 'init_%s' % sdata.dtype._type_.__name__
    ibody = [DummyExpr(FieldFromPointer(i._C_name, sbase), i._C_symbol)
             for i in parameters]
    ibody.extend([
        BlankLine,
        DummyExpr(FieldFromPointer(sdata._field_id, sbase), sid),
        DummyExpr(FieldFromPointer(sdata._field_deviceid, sbase), sdeviceid),
        # Flag semantics, as implied by the checks below: 0 => terminate,
        # 2 => work available; 1 presumably means idle/alive — confirm
        # against the main-thread side of the protocol
        DummyExpr(FieldFromPointer(sdata._field_flag, sbase), 1)
    ])
    iparameters = parameters + [sdata, sid, sdeviceid]
    isdata = Callable(iname, ibody, 'void', iparameters, 'static')

    # Prepend the SharedData fields available upon thread activation
    preactions = [DummyExpr(i, FieldFromPointer(i.name, sbase))
                  for i in dynamic_parameters]

    # Append the flag reset (back to 1, i.e. ready for the next activation)
    postactions = [List(body=[
        BlankLine,
        DummyExpr(FieldFromPointer(sdata._field_flag, sbase), 1)
    ])]

    iet = List(body=preactions + [iet] + postactions)

    # The thread has work to do when it receives the signal that all locks have
    # been set to 0 by the main thread
    iet = Conditional(CondEq(FieldFromPointer(sdata._field_flag, sbase), 2), iet)

    # The thread keeps spinning until the alive flag is set to 0 by the main thread
    iet = WhileAlive(CondNe(FieldFromPointer(sdata._field_flag, sbase), 0), iet)

    # pthread functions expect exactly one argument, a void*, and must return void*
    tretval = 'void*'
    tparameter = VoidPointer('_%s' % sdata.name)

    # Unpack `sdata`: cast the incoming void* back to the SharedData type and
    # pull each parameter out of the struct into a local
    symbol_names = {i.name for i in FindSymbols('free-symbols').visit(iet)}
    unpack = [PointerCast(sdata, tparameter), BlankLine]
    for i in parameters:
        if i.is_AbstractFunction:
            unpack.append(Dereference(i, sdata))
            # Only cast functions whose name is actually referenced in `iet`
            if i.name in symbol_names:
                unpack.append(PointerCast(i))
        else:
            unpack.append(DummyExpr(i, FieldFromPointer(i.name, sbase)))
    unpack.append(DummyExpr(sid, FieldFromPointer(sdata._field_id, sbase)))
    unpack.append(DummyExpr(sdeviceid, FieldFromPointer(sdata._field_deviceid, sbase)))
    unpack.append(BlankLine)

    iet = List(body=unpack + [iet, BlankLine, Return(Macro('NULL'))])

    tfunc = ThreadFunction(name, iet, tretval, tparameter, 'static')

    return tfunc, isdata, sdata