def compile(self):
    """Generate the C source for this module, compile it and load the wrapper.

    The kernel code and the wrapper (``self._wrapper`` filled in with the
    result of ``self.generate_code()``) are embedded in a single translation
    unit, which is handed to ``compilation.load``. The loaded function is
    stored in ``self._fun``. Afterwards ``_args``, ``_kernel``, ``_itspace``
    and ``_direct`` are deleted from the instance.

    :returns: the loaded function object (also stored as ``self._fun``).
    :raises RuntimeError: if the instance has no ``_args`` attribute.
    """
    # If we weren't in the cache we /must/ have arguments
    if not hasattr(self, '_args'):
        raise RuntimeError(
            "JITModule has no args associated with it, should never happen"
        )

    compiler = coffee.system.compiler
    externc_open = '' if not self._kernel._cpp else 'extern "C" {'
    externc_close = '' if not self._kernel._cpp else '}'
    headers = "\n".join([compiler.get('vect_header', "")])
    kernel_subst = {'code': self._kernel.code(), 'header': headers}
    if any(arg._is_soa for arg in self._args):
        kernel_code = """
        #define OP2_STRIDE(a, idx) a[idx]
        %(header)s
        %(code)s
        #undef OP2_STRIDE
        """ % kernel_subst
    else:
        kernel_code = """
        %(header)s
        %(code)s
        """ % kernel_subst
    code_to_compile = strip(dedent(self._wrapper) % self.generate_code())

    code_to_compile = """
    #include <petsc.h>
    #include <stdbool.h>
    #include <math.h>
    #include <inttypes.h>
    %(sys_headers)s

    %(kernel)s

    %(externc_open)s
    %(wrapper)s
    %(externc_close)s
    """ % {
        'kernel': kernel_code,
        'wrapper': code_to_compile,
        'externc_open': externc_open,
        'externc_close': externc_close,
        'sys_headers': '\n'.join(self._kernel._headers + self._system_headers),
    }

    self._dump_generated_code(code_to_compile)
    if configuration["debug"]:
        self._wrapper_code = code_to_compile

    extension = self._extension
    petsc_dirs = get_petsc_dir()
    # Work on a copy: extending ``self._cppargs`` in place would make the
    # include/ISA flags pile up on the shared list on every compilation.
    cppargs = list(self._cppargs)
    cppargs += ["-I%s/include" % d for d in petsc_dirs]
    cppargs += ["-I%s" % d for d in self._kernel._include_dirs]
    cppargs += ["-I%s" % os.path.abspath(os.path.dirname(__file__))]
    if compiler:
        cppargs += [compiler[coffee.system.isa['inst_set']]]
    # ``ldargs`` starts from fresh list objects, so extending it is safe.
    ldargs = ["-L%s/lib" % d for d in petsc_dirs] + \
             ["-Wl,-rpath,%s/lib" % d for d in petsc_dirs] + \
             ["-lpetsc", "-lm"] + self._libraries
    ldargs += self._kernel._ldargs

    if self._kernel._cpp:
        extension = "cpp"
    self._fun = compilation.load(code_to_compile,
                                 extension,
                                 self._wrapper_name,
                                 cppargs=cppargs,
                                 ldargs=ldargs,
                                 argtypes=self._argtypes,
                                 restype=None,
                                 compiler=compiler.get('name'),
                                 comm=self.comm)
    # Blow away everything we don't need any more
    del self._args
    del self._kernel
    del self._itspace
    del self._direct
    return self._fun
def generate_code(self):
    """Build the substitution dictionary for the executor wrapper.

    The returned dict holds the wrapper-level entries (``wrapper_name``,
    ``executor_arg``, ``wrapper_args``, ``wrapper_decs``, ``rank``,
    ``region_flag``) plus ``user_code``, ``ssinds_arg`` and
    ``executor_code``, the latter being assembled from one kernel
    invocation snippet per loop in ``self._all_kernels``.

    :returns: a dict mapping template keys to generated code strings.
    """

    def _reindent(text, level):
        # Prefix every line but the first with ``level`` indentation units.
        return ('\n' + ' ' * level).join(text.split('\n'))

    def _prefetch_call(address):
        # Text of a single prefetch intrinsic invocation on ``address``.
        return '_mm_prefetch ((char*)(%s), _MM_HINT_T0)' % address

    meta = slope.Executor.meta

    # 1) Construct the wrapper arguments
    code_dict = {
        'wrapper_name': 'wrap_executor',
        'executor_arg': "%s %s" % (meta['ctype_exec'], meta['name_param_exec']),
        'wrapper_args': ', '.join([arg.c_wrapper_arg() for arg in self._args]),
        'wrapper_decs': _reindent(
            ';\n'.join([arg.c_wrapper_dec() for arg in self._args]), 1),
        'rank': ", %s %s" % (meta['ctype_rank'], meta['rank']),
        'region_flag': ", %s %s" % (meta['ctype_region_flag'], meta['region_flag']),
    }

    # 2) Construct the kernel invocations, one per loop
    bodies, user_snippets, ssind_decls = [], [], []
    loops = zip(self._all_kernels, self._all_itspaces, self._all_args)
    for idx, (kernel, it_space, args) in enumerate(loops):
        # ... bind the Executor's arguments to this kernel's arguments: the
        # first executor arg sharing both data and map becomes the reference
        bind_stmts = []
        for loop_arg in args:
            for exec_arg in self._args:
                if loop_arg.data is exec_arg.data and loop_arg.map is exec_arg.map:
                    loop_arg.ref_arg = exec_arg
                    break
            bind_stmts.append(loop_arg.c_arg_bindto())
        binding = ";\n".join(bind_stmts)

        # ... obtain the /code_dict/ as if the loop were not part of an
        # Executor, since bits of code generation can be reused
        loop_dict = sequential.JITModule(
            kernel, it_space, *args, delay=True).generate_code()

        # ... does the scatter use global or local maps ?
        if self._use_glb_maps:
            glb_map = self._executor.gtl_maps[idx]['DIRECT']
            loop_dict['index_expr'] = '%s[n]' % glb_map
            prefetch_var = 'int p = %s[n + %d]' % (glb_map, self._use_prefetch)
        else:
            prefetch_var = 'int p = n + %d' % self._use_prefetch

        # ... add prefetch intrinsics, if requested
        prefetch_maps, prefetch_vecs = '', ''
        if self._use_prefetch:
            map_entries = [a.c_map_entry('p') for a in args if a._is_indirect]
            # can save some instructions since prefetching targets chunks of 32 bytes
            map_entries = flatten([entry for entry in pm if pm.index(entry) % 2 == 0]
                                  for pm in map_entries)
            # drop repeated entries while keeping first-seen order
            map_entries = list(OrderedDict.fromkeys(map_entries))
            map_stmts = [prefetch_var]
            map_stmts += [_prefetch_call('&(%s)' % entry) for entry in map_entries]
            prefetch_maps = ';\n'.join(map_stmts)
            vec_entries = flatten(a.c_vec_entry('p', True) for a in args
                                  if a._is_indirect)
            prefetch_vecs = ';\n'.join([_prefetch_call(entry) for entry in vec_entries])
        loop_dict['prefetch_maps'] = prefetch_maps
        loop_dict['prefetch_vecs'] = prefetch_vecs

        # ... build the subset indirection array, if necessary
        ssind_arg, ssind_decl = '', ''
        if loop_dict['ssinds_arg']:
            ssind_arg = 'ssinds_%d' % idx
            ssind_decl = 'int* %s' % ssind_arg
            loop_dict['index_expr'] = '%s[n]' % ssind_arg

        # ... use the proper function name (the function name of the kernel
        # within *this* specific loop chain)
        loop_dict['kernel_name'] = kernel._function_names[self._kernel.cache_key]

        # ... finish building up the per-loop dictionary
        loop_dict['args_binding'] = binding
        loop_dict['tile_init'] = self._executor.c_loop_init[idx]
        loop_dict['tile_finish'] = self._executor.c_loop_end[idx]
        loop_dict['tile_start'] = meta['tile_start']
        loop_dict['tile_end'] = meta['tile_end']
        tile_iter = '%s[n]' % self._executor.gtl_maps[idx]['DIRECT']
        if ssind_arg:
            # route the tile iteration through the subset indirection array
            tile_iter = '%s[%s]' % (ssind_arg, tile_iter)
        loop_dict['tile_iter'] = tile_iter

        # ... collect the rest, i.e., body, user code, ...
        bodies.append(strip(TilingJITModule._kernel_wrapper % loop_dict))
        user_snippets.append(kernel._user_code)
        ssind_decls.append(ssind_decl)

    chain_body = _reindent("\n\n".join(bodies), 2)
    code_dict['user_code'] = _reindent("\n".join(user_snippets), 1)
    code_dict['ssinds_arg'] = "".join(["%s," % decl for decl in ssind_decls if decl])
    code_dict['executor_code'] = _reindent(self._executor.c_code(chain_body), 1)

    return code_dict
def compile(self):
    """Generate the C source for this module, compile it and load the wrapper.

    The kernel code and the wrapper (``self._wrapper`` filled in with the
    result of ``self.generate_code()``) are placed in one translation unit
    and passed to ``compilation.load``; the loaded function is stored in
    ``self._fun``. Once loaded, ``_args``, ``_kernel``, ``_itspace`` and
    ``_direct`` are deleted from the instance.

    :returns: the loaded function object (also stored as ``self._fun``).
    :raises RuntimeError: if the instance has no ``_args`` attribute.
    """
    # If we weren't in the cache we /must/ have arguments
    if not hasattr(self, '_args'):
        raise RuntimeError("JITModule has no args associated with it, should never happen")

    compiler = coffee.system.compiler
    externc_open = '' if not self._kernel._cpp else 'extern "C" {'
    externc_close = '' if not self._kernel._cpp else '}'
    headers = "\n".join([compiler.get('vect_header', "")])
    kernel_subst = {'code': self._kernel.code(), 'header': headers}
    if any(arg._is_soa for arg in self._args):
        kernel_code = """
        #define OP2_STRIDE(a, idx) a[idx]
        %(header)s
        %(code)s
        #undef OP2_STRIDE
        """ % kernel_subst
    else:
        kernel_code = """
        %(header)s
        %(code)s
        """ % kernel_subst
    code_to_compile = strip(dedent(self._wrapper) % self.generate_code())

    code_to_compile = """
    #include <petsc.h>
    #include <stdbool.h>
    #include <math.h>
    %(sys_headers)s

    %(kernel)s

    %(externc_open)s
    %(wrapper)s
    %(externc_close)s
    """ % {'kernel': kernel_code,
           'wrapper': code_to_compile,
           'externc_open': externc_open,
           'externc_close': externc_close,
           'sys_headers': '\n'.join(self._kernel._headers + self._system_headers)}

    self._dump_generated_code(code_to_compile)
    if configuration["debug"]:
        self._wrapper_code = code_to_compile

    extension = self._extension
    petsc_dirs = get_petsc_dir()
    # Copy before extending: ``+=`` on the list held in ``self._cppargs``
    # would grow that shared list with duplicate flags on each compilation.
    cppargs = list(self._cppargs)
    cppargs += ["-I%s/include" % d for d in petsc_dirs]
    cppargs += ["-I%s" % d for d in self._kernel._include_dirs]
    cppargs += ["-I%s" % os.path.abspath(os.path.dirname(__file__))]
    if compiler:
        cppargs += [compiler[coffee.system.isa['inst_set']]]
    # The concatenation below yields a new list, so ``+=`` on it is local.
    ldargs = ["-L%s/lib" % d for d in petsc_dirs] + \
             ["-Wl,-rpath,%s/lib" % d for d in petsc_dirs] + \
             ["-lpetsc", "-lm"] + self._libraries
    ldargs += self._kernel._ldargs

    if self._kernel._cpp:
        extension = "cpp"
    self._fun = compilation.load(code_to_compile,
                                 extension,
                                 self._wrapper_name,
                                 cppargs=cppargs,
                                 ldargs=ldargs,
                                 argtypes=self._argtypes,
                                 restype=None,
                                 compiler=compiler.get('name'),
                                 comm=self.comm)
    # Blow away everything we don't need any more
    del self._args
    del self._kernel
    del self._itspace
    del self._direct
    return self._fun