def compile_str(module_name, src_code, location=None, include_dirs=[],
                lib_dirs=[], libs=[], preargs=[], rpaths=rpath_defaults,
                py_module=True, hide_symbols=True):
    """
    Compile `src_code` with nvcc into a shared library and optionally
    import the result as a Python module.

    Parameters
    ----------
    module_name: str
         This has been embedded in the src_code.
    src_code
        A complete c or c++ source listing for the module.
    location
        A pre-existing filesystem directory where the cpp file and .so will
        be written.
    include_dirs
        A list of include directory names (each gets prefixed with -I).
    lib_dirs
        A list of library search path directory names (each gets prefixed
        with -L).
    libs
        A list of libraries to link with (each gets prefixed with -l).
    preargs
        A list of extra compiler arguments. May be None. The list passed
        in is never modified; the function works on a private copy.
    rpaths
        List of rpaths to use with Xlinker. Defaults to `rpath_defaults`.
    py_module
        If False, compile to a shared library, but do not import as a
        Python module.
    hide_symbols
        If True (the default), hide all symbols from the library symbol
        table unless explicitly exported.

    Returns
    -------
    module
        Dynamically-imported python module of the compiled code.
        (unless py_module is False, in that case returns None.)

    Raises
    ------
    Exception
        If nvcc exits with a non-zero return status. The numbered source
        listing and the filtered compiler output are printed first.

    Notes
    -----
    On Windows 7 with nvcc 3.1 we need to compile in the real directory
    Otherwise nvcc never finish.

    The source is written to ``mod.cu`` inside `location`, and nvcc is run
    with `location` as its working directory; the previous working
    directory is restored afterwards.

    """
    # Remove empty string directory
    include_dirs = [d for d in include_dirs if d]
    lib_dirs = [d for d in lib_dirs if d]

    rpaths = list(rpaths)

    # Take a private copy *before* any filtering, so that neither the
    # caller's list nor the shared default argument is ever mutated, and
    # so that preargs=None is accepted on every platform.
    preargs = [] if preargs is None else list(preargs)
    if sys.platform == "win32":
        # Remove some compilation args that cl.exe does not understand.
        # cl.exe is the compiler used by nvcc on Windows.
        cl_unsupported = ("-Wno-write-strings", "-Wno-unused-label",
                          "-Wno-unused-variable", "-fno-math-errno")
        preargs = [pa for pa in preargs if pa not in cl_unsupported]
    if sys.platform != 'win32':
        preargs.append('-fPIC')
    if config.cmodule.remove_gxx_opt:
        preargs = [p for p in preargs if not p.startswith('-O')]

    cuda_root = config.cuda.root

    # The include dirs given by the user should have precedence over
    # the standard ones.
    include_dirs = include_dirs + std_include_dirs()
    this_dir = os.path.abspath(os.path.split(__file__)[0])
    if this_dir not in include_dirs:
        include_dirs.append(this_dir)

    libs = libs + std_libs()
    if 'cudart' not in libs:
        libs.append('cudart')

    lib_dirs = lib_dirs + std_lib_dirs()

    if sys.platform != 'darwin':
        # config.dnn.include_path add this by default for cudnn in the
        # new back-end. This should not be used in this back-end. So
        # just remove them.
        cuda_lib_dirs = (os.path.join(cuda_root, 'lib'),
                         os.path.join(cuda_root, 'lib64'))
        lib_dirs = [ld for ld in lib_dirs if ld not in cuda_lib_dirs]

        # sometimes, the linker cannot find -lpython so we need to tell it
        # explicitly where it is located
        # this returns somepath/lib/python2.x
        python_lib = distutils.sysconfig.get_python_lib(plat_specific=1,
                                                        standard_lib=1)
        python_lib = os.path.dirname(python_lib)
        if python_lib not in lib_dirs:
            lib_dirs.append(python_lib)

    cppfilename = os.path.join(location, 'mod.cu')
    with open(cppfilename, 'w') as cppfile:
        _logger.debug('Writing module C++ code to %s', cppfilename)
        cppfile.write(src_code)

    lib_filename = os.path.join(
        location, '%s.%s' % (module_name, get_lib_extension()))

    _logger.debug('Generating shared lib %s', lib_filename)
    # TODO: Why do these args cause failure on gtx285 that has 1.3
    # compute capability? '--gpu-architecture=compute_13',
    # '--gpu-code=compute_13',

    # Split the extra arguments in two groups: the ones handed to nvcc
    # directly (preargs1) and the ones forwarded to the host compiler
    # through -Xcompiler (preargs2).
    nvcc_prefixes = (
        '-O', '-arch=', '-ccbin=', '-G', '-g', '-I', '-L',
        '--fmad', '--ftz', '--maxrregcount',
        '--prec-div', '--prec-sqrt', '--use_fast_math',
        '-fmad', '-ftz', '-maxrregcount',
        '-prec-div', '-prec-sqrt', '-use_fast_math',
        '--use-local-env', '--cl-version=',
    )
    preargs1 = []
    preargs2 = []
    for pa in preargs:
        if pa.startswith('-Wl,'):
            # the -rpath option is not understood by the Microsoft linker
            if sys.platform != 'win32' or not pa.startswith('-Wl,-rpath'):
                preargs1.extend(['-Xlinker', pa[4:]])
        elif pa.startswith(nvcc_prefixes):
            preargs1.append(pa)
        else:
            preargs2.append(pa)

    # Don't put -G by default, as it slow things down.
    # We aren't sure if -g slow things down, so we don't put it by default.
    cmd = [nvcc_path, '-shared'] + preargs1
    if config.nvcc.compiler_bindir:
        cmd.extend(['--compiler-bindir', config.nvcc.compiler_bindir])

    if sys.platform == 'win32':
        # add flags for Microsoft compiler to create .pdb files
        preargs2.extend(['/Zi', '/MD'])
        cmd.extend(['-Xlinker', '/DEBUG'])
        # remove the complaints for the duplication of `double round(double)`
        # in both math_functions.h and pymath.h,
        # by not including the one in pymath.h
        cmd.extend(['-D HAVE_ROUND'])
    elif hide_symbols:
        preargs2.append('-fvisibility=hidden')

    if local_bitwidth() == 64:
        cmd.append('-m64')
    else:
        cmd.append('-m32')

    if preargs2:
        cmd.extend(['-Xcompiler', ','.join(preargs2)])

    # We should not use rpath if possible. If the user provided
    # a cuda.root flag, we need to add one, but
    # otherwise, we don't add it. See gh-1540 and
    # https://wiki.debian.org/RpathIssue for details.
    if (not type(config.cuda).root.is_default and
            os.path.exists(os.path.join(config.cuda.root, 'lib'))):
        rpaths.append(os.path.join(config.cuda.root, 'lib'))
        if sys.platform != 'darwin':
            # the CUDA libs are universal (contain both 32-bit and 64-bit)
            rpaths.append(os.path.join(config.cuda.root, 'lib64'))
    if sys.platform != 'win32':
        # the -rpath option is not understood by the Microsoft linker
        for rpath in rpaths:
            cmd.extend(['-Xlinker', ','.join(['-rpath', rpath])])
    cmd.extend('-I%s' % idir for idir in include_dirs)
    cmd.extend(['-o', lib_filename])
    # Only the file name is passed: nvcc is run from inside `location`.
    cmd.append(os.path.split(cppfilename)[-1])
    cmd.extend(['-L%s' % ldir for ldir in lib_dirs])
    cmd.extend(['-l%s' % lib for lib in libs])
    if sys.platform == 'darwin':
        # This tells the compiler to use the already-loaded python
        # symbols (which should always be the right ones).
        cmd.extend(['-Xcompiler', '-undefined,dynamic_lookup'])

    # Remove "-u Symbol" arguments, since they are usually not
    # relevant for the new compilation, even if they were used for
    # compiling python.  If they are necessary, the nvcc syntax is
    # "-U Symbol" with a capital U.
    # Slice deletion also copes with a dangling '-u' at the very end of
    # the command, which has no argument to remove.
    while '-u' in cmd:
        u_index = cmd.index('-u')
        del cmd[u_index:u_index + 2]

    # CUDA Toolkit v4.1 Known Issues:
    # Host linker on Mac OS 10.7 (and 10.6 for me) passes -no_pie option
    # to nvcc this option is not recognized and generates an error
    # http://stackoverflow.com/questions/9327265/nvcc-unknown-option-no-pie
    # Passing -Xlinker -pie stops -no_pie from getting passed
    # NOTE(review): if nvcc_version is a string this comparison is
    # lexicographic (e.g. '10.0' < '4.1') -- confirm the intended
    # behaviour for two-digit major versions before changing it.
    if sys.platform == 'darwin' and nvcc_version >= '4.1':
        cmd.extend(['-Xlinker', '-pie'])

    # cmd.append("--ptxas-options=-v") #uncomment this to see
    # register and shared-mem requirements
    _logger.debug('Running cmd %s', ' '.join(cmd))
    orig_dir = os.getcwd()
    try:
        os.chdir(location)
        p = subprocess.Popen(
            cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        nvcc_stdout_raw, nvcc_stderr_raw = p.communicate()[:2]
        console_encoding = getpreferredencoding()
        nvcc_stdout = decode_with(nvcc_stdout_raw, console_encoding)
        nvcc_stderr = decode_with(nvcc_stderr_raw, console_encoding)
    finally:
        os.chdir(orig_dir)

    # Log the compiler diagnostics, skipping a few known-noisy messages.
    for eline in nvcc_stderr.split('\n'):
        if not eline:
            continue
        if 'skipping incompatible' in eline:
            # ld is skipping an incompatible library
            continue
        if 'declared but never referenced' in eline:
            continue
        if 'statement is unreachable' in eline:
            continue
        _logger.info("NVCC: %s", eline)

    if p.returncode:
        # Dump the numbered source so that compiler messages can be
        # matched to lines.
        for lineno, src_line in enumerate(src_code.split('\n')):
            print(lineno + 1, src_line, file=sys.stderr)
        print('===============================', file=sys.stderr)
        # filter the output from the compiler
        for err_line in nvcc_stderr.split('\n'):
            if not err_line:
                continue
            # filter out the annoying declaration warnings
            colon = err_line.find(':')
            if colon >= 0 and err_line[colon:].startswith(
                    (': warning: variable', ': warning: label')):
                continue
            print(err_line, file=sys.stderr)
        print(nvcc_stdout)
        print(cmd)
        raise Exception('nvcc return status', p.returncode,
                        'for cmd', ' '.join(cmd))
    elif config.cmodule.compilation_warning and nvcc_stdout:
        print(nvcc_stdout)

    # On Windows, nvcc print useless stuff by default
    if sys.platform != 'win32' and nvcc_stdout:
        # this doesn't happen to my knowledge
        print("DEBUG: nvcc STDOUT", nvcc_stdout, file=sys.stderr)

    if py_module:
        # touch the __init__ file
        with open(os.path.join(location, "__init__.py"), 'w'):
            pass
        return dlimport(lib_filename)
def compile_str(
        module_name, src_code,
        location=None, include_dirs=[], lib_dirs=[], libs=[], preargs=[],
        rpaths=rpath_defaults, py_module=True, hide_symbols=True):
    """
    Compile `src_code` with nvcc into a shared library and optionally
    import the result as a Python module.

    Parameters
    ----------
    module_name: str
         This has been embedded in the src_code.
    src_code
        A complete c or c++ source listing for the module.
    location
        A pre-existing filesystem directory where the cpp file and .so will
        be written.
    include_dirs
        A list of include directory names (each gets prefixed with -I).
    lib_dirs
        A list of library search path directory names (each gets prefixed
        with -L).
    libs
        A list of libraries to link with (each gets prefixed with -l).
    preargs
        A list of extra compiler arguments. May be None. The list passed
        in is never modified; the function works on a private copy.
    rpaths
        List of rpaths to use with Xlinker. Defaults to `rpath_defaults`.
    py_module
        If False, compile to a shared library, but do not import as a
        Python module.
    hide_symbols
        If True (the default), hide all symbols from the library symbol
        table unless explicitly exported.

    Returns
    -------
    module
        Dynamically-imported python module of the compiled code.
        (unless py_module is False, in that case returns None.)

    Raises
    ------
    Exception
        If nvcc exits with a non-zero return status. The numbered source
        listing and the filtered compiler output are printed first.

    Notes
    -----
    On Windows 7 with nvcc 3.1 we need to compile in the real directory
    Otherwise nvcc never finish.

    The source is written to ``mod.cu`` inside `location`, and nvcc is run
    with `location` as its working directory; the previous working
    directory is restored afterwards.

    """
    # Remove empty string directory
    include_dirs = [d for d in include_dirs if d]
    lib_dirs = [d for d in lib_dirs if d]

    rpaths = list(rpaths)

    # Copy first, filter second: the caller's list (or the shared default
    # argument) must never be mutated, and preargs=None has to work on
    # win32 as well.
    if preargs is None:
        preargs = []
    else:
        preargs = list(preargs)
    if sys.platform == "win32":
        # Remove some compilation args that cl.exe does not understand.
        # cl.exe is the compiler used by nvcc on Windows.
        rejected_by_cl = ("-Wno-write-strings", "-Wno-unused-label",
                          "-Wno-unused-variable", "-fno-math-errno")
        preargs = [pa for pa in preargs if pa not in rejected_by_cl]
    else:
        preargs.append('-fPIC')
    if config.cmodule.remove_gxx_opt:
        preargs = [p for p in preargs if not p.startswith('-O')]

    cuda_root = config.cuda.root

    # The include dirs given by the user should have precedence over
    # the standard ones.
    include_dirs = include_dirs + std_include_dirs()
    module_dir = os.path.abspath(os.path.split(__file__)[0])
    if module_dir not in include_dirs:
        include_dirs.append(module_dir)

    libs = libs + std_libs()
    if 'cudart' not in libs:
        libs.append('cudart')

    lib_dirs = lib_dirs + std_lib_dirs()

    if sys.platform != 'darwin':
        # config.dnn.include_path add this by default for cudnn in the
        # new back-end. This should not be used in this back-end. So
        # just remove them.
        excluded_dirs = (os.path.join(cuda_root, 'lib'),
                         os.path.join(cuda_root, 'lib64'))
        lib_dirs = [ld for ld in lib_dirs if ld not in excluded_dirs]

        # sometimes, the linker cannot find -lpython so we need to tell it
        # explicitly where it is located
        # this returns somepath/lib/python2.x
        python_lib = distutils.sysconfig.get_python_lib(plat_specific=1,
                                                        standard_lib=1)
        python_lib = os.path.dirname(python_lib)
        if python_lib not in lib_dirs:
            lib_dirs.append(python_lib)

    cppfilename = os.path.join(location, 'mod.cu')
    with open(cppfilename, 'w') as cppfile:
        _logger.debug('Writing module C++ code to %s', cppfilename)
        cppfile.write(src_code)

    lib_filename = os.path.join(location, '%s.%s' %
                                (module_name, get_lib_extension()))

    _logger.debug('Generating shared lib %s', lib_filename)
    # TODO: Why do these args cause failure on gtx285 that has 1.3
    # compute capability? '--gpu-architecture=compute_13',
    # '--gpu-code=compute_13',

    # preargs1 collects what nvcc receives directly, preargs2 what is
    # forwarded to the host compiler through -Xcompiler.
    direct_prefixes = (
        '-O', '-arch=', '-ccbin=', '-G', '-g', '-I', '-L',
        '--fmad', '--ftz', '--maxrregcount',
        '--prec-div', '--prec-sqrt', '--use_fast_math',
        '-fmad', '-ftz', '-maxrregcount',
        '-prec-div', '-prec-sqrt', '-use_fast_math',
        '--use-local-env', '--cl-version=',
    )
    preargs1 = []
    preargs2 = []
    for pa in preargs:
        if pa.startswith('-Wl,'):
            # the -rpath option is not understood by the Microsoft linker,
            # so such linker arguments are dropped on Windows
            if sys.platform != 'win32' or not pa.startswith('-Wl,-rpath'):
                preargs1.append('-Xlinker')
                preargs1.append(pa[4:])
            continue
        if pa.startswith(direct_prefixes):
            preargs1.append(pa)
        else:
            preargs2.append(pa)

    # Don't put -G by default, as it slow things down.
    # We aren't sure if -g slow things down, so we don't put it by default.
    cmd = [nvcc_path, '-shared'] + preargs1
    if config.nvcc.compiler_bindir:
        cmd.extend(['--compiler-bindir', config.nvcc.compiler_bindir])

    if sys.platform == 'win32':
        # add flags for Microsoft compiler to create .pdb files
        preargs2.extend(['/Zi', '/MD'])
        cmd.extend(['-Xlinker', '/DEBUG'])
        # remove the complaints for the duplication of `double round(double)`
        # in both math_functions.h and pymath.h,
        # by not including the one in pymath.h
        cmd.extend(['-D HAVE_ROUND'])
    elif hide_symbols:
        preargs2.append('-fvisibility=hidden')

    cmd.append('-m64' if local_bitwidth() == 64 else '-m32')

    if preargs2:
        cmd.extend(['-Xcompiler', ','.join(preargs2)])

    # We should not use rpath if possible. If the user provided
    # a cuda.root flag, we need to add one, but
    # otherwise, we don't add it. See gh-1540 and
    # https://wiki.debian.org/RpathIssue for details.
    if (not type(config.cuda).root.is_default and
            os.path.exists(os.path.join(config.cuda.root, 'lib'))):
        rpaths.append(os.path.join(config.cuda.root, 'lib'))
        if sys.platform != 'darwin':
            # the CUDA libs are universal (contain both 32-bit and 64-bit)
            rpaths.append(os.path.join(config.cuda.root, 'lib64'))
    if sys.platform != 'win32':
        # the -rpath option is not understood by the Microsoft linker
        for rpath in rpaths:
            cmd.extend(['-Xlinker', ','.join(['-rpath', rpath])])
    cmd.extend('-I%s' % idir for idir in include_dirs)
    cmd.extend(['-o', lib_filename])
    # Only the file name is passed: nvcc is run from inside `location`.
    cmd.append(os.path.split(cppfilename)[-1])
    cmd.extend(['-L%s' % ldir for ldir in lib_dirs])
    cmd.extend(['-l%s' % lib for lib in libs])
    if sys.platform == 'darwin':
        # This tells the compiler to use the already-loaded python
        # symbols (which should always be the right ones).
        cmd.extend(['-Xcompiler', '-undefined,dynamic_lookup'])

    # Remove "-u Symbol" arguments, since they are usually not
    # relevant for the new compilation, even if they were used for
    # compiling python.  If they are necessary, the nvcc syntax is
    # "-U Symbol" with a capital U.
    # Deleting a slice is safe even when '-u' is the last element and
    # therefore has no argument following it.
    while '-u' in cmd:
        position = cmd.index('-u')
        del cmd[position:position + 2]

    # CUDA Toolkit v4.1 Known Issues:
    # Host linker on Mac OS 10.7 (and 10.6 for me) passes -no_pie option
    # to nvcc this option is not recognized and generates an error
    # http://stackoverflow.com/questions/9327265/nvcc-unknown-option-no-pie
    # Passing -Xlinker -pie stops -no_pie from getting passed
    # NOTE(review): if nvcc_version is a string this comparison is
    # lexicographic (e.g. '10.0' < '4.1') -- confirm the intended
    # behaviour for two-digit major versions before changing it.
    if sys.platform == 'darwin' and nvcc_version >= '4.1':
        cmd.extend(['-Xlinker', '-pie'])

    # cmd.append("--ptxas-options=-v") #uncomment this to see
    # register and shared-mem requirements
    _logger.debug('Running cmd %s', ' '.join(cmd))
    orig_dir = os.getcwd()
    try:
        os.chdir(location)
        p = subprocess.Popen(
            cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        nvcc_stdout, nvcc_stderr = decode_iter(p.communicate()[:2])
    finally:
        os.chdir(orig_dir)

    # Log the compiler diagnostics, skipping a few known-noisy messages.
    for eline in nvcc_stderr.split('\n'):
        if not eline:
            continue
        if 'skipping incompatible' in eline:
            # ld is skipping an incompatible library
            continue
        if 'declared but never referenced' in eline:
            continue
        if 'statement is unreachable' in eline:
            continue
        _logger.info("NVCC: %s", eline)

    if p.returncode:
        # Dump the numbered source so that compiler messages can be
        # matched to lines.
        for lineno, src_line in enumerate(src_code.split('\n')):
            print(lineno + 1, src_line, file=sys.stderr)
        print('===============================', file=sys.stderr)
        # filter the output from the compiler
        for err_line in nvcc_stderr.split('\n'):
            if not err_line:
                continue
            # filter out the annoying declaration warnings
            colon = err_line.find(':')
            if colon >= 0 and err_line[colon:].startswith(
                    (': warning: variable', ': warning: label')):
                continue
            print(err_line, file=sys.stderr)
        print(nvcc_stdout)
        print(cmd)
        raise Exception('nvcc return status', p.returncode,
                        'for cmd', ' '.join(cmd))
    elif config.cmodule.compilation_warning and nvcc_stdout:
        print(nvcc_stdout)

    if nvcc_stdout:
        # this doesn't happen to my knowledge
        print("DEBUG: nvcc STDOUT", nvcc_stdout, file=sys.stderr)

    if py_module:
        # touch the __init__ file
        with open(os.path.join(location, "__init__.py"), 'w'):
            pass
        return dlimport(lib_filename)
def matrixify(vector, n):
    """
    Repeat the left-padded `vector` `n` times along axis 0.

    `vector` is first passed through ``T.shape_padleft`` and the result is
    handed to ``T.repeat`` with ``axis=0``. When the local bit width is
    not 64, `n` is cast to ``'int32'`` before being used as the repeat
    count.
    """
    padded = T.shape_padleft(vector)
    # Cast n to int32 if necessary to prevent error on 32 bit systems
    if configdefaults.local_bitwidth() == 64:
        repeats = n
    else:
        repeats = T.cast(n, 'int32')
    return T.repeat(padded, repeats, axis=0)