def LlvmBytecodeIterator(
    base_path: pathlib.Path, source_name: str
) -> typing.Iterable[ml4pl_pb2.LlvmBytecode]:
    """Yield an LlvmBytecode proto for each ".ll" file below base_path.

    Only the immediate subdirectories of base_path are scanned (no deeper
    recursion), and any subdirectory whose name ends in "_preprocessed" is
    skipped.

    Args:
      base_path: The root directory containing IR codes.
      source_name: The name of the source which is attributed to bytecodes.

    Returns:
      An iterator of LlvmBytecode protos. Every proto is tagged lang="cpp" with
      empty cflags, a zero clang_returncode and an empty error_message.
    """
    for subdir in base_path.iterdir():
        # Ignore plain files and the "*_preprocessed" sibling directories.
        if not subdir.is_dir() or subdir.name.endswith("_preprocessed"):
            continue

        for candidate in subdir.iterdir():
            if not candidate.name.endswith(".ll"):
                continue

            # The proto records the file location relative to base_path.
            relpath = os.path.relpath(candidate, base_path)
            app.Log(1, "Read %s:%s", source_name, relpath)
            yield ml4pl_pb2.LlvmBytecode(
                source_name=source_name,
                relpath=relpath,
                lang="cpp",
                cflags="",
                bytecode=fs.Read(candidate),
                clang_returncode=0,
                error_message="",
            )
def GetBytecodesFromContentFiles(
    source_name: str,
    language: str,
    content_files: typing.List[typing.Tuple[int, str]],
) -> typing.List[ml4pl_pb2.LlvmBytecode]:
    """Compile content files to LLVM bytecode protos.

    "swift" and "haskell" are delegated to their dedicated helpers. Every other
    language is compiled with clang, reading the source text on stdin and
    emitting textual LLVM IR on stdout, using the per-language arguments from
    LANGUAGE_TO_CLANG_ARGS.

    Args:
      source_name: The name of the content file database. This is the same
        across all content files.
      language: The source code language. This is the same across all content
        files.
      content_files: A list of <id,text> tuples, where each tuple is the ID and
        text of a row in the content file database.

    Returns:
      A list of zero or more LlvmBytecode protos, one for each content file
      which was successfully processed. Content files for which clang returns
      a non-zero exit status are silently omitted.
    """
    # Languages that clang cannot handle are dispatched to their own helpers.
    if language == "swift":
        return GetSwiftBytecodesFromContentFiles(source_name, content_files)
    if language == "haskell":
        return GetHaskellBytecodesFromContentFiles(source_name, content_files)

    compiled = []
    # "-" as the input and as the "-o" target selects stdin and stdout.
    command = LANGUAGE_TO_CLANG_ARGS[language] + [
        "-S",
        "-emit-llvm",
        "-",
        "-o",
        "-",
    ]

    for row_id, source_text in content_files:
        result = clang.Exec(command, stdin=source_text)
        if not result.returncode:
            compiled.append(
                ml4pl_pb2.LlvmBytecode(
                    source_name=source_name,
                    relpath=str(row_id),
                    lang=language,
                    cflags=" ".join(command),
                    bytecode=result.stdout,
                    clang_returncode=0,
                    error_message="",
                )
            )

    return compiled
def GetHaskellBytecodesFromContentFiles(
    source_name: str, content_files: typing.List[typing.Tuple[int, str]]
) -> typing.List[ml4pl_pb2.LlvmBytecode]:
    """Extract LLVM bytecodes from haskell contentfiles.

    The process is haskell -> LLVM bytecode.

    This requires the glasgow haskell compiler and LLVM backend, install them on
    Ubuntu 16.04 using:

      $ sudo apt-get install ghc llvm-3.5

    Args:
      source_name: The name of the content file database, recorded on every
        proto.
      content_files: A list of <id,text> tuples, one per content file.

    Returns:
      A list of LlvmBytecode protos, one for each content file that ghc
      compiled successfully and for which an .ll file was produced. Failed
      content files are silently omitted.
    """
    protos = []
    with tempfile.TemporaryDirectory(prefix="phd_import_haskell_") as d:
        # NOTE(review): fs.chdir() is assumed to yield the directory as a
        # pathlib.Path, since the result is used with "/" below - confirm.
        with fs.chdir(d) as d:
            # The same file names are reused for every content file.
            haskell_file = d / "file.hs"
            ll_file = d / "file.ll"
            for content_file_id, text in content_files:
                # Remove the output of any previous iteration. Without this,
                # the is_file() check below could pass on a stale file.ll and
                # attribute another content file's bytecode to this ID.
                if ll_file.is_file():
                    ll_file.unlink()

                fs.Write(haskell_file, text.encode("utf-8"))
                # ghc is given a relative file name, so it relies on the
                # working directory set by fs.chdir() above.
                ghc = subprocess.Popen(
                    [
                        "ghc",
                        "-fllvm",
                        "-keep-llvm-files",
                        "-fforce-recomp",
                        haskell_file.name,
                    ],
                    stderr=subprocess.DEVNULL,
                )
                ghc.communicate()
                if ghc.returncode:
                    continue
                if not ll_file.is_file():
                    continue
                protos.append(
                    ml4pl_pb2.LlvmBytecode(
                        source_name=source_name,
                        relpath=str(content_file_id),
                        lang="haskell",
                        # Was "-fllm", which is not the flag passed to ghc.
                        cflags="ghc -fllvm -keep-llvm-files",
                        bytecode=fs.Read(ll_file),
                        clang_returncode=0,
                        error_message="",
                    )
                )
    return protos
def GetSwiftBytecodesFromContentFiles(
    source_name: str, content_files: typing.List[typing.Tuple[int, str]]
) -> typing.List[ml4pl_pb2.LlvmBytecode]:
    """Extract LLVM bytecodes from swift contentfiles.

    The process is swift -> LLVM bitcode, clang -> LLVM bytecode.

    This requires that the `swift` binary is in the system path.

    Args:
      source_name: The name of the content file database, recorded on every
        proto.
      content_files: A list of <id,text> tuples, one per content file.

    Returns:
      A list of LlvmBytecode protos, one for each content file for which both
      the swift step and the clang step succeeded. Failed content files are
      silently omitted.
    """
    protos = []
    with tempfile.TemporaryDirectory(prefix="phd_import_swift_") as d:
        # NOTE(review): fs.chdir() is assumed to yield the directory as a
        # pathlib.Path, since the result is used with "/" below - confirm.
        with fs.chdir(d) as d:
            # The same file names are reused for every content file.
            swift_file = d / "file.swift"
            bc_file = d / "file.bc"
            for content_file_id, text in content_files:
                # Remove the output of any previous iteration. Without this,
                # the is_file() check below could pass on a stale file.bc and
                # attribute another content file's bitcode to this ID.
                if bc_file.is_file():
                    bc_file.unlink()

                fs.Write(swift_file, text.encode("utf-8"))
                # swift is given a relative file name, so it relies on the
                # working directory set by fs.chdir() above.
                swift = subprocess.Popen(
                    ["swift", "-Xfrontend", "-emit-bc", swift_file.name],
                    stderr=subprocess.DEVNULL,
                )
                swift.communicate()
                if swift.returncode:
                    continue
                if not bc_file.is_file():
                    continue

                # Second stage: have clang turn the bitcode file into LLVM
                # assembly written to stdout.
                process = clang.Exec(
                    ["-S", "-emit-llvm", str(bc_file), "-o", "-"]
                )
                if process.returncode:
                    continue

                protos.append(
                    ml4pl_pb2.LlvmBytecode(
                        source_name=source_name,
                        relpath=str(content_file_id),
                        lang="swift",
                        cflags="",
                        bytecode=process.stdout,
                        clang_returncode=0,
                        error_message="",
                    )
                )
    return protos
def ProcessLinuxSrcToBytecode(path: pathlib.Path) -> ml4pl_pb2.LlvmBytecode:
    """Compile one Linux source file to an LlvmBytecode proto.

    Unlike the content-file helpers, a failed compilation still produces a
    proto: the bytecode is empty and the clang return code, command and stderr
    are recorded instead.

    Args:
      path: Path of the source file to compile. It is assumed to lie inside
        the dataset's source tree root - see the relpath note below.

    Returns:
      An LlvmBytecode proto with source_name "linux-<version>" and lang "C".
    """
    # Build the dataset handle once and read both attributes from it, rather
    # than constructing it separately for each attribute.
    dataset = LinuxSourcesDataset()
    src_root = dataset.src_tree_root
    version = dataset.version

    try:
        bytecode, cflags = BytecodeFromLinuxSrc(path, "-O0")
        clang_returncode = 0
        error_message = ""
    except clang.ClangException as e:
        # Keep the failure details so the unsuccessful attempt is recorded.
        bytecode = ""
        cflags = e.command
        clang_returncode = e.returncode
        error_message = e.stderr

    return ml4pl_pb2.LlvmBytecode(
        source_name=f"linux-{version}",
        # Strips the source root and one following character (presumably the
        # path separator). NOTE(review): this is a plain string slice and does
        # not check that path actually starts with src_root.
        relpath=str(path)[len(str(src_root)) + 1 :],
        lang="C",
        cflags=" ".join(cflags),
        bytecode=bytecode,
        clang_returncode=clang_returncode,
        error_message=error_message,
    )