def LlvmBytecodeIterator(
        base_path: pathlib.Path,
        source_name: str) -> typing.Iterable[ml4pl_pb2.LlvmBytecode]:
    """Yield one LlvmBytecode proto per `.ll` file found under base_path.

    Only the immediate subdirectories of base_path are visited, and any
    subdirectory whose name ends in "_preprocessed" is skipped. Inside each
    visited subdirectory, every entry whose name ends in ".ll" is read and
    wrapped in a proto. Nested directories are not descended into.

    Args:
      base_path: The root directory containing IR codes.
      source_name: The name of the source which is attributed to bytecodes.

    Returns:
      An iterator of LlvmBytecode protos. Each proto carries the file path
      relative to base_path, lang "cpp", empty cflags, a zero clang return
      code and an empty error message.
    """
    for subdir in base_path.iterdir():
        # Guard clause: ignore plain files and "*_preprocessed" directories.
        if not subdir.is_dir() or subdir.name.endswith("_preprocessed"):
            continue

        for candidate in subdir.iterdir():
            if not candidate.name.endswith(".ll"):
                continue

            relpath = os.path.relpath(candidate, base_path)
            app.Log(1, "Read %s:%s", source_name, relpath)
            yield ml4pl_pb2.LlvmBytecode(
                source_name=source_name,
                relpath=relpath,
                lang="cpp",
                cflags="",
                bytecode=fs.Read(candidate),
                clang_returncode=0,
                error_message="",
            )
Beispiel #2
0
def GetBytecodesFromContentFiles(
  source_name: str,
  language: str,
  content_files: typing.List[typing.Tuple[int, str]],
) -> typing.List[ml4pl_pb2.LlvmBytecode]:
  """Compile content files to LLVM bytecode protos.

  Swift and Haskell inputs are handed off to their dedicated helpers. Every
  other language is compiled by piping the text through clang with the
  language-specific arguments from LANGUAGE_TO_CLANG_ARGS.

  Args:
    source_name: The name of the content file database. This is the same across
      all content files.
    language: The source code language. This is the same across all content
      files.
    content_files: A list of <id,text> tuples, where each tuple is the ID and
      text of a row in the content file database.

  Returns:
    A list of zero or more LlvmBytecode protos, one for each contentfile which
    was successfully processed. Content files for which clang returns a
    non-zero exit status are silently dropped.
  """
  # Languages that do not go through clang directly.
  if language == "swift":
    return GetSwiftBytecodesFromContentFiles(source_name, content_files)
  if language == "haskell":
    return GetHaskellBytecodesFromContentFiles(source_name, content_files)

  # Read the source from stdin ("-") and write textual IR to stdout ("-o -").
  io_args = ["-S", "-emit-llvm", "-", "-o", "-"]
  compile_args = LANGUAGE_TO_CLANG_ARGS[language] + io_args

  results = []
  for file_id, source_text in content_files:
    completed = clang.Exec(compile_args, stdin=source_text)
    if not completed.returncode:
      proto = ml4pl_pb2.LlvmBytecode(
        source_name=source_name,
        relpath=str(file_id),
        lang=language,
        cflags=" ".join(compile_args),
        bytecode=completed.stdout,
        clang_returncode=0,
        error_message="",
      )
      results.append(proto)

  return results
Beispiel #3
0
def GetHaskellBytecodesFromContentFiles(
  source_name: str, content_files: typing.List[typing.Tuple[int, str]]
) -> typing.List[ml4pl_pb2.LlvmBytecode]:
  """Extract LLVM bytecodes from haskell contentfiles.

  The process is haskell -> LLVM bytecode.

  This requires the glasgow haskell compiler and LLVM backend, install them on
  Ubuntu 16.04 using:

    $ sudo apt-get install ghc llvm-3.5

  Args:
    source_name: The name of the content file database. This is the same across
      all content files.
    content_files: A list of <id,text> tuples, where each tuple is the ID and
      text of a row in the content file database.

  Returns:
    A list of zero or more LlvmBytecode protos, one for each contentfile that
    ghc compiled successfully and for which an LLVM file was produced.
  """
  protos = []

  with tempfile.TemporaryDirectory(prefix="phd_import_haskell_") as tmpdir:
    with fs.chdir(tmpdir) as workdir:
      # The same scratch file names are reused for every content file.
      haskell_file = workdir / "file.hs"
      ll_file = workdir / "file.ll"

      for content_file_id, text in content_files:
        # Remove the output of any previous iteration so that a stale file.ll
        # can never be attributed to this content file.
        if ll_file.is_file():
          ll_file.unlink()

        fs.Write(haskell_file, text.encode("utf-8"))
        # ghc is invoked with a relative file name; this relies on the working
        # directory having been switched to the scratch directory above.
        ghc = subprocess.run(
          [
            "ghc",
            "-fllvm",
            "-keep-llvm-files",
            "-fforce-recomp",
            haskell_file.name,
          ],
          stderr=subprocess.DEVNULL,
          check=False,
        )
        # Skip inputs that failed to compile or produced no LLVM file.
        if ghc.returncode or not ll_file.is_file():
          continue

        protos.append(
          ml4pl_pb2.LlvmBytecode(
            source_name=source_name,
            relpath=str(content_file_id),
            lang="haskell",
            # Previously recorded as "-fllm", which is not the flag that is
            # actually passed to ghc above.
            cflags="ghc -fllvm -keep-llvm-files",
            bytecode=fs.Read(ll_file),
            clang_returncode=0,
            error_message="",
          )
        )

  return protos
Beispiel #4
0
def GetSwiftBytecodesFromContentFiles(
  source_name: str, content_files: typing.List[typing.Tuple[int, str]]
) -> typing.List[ml4pl_pb2.LlvmBytecode]:
  """Extract LLVM bytecodes from swift contentfiles.

  The process is swift -> LLVM bitcode, clang -> LLVM bytecode.

  This requires that the `swift` binary is in the system path.

  Args:
    source_name: The name of the content file database. This is the same across
      all content files.
    content_files: A list of <id,text> tuples, where each tuple is the ID and
      text of a row in the content file database.

  Returns:
    A list of zero or more LlvmBytecode protos, one for each contentfile for
    which both the swift step and the clang step succeeded.
  """
  protos = []

  with tempfile.TemporaryDirectory(prefix="phd_import_swift_") as tmpdir:
    with fs.chdir(tmpdir) as workdir:
      # The same scratch file names are reused for every content file.
      swift_file = workdir / "file.swift"
      bc_file = workdir / "file.bc"

      for content_file_id, text in content_files:
        # Remove the output of any previous iteration so that a stale file.bc
        # can never be attributed to this content file.
        if bc_file.is_file():
          bc_file.unlink()

        fs.Write(swift_file, text.encode("utf-8"))
        # swift is invoked with a relative file name; this relies on the
        # working directory having been switched to the scratch directory.
        swift = subprocess.run(
          ["swift", "-Xfrontend", "-emit-bc", swift_file.name],
          stderr=subprocess.DEVNULL,
          check=False,
        )
        # Skip inputs that failed to compile or produced no bitcode file.
        if swift.returncode or not bc_file.is_file():
          continue

        # Second stage: turn the bitcode into textual LLVM IR on stdout.
        process = clang.Exec(["-S", "-emit-llvm", str(bc_file), "-o", "-"])
        if process.returncode:
          continue

        protos.append(
          ml4pl_pb2.LlvmBytecode(
            source_name=source_name,
            relpath=str(content_file_id),
            lang="swift",
            cflags="",
            bytecode=process.stdout,
            clang_returncode=0,
            error_message="",
          )
        )

  return protos
Beispiel #5
0
def ProcessLinuxSrcToBytecode(path: pathlib.Path) -> ml4pl_pb2.LlvmBytecode:
  """Compile one Linux source file and wrap the outcome in a proto.

  The file is compiled with "-O0" via BytecodeFromLinuxSrc. A clang failure
  does not propagate: instead the returned proto has an empty bytecode and
  carries the failing command, return code and stderr of the exception.

  Args:
    path: Path of the source file. The relpath stored in the proto is this
      path with the leading source tree root and one separator character
      stripped, so the path is presumably expected to live under the source
      tree root of LinuxSourcesDataset.

  Returns:
    An LlvmBytecode proto describing either the successful compilation or the
    clang error.
  """
  tree_root = LinuxSourcesDataset().src_tree_root
  kernel_version = LinuxSourcesDataset().version

  try:
    bytecode, cflags = BytecodeFromLinuxSrc(path, "-O0")
  except clang.ClangException as error:
    # Record the failure details rather than raising.
    bytecode, cflags = "", error.command
    returncode, stderr = error.returncode, error.stderr
  else:
    returncode, stderr = 0, ""

  # Drop the "<tree_root>/" prefix by length.
  prefix_length = len(str(tree_root)) + 1
  relative_path = str(path)[prefix_length:]

  return ml4pl_pb2.LlvmBytecode(
    source_name=f"linux-{kernel_version}",
    relpath=relative_path,
    lang="C",
    cflags=" ".join(cflags),
    bytecode=bytecode,
    clang_returncode=returncode,
    error_message=stderr,
  )