Code Example #1
def main():
  # Find the paths to import.
  paths = subprocess.check_output(
    ["find", str(FLAGS.directory), "-name", "*.ll"], universal_newlines=True
  )
  paths = [pathlib.Path(result) for result in paths.split("\n") if result]

  i = 0
  with sqlutil.BufferedDatabaseWriter(FLAGS.bytecode_db()) as writer:
    # Count from one so the final log reports the number of files imported.
    for i, path in enumerate(paths, start=1):
      bytecode = fs.Read(path)
      relpath = os.path.relpath(path, FLAGS.directory)
      app.Log(1, "%s:%s", FLAGS.source, relpath)
      writer.AddOne(
        bytecode_database.LlvmBytecode(
          source_name=FLAGS.source,
          relpath=relpath,
          language=FLAGS.language,
          cflags=FLAGS.cflags,
          charcount=len(bytecode),
          linecount=len(bytecode.split("\n")),
          bytecode=bytecode,
          clang_returncode=0,
          error_message="",
        )
      )

  app.Log(1, "Imported %s bytecodes", i)
Code Example #2
def ImportProtos(
    db: database.Database,
    bytecode_protos: typing.Iterable[ml4pl_pb2.LlvmBytecode],
) -> None:
    """Import bytecode protobufs to the database."""
    # Commit in fixed-size chunks to bound memory use and transaction size.
    for chunk in labtypes.Chunkify(bytecode_protos, 256):
        with db.Session(commit=True) as s:
            bytecodes = [
                database.LlvmBytecode(**database.LlvmBytecode.FromProto(proto))
                for proto in chunk
            ]
            s.add_all(bytecodes)
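
labtypes.Chunkify is a project helper that is not shown in this listing. A minimal sketch of the behavior the loop above assumes (yield fixed-size lists from an iterable, with a shorter final chunk):

import typing

def Chunkify(
    iterable: typing.Iterable, chunk_size: int
) -> typing.Iterator[typing.List]:
  """Assumed behavior: yield successive chunk_size-sized lists."""
  chunk = []
  for item in iterable:
    chunk.append(item)
    if len(chunk) == chunk_size:
      yield chunk
      chunk = []
  if chunk:  # Trailing partial chunk.
    yield chunk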
Code Example #3
def ProcessBitcode(path: pathlib.Path) -> bytecode_database.LlvmBytecode:
    """Process a bitecode file and return the database bytecode representation."""
    with tempfile.TemporaryDirectory(prefix="phd_") as d:
        bytecode_path = pathlib.Path(d) / "bytecode.ll"
        p = llvm_dis.Exec([str(path), "-o", str(bytecode_path)])
        if p.returncode or not bytecode_path.is_file():
            raise OSError(f"llvm-dis '{path}' failed")

        bytecode = fs.Read(bytecode_path)

    return bytecode_database.LlvmBytecode(
        source_name="github.com/av-maramzin/SNU_NPB:NPB3.3-SER-C",
        relpath=AbsPathToRelpath(path),
        language="c",
        cflags=FLAGS.cflags,
        charcount=len(bytecode),
        linecount=len(bytecode.split("\n")),
        bytecode=bytecode,
        clang_returncode=0,
        error_message="",
    )
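
llvm_dis.Exec is a project wrapper around the llvm-dis binary. Assuming it simply shells out, the disassembly step above is roughly equivalent to this plain-subprocess call (llvm-dis converts a .bc bitcode file into textual LLVM-IR):

import subprocess

# Hypothetical stand-in for llvm_dis.Exec: disassemble the bitcode file at
# `path` into textual LLVM-IR written to `bytecode_path`.
p = subprocess.run(
    ["llvm-dis", str(path), "-o", str(bytecode_path)],
    capture_output=True,
    text=True,
)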
Code Example #4
  def PopulateBytecodeTable(
    self, db: bytecode_database.Database, commit_every: int = 1000
  ):
    bar = progressbar.ProgressBar()
    bar.max_value = len(self.all_srcs)

    # Process each row of the table in parallel.
    pool = multiprocessing.Pool()
    with db.Session(commit=True) as s:
      for i, proto in enumerate(
        pool.imap_unordered(ProcessLinuxSrcToBytecode, self.all_srcs)
      ):
        bar.update(i)
        s.add(
          bytecode_database.LlvmBytecode(
            **bytecode_database.LlvmBytecode.FromProto(proto)
          )
        )
        # Commit periodically so a crash loses at most commit_every additions.
        if i and not (i % commit_every):
          s.commit()

  @decorators.memoized_property
  def cfgs_df(self) -> pd.DataFrame:
    # Process each row of the table in parallel.
    pool = multiprocessing.Pool()
    rows = []
    for row_batch in pool.imap_unordered(ProcessLinuxSrc, self.kernel_srcs):
      if row_batch:
        rows += row_batch

    # Create the output table.
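
decorators.memoized_property is another helper this listing does not define. A minimal sketch of the assumed semantics (compute the value on first access, then cache it on the instance):

import functools

def memoized_property(fn):
  """Assumed behavior: a read-only property cached per instance."""
  attr_name = "_memoized_" + fn.__name__

  @property
  @functools.wraps(fn)
  def wrapper(self):
    if not hasattr(self, attr_name):
      setattr(self, attr_name, fn(self))
    return getattr(self, attr_name)

  return wrapper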
Code Example #5
def PopulateBytecodeTable(
  cf: contentfiles.ContentFiles,
  language: str,
  db: bytecode_database.Database,
  # multiprocessing.Pool is a factory function, not a type; the process-pool
  # class for annotations is multiprocessing.pool.Pool.
  pool: typing.Optional[multiprocessing.pool.Pool] = None,
):
  # Only one process at a time can run this method.
  mutex = lockfile.AutoLockFile(granularity="function")

  # We use the database URL as the name of the source.
  source_name = cf.url

  # Read source files from the contentfiles database, process them into
  # bytecodes, and, if successful, write them into the database. We process
  # files sorted by their numeric ID in the contentfiles database, so that if
  # the job is interrupted it can resume from the last-processed file.
  with db.Session() as s:
    # Get the ID of the last-processed bytecode file to resume from.
    resume_from = int(
      (
        s.query(bytecode_database.LlvmBytecode.relpath)
        .filter(bytecode_database.LlvmBytecode.source_name == cf.url)
        .filter(bytecode_database.LlvmBytecode.language == language)
        # Note the cast to integer: relpath is a string column, and sorting it
        # in its native type would compare lexicographically (e.g. '9' > '10').
        .order_by(
          sql.cast(bytecode_database.LlvmBytecode.relpath, sql.Integer).desc()
        )
        .limit(1)
        .first()
        or (0,)
      )[0]
    )

  with mutex, cf.Session() as cf_s, sqlutil.BufferedDatabaseWriter(
    db, max_buffer_length=10
  ) as writer:
    # Get the ID of the last contentfile to process.
    n = (
      cf_s.query(contentfiles.ContentFile.id)
      .join(contentfiles.GitHubRepository)
      .filter(contentfiles.GitHubRepository.language == language)
      .order_by(contentfiles.ContentFile.id.desc())
      .limit(1)
      .one_or_none()
      or (0,)
    )[0]
    app.Log(
      1,
      "Starting at row %s / %s",
      humanize.Commas(resume_from),
      humanize.Commas(n),
    )

    # A query to return the <id,text> tuples of files to process.
    q = (
      cf_s.query(contentfiles.ContentFile.id, contentfiles.ContentFile.text)
      .filter(contentfiles.ContentFile.id > resume_from)
      .join(contentfiles.GitHubRepository)
      .filter(contentfiles.GitHubRepository.language == language)
      .order_by(contentfiles.ContentFile.id)
    )

    row_batches = sqlutil.OffsetLimitBatchedQuery(
      q, batch_size=FLAGS.batch_size
    )

    for batch_num, batch in enumerate(row_batches):
      # Each batch advances batch_size rows past the resume point; use that
      # to approximate the current row position for progress reporting.
      i = resume_from + batch_num * FLAGS.batch_size
      app.Log(
        1,
        "Processing batch of %d contentfiles -> bytecodes, %s / %s (%.1f%%)",
        FLAGS.batch_size,
        humanize.Commas(i),
        humanize.Commas(n),
        (i / n) * 100,
      )
      protos = GetBytecodesFromContentFiles(source_name, language, batch.rows)
      writer.AddMany(
        [
          bytecode_database.LlvmBytecode(
            **bytecode_database.LlvmBytecode.FromProto(proto)
          )
          for proto in protos
        ]
      )
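
sqlutil.OffsetLimitBatchedQuery is likewise external to this listing. A rough sketch of the behavior the loop above relies on, assuming OFFSET/LIMIT paging over a SQLAlchemy query and a hypothetical QueryBatch container to match the batch.rows access:

import typing

class QueryBatch(typing.NamedTuple):
  """Hypothetical container matching the batch.rows access above."""
  rows: typing.List

def OffsetLimitBatchedQuery(query, batch_size: int):
  """Assumed behavior: yield query results in OFFSET/LIMIT batches."""
  offset = 0
  while True:
    rows = query.offset(offset).limit(batch_size).all()
    if not rows:
      break
    yield QueryBatch(rows=rows)
    offset += batch_size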