def main():
  # Find the paths to import.
  paths = subprocess.check_output(
      ["find", str(FLAGS.directory), "-name", "*.ll"], universal_newlines=True
  )
  paths = [pathlib.Path(result) for result in paths.split("\n") if result]

  i = 0
  with sqlutil.BufferedDatabaseWriter(FLAGS.bytecode_db()) as writer:
    # Count from 1 so that the final log line reports the true number of
    # imported bytecodes.
    for i, path in enumerate(paths, start=1):
      bytecode = fs.Read(path)
      relpath = os.path.relpath(path, FLAGS.directory)
      app.Log(1, "%s:%s", FLAGS.source, relpath)
      writer.AddOne(
          bytecode_database.LlvmBytecode(
              source_name=FLAGS.source,
              relpath=relpath,
              language=FLAGS.language,
              cflags=FLAGS.cflags,
              charcount=len(bytecode),
              linecount=len(bytecode.split("\n")),
              bytecode=bytecode,
              clang_returncode=0,
              error_message="",
          )
      )
  app.Log(1, "Imported %s bytecodes", i)
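# Aside: shelling out to `find` works, but the same traversal can be done
# in-process with pathlib. A minimal sketch, not the module's code; semantics
# differ slightly (e.g. symlink handling):
import pathlib
import typing


def FindLlFiles(directory: pathlib.Path) -> typing.List[pathlib.Path]:
  """Recursively collect *.ll files under directory, sorted for determinism."""
  return sorted(directory.rglob("*.ll"))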
def ImportProtos(
    db: database.Database,
    bytecode_protos: typing.Iterable[ml4pl_pb2.LlvmBytecode],
) -> None:
  """Import bytecode protobufs to the database."""
  # Insert in fixed-size chunks so that each transaction stays bounded.
  for chunk in labtypes.Chunkify(bytecode_protos, 256):
    with db.Session(commit=True) as s:
      bytecodes = [
          database.LlvmBytecode(**database.LlvmBytecode.FromProto(proto))
          for proto in chunk
      ]
      s.add_all(bytecodes)
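# Aside: labtypes.Chunkify batches an iterable into fixed-size lists so that
# each transaction above stays bounded. A standalone equivalent for reference
# (hypothetical chunkify helper, not the project's implementation):
import itertools
import typing


def chunkify(
    iterable: typing.Iterable, chunk_size: int
) -> typing.Iterator[typing.List]:
  """Yield successive lists of up to chunk_size items from iterable."""
  it = iter(iterable)
  while True:
    chunk = list(itertools.islice(it, chunk_size))
    if not chunk:
      return
    yield chunk


# E.g. list(chunkify(range(5), 2)) == [[0, 1], [2, 3], [4]].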
def PopulateBytecodeTable(
    self, db: bytecode_database.Database, commit_every: int = 1000
):
  bar = progressbar.ProgressBar()
  bar.max_value = len(self.all_srcs)

  # Process each row of the table in parallel.
  pool = multiprocessing.Pool()
  with db.Session(commit=True) as s:
    for i, proto in enumerate(
        pool.imap_unordered(ProcessLinuxSrcToBytecode, self.all_srcs)
    ):
      bar.update(i)
      s.add(
          bytecode_database.LlvmBytecode(
              **bytecode_database.LlvmBytecode.FromProto(proto)
          )
      )
      # Commit periodically so that a crash does not lose all progress.
      if not (i % commit_every):
        s.commit()
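# Aside: the loop above relies on imap_unordered yielding results as soon as
# any worker finishes, rather than in input order, which keeps the writer busy
# when per-file costs vary. A minimal self-contained sketch of the pattern
# (Square stands in for ProcessLinuxSrcToBytecode; the list stands in for the
# database session):
import multiprocessing


def Square(x: int) -> int:
  """Stand-in for an expensive per-item worker."""
  return x * x


if __name__ == "__main__":
  commit_every = 3
  results = []
  with multiprocessing.Pool() as pool:
    # Results arrive in completion order, not submission order.
    for i, result in enumerate(pool.imap_unordered(Square, range(10))):
      results.append(result)
      if not (i % commit_every):
        pass  # A real implementation would commit the session here.
  assert sorted(results) == [x * x for x in range(10)]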
def ProcessBitcode(path: pathlib.Path) -> bytecode_database.LlvmBytecode:
  """Process a bitcode file and return the database bytecode representation."""
  with tempfile.TemporaryDirectory(prefix="phd_") as d:
    bytecode_path = pathlib.Path(d) / "bytecode.ll"
    # Disassemble the binary .bc file to textual LLVM-IR.
    p = llvm_dis.Exec([str(path), "-o", str(bytecode_path)])
    if p.returncode or not bytecode_path.is_file():
      raise OSError(f"llvm-dis '{path}' failed")
    bytecode = fs.Read(bytecode_path)

  return bytecode_database.LlvmBytecode(
      source_name="github.com/av-maramzin/SNU_NPB:NPB3.3-SER-C",
      relpath=AbsPathToRelpath(path),
      language="c",
      cflags=FLAGS.cflags,
      charcount=len(bytecode),
      linecount=len(bytecode.split("\n")),
      bytecode=bytecode,
      clang_returncode=0,
      error_message="",
  )
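# Aside: outside the project's llvm_dis wrapper, the disassembly step can be
# sketched with a plain subprocess call to the system llvm-dis binary
# (assuming LLVM is installed and llvm-dis is on PATH):
import pathlib
import subprocess
import tempfile


def DisassembleBitcode(path: pathlib.Path) -> str:
  """Disassemble an LLVM bitcode (.bc) file and return the textual IR."""
  with tempfile.TemporaryDirectory(prefix="phd_") as d:
    ll_path = pathlib.Path(d) / "bytecode.ll"
    # llvm-dis reads the binary .bc form and writes the textual .ll form.
    p = subprocess.run(["llvm-dis", str(path), "-o", str(ll_path)])
    if p.returncode or not ll_path.is_file():
      raise OSError(f"llvm-dis '{path}' failed")
    return ll_path.read_text()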
@decorators.memoized_property
def cfgs_df(self) -> pd.DataFrame:
  # Process each row of the table in parallel.
  pool = multiprocessing.Pool()
  rows = []
  for row_batch in pool.imap_unordered(ProcessLinuxSrc, self.kernel_srcs):
    if row_batch:
      rows += row_batch
  # Create the output table.
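# Aside: decorators.memoized_property caches cfgs_df so that the worker pool
# runs at most once per instance. The standard library offers the same
# behaviour via functools.cached_property (Python 3.8+); a minimal
# illustration with a hypothetical class:
import functools


class Example:
  @functools.cached_property
  def expensive(self) -> int:
    print("computed once")  # Printed only on first access.
    return 42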
def PopulateBytecodeTable(
    cf: contentfiles.ContentFiles,
    language: str,
    db: bytecode_database.Database,
    pool: typing.Optional[multiprocessing.Pool] = None,
):
  # Only one process at a time can run this method.
  mutex = lockfile.AutoLockFile(granularity="function")

  # We use the database URL as the name of the source.
  source_name = cf.url

  # Read source files from the contentfiles database, process them into
  # bytecodes, and, if successful, write them into the database. We process
  # files sorted by their numeric ID in the contentfiles database, so that if
  # the import is interrupted, it can resume from the last processed file.
  with db.Session() as s:
    # Get the ID of the last-processed bytecode file to resume from.
    resume_from = int(
        (
            s.query(bytecode_database.LlvmBytecode.relpath)
            .filter(bytecode_database.LlvmBytecode.source_name == cf.url)
            .filter(bytecode_database.LlvmBytecode.language == language)
            # Note the cast to integer: relpath is a string column, sorting by
            # it in its native type would sort lexicographically
            # (e.g. '9' > '10').
            .order_by(
                sql.cast(
                    bytecode_database.LlvmBytecode.relpath, sql.Integer
                ).desc()
            )
            .limit(1)
            .first()
            or (0,)
        )[0]
    )

  with mutex, cf.Session() as cf_s, sqlutil.BufferedDatabaseWriter(
      db, max_buffer_length=10
  ) as writer:
    # Get the ID of the last contentfile to process.
    n = (
        cf_s.query(contentfiles.ContentFile.id)
        .join(contentfiles.GitHubRepository)
        .filter(contentfiles.GitHubRepository.language == language)
        .order_by(contentfiles.ContentFile.id.desc())
        .limit(1)
        .one_or_none()
        or (0,)
    )[0]
    app.Log(
        1,
        "Starting at row %s / %s",
        humanize.Commas(resume_from),
        humanize.Commas(n),
    )

    # A query to return the <id,text> tuples of files to process.
    q = (
        cf_s.query(contentfiles.ContentFile.id, contentfiles.ContentFile.text)
        .filter(contentfiles.ContentFile.id > resume_from)
        .join(contentfiles.GitHubRepository)
        .filter(contentfiles.GitHubRepository.language == language)
        .order_by(contentfiles.ContentFile.id)
    )

    row_batches = sqlutil.OffsetLimitBatchedQuery(
        q, batch_size=FLAGS.batch_size
    )

    for i, batch in zip(range(resume_from, n + 1), row_batches):
      app.Log(
          1,
          "Processing batch of %d contentfiles -> bytecodes, %s / %s (%.1f%%)",
          FLAGS.batch_size,
          humanize.Commas(i),
          humanize.Commas(n),
          (i / n) * 100,
      )
      protos = GetBytecodesFromContentFiles(source_name, language, batch.rows)
      writer.AddMany(
          [
              bytecode_database.LlvmBytecode(
                  **bytecode_database.LlvmBytecode.FromProto(proto)
              )
              for proto in protos
          ]
      )
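# Aside: the resume query's integer cast matters because relpath is a string
# column, so its native ordering is lexicographic. A self-contained SQLAlchemy
# illustration with a hypothetical table (not the project's schema):
import sqlalchemy as sql
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker

Base = declarative_base()


class Row(Base):
  __tablename__ = "rows"
  id = sql.Column(sql.Integer, primary_key=True)
  relpath = sql.Column(sql.String(64))  # Numeric IDs stored as strings.


engine = sql.create_engine("sqlite://")
Base.metadata.create_all(engine)
s = sessionmaker(bind=engine)()
s.add_all([Row(relpath=p) for p in ["9", "10", "100"]])
s.commit()

# Lexicographic descending order puts '9' first; the integer cast restores
# numeric order, putting '100' first.
lex = [r.relpath for r in s.query(Row).order_by(Row.relpath.desc())]
num = [
    r.relpath
    for r in s.query(Row).order_by(sql.cast(Row.relpath, sql.Integer).desc())
]
assert lex == ["9", "100", "10"]
assert num == ["100", "10", "9"]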