def _get_assertion(session: session_t, lines: Iterable[str]) -> Union[None, Assertion]:
    clang_assertion = False
    strip = False
    for line in lines:
        if "assertion" in line.lower():
            if strip:
                if line.startswith("cldrive-harness"):
                    msg = ":".join(line.split(":")[1:])
                else:
                    msg = line
                msg = re.sub(r"^ *:[0-9]+: ", "", msg)
                if "Assertion `(null)' failed." in msg:
                    msg = "Assertion `(null)' failed."
                elif "Assertion `' failed." in msg:
                    msg = "Assertion `' failed."
            elif clang_assertion:
                msg = ":".join(line.split(":")[3:])
            else:
                msg = line

            assertion = get_or_add(
                session, Assertion,
                sha1=crypto.sha1_str(msg), assertion=msg)
            return assertion
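# A minimal, self-contained sketch of the message normalization performed by
# _get_assertion() above. The sample stderr line below is fabricated for
# illustration (an assumed "cldrive-harness:<file>:<line>:" prefix format);
# the point is that stripping the harness name and line-number prefix lets
# identical assertion messages from different kernels hash to the same
# Assertion row.
import re

line = "cldrive-harness::14: Assertion `x < n' failed."
msg = ":".join(line.split(":")[1:])     # drop the "cldrive-harness" prefix
msg = re.sub(r"^ *:[0-9]+: ", "", msg)  # drop the ":<line>: " prefix
print(msg)  # -> Assertion `x < n' failed.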
def run(self) -> None:
    i = dbutil.num_rows_in(self.db_path, "ContentFiles")

    if not log.is_verbose():
        bar = progressbar.ProgressBar(max_value=self.max_i)
        bar.update(self.progress())

    try:
        while True:
            sample_time = time()
            sample = self.queue.get(timeout=60)

            kernels = clutil.get_cl_kernels(sample)
            ids = [crypto.sha1_str(k) for k in kernels]

            if self.sampler_opts["static_checker"]:
                preprocess_opts = {
                    "use_shim": False,
                    "use_gpuverify": self.sampler_opts["gpuverify"]
                }
                pp = [clgen.preprocess_for_db(k, **preprocess_opts)
                      for k in kernels]

            db = dbutil.connect(self.db_path)
            c = db.cursor()

            # insert raw samples
            for kid, src in zip(ids, kernels):
                dbutil.sql_insert_dict(c, "ContentFiles",
                                       {"id": kid, "contents": src},
                                       ignore_existing=True)

            # insert preprocessed samples
            if self.sampler_opts["static_checker"]:
                for kid, (status, src) in zip(ids, pp):
                    dbutil.sql_insert_dict(c, "PreprocessedFiles", {
                        "id": kid,
                        "status": status,
                        "contents": src
                    }, ignore_existing=True)

            c.close()
            db.commit()
            db.close()

            # update progress bar
            progress = self.progress()
            if not log.is_verbose():
                bar.update(progress)

            sample_time = time() - sample_time
            self.sampler.stats["progress"] = progress
            self.sampler.stats["time"] += sample_time
            self.sampler._flush_meta(self.cache)

            # determine if we are done sampling
            if self.term_condition():
                self.producer.stop()
                return
    finally:
        # always kill the sampler thread
        print()
        self.producer.stop()
def _get_unreachable(session: session_t, lines: Iterable[str]) -> Union[None, Unreachable]:
    for line in lines:
        if "unreachable" in line.lower():
            unreachable = get_or_add(
                session, Unreachable,
                sha1=crypto.sha1_str(line), unreachable=line)
            return unreachable
def __init__(self, generator: Generators.column_t, generation_time: float,
             src: str):
    self.generator = generator
    self.sha1 = crypto.sha1_str(src)
    self.date = datetime.datetime.utcnow()
    self.generation_time = generation_time
    self.linecount = len(src.split("\n"))
    self.charcount = len(src)
    self.src = src
def _hash(sampler_opts: dict, kernel_opts: dict) -> str:
    # we don't consider the number of samples in the ID
    sampler_opts = deepcopy(sampler_opts)
    del sampler_opts["min_samples"]
    del sampler_opts["min_kernels"]
    del sampler_opts["created"]

    checksum_data = sorted(
        [str(x) for x in sampler_opts.values()] +
        [str(x) for x in kernel_opts.values()])
    string = "".join([str(x) for x in checksum_data])
    return crypto.sha1_str(string)
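# A runnable sketch of the option-hashing scheme in _hash() above, with
# hashlib standing in for labm8's crypto.sha1_str(). Sorting the stringified
# values makes the checksum independent of dict ordering; the option names in
# the example call are invented. Unlike _hash(), this sketch tolerates absent
# volatile keys by using pop() with a default.
import hashlib
from copy import deepcopy

def options_checksum(sampler_opts: dict, kernel_opts: dict) -> str:
    sampler_opts = deepcopy(sampler_opts)
    for volatile in ("min_samples", "min_kernels", "created"):
        sampler_opts.pop(volatile, None)  # excluded from the ID
    values = sorted(str(x) for x in
                    list(sampler_opts.values()) + list(kernel_opts.values()))
    return hashlib.sha1("".join(values).encode("utf-8")).hexdigest()

print(options_checksum({"min_samples": 100, "seed": 204},
                       {"max_length": 5000}))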
def from_str(session: session_t, string: str) -> 'Stdout':
    """
    Instantiate a Stdout object.
    """
    # Strip the noise:
    string = Stdout._escape(string)

    stdout = get_or_add(
        session, Stdout,
        sha1=crypto.sha1_str(string), stdout=string)
    return stdout
def from_str(session: session_t, string: str) -> 'Stderr':
    string = Stderr._escape(string)
    sha1 = crypto.sha1_str(string)

    stderr = get_or_add(
        session, Stderr,
        sha1=sha1,
        linecount=len(string.split("\n")),
        charcount=len(string),
        truncated=len(string) > Stderr.max_chars,
        stderr=string[:Stderr.max_chars])
    return stderr
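# A minimal sketch of the truncation bookkeeping in Stderr.from_str() above.
# The 64,000-character limit is an assumed value for Stderr.max_chars: only
# the first max_chars characters are stored, the `truncated` flag records
# that the stored text is incomplete, and `charcount` keeps the true length.
max_chars = 64000  # assumption; stands in for Stderr.max_chars
string = "x" * 70000
record = {
    "linecount": len(string.split("\n")),
    "charcount": len(string),
    "truncated": len(string) > max_chars,
    "stderr": string[:max_chars],
}
assert record["truncated"] and len(record["stderr"]) == max_chars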
def cache(self, model: clgen.Model):
    """
    Return sampler cache.

    Parameters
    ----------
    model : clgen.Model
        CLgen model.

    Returns
    -------
    labm8.FSCache
        Cache.
    """
    sampler_model_hash = crypto.sha1_str(self.hash + model.hash)

    cache = clgen.mkcache("sampler", sampler_model_hash)

    # validate metadata against cache
    self.stats = {
        "time": 0,
        "progress": 0
    }
    meta = deepcopy(self.to_json())
    if cache.get("META"):
        cached_meta = jsonutil.read_file(cache["META"])
        if "stats" in cached_meta:
            self.stats = cached_meta["stats"]
            del cached_meta["stats"]
        if "created" in cached_meta["sampler"]:
            del cached_meta["sampler"]["created"]
            del meta["sampler"]["created"]
        if "min_samples" in cached_meta["sampler"]:
            del cached_meta["sampler"]["min_samples"]
            del meta["sampler"]["min_samples"]
        if "min_kernels" in cached_meta["sampler"]:
            del cached_meta["sampler"]["min_kernels"]
            del meta["sampler"]["min_kernels"]
        if meta != cached_meta:
            raise clgen.InternalError("sampler metadata mismatch")
    else:
        self._flush_meta(cache)

    return cache
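# A reduced sketch of the metadata validation in cache() above: volatile
# fields ("stats", "created" and the min_* sampling thresholds) are stripped
# from both sides before comparison, so only a change to the meaningful
# sampler options raises a mismatch. meta_matches() is a hypothetical helper,
# not part of the original class.
from copy import deepcopy

VOLATILE = ("created", "min_samples", "min_kernels")

def meta_matches(meta: dict, cached_meta: dict) -> bool:
    meta, cached_meta = deepcopy(meta), deepcopy(cached_meta)
    cached_meta.pop("stats", None)
    for key in VOLATILE:
        meta["sampler"].pop(key, None)
        cached_meta["sampler"].pop(key, None)
    return meta == cached_meta

assert meta_matches({"sampler": {"seed": 204, "created": 1}},
                    {"sampler": {"seed": 204}, "stats": {"time": 42}})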
def _get_stackdump(session: session_t, lines: Iterable[str]) -> Union[None, StackDump]:
    in_stackdump = False
    stackdump = []
    for line in lines:
        if in_stackdump:
            if line and line[0].isdigit():
                stackdump.append(line)
            else:
                stackdump_ = "\n".join(stackdump)
                stackdump = get_or_add(
                    session, StackDump,
                    sha1=crypto.sha1_str(stackdump_), stackdump=stackdump_)
                return stackdump
        elif "stack dump:" in line.lower():
            in_stackdump = True
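# An illustrative driver for the "Stack dump:" parsing in _get_stackdump()
# above, using fabricated LLVM-style stderr lines. Frames after the marker
# are collected while they begin with a digit; the first non-matching line
# (here, a blank) terminates the dump.
lines = [
    "clang: error: unable to execute command",
    "Stack dump:",
    "0.  Program arguments: clang -c kernel.cl",
    "1.  <eof> parser at end of file",
    "",
]
in_stackdump, frames = False, []
for line in lines:
    if in_stackdump:
        if line and line[0].isdigit():
            frames.append(line)
        else:
            break
    elif "stack dump:" in line.lower():
        in_stackdump = True
print("\n".join(frames))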
def run(self) -> None:
    i = dbutil.num_rows_in(self.db_path, "ContentFiles")

    if not log.is_verbose():
        bar = progressbar.ProgressBar(max_value=self.max_i)
        bar.update(self.progress())

    try:
        while True:
            sample_time = time()

            # Block while waiting for a new sample to come in:
            sample = self.queue.get(timeout=120).strip()

            # Compute the sample ID:
            kid = crypto.sha1_str(sample)

            # Add the new sample to the database:
            db = dbutil.connect(self.db_path)
            c = db.cursor()
            dbutil.sql_insert_dict(c, "ContentFiles",
                                   {"id": kid, "contents": sample},
                                   ignore_existing=True)
            c.close()
            db.commit()
            db.close()

            # update progress bar
            progress = self.progress()
            if not log.is_verbose():
                bar.update(progress)

            sample_time = time() - sample_time
            self.sampler.stats["progress"] = progress
            self.sampler.stats["time"] += sample_time
            self.sampler._flush_meta(self.cache)

            # determine if we are done sampling
            if self.term_condition():
                self.producer.stop()
                return
    finally:
        # always kill the sampler thread
        print()
        self.producer.stop()
def from_str(session: session_t, string: str) -> 'Stderr':
    # Strip the noise:
    string = Stderr._escape(string)

    # Get metadata:
    lines = string.split('\n')
    assertion = Stderr._get_assertion(session, lines)
    if assertion:
        unreachable = None
        stackdump = None
    else:
        unreachable = Stderr._get_unreachable(session, lines)
        if unreachable:
            stackdump = None
        else:
            stackdump = Stderr._get_stackdump(session, lines)
    session.flush()

    # Sanity check: at most one error type may be set.
    errs = sum(1 if x else 0 for x in [assertion, unreachable, stackdump])
    if errs > 1:
        logging.error("Stderr: " + string)
        if assertion:
            logging.error("Assertion: " + assertion.assertion)
        if unreachable:
            logging.error("Unreachable: " + unreachable.unreachable)
        if stackdump:
            logging.error("Stackdump: " + stackdump.stackdump)
        raise LookupError("Multiple error types found in stderr:\n\n" +
                          f"Assertion: {assertion}\n" +
                          f"Unreachable: {unreachable}\n" +
                          f"Stackdump: {stackdump}")

    stderr = get_or_add(
        session, Stderr,
        sha1=crypto.sha1_str(string),
        assertion=assertion,
        unreachable=unreachable,
        stackdump=stackdump,
        linecount=len(lines),
        charcount=len(string),
        truncated=len(string) > Stderr.max_chars,
        stderr=string[:Stderr.max_chars])
    return stderr
def import_clgen_sample(session: session_t, path: Path,
                        cl_launchable: bool = False,
                        harnesses: List[cldriveParams] = [],
                        delete: bool = False) -> None:
    src = fs.read_file(path)
    hash_ = crypto.sha1_str(src)

    dupe = session.query(CLgenProgram) \
        .filter(CLgenProgram.hash == hash_).first()
    if dupe:
        print(f"warning: ignoring duplicate file {path}")
    elif not len(src):
        print(f"warning: ignoring empty file {path}")
    else:
        program = CLgenProgram(
            hash=hash_,
            runtime=len(src) / CLGEN_INFERENCE_CPS,
            src=src,
            linecount=len(src.split('\n')),
            cl_launchable=cl_launchable)
        session.add(program)
        session.commit()

        # Make test harnesses, if required
        if harnesses:
            env = cldrive.make_env()
            for params in harnesses:
                testcase = get_or_create(
                    session, CLgenTestCase,
                    program_id=program.id, params_id=params.id)
                session.flush()
                clgen_mkharness.mkharness(session, env, testcase)

    if delete:
        fs.rm(path)
def ResolveContentId(config: corpus_pb2.Corpus,
                     hc: typing.Optional[hashcache.HashCache] = None) -> str:
    """Compute the hash of the input contentfiles.

    This function resolves the unique sha1 checksum of a set of content files.

    Args:
      config: The corpus config proto.
      hc: A hashcache database instance, used for resolving directory hashes.
        If the corpus has the pre_encoded_corpus_url field set, this may be
        omitted.

    Returns:
      A hex encoded sha1 string.
    """
    # We can take a massive shortcut if the content ID is already set in the
    # config proto.
    if config.HasField('content_id'):
        # TODO(github.com/ChrisCummins/phd/issues/46): Refactor this after
        # splitting out Corpus class.
        return config.content_id
    elif config.HasField('pre_encoded_corpus_url'):
        # TODO(github.com/ChrisCummins/phd/issues/46): Refactor this after
        # splitting out Corpus class.
        return crypto.sha1_str(config.pre_encoded_corpus_url)

    start_time = time.time()
    if config.HasField('local_directory'):
        local_directory = ExpandConfigPath(
            config.local_directory, path_prefix=FLAGS.clgen_local_path_prefix)

        # After the first time we compute the hash of a directory, we write it
        # into a file. This is a shortcut to work around the fact that
        # computing the directory checksum is O(n) with respect to the number
        # of files in the directory (even if the directory is already cached
        # by the hash cache). This means that it is the responsibility of the
        # user to delete this cached file if the directory is changed.
        hash_file_path = pathlib.Path(str(local_directory) + '.sha1.txt')
        if hash_file_path.is_file():
            app.Log(1, "Reading directory hash: '%s'.", hash_file_path)
            with open(hash_file_path) as f:
                content_id = f.read().rstrip()
        else:
            # No hash file, so compute the directory hash and create it.
            try:
                content_id = hc.GetHash(local_directory)
            except FileNotFoundError as e:
                raise errors.UserError(e)
            # Create the hash file in the directory so that next time we don't
            # need to reference the hash cache.
            with open(hash_file_path, 'w') as f:
                print(content_id, file=f)
            app.Log(1, "Wrote directory hash: '%s'.", hash_file_path)
    elif config.HasField('local_tar_archive'):
        # This is not an efficient means of getting the hash, as it requires
        # always unpacking the archive and reading the entire contents. It
        # would be nicer to maintain a cache which maps the mtime of tarballs
        # to their content ID, similar to how local_directory is implemented.
        content_id = GetHashOfArchiveContents(
            ExpandConfigPath(
                config.local_tar_archive,
                path_prefix=FLAGS.clgen_local_path_prefix))
    else:
        raise NotImplementedError('Unsupported Corpus.contentfiles field value')

    app.Log(2, 'Resolved Content ID %s in %s ms.', content_id,
            humanize.Commas(int((time.time() - start_time) * 1000)))
    return content_id
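# A condensed sketch of the directory-hash shortcut in ResolveContentId()
# above: the first computed hash is cached beside the directory in a
# '<dir>.sha1.txt' file, and subsequent calls read that file instead of
# re-hashing every contentfile. compute_directory_hash is a hypothetical
# stand-in for hashcache.HashCache.GetHash().
import pathlib
from typing import Callable

def cached_directory_hash(directory: pathlib.Path,
                          compute_directory_hash: Callable) -> str:
    hash_file = pathlib.Path(str(directory) + ".sha1.txt")
    if hash_file.is_file():
        return hash_file.read_text().rstrip()       # cache hit: one file read
    content_id = compute_directory_hash(directory)  # cache miss: O(n) in files
    hash_file.write_text(content_id + "\n")         # user must delete on change
    return content_id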
runtime, status, stdout, stderr = drive_testcase(
    session, testcase, env, platform_id, device_id)

# assert that executed params match expected
if stderr != '<-- UTF-ERROR -->':
    verify_params(platform=args.platform,
                  device=args.device,
                  optimizations=testcase.params.optimizations,
                  global_size=testcase.params.gsize,
                  local_size=testcase.params.lsize,
                  stderr=stderr)

# create new result
stdout_ = util.escape_stdout(stdout)
stdout = get_or_create(
    session, CLgenStdout,
    hash=crypto.sha1_str(stdout_), stdout=stdout_)

stderr_ = util.escape_stderr(stderr)
stderr = get_or_create(
    session, CLgenStderr,
    hash=crypto.sha1_str(stderr_), stderr=stderr_)
session.flush()

result = CLgenResult(
    testbed_id=testbed.id,
    testcase_id=testcase.id,
    status=status,
    runtime=runtime,
    stdout_id=stdout.id,
    stderr_id=stderr.id,
    outcome=analyze.get_cldrive_outcome(status, runtime, stderr_))
#!/usr/bin/env python3.6
import sys

from progressbar import ProgressBar

from labm8 import crypto
from labm8 import fs

if __name__ == "__main__":
    inpath = sys.argv[1]
    outdir = sys.argv[2]
    print(f"reading from {inpath} into {outdir}")

    assert fs.isfile(inpath)
    assert not fs.exists(outdir) or fs.isdir(outdir)
    fs.mkdir(outdir)

    with open(inpath) as infile:
        text = infile.read()

    kernels = text.split("// ==== START SAMPLE ====")
    kernels = [kernel.strip() for kernel in kernels if kernel.strip()]
    print(len(kernels), "kernels")

    sha1s = [crypto.sha1_str(kernel) for kernel in kernels]
    for kernel, sha1 in ProgressBar()(list(zip(kernels, sha1s))):
        with open(f"{outdir}/{sha1}.txt", "w") as outfile:
            print(kernel, file=outfile)
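# A toy demonstration of the marker-splitting performed by the script above,
# with a fabricated two-sample input. Empty fragments (e.g. any text before
# the first marker) are discarded by the strip() filter.
text = """// ==== START SAMPLE ====
kernel void A() {}
// ==== START SAMPLE ====
kernel void B() {}
"""
kernels = [k.strip() for k in text.split("// ==== START SAMPLE ====")
           if k.strip()]
assert kernels == ["kernel void A() {}", "kernel void B() {}"]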
def fetch_repos(db_path: Path, indir: Path, lang: clgen.Language) -> None:
    db = dbutil.connect(db_path)

    if not dbutil.is_github(db):
        raise clgen.UserError("not a GitHub database")

    c = db.cursor()

    for directory in fs.ls(indir, abspaths=True):
        # hacky hardcoded interpretation of `git remote -v`
        gitdir = fs.path(directory, ".git")
        output = subprocess.check_output(
            ["git", "--git-dir", gitdir, "remote", "-v"],
            universal_newlines=True)
        url = output.split("\n")[0].split("\t")[1].split(" ")[0]
        name = fs.basename(directory)

        output = subprocess.check_output(
            f"git --git-dir {gitdir} rev-list --format=format:'%ai' " +
            f"--max-count=1 $(git --git-dir {gitdir} rev-parse HEAD) | tail -n1",
            shell=True, universal_newlines=True)
        try:
            updated_at = dateutil.parser.parse(output)
        except ValueError:
            log.error(f"failed to process {name} {url}")
            continue

        c.execute("SELECT updated_at FROM Repositories WHERE url=?", (url,))
        cached_updated_at = c.fetchone()

        # Do nothing unless updated timestamps don't match
        # if cached_updated_at and cached_updated_at[0] >= updated_at:
        #     log.verbose(name, "already in database")
        #     continue

        c.execute("DELETE FROM Repositories WHERE url=?", (url,))
        c.execute("INSERT INTO Repositories VALUES(?,?,?,?,?,?,?,?,?)",
                  (url, "<unknown>", name, 0, 0, 0, 0, updated_at, updated_at))

        name_str = " -o ".join(
            [f"-name '*{ext}'" for ext in clgen.file_extensions(lang)])
        output = subprocess.check_output(
            f"find {directory} -type f {name_str} | grep -v '.git/' || true",
            shell=True, universal_newlines=True)
        files = [x.strip() for x in output.split("\n") if x.strip()]

        # nothing to import
        if not len(files):
            # log.verbose("no files in", name)
            continue

        log.verbose("processing", len(files), "files in", name)
        for path in files:
            relpath = path[len(directory) + 1:]
            try:
                contents = inline_fs_headers(path, [], lang=lang)
                sha = crypto.sha1_str(contents)
                c.execute('INSERT OR IGNORE INTO ContentFiles VALUES(?,?)',
                          (sha, contents))
                c.execute(
                    "INSERT OR IGNORE INTO ContentMeta VALUES(?,?,?,?,?)",
                    (sha, relpath, url, sha, len(contents)))
            except UnicodeDecodeError:
                log.warning("non UTF-8 file", path)

        db.commit()
        c = db.cursor()
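# A self-contained illustration of the "hacky hardcoded interpretation of
# `git remote -v`" in fetch_repos() above: the URL is the second
# tab-separated field of the first output line, with the trailing
# "(fetch)"/"(push)" annotation dropped. The output string mimics git's
# format but is fabricated here.
output = ("origin\thttps://github.com/ChrisCummins/clgen.git (fetch)\n"
          "origin\thttps://github.com/ChrisCummins/clgen.git (push)\n")
url = output.split("\n")[0].split("\t")[1].split(" ")[0]
print(url)  # -> https://github.com/ChrisCummins/clgen.git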
def hash_key(key):
    """
    Convert a key to a filename by hashing its value.
    """
    return crypto.sha1_str(json.dumps(key, sort_keys=True))
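# Usage sketch for hash_key() above: serializing with sort_keys=True gives a
# canonical JSON string, so logically-equal dicts map to the same cache
# filename regardless of insertion order. hashlib stands in for labm8's
# crypto.sha1_str().
import hashlib
import json

def hash_key_sketch(key) -> str:
    serialized = json.dumps(key, sort_keys=True)
    return hashlib.sha1(serialized.encode("utf-8")).hexdigest()

assert hash_key_sketch({"a": 1, "b": 2}) == hash_key_sketch({"b": 2, "a": 1})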
while True:
    # get the next batch of programs to run
    if not len(inbox):
        next_batch()

    # we have no programs to run
    if not len(inbox):
        break

    # get next program to run
    program = inbox.popleft()

    status, runtime, stderr_ = build_with_clang(program, clang)

    # create new result
    hash_ = crypto.sha1_str(stderr_)
    q = s.query(tables.clang_stderrs.id) \
        .filter(tables.clang_stderrs.hash == hash_) \
        .first()
    if q:
        stderr_id = q[0]
    else:
        stderr_id = create_stderr(s, tables, stderr_).id

    result = tables.clangs(
        program_id=program.id,
        clang=args.clang,
        status=status,
        runtime=runtime,
        stderr_id=stderr_id)
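# The query-then-create dance on clang_stderrs above is a get-or-create keyed
# on the stderr hash. A generic SQLAlchemy sketch of that pattern follows
# (the model is assumed to expose a `hash` column; this helper is
# hypothetical, not the create_stderr() used above):
def get_or_create_by_hash(session, model, hash_, **kwargs):
    instance = session.query(model).filter(model.hash == hash_).first()
    if instance is None:
        instance = model(hash=hash_, **kwargs)
        session.add(instance)
        session.flush()  # assign an id without committing the transaction
    return instance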
runtime, status, stdout, stderr = cl_launcher(
    program.src, platform_id, device_id, *flags)

# assert that executed params match expected
verify_params(platform=platform_name,
              device=device_name,
              optimizations=params.optimizations,
              global_size=params.gsize,
              local_size=params.lsize,
              stderr=stderr)

# create new result
stdout_ = util.escape_stdout(stdout)
stdout = get_or_create(
    session, CLSmithStdout,
    hash=crypto.sha1_str(stdout_), stdout=stdout_)

stderr_ = util.escape_stderr(stderr)
stderr = get_or_create(
    session, CLSmithStderr,
    hash=crypto.sha1_str(stderr_), stderr=stderr_)
session.flush()

result = CLSmithResult(
    testbed_id=testbed.id,
    testcase_id=testcase.id,
    status=status,
    runtime=runtime,
    stdout_id=stdout.id,
    stderr_id=stderr.id,
with Session(commit=False) as s:
    def flush():
        if args.commit:
            s.commit()
            while len(to_del):
                fs.rm(to_del.popleft())

    print("Importing CLgen programs ...")
    paths = [p for p in Path("export/clgen/program").iterdir()]
    for i, path in enumerate(ProgressBar()(paths)):
        with open(path) as infile:
            data = json.loads(infile.read())

        new_id = s.query(CLgenProgram.id) \
            .filter(CLgenProgram.hash == crypto.sha1_str(data["src"])) \
            .scalar()

        idx = CLgenProgramTranslation(old_id=data["id"], new_id=new_id)
        s.add(idx)
        to_del.append(path)

        if i and not i % 1000:
            flush()
    flush()

    PROGRAMS = dict((old_id, new_id) for old_id, new_id in s.query(
        CLgenProgramTranslation.old_id,
        CLgenProgramTranslation.new_id).all())

    print("Import CLgen results ...")
    paths = [p for p in Path("export/clgen/result").iterdir()]