def main():
    import sys

    log.init(verbose=True)

    m = model.from_json(clgen.load_json_file(sys.argv[1]))
    # Sampler with a fixed four-argument kernel signature.
    s = sampler.from_json({
        "kernels": {
            "args": [
                "__global float*",
                "__global float*",
                "__global float*",
                "const int"
            ],
            "max_length": 5000,
            "temperature": 1
        },
        "sampler": {
            "batch_size": 1000,
            "max_batches": 1,
            "static_checker": False,
            "dynamic_checker": False
        }
    })

    print("Corpus size:", m.corpus.size)
    print("Vocab size: ", m.corpus.vocab_size)
    print()
    clgen.platform_info()
    print()

    outpath = "./benchmark-" + fs.basename(sys.argv[1])
    info = evaluate(m, s)
    clgen.write_file(outpath, clgen.format_json(info))
def read_file(*components, **kwargs):
    """
    Load a JSON data blob.

    Arguments:
        path (str): Path to file.
        must_exist (bool, optional): If False, return empty dict if file
            does not exist.

    Returns:
        array or dict: JSON data.

    Raises:
        File404: If path does not exist, and must_exist is True.
        InvalidFile: If JSON is malformed.
    """
    must_exist = kwargs.get("must_exist", True)

    if must_exist:
        path = fs.must_exist(*components)
    else:
        path = fs.path(*components)

    try:
        with open(path) as infile:
            return loads(infile.read())
    except ValueError as e:
        raise ValueError(
            "malformed JSON file '{path}'. Message from parser: {err}".format(
                path=fs.basename(path), err=str(e)))
    except IOError:
        if not must_exist:
            return {}
        else:
            raise
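# A minimal usage sketch (not part of the original module): the path and the
# "verbose" key below are hypothetical, chosen only to illustrate the
# must_exist behaviour of read_file() above.
config = read_file("~/.config/myapp.json", must_exist=False)
verbose = config.get("verbose", False)  # empty dict returned when the file is absent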
def main():
    log.init(verbose=True)

    m = model.from_json(clgen.load_json_file(sys.argv[1]))
    c = corpus.Corpus.from_json({"path": "~/data/github"})

    print("CLgen: ", clgen.version())
    print("Corpus size:", c.size)
    print("Vocab size: ", c.vocab_size)

    m.train()

    # Sample the model once for each of the 20 most common kernel prototypes
    # in the corpus, writing each result to its own file.
    p, _ = corpus.most_common_prototypes(c, 20)
    for i, row in enumerate(p):
        outpath = "./inference-p" + str(i + 1) + "-" + fs.basename(sys.argv[1])
        if fs.exists(outpath):
            continue

        _, prototype = row
        argspec = [' '.join(x.split()[:-1]) for x in prototype.split(',')]
        print("argspec", ','.join([str(x) for x in argspec]))
        s = sampler.from_json({
            "kernels": {
                "args": argspec,
                "max_length": 5000
            },
            "sampler": {
                "batch_size": 2000,
                "max_batches": 1,
                "static_checker": False,
                "dynamic_checker": False
            }
        })

        info = evaluate(m, s)
        clgen.write_file(outpath, clgen.format_json(info))
def _msg(i, x):
    n = i + 1
    filename = fs.basename(x[0])
    lineno = x[1]
    fnname = x[2]

    loc = "{filename}:{lineno}".format(**vars())
    return " #{n} {loc: <18} {fnname}()".format(**vars())
def _user_message_with_stacktrace(exception):
    # get limited stack trace
    _, _, tb = sys.exc_info()
    trace = "\n".join(
        " {file}:{ln}:{fn}".format(
            file=fs.basename(x[0]), ln=x[1], fn=x[2])
        for x in traceback.extract_tb(tb, limit=5)[1:])

    log.fatal("""\
{err} ({type})

stacktrace:
{stack_trace}

Please report bugs at <https://github.com/ChrisCummins/clgen/issues>\
""".format(err=exception, type=type(exception).__name__, stack_trace=trace))
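# A minimal sketch (assumed wiring, not from the original source): the helper
# above relies on sys.exc_info(), so it must be called from inside an except
# block, e.g. around a program's entry point. The names run() and entry_point
# are hypothetical.
def run(entry_point):
    try:
        entry_point()
    except Exception as exception:
        _user_message_with_stacktrace(exception)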
def merge(old_oracle, dbs, path):
    """
    Merge databases into one.

    Arguments:
        old_oracle (str): Path to existing oracle database to copy from.
        dbs (list of Database): Databases to merge.
        path (str): Path to merged database.

    Returns:
        Database: merged database instance.
    """
    print("Merging {n} databases:".format(n=len(dbs) + 1))
    print("  ", old_oracle)
    for db in dbs:
        print("  ", db)
    print()

    # Make a copy of the old oracle database to work from.
    io.info("Copying", old_oracle, "->", fs.basename(path))
    fs.cp(old_oracle, path)

    target = migrate(_db.Database(path=path))

    for db in dbs + [target]:
        try:
            db.num_rows("runtimes")
        except sqlite3.DatabaseError as e:
            io.error("Broken db:", db.path)
            io.fatal(e)

    num_runtimes = [db.num_rows("runtimes") for db in dbs]
    expected_total = target.num_rows("runtimes") + sum(num_runtimes)

    target.merge(dbs)

    total = target.num_rows("runtimes")
    if total != expected_total:
        io.fatal("Expected total", expected_total, "!= actual total", total)

    io.info("Merged {num_db} databases, {n} rows".format(
        num_db=len(dbs), n=total))

    return target
def compile_cpp_code(code):
    """
    Compile C++ code to a dynamic library.

    Arguments:
        code (str): C++ code.

    Returns:
        str: Path to binary.
    """
    bincache = cache.FSCache(fs.path("~/.cache/visioncpp"))

    if bincache.get(code):
        logging.info("Found cached binary {}".format(
            fs.basename(bincache[code])))
    else:
        check_for_computecpp()

        counter = {"val": 0}

        def progress(msg):
            text = "{}: {}".format(counter["val"], msg) if msg else ""
            if logging.getLogger().getEffectiveLevel() <= logging.INFO:
                end = "\n"
            else:
                end = ""
            print("\r\033[K {}".format(text), end=end)
            counter["val"] += 1
            sys.stdout.flush()

        tmpdir = mkdtemp(prefix="visioncpp-")
        try:
            progress("compiling device code ...")
            stub = stub_file(code, dir=tmpdir)
            progress("compiling host code ...")
            host = host_compile(code, stub, dir=tmpdir)
            progress("linking executable ...")
            tmpbin = link(host, dir=tmpdir)
            progress("")
            bincache[code] = tmpbin
        except Exception as e:
            rmtree(tmpdir)
            raise e
        rmtree(tmpdir)

    return bincache[code]
def from_bin(path: Path = "gslang",
             session: session_t = None) -> List['Testbed']:
    import cldrive
    with ReuseSession(session) as s:
        basename = fs.basename(path)
        version = Testbed._get_version(path)
        platform = get_or_add(
            s, Platform,
            platform=basename, version=version, host=cldrive.host_os())
        s.flush()
        return [
            get_or_add(s, Testbed,
                       platform_id=platform.id, optimizations=True),
        ]
def file(path: str, **kwargs):
    """
    Drive an OpenCL kernel file.

    Arguments:
        path (str): Path to file.
        **kwargs (dict, optional): Arguments to kernel()
    """
    with open(path) as infile:
        src = infile.read()

    kernels = clutil.get_cl_kernels(src)

    # error if there's no kernels
    if not len(kernels):
        if kwargs.get("fatal_errors", False):
            raise E_BAD_CODE("no kernels in file '{}'".format(path))
        else:
            print(path, "-", "E_BAD_CODE", '-', sep=',', file=sys.stderr)

    # execute all kernels in file
    for kernelsrc in kernels:
        kernel(kernelsrc, filename=fs.basename(path), **kwargs)
def _main() -> None:
    cache = clgen.cachepath()

    log.warning("Not Implemented: refresh corpuses")

    if fs.isdir(cache, "model"):
        cached_modeldirs = fs.ls(fs.path(cache, "model"), abspaths=True)
        for cached_modeldir in cached_modeldirs:
            cached_model_id = fs.basename(cached_modeldir)
            cached_meta = jsonutil.read_file(fs.path(cached_modeldir, "META"))

            model = clgen.Model.from_json(cached_meta)

            if cached_model_id != model.hash:
                log.info(cached_model_id, '->', model.hash)

                if fs.isdir(model.cache.path):
                    log.fatal("cache conflict", file=sys.stderr)

                fs.mv(cached_modeldir, model.cache.path)

    log.warning("Not Implemented: refresh samplers")
def main():
    log.init(verbose=True)

    m = model.from_json(clgen.load_json_file(sys.argv[1]))
    c = corpus.Corpus.from_json({"path": "~/data/github"})

    print("CLgen: ", clgen.version())
    print("Corpus size:", c.size)
    print("Vocab size: ", c.vocab_size)

    m.train()

    p, _ = corpus.most_common_prototypes(c, 20)
    for i, row in enumerate(p):
        outpath = "./inference-p" + str(i + 1) + "-" + fs.basename(sys.argv[1])
        if fs.exists(outpath):
            print("skipped result for", outpath)
            continue
        else:
            print("starting result for", outpath)

        _, prototype = row
        argspec = [' '.join(x.split()[:-1]) for x in prototype.split(',')]
        print("argspec", ','.join([str(x) for x in argspec]))
        s = sampler.from_json({
            "kernels": {
                "args": argspec,
                "max_length": 5000
            },
            "sampler": {
                "batch_size": 2000,
                "max_batches": 1,
                "static_checker": False,
                "dynamic_checker": False
            }
        })

        info = evaluate(m, s)
        clgen.write_file(outpath, clgen.format_json(info))
def features_dir(csv_path):
    return fs.basename(fs.dirname(csv_path))
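# Illustration only (hypothetical path), assuming labm8's fs.dirname() and
# fs.basename() behave like their os.path counterparts: the helper above
# returns the name of the directory that contains the CSV file.
assert features_dir("/data/features/amd-tahiti/kernels.csv") == "amd-tahiti"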
def test_basename(self):
    self._test("foo", fs.basename("foo"))
    self._test("foo", fs.basename(fs.abspath("foo")))
def test_basename():
    assert "foo" == fs.basename("foo")
    assert "foo" == fs.basename(fs.abspath("foo"))
def test_must_exist():
    with tempfile.NamedTemporaryFile(prefix='labm8_') as f:
        assert fs.must_exist(f.name) == f.name
        assert fs.must_exist(fs.dirname(f.name), fs.basename(f.name)) == f.name
    with pytest.raises(fs.File404):
        fs.must_exist("/not/a/real/path")
                    type=str, default="cc1",
                    help="MySQL database hostname")
args = parser.parse_args()

db.init(args.hostname)
with Session(commit=False) as s:
    # Export results
    print("Exporting CLgen results ...")
    fs.mkdir("export/clgen/result")

    # Pick up where we left off
    done = set([
        int(fs.basename(path))
        for path in Path("export/clgen/result").iterdir()
    ])
    print(len(done), "done")
    ids = set([x[0] for x in s.query(CLgenResult.id).all()])
    print(len(ids), "in total")
    todo = ids - done
    print(len(todo), "todo")

    for result_id in ProgressBar()(todo):
        result = s.query(CLgenResult).filter(
            CLgenResult.id == result_id).scalar()

        with open(f"export/clgen/result/{result.id}", "w") as outfile:
            print(json.dumps({
                "id":
def fetch_repos(db_path: Path, indir: Path, lang: clgen.Language) -> None:
    db = dbutil.connect(db_path)

    if not dbutil.is_github(db):
        raise clgen.UserError("not a GitHub database")

    c = db.cursor()

    for directory in fs.ls(indir, abspaths=True):
        # hacky hardcoded interpretation of `git remote -v`
        gitdir = fs.path(directory, ".git")
        output = subprocess.check_output(
            ["git", "--git-dir", gitdir, "remote", "-v"],
            universal_newlines=True)
        url = output.split("\n")[0].split("\t")[1].split(" ")[0]
        name = fs.basename(directory)

        output = subprocess.check_output(
            f"git --git-dir {gitdir} rev-list --format=format:'%ai' " +
            f"--max-count=1 $(git --git-dir {gitdir} rev-parse HEAD) | tail -n1",
            shell=True, universal_newlines=True)
        try:
            updated_at = dateutil.parser.parse(output)
        except ValueError:
            log.error(f"failed to process {name} {url}")
            continue

        c.execute("SELECT updated_at FROM Repositories WHERE url=?", (url, ))
        cached_updated_at = c.fetchone()

        # Do nothing unless updated timestamps don't match
        # if cached_updated_at and cached_updated_at[0] >= updated_at:
        #     log.verbose(name, "already in database")
        #     continue

        c.execute("DELETE FROM Repositories WHERE url=?", (url, ))
        c.execute("INSERT INTO Repositories VALUES(?,?,?,?,?,?,?,?,?)",
                  (url, "<unknown>", name, 0, 0, 0, 0, updated_at, updated_at))

        name_str = " -o ".join(
            [f"-name '*{ext}'" for ext in clgen.file_extensions(lang)])
        output = subprocess.check_output(
            f"find {directory} -type f {name_str} | grep -v '.git/' || true",
            shell=True, universal_newlines=True)
        files = [x.strip() for x in output.split("\n") if x.strip()]

        # nothing to import
        if not len(files):
            # log.verbose("no files in", name)
            continue

        log.verbose("processing", len(files), "files in", name)
        for path in files:
            relpath = path[len(directory) + 1:]
            try:
                contents = inline_fs_headers(path, [], lang=lang)
                sha = crypto.sha1_str(contents)
                c.execute('INSERT OR IGNORE INTO ContentFiles VALUES(?,?)',
                          (sha, contents))
                c.execute(
                    "INSERT OR IGNORE INTO ContentMeta VALUES(?,?,?,?,?)",
                    (sha, relpath, url, sha, len(contents)))
            except UnicodeDecodeError:
                log.warning("non UTF-8 file", path)

        db.commit()
        c = db.cursor()
def load_data_desc(platform, source="B", max_seq_len=1000,
                   atomizer=CharacterAtomizer, quiet=False):
    """ load experimental results """
    def get_benchmarks(platform):
        B = pd.read_csv(
            fs.path("runtimes/{platform}-benchmarks.csv".format(**vars())))
        B["source"] = [escape_suite_name(x) for x in B["benchmark"]]
        B["synthetic"] = [0] * len(B)
        return B

    def get_npb_benchmarks(platform):
        B = get_benchmarks(platform)
        msk = B["source"] == "NPB"
        return B[msk]

    def get_synthetics(platform):
        S = pd.read_csv(fs.path("runtimes/{platform}-clgen.csv".format(**vars())))
        S["source"] = ["CLgen"] * len(S)
        S["synthetic"] = [1] * len(S)
        return S

    if source == "B":
        dataframe = get_benchmarks(platform)
    elif source == "S":
        dataframe = get_synthetics(platform)
    elif source == "BS":
        dataframe = pd.concat((get_benchmarks(platform), get_synthetics(platform)))
    elif source == "N":
        dataframe = get_npb_benchmarks(platform)
    elif source == "NS":
        dataframe = pd.concat(
            (get_npb_benchmarks(platform), get_synthetics(platform)))
    else:
        raise Exception

    dataframe["oracle_enc"] = [1 if x == "GPU" else 0
                               for x in dataframe["oracle"].values]
    dataframe["benchmark_name"] = [escape_benchmark_name(b)
                                   for b in dataframe["benchmark"].values]

    # load source code:
    source_dir = fs.path("kernels")
    srcs, benchmark_names = [], []
    for row in dataframe["benchmark"].values:
        inpath = fs.path(source_dir, row + ".cl")
        with open(inpath) as infile:
            src = infile.read()
        if not src.startswith("__kernel void A"):
            print(fs.basename(inpath))
            raise Exception(src)
        srcs.append(src)
    dataframe["src"] = srcs
    dataframe["src_len"] = [len(s) for s in srcs]

    if not quiet:
        print("num instances {} ({} synthetic, {} benchmarks)".format(
            len(dataframe), sum(dataframe["synthetic"].values),
            len(dataframe) - sum(dataframe["synthetic"].values)))
        print("unique kernels", len(set(srcs)))

    # encode and pad sequences:
    atomizer = atomizer.from_text(''.join(dataframe["src"].values))

    seqs = [atomizer.atomize(seq) for seq in dataframe["src"].values]
    seq_length = min(max(len(s) for s in seqs), max_seq_len)
    pad_val = atomizer.vocab_size + 1
    dataframe["seq_len"] = [len(s) for s in seqs]
    dataframe["seq"] = list(pad_sequences(seqs, maxlen=seq_length, value=pad_val))

    if not quiet:
        print("vocab size", atomizer.vocab_size + 1)
        print("pad val", pad_val)
        print("padded seq length", seq_length)

    return {
        "dataframe": dataframe,
        "seq_length": seq_length,
        "atomizer": atomizer
    }
def inline_fs_headers(path: Path, stack: List[str],
                      lang: clgen.Language = clgen.Language.OPENCL,
                      topdir: Path = None) -> str:
    """
    Recursively inline headers in file.

    Parameters
    ----------
    path : str
        File.
    stack : List[str]
        File stack.
    topdir : Path
        The top level directory to stop searching for includes in.

    Returns
    -------
    str
        Inlined file.
    """
    stack.append(path)

    if topdir is None:
        topdir = fs.dirname(path)
    # shell escaped top directory
    escp_topdir = topdir.replace('"', '\\"')

    include_re = clgen.include_regexp(lang)

    with open(path, encoding="utf-8") as infile:
        src = infile.read()

    outlines = []
    for line in src.split('\n'):
        match = re.match(include_re, line)
        if match:
            # We have an import to inline!
            include = match.group("path")

            # Search for files with that name in the repository
            include_basename = fs.basename(include)
            esc_basename = include_basename.replace('"', '\\"')
            candidates = [x for x in subprocess.check_output(
                f'find "{escp_topdir}" -type f -name {esc_basename}',
                shell=True, universal_newlines=True)\
                .split('\n') if x]

            # Select which file to inline:
            if len(candidates) == 1:
                # If there's exactly one match, then we're done:
                file_to_inline = candidates[0]
            elif len(candidates) > 1:
                # We have multiple candidates to inline, so we'll compare the
                # full paths (relative to the top directory) to select the one
                # whose name is the closest match:
                rel_matches = [match[len(topdir) + 1:] for match in candidates]
                distances = [
                    editdistance.eval(include, path) for path in rel_matches
                ]
                min_distance = min(distances)
                file_to_inline = candidates[distances.index(min_distance)]
                log.debug(
                    f"Inferred include '{file_to_inline}' from '{line}' with distance {min_distance}"
                )
            else:
                # We didn't find anything suitable:
                file_to_inline = None

            # Process the inline file:
            if file_to_inline in stack:
                # We've already inlined this file, so ignore it:
                outlines.append(
                    clgen.format_as_comment(
                        lang, f'[FETCH] ignored_include({line})'))
            elif file_to_inline:
                # Inline the file by recursively expanding its contents:
                outlines.append(
                    clgen.format_as_comment(
                        lang, f'[FETCH] begin_include({line})'))
                inline_src = inline_fs_headers(file_to_inline, stack)
                outlines.append(inline_src)
                outlines.append(
                    clgen.format_as_comment(
                        lang, f'[FETCH] end_include({line})'))
            else:
                # We didn't find anything suitable, so keep the original
                # include:
                outlines.append(
                    clgen.format_as_comment(
                        lang, f'[FETCH] not_found({line})'))
                outlines.append(line)
        else:
            outlines.append(line)

    return '\n'.join(outlines)