def load_and_test(model_desc, platform, source, atomizer="CharacterAtomizer",
                  maxlen=1024, n_splits=10, split_i=0, seed=204):
    np.random.seed(seed)
    name = model_desc["name"]
    inpath = "models/{name}/{platform}-{source}-{atomizer}:{maxlen}-{seed}-{n_splits}-{split_i}.model".format(**vars())
    outpath = "models/{name}/{platform}-{source}-{atomizer}:{maxlen}-{seed}-{n_splits}-{split_i}.result".format(**vars())

    if fs.exists(outpath):
        return load_result(model_desc, platform, source, n_splits=n_splits,
                           split_i=split_i, atomizer=atomizer, maxlen=maxlen,
                           seed=seed)
    if not fs.exists(inpath):
        return False

    test_fn = model_desc["test_fn"]
    load_fn = model_desc["load_fn"]

    # load training data
    _atomizer = globals().get(atomizer)
    data_desc = load_data_desc(platform=platform, source=source,
                               max_seq_len=maxlen, atomizer=_atomizer,
                               quiet=True)
    train, test = get_training_data(data_desc, seed=seed, split_i=split_i,
                                    n_splits=n_splits)

    # load model
    model = load_fn(inpath)
    print("model loaded from", inpath)

    # test model
    predictions = test_fn(model=model, test=test, seed=seed)
    analysis = analyze(predictions, test)
    test.update(analysis)
    test["predictions"] = predictions

    with open(outpath, 'wb') as outfile:
        pickle.dump(test, outfile)
    print("result saved to", outpath)

    return test
def test_mv():
    system.echo("Hello, world!", "/tmp/labm8.tmp")
    assert ["Hello, world!"] == fs.read("/tmp/labm8.tmp")

    # Cleanup any existing file.
    fs.rm("/tmp/labm8.tmp.copy")
    assert not fs.exists("/tmp/labm8.tmp.copy")

    fs.mv("/tmp/labm8.tmp", "/tmp/labm8.tmp.copy")
    assert ["Hello, world!"] == fs.read("/tmp/labm8.tmp.copy")
    assert not fs.exists("/tmp/labm8.tmp")
def main():
    parser = ArgumentParser(description=__description__)
    parser.add_argument("classification")
    parser.add_argument("outdir")
    args = parser.parse_args()

    db.init("cc1")
    session = db.make_session()

    program_ids = [
        x[0] for x in session.query(sql.distinct(CLSmithResult.program_id))
            .filter(CLSmithResult.classification == args.classification).all()]

    header = fs.read_file(dsmith.data_path("include", "clsmith.h"))

    fs.mkdir(args.outdir)
    for program_id in ProgressBar()(program_ids):
        outpath = fs.path(args.outdir, program_id + ".cl")
        if not fs.exists(outpath):
            program = session.query(CLSmithProgram) \
                .filter(CLSmithProgram.id == program_id).one()
            # Inline the CLSmith header in place of the #include directive.
            pre, post = program.src.split('#include "CLSmith.h"')
            inlined = pre + header + post
            with open(outpath, "w") as outfile:
                print(inlined, file=outfile)
def __init__(self, path, basecache=None):
    """
    Create a new JSON cache.

    Optionally supports populating the cache with values of an
    existing cache.

    Arguments:
        path (str): Path to the JSON file which backs this cache.
        basecache (TransientCache, optional): Cache to populate this new
          cache with.
    """
    super(JsonCache, self).__init__()
    self.path = fs.abspath(path)

    if fs.exists(self.path) and fs.read_file(self.path):
        io.debug("Loading cache '{0}'".format(self.path))
        with open(self.path) as file:
            self._data = json.load(file)

    if basecache is not None:
        for key, val in basecache.items():
            self._data[key] = val

    # Register exit handler
    atexit.register(self.write)
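# A minimal usage sketch for JsonCache. It assumes the dict-style accessors
# (__getitem__/__setitem__) inherited from the parent TransientCache class;
# the path below is purely illustrative.
def _jsoncache_example():
    cache = JsonCache("/tmp/example-cache.json")  # loads existing entries, if any
    cache["vocab_size"] = 92                      # held in memory in self._data
    assert cache["vocab_size"] == 92
    # The atexit handler registered in __init__ calls cache.write() at
    # interpreter exit, flushing self._data back to the JSON file.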
def assert_program_exists(path):
    """
    Assert that a program exists.

    If the given path does not exist or is not a file, raises
    ProgramNotFoundError.
    """
    if not fs.exists(path) or not fs.isfile(path):
        raise ProgramNotFoundError(path)
def test_cp_dir():
    fs.rm("/tmp/labm8")
    fs.rm("/tmp/labm8.copy")
    fs.mkdir("/tmp/labm8/foo/bar")
    assert not fs.exists("/tmp/labm8.copy")

    fs.cp("/tmp/labm8/", "/tmp/labm8.copy")
    assert fs.isdir("/tmp/labm8.copy")
    assert fs.isdir("/tmp/labm8.copy/foo")
    assert fs.isdir("/tmp/labm8.copy/foo/bar")
def test_cp_overwrite():
    system.echo("Hello, world!", "/tmp/labm8.tmp")
    assert ["Hello, world!"] == fs.read("/tmp/labm8.tmp")

    # Cleanup any existing file.
    fs.rm("/tmp/labm8.tmp.copy")
    assert not fs.exists("/tmp/labm8.tmp.copy")

    fs.cp("/tmp/labm8.tmp", "/tmp/labm8.tmp.copy")

    # Overwriting the source then copying again must propagate the new contents.
    system.echo("Goodbye, world!", "/tmp/labm8.tmp")
    fs.cp("/tmp/labm8.tmp", "/tmp/labm8.tmp.copy")
    assert fs.read("/tmp/labm8.tmp") == fs.read("/tmp/labm8.tmp.copy")
def __contains__(self, key):
    """
    Check cache contents.

    Arguments:
        key: Key.

    Returns:
        bool: True if key in cache, else False.
    """
    path = self.keypath(key)
    return fs.exists(path)
def test_scp():
    system.echo("Hello, world!", "/tmp/labm8.tmp")
    assert ["Hello, world!"] == fs.read("/tmp/labm8.tmp")

    # Cleanup any existing file.
    fs.rm("/tmp/labm8.tmp.copy")
    assert not fs.exists("/tmp/labm8.tmp.copy")

    # Perform scp.
    system.scp("localhost", "/tmp/labm8.tmp", "/tmp/labm8.tmp.copy",
               path="lib/labm8/data/test/bin")
    assert fs.read("/tmp/labm8.tmp") == fs.read("/tmp/labm8.tmp.copy")
def train_and_save(model_desc, platform, source, atomizer="CharacterAtomizer",
                   maxlen=1024, n_splits=10, split_i=0, seed=204):
    np.random.seed(seed)
    name = model_desc["name"]
    outpath = "models/{name}/{platform}-{source}-{atomizer}:{maxlen}-{seed}-{n_splits}-{split_i}.model".format(**vars())

    if not fs.exists(outpath):
        create_fn = model_desc.get("create_model", _nop)
        train_fn = model_desc.get("train_fn", _nop)
        save_fn = model_desc["save_fn"]
        _atomizer = globals().get(atomizer)

        # load training data
        data_desc = load_data_desc(platform=platform, source=source,
                                   max_seq_len=maxlen, atomizer=_atomizer)
        train, test = get_training_data(data_desc, seed=seed, split_i=split_i,
                                        n_splits=n_splits)

        # create model
        model = create_fn(seed=seed, data_desc=data_desc)

        # train model
        train_fn(model=model, train=train, seed=seed, platform=platform,
                 source=source)

        fs.mkdir("models/{name}".format(**vars()))
        save_fn(outpath, model)
        print("model saved as", outpath)

    # evaluate model
    return load_and_test(model_desc, platform, source, n_splits=n_splits,
                         split_i=split_i, atomizer=atomizer, maxlen=maxlen,
                         seed=seed)
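# The model_desc dict drives both train_and_save() and load_and_test(). A
# hedged sketch of its expected shape, inferred from the keys read above
# ("name", "save_fn", "load_fn", "test_fn", plus the optional "create_model"
# and "train_fn"); the callables below are illustrative stand-ins, not real
# implementations.
example_model_desc = {
    "name": "my-model",  # used to build the models/<name>/... paths
    "create_model": lambda seed, data_desc: ...,  # optional; defaults to _nop
    "train_fn": lambda model, train, seed, platform, source: ...,  # optional
    "save_fn": lambda path, model: ...,  # required: persist model to path
    "load_fn": lambda path: ...,         # required: restore model from path
    "test_fn": lambda model, test, seed: ...,  # required: return predictions
}
# A call such as train_and_save(example_model_desc, "amd", "B") (platform and
# source names hypothetical) would then train, save, and evaluate one
# cross-validation split.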
def __delitem__(self, key):
    """
    Delete cached file.

    Arguments:
        key: Key.

    Raises:
        KeyError: If file not in cache.
    """
    path = self.keypath(key)
    if fs.exists(path):
        fs.rm(path)
    else:
        raise KeyError(key)
def benchmark_inference(model_desc, platform, source,
                        atomizer="CharacterAtomizer", maxlen=1024,
                        n_splits=10, split_i=0, seed=204, n_runtimes=100):
    np.random.seed(seed)
    name = model_desc["name"]
    inpath = "models/{name}/{platform}-{source}-{atomizer}:{maxlen}-{seed}-{n_splits}-{split_i}.model".format(**vars())
    outpath = "models/{name}/{platform}-{source}-{atomizer}:{maxlen}-{seed}-{n_splits}-{split_i}.result".format(**vars())

    if not fs.exists(inpath):
        return False

    test_fn = model_desc["test_fn"]
    load_fn = model_desc["load_fn"]

    # load training data
    _atomizer = globals().get(atomizer)
    data_desc = load_data_desc(platform=platform, source=source,
                               max_seq_len=maxlen, atomizer=_atomizer,
                               quiet=True)
    train, test = get_training_data(data_desc, seed=seed, split_i=split_i,
                                    n_splits=n_splits)

    # load model
    model = load_fn(inpath)
    print("model loaded from", inpath)

    # test model, recording the per-sample inference time of each run
    runtimes = []
    for i in range(n_runtimes):
        start = time.time()
        predictions = test_fn(model=model, test=test, seed=seed)
        elapsed = (time.time() - start) / len(test["y"])
        runtimes.append(elapsed)

    return np.array(runtimes)
def test_LockFile_force_replace_stale():
    """Test that lockfile is replaced if forced."""
    with tempfile.TemporaryDirectory() as d:
        path = pathlib.Path(d) / 'LOCK'
        lock = lockfile.LockFile(path)
        MAX_PROCESSES = 4194303  # OS-dependent. This value is for Linux.
        lock.acquire(pid=MAX_PROCESSES + 1)
        assert lock.islocked
        assert not lock.owned_by_self
        with pytest.raises(lockfile.UnableToAcquireLockError):
            lock.acquire()
        lock.acquire(force=True)
        assert lock.islocked
        assert lock.owned_by_self
        lock.release()
        assert not fs.exists(lock.path)
def __setitem__(self, key, value):
    """
    Emplace file in cache.

    Arguments:
        key: Key.
        value (str): Path of file to insert in cache.

    Raises:
        ValueError: If the file at "value" does not exist.
    """
    if not fs.exists(value):
        raise ValueError(value)
    path = self.keypath(key)
    fs.mkdir(self.path)
    fs.mv(value, path)
def __getitem__(self, key):
    """
    Get path to file in cache.

    Arguments:
        key: Key.

    Returns:
        str: Path to cache value.

    Raises:
        KeyError: If key not in cache.
    """
    path = self.keypath(key)
    if fs.exists(path):
        return path
    else:
        raise KeyError(key)
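# Taken together, __contains__, __getitem__, __setitem__, and __delitem__
# give this file cache a dict-like protocol keyed through keypath(). A hedged
# usage sketch, assuming `cache` is an already-constructed instance of the
# surrounding class (its name and constructor are not shown above):
def _filecache_example(cache):
    with open("/tmp/payload.txt", "w") as f:
        f.write("hello")
    cache["payload"] = "/tmp/payload.txt"  # fs.mv() the file into the cache
    if "payload" in cache:                 # fs.exists() on keypath("payload")
        print(cache["payload"])            # path of the cached file
    del cache["payload"]                   # fs.rm() the cached file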
def load_result(model_desc, platform, source, atomizer="CharacterAtomizer",
                maxlen=1024, n_splits=10, split_i=0, seed=204):
    name = model_desc["name"]
    inpath = "models/{name}/{platform}-{source}-{atomizer}:{maxlen}-{seed}-{n_splits}-{split_i}.result".format(**vars())

    if not fs.exists(inpath):
        return False

    with open(inpath, 'rb') as infile:
        result = pickle.load(infile)

    return result
def main():
    log.init(verbose=True)
    m = model.from_json(clgen.load_json_file(sys.argv[1]))
    c = corpus.Corpus.from_json({"path": "~/data/github"})
    print("CLgen:      ", clgen.version())
    print("Corpus size:", c.size)
    print("Vocab size: ", c.vocab_size)

    m.train()

    p, _ = corpus.most_common_prototypes(c, 20)
    for i, row in enumerate(p):
        outpath = "./inference-p" + str(i + 1) + "-" + fs.basename(sys.argv[1])
        if fs.exists(outpath):
            print("skipped result for", outpath)
            continue
        else:
            print("starting result for", outpath)

        _, prototype = row
        argspec = [' '.join(x.split()[:-1]) for x in prototype.split(',')]
        print("argspec", ','.join([str(x) for x in argspec]))
        s = sampler.from_json({
            "kernels": {
                "args": argspec,
                "max_length": 5000
            },
            "sampler": {
                "batch_size": 2000,
                "max_batches": 1,
                "static_checker": False,
                "dynamic_checker": False
            }
        })
        info = evaluate(m, s)
        clgen.write_file(outpath, clgen.format_json(info))
def search(m, target_code, logpath, start_code=None):
    # resume search
    if fs.exists(logpath):
        log = clgen.load_json_file(logpath)
        print("resuming search of", len(get_steps(log)), "steps")
    else:
        log = []

    steps = get_steps(log)

    if start_code and not len(steps):
        code = start_code
    elif len(steps):
        code = steps[-1]['data']['code']
    else:
        code = get_start_code(m)

    target_features = get_features(target_code)
    features = get_features(code)
    distance = get_distance(target_features, features)

    if get_entries(log, "init"):
        init = get_entries(log, "init")[0]
        assert (init['data']['target_code'] == target_code)
        assert (init['data']['target_features'] == escape_features(
            target_features))
        # load history from log
        code_history = get_code_history(log)
    else:
        # create init entry
        add_to_log(log, {
            "start_code": code,
            "start_features": escape_features(features),
            "target_features": escape_features(target_features),
            "target_code": target_code,
            "distance": distance,
            "model": m.meta
        }, name="init")
        write_log(log, logpath)
        code_history = [code]

    # keep track of best
    if len(steps):
        best = steps[-1]['data']['best']
    else:
        best = {"distance": distance, "code": code, "improvement_count": 0}

    # maximum number of mutations before stopping search
    MAX_STEPS = 1000
    for i in range(len(steps), MAX_STEPS):
        print("step", i, "of", MAX_STEPS)
        newcode, mutate_idx, mutate_seed, attempts = get_mutation(m, code)
        try:
            features = get_features(newcode)
            distance = get_distance(target_features, features)
        except ValueError:
            newcode = None

        entry = {"count": i, "attempts": attempts}

        if newcode:
            entry["base_code"] = code
            entry["code"] = newcode
            entry["distance"] = distance
            entry["distance_diff"] = 1 - distance / best["distance"]
            entry["features"] = escape_features(features)
            entry["mutate_idx"] = mutate_idx
            entry["mutate_seed"] = mutate_seed
            code_history.append(code)
        else:
            print(" -> step back")
            # step back
            if len(code_history):
                code = code_history.pop()
            entry["step_back"] = code

        if distance < best["distance"]:
            print(" -> improvement {:.1f}%".format(entry["distance_diff"] * 100))
            best["distance"] = distance
            best["code"] = newcode
            best["features"] = escape_features(features)
            best["improvement_count"] += 1
        else:
            if newcode:
                print(" -> regression {:.1f}%".format(
                    entry["distance_diff"] * 100))

        entry["best"] = best

        add_to_log(log, entry, name="step")
        write_log(log, logpath)

        # doesn't have to be exactly zero but whatever
        if distance <= 0.001:
            print("found exact match!")
            break

    add_to_log(log, {
        "best_code": best['code'],
        "best_features": escape_features(best['features']),
        "best_distance": best['distance']
    }, name="end")
    write_log(log, logpath)
def test_exists():
    assert fs.exists(__file__)
    assert fs.exists("/")
    assert not fs.exists("/not/a/real/path (I hope!)")
#!/usr/bin/env python3.6
import sys

from phd.lib.labm8 import crypto
from phd.lib.labm8 import fs
from progressbar import ProgressBar

if __name__ == "__main__":
    inpath = sys.argv[1]
    outdir = sys.argv[2]
    print(f"reading from {inpath} into {outdir}")

    assert fs.isfile(inpath)
    assert not fs.exists(outdir) or fs.isdir(outdir)
    fs.mkdir(outdir)

    with open(inpath) as infile:
        text = infile.read()

    # Split the concatenated samples into individual kernels.
    kernels = text.split("// ==== START SAMPLE ====")
    kernels = [kernel.strip() for kernel in kernels if kernel.strip()]
    print(len(kernels), "kernels")

    # Write each kernel to a file named by its checksum.
    sha1s = [crypto.sha1_str(kernel) for kernel in kernels]
    for kernel, sha1 in ProgressBar()(list(zip(kernels, sha1s))):
        with open(f"{outdir}/{sha1}.txt", "w") as outfile:
            print(kernel, file=outfile)