def main():
    """Benchmark entry point: load a model spec, sample kernels with a
    fixed 3-buffer signature, and write the evaluation results to disk."""
    import sys

    log.init(verbose=True)

    # The model is described by a JSON spec given as the first CLI argument.
    spec_path = sys.argv[1]
    benchmark_model = model.from_json(clgen.load_json_file(spec_path))

    # Fixed kernel prototype (three float buffers plus a size), one big batch,
    # no checking of sampled kernels.
    sampler_spec = {
        "kernels": {
            "args": [
                "__global float*",
                "__global float*",
                "__global float*",
                "const int"
            ],
            "max_length": 5000,
            "temperature": 1
        },
        "sampler": {
            "batch_size": 1000,
            "max_batches": 1,
            "static_checker": False,
            "dynamic_checker": False
        }
    }
    kernel_sampler = sampler.from_json(sampler_spec)

    print("Corpus size:", benchmark_model.corpus.size)
    print("Vocab size: ", benchmark_model.corpus.vocab_size)
    print()
    clgen.platform_info()
    print()

    outpath = "./benchmark-" + fs.basename(spec_path)
    results = evaluate(benchmark_model, kernel_sampler)
    clgen.write_file(outpath, clgen.format_json(results))
def main():
    """CLI entry point: train a model, then run a mutation search toward
    a target source file, journaling progress to a log file."""
    from argparse import ArgumentParser

    arg_parser = ArgumentParser()
    arg_parser.add_argument("model", help="Path to model")
    arg_parser.add_argument("target", help="Path to target code")
    arg_parser.add_argument("-i", "--input", metavar="path", default=None,
                            help="Path to starting code")
    arg_parser.add_argument("-l", "--log", metavar="path",
                            default="search-log.json",
                            help="Path to log file")
    opts = arg_parser.parse_args()

    clgen_log.init(verbose=True)

    # Models may be shipped as tarballs or as JSON specs.
    modelpath = opts.model
    if modelpath.endswith(".tar.bz2"):
        m = model.from_tar(modelpath)
    else:
        m = clgen.model.from_json(clgen.load_json_file(modelpath))
    m.train()

    # Target code is mandatory; a starting seed program is optional.
    with open(opts.target) as infile:
        target_code = infile.read()

    start_code = None
    if opts.input:
        with open(opts.input) as infile:
            start_code = infile.read()

    search(m, target_code, opts.log, start_code=start_code)
def main():
    """Inference driver: for the 20 most common kernel prototypes in the
    GitHub corpus, sample kernels matching each prototype and write the
    evaluation results, skipping prototypes already evaluated."""
    log.init(verbose=True)

    model_path = sys.argv[1]
    m = model.from_json(clgen.load_json_file(model_path))
    github_corpus = corpus.Corpus.from_json({"path": "~/data/github"})

    print("CLgen: ", clgen.version())
    print("Corpus size:", github_corpus.size)
    print("Vocab size: ", github_corpus.vocab_size)

    m.train()

    prototypes, _ = corpus.most_common_prototypes(github_corpus, 20)
    for index, row in enumerate(prototypes):
        outpath = "./inference-p" + str(index + 1) + "-" + fs.basename(model_path)
        # One output file per prototype; skip the ones already produced.
        if fs.exists(outpath):
            continue

        _, prototype = row
        # Strip the trailing identifier from each comma-separated argument,
        # leaving only the type qualifiers.
        argspec = [' '.join(arg.split()[:-1]) for arg in prototype.split(',')]
        print("argspec", ','.join([str(a) for a in argspec]))

        proto_sampler = sampler.from_json({
            "kernels": {
                "args": argspec,
                "max_length": 5000
            },
            "sampler": {
                "batch_size": 2000,
                "max_batches": 1,
                "static_checker": False,
                "dynamic_checker": False
            }
        })
        results = evaluate(m, proto_sampler)
        clgen.write_file(outpath, clgen.format_json(results))
def main():
    """Inference driver (verbose variant): evaluate samples for the 20 most
    common corpus prototypes, announcing skipped and started outputs."""
    log.init(verbose=True)

    model_path = sys.argv[1]
    m = model.from_json(clgen.load_json_file(model_path))
    github_corpus = corpus.Corpus.from_json({"path": "~/data/github"})

    print("CLgen: ", clgen.version())
    print("Corpus size:", github_corpus.size)
    print("Vocab size: ", github_corpus.vocab_size)

    m.train()

    prototypes, _ = corpus.most_common_prototypes(github_corpus, 20)
    for index, row in enumerate(prototypes):
        outpath = "./inference-p" + str(index + 1) + "-" + fs.basename(model_path)
        # Resume-friendly: one output file per prototype.
        if fs.exists(outpath):
            print("skipped result for", outpath)
            continue
        print("starting result for", outpath)

        _, prototype = row
        # Keep only the type qualifiers of each comma-separated argument.
        argspec = [' '.join(arg.split()[:-1]) for arg in prototype.split(',')]
        print("argspec", ','.join([str(a) for a in argspec]))

        proto_sampler = sampler.from_json({
            "kernels": {
                "args": argspec,
                "max_length": 5000
            },
            "sampler": {
                "batch_size": 2000,
                "max_batches": 1,
                "static_checker": False,
                "dynamic_checker": False
            }
        })
        results = evaluate(m, proto_sampler)
        clgen.write_file(outpath, clgen.format_json(results))
def search(m, target_code, logpath, start_code=None):
    """Hill-climbing mutation search toward `target_code`.

    Repeatedly asks the model `m` for mutations of the current program and
    tracks the feature distance to the target.  Every step is journaled to
    the JSON file at `logpath`, so an interrupted search can be resumed by
    calling this again with the same log path.

    m: trained model used by get_mutation()/get_start_code().
    target_code: program whose features the search tries to match.
    logpath: JSON log file; read for resumption, rewritten after each step.
    start_code: optional seed program for a fresh search.
    """
    # resume search
    if fs.exists(logpath):
        log = clgen.load_json_file(logpath)
        print("resuming search of", len(get_steps(log)), "steps")
    else:
        log = []

    steps = get_steps(log)

    # Starting program: explicit seed (fresh runs only), else the last
    # logged step, else ask the model for one.
    if start_code and not len(steps):
        code = start_code
    elif len(steps):
        code = steps[-1]['data']['code']
    else:
        code = get_start_code(m)

    target_features = get_features(target_code)
    features = get_features(code)
    distance = get_distance(target_features, features)

    if get_entries(log, "init"):
        # Resumed run: check the existing log belongs to this target.
        # NOTE(review): asserts are stripped under `python -O`.
        init = get_entries(log, "init")[0]
        assert (init['data']['target_code'] == target_code)
        assert (init['data']['target_features'] == escape_features(
            target_features))

        # load history from log
        code_history = get_code_history(log)
    else:
        # create init entry
        add_to_log(log, {
            "start_code": code,
            "start_features": escape_features(features),
            "target_features": escape_features(target_features),
            "target_code": target_code,
            "distance": distance,
            "model": m.meta
        }, name="init")
        write_log(log, logpath)
        code_history = [code]

    # keep track of best
    if len(steps):
        best = steps[-1]['data']['best']
    else:
        best = {"distance": distance, "code": code, "improvement_count": 0}

    # maximum number of mutations before stopping search
    MAX_STEPS = 1000

    for i in range(len(steps), MAX_STEPS):
        print("step", i, "of", MAX_STEPS)

        # Candidate mutation of the current program.
        newcode, mutate_idx, mutate_seed, attempts = get_mutation(m, code)
        try:
            features = get_features(newcode)
            distance = get_distance(target_features, features)
        except ValueError:
            # Candidate has no extractable features; reject it.
            # NOTE(review): `features` and `distance` keep their previous
            # values here, so the improvement test below runs on stale data
            # and can KeyError on entry["distance_diff"] — confirm intended.
            newcode = None

        entry = {"count": i, "attempts": attempts}

        if newcode:
            # Fraction of improvement relative to the best distance so far
            # (positive = closer to the target).
            entry["base_code"] = code
            entry["code"] = newcode
            entry["distance"] = distance
            entry["distance_diff"] = 1 - distance / best["distance"]
            entry["features"] = escape_features(features)
            entry["mutate_idx"] = mutate_idx
            entry["mutate_seed"] = mutate_seed
            code_history.append(code)
            # NOTE(review): `code` is never advanced to `newcode`, so the
            # next mutation starts from the same base — confirm intended.
        else:
            print(" -> step back")
            # step back
            if len(code_history):
                code = code_history.pop()
            entry["step_back"] = code

        if distance < best["distance"]:
            print(" -> improvement {:.1f}%".format(entry["distance_diff"] * 100))
            best["distance"] = distance
            best["code"] = newcode
            best["features"] = escape_features(features)
            best["improvement_count"] += 1
        else:
            if newcode:
                print(" -> regression {:.1f}%".format(
                    entry["distance_diff"] * 100))

        entry["best"] = best
        add_to_log(log, entry, name="step")
        write_log(log, logpath)

        # doesn't have to be exactly zero but whatever
        if distance <= 0.001:
            print("found exact match!")
            break

    add_to_log(log, {
        "best_code": best['code'],
        "best_features": escape_features(best['features']),
        "best_distance": best['distance']
    }, name="end")
    write_log(log, logpath)
def search(m, target_code, logpath, start_code=None):
    """Mutation search driving a program's features toward `target_code`.

    Uses model `m` to propose mutations, scores each by feature distance to
    the target, and journals every step to `logpath` so that a crashed or
    stopped search can resume from where it left off.

    m: trained model backing get_mutation()/get_start_code().
    target_code: target program whose features are being matched.
    logpath: JSON log file (read to resume, rewritten after each step).
    start_code: optional initial program for a brand-new search.
    """
    # resume search
    if fs.exists(logpath):
        log = clgen.load_json_file(logpath)
        print("resuming search of", len(get_steps(log)), "steps")
    else:
        log = []

    steps = get_steps(log)

    # Choose the starting program: caller-provided seed (only when there is
    # no prior progress), else the last logged step, else model-generated.
    if start_code and not len(steps):
        code = start_code
    elif len(steps):
        code = steps[-1]['data']['code']
    else:
        code = get_start_code(m)

    target_features = get_features(target_code)
    features = get_features(code)
    distance = get_distance(target_features, features)

    if get_entries(log, "init"):
        # Resuming: verify the log matches this target before continuing.
        # NOTE(review): these asserts vanish under `python -O`.
        init = get_entries(log, "init")[0]
        assert(init['data']['target_code'] == target_code)
        assert(init['data']['target_features'] == escape_features(target_features))

        # load history from log
        code_history = get_code_history(log)
    else:
        # create init entry
        add_to_log(log, {
            "start_code": code,
            "start_features": escape_features(features),
            "target_features": escape_features(target_features),
            "target_code": target_code,
            "distance": distance,
            "model": m.meta
        }, name="init")
        write_log(log, logpath)
        code_history = [code]

    # keep track of best
    if len(steps):
        best = steps[-1]['data']['best']
    else:
        best = {
            "distance": distance,
            "code": code,
            "improvement_count": 0
        }

    # maximum number of mutations before stopping search
    MAX_STEPS = 1000

    for i in range(len(steps), MAX_STEPS):
        print("step", i, "of", MAX_STEPS)

        # Ask the model for a mutated candidate of the current program.
        newcode, mutate_idx, mutate_seed, attempts = get_mutation(m, code)
        try:
            features = get_features(newcode)
            distance = get_distance(target_features, features)
        except ValueError:
            # Feature extraction failed; treat candidate as rejected.
            # NOTE(review): `features`/`distance` retain the previous
            # iteration's values here, so the improvement check below can
            # act on stale data and KeyError on entry["distance_diff"] —
            # confirm intended.
            newcode = None

        entry = {
            "count": i,
            "attempts": attempts
        }

        if newcode:
            # distance_diff > 0 means the candidate is closer to the target
            # than the best seen so far.
            entry["base_code"] = code
            entry["code"] = newcode
            entry["distance"] = distance
            entry["distance_diff"] = 1 - distance / best["distance"]
            entry["features"] = escape_features(features)
            entry["mutate_idx"] = mutate_idx
            entry["mutate_seed"] = mutate_seed
            code_history.append(code)
            # NOTE(review): `code` is not updated to `newcode` on success,
            # so mutations keep starting from the same base — confirm
            # intended.
        else:
            print(" -> step back")
            # step back
            if len(code_history):
                code = code_history.pop()
            entry["step_back"] = code

        if distance < best["distance"]:
            print(" -> improvement {:.1f}%".format(
                entry["distance_diff"] * 100))
            best["distance"] = distance
            best["code"] = newcode
            best["features"] = escape_features(features)
            best["improvement_count"] += 1
        else:
            if newcode:
                print(" -> regression {:.1f}%".format(
                    entry["distance_diff"] * 100))

        entry["best"] = best
        add_to_log(log, entry, name="step")
        write_log(log, logpath)

        # doesn't have to be exactly zero but whatever
        if distance <= 0.001:
            print("found exact match!")
            break

    add_to_log(log, {
        "best_code": best['code'],
        "best_features": escape_features(best['features']),
        "best_distance": best['distance']
    }, name="end")
    write_log(log, logpath)