def test_mv(self):
    system.echo("Hello, world!", "/tmp/labm8.tmp")
    self._test(["Hello, world!"], fs.read("/tmp/labm8.tmp"))
    # Cleanup any existing file.
    fs.rm("/tmp/labm8.tmp.copy")
    self._test(False, fs.exists("/tmp/labm8.tmp.copy"))
    fs.mv("/tmp/labm8.tmp", "/tmp/labm8.tmp.copy")
    self.assertEqual(["Hello, world!"], fs.read("/tmp/labm8.tmp.copy"))
    self._test(False, fs.exists("/tmp/labm8.tmp"))
def test_mv():
    system.echo("Hello, world!", "/tmp/labm8.tmp")
    assert ["Hello, world!"] == fs.read("/tmp/labm8.tmp")
    # Cleanup any existing file.
    fs.rm("/tmp/labm8.tmp.copy")
    assert not fs.exists("/tmp/labm8.tmp.copy")
    fs.mv("/tmp/labm8.tmp", "/tmp/labm8.tmp.copy")
    assert ["Hello, world!"] == fs.read("/tmp/labm8.tmp.copy")
    assert not fs.exists("/tmp/labm8.tmp")
def test_cli():
    fs.rm("kernels.db")
    cli.main("db init kernels.db".split())
    assert fs.exists("kernels.db")

    corpus_path = tests.archive("tiny", "corpus")
    cli.main("db explore kernels.db".split())
    cli.main(f"fetch fs kernels.db {corpus_path}".split())
    cli.main("preprocess kernels.db".split())
    cli.main("db explore kernels.db".split())

    fs.rm("kernels_out")
    cli.main("db dump kernels.db -d kernels_out".split())
    assert fs.isdir("kernels_out")
    assert len(fs.ls("kernels_out")) >= 1

    fs.rm("kernels.cl")
    cli.main("db dump kernels.db kernels.cl --file-sep --eof --reverse".split())
    assert fs.isfile("kernels.cl")

    fs.rm("kernels_out")
    cli.main("db dump kernels.db --input-samples -d kernels_out".split())
    assert fs.isdir("kernels_out")
    assert len(fs.ls("kernels_out")) == 250

    fs.rm("kernels.db")
    fs.rm("kernels_out")
def __init__(self, path, basecache=None):
    """
    Create a new JSON cache.

    Optionally supports populating the cache with values of an
    existing cache.

    Arguments:
        basecache (TransientCache, optional): Cache to populate this
            new cache with.
    """
    super(JsonCache, self).__init__()
    self.path = fs.abspath(path)

    if fs.exists(self.path):
        io.debug("Loading cache '{0}'".format(self.path))
        with open(self.path) as file:
            self._data = json.load(file)

    if basecache is not None:
        for key, val in basecache.items():
            self._data[key] = val

    # Register exit handler
    atexit.register(self.write)
def main():
    log.init(verbose=True)

    m = model.from_json(clgen.load_json_file(sys.argv[1]))
    c = corpus.Corpus.from_json({"path": "~/data/github"})

    print("CLgen: ", clgen.version())
    print("Corpus size:", c.size)
    print("Vocab size: ", c.vocab_size)

    m.train()

    p, _ = corpus.most_common_prototypes(c, 20)
    for i, row in enumerate(p):
        outpath = "./inference-p" + str(i + 1) + "-" + fs.basename(sys.argv[1])
        if fs.exists(outpath):
            continue

        _, prototype = row
        argspec = [' '.join(x.split()[:-1]) for x in prototype.split(',')]
        print("argspec", ','.join([str(x) for x in argspec]))
        s = sampler.from_json({
            "kernels": {
                "args": argspec,
                "max_length": 5000
            },
            "sampler": {
                "batch_size": 2000,
                "max_batches": 1,
                "static_checker": False,
                "dynamic_checker": False
            }
        })
        info = evaluate(m, s)
        clgen.write_file(outpath, clgen.format_json(info))
def __init__(self, path, basecache=None):
    """
    Create a new JSON cache.

    Optionally supports populating the cache with values of an
    existing cache.

    Arguments:
        basecache (TransientCache, optional): Cache to populate this
            new cache with.
    """
    super(JsonCache, self).__init__()
    self.path = fs.abspath(path)

    if fs.exists(self.path) and fs.read_file(self.path):
        io.debug("Loading cache '{0}'".format(self.path))
        with open(self.path) as file:
            self._data = json.load(file)

    if basecache is not None:
        for key, val in basecache.items():
            self._data[key] = val

    # Register exit handler
    atexit.register(self.write)
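# A minimal usage sketch for JsonCache (illustrative, not from the source):
# it assumes the mapping interface inherited from TransientCache, and the
# cache file path is hypothetical.
base = TransientCache()
base["vocab_size"] = 203
cache = JsonCache("/tmp/example-cache.json", basecache=base)
assert cache["vocab_size"] == 203  # values copied in from the base cache
# write() runs automatically at interpreter exit via the atexit handler above.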
def train_and_save(model_desc, platform, source,
                   atomizer="CharacterAtomizer", maxlen=1024,
                   n_splits=10, split_i=0, seed=204):
    np.random.seed(seed)

    name = model_desc["name"]
    outpath = ("models/{name}/{platform}-{source}-{atomizer}:{maxlen}-"
               "{seed}-{n_splits}-{split_i}.model".format(**vars()))
    if not fs.exists(outpath):
        create_fn = model_desc.get("create_model", _nop)
        train_fn = model_desc.get("train_fn", _nop)
        save_fn = model_desc["save_fn"]
        _atomizer = globals().get(atomizer)

        # load training data
        data_desc = load_data_desc(platform=platform, source=source,
                                   max_seq_len=maxlen, atomizer=_atomizer)
        train, test = get_training_data(data_desc, seed=seed,
                                        split_i=split_i, n_splits=n_splits)

        # create model
        model = create_fn(seed=seed, data_desc=data_desc)

        # train model
        train_fn(model=model, train=train, seed=seed,
                 platform=platform, source=source)

        fs.mkdir("models/{name}".format(**vars()))
        save_fn(outpath, model)
        print("model saved as", outpath)

    # evaluate model
    return load_and_test(model_desc, platform, source, n_splits=n_splits,
                         split_i=split_i, atomizer=atomizer, maxlen=maxlen,
                         seed=seed)
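# Hypothetical model_desc sketch, illustrating the keys train_and_save()
# above reads: "name" and "save_fn" are required, "create_model" and
# "train_fn" fall back to _nop, and load_and_test() (called at the end)
# also needs "test_fn" and "load_fn". The helper functions and the
# platform/source values named here are assumptions, not from the source.
model_desc = {
    "name": "lstm-baseline",
    "create_model": build_lstm,  # called as create_fn(seed=..., data_desc=...)
    "train_fn": train_lstm,      # called with model=, train=, seed=, platform=, source=
    "save_fn": save_lstm,        # called as save_fn(outpath, model)
    "test_fn": test_lstm,        # called with model=, test=, seed=
    "load_fn": load_lstm,        # called as load_fn(inpath)
}
result = train_and_save(model_desc, platform="amd", source="github")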
def main():
    parser = ArgumentParser(description=__description__)
    parser.add_argument("classification")
    parser.add_argument("outdir")
    args = parser.parse_args()

    db.init("cc1")
    session = db.make_session()

    program_ids = [
        x[0] for x in session.query(sql.distinct(CLSmithResult.program_id))
        .filter(CLSmithResult.classification == args.classification).all()
    ]

    header = fs.read_file(dsmith.data_path("include", "clsmith.h"))

    fs.mkdir(args.outdir)

    for program_id in ProgressBar()(program_ids):
        outpath = fs.path(args.outdir, program_id + ".cl")

        if not fs.exists(outpath):
            program = session.query(CLSmithProgram) \
                .filter(CLSmithProgram.id == program_id).one()

            pre, post = program.src.split('#include "CLSmith.h"')
            inlined = pre + header + post

            with open(outpath, "w") as outfile:
                print(inlined, file=outfile)
def _find_weka():
    """
    Look for a Weka installation in the system $PATH or in /Applications.
    Returns None if not found.
    """
    mac_path = '/Applications/Weka.app'
    linux_path = system.which('weka')
    return mac_path if fs.exists(mac_path) else linux_path
def _init_error(err: Exception, files_to_rm: List[str] = []) -> None:
    """ tidy up in case of error """
    log.error("corpus creation failed. Deleting corpus files")
    for path in files_to_rm:
        if fs.exists(path):
            log.info("removing", path)
            fs.rm(path)
    raise err
def assert_program_exists(path):
    """
    Assert that a program exists.

    If the given path does not exist or is not a file, raises
    ProgramNotFoundError.
    """
    if not fs.exists(path) or not fs.isfile(path):
        raise ProgramNotFoundError(path)
def test_create_db_gh():
    db_path = tests.data_path("db", "tmp.db", exists=False)
    fs.rm(db_path)
    dbutil.create_db(db_path, github=True)
    assert fs.exists(db_path)
    with pytest.raises(clgen.UserError):
        dbutil.create_db(db_path, github=True)
def test_cp_dir():
    fs.rm("/tmp/labm8")
    fs.rm("/tmp/labm8.copy")
    fs.mkdir("/tmp/labm8/foo/bar")
    assert not fs.exists("/tmp/labm8.copy")
    fs.cp("/tmp/labm8/", "/tmp/labm8.copy")
    assert fs.isdir("/tmp/labm8.copy")
    assert fs.isdir("/tmp/labm8.copy/foo")
    assert fs.isdir("/tmp/labm8.copy/foo/bar")
def test_cp_dir(self):
    fs.rm("/tmp/labm8")
    fs.rm("/tmp/labm8.copy")
    fs.mkdir("/tmp/labm8/foo/bar")
    self._test(False, fs.exists("/tmp/labm8.copy"))
    fs.cp("/tmp/labm8/", "/tmp/labm8.copy")
    self._test(True, fs.isdir("/tmp/labm8.copy"))
    self._test(True, fs.isdir("/tmp/labm8.copy/foo"))
    self._test(True, fs.isdir("/tmp/labm8.copy/foo/bar"))
def load_and_test(model_desc, platform, source,
                  atomizer="CharacterAtomizer", maxlen=1024,
                  n_splits=10, split_i=0, seed=204):
    np.random.seed(seed)

    name = model_desc["name"]
    inpath = ("models/{name}/{platform}-{source}-{atomizer}:{maxlen}-"
              "{seed}-{n_splits}-{split_i}.model".format(**vars()))
    outpath = ("models/{name}/{platform}-{source}-{atomizer}:{maxlen}-"
               "{seed}-{n_splits}-{split_i}.result".format(**vars()))

    if fs.exists(outpath):
        return load_result(model_desc, platform, source, n_splits=n_splits,
                           split_i=split_i, atomizer=atomizer, maxlen=maxlen,
                           seed=seed)
    if not fs.exists(inpath):
        return False

    test_fn = model_desc["test_fn"]
    load_fn = model_desc["load_fn"]

    # load training data
    _atomizer = globals().get(atomizer)
    data_desc = load_data_desc(platform=platform, source=source,
                               max_seq_len=maxlen, atomizer=_atomizer,
                               quiet=True)
    train, test = get_training_data(data_desc, seed=seed, split_i=split_i,
                                    n_splits=n_splits)

    # load model
    model = load_fn(inpath)
    print("model loaded from", inpath)

    # test model
    predictions = test_fn(model=model, test=test, seed=seed)
    analysis = analyze(predictions, test)

    test.update(analysis)
    test["predictions"] = predictions

    with open(outpath, 'wb') as outfile:
        pickle.dump(test, outfile)
    print("result saved to", outpath)

    return test
def test_cp_overwrite():
    system.echo("Hello, world!", "/tmp/labm8.tmp")
    assert ["Hello, world!"] == fs.read("/tmp/labm8.tmp")
    # Cleanup any existing file.
    fs.rm("/tmp/labm8.tmp.copy")
    assert not fs.exists("/tmp/labm8.tmp.copy")
    fs.cp("/tmp/labm8.tmp", "/tmp/labm8.tmp.copy")
    system.echo("Goodbye, world!", "/tmp/labm8.tmp")
    fs.cp("/tmp/labm8.tmp", "/tmp/labm8.tmp.copy")
    assert fs.read("/tmp/labm8.tmp") == fs.read("/tmp/labm8.tmp.copy")
def test_cp_overwrite(self):
    system.echo("Hello, world!", "/tmp/labm8.tmp")
    self._test(["Hello, world!"], fs.read("/tmp/labm8.tmp"))
    # Cleanup any existing file.
    fs.rm("/tmp/labm8.tmp.copy")
    self._test(False, fs.exists("/tmp/labm8.tmp.copy"))
    fs.cp("/tmp/labm8.tmp", "/tmp/labm8.tmp.copy")
    system.echo("Goodbye, world!", "/tmp/labm8.tmp")
    fs.cp("/tmp/labm8.tmp", "/tmp/labm8.tmp.copy")
    self._test(fs.read("/tmp/labm8.tmp"), fs.read("/tmp/labm8.tmp.copy"))
def test_scp_user():
    system.echo("Hello, world!", "/tmp/labm8.tmp")
    assert ["Hello, world!"] == fs.read("/tmp/labm8.tmp")
    # Cleanup any existing file.
    fs.rm("/tmp/labm8.tmp.copy")
    assert not fs.exists("/tmp/labm8.tmp.copy")
    # Perform scp.
    system.scp("localhost", "/tmp/labm8.tmp", "/tmp/labm8.tmp.copy",
               path="labm8/data/test/bin", user="******")
    assert fs.read("/tmp/labm8.tmp") == fs.read("/tmp/labm8.tmp.copy")
def test_scp_user(self):
    system.echo("Hello, world!", "/tmp/labm8.tmp")
    self._test(["Hello, world!"], fs.read("/tmp/labm8.tmp"))
    # Cleanup any existing file.
    fs.rm("/tmp/labm8.tmp.copy")
    self._test(False, fs.exists("/tmp/labm8.tmp.copy"))
    # Perform scp.
    system.scp("localhost", "/tmp/labm8.tmp", "/tmp/labm8.tmp.copy",
               path="tests/bin", user="******")
    self._test(fs.read("/tmp/labm8.tmp"), fs.read("/tmp/labm8.tmp.copy"))
def __contains__(self, key):
    """
    Check cache contents.

    Arguments:
        key: Key.

    Returns:
        bool: True if key is in cache, else False.
    """
    path = self.keypath(key)
    return fs.exists(path)
def load_result(model_desc, platform, source,
                atomizer="CharacterAtomizer", maxlen=1024,
                n_splits=10, split_i=0, seed=204):
    name = model_desc["name"]
    inpath = ("models/{name}/{platform}-{source}-{atomizer}:{maxlen}-"
              "{seed}-{n_splits}-{split_i}.result".format(**vars()))

    if not fs.exists(inpath):
        return False

    with open(inpath, 'rb') as infile:
        result = pickle.load(infile)

    return result
def _init_error(err: Exception) -> None:
    """ tidy up in case of error """
    # NOTE: `self` is not a parameter here; this helper is evidently
    # defined inside a method and captures `self` from the enclosing scope.
    log.error("corpus creation failed. Deleting corpus files")
    paths = [
        fs.path(self.contentcache.path, "kernels.db"),
        fs.path(self.cache.path, "corpus.txt"),
        fs.path(self.cache.path, "tensor.npy"),
        fs.path(self.cache.path, "atomizer.pkl")
    ]
    for path in paths:
        if fs.exists(path):
            log.info("removing", path)
            fs.rm(path)
    raise err
def __delitem__(self, key):
    """
    Delete cached file.

    Arguments:
        key: Key.

    Raises:
        KeyError: If file not in cache.
    """
    path = self.keypath(key)
    if fs.exists(path):
        fs.rm(path)
    else:
        raise KeyError(key)
def test_LockFile_force_replace_stale():
    """Test that lockfile is replaced if forced."""
    with tempfile.TemporaryDirectory() as d:
        path = pathlib.Path(d) / 'LOCK'
        lock = lockfile.LockFile(path)
        MAX_PROCESSES = 4194303  # OS-dependent. This value is for Linux.
        lock.acquire(pid=MAX_PROCESSES + 1)
        assert lock.islocked
        assert not lock.owned_by_self
        with pytest.raises(lockfile.UnableToAcquireLockError):
            lock.acquire()
        lock.acquire(force=True)
        assert lock.islocked
        assert lock.owned_by_self
        lock.release()
        assert not fs.exists(lock.path)
def _incache(self, path: str) -> str:
    """
    Assert that file is in cache.

    Arguments:
        path (str): File path.

    Returns:
        str: File path.

    Raises:
        Cache404: If file does not exist.
    """
    if not fs.exists(path):
        raise Cache404("file '{path}' not found".format(path=path))
    return path
def to_dist(self, distpath: str, author: str = None) -> str:
    """
    Create a dist file.

    Arguments:
        distpath (str): Path to dist file.
        author (str, optional): Author name.

    Returns:
        str: Path to generated distfile.
    """
    outpath = fs.abspath(distpath) + ".tar.bz2"
    if fs.exists(outpath):
        raise DistError("file {} exists".format(outpath))

    meta = self.meta
    if author is not None:
        meta["author"] = author
    log.debug(clgen.format_json(meta))

    # Guard against the cleanup handler referencing unbound names if
    # tarfile.open() or mktemp() themselves raise.
    tar, metapath = None, None
    try:
        tar = tarfile.open(outpath, 'w:bz2')

        # write meta
        metapath = mktemp(prefix="clgen-", suffix=".json")
        clgen.write_file(metapath, clgen.format_json(meta))
        log.debug("metafile:", metapath)

        # create tarball
        tar.add(metapath, arcname="meta.json")

        # pack contents:
        for path in meta["contents"]:
            abspath = fs.path(cache.ROOT, path)
            log.verbose("packing", abspath)
            tar.add(abspath, arcname=fs.path("contents", path))

        # tidy up
        fs.rm(metapath)
        tar.close()
    except Exception as e:
        if tar is not None:
            tar.close()
        if metapath is not None:
            fs.rm(metapath)
        fs.rm(outpath)
        raise e

    return outpath
def __setitem__(self, key, value):
    """
    Emplace file in cache.

    Arguments:
        key: Key.
        value (str): Path of file to insert in cache.

    Raises:
        ValueError: If the file at "value" does not exist.
    """
    if not fs.exists(value):
        raise ValueError(value)

    path = self.keypath(key)
    fs.mkdir(self.path)
    fs.mv(value, path)
def __getitem__(self, key):
    """
    Get path to file in cache.

    Arguments:
        key: Key.

    Returns:
        str: Path to cache value.

    Raises:
        KeyError: If key not in cache.
    """
    path = self.keypath(key)
    if fs.exists(path):
        return path
    else:
        raise KeyError(key)
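# Putting the file-cache mapping methods above together (__contains__,
# __delitem__, __setitem__, __getitem__). A minimal sketch: the `cache`
# object and the file path are illustrative only.
cache["weights"] = "/tmp/weights.pkl"  # __setitem__ moves the file into the cache
if "weights" in cache:                 # __contains__ checks the keypath on disk
    path = cache["weights"]            # __getitem__ returns the cached file's path
del cache["weights"]                   # __delitem__ removes the cached file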
def read(path):
    """
    Read the contents of a LockFile.

    Arguments:
        path (str): Path to lockfile.

    Returns:
        Tuple(int, datetime): The integer PID of the lock owner, and the
            date the lock was acquired. If the lock is not claimed, both
            values are None.
    """
    if fs.exists(path):
        with open(path) as infile:
            components = infile.read().split()
            pid = int(components[0])
            date = datetime.date.fromtimestamp(float(components[1]))
        return pid, date
    else:
        return None, None
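# Example of inspecting a lockfile with read() above; the path is
# illustrative.
pid, date = read("/tmp/myproject.LOCK")
if pid is None:
    print("lock is unclaimed")
else:
    print("locked by process", pid, "since", date)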
def benchmark_inference(model_desc, platform, source,
                        atomizer="CharacterAtomizer", maxlen=1024,
                        n_splits=10, split_i=0, seed=204, n_runtimes=100):
    np.random.seed(seed)

    name = model_desc["name"]
    inpath = ("models/{name}/{platform}-{source}-{atomizer}:{maxlen}-"
              "{seed}-{n_splits}-{split_i}.model".format(**vars()))
    outpath = ("models/{name}/{platform}-{source}-{atomizer}:{maxlen}-"
               "{seed}-{n_splits}-{split_i}.result".format(**vars()))

    if not fs.exists(inpath):
        return False

    test_fn = model_desc["test_fn"]
    load_fn = model_desc["load_fn"]

    # load training data
    _atomizer = globals().get(atomizer)
    data_desc = load_data_desc(platform=platform, source=source,
                               max_seq_len=maxlen, atomizer=_atomizer,
                               quiet=True)
    train, test = get_training_data(data_desc, seed=seed, split_i=split_i,
                                    n_splits=n_splits)

    # load model
    model = load_fn(inpath)
    print("model loaded from", inpath)

    # test model
    runtimes = []
    for i in range(n_runtimes):
        start = time.time()
        predictions = test_fn(model=model, test=test, seed=seed)
        elapsed = (time.time() - start) / len(test["y"])
        runtimes.append(elapsed)

    return np.array(runtimes)
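# Illustrative call: model_desc follows the same shape sketched after
# train_and_save() above, and the platform/source values are assumptions.
# benchmark_inference() returns False when no trained model file exists.
runtimes = benchmark_inference(model_desc, platform="amd", source="github")
if runtimes is not False:
    print("mean per-sample inference time: {:.6f}s".format(runtimes.mean()))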
def main():
    log.init(verbose=True)

    m = model.from_json(clgen.load_json_file(sys.argv[1]))
    c = corpus.Corpus.from_json({"path": "~/data/github"})

    print("CLgen: ", clgen.version())
    print("Corpus size:", c.size)
    print("Vocab size: ", c.vocab_size)

    m.train()

    p, _ = corpus.most_common_prototypes(c, 20)
    for i, row in enumerate(p):
        outpath = "./inference-p" + str(i + 1) + "-" + fs.basename(sys.argv[1])
        if fs.exists(outpath):
            print("skipped result for", outpath)
            continue
        else:
            print("starting result for", outpath)

        _, prototype = row
        argspec = [' '.join(x.split()[:-1]) for x in prototype.split(',')]
        print("argspec", ','.join([str(x) for x in argspec]))
        s = sampler.from_json({
            "kernels": {
                "args": argspec,
                "max_length": 5000
            },
            "sampler": {
                "batch_size": 2000,
                "max_batches": 1,
                "static_checker": False,
                "dynamic_checker": False
            }
        })
        info = evaluate(m, s)
        clgen.write_file(outpath, clgen.format_json(info))
def search(m, target_code, logpath, start_code=None):
    # resume search
    if fs.exists(logpath):
        log = clgen.load_json_file(logpath)
        print("resuming search of", len(get_steps(log)), "steps")
    else:
        log = []

    steps = get_steps(log)

    if start_code and not len(steps):
        code = start_code
    elif len(steps):
        code = steps[-1]['data']['code']
    else:
        code = get_start_code(m)

    target_features = get_features(target_code)
    features = get_features(code)
    distance = get_distance(target_features, features)

    if get_entries(log, "init"):
        init = get_entries(log, "init")[0]
        assert init['data']['target_code'] == target_code
        assert init['data']['target_features'] == escape_features(target_features)
        # load history from log
        code_history = get_code_history(log)
    else:
        # create init entry
        add_to_log(log, {
            "start_code": code,
            "start_features": escape_features(features),
            "target_features": escape_features(target_features),
            "target_code": target_code,
            "distance": distance,
            "model": m.meta
        }, name="init")
        write_log(log, logpath)
        code_history = [code]

    # keep track of best
    if len(steps):
        best = steps[-1]['data']['best']
    else:
        best = {
            "distance": distance,
            "code": code,
            "improvement_count": 0
        }

    # maximum number of mutations before stopping search
    MAX_STEPS = 1000
    for i in range(len(steps), MAX_STEPS):
        print("step", i, "of", MAX_STEPS)
        newcode, mutate_idx, mutate_seed, attempts = get_mutation(m, code)
        try:
            features = get_features(newcode)
            distance = get_distance(target_features, features)
        except ValueError:
            newcode = None

        entry = {
            "count": i,
            "attempts": attempts
        }

        if newcode:
            entry["base_code"] = code
            entry["code"] = newcode
            entry["distance"] = distance
            entry["distance_diff"] = 1 - distance / best["distance"]
            entry["features"] = escape_features(features)
            entry["mutate_idx"] = mutate_idx
            entry["mutate_seed"] = mutate_seed
            code_history.append(code)
        else:
            print(" -> step back")
            # step back
            if len(code_history):
                code = code_history.pop()
            entry["step_back"] = code

        if distance < best["distance"]:
            print(" -> improvement {:.1f}%".format(entry["distance_diff"] * 100))
            best["distance"] = distance
            best["code"] = newcode
            best["features"] = escape_features(features)
            best["improvement_count"] += 1
        else:
            if newcode:
                print(" -> regression {:.1f}%".format(entry["distance_diff"] * 100))

        entry["best"] = best

        add_to_log(log, entry, name="step")
        write_log(log, logpath)

        # doesn't have to be exactly zero but whatever
        if distance <= 0.001:
            print("found exact match!")
            break

    add_to_log(log, {
        "best_code": best['code'],
        "best_features": escape_features(best['features']),
        "best_distance": best['distance']
    }, name="end")
    write_log(log, logpath)
def evaluate(model, embeddings, folder_data, samples_per_class,
             folder_results, dense_layer_size, print_summary, num_epochs,
             batch_size):
    # Set seed for reproducibility
    seed = 204

    ############################################################################
    # Get data
    vsamples_per_class = FLAGS.vsamples

    # Data acquisition
    num_classes = 104
    y_train = np.empty(0)  # training
    X_train = list()
    folder_data_train = folder_data + '_train'
    y_val = np.empty(0)  # validation
    X_val = list()
    folder_data_val = folder_data + '_val'
    y_test = np.empty(0)  # testing
    X_test = list()
    folder_data_test = folder_data + '_test'

    print('Getting file names for', num_classes, 'classes from folders:')
    print(folder_data_train)
    print(folder_data_val)
    print(folder_data_test)

    for i in range(1, num_classes + 1):  # loop over classes

        # training: Read data file names
        folder = os.path.join(folder_data_train, str(i))
        assert os.path.exists(folder), "Folder: " + folder + ' does not exist'
        print('\ttraining : Read file names from folder ', folder)
        listing = os.listdir(folder + '/')
        seq_files = [os.path.join(folder, f) for f in listing if f[-4:] == '.rec']

        # training: Randomly pick programs
        assert len(seq_files) >= samples_per_class, (
            "Cannot sample " + str(samples_per_class) + " from " +
            str(len(seq_files)) + " files found in " + folder)
        X_train += resample(seq_files, replace=False,
                            n_samples=samples_per_class, random_state=seed)
        y_train = np.concatenate(
            [y_train, np.array([int(i)] * samples_per_class, dtype=np.int32)])

        # validation: Read data file names
        folder = os.path.join(folder_data_val, str(i))
        assert os.path.exists(folder), "Folder: " + folder + ' does not exist'
        print('\tvalidation: Read file names from folder ', folder)
        listing = os.listdir(folder + '/')
        seq_files = [os.path.join(folder, f) for f in listing if f[-4:] == '.rec']

        # validation: Randomly pick programs
        if vsamples_per_class > 0:
            assert len(seq_files) >= vsamples_per_class, (
                "Cannot sample " + str(vsamples_per_class) + " from " +
                str(len(seq_files)) + " files found in " + folder)
            X_val += resample(seq_files, replace=False,
                              n_samples=vsamples_per_class, random_state=seed)
            y_val = np.concatenate(
                [y_val, np.array([int(i)] * vsamples_per_class, dtype=np.int32)])
        else:
            assert len(seq_files) > 0, "No .rec files found in " + folder
            X_val += seq_files
            y_val = np.concatenate(
                [y_val, np.array([int(i)] * len(seq_files), dtype=np.int32)])

        # test: Read data file names
        folder = os.path.join(folder_data_test, str(i))
        assert os.path.exists(folder), "Folder: " + folder + ' does not exist'
        print('\ttest : Read file names from folder ', folder)
        listing = os.listdir(folder + '/')
        seq_files = [os.path.join(folder, f) for f in listing if f[-4:] == '.rec']
        assert len(seq_files) > 0, "No .rec files found in " + folder
        X_test += seq_files
        y_test = np.concatenate(
            [y_test, np.array([int(i)] * len(seq_files), dtype=np.int32)])

    # Load dictionary and cutoff statements
    folder_vocabulary = FLAGS.vocabulary_dir
    dictionary_pickle = os.path.join(folder_vocabulary, 'dic_pickle')
    print('\tLoading dictionary from file', dictionary_pickle)
    with open(dictionary_pickle, 'rb') as f:
        dictionary = pickle.load(f)
    unk_index = dictionary[rgx.unknown_token]
    del dictionary

    # Encode source codes and get max. sequence length
    X_seq_train, maxlen_train = encode_srcs(X_train, 'training', unk_index)
    X_seq_val, maxlen_val = encode_srcs(X_val, 'validation', unk_index)
    X_seq_test, maxlen_test = encode_srcs(X_test, 'testing', unk_index)
    maxlen = max(maxlen_train, maxlen_test, maxlen_val)
    print('Max. sequence length overall:', maxlen)
    print('Padding sequences')
    X_seq_train = pad_src(X_seq_train, maxlen, unk_index)
    X_seq_val = pad_src(X_seq_val, maxlen, unk_index)
    X_seq_test = pad_src(X_seq_test, maxlen, unk_index)

    # Get one-hot vectors for classification
    print('YTRAIN\n', y_train)
    y_1hot_train = get_onehot(y_train, num_classes)
    y_1hot_val = get_onehot(y_val, num_classes)

    ############################################################################
    # Setup paths

    # Set up names and paths
    model_name = model.__name__
    model_path = os.path.join(
        folder_results, "classifyapp/models/{}.model".format(model_name))
    predictions_path = os.path.join(
        folder_results, "classifyapp/predictions/{}.result".format(model_name))

    # If predictions have already been made with these embeddings, load them
    if fs.exists(predictions_path):
        print("\tFound predictions in", predictions_path, ", skipping...")
        with open(predictions_path, 'rb') as infile:
            p = pickle.load(infile)

    else:  # could not find predictions already computed with these embeddings

        # Embeddings
        import tensorflow as tf  # for embeddings lookup
        embedding_matrix_normalized = tf.nn.l2_normalize(embeddings, axis=1)
        vocabulary_size, embedding_dimension = embedding_matrix_normalized.shape
        print('XSEQ:\n', X_seq_train)
        print('EMB:\n', embedding_matrix_normalized)

        gen_test = EmbeddingPredictionSequence(
            batch_size, X_seq_test, embedding_matrix_normalized)

        # If models have already been made with these embeddings, load them
        if fs.exists(model_path):
            print("\n\tFound trained model in", model_path, ", skipping...")
            model.restore(model_path)

        else:  # could not find models already computed with these embeddings

            gen_train = EmbeddingSequence(
                batch_size, X_seq_train, y_1hot_train,
                embedding_matrix_normalized)
            gen_val = EmbeddingSequence(
                batch_size, X_seq_val, y_1hot_val,
                embedding_matrix_normalized)

            ####################################################################
            # Train

            # Create a new model and train it
            print('\n--- Initializing model...')
            model.init(seed=seed, maxlen=maxlen,
                       embedding_dim=int(embedding_dimension),
                       num_classes=num_classes,
                       dense_layer_size=dense_layer_size)
            if print_summary:
                model.model.summary()
            print('\n--- Training model...')
            model.train_gen(train_generator=gen_train,
                            validation_generator=gen_val, verbose=True,
                            epochs=num_epochs)

            # Save the model
            fs.mkdir(fs.dirname(model_path))
            model.save(model_path)
            print('\tsaved model to', model_path)

        ########################################################################
        # Test

        # Test model
        print('\n--- Testing model...')
        p = model.predict_gen(generator=gen_test)[0]

        # cache the prediction
        fs.mkdir(fs.dirname(predictions_path))
        with open(predictions_path, 'wb') as outfile:
            pickle.dump(p, outfile)
        print('\tsaved predictions to', predictions_path)

    ############################################################################
    # Return accuracy
    accuracy = p == y_test  # prediction accuracy
    return accuracy
def test_finalise_figsize(self):
    self._mkplot()
    viz.finalise("/tmp/labm8.png", figsize=(10, 5))
    self.assertTrue(fs.exists("/tmp/labm8.png"))
    fs.rm("/tmp/labm8.png")
def test_exists(self):
    self._test(True, fs.exists(__file__))
    self._test(True, fs.exists("/"))
    self._test(False, fs.exists("/not/a/real/path (I hope!)"))
def test_finalise_figsize():
    _MakeTestPlot()
    viz.finalise("/tmp/labm8.png", figsize=(10, 5))
    assert fs.exists("/tmp/labm8.png")
    fs.rm("/tmp/labm8.png")
def test_finalise_tight():
    _MakeTestPlot()
    viz.finalise("/tmp/labm8.png", tight=True)
    assert fs.exists("/tmp/labm8.png")
    fs.rm("/tmp/labm8.png")
def test_finalise():
    _MakeTestPlot()
    viz.finalise("/tmp/labm8.png")
    assert fs.exists("/tmp/labm8.png")
    fs.rm("/tmp/labm8.png")
def islocked(self):
    """ Whether the directory is locked. """
    return fs.exists(self.path)
def evaluate(model, device, data_folder, out_folder, embeddings,
             dense_layer_size, print_summary, num_epochs, batch_size):
    data = []

    # Create device list
    if device == 'all':
        device_list = ["Cypress", "Tahiti", "Fermi", "Kepler"]
    else:
        device_list = [device]

    for i, platform in enumerate(device_list):
        print('\n------------------------------------------------------------------')
        print('--- Platform', platform, '[', i + 1, '/ 4 ]')
        print('------------------------------------------------------------------')
        platform_name = platform2str(platform)

        # Read data
        oracle_file = os.path.join(data_folder, "pact-2014-oracles.csv")
        oracles = pd.read_csv(oracle_file)
        runtimes_file = os.path.join(data_folder, "pact-2014-runtimes.csv")
        df = pd.read_csv(runtimes_file)
        print('\tRead data from', oracle_file, '\n\tand', runtimes_file)

        # Extract data
        oracle_runtimes = np.array(
            [float(x) for x in oracles["runtime_" + platform]])
        y = np.array([int(x) for x in oracles["cf_" + platform]],
                     dtype=np.int32)
        y_1hot = get_onehot(oracles, platform)

        # Encode source codes
        X_seq, maxlen = encode_srcs(data_folder, df)

        # Embeddings
        import tensorflow as tf  # for embeddings lookup
        embedding_matrix_normalized = tf.nn.l2_normalize(embeddings, axis=1)
        vocabulary_size, embedding_dimension = embedding_matrix_normalized.shape
        seq_ = tf.placeholder(dtype=tf.int32)

        # Tensor of shape (num_input_files, sequence length, embedding dimension)
        embedding_input_ = tf.nn.embedding_lookup(embedding_matrix_normalized,
                                                  seq_)

        # Make tf take less gpu memory (grow allocations as needed)
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        with tf.Session(config=config) as sess:
            embedding_input = sess.run(embedding_input_,
                                       feed_dict={seq_: X_seq})

        # Leave-one-out cross-validation
        kf = KFold(n_splits=len(y), shuffle=False)
        for j, (train_index, test_index) in enumerate(kf.split(y)):
            print('--- Cross validation step [', j + 1, '/ ', len(y), ']')
            kernel = sorted(set(df["kernel"]))[test_index[0]]
            X_cc, y_cc = get_magni_features(df, oracles, platform)

            model_name = model.__name__
            model_basename = model.__basename__
            model_path = os.path.join(
                out_folder,
                "models/{model_basename}-{platform}-{j}.model".format(
                    model_basename=model_basename, platform=platform, j=j))
            predictions_path = os.path.join(
                out_folder,
                "predictions/{model_basename}-{platform}-{j}.result".format(
                    model_basename=model_basename, platform=platform, j=j))

            if fs.exists(predictions_path):
                # load result from cache
                print("\tFound predictions in", predictions_path,
                      ", skipping...")
                with open(predictions_path, 'rb') as infile:
                    p = pickle.load(infile)
            else:
                if fs.exists(model_path):
                    # load a trained model from cache
                    print("\n\tFound trained model in", model_path,
                          ", skipping...")
                    model.restore(model_path)
                else:
                    # Initialize model and print summary
                    print('\n--- Training model...')
                    model.init(seed, maxlen, int(embedding_dimension),
                               dense_layer_size)
                    if print_summary:
                        model.model.summary()

                    # Train and cache a model
                    model.train(sequences=embedding_input[train_index, :, :],
                                verbose=True, y_1hot=y_1hot[train_index],
                                epochs=num_epochs, batch_size=batch_size)

                    # cache the model
                    fs.mkdir(fs.dirname(model_path))
                    model.save(model_path)
                    print('\tsaved model to', model_path)

                # test model
                print('\n--- Testing model...')
                p = model.predict(sequences=embedding_input[test_index, :, :],
                                  batch_size=batch_size)[0]

                # The runtimes of some coarsening factors are not recorded in
                # the data table. If that is the case for the predicted cf,
                # clamp it down to the highest cf for which the runtime is
                # recorded.
                p = min(p, 2 ** (len(X_cc[test_index[0]]) - 1))

                # cache the prediction
                fs.mkdir(fs.dirname(predictions_path))
                with open(predictions_path, 'wb') as outfile:
                    pickle.dump(p, outfile)
                print('\tsaved predictions to', predictions_path)

            o = y[test_index[0]]  # oracle prediction (true value)
            correct = p == o  # predictions' correctness

            # get runtime without thread coarsening
            row = df[(df["kernel"] == kernel) & (df["cf"] == 1)]
            assert len(row) == 1  # sanity check
            nocf_runtime = float(row["runtime_" + platform])

            # get runtime of prediction
            row = df[(df["kernel"] == kernel) & (df["cf"] == p)]
            assert len(row) == 1  # sanity check
            p_runtime = float(row["runtime_" + platform])

            # get runtime of oracle coarsening factor
            o_runtime = oracle_runtimes[test_index[0]]

            # speedup and % oracle
            s_oracle = nocf_runtime / o_runtime
            p_speedup = nocf_runtime / p_runtime
            p_oracle = o_runtime / p_runtime

            # record result
            data.append({
                "Model": model_name,
                "Platform": platform_name,
                "Kernel": kernel,
                "Oracle-CF": o,
                "Predicted-CF": p,
                "Speedup": p_speedup,
                "Oracle": p_oracle
            })

    return pd.DataFrame(data, columns=[
        "Model", "Platform", "Kernel", "Oracle-CF", "Predicted-CF",
        "Speedup", "Oracle"
    ])
def test_exists():
    assert fs.exists(__file__)
    assert fs.exists("/")
    assert not fs.exists("/not/a/real/path (I hope!)")
def test_finalise_tight(self):
    self._mkplot()
    viz.finalise("/tmp/labm8.png", tight=True)
    self.assertTrue(fs.exists("/tmp/labm8.png"))
    fs.rm("/tmp/labm8.png")