def unpack_directory_if_needed(path: str) -> str:
    """
    If path is a tarball, unpack it. If path doesn't exist but there is a
    tarball with the same name, unpack it.

    Parameters
    ----------
    path : str
        Path to directory or tarball.

    Returns
    -------
    str
        Path to directory.

    Raises
    ------
    clgen.InternalError
        If unable to extract archive.
    """
    if fs.isdir(path):
        return path

    if fs.isfile(path) and path.endswith(".tar.bz2"):
        log.info("unpacking '{}'".format(path))
        tar.unpack_archive(path)
        return re.sub(r'.tar.bz2$', '', path)

    if fs.isfile(path + ".tar.bz2"):
        log.info("unpacking '{}'".format(path + ".tar.bz2"))
        tar.unpack_archive(path + ".tar.bz2")
        return path

    raise clgen.InternalError("cannot interpret archive '{path}'"
                              .format(**vars()))
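# A minimal standalone sketch of the same "directory or tarball" normalisation,
# using only the standard library (shutil.unpack_archive understands .tar.bz2).
# This is an illustration of the idea, not the clgen helper above; the function
# name and paths are hypothetical.
import os
import shutil


def ensure_unpacked(path: str) -> str:
    """Return a directory path, unpacking `<path>.tar.bz2` if necessary."""
    if os.path.isdir(path):
        return path
    archive = path if path.endswith(".tar.bz2") else path + ".tar.bz2"
    if os.path.isfile(archive):
        # extract next to the archive so the resulting directory matches `path`
        shutil.unpack_archive(archive, extract_dir=os.path.dirname(archive) or ".")
        return archive[:-len(".tar.bz2")]
    raise FileNotFoundError("no directory or archive at '{}'".format(path))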
def print_bytecode_features(db_path: str) -> None:
    """
    Print Bytecode features.

    Arguments:
        db_path: Path to dataset.
    """
    db = dbutil.connect(db_path)
    c = db.cursor()
    c.execute('SELECT sha,contents FROM Bytecodes')
    query = c.fetchall()

    uniq_features = set()
    for row in query:
        sha, contents = row
        features = bytecode_features(contents)
        # Add the table key
        features['sha'] = sha
        for key in features.keys():
            uniq_features.add(key)

    log.info('Features:')
    for feature in uniq_features:
        log.info(' ', feature)
def unpack_directory_if_needed(path: str) -> str:
    """
    If path is a tarball, unpack it. If path doesn't exist but there is a
    tarball with the same name, unpack it.

    Arguments:
        path (str): Path to directory or tarball.

    Returns:
        str: Path to directory.
    """
    if fs.isdir(path):
        return path

    if fs.isfile(path) and path.endswith(".tar.bz2"):
        log.info("unpacking '{}'".format(path))
        clgen.unpack_archive(path)
        return re.sub(r'.tar.bz2$', '', path)

    if fs.isfile(path + ".tar.bz2"):
        log.info("unpacking '{}'".format(path + ".tar.bz2"))
        clgen.unpack_archive(path + ".tar.bz2")
        return path

    return path
def preprocess_inplace(paths: List[str], max_num_workers: int = cpu_count(),
                       attempt: int = 1) -> None:
    """
    Preprocess a list of files in place.

    Arguments:
        paths (List[str]): List of paths.
        max_num_workers (int, optional): Number of processes to spawn.
    """
    if attempt >= MAX_OS_RETRIES:
        raise clgen.InternalError("Failed to process files")

    num_workers = min(len(paths), max_num_workers)

    try:
        log.info('spawned', num_workers, 'worker threads to process',
                 len(paths), 'files ...')
        with clgen.terminating(Pool(num_workers)) as pool:
            pool.map(_preprocess_inplace_worker, paths)
    except OSError as e:
        log.error(e)

        # Try again with fewer threads.
        # See: https://github.com/ChrisCummins/clgen/issues/64
        max_num_workers = max(int(max_num_workers / 2), 1)
        preprocess_inplace(paths, max_num_workers=max_num_workers,
                           attempt=attempt + 1)
def remove_bad_preprocessed(db_path: str) -> None:
    """
    Remove all ugly and bad contents from PreprocessedFiles table.

    Parameters
    ----------
    db_path : str
        Path to dataset.
    """
    original_size = fs.du(db_path, human_readable=False)
    original_size_human_readable = fs.du(db_path, human_readable=True)
    log.info("vacuuming", original_size_human_readable, "database")
    sys.stdout.flush()

    # Remove contents from bad or ugly preprocessed files.
    db = connect(db_path)
    c = db.cursor()
    c.execute("UPDATE PreprocessedFiles SET contents='[DELETED]' "
              "WHERE status=1 OR status=2")
    db.commit()
    c.close()
    db.close()

    db = connect(db_path)
    c = db.cursor()
    c.execute("VACUUM")
    db.commit()
    c.close()

    new_size = fs.du(db_path, human_readable=False)
    new_size_human_readable = fs.du(db_path, human_readable=True)
    reduction_ratio = (1 - (new_size / original_size)) * 100
    log.info("done. new size {}. ({:.0f}% reduction)"
             .format(new_size_human_readable, reduction_ratio), sep=".")
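# A self-contained illustration of why the function above runs VACUUM after
# blanking out rows: SQLite keeps freed pages inside the file, so the on-disk
# size only shrinks once VACUUM rebuilds the database. Uses only the standard
# library; the table contents below are made up for the demonstration.
import os
import sqlite3
import tempfile

with tempfile.TemporaryDirectory() as d:
    demo_path = os.path.join(d, "demo.db")
    demo_db = sqlite3.connect(demo_path)
    demo_db.execute("CREATE TABLE PreprocessedFiles "
                    "(id INTEGER, status INTEGER, contents TEXT)")
    demo_db.executemany("INSERT INTO PreprocessedFiles VALUES (?, ?, ?)",
                        [(i, i % 3, "x" * 10000) for i in range(500)])
    demo_db.commit()
    size_before = os.path.getsize(demo_path)

    demo_db.execute("UPDATE PreprocessedFiles SET contents='[DELETED]' "
                    "WHERE status=1 OR status=2")
    demo_db.commit()
    size_after_update = os.path.getsize(demo_path)   # roughly unchanged

    demo_db.execute("VACUUM")                        # rebuilds the file
    size_after_vacuum = os.path.getsize(demo_path)   # now actually smaller
    demo_db.close()

    print(size_before, size_after_update, size_after_vacuum)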
def _main(infile: TextIO, vocab: str, size: bool) -> None:
    atoms = corpus.atomize(infile.read(), vocab=vocab)

    if size:
        log.info("size:", len(atoms))
    else:
        log.info('\n'.join(atoms))
def sample(self, model: clgen.Model) -> None:
    """
    Sample CLgen model.

    Parameters
    ----------
    model : clgen.Model
        CLgen model.
    """
    cache = self.cache(model)

    # create samples database if it doesn't exist
    if not cache.get("kernels.db"):
        tmp_kernels_db = cache.keypath("kernels.tmp.db")
        dbutil.create_db(tmp_kernels_db)
        cache["kernels.db"] = tmp_kernels_db

    # producer-consumer queue
    queue = Queue(maxsize=128)

    log.info("sampling", self)

    sampler = SampleProducer(model, self.start_text, queue,
                             **self.kernel_opts)
    sampler.start()

    consumer = SampleConsumer(cache["kernels.db"], sampler, self, cache,
                              queue, **self.sampler_opts)
    consumer.start()

    sampler.join()
    consumer.join()

    clgen.explore(cache["kernels.db"])
def merge(outpath, inpaths=None):
    """
    Merge kernel datasets.
    """
    from clgen import explore

    if not fs.isfile(outpath):
        create_db(outpath)
        log.info("created", outpath)

    db = connect(outpath)

    if not inpaths:
        inpaths = get_all_sampler_datasets()

    for inpath in inpaths:
        log.info("merging from", inpath)
        c = db.cursor()
        c.execute("ATTACH '{}' AS rhs".format(inpath))
        c.execute("INSERT OR IGNORE INTO ContentFiles "
                  "SELECT * FROM rhs.ContentFiles")
        c.execute("INSERT OR IGNORE INTO PreprocessedFiles "
                  "SELECT * FROM rhs.PreprocessedFiles")
        db.commit()
        c.execute("DETACH rhs")
        c.close()

    explore.explore(outpath)
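# A minimal, standalone sketch of the ATTACH-based merge used above, written
# against the standard sqlite3 module rather than clgen's dbutil. The table and
# file names are hypothetical; it only assumes both databases share the same
# schema with a primary key, so INSERT OR IGNORE silently skips duplicates.
import sqlite3


def merge_sqlite(out_path: str, in_path: str, table: str = "ContentFiles") -> None:
    db = sqlite3.connect(out_path)
    try:
        # attach the second database under the schema name "rhs"
        db.execute("ATTACH DATABASE ? AS rhs", (in_path,))
        # copy every row that isn't already present (by primary key)
        db.execute("INSERT OR IGNORE INTO {t} SELECT * FROM rhs.{t}".format(t=table))
        db.commit()
        db.execute("DETACH DATABASE rhs")
    finally:
        db.close()

# e.g. merge_sqlite("corpus.db", "github.db")  # hypothetical paths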
def _init_error(err: Exception, files_to_rm: List[str] = []) -> None:
    """ tidy up in case of error """
    log.error("corpus creation failed. Deleting corpus files")
    for path in files_to_rm:
        if fs.exists(path):
            log.info("removing", path)
            fs.rm(path)
    raise err
def _init_error(err: Exception) -> None:
    """ tidy up in case of error """
    log.error("corpus creation failed. Deleting corpus files")
    paths = [
        fs.path(self.contentcache.path, "kernels.db"),
        fs.path(self.cache.path, "corpus.txt"),
        fs.path(self.cache.path, "tensor.npy"),
        fs.path(self.cache.path, "atomizer.pkl")
    ]
    for path in paths:
        if fs.exists(path):
            log.info("removing", path)
            fs.rm(path)
    raise err
def preprocess_inplace(paths: List[str], max_num_workers: int = cpu_count(),
                       max_attempts: int = 100, attempt: int = 1) -> None:
    """
    Preprocess a list of files in place.

    Parameters
    ----------
    paths : List[str]
        List of paths.
    max_num_workers : int, optional
        Number of processes to spawn.
    max_attempts : int, optional
        In case of an OSError or TimeoutError, this number of attempts
        will be made.
    """
    if attempt > max_attempts:
        raise clgen.InternalError(
            f"Failed to process files after {max_attempts} attempts")
    elif attempt > 1:
        log.warning("preprocess attempt #.", attempt)

    num_workers = min(len(paths), max_num_workers)

    try:
        log.info('spawned', num_workers, 'worker threads to process',
                 len(paths), 'files ...')
        with clgen.terminating(Pool(num_workers)) as pool:
            pool.map(_preprocess_inplace_worker, paths)
    except (OSError, TimeoutError) as e:
        log.error(e)

        # Try again with fewer threads.
        # See: https://github.com/ChrisCummins/clgen/issues/64
        max_num_workers = max(int(max_num_workers / 2), 1)
        preprocess_inplace(paths, max_num_workers=max_num_workers,
                           attempt=attempt + 1, max_attempts=max_attempts)
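# A stripped-down sketch of the retry pattern used above: run a job over a
# process pool and, if the OS refuses to spawn that many workers, halve the
# pool size and try again up to `max_attempts` times. `terminating` here is a
# plausible stand-in for clgen.terminating, and `map_with_retry` is a made-up
# name; only the standard library is assumed.
from contextlib import contextmanager
from multiprocessing import Pool, cpu_count
from typing import Callable, List


@contextmanager
def terminating(pool):
    """Ensure the pool is terminated, even if map() raises."""
    try:
        yield pool
    finally:
        pool.terminate()
        pool.join()


def map_with_retry(worker: Callable, items: List,
                   max_num_workers: int = cpu_count(),
                   max_attempts: int = 100, attempt: int = 1) -> List:
    if attempt > max_attempts:
        raise RuntimeError(f"failed after {max_attempts} attempts")

    num_workers = max(min(len(items), max_num_workers), 1)
    try:
        with terminating(Pool(num_workers)) as pool:
            return pool.map(worker, items)
    except (OSError, TimeoutError):
        # retry with half the workers,
        # see https://github.com/ChrisCummins/clgen/issues/64
        return map_with_retry(worker, items,
                              max_num_workers=max(max_num_workers // 2, 1),
                              max_attempts=max_attempts, attempt=attempt + 1)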
def _main() -> None:
    cache = clgen.cachepath()

    log.warning("Not Implemented: refresh corpuses")

    if fs.isdir(cache, "model"):
        cached_modeldirs = fs.ls(fs.path(cache, "model"), abspaths=True)
        for cached_modeldir in cached_modeldirs:
            cached_model_id = fs.basename(cached_modeldir)
            cached_meta = jsonutil.read_file(fs.path(cached_modeldir, "META"))

            model = clgen.Model.from_json(cached_meta)

            if cached_model_id != model.hash:
                log.info(cached_model_id, '->', model.hash)

                if fs.isdir(model.cache.path):
                    log.fatal("cache conflict", file=sys.stderr)

                fs.mv(cached_modeldir, model.cache.path)

    log.warning("Not Implemented: refresh samplers")
def sample(self, model: Model, quiet: bool = False) -> None:
    """
    Sample CLgen model.

    Arguments:
        model (Model): CLgen model.
    """
    cache = self.cache(model)

    # create samples database if it doesn't exist
    if not cache["kernels.db"]:
        dbutil.create_db(fs.path(cache.path, "kernels.tmp.db"))
        cache["kernels.db"] = fs.path(cache.path, "kernels.tmp.db")

    batch_i = 0
    while True:
        # stop if we have enough kernels
        has_max_kernels = self.max_kernels >= 0
        num_good_kernels = dbutil.num_good_kernels(cache["kernels.db"])
        if has_max_kernels and num_good_kernels >= self.max_kernels:
            return

        # stop if we've done enough batches
        has_max_batches = self.max_batches >= 0
        if has_max_batches and batch_i >= self.max_batches:
            return

        batch_i += 1
        print("sample batch", batch_i, "...")

        self.sample_iteration(model, quiet=quiet)

        print()
        explore(self.cache(model)["kernels.db"])

    log.info("samples database:", cache["kernels.db"])
def preprocess_file(path: str, inplace: bool = False) -> None:
    """
    Preprocess a file. Prints output to stdout by default.

    If preprocessing fails, this function exits.

    Arguments:
        path (str): String path to file.
        inplace (bool, optional): If True, overwrite input file.
    """
    with open(path) as infile:
        contents = infile.read()
    try:
        out = preprocess(contents)
        if inplace:
            with open(path, 'w') as outfile:
                outfile.write(out)
        else:
            log.info('preprocess', out)
    except BadCodeException as e:
        log.fatal(e, ret=1)
    except UglyCodeException as e:
        log.fatal(e, ret=2)
def _dump_db(db, out_path: str, gh: bool = False, fileid: bool = False,
             reverse: bool = False, input_samples: bool = False,
             status: int = 0, eof: bool = False, dir: bool = False) -> None:
    """
    Dump database contents.

    Parameters
    ----------
    db : sqlite3.Connection
        Dataset.
    out_path : str
        Path to output.
    gh : bool, optional
        Dataset is GitHub.
    fileid : bool, optional
        Include file IDs.
    reverse : bool, optional
        Reverse ordering of output.
    input_samples : bool, optional
        If True, use un-preprocessed files.
    status : int, optional
        Filter preprocess status.
    eof : bool, optional
        Include EOF separators.
    dir : bool, optional
        Write output to directory.
    """
    log.info('writing corpus', out_path, '...')

    order = 'ASC' if reverse else 'DESC'

    c = db.cursor()

    # Query components
    table = 'ContentFiles' if input_samples else 'PreprocessedFiles'
    select = 'SELECT {}.id,{}.contents'.format(table, table)

    if input_samples:
        qualifier = ''
    else:
        qualifier = 'WHERE {}.status={}'.format(table, status)

    if gh:
        table += (' LEFT JOIN ContentMeta ON {}.id=ContentMeta.id'
                  ' LEFT JOIN Repositories ON '
                  'ContentMeta.repo_url=Repositories.url'
                  .format(table))
        orderby = 'Repositories.stars'
    else:
        orderby = 'LC_col(contents)'

    query = ('{select} FROM {table} {qualifier} ORDER BY {orderby} {order}'
             .format(select=select, table=table, qualifier=qualifier,
                     orderby=orderby, order=order))

    c.execute(query)
    rows = c.fetchall()

    if dir:
        log.info('writing to directory ', out_path, '/', sep='')
        if not os.path.exists(out_path):
            os.makedirs(out_path)
        for row in rows:
            id, contents = row
            path = os.path.join(out_path, kid_to_path(id) + '.cl')
            with open(path, 'w') as out:
                out.write(contents)
    else:
        log.info('writing file', out_path)
        with open(out_path, 'wb') as out:
            for row in rows:
                id, contents = row
                if fileid:  # Print file ID
                    out.write('/* ID: {} */\n\n'.format(id).encode('utf-8'))
                out.write(contents.encode('utf-8'))
                if eof:  # Print EOF token
                    out.write('\n/* EOF */\n\n'.encode('utf-8'))
                else:
                    out.write('\n\n'.encode('utf-8'))
def train(self, quiet: bool = False) -> None:
    """
    Train model.
    """
    tf = self._init_tensorflow(infer=False)

    # training options
    learning_rate = self.train_opts["learning_rate"]
    decay_rate = self.train_opts["lr_decay_rate"]
    checkpoint_path = fs.path(self.cache.path, "model.ckpt")

    # resume from prior checkpoint
    ckpt_path, ckpt_paths = None, None
    if self.checkpoint_path:
        # check if all necessary files exist
        assert fs.isdir(self.checkpoint_path)
        ckpt = tf.train.get_checkpoint_state(self.checkpoint_path)
        assert ckpt
        assert ckpt.model_checkpoint_path
        ckpt_path, ckpt_paths = self._get_params_path(ckpt)

    with tf.Session() as sess:
        tf.global_variables_initializer().run()

        # keep all checkpoints
        saver = tf.train.Saver(tf.global_variables(), max_to_keep=100)

        # restore model from closest checkpoint
        if ckpt_path:
            log.debug("restoring", ckpt_path)
            saver.restore(sess, ckpt_path)
            log.info("restored checkpoint {}".format(ckpt_path))

        # make sure we don't lose track of other checkpoints
        if ckpt_paths:
            saver.recover_last_checkpoints(ckpt_paths)

        start_batch = sess.run(self.epoch) * self.corpus.num_batches
        batch_count = 0
        total_elapsed = 0
        total_atomize = 0
        total_checkpoint, avg_checkpoint = 0, 0
        eta_d, eta_h, eta_m = 0, 0, 0

        for e in range(sess.run(self.epoch) + 1, self.epochs + 1):
            if quiet:
                log.info("epoch", e, "of", self.epochs)

            # decay and set learning rate
            new_learning_rate = learning_rate * (
                (float(100 - decay_rate) / 100.0) ** (e - 1))
            sess.run(tf.assign(self.learning_rate, new_learning_rate))
            sess.run(tf.assign(self.epoch, e))

            time_start = time.time()
            self.corpus.create_batches()
            total_atomize += time.time() - time_start
            avg_atomize = total_atomize / e

            state = sess.run(self.initial_state)

            for b in range(self.corpus.num_batches):
                time_start = time.time()
                batch_count += 1

                x, y = self.corpus.next_batch()
                feed = {self.input_data: x, self.targets: y}
                for i, (c, h) in enumerate(self.initial_state):
                    feed[c] = state[i].c
                    feed[h] = state[i].h
                train_loss, state, _ = sess.run(
                    [self.cost, self.final_state, self.train_op], feed)

                batch_num = (e - 1) * self.corpus.num_batches + b
                max_batch = self.epochs * self.corpus.num_batches

                progress = float((batch_num + 1 - start_batch) /
                                 (max_batch - start_batch))

                time_end = time.time()
                elapsed = time_end - time_start

                if not quiet:
                    total_elapsed += elapsed
                    avg_elapsed = total_elapsed / batch_count

                    remaining_time = (
                        (max_batch - batch_count) * avg_elapsed +   # batches
                        (self.epochs - e) * avg_atomize +           # atomizings
                        (self.epochs - e) * avg_checkpoint)         # checkpoints
                    eta_h, eta_m = divmod(remaining_time / 60, 60)
                    eta_d, eta_h = divmod(eta_h, 24)

                    print("\r\033[K"
                          "{progress:3.1f}% | "
                          "{size}x{layers}x{max_epoch} {model} | "
                          "epoch={epoch_num}/{max_epoch} | "
                          "batch={batch_num}/{max_batch} | "
                          "lr={lr:.5f} | "
                          "loss={tloss:.3f} | "
                          "t1={time_atomize:.3f}s "
                          "t2={time_batch:.3f}s "
                          "t3={time_checkpoint:.3f}s | "
                          "eta={eta_d}d{eta_h}h{eta_m:02d}m".format(
                              size=self.rnn_size,
                              layers=self.num_layers,
                              model=self.model_type.upper(),
                              progress=progress * 100,
                              epoch_num=e,
                              max_epoch=self.epochs,
                              batch_num=b + 1,
                              max_batch=self.corpus.num_batches,
                              lr=new_learning_rate,
                              tloss=train_loss,
                              time_atomize=avg_atomize,
                              time_batch=avg_elapsed,
                              time_checkpoint=avg_checkpoint,
                              eta_d=int(eta_d),
                              eta_h=int(eta_h),
                              eta_m=int(eta_m)),
                          end="")

            save = self.opts["train_opts"]["intermediate_checkpoints"]
            save |= e == self.epochs  # last epoch
            if save:
                if not quiet:
                    print()
                time_start = time.time()
                saver.save(sess, checkpoint_path, global_step=batch_num)
                total_checkpoint += time.time() - time_start
                avg_checkpoint = total_checkpoint / e

        log.info("model saved to {}".format(checkpoint_path))
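# A small, self-contained sketch of the two pieces of arithmetic in the
# training loop above: the percentage-based learning rate decay and the divmod
# split of an ETA in seconds into days/hours/minutes. The helper names and the
# values below are made up for illustration.
def decayed_lr(base_lr: float, decay_rate_pct: float, epoch: int) -> float:
    # lr * ((100 - decay) / 100) ** (epoch - 1): a `decay_rate_pct` percent
    # reduction, compounded once per completed epoch
    return base_lr * ((100.0 - decay_rate_pct) / 100.0) ** (epoch - 1)


def eta_dhm(remaining_seconds: float) -> str:
    hours, minutes = divmod(remaining_seconds / 60, 60)
    days, hours = divmod(hours, 24)
    return "{:d}d{:d}h{:02d}m".format(int(days), int(hours), int(minutes))


assert abs(decayed_lr(0.002, 5.0, 3) - 0.002 * 0.95 ** 2) < 1e-12
print(eta_dhm(90000))  # -> "1d1h00m"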
def _main(model_file: TextIO) -> None:
    model_json = jsonutil.loads(model_file.read())
    model = clgen.Model.from_json(model_json)
    model.train()
    log.info("done.")
def _main(db_file: BinaryIO, paths: List[Path]) -> None:
    clgen.fetch(db_file.name, paths)
    log.info("done.")
def _locked_train(self) -> 'Model':
    tf = self._init_tensorflow(infer=False)

    # training options
    learning_rate = self.train_opts["learning_rate"]
    decay_rate = self.train_opts["lr_decay_rate"]

    # resume from prior checkpoint
    ckpt_path, ckpt_paths = None, None
    if self.checkpoint_path:
        # check that all necessary files exist
        assert fs.isdir(self.checkpoint_path)
        ckpt = tf.train.get_checkpoint_state(self.checkpoint_path)
        assert ckpt
        assert ckpt.model_checkpoint_path
        ckpt_path, ckpt_paths = self._get_params_path(ckpt)

    with tf.Session() as sess:
        tf.global_variables_initializer().run()

        # keep all checkpoints
        saver = tf.train.Saver(tf.global_variables(), max_to_keep=100)

        # restore model from closest checkpoint
        if ckpt_path:
            log.debug("restoring", ckpt_path)
            saver.restore(sess, ckpt_path)
            log.verbose("restored checkpoint {}".format(ckpt_path))

        # make sure we don't lose track of other checkpoints
        if ckpt_paths:
            saver.recover_last_checkpoints(ckpt_paths)

        coord = tf.train.Coordinator()
        self.corpus.create_batches()
        threading.Thread(target=self.enqueue_x, args=(coord, sess)).start()

        max_batch = self.epochs * self.corpus.num_batches

        # progress bar
        bar = progressbar.ProgressBar(max_value=max_batch)

        if sess.run(self.epoch) != self.epochs:
            log.info("training", self)

        for e in range(sess.run(self.epoch) + 1, self.epochs + 1):
            epoch_start = time()

            # decay and set learning rate
            new_learning_rate = learning_rate * (
                (float(100 - decay_rate) / 100.0) ** (e - 1))
            sess.run(tf.assign(self.learning_rate, new_learning_rate))
            sess.run(tf.assign(self.epoch, e))

            for b in range(self.corpus.num_batches):
                train_cost, _, state, _ = sess.run([self.cost, self.KL_cost,
                                                    self.final_state,
                                                    self.train_op])
                # update progress bar
                batch_num = (e - 1) * self.corpus.num_batches + b
                bar.update(batch_num)

            save = self.opts["train_opts"]["intermediate_checkpoints"]
            save |= e == self.epochs  # always save on last epoch
            if save:
                saver.save(sess, self.cache.keypath("model.ckpt"),
                           global_step=batch_num)

                next_checkpoint = e * self.corpus.num_batches + b
                max_epoch = self.epochs
                log.verbose("\n{self} epoch {e} / {max_epoch}. "
                            "next checkpoint at batch {next_checkpoint}"
                            .format(**vars()))

            # update training time
            epoch_duration = time() - epoch_start
            self.stats["epoch_costs"].append(float(train_cost))
            self.stats["epoch_times"].append(epoch_duration)
            self.stats["epoch_batches"].append(batch_num + 1)
            self._flush_meta()

        coord.request_stop()

    return self
def preprocess_contentfiles(db_path: str,
                            max_num_workers: int = cpu_count(),
                            attempt: int = 1) -> None:
    """
    Preprocess OpenCL dataset.

    Arguments:
        db_path (str): OpenCL kernels dataset.
        max_num_workers (int, optional): Number of processes to spawn.
    """
    def _finalize(db_path, cache):
        """Tidy up after worker threads finish"""
        log.debug("worker finalize")

        db = dbutil.connect(db_path)
        c = db.cursor()

        # import results from worker threads
        for outpath in fs.ls(cache.path, abspaths=True):
            with open(outpath) as infile:
                for line in infile:
                    c.execute(
                        'INSERT OR REPLACE INTO PreprocessedFiles '
                        'VALUES(?,?,?)', json.loads(line))

        # write changes to database and remove cache
        db.commit()
        db.close()
        cache.empty()

    if attempt >= MAX_OS_RETRIES:
        raise clgen.InternalError("failed to preprocess files")

    num_contentfiles = dbutil.num_rows_in(db_path, 'ContentFiles')
    num_preprocessedfiles = dbutil.num_rows_in(db_path, 'PreprocessedFiles')
    log.info("{n} ({r:.1%}) files need preprocessing".format(
        n=num_contentfiles - num_preprocessedfiles,
        r=(num_contentfiles - num_preprocessedfiles) / num_contentfiles))

    # split into multiple jobs of a maximum size
    jobsize = min(512, num_contentfiles)
    numjobs = math.ceil(num_contentfiles / jobsize)
    for j, offset in enumerate(range(0, num_contentfiles, jobsize)):
        num_preprocessedfiles = dbutil.num_rows_in(db_path,
                                                   'PreprocessedFiles')
        num_workers = min(num_contentfiles, max_num_workers)
        files_per_worker = math.ceil(jobsize / num_workers)

        # temporary cache used for worker thread results
        cache = Cache("{pid}.preprocess".format(pid=os.getpid()))
        # each worker thread receives a range of database indices to
        # preprocess, and a JSON file to write results into
        jobs = [{
            "db_in": db_path,
            "db_index_range": (offset + i * files_per_worker,
                               offset + i * files_per_worker +
                               files_per_worker),
            "json_out": fs.path(cache.path, "{i}.json".format(i=i))
        } for i in range(num_workers)]

        # spool up worker threads then finalize
        log.info('job {j} of {numjobs}: spawning {num_workers} worker threads '
                 'to process {jobsize} files ...'.format(**vars()))
        try:
            with clgen.terminating(Pool(num_workers)) as pool:
                pool.map(_preprocess_db_worker, jobs)
        except OSError as e:
            _finalize(db_path, cache)
            log.error(e)

            # Try again with fewer threads.
            # See: https://github.com/ChrisCummins/clgen/issues/64
            max_num_workers = max(int(max_num_workers / 2), 1)
            preprocess_contentfiles(db_path,
                                    max_num_workers=max_num_workers,
                                    attempt=attempt + 1)
        except Exception as e:
            _finalize(db_path, cache)
            raise e

        _finalize(db_path, cache)
def _preprocess_db(db_path: str, max_num_workers: int = cpu_count(),
                   max_attempts: int = 100, attempt: int = 1,
                   **preprocess_opts) -> None:
    """
    Preprocess OpenCL dataset.

    Parameters
    ----------
    db_path : str
        OpenCL kernels dataset.
    max_num_workers : int, optional
        Number of processes to spawn.
    max_attempts : int, optional
        In case of an OSError or TimeoutError, this number of attempts
        will be made.
    """
    if attempt > max_attempts:
        raise clgen.InternalError(
            f"failed to preprocess files after {max_attempts} attempts")

    log.verbose("determining jobs")

    contentfiles = set(dbutil.kernel_ids(db_path, "ContentFiles"))
    preprocessedfiles = set(dbutil.kernel_ids(db_path, "PreprocessedFiles"))

    ncontentfiles = len(contentfiles)
    npreprocessedfiles = len(preprocessedfiles)

    todo = contentfiles - preprocessedfiles
    ntodo = len(todo)

    # check we have something to do
    if not ntodo:
        return

    todo_ratio = ntodo / ncontentfiles

    log.info("{ntodo} ({todo_ratio:.1%}) samples need preprocessing".format(
        **vars()))

    log.verbose("creating jobs")

    # Determine if we need to inline kernels when creating jobs
    db = sqlite3.connect(db_path)
    c = db.cursor()
    c.execute("SELECT name FROM sqlite_master "
              "WHERE type='table' AND name='ContentMeta';")
    meta_table = c.fetchone()
    c.close()
    db.close()

    if meta_table:
        get_kernel = lambda kid: dbutil.get_inlined_kernel(
            db_path, kid, lang=preprocess_opts["lang"])
    else:
        get_kernel = lambda kid: dbutil.get_kernel(
            db_path, kid, table="ContentFiles")

    # create jobs
    jobs = [{
        "id": kid,
        "src": get_kernel(kid),
        "preprocess_opts": preprocess_opts,
    } for kid in todo]

    random.shuffle(jobs)

    # split size
    worker_njobs = math.ceil(ntodo / max_num_workers)

    # producer-consumer queue
    queue = Queue(maxsize=128)

    log.verbose(f"assigning {ntodo} jobs to {max_num_workers} threads")

    try:
        # our worker threads. these busy little bees will do the heavy lifting
        # of preprocessing the contentfiles, pushing their results onto
        # the queue
        producers = [PreprocessWorker(jobs[i:i + worker_njobs], queue)
                     for i in range(0, ntodo, worker_njobs)]

        # fly, my pretties, fly!
        for producer in producers:
            producer.start()

        # consume the results from the worker threads from the main thread
        for i in progressbar.ProgressBar()(range(ntodo)):
            # pull a fresh result from the queue (block if necessary)
            try:
                result = queue.get(timeout=90)
            except QueueEmpty as e:
                raise TimeoutError('failed to fetch result after 90 seconds. '
                                   'something went wrong') from e

            # insert result into database
            db = dbutil.connect(db_path)
            c = db.cursor()
            c.execute("INSERT INTO PreprocessedFiles VALUES(?,?,?)",
                      (result["id"], result["status"], result["contents"]))
            c.close()
            db.commit()
            db.close()

        for producer in producers:
            producer.join()

    except (OSError, TimeoutError) as e:
        log.error(e)

        if attempt > 2 and not i:
            log.warning("no progress has been made since previous attempt. "
                        "I'm not going to try another attempt.")
            return

        # Try again with fewer threads.
        # See: https://github.com/ChrisCummins/clgen/issues/64
        max_num_workers = max(int(max_num_workers / 2), 1)
        _preprocess_db(db_path, max_num_workers=max_num_workers,
                       attempt=attempt + 1, max_attempts=max_attempts,
                       **preprocess_opts)
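# A minimal, generic sketch of the producer-consumer pattern used by
# _preprocess_db above: worker threads push results onto a bounded Queue and
# the main thread drains it with a timeout so a hung worker surfaces as an
# error rather than a silent stall. PreprocessWorker and the job format are
# clgen-specific; everything below is a made-up stand-in built on the
# standard library only.
import queue
import threading
from typing import Dict, List


class Worker(threading.Thread):
    def __init__(self, jobs: List[Dict], out_queue: queue.Queue) -> None:
        super().__init__(daemon=True)
        self.jobs = jobs
        self.out_queue = out_queue

    def run(self) -> None:
        for job in self.jobs:
            # stand-in for the real preprocessing work
            result = {"id": job["id"], "contents": job["src"].upper()}
            self.out_queue.put(result)  # blocks if the queue is full


jobs = [{"id": i, "src": "kernel void k{}() {{}}".format(i)} for i in range(10)]
q = queue.Queue(maxsize=4)
workers = [Worker(jobs[i:i + 5], q) for i in range(0, len(jobs), 5)]
for w in workers:
    w.start()

for _ in range(len(jobs)):
    result = q.get(timeout=90)  # raises queue.Empty if a worker hangs
    # ... insert `result` into the database here ...

for w in workers:
    w.join()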
def _preprocess_inplace_worker(path: str) -> None:
    """worker function for preprocess_inplace()"""
    log.info('preprocess', path)
    preprocess_file(path, inplace=True)