def merge(outpath, inpaths=None):
    """
    Merge kernel datasets.

    Copies the rows of the ContentFiles and PreprocessedFiles tables of each
    input database into the database at outpath. Rows that conflict with
    existing rows are skipped (INSERT OR IGNORE). Once all inputs have been
    merged, the output database is passed to explore.explore().

    Arguments:
        outpath (str): Path to the output database. If no file exists at
            this path, it is created with create_db().
        inpaths (list of str, optional): Paths of the databases to merge
            from. If empty or None, the result of
            get_all_sampler_datasets() is used.
    """
    from clgen import explore

    if not fs.isfile(outpath):
        create_db(outpath)
        log.info("created", outpath)

    db = connect(outpath)
    try:
        if not inpaths:
            inpaths = get_all_sampler_datasets()

        for inpath in inpaths:
            log.info("merging from", inpath)
            c = db.cursor()
            try:
                # Bind the path as a parameter rather than formatting it
                # into the statement, so that paths containing quote
                # characters cannot break (or inject into) the SQL.
                c.execute("ATTACH ? AS rhs", (str(inpath),))
                c.execute("INSERT OR IGNORE INTO ContentFiles "
                          "SELECT * FROM rhs.ContentFiles")
                c.execute("INSERT OR IGNORE INTO PreprocessedFiles "
                          "SELECT * FROM rhs.PreprocessedFiles")
                # Commit before detaching the input database.
                db.commit()
                c.execute("DETACH rhs")
            finally:
                c.close()
    finally:
        # Release the connection before the database is handed to explore.
        # NOTE(review): assumes connect() returns a DB-API connection
        # exposing close() - confirm.
        db.close()

    explore.explore(outpath)
def _create_kernels_db(self, path: str, encoding: str = "default") -> None:
    """
    Build the kernels database for a corpus and store it in the content cache.

    The database is created at a temporary path inside the content cache
    directory, stored under the "kernels.db" cache key, then populated,
    preprocessed and encoded. Database stats are printed at the end.

    Arguments:
        path (str): Directory that is listed recursively for corpus files.
        encoding (str, optional): Encoding argument passed on to encode().
    """
    log.debug("creating database")

    # Create an empty database at a staging path, then store it in the
    # content cache under the "kernels.db" key.
    staging_path = fs.path(self.contentcache.path, "kernels.db.tmp")
    dbutil.create_db(staging_path)
    self.contentcache["kernels.db"] = staging_path

    # Collect the absolute paths of all entries below `path` for which
    # fs.isfile() holds.
    corpus_files = []
    for entry in fs.ls(path, abspaths=True, recursive=True):
        if fs.isfile(entry):
            corpus_files.append(entry)

    # Pipeline over the cached database:
    # import -> preprocess -> encode -> print stats.
    fetch.fetch_fs(self.contentcache["kernels.db"], corpus_files)
    preprocess.preprocess_db(self.contentcache["kernels.db"])
    encode(self.contentcache["kernels.db"], encoding)
    explore.explore(self.contentcache["kernels.db"])
def sample(self, model: Model, quiet: bool = False) -> None:
    """
    Sample CLgen model.

    Runs sample batches until one of the stop conditions holds:

    * self.max_kernels is non-negative and the samples database contains at
      least that many good kernels, or
    * self.max_batches is non-negative and that many batches have been run.

    If both limits are negative, sampling continues indefinitely. When
    sampling stops, the location of the samples database is logged.

    Arguments:
        model (Model): CLgen model.
        quiet (bool, optional): Forwarded to sample_iteration().
    """
    cache = self.cache(model)

    # create samples database if it doesn't exist
    if not cache["kernels.db"]:
        tmppath = fs.path(cache.path, "kernels.tmp.db")
        dbutil.create_db(tmppath)
        cache["kernels.db"] = tmppath

    batch_i = 0
    while True:
        # stop if we have enough kernels
        has_max_kernels = self.max_kernels >= 0
        num_good_kernels = dbutil.num_good_kernels(cache["kernels.db"])
        if has_max_kernels and num_good_kernels >= self.max_kernels:
            # break (not return) so the final log message below is reached
            break

        # stop if we've done enough batches
        has_max_batches = self.max_batches >= 0
        if has_max_batches and batch_i >= self.max_batches:
            break

        batch_i += 1
        print("sample batch", batch_i, "...")

        self.sample_iteration(model, quiet=quiet)

        print()
        explore(self.cache(model)["kernels.db"])

    log.info("samples database:", cache["kernels.db"])