def evaluate(model, sampler):
    """Evaluate sampling efficiency.

    Samples kernels from `model` using `sampler`, preprocesses the
    resulting kernels database, and returns a dict of metrics
    describing the run.

    Arguments:
        model: CLgen model to sample.
        sampler: CLgen sampler to drive the model with.

    Returns:
        dict: Sampling metrics (kernel counts, discard/ugly rates,
            character counts, and cache directory paths).
    """
    print("starting sampling")
    sampler.sample(model)

    print("preprocessing sample")
    sample_db = sampler.cache(model)["kernels.db"]
    preprocess.preprocess_db(sample_db)

    num_kernels = dbutil.num_rows_in(sample_db, "ContentFiles")
    num_good_kernels = dbutil.num_good_kernels(sample_db)
    num_ugly_kernels = dbutil.num_rows_in(sample_db, "PreprocessedFiles",
                                          "WHERE status=2")

    # Guard against ZeroDivisionError when sampling produced no kernels.
    if num_kernels:
        discard_rate = 1 - (num_good_kernels / num_kernels)
        ugly_rate = 1 - (num_ugly_kernels / num_kernels)
    else:
        discard_rate = 0.0
        ugly_rate = 0.0

    total_charcount = dbutil.cc(sample_db, "ContentFiles")
    good_charcount = dbutil.cc(sample_db, "PreprocessedFiles",
                               condition="WHERE status=0")

    return {
        "argspec": sampler.kernel_opts["args"],
        "host": system.HOSTNAME,
        "date": time.nowstr(),
        "num_kernels": num_kernels,
        "num_good_kernels": num_good_kernels,
        "discard_rate": discard_rate,
        "ugly_rate": ugly_rate,
        "total_charcount": total_charcount,
        "good_charcount": good_charcount,
        "corpus_dir": model.corpus.cache.path,
        "model_dir": model.cache.path,
        "sampler_dir": sampler.cache(model).path,
    }
def test_remove_preprocessed(self):
    """remove_preprocessed() empties good kernels and sets the modified flag."""
    tmp = 'test_remove_preprocessed.db'
    fs.cp(tests.db_path('10-kernels-preprocessed'), tmp)

    # fixture starts with 8 good kernels and an unmodified database
    self.assertEqual(8, dbutil.num_good_kernels(tmp))
    conn = dbutil.connect(tmp)
    self.assertFalse(dbutil.is_modified(conn))
    conn.close()

    dbutil.remove_preprocessed(tmp)

    # afterwards: no good kernels remain, and the db is marked modified
    self.assertEqual(0, dbutil.num_good_kernels(tmp))
    conn = dbutil.connect(tmp)
    self.assertTrue(dbutil.is_modified(conn))
    conn.close()

    fs.rm(tmp)
def evaluate(model, sampler):
    """Evaluate sampling efficiency.

    Trains `model` from a clean checkpoint cache, samples it with
    `sampler`, preprocesses the sample, and returns timing and
    efficiency metrics for the whole run.

    Arguments:
        model: CLgen model to train and sample.
        sampler: CLgen sampler to drive the model with.

    Returns:
        dict: Training/sampling times, kernel counts, discard/ugly
            rates, character counts, efficiency, throughput, and
            cache directory paths.
    """
    model.cache.empty()  # clear checkpoint cache

    print("starting training")
    tstart = time()  # start timer
    model.train()  # train model
    training_time = time() - tstart

    # clear the sample cache
    sampler.cache(model).empty()

    # sample kernels and time
    print("starting sampling")
    tstart = time()
    sampler.sample(model)
    elapsed = time() - tstart

    # preprocess sample
    sample_db = sampler.cache(model)["kernels.db"]
    preprocess.preprocess_db(sample_db)

    num_kernels = dbutil.num_rows_in(sample_db, "ContentFiles")
    num_good_kernels = dbutil.num_good_kernels(sample_db)
    num_ugly_kernels = dbutil.num_rows_in(sample_db, "PreprocessedFiles",
                                          "WHERE status=2")

    # Guard against ZeroDivisionError when sampling produced no kernels.
    if num_kernels:
        discard_rate = 1 - (num_good_kernels / num_kernels)
        ugly_rate = 1 - (num_ugly_kernels / num_kernels)
    else:
        discard_rate = 0.0
        ugly_rate = 0.0

    total_charcount = dbutil.cc(sample_db, "ContentFiles")
    good_charcount = dbutil.cc(sample_db, "PreprocessedFiles",
                               condition="WHERE status=0")

    # Guard the remaining ratios against empty samples / zero elapsed time.
    efficiency = good_charcount / total_charcount if total_charcount else 0.0
    throughput = good_charcount / elapsed if elapsed else 0.0

    return {
        "training_time": training_time,
        "sampling_time": elapsed,
        "num_kernels": num_kernels,
        "num_good_kernels": num_good_kernels,
        "discard_rate": discard_rate,
        "ugly_rate": ugly_rate,
        "total_charcount": total_charcount,
        "good_charcount": good_charcount,
        "efficiency": efficiency,  # good_chars / total_chars
        "throughput": throughput,  # good_chars / second
        "corpus_dir": model.corpus.cache.path,
        "model_dir": model.cache.path,
        "sampler_dir": sampler.cache(model).path,
    }
def sample(self, model: Model, quiet: bool = False) -> None:
    """
    Sample CLgen model.

    Runs sample batches until either the good-kernel target
    (`max_kernels`) or the batch limit (`max_batches`) is reached,
    then prints database stats and logs the database location.

    Arguments:
        model (Model): CLgen model.
        quiet (bool): If True, suppress per-iteration output.
    """
    cache = self.cache(model)

    # create samples database if it doesn't exist
    if not cache["kernels.db"]:
        tmp_db = fs.path(cache.path, "kernels.tmp.db")
        dbutil.create_db(tmp_db)
        cache["kernels.db"] = tmp_db

    batch_i = 0
    while True:
        # stop if we have enough kernels
        has_max_kernels = self.max_kernels >= 0
        num_good_kernels = dbutil.num_good_kernels(cache["kernels.db"])
        if has_max_kernels and num_good_kernels >= self.max_kernels:
            # BUGFIX: was `return`, which made the explore() and
            # log.info() calls below unreachable.
            break

        # stop if we've done enough batches
        has_max_batches = self.max_batches >= 0
        if has_max_batches and batch_i >= self.max_batches:
            break  # BUGFIX: was `return` (see above)

        batch_i += 1
        print("sample batch", batch_i, "...")
        self.sample_iteration(model, quiet=quiet)
        print()

    explore(self.cache(model)["kernels.db"])
    log.info("samples database:", cache["kernels.db"])
def __repr__(self) -> str:
    """Human-readable summary of the corpus."""
    num_files = dbutil.num_good_kernels(self.contentcache['kernels.db'])
    return (f"corpus[{self.shorthash}]: {num_files} files, "
            f"{self.size} tokens using {self.opts['vocabulary']} "
            f"vocabulary of size {self.atomizer.vocab_size}")
def num_good_kernels(self) -> int:
    """Return the number of successfully preprocessed kernels in the database."""
    db = self.db_path
    return dbutil.num_good_kernels(db)
def min_kernels_progress(self) -> int:
    """Return progress toward the `min_kernels` target, capped at the target."""
    produced = dbutil.num_good_kernels(self.db_path)
    target = self.sampler_opts["min_kernels"]
    return min(produced, target)
def __repr__(self) -> str:
    """Short human-readable summary of the corpus."""
    num_files = dbutil.num_good_kernels(self.contentcache['kernels.db'])
    return "corpus of {} files".format(num_files)