def test_sample():
    m = _get_test_model()
    m.train()

    argspec = [
        '__global float*',
        '__global float*',
        '__global float*',
        'const int'
    ]
    s = clgen.Sampler.from_json({
        "kernels": {
            "language": "opencl",
            "args": argspec,
            "max_length": 300,
        },
        "sampler": {
            "min_samples": 1
        }
    })
    s.cache(m).clear()  # clear old samples

    # sample a single kernel:
    s.sample(m)
    num_contentfiles = dbutil.num_rows_in(
        s.cache(m)["kernels.db"], "ContentFiles")
    assert num_contentfiles >= 1

    s.sample(m)
    num_contentfiles2 = dbutil.num_rows_in(
        s.cache(m)["kernels.db"], "ContentFiles")
    diff = num_contentfiles2 - num_contentfiles
    # if the new sample is identical to the previous one it is ignored, so
    # the db may still contain only a single sample:
    assert diff >= 0

def evaluate(model, sampler):
    """ evaluate sampling efficiency """
    print("starting sampling")
    sampler.sample(model)

    print("preprocessing sample")
    sample_db = sampler.cache(model)["kernels.db"]
    preprocess.preprocess_db(sample_db)

    num_kernels = dbutil.num_rows_in(sample_db, "ContentFiles")
    num_good_kernels = dbutil.num_good_kernels(sample_db)
    num_ugly_kernels = dbutil.num_rows_in(sample_db, "PreprocessedFiles",
                                          "WHERE status=2")
    discard_rate = 1 - (num_good_kernels / num_kernels)
    ugly_rate = num_ugly_kernels / num_kernels

    total_charcount = dbutil.cc(sample_db, "ContentFiles")
    good_charcount = dbutil.cc(sample_db, "PreprocessedFiles",
                               condition="WHERE status=0")

    return {
        "argspec": sampler.kernel_opts["args"],
        "host": system.HOSTNAME,
        "date": time.nowstr(),
        "num_kernels": num_kernels,
        "num_good_kernels": num_good_kernels,
        "discard_rate": discard_rate,
        "ugly_rate": ugly_rate,
        "total_charcount": total_charcount,
        "good_charcount": good_charcount,
        "corpus_dir": model.corpus.cache.path,
        "model_dir": model.cache.path,
        "sampler_dir": sampler.cache(model).path,
    }

def evaluate(model, sampler):
    """ evaluate sampling efficiency """
    model.cache.empty()  # clear checkpoint cache
    print("starting training")
    tstart = time()  # start timer
    model.train()  # train model
    training_time = time() - tstart

    # clear the sample cache
    sampler.cache(model).empty()

    # sample kernels and time
    print("starting sampling")
    tstart = time()
    sampler.sample(model)
    tend = time()
    elapsed = tend - tstart

    # preprocess sample
    sample_db = sampler.cache(model)["kernels.db"]
    preprocess.preprocess_db(sample_db)

    num_kernels = dbutil.num_rows_in(sample_db, "ContentFiles")
    num_good_kernels = dbutil.num_good_kernels(sample_db)
    num_ugly_kernels = dbutil.num_rows_in(sample_db, "PreprocessedFiles",
                                          "WHERE status=2")
    discard_rate = 1 - (num_good_kernels / num_kernels)
    ugly_rate = num_ugly_kernels / num_kernels

    total_charcount = dbutil.cc(sample_db, "ContentFiles")
    good_charcount = dbutil.cc(sample_db, "PreprocessedFiles",
                               condition="WHERE status=0")

    efficiency = good_charcount / total_charcount
    throughput = good_charcount / elapsed

    return {
        "training_time": training_time,
        "sampling_time": elapsed,
        "num_kernels": num_kernels,
        "num_good_kernels": num_good_kernels,
        "discard_rate": discard_rate,
        "ugly_rate": ugly_rate,
        "total_charcount": total_charcount,
        "good_charcount": good_charcount,
        "efficiency": efficiency,  # good_chars / total_chars
        "throughput": throughput,  # good_chars / second
        "corpus_dir": model.corpus.cache.path,
        "model_dir": model.cache.path,
        "sampler_dir": sampler.cache(model).path,
    }

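# The two evaluate() variants above return plain dictionaries, so a driver
# only needs to collect and serialize them. A minimal, hypothetical harness
# follows; the output file name and the `models` / `samplers` iterables are
# illustrative, not part of the original code:
import json

def run_evaluations(models, samplers):
    results = [evaluate(m, s) for m in models for s in samplers]
    with open("sample-efficiency.json", "w") as outfile:
        json.dump(results, outfile, indent=2)
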
def run(self) -> None:
    i = dbutil.num_rows_in(self.db_path, "ContentFiles")

    if not log.is_verbose():
        bar = progressbar.ProgressBar(max_value=self.max_i)
        bar.update(self.progress())

    try:
        while True:
            sample_time = time()
            sample = self.queue.get(timeout=60)

            kernels = clutil.get_cl_kernels(sample)
            ids = [crypto.sha1_str(k) for k in kernels]

            if self.sampler_opts["static_checker"]:
                preprocess_opts = {
                    "use_shim": False,
                    "use_gpuverify": self.sampler_opts["gpuverify"]
                }
                pp = [clgen.preprocess_for_db(k, **preprocess_opts)
                      for k in kernels]

            db = dbutil.connect(self.db_path)
            c = db.cursor()

            # insert raw samples
            for kid, src in zip(ids, kernels):
                dbutil.sql_insert_dict(c, "ContentFiles",
                                       {"id": kid, "contents": src},
                                       ignore_existing=True)

            # insert preprocessed samples
            if self.sampler_opts["static_checker"]:
                for kid, (status, src) in zip(ids, pp):
                    dbutil.sql_insert_dict(c, "PreprocessedFiles", {
                        "id": kid,
                        "status": status,
                        "contents": src
                    }, ignore_existing=True)

            c.close()
            db.commit()
            db.close()

            # update progress bar
            progress = self.progress()
            if not log.is_verbose():
                bar.update(progress)

            sample_time = time() - sample_time
            self.sampler.stats["progress"] = progress
            self.sampler.stats["time"] += sample_time
            self.sampler._flush_meta(self.cache)

            # determine if we are done sampling
            if self.term_condition():
                self.producer.stop()
                return
    finally:
        # always kill the sampler thread
        print()
        self.producer.stop()

def test_num_rows_in(self):
    self.assertEqual(
        10,
        dbutil.num_rows_in(tests.db_path('10-kernels'), "ContentFiles"))
    self.assertEqual(
        0,
        dbutil.num_rows_in(tests.db_path('10-kernels'), "PreprocessedFiles"))
    self.assertEqual(
        8,
        dbutil.num_rows_in(tests.db_path('10-kernels-preprocessed'),
                           "PreprocessedFiles", "WHERE status=0"))
    self.assertEqual(
        2,
        dbutil.num_rows_in(tests.db_path('10-kernels-preprocessed'),
                           "PreprocessedFiles", "WHERE status!=0"))

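# The tests above pin down the behavior of dbutil.num_rows_in, whose
# implementation is not shown here. A minimal sketch of the presumed
# behavior, assuming a sqlite3 database and an optional trailing SQL
# condition string (the real implementation may differ):
import sqlite3

def num_rows_in_sketch(db_path: str, table: str, condition: str = "") -> int:
    """Return the number of rows in `table`, optionally filtered by
    `condition` (e.g. "WHERE status=0")."""
    db = sqlite3.connect(db_path)
    try:
        c = db.cursor()
        # table and condition are trusted internal strings here; they
        # cannot be passed as bound parameters.
        c.execute("SELECT Count(*) FROM {table} {condition}".format(
            table=table, condition=condition))
        return c.fetchone()[0]
    finally:
        db.close()
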
def test_sample(self):
    m = get_test_model()
    m.train()

    argspec = [
        '__global float*',
        '__global float*',
        '__global float*',
        'const int'
    ]
    s = sampler.from_json({
        "kernels": {
            "args": argspec,
            "max_length": 300,
        },
        "sampler": {
            "batch_size": 1,
            "max_batches": 1
        }
    })
    s.cache(m).empty()  # clear old samples

    # sample a single kernel:
    s.sample(m)
    num_contentfiles = dbutil.num_rows_in(
        s.cache(m)["kernels.db"], "ContentFiles")
    num_preprocessed = dbutil.num_rows_in(
        s.cache(m)["kernels.db"], "PreprocessedFiles")
    self.assertEqual(num_contentfiles, 1)
    self.assertEqual(num_preprocessed, 1)

    s.sample(m)
    num_contentfiles = dbutil.num_rows_in(
        s.cache(m)["kernels.db"], "ContentFiles")
    num_preprocessed = dbutil.num_rows_in(
        s.cache(m)["kernels.db"], "PreprocessedFiles")
    # if the sample is the same as the previous one, there will still only
    # be a single sample in the db:
    self.assertTrue(num_contentfiles >= 1)
    self.assertTrue(num_preprocessed >= 1)

def run(self) -> None:
    i = dbutil.num_rows_in(self.db_path, "ContentFiles")

    if not log.is_verbose():
        bar = progressbar.ProgressBar(max_value=self.max_i)
        bar.update(self.progress())

    try:
        while True:
            sample_time = time()

            # Block while waiting for a new sample to come in:
            sample = self.queue.get(timeout=120).strip()

            # Compute the sample ID:
            kid = crypto.sha1_str(sample)

            # Add the new sample to the database:
            db = dbutil.connect(self.db_path)
            c = db.cursor()
            dbutil.sql_insert_dict(c, "ContentFiles",
                                   {"id": kid, "contents": sample},
                                   ignore_existing=True)
            c.close()
            db.commit()
            db.close()

            # update progress bar
            progress = self.progress()
            if not log.is_verbose():
                bar.update(progress)

            sample_time = time() - sample_time
            self.sampler.stats["progress"] = progress
            self.sampler.stats["time"] += sample_time
            self.sampler._flush_meta(self.cache)

            # determine if we are done sampling
            if self.term_condition():
                self.producer.stop()
                return
    finally:
        # always kill the sampler thread
        print()
        self.producer.stop()

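# Both run() loops above consume from self.queue and call
# self.producer.stop() on termination. The producer side is not shown; a
# hypothetical sketch of that pairing, assuming a threading.Thread that
# pushes model output onto the shared queue until stopped (the class name
# and generate_sample() are assumptions, not the project's actual API):
import queue
import threading

class SampleProducerSketch(threading.Thread):
    def __init__(self, model, out_queue: queue.Queue):
        super().__init__(daemon=True)
        self.model = model
        self.queue = out_queue
        self._stop_requested = threading.Event()

    def stop(self) -> None:
        self._stop_requested.set()

    def run(self) -> None:
        while not self._stop_requested.is_set():
            # generate_sample() is an assumed stand-in for however the
            # model emits a batch of sampled text:
            self.queue.put(self.model.generate_sample())
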
def test_insert():
    db_path = tests.data_path("db", "tmp.db", exists=False)
    fs.rm(db_path)

    dbutil.create_db(db_path)
    db = dbutil.connect(db_path)
    c = db.cursor()

    assert dbutil.num_rows_in(db_path, "ContentFiles") == 0

    dbutil.sql_insert_dict(c, "ContentFiles", {"id": "a", "contents": "foo"})
    dbutil.sql_insert_dict(c, "PreprocessedFiles", {
        "id": "a",
        "status": 0,
        "contents": "bar"
    })
    dbutil.sql_insert_dict(c, "PreprocessedFiles", {
        "id": "b",
        "status": 1,
        "contents": "car"
    })
    db.commit()
    c = db.cursor()

    assert dbutil.num_rows_in(db_path, "ContentFiles") == 1
    assert dbutil.num_rows_in(db_path, "PreprocessedFiles") == 2

    assert dbutil.cc(db_path, "ContentFiles", "contents") == 3
    assert dbutil.cc(db_path, "ContentFiles", "id") == 1
    assert dbutil.lc(db_path, "ContentFiles", "contents") == 1

    dbutil.remove_bad_preprocessed(db_path)
    assert dbutil.num_rows_in(db_path, "ContentFiles") == 1
    # remove_bad_preprocessed doesn't actually delete any rows, just
    # replaces contents
    assert dbutil.num_rows_in(db_path, "PreprocessedFiles") == 2

    dbutil.remove_preprocessed(db_path)
    assert dbutil.num_rows_in(db_path, "ContentFiles") == 1
    assert dbutil.num_rows_in(db_path, "PreprocessedFiles") == 0

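# test_insert exercises dbutil.sql_insert_dict with and without
# `ignore_existing`. A hedged sketch of how such a helper could be written,
# assuming a sqlite3 cursor and that `ignore_existing` maps to
# INSERT OR IGNORE (the real implementation may differ):
def sql_insert_dict_sketch(c, table: str, data: dict,
                           ignore_existing: bool = False) -> None:
    """Insert `data` into `table`, one column per dictionary key."""
    or_ignore = "OR IGNORE " if ignore_existing else ""
    keys = sorted(data.keys())
    sql = "INSERT {or_ignore}INTO {table}({cols}) VALUES({vals})".format(
        or_ignore=or_ignore, table=table,
        cols=",".join(keys),
        vals=",".join("?" for _ in keys))
    # named columns make the insert independent of table column order:
    c.execute(sql, tuple(data[k] for k in keys))
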
def num_samples(self) -> int:
    return dbutil.num_rows_in(self.db_path, "ContentFiles")

def null_progress(self) -> int:
    return dbutil.num_rows_in(self.db_path, "ContentFiles")

def min_samples_progress(self) -> int:
    return min(dbutil.num_rows_in(self.db_path, "ContentFiles"),
               self.sampler_opts["min_samples"])

def min_samples_cond(self) -> bool:
    return (dbutil.num_rows_in(self.db_path, "ContentFiles") >=
            self.sampler_opts["min_samples"])

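# The four hooks above pair a progress counter with a termination test.
# How the sampler selects between them is not shown; a hypothetical
# dispatch, assuming a `min_samples` entry in sampler_opts switches
# between bounded and open-ended sampling (attribute names here are
# assumptions):
def _select_hooks(self):
    if self.sampler_opts.get("min_samples", -1) > 0:
        return self.min_samples_progress, self.min_samples_cond
    # open-ended sampling: report the raw row count and never terminate
    return self.null_progress, lambda: False
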
def preprocess_contentfiles(db_path: str, max_num_workers: int = cpu_count(),
                            attempt: int = 1) -> None:
    """
    Preprocess OpenCL dataset.

    Arguments:
        db_path (str): OpenCL kernels dataset.
        max_num_workers (int, optional): Number of processes to spawn.
        attempt (int, optional): Internal retry counter.
    """
    def _finalize(db_path, cache):
        """Tidy up after worker threads finish"""
        log.debug("worker finalize")

        db = dbutil.connect(db_path)
        c = db.cursor()

        # import results from worker threads
        for outpath in fs.ls(cache.path, abspaths=True):
            with open(outpath) as infile:
                for line in infile:
                    c.execute('INSERT OR REPLACE INTO PreprocessedFiles '
                              'VALUES(?,?,?)', json.loads(line))

        # write changes to database and remove cache
        db.commit()
        db.close()
        cache.empty()

    if attempt >= MAX_OS_RETRIES:
        raise clgen.InternalError("failed to preprocess files")

    num_contentfiles = dbutil.num_rows_in(db_path, 'ContentFiles')
    num_preprocessedfiles = dbutil.num_rows_in(db_path, 'PreprocessedFiles')
    log.info("{n} ({r:.1%}) files need preprocessing".format(
        n=num_contentfiles - num_preprocessedfiles,
        r=(num_contentfiles - num_preprocessedfiles) / num_contentfiles))

    # split into multiple jobs of a maximum size
    jobsize = min(512, num_contentfiles)
    numjobs = math.ceil(num_contentfiles / jobsize)
    for j, offset in enumerate(range(0, num_contentfiles, jobsize)):
        num_preprocessedfiles = dbutil.num_rows_in(db_path,
                                                   'PreprocessedFiles')
        num_workers = min(num_contentfiles, max_num_workers)
        files_per_worker = math.ceil(jobsize / num_workers)

        # temporary cache used for worker thread results
        cache = Cache("{pid}.preprocess".format(pid=os.getpid()))
        # each worker thread receives a range of database indices to
        # preprocess, and a JSON file to write results into
        jobs = [{
            "db_in": db_path,
            "db_index_range": (offset + i * files_per_worker,
                               offset + i * files_per_worker +
                               files_per_worker),
            "json_out": fs.path(cache.path, "{i}.json".format(i=i))
        } for i in range(num_workers)]

        # spool up worker threads then finalize
        log.info('job {j} of {numjobs}: spawning {num_workers} worker '
                 'threads to process {jobsize} files ...'.format(**vars()))
        try:
            with clgen.terminating(Pool(num_workers)) as pool:
                pool.map(_preprocess_db_worker, jobs)
        except OSError as e:
            _finalize(db_path, cache)
            log.error(e)

            # Try again with fewer threads.
            # See: https://github.com/ChrisCummins/clgen/issues/64
            max_num_workers = max(int(max_num_workers / 2), 1)
            preprocess_contentfiles(db_path,
                                    max_num_workers=max_num_workers,
                                    attempt=attempt + 1)
        except Exception as e:
            _finalize(db_path, cache)
            raise e

        _finalize(db_path, cache)

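# preprocess_contentfiles dispatches jobs to _preprocess_db_worker, whose
# body is not shown here. A minimal sketch under the assumption that each
# worker reads its slice of ContentFiles and appends one JSON-encoded
# [id, status, contents] triple per line to its output file, matching the
# VALUES(?,?,?) insert performed by _finalize(). preprocess_for_db() is an
# assumed stand-in for the project's actual preprocessing entry point:
import json
import sqlite3

def _preprocess_db_worker_sketch(job: dict) -> None:
    start, end = job["db_index_range"]

    # fetch this worker's slice of kernels:
    db = sqlite3.connect(job["db_in"])
    c = db.cursor()
    c.execute("SELECT id,contents FROM ContentFiles LIMIT ? OFFSET ?",
              (end - start, start))
    rows = c.fetchall()
    db.close()

    with open(job["json_out"], "w") as outfile:
        for kid, contents in rows:
            # assumed helper returning a (status, contents) pair:
            status, result = preprocess_for_db(contents)
            print(json.dumps([kid, status, result]), file=outfile)
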