def meta(self) -> dict:
    """
    Get trained model metadata.

    Format spec: https://github.com/ChrisCummins/clgen/issues/25

    Returns:
        dict: Metadata.
    """
    # checksum corpus and model cache files. Paths are relative to cache
    # root.
    cache_root_re = r'^' + cache.ROOT + '/'
    corpus_files = dict(
        (re.sub(cache_root_re, "", x), clgen.checksum_file(x))
        for x in fs.ls(self.corpus.cache.path, abspaths=True))
    model_files = dict(
        (re.sub(cache_root_re, "", x), clgen.checksum_file(x))
        for x in fs.ls(self.cache.path, abspaths=True))

    contents = corpus_files.copy()
    contents.update(model_files)

    _meta = deepcopy(self.opts)
    _meta["version"] = clgen.version()
    _meta["date_packaged"] = labtime.nowstr()
    _meta["corpus"] = self.corpus.meta
    _meta["contents"] = contents

    return _meta
def test_cli():
    fs.rm("kernels.db")
    cli.main("db init kernels.db".split())
    assert fs.exists("kernels.db")

    corpus_path = tests.archive("tiny", "corpus")
    cli.main("db explore kernels.db".split())
    cli.main(f"fetch fs kernels.db {corpus_path}".split())
    cli.main("preprocess kernels.db".split())
    cli.main("db explore kernels.db".split())

    fs.rm("kernels_out")
    cli.main("db dump kernels.db -d kernels_out".split())
    assert fs.isdir("kernels_out")
    assert len(fs.ls("kernels_out")) >= 1

    fs.rm("kernels.cl")
    cli.main("db dump kernels.db kernels.cl --file-sep --eof --reverse".split())
    assert fs.isfile("kernels.cl")

    fs.rm("kernels_out")
    cli.main("db dump kernels.db --input-samples -d kernels_out".split())
    assert fs.isdir("kernels_out")
    assert len(fs.ls("kernels_out")) == 250

    fs.rm("kernels.db")
    fs.rm("kernels_out")
def get_all_sampler_datasets():
    datasets = []
    sampledirs = []
    for versioncache in fs.ls(fs.path("~/.cache/clgen"), abspaths=True):
        samplerdir = fs.path(versioncache, "sampler")
        if fs.isdir(samplerdir):
            sampledirs += fs.ls(samplerdir, abspaths=True)

    for samplerdir in sampledirs:
        inpath = fs.path(samplerdir, "kernels.db")
        if fs.isfile(inpath):
            datasets.append(inpath)

    return datasets
def files_from_list(paths: list) -> list:
    """
    Return a list of all file paths from a list of files or directories.

    For each path in the input: if it is a file, return it; if it is a
    directory, return a list of files in the directory.

    Arguments:
        paths (list of str): List of file and directory paths.

    Returns:
        list of str: Absolute file paths.

    Raises:
        File404: If any of the paths do not exist.
    """
    ret = []
    for path in paths:
        if fs.isfile(path):
            ret.append(fs.abspath(path))
        elif fs.isdir(path):
            ret += [
                f for f in fs.ls(path, abspaths=True, recursive=True)
                if fs.isfile(f)
            ]
        else:
            raise File404(path)
    return ret
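# A usage sketch for files_from_list(), assuming the labm8 package is
# installed and the function is importable from its module; the paths below
# are illustrative only:
#
#     sources = files_from_list(["kernel.cl", "benchmarks/"])
#     # "kernel.cl" comes back as a single absolute path, followed by every
#     # file found recursively under "benchmarks/"; a path that is neither a
#     # file nor a directory raises File404.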
def _main(infiles: List[TextIO], dir_mode: bool, summarise: bool,
          fatal_errors: bool, use_shim: bool, quiet: bool,
          no_header: bool) -> None:
    from clgen import features

    input_paths = [infile.name for infile in infiles]

    def features_dir(csv_path):
        return fs.basename(fs.dirname(csv_path))

    if summarise:
        stats = [features.summarize(f) for f in input_paths]

        print('dataset', *list(stats[0].keys()), sep=',')
        for path, stat in zip(input_paths, stats):
            print(features_dir(path), *list(stat.values()), sep=',')
        return

    if dir_mode:
        trees = [fs.ls(d, abspaths=True, recursive=True) for d in input_paths]
        paths = [item for sublist in trees for item in sublist]
    else:
        paths = [fs.path(f) for f in input_paths]

    features.files(paths, fatal_errors=fatal_errors, quiet=quiet,
                   use_shim=use_shim, header=not no_header)
def _shorthash(hash: str, cachedir: str, min_len: int = 7) -> str:
    """
    Truncate the hash to a shorter length, while maintaining uniqueness.

    This returns the shortest hash required to uniquely identify all
    elements in the cache.

    Parameters
    ----------
    hash : str
        Hash to truncate.
    cachedir : str
        Path to cache.
    min_len : int, optional
        Minimum length of hash to try.

    Returns
    -------
    str
        Truncated hash.
    """
    for shorthash_len in range(min_len, len(hash)):
        entries = [x[:shorthash_len] for x in fs.ls(cachedir)]
        if len(entries) == len(set(entries)):
            break

    return hash[:shorthash_len]
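# A minimal, self-contained sketch of the prefix-shortening idea used by
# _shorthash() above, run against an in-memory list of names instead of a
# cache directory; the hashes below are made up for illustration:
def shortest_unique_prefix(name: str, names: list, min_len: int = 7) -> str:
    """Return the shortest prefix of `name` (at least min_len) unique in `names`."""
    for length in range(min_len, len(name)):
        prefixes = [n[:length] for n in names]
        if len(prefixes) == len(set(prefixes)):
            return name[:length]
    return name


entries = ["d41d8cd98f00b204", "d41d8cd400c0ffee", "ffee00112233aabb"]
print(shortest_unique_prefix(entries[0], entries))  # -> "d41d8cd9"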
def _create_kernels_db(self, path: str, encoding: str = "default") -> None:
    """creates and caches kernels.db"""
    log.debug("creating database")

    # create a database and put it in the cache
    tmppath = fs.path(self.contentcache.path, "kernels.db.tmp")
    dbutil.create_db(tmppath)
    self.contentcache["kernels.db"] = tmppath

    # get a list of files in the corpus
    filelist = [
        f for f in fs.ls(path, abspaths=True, recursive=True)
        if fs.isfile(f)
    ]

    # import files into database
    fetch.fetch_fs(self.contentcache["kernels.db"], filelist)

    # preprocess files
    preprocess.preprocess_db(self.contentcache["kernels.db"])

    # encode kernel db
    encode(self.contentcache["kernels.db"], encoding)

    # print database stats
    explore.explore(self.contentcache["kernels.db"])
def get_all_sampler_datasets(all_clgen_versions: bool = True) -> list:
    if all_clgen_versions:
        versiondirs = fs.ls(fs.path("~/.cache/clgen"), abspaths=True)
    else:
        versiondirs = [fs.path("~/.cache/clgen", clgen.version())]

    versiondirs = [v for v in versiondirs if fs.isdir(v, "sampler")]

    datasets = []
    for versiondir in versiondirs:
        for samplerdir in fs.ls(fs.path(versiondir, "sampler"), abspaths=True):
            inpath = fs.path(samplerdir, "kernels.db")
            if fs.isfile(inpath):
                datasets.append(inpath)

    return datasets
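# Both variants of get_all_sampler_datasets() above walk the same cache
# layout, inferred from the path handling in the code:
#
#     ~/.cache/clgen/<version>/sampler/<sampler dir>/kernels.db
#
# Usage sketch, assuming the function is importable:
#
#     for db_path in get_all_sampler_datasets():
#         print(db_path)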
def main(model, sampler, print_file_list=False, print_corpus_dir=False,
         print_model_dir=False, print_sampler_dir=False) -> None:
    """
    Main entry point for clgen.

    Arguments:
        model (str): Path to model.
        sampler (str): Path to sampler.
        print_file_list (bool, optional): If True, print cached files and exit.
        print_corpus_dir (bool, optional): If True, print cache path and exit.
        print_model_dir (bool, optional): If True, print cache path and exit.
        print_sampler_dir (bool, optional): If True, print cache path and exit.
    """
    import clgen.model
    import clgen.sampler
    from clgen import log

    model_json = load_json_file(model)
    model = clgen.model.from_json(model_json)

    sampler_json = load_json_file(sampler)
    sampler = clgen.sampler.from_json(sampler_json)

    # print cache paths
    if print_file_list:
        files = sorted(
            fs.ls(model.corpus.cache.path, abspaths=True, recursive=True) +
            fs.ls(model.cache.path, abspaths=True, recursive=True) +
            fs.ls(sampler.cache(model).path, abspaths=True, recursive=True))
        print('\n'.join(files))
        sys.exit(0)
    elif print_corpus_dir:
        print(model.corpus.cache.path)
        sys.exit(0)
    elif print_model_dir:
        print(model.cache.path)
        sys.exit(0)
    elif print_sampler_dir:
        print(sampler.cache(model).path)
        sys.exit(0)

    model.train()
    sampler.sample(model)
def __len__(self):
    """
    Get the number of entries in the cache.

    Returns:
        int: Number of entries in the cache.
    """
    return len(list(fs.ls(self.path)))
def __iter__(self):
    """
    Iterate over all cached files.

    Returns:
        iterable: Paths in cache.
    """
    for path in fs.ls(self.path, abspaths=True):
        yield path
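# Usage sketch for the two cache helpers above; `cache` stands for an
# instance of the surrounding class and is purely illustrative:
#
#     print(len(cache))    # number of cached files, via fs.ls(self.path)
#     for path in cache:   # absolute paths of cached files, via __iter__
#         print(path)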
def main():
    """
    Reduce all databases to oracle.
    """
    dbs = [migrate(_db.Database(path))
           for path in fs.ls(experiment.DB_DEST, abspaths=True)
           if not re.search("oracle.db$", path) and re.search(".db$", path)]
    merge(fs.abspath(experiment.DB_DEST, "oracle.db"), dbs,
          experiment.ORACLE_PATH)
def test_ls_abspaths(self):
    fs.cp("tests/data/testdir", "/tmp/testdir")
    self._test(["/tmp/testdir/a",
                "/tmp/testdir/b",
                "/tmp/testdir/c",
                "/tmp/testdir/d"],
               fs.ls("/tmp/testdir", abspaths=True))
    self._test(["/tmp/testdir/a",
                "/tmp/testdir/b",
                "/tmp/testdir/c",
                "/tmp/testdir/c/e",
                "/tmp/testdir/c/f",
                "/tmp/testdir/c/f/f",
                "/tmp/testdir/c/f/f/i",
                "/tmp/testdir/c/f/h",
                "/tmp/testdir/c/g",
                "/tmp/testdir/d"],
               fs.ls("/tmp/testdir", recursive=True, abspaths=True))
    fs.rm("/tmp/testdir")
def ls(self, **kwargs):
    """
    List files in cache.

    Arguments:
        **kwargs: Keyword options to pass to fs.ls().

    Returns:
        iterable: List of files.
    """
    return fs.ls(self.path, **kwargs)
def test_ls_abspaths():
    fs.cp("labm8/data/test/testdir", "/tmp/testdir")
    assert fs.ls("/tmp/testdir", abspaths=True) == [
        "/tmp/testdir/a",
        "/tmp/testdir/b",
        "/tmp/testdir/c",
        "/tmp/testdir/d",
    ]
    assert fs.ls("/tmp/testdir", recursive=True, abspaths=True) == [
        "/tmp/testdir/a",
        "/tmp/testdir/b",
        "/tmp/testdir/c",
        "/tmp/testdir/c/e",
        "/tmp/testdir/c/f",
        "/tmp/testdir/c/f/f",
        "/tmp/testdir/c/f/f/i",
        "/tmp/testdir/c/f/h",
        "/tmp/testdir/c/g",
        "/tmp/testdir/d",
    ]
    fs.rm("/tmp/testdir")
def test_ls_recursive():
    assert fs.ls("labm8/data/test/testdir", recursive=True) == [
        "a",
        "b",
        "c",
        "c/e",
        "c/f",
        "c/f/f",
        "c/f/f/i",
        "c/f/h",
        "c/g",
        "d",
    ]
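# A minimal, runnable sketch of the fs.ls() flags exercised by the tests
# above, assuming the labm8 package is installed; the fs.rm()/fs.mkdir()
# behaviour relied on here is taken from how the tests in this file use them:
from labm8 import fs


def demo_ls_flags() -> None:
    fs.rm("/tmp/labm8.ls_demo")  # start clean; fs.rm tolerates missing paths
    fs.mkdir("/tmp/labm8.ls_demo")
    fs.mkdir("/tmp/labm8.ls_demo/sub")
    with open("/tmp/labm8.ls_demo/a", "w") as outfile:
        outfile.write("hello\n")
    with open("/tmp/labm8.ls_demo/sub/b", "w") as outfile:
        outfile.write("world\n")

    print(fs.ls("/tmp/labm8.ls_demo"))                  # ['a', 'sub']
    print(fs.ls("/tmp/labm8.ls_demo", recursive=True))  # ['a', 'sub', 'sub/b']
    print(fs.ls("/tmp/labm8.ls_demo", abspaths=True))   # ['/tmp/labm8.ls_demo/a', '/tmp/labm8.ls_demo/sub']
    fs.rm("/tmp/labm8.ls_demo")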
def models() -> Iterator[Model]:
    """
    Iterate over all cached models.

    Returns
    -------
    Iterator[Model]
        An iterable over all cached models.
    """
    if fs.isdir(clgen.cachepath(), "model"):
        modeldirs = fs.ls(fs.path(clgen.cachepath(), "model"), abspaths=True)
        for modeldir in modeldirs:
            meta = jsonutil.read_file(fs.path(modeldir, "META"))
            model = Model.from_json(meta)
            yield model
def _ExportProtos() -> None:
    proto_dir = pathlib.Path(FLAGS.proto_dir)
    assert proto_dir
    credentials = _GetMySqlCredentials()
    cnx = MySQLdb.connect(database='dsmith_04_opencl', host='cc1',
                          user=credentials[0], password=credentials[1])
    cursor = cnx.cursor()

    (proto_dir / 'testcases').mkdir(parents=True, exist_ok=True)
    (proto_dir / 'results').mkdir(parents=True, exist_ok=True)

    for program_id in FLAGS.program_ids:
        logging.info("Exporting OpenCL program %s", program_id)
        _ExportOpenCLResults(cursor, program_id, proto_dir)

    cursor.close()
    cnx.close()

    logging.info('Exported %d testcases and %d results',
                 len(fs.ls(proto_dir / 'testcases')),
                 len(fs.ls(proto_dir / 'results')))
def _create_kernels_db(self, path: str) -> None:
    """creates and caches kernels.db"""
    log.debug("creating database")

    # create a database and put it in the cache
    tmppath = self.contentcache.keypath("kernels.db.tmp")
    dbutil.create_db(tmppath)
    self.contentcache["kernels.db"] = tmppath

    # get a list of files in the corpus
    filelist = [f for f in fs.ls(path, abspaths=True, recursive=True)
                if fs.isfile(f)]

    # import files into database
    clgen.fetch(self.contentcache["kernels.db"], filelist)
def _finalize(db_path, cache):
    """Tidy up after worker threads finish"""
    log.debug("worker finalize")

    db = dbutil.connect(db_path)
    c = db.cursor()

    # import results from worker threads
    for outpath in fs.ls(cache.path, abspaths=True):
        with open(outpath) as infile:
            for line in infile:
                c.execute(
                    'INSERT OR REPLACE INTO PreprocessedFiles '
                    'VALUES(?,?,?)', json.loads(line))

    # write changes to database and remove cache
    db.commit()
    db.close()
    cache.empty()
def gather():
    benchmarks = {
        "canny": {},
        "fdtd": {},
        "gol": {},
        "gaussian": {},
        "heat": {},
        "simple": {},
        "simplecomplex": {}
    }

    for benchmark in benchmarks:
        io.info("Benchmark %s" % benchmark)
        fs.cd("/home/chris/src/msc-thesis/scraps/05-12/kernels/%s" % benchmark)

        instcounts = []
        for file in fs.ls():
            instcounts.append(get_instcount(file))

        benchmarks[benchmark] = merge_counts(instcounts)

    return benchmarks
def _main() -> None:
    cache = clgen.cachepath()

    log.warning("Not Implemented: refresh corpuses")

    if fs.isdir(cache, "model"):
        cached_modeldirs = fs.ls(fs.path(cache, "model"), abspaths=True)
        for cached_modeldir in cached_modeldirs:
            cached_model_id = fs.basename(cached_modeldir)
            cached_meta = jsonutil.read_file(fs.path(cached_modeldir, "META"))

            model = clgen.Model.from_json(cached_meta)

            if cached_model_id != model.hash:
                log.info(cached_model_id, '->', model.hash)

                if fs.isdir(model.cache.path):
                    log.fatal("cache conflict", file=sys.stderr)

                fs.mv(cached_modeldir, model.cache.path)

    log.warning("Not Implemented: refresh samplers")
def test_ls_empty_dir():
    fs.mkdir("/tmp/labm8.empty")
    assert not fs.ls("/tmp/labm8.empty")
    fs.rm("/tmp/labm8.empty")
def _dump_db(db, out_path: str, gh: bool = False, fileid: bool = False,
             reverse: bool = False, input_samples: bool = False,
             status: int = 0, eof: bool = False, dir: bool = False) -> None:
    """
    Dump database contents.

    Parameters
    ----------
    db : sqlite3.Connection
        Dataset.
    out_path : str
        Path to output.
    gh : bool, optional
        Dataset is GitHub.
    fileid : bool, optional
        Include file IDs.
    reverse : bool, optional
        Reverse ordering of output.
    input_samples : bool, optional
        If True, use un-preprocessed files.
    status : int, optional
        Filter preprocess status.
    eof : bool, optional
        Include EOF separators.
    dir : bool, optional
        Write output to directory.
    """
    log.info('writing corpus', out_path, '...')

    order = 'ASC' if reverse else 'DESC'

    c = db.cursor()

    # Query components
    table = 'ContentFiles' if input_samples else 'PreprocessedFiles'
    select = 'SELECT {}.id,{}.contents'.format(table, table)
    if input_samples:
        qualifier = ''
    else:
        qualifier = 'WHERE {}.status={}'.format(table, status)
    if gh:
        table += (' LEFT JOIN ContentMeta ON {}.id=ContentMeta.id'
                  ' LEFT JOIN Repositories ON '
                  'ContentMeta.repo_url=Repositories.url'.format(table))
        orderby = 'Repositories.stars'
    else:
        orderby = 'LC_col(contents)'

    query = (
        '{select} FROM {table} {qualifier} ORDER BY {orderby} {order}'.format(
            select=select, table=table, qualifier=qualifier, orderby=orderby,
            order=order))

    c.execute(query)
    rows = c.fetchall()

    if dir:
        log.info('writing to directory ', out_path, '/', sep='')
        if os.path.exists(out_path):
            if len(fs.ls(out_path)):
                raise clgen.UserError('directory already exists!')
        else:
            os.makedirs(out_path)
        for row in rows:
            id, contents = row
            path = os.path.join(out_path, kid_to_path(id) + '.cl')
            with open(path, 'w') as out:
                out.write(contents)
    else:
        log.info('writing file', out_path)
        with open(out_path, 'wb') as out:
            for row in rows:
                id, contents = row
                if fileid:
                    # Print file ID
                    out.write('/* ID: {} */\n\n'.format(id).encode('utf-8'))
                out.write(contents.encode('utf-8'))
                if eof:
                    # Print EOF token
                    out.write('\n/* EOF */\n\n'.encode('utf-8'))
                else:
                    out.write('\n\n'.encode('utf-8'))
def test_ls_single_file(self):
    self._test(["a"], fs.ls("tests/data/testdir/a"))
def test_ls_bad_path(self):
    with self.assertRaises(OSError):
        fs.ls("/not/a/real/path/bro")
def test_ls_empty_dir(self):
    fs.mkdir("/tmp/labm8.empty")
    self._test([], fs.ls("/tmp/labm8.empty"))
    fs.rm("/tmp/labm8.empty")
def test_ls_recursive(self):
    self._test(["a", "b", "c", "c/e", "c/f", "c/f/f", "c/f/f/i", "c/f/h",
                "c/g", "d"],
               fs.ls("tests/data/testdir", recursive=True))
def test_ls(self):
    self._test(["a", "b", "c", "d"], fs.ls("tests/data/testdir"))
def test_ls_single_file():
    assert ["a"] == fs.ls("labm8/data/test/testdir/a")
def test_ls_bad_path():
    with pytest.raises(OSError):
        fs.ls("/not/a/real/path/bro")
def host_has_opencl():
    try:
        return system.is_mac() or len(fs.ls('/etc/OpenCL/vendors'))
    except FileNotFoundError:
        return False
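# host_has_opencl() above treats a macOS host, or a non-empty
# /etc/OpenCL/vendors directory (where the Linux OpenCL ICD loader looks for
# vendor .icd files), as evidence of an available OpenCL runtime. Usage
# sketch, assuming the function is importable:
#
#     if not host_has_opencl():
#         print("no OpenCL runtime detected on this host")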
def create_corpus(db, out_path, gh=False, fileid=False, reverse=False,
                  input_samples=False, status=0, eof=False, dir=False):
    # Dump all the preprocessed OpenCL files
    print('creating DNN corpus', out_path, '...')

    order = 'ASC' if reverse else 'DESC'

    c = db.cursor()

    # Query components
    table = 'ContentFiles' if input_samples else 'PreprocessedFiles'
    select = 'SELECT {}.id,{}.contents'.format(table, table)
    if input_samples:
        qualifier = ''
    else:
        qualifier = 'WHERE {}.status={}'.format(table, status)
    if gh:
        table += (' LEFT JOIN ContentMeta ON {}.id=ContentMeta.id'
                  ' LEFT JOIN Repositories ON '
                  'ContentMeta.repo_url=Repositories.url'
                  .format(table))
        orderby = 'Repositories.stars'
    else:
        orderby = 'LC(contents)'

    query = ('{select} FROM {table} {qualifier} ORDER BY {orderby} {order}'
             .format(select=select, table=table, qualifier=qualifier,
                     orderby=orderby, order=order))

    c.execute(query)
    rows = c.fetchall()

    if dir:
        print('writing to directory ', out_path, '/', sep='')
        if os.path.exists(out_path):
            if len(fs.ls(out_path)):
                print('fatal: directory already exists!', file=sys.stderr)
                return 1
        else:
            os.makedirs(out_path)
        for row in rows:
            id, contents = row
            path = os.path.join(out_path, sanitize_id(id) + '.cl')
            with open(path, 'w') as out:
                out.write(contents)
        return 0
    else:
        print('writing file', out_path)
        with open(out_path, 'w') as out:
            for row in rows:
                id, contents = row
                if fileid:
                    # Print file ID
                    out.write('/* ID: {} */\n\n'.format(id))
                out.write(contents)
                if eof:
                    # Print EOF token
                    out.write('\n/* EOF */\n\n')
                else:
                    out.write('\n\n')
        return 0
def import_from_dir(self, indir: Path) -> None:
    """ import program sources from a directory """
    with Session() as s:
        start_num_progs = self.num_programs(s)

        def _save(proxies):
            # Create records from proxies:
            programs = [proxy.to_record(s) for proxy in proxies]

            logging.warning(getattr(type(programs[0]), "sha1"))
            import sys
            sys.exit(0)

            # Filter duplicates in the set of new records:
            programs = dict(
                (program.sha1, program) for program in programs).values()

            # Fetch a list of dupe keys already in the database:
            sha1s = [program.sha1 for program in programs]
            dupes = set(x[0] for x in s.query(Program.sha1).filter(
                Program.sha1.in_(sha1s)))

            # Filter the list of records to import, excluding dupes:
            uniq = [
                program for program in programs
                if program.sha1 not in dupes
            ]

            # Import those suckas:
            s.add_all(uniq)
            s.commit()

            nprog, nuniq = len(programs), len(uniq)
            logging.info(f"imported {nuniq} of {nprog} unique programs")

        num_progs = self.num_programs(s)

        # Print a preamble message:
        paths = fs.ls(indir, abspaths=True)
        num_to_import = humanize.intcomma(len(paths))
        print(f"{Colors.BOLD}{num_to_import}{Colors.END} files are "
              "to be imported.")

        bar = progressbar.ProgressBar(redirect_stdout=True)

        # The actual import loop:
        buf = []
        for i, path in enumerate(bar(paths)):
            buf.append(self.import_from_file(s, path))
            if len(buf) >= dsmith.DB_BUF_SIZE:
                save_proxies_uniq_on(s, buf, "sha1")
                buf = []
        save_proxies_uniq_on(s, buf, "sha1")

        num_imported = humanize.intcomma(
            self.num_programs(s) - start_num_progs)
        num_progs = humanize.intcomma(self.num_programs(s))
        print(f"All done! Imported {Colors.BOLD}{num_imported}{Colors.END} "
              f"new {self} programs. You now have "
              f"{Colors.BOLD}{num_progs}{Colors.END} {self} programs in the "
              "database")
def fetch_repos(db_path: Path, indir: Path, lang: clgen.Language) -> None:
    db = dbutil.connect(db_path)

    if not dbutil.is_github(db):
        raise clgen.UserError("not a GitHub database")

    c = db.cursor()

    for directory in fs.ls(indir, abspaths=True):
        # hacky hardcoded interpretation of `git remote -v`
        gitdir = fs.path(directory, ".git")
        output = subprocess.check_output(
            ["git", "--git-dir", gitdir, "remote", "-v"],
            universal_newlines=True)
        url = output.split("\n")[0].split("\t")[1].split(" ")[0]
        name = fs.basename(directory)

        output = subprocess.check_output(
            f"git --git-dir {gitdir} rev-list --format=format:'%ai' " +
            f"--max-count=1 $(git --git-dir {gitdir} rev-parse HEAD) | tail -n1",
            shell=True, universal_newlines=True)
        try:
            updated_at = dateutil.parser.parse(output)
        except ValueError:
            log.error(f"failed to process {name} {url}")
            continue

        c.execute("SELECT updated_at FROM Repositories WHERE url=?", (url, ))
        cached_updated_at = c.fetchone()

        # Do nothing unless updated timestamps don't match
        # if cached_updated_at and cached_updated_at[0] >= updated_at:
        #     log.verbose(name, "already in database")
        #     continue

        c.execute("DELETE FROM Repositories WHERE url=?", (url, ))
        c.execute("INSERT INTO Repositories VALUES(?,?,?,?,?,?,?,?,?)",
                  (url, "<unknown>", name, 0, 0, 0, 0, updated_at, updated_at))

        name_str = " -o ".join(
            [f"-name '*{ext}'" for ext in clgen.file_extensions(lang)])
        output = subprocess.check_output(
            f"find {directory} -type f {name_str} | grep -v '.git/' || true",
            shell=True, universal_newlines=True)
        files = [x.strip() for x in output.split("\n") if x.strip()]

        # nothing to import
        if not len(files):
            # log.verbose("no files in", name)
            continue

        log.verbose("processing", len(files), "files in", name)
        for path in files:
            relpath = path[len(directory) + 1:]
            try:
                contents = inline_fs_headers(path, [], lang=lang)
                sha = crypto.sha1_str(contents)
                c.execute('INSERT OR IGNORE INTO ContentFiles VALUES(?,?)',
                          (sha, contents))
                c.execute(
                    "INSERT OR IGNORE INTO ContentMeta VALUES(?,?,?,?,?)",
                    (sha, relpath, url, sha, len(contents)))
            except UnicodeDecodeError:
                log.warning("non UTF-8 file", path)

        db.commit()
        c = db.cursor()
def create_corpus(db, out_path: str, gh: bool = False, fileid: bool = False,
                  reverse: bool = False, input_samples: bool = False,
                  status: int = 0, eof: bool = False, dir: bool = False) -> int:
    """
    Create CLgen training corpus.

    Arguments:
        db (sqlite3.Connection): Dataset.
        out_path (str): Path to output.
        gh (bool, optional): Dataset is GitHub.
        fileid (bool, optional): Include file IDs.
        reverse (bool, optional): Reverse ordering of output.
        input_samples (bool, optional): If True, use un-preprocessed files.
        status (int, optional): Filter preprocess status.
        eof (bool, optional): Include EOF separators.
        dir (bool, optional): Write output to directory.
    """
    # Dump all the preprocessed OpenCL files
    print('creating DNN corpus', out_path, '...')

    order = 'ASC' if reverse else 'DESC'

    c = db.cursor()

    # Query components
    table = 'ContentFiles' if input_samples else 'PreprocessedFiles'
    select = 'SELECT {}.id,{}.contents'.format(table, table)
    if input_samples:
        qualifier = ''
    else:
        qualifier = 'WHERE {}.status={}'.format(table, status)
    if gh:
        table += (' LEFT JOIN ContentMeta ON {}.id=ContentMeta.id'
                  ' LEFT JOIN Repositories ON '
                  'ContentMeta.repo_url=Repositories.url'
                  .format(table))
        orderby = 'Repositories.stars'
    else:
        orderby = 'LC(contents)'

    query = ('{select} FROM {table} {qualifier} ORDER BY {orderby} {order}'
             .format(select=select, table=table, qualifier=qualifier,
                     orderby=orderby, order=order))

    c.execute(query)
    rows = c.fetchall()

    if dir:
        print('writing to directory ', out_path, '/', sep='')
        if os.path.exists(out_path):
            if len(fs.ls(out_path)):
                print('fatal: directory already exists!', file=sys.stderr)
                return 1
        else:
            os.makedirs(out_path)
        for row in rows:
            id, contents = row
            path = os.path.join(out_path, sanitize_id(id) + '.cl')
            with open(path, 'w') as out:
                out.write(contents)
        return 0
    else:
        print('writing file', out_path)
        with open(out_path, 'wb') as out:
            for row in rows:
                id, contents = row
                if fileid:
                    # Print file ID
                    out.write('/* ID: {} */\n\n'.format(id).encode('utf-8'))
                out.write(contents.encode('utf-8'))
                if eof:
                    # Print EOF token
                    out.write('\n/* EOF */\n\n'.encode('utf-8'))
                else:
                    out.write('\n\n'.encode('utf-8'))
        return 0
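# For the default flags used by the corpus writers above (input_samples=False,
# gh=False, status=0, reverse=False), the assembled query is:
#
#     SELECT PreprocessedFiles.id,PreprocessedFiles.contents
#     FROM PreprocessedFiles
#     WHERE PreprocessedFiles.status=0
#     ORDER BY LC(contents) DESC
#
# (LC and LC_col are not built-in SQLite functions, so they are presumably
# registered as user-defined functions elsewhere in the code base.)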
def test_ls():
    assert ["a", "b", "c", "d"] == fs.ls("labm8/data/test/testdir")