def clangformat(src: str, id: str = 'anon', timeout: int = 60) -> str:
    """
    Enforce code style on source file.

    Parameters
    ----------
    src : str
        Source code.
    id : str, optional
        Name of source file.
    timeout : int, optional
        Number of seconds before killing clang-format.

    Returns
    -------
    str
        Styled source.

    Raises
    ------
    ClangFormatException
        If formatting errors.
    """
    cmd = ["timeout", "-s9", str(timeout), native.CLANG_FORMAT,
           '-style={}'.format(json.dumps(clangformat_config))]
    process = Popen(cmd, stdin=PIPE, stdout=PIPE, stderr=PIPE)
    stdout, stderr = process.communicate(src.encode('utf-8'))

    if stderr:
        log.error(stderr.decode('utf-8'))
    if process.returncode != 0:
        raise ClangFormatException(stderr.decode('utf-8'))

    return stdout.decode('utf-8')
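# A minimal usage sketch for clangformat() (illustrative only: the kernel
# string below is made up, and ClangFormatException is assumed to be defined
# elsewhere in this module):
#
#     src = "__kernel void A(__global float* a){a[0]=1;}"
#     try:
#         styled = clangformat(src, id="example.cl")
#     except ClangFormatException:
#         styled = src  # fall back to the unformatted source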
def preprocess_inplace(paths: List[str], max_num_workers: int = cpu_count(),
                       attempt: int = 1) -> None:
    """
    Preprocess a list of files in place.

    Arguments:
        paths (List[str]): List of paths.
        max_num_workers (int, optional): Number of processes to spawn.
    """
    if attempt >= MAX_OS_RETRIES:
        raise clgen.InternalError("Failed to process files")

    num_workers = min(len(paths), max_num_workers)

    try:
        log.info('spawned', num_workers, 'worker threads to process',
                 len(paths), 'files ...')
        with clgen.terminating(Pool(num_workers)) as pool:
            pool.map(_preprocess_inplace_worker, paths)
    except OSError as e:
        log.error(e)

        # Try again with fewer threads.
        # See: https://github.com/ChrisCummins/clgen/issues/64
        max_num_workers = max(int(max_num_workers / 2), 1)
        preprocess_inplace(paths, max_num_workers=max_num_workers,
                           attempt=attempt + 1)
def clangformat_ocl(src: str, id: str = 'anon') -> str:
    """
    Enforce code style on OpenCL file.

    Arguments:
        src (str): OpenCL source.
        id (str, optional): Name of OpenCL source.

    Returns:
        str: Styled source.

    Raises:
        ClangFormatException: If formatting errors.
    """
    cmd = [native.CLANG_FORMAT,
           '-style={}'.format(json.dumps(clangformat_config))]
    process = Popen(cmd, stdin=PIPE, stdout=PIPE, stderr=PIPE)
    stdout, stderr = process.communicate(src.encode('utf-8'))

    if stderr:
        log.error(stderr.decode('utf-8'))
    if process.returncode != 0:
        raise ClangFormatException(stderr.decode('utf-8'))

    return stdout.decode('utf-8')
def _init_error(err: Exception, files_to_rm: Optional[List[str]] = None) -> None:
    """ tidy up in case of error """
    log.error("corpus creation failed. Deleting corpus files")
    for path in files_to_rm or []:
        if fs.exists(path):
            log.info("removing", path)
            fs.rm(path)
    raise err
def _process_file(path: str, **kwargs):
    buf = StringIO()
    features(path=path, file=buf, **kwargs)
    ret = buf.getvalue()
    try:
        # last line is empty:
        lines = ret.split('\n')[:-1]
        # first two cols are ignored (path and kernel names):
        parse = lambda l: np.array([float(x) for x in l.split(',')[2:]])
        return [parse(line) for line in lines]
    except IndexError:
        log.error("lines:", lines)
        raise FeatureExtractionError
def _init_error(err: Exception) -> None:
    """ tidy up in case of error """
    # note: closes over `self`, so this is a nested helper within a method
    log.error("corpus creation failed. Deleting corpus files")
    paths = [
        fs.path(self.contentcache.path, "kernels.db"),
        fs.path(self.cache.path, "corpus.txt"),
        fs.path(self.cache.path, "tensor.npy"),
        fs.path(self.cache.path, "atomizer.pkl")
    ]
    for path in paths:
        if fs.exists(path):
            log.info("removing", path)
            fs.rm(path)
    raise err
def get_features(code: str) -> np.array:
    """
    Get features for code.

    Arguments:
        code (str): Source code.

    Returns:
        np.array: Feature values.
    """
    with NamedTemporaryFile() as outfile:
        outfile.write(code.encode("utf-8"))
        outfile.seek(0)
        f = features.to_np_arrays([outfile.name])
    if len(f) != 1:
        log.error("features:", f)
        raise FeaturesError("code contains more than one kernel")
    return f[0]
def preprocess_inplace(paths: List[str], max_num_workers: int = cpu_count(),
                       max_attempts: int = 100, attempt: int = 1) -> None:
    """
    Preprocess a list of files in place.

    Parameters
    ----------
    paths : List[str]
        List of paths.
    max_num_workers : int, optional
        Number of processes to spawn.
    max_attempts : int, optional
        In case of an OSError or TimeoutError, this number of attempts
        will be made.
    """
    if attempt > max_attempts:
        raise clgen.InternalError(
            f"Failed to process files after {max_attempts} attempts")
    elif attempt > 1:
        log.warning(f"preprocess attempt #{attempt}.")

    num_workers = min(len(paths), max_num_workers)

    try:
        log.info('spawned', num_workers, 'worker threads to process',
                 len(paths), 'files ...')
        with clgen.terminating(Pool(num_workers)) as pool:
            pool.map(_preprocess_inplace_worker, paths)
    except (OSError, TimeoutError) as e:
        log.error(e)

        # Try again with fewer threads.
        # See: https://github.com/ChrisCummins/clgen/issues/64
        max_num_workers = max(int(max_num_workers / 2), 1)
        preprocess_inplace(paths, max_num_workers=max_num_workers,
                           attempt=attempt + 1, max_attempts=max_attempts)
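# `_preprocess_inplace_worker` is referenced above but not shown in this
# section. A minimal sketch of one plausible implementation, assuming a
# module-level preprocess() entry point that raises on bad input
# (hypothetical; the real worker may differ):
def _preprocess_inplace_worker(path: str) -> None:
    """Preprocess a single file, overwriting it with the result."""
    with open(path) as infile:
        src = infile.read()
    out = preprocess(src)  # assumed preprocessing entry point
    with open(path, "w") as outfile:
        outfile.write(out)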
def get_kernel_features(code: str, **kwargs) -> np.array:
    """
    Get features for code.

    Parameters
    ----------
    code : str
        Source code.
    **kwargs
        Arguments to features.features().

    Returns
    -------
    np.array
        Feature values.
    """
    with NamedTemporaryFile() as outfile:
        outfile.write(code.encode("utf-8"))
        outfile.seek(0)
        f = features.to_np_arrays([outfile.name], **kwargs)
    if len(f) != 1:
        log.error("features:", f)
        raise FeaturesError("code contains more than one kernel")
    return f[0]
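# A usage sketch for get_kernel_features() (illustrative; the kernel below is
# made up, and a FeaturesError signals that the source held more than one
# kernel):
#
#     kernel = "__kernel void A(__global int* a) { a[0] += 1; }"
#     try:
#         feats = get_kernel_features(kernel)  # np.array of feature values
#     except FeaturesError:
#         feats = None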
def _preprocess_db(db_path: str, max_num_workers: int = cpu_count(),
                   max_attempts: int = 100, attempt: int = 1,
                   **preprocess_opts) -> None:
    """
    Preprocess OpenCL dataset.

    Parameters
    ----------
    db_path : str
        OpenCL kernels dataset.
    max_num_workers : int, optional
        Number of processes to spawn.
    max_attempts : int, optional
        In case of an OSError or TimeoutError, this number of attempts
        will be made.
    """
    if attempt > max_attempts:
        raise clgen.InternalError(
            f"failed to preprocess files after {max_attempts} attempts")

    log.verbose("determining jobs")

    contentfiles = set(dbutil.kernel_ids(db_path, "ContentFiles"))
    preprocessedfiles = set(dbutil.kernel_ids(db_path, "PreprocessedFiles"))

    ncontentfiles = len(contentfiles)
    npreprocessedfiles = len(preprocessedfiles)

    todo = contentfiles - preprocessedfiles
    ntodo = len(todo)

    # check we have something to do
    if not ntodo:
        return

    todo_ratio = ntodo / ncontentfiles

    log.info("{ntodo} ({todo_ratio:.1%}) samples need preprocessing".format(
        **vars()))

    log.verbose("creating jobs")

    # Determine if we need to inline kernels when creating jobs
    db = sqlite3.connect(db_path)
    c = db.cursor()
    c.execute("SELECT name FROM sqlite_master "
              "WHERE type='table' AND name='ContentMeta';")
    meta_table = c.fetchone()
    c.close()
    db.close()

    if meta_table:
        get_kernel = lambda kid: dbutil.get_inlined_kernel(
            db_path, kid, lang=preprocess_opts["lang"])
    else:
        get_kernel = lambda kid: dbutil.get_kernel(
            db_path, kid, table="ContentFiles")

    # create jobs
    jobs = [{
        "id": kid,
        "src": get_kernel(kid),
        "preprocess_opts": preprocess_opts,
    } for kid in todo]

    random.shuffle(jobs)

    # split size
    worker_njobs = math.ceil(ntodo / max_num_workers)

    # producer-consumer queue
    queue = Queue(maxsize=128)

    log.verbose(f"assigning {ntodo} jobs to {max_num_workers} threads")

    # progress counter; initialized here so the except handler below can
    # safely test it even if we fail before the first result arrives
    i = 0
    try:
        # our worker threads. these busy little bees will do the heavy lifting
        # of preprocessing the contentfiles, pushing their results onto
        # the queue
        producers = [PreprocessWorker(jobs[i:i + worker_njobs], queue)
                     for i in range(0, ntodo, worker_njobs)]

        # fly, my pretties, fly!
        for producer in producers:
            producer.start()

        # consume the results from the worker threads from the main thread
        for i in progressbar.ProgressBar()(range(ntodo)):
            # pull a fresh result from the queue (block if necessary)
            try:
                result = queue.get(timeout=90)
            except QueueEmpty as e:
                raise TimeoutError('failed to fetch result after 90 seconds. '
                                   'something went wrong') from e

            # insert result into database
            db = dbutil.connect(db_path)
            c = db.cursor()
            c.execute("INSERT INTO PreprocessedFiles VALUES(?,?,?)",
                      (result["id"], result["status"], result["contents"]))
            c.close()
            db.commit()
            db.close()

        for producer in producers:
            producer.join()

    except (OSError, TimeoutError) as e:
        log.error(e)

        if attempt > 2 and not i:
            log.warning("no progress has been made since previous attempt. "
                        "I'm not going to try another attempt.")
            return

        # Try again with fewer threads.
        # See: https://github.com/ChrisCummins/clgen/issues/64
        max_num_workers = max(int(max_num_workers / 2), 1)
        _preprocess_db(db_path, max_num_workers=max_num_workers,
                       attempt=attempt + 1, max_attempts=max_attempts,
                       **preprocess_opts)
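# PreprocessWorker is used above as a producer thread but is not defined in
# this section. A minimal sketch of the expected shape, assuming
# `from threading import Thread` at module level and a preprocess_for_db()-
# style helper that returns a (status, contents) pair (both assumptions; the
# real class may differ):
class PreprocessWorker(Thread):
    """Producer thread: preprocess a slice of jobs onto a shared queue."""

    def __init__(self, jobs: List[dict], queue: Queue):
        super().__init__()
        self.jobs = jobs
        self.queue = queue

    def run(self):
        for job in self.jobs:
            status, contents = preprocess_for_db(  # assumed helper
                job["src"], **job["preprocess_opts"])
            self.queue.put({
                "id": job["id"],
                "status": status,
                "contents": contents,
            })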
def fetch_repos(db_path: Path, indir: Path, lang: clgen.Language) -> None:
    """Import kernel files from cloned GitHub repositories into the database."""
    db = dbutil.connect(db_path)

    if not dbutil.is_github(db):
        raise clgen.UserError("not a GitHub database")

    c = db.cursor()

    for directory in fs.ls(indir, abspaths=True):
        # hacky hardcoded interpretation of `git remote -v`
        gitdir = fs.path(directory, ".git")
        output = subprocess.check_output(
            ["git", "--git-dir", gitdir, "remote", "-v"],
            universal_newlines=True)
        url = output.split("\n")[0].split("\t")[1].split(" ")[0]
        name = fs.basename(directory)

        output = subprocess.check_output(
            f"git --git-dir {gitdir} rev-list --format=format:'%ai' "
            f"--max-count=1 $(git --git-dir {gitdir} rev-parse HEAD) | tail -n1",
            shell=True, universal_newlines=True)
        try:
            updated_at = dateutil.parser.parse(output)
        except ValueError:
            log.error(f"failed to process {name} {url}")
            continue

        c.execute("SELECT updated_at FROM Repositories WHERE url=?", (url,))
        cached_updated_at = c.fetchone()

        # Do nothing unless updated timestamps don't match
        # if cached_updated_at and cached_updated_at[0] >= updated_at:
        #     log.verbose(name, "already in database")
        #     continue

        c.execute("DELETE FROM Repositories WHERE url=?", (url,))
        c.execute("INSERT INTO Repositories VALUES(?,?,?,?,?,?,?,?,?)",
                  (url, "<unknown>", name, 0, 0, 0, 0, updated_at, updated_at))

        name_str = " -o ".join(
            [f"-name '*{ext}'" for ext in clgen.file_extensions(lang)])
        output = subprocess.check_output(
            f"find {directory} -type f {name_str} | grep -v '.git/' || true",
            shell=True, universal_newlines=True)
        files = [x.strip() for x in output.split("\n") if x.strip()]

        # nothing to import
        if not len(files):
            # log.verbose("no files in", name)
            continue

        log.verbose("processing", len(files), "files in", name)
        for path in files:
            relpath = path[len(directory) + 1:]
            try:
                contents = inline_fs_headers(path, [], lang=lang)
                sha = crypto.sha1_str(contents)
                c.execute('INSERT OR IGNORE INTO ContentFiles VALUES(?,?)',
                          (sha, contents))
                c.execute("INSERT OR IGNORE INTO ContentMeta VALUES(?,?,?,?,?)",
                          (sha, relpath, url, sha, len(contents)))
            except UnicodeDecodeError:
                log.warning("non UTF-8 file", path)

        db.commit()
        c = db.cursor()
def __init__(self, corpus: clgen.Corpus, **opts):
    """
    Instantiate model.

    Parameters
    ----------
    corpus : clgen.Corpus
        Corpus instance.
    **opts
        Training options.
    """
    assert isinstance(corpus, clgen.Corpus)

    def _hash(corpus: clgen.Corpus, opts: dict) -> str:
        """ compute model hash """
        hashopts = deepcopy(opts)
        del hashopts["created"]
        del hashopts["train_opts"]["epochs"]
        return crypto.sha1_list(corpus.hash, *types.dict_values(hashopts))

    # Validate options
    for key in opts:
        if key not in DEFAULT_MODEL_OPTS:
            raise clgen.UserError(
                "Unsupported model option '{}'. Valid keys: {}".format(
                    key, ','.join(sorted(DEFAULT_MODEL_OPTS.keys()))))

    # set properties
    self.opts = types.update(deepcopy(DEFAULT_MODEL_OPTS), opts)
    self.corpus = corpus
    self.hash = _hash(self.corpus, self.opts)
    self.cache = clgen.mkcache("model", f"{corpus.language}-{self.hash}")

    log.debug("model", self.hash)

    # validate metadata against cache, and restore stats
    self.stats = {
        "epoch_times": [],
        "epoch_costs": [],
        "epoch_batches": []
    }
    meta = deepcopy(self.to_json())
    if self.cache.get("META"):
        cached_meta = jsonutil.read_file(self.cache["META"])
        self.stats = cached_meta["stats"]  # restore stats

        if "created" in cached_meta:
            del cached_meta["created"]
        del meta["created"]

        if "created" in cached_meta["corpus"]:
            del cached_meta["corpus"]["created"]
        del meta["corpus"]["created"]

        if "stats" in cached_meta:
            del cached_meta["stats"]
        del meta["stats"]

        if "epochs" in cached_meta["train_opts"]:
            del cached_meta["train_opts"]["epochs"]
        del meta["train_opts"]["epochs"]

        if meta != cached_meta:
            log.error("Computed META:", jsonutil.format_json(meta))
            raise clgen.InternalError(
                "metadata mismatch in model %s" % self.cache["META"])
    else:
        self._flush_meta()
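# A usage sketch for this constructor (illustrative; assumes the enclosing
# class is the module's Model, a clgen.Corpus instance built elsewhere, and
# that DEFAULT_MODEL_OPTS carries the "created" and "train_opts"/"epochs"
# keys that _hash() deletes):
#
#     opts = deepcopy(DEFAULT_MODEL_OPTS)
#     opts["train_opts"]["epochs"] = 20
#     model = Model(corpus, **opts)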
def preprocess_contentfiles(db_path: str, max_num_workers: int = cpu_count(),
                            attempt: int = 1) -> None:
    """
    Preprocess OpenCL dataset.

    Arguments:
        db_path (str): OpenCL kernels dataset.
        max_num_workers (int, optional): Number of processes to spawn.
    """
    def _finalize(db_path, cache):
        """Tidy up after worker threads finish"""
        log.debug("worker finalize")

        db = dbutil.connect(db_path)
        c = db.cursor()

        # import results from worker threads
        for outpath in fs.ls(cache.path, abspaths=True):
            with open(outpath) as infile:
                for line in infile:
                    c.execute('INSERT OR REPLACE INTO PreprocessedFiles '
                              'VALUES(?,?,?)', json.loads(line))

        # write changes to database and remove cache
        db.commit()
        db.close()
        cache.empty()

    if attempt >= MAX_OS_RETRIES:
        raise clgen.InternalError("failed to preprocess files")

    num_contentfiles = dbutil.num_rows_in(db_path, 'ContentFiles')
    num_preprocessedfiles = dbutil.num_rows_in(db_path, 'PreprocessedFiles')
    log.info("{n} ({r:.1%}) files need preprocessing".format(
        n=num_contentfiles - num_preprocessedfiles,
        r=(num_contentfiles - num_preprocessedfiles) / num_contentfiles))

    # split into multiple jobs of a maximum size
    jobsize = min(512, num_contentfiles)
    numjobs = math.ceil(num_contentfiles / jobsize)
    for j, offset in enumerate(range(0, num_contentfiles, jobsize)):
        num_preprocessedfiles = dbutil.num_rows_in(db_path,
                                                   'PreprocessedFiles')
        num_workers = min(num_contentfiles, max_num_workers)
        files_per_worker = math.ceil(jobsize / num_workers)

        # temporary cache used for worker thread results
        cache = Cache("{pid}.preprocess".format(pid=os.getpid()))
        # each worker thread receives a range of database indices to
        # preprocess, and a JSON file to write results into
        jobs = [{
            "db_in": db_path,
            "db_index_range": (
                offset + i * files_per_worker,
                offset + i * files_per_worker + files_per_worker),
            "json_out": fs.path(cache.path, "{i}.json".format(i=i))
        } for i in range(num_workers)]

        # spool up worker threads then finalize
        log.info('job {j} of {numjobs}: spawning {num_workers} worker threads '
                 'to process {jobsize} files ...'.format(**vars()))
        try:
            with clgen.terminating(Pool(num_workers)) as pool:
                pool.map(_preprocess_db_worker, jobs)
        except OSError as e:
            _finalize(db_path, cache)
            log.error(e)

            # Try again with fewer threads.
            # See: https://github.com/ChrisCummins/clgen/issues/64
            max_num_workers = max(int(max_num_workers / 2), 1)
            preprocess_contentfiles(db_path,
                                    max_num_workers=max_num_workers,
                                    attempt=attempt + 1)
        except Exception as e:
            _finalize(db_path, cache)
            raise e

        _finalize(db_path, cache)
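# `_preprocess_db_worker` is referenced above but not defined in this
# section. A minimal sketch of what each worker might do with its job dict
# (hypothetical; assumes a preprocess_file()-style helper returning a
# (status, contents) pair, and that dbutil.kernel_ids() returns a list):
def _preprocess_db_worker(job: dict) -> None:
    """Preprocess a range of database rows, writing results as JSON lines."""
    db_path = job["db_in"]
    start, end = job["db_index_range"]
    kids = dbutil.kernel_ids(db_path, "ContentFiles")[start:end]
    with open(job["json_out"], "w") as outfile:
        for kid in kids:
            src = dbutil.get_kernel(db_path, kid, table="ContentFiles")
            status, contents = preprocess_file(src)  # assumed helper
            # one JSON array per line, matching the VALUES(?,?,?) insert
            # performed by _finalize() above
            json.dump([kid, status, contents], outfile)
            outfile.write("\n")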