def models() -> Iterator[Model]:
    """
    Lazily walk the model cache and yield one Model per cached directory.

    Each entry found under the "model" directory of the CLgen cache is
    expected to contain a "META" JSON file, which is read and handed to
    Model.from_json(). If the "model" cache directory does not exist,
    nothing is yielded.

    Returns
    -------
    Iterator[Model]
        An iterable over all cached models.
    """
    # Nothing cached yet: produce an empty iterator.
    if not fs.isdir(clgen.cachepath(), "model"):
        return

    model_root = fs.path(clgen.cachepath(), "model")
    for cached_dir in fs.ls(model_root, abspaths=True):
        meta_path = fs.path(cached_dir, "META")
        yield Model.from_json(jsonutil.read_file(meta_path))
def from_json(corpus_json: dict) -> 'Corpus':
    """
    Instantiate Corpus from JSON.

    The corpus is located either by "path" (a directory, whose content
    hash becomes the corpus content ID) or by "id" (the ID of content
    files that are already cached). When both are given, "path" wins and
    the supplied "id" is replaced by the computed directory hash.

    The "stats" and "contentfiles" entries are ignored. All remaining
    entries are forwarded to the Corpus constructor as keyword options.
    The dictionary passed in is left unmodified.

    Parameters
    ----------
    corpus_json : dict
        Specification.

    Returns
    -------
    Corpus
        Instantiated corpus.

    Raises
    ------
    clgen.UserError
        If "path" is not a directory, if the content files for "id" are
        not in the cache, or if neither "path" nor "id" is provided.
    """
    # Work on a shallow copy: the keys consumed below must not vanish from
    # the caller's dictionary.
    spec = dict(corpus_json)

    path = spec.pop("path", None)
    uid = spec.pop("id", None)
    # "language" is only read here; it stays in spec and reaches Corpus().
    language = clgen.Language.from_str(spec.get("language"))

    if path:
        path = unpack_directory_if_needed(fs.abspath(path))
        if not fs.isdir(path):
            raise clgen.UserError(
                "Corpus path '{}' is not a directory".format(path))
        # The content ID of a path-based corpus is the sha1 directory hash.
        dirhashcache = DirHashCache(clgen.cachepath("dirhash.db"), 'sha1')
        uid = prof.profile(dirhashcache.dirhash, path)
    elif uid:
        cache_path = clgen.mkcache("contentfiles", f"{language}-{uid}").path
        if not fs.isdir(cache_path):
            raise clgen.UserError(
                "Corpus content {} not found".format(uid))
    else:
        raise clgen.UserError("No corpus path or ID provided")

    # ignore stats and contentfiles: neither is a constructor option
    spec.pop("stats", None)
    spec.pop("contentfiles", None)

    return prof.profile(Corpus, uid, path=path, **spec)
def _main() -> None:
    """
    Re-home cached model directories whose name no longer matches the
    hash of the model described by their META file.

    For every directory under the "model" cache, the META file is loaded
    and turned into a clgen.Model. When the directory's basename differs
    from model.hash, the rename is logged and the directory is moved to
    model.cache.path. If that destination already exists, log.fatal() is
    called with "cache conflict" first.

    Refreshing corpuses and samplers is not implemented; a warning is
    logged for each.
    """
    cache = clgen.cachepath()

    log.warning("Not Implemented: refresh corpuses")

    if fs.isdir(cache, "model"):
        for modeldir in fs.ls(fs.path(cache, "model"), abspaths=True):
            dir_id = fs.basename(modeldir)
            meta = jsonutil.read_file(fs.path(modeldir, "META"))
            model = clgen.Model.from_json(meta)

            # Directory already carries the right name: nothing to do.
            if dir_id == model.hash:
                continue

            log.info(dir_id, '->', model.hash)

            # NOTE(review): presumably log.fatal() aborts before the move
            # below can clobber the existing directory -- confirm.
            if fs.isdir(model.cache.path):
                log.fatal("cache conflict", file=sys.stderr)

            fs.mv(modeldir, model.cache.path)

    log.warning("Not Implemented: refresh samplers")
def shorthash(self):
    """
    Return clgen._shorthash() of this object's hash, evaluated against
    the "corpus" cache directory.
    """
    full_hash = self.hash
    corpus_cache = clgen.cachepath("corpus")
    return clgen._shorthash(full_hash, corpus_cache)
def __init__(self, contentid: str, path: str=None, **opts):
    """
    Instantiate a corpus.

    If this is a new corpus, a number of files will be created, which may
    take some time.

    Parameters
    ----------
    contentid : str
        ID of corpus content.
    path : str, optional
        Path to corpus. If omitted, the content files for this language
        and content ID must already be in the cache.
    **opts
        Keyword options. Every key must appear in DEFAULT_CORPUS_OPTS.

    Raises
    ------
    clgen.UserError
        If an unsupported option is given, or if no path is given and the
        content files are not cached.
    clgen.InternalError
        If cached metadata exists and does not match this corpus.
    """
    # Validate options
    for key in opts:
        if key not in DEFAULT_CORPUS_OPTS:
            raise clgen.UserError(
                "Unsupported corpus option '{}'. Valid keys: {}".format(
                    key, ','.join(sorted(DEFAULT_CORPUS_OPTS.keys()))))

    # Start from a private copy of the defaults, then overlay user options.
    self.opts = deepcopy(DEFAULT_CORPUS_OPTS)
    types.update(self.opts, opts)
    self.opts["id"] = contentid

    # check that contentid exists
    self.language = clgen.Language.from_str(opts.get("language"))
    if (path is None and
            not fs.isdir(clgen.cachepath(
                "contentfiles", f"{self.language}-{contentid}"))):
        raise clgen.UserError(
            f"corpus {self.language}-{contentid} not found")

    self.contentid = contentid
    self.contentcache = clgen.mkcache(
        "contentfiles", f"{self.language}-{contentid}")
    self.kernels_db = self.contentcache.keypath('kernels.db')

    # The corpus hash is derived from the content ID and the merged options.
    self.hash = self._hash(contentid, self.opts)
    self.cache = clgen.mkcache("corpus", f"{self.language}-{self.hash}")

    log.debug(f"contentfiles {self.contentid}")
    log.debug("corpus {hash}".format(hash=self.hash))

    # validate metadata against cache
    self.stats = {
        "preprocess_time": 0
    }
    meta = deepcopy(self.to_json())
    if self.cache.get("META"):
        cached_meta = jsonutil.read_file(self.cache["META"])
        # Restore stats. A META file without a "stats" entry is tolerated
        # (as in the comparison below), keeping the fresh defaults.
        self.stats = cached_meta.get("stats", self.stats)

        # "created" and "stats" legitimately differ between runs, so they
        # are excluded from both sides before comparing.
        cached_meta.pop("created", None)
        del meta["created"]

        cached_meta.pop("stats", None)
        del meta["stats"]

        if meta != cached_meta:
            raise clgen.InternalError("corpus metadata mismatch")
    else:
        self._flush_meta()

    with self.lock.acquire(replace_stale=True):
        self._create_files(path)
def shorthash(self) -> str:
    """
    Return clgen._shorthash() of this object's hash, evaluated against
    the "sampler" cache directory.
    """
    full_hash = self.hash
    sampler_cache = clgen.cachepath("sampler")
    return clgen._shorthash(full_hash, sampler_cache)
def shorthash(self):
    """
    Return clgen._shorthash() of this object's hash, evaluated against
    the "model" cache directory.
    """
    full_hash = self.hash
    model_cache = clgen.cachepath("model")
    return clgen._shorthash(full_hash, model_cache)