def cache(self, model: clgen.Model):
    """
    Return sampler cache.

    The cache is looked up with ``clgen.mkcache("sampler", <hash>)``, where
    the hash is ``crypto.sha1_str(self.hash + model.hash)``. As a side
    effect, ``self.stats`` is reset to its defaults and then, if the cache
    contains a "META" entry with recorded stats, replaced by those stats.

    If a "META" entry exists, its contents are compared against this
    sampler's own metadata (``self.to_json()``). The "created",
    "min_samples" and "min_kernels" entries of the "sampler" section are
    excluded from the comparison on both sides. If no "META" entry exists,
    ``self._flush_meta(cache)`` is called instead.

    Parameters
    ----------
    model : clgen.Model
        CLgen model.

    Returns
    -------
    labm8 FSCache:
        Cache.

    Raises
    ------
    clgen.InternalError
        If the cached metadata does not match this sampler's metadata.
    """
    sampler_model_hash = crypto.sha1_str(self.hash + model.hash)
    cache = clgen.mkcache("sampler", sampler_model_hash)

    # default stats; overwritten below when the cache has recorded stats
    self.stats = {
        "time": 0,
        "progress": 0
    }
    meta = deepcopy(self.to_json())

    if not cache.get("META"):
        # no metadata cached yet
        self._flush_meta(cache)
        return cache

    # validate metadata against cache
    cached_meta = jsonutil.read_file(cache["META"])

    if "stats" in cached_meta:
        self.stats = cached_meta["stats"]
        del cached_meta["stats"]

    # Strip the entries that are ignored by the comparison. pop() with a
    # default is used on both sides so that a key which is present in only
    # one of the two dicts neither raises KeyError nor causes a spurious
    # mismatch.
    for key in ("created", "min_samples", "min_kernels"):
        cached_meta["sampler"].pop(key, None)
        meta["sampler"].pop(key, None)

    if meta != cached_meta:
        raise clgen.InternalError("sampler metadata mismatch")

    return cache
def from_json(corpus_json: dict) -> 'Corpus':
    """
    Instantiate Corpus from JSON.

    The content ID is resolved in one of two ways:

    * If "path" is given, it is made absolute, passed through
      ``unpack_directory_if_needed()``, and must be a directory. The
      content ID is then computed from the directory with a ``DirHashCache``
      ('sha1'), and any "id" entry is ignored.
    * Otherwise, if "id" is given, a "contentfiles" cache entry named
      ``"<language>-<id>"`` must already exist as a directory.

    The "stats" and "contentfiles" entries are ignored. All remaining
    entries are forwarded to the ``Corpus`` constructor as keyword options.
    The caller's dict is not modified.

    Parameters
    ----------
    corpus_json : dict
        Specification.

    Returns
    -------
    Corpus
        Instantiated corpus.

    Raises
    ------
    clgen.UserError
        If the path is not a directory, the content ID is not found in the
        cache, or neither a path nor an ID is provided.
    """
    # Work on a shallow copy: only top-level keys are removed below, and
    # removing them from the caller's dict would make the same
    # specification unusable for a second call.
    spec = dict(corpus_json)

    path = spec.pop("path", None)
    uid = spec.pop("id", None)
    language = clgen.Language.from_str(spec.get("language"))

    if path:
        path = unpack_directory_if_needed(fs.abspath(path))
        if not fs.isdir(path):
            raise clgen.UserError(
                "Corpus path '{}' is not a directory".format(path))
        dirhashcache = DirHashCache(clgen.cachepath("dirhash.db"), 'sha1')
        uid = prof.profile(dirhashcache.dirhash, path)
    elif uid:
        cache_path = clgen.mkcache("contentfiles", f"{language}-{uid}").path
        if not fs.isdir(cache_path):
            raise clgen.UserError(
                "Corpus content {} not found".format(uid))
    else:
        raise clgen.UserError("No corpus path or ID provided")

    # ignore stats and contentfiles
    spec.pop("stats", None)
    spec.pop("contentfiles", None)

    return prof.profile(Corpus, uid, path=path, **spec)
def __init__(self, contentid: str, path: str=None, **opts):
    """
    Instantiate a corpus.

    If this is a new corpus, a number of files will be created, which may
    take some time.

    The options are validated against ``DEFAULT_CORPUS_OPTS`` and merged
    over a copy of those defaults. If the corpus cache already contains a
    "META" entry, stats recorded there are restored into ``self.stats`` and
    the remaining metadata is compared against ``self.to_json()``, ignoring
    the "created" and "stats" entries. Otherwise ``self._flush_meta()`` is
    called. Finally ``self._create_files(path)`` is run while holding
    ``self.lock``.

    Parameters
    ----------
    contentid : str
        ID of corpus content.
    path : str, optional
        Path to corpus.
    **opts
        Keyword options.

    Raises
    ------
    clgen.UserError
        If an unsupported option is given, or if no path is given and no
        cached content files exist for ``contentid``.
    clgen.InternalError
        If the cached metadata does not match this corpus' metadata.
    """
    # Validate options
    for key in opts:
        if key not in DEFAULT_CORPUS_OPTS:
            raise clgen.UserError(
                "Unsupported corpus option '{}'. Valid keys: {}".format(
                    key, ','.join(sorted(DEFAULT_CORPUS_OPTS.keys()))))

    self.opts = deepcopy(DEFAULT_CORPUS_OPTS)
    types.update(self.opts, opts)
    self.opts["id"] = contentid

    # check that contentid exists
    self.language = clgen.Language.from_str(opts.get("language"))
    if (path is None and
            not fs.isdir(clgen.cachepath("contentfiles",
                                         f"{self.language}-{contentid}"))):
        raise clgen.UserError("corpus {self.language}-{contentid} not found"
                              .format(**vars()))

    self.contentid = contentid
    self.contentcache = clgen.mkcache("contentfiles",
                                      f"{self.language}-{contentid}")
    self.kernels_db = self.contentcache.keypath('kernels.db')

    self.hash = self._hash(contentid, self.opts)
    self.cache = clgen.mkcache("corpus", f"{self.language}-{self.hash}")

    log.debug("contentfiles {self.contentid}".format(**vars()))
    log.debug("corpus {hash}".format(hash=self.hash))

    # default stats; overwritten below when the cache has recorded stats
    self.stats = {
        "preprocess_time": 0
    }
    meta = deepcopy(self.to_json())

    # validate metadata against cache
    if self.cache.get("META"):
        cached_meta = jsonutil.read_file(self.cache["META"])

        # Restore stats only if the cached metadata actually has them; a
        # META file without a "stats" entry keeps the defaults instead of
        # raising KeyError.
        if "stats" in cached_meta:
            self.stats = cached_meta["stats"]

        # Strip the entries that are ignored by the comparison from both
        # sides, whether or not each side has them.
        for key in ("created", "stats"):
            cached_meta.pop(key, None)
            meta.pop(key, None)

        if meta != cached_meta:
            raise clgen.InternalError("corpus metadata mismatch")
    else:
        self._flush_meta()

    with self.lock.acquire(replace_stale=True):
        self._create_files(path)
def __init__(self, corpus: clgen.Corpus, **opts):
    """
    Instantiate model.

    The options are validated against ``DEFAULT_MODEL_OPTS`` and merged
    over a copy of those defaults. The model hash is computed from the
    corpus hash and the option values, excluding "created" and the number
    of training epochs. If the model cache already contains a "META" entry,
    stats recorded there are restored into ``self.stats`` and the remaining
    metadata is compared against ``self.to_json()``, ignoring the
    "created", "stats", corpus "created" and train_opts "epochs" entries.
    Otherwise ``self._flush_meta()`` is called.

    Parameters
    ----------
    corpus : clgen.Corpus
        Corpus instance.
    **opts
        Training options.

    Raises
    ------
    clgen.UserError
        If an unsupported option is given.
    clgen.InternalError
        If the cached metadata does not match this model's metadata.
    """
    assert(isinstance(corpus, clgen.Corpus))

    def _hash(corpus: clgen.Corpus, opts: dict) -> str:
        """ compute model hash """
        hashopts = deepcopy(opts)
        # these entries do not contribute to the hash
        del hashopts["created"]
        del hashopts["train_opts"]["epochs"]
        return crypto.sha1_list(corpus.hash, *types.dict_values(hashopts))

    # Validate options
    for key in opts:
        if key not in DEFAULT_MODEL_OPTS:
            raise clgen.UserError(
                "Unsupported model option '{}'. Valid keys: {}".format(
                    key, ','.join(sorted(DEFAULT_MODEL_OPTS.keys()))))

    # set properties
    self.opts = types.update(deepcopy(DEFAULT_MODEL_OPTS), opts)
    self.corpus = corpus
    self.hash = _hash(self.corpus, self.opts)
    self.cache = clgen.mkcache("model", f"{corpus.language}-{self.hash}")

    log.debug("model", self.hash)

    # default stats; overwritten below when the cache has recorded stats
    self.stats = {
        "epoch_times": [],
        "epoch_costs": [],
        "epoch_batches": []
    }
    meta = deepcopy(self.to_json())

    # validate metadata against cache, and restore stats
    if self.cache.get("META"):
        cached_meta = jsonutil.read_file(self.cache["META"])

        # Restore stats only if the cached metadata actually has them; a
        # META file without a "stats" entry keeps the defaults instead of
        # raising KeyError.
        if "stats" in cached_meta:
            self.stats = cached_meta["stats"]

        # (cached section, computed section, key) triples naming the
        # entries that are ignored by the comparison.
        ignored = (
            (cached_meta, meta, "created"),
            (cached_meta["corpus"], meta["corpus"], "created"),
            (cached_meta, meta, "stats"),
            (cached_meta["train_opts"], meta["train_opts"], "epochs"),
        )
        # Strip them from both sides, whether or not each side has them.
        for cached_section, computed_section, key in ignored:
            cached_section.pop(key, None)
            computed_section.pop(key, None)

        if meta != cached_meta:
            log.error("Computed META:", jsonutil.format_json(meta))
            raise clgen.InternalError(
                "metadata mismatch in model %s" % self.cache["META"])
    else:
        self._flush_meta()