def parseCorpus(workspace_path):
  corpuses = []
  if (workspace_path / "corpus" / "encoded").exists():
    corpus_path = workspace_path / "corpus" / "encoded"
    for corpus_sha in corpus_path.iterdir():
      encoded_db = encoded.EncodedContentFiles(
        "sqlite:///{}".format(corpus_sha / "encoded.db"), must_exist=True)
      corpus = {
        'path': corpus_path / corpus_sha,
        'sha': str(corpus_sha.stem),
        'datapoint_count': encoded_db.size,
        'summary': "{} datapoint corpus, {}".format(encoded_db.size, str(corpus_sha.stem)),
        'models': parseModels(workspace_path, str(corpus_sha.stem)),
      }
      global cached_corpuses
      cached_corpuses[crypto.sha256_str(
        str(workspace_path.name) + str(corpus_sha.name))] = corpus
      corpuses.append(corpus)
  return corpuses
def encoded_contentfile(corpus_id: int, encoded_id: int):
  (encoded_url,) = (
    db.session.query(dashboard_db.Corpus.encoded_url)
    .filter(dashboard_db.Corpus.id == corpus_id)
    .one()
  )
  encoded_db = encoded.EncodedContentFiles(encoded_url, must_exist=True)
  with encoded_db.Session() as session:
    cf = (
      session.query(encoded.EncodedContentFile)
      .filter(encoded.EncodedContentFile.id == encoded_id)
      .limit(1)
      .one()
    )
    indices = cf.indices_array
    vocab = {
      v: k
      for k, v in encoded.EncodedContentFiles.GetVocabFromMetaTable(
        session
      ).items()
    }
    tokens = [vocab[i] for i in indices]
    text = "".join(tokens)
    encoded_cf = {
      "id": cf.id,
      "tokencount": humanize.Commas(cf.tokencount),
      "indices": indices,
      "text": text,
      "tokens": tokens,
    }
    vocab = {
      "table": [(k, v) for k, v in vocab.items()],
      "size": len(vocab),
    }
  data = GetBaseTemplateArgs()
  data["encoded"] = encoded_cf
  data["vocab"] = vocab
  data["urls"]["view_encoded_file"] = f"/corpus/{corpus_id}/encoded/random/"
  return flask.render_template("encoded_contentfile.html", **data)
def main(*args):
  db = encoded.EncodedContentFiles(
    url="sqlite:///{}".format(ENCODED_DB_PATH), must_exist=True)
  tokenizer = tokenizers.TokenizerBase.FromFile(
    pathlib.Path(TOKENIZER_PATH).resolve())
  feat_vecs = get_data_features(db, tokenizer)
  Train(feat_vecs)
  return
def random_encoded_contentfile(corpus_id: int):
  (encoded_url,) = (
    db.session.query(dashboard_db.Corpus.encoded_url)
    .filter(dashboard_db.Corpus.id == corpus_id)
    .one()
  )
  encoded_db = encoded.EncodedContentFiles(encoded_url, must_exist=True)
  with encoded_db.Session() as session:
    (random_id,) = (
      session.query(encoded.EncodedContentFile.id)
      .order_by(encoded_db.Random())
      .limit(1)
      .one()
    )
  return flask.redirect(f"/corpus/{corpus_id}/encoded/{random_id}/", code=302)
def _CreateTokenizer(self) -> tokenizers.TokenizerBase:
  """Creates and caches a tokenizer."""
  corpus_txt = self.GetTextCorpus(shuffle=False)
  if self.config.HasField("pre_encoded_corpus_url"):
    encoded_db = encoded.EncodedContentFiles(
      self.config.pre_encoded_corpus_url, self.pre_train
    )
    tokenizer = WordTokenizerFromEncodedDb(self.config.tokenizer, encoded_db)
  else:
    tokenizer = tokenizers.FromText(
      self.config.tokenizer, self.config.contentfile_separator, corpus_txt
    )
  tokenizer.ToFile(self.tokenizer_path)
  return tokenizer
def _CreateAtomizer(self) -> atomizers.AtomizerBase:
  """Creates and caches an atomizer."""
  app.Log(1, "Deriving atomizer from preprocessed corpus")
  corpus_txt = self.GetTextCorpus(shuffle=False)
  if self.config.HasField("ascii_character_atomizer"):
    atomizer = atomizers.AsciiCharacterAtomizer.FromText(corpus_txt)
  elif self.config.HasField("greedy_multichar_atomizer"):
    atoms = set(self.config.greedy_multichar_atomizer.tokens)
    atomizer = atomizers.GreedyAtomizer.FromText(corpus_txt, atoms)
  elif self.config.HasField("pre_encoded_corpus_url"):
    encoded_db = encoded.EncodedContentFiles(
      self.config.pre_encoded_corpus_url)
    atomizer = GreedyAtomizerFromEncodedDb(encoded_db)
  else:
    raise NotImplementedError
  atomizer.ToFile(self.atomizer_path)
  return atomizer
def main(*args):
  db = encoded.EncodedContentFiles(
    url="sqlite:///{}".format(ENCODED_DB_PATH), must_exist=True)
  tokenizer = tokenizers.TokenizerBase.FromFile(
    pathlib.Path(TOKENIZER_PATH).resolve())
  distr = {
    "GreweFeatures": None,
    "AutophaseFeatures": None,
    "InstCountFeatures": None,
  }
  distr_768 = {
    "GreweFeatures": None,
    "AutophaseFeatures": None,
    "InstCountFeatures": None,
  }
  for fspace in {"GreweFeatures", "AutophaseFeatures", "InstCountFeatures"}:
    feat_vecs = [v for s, i, v in get_data_features(fspace, db, tokenizer)]
    flat_vals = []
    for vec in feat_vecs:
      for v in vec.values():
        try:
          flat_vals.append(4 * int(v // 4))
        except Exception:
          pass
    distr[fspace] = distributions.GenericDistribution(
      flat_vals, "feature_vals", fspace)
    distr[fspace].plot()
  for fspace in {"GreweFeatures", "AutophaseFeatures", "InstCountFeatures"}:
    feat_vecs = [v for s, i, v in get_data_features(fspace, db, tokenizer, 768)]
    flat_vals = []
    for vec in feat_vecs:
      for v in vec.values():
        try:
          flat_vals.append(4 * int(v // 4))
        except Exception:
          pass
    distr_768[fspace] = distributions.GenericDistribution(
      flat_vals, "feature_vals", "{}_768".format(fspace))
    distr_768[fspace].plot()
  return
def abc_pre_encoded() -> str:
  """Test fixture that returns a database of two encoded content files."""
  with tempfile.TemporaryDirectory(
      prefix="phd_deeplearning_clgen_corpuses_") as d:
    url = f"sqlite:///{d}/encoded.db"
    db = encoded.EncodedContentFiles(url)
    with db.Session(commit=True) as s:
      s.add(
        encoded.EncodedContentFile(
          data="0.1.2.0.1",
          tokencount=5,
          encoding_time_ms=10,
          wall_time_ms=10,
          date_added=datetime.datetime.utcnow(),
        ))
      s.add(
        encoded.EncodedContentFile(
          data="2.2.2",
          tokencount=3,
          encoding_time_ms=10,
          wall_time_ms=10,
          date_added=datetime.datetime.utcnow(),
        ))
    yield db.url
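# A minimal sketch (not part of the original sources) of how the
# abc_pre_encoded fixture above might be consumed in a pytest-style test.
# The test name is hypothetical; the `must_exist` keyword and the `size`
# property are assumed from their use in the other snippets in this section.
def test_abc_pre_encoded_size(abc_pre_encoded: str):
  """The fixture yields a database URL containing two encoded contentfiles."""
  db = encoded.EncodedContentFiles(abc_pre_encoded, must_exist=True)
  assert db.size == 2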
def temp_db() -> encoded.EncodedContentFiles:
  """A test fixture which returns an empty EncodedContentFiles db."""
  with tempfile.TemporaryDirectory() as d:
    yield encoded.EncodedContentFiles(f"sqlite:///{d}/test.db")
def __init__(self, config: corpus_pb2.Corpus):
  """Instantiate a corpus from a proto config.

  If this is a new corpus, a number of files will be created, which may
  take some time.

  Args:
    config: A Corpus message.

  Raises:
    TypeError: If the config argument is not a Corpus proto.
    UserError: In case the corpus is not found, or config contains invalid
      options.
    EmptyCorpusException: In case the corpus contains no data.
  """
  if not isinstance(config, corpus_pb2.Corpus):
    t = type(config).__name__
    raise TypeError(f"Config must be a Corpus proto. Received: '{t}'")
  # Make a local copy of the configuration.
  self.config = corpus_pb2.Corpus()
  self.config.CopyFrom(AssertConfigIsValid(config))
  self._atomizer = None
  self._created = False
  self.dashboard_db = dashboard_db.GetDatabase()
  self._dashboard_db_id: typing.Optional[int] = None  # Set in Create()

  # An in-memory cache of the encoded contentfiles indices arrays.
  # Set and used in GetTrainingData().
  self._indices_arrays: typing.Optional[typing.List[np.array]] = None

  cache.cachepath("corpus").mkdir(parents=True, exist_ok=True)
  hc = hashcache.HashCache(cache.cachepath("hashcache.db"), "sha1")
  self.content_id = ResolveContentId(self.config, hc)
  # Database of pre-processed files.
  preprocessed_id = ResolvePreprocessedId(self.content_id, self.config)
  cache.cachepath("corpus", "preprocessed", preprocessed_id).mkdir(
    exist_ok=True, parents=True)
  preprocessed_db_path = cache.cachepath("corpus", "preprocessed",
                                         preprocessed_id, "preprocessed.db")
  if (self.config.HasField("content_id")
      and not preprocessed_db_path.is_file()):
    raise errors.UserError(f"Content ID not found: '{self.content_id}'")
  self.preprocessed = preprocessed.PreprocessedContentFiles(
    f"sqlite:///{preprocessed_db_path}")
  # Create symlink to contentfiles.
  symlink = (
    pathlib.Path(self.preprocessed.url[len("sqlite:///"):]).parent /
    "contentfiles")
  if not symlink.is_symlink():
    if config.HasField("local_directory"):
      os.symlink(
        str(
          ExpandConfigPath(config.local_directory,
                           path_prefix=FLAGS.clgen_local_path_prefix)),
        symlink,
      )
    elif config.HasField("local_tar_archive"):
      os.symlink(
        str(
          ExpandConfigPath(config.local_tar_archive,
                           path_prefix=FLAGS.clgen_local_path_prefix)),
        symlink,
      )
  # Database of encoded pre-processed files.
  encoded_id = ResolveEncodedId(self.content_id, self.config)
  cache.cachepath("corpus", "encoded", encoded_id).mkdir(exist_ok=True,
                                                         parents=True)
  db_path = cache.cachepath("corpus", "encoded", encoded_id, "encoded.db")
  # TODO(github.com/ChrisCummins/clgen/issues/130): Refactor this conditional
  # logic by making Corpus an abstract class and creating concrete subclasses
  # for the different types of corpus.
  if self.config.HasField("pre_encoded_corpus_url"):
    self.encoded = encoded.EncodedContentFiles(config.pre_encoded_corpus_url)
  else:
    self.encoded = encoded.EncodedContentFiles(f"sqlite:///{db_path}")
  self.atomizer_path = cache.cachepath("corpus", "encoded", encoded_id,
                                       "atomizer.pkl")
  # Create symlink to preprocessed files.
  # TODO(github.com/ChrisCummins/clgen/issues/130): Refactor this conditional
  # logic after splitting Corpus class.
  if not self.config.HasField("pre_encoded_corpus_url"):
    symlink = (
      pathlib.Path(self.encoded.url[len("sqlite:///"):]).parent /
      "preprocessed")
    if not symlink.is_symlink():
      os.symlink(
        os.path.relpath(
          pathlib.Path(self.preprocessed.url[len("sqlite:///"):]).parent,
          pathlib.Path(self.encoded.url[len("sqlite:///"):]).parent,
        ),
        symlink,
      )
  self.hash = encoded_id
  self.cache = cache.mkcache("corpus", "encoded", encoded_id)
def __init__(self, config: corpus_pb2.Corpus):
  """Instantiate a corpus from a proto config.

  If this is a new corpus, a number of files will be created, which may
  take some time.

  Args:
    config: A Corpus message.

  Raises:
    TypeError: If the config argument is not a Corpus proto.
    UserError: In case the corpus is not found, or config contains invalid
      options.
    EmptyCorpusException: In case the corpus contains no data.
  """
  if not isinstance(config, corpus_pb2.Corpus):
    t = type(config).__name__
    raise TypeError(f"Config must be a Corpus proto. Received: '{t}'")
  # Make a local copy of the configuration.
  self.config = corpus_pb2.Corpus()
  self.config.CopyFrom(AssertConfigIsValid(config))
  self._atomizer = None
  self._created = False

  cache.cachepath('corpus').mkdir(parents=True, exist_ok=True)
  hc = hashcache.HashCache(cache.cachepath('hashcache.db'), 'sha1')
  self.content_id = ResolveContentId(self.config, hc)
  # Database of pre-processed files.
  preprocessed_id = ResolvePreprocessedId(self.content_id, self.config)
  cache.cachepath('corpus', 'preprocessed',
                  preprocessed_id).mkdir(exist_ok=True, parents=True)
  preprocessed_db_path = cache.cachepath('corpus', 'preprocessed',
                                         preprocessed_id, 'preprocessed.db')
  if (self.config.HasField('content_id')
      and not preprocessed_db_path.is_file()):
    raise errors.UserError(f"Content ID not found: '{self.content_id}'")
  self.preprocessed = preprocessed.PreprocessedContentFiles(
    preprocessed_db_path)
  # Create symlink to contentfiles.
  symlink = self.preprocessed.database_path.parent / 'contentfiles'
  if not symlink.is_symlink():
    if config.HasField('local_directory'):
      os.symlink(
        str(
          ExpandConfigPath(config.local_directory,
                           path_prefix=FLAGS.clgen_local_path_prefix)),
        symlink)
    elif config.HasField('local_tar_archive'):
      os.symlink(
        str(
          ExpandConfigPath(config.local_tar_archive,
                           path_prefix=FLAGS.clgen_local_path_prefix)),
        symlink)
  # Database of encoded pre-processed files.
  encoded_id = ResolveEncodedId(self.content_id, self.config)
  cache.cachepath('corpus', 'encoded',
                  encoded_id).mkdir(exist_ok=True, parents=True)
  self.encoded = encoded.EncodedContentFiles(
    cache.cachepath('corpus', 'encoded', encoded_id, 'encoded.db'))
  self.atomizer_path = cache.cachepath('corpus', 'encoded', encoded_id,
                                       'atomizer.pkl')
  # Create symlink to preprocessed files.
  symlink = self.encoded.database_path.parent / 'preprocessed'
  if not symlink.is_symlink():
    os.symlink(
      os.path.relpath(self.preprocessed.database_path.parent,
                      self.encoded.database_path.parent), symlink)
  self.hash = encoded_id
  self.cache = cache.mkcache('corpus', 'encoded', encoded_id)
def encoded_db(tempdir: pathlib.Path) -> encoded.EncodedContentFiles:
  """Test fixture which returns an empty EncodedContentFiles database."""
  db = encoded.EncodedContentFiles(f"sqlite:///{tempdir}/encoded")
  return db
def __init__(self,
             config: typing.Union[corpus_pb2.Corpus, corpus_pb2.PreTrainCorpus]
             ):
  """Instantiate a corpus from a proto config.

  If this is a new corpus, a number of files will be created, which may
  take some time.

  Args:
    config: A Corpus or PreTrainCorpus message.

  Raises:
    TypeError: If the config argument is not a Corpus proto.
    UserError: In case the corpus is not found, or config contains invalid
      options.
    EmptyCorpusException: In case the corpus contains no data.
  """
  if not isinstance(config, corpus_pb2.Corpus) and not isinstance(config, corpus_pb2.PreTrainCorpus):
    raise TypeError(f"Config must be a Corpus proto. Received: '{type(config).__name__}'")
  # Make a local copy of the configuration.
  if isinstance(config, corpus_pb2.Corpus):
    self.config = corpus_pb2.Corpus()
    self.pre_train = False
  else:
    self.config = corpus_pb2.PreTrainCorpus()
    self.pre_train = True
  self.config.CopyFrom(AssertConfigIsValid(config))
  self._tokenizer = None
  self._created = False

  # An in-memory cache of the encoded contentfiles indices arrays.
  # Set and used in GetTrainingData().
  self._indices_arrays: typing.Optional[typing.List[np.array]] = None

  if environment.WORLD_RANK == 0:
    cache.cachepath("corpus").mkdir(parents=True, exist_ok=True)
  distrib.barrier()
  self.content_id = ResolveContentId(self.config)
  # Database of pre-processed files.
  preprocessed_id = ResolvePreprocessedId(self.content_id, self.config)
  if environment.WORLD_RANK == 0:
    cache.cachepath("corpus", "preprocessed",
                    preprocessed_id).mkdir(exist_ok=True, parents=True)
  distrib.barrier()
  preprocessed_db_path = cache.cachepath("corpus", "preprocessed",
                                         preprocessed_id, "preprocessed.db")
  if self.config.HasField("content_id") and not preprocessed_db_path.is_file():
    raise ValueError(f"Content ID not found: '{self.content_id}'")
  self.preprocessed = preprocessed.PreprocessedContentFiles(
    f"sqlite:///{preprocessed_db_path}"
  )
  # Create symlink to contentfiles.
  if environment.WORLD_RANK == 0:
    symlink = (pathlib.Path(self.preprocessed.url[len("sqlite:///"):]).parent
               / "contentfiles")
    if not symlink.is_symlink():
      if config.HasField("local_directory"):
        os.symlink(
          str(ExpandConfigPath(config.local_directory, path_prefix=FLAGS.clgen_local_path_prefix)),
          symlink,
        )
      elif config.HasField("local_tar_archive"):
        os.symlink(
          str(ExpandConfigPath(config.local_tar_archive, path_prefix=FLAGS.clgen_local_path_prefix)),
          symlink,
        )
      elif config.HasField("bq_database"):
        os.symlink(
          str(ExpandConfigPath(config.bq_database, path_prefix=FLAGS.clgen_local_path_prefix)),
          symlink,
        )
      # elif config.HasField("fetch_github"):
      #   os.symlink(
      #     str(ExpandConfigPath(config.fetch_github, path_prefix=FLAGS.clgen_local_path_prefix)),
      #     symlink,
      #   )
  distrib.barrier()
  # Database of encoded pre-processed files.
  encoded_id = ResolveEncodedId(self.content_id, self.config)
  if environment.WORLD_RANK == 0:
    cache.cachepath("corpus", "encoded",
                    encoded_id).mkdir(exist_ok=True, parents=True)
  distrib.barrier()
  db_path = cache.cachepath("corpus", "encoded", encoded_id, "encoded.db")
  if self.config.HasField("pre_encoded_corpus_url"):
    self.encoded = encoded.EncodedContentFiles(config.pre_encoded_corpus_url,
                                               self.pre_train)
  else:
    self.encoded = encoded.EncodedContentFiles(f"sqlite:///{db_path}",
                                               self.pre_train)
  self.tokenizer_path = cache.cachepath(
    "corpus", "encoded", encoded_id, "tokenizer.pkl"
  )
  # Create symlink to preprocessed files.
  if environment.WORLD_RANK == 0 and not self.config.HasField("pre_encoded_corpus_url"):
    symlink = (pathlib.Path(self.encoded.url[len("sqlite:///"):]).parent
               / "preprocessed")
    if not symlink.is_symlink():
      os.symlink(
        os.path.relpath(
          pathlib.Path(self.preprocessed.url[len("sqlite:///"):]).parent,
          pathlib.Path(self.encoded.url[len("sqlite:///"):]).parent,
        ),
        symlink,
      )
  self.hash = encoded_id
  self.cache = cache.mkcache("corpus", "encoded", encoded_id)
  if environment.WORLD_RANK == 0:
    commit.saveCommit(self.cache.path)
    commit.saveCommit(self.cache.path.parent.parent / "preprocessed" / preprocessed_id)
  distrib.barrier()
  l.logger().info("Initialized {}train corpus in {}".format(
    "pre_" if self.pre_train else "", self.cache.path))
  return
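# A minimal, hypothetical sketch (not from the original sources) of how the
# constructor above might be driven from a proto config. The class name
# `Corpus` and the example directory path are assumptions; `local_directory`
# is taken from the HasField() checks in the constructor itself.
config = corpus_pb2.Corpus()
config.local_directory = "/path/to/contentfiles"  # hypothetical path
corpus = Corpus(config)  # assumes the __init__ above belongs to a `Corpus` class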