def test_EncodedContentFiles_empty_preprocessed_db(
    temp_db: encoded.EncodedContentFiles,
    abc_atomizer: atomizers.AsciiCharacterAtomizer):
  """Creating an encoded db from an empty preprocessed db must raise."""
  with tempfile.TemporaryDirectory() as d:
    # A fresh, never-populated preprocessed database.
    db_path = pathlib.Path(d) / 'preprocessed.db'
    empty_db = preprocessed.PreprocessedContentFiles(f'sqlite:///{db_path}')
    # Create() has nothing to encode, so it must signal the empty corpus.
    with pytest.raises(errors.EmptyCorpusException):
      temp_db.Create(empty_db, abc_atomizer, '\n\n')
def __init__(self, config: corpus_pb2.Corpus):
  """Instantiate a corpus from a proto config.

  If this is a new corpus, a number of files will be created, which may take
  some time.

  Args:
    config: A Corpus message.

  Raises:
    TypeError: If the config argument is not a Corpus proto.
    UserError: In case the corpus is not found, or config contains invalid
      options.
    EmptyCorpusException: In case the corpus contains no data.
  """
  if not isinstance(config, corpus_pb2.Corpus):
    t = type(config).__name__
    raise TypeError(f"Config must be a Corpus proto. Received: '{t}'")
  # Make a local copy of the configuration so later mutations of the caller's
  # proto cannot affect this corpus.
  self.config = corpus_pb2.Corpus()
  self.config.CopyFrom(AssertConfigIsValid(config))
  self._atomizer = None
  self._created = False
  self.dashboard_db = dashboard_db.GetDatabase()
  self._dashboard_db_id: typing.Optional[int] = None  # Set in Create()
  # An in-memory cache of the encoded contentfiles indices arrays.
  # Set and used in GetTrainingData().
  self._indices_arrays: typing.Optional[typing.List[np.array]] = None
  cache.cachepath("corpus").mkdir(parents=True, exist_ok=True)
  hc = hashcache.HashCache(cache.cachepath("hashcache.db"), "sha1")
  # The content id uniquely identifies the raw input data of this corpus.
  self.content_id = ResolveContentId(self.config, hc)
  # Database of pre-processed files. The preprocessed id keys a per-config
  # cache directory, so differing preprocessors never collide.
  preprocessed_id = ResolvePreprocessedId(self.content_id, self.config)
  cache.cachepath("corpus", "preprocessed",
                  preprocessed_id).mkdir(exist_ok=True, parents=True)
  preprocessed_db_path = cache.cachepath("corpus", "preprocessed",
                                         preprocessed_id, "preprocessed.db")
  # A config that names an explicit content_id refers to an existing corpus;
  # if its preprocessed database is absent the id is unknown.
  if (self.config.HasField("content_id") and
      not preprocessed_db_path.is_file()):
    raise errors.UserError(
        f"Content ID not found: '{self.content_id}'")
  self.preprocessed = preprocessed.PreprocessedContentFiles(
      f"sqlite:///{preprocessed_db_path}")
  # Create symlink to contentfiles alongside the preprocessed database, for
  # human navigation of the cache; skipped if it already exists.
  symlink = (
      pathlib.Path(self.preprocessed.url[len("sqlite:///"):]).parent /
      "contentfiles")
  if not symlink.is_symlink():
    if config.HasField("local_directory"):
      os.symlink(
          str(
              ExpandConfigPath(
                  config.local_directory,
                  path_prefix=FLAGS.clgen_local_path_prefix)),
          symlink,
      )
    elif config.HasField("local_tar_archive"):
      os.symlink(
          str(
              ExpandConfigPath(
                  config.local_tar_archive,
                  path_prefix=FLAGS.clgen_local_path_prefix,
              )),
          symlink,
      )
  # Data of encoded pre-preprocessed files.
  encoded_id = ResolveEncodedId(self.content_id, self.config)
  cache.cachepath("corpus", "encoded", encoded_id).mkdir(exist_ok=True,
                                                         parents=True)
  db_path = cache.cachepath("corpus", "encoded", encoded_id, "encoded.db")
  # TODO(github.com/ChrisCummins/clgen/issues/130): Refactor this conditional
  # logic by making Corpus an abstract class and creating concrete subclasses
  # for the different types of corpus.
  if self.config.HasField("pre_encoded_corpus_url"):
    # Use an externally-provided, already-encoded database.
    self.encoded = encoded.EncodedContentFiles(
        config.pre_encoded_corpus_url)
  else:
    self.encoded = encoded.EncodedContentFiles(f"sqlite:///{db_path}")
  self.atomizer_path = cache.cachepath("corpus", "encoded", encoded_id,
                                       "atomizer.pkl")
  # Create symlink to preprocessed files (relative, so the cache directory
  # can be relocated as a whole).
  # TODO(github.com/ChrisCummins/clgen/issues/130): Refactor this conditional
  # logic after splitting Corpus class.
  if not self.config.HasField("pre_encoded_corpus_url"):
    symlink = (
        pathlib.Path(self.encoded.url[len("sqlite:///"):]).parent /
        "preprocessed")
    if not symlink.is_symlink():
      os.symlink(
          os.path.relpath(
              pathlib.Path(
                  self.preprocessed.url[len("sqlite:///"):]).parent,
              pathlib.Path(
                  self.encoded.url[len("sqlite:///"):]).parent,
          ),
          symlink,
      )
  # The encoded id identifies this corpus as a whole.
  self.hash = encoded_id
  self.cache = cache.mkcache("corpus", "encoded", encoded_id)
def __init__(self, config: corpus_pb2.Corpus):
  """Instantiate a corpus from a proto config.

  If this is a new corpus, a number of files will be created, which may take
  some time.

  Args:
    config: A Corpus message.

  Raises:
    TypeError: If the config argument is not a Sampler proto.
    UserError: In case the corpus is not found, or config contains invalid
      options.
    EmptyCorpusException: In case the corpus contains no data.
  """
  if not isinstance(config, corpus_pb2.Corpus):
    t = type(config).__name__
    raise TypeError(f"Config must be a Corpus proto. Received: '{t}'")
  # Keep a private, validated copy of the caller's configuration.
  self.config = corpus_pb2.Corpus()
  self.config.CopyFrom(AssertConfigIsValid(config))
  self._atomizer = None
  self._created = False

  # Root of the on-disk corpus cache.
  cache.cachepath('corpus').mkdir(parents=True, exist_ok=True)
  hash_cache = hashcache.HashCache(cache.cachepath('hashcache.db'), 'sha1')
  self.content_id = ResolveContentId(self.config, hash_cache)

  # Database of pre-processed files, keyed by the preprocessing config.
  preprocessed_id = ResolvePreprocessedId(self.content_id, self.config)
  preprocessed_dir = cache.cachepath('corpus', 'preprocessed',
                                     preprocessed_id)
  preprocessed_dir.mkdir(exist_ok=True, parents=True)
  preprocessed_db_path = cache.cachepath('corpus', 'preprocessed',
                                         preprocessed_id, 'preprocessed.db')
  # An explicit content_id must refer to an existing preprocessed database.
  if (self.config.HasField('content_id') and
      not preprocessed_db_path.is_file()):
    raise errors.UserError(
        f"Content ID not found: '{self.content_id}'")
  self.preprocessed = preprocessed.PreprocessedContentFiles(
      preprocessed_db_path)

  # Symlink the raw contentfiles next to the preprocessed database. The first
  # matching source field wins, mirroring the original if/elif chain.
  contentfiles_link = (
      self.preprocessed.database_path.parent / 'contentfiles')
  if not contentfiles_link.is_symlink():
    for source_field in ('local_directory', 'local_tar_archive'):
      if config.HasField(source_field):
        source_path = ExpandConfigPath(
            getattr(config, source_field),
            path_prefix=FLAGS.clgen_local_path_prefix)
        os.symlink(str(source_path), contentfiles_link)
        break

  # Database of encoded pre-processed files.
  encoded_id = ResolveEncodedId(self.content_id, self.config)
  cache.cachepath('corpus', 'encoded', encoded_id).mkdir(exist_ok=True,
                                                         parents=True)
  self.encoded = encoded.EncodedContentFiles(
      cache.cachepath('corpus', 'encoded', encoded_id, 'encoded.db'))
  self.atomizer_path = cache.cachepath('corpus', 'encoded', encoded_id,
                                       'atomizer.pkl')

  # Symlink the preprocessed directory next to the encoded database, using a
  # relative path so the cache tree can be moved wholesale.
  preprocessed_link = self.encoded.database_path.parent / 'preprocessed'
  if not preprocessed_link.is_symlink():
    relative_target = os.path.relpath(
        self.preprocessed.database_path.parent,
        self.encoded.database_path.parent)
    os.symlink(relative_target, preprocessed_link)

  self.hash = encoded_id
  self.cache = cache.mkcache('corpus', 'encoded', encoded_id)
def preprocessed_db(
    tempdir: pathlib.Path,
) -> preprocessed.PreprocessedContentFiles:
  """A preprocessed database with three files:
    a -> Hello, world
    a2 -> This is a duplicate (has same sha256 as 'a')
    b -> Hello, foo
    c -> ERROR: failure (not successfully preprocessed)
  """
  db = preprocessed.PreprocessedContentFiles(
      f"sqlite:///{tempdir}/preprocessed.db")
  # (relpath, sha256 of input and output, text, preprocessing succeeded).
  # 'a' and 'a2' deliberately share a hash to exercise deduplication.
  rows = [
      ("a", "00000000", "Hello, world", True),
      ("a2", "00000000", "This is a duplicate", True),
      ("b", "11111111", "Hello, foo", True),
      ("c", "22222222", "ERROR: failure", False),
  ]
  with db.Session(commit=True) as session:
    for relpath, digest, text, succeeded in rows:
      session.add(
          preprocessed.PreprocessedContentFile(
              input_relpath=relpath,
              input_sha256=digest,
              input_charcount=10,
              input_linecount=10,
              sha256=digest,
              charcount=10,
              linecount=1,
              text=text,
              preprocessing_succeeded=succeeded,
              preprocess_time_ms=4,
              wall_time_ms=4,
          ))
  yield db
def preprocessed_db(
    tempdir: pathlib.Path,
) -> preprocessed.PreprocessedContentFiles:
  """Return an empty preprocessed database rooted in `tempdir`.

  Args:
    tempdir: Directory in which the sqlite database file is created.

  Returns:
    A PreprocessedContentFiles database with no records.
  """
  # Fix: the return annotation previously named PreprocessedContentFile (the
  # row class, singular) but the function returns the database object.
  # NOTE(review): the database file has no '.db' suffix, unlike the sibling
  # fixtures which use 'preprocessed.db' — confirm this is intentional.
  db = preprocessed.PreprocessedContentFiles(
      f"sqlite:///{tempdir}/preprocessed")
  return db
def __init__(self,
             config: typing.Union[corpus_pb2.Corpus,
                                  corpus_pb2.PreTrainCorpus]):
  """Instantiate a corpus from a proto config.

  If this is a new corpus, a number of files will be created, which may take
  some time. In a multi-process run, only rank 0 performs filesystem setup;
  other ranks wait at distrib.barrier() calls.

  Args:
    config: A Corpus or PreTrainCorpus message.

  Raises:
    TypeError: If the config argument is not a Corpus or PreTrainCorpus
      proto.
    ValueError: In case the corpus named by an explicit content_id is not
      found, or config contains invalid options.
    EmptyCorpusException: In case the corpus contains no data.
  """
  if not isinstance(config, corpus_pb2.Corpus) and not isinstance(config, corpus_pb2.PreTrainCorpus):
    raise TypeError(f"Config must be a Corpus proto. Received: '{type(config).__name__}'")
  # Make a local copy of the configuration; the proto type also decides
  # whether this corpus is used for pre-training.
  if isinstance(config, corpus_pb2.Corpus):
    self.config = corpus_pb2.Corpus()
    self.pre_train = False
  else:
    self.config = corpus_pb2.PreTrainCorpus()
    self.pre_train = True
  self.config.CopyFrom(AssertConfigIsValid(config))
  self._tokenizer = None
  self._created = False

  # An in-memory cache of the encoded contentfiles indices arrays.
  # Set and used in GetTrainingData().
  self._indices_arrays: typing.Optional[typing.List[np.array]] = None

  # Only rank 0 touches the filesystem; everyone syncs afterwards.
  if environment.WORLD_RANK == 0:
    cache.cachepath("corpus").mkdir(parents=True, exist_ok=True)
  distrib.barrier()
  self.content_id = ResolveContentId(self.config)
  # Database of pre-processed files.
  preprocessed_id = ResolvePreprocessedId(self.content_id, self.config)
  if environment.WORLD_RANK == 0:
    cache.cachepath("corpus", "preprocessed",
                    preprocessed_id).mkdir(exist_ok=True, parents=True)
  distrib.barrier()
  preprocessed_db_path = cache.cachepath("corpus", "preprocessed",
                                         preprocessed_id, "preprocessed.db")
  # An explicit content_id must refer to an existing preprocessed database.
  if self.config.HasField("content_id") and not preprocessed_db_path.is_file():
    raise ValueError(f"Content ID not found: '{self.content_id}'")
  self.preprocessed = preprocessed.PreprocessedContentFiles(
    f"sqlite:///{preprocessed_db_path}"
  )
  # Create symlink to contentfiles, for human navigation of the cache; the
  # first matching config source field provides the target.
  if environment.WORLD_RANK == 0:
    symlink = (pathlib.Path(self.preprocessed.url[len("sqlite:///") :]).parent
               / "contentfiles")
    if not symlink.is_symlink():
      if config.HasField("local_directory"):
        os.symlink(
          str(ExpandConfigPath(config.local_directory, path_prefix=FLAGS.clgen_local_path_prefix)),
          symlink,
        )
      elif config.HasField("local_tar_archive"):
        os.symlink(
          str(ExpandConfigPath(config.local_tar_archive, path_prefix=FLAGS.clgen_local_path_prefix)),
          symlink,
        )
      elif config.HasField("bq_database"):
        os.symlink(
          str(ExpandConfigPath(config.bq_database, path_prefix=FLAGS.clgen_local_path_prefix)),
          symlink,
        )
      # elif config.HasField("fetch_github"):
      #   os.symlink(
      #     str(ExpandConfigPath(config.fetch_github, path_prefix=FLAGS.clgen_local_path_prefix)),
      #     symlink,
      #   )
  distrib.barrier()
  # Data of encoded pre-preprocessed files.
  encoded_id = ResolveEncodedId(self.content_id, self.config)
  if environment.WORLD_RANK == 0:
    cache.cachepath("corpus", "encoded",
                    encoded_id).mkdir(exist_ok=True, parents=True)
  distrib.barrier()
  db_path = cache.cachepath("corpus", "encoded", encoded_id, "encoded.db")
  if self.config.HasField("pre_encoded_corpus_url"):
    # Use an externally-provided, already-encoded database.
    self.encoded = encoded.EncodedContentFiles(config.pre_encoded_corpus_url,
                                               self.pre_train)
  else:
    self.encoded = encoded.EncodedContentFiles(f"sqlite:///{db_path}",
                                               self.pre_train)
  self.tokenizer_path = cache.cachepath(
    "corpus", "encoded", encoded_id, "tokenizer.pkl"
  )
  # Create a relative symlink to the preprocessed files next to the encoded
  # database, so the cache tree can be relocated as a whole.
  if environment.WORLD_RANK == 0 and not self.config.HasField("pre_encoded_corpus_url"):
    symlink = (pathlib.Path(self.encoded.url[len("sqlite:///") :]).parent
               / "preprocessed")
    if not symlink.is_symlink():
      os.symlink(
        os.path.relpath(
          pathlib.Path(self.preprocessed.url[len("sqlite:///") :]).parent,
          pathlib.Path(self.encoded.url[len("sqlite:///") :]).parent,
        ),
        symlink,
      )
  # The encoded id identifies this corpus as a whole.
  self.hash = encoded_id
  self.cache = cache.mkcache("corpus", "encoded", encoded_id)
  if environment.WORLD_RANK == 0:
    # Record the current git commit in both cache directories for
    # reproducibility.
    commit.saveCommit(self.cache.path)
    commit.saveCommit(self.cache.path.parent.parent /
                      "preprocessed" / preprocessed_id)
  distrib.barrier()
  l.logger().info("Initialized {}train corpus in {}".format("pre_" if self.pre_train else "", self.cache.path))
  return