def __init__(self, config: corpus_pb2.Corpus): """Instantiate a corpus from a proto config. If this is a new corpus, a number of files will be created, which may take some time. Args: config: A Corpus message. Raises: TypeError: If the config argument is not a Sampler proto. UserError: In case the corpus is not found, or config contains invalid options. EmptyCorpusException: In case the corpus contains no data. """ if not isinstance(config, corpus_pb2.Corpus): t = type(config).__name__ raise TypeError(f"Config must be a Corpus proto. Received: '{t}'") # Make a local copy of the configuration. self.config = corpus_pb2.Corpus() self.config.CopyFrom(AssertConfigIsValid(config)) self._atomizer = None self._created = False self.dashboard_db = dashboard_db.GetDatabase() self._dashboard_db_id: typing.Optional[int] = None # Set in Create() # An in-memory cache of the encoded contentfiles indices arrays. # Set and used in GetTrainingData(). self._indices_arrays: typing.Optional[typing.List[np.array]] = None cache.cachepath("corpus").mkdir(parents=True, exist_ok=True) hc = hashcache.HashCache(cache.cachepath("hashcache.db"), "sha1") self.content_id = ResolveContentId(self.config, hc) # Database of pre-processed files. preprocessed_id = ResolvePreprocessedId(self.content_id, self.config) cache.cachepath("corpus", "preprocessed", preprocessed_id).mkdir(exist_ok=True, parents=True) preprocessed_db_path = cache.cachepath("corpus", "preprocessed", preprocessed_id, "preprocessed.db") if (self.config.HasField("content_id") and not preprocessed_db_path.is_file()): raise errors.UserError( f"Content ID not found: '{self.content_id}'") self.preprocessed = preprocessed.PreprocessedContentFiles( f"sqlite:///{preprocessed_db_path}") # Create symlink to contentfiles. symlink = ( pathlib.Path(self.preprocessed.url[len("sqlite:///"):]).parent / "contentfiles") if not symlink.is_symlink(): if config.HasField("local_directory"): os.symlink( str( ExpandConfigPath( config.local_directory, path_prefix=FLAGS.clgen_local_path_prefix)), symlink, ) elif config.HasField("local_tar_archive"): os.symlink( str( ExpandConfigPath( config.local_tar_archive, path_prefix=FLAGS.clgen_local_path_prefix, )), symlink, ) # Data of encoded pre-preprocessed files. encoded_id = ResolveEncodedId(self.content_id, self.config) cache.cachepath("corpus", "encoded", encoded_id).mkdir(exist_ok=True, parents=True) db_path = cache.cachepath("corpus", "encoded", encoded_id, "encoded.db") # TODO(github.com/ChrisCummins/clgen/issues/130): Refactor this conditional # logic by making Corpus an abstract class and creating concrete subclasses # for the different types of corpus. if self.config.HasField("pre_encoded_corpus_url"): self.encoded = encoded.EncodedContentFiles( config.pre_encoded_corpus_url) else: self.encoded = encoded.EncodedContentFiles(f"sqlite:///{db_path}") self.atomizer_path = cache.cachepath("corpus", "encoded", encoded_id, "atomizer.pkl") # Create symlink to preprocessed files. # TODO(github.com/ChrisCummins/clgen/issues/130): Refactor this conditional # logic after splitting Corpus class. if not self.config.HasField("pre_encoded_corpus_url"): symlink = ( pathlib.Path(self.encoded.url[len("sqlite:///"):]).parent / "preprocessed") if not symlink.is_symlink(): os.symlink( os.path.relpath( pathlib.Path( self.preprocessed.url[len("sqlite:///"):]).parent, pathlib.Path( self.encoded.url[len("sqlite:///"):]).parent, ), symlink, ) self.hash = encoded_id self.cache = cache.mkcache("corpus", "encoded", encoded_id)
def __init__(self, config: corpus_pb2.Corpus): """Instantiate a corpus from a proto config. If this is a new corpus, a number of files will be created, which may take some time. Args: config: A Corpus message. Raises: TypeError: If the config argument is not a Sampler proto. UserError: In case the corpus is not found, or config contains invalid options. EmptyCorpusException: In case the corpus contains no data. """ if not isinstance(config, corpus_pb2.Corpus): t = type(config).__name__ raise TypeError(f"Config must be a Corpus proto. Received: '{t}'") # Make a local copy of the configuration. self.config = corpus_pb2.Corpus() self.config.CopyFrom(AssertConfigIsValid(config)) self._atomizer = None self._created = False cache.cachepath('corpus').mkdir(parents=True, exist_ok=True) hc = hashcache.HashCache(cache.cachepath('hashcache.db'), 'sha1') self.content_id = ResolveContentId(self.config, hc) # Database of pre-processed files. preprocessed_id = ResolvePreprocessedId(self.content_id, self.config) cache.cachepath('corpus', 'preprocessed', preprocessed_id).mkdir(exist_ok=True, parents=True) preprocessed_db_path = cache.cachepath('corpus', 'preprocessed', preprocessed_id, 'preprocessed.db') if (self.config.HasField('content_id') and not preprocessed_db_path.is_file()): raise errors.UserError( f"Content ID not found: '{self.content_id}'") self.preprocessed = preprocessed.PreprocessedContentFiles( preprocessed_db_path) # Create symlink to contentfiles. symlink = self.preprocessed.database_path.parent / 'contentfiles' if not symlink.is_symlink(): if config.HasField('local_directory'): os.symlink( str( ExpandConfigPath( config.local_directory, path_prefix=FLAGS.clgen_local_path_prefix)), symlink) elif config.HasField('local_tar_archive'): os.symlink( str( ExpandConfigPath( config.local_tar_archive, path_prefix=FLAGS.clgen_local_path_prefix)), symlink) # Data of encoded pre-preprocessed files. encoded_id = ResolveEncodedId(self.content_id, self.config) cache.cachepath('corpus', 'encoded', encoded_id).mkdir(exist_ok=True, parents=True) self.encoded = encoded.EncodedContentFiles( cache.cachepath('corpus', 'encoded', encoded_id, 'encoded.db')) self.atomizer_path = cache.cachepath('corpus', 'encoded', encoded_id, 'atomizer.pkl') # Create symlink to preprocessed files. symlink = self.encoded.database_path.parent / 'preprocessed' if not symlink.is_symlink(): os.symlink( os.path.relpath(self.preprocessed.database_path.parent, self.encoded.database_path.parent), symlink) self.hash = encoded_id self.cache = cache.mkcache('corpus', 'encoded', encoded_id)