def GetImportRelpaths(self, contentfile_root: pathlib.Path) -> typing.List[str]:
  """Get relative paths to all files in the content files directory.

  Args:
    contentfile_root: The root of the content files directory.

  Returns:
    A list of paths relative to the content files root.

  Raises:
    EmptyCorpusException: If the content files directory is empty.
  """
  with fs.chdir(contentfile_root):
    find_output = (
        subprocess.check_output(["find", ".", "-type", "f"])
        .decode("utf-8")
        .strip())
    if not find_output:
      raise errors.EmptyCorpusException(
          f"Empty content files directory: '{contentfile_root}'")
    return find_output.split("\n")
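# A minimal sketch of an equivalent directory walk using only pathlib, for
# illustration of what the `find` invocation above produces. The "./" prefix
# mirrors find's output format; `contentfile_root` and
# `errors.EmptyCorpusException` are the same names as above, everything else
# is standard library. The helper name is hypothetical.
def _GetImportRelpathsSketch(
    contentfile_root: pathlib.Path) -> typing.List[str]:
  relpaths = [
      f"./{path.relative_to(contentfile_root)}"
      for path in sorted(contentfile_root.rglob("*")) if path.is_file()
  ]
  if not relpaths:
    raise errors.EmptyCorpusException(
        f"Empty content files directory: '{contentfile_root}'")
  return relpaths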
def Create(self) -> None:
  """Create the corpus files.

  Raises:
    EmptyCorpusException: If there are no content files, or no successfully
      pre-processed files.
  """
  self._created = True
  logging.info('Content ID: %s', self.content_id)
  preprocessed_lock_path = self.preprocessed.database_path.parent / 'LOCK'
  with lockfile.LockFile(preprocessed_lock_path).acquire(
      replace_stale=True, block=True):
    self.preprocessed.Create(self.config)
  if not self.preprocessed.size:
    raise errors.EmptyCorpusException(
        "Pre-processed corpus contains no files: "
        f"'{self.preprocessed.database_path}'")
  encoded_lock_path = self.encoded.database_path.parent / 'LOCK'
  with lockfile.LockFile(encoded_lock_path).acquire(
      replace_stale=True, block=True):
    start_time = time.time()
    atomizer = self.atomizer
    logging.info(
        '%s: %s tokens in %s ms', type(atomizer).__name__,
        humanize.intcomma(atomizer.vocab_size),
        humanize.intcomma(int((time.time() - start_time) * 1000)))
    for key, value in atomizer.vocab.items():
      logging.info('atomizer.vocab %s : %s', key, value)
    self.encoded.Create(
        self.preprocessed, atomizer, self.config.contentfile_separator)
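# A hedged sketch of the file-lock pattern above, built on the standard
# library's fcntl instead of the project's lockfile module. It shows the core
# idea only: lockfile.LockFile additionally records lock ownership and can
# evict stale locks (the replace_stale=True argument), which this sketch does
# not attempt. The helper name is hypothetical.
import contextlib
import fcntl

@contextlib.contextmanager
def _FileLockSketch(lock_path: pathlib.Path):
  lock_path.parent.mkdir(parents=True, exist_ok=True)
  with open(lock_path, 'w') as f:
    fcntl.flock(f, fcntl.LOCK_EX)  # Block until the exclusive lock is held.
    try:
      yield
    finally:
      fcntl.flock(f, fcntl.LOCK_UN)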
def Create(self) -> None:
  """Create the corpus files.

  Raises:
    EmptyCorpusException: If there are no content files, or no successfully
      pre-processed files.
  """
  self._created = True
  app.Log(1, 'Content ID: %s', self.content_id)

  # Nothing to do for already-encoded databases.
  # TODO(github.com/ChrisCummins/phd/issues/46): Refactor this after splitting
  # out Corpus class.
  if self.config.HasField('pre_encoded_corpus_url'):
    return

  preprocessed_lock_path = pathlib.Path(
      self.preprocessed.url[len('sqlite:///'):]).parent / 'LOCK'
  with lockfile.LockFile(preprocessed_lock_path):
    self.preprocessed.Create(self.config)
  if not self.preprocessed.size:
    raise errors.EmptyCorpusException(
        f"Pre-processed corpus contains no files: '{self.preprocessed.url}'")

  encoded_lock_path = pathlib.Path(
      self.encoded.url[len('sqlite:///'):]).parent / 'LOCK'
  with lockfile.LockFile(encoded_lock_path):
    start_time = time.time()
    atomizer = self.atomizer
    app.Log(1, '%s: %s tokens in %s ms', type(atomizer).__name__,
            humanize.Commas(atomizer.vocab_size),
            humanize.Commas(int((time.time() - start_time) * 1000)))
    self.encoded.Create(self.preprocessed, atomizer,
                        self.config.contentfile_separator)
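# A short illustration of the URL-to-path slicing above: stripping the
# 'sqlite:///' scheme prefix from an SQLAlchemy URL leaves the on-disk
# database path, and the LOCK file is placed alongside it so that concurrent
# corpus builds against the same database serialize on the same lock. The URL
# below is hypothetical.
_url = 'sqlite:////tmp/corpus/preprocessed.db'
_db_path = pathlib.Path(_url[len('sqlite:///'):])  # /tmp/corpus/preprocessed.db
_lock_path = _db_path.parent / 'LOCK'              # /tmp/corpus/LOCK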
def Import(
    self,
    session: sqlutil.Session,
    preprocessed_db: preprocessed.PreprocessedContentFiles,
    atomizer: atomizers.AtomizerBase,
    contentfile_separator: str,
) -> None:
  """Encode and import not-yet-encoded files from a pre-processed database.

  Args:
    session: A session for this encoded database.
    preprocessed_db: The pre-processed database to import files from.
    atomizer: The atomizer to encode content files with.
    contentfile_separator: The separator string for content files.
  """
  with preprocessed_db.Session() as p_session:
    # Select only successfully pre-processed files which have not already
    # been encoded.
    query = p_session.query(preprocessed.PreprocessedContentFile).filter(
        preprocessed.PreprocessedContentFile.preprocessing_succeeded == True,
        ~preprocessed.PreprocessedContentFile.id.in_(
            session.query(EncodedContentFile.id).all()),
    )
    jobs = [
        internal_pb2.EncoderWorker(
            id=x.id,
            text=x.text,
            contentfile_separator=contentfile_separator,
            pickled_atomizer=pickle.dumps(atomizer),
        ) for x in query
    ]
    if not jobs:
      raise errors.EmptyCorpusException(
          "Pre-processed corpus contains no files: "
          f"'{preprocessed_db.url}'")
    app.Log(
        1, "Encoding %s of %s preprocessed files",
        humanize.Commas(query.count()),
        humanize.Commas(
            p_session.query(preprocessed.PreprocessedContentFile).filter(
                preprocessed.PreprocessedContentFile.preprocessing_succeeded ==
                True).count()),
    )
    pool = multiprocessing.Pool()
    bar = progressbar.ProgressBar(max_value=len(jobs))
    last_commit = time.time()
    wall_time_start = time.time()
    for encoded_cf in bar(pool.imap_unordered(EncoderWorker, jobs)):
      wall_time_end = time.time()
      # TODO(cec): Remove the if check once EncoderWorker no longer returns
      # None on atomizer encode error.
      if encoded_cf:
        encoded_cf.wall_time_ms = int((wall_time_end - wall_time_start) * 1000)
        session.add(encoded_cf)
      wall_time_start = wall_time_end
      # Commit at most once every ten seconds to amortize transaction
      # overhead while bounding the work lost on interruption.
      if wall_time_end - last_commit > 10:
        session.commit()
        last_commit = wall_time_end
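# A hedged sketch of the worker contract Import() relies on: EncoderWorker
# takes one internal_pb2.EncoderWorker job, unpickles the atomizer, and
# returns an EncodedContentFile, or None when the atomizer cannot encode the
# text (which is why the loop above checks `if encoded_cf`). This illustrates
# the shape of that contract, not the project's actual implementation; the
# encoding call and exception handling here are assumptions.
def _EncoderWorkerSketch(
    job: internal_pb2.EncoderWorker
) -> typing.Optional[EncodedContentFile]:
  atomizer = pickle.loads(job.pickled_atomizer)
  try:
    indices = atomizer.AtomizeString(job.text + job.contentfile_separator)
  except errors.VocabError:
    return None
  return EncodedContentFile(
      id=job.id,
      data=",".join(str(i) for i in indices),
      tokencount=len(indices),
  )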
def Create(self) -> None:
  """Create the corpus files.

  Raises:
    EmptyCorpusException: If there are no content files, or no successfully
      pre-processed files.
  """
  self._created = True
  app.Log(1, "Content ID: %s", self.content_id)

  # Nothing to do for already-encoded databases.
  # TODO(github.com/ChrisCummins/clgen/issues/130): Refactor this after
  # splitting out Corpus class.
  if self.config.HasField("pre_encoded_corpus_url"):
    with self.dashboard_db.Session(commit=True) as session:
      config_to_store = corpus_pb2.Corpus()
      config_to_store.CopyFrom(self.config)
      # Clear the contentfiles field, since we use the content_id to uniquely
      # identify the input files. This means that corpuses with the same
      # content files delivered through different means (e.g. two separate
      # but identical directories) have the same hash.
      config_to_store.ClearField("contentfiles")
      corpus = session.GetOrAdd(
          dashboard_db.Corpus,
          config_proto_sha1=crypto.sha1(config_to_store.SerializeToString()),
          config_proto=str(config_to_store),
          preprocessed_url="",
          encoded_url=self.encoded.url,
          summary=self.GetShortSummary(),
      )
      session.flush()
      self._dashboard_db_id = corpus.id
    return

  preprocessed_lock_path = (
      pathlib.Path(self.preprocessed.url[len("sqlite:///"):]).parent / "LOCK")
  with lockfile.LockFile(preprocessed_lock_path):
    self.preprocessed.Create(self.config)
  if not self.preprocessed.size:
    raise errors.EmptyCorpusException(
        f"Pre-processed corpus contains no files: '{self.preprocessed.url}'")

  encoded_lock_path = (
      pathlib.Path(self.encoded.url[len("sqlite:///"):]).parent / "LOCK")
  with lockfile.LockFile(encoded_lock_path):
    start_time = time.time()
    atomizer = self.atomizer
    app.Log(
        1,
        "%s: %s tokens in %s ms",
        type(atomizer).__name__,
        humanize.Commas(atomizer.vocab_size),
        humanize.Commas(int((time.time() - start_time) * 1000)),
    )
    self.encoded.Create(self.preprocessed, atomizer,
                        self.config.contentfile_separator)

  # Add entry to dashboard database.
  with self.dashboard_db.Session(commit=True) as session:
    config_to_store = corpus_pb2.Corpus()
    config_to_store.CopyFrom(self.config)
    # Clear the contentfiles field, since we use the content_id to uniquely
    # identify the input files. This means that corpuses with the same content
    # files delivered through different means (e.g. two separate but identical
    # directories) have the same hash.
    config_to_store.ClearField("contentfiles")
    corpus = session.GetOrAdd(
        dashboard_db.Corpus,
        config_proto_sha1=crypto.sha1(config_to_store.SerializeToString()),
        config_proto=str(config_to_store),
        preprocessed_url=self.preprocessed.url,
        encoded_url=self.encoded.url,
        summary=self.GetShortSummary(),
    )
    session.flush()
    self._dashboard_db_id = corpus.id
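# A short sketch of the config fingerprinting used twice above, with the
# standard library's hashlib standing in for the project's crypto.sha1
# helper. Clearing the contentfiles field before hashing means two configs
# that reference identical content through different paths produce the same
# SHA1. `my_config` and the helper name are hypothetical.
import hashlib

def _ConfigFingerprintSketch(my_config: corpus_pb2.Corpus) -> str:
  config_to_store = corpus_pb2.Corpus()
  config_to_store.CopyFrom(my_config)
  config_to_store.ClearField("contentfiles")
  return hashlib.sha1(config_to_store.SerializeToString()).hexdigest()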