def EncoderWorker(
    job: internal_pb2.EncoderWorker
) -> typing.Optional[EncodedContentFile]:
  """Encode one preprocessed content file, or None if it cannot be encoded.

  Args:
    job: An EncoderWorker proto carrying the file id/text, a pickled atomizer,
      and the content file separator.

  Returns:
    An EncodedContentFile on success, else None when the atomizer's vocabulary
    cannot cover the text.
  """
  # TODO(cec): There is a bug in the atomizer creation logic such that the
  # derived atomizer is not always capable of encoding the preprocessed files.
  # Once this has been fixed, there is no need to catch the VocabError here,
  # and EncoderWorker can always return an EncodedContentFile instance.
  pp_file = preprocessed.PreprocessedContentFile(id=job.id, text=job.text)
  atomizer = pickle.loads(job.pickled_atomizer)
  try:
    return EncodedContentFile.FromPreprocessed(
        pp_file, atomizer, job.contentfile_separator)
  except errors.VocabError:
    return None
def _PreprocessedContentFile(
    relpath: str, text: str,
    preprocessing_succeeded: bool) -> preprocessed.PreprocessedContentFile:
  """Build a PreprocessedContentFile record with placeholder metadata.

  Only the relpath, text and success flag are meaningful; hashes and counts
  are filled with dummy values ("000" / 0).
  """
  dummy_sha = "000"
  return preprocessed.PreprocessedContentFile(
      input_relpath=relpath,
      input_sha256=dummy_sha,
      input_charcount=0,
      input_linecount=0,
      sha256=dummy_sha,
      charcount=0,
      linecount=0,
      text=text,
      preprocessing_succeeded=preprocessing_succeeded,
      preprocess_time_ms=0,
      wall_time_ms=0,
  )
def EncoderWorker(
    job: internal_pb2.EncoderWorker,
    tokenizer,
    contentfile_separator,
    is_pre_train,
) -> typing.Optional[EncodedContentFile]:
  """Encode a single content file.

  Args:
    job: An EncoderWorker proto carrying the file id and text.
    tokenizer: The tokenizer used to encode the text.
    contentfile_separator: Separator string appended between content files.
    is_pre_train: Whether encoding is for the pre-training corpus.

  Returns:
    An EncodedContentFile instance.

  Raises:
    Exception: Whatever EncodedContentFile.FromPreprocessed raises; errors
      propagate unmodified to the caller.
  """
  # Fixed: the previous `try: ... except Exception as e: raise e` wrapper was
  # a no-op that only obscured the traceback; exceptions now propagate
  # naturally. (The old TODO about catching VocabError no longer applies —
  # this variant does not swallow encoding errors.)
  return EncodedContentFile.FromPreprocessed(
      preprocessed.PreprocessedContentFile(id=job.id, text=job.text),
      tokenizer,
      contentfile_separator,
      is_pre_train,
  )
def PreprocessContentfiles(
    cfs: typing.List[contentfiles.ContentFile],
) -> typing.List[preprocessed.PreprocessedContentFile]:
  """Preprocess a batch of content files into PreprocessedContentFile records.

  Runs the preprocessor over the texts of all input files, wraps each outcome
  in a PreprocessedContentFile, then marks any record whose text contains a
  secret as failed (replacing the text) before returning.
  """
  begin = time.time()
  result_message = PreprocessStringList([cf.text for cf in cfs])
  elapsed_ms = int((time.time() - begin) * 1000)
  # The preprocessor must report exactly one outcome and one timing per input.
  assert (len(cfs) == len(result_message.outcome) == len(
      result_message.preprocess_time_ms))
  records = []
  for cf, outcome, pp_ms in zip(cfs, result_message.outcome,
                                result_message.preprocess_time_ms):
    succeeded = (
        outcome.status == internal_pb2.PreprocessorWorkerJobOutcome.OK)
    records.append(
        preprocessed.PreprocessedContentFile(
            input_relpath=
            f"{cf.clone_from_url}:{cf.relpath}:{cf.artifact_index}",
            input_sha256=cf.sha256,
            input_charcount=cf.charcount,
            input_linecount=cf.linecount,
            sha256=hashlib.sha256(
                outcome.contents.encode("utf-8")).hexdigest(),
            charcount=len(outcome.contents),
            linecount=len(outcome.contents.split("\n")),
            text=outcome.contents,
            preprocessing_succeeded=succeeded,
            preprocess_time_ms=pp_ms,
            wall_time_ms=elapsed_ms,
        ))
  # Scan for secrets: a successful record containing a secret is demoted to
  # failed and its text replaced, so secrets never reach the database.
  for record in records:
    if not record.preprocessing_succeeded:
      continue
    try:
      secrets.ScanForSecrets(record.text)
    except secrets.TextContainsSecret as e:
      record.preprocessing_succeeded = False
      record.text = f"Text contains secrets: {e}"
  return records
def abc_preprocessed() -> preprocessed.PreprocessedContentFile:
  """Test fixture producing a single preprocessed content file record."""
  record = preprocessed.PreprocessedContentFile(id=123, text="aabbccddee")
  return record
def preprocessed_db(
    tempdir: pathlib.Path,
) -> preprocessed.PreprocessedContentFiles:
  """A preprocessed database with three files:

    a  -> Hello, world
    a2 -> This is a duplicate (has same sha256 as 'a')
    b  -> Hello, foo
    c  -> ERROR: failure (not successfully preprocessed)
  """
  db = preprocessed.PreprocessedContentFiles(
      f"sqlite:///{tempdir}/preprocessed.db")
  # (relpath, sha256, text, succeeded) per row; all other fields are constant.
  # 'a' and 'a2' deliberately share a sha256 to exercise duplicate handling.
  row_specs = [
      ("a", "00000000", "Hello, world", True),
      ("a2", "00000000", "This is a duplicate", True),
      ("b", "11111111", "Hello, foo", True),
      ("c", "22222222", "ERROR: failure", False),
  ]
  with db.Session(commit=True) as session:
    session.add_all([
        preprocessed.PreprocessedContentFile(
            input_relpath=relpath,
            input_sha256=sha,
            input_charcount=10,
            input_linecount=10,
            sha256=sha,
            charcount=10,
            linecount=1,
            text=text,
            preprocessing_succeeded=ok,
            preprocess_time_ms=4,
            wall_time_ms=4,
        ) for relpath, sha, text, ok in row_specs
    ])
  yield db