Example #1
def EncoderWorker(
        job: internal_pb2.EncoderWorker
) -> typing.Optional[EncodedContentFile]:
    """Encode a single content file."""
    # TODO(cec): There is a bug in the atomizer creation logic such that the
    # derived atomizer is not always capable of encoding the preprocessed files.
    # Once this has been fixed, there is no need to catch the VocabError here,
    # and EncoderWorker can always return an EncodedContentFile instance.
    try:
        return EncodedContentFile.FromPreprocessed(
            preprocessed.PreprocessedContentFile(id=job.id, text=job.text),
            pickle.loads(job.pickled_atomizer), job.contentfile_separator)
    except errors.VocabError:
        return None
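The pickled_atomizer field above carries a serialized atomizer inside the job message so that each worker can reconstruct it locally before encoding. The following standalone sketch (plain dicts and the standard pickle module rather than CLgen's protobuf job type; ToyAtomizer, MakeJob and Worker are illustrative names only) shows the same round-trip pattern:

import pickle
from typing import Optional


class ToyAtomizer:
    """A stand-in for the real atomizer: maps characters to vocabulary indices."""

    def __init__(self, vocab: dict):
        self.vocab = vocab

    def AtomizeString(self, text: str):
        return [self.vocab[c] for c in text]


def MakeJob(text: str, atomizer: ToyAtomizer) -> dict:
    # Serialize the atomizer into the job so it can cross process boundaries.
    return {"text": text, "pickled_atomizer": pickle.dumps(atomizer)}


def Worker(job: dict) -> Optional[list]:
    # Reconstruct the atomizer inside the worker, mirroring pickle.loads() above.
    atomizer = pickle.loads(job["pickled_atomizer"])
    try:
        return atomizer.AtomizeString(job["text"])
    except KeyError:  # Analogue of errors.VocabError: token not in the vocabulary.
        return None


print(Worker(MakeJob("abc", ToyAtomizer({"a": 0, "b": 1, "c": 2}))))  # [0, 1, 2]
print(Worker(MakeJob("xyz", ToyAtomizer({"a": 0}))))                  # None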
Example #2
def _PreprocessedContentFile(
        relpath: str, text: str,
        preprocessing_succeeded: bool) -> preprocessed.PreprocessedContentFile:
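    """Build a PreprocessedContentFile with placeholder hashes and counts for the given text."""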
    return preprocessed.PreprocessedContentFile(
        input_relpath=relpath,
        input_sha256="000",
        input_charcount=0,
        input_linecount=0,
        sha256="000",
        charcount=0,
        linecount=0,
        text=text,
        preprocessing_succeeded=preprocessing_succeeded,
        preprocess_time_ms=0,
        wall_time_ms=0,
    )
Example #3
def EncoderWorker(
  job: internal_pb2.EncoderWorker,
  tokenizer,
  contentfile_separator,
  is_pre_train,
) -> typing.Optional[EncodedContentFile]:
  """Encode a single content file."""
  # TODO(cec): There is a bug in the tokenizer creation logic such that the
  # derived tokenizer is not always capable of encoding the preprocessed files.
  # Once this has been fixed, there is no need to catch the VocabError here,
  # and EncoderWorker can always return an EncodedContentFile instance.
  try:
    return EncodedContentFile.FromPreprocessed(
      preprocessed.PreprocessedContentFile(id=job.id, text=job.text),
      tokenizer,
      contentfile_separator,
      is_pre_train,
    )
  except Exception as e:
    raise e
Example #4
def PreprocessContentfiles(
    cfs: typing.List[contentfiles.ContentFile],
) -> typing.List[preprocessed.PreprocessedContentFile]:
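    """Preprocess a batch of content files and return the resulting records."""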
    start_time = time.time()
    output_message = PreprocessStringList([cf.text for cf in cfs])
    wall_time_ms = int((time.time() - start_time) * 1000)

    assert (len(cfs) == len(output_message.outcome) == len(
        output_message.preprocess_time_ms))

    pp_cfs = [
        preprocessed.PreprocessedContentFile(
            input_relpath=
            f"{cf.clone_from_url}:{cf.relpath}:{cf.artifact_index}",
            input_sha256=cf.sha256,
            input_charcount=cf.charcount,
            input_linecount=cf.linecount,
            sha256=hashlib.sha256(
                outcome.contents.encode("utf-8")).hexdigest(),
            charcount=len(outcome.contents),
            linecount=len(outcome.contents.split("\n")),
            text=outcome.contents,
            preprocessing_succeeded=(outcome.status == internal_pb2.
                                     PreprocessorWorkerJobOutcome.OK),
            preprocess_time_ms=preprocess_time_ms,
            wall_time_ms=wall_time_ms,
        ) for cf, outcome, preprocess_time_ms in zip(
            cfs, output_message.outcome, output_message.preprocess_time_ms)
    ]

    # Scan for secrets.
    for pp_cf in pp_cfs:
        if pp_cf.preprocessing_succeeded:
            try:
                secrets.ScanForSecrets(pp_cf.text)
            except secrets.TextContainsSecret as e:
                pp_cf.preprocessing_succeeded = False
                pp_cf.text = f"Text contains secrets: {e}"

    return pp_cfs
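Example #4 fills in each record's sha256, charcount and linecount from the preprocessed text. A minimal standalone illustration of that metadata computation (standard library only; TextMetadata is not one of the project's helpers):

import hashlib


def TextMetadata(text: str) -> dict:
    """Compute the sha256 / charcount / linecount fields used above."""
    return {
        "sha256": hashlib.sha256(text.encode("utf-8")).hexdigest(),
        "charcount": len(text),
        "linecount": len(text.split("\n")),
    }


print(TextMetadata("Hello, world\n"))
# e.g. {'sha256': '<hex digest>', 'charcount': 13, 'linecount': 2}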
Example #5
def abc_preprocessed() -> preprocessed.PreprocessedContentFile:
    """A test fixture which returns a preprocessed content file."""
    return preprocessed.PreprocessedContentFile(id=123, text="aabbccddee")
Example #6
def preprocessed_db(
    tempdir: pathlib.Path, ) -> preprocessed.PreprocessedContentFiles:
    """A preprocessed database with three files:

    a -> Hello, world
    a2 -> This is a duplicate (has same sha256 as 'a')
    b -> Hello, foo
    c -> ERROR: failure (not successfully preprocessed)
  """
    db = preprocessed.PreprocessedContentFiles(
        f"sqlite:///{tempdir}/preprocessed.db")

    with db.Session(commit=True) as session:
        session.add_all([
            preprocessed.PreprocessedContentFile(
                input_relpath="a",
                input_sha256="00000000",
                input_charcount=10,
                input_linecount=10,
                sha256="00000000",
                charcount=10,
                linecount=1,
                text="Hello, world",
                preprocessing_succeeded=True,
                preprocess_time_ms=4,
                wall_time_ms=4,
            ),
            preprocessed.PreprocessedContentFile(
                input_relpath="a2",
                input_sha256="00000000",
                input_charcount=10,
                input_linecount=10,
                sha256="00000000",
                charcount=10,
                linecount=1,
                text="This is a duplicate",
                preprocessing_succeeded=True,
                preprocess_time_ms=4,
                wall_time_ms=4,
            ),
            preprocessed.PreprocessedContentFile(
                input_relpath="b",
                input_sha256="11111111",
                input_charcount=10,
                input_linecount=10,
                sha256="11111111",
                charcount=10,
                linecount=1,
                text="Hello, foo",
                preprocessing_succeeded=True,
                preprocess_time_ms=4,
                wall_time_ms=4,
            ),
            preprocessed.PreprocessedContentFile(
                input_relpath="c",
                input_sha256="22222222",
                input_charcount=10,
                input_linecount=10,
                sha256="22222222",
                charcount=10,
                linecount=1,
                text="ERROR: failure",
                preprocessing_succeeded=False,
                preprocess_time_ms=4,
                wall_time_ms=4,
            ),
        ])
    yield db
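The preprocessed_db fixture above follows the usual pytest pattern of yielding a freshly populated SQLite database for each test. A much-reduced standalone sketch of the same pattern, using plain SQLAlchemy in place of the project's PreprocessedContentFiles wrapper (the Record model, record_db fixture and test name here are illustrative only):

import pathlib

import pytest
import sqlalchemy as sql
from sqlalchemy.orm import declarative_base, sessionmaker

Base = declarative_base()


class Record(Base):
    """A minimal stand-in for PreprocessedContentFile."""
    __tablename__ = "records"
    id = sql.Column(sql.Integer, primary_key=True)
    sha256 = sql.Column(sql.String(64), nullable=False)
    text = sql.Column(sql.UnicodeText(), nullable=False)
    preprocessing_succeeded = sql.Column(sql.Boolean, nullable=False)


@pytest.fixture
def record_db(tmp_path: pathlib.Path):
    """Yield a session bound to a throwaway on-disk SQLite database."""
    engine = sql.create_engine(f"sqlite:///{tmp_path}/preprocessed.db")
    Base.metadata.create_all(engine)
    session = sessionmaker(bind=engine)()
    session.add_all([
        Record(sha256="00000000", text="Hello, world", preprocessing_succeeded=True),
        Record(sha256="22222222", text="ERROR: failure", preprocessing_succeeded=False),
    ])
    session.commit()
    yield session
    session.close()


def test_record_count(record_db):
    assert record_db.query(Record).count() == 2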