Ejemplo n.º 1
0
def IndexContentFiles(job: scrape_repos_pb2.ImportWorker) -> None:
  """Preprocess one file of a clone and write each output to the index.

  The file's path relative to `job.clone_dir` is passed through
  `preprocessors.Preprocess` together with `job.preprocessors`. Each text
  that comes back is wrapped in a `ContentFile` proto and written with
  `pbutil.ToFile` to `job.index_dir`, named "<hex sha256 of text>.pbtxt".

  Args:
    job: The import job. Its abspath, clone_dir, clone_from_url,
      all_files_relpaths, preprocessors and index_dir fields are read.

  A UnicodeDecodeError raised while preprocessing or writing is reported
  through `app.Warning` and not propagated.
  """
  # Strip the clone directory prefix plus the path separator following it.
  prefix_length = len(str(job.clone_dir)) + 1
  relpath = job.abspath[prefix_length:]
  try:
    clone_root = pathlib.Path(job.clone_dir)
    preprocessed = preprocessors.Preprocess(
      clone_root, relpath, job.all_files_relpaths, job.preprocessors
    )
    for artifact_index, contents in enumerate(preprocessed):
      digest = hashlib.sha256(contents.encode("utf-8"))
      content_file = scrape_repos_pb2.ContentFile(
        clone_from_url=job.clone_from_url,
        relpath=relpath,
        artifact_index=artifact_index,
        sha256=digest.digest(),
        charcount=len(contents),
        # One more line than there are newline characters.
        linecount=contents.count("\n") + 1,
        text=contents,
      )
      # The output file is named after the hex form of the text's checksum.
      filename = digest.hexdigest() + ".pbtxt"
      pbutil.ToFile(content_file, pathlib.Path(job.index_dir) / filename)
  except UnicodeDecodeError:
    app.Warning("Failed to decode %s", relpath)
Ejemplo n.º 2
0
def test_Preprocess_mock_preprocessor(tempdir):
    """Test that the output of a single mock preprocessor is returned."""
    MakeFile(tempdir, 'a', 'hello')
    # Preprocessors are named as "<module path>:<function name>".
    mock_preprocessor = (
        'datasets.github.scrape_repos.preprocessors.preprocessors_test'
        ':MockPreprocessor'
    )
    results = preprocessors.Preprocess(
        tempdir, 'a', ['a'], [mock_preprocessor])
    assert results == ['PREPROCESSED']
Ejemplo n.º 3
0
def test_Preprocess_mock_preprocessor_exception(tempdir):
    """Test that a ValueError raised during preprocessing reaches the caller."""
    MakeFile(tempdir, 'a', 'hello')
    # Preprocessors are named as "<module path>:<function name>".
    failing_preprocessor = (
        'datasets.github.scrape_repos.preprocessors.preprocessors_test'
        ':MockPreprocessorInternalError'
    )
    with pytest.raises(ValueError):
        preprocessors.Preprocess(tempdir, 'a', ['a'], [failing_preprocessor])
Ejemplo n.º 4
0
def test_Preprocess_mock_preprocessor(tempdir):
  """Test that the output of a single mock preprocessor is returned."""
  MakeFile(tempdir, "a", "hello")
  # Preprocessors are named as "<module path>:<function name>".
  mock_preprocessor = (
    "datasets.github.scrape_repos.preprocessors.preprocessors_test"
    ":MockPreprocessor"
  )
  results = preprocessors.Preprocess(tempdir, "a", ["a"], [mock_preprocessor])
  assert results == ["PREPROCESSED"]
Ejemplo n.º 5
0
def test_Preprocess_mock_preprocessor_exception(tempdir):
  """Test that a ValueError raised during preprocessing reaches the caller."""
  MakeFile(tempdir, "a", "hello")
  # Preprocessors are named as "<module path>:<function name>".
  failing_preprocessor = (
    "datasets.github.scrape_repos.preprocessors.preprocessors_test"
    ":MockPreprocessorInternalError"
  )
  with test.Raises(ValueError):
    preprocessors.Preprocess(tempdir, "a", ["a"], [failing_preprocessor])
Ejemplo n.º 6
0
def ImportWorker(
    job: scrape_repos_pb2.ImportWorker
) -> typing.List[contentfiles.ContentFile]:
  """Preprocess one file of a clone and build a ContentFile per output.

  Args:
    job: The import job. Its abspath, clone_dir, clone_from_url,
      all_files_relpaths and preprocessors fields are read.

  Returns:
    One ContentFile per text returned by `preprocessors.Preprocess`, in
    order. If a UnicodeDecodeError is raised, a warning is logged and the
    entries built up to that point (possibly none) are returned.
  """
  # Strip the clone directory prefix plus the path separator following it.
  prefix_length = len(str(job.clone_dir)) + 1
  relpath = job.abspath[prefix_length:]
  outputs: typing.List[contentfiles.ContentFile] = []
  try:
    clone_root = pathlib.Path(job.clone_dir)
    preprocessed = preprocessors.Preprocess(
        clone_root, relpath, job.all_files_relpaths, job.preprocessors)
    for artifact_index, contents in enumerate(preprocessed):
      checksum = hashlib.sha256(contents.encode('utf-8')).digest()
      # One more line than there are newline characters.
      line_total = contents.count('\n') + 1
      content_file = contentfiles.ContentFile(
          clone_from_url=job.clone_from_url,
          relpath=relpath,
          artifact_index=artifact_index,
          sha256=checksum,
          charcount=len(contents),
          linecount=line_total,
          text=contents,
      )
      outputs.append(content_file)
  except UnicodeDecodeError:
    logging.warning('Failed to decode %s', relpath)
  return outputs
Ejemplo n.º 7
0
def test_Preprocess_no_preprocessors(tempdir):
    """Test that file text comes back unchanged with no preprocessors."""
    MakeFile(tempdir, 'a', 'hello')
    no_preprocessors = []
    results = preprocessors.Preprocess(tempdir, 'a', ['a'], no_preprocessors)
    assert results == ['hello']
Ejemplo n.º 8
0
def test_Preprocess_no_preprocessors(tempdir):
  """Test that file text comes back unchanged with no preprocessors."""
  MakeFile(tempdir, "a", "hello")
  no_preprocessors = []
  results = preprocessors.Preprocess(tempdir, "a", ["a"], no_preprocessors)
  assert results == ["hello"]