def IndexContentFiles(job: scrape_repos_pb2.ImportWorker) -> None:
    """Preprocess one cloned file and write each result to the index directory.

    The file's path relative to the clone directory is obtained by slicing
    the clone directory prefix (plus one separator character) off
    ``job.abspath``. Every text returned by ``preprocessors.Preprocess`` is
    wrapped in a ``ContentFile`` proto and handed to ``pbutil.ToFile`` with a
    destination of ``<job.index_dir>/<hex sha256 of the text>.pbtxt``.

    Args:
      job: The import job naming the file, its clone directory, the index
        directory, and the preprocessors to run.
    """
    prefix_len = len(str(job.clone_dir)) + 1
    relpath = job.abspath[prefix_len:]
    try:
        artifacts = preprocessors.Preprocess(
            pathlib.Path(job.clone_dir),
            relpath,
            job.all_files_relpaths,
            job.preprocessors,
        )
        for artifact_index, contents in enumerate(artifacts):
            digest = hashlib.sha256(contents.encode("utf-8")).digest()
            content_file = scrape_repos_pb2.ContentFile(
                clone_from_url=job.clone_from_url,
                relpath=relpath,
                artifact_index=artifact_index,
                sha256=digest,
                charcount=len(contents),
                linecount=len(contents.split("\n")),
                text=contents,
            )
            # The output file is named after the hex form of the checksum
            # stored in the proto.
            file_stem = binascii.hexlify(content_file.sha256).decode("utf-8")
            destination = pathlib.Path(job.index_dir) / (file_stem + ".pbtxt")
            pbutil.ToFile(content_file, destination)
    except UnicodeDecodeError:
        # Undecodable input is skipped with a warning rather than aborting.
        app.Warning("Failed to decode %s", relpath)
def test_Preprocess_mock_preprocessor(tempdir):
    """Test that running MockPreprocessor yields the 'PREPROCESSED' output."""
    MakeFile(tempdir, 'a', 'hello')
    mock_preprocessor = (
        'datasets.github.scrape_repos.preprocessors.preprocessors_test'
        ':MockPreprocessor')
    results = preprocessors.Preprocess(
        tempdir, 'a', ['a'], [mock_preprocessor])
    assert results == ['PREPROCESSED']
def test_Preprocess_mock_preprocessor_exception(tempdir):
    """Test that Preprocess raises ValueError with MockPreprocessorInternalError."""
    MakeFile(tempdir, 'a', 'hello')
    failing_preprocessor = (
        'datasets.github.scrape_repos.preprocessors.preprocessors_test'
        ':MockPreprocessorInternalError')
    with pytest.raises(ValueError):
        preprocessors.Preprocess(tempdir, 'a', ['a'], [failing_preprocessor])
def test_Preprocess_mock_preprocessor(tempdir):
    """Test that running MockPreprocessor yields the "PREPROCESSED" output."""
    MakeFile(tempdir, "a", "hello")
    mock_preprocessor = (
        "datasets.github.scrape_repos.preprocessors.preprocessors_test"
        ":MockPreprocessor"
    )
    results = preprocessors.Preprocess(
        tempdir, "a", ["a"], [mock_preprocessor]
    )
    assert results == ["PREPROCESSED"]
def test_Preprocess_mock_preprocessor_exception(tempdir):
    """Test that Preprocess raises ValueError with MockPreprocessorInternalError."""
    MakeFile(tempdir, "a", "hello")
    failing_preprocessor = (
        "datasets.github.scrape_repos.preprocessors.preprocessors_test"
        ":MockPreprocessorInternalError"
    )
    with test.Raises(ValueError):
        preprocessors.Preprocess(tempdir, "a", ["a"], [failing_preprocessor])
def ImportWorker(
    job: scrape_repos_pb2.ImportWorker
) -> typing.List[contentfiles.ContentFile]:
    """Preprocess one cloned file and build a ContentFile for each result.

    The file's path relative to the clone directory is obtained by slicing
    the clone directory prefix (plus one separator character) off
    ``job.abspath``. Each text returned by ``preprocessors.Preprocess``
    becomes one ``ContentFile`` carrying its SHA-256 digest, character count,
    and line count.

    Args:
      job: The import job naming the file, its clone directory, and the
        preprocessors to run.

    Returns:
      The content files built for this job. If a UnicodeDecodeError is
      raised, a warning is logged and whatever was collected before the
      error is returned.
    """
    prefix_len = len(str(job.clone_dir)) + 1
    relpath = job.abspath[prefix_len:]
    content_files: typing.List[contentfiles.ContentFile] = []
    try:
        artifacts = preprocessors.Preprocess(
            pathlib.Path(job.clone_dir),
            relpath,
            job.all_files_relpaths,
            job.preprocessors)
        for artifact_index, contents in enumerate(artifacts):
            digest = hashlib.sha256(contents.encode('utf-8')).digest()
            record = contentfiles.ContentFile(
                clone_from_url=job.clone_from_url,
                relpath=relpath,
                artifact_index=artifact_index,
                sha256=digest,
                charcount=len(contents),
                linecount=len(contents.split('\n')),
                text=contents)
            content_files.append(record)
    except UnicodeDecodeError:
        # Undecodable input is skipped with a warning rather than aborting.
        logging.warning('Failed to decode %s', relpath)
    return content_files
def test_Preprocess_no_preprocessors(tempdir):
    """Test that an empty preprocessor list returns the file text as-is."""
    MakeFile(tempdir, 'a', 'hello')
    results = preprocessors.Preprocess(tempdir, 'a', ['a'], [])
    assert results == ['hello']
def test_Preprocess_no_preprocessors(tempdir):
    """Test that an empty preprocessor list returns the file text as-is."""
    MakeFile(tempdir, "a", "hello")
    results = preprocessors.Preprocess(tempdir, "a", ["a"], [])
    assert results == ["hello"]