def test_GetPreprocessorFunction_absolute_path_not_found(
    tempdir: pathlib.Path,
):
    """Test that a name missing from an existing module file raises UserError."""
    # The module file is created but left empty, so "NotFound" cannot resolve.
    module_path = tempdir / "foo.py"
    fs.Write(module_path, "".encode("utf-8"))
    spec = f"{module_path}:NotFound"
    with test.Raises(errors.UserError):
        preprocessors.GetPreprocessorFunction(spec)
def AssertConfigIsValid(config: corpus_pb2.Corpus) -> corpus_pb2.Corpus:
    """Assert that config proto is valid.

    Checks that the 'contentfiles', 'atomizer' and 'contentfile_separator'
    fields are set, that every entry in config.preprocessor can be passed to
    preprocessors.GetPreprocessorFunction(), and, when the
    'greedy_multichar_atomizer' field is present, that its token list is
    non-empty and contains no empty strings.

    Args:
      config: A Corpus proto.

    Returns:
      The Corpus proto, unmodified.

    Raises:
      UserError: If the config is invalid. Any pbutil.ProtoValueError raised
        while validating is converted to UserError, with the original error
        attached as its cause.
    """
    try:
        pbutil.AssertFieldIsSet(config, 'contentfiles')
        pbutil.AssertFieldIsSet(config, 'atomizer')
        pbutil.AssertFieldIsSet(config, 'contentfile_separator')
        # Check that the preprocessor pipeline resolves to preprocessor
        # functions. Only the lookup matters here; the results are discarded.
        for preprocessor_name in config.preprocessor:
            preprocessors.GetPreprocessorFunction(preprocessor_name)

        if config.HasField('greedy_multichar_atomizer'):
            if not config.greedy_multichar_atomizer.tokens:
                raise errors.UserError(
                    'GreedyMulticharAtomizer.tokens is empty')
            for atom in config.greedy_multichar_atomizer.tokens:
                if not atom:
                    raise errors.UserError(
                        'Empty string found in GreedyMulticharAtomizer.tokens is empty'
                    )
        return config
    except pbutil.ProtoValueError as e:
        # Chain explicitly so the proto error is reported as the direct cause.
        raise errors.UserError(e) from e
def test_GetPreprocessFunction_undecorated_preprocessor():
    """Test that a UserError is raised for a preprocessor lacking the decorator."""
    undecorated = ('deeplearning.clgen.preprocessors.preprocessors_test'
                   ':MockUndecoratedPreprocessor')
    with pytest.raises(errors.UserError) as e_info:
        preprocessors.GetPreprocessorFunction(undecorated)
    # The error message is expected to mention the missing decorator.
    message = str(e_info.value)
    assert '@clgen_preprocessor' in message
def test_GetPreprocessFunction_missing_function():
    """Test that a UserError is raised when the module lacks the function."""
    missing = "deeplearning.clgen.preprocessors.preprocessors_test:Foo"
    with test.Raises(errors.UserError) as e_info:
        preprocessors.GetPreprocessorFunction(missing)
    message = str(e_info.value)
    assert "not found" in message
def test_GetPreprocessorFunction_absolute_path_with_dep(tempdir: pathlib.Path):
    """Test loading module from file which has a dependency.

    A helper module and a preprocessor module are written side by side; the
    preprocessor module pulls in the helper with a relative import. The test
    expects GetPreprocessorFunction() to raise UserError for such a module.
    """
    lib_module = tempdir / "lib_module.py"
    fs.Write(
        lib_module,
        """
def PreprocessImplementation(src):
  return src.replace('b', 'c')
""".encode("utf-8"),
    )
    # This must be a different file from lib_module: writing both sources to
    # the same path would overwrite the dependency and leave nothing to import.
    path = tempdir / "preprocessor.py"
    fs.Write(
        path,
        """
from . import lib_module
def Preprocess(src):
  return lib_module.PreprocessImplementation(src)
""".encode("utf-8"),
    )

    with test.Raises(errors.UserError):
        preprocessors.GetPreprocessorFunction(f"{path}:Preprocess")
def test_GetPreprocessorFunction_absolute_path(tempdir: pathlib.Path):
    """Test that a preprocessor can be loaded from an absolute file path."""
    module_source = """
def Preprocess(src: str) -> str:
  return src.replace('a', 'b')
"""
    module_path = tempdir / 'preprocessor.py'
    fs.Write(module_path, module_source.encode('utf-8'))

    # The loaded function should behave exactly as the source written above.
    preprocess = preprocessors.GetPreprocessorFunction(
        f'{module_path}:Preprocess')
    assert preprocess('abc') == 'bbc'
def test_GetPreprocessorFunction_absolute_path(tempdir: pathlib.Path):
    """Test that a preprocessor can be loaded from an absolute file path."""
    module_source = """
def Preprocess(src: str) -> str:
  return src.replace('a', 'b')
"""
    module_path = tempdir / "preprocessor.py"
    fs.Write(module_path, module_source.encode("utf-8"))

    # The loaded function should behave exactly as the source written above.
    preprocess = preprocessors.GetPreprocessorFunction(
        f"{module_path}:Preprocess")
    assert preprocess("abc") == "bbc"
def Preprocess(contentfiles: pathlib.Path, outdir: pathlib.Path,
               preprocessor_names):
    """Preprocess the files in a directory, writing results to another.

    Every file name in `contentfiles` that does not already appear in `outdir`
    is sent to preprocessed.PreprocessorWorker through a multiprocessing pool.
    The text of each successfully preprocessed file is written to `outdir`
    under the same file name.

    Args:
      contentfiles: Directory containing the input content files.
      outdir: Directory to write preprocessed files to. File names already
        present here are skipped.
      preprocessor_names: The preprocessor names to run. Each one is first
        passed to preprocessors.GetPreprocessorFunction() so that a bad name
        fails before any work is scheduled.
    """
    # Error early if preprocessors are bad.
    for preprocessor_name in preprocessor_names:
        preprocessors.GetPreprocessorFunction(preprocessor_name)

    # This is basically the same code as:
    # deeplearning.clgen.corpuses.preprocessed.PreprocessedContentFiles:Import()
    # Only it's writing the results of preprocessing to files rather than to a
    # database. Consider refactoring.
    relpaths = {f.name for f in contentfiles.iterdir()}
    done = {f.name for f in outdir.iterdir()}
    todo = relpaths - done
    app.Log(
        1,
        "Preprocessing %s of %s content files",
        humanize.Commas(len(todo)),
        humanize.Commas(len(relpaths)),
    )
    jobs = [
        internal_pb2.PreprocessorWorker(
            contentfile_root=str(contentfiles),
            relpath=t,
            preprocessors=preprocessor_names,
        ) for t in todo
    ]
    bar = progressbar.ProgressBar(max_value=len(jobs))
    succeeded_count = 0
    # The context manager tears the worker processes down once every result
    # has been consumed, instead of leaving the pool open.
    with multiprocessing.Pool() as pool:
        wall_time_start = time.time()
        workers = pool.imap_unordered(preprocessed.PreprocessorWorker, jobs)
        for preprocessed_cf in bar(workers):
            # Results arrive one at a time, so the wall time of a job is taken
            # as the gap since the previous result arrived.
            wall_time_end = time.time()
            preprocessed_cf.wall_time_ms = int(
                (wall_time_end - wall_time_start) * 1000)
            wall_time_start = wall_time_end
            if preprocessed_cf.preprocessing_succeeded:
                succeeded_count += 1
                with open(outdir / preprocessed_cf.input_relpath, "w") as f:
                    f.write(preprocessed_cf.text)

    app.Log(
        1,
        "Successfully preprocessed %s of %s files (%.2f %%)",
        humanize.Commas(succeeded_count),
        humanize.Commas(len(todo)),
        # max() keeps the denominator at least 1, so an empty todo set reports
        # 0% rather than dividing by zero.
        (succeeded_count / max(len(todo), 1)) * 100,
    )
def AssertConfigIsValid(config: typing.Union[corpus_pb2.Corpus, corpus_pb2.PreTrainCorpus]
                       ) -> typing.Union[corpus_pb2.Corpus, corpus_pb2.PreTrainCorpus]:
    """Assert that config proto is valid.

    A config with 'pre_encoded_corpus_url' set is accepted without further
    checks. Otherwise 'contentfiles' and 'contentfile_separator' must be set
    and every entry in config.preprocessor is passed to
    preprocessors.GetPreprocessorFunction(). A Corpus must additionally carry
    a tokenizer whose token_type is "character", "word" or "ast"; a "word"
    tokenizer also needs a token_list that points at an existing file. Any
    other config type must not set 'tokenizer'.

    Args:
      config: A Corpus or PreTrainCorpus proto.

    Returns:
      The same proto that was passed in.

    Raises:
      pbutil.ProtoValueError: Propagated unchanged from the pbutil field
        checks when a field is unset or fails its constraint.
      ValueError: If a config that is not a Corpus has 'tokenizer' set.
    """
    # Early-exit to support corpuses derived from databases of pre-encoded
    # content files.
    # TODO(github.com/ChrisCummins/clgen/issues/130): Refactor after splitting
    # Corpus class.
    if config.HasField("pre_encoded_corpus_url"):
        return config

    pbutil.AssertFieldIsSet(config, "contentfiles")
    if isinstance(config, corpus_pb2.Corpus):
        pbutil.AssertFieldIsSet(config, "tokenizer")
        pbutil.AssertFieldIsSet(config.tokenizer, "token_type")
        pbutil.AssertFieldConstraint(
            config.tokenizer,
            "token_type",
            lambda x: x in ("character", "word", "ast"),
            "tokenizer is either character or word based.",
        )
        if config.tokenizer.token_type == "word":
            # The token list path is expanded against the local path prefix
            # flag before checking that the file exists.
            pbutil.AssertFieldConstraint(
                config.tokenizer,
                "token_list",
                lambda x: os.path.isfile(
                    str(ExpandConfigPath(
                        x, path_prefix=FLAGS.clgen_local_path_prefix))),
                "Invalid token_list file",
            )
    elif config.HasField("tokenizer"):
        raise ValueError("Pre-train corpus cannot have a distinct tokenizer.")

    pbutil.AssertFieldIsSet(config, "contentfile_separator")
    # Check that the preprocessor pipeline resolves to preprocessor functions.
    # Only the lookup matters here; the results are discarded.
    for preprocessor_name in config.preprocessor:
        preprocessors.GetPreprocessorFunction(preprocessor_name)
    return config
def AssertConfigIsValid(config: corpus_pb2.Corpus) -> corpus_pb2.Corpus:
    """Assert that config proto is valid.

    A config with 'pre_encoded_corpus_url' set is accepted without further
    checks. Otherwise the 'contentfiles', 'atomizer' and
    'contentfile_separator' fields must be set, every entry in
    config.preprocessor is passed to preprocessors.GetPreprocessorFunction(),
    and, when the 'greedy_multichar_atomizer' field is present, its token list
    must be non-empty and contain no empty strings.

    Args:
      config: A Corpus proto.

    Returns:
      The Corpus proto, unmodified.

    Raises:
      UserError: If the config is invalid. Any pbutil.ProtoValueError raised
        while validating is converted to UserError, with the original error
        attached as its cause.
    """
    try:
        # Early-exit to support corpuses derived from databases of pre-encoded
        # content files.
        # TODO(github.com/ChrisCummins/clgen/issues/130): Refactor after
        # splitting Corpus class.
        if config.HasField("pre_encoded_corpus_url"):
            return config

        pbutil.AssertFieldIsSet(config, "contentfiles")
        pbutil.AssertFieldIsSet(config, "atomizer")
        pbutil.AssertFieldIsSet(config, "contentfile_separator")
        # Check that the preprocessor pipeline resolves to preprocessor
        # functions. Only the lookup matters here; the results are discarded.
        for preprocessor_name in config.preprocessor:
            preprocessors.GetPreprocessorFunction(preprocessor_name)

        if config.HasField("greedy_multichar_atomizer"):
            if not config.greedy_multichar_atomizer.tokens:
                raise errors.UserError(
                    "GreedyMulticharAtomizer.tokens is empty")
            for atom in config.greedy_multichar_atomizer.tokens:
                if not atom:
                    raise errors.UserError(
                        "Empty string found in GreedyMulticharAtomizer.tokens is empty"
                    )
        return config
    except pbutil.ProtoValueError as e:
        # Chain explicitly so the proto error is reported as the direct cause.
        raise errors.UserError(e) from e
def test_GetPreprocessFunction_mock_preprocessor():
    """Test that looking up the mock preprocessor by name returns it."""
    mock_name = (
        'deeplearning.clgen.preprocessors.preprocessors_test:MockPreprocessor')
    resolved = preprocessors.GetPreprocessorFunction(mock_name)
    assert resolved == MockPreprocessor
def test_GetPreprocessFunction_missing_module():
    """Test that a UserError is raised when the module cannot be found."""
    bogus_name = 'not.a.real.module:Foo'
    with pytest.raises(errors.UserError) as e_info:
        preprocessors.GetPreprocessorFunction(bogus_name)
    message = str(e_info.value)
    assert 'not found' in message
def test_GetPreprocessFunction_empty_string():
    """Test that a UserError is raised when the preprocessor name is empty."""
    empty_name = ''
    with pytest.raises(errors.UserError) as e_info:
        preprocessors.GetPreprocessorFunction(empty_name)
    message = str(e_info.value)
    assert 'Invalid preprocessor name' in message
def test_GetPreprocessorFunction_absolute_function_not_found(
    tempdir: pathlib.Path,
):
    """Test that a UserError is raised when the module file does not exist."""
    # Despite the test name, this test never writes foo.py: the path itself
    # is what is missing.
    missing_file_spec = f'{tempdir}/foo.py:Preprocess'
    with pytest.raises(errors.UserError):
        preprocessors.GetPreprocessorFunction(missing_file_spec)
def test_GetPreprocessorFunction_absolute_path_not_found(
    tempdir: pathlib.Path,
):
    """Test that a name missing from an existing module file raises UserError."""
    # The module file is created but left empty, so 'NotFound' cannot resolve.
    module_path = tempdir / 'foo.py'
    fs.Write(module_path, "".encode('utf-8'))
    spec = f'{module_path}:NotFound'
    with pytest.raises(errors.UserError):
        preprocessors.GetPreprocessorFunction(spec)
def test_GetPreprocessFunction_mock_preprocessor():
    """Test that looking up the mock preprocessor by name finds it."""
    mock_name = (
        "deeplearning.clgen.preprocessors.preprocessors_test:MockPreprocessor")
    resolved = preprocessors.GetPreprocessorFunction(mock_name)
    # Compare by name rather than by identity of the function object.
    assert resolved.__name__ == "MockPreprocessor"