def test_Preprocess_mock_preprocessor_bad_code():
    """A BadCodeException raised by a preprocessor must reach the caller."""
    bad_code_pipeline = [
        'deeplearning.clgen.preprocessors.preprocessors_test'
        ':MockPreprocessorBadCode'
    ]
    with pytest.raises(errors.BadCodeException):
        preprocessors.Preprocess('', bad_code_pipeline)
def FromContentFile(
    cls,
    contentfile_root: pathlib.Path,
    relpath: pathlib.Path,
    preprocessors_: typing.List[str],
) -> 'PreprocessedContentFile':
    """Instantiate a PreprocessedContentFile.

    Reads the file at `contentfile_root / relpath`, runs its text through the
    given preprocessors, and records the result together with size and timing
    statistics.

    Args:
      contentfile_root: Directory that `relpath` is resolved against.
      relpath: Path of the content file, relative to `contentfile_root`.
      preprocessors_: The preprocessor names passed to
        preprocessors.Preprocess().

    Returns:
      A new instance. If preprocessing fails with a UnicodeDecodeError or a
      BadCodeException, `preprocessing_succeeded` is False and `text` holds
      an error message instead of preprocessed text.
    """
    start_time = time.time()
    # Bind input_text before the try block: if reading the file raises
    # UnicodeDecodeError, the input statistics below must still be computable
    # rather than failing with UnboundLocalError.
    input_text = ''
    preprocessing_succeeded = False
    try:
        with open(contentfile_root / relpath) as f:
            input_text = f.read()
        text = preprocessors.Preprocess(input_text, preprocessors_)
        preprocessing_succeeded = True
    except UnicodeDecodeError:
        text = 'Unicode error'
    except errors.BadCodeException as e:
        # The exception message is stored in place of the preprocessed text.
        text = str(e)
    end_time = time.time()
    preprocess_time_ms = int((end_time - start_time) * 1000)
    input_text_stripped = input_text.strip()
    return cls(
        input_relpath=relpath,
        input_sha256=GetFileSha256(contentfile_root / relpath),
        input_charcount=len(input_text_stripped),
        input_linecount=len(input_text_stripped.split('\n')),
        sha256=hashlib.sha256(text.encode('utf-8')).digest(),
        charcount=len(text),
        linecount=len(text.split('\n')),
        text=text,
        preprocessing_succeeded=preprocessing_succeeded,
        preprocess_time_ms=preprocess_time_ms,
        wall_time_ms=preprocess_time_ms,  # The outer-loop may change this.
        date_added=datetime.datetime.utcnow(),
    )
def test_Preprocess_mock_preprocessor_internal_error():
    """An InternalError raised by a preprocessor must reach the caller."""
    internal_error_pipeline = [
        'deeplearning.clgen.preprocessors.preprocessors_test'
        ':MockPreprocessorInternalError'
    ]
    with pytest.raises(errors.InternalError):
        preprocessors.Preprocess('', internal_error_pipeline)
def test_Preprocess_RejectSecrets():
    """Test that a private key header makes RejectSecrets raise BadCodeException."""
    secret_text = "-----BEGIN RSA PRIVATE KEY-----"
    pipeline = [
        "deeplearning.clgen.preprocessors.preprocessors" ":RejectSecrets"
    ]
    with test.Raises(errors.BadCodeException):
        preprocessors.Preprocess(secret_text, pipeline)
def FromBQFile(
    cls,
    file: bqdb.bqMainFile,
    preprocessors_: typing.List[str],
) -> typing.List["PreprocessedContentFile"]:
    """Instantiate PreprocessedContentFiles from a BigQuery main file.

    Args:
      file: The database row to preprocess. Its `content` attribute is the
        input text and its `id` attribute is used for both the relative path
        and the input checksum field.
      preprocessors_: The preprocessor names passed to
        preprocessors.Preprocess().

    Returns:
      A list with one instance per (text, success) pair produced by
      preprocessors.Preprocess().

    Raises:
      RuntimeError: If reading the content or invoking the preprocessors
        raises. The original exception is chained as the cause.
    """
    start_time = time.time()
    try:
        input_text = file.content
        text_generator = preprocessors.Preprocess(input_text, preprocessors_)
    except Exception as e:
        # Raise a real exception type: raising a bare string is itself a
        # TypeError and would hide the underlying error.
        raise RuntimeError("Unexpected exception: {}".format(e)) from e

    end_time = time.time()
    preprocess_time_ms = int((end_time - start_time) * 1000)
    input_text_stripped = input_text.strip()
    return [
        cls(
            input_relpath="main_files/{}".format(file.id),
            input_sha256=file.id,
            input_charcount=len(input_text_stripped),
            input_linecount=len(input_text_stripped.split("\n")),
            sha256=hashlib.sha256(text.encode("utf-8")).hexdigest(),
            charcount=len(text),
            linecount=len(text.split("\n")),
            text=text,
            preprocessing_succeeded=success,
            preprocess_time_ms=preprocess_time_ms,
            wall_time_ms=preprocess_time_ms,  # The outer-loop may change this.
            date_added=datetime.datetime.utcnow(),
        )
        for (text, success) in text_generator
    ]
def test_Preprocess_RejectSecrets():
    """Test that RejectSecrets returns text without secrets unchanged."""
    harmless_text = "Hello, world!"
    pipeline = [
        "deeplearning.clgen.preprocessors.preprocessors" ":RejectSecrets"
    ]
    processed = preprocessors.Preprocess(harmless_text, pipeline)
    assert processed == "Hello, world!"
def test_Preprocess_RejectSecrets():
    """Test that a private key header makes RejectSecrets raise BadCodeException."""
    secret_text = '-----BEGIN RSA PRIVATE KEY-----'
    pipeline = [
        'deeplearning.clgen.preprocessors.preprocessors' ':RejectSecrets'
    ]
    with pytest.raises(errors.BadCodeException):
        preprocessors.Preprocess(secret_text, pipeline)
def test_Preprocess_mock_preprocessor():
    """Test that the output of the mock preprocessor is returned."""
    mock_pipeline = [
        "deeplearning.clgen.preprocessors.preprocessors_test:MockPreprocessor"
    ]
    processed = preprocessors.Preprocess("hello", mock_pipeline)
    assert processed == "PREPROCESSED"
def test_Preprocess_mock_preprocessor_internal_error():
    """An InternalError raised by a preprocessor must reach the caller."""
    internal_error_pipeline = [
        "deeplearning.clgen.preprocessors.preprocessors_test"
        ":MockPreprocessorInternalError"
    ]
    with test.Raises(errors.InternalError):
        preprocessors.Preprocess("", internal_error_pipeline)
def test_Preprocess_mock_preprocessor_bad_code():
    """A BadCodeException raised by a preprocessor must reach the caller."""
    bad_code_pipeline = [
        "deeplearning.clgen.preprocessors.preprocessors_test"
        ":MockPreprocessorBadCode"
    ]
    with test.Raises(errors.BadCodeException):
        preprocessors.Preprocess("", bad_code_pipeline)
def _SampleBatch(
    self,
    sampler: samplers.Sampler,
    atomizer: atomizers.AtomizerBase,
    sample_observers: typing.List[sample_observers_lib.SampleObserver],
) -> bool:
    """Run a single iteration of the batched sample inner-loop.

    Args:
      sampler: The sampler to draw from. Its `encoded_start_text` is saved on
        entry and restored before returning or propagating an exception.
      atomizer: The atomizer handed to the backtracking helper and to
        SampleOneWithBacktracking().
      sample_observers: Observers whose OnSample() is called with the
        resulting sample.

    Returns:
      True if every observer's OnSample() returned a falsy value, else False.
    """
    start_time = labdate.MillisecondsTimestamp()

    # The sampler.encoded_start_text attribute is used as a way to re-seed the
    # model state during rollback, so save the original value here so that it
    # can be restored at the end of the sample batch.
    original_sampler_encoded_start_text = sampler.encoded_start_text.copy()

    try:
        self.backend.InitSampleBatch(sampler)
        backtracker = OpenClBacktrackingHelper(atomizer, self._target_features)
        self._logger.OnSampleStart(backtracker)
        sampled_tokens = self.SampleOneWithBacktracking(
            sampler, atomizer, backtracker
        )
        self._logger.OnSampleEnd(backtracker)

        end_time = labdate.MillisecondsTimestamp()

        # Format text.
        if len(sampled_tokens):
            text = preprocessors.Preprocess(
                "".join(sampled_tokens),
                [
                    "deeplearning.clgen.preprocessors.opencl:NormalizeIdentifiers",
                    "deeplearning.clgen.preprocessors.opencl:SanitizeKernelPrototype",
                    "deeplearning.clgen.preprocessors.common:StripTrailingWhitespace",
                    "deeplearning.clgen.preprocessors.opencl:NormalizeIdentifiers",
                    "deeplearning.clgen.preprocessors.common:StripDuplicateEmptyLines",
                    "deeplearning.clgen.preprocessors.cxx:ClangFormat",
                ],
            )
        else:
            text = ""
    finally:
        # Restore the sampler's start text, even when sampling or
        # preprocessing raised, so the sampler is not left with a modified
        # start text.
        sampler.encoded_start_text = original_sampler_encoded_start_text

    # Notify sample observers.
    sample = model_pb2.Sample(
        text=text,
        sample_start_epoch_ms_utc=start_time,
        sample_time_ms=end_time - start_time,
        wall_time_ms=end_time - start_time,
        num_tokens=len(sampled_tokens),
    )
    # A list (not a generator) is built on purpose: every observer is
    # notified, with no short-circuit after the first truthy OnSample().
    return all([not obs.OnSample(sample) for obs in sample_observers])
def _PrettifySource(src: str) -> str:
    """Run an OpenCL source through the formatting preprocessors."""
    formatting_pipeline = [
        "deeplearning.clgen.preprocessors.opencl:NormalizeIdentifiers",
        "deeplearning.clgen.preprocessors.opencl:SanitizeKernelPrototype",
        "deeplearning.clgen.preprocessors.common:StripTrailingWhitespace",
        "deeplearning.clgen.preprocessors.opencl:NormalizeIdentifiers",
        "deeplearning.clgen.preprocessors.common:StripDuplicateEmptyLines",
        "deeplearning.clgen.preprocessors.cxx:ClangFormat",
    ]
    return preprocessors.Preprocess(src, formatting_pipeline)
def FromContentFile(
    cls,
    contentfile_root: pathlib.Path,
    relpath: pathlib.Path,
    preprocessors_: typing.List[str],
) -> typing.List["PreprocessedContentFile"]:
    """Instantiate PreprocessedContentFiles from a file on disk.

    Args:
      contentfile_root: Directory that `relpath` is resolved against.
      relpath: Path of the content file, relative to `contentfile_root`.
      preprocessors_: The preprocessor names passed to
        preprocessors.Preprocess().

    Returns:
      A list with one instance per (text, success) pair produced by
      preprocessors.Preprocess().

    Raises:
      RuntimeError: If opening the file or invoking the preprocessors raises.
        The original exception is chained as the cause.
    """
    start_time = time.time()
    input_text = ""
    try:
        with open(contentfile_root / relpath) as f:
            try:
                input_text = f.read()
            except Exception:
                # Deliberately best-effort: any failure while reading
                # (decode errors included) substitutes a placeholder input
                # instead of aborting.
                input_text = "/*corrupted file format*/"
        text_generator = preprocessors.Preprocess(input_text, preprocessors_)
    except Exception as e:
        # Raise a real exception type: raising a bare string is itself a
        # TypeError and would hide the underlying error.
        raise RuntimeError("Unexpected exception: {}".format(e)) from e

    end_time = time.time()
    preprocess_time_ms = int((end_time - start_time) * 1000)
    input_text_stripped = input_text.strip()
    # The input checksum is the same for every result, so compute it once.
    input_sha256 = GetFileSha256(contentfile_root / relpath)
    return [
        cls(
            input_relpath=relpath,
            input_sha256=input_sha256,
            input_charcount=len(input_text_stripped),
            input_linecount=len(input_text_stripped.split("\n")),
            sha256=hashlib.sha256(text.encode("utf-8")).hexdigest(),
            charcount=len(text),
            linecount=len(text.split("\n")),
            text=text,
            preprocessing_succeeded=success,
            preprocess_time_ms=preprocess_time_ms,
            wall_time_ms=preprocess_time_ms,  # The outer-loop may change this.
            date_added=datetime.datetime.utcnow(),
        )
        for (text, success) in text_generator
    ]
def FromContentFile(
    cls,
    contentfile_root: pathlib.Path,
    relpath: pathlib.Path,
    preprocessors_: typing.List[str],
) -> "PreprocessedContentFile":
    """Build a PreprocessedContentFile from a file on disk.

    The file at `contentfile_root / relpath` is read and passed through
    preprocessors.Preprocess(). If that raises UnicodeDecodeError or
    ValueError, the instance is still created, with
    `preprocessing_succeeded` False and an error message as its text.
    """
    began = time.time()
    input_text = ""
    source_path = contentfile_root / relpath
    try:
        with open(source_path) as infile:
            input_text = infile.read()
        text = preprocessors.Preprocess(input_text, preprocessors_)
    except UnicodeDecodeError:
        # Must be listed before ValueError, which it subclasses.
        preprocessing_succeeded = False
        text = "Unicode error"
    except ValueError as error:
        # BadCodeException subclasses ValueError. Catching the general
        # ValueError lets custom preprocessors raise ValueError without
        # depending on CLgen sources.
        preprocessing_succeeded = False
        text = str(error)
    else:
        preprocessing_succeeded = True
    elapsed_ms = int((time.time() - began) * 1000)
    stripped_input = input_text.strip()
    return cls(
        input_relpath=relpath,
        input_sha256=GetFileSha256(contentfile_root / relpath),
        input_charcount=len(stripped_input),
        input_linecount=len(stripped_input.split("\n")),
        sha256=hashlib.sha256(text.encode("utf-8")).hexdigest(),
        charcount=len(text),
        linecount=len(text.split("\n")),
        text=text,
        preprocessing_succeeded=preprocessing_succeeded,
        preprocess_time_ms=elapsed_ms,
        wall_time_ms=elapsed_ms,  # The outer-loop may change this.
        date_added=datetime.datetime.utcnow(),
    )
def _PreprocessBenchmarkInnerLoopBadCode(preprocessors_: typing.List[str], code_in):
    """Benchmark body: preprocessing `code_in` must raise BadCodeException."""
    expect_bad_code = test.Raises(errors.BadCodeException)
    with expect_bad_code:
        preprocessors.Preprocess(code_in, preprocessors_)
def _PreprocessBenchmarkInnerLoop(
    preprocessors_: typing.List[str], code_in: str, code_out: str
):
    """Benchmark body: preprocessing `code_in` must produce `code_out`."""
    actual = preprocessors.Preprocess(code_in, preprocessors_)
    assert actual == code_out
def test_Preprocess_RejectSecrets():
    """Test that RejectSecrets returns text without secrets unchanged."""
    harmless_text = 'Hello, world!'
    pipeline = [
        'deeplearning.clgen.preprocessors.preprocessors' ':RejectSecrets'
    ]
    processed = preprocessors.Preprocess(harmless_text, pipeline)
    assert processed == 'Hello, world!'
def test_Preprocess_mock_preprocessor():
    """Test that the output of the mock preprocessor is returned."""
    mock_pipeline = [
        'deeplearning.clgen.preprocessors.preprocessors_test:MockPreprocessor'
    ]
    processed = preprocessors.Preprocess('hello', mock_pipeline)
    assert processed == 'PREPROCESSED'
def test_Preprocess_no_preprocessors():
    """An empty preprocessor list must leave the input untouched."""
    no_preprocessors = []
    unchanged = preprocessors.Preprocess('hello', no_preprocessors)
    assert unchanged == 'hello'