def CreateSamplerProtoFromFlags() -> sampler_pb2.Sampler:
  """Assemble a Sampler proto from the clgen_sample_* command-line flags.

  Returns:
    A Sampler proto configured from flag values, always terminating on
    balanced '{' / '}' depth, and additionally capped at
    --clgen_max_sample_length tokens when that flag is non-zero.
  """
  # The symmetrical-token-depth criterion is always present.
  criteria = [
      sampler_pb2.SampleTerminationCriterion(
          symtok=sampler_pb2.SymmetricalTokenDepth(
              depth_increase_token="{",
              depth_decrease_token="}",
          )),
  ]
  sampler = sampler_pb2.Sampler(
      start_text=FLAGS.clgen_seed_text,
      batch_size=FLAGS.clgen_sample_batch_size,
      sequence_length=FLAGS.clgen_sample_sequence_length,
      # The proto stores temperature as integer micro-units.
      temperature_micros=int(FLAGS.clgen_sample_temperature * 1000000),
      termination_criteria=criteria,
  )
  # A zero / unset max length means "no token cap".
  if FLAGS.clgen_max_sample_length:
    sampler.termination_criteria.append(
        sampler_pb2.SampleTerminationCriterion(
            maxlen=sampler_pb2.MaxTokenLength(
                maximum_tokens_in_sample=FLAGS.clgen_max_sample_length,
            )))
  return sampler
def MakeClgenInstanceConfig(
    working_dir: pathlib.Path,
    encoded_db: encoded.EncodedContentFiles,
    num_training_epochs: int,
    seed_text: str,
    neurons_per_layer: int,
    num_layers: int,
) -> clgen_pb2.Instance:
  """Construct a CLgen instance configuration proto.

  Args:
    working_dir: The directory to cache CLgen working files in.
    encoded_db: The database of encoded content files; its URL is used as the
      pre-encoded corpus for the model.
    num_training_epochs: The number of epochs to train for.
    seed_text: The text to initiate sampling with.
    neurons_per_layer: Number of neurons in a layer.
    num_layers: Number of LSTM layers in the network.

  Returns:
    A clgen_pb2.Instance proto describing the model, training options, and
    sampler (a '{'/'}' depth criterion plus a 20000-token cap).
  """
  return clgen_pb2.Instance(
      working_dir=str(working_dir),
      model=model_pb2.Model(
          corpus=corpus_pb2.Corpus(pre_encoded_corpus_url=encoded_db.url, ),
          architecture=model_pb2.NetworkArchitecture(
              backend=model_pb2.NetworkArchitecture.TENSORFLOW,
              neuron_type=model_pb2.NetworkArchitecture.LSTM,
              neurons_per_layer=neurons_per_layer,
              num_layers=num_layers,
              post_layer_dropout_micros=0,
          ),
          training=model_pb2.TrainingOptions(
              num_epochs=num_training_epochs,
              sequence_length=64,
              batch_size=64,
              shuffle_corpus_contentfiles_between_epochs=True,
              # Optimizer hyper-parameters are stored in micro-units
              # (e.g. 2000 micros == 0.002 learning rate).
              adam_optimizer=model_pb2.AdamOptimizer(
                  initial_learning_rate_micros=2000,
                  learning_rate_decay_per_epoch_micros=50000,
                  beta_1_micros=900000,
                  beta_2_micros=999000,
                  normalized_gradient_clip_micros=5000000,
              ),
          ),
      ),
      sampler=sampler_pb2.Sampler(
          start_text=seed_text,
          batch_size=64,
          sequence_length=1024,
          temperature_micros=1000000,  # = 1.0 real value
          termination_criteria=[
              sampler_pb2.SampleTerminationCriterion(
                  symtok=sampler_pb2.SymmetricalTokenDepth(
                      depth_increase_token="{",
                      depth_decrease_token="}",
                  )),
              sampler_pb2.SampleTerminationCriterion(
                  maxlen=sampler_pb2.MaxTokenLength(
                      maximum_tokens_in_sample=20000, )),
          ],
      ),
  )
def test_SymmetrcalTokenDepthCriterion_SampleIsComplete_reverse_order():
  """Test that sample is not complete if right token appears before left."""
  criterion = samplers.SymmetricalTokenDepthCriterion(
      sampler_pb2.SymmetricalTokenDepth(
          depth_increase_token="+", depth_decrease_token="-"))
  # A decrease token before any increase token must not terminate the sample.
  incomplete_samples = [
      ["-", "+"],
      ["-", "a", "b", "c", "+"],
  ]
  for sample in incomplete_samples:
    assert not criterion.SampleIsComplete(sample)
  # Once a matched '+' ... '-' pair occurs after the stray leading '-', the
  # sample is considered complete.
  assert criterion.SampleIsComplete(["-", "a", "b", "c", "+", "+", "-"])
def test_SymmetricalTokenDepthCriterion_same_tokens():
  """test that error is raised if depth tokens are the same."""
  # Using the same token for increase and decrease is a user error.
  with test.Raises(errors.UserError) as e_info:
    samplers.SymmetricalTokenDepthCriterion(
        sampler_pb2.SymmetricalTokenDepth(
            depth_increase_token="a", depth_decrease_token="a"))
  assert "SymmetricalTokenDepth tokens must be different" == str(e_info.value)
def test_SymmetrcalTokenDepthCriterion_SampleIsComplete_reverse_order():
  """Test that sample is not complete if right token appears before left."""
  config = sampler_pb2.SymmetricalTokenDepth(depth_increase_token='+',
                                             depth_decrease_token='-')
  criterion = samplers.SymmetricalTokenDepthCriterion(config)
  # A stray decrease token first: the depth pairing never closes, so these
  # samples stay incomplete.
  assert not criterion.SampleIsComplete(['-', '+'])
  assert not criterion.SampleIsComplete(['-', 'a', 'b', 'c', '+'])
  # A matched '+' ... '-' pair after the stray '-' completes the sample.
  assert criterion.SampleIsComplete(['-', 'a', 'b', 'c', '+', '+', '-'])
def test_SymmetricalTokenDepthCriterion_depth_increase_token():
  """Test that error is raised if depth_decrease_token is invalid.

  NOTE(review): despite the function name, this test exercises an invalid
  *depth_decrease_token* (missing, then empty) — the asserted error messages
  confirm this. The name is kept unchanged so the test ID stays stable.
  """
  config = sampler_pb2.SymmetricalTokenDepth(depth_increase_token="a")
  # Field is missing.
  with test.Raises(errors.UserError) as e_info:
    samplers.SymmetricalTokenDepthCriterion(config)
  assert "SymmetricalTokenDepth.depth_decrease_token must be a string" == str(
      e_info.value)
  # Value is empty.
  config.depth_decrease_token = ""
  with test.Raises(errors.UserError) as e_info:
    samplers.SymmetricalTokenDepthCriterion(config)
  assert "SymmetricalTokenDepth.depth_decrease_token must be a string" == str(
      e_info.value)
def test_SymmetricalTokenDepthCriterion_SampleIsComplete():
  """Test SampleIsComplete() returns expected values."""
  criterion = samplers.SymmetricalTokenDepthCriterion(
      sampler_pb2.SymmetricalTokenDepth(
          depth_increase_token="+", depth_decrease_token="-"))
  # (sample, expected completeness, resulting depth) cases.
  cases = [
      ([], False),                                      # depth 0, incomplete
      (["+"], False),                                   # depth 1, incomplete
      (["-"], True),                                    # depth -1, complete
      (["+", "-"], True),                               # depth 0, complete
      (["a", "+", "b", "c"], False),                    # depth 1, incomplete
      (["a", "+", "+", "b", "c", "-"], False),          # depth 1, incomplete
      (["a", "+", "-", "+", "b", "c", "-"], True),      # depth 0, complete
  ]
  for sample, expected in cases:
    assert criterion.SampleIsComplete(sample) == expected
def test_SymmetricalTokenDepthCriterion_SampleIsComplete():
  """Test SampleIsComplete() returns expected values."""
  config = sampler_pb2.SymmetricalTokenDepth(depth_increase_token='+',
                                             depth_decrease_token='-')
  criterion = samplers.SymmetricalTokenDepthCriterion(config)
  # Samples that must be reported incomplete (depth never drops to zero
  # after an increase, or no tokens at all).
  for sample in (
      [],
      ['+'],
      ['a', '+', 'b', 'c'],
      ['a', '+', '+', 'b', 'c', '-'],
  ):
    assert not criterion.SampleIsComplete(sample)
  # Samples that must be reported complete (depth reaches zero or below).
  for sample in (
      ['-'],
      ['+', '-'],
      ['a', '+', '-', '+', 'b', 'c', '-'],
  ):
    assert criterion.SampleIsComplete(sample)
def __init__(
    self,
    atomizer: atomizers.AtomizerBase,
    target_features: typing.Optional[np.ndarray],
):
  """Initialize the backtracking sampler state.

  Args:
    atomizer: The atomizer used to specialize the token-depth criterion.
    target_features: Optional Grewe feature vector to hill-climb towards;
      if None, feature-distance tracking is disabled.
  """
  # Temporary working directory is used to write files that the Grewe feature
  # extractor can use.
  self.working_dir = pathlib.Path(
      tempfile.mkdtemp(prefix="phd_clgen_backtracking_"))
  # Terminate candidate samples on balanced '{' / '}' depth.
  self.symtok = samplers.SymmetricalTokenDepthCriterion(
      sampler_pb2.SymmetricalTokenDepth(depth_increase_token="{",
                                        depth_decrease_token="}"))
  self.symtok.Specialize(atomizer)

  # Feature hill climbing state.
  self._previous_src = ""
  self._target_features = target_features
  if self._target_features is not None:
    # BUG FIX: `np.int` was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin `int` is the documented replacement.
    self._previous_features = np.array([0, 0, 0, 0], dtype=int)
    self._init_feature_distance = scipy.spatial.distance.euclidean(
        self._previous_features, self._target_features)
    self._previous_feature_distance = self._init_feature_distance
def main(argv: typing.List[str]):
  """Main entry point.

  Builds a fully in-line CLgen instance config (OpenCL corpus preprocessing,
  2-layer LSTM, symmetrical-brace / max-length sampling termination) and then
  samples from it in an endless loop, recording results via Sample().
  """
  if len(argv) > 1:
    raise app.UsageError("Unknown arguments: '{}'.".format(" ".join(
        argv[1:])))

  instance = clgen.Instance(
      clgen_pb2.Instance(
          working_dir=FLAGS.clgen_dir,
          model=model_pb2.Model(
              # Corpus is built from raw content files run through the OpenCL
              # preprocessing pipeline below; atomized at the character level.
              corpus=corpus_pb2.Corpus(
                  local_directory=FLAGS.clgen_corpus_dir,
                  ascii_character_atomizer=True,
                  preprocessor=[
                      "deeplearning.clgen.preprocessors.opencl:ClangPreprocessWithShim",
                      "deeplearning.clgen.preprocessors.opencl:Compile",
                      "deeplearning.clgen.preprocessors.opencl:NormalizeIdentifiers",
                      "deeplearning.clgen.preprocessors.opencl:StripDoubleUnderscorePrefixes",
                      "deeplearning.clgen.preprocessors.common:StripDuplicateEmptyLines",
                      "deeplearning.clgen.preprocessors.opencl:SanitizeKernelPrototype",
                      "deeplearning.clgen.preprocessors.common:StripTrailingWhitespace",
                      "deeplearning.clgen.preprocessors.opencl:ClangFormat",
                      "deeplearning.clgen.preprocessors.common:MinimumLineCount3",
                      "deeplearning.clgen.preprocessors.opencl:Compile",
                  ],
                  contentfile_separator="\n\n",
              ),
              architecture=model_pb2.NetworkArchitecture(
                  backend=model_pb2.NetworkArchitecture.TENSORFLOW,
                  neuron_type=model_pb2.NetworkArchitecture.LSTM,
                  neurons_per_layer=512,
                  num_layers=2,
                  post_layer_dropout_micros=0,
              ),
              training=model_pb2.TrainingOptions(
                  num_epochs=50,
                  sequence_length=64,
                  batch_size=64,
                  shuffle_corpus_contentfiles_between_epochs=True,
                  # Optimizer hyper-parameters in micro-units
                  # (2000 micros == 0.002 initial learning rate).
                  adam_optimizer=model_pb2.AdamOptimizer(
                      initial_learning_rate_micros=2000,
                      learning_rate_decay_per_epoch_micros=50000,
                      beta_1_micros=900000,
                      beta_2_micros=999000,
                      normalized_gradient_clip_micros=5000000,
                  ),
              ),
          ),
          sampler=sampler_pb2.Sampler(
              start_text="kernel void ",
              batch_size=64,
              sequence_length=1024,
              temperature_micros=1000000,  # = 1.0 real value
              # Stop on balanced '{' / '}' depth, or after 20000 tokens.
              termination_criteria=[
                  sampler_pb2.SampleTerminationCriterion(
                      symtok=sampler_pb2.SymmetricalTokenDepth(
                          depth_increase_token="{",
                          depth_decrease_token="}",
                      )),
                  sampler_pb2.SampleTerminationCriterion(
                      maxlen=sampler_pb2.MaxTokenLength(
                          maximum_tokens_in_sample=20000, )),
              ],
          ),
      ), )
  db = grewe_features_db.Database(FLAGS.db)
  profile_dir = pathlib.Path(FLAGS.profile_dir)
  profile_dir.mkdir(parents=True, exist_ok=True)
  profiler = prof.AutoCsvProfiler(profile_dir)

  # Sample indefinitely; presumably Sample() trains on first use and writes
  # extracted features to `db` — confirm against its definition.
  with instance.Session(), multiprocessing.Pool() as pool:
    while True:
      Sample(instance, db, profiler, pool)