def CreateSamplerProtoFromFlags() -> sampler_pb2.Sampler:
  """Construct a Sampler proto from command-line flag values."""
  sampler = sampler_pb2.Sampler(
      start_text=FLAGS.clgen_seed_text,
      batch_size=FLAGS.clgen_sample_batch_size,
      sequence_length=FLAGS.clgen_sample_sequence_length,
      temperature_micros=int(FLAGS.clgen_sample_temperature * 1000000),
      termination_criteria=[
          sampler_pb2.SampleTerminationCriterion(
              symtok=sampler_pb2.SymmetricalTokenDepth(
                  depth_increase_token="{",
                  depth_decrease_token="}",
              )
          ),
      ],
  )
  if FLAGS.clgen_max_sample_length:
    sampler.termination_criteria.extend(
        [
            sampler_pb2.SampleTerminationCriterion(
                maxlen=sampler_pb2.MaxTokenLength(
                    maximum_tokens_in_sample=FLAGS.clgen_max_sample_length,
                )
            ),
        ]
    )
  return sampler
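# The flags consumed above are defined elsewhere in the codebase. Below is a
# minimal sketch of the absl-style definitions they imply; the default values
# are illustrative assumptions, not the project's actual defaults.
from absl import flags

FLAGS = flags.FLAGS

flags.DEFINE_string("clgen_seed_text", "kernel void ",
                    "Seed text to initiate sampling with.")
flags.DEFINE_integer("clgen_sample_batch_size", 64,
                     "Number of samples per sampling batch.")
flags.DEFINE_integer("clgen_sample_sequence_length", 1024,
                     "Sampler sequence length.")
flags.DEFINE_float("clgen_sample_temperature", 1.0,
                   "Sampling temperature (stored as micros in the proto).")
flags.DEFINE_integer("clgen_max_sample_length", 0,
                     "If set, cap the number of tokens per sample.")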
def MakeClgenInstanceConfig(
    working_dir: pathlib.Path,
    encoded_db: encoded.EncodedContentFiles,
    num_training_epochs: int,
    seed_text: str,
    neurons_per_layer: int,
    num_layers: int,
) -> clgen_pb2.Instance:
  """Construct a CLgen instance.

  Args:
    working_dir: The directory to cache CLgen working files in.
    encoded_db: The database of encoded content files.
    num_training_epochs: The number of epochs to train for.
    seed_text: The text to initiate sampling with.
    neurons_per_layer: The number of neurons per layer.
    num_layers: The number of layers in the network.

  Returns:
    An Instance proto.
  """
  return clgen_pb2.Instance(
      working_dir=str(working_dir),
      model=model_pb2.Model(
          corpus=corpus_pb2.Corpus(pre_encoded_corpus_url=encoded_db.url),
          architecture=model_pb2.NetworkArchitecture(
              backend=model_pb2.NetworkArchitecture.TENSORFLOW,
              neuron_type=model_pb2.NetworkArchitecture.LSTM,
              neurons_per_layer=neurons_per_layer,
              num_layers=num_layers,
              post_layer_dropout_micros=0,
          ),
          training=model_pb2.TrainingOptions(
              num_epochs=num_training_epochs,
              sequence_length=64,
              batch_size=64,
              shuffle_corpus_contentfiles_between_epochs=True,
              adam_optimizer=model_pb2.AdamOptimizer(
                  initial_learning_rate_micros=2000,
                  learning_rate_decay_per_epoch_micros=50000,
                  beta_1_micros=900000,
                  beta_2_micros=999000,
                  normalized_gradient_clip_micros=5000000,
              ),
          ),
      ),
      sampler=sampler_pb2.Sampler(
          start_text=seed_text,
          batch_size=64,
          sequence_length=1024,
          temperature_micros=1000000,  # = 1.0 real value
          termination_criteria=[
              sampler_pb2.SampleTerminationCriterion(
                  symtok=sampler_pb2.SymmetricalTokenDepth(
                      depth_increase_token="{",
                      depth_decrease_token="}",
                  )),
              sampler_pb2.SampleTerminationCriterion(
                  maxlen=sampler_pb2.MaxTokenLength(
                      maximum_tokens_in_sample=20000,
                  )),
          ],
      ),
  )
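# A usage sketch for MakeClgenInstanceConfig(). The database URL and working
# directory below are hypothetical, and encoded.EncodedContentFiles is assumed
# to be constructible from a database URL (encoded_db.url is read above).
def _ExampleInstanceConfig() -> clgen_pb2.Instance:
  encoded_db = encoded.EncodedContentFiles("sqlite:////tmp/encoded.db")
  return MakeClgenInstanceConfig(
      working_dir=pathlib.Path("/tmp/clgen_cache"),
      encoded_db=encoded_db,
      num_training_epochs=50,
      seed_text="kernel void ",
      neurons_per_layer=512,
      num_layers=2,
  )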
def abc_sampler_config() -> sampler_pb2.Sampler:
  """The sampler config for a simple Sampler."""
  maxlen = sampler_pb2.MaxTokenLength(maximum_tokens_in_sample=5)
  sample_stop = [sampler_pb2.SampleTerminationCriterion(maxlen=maxlen)]
  return sampler_pb2.Sampler(
      start_text="a",
      batch_size=5,
      termination_criteria=sample_stop,
      temperature_micros=1000000,
  )
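# Usage sketch: instantiating a Sampler from the config above. That
# samplers.Sampler() accepts a sampler_pb2.Sampler proto follows from its use
# in Train() below.
config = abc_sampler_config()
sampler = samplers.Sampler(config)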
def Train(self, *args, **kwargs) -> None:
  """Train the model, generating samples from a test sampler as it trains."""
  with self.Session():
    test_sampler_config = sampler_pb2.Sampler()
    test_sampler_config.CopyFrom(self.sampler.config)
    # Make all test samples the same 512-token length.
    del test_sampler_config.termination_criteria[:]
    test_sampler_config.termination_criteria.extend([
        sampler_pb2.SampleTerminationCriterion(
            maxlen=sampler_pb2.MaxTokenLength(maximum_tokens_in_sample=512)),
    ])
    test_sampler = samplers.Sampler(test_sampler_config)
    # We inject the `test_sampler` argument so that we can create samples
    # during training.
    self.model.Train(*args, test_sampler=test_sampler, **kwargs)
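# A standalone illustration of the protobuf pattern used in Train():
# CopyFrom() deep-copies a message, `del repeated_field[:]` clears a repeated
# field, and extend() appends replacement messages. This reuses
# abc_sampler_config() from above purely for demonstration.
config = sampler_pb2.Sampler()
config.CopyFrom(abc_sampler_config())  # Independent copy; source is unchanged.
del config.termination_criteria[:]  # Drop the copied criteria.
config.termination_criteria.extend([
    sampler_pb2.SampleTerminationCriterion(
        maxlen=sampler_pb2.MaxTokenLength(maximum_tokens_in_sample=512)),
])
assert len(config.termination_criteria) == 1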
def main(argv: typing.List[str]):
  """Main entry point."""
  if len(argv) > 1:
    raise app.UsageError(
        "Unknown arguments: '{}'.".format(" ".join(argv[1:])))
  instance = clgen.Instance(
      clgen_pb2.Instance(
          working_dir=FLAGS.clgen_dir,
          model=model_pb2.Model(
              corpus=corpus_pb2.Corpus(
                  local_directory=FLAGS.clgen_corpus_dir,
                  ascii_character_atomizer=True,
                  preprocessor=[
                      "deeplearning.clgen.preprocessors.opencl:ClangPreprocessWithShim",
                      "deeplearning.clgen.preprocessors.opencl:Compile",
                      "deeplearning.clgen.preprocessors.opencl:NormalizeIdentifiers",
                      "deeplearning.clgen.preprocessors.opencl:StripDoubleUnderscorePrefixes",
                      "deeplearning.clgen.preprocessors.common:StripDuplicateEmptyLines",
                      "deeplearning.clgen.preprocessors.opencl:SanitizeKernelPrototype",
                      "deeplearning.clgen.preprocessors.common:StripTrailingWhitespace",
                      "deeplearning.clgen.preprocessors.opencl:ClangFormat",
                      "deeplearning.clgen.preprocessors.common:MinimumLineCount3",
                      "deeplearning.clgen.preprocessors.opencl:Compile",
                  ],
                  contentfile_separator="\n\n",
              ),
              architecture=model_pb2.NetworkArchitecture(
                  backend=model_pb2.NetworkArchitecture.TENSORFLOW,
                  neuron_type=model_pb2.NetworkArchitecture.LSTM,
                  neurons_per_layer=512,
                  num_layers=2,
                  post_layer_dropout_micros=0,
              ),
              training=model_pb2.TrainingOptions(
                  num_epochs=50,
                  sequence_length=64,
                  batch_size=64,
                  shuffle_corpus_contentfiles_between_epochs=True,
                  adam_optimizer=model_pb2.AdamOptimizer(
                      initial_learning_rate_micros=2000,
                      learning_rate_decay_per_epoch_micros=50000,
                      beta_1_micros=900000,
                      beta_2_micros=999000,
                      normalized_gradient_clip_micros=5000000,
                  ),
              ),
          ),
          sampler=sampler_pb2.Sampler(
              start_text="kernel void ",
              batch_size=64,
              sequence_length=1024,
              temperature_micros=1000000,  # = 1.0 real value
              termination_criteria=[
                  sampler_pb2.SampleTerminationCriterion(
                      symtok=sampler_pb2.SymmetricalTokenDepth(
                          depth_increase_token="{",
                          depth_decrease_token="}",
                      )),
                  sampler_pb2.SampleTerminationCriterion(
                      maxlen=sampler_pb2.MaxTokenLength(
                          maximum_tokens_in_sample=20000,
                      )),
              ],
          ),
      ),
  )
  db = grewe_features_db.Database(FLAGS.db)
  profile_dir = pathlib.Path(FLAGS.profile_dir)
  profile_dir.mkdir(parents=True, exist_ok=True)
  profiler = prof.AutoCsvProfiler(profile_dir)
  with instance.Session(), multiprocessing.Pool() as pool:
    while True:
      Sample(instance, db, profiler, pool)
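# Continuing the absl-style sketch from after CreateSamplerProtoFromFlags():
# hypothetical definitions for the flags main() reads. Names match the FLAGS
# attributes used above; the defaults are illustrative assumptions.
flags.DEFINE_string("clgen_dir", "~/.cache/clgen",
                    "Directory for CLgen working files.")
flags.DEFINE_string("clgen_corpus_dir", "/tmp/corpus",
                    "Directory of training content files.")
flags.DEFINE_string("db", "sqlite:////tmp/grewe_features.db",
                    "URL of the features database.")
flags.DEFINE_string("profile_dir", "/tmp/profiles",
                    "Directory to write CSV profiling data to.")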