def CreateInstanceProtoFromFlags() -> clgen_pb2.Instance:
  """Build the CLgen instance proto selected by the command line flags.

  When FLAGS.clgen_instance is set, the proto is read from that file.
  Otherwise a proto is assembled from FLAGS.clgen_working_dir together with
  the results of CreateModelProtoFromFlags() and
  CreateSamplerProtoFromFlags().

  Returns:
    A clgen_pb2.Instance proto.
  """
  if not FLAGS.clgen_instance:
    # No instance file given: assemble the proto from the individual flags.
    return clgen_pb2.Instance(
      working_dir=FLAGS.clgen_working_dir,
      model=CreateModelProtoFromFlags(),
      sampler=CreateSamplerProtoFromFlags(),
    )

  instance_path = pathlib.Path(FLAGS.clgen_instance)
  return pbutil.FromFile(instance_path, clgen_pb2.Instance())
def ToProto(self) -> clgen_pb2.Instance:
  """Return a new Instance proto describing this instance.

  Returns:
    A clgen_pb2.Instance whose working_dir is the string form of
    self.working_dir, and whose model and sampler fields are copies of
    self.model.config and self.sampler.config.
  """
  proto = clgen_pb2.Instance()
  proto.working_dir = str(self.working_dir)
  # CopyFrom() so that the returned proto does not alias the live configs.
  proto.model.CopyFrom(self.model.config)
  proto.sampler.CopyFrom(self.sampler.config)
  return proto
def EnumerateLanguageInstanceConfigs(
  language: typing.Dict[str, typing.List[str]]
) -> typing.List[clgen_pb2.Instance]:
  """Enumerate the instance configs for a language.

  One config is produced for every combination of the language's corpuses,
  the models returned by EnumerateModels(), and the language's samplers.

  Args:
    language: A dictionary with "corpuses" and "samplers" keys, each mapping
      to a list of names. Every name is resolved to a "<name>.pbtxt" data
      file under the polyglot corpuses / samplers directories.

  Returns:
    A list of Instance protos, in itertools.product() order. Each one uses
    FLAGS.working_dir as its working directory.
  """
  # The same corpus / sampler file is shared by many combinations, so parse
  # each file at most once. Loading stays lazy (first use inside the loop) so
  # that no file is touched when the product is empty.
  corpus_protos = {}
  sampler_protos = {}

  configs = []
  for corpus, model, sampler in itertools.product(
      language["corpuses"], EnumerateModels(), language["samplers"]):
    instance_config = clgen_pb2.Instance()
    instance_config.working_dir = FLAGS.working_dir
    instance_config.model.CopyFrom(model)

    if corpus not in corpus_protos:
      corpus_protos[corpus] = pbutil.FromFile(
        bazelutil.DataPath(
          f"phd/experimental/deeplearning/polyglot/corpuses/{corpus}.pbtxt"
        ),
        corpus_pb2.Corpus(),
      )
    # CopyFrom() copies, so the cached proto is never shared between configs.
    instance_config.model.corpus.CopyFrom(corpus_protos[corpus])

    if sampler not in sampler_protos:
      sampler_protos[sampler] = pbutil.FromFile(
        bazelutil.DataPath(
          f"phd/experimental/deeplearning/polyglot/samplers/{sampler}.pbtxt"
        ),
        sampler_pb2.Sampler(),
      )
    instance_config.sampler.CopyFrom(sampler_protos[sampler])

    configs.append(instance_config)
  return configs
def ConfigFromFlags() -> clgen_pb2.Instance:
  """Load the instance config proto named by the --config flag.

  As a side effect, the PWD environment variable is set to the directory
  that contains the config file.

  Returns:
    The clgen_pb2.Instance proto read from the config file.

  Raises:
    app.UsageError: If --config is unset or empty, or does not name a file.
  """
  # Reject a missing flag explicitly; otherwise pathlib.Path(None) fails with
  # an unhelpful TypeError instead of a usage error.
  if not FLAGS.config:
    raise app.UsageError("Missing required argument: '--config'")

  config_path = pathlib.Path(FLAGS.config)
  if not config_path.is_file():
    raise app.UsageError(f"CLgen --config file not found: '{config_path}'")
  config = pbutil.FromFile(config_path, clgen_pb2.Instance())
  # NOTE(review): presumably lets paths inside the config be interpreted
  # relative to the config file's directory - confirm against the consumers.
  os.environ['PWD'] = str(config_path.parent)
  return config
def MakeClgenInstanceConfig(
  working_dir: pathlib.Path,
  encoded_db: encoded.EncodedContentFiles,
  num_training_epochs: int,
  seed_text: str,
  neurons_per_layer: int,
  num_layers: int,
) -> clgen_pb2.Instance:
  """Assemble a CLgen instance config proto.

  Args:
    working_dir: The directory to cache CLgen working files in.
    encoded_db: The encoded content files; its url is used as the corpus's
      pre-encoded corpus URL.
    num_training_epochs: The number of epochs to train for.
    seed_text: The text to initiate sampling with.
    neurons_per_layer: Number of neurons in a layer.
    num_layers: Number of layers in the network.

  Returns:
    A clgen_pb2.Instance proto describing a TensorFlow LSTM model and a
    sampler seeded with seed_text.
  """
  corpus = corpus_pb2.Corpus(pre_encoded_corpus_url=encoded_db.url)

  architecture = model_pb2.NetworkArchitecture(
    backend=model_pb2.NetworkArchitecture.TENSORFLOW,
    neuron_type=model_pb2.NetworkArchitecture.LSTM,
    neurons_per_layer=neurons_per_layer,
    num_layers=num_layers,
    post_layer_dropout_micros=0,
  )

  optimizer = model_pb2.AdamOptimizer(
    initial_learning_rate_micros=2000,
    learning_rate_decay_per_epoch_micros=50000,
    beta_1_micros=900000,
    beta_2_micros=999000,
    normalized_gradient_clip_micros=5000000,
  )
  training = model_pb2.TrainingOptions(
    num_epochs=num_training_epochs,
    sequence_length=64,
    batch_size=64,
    shuffle_corpus_contentfiles_between_epochs=True,
    adam_optimizer=optimizer,
  )

  # Two termination criteria: a symmetrical "{" / "}" token depth criterion,
  # and a maximum sample length in tokens.
  balanced_braces = sampler_pb2.SampleTerminationCriterion(
    symtok=sampler_pb2.SymmetricalTokenDepth(
      depth_increase_token="{",
      depth_decrease_token="}",
    ))
  max_length = sampler_pb2.SampleTerminationCriterion(
    maxlen=sampler_pb2.MaxTokenLength(maximum_tokens_in_sample=20000))
  sampler = sampler_pb2.Sampler(
    start_text=seed_text,
    batch_size=64,
    sequence_length=1024,
    temperature_micros=1000000,  # = 1.0 real value
    termination_criteria=[balanced_braces, max_length],
  )

  return clgen_pb2.Instance(
    working_dir=str(working_dir),
    model=model_pb2.Model(
      corpus=corpus,
      architecture=architecture,
      training=training,
    ),
    sampler=sampler,
  )
def abc_instance_config(clgen_cache_dir, abc_model_config,
                        abc_sampler_config) -> clgen_pb2.Instance:
  """Test fixture: an Instance config proto built from the other fixtures.

  The proto uses clgen_cache_dir as its working directory, abc_model_config
  as its model, and abc_sampler_config as its sampler.
  """
  instance_proto = clgen_pb2.Instance(
    working_dir=clgen_cache_dir,
    model=abc_model_config,
    sampler=abc_sampler_config,
  )
  return instance_proto
def test_main_stop_after_train(abc_instance_file):
  """Check that running main with --stop_after=train leaves a trained model."""
  # Reset any flags parsed earlier, then parse this test's command line.
  app.FLAGS.unparse_flags()
  command_line = [
    'argv[0]', '--config', abc_instance_file, '--stop_after', 'train'
  ]
  app.FLAGS(command_line)
  clgen.main([])

  # Re-load the instance from the same config file and inspect its model.
  config = pbutil.FromFile(
    pathlib.Path(abc_instance_file), clgen_pb2.Instance())
  instance = clgen.Instance(config)
  assert instance.model.is_trained
def test_main_stop_after_corpus(abc_instance_file):
  """Check that running main with --stop_after=corpus does not train."""
  # Reset any flags parsed earlier, then parse this test's command line.
  app.FLAGS.unparse_flags()
  command_line = [
    'argv[0]', '--config', abc_instance_file, '--stop_after', 'corpus'
  ]
  app.FLAGS(command_line)
  clgen.main([])

  # Re-load the instance from the same config file and inspect its model.
  config = pbutil.FromFile(
    pathlib.Path(abc_instance_file), clgen_pb2.Instance())
  instance = clgen.Instance(config)
  assert not instance.model.is_trained
def DoFlagsAction():
  """Carry out the action selected by the command line flags.

  Loads the instance config named by --config, sets the PWD environment
  variable to the config file's directory, and then, inside the instance's
  session, performs the first applicable action:

    1. --print_cache_path: print the corpus, model or sampler cache path.
    2. --print_preprocessed: print the text corpus.
    3. --stop_after=corpus: create the corpus.
    4. --stop_after=train: train the model and log its cache path.
    5. --export_model: train the model and copy its inference manifest files
       into the export directory, printing each destination path.
    6. Otherwise: sample the model.

  Raises:
    app.UsageError: If --config is missing or is not a file, or if
      --print_cache_path or --stop_after has an unrecognized value.
  """
  if not FLAGS.config:
    raise app.UsageError("Missing required argument: '--config'")
  config_path = pathlib.Path(FLAGS.config)
  if not config_path.is_file():
    raise app.UsageError(f"File not found: '{config_path}'")

  config = pbutil.FromFile(config_path, clgen_pb2.Instance())
  os.environ['PWD'] = str(config_path.parent)

  if FLAGS.clgen_profiling:
    prof.enable()

  instance = Instance(config)
  with instance.Session():
    # Lazily evaluated so that only the requested cache path is looked up.
    cache_path_getters = {
      'corpus': lambda: instance.model.corpus.cache.path,
      'model': lambda: instance.model.cache.path,
      'sampler': lambda: instance.model.SamplerCache(instance.sampler),
    }
    requested_cache = FLAGS.print_cache_path
    if requested_cache in cache_path_getters:
      print(cache_path_getters[requested_cache]())
      return
    if requested_cache:
      raise app.UsageError(
        f"Invalid --print_cache_path argument: '{FLAGS.print_cache_path}'"
      )

    if FLAGS.print_preprocessed:
      print(instance.model.corpus.GetTextCorpus(shuffle=False))
      return

    stop_after = FLAGS.stop_after
    if stop_after == 'corpus':
      instance.model.corpus.Create()
      return
    if stop_after == 'train':
      instance.model.Train()
      logging.info('Model: %s', instance.model.cache.path)
      return
    if stop_after:
      raise app.UsageError(
        f"Invalid --stop_after argument: '{FLAGS.stop_after}'")

    if FLAGS.export_model:
      instance.model.Train()
      export_dir = pathlib.Path(FLAGS.export_model)
      for src_path in instance.model.InferenceManifest():
        # Mirror the file's location relative to the model cache directory.
        relpath = pathlib.Path(
          os.path.relpath(src_path, instance.model.cache.path))
        dst_path = export_dir / relpath
        (export_dir / relpath.parent).mkdir(parents=True, exist_ok=True)
        shutil.copyfile(src_path, dst_path)
        print(dst_path)
      return

    # The default action is to sample the model.
    instance.model.Sample(instance.sampler, FLAGS.min_samples)
def test_main_stop_after_train(abc_instance_file):
  """Check that running main with stop_after="train" leaves a trained model."""
  # Reset flag state, then set the flags this test needs directly.
  FLAGS.unparse_flags()
  FLAGS(["argv0"])
  FLAGS.config = abc_instance_file
  FLAGS.stop_after = "train"
  clgen.main()

  # Re-load the instance from the same config file and inspect its model.
  config = pbutil.FromFile(
    pathlib.Path(abc_instance_file), clgen_pb2.Instance())
  instance = clgen.Instance(config)
  assert instance.model.is_trained
def test_main_stop_after_corpus(abc_instance_file):
  """Check that running main with stop_after="corpus" does not train."""
  # Reset flag state, then set the flags this test needs directly.
  FLAGS.unparse_flags()
  FLAGS(["argv0"])
  FLAGS.config = abc_instance_file
  FLAGS.stop_after = "corpus"
  clgen.main()

  # Re-load the instance from the same config file and inspect its model.
  config = pbutil.FromFile(
    pathlib.Path(abc_instance_file), clgen_pb2.Instance())
  instance = clgen.Instance(config)
  assert not instance.model.is_trained
def test_config_is_valid():
  """Check that an Instance can be constructed from the c99 test config."""
  with tempfile.TemporaryDirectory() as working_dir:
    config_path = bazelutil.DataPath(
      'phd/deeplearning/clgen/tests/data/c99/config.pbtxt')
    config = pbutil.FromFile(config_path, clgen_pb2.Instance())

    # Point the working directory at the temporary directory, and the corpus
    # at the test data sources resolved through bazel.
    config.working_dir = working_dir
    corpus_dir = bazelutil.DataPath(
      'phd/deeplearning/clgen/tests/data/c99/src/')
    config.model.corpus.local_directory = str(corpus_dir)

    # The test passes if construction does not raise.
    clgen.Instance(config)
def FromFile(cls, path: pathlib.Path) -> 'Instance':
  """Construct an instance from the Instance proto stored at `path`.

  Args:
    path: Path of the file holding a clgen_pb2.Instance proto.

  Returns:
    The result of calling cls with the proto read from `path`.
  """
  config = pbutil.FromFile(path, clgen_pb2.Instance())
  return cls(config)
def main(argv: typing.List[str]):
  """Main entry point.

  Builds a CLgen instance from a fixed model / sampler configuration (working
  directory and corpus directory come from flags), opens the features
  database and a CSV profiler, and then calls Sample() in an endless loop.

  Args:
    argv: The command line arguments. Anything beyond argv[0] is rejected.

  Raises:
    app.UsageError: If unknown arguments are given.
  """
  if len(argv) > 1:
    raise app.UsageError("Unknown arguments: '{}'.".format(" ".join(
      argv[1:])))

  working_dir = FLAGS.clgen_dir

  corpus = corpus_pb2.Corpus(
    local_directory=FLAGS.clgen_corpus_dir,
    ascii_character_atomizer=True,
    preprocessor=[
      "deeplearning.clgen.preprocessors.opencl:ClangPreprocessWithShim",
      "deeplearning.clgen.preprocessors.opencl:Compile",
      "deeplearning.clgen.preprocessors.opencl:NormalizeIdentifiers",
      "deeplearning.clgen.preprocessors.opencl:StripDoubleUnderscorePrefixes",
      "deeplearning.clgen.preprocessors.common:StripDuplicateEmptyLines",
      "deeplearning.clgen.preprocessors.opencl:SanitizeKernelPrototype",
      "deeplearning.clgen.preprocessors.common:StripTrailingWhitespace",
      "deeplearning.clgen.preprocessors.opencl:ClangFormat",
      "deeplearning.clgen.preprocessors.common:MinimumLineCount3",
      "deeplearning.clgen.preprocessors.opencl:Compile",
    ],
    contentfile_separator="\n\n",
  )

  architecture = model_pb2.NetworkArchitecture(
    backend=model_pb2.NetworkArchitecture.TENSORFLOW,
    neuron_type=model_pb2.NetworkArchitecture.LSTM,
    neurons_per_layer=512,
    num_layers=2,
    post_layer_dropout_micros=0,
  )

  training = model_pb2.TrainingOptions(
    num_epochs=50,
    sequence_length=64,
    batch_size=64,
    shuffle_corpus_contentfiles_between_epochs=True,
    adam_optimizer=model_pb2.AdamOptimizer(
      initial_learning_rate_micros=2000,
      learning_rate_decay_per_epoch_micros=50000,
      beta_1_micros=900000,
      beta_2_micros=999000,
      normalized_gradient_clip_micros=5000000,
    ),
  )

  # Two termination criteria: a symmetrical "{" / "}" token depth criterion,
  # and a maximum sample length in tokens.
  balanced_braces = sampler_pb2.SampleTerminationCriterion(
    symtok=sampler_pb2.SymmetricalTokenDepth(
      depth_increase_token="{",
      depth_decrease_token="}",
    ))
  max_length = sampler_pb2.SampleTerminationCriterion(
    maxlen=sampler_pb2.MaxTokenLength(maximum_tokens_in_sample=20000))
  sampler = sampler_pb2.Sampler(
    start_text="kernel void ",
    batch_size=64,
    sequence_length=1024,
    temperature_micros=1000000,  # = 1.0 real value
    termination_criteria=[balanced_braces, max_length],
  )

  instance_config = clgen_pb2.Instance(
    working_dir=working_dir,
    model=model_pb2.Model(
      corpus=corpus,
      architecture=architecture,
      training=training,
    ),
    sampler=sampler,
  )
  instance = clgen.Instance(instance_config)

  db = grewe_features_db.Database(FLAGS.db)

  profile_dir = pathlib.Path(FLAGS.profile_dir)
  profile_dir.mkdir(parents=True, exist_ok=True)
  profiler = prof.AutoCsvProfiler(profile_dir)

  with instance.Session(), multiprocessing.Pool() as pool:
    # Sample indefinitely; this loop has no exit condition of its own.
    while True:
      Sample(instance, db, profiler, pool)