Exemple #1
0
def CreateInstanceProtoFromFlags() -> clgen_pb2.Instance:
  """Build an Instance proto from the command line flags.

  If FLAGS.clgen_instance is set, the proto is read from that file.
  Otherwise a new proto is assembled from FLAGS.clgen_working_dir and the
  results of CreateModelProtoFromFlags() and CreateSamplerProtoFromFlags().

  Returns:
    The Instance proto.
  """
  if not FLAGS.clgen_instance:
    return clgen_pb2.Instance(
        working_dir=FLAGS.clgen_working_dir,
        model=CreateModelProtoFromFlags(),
        sampler=CreateSamplerProtoFromFlags(),
    )

  instance_path = pathlib.Path(FLAGS.clgen_instance)
  return pbutil.FromFile(instance_path, clgen_pb2.Instance())
Exemple #2
0
 def ToProto(self) -> clgen_pb2.Instance:
     """Return a new Instance proto describing this instance.

     The proto carries the working directory as a string, plus copies of
     the model's and the sampler's own config protos.
     """
     proto = clgen_pb2.Instance()
     proto.working_dir = str(self.working_dir)
     # CopyFrom() duplicates the sub-messages into the new proto.
     proto.model.CopyFrom(self.model.config)
     proto.sampler.CopyFrom(self.sampler.config)
     return proto
Exemple #3
0
def EnumerateLanguageInstanceConfigs(
    language: typing.Dict[str, typing.List[str]]
) -> typing.List[clgen_pb2.Instance]:
    """Build one Instance proto per (corpus, model, sampler) combination.

    Args:
      language: A mapping read for its "corpuses" and "samplers" entries.
        Each name in those lists selects a .pbtxt file under the polyglot
        corpuses/ or samplers/ data directory.

    Returns:
      A list of Instance protos, in itertools.product() order over the
      corpuses, the models from EnumerateModels(), and the samplers.
    """
    combinations = itertools.product(
        language["corpuses"], EnumerateModels(), language["samplers"])

    instances = []
    for corpus, model, sampler in combinations:
        proto = clgen_pb2.Instance()
        proto.working_dir = FLAGS.working_dir
        proto.model.CopyFrom(model)

        # Replace the model's corpus with the one named by this combination.
        corpus_path = bazelutil.DataPath(
            f"phd/experimental/deeplearning/polyglot/corpuses/{corpus}.pbtxt"
        )
        corpus_proto = pbutil.FromFile(corpus_path, corpus_pb2.Corpus())
        proto.model.corpus.CopyFrom(corpus_proto)

        sampler_path = bazelutil.DataPath(
            f"phd/experimental/deeplearning/polyglot/samplers/{sampler}.pbtxt"
        )
        sampler_proto = pbutil.FromFile(sampler_path, sampler_pb2.Sampler())
        proto.sampler.CopyFrom(sampler_proto)

        instances.append(proto)
    return instances
Exemple #4
0
def ConfigFromFlags() -> clgen_pb2.Instance:
  """Load the Instance proto from the file named by FLAGS.config.

  As a side effect, the PWD environment variable is set to the directory
  containing the config file.

  Returns:
    The Instance proto read from the config file.

  Raises:
    app.UsageError: If FLAGS.config does not name an existing file.
  """
  config_path = pathlib.Path(FLAGS.config)
  if config_path.is_file():
    instance_proto = pbutil.FromFile(config_path, clgen_pb2.Instance())
    os.environ['PWD'] = str(config_path.parent)
    return instance_proto
  raise app.UsageError(f"CLgen --config file not found: '{config_path}'")
Exemple #5
0
def MakeClgenInstanceConfig(
    working_dir: pathlib.Path,
    encoded_db: encoded.EncodedContentFiles,
    num_training_epochs: int,
    seed_text: str,
    neurons_per_layer: int,
    num_layers: int,
) -> clgen_pb2.Instance:
    """Construct a CLgen instance config.

    Args:
      working_dir: The directory to cache CLgen working files in.
      encoded_db: The encoded content files; its `url` is used as the
        pre-encoded corpus URL.
      num_training_epochs: The number of epochs to train for.
      seed_text: The text to initiate sampling with.
      neurons_per_layer: Number of neurons in a layer.
      num_layers: Number of layers in the network.

    Returns:
      An Instance proto with a TensorFlow LSTM model and a sampler that
      terminates on balanced braces or on a maximum token count.
    """
    corpus = corpus_pb2.Corpus(pre_encoded_corpus_url=encoded_db.url)

    architecture = model_pb2.NetworkArchitecture(
        backend=model_pb2.NetworkArchitecture.TENSORFLOW,
        neuron_type=model_pb2.NetworkArchitecture.LSTM,
        neurons_per_layer=neurons_per_layer,
        num_layers=num_layers,
        post_layer_dropout_micros=0,
    )

    optimizer = model_pb2.AdamOptimizer(
        initial_learning_rate_micros=2000,
        learning_rate_decay_per_epoch_micros=50000,
        beta_1_micros=900000,
        beta_2_micros=999000,
        normalized_gradient_clip_micros=5000000,
    )
    training = model_pb2.TrainingOptions(
        num_epochs=num_training_epochs,
        sequence_length=64,
        batch_size=64,
        shuffle_corpus_contentfiles_between_epochs=True,
        adam_optimizer=optimizer,
    )

    # Two termination criteria: matched "{" / "}" depth, and a token cap.
    brace_depth_criterion = sampler_pb2.SampleTerminationCriterion(
        symtok=sampler_pb2.SymmetricalTokenDepth(
            depth_increase_token="{",
            depth_decrease_token="}",
        ))
    max_length_criterion = sampler_pb2.SampleTerminationCriterion(
        maxlen=sampler_pb2.MaxTokenLength(maximum_tokens_in_sample=20000))
    sampler = sampler_pb2.Sampler(
        start_text=seed_text,
        batch_size=64,
        sequence_length=1024,
        temperature_micros=1000000,  # = 1.0 real value
        termination_criteria=[brace_depth_criterion, max_length_criterion],
    )

    model = model_pb2.Model(
        corpus=corpus,
        architecture=architecture,
        training=training,
    )
    return clgen_pb2.Instance(
        working_dir=str(working_dir),
        model=model,
        sampler=sampler,
    )
Exemple #6
0
def abc_instance_config(clgen_cache_dir, abc_model_config,
                        abc_sampler_config) -> clgen_pb2.Instance:
    """A test fixture that returns an Instance config proto.

    The proto uses clgen_cache_dir as its working directory and embeds the
    given model and sampler configs.
    """
    instance_proto = clgen_pb2.Instance(
        working_dir=clgen_cache_dir,
        model=abc_model_config,
        sampler=abc_sampler_config,
    )
    return instance_proto
Exemple #7
0
def test_main_stop_after_train(abc_instance_file):
    """Test that --stop_after train trains the model."""
    argv = ['argv[0]', '--config', abc_instance_file, '--stop_after', 'train']
    # Reset flag state before parsing this test's arguments.
    app.FLAGS.unparse_flags()
    app.FLAGS(argv)
    clgen.main([])

    # Re-create the instance from the same config file and check its model.
    instance_proto = pbutil.FromFile(
        pathlib.Path(abc_instance_file), clgen_pb2.Instance())
    instance = clgen.Instance(instance_proto)
    assert instance.model.is_trained
Exemple #8
0
def test_main_stop_after_corpus(abc_instance_file):
    """Test that --stop_after corpus prevents model training."""
    argv = ['argv[0]', '--config', abc_instance_file, '--stop_after', 'corpus']
    # Reset flag state before parsing this test's arguments.
    app.FLAGS.unparse_flags()
    app.FLAGS(argv)
    clgen.main([])

    # Re-create the instance from the same config file and check its model.
    instance_proto = pbutil.FromFile(
        pathlib.Path(abc_instance_file), clgen_pb2.Instance())
    instance = clgen.Instance(instance_proto)
    assert not instance.model.is_trained
Exemple #9
0
def DoFlagsAction():
    """Do the action requested by the command line flags."""
    if not FLAGS.config:
        raise app.UsageError("Missing required argument: '--config'")
    config_path = pathlib.Path(FLAGS.config)
    if not config_path.is_file():
        raise app.UsageError(f"File not found: '{config_path}'")
    config = pbutil.FromFile(config_path, clgen_pb2.Instance())
    os.environ['PWD'] = str(config_path.parent)

    if FLAGS.clgen_profiling:
        prof.enable()

    instance = Instance(config)
    with instance.Session():
        if FLAGS.print_cache_path == 'corpus':
            print(instance.model.corpus.cache.path)
            return
        elif FLAGS.print_cache_path == 'model':
            print(instance.model.cache.path)
            return
        elif FLAGS.print_cache_path == 'sampler':
            print(instance.model.SamplerCache(instance.sampler))
            return
        elif FLAGS.print_cache_path:
            raise app.UsageError(
                f"Invalid --print_cache_path argument: '{FLAGS.print_cache_path}'"
            )

        if FLAGS.print_preprocessed:
            print(instance.model.corpus.GetTextCorpus(shuffle=False))
            return

        # The default action is to sample the model.
        if FLAGS.stop_after == 'corpus':
            instance.model.corpus.Create()
        elif FLAGS.stop_after == 'train':
            instance.model.Train()
            logging.info('Model: %s', instance.model.cache.path)
        elif FLAGS.stop_after:
            raise app.UsageError(
                f"Invalid --stop_after argument: '{FLAGS.stop_after}'")
        elif FLAGS.export_model:
            instance.model.Train()
            export_dir = pathlib.Path(FLAGS.export_model)
            for path in instance.model.InferenceManifest():
                relpath = pathlib.Path(
                    os.path.relpath(path, instance.model.cache.path))
                (export_dir / relpath.parent).mkdir(parents=True,
                                                    exist_ok=True)
                shutil.copyfile(path, export_dir / relpath)
                print(export_dir / relpath)
        else:
            instance.model.Sample(instance.sampler, FLAGS.min_samples)
Exemple #10
0
def test_main_stop_after_train(abc_instance_file):
  """Test that --stop_after train trains the model."""
  # Reset flag state, then set the flags under test directly.
  FLAGS.unparse_flags()
  FLAGS(["argv0"])
  FLAGS.config = abc_instance_file
  FLAGS.stop_after = "train"
  clgen.main()

  # Re-create the instance from the same config file and check its model.
  instance_path = pathlib.Path(abc_instance_file)
  instance_proto = pbutil.FromFile(instance_path, clgen_pb2.Instance())
  instance = clgen.Instance(instance_proto)
  assert instance.model.is_trained
Exemple #11
0
def test_main_stop_after_corpus(abc_instance_file):
  """Test that --stop_after corpus prevents model training."""
  # Reset flag state, then set the flags under test directly.
  FLAGS.unparse_flags()
  FLAGS(["argv0"])
  FLAGS.config = abc_instance_file
  FLAGS.stop_after = "corpus"
  clgen.main()

  # Re-create the instance from the same config file and check its model.
  instance_path = pathlib.Path(abc_instance_file)
  instance_proto = pbutil.FromFile(instance_path, clgen_pb2.Instance())
  instance = clgen.Instance(instance_proto)
  assert not instance.model.is_trained
Exemple #12
0
def test_config_is_valid():
    """Test that the c99 config proto can be used to construct an Instance."""
    with tempfile.TemporaryDirectory() as d:
        config_path = bazelutil.DataPath(
            'phd/deeplearning/clgen/tests/data/c99/config.pbtxt')
        config = pbutil.FromFile(config_path, clgen_pb2.Instance())

        # Change the working directory and corpus path to our bazel run dir.
        config.working_dir = d
        corpus_dir = bazelutil.DataPath(
            'phd/deeplearning/clgen/tests/data/c99/src/')
        config.model.corpus.local_directory = str(corpus_dir)

        # The test passes if construction does not raise.
        clgen.Instance(config)
Exemple #13
0
 def FromFile(cls, path: pathlib.Path) -> 'Instance':
     """Construct an instance from the Instance proto stored at `path`."""
     instance_proto = pbutil.FromFile(path, clgen_pb2.Instance())
     return cls(instance_proto)
Exemple #14
0
def main(argv: typing.List[str]):
    """Main entry point.

    Builds a CLgen instance from a hard-coded model and sampler
    configuration (with directories taken from FLAGS.clgen_dir and
    FLAGS.clgen_corpus_dir), opens the database named by FLAGS.db, and then
    calls Sample() in an endless loop inside the instance session.

    Args:
      argv: The command line arguments; only the program name is accepted.

    Raises:
      app.UsageError: If argv holds anything beyond its first element.
    """
    if len(argv) > 1:
        unknown_args = " ".join(argv[1:])
        raise app.UsageError("Unknown arguments: '{}'.".format(unknown_args))

    # Preprocessors are listed in the order given to the corpus config;
    # note that the Compile step appears both early and last.
    preprocessors = [
        "deeplearning.clgen.preprocessors.opencl:ClangPreprocessWithShim",
        "deeplearning.clgen.preprocessors.opencl:Compile",
        "deeplearning.clgen.preprocessors.opencl:NormalizeIdentifiers",
        "deeplearning.clgen.preprocessors.opencl:StripDoubleUnderscorePrefixes",
        "deeplearning.clgen.preprocessors.common:StripDuplicateEmptyLines",
        "deeplearning.clgen.preprocessors.opencl:SanitizeKernelPrototype",
        "deeplearning.clgen.preprocessors.common:StripTrailingWhitespace",
        "deeplearning.clgen.preprocessors.opencl:ClangFormat",
        "deeplearning.clgen.preprocessors.common:MinimumLineCount3",
        "deeplearning.clgen.preprocessors.opencl:Compile",
    ]
    corpus = corpus_pb2.Corpus(
        local_directory=FLAGS.clgen_corpus_dir,
        ascii_character_atomizer=True,
        preprocessor=preprocessors,
        contentfile_separator="\n\n",
    )

    architecture = model_pb2.NetworkArchitecture(
        backend=model_pb2.NetworkArchitecture.TENSORFLOW,
        neuron_type=model_pb2.NetworkArchitecture.LSTM,
        neurons_per_layer=512,
        num_layers=2,
        post_layer_dropout_micros=0,
    )

    optimizer = model_pb2.AdamOptimizer(
        initial_learning_rate_micros=2000,
        learning_rate_decay_per_epoch_micros=50000,
        beta_1_micros=900000,
        beta_2_micros=999000,
        normalized_gradient_clip_micros=5000000,
    )
    training = model_pb2.TrainingOptions(
        num_epochs=50,
        sequence_length=64,
        batch_size=64,
        shuffle_corpus_contentfiles_between_epochs=True,
        adam_optimizer=optimizer,
    )

    # Two termination criteria: matched "{" / "}" depth, and a token cap.
    brace_depth_criterion = sampler_pb2.SampleTerminationCriterion(
        symtok=sampler_pb2.SymmetricalTokenDepth(
            depth_increase_token="{",
            depth_decrease_token="}",
        ))
    max_length_criterion = sampler_pb2.SampleTerminationCriterion(
        maxlen=sampler_pb2.MaxTokenLength(maximum_tokens_in_sample=20000))
    sampler = sampler_pb2.Sampler(
        start_text="kernel void ",
        batch_size=64,
        sequence_length=1024,
        temperature_micros=1000000,  # = 1.0 real value
        termination_criteria=[brace_depth_criterion, max_length_criterion],
    )

    model = model_pb2.Model(
        corpus=corpus,
        architecture=architecture,
        training=training,
    )
    instance_proto = clgen_pb2.Instance(
        working_dir=FLAGS.clgen_dir,
        model=model,
        sampler=sampler,
    )
    instance = clgen.Instance(instance_proto)

    db = grewe_features_db.Database(FLAGS.db)
    profile_dir = pathlib.Path(FLAGS.profile_dir)
    profile_dir.mkdir(parents=True, exist_ok=True)
    profiler = prof.AutoCsvProfiler(profile_dir)

    with instance.Session(), multiprocessing.Pool() as pool:
        # Runs until the process is interrupted or Sample() raises.
        while True:
            Sample(instance, db, profiler, pool)