Ejemplo n.º 1
0
def CreateSamplerProtoFromFlags() -> sampler_pb2.Sampler:
  """Build a Sampler proto from the values of the clgen_* FLAGS.

  The sampler always carries a symmetrical token depth termination
  criterion over "{" and "}". If FLAGS.clgen_max_sample_length is truthy
  (i.e. set and non-zero), a maximum token length criterion with that
  value is appended as well.

  Returns:
    A sampler_pb2.Sampler message populated from FLAGS.
  """
  sampler = sampler_pb2.Sampler(
    start_text=FLAGS.clgen_seed_text,
    batch_size=FLAGS.clgen_sample_batch_size,
    sequence_length=FLAGS.clgen_sample_sequence_length,
    # round() rather than int(): int() truncates toward zero, so a float
    # product that lands just below a whole number (binary floating point
    # cannot represent most decimal fractions exactly, e.g. 2.05) would
    # silently lose one micro.
    temperature_micros=round(FLAGS.clgen_sample_temperature * 1000000),
    termination_criteria=[
      sampler_pb2.SampleTerminationCriterion(
        symtok=sampler_pb2.SymmetricalTokenDepth(
          depth_increase_token="{", depth_decrease_token="}",
        )
      ),
    ],
  )
  # The maximum length criterion is optional; a falsy flag value skips it.
  if FLAGS.clgen_max_sample_length:
    sampler.termination_criteria.extend(
      [
        sampler_pb2.SampleTerminationCriterion(
          maxlen=sampler_pb2.MaxTokenLength(
            maximum_tokens_in_sample=FLAGS.clgen_max_sample_length,
          )
        ),
      ]
    )
  return sampler
Ejemplo n.º 2
0
def abc_sampler_config():
  """Return a Sampler proto with start text 'a' and a 5-token maxlen criterion."""
  # The only termination criterion is a maximum sample length of 5 tokens.
  stop_after_five = sampler_pb2.SampleTerminationCriterion(
      maxlen=sampler_pb2.MaxTokenLength(maximum_tokens_in_sample=5))
  config = sampler_pb2.Sampler(
      start_text='a',
      batch_size=5,
      temperature_micros=1000000,
      termination_criteria=[stop_after_five],
  )
  return config
Ejemplo n.º 3
0
def MakeClgenInstanceConfig(
    working_dir: pathlib.Path,
    encoded_db: encoded.EncodedContentFiles,
    num_training_epochs: int,
    seed_text: str,
    neurons_per_layer: int,
    num_layers: int,
) -> clgen_pb2.Instance:
    """Assemble the config proto for a CLgen instance.

    Args:
      working_dir: Path stored (as a string) in the instance's working_dir.
      encoded_db: Encoded content files; its `url` attribute is used as the
        corpus' pre_encoded_corpus_url.
      num_training_epochs: The num_epochs value of the training options.
      seed_text: The start_text of the sampler.
      neurons_per_layer: The neurons_per_layer value of the architecture.
      num_layers: The num_layers value of the architecture.

    Returns:
      A clgen_pb2.Instance holding the model and sampler configuration.
    """
    # Model: corpus + TensorFlow LSTM architecture + Adam training options.
    corpus = corpus_pb2.Corpus(pre_encoded_corpus_url=encoded_db.url)
    architecture = model_pb2.NetworkArchitecture(
        backend=model_pb2.NetworkArchitecture.TENSORFLOW,
        neuron_type=model_pb2.NetworkArchitecture.LSTM,
        neurons_per_layer=neurons_per_layer,
        num_layers=num_layers,
        post_layer_dropout_micros=0,
    )
    optimizer = model_pb2.AdamOptimizer(
        initial_learning_rate_micros=2000,
        learning_rate_decay_per_epoch_micros=50000,
        beta_1_micros=900000,
        beta_2_micros=999000,
        normalized_gradient_clip_micros=5000000,
    )
    training = model_pb2.TrainingOptions(
        num_epochs=num_training_epochs,
        sequence_length=64,
        batch_size=64,
        shuffle_corpus_contentfiles_between_epochs=True,
        adam_optimizer=optimizer,
    )
    model = model_pb2.Model(
        corpus=corpus,
        architecture=architecture,
        training=training,
    )

    # Sampler: a "{" / "}" token depth criterion, plus a 20000-token cap.
    bracket_depth = sampler_pb2.SampleTerminationCriterion(
        symtok=sampler_pb2.SymmetricalTokenDepth(
            depth_increase_token="{",
            depth_decrease_token="}",
        ))
    token_cap = sampler_pb2.SampleTerminationCriterion(
        maxlen=sampler_pb2.MaxTokenLength(maximum_tokens_in_sample=20000))
    sampler = sampler_pb2.Sampler(
        start_text=seed_text,
        batch_size=64,
        sequence_length=1024,
        temperature_micros=1000000,  # = 1.0 real value
        termination_criteria=[bracket_depth, token_cap],
    )

    return clgen_pb2.Instance(
        working_dir=str(working_dir),
        model=model,
        sampler=sampler,
    )
Ejemplo n.º 4
0
def test_MaxlenTerminationCriterion_SampleIsComplete():
    """SampleIsComplete() is false below the 3-token limit, true from it on."""
    limit = sampler_pb2.MaxTokenLength(maximum_tokens_in_sample=3)
    criterion = samplers.MaxlenTerminationCriterion(limit)
    tokens = ["a", "b", "c", "d", "e"]
    # Prefixes of length 0, 1 and 2 are still incomplete.
    for count in range(3):
        assert not criterion.SampleIsComplete(tokens[:count])
    # Prefixes of length 3, 4 and 5 have reached the limit.
    for count in range(3, 6):
        assert criterion.SampleIsComplete(tokens[:count])
Ejemplo n.º 5
0
def test_MaxlenTerminationCriterion_SampleIsComplete():
    """Check SampleIsComplete() against samples around a 3-token limit."""
    criterion = samplers.MaxlenTerminationCriterion(
        sampler_pb2.MaxTokenLength(maximum_tokens_in_sample=3))
    too_short = ([], ['a'], ['a', 'b'])
    long_enough = (
        ['a', 'b', 'c'],
        ['a', 'b', 'c', 'd'],
        ['a', 'b', 'c', 'd', 'e'],
    )
    for sample in too_short:
        assert not criterion.SampleIsComplete(sample)
    for sample in long_enough:
        assert criterion.SampleIsComplete(sample)
Ejemplo n.º 6
0
def test_MaxlenTerminationCriterion_invalid_maximum_tokens_in_sample():
    """A UserError is raised when maximum_tokens_in_sample is unset or zero."""
    expected_message = "MaxTokenLength.maximum_tokens_in_sample must be > 0"
    config = sampler_pb2.MaxTokenLength()

    # Case 1: the field was never assigned.
    with test.Raises(errors.UserError) as unset_error:
        samplers.MaxlenTerminationCriterion(config)
    assert str(unset_error.value) == expected_message

    # Case 2: the field is explicitly set to zero.
    config.maximum_tokens_in_sample = 0
    with test.Raises(errors.UserError) as zero_error:
        samplers.MaxlenTerminationCriterion(config)
    assert str(zero_error.value) == expected_message
Ejemplo n.º 7
0
    def Train(self, *args, **kwargs) -> None:
        """Train the model, passing it a sampler capped at 512 tokens.

        All positional and keyword arguments are forwarded to
        self.model.Train(), together with an extra `test_sampler` keyword.
        """
        with self.Session():
            # Work on a copy so that self.sampler.config is left untouched.
            fixed_length_config = sampler_pb2.Sampler()
            fixed_length_config.CopyFrom(self.sampler.config)

            # Replace every termination criterion with a single 512-token
            # maximum, so that all test samples share the same length.
            only_maxlen = [
                sampler_pb2.SampleTerminationCriterion(
                    maxlen=sampler_pb2.MaxTokenLength(
                        maximum_tokens_in_sample=512)),
            ]
            del fixed_length_config.termination_criteria[:]
            fixed_length_config.termination_criteria.extend(only_maxlen)

            # The injected `test_sampler` lets samples be created during
            # training.
            self.model.Train(
                *args,
                test_sampler=samplers.Sampler(fixed_length_config),
                **kwargs)
Ejemplo n.º 8
0
def main(argv: typing.List[str]):
    """Build a CLgen instance from flags and sample from it in a loop.

    Args:
      argv: Command line arguments; anything after argv[0] is rejected.

    Raises:
      app.UsageError: If any argument beyond argv[0] is supplied.
    """
    extra_args = argv[1:]
    if extra_args:
        message = "Unknown arguments: '{}'.".format(" ".join(extra_args))
        raise app.UsageError(message)

    working_dir = FLAGS.clgen_dir

    # Corpus: the preprocessors are listed in the order given to the proto.
    preprocessors = [
        "deeplearning.clgen.preprocessors.opencl:ClangPreprocessWithShim",
        "deeplearning.clgen.preprocessors.opencl:Compile",
        "deeplearning.clgen.preprocessors.opencl:NormalizeIdentifiers",
        "deeplearning.clgen.preprocessors.opencl:StripDoubleUnderscorePrefixes",
        "deeplearning.clgen.preprocessors.common:StripDuplicateEmptyLines",
        "deeplearning.clgen.preprocessors.opencl:SanitizeKernelPrototype",
        "deeplearning.clgen.preprocessors.common:StripTrailingWhitespace",
        "deeplearning.clgen.preprocessors.opencl:ClangFormat",
        "deeplearning.clgen.preprocessors.common:MinimumLineCount3",
        "deeplearning.clgen.preprocessors.opencl:Compile",
    ]
    corpus = corpus_pb2.Corpus(
        local_directory=FLAGS.clgen_corpus_dir,
        ascii_character_atomizer=True,
        preprocessor=preprocessors,
        contentfile_separator="\n\n",
    )

    # Model: two-layer, 512-neuron TensorFlow LSTM trained with Adam.
    architecture = model_pb2.NetworkArchitecture(
        backend=model_pb2.NetworkArchitecture.TENSORFLOW,
        neuron_type=model_pb2.NetworkArchitecture.LSTM,
        neurons_per_layer=512,
        num_layers=2,
        post_layer_dropout_micros=0,
    )
    optimizer = model_pb2.AdamOptimizer(
        initial_learning_rate_micros=2000,
        learning_rate_decay_per_epoch_micros=50000,
        beta_1_micros=900000,
        beta_2_micros=999000,
        normalized_gradient_clip_micros=5000000,
    )
    training = model_pb2.TrainingOptions(
        num_epochs=50,
        sequence_length=64,
        batch_size=64,
        shuffle_corpus_contentfiles_between_epochs=True,
        adam_optimizer=optimizer,
    )
    model = model_pb2.Model(
        corpus=corpus,
        architecture=architecture,
        training=training,
    )

    # Sampler: a "{" / "}" token depth criterion, plus a 20000-token cap.
    bracket_depth = sampler_pb2.SampleTerminationCriterion(
        symtok=sampler_pb2.SymmetricalTokenDepth(
            depth_increase_token="{",
            depth_decrease_token="}",
        ))
    token_cap = sampler_pb2.SampleTerminationCriterion(
        maxlen=sampler_pb2.MaxTokenLength(maximum_tokens_in_sample=20000))
    sampler = sampler_pb2.Sampler(
        start_text="kernel void ",
        batch_size=64,
        sequence_length=1024,
        temperature_micros=1000000,  # = 1.0 real value
        termination_criteria=[bracket_depth, token_cap],
    )

    config = clgen_pb2.Instance(
        working_dir=working_dir,
        model=model,
        sampler=sampler,
    )
    instance = clgen.Instance(config)

    db = grewe_features_db.Database(FLAGS.db)

    # Make sure the profiling directory exists before the profiler uses it.
    profile_dir = pathlib.Path(FLAGS.profile_dir)
    profile_dir.mkdir(parents=True, exist_ok=True)
    profiler = prof.AutoCsvProfiler(profile_dir)

    with instance.Session(), multiprocessing.Pool() as pool:
        # No exit condition here: Sample() is called until the process is
        # interrupted or an exception propagates.
        while True:
            Sample(instance, db, profiler, pool)