Example #1
def CreateSamplerProtoFromFlags() -> sampler_pb2.Sampler:
  sampler = sampler_pb2.Sampler(
    start_text=FLAGS.clgen_seed_text,
    batch_size=FLAGS.clgen_sample_batch_size,
    sequence_length=FLAGS.clgen_sample_sequence_length,
    temperature_micros=int(FLAGS.clgen_sample_temperature * 1000000),
    termination_criteria=[
      sampler_pb2.SampleTerminationCriterion(
        symtok=sampler_pb2.SymmetricalTokenDepth(
          depth_increase_token="{", depth_decrease_token="}",
        )
      ),
    ],
  )
  if FLAGS.clgen_max_sample_length:
    sampler.termination_criteria.extend(
      [
        sampler_pb2.SampleTerminationCriterion(
          maxlen=sampler_pb2.MaxTokenLength(
            maximum_tokens_in_sample=FLAGS.clgen_max_sample_length,
          )
        ),
      ]
    )
  return sampler
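
For reference, the FLAGS values read above must be defined elsewhere in the module. Below is a minimal sketch of matching definitions, assuming absl-style flags (the flag names come from the code above; the defaults are illustrative only):

from absl import flags

FLAGS = flags.FLAGS

flags.DEFINE_string("clgen_seed_text", "kernel void ", "Seed text for sampling.")
flags.DEFINE_integer("clgen_sample_batch_size", 64, "Sampling batch size.")
flags.DEFINE_integer(
  "clgen_sample_sequence_length", 1024, "Sampling sequence length.")
flags.DEFINE_float("clgen_sample_temperature", 1.0, "Sampling temperature.")
flags.DEFINE_integer(
  "clgen_max_sample_length", 20000,
  "If set, the maximum number of tokens in a sample.")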
Example #2
def MakeClgenInstanceConfig(
    working_dir: pathlib.Path,
    encoded_db: encoded.EncodedContentFiles,
    num_training_epochs: int,
    seed_text: str,
    neurons_per_layer: int,
    num_layers: int,
) -> clgen_pb2.Instance:
    """Construct a CLgen instance.

  Args:
    working_dir: The directory to cache CLgen working files in.
    encoded_db: The directory of encoded content files.
    num_training_epochs: The number of epochs to train for.
    seed_text: The text to initiate sampling with.
    neurons_per_layer: Number of neurons in a layer.
  """
    return clgen_pb2.Instance(
        working_dir=str(working_dir),
        model=model_pb2.Model(
            corpus=corpus_pb2.Corpus(pre_encoded_corpus_url=encoded_db.url),
            architecture=model_pb2.NetworkArchitecture(
                backend=model_pb2.NetworkArchitecture.TENSORFLOW,
                neuron_type=model_pb2.NetworkArchitecture.LSTM,
                neurons_per_layer=neurons_per_layer,
                num_layers=num_layers,
                post_layer_dropout_micros=0,
            ),
            training=model_pb2.TrainingOptions(
                num_epochs=num_training_epochs,
                sequence_length=64,
                batch_size=64,
                shuffle_corpus_contentfiles_between_epochs=True,
                adam_optimizer=model_pb2.AdamOptimizer(
                    initial_learning_rate_micros=2000,
                    learning_rate_decay_per_epoch_micros=50000,
                    beta_1_micros=900000,
                    beta_2_micros=999000,
                    normalized_gradient_clip_micros=5000000,
                ),
            ),
        ),
        sampler=sampler_pb2.Sampler(
            start_text=seed_text,
            batch_size=64,
            sequence_length=1024,
            temperature_micros=1000000,  # = 1.0 real value
            termination_criteria=[
                sampler_pb2.SampleTerminationCriterion(
                    symtok=sampler_pb2.SymmetricalTokenDepth(
                        depth_increase_token="{",
                        depth_decrease_token="}",
                    )),
                sampler_pb2.SampleTerminationCriterion(
                    maxlen=sampler_pb2.MaxTokenLength(
                        maximum_tokens_in_sample=20000)),
            ],
        ),
    )
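
A note on the *_micros fields used throughout these protos: real-valued hyperparameters are stored as integers scaled by one million, so temperature_micros=1000000 encodes 1.0 and beta_1_micros=900000 encodes 0.9. A hypothetical helper (not part of the CLgen API) makes the mapping explicit:

def ToMicros(value: float) -> int:
    """Encode a real value as an integer count of micro-units."""
    return int(value * 1000000)

assert ToMicros(1.0) == 1000000  # temperature_micros above.
assert ToMicros(0.9) == 900000  # beta_1_micros above.
assert ToMicros(0.999) == 999000  # beta_2_micros above.
assert ToMicros(0.002) == 2000  # initial_learning_rate_micros above.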
Example #3
def test_SymmetricalTokenDepthCriterion_SampleIsComplete_reverse_order():
    """Test that sample is not complete if right token appears before left."""
    t = samplers.SymmetricalTokenDepthCriterion(
        sampler_pb2.SymmetricalTokenDepth(depth_increase_token="+",
                                          depth_decrease_token="-"))
    assert not t.SampleIsComplete(["-", "+"])
    assert not t.SampleIsComplete(["-", "a", "b", "c", "+"])
    assert t.SampleIsComplete(["-", "a", "b", "c", "+", "+", "-"])
Example #4
def test_SymmetricalTokenDepthCriterion_same_tokens():
    """test that error is raised if depth tokens are the same."""
    config = sampler_pb2.SymmetricalTokenDepth(depth_increase_token="a",
                                               depth_decrease_token="a")
    with test.Raises(errors.UserError) as e_info:
        samplers.SymmetricalTokenDepthCriterion(config)
    assert "SymmetricalTokenDepth tokens must be different" == str(
        e_info.value)
Example #5
def test_SymmetricalTokenDepthCriterion_depth_decrease_token():
    """Test that an error is raised if depth_decrease_token is invalid."""
    config = sampler_pb2.SymmetricalTokenDepth(depth_increase_token="a")
    # Field is missing.
    with test.Raises(errors.UserError) as e_info:
        samplers.SymmetricalTokenDepthCriterion(config)
    assert "SymmetricalTokenDepth.depth_decrease_token must be a string" == str(
        e_info.value)
    # Value is empty.
    config.depth_decrease_token = ""
    with test.Raises(errors.UserError) as e_info:
        samplers.SymmetricalTokenDepthCriterion(config)
    assert "SymmetricalTokenDepth.depth_decrease_token must be a string" == str(
        e_info.value)
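
Together with the same-token test in Example #4, these tests pin down the constructor's validation. A sketch consistent with the asserted error messages (an illustration, not the actual CLgen source):

def ValidateSymmetricalTokenDepth(config) -> None:
    """Raise errors.UserError unless both depth tokens are non-empty and distinct."""
    if not config.depth_increase_token:
        raise errors.UserError(
            "SymmetricalTokenDepth.depth_increase_token must be a string")
    if not config.depth_decrease_token:
        raise errors.UserError(
            "SymmetricalTokenDepth.depth_decrease_token must be a string")
    if config.depth_increase_token == config.depth_decrease_token:
        raise errors.UserError("SymmetricalTokenDepth tokens must be different")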
Example #6
def test_SymmetricalTokenDepthCriterion_SampleIsComplete():
    """Test SampleIsComplete() returns expected values."""
    t = samplers.SymmetricalTokenDepthCriterion(
        sampler_pb2.SymmetricalTokenDepth(depth_increase_token="+",
                                          depth_decrease_token="-"))
    # Depth 0, incomplete.
    assert not t.SampleIsComplete([])
    # Depth 1, incomplete.
    assert not t.SampleIsComplete(["+"])
    # Depth -1, complete.
    assert t.SampleIsComplete(["-"])
    # Depth 0, complete.
    assert t.SampleIsComplete(["+", "-"])
    # Depth 1, incomplete.
    assert not t.SampleIsComplete(["a", "+", "b", "c"])
    # Depth 1, incomplete.
    assert not t.SampleIsComplete(["a", "+", "+", "b", "c", "-"])
    # Depth 0, complete.
    assert t.SampleIsComplete(["a", "+", "-", "+", "b", "c", "-"])
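
The behaviour pinned down by this test and the reverse-order test in Example #3 can be summarized in a small stand-in class (an illustration consistent with the assertions above, not the actual CLgen source):

import typing


class TokenDepthSketch:
    """Minimal stand-in for samplers.SymmetricalTokenDepthCriterion."""

    def __init__(self, left_token: str, right_token: str):
        self.left_token = left_token
        self.right_token = right_token

    def SampleIsComplete(self, sample: typing.List[str]) -> bool:
        # A sample can only terminate on a depth-decrease token.
        if not sample or sample[-1] != self.right_token:
            return False
        # Naive depth count: +1 per increase token, -1 per decrease token.
        depth = sum(
            1 if t == self.left_token else -1 if t == self.right_token else 0
            for t in sample)
        # Complete once the depth has returned to (or fallen below) zero.
        return depth <= 0


t = TokenDepthSketch("+", "-")
assert not t.SampleIsComplete([])  # Depth 0, incomplete.
assert t.SampleIsComplete(["-"])  # Depth -1, complete.
assert not t.SampleIsComplete(["a", "+", "+", "b", "c", "-"])  # Depth 1.
assert t.SampleIsComplete(["a", "+", "-", "+", "b", "c", "-"])  # Depth 0.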
Example #7
    def __init__(
        self,
        atomizer: atomizers.AtomizerBase,
        target_features: typing.Optional[np.ndarray],
    ):
        # A temporary working directory is used to write files for the Grewe
        # feature extractor to read.
        self.working_dir = pathlib.Path(
            tempfile.mkdtemp(prefix="phd_clgen_backtracking_"))
        self.symtok = samplers.SymmetricalTokenDepthCriterion(
            sampler_pb2.SymmetricalTokenDepth(depth_increase_token="{",
                                              depth_decrease_token="}"))
        self.symtok.Specialize(atomizer)

        # Feature hill climbing state.
        self._previous_src = ""
        self._target_features = target_features
        if self._target_features is not None:
            self._previous_features = np.array([0, 0, 0, 0], dtype=int)
            self._init_feature_distance = scipy.spatial.distance.euclidean(
                self._previous_features, self._target_features)
            self._previous_feature_distance = self._init_feature_distance
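
Because _previous_features starts at the zero vector, the initial feature distance is simply the Euclidean norm of the target feature vector. A worked example with a hypothetical target:

import numpy as np
import scipy.spatial.distance

previous = np.array([0, 0, 0, 0], dtype=int)
target = np.array([10, 5, 3, 2])  # Hypothetical Grewe feature values.
distance = scipy.spatial.distance.euclidean(previous, target)
# distance == sqrt(10**2 + 5**2 + 3**2 + 2**2) = sqrt(138) ≈ 11.75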
Example #8
def main(argv: typing.List[str]):
    """Main entry point."""
    if len(argv) > 1:
        raise app.UsageError("Unknown arguments: '{}'.".format(" ".join(
            argv[1:])))

    instance = clgen.Instance(
        clgen_pb2.Instance(
            working_dir=FLAGS.clgen_dir,
            model=model_pb2.Model(
                corpus=corpus_pb2.Corpus(
                    local_directory=FLAGS.clgen_corpus_dir,
                    ascii_character_atomizer=True,
                    preprocessor=[
                        "deeplearning.clgen.preprocessors.opencl:ClangPreprocessWithShim",
                        "deeplearning.clgen.preprocessors.opencl:Compile",
                        "deeplearning.clgen.preprocessors.opencl:NormalizeIdentifiers",
                        "deeplearning.clgen.preprocessors.opencl:StripDoubleUnderscorePrefixes",
                        "deeplearning.clgen.preprocessors.common:StripDuplicateEmptyLines",
                        "deeplearning.clgen.preprocessors.opencl:SanitizeKernelPrototype",
                        "deeplearning.clgen.preprocessors.common:StripTrailingWhitespace",
                        "deeplearning.clgen.preprocessors.opencl:ClangFormat",
                        "deeplearning.clgen.preprocessors.common:MinimumLineCount3",
                        "deeplearning.clgen.preprocessors.opencl:Compile",
                    ],
                    contentfile_separator="\n\n",
                ),
                architecture=model_pb2.NetworkArchitecture(
                    backend=model_pb2.NetworkArchitecture.TENSORFLOW,
                    neuron_type=model_pb2.NetworkArchitecture.LSTM,
                    neurons_per_layer=512,
                    num_layers=2,
                    post_layer_dropout_micros=0,
                ),
                training=model_pb2.TrainingOptions(
                    num_epochs=50,
                    sequence_length=64,
                    batch_size=64,
                    shuffle_corpus_contentfiles_between_epochs=True,
                    adam_optimizer=model_pb2.AdamOptimizer(
                        initial_learning_rate_micros=2000,
                        learning_rate_decay_per_epoch_micros=50000,
                        beta_1_micros=900000,
                        beta_2_micros=999000,
                        normalized_gradient_clip_micros=5000000,
                    ),
                ),
            ),
            sampler=sampler_pb2.Sampler(
                start_text="kernel void ",
                batch_size=64,
                sequence_length=1024,
                temperature_micros=1000000,  # = 1.0 real value
                termination_criteria=[
                    sampler_pb2.SampleTerminationCriterion(
                        symtok=sampler_pb2.SymmetricalTokenDepth(
                            depth_increase_token="{",
                            depth_decrease_token="}",
                        )),
                    sampler_pb2.SampleTerminationCriterion(
                        maxlen=sampler_pb2.MaxTokenLength(
                            maximum_tokens_in_sample=20000)),
                ],
            ),
        ),
    )
    db = grewe_features_db.Database(FLAGS.db)
    profile_dir = pathlib.Path(FLAGS.profile_dir)
    profile_dir.mkdir(parents=True, exist_ok=True)
    profiler = prof.AutoCsvProfiler(profile_dir)

    with instance.Session(), multiprocessing.Pool() as pool:
        while True:
            Sample(instance, db, profiler, pool)