Example 1
def EnumerateModels() -> typing.List[model_pb2.Model]:
    """Enumerate the model configurations."""
    models = []
    base_model = pbutil.FromString(BASE_MODEL, model_pb2.Model())
    for num_neurons, num_layers in itertools.product(NUM_NEURONS, NUM_LAYERS):
        model = model_pb2.Model()
        model.CopyFrom(base_model)
        model.architecture.neurons_per_layer = num_neurons
        model.architecture.num_layers = num_layers
        models.append(model)
    return models
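For orientation, below is a minimal sketch of the inputs this helper expects. The BASE_MODEL text proto and the NUM_NEURONS / NUM_LAYERS sweep lists are illustrative stand-ins, not the values defined by the original module, which supplies its own constants and imports (itertools, pbutil, model_pb2).

# Illustrative sweep values; the real module defines its own.
NUM_NEURONS = [128, 256, 512]
NUM_LAYERS = [1, 2]

# Text-format Model template that every enumerated configuration is copied from.
BASE_MODEL = """
architecture {
  backend: TENSORFLOW
  neuron_type: LSTM
}
training {
  num_epochs: 50
  sequence_length: 64
  batch_size: 64
}
"""

for model in EnumerateModels():
    print(model.architecture.neurons_per_layer, model.architecture.num_layers)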
Example 2
    def Create(self) -> bool:
        if self._created:
            return False
        self._created = True
        self.corpus.Create()

        # Add entry to dashboard database
        with self.dashboard_db.Session(commit=True) as session:
            config_to_store = model_pb2.Model()
            config_to_store.CopyFrom(self.config)
            config_to_store.ClearField("corpus")
            config_to_store.training.ClearField("num_epochs")
            corpus = session.GetOrAdd(
                dashboard_db.Model,
                corpus_id=self.corpus.dashboard_db_id,
                config_proto_sha1=crypto.sha1(
                    config_to_store.SerializeToString()),
                config_proto=str(config_to_store),
                cache_path=(f"ssh://{system.USERNAME}@{system.HOSTNAME}"
                            f"/{self.cache.path}"),
                summary=self.GetShortSummary(),
            )
            session.flush()
            self._dashboard_db_id = corpus.id
            self.backend.dashboard_model_id = self.dashboard_db_id
            self.backend.dashboard_db = self.dashboard_db
Example 3
def abc_model_config(abc_corpus_config):
    """The proto config for a simple Model."""
    architecture = model_pb2.NetworkArchitecture(
        backend=model_pb2.NetworkArchitecture.TENSORFLOW,
        embedding_size=2,
        neuron_type=model_pb2.NetworkArchitecture.LSTM,
        neurons_per_layer=4,
        num_layers=1,
        post_layer_dropout_micros=2000,
    )
    optimizer = model_pb2.AdamOptimizer(
        initial_learning_rate_micros=2000,
        learning_rate_decay_per_epoch_micros=5000,
        beta_1_micros=900000,
        beta_2_micros=999000,
        normalized_gradient_clip_micros=5000000,
    )
    training = model_pb2.TrainingOptions(
        num_epochs=1,
        sequence_length=10,
        batch_size=5,
        shuffle_corpus_contentfiles_between_epochs=False,
        adam_optimizer=optimizer,
    )
    return model_pb2.Model(corpus=abc_corpus_config,
                           architecture=architecture,
                           training=training)
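This appears to be a pytest fixture (the decorator is not shown in the excerpt). A hypothetical consumer might simply spot-check the values wired up above:

def test_abc_model_config_values(abc_model_config):
    """Spot-check the toy configuration built by the fixture."""
    assert abc_model_config.architecture.neurons_per_layer == 4
    assert abc_model_config.architecture.num_layers == 1
    assert abc_model_config.training.num_epochs == 1
    assert abc_model_config.training.batch_size == 5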
Example 4
def MakeClgenInstanceConfig(
    working_dir: pathlib.Path,
    encoded_db: encoded.EncodedContentFiles,
    num_training_epochs: int,
    seed_text: str,
    neurons_per_layer: int,
    num_layers: int,
) -> clgen_pb2.Instance:
    """Construct a CLgen instance.

  Args:
    working_dir: The directory to cache CLgen working files in.
    encoded_db: The database of encoded content files.
    num_training_epochs: The number of epochs to train for.
    seed_text: The text to initiate sampling with.
    neurons_per_layer: The number of neurons in each layer.
    num_layers: The number of layers in the network.
  """
    return clgen_pb2.Instance(
        working_dir=str(working_dir),
        model=model_pb2.Model(
            corpus=corpus_pb2.Corpus(pre_encoded_corpus_url=encoded_db.url, ),
            architecture=model_pb2.NetworkArchitecture(
                backend=model_pb2.NetworkArchitecture.TENSORFLOW,
                neuron_type=model_pb2.NetworkArchitecture.LSTM,
                neurons_per_layer=neurons_per_layer,
                num_layers=num_layers,
                post_layer_dropout_micros=0,
            ),
            training=model_pb2.TrainingOptions(
                num_epochs=num_training_epochs,
                sequence_length=64,
                batch_size=64,
                shuffle_corpus_contentfiles_between_epochs=True,
                adam_optimizer=model_pb2.AdamOptimizer(
                    initial_learning_rate_micros=2000,
                    learning_rate_decay_per_epoch_micros=50000,
                    beta_1_micros=900000,
                    beta_2_micros=999000,
                    normalized_gradient_clip_micros=5000000,
                ),
            ),
        ),
        sampler=sampler_pb2.Sampler(
            start_text=seed_text,
            batch_size=64,
            sequence_length=1024,
            temperature_micros=1000000,  # = 1.0 real value
            termination_criteria=[
                sampler_pb2.SampleTerminationCriterion(
                    symtok=sampler_pb2.SymmetricalTokenDepth(
                        depth_increase_token="{",
                        depth_decrease_token="}",
                    )),
                sampler_pb2.SampleTerminationCriterion(
                    maxlen=sampler_pb2.MaxTokenLength(
                        maximum_tokens_in_sample=20000, )),
            ],
        ),
    )
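A hypothetical call site for the builder above; encoded_db stands in for an already-constructed encoded.EncodedContentFiles database, and the argument values are borrowed from the other examples in this listing rather than taken from the original caller.

config = MakeClgenInstanceConfig(
    working_dir=pathlib.Path("/tmp/clgen_working_dir"),
    encoded_db=encoded_db,  # Placeholder for an existing encoded database.
    num_training_epochs=50,
    seed_text="kernel void ",
    neurons_per_layer=512,
    num_layers=2,
)
print(config)  # Text-format dump of the assembled clgen_pb2.Instance proto.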
Example 5
def test_BuildOptimizer_rmsprop():
  """Test RmsOptimizer proto value conversion to Keras config."""
  config = model_pb2.Model()
  config.training.ClearField("optimizer")
  config.training.rmsprop_optimizer.initial_learning_rate_micros = 1000
  config.training.rmsprop_optimizer.learning_rate_decay_per_epoch_micros = 1000
  optimizer = builders.BuildOptimizer(config)
  optimizer_config = optimizer.get_config()
  assert pytest.approx(optimizer_config["decay"]) == 0.001
  assert pytest.approx(optimizer_config["rho"]) == 0.9
Example 6
def test_BuildOptimizer_adam():
  """Test AdamOptimizer proto value conversion to Keras config."""
  config = model_pb2.Model()
  config.training.ClearField("optimizer")
  config.training.adam_optimizer.initial_learning_rate_micros = 2000
  config.training.adam_optimizer.learning_rate_decay_per_epoch_micros = 5000
  config.training.adam_optimizer.beta_1_micros = 900000
  config.training.adam_optimizer.beta_2_micros = 999000
  config.training.adam_optimizer.normalized_gradient_clip_micros = 5000000
  optimizer = builders.BuildOptimizer(config)
  optimizer_config = optimizer.get_config()
  assert pytest.approx(optimizer_config["decay"]) == 0.005
  assert pytest.approx(optimizer_config["beta_1"]) == 0.9
  assert pytest.approx(optimizer_config["beta_2"]) == 0.999
  assert pytest.approx(optimizer_config["clipnorm"]) == 5.0
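Both tests exercise the same convention: real-valued hyperparameters are stored in the proto as integer micro-units, so BuildOptimizer divides by one million when constructing the Keras optimizer. A tiny illustrative helper (not part of the library) captures the arithmetic the assertions above check:

def micros_to_real(micros: int) -> float:
    """Convert an integer micro-unit proto field to its real value."""
    return micros / 1_000_000

assert micros_to_real(5000) == 0.005      # learning_rate_decay_per_epoch_micros
assert micros_to_real(900000) == 0.9      # beta_1_micros
assert micros_to_real(999000) == 0.999    # beta_2_micros
assert micros_to_real(5000000) == 5.0     # normalized_gradient_clip_micros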
Example 7
    def _ComputeHash(
        pre_train_corpus_: corpuses.Corpus,
        corpus_: corpuses.Corpus,
        config: model_pb2.Model,
    ) -> str:
        """Compute model hash.

    The hash is computed from the ID of the corpus and the serialized
    representation of the config proto. The number of epochs that the model is
    trained for does not affect the hash, since we can share checkpoints
    between different models if the only variable is the epoch count. E.g. if
    we have a model trained for 10 epochs, we can use that checkpoint as the
    starting point for training a model for 20 epochs.

    Args:
      pre_train_corpus_: An optional pre-training corpus instance.
      corpus_: A corpus instance.
      config: A Model config proto.

    Returns:
      The unique model ID.
    """
        config_to_hash = model_pb2.Model()
        config_to_hash.CopyFrom(config)
        config_to_hash.ClearField("pre_train_corpus")
        config_to_hash.ClearField("corpus")
        config_to_hash.training.ClearField("num_epochs")
        config_to_hash.training.ClearField("num_train_steps")
        config_to_hash.training.ClearField("batch_size")
        if config_to_hash.training.HasField("data_generator"):
            config_to_hash.training.data_generator.ClearField(
                "steps_per_epoch")
            config_to_hash.training.data_generator.ClearField("validation_set")
        if pre_train_corpus_:
            hash_list = [
                pre_train_corpus_.hash, corpus_.hash,
                config_to_hash.SerializeToString()
            ]
        else:
            hash_list = [corpus_.hash, config_to_hash.SerializeToString()]
        return crypto.sha1_list(hash_list)
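The docstring's claim that the epoch count does not change the hash follows directly from the ClearField calls: once the run-length fields are cleared, two configs that differ only in those fields serialize identically and therefore feed the same bytes into the hash. A small sketch of that property, using only the proto API shown above:

a = model_pb2.Model()
a.training.num_epochs = 10
b = model_pb2.Model()
b.CopyFrom(a)
b.training.num_epochs = 20

for cfg in (a, b):
    cfg.training.ClearField("num_epochs")

# Identical serializations imply identical sha1 inputs, hence the same model ID.
assert a.SerializeToString() == b.SerializeToString()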
Example 8
  def _ComputeHash(corpus_: corpuses.Corpus, config: model_pb2.Model) -> str:
    """Compute model hash.

    The hash is computed from the ID of the corpus and the serialized
    representation of the config proto. The number of epochs that the model is
    trained for does not affect the hash, since we can share checkpoints
    between different models if the only variable is the epoch count. E.g. if
    we have a model trained for 10 epochs, we can use that checkpoint as the
    starting point for training a model for 20 epochs.

    Args:
      corpus_: A corpus instance.
      config: A Model config proto.

    Returns:
      The unique model ID.
    """
    config_to_hash = model_pb2.Model()
    config_to_hash.CopyFrom(config)
    config_to_hash.ClearField("corpus")
    config_to_hash.training.ClearField("num_epochs")
    return crypto.sha1_list(corpus_.hash, config_to_hash.SerializeToString())
Example 9
def CreateModelProtoFromFlags() -> model_pb2.Model:
  return model_pb2.Model(
      corpus=CreateCorpusProtoFromFlags(),
      architecture=model_pb2.NetworkArchitecture(
          backend=model_pb2.NetworkArchitecture.TENSORFLOW,
          neuron_type=model_pb2.NetworkArchitecture.LSTM,
          neurons_per_layer=FLAGS.clgen_layer_size,
          num_layers=FLAGS.clgen_num_layers,
          post_layer_dropout_micros=0,
      ),
      training=model_pb2.TrainingOptions(
          num_epochs=FLAGS.clgen_num_epochs,
          sequence_length=FLAGS.clgen_training_sequence_length,
          batch_size=FLAGS.clgen_training_batch_size,
          shuffle_corpus_contentfiles_between_epochs=True,
          adam_optimizer=model_pb2.AdamOptimizer(
              initial_learning_rate_micros=2000,
              learning_rate_decay_per_epoch_micros=50000,
              beta_1_micros=900000,
              beta_2_micros=999000,
              normalized_gradient_clip_micros=5000000,
          ),
      ))
Example 10
    def __init__(self, config: model_pb2.Model):
        """Instantiate a model.

    Args:
      config: A Model message.

    Raises:
      TypeError: If the config argument is not a Model proto.
      UserError: In case of an invalid config.
    """
        # Error early, so that a cache isn't created.
        if not isinstance(config, model_pb2.Model):
            t = type(config).__name__
            raise TypeError(f"Config must be a Model proto. Received: '{t}'")
        # Validate config options.
        if config.training.sequence_length < 1:
            raise errors.UserError(
                'TrainingOptions.sequence_length must be >= 1')

        self.config = model_pb2.Model()
        self.config.CopyFrom(builders.AssertIsBuildable(config))
        self.corpus = corpuses.Corpus(config.corpus)
        self.hash = self._ComputeHash(self.corpus, self.config)
        self.cache = cache.mkcache('model', self.hash)
        # Create the necessary cache directories.
        (self.cache.path / 'checkpoints').mkdir(exist_ok=True)
        (self.cache.path / 'samples').mkdir(exist_ok=True)
        (self.cache.path / 'logs').mkdir(exist_ok=True)

        # Create symlink to encoded corpus.
        symlink = self.cache.path / 'corpus'
        if not symlink.is_symlink():
            os.symlink(
                os.path.relpath(
                    pathlib.Path(
                        self.corpus.encoded.url[len('sqlite:///'):]).parent,
                    self.cache.path), symlink)

        # Create symlink to the atomizer.
        symlink = self.cache.path / 'atomizer'
        if not symlink.is_symlink():
            os.symlink(
                os.path.relpath(self.corpus.atomizer_path, self.cache.path),
                symlink)

        # Validate metadata against cache.
        if self.cache.get('META.pbtxt'):
            cached_meta = pbutil.FromFile(
                pathlib.Path(self.cache['META.pbtxt']),
                internal_pb2.ModelMeta())
            # Exclude num_epochs and corpus location from metadata comparison.
            config_to_compare = model_pb2.Model()
            config_to_compare.CopyFrom(self.config)
            config_to_compare.corpus.ClearField('contentfiles')
            config_to_compare.training.ClearField('num_epochs')
            # These fields should have already been cleared, but we'll do it again
            # so that metadata comparisons don't fail when the cached meta schema
            # is updated.
            cached_to_compare = model_pb2.Model()
            cached_to_compare.CopyFrom(cached_meta.config)
            cached_to_compare.corpus.ClearField('contentfiles')
            cached_to_compare.training.ClearField('num_epochs')
            if config_to_compare != cached_to_compare:
                raise errors.InternalError('Metadata mismatch')
            self.meta = cached_meta
        else:
            self.meta = internal_pb2.ModelMeta()
            self.meta.config.CopyFrom(self.config)
            self._WriteMetafile()

        self.backend = {
            model_pb2.NetworkArchitecture.TENSORFLOW:
            tensorflow_backend.TensorFlowBackend,
            model_pb2.NetworkArchitecture.KERAS: keras_backend.KerasBackend,
        }[config.architecture.backend](self.config, self.cache, self.corpus)
Example 11
    def __init__(self, config: model_pb2.Model):
        """Instantiate a model.

    Args:
      config: A Model message.

    Raises:
      TypeError: If the config argument is not a Model proto.
      UserError: In case of an invalid config.
    """
        # Error early, so that a cache isn't created.
        if not isinstance(config, model_pb2.Model):
            t = type(config).__name__
            raise TypeError(f"Config must be a Model proto. Received: '{t}'")

        self.config = model_pb2.Model()
        # Validate config options.
        self.config.CopyFrom(builders.AssertIsBuildable(config))
        if FLAGS.num_train_steps:
            self.config.training.num_train_steps = FLAGS.num_train_steps
        if FLAGS.num_pretrain_steps:
            self.config.training.num_pretrain_steps = FLAGS.num_pretrain_steps
        if FLAGS.num_epochs:
            self.config.training.num_epochs = FLAGS.num_epochs

        # Initialize distrib lock path.
        if environment.WORLD_SIZE > 1:
            if environment.WORLD_RANK == 0:
                lock_cache = cache.mkcache("locks")
                lock_cache.path.mkdir(exist_ok=True)
            else:
                while not cache.cachepath("locks").exists():
                    time.sleep(0.5)
                lock_cache = cache.mkcache("locks")
            distrib.init(lock_cache.path)

        # Initialize corpuses
        self.corpus = corpuses.Corpus(config.corpus)
        self.pre_train_corpus = None
        if config.HasField("pre_train_corpus"):
            self.pre_train_corpus = corpuses.Corpus(config.pre_train_corpus)

        self.hash = self._ComputeHash(self.pre_train_corpus, self.corpus,
                                      self.config)
        self._created = False

        distrib.lock()
        self.cache = cache.mkcache("model", self.hash)
        distrib.unlock()

        if environment.WORLD_RANK == 0:
            # Create the necessary cache directories.
            (self.cache.path / "checkpoints").mkdir(exist_ok=True)
            (self.cache.path / "samples").mkdir(exist_ok=True)
            # Create symlink to encoded corpus.
            symlink = self.cache.path / "corpus"
            if not symlink.is_symlink():
                os.symlink(
                    os.path.relpath(
                        pathlib.Path(self.corpus.encoded.url[len("sqlite:///"
                                                                 ):]).parent,
                        self.cache.path,
                    ),
                    symlink,
                )
            if self.pre_train_corpus:
                symlink = self.cache.path / "pre_train_corpus"
                if not symlink.is_symlink():
                    os.symlink(
                        os.path.relpath(
                            pathlib.Path(self.pre_train_corpus.encoded.
                                         url[len("sqlite:///"):]).parent,
                            self.cache.path,
                        ),
                        symlink,
                    )

            # Create symlink to the tokenizer and create a backup inside checkpoints.
            symlink = self.cache.path / "tokenizer"
            if not symlink.is_symlink():
                os.symlink(
                    os.path.relpath(self.corpus.tokenizer_path,
                                    self.cache.path), symlink)
            if (self.cache.path / "checkpoints" /
                    "backup_tokenizer.pkl").exists():
                shutil.copyfile(
                    self.cache.path / "checkpoints" / "backup_tokenizer.pkl",
                    self.corpus.tokenizer_path)

            # Validate metadata against cache.
            if self.cache.get("META.pbtxt"):
                cached_meta = pbutil.FromFile(
                    pathlib.Path(self.cache["META.pbtxt"]),
                    internal_pb2.ModelMeta())
                # Exclude num_epochs and corpus location from metadata comparison.
                config_to_compare = model_pb2.Model()
                config_to_compare.CopyFrom(self.config)
                config_to_compare.corpus.ClearField("contentfiles")
                if config_to_compare.HasField("pre_train_corpus"):
                    config_to_compare.pre_train_corpus.ClearField(
                        "contentfiles")
                config_to_compare.training.ClearField("num_epochs")
                config_to_compare.training.ClearField("num_train_steps")
                if config_to_compare.HasField("pre_train_corpus"):
                    config_to_compare.training.ClearField("num_pretrain_steps")
                config_to_compare.training.ClearField("batch_size")
                if config_to_compare.training.HasField("data_generator"):
                    config_to_compare.training.data_generator.ClearField(
                        "steps_per_epoch")
                    config_to_compare.training.data_generator.ClearField(
                        "validation_set")
                # These fields should have already been cleared, but we'll do it again
                # so that metadata comparisons don't fail when the cached meta schema
                # is updated.
                cached_to_compare = model_pb2.Model()
                cached_to_compare.CopyFrom(cached_meta.config)
                cached_to_compare.corpus.ClearField("contentfiles")
                if cached_to_compare.HasField("pre_train_corpus"):
                    cached_to_compare.pre_train_corpus.ClearField(
                        "contentfiles")
                cached_to_compare.training.ClearField("num_epochs")
                cached_to_compare.training.ClearField("num_train_steps")
                if cached_to_compare.HasField("pre_train_corpus"):
                    cached_to_compare.training.ClearField("num_pretrain_steps")
                cached_to_compare.training.ClearField("batch_size")
                if cached_to_compare.training.HasField("data_generator"):
                    cached_to_compare.training.data_generator.ClearField(
                        "steps_per_epoch")
                    cached_to_compare.training.data_generator.ClearField(
                        "validation_set")
                if (cached_to_compare.training.sequence_length !=
                        config_to_compare.training.sequence_length):
                    l.logger().warning(
                        "Mismatch between pre-trained and current config "
                        "sequence_length! This is only intended for BERT models!")
                cached_to_compare.training.ClearField("sequence_length")
                config_to_compare.training.ClearField("sequence_length")
                if config_to_compare != cached_to_compare:
                    raise SystemError("Metadata mismatch: {} \n\n {}".format(
                        config_to_compare, cached_to_compare))
                self.meta = cached_meta
            else:
                self.meta = internal_pb2.ModelMeta()
                self.meta.config.CopyFrom(self.config)
                self._WriteMetafile()

            ## Store current commit
            commit.saveCommit(self.cache.path)

        self.backend = {
            model_pb2.NetworkArchitecture.TENSORFLOW_SEQ:
            tf_sequential.tfSequential,
            model_pb2.NetworkArchitecture.KERAS_SEQ:
            keras_sequential.kerasSequential,
            model_pb2.NetworkArchitecture.TENSORFLOW_BERT: tf_bert.tfBert,
            model_pb2.NetworkArchitecture.TORCH_BERT: torch_bert.torchBert,
        }[config.architecture.backend](self.config, self.cache, self.hash)
        l.logger().info("Initialized {} in {}".format(self.backend,
                                                      self.cache.path))
        return
Example 12
def main(argv: typing.List[str]):
    """Main entry point."""
    if len(argv) > 1:
        raise app.UsageError("Unknown arguments: '{}'.".format(" ".join(
            argv[1:])))

    instance = clgen.Instance(
        clgen_pb2.Instance(
            working_dir=FLAGS.clgen_dir,
            model=model_pb2.Model(
                corpus=corpus_pb2.Corpus(
                    local_directory=FLAGS.clgen_corpus_dir,
                    ascii_character_atomizer=True,
                    preprocessor=[
                        "deeplearning.clgen.preprocessors.opencl:ClangPreprocessWithShim",
                        "deeplearning.clgen.preprocessors.opencl:Compile",
                        "deeplearning.clgen.preprocessors.opencl:NormalizeIdentifiers",
                        "deeplearning.clgen.preprocessors.opencl:StripDoubleUnderscorePrefixes",
                        "deeplearning.clgen.preprocessors.common:StripDuplicateEmptyLines",
                        "deeplearning.clgen.preprocessors.opencl:SanitizeKernelPrototype",
                        "deeplearning.clgen.preprocessors.common:StripTrailingWhitespace",
                        "deeplearning.clgen.preprocessors.opencl:ClangFormat",
                        "deeplearning.clgen.preprocessors.common:MinimumLineCount3",
                        "deeplearning.clgen.preprocessors.opencl:Compile",
                    ],
                    contentfile_separator="\n\n",
                ),
                architecture=model_pb2.NetworkArchitecture(
                    backend=model_pb2.NetworkArchitecture.TENSORFLOW,
                    neuron_type=model_pb2.NetworkArchitecture.LSTM,
                    neurons_per_layer=512,
                    num_layers=2,
                    post_layer_dropout_micros=0,
                ),
                training=model_pb2.TrainingOptions(
                    num_epochs=50,
                    sequence_length=64,
                    batch_size=64,
                    shuffle_corpus_contentfiles_between_epochs=True,
                    adam_optimizer=model_pb2.AdamOptimizer(
                        initial_learning_rate_micros=2000,
                        learning_rate_decay_per_epoch_micros=50000,
                        beta_1_micros=900000,
                        beta_2_micros=999000,
                        normalized_gradient_clip_micros=5000000,
                    ),
                ),
            ),
            sampler=sampler_pb2.Sampler(
                start_text="kernel void ",
                batch_size=64,
                sequence_length=1024,
                temperature_micros=1000000,  # = 1.0 real value
                termination_criteria=[
                    sampler_pb2.SampleTerminationCriterion(
                        symtok=sampler_pb2.SymmetricalTokenDepth(
                            depth_increase_token="{",
                            depth_decrease_token="}",
                        )),
                    sampler_pb2.SampleTerminationCriterion(
                        maxlen=sampler_pb2.MaxTokenLength(
                            maximum_tokens_in_sample=20000, )),
                ],
            ),
        ), )
    db = grewe_features_db.Database(FLAGS.db)
    profile_dir = pathlib.Path(FLAGS.profile_dir)
    profile_dir.mkdir(parents=True, exist_ok=True)
    profiler = prof.AutoCsvProfiler(profile_dir)

    with instance.Session(), multiprocessing.Pool() as pool:
        while True:
            Sample(instance, db, profiler, pool)