Example 1
def AssertConfigIsValid(config: corpus_pb2.Corpus) -> corpus_pb2.Corpus:
    """Assert that config proto is valid.

  Args:
    config: A Corpus proto.

  Returns:
    The Corpus proto.

  Raises:
    UserError: If the config is invalid.
  """
    try:
        pbutil.AssertFieldIsSet(config, 'contentfiles')
        pbutil.AssertFieldIsSet(config, 'atomizer')
        pbutil.AssertFieldIsSet(config, 'contentfile_separator')
        # Check that the preprocessor pipeline resolves to preprocessor functions.
        [preprocessors.GetPreprocessorFunction(p) for p in config.preprocessor]

        if config.HasField('greedy_multichar_atomizer'):
            if not config.greedy_multichar_atomizer.tokens:
                raise errors.UserError(
                    'GreedyMulticharAtomizer.tokens is empty')
            for atom in config.greedy_multichar_atomizer.tokens:
                if not atom:
                raise errors.UserError(
                    'Empty string found in GreedyMulticharAtomizer.tokens')

        return config
    except pbutil.ProtoValueError as e:
        raise errors.UserError(e)
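
A hypothetical usage sketch of the validator above. The import path and field values are assumptions inferred from the checks in the function (contentfiles and atomizer are presumed to be oneof fields populated by local_directory and greedy_multichar_atomizer); this is not a verified snippet from the project.

from deeplearning.clgen.proto import corpus_pb2  # assumed import path

config = corpus_pb2.Corpus()
config.local_directory = '/path/to/contentfiles'  # presumed to set the 'contentfiles' oneof
config.contentfile_separator = '\n\n'
config.greedy_multichar_atomizer.tokens.extend(['kernel', 'void'])  # sets the 'atomizer' oneof
config = AssertConfigIsValid(config)  # returns the proto unchanged if valid, raises UserError otherwise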
Example 2
def GetPreprocessorFunction(name: str) -> public.PreprocessorFunction:
  """Lookup a preprocess function by name.

  A preprocessor is a function which takes a single argument 'text' of type str,
  and returns a str. The name is the fully qualified name of the python
  function which implements it, in the form <module>:<name>. For example,
  the name 'preprocessors.cxx:Compile' will return the
  function 'Compile' in the module 'preprocessors.cxx'.

  Args:
    name: The name of the preprocessor to get.

  Returns:
    The python preprocessor function.

  Raises:
    UserError: If the requested name cannot be found or is not a
      @clgen_preprocessor decorated function.
  """
  components = name.split(':')
  if len(components) != 2:
    raise errors.UserError(f'Invalid preprocessor name {name}')
  module_name, function_name = components
  
  try:
    module = importlib.import_module(module_name)
    function_ = getattr(module, function_name)
  except (ModuleNotFoundError, AttributeError):
    raise errors.UserError(f'Preprocessor {name} not found.')
  if not function_.__dict__.get('is_clgen_preprocessor'):
    raise errors.UserError(
        f'Preprocessor {name} not decorated with @clgen_preprocessor')
  return function_
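
A self-contained sketch of the same '<module>:<name>' resolution mechanism using only the standard library; 'os.path:join' is used purely as a stand-in target and is of course not a CLgen preprocessor.

import importlib

def lookup(name: str):
  """Resolve a '<module>:<name>' string to a callable (illustration only)."""
  module_name, _, function_name = name.partition(':')
  module = importlib.import_module(module_name)
  return getattr(module, function_name)

join = lookup('os.path:join')      # resolves to os.path.join
print(join('corpus', 'file.txt'))  # 'corpus/file.txt' on POSIX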
Example 3
def ResolveContentId(config: corpus_pb2.Corpus,
                     hc: hashcache.HashCache) -> str:
    """Compute the hash of the input contentfiles.

  This function resolves the unique sha1 checksum of a set of content files.

  Args:
    config: The corpus config proto.
    hc: A hashcache database instance, used for resolving directory hashes.

  Returns:
    A hex encoded sha1 string.
  """
    # We can take a massive shortcut if the content ID is already set in the
    # config proto.
    if config.HasField('content_id'):
        return config.content_id

    start_time = time.time()
    if config.HasField('local_directory'):
        # After the first time we compute the hash of a directory, we write it into
        # a file. This is a shortcut to work around the fact that computing the
        # directory checksum is O(n) with respect to the number of files in the
        # directory (even if the directory is already cached by the hash cache).
        # This means that it is the responsibility of the user to delete this cached
        # file if the directory is changed.
        hash_file_path = pathlib.Path(
            str(pathlib.Path(config.local_directory)) + '.sha1.txt')
        if hash_file_path.is_file():
            logging.info("Reading directory hash: '%s'.", hash_file_path)
            with open(hash_file_path) as f:
                content_id = f.read().rstrip()
        else:
            # No hash file, so compute the directory hash and create it.
            try:
                content_id = hc.GetHash(
                    ExpandConfigPath(
                        config.local_directory,
                        path_prefix=FLAGS.clgen_local_path_prefix))
            except FileNotFoundError as e:
                raise errors.UserError(e)
            # Create the hash file in the directory so that next time we don't need
            # to reference the hash cache.
            with open(hash_file_path, 'w') as f:
                print(content_id, file=f)
            logging.info("Wrote directory hash: '%s'.", hash_file_path)
    elif config.HasField('local_tar_archive'):
        # This is not an efficient means of getting the hash, as it requires
        # always unpacking the archive and reading the entire contents. It would
        # be nicer to maintain a cache which maps the mtime of tarballs to their
        # content ID, similar to how local_directory is implemented.
        content_id = GetHashOfArchiveContents(
            ExpandConfigPath(config.local_tar_archive,
                             path_prefix=FLAGS.clgen_local_path_prefix))
    else:
        raise NotImplementedError(
            'Unsupported Corpus.contentfiles field value')
    logging.debug('Resolved Content ID %s in %s ms.', content_id,
                  humanize.intcomma(int((time.time() - start_time) * 1000)))
    return content_id
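
A self-contained sketch of the '<directory>.sha1.txt' shortcut described in the comments above. The stand-in hash only covers file names, so it is deliberately not the real content hash computed by the hash cache.

import hashlib
import pathlib

def cached_directory_hash(directory: pathlib.Path) -> str:
    """Return a cached hash for a directory, computing and caching it if absent."""
    hash_file = pathlib.Path(str(directory) + '.sha1.txt')
    if hash_file.is_file():
        return hash_file.read_text().rstrip()
    # Stand-in for the hash cache lookup: hash the sorted file names only.
    names = '\n'.join(sorted(p.name for p in directory.iterdir()))
    content_id = hashlib.sha1(names.encode('utf-8')).hexdigest()
    hash_file.write_text(content_id + '\n')
    return content_id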
Example 4
    def __init__(self, config: clgen_pb2.Instance):
        """Instantiate an instance.

    Args:
      config: An Instance proto.

    Raises:
      UserError: If the instance proto contains invalid values, or is missing
        the model or sampler fields.
    """

        try:
            pbutil.AssertFieldIsSet(config, 'model_specification')
            pbutil.AssertFieldIsSet(config, 'sampler')
        except pbutil.ProtoValueError as e:
            raise errors.UserError(e)

        self.working_dir = None
        if config.HasField('working_dir'):
            self.working_dir: pathlib.Path = pathlib.Path(
                os.path.expandvars(
                    config.working_dir)).expanduser().absolute()
        # Enter a session so that the cache paths are set relative to any requested
        # working directory.
        with self.Session():
            if config.HasField('model'):
                self.model: models.Model = models.Model(config.model)
            else:
                self.model: pretrained.PreTrainedModel = pretrained.PreTrainedModel(
                    pathlib.Path(config.pretrained_model))
            self.sampler: samplers.Sampler = samplers.Sampler(config.sampler)
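
The working_dir handling above expands environment variables and '~' before resolving an absolute path; a minimal, self-contained sketch of that expansion (the configured value is hypothetical):

import os
import pathlib

raw = '$HOME/clgen_working_dir'  # hypothetical Instance.working_dir value
working_dir = pathlib.Path(os.path.expandvars(raw)).expanduser().absolute()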
Example 5
def GetHashOfArchiveContents(archive: pathlib.Path) -> str:
    """Compute the checksum of the contents of a directory.

  Args:
    archive: Path of the archive.

  Returns:
    Checksum of the archive.

  Raises:
    UserError: If the requested archive does not exist, or cannot be unpacked.
  """
    if not archive.is_file():
        raise errors.UserError(f"Archive not found: '{archive}'")

    with tempfile.TemporaryDirectory(prefix='clgen_corpus_') as d:
        cmd = ['tar', '-xf', str(archive), '-C', d]
        try:
            subprocess.check_call(cmd)
        except subprocess.CalledProcessError:
            raise errors.UserError(f"Archive unpack failed: '{archive}'")
        return checksumdir.dirhash(d, 'sha1')
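
A self-contained variant of the unpack-and-hash pattern above, using the standard library's tarfile and hashlib instead of the tar subprocess and checksumdir; the resulting digest is not byte-for-byte compatible with checksumdir.dirhash.

import hashlib
import pathlib
import tarfile
import tempfile

def archive_contents_sha1(archive: pathlib.Path) -> str:
    """Unpack a tarball to a temporary directory and hash its file contents."""
    sha1 = hashlib.sha1()
    with tempfile.TemporaryDirectory(prefix='clgen_corpus_') as d:
        with tarfile.open(archive) as tar:
            tar.extractall(d)
        for path in sorted(pathlib.Path(d).rglob('*')):
            if path.is_file():
                sha1.update(path.read_bytes())
    return sha1.hexdigest()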
Example 6
def GetTrainingCorpus(
    corpus: 'corpuses.Corpus',
    training_opts: model_pb2.TrainingOptions) -> typing.Tuple[
  np.ndarray, np.ndarray, int]:
  """Get the corpus to train over.

  Args:
    corpus: A Corpus instance.
    training_opts: A TrainingOptions proto.

  Returns:
    An X, y pair of data for an epoch, and the number of steps in the epoch.

  Raises:
    UserError: If batch_size and sequence_length are too large for the corpus,
      yielding no batches.
  """
  start_time = time.time()
  encoded_corpus = corpus.GetTrainingData(
      shuffle=training_opts.shuffle_corpus_contentfiles_between_epochs)
  corpus_length = len(encoded_corpus)
  steps_per_epoch = (corpus_length - 1) // (
      training_opts.batch_size * training_opts.sequence_length)
  if not steps_per_epoch:
    raise errors.UserError(
        f'Requested batch size ({training_opts.batch_size}) and '
        f'sequence length ({training_opts.sequence_length}) are too large for '
        f'corpus of size {corpus_length}.')

  clipped_corpus_length = (
      steps_per_epoch * training_opts.batch_size *
      training_opts.sequence_length)

  x = np.reshape(
      encoded_corpus[:clipped_corpus_length],
      [training_opts.batch_size,
       steps_per_epoch * training_opts.sequence_length])
  y = np.reshape(
      encoded_corpus[1:clipped_corpus_length + 1],
      [training_opts.batch_size,
       steps_per_epoch * training_opts.sequence_length])

  logging.info(
      'Encoded corpus of %s tokens (clipped last %s tokens) in %s ms.',
      humanize.intcomma(clipped_corpus_length),
      humanize.intcomma(corpus_length - clipped_corpus_length),
      humanize.intcomma(int((time.time() - start_time) * 1000)))
  return x, y, steps_per_epoch
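
A toy illustration, under assumed small values, of the clipping and next-token reshaping performed above:

import numpy as np

encoded_corpus = np.arange(20)          # 20 tokens
batch_size, sequence_length = 2, 3
steps_per_epoch = (len(encoded_corpus) - 1) // (batch_size * sequence_length)  # 3
clipped = steps_per_epoch * batch_size * sequence_length                       # 18
x = encoded_corpus[:clipped].reshape(batch_size, steps_per_epoch * sequence_length)
y = encoded_corpus[1:clipped + 1].reshape(batch_size, steps_per_epoch * sequence_length)
# y is x shifted forward by one token: the next-token prediction targets.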
Example 7
  def CreateBatches(self) -> None:
    """Create batches of training data from the encoded corpus."""
    start_time = time.time()

    # generate a kernel corpus
    self.i = 0
    if (self.encoded_corpus is None or
        self.training_opts.shuffle_corpus_contentfiles_between_epochs):
      self.encoded_corpus = self.corpus.GetTrainingData(
          shuffle=self.training_opts.shuffle_corpus_contentfiles_between_epochs)

    batch_size = self.training_opts.batch_size
    sequence_length = self.training_opts.sequence_length

    # set corpus size and number of batches
    self.num_batches = int(len(self.encoded_corpus) / (
        batch_size * sequence_length))
    if self.num_batches == 0:
      raise errors.UserError(
          "Not enough data. Use a smaller sequence_length and batch_size")

    # split into batches
    clipped_corpus_length = self.num_batches * batch_size * sequence_length
    clipped_corpus = self.encoded_corpus[:clipped_corpus_length]
    xdata = clipped_corpus
    ydata = np.copy(clipped_corpus)

    # Wrap-around.
    ydata[:-1] = xdata[1:]
    ydata[-1] = xdata[0]
    self.batches = [
      DataBatch(x, y) for x, y in zip(
          np.split(xdata.reshape(batch_size, -1), self.num_batches, 1),
          np.split(ydata.reshape(batch_size, -1), self.num_batches, 1)
      )
    ]
    logging.info(
        'Encoded corpus of %s tokens (clipped last %s tokens) in %s ms.',
        humanize.intcomma(clipped_corpus_length),
        humanize.intcomma(len(self.encoded_corpus) - clipped_corpus_length),
        humanize.intcomma(int((time.time() - start_time) * 1000)))
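
A toy illustration of the wrap-around targets and batch splitting above, again with assumed small values:

import numpy as np

encoded = np.arange(12)                 # 12 tokens
batch_size, sequence_length = 2, 3
num_batches = len(encoded) // (batch_size * sequence_length)   # 2
clipped = encoded[:num_batches * batch_size * sequence_length]
xdata = clipped
ydata = np.copy(clipped)
ydata[:-1] = xdata[1:]                  # targets are inputs shifted by one token
ydata[-1] = xdata[0]                    # final target wraps around to the first token
batches = list(zip(np.split(xdata.reshape(batch_size, -1), num_batches, 1),
                   np.split(ydata.reshape(batch_size, -1), num_batches, 1)))
# Each element of batches is a (batch_size, sequence_length) input/target pair.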
Example 8
    def FromText(cls, text: str, atoms: typing.Set[str]) -> 'GreedyAtomizer':
        """Instantiate and an atomizer from a corpus text.

    Args:
      text: Text corpus
      atoms: A set of multi-character tokens.

    Returns:
      An atomizer instance.
    """
        if not atoms:
            raise errors.UserError('No atoms specified')

        # Instantiate a greedy atomizer using the full vocabulary.
        full_vocab = dict(zip(atoms, range(len(atoms))))
        c = GreedyAtomizer(full_vocab, determine_chars=True)
        # Derive the subset of the vocabulary required to encode the given text.
        tokens = sorted(list(set(c.TokenizeString(text))))
        vocab_subset = dict(zip(tokens, range(len(tokens))))
        end_time = labdate.MillisecondsTimestamp()
        # Return a new atomizer using the subset vocabulary.
        return GreedyAtomizer(vocab_subset)
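
A self-contained sketch of the idea behind deriving a subset vocabulary via greedy longest-match tokenization; greedy_tokenize stands in for GreedyAtomizer.TokenizeString and is not the project's implementation.

def greedy_tokenize(text, atoms):
    """Greedily match the longest known atom, falling back to single characters."""
    ordered = sorted(atoms, key=len, reverse=True)
    tokens, i = [], 0
    while i < len(text):
        for atom in ordered:
            if text.startswith(atom, i):
                tokens.append(atom)
                i += len(atom)
                break
        else:
            tokens.append(text[i])
            i += 1
    return tokens

tokens = sorted(set(greedy_tokenize('kernel void A() {}', {'kernel', 'void'})))
vocab_subset = dict(zip(tokens, range(len(tokens))))  # token -> index, as above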
Example 9
    def __init__(self, config: corpus_pb2.Corpus):
        """Instantiate a corpus from a proto config.

    If this is a new corpus, a number of files will be created, which may
    take some time.

    Args:
      config: A Corpus message.

    Raises:
      TypeError: If the config argument is not a Corpus proto.
      UserError: In case the corpus is not found, or config contains invalid
        options.
      EmptyCorpusException: In case the corpus contains no data.
    """
        if not isinstance(config, corpus_pb2.Corpus):
            t = type(config).__name__
            raise TypeError(f"Config must be a Corpus proto. Received: '{t}'")

        # Make a local copy of the configuration.
        self.config = corpus_pb2.Corpus()
        self.config.CopyFrom(AssertConfigIsValid(config))
        self._atomizer = None
        self._created = False

        cache.cachepath('corpus').mkdir(parents=True, exist_ok=True)
        hc = hashcache.HashCache(cache.cachepath('hashcache.db'), 'sha1')
        self.content_id = ResolveContentId(self.config, hc)
        # Database of pre-processed files.
        preprocessed_id = ResolvePreprocessedId(self.content_id, self.config)
        cache.cachepath('corpus', 'preprocessed',
                        preprocessed_id).mkdir(exist_ok=True, parents=True)
        preprocessed_db_path = cache.cachepath('corpus', 'preprocessed',
                                               preprocessed_id,
                                               'preprocessed.db')
        if (self.config.HasField('content_id')
                and not preprocessed_db_path.is_file()):
            raise errors.UserError(
                f"Content ID not found: '{self.content_id}'")
        self.preprocessed = preprocessed.PreprocessedContentFiles(
            preprocessed_db_path)
        # Create symlink to contentfiles.
        symlink = pathlib.Path(
            self.preprocessed.url[len('sqlite:///'):]).parent / 'contentfiles'
        if not symlink.is_symlink():
            if config.HasField('local_directory'):
                os.symlink(
                    str(
                        ExpandConfigPath(
                            config.local_directory,
                            path_prefix=FLAGS.clgen_local_path_prefix)),
                    symlink)
            elif config.HasField('local_tar_archive'):
                os.symlink(
                    str(
                        ExpandConfigPath(
                            config.local_tar_archive,
                            path_prefix=FLAGS.clgen_local_path_prefix)),
                    symlink)
        # Database of encoded pre-processed files.
        encoded_id = ResolveEncodedId(self.content_id, self.config)
        cache.cachepath('corpus', 'encoded', encoded_id).mkdir(exist_ok=True,
                                                               parents=True)
        self.encoded = encoded.EncodedContentFiles(
            cache.cachepath('corpus', 'encoded', encoded_id, 'encoded.db'))
        self.atomizer_path = cache.cachepath('corpus', 'encoded', encoded_id,
                                             'atomizer.pkl')
        # Create symlink to preprocessed files.
        symlink = pathlib.Path(
            self.encoded.url[len('sqlite:///'):]).parent / 'preprocessed'
        if not symlink.is_symlink():
            os.symlink(
                os.path.relpath(
                    pathlib.Path(
                        self.preprocessed.url[len('sqlite:///'):]).parent,
                    pathlib.Path(self.encoded.url[len('sqlite:///'):]).parent),
                symlink)
        self.hash = encoded_id
        self.cache = cache.mkcache('corpus', 'encoded', encoded_id)
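
A minimal sketch of how the constructor derives symlink locations from a SQLAlchemy-style database URL; the URL value below is hypothetical.

import pathlib

url = 'sqlite:////home/me/.cache/clgen/corpus/preprocessed/abc123/preprocessed.db'
db_path = pathlib.Path(url[len('sqlite:///'):])  # strip the 'sqlite:///' scheme prefix
symlink_path = db_path.parent / 'contentfiles'   # symlink lives next to the database file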
Example 10
def AssertIsBuildable(config: model_pb2.Model) -> model_pb2.Model:
    """Assert that a model configuration is buildable.

  Args:
    config: A model proto.

  Returns:
    The input model proto, unmodified.

  Raises:
    UserError: If the model is not buildable.
    InternalError: If the value of the training.optimizer field is not
      understood.
  """
    # Any change to the Model proto schema will require a change to this function.
    try:
        pbutil.AssertFieldIsSet(config, 'corpus')
        pbutil.AssertFieldIsSet(config, 'architecture')
        pbutil.AssertFieldIsSet(config, 'training')
        pbutil.AssertFieldIsSet(config.architecture, 'backend')
        pbutil.AssertFieldIsSet(config.architecture, 'neuron_type')
        if config.architecture.backend == model_pb2.NetworkArchitecture.KERAS:
            pbutil.AssertFieldConstraint(
                config.architecture, 'embedding_size', lambda x: 0 < x,
                'NetworkArchitecture.embedding_size must be > 0')
        pbutil.AssertFieldConstraint(
            config.architecture, 'neurons_per_layer', lambda x: 0 < x,
            'NetworkArchitecture.neurons_per_layer must be > 0')
        pbutil.AssertFieldConstraint(
            config.architecture, 'num_layers', lambda x: 0 < x,
            'NetworkArchitecture.num_layers must be > 0')
        pbutil.AssertFieldConstraint(
            config.architecture, 'post_layer_dropout_micros',
            lambda x: 0 <= x <= 1000000,
            'NetworkArchitecture.post_layer_dropout_micros '
            'must be >= 0 and <= 1000000')
        pbutil.AssertFieldConstraint(config.training, 'num_epochs',
                                     lambda x: 0 < x,
                                     'TrainingOptions.num_epochs must be > 0')
        pbutil.AssertFieldIsSet(config.training,
                                'shuffle_corpus_contentfiles_between_epochs')
        pbutil.AssertFieldConstraint(config.training, 'batch_size',
                                     lambda x: 0 < x,
                                     'TrainingOptions.batch_size must be > 0')
        pbutil.AssertFieldIsSet(config.training, 'optimizer')
        if config.training.HasField('adam_optimizer'):
            pbutil.AssertFieldConstraint(
                config.training.adam_optimizer, 'initial_learning_rate_micros',
                lambda x: 0 <= x,
                'AdamOptimizer.initial_learning_rate_micros must be >= 0')
            pbutil.AssertFieldConstraint(
                config.training.adam_optimizer,
                'learning_rate_decay_per_epoch_micros', lambda x: 0 <= x,
                'AdamOptimizer.learning_rate_decay_per_epoch_micros must be >= 0'
            )
            pbutil.AssertFieldConstraint(
                config.training.adam_optimizer, 'beta_1_micros',
                lambda x: 0 <= x <= 1000000,
                'AdamOptimizer.beta_1_micros must be >= 0 and <= 1000000')
            pbutil.AssertFieldConstraint(
                config.training.adam_optimizer, 'beta_2_micros',
                lambda x: 0 <= x <= 1000000,
                'AdamOptimizer.beta_2_micros must be >= 0 and <= 1000000')
            pbutil.AssertFieldConstraint(
                config.training.adam_optimizer,
                'normalized_gradient_clip_micros', lambda x: 0 <= x,
                'AdamOptimizer.normalized_gradient_clip_micros must be >= 0')
        elif config.training.HasField('rmsprop_optimizer'):
            pbutil.AssertFieldConstraint(
                config.training.rmsprop_optimizer,
                'initial_learning_rate_micros', lambda x: 0 <= x,
                'RmsPropOptimizer.initial_learning_rate_micros must be >= 0')
            pbutil.AssertFieldConstraint(
                config.training.rmsprop_optimizer,
                'learning_rate_decay_per_epoch_micros', lambda x: 0 <= x,
                'RmsPropOptimizer.learning_rate_decay_per_epoch_micros must be >= 0'
            )
        else:
            raise errors.InternalError(
                "Unrecognized value: 'TrainingOptions.optimizer'")
    except pbutil.ProtoValueError as e:
        raise errors.UserError(str(e))
    return config
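
A short illustration of the micro-unit convention that the bounds above enforce: fractional hyperparameters are assumed to be stored as integers scaled by one million and converted back at build time.

beta_1_micros = 900000                # hypothetical configured value
assert 0 <= beta_1_micros <= 1000000  # the bound checked by AssertFieldConstraint above
beta_1 = beta_1_micros / 1e6          # 0.9 when converted back for the optimizer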
Example 11
    def __init__(self, config: model_pb2.Model):
        """Instantiate a model.

    Args:
      config: A Model message.

    Raises:
      TypeError: If the config argument is not a Model proto.
      UserError: In case of an invalid config.
    """
        # Error early, so that a cache isn't created.
        if not isinstance(config, model_pb2.Model):
            t = type(config).__name__
            raise TypeError(f"Config must be a Model proto. Received: '{t}'")
        # Validate config options.
        if config.training.sequence_length < 1:
            raise errors.UserError(
                'TrainingOptions.sequence_length must be >= 1')

        self.config = model_pb2.Model()

        self.config.CopyFrom(builders.AssertIsBuildable(config))

        self.corpus = corpuses.Corpus(config.corpus)
        self.hash = self._ComputeHash(self.corpus, self.config)
        self.cache = cache.mkcache('model', self.hash)
        # Create the necessary cache directories.
        (self.cache.path / 'checkpoints').mkdir(exist_ok=True)
        (self.cache.path / 'samples').mkdir(exist_ok=True)
        (self.cache.path / 'logs').mkdir(exist_ok=True)

        # Create symlink to encoded corpus.
        symlink = self.cache.path / 'corpus'
        if not symlink.is_symlink():
            os.symlink(
                os.path.relpath(
                    pathlib.Path(
                        self.corpus.encoded.url[len('sqlite:///'):]).parent,
                    self.cache.path), symlink)

        # Create symlink to the atomizer.
        symlink = self.cache.path / 'atomizer'
        if not symlink.is_symlink():
            os.symlink(
                os.path.relpath(self.corpus.atomizer_path, self.cache.path),
                symlink)

        # Validate metadata against cache.
        if self.cache.get('META.pbtxt'):
            cached_meta = pbutil.FromFile(
                pathlib.Path(self.cache['META.pbtxt']),
                internal_pb2.ModelMeta())
            # Exclude num_epochs and corpus location from metadata comparison.
            config_to_compare = model_pb2.Model()
            config_to_compare.CopyFrom(self.config)
            config_to_compare.corpus.ClearField('contentfiles')
            config_to_compare.training.ClearField('num_epochs')
            # These fields should have already been cleared, but we'll do it again
            # so that metadata comparisons don't fail when the cached meta schema
            # is updated.
            cached_to_compare = model_pb2.Model()
            cached_to_compare.CopyFrom(cached_meta.config)
            cached_to_compare.corpus.ClearField('contentfiles')
            cached_to_compare.training.ClearField('num_epochs')
            if config_to_compare != cached_to_compare:
                raise errors.InternalError('Metadata mismatch')
            self.meta = cached_meta
        else:
            self.meta = internal_pb2.ModelMeta()
            self.meta.config.CopyFrom(self.config)
            self._WriteMetafile()

        self.backend = {
            model_pb2.NetworkArchitecture.TENSORFLOW:
            tensorflow_backend.TensorFlowBackend,
            model_pb2.NetworkArchitecture.KERAS: keras_backend.KerasBackend,
        }[config.architecture.backend](self.config, self.cache, self.corpus)
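
A self-contained sketch of the dispatch pattern on the final lines: an enum value selects a backend class from a dict, and the selected class is immediately instantiated with the same arguments. The class names and enum values here are stand-ins, not the project's.

class TensorFlowBackendStub:
    def __init__(self, config):
        self.config = config

class KerasBackendStub:
    def __init__(self, config):
        self.config = config

TENSORFLOW, KERAS = 0, 1  # stand-ins for NetworkArchitecture enum values
backend = {TENSORFLOW: TensorFlowBackendStub, KERAS: KerasBackendStub}[KERAS]('demo-config')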