def AssertConfigIsValid(config: corpus_pb2.Corpus) -> corpus_pb2.Corpus:
  """Assert that config proto is valid.

  Args:
    config: A Corpus proto.

  Returns:
    The Corpus proto.

  Raises:
    UserError: If the config is invalid.
  """
  try:
    pbutil.AssertFieldIsSet(config, 'contentfiles')
    pbutil.AssertFieldIsSet(config, 'atomizer')
    pbutil.AssertFieldIsSet(config, 'contentfile_separator')
    # Check that the preprocessor pipeline resolves to preprocessor functions.
    [preprocessors.GetPreprocessorFunction(p) for p in config.preprocessor]
    if config.HasField('greedy_multichar_atomizer'):
      if not config.greedy_multichar_atomizer.tokens:
        raise errors.UserError('GreedyMulticharAtomizer.tokens is empty')
      for atom in config.greedy_multichar_atomizer.tokens:
        if not atom:
          raise errors.UserError(
              'Empty string found in GreedyMulticharAtomizer.tokens')
    return config
  except pbutil.ProtoValueError as e:
    raise errors.UserError(e)

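# A hedged sketch of a Corpus config that would satisfy the checks above. The
# field names come from AssertConfigIsValid and ResolveContentId; the concrete
# path, separator, tokens, and preprocessor name are illustrative placeholders
# only (the preprocessor must resolve to a real @clgen_preprocessor function
# for validation to pass), and the full proto schema is not shown here.
from google.protobuf import text_format

example_corpus_config = text_format.Parse("""
  local_directory: "/path/to/contentfiles"
  contentfile_separator: "\\n\\n"
  preprocessor: "preprocessors.cxx:Compile"
  greedy_multichar_atomizer {
    tokens: "__kernel"
    tokens: "void"
  }
""", corpus_pb2.Corpus())
# AssertConfigIsValid(example_corpus_config)
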
def GetPreprocessorFunction(name: str) -> public.PreprocessorFunction:
  """Lookup a preprocessor function by name.

  A preprocessor is a function which takes a single argument 'text' of type
  str, and returns a str. The name is the fully qualified name of the python
  function which implements it, in the form <module>:<name>. For example,
  the name 'preprocessors.cxx:Compile' will return the function 'Compile' in
  the module 'preprocessors.cxx'.

  Args:
    name: The name of the preprocessor to get.

  Returns:
    The python preprocessor function.

  Raises:
    UserError: If the requested name cannot be found or is not a
      @clgen_preprocessor decorated function.
  """
  components = name.split(':')
  if len(components) != 2:
    raise errors.UserError(f'Invalid preprocessor name {name}')
  module_name, function_name = components
  try:
    module = importlib.import_module(module_name)
    function_ = getattr(module, function_name)
  except (ModuleNotFoundError, AttributeError):
    raise errors.UserError(f'Preprocessor {name} not found.')
  if not function_.__dict__.get('is_clgen_preprocessor'):
    raise errors.UserError(
        f'Preprocessor {name} not decorated with @clgen_preprocessor')
  return function_

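# A hedged, self-contained sketch of the '<module>:<function>' lookup contract
# above. The module name 'demo_preprocessors' and the function 'StripComments'
# are hypothetical; a real preprocessor lives in its own module and is
# decorated with @clgen_preprocessor, which is assumed here to set the
# 'is_clgen_preprocessor' attribute that GetPreprocessorFunction checks.
import sys
import types

def StripComments(text: str) -> str:
  """Toy preprocessor: drop lines that begin with '//'."""
  return '\n'.join(
      l for l in text.split('\n') if not l.lstrip().startswith('//'))

StripComments.is_clgen_preprocessor = True  # Normally set by the decorator.

_demo_module = types.ModuleType('demo_preprocessors')
_demo_module.StripComments = StripComments
sys.modules['demo_preprocessors'] = _demo_module

fn = GetPreprocessorFunction('demo_preprocessors:StripComments')
assert fn('// comment\nint y;') == 'int y;'
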
def ResolveContentId(config: corpus_pb2.Corpus,
                     hc: hashcache.HashCache) -> str:
  """Compute the hash of the input contentfiles.

  This function resolves the unique sha1 checksum of a set of content files.

  Args:
    config: The corpus config proto.
    hc: A hashcache database instance, used for resolving directory hashes.

  Returns:
    A hex encoded sha1 string.
  """
  # We can take a massive shortcut if the content ID is already set in the
  # config proto.
  if config.HasField('content_id'):
    return config.content_id
  start_time = time.time()
  if config.HasField('local_directory'):
    # After the first time we compute the hash of a directory, we write it
    # into a file. This is a shortcut to work around the fact that computing
    # the directory checksum is O(n) with respect to the number of files in
    # the directory (even if the directory is already cached by the hash
    # cache). This means that it is the responsibility of the user to delete
    # this cached file if the directory is changed.
    hash_file_path = pathlib.Path(
        str(pathlib.Path(config.local_directory)) + '.sha1.txt')
    if hash_file_path.is_file():
      logging.info("Reading directory hash: '%s'.", hash_file_path)
      with open(hash_file_path) as f:
        content_id = f.read().rstrip()
    else:
      # No hash file, so compute the directory hash and create it.
      try:
        content_id = hc.GetHash(
            ExpandConfigPath(
                config.local_directory,
                path_prefix=FLAGS.clgen_local_path_prefix))
      except FileNotFoundError as e:
        raise errors.UserError(e)
      # Create the hash file in the directory so that next time we don't need
      # to reference the hash cache.
      with open(hash_file_path, 'w') as f:
        print(content_id, file=f)
      logging.info("Wrote directory hash: '%s'.", hash_file_path)
  elif config.HasField('local_tar_archive'):
    # This is not an efficient means of getting the hash, as it requires
    # always unpacking the archive and reading the entire contents. It would
    # be nicer to maintain a cache which maps the mtime of tarballs to their
    # content ID, similar to how local_directory is implemented.
    content_id = GetHashOfArchiveContents(
        ExpandConfigPath(
            config.local_tar_archive,
            path_prefix=FLAGS.clgen_local_path_prefix))
  else:
    raise NotImplementedError('Unsupported Corpus.contentfiles field value')
  logging.debug('Resolved Content ID %s in %s ms.', content_id,
                humanize.intcomma(int((time.time() - start_time) * 1000)))
  return content_id

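# Illustration of the side-car naming used by the directory-hash shortcut
# above: a (hypothetical) corpus directory at /data/corpus has its cached
# checksum stored alongside it, and deleting that file forces a re-hash.
_example_hash_file = pathlib.Path(
    str(pathlib.Path('/data/corpus')) + '.sha1.txt')
assert _example_hash_file == pathlib.Path('/data/corpus.sha1.txt')
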
def __init__(self, config: clgen_pb2.Instance):
  """Instantiate an instance.

  Args:
    config: An Instance proto.

  Raises:
    UserError: If the instance proto contains invalid values, or is missing
      a model or sampler field.
  """
  try:
    pbutil.AssertFieldIsSet(config, 'model_specification')
    pbutil.AssertFieldIsSet(config, 'sampler')
  except pbutil.ProtoValueError as e:
    raise errors.UserError(e)

  self.working_dir = None
  if config.HasField('working_dir'):
    self.working_dir: pathlib.Path = pathlib.Path(
        os.path.expandvars(config.working_dir)).expanduser().absolute()
  # Enter a session so that the cache paths are set relative to any requested
  # working directory.
  with self.Session():
    if config.HasField('model'):
      self.model: models.Model = models.Model(config.model)
    else:
      self.model: pretrained.PreTrainedModel = pretrained.PreTrainedModel(
          pathlib.Path(config.pretrained_model))
    self.sampler: samplers.Sampler = samplers.Sampler(config.sampler)

def GetHashOfArchiveContents(archive: pathlib.Path) -> str:
  """Compute the checksum of the contents of an archive.

  Args:
    archive: Path of the archive.

  Returns:
    Checksum of the archive.

  Raises:
    UserError: If the requested archive does not exist, or cannot be
      unpacked.
  """
  if not archive.is_file():
    raise errors.UserError(f"Archive not found: '{archive}'")
  with tempfile.TemporaryDirectory(prefix='clgen_corpus_') as d:
    cmd = ['tar', '-xf', str(archive), '-C', d]
    try:
      subprocess.check_call(cmd)
    except subprocess.CalledProcessError:
      raise errors.UserError(f"Archive unpack failed: '{archive}'")
    return checksumdir.dirhash(d, 'sha1')

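# Hedged usage sketch: resolve the content ID of a corpus distributed as a
# tarball. The path is an illustrative placeholder and must exist on disk;
# the function shells out to `tar -xf` and hashes the unpacked tree with
# checksumdir.
# content_id = GetHashOfArchiveContents(pathlib.Path('/data/corpus.tar.bz2'))
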
def GetTrainingCorpus(
    corpus: 'corpuses.Corpus',
    training_opts: model_pb2.TrainingOptions) -> typing.Tuple[
        np.ndarray, np.ndarray, int]:
  """Get the corpus to train over.

  Args:
    corpus: A Corpus instance.
    training_opts: A TrainingOptions proto.

  Returns:
    An X, y pair of data for an epoch, and the number of steps in the epoch.

  Raises:
    UserError: If batch_size and sequence_length are too large for the
      corpus, yielding no batches.
  """
  start_time = time.time()
  encoded_corpus = corpus.GetTrainingData(
      shuffle=training_opts.shuffle_corpus_contentfiles_between_epochs)
  corpus_length = len(encoded_corpus)
  steps_per_epoch = (corpus_length - 1) // (
      training_opts.batch_size * training_opts.sequence_length)
  if not steps_per_epoch:
    raise errors.UserError(
        f'Requested batch size ({training_opts.batch_size}) and '
        f'sequence length ({training_opts.sequence_length}) are too large for '
        f'corpus of size {corpus_length}.')
  clipped_corpus_length = (
      steps_per_epoch * training_opts.batch_size *
      training_opts.sequence_length)
  x = np.reshape(
      encoded_corpus[:clipped_corpus_length],
      [training_opts.batch_size,
       steps_per_epoch * training_opts.sequence_length])
  y = np.reshape(
      encoded_corpus[1:clipped_corpus_length + 1],
      [training_opts.batch_size,
       steps_per_epoch * training_opts.sequence_length])
  logging.info(
      'Encoded corpus of %s tokens (clipped last %s tokens) in %s ms.',
      humanize.intcomma(clipped_corpus_length),
      humanize.intcomma(corpus_length - clipped_corpus_length),
      humanize.intcomma(int((time.time() - start_time) * 1000)))
  return x, y, steps_per_epoch

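# A worked shape example for GetTrainingCorpus, using illustrative numbers
# only: with batch_size=8, sequence_length=64, and 10,000 encoded tokens,
# steps_per_epoch = (10000 - 1) // (8 * 64) = 19, the clipped corpus is
# 19 * 8 * 64 = 9728 tokens, and x and y both have shape [8, 1216], with y
# offset from x by one token.
_example_corpus = np.arange(10000)
_batch_size, _sequence_length = 8, 64
_steps = (len(_example_corpus) - 1) // (_batch_size * _sequence_length)
_clipped = _steps * _batch_size * _sequence_length
_x = np.reshape(_example_corpus[:_clipped],
                [_batch_size, _steps * _sequence_length])
_y = np.reshape(_example_corpus[1:_clipped + 1],
                [_batch_size, _steps * _sequence_length])
assert _steps == 19 and _x.shape == _y.shape == (8, 1216)
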
def CreateBatches(self) -> None:
  """Create batches of training data from the encoded corpus."""
  start_time = time.time()
  # Generate a kernel corpus.
  self.i = 0
  if (self.encoded_corpus is None or
      self.training_opts.shuffle_corpus_contentfiles_between_epochs):
    self.encoded_corpus = self.corpus.GetTrainingData(
        shuffle=self.training_opts.shuffle_corpus_contentfiles_between_epochs)
  batch_size = self.training_opts.batch_size
  sequence_length = self.training_opts.sequence_length
  # Set corpus size and number of batches.
  self.num_batches = int(
      len(self.encoded_corpus) / (batch_size * sequence_length))
  if self.num_batches == 0:
    raise errors.UserError(
        "Not enough data. Use a smaller sequence_length and batch_size")
  # Split into batches.
  clipped_corpus_length = self.num_batches * batch_size * sequence_length
  clipped_corpus = self.encoded_corpus[:clipped_corpus_length]
  xdata = clipped_corpus
  ydata = np.copy(clipped_corpus)
  # Wrap-around: targets are inputs shifted by one token, with the first
  # token of the clipped corpus reused as the final target.
  ydata[:-1] = xdata[1:]
  ydata[-1] = xdata[0]
  self.batches = [
      DataBatch(x, y) for x, y in zip(
          np.split(xdata.reshape(batch_size, -1), self.num_batches, 1),
          np.split(ydata.reshape(batch_size, -1), self.num_batches, 1))
  ]
  logging.info(
      'Encoded corpus of %s tokens (clipped last %s tokens) in %s ms.',
      humanize.intcomma(clipped_corpus_length),
      humanize.intcomma(len(self.encoded_corpus) - clipped_corpus_length),
      humanize.intcomma(int((time.time() - start_time) * 1000)))

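# A minimal illustration of the wrap-around target construction above: ydata
# is xdata shifted left by one token, with the first token of the clipped
# corpus reused as the final target.
_xdata = np.array([10, 11, 12, 13])
_ydata = np.copy(_xdata)
_ydata[:-1] = _xdata[1:]
_ydata[-1] = _xdata[0]
assert _ydata.tolist() == [11, 12, 13, 10]
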
def FromText(cls, text: str, atoms: typing.Set[str]) -> 'GreedyAtomizer':
  """Instantiate an atomizer from a corpus text.

  Args:
    text: Text corpus.
    atoms: A set of multi-character tokens.

  Returns:
    An atomizer instance.
  """
  if not atoms:
    raise errors.UserError('No atoms specified')
  # Instantiate a greedy atomizer using the full vocabulary.
  full_vocab = dict(zip(atoms, range(len(atoms))))
  c = GreedyAtomizer(full_vocab, determine_chars=True)
  # Derive the subset of the vocabulary required to encode the given text.
  tokens = sorted(list(set(c.TokenizeString(text))))
  vocab_subset = dict(zip(tokens, range(len(tokens))))
  # Return a new atomizer using the subset vocabulary.
  return GreedyAtomizer(vocab_subset)

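# Hedged usage sketch of FromText: derive an atomizer whose vocabulary is the
# subset of the supplied atoms (presumably plus any single characters the
# greedy tokenizer falls back to) that actually occur in the text. The text
# and atom set are illustrative placeholders only.
# atomizer = GreedyAtomizer.FromText(
#     'kernel void A(global int* a) {}',
#     atoms={'kernel', 'void', 'global', 'int'})
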
def __init__(self, config: corpus_pb2.Corpus):
  """Instantiate a corpus from a proto config.

  If this is a new corpus, a number of files will be created, which may take
  some time.

  Args:
    config: A Corpus message.

  Raises:
    TypeError: If the config argument is not a Corpus proto.
    UserError: In case the corpus is not found, or config contains invalid
      options.
    EmptyCorpusException: In case the corpus contains no data.
  """
  if not isinstance(config, corpus_pb2.Corpus):
    t = type(config).__name__
    raise TypeError(f"Config must be a Corpus proto. Received: '{t}'")
  # Make a local copy of the configuration.
  self.config = corpus_pb2.Corpus()
  self.config.CopyFrom(AssertConfigIsValid(config))
  self._atomizer = None
  self._created = False

  cache.cachepath('corpus').mkdir(parents=True, exist_ok=True)
  hc = hashcache.HashCache(cache.cachepath('hashcache.db'), 'sha1')
  self.content_id = ResolveContentId(self.config, hc)
  # Database of pre-processed files.
  preprocessed_id = ResolvePreprocessedId(self.content_id, self.config)
  cache.cachepath('corpus', 'preprocessed', preprocessed_id).mkdir(
      exist_ok=True, parents=True)
  preprocessed_db_path = cache.cachepath('corpus', 'preprocessed',
                                         preprocessed_id, 'preprocessed.db')
  if (self.config.HasField('content_id') and
      not preprocessed_db_path.is_file()):
    raise errors.UserError(f"Content ID not found: '{self.content_id}'")
  self.preprocessed = preprocessed.PreprocessedContentFiles(
      preprocessed_db_path)
  # Create symlink to contentfiles.
  symlink = pathlib.Path(
      self.preprocessed.url[len('sqlite:///'):]).parent / 'contentfiles'
  if not symlink.is_symlink():
    if config.HasField('local_directory'):
      os.symlink(
          str(ExpandConfigPath(
              config.local_directory,
              path_prefix=FLAGS.clgen_local_path_prefix)), symlink)
    elif config.HasField('local_tar_archive'):
      os.symlink(
          str(ExpandConfigPath(
              config.local_tar_archive,
              path_prefix=FLAGS.clgen_local_path_prefix)), symlink)
  # Database of encoded pre-processed files.
  encoded_id = ResolveEncodedId(self.content_id, self.config)
  cache.cachepath('corpus', 'encoded', encoded_id).mkdir(
      exist_ok=True, parents=True)
  self.encoded = encoded.EncodedContentFiles(
      cache.cachepath('corpus', 'encoded', encoded_id, 'encoded.db'))
  self.atomizer_path = cache.cachepath('corpus', 'encoded', encoded_id,
                                       'atomizer.pkl')
  # Create symlink to preprocessed files.
  symlink = pathlib.Path(
      self.encoded.url[len('sqlite:///'):]).parent / 'preprocessed'
  if not symlink.is_symlink():
    os.symlink(
        os.path.relpath(
            pathlib.Path(self.preprocessed.url[len('sqlite:///'):]).parent,
            pathlib.Path(self.encoded.url[len('sqlite:///'):]).parent),
        symlink)
  self.hash = encoded_id
  self.cache = cache.mkcache('corpus', 'encoded', encoded_id)

def AssertIsBuildable(config: model_pb2.Model) -> model_pb2.Model:
  """Assert that a model configuration is buildable.

  Args:
    config: A model proto.

  Returns:
    The input model proto, unmodified.

  Raises:
    UserError: If the model is not buildable.
    InternalError: If the value of the training.optimizer field is not
      understood.
  """
  # Any change to the Model proto schema will require a change to this
  # function.
  try:
    pbutil.AssertFieldIsSet(config, 'corpus')
    pbutil.AssertFieldIsSet(config, 'architecture')
    pbutil.AssertFieldIsSet(config, 'training')
    pbutil.AssertFieldIsSet(config.architecture, 'backend')
    pbutil.AssertFieldIsSet(config.architecture, 'neuron_type')
    if config.architecture.backend == model_pb2.NetworkArchitecture.KERAS:
      pbutil.AssertFieldConstraint(
          config.architecture, 'embedding_size', lambda x: 0 < x,
          'NetworkArchitecture.embedding_size must be > 0')
    pbutil.AssertFieldConstraint(
        config.architecture, 'neurons_per_layer', lambda x: 0 < x,
        'NetworkArchitecture.neurons_per_layer must be > 0')
    pbutil.AssertFieldConstraint(
        config.architecture, 'num_layers', lambda x: 0 < x,
        'NetworkArchitecture.num_layers must be > 0')
    pbutil.AssertFieldConstraint(
        config.architecture, 'post_layer_dropout_micros',
        lambda x: 0 <= x <= 1000000,
        'NetworkArchitecture.post_layer_dropout_micros '
        'must be >= 0 and <= 1000000')
    pbutil.AssertFieldConstraint(
        config.training, 'num_epochs', lambda x: 0 < x,
        'TrainingOptions.num_epochs must be > 0')
    pbutil.AssertFieldIsSet(
        config.training, 'shuffle_corpus_contentfiles_between_epochs')
    pbutil.AssertFieldConstraint(
        config.training, 'batch_size', lambda x: 0 < x,
        'TrainingOptions.batch_size must be > 0')
    pbutil.AssertFieldIsSet(config.training, 'optimizer')
    if config.training.HasField('adam_optimizer'):
      pbutil.AssertFieldConstraint(
          config.training.adam_optimizer, 'initial_learning_rate_micros',
          lambda x: 0 <= x,
          'AdamOptimizer.initial_learning_rate_micros must be >= 0')
      pbutil.AssertFieldConstraint(
          config.training.adam_optimizer,
          'learning_rate_decay_per_epoch_micros', lambda x: 0 <= x,
          'AdamOptimizer.learning_rate_decay_per_epoch_micros must be >= 0')
      pbutil.AssertFieldConstraint(
          config.training.adam_optimizer, 'beta_1_micros',
          lambda x: 0 <= x <= 1000000,
          'AdamOptimizer.beta_1_micros must be >= 0 and <= 1000000')
      pbutil.AssertFieldConstraint(
          config.training.adam_optimizer, 'beta_2_micros',
          lambda x: 0 <= x <= 1000000,
          'AdamOptimizer.beta_2_micros must be >= 0 and <= 1000000')
      pbutil.AssertFieldConstraint(
          config.training.adam_optimizer, 'normalized_gradient_clip_micros',
          lambda x: 0 <= x,
          'AdamOptimizer.normalized_gradient_clip_micros must be >= 0')
    elif config.training.HasField('rmsprop_optimizer'):
      pbutil.AssertFieldConstraint(
          config.training.rmsprop_optimizer, 'initial_learning_rate_micros',
          lambda x: 0 <= x,
          'RmsPropOptimizer.initial_learning_rate_micros must be >= 0')
      pbutil.AssertFieldConstraint(
          config.training.rmsprop_optimizer,
          'learning_rate_decay_per_epoch_micros', lambda x: 0 <= x,
          'RmsPropOptimizer.learning_rate_decay_per_epoch_micros must be >= 0')
    else:
      raise errors.InternalError(
          "Unrecognized value: 'TrainingOptions.optimizer'")
  except pbutil.ProtoValueError as e:
    raise errors.UserError(str(e))
  return config

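# The *_micros fields validated above appear to store real-valued
# hyperparameters as integers scaled by one million, which is why
# beta_1_micros and beta_2_micros are bounded to [0, 1000000]. A hedged
# sketch of the conversion; the helper name is hypothetical and not part of
# this module:
def _MicrosToFloat(micros: int) -> float:
  """E.g. beta_1_micros=900000 -> 0.9."""
  return micros / 1e6

assert _MicrosToFloat(900000) == 0.9
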
def __init__(self, config: model_pb2.Model):
  """Instantiate a model.

  Args:
    config: A Model message.

  Raises:
    TypeError: If the config argument is not a Model proto.
    UserError: In case of an invalid config.
  """
  # Error early, so that a cache isn't created.
  if not isinstance(config, model_pb2.Model):
    t = type(config).__name__
    raise TypeError(f"Config must be a Model proto. Received: '{t}'")
  # Validate config options.
  if config.training.sequence_length < 1:
    raise errors.UserError('TrainingOptions.sequence_length must be >= 1')

  self.config = model_pb2.Model()
  self.config.CopyFrom(builders.AssertIsBuildable(config))
  self.corpus = corpuses.Corpus(config.corpus)
  self.hash = self._ComputeHash(self.corpus, self.config)
  self.cache = cache.mkcache('model', self.hash)
  # Create the necessary cache directories.
  (self.cache.path / 'checkpoints').mkdir(exist_ok=True)
  (self.cache.path / 'samples').mkdir(exist_ok=True)
  (self.cache.path / 'logs').mkdir(exist_ok=True)
  # Create symlink to encoded corpus.
  symlink = self.cache.path / 'corpus'
  if not symlink.is_symlink():
    os.symlink(
        os.path.relpath(
            pathlib.Path(
                self.corpus.encoded.url[len('sqlite:///'):]).parent,
            self.cache.path), symlink)
  # Create symlink to the atomizer.
  symlink = self.cache.path / 'atomizer'
  if not symlink.is_symlink():
    os.symlink(
        os.path.relpath(self.corpus.atomizer_path, self.cache.path), symlink)
  # Validate metadata against cache.
  if self.cache.get('META.pbtxt'):
    cached_meta = pbutil.FromFile(
        pathlib.Path(self.cache['META.pbtxt']), internal_pb2.ModelMeta())
    # Exclude num_epochs and corpus location from metadata comparison.
    config_to_compare = model_pb2.Model()
    config_to_compare.CopyFrom(self.config)
    config_to_compare.corpus.ClearField('contentfiles')
    config_to_compare.training.ClearField('num_epochs')
    # These fields should have already been cleared, but we'll do it again
    # so that metadata comparisons don't fail when the cached meta schema
    # is updated.
    cached_to_compare = model_pb2.Model()
    cached_to_compare.CopyFrom(cached_meta.config)
    cached_to_compare.corpus.ClearField('contentfiles')
    cached_to_compare.training.ClearField('num_epochs')
    if config_to_compare != cached_to_compare:
      raise errors.InternalError('Metadata mismatch')
    self.meta = cached_meta
  else:
    self.meta = internal_pb2.ModelMeta()
    self.meta.config.CopyFrom(self.config)
    self._WriteMetafile()

  self.backend = {
      model_pb2.NetworkArchitecture.TENSORFLOW:
          tensorflow_backend.TensorFlowBackend,
      model_pb2.NetworkArchitecture.KERAS: keras_backend.KerasBackend,
  }[config.architecture.backend](self.config, self.cache, self.corpus)