def _EndOfEpochTestSample(
    self, corpus, sampler: samplers.Sampler, step: int, epoch_num: int
):
  """Draw test samples at the end of an epoch and record them.

  Produces FLAGS.clgen_per_epoch_test_samples samples, logs each one,
  stores them (with token count and per-sample wall time) in the dashboard
  database, and writes them as markdown text summaries to TensorBoard.

  Args:
    corpus: The corpus providing the atomizer used to decode indices.
    sampler: The sampler to draw from. Its batch size is forced to 1.
    step: The global training step, used for the DB row and summary.
    epoch_num: The epoch number recorded in the dashboard DB.
  """
  import tensorflow as tf

  atomizer = corpus.atomizer
  sampler.Specialize(atomizer)
  sampler.batch_size = 1
  seed = 0
  self.InitSampling(sampler, seed)
  self.InitSampleBatch(sampler)

  samples, stats = [], []
  for i in range(FLAGS.clgen_per_epoch_test_samples):
    # np.bool was a deprecated alias of the builtin bool (removed in
    # NumPy >= 1.24); use the builtin directly.
    done = np.zeros(1, dtype=bool)
    while not done[0]:
      start_time = time.time()
      sample_in_progress = sampler.tokenized_start_text.copy()
      indices = self.SampleNextIndices(sampler, done)
      # Iterate over all tokens produced this step to determine whether
      # the sample is done.
      for index in indices[0]:
        sample_in_progress.append(atomizer.decoder[index])
        if not sampler.SampleIsComplete(sample_in_progress):
          continue
        # Record (token count, elapsed milliseconds) for this sample.
        stats.append(
            (len(sample_in_progress), int((time.time() - start_time) * 1000))
        )
        sample = "".join(sample_in_progress)
        samples.append(sample)
        app.Log(1, "End-of-epoch sample %d:\n%s", i + 1, sample)
        done[0] = True
        break

  # Write samples to the dashboard database. The loop variable is named
  # `stat` so it does not shadow the outer `stats` list.
  with self.dashboard_db.Session(commit=True) as dbs:
    dbs.add_all(
        [
            dashboard_db.TrainingSample(
                model_id=self.dashboard_model_id,
                epoch=epoch_num,
                step=step,
                sample=sample,
                token_count=stat[0],
                sample_time=stat[1],
            )
            for sample, stat in zip(samples, stats)
        ]
    )

  # Publish the samples to TensorBoard as markdown text summaries.
  samples_as_markdown = [
      self.FormatCodeAsMarkdown(sample) for sample in samples
  ]
  samples_tensor = tf.convert_to_tensor(samples_as_markdown, dtype=tf.string)
  summary_op = tf.summary.text("samples", samples_tensor)
  summary = self.inference_sess.run(summary_op)
  self.summary_writer.add_summary(summary, step)
def _SampleBatch(
    self,
    sampler: samplers.Sampler,
    atomizer: atomizers.AtomizerBase,
    sample_observers: typing.List[sample_observers_lib.SampleObserver],
) -> bool:
  """Run a single iteration of the batched sample inner-loop.

  Draws one full batch of samples, decoding tokens step by step until every
  sample in the batch is complete, and notifies each observer of every
  completed sample.

  Args:
    sampler: The sampler to draw from; its batch_size fixes the batch width.
    atomizer: Decodes the sampled indices back into token text.
    sample_observers: Observers whose OnSample() is called once per
      completed sample.

  Returns:
    The continue_sampling flag accumulated from the observer callbacks
    (see NOTE below on its polarity).
  """
  samples_in_progress = [
      sampler.tokenized_start_text.copy() for _ in range(sampler.batch_size)
  ]
  # NOTE(review): np.bool is a deprecated alias of bool (removed in
  # NumPy >= 1.24); dtype=bool is the drop-in replacement — confirm before
  # upgrading NumPy.
  done = np.zeros(sampler.batch_size, dtype=np.bool)
  start_time = labdate.MillisecondsTimestamp()
  wall_time_start = start_time

  self.backend.InitSampleBatch(sampler)

  # The return value of this method. If any of the sample_observers return
  # False, this value is set to False.
  continue_sampling = True

  # Sampling loop. Continues until all samples in the batch are done.
  while not done.all():
    indices = self.backend.SampleNextIndices(sampler, done)

    # Iterate over all samples in batch to determine whether they're
    # done.
    for i in range(sampler.batch_size):
      if done[i]:
        # This batch slot already produced its sample; skip it.
        continue

      for index in indices[i]:
        samples_in_progress[i].append(atomizer.decoder[index])
        if sampler.SampleIsComplete(samples_in_progress[i]):
          end_time = labdate.MillisecondsTimestamp()
          done[i] = 1
          sample = model_pb2.Sample(
              text="".join(samples_in_progress[i]),
              sample_start_epoch_ms_utc=start_time,
              sample_time_ms=end_time - start_time,
              wall_time_ms=end_time - wall_time_start,
              num_tokens=len(samples_in_progress[i]),
          )
          # Notify sample observers.
          # NOTE(review): the `not` here means an observer returning True
          # clears continue_sampling, which contradicts the comment above
          # ("return False ... set to False"). Confirm the intended
          # polarity of SampleObserver.OnSample before changing either.
          continue_sampling &= all([
              not obs.OnSample(sample) for obs in sample_observers
          ])
          # Wall sample time is the difference between the end of the previous
          # sample and the end of the current sample.
          wall_time_start = labdate.MillisecondsTimestamp()
          # Remaining indices for this slot are discarded once complete.
          break

  return continue_sampling
def _EndOfEpochTestSample(
    self, corpus, sampler: samplers.Sampler, step: int, num_samples: int = 12
):
  """Draw test samples at the end of an epoch and log them to TensorBoard.

  Produces `num_samples` samples, logs each one, and writes them as
  markdown text summaries to TensorBoard.

  Args:
    corpus: The corpus providing the atomizer used to decode indices.
    sampler: The sampler to draw from. Its batch size is forced to 1.
    step: The global training step attached to the summary.
    num_samples: Number of samples to draw. Defaults to 12, the previously
      hard-coded value, so existing callers are unaffected.
  """
  import tensorflow as tf

  atomizer = corpus.atomizer
  sampler.Specialize(atomizer)
  sampler.batch_size = 1
  seed = 0
  self.InitSampling(sampler, seed)
  self.InitSampleBatch(sampler)

  samples = []
  for i in range(num_samples):
    # np.bool was a deprecated alias of the builtin bool (removed in
    # NumPy >= 1.24); use the builtin directly.
    done = np.zeros(1, dtype=bool)
    while not done[0]:
      sample_in_progress = sampler.tokenized_start_text.copy()
      indices = self.SampleNextIndices(sampler, done)
      # Iterate over all tokens produced this step to determine whether
      # the sample is done.
      for index in indices[0]:
        sample_in_progress.append(atomizer.decoder[index])
        if not sampler.SampleIsComplete(sample_in_progress):
          continue
        sample = ''.join(sample_in_progress)
        samples.append(sample)
        app.Log(1, 'End-of-epoch sample %d:\n%s', i + 1, sample)
        done[0] = True
        break

  # Publish the samples to TensorBoard as markdown code blocks.
  samples_as_markdown = [f'```\n{sample.strip()}\n```' for sample in samples]
  samples_tensor = tf.convert_to_tensor(samples_as_markdown, dtype=tf.string)
  summary_op = tf.summary.text('samples', samples_tensor)
  summary = self.inference_sess.run(summary_op)
  self.summary_writer.add_summary(summary, step)
def SampleFast(self,
               sampler: samplers.Sampler,
               min_num_samples: int,
               seed: int = None) -> typing.List[model_pb2.Sample]:
  """Sample a model.

  Same as Sample(), but without printing or caching samples. Because samples
  are not cached, infinite sampling loops are not supported, since we must
  return the sample protos at some point.

  Args:
    sampler: The sampler to sample using.
    min_num_samples: The minimum number of samples to return. Note that the
      true number of samples returned may be higher than this value, as
      sampling occurs in batches. The model will continue producing samples
      until the lowest mulitple of the sampler batch size property that is
      larger than this value. E.g. if min_num_samples is 7 and the Sampler
      batch size is 10, 10 samples will be returned.
    seed: A numeric value to seed the RNG with. If not present, the RNG is
      seeded randomly.

  Returns:
    A list of Sample protos.

  Raises:
    UnableToAcquireLockError: If the model is locked (i.e. there is another
      process currently modifying the model).
    InvalidStartText: If the sampler start text cannot be encoded.
    InvalidSymtokTokens: If the sampler symmetrical depth tokens cannot be
      encoded.
  """
  # Sampling requires a trained model; Train() is a no-op if already trained.
  self.Train()

  sample_count = 1
  with logutil.TeeLogsToFile(f'sampler_{sampler.hash}',
                             self.cache.path / 'logs'):
    logging.info("Sampling: '%s'", sampler.start_text)
    sample_start_time = labdate.MillisecondsTimestamp()
    atomizer = self.corpus.atomizer
    sampler.Specialize(atomizer)
    batch_size = self.backend.InitSampling(sampler, seed)
    samples = []
    # Per-sample batch outer loop. Continues until we have as many samples
    # as we want.
    while True:
      samples_in_progress = [
          sampler.tokenized_start_text.copy() for _ in range(batch_size)
      ]
      # np.bool was a deprecated alias of the builtin bool (removed in
      # NumPy >= 1.24); use the builtin directly.
      done = np.zeros(batch_size, dtype=bool)
      start_time = labdate.MillisecondsTimestamp()
      wall_time_start = start_time
      self.backend.InitSampleBatch(sampler, batch_size)
      # Sampling loop. Continues until all samples in the batch are done.
      while True:
        indices = self.backend.SampleNextIndices(sampler, batch_size)
        # Iterate over all samples in batch to determine whether they're
        # done.
        for i in range(batch_size):
          if done[i]:
            # This batch slot already produced its sample; skip it.
            continue
          token = atomizer.decoder[indices[i]]
          samples_in_progress[i].append(token)
          if sampler.SampleIsComplete(samples_in_progress[i]):
            end_time = labdate.MillisecondsTimestamp()
            done[i] = 1
            sample = model_pb2.Sample(
                text=''.join(samples_in_progress[i]),
                sample_start_epoch_ms_utc=start_time,
                sample_time_ms=end_time - start_time,
                wall_time_ms=end_time - wall_time_start,
                num_tokens=len(samples_in_progress[i]))
            sample_count += 1
            samples.append(sample)
            # Wall sample time is measured from the end of the previous
            # sample to the end of the current sample.
            wall_time_start = labdate.MillisecondsTimestamp()
        # Complete the batch.
        if done.all():
          break
      # Complete sampling. Note that sample_count starts at 1.
      if sample_count > min_num_samples:
        now = labdate.MillisecondsTimestamp()
        # Guard against division by zero (mirrors Sample()); samples is
        # normally non-empty here since at least one batch has completed.
        logging.info(
            'Produced %s samples at a rate of %s ms / sample.',
            humanize.intcomma(len(samples)),
            humanize.intcomma(
                int((now - sample_start_time) / max(len(samples), 1))))
        break
  return samples
def Sample(self,
           sampler: samplers.Sampler,
           min_num_samples: int,
           seed: int = None) -> typing.List[model_pb2.Sample]:
  """Sample a model.

  If the model is not already trained, calling Sample() first trains the
  model. Thus a call to Sample() is equivalent to calling Train() then
  Sample().

  Args:
    sampler: The sampler to sample using.
    min_num_samples: The minimum number of samples to return. Note that the
      true number of samples returned may be higher than this value, as
      sampling occurs in batches. The model will continue producing samples
      until the lowest mulitple of the sampler batch size property that is
      larger than this value. E.g. if min_num_samples is 7 and the Sampler
      batch size is 10, 10 samples will be returned. A negative value
      enters an infinite sampling loop (samples are written to the cache
      but not accumulated in memory).
    seed: A numeric value to seed the RNG with. If not present, the RNG is
      seeded randomly.

  Returns:
    A list of Sample protos.

  Raises:
    UnableToAcquireLockError: If the model is locked (i.e. there is another
      process currently modifying the model).
    InvalidStartText: If the sampler start text cannot be encoded.
    InvalidSymtokTokens: If the sampler symmetrical depth tokens cannot be
      encoded.
  """
  # Sampling requires a trained model; Train() is a no-op if already trained.
  self.Train()

  sample_count = 1
  self.SamplerCache(sampler).mkdir(exist_ok=True)
  with logutil.TeeLogsToFile(f'sampler_{sampler.hash}',
                             self.cache.path / 'logs'):
    logging.info("Sampling: '%s'", sampler.start_text)
    if min_num_samples < 0:
      logging.warning(
          'Entering an infinite sample loop, this process will never end!')
    sample_start_time = labdate.MillisecondsTimestamp()
    atomizer = self.corpus.atomizer
    sampler.Specialize(atomizer)
    batch_size = self.backend.InitSampling(sampler, seed)
    samples = []
    sample_dir = self.SamplerCache(sampler)
    # Per-sample batch outer loop. Continues until we have as many samples
    # as we want.
    while True:
      samples_in_progress = [
          sampler.tokenized_start_text.copy() for _ in range(batch_size)
      ]
      # np.bool was a deprecated alias of the builtin bool (removed in
      # NumPy >= 1.24); use the builtin directly.
      done = np.zeros(batch_size, dtype=bool)
      start_time = labdate.MillisecondsTimestamp()
      wall_time_start = start_time
      self.backend.InitSampleBatch(sampler, batch_size)
      # Sampling loop. Continues until all samples in the batch are done.
      while True:
        indices = self.backend.SampleNextIndices(sampler, batch_size)
        # Iterate over all samples in batch to determine whether they're
        # done.
        for i in range(batch_size):
          if done[i]:
            # This batch slot already produced its sample; skip it.
            continue
          token = atomizer.decoder[indices[i]]
          samples_in_progress[i].append(token)
          if sampler.SampleIsComplete(samples_in_progress[i]):
            end_time = labdate.MillisecondsTimestamp()
            done[i] = 1
            sample = model_pb2.Sample(
                text=''.join(samples_in_progress[i]),
                sample_start_epoch_ms_utc=start_time,
                sample_time_ms=end_time - start_time,
                wall_time_ms=end_time - wall_time_start,
                num_tokens=len(samples_in_progress[i]))
            print(f'=== BEGIN CLGEN SAMPLE {sample_count} '
                  f'===\n\n{sample.text}\n')
            sample_count += 1
            # Cache every sample to disk, keyed by content hash.
            sample_id = crypto.sha256_str(sample.text)
            sample_path = sample_dir / f'{sample_id}.pbtxt'
            pbutil.ToFile(sample, sample_path)
            # In the infinite-loop case (min_num_samples <= 0) we must not
            # accumulate samples in memory forever.
            if min_num_samples > 0:
              samples.append(sample)
            # Wall sample time is measured from the end of the previous
            # sample to the end of the current sample.
            wall_time_start = labdate.MillisecondsTimestamp()
        # Complete the batch.
        if done.all():
          break
      # Complete sampling. Note that sample_count starts at 1.
      if sample_count > min_num_samples:
        now = labdate.MillisecondsTimestamp()
        logging.info(
            'Produced %s samples at a rate of %s ms / sample.',
            humanize.intcomma(len(samples)),
            humanize.intcomma(
                int((now - sample_start_time) / max(len(samples), 1))))
        break
  return samples
def Sample(
    self, sampler: samplers.Sampler, min_num_samples: int,
    seed: int = None) -> typing.Iterable[model_pb2.Sample]:
  """Sample a model, yielding each sample as it is produced.

  If the model is not already trained, calling Sample() first trains the
  model. Thus a call to Sample() is equivalent to calling Train() then
  Sample().

  Args:
    sampler: The sampler to sample using.
    min_num_samples: The minimum number of samples to return. Note that the
      true number of samples returned may be higher than this value, as
      sampling occurs in batches. The model will continue producing samples
      until the lowest mulitple of the sampler batch size property that is
      larger than this value. E.g. if min_num_samples is 7 and the Sampler
      batch size is 10, 10 samples will be returned.
    seed: A numeric value to seed the RNG with. If not present, the RNG is
      seeded randomly.

  Returns:
    An iterator over samples.

  Raises:
    UnableToAcquireLockError: If the model is locked (i.e. there is another
      process currently modifying the model).
    InvalidStartText: If the sampler start text cannot be encoded.
    InvalidSymtokTokens: If the sampler symmetrical depth tokens cannot be
      encoded.
  """
  sample_count = 1
  atomizer = self.atomizer
  sampler.Specialize(atomizer)
  batch_size = self.backend.InitSampling(sampler, seed)
  sample_start_time = labdate.MillisecondsTimestamp()
  # Per-sample batch outer loop. Continues until we have as many samples
  # as we want.
  while True:
    samples_in_progress = [
        sampler.tokenized_start_text.copy() for _ in range(batch_size)]
    # np.bool was a deprecated alias of the builtin bool (removed in
    # NumPy >= 1.24); use the builtin directly.
    done = np.zeros(batch_size, dtype=bool)
    start_time = labdate.MillisecondsTimestamp()
    wall_time_start = start_time
    self.backend.InitSampleBatch(sampler, batch_size)
    # Sampling loop. Continues until all samples in the batch are done.
    while True:
      indices = self.backend.SampleNextIndices(sampler, batch_size)
      # Iterate over all samples in batch to determine whether they're
      # done.
      for i in range(batch_size):
        if done[i]:
          # This batch slot already produced its sample; skip it.
          continue
        token = atomizer.decoder[indices[i]]
        samples_in_progress[i].append(token)
        if sampler.SampleIsComplete(samples_in_progress[i]):
          end_time = labdate.MillisecondsTimestamp()
          done[i] = 1
          sample = model_pb2.Sample(
              text=''.join(samples_in_progress[i]),
              sample_start_epoch_ms_utc=start_time,
              sample_time_ms=end_time - start_time,
              wall_time_ms=end_time - wall_time_start,
              num_tokens=len(samples_in_progress[i]))
          sample_count += 1
          yield sample
          # Wall sample time is measured from the end of the previous
          # sample to the end of the current sample.
          wall_time_start = labdate.MillisecondsTimestamp()
      # Complete the batch.
      if done.all():
        break
    # Complete sampling. Note that sample_count starts at 1.
    if sample_count > min_num_samples:
      now = labdate.MillisecondsTimestamp()
      logging.info(
          'Produced %s samples at a rate of %s ms / sample.',
          humanize.intcomma(sample_count - 1),
          humanize.intcomma(
              int((now - sample_start_time) / max(sample_count - 1, 1))))
      break