Code example #1
  def _EndOfEpochTestSample(
    self, corpus, sampler: samplers.Sampler, step: int, epoch_num: int
  ):
    """Run sampler"""
    import tensorflow as tf

    atomizer = corpus.atomizer
    sampler.Specialize(atomizer)
    sampler.batch_size = 1
    seed = 0

    self.InitSampling(sampler, seed)
    self.InitSampleBatch(sampler)

    samples, stats = [], []
    for i in range(FLAGS.clgen_per_epoch_test_samples):
      done = np.zeros(1, dtype=bool)  # np.bool was removed in NumPy 1.24.
      while not done[0]:
        start_time = time.time()
        sample_in_progress = sampler.tokenized_start_text.copy()
        indices = self.SampleNextIndices(sampler, done)

        # Iterate over all samples in batch to determine whether they're
        # done.
        for index in indices[0]:
          sample_in_progress.append(atomizer.decoder[index])
          if not sampler.SampleIsComplete(sample_in_progress):
            continue

          stats.append(
            (len(sample_in_progress), int((time.time() - start_time) * 1000))
          )
          sample = "".join(sample_in_progress)
          samples.append(sample)
          app.Log(1, "End-of-epoch sample %d:\n%s", i + 1, sample)
          done[0] = True
          break

    # Write samples to the dashboard database and to a TensorBoard text summary.
    with self.dashboard_db.Session(commit=True) as dbs:
      dbs.add_all(
        [
          dashboard_db.TrainingSample(
            model_id=self.dashboard_model_id,
            epoch=epoch_num,
            step=step,
            sample=sample,
            token_count=stat[0],
            sample_time=stat[1],
          )
          for sample, stat in zip(samples, stats)
        ]
      )
    samples_as_markdown = [
      self.FormatCodeAsMarkdown(sample) for sample in samples
    ]
    samples_tensor = tf.convert_to_tensor(samples_as_markdown, dtype=tf.string)
    summary_op = tf.summary.text("samples", samples_tensor)
    summary = self.inference_sess.run(summary_op)
    self.summary_writer.add_summary(summary, step)
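
Example #1 delegates the markdown formatting to a FormatCodeAsMarkdown helper, which example #3 below inlines as an f-string. A minimal sketch consistent with both versions (the method body is an assumption based on example #3):

  def FormatCodeAsMarkdown(self, text: str) -> str:
    """Wrap code in a markdown fence so TensorBoard's text plugin renders it verbatim."""
    return f"```\n{text.strip()}\n```"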
Code example #2
File: pretrained.py Project: whatsmyname/clgen
    def _SampleBatch(
        self,
        sampler: samplers.Sampler,
        atomizer: atomizers.AtomizerBase,
        sample_observers: typing.List[sample_observers_lib.SampleObserver],
    ) -> bool:
        """Run a single iteration of the batched sample inner-loop.

        Returns True if sampling should continue, False if any sample
        observer requested a stop.
        """
        samples_in_progress = [
            sampler.tokenized_start_text.copy()
            for _ in range(sampler.batch_size)
        ]
        done = np.zeros(sampler.batch_size, dtype=bool)
        start_time = labdate.MillisecondsTimestamp()
        wall_time_start = start_time

        self.backend.InitSampleBatch(sampler)

        # The return value of this method. If any of the sample_observers return
        # False, this value is set to False.
        continue_sampling = True

        # Sampling loop. Continues until all samples in the batch are done.
        while not done.all():
            indices = self.backend.SampleNextIndices(sampler, done)

            # Iterate over all samples in batch to determine whether they're
            # done.
            for i in range(sampler.batch_size):
                if done[i]:
                    continue

                for index in indices[i]:
                    samples_in_progress[i].append(atomizer.decoder[index])
                    if sampler.SampleIsComplete(samples_in_progress[i]):
                        end_time = labdate.MillisecondsTimestamp()
                        done[i] = 1
                        sample = model_pb2.Sample(
                            text="".join(samples_in_progress[i]),
                            sample_start_epoch_ms_utc=start_time,
                            sample_time_ms=end_time - start_time,
                            wall_time_ms=end_time - wall_time_start,
                            num_tokens=len(samples_in_progress[i]),
                        )
                        # Notify sample observers.
                        continue_sampling &= all(
                            obs.OnSample(sample) for obs in sample_observers
                        )

                        # Wall sample time is the difference between the end of the previous
                        # sample and the end of the current sample.
                        wall_time_start = labdate.MillisecondsTimestamp()
                        break

        return continue_sampling
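
The aggregated boolean drives the observer protocol: every completed sample is passed to each observer's OnSample(), and sampling stops once any observer returns False. A minimal illustrative observer, assuming only the SampleObserver base class and the OnSample() contract shown above (the class name and counter are hypothetical):

class MaxSampleCountObserver(sample_observers_lib.SampleObserver):
    """Illustrative observer that stops sampling after a fixed number of samples."""

    def __init__(self, count: int):
        self._remaining = count

    def OnSample(self, sample: model_pb2.Sample) -> bool:
        # Return True to keep sampling, False to request a stop.
        self._remaining -= 1
        return self._remaining > 0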
Code example #3
    def _EndOfEpochTestSample(self, corpus, sampler: samplers.Sampler,
                              step: int):
        """Run sampler"""
        import tensorflow as tf
        atomizer = corpus.atomizer
        sampler.Specialize(atomizer)
        sampler.batch_size = 1
        seed = 0

        self.InitSampling(sampler, seed)
        self.InitSampleBatch(sampler)

        samples = []
        for i in range(12):  # Hardcoded sample count; example #1 reads this from a flag.
            done = np.zeros(1, dtype=bool)
            while not done[0]:
                sample_in_progress = sampler.tokenized_start_text.copy()
                indices = self.SampleNextIndices(sampler, done)

                # Iterate over all samples in batch to determine whether they're
                # done.
                for index in indices[0]:
                    sample_in_progress.append(atomizer.decoder[index])
                    if not sampler.SampleIsComplete(sample_in_progress):
                        continue

                    sample = ''.join(sample_in_progress)
                    samples.append(sample)
                    app.Log(1, 'End-of-epoch sample %d:\n%s', i + 1, sample)
                    done[0] = True
                    break

        # Render samples as markdown and write them to a TensorBoard text summary.
        samples_as_markdown = [
            f'```\n{sample.strip()}\n```' for sample in samples
        ]
        samples_tensor = tf.convert_to_tensor(samples_as_markdown,
                                              dtype=tf.string)
        summary_op = tf.summary.text('samples', samples_tensor)
        summary = self.inference_sess.run(summary_op)
        self.summary_writer.add_summary(summary, step)
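
Example #3 hardcodes the per-epoch sample count that example #1 reads from FLAGS.clgen_per_epoch_test_samples. A sketch of defining such a flag with absl directly (the app module in these examples presumably wraps absl; the flag name mirrors example #1 and the default mirrors example #3, while the help text is an assumption):

from absl import flags

flags.DEFINE_integer(
    'clgen_per_epoch_test_samples', 12,
    'Number of test samples to produce at the end of each training epoch.')

FLAGS = flags.FLAGS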
Code example #4
    def SampleFast(self,
                   sampler: samplers.Sampler,
                   min_num_samples: int,
                   seed: typing.Optional[int] = None) -> typing.List[model_pb2.Sample]:
        """Sample a model.

    Same as Sample(), but without printing or caching samples. Because samples
    are not cached, infinite sampling loops are not supported, since we must
    return the sample protos at some point.

    Args:
      sampler: The sampler to sample using.
      min_num_samples: The minimum number of samples to return. Note that the
        true number of samples returned may be higher than this value, as
        sampling occurs in batches. The model will continue producing samples
        until it reaches the lowest multiple of the sampler batch size that
        is greater than or equal to this value. E.g. if min_num_samples is 7
        and the sampler batch size is 10, 10 samples will be returned.
      seed: A numeric value to seed the RNG with. If not present, the RNG is
        seeded randomly.

    Returns:
      A list of Sample protos.

    Raises:
      UnableToAcquireLockError: If the model is locked (i.e. there is another
        process currently modifying the model).
      InvalidStartText: If the sampler start text cannot be encoded.
      InvalidSymtokTokens: If the sampler symmetrical depth tokens cannot be
        encoded.
    """
        self.Train()

        sample_count = 1
        with logutil.TeeLogsToFile(f'sampler_{sampler.hash}',
                                   self.cache.path / 'logs'):
            logging.info("Sampling: '%s'", sampler.start_text)
            sample_start_time = labdate.MillisecondsTimestamp()
            atomizer = self.corpus.atomizer
            sampler.Specialize(atomizer)
            batch_size = self.backend.InitSampling(sampler, seed)
            samples = []

            # Per-sample batch outer loop. Continues until we have as many samples
            # as we want.
            while True:
                samples_in_progress = [
                    sampler.tokenized_start_text.copy()
                    for _ in range(batch_size)
                ]
                done = np.zeros(batch_size, dtype=bool)
                start_time = labdate.MillisecondsTimestamp()
                wall_time_start = start_time

                self.backend.InitSampleBatch(sampler, batch_size)

                # Sampling loop. Continues until all samples in the batch are done.
                while True:
                    indices = self.backend.SampleNextIndices(
                        sampler, batch_size)

                    # Iterate over all samples in batch to determine whether they're
                    # done.
                    for i in range(batch_size):
                        if done[i]:
                            continue

                        token = atomizer.decoder[indices[i]]
                        samples_in_progress[i].append(token)
                        if sampler.SampleIsComplete(samples_in_progress[i]):
                            end_time = labdate.MillisecondsTimestamp()
                            done[i] = 1
                            sample = model_pb2.Sample(
                                text=''.join(samples_in_progress[i]),
                                sample_start_epoch_ms_utc=start_time,
                                sample_time_ms=end_time - start_time,
                                wall_time_ms=end_time - wall_time_start,
                                num_tokens=len(samples_in_progress[i]))
                            sample_count += 1
                            samples.append(sample)
                            wall_time_start = labdate.MillisecondsTimestamp()

                    # Complete the batch.
                    if done.all():
                        break

                # Complete sampling. Note that sample_count starts at 1.
                if sample_count > min_num_samples:
                    now = labdate.MillisecondsTimestamp()
                    logging.info(
                        'Produced %s samples at a rate of %s ms / sample.',
                        humanize.intcomma(len(samples)),
                        humanize.intcomma(
                            int((now - sample_start_time) / len(samples))))
                    break

        return samples
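
The batch-rounding rule in the docstring follows from the loop structure: samples are produced in whole batches, and the outer loop exits only after the running count exceeds min_num_samples, so the total is the smallest multiple of the batch size at or above the minimum. A small helper (not part of the source) that reproduces the arithmetic:

import math

def expected_sample_count(min_num_samples: int, batch_size: int) -> int:
    """Smallest multiple of batch_size that is >= min_num_samples."""
    return math.ceil(min_num_samples / batch_size) * batch_size

assert expected_sample_count(7, 10) == 10  # The docstring's own example.
assert expected_sample_count(20, 10) == 20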
Code example #5
    def Sample(self,
               sampler: samplers.Sampler,
               min_num_samples: int,
               seed: typing.Optional[int] = None) -> typing.List[model_pb2.Sample]:
        """Sample a model.

    If the model is not already trained, calling Sample() first trains the
    model. Thus a call to Sample() is equivalent to calling Train() then
    Sample().

    Args:
      sampler: The sampler to sample using.
      min_num_samples: The minimum number of samples to return. Note that the
        true number of samples returned may be higher than this value, as
        sampling occurs in batches. The model will continue producing samples
        until it reaches the lowest multiple of the sampler batch size that
        is greater than or equal to this value. E.g. if min_num_samples is 7
        and the sampler batch size is 10, 10 samples will be returned.
      seed: A numeric value to seed the RNG with. If not present, the RNG is
        seeded randomly.

    Returns:
      A list of Sample protos.

    Raises:
      UnableToAcquireLockError: If the model is locked (i.e. there is another
        process currently modifying the model).
      InvalidStartText: If the sampler start text cannot be encoded.
      InvalidSymtokTokens: If the sampler symmetrical depth tokens cannot be
        encoded.
    """
        self.Train()

        sample_count = 1
        self.SamplerCache(sampler).mkdir(exist_ok=True)
        with logutil.TeeLogsToFile(f'sampler_{sampler.hash}',
                                   self.cache.path / 'logs'):
            logging.info("Sampling: '%s'", sampler.start_text)
            if min_num_samples < 0:
                logging.warning(
                    'Entering an infinite sample loop, this process will never end!'
                )
            sample_start_time = labdate.MillisecondsTimestamp()

            atomizer = self.corpus.atomizer
            sampler.Specialize(atomizer)
            batch_size = self.backend.InitSampling(sampler, seed)

            samples = []
            sample_dir = self.SamplerCache(sampler)

            # Per-sample batch outer loop. Continues until we have as many samples
            # as we want.
            while True:
                samples_in_progress = [
                    sampler.tokenized_start_text.copy()
                    for _ in range(batch_size)
                ]
                done = np.zeros(batch_size, dtype=bool)
                start_time = labdate.MillisecondsTimestamp()
                wall_time_start = start_time

                self.backend.InitSampleBatch(sampler, batch_size)

                # Sampling loop. Continues until all samples in the batch are done.
                while True:
                    indices = self.backend.SampleNextIndices(
                        sampler, batch_size)

                    # Iterate over all samples in batch to determine whether they're
                    # done.
                    for i in range(batch_size):
                        if done[i]:
                            continue

                        token = atomizer.decoder[indices[i]]
                        samples_in_progress[i].append(token)
                        if sampler.SampleIsComplete(samples_in_progress[i]):
                            end_time = labdate.MillisecondsTimestamp()
                            done[i] = 1
                            sample = model_pb2.Sample(
                                text=''.join(samples_in_progress[i]),
                                sample_start_epoch_ms_utc=start_time,
                                sample_time_ms=end_time - start_time,
                                wall_time_ms=end_time - wall_time_start,
                                num_tokens=len(samples_in_progress[i]))
                            print(f'=== BEGIN CLGEN SAMPLE {sample_count} '
                                  f'===\n\n{sample.text}\n')
                            sample_count += 1
                            sample_id = crypto.sha256_str(sample.text)
                            sample_path = sample_dir / f'{sample_id}.pbtxt'
                            pbutil.ToFile(sample, sample_path)
                            if min_num_samples > 0:
                                samples.append(sample)
                            wall_time_start = labdate.MillisecondsTimestamp()

                    # Complete the batch.
                    if done.all():
                        break

                # Complete sampling. Note that sample_count starts at 1.
                if sample_count > min_num_samples:
                    now = labdate.MillisecondsTimestamp()
                    logging.info(
                        'Produced %s samples at a rate of %s ms / sample.',
                        humanize.intcomma(len(samples)),
                        humanize.intcomma(
                            int((now - sample_start_time) /
                                max(len(samples), 1))))
                    break

        return samples
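
Sample() persists each sample to the sampler cache as a text-format proto named by the SHA-256 of its text. A sketch of reading the cache back, using pbutil and model_pb2 as imported by the surrounding example and assuming pbutil.FromFile is the deserializing counterpart of the pbutil.ToFile call above:

import pathlib
import typing

def LoadCachedSamples(
        sample_dir: pathlib.Path) -> typing.List[model_pb2.Sample]:
    """Load the Sample protos previously written by Sample()."""
    return [
        pbutil.FromFile(path, model_pb2.Sample())
        for path in sorted(sample_dir.glob('*.pbtxt'))
    ]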
Code example #6
File: pretrained.py Project: BeauJoh/phd
  def Sample(
      self, sampler: samplers.Sampler, min_num_samples: int,
      seed: typing.Optional[int] = None) -> typing.Iterable[model_pb2.Sample]:
    """Sample a model.

    If the model is not already trained, calling Sample() first trains the
    model. Thus a call to Sample() is equivalent to calling Train() then
    Sample().

    Args:
      sampler: The sampler to sample using.
      min_num_samples: The minimum number of samples to return. Note that the
        true number of samples returned may be higher than this value, as
        sampling occurs in batches. The model will continue producing samples
        until it reaches the lowest multiple of the sampler batch size that
        is greater than or equal to this value. E.g. if min_num_samples is 7
        and the sampler batch size is 10, 10 samples will be returned.
      seed: A numeric value to seed the RNG with. If not present, the RNG is
        seeded randomly.

    Returns:
      An iterator over samples.

    Raises:
      UnableToAcquireLockError: If the model is locked (i.e. there is another
        process currently modifying the model).
      InvalidStartText: If the sampler start text cannot be encoded.
      InvalidSymtokTokens: If the sampler symmetrical depth tokens cannot be
        encoded.
    """
    sample_count = 1
    atomizer = self.atomizer
    sampler.Specialize(atomizer)
    batch_size = self.backend.InitSampling(sampler, seed)
    sample_start_time = labdate.MillisecondsTimestamp()
    # Per-sample batch outer loop. Continues until we have as many samples
    # as we want.
    while True:
      samples_in_progress = [
        sampler.tokenized_start_text.copy()
        for _ in range(batch_size)]
      done = np.zeros(batch_size, dtype=bool)
      start_time = labdate.MillisecondsTimestamp()
      wall_time_start = start_time

      self.backend.InitSampleBatch(sampler, batch_size)

      # Sampling loop. Continues until all samples in the batch are done.
      while True:
        indices = self.backend.SampleNextIndices(sampler, batch_size)

        # Iterate over all samples in batch to determine whether they're
        # done.
        for i in range(batch_size):
          if done[i]:
            continue

          token = atomizer.decoder[indices[i]]
          samples_in_progress[i].append(token)
          if sampler.SampleIsComplete(samples_in_progress[i]):
            end_time = labdate.MillisecondsTimestamp()
            done[i] = 1
            sample = model_pb2.Sample(
                text=''.join(samples_in_progress[i]),
                sample_start_epoch_ms_utc=start_time,
                sample_time_ms=end_time - start_time,
                wall_time_ms=end_time - wall_time_start,
                num_tokens=len(samples_in_progress[i]))
            sample_count += 1
            yield sample
            wall_time_start = labdate.MillisecondsTimestamp()

        # Complete the batch.
        if done.all():
          break

      # Complete sampling. Note that sample_count starts at 1.
      if sample_count > min_num_samples:
        now = labdate.MillisecondsTimestamp()
        logging.info(
            'Produced %s samples at a rate of %s ms / sample.',
            humanize.intcomma(sample_count - 1),
            humanize.intcomma(
                int((now - sample_start_time) / max(sample_count - 1, 1))))
        break
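
Unlike the list-returning variants above, this Sample() is a generator, so callers can consume samples lazily and stop early without waiting for a full run. An illustrative usage, with the model and sampler construction elided (itertools.islice caps consumption at ten samples):

import itertools

for sample in itertools.islice(model.Sample(sampler, min_num_samples=10), 10):
  print(sample.text)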