def _EndOfEpochTestSample(
    self, corpus, sampler: samplers.Sampler, step: int, epoch_num: int
  ):
    """Run sampler"""
    import tensorflow as tf

    atomizer = corpus.atomizer
    sampler.Specialize(atomizer)
    sampler.batch_size = 1
    seed = 0

    self.InitSampling(sampler, seed)
    self.InitSampleBatch(sampler)

    samples, stats = [], []
    for i in range(FLAGS.clgen_per_epoch_test_samples):
      done = np.zeros(1, dtype=bool)
      while not done[0]:
        start_time = time.time()
        sample_in_progress = sampler.tokenized_start_text.copy()
        indices = self.SampleNextIndices(sampler, done)

        # Iterate over all samples in batch to determine whether they're
        # done.
        for index in indices[0]:
          sample_in_progress.append(atomizer.decoder[index])
          if not sampler.SampleIsComplete(sample_in_progress):
            continue

          stats.append(
            (len(sample_in_progress), int((time.time() - start_time) * 1000))
          )
          sample = "".join(sample_in_progress)
          samples.append(sample)
          app.Log(1, "End-of-epoch sample %d:\n%s", i + 1, sample)
          done[0] = True
          break

    # Write samples to the dashboard database.
    with self.dashboard_db.Session(commit=True) as dbs:
      dbs.add_all(
        [
          dashboard_db.TrainingSample(
            model_id=self.dashboard_model_id,
            epoch=epoch_num,
            step=step,
            sample=sample,
            token_count=token_count,
            sample_time=sample_time,
          )
          for sample, (token_count, sample_time) in zip(samples, stats)
        ]
      )
    samples_as_markdown = [
      self.FormatCodeAsMarkdown(sample) for sample in samples
    ]
    samples_tensor = tf.convert_to_tensor(samples_as_markdown, dtype=tf.string)
    summary_op = tf.summary.text("samples", samples_tensor)
    summary = self.inference_sess.run(summary_op)
    self.summary_writer.add_summary(summary, step)
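
A minimal sketch of the FormatCodeAsMarkdown helper used above, consistent with the inline formatting in Example #4 below (the exact implementation in the original codebase may differ):

def FormatCodeAsMarkdown(self, text: str) -> str:
  """Wrap a sample in a markdown code fence for a TensorBoard text summary."""
  return f"```\n{text.strip()}\n```"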
Example #2
    def Sample(
        self,
        sampler: samplers.Sampler,
        sample_observers: typing.List[sample_observers_lib.SampleObserver],
        seed: int = None,
    ) -> None:
        """Sample a model.

    This method uses the observer model, returning nothing. To access the
    samples produced, implement a SampleObserver and pass it in as an argument.
    Sampling continues indefinitely until one of the sample observers returns
    False when notified of a new sample.

    If the model is not already trained, calling Sample() first trains the
    model. Thus a call to Sample() is equivalent to calling Train() then
    Sample().

    Args:
      sampler: The sampler to sample using.
      sample_observers: A list of SampleObserver objects that are notified of
        new generated samples.
      seed: A numeric value to seed the RNG with. If not present, the RNG is
        seeded randomly.

    Raises:
      UserError: If called with no sample observers.
      UnableToAcquireLockError: If the model is locked (i.e. there is another
        process currently modifying the model).
      InvalidStartText: If the sampler start text cannot be encoded.
      InvalidSymtokTokens: If the sampler symmetrical depth tokens cannot be
        encoded.
    """
        if not sample_observers:
            raise errors.UserError("Cannot sample without any observers")

        sample_start_time = labdate.MillisecondsTimestamp()

        self.Train()

        with logutil.TeeLogsToFile(f"sampler_{sampler.hash}",
                                   self.cache.path / "logs"):
            app.Log(1, "Sampling: '%s'", sampler.start_text)

            atomizer = self.corpus.atomizer
            sampler.Specialize(atomizer)
            self.backend.InitSampling(sampler, seed)
            for observer in sample_observers:
                observer.Specialize(self, sampler)

            batch_count = 1
            while self._SampleBatch(sampler, atomizer, sample_observers):
                batch_count += 1

            time_now = labdate.MillisecondsTimestamp()
            app.Log(
                1,
                "Produced %s sample batches at a rate of %s ms / batch.",
                humanize.Commas(batch_count),
                humanize.Commas(
                    int((time_now - sample_start_time) / max(batch_count, 1))),
            )
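
The observer contract documented above (OnSample() returns True to continue, False to stop) makes it easy to cap sampling at a fixed count. A minimal illustrative observer, assuming only the SampleObserver interface used in this example; the class below is a sketch, not part of the library:

class MaxSampleCountObserver(sample_observers_lib.SampleObserver):
  """Stop sampling once a fixed number of samples has been observed."""

  def __init__(self, n: int):
    self._remaining = n

  def Specialize(self, model, sampler) -> None:
    pass  # No per-model state is needed for simple counting.

  def OnSample(self, sample: model_pb2.Sample) -> bool:
    # Returning False terminates the sampling loop in Sample().
    self._remaining -= 1
    return self._remaining > 0

Passing [MaxSampleCountObserver(10)] as the observer list would stop sampling after roughly ten samples; since a batch completes before the stop flag is checked, the final batch may overshoot the count.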
Example #3
    def _SampleBatch(
        self,
        sampler: samplers.Sampler,
        atomizer: atomizers.AtomizerBase,
        sample_observers: typing.List[sample_observers_lib.SampleObserver],
    ) -> bool:
        """Run a single iteration of the batched sample inner-loop."""
        samples_in_progress = [
            sampler.tokenized_start_text.copy()
            for _ in range(sampler.batch_size)
        ]
        done = np.zeros(sampler.batch_size, dtype=bool)
        start_time = labdate.MillisecondsTimestamp()
        wall_time_start = start_time

        self.backend.InitSampleBatch(sampler)

        # The return value of this method. If any of the sample_observers return
        # False, this value is set to False.
        continue_sampling = True

        # Sampling loop. Continues until all samples in the batch are done.
        while not done.all():
            indices = self.backend.SampleNextIndices(sampler, done)

            # Iterate over all samples in batch to determine whether they're
            # done.
            for i in range(sampler.batch_size):
                if done[i]:
                    continue

                for index in indices[i]:
                    samples_in_progress[i].append(atomizer.decoder[index])
                    if sampler.SampleIsComplete(samples_in_progress[i]):
                        end_time = labdate.MillisecondsTimestamp()
                        done[i] = 1
                        sample = model_pb2.Sample(
                            text="".join(samples_in_progress[i]),
                            sample_start_epoch_ms_utc=start_time,
                            sample_time_ms=end_time - start_time,
                            wall_time_ms=end_time - wall_time_start,
                            num_tokens=len(samples_in_progress[i]),
                        )
                        # Notify sample observers.
                        continue_sampling &= all(
                            [obs.OnSample(sample) for obs in sample_observers])

                        # Wall sample time is the difference between the end of the previous
                        # sample and the end of the current sample.
                        wall_time_start = labdate.MillisecondsTimestamp()
                        break

        return continue_sampling
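
SampleIsComplete() decides when the growing token list constitutes a finished sample. The InvalidSymtokTokens error named in the docstrings suggests a criterion based on symmetrical depth tokens; a hypothetical sketch of such a check (the standalone function and its default token values are illustrative, not the library's API):

def SampleIsComplete(tokens: typing.List[str],
                     left_token: str = "{",
                     right_token: str = "}") -> bool:
  """Treat a sample as complete once the nesting depth of the symmetrical
  tokens returns to zero after at least one opening token."""
  depth = 0
  opened = False
  for token in tokens:
    if token == left_token:
      depth += 1
      opened = True
    elif token == right_token:
      depth -= 1
  return opened and depth == 0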
Example #4
    def _EndOfEpochTestSample(self, corpus, sampler: samplers.Sampler,
                              step: int):
        """Run sampler"""
        import tensorflow as tf
        atomizer = corpus.atomizer
        sampler.Specialize(atomizer)
        sampler.batch_size = 1
        seed = 0

        self.InitSampling(sampler, seed)
        self.InitSampleBatch(sampler)

        samples = []
        for i in range(12):
            done = np.zeros(1, dtype=bool)
            while not done[0]:
                sample_in_progress = sampler.tokenized_start_text.copy()
                indices = self.SampleNextIndices(sampler, done)

                # Iterate over all samples in batch to determine whether they're
                # done.
                for index in indices[0]:
                    sample_in_progress.append(atomizer.decoder[index])
                    if not sampler.SampleIsComplete(sample_in_progress):
                        continue

                    sample = ''.join(sample_in_progress)
                    samples.append(sample)
                    app.Log(1, 'End-of-epoch sample %d:\n%s', i + 1, sample)
                    done[0] = True
                    break

        # Write samples to the TensorBoard summary writer.
        samples_as_markdown = [
            f'```\n{sample.strip()}\n```' for sample in samples
        ]
        samples_tensor = tf.convert_to_tensor(samples_as_markdown,
                                              dtype=tf.string)
        summary_op = tf.summary.text('samples', samples_tensor)
        summary = self.inference_sess.run(summary_op)
        self.summary_writer.add_summary(summary, step)
Example #5
    def _SampleBatch(
        self,
        sampler: samplers.Sampler,
        atomizer: atomizers.AtomizerBase,
        sample_observers: typing.List[sample_observers_lib.SampleObserver],
    ) -> bool:
        """Run a single iteration of the batched sample inner-loop."""
        start_time = labdate.MillisecondsTimestamp()

        # We use the sampler.encoded_start_text attribute as a way to re-seed the
        # model state during rollback, so save the original value here so that we
        # can restore it at the end of the sample batch.
        original_sampler_encoded_start_text = sampler.encoded_start_text.copy()

        self.backend.InitSampleBatch(sampler)

        backtracker = OpenClBacktrackingHelper(atomizer, self._target_features)
        self._logger.OnSampleStart(backtracker)
        sampled_tokens = self.SampleOneWithBacktracking(
            sampler, atomizer, backtracker)
        self._logger.OnSampleEnd(backtracker)

        end_time = labdate.MillisecondsTimestamp()

        # Format text.
        if sampled_tokens:
            text = preprocessors.Preprocess(
                "".join(sampled_tokens),
                [
                    "deeplearning.clgen.preprocessors.opencl:NormalizeIdentifiers",
                    "deeplearning.clgen.preprocessors.opencl:SanitizeKernelPrototype",
                    "deeplearning.clgen.preprocessors.common:StripTrailingWhitespace",
                    "deeplearning.clgen.preprocessors.opencl:NormalizeIdentifiers",
                    "deeplearning.clgen.preprocessors.common:StripDuplicateEmptyLines",
                    "deeplearning.clgen.preprocessors.cxx:ClangFormat",
                ],
            )
        else:
            text = ""

        # Restore the sampler's start text.
        sampler.encoded_start_text = original_sampler_encoded_start_text

        # Notify sample observers.
        sample = model_pb2.Sample(
            text=text,
            sample_start_epoch_ms_utc=start_time,
            sample_time_ms=end_time - start_time,
            wall_time_ms=end_time - start_time,
            num_tokens=len(sampled_tokens),
        )
        return all([obs.OnSample(sample) for obs in sample_observers])
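
The Preprocess() call above applies a named pipeline of preprocessor passes to the raw sample text; the same call works on any string. A short usage sketch reusing the signature shown above (the kernel source is illustrative):

raw_kernel = "kernel void A(global int* a) { a[get_global_id(0)] += 1; }"
formatted = preprocessors.Preprocess(
    raw_kernel,
    [
        "deeplearning.clgen.preprocessors.opencl:NormalizeIdentifiers",
        "deeplearning.clgen.preprocessors.cxx:ClangFormat",
    ],
)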
Example #6
    def SampleFast(self,
                   sampler: samplers.Sampler,
                   min_num_samples: int,
                   seed: int = None) -> typing.List[model_pb2.Sample]:
        """Sample a model.

    Same as Sample(), but without printing or caching samples. Because samples
    are not cached, infinite sampling loops are not supported, since we must
    return the sample protos at some point.

    Args:
      sampler: The sampler to sample using.
      min_num_samples: The minimum number of samples to return. Note that the
        true number of samples returned may be higher than this value, as
        sampling occurs in batches. The model will continue producing samples
        until the lowest multiple of the sampler batch size property that is
        larger than this value. E.g. if min_num_samples is 7 and the Sampler
        batch size is 10, 10 samples will be returned.
      seed: A numeric value to seed the RNG with. If not present, the RNG is
        seeded randomly.

    Returns:
      A list of Sample protos.

    Raises:
      UnableToAcquireLockError: If the model is locked (i.e. there is another
        process currently modifying the model).
      InvalidStartText: If the sampler start text cannot be encoded.
      InvalidSymtokTokens: If the sampler symmetrical depth tokens cannot be
        encoded.
    """
        self.Train()

        sample_count = 1
        with logutil.TeeLogsToFile(f'sampler_{sampler.hash}',
                                   self.cache.path / 'logs'):
            logging.info("Sampling: '%s'", sampler.start_text)
            sample_start_time = labdate.MillisecondsTimestamp()
            atomizer = self.corpus.atomizer
            sampler.Specialize(atomizer)
            batch_size = self.backend.InitSampling(sampler, seed)
            samples = []

            # Per-sample batch outer loop. Continues until we have as many samples
            # as we want.
            while True:
                samples_in_progress = [
                    sampler.tokenized_start_text.copy()
                    for _ in range(batch_size)
                ]
                done = np.zeros(batch_size, dtype=bool)
                start_time = labdate.MillisecondsTimestamp()
                wall_time_start = start_time

                self.backend.InitSampleBatch(sampler, batch_size)

                # Sampling loop. Continues until all samples in the batch are done.
                while True:
                    indices = self.backend.SampleNextIndices(
                        sampler, batch_size)

                    # Iterate over all samples in batch to determine whether they're
                    # done.
                    for i in range(batch_size):
                        if done[i]:
                            continue

                        token = atomizer.decoder[indices[i]]
                        samples_in_progress[i].append(token)
                        if sampler.SampleIsComplete(samples_in_progress[i]):
                            end_time = labdate.MillisecondsTimestamp()
                            done[i] = 1
                            sample = model_pb2.Sample(
                                text=''.join(samples_in_progress[i]),
                                sample_start_epoch_ms_utc=start_time,
                                sample_time_ms=end_time - start_time,
                                wall_time_ms=end_time - wall_time_start,
                                num_tokens=len(samples_in_progress[i]))
                            sample_count += 1
                            samples.append(sample)
                            wall_time_start = labdate.MillisecondsTimestamp()

                    # Complete the batch.
                    if done.all():
                        break

                # Complete sampling. Note that sample_count starts at 1.
                if sample_count > min_num_samples:
                    now = labdate.MillisecondsTimestamp()
                    logging.info(
                        'Produced %s samples at a rate of %s ms / sample.',
                        humanize.intcomma(len(samples)),
                        humanize.intcomma(
                            int((now - sample_start_time) / len(samples))))
                    break

        return samples
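
The batch rounding described in the docstring means the returned count is min_num_samples rounded up to a whole batch. A small sketch of that arithmetic (plain Python, not a library call):

import math

def effective_sample_count(min_num_samples: int, batch_size: int) -> int:
  """The number of samples SampleFast() actually returns."""
  return math.ceil(min_num_samples / batch_size) * batch_size

assert effective_sample_count(7, 10) == 10  # The example from the docstring.
assert effective_sample_count(25, 10) == 30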
Example #7
    def Sample(self,
               sampler: samplers.Sampler,
               min_num_samples: int,
               seed: int = None) -> typing.List[model_pb2.Sample]:
        """Sample a model.

    If the model is not already trained, calling Sample() first trains the
    model. Thus a call to Sample() is equivalent to calling Train() then
    Sample().

    Args:
      sampler: The sampler to sample using.
      min_num_samples: The minimum number of samples to return. Note that the
        true number of samples returned may be higher than this value, as
        sampling occurs in batches. The model will continue producing samples
        until the lowest multiple of the sampler batch size property that is
        larger than this value. E.g. if min_num_samples is 7 and the Sampler
        batch size is 10, 10 samples will be returned.
      seed: A numeric value to seed the RNG with. If not present, the RNG is
        seeded randomly.

    Returns:
      A list of Sample protos.

    Raises:
      UnableToAcquireLockError: If the model is locked (i.e. there is another
        process currently modifying the model).
      InvalidStartText: If the sampler start text cannot be encoded.
      InvalidSymtokTokens: If the sampler symmetrical depth tokens cannot be
        encoded.
    """
        self.Train()

        sample_count = 1
        self.SamplerCache(sampler).mkdir(exist_ok=True)
        with logutil.TeeLogsToFile(f'sampler_{sampler.hash}',
                                   self.cache.path / 'logs'):
            logging.info("Sampling: '%s'", sampler.start_text)
            if min_num_samples < 0:
                logging.warning(
                    'Entering an infinite sample loop, this process will never end!'
                )
            sample_start_time = labdate.MillisecondsTimestamp()

            atomizer = self.corpus.atomizer
            sampler.Specialize(atomizer)
            batch_size = self.backend.InitSampling(sampler, seed)

            samples = []
            sample_dir = self.SamplerCache(sampler)

            # Per-sample batch outer loop. Continues until we have as many samples
            # as we want.
            while True:
                samples_in_progress = [
                    sampler.tokenized_start_text.copy()
                    for _ in range(batch_size)
                ]
                done = np.zeros(batch_size, dtype=bool)
                start_time = labdate.MillisecondsTimestamp()
                wall_time_start = start_time

                self.backend.InitSampleBatch(sampler, batch_size)

                # Sampling loop. Continues until all samples in the batch are done.
                while True:
                    indices = self.backend.SampleNextIndices(
                        sampler, batch_size)

                    # Iterate over all samples in batch to determine whether they're
                    # done.
                    for i in range(batch_size):
                        if done[i]:
                            continue

                        token = atomizer.decoder[indices[i]]
                        samples_in_progress[i].append(token)
                        if sampler.SampleIsComplete(samples_in_progress[i]):
                            end_time = labdate.MillisecondsTimestamp()
                            done[i] = 1
                            sample = model_pb2.Sample(
                                text=''.join(samples_in_progress[i]),
                                sample_start_epoch_ms_utc=start_time,
                                sample_time_ms=end_time - start_time,
                                wall_time_ms=end_time - wall_time_start,
                                num_tokens=len(samples_in_progress[i]))
                            print(f'=== BEGIN CLGEN SAMPLE {sample_count} '
                                  f'===\n\n{sample.text}\n')
                            sample_count += 1
                            sample_id = crypto.sha256_str(sample.text)
                            sample_path = sample_dir / f'{sample_id}.pbtxt'
                            pbutil.ToFile(sample, sample_path)
                            if min_num_samples > 0:
                                samples.append(sample)
                            wall_time_start = labdate.MillisecondsTimestamp()

                    # Complete the batch.
                    if done.all():
                        break

                # Complete sampling. Note that sample_count starts at 1.
                if sample_count > min_num_samples:
                    now = labdate.MillisecondsTimestamp()
                    logging.info(
                        'Produced %s samples at a rate of %s ms / sample.',
                        humanize.intcomma(len(samples)),
                        humanize.intcomma(
                            int((now - sample_start_time) /
                                max(len(samples), 1))))
                    break

        return samples
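
Each sample is cached under the sampler's cache directory as a pbtxt file named by the SHA-256 of its text, so a sample cache can be reloaded later. A sketch, assuming pbutil provides a FromFile() counterpart to the ToFile() call used above (an assumption about the pbutil API):

def LoadCachedSamples(sample_dir) -> typing.List[model_pb2.Sample]:
  """Re-read every cached sample proto from a sampler cache directory."""
  return [
      pbutil.FromFile(path, model_pb2.Sample())  # Assumed pbutil helper.
      for path in sorted(sample_dir.iterdir())
      if path.suffix == ".pbtxt"
  ]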
Example #8
  def Sample(
      self, sampler: samplers.Sampler, min_num_samples: int,
      seed: int = None) -> typing.Iterable[model_pb2.Sample]:
    """Sample a model.

    If the model is not already trained, calling Sample() first trains the
    model. Thus a call to Sample() is equivalent to calling Train() then
    Sample().

    Args:
      sampler: The sampler to sample using.
      min_num_samples: The minimum number of samples to return. Note that the
        true number of samples returned may be higher than this value, as
        sampling occurs in batches. The model will continue producing samples
        until the lowest multiple of the sampler batch size property that is
        larger than this value. E.g. if min_num_samples is 7 and the Sampler
        batch size is 10, 10 samples will be returned.
      seed: A numeric value to seed the RNG with. If not present, the RNG is
        seeded randomly.

    Returns:
      An iterator over samples.

    Raises:
      UnableToAcquireLockError: If the model is locked (i.e. there is another
        process currently modifying the model).
      InvalidStartText: If the sampler start text cannot be encoded.
      InvalidSymtokTokens: If the sampler symmetrical depth tokens cannot be
        encoded.
    """
    sample_count = 1
    atomizer = self.atomizer
    sampler.Specialize(atomizer)
    batch_size = self.backend.InitSampling(sampler, seed)
    sample_start_time = labdate.MillisecondsTimestamp()
    # Per-sample batch outer loop. Continues until we have as many samples
    # as we want.
    while True:
      samples_in_progress = [
        sampler.tokenized_start_text.copy()
        for _ in range(batch_size)]
      done = np.zeros(batch_size, dtype=bool)
      start_time = labdate.MillisecondsTimestamp()
      wall_time_start = start_time

      self.backend.InitSampleBatch(sampler, batch_size)

      # Sampling loop. Continues until all samples in the batch are done.
      while True:
        indices = self.backend.SampleNextIndices(sampler, batch_size)

        # Iterate over all samples in batch to determine whether they're
        # done.
        for i in range(batch_size):
          if done[i]:
            continue

          token = atomizer.decoder[indices[i]]
          samples_in_progress[i].append(token)
          if sampler.SampleIsComplete(samples_in_progress[i]):
            end_time = labdate.MillisecondsTimestamp()
            done[i] = 1
            sample = model_pb2.Sample(
                text=''.join(samples_in_progress[i]),
                sample_start_epoch_ms_utc=start_time,
                sample_time_ms=end_time - start_time,
                wall_time_ms=end_time - wall_time_start,
                num_tokens=len(samples_in_progress[i]))
            sample_count += 1
            yield sample
            wall_time_start = labdate.MillisecondsTimestamp()

        # Complete the batch.
        if done.all():
          break

      # Complete sampling. Note that sample_count starts at 1.
      if sample_count > min_num_samples:
        now = labdate.MillisecondsTimestamp()
        logging.info(
            'Produced %s samples at a rate of %s ms / sample.',
            humanize.intcomma(sample_count - 1),
            humanize.intcomma(
                int((now - sample_start_time) / max(sample_count - 1, 1))))
        break
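
Because this version of Sample() is a generator, a caller can lazily consume exactly as many samples as it needs. A usage sketch with itertools.islice, where model and sampler are hypothetical, already-constructed instances:

import itertools

# The generator stops on its own once min_num_samples is exceeded,
# but islice also caps consumption on the caller's side.
for sample in itertools.islice(model.Sample(sampler, min_num_samples=10), 10):
  print(sample.text)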