def test_TensorFlowBackend_Sample_return_value_matches_cached_sample(
        clgen_cache_dir, abc_tensorflow_model_config):
    """Test that Sample() returns Sample protos."""
    del clgen_cache_dir
    abc_tensorflow_model_config.training.batch_size = 1
    m = models.Model(abc_tensorflow_model_config)
    sample_observer = sample_observers.InMemorySampleSaver()
    m.Sample(
        MockSampler(hash="hash"),
        [
            sample_observers.MaxSampleCountObserver(1),
            sample_observer,
            sample_observers.LegacySampleCacheObserver(),
        ],
    )
    samples = sample_observer.samples
    # Samples are produced in batches of sampler.batch_size elements.
    assert len(samples) == 1
    assert len(list((m.cache.path / "samples" / "hash").iterdir())) == 1
    cached_sample_path = (m.cache.path / "samples" / "hash" / list(
        (m.cache.path / "samples" / "hash").iterdir())[0])
    assert cached_sample_path.is_file()
    cached_sample = pbutil.FromFile(cached_sample_path, model_pb2.Sample())
    assert samples[0].text == cached_sample.text
    assert samples[0].sample_time_ms == cached_sample.sample_time_ms
    assert (samples[0].sample_start_epoch_ms_utc ==
            cached_sample.sample_start_epoch_ms_utc)
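
The test above drives Model.Sample() entirely through observer objects. For orientation, a minimal custom observer only needs an OnSample() callback that returns True to keep sampling; the class below is a hypothetical sketch of that protocol, not part of clgen, and the import paths are assumptions.

# Hypothetical sketch of the observer protocol used above: OnSample() receives
# a model_pb2.Sample and returns True to continue sampling, False to stop.
# Import paths are assumptions; adjust them to wherever these modules live.
from deeplearning.clgen.proto import model_pb2
from deeplearning.clgen import sample_observers


class TokenBudgetObserver(sample_observers.SampleObserver):
  """Illustrative observer: stop sampling once a total token budget is spent."""

  def __init__(self, max_tokens: int):
    self.max_tokens = max_tokens
    self.tokens_seen = 0

  def OnSample(self, sample: model_pb2.Sample) -> bool:
    self.tokens_seen += sample.num_tokens
    return self.tokens_seen < self.max_tokens
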
Example #2
def test_PrintSampleObserver(capsys):
    observer = sample_observers.PrintSampleObserver()
    sample = model_pb2.Sample(text="Hello, world!")

    assert observer.OnSample(sample)
    captured = capsys.readouterr()
    # Only check that the sample text was printed; the exact expected output
    # literal is not asserted here.
    assert "Hello, world!" in captured.out
Example #3
def PostprocessSampleCorpus(instance: clgen.Instance):
    """Create a corpus from the model samples and pre-process."""
    sample_dir = instance.model.SamplerCache(instance.sampler)

    # Read the sample protos and write them to a directory of content files.
    contentfiles_dir = pathlib.Path(str(sample_dir) + '.contentfiles')
    contentfiles_dir.mkdir(exist_ok=True)
    logging.info('Writing output contentfiles to %s', contentfiles_dir)
    if len(list(contentfiles_dir.iterdir())) != len(list(
            sample_dir.iterdir())):
        for proto_path in sample_dir.iterdir():
            sample = pbutil.FromFile(proto_path, model_pb2.Sample())
            with open(contentfiles_dir / proto_path.name, 'w') as f:
                f.write(sample.text)

    logging.info('Creating output corpus')
    output_corpus_config = corpus_pb2.Corpus()
    output_corpus_config.CopyFrom(instance.model.corpus.config)
    output_corpus_config.local_directory = str(contentfiles_dir)
    # We derive the programming language name from the input corpus directory.
    # This depends on corpuses being in directories named after their language,
    # e.g. ~/corpuses/opencl, or ~/corpuses/java.
    preprocessed_dir = pathlib.Path(
        instance.model.corpus.preprocessed.url[len('sqlite:///'):]).parent
    language = (preprocessed_dir / 'contentfiles').resolve().name
    output_corpus_config.preprocessor[:] = POSTPROCESSORS[language]
    output_corpus = corpuses.Corpus(output_corpus_config)
    try:
        output_corpus.Create()
    except errors.EmptyCorpusException:
        pass
    return output_corpus
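
PostprocessSampleCorpus() looks the preprocessor pipeline up in a POSTPROCESSORS table keyed by language name. That table is not shown in this listing; a hedged sketch of its shape, reusing only pass names that appear elsewhere in these examples, could be:

# Sketch of the POSTPROCESSORS mapping assumed by PostprocessSampleCorpus():
# language name -> ordered list of preprocessor passes to apply. Only the
# OpenCL pass names below are taken from elsewhere in this listing; treat the
# mapping itself as illustrative.
POSTPROCESSORS = {
    'opencl': [
        'deeplearning.clgen.preprocessors.opencl:NormalizeIdentifiers',
        'deeplearning.clgen.preprocessors.opencl:SanitizeKernelPrototype',
        'deeplearning.clgen.preprocessors.common:StripTrailingWhitespace',
        'deeplearning.clgen.preprocessors.common:StripDuplicateEmptyLines',
        'deeplearning.clgen.preprocessors.cxx:ClangFormat',
    ],
}
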
Example #4
def write_samples_cache(db_sample_obs : sample_observers.SamplesDatabaseObserver,
                        tokenizer     : "tokenizers.TokenizerBase",
                        samples       : typing.List[ActiveSample]
                        ) -> None:
  for sample in samples:
    try:
      s = model_pb2.Sample(
        train_step = -1,
        text = tokenizer.ArrayToCode(sample.sample, with_formatting = True),
        sample_indices = "",
        encoded_sample_indices = "",
        original_input = "",
        sample_feed    = tokenizer.ArrayToCode(sample.sample_feed.input_feed, with_formatting = True),
        encoded_text   = "",
        sample_start_epoch_ms_utc = 0,
        sample_time_ms = 0,
        wall_time_ms   = 0,
        feature_vector = '\n'.join(["{}:{}".format(k, v) for k, v in sample.features.items()]) if sample.features else "None",
        num_tokens     = np.where(sample.sample == tokenizer.padToken)[0][0] if tokenizer.padToken in sample.sample else len(sample),
        compile_status = True,
        categorical_sampling = FLAGS.categorical_sampling,
        date_added           = datetime.datetime.utcnow().strftime("%m/%d/%Y, %H:%M:%S"),
      )
      db_sample_obs.OnSample(s)
    except Exception:
      pass
  return
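
Several of these examples serialize feature vectors as one key:value pair per line (see the feature_vector field above). A small pair of hedged helpers for writing and reading that format:

# Hedged helpers for the plain-text feature_vector encoding used above:
# one "name:value" pair per line. The decode half is an assumption about how
# a consumer might read it back; values are kept as strings.
import typing


def EncodeFeatureVector(features: typing.Dict[str, typing.Any]) -> str:
  return "\n".join("{}:{}".format(k, v) for k, v in features.items())


def DecodeFeatureVector(text: str) -> typing.Dict[str, str]:
  out = {}
  for line in text.splitlines():
    key, sep, value = line.partition(":")
    if sep:
      out[key] = value
  return out
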
Example #5
def test_SaveSampleTextObserver(tempdir: pathlib.Path):
    observer = sample_observers.SaveSampleTextObserver(tempdir)
    contents = "Hello, world!"
    sample = model_pb2.Sample(text=contents)

    assert observer.OnSample(sample)
    path = tempdir / f"{crypto.sha256_str(contents)}.txt"
    assert path.is_file()
    assert fs.Read(path) == contents
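
The test above pins down the observable behaviour of SaveSampleTextObserver: each sample's text ends up in a file named after the SHA-256 of that text. A re-implementation sketch of that behaviour, for illustration only (not the library source; crypto.sha256_str is assumed to be the hex digest of the UTF-8 text):

# Illustrative re-implementation of the behaviour asserted by the test above.
import hashlib
import pathlib


class SaveSampleTextObserverSketch:
  def __init__(self, path: pathlib.Path):
    self.path = pathlib.Path(path)
    self.path.mkdir(parents=True, exist_ok=True)

  def OnSample(self, sample) -> bool:
    # File name is the sha256 hex digest of the sample text, as in the test.
    name = hashlib.sha256(sample.text.encode("utf-8")).hexdigest() + ".txt"
    (self.path / name).write_text(sample.text)
    return True
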
Example #6
    def _SampleBatch(
        self,
        sampler: samplers.Sampler,
        atomizer: atomizers.AtomizerBase,
        sample_observers: typing.List[sample_observers_lib.SampleObserver],
    ) -> bool:
        """Run a single iteration of the batched sample inner-loop."""
        samples_in_progress = [
            sampler.tokenized_start_text.copy()
            for _ in range(sampler.batch_size)
        ]
        done = np.zeros(sampler.batch_size, dtype=bool)
        start_time = labdate.MillisecondsTimestamp()
        wall_time_start = start_time

        self.backend.InitSampleBatch(sampler)

        # The return value of this method. If any of the sample_observers return
        # False, this value is set to False.
        continue_sampling = True

        # Sampling loop. Continues until all samples in the batch are done.
        while not done.all():
            indices = self.backend.SampleNextIndices(sampler, done)

            # Iterate over all samples in batch to determine whether they're
            # done.
            for i in range(sampler.batch_size):
                if done[i]:
                    continue

                for index in indices[i]:
                    samples_in_progress[i].append(atomizer.decoder[index])
                    if sampler.SampleIsComplete(samples_in_progress[i]):
                        end_time = labdate.MillisecondsTimestamp()
                        done[i] = 1
                        sample = model_pb2.Sample(
                            text="".join(samples_in_progress[i]),
                            sample_start_epoch_ms_utc=start_time,
                            sample_time_ms=end_time - start_time,
                            wall_time_ms=end_time - wall_time_start,
                            num_tokens=len(samples_in_progress[i]),
                        )
                        # Notify sample observers.
                        continue_sampling &= all([
                            obs.OnSample(sample)
                            for obs in sample_observers
                        ])

                        # Wall sample time is the difference between the end of the previous
                        # sample and the end of the current sample.
                        wall_time_start = labdate.MillisecondsTimestamp()
                        break

        return continue_sampling
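
The loop above leaves the termination decision to sampler.SampleIsComplete(). As a hypothetical example of such a criterion (the "symmetrical depth tokens" mentioned in the docstrings further down suggest brace balancing for OpenCL kernels), a depth-based check might look like:

# Hypothetical SampleIsComplete()-style criterion: consider an OpenCL kernel
# sample finished once its braces re-balance after the first opening brace.
# This illustrates the idea only; it is not the sampler's actual logic.
def BraceDepthSampleIsComplete(sample_in_progress) -> bool:
  depth = 0
  opened = False
  for token in sample_in_progress:
    for ch in token:
      if ch == "{":
        depth += 1
        opened = True
      elif ch == "}":
        depth -= 1
        if opened and depth == 0:
          return True
  return False
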
Example #7
def test_InMemorySampleSaver():
    observer = sample_observers.InMemorySampleSaver()
    sample = model_pb2.Sample(text="Hello, world!")

    assert observer.OnSample(sample)
    assert len(observer.samples) == 1
    assert observer.samples[-1].text == "Hello, world!"

    assert observer.OnSample(sample)
    assert len(observer.samples) == 2
    assert observer.samples[-1].text == "Hello, world!"
Example #8
    def _SampleBatch(
        self,
        sampler: samplers.Sampler,
        atomizer: atomizers.AtomizerBase,
        sample_observers: typing.List[sample_observers_lib.SampleObserver],
    ) -> bool:
        """Run a single iteration of the batched sample inner-loop."""
        start_time = labdate.MillisecondsTimestamp()

        # We use the sampler.encoded_start_text attribute as a way to re-seed the
        # model state during rollback, so save the original value here so that we
        # can restore it at the end of the sample batch.
        original_sampler_encoded_start_text = sampler.encoded_start_text.copy()

        self.backend.InitSampleBatch(sampler)

        backtracker = OpenClBacktrackingHelper(atomizer, self._target_features)
        self._logger.OnSampleStart(backtracker)
        sampled_tokens = self.SampleOneWithBacktracking(
            sampler, atomizer, backtracker)
        self._logger.OnSampleEnd(backtracker)

        end_time = labdate.MillisecondsTimestamp()

        # Format text.
        if len(sampled_tokens):
            text = preprocessors.Preprocess(
                "".join(sampled_tokens),
                [
                    "deeplearning.clgen.preprocessors.opencl:NormalizeIdentifiers",
                    "deeplearning.clgen.preprocessors.opencl:SanitizeKernelPrototype",
                    "deeplearning.clgen.preprocessors.common:StripTrailingWhitespace",
                    "deeplearning.clgen.preprocessors.opencl:NormalizeIdentifiers",
                    "deeplearning.clgen.preprocessors.common:StripDuplicateEmptyLines",
                    "deeplearning.clgen.preprocessors.cxx:ClangFormat",
                ],
            )
        else:
            text = ""

        # Restore the sampler's start text.
        sampler.encoded_start_text = original_sampler_encoded_start_text

        # Notify sample observers.
        sample = model_pb2.Sample(
            text=text,
            sample_start_epoch_ms_utc=start_time,
            sample_time_ms=end_time - start_time,
            wall_time_ms=end_time - start_time,
            num_tokens=len(sampled_tokens),
        )
        return all([obs.OnSample(sample) for obs in sample_observers])
Example #9
def test_SamplesDatabaseObserver_add_one(db: samples_database.SamplesDatabase):
    sample_proto = model_pb2.Sample(
        text='Hello, observer',
        num_tokens=10,
        wall_time_ms=5,
        sample_start_epoch_ms_utc=1000,
    )

    with db.Observer() as obs:
        obs.OnSample(sample_proto)

    with db.Session() as s:
        assert s.query(samples_database.Sample).count() == 1
        assert s.query(samples_database.Sample).one().ToProto() == sample_proto
Example #10
def run_extractors(sample: Sample) -> Sample:
    if sample.compile_status:
        return Sample(**Sample.FromProto(
            0,
            model_pb2.Sample(
                train_step=sample.train_step,
                text=sample.text,
                sample_indices=sample.sample_indices,
                encoded_sample_indices=sample.encoded_sample_indices,
                original_input=sample.original_input,
                sample_feed=sample.sample_feed,
                encoded_text=sample.encoded_text,
                sample_time_ms=sample.sample_time_ms,
                feature_vector=extractor.ExtractRawFeatures(sample.text),
                num_tokens=sample.num_tokens,
                compile_status=sample.compile_status,
                categorical_sampling=int(sample.categorical_sampling),
                date_added=sample.date_added.strftime("%m/%d/%Y, %H:%M:%S"),
            )))
    else:
        return Sample(**Sample.FromProto(
            0,
            model_pb2.Sample(
                train_step=sample.train_step,
                text=sample.text,
                sample_indices=sample.sample_indices,
                encoded_sample_indices=sample.encoded_sample_indices,
                original_input=sample.original_input,
                sample_feed=sample.sample_feed,
                encoded_text=sample.encoded_text,
                sample_time_ms=sample.sample_time_ms,
                feature_vector="",
                num_tokens=sample.num_tokens,
                compile_status=sample.compile_status,
                categorical_sampling=int(sample.categorical_sampling),
                date_added=sample.date_added.strftime("%m/%d/%Y, %H:%M:%S"),
            )))
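
The two branches above differ only in the feature_vector value; an equivalent single-path sketch (assuming the same Sample, model_pb2 and extractor helpers as the original) avoids duplicating the proto literal:

def run_extractors_compact(sample: Sample) -> Sample:
    # Same behaviour as run_extractors() above, without the duplicated literal:
    # only compiling samples get their raw features extracted.
    features = (extractor.ExtractRawFeatures(sample.text)
                if sample.compile_status else "")
    return Sample(**Sample.FromProto(
        0,
        model_pb2.Sample(
            train_step=sample.train_step,
            text=sample.text,
            sample_indices=sample.sample_indices,
            encoded_sample_indices=sample.encoded_sample_indices,
            original_input=sample.original_input,
            sample_feed=sample.sample_feed,
            encoded_text=sample.encoded_text,
            sample_time_ms=sample.sample_time_ms,
            feature_vector=features,
            num_tokens=sample.num_tokens,
            compile_status=sample.compile_status,
            categorical_sampling=int(sample.categorical_sampling),
            date_added=sample.date_added.strftime("%m/%d/%Y, %H:%M:%S"),
        )))
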
Example #11
def test_KerasBackend_Sample_return_value_matches_cached_sample(
        clgen_cache_dir, abc_keras_model_config):
    """Test that Sample() returns Sample protos."""
    del clgen_cache_dir
    m = models.Model(abc_keras_model_config)
    samples = m.Sample(MockSampler(hash='hash'), 1)
    assert len(samples) == 1
    assert len(list((m.cache.path / 'samples' / 'hash').iterdir())) == 1
    cached_sample_path = (m.cache.path / 'samples' / 'hash' / list(
        (m.cache.path / 'samples' / 'hash').iterdir())[0])
    assert cached_sample_path.is_file()
    cached_sample = pbutil.FromFile(cached_sample_path, model_pb2.Sample())
    assert samples[0].text == cached_sample.text
    assert samples[0].sample_time_ms == cached_sample.sample_time_ms
    assert (samples[0].sample_start_epoch_ms_utc ==
            cached_sample.sample_start_epoch_ms_utc)
Example #12
def ToProto(dp: ActiveFeed) -> samples_database.Sample:
    return samples_database.Sample(**samples_database.Sample.FromProto(
        0,
        model_pb2.Sample(
            train_step=-1,
            text=dp.sample,
            sample_indices="",
            encoded_sample_indices="",
            original_input="",
            sample_feed=dp.input_feed,
            encoded_text="",
            sample_time_ms=0,
            feature_vector=extractor.ExtractRawFeatures(dp.sample),
            num_tokens=dp.num_tokens,
            compile_status=dp.compile_status,
            categorical_sampling=1,
            date_added=dp.date_added.strftime("%m/%d/%Y, %H:%M:%S"),
        )))
Example #13
def update_tokenizer(sample: Sample, tokenizer) -> Sample:
    encoded = tokenizer.TokenizeString(sample.text)
    return Sample(**Sample.FromProto(
        0,
        model_pb2.Sample(
            train_step=sample.train_step,
            text=sample.text,
            sample_indices=sample.sample_indices,
            encoded_sample_indices=sample.sample_indices,
            original_input=sample.original_input,
            sample_feed=sample.sample_feed,
            encoded_text=','.join([str(x) for x in encoded]),
            sample_time_ms=sample.sample_time_ms,
            feature_vector=sample.feature_vector,
            num_tokens=len(encoded),
            compile_status=sample.compile_status,
            categorical_sampling=int(sample.categorical_sampling),
            date_added=sample.date_added.strftime("%m/%d/%Y, %H:%M:%S"),
        )))
Example #14
def test_KerasBackend_Sample_return_value_matches_cached_sample(
        clgen_cache_dir, abc_keras_model_config):
    """Test that Sample() returns Sample protos."""
    del clgen_cache_dir
    m = models.Model(abc_keras_model_config)
    sample_observer = sample_observers.InMemorySampleSaver()
    m.Sample(
        MockSampler(hash="hash"),
        [sample_observers.MaxSampleCountObserver(1), sample_observer],
    )
    samples = sample_observer.samples
    assert len(samples) == 1
    assert len(list((m.cache.path / "samples" / "hash").iterdir())) == 1
    cached_sample_path = (m.cache.path / "samples" / "hash" / list(
        (m.cache.path / "samples" / "hash").iterdir())[0])
    assert cached_sample_path.is_file()
    cached_sample = pbutil.FromFile(cached_sample_path, model_pb2.Sample())
    assert samples[0].text == cached_sample.text
    assert samples[0].sample_time_ms == cached_sample.sample_time_ms
    assert (samples[0].sample_start_epoch_ms_utc ==
            cached_sample.sample_start_epoch_ms_utc)
Example #15
    def _SampleLMBatch(
        self,
        sampler: 'samplers.Sampler',
        tokenizer: tokenizers.TokenizerBase,
        sample_observers: typing.List[sample_observers_lib.SampleObserver],
        epoch: int,
    ) -> typing.Tuple[bool, int]:
        """
    Run a sampling iteration over BERT models.
    """
        start_time = datetime.datetime.utcnow()
        seq_count = 0
        self.backend.InitSampleBatch(sampler,
                                     workload_size=FLAGS.sample_workload_size)
        try:
            org_inputs, input_ids, samples, indices = self.backend.SampleNextIndices(
                sampler)
        except StopIteration:
            return False, seq_count

        if not samples:
            # An empty return means the model has not produced anything that
            # can be stored. This 'if' accommodates active sampling, which is
            # very selective.
            return True, seq_count

        continue_sampling = True

        if environment.WORLD_RANK == 0:
            assert len(org_inputs) == len(input_ids) == len(samples) == len(
                indices), "Length mismatch, {}-{}-{}-{}".format(
                    len(org_inputs), len(input_ids), len(samples),
                    len(indices))
            for org, inp, sample, idxs in zip(org_inputs, input_ids, samples,
                                              indices):

                src = self.tokenizer.ArrayToCode(sample, with_formatting=True)
                try:
                    stdout = opencl.Compile(src)
                    compile_flag = True
                    features = extractor.ExtractRawFeatures(src)
                except ValueError:
                    compile_flag = False
                    features = ""

                end_time = datetime.datetime.utcnow()
                sample = model_pb2.Sample(
                    train_step=epoch,
                    text=src,
                    sample_indices=','.join([
                        self.tokenizer.decoder[idx].replace('\n', '\\n')
                        for idx in idxs
                    ]).replace('\n', '\\n'),
                    encoded_sample_indices=','.join([str(idx)
                                                     for idx in idxs]),
                    original_input=self.tokenizer.tokensToString(
                        org,
                        with_formatting=False,
                        ignore_token=self.tokenizer.padToken),
                    sample_feed=self.tokenizer.tokensToString(
                        inp,
                        with_formatting=False,
                        ignore_token=self.tokenizer.padToken),
                    encoded_text=",".join([str(x) for x in sample]),
                    sample_start_epoch_ms_utc=int(start_time.strftime("%s%f")),
                    sample_time_ms=int(
                        round(1000 * ((end_time - start_time) /
                                      len(samples)).total_seconds())),
                    wall_time_ms=int(
                        round(1000 * ((end_time - start_time) /
                                      len(samples)).total_seconds())),
                    feature_vector=features,
                    num_tokens=np.where(
                        sample == self.tokenizer.padToken)[0][0]
                    if self.tokenizer.padToken in sample else len(sample),
                    compile_status=compile_flag,
                    categorical_sampling=self.backend.samplesWithCategorical(),
                    date_added=datetime.datetime.utcnow().strftime(
                        "%m/%d/%Y, %H:%M:%S"),
                )
                # Notify sample observers.
                continue_sampling &= all(
                    [obs.OnSample(sample) for obs in sample_observers])
                seq_count += 1
            if environment.WORLD_SIZE > 1:
                distrib.write(str(continue_sampling))
        else:
            status = distrib.read()
            if status == "True":
                continue_sampling = True
            elif status == "False":
                continue_sampling = False
            else:
                raise OSError(
                    "Broken distributed message: '{}'".format(status))
        return continue_sampling, seq_count
Example #16
    def _Train(
        self,
        corpus,
        test_sampler: typing.Optional[samplers.Sampler],
    ) -> None:
        """Core training function"""
        if not self.is_trained:

            train_input_fn = self.train.data_generator.generateTfDataset(
                sequence_length=self.config.training.sequence_length,
                num_cpu_threads=os.cpu_count(),
                use_tpu=FLAGS.use_tpu,
                is_training=True)

            l.logger().info(
                "Splitting {} steps into {} equivalent epochs, {} steps each. Rejected {} redundant step(s)"
                .format(
                    self.num_train_steps, self.num_epochs,
                    self.steps_per_epoch,
                    self.config.training.num_train_steps -
                    self.num_train_steps))
            try:
                if FLAGS.sample_per_epoch == 0:
                    self.train.estimator.train(input_fn=train_input_fn,
                                               max_steps=self.num_train_steps)
                else:
                    sampler, observers = self._getTestSampler(
                        test_sampler, self.config.training.sequence_length)
                    self.InitSampling(sampler,
                                      self.config.training.random_seed)
                    for ep in range(self.num_epochs):
                        self.train.estimator.train(input_fn=train_input_fn,
                                                   steps=self.steps_per_epoch)
                        for _ in range(FLAGS.sample_per_epoch):
                            start_time = datetime.datetime.utcnow()
                            self.InitSampleBatch()
                            sample_batch, sample_indices = self.SampleNextIndices(
                            )
                            end_time = datetime.datetime.utcnow()
                            for sample, sind in zip(sample_batch,
                                                    sample_indices):

                                try:
                                    stdout = opencl.Compile(
                                        self.tokenizer.ArrayToCode(sample))
                                    compile_flag = 1
                                except ValueError:
                                    compile_flag = 0

                                feature_vector = extractor.ExtractFeatures(
                                    self.tokenizer.ArrayToCode(sample))
                                sample_proto = model_pb2.Sample(
                                    train_step=(ep + 1) * self.steps_per_epoch,
                                    sample_feed=sampler.start_text,
                                    text=self.tokenizer.tokensToString(
                                        sample,
                                        ignore_token=self.tokenizer.padToken).
                                    replace("\\n", "\n"),
                                    encoded_text=",".join(
                                        [str(t) for t in sample]),
                                    sample_indices='\n'.join([
                                        self.tokenizer.tokensToString(
                                            mind).replace('\n', '\\n')
                                        for mind in sind
                                    ]),
                                    encoded_sample_indices='\n'.join([
                                        ','.join([str(x) for x in mind])
                                        for mind in sind
                                    ]),
                                    sample_time_ms=int(
                                        round(1000 * ((end_time - start_time) /
                                                      sampler.batch_size
                                                      ).total_seconds())),
                                    feature_vector="\n".join([
                                        "{}:{}".format(k, v)
                                        for (k, v) in feature_vector.items()
                                    ]),
                                    num_tokens=len(sample),
                                    compile_status=compile_flag,
                                    categorical_sampling=self.
                                    samplesWithCategorical(),
                                    date_added=datetime.datetime.utcnow(
                                    ).strftime("%m/%d/%Y, %H:%M:%S"),
                                )
                                for obs in observers:
                                    obs.OnSample(sample_proto)
            except KeyboardInterrupt:
                pass
            if not FLAGS.force_eval:
                self.Validate()

        if FLAGS.force_eval and not self.is_validated:
            self.Validate()
        # self.telemetry.TfRecordEpochs()
        return
Example #17
    def _SampleSeqBatch(
        self,
        sampler: 'samplers.Sampler',
        tokenizer: tokenizers.TokenizerBase,
        sample_observers: typing.List[sample_observers_lib.SampleObserver],
        epoch: int,
    ) -> typing.Tuple[bool, int]:
        """
    Run a single iteration of the batched sample inner-loop for sequential models.
    """

        start_time = datetime.datetime.utcnow()

        self.backend.InitSampleBatch(sampler)
        samples_in_progress = [
            sampler.tokenized_start_text.copy()
            for _ in range(sampler.batch_size)
        ]
        done = np.zeros(sampler.batch_size, dtype=bool)
        wall_time_start = start_time
        seq_count = 0

        # The return value of this method. If any of the sample_observers return
        # False, this value is set to False.
        continue_sampling = True

        # Sampling loop. Continues until all samples in the batch are done.
        while not done.all():
            indices = self.backend.SampleNextIndices(sampler, done)
            # Iterate over all samples in batch to determine whether they're
            # done.

            for i in range(len(indices)):
                if done[i]:
                    continue

                for index in indices[i]:
                    samples_in_progress[i].append(tokenizer.decoder[index])
                    step_ind = ""
                    encoded_step_indices = ""

                    if sampler.SampleIsComplete(samples_in_progress[i]):
                        end_time = datetime.datetime.utcnow()
                        sample_kernel = [x for x in samples_in_progress[i]]
                        features = extractor.ExtractRawFeatures(''.join(
                            samples_in_progress[i]))
                        done[i] = 1
                        try:
                            stdout = opencl.Compile(''.join(
                                samples_in_progress[i]))
                            compile_flag = True
                        except ValueError:
                            compile_flag = False

                        sample = model_pb2.Sample(
                            train_step=epoch,
                            text=''.join(samples_in_progress[i]),
                            sample_indices="",
                            encoded_sample_indices="",
                            sample_feed=sampler.start_text,
                            encoded_text=",".join([
                                str(tokenizer.vocab[x]) for x in sample_kernel
                            ]),
                            sample_start_epoch_ms_utc=int(
                                start_time.strftime("%s%f")),
                            sample_time_ms=int(
                                round(1000 *
                                      ((end_time - start_time) /
                                       sampler.batch_size).total_seconds())),
                            wall_time_ms=int(
                                round(1000 *
                                      ((end_time - start_time) /
                                       sampler.batch_size).total_seconds())),
                            feature_vector=features,
                            num_tokens=len(samples_in_progress[i]),
                            compile_status=compile_flag,
                            categorical_sampling=self.backend.
                            samplesWithCategorical(),
                            date_added=datetime.datetime.utcnow().strftime(
                                "%m/%d/%Y, %H:%M:%S"),
                        )
                        # Notify sample observers.
                        continue_sampling &= all(
                            [obs.OnSample(sample) for obs in sample_observers])
                        seq_count += 1
                        # Wall sample time is the difference between the end of the previous
                        # sample and the end of the current sample.
                        wall_time_start = datetime.datetime.utcnow()
                        break
        return continue_sampling, seq_count
Example #18
  def Train(self,
            corpus,
            test_sampler : typing.Optional[samplers.Sampler] = None,
            pre_train    : bool = False,
            **unused_kwargs
            ) -> None:
    """
    Main training entry point.
    """
    self._ConfigTrainParams(
      torchLMDataGenerator.TrainMaskLMBatchGenerator(
        corpus, self.config.training,
        self.cache.path,
        self.config.training.num_pretrain_steps if pre_train else None,
        pre_train,
        self.feature_encoder,
        self.feature_tokenizer,
        self.feature_sequence_length,
      ), pre_train
    )

    if FLAGS.only_sample:
      return
      
    self.current_step = self.loadCheckpoint(self.train, pre_train = pre_train)
    if self.pytorch.num_gpus > 0:
      self.torch.cuda.empty_cache()
    if self.current_step >= 0:
      l.logger().info("Loaded checkpoint step {}".format(self.current_step))
    self.current_step = max(0, self.current_step)

    if self.current_step < self.num_train_steps:
      self.train.model.zero_grad()

      ## Set batch size in case of TPU training or distributed training.
      if self.torch_tpu_available:
        total_train_batch_size = self.train_batch_size * self.pytorch.torch_xla.xrt_world_size()
      else:
        total_train_batch_size = (
          self.train_batch_size
          * (self.torch.distributed.get_world_size() if self.pytorch.num_nodes > 1 else 1)
        )

      # Set dataloader in case of TPU training.
      if self.torch_tpu_available:
        loader = self.pytorch.torch_ploader.ParallelLoader(
                            self.train.data_generator.dataloader, [self.pytorch.device]
                          ).per_device_loader(self.pytorch.device)
      else:
        loader = self.train.data_generator.dataloader

      # Get dataloader iterator and setup hooks.
      batch_iterator = iter(loader)
      if self.is_world_process_zero():
        train_hook = hooks.tensorMonitorHook(
          self.logfile_path if not pre_train else self.pre_logfile_path, self.current_step, min(self.steps_per_epoch, FLAGS.monitor_frequency)
        )
      if FLAGS.reward_compilation >= 0 and not pre_train:
        correct_sample_obs = sample_observers.SamplesDatabaseObserver(
          self.logfile_path / "correct_samples.db"
        )
      else:
        correct_sample_obs = None
      
      total_steps = self.config.training.num_pretrain_steps if pre_train else self.config.training.num_train_steps
      l.logger().info(
        "Splitting {} steps into {} equivalent epochs, {} steps each. Rejected {} redundant step(s)".format(
          self.num_train_steps, self.num_epochs, 
          self.steps_per_epoch, total_steps - self.num_train_steps
        )
      )
      try:
        self.train.model.train()
        epoch_iter = tqdm.auto.trange(self.num_epochs, desc="Epoch", leave = False) if self.is_world_process_zero() else range(self.num_epochs)
        for epoch in epoch_iter:

          # In distributed mode, calling the set_epoch() method at
          # the beginning of each epoch before creating the DataLoader iterator
          # is necessary to make shuffling work properly across multiple epochs.
          # Otherwise, the same ordering will be always used.
          if self.pytorch.num_nodes > 1:
            loader.sampler.set_epoch(epoch)

          if epoch < self.current_step // self.steps_per_epoch:
            continue # Stupid bar won't resume.

          batch_iter = tqdm.auto.trange(self.steps_per_epoch, desc="Batch", leave = False) if self.is_world_process_zero() else range(self.steps_per_epoch)
          for step in batch_iter:
            if self.is_world_process_zero():
              start = datetime.datetime.utcnow()
            try:
              inputs = next(batch_iterator)
            except StopIteration:
              # dataloader has different len() than steps_per_epoch.
              # This is the easiest way to infinite-loop dataloaders in pytorch.
              batch_iterator = iter(loader)
              inputs = next(batch_iterator)

            self.current_step += 1
            # Move inputs to torch device.
            inputs     = self.to_device(inputs)
            # Run model step on batch
            step_out   = self.model_step(self.train.model, inputs, step = epoch * self.steps_per_epoch + step)
            # Collect losses and backpropagate
            total_loss = step_out['total_loss'].mean()
            total_loss.backward()

            self.torch.nn.utils.clip_grad_norm_(self.train.model.parameters(), self.max_grad_norm)
            if self.torch_tpu_available:
              self.pytorch.torch_xla.optimizer_step(self.train.optimizer)
            else:
              self.train.optimizer.step()
            self.train.scheduler.step()

            ## Collect tensors for logging.
            if self.pytorch.num_nodes > 1:
              total_loss         = [self.torch.zeros(tuple(step_out['total_loss'        ].shape), dtype = self.torch.float32).to(self.pytorch.device) for _ in range(self.torch.distributed.get_world_size())]
              masked_lm_loss     = [self.torch.zeros(tuple(step_out['masked_lm_loss'    ].shape), dtype = self.torch.float32).to(self.pytorch.device) for _ in range(self.torch.distributed.get_world_size())]
              # next_sentence_loss = [self.torch.zeros(tuple(step_out['next_sentence_loss'].shape), dtype = self.torch.float32).to(self.pytorch.device) for _ in range(self.torch.distributed.get_world_size())]
              masked_lm_lengths  = [self.torch.zeros(tuple(inputs  ['masked_lm_lengths' ].shape), dtype = self.torch.int64  ).to(self.pytorch.device) for _ in range(self.torch.distributed.get_world_size())]

              self.torch.distributed.all_gather(masked_lm_loss,     step_out["masked_lm_loss"])
              # self.torch.distributed.all_gather(next_sentence_loss, step_out["next_sentence_loss"])
              self.torch.distributed.all_gather(masked_lm_lengths,  inputs['masked_lm_lengths'].to(self.pytorch.device))
              self.torch.distributed.all_gather(total_loss,         step_out['total_loss'])
            else:
              total_loss         = step_out['total_loss'        ].unsqueeze(0).cpu()
              masked_lm_loss     = step_out['masked_lm_loss'    ].unsqueeze(0).cpu()
              # next_sentence_loss = step_out['next_sentence_loss'].unsqueeze(0).cpu()
              masked_lm_lengths  = inputs['masked_lm_lengths' ].cpu()

            if self.is_world_process_zero():
              exec_time_ms = int(round((datetime.datetime.utcnow() - start).total_seconds() * 1000))
              if FLAGS.reward_compilation >= 0 and FLAGS.reward_compilation <= epoch * self.steps_per_epoch + step and not pre_train:
                ## Logging when compiler reward is enabled in training.
                ## This is not compatible with using DDP, and basically compiler-rewarded training is deprecated and proven to be wrong and inefficient.
                correct_samples = [(x, y) for en, (x, y) in enumerate(zip(inputs['input_ids'].cpu().numpy(), step_out['generated_samples'].cpu().numpy())) if step_out['compile_status'][en] == 1]
                for s in correct_samples:
                  feature_vector = extractor.ExtractFeatures(self.tokenizer.ArrayToCode(s[1]))
                  correct_sample_obs.OnSample(model_pb2.Sample(
                      train_step             = self.current_step,
                      sample_feed            = self.tokenizer.tokensToString(s[0], ignore_token = self.tokenizer.padToken).replace("\\n", "\n"),
                      text                   = self.tokenizer.tokensToString(s[1], ignore_token = self.tokenizer.padToken).replace("\\n", "\n"),
                      encoded_text           = ",".join([str(t) for t in s[1]]),
                      sample_indices         = '',
                      encoded_sample_indices = '',
                      sample_time_ms         = int(round(exec_time_ms / self.train_batch_size)),
                      feature_vector         = "\n".join(["{}:{}".format(k, v) for (k, v) in feature_vector.items()]),
                      num_tokens             = len([x for x in s[1] if x != self.tokenizer.padToken]),
                      categorical_sampling   = False,
                      compile_status         = True,
                      date_added             = datetime.datetime.utcnow().strftime("%m/%d/%Y, %H:%M:%S"),
                    )
                  )
              if not pre_train:
                ## Fine-tuning logging.
                train_hook.step(
                  masked_lm_loss          = sum([ml.mean().item() for ml in masked_lm_loss]) / len(masked_lm_loss),
                  # next_sentence_loss      = sum([nsl.mean().item() for nsl in next_sentence_loss]) / len(next_sentence_loss),
                  total_loss              = sum([tl.mean().item() for tl in total_loss]) / len(total_loss),
                  learning_rate           = self.train.scheduler.get_last_lr()[0],
                  num_correct_samples     = (correct_sample_obs.sample_id if correct_sample_obs is not None else None),
                  batch_avg_hole_len      = sum([sum([int(l) for l in b if l != -1]) / len([int(l) for l in b if l != -1])
                                                 for b in masked_lm_lengths]) / len(masked_lm_lengths),
                  batch_execution_time_ms = exec_time_ms,
                  time_per_sample_ms      = exec_time_ms / self.train_batch_size,
                )
              else:
                ## Pre-training logging.
                train_hook.step(
                  masked_lm_loss          = sum([ml.mean().item() for ml in masked_lm_loss]) / len(masked_lm_loss),
                  # next_sentence_loss      = sum([nsl.mean().item() for nsl in next_sentence_loss]) / len(next_sentence_loss),
                  total_loss              = sum([tl.mean().item() for tl in total_loss]) / len(total_loss),
                  learning_rate           = self.train.scheduler.get_last_lr()[0],
                  batch_avg_hole_len      = sum([sum([int(l) for l in b if l != -1]) / len([int(l) for l in b if l != -1])
                                                 for b in masked_lm_lengths]) / len(masked_lm_lengths),
                  batch_execution_time_ms = exec_time_ms,
                  time_per_sample_ms      = exec_time_ms / self.train_batch_size,
                )
            self.train.model.zero_grad()
            if self.current_step == 0:
              l.logger().info("Starting Loss: {}".format(sum([tl.mean().item() for tl in total_loss]) / len(total_loss)))

          # End of Epoch
          self.saveCheckpoint(self.train, pre_train)
          if self.is_world_process_zero():
            set_mail = "Epoch {} Loss: {}\n".format(self.current_step // self.steps_per_epoch, train_hook.epoch_loss)
            l.logger().info("Epoch {} Loss: {}".format(self.current_step // self.steps_per_epoch, train_hook.epoch_loss))

          if self.pytorch.num_nodes > 1:
            loader.sampler.set_epoch(epoch)

          if FLAGS.validate_per_epoch and self.train.data_generator.config.validation_split > 0:
            val_ml_loss = self.Validate(per_epoch = True, pre_train = pre_train)
            if self.is_world_process_zero():
              train_hook.end_epoch(
              val_masked_lm_loss      = val_ml_loss,
              # val_next_sentence_loss  = val_nsp_loss,
              val_total_loss          = val_ml_loss # + val_nsp_loss,
              )
            set_mail += "Validation Loss: {}\n".format(val_ml_loss)
          elif self.is_world_process_zero():
            train_hook.end_epoch()

          if FLAGS.notify_me:
            client.getClient().send_message("clgen:torch_bert", set_mail)

          if self.torch_tpu_available:
            self.pytorch.torch_xla.master_print(self.pytorch.torch_xla_met.metrics_report())

          if FLAGS.sample_per_epoch > 0:
            sampler, observers = self._getTestSampler(test_sampler, self.config.training.sequence_length)
            self.InitSampling(sampler, self.config.training.random_seed)
            for _ in range(FLAGS.sample_per_epoch):
              start_time   = datetime.datetime.utcnow()
              self.InitSampleBatch(sampler)
              org_inputs, input_ids, samples, indices = self.SampleNextIndices()
              end_time = datetime.datetime.utcnow()
              for org, inp, sample, idxs in zip(org_inputs, input_ids, samples, indices):
                try:
                  stdout = opencl.Compile(self.tokenizer.ArrayToCode(sample))
                  compile_flag = 1
                except ValueError:
                  compile_flag = 0

                feature_vector = extractor.ExtractFeatures(self.tokenizer.ArrayToCode(sample))
                sample_proto = model_pb2.Sample(
                  train_step             = self.current_step,
                  sample_feed            = sampler.start_text,
                  original_input         = self.tokenizer.tokensToString(org,    with_formatting = True, ignore_token = self.tokenizer.padToken),
                  text                   = self.tokenizer.tokensToString(sample, with_formatting = True, ignore_token = self.tokenizer.padToken).replace("\\n", "\n"),
                  encoded_text           = ",".join([str(t) for t in sample]),
                  sample_indices         = ','.join([self.tokenizer.decoder[idx].replace('\n', '\\n') for idx in idxs]).replace('\n', '\\n'),
                  encoded_sample_indices = ','.join([str(idx) for idx in idxs]),
                  sample_time_ms         = int(round(1000 * ((end_time - start_time) / sampler.batch_size).total_seconds())),
                  feature_vector         = "\n".join(["{}:{}".format(k, v) for (k, v) in feature_vector.items()]),
                  num_tokens             = len(sample),
                  compile_status         = compile_flag,
                  categorical_sampling   = self.samplesWithCategorical(),
                  date_added             = datetime.datetime.utcnow().strftime("%m/%d/%Y, %H:%M:%S"),
                )
                for obs in observers:
                  obs.OnSample(sample_proto)
      except KeyboardInterrupt:
        pass

      if not FLAGS.force_eval:
        _ = self.Validate(pre_train = pre_train)

    if FLAGS.force_eval and not self.is_validated:
      _ = self.Validate(pre_train = pre_train)
    return
Example #19
  def Sample(
      self, sampler: samplers.Sampler, min_num_samples: int,
      seed: int = None) -> typing.Iterable[model_pb2.Sample]:
    """Sample a model.

    If the model is not already trained, calling Sample() first trains the
    model. Thus a call to Sample() is equivalent to calling Train() then
    Sample().

    Args:
      sampler: The sampler to sample using.
      min_num_samples: The minimum number of samples to return. Note that the
        true number of samples returned may be higher than this value, as
        sampling occurs in batches. The model will continue producing samples
        until the lowest multiple of the sampler batch size property that is
        larger than this value. E.g. if min_num_samples is 7 and the Sampler
        batch size is 10, 10 samples will be returned.
      seed: A numeric value to seed the RNG with. If not present, the RNG is
        seeded randomly.

    Returns:
      An iterator over samples.

    Raises:
      UnableToAcquireLockError: If the model is locked (i.e. there is another
        process currently modifying the model).
      InvalidStartText: If the sampler start text cannot be encoded.
      InvalidSymtokTokens: If the sampler symmetrical depth tokens cannot be
        encoded.
    """
    sample_count = 1
    atomizer = self.atomizer
    sampler.Specialize(atomizer)
    batch_size = self.backend.InitSampling(sampler, seed)
    sample_start_time = labdate.MillisecondsTimestamp()
    # Per-sample batch outer loop. Continues until we have as many samples
    # as we want.
    while True:
      samples_in_progress = [
        sampler.tokenized_start_text.copy()
        for _ in range(batch_size)]
      done = np.zeros(batch_size, dtype=bool)
      start_time = labdate.MillisecondsTimestamp()
      wall_time_start = start_time

      self.backend.InitSampleBatch(sampler, batch_size)

      # Sampling loop. Continues until all samples in the batch are done.
      while True:
        indices = self.backend.SampleNextIndices(sampler, batch_size)

        # Iterate over all samples in batch to determine whether they're
        # done.
        for i in range(batch_size):
          if done[i]:
            continue

          token = atomizer.decoder[indices[i]]
          samples_in_progress[i].append(token)
          if sampler.SampleIsComplete(samples_in_progress[i]):
            end_time = labdate.MillisecondsTimestamp()
            done[i] = 1
            sample = model_pb2.Sample(
                text=''.join(samples_in_progress[i]),
                sample_start_epoch_ms_utc=start_time,
                sample_time_ms=end_time - start_time,
                wall_time_ms=end_time - wall_time_start,
                num_tokens=len(samples_in_progress[i]))
            sample_count += 1
            yield sample
            wall_time_start = labdate.MillisecondsTimestamp()

        # Complete the batch.
        if done.all():
          break

      # Complete sampling. Note that sample_count starts at 1.
      if sample_count > min_num_samples:
        now = labdate.MillisecondsTimestamp()
        logging.info(
            'Produced %s samples at a rate of %s ms / sample.',
            humanize.intcomma(sample_count - 1),
            humanize.intcomma(
                int((now - sample_start_time) / max(sample_count - 1, 1))))
        break
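
Since this Sample() is a generator that keeps yielding samples as whole batches complete, a caller that wants exactly N samples can simply slice the iterator. An illustrative usage (model and sampler construction omitted; the names are assumptions):

# Illustrative usage of the generator-based Sample() above. `model` and
# `sampler` are assumed to be an already-constructed Model and Sampler.
import itertools

wanted = 10
samples = list(itertools.islice(model.Sample(sampler, min_num_samples=wanted), wanted))
assert len(samples) == wanted
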
Example #20
    def SampleFast(self,
                   sampler: samplers.Sampler,
                   min_num_samples: int,
                   seed: int = None) -> typing.List[model_pb2.Sample]:
        """Sample a model.

    Same as Sample(), but without printing or caching samples. Because samples
    are not cached, infinite sampling loops are not supported, since we must
    return the sample protos at some point.

    Args:
      sampler: The sampler to sample using.
      min_num_samples: The minimum number of samples to return. Note that the
        true number of samples returned may be higher than this value, as
        sampling occurs in batches. The model will continue producing samples
        until the lowest multiple of the sampler batch size property that is
        larger than this value. E.g. if min_num_samples is 7 and the Sampler
        batch size is 10, 10 samples will be returned.
      seed: A numeric value to seed the RNG with. If not present, the RNG is
        seeded randomly.

    Returns:
      A list of Sample protos.

    Raises:
      UnableToAcquireLockError: If the model is locked (i.e. there is another
        process currently modifying the model).
      InvalidStartText: If the sampler start text cannot be encoded.
      InvalidSymtokTokens: If the sampler symmetrical depth tokens cannot be
        encoded.
    """
        self.Train()

        sample_count = 1
        with logutil.TeeLogsToFile(f'sampler_{sampler.hash}',
                                   self.cache.path / 'logs'):
            logging.info("Sampling: '%s'", sampler.start_text)
            sample_start_time = labdate.MillisecondsTimestamp()
            atomizer = self.corpus.atomizer
            sampler.Specialize(atomizer)
            batch_size = self.backend.InitSampling(sampler, seed)
            samples = []

            # Per-sample batch outer loop. Continues until we have as many samples
            # as we want.
            while True:
                samples_in_progress = [
                    sampler.tokenized_start_text.copy()
                    for _ in range(batch_size)
                ]
                done = np.zeros(batch_size, dtype=bool)
                start_time = labdate.MillisecondsTimestamp()
                wall_time_start = start_time

                self.backend.InitSampleBatch(sampler, batch_size)

                # Sampling loop. Continues until all samples in the batch are done.
                while True:
                    indices = self.backend.SampleNextIndices(
                        sampler, batch_size)

                    # Iterate over all samples in batch to determine whether they're
                    # done.
                    for i in range(batch_size):
                        if done[i]:
                            continue

                        token = atomizer.decoder[indices[i]]
                        samples_in_progress[i].append(token)
                        if sampler.SampleIsComplete(samples_in_progress[i]):
                            end_time = labdate.MillisecondsTimestamp()
                            done[i] = 1
                            sample = model_pb2.Sample(
                                text=''.join(samples_in_progress[i]),
                                sample_start_epoch_ms_utc=start_time,
                                sample_time_ms=end_time - start_time,
                                wall_time_ms=end_time - wall_time_start,
                                num_tokens=len(samples_in_progress[i]))
                            sample_count += 1
                            samples.append(sample)
                            wall_time_start = labdate.MillisecondsTimestamp()

                    # Complete the batch.
                    if done.all():
                        break

                # Complete sampling. Note that sample_count starts at 1.
                if sample_count > min_num_samples:
                    now = labdate.MillisecondsTimestamp()
                    logging.info(
                        'Produced %s samples at a rate of %s ms / sample.',
                        humanize.intcomma(len(samples)),
                        humanize.intcomma(
                            int((now - sample_start_time) / len(samples))))
                    break

        return samples
Example #21
    def Sample(self,
               sampler: samplers.Sampler,
               min_num_samples: int,
               seed: int = None) -> typing.List[model_pb2.Sample]:
        """Sample a model.

    If the model is not already trained, calling Sample() first trains the
    model. Thus a call to Sample() is equivalent to calling Train() then
    Sample().

    Args:
      sampler: The sampler to sample using.
      min_num_samples: The minimum number of samples to return. Note that the
        true number of samples returned may be higher than this value, as
        sampling occurs in batches. The model will continue producing samples
        until the lowest multiple of the sampler batch size property that is
        larger than this value. E.g. if min_num_samples is 7 and the Sampler
        batch size is 10, 10 samples will be returned.
      seed: A numeric value to seed the RNG with. If not present, the RNG is
        seeded randomly.

    Returns:
      A list of Sample protos.

    Raises:
      UnableToAcquireLockError: If the model is locked (i.e. there is another
        process currently modifying the model).
      InvalidStartText: If the sampler start text cannot be encoded.
      InvalidSymtokTokens: If the sampler symmetrical depth tokens cannot be
        encoded.
    """
        self.Train()

        sample_count = 1
        self.SamplerCache(sampler).mkdir(exist_ok=True)
        with logutil.TeeLogsToFile(f'sampler_{sampler.hash}',
                                   self.cache.path / 'logs'):
            logging.info("Sampling: '%s'", sampler.start_text)
            if min_num_samples < 0:
                logging.warning(
                    'Entering an infinite sample loop, this process will never end!'
                )
            sample_start_time = labdate.MillisecondsTimestamp()

            atomizer = self.corpus.atomizer
            sampler.Specialize(atomizer)
            batch_size = self.backend.InitSampling(sampler, seed)

            samples = []
            sample_dir = self.SamplerCache(sampler)

            # Per-sample batch outer loop. Continues until we have as many samples
            # as we want.
            while True:
                samples_in_progress = [
                    sampler.tokenized_start_text.copy()
                    for _ in range(batch_size)
                ]
                done = np.zeros(batch_size, dtype=bool)
                start_time = labdate.MillisecondsTimestamp()
                wall_time_start = start_time

                self.backend.InitSampleBatch(sampler, batch_size)

                # Sampling loop. Continues until all samples in the batch are done.
                while True:
                    indices = self.backend.SampleNextIndices(
                        sampler, batch_size)

                    # Iterate over all samples in batch to determine whether they're
                    # done.
                    for i in range(batch_size):
                        if done[i]:
                            continue

                        token = atomizer.decoder[indices[i]]
                        samples_in_progress[i].append(token)
                        if sampler.SampleIsComplete(samples_in_progress[i]):
                            end_time = labdate.MillisecondsTimestamp()
                            done[i] = 1
                            sample = model_pb2.Sample(
                                text=''.join(samples_in_progress[i]),
                                sample_start_epoch_ms_utc=start_time,
                                sample_time_ms=end_time - start_time,
                                wall_time_ms=end_time - wall_time_start,
                                num_tokens=len(samples_in_progress[i]))
                            print(f'=== BEGIN CLGEN SAMPLE {sample_count} '
                                  f'===\n\n{sample.text}\n')
                            sample_count += 1
                            sample_id = crypto.sha256_str(sample.text)
                            sample_path = sample_dir / f'{sample_id}.pbtxt'
                            pbutil.ToFile(sample, sample_path)
                            if min_num_samples > 0:
                                samples.append(sample)
                            wall_time_start = labdate.MillisecondsTimestamp()

                    # Complete the batch.
                    if done.all():
                        break

                # Complete sampling. Note that sample_count starts at 1.
                if sample_count > min_num_samples:
                    now = labdate.MillisecondsTimestamp()
                    logging.info(
                        'Produced %s samples at a rate of %s ms / sample.',
                        humanize.intcomma(len(samples)),
                        humanize.intcomma(
                            int((now - sample_start_time) /
                                max(len(samples), 1))))
                    break

        return samples