Example #1
  def initOrGetQueue(self) -> np.array:
    """
    If feed queue is not initialized, initialize it by getting new datapoint.
    Otherwise, don't do anything as feed_queue is already loaded from checkpoint.
    Adds datapoint to InputFeed table of database.

    Returns:
      Starting input feed of sampling.
    """
    if not self.feed_queue:
      try:
        cf = next(self.loader).squeeze(0)
      except StopIteration:
        self.loader = iter(self.dataloader)
        cf = next(self.loader).squeeze(0)
      cf = [int(x) for x in cf]
      self.feed_queue.append(
        ActiveSampleFeed(
          input_feed     = cf,
          input_features = extractor.ExtractFeatures(self.tokenizer.ArrayToCode(cf), [self.feat_sampler.feature_space])[self.feat_sampler.feature_space],
          input_score    = math.inf,
          gen_id         = 0,
        )
      )
      if environment.WORLD_RANK == 0:
        self.addToDB(
          active_feed_database.ActiveInput.FromArgs(
            tokenizer      = self.tokenizer, id = self.active_db.input_count,
            input_feed     = cf, input_features = self.feed_queue[-1].input_features,
          )
        )
    l.logger().info("Feed queue input scores: {}".format(', '.join([str(round(c.input_score, 3)) for c in self.feed_queue])))
    return self.feed_queue[0].input_feed
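The try/except StopIteration block above is what lets the feed queue keep drawing datapoints after the underlying dataloader is exhausted. A minimal, self-contained sketch of the same restart pattern, using a plain PyTorch DataLoader and a toy dataset instead of the project's own generator:

import torch
from torch.utils.data import DataLoader, TensorDataset

dataset    = TensorDataset(torch.arange(16).unsqueeze(1))
dataloader = DataLoader(dataset, batch_size = 1, shuffle = True)
loader     = iter(dataloader)

def next_datapoint():
  """Return the next datapoint, restarting the iterator once it is exhausted."""
  global loader
  try:
    (batch,) = next(loader)
  except StopIteration:
    loader = iter(dataloader)  # re-create the iterator and keep sampling
    (batch,) = next(loader)
  return [int(x) for x in batch.squeeze(0)]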
Example #2
def text_candidate_worker(sample       : np.array,
                          # feed         : np.array,
                          feat_sampler : feature_sampler.EuclideanSampler,
                          tokenizer    : typing.TypeVar('corpuses.tokenizers.TokenizerBase'),
                          ) -> typing.Tuple[bool, ActiveSample]:
  sample, sample_indices, input_ids, mlm_lengths, feed = sample
  try:
    code = tokenizer.ArrayToCode(sample, with_formatting = False)
    _ = opencl.Compile(code)
    features = extractor.ExtractFeatures(code, [feat_sampler.feature_space])[feat_sampler.feature_space]
    if features:
      return (True, ActiveSample(
        sample_feed = feed,
        sample      = sample,
        sample_indices = [x for x in sample_indices if x != tokenizer.padToken],
        input_ids      = [x for x in input_ids if x != tokenizer.padToken],
        hole_lengths   = mlm_lengths,
        sample_indices_size = len([x for x in sample_indices if x != tokenizer.padToken]),
        features       = features,
        score          = feat_sampler.calculate_distance(features),
      ))
  except ValueError:
    pass
  except Exception as e:
    raise e
  return (False, ActiveSample(
    sample_feed = feed,
    sample      = sample,
    sample_indices = [x for x in sample_indices if x != tokenizer.padToken],
    input_ids      = [x for x in input_ids if x != tokenizer.padToken],
    hole_lengths   = mlm_lengths,
    sample_indices_size = len([x for x in sample_indices if x != tokenizer.padToken]),
    features       = {},
    score          = math.inf,
  ))
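A worker like this is normally fanned out over a batch of candidates from a process pool; the sketch below shows one way to drive it (the project itself may parallelise differently), assuming `candidates`, `feat_sampler` and `tokenizer` are already constructed as in the function above:

import functools
import multiprocessing

# Bind the shared arguments once so the pool only ships per-candidate data.
worker = functools.partial(
  text_candidate_worker,
  feat_sampler = feat_sampler,
  tokenizer    = tokenizer,
)
with multiprocessing.Pool() as pool:
  results = pool.map(worker, candidates)
# Keep only candidates that compiled and produced a non-empty feature vector.
passing = [active_sample for ok, active_sample in results if ok]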
Example #3
def benchmark_worker(benchmark, feature_space, reduced_git_corpus):
    p, k, h = benchmark
    features = extractor.ExtractFeatures(k, [feature_space],
                                         header_file=h,
                                         use_aux_headers=False)
    closest_git = sorted(
        [(cf, calculate_distance(fts, features[feature_space], feature_space))
         for cf, fts in reduced_git_corpus],
        key=lambda x: x[1])[0]
    if features[feature_space] and closest_git[1] > 0:
        return Benchmark(p, p.name, k, features[feature_space])
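`calculate_distance` is used here as a scalar distance between two feature dictionaries in the chosen feature space. A minimal Euclidean stand-in is sketched below; the project's actual implementation may weight or normalise features differently:

import math

def euclidean_feature_distance(a: dict, b: dict) -> float:
  """Euclidean distance between two {feature_name: value} dictionaries."""
  keys = set(a) | set(b)
  return math.sqrt(sum((a.get(k, 0.0) - b.get(k, 0.0)) ** 2 for k in keys))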
Example #4
def ExtractAndCalculate(src_incl: typing.Tuple[str, str],
                        target_features: typing.Dict[str, float],
                        feature_space: str) -> typing.Optional[typing.Tuple[str, str, float]]:
    """
    Extract features for source code and calculate its distance from the target.

    Returns:
      Tuple of (source, include, distance), or None if feature extraction failed.
    """
    src, incl = src_incl
    f = extractor.ExtractFeatures(
        src, [feature_space],
        header_file=incl,
        extra_args=[
            "-include{}".format(
                pathlib.Path(environment.CLSMITH_INCLUDE) / "CLSmith.h")
        ] if incl else [""])
    if feature_space in f and f[feature_space]:
        return src, incl, feature_sampler.calculate_distance(
            f[feature_space], target_features, feature_space)
    return None
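A helper like this is typically mapped over a corpus of (source, include) pairs and the results ranked by distance. A hedged sketch of that use, with `corpus`, `target_features` and `feature_space` assumed to be defined as in the function above:

results = [ExtractAndCalculate((src, incl), target_features, feature_space)
           for src, incl in corpus]
# Drop sources whose feature extraction failed, then rank by ascending distance.
ranked = sorted((r for r in results if r is not None), key = lambda r: r[2])
closest_src, closest_incl, closest_distance = ranked[0]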
Example #5
    def _Train(
        self,
        corpus,
        test_sampler: typing.Optional[samplers.Sampler],
    ) -> None:
        """Core training function"""
        if not self.is_trained:

            train_input_fn = self.train.data_generator.generateTfDataset(
                sequence_length=self.config.training.sequence_length,
                num_cpu_threads=os.cpu_count(),
                use_tpu=FLAGS.use_tpu,
                is_training=True)

            l.logger().info(
                "Splitting {} steps into {} equivalent epochs, {} steps each. Rejected {} redundant step(s)"
                .format(
                    self.num_train_steps, self.num_epochs,
                    self.steps_per_epoch,
                    self.config.training.num_train_steps -
                    self.num_train_steps))
            try:
                if FLAGS.sample_per_epoch == 0:
                    self.train.estimator.train(input_fn=train_input_fn,
                                               max_steps=self.num_train_steps)
                else:
                    sampler, observers = self._getTestSampler(
                        test_sampler, self.config.training.sequence_length)
                    self.InitSampling(sampler,
                                      self.config.training.random_seed)
                    for ep in range(self.num_epochs):
                        self.train.estimator.train(input_fn=train_input_fn,
                                                   steps=self.steps_per_epoch)
                        for _ in range(FLAGS.sample_per_epoch):
                            start_time = datetime.datetime.utcnow()
                            self.InitSampleBatch()
                            sample_batch, sample_indices = self.SampleNextIndices(
                            )
                            end_time = datetime.datetime.utcnow()
                            for sample, sind in zip(sample_batch,
                                                    sample_indices):

                                try:
                                    stdout = opencl.Compile(
                                        self.tokenizer.ArrayToCode(sample))
                                    compile_flag = 1
                                except ValueError:
                                    compile_flag = 0

                                feature_vector = extractor.ExtractFeatures(
                                    self.tokenizer.ArrayToCode(sample))
                                sample_proto = model_pb2.Sample(
                                    train_step=(ep + 1) * self.steps_per_epoch,
                                    sample_feed=sampler.start_text,
                                    text=self.tokenizer.tokensToString(
                                        sample,
                                        ignore_token=self.tokenizer.padToken).
                                    replace("\\n", "\n"),
                                    encoded_text=",".join(
                                        [str(t) for t in sample]),
                                    sample_indices='\n'.join([
                                        self.tokenizer.tokensToString(
                                            mind).replace('\n', '\\n')
                                        for mind in sind
                                    ]),
                                    encoded_sample_indices='\n'.join([
                                        ','.join([str(x) for x in mind])
                                        for mind in sind
                                    ]),
                                    sample_time_ms=int(
                                        round(1000 * ((end_time - start_time) /
                                                      sampler.batch_size
                                                      ).total_seconds())),
                                    feature_vector="\n".join([
                                        "{}:{}".format(k, v)
                                        for (k, v) in feature_vector.items()
                                    ]),
                                    num_tokens=len(sample),
                                    compile_status=compile_flag,
                                    categorical_sampling=self.
                                    samplesWithCategorical(),
                                    date_added=datetime.datetime.utcnow(
                                    ).strftime("%m/%d/%Y, %H:%M:%S"),
                                )
                                for obs in observers:
                                    obs.OnSample(sample_proto)
            except KeyboardInterrupt:
                pass
            if not FLAGS.force_eval:
                self.Validate()

        if FLAGS.force_eval and not self.is_validated:
            self.Validate()
        # self.telemetry.TfRecordEpochs()
        return
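The `sample_time_ms` field above is simply the wall-clock time of the whole sampling batch divided evenly across its samples. A standalone sketch of that arithmetic, with a placeholder batch size in place of `sampler.batch_size`:

import datetime

start_time = datetime.datetime.utcnow()
# ... sampling would happen here ...
end_time   = datetime.datetime.utcnow()

batch_size = 32  # placeholder; the code above uses sampler.batch_size
sample_time_ms = int(round(1000 * ((end_time - start_time) / batch_size).total_seconds()))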
Example #6
  def Train(self,
            corpus,
            test_sampler : typing.Optional[samplers.Sampler] = None,
            pre_train    : bool = False,
            **unused_kwargs
            ) -> None:
    """
    Main training entry point.
    """
    self._ConfigTrainParams(
      torchLMDataGenerator.TrainMaskLMBatchGenerator(
        corpus, self.config.training,
        self.cache.path,
        self.config.training.num_pretrain_steps if pre_train else None,
        pre_train,
        self.feature_encoder,
        self.feature_tokenizer,
        self.feature_sequence_length,
      ), pre_train
    )

    if FLAGS.only_sample:
      return
      
    self.current_step = self.loadCheckpoint(self.train, pre_train = pre_train)
    if self.pytorch.num_gpus > 0:
      self.torch.cuda.empty_cache()
    if self.current_step >= 0:
      l.logger().info("Loaded checkpoint step {}".format(self.current_step))
    self.current_step = max(0, self.current_step)

    if self.current_step < self.num_train_steps:
      self.train.model.zero_grad()

      ## Set batch size in case of TPU training or distributed training.
      if self.torch_tpu_available:
        total_train_batch_size = self.train_batch_size * self.pytorch.torch_xla.xrt_world_size()
      else:
        total_train_batch_size = (
          self.train_batch_size
          * (self.torch.distributed.get_world_size() if self.pytorch.num_nodes > 1 else 1)
        )

      # Set dataloader in case of TPU training.
      if self.torch_tpu_available:
        loader = self.pytorch.torch_ploader.ParallelLoader(
                            self.train.data_generator.dataloader, [self.pytorch.device]
                          ).per_device_loader(self.pytorch.device)
      else:
        loader = self.train.data_generator.dataloader

      # Get dataloader iterator and setup hooks.
      batch_iterator = iter(loader)
      if self.is_world_process_zero():
        train_hook = hooks.tensorMonitorHook(
          self.logfile_path if not pre_train else self.pre_logfile_path, self.current_step, min(self.steps_per_epoch, FLAGS.monitor_frequency)
        )
      if FLAGS.reward_compilation >= 0 and not pre_train:
        correct_sample_obs = sample_observers.SamplesDatabaseObserver(
          self.logfile_path / "correct_samples.db"
        )
      else:
        correct_sample_obs = None
      
      total_steps = self.config.training.num_pretrain_steps if pre_train else self.config.training.num_train_steps
      l.logger().info(
        "Splitting {} steps into {} equivalent epochs, {} steps each. Rejected {} redundant step(s)".format(
          self.num_train_steps, self.num_epochs, 
          self.steps_per_epoch, total_steps - self.num_train_steps
        )
      )
      try:
        self.train.model.train()
        epoch_iter = tqdm.auto.trange(self.num_epochs, desc="Epoch", leave = False) if self.is_world_process_zero() else range(self.num_epochs)
        for epoch in epoch_iter:

          # In distributed mode, calling the set_epoch() method at
          # the beginning of each epoch before creating the DataLoader iterator
          # is necessary to make shuffling work properly across multiple epochs.
          # Otherwise, the same ordering will be always used.
          if self.pytorch.num_nodes > 1:
            loader.sampler.set_epoch(epoch)

          if epoch < self.current_step // self.steps_per_epoch:
            continue # Skip epochs already completed per the checkpoint; the progress bar cannot resume mid-way.

          batch_iter = tqdm.auto.trange(self.steps_per_epoch, desc="Batch", leave = False) if self.is_world_process_zero() else range(self.steps_per_epoch)
          for step in batch_iter:
            if self.is_world_process_zero():
              start = datetime.datetime.utcnow()
            try:
              inputs = next(batch_iterator)
            except StopIteration:
              # dataloader has different len() than steps_per_epoch.
              # This is the easiest way to infinite-loop dataloaders in pytorch.
              batch_iterator = iter(loader)
              inputs = next(batch_iterator)

            self.current_step += 1
            # Move inputs to torch device.
            inputs     = self.to_device(inputs)
            # Run model step on batch
            step_out   = self.model_step(self.train.model, inputs, step = epoch * self.steps_per_epoch + step)
            # Collect losses and backpropagate
            total_loss = step_out['total_loss'].mean()
            total_loss.backward()

            self.torch.nn.utils.clip_grad_norm_(self.train.model.parameters(), self.max_grad_norm)
            if self.torch_tpu_available:
              self.pytorch.torch_xla.optimizer_step(self.train.optimizer)
            else:
              self.train.optimizer.step()
            self.train.scheduler.step()

            ## Collect tensors for logging.
            if self.pytorch.num_nodes > 1:
              total_loss         = [self.torch.zeros(tuple(step_out['total_loss'        ].shape), dtype = self.torch.float32).to(self.pytorch.device) for _ in range(self.torch.distributed.get_world_size())]
              masked_lm_loss     = [self.torch.zeros(tuple(step_out['masked_lm_loss'    ].shape), dtype = self.torch.float32).to(self.pytorch.device) for _ in range(self.torch.distributed.get_world_size())]
              # next_sentence_loss = [self.torch.zeros(tuple(step_out['next_sentence_loss'].shape), dtype = self.torch.float32).to(self.pytorch.device) for _ in range(self.torch.distributed.get_world_size())]
              masked_lm_lengths  = [self.torch.zeros(tuple(inputs  ['masked_lm_lengths' ].shape), dtype = self.torch.int64  ).to(self.pytorch.device) for _ in range(self.torch.distributed.get_world_size())]

              self.torch.distributed.all_gather(masked_lm_loss,     step_out["masked_lm_loss"])
              # self.torch.distributed.all_gather(next_sentence_loss, step_out["next_sentence_loss"])
              self.torch.distributed.all_gather(masked_lm_lengths,  inputs['masked_lm_lengths'].to(self.pytorch.device))
              self.torch.distributed.all_gather(total_loss,         step_out['total_loss'])
            else:
              total_loss         = step_out['total_loss'        ].unsqueeze(0).cpu()
              masked_lm_loss     = step_out['masked_lm_loss'    ].unsqueeze(0).cpu()
              # next_sentence_loss = step_out['next_sentence_loss'].unsqueeze(0).cpu()
              masked_lm_lengths  = inputs['masked_lm_lengths' ].cpu()

            if self.is_world_process_zero():
              exec_time_ms = int(round((datetime.datetime.utcnow() - start).total_seconds() * 1000))
              if FLAGS.reward_compilation >= 0 and FLAGS.reward_compilation <= epoch * self.steps_per_epoch + step and not pre_train:
                ## Logging when compiler reward is enabled in training.
                ## Not compatible with DDP; compiler-rewarded training is deprecated, having proven incorrect and inefficient.
                correct_samples = [(x, y) for en, (x, y) in enumerate(zip(inputs['input_ids'].cpu().numpy(), step_out['generated_samples'].cpu().numpy())) if step_out['compile_status'][en] == 1]
                for s in correct_samples:
                  feature_vector = extractor.ExtractFeatures(self.tokenizer.ArrayToCode(s[1]))
                  correct_sample_obs.OnSample(model_pb2.Sample(
                      train_step             = self.current_step,
                      sample_feed            = self.tokenizer.tokensToString(s[0], ignore_token = self.tokenizer.padToken).replace("\\n", "\n"),
                      text                   = self.tokenizer.tokensToString(s[1], ignore_token = self.tokenizer.padToken).replace("\\n", "\n"),
                      encoded_text           = ",".join([str(t) for t in s[1]]),
                      sample_indices         = '',
                      encoded_sample_indices = '',
                      sample_time_ms         = int(round(exec_time_ms / self.train_batch_size)),
                      feature_vector         = "\n".join(["{}:{}".format(k, v) for (k, v) in feature_vector.items()]),
                      num_tokens             = len([x for x in s[1] if x != self.tokenizer.padToken]),
                      categorical_sampling   = False,
                      compile_status         = True,
                      date_added             = datetime.datetime.utcnow().strftime("%m/%d/%Y, %H:%M:%S"),
                    )
                  )
              if not pre_train:
                ## Fine-tuning logging.
                train_hook.step(
                  masked_lm_loss          = sum([ml.mean().item() for ml in masked_lm_loss]) / len(masked_lm_loss),
                  # next_sentence_loss      = sum([nsl.mean().item() for nsl in next_sentence_loss]) / len(next_sentence_loss),
                  total_loss              = sum([tl.mean().item() for tl in total_loss]) / len(total_loss),
                  learning_rate           = self.train.scheduler.get_last_lr()[0],
                  num_correct_samples     = (correct_sample_obs.sample_id if correct_sample_obs is not None else None),
                  batch_avg_hole_len      = sum([sum([int(l) for l in b if l != -1]) / len([int(l) for l in b if l != -1])
                                                 for b in masked_lm_lengths]) / len(masked_lm_lengths),
                  batch_execution_time_ms = exec_time_ms,
                  time_per_sample_ms      = exec_time_ms / self.train_batch_size,
                )
              else:
                ## Pre-training logging.
                train_hook.step(
                  masked_lm_loss          = sum([ml.mean().item() for ml in masked_lm_loss]) / len(masked_lm_loss),
                  # next_sentence_loss      = sum([nsl.mean().item() for nsl in next_sentence_loss]) / len(next_sentence_loss),
                  total_loss              = sum([tl.mean().item() for tl in total_loss]) / len(total_loss),
                  learning_rate           = self.train.scheduler.get_last_lr()[0],
                  batch_avg_hole_len      = sum([sum([int(l) for l in b if l != -1]) / len([int(l) for l in b if l != -1])
                                                 for b in masked_lm_lengths]) / len(masked_lm_lengths),
                  batch_execution_time_ms = exec_time_ms,
                  time_per_sample_ms      = exec_time_ms / self.train_batch_size,
                )
            self.train.model.zero_grad()
            if self.current_step == 0:
              l.logger().info("Starting Loss: {}".format(sum([tl.mean().item() for tl in total_loss]) / len(total_loss)))

          # End of Epoch
          self.saveCheckpoint(self.train, pre_train)
          if self.is_world_process_zero():
            set_mail = "Epoch {} Loss: {}\n".format(self.current_step // self.steps_per_epoch, train_hook.epoch_loss)
            l.logger().info("Epoch {} Loss: {}".format(self.current_step // self.steps_per_epoch, train_hook.epoch_loss))

          if self.pytorch.num_nodes > 1:
            loader.sampler.set_epoch(epoch)

          if FLAGS.validate_per_epoch and self.train.data_generator.config.validation_split > 0:
            val_ml_loss = self.Validate(per_epoch = True, pre_train = pre_train)
            if self.is_world_process_zero():
              train_hook.end_epoch(
                val_masked_lm_loss = val_ml_loss,
                # val_next_sentence_loss = val_nsp_loss,
                val_total_loss     = val_ml_loss,  # + val_nsp_loss
              )
              set_mail += "Validation Loss: {}\n".format(val_ml_loss)
          elif self.is_world_process_zero():
            train_hook.end_epoch()

          if FLAGS.notify_me and self.is_world_process_zero():
            client.getClient().send_message("clgen:torch_bert", set_mail)

          if self.torch_tpu_available:
            self.pytorch.torch_xla.master_print(self.pytorch.torch_xla_met.metrics_report())

          if FLAGS.sample_per_epoch > 0:
            sampler, observers = self._getTestSampler(test_sampler, self.config.training.sequence_length)
            self.InitSampling(sampler, self.config.training.random_seed)
            for _ in range(FLAGS.sample_per_epoch):
              start_time   = datetime.datetime.utcnow()
              self.InitSampleBatch(sampler)
              org_inputs, input_ids, samples, indices = self.SampleNextIndices()
              end_time = datetime.datetime.utcnow()
              for org, inp, sample, idxs in zip(org_inputs, input_ids, samples, indices):
                try:
                  stdout = opencl.Compile(self.tokenizer.ArrayToCode(sample))
                  compile_flag = 1
                except ValueError:
                  compile_flag = 0

                feature_vector = extractor.ExtractFeatures(self.tokenizer.ArrayToCode(sample))
                sample_proto = model_pb2.Sample(
                  train_step             = self.current_step,
                  sample_feed            = sampler.start_text,
                  original_input         = self.tokenizer.tokensToString(org,    with_formatting = True, ignore_token = self.tokenizer.padToken),
                  text                   = self.tokenizer.tokensToString(sample, with_formatting = True, ignore_token = self.tokenizer.padToken).replace("\\n", "\n"),
                  encoded_text           = ",".join([str(t) for t in sample]),
                  sample_indices         = ','.join([self.tokenizer.decoder[idx].replace('\n', '\\n') for idx in idxs]).replace('\n', '\\n'),
                  encoded_sample_indices = ','.join([str(idx) for idx in idxs]),
                  sample_time_ms         = int(round(1000 * ((end_time - start_time) / sampler.batch_size).total_seconds())),
                  feature_vector         = "\n".join(["{}:{}".format(k, v) for (k, v) in feature_vector.items()]),
                  num_tokens             = len(sample),
                  compile_status         = compile_flag,
                  categorical_sampling   = self.samplesWithCategorical(),
                  date_added             = datetime.datetime.utcnow().strftime("%m/%d/%Y, %H:%M:%S"),
                )
                for obs in observers:
                  obs.OnSample(sample_proto)
      except KeyboardInterrupt:
        pass

      if not FLAGS.force_eval:
        _ = self.Validate(pre_train = pre_train)

    if FLAGS.force_eval and not self.is_validated:
      _ = self.Validate(pre_train = pre_train)
    return
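The `all_gather` bookkeeping in the logging section above reduces per-rank loss tensors to a single scalar for the monitor hook. Stripped of the project specifics, the underlying pattern looks like this (a sketch that assumes `torch.distributed` has already been initialised with the default process group):

import torch
import torch.distributed as dist

def gather_mean_loss(local_loss: torch.Tensor) -> float:
  """Average a per-rank loss tensor across all ranks, for logging only."""
  world_size = dist.get_world_size()
  buckets = [torch.zeros_like(local_loss) for _ in range(world_size)]
  dist.all_gather(buckets, local_loss)  # fills `buckets` in place with every rank's tensor
  return sum(b.mean().item() for b in buckets) / world_size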