Example #1
def test_Compile_small_program():
    """Test that Compile accepts a small program."""
    assert opencl.Compile("""
kernel void A(global int*a ) {
  a[get_global_id(0)] = 0;
}
""") == """
Example #2
def text_candidate_worker(sample       : np.ndarray,
                          # feed         : np.array,
                          feat_sampler : feature_sampler.EuclideanSampler,
                          tokenizer    : typing.TypeVar('corpuses.tokenizers.TokenizerBase'),
                          ) -> typing.Tuple[bool, ActiveSample]:
  sample, sample_indices, input_ids, mlm_lengths, feed = sample
  try:
    code = tokenizer.ArrayToCode(sample, with_formatting = False)
    _ = opencl.Compile(code)
    features = extractor.ExtractFeatures(code, [feat_sampler.feature_space])[feat_sampler.feature_space]
    if features:
      return (True, ActiveSample(
        sample_feed = feed,
        sample      = sample,
        sample_indices = [x for x in sample_indices if x != tokenizer.padToken],
        input_ids      = [x for x in input_ids if x != tokenizer.padToken],
        hole_lengths   = mlm_lengths,
        sample_indices_size = len([x for x in sample_indices if x != tokenizer.padToken]),
        features       = features,
        score          = feat_sampler.calculate_distance(features),
      ))
  except ValueError:
    pass
  except Exception:
    raise
  return (False, ActiveSample(
    sample_feed = feed,
    sample      = sample,
    sample_indices = [x for x in sample_indices if x != tokenizer.padToken],
    input_ids      = [x for x in input_ids if x != tokenizer.padToken],
    hole_lengths   = mlm_lengths,
    sample_indices_size = len([x for x in sample_indices if x != tokenizer.padToken]),
    features       = {},
    score          = math.inf,
  ))
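
A hypothetical driver sketch for text_candidate_worker, mirroring the multiprocessing.Pool and imap_unordered pattern of Example #17 below. The samples iterable, feat_sampler and tokenizer names are assumptions standing in for real model outputs and project objects.

import functools
import multiprocessing

f = functools.partial(
  text_candidate_worker,
  feat_sampler = feat_sampler,
  tokenizer    = tokenizer,
)
with multiprocessing.Pool() as pool:
  # Each element of `samples` is the 5-tuple the worker unpacks.
  results = list(pool.imap_unordered(f, samples))
# Keep only candidates that compiled and produced a feature vector.
accepted = [s for ok, s in results if ok]
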
Example #3
    def checkIfBatchCompiles(self, sample: np.ndarray) -> int:
        """Sends a filled sequence to the compiler."""
        try:
            _ = opencl.Compile(self.tokenizer.ArrayToCode(sample))
            return 1
        except ValueError:
            return 0
Example #4
def test_Compile_missing_shim_define():
  """Test that Compile rejects a program which depends on the shim header."""
  with test.Raises(errors.ClangException):
    opencl.Compile(
      """
kernel void A(global FLOAT_T* a) {}
"""
    )
Example #5
def test_Compile_undefined_variable():
    """Test that Compile rejects a program with an undefined variable."""
    with pytest.raises(errors.ClangException) as e_info:
        opencl.Compile("""
kernel void A(global int* a) {
  undefined_variable;
}
""")
    assert 'use of undeclared identifier' in str(e_info.value)
Example #6
def test_Compile_undefined_function():
    """Test that Compile rejects a program with an undefined function."""
    with pytest.raises(errors.ClangException) as e_info:
        opencl.Compile("""
kernel void A(global int* a) {
  undefined_function(a);
}
""")
    assert 'implicit declaration of function' in str(e_info.value)
Example #7
def test_Compile_user_define():
  """Test that Compile accepts a program with a custom #define."""
  assert (
    opencl.Compile(
      """
#define FLOAT_T float
kernel void A(global FLOAT_T* a) {}
"""
    )
    == """
#define FLOAT_T float
kernel void A(global FLOAT_T* a) {}
"""
  )
Example #8
def test_Compile_syntax_error():
    """Test that Compile rejects a program with invalid syntax."""
    with pytest.raises(errors.ClangException) as e_info:
        opencl.Compile("kernel void A2@@1!!!#")
    assert 'error: ' in str(e_info.value)
Example #9
def test_Compile_empty_input():
    """Test that Compile accepts an empty input."""
    assert opencl.Compile('') == ''
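
The three rejection tests above (Examples #5, #6 and #8) share one shape. A minimal consolidation sketch follows, assuming the same opencl and errors modules imported by the surrounding tests; the test name is an assumption.

import pytest

@pytest.mark.parametrize("src,expected_error", [
    # Undefined variable (Example #5).
    ("kernel void A(global int* a) {\n  undefined_variable;\n}\n",
     "use of undeclared identifier"),
    # Undefined function (Example #6).
    ("kernel void A(global int* a) {\n  undefined_function(a);\n}\n",
     "implicit declaration of function"),
    # Syntax error (Example #8).
    ("kernel void A2@@1!!!#", "error: "),
])
def test_Compile_rejects_invalid_program(src, expected_error):
    """Hypothetical parametrized variant of the rejection tests above."""
    with pytest.raises(errors.ClangException) as e_info:
        opencl.Compile(src)
    assert expected_error in str(e_info.value)
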
Example #10
    def _Train(
        self,
        corpus,
        test_sampler: typing.Optional[samplers.Sampler],
    ) -> None:
        """Core training function"""
        if not self.is_trained:

            train_input_fn = self.train.data_generator.generateTfDataset(
                sequence_length=self.config.training.sequence_length,
                num_cpu_threads=os.cpu_count(),
                use_tpu=FLAGS.use_tpu,
                is_training=True)

            l.logger().info(
                "Splitting {} steps into {} equivalent epochs, {} steps each. Rejected {} redundant step(s)"
                .format(
                    self.num_train_steps, self.num_epochs,
                    self.steps_per_epoch,
                    self.config.training.num_train_steps -
                    self.num_train_steps))
            try:
                if FLAGS.sample_per_epoch == 0:
                    self.train.estimator.train(input_fn=train_input_fn,
                                               max_steps=self.num_train_steps)
                else:
                    sampler, observers = self._getTestSampler(
                        test_sampler, self.config.training.sequence_length)
                    self.InitSampling(sampler,
                                      self.config.training.random_seed)
                    for ep in range(self.num_epochs):
                        self.train.estimator.train(input_fn=train_input_fn,
                                                   steps=self.steps_per_epoch)
                        for _ in range(FLAGS.sample_per_epoch):
                            start_time = datetime.datetime.utcnow()
                            self.InitSampleBatch()
                            sample_batch, sample_indices = self.SampleNextIndices()
                            end_time = datetime.datetime.utcnow()
                            for sample, sind in zip(sample_batch,
                                                    sample_indices):

                                try:
                                    stdout = opencl.Compile(
                                        self.tokenizer.ArrayToCode(sample))
                                    compile_flag = 1
                                except ValueError:
                                    compile_flag = 0

                                feature_vector = extractor.ExtractFeatures(
                                    self.tokenizer.ArrayToCode(sample))
                                sample_proto = model_pb2.Sample(
                                    train_step=(ep + 1) * self.steps_per_epoch,
                                    sample_feed=sampler.start_text,
                                    text=self.tokenizer.tokensToString(
                                        sample,
                                        ignore_token=self.tokenizer.padToken).
                                    replace("\\n", "\n"),
                                    encoded_text=",".join(
                                        [str(t) for t in sample]),
                                    sample_indices='\n'.join([
                                        self.tokenizer.tokensToString(
                                            mind).replace('\n', '\\n')
                                        for mind in sind
                                    ]),
                                    encoded_sample_indices='\n'.join([
                                        ','.join([str(x) for x in mind])
                                        for mind in sind
                                    ]),
                                    sample_time_ms=int(
                                        round(1000 * ((end_time - start_time) /
                                                      sampler.batch_size
                                                      ).total_seconds())),
                                    feature_vector="\n".join([
                                        "{}:{}".format(k, v)
                                        for (k, v) in feature_vector.items()
                                    ]),
                                    num_tokens=len(sample),
                                    compile_status=compile_flag,
                                    categorical_sampling=self.samplesWithCategorical(),
                                    date_added=datetime.datetime.utcnow().strftime("%m/%d/%Y, %H:%M:%S"),
                                )
                                for obs in observers:
                                    obs.OnSample(sample_proto)
            except KeyboardInterrupt:
                pass
            if not FLAGS.force_eval:
                self.Validate()

        if FLAGS.force_eval and not self.is_validated:
            self.Validate()
        # self.telemetry.TfRecordEpochs()
        return
Example #11
    def ShouldProceed(self,
                      sample_in_progress: typing.List[str],
                      force: bool = False) -> bool:
        """Determine if a partial sample should be used as the new rollback state.

    Args:
      sample_in_progress: A list of strings, where each string is a token. The
        last token must be ';'.

    Returns:
      True if sampling should proceed with the current partial sample, else
      False.
    """
        candidate_src = self.TryToCloseProgram(sample_in_progress)
        if not candidate_src:
            app.Log(
                4,
                "Failed to produce syntactically valid program from partial sample"
            )
            return False

        # Feature extractor reads from files.
        path = self.working_dir / "kernel.cl"
        fs.Write(path, candidate_src.encode("utf-8"))

        features = self.TryToExtractFeatures(path)
        if features is None:
            app.Log(4, "Failed to extract features from partial sample")
            return False

        # Grewe feature extractor is robust to code that doesn't compile (i.e. code
        # containing implicit declarations). Run the code through clang to check
        # if it actually compiles, else reject it. This is more expensive than the
        # feature extractor, so run it after.
        try:
            opencl.Compile(candidate_src)
        except clgen_errors.ClangException:
            app.Log(4, "Failed to compile partial sample")
            return False

        # Implement pure hill climbing approach to match a target feature vector.
        # When enabled, partial samples which increase the distance to the target
        # feature vector are rejected.
        if self._target_features is not None:
            new_feature_distance = scipy.spatial.distance.euclidean(
                features, self._target_features)
            app.Log(
                2,
                "Features: %s, distance=%f, norm=%f, delta=%f",
                features,
                new_feature_distance,
                new_feature_distance / self._init_feature_distance,
                new_feature_distance - self._previous_feature_distance,
            )
            if not force and new_feature_distance > self._previous_feature_distance:
                # This will only happen once feature values are greater than the
                # target feature values.
                app.Log(
                    2, "Rejecting candidate because of positive feature delta")
                return False
            if (not force
                    and new_feature_distance == self._previous_feature_distance
                    and random.random() > FLAGS.
                    experimental_clgen_backtracking_reject_no_progress_probability
                ):
                app.Log(2, "Randomly rejecting candidate with no progress")
                return False
            self._previous_features = features
            self._previous_src = candidate_src
            self._previous_feature_distance = new_feature_distance

        return True
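
A hypothetical usage sketch of ShouldProceed, following its docstring contract that sample_in_progress is a list of token strings whose last token is ';'. The backtracker instance name is an assumption standing in for an instance of the class above.

# Hypothetical usage; `backtracker` is an instance of the class above.
partial_sample = ["int", " ", "x", " ", "=", " ", "0", ";"]
if backtracker.ShouldProceed(partial_sample):
  # Accept the partial sample as the new rollback state.
  rollback_state = list(partial_sample)
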
Example #12
    def _SampleSeqBatch(
        self,
        sampler: 'samplers.Sampler',
        tokenizer: tokenizers.TokenizerBase,
        sample_observers: typing.List[sample_observers_lib.SampleObserver],
        epoch: int,
    ) -> typing.Tuple[bool, int]:
        """
    Run a single iteration of the batched sample inner-loop for sequential models.
    """

        start_time = datetime.datetime.utcnow()

        self.backend.InitSampleBatch(sampler)
        samples_in_progress = [
            sampler.tokenized_start_text.copy()
            for _ in range(sampler.batch_size)
        ]
        done = np.zeros(sampler.batch_size, dtype=bool)
        wall_time_start = start_time
        seq_count = 0

        # The first element of the returned tuple. If any of the sample_observers
        # return False, this value is set to False.
        continue_sampling = True

        # Sampling loop. Continues until all samples in the batch are done.
        while not done.all():
            indices = self.backend.SampleNextIndices(sampler, done)
            # Iterate over all samples in batch to determine whether they're
            # done.

            for i in range(len(indices)):
                if done[i]:
                    continue

                for index in indices[i]:
                    samples_in_progress[i].append(tokenizer.decoder[index])
                    step_ind = ""
                    encoded_step_indices = ""

                    if sampler.SampleIsComplete(samples_in_progress[i]):
                        end_time = datetime.datetime.utcnow()
                        sample_kernel = [x for x in samples_in_progress[i]]
                        features = extractor.ExtractRawFeatures(''.join(
                            samples_in_progress[i]))
                        done[i] = 1
                        try:
                            stdout = opencl.Compile(''.join(
                                samples_in_progress[i]))
                            compile_flag = True
                        except ValueError:
                            compile_flag = False

                        sample = model_pb2.Sample(
                            train_step=epoch,
                            text=''.join(samples_in_progress[i]),
                            sample_indices="",
                            encoded_sample_indices="",
                            sample_feed=sampler.start_text,
                            encoded_text=",".join([
                                str(tokenizer.vocab[x]) for x in sample_kernel
                            ]),
                            sample_start_epoch_ms_utc=int(
                                start_time.strftime("%s%f")),
                            sample_time_ms=int(
                                round(1000 *
                                      ((end_time - start_time) /
                                       sampler.batch_size).total_seconds())),
                            wall_time_ms=int(
                                round(1000 *
                                      ((end_time - wall_time_start) /
                                       sampler.batch_size).total_seconds())),
                            feature_vector=features,
                            num_tokens=len(samples_in_progress[i]),
                            compile_status=compile_flag,
                            categorical_sampling=self.backend.samplesWithCategorical(),
                            date_added=datetime.datetime.utcnow().strftime(
                                "%m/%d/%Y, %H:%M:%S"),
                        )
                        # Notify sample observers.
                        continue_sampling &= all(
                            [obs.OnSample(sample) for obs in sample_observers])
                        seq_count += 1
                        # Wall sample time is the difference between the end of the previous
                        # sample and the end of the current sample.
                        wall_time_start = datetime.datetime.utcnow()
                        break
        return continue_sampling, seq_count
Example #13
    def _SampleLMBatch(
        self,
        sampler: 'samplers.Sampler',
        tokenizer: tokenizers.TokenizerBase,
        sample_observers: typing.List[sample_observers_lib.SampleObserver],
        epoch: int,
    ) -> typing.Tuple[bool, int]:
        """
    Run a sampling iteration over BERT models.
    """
        start_time = datetime.datetime.utcnow()
        seq_count = 0
        self.backend.InitSampleBatch(sampler,
                                     workload_size=FLAGS.sample_workload_size)
        try:
            org_inputs, input_ids, samples, indices = self.backend.SampleNextIndices(
                sampler)
        except StopIteration:
            return False, seq_count

        if not samples:
            # An empty return means the model has not produced anything that can be stored.
            # This 'if' accommodates active sampling, which is very selective.
            return True, seq_count

        continue_sampling = True

        if environment.WORLD_RANK == 0:
            assert len(org_inputs) == len(input_ids) == len(samples) == len(
                indices), "Length mismatch, {}-{}-{}-{}".format(
                    len(org_inputs), len(input_ids), len(samples),
                    len(indices))
            for org, inp, sample, idxs in zip(org_inputs, input_ids, samples,
                                              indices):

                src = self.tokenizer.ArrayToCode(sample, with_formatting=True)
                try:
                    stdout = opencl.Compile(src)
                    compile_flag = True
                    features = extractor.ExtractRawFeatures(src)
                except ValueError:
                    compile_flag = False
                    features = ""

                end_time = datetime.datetime.utcnow()
                sample_proto = model_pb2.Sample(
                    train_step=epoch,
                    text=src,
                    sample_indices=','.join([
                        self.tokenizer.decoder[idx].replace('\n', '\\n')
                        for idx in idxs
                    ]).replace('\n', '\\n'),
                    encoded_sample_indices=','.join([str(idx)
                                                     for idx in idxs]),
                    original_input=self.tokenizer.tokensToString(
                        org,
                        with_formatting=False,
                        ignore_token=self.tokenizer.padToken),
                    sample_feed=self.tokenizer.tokensToString(
                        inp,
                        with_formatting=False,
                        ignore_token=self.tokenizer.padToken),
                    encoded_text=",".join([str(x) for x in sample]),
                    sample_start_epoch_ms_utc=int(start_time.strftime("%s%f")),
                    sample_time_ms=int(
                        round(1000 * ((end_time - start_time) /
                                      len(samples)).total_seconds())),
                    wall_time_ms=int(
                        round(1000 * ((end_time - start_time) /
                                      len(samples)).total_seconds())),
                    feature_vector=features,
                    num_tokens=np.where(
                        sample == self.tokenizer.padToken)[0][0]
                    if self.tokenizer.padToken in sample else len(sample),
                    compile_status=compile_flag,
                    categorical_sampling=self.backend.samplesWithCategorical(),
                    date_added=datetime.datetime.utcnow().strftime(
                        "%m/%d/%Y, %H:%M:%S"),
                )
                # Notify sample observers.
                continue_sampling &= all(
                    [obs.OnSample(sample_proto) for obs in sample_observers])
                seq_count += 1
            if environment.WORLD_SIZE > 1:
                distrib.write(str(continue_sampling))
        else:
            status = distrib.read()
            if status == "True":
                continue_sampling = True
            elif status == "False":
                continue_sampling = False
            else:
                raise OSError(
                    "Broken distributed message: '{}'".format(status))
        return continue_sampling, seq_count
Example #14
def write_eval_db(eval_db   : evaluate_cand_database.SearchCandidateDatabase,
                  tokenizer : "tokenizers.TokenizerBase",
                  samples   : typing.List[ActiveSample],
                  target_benchmark : typing.Tuple[str, str],
                  target_features  : typing.Dict[str, float],
                  gen_id    : int,
                  ) -> None:
  objs = {}
  # l.logger().warn("Before prep loop in eval db")
  for sample in samples:
    try:
      _ = opencl.Compile(tokenizer.ArrayToCode(sample.sample))
      compile_status = True
    except ValueError:
      compile_status = False

    sobj = evaluate_cand_database.SearchCandidate.FromArgs(
      tokenizer        = tokenizer,
      id               = eval_db.count,
      input_feed       = sample.sample_feed.input_feed,
      input_ids        = sample.input_ids,
      input_features   = sample.sample_feed.input_features,
      input_score      = sample.sample_feed.input_score,
      hole_lengths     = sample.hole_lengths,
      sample           = sample.sample,
      sample_indices   = sample.sample_indices,
      output_features  = sample.features,
      sample_score     = sample.score,
      target_benchmark = target_benchmark,
      target_features  = target_features,
      compile_status   = compile_status,
      generation_id    = gen_id,
    )
    if sobj.sha256 in objs:
      objs[sobj.sha256][1] += 1
    else:
      objs[sobj.sha256] = [sobj, 1]
  # l.logger().warn(eval_db.count)
  with eval_db.Session(commit = True) as session:
    offset_idx = 0
    for sha, obj in objs.items():
      entry = None
      try:
        entry = session.query(evaluate_cand_database.SearchCandidate).filter_by(sha256 = sha).first()
        if entry is not None:
          entry.frequency += obj[1]
        else:
          obj[0].frequency = obj[1]
          obj[0].id += offset_idx
          offset_idx += 1
          session.add(obj[0])
        session.commit()
      except Exception as e:
        l.logger().error(entry)
        if entry is not None:
          l.logger().error(entry.id)
          l.logger().error(entry.sha256)
        l.logger().error(sha)
        l.logger().error("count: {}".format(eval_db.count))
        l.logger().error("offset_idx: {}".format(offset_idx))
        l.logger().error(e)
  # l.logger().warn("Finished eval_DB thread")
  return
Example #15
def execute_clsmith(idx: int, tokenizer, timeout_seconds: int = 15) -> typing.Optional[typing.List[CLSmithSample]]:
  """
  Execute clsmith and return sample.
  """
  try:
    tdir = pathlib.Path(FLAGS.local_filesystem).resolve()
  except Exception:
    tdir = None

  extra_args = ["-include{}".format(pathlib.Path(CLSMITH_INCLUDE) / "CLSmith.h")]
  with tempfile.NamedTemporaryFile("w", prefix = "clsmith_", suffix = ".cl", dir = tdir) as f:
    cmd = [
      "timeout",
      "-s9",
      str(timeout_seconds),
      CLSMITH,
      "-o",
      str(f.name)
    ]
    process = subprocess.Popen(
      cmd,
      stdout = subprocess.PIPE,
      stderr = subprocess.PIPE,
      universal_newlines = True,
    )
    try:
      # The external `timeout` command kills CLSmith; communicate's timeout
      # guards the Python side with the same budget.
      stdout, stderr = process.communicate(timeout = timeout_seconds)
    except subprocess.TimeoutExpired:
      process.kill()
      return None

    with open(str(f.name), 'r') as infile:
      contentfile = infile.read()

  try:
    ks = opencl.ExtractSingleKernelsHeaders(
           opencl.StripDoubleUnderscorePrefixes(
               c.StripIncludes(contentfile),
           )
         )
  except ValueError as e:
    l.logger().error(contentfile)
    raise e

  samples = []
  for kernel, include in ks:
    encoded_sample = tokenizer.AtomizeString(kernel)
    try:
      stdout = opencl.Compile(kernel, header_file = include, extra_args = extra_args)
      compile_status = True
    except ValueError as e:
      stdout = str(e)
      compile_status = False

    samples.append(
      CLSmithSample.FromArgs(
        id             = idx,
        sample         = stdout,
        include        = include,
        encoded_sample = ','.join(encoded_sample),
        compile_status = compile_status,
        feature_vector = extractor.ExtractRawFeatures(kernel, header_file = include, extra_args = extra_args),
        num_tokens     = len(encoded_sample)
      )
    )
  return samples
Example #16
  def Train(self,
            corpus,
            test_sampler : typing.Optional[samplers.Sampler] = None,
            pre_train    : bool = False,
            **unused_kwargs
            ) -> None:
    """
    Main training entry point.
    """
    self._ConfigTrainParams(
      torchLMDataGenerator.TrainMaskLMBatchGenerator(
        corpus, self.config.training,
        self.cache.path,
        self.config.training.num_pretrain_steps if pre_train else None,
        pre_train,
        self.feature_encoder,
        self.feature_tokenizer,
        self.feature_sequence_length,
      ), pre_train
    )

    if FLAGS.only_sample:
      return
      
    self.current_step = self.loadCheckpoint(self.train, pre_train = pre_train)
    if self.pytorch.num_gpus > 0:
      self.torch.cuda.empty_cache()
    if self.current_step >= 0:
      l.logger().info("Loaded checkpoint step {}".format(self.current_step))
    self.current_step = max(0, self.current_step)

    if self.current_step < self.num_train_steps:
      self.train.model.zero_grad()

      ## Set batch size in case of TPU training or distributed training.
      if self.torch_tpu_available:
        total_train_batch_size = self.train_batch_size * self.pytorch.torch_xla.xrt_world_size()
      else:
        total_train_batch_size = (
          self.train_batch_size
          * (self.torch.distributed.get_world_size() if self.pytorch.num_nodes > 1 else 1)
        )

      # Set dataloader in case of TPU training.
      if self.torch_tpu_available:
        loader = self.pytorch.torch_ploader.ParallelLoader(
                            self.train.data_generator.dataloader, [self.pytorch.device]
                          ).per_device_loader(self.pytorch.device)
      else:
        loader = self.train.data_generator.dataloader

      # Get dataloader iterator and setup hooks.
      batch_iterator = iter(loader)
      if self.is_world_process_zero():
        train_hook = hooks.tensorMonitorHook(
          self.logfile_path if not pre_train else self.pre_logfile_path, self.current_step, min(self.steps_per_epoch, FLAGS.monitor_frequency)
        )
      if FLAGS.reward_compilation >= 0 and not pre_train:
        correct_sample_obs = sample_observers.SamplesDatabaseObserver(
          self.logfile_path / "correct_samples.db"
        )
      else:
        correct_sample_obs = None
      
      total_steps = self.config.training.num_pretrain_steps if pre_train else self.config.training.num_train_steps
      l.logger().info(
        "Splitting {} steps into {} equivalent epochs, {} steps each. Rejected {} redundant step(s)".format(
          self.num_train_steps, self.num_epochs, 
          self.steps_per_epoch, total_steps - self.num_train_steps
        )
      )
      try:
        self.train.model.train()
        epoch_iter = tqdm.auto.trange(self.num_epochs, desc="Epoch", leave = False) if self.is_world_process_zero() else range(self.num_epochs)
        for epoch in epoch_iter:

          # In distributed mode, calling the set_epoch() method at
          # the beginning of each epoch before creating the DataLoader iterator
          # is necessary to make shuffling work properly across multiple epochs.
          # Otherwise, the same ordering will be always used.
          if self.pytorch.num_nodes > 1:
            loader.sampler.set_epoch(epoch)

          if epoch < self.current_step // self.steps_per_epoch:
            continue # Skip epochs already covered by the checkpoint; the progress bar cannot resume mid-epoch.

          batch_iter = tqdm.auto.trange(self.steps_per_epoch, desc="Batch", leave = False) if self.is_world_process_zero() else range(self.steps_per_epoch)
          for step in batch_iter:
            if self.is_world_process_zero():
              start = datetime.datetime.utcnow()
            try:
              inputs = next(batch_iterator)
            except StopIteration:
              # dataloader has different len() than steps_per_epoch.
              # This is the easiest way to infinite-loop dataloaders in pytorch.
              batch_iterator = iter(loader)
              inputs = next(batch_iterator)

            self.current_step += 1
            # Move inputs to torch device.
            inputs     = self.to_device(inputs)
            # Run model step on batch
            step_out   = self.model_step(self.train.model, inputs, step = epoch * self.steps_per_epoch + step)
            # Collect losses and backpropagate
            total_loss = step_out['total_loss'].mean()
            total_loss.backward()

            self.torch.nn.utils.clip_grad_norm_(self.train.model.parameters(), self.max_grad_norm)
            if self.torch_tpu_available:
              self.pytorch.torch_xla.optimizer_step(self.train.optimizer)
            else:
              self.train.optimizer.step()
            self.train.scheduler.step()

            ## Collect tensors for logging.
            if self.pytorch.num_nodes > 1:
              total_loss         = [self.torch.zeros(tuple(step_out['total_loss'        ].shape), dtype = self.torch.float32).to(self.pytorch.device) for _ in range(self.torch.distributed.get_world_size())]
              masked_lm_loss     = [self.torch.zeros(tuple(step_out['masked_lm_loss'    ].shape), dtype = self.torch.float32).to(self.pytorch.device) for _ in range(self.torch.distributed.get_world_size())]
              # next_sentence_loss = [self.torch.zeros(tuple(step_out['next_sentence_loss'].shape), dtype = self.torch.float32).to(self.pytorch.device) for _ in range(self.torch.distributed.get_world_size())]
              masked_lm_lengths  = [self.torch.zeros(tuple(inputs  ['masked_lm_lengths' ].shape), dtype = self.torch.int64  ).to(self.pytorch.device) for _ in range(self.torch.distributed.get_world_size())]

              self.torch.distributed.all_gather(masked_lm_loss,     step_out["masked_lm_loss"])
              # self.torch.distributed.all_gather(next_sentence_loss, step_out["next_sentence_loss"])
              self.torch.distributed.all_gather(masked_lm_lengths,  inputs['masked_lm_lengths'].to(self.pytorch.device))
              self.torch.distributed.all_gather(total_loss,         step_out['total_loss'])
            else:
              total_loss         = step_out['total_loss'        ].unsqueeze(0).cpu()
              masked_lm_loss     = step_out['masked_lm_loss'    ].unsqueeze(0).cpu()
              # next_sentence_loss = step_out['next_sentence_loss'].unsqueeze(0).cpu()
              masked_lm_lengths  = inputs['masked_lm_lengths' ].cpu()

            if self.is_world_process_zero():
              exec_time_ms = int(round((datetime.datetime.utcnow() - start).total_seconds() * 1000))
              if 0 <= FLAGS.reward_compilation <= epoch * self.steps_per_epoch + step and not pre_train:
                ## Logging when compiler reward is enabled in training.
                ## This is not compatible with using DDP, and basically compiler-rewarded training is deprecated and proven to be wrong and inefficient.
                correct_samples = [(x, y) for en, (x, y) in enumerate(zip(inputs['input_ids'].cpu().numpy(), step_out['generated_samples'].cpu().numpy())) if step_out['compile_status'][en] == 1]
                for s in correct_samples:
                  feature_vector = extractor.ExtractFeatures(self.tokenizer.ArrayToCode(s[1]))
                  correct_sample_obs.OnSample(model_pb2.Sample(
                      train_step             = self.current_step,
                      sample_feed            = self.tokenizer.tokensToString(s[0], ignore_token = self.tokenizer.padToken).replace("\\n", "\n"),
                      text                   = self.tokenizer.tokensToString(s[1], ignore_token = self.tokenizer.padToken).replace("\\n", "\n"),
                      encoded_text           = ",".join([str(t) for t in s[1]]),
                      sample_indices         = '',
                      encoded_sample_indices = '',
                      sample_time_ms         = int(round(exec_time_ms / self.train_batch_size)),
                      feature_vector         = "\n".join(["{}:{}".format(k, v) for (k, v) in feature_vector.items()]),
                      num_tokens             = len([x for x in s[1] if x != self.tokenizer.padToken]),
                      categorical_sampling   = False,
                      compile_status         = True,
                      date_added             = datetime.datetime.utcnow().strftime("%m/%d/%Y, %H:%M:%S"),
                    )
                  )
              if not pre_train:
                ## Fine-tuning logging.
                train_hook.step(
                  masked_lm_loss          = sum([ml.mean().item() for ml in masked_lm_loss]) / len(masked_lm_loss),
                  # next_sentence_loss      = sum([nsl.mean().item() for nsl in next_sentence_loss]) / len(next_sentence_loss),
                  total_loss              = sum([tl.mean().item() for tl in total_loss]) / len(total_loss),
                  learning_rate           = self.train.scheduler.get_last_lr()[0],
                  num_correct_samples     = (correct_sample_obs.sample_id if correct_sample_obs is not None else None),
                  batch_avg_hole_len      = sum([sum([int(l) for l in b if l != -1]) / len([int(l) for l in b if l != -1])
                                                 for b in masked_lm_lengths]) / len(masked_lm_lengths),
                  batch_execution_time_ms = exec_time_ms,
                  time_per_sample_ms      = exec_time_ms / self.train_batch_size,
                )
              else:
                ## Pre-training logging.
                train_hook.step(
                  masked_lm_loss          = sum([ml.mean().item() for ml in masked_lm_loss]) / len(masked_lm_loss),
                  # next_sentence_loss      = sum([nsl.mean().item() for nsl in next_sentence_loss]) / len(next_sentence_loss),
                  total_loss              = sum([tl.mean().item() for tl in total_loss]) / len(total_loss),
                  learning_rate           = self.train.scheduler.get_last_lr()[0],
                  batch_avg_hole_len      = sum([sum([int(l) for l in b if l != -1]) / len([int(l) for l in b if l != -1])
                                                 for b in masked_lm_lengths]) / len(masked_lm_lengths),
                  batch_execution_time_ms = exec_time_ms,
                  time_per_sample_ms      = exec_time_ms / self.train_batch_size,
                )
            self.train.model.zero_grad()
            if self.current_step == 0:
              l.logger().info("Starting Loss: {}".format(sum([tl.mean().item() for tl in total_loss]) / len(total_loss)))

          # End of Epoch
          self.saveCheckpoint(self.train, pre_train)
          if self.is_world_process_zero():
            set_mail = "Epoch {} Loss: {}\n".format(self.current_step // self.steps_per_epoch, train_hook.epoch_loss)
            l.logger().info("Epoch {} Loss: {}".format(self.current_step // self.steps_per_epoch, train_hook.epoch_loss))

          if self.pytorch.num_nodes > 1:
            loader.sampler.set_epoch(epoch)

          if FLAGS.validate_per_epoch and self.train.data_generator.config.validation_split > 0:
            val_ml_loss = self.Validate(per_epoch = True, pre_train = pre_train)
            if self.is_world_process_zero():
              train_hook.end_epoch(
              val_masked_lm_loss      = val_ml_loss,
              # val_next_sentence_loss  = val_nsp_loss,
              val_total_loss          = val_ml_loss # + val_nsp_loss,
              )
            set_mail += "Validation Loss: {}\n".format(val_ml_loss)
          elif self.is_world_process_zero():
            train_hook.end_epoch()

          if FLAGS.notify_me and self.is_world_process_zero():
            client.getClient().send_message("clgen:torch_bert", set_mail)

          if self.torch_tpu_available:
            self.pytorch.torch_xla.master_print(self.pytorch.torch_xla_met.metrics_report())

          if FLAGS.sample_per_epoch > 0:
            sampler, observers = self._getTestSampler(test_sampler, self.config.training.sequence_length)
            self.InitSampling(sampler, self.config.training.random_seed)
            for _ in range(FLAGS.sample_per_epoch):
              start_time   = datetime.datetime.utcnow()
              self.InitSampleBatch(sampler)
              org_inputs, input_ids, samples, indices = self.SampleNextIndices()
              end_time = datetime.datetime.utcnow()
              for org, inp, sample, idxs in zip(org_inputs, input_ids, samples, indices):
                try:
                  stdout = opencl.Compile(self.tokenizer.ArrayToCode(sample))
                  compile_flag = 1
                except ValueError:
                  compile_flag = 0

                feature_vector = extractor.ExtractFeatures(self.tokenizer.ArrayToCode(sample))
                sample_proto = model_pb2.Sample(
                  train_step             = self.current_step,
                  sample_feed            = sampler.start_text,
                  original_input         = self.tokenizer.tokensToString(org,    with_formatting = True, ignore_token = self.tokenizer.padToken),
                  text                   = self.tokenizer.tokensToString(sample, with_formatting = True, ignore_token = self.tokenizer.padToken).replace("\\n", "\n"),
                  encoded_text           = ",".join([str(t) for t in sample]),
                  sample_indices         = ','.join([self.tokenizer.decoder[idx].replace('\n', '\\n') for idx in idxs]).replace('\n', '\\n'),
                  encoded_sample_indices = ','.join([str(idx) for idx in idxs]),
                  sample_time_ms         = int(round(1000 * ((end_time - start_time) / sampler.batch_size).total_seconds())),
                  feature_vector         = "\n".join(["{}:{}".format(k, v) for (k, v) in feature_vector.items()]),
                  num_tokens             = len(sample),
                  compile_status         = compile_flag,
                  categorical_sampling   = self.samplesWithCategorical(),
                  date_added             = datetime.datetime.utcnow().strftime("%m/%d/%Y, %H:%M:%S"),
                )
                for obs in observers:
                  obs.OnSample(sample_proto)
      except KeyboardInterrupt:
        pass

      if not FLAGS.force_eval:
        _ = self.Validate(pre_train = pre_train)

    if FLAGS.force_eval and not self.is_validated:
      _ = self.Validate(pre_train = pre_train)
    return
Example #17
def beam_mutec(
    srcs: typing.List[typing.Tuple[str, str, float]],
    target_features: typing.Dict[str, float],
    feat_space: str,
    beam_width: int,
    mutec_cache: samples_database.SamplesDatabase,
) -> typing.List[typing.Tuple[str, float]]:
    """
  Run generational beam search over starting github kernels
  to minimize distance from target features.
  """
    better_score = True
    total_beams, beam, closest = set(), [], []
    gen_id = 0

    while better_score:

        cands = set()
        ## Generate mutants for current generation.
        for src, incl, dist in tqdm.tqdm(
                srcs,
                total=len(srcs),
                desc="Mutec candidates {}".format(gen_id),
                leave=False):
            cands.update(
                generate_mutants(src, incl)
            )  ### generate_mutants collects and returns all mutants derived from a single source.

        ## Extract their features and calculate distances.
        pool = multiprocessing.Pool()
        f = functools.partial(
            workers.ExtractAndCalculate,
            target_features=target_features,
            feature_space=feat_space,
        )
        # total.update(cands)
        try:
            for cand in tqdm.tqdm(pool.imap_unordered(f, cands),
                                  total=len(cands),
                                  desc="Extract Features {}".format(gen_id),
                                  leave=False):
                if cand:
                    beam.append(cand)
        except Exception as e:
            l.logger().error(e)
            pool.terminate()
            raise e
        pool.close()

        ## Sort by distance in ascending order. If the score improves, keep doing beam search.
        ## srcs are included in the outputs so that they are kept if their offspring are worse.
        closest = sorted(beam + srcs, key=lambda x: x[2])[:beam_width]
        total_beams.update([(x, y) for x, y, _ in closest])

        min_length = min(len(closest), len(srcs))
        if sum([x for _, _, x in closest[:min_length]]) < sum([
                x for _, _, x in srcs[:min_length]
        ]) and gen_id < SEARCH_DEPTH_HARD_LIMIT:
            srcs = closest
            beam = []
        else:
            better_score = False
        gen_id += 1

    ## Store all mutants in database.
    with mutec_cache.Session(commit=True) as s:
        pool = multiprocessing.Pool()
        try:
            idx = mutec_cache.count
            for dp in tqdm.tqdm(pool.imap_unordered(workers.FeatureExtractor,
                                                    total_beams),
                                total=len(total_beams),
                                desc="Add mutants to DB",
                                leave=False):
                if dp:
                    src, incl, feats = dp
                    try:
                        _ = opencl.Compile(
                            src,
                            header_file=incl,
                            extra_args=[
                                "-include{}".format(
                                    pathlib.Path(environment.CLSMITH_INCLUDE) /
                                    "CLSmith.h")
                            ] if incl else [""])
                        compiles = True
                    except ValueError:
                        compiles = False
                    sample = samples_database.Sample.FromArgsLite(
                        idx, incl + src, feats, compiles)
                    exists = s.query(samples_database.Sample.sha256).filter_by(
                        sha256=sample.sha256).scalar() is not None
                    if not exists:
                        s.add(sample)
                        idx += 1
        except Exception as e:
            l.logger().error(e)
            pool.terminate()
            raise e
        pool.close()
        s.commit()
    return closest
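
Examples #3, #10 and #12-#16 all repeat the same try/except around opencl.Compile to derive a compile flag. A minimal shared-helper sketch follows, assuming only that opencl.Compile raises ValueError (or a subclass such as errors.ClangException) on failure, as the examples above do; the try_compile name is an assumption.

import typing

def try_compile(src: str) -> typing.Tuple[bool, str]:
  """Return (compiled, stdout_or_error) for an OpenCL source string."""
  try:
    stdout = opencl.Compile(src)
    return True, stdout
  except ValueError as e:
    return False, str(e)

Routing every call site through one helper like this would keep the ValueError contract in a single place if opencl.Compile ever changed its failure mode.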