Beispiel #1
0
  def FromPreprocessed(
    cls,
    preprocessed_cf: preprocessed.PreprocessedContentFile,
    tokenizer: tokenizers.TokenizerBase,
    eof: str,
    pre_train: bool,
  ) -> "EncodedContentFile":
    """Instantiate an EncodedContentFile from a preprocessed file.

    Args:
      preprocessed_cf: A PreprocessedContentFile instance.
      tokenizer: The tokenizer to encode using.
      eof: An end-of-file marker which is concatenated to the encoded sequence.
      pre_train: If True, skip raw feature extraction and store an empty
        feature vector (pre-training corpora do not need features).

    Returns:
      An EncodedContentFile instance, or None if tokenization fails.
    """
    start_time = time.time()
    try:
      data = tokenizer.TokenizeString(preprocessed_cf.text)
    except ValueError as e:
      # Un-tokenizable files are skipped rather than aborting the whole run.
      l.logger().warn(e)
      return None
    # TODO(kernel analytics): collect encoded_length / sorted token values here.
    encoding_time_ms = int((time.time() - start_time) * 1000)
    # The previous `except Exception as e: raise e` wrapper added nothing;
    # let extraction errors propagate naturally.
    if not pre_train:
      feature_vector = extractor.ExtractRawFeatures(preprocessed_cf.text)
    else:
      feature_vector = ""
    return EncodedContentFile(
      id = preprocessed_cf.id,
      # Encode the end-of-file marker separately to ensure that it resolves to
      # the correct token. For example if the vocabulary contains 'a', 'b',
      # and 'ab', then a content file 'a' with EOF marker 'b' would be encoded
      # as 'ab', instead of 'a'+'b'.
      data = cls.NumpyArrayToDataString(
        np.concatenate((data, tokenizer.TokenizeString(eof)))
      ),
      tokencount       = len(data),
      feature_vector   = feature_vector,
      encoding_time_ms = encoding_time_ms,
      wall_time_ms     = encoding_time_ms,  # The outer-loop may change this.
      date_added       = datetime.datetime.utcnow(),
    )
Beispiel #2
0
def FeatureExtractor(
        src_incl: typing.Tuple[str, str]) -> typing.Tuple[str, str, str]:
    """
  Extract raw features for all feature spaces from a (source, include) pair.

  Returns a (source, include, features) triple; features is the empty string
  when extraction raises ValueError.
  """
    source, header = src_incl
    if header:
        args = [
            "-include{}".format(
                pathlib.Path(environment.CLSMITH_INCLUDE) / "CLSmith.h")
        ]
    else:
        args = [""]
    try:
        features = extractor.ExtractRawFeatures(
            source, header_file=header, extra_args=args)
    except ValueError:
        features = ""
    return source, header, features
Beispiel #3
0
def ToProto(dp: ActiveFeed) -> samples_database.Sample:
    """Convert an ActiveFeed datapoint into a samples_database.Sample row."""
    # Build the intermediate protobuf first, then convert it to DB kwargs.
    proto = model_pb2.Sample(
        train_step=-1,
        text=dp.sample,
        sample_indices="",
        encoded_sample_indices="",
        original_input="",
        sample_feed=dp.input_feed,
        encoded_text="",
        sample_time_ms=0,
        feature_vector=extractor.ExtractRawFeatures(dp.sample),
        num_tokens=dp.num_tokens,
        compile_status=dp.compile_status,
        categorical_sampling=1,
        date_added=dp.date_added.strftime("%m/%d/%Y, %H:%M:%S"),
    )
    return samples_database.Sample(**samples_database.Sample.FromProto(0, proto))
Beispiel #4
0
def run_extractors(sample: Sample) -> Sample:
    """Return a copy of `sample` with its feature vector (re)computed.

    Features are extracted from the sample text only when the sample
    compiled successfully; otherwise the feature vector is left empty.
    All other fields are copied through unchanged.
    """
    # The original duplicated the entire proto construction in both branches;
    # only feature_vector differed, so compute it once up front.
    feature_vector = (extractor.ExtractRawFeatures(sample.text)
                      if sample.compile_status else "")
    return Sample(**Sample.FromProto(
        0,
        model_pb2.Sample(
            train_step=sample.train_step,
            text=sample.text,
            sample_indices=sample.sample_indices,
            encoded_sample_indices=sample.encoded_sample_indices,
            original_input=sample.original_input,
            sample_feed=sample.sample_feed,
            encoded_text=sample.encoded_text,
            sample_time_ms=sample.sample_time_ms,
            feature_vector=feature_vector,
            num_tokens=sample.num_tokens,
            compile_status=sample.compile_status,
            categorical_sampling=int(sample.categorical_sampling),
            date_added=sample.date_added.strftime("%m/%d/%Y, %H:%M:%S"),
        )))
Beispiel #5
0
    def _SampleSeqBatch(
        self,
        sampler: 'samplers.Sampler',
        tokenizer: tokenizers.TokenizerBase,
        sample_observers: typing.List[sample_observers_lib.SampleObserver],
        epoch: int,
    ) -> typing.Tuple[bool, int]:
        """
    Run a single iteration of the batched sample inner-loop for sequential models.

    Returns:
      A (continue_sampling, seq_count) tuple: whether every observer wants
      sampling to continue, and the number of samples completed this batch.
    """

        start_time = datetime.datetime.utcnow()

        self.backend.InitSampleBatch(sampler)
        samples_in_progress = [
            sampler.tokenized_start_text.copy()
            for _ in range(sampler.batch_size)
        ]
        # np.bool was deprecated in NumPy 1.20 and removed in 1.24; the
        # builtin bool is the documented replacement.
        done = np.zeros(sampler.batch_size, dtype=bool)
        wall_time_start = start_time
        seq_count = 0

        # The return value of this method. If any of the sample_observers return
        # False, this value is set to False.
        continue_sampling = True

        # Sampling loop. Continues until all samples in the batch are done.
        while not done.all():
            indices = self.backend.SampleNextIndices(sampler, done)
            # Iterate over all samples in batch to determine whether they're
            # done.

            for i in range(len(indices)):
                if done[i]:
                    continue

                for index in indices[i]:
                    samples_in_progress[i].append(tokenizer.decoder[index])
                    step_ind = ""
                    encoded_step_indices = ""

                    if sampler.SampleIsComplete(samples_in_progress[i]):
                        end_time = datetime.datetime.utcnow()
                        sample_kernel = [x for x in samples_in_progress[i]]
                        features = extractor.ExtractRawFeatures(''.join(
                            samples_in_progress[i]))
                        done[i] = 1
                        try:
                            stdout = opencl.Compile(''.join(
                                samples_in_progress[i]))
                            compile_flag = True
                        except ValueError:
                            compile_flag = False

                        sample = model_pb2.Sample(
                            train_step=epoch,
                            # NOTE(review): this passes the token list rather
                            # than a joined string — confirm the proto field
                            # accepts/coerces it as intended.
                            text=samples_in_progress[i],
                            sample_indices="",
                            encoded_sample_indices="",
                            sample_feed=sampler.start_text,
                            encoded_text=",".join([
                                str(tokenizer.vocab[x]) for x in sample_kernel
                            ]),
                            sample_start_epoch_ms_utc=int(
                                start_time.strftime("%s%f")),
                            sample_time_ms=int(
                                round(1000 *
                                      ((end_time - start_time) /
                                       sampler.batch_size).total_seconds())),
                            wall_time_ms=int(
                                round(1000 *
                                      ((end_time - start_time) /
                                       sampler.batch_size).total_seconds())),
                            feature_vector=features,
                            num_tokens=len(samples_in_progress[i]),
                            compile_status=compile_flag,
                            categorical_sampling=self.backend.
                            samplesWithCategorical(),
                            date_added=datetime.datetime.utcnow().strftime(
                                "%m/%d/%Y, %H:%M:%S"),
                        )
                        # Notify sample observers.
                        continue_sampling &= all(
                            [obs.OnSample(sample) for obs in sample_observers])
                        seq_count += 1
                        # Wall sample time is the difference between the end of the previous
                        # sample and the end of the current sample.
                        wall_time_start = datetime.datetime.utcnow()
                        break
        return continue_sampling, seq_count
Beispiel #6
0
    def _SampleLMBatch(
        self,
        sampler: 'samplers.Sampler',
        tokenizer: tokenizers.TokenizerBase,
        sample_observers: typing.List[sample_observers_lib.SampleObserver],
        epoch: int,
    ) -> typing.Tuple[bool, int]:
        """
    Run a sampling iteration over BERT models.

    Returns a (continue_sampling, seq_count) tuple: whether all observers
    want sampling to continue, and the number of samples produced here.
    Only WORLD_RANK 0 builds sample protos; other ranks read the
    continue/stop decision from the distributed channel.
    """
        start_time = datetime.datetime.utcnow()
        seq_count = 0
        self.backend.InitSampleBatch(sampler,
                                     workload_size=FLAGS.sample_workload_size)
        try:
            org_inputs, input_ids, samples, indices = self.backend.SampleNextIndices(
                sampler)
        except StopIteration:
            # Backend exhausted: signal the caller to stop sampling.
            return False, seq_count

        if not samples:
            # Return empty means model has not produced something that can be stored.
            # This 'if' accommodates active sampling, which is very selective.
            return True, seq_count

        continue_sampling = True

        if environment.WORLD_RANK == 0:
            assert len(org_inputs) == len(input_ids) == len(samples) == len(
                indices), "Length mismatch, {}-{}-{}-{}".format(
                    len(org_inputs), len(input_ids), len(samples),
                    len(indices))
            for org, inp, sample, idxs in zip(org_inputs, input_ids, samples,
                                              indices):

                src = self.tokenizer.ArrayToCode(sample, with_formatting=True)
                try:
                    stdout = opencl.Compile(src)
                    compile_flag = True
                    features = extractor.ExtractRawFeatures(src)
                except ValueError:
                    # Non-compiling samples are kept, but without features.
                    compile_flag = False
                    features = ""

                end_time = datetime.datetime.utcnow()
                # NOTE(review): `sample` (the loop variable, an encoded token
                # array) is read inside this constructor and then shadowed by
                # the resulting proto — confusing but intentional ordering.
                sample = model_pb2.Sample(
                    train_step=epoch,
                    text=src,
                    sample_indices=','.join([
                        self.tokenizer.decoder[idx].replace('\n', '\\n')
                        for idx in idxs
                    ]).replace('\n', '\\n'),
                    encoded_sample_indices=','.join([str(idx)
                                                     for idx in idxs]),
                    original_input=self.tokenizer.tokensToString(
                        org,
                        with_formatting=False,
                        ignore_token=self.tokenizer.padToken),
                    sample_feed=self.tokenizer.tokensToString(
                        inp,
                        with_formatting=False,
                        ignore_token=self.tokenizer.padToken),
                    encoded_text=",".join([str(x) for x in sample]),
                    sample_start_epoch_ms_utc=int(start_time.strftime("%s%f")),
                    # Batch duration is averaged over the number of samples.
                    sample_time_ms=int(
                        round(1000 * ((end_time - start_time) /
                                      len(samples)).total_seconds())),
                    wall_time_ms=int(
                        round(1000 * ((end_time - start_time) /
                                      len(samples)).total_seconds())),
                    feature_vector=features,
                    # Token count: position of the first pad token, or the
                    # full length when the sample is unpadded.
                    num_tokens=np.where(
                        sample == self.tokenizer.padToken)[0][0]
                    if self.tokenizer.padToken in sample else len(sample),
                    compile_status=compile_flag,
                    categorical_sampling=self.backend.samplesWithCategorical(),
                    date_added=datetime.datetime.utcnow().strftime(
                        "%m/%d/%Y, %H:%M:%S"),
                )
                # Notify sample observers.
                continue_sampling &= all(
                    [obs.OnSample(sample) for obs in sample_observers])
                seq_count += 1
            # Broadcast the continue/stop decision to the other ranks.
            if environment.WORLD_SIZE > 1:
                distrib.write(str(continue_sampling))
        else:
            # Non-zero ranks wait for rank 0's decision.
            status = distrib.read()
            if status == "True":
                continue_sampling = True
            elif status == "False":
                continue_sampling = False
            else:
                raise OSError(
                    "Broken distributed message: '{}'".format(status))
        return continue_sampling, seq_count
Beispiel #7
0
def execute_clsmith(idx: int, tokenizer, timeout_seconds: int = 15) -> typing.List[CLSmithSample]:
  """
  Execute CLSmith once and convert its generated kernels to CLSmithSamples.

  Args:
    idx: Numeric id assigned to every sample produced by this invocation.
    tokenizer: Tokenizer used to encode each extracted kernel.
    timeout_seconds: Wall-clock budget enforced via the external `timeout` tool.

  Returns:
    A list of CLSmithSample instances (one per extracted kernel).
  """
  try:
    tdir = pathlib.Path(FLAGS.local_filesystem).resolve()
  except Exception:
    # Fall back to the system default temporary directory.
    tdir = None

  extra_args = ["-include{}".format(pathlib.Path(CLSMITH_INCLUDE) / "CLSmith.h")]
  with tempfile.NamedTemporaryFile("w", prefix = "clsmith_", suffix = ".cl", dir = tdir) as f:
    cmd = [
      "timeout",
      "-s9",
      str(timeout_seconds),
      CLSMITH,
      "-o",
      str(f.name)
    ]
    process = subprocess.Popen(
      cmd,
      stdout = subprocess.PIPE,
      stderr = subprocess.PIPE,
      universal_newlines = True,
    )
    # NOTE: communicate() without a timeout argument cannot raise — the
    # external `timeout -s9` wrapper enforces the time budget instead.
    stdout, stderr = process.communicate()

    # Read CLSmith's output back while the tempfile still exists; the
    # previous bare open().read() leaked the file handle.
    with open(str(f.name), 'r') as generated:
      contentfile = generated.read()

  try:
    ks = opencl.ExtractSingleKernelsHeaders(
           opencl.StripDoubleUnderscorePrefixes(
               c.StripIncludes(contentfile),
           )
         )
  except ValueError as e:
    l.logger().error(contentfile)
    raise e

  samples = []
  for kernel, include in ks:
    encoded_sample = tokenizer.AtomizeString(kernel)
    try:
      stdout = opencl.Compile(kernel, header_file = include, extra_args = extra_args)
      compile_status = True
    except ValueError as e:
      # Keep the compiler diagnostic as the sample payload on failure.
      stdout = str(e)
      compile_status = False

    samples.append(
      CLSmithSample.FromArgs(
        id             = idx,
        sample         = stdout,
        include        = include,
        encoded_sample = ','.join(encoded_sample),
        compile_status = compile_status,
        feature_vector = extractor.ExtractRawFeatures(kernel, header_file = include, extra_args = extra_args),
        num_tokens     = len(encoded_sample)
      )
    )
  return samples