Example #1
    def Specialize(self, atomizer: atomizers.AtomizerBase) -> None:
        """Specialize a termination criteria to a vocabulary.

    This enables the termination criteria to set state specialized to a specific
    encoding vocabulary. This is guaranteed to be called before
    SampleIsComplete(), and ensures that the vocabulary used for all sample
    arguments to SampleIsComplete() is from this vocabulary.

    Args:
      atomizer: An atomizer to specialize to.

    Raises:
      InvalidSymtokTokens: If the depth tokens can't be encoded, or they encode
        to more than one token.
    """
        try:
            left = atomizer.AtomizeString(self.left_token)
            right = atomizer.AtomizeString(self.right_token)
            if len(left) > 1 or len(right) > 1:
                raise errors.InvalidSymtokTokens(
                    "Sampler symmetrical depth tokens do not encode to a single "
                    "token using the corpus vocabulary")
        except errors.VocabError:
            raise errors.InvalidSymtokTokens(
                "Sampler symmetrical depth tokens cannot be encoded using the "
                "corpus vocabulary")
Example #2
    def Specialize(self, atomizer: atomizers.AtomizerBase) -> None:
        """Specialize a sampler a vocabulary.

    This enables the sampler to set state specialized to a specific encoding
    vocabulary. This is guaranteed to be called before SampleIsComplete(), and
    ensures that the vocabulary used for all sample arguments to
    SampleIsComplete() is from this vocabulary.

    Args:
      atomizer: An atomizer to specialize to.

    Raises:
      InvalidStartText: If the start_text cannot be encoded using the
        vocabulary.
      UserError: In case the sampler cannot be specialized to this vocabulary.
    """
        try:
            self.encoded_start_text = atomizer.AtomizeString(self.start_text)
            self.tokenized_start_text = atomizer.TokenizeString(
                self.start_text)
        except errors.VocabError:
            raise errors.InvalidStartText(
                "Sampler start text cannot be encoded using the corpus vocabulary: "
                f"'{self.start_text}'")

        if len(self.encoded_start_text) >= self.sequence_length:
            raise errors.InvalidStartText(
                "Encoded sampler start text must be less than sampler sequence "
                f"length. Sampler sequence length={self.sequence_length}, encoded "
                f"start text length={len(self.encoded_start_text)}")

        for terminator in self.terminators:
            terminator.Specialize(atomizer)
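The two calls store different views of the same start text: AtomizeString produces integer token ids (encoded_start_text, fed to the model), while TokenizeString produces the token strings (tokenized_start_text, which the backtracking samplers below copy into sample_in_progress). A minimal sketch with a hypothetical one-character-per-token atomizer:

class ToyCharAtomizer:
    """Hypothetical stand-in: ids from AtomizeString, strings from TokenizeString."""

    def __init__(self, vocab):
        self.vocab = vocab

    def AtomizeString(self, text):
        return [self.vocab[char] for char in text]

    def TokenizeString(self, text):
        return list(text)


atomizer = ToyCharAtomizer({"k": 0, "e": 1, "r": 2, "n": 3})
assert atomizer.AtomizeString("kern") == [0, 1, 2, 3]            # ids
assert atomizer.TokenizeString("kern") == ["k", "e", "r", "n"]   # strings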
Example #3
    @classmethod
    def FromPreprocessed(
        cls,
        preprocessed_cf: preprocessed.PreprocessedContentFile,
        atomizer: atomizers.AtomizerBase,
        eof: str,
    ) -> "EncodedContentFile":
        """Instantiate an EncodedContentFile from a preprocessed file.

    Args:
      preprocessed_cf: A PreprocessedContentFile instance.
      atomizer: The atomizer to encode using.
      eof: An end-of-file marker which is concatenated to the encoded sequence.

    Returns:
      An EncodedContentFile instance.
    """
        start_time = time.time()
        data = atomizer.AtomizeString(preprocessed_cf.text)
        encoding_time_ms = int((time.time() - start_time) * 1000)
        return EncodedContentFile(
            id=preprocessed_cf.id,
            # Encode the end-of-file marker separately to ensure that it resolves to
            # the correct token. For example if the vocabulary contains 'a', 'b',
            # and 'ab', then a content file 'a' with EOF marker 'b' would be encoded
            # as 'ab', instead of 'a'+'b'.
            data=cls.NumpyArrayToDataString(
                np.concatenate((data, atomizer.AtomizeString(eof)))),
            tokencount=len(data),
            encoding_time_ms=encoding_time_ms,
            wall_time_ms=encoding_time_ms,  # The outer loop may change this.
            date_added=datetime.datetime.utcnow(),
        )
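The comment about the end-of-file marker describes a property of greedy, multi-character vocabularies. A small sketch of why the two encoding orders differ (GreedyAtomizer is a hypothetical longest-match tokenizer, not the library's implementation):

class GreedyAtomizer:
    """Hypothetical longest-match atomizer over a multi-character vocabulary."""

    def __init__(self, vocab):
        self.vocab = vocab
        self.max_len = max(len(token) for token in vocab)

    def AtomizeString(self, text):
        ids, i = [], 0
        while i < len(text):
            # Try the longest vocabulary entry at this position first.
            for size in range(min(self.max_len, len(text) - i), 0, -1):
                token = text[i:i + size]
                if token in self.vocab:
                    ids.append(self.vocab[token])
                    i += size
                    break
            else:
                raise ValueError(f"Cannot encode {text[i]!r}")
        return ids


atomizer = GreedyAtomizer({"a": 0, "b": 1, "ab": 2})
assert atomizer.AtomizeString("a" + "b") == [2]                    # merges to 'ab'
assert atomizer.AtomizeString("a") + atomizer.AtomizeString("b") == [0, 1]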
Example #4
def EncodeAndPad(srcs: typing.List[str], padded_length: int,
                 atomizer: atomizers.AtomizerBase) -> np.ndarray:
    """Encode and pad source code strings for training."""
    seqs = [atomizer.AtomizeString(src) for src in srcs]
    pad_val = atomizer.vocab_size
    encoded = np.array(
        sequence.pad_sequences(seqs, maxlen=padded_length, value=pad_val))
    return np.vstack([np.expand_dims(x, axis=0) for x in encoded])
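A usage sketch for EncodeAndPad, assuming a hypothetical ToyCharAtomizer exposing the vocab_size attribute the function relies on. The padding value is atomizer.vocab_size, one id past the real vocabulary, and the plain-NumPy loop below mirrors what sequence.pad_sequences does with its default 'pre' padding and truncation:

import numpy as np


class ToyCharAtomizer:
    """Hypothetical character atomizer exposing vocab_size."""

    def __init__(self, vocab):
        self.vocab = vocab
        self.vocab_size = len(vocab)

    def AtomizeString(self, text):
        return np.array([self.vocab[char] for char in text])


atomizer = ToyCharAtomizer({"a": 0, "b": 1, "+": 2})
seqs = [atomizer.AtomizeString(src) for src in ["a+b", "ab"]]
pad_val = atomizer.vocab_size  # 3: an id no real token can have

maxlen = 4
padded = np.full((len(seqs), maxlen), pad_val)
for row, seq in zip(padded, seqs):
    n = min(len(seq), maxlen)
    row[maxlen - n:] = seq[-n:]  # left-pad, truncating from the front
print(padded)
# [[3 0 2 1]
#  [3 3 0 1]]

Since pad_sequences already returns a 2-D array, the final vstack of expand_dims in the functions above and below appears to be an identity reshape.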
Example #5
def EncodeAndPadSources(atomizer: atomizers.AtomizerBase,
                        srcs: typing.List[str], maxlen: int) -> np.ndarray:
    """Encode and pad source code for learning."""
    seqs = [atomizer.AtomizeString(src) for src in srcs]
    pad_val = atomizer.vocab_size
    encoded = np.array(
        keras_sequence.pad_sequences(seqs, maxlen=maxlen, value=pad_val))
    return np.vstack([np.expand_dims(x, axis=0) for x in encoded])
Example #6
    def MakeProgram(
        self,
        sampled_tokens: typing.List[str],
        backtracker: OpenClBacktrackingHelper,
        atomizer: atomizers.AtomizerBase,
    ) -> typing.List[str]:
        """Produce a kernel from a sample."""
        src = backtracker.TryToCloseProgram(sampled_tokens) or ""
        return atomizer.TokenizeString(src)
Example #7
    def SampleOneWithBacktrackingToTextStream(
        self,
        sampler: samplers.Sampler,
        atomizer: atomizers.AtomizerBase,
        backtracker: OpenClBacktrackingHelper,
    ) -> typing.Iterator[str]:
        """Produce a single sample using backtracking.

    Args:
      sampler: A Sampler instance, used to determine the start text, and when to
        terminate sampling.
      atomizer: The corpus vocabulary atomizer.
      backtracker: An instance of the backtracking helper class.

    Returns:
      A sample, as a sequence of strings.
    """

        data = {
            "sample_in_progress": "",
            "candidate": "",
            "status": "running",
            "elapsed": 0,
        }

        def Data(data: Dict[str, Any]):
            data["elapsed"] = humanize.Duration(time.time() - start_time)
            return f"retry: 100\ndata: {json.dumps(data)}\n\n"

        start_time = time.time()

        # During sampling, 'sample_in_progress' contains the sequence of tokens that
        # is restored when backtracking.
        sample_in_progress = sampler.tokenized_start_text.copy()
        rollback_state, rollback_index = self.backend.EvaluateSampleState(
            sampler)
        # Checkpoints to restore from, and a counter of consecutive steps
        # without improvement; both drive the stagnation handling below.
        rollback_history = [(
            copy.deepcopy(backtracker),
            list(sample_in_progress),
            rollback_state,
            rollback_index,
        )]
        stagnation = 0
        data["sample_in_progress"] = "".join(sample_in_progress)
        yield Data(data)

        # Main sampling loop: one candidate statement is appended per step.
        for step_count in range(
                1, FLAGS.experimental_clgen_backtracking_max_steps + 1):
            self._logger.OnSampleStep(backtracker, 0, len(sample_in_progress))
            data["sample_in_progress"] = "".join(sample_in_progress)
            data["candidate"] = ""
            data["status"] = f"step {step_count}"
            app.Log(4, "Current sample in progress: `%s`",
                    "".join(sample_in_progress))
            yield Data(data)
            # Generate a batch of candidate statements to choose from.
            candidate_statements = []
            for _ in range(FLAGS.experimental_clgen_backtracking_max_attempts):
                yield Data(data)
                candidate_statements.extend(
                    self.TryToGenerateCandidateStatements(
                        sample_in_progress,
                        rollback_state,
                        rollback_index,
                        backtracker,
                        sampler,
                        atomizer,
                    ))
                if candidate_statements:
                    data["candidate"] = html.escape("".join(
                        candidate_statements[-1].statement))
                yield Data(data)
                candidate_statements = [
                    c for c in candidate_statements
                    if c.feature_distance is not None
                ]
                if (len(candidate_statements) >=
                        FLAGS.experimental_clgen_backtracking_candidates_per_step):
                    break

            if not candidate_statements:
                app.Log(
                    2,
                    "Failed to produce any candidate statement after %d attempts",
                    FLAGS.experimental_clgen_backtracking_max_attempts,
                )
                break

            # Select a candidate using stochastic hill climbing: candidates
            # that reduce the feature distance are proportionally more likely
            # to be chosen.
            old = backtracker.feature_distance
            deltas = [(old - c.feature_distance) for c in candidate_statements]
            deltas = np.array(deltas) + 0.5
            deltas[deltas < 0.5] = 0.1
            deltas = deltas / np.sum(deltas)
            sel_candidate_idx = np.random.choice(len(candidate_statements),
                                                 p=deltas)
            sel_candidate = candidate_statements[sel_candidate_idx]

            app.Log(
                2,
                "Selected best feature distance (%f) at step %d from candidates: %s",
                sel_candidate.feature_distance,
                step_count,
                SummarizeFloats(c.feature_distance
                                for c in candidate_statements),
            )
            app.Log(4, "Selected best statement: %s",
                    "".join(sel_candidate.statement))

            if backtracker.feature_distance - sel_candidate.feature_distance <= 0:
                stagnation += 1
                if stagnation > 10:
                    (
                        backtracker,
                        sample_in_progress,
                        rollback_state,
                        rollback_index,
                    ) = rollback_history.pop()
                    stagnation = 0
                    app.Log(4, "Got Stuck. Backtracking")
                    continue
            else:
                stagnation = 0
                if step_count % 10 == 0:
                    rollback_history.append((
                        copy.deepcopy(backtracker),
                        list(sample_in_progress),
                        rollback_state,
                        rollback_index,
                    ))

            # Set the sampler's rollback state to be the state produced by
            # feeding the selected candidate as input, so that future samples
            # start from the right state.
            if len(sel_candidate.statement) > 0:
                sample_in_progress += sel_candidate.statement
                encoded_sel_candidate = atomizer.AtomizeString("".join(
                    sel_candidate.statement))
                arr = np.concatenate([rollback_index, encoded_sel_candidate])
                self.backend.ResetSampleState(sampler,
                                              state=rollback_state,
                                              seed=arr)
                rollback_state, rollback_index = self.backend.EvaluateSampleState(
                    sampler)

            app.Log(
                5,
                "Current sample at step %d: %s",
                step_count,
                "".join(sample_in_progress),
            )

            if backtracker.IsDone(sample_in_progress):
                app.Log(2, "Backtracking complete after %d steps", step_count)
                break
        else:
            app.Log(2, "Backtracking failed to complete after %d steps",
                    step_count)

        yield Data(data)
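The selection step in the middle of this example is a stochastic hill climb: improvements in feature distance are shifted and normalized into a probability distribution, with regressions floored at a small non-zero weight so the walk can still move. A standalone sketch of that rule (choose_candidate and the distances below are made up for illustration):

import numpy as np


def choose_candidate(current_distance, candidate_distances, rng=np.random):
    """Stochastic hill climbing, as in the selection step above."""
    # Positive deltas are improvements; shift so "no change" weighs 0.5.
    deltas = current_distance - np.asarray(candidate_distances, dtype=float)
    deltas = deltas + 0.5
    # Regressions keep a small non-zero weight instead of zero probability.
    deltas[deltas < 0.5] = 0.1
    probs = deltas / deltas.sum()
    return rng.choice(len(candidate_distances), p=probs)


# Hypothetical feature distances: current 2.0, two improvements, one regression.
index = choose_candidate(2.0, [1.0, 1.5, 2.5])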
Example #8
    def SampleOneWithBacktracking(
        self,
        sampler: samplers.Sampler,
        atomizer: atomizers.AtomizerBase,
        backtracker: OpenClBacktrackingHelper,
    ) -> typing.List[str]:
        """Produce a single sample using backtracking.

    Args:
      sampler: A Sampler instance, used to determine the start text, and when to
        terminate sampling.
      atomizer: The corpus vocabulary atomizer.
      backtracker: An instance of the backtracking helper class.

    Returns:
      A sample, as a sequence of strings.
    """
        # During sampling, 'sample_in_progress' contains the sequence of tokens that
        # is restored when backtracking.
        sample_in_progress = sampler.tokenized_start_text.copy()
        self.backend.RandomizeSampleState()
        rollback_state, rollback_index = self.backend.EvaluateSampleState(
            sampler)
        rollback_history = [(
            copy.deepcopy(backtracker),
            list(sample_in_progress),
            rollback_state,
            rollback_index,
        )]
        stagnation = 0

        # Main sampling loop: one candidate statement is appended per step.
        for step_count in range(
                1, FLAGS.experimental_clgen_backtracking_max_steps + 1):
            self._logger.OnSampleStep(backtracker, 0, len(sample_in_progress))
            app.Log(4, "Current sample in progress: `%s`",
                    "".join(sample_in_progress))

            # Generate a batch of candidate statements to choose from.
            candidate_statements = []
            for _ in range(
                    FLAGS.experimental_clgen_backtracking_max_attempts *
                    FLAGS.experimental_clgen_backtracking_candidates_per_step):
                candidate_statements.extend([
                    c for c in self.TryToGenerateCandidateStatements(
                        sample_in_progress,
                        rollback_state,
                        rollback_index,
                        backtracker,
                        sampler,
                        atomizer,
                    ) if c.feature_distance is not None
                ])
                if (len(candidate_statements) >=
                        FLAGS.experimental_clgen_backtracking_candidates_per_step):
                    break

            if not candidate_statements:
                app.Log(
                    2,
                    "Failed to produce any candidate statement after %d attempts",
                    FLAGS.experimental_clgen_backtracking_max_attempts,
                )
                break

            # Select the best candidate.
            if self._target_features is not None:
                best_candidate = min(candidate_statements,
                                     key=lambda x: x.feature_distance)
            else:
                best_candidate = random.choice(candidate_statements)
            app.Log(
                2,
                "Selected best feature distance (%f) at step %d from candidates: %s",
                best_candidate.feature_distance,
                step_count,
                SummarizeFloats(c.feature_distance
                                for c in candidate_statements),
            )
            app.Log(4, "Selected best statement: %s",
                    "".join(best_candidate.statement))

            # Set the sampler's rollback state to be the state produced by
            # feeding the best candidate as input, so that future samples
            # start from the right state.
            if len(best_candidate.statement) > 0:
                sample_in_progress += best_candidate.statement
                encoded_best_candidate = atomizer.AtomizeString("".join(
                    best_candidate.statement))
                arr = np.concatenate([rollback_index, encoded_best_candidate])
                self.backend.ResetSampleState(sampler,
                                              state=rollback_state,
                                              seed=arr)
                rollback_state, rollback_index = self.backend.EvaluateSampleState(
                    sampler)

            app.Log(
                5,
                "Current sample at step %d: %s",
                step_count,
                "".join(sample_in_progress),
            )

            if backtracker.IsDone(sample_in_progress):
                app.Log(2, "Backtracking complete after %d steps", step_count)
                break
        else:
            app.Log(2, "Backtracking failed to complete after %d steps",
                    step_count)

        return self.MakeProgram(sample_in_progress, backtracker, atomizer)
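Both sampling loops above end with Python's for/else: the else branch runs only when the loop exhausts its step budget without hitting break, which is what distinguishes the "complete" and "failed to complete" log lines. A tiny sketch of the construct:

for step in range(1, 4):
    if step == 2:  # stands in for backtracker.IsDone(sample_in_progress)
        print(f"complete after {step} steps")
        break
else:
    # Runs only if the loop finished without a `break`.
    print(f"failed to complete after {step} steps")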